diff --git a/AUTHORS.md b/AUTHORS.md
deleted file mode 100644
index 5be71c9b2d598c4c9141ec23628fc8be898bf5e8..0000000000000000000000000000000000000000
--- a/AUTHORS.md
+++ /dev/null
@@ -1,70 +0,0 @@
-| Github account | name |
-|---|---|
-| abhinavarora | Abhinav Arora |
-| backyes | Yan-Fei Wang |
-| baiyfbupt | Yi-Fan Bai |
-| beckett1124 | Bin Qi |
-| ChengduoZH | Cheng-Duo Zhao|
-| chengxiaohua1105 | Xiao-Hua Cheng |
-| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
-| cxysteven | Xing-Yi Cheng |
-| dzhwinter | Zhi-Hong Dong |
-| dragonwarrior | Long Wang |
-| dyning | Yuning Du |
-| emailweixu | Wei Xu |
-| gangliao | Gang Liao |
-| gongweibao | Wei-Bao Gong |
-| guru4elephant | Daxiang Dong |
-| Guo Sheng | Sheng Guo |
-| Haichao-Zhang | Hai-Chao Zhang |
-| hedaoyuan | Dao-Yuan He |
-| helinwang | He-Lin Wang |
-| jacquesqiao | Long-Fei Qiao |
-| jczaja | Jacek Czaja |
-| JiayiFeng | Jia-Yi Feng |
-| kbinias | Krzysztof Binias |
-| kexinzhao | Ke-Xin Zhao |
-| kuke | Yi-Bing Liu |
-| lcy-seso | Ying Cao |
-| cjld | Dun Liang |
-| lipeng-unisound | Peng Li |
-| gavin1332 | Yi Liu |
-| liuyuan | Yuan Liu |
-| livc | Zhao Li |
-| llxxxll | Yong-Feng Liu |
-| luotao01 | Tao Luo |
-| lzhao4ever | Liang Zhao |
-| mozga-intel | Mateusz Ozga |
-| NHZlX | Zhao-Long Xing |
-| Noplz | Yuan Gao |
-| pakchoi | Chuan-Jiang Song |
-| panyx0718 | Xin Pan |
-| pengli09 | Peng Li |
-| pkuyym | Ya-Ming Yang |
-| pzelazko-intel | Pawel Zelazko |
-| QiJune | Jun Qi |
-| qingqing01 | Qing-Qing Dang |
-| reyoung | Yang Yu |
-| Sand3r- | Michal Gallus |
-| sfraczek | Sylwester Fraczek |
-| sneaxiy | Jin-Le Zeng |
-| Superjom | Chun-Wei Yan |
-| tensor-tang | Jian Tang |
-| tianbingsz | Tian-Bing Xu |
-| tpatejko | Tomasz Patejko |
-| typhoonzero | Yi Wu |
-| velconia | Qi-Yang Min |
-| wanghaoshuang | Hao-Shuang Wang |
-| wangyang59 | Yang Wang |
-| wangzhen-nlp | Zhen Wang |
-| wen-bo-yang | Wen-Bo Yang |
-| wojtuss | Wojciech Uss |
-| wwhu | Wei-Wei Hu |
-| xinghai-sun | Xing-Hai Sun |
-| Xreki | Yi-Qun Liu |
-| xujun05 | Jun Xu |
-| xushaoyong | Shao-Yong Xu |
-| Yancey1989 | Xu Yan |
-| zhaopu7 | Pu Zhao |
-| zhouxiao-coder | Xiao Zhou |
-| Zrachel | Rui-Qing Zhang |
diff --git a/BCLOUD b/BCLOUD
index 61f8d62370179b8c316ef17c1908318f72ff2358..d1a31ec5084bf21f4bc1d5f7d87f43cab085a24c 100755
--- a/BCLOUD
+++ b/BCLOUD
@@ -5,78 +5,4 @@ GLOBAL_CFLAGS_STR = '-g -O3 -pipe -fopenmp '
 CFLAGS(GLOBAL_CFLAGS_STR)
 GLOBAL_CXXFLAGS_STR = GLOBAL_CFLAGS_STR + ' -std=c++11 '
 CXXFLAGS(GLOBAL_CXXFLAGS_STR)
-INCPATHS('./')
-INCPATHS('$OUT/../')
-INCPATHS('../../third-party')
-INCPATHS('../../third-party/eigen')
-INCPATHS('$OUT_ROOT/baidu/third-party/python/output/include/python2.7')
-LDFLAGS('-lpthread -lcrypto -lrt -ldl -lssl -lz -lgomp -fopenmp')
-#LDFLAGS('-lpthread -lcrypto -lrt -ldl -lssl -lz -lgomp -fopenmp -lasan')
-CONFIGS('baidu/third-party/any@15595d8324be9e8a9a80d9ae442fdd12bd66df5d@git_branch')
-CONFIGS('baidu/third-party/boost@v1.41.0@git_branch')
-CONFIGS('baidu/third-party/c-ares@v1.13.0@git_branch')
-CONFIGS('baidu/third-party/eigen@917060c364181f33a735dc023818d5a54f60e54c@git_branch')
-CONFIGS('baidu/third-party/gflags@77592648e3f3be87d6c7123eb81cbad75f9aef5a@git_branch')
-CONFIGS('baidu/third-party/glog@v0.3.5@git_branch')
-CONFIGS('baidu/third-party/leveldb@v1.18@git_branch')
-CONFIGS('baidu/third-party/OpenBLAS@v0.2.20@git_branch')
-CONFIGS('baidu/third-party/protobuf@9f75c5aa851cd877fb0d93ccc31b8567a6706546@git_branch')
-CONFIGS('baidu/third-party/snappy@v1.1.7@git_branch')
-CONFIGS('baidu/third-party/snappy_stream@0.2.8@git_branch')
-CONFIGS('baidu/third-party/threadpool@9a42ec1329f259a5f4881a291db1dcb8f2ad9040@git_branch')
-CONFIGS('baidu/third-party/warpctc@warp_ctc_head@git_branch')
-CONFIGS('baidu/third-party/zlib@v1.2.8@git_branch')
-# CONFIGS('baidu/third-party/brpc@7dc04defad1fd4173aae170c3fcbde131b65155a@git_branch')
-CONFIGS('baidu/third-party/mklml@v20180406@git_branch')
-CONFIGS('baidu/third-party/xbyak@v5.661@git_branch')
-CONFIGS('baidu/third-party/xxhash@v0.6.5@git_branch')
-CONFIGS('baidu/third-party/dlpack@v0.2@git_branch')
-CONFIGS('baidu/third-party/gzstream@master@git_branch')
-CONFIGS('baidu/third-party/pybind11@v2.2.4@git_branch')
-CONFIGS('baidu/third-party/python@gcc482output@git_branch')
-CONFIGS('baidu/third-party/yaml-cpp@yaml-cpp_0-6-2-0_GEN_PD_BL@git_tag')
-CONFIGS('baidu/third-party/openmpi@openmpi_1-4-5-0-feed_mlarch@git_branch')
-CONFIGS('baidu/feed-mlarch/hopscotch-map@stable')
-CONFIGS('baidu/paddlepaddle/pslib@stable')
-CONFIGS('third-64/gtest@gtest_1-7-0-100_PD_BL')
-HEADERS('paddle/fluid/memory/*.h', '$INC/paddle/fluid/memory/')
-HEADERS('paddle/fluid/memory/detail/*.h', '$INC/paddle/fluid/memory/detail/')
-HEADERS('paddle/fluid/memory/allocation/*.h', '$INC/paddle/fluid/memory/allocation/')
-HEADERS('paddle/fluid/inference/*.h', '$INC/paddle/fluid/inference/')
-HEADERS('paddle/fluid/platform/*.h', '$INC/paddle/fluid/platform/')
-HEADERS('paddle/fluid/platform/dynload/*.h', '$INC/paddle/fluid/platform/dynload/')
-HEADERS('paddle/fluid/platform/details/*.h', '$INC/paddle/fluid/platform/details/')
-HEADERS('paddle/fluid/string/*.h', '$INC/paddle/fluid/string/')
-HEADERS('paddle/fluid/string/tinyformat/*.h', '$INC/paddle/fluid/string/tinyformat/')
-HEADERS('paddle/fluid/framework/*.h', '$INC/paddle/fluid/framework/')
-HEADERS('paddle/fluid/framework/details/*.h', '$INC/paddle/fluid/framework/details/')
-HEADERS('paddle/fluid/framework/ir/memory_optimize_pass/*.h', '$INC/paddle/fluid/framework/ir/memory_optimize_pass/')
-HEADERS('paddle/fluid/framework/ir/*.h', '$INC/paddle/fluid/framework/ir/')
-HEADERS('paddle/fluid/framework/fleet/*.h', '$INC/paddle/fluid/framework/fleet/')
-HEADERS('paddle/fluid/inference/*.h', '$INC/paddle/fluid/inference/')
-HEADERS('paddle/fluid/inference/api/*.h', '$INC/paddle/fluid/inference/api/')
-HEADERS('paddle/fluid/pybind/pybind.h', '$INC/paddle/fluid/pybind')
-HEADERS('paddle/fluid/inference/api/*.h', '$INC/paddle/fluid/inference/api/')
-HEADERS(GLOB_GEN_SRCS('paddle/fluid/framework/*pb.h'), '$INC/paddle/fluid/framework')
-HEADERS(GLOB_GEN_SRCS('paddle/fluid/platform/*pb.h'), '$INC/paddle/fluid/platform')
-PROTOC('../../third-party/protobuf/bin/protoc')
-#proto
-StaticLibrary("fake_paddle_proto", Sources(GLOB("paddle/fluid/framework/*.proto"), GLOB("paddle/fluid/platform/*.proto")))
-NEED_OUTPUT("baidu/third-party/mklml")
-NEED_OUTPUT("baidu/third-party/openmpi")
-
-CPPFLAGS_STR = '-DHPPL_STUB_FUNC -DLAPACK_FOUND -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DCUSTOM_TRAINER -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_AVX -DPADDLE_WITH_MKLML -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_AVX_MKLML' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())
-
-CFLAGS_STR = '-m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -Wno-narrowing -Wnarrowing -fopenmp -mavx -O3 -DNDEBUG '
-#CFLAGS_STR = '-m64 -fPIC -fsanitize=address -fsanitize-recover=address -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -Wno-narrowing -Wnarrowing -fopenmp -mavx -O3 -DNDEBUG '
-CXXFLAGS_STR = '-std=c++11 ' + CFLAGS_STR
-
-#SharedLibrary("paddle_fluid_avx_mklml", PreBuilt(True))
-
-application_args = [
-    CppFlags(CPPFLAGS_STR),
-    CFlags(CFLAGS_STR),
-    CxxFlags(CXXFLAGS_STR),
-    Libs(libs=['libpaddle_fluid_avx_mklml.so']),
-    Libs(module='baidu/third-party/openmpi', libs=['libmpi.so', 'libmpi_cxx.so', 'libopen-pal.so', 'libopen-rte.so']),
-]
+CONFIGS('baidu/paddlepaddle/paddle@develop@git_branch')
diff --git a/BCLOUD.paddle b/BCLOUD.paddle
deleted file mode 100644
index 3416c8d2b6905e83a519a78b9e67a1d4cbea03a9..0000000000000000000000000000000000000000
--- a/BCLOUD.paddle
+++ /dev/null
@@ -1,78 +0,0 @@
-WORKROOT('../../../')
-COMPILER('gcc482')
-CPPFLAGS('-D_GNU_SOURCE -DNDEBUG')
-GLOBAL_CFLAGS_STR = '-g -O3 -pipe '
-CFLAGS(GLOBAL_CFLAGS_STR)
-GLOBAL_CXXFLAGS_STR = GLOBAL_CFLAGS_STR + ' -std=c++11 '
-CXXFLAGS(GLOBAL_CXXFLAGS_STR)
-
-INCPATHS('./')
-INCPATHS('$OUT/../')
-INCPATHS('../../third-party')
-INCPATHS('../../third-party/eigen')
-INCPATHS('$OUT_ROOT/baidu/third-party/python/output/include/python2.7')
-LDFLAGS('-lpthread -lcrypto -lrt -ldl -lssl -lz -lrt -lgomp')
-CONFIGS('baidu/third-party/any@15595d8324be9e8a9a80d9ae442fdd12bd66df5d@git_branch')
-CONFIGS('baidu/third-party/boost@v1.41.0@git_branch')
-CONFIGS('baidu/third-party/c-ares@v1.13.0@git_branch')
-CONFIGS('baidu/third-party/eigen@917060c364181f33a735dc023818d5a54f60e54c@git_branch')
-CONFIGS('baidu/third-party/gflags@77592648e3f3be87d6c7123eb81cbad75f9aef5a@git_branch')
-CONFIGS('baidu/third-party/glog@v0.3.5@git_branch')
-CONFIGS('baidu/third-party/leveldb@v1.18@git_branch')
-CONFIGS('baidu/third-party/OpenBLAS@v0.2.20@git_branch')
-CONFIGS('baidu/third-party/protobuf@9f75c5aa851cd877fb0d93ccc31b8567a6706546@git_branch')
-CONFIGS('baidu/third-party/snappy@v1.1.7@git_branch')
-CONFIGS('baidu/third-party/snappy_stream@0.2.8@git_branch')
-CONFIGS('baidu/third-party/threadpool@9a42ec1329f259a5f4881a291db1dcb8f2ad9040@git_branch')
-CONFIGS('baidu/third-party/warpctc@warp_ctc_head@git_branch')
-CONFIGS('baidu/third-party/zlib@v1.2.8@git_branch')
-# CONFIGS('baidu/third-party/brpc@7dc04defad1fd4173aae170c3fcbde131b65155a@git_branch')
-CONFIGS('baidu/third-party/mklml@v20180406@git_branch')
-CONFIGS('baidu/third-party/xbyak@v5.661@git_branch')
-CONFIGS('baidu/third-party/xxhash@v0.6.5@git_branch')
-CONFIGS('baidu/third-party/dlpack@v0.2@git_branch')
-CONFIGS('baidu/third-party/gzstream@master@git_branch')
-CONFIGS('baidu/third-party/pybind11@v2.2.4@git_branch')
-CONFIGS('baidu/third-party/python@gcc482output@git_branch')
-CONFIGS('baidu/third-party/yaml-cpp@yaml-cpp_0-6-2-0_GEN_PD_BL@git_tag')
-CONFIGS('third-64/gtest@base')
-
-HEADERS('paddle/fluid/memory/*.h', '$INC/paddle/fluid/memory/')
-HEADERS('paddle/fluid/memory/detail/*.h', '$INC/paddle/fluid/memory/detail/')
-HEADERS('paddle/fluid/memory/allocation/*.h', '$INC/paddle/fluid/memory/allocation/')
-HEADERS('paddle/fluid/inference/*.h', '$INC/paddle/fluid/inference/')
-HEADERS('paddle/fluid/platform/*.h', '$INC/paddle/fluid/platform/')
-HEADERS('paddle/fluid/platform/dynload/*.h', '$INC/paddle/fluid/platform/dynload/')
-HEADERS('paddle/fluid/platform/details/*.h', '$INC/paddle/fluid/platform/details/')
-HEADERS('paddle/fluid/string/*.h', '$INC/paddle/fluid/string/')
-HEADERS('paddle/fluid/string/tinyformat/*.h', '$INC/paddle/fluid/string/tinyformat/')
-HEADERS('paddle/fluid/framework/*.h', '$INC/paddle/fluid/framework/')
-HEADERS('paddle/fluid/framework/details/*.h', '$INC/paddle/fluid/framework/details/')
-HEADERS('paddle/fluid/framework/ir/memory_optimize_pass/*.h', '$INC/paddle/fluid/framework/ir/memory_optimize_pass/')
-HEADERS('paddle/fluid/framework/ir/*.h', '$INC/paddle/fluid/framework/ir/')
-HEADERS('paddle/fluid/framework/fleet/*.h', '$INC/paddle/fluid/framework/fleet/')
-HEADERS('paddle/fluid/inference/*.h', '$INC/paddle/fluid/inference/')
-HEADERS('paddle/fluid/inference/api/*.h', '$INC/paddle/fluid/inference/api/')
-HEADERS('paddle/fluid/pybind/pybind.h', '$INC/paddle/fluid/pybind')
-HEADERS('paddle/fluid/inference/api/*.h', '$INC/paddle/fluid/inference/api/')
-HEADERS(GLOB_GEN_SRCS('paddle/fluid/framework/*pb.h'), '$INC/paddle/fluid/framework')
-HEADERS(GLOB_GEN_SRCS('paddle/fluid/platform/*pb.h'), '$INC/paddle/fluid/platform')
-
-
-
-
-PROTOC('../../third-party/protobuf/bin/protoc')
-paddle_fluid_avx_mklml_src = "paddle/fluid/memory/detail/memory_block.cc paddle/fluid/memory/detail/memory_block_desc.cc paddle/fluid/memory/detail/meta_cache.cc paddle/fluid/memory/detail/system_allocator.cc paddle/fluid/memory/detail/buddy_allocator.cc paddle/fluid/memory/allocation/allocator.cc paddle/fluid/memory/allocation/cpu_allocator.cc paddle/fluid/memory/allocation/locked_allocator.cc paddle/fluid/memory/allocation/buffered_allocator.cc paddle/fluid/memory/allocation/best_fit_allocator.cc paddle/fluid/memory/allocation/naive_best_fit_allocator.cc paddle/fluid/memory/allocation/retry_allocator.cc paddle/fluid/memory/allocation/aligned_allocator.cc paddle/fluid/memory/allocation/allocator_strategy.cc paddle/fluid/memory/allocation/allocator_facade.cc paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc paddle/fluid/memory/malloc.cc paddle/fluid/memory/memcpy.cc paddle/fluid/platform/profiler.proto paddle/fluid/platform/enforce.cc paddle/fluid/platform/cpu_info.cc paddle/fluid/platform/place.cc paddle/fluid/platform/dynload/dynamic_loader.cc paddle/fluid/platform/dynload/warpctc.cc paddle/fluid/platform/dynload/mklml.cc paddle/fluid/platform/cpu_helper.cc paddle/fluid/platform/temporary_allocator.cc paddle/fluid/platform/device_context.cc paddle/fluid/platform/init.cc paddle/fluid/platform/timer.cc paddle/fluid/platform/lodtensor_printer.cc paddle/fluid/platform/device_tracer.cc paddle/fluid/platform/profiler.cc paddle/fluid/platform/device_memory_aligment.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc 
paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc paddle/fluid/framework/ir/node.cc paddle/fluid/framework/ir/graph.cc paddle/fluid/framework/ir/graph_helper.cc paddle/fluid/framework/ir/pass.cc paddle/fluid/framework/ir/graph_traits.cc paddle/fluid/framework/ir/graph_pattern_detector.cc paddle/fluid/framework/ir/fuse_pass_base.cc paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc paddle/fluid/framework/ir/graph_to_program_pass.cc paddle/fluid/framework/ir/graph_viz_pass.cc paddle/fluid/framework/ir/lock_free_optimize_pass.cc paddle/fluid/framework/ir/fc_fuse_pass.cc paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc 
paddle/fluid/framework/ir/infer_clean_graph_pass.cc paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc paddle/fluid/framework/ir/fc_gru_fuse_pass.cc paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc paddle/fluid/framework/ir/multi_batch_merge_pass.cc paddle/fluid/framework/ir/conv_bn_fuse_pass.cc paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc paddle/fluid/framework/ir/is_test_pass.cc paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc paddle/fluid/framework/ir/sync_batch_norm_pass.cc paddle/fluid/framework/ir/runtime_context_cache_pass.cc paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc paddle/fluid/framework/ir/pass_builder.cc paddle/fluid/framework/details/var_handle.cc paddle/fluid/framework/details/op_handle_base.cc paddle/fluid/framework/details/scale_loss_grad_op_handle.cc paddle/fluid/framework/details/fetch_op_handle.cc paddle/fluid/framework/details/computation_op_handle.cc paddle/fluid/framework/details/rpc_op_handle.cc paddle/fluid/framework/details/fetch_barrier_op_handle.cc paddle/fluid/framework/details/multi_devices_helper.cc paddle/fluid/framework/details/variable_visitor.cc 
paddle/fluid/framework/details/all_reduce_op_handle.cc paddle/fluid/framework/details/fused_all_reduce_op_handle.cc paddle/fluid/framework/details/reduce_op_handle.cc paddle/fluid/framework/details/broadcast_op_handle.cc paddle/fluid/framework/details/fused_broadcast_op_handle.cc paddle/fluid/framework/details/gather_op_handle.cc paddle/fluid/framework/details/eager_deletion_op_handle.cc paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc paddle/fluid/framework/details/ssa_graph_executor.cc paddle/fluid/framework/details/threaded_ssa_graph_executor.cc paddle/fluid/framework/details/parallel_ssa_graph_executor.cc paddle/fluid/framework/details/async_ssa_graph_executor.cc paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc paddle/fluid/framework/details/build_strategy.cc paddle/fluid/framework/fleet/fleet_wrapper.cc paddle/fluid/framework/fleet/nccl_wrapper.cc paddle/fluid/framework/io/fs.cc paddle/fluid/framework/io/shell.cc paddle/fluid/framework/framework.proto paddle/fluid/framework/data_feed.proto paddle/fluid/framework/trainer_desc.proto paddle/fluid/framework/ddim.cc paddle/fluid/framework/data_type.cc paddle/fluid/framework/tensor.cc paddle/fluid/framework/tensor_util.cc paddle/fluid/framework/lod_tensor.cc paddle/fluid/framework/garbage_collector.cc paddle/fluid/framework/reader.cc paddle/fluid/framework/threadpool.cc paddle/fluid/framework/var_type_traits.cc paddle/fluid/framework/scope.cc paddle/fluid/framework/scope_pool.cc paddle/fluid/framework/data_device_transform.cc paddle/fluid/framework/data_type_transform.cc paddle/fluid/framework/data_layout_transform.cc paddle/fluid/framework/data_transform.cc paddle/fluid/framework/attribute.cc paddle/fluid/framework/op_proto_maker.cc paddle/fluid/framework/op_info.cc paddle/fluid/framework/shape_inference.cc paddle/fluid/framework/transfer_scope_cache.cc paddle/fluid/framework/op_kernel_type.cc 
paddle/fluid/framework/operator.cc paddle/fluid/framework/version.cc paddle/fluid/framework/var_desc.cc paddle/fluid/framework/op_desc.cc paddle/fluid/framework/block_desc.cc paddle/fluid/framework/program_desc.cc paddle/fluid/framework/op_registry.cc paddle/fluid/framework/lod_rank_table.cc paddle/fluid/framework/feed_fetch_method.cc paddle/fluid/framework/variable_helper.cc paddle/fluid/framework/naive_executor.cc paddle/fluid/framework/executor_gc_helper.cc paddle/fluid/framework/executor.cc paddle/fluid/framework/multi_trainer.cc paddle/fluid/framework/pipeline_trainer.cc paddle/fluid/framework/dataset_factory.cc paddle/fluid/framework/dist_multi_trainer.cc paddle/fluid/framework/trainer_factory.cc paddle/fluid/framework/trainer.cc paddle/fluid/framework/data_feed_factory.cc paddle/fluid/framework/data_feed.cc paddle/fluid/framework/device_worker.cc paddle/fluid/framework/hogwild_worker.cc paddle/fluid/framework/downpour_worker.cc paddle/fluid/framework/pull_dense_worker.cc paddle/fluid/framework/section_worker.cc paddle/fluid/framework/device_worker_factory.cc paddle/fluid/framework/data_set.cc paddle/fluid/framework/parallel_executor.cc paddle/fluid/framework/prune.cc paddle/fluid/framework/selected_rows.cc paddle/fluid/framework/dlpack_tensor.cc paddle/fluid/imperative/flags.cc paddle/fluid/operators/math/detail/avx_functions.cc paddle/fluid/operators/math/concat_and_split.cc paddle/fluid/operators/math/context_project.cc paddle/fluid/operators/math/cross_entropy.cc paddle/fluid/operators/math/cos_sim_functor.cc paddle/fluid/operators/math/im2col.cc paddle/fluid/operators/math/sample_prob.cc paddle/fluid/operators/math/sampler.cc paddle/fluid/operators/math/gru_compute.cc paddle/fluid/operators/math/lstm_compute.cc paddle/fluid/operators/math/blas.cc paddle/fluid/operators/math/math_function.cc paddle/fluid/operators/math/maxouting.cc paddle/fluid/operators/math/pooling.cc paddle/fluid/operators/math/selected_rows_functor.cc 
paddle/fluid/operators/math/sequence2batch.cc paddle/fluid/operators/math/sequence_padding.cc paddle/fluid/operators/math/sequence_pooling.cc paddle/fluid/operators/math/sequence_scale.cc paddle/fluid/operators/math/softmax.cc paddle/fluid/operators/math/beam_search.cc paddle/fluid/operators/math/matrix_bit_code.cc paddle/fluid/operators/math/unpooling.cc paddle/fluid/operators/math/vol2col.cc paddle/fluid/operators/math/tree2col.cc paddle/fluid/operators/controlflow/feed_op.cc paddle/fluid/operators/controlflow/logical_op.cc paddle/fluid/operators/controlflow/while_op.cc paddle/fluid/operators/controlflow/get_places_op.cc paddle/fluid/operators/controlflow/fetch_op.cc paddle/fluid/operators/controlflow/compare_op.cc paddle/fluid/operators/controlflow/conditional_block_infer_op.cc paddle/fluid/operators/controlflow/conditional_block_op.cc paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc paddle/fluid/operators/controlflow/op_variant.cc paddle/fluid/operators/controlflow/recurrent_op_helper.cc paddle/fluid/operators/controlflow/while_op_helper.cc paddle/fluid/operators/detection/bipartite_match_op.cc paddle/fluid/operators/detection/box_coder_op.cc paddle/fluid/operators/detection/iou_similarity_op.cc paddle/fluid/operators/detection/mine_hard_examples_op.cc paddle/fluid/operators/detection/multiclass_nms_op.cc paddle/fluid/operators/detection/poly_util.cc paddle/fluid/operators/detection/gpc.cc paddle/fluid/operators/detection/prior_box_op.cc paddle/fluid/operators/detection/density_prior_box_op.cc paddle/fluid/operators/detection/anchor_generator_op.cc paddle/fluid/operators/detection/target_assign_op.cc paddle/fluid/operators/detection/polygon_box_transform_op.cc paddle/fluid/operators/detection/rpn_target_assign_op.cc paddle/fluid/operators/detection/generate_proposal_labels_op.cc paddle/fluid/operators/detection/box_clip_op.cc paddle/fluid/operators/detection/yolov3_loss_op.cc paddle/fluid/operators/detection/yolo_box_op.cc 
paddle/fluid/operators/detection/box_decoder_and_assign_op.cc paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc paddle/fluid/operators/detection/retinanet_detection_output_op.cc paddle/fluid/operators/detection/generate_proposals_op.cc paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc paddle/fluid/operators/detection/collect_fpn_proposals_op.cc paddle/fluid/operators/detection/roi_perspective_transform_op.cc paddle/fluid/operators/detection/mask_util.cc paddle/fluid/operators/detection/generate_mask_labels_op.cc paddle/fluid/operators/elementwise/elementwise_mod_op.cc paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc paddle/fluid/operators/elementwise/elementwise_max_op.cc paddle/fluid/operators/elementwise/elementwise_pow_op.cc paddle/fluid/operators/elementwise/elementwise_sub_op.cc paddle/fluid/operators/elementwise/elementwise_add_op.cc paddle/fluid/operators/elementwise/elementwise_min_op.cc paddle/fluid/operators/elementwise/elementwise_div_op.cc paddle/fluid/operators/elementwise/elementwise_mul_op.cc paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc paddle/fluid/operators/fused/fusion_gru_op.cc paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc paddle/fluid/operators/fused/fusion_lstm_op.cc paddle/fluid/operators/fused/fused_elemwise_activation_op.cc paddle/fluid/operators/metrics/accuracy_op.cc paddle/fluid/operators/metrics/precision_recall_op.cc paddle/fluid/operators/metrics/auc_op.cc paddle/fluid/operators/optimizers/adamax_op.cc paddle/fluid/operators/optimizers/sgd_op.cc paddle/fluid/operators/optimizers/lars_momentum_op.cc paddle/fluid/operators/optimizers/adagrad_op.cc 
paddle/fluid/operators/optimizers/ftrl_op.cc paddle/fluid/operators/optimizers/momentum_op.cc paddle/fluid/operators/optimizers/adadelta_op.cc paddle/fluid/operators/optimizers/rmsprop_op.cc paddle/fluid/operators/optimizers/lamb_op.cc paddle/fluid/operators/optimizers/proximal_gd_op.cc paddle/fluid/operators/optimizers/proximal_adagrad_op.cc paddle/fluid/operators/optimizers/adam_op.cc paddle/fluid/operators/optimizers/decayed_adagrad_op.cc paddle/fluid/operators/reduce_ops/reduce_all_op.cc paddle/fluid/operators/reduce_ops/reduce_min_op.cc paddle/fluid/operators/reduce_ops/reduce_sum_op.cc paddle/fluid/operators/reduce_ops/reduce_any_op.cc paddle/fluid/operators/reduce_ops/reduce_max_op.cc paddle/fluid/operators/reduce_ops/reduce_mean_op.cc paddle/fluid/operators/reduce_ops/reduce_prod_op.cc paddle/fluid/operators/sequence_ops/sequence_erase_op.cc paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc paddle/fluid/operators/sequence_ops/sequence_mask_op.cc paddle/fluid/operators/sequence_ops/sequence_expand_op.cc paddle/fluid/operators/sequence_ops/sequence_pad_op.cc paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc paddle/fluid/operators/sequence_ops/sequence_slice_op.cc paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc paddle/fluid/operators/sequence_ops/sequence_pool_op.cc paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc paddle/fluid/operators/sequence_ops/sequence_conv_op.cc paddle/fluid/operators/sequence_ops/sequence_concat_op.cc paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc paddle/fluid/operators/jit/helper.cc paddle/fluid/operators/jit/kernel_key.cc paddle/fluid/operators/jit/gen_base.cc paddle/fluid/operators/jit/kernel_pool.cc paddle/fluid/operators/jit/refer/refer.cc paddle/fluid/operators/jit/more/mkl/mkl.cc paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc 
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc paddle/fluid/operators/jit/more/mix/mix.cc paddle/fluid/operators/jit/gen/sgd.cc paddle/fluid/operators/jit/gen/hopv.cc paddle/fluid/operators/jit/gen/lstm.cc paddle/fluid/operators/jit/gen/gru.cc paddle/fluid/operators/jit/gen/vbroadcast.cc paddle/fluid/operators/jit/gen/matmul.cc paddle/fluid/operators/jit/gen/seqpool.cc paddle/fluid/operators/jit/gen/embseqpool.cc paddle/fluid/operators/jit/gen/act.cc paddle/fluid/operators/jit/gen/blas.cc paddle/fluid/operators/reader/reader_op_registry.cc paddle/fluid/operators/reader/py_reader.cc paddle/fluid/operators/reader/buffered_reader.cc paddle/fluid/operators/reader/open_files_op.cc paddle/fluid/operators/reader/create_random_data_generator_op.cc paddle/fluid/operators/reader/create_shuffle_reader_op.cc paddle/fluid/operators/reader/create_batch_reader_op.cc paddle/fluid/operators/reader/create_recordio_file_reader_op.cc paddle/fluid/operators/reader/create_double_buffer_reader_op.cc paddle/fluid/operators/reader/create_multi_pass_reader_op.cc paddle/fluid/operators/reader/create_custom_reader_op.cc paddle/fluid/operators/reader/create_py_reader_op.cc paddle/fluid/operators/reader/read_op.cc paddle/fluid/operators/increment_op.cc paddle/fluid/operators/stack_op.cc paddle/fluid/operators/fc_op.cc paddle/fluid/operators/assign_op.cc paddle/fluid/operators/load_op.cc paddle/fluid/operators/fill_op.cc paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc paddle/fluid/operators/conv_shift_op.cc paddle/fluid/operators/fill_zeros_like_op.cc paddle/fluid/operators/hash_op.cc paddle/fluid/operators/dequantize_op.cc paddle/fluid/operators/fake_quantize_op.cc paddle/fluid/operators/size_op.cc paddle/fluid/operators/scatter_op.cc paddle/fluid/operators/uniform_random_op.cc paddle/fluid/operators/beam_search_op.cc paddle/fluid/operators/beam_search_decode_op.cc paddle/fluid/operators/dropout_op.cc paddle/fluid/operators/interpolate_op.cc 
paddle/fluid/operators/sampling_id_op.cc paddle/fluid/operators/lstm_op.cc paddle/fluid/operators/modified_huber_loss_op.cc paddle/fluid/operators/temporal_shift_op.cc paddle/fluid/operators/sum_op.cc paddle/fluid/operators/arg_min_op.cc paddle/fluid/operators/psroi_pool_op.cc paddle/fluid/operators/uniform_random_batch_size_like_op.cc paddle/fluid/operators/rnn_memory_helper_op.cc paddle/fluid/operators/crf_decoding_op.cc paddle/fluid/operators/where_op.cc paddle/fluid/operators/fake_dequantize_op.cc paddle/fluid/operators/mean_iou_op.cc paddle/fluid/operators/roi_align_op.cc paddle/fluid/operators/range_op.cc paddle/fluid/operators/edit_distance_op.cc paddle/fluid/operators/multiplex_op.cc paddle/fluid/operators/clip_op.cc paddle/fluid/operators/gaussian_random_op.cc paddle/fluid/operators/norm_op.cc paddle/fluid/operators/rank_loss_op.cc paddle/fluid/operators/detection_map_op.cc paddle/fluid/operators/lstm_unit_op.cc paddle/fluid/operators/shard_index_op.cc paddle/fluid/operators/shape_op.cc paddle/fluid/operators/arg_max_op.cc paddle/fluid/operators/average_accumulates_op.cc paddle/fluid/operators/requantize_op.cc paddle/fluid/operators/conv_op.cc paddle/fluid/operators/add_position_encoding_op.cc paddle/fluid/operators/gru_unit_op.cc paddle/fluid/operators/batch_norm_op.cc paddle/fluid/operators/chunk_eval_op.cc paddle/fluid/operators/lod_rank_table_op.cc paddle/fluid/operators/unsqueeze_op.cc paddle/fluid/operators/positive_negative_pair_op.cc paddle/fluid/operators/im2sequence_op.cc paddle/fluid/operators/margin_rank_loss_op.cc paddle/fluid/operators/hinge_loss_op.cc paddle/fluid/operators/cvm_op.cc paddle/fluid/operators/huber_loss_op.cc paddle/fluid/operators/crop_op.cc paddle/fluid/operators/activation_op.cc paddle/fluid/operators/hierarchical_sigmoid_op.cc paddle/fluid/operators/unfold_op.cc paddle/fluid/operators/max_sequence_len_op.cc paddle/fluid/operators/mul_op.cc paddle/fluid/operators/attention_lstm_op.cc paddle/fluid/operators/top_k_op.cc 
paddle/fluid/operators/group_norm_op.cc paddle/fluid/operators/selu_op.cc paddle/fluid/operators/lstmp_op.cc paddle/fluid/operators/merge_lod_tensor_op.cc paddle/fluid/operators/truncated_gaussian_random_op.cc paddle/fluid/operators/label_smooth_op.cc paddle/fluid/operators/matmul_op.cc paddle/fluid/operators/spp_op.cc paddle/fluid/operators/unstack_op.cc paddle/fluid/operators/conv_transpose_op.cc paddle/fluid/operators/diag_op.cc paddle/fluid/operators/unpool_op.cc paddle/fluid/operators/lod_array_length_op.cc paddle/fluid/operators/affine_channel_op.cc paddle/fluid/operators/log_loss_op.cc paddle/fluid/operators/concat_op.cc paddle/fluid/operators/lod_tensor_to_array_op.cc paddle/fluid/operators/gru_op.cc paddle/fluid/operators/coalesce_tensor_op.cc paddle/fluid/operators/fsp_op.cc paddle/fluid/operators/linspace_op.cc paddle/fluid/operators/reverse_op.cc paddle/fluid/operators/recurrent_op.cc paddle/fluid/operators/split_selected_rows_op.cc paddle/fluid/operators/dgc_clip_by_norm_op.cc paddle/fluid/operators/scale_op.cc paddle/fluid/operators/save_op.cc paddle/fluid/operators/load_combine_op.cc paddle/fluid/operators/merge_selected_rows_op.cc paddle/fluid/operators/split_op.cc paddle/fluid/operators/cumsum_op.cc paddle/fluid/operators/deformable_psroi_pooling_op.cc paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc paddle/fluid/operators/transpose_op.cc paddle/fluid/operators/fill_constant_batch_size_like_op.cc paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc paddle/fluid/operators/shuffle_channel_op.cc paddle/fluid/operators/affine_grid_op.cc paddle/fluid/operators/split_lod_tensor_op.cc paddle/fluid/operators/grid_sampler_op.cc paddle/fluid/operators/lookup_table_op.cc paddle/fluid/operators/cos_sim_op.cc paddle/fluid/operators/quantize_op.cc paddle/fluid/operators/spectral_norm_op.cc paddle/fluid/operators/cross_entropy_op.cc paddle/fluid/operators/print_op.cc paddle/fluid/operators/lrn_op.cc paddle/fluid/operators/nce_op.cc 
paddle/fluid/operators/similarity_focus_op.cc paddle/fluid/operators/get_tensor_from_selected_rows_op.cc paddle/fluid/operators/squared_l2_distance_op.cc paddle/fluid/operators/cudnn_lstm_op.cc paddle/fluid/operators/tree_conv_op.cc paddle/fluid/operators/one_hot_op.cc paddle/fluid/operators/lookup_sparse_table_op.cc paddle/fluid/operators/unique_op.cc paddle/fluid/operators/mean_op.cc paddle/fluid/operators/prelu_op.cc paddle/fluid/operators/delete_var_op.cc paddle/fluid/operators/ctc_align_op.cc paddle/fluid/operators/argsort_op.cc paddle/fluid/operators/data_norm_op.cc paddle/fluid/operators/minus_op.cc paddle/fluid/operators/shrink_rnn_memory_op.cc paddle/fluid/operators/lod_reset_op.cc paddle/fluid/operators/l1_norm_op.cc paddle/fluid/operators/gaussian_random_batch_size_like_op.cc paddle/fluid/operators/is_empty_op.cc paddle/fluid/operators/bilinear_tensor_product_op.cc paddle/fluid/operators/kldiv_loss_op.cc paddle/fluid/operators/squeeze_op.cc paddle/fluid/operators/softmax_op.cc paddle/fluid/operators/clip_by_norm_op.cc paddle/fluid/operators/pool_with_index_op.cc paddle/fluid/operators/linear_chain_crf_op.cc paddle/fluid/operators/reshape_op.cc paddle/fluid/operators/fill_constant_op.cc paddle/fluid/operators/space_to_depth_op.cc paddle/fluid/operators/gather_op.cc paddle/fluid/operators/softmax_with_cross_entropy_op.cc paddle/fluid/operators/slice_op.cc paddle/fluid/operators/sign_op.cc paddle/fluid/operators/expand_op.cc paddle/fluid/operators/smooth_l1_loss_op.cc paddle/fluid/operators/tensor_array_to_tensor_op.cc paddle/fluid/operators/row_conv_op.cc paddle/fluid/operators/pad2d_op.cc paddle/fluid/operators/pixel_shuffle_op.cc paddle/fluid/operators/assign_value_op.cc paddle/fluid/operators/random_crop_op.cc paddle/fluid/operators/squared_l2_norm_op.cc paddle/fluid/operators/save_combine_op.cc paddle/fluid/operators/pool_op.cc paddle/fluid/operators/cast_op.cc paddle/fluid/operators/array_to_lod_tensor_op.cc paddle/fluid/operators/fill_any_like_op.cc 
paddle/fluid/operators/flatten_op.cc paddle/fluid/operators/sample_logits_op.cc paddle/fluid/operators/pad_op.cc paddle/fluid/operators/bpr_loss_op.cc paddle/fluid/operators/roi_pool_op.cc paddle/fluid/operators/pad_constant_like_op.cc paddle/fluid/operators/isfinite_op.cc paddle/fluid/operators/layer_norm_op.cc paddle/fluid/operators/maxout_op.cc paddle/fluid/operators/warpctc_op.cc paddle/fluid/string/piece.cc paddle/fluid/string/pretty_log.cc paddle/fluid/string/string_helper.cc paddle/fluid/recordio/header.cc paddle/fluid/recordio/chunk.cc paddle/fluid/recordio/writer.cc paddle/fluid/recordio/scanner.cc paddle/fluid/inference/io.cc paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc paddle/fluid/inference/analysis/passes/passes.cc paddle/fluid/inference/analysis/helper.cc paddle/fluid/inference/analysis/ir_pass_manager.cc paddle/fluid/inference/analysis/argument.cc paddle/fluid/inference/analysis/analysis_pass.cc paddle/fluid/inference/analysis/analyzer.cc paddle/fluid/inference/utils/benchmark.cc paddle/fluid/inference/api/api.cc paddle/fluid/inference/api/api_impl.cc paddle/fluid/inference/api/helper.cc paddle/fluid/inference/api/analysis_predictor.cc paddle/fluid/inference/api/details/zero_copy_tensor.cc paddle/fluid/inference/api/details/reset_tensor_array.cc paddle/fluid/inference/api/analysis_config.cc paddle/fluid/inference/api/paddle_pass_builder.cc"
-paddle_fluid_avx_mklml_src += ' paddle/fluid/framework/revision.cc'
-
-StaticLibrary('paddle_fluid_avx_mklml', Sources(paddle_fluid_avx_mklml_src, CppFlags('-DHPPL_STUB_FUNC -DLAPACK_FOUND -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_AVX -DPADDLE_WITH_MKLML -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_AVX_MKLML' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())), CFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -fopenmp -mavx -O3 -DNDEBUG '), CxxFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -fopenmp -mavx -O3 -DNDEBUG ')))
-SharedLibrary('paddle_fluid_avx_mklml', Sources(paddle_fluid_avx_mklml_src,  CppFlags('-DHPPL_STUB_FUNC -DLAPACK_FOUND -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_AVX -DPADDLE_WITH_MKLML -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_AVX_MKLML' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())), CFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -fopenmp -mavx -O3 -DNDEBUG '), CxxFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -fopenmp -mavx -O3 -DNDEBUG ')), LinkDeps(True))
-
-paddle_fluid_noavx_openblas_src = "paddle/fluid/memory/detail/memory_block.cc paddle/fluid/memory/detail/memory_block_desc.cc paddle/fluid/memory/detail/meta_cache.cc paddle/fluid/memory/detail/system_allocator.cc paddle/fluid/memory/detail/buddy_allocator.cc paddle/fluid/memory/allocation/allocator.cc paddle/fluid/memory/allocation/cpu_allocator.cc paddle/fluid/memory/allocation/locked_allocator.cc paddle/fluid/memory/allocation/buffered_allocator.cc paddle/fluid/memory/allocation/best_fit_allocator.cc paddle/fluid/memory/allocation/naive_best_fit_allocator.cc paddle/fluid/memory/allocation/retry_allocator.cc paddle/fluid/memory/allocation/aligned_allocator.cc paddle/fluid/memory/allocation/allocator_strategy.cc paddle/fluid/memory/allocation/allocator_facade.cc paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc paddle/fluid/memory/malloc.cc paddle/fluid/memory/memcpy.cc paddle/fluid/platform/profiler.proto paddle/fluid/platform/enforce.cc paddle/fluid/platform/cpu_info.cc paddle/fluid/platform/place.cc paddle/fluid/platform/dynload/dynamic_loader.cc paddle/fluid/platform/dynload/warpctc.cc paddle/fluid/platform/cpu_helper.cc paddle/fluid/platform/temporary_allocator.cc paddle/fluid/platform/device_context.cc paddle/fluid/platform/init.cc paddle/fluid/platform/timer.cc paddle/fluid/platform/lodtensor_printer.cc paddle/fluid/platform/device_tracer.cc paddle/fluid/platform/profiler.cc paddle/fluid/platform/device_memory_aligment.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc 
paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc paddle/fluid/framework/ir/node.cc paddle/fluid/framework/ir/graph.cc paddle/fluid/framework/ir/graph_helper.cc paddle/fluid/framework/ir/pass.cc paddle/fluid/framework/ir/graph_traits.cc paddle/fluid/framework/ir/graph_pattern_detector.cc paddle/fluid/framework/ir/fuse_pass_base.cc paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc paddle/fluid/framework/ir/graph_to_program_pass.cc paddle/fluid/framework/ir/graph_viz_pass.cc paddle/fluid/framework/ir/lock_free_optimize_pass.cc paddle/fluid/framework/ir/fc_fuse_pass.cc paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc paddle/fluid/framework/ir/infer_clean_graph_pass.cc paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc 
paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc paddle/fluid/framework/ir/fc_gru_fuse_pass.cc paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc paddle/fluid/framework/ir/multi_batch_merge_pass.cc paddle/fluid/framework/ir/conv_bn_fuse_pass.cc paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc paddle/fluid/framework/ir/is_test_pass.cc paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc paddle/fluid/framework/ir/sync_batch_norm_pass.cc paddle/fluid/framework/ir/runtime_context_cache_pass.cc paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc paddle/fluid/framework/ir/pass_builder.cc paddle/fluid/framework/details/var_handle.cc paddle/fluid/framework/details/op_handle_base.cc paddle/fluid/framework/details/scale_loss_grad_op_handle.cc paddle/fluid/framework/details/fetch_op_handle.cc paddle/fluid/framework/details/computation_op_handle.cc paddle/fluid/framework/details/rpc_op_handle.cc paddle/fluid/framework/details/fetch_barrier_op_handle.cc paddle/fluid/framework/details/multi_devices_helper.cc paddle/fluid/framework/details/variable_visitor.cc paddle/fluid/framework/details/all_reduce_op_handle.cc paddle/fluid/framework/details/fused_all_reduce_op_handle.cc 
paddle/fluid/framework/details/reduce_op_handle.cc paddle/fluid/framework/details/broadcast_op_handle.cc paddle/fluid/framework/details/fused_broadcast_op_handle.cc paddle/fluid/framework/details/gather_op_handle.cc paddle/fluid/framework/details/eager_deletion_op_handle.cc paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc paddle/fluid/framework/details/ssa_graph_executor.cc paddle/fluid/framework/details/threaded_ssa_graph_executor.cc paddle/fluid/framework/details/parallel_ssa_graph_executor.cc paddle/fluid/framework/details/async_ssa_graph_executor.cc paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc paddle/fluid/framework/details/build_strategy.cc paddle/fluid/framework/fleet/fleet_wrapper.cc paddle/fluid/framework/fleet/nccl_wrapper.cc paddle/fluid/framework/io/fs.cc paddle/fluid/framework/io/shell.cc paddle/fluid/framework/framework.proto paddle/fluid/framework/data_feed.proto paddle/fluid/framework/trainer_desc.proto paddle/fluid/framework/ddim.cc paddle/fluid/framework/data_type.cc paddle/fluid/framework/tensor.cc paddle/fluid/framework/tensor_util.cc paddle/fluid/framework/lod_tensor.cc paddle/fluid/framework/garbage_collector.cc paddle/fluid/framework/reader.cc paddle/fluid/framework/threadpool.cc paddle/fluid/framework/var_type_traits.cc paddle/fluid/framework/scope.cc paddle/fluid/framework/scope_pool.cc paddle/fluid/framework/data_device_transform.cc paddle/fluid/framework/data_type_transform.cc paddle/fluid/framework/data_layout_transform.cc paddle/fluid/framework/data_transform.cc paddle/fluid/framework/attribute.cc paddle/fluid/framework/op_proto_maker.cc paddle/fluid/framework/op_info.cc paddle/fluid/framework/shape_inference.cc paddle/fluid/framework/transfer_scope_cache.cc paddle/fluid/framework/op_kernel_type.cc paddle/fluid/framework/operator.cc paddle/fluid/framework/version.cc paddle/fluid/framework/var_desc.cc paddle/fluid/framework/op_desc.cc 
paddle/fluid/framework/block_desc.cc paddle/fluid/framework/program_desc.cc paddle/fluid/framework/op_registry.cc paddle/fluid/framework/lod_rank_table.cc paddle/fluid/framework/feed_fetch_method.cc paddle/fluid/framework/variable_helper.cc paddle/fluid/framework/naive_executor.cc paddle/fluid/framework/executor_gc_helper.cc paddle/fluid/framework/executor.cc paddle/fluid/framework/multi_trainer.cc paddle/fluid/framework/pipeline_trainer.cc paddle/fluid/framework/dataset_factory.cc paddle/fluid/framework/dist_multi_trainer.cc paddle/fluid/framework/trainer_factory.cc paddle/fluid/framework/trainer.cc paddle/fluid/framework/data_feed_factory.cc paddle/fluid/framework/data_feed.cc paddle/fluid/framework/device_worker.cc paddle/fluid/framework/hogwild_worker.cc paddle/fluid/framework/downpour_worker.cc paddle/fluid/framework/pull_dense_worker.cc paddle/fluid/framework/section_worker.cc paddle/fluid/framework/device_worker_factory.cc paddle/fluid/framework/data_set.cc paddle/fluid/framework/parallel_executor.cc paddle/fluid/framework/prune.cc paddle/fluid/framework/selected_rows.cc paddle/fluid/framework/dlpack_tensor.cc paddle/fluid/imperative/flags.cc paddle/fluid/operators/math/detail/avx_functions.cc paddle/fluid/operators/math/concat_and_split.cc paddle/fluid/operators/math/context_project.cc paddle/fluid/operators/math/cross_entropy.cc paddle/fluid/operators/math/cos_sim_functor.cc paddle/fluid/operators/math/im2col.cc paddle/fluid/operators/math/sample_prob.cc paddle/fluid/operators/math/sampler.cc paddle/fluid/operators/math/gru_compute.cc paddle/fluid/operators/math/lstm_compute.cc paddle/fluid/operators/math/blas.cc paddle/fluid/operators/math/math_function.cc paddle/fluid/operators/math/maxouting.cc paddle/fluid/operators/math/pooling.cc paddle/fluid/operators/math/selected_rows_functor.cc paddle/fluid/operators/math/sequence2batch.cc paddle/fluid/operators/math/sequence_padding.cc paddle/fluid/operators/math/sequence_pooling.cc 
paddle/fluid/operators/math/sequence_scale.cc paddle/fluid/operators/math/softmax.cc paddle/fluid/operators/math/beam_search.cc paddle/fluid/operators/math/matrix_bit_code.cc paddle/fluid/operators/math/unpooling.cc paddle/fluid/operators/math/vol2col.cc paddle/fluid/operators/math/tree2col.cc paddle/fluid/operators/controlflow/feed_op.cc paddle/fluid/operators/controlflow/logical_op.cc paddle/fluid/operators/controlflow/while_op.cc paddle/fluid/operators/controlflow/get_places_op.cc paddle/fluid/operators/controlflow/fetch_op.cc paddle/fluid/operators/controlflow/compare_op.cc paddle/fluid/operators/controlflow/conditional_block_infer_op.cc paddle/fluid/operators/controlflow/conditional_block_op.cc paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc paddle/fluid/operators/controlflow/op_variant.cc paddle/fluid/operators/controlflow/recurrent_op_helper.cc paddle/fluid/operators/controlflow/while_op_helper.cc paddle/fluid/operators/detection/bipartite_match_op.cc paddle/fluid/operators/detection/box_coder_op.cc paddle/fluid/operators/detection/iou_similarity_op.cc paddle/fluid/operators/detection/mine_hard_examples_op.cc paddle/fluid/operators/detection/multiclass_nms_op.cc paddle/fluid/operators/detection/poly_util.cc paddle/fluid/operators/detection/gpc.cc paddle/fluid/operators/detection/prior_box_op.cc paddle/fluid/operators/detection/density_prior_box_op.cc paddle/fluid/operators/detection/anchor_generator_op.cc paddle/fluid/operators/detection/target_assign_op.cc paddle/fluid/operators/detection/polygon_box_transform_op.cc paddle/fluid/operators/detection/rpn_target_assign_op.cc paddle/fluid/operators/detection/generate_proposal_labels_op.cc paddle/fluid/operators/detection/box_clip_op.cc paddle/fluid/operators/detection/yolov3_loss_op.cc paddle/fluid/operators/detection/yolo_box_op.cc paddle/fluid/operators/detection/box_decoder_and_assign_op.cc paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc 
paddle/fluid/operators/detection/retinanet_detection_output_op.cc paddle/fluid/operators/detection/generate_proposals_op.cc paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc paddle/fluid/operators/detection/collect_fpn_proposals_op.cc paddle/fluid/operators/detection/roi_perspective_transform_op.cc paddle/fluid/operators/detection/mask_util.cc paddle/fluid/operators/detection/generate_mask_labels_op.cc paddle/fluid/operators/elementwise/elementwise_mod_op.cc paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc paddle/fluid/operators/elementwise/elementwise_max_op.cc paddle/fluid/operators/elementwise/elementwise_pow_op.cc paddle/fluid/operators/elementwise/elementwise_sub_op.cc paddle/fluid/operators/elementwise/elementwise_add_op.cc paddle/fluid/operators/elementwise/elementwise_min_op.cc paddle/fluid/operators/elementwise/elementwise_div_op.cc paddle/fluid/operators/elementwise/elementwise_mul_op.cc paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc paddle/fluid/operators/fused/fusion_gru_op.cc paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc paddle/fluid/operators/fused/fusion_lstm_op.cc paddle/fluid/operators/fused/fused_elemwise_activation_op.cc paddle/fluid/operators/metrics/accuracy_op.cc paddle/fluid/operators/metrics/precision_recall_op.cc paddle/fluid/operators/metrics/auc_op.cc paddle/fluid/operators/optimizers/adamax_op.cc paddle/fluid/operators/optimizers/sgd_op.cc paddle/fluid/operators/optimizers/lars_momentum_op.cc paddle/fluid/operators/optimizers/adagrad_op.cc paddle/fluid/operators/optimizers/ftrl_op.cc paddle/fluid/operators/optimizers/momentum_op.cc paddle/fluid/operators/optimizers/adadelta_op.cc 
paddle/fluid/operators/optimizers/rmsprop_op.cc paddle/fluid/operators/optimizers/lamb_op.cc paddle/fluid/operators/optimizers/proximal_gd_op.cc paddle/fluid/operators/optimizers/proximal_adagrad_op.cc paddle/fluid/operators/optimizers/adam_op.cc paddle/fluid/operators/optimizers/decayed_adagrad_op.cc paddle/fluid/operators/reduce_ops/reduce_all_op.cc paddle/fluid/operators/reduce_ops/reduce_min_op.cc paddle/fluid/operators/reduce_ops/reduce_sum_op.cc paddle/fluid/operators/reduce_ops/reduce_any_op.cc paddle/fluid/operators/reduce_ops/reduce_max_op.cc paddle/fluid/operators/reduce_ops/reduce_mean_op.cc paddle/fluid/operators/reduce_ops/reduce_prod_op.cc paddle/fluid/operators/sequence_ops/sequence_erase_op.cc paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc paddle/fluid/operators/sequence_ops/sequence_mask_op.cc paddle/fluid/operators/sequence_ops/sequence_expand_op.cc paddle/fluid/operators/sequence_ops/sequence_pad_op.cc paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc paddle/fluid/operators/sequence_ops/sequence_slice_op.cc paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc paddle/fluid/operators/sequence_ops/sequence_pool_op.cc paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc paddle/fluid/operators/sequence_ops/sequence_conv_op.cc paddle/fluid/operators/sequence_ops/sequence_concat_op.cc paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc paddle/fluid/operators/jit/helper.cc paddle/fluid/operators/jit/kernel_key.cc paddle/fluid/operators/jit/gen_base.cc paddle/fluid/operators/jit/kernel_pool.cc paddle/fluid/operators/jit/refer/refer.cc paddle/fluid/operators/jit/more/mix/mix.cc paddle/fluid/operators/jit/gen/sgd.cc paddle/fluid/operators/jit/gen/hopv.cc paddle/fluid/operators/jit/gen/lstm.cc paddle/fluid/operators/jit/gen/gru.cc paddle/fluid/operators/jit/gen/vbroadcast.cc 
paddle/fluid/operators/jit/gen/matmul.cc paddle/fluid/operators/jit/gen/seqpool.cc paddle/fluid/operators/jit/gen/embseqpool.cc paddle/fluid/operators/jit/gen/act.cc paddle/fluid/operators/jit/gen/blas.cc paddle/fluid/operators/reader/reader_op_registry.cc paddle/fluid/operators/reader/py_reader.cc paddle/fluid/operators/reader/buffered_reader.cc paddle/fluid/operators/reader/open_files_op.cc paddle/fluid/operators/reader/create_random_data_generator_op.cc paddle/fluid/operators/reader/create_shuffle_reader_op.cc paddle/fluid/operators/reader/create_batch_reader_op.cc paddle/fluid/operators/reader/create_recordio_file_reader_op.cc paddle/fluid/operators/reader/create_double_buffer_reader_op.cc paddle/fluid/operators/reader/create_multi_pass_reader_op.cc paddle/fluid/operators/reader/create_custom_reader_op.cc paddle/fluid/operators/reader/create_py_reader_op.cc paddle/fluid/operators/reader/read_op.cc paddle/fluid/operators/increment_op.cc paddle/fluid/operators/stack_op.cc paddle/fluid/operators/fc_op.cc paddle/fluid/operators/assign_op.cc paddle/fluid/operators/load_op.cc paddle/fluid/operators/fill_op.cc paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc paddle/fluid/operators/conv_shift_op.cc paddle/fluid/operators/fill_zeros_like_op.cc paddle/fluid/operators/hash_op.cc paddle/fluid/operators/dequantize_op.cc paddle/fluid/operators/fake_quantize_op.cc paddle/fluid/operators/size_op.cc paddle/fluid/operators/scatter_op.cc paddle/fluid/operators/uniform_random_op.cc paddle/fluid/operators/beam_search_op.cc paddle/fluid/operators/beam_search_decode_op.cc paddle/fluid/operators/dropout_op.cc paddle/fluid/operators/interpolate_op.cc paddle/fluid/operators/sampling_id_op.cc paddle/fluid/operators/lstm_op.cc paddle/fluid/operators/modified_huber_loss_op.cc paddle/fluid/operators/temporal_shift_op.cc paddle/fluid/operators/sum_op.cc paddle/fluid/operators/arg_min_op.cc paddle/fluid/operators/psroi_pool_op.cc 
paddle/fluid/operators/uniform_random_batch_size_like_op.cc paddle/fluid/operators/rnn_memory_helper_op.cc paddle/fluid/operators/crf_decoding_op.cc paddle/fluid/operators/where_op.cc paddle/fluid/operators/fake_dequantize_op.cc paddle/fluid/operators/mean_iou_op.cc paddle/fluid/operators/roi_align_op.cc paddle/fluid/operators/range_op.cc paddle/fluid/operators/edit_distance_op.cc paddle/fluid/operators/multiplex_op.cc paddle/fluid/operators/clip_op.cc paddle/fluid/operators/gaussian_random_op.cc paddle/fluid/operators/norm_op.cc paddle/fluid/operators/rank_loss_op.cc paddle/fluid/operators/detection_map_op.cc paddle/fluid/operators/lstm_unit_op.cc paddle/fluid/operators/shard_index_op.cc paddle/fluid/operators/shape_op.cc paddle/fluid/operators/arg_max_op.cc paddle/fluid/operators/average_accumulates_op.cc paddle/fluid/operators/requantize_op.cc paddle/fluid/operators/conv_op.cc paddle/fluid/operators/add_position_encoding_op.cc paddle/fluid/operators/gru_unit_op.cc paddle/fluid/operators/batch_norm_op.cc paddle/fluid/operators/chunk_eval_op.cc paddle/fluid/operators/lod_rank_table_op.cc paddle/fluid/operators/unsqueeze_op.cc paddle/fluid/operators/positive_negative_pair_op.cc paddle/fluid/operators/im2sequence_op.cc paddle/fluid/operators/margin_rank_loss_op.cc paddle/fluid/operators/hinge_loss_op.cc paddle/fluid/operators/cvm_op.cc paddle/fluid/operators/huber_loss_op.cc paddle/fluid/operators/crop_op.cc paddle/fluid/operators/activation_op.cc paddle/fluid/operators/hierarchical_sigmoid_op.cc paddle/fluid/operators/unfold_op.cc paddle/fluid/operators/max_sequence_len_op.cc paddle/fluid/operators/mul_op.cc paddle/fluid/operators/attention_lstm_op.cc paddle/fluid/operators/top_k_op.cc paddle/fluid/operators/group_norm_op.cc paddle/fluid/operators/selu_op.cc paddle/fluid/operators/lstmp_op.cc paddle/fluid/operators/merge_lod_tensor_op.cc paddle/fluid/operators/truncated_gaussian_random_op.cc paddle/fluid/operators/label_smooth_op.cc 
paddle/fluid/operators/matmul_op.cc paddle/fluid/operators/spp_op.cc paddle/fluid/operators/unstack_op.cc paddle/fluid/operators/conv_transpose_op.cc paddle/fluid/operators/diag_op.cc paddle/fluid/operators/unpool_op.cc paddle/fluid/operators/lod_array_length_op.cc paddle/fluid/operators/affine_channel_op.cc paddle/fluid/operators/log_loss_op.cc paddle/fluid/operators/concat_op.cc paddle/fluid/operators/lod_tensor_to_array_op.cc paddle/fluid/operators/gru_op.cc paddle/fluid/operators/coalesce_tensor_op.cc paddle/fluid/operators/fsp_op.cc paddle/fluid/operators/linspace_op.cc paddle/fluid/operators/reverse_op.cc paddle/fluid/operators/recurrent_op.cc paddle/fluid/operators/split_selected_rows_op.cc paddle/fluid/operators/dgc_clip_by_norm_op.cc paddle/fluid/operators/scale_op.cc paddle/fluid/operators/save_op.cc paddle/fluid/operators/load_combine_op.cc paddle/fluid/operators/merge_selected_rows_op.cc paddle/fluid/operators/split_op.cc paddle/fluid/operators/cumsum_op.cc paddle/fluid/operators/deformable_psroi_pooling_op.cc paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc paddle/fluid/operators/transpose_op.cc paddle/fluid/operators/fill_constant_batch_size_like_op.cc paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc paddle/fluid/operators/shuffle_channel_op.cc paddle/fluid/operators/affine_grid_op.cc paddle/fluid/operators/split_lod_tensor_op.cc paddle/fluid/operators/grid_sampler_op.cc paddle/fluid/operators/lookup_table_op.cc paddle/fluid/operators/cos_sim_op.cc paddle/fluid/operators/quantize_op.cc paddle/fluid/operators/spectral_norm_op.cc paddle/fluid/operators/cross_entropy_op.cc paddle/fluid/operators/print_op.cc paddle/fluid/operators/lrn_op.cc paddle/fluid/operators/nce_op.cc paddle/fluid/operators/similarity_focus_op.cc paddle/fluid/operators/get_tensor_from_selected_rows_op.cc paddle/fluid/operators/squared_l2_distance_op.cc paddle/fluid/operators/cudnn_lstm_op.cc paddle/fluid/operators/tree_conv_op.cc 
paddle/fluid/operators/one_hot_op.cc paddle/fluid/operators/lookup_sparse_table_op.cc paddle/fluid/operators/unique_op.cc paddle/fluid/operators/mean_op.cc paddle/fluid/operators/prelu_op.cc paddle/fluid/operators/delete_var_op.cc paddle/fluid/operators/ctc_align_op.cc paddle/fluid/operators/argsort_op.cc paddle/fluid/operators/data_norm_op.cc paddle/fluid/operators/minus_op.cc paddle/fluid/operators/shrink_rnn_memory_op.cc paddle/fluid/operators/lod_reset_op.cc paddle/fluid/operators/l1_norm_op.cc paddle/fluid/operators/gaussian_random_batch_size_like_op.cc paddle/fluid/operators/is_empty_op.cc paddle/fluid/operators/bilinear_tensor_product_op.cc paddle/fluid/operators/kldiv_loss_op.cc paddle/fluid/operators/squeeze_op.cc paddle/fluid/operators/softmax_op.cc paddle/fluid/operators/clip_by_norm_op.cc paddle/fluid/operators/pool_with_index_op.cc paddle/fluid/operators/linear_chain_crf_op.cc paddle/fluid/operators/reshape_op.cc paddle/fluid/operators/fill_constant_op.cc paddle/fluid/operators/space_to_depth_op.cc paddle/fluid/operators/gather_op.cc paddle/fluid/operators/softmax_with_cross_entropy_op.cc paddle/fluid/operators/slice_op.cc paddle/fluid/operators/sign_op.cc paddle/fluid/operators/expand_op.cc paddle/fluid/operators/smooth_l1_loss_op.cc paddle/fluid/operators/tensor_array_to_tensor_op.cc paddle/fluid/operators/row_conv_op.cc paddle/fluid/operators/pad2d_op.cc paddle/fluid/operators/pixel_shuffle_op.cc paddle/fluid/operators/assign_value_op.cc paddle/fluid/operators/random_crop_op.cc paddle/fluid/operators/squared_l2_norm_op.cc paddle/fluid/operators/save_combine_op.cc paddle/fluid/operators/pool_op.cc paddle/fluid/operators/cast_op.cc paddle/fluid/operators/array_to_lod_tensor_op.cc paddle/fluid/operators/fill_any_like_op.cc paddle/fluid/operators/flatten_op.cc paddle/fluid/operators/sample_logits_op.cc paddle/fluid/operators/pad_op.cc paddle/fluid/operators/bpr_loss_op.cc paddle/fluid/operators/roi_pool_op.cc 
paddle/fluid/operators/pad_constant_like_op.cc paddle/fluid/operators/isfinite_op.cc paddle/fluid/operators/layer_norm_op.cc paddle/fluid/operators/maxout_op.cc paddle/fluid/operators/warpctc_op.cc paddle/fluid/string/piece.cc paddle/fluid/string/pretty_log.cc paddle/fluid/string/string_helper.cc paddle/fluid/recordio/header.cc paddle/fluid/recordio/chunk.cc paddle/fluid/recordio/writer.cc paddle/fluid/recordio/scanner.cc paddle/fluid/inference/io.cc paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc paddle/fluid/inference/analysis/passes/passes.cc paddle/fluid/inference/analysis/helper.cc paddle/fluid/inference/analysis/ir_pass_manager.cc paddle/fluid/inference/analysis/argument.cc paddle/fluid/inference/analysis/analysis_pass.cc paddle/fluid/inference/analysis/analyzer.cc paddle/fluid/inference/utils/benchmark.cc paddle/fluid/inference/api/api.cc paddle/fluid/inference/api/api_impl.cc paddle/fluid/inference/api/helper.cc paddle/fluid/inference/api/analysis_predictor.cc paddle/fluid/inference/api/details/zero_copy_tensor.cc paddle/fluid/inference/api/details/reset_tensor_array.cc paddle/fluid/inference/api/analysis_config.cc paddle/fluid/inference/api/paddle_pass_builder.cc"
-paddle_fluid_noavx_openblas_src += ' paddle/fluid/framework/revision.cc'
-
-
-StaticLibrary('paddle_fluid_noavx_openblas', Sources(paddle_fluid_noavx_openblas_src, CppFlags('-DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_NOAVX_OPENBLAS' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())), CFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -msse3 -O3 -DNDEBUG '), CxxFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -msse3 -O3 -DNDEBUG ')))
-SharedLibrary('paddle_fluid_noavx_openblas', Sources(paddle_fluid_noavx_openblas_src, CppFlags('-DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_NOAVX_OPENBLAS' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())), CFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -msse3 -O3 -DNDEBUG '), CxxFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -msse3 -O3 -DNDEBUG ')), LinkDeps(True))
-
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 8d1c3d4913bf4479aba70b01b81fc3e7200ccb47..0000000000000000000000000000000000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,258 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-
-cmake_minimum_required(VERSION 3.0)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-
-include(system)
-
-project(paddle CXX C)
-message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
-        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
-message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
-        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-message(STATUS "AR tools: ${CMAKE_AR}")
-
-if(WIN32)
-    option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
-
-    set(CMAKE_SUPPRESS_REGENERATION ON)
-    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
-    add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-    
-    if (MSVC_STATIC_CRT)
-        message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
-        set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-        set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-        set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-        set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
-    endif()
-    
-    add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
-    set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
-    set(CMAKE_STATIC_LINKER_FLAGS  "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
-    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
-    set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
-else(WIN32)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
-endif(WIN32)
-
-find_package(CUDA QUIET)
-find_package(Git REQUIRED)
-find_package(Threads REQUIRED)
-
-include(simd)
-
-################################ Exposed Configurations #######################################
-option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
-option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
-option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
-option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
-option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
-option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
-option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
-option(ON_INFER         "Turn on inference optimization."               OFF)
-################################ Internal Configurations #######################################
-option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
-option(WITH_NGRAPH      "Compile PaddlePaddle with nGraph support."     OFF)
-option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
-option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
-option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
-option(WITH_PSLIB       "Compile with pslib support"                    OFF)
-option(WITH_BOX_PS      "Compile with box_ps support"                   OFF)
-option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
-option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
-option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
-option(WITH_INFERENCE_API_TEST   "Test fluid inference C++ high-level api interface"  OFF)
-option(WITH_HIGH_LEVEL_API_TEST   "Test fluid python high-level api interface"  OFF)
-option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
-option(WITH_DGC   "Use DGC(Deep Gradient Compression) or not" ON)
-option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
-
-# PY_VERSION
-if(NOT PY_VERSION)
-  set(PY_VERSION 2.7)
-endif()
-set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
-
-# CMAKE_BUILD_TYPE
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release" CACHE STRING
-      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-      FORCE)
-endif()
-
-if (APPLE)
-    set(WITH_MKL OFF CACHE STRING
-        "Disable MKL for building on mac" FORCE)
-endif()
-
-if (WIN32)
-    set(WITH_DISTRIBUTE OFF CACHE STRING
-            "Disable DISTRIBUTE when compiling for Windows" FORCE)
-endif()
-
-set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
-  "A path setting third party libraries download & build directories.")
-
-set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
-  "A path setting fluid shared and static libraries")
-
-set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
-  "A path setting fluid inference shared and static libraries")
-
-set(THIRD_PARTY_BUILD_TYPE Release)
-
-set(WITH_MKLML ${WITH_MKL})
-if (NOT DEFINED WITH_MKLDNN)
-    if (WITH_MKL AND AVX2_FOUND)
-        set(WITH_MKLDNN ON)
-    else()
-        message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
-        set(WITH_MKLDNN OFF)
-    endif()
-endif()
-
-if (REPLACE_ENFORCE_GLOG)
-  add_definitions("-DREPLACE_ENFORCE_GLOG")
-endif()
-
-if (SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")  # reject unknown sanitizer names
-  message(FATAL_ERROR "Choose the correct type of sanitizer")  # abort configuration instead of silently producing an empty build
-  return()  # not reached after FATAL_ERROR; kept as a defensive guard
-endif()
-
-########################################################################################
-
-include(external/mklml)     # download mklml package
-include(external/xbyak)     # download xbyak package
-include(external/libxsmm)   # download, build, install libxsmm
-include(external/zlib)      # download, build, install zlib
-include(external/gflags)    # download, build, install gflags
-include(external/glog)      # download, build, install glog
-include(external/gtest)     # download, build, install gtest
-include(external/protobuf)  # download, build, install protobuf
-include(external/python)    # download, build, install python
-include(external/openblas)  # download, build, install openblas
-include(external/mkldnn)    # download, build, install mkldnn
-include(external/ngraph)    # download, build, install nGraph
-include(external/boost)     # download boost
-include(external/eigen)     # download eigen3
-include(external/pybind11)  # download pybind11
-include(external/cares)
-include(external/cub)
-include(external/rocprim)
-include(external/xxhash)    # download xxhash
-include(external/dlpack)
-include(external/warpctc)   # download, build, install warpctc
-
-if (NOT WIN32)
-# there is no official support of nccl, cupti in windows
-include(cupti)
-endif (NOT WIN32)
-
-if(WITH_PSLIB)
-    include(external/libmct)
-    include(external/pslib_brpc)
-    include(external/pslib)
-endif(WITH_PSLIB)
-if(WITH_BOX_PS)
-    include(external/box_ps)
-endif(WITH_BOX_PS)
-
-if(WITH_DISTRIBUTE)
-    if(WITH_GRPC)
-        include(external/grpc)
-        message(STATUS "Use grpc framework.")
-    else()
-        message(STATUS "Use brpc framework.")
-        include(external/leveldb)
-        include(external/brpc)
-    endif()
-endif()
-
-if(WITH_BRPC_RDMA)
-    message(STATUS "Use brpc with rdma.")
-    if(WITH_GRPC)
-        message(FATAL_ERROR "Can't use grpc with brpc rdma.")
-    endif()
-    if(NOT WITH_DISTRIBUTE)
-        message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
-    endif()
-endif()
-
-include(anakin_subgraph)
-
-include(external/threadpool)
-include(flags)              # set paddle compile flags
-include(cudnn)              # set cudnn libraries, must before configure
-include(configure)          # add paddle env configuration
-
-if(WITH_GPU)
-    include(cuda)
-    include(tensorrt)
-endif()
-
-if(WIN32 OR APPLE OR NOT WITH_GPU OR ON_INFER)
-    set(WITH_DGC OFF)
-endif()
-
-if(WITH_DGC)
-    message(STATUS "add dgc lib.")
-    include(external/dgc)
-    add_definitions(-DPADDLE_WITH_DGC)
-endif()
-
-if (WITH_PROFILER)
-    find_package(Gperftools REQUIRED)
-    include_directories(${GPERFTOOLS_INCLUDE_DIR})
-    add_definitions(-DWITH_GPERFTOOLS)
-endif()
-
-include(generic)            # simplify cmake module
-include(ccache)             # set ccache for compilation
-include(util)               # set unittest and link libs
-include(version)            # set PADDLE_VERSION
-include(coveralls)          # set code coverage
-include(inference_lib)      # add paddle fluid inference libraries
-
-
-include_directories("${PADDLE_SOURCE_DIR}")
-
-if(WITH_AMD_GPU)
-    find_package(HIP)
-    include(hip)
-endif(WITH_AMD_GPU)
-
-set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
-
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
-set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
-
-if (ON_INFER)
-    message(STATUS "On inference mode, will take place some specific optimization.")
-    add_definitions(-DPADDLE_ON_INFERENCE)
-else()
-    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
-    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
-endif()
-
-add_subdirectory(paddle)
-if(WITH_PYTHON)
-    add_subdirectory(python)
-endif()
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
deleted file mode 100644
index 54131b48eca463aef817a4b96ba1b64de4b60aab..0000000000000000000000000000000000000000
--- a/CODE_OF_CONDUCT.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# Contributor Covenant Code of Conduct
-
-## Our Pledge
-
-In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
-
-## Our Standards
-
-Examples of behavior that contributes to creating a positive environment include:
-
-* Using welcoming and inclusive language
-* Being respectful of differing viewpoints and experiences
-* Gracefully accepting constructive criticism
-* Focusing on what is best for the community
-* Showing empathy towards other community members
-
-Examples of unacceptable behavior by participants include:
-
-* The use of sexualized language or imagery and unwelcome sexual attention or advances
-* Trolling, insulting/derogatory comments, and personal or political attacks
-* Public or private harassment
-* Publishing others' private information, such as a physical or electronic address, without explicit permission
-* Other conduct which could reasonably be considered inappropriate in a professional setting
-
-## Our Responsibilities
-
-Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
-
-Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
-
-## Scope
-
-This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
-
-## Enforcement
-
-Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at paddle-dev@baidu.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
-
-Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
-
-## Attribution
-
-This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
-
-[homepage]: http://contributor-covenant.org
-[version]: http://contributor-covenant.org/version/1/4/
diff --git a/CODE_OF_CONDUCT_cn.md b/CODE_OF_CONDUCT_cn.md
deleted file mode 100644
index 2be794f1f324cf9b6bc304d4e5812076b56f4551..0000000000000000000000000000000000000000
--- a/CODE_OF_CONDUCT_cn.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# 参与者公约
-
-## 我们的保证
-
-为了促进一个开放透明且友好的环境，我们作为贡献者和维护者保证：无论年龄、种族、民族、性别认同和表达（方式）、体型、身体健全与否、经验水平、国籍、个人表现、宗教或性别取向，参与者在我们项目和社区中都免于骚扰。
-
-## 我们的标准
-
-有助于创造正面环境的行为包括但不限于：
-* 使用友好和包容性语言
-* 尊重不同的观点和经历
-* 耐心地接受建设性批评
-* 关注对社区最有利的事情
-* 友善对待其他社区成员
-
-身为参与者不能接受的行为包括但不限于：
-* 使用与性有关的言语或是图像，以及不受欢迎的性骚扰
-* 捣乱/煽动/造谣的行为或进行侮辱/贬损的评论，人身攻击及政治攻击
-* 公开或私下的骚扰
-* 未经许可地发布他人的个人资料，例如住址或是电子地址
-* 其他可以被合理地认定为不恰当或者违反职业操守的行为
-
-## 我们的责任
-
-项目维护者有责任为「可接受的行为」标准做出诠释，以及对已发生的不被接受的行为采取恰当且公平的纠正措施。
-
-项目维护者有权利及责任去删除、编辑、拒绝与本行为标准有所违背的评论(comments)、提交(commits)、代码、wiki 编辑、问题(issues)和其他贡献，以及项目维护者可暂时或永久性的禁止任何他们认为有不适当、威胁、冒犯、有害行为的贡献者。
-
-## 使用范围
-
-当一个人代表该项目或是其社区时，本行为标准适用于其项目平台和公共平台。
-
-代表项目或是社区的情况，举例来说包括使用官方项目的电子邮件地址、通过官方的社区媒体账号发布或线上或线下事件中担任指定代表。
-
-该项目的呈现方式可由其项目维护者进行进一步的定义及解释。
-
-## 强制执行
-
-可以通过paddle-dev@baidu.com，来联系项目团队来举报滥用、骚扰或其他不被接受的行为。
-
-任何维护团队认为有必要且适合的所有投诉都将进行审查及调查，并做出相对应的回应。项目小组对事件回报者有保密的义务。具体执行的方针进一步细节可能会单独公布。
-
-没有切实地遵守或是执行本行为标准的项目维护人员，可能会因项目领导人或是其他成员的决定，暂时或是永久地取消其参与资格。
-
-## 来源
-
-本行为标准改编自[贡献者公约][主页]，版本 1.4
-可在此观看https://www.contributor-covenant.org/zh-cn/version/1/4/code-of-conduct.html
-
-[主页]: https://www.contributor-covenant.org
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
deleted file mode 100644
index 62b26b99bcbeddc91ed1bd0702b0d6aec2e674bf..0000000000000000000000000000000000000000
--- a/CONTRIBUTING.md
+++ /dev/null
@@ -1,162 +0,0 @@
-# Contribute Code
-
-You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the 
-[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329).
-
-We sincerely appreciate your contribution.  This document explains our workflow and work style.
-
-## Workflow
-
-PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/).  The following steps guide usual contributions.
-
-1. Fork
-
-   Our development community has been growing rapidly; it doesn't make sense for everyone to write into the official repo.  So, please file Pull Requests from your fork.  To make a fork,  just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/).
-
-1. Clone
-
-   To make a copy of your fork to your local computers, please run
-
-   ```bash
-   git clone https://github.com/your-github-account/paddle
-   cd paddle
-   ```
-
-1. Create the local feature branch
-
-   For daily works like adding a new feature or fixing a bug, please open your feature branch before coding:
-
-   ```bash
-   git checkout -b my-cool-stuff
-   ```
-
-1. Commit
-
-   Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands:
-
-   ```bash
-   pip install pre-commit
-   pre-commit install
-   ```
-
-   Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python.
-
-   Once installed, `pre-commit` checks the style of code and documentation in every commit.  We will see something like the following when you run `git commit`:
-
-   ```
-   ➜  git commit
-   CRLF end-lines remover...............................(no files to check)Skipped
-   yapf.................................................(no files to check)Skipped
-   Check for added large files..............................................Passed
-   Check for merge conflicts................................................Passed
-   Check for broken symlinks................................................Passed
-   Detect Private Key...................................(no files to check)Skipped
-   Fix End of Files.....................................(no files to check)Skipped
-   clang-formater.......................................(no files to check)Skipped
-   [my-cool-stuff c703c041] add test file
-    1 file changed, 0 insertions(+), 0 deletions(-)
-    create mode 100644 233
-   ```
-
-	NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`.
-
-1. Build and test
-
-   Users can build PaddlePaddle natively on Linux and Mac OS X.  But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
-
-1. Keep pulling
-
-   An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others' work early, and it's easier to resolve smaller conflicts.
-
-   ```bash
-   git remote add upstream https://github.com/PaddlePaddle/Paddle
-   git pull upstream develop
-   ```
-
-1. Push and file a pull request
-
-   You can "push" your local work into your forked repo:
-
-   ```bash
-   git push origin my-cool-stuff
-   ```
-
-   The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one.
-
-   To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/).
-
-   If your change is for fixing an issue, please write ["Fixes <issue-URL>"](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request.  Github would close the issue when the owners merge your pull request.
-
-   Please remember to specify some reviewers for your pull request.  If you don't know who are the right ones, please follow Github's recommendation.
-
-
-1. Delete local and remote branches
-
-   To keep your local workspace and your fork clean, you might want to remove merged branches:
-
-   ```bash
-   git push origin :my-cool-stuff
-   git checkout develop
-   git pull upstream develop
-   git branch -d my-cool-stuff
-   ```
-
-### Code Review
-
--  Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email.  Please do this after your pull request passes the CI.
-
-- Please answer reviewers' every comment.  If you are to follow the comment, please write "Done"; please give a reason otherwise.
-
-- If you don't want your reviewers to get overwhelmed by email notifications, you might reply to their comments [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/).
-
-- Reduce the unnecessary commits.  Some developers commit often.  It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`.
-
-
-## Coding Standard
-
-### Code Style
-
-Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html).
-
-Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/).
-
-Our build process helps to check the code style.  In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default.  This flag is on
-
-Please install pre-commit, which automatically reformats the changes to C/C++ and Python code whenever we run `git commit`.  To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43).
-
-### Unit Tests
-
-Please remember to add related unit tests.
-
-- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/primer.md) .
-
-- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/).
-
-
-### Writing Logs
-
-We use [glog](https://github.com/google/glog) for logging in our C/C++ code.
-
-For general information, please use `LOG`.  For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose).  The reason is explained [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ).
-
-`VLOG` requires a *verbose level* parameter.  For example:
-
-```c++
-VLOG(3) << "Operator FC is taking " << num_inputs << " inputs.";
-```
-
-When we run a PaddlePaddle application or test, we can specify a verbose threshold.  For example:
-
-```bash
-GLOG_vmodule=buddy_allocator=2 \
-GLOG_v=10 \
-python \
-../python/paddle/v2/framework/tests/test_recurrent_op.py
-```
-
-This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3.  This suggests that we output overall messages in lower verbose levels, so they display with higher probability.  When coding C++, please follow the verbose level convention as follows:
-
-- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework)
-- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)
-- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform)
-- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators/math/)
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 9e4609886a1a4e5c91be68db0a8791d52e33bc34..0000000000000000000000000000000000000000
--- a/Dockerfile
+++ /dev/null
@@ -1,216 +0,0 @@
-# An image for building paddle binaries
-# Use cuda devel base image for both cpu and gpu environment
-# When you modify it, please be aware of cudnn-runtime version
-FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-# ENV variables
-ARG WITH_GPU
-ARG WITH_AVX
-
-ENV WOBOQ OFF
-ENV WITH_GPU=${WITH_GPU:-ON}
-ENV WITH_AVX=${WITH_AVX:-ON}
-
-ENV HOME /root
-# Add bash enhancements
-COPY ./paddle/scripts/docker/root/ /root/
-
-# Prepare packages for Python
-RUN apt-get update && \
-    apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
-    libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
-    xz-utils tk-dev libffi-dev liblzma-dev
-
-# Downgrade gcc&&g++
-RUN apt-get update
-WORKDIR /usr/bin
-RUN apt install -y gcc-4.8 g++-4.8
-RUN cp gcc gcc.bak
-RUN cp g++ g++.bak
-RUN rm gcc
-RUN rm g++
-RUN ln -s gcc-4.8 gcc
-RUN ln -s g++-4.8 g++
-
-# Install Python3.6
-RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \
-    tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \
-    ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \
-    wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
-    tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
-    CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
-    make -j8 > /dev/null && make altinstall > /dev/null
-
-# Install Python3.7
-RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
-    tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
-    CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
-    make -j8 > /dev/null && make altinstall > /dev/null
-
-RUN rm -r /root/python_build
-
-RUN apt-get update && \
-    apt-get install -y --allow-downgrades --allow-change-held-packages \
-    patchelf python3 python3-dev python3-pip \
-    git python-pip python-dev python-opencv openssh-server bison \
-    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
-    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
-    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
-    python-matplotlib gcc-4.8 g++-4.8 \
-    automake locales clang-format swig cmake  \
-    liblapack-dev liblapacke-dev \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools libtool ccache && \
-    apt-get clean -y
-
-# Install Python2.7.15 to replace original python
-WORKDIR /home
-ENV version=2.7.15
-RUN wget https://www.python.org/ftp/python/$version/Python-$version.tgz
-RUN tar -xvf Python-$version.tgz
-WORKDIR /home/Python-$version
-RUN ./configure --enable-unicode=ucs4 --enable-shared CFLAGS=-fPIC --prefix=/usr/local/python2.7.15
-RUN make && make install
-
-RUN echo "export PATH=/usr/local/python2.7.15/include:${PATH}" >> ~/.bashrc
-RUN echo "export PATH=/usr/local/python2.7.15/bin:${PATH}" >> ~/.bashrc
-RUN echo "export LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH}" >> ~/.bashrc
-RUN echo "export CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH" >> ~/.bashrc
-ENV PATH=/usr/local/python2.7.15/include:${PATH}
-ENV PATH=/usr/local/python2.7.15/bin:${PATH}
-ENV LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH}
-ENV CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH
-RUN mv /usr/bin/python /usr/bin/python.bak
-RUN ln -s /usr/local/python2.7.15/bin/python2.7 /usr/local/bin/python
-RUN ln -s /usr/local/python2.7.15/bin/python2.7 /usr/bin/python
-WORKDIR /home
-RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e442e80deffbb8d0676b93261d761b1f0ef8fb/setuptools-40.6.2.zip
-RUN apt-get -y install unzip
-RUN unzip setuptools-40.6.2.zip
-WORKDIR /home/setuptools-40.6.2
-RUN python setup.py build
-RUN python setup.py install
-WORKDIR /home
-RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz
-RUN tar -zxvf pip-18.0.tar.gz
-WORKDIR pip-18.0
-RUN python setup.py install
-
-WORKDIR /home
-RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-18.0.tar.gz && \
-    rm -r Python-$version setuptools-40.6.2 pip-18.0
-
-# Install Go and glide
-RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
-    tar -xz -C /usr/local && \
-    mkdir /root/gopath && \
-    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src
-ENV GOROOT=/usr/local/go GOPATH=/root/gopath
-# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
-# install glide
-RUN curl -s -q https://glide.sh/get | sh
-
-# Install TensorRT
-# The following TensorRT.tar.gz is not the default official one; we made two minor changes:
-# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now,
-#    and its size is only one-third of the official one.
-# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
-#    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-
-RUN wget -q https://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
-    tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
-    cp -rf /usr/local/TensorRT/include /usr && \
-    cp -rf /usr/local/TensorRT/lib /usr
-
-# git credential to skip password typing
-RUN git config --global credential.helper store
-
-# Fix locales to en_US.UTF-8
-RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-
-# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
-# version until jupyter fixes this issue.
-
-# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
-# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
-# version(1.7.1 for now), which causes building documentation failed.
-RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
-
-RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 --no-cache-dir install opencv-python && \
-    pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.6 --no-cache-dir install opencv-python && \
-    pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.7 --no-cache-dir install opencv-python && \
-    pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip --no-cache-dir install opencv-python
-
-#For docstring checker
-RUN pip3 --no-cache-dir install pylint pytest astroid isort
-RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
-RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
-RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
-
-RUN pip3 --no-cache-dir install coverage                
-RUN pip3.6 --no-cache-dir install coverage             
-RUN pip3.7 --no-cache-dir install coverage            
-RUN pip --no-cache-dir install coverage
-
-COPY ./python/requirements.txt /root/
-RUN pip3 --no-cache-dir install -r /root/requirements.txt
-RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
-RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
-RUN pip --no-cache-dir install -r /root/requirements.txt
-
-# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
-# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
-RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y
-RUN pip3 --no-cache-dir install certifi urllib3[secure]
-RUN pip3.6 --no-cache-dir install certifi urllib3[secure]
-RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
-RUN pip --no-cache-dir install certifi urllib3[secure]
-
-
-# Install woboq_codebrowser to /woboq; steps are chained with && so cmake and make actually run inside /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq && \
-     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-           -DCMAKE_BUILD_TYPE=Release . && \
-     make)
-
-# ar mishandles 4GB files
-# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
-# remove them when apt-get support 2.27 and higher version
-RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \
-    tar -xzf binutils_2.27.orig.tar.gz && \
-    cd binutils-2.27 && \
-    ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz
-
-# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-CMD source ~/.bashrc
-EXPOSE 22
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
deleted file mode 100644
index 6b2614b1011081a5e0e03a53fec2012bc7b81333..0000000000000000000000000000000000000000
--- a/ISSUE_TEMPLATE.md
+++ /dev/null
@@ -1,14 +0,0 @@
-Thank you for contributing to PaddlePaddle. Submitting an issue is a great help for us.
-Both Chinese and English issues are welcome.
-
-It's hard to solve a problem when important details are missing.
-Before submitting the issue, look over the following criteria before handing your request in.
-
-- [ ] Was there a similar issue submitted or resolved before ? You could search issue in the github.
-- [ ] Did you retrieve your issue from widespread search engines ?
-- [ ] Is my description of the issue clear enough to reproduce this problem?
-   * If some errors occurred, we need details about `how do you run your code?`, `what system do you use?`, `Are you using GPU or not?`, etc.
-   * If you use an recording [asciinema](https://asciinema.org/) to show what you are doing to make it happen, that's awesome! We could help you solve the problem more quickly.
-- [ ] Is my description of the issue use the github markdown correctly?
-   * Please use the proper markdown syntaxes for styling all forms of writing, e.g, source code, error information, etc.
-   * Check out [this page](https://guides.github.com/features/mastering-markdown/) to find out much more about markdown.
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 5fe86943b37a77970679f826e78c71045569f819..0000000000000000000000000000000000000000
--- a/LICENSE
+++ /dev/null
@@ -1,203 +0,0 @@
-Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/README.md b/README.md
deleted file mode 100644
index c417401a740f3afaf378cae804a6d13b78e66bdd..0000000000000000000000000000000000000000
--- a/README.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# PaddlePaddle
-Fork From  http://icode.baidu.com/repos/baidu/paddlepaddle/paddle/tree/paddle_feed_news_201910  (commitid:f50e701) v1.4
-English | [简体中文](./README_cn.md)
-
-[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
-[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
-[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
-
-Welcome to the PaddlePaddle GitHub.
-
-PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use,
-efficient, flexible and scalable deep learning platform, which is originally
-developed by Baidu scientists and engineers for the purpose of applying deep
-learning to many products at Baidu.
-
-Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-
-### Latest PaddlePaddle Release: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
-### Install Latest Stable Release:
-```
-# Linux CPU
-pip install paddlepaddle
-# Linux GPU cuda10cudnn7
-pip install paddlepaddle-gpu
-# Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.5.2.post87
-# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.5.2.post97
-
-
-# For installation on other platform, refer to http://paddlepaddle.org/
-```
-Now our developers could acquire Tesla V100 online computing resources for free. If you create a program by AI Studio, you would obtain 12 hours to train models online per day. If you could insist on that for five consecutive days, then you would own extra 48 hours. [Click here to start](http://ai.baidu.com/support/news?action=detail&id=981).
-
-## Features
-
-- **Flexibility**
-
-    PaddlePaddle supports a wide range of neural network architectures and
-    optimization algorithms. It is easy to configure complex models such as
-    neural machine translation model with attention mechanism or complex memory
-    connection.
-
--  **Efficiency**
-
-    In order to unleash the power of heterogeneous computing resource,
-    optimization occurs at different levels of PaddlePaddle, including
-    computing, memory, architecture and communication. The following are some
-    examples:
-
-      - Optimized math operations through SSE/AVX intrinsics, BLAS libraries
-      (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
-      - Optimized CNN networks through MKL-DNN library.
-      - Highly optimized recurrent networks which can handle **variable-length**
-      sequence without padding.
-      - Optimized local and distributed training for models with high dimensional
-      sparse data.
-
-- **Scalability**
-
-    With PaddlePaddle, it is easy to use many CPUs/GPUs and machines to speed
-    up your training. PaddlePaddle can achieve high throughput and performance
-    via optimized communication.
-
-- **Connected to Products**
-
-    In addition, PaddlePaddle is also designed to be easily deployable. At Baidu,
-    PaddlePaddle has been deployed into products and services with a vast number
-    of users, including ad click-through rate (CTR) prediction, large-scale image
-    classification, optical character recognition(OCR), search ranking, computer
-    virus detection, recommendation, etc. It is widely utilized in products at
-    Baidu and it has achieved a significant impact. We hope you can also explore
-    the capability of PaddlePaddle to make an impact on your product.
-
-## Installation
-
-It is recommended to read [this doc](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) on our website.
-
-## Documentation
-
-We provide [English](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) and
-[Chinese](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) documentation.
-
-- [Deep Learning 101](https://github.com/PaddlePaddle/book)
-
-  You might want to start from this online interactive book that can run in a Jupyter Notebook.
-
-- [Distributed Training](http://paddlepaddle.org.cn/documentation/docs/en/1.5/user_guides/howto/training/multi_node_en.html)
-
-  You can run distributed training jobs on MPI clusters.
-
-- [Python API](http://paddlepaddle.org.cn/documentation/docs/en/1.5/api/index_en.html)
-
-   Our new API enables much shorter programs.
-
-- [How to Contribute](http://paddlepaddle.org.cn/documentation/docs/en/1.5/advanced_usage/development/contribute_to_paddle/index_en.html)
-
-   We appreciate your contributions!
-
-## Communication
-
-- [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
-- QQ discussion group: 796771754 (PaddlePaddle).
-- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
-
-## Copyright and License
-PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
diff --git a/README_cn.md b/README_cn.md
deleted file mode 100644
index cde308c9b1110d1b4a5b485ce3405f0fbdff24db..0000000000000000000000000000000000000000
--- a/README_cn.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# PaddlePaddle
-
-[English](./README.md) | 简体中文
-
-[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
-[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
-[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
-
-欢迎来到 PaddlePaddle GitHub
-
-PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台，最初由百度科学家和工程师共同开发，目的是将深度学习技术应用到百度的众多产品中。
-
-我们的愿景是让每个人都能通过PaddlePaddle接触深度学习
-
-跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
-
-### PaddlePaddle最新版本: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
-### 安装最新稳定版本:
-```
-# Linux CPU
-pip install paddlepaddle
-# Linux GPU cuda10cudnn7
-pip install paddlepaddle-gpu
-# Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.5.2.post87
-# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.5.2.post97
-
-
-# 其他平台上的安装指引请参考 http://paddlepaddle.org/
-```
-PaddlePaddle用户可领取**免费Tesla V100在线算力资源**，训练模型更高效。**每日登陆即送12小时**，**连续五天运行再加送48小时**，[前往使用免费算力](https://ai.baidu.com/support/news?action=detail&id=981)。
-
-## 特性
-
-- **灵活性**
-
-    PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型，例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。
-
--  **高效性**
-
-    为了高效使用异步计算资源，PaddlePaddle对框架的不同层进行优化，包括计算、存储、架构和通信。下面是一些样例：
-
-    - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。
-    - 通过MKL-DNN库优化CNN网络
-    - 高度优化循环网络，无需执行 `padding` 操作即可处理 **变长** 序列
-    - 针对高维稀疏数据模型，优化了局部和分布式训练。
-
-
-- **稳定性**
-
-    有了 PaddlePaddle，使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。
-
-- **与产品相连**
-
-    另外，PaddlePaddle 的设计也易于部署。在百度，PaddlePaddle 已经部署到含有巨大用户量的产品和服务上，包括广告点击率（CTR）预测、大规模图像分类、光学字符识别（OCR）、搜索排序，计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中，产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力，为您的产品创造新的影响力和效果。
-
-## 安装
-
-推荐阅读官网上的[安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)
-
-## 文档
-
-我们提供[英文](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)和
-[中文](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) 文档
-
-- [深度学习101](https://github.com/PaddlePaddle/book)
-
-  或许您想从这个在线交互式书籍开始，可以在Jupyter Notebook中运行
-
-- [分布式训练](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/multi_node.html)
-
-  可以在MPI集群上运行分布式训练任务
-
-- [Python API](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/api_cn/index_cn.html)
-
-   新的API支持代码更少更简洁的程序
-
-- [贡献方式](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/index_cn.html)
-
-   欢迎您的贡献!
-
-## 交流与反馈
-
-- 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
-- QQ群: 796771754 (PaddlePaddle)
-- [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
-
-## 版权和许可证
-PaddlePaddle由[Apache-2.0 license](LICENSE)提供
diff --git a/RELEASE.md b/RELEASE.md
deleted file mode 100644
index 2c64baaaab7d12dde46f6660286ec8475699746b..0000000000000000000000000000000000000000
--- a/RELEASE.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Release Note
-
-Please turn to [here](https://github.com/PaddlePaddle/Paddle/releases) for release note.
diff --git a/build.sh b/build.sh
index 708e63aaa43075cf5efacd5588cc84a3c2e98151..d4a61f19615e53ad5ff718a66d8ebab49c9c713b 100755
--- a/build.sh
+++ b/build.sh
@@ -1,4 +1,6 @@
 #!bash
+RUN_DIR="$(cd "$(dirname "$0")"&&pwd)"
+cd ${RUN_DIR}
 build_mode=$1
 function print_usage() {
     echo "++++++++++++++++++++++++++++++++++++++++++++++++++++"
@@ -25,10 +27,23 @@ if [ ! -f ${python_binary} ];then
     exit -1
 fi
 
-#apply feed code
-if [ -f "paddle/fluid/feed/apply_feed_code.sh" ];then
-    sh paddle/fluid/feed/apply_feed_code.sh
-fi 
+
+function copy_paddle_env() {  # Recreate ${RUN_DIR}/build_env from the paddle tree plus ./feed; leaves cwd inside build_env.
+    cd "${RUN_DIR}" || exit 1  # never let the rm -rf below run from an unexpected directory
+    rm -rf build_env
+    mkdir build_env
+    echo "xxh copy"
+    cp -r ../../paddlepaddle/paddle/* build_env
+    cp -r feed ./build_env/paddlepaddle/paddle/paddle/fluid/  # NOTE(review): the cp above places the tree's contents directly under build_env, and apply_feed_code expects paddle/fluid/feed - confirm this destination exists
+    cd build_env
+}
+
+function apply_feed_code() {
+    # Run the feed patch script if the current directory's tree ships one; otherwise do nothing.
+    local feed_script="paddle/fluid/feed/apply_feed_code.sh"
+    test -f "${feed_script}" || return 0
+    sh "${feed_script}"
+}
 
 function makeit() {
     cd build
@@ -44,12 +59,18 @@ function cmake_all() {
     cd ..
 }
 
+if [ ! -d build_env ];then  # first run: stage the build tree before doing anything else
+    copy_paddle_env
+fi
+cd "${RUN_DIR}/build_env" || exit 1  # every build mode below runs from inside build_env
+
 if [ "${build_mode}" = "all" ];then
     cmake_all
     makeit
 elif [ "${build_mode}" = "make" ];then
     makeit
-elif "${build_mode}" = "clean" ];then
-    cd build
-    make clean
+elif [ "${build_mode}" = "clean" ];then  # the opening '[' was missing, so this branch could never be taken
+    copy_paddle_env
+    #cd build
+    #make clean
 fi 
diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake
deleted file mode 100644
index 928f573a4fb82391859e334d50e6c8ed0e26aae2..0000000000000000000000000000000000000000
--- a/cmake/FindGperftools.cmake
+++ /dev/null
@@ -1,63 +0,0 @@
-# Tries to find Gperftools.
-#
-# Usage of this module as follows:
-#
-#     find_package(Gperftools)
-#
-# Variables used by this module, they can change the default behaviour and need
-# to be set before calling find_package:
-#
-#  Gperftools_ROOT_DIR  Set this variable to the root installation of
-#                       Gperftools if the module has problems finding
-#                       the proper installation path.
-#
-# Variables defined by this module:
-#
-#  GPERFTOOLS_FOUND              System has Gperftools libs/headers
-#  GPERFTOOLS_LIBRARIES          The Gperftools libraries (tcmalloc & profiler)
-#  GPERFTOOLS_INCLUDE_DIR        The location of Gperftools headers
-
-find_library(GPERFTOOLS_TCMALLOC
-  NAMES tcmalloc
-  HINTS ${Gperftools_ROOT_DIR}/lib)
-
-find_library(GPERFTOOLS_PROFILER
-  NAMES profiler
-  HINTS ${Gperftools_ROOT_DIR}/lib)
-
-find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER
-  NAMES tcmalloc_and_profiler
-  HINTS ${Gperftools_ROOT_DIR}/lib)
-
-find_path(GPERFTOOLS_INCLUDE_DIR
-  NAMES gperftools/heap-profiler.h
-  HINTS ${Gperftools_ROOT_DIR}/include)
-
-set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER})
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(
-  Gperftools
-  DEFAULT_MSG
-  GPERFTOOLS_LIBRARIES
-  GPERFTOOLS_INCLUDE_DIR)
-
-mark_as_advanced(
-  Gperftools_ROOT_DIR
-  GPERFTOOLS_TCMALLOC
-  GPERFTOOLS_PROFILER
-  GPERFTOOLS_TCMALLOC_AND_PROFILER
-  GPERFTOOLS_LIBRARIES
-  GPERFTOOLS_INCLUDE_DIR)
-
-# create IMPORTED targets
-if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc)
-  add_library(gperftools::tcmalloc UNKNOWN IMPORTED)
-  set_target_properties(gperftools::tcmalloc PROPERTIES
-    IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC}
-    INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
-  add_library(gperftools::profiler UNKNOWN IMPORTED)
-  set_target_properties(gperftools::profiler PROPERTIES
-    IMPORTED_LOCATION ${GPERFTOOLS_PROFILER}
-    INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
-endif()
diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake
deleted file mode 100644
index 8cdd642ac01315949f7fee3a981a17d67d1e4198..0000000000000000000000000000000000000000
--- a/cmake/FindNumPy.cmake
+++ /dev/null
@@ -1,38 +0,0 @@
-# Find the Python NumPy package
-# PYTHON_NUMPY_INCLUDE_DIR
-# NUMPY_FOUND
-# will be set by this script
-
-cmake_minimum_required(VERSION 2.6)
-
-if(NOT PYTHON_EXECUTABLE)
-  if(NumPy_FIND_QUIETLY)
-    find_package(PythonInterp QUIET)
-  else()
-    find_package(PythonInterp)
-    set(_numpy_out 1)
-  endif()
-endif()
-
-if (PYTHON_EXECUTABLE)
-  # write a python script that finds the numpy path
-  file(WRITE ${PROJECT_BINARY_DIR}/FindNumpyPath.py
-      "try: import numpy; print(numpy.get_include())\nexcept:pass\n")
-
-  # execute the find script
-  exec_program("${PYTHON_EXECUTABLE}" ${PROJECT_BINARY_DIR}
-    ARGS "FindNumpyPath.py"
-    OUTPUT_VARIABLE NUMPY_PATH)
-elseif(_numpy_out)
-  message(STATUS "Python executable not found.")
-endif(PYTHON_EXECUTABLE)
-
-find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h
-  HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}")
-
-if(PYTHON_NUMPY_INCLUDE_DIR)
-  set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found")
-endif(PYTHON_NUMPY_INCLUDE_DIR)
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(NumPy DEFAULT_MSG PYTHON_NUMPY_INCLUDE_DIR)
diff --git a/cmake/anakin_subgraph.cmake b/cmake/anakin_subgraph.cmake
deleted file mode 100644
index 177f34438d6ac8731390e44255072718039bf01c..0000000000000000000000000000000000000000
--- a/cmake/anakin_subgraph.cmake
+++ /dev/null
@@ -1,45 +0,0 @@
-set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
-find_path(ANAKIN_INCLUDE_DIR anakin_config.h
-    PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
-    $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include
-    NO_DEFAULT_PATH
-)
-
-find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
-    PATHS ${ANAKIN_ROOT}
-    $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib
-    NO_DEFAULT_PATH
-    DOC "Path to ANAKIN library.")
-
-if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
-    set(ANAKIN_FOUND ON)
-else()
-    set(ANAKIN_FOUND OFF)
-endif()
-
-if(ANAKIN_FOUND)
-    message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
-    include_directories(${ANAKIN_ROOT})
-    include_directories(${ANAKIN_ROOT}/include)
-    include_directories(${ANAKIN_ROOT}/saber)
-    link_directories(${ANAKIN_ROOT})
-    add_definitions(-DPADDLE_WITH_ANAKIN)
-endif()
-
-if(ANAKIN_FOUND)
-  if (ANAKIN_MLU AND NOT WITH_GPU AND NOT ANAKIN_X86)
-    message(STATUS "Compile with anakin mlu place.")
-    add_definitions(-DANAKIN_MLU_PLACE)
-  elseif(ANAKIN_BM AND NOT WITH_GPU AND NOT ANAKIN_X86)
-    message(STATUS "Compile with anakin bm place.")
-    add_definitions(-DANAKIN_BM_PLACE)
-  elseif(ANAKIN_X86)
-    message(STATUS "Compile with anakin x86 place.")
-    add_definitions(-DANAKIN_X86_PLACE)
-  endif()
-endif()
-
-if(ANAKIN_FOUND AND WITH_GPU AND WITH_DSO)
-    message(STATUS "Compile with anakin subgraph.")
-    set(ANAKIN_SUBGRAPH ON)
-endif()
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
deleted file mode 100644
index 52ac31d1d125afb89fb0ae783fba94ab9a0c5a1a..0000000000000000000000000000000000000000
--- a/cmake/cblas.cmake
+++ /dev/null
@@ -1,94 +0,0 @@
-# Find the CBlas and lapack libraries
-#
-# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
-#
-# If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKLML, OPENBLAS, REFERENCE
-#    CBLAS_INC_DIR   # the include directory for cblas.
-#    CBLAS_LIBS      # a list of libraries should be linked by paddle.
-#                    # Each library should be full path to object file.
-
-set(CBLAS_FOUND OFF)
-
-## Find MKLML First.
-if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER MKLML)
-  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
-  set(CBLAS_LIBRARIES ${MKLML_LIB})
-
-  add_definitions(-DPADDLE_WITH_MKLML)
-  add_definitions(-DLAPACK_FOUND)
-
-  message(STATUS "Found cblas and lapack in MKLML "
-    "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  return()
-endif()
-
-## Then find openblas.
-set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
-set(OPENBLAS_INCLUDE_SEARCH_PATHS
-        ${OPENBLAS_ROOT}/include
-        /usr/include
-        /usr/include/openblas
-        /usr/local/opt/openblas/include)
-set(OPENBLAS_LIB_SEARCH_PATHS
-        ${OPENBLAS_ROOT}/lib
-        /usr/lib
-        /usr/lib/blas/openblas
-        /usr/lib/openblas
-        /usr/local/opt/openblas/lib)
-
-find_path(OPENBLAS_INC_DIR NAMES cblas.h
-  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
-find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
-  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
-find_library(OPENBLAS_LIB NAMES openblas
-  PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
-
-if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER OPENBLAS)
-  set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
-  set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
-
-  add_definitions(-DPADDLE_USE_OPENBLAS)
-  add_definitions(-DLAPACK_FOUND)
-
-  message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
-  return()
-endif()
-
-
-## Then find the reference-cblas.  www.netlib.org/blas/
-set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
-  "Folder contains reference-cblas")
-set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/include
-  /usr/include
-  /usr/include/cblas
-)
-
-set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/lib
-  /usr/lib
-  /usr/lib/blas/reference/
-  /usr/lib/reference/
-)
-
-if(WITH_SYSTEM_BLAS)
-  find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
-        ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
-  find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
-        ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
-
-  if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
-    set(CBLAS_FOUND ON)
-    set(CBLAS_PROVIDER REFERENCE)
-    set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
-    set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
-    add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
-    message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  endif()
-endif()
diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake
deleted file mode 100644
index 900f59d4cb83bc9ce1893b2d3bd95f5a08b164bb..0000000000000000000000000000000000000000
--- a/cmake/ccache.cmake
+++ /dev/null
@@ -1,9 +0,0 @@
-# Use ccache if found ccache program
-
-find_program(CCACHE_PATH ccache)
-
-if(CCACHE_PATH)
-    message(STATUS "Ccache is founded, use ccache to speed up compile.")
-    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH})
-    set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH})
-endif(CCACHE_PATH)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
deleted file mode 100644
index 816314ddc6ece68540e01abe262dec3b7227dd07..0000000000000000000000000000000000000000
--- a/cmake/configure.cmake
+++ /dev/null
@@ -1,162 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if(NOT WITH_PYTHON)
-    add_definitions(-DPADDLE_NO_PYTHON)
-endif(NOT WITH_PYTHON)
-
-if(WITH_DSO)
-    add_definitions(-DPADDLE_USE_DSO)
-endif(WITH_DSO)
-
-if(WITH_TESTING)
-    add_definitions(-DPADDLE_WITH_TESTING)
-endif(WITH_TESTING)
-
-if(NOT WITH_PROFILER)
-    add_definitions(-DPADDLE_DISABLE_PROFILER)
-endif(NOT WITH_PROFILER)
-
-if(WITH_AVX AND AVX_FOUND)
-    set(SIMD_FLAG ${AVX_FLAG})
-    add_definitions(-DPADDLE_WITH_AVX)
-elseif(SSE3_FOUND)
-    set(SIMD_FLAG ${SSE3_FLAG})
-endif()
-
-if(WIN32)
-  # windows header option for all targets.
-  add_definitions(-D_XKEYCHECK_H)
-  # Use symbols instead of absolute path, reduce the cmake link command length. 
-  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
-  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
-  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
-  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1)
-  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1)
-  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1)
-  SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@")
-  SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@")
-
-  # Specify the program to use when building static libraries
-  SET(CMAKE_C_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
-  SET(CMAKE_CXX_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
-
-  # set defination for the dll export
-  if (NOT MSVC)
-    message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
-  endif(NOT MSVC)
-endif(WIN32)
-
-if(WITH_PSLIB)
-    add_definitions(-DPADDLE_WITH_PSLIB)
-endif()
-
-if(WITH_BOX_PS)
-    add_definitions(-DPADDLE_WITH_BOX_PS)
-endif()
-
-if(WITH_GPU)
-    add_definitions(-DPADDLE_WITH_CUDA)
-    add_definitions(-DEIGEN_USE_GPU)
-
-    FIND_PACKAGE(CUDA REQUIRED)
-
-    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
-    endif()
-
-    if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle needs cudnn to compile")
-    endif()
-    if(CUPTI_FOUND)
-        include_directories(${CUPTI_INCLUDE_DIR})
-        add_definitions(-DPADDLE_WITH_CUPTI)
-    else()
-        message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
-    endif()
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
-
-    # Include cuda and cudnn
-    include_directories(${CUDNN_INCLUDE_DIR})
-    include_directories(${CUDA_TOOLKIT_INCLUDE})
-
-    if(TENSORRT_FOUND)
-        if(WIN32)
-            if(${CUDA_VERSION_MAJOR} VERSION_LESS 9)
-                message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
-            endif()
-        else()
-            if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-                message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
-            endif()
-            if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-                message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
-            endif()
-            if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
-                message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
-            endif()
-        endif()
-        include_directories(${TENSORRT_INCLUDE_DIR})
-    endif()
-    if(ANAKIN_FOUND)
-        if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-            message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force ANAKIN_FOUND = OFF")
-            set(ANAKIN_FOUND OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE)
-        endif()
-        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-            message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force ANAKIN_FOUND = OFF")
-            set(ANAKIN_FOUND OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
-        endif()
-    endif()
-elseif(WITH_AMD_GPU)
-    add_definitions(-DPADDLE_WITH_HIP)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
-else()
-    add_definitions(-DHPPL_STUB_FUNC)
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-endif()
-
-if (WITH_MKLML AND MKLML_IOMP_LIB)
-    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
-    if(WIN32)
-        # openmp not support well for now on windows
-        set(OPENMP_FLAGS "")
-    else(WIN32)
-        set(OPENMP_FLAGS "-fopenmp")
-    endif(WIN32)
-    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
-endif()
-
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
-
-if(WITH_DISTRIBUTE)
-  add_definitions(-DPADDLE_WITH_DISTRIBUTE)
-endif()
-
-if(WITH_GRPC)
-    add_definitions(-DPADDLE_WITH_GRPC)
-endif(WITH_GRPC)
-
-if(WITH_BRPC_RDMA)
-    add_definitions(-DPADDLE_WITH_BRPC_RDMA)
-endif(WITH_BRPC_RDMA)
-
-if(ON_INFER)
-    add_definitions(-DPADDLE_ON_INFERENCE)
-endif(ON_INFER)
diff --git a/cmake/copyfile.py b/cmake/copyfile.py
deleted file mode 100644
index 7ba4d95049dc76d1f6bd5bb67e116d5d3f4ea23b..0000000000000000000000000000000000000000
--- a/cmake/copyfile.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import shutil
-import glob
-
-
-def main():
-    src = sys.argv[1]
-    dst = sys.argv[2]
-    if os.path.isdir(src):  #copy directory
-        pathList = os.path.split(src)
-        dst = os.path.join(dst, pathList[-1])
-        if not os.path.exists(dst):
-            shutil.copytree(src, dst)
-            print("first copy directory: {0} --->>> {1}".format(src, dst))
-        else:
-            shutil.rmtree(dst)
-            shutil.copytree(src, dst)
-            print("overwritten copy directory: {0} --->>> {1}".format(src, dst))
-    else:  #copy file, wildcard
-        if not os.path.exists(dst):
-            os.makedirs(dst)
-        srcFiles = glob.glob(src)
-        for srcFile in srcFiles:
-            shutil.copy(srcFile, dst)
-            print("copy file: {0} --->>> {1}".format(srcFile, dst))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake
deleted file mode 100644
index c0e96e28775f910d02e8c9d913fc3906d93291e0..0000000000000000000000000000000000000000
--- a/cmake/coveralls.cmake
+++ /dev/null
@@ -1,102 +0,0 @@
-# CMake script for code coverage.
-# If _COVERALLS_UPLOAD is ON, it will upload json files to overalls.io automatically.
-
-# Param _COVERAGE_SRCS          A list of coverage source files.
-# Param _COVERALLS_UPLOAD       Upload the result to coveralls.
-# Param _CMAKE_SCRIPT_PATH      CMake script path.
-function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
-    # clean previous gcov data.
-    file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
-
-    # find curl for upload JSON soon.
-    if (_COVERALLS_UPLOAD)
-        find_program(CURL_EXECUTABLE curl)
-        if (NOT CURL_EXECUTABLE)
-            message(FATAL_ERROR "Coveralls: curl not found!")
-        endif()
-    endif()
-
-    # When passing a CMake list to an external process, the list
-    # will be converted from the format "1;2;3" to "1 2 3".
-    set(COVERAGE_SRCS "")
-    foreach (SINGLE_SRC ${_COVERAGE_SRCS})
-        set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
-    endforeach()
-
-    # query number of logical cores
-    cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
-    # coveralls json file.
-    set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
-    add_custom_target(coveralls_generate
-        # Run regress tests.
-        COMMAND ${CMAKE_CTEST_COMMAND}
-                -j ${core_size}
-                --output-on-failure
-        # Generate Gcov and translate it into coveralls JSON.
-        COMMAND ${CMAKE_COMMAND}
-                -DCOVERAGE_SRCS="${COVERAGE_SRCS}"
-                -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
-                -DCOV_PATH="${PROJECT_BINARY_DIR}"
-                -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
-                -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
-        WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
-        COMMENT "Coveralls: generating coveralls output..."
-    )
-
-    if (_COVERALLS_UPLOAD)
-        message("COVERALLS UPLOAD: ON")
-        # Upload the JSON to coveralls.
-        add_custom_target(coveralls_upload
-            COMMAND ${CURL_EXECUTABLE}
-                    -S -F json_file=@${COVERALLS_FILE}
-                    https://coveralls.io/api/v1/jobs
-            DEPENDS coveralls_generate
-            WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
-            COMMENT "Coveralls: uploading coveralls output...")
-
-        add_custom_target(coveralls DEPENDS coveralls_upload)
-    else()
-        message("COVERALLS UPLOAD: OFF")
-        add_custom_target(coveralls DEPENDS coveralls_generate)
-    endif()
-endfunction()
-
-if(WITH_COVERAGE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-
-    set(EXCLUDE_DIRS
-        "demo/"
-        "build/"
-        "tests/"
-        ".test_env/"
-    )
-
-    if(WITH_GPU)
-        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" ".c" "*.cu")
-    else()
-        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
-    endif()
-
-    # exclude trivial files in PADDLE_SOURCES
-    foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
-        foreach(TMP_PATH ${PADDLE_SOURCES})
-            string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
-            if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
-                list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
-            endif()
-        endforeach(TMP_PATH)
-    endforeach()
-
-    # convert to absolute path
-    set(PADDLE_SRCS "")
-    foreach(PADDLE_SRC ${PADDLE_SOURCES})
-        set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
-    endforeach()
-
-    code_coverage(
-        "${PADDLE_SRCS}"
-        ${COVERALLS_UPLOAD}
-        "${PROJECT_SOURCE_DIR}/cmake"
-    )
-endif()
diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake
deleted file mode 100644
index 4641184fcf5273b884524d9b9444209ffb65e000..0000000000000000000000000000000000000000
--- a/cmake/coverallsGcovJsons.cmake
+++ /dev/null
@@ -1,401 +0,0 @@
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-# Copyright (C) 2014 Joakim Söderberg <joakim.soderberg@gmail.com>
-#
-# This is intended to be run by a custom target in a CMake project like this.
-# 0. Compile program with coverage support.
-# 1. Clear coverage data. (Recursively delete *.gcda in build dir)
-# 2. Run the unit tests.
-# 3. Run this script specifying which source files the coverage should be performed on.
-#
-# This script will then use gcov to generate .gcov files in the directory specified
-# via the COV_PATH var. This should probably be the same as your cmake build dir.
-#
-# It then parses the .gcov files to convert them into the Coveralls JSON format:
-# https://coveralls.io/docs/api
-#
-
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
-
-# Since it's not possible to pass a CMake list properly in the
-# "1;2;3" format to an external process, we have replaced the
-# ";" with "*", so reverse that here so we get it back into the
-# CMake list format.
-string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS})
-
-find_program(GCOV_EXECUTABLE gcov)
-if (NOT GCOV_EXECUTABLE)
-	message(FATAL_ERROR "gcov not found! Aborting...")
-endif()
-
-find_package(Git)
-
-# TODO: Add these git things to the coveralls json.
-if (GIT_FOUND)
-	# Branch.
-	execute_process(
-		COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
-		WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-		OUTPUT_VARIABLE GIT_BRANCH
-		OUTPUT_STRIP_TRAILING_WHITESPACE
-	)
-
-	macro (git_log_format FORMAT_CHARS VAR_NAME)
-		execute_process(
-			COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS}
-			WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-			OUTPUT_VARIABLE ${VAR_NAME}
-			OUTPUT_STRIP_TRAILING_WHITESPACE
-		)
-	endmacro()
-
-	git_log_format(an GIT_AUTHOR_EMAIL)
-	git_log_format(ae GIT_AUTHOR_EMAIL)
-	git_log_format(cn GIT_COMMITTER_NAME)
-	git_log_format(ce GIT_COMMITTER_EMAIL)
-	git_log_format(B GIT_COMMIT_MESSAGE)
-
-	message("Git exe: ${GIT_EXECUTABLE}")
-	message("Git branch: ${GIT_BRANCH}")
-	message("Git author: ${GIT_AUTHOR_NAME}")
-	message("Git e-mail: ${GIT_AUTHOR_EMAIL}")
-	message("Git commiter name: ${GIT_COMMITTER_NAME}")
-	message("Git commiter e-mail: ${GIT_COMMITTER_EMAIL}")
-	message("Git commit message: ${GIT_COMMIT_MESSAGE}")
-
-endif()
-
-############################# Macros #########################################
-
-#
-# This macro converts from the full path format gcov outputs:
-#
-#    /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
-#
-# to the original source file path the .gcov is for:
-#
-#   /path/to/project/root/subdir/the_file.c
-#
-macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME)
-
-	# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov 
-	# -> 
-	# #path#to#project#root#subdir#the_file.c.gcov   
-	get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME)
-
-	# #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c
-	string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT})
-	string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP})
-	set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}")
-endmacro()
-
-##############################################################################
-
-# Get the coverage data.
-file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
-message("Process GCDA files:")
-message("===============================")
-
-# Get a list of all the object directories needed by gcov
-# (The directories the .gcda files and .o files are found in)
-# and run gcov on those.
-foreach(GCDA ${GCDA_FILES})
-	get_filename_component(GCDA_DIR ${GCDA} PATH)
-
-	#
-	# The -p below refers to "Preserve path components",
-	# This means that the generated gcov filename of a source file will
-	# keep the original files entire filepath, but / is replaced with #.
-	# Example:
-	#
-	# /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda
-	# ------------------------------------------------------------------------------
-	# File '/path/to/project/root/subdir/the_file.c'
-	# Lines executed:68.34% of 199
-	# /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov'
-	#
-	# If -p is not specified then the file is named only "the_file.c.gcov"
-	#
-	execute_process(
-		COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null
-		WORKING_DIRECTORY ${GCDA_DIR}
-	)
-endforeach()
-
-# TODO: Make these be absolute path
-file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov")
-
-# Get only the filenames to use for filtering.
-#set(COVERAGE_SRCS_NAMES "")
-#foreach (COVSRC ${COVERAGE_SRCS})
-#	get_filename_component(COVSRC_NAME ${COVSRC} NAME)
-#	message("${COVSRC} -> ${COVSRC_NAME}")
-#	list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}")
-#endforeach()
-
-#
-# Filter out all but the gcov files we want.
-#
-# We do this by comparing the list of COVERAGE_SRCS filepaths that the
-# user wants the coverage data for with the paths of the generated .gcov files,
-# so that we only keep the relevant gcov files.
-#
-# Example:
-# COVERAGE_SRCS =
-#				/path/to/project/root/subdir/the_file.c
-#
-# ALL_GCOV_FILES =
-#				/path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
-#				/path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov
-# 
-# Result should be:
-# GCOV_FILES = 
-#				/path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
-#
-set(GCOV_FILES "")
-#message("Look in coverage sources: ${COVERAGE_SRCS}")
-message("\nFilter out unwanted GCOV files:")
-message("===============================")
-
-set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS})
-
-foreach (GCOV_FILE ${ALL_GCOV_FILES})
-
-	#
-	# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov 
-	# -> 
-	# /path/to/project/root/subdir/the_file.c 
-	get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
-
-	# Is this in the list of source files?
-	# TODO: We want to match against relative path filenames from the source file root...
-	list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND)
-
-	if (NOT WAS_FOUND EQUAL -1)
-		message("YES: ${GCOV_FILE}")
-		list(APPEND GCOV_FILES ${GCOV_FILE})
-
-		# We remove it from the list, so we don't bother searching for it again.
-		# Also files left in COVERAGE_SRCS_REMAINING after this loop ends should
-		# have coverage data generated from them (no lines are covered).
-		list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH})
-	else()
-		message("NO:  ${GCOV_FILE}")
-	endif()
-endforeach()
-
-# TODO: Enable setting these
-set(JSON_SERVICE_NAME "travis-ci")
-set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID})
-
-set(JSON_TEMPLATE
-"{
-  \"service_name\": \"\@JSON_SERVICE_NAME\@\",
-  \"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\",
-  \"source_files\": \@JSON_GCOV_FILES\@
-}"
-)
-
-set(SRC_FILE_TEMPLATE
-"{
-      \"name\": \"\@GCOV_SRC_REL_PATH\@\",
-      \"source_digest\": \"\@GCOV_CONTENTS_MD5\@\",
-      \"coverage\": \@GCOV_FILE_COVERAGE\@
-  }"
-)
-
-message("\nGenerate JSON for files:")
-message("=========================")
-
-set(JSON_GCOV_FILES "[")
-
-# Read the GCOV files line by line and get the coverage data.
-foreach (GCOV_FILE ${GCOV_FILES})
-
-	get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
-	file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}")
-
-	# The new coveralls API doesn't need the entire source (Yay!)
-	# However, still keeping that part for now. Will cleanup in the future.
-	file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5)
-	message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}")
-
-	# Loads the gcov file as a list of lines.
-	# (We first open the file and replace all occurences of [] with _
-	#  because CMake will fail to parse a line containing unmatched brackets...
-	#  also the \ to escaped \n in macros screws up things.)
-	# https://public.kitware.com/Bug/view.php?id=15369
-	file(READ ${GCOV_FILE} GCOV_CONTENTS)
-	string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
-	string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
-	string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
-	file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}")
-
-	file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES)
-	list(LENGTH GCOV_LINES LINE_COUNT)
-
-	# Instead of trying to parse the source from the
-	# gcov file, simply read the file contents from the source file.
-	# (Parsing it from the gcov is hard because C-code uses ; in many places
-	#  which also happens to be the same as the CMake list delimeter).
-	file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE)
-
-	string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	# According to http://json.org/ these should be escaped as well.
-	# Don't know how to do that in CMake however...
-	#string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	#string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	#string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-
-	# We want a json array of coverage data as a single string
-	# start building them from the contents of the .gcov
-	set(GCOV_FILE_COVERAGE "[")
-
-	set(GCOV_LINE_COUNT 1) # Line number for the .gcov.
-	set(DO_SKIP 0)
-	foreach (GCOV_LINE ${GCOV_LINES})
-		#message("${GCOV_LINE}")
-		# Example of what we're parsing:
-		# Hitcount  |Line | Source
-		# "        8:   26:        if (!allowed || (strlen(allowed) == 0))"
-		string(REGEX REPLACE 
-			"^([^:]*):([^:]*):(.*)$" 
-			"\\1;\\2;\\3"
-			RES
-			"${GCOV_LINE}")
-
-		# Check if we should exclude lines using the Lcov syntax.
-		string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}")
-		string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}")
-		string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}")
-
-		set(RESET_SKIP 0)
-		if (LINE_SKIP AND NOT DO_SKIP)
-			set(DO_SKIP 1)
-			set(RESET_SKIP 1)
-		endif()
-
-		if (START_SKIP)
-			set(DO_SKIP 1)
-			message("${GCOV_LINE_COUNT}: Start skip")
-		endif()
-
-		if (END_SKIP)
-			set(DO_SKIP 0)
-		endif()
-
-		list(LENGTH RES RES_COUNT)
-
-		if (RES_COUNT GREATER 2)
-			list(GET RES 0 HITCOUNT)
-			list(GET RES 1 LINE)
-			list(GET RES 2 SOURCE)
-
-			string(STRIP ${HITCOUNT} HITCOUNT)
-			string(STRIP ${LINE} LINE)
-
-			# Lines with 0 line numbers are metadata and can be ignored.
-			if (NOT ${LINE} EQUAL 0)
-				
-				if (DO_SKIP)
-					set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
-				else()
-					# Translate the hitcount into valid JSON values.
-					if (${HITCOUNT} STREQUAL "#####")
-						set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
-					elseif (${HITCOUNT} STREQUAL "-")
-						set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
-					else()
-						set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ")
-					endif()
-				endif()
-			endif()
-		else()
-			message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}")
-		endif()
-
-		if (RESET_SKIP)
-			set(DO_SKIP 0)
-		endif()
-		math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1")
-	endforeach()
-
-	message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!")
-
-	# Advanced way of removing the trailing comma in the JSON array.
-	# "[1, 2, 3, " -> "[1, 2, 3"
-	string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
-
-	# Append the trailing ] to complete the JSON array.
-	set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
-
-	# Generate the final JSON for this file.
-	message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...")
-	string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
-
-	set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
-endforeach()
-
-# Loop through all files we couldn't find any coverage for
-# as well, and generate JSON for those as well with 0% coverage.
-foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING})
-
-	# Loads the source file as a list of lines.
-	file(STRINGS ${NOT_COVERED_SRC} SRC_LINES)
-
-	set(GCOV_FILE_COVERAGE "[")
-	set(GCOV_FILE_SOURCE "")
-
-	foreach (SOURCE ${SRC_LINES})
-		set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
-
-		string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}")
-		string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}")
-		string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}")
-		string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}")
-		set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n")
-	endforeach()
-
-	# Remove trailing comma, and complete JSON array with ]
-	string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
-	set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
-
-	# Generate the final JSON for this file.
-	string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
-	set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
-endforeach()
-
-# Get rid of trailing comma.
-string(REGEX REPLACE ",[ ]*$" "" JSON_GCOV_FILES ${JSON_GCOV_FILES})
-set(JSON_GCOV_FILES "${JSON_GCOV_FILES}]")
-
-# Generate the final complete JSON!
-message("Generate final JSON...")
-string(CONFIGURE ${JSON_TEMPLATE} JSON)
-
-file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}")
-message("###########################################################################")
-message("Generated coveralls JSON containing coverage data:") 
-message("${COVERALLS_OUTPUT_FILE}")
-message("###########################################################################")
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
deleted file mode 100644
index 09d713642a153c39a3125f5fc44890a2fedee923..0000000000000000000000000000000000000000
--- a/cmake/cuda.cmake
+++ /dev/null
@@ -1,220 +0,0 @@
-if(NOT WITH_GPU)
-    return()
-endif()
-
-set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
-set(paddle_known_gpu_archs7 "30 35 50 52")
-set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
-set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
-set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
-
-######################################################################################
-# A function for automatic detection of GPUs installed  (if autodetection is enabled)
-# Usage:
-#   detect_installed_gpus(out_variable)
-function(detect_installed_gpus out_variable)
-  if(NOT CUDA_gpu_detect_output)
-    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
-
-    file(WRITE ${cufile} ""
-      "#include <cstdio>\n"
-      "int main() {\n"
-      "  int count = 0;\n"
-      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
-      "  if (count == 0) return -1;\n"
-      "  for (int device = 0; device < count; ++device) {\n"
-      "    cudaDeviceProp prop;\n"
-      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
-      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
-      "  }\n"
-      "  return 0;\n"
-      "}\n")
-
-    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
-                    "--run" "${cufile}"
-                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
-                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
-                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-    if(nvcc_res EQUAL 0)
-      # only keep the last line of nvcc_out
-      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
-      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
-      list(GET nvcc_out -1 nvcc_out)
-      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
-      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
-    endif()
-  endif()
-
-  if(NOT CUDA_gpu_detect_output)
-    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
-    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
-  else()
-    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
-  endif()
-endfunction()
-
-
-########################################################################
-# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
-# Usage:
-#   select_nvcc_arch_flags(out_variable)
-function(select_nvcc_arch_flags out_variable)
-  # List of arch names
-  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
-  set(archs_name_default "All")
-  list(APPEND archs_names "Auto")
-
-  # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
-  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
-  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
-  mark_as_advanced(CUDA_ARCH_NAME)
-
-  # verify CUDA_ARCH_NAME value
-  if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
-    string(REPLACE ";" ", " archs_names "${archs_names}")
-    message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
-  endif()
-
-  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
-    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
-    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
-    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
-  else()
-    unset(CUDA_ARCH_BIN CACHE)
-    unset(CUDA_ARCH_PTX CACHE)
-  endif()
-
-  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
-    set(cuda_arch_bin "30 35")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
-    set(cuda_arch_bin "50")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
-    set(cuda_arch_bin "60 61")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
-    set(cuda_arch_bin "70")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
-    set(cuda_arch_bin "75")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
-    set(cuda_arch_bin ${paddle_known_gpu_archs})
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
-    detect_installed_gpus(cuda_arch_bin)
-  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
-    set(cuda_arch_bin ${CUDA_ARCH_BIN})
-  endif()
-
-  # remove dots and convert to lists
-  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
-  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
-  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
-  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
-  list(REMOVE_DUPLICATES cuda_arch_bin)
-  list(REMOVE_DUPLICATES cuda_arch_ptx)
-
-  set(nvcc_flags "")
-  set(nvcc_archs_readable "")
-
-  # Tell NVCC to add binaries for the specified GPUs
-  foreach(arch ${cuda_arch_bin})
-    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
-      # User explicitly specified PTX for the concrete BIN
-      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
-      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
-    else()
-      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
-      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
-      list(APPEND nvcc_archs_readable sm_${arch})
-    endif()
-  endforeach()
-
-  # Tell NVCC to add PTX intermediate code for the specified architectures
-  foreach(arch ${cuda_arch_ptx})
-    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
-    list(APPEND nvcc_archs_readable compute_${arch})
-  endforeach()
-
-  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
-  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
-  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
-endfunction()
-
-message(STATUS "CUDA detected: " ${CUDA_VERSION})
-if (${CUDA_VERSION} LESS 7.0)
-  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
-elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
-  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
-  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
-  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
-elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
-  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
-  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
-  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
-  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
-  # warning for now.
-  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
-elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x
-  set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
-  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
-  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
-elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
-  set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
-  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
-  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
-endif()
-add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"")
-
-include_directories(${CUDA_INCLUDE_DIRS})
-if(NOT WITH_DSO)
-    if(WIN32)
-      set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
-    endif(WIN32)
-endif(NOT WITH_DSO)
-
-# setting nvcc arch flags
-select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
-list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
-message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
-
-# Set C++11 support
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-
-# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
-# So, don't set these flags here.
-if (NOT WIN32) # windows msvc2015 support c++11 natively. 
-# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
-list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
-list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
-endif(NOT WIN32)
-
-# in cuda9, suppress cuda warning on eigen 
-list(APPEND CUDA_NVCC_FLAGS "-w")
-# Set :expt-relaxed-constexpr to suppress Eigen warnings
-list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
-
-if (NOT WIN32)
-  if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-  elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
-      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-  elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-  elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-      # nvcc 9 does not support -Os. Use Release flags instead
-      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-  endif()
-else(NOT WIN32)
-  list(APPEND CUDA_NVCC_FLAGS  "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
-  list(APPEND CUDA_NVCC_FLAGS  "--compiler-options;/bigobj")
-  if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    list(APPEND CUDA_NVCC_FLAGS  "-g -G")
-    # match the cl's _ITERATOR_DEBUG_LEVEL
-    list(APPEND CUDA_NVCC_FLAGS  "-D_DEBUG")
-  elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-    list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
-  else()
-  message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
-endif()
-endif(NOT WIN32)
-
-mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
-mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
deleted file mode 100644
index 98466d44fc0dd91ef0cc8e8eac2660c42a19267c..0000000000000000000000000000000000000000
--- a/cmake/cudnn.cmake
+++ /dev/null
@@ -1,102 +0,0 @@
-if(NOT WITH_GPU)
-    return()
-endif()
-
-if(WIN32)
-    set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
-else(WIN32)
-    set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
-endif(WIN32)
-
-find_path(CUDNN_INCLUDE_DIR cudnn.h
-    PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
-    $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
-    NO_DEFAULT_PATH
-)
-
-get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
-
-set(TARGET_ARCH "x86_64")
-if(NOT ${CMAKE_SYSTEM_PROCESSOR})
-    set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
-endif()
-
-list(APPEND CUDNN_CHECK_LIBRARY_DIRS
-    ${CUDNN_ROOT}
-    ${CUDNN_ROOT}/lib64
-    ${CUDNN_ROOT}/lib
-    ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
-    ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
-    $ENV{CUDNN_ROOT}
-    $ENV{CUDNN_ROOT}/lib64
-    $ENV{CUDNN_ROOT}/lib
-    /usr/lib
-	${CUDA_TOOLKIT_ROOT_DIR}
-	${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
-	)
-set(CUDNN_LIB_NAME "")
-if (LINUX)
-set(CUDNN_LIB_NAME "libcudnn.so")
-endif(LINUX)
-
-if(WIN32)
-# only support cudnn7
-set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
-endif(WIN32)
-
-if(APPLE)
-set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
-endif(APPLE)
-
-find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
-    PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
-          NO_DEFAULT_PATH
-    DOC "Path to cuDNN library.")
-
-
-if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY)
-    set(CUDNN_FOUND ON)
-else()
-    set(CUDNN_FOUND OFF)
-endif()
-
-if(CUDNN_FOUND)
-    file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
-
-    get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
-
-    string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
-        CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1"
-        CUDNN_VERSION "${CUDNN_VERSION}")
-
-    if("${CUDNN_VERSION}" STREQUAL "2000")
-        message(STATUS "Current cuDNN version is v2. ")
-    else()
-        string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION
-            "${CUDNN_VERSION_FILE_CONTENTS}")
-        string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1"
-            CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}")
-        string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION
-            "${CUDNN_VERSION_FILE_CONTENTS}")
-        string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1"
-            CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}")
-        string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)"
-            CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
-        string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1"
-            CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}")
-
-        if(NOT CUDNN_MAJOR_VERSION)
-            set(CUDNN_VERSION "???")
-        else()
-            add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
-            math(EXPR CUDNN_VERSION
-                "${CUDNN_MAJOR_VERSION} * 1000 +
-                 ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
-        endif()
-
-        message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. "
-            "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
-
-    endif()
-endif()
diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake
deleted file mode 100644
index 72ed0f1e5858d6d836743ceb038c7f4ad8f194cf..0000000000000000000000000000000000000000
--- a/cmake/cupti.cmake
+++ /dev/null
@@ -1,41 +0,0 @@
-if(NOT WITH_GPU)
-    return()
-endif()
-
-
-set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
-find_path(CUPTI_INCLUDE_DIR cupti.h
-        PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
-        $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
-        ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
-        NO_DEFAULT_PATH
-        )
-
-get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
-
-set(TARGET_ARCH "x86_64")
-if(NOT ${CMAKE_SYSTEM_PROCESSOR})
-    set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
-endif()
-
-list(APPEND CUPTI_CHECK_LIBRARY_DIRS
-        ${CUPTI_ROOT}
-        ${CUPTI_ROOT}/lib64
-        ${CUPTI_ROOT}/lib
-        ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
-        $ENV{CUPTI_ROOT}
-        $ENV{CUPTI_ROOT}/lib64
-        $ENV{CUPTI_ROOT}/lib
-        /usr/lib
-        ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
-find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
-       PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
-       NO_DEFAULT_PATH
-       DOC "Path to cuPTI library.")
-
-get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
-if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
-    set(CUPTI_FOUND ON)
-else()
-    set(CUPTI_FOUND OFF)
-endif()
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
deleted file mode 100644
index ba8b5fc6c838b221fcfb559f1f01051fc09072a4..0000000000000000000000000000000000000000
--- a/cmake/external/boost.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-include(ExternalProject)
-
-set(BOOST_PROJECT       "extern_boost")
-# To release PaddlePaddle as a pip package, we have to follow the
-# manylinux1 standard, which features as old Linux kernels and
-# compilers as possible and recommends CentOS 5. Indeed, the earliest
-# CentOS version that works with NVIDIA CUDA is CentOS 6.  And a new
-# version of boost, say, 1.66.0, doesn't build on CentOS 6.  We
-# checked that the devtools package of CentOS 6 installs boost 1.41.0.
-# So we use 1.41.0 here.
-set(BOOST_VER           "1.41.0")
-set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
-
-MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
-
-set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
-set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
-
-set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE)
-set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
-include_directories(${BOOST_INCLUDE_DIR})
-
-ExternalProject_Add(
-    ${BOOST_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
-    URL      ${BOOST_URL}
-    DOWNLOAD_NO_PROGRESS  1
-    PREFIX                ${BOOST_SOURCES_DIR}
-    CONFIGURE_COMMAND     ""
-    BUILD_COMMAND         ""
-    INSTALL_COMMAND       ""
-    UPDATE_COMMAND        ""
-    )
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
-    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
-    add_library(boost STATIC ${dummyfile})
-else()
-    add_library(boost INTERFACE)
-endif()
-
-add_dependencies(boost ${BOOST_PROJECT})
-set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
diff --git a/cmake/external/box_ps.cmake b/cmake/external/box_ps.cmake
deleted file mode 100644
index ddb4c82e1d4424c8c5305de8ba232d382b28def9..0000000000000000000000000000000000000000
--- a/cmake/external/box_ps.cmake
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_BOX_PS})
-  return()
-ENDIF(NOT ${WITH_BOX_PS})
-
-IF(WIN32 OR APPLE)
-    MESSAGE(WARNING
-        "Windows or Mac is not supported with BOX_PS in Paddle yet."
-        "Force WITH_BOX_PS=OFF")
-    SET(WITH_BOX_PS OFF CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE)
-    return()
-ENDIF()
-
-INCLUDE(ExternalProject)
-
-SET(BOX_PS_PROJECT       "extern_box_ps")
-IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
-  SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
-  SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps_stub.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
-SET(BOX_PS_SOURCE_DIR    "${THIRD_PARTY_PATH}/box_ps")
-SET(BOX_PS_DOWNLOAD_DIR  "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}")
-SET(BOX_PS_DST_DIR       "box_ps")
-SET(BOX_PS_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(BOX_PS_INSTALL_DIR   ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR})
-SET(BOX_PS_ROOT          ${BOX_PS_INSTALL_DIR})
-SET(BOX_PS_INC_DIR       ${BOX_PS_ROOT}/include)
-SET(BOX_PS_LIB_DIR       ${BOX_PS_ROOT}/lib)
-SET(BOX_PS_LIB           ${BOX_PS_LIB_DIR}/libbox_ps.so)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib")
-
-INCLUDE_DIRECTORIES(${BOX_PS_INC_DIR})
-FILE(WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(BOX_PS)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${BOX_PS_NAME}/include ${BOX_PS_NAME}/lib \n"
-  "        DESTINATION ${BOX_PS_DST_DIR})\n")
-ExternalProject_Add(
-    ${BOX_PS_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${BOX_PS_SOURCE_DIR}
-    DOWNLOAD_DIR          ${BOX_PS_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOX_PS_URL} -c -q -O ${BOX_PS_NAME}.tar.gz
-                          && tar zxvf ${BOX_PS_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT}
-)
-ADD_LIBRARY(box_ps SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB})
-ADD_DEPENDENCIES(box_ps ${BOX_PS_PROJECT})
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
deleted file mode 100644
index a5a86afa4a5352f586714041d9f041b610d97b8e..0000000000000000000000000000000000000000
--- a/cmake/external/brpc.cmake
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-find_package(OpenSSL REQUIRED) 
-
-message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY})
-message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY})
-
-ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY})
-
-ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY})
-
-SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
-SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
-SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
-SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE)
-
-INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
-
-# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
-
-# If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
-ExternalProject_Add(
-    extern_brpc
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/apache/incubator-brpc"
-    GIT_TAG         "ad00fe940b4f05225b214131959293bbed8744a0" #rdma branch's head now.
-    PREFIX          ${BRPC_SOURCES_DIR}
-    UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                    -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
-                    -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
-                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                    -DCMAKE_PREFIX_PATH=${prefix_path}
-                    -DWITH_GLOG=ON
-                    -DIOBUF_WITH_HUGE_BLOCK=ON
-                    -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
-                    ${EXTERNAL_OPTIONAL_ARGS}
-    LIST_SEPARATOR |
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
-                     -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib
-                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-)
-ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest)
-ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
-ADD_DEPENDENCIES(brpc extern_brpc)
-
-add_definitions(-DBRPC_WITH_GLOG)
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
deleted file mode 100644
index 52507a6ae4aabe300cf8bf88d0946c45a2c0e79c..0000000000000000000000000000000000000000
--- a/cmake/external/cares.cmake
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-IF(NOT WITH_DISTRIBUTE)
-    return()
-ENDIF()
-
-include (ExternalProject)
-
-# NOTE: c-ares is needed when linking with grpc.
-
-SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
-SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
-SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
-
-ExternalProject_Add(
-    extern_cares
-    GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
-    GIT_TAG "cares-1_13_0"
-    PREFIX          ${CARES_SOURCES_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
-    BUILD_IN_SOURCE 1
-    BUILD_COMMAND   make -j8
-    INSTALL_COMMAND make install
-)
-
-ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
-             "${CARES_INSTALL_DIR}/lib/libcares.a")
-
-include_directories(${CARES_INCLUDE_DIR})
-ADD_DEPENDENCIES(cares extern_cares)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
deleted file mode 100644
index 41ad8207743201fbddd1d678fc5122afe68207ae..0000000000000000000000000000000000000000
--- a/cmake/external/cub.cmake
+++ /dev/null
@@ -1,33 +0,0 @@
-if(NOT WITH_GPU)
-  return()
-endif()
-
-include(ExternalProject)
-
-set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
-set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
-
-include_directories(${CUB_INCLUDE_DIR})
-
-ExternalProject_Add(
-  extern_cub
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
-  GIT_TAG        "v1.8.0"
-  PREFIX         ${CUB_SOURCE_DIR}
-  UPDATE_COMMAND ""
-  CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
-)
-
-if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
-  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
-  add_library(cub STATIC ${dummyfile})
-else()
-  add_library(cub INTERFACE)
-endif()
-
-add_dependencies(cub extern_cub)
diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake
deleted file mode 100644
index 5d5fcc3d4292bb9cfffedbf21bfe16e4485e5175..0000000000000000000000000000000000000000
--- a/cmake/external/dgc.cmake
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc")
-SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
-SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
-SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
-INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
-
-ExternalProject_Add(
-    extern_dgc
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL "http://fleet.bj.bcebos.com/collective.tgz"
-    URL_MD5  "015d565156c3de4e30fe25473f47e7a9"
-    SOURCE_DIR "${DGC_SOURCES_DIR}"
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND make -j
-    INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/  ${DGC_INCLUDE_DIR}/dgc
-        && cp ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES}
-        && cp ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
-    BUILD_IN_SOURCE 1
-)
-
-ADD_LIBRARY(dgc STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES})
-ADD_DEPENDENCIES(dgc extern_dgc)
-
diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake
deleted file mode 100644
index 63dd16b28e40a0c2d5310bec011c721285049952..0000000000000000000000000000000000000000
--- a/cmake/external/dlpack.cmake
+++ /dev/null
@@ -1,29 +0,0 @@
-include(ExternalProject)
-
-set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack)
-set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include)
-
-include_directories(${DLPACK_INCLUDE_DIR})
-
-ExternalProject_Add(
-  extern_dlpack
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  GIT_REPOSITORY "https://github.com/dmlc/dlpack.git"
-  GIT_TAG        "v0.2"
-  PREFIX         ${DLPACK_SOURCE_DIR}
-  UPDATE_COMMAND ""
-  CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
-)
-
-if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c)
-  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
-  add_library(dlpack STATIC ${dummyfile})
-else()
-  add_library(dlpack INTERFACE)
-endif()
-
-add_dependencies(dlpack extern_dlpack)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
deleted file mode 100644
index bea65d2d279bc4175ec1c0aab43573d41e622b94..0000000000000000000000000000000000000000
--- a/cmake/external/eigen.cmake
+++ /dev/null
@@ -1,53 +0,0 @@
-INCLUDE(ExternalProject)
-
-SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
-SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
-INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
-
-if(WIN32)
-    set(EIGEN_GIT_REPOSITORY https://github.com/wopeizl/eigen-git-mirror)
-    set(EIGEN_GIT_TAG support_cuda9_win)
-else()
-    set(EIGEN_GIT_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
-    set(EIGEN_GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c)
-endif()
-if(WITH_AMD_GPU)
-    ExternalProject_Add(
-        extern_eigen3
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
-        GIT_TAG         7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
-        PREFIX          ${EIGEN_SOURCE_DIR}
-        UPDATE_COMMAND  ""
-        CONFIGURE_COMMAND ""
-        BUILD_COMMAND     ""
-        INSTALL_COMMAND   ""
-        TEST_COMMAND      ""
-    )
-else()
-    ExternalProject_Add(
-        extern_eigen3
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY  "${EIGEN_GIT_REPOSITORY}"
-        # eigen on cuda9.1 missing header of math_funtions.hpp
-        # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
-        GIT_TAG         ${EIGEN_GIT_TAG}
-        PREFIX          ${EIGEN_SOURCE_DIR}
-        DOWNLOAD_NAME   "eigen"
-        UPDATE_COMMAND  ""
-        CONFIGURE_COMMAND ""
-        BUILD_COMMAND     ""
-        INSTALL_COMMAND   ""
-        TEST_COMMAND      ""
-    )
-endif()
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
-    file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";")
-    add_library(eigen3 STATIC ${dummyfile})
-else()
-    add_library(eigen3 INTERFACE)
-endif()
-
-add_dependencies(eigen3 extern_eigen3)
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
deleted file mode 100644
index fae1e20d3bea0eeee3f9756e55e892eb512c3da6..0000000000000000000000000000000000000000
--- a/cmake/external/gflags.cmake
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
-SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
-SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
-IF(WIN32)
-  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
-ELSE(WIN32)
-  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
-  set(BUILD_COMMAND $(MAKE) --silent)
-  set(INSTALL_COMMAND $(MAKE) install)
-ENDIF(WIN32)
-
-INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
-
-ExternalProject_Add(
-    extern_gflags
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
-    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
-    PREFIX          ${GFLAGS_SOURCES_DIR}
-    BUILD_COMMAND   ${BUILD_COMMAND}
-    INSTALL_COMMAND ${INSTALL_COMMAND}
-    UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DBUILD_STATIC_LIBS=ON
-                    -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DBUILD_TESTING=OFF
-                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                    ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-)
-
-ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
-ADD_DEPENDENCIES(gflags extern_gflags)
-
-# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
-if (WIN32)
-  include(CheckIncludeFileCXX)
-  check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
-  if (HAVE_SHLWAPI)
-    set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
-  endif(HAVE_SHLWAPI)
-endif (WIN32)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
deleted file mode 100644
index ac6294048cf7198651de292f24f97c522a5009e0..0000000000000000000000000000000000000000
--- a/cmake/external/glog.cmake
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog)
-SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
-SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
-
-IF(WIN32)
-  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog library." FORCE)
-  SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
-ELSE(WIN32)
-  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
-  SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-ENDIF(WIN32)
-
-INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
-
-SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
-SET(GLOG_TAG "v0.3.5")
-
-ExternalProject_Add(
-    extern_glog
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    DEPENDS gflags
-    GIT_REPOSITORY  ${GLOG_REPOSITORY}
-    GIT_TAG         ${GLOG_TAG}
-    PREFIX          ${GLOG_SOURCES_DIR}
-    UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
-                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-                    -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DWITH_GFLAGS=ON
-                    -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
-                    -DBUILD_TESTING=OFF
-                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                    ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-                     -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
-                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-)
-
-ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
-ADD_DEPENDENCIES(glog extern_glog gflags)
-LINK_LIBRARIES(glog gflags)
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
deleted file mode 100644
index d96da470b3cbbd8092dbf80ec5f500af9afa2ce4..0000000000000000000000000000000000000000
--- a/cmake/external/grpc.cmake
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-IF(NOT WITH_DISTRIBUTE)
-    return()
-ENDIF()
-
-include (ExternalProject)
-
-SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
-SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
-SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
-SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
-
-include(ProcessorCount)
-ProcessorCount(NUM_OF_PROCESSOR)
-
-IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
-ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
-ENDIF()
-
-# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
-ExternalProject_Add(
-    extern_grpc
-    DEPENDS protobuf zlib
-    # NOTE(wuyi):
-    # this package is generated by following steps:
-    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
-    # 2. git submodule update --init
-    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
-    #    checkout and clean other dirs under third_party
-    # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
-    URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
-    PREFIX          ${GRPC_SOURCES_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_IN_SOURCE 1
-    PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h
-    # NOTE(yuyang18):
-    # Disable -Werror, otherwise the compile will fail in MacOS.
-    # It seems that we cannot configure that by make command.
-    # Just dry run make command and remove `-Werror`, then use a shell to run make commands
-    BUILD_COMMAND  ${BUILD_CMD}
-    INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
-)
-
-ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
-             "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
-
-ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
-            "${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
-ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
-            "${GRPC_INSTALL_DIR}/lib/libgpr.a")
-
-ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
-            "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
-
-include_directories(${GRPC_INCLUDE_DIR})
-ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
deleted file mode 100644
index 04189c4fa1b082f1975c2e54cb9ca8dcb40d8a2c..0000000000000000000000000000000000000000
--- a/cmake/external/gtest.cmake
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#FIXME:(gongwb) Move brpc's gtest dependency.
-
-include(GNUInstallDirs)
-
-IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
-    IF(WITH_TESTING)
-        ENABLE_TESTING()
-    ENDIF(WITH_TESTING)
-
-    INCLUDE(ExternalProject)
-
-    SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest)
-    SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest)
-    SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
-
-    INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})
-
-    IF(WIN32)
-        set(GTEST_LIBRARIES
-            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
-        set(GTEST_MAIN_LIBRARIES
-            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
-    ELSE(WIN32)
-        set(GTEST_LIBRARIES
-            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
-        set(GTEST_MAIN_LIBRARIES
-            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
-    ENDIF(WIN32)
-
-    IF(WITH_MKLML)
-        # wait for mklml downloading completed
-        SET(GTEST_DEPENDS   ${MKLML_PROJECT})
-    ENDIF()
-
-    ExternalProject_Add(
-        extern_gtest
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        DEPENDS         ${GTEST_DEPENDS}
-        GIT_REPOSITORY  "https://github.com/google/googletest.git"
-        GIT_TAG         "release-1.8.1"
-        PREFIX          ${GTEST_SOURCES_DIR}
-        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                        -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DBUILD_GMOCK=ON
-                        -Dgtest_disable_pthreads=ON
-                        -Dgtest_force_shared_crt=ON
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
-                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    )
-
-    ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES})
-    ADD_DEPENDENCIES(gtest extern_gtest)
-
-    ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
-    ADD_DEPENDENCIES(gtest_main extern_gtest)
-
-ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
deleted file mode 100644
index 3ba8a466c647f1aeef0ad20d4a540b6926e94054..0000000000000000000000000000000000000000
--- a/cmake/external/leveldb.cmake
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb)
-SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb)
-SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE)
-SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE)
-INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
-
-ExternalProject_Add(
-    extern_leveldb
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX ${LEVELDB_SOURCES_DIR}
-    GIT_REPOSITORY "https://github.com/google/leveldb"
-    GIT_TAG v1.18
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
-    INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ 
-        && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES}
-        && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/
-    BUILD_IN_SOURCE 1
-)
-
-ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
-ADD_DEPENDENCIES(leveldb extern_leveldb)
diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake
deleted file mode 100644
index 717e021387ea1d77cde3c4960882ea2bfb0aac63..0000000000000000000000000000000000000000
--- a/cmake/external/libmct.cmake
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_LIBMCT})
-  return()
-ENDIF(NOT ${WITH_LIBMCT})
-
-IF(WIN32 OR APPLE)
-    MESSAGE(WARNING
-        "Windows or Mac is not supported with LIBMCT in Paddle yet."
-        "Force WITH_LIBMCT=OFF")
-    SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE)
-    return()
-ENDIF()
-
-INCLUDE(ExternalProject)
-
-SET(LIBMCT_PROJECT       "extern_libmct")
-IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE)
-  SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE)
-  SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct.tar.gz" CACHE STRING "" FORCE) 
-ENDIF()
-MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}")
-SET(LIBMCT_SOURCE_DIR    "${THIRD_PARTY_PATH}/libmct")
-SET(LIBMCT_DOWNLOAD_DIR  "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}")
-SET(LIBMCT_DST_DIR       "libmct")
-SET(LIBMCT_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(LIBMCT_INSTALL_DIR   ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR})
-SET(LIBMCT_ROOT          ${LIBMCT_INSTALL_DIR})
-SET(LIBMCT_INC_DIR       ${LIBMCT_ROOT}/include)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib")
-
-INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR})
-
-FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(LIBMCT)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n"
-  "        DESTINATION ${LIBMCT_DST_DIR})\n")
-
-ExternalProject_Add(
-    ${LIBMCT_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${LIBMCT_SOURCE_DIR}
-    DOWNLOAD_DIR          ${LIBMCT_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz
-                          && tar zxvf ${LIBMCT_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT}
-)
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
-    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
-    add_library(libmct STATIC ${dummyfile})
-else()
-    add_library(libmct INTERFACE)
-endif()
-
-ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake
deleted file mode 100644
index 69cdba7c5921f14a87172d95791332e364045b26..0000000000000000000000000000000000000000
--- a/cmake/external/libxsmm.cmake
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF)
-
-IF(NOT WITH_LIBXSMM)
-    return()
-ENDIF()
-
-IF(WIN32 OR APPLE)
-    MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet.")
-    SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
-    return()
-ENDIF()
-
-INCLUDE (ExternalProject)
-
-SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm)
-SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm)
-SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE)
-SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE)
-SET(LIBXSMM_LIBS        "${LIBXSMM_LIBRARY_DIR}/libxsmm.a"
-                        "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
-
-ExternalProject_Add(
-    extern_libxsmm
-    GIT_REPOSITORY  "https://github.com/hfp/libxsmm.git"
-    GIT_TAG         "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
-    PREFIX          ${LIBXSMM_SOURCES_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_IN_SOURCE 1
-    BUILD_COMMAND   $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install
-    INSTALL_COMMAND ""
-)
-ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a")
-SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
-
-MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
-include_directories(${LIBXSMM_INCLUDE_DIR})
-ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
-ADD_DEPENDENCIES(libxsmm extern_libxsmm)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
deleted file mode 100644
index 0ca37f506f61568bcf9eb76e86e5a6074b8043d1..0000000000000000000000000000000000000000
--- a/cmake/external/mkldnn.cmake
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_MKLDNN})
-  return()
-ENDIF(NOT ${WITH_MKLDNN})
-
-INCLUDE(ExternalProject)
-
-SET(MKLDNN_PROJECT        "extern_mkldnn")
-SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
-SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
-SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
-
-IF(APPLE)
-    MESSAGE(WARNING
-        "Mac is not supported with MKLDNN in Paddle yet."
-        "Force WITH_MKLDNN=OFF")
-    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE)
-    return()
-ENDIF()
-
-# Introduce variables:
-# * CMAKE_INSTALL_LIBDIR
-INCLUDE(GNUInstallDirs)
-SET(LIBDIR "lib")
-if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
-  SET(LIBDIR "lib64")
-endif()
-
-MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/l${LIBDIR} to runtime path")
-SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}")
-
-INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
-
-IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
-    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
-    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
-ELSE()
-    MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
-ENDIF()
-
-IF(NOT WIN32)
-    SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
-    SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
-    SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
-    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
-ELSE()
-    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
-ENDIF(NOT WIN32)
-
-ExternalProject_Add(
-    ${MKLDNN_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    DEPENDS             ${MKLDNN_DEPENDS}
-    GIT_REPOSITORY      "https://github.com/intel/mkl-dnn.git"
-    GIT_TAG             "aef88b7c233f48f8b945da310f1b973da31ad033"
-    PREFIX              ${MKLDNN_SOURCES_DIR}
-    UPDATE_COMMAND      ""
-    CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-    CMAKE_ARGS          -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS          -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-    CMAKE_ARGS          -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-    CMAKE_ARGS          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
-    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
-    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
-    CMAKE_ARGS          -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
-    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-                        -DMKLROOT:PATH=${MKLML_ROOT}
-)
-if(WIN32)
-    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
-else(WIN32)
-    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
-endif(WIN32)
-
-ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
-ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
-add_definitions(-DPADDLE_WITH_MKLDNN)
-
-# generate a static dummy target to track mkldnn dependencies
-# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
-SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c)
-FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-ADD_LIBRARY(mkldnn STATIC ${dummyfile})
-TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB})
-ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-
-# copy the real so.0 lib to install dir
-# it can be directly contained in wheel or capi
-if(WIN32)
-    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
-else(WIN32)
-    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
-    ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
-            COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-            DEPENDS mkldnn shared_mkldnn)
-endif(WIN32)
-ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
-ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
deleted file mode 100644
index 17556afec8dfc6a4bfd4fd321c6b6c521bf3bb1d..0000000000000000000000000000000000000000
--- a/cmake/external/mklml.cmake
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_MKLML})
-  return()
-ENDIF(NOT ${WITH_MKLML})
-
-IF(APPLE)
-    MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
-    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
-    return()
-ENDIF()
-
-INCLUDE(ExternalProject)
-SET(MKLML_DST_DIR       "mklml")
-SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
-SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
-SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
-SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
-
-SET(TIME_VERSION "2019.0.1.20181227")
-IF(WIN32)
-    SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
-    SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
-    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
-    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
-    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
-    SET(MKLML_SHARED_LIB_DEPS     ${MKLML_LIB_DIR}/msvcr120.dll)
-    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5md.dll)
-ELSE()
-    #TODO(intel-huying):
-    #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
-    SET(MKLML_VER "csrmm2_mklml_lnx_2019.0.2" CACHE STRING "" FORCE)
-    SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
-    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
-    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)
-    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5.so)
-ENDIF()
-
-SET(MKLML_PROJECT       "extern_mklml")
-MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
-SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
-SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
-
-ExternalProject_Add(
-    ${MKLML_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                 ${MKLML_SOURCE_DIR}
-    URL                    ${MKLML_URL}
-    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
-    DOWNLOAD_NO_PROGRESS  1
-    CONFIGURE_COMMAND     ""
-    BUILD_COMMAND         ""
-    UPDATE_COMMAND ""
-    INSTALL_COMMAND
-        ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} &&
-        ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR}
-)
-
-INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
-
-ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
-ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
deleted file mode 100644
index d28cc1c373b2f9ac6f854403921b8d1c32b4aab0..0000000000000000000000000000000000000000
--- a/cmake/external/ngraph.cmake
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-add_library(ngraph INTERFACE)
-
-IF(WIN32 OR APPLE)
-    MESSAGE(WARNING
-        "Windows or Mac is not supported with nGraph in Paddle yet."
-        "Force WITH_NGRAPH=OFF")
-    SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph in Windows and MacOS" FORCE)
-ENDIF()
-
-IF(${WITH_NGRAPH} AND NOT ${WITH_MKLDNN})
-    MESSAGE(WARNING
-        "nGraph needs mkl-dnn to be enabled."
-        "Force WITH_NGRAPH=OFF")
-    SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph if mkl-dnn is disabled" FORCE)
-ENDIF()
-
-IF(NOT ${WITH_NGRAPH})
-    return()
-ENDIF()
-
-INCLUDE(GNUInstallDirs)
-
-INCLUDE(ExternalProject)
-
-SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_GIT_TAG         "e26d602a756f5f83e6c8220f910b61d7089fa951")
-SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
-SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
-SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
-SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
-SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
-SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    SET(NGRAPH_TBB_LIB_NAME    libtbb_debug.so.2)
-else()
-    SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
-endif()
-SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
-SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
-SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
-SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
-
-ExternalProject_Add(
-    ${NGRAPH_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    DEPENDS                  ${MKLDNN_PROJECT} ${MKLML_PROJECT}
-    GIT_REPOSITORY           ${NGRAPH_GIT_REPO}
-    GIT_TAG                  ${NGRAPH_GIT_TAG}
-    PREFIX                   ${NGRAPH_SOURCES_DIR}
-    UPDATE_COMMAND           ""
-    CMAKE_GENERATOR          ${CMAKE_GENERATOR}
-    CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM}
-    CMAKE_GENERATOR_TOOLSET  ${CMAKE_GENERATOR_TOOLSET}
-    CMAKE_ARGS               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS               -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS               -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR}
-    CMAKE_ARGS               -DNGRAPH_UNIT_TEST_ENABLE=FALSE
-    CMAKE_ARGS               -DNGRAPH_TOOLS_ENABLE=FALSE
-    CMAKE_ARGS               -DNGRAPH_INTERPRETER_ENABLE=FALSE
-    CMAKE_ARGS               -DNGRAPH_DEX_ONLY=TRUE
-    CMAKE_ARGS               -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-    CMAKE_ARGS               -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
-    CMAKE_ARGS               -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
-    CMAKE_ARGS               -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
-    CMAKE_ARGS               -NGRAPH_USE_LEGACY_MKLDNN=TRUE
-)
-
-add_dependencies(ngraph ${NGRAPH_PROJECT})
-target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
-target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
-target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
deleted file mode 100644
index 1d40ad108fbfba69d86bfa16be6e46ddfbdc25da..0000000000000000000000000000000000000000
--- a/cmake/external/openblas.cmake
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-INCLUDE(cblas)
-
-IF(NOT ${CBLAS_FOUND})
-    INCLUDE(ExternalProject)
-
-    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
-    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
-
-    SET(CBLAS_LIBRARIES
-        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
-        CACHE FILEPATH "openblas library." FORCE)
-
-    ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
-
-    IF (WIN32)
-        SET(CBLAS_FOUND true)
-        MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
-    ENDIF(WIN32)
-
-    IF (NOT WIN32)
-    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
-    SET(OPENBLAS_COMMIT "v0.2.20")
-
-    IF(APPLE)
-        SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
-    ENDIF()
-    SET(OPTIONAL_ARGS "")
-    IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
-        SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
-    ENDIF()
-
-    SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
-    ExternalProject_Add(
-        extern_openblas
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
-        GIT_TAG             ${OPENBLAS_COMMIT}
-        PREFIX              ${CBLAS_SOURCES_DIR}
-        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
-        BUILD_IN_SOURCE     1
-        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
-        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR> 
-                            && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
-        UPDATE_COMMAND      ""
-        CONFIGURE_COMMAND   ""
-    )
-    ELSE(NOT WIN32)
-        SET(CBLAS_FOUND false)
-        SET(CBLAS_LIBRARIES
-            "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
-            CACHE FILEPATH "openblas library." FORCE)
-        INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}/openblas) # For openbals code to include its own headers.
-        INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install)
-        ExternalProject_Add(
-            extern_openblas
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
-            GIT_TAG            "v0.3.7"
-            PREFIX              ${CBLAS_SOURCES_DIR}
-            INSTALL_DIR         ${CBLAS_INSTALL_DIR}
-            BUILD_IN_SOURCE     0
-            UPDATE_COMMAND      ""
-            CMAKE_ARGS          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                                    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                                    -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
-                                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                                    -DBUILD_SHARED_LIBS=ON
-                                    -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT}
-                                    ${EXTERNAL_OPTIONAL_ARGS}
-                CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
-                                    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                                    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-            )
-        add_custom_command(TARGET extern_openblas POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX}  ${CBLAS_INSTALL_DIR}/lib )
-        ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
-        SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES})
-        ADD_DEPENDENCIES(openblas extern_openblas)
-    ENDIF(NOT WIN32)
-    SET(CBLAS_PROVIDER openblas)
-ENDIF(NOT ${CBLAS_FOUND})
-
-MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}")
-INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
-
-# FIXME(gangliao): generate cblas target to track all high performance
-# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
-SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
-FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
-ADD_LIBRARY(cblas STATIC ${dummyfile})
-
-IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-  TARGET_LINK_LIBRARIES(cblas dynload_mklml)
-ELSE()
-  TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
-ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-
-IF(WITH_LIBXSMM)
-  TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
-  ADD_DEPENDENCIES(cblas extern_libxsmm)
-ENDIF()
-
-IF(NOT ${CBLAS_FOUND})
-    ADD_DEPENDENCIES(cblas extern_openblas)
-ELSE()
-    IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-        ADD_DEPENDENCIES(cblas mklml)
-    ENDIF()
-ENDIF(NOT ${CBLAS_FOUND})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
deleted file mode 100644
index e746a7a50a8573b8b3c2e8f461cc03cd3906a0c0..0000000000000000000000000000000000000000
--- a/cmake/external/protobuf.cmake
+++ /dev/null
@@ -1,252 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
-IF(NOT WIN32)
-FIND_PACKAGE(Protobuf QUIET)
-ENDIF(NOT WIN32)
-macro(UNSET_VAR VAR_NAME)
-    UNSET(${VAR_NAME} CACHE)
-    UNSET(${VAR_NAME})
-endmacro()
-
-UNSET_VAR(PROTOBUF_INCLUDE_DIR)
-UNSET_VAR(PROTOBUF_FOUND)
-UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
-UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
-UNSET_VAR(PROTOBUF_LITE_LIBRARY)
-UNSET_VAR(PROTOBUF_LIBRARY)
-UNSET_VAR(PROTOBUF_INCLUDE_DIR)
-UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
-function(protobuf_generate_python SRCS)
-    # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
-    if(NOT ARGN)
-        message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
-        return()
-    endif()
-
-    if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
-        # Create an include path for each file specified
-        foreach(FIL ${ARGN})
-            get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-            get_filename_component(ABS_PATH ${ABS_FIL} PATH)
-            list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-            if(${_contains_already} EQUAL -1)
-                list(APPEND _protobuf_include_path -I ${ABS_PATH})
-            endif()
-        endforeach()
-    else()
-        set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
-    endif()
-    if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
-        set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
-    endif()
-
-    if(DEFINED Protobuf_IMPORT_DIRS)
-        foreach(DIR ${Protobuf_IMPORT_DIRS})
-            get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
-            list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-            if(${_contains_already} EQUAL -1)
-                list(APPEND _protobuf_include_path -I ${ABS_PATH})
-            endif()
-        endforeach()
-    endif()
-
-    set(${SRCS})
-    foreach(FIL ${ARGN})
-        get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-        get_filename_component(FIL_WE ${FIL} NAME_WE)
-        if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
-            get_filename_component(FIL_DIR ${FIL} DIRECTORY)
-            if(FIL_DIR)
-                set(FIL_WE "${FIL_DIR}/${FIL_WE}")
-            endif()
-        endif()
-        list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
-        add_custom_command(
-                OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
-                COMMAND  ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
-                DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE}
-                COMMENT "Running Python protocol buffer compiler on ${FIL}"
-                VERBATIM )
-    endforeach()
-
-    set(${SRCS} ${${SRCS}} PARENT_SCOPE)
-endfunction()
-
-# Print and set the protobuf library information,
-# finish this cmake process and exit from this file.
-macro(PROMPT_PROTOBUF_LIB)
-    SET(protobuf_DEPS ${ARGN})
-
-    MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
-    MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
-    MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
-    MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
-    MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
-    INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
-
-    # Assuming that all the protobuf libraries are of the same type.
-    IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX})
-        SET(protobuf_LIBTYPE STATIC)
-    ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
-        SET(protobuf_LIBTYPE SHARED)
-    ELSE()
-        MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}")
-    ENDIF()
-
-    ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY})
-
-    ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY})
-
-    ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY})
-
-    ADD_EXECUTABLE(protoc IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE})
-    # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
-    # make `protobuf_generate_cpp` happy.
-    SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
-
-    FOREACH(dep ${protobuf_DEPS})
-        ADD_DEPENDENCIES(protobuf ${dep})
-        ADD_DEPENDENCIES(protobuf_lite ${dep})
-        ADD_DEPENDENCIES(libprotoc ${dep})
-        ADD_DEPENDENCIES(protoc ${dep})
-    ENDFOREACH()
-
-    RETURN()
-endmacro()
-macro(SET_PROTOBUF_VERSION)
-    EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
-    STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
-endmacro()
-
-set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
-IF (WIN32)
-    SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
-ENDIF(WIN32)
-
-if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-
-    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
-    if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
-        message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
-        SET(PROTOBUF_FOUND true)
-        SET_PROTOBUF_VERSION()
-        PROMPT_PROTOBUF_LIB()
-    else()
-        message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}")
-    endif()
-endif()
-
-FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
-    STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}")
-    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME})
-    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME})
-
-    SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
-    SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
-    SET(${TARGET_NAME}_LITE_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}"
-         PARENT_SCOPE)
-    SET(${TARGET_NAME}_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}"
-         PARENT_SCOPE)
-    SET(${TARGET_NAME}_PROTOC_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}"
-         PARENT_SCOPE)
-    SET(${TARGET_NAME}_PROTOC_EXECUTABLE
-        "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}"
-         PARENT_SCOPE)
-
-    SET(OPTIONAL_CACHE_ARGS "")
-    SET(OPTIONAL_ARGS "")
-    IF(BUILD_FOR_HOST)
-        SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF")
-    ELSE()
-        SET(OPTIONAL_ARGS
-            "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
-            "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
-            "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
-            "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
-            "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}"
-            "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
-            "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}"
-            "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"
-            "-Dprotobuf_WITH_ZLIB=ON"
-            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
-            ${EXTERNAL_OPTIONAL_ARGS})
-        SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
-    ENDIF()
-    IF(WIN32)
-        SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
-    ENDIF()
-
-    SET(PROTOBUF_REPO "https://github.com/protocolbuffers/protobuf.git")
-    SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
-
-    ExternalProject_Add(
-        ${TARGET_NAME}
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        PREFIX          ${PROTOBUF_SOURCES_DIR}
-        UPDATE_COMMAND  ""
-        DEPENDS         zlib
-        GIT_REPOSITORY  ${PROTOBUF_REPO}
-        GIT_TAG         ${PROTOBUF_TAG}
-        CONFIGURE_COMMAND
-        ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
-            ${OPTIONAL_ARGS}
-            -Dprotobuf_BUILD_TESTS=OFF
-            -DCMAKE_SKIP_RPATH=ON
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-            -DCMAKE_INSTALL_LIBDIR=lib
-            -DBUILD_SHARED_LIBS=OFF
-            -Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}
-        CMAKE_CACHE_ARGS
-            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-            -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-            ${OPTIONAL_CACHE_ARGS}
-    )
-ENDFUNCTION()
-
-SET(PROTOBUF_VERSION 3.1.0)
-
-IF(NOT PROTOBUF_FOUND)
-    build_protobuf(extern_protobuf FALSE)
-
-    SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR}
-        CACHE PATH "protobuf include directory." FORCE)
-    SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY}
-        CACHE FILEPATH "protobuf lite library." FORCE)
-    SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY}
-        CACHE FILEPATH "protobuf library." FORCE)
-    SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
-        CACHE FILEPATH "protoc library." FORCE)
-
-    SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
-        CACHE FILEPATH "protobuf executable." FORCE)
-    PROMPT_PROTOBUF_LIB(extern_protobuf)
-ENDIF(NOT PROTOBUF_FOUND)
diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake
deleted file mode 100755
index 983d13e3f2d3ecacdd7b9c558ba8d7b817413284..0000000000000000000000000000000000000000
--- a/cmake/external/pslib.cmake
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_PSLIB})
-  return()
-ENDIF(NOT ${WITH_PSLIB})
-
-IF(WIN32 OR APPLE)
-    MESSAGE(WARNING
-        "Windows or Mac is not supported with PSLIB in Paddle yet."
-        "Force WITH_PSLIB=OFF")
-    SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE)
-    return()
-ENDIF()
-
-INCLUDE(ExternalProject)
-
-SET(PSLIB_PROJECT       "extern_pslib")
-IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(PSLIB_VER "0.1.1" CACHE STRING "" FORCE)
-  SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE)
-  SET(PSLIB_URL "ftp://yq01-ps-201704-m12-tianqi026.yq01.baidu.com/home/work/pslib_online/pslib.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}")
-SET(PSLIB_SOURCE_DIR    "${THIRD_PARTY_PATH}/pslib")
-SET(PSLIB_DOWNLOAD_DIR  "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}")
-SET(PSLIB_DST_DIR       "pslib")
-SET(PSLIB_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(PSLIB_INSTALL_DIR   ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR})
-SET(PSLIB_ROOT          ${PSLIB_INSTALL_DIR})
-SET(PSLIB_INC_DIR       ${PSLIB_ROOT}/include)
-SET(PSLIB_LIB_DIR       ${PSLIB_ROOT}/lib)
-SET(PSLIB_LIB           ${PSLIB_LIB_DIR}/libps.so)
-SET(PSLIB_IOMP_LIB      ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib")
-
-INCLUDE_DIRECTORIES(${PSLIB_INC_DIR})
-
-FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(PSLIB)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n"
-  "        DESTINATION ${PSLIB_DST_DIR})\n")
-
-ExternalProject_Add(
-    ${PSLIB_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${PSLIB_SOURCE_DIR}
-    DOWNLOAD_DIR          ${PSLIB_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz
-                          && tar zxvf ${PSLIB_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT}
-)
-
-ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
-ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake
deleted file mode 100644
index c1d63089bb5462695735ee2ea7ceb0ba30c598b3..0000000000000000000000000000000000000000
--- a/cmake/external/pslib_brpc.cmake
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_PSLIB_BRPC})
-  return()
-ENDIF(NOT ${WITH_PSLIB_BRPC})
-
-IF(WIN32 OR APPLE)
-    MESSAGE(WARNING
-        "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet."
-        "Force WITH_PSLIB_BRPC=OFF")
-    SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE)
-    return()
-ENDIF()
-
-INCLUDE(ExternalProject)
-
-SET(PSLIB_BRPC_PROJECT       "extern_pslib_brpc")
-IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE)
-  SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE)
-  SET(PSLIB_BRPC_URL "https://pslib.bj.bcebos.com/pslib_brpc.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}")
-SET(PSLIB_BRPC_SOURCE_DIR    "${THIRD_PARTY_PATH}/pslib_brpc")
-SET(PSLIB_BRPC_DOWNLOAD_DIR  "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}")
-SET(PSLIB_BRPC_DST_DIR       "pslib_brpc")
-SET(PSLIB_BRPC_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(PSLIB_BRPC_INSTALL_DIR   ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR})
-SET(PSLIB_BRPC_ROOT          ${PSLIB_BRPC_INSTALL_DIR})
-SET(PSLIB_BRPC_INC_DIR       ${PSLIB_BRPC_ROOT}/include)
-SET(PSLIB_BRPC_LIB_DIR       ${PSLIB_BRPC_ROOT}/lib)
-SET(PSLIB_BRPC_LIB           ${PSLIB_BRPC_LIB_DIR}/libbrpc.a)
-SET(PSLIB_BRPC_IOMP_LIB      ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib")
-
-INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR})
-
-FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(PSLIB_BRPC)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n"
-  "        DESTINATION ${PSLIB_BRPC_DST_DIR})\n")
-
-ExternalProject_Add(
-    ${PSLIB_BRPC_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${PSLIB_BRPC_SOURCE_DIR}
-    DOWNLOAD_DIR          ${PSLIB_BRPC_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz
-                          && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT}
-)
-
-ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
-ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
deleted file mode 100644
index 3a10ea945d3d1a3fa88954196905872b2668d5a0..0000000000000000000000000000000000000000
--- a/cmake/external/pybind11.cmake
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if(NOT WITH_PYTHON)
-    return()
-endif()
-
-include(ExternalProject)
-
-set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
-
-include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
-
-ExternalProject_Add(
-        extern_pybind
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY  "https://github.com/pybind/pybind11.git"
-        GIT_TAG         "v2.2.4"
-        PREFIX          ${PYBIND_SOURCE_DIR}
-        UPDATE_COMMAND  ""
-        CONFIGURE_COMMAND ""
-        BUILD_COMMAND     ""
-        INSTALL_COMMAND   ""
-        TEST_COMMAND      ""
-)
-
-if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
-    add_library(pybind STATIC ${dummyfile})
-else()
-    add_library(pybind INTERFACE)
-endif()
-
-add_dependencies(pybind extern_pybind)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
deleted file mode 100644
index 623c53f4f75bbd217c157bcdda0cb12c510269ee..0000000000000000000000000000000000000000
--- a/cmake/external/python.cmake
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT WITH_PYTHON)
-    return()
-ENDIF()
-
-INCLUDE(python_module)
-
-FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED)
-FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED)
-
-if(WIN32)
-    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-"from distutils import sysconfig as s;import sys;import struct;
-print(sys.prefix);
-print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
-"
-            RESULT_VARIABLE _PYTHON_SUCCESS
-            OUTPUT_VARIABLE _PYTHON_VALUES
-            ERROR_VARIABLE _PYTHON_ERROR_VALUE)
-
-    if(NOT _PYTHON_SUCCESS MATCHES 0)
-        set(PYTHONLIBS_FOUND FALSE)
-        return()
-    endif()
-
-    # Convert the process output into a list
-    string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
-    string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
-    list(GET _PYTHON_VALUES 0 PYTHON_PREFIX)
-    list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX)
-
-    # Make sure all directory separators are '/'
-    string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
-
-    set(PYTHON_LIBRARY
-            "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
-
-    # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
-    # original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
-    if(NOT EXISTS "${PYTHON_LIBRARY}")
-        get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
-        set(PYTHON_LIBRARY
-                "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
-    endif()
-
-    # raise an error if the python libs are still not found.
-    if(NOT EXISTS "${PYTHON_LIBRARY}")
-        message(FATAL_ERROR "Python libraries not found")
-    endif()
-    SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
-endif(WIN32)
-
-# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
-ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
-
-SET(py_env "")
-IF(PYTHONINTERP_FOUND)
-    find_python_module(pip REQUIRED)
-    find_python_module(numpy REQUIRED)
-    find_python_module(wheel REQUIRED)
-    find_python_module(google.protobuf REQUIRED)
-    FIND_PACKAGE(NumPy REQUIRED)
-    IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
-        MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
-        "please use pip to upgrade protobuf. pip install -U protobuf")
-    ENDIF()
-ENDIF(PYTHONINTERP_FOUND)
-INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
-INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake
deleted file mode 100644
index 914c06491890574bcdf4374d8e0fd5498e780113..0000000000000000000000000000000000000000
--- a/cmake/external/rocprim.cmake
+++ /dev/null
@@ -1,44 +0,0 @@
-if (NOT WITH_AMD_GPU)
-    return()
-endif()
-
-# rocprim is "ROCm Parallel Primitives" for short.
-# It is a header-only library providing HIP and HC parallel primitives
-# for developing performant GPU-accelerated code on AMD ROCm platform.
-
-if("x${HCC_HOME}" STREQUAL "x")
-  set(HCC_HOME "/opt/rocm/hcc")
-endif()
-
-INCLUDE(ExternalProject)
-
-SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim)
-SET(ROCPRIM_INSTALL_DIR  ${THIRD_PARTY_PATH}/install/rocprim)
-SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include)
-
-ExternalProject_Add(
-    extern_rocprim
-    GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
-    GIT_TAG        5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc 
-    PREFIX         ${ROCPRIM_SOURCE_DIR}
-    UPDATE_COMMAND  ""
-    CMAKE_ARGS     -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc
-    CMAKE_ARGS     -DONLY_INSTALL=ON
-    CMAKE_ARGS     -DBUILD_TEST=OFF
-    CMAKE_ARGS     -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR}
-
-    INSTALL_DIR    ${ROCPRIM_INSTALL_DIR}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-)
-
-INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR})
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c)
-    file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";")
-    add_library(rocprim STATIC ${dummyfile})
-else()
-    add_library(rocprim INTERFACE)
-endif()
-
-add_dependencies(rocprim extern_rocprim)
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
deleted file mode 100644
index 1f56bc7ab056ef0dd95d603ebe3461ef044b2a79..0000000000000000000000000000000000000000
--- a/cmake/external/threadpool.cmake
+++ /dev/null
@@ -1,28 +0,0 @@
-INCLUDE(ExternalProject)
-
-SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
-SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
-INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
-
-ExternalProject_Add(
-    extern_threadpool
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/progschj/ThreadPool.git"
-    GIT_TAG         9a42ec1329f259a5f4881a291db1dcb8f2ad9040
-    PREFIX          ${THREADPOOL_SOURCE_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   ""
-    TEST_COMMAND      ""
-)
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)
-    file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
-    add_library(simple_threadpool STATIC ${dummyfile})
-else()
-    add_library(simple_threadpool INTERFACE)
-endif()
-
-add_dependencies(simple_threadpool extern_threadpool)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
deleted file mode 100644
index 5fc46ae8eb8623ee6677cea7e62ce0329c57e1f2..0000000000000000000000000000000000000000
--- a/cmake/external/warpctc.cmake
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
-SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
-
-SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
-    CACHE PATH "Warp-ctc Directory" FORCE)
-# Used in unit test test_WarpCTCLayer
-SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
-    CACHE PATH "Warp-ctc Library Directory" FORCE)
-
-IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32)
-    SET(USE_OMP OFF)
-ELSE()
-    SET(USE_OMP ON)
-ENDIF()
-
-IF(WIN32)
-    SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git")
-ELSE()
-    SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git")
-ENDIF()
-
-ExternalProject_Add(
-    extern_warpctc
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
-    PREFIX          ${WARPCTC_SOURCES_DIR}
-    UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                    -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-                    -DWITH_GPU=${WITH_GPU}
-                    -DWITH_OMP=${USE_OMP}
-                    -DWITH_TORCH=OFF
-                    -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-                    -DBUILD_SHARED=ON
-                    -DBUILD_TESTS=OFF
-                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                    ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-)
-IF(WIN32)
-    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-            CACHE FILEPATH "Warp-ctc Library" FORCE)
-else(WIN32)
-    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-            CACHE FILEPATH "Warp-ctc Library" FORCE)
-ENDIF(WIN32)
-
-MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
-INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
-
-ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
-ADD_DEPENDENCIES(warpctc extern_warpctc)
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
deleted file mode 100644
index 1d61154c0d45dea795902d6544deb796693db263..0000000000000000000000000000000000000000
--- a/cmake/external/xbyak.cmake
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set(WITH_XBYAK ON)
-if(WIN32 OR APPLE)
-    SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
-    return()
-endif()
-
-include(ExternalProject)
-
-set(XBYAK_PROJECT       extern_xbyak)
-set(XBYAK_PREFIX_DIR    ${THIRD_PARTY_PATH}/xbyak)
-set(XBYAK_INSTALL_ROOT  ${THIRD_PARTY_PATH}/install/xbyak)
-set(XBYAK_INC_DIR       ${XBYAK_INSTALL_ROOT}/include)
-
-include_directories(${XBYAK_INC_DIR})
-include_directories(${XBYAK_INC_DIR}/xbyak)
-
-add_definitions(-DPADDLE_WITH_XBYAK)
-
-# xbyak options
-add_definitions(-DXBYAK64)
-add_definitions(-DXBYAK_NO_OP_NAMES)
-
-ExternalProject_Add(
-    ${XBYAK_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    DEPENDS             ""
-    GIT_REPOSITORY      "https://github.com/herumi/xbyak.git"
-    GIT_TAG             "v5.661"  # Jul 26th
-    PREFIX              ${XBYAK_PREFIX_DIR}
-    UPDATE_COMMAND      ""
-    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
-)
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
-    file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
-    add_library(xbyak STATIC ${dummyfile})
-else()
-    add_library(xbyak INTERFACE)
-endif()
-
-add_dependencies(xbyak ${XBYAK_PROJECT})
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
deleted file mode 100644
index 262d47f6fd409e6bb6b5402a646c87d8a3dbb4fe..0000000000000000000000000000000000000000
--- a/cmake/external/xxhash.cmake
+++ /dev/null
@@ -1,68 +0,0 @@
-INCLUDE(ExternalProject)
-
-set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
-set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
-set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
-
-IF(WITH_STATIC_LIB)
-  SET(BUILD_CMD make lib)
-ELSE()
-  IF(APPLE)
-    SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
-  ELSE(APPLE)
-    SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
-  ENDIF(APPLE)
-ENDIF()
-
-if(WIN32)
-  ExternalProject_Add(
-          extern_xxhash
-          ${EXTERNAL_PROJECT_LOG_ARGS}
-          GIT_REPOSITORY  "https://github.com/Cyan4973/xxHash"
-          GIT_TAG         "v0.6.5"
-          PREFIX          ${XXHASH_SOURCE_DIR}
-          DOWNLOAD_NAME   "xxhash"
-          UPDATE_COMMAND  ""
-          BUILD_IN_SOURCE 1
-          PATCH_COMMAND
-          CONFIGURE_COMMAND
-          ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial
-          -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR}
-          -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-          -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-          -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-          -DBUILD_XXHSUM=OFF
-          -DCMAKE_GENERATOR_PLATFORM=x64
-          -DBUILD_SHARED_LIBS=OFF
-          ${OPTIONAL_CACHE_ARGS}
-          TEST_COMMAND      ""
-  )
-else()
-  ExternalProject_Add(
-      extern_xxhash
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      GIT_REPOSITORY  "https://github.com/Cyan4973/xxHash"
-      GIT_TAG         "v0.6.5"
-      PREFIX          ${XXHASH_SOURCE_DIR}
-      DOWNLOAD_NAME   "xxhash"
-      UPDATE_COMMAND  ""
-      CONFIGURE_COMMAND ""
-      BUILD_IN_SOURCE 1
-      PATCH_COMMAND
-      BUILD_COMMAND     ${BUILD_CMD}
-      INSTALL_COMMAND   export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
-      TEST_COMMAND      ""
-  )
-endif()
-
-if (WIN32)
-  set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib")
-else()
-  set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
-endif ()
-INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
-
-add_library(xxhash STATIC IMPORTED GLOBAL)
-set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
-include_directories(${XXHASH_INCLUDE_DIR})
-add_dependencies(xxhash extern_xxhash)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
deleted file mode 100644
index 58881ac2206d844acf56c3dd67138ca18f59eb49..0000000000000000000000000000000000000000
--- a/cmake/external/zlib.cmake
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(ZLIB_SOURCES_DIR ${THIRD_PARTY_PATH}/zlib)
-SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
-SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE)
-SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
-
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
-INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
-
-ExternalProject_Add(
-    extern_zlib
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
-    GIT_TAG         "v1.2.8"
-    PREFIX          ${ZLIB_SOURCES_DIR}
-    UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                    -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
-                    -DBUILD_SHARED_LIBS=OFF
-                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DCMAKE_MACOSX_RPATH=ON
-                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                    ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
-                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-)
-IF(WIN32)
-  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
-ELSE(WIN32)
-  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
-ENDIF(WIN32)
-
-ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
-ADD_DEPENDENCIES(zlib extern_zlib)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
deleted file mode 100644
index cfd5e177d921e9b07371cbc3e36bcc849edb59b2..0000000000000000000000000000000000000000
--- a/cmake/flags.cmake
+++ /dev/null
@@ -1,217 +0,0 @@
-# Setting Paddle Compile Flags
-include(CheckCXXCompilerFlag)
-include(CheckCCompilerFlag)
-include(CheckCXXSymbolExists)
-include(CheckTypeSize)
-
-function(CheckCompilerCXX11Flag)
-    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
-            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
-        endif()
-    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
-        # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
-        # https://gist.github.com/yamaya/2924292
-        if(APPLE)  # cmake < 3.0 compiler id "Clang" on Mac OS X
-            if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1)
-                message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
-            endif()
-        else()
-            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
-                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
-            endif()
-        endif()
-    endif()
-endfunction()
-
-CheckCompilerCXX11Flag()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-# safe_set_flag
-#
-# Set a compile flag only if compiler is support
-# is_c: is C flag or C++ flag, bool type.
-# src_list: The list name which the flag name will be append to.
-# flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc
-# rest arguments: not used.
-function(safe_set_flag is_c src_list flag_name)
-    string(REPLACE "-" "_" safe_name ${flag_name})
-    string(REPLACE "=" "_" safe_name ${safe_name})
-
-    if(${flag_name} MATCHES "fsanitize")
-        set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
-        set(CMAKE_REQUIRED_FLAGS ${flag_name})
-    endif()
-
-    if(is_c)
-        CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
-        set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
-    else()
-        CHECK_CXX_COMPILER_FLAG(${flag_name} CXX_COMPILER_SUPPORT_FLAG_${safe_name})
-        set(safe_name CXX_COMPILER_SUPPORT_FLAG_${safe_name})
-    endif()
-    if(${safe_name})
-        set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
-    endif()
-
-    if(${flag_name} MATCHES "fsanitize")
-        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-    endif()
-endfunction()
-
-# helper macro to set cflag
-macro(safe_set_cflag src_list flag_name)
-    safe_set_flag(ON ${src_list} ${flag_name})
-endmacro()
-
-# helper macro to set cxxflag
-macro(safe_set_cxxflag src_list flag_name)
-    safe_set_flag(OFF ${src_list} ${flag_name})
-endmacro()
-
-# helper macro to set nvcc flag
-macro(safe_set_nvflag flag_name)
-    string(REPLACE "-" "_" safe_name ${flag_name})
-    string(REPLACE "=" "_" safe_name ${safe_name})
-    CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
-    set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
-    if(${safe_name})
-        LIST(APPEND CUDA_NVCC_FLAGS -Xcompiler ${flag_name})
-    endif()
-endmacro()
-
-macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
-    if (BUILD_SHARED_LIBS) 
-        return() # if build shared libs, the flags keep same with '/MD'
-    endif(BUILD_SHARED_LIBS)
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-endmacro()
-
-CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
-if(NOT UINT64_MAX_EXISTS)
-  set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
-  CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE)
-  if(UINT64_MAX_EXISTS_HERE)
-    set(CMAKE_REQUIRED_DEFINITIONS)
-    add_definitions(-D__STDC_LIMIT_MACROS)
-  else()
-    message(FATAL_ERROR "Cannot find symbol UINT64_MAX")
-  endif()
-endif()
-
-SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")
-CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND)
-CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND)
-if(SPINLOCK_FOUND)
-  add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK)
-endif(SPINLOCK_FOUND)
-if(BARRIER_FOUND)
-  add_definitions(-DPADDLE_USE_PTHREAD_BARRIER)
-endif(BARRIER_FOUND)
-SET(CMAKE_EXTRA_INCLUDE_FILES "")
-
-# Only one sanitizer is allowed in compile time
-string(TOLOWER "${SANITIZER_TYPE}" sanitizer_type)
-if(sanitizer_type STREQUAL "address")
-    set(fsanitize "-fsanitize=address")
-elseif(sanitizer_type STREQUAL "leak")
-    set(fsanitize "-fsanitize=leak")
-elseif(sanitizer_type STREQUAL "memory")
-    set(fsanitize "-fsanitize=memory")
-elseif(sanitizer_type STREQUAL "thread")
-    set(fsanitize "-fsanitize=thread")
-elseif(sanitizer_type STREQUAL "undefined")
-    set(fsanitize "-fsanitize=undefined")
-endif()
-
-# Common flags. the compiler flag used for C/C++ sources whenever release or debug
-# Do not care if this flag is support for gcc.
-
-# https://github.com/PaddlePaddle/Paddle/issues/12773
-if (NOT WIN32)
-set(COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer
-    -Werror
-    -Wall
-    -Wextra
-    -Wnon-virtual-dtor
-    -Wdelete-non-virtual-dtor
-    -Wno-unused-parameter
-    -Wno-unused-function
-    -Wno-error=literal-suffix
-    -Wno-error=sign-compare
-    -Wno-error=unused-local-typedefs
-    -Wno-error=parentheses-equality # Warnings in pybind11
-    -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
-    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
-    -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-    -Wimplicit-fallthrough=0 # Warning in tinyformat.h
-    ${fsanitize}
-)
-
-set(GPU_COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer
-    -Wnon-virtual-dtor
-    -Wdelete-non-virtual-dtor
-    -Wno-unused-parameter
-    -Wno-unused-function
-    -Wno-error=sign-compare
-    -Wno-error=literal-suffix
-    -Wno-error=unused-local-typedefs
-    -Wno-error=unused-function  # Warnings in Numpy Header.
-    -Wno-error=array-bounds # Warnings in Eigen::array
-)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
-endif(NOT WIN32)
-
-if (APPLE)
-    # On Mac OS X build fat binaries with x86_64 architectures by default.
-    set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
-    # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
-    set (COMMON_FLAGS -Wno-deprecated-register)
-endif(APPLE)
-
-if(LINUX)
-    set(GPU_COMMON_FLAGS
-        -Wall
-        -Wextra
-        -Werror
-        ${GPU_COMMON_FLAGS})
-endif(LINUX)
-
-if(UNIX AND NOT APPLE)
-  # except apple from nix*Os family
-  set(LINUX TRUE)
-endif(UNIX AND NOT APPLE)
-
-foreach(flag ${COMMON_FLAGS})
-    safe_set_cflag(CMAKE_C_FLAGS ${flag})
-    safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
-endforeach()
-
-foreach(flag ${GPU_COMMON_FLAGS})
-    safe_set_nvflag(${flag})
-endforeach()
-
-if(WIN32 AND MSVC_STATIC_CRT)
-# windows build turn off warnings.
-safe_set_static_flag()
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-        string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
-        set(flag_var "${flag_var} /w")
-    endforeach(flag_var)
-endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
deleted file mode 100644
index f6749c2ab858d2daee55ede8cddb8a18d522f90e..0000000000000000000000000000000000000000
--- a/cmake/generic.cmake
+++ /dev/null
@@ -1,814 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
-# generic.cmake defines CMakes functions that look like Bazel's
-# building rules (https://bazel.build/).
-#
-#
-# -------------------------------------------
-#     C++        CUDA C++       Go
-# -------------------------------------------
-# cc_library    nv_library   go_library
-# cc_binary     nv_binary    go_binary
-# cc_test       nv_test      go_test
-# -------------------------------------------
-#
-# To build a static library example.a from example.cc using the system
-#  compiler (like GCC):
-#
-#   cc_library(example SRCS example.cc)
-#
-# To build a static library example.a from multiple source files
-# example{1,2,3}.cc:
-#
-#   cc_library(example SRCS example1.cc example2.cc example3.cc)
-#
-# To build a shared library example.so from example.cc:
-#
-#   cc_library(example SHARED SRCS example.cc)
-#
-# To build a library using Nvidia's NVCC from .cu file(s), use the nv_
-# prefixed version:
-#
-#   nv_library(example SRCS example.cu)
-#
-# To specify that a library new_example.a depends on other libraies:
-#
-#   cc_library(new_example SRCS new_example.cc DEPS example)
-#
-# Static libraries can be composed of other static libraries:
-#
-#   cc_library(composed DEPS dependent1 dependent2 dependent3)
-#
-# To build an executable binary file from some source files and
-# dependent libraries:
-#
-#   cc_binary(example SRCS main.cc something.cc DEPS example1 example2)
-#
-# To build an executable binary file using NVCC, use the nv_ prefixed
-# version:
-#
-#   nv_binary(example SRCS main.cc something.cu DEPS example1 example2)
-#
-# To build a unit test binary, which is an executable binary with
-# GoogleTest linked:
-#
-#   cc_test(example_test SRCS example_test.cc DEPS example)
-#
-# To build a unit test binary using NVCC, use the nv_ prefixed version:
-#
-#   nv_test(example_test SRCS example_test.cu DEPS example)
-#
-# It is pretty often that executable and test binaries depend on
-# pre-defined external libaries like glog and gflags defined in
-# /cmake/external/*.cmake:
-#
-#   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
-#
-# To build a go static library using Golang, use the go_ prefixed version:
-#
-#   go_library(example STATIC)
-#
-# To build a go shared library using Golang, use the go_ prefixed version:
-#
-#   go_library(example SHARED)
-#
-
-# including binary directory for generated headers.
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-
-if(NOT APPLE)
-  find_package(Threads REQUIRED)
-  link_libraries(${CMAKE_THREAD_LIBS_INIT})
-  set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
-endif(NOT APPLE)
-
-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find all fluid modules is used for paddle fluid static library
-# for building inference libs
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
-
-
-function(common_link TARGET_NAME)
-  if (WITH_PROFILER)
-    target_link_libraries(${TARGET_NAME} gperftools::profiler)
-  endif()
-endfunction()
-
-
-# find all third_party modules is used for paddle static library
-# for reduce the dependency when building the inference libs.
-set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
-function(find_fluid_thirdparties TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "third_party" pos)
-  if(pos GREATER 1)
-    get_property(fluid_ GLOBAL PROPERTY FLUID_THIRD_PARTY)
-    set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}")
-  endif()
-endfunction(find_fluid_thirdparties)
-
-function(merge_static_libs TARGET_NAME)
-  set(libs ${ARGN})
-  list(REMOVE_DUPLICATES libs)
-
-  # Get all propagation dependencies from the merged libraries
-  foreach(lib ${libs})
-    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
-  endforeach()
-  if(libs_deps)
-    list(REMOVE_DUPLICATES libs_deps)
-  endif()
-
-  # To produce a library we need at least one source file.
-  # It is created by add_custom_command below and will helps
-  # also help to track dependencies.
-  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-
-  if(APPLE) # Use OSX's libtool to merge archives
-    # Make the generated dummy source file depended on all static input
-    # libs. If input lib changes,the source file is touched
-    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
-      DEPENDS ${libs})
-
-    # Generate dummy staic lib
-    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${target_SRCS})
-    target_link_libraries(${TARGET_NAME} ${libs_deps})
-
-    foreach(lib ${libs})
-      # Get the file names of the libraries to be merged
-      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-    endforeach()
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
-      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
-      )
-  endif(APPLE)
-  if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib
-    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
-
-    foreach(lib ${libs})
-      set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
-      set(objdir ${target_DIR}/${lib}.objdir)
-
-      add_custom_command(OUTPUT ${objdir}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
-        DEPENDS ${lib})
-
-      add_custom_command(OUTPUT ${objlistfile}
-        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
-        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
-        DEPENDS ${lib} ${objdir}
-        WORKING_DIRECTORY ${objdir})
-
-      list(APPEND target_OBJS "${objlistfile}")
-    endforeach()
-
-    # Make the generated dummy source file depended on all static input
-    # libs. If input lib changes,the source file is touched
-    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
-      DEPENDS ${libs} ${target_OBJS})
-
-    # Generate dummy staic lib
-    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${target_SRCS})
-    target_link_libraries(${TARGET_NAME} ${libs_deps})
-
-    # Get the file name of the generated library
-    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
-
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
-        COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
-        WORKING_DIRECTORY ${target_DIR})
-  endif(LINUX)
-  if(WIN32) # windows do not support gcc/nvcc combined compiling. Use msvc lib.exe to merge libs.
-    # Make the generated dummy source file depended on all static input
-    # libs. If input lib changes,the source file is touched
-    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
-      DEPENDS ${libs})
-
-    # Generate dummy staic lib
-    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${target_SRCS})
-    target_link_libraries(${TARGET_NAME} ${libs_deps})
-
-    foreach(lib ${libs})
-      # Get the file names of the libraries to be merged
-      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-    endforeach()
-    # msvc will put libarary in directory of "/Release/xxxlib" by default
-    #       COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
-      COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles}
-      )
-  endif(WIN32)
-endfunction(merge_static_libs)
-
-function(cc_library TARGET_NAME)
-  set(options STATIC static SHARED shared)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if(WIN32)
-      # add libxxx.lib prefix in windows
-      set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
-  endif(WIN32)
-  if(cc_library_SRCS)
-    if(cc_library_SHARED OR cc_library_shared) # build *.so
-      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
-    else()
-      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
-      find_fluid_modules(${TARGET_NAME})
-    endif()
-
-    if(cc_library_DEPS)
-      # Don't need link libwarpctc.so
-      if("${cc_library_DEPS};" MATCHES "warpctc;")
-        list(REMOVE_ITEM cc_library_DEPS warpctc)
-        add_dependencies(${TARGET_NAME} warpctc)
-      endif()
-      # Only deps libmklml.so, not link
-      if("${cc_library_DEPS};" MATCHES "mklml;")
-        list(REMOVE_ITEM cc_library_DEPS mklml)
-        if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
-          list(APPEND cc_library_DEPS dynload_mklml)
-        endif()
-        add_dependencies(${TARGET_NAME} mklml)
-        if(WIN32)
-          target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
-        else(WIN32)
-          target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
-        endif(WIN32)
-      endif()
-      # remove link to python, see notes at:
-      # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
-      if("${cc_library_DEPS};" MATCHES "python;")
-        list(REMOVE_ITEM cc_library_DEPS python)
-        add_dependencies(${TARGET_NAME} python)
-        if(WIN32)
-          target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
-        else()
-          target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
-        endif(WIN32)
-      endif()
-      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
-      common_link(${TARGET_NAME})
-    endif()
-
-    # cpplint code style
-    foreach(source_file ${cc_library_SRCS})
-      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-      endif()
-    endforeach()
-  else(cc_library_SRCS)
-    if(cc_library_DEPS)
-      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
-    else()
-      message(FATAL_ERROR "Please specify source files or libraries in cc_library(${TARGET_NAME} ...).")
-    endif()
-  endif(cc_library_SRCS)
-endfunction(cc_library)
-
-# The link operation under windows may exceeds the maximum characters limit, simply break the link command
-# into multiple link opeartion can fix that, say
-# original:
-#     lib /out:target.lib a.lib b.lib c.lib d.lib
-# after:
-#    1. lib /out:dummy_lib_1.lib a.lib b.lib
-#    2. lib /out:dummy_lib_2.lib c.lib d.lib
-#    1. lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib
-function(sep_library TARGET_NAME)
-  set(options STATIC static SHARED shared)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  set(dummy_index 1)
-  set(dummy_offset 1)
-  # the dummy target would be consisted of limit size libraries
-  set(dummy_limit 50)
-  list(LENGTH sep_library_DEPS sep_all_len)
-  foreach(v ${sep_library_DEPS})
-    list(APPEND dummy_list ${v})
-    list(LENGTH dummy_list listlen )
-    if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len}))
-      message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}")
-      cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list})
-      foreach(i ${dummy_list})
-        list(REMOVE_AT dummy_list 0)
-      endforeach()
-      list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index})
-      MATH(EXPR dummy_index "${dummy_index}+1")
-    endif()
-    MATH(EXPR dummy_offset "${dummy_offset}+1")
-  endforeach()
-  if(${sep_library_SHARED})
-    cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
-  else(${sep_library_SHARED})
-    cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
-  endif(${sep_library_SHARED})
-endfunction(sep_library)
-
-function(cc_binary TARGET_NAME)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  add_executable(${TARGET_NAME} ${cc_binary_SRCS})
-  if(cc_binary_DEPS)
-    target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
-    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
-    common_link(${TARGET_NAME})
-  endif()
-  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-  target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
-endfunction(cc_binary)
-
-function(cc_test_build TARGET_NAME)
-  if(WITH_TESTING)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    if(WIN32)
-      if("${cc_test_DEPS};" MATCHES "python;")
-        list(REMOVE_ITEM cc_test_DEPS python)
-        target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
-      endif()
-    endif(WIN32)
-    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
-    common_link(${TARGET_NAME})
-  endif()
-endfunction()
-
-function(cc_test_run TARGET_NAME)
-  if(WITH_TESTING)
-    set(oneValueArgs "")
-    set(multiValueArgs COMMAND ARGS)
-    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    add_test(NAME ${TARGET_NAME}
-	    COMMAND ${cc_test_COMMAND}
-	    ARGS ${cc_test_ARGS}
-            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
-  endif()
-endfunction()
-
-function(cc_test TARGET_NAME)
-  if(WITH_TESTING)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS)
-    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_test_build(${TARGET_NAME}
-	    SRCS ${cc_test_SRCS}
-	    DEPS ${cc_test_DEPS})
-    cc_test_run(${TARGET_NAME}
-	    COMMAND ${TARGET_NAME}
-	    ARGS ${cc_test_ARGS})
-  endif()
-endfunction(cc_test)
-
-function(nv_library TARGET_NAME)
-  if (WITH_GPU)
-    set(options STATIC static SHARED shared)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    if(nv_library_SRCS)
-      if (nv_library_SHARED OR nv_library_shared) # build *.so
-        cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
-      else()
-        cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
-        find_fluid_modules(${TARGET_NAME})
-      endif()
-      if (nv_library_DEPS)
-        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
-        target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
-      endif()
-      # cpplint code style
-      foreach(source_file ${nv_library_SRCS})
-        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        endif()
-      endforeach()
-    else(nv_library_SRCS)
-      if (nv_library_DEPS)
-        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
-      else()
-        message(FATAL "Please specify source file or library in nv_library.")
-      endif()
-    endif(nv_library_SRCS)
-  endif()
-endfunction(nv_library)
-
-function(nv_binary TARGET_NAME)
-  if (WITH_GPU)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
-    if(nv_binary_DEPS)
-      target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
-      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
-      common_link(${TARGET_NAME})
-    endif()
-  endif()
-endfunction(nv_binary)
-
-function(nv_test TARGET_NAME)
-  if (WITH_GPU AND WITH_TESTING)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
-    common_link(${TARGET_NAME})
-    add_test(${TARGET_NAME} ${TARGET_NAME})
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-  endif()
-endfunction(nv_test)
-
-function(hip_library TARGET_NAME)
-  if (WITH_AMD_GPU)
-    set(options STATIC static SHARED shared)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(_sources ${hip_library_SRCS})
-    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
-    if(_source_files)
-      list(REMOVE_ITEM _sources ${_source_files})
-    endif()
-    if(hip_library_SRCS)
-      if (hip_library_SHARED OR hip_library_shared) # build *.so
-        add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
-        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
-      else()
-        add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
-        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
-        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so)
-        find_fluid_modules(${TARGET_NAME})
-      endif()
-      if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
-        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
-        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
-        target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
-        list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
-      else()
-        target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
-      endif()
-      # cpplint code style
-      foreach(source_file ${hip_library_SRCS})
-        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        endif()
-      endforeach()
-    else(hip_library_SRCS)
-      if (hip_library_DEPS)
-        merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
-      else()
-        message(FATAL "Please specify source file or library in nv_library.")
-      endif()
-    endif(hip_library_SRCS)
-  endif()
-endfunction(hip_library)
-
-function(hip_binary TARGET_NAME)
-  if (WITH_AMD_GPU)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
-    if(hip_binary_DEPS)
-      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
-      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
-      common_link(${TARGET_NAME})
-    endif()
-  endif()
-endfunction(hip_binary)
-
-function(hip_test TARGET_NAME)
-  if (WITH_AMD_GPU AND WITH_TESTING)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(_sources ${hip_test_SRCS})
-    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
-    if(_source_files)
-      list(REMOVE_ITEM _sources ${_source_files})
-    endif()
-    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
-    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
-    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules})
-    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
-    common_link(${TARGET_NAME})
-    add_test(${TARGET_NAME} ${TARGET_NAME})
-  endif()
-endfunction(hip_test)
-
-function(go_library TARGET_NAME)
-  set(options STATIC static SHARED shared)
-  set(oneValueArgs "")
-  set(multiValueArgs DEPS)
-  cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-  if (go_library_SHARED OR go_library_shared)
-    set(BUILD_MODE "-buildmode=c-shared")
-    set(${TARGET_NAME}_LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
-  else()
-    set(BUILD_MODE "-buildmode=c-archive")
-    set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
-  endif()
-
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-
-  # This custom command will always run since it depends on a not
-  # existing file.
-  add_custom_command(
-    OUTPUT dummy_rebulid_${TARGET_NAME}
-    COMMAND cmake -E touch ${dummyfile}
-    )
-  # Create a custom target that depends on the custom command output
-  # file, so the custom command can be referenced as a dependency by
-  # `add_dependencies`.
-  add_custom_target(rebuild_${TARGET_NAME}
-    DEPENDS dummy_rebulid_${TARGET_NAME}
-    )
-
-  # Add dummy code to support `make target_name` under Terminal Command
-  file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";")
-  if (go_library_SHARED OR go_library_shared)
-    add_library(${TARGET_NAME} SHARED ${dummyfile})
-  else()
-    add_library(${TARGET_NAME} STATIC ${dummyfile})
-  endif()
-  if(go_library_DEPS)
-    add_dependencies(${TARGET_NAME} ${go_library_DEPS})
-    common_link(${TARGET_NAME})
-  endif(go_library_DEPS)
-
-  # The "source file" of the library is `${dummyfile}` which never
-  # change, so the target will never rebuild. Make the target depends
-  # on the custom command that touches the library "source file", so
-  # rebuild will always happen.
-  add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME})
-
-  set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}")
-
-  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
-  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-
-  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
-    # Golang build source code
-    COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-    -o "${${TARGET_NAME}_LIB_PATH}"
-    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
-    # must run under GOPATH
-    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
-  add_dependencies(${TARGET_NAME} go_vendor)
-endfunction(go_library)
-
-function(go_binary TARGET_NAME)
-  set(options OPTIONAL)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-
-  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
-    -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
-    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
-  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
-  install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
-endfunction(go_binary)
-
-function(go_test TARGET_NAME)
-  set(options OPTIONAL)
-  set(oneValueArgs "")
-  set(multiValueArgs DEPS)
-  cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
-  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
-    -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
-    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
-  add_test(NAME ${TARGET_NAME}
-    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-endfunction(go_test)
-
-# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support
-# Usage:
-#   paddle_protobuf_generate_cpp(<proto_srcs> <proto_hdrs> <proto_files>)
-
-function(paddle_protobuf_generate_cpp SRCS HDRS)
-  if(NOT ARGN)
-    message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files")
-    return()
-  endif()
-
-  set(${SRCS})
-  set(${HDRS})
-
-  foreach(FIL ${ARGN})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
-
-    set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
-    set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
-    list(APPEND ${SRCS} "${_protobuf_protoc_src}")
-    list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
-
-    add_custom_command(
-      OUTPUT "${_protobuf_protoc_src}"
-             "${_protobuf_protoc_hdr}"
-
-      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
-      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-      -I${CMAKE_CURRENT_SOURCE_DIR}
-      --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
-      DEPENDS ${ABS_FIL} protoc
-      COMMENT "Running C++ protocol buffer compiler on ${FIL}"
-      VERBATIM )
-  endforeach()
-
-  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
-  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
-  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
-endfunction()
-
-
-function(proto_library TARGET_NAME)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  set(proto_srcs)
-  set(proto_hdrs)
-  paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
-  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
-endfunction()
-
-function(py_proto_compile TARGET_NAME)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS)
-  cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  set(py_srcs)
-  protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
-  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs} protobuf)
-endfunction()
-
-function(py_test TARGET_NAME)
-  if(WITH_TESTING)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS ENVS)
-    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    if(WITH_COVERAGE)
-      add_test(NAME ${TARGET_NAME}
-               COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-               FLAGS_cpu_deterministic=true
-               PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
-               COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-               ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS} ${py_test_ARGS}
-               WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(NAME ${TARGET_NAME}
-               COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-               FLAGS_cpu_deterministic=true
-               PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
-               ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
-               WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
-
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
-  endif()
-endfunction()
-
-# grpc_library generate grpc code using grpc_cpp_plugin and protoc
-# then build the generated protobuf code and grpc code with your
-# implementation source codes together. Use SRCS argument for your
-# implementation source files and PROTO argument for your .proto
-# files.
-#
-# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
-
-function(grpc_library TARGET_NAME)
-  set(oneValueArgs PROTO)
-  set(multiValueArgs SRCS DEPS)
-  set(options "")
-  cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-  message(STATUS "generating grpc ${grpc_library_PROTO}")
-
-  get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
-  get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
-  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
-
-  #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but
-  # somehow it didn't. line 602 to 604 is to patching this. Leaving this here
-  # for now to enable dist CI.
-  paddle_protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
-  set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
-  set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
-  cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
-
-  add_custom_command(
-          OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
-          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-          ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
-          --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
-          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-          ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
-          "${ABS_PROTO}"
-          DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
-
-  # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
-  # as compiler warnings instead of error. Should try remove the warnings also.
-  set_source_files_properties(
-    ${grpc_grpc_srcs}
-    PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
-
-  set_source_files_properties(
-    ${grpc_library_SRCS}
-    PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
-endfunction()
-
-
-function(brpc_library TARGET_NAME)
-  set(oneValueArgs PROTO)
-  set(multiValueArgs SRCS DEPS)
-  set(options "")
-  cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-  message(STATUS "generating brpc ${brpc_library_PROTO}")
-
-  get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE)
-  get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE)
-  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
-
-  paddle_protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
-  cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
-  cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
-endfunction()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
deleted file mode 100644
index c3a748db502037f926dc241e4c3bc26a83ad3468..0000000000000000000000000000000000000000
--- a/cmake/hip.cmake
+++ /dev/null
@@ -1,53 +0,0 @@
-if(NOT WITH_AMD_GPU)
-    return()
-endif()
-
-include_directories("/opt/rocm/include")
-include_directories("/opt/rocm/hip/include")
-include_directories("/opt/rocm/miopen/include")
-include_directories("/opt/rocm/hipblas/include")
-include_directories("/opt/rocm/hiprand/include")
-include_directories("/opt/rocm/rocrand/include")
-include_directories("/opt/rocm/rccl/include")
-include_directories("/opt/rocm/thrust")
-
-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
-
-if(WITH_DSO)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
-endif(WITH_DSO)
-
-if(WITH_TESTING)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
-endif(WITH_TESTING)
-
-if(WITH_DISTRIBUTE)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
-endif(WITH_DISTRIBUTE)
-
-if(WITH_GRPC)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
-endif(WITH_GRPC)
-
-if(WITH_MKLDNN)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
-endif(WITH_MKLDNN)
-
-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
-
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
-endif()
-
-if("x${HCC_HOME}" STREQUAL "x")
-  set(HCC_HOME "/opt/rocm/hcc")
-endif()
-
-set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
-set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
-set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
-
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
deleted file mode 100644
index 9503d1dc76a574df15a5f473c007d4a9c21f0d5e..0000000000000000000000000000000000000000
--- a/cmake/inference_lib.cmake
+++ /dev/null
@@ -1,255 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# make package for paddle fluid shared and static library
-
-if(WIN32)
-    if(NOT PYTHON_EXECUTABLE)
-	FIND_PACKAGE(PythonInterp REQUIRED)
-    endif()
-endif()
-
-set(COPY_SCRIPT_DIR ${PADDLE_SOURCE_DIR}/cmake)
-function(copy TARGET)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DSTS)
-    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
-    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
-    if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
-        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
-    endif ()
-    math(EXPR len "${copy_lib_SRCS_len} - 1")
-    foreach (index RANGE ${len})
-        list(GET copy_lib_SRCS ${index} src)
-        list(GET copy_lib_DSTS ${index} dst)
-        if (WIN32)   #windows
-            file(TO_NATIVE_PATH ${src} native_src)
-            file(TO_NATIVE_PATH ${dst} native_dst)
-            add_custom_command(TARGET ${TARGET} POST_BUILD
-                    COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst})
-        else (WIN32) #not windows
-            add_custom_command(TARGET ${TARGET} POST_BUILD
-                    COMMAND mkdir -p "${dst}"
-                    COMMAND cp -r "${src}" "${dst}"
-                    COMMENT "copying ${src} -> ${dst}")
-        endif (WIN32) # not windows
-    endforeach ()
-endfunction()
-
-# third party
-set(third_party_deps eigen3 gflags glog boost xxhash zlib)
-if(NOT PROTOBUF_FOUND OR WIN32)
-    list(APPEND third_party_deps extern_protobuf)
-endif ()
-
-if (WITH_MKLML)
-    list(APPEND third_party_deps mklml)
-elseif (NOT CBLAS_FOUND OR WIN32)
-    list(APPEND third_party_deps extern_openblas)
-endif ()
-
-if (WITH_MKLDNN)
-    list(APPEND third_party_deps mkldnn_shared_lib)
-endif ()
-
-if (WITH_NGRAPH)
-    list(APPEND third_party_deps ngraph)
-endif ()
-
-add_custom_target(third_party DEPENDS ${third_party_deps})
-
-# inference-only library
-set(inference_lib_deps third_party paddle_fluid paddle_fluid_shared)
-add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps})
-
-set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/eigen3")
-copy(inference_lib_dist
-    SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
-    DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported)
-
-set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/boost")
-copy(inference_lib_dist
-    SRCS ${BOOST_INCLUDE_DIR}/boost
-    DSTS ${dst_dir})
-
-if(WITH_MKLML)
-    set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mklml")
-    if(WIN32)
-        copy(inference_lib_dist
-            SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB}
-                ${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR}
-            DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib
-                ${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
-    else()
-        copy(inference_lib_dist
-            SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
-            DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
-    endif()
-elseif (NOT CBLAS_FOUND OR WIN32)
-    set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/openblas")
-    copy(inference_lib_dist
-            SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
-            DSTS ${dst_dir} ${dst_dir})
-endif ()
-
-if(WITH_MKLDNN)
-set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mkldnn")
-if(WIN32)
-    copy(inference_lib_dist
-        SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB}
-        DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib)
-else()
-    copy(inference_lib_dist
-        SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
-        DSTS ${dst_dir} ${dst_dir}/lib)
-endif()
-endif()
-
-set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/gflags")
-copy(inference_lib_dist
-        SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
-        DSTS ${dst_dir} ${dst_dir}/lib)
-
-set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/glog")
-copy(inference_lib_dist
-        SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
-        DSTS ${dst_dir} ${dst_dir}/lib)
-
-set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/xxhash")
-copy(inference_lib_dist
-        SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
-        DSTS ${dst_dir} ${dst_dir}/lib)
-
-set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/zlib")
-copy(inference_lib_dist
-        SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
-        DSTS ${dst_dir} ${dst_dir}/lib)
-
-if (NOT PROTOBUF_FOUND OR WIN32)
-    set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/protobuf")
-    copy(inference_lib_dist
-            SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
-            DSTS ${dst_dir} ${dst_dir}/lib)
-endif ()
-
-if (WITH_NGRAPH)
-    set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/ngraph")
-    copy(inference_lib_dist
-            SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
-            DSTS ${dst_dir} ${dst_dir})
-endif ()
-
-if (TENSORRT_FOUND)
-    set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/tensorrt")
-    copy(inference_lib_dist
-        SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/*nvinfer*
-        DSTS ${dst_dir}/include ${dst_dir}/lib)
-endif ()
-
-if (ANAKIN_FOUND)
-    set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/anakin")
-    copy(inference_lib_dist
-        SRCS ${ANAKIN_ROOT}/*
-        DSTS ${dst_dir})
-endif ()
-
-copy(inference_lib_dist
-     SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-     DSTS ${FLUID_INFERENCE_INSTALL_DIR})
-
-set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-if(WIN32)
-    set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*)
-else(WIN32)
-    set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
-endif(WIN32)
-
-copy(inference_lib_dist
-     SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
-     DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
-
-
-# fluid library for both train and inference
-set(fluid_lib_deps inference_lib_dist)
-add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
-
-set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
-set(module "inference")
-copy(fluid_lib_dist
-  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib} 
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
-)
-
-set(module "framework")
-set(framework_lib_deps framework_proto)
-add_dependencies(fluid_lib_dist ${framework_lib_deps})
-copy(fluid_lib_dist
-    SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
-    ${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
-    DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet)
-
-set(module "memory")
-copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation
-        )
-
-set(module "platform")
-set(platform_lib_deps profiler_proto)
-add_dependencies(fluid_lib_dist ${platform_lib_deps})
-copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
-        )
-
-set(module "string")
-copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
-        )
-
-set(module "pybind")
-copy(fluid_lib_dist
-        SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
-        DSTS ${dst_dir}/${module}
-        )
-
-# CMakeCache Info
-copy(fluid_lib_dist
-        SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-        DSTS ${FLUID_INSTALL_DIR} ${FLUID_INSTALL_DIR}
-        )
-
-# paddle fluid version
-function(version version_file)
-    execute_process(
-            COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-            OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-    file(WRITE ${version_file}
-            "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
-            "WITH_MKL: ${WITH_MKL}\n"
-            "WITH_MKLDNN: ${WITH_MKLDNN}\n"
-            "WITH_GPU: ${WITH_GPU}\n")
-    if (WITH_GPU)
-        file(APPEND ${version_file}
-                "CUDA version: ${CUDA_VERSION}\n"
-                "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
-    endif ()
-endfunction()
-version(${FLUID_INSTALL_DIR}/version.txt)
-version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
diff --git a/cmake/make_resource.py b/cmake/make_resource.py
deleted file mode 100644
index 09a2ca877dd54243428ee2c730944f75ceeeaa30..0000000000000000000000000000000000000000
--- a/cmake/make_resource.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import re
-import sys
-
-res = sys.argv[1]
-out = sys.argv[2]
-var = re.sub(r'[ .-]', '_', os.path.basename(res))
-
-open(out, "w").write("const unsigned char " + var + "[] = {" + ",".join([
-    "0x%02x" % ord(c) for c in open(res).read()
-]) + ",0};\n" + "const unsigned " + var + "_size = sizeof(" + var + ");\n")
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
deleted file mode 100644
index 28e880fb51e7dceccdfe5e8ddeb4bbe92c460fa9..0000000000000000000000000000000000000000
--- a/cmake/operators.cmake
+++ /dev/null
@@ -1,224 +0,0 @@
-set(PART_CUDA_KERNEL_FILES)
-function(op_library TARGET)
-    # op_library is a function to create op library. The interface is same as
-    # cc_library. But it handle split GPU/CPU code and link some common library
-    # for ops.
-    set(cc_srcs)
-    set(cu_srcs)
-    set(hip_cu_srcs)
-    set(miopen_hip_cc_srcs)
-    set(cu_cc_srcs)
-    set(cudnn_cu_cc_srcs)
-    set(CUDNN_FILE)
-    set(mkldnn_cc_srcs)
-    set(MKLDNN_FILE)
-    set(op_common_deps operator op_registry math_function)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    set(pybind_flag 0)
-    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
-
-    list(LENGTH op_library_SRCS op_library_SRCS_len)
-    if (${op_library_SRCS_len} EQUAL 0)
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
-            list(APPEND cc_srcs ${TARGET}.cc)
-        endif()
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
-            list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
-        endif()
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
-            list(APPEND cu_srcs ${TARGET}.cu)
-        endif()
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
-            set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
-                    ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
-            list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
-        endif()
-
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
-            list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
-        endif()
-        string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
-            list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
-        endif()
-        if(WITH_AMD_GPU)
-            string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
-                list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
-            endif()
-        endif()
-        if(WITH_MKLDNN)
-            string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
-                list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
-            endif()
-        endif()
-    else()
-        foreach(src ${op_library_SRCS})
-            if (${src} MATCHES ".*\\.hip.cu$")
-                list(APPEND hip_cu_srcs ${src})
-            elseif (${src} MATCHES ".*\\.cu$")
-                list(APPEND cu_srcs ${src})
-            elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
-                list(APPEND cudnn_cu_cc_srcs ${src})
-            elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
-                list(APPEND miopen_hip_cc_srcs ${src})
-            elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
-                list(APPEND mkldnn_cc_srcs ${src})
-            elseif(${src} MATCHES ".*\\.cu.cc$")
-                list(APPEND cu_cc_srcs ${src})
-            elseif(${src} MATCHES ".*\\.cc$")
-                list(APPEND cc_srcs ${src})
-            else()
-                message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
-            endif()
-        endforeach()
-    endif()
-
-    list(LENGTH cc_srcs cc_srcs_len)
-    if (${cc_srcs_len} EQUAL 0)
-        message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
-    endif()
-    if (WIN32)
-    # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
-        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
-          return()
-        endif()
-    endforeach()
-    endif(WIN32)
-    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs")
-
-    list(LENGTH op_library_DEPS op_library_DEPS_len)
-    if (${op_library_DEPS_len} GREATER 0)
-        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
-    endif()
-    if (WITH_GPU)
-        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
-                ${op_common_deps})
-    elseif (WITH_AMD_GPU)
-        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
-                ${op_common_deps})
-    else()
-        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
-            ${op_common_deps})
-    endif()
-
-    # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
-"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op")
-        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
-            set(pybind_flag 1)
-        endif()
-    endforeach()
-
-    # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
-    # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
-    # And for detail pybind information, please see generated paddle/pybind/pybind.h.
-    file(READ ${TARGET}.cc TARGET_CONTENT)
-    string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
-    string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
-    if (one_register STREQUAL "")
-        string(REPLACE "_op" "" TARGET "${TARGET}")
-    else ()
-        string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
-        string(REPLACE "," "" TARGET "${TARGET}")
-    endif()
-
-    # pybind USE_NO_KERNEL_OP
-    # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
-    string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
-    string(REPLACE "_op" "" TARGET "${TARGET}")
-    if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
-        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
-        set(pybind_flag 1)
-    endif()
-
-    # pybind USE_CPU_ONLY_OP
-    list(LENGTH cu_srcs cu_srcs_len)
-    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
-    list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-    list(LENGTH hip_cu_srcs hip_cu_srcs_len)
-    list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
-    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
-        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
-        set(pybind_flag 1)
-    endif()
-
-    # pybind USE_OP_DEVICE_KERNEL for CUDNN
-    list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
-    if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
-      if(${TARGET} STREQUAL "activation")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
-      else()
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
-      endif()
-    endif()
-
-    # pybind USE_OP_DEVICE_KERNEL for MIOPEN
-    if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
-    endif()
-
-    # pybind USE_OP_DEVICE_KERNEL for MKLDNN
-    if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
-      # Append first implemented MKLDNN activation operator
-      if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
-      elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
-        
-      else()
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
-      endif()
-    endif()
-
-    # pybind USE_OP
-    if (${pybind_flag} EQUAL 0)
-      # NOTE(*): activation use macro to regist the kernels, set use_op manually.
-      if(${TARGET} STREQUAL "activation")
-        file(APPEND ${pybind_file} "USE_OP(relu);\n")
-      elseif(${TARGET} STREQUAL "fake_dequantize")
-        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
-      elseif(${TARGET} STREQUAL "fake_quantize")
-        file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
-      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
-          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
-      else()
-        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
-      endif()
-    endif()
-endfunction()
-
-
-function(register_operators)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs EXCLUDES DEPS)
-    cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
-
-    file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
-    string(REPLACE "_mkldnn" "" OPS "${OPS}")
-    string(REPLACE ".cc" "" OPS "${OPS}")
-    list(REMOVE_DUPLICATES OPS)
-    list(LENGTH register_operators_DEPS register_operators_DEPS_len)
-
-    foreach(src ${OPS})
-        list(FIND register_operators_EXCLUDES ${src} _index)
-        if (${_index} EQUAL -1)
-            if (${register_operators_DEPS_len} GREATER 0)
-                op_library(${src} DEPS ${register_operators_DEPS})
-            else()
-                op_library(${src})
-            endif()
-        endif()
-    endforeach()
-endfunction()
diff --git a/cmake/python_module.cmake b/cmake/python_module.cmake
deleted file mode 100644
index 1412b7f7f20600acf95a4a899f5e6529c3b67a35..0000000000000000000000000000000000000000
--- a/cmake/python_module.cmake
+++ /dev/null
@@ -1,43 +0,0 @@
-# Find if a Python module is installed
-# Found at http://www.cmake.org/pipermail/cmake/2011-January/041666.html
-# To use do: find_python_module(PyQt4 REQUIRED)
-function(find_python_module module)
-    string(TOUPPER ${module} module_upper)
-    if(NOT PY_${module_upper})
-        if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED")
-            set(${module}_FIND_REQUIRED TRUE)
-        else()
-            set(${module}_FIND_REQUIRED FALSE)
-        endif()
-        # A module's location is usually a directory, but for binary modules
-        # it's a .so file.
-        execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-            "import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))"
-            RESULT_VARIABLE _${module}_status
-            OUTPUT_VARIABLE _${module}_location
-            ERROR_QUIET
-            OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if(NOT _${module}_status)
-            set(PY_${module_upper} ${_${module}_location} CACHE STRING
-                "Location of Python module ${module}")
-        endif(NOT _${module}_status)
-    endif(NOT PY_${module_upper})
-    find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper})
-    if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
-        message(FATAL_ERROR "python module ${module} is not found")
-    endif()
-
-    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-        "import sys, ${module}; sys.stdout.write(${module}.__version__)"
-        OUTPUT_VARIABLE _${module}_version
-        RESULT_VARIABLE _${module}_status
-        ERROR_QUIET
-        OUTPUT_STRIP_TRAILING_WHITESPACE)
-    if(NOT _${module}_status)
-        set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING
-            "Version of Python module ${module}")
-    endif(NOT _${module}_status)
-
-    set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE)
-    set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE)
-endfunction(find_python_module)
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
deleted file mode 100644
index 566dc75fda019eb66759eb403f60e16f18cffef1..0000000000000000000000000000000000000000
--- a/cmake/simd.cmake
+++ /dev/null
@@ -1,99 +0,0 @@
-# This file is use to check all support level of AVX on your machine
-# so that PaddlePaddle can unleash the vectorization power of muticore.
-
-include(CheckCXXSourceRuns)
-include(CheckCXXSourceCompiles)
-
-if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    set(MMX_FLAG "-mmmx")
-    set(SSE2_FLAG "-msse2")
-    set(SSE3_FLAG "-msse3")
-    set(AVX_FLAG "-mavx")
-    set(AVX2_FLAG "-mavx2")
-    set(AVX512F_FLAG "-mavx512f")
-elseif(MSVC)
-    set(MMX_FLAG "/arch:MMX")
-    set(SSE2_FLAG "/arch:SSE2")
-    set(SSE3_FLAG "/arch:SSE3")
-    SET(AVX_FLAG "/arch:AVX")
-    SET(AVX2_FLAG "/arch:AVX2")
-endif()
-
-set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
-
-# Check  MMX
-set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
-set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <mmintrin.h>
-int main()
-{
-    _mm_setzero_si64();
-    return 0;
-}" MMX_FOUND)
-
-# Check SSE2
-set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
-set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <emmintrin.h>
-int main()
-{
-    _mm_setzero_si128();
-    return 0;
-}" SSE2_FOUND)
-
-# Check SSE3
-set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
-set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <pmmintrin.h>
-int main()
-{
-    __m128d a = _mm_set1_pd(6.28);
-    __m128d b = _mm_set1_pd(3.14);
-    __m128d result = _mm_addsub_pd(a, b);
-    result = _mm_movedup_pd(result);
-    return 0;
-}" SSE3_FOUND)
-
-# Check AVX
-set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
-    __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
-    __m256 result = _mm256_add_ps (a, b);
-    return 0;
-}" AVX_FOUND)
-
-# Check AVX 2
-set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-    __m256i result = _mm256_abs_epi32 (a);
-    return 0;
-}" AVX2_FOUND)
-
-# Check AVX512F
-set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
-                                  13, -5, 6, -7, 9, 2, -6, 3);
-    __m512i result = _mm512_abs_epi32 (a);
-    return 0;
-}" AVX512F_FOUND)
-
-set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
diff --git a/cmake/system.cmake b/cmake/system.cmake
deleted file mode 100644
index 65db05bebe957d740e391847d980e211b0e9e750..0000000000000000000000000000000000000000
--- a/cmake/system.cmake
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-# http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Detects the OS and sets appropriate variables.
-# CMAKE_SYSTEM_NAME only give us a coarse-grained name of the OS CMake is
-# building for, but the host processor name like centos is necessary
-# in some scenes to distinguish system for customization.
-#
-# for instance, protobuf libs path is <install_dir>/lib64
-# on CentOS, but <install_dir>/lib on other systems.
-
-IF(WIN32)
-    SET(HOST_SYSTEM "win32")
-ELSE(WIN32)
-    IF(APPLE)
-        SET(HOST_SYSTEM "macosx")
-        EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION)
-        STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
-        IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET})
-            # Set cache variable - end user may change this during ccmake or cmake-gui configure.
-            SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
-                "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
-        ENDIF()
-        set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
-    ELSE(APPLE)
-
-        IF(EXISTS "/etc/issue")
-            FILE(READ "/etc/issue" LINUX_ISSUE)
-            IF(LINUX_ISSUE MATCHES "CentOS")
-                SET(HOST_SYSTEM "centos")
-            ELSEIF(LINUX_ISSUE MATCHES "Debian")
-                SET(HOST_SYSTEM "debian")
-            ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
-                SET(HOST_SYSTEM "ubuntu")
-            ELSEIF(LINUX_ISSUE MATCHES "Red Hat")
-                SET(HOST_SYSTEM "redhat")
-            ELSEIF(LINUX_ISSUE MATCHES "Fedora")
-                SET(HOST_SYSTEM "fedora")
-            ENDIF()
-
-            STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}")
-        ENDIF(EXISTS "/etc/issue")
-
-        IF(EXISTS "/etc/redhat-release")
-            FILE(READ "/etc/redhat-release" LINUX_ISSUE)
-            IF(LINUX_ISSUE MATCHES "CentOS")
-                SET(HOST_SYSTEM "centos")
-            ENDIF()
-        ENDIF(EXISTS "/etc/redhat-release")
-
-        IF(NOT HOST_SYSTEM)
-            SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
-        ENDIF()
-
-    ENDIF(APPLE)
-ENDIF(WIN32)
-
-# query number of logical cores
-CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
-
-MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
-
-MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
-MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
-
-# external dependencies log output
-SET(EXTERNAL_PROJECT_LOG_ARGS
-    LOG_DOWNLOAD    0     # Wrap download in script to log output
-    LOG_UPDATE      1     # Wrap update in script to log output
-    LOG_CONFIGURE   1     # Wrap configure in script to log output
-    LOG_BUILD       0     # Wrap build in script to log output
-    LOG_TEST        1     # Wrap test in script to log output
-    LOG_INSTALL     0     # Wrap install in script to log output
-)
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
deleted file mode 100644
index fc97fcbf20a7312afe9667cf735b81357ff2c272..0000000000000000000000000000000000000000
--- a/cmake/tensorrt.cmake
+++ /dev/null
@@ -1,52 +0,0 @@
-if(NOT WITH_GPU)
-    return()
-endif()
-
-if(WIN32)
-    if("${TENSORRT_ROOT}" STREQUAL "")
-        message(WARNING "Please specify the TensorRT root path: TENSORRT_ROOT.")
-    endif()
-    string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}")
-    set(TR_INFER_LIB nvinfer.lib)
-    set(TR_INFER_RT nvinfer.dll)
-    set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll)
-else()
-    set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
-    set(TR_INFER_LIB libnvinfer.a)
-    set(TR_INFER_RT libnvinfer.so)
-    set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so)
-endif()
-
-find_path(TENSORRT_INCLUDE_DIR NvInfer.h
-    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
-    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
-    NO_DEFAULT_PATH
-)
-
-find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
-    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
-    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
-    NO_DEFAULT_PATH
-    DOC "Path to TensorRT library.")
-
-if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
-  if(WITH_DSO)
-    set(TENSORRT_FOUND ON)
-  endif(WITH_DSO)
-else()
-    set(TENSORRT_FOUND OFF)
-endif()
-
-if(TENSORRT_FOUND)
-    file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
-    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
-        "${TENSORRT_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
-        TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
-
-    message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
-        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
-    include_directories(${TENSORRT_INCLUDE_DIR})
-    link_directories(${TENSORRT_LIBRARY})
-    add_definitions(-DPADDLE_WITH_TENSORRT)
-endif()
diff --git a/cmake/util.cmake b/cmake/util.cmake
deleted file mode 100644
index 02667dbce69ed159193ff88f38069dd08cdcf678..0000000000000000000000000000000000000000
--- a/cmake/util.cmake
+++ /dev/null
@@ -1,55 +0,0 @@
-# Some common routine for paddle compile.
-
-# target_circle_link_libraries
-# Link libraries to target which has circle dependencies.
-#
-# First Argument: target name want to be linked with libraries
-# Rest Arguments: libraries which link together.
-function(target_circle_link_libraries TARGET_NAME)
-    if(APPLE)
-        set(LIBS)
-        set(inArchive OFF)
-        set(libsInArgn)
-
-        foreach(arg ${ARGN})
-            if(${arg} STREQUAL "ARCHIVE_START")
-                set(inArchive ON)
-            elseif(${arg} STREQUAL "ARCHIVE_END")
-                set(inArchive OFF)
-            else()
-                if(inArchive)
-                    list(APPEND LIBS "-Wl,-force_load")
-                endif()
-                list(APPEND LIBS ${arg})
-                list(APPEND libsInArgn ${arg})
-            endif()
-        endforeach()
-        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-            if(NOT IOS_ENABLE_BITCODE)
-                list(APPEND LIBS "-undefined dynamic_lookup")
-            endif()
-        endif()
-        list(REVERSE libsInArgn)
-        target_link_libraries(${TARGET_NAME}
-            ${LIBS}
-            ${libsInArgn})
-
-    else()  # LINUX
-        set(LIBS)
-
-        foreach(arg ${ARGN})
-            if(${arg} STREQUAL "ARCHIVE_START")
-                list(APPEND LIBS "-Wl,--whole-archive")
-            elseif(${arg} STREQUAL "ARCHIVE_END")
-                list(APPEND LIBS "-Wl,--no-whole-archive")
-            else()
-                list(APPEND LIBS ${arg})
-            endif()
-        endforeach()
-
-        target_link_libraries(${TARGET_NAME}
-                "-Wl,--start-group"
-                ${LIBS}
-                "-Wl,--end-group")
-    endif()
-endfunction()
diff --git a/cmake/version.cmake b/cmake/version.cmake
deleted file mode 100644
index dd57d4ab9969ce530f93ca1694350b1a26b5b543..0000000000000000000000000000000000000000
--- a/cmake/version.cmake
+++ /dev/null
@@ -1,63 +0,0 @@
-# Get the latest git tag.
-set(PADDLE_VERSION $ENV{PADDLE_VERSION})
-set(tmp_version "HEAD")
-set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
-set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
-while ("${PADDLE_VERSION}" STREQUAL "")
-  # Check current branch name
-  execute_process(
-    COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE GIT_BRANCH_NAME
-    RESULT_VARIABLE GIT_BRANCH_RESULT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if (NOT ${GIT_BRANCH_RESULT})
-    execute_process(
-      COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-      OUTPUT_VARIABLE GIT_TAG_NAME
-      RESULT_VARIABLE GIT_RESULT
-      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-    if (NOT ${GIT_RESULT})
-      # Check if current branch is release branch
-      if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
-        # Check the tag is a correct version
-        if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
-          # if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest
-          set(PADDLE_VERSION "0.0.0")
-        elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
-          string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
-        else()  # otherwise, get the previous git tag name.
-          set(tmp_version "${GIT_TAG_NAME}~1")
-        endif()
-      else()
-        execute_process(
-          COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
-          WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-          OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
-          RESULT_VARIABLE GIT_EXACT_TAG_RESULT
-          ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if (NOT ${GIT_EXACT_TAG_NAME})
-          # Check if current branch is tag branch
-          if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
-            string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
-          else()
-            set(PADDLE_VERSION "0.0.0")
-          endif()
-        else()
-          # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
-          set(PADDLE_VERSION "0.0.0")
-        endif()
-      endif()
-    else()
-      set(PADDLE_VERSION "0.0.0")
-      message(WARNING "Cannot add paddle version from git tag")
-    endif()
-  else()
-    set(PADDLE_VERSION "0.0.0")
-    message(WARNING "Cannot add paddle version for wrong git branch result")
-  endif()
-endwhile()
-
-add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
-message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/doc/README.md b/doc/README.md
deleted file mode 100644
index 998a39f10699af6d1a391f177a5cf03c9ae170fd..0000000000000000000000000000000000000000
--- a/doc/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# For Readers and Developers
-
-Thanks for reading PaddlePaddle documentation. 
-
-Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) and updated there.
-
-Please turn to FluidDoc Repo for the latest documentation.
diff --git a/paddle/fluid/feed/CMakeLists.txt b/feed/CMakeLists.txt
similarity index 100%
rename from paddle/fluid/feed/CMakeLists.txt
rename to feed/CMakeLists.txt
diff --git a/paddle/fluid/feed/apply_feed_code.sh b/feed/apply_feed_code.sh
similarity index 100%
rename from paddle/fluid/feed/apply_feed_code.sh
rename to feed/apply_feed_code.sh
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/job.sh b/feed/feed_deploy/news_jingpai/job.sh
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/job.sh
rename to feed/feed_deploy/news_jingpai/job.sh
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/all_slot.dict b/feed/feed_deploy/news_jingpai/package/format_nets/all_slot.dict
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/all_slot.dict
rename to feed/feed_deploy/news_jingpai/package/format_nets/all_slot.dict
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/clear_ssd.sh b/feed/feed_deploy/news_jingpai/package/format_nets/clear_ssd.sh
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/clear_ssd.sh
rename to feed/feed_deploy/news_jingpai/package/format_nets/clear_ssd.sh
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/config.py b/feed/feed_deploy/news_jingpai/package/format_nets/config.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/config.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/config.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/data_generate_base.py b/feed/feed_deploy/news_jingpai/package/format_nets/data_generate_base.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/data_generate_base.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/data_generate_base.py
diff --git a/feed/feed_deploy/news_jingpai/package/format_nets/feed/layer.py b/feed/feed_deploy/news_jingpai/package/format_nets/feed/layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0563f4222719b391c1eb4f59c2d571f7891720a
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/format_nets/feed/layer.py
@@ -0,0 +1,163 @@
+import paddle.fluid as fluid
+from abc import ABCMeta, abstractmethod
+
+class Layer(object):
+    """Base class for config-driven layers; builders return an (output, extend_output) pair."""
+    __metaclass__ = ABCMeta  # NOTE: only Python 2 honours this attribute; Python 3 ignores it
+    def __init__(self, config):
+        """Accept the layer's config mapping; the base class stores nothing from it."""
+
+    def generate(self, mode, param):
+        """Dispatch to the builder for `mode`; an unknown mode is reported and yields (None, None)."""
+        if mode == 'fluid':
+            return self.generate_fluid(param)
+        if mode == 'tensorflow':
+            return self.generate_tensorflow(param)
+        print('unsupport this mode: ' + mode)
+        return None, None
+
+    @abstractmethod
+    def generate_fluid(self, param):
+        """Build the layer with paddle.fluid and return (output, extend_output)."""
+
+    def generate_tensorflow(self, param):  # deliberately not abstract: no layer in this file implements it
+        raise NotImplementedError(type(self).__name__ + ' has no tensorflow builder')
+
+class EmbeddingInputLayer(Layer):
+    """Sparse slot input: per-slot embedding -> sum pooling -> CVM, concatenated along axis 1."""
+    def __init__(self, config):
+        self._cvm = config['cvm']
+        self._name = config['name']
+        self._slots = config['slots']
+        self._mf_dim = config['mf_dim']
+        self._backward = config['backward']
+        self._emb_dim = self._mf_dim + 2 if self._cvm else self._mf_dim  # +2: append show ctr
+        self._emb_layers = []
+
+    def generate_fluid(self, param):
+        """Build the slot embeddings; reads the 'show' and 'click' layers from param['layer']."""
+        show_clk = fluid.layers.concat([param['layer']['show'], param['layer']['click']], axis=1)
+        show_clk.stop_gradient = True
+        self._emb_layers = []  # rebuilt per call so a second call does not append the slots twice
+        for slot in self._slots:
+            slot_ids = fluid.layers.data(name=slot, shape=[1], dtype="int64", lod_level=1)
+            emb = fluid.layers.embedding(input=slot_ids, size=[10, self._mf_dim + 2], is_sparse=True, is_distributed=True, param_attr=fluid.ParamAttr(name="embedding"))
+            emb = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+            # the flag is stored as `_cvm`; the previous `self._use_cvm` is never set on this class
+            emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm)
+            self._emb_layers.append(emb)
+        return fluid.layers.concat(input=self._emb_layers, axis=1, name=self._name), None
+
+class LabelInputLayer(Layer):
+    """Label input declared as a fluid data variable of shape [-1, dim] (dtype defaults to int64)."""
+    def __init__(self, config):
+        self._name = config['name']
+        self._dim = config.get('dim', 1)
+        self._data_type = config.get('data_type', "int64")
+        self._label_idx = config['label_idx']  # required key; stored but not read by this class
+
+    def generate_fluid(self, param):
+        return fluid.layers.data(name=self._name, shape=[-1, self._dim], dtype=self._data_type, lod_level=0, append_batch_size=False), None
+
+class TagInputLayer(Layer):
+    """Auxiliary tag input declared as a fluid data variable with gradients stopped."""
+    def __init__(self, config):
+        self._name = config['name']
+        self._tag = config['tag']
+        self._dim = config.get('dim', 1)
+        self._data_type = config['data_type']
+
+    def generate_fluid(self, param):
+        return fluid.layers.data(name=self._name, shape=[-1, self._dim], dtype=self._data_type, lod_level=0, append_batch_size=False, stop_gradient=True), None  # was `Tru` (NameError)
+
+class ParamLayer(Layer):
+    """Parameter spec (coln, init_range, ...) handed to later layers by name; builds no operator."""
+    def __init__(self, config):
+        self._name = config['name']
+        self._coln = config['coln']
+        self._init_range = config.get('init_range', 1)
+        self._data_type = config['data_type']
+        self._config = config
+    def generate_fluid(self, param):  # returns a copy of the spec with the defaulted init_range filled in
+        return dict(self._config, init_range=self._init_range), None
+
+class NormalizetionLayer(Layer):
+    """Data-norm over one input layer, or over the axis-1 concat of several input layers."""
+    def __init__(self, config):
+        self._name = config['name']
+        self._input = config['input']
+
+    def generate_fluid(self, param):
+        """Return (normalized output, {'inference_param': names of the three data_norm statistics})."""
+        inputs = [param['layer'][i] for i in self._input]
+        # a single input is used as is; the old `len(...) > 0` test was always true and always concatenated
+        input_layer = inputs[0] if len(inputs) == 1 else fluid.layers.concat(input=inputs, axis=1)
+        # data_norm reads the keys batch_size / batch_sum / batch_square; "batch_sum_default" is not one of them
+        bn = fluid.layers.data_norm(input=input_layer, name=self._name, epsilon=1e-4, param_attr={
+            "batch_size": 1e4, "batch_sum": 0.0, "batch_square": 1e4})
+        inference_param = [self._name + '.batch_size', self._name + '.batch_sum', self._name + '.batch_square_sum']
+        return bn, {'inference_param': inference_param}
+
+class NeuralLayer(Layer):
+    """Fully connected layer whose width and init range come from a named parameter spec."""
+    def __init__(self, config):
+        self._name = config['name']
+        self._param = config['param']
+        self._input = config['input']
+        self._bias = config.get('bias', True)
+        self._act_func = config.get('act_func', None)
+
+    def generate_fluid(self, param):
+        """Return (fc output, {'inference_param': [weight name, plus bias name when a bias exists]})."""
+        param_layer = param['layer'][self._param]
+        inputs = [param['layer'][i] for i in self._input]
+        input_layer = inputs[0] if len(inputs) == 1 else fluid.layers.concat(input=inputs, axis=1)
+        input_coln = input_layer.shape[1]
+        scale = param_layer['init_range'] / (input_coln ** 0.5)
+        inference_param = [self._name + '.w_0']
+        # fc treats bias_attr=None as "create the default bias"; only False leaves the bias out
+        bias = False
+        if self._bias:
+            bias = fluid.ParamAttr(learning_rate=1.0, initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=scale))
+            inference_param.append(self._name + '.b_0')
+        fc = fluid.layers.fc(
+            name=self._name,
+            input=input_layer,
+            size=param_layer['coln'],
+            act=self._act_func,
+            param_attr=fluid.ParamAttr(learning_rate=1.0, initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=scale)),
+            bias_attr=bias)
+        return fc, {'inference_param': inference_param}
+
+class SigmoidLossLayer(Layer):
+    """Clipped sigmoid + log-loss head that also wires up AUC and CTR metric outputs."""
+    def __init__(self, config):
+        self._name = config['name']
+        self._label = config['label']
+        self._input = config['input']
+        self._weight = config.get('weight', None)
+        self._bound = config.get('bound', [-15.0, 15.0])
+        self._extend_output = {}
+
+    def generate_fluid(self, param):
+        """Return (sigmoid prediction, dict holding the loss and the metric variables)."""
+        input_layer = param['layer'][self._input[0]]
+        label_layer = param['layer'][self._label]
+        float_label = fluid.layers.cast(x=label_layer, dtype='float32')  # one cast shared by loss and metrics
+        clipped = fluid.layers.clip(input_layer, min=self._bound[0], max=self._bound[1])
+        norm = fluid.layers.sigmoid(clipped, name=self._name)
+        loss = fluid.layers.log_loss(input=norm, label=float_label)
+        if self._weight:
+            loss = fluid.layers.elementwise_mul(loss, param['layer'][self._weight])
+        # the mean loss used to be computed and then dropped; expose it to the caller
+        self._extend_output['loss'] = fluid.layers.mean(x=loss)
+
+        # For AUC: two-column prediction built from ceil(norm) - norm and norm
+        binary_predict = fluid.layers.concat(
+            input=[fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), norm], axis=1)
+        ext = self._extend_output
+        ext['auc'], ext['batch_auc'], [ext['batch_stat_pos'], ext['batch_stat_neg'], ext['stat_pos'], ext['stat_neg']] = \
+            fluid.layers.auc(input=binary_predict, label=label_layer, curve='ROC', num_thresholds=4096)
+        ext['sqrerr'], ext['abserr'], ext['prob'], ext['q'], ext['pos'], ext['total'] = \
+            fluid.contrib.layers.ctr_metric_bundle(norm, float_label)
+        return norm, ext
diff --git a/feed/feed_deploy/news_jingpai/package/format_nets/feed/layer_model.py b/feed/feed_deploy/news_jingpai/package/format_nets/feed/layer_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fbc72a68e815f2575a1aaa811792aa47d982bb9
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/format_nets/feed/layer_model.py
@@ -0,0 +1,54 @@
+import os
+import copy
+import yaml
+import layer_model
+import paddle.fluid as fluid
+
+mode = 'fluid'
+
+# Load the network description; its 'input', 'param' and 'layer' sections are walked below.
+with open('model.layers', 'r') as f:
+    build_nodes = yaml.safe_load(f)
+
+# 'layer': built outputs by node name; 'inner_layer': raw config of the 'layer' section by name.
+build_param = {'layer': {}, 'inner_layer':{}, 'layer_extend': {}, 'model': {}}
+build_phase = ['input', 'param', 'layer']
+inference_layer = ['ctr_output']
+inference_meta = {'dependency': {}, 'param': {}}  # was 'params', but every later access uses 'param'
+for layer in build_nodes['layer']:
+    build_param['inner_layer'][layer['name']] = layer
+
+def get_dependency(layer_graph, dest_layer):
+    """Return every layer name `dest_layer` transitively lists under 'input' (unordered, de-duplicated)."""
+    seen, pending = set(), [dest_layer]
+    while pending:  # each name is expanded once, so shared inputs or a cycle cannot blow up the walk
+        fresh = set(layer_graph.get(pending.pop(), {}).get('input', [])) - seen
+        seen |= fresh
+        pending.extend(fresh)
+    return list(seen)
+
+# build train model: each node's `class` is looked up by name in layer.py (no exec() of config text)
+if mode == 'fluid':
+    import layer as layer_lib  # NOTE(review): the file-level `import layer_model` re-imports this very script
+    build_param['model']['train_program'] = fluid.Program()
+    build_param['model']['startup_program'] = fluid.Program()
+    with fluid.program_guard(build_param['model']['train_program'], build_param['model']['startup_program']):
+        with fluid.unique_name.guard():
+            for phase in build_phase:
+                for node in build_nodes[phase]:
+                    layer_output, extend_output = getattr(layer_lib, node['class'])(node).generate(mode, build_param)
+                    build_param['layer'][node['name']] = layer_output
+                    build_param['layer_extend'][node['name']] = extend_output
+
+# build inference model: collect the parameter names of every layer each output depends on
+for layer in inference_layer:
+    # NOTE(review): only upstream layers are collected; `layer`'s own parameters are not — confirm intended
+    dependency = get_dependency(build_param['inner_layer'], layer)
+    layer_params = []
+    for node in build_nodes['layer']:
+        if node['name'] in dependency:
+            # a builder may hand back None instead of an extend dict, and a node may not have been built
+            layer_params += (build_param['layer_extend'].get(node['name']) or {}).get('inference_param', [])
+    inference_meta['dependency'][layer] = dependency
+    inference_meta.setdefault('param', {})[layer] = layer_params  # tolerate a missing 'param' bucket
+    print(layer_params)
diff --git a/feed/feed_deploy/news_jingpai/package/format_nets/feed/model.layers b/feed/feed_deploy/news_jingpai/package/format_nets/feed/model.layers
new file mode 100644
index 0000000000000000000000000000000000000000..72502c5b47615803cf5379d42b3c7e049433e66f
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/format_nets/feed/model.layers
@@ -0,0 +1,22 @@
+input :
+    - { name : embedding_input, class: EmbeddingLayer, backward: false, cvm: true, mf_dim: 10, slots: [ ]}
+    - { name : label_target, class: label, backward: false }
+    - { name : ins_sample_weight, class: tag, backward: false }
+    - { name : label_with_pred_target, class: label, backward: false }
+summary :
+    - { name : base_summary }
+param :  # inside a flow mapping ':' must be followed by a space, otherwise `coln:511` is read as one scalar key
+    - { name : h1_param, class : param_layer, init_range : 1, coln: 511, scale_by_rown : true}
+    - { name : h2_param, class : param_layer, init_range : 1, coln: 255, scale_by_rown : true}
+    - { name : h3_param, class : param_layer, init_range : 1, coln: 127, scale_by_rown : true}
+    - { name : h4_param, class : param_layer, init_range : 1, coln: 127, scale_by_rown : true}
+    - { name : h5_param, class : param_layer, init_range : 1, coln: 127, scale_by_rown : true}
+    - { name : h6_param, class : param_layer, init_range : 1, coln: 1, scale_by_rown : true}
+layer :
+    - { name : base_input_norm, class : normalization_layer, input : [embedding_input], summary : base_summary}
+    - { name : h1, class : neural_layer, input : [base_input_norm], param : h1_param, bias: true, act_func : relu}
+    - { name : h2, class : neural_layer, input : [h1], param : h2_param, bias : true, act_func : relu}
+    - { name : h3, class : neural_layer, input : [h2], param : h3_param, bias : true, act_func : relu}
+    - { name : h4, class : neural_layer, input : [h3], param : h4_param, bias : true, act_func : relu}
+    - { name : h5, class : neural_layer, input : [h4], param : h5_param, bias : true, act_func : relu}
+    - { name : ctr_output, class : neural_layer, input : [h5], param : h6_param, bias : true, act_func : sig_moid}
diff --git a/feed/feed_deploy/news_jingpai/package/format_nets/feed/test.py b/feed/feed_deploy/news_jingpai/package/format_nets/feed/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/format_nets/feed/test.py
@@ -0,0 +1 @@
+
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/fleet_desc_combinejoincommon.prototxt b/feed/feed_deploy/news_jingpai/package/format_nets/fleet_desc_combinejoincommon.prototxt
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/fleet_desc_combinejoincommon.prototxt
rename to feed/feed_deploy/news_jingpai/package/format_nets/fleet_desc_combinejoincommon.prototxt
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/format_newcate_hotnews.awk b/feed/feed_deploy/news_jingpai/package/format_nets/format_newcate_hotnews.awk
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/format_newcate_hotnews.awk
rename to feed/feed_deploy/news_jingpai/package/format_nets/format_newcate_hotnews.awk
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/ins_weight.py b/feed/feed_deploy/news_jingpai/package/format_nets/ins_weight.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/ins_weight.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/ins_weight.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/jingpai_fleet_desc_new.prototxt b/feed/feed_deploy/news_jingpai/package/format_nets/jingpai_fleet_desc_new.prototxt
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/jingpai_fleet_desc_new.prototxt
rename to feed/feed_deploy/news_jingpai/package/format_nets/jingpai_fleet_desc_new.prototxt
diff --git a/feed/feed_deploy/news_jingpai/package/format_nets/model.layers b/feed/feed_deploy/news_jingpai/package/format_nets/model.layers
new file mode 100644
index 0000000000000000000000000000000000000000..72502c5b47615803cf5379d42b3c7e049433e66f
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/format_nets/model.layers
@@ -0,0 +1,22 @@
+input :
+    - { name : embedding_input, class: EmbeddingLayer, backward: false, cvm: true, mf_dim: 10, slots: [ ]}
+    - { name : label_target, class: label, backward: false }
+    - { name : ins_sample_weight, class: tag, backward: false }
+    - { name : label_with_pred_target, class: label, backward: false }
+summary :
+    - { name : base_summary }
+param :  # inside a flow mapping ':' must be followed by a space, otherwise `coln:511` is read as one scalar key
+    - { name : h1_param, class : param_layer, init_range : 1, coln: 511, scale_by_rown : true}
+    - { name : h2_param, class : param_layer, init_range : 1, coln: 255, scale_by_rown : true}
+    - { name : h3_param, class : param_layer, init_range : 1, coln: 127, scale_by_rown : true}
+    - { name : h4_param, class : param_layer, init_range : 1, coln: 127, scale_by_rown : true}
+    - { name : h5_param, class : param_layer, init_range : 1, coln: 127, scale_by_rown : true}
+    - { name : h6_param, class : param_layer, init_range : 1, coln: 1, scale_by_rown : true}
+layer :
+    - { name : base_input_norm, class : normalization_layer, input : [embedding_input], summary : base_summary}
+    - { name : h1, class : neural_layer, input : [base_input_norm], param : h1_param, bias: true, act_func : relu}
+    - { name : h2, class : neural_layer, input : [h1], param : h2_param, bias : true, act_func : relu}
+    - { name : h3, class : neural_layer, input : [h2], param : h3_param, bias : true, act_func : relu}
+    - { name : h4, class : neural_layer, input : [h3], param : h4_param, bias : true, act_func : relu}
+    - { name : h5, class : neural_layer, input : [h4], param : h5_param, bias : true, act_func : relu}
+    - { name : ctr_output, class : neural_layer, input : [h5], param : h6_param, bias : true, act_func : sig_moid}
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/model_new.py b/feed/feed_deploy/news_jingpai/package/format_nets/model_new.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/model_new.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/model_new.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/model_new_jc.py b/feed/feed_deploy/news_jingpai/package/format_nets/model_new_jc.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/model_new_jc.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/model_new_jc.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/my_data_generator_str.py b/feed/feed_deploy/news_jingpai/package/format_nets/my_data_generator_str.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/my_data_generator_str.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/my_data_generator_str.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_startup_program.bin b/feed/feed_deploy/news_jingpai/package/format_nets/old_program/old_join_common_startup_program.bin
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_startup_program.bin
rename to feed/feed_deploy/news_jingpai/package/format_nets/old_program/old_join_common_startup_program.bin
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_train_program.bin b/feed/feed_deploy/news_jingpai/package/format_nets/old_program/old_join_common_train_program.bin
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_train_program.bin
rename to feed/feed_deploy/news_jingpai/package/format_nets/old_program/old_join_common_train_program.bin
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_main_program.bin b/feed/feed_deploy/news_jingpai/package/format_nets/old_program/old_update_main_program.bin
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_main_program.bin
rename to feed/feed_deploy/news_jingpai/package/format_nets/old_program/old_update_main_program.bin
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_startup_program.bin b/feed/feed_deploy/news_jingpai/package/format_nets/old_program/old_update_startup_program.bin
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_startup_program.bin
rename to feed/feed_deploy/news_jingpai/package/format_nets/old_program/old_update_startup_program.bin
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot b/feed/feed_deploy/news_jingpai/package/format_nets/old_slot/slot
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot
rename to feed/feed_deploy/news_jingpai/package/format_nets/old_slot/slot
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot_common b/feed/feed_deploy/news_jingpai/package/format_nets/old_slot/slot_common
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot_common
rename to feed/feed_deploy/news_jingpai/package/format_nets/old_slot/slot_common
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/to.py b/feed/feed_deploy/news_jingpai/package/format_nets/old_slot/to.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/to.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/old_slot/to.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/reqi_fleet_desc b/feed/feed_deploy/news_jingpai/package/format_nets/reqi_fleet_desc
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/reqi_fleet_desc
rename to feed/feed_deploy/news_jingpai/package/format_nets/reqi_fleet_desc
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_compressor_mf.py b/feed/feed_deploy/news_jingpai/package/format_nets/scripts/xbox_compressor_mf.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_compressor_mf.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/scripts/xbox_compressor_mf.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_decompressor_mf.awk b/feed/feed_deploy/news_jingpai/package/format_nets/scripts/xbox_decompressor_mf.awk
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_decompressor_mf.awk
rename to feed/feed_deploy/news_jingpai/package/format_nets/scripts/xbox_decompressor_mf.awk
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot b/feed/feed_deploy/news_jingpai/package/format_nets/slot/slot
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot
rename to feed/feed_deploy/news_jingpai/package/format_nets/slot/slot
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot_common b/feed/feed_deploy/news_jingpai/package/format_nets/slot/slot_common
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot_common
rename to feed/feed_deploy/news_jingpai/package/format_nets/slot/slot_common
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot b/feed/feed_deploy/news_jingpai/package/format_nets/tmp/slot/slot
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot
rename to feed/feed_deploy/news_jingpai/package/format_nets/tmp/slot/slot
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot_common b/feed/feed_deploy/news_jingpai/package/format_nets/tmp/slot/slot_common
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot_common
rename to feed/feed_deploy/news_jingpai/package/format_nets/tmp/slot/slot_common
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/to.py b/feed/feed_deploy/news_jingpai/package/format_nets/tmp/slot/to.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/to.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/tmp/slot/to.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online.py b/feed/feed_deploy/news_jingpai/package/format_nets/trainer_online.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/trainer_online.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online_local.py b/feed/feed_deploy/news_jingpai/package/format_nets/trainer_online_local.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online_local.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/trainer_online_local.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/util.bak.py b/feed/feed_deploy/news_jingpai/package/format_nets/util.bak.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/util.bak.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/util.bak.py
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/util.py b/feed/feed_deploy/news_jingpai/package/format_nets/util.py
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/util.py
rename to feed/feed_deploy/news_jingpai/package/format_nets/util.py
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/all_slot.dict b/feed/feed_deploy/news_jingpai/package/my_nets/all_slot.dict
new file mode 100644
index 0000000000000000000000000000000000000000..8ad76f38e0ab440344be9c05a902a89c730398bd
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/all_slot.dict
@@ -0,0 +1,409 @@
+6048
+6002
+6145
+6202
+6201
+6121
+6738
+6119
+6146
+6120
+6147
+6122
+6123
+6118
+6142
+6143
+6008
+6148
+6151
+6127
+6144
+6094
+6083
+6952
+6739
+6150
+6109
+6003
+6099
+6149
+6129
+6203
+6153
+6152
+6128
+6106
+6251
+7082
+7515
+6951
+6949
+7080
+6066
+7507
+6186
+6007
+7514
+6125
+7506
+10001
+6006
+7023
+6085
+10000
+6098
+6250
+6110
+6124
+6090
+6082
+6067
+6101
+6004
+6191
+7075
+6948
+6157
+6126
+6188
+7077
+6070
+6111
+6087
+6103
+6107
+6194
+6156
+6005
+6247
+6814
+6158
+7122
+6058
+6189
+7058
+6059
+6115
+7079
+7081
+6833
+7024
+6108
+13342
+13345
+13412
+13343
+13350
+13346
+13409
+6009
+6011
+6012
+6013
+6014
+6015
+6019
+6023
+6024
+6027
+6029
+6031
+6050
+6060
+6068
+6069
+6089
+6095
+6105
+6112
+6130
+6131
+6132
+6134
+6161
+6162
+6163
+6166
+6182
+6183
+6185
+6190
+6212
+6213
+6231
+6233
+6234
+6236
+6238
+6239
+6240
+6241
+6242
+6243
+6244
+6245
+6354
+7002
+7005
+7008
+7010
+7013
+7015
+7019
+7020
+7045
+7046
+7048
+7049
+7052
+7054
+7056
+7064
+7066
+7076
+7078
+7083
+7084
+7085
+7086
+7087
+7088
+7089
+7090
+7099
+7100
+7101
+7102
+7103
+7104
+7105
+7109
+7124
+7126
+7136
+7142
+7143
+7144
+7145
+7146
+7147
+7148
+7150
+7151
+7152
+7153
+7154
+7155
+7156
+7157
+7047
+7050
+6257
+6259
+6260
+6261
+7170
+7185
+7186
+6751
+6755
+6757
+6759
+6760
+6763
+6764
+6765
+6766
+6767
+6768
+6769
+6770
+7502
+7503
+7504
+7505
+7510
+7511
+7512
+7513
+6806
+6807
+6808
+6809
+6810
+6811
+6812
+6813
+6815
+6816
+6817
+6819
+6823
+6828
+6831
+6840
+6845
+6875
+6879
+6881
+6888
+6889
+6947
+6950
+6956
+6957
+6959
+10006
+10008
+10009
+10010
+10011
+10016
+10017
+10018
+10019
+10020
+10021
+10022
+10023
+10024
+10029
+10030
+10031
+10032
+10033
+10034
+10035
+10036
+10037
+10038
+10039
+10040
+10041
+10042
+10044
+10045
+10046
+10051
+10052
+10053
+10054
+10055
+10056
+10057
+10060
+10066
+10069
+6820
+6821
+6822
+13333
+13334
+13335
+13336
+13337
+13338
+13339
+13340
+13341
+13351
+13352
+13353
+13359
+13361
+13362
+13363
+13366
+13367
+13368
+13369
+13370
+13371
+13375
+13376
+5700
+5702
+13400
+13401
+13402
+13403
+13404
+13406
+13407
+13408
+13410
+13417
+13418
+13419
+13420
+13422
+13425
+13427
+13428
+13429
+13430
+13431
+13433
+13434
+13436
+13437
+13326
+13330
+13331
+5717
+13442
+13451
+13452
+13455
+13456
+13457
+13458
+13459
+13460
+13461
+13462
+13463
+13464
+13465
+13466
+13467
+13468
+1104
+1106
+1107
+1108
+1109
+1110
+1111
+1112
+1113
+1114
+1115
+1116
+1117
+1119
+1120
+1121
+1122
+1123
+1124
+1125
+1126
+1127
+1128
+1129
+13812
+13813
+6740
+1490
+32915
+32950
+32952
+32953
+32954
+33077
+33085
+33086
+12345
+23456
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/clear_ssd.sh b/feed/feed_deploy/news_jingpai/package/my_nets/clear_ssd.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a26c21a0f577623e9c9b90d353b0b090ad212d04
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/clear_ssd.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Succeed (status 0) only when the app id consists of exactly four '-'-separated fields.
+function check_appid_valid() {
+    appid="$1"
+    num=$(echo "${appid}" | awk -F '-' '{print NF}')
+    # The test's status is the function's status: 0 for exactly four fields, non-zero
+    # otherwise, including when the count is not a single integer (e.g. multi-line input).
+    [ "${num}" -eq 4 ]
+}
+
+function appid_running_num() {
+    appid="$1"  # status 0 => no process line mentions the app id; 1 => at least one does, or the count failed
+    proc_num=`ps -ef |grep "${appid}"|grep -v grep|wc -l`
+    if [ $? -ne 0 ] || [ "${proc_num}" -ne 0 ];then
+        # never return the raw count: exit statuses wrap mod 256, so 256 processes would read as "idle"
+        return 1
+    fi
+    return 0
+}
+
+# Remove the 'database' directory of every app under the base dir that has no running process.
+work_dir="$1"
+base_dir=`echo "${work_dir}" |awk -F 'app-user-' '{print $1}'`
+# quoted: with an empty base dir find now fails instead of silently scanning the current directory
+database_list=`find "${base_dir}" -type d -name 'database'`
+for element in ${database_list}
+do
+    app_id=`echo "$element"|awk -F 'app-user-' '{print $2}' |awk -F '/' '{print "app-user-"$1}'`
+    # skip paths whose derived id is not a well-formed app id
+    check_appid_valid "${app_id}" || continue
+    # status 0 from appid_running_num means no process mentions this app id
+    if appid_running_num "${app_id}";then
+        echo "remove ${element}"
+        rm -rf "${element}"
+    fi
+done
+
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/config.py b/feed/feed_deploy/news_jingpai/package/my_nets/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..185c68423e84a9b93ef62e00196023b259e48681
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/config.py
@@ -0,0 +1,40 @@
+dataset_type="InMemoryDataset"  # NOTE(review): flat job settings, presumably read by the trainer scripts in this package — confirm
+batch_size=32
+thread_num=12
+shuffle_thread=12
+preload_thread=12
+join_common_thread=16
+update_thread=12
+fs_name="afs://xingtian.afs.baidu.com:9902"
+fs_ugi="mlarch_pro,proisvip"  # NOTE(review): looks like a user,password pair committed in plain text — confirm and move out of the repo
+train_data_path=["afs:/user/feed/mlarch/samplejoin/mondr_shoubai_dnn_master/feasign"]
+init_model_path=""
+days="{20191201..20191231} {20200101..20200131} {20200201..20200228} {20200301..20200331}"  # shell-style brace ranges; presumably expanded by the consumer — TODO confirm
+hours="{0..23}"
+split_interval=5
+split_per_pass=2
+is_data_hourly_placed=False
+save_first_base=False
+output_path="afs:/user/feed/mlarch/model/feed_muye_news_paddle"
+pipe_command="./read_feasign | python/bin/python ins_weight.py | awk -f format_newcate_hotnews.awk | ./parse_feasign all_slot.dict"  # shell pipeline; stages reference ins_weight.py, format_newcate_hotnews.awk and all_slot.dict
+save_xbox_before_update=True
+check_exist_seconds=30
+checkpoint_per_pass=36
+save_delta_frequency=6
+prefetch=True
+write_stdout_frequency=10
+
+need_reqi_changeslot=True
+hdfs_dnn_plugin_path="afs:/user/feed/mlarch/sequence_generator/wuzhihua02/xujiaqi/test_combinejoincommon_0918_amd/20191006/base/dnn_plugin"
+reqi_dnn_plugin_day=20191006
+reqi_dnn_plugin_pass=0
+
+task_name="feed_production_shoubai_video_ctr_fsort_session_cut"
+nodes=119
+node_memory=100000
+mpi_server="yq01-hpc-lvliang01-smart-master.dmop.baidu.com"
+mpi_queue="feed5"
+mpi_priority="very_high"
+smart_client_home="/home/work/xiexionghang/news_paddle_online/smart_client/"  # NOTE(review): absolute path under one user's home directory
+local_hadoop_home="/home/work/xiexionghang/news_paddle_online/hadoop-client/hadoop"  # NOTE(review): absolute path under one user's home directory
+sparse_table_storage="ssd"
diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/feed/feed_deploy/news_jingpai/package/my_nets/data_generate_base.py
similarity index 88%
rename from python/paddle/fluid/incubate/data_generator/__init__.py
rename to feed/feed_deploy/news_jingpai/package/my_nets/data_generate_base.py
index 77c3fc6bf2d4fb75709ba9667860b14b2334f5a1..7abce3bd3bfeea6a442a371b6c40a6c113ce605f 100644
--- a/python/paddle/fluid/incubate/data_generator/__init__.py
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/data_generate_base.py
@@ -15,7 +15,7 @@
 import os
 import sys
 
-__all__ = ['MultiSlotDataGenerator', 'MultiSlotStringDataGenerator']
+__all__ = ['MultiSlotDataGenerator']
 
 
 class DataGenerator(object):
@@ -98,6 +98,7 @@ class DataGenerator(object):
             for sample in batch_iter():
                 sys.stdout.write(self._gen_str(sample))
 
+
     def run_from_stdin(self):
         '''
         This function reads the data row from stdin, parses it with the
@@ -235,51 +236,8 @@ class DataGenerator(object):
         return local_iter
 
 
-# TODO: guru4elephant
-# add more generalized DataGenerator that can adapt user-defined slot
-# for example, [(name, float_list), (name, str_list), (name, int_list)]
-class MultiSlotStringDataGenerator(DataGenerator):
-    def _gen_str(self, line):
-        '''
-        Further processing the output of the process() function rewritten by
-        user, outputting data that can be directly read by the MultiSlotDataFeed,
-        and updating proto_info infomation.
-
-        The input line will be in this format:
-            >>> [(name, [str(feasign), ...]), ...]
-            >>> or ((name, [str(feasign), ...]), ...)
-        The output will be in this format:
-            >>> [ids_num id1 id2 ...] ...
-
-        For example, if the input is like this:
-            >>> [("words", ["1926", "08", "17"]), ("label", ["1"])]
-            >>> or (("words", ["1926", "08", "17"]), ("label", ["1"]))
-        the output will be:
-            >>> 3 1234 2345 3456 1 1
-
-        Args:
-            line(str): the output of the process() function rewritten by user.
-
-        Returns:
-            Return a string data that can be read directly by the MultiSlotDataFeed.
-        '''
-        if not isinstance(line, list) and not isinstance(line, tuple):
-            raise ValueError(
-                "the output of process() must be in list or tuple type"
-                "Examples: [('words', ['1926', '08', '17']), ('label', ['1'])]")
-        output = ""
-        for index, item in enumerate(line):
-            name, elements = item
-            if output:
-                output += " "
-            out_str = []
-            out_str.append(str(len(elements)))
-            out_str.extend(elements)
-            output += " ".join(out_str)
-        return output + "\n"
-
-
 class MultiSlotDataGenerator(DataGenerator):
+
     def _gen_str(self, line):
         '''
         Further processing the output of the process() function rewritten by
@@ -310,14 +268,24 @@ class MultiSlotDataGenerator(DataGenerator):
         '''
         if not isinstance(line, list) and not isinstance(line, tuple):
             raise ValueError(
-                "the output of process() must be in list or tuple type"
-                "Example: [('words', [1926, 08, 17]), ('label', [1])]")
+                "the output of process() must be in list or tuple type")
         output = ""
 
+        for index, item in enumerate(line):
+            name, elements = item
+            if output:
+                output += " "
+            out_str = []
+            out_str.append(str(len(elements)))
+            out_str.extend(elements)
+            output += " ".join(out_str)
+        return output + "\n"
+
         if self._proto_info is None:
             self._proto_info = []
-            for item in line:
+            for index, item in enumerate(line):
                 name, elements = item
+                '''
                 if not isinstance(name, str):
                     raise ValueError("name%s must be in str type" % type(name))
                 if not isinstance(elements, list):
@@ -340,12 +308,20 @@ class MultiSlotDataGenerator(DataGenerator):
                             "the type of element%s must be in int or float" %
                             type(elem))
                     output += " " + str(elem)
+                '''
+                if output:
+                    output += " "
+                out_str = []
+                out_str.append(str(len(elements)))
+                out_str.extend(elements)
+                output += " ".join(out_str)
         else:
             if len(line) != len(self._proto_info):
                 raise ValueError(
                     "the complete field set of two given line are inconsistent.")
             for index, item in enumerate(line):
                 name, elements = item
+                '''
                 if not isinstance(name, str):
                     raise ValueError("name%s must be in str type" % type(name))
                 if not isinstance(elements, list):
@@ -359,9 +335,15 @@ class MultiSlotDataGenerator(DataGenerator):
                     raise ValueError(
                         "the field name of two given line are not match: require<%s>, get<%s>."
                         % (self._proto_info[index][0], name))
+                '''
                 if output:
                     output += " "
-                output += str(len(elements))
+                out_str = []
+                out_str.append(str(len(elements)))
+                #out_str.extend([str(x) for x in elements])
+                out_str.extend(elements)
+                output += " ".join(out_str)
+                '''
                 for elem in elements:
                     if self._proto_info[index][1] != "float":
                         if isinstance(elem, float):
@@ -372,4 +354,5 @@ class MultiSlotDataGenerator(DataGenerator):
                                 "the type of element%s must be in int or float"
                                 % type(elem))
                     output += " " + str(elem)
+                '''
         return output + "\n"
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/fleet_desc_combinejoincommon.prototxt b/feed/feed_deploy/news_jingpai/package/my_nets/fleet_desc_combinejoincommon.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..e29be5c4794e9e288a9578f52ee739f02d4f78df
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/fleet_desc_combinejoincommon.prototxt
@@ -0,0 +1,1466 @@
+server_param {
+  downpour_server_param {
+    downpour_table_param {
+      table_id: 0
+      table_class: "DownpourSparseTable"
+      shard_num: 1950
+      accessor {
+        accessor_class: "DownpourCtrAccessor"
+        sparse_sgd_param {
+          learning_rate: 0.05
+          initial_g2sum: 3.0
+          initial_range: 0.0001
+          weight_bounds: -10.0
+          weight_bounds: 10.0
+        }
+        fea_dim: 11
+        embedx_dim: 8
+        embedx_threshold: 10
+        downpour_accessor_param {
+          nonclk_coeff: 0.1
+          click_coeff: 1
+          base_threshold: 1.5
+          delta_threshold: 0.25
+          delta_keep_days: 16
+          delete_after_unseen_days: 30
+          show_click_decay_rate: 0.98
+          delete_threshold: 0.8
+        }
+        table_accessor_save_param {
+          param: 1
+          converter: "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
+          deconverter: "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
+        }
+        table_accessor_save_param {
+          param: 2
+          converter: "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
+          deconverter: "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
+        }
+      }
+      type: PS_SPARSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {
+      table_id: 1
+      table_class: "DownpourDenseTable"
+      accessor {
+        accessor_class: "DownpourDenseValueAccessor"
+        dense_sgd_param {
+          name: "adam"
+          adam {
+            learning_rate: 5e-06
+            avg_decay_rate: 0.999993
+            ada_decay_rate: 0.9999
+            ada_epsilon: 1e-08
+            mom_decay_rate: 0.99
+          }
+          naive {
+            learning_rate: 0.0002
+          }
+        }
+        fea_dim: 3405365
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {
+      table_id: 2
+      table_class: "DownpourDenseDoubleTable"
+      accessor {
+        accessor_class: "DownpourDenseValueDoubleAccessor"
+        dense_sgd_param {
+          name: "summarydouble"
+          summary {
+            summary_decay_rate: 0.999999
+          }
+        }
+        fea_dim: 16731
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {
+      table_id: 3
+      table_class: "DownpourDenseTable"
+      accessor {
+        accessor_class: "DownpourDenseValueAccessor"
+        dense_sgd_param {
+          name: "adam"
+          adam {
+            learning_rate: 5e-06
+            avg_decay_rate: 0.999993
+            ada_decay_rate: 0.9999
+            ada_epsilon: 1e-08
+            mom_decay_rate: 0.99
+          }
+          naive {
+            learning_rate: 0.0002
+          }
+        }
+        fea_dim: 2072615
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    service_param {
+      server_class: "DownpourBrpcPsServer"
+      client_class: "DownpourBrpcPsClient"
+      service_class: "DownpourPsService"
+      start_server_port: 0
+      server_thread_num: 12
+    }
+  }
+}
+trainer_param {
+  dense_table {
+    table_id: 1
+
+    dense_variable_name: "join_0.w_0"
+    dense_variable_name: "join_0.b_0"
+    dense_variable_name: "join_1.w_0"
+    dense_variable_name: "join_1.b_0"
+    dense_variable_name: "join_2.w_0"
+    dense_variable_name: "join_2.b_0"
+    dense_variable_name: "join_3.w_0"
+    dense_variable_name: "join_3.b_0"
+    dense_variable_name: "join_4.w_0"
+    dense_variable_name: "join_4.b_0"
+    dense_variable_name: "join_5.w_0"
+    dense_variable_name: "join_5.b_0"
+    dense_variable_name: "join_6.w_0"
+    dense_variable_name: "join_6.b_0"
+    dense_variable_name: "join_7.w_0"
+    dense_variable_name: "join_7.b_0"
+
+    dense_variable_name: "common_0.w_0"
+    dense_variable_name: "common_0.b_0"
+    dense_variable_name: "common_1.w_0"
+    dense_variable_name: "common_1.b_0"
+    dense_variable_name: "common_2.w_0"
+    dense_variable_name: "common_2.b_0"
+    dense_variable_name: "common_3.w_0"
+    dense_variable_name: "common_3.b_0"
+    dense_variable_name: "common_4.w_0"
+    dense_variable_name: "common_4.b_0"
+    dense_variable_name: "common_5.w_0"
+    dense_variable_name: "common_5.b_0"
+    dense_variable_name: "common_6.w_0"
+    dense_variable_name: "common_6.b_0"
+    dense_variable_name: "common_7.w_0"
+    dense_variable_name: "common_7.b_0"
+
+    dense_gradient_variable_name: "join_0.w_0@GRAD"
+    dense_gradient_variable_name: "join_0.b_0@GRAD"
+    dense_gradient_variable_name: "join_1.w_0@GRAD"
+    dense_gradient_variable_name: "join_1.b_0@GRAD"
+    dense_gradient_variable_name: "join_2.w_0@GRAD"
+    dense_gradient_variable_name: "join_2.b_0@GRAD"
+    dense_gradient_variable_name: "join_3.w_0@GRAD"
+    dense_gradient_variable_name: "join_3.b_0@GRAD"
+    dense_gradient_variable_name: "join_4.w_0@GRAD"
+    dense_gradient_variable_name: "join_4.b_0@GRAD"
+    dense_gradient_variable_name: "join_5.w_0@GRAD"
+    dense_gradient_variable_name: "join_5.b_0@GRAD"
+    dense_gradient_variable_name: "join_6.w_0@GRAD"
+    dense_gradient_variable_name: "join_6.b_0@GRAD"
+    dense_gradient_variable_name: "join_7.w_0@GRAD"
+    dense_gradient_variable_name: "join_7.b_0@GRAD"
+
+    dense_gradient_variable_name: "common_0.w_0@GRAD"
+    dense_gradient_variable_name: "common_0.b_0@GRAD"
+    dense_gradient_variable_name: "common_1.w_0@GRAD"
+    dense_gradient_variable_name: "common_1.b_0@GRAD"
+    dense_gradient_variable_name: "common_2.w_0@GRAD"
+    dense_gradient_variable_name: "common_2.b_0@GRAD"
+    dense_gradient_variable_name: "common_3.w_0@GRAD"
+    dense_gradient_variable_name: "common_3.b_0@GRAD"
+    dense_gradient_variable_name: "common_4.w_0@GRAD"
+    dense_gradient_variable_name: "common_4.b_0@GRAD"
+    dense_gradient_variable_name: "common_5.w_0@GRAD"
+    dense_gradient_variable_name: "common_5.b_0@GRAD"
+    dense_gradient_variable_name: "common_6.w_0@GRAD"
+    dense_gradient_variable_name: "common_6.b_0@GRAD"
+    dense_gradient_variable_name: "common_7.w_0@GRAD"
+    dense_gradient_variable_name: "common_7.b_0@GRAD"
+  }
+  dense_table {
+    table_id: 2
+    dense_variable_name: "join.batch_size"
+    dense_variable_name: "join.batch_sum"
+    dense_variable_name: "join.batch_square_sum"
+
+    dense_variable_name: "common.batch_size"
+    dense_variable_name: "common.batch_sum"
+    dense_variable_name: "common.batch_square_sum"
+
+    dense_gradient_variable_name: "join.batch_size@GRAD"
+    dense_gradient_variable_name: "join.batch_sum@GRAD"
+    dense_gradient_variable_name: "join.batch_square_sum@GRAD"
+
+    dense_gradient_variable_name: "common.batch_size@GRAD"
+    dense_gradient_variable_name: "common.batch_sum@GRAD"
+    dense_gradient_variable_name: "common.batch_square_sum@GRAD"
+  }
+  dense_table {
+    table_id: 3
+    dense_variable_name: "fc_0.w_0"
+    dense_variable_name: "fc_0.b_0"
+    dense_variable_name: "fc_1.w_0"
+    dense_variable_name: "fc_1.b_0"
+    dense_variable_name: "fc_2.w_0"
+    dense_variable_name: "fc_2.b_0"
+    dense_variable_name: "fc_3.w_0"
+    dense_variable_name: "fc_3.b_0"
+    dense_variable_name: "fc_4.w_0"
+    dense_variable_name: "fc_4.b_0"
+    dense_variable_name: "fc_5.w_0"
+    dense_variable_name: "fc_5.b_0"
+    dense_gradient_variable_name: "fc_0.w_0@GRAD"
+    dense_gradient_variable_name: "fc_0.b_0@GRAD"
+    dense_gradient_variable_name: "fc_1.w_0@GRAD"
+    dense_gradient_variable_name: "fc_1.b_0@GRAD"
+    dense_gradient_variable_name: "fc_2.w_0@GRAD"
+    dense_gradient_variable_name: "fc_2.b_0@GRAD"
+    dense_gradient_variable_name: "fc_3.w_0@GRAD"
+    dense_gradient_variable_name: "fc_3.b_0@GRAD"
+    dense_gradient_variable_name: "fc_4.w_0@GRAD"
+    dense_gradient_variable_name: "fc_4.b_0@GRAD"
+    dense_gradient_variable_name: "fc_5.w_0@GRAD"
+    dense_gradient_variable_name: "fc_5.b_0@GRAD"
+  }
+  sparse_table {
+    table_id: 0
+    slot_key: "6048"
+    slot_key: "6002"
+    slot_key: "6145"
+    slot_key: "6202"
+    slot_key: "6201"
+    slot_key: "6121"
+    slot_key: "6738"
+    slot_key: "6119"
+    slot_key: "6146"
+    slot_key: "6120"
+    slot_key: "6147"
+    slot_key: "6122"
+    slot_key: "6123"
+    slot_key: "6118"
+    slot_key: "6142"
+    slot_key: "6143"
+    slot_key: "6008"
+    slot_key: "6148"
+    slot_key: "6151"
+    slot_key: "6127"
+    slot_key: "6144"
+    slot_key: "6094"
+    slot_key: "6083"
+    slot_key: "6952"
+    slot_key: "6739"
+    slot_key: "6150"
+    slot_key: "6109"
+    slot_key: "6003"
+    slot_key: "6099"
+    slot_key: "6149"
+    slot_key: "6129"
+    slot_key: "6203"
+    slot_key: "6153"
+    slot_key: "6152"
+    slot_key: "6128"
+    slot_key: "6106"
+    slot_key: "6251"
+    slot_key: "7082"
+    slot_key: "7515"
+    slot_key: "6951"
+    slot_key: "6949"
+    slot_key: "7080"
+    slot_key: "6066"
+    slot_key: "7507"
+    slot_key: "6186"
+    slot_key: "6007"
+    slot_key: "7514"
+    slot_key: "6125"
+    slot_key: "7506"
+    slot_key: "10001"
+    slot_key: "6006"
+    slot_key: "7023"
+    slot_key: "6085"
+    slot_key: "10000"
+    slot_key: "6098"
+    slot_key: "6250"
+    slot_key: "6110"
+    slot_key: "6124"
+    slot_key: "6090"
+    slot_key: "6082"
+    slot_key: "6067"
+    slot_key: "6101"
+    slot_key: "6004"
+    slot_key: "6191"
+    slot_key: "7075"
+    slot_key: "6948"
+    slot_key: "6157"
+    slot_key: "6126"
+    slot_key: "6188"
+    slot_key: "7077"
+    slot_key: "6070"
+    slot_key: "6111"
+    slot_key: "6087"
+    slot_key: "6103"
+    slot_key: "6107"
+    slot_key: "6194"
+    slot_key: "6156"
+    slot_key: "6005"
+    slot_key: "6247"
+    slot_key: "6814"
+    slot_key: "6158"
+    slot_key: "7122"
+    slot_key: "6058"
+    slot_key: "6189"
+    slot_key: "7058"
+    slot_key: "6059"
+    slot_key: "6115"
+    slot_key: "7079"
+    slot_key: "7081"
+    slot_key: "6833"
+    slot_key: "7024"
+    slot_key: "6108"
+    slot_key: "13342"
+    slot_key: "13345"
+    slot_key: "13412"
+    slot_key: "13343"
+    slot_key: "13350"
+    slot_key: "13346"
+    slot_key: "13409"
+    slot_key: "6009"
+    slot_key: "6011"
+    slot_key: "6012"
+    slot_key: "6013"
+    slot_key: "6014"
+    slot_key: "6015"
+    slot_key: "6019"
+    slot_key: "6023"
+    slot_key: "6024"
+    slot_key: "6027"
+    slot_key: "6029"
+    slot_key: "6031"
+    slot_key: "6050"
+    slot_key: "6060"
+    slot_key: "6068"
+    slot_key: "6069"
+    slot_key: "6089"
+    slot_key: "6095"
+    slot_key: "6105"
+    slot_key: "6112"
+    slot_key: "6130"
+    slot_key: "6131"
+    slot_key: "6132"
+    slot_key: "6134"
+    slot_key: "6161"
+    slot_key: "6162"
+    slot_key: "6163"
+    slot_key: "6166"
+    slot_key: "6182"
+    slot_key: "6183"
+    slot_key: "6185"
+    slot_key: "6190"
+    slot_key: "6212"
+    slot_key: "6213"
+    slot_key: "6231"
+    slot_key: "6233"
+    slot_key: "6234"
+    slot_key: "6236"
+    slot_key: "6238"
+    slot_key: "6239"
+    slot_key: "6240"
+    slot_key: "6241"
+    slot_key: "6242"
+    slot_key: "6243"
+    slot_key: "6244"
+    slot_key: "6245"
+    slot_key: "6354"
+    slot_key: "7002"
+    slot_key: "7005"
+    slot_key: "7008"
+    slot_key: "7010"
+    slot_key: "7012"
+    slot_key: "7013"
+    slot_key: "7015"
+    slot_key: "7016"
+    slot_key: "7017"
+    slot_key: "7018"
+    slot_key: "7019"
+    slot_key: "7020"
+    slot_key: "7045"
+    slot_key: "7046"
+    slot_key: "7048"
+    slot_key: "7049"
+    slot_key: "7052"
+    slot_key: "7054"
+    slot_key: "7056"
+    slot_key: "7064"
+    slot_key: "7066"
+    slot_key: "7076"
+    slot_key: "7078"
+    slot_key: "7083"
+    slot_key: "7084"
+    slot_key: "7085"
+    slot_key: "7086"
+    slot_key: "7087"
+    slot_key: "7088"
+    slot_key: "7089"
+    slot_key: "7090"
+    slot_key: "7099"
+    slot_key: "7100"
+    slot_key: "7101"
+    slot_key: "7102"
+    slot_key: "7103"
+    slot_key: "7104"
+    slot_key: "7105"
+    slot_key: "7109"
+    slot_key: "7124"
+    slot_key: "7126"
+    slot_key: "7136"
+    slot_key: "7142"
+    slot_key: "7143"
+    slot_key: "7144"
+    slot_key: "7145"
+    slot_key: "7146"
+    slot_key: "7147"
+    slot_key: "7148"
+    slot_key: "7150"
+    slot_key: "7151"
+    slot_key: "7152"
+    slot_key: "7153"
+    slot_key: "7154"
+    slot_key: "7155"
+    slot_key: "7156"
+    slot_key: "7157"
+    slot_key: "7047"
+    slot_key: "7050"
+    slot_key: "6253"
+    slot_key: "6254"
+    slot_key: "6255"
+    slot_key: "6256"
+    slot_key: "6257"
+    slot_key: "6259"
+    slot_key: "6260"
+    slot_key: "6261"
+    slot_key: "7170"
+    slot_key: "7185"
+    slot_key: "7186"
+    slot_key: "6751"
+    slot_key: "6755"
+    slot_key: "6757"
+    slot_key: "6759"
+    slot_key: "6760"
+    slot_key: "6763"
+    slot_key: "6764"
+    slot_key: "6765"
+    slot_key: "6766"
+    slot_key: "6767"
+    slot_key: "6768"
+    slot_key: "6769"
+    slot_key: "6770"
+    slot_key: "7502"
+    slot_key: "7503"
+    slot_key: "7504"
+    slot_key: "7505"
+    slot_key: "7510"
+    slot_key: "7511"
+    slot_key: "7512"
+    slot_key: "7513"
+    slot_key: "6806"
+    slot_key: "6807"
+    slot_key: "6808"
+    slot_key: "6809"
+    slot_key: "6810"
+    slot_key: "6811"
+    slot_key: "6812"
+    slot_key: "6813"
+    slot_key: "6815"
+    slot_key: "6816"
+    slot_key: "6817"
+    slot_key: "6819"
+    slot_key: "6823"
+    slot_key: "6828"
+    slot_key: "6831"
+    slot_key: "6840"
+    slot_key: "6845"
+    slot_key: "6875"
+    slot_key: "6879"
+    slot_key: "6881"
+    slot_key: "6888"
+    slot_key: "6889"
+    slot_key: "6947"
+    slot_key: "6950"
+    slot_key: "6956"
+    slot_key: "6957"
+    slot_key: "6959"
+    slot_key: "10006"
+    slot_key: "10008"
+    slot_key: "10009"
+    slot_key: "10010"
+    slot_key: "10011"
+    slot_key: "10016"
+    slot_key: "10017"
+    slot_key: "10018"
+    slot_key: "10019"
+    slot_key: "10020"
+    slot_key: "10021"
+    slot_key: "10022"
+    slot_key: "10023"
+    slot_key: "10024"
+    slot_key: "10029"
+    slot_key: "10030"
+    slot_key: "10031"
+    slot_key: "10032"
+    slot_key: "10033"
+    slot_key: "10034"
+    slot_key: "10035"
+    slot_key: "10036"
+    slot_key: "10037"
+    slot_key: "10038"
+    slot_key: "10039"
+    slot_key: "10040"
+    slot_key: "10041"
+    slot_key: "10042"
+    slot_key: "10044"
+    slot_key: "10045"
+    slot_key: "10046"
+    slot_key: "10051"
+    slot_key: "10052"
+    slot_key: "10053"
+    slot_key: "10054"
+    slot_key: "10055"
+    slot_key: "10056"
+    slot_key: "10057"
+    slot_key: "10060"
+    slot_key: "10066"
+    slot_key: "10069"
+    slot_key: "6820"
+    slot_key: "6821"
+    slot_key: "6822"
+    slot_key: "13333"
+    slot_key: "13334"
+    slot_key: "13335"
+    slot_key: "13336"
+    slot_key: "13337"
+    slot_key: "13338"
+    slot_key: "13339"
+    slot_key: "13340"
+    slot_key: "13341"
+    slot_key: "13351"
+    slot_key: "13352"
+    slot_key: "13353"
+    slot_key: "13359"
+    slot_key: "13361"
+    slot_key: "13362"
+    slot_key: "13363"
+    slot_key: "13366"
+    slot_key: "13367"
+    slot_key: "13368"
+    slot_key: "13369"
+    slot_key: "13370"
+    slot_key: "13371"
+    slot_key: "13375"
+    slot_key: "13376"
+    slot_key: "5700"
+    slot_key: "5702"
+    slot_key: "13400"
+    slot_key: "13401"
+    slot_key: "13402"
+    slot_key: "13403"
+    slot_key: "13404"
+    slot_key: "13406"
+    slot_key: "13407"
+    slot_key: "13408"
+    slot_key: "13410"
+    slot_key: "13417"
+    slot_key: "13418"
+    slot_key: "13419"
+    slot_key: "13420"
+    slot_key: "13422"
+    slot_key: "13425"
+    slot_key: "13427"
+    slot_key: "13428"
+    slot_key: "13429"
+    slot_key: "13430"
+    slot_key: "13431"
+    slot_key: "13433"
+    slot_key: "13434"
+    slot_key: "13436"
+    slot_key: "13437"
+    slot_key: "13326"
+    slot_key: "13330"
+    slot_key: "13331"
+    slot_key: "5717"
+    slot_key: "13442"
+    slot_key: "13451"
+    slot_key: "13452"
+    slot_key: "13455"
+    slot_key: "13456"
+    slot_key: "13457"
+    slot_key: "13458"
+    slot_key: "13459"
+    slot_key: "13460"
+    slot_key: "13461"
+    slot_key: "13462"
+    slot_key: "13463"
+    slot_key: "13464"
+    slot_key: "13465"
+    slot_key: "13466"
+    slot_key: "13467"
+    slot_key: "13468"
+    slot_key: "1104"
+    slot_key: "1106"
+    slot_key: "1107"
+    slot_key: "1108"
+    slot_key: "1109"
+    slot_key: "1110"
+    slot_key: "1111"
+    slot_key: "1112"
+    slot_key: "1113"
+    slot_key: "1114"
+    slot_key: "1115"
+    slot_key: "1116"
+    slot_key: "1117"
+    slot_key: "1119"
+    slot_key: "1120"
+    slot_key: "1121"
+    slot_key: "1122"
+    slot_key: "1123"
+    slot_key: "1124"
+    slot_key: "1125"
+    slot_key: "1126"
+    slot_key: "1127"
+    slot_key: "1128"
+    slot_key: "1129"
+    slot_key: "13812"
+    slot_key: "13813"
+    slot_key: "6740"
+    slot_key: "1490"
+    slot_key: "1491"
+    slot_value: "embedding_0.tmp_0"
+    slot_value: "embedding_1.tmp_0"
+    slot_value: "embedding_2.tmp_0"
+    slot_value: "embedding_3.tmp_0"
+    slot_value: "embedding_4.tmp_0"
+    slot_value: "embedding_5.tmp_0"
+    slot_value: "embedding_6.tmp_0"
+    slot_value: "embedding_7.tmp_0"
+    slot_value: "embedding_8.tmp_0"
+    slot_value: "embedding_9.tmp_0"
+    slot_value: "embedding_10.tmp_0"
+    slot_value: "embedding_11.tmp_0"
+    slot_value: "embedding_12.tmp_0"
+    slot_value: "embedding_13.tmp_0"
+    slot_value: "embedding_14.tmp_0"
+    slot_value: "embedding_15.tmp_0"
+    slot_value: "embedding_16.tmp_0"
+    slot_value: "embedding_17.tmp_0"
+    slot_value: "embedding_18.tmp_0"
+    slot_value: "embedding_19.tmp_0"
+    slot_value: "embedding_20.tmp_0"
+    slot_value: "embedding_21.tmp_0"
+    slot_value: "embedding_22.tmp_0"
+    slot_value: "embedding_23.tmp_0"
+    slot_value: "embedding_24.tmp_0"
+    slot_value: "embedding_25.tmp_0"
+    slot_value: "embedding_26.tmp_0"
+    slot_value: "embedding_27.tmp_0"
+    slot_value: "embedding_28.tmp_0"
+    slot_value: "embedding_29.tmp_0"
+    slot_value: "embedding_30.tmp_0"
+    slot_value: "embedding_31.tmp_0"
+    slot_value: "embedding_32.tmp_0"
+    slot_value: "embedding_33.tmp_0"
+    slot_value: "embedding_34.tmp_0"
+    slot_value: "embedding_35.tmp_0"
+    slot_value: "embedding_36.tmp_0"
+    slot_value: "embedding_37.tmp_0"
+    slot_value: "embedding_38.tmp_0"
+    slot_value: "embedding_39.tmp_0"
+    slot_value: "embedding_40.tmp_0"
+    slot_value: "embedding_41.tmp_0"
+    slot_value: "embedding_42.tmp_0"
+    slot_value: "embedding_43.tmp_0"
+    slot_value: "embedding_44.tmp_0"
+    slot_value: "embedding_45.tmp_0"
+    slot_value: "embedding_46.tmp_0"
+    slot_value: "embedding_47.tmp_0"
+    slot_value: "embedding_48.tmp_0"
+    slot_value: "embedding_49.tmp_0"
+    slot_value: "embedding_50.tmp_0"
+    slot_value: "embedding_51.tmp_0"
+    slot_value: "embedding_52.tmp_0"
+    slot_value: "embedding_53.tmp_0"
+    slot_value: "embedding_54.tmp_0"
+    slot_value: "embedding_55.tmp_0"
+    slot_value: "embedding_56.tmp_0"
+    slot_value: "embedding_57.tmp_0"
+    slot_value: "embedding_58.tmp_0"
+    slot_value: "embedding_59.tmp_0"
+    slot_value: "embedding_60.tmp_0"
+    slot_value: "embedding_61.tmp_0"
+    slot_value: "embedding_62.tmp_0"
+    slot_value: "embedding_63.tmp_0"
+    slot_value: "embedding_64.tmp_0"
+    slot_value: "embedding_65.tmp_0"
+    slot_value: "embedding_66.tmp_0"
+    slot_value: "embedding_67.tmp_0"
+    slot_value: "embedding_68.tmp_0"
+    slot_value: "embedding_69.tmp_0"
+    slot_value: "embedding_70.tmp_0"
+    slot_value: "embedding_71.tmp_0"
+    slot_value: "embedding_72.tmp_0"
+    slot_value: "embedding_73.tmp_0"
+    slot_value: "embedding_74.tmp_0"
+    slot_value: "embedding_75.tmp_0"
+    slot_value: "embedding_76.tmp_0"
+    slot_value: "embedding_77.tmp_0"
+    slot_value: "embedding_78.tmp_0"
+    slot_value: "embedding_79.tmp_0"
+    slot_value: "embedding_80.tmp_0"
+    slot_value: "embedding_81.tmp_0"
+    slot_value: "embedding_82.tmp_0"
+    slot_value: "embedding_83.tmp_0"
+    slot_value: "embedding_84.tmp_0"
+    slot_value: "embedding_85.tmp_0"
+    slot_value: "embedding_86.tmp_0"
+    slot_value: "embedding_87.tmp_0"
+    slot_value: "embedding_88.tmp_0"
+    slot_value: "embedding_89.tmp_0"
+    slot_value: "embedding_90.tmp_0"
+    slot_value: "embedding_91.tmp_0"
+    slot_value: "embedding_92.tmp_0"
+    slot_value: "embedding_93.tmp_0"
+    slot_value: "embedding_94.tmp_0"
+    slot_value: "embedding_95.tmp_0"
+    slot_value: "embedding_96.tmp_0"
+    slot_value: "embedding_97.tmp_0"
+    slot_value: "embedding_98.tmp_0"
+    slot_value: "embedding_99.tmp_0"
+    slot_value: "embedding_100.tmp_0"
+    slot_value: "embedding_101.tmp_0"
+    slot_value: "embedding_102.tmp_0"
+    slot_value: "embedding_103.tmp_0"
+    slot_value: "embedding_104.tmp_0"
+    slot_value: "embedding_105.tmp_0"
+    slot_value: "embedding_106.tmp_0"
+    slot_value: "embedding_107.tmp_0"
+    slot_value: "embedding_108.tmp_0"
+    slot_value: "embedding_109.tmp_0"
+    slot_value: "embedding_110.tmp_0"
+    slot_value: "embedding_111.tmp_0"
+    slot_value: "embedding_112.tmp_0"
+    slot_value: "embedding_113.tmp_0"
+    slot_value: "embedding_114.tmp_0"
+    slot_value: "embedding_115.tmp_0"
+    slot_value: "embedding_116.tmp_0"
+    slot_value: "embedding_117.tmp_0"
+    slot_value: "embedding_118.tmp_0"
+    slot_value: "embedding_119.tmp_0"
+    slot_value: "embedding_120.tmp_0"
+    slot_value: "embedding_121.tmp_0"
+    slot_value: "embedding_122.tmp_0"
+    slot_value: "embedding_123.tmp_0"
+    slot_value: "embedding_124.tmp_0"
+    slot_value: "embedding_125.tmp_0"
+    slot_value: "embedding_126.tmp_0"
+    slot_value: "embedding_127.tmp_0"
+    slot_value: "embedding_128.tmp_0"
+    slot_value: "embedding_129.tmp_0"
+    slot_value: "embedding_130.tmp_0"
+    slot_value: "embedding_131.tmp_0"
+    slot_value: "embedding_132.tmp_0"
+    slot_value: "embedding_133.tmp_0"
+    slot_value: "embedding_134.tmp_0"
+    slot_value: "embedding_135.tmp_0"
+    slot_value: "embedding_136.tmp_0"
+    slot_value: "embedding_137.tmp_0"
+    slot_value: "embedding_138.tmp_0"
+    slot_value: "embedding_139.tmp_0"
+    slot_value: "embedding_140.tmp_0"
+    slot_value: "embedding_141.tmp_0"
+    slot_value: "embedding_142.tmp_0"
+    slot_value: "embedding_143.tmp_0"
+    slot_value: "embedding_144.tmp_0"
+    slot_value: "embedding_145.tmp_0"
+    slot_value: "embedding_146.tmp_0"
+    slot_value: "embedding_147.tmp_0"
+    slot_value: "embedding_148.tmp_0"
+    slot_value: "embedding_149.tmp_0"
+    slot_value: "embedding_150.tmp_0"
+    slot_value: "embedding_151.tmp_0"
+    slot_value: "embedding_152.tmp_0"
+    slot_value: "embedding_153.tmp_0"
+    slot_value: "embedding_154.tmp_0"
+    slot_value: "embedding_155.tmp_0"
+    slot_value: "embedding_156.tmp_0"
+    slot_value: "embedding_157.tmp_0"
+    slot_value: "embedding_158.tmp_0"
+    slot_value: "embedding_159.tmp_0"
+    slot_value: "embedding_160.tmp_0"
+    slot_value: "embedding_161.tmp_0"
+    slot_value: "embedding_162.tmp_0"
+    slot_value: "embedding_163.tmp_0"
+    slot_value: "embedding_164.tmp_0"
+    slot_value: "embedding_165.tmp_0"
+    slot_value: "embedding_166.tmp_0"
+    slot_value: "embedding_167.tmp_0"
+    slot_value: "embedding_168.tmp_0"
+    slot_value: "embedding_169.tmp_0"
+    slot_value: "embedding_170.tmp_0"
+    slot_value: "embedding_171.tmp_0"
+    slot_value: "embedding_172.tmp_0"
+    slot_value: "embedding_173.tmp_0"
+    slot_value: "embedding_174.tmp_0"
+    slot_value: "embedding_175.tmp_0"
+    slot_value: "embedding_176.tmp_0"
+    slot_value: "embedding_177.tmp_0"
+    slot_value: "embedding_178.tmp_0"
+    slot_value: "embedding_179.tmp_0"
+    slot_value: "embedding_180.tmp_0"
+    slot_value: "embedding_181.tmp_0"
+    slot_value: "embedding_182.tmp_0"
+    slot_value: "embedding_183.tmp_0"
+    slot_value: "embedding_184.tmp_0"
+    slot_value: "embedding_185.tmp_0"
+    slot_value: "embedding_186.tmp_0"
+    slot_value: "embedding_187.tmp_0"
+    slot_value: "embedding_188.tmp_0"
+    slot_value: "embedding_189.tmp_0"
+    slot_value: "embedding_190.tmp_0"
+    slot_value: "embedding_191.tmp_0"
+    slot_value: "embedding_192.tmp_0"
+    slot_value: "embedding_193.tmp_0"
+    slot_value: "embedding_194.tmp_0"
+    slot_value: "embedding_195.tmp_0"
+    slot_value: "embedding_196.tmp_0"
+    slot_value: "embedding_197.tmp_0"
+    slot_value: "embedding_198.tmp_0"
+    slot_value: "embedding_199.tmp_0"
+    slot_value: "embedding_200.tmp_0"
+    slot_value: "embedding_201.tmp_0"
+    slot_value: "embedding_202.tmp_0"
+    slot_value: "embedding_203.tmp_0"
+    slot_value: "embedding_204.tmp_0"
+    slot_value: "embedding_205.tmp_0"
+    slot_value: "embedding_206.tmp_0"
+    slot_value: "embedding_207.tmp_0"
+    slot_value: "embedding_208.tmp_0"
+    slot_value: "embedding_209.tmp_0"
+    slot_value: "embedding_210.tmp_0"
+    slot_value: "embedding_211.tmp_0"
+    slot_value: "embedding_212.tmp_0"
+    slot_value: "embedding_213.tmp_0"
+    slot_value: "embedding_214.tmp_0"
+    slot_value: "embedding_215.tmp_0"
+    slot_value: "embedding_216.tmp_0"
+    slot_value: "embedding_217.tmp_0"
+    slot_value: "embedding_218.tmp_0"
+    slot_value: "embedding_219.tmp_0"
+    slot_value: "embedding_220.tmp_0"
+    slot_value: "embedding_221.tmp_0"
+    slot_value: "embedding_222.tmp_0"
+    slot_value: "embedding_223.tmp_0"
+    slot_value: "embedding_224.tmp_0"
+    slot_value: "embedding_225.tmp_0"
+    slot_value: "embedding_226.tmp_0"
+    slot_value: "embedding_227.tmp_0"
+    slot_value: "embedding_228.tmp_0"
+    slot_value: "embedding_229.tmp_0"
+    slot_value: "embedding_230.tmp_0"
+    slot_value: "embedding_231.tmp_0"
+    slot_value: "embedding_232.tmp_0"
+    slot_value: "embedding_233.tmp_0"
+    slot_value: "embedding_234.tmp_0"
+    slot_value: "embedding_235.tmp_0"
+    slot_value: "embedding_236.tmp_0"
+    slot_value: "embedding_237.tmp_0"
+    slot_value: "embedding_238.tmp_0"
+    slot_value: "embedding_239.tmp_0"
+    slot_value: "embedding_240.tmp_0"
+    slot_value: "embedding_241.tmp_0"
+    slot_value: "embedding_242.tmp_0"
+    slot_value: "embedding_243.tmp_0"
+    slot_value: "embedding_244.tmp_0"
+    slot_value: "embedding_245.tmp_0"
+    slot_value: "embedding_246.tmp_0"
+    slot_value: "embedding_247.tmp_0"
+    slot_value: "embedding_248.tmp_0"
+    slot_value: "embedding_249.tmp_0"
+    slot_value: "embedding_250.tmp_0"
+    slot_value: "embedding_251.tmp_0"
+    slot_value: "embedding_252.tmp_0"
+    slot_value: "embedding_253.tmp_0"
+    slot_value: "embedding_254.tmp_0"
+    slot_value: "embedding_255.tmp_0"
+    slot_value: "embedding_256.tmp_0"
+    slot_value: "embedding_257.tmp_0"
+    slot_value: "embedding_258.tmp_0"
+    slot_value: "embedding_259.tmp_0"
+    slot_value: "embedding_260.tmp_0"
+    slot_value: "embedding_261.tmp_0"
+    slot_value: "embedding_262.tmp_0"
+    slot_value: "embedding_263.tmp_0"
+    slot_value: "embedding_264.tmp_0"
+    slot_value: "embedding_265.tmp_0"
+    slot_value: "embedding_266.tmp_0"
+    slot_value: "embedding_267.tmp_0"
+    slot_value: "embedding_268.tmp_0"
+    slot_value: "embedding_269.tmp_0"
+    slot_value: "embedding_270.tmp_0"
+    slot_value: "embedding_271.tmp_0"
+    slot_value: "embedding_272.tmp_0"
+    slot_value: "embedding_273.tmp_0"
+    slot_value: "embedding_274.tmp_0"
+    slot_value: "embedding_275.tmp_0"
+    slot_value: "embedding_276.tmp_0"
+    slot_value: "embedding_277.tmp_0"
+    slot_value: "embedding_278.tmp_0"
+    slot_value: "embedding_279.tmp_0"
+    slot_value: "embedding_280.tmp_0"
+    slot_value: "embedding_281.tmp_0"
+    slot_value: "embedding_282.tmp_0"
+    slot_value: "embedding_283.tmp_0"
+    slot_value: "embedding_284.tmp_0"
+    slot_value: "embedding_285.tmp_0"
+    slot_value: "embedding_286.tmp_0"
+    slot_value: "embedding_287.tmp_0"
+    slot_value: "embedding_288.tmp_0"
+    slot_value: "embedding_289.tmp_0"
+    slot_value: "embedding_290.tmp_0"
+    slot_value: "embedding_291.tmp_0"
+    slot_value: "embedding_292.tmp_0"
+    slot_value: "embedding_293.tmp_0"
+    slot_value: "embedding_294.tmp_0"
+    slot_value: "embedding_295.tmp_0"
+    slot_value: "embedding_296.tmp_0"
+    slot_value: "embedding_297.tmp_0"
+    slot_value: "embedding_298.tmp_0"
+    slot_value: "embedding_299.tmp_0"
+    slot_value: "embedding_300.tmp_0"
+    slot_value: "embedding_301.tmp_0"
+    slot_value: "embedding_302.tmp_0"
+    slot_value: "embedding_303.tmp_0"
+    slot_value: "embedding_304.tmp_0"
+    slot_value: "embedding_305.tmp_0"
+    slot_value: "embedding_306.tmp_0"
+    slot_value: "embedding_307.tmp_0"
+    slot_value: "embedding_308.tmp_0"
+    slot_value: "embedding_309.tmp_0"
+    slot_value: "embedding_310.tmp_0"
+    slot_value: "embedding_311.tmp_0"
+    slot_value: "embedding_312.tmp_0"
+    slot_value: "embedding_313.tmp_0"
+    slot_value: "embedding_314.tmp_0"
+    slot_value: "embedding_315.tmp_0"
+    slot_value: "embedding_316.tmp_0"
+    slot_value: "embedding_317.tmp_0"
+    slot_value: "embedding_318.tmp_0"
+    slot_value: "embedding_319.tmp_0"
+    slot_value: "embedding_320.tmp_0"
+    slot_value: "embedding_321.tmp_0"
+    slot_value: "embedding_322.tmp_0"
+    slot_value: "embedding_323.tmp_0"
+    slot_value: "embedding_324.tmp_0"
+    slot_value: "embedding_325.tmp_0"
+    slot_value: "embedding_326.tmp_0"
+    slot_value: "embedding_327.tmp_0"
+    slot_value: "embedding_328.tmp_0"
+    slot_value: "embedding_329.tmp_0"
+    slot_value: "embedding_330.tmp_0"
+    slot_value: "embedding_331.tmp_0"
+    slot_value: "embedding_332.tmp_0"
+    slot_value: "embedding_333.tmp_0"
+    slot_value: "embedding_334.tmp_0"
+    slot_value: "embedding_335.tmp_0"
+    slot_value: "embedding_336.tmp_0"
+    slot_value: "embedding_337.tmp_0"
+    slot_value: "embedding_338.tmp_0"
+    slot_value: "embedding_339.tmp_0"
+    slot_value: "embedding_340.tmp_0"
+    slot_value: "embedding_341.tmp_0"
+    slot_value: "embedding_342.tmp_0"
+    slot_value: "embedding_343.tmp_0"
+    slot_value: "embedding_344.tmp_0"
+    slot_value: "embedding_345.tmp_0"
+    slot_value: "embedding_346.tmp_0"
+    slot_value: "embedding_347.tmp_0"
+    slot_value: "embedding_348.tmp_0"
+    slot_value: "embedding_349.tmp_0"
+    slot_value: "embedding_350.tmp_0"
+    slot_value: "embedding_351.tmp_0"
+    slot_value: "embedding_352.tmp_0"
+    slot_value: "embedding_353.tmp_0"
+    slot_value: "embedding_354.tmp_0"
+    slot_value: "embedding_355.tmp_0"
+    slot_value: "embedding_356.tmp_0"
+    slot_value: "embedding_357.tmp_0"
+    slot_value: "embedding_358.tmp_0"
+    slot_value: "embedding_359.tmp_0"
+    slot_value: "embedding_360.tmp_0"
+    slot_value: "embedding_361.tmp_0"
+    slot_value: "embedding_362.tmp_0"
+    slot_value: "embedding_363.tmp_0"
+    slot_value: "embedding_364.tmp_0"
+    slot_value: "embedding_365.tmp_0"
+    slot_value: "embedding_366.tmp_0"
+    slot_value: "embedding_367.tmp_0"
+    slot_value: "embedding_368.tmp_0"
+    slot_value: "embedding_369.tmp_0"
+    slot_value: "embedding_370.tmp_0"
+    slot_value: "embedding_371.tmp_0"
+    slot_value: "embedding_372.tmp_0"
+    slot_value: "embedding_373.tmp_0"
+    slot_value: "embedding_374.tmp_0"
+    slot_value: "embedding_375.tmp_0"
+    slot_value: "embedding_376.tmp_0"
+    slot_value: "embedding_377.tmp_0"
+    slot_value: "embedding_378.tmp_0"
+    slot_value: "embedding_379.tmp_0"
+    slot_value: "embedding_380.tmp_0"
+    slot_value: "embedding_381.tmp_0"
+    slot_value: "embedding_382.tmp_0"
+    slot_value: "embedding_383.tmp_0"
+    slot_value: "embedding_384.tmp_0"
+    slot_value: "embedding_385.tmp_0"
+    slot_value: "embedding_386.tmp_0"
+    slot_value: "embedding_387.tmp_0"
+    slot_value: "embedding_388.tmp_0"
+    slot_value: "embedding_389.tmp_0"
+    slot_value: "embedding_390.tmp_0"
+    slot_value: "embedding_391.tmp_0"
+    slot_value: "embedding_392.tmp_0"
+    slot_value: "embedding_393.tmp_0"
+    slot_value: "embedding_394.tmp_0"
+    slot_value: "embedding_395.tmp_0"
+    slot_value: "embedding_396.tmp_0"
+    slot_value: "embedding_397.tmp_0"
+    slot_value: "embedding_398.tmp_0"
+    slot_value: "embedding_399.tmp_0"
+    slot_value: "embedding_400.tmp_0"
+    slot_value: "embedding_401.tmp_0"
+    slot_value: "embedding_402.tmp_0"
+    slot_value: "embedding_403.tmp_0"
+    slot_value: "embedding_404.tmp_0"
+    slot_value: "embedding_405.tmp_0"
+    slot_value: "embedding_406.tmp_0"
+    slot_value: "embedding_407.tmp_0"
+    slot_gradient: "embedding_0.tmp_0@GRAD"
+    slot_gradient: "embedding_1.tmp_0@GRAD"
+    slot_gradient: "embedding_2.tmp_0@GRAD"
+    slot_gradient: "embedding_3.tmp_0@GRAD"
+    slot_gradient: "embedding_4.tmp_0@GRAD"
+    slot_gradient: "embedding_5.tmp_0@GRAD"
+    slot_gradient: "embedding_6.tmp_0@GRAD"
+    slot_gradient: "embedding_7.tmp_0@GRAD"
+    slot_gradient: "embedding_8.tmp_0@GRAD"
+    slot_gradient: "embedding_9.tmp_0@GRAD"
+    slot_gradient: "embedding_10.tmp_0@GRAD"
+    slot_gradient: "embedding_11.tmp_0@GRAD"
+    slot_gradient: "embedding_12.tmp_0@GRAD"
+    slot_gradient: "embedding_13.tmp_0@GRAD"
+    slot_gradient: "embedding_14.tmp_0@GRAD"
+    slot_gradient: "embedding_15.tmp_0@GRAD"
+    slot_gradient: "embedding_16.tmp_0@GRAD"
+    slot_gradient: "embedding_17.tmp_0@GRAD"
+    slot_gradient: "embedding_18.tmp_0@GRAD"
+    slot_gradient: "embedding_19.tmp_0@GRAD"
+    slot_gradient: "embedding_20.tmp_0@GRAD"
+    slot_gradient: "embedding_21.tmp_0@GRAD"
+    slot_gradient: "embedding_22.tmp_0@GRAD"
+    slot_gradient: "embedding_23.tmp_0@GRAD"
+    slot_gradient: "embedding_24.tmp_0@GRAD"
+    slot_gradient: "embedding_25.tmp_0@GRAD"
+    slot_gradient: "embedding_26.tmp_0@GRAD"
+    slot_gradient: "embedding_27.tmp_0@GRAD"
+    slot_gradient: "embedding_28.tmp_0@GRAD"
+    slot_gradient: "embedding_29.tmp_0@GRAD"
+    slot_gradient: "embedding_30.tmp_0@GRAD"
+    slot_gradient: "embedding_31.tmp_0@GRAD"
+    slot_gradient: "embedding_32.tmp_0@GRAD"
+    slot_gradient: "embedding_33.tmp_0@GRAD"
+    slot_gradient: "embedding_34.tmp_0@GRAD"
+    slot_gradient: "embedding_35.tmp_0@GRAD"
+    slot_gradient: "embedding_36.tmp_0@GRAD"
+    slot_gradient: "embedding_37.tmp_0@GRAD"
+    slot_gradient: "embedding_38.tmp_0@GRAD"
+    slot_gradient: "embedding_39.tmp_0@GRAD"
+    slot_gradient: "embedding_40.tmp_0@GRAD"
+    slot_gradient: "embedding_41.tmp_0@GRAD"
+    slot_gradient: "embedding_42.tmp_0@GRAD"
+    slot_gradient: "embedding_43.tmp_0@GRAD"
+    slot_gradient: "embedding_44.tmp_0@GRAD"
+    slot_gradient: "embedding_45.tmp_0@GRAD"
+    slot_gradient: "embedding_46.tmp_0@GRAD"
+    slot_gradient: "embedding_47.tmp_0@GRAD"
+    slot_gradient: "embedding_48.tmp_0@GRAD"
+    slot_gradient: "embedding_49.tmp_0@GRAD"
+    slot_gradient: "embedding_50.tmp_0@GRAD"
+    slot_gradient: "embedding_51.tmp_0@GRAD"
+    slot_gradient: "embedding_52.tmp_0@GRAD"
+    slot_gradient: "embedding_53.tmp_0@GRAD"
+    slot_gradient: "embedding_54.tmp_0@GRAD"
+    slot_gradient: "embedding_55.tmp_0@GRAD"
+    slot_gradient: "embedding_56.tmp_0@GRAD"
+    slot_gradient: "embedding_57.tmp_0@GRAD"
+    slot_gradient: "embedding_58.tmp_0@GRAD"
+    slot_gradient: "embedding_59.tmp_0@GRAD"
+    slot_gradient: "embedding_60.tmp_0@GRAD"
+    slot_gradient: "embedding_61.tmp_0@GRAD"
+    slot_gradient: "embedding_62.tmp_0@GRAD"
+    slot_gradient: "embedding_63.tmp_0@GRAD"
+    slot_gradient: "embedding_64.tmp_0@GRAD"
+    slot_gradient: "embedding_65.tmp_0@GRAD"
+    slot_gradient: "embedding_66.tmp_0@GRAD"
+    slot_gradient: "embedding_67.tmp_0@GRAD"
+    slot_gradient: "embedding_68.tmp_0@GRAD"
+    slot_gradient: "embedding_69.tmp_0@GRAD"
+    slot_gradient: "embedding_70.tmp_0@GRAD"
+    slot_gradient: "embedding_71.tmp_0@GRAD"
+    slot_gradient: "embedding_72.tmp_0@GRAD"
+    slot_gradient: "embedding_73.tmp_0@GRAD"
+    slot_gradient: "embedding_74.tmp_0@GRAD"
+    slot_gradient: "embedding_75.tmp_0@GRAD"
+    slot_gradient: "embedding_76.tmp_0@GRAD"
+    slot_gradient: "embedding_77.tmp_0@GRAD"
+    slot_gradient: "embedding_78.tmp_0@GRAD"
+    slot_gradient: "embedding_79.tmp_0@GRAD"
+    slot_gradient: "embedding_80.tmp_0@GRAD"
+    slot_gradient: "embedding_81.tmp_0@GRAD"
+    slot_gradient: "embedding_82.tmp_0@GRAD"
+    slot_gradient: "embedding_83.tmp_0@GRAD"
+    slot_gradient: "embedding_84.tmp_0@GRAD"
+    slot_gradient: "embedding_85.tmp_0@GRAD"
+    slot_gradient: "embedding_86.tmp_0@GRAD"
+    slot_gradient: "embedding_87.tmp_0@GRAD"
+    slot_gradient: "embedding_88.tmp_0@GRAD"
+    slot_gradient: "embedding_89.tmp_0@GRAD"
+    slot_gradient: "embedding_90.tmp_0@GRAD"
+    slot_gradient: "embedding_91.tmp_0@GRAD"
+    slot_gradient: "embedding_92.tmp_0@GRAD"
+    slot_gradient: "embedding_93.tmp_0@GRAD"
+    slot_gradient: "embedding_94.tmp_0@GRAD"
+    slot_gradient: "embedding_95.tmp_0@GRAD"
+    slot_gradient: "embedding_96.tmp_0@GRAD"
+    slot_gradient: "embedding_97.tmp_0@GRAD"
+    slot_gradient: "embedding_98.tmp_0@GRAD"
+    slot_gradient: "embedding_99.tmp_0@GRAD"
+    slot_gradient: "embedding_100.tmp_0@GRAD"
+    slot_gradient: "embedding_101.tmp_0@GRAD"
+    slot_gradient: "embedding_102.tmp_0@GRAD"
+    slot_gradient: "embedding_103.tmp_0@GRAD"
+    slot_gradient: "embedding_104.tmp_0@GRAD"
+    slot_gradient: "embedding_105.tmp_0@GRAD"
+    slot_gradient: "embedding_106.tmp_0@GRAD"
+    slot_gradient: "embedding_107.tmp_0@GRAD"
+    slot_gradient: "embedding_108.tmp_0@GRAD"
+    slot_gradient: "embedding_109.tmp_0@GRAD"
+    slot_gradient: "embedding_110.tmp_0@GRAD"
+    slot_gradient: "embedding_111.tmp_0@GRAD"
+    slot_gradient: "embedding_112.tmp_0@GRAD"
+    slot_gradient: "embedding_113.tmp_0@GRAD"
+    slot_gradient: "embedding_114.tmp_0@GRAD"
+    slot_gradient: "embedding_115.tmp_0@GRAD"
+    slot_gradient: "embedding_116.tmp_0@GRAD"
+    slot_gradient: "embedding_117.tmp_0@GRAD"
+    slot_gradient: "embedding_118.tmp_0@GRAD"
+    slot_gradient: "embedding_119.tmp_0@GRAD"
+    slot_gradient: "embedding_120.tmp_0@GRAD"
+    slot_gradient: "embedding_121.tmp_0@GRAD"
+    slot_gradient: "embedding_122.tmp_0@GRAD"
+    slot_gradient: "embedding_123.tmp_0@GRAD"
+    slot_gradient: "embedding_124.tmp_0@GRAD"
+    slot_gradient: "embedding_125.tmp_0@GRAD"
+    slot_gradient: "embedding_126.tmp_0@GRAD"
+    slot_gradient: "embedding_127.tmp_0@GRAD"
+    slot_gradient: "embedding_128.tmp_0@GRAD"
+    slot_gradient: "embedding_129.tmp_0@GRAD"
+    slot_gradient: "embedding_130.tmp_0@GRAD"
+    slot_gradient: "embedding_131.tmp_0@GRAD"
+    slot_gradient: "embedding_132.tmp_0@GRAD"
+    slot_gradient: "embedding_133.tmp_0@GRAD"
+    slot_gradient: "embedding_134.tmp_0@GRAD"
+    slot_gradient: "embedding_135.tmp_0@GRAD"
+    slot_gradient: "embedding_136.tmp_0@GRAD"
+    slot_gradient: "embedding_137.tmp_0@GRAD"
+    slot_gradient: "embedding_138.tmp_0@GRAD"
+    slot_gradient: "embedding_139.tmp_0@GRAD"
+    slot_gradient: "embedding_140.tmp_0@GRAD"
+    slot_gradient: "embedding_141.tmp_0@GRAD"
+    slot_gradient: "embedding_142.tmp_0@GRAD"
+    slot_gradient: "embedding_143.tmp_0@GRAD"
+    slot_gradient: "embedding_144.tmp_0@GRAD"
+    slot_gradient: "embedding_145.tmp_0@GRAD"
+    slot_gradient: "embedding_146.tmp_0@GRAD"
+    slot_gradient: "embedding_147.tmp_0@GRAD"
+    slot_gradient: "embedding_148.tmp_0@GRAD"
+    slot_gradient: "embedding_149.tmp_0@GRAD"
+    slot_gradient: "embedding_150.tmp_0@GRAD"
+    slot_gradient: "embedding_151.tmp_0@GRAD"
+    slot_gradient: "embedding_152.tmp_0@GRAD"
+    slot_gradient: "embedding_153.tmp_0@GRAD"
+    slot_gradient: "embedding_154.tmp_0@GRAD"
+    slot_gradient: "embedding_155.tmp_0@GRAD"
+    slot_gradient: "embedding_156.tmp_0@GRAD"
+    slot_gradient: "embedding_157.tmp_0@GRAD"
+    slot_gradient: "embedding_158.tmp_0@GRAD"
+    slot_gradient: "embedding_159.tmp_0@GRAD"
+    slot_gradient: "embedding_160.tmp_0@GRAD"
+    slot_gradient: "embedding_161.tmp_0@GRAD"
+    slot_gradient: "embedding_162.tmp_0@GRAD"
+    slot_gradient: "embedding_163.tmp_0@GRAD"
+    slot_gradient: "embedding_164.tmp_0@GRAD"
+    slot_gradient: "embedding_165.tmp_0@GRAD"
+    slot_gradient: "embedding_166.tmp_0@GRAD"
+    slot_gradient: "embedding_167.tmp_0@GRAD"
+    slot_gradient: "embedding_168.tmp_0@GRAD"
+    slot_gradient: "embedding_169.tmp_0@GRAD"
+    slot_gradient: "embedding_170.tmp_0@GRAD"
+    slot_gradient: "embedding_171.tmp_0@GRAD"
+    slot_gradient: "embedding_172.tmp_0@GRAD"
+    slot_gradient: "embedding_173.tmp_0@GRAD"
+    slot_gradient: "embedding_174.tmp_0@GRAD"
+    slot_gradient: "embedding_175.tmp_0@GRAD"
+    slot_gradient: "embedding_176.tmp_0@GRAD"
+    slot_gradient: "embedding_177.tmp_0@GRAD"
+    slot_gradient: "embedding_178.tmp_0@GRAD"
+    slot_gradient: "embedding_179.tmp_0@GRAD"
+    slot_gradient: "embedding_180.tmp_0@GRAD"
+    slot_gradient: "embedding_181.tmp_0@GRAD"
+    slot_gradient: "embedding_182.tmp_0@GRAD"
+    slot_gradient: "embedding_183.tmp_0@GRAD"
+    slot_gradient: "embedding_184.tmp_0@GRAD"
+    slot_gradient: "embedding_185.tmp_0@GRAD"
+    slot_gradient: "embedding_186.tmp_0@GRAD"
+    slot_gradient: "embedding_187.tmp_0@GRAD"
+    slot_gradient: "embedding_188.tmp_0@GRAD"
+    slot_gradient: "embedding_189.tmp_0@GRAD"
+    slot_gradient: "embedding_190.tmp_0@GRAD"
+    slot_gradient: "embedding_191.tmp_0@GRAD"
+    slot_gradient: "embedding_192.tmp_0@GRAD"
+    slot_gradient: "embedding_193.tmp_0@GRAD"
+    slot_gradient: "embedding_194.tmp_0@GRAD"
+    slot_gradient: "embedding_195.tmp_0@GRAD"
+    slot_gradient: "embedding_196.tmp_0@GRAD"
+    slot_gradient: "embedding_197.tmp_0@GRAD"
+    slot_gradient: "embedding_198.tmp_0@GRAD"
+    slot_gradient: "embedding_199.tmp_0@GRAD"
+    slot_gradient: "embedding_200.tmp_0@GRAD"
+    slot_gradient: "embedding_201.tmp_0@GRAD"
+    slot_gradient: "embedding_202.tmp_0@GRAD"
+    slot_gradient: "embedding_203.tmp_0@GRAD"
+    slot_gradient: "embedding_204.tmp_0@GRAD"
+    slot_gradient: "embedding_205.tmp_0@GRAD"
+    slot_gradient: "embedding_206.tmp_0@GRAD"
+    slot_gradient: "embedding_207.tmp_0@GRAD"
+    slot_gradient: "embedding_208.tmp_0@GRAD"
+    slot_gradient: "embedding_209.tmp_0@GRAD"
+    slot_gradient: "embedding_210.tmp_0@GRAD"
+    slot_gradient: "embedding_211.tmp_0@GRAD"
+    slot_gradient: "embedding_212.tmp_0@GRAD"
+    slot_gradient: "embedding_213.tmp_0@GRAD"
+    slot_gradient: "embedding_214.tmp_0@GRAD"
+    slot_gradient: "embedding_215.tmp_0@GRAD"
+    slot_gradient: "embedding_216.tmp_0@GRAD"
+    slot_gradient: "embedding_217.tmp_0@GRAD"
+    slot_gradient: "embedding_218.tmp_0@GRAD"
+    slot_gradient: "embedding_219.tmp_0@GRAD"
+    slot_gradient: "embedding_220.tmp_0@GRAD"
+    slot_gradient: "embedding_221.tmp_0@GRAD"
+    slot_gradient: "embedding_222.tmp_0@GRAD"
+    slot_gradient: "embedding_223.tmp_0@GRAD"
+    slot_gradient: "embedding_224.tmp_0@GRAD"
+    slot_gradient: "embedding_225.tmp_0@GRAD"
+    slot_gradient: "embedding_226.tmp_0@GRAD"
+    slot_gradient: "embedding_227.tmp_0@GRAD"
+    slot_gradient: "embedding_228.tmp_0@GRAD"
+    slot_gradient: "embedding_229.tmp_0@GRAD"
+    slot_gradient: "embedding_230.tmp_0@GRAD"
+    slot_gradient: "embedding_231.tmp_0@GRAD"
+    slot_gradient: "embedding_232.tmp_0@GRAD"
+    slot_gradient: "embedding_233.tmp_0@GRAD"
+    slot_gradient: "embedding_234.tmp_0@GRAD"
+    slot_gradient: "embedding_235.tmp_0@GRAD"
+    slot_gradient: "embedding_236.tmp_0@GRAD"
+    slot_gradient: "embedding_237.tmp_0@GRAD"
+    slot_gradient: "embedding_238.tmp_0@GRAD"
+    slot_gradient: "embedding_239.tmp_0@GRAD"
+    slot_gradient: "embedding_240.tmp_0@GRAD"
+    slot_gradient: "embedding_241.tmp_0@GRAD"
+    slot_gradient: "embedding_242.tmp_0@GRAD"
+    slot_gradient: "embedding_243.tmp_0@GRAD"
+    slot_gradient: "embedding_244.tmp_0@GRAD"
+    slot_gradient: "embedding_245.tmp_0@GRAD"
+    slot_gradient: "embedding_246.tmp_0@GRAD"
+    slot_gradient: "embedding_247.tmp_0@GRAD"
+    slot_gradient: "embedding_248.tmp_0@GRAD"
+    slot_gradient: "embedding_249.tmp_0@GRAD"
+    slot_gradient: "embedding_250.tmp_0@GRAD"
+    slot_gradient: "embedding_251.tmp_0@GRAD"
+    slot_gradient: "embedding_252.tmp_0@GRAD"
+    slot_gradient: "embedding_253.tmp_0@GRAD"
+    slot_gradient: "embedding_254.tmp_0@GRAD"
+    slot_gradient: "embedding_255.tmp_0@GRAD"
+    slot_gradient: "embedding_256.tmp_0@GRAD"
+    slot_gradient: "embedding_257.tmp_0@GRAD"
+    slot_gradient: "embedding_258.tmp_0@GRAD"
+    slot_gradient: "embedding_259.tmp_0@GRAD"
+    slot_gradient: "embedding_260.tmp_0@GRAD"
+    slot_gradient: "embedding_261.tmp_0@GRAD"
+    slot_gradient: "embedding_262.tmp_0@GRAD"
+    slot_gradient: "embedding_263.tmp_0@GRAD"
+    slot_gradient: "embedding_264.tmp_0@GRAD"
+    slot_gradient: "embedding_265.tmp_0@GRAD"
+    slot_gradient: "embedding_266.tmp_0@GRAD"
+    slot_gradient: "embedding_267.tmp_0@GRAD"
+    slot_gradient: "embedding_268.tmp_0@GRAD"
+    slot_gradient: "embedding_269.tmp_0@GRAD"
+    slot_gradient: "embedding_270.tmp_0@GRAD"
+    slot_gradient: "embedding_271.tmp_0@GRAD"
+    slot_gradient: "embedding_272.tmp_0@GRAD"
+    slot_gradient: "embedding_273.tmp_0@GRAD"
+    slot_gradient: "embedding_274.tmp_0@GRAD"
+    slot_gradient: "embedding_275.tmp_0@GRAD"
+    slot_gradient: "embedding_276.tmp_0@GRAD"
+    slot_gradient: "embedding_277.tmp_0@GRAD"
+    slot_gradient: "embedding_278.tmp_0@GRAD"
+    slot_gradient: "embedding_279.tmp_0@GRAD"
+    slot_gradient: "embedding_280.tmp_0@GRAD"
+    slot_gradient: "embedding_281.tmp_0@GRAD"
+    slot_gradient: "embedding_282.tmp_0@GRAD"
+    slot_gradient: "embedding_283.tmp_0@GRAD"
+    slot_gradient: "embedding_284.tmp_0@GRAD"
+    slot_gradient: "embedding_285.tmp_0@GRAD"
+    slot_gradient: "embedding_286.tmp_0@GRAD"
+    slot_gradient: "embedding_287.tmp_0@GRAD"
+    slot_gradient: "embedding_288.tmp_0@GRAD"
+    slot_gradient: "embedding_289.tmp_0@GRAD"
+    slot_gradient: "embedding_290.tmp_0@GRAD"
+    slot_gradient: "embedding_291.tmp_0@GRAD"
+    slot_gradient: "embedding_292.tmp_0@GRAD"
+    slot_gradient: "embedding_293.tmp_0@GRAD"
+    slot_gradient: "embedding_294.tmp_0@GRAD"
+    slot_gradient: "embedding_295.tmp_0@GRAD"
+    slot_gradient: "embedding_296.tmp_0@GRAD"
+    slot_gradient: "embedding_297.tmp_0@GRAD"
+    slot_gradient: "embedding_298.tmp_0@GRAD"
+    slot_gradient: "embedding_299.tmp_0@GRAD"
+    slot_gradient: "embedding_300.tmp_0@GRAD"
+    slot_gradient: "embedding_301.tmp_0@GRAD"
+    slot_gradient: "embedding_302.tmp_0@GRAD"
+    slot_gradient: "embedding_303.tmp_0@GRAD"
+    slot_gradient: "embedding_304.tmp_0@GRAD"
+    slot_gradient: "embedding_305.tmp_0@GRAD"
+    slot_gradient: "embedding_306.tmp_0@GRAD"
+    slot_gradient: "embedding_307.tmp_0@GRAD"
+    slot_gradient: "embedding_308.tmp_0@GRAD"
+    slot_gradient: "embedding_309.tmp_0@GRAD"
+    slot_gradient: "embedding_310.tmp_0@GRAD"
+    slot_gradient: "embedding_311.tmp_0@GRAD"
+    slot_gradient: "embedding_312.tmp_0@GRAD"
+    slot_gradient: "embedding_313.tmp_0@GRAD"
+    slot_gradient: "embedding_314.tmp_0@GRAD"
+    slot_gradient: "embedding_315.tmp_0@GRAD"
+    slot_gradient: "embedding_316.tmp_0@GRAD"
+    slot_gradient: "embedding_317.tmp_0@GRAD"
+    slot_gradient: "embedding_318.tmp_0@GRAD"
+    slot_gradient: "embedding_319.tmp_0@GRAD"
+    slot_gradient: "embedding_320.tmp_0@GRAD"
+    slot_gradient: "embedding_321.tmp_0@GRAD"
+    slot_gradient: "embedding_322.tmp_0@GRAD"
+    slot_gradient: "embedding_323.tmp_0@GRAD"
+    slot_gradient: "embedding_324.tmp_0@GRAD"
+    slot_gradient: "embedding_325.tmp_0@GRAD"
+    slot_gradient: "embedding_326.tmp_0@GRAD"
+    slot_gradient: "embedding_327.tmp_0@GRAD"
+    slot_gradient: "embedding_328.tmp_0@GRAD"
+    slot_gradient: "embedding_329.tmp_0@GRAD"
+    slot_gradient: "embedding_330.tmp_0@GRAD"
+    slot_gradient: "embedding_331.tmp_0@GRAD"
+    slot_gradient: "embedding_332.tmp_0@GRAD"
+    slot_gradient: "embedding_333.tmp_0@GRAD"
+    slot_gradient: "embedding_334.tmp_0@GRAD"
+    slot_gradient: "embedding_335.tmp_0@GRAD"
+    slot_gradient: "embedding_336.tmp_0@GRAD"
+    slot_gradient: "embedding_337.tmp_0@GRAD"
+    slot_gradient: "embedding_338.tmp_0@GRAD"
+    slot_gradient: "embedding_339.tmp_0@GRAD"
+    slot_gradient: "embedding_340.tmp_0@GRAD"
+    slot_gradient: "embedding_341.tmp_0@GRAD"
+    slot_gradient: "embedding_342.tmp_0@GRAD"
+    slot_gradient: "embedding_343.tmp_0@GRAD"
+    slot_gradient: "embedding_344.tmp_0@GRAD"
+    slot_gradient: "embedding_345.tmp_0@GRAD"
+    slot_gradient: "embedding_346.tmp_0@GRAD"
+    slot_gradient: "embedding_347.tmp_0@GRAD"
+    slot_gradient: "embedding_348.tmp_0@GRAD"
+    slot_gradient: "embedding_349.tmp_0@GRAD"
+    slot_gradient: "embedding_350.tmp_0@GRAD"
+    slot_gradient: "embedding_351.tmp_0@GRAD"
+    slot_gradient: "embedding_352.tmp_0@GRAD"
+    slot_gradient: "embedding_353.tmp_0@GRAD"
+    slot_gradient: "embedding_354.tmp_0@GRAD"
+    slot_gradient: "embedding_355.tmp_0@GRAD"
+    slot_gradient: "embedding_356.tmp_0@GRAD"
+    slot_gradient: "embedding_357.tmp_0@GRAD"
+    slot_gradient: "embedding_358.tmp_0@GRAD"
+    slot_gradient: "embedding_359.tmp_0@GRAD"
+    slot_gradient: "embedding_360.tmp_0@GRAD"
+    slot_gradient: "embedding_361.tmp_0@GRAD"
+    slot_gradient: "embedding_362.tmp_0@GRAD"
+    slot_gradient: "embedding_363.tmp_0@GRAD"
+    slot_gradient: "embedding_364.tmp_0@GRAD"
+    slot_gradient: "embedding_365.tmp_0@GRAD"
+    slot_gradient: "embedding_366.tmp_0@GRAD"
+    slot_gradient: "embedding_367.tmp_0@GRAD"
+    slot_gradient: "embedding_368.tmp_0@GRAD"
+    slot_gradient: "embedding_369.tmp_0@GRAD"
+    slot_gradient: "embedding_370.tmp_0@GRAD"
+    slot_gradient: "embedding_371.tmp_0@GRAD"
+    slot_gradient: "embedding_372.tmp_0@GRAD"
+    slot_gradient: "embedding_373.tmp_0@GRAD"
+    slot_gradient: "embedding_374.tmp_0@GRAD"
+    slot_gradient: "embedding_375.tmp_0@GRAD"
+    slot_gradient: "embedding_376.tmp_0@GRAD"
+    slot_gradient: "embedding_377.tmp_0@GRAD"
+    slot_gradient: "embedding_378.tmp_0@GRAD"
+    slot_gradient: "embedding_379.tmp_0@GRAD"
+    slot_gradient: "embedding_380.tmp_0@GRAD"
+    slot_gradient: "embedding_381.tmp_0@GRAD"
+    slot_gradient: "embedding_382.tmp_0@GRAD"
+    slot_gradient: "embedding_383.tmp_0@GRAD"
+    slot_gradient: "embedding_384.tmp_0@GRAD"
+    slot_gradient: "embedding_385.tmp_0@GRAD"
+    slot_gradient: "embedding_386.tmp_0@GRAD"
+    slot_gradient: "embedding_387.tmp_0@GRAD"
+    slot_gradient: "embedding_388.tmp_0@GRAD"
+    slot_gradient: "embedding_389.tmp_0@GRAD"
+    slot_gradient: "embedding_390.tmp_0@GRAD"
+    slot_gradient: "embedding_391.tmp_0@GRAD"
+    slot_gradient: "embedding_392.tmp_0@GRAD"
+    slot_gradient: "embedding_393.tmp_0@GRAD"
+    slot_gradient: "embedding_394.tmp_0@GRAD"
+    slot_gradient: "embedding_395.tmp_0@GRAD"
+    slot_gradient: "embedding_396.tmp_0@GRAD"
+    slot_gradient: "embedding_397.tmp_0@GRAD"
+    slot_gradient: "embedding_398.tmp_0@GRAD"
+    slot_gradient: "embedding_399.tmp_0@GRAD"
+    slot_gradient: "embedding_400.tmp_0@GRAD"
+    slot_gradient: "embedding_401.tmp_0@GRAD"
+    slot_gradient: "embedding_402.tmp_0@GRAD"
+    slot_gradient: "embedding_403.tmp_0@GRAD"
+    slot_gradient: "embedding_404.tmp_0@GRAD"
+    slot_gradient: "embedding_405.tmp_0@GRAD"
+    slot_gradient: "embedding_406.tmp_0@GRAD"
+    slot_gradient: "embedding_407.tmp_0@GRAD"
+  }
+  skip_op: "lookup_table"
+  skip_op: "lookup_table_grad"
+}
+fs_client_param {
+  uri: "afs://xingtian.afs.baidu.com:9902"
+  user: "mlarch"
+  passwd: "Fv1M87"  # NOTE(review): plaintext credential committed to the repository -- rotate it and inject it at deploy time.
+  hadoop_bin: "$HADOOP_HOME/bin/hadoop"
+}
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/format_newcate_hotnews.awk b/feed/feed_deploy/news_jingpai/package/my_nets/format_newcate_hotnews.awk
new file mode 100755
index 0000000000000000000000000000000000000000..7820d4050110a1e1b59d739c126648d24681dd18
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/format_newcate_hotnews.awk
@@ -0,0 +1,21 @@
+#!/bin/awk -f
+# Expand a "<key> <show> <clk> ..." record into `show` rows: `clk` rows with 1/1, the rest with 1/0.
+{
+    if ($1 !~ /^([0-9a-zA-Z])+$/ || $2 !~ /^([0-9])+$/ || $3 !~ /^([0-9])+$/) {
+        next;
+    }
+    shows = $2; clicks = $3;
+    if (clicks > shows) {
+        clicks = shows;
+    }
+    misses = shows - clicks;
+    $2 = "1";
+    $3 = "1";
+    for (n = 0; n < clicks; n++) {
+        print $0;
+    }
+    $3 = "0";
+    for (n = 0; n < misses; n++) {
+        print $0;
+    }
+}
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/ins_weight.py b/feed/feed_deploy/news_jingpai/package/my_nets/ins_weight.py
new file mode 100755
index 0000000000000000000000000000000000000000..8b4d87c34300aaea048c07fd9e9c50aa70e3a07c
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/ins_weight.py
@@ -0,0 +1,122 @@
+#!/usr/bin/python
+import sys
+import re
+import math
+
+del_text_slot = True  # set to False when argv[1] == "0"; not read anywhere else in this script
+g_ratio = 1  # scales the boost (w - 1) that calc_ins_weight gives to label "0" instances; overridable via argv[2]
+w_ratio = 0.01  # multiplier inside the log term of calc_ins_weight; overridable via argv[3]
+slots_str = "6048 6145 6202 6201 6121 6119 6146 6120 6147 6122 6123 6118 6142 6143 6008 6148 6151 6127 6144 6150 6109 6003 6096 6149 6129 6203 6153 6152 6128 6106 6251 7082 7515 7080 6066 7507 6186 6007 7514 6054 6125 7506 10001 6006 6080 7023 6085 10000 6250 6110 6124 6090 6082 6067 7516 6101 6004 6191 6188 6070 6194 6247 6814 7512 10007 6058 6189 6059 7517 10005 7510 7024 7502 7503 6183 7511 6060 6806 7504 6185 6810 6248 10004 6815 6182 10068 6069 6073 6196 6816 7513 6071 6809 6072 6817 6190 7505 6813 6192 6807 6808 6195 6826 6184 6197 6068 6812 7107 6811 6823 6824 6819 6818 6821 6822 6820 6094 6083 6952 6099 6951 6949 6098 7075 6948 6157 6126 7077 6111 6087 6103 6107 6156 6005 6158 7122 6155 7058 6115 7079 7081 6833 6108 6840 6837 7147 7129 6097 6231 6957 7145 6956 7143 6130 7149 7142 6212 6827 7144 6089 6161 7055 6233 6105 7057 6237 6828 6850 6163 7124 6354 6162 7146 6830 7123 6160 6235 7056 6081 6841 6132 6954 6131 6236 6831 6845 6832 6953 6839 6950 7125 7054 6138 6166 6076 6851 6353 7076 7148 6858 6842 6860 7126 6829 6835 7078 6866 6869 6871 7052 6134 6855 6947 6862 6215 6852 7128 6092 6112 6213 6232 6863 6113 6165 6214 6216 6873 6865 6870 6077 6234 6861 6164 6217 7127 6218 6962 7053 7051 6961 6002 6738 6739 10105 7064 6751 6770 7100 6014 6765 6755 10021 10022 6010 10056 6011 6756 10055 6768 10024 6023 10003 6769 10002 6767 6759 10018 6024 6064 6012 6050 10042 6168 6253 10010 10020 6015 6018 10033 10041 10039 10031 10016 6764 7083 7152 7066 6171 7150 7085 6255 10044 10008 7102 6167 6240 6238 6095 10017 10046 6019 6031 6763 6256 6169 6254 10034 7108 7186 6257 10019 6757 10040 6025 7019 7086 10029 10011 7104 6261 6013 6766 10106 7105 7153 7089 6057 7134 7151 7045 7005 7008 7101 6035 7137 10023 6036 6172 7099 7087 6239 7185 6170 10006 6243 6350 7103 7090 7157 6259 7171 6875 7084 7154 6242 6260 7155 7017 7048 7156 6959 7047 10053 7135 6244 7136 10030 7063 6760 7016 7065 7179 6881 7018 6876 10081 10052 10054 10038 6886 10069 7004 10051 7007 7109 10057 6029 6888 
10009 6889 7021 10047 6245 6878 10067 6879 6884 7180 7182 10071 7002 6880 6890 6887 10061 6027 6877 6892 10060 6893 7050 10036 7049 10012 10025 7012 7183 10058 7181 10086 6891 6258 6894 6883 7046 6037 7106 10043 10048 10045 10087 6885 10013 10028 7187 10037 10035 10050 6895 7011 7170 7172 10026 10063 10095 10082 10084 6960 10092 10075 6038 7010 7015 10015 10027 10064 7184 10014 10059 7013 7020 10072 10066 10080 6896 10083 10090 6039 10049 7164 7165 10091 10099 6963 7166 10079 10103 7006 7009 7169 6034 7028 7029 7030 7034 7035 7036 7040 7041 7042 10032 6009 6241 7003 7014 7088 13326 13330 13331 13352 13353 6198"
+slot_whitelist = slots_str.split(" ")
+
+def calc_ins_weight(params, label):
+    """Return the training weight for one instance.
+
+    Args:
+        params: iterable of tuples built from "name:value" tokens; tuples
+            that do not have exactly two elements are ignored.
+        label: label as a string; for "0" the boost above 1 is scaled by g_ratio.
+
+    Returns:
+        1 unless "activity" / 10000 is in [0, 0.4) and "session_show_num"
+        is in [0, 20); in that case a log-scaled weight is returned.
+
+    Raises:
+        ValueError: if "session_show_num" or "activity" is not numeric.
+    """
+    s_show_num = 0
+    active = 0
+    for items in params:
+        if len(items) != 2:
+            continue
+        slot_name, slot_val = items
+        if slot_name == "session_show_num":
+            s_show_num = int(slot_val)
+        elif slot_name == "activity":
+            active = float(slot_val) / 10000.0
+    if not (0 <= active < 0.4 and 0 <= s_show_num < 20):
+        return 1
+    # In this range (active * 50 + 1) * (s_show_num + 1) < 420, so for w_ratio > 0 the log argument exceeds e.
+    w = math.log(w_ratio * (420 - (active * 50 + 1) * (s_show_num + 1)) + math.e)
+    # g_ratio rescales only the part of the weight above 1, and only for label "0".
+    return 1 + (w - 1) * g_ratio if label == "0" else w
+
+def filter_whitelist_slot(tmp_line):
+    """Keep the first three fields plus every "<x>:<slot>" token whose <slot> is in slot_whitelist."""
+    fields = tmp_line.split()
+    kept = ["%s %s %s" % (fields[0], fields[1], fields[2])]
+    for token in fields[3:]:
+        parts = token.split(':')
+        if len(parts) == 2 and parts[1] in slot_whitelist:
+            kept.append(token)
+    return " ".join(kept)
+
+def get_sample_type(line):
+    """Return 20 if `line` contains one of the listed "<sign>:6738" tokens, else -1."""
+    # Original note: vertical_type = 0/5/1/2/9/11/13/16/29/-1 (presumably one per token below).
+    # NOTE(review): `find(...) > 0` ignores a match at offset 0 -- confirm that is intended.
+    tokens = (
+        "7408512894065610:6738",
+        "8815887816424655:6738",
+        "7689987878537419:6738",
+        "7971462863009228:6738",
+        "9941787754311891:6738",
+        "10504737723255509:6738",
+        "11067687692199127:6738",
+        "11912112645614554:6738",
+        "15571287443748071:6738",
+        "7127025017546227:6738",
+    )
+    return 20 if any(line.find(token) > 0 for token in tokens) else -1
+
+def main():
+    """Re-emit each stdin record with a trailing "*<weight>" instance weight.
+
+    Records are tab-separated: column 0 is echoed back, column 2 holds
+    space-separated "name:value" params and column 3 is checked for 'VIDEO'.
+    Output is "<col0> *<weight>", or "<col0> $<sample_type> *<weight>" when
+    get_sample_type() tags a record containing 'NEWS'.  Records with fewer
+    than four columns are emitted with weight 1.
+    Records with fewer than three space-separated fields are skipped, as are
+    records (with four or more columns) whose column 3 contains 'VIDEO'.
+    """
+    for l in sys.stdin:
+        l = l.rstrip("\n")
+        items = l.split(" ")
+        if len(items) < 3:
+            # Too short to carry a label: drop the record.
+            continue
+        # The label is the third space-separated field of the whole record.
+        label = items[2]
+        lines = l.split("\t")
+        line = lines[0]
+        # The stream mixes all instance kinds; only records mentioning NEWS
+        # are given a sample type.
+        sample_type = get_sample_type(line) if 'NEWS' in l else -1
+        # NOTE: whitelist filtering (filter_whitelist_slot) is currently not applied.
+        weight = 1
+        if len(lines) >= 4:
+            if 'VIDEO' in lines[3]:
+                # Records marked VIDEO are not emitted at all.
+                continue
+            # str.split(" ") always yields at least one item, so the tuple
+            # list is never empty and calc_ins_weight always runs here.
+            m = [tuple(i.split(":")) for i in lines[2].split(" ")]
+            weight = calc_ins_weight(m, label)
+        if sample_type > 0:
+            out = "%s $%s *%s" % (line, sample_type, weight)
+        else:
+            out = "%s *%s" % (line, weight)
+        # write() instead of the print statement keeps this function valid on Python 2 and 3.
+        sys.stdout.write(out + "\n")
+        sys.stdout.flush()
+
+if __name__ == "__main__":
+    # Optional args: ["0" -> del_text_slot = False] [g_ratio] [w_ratio]
+    if sys.argv[1:2] == ["0"]:
+        del_text_slot = False
+    if len(sys.argv) > 2:
+        g_ratio = float(sys.argv[2])
+    if len(sys.argv) > 3:
+        w_ratio = float(sys.argv[3])
+    main()
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/jingpai_fleet_desc_new.prototxt b/feed/feed_deploy/news_jingpai/package/my_nets/jingpai_fleet_desc_new.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..baf86c34e42a544ebfee248fcd1126ae2715d762
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/jingpai_fleet_desc_new.prototxt
@@ -0,0 +1,1504 @@
+server_param {  # Server-side configuration: six downpour tables (id 0 sparse, ids 1-5 dense) followed by the service settings.
+  downpour_server_param {
+    downpour_table_param {  # Table 0: the only sparse table; trainer_param binds its slot_key / embedding_* entries to this id.
+      table_id: 0
+      table_class: "DownpourSparseTable"
+      shard_num: 1950
+      accessor {
+        accessor_class: "DownpourCtrAccessor"
+        sparse_sgd_param {
+          learning_rate: 0.05
+          initial_g2sum: 3.0
+          initial_range: 0.0001
+          weight_bounds: -10.0  # repeated field; NOTE(review): presumably the lower bound of a (lower, upper) pair - confirm
+          weight_bounds: 10.0  # NOTE(review): presumably the upper bound - confirm
+        }
+        fea_dim: 11
+        embedx_dim: 8
+        embedx_threshold: 10
+        downpour_accessor_param {
+          nonclk_coeff: 0.1
+          click_coeff: 1
+          base_threshold: 1.5
+          delta_threshold: 0.25
+          delta_keep_days: 16
+          delete_after_unseen_days: 30
+          show_click_decay_rate: 0.98
+          delete_threshold: 0.8
+        }
+        table_accessor_save_param {  # save pipeline registered for param: 1
+          param: 1
+          converter: "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
+          deconverter:  "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
+        }   
+        table_accessor_save_param {  # param: 2 uses the same converter / deconverter commands as param: 1
+          param: 2
+          converter: "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
+          deconverter:  "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
+        }
+      }
+      type: PS_SPARSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {  # Table 1: dense table; trainer_param maps fc_0..fc_7 weights/biases and their @GRAD variables to this id.
+      table_id: 1
+      table_class: "DownpourDenseTable"
+      accessor {
+        accessor_class: "DownpourDenseValueAccessor"
+        dense_sgd_param {
+          name: "adam"  # NOTE(review): presumably selects the adam block below; the naive block is also populated - confirm it is unused
+          adam {
+            learning_rate: 5e-06
+            avg_decay_rate: 0.999993
+            ada_decay_rate: 0.9999
+            ada_epsilon: 1e-08
+            mom_decay_rate: 0.99
+          }
+          naive {
+            learning_rate: 0.0002
+          }
+        }
+        fea_dim: 2571127
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {  # Table 2: dense double table; trainer_param maps the bn6048.batch_size/batch_sum/batch_square_sum variables to this id.
+      table_id: 2
+      table_class: "DownpourDenseDoubleTable"
+      accessor {
+        accessor_class: "DownpourDenseValueDoubleAccessor"
+        dense_sgd_param {
+          name: "summarydouble"
+          summary {
+            summary_decay_rate: 0.999999
+          }
+        }
+        fea_dim: 13464  # 13464 = 3 * 4488; NOTE(review): presumably three bn6048 summary vectors of 4488 values each - confirm
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {  # Table 3: same class and optimizer values as table 1; only fea_dim differs.
+      table_id: 3
+      table_class: "DownpourDenseTable"
+      accessor {
+        accessor_class: "DownpourDenseValueAccessor"
+        dense_sgd_param {
+          name: "adam"
+          adam {
+            learning_rate: 5e-06
+            avg_decay_rate: 0.999993
+            ada_decay_rate: 0.9999
+            ada_epsilon: 1e-08
+            mom_decay_rate: 0.99
+          }
+          naive {
+            learning_rate: 0.0002
+          }
+        }
+        fea_dim: 834238
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {  # Table 4: same class and summary settings as table 2; only fea_dim differs.
+      table_id: 4
+      table_class: "DownpourDenseDoubleTable"
+      accessor {
+        accessor_class: "DownpourDenseValueDoubleAccessor"
+        dense_sgd_param {
+          name: "summarydouble"
+          summary {
+            summary_decay_rate: 0.999999
+          }
+        }
+        fea_dim: 3267  # 3267 = 3 * 1089; NOTE(review): presumably three bn6048 summary vectors of 1089 values each - confirm
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {  # Table 5: same class and optimizer values as tables 1 and 3; trainer_param maps only fc_0..fc_5 to this id.
+      table_id: 5
+      table_class: "DownpourDenseTable"
+      accessor {
+        accessor_class: "DownpourDenseValueAccessor"
+        dense_sgd_param {
+          name: "adam"
+          adam {
+            learning_rate: 5e-06
+            avg_decay_rate: 0.999993
+            ada_decay_rate: 0.9999
+            ada_epsilon: 1e-08
+            mom_decay_rate: 0.99
+          }
+          naive {
+            learning_rate: 0.0002
+          }
+        }
+        fea_dim: 2072615
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    service_param {  # Server, client and service implementation classes plus port and thread settings.
+      server_class: "DownpourBrpcPsServer"
+      client_class: "DownpourBrpcPsClient"
+      service_class: "DownpourPsService"
+      start_server_port: 0  # NOTE(review): 0 presumably means the port is chosen at runtime - confirm
+      server_thread_num: 12
+    }
+  }
+}
+trainer_param {
+  dense_table {
+    table_id: 1
+    dense_variable_name: "fc_0.w_0"
+    dense_variable_name: "fc_0.b_0"
+    dense_variable_name: "fc_1.w_0"
+    dense_variable_name: "fc_1.b_0"
+    dense_variable_name: "fc_2.w_0"
+    dense_variable_name: "fc_2.b_0"
+    dense_variable_name: "fc_3.w_0"
+    dense_variable_name: "fc_3.b_0"
+    dense_variable_name: "fc_4.w_0"
+    dense_variable_name: "fc_4.b_0"
+    dense_variable_name: "fc_5.w_0"
+    dense_variable_name: "fc_5.b_0"
+    dense_variable_name: "fc_6.w_0"
+    dense_variable_name: "fc_6.b_0"
+    dense_variable_name: "fc_7.w_0"
+    dense_variable_name: "fc_7.b_0"
+    dense_gradient_variable_name: "fc_0.w_0@GRAD"
+    dense_gradient_variable_name: "fc_0.b_0@GRAD"
+    dense_gradient_variable_name: "fc_1.w_0@GRAD"
+    dense_gradient_variable_name: "fc_1.b_0@GRAD"
+    dense_gradient_variable_name: "fc_2.w_0@GRAD"
+    dense_gradient_variable_name: "fc_2.b_0@GRAD"
+    dense_gradient_variable_name: "fc_3.w_0@GRAD"
+    dense_gradient_variable_name: "fc_3.b_0@GRAD"
+    dense_gradient_variable_name: "fc_4.w_0@GRAD"
+    dense_gradient_variable_name: "fc_4.b_0@GRAD"
+    dense_gradient_variable_name: "fc_5.w_0@GRAD"
+    dense_gradient_variable_name: "fc_5.b_0@GRAD"
+    dense_gradient_variable_name: "fc_6.w_0@GRAD"
+    dense_gradient_variable_name: "fc_6.b_0@GRAD"
+    dense_gradient_variable_name: "fc_7.w_0@GRAD"
+    dense_gradient_variable_name: "fc_7.b_0@GRAD"
+  }
+  dense_table {
+    table_id: 2
+    dense_variable_name: "bn6048.batch_size"
+    dense_variable_name: "bn6048.batch_sum"
+    dense_variable_name: "bn6048.batch_square_sum"
+    dense_gradient_variable_name: "bn6048.batch_size@GRAD"
+    dense_gradient_variable_name: "bn6048.batch_sum@GRAD"
+    dense_gradient_variable_name: "bn6048.batch_square_sum@GRAD"
+  }
+  dense_table {
+    table_id: 3
+    dense_variable_name: "fc_0.w_0"
+    dense_variable_name: "fc_0.b_0"
+    dense_variable_name: "fc_1.w_0"
+    dense_variable_name: "fc_1.b_0"
+    dense_variable_name: "fc_2.w_0"
+    dense_variable_name: "fc_2.b_0"
+    dense_variable_name: "fc_3.w_0"
+    dense_variable_name: "fc_3.b_0"
+    dense_variable_name: "fc_4.w_0"
+    dense_variable_name: "fc_4.b_0"
+    dense_variable_name: "fc_5.w_0"
+    dense_variable_name: "fc_5.b_0"
+    dense_variable_name: "fc_6.w_0"
+    dense_variable_name: "fc_6.b_0"
+    dense_variable_name: "fc_7.w_0"
+    dense_variable_name: "fc_7.b_0"
+    dense_gradient_variable_name: "fc_0.w_0@GRAD"
+    dense_gradient_variable_name: "fc_0.b_0@GRAD"
+    dense_gradient_variable_name: "fc_1.w_0@GRAD"
+    dense_gradient_variable_name: "fc_1.b_0@GRAD"
+    dense_gradient_variable_name: "fc_2.w_0@GRAD"
+    dense_gradient_variable_name: "fc_2.b_0@GRAD"
+    dense_gradient_variable_name: "fc_3.w_0@GRAD"
+    dense_gradient_variable_name: "fc_3.b_0@GRAD"
+    dense_gradient_variable_name: "fc_4.w_0@GRAD"
+    dense_gradient_variable_name: "fc_4.b_0@GRAD"
+    dense_gradient_variable_name: "fc_5.w_0@GRAD"
+    dense_gradient_variable_name: "fc_5.b_0@GRAD"
+    dense_gradient_variable_name: "fc_6.w_0@GRAD"
+    dense_gradient_variable_name: "fc_6.b_0@GRAD"
+    dense_gradient_variable_name: "fc_7.w_0@GRAD"
+    dense_gradient_variable_name: "fc_7.b_0@GRAD"
+  }
+  dense_table {
+    table_id: 4
+    dense_variable_name: "bn6048.batch_size"
+    dense_variable_name: "bn6048.batch_sum"
+    dense_variable_name: "bn6048.batch_square_sum"
+    dense_gradient_variable_name: "bn6048.batch_size@GRAD"
+    dense_gradient_variable_name: "bn6048.batch_sum@GRAD"
+    dense_gradient_variable_name: "bn6048.batch_square_sum@GRAD"
+  }
+  dense_table {
+    table_id: 5
+    dense_variable_name: "fc_0.w_0"
+    dense_variable_name: "fc_0.b_0"
+    dense_variable_name: "fc_1.w_0"
+    dense_variable_name: "fc_1.b_0"
+    dense_variable_name: "fc_2.w_0"
+    dense_variable_name: "fc_2.b_0"
+    dense_variable_name: "fc_3.w_0"
+    dense_variable_name: "fc_3.b_0"
+    dense_variable_name: "fc_4.w_0"
+    dense_variable_name: "fc_4.b_0"
+    dense_variable_name: "fc_5.w_0"
+    dense_variable_name: "fc_5.b_0"
+    dense_gradient_variable_name: "fc_0.w_0@GRAD"
+    dense_gradient_variable_name: "fc_0.b_0@GRAD"
+    dense_gradient_variable_name: "fc_1.w_0@GRAD"
+    dense_gradient_variable_name: "fc_1.b_0@GRAD"
+    dense_gradient_variable_name: "fc_2.w_0@GRAD"
+    dense_gradient_variable_name: "fc_2.b_0@GRAD"
+    dense_gradient_variable_name: "fc_3.w_0@GRAD"
+    dense_gradient_variable_name: "fc_3.b_0@GRAD"
+    dense_gradient_variable_name: "fc_4.w_0@GRAD"
+    dense_gradient_variable_name: "fc_4.b_0@GRAD"
+    dense_gradient_variable_name: "fc_5.w_0@GRAD"
+    dense_gradient_variable_name: "fc_5.b_0@GRAD"
+  }
+  sparse_table {
+    table_id: 0
+    slot_key: "6048"
+    slot_key: "6002"
+    slot_key: "6145"
+    slot_key: "6202"
+    slot_key: "6201"
+    slot_key: "6121"
+    slot_key: "6738"
+    slot_key: "6119"
+    slot_key: "6146"
+    slot_key: "6120"
+    slot_key: "6147"
+    slot_key: "6122"
+    slot_key: "6123"
+    slot_key: "6118"
+    slot_key: "6142"
+    slot_key: "6143"
+    slot_key: "6008"
+    slot_key: "6148"
+    slot_key: "6151"
+    slot_key: "6127"
+    slot_key: "6144"
+    slot_key: "6094"
+    slot_key: "6083"
+    slot_key: "6952"
+    slot_key: "6739"
+    slot_key: "6150"
+    slot_key: "6109"
+    slot_key: "6003"
+    slot_key: "6099"
+    slot_key: "6149"
+    slot_key: "6129"
+    slot_key: "6203"
+    slot_key: "6153"
+    slot_key: "6152"
+    slot_key: "6128"
+    slot_key: "6106"
+    slot_key: "6251"
+    slot_key: "7082"
+    slot_key: "7515"
+    slot_key: "6951"
+    slot_key: "6949"
+    slot_key: "7080"
+    slot_key: "6066"
+    slot_key: "7507"
+    slot_key: "6186"
+    slot_key: "6007"
+    slot_key: "7514"
+    slot_key: "6125"
+    slot_key: "7506"
+    slot_key: "10001"
+    slot_key: "6006"
+    slot_key: "7023"
+    slot_key: "6085"
+    slot_key: "10000"
+    slot_key: "6098"
+    slot_key: "6250"
+    slot_key: "6110"
+    slot_key: "6124"
+    slot_key: "6090"
+    slot_key: "6082"
+    slot_key: "6067"
+    slot_key: "6101"
+    slot_key: "6004"
+    slot_key: "6191"
+    slot_key: "7075"
+    slot_key: "6948"
+    slot_key: "6157"
+    slot_key: "6126"
+    slot_key: "6188"
+    slot_key: "7077"
+    slot_key: "6070"
+    slot_key: "6111"
+    slot_key: "6087"
+    slot_key: "6103"
+    slot_key: "6107"
+    slot_key: "6194"
+    slot_key: "6156"
+    slot_key: "6005"
+    slot_key: "6247"
+    slot_key: "6814"
+    slot_key: "6158"
+    slot_key: "7122"
+    slot_key: "6058"
+    slot_key: "6189"
+    slot_key: "7058"
+    slot_key: "6059"
+    slot_key: "6115"
+    slot_key: "7079"
+    slot_key: "7081"
+    slot_key: "6833"
+    slot_key: "7024"
+    slot_key: "6108"
+    slot_key: "13342"
+    slot_key: "13345"
+    slot_key: "13412"
+    slot_key: "13343"
+    slot_key: "13350"
+    slot_key: "13346"
+    slot_key: "13409"
+    slot_key: "6009"
+    slot_key: "6011"
+    slot_key: "6012"
+    slot_key: "6013"
+    slot_key: "6014"
+    slot_key: "6015"
+    slot_key: "6019"
+    slot_key: "6023"
+    slot_key: "6024"
+    slot_key: "6027"
+    slot_key: "6029"
+    slot_key: "6031"
+    slot_key: "6050"
+    slot_key: "6060"
+    slot_key: "6068"
+    slot_key: "6069"
+    slot_key: "6089"
+    slot_key: "6095"
+    slot_key: "6105"
+    slot_key: "6112"
+    slot_key: "6130"
+    slot_key: "6131"
+    slot_key: "6132"
+    slot_key: "6134"
+    slot_key: "6161"
+    slot_key: "6162"
+    slot_key: "6163"
+    slot_key: "6166"
+    slot_key: "6182"
+    slot_key: "6183"
+    slot_key: "6185"
+    slot_key: "6190"
+    slot_key: "6212"
+    slot_key: "6213"
+    slot_key: "6231"
+    slot_key: "6233"
+    slot_key: "6234"
+    slot_key: "6236"
+    slot_key: "6238"
+    slot_key: "6239"
+    slot_key: "6240"
+    slot_key: "6241"
+    slot_key: "6242"
+    slot_key: "6243"
+    slot_key: "6244"
+    slot_key: "6245"
+    slot_key: "6354"
+    slot_key: "7002"
+    slot_key: "7005"
+    slot_key: "7008"
+    slot_key: "7010"
+    slot_key: "7012"
+    slot_key: "7013"
+    slot_key: "7015"
+    slot_key: "7016"
+    slot_key: "7017"
+    slot_key: "7018"
+    slot_key: "7019"
+    slot_key: "7020"
+    slot_key: "7045"
+    slot_key: "7046"
+    slot_key: "7048"
+    slot_key: "7049"
+    slot_key: "7052"
+    slot_key: "7054"
+    slot_key: "7056"
+    slot_key: "7064"
+    slot_key: "7066"
+    slot_key: "7076"
+    slot_key: "7078"
+    slot_key: "7083"
+    slot_key: "7084"
+    slot_key: "7085"
+    slot_key: "7086"
+    slot_key: "7087"
+    slot_key: "7088"
+    slot_key: "7089"
+    slot_key: "7090"
+    slot_key: "7099"
+    slot_key: "7100"
+    slot_key: "7101"
+    slot_key: "7102"
+    slot_key: "7103"
+    slot_key: "7104"
+    slot_key: "7105"
+    slot_key: "7109"
+    slot_key: "7124"
+    slot_key: "7126"
+    slot_key: "7136"
+    slot_key: "7142"
+    slot_key: "7143"
+    slot_key: "7144"
+    slot_key: "7145"
+    slot_key: "7146"
+    slot_key: "7147"
+    slot_key: "7148"
+    slot_key: "7150"
+    slot_key: "7151"
+    slot_key: "7152"
+    slot_key: "7153"
+    slot_key: "7154"
+    slot_key: "7155"
+    slot_key: "7156"
+    slot_key: "7157"
+    slot_key: "7047"
+    slot_key: "7050"
+    slot_key: "6253"
+    slot_key: "6254"
+    slot_key: "6255"
+    slot_key: "6256"
+    slot_key: "6257"
+    slot_key: "6259"
+    slot_key: "6260"
+    slot_key: "6261"
+    slot_key: "7170"
+    slot_key: "7185"
+    slot_key: "7186"
+    slot_key: "6751"
+    slot_key: "6755"
+    slot_key: "6757"
+    slot_key: "6759"
+    slot_key: "6760"
+    slot_key: "6763"
+    slot_key: "6764"
+    slot_key: "6765"
+    slot_key: "6766"
+    slot_key: "6767"
+    slot_key: "6768"
+    slot_key: "6769"
+    slot_key: "6770"
+    slot_key: "7502"
+    slot_key: "7503"
+    slot_key: "7504"
+    slot_key: "7505"
+    slot_key: "7510"
+    slot_key: "7511"
+    slot_key: "7512"
+    slot_key: "7513"
+    slot_key: "6806"
+    slot_key: "6807"
+    slot_key: "6808"
+    slot_key: "6809"
+    slot_key: "6810"
+    slot_key: "6811"
+    slot_key: "6812"
+    slot_key: "6813"
+    slot_key: "6815"
+    slot_key: "6816"
+    slot_key: "6817"
+    slot_key: "6819"
+    slot_key: "6823"
+    slot_key: "6828"
+    slot_key: "6831"
+    slot_key: "6840"
+    slot_key: "6845"
+    slot_key: "6875"
+    slot_key: "6879"
+    slot_key: "6881"
+    slot_key: "6888"
+    slot_key: "6889"
+    slot_key: "6947"
+    slot_key: "6950"
+    slot_key: "6956"
+    slot_key: "6957"
+    slot_key: "6959"
+    slot_key: "10006"
+    slot_key: "10008"
+    slot_key: "10009"
+    slot_key: "10010"
+    slot_key: "10011"
+    slot_key: "10016"
+    slot_key: "10017"
+    slot_key: "10018"
+    slot_key: "10019"
+    slot_key: "10020"
+    slot_key: "10021"
+    slot_key: "10022"
+    slot_key: "10023"
+    slot_key: "10024"
+    slot_key: "10029"
+    slot_key: "10030"
+    slot_key: "10031"
+    slot_key: "10032"
+    slot_key: "10033"
+    slot_key: "10034"
+    slot_key: "10035"
+    slot_key: "10036"
+    slot_key: "10037"
+    slot_key: "10038"
+    slot_key: "10039"
+    slot_key: "10040"
+    slot_key: "10041"
+    slot_key: "10042"
+    slot_key: "10044"
+    slot_key: "10045"
+    slot_key: "10046"
+    slot_key: "10051"
+    slot_key: "10052"
+    slot_key: "10053"
+    slot_key: "10054"
+    slot_key: "10055"
+    slot_key: "10056"
+    slot_key: "10057"
+    slot_key: "10060"
+    slot_key: "10066"
+    slot_key: "10069"
+    slot_key: "6820"
+    slot_key: "6821"
+    slot_key: "6822"
+    slot_key: "13333"
+    slot_key: "13334"
+    slot_key: "13335"
+    slot_key: "13336"
+    slot_key: "13337"
+    slot_key: "13338"
+    slot_key: "13339"
+    slot_key: "13340"
+    slot_key: "13341"
+    slot_key: "13351"
+    slot_key: "13352"
+    slot_key: "13353"
+    slot_key: "13359"
+    slot_key: "13361"
+    slot_key: "13362"
+    slot_key: "13363"
+    slot_key: "13366"
+    slot_key: "13367"
+    slot_key: "13368"
+    slot_key: "13369"
+    slot_key: "13370"
+    slot_key: "13371"
+    slot_key: "13375"
+    slot_key: "13376"
+    slot_key: "5700"
+    slot_key: "5702"
+    slot_key: "13400"
+    slot_key: "13401"
+    slot_key: "13402"
+    slot_key: "13403"
+    slot_key: "13404"
+    slot_key: "13406"
+    slot_key: "13407"
+    slot_key: "13408"
+    slot_key: "13410"
+    slot_key: "13417"
+    slot_key: "13418"
+    slot_key: "13419"
+    slot_key: "13420"
+    slot_key: "13422"
+    slot_key: "13425"
+    slot_key: "13427"
+    slot_key: "13428"
+    slot_key: "13429"
+    slot_key: "13430"
+    slot_key: "13431"
+    slot_key: "13433"
+    slot_key: "13434"
+    slot_key: "13436"
+    slot_key: "13437"
+    slot_key: "13326"
+    slot_key: "13330"
+    slot_key: "13331"
+    slot_key: "5717"
+    slot_key: "13442"
+    slot_key: "13451"
+    slot_key: "13452"
+    slot_key: "13455"
+    slot_key: "13456"
+    slot_key: "13457"
+    slot_key: "13458"
+    slot_key: "13459"
+    slot_key: "13460"
+    slot_key: "13461"
+    slot_key: "13462"
+    slot_key: "13463"
+    slot_key: "13464"
+    slot_key: "13465"
+    slot_key: "13466"
+    slot_key: "13467"
+    slot_key: "13468"
+    slot_key: "1104"
+    slot_key: "1106"
+    slot_key: "1107"
+    slot_key: "1108"
+    slot_key: "1109"
+    slot_key: "1110"
+    slot_key: "1111"
+    slot_key: "1112"
+    slot_key: "1113"
+    slot_key: "1114"
+    slot_key: "1115"
+    slot_key: "1116"
+    slot_key: "1117"
+    slot_key: "1119"
+    slot_key: "1120"
+    slot_key: "1121"
+    slot_key: "1122"
+    slot_key: "1123"
+    slot_key: "1124"
+    slot_key: "1125"
+    slot_key: "1126"
+    slot_key: "1127"
+    slot_key: "1128"
+    slot_key: "1129"
+    slot_key: "13812"
+    slot_key: "13813"
+    slot_key: "6740"
+    slot_key: "1490"
+    slot_key: "1491"
+    slot_value: "embedding_0.tmp_0"
+    slot_value: "embedding_1.tmp_0"
+    slot_value: "embedding_2.tmp_0"
+    slot_value: "embedding_3.tmp_0"
+    slot_value: "embedding_4.tmp_0"
+    slot_value: "embedding_5.tmp_0"
+    slot_value: "embedding_6.tmp_0"
+    slot_value: "embedding_7.tmp_0"
+    slot_value: "embedding_8.tmp_0"
+    slot_value: "embedding_9.tmp_0"
+    slot_value: "embedding_10.tmp_0"
+    slot_value: "embedding_11.tmp_0"
+    slot_value: "embedding_12.tmp_0"
+    slot_value: "embedding_13.tmp_0"
+    slot_value: "embedding_14.tmp_0"
+    slot_value: "embedding_15.tmp_0"
+    slot_value: "embedding_16.tmp_0"
+    slot_value: "embedding_17.tmp_0"
+    slot_value: "embedding_18.tmp_0"
+    slot_value: "embedding_19.tmp_0"
+    slot_value: "embedding_20.tmp_0"
+    slot_value: "embedding_21.tmp_0"
+    slot_value: "embedding_22.tmp_0"
+    slot_value: "embedding_23.tmp_0"
+    slot_value: "embedding_24.tmp_0"
+    slot_value: "embedding_25.tmp_0"
+    slot_value: "embedding_26.tmp_0"
+    slot_value: "embedding_27.tmp_0"
+    slot_value: "embedding_28.tmp_0"
+    slot_value: "embedding_29.tmp_0"
+    slot_value: "embedding_30.tmp_0"
+    slot_value: "embedding_31.tmp_0"
+    slot_value: "embedding_32.tmp_0"
+    slot_value: "embedding_33.tmp_0"
+    slot_value: "embedding_34.tmp_0"
+    slot_value: "embedding_35.tmp_0"
+    slot_value: "embedding_36.tmp_0"
+    slot_value: "embedding_37.tmp_0"
+    slot_value: "embedding_38.tmp_0"
+    slot_value: "embedding_39.tmp_0"
+    slot_value: "embedding_40.tmp_0"
+    slot_value: "embedding_41.tmp_0"
+    slot_value: "embedding_42.tmp_0"
+    slot_value: "embedding_43.tmp_0"
+    slot_value: "embedding_44.tmp_0"
+    slot_value: "embedding_45.tmp_0"
+    slot_value: "embedding_46.tmp_0"
+    slot_value: "embedding_47.tmp_0"
+    slot_value: "embedding_48.tmp_0"
+    slot_value: "embedding_49.tmp_0"
+    slot_value: "embedding_50.tmp_0"
+    slot_value: "embedding_51.tmp_0"
+    slot_value: "embedding_52.tmp_0"
+    slot_value: "embedding_53.tmp_0"
+    slot_value: "embedding_54.tmp_0"
+    slot_value: "embedding_55.tmp_0"
+    slot_value: "embedding_56.tmp_0"
+    slot_value: "embedding_57.tmp_0"
+    slot_value: "embedding_58.tmp_0"
+    slot_value: "embedding_59.tmp_0"
+    slot_value: "embedding_60.tmp_0"
+    slot_value: "embedding_61.tmp_0"
+    slot_value: "embedding_62.tmp_0"
+    slot_value: "embedding_63.tmp_0"
+    slot_value: "embedding_64.tmp_0"
+    slot_value: "embedding_65.tmp_0"
+    slot_value: "embedding_66.tmp_0"
+    slot_value: "embedding_67.tmp_0"
+    slot_value: "embedding_68.tmp_0"
+    slot_value: "embedding_69.tmp_0"
+    slot_value: "embedding_70.tmp_0"
+    slot_value: "embedding_71.tmp_0"
+    slot_value: "embedding_72.tmp_0"
+    slot_value: "embedding_73.tmp_0"
+    slot_value: "embedding_74.tmp_0"
+    slot_value: "embedding_75.tmp_0"
+    slot_value: "embedding_76.tmp_0"
+    slot_value: "embedding_77.tmp_0"
+    slot_value: "embedding_78.tmp_0"
+    slot_value: "embedding_79.tmp_0"
+    slot_value: "embedding_80.tmp_0"
+    slot_value: "embedding_81.tmp_0"
+    slot_value: "embedding_82.tmp_0"
+    slot_value: "embedding_83.tmp_0"
+    slot_value: "embedding_84.tmp_0"
+    slot_value: "embedding_85.tmp_0"
+    slot_value: "embedding_86.tmp_0"
+    slot_value: "embedding_87.tmp_0"
+    slot_value: "embedding_88.tmp_0"
+    slot_value: "embedding_89.tmp_0"
+    slot_value: "embedding_90.tmp_0"
+    slot_value: "embedding_91.tmp_0"
+    slot_value: "embedding_92.tmp_0"
+    slot_value: "embedding_93.tmp_0"
+    slot_value: "embedding_94.tmp_0"
+    slot_value: "embedding_95.tmp_0"
+    slot_value: "embedding_96.tmp_0"
+    slot_value: "embedding_97.tmp_0"
+    slot_value: "embedding_98.tmp_0"
+    slot_value: "embedding_99.tmp_0"
+    slot_value: "embedding_100.tmp_0"
+    slot_value: "embedding_101.tmp_0"
+    slot_value: "embedding_102.tmp_0"
+    slot_value: "embedding_103.tmp_0"
+    slot_value: "embedding_104.tmp_0"
+    slot_value: "embedding_105.tmp_0"
+    slot_value: "embedding_106.tmp_0"
+    slot_value: "embedding_107.tmp_0"
+    slot_value: "embedding_108.tmp_0"
+    slot_value: "embedding_109.tmp_0"
+    slot_value: "embedding_110.tmp_0"
+    slot_value: "embedding_111.tmp_0"
+    slot_value: "embedding_112.tmp_0"
+    slot_value: "embedding_113.tmp_0"
+    slot_value: "embedding_114.tmp_0"
+    slot_value: "embedding_115.tmp_0"
+    slot_value: "embedding_116.tmp_0"
+    slot_value: "embedding_117.tmp_0"
+    slot_value: "embedding_118.tmp_0"
+    slot_value: "embedding_119.tmp_0"
+    slot_value: "embedding_120.tmp_0"
+    slot_value: "embedding_121.tmp_0"
+    slot_value: "embedding_122.tmp_0"
+    slot_value: "embedding_123.tmp_0"
+    slot_value: "embedding_124.tmp_0"
+    slot_value: "embedding_125.tmp_0"
+    slot_value: "embedding_126.tmp_0"
+    slot_value: "embedding_127.tmp_0"
+    slot_value: "embedding_128.tmp_0"
+    slot_value: "embedding_129.tmp_0"
+    slot_value: "embedding_130.tmp_0"
+    slot_value: "embedding_131.tmp_0"
+    slot_value: "embedding_132.tmp_0"
+    slot_value: "embedding_133.tmp_0"
+    slot_value: "embedding_134.tmp_0"
+    slot_value: "embedding_135.tmp_0"
+    slot_value: "embedding_136.tmp_0"
+    slot_value: "embedding_137.tmp_0"
+    slot_value: "embedding_138.tmp_0"
+    slot_value: "embedding_139.tmp_0"
+    slot_value: "embedding_140.tmp_0"
+    slot_value: "embedding_141.tmp_0"
+    slot_value: "embedding_142.tmp_0"
+    slot_value: "embedding_143.tmp_0"
+    slot_value: "embedding_144.tmp_0"
+    slot_value: "embedding_145.tmp_0"
+    slot_value: "embedding_146.tmp_0"
+    slot_value: "embedding_147.tmp_0"
+    slot_value: "embedding_148.tmp_0"
+    slot_value: "embedding_149.tmp_0"
+    slot_value: "embedding_150.tmp_0"
+    slot_value: "embedding_151.tmp_0"
+    slot_value: "embedding_152.tmp_0"
+    slot_value: "embedding_153.tmp_0"
+    slot_value: "embedding_154.tmp_0"
+    slot_value: "embedding_155.tmp_0"
+    slot_value: "embedding_156.tmp_0"
+    slot_value: "embedding_157.tmp_0"
+    slot_value: "embedding_158.tmp_0"
+    slot_value: "embedding_159.tmp_0"
+    slot_value: "embedding_160.tmp_0"
+    slot_value: "embedding_161.tmp_0"
+    slot_value: "embedding_162.tmp_0"
+    slot_value: "embedding_163.tmp_0"
+    slot_value: "embedding_164.tmp_0"
+    slot_value: "embedding_165.tmp_0"
+    slot_value: "embedding_166.tmp_0"
+    slot_value: "embedding_167.tmp_0"
+    slot_value: "embedding_168.tmp_0"
+    slot_value: "embedding_169.tmp_0"
+    slot_value: "embedding_170.tmp_0"
+    slot_value: "embedding_171.tmp_0"
+    slot_value: "embedding_172.tmp_0"
+    slot_value: "embedding_173.tmp_0"
+    slot_value: "embedding_174.tmp_0"
+    slot_value: "embedding_175.tmp_0"
+    slot_value: "embedding_176.tmp_0"
+    slot_value: "embedding_177.tmp_0"
+    slot_value: "embedding_178.tmp_0"
+    slot_value: "embedding_179.tmp_0"
+    slot_value: "embedding_180.tmp_0"
+    slot_value: "embedding_181.tmp_0"
+    slot_value: "embedding_182.tmp_0"
+    slot_value: "embedding_183.tmp_0"
+    slot_value: "embedding_184.tmp_0"
+    slot_value: "embedding_185.tmp_0"
+    slot_value: "embedding_186.tmp_0"
+    slot_value: "embedding_187.tmp_0"
+    slot_value: "embedding_188.tmp_0"
+    slot_value: "embedding_189.tmp_0"
+    slot_value: "embedding_190.tmp_0"
+    slot_value: "embedding_191.tmp_0"
+    slot_value: "embedding_192.tmp_0"
+    slot_value: "embedding_193.tmp_0"
+    slot_value: "embedding_194.tmp_0"
+    slot_value: "embedding_195.tmp_0"
+    slot_value: "embedding_196.tmp_0"
+    slot_value: "embedding_197.tmp_0"
+    slot_value: "embedding_198.tmp_0"
+    slot_value: "embedding_199.tmp_0"
+    slot_value: "embedding_200.tmp_0"
+    slot_value: "embedding_201.tmp_0"
+    slot_value: "embedding_202.tmp_0"
+    slot_value: "embedding_203.tmp_0"
+    slot_value: "embedding_204.tmp_0"
+    slot_value: "embedding_205.tmp_0"
+    slot_value: "embedding_206.tmp_0"
+    slot_value: "embedding_207.tmp_0"
+    slot_value: "embedding_208.tmp_0"
+    slot_value: "embedding_209.tmp_0"
+    slot_value: "embedding_210.tmp_0"
+    slot_value: "embedding_211.tmp_0"
+    slot_value: "embedding_212.tmp_0"
+    slot_value: "embedding_213.tmp_0"
+    slot_value: "embedding_214.tmp_0"
+    slot_value: "embedding_215.tmp_0"
+    slot_value: "embedding_216.tmp_0"
+    slot_value: "embedding_217.tmp_0"
+    slot_value: "embedding_218.tmp_0"
+    slot_value: "embedding_219.tmp_0"
+    slot_value: "embedding_220.tmp_0"
+    slot_value: "embedding_221.tmp_0"
+    slot_value: "embedding_222.tmp_0"
+    slot_value: "embedding_223.tmp_0"
+    slot_value: "embedding_224.tmp_0"
+    slot_value: "embedding_225.tmp_0"
+    slot_value: "embedding_226.tmp_0"
+    slot_value: "embedding_227.tmp_0"
+    slot_value: "embedding_228.tmp_0"
+    slot_value: "embedding_229.tmp_0"
+    slot_value: "embedding_230.tmp_0"
+    slot_value: "embedding_231.tmp_0"
+    slot_value: "embedding_232.tmp_0"
+    slot_value: "embedding_233.tmp_0"
+    slot_value: "embedding_234.tmp_0"
+    slot_value: "embedding_235.tmp_0"
+    slot_value: "embedding_236.tmp_0"
+    slot_value: "embedding_237.tmp_0"
+    slot_value: "embedding_238.tmp_0"
+    slot_value: "embedding_239.tmp_0"
+    slot_value: "embedding_240.tmp_0"
+    slot_value: "embedding_241.tmp_0"
+    slot_value: "embedding_242.tmp_0"
+    slot_value: "embedding_243.tmp_0"
+    slot_value: "embedding_244.tmp_0"
+    slot_value: "embedding_245.tmp_0"
+    slot_value: "embedding_246.tmp_0"
+    slot_value: "embedding_247.tmp_0"
+    slot_value: "embedding_248.tmp_0"
+    slot_value: "embedding_249.tmp_0"
+    slot_value: "embedding_250.tmp_0"
+    slot_value: "embedding_251.tmp_0"
+    slot_value: "embedding_252.tmp_0"
+    slot_value: "embedding_253.tmp_0"
+    slot_value: "embedding_254.tmp_0"
+    slot_value: "embedding_255.tmp_0"
+    slot_value: "embedding_256.tmp_0"
+    slot_value: "embedding_257.tmp_0"
+    slot_value: "embedding_258.tmp_0"
+    slot_value: "embedding_259.tmp_0"
+    slot_value: "embedding_260.tmp_0"
+    slot_value: "embedding_261.tmp_0"
+    slot_value: "embedding_262.tmp_0"
+    slot_value: "embedding_263.tmp_0"
+    slot_value: "embedding_264.tmp_0"
+    slot_value: "embedding_265.tmp_0"
+    slot_value: "embedding_266.tmp_0"
+    slot_value: "embedding_267.tmp_0"
+    slot_value: "embedding_268.tmp_0"
+    slot_value: "embedding_269.tmp_0"
+    slot_value: "embedding_270.tmp_0"
+    slot_value: "embedding_271.tmp_0"
+    slot_value: "embedding_272.tmp_0"
+    slot_value: "embedding_273.tmp_0"
+    slot_value: "embedding_274.tmp_0"
+    slot_value: "embedding_275.tmp_0"
+    slot_value: "embedding_276.tmp_0"
+    slot_value: "embedding_277.tmp_0"
+    slot_value: "embedding_278.tmp_0"
+    slot_value: "embedding_279.tmp_0"
+    slot_value: "embedding_280.tmp_0"
+    slot_value: "embedding_281.tmp_0"
+    slot_value: "embedding_282.tmp_0"
+    slot_value: "embedding_283.tmp_0"
+    slot_value: "embedding_284.tmp_0"
+    slot_value: "embedding_285.tmp_0"
+    slot_value: "embedding_286.tmp_0"
+    slot_value: "embedding_287.tmp_0"
+    slot_value: "embedding_288.tmp_0"
+    slot_value: "embedding_289.tmp_0"
+    slot_value: "embedding_290.tmp_0"
+    slot_value: "embedding_291.tmp_0"
+    slot_value: "embedding_292.tmp_0"
+    slot_value: "embedding_293.tmp_0"
+    slot_value: "embedding_294.tmp_0"
+    slot_value: "embedding_295.tmp_0"
+    slot_value: "embedding_296.tmp_0"
+    slot_value: "embedding_297.tmp_0"
+    slot_value: "embedding_298.tmp_0"
+    slot_value: "embedding_299.tmp_0"
+    slot_value: "embedding_300.tmp_0"
+    slot_value: "embedding_301.tmp_0"
+    slot_value: "embedding_302.tmp_0"
+    slot_value: "embedding_303.tmp_0"
+    slot_value: "embedding_304.tmp_0"
+    slot_value: "embedding_305.tmp_0"
+    slot_value: "embedding_306.tmp_0"
+    slot_value: "embedding_307.tmp_0"
+    slot_value: "embedding_308.tmp_0"
+    slot_value: "embedding_309.tmp_0"
+    slot_value: "embedding_310.tmp_0"
+    slot_value: "embedding_311.tmp_0"
+    slot_value: "embedding_312.tmp_0"
+    slot_value: "embedding_313.tmp_0"
+    slot_value: "embedding_314.tmp_0"
+    slot_value: "embedding_315.tmp_0"
+    slot_value: "embedding_316.tmp_0"
+    slot_value: "embedding_317.tmp_0"
+    slot_value: "embedding_318.tmp_0"
+    slot_value: "embedding_319.tmp_0"
+    slot_value: "embedding_320.tmp_0"
+    slot_value: "embedding_321.tmp_0"
+    slot_value: "embedding_322.tmp_0"
+    slot_value: "embedding_323.tmp_0"
+    slot_value: "embedding_324.tmp_0"
+    slot_value: "embedding_325.tmp_0"
+    slot_value: "embedding_326.tmp_0"
+    slot_value: "embedding_327.tmp_0"
+    slot_value: "embedding_328.tmp_0"
+    slot_value: "embedding_329.tmp_0"
+    slot_value: "embedding_330.tmp_0"
+    slot_value: "embedding_331.tmp_0"
+    slot_value: "embedding_332.tmp_0"
+    slot_value: "embedding_333.tmp_0"
+    slot_value: "embedding_334.tmp_0"
+    slot_value: "embedding_335.tmp_0"
+    slot_value: "embedding_336.tmp_0"
+    slot_value: "embedding_337.tmp_0"
+    slot_value: "embedding_338.tmp_0"
+    slot_value: "embedding_339.tmp_0"
+    slot_value: "embedding_340.tmp_0"
+    slot_value: "embedding_341.tmp_0"
+    slot_value: "embedding_342.tmp_0"
+    slot_value: "embedding_343.tmp_0"
+    slot_value: "embedding_344.tmp_0"
+    slot_value: "embedding_345.tmp_0"
+    slot_value: "embedding_346.tmp_0"
+    slot_value: "embedding_347.tmp_0"
+    slot_value: "embedding_348.tmp_0"
+    slot_value: "embedding_349.tmp_0"
+    slot_value: "embedding_350.tmp_0"
+    slot_value: "embedding_351.tmp_0"
+    slot_value: "embedding_352.tmp_0"
+    slot_value: "embedding_353.tmp_0"
+    slot_value: "embedding_354.tmp_0"
+    slot_value: "embedding_355.tmp_0"
+    slot_value: "embedding_356.tmp_0"
+    slot_value: "embedding_357.tmp_0"
+    slot_value: "embedding_358.tmp_0"
+    slot_value: "embedding_359.tmp_0"
+    slot_value: "embedding_360.tmp_0"
+    slot_value: "embedding_361.tmp_0"
+    slot_value: "embedding_362.tmp_0"
+    slot_value: "embedding_363.tmp_0"
+    slot_value: "embedding_364.tmp_0"
+    slot_value: "embedding_365.tmp_0"
+    slot_value: "embedding_366.tmp_0"
+    slot_value: "embedding_367.tmp_0"
+    slot_value: "embedding_368.tmp_0"
+    slot_value: "embedding_369.tmp_0"
+    slot_value: "embedding_370.tmp_0"
+    slot_value: "embedding_371.tmp_0"
+    slot_value: "embedding_372.tmp_0"
+    slot_value: "embedding_373.tmp_0"
+    slot_value: "embedding_374.tmp_0"
+    slot_value: "embedding_375.tmp_0"
+    slot_value: "embedding_376.tmp_0"
+    slot_value: "embedding_377.tmp_0"
+    slot_value: "embedding_378.tmp_0"
+    slot_value: "embedding_379.tmp_0"
+    slot_value: "embedding_380.tmp_0"
+    slot_value: "embedding_381.tmp_0"
+    slot_value: "embedding_382.tmp_0"
+    slot_value: "embedding_383.tmp_0"
+    slot_value: "embedding_384.tmp_0"
+    slot_value: "embedding_385.tmp_0"
+    slot_value: "embedding_386.tmp_0"
+    slot_value: "embedding_387.tmp_0"
+    slot_value: "embedding_388.tmp_0"
+    slot_value: "embedding_389.tmp_0"
+    slot_value: "embedding_390.tmp_0"
+    slot_value: "embedding_391.tmp_0"
+    slot_value: "embedding_392.tmp_0"
+    slot_value: "embedding_393.tmp_0"
+    slot_value: "embedding_394.tmp_0"
+    slot_value: "embedding_395.tmp_0"
+    slot_value: "embedding_396.tmp_0"
+    slot_value: "embedding_397.tmp_0"
+    slot_value: "embedding_398.tmp_0"
+    slot_value: "embedding_399.tmp_0"
+    slot_value: "embedding_400.tmp_0"
+    slot_value: "embedding_401.tmp_0"
+    slot_value: "embedding_402.tmp_0"
+    slot_value: "embedding_403.tmp_0"
+    slot_value: "embedding_404.tmp_0"
+    slot_value: "embedding_405.tmp_0"
+    slot_value: "embedding_406.tmp_0"
+    slot_value: "embedding_407.tmp_0"
+    slot_gradient: "embedding_0.tmp_0@GRAD"
+    slot_gradient: "embedding_1.tmp_0@GRAD"
+    slot_gradient: "embedding_2.tmp_0@GRAD"
+    slot_gradient: "embedding_3.tmp_0@GRAD"
+    slot_gradient: "embedding_4.tmp_0@GRAD"
+    slot_gradient: "embedding_5.tmp_0@GRAD"
+    slot_gradient: "embedding_6.tmp_0@GRAD"
+    slot_gradient: "embedding_7.tmp_0@GRAD"
+    slot_gradient: "embedding_8.tmp_0@GRAD"
+    slot_gradient: "embedding_9.tmp_0@GRAD"
+    slot_gradient: "embedding_10.tmp_0@GRAD"
+    slot_gradient: "embedding_11.tmp_0@GRAD"
+    slot_gradient: "embedding_12.tmp_0@GRAD"
+    slot_gradient: "embedding_13.tmp_0@GRAD"
+    slot_gradient: "embedding_14.tmp_0@GRAD"
+    slot_gradient: "embedding_15.tmp_0@GRAD"
+    slot_gradient: "embedding_16.tmp_0@GRAD"
+    slot_gradient: "embedding_17.tmp_0@GRAD"
+    slot_gradient: "embedding_18.tmp_0@GRAD"
+    slot_gradient: "embedding_19.tmp_0@GRAD"
+    slot_gradient: "embedding_20.tmp_0@GRAD"
+    slot_gradient: "embedding_21.tmp_0@GRAD"
+    slot_gradient: "embedding_22.tmp_0@GRAD"
+    slot_gradient: "embedding_23.tmp_0@GRAD"
+    slot_gradient: "embedding_24.tmp_0@GRAD"
+    slot_gradient: "embedding_25.tmp_0@GRAD"
+    slot_gradient: "embedding_26.tmp_0@GRAD"
+    slot_gradient: "embedding_27.tmp_0@GRAD"
+    slot_gradient: "embedding_28.tmp_0@GRAD"
+    slot_gradient: "embedding_29.tmp_0@GRAD"
+    slot_gradient: "embedding_30.tmp_0@GRAD"
+    slot_gradient: "embedding_31.tmp_0@GRAD"
+    slot_gradient: "embedding_32.tmp_0@GRAD"
+    slot_gradient: "embedding_33.tmp_0@GRAD"
+    slot_gradient: "embedding_34.tmp_0@GRAD"
+    slot_gradient: "embedding_35.tmp_0@GRAD"
+    slot_gradient: "embedding_36.tmp_0@GRAD"
+    slot_gradient: "embedding_37.tmp_0@GRAD"
+    slot_gradient: "embedding_38.tmp_0@GRAD"
+    slot_gradient: "embedding_39.tmp_0@GRAD"
+    slot_gradient: "embedding_40.tmp_0@GRAD"
+    slot_gradient: "embedding_41.tmp_0@GRAD"
+    slot_gradient: "embedding_42.tmp_0@GRAD"
+    slot_gradient: "embedding_43.tmp_0@GRAD"
+    slot_gradient: "embedding_44.tmp_0@GRAD"
+    slot_gradient: "embedding_45.tmp_0@GRAD"
+    slot_gradient: "embedding_46.tmp_0@GRAD"
+    slot_gradient: "embedding_47.tmp_0@GRAD"
+    slot_gradient: "embedding_48.tmp_0@GRAD"
+    slot_gradient: "embedding_49.tmp_0@GRAD"
+    slot_gradient: "embedding_50.tmp_0@GRAD"
+    slot_gradient: "embedding_51.tmp_0@GRAD"
+    slot_gradient: "embedding_52.tmp_0@GRAD"
+    slot_gradient: "embedding_53.tmp_0@GRAD"
+    slot_gradient: "embedding_54.tmp_0@GRAD"
+    slot_gradient: "embedding_55.tmp_0@GRAD"
+    slot_gradient: "embedding_56.tmp_0@GRAD"
+    slot_gradient: "embedding_57.tmp_0@GRAD"
+    slot_gradient: "embedding_58.tmp_0@GRAD"
+    slot_gradient: "embedding_59.tmp_0@GRAD"
+    slot_gradient: "embedding_60.tmp_0@GRAD"
+    slot_gradient: "embedding_61.tmp_0@GRAD"
+    slot_gradient: "embedding_62.tmp_0@GRAD"
+    slot_gradient: "embedding_63.tmp_0@GRAD"
+    slot_gradient: "embedding_64.tmp_0@GRAD"
+    slot_gradient: "embedding_65.tmp_0@GRAD"
+    slot_gradient: "embedding_66.tmp_0@GRAD"
+    slot_gradient: "embedding_67.tmp_0@GRAD"
+    slot_gradient: "embedding_68.tmp_0@GRAD"
+    slot_gradient: "embedding_69.tmp_0@GRAD"
+    slot_gradient: "embedding_70.tmp_0@GRAD"
+    slot_gradient: "embedding_71.tmp_0@GRAD"
+    slot_gradient: "embedding_72.tmp_0@GRAD"
+    slot_gradient: "embedding_73.tmp_0@GRAD"
+    slot_gradient: "embedding_74.tmp_0@GRAD"
+    slot_gradient: "embedding_75.tmp_0@GRAD"
+    slot_gradient: "embedding_76.tmp_0@GRAD"
+    slot_gradient: "embedding_77.tmp_0@GRAD"
+    slot_gradient: "embedding_78.tmp_0@GRAD"
+    slot_gradient: "embedding_79.tmp_0@GRAD"
+    slot_gradient: "embedding_80.tmp_0@GRAD"
+    slot_gradient: "embedding_81.tmp_0@GRAD"
+    slot_gradient: "embedding_82.tmp_0@GRAD"
+    slot_gradient: "embedding_83.tmp_0@GRAD"
+    slot_gradient: "embedding_84.tmp_0@GRAD"
+    slot_gradient: "embedding_85.tmp_0@GRAD"
+    slot_gradient: "embedding_86.tmp_0@GRAD"
+    slot_gradient: "embedding_87.tmp_0@GRAD"
+    slot_gradient: "embedding_88.tmp_0@GRAD"
+    slot_gradient: "embedding_89.tmp_0@GRAD"
+    slot_gradient: "embedding_90.tmp_0@GRAD"
+    slot_gradient: "embedding_91.tmp_0@GRAD"
+    slot_gradient: "embedding_92.tmp_0@GRAD"
+    slot_gradient: "embedding_93.tmp_0@GRAD"
+    slot_gradient: "embedding_94.tmp_0@GRAD"
+    slot_gradient: "embedding_95.tmp_0@GRAD"
+    slot_gradient: "embedding_96.tmp_0@GRAD"
+    slot_gradient: "embedding_97.tmp_0@GRAD"
+    slot_gradient: "embedding_98.tmp_0@GRAD"
+    slot_gradient: "embedding_99.tmp_0@GRAD"
+    slot_gradient: "embedding_100.tmp_0@GRAD"
+    slot_gradient: "embedding_101.tmp_0@GRAD"
+    slot_gradient: "embedding_102.tmp_0@GRAD"
+    slot_gradient: "embedding_103.tmp_0@GRAD"
+    slot_gradient: "embedding_104.tmp_0@GRAD"
+    slot_gradient: "embedding_105.tmp_0@GRAD"
+    slot_gradient: "embedding_106.tmp_0@GRAD"
+    slot_gradient: "embedding_107.tmp_0@GRAD"
+    slot_gradient: "embedding_108.tmp_0@GRAD"
+    slot_gradient: "embedding_109.tmp_0@GRAD"
+    slot_gradient: "embedding_110.tmp_0@GRAD"
+    slot_gradient: "embedding_111.tmp_0@GRAD"
+    slot_gradient: "embedding_112.tmp_0@GRAD"
+    slot_gradient: "embedding_113.tmp_0@GRAD"
+    slot_gradient: "embedding_114.tmp_0@GRAD"
+    slot_gradient: "embedding_115.tmp_0@GRAD"
+    slot_gradient: "embedding_116.tmp_0@GRAD"
+    slot_gradient: "embedding_117.tmp_0@GRAD"
+    slot_gradient: "embedding_118.tmp_0@GRAD"
+    slot_gradient: "embedding_119.tmp_0@GRAD"
+    slot_gradient: "embedding_120.tmp_0@GRAD"
+    slot_gradient: "embedding_121.tmp_0@GRAD"
+    slot_gradient: "embedding_122.tmp_0@GRAD"
+    slot_gradient: "embedding_123.tmp_0@GRAD"
+    slot_gradient: "embedding_124.tmp_0@GRAD"
+    slot_gradient: "embedding_125.tmp_0@GRAD"
+    slot_gradient: "embedding_126.tmp_0@GRAD"
+    slot_gradient: "embedding_127.tmp_0@GRAD"
+    slot_gradient: "embedding_128.tmp_0@GRAD"
+    slot_gradient: "embedding_129.tmp_0@GRAD"
+    slot_gradient: "embedding_130.tmp_0@GRAD"
+    slot_gradient: "embedding_131.tmp_0@GRAD"
+    slot_gradient: "embedding_132.tmp_0@GRAD"
+    slot_gradient: "embedding_133.tmp_0@GRAD"
+    slot_gradient: "embedding_134.tmp_0@GRAD"
+    slot_gradient: "embedding_135.tmp_0@GRAD"
+    slot_gradient: "embedding_136.tmp_0@GRAD"
+    slot_gradient: "embedding_137.tmp_0@GRAD"
+    slot_gradient: "embedding_138.tmp_0@GRAD"
+    slot_gradient: "embedding_139.tmp_0@GRAD"
+    slot_gradient: "embedding_140.tmp_0@GRAD"
+    slot_gradient: "embedding_141.tmp_0@GRAD"
+    slot_gradient: "embedding_142.tmp_0@GRAD"
+    slot_gradient: "embedding_143.tmp_0@GRAD"
+    slot_gradient: "embedding_144.tmp_0@GRAD"
+    slot_gradient: "embedding_145.tmp_0@GRAD"
+    slot_gradient: "embedding_146.tmp_0@GRAD"
+    slot_gradient: "embedding_147.tmp_0@GRAD"
+    slot_gradient: "embedding_148.tmp_0@GRAD"
+    slot_gradient: "embedding_149.tmp_0@GRAD"
+    slot_gradient: "embedding_150.tmp_0@GRAD"
+    slot_gradient: "embedding_151.tmp_0@GRAD"
+    slot_gradient: "embedding_152.tmp_0@GRAD"
+    slot_gradient: "embedding_153.tmp_0@GRAD"
+    slot_gradient: "embedding_154.tmp_0@GRAD"
+    slot_gradient: "embedding_155.tmp_0@GRAD"
+    slot_gradient: "embedding_156.tmp_0@GRAD"
+    slot_gradient: "embedding_157.tmp_0@GRAD"
+    slot_gradient: "embedding_158.tmp_0@GRAD"
+    slot_gradient: "embedding_159.tmp_0@GRAD"
+    slot_gradient: "embedding_160.tmp_0@GRAD"
+    slot_gradient: "embedding_161.tmp_0@GRAD"
+    slot_gradient: "embedding_162.tmp_0@GRAD"
+    slot_gradient: "embedding_163.tmp_0@GRAD"
+    slot_gradient: "embedding_164.tmp_0@GRAD"
+    slot_gradient: "embedding_165.tmp_0@GRAD"
+    slot_gradient: "embedding_166.tmp_0@GRAD"
+    slot_gradient: "embedding_167.tmp_0@GRAD"
+    slot_gradient: "embedding_168.tmp_0@GRAD"
+    slot_gradient: "embedding_169.tmp_0@GRAD"
+    slot_gradient: "embedding_170.tmp_0@GRAD"
+    slot_gradient: "embedding_171.tmp_0@GRAD"
+    slot_gradient: "embedding_172.tmp_0@GRAD"
+    slot_gradient: "embedding_173.tmp_0@GRAD"
+    slot_gradient: "embedding_174.tmp_0@GRAD"
+    slot_gradient: "embedding_175.tmp_0@GRAD"
+    slot_gradient: "embedding_176.tmp_0@GRAD"
+    slot_gradient: "embedding_177.tmp_0@GRAD"
+    slot_gradient: "embedding_178.tmp_0@GRAD"
+    slot_gradient: "embedding_179.tmp_0@GRAD"
+    slot_gradient: "embedding_180.tmp_0@GRAD"
+    slot_gradient: "embedding_181.tmp_0@GRAD"
+    slot_gradient: "embedding_182.tmp_0@GRAD"
+    slot_gradient: "embedding_183.tmp_0@GRAD"
+    slot_gradient: "embedding_184.tmp_0@GRAD"
+    slot_gradient: "embedding_185.tmp_0@GRAD"
+    slot_gradient: "embedding_186.tmp_0@GRAD"
+    slot_gradient: "embedding_187.tmp_0@GRAD"
+    slot_gradient: "embedding_188.tmp_0@GRAD"
+    slot_gradient: "embedding_189.tmp_0@GRAD"
+    slot_gradient: "embedding_190.tmp_0@GRAD"
+    slot_gradient: "embedding_191.tmp_0@GRAD"
+    slot_gradient: "embedding_192.tmp_0@GRAD"
+    slot_gradient: "embedding_193.tmp_0@GRAD"
+    slot_gradient: "embedding_194.tmp_0@GRAD"
+    slot_gradient: "embedding_195.tmp_0@GRAD"
+    slot_gradient: "embedding_196.tmp_0@GRAD"
+    slot_gradient: "embedding_197.tmp_0@GRAD"
+    slot_gradient: "embedding_198.tmp_0@GRAD"
+    slot_gradient: "embedding_199.tmp_0@GRAD"
+    slot_gradient: "embedding_200.tmp_0@GRAD"
+    slot_gradient: "embedding_201.tmp_0@GRAD"
+    slot_gradient: "embedding_202.tmp_0@GRAD"
+    slot_gradient: "embedding_203.tmp_0@GRAD"
+    slot_gradient: "embedding_204.tmp_0@GRAD"
+    slot_gradient: "embedding_205.tmp_0@GRAD"
+    slot_gradient: "embedding_206.tmp_0@GRAD"
+    slot_gradient: "embedding_207.tmp_0@GRAD"
+    slot_gradient: "embedding_208.tmp_0@GRAD"
+    slot_gradient: "embedding_209.tmp_0@GRAD"
+    slot_gradient: "embedding_210.tmp_0@GRAD"
+    slot_gradient: "embedding_211.tmp_0@GRAD"
+    slot_gradient: "embedding_212.tmp_0@GRAD"
+    slot_gradient: "embedding_213.tmp_0@GRAD"
+    slot_gradient: "embedding_214.tmp_0@GRAD"
+    slot_gradient: "embedding_215.tmp_0@GRAD"
+    slot_gradient: "embedding_216.tmp_0@GRAD"
+    slot_gradient: "embedding_217.tmp_0@GRAD"
+    slot_gradient: "embedding_218.tmp_0@GRAD"
+    slot_gradient: "embedding_219.tmp_0@GRAD"
+    slot_gradient: "embedding_220.tmp_0@GRAD"
+    slot_gradient: "embedding_221.tmp_0@GRAD"
+    slot_gradient: "embedding_222.tmp_0@GRAD"
+    slot_gradient: "embedding_223.tmp_0@GRAD"
+    slot_gradient: "embedding_224.tmp_0@GRAD"
+    slot_gradient: "embedding_225.tmp_0@GRAD"
+    slot_gradient: "embedding_226.tmp_0@GRAD"
+    slot_gradient: "embedding_227.tmp_0@GRAD"
+    slot_gradient: "embedding_228.tmp_0@GRAD"
+    slot_gradient: "embedding_229.tmp_0@GRAD"
+    slot_gradient: "embedding_230.tmp_0@GRAD"
+    slot_gradient: "embedding_231.tmp_0@GRAD"
+    slot_gradient: "embedding_232.tmp_0@GRAD"
+    slot_gradient: "embedding_233.tmp_0@GRAD"
+    slot_gradient: "embedding_234.tmp_0@GRAD"
+    slot_gradient: "embedding_235.tmp_0@GRAD"
+    slot_gradient: "embedding_236.tmp_0@GRAD"
+    slot_gradient: "embedding_237.tmp_0@GRAD"
+    slot_gradient: "embedding_238.tmp_0@GRAD"
+    slot_gradient: "embedding_239.tmp_0@GRAD"
+    slot_gradient: "embedding_240.tmp_0@GRAD"
+    slot_gradient: "embedding_241.tmp_0@GRAD"
+    slot_gradient: "embedding_242.tmp_0@GRAD"
+    slot_gradient: "embedding_243.tmp_0@GRAD"
+    slot_gradient: "embedding_244.tmp_0@GRAD"
+    slot_gradient: "embedding_245.tmp_0@GRAD"
+    slot_gradient: "embedding_246.tmp_0@GRAD"
+    slot_gradient: "embedding_247.tmp_0@GRAD"
+    slot_gradient: "embedding_248.tmp_0@GRAD"
+    slot_gradient: "embedding_249.tmp_0@GRAD"
+    slot_gradient: "embedding_250.tmp_0@GRAD"
+    slot_gradient: "embedding_251.tmp_0@GRAD"
+    slot_gradient: "embedding_252.tmp_0@GRAD"
+    slot_gradient: "embedding_253.tmp_0@GRAD"
+    slot_gradient: "embedding_254.tmp_0@GRAD"
+    slot_gradient: "embedding_255.tmp_0@GRAD"
+    slot_gradient: "embedding_256.tmp_0@GRAD"
+    slot_gradient: "embedding_257.tmp_0@GRAD"
+    slot_gradient: "embedding_258.tmp_0@GRAD"
+    slot_gradient: "embedding_259.tmp_0@GRAD"
+    slot_gradient: "embedding_260.tmp_0@GRAD"
+    slot_gradient: "embedding_261.tmp_0@GRAD"
+    slot_gradient: "embedding_262.tmp_0@GRAD"
+    slot_gradient: "embedding_263.tmp_0@GRAD"
+    slot_gradient: "embedding_264.tmp_0@GRAD"
+    slot_gradient: "embedding_265.tmp_0@GRAD"
+    slot_gradient: "embedding_266.tmp_0@GRAD"
+    slot_gradient: "embedding_267.tmp_0@GRAD"
+    slot_gradient: "embedding_268.tmp_0@GRAD"
+    slot_gradient: "embedding_269.tmp_0@GRAD"
+    slot_gradient: "embedding_270.tmp_0@GRAD"
+    slot_gradient: "embedding_271.tmp_0@GRAD"
+    slot_gradient: "embedding_272.tmp_0@GRAD"
+    slot_gradient: "embedding_273.tmp_0@GRAD"
+    slot_gradient: "embedding_274.tmp_0@GRAD"
+    slot_gradient: "embedding_275.tmp_0@GRAD"
+    slot_gradient: "embedding_276.tmp_0@GRAD"
+    slot_gradient: "embedding_277.tmp_0@GRAD"
+    slot_gradient: "embedding_278.tmp_0@GRAD"
+    slot_gradient: "embedding_279.tmp_0@GRAD"
+    slot_gradient: "embedding_280.tmp_0@GRAD"
+    slot_gradient: "embedding_281.tmp_0@GRAD"
+    slot_gradient: "embedding_282.tmp_0@GRAD"
+    slot_gradient: "embedding_283.tmp_0@GRAD"
+    slot_gradient: "embedding_284.tmp_0@GRAD"
+    slot_gradient: "embedding_285.tmp_0@GRAD"
+    slot_gradient: "embedding_286.tmp_0@GRAD"
+    slot_gradient: "embedding_287.tmp_0@GRAD"
+    slot_gradient: "embedding_288.tmp_0@GRAD"
+    slot_gradient: "embedding_289.tmp_0@GRAD"
+    slot_gradient: "embedding_290.tmp_0@GRAD"
+    slot_gradient: "embedding_291.tmp_0@GRAD"
+    slot_gradient: "embedding_292.tmp_0@GRAD"
+    slot_gradient: "embedding_293.tmp_0@GRAD"
+    slot_gradient: "embedding_294.tmp_0@GRAD"
+    slot_gradient: "embedding_295.tmp_0@GRAD"
+    slot_gradient: "embedding_296.tmp_0@GRAD"
+    slot_gradient: "embedding_297.tmp_0@GRAD"
+    slot_gradient: "embedding_298.tmp_0@GRAD"
+    slot_gradient: "embedding_299.tmp_0@GRAD"
+    slot_gradient: "embedding_300.tmp_0@GRAD"
+    slot_gradient: "embedding_301.tmp_0@GRAD"
+    slot_gradient: "embedding_302.tmp_0@GRAD"
+    slot_gradient: "embedding_303.tmp_0@GRAD"
+    slot_gradient: "embedding_304.tmp_0@GRAD"
+    slot_gradient: "embedding_305.tmp_0@GRAD"
+    slot_gradient: "embedding_306.tmp_0@GRAD"
+    slot_gradient: "embedding_307.tmp_0@GRAD"
+    slot_gradient: "embedding_308.tmp_0@GRAD"
+    slot_gradient: "embedding_309.tmp_0@GRAD"
+    slot_gradient: "embedding_310.tmp_0@GRAD"
+    slot_gradient: "embedding_311.tmp_0@GRAD"
+    slot_gradient: "embedding_312.tmp_0@GRAD"
+    slot_gradient: "embedding_313.tmp_0@GRAD"
+    slot_gradient: "embedding_314.tmp_0@GRAD"
+    slot_gradient: "embedding_315.tmp_0@GRAD"
+    slot_gradient: "embedding_316.tmp_0@GRAD"
+    slot_gradient: "embedding_317.tmp_0@GRAD"
+    slot_gradient: "embedding_318.tmp_0@GRAD"
+    slot_gradient: "embedding_319.tmp_0@GRAD"
+    slot_gradient: "embedding_320.tmp_0@GRAD"
+    slot_gradient: "embedding_321.tmp_0@GRAD"
+    slot_gradient: "embedding_322.tmp_0@GRAD"
+    slot_gradient: "embedding_323.tmp_0@GRAD"
+    slot_gradient: "embedding_324.tmp_0@GRAD"
+    slot_gradient: "embedding_325.tmp_0@GRAD"
+    slot_gradient: "embedding_326.tmp_0@GRAD"
+    slot_gradient: "embedding_327.tmp_0@GRAD"
+    slot_gradient: "embedding_328.tmp_0@GRAD"
+    slot_gradient: "embedding_329.tmp_0@GRAD"
+    slot_gradient: "embedding_330.tmp_0@GRAD"
+    slot_gradient: "embedding_331.tmp_0@GRAD"
+    slot_gradient: "embedding_332.tmp_0@GRAD"
+    slot_gradient: "embedding_333.tmp_0@GRAD"
+    slot_gradient: "embedding_334.tmp_0@GRAD"
+    slot_gradient: "embedding_335.tmp_0@GRAD"
+    slot_gradient: "embedding_336.tmp_0@GRAD"
+    slot_gradient: "embedding_337.tmp_0@GRAD"
+    slot_gradient: "embedding_338.tmp_0@GRAD"
+    slot_gradient: "embedding_339.tmp_0@GRAD"
+    slot_gradient: "embedding_340.tmp_0@GRAD"
+    slot_gradient: "embedding_341.tmp_0@GRAD"
+    slot_gradient: "embedding_342.tmp_0@GRAD"
+    slot_gradient: "embedding_343.tmp_0@GRAD"
+    slot_gradient: "embedding_344.tmp_0@GRAD"
+    slot_gradient: "embedding_345.tmp_0@GRAD"
+    slot_gradient: "embedding_346.tmp_0@GRAD"
+    slot_gradient: "embedding_347.tmp_0@GRAD"
+    slot_gradient: "embedding_348.tmp_0@GRAD"
+    slot_gradient: "embedding_349.tmp_0@GRAD"
+    slot_gradient: "embedding_350.tmp_0@GRAD"
+    slot_gradient: "embedding_351.tmp_0@GRAD"
+    slot_gradient: "embedding_352.tmp_0@GRAD"
+    slot_gradient: "embedding_353.tmp_0@GRAD"
+    slot_gradient: "embedding_354.tmp_0@GRAD"
+    slot_gradient: "embedding_355.tmp_0@GRAD"
+    slot_gradient: "embedding_356.tmp_0@GRAD"
+    slot_gradient: "embedding_357.tmp_0@GRAD"
+    slot_gradient: "embedding_358.tmp_0@GRAD"
+    slot_gradient: "embedding_359.tmp_0@GRAD"
+    slot_gradient: "embedding_360.tmp_0@GRAD"
+    slot_gradient: "embedding_361.tmp_0@GRAD"
+    slot_gradient: "embedding_362.tmp_0@GRAD"
+    slot_gradient: "embedding_363.tmp_0@GRAD"
+    slot_gradient: "embedding_364.tmp_0@GRAD"
+    slot_gradient: "embedding_365.tmp_0@GRAD"
+    slot_gradient: "embedding_366.tmp_0@GRAD"
+    slot_gradient: "embedding_367.tmp_0@GRAD"
+    slot_gradient: "embedding_368.tmp_0@GRAD"
+    slot_gradient: "embedding_369.tmp_0@GRAD"
+    slot_gradient: "embedding_370.tmp_0@GRAD"
+    slot_gradient: "embedding_371.tmp_0@GRAD"
+    slot_gradient: "embedding_372.tmp_0@GRAD"
+    slot_gradient: "embedding_373.tmp_0@GRAD"
+    slot_gradient: "embedding_374.tmp_0@GRAD"
+    slot_gradient: "embedding_375.tmp_0@GRAD"
+    slot_gradient: "embedding_376.tmp_0@GRAD"
+    slot_gradient: "embedding_377.tmp_0@GRAD"
+    slot_gradient: "embedding_378.tmp_0@GRAD"
+    slot_gradient: "embedding_379.tmp_0@GRAD"
+    slot_gradient: "embedding_380.tmp_0@GRAD"
+    slot_gradient: "embedding_381.tmp_0@GRAD"
+    slot_gradient: "embedding_382.tmp_0@GRAD"
+    slot_gradient: "embedding_383.tmp_0@GRAD"
+    slot_gradient: "embedding_384.tmp_0@GRAD"
+    slot_gradient: "embedding_385.tmp_0@GRAD"
+    slot_gradient: "embedding_386.tmp_0@GRAD"
+    slot_gradient: "embedding_387.tmp_0@GRAD"
+    slot_gradient: "embedding_388.tmp_0@GRAD"
+    slot_gradient: "embedding_389.tmp_0@GRAD"
+    slot_gradient: "embedding_390.tmp_0@GRAD"
+    slot_gradient: "embedding_391.tmp_0@GRAD"
+    slot_gradient: "embedding_392.tmp_0@GRAD"
+    slot_gradient: "embedding_393.tmp_0@GRAD"
+    slot_gradient: "embedding_394.tmp_0@GRAD"
+    slot_gradient: "embedding_395.tmp_0@GRAD"
+    slot_gradient: "embedding_396.tmp_0@GRAD"
+    slot_gradient: "embedding_397.tmp_0@GRAD"
+    slot_gradient: "embedding_398.tmp_0@GRAD"
+    slot_gradient: "embedding_399.tmp_0@GRAD"
+    slot_gradient: "embedding_400.tmp_0@GRAD"
+    slot_gradient: "embedding_401.tmp_0@GRAD"
+    slot_gradient: "embedding_402.tmp_0@GRAD"
+    slot_gradient: "embedding_403.tmp_0@GRAD"
+    slot_gradient: "embedding_404.tmp_0@GRAD"
+    slot_gradient: "embedding_405.tmp_0@GRAD"
+    slot_gradient: "embedding_406.tmp_0@GRAD"
+    slot_gradient: "embedding_407.tmp_0@GRAD"
+  }
+  skip_op: "lookup_table"
+  skip_op: "lookup_table_grad"
+}
+fs_client_param {
+  uri: "afs://xingtian.afs.baidu.com:9902"
+  user: "mlarch"
+  passwd: "Fv1M87"  # NOTE(review): plaintext credential committed with the config; rotate it and supply it from outside the repo
+  hadoop_bin: "$HADOOP_HOME/bin/hadoop"
+}
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/model_new.py b/feed/feed_deploy/news_jingpai/package/my_nets/model_new.py
new file mode 100644
index 0000000000000000000000000000000000000000..172ed804a52e8f53b8dbcd35874923408893e5c5
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/model_new.py
@@ -0,0 +1,188 @@
+
+import paddle.fluid as fluid
+from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+
+class Model(object):  # Builds the fluid programs (train/startup plus an input-declaration pair) for the slot-embedding click model.
+    def __init__(self, slot_file_name, all_slot_file, use_cvm, ins_tag, is_update_model):  # constructs everything eagerly; reads both slot files
+        self._slot_file_name = slot_file_name  # text file naming the slots that get an embedding, one per line
+        self._use_cvm = use_cvm  # forwarded to continuous_value_model; also gates data_norm in slot_net
+        self._dict_dim = 10  # it's fake: placeholder first dimension of the embedding size
+        self._emb_dim = 9 + 2  # second dimension of the embedding size (11)
+        self._init_range = 0.2  # numerator of the fc initializer scale computed in slot_net
+        self._all_slot_file = all_slot_file  # text file naming all slots, one per line
+        self._not_use_slots = []  # not referenced again in this class
+        self._not_use_slotemb = []  # not referenced again in this class
+        self._all_slots = []  # reassigned below to the full list of input variables
+        self._ins_tag_value = ins_tag  # 0 turns instance-tag filtering off; any other value is handed to filter_by_instag
+        self._is_update_model = is_update_model  # True selects the 6-layer fc stack, False the 8-layer one
+        self._train_program = fluid.Program()
+        self._startup_program = fluid.Program()
+        self.save_vars = []  # slot_net appends the data_norm output and every fc output here
+        with fluid.program_guard(self._train_program, self._startup_program):
+            with fluid.unique_name.guard():
+                self.show = fluid.layers.data(name="show", shape=[-1, 1], dtype="int64", lod_level=0, append_batch_size=False)
+                self.label = fluid.layers.data(name="click", shape=[-1, 1], dtype="int64", lod_level=0, append_batch_size=False)
+                self.ins_weight = fluid.layers.data(  # per-instance multiplier applied to the loss below
+                    name="12345",
+                    shape=[-1, 1],
+                    dtype="float32",
+                    lod_level=0,
+                    append_batch_size=False,
+                    stop_gradient=True)
+                self.ins_tag = fluid.layers.data(  # per-instance tag consumed by filter_by_instag
+                    name="23456",
+                    shape=[-1, 1],
+                    dtype="int64",
+                    lod_level=0,
+                    append_batch_size=False,
+                    stop_gradient=True)
+                self.slots = []
+                self.slots_name = []
+                self.embs = []
+
+                # With a non-zero tag, run label/show/ins_weight through filter_by_instag against the constant X3.
+                if self._ins_tag_value != 0:
+                    self.x3_ts = fluid.layers.create_global_var(shape=[1,1], value=self._ins_tag_value, dtype='int64', persistable=True, force_cpu=True, name='X3')
+                    self.x3_ts.stop_gradient=True
+                    self.label_after_filter, self.filter_loss = fluid.layers.filter_by_instag(self.label, self.ins_tag, self.x3_ts, True)
+                    self.label_after_filter.stop_gradient=True
+                    self.show_after_filter, _ = fluid.layers.filter_by_instag(self.show, self.ins_tag, self.x3_ts, True)
+                    self.show_after_filter.stop_gradient=True  # not read again inside this class
+                    self.ins_weight_after_filter, _ = fluid.layers.filter_by_instag(self.ins_weight, self.ins_tag,  self.x3_ts, True)
+                    self.ins_weight_after_filter.stop_gradient=True
+                # Names of the slots this model embeds.
+                for line in open(self._slot_file_name, 'r'):  # NOTE(review): handle is never closed explicitly
+                    slot = line.strip()
+                    self.slots_name.append(slot)
+
+                self.all_slots_name = []
+                for line in open(self._all_slot_file, 'r'):  # NOTE(review): handle is never closed explicitly
+                    self.all_slots_name.append(line.strip())
+                for i in self.all_slots_name:  # data/embedding layers are created in all-slot-file order
+                    if i == self.ins_weight.name or i == self.ins_tag.name:
+                        pass  # the weight and tag inputs get no embedding
+                    elif i not in self.slots_name:
+                        pass  # slot is not in the selected-slot file
+                    else:  # selected slot: declare its input and an embedding that uses the shared param name "embedding"
+                        l = fluid.layers.data(name=i, shape=[1], dtype="int64", lod_level=1)
+                        emb = fluid.layers.embedding(input=l, size=[self._dict_dim, self._emb_dim], is_sparse = True, is_distributed=True, param_attr=fluid.ParamAttr(name="embedding"))
+                        self.slots.append(l)
+                        self.embs.append(emb)
+                # The network sees the filtered label when filtering is on.
+                if self._ins_tag_value != 0:
+                    self.emb = self.slot_net(self.slots, self.label_after_filter)
+                else:
+                    self.emb = self.slot_net(self.slots, self.label)
+                # Clip the network output to [-15, 15] before the sigmoid.
+                self.similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(self.emb, min=-15.0, max=15.0), name="similarity_norm")
+                # Per-instance log loss against the (filtered) label cast to float32.
+                if self._ins_tag_value != 0:
+                    self.cost = fluid.layers.log_loss(input=self.similarity_norm, label=fluid.layers.cast(x=self.label_after_filter, dtype='float32'))
+                else:
+                    self.cost = fluid.layers.log_loss(input=self.similarity_norm, label=fluid.layers.cast(x=self.label, dtype='float32'))
+                # Scale each instance's loss by its (filtered) ins_weight.
+                if self._ins_tag_value != 0:
+                    self.cost = fluid.layers.elementwise_mul(self.cost, self.ins_weight_after_filter)
+                else:
+                    self.cost = fluid.layers.elementwise_mul(self.cost, self.ins_weight)
+                # With filtering on, also multiply by filter_loss (second output of filter_by_instag on the label).
+                if self._ins_tag_value != 0:
+                    self.cost = fluid.layers.elementwise_mul(self.cost, self.filter_loss)
+
+                self.avg_cost = fluid.layers.mean(x=self.cost)
+                # Two-column auc input [ceil(p) - p, p], where p is similarity_norm.
+                binary_predict = fluid.layers.concat(
+                        input=[fluid.layers.elementwise_sub(fluid.layers.ceil(self.similarity_norm), self.similarity_norm), self.similarity_norm], axis=1)
+                # AUC plus ctr_metric_bundle statistics against the same label used for the loss.
+                if self._ins_tag_value != 0:
+                    self.auc, batch_auc, [self.batch_stat_pos, self.batch_stat_neg, self.stat_pos, self.stat_neg] = \
+                            fluid.layers.auc(input=binary_predict, label=self.label_after_filter, curve='ROC', num_thresholds=4096)
+                    self.sqrerr, self.abserr, self.prob, self.q, self.pos, self.total = \
+                    fluid.contrib.layers.ctr_metric_bundle(self.similarity_norm, fluid.layers.cast(x=self.label_after_filter, dtype='float32'))
+
+                    #self.precise_ins_num = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1])
+                    #batch_ins_num = fluid.layers.reduce_sum(self.filter_loss)
+                    #self.precise_ins_num = fluid.layers.elementwise_add(batch_ins_num, self.precise_ins_num)
+
+                else:
+                    self.auc, batch_auc, [self.batch_stat_pos, self.batch_stat_neg, self.stat_pos, self.stat_neg] = \
+                            fluid.layers.auc(input=binary_predict, label=self.label, curve='ROC', num_thresholds=4096)
+                    self.sqrerr, self.abserr, self.prob, self.q, self.pos, self.total = \
+                    fluid.contrib.layers.ctr_metric_bundle(self.similarity_norm, fluid.layers.cast(x=self.label, dtype='float32'))
+
+
+        # Second program pair that only declares a data layer for every slot named in the all-slot file.
+        self.tmp_train_program = fluid.Program()
+        self.tmp_startup_program = fluid.Program()
+        with fluid.program_guard(self.tmp_train_program, self.tmp_startup_program):
+            with fluid.unique_name.guard():
+                self._all_slots = [self.show, self.label]  # show/label/weight/tag reuse the train-program variables
+                self._merge_slots = []  # only the slot inputs declared in this block
+                for i in self.all_slots_name:
+                    if i == self.ins_weight.name:
+                        self._all_slots.append(self.ins_weight)
+                    elif i == self.ins_tag.name:
+                        self._all_slots.append(self.ins_tag)
+                    else:
+                        l = fluid.layers.data(name=i, shape=[1], dtype="int64", lod_level=1)
+                        self._all_slots.append(l)
+                        self._merge_slots.append(l)
+
+    # slot_net: sum-pool each slot's embedding, apply continuous_value_model, concatenate, optionally data_norm,
+    # then run the fc stack and return the last fc output. `slots` must line up index-for-index with self.embs;
+    # `label` becomes the second column of show_clk; `lr_x` is the learning_rate of every fc weight/bias ParamAttr.
+    def slot_net(self, slots, label, lr_x=1.0):
+        input_data = []  # filled in the loop but never read afterwards
+        cvms = []
+        # show_clk is [1.0, label] per instance (float32, stop_gradient).
+        cast_label = fluid.layers.cast(label, dtype='float32')
+        cast_label.stop_gradient = True
+        ones = fluid.layers.fill_constant_batch_size_like(input=label, shape=[-1, 1], dtype="float32", value=1)
+        show_clk = fluid.layers.cast(fluid.layers.concat([ones, cast_label], axis=1), dtype='float32')
+        show_clk.stop_gradient = True
+        # Sum-pool each slot's embedding and pass it through continuous_value_model.
+        for index in range(len(slots)):
+            input_data.append(slots[index])
+            emb = self.embs[index]
+            bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+            cvm = fluid.layers.continuous_value_model(bow, show_clk, self._use_cvm)
+            cvms.append(cvm)
+        # Concatenate the slots along axis 1; with a non-zero tag, pass the result through filter_by_instag.
+        concat = None
+        if self._ins_tag_value != 0:
+            concat = fluid.layers.concat(cvms, axis=1)
+            concat, _ = fluid.layers.filter_by_instag(concat, self.ins_tag, self.x3_ts, False)
+        else:
+            concat = fluid.layers.concat(cvms, axis=1)
+        bn = concat  # stays the raw concat when use_cvm is false
+        if self._use_cvm:
+            bn = fluid.layers.data_norm(input=concat, name="bn6048", epsilon=1e-4,
+                    param_attr={
+                        "batch_size":1e4,
+                        "batch_sum_default":0.0,
+                        "batch_square":1e4})
+            self.save_vars.append(bn)
+        fc_layers_input = [bn]
+        if self._is_update_model:
+            fc_layers_size = [511, 255, 127, 127, 127, 1]
+        else:
+            fc_layers_size = [511, 255, 255, 127, 127, 127, 127, 1]
+        fc_layers_act = ["relu"] * (len(fc_layers_size) - 1) + [None]  # relu everywhere except the last layer
+        scales_tmp = [bn.shape[1]] + fc_layers_size  # input width of each fc layer (the final entry is never used)
+        scales = []
+        for i in range(len(scales_tmp)):
+            scales.append(self._init_range / (scales_tmp[i] ** 0.5))  # init_range / sqrt(input width)
+        for i in range(len(fc_layers_size)):
+            fc = fluid.layers.fc(
+                    input = fc_layers_input[-1],
+                    size = fc_layers_size[i],
+                    act = fc_layers_act[i],
+                    param_attr = \
+                            fluid.ParamAttr(learning_rate=lr_x, \
+                            initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])),
+                    bias_attr = \
+                            fluid.ParamAttr(learning_rate=lr_x, \
+                            initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))
+            fc_layers_input.append(fc)
+            self.save_vars.append(fc)
+        return fc_layers_input[-1]
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/model_new_jc.py b/feed/feed_deploy/news_jingpai/package/my_nets/model_new_jc.py
new file mode 100644
index 0000000000000000000000000000000000000000..31802b4a0f9f321bcbc7ad5ce68dc70e34cae9f6
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/model_new_jc.py
@@ -0,0 +1,166 @@
+
+import paddle.fluid as fluid
+from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+
+class ModelJoinCommon(object):
+    def __init__(self, slot_file_name,  slot_common_file_name, all_slot_file, join_ins_tag):
+        self.slot_file_name = slot_file_name
+        self.slot_common_file_name = slot_common_file_name
+        self.dict_dim = 10 # it's fake
+        self.emb_dim = 9 + 2
+        self.init_range = 0.2
+        self.all_slot_file = all_slot_file
+        self.ins_tag_v = join_ins_tag
+        self._train_program = fluid.Program()
+        self._startup_program = fluid.Program()
+        with fluid.program_guard(self._train_program, self._startup_program):
+            with fluid.unique_name.guard():
+                self.show = fluid.layers.data(name="show", shape=[-1, 1], dtype="int64", lod_level=0, append_batch_size=False)
+                self.label = fluid.layers.data(name="click", shape=[-1, 1], dtype="int64", lod_level=0, append_batch_size=False)
+                self.ins_weight = fluid.layers.data(
+                    name="12345",
+                    shape=[-1, 1],
+                    dtype="float32",
+                    lod_level=0,
+                    append_batch_size=False,
+                    stop_gradient=True)
+                self.ins_tag = fluid.layers.data(
+                    name="23456",
+                    shape=[-1, 1],
+                    dtype="int64",
+                    lod_level=0,
+                    append_batch_size=False,
+                    stop_gradient=True)
+                self.x3_ts = fluid.layers.create_global_var(shape=[1,1], value=self.ins_tag_v, dtype='int64', persistable=True, force_cpu=True, name='X3')
+                self.x3_ts.stop_gradient=True
+                self.label_after_filter, self.filter_loss = fluid.layers.filter_by_instag(self.label, self.ins_tag, self.x3_ts, True)
+                self.label_after_filter.stop_gradient=True
+                self.show_after_filter, _ = fluid.layers.filter_by_instag(self.show, self.ins_tag, self.x3_ts, True)
+                self.show_after_filter.stop_gradient=True
+                self.ins_weight_after_filter, _ = fluid.layers.filter_by_instag(self.ins_weight, self.ins_tag,  self.x3_ts, True)
+                self.ins_weight_after_filter.stop_gradient=True
+                
+                self.slots_name = []
+                for line in open(self.slot_file_name, 'r'):
+                    slot = line.strip()
+                    self.slots_name.append(slot)
+
+                self.all_slots_name = []
+                for line in open(self.all_slot_file, 'r'):
+                    self.all_slots_name.append(line.strip())
+
+                self.slots = []
+                self.embs = []
+                for i in self.all_slots_name:
+                    if i == self.ins_weight.name or i == self.ins_tag.name:
+                        pass
+                    elif i not in self.slots_name:
+                        pass
+                    else:
+                        l = fluid.layers.data(name=i, shape=[1], dtype="int64", lod_level=1)
+                        emb = fluid.layers.embedding(input=l, size=[self.dict_dim, self.emb_dim], is_sparse = True, is_distributed=True, param_attr=fluid.ParamAttr(name="embedding"))
+                        self.slots.append(l)
+                        self.embs.append(emb)
+
+                self.common_slot_name = []
+                for i in open(self.slot_common_file_name, 'r'):
+                    self.common_slot_name.append(i.strip())
+
+                cvms = []
+                cast_label = fluid.layers.cast(self.label, dtype='float32')
+                cast_label.stop_gradient = True
+                ones = fluid.layers.fill_constant_batch_size_like(input=self.label, shape=[-1, 1], dtype="float32", value=1)
+                show_clk = fluid.layers.cast(fluid.layers.concat([ones, cast_label], axis=1), dtype='float32')
+                show_clk.stop_gradient = True
+                for index in range(len(self.embs)):
+                    emb = self.embs[index]
+                    emb.stop_gradient=True
+                    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+                    bow.stop_gradient=True
+                    cvm = fluid.layers.continuous_value_model(bow, show_clk, True)
+                    cvm.stop_gradient=True
+                    cvms.append(cvm)
+                concat_join = fluid.layers.concat(cvms, axis=1)
+                concat_join.stop_gradient=True
+                
+                cvms_common = []
+                for index in range(len(self.common_slot_name)):
+                    cvms_common.append(cvms[index])
+                concat_common = fluid.layers.concat(cvms_common, axis=1)
+                concat_common.stop_gradient=True
+                
+                bn_common = fluid.layers.data_norm(input=concat_common, name="common", epsilon=1e-4, param_attr={"batch_size":1e4,"batch_sum_default":0.0,"batch_square":1e4})
+
+                concat_join, _ = fluid.layers.filter_by_instag(concat_join, self.ins_tag, self.x3_ts, False)
+                concat_join.stop_gradient=True
+                bn_join = fluid.layers.data_norm(input=concat_join, name="join", epsilon=1e-4, param_attr={"batch_size":1e4,"batch_sum_default":0.0,"batch_square":1e4})
+                
+                join_fc = self.fcs(bn_join, "join")
+                join_similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(join_fc, min=-15.0, max=15.0), name="join_similarity_norm")
+                join_cost = fluid.layers.log_loss(input=join_similarity_norm, label=fluid.layers.cast(x=self.label_after_filter, dtype='float32'))
+                join_cost = fluid.layers.elementwise_mul(join_cost, self.ins_weight_after_filter)
+                join_cost = fluid.layers.elementwise_mul(join_cost, self.filter_loss)
+                join_avg_cost = fluid.layers.mean(x=join_cost)
+
+                common_fc = self.fcs(bn_common, "common")
+                common_similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(common_fc, min=-15.0, max=15.0), name="common_similarity_norm")
+                common_cost = fluid.layers.log_loss(input=common_similarity_norm, label=fluid.layers.cast(x=self.label, dtype='float32'))
+                common_cost = fluid.layers.elementwise_mul(common_cost, self.ins_weight)
+                common_avg_cost = fluid.layers.mean(x=common_cost)
+
+                self.joint_cost = join_avg_cost + common_avg_cost
+
+                join_binary_predict = fluid.layers.concat(
+                        input=[fluid.layers.elementwise_sub(fluid.layers.ceil(join_similarity_norm), join_similarity_norm), join_similarity_norm], axis=1)
+                self.join_auc, batch_auc, [self.join_batch_stat_pos, self.join_batch_stat_neg, self.join_stat_pos, self.join_stat_neg] = \
+                        fluid.layers.auc(input=join_binary_predict, label=self.label_after_filter, curve='ROC', num_thresholds=4096)
+                self.join_sqrerr, self.join_abserr, self.join_prob, self.join_q, self.join_pos, self.join_total = \
+                        fluid.contrib.layers.ctr_metric_bundle(join_similarity_norm, fluid.layers.cast(x=self.label_after_filter, dtype='float32'))
+
+                common_binary_predict = fluid.layers.concat(
+                        input=[fluid.layers.elementwise_sub(fluid.layers.ceil(common_similarity_norm), common_similarity_norm), common_similarity_norm], axis=1)
+                self.common_auc, batch_auc, [self.common_batch_stat_pos, self.common_batch_stat_neg, self.common_stat_pos, self.common_stat_neg] = \
+                        fluid.layers.auc(input=common_binary_predict, label=self.label, curve='ROC', num_thresholds=4096)
+                self.common_sqrerr, self.common_abserr, self.common_prob, self.common_q, self.common_pos, self.common_total = \
+                        fluid.contrib.layers.ctr_metric_bundle(common_similarity_norm, fluid.layers.cast(x=self.label, dtype='float32'))
+
+        self.tmp_train_program = fluid.Program()
+        self.tmp_startup_program = fluid.Program()
+        with fluid.program_guard(self.tmp_train_program, self.tmp_startup_program):
+            with fluid.unique_name.guard():
+                self._all_slots = [self.show, self.label]
+                self._merge_slots = []
+                for i in self.all_slots_name:
+                    if i == self.ins_weight.name:
+                        self._all_slots.append(self.ins_weight)
+                    elif i == self.ins_tag.name:
+                        self._all_slots.append(self.ins_tag)
+                    else:
+                        l = fluid.layers.data(name=i, shape=[1], dtype="int64", lod_level=1)
+                        self._all_slots.append(l)
+                        self._merge_slots.append(l)
+
+
+    def fcs(self, bn, prefix):
+        fc_layers_input = [bn]
+        fc_layers_size = [511, 255, 255, 127, 127, 127, 127, 1]
+        fc_layers_act = ["relu"] * (len(fc_layers_size) - 1) + [None]
+        scales_tmp = [bn.shape[1]] + fc_layers_size
+        scales = []
+        for i in range(len(scales_tmp)):
+            scales.append(self.init_range / (scales_tmp[i] ** 0.5))
+        for i in range(len(fc_layers_size)):
+            name = prefix+"_"+str(i)
+            fc = fluid.layers.fc(
+                    input = fc_layers_input[-1],
+                    size = fc_layers_size[i],
+                    act = fc_layers_act[i],
+                    param_attr = \
+                            fluid.ParamAttr(learning_rate=1.0, \
+                            initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])),
+                    bias_attr = \
+                            fluid.ParamAttr(learning_rate=1.0, \
+                            initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])),
+                    name=name)
+            fc_layers_input.append(fc)
+        return fc_layers_input[-1]
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/my_data_generator_str.py b/feed/feed_deploy/news_jingpai/package/my_nets/my_data_generator_str.py
new file mode 100644
index 0000000000000000000000000000000000000000..d47664645704fca47a964c27c55c400a6efae7a4
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/my_data_generator_str.py
@@ -0,0 +1,89 @@
+import sys
+import os
+import paddle
+import re
+import collections
+import time
+#import paddle.fluid.incubate.data_generator as dg
+import data_generate_base as dg
+
+class MyDataset(dg.MultiSlotDataGenerator):
+    def load_resource(self, dictf):
+        self._all_slots_dict = collections.OrderedDict()
+        with open(dictf, 'r') as f:
+            slots = f.readlines()
+        for index, slot in enumerate(slots):
+            #self._all_slots_dict[slot.strip()] = [False, index + 3] #+3 #
+            self._all_slots_dict[slot.strip()] = [False, index + 2]
+
+    def generate_sample(self, line):
+        def data_iter_str():
+            s = line.split('\t')[0].split()#[1:]
+            lineid = s[0]
+            elements = s[1:] #line.split('\t')[0].split()[1:]
+            padding = "0"
+           # output = [("lineid", [lineid]), ("show", [elements[0]]), ("click", [elements[1]])]
+            output = [("show", [elements[0]]), ("click", [elements[1]])]
+            output.extend([(slot, []) for slot in self._all_slots_dict])
+            for elem in elements[2:]:
+                if elem.startswith("*"):                                           
+                    feasign = elem[1:]                                             
+                    slot = "12345"                                                 
+                elif elem.startswith("$"):                                         
+                    feasign = elem[1:]                                             
+                    if feasign == "D":                                             
+                        feasign = "0"                                                
+                    slot = "23456"                                                 
+                else:                                                              
+                    feasign, slot = elem.split(':')
+                #feasign, slot = elem.split(':')
+                if not self._all_slots_dict.has_key(slot):
+                    continue
+                self._all_slots_dict[slot][0] = True
+                index = self._all_slots_dict[slot][1]
+                output[index][1].append(feasign)
+            for slot in self._all_slots_dict:
+                visit, index = self._all_slots_dict[slot]
+                if visit:
+                    self._all_slots_dict[slot][0] = False
+                else:
+                    output[index][1].append(padding)
+            #print output
+            yield output
+
+        return data_iter_str
+
+        def data_iter():
+            elements = line.split('\t')[0].split()[1:]
+            padding = 0
+            output = [("show", [int(elements[0])]), ("click", [int(elements[1])])]
+            #output += [(slot, []) for slot in self._all_slots_dict]
+            output.extend([(slot, []) for slot in self._all_slots_dict])
+            for elem in elements[2:]:
+                feasign, slot = elem.split(':')
+                if slot == "12345":
+                    feasign = float(feasign)
+                else:
+                    feasign = int(feasign)
+                if not self._all_slots_dict.has_key(slot):
+                    continue
+                self._all_slots_dict[slot][0] = True
+                index = self._all_slots_dict[slot][1]
+                output[index][1].append(feasign)
+            for slot in self._all_slots_dict:
+                visit, index = self._all_slots_dict[slot]
+                if visit:
+                    self._all_slots_dict[slot][0] = False
+                else:
+                    output[index][1].append(padding)
+            yield output
+        return data_iter
+
+
+if __name__ == "__main__":
+    #start = time.clock()
+    d = MyDataset()
+    d.load_resource("all_slot.dict")
+    d.run_from_stdin()
+    #elapsed = (time.clock() - start)
+    #print("Time used:",elapsed)
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_startup_program.bin b/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_startup_program.bin
new file mode 100644
index 0000000000000000000000000000000000000000..edb43bda80ce2044da2dcd586e90c207e9fe268c
Binary files /dev/null and b/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_startup_program.bin differ
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_train_program.bin b/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_train_program.bin
new file mode 100644
index 0000000000000000000000000000000000000000..89cb5d3dde949c31de7b3ce60b4108ac282a71f1
Binary files /dev/null and b/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_join_common_train_program.bin differ
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_main_program.bin b/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_main_program.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d06fb007bb1c568b0afcfcb460c7db2362e40503
Binary files /dev/null and b/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_main_program.bin differ
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_startup_program.bin b/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_startup_program.bin
new file mode 100644
index 0000000000000000000000000000000000000000..76b538aca52b9c46cfae8b79b8ffa772f4f5fc2d
Binary files /dev/null and b/feed/feed_deploy/news_jingpai/package/my_nets/old_program/old_update_startup_program.bin differ
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot b/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot
new file mode 100644
index 0000000000000000000000000000000000000000..3e91b42e36e3bef406efc31c50a997ea7dc58f86
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot
@@ -0,0 +1,408 @@
+6048
+6002
+6145
+6202
+6201
+6121
+6738
+6119
+6146
+6120
+6147
+6122
+6123
+6118
+6142
+6143
+6008
+6148
+6151
+6127
+6144
+6094
+6083
+6952
+6739
+6150
+6109
+6003
+6099
+6149
+6129
+6203
+6153
+6152
+6128
+6106
+6251
+7082
+7515
+6951
+6949
+7080
+6066
+7507
+6186
+6007
+7514
+6125
+7506
+10001
+6006
+7023
+6085
+10000
+6098
+6250
+6110
+6124
+6090
+6082
+6067
+6101
+6004
+6191
+7075
+6948
+6157
+6126
+6188
+7077
+6070
+6111
+6087
+6103
+6107
+6194
+6156
+6005
+6247
+6814
+6158
+7122
+6058
+6189
+7058
+6059
+6115
+7079
+7081
+6833
+7024
+6108
+13342
+13345
+13412
+13343
+13350
+13346
+13409
+6009
+6011
+6012
+6013
+6014
+6015
+6019
+6023
+6024
+6027
+6029
+6031
+6050
+6060
+6068
+6069
+6089
+6095
+6105
+6112
+6130
+6131
+6132
+6134
+6161
+6162
+6163
+6166
+6182
+6183
+6185
+6190
+6212
+6213
+6231
+6233
+6234
+6236
+6238
+6239
+6240
+6241
+6242
+6243
+6244
+6245
+6354
+7002
+7005
+7008
+7010
+7012
+7013
+7015
+7016
+7017
+7018
+7019
+7020
+7045
+7046
+7048
+7049
+7052
+7054
+7056
+7064
+7066
+7076
+7078
+7083
+7084
+7085
+7086
+7087
+7088
+7089
+7090
+7099
+7100
+7101
+7102
+7103
+7104
+7105
+7109
+7124
+7126
+7136
+7142
+7143
+7144
+7145
+7146
+7147
+7148
+7150
+7151
+7152
+7153
+7154
+7155
+7156
+7157
+7047
+7050
+6253
+6254
+6255
+6256
+6257
+6259
+6260
+6261
+7170
+7185
+7186
+6751
+6755
+6757
+6759
+6760
+6763
+6764
+6765
+6766
+6767
+6768
+6769
+6770
+7502
+7503
+7504
+7505
+7510
+7511
+7512
+7513
+6806
+6807
+6808
+6809
+6810
+6811
+6812
+6813
+6815
+6816
+6817
+6819
+6823
+6828
+6831
+6840
+6845
+6875
+6879
+6881
+6888
+6889
+6947
+6950
+6956
+6957
+6959
+10006
+10008
+10009
+10010
+10011
+10016
+10017
+10018
+10019
+10020
+10021
+10022
+10023
+10024
+10029
+10030
+10031
+10032
+10033
+10034
+10035
+10036
+10037
+10038
+10039
+10040
+10041
+10042
+10044
+10045
+10046
+10051
+10052
+10053
+10054
+10055
+10056
+10057
+10060
+10066
+10069
+6820
+6821
+6822
+13333
+13334
+13335
+13336
+13337
+13338
+13339
+13340
+13341
+13351
+13352
+13353
+13359
+13361
+13362
+13363
+13366
+13367
+13368
+13369
+13370
+13371
+13375
+13376
+5700
+5702
+13400
+13401
+13402
+13403
+13404
+13406
+13407
+13408
+13410
+13417
+13418
+13419
+13420
+13422
+13425
+13427
+13428
+13429
+13430
+13431
+13433
+13434
+13436
+13437
+13326
+13330
+13331
+5717
+13442
+13451
+13452
+13455
+13456
+13457
+13458
+13459
+13460
+13461
+13462
+13463
+13464
+13465
+13466
+13467
+13468
+1104
+1106
+1107
+1108
+1109
+1110
+1111
+1112
+1113
+1114
+1115
+1116
+1117
+1119
+1120
+1121
+1122
+1123
+1124
+1125
+1126
+1127
+1128
+1129
+13812
+13813
+6740
+1490
+1491
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot_common b/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot_common
new file mode 100644
index 0000000000000000000000000000000000000000..869fb695282eed4a69928e7af52dd49a62e0d4c6
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/slot_common
@@ -0,0 +1,99 @@
+6048
+6002
+6145
+6202
+6201
+6121
+6738
+6119
+6146
+6120
+6147
+6122
+6123
+6118
+6142
+6143
+6008
+6148
+6151
+6127
+6144
+6094
+6083
+6952
+6739
+6150
+6109
+6003
+6099
+6149
+6129
+6203
+6153
+6152
+6128
+6106
+6251
+7082
+7515
+6951
+6949
+7080
+6066
+7507
+6186
+6007
+7514
+6125
+7506
+10001
+6006
+7023
+6085
+10000
+6098
+6250
+6110
+6124
+6090
+6082
+6067
+6101
+6004
+6191
+7075
+6948
+6157
+6126
+6188
+7077
+6070
+6111
+6087
+6103
+6107
+6194
+6156
+6005
+6247
+6814
+6158
+7122
+6058
+6189
+7058
+6059
+6115
+7079
+7081
+6833
+7024
+6108
+13342
+13345
+13412
+13343
+13350
+13346
+13409
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/to.py b/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/to.py
new file mode 100644
index 0000000000000000000000000000000000000000..638c53647dc2adc1d502ed53630f07dbcfe8ffce
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/old_slot/to.py
@@ -0,0 +1,5 @@
+with open("session_slot", "r") as fin:
+    res = []
+    for i in fin:
+        res.append("\"" + i.strip() + "\"")
+    print ", ".join(res)
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/reqi_fleet_desc b/feed/feed_deploy/news_jingpai/package/my_nets/reqi_fleet_desc
new file mode 100644
index 0000000000000000000000000000000000000000..c0d3ab823170856e9a50f6d9f6b1b4b323833bf2
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/reqi_fleet_desc
@@ -0,0 +1,1461 @@
+server_param {
+  downpour_server_param {
+    downpour_table_param {
+      table_id: 0
+      table_class: "DownpourSparseTable"
+      shard_num: 1950
+      sparse_table_cache_rate: 0.00055
+      accessor {
+        accessor_class: "DownpourCtrAccessor"
+        sparse_sgd_param {
+          learning_rate: 0.05
+          initial_g2sum: 3.0
+          initial_range: 0.0001
+          weight_bounds: -10.0
+          weight_bounds: 10.0
+        }
+        fea_dim: 11
+        embedx_dim: 8
+        embedx_threshold: 10
+        downpour_accessor_param {
+          nonclk_coeff: 0.1
+          click_coeff: 1
+          base_threshold: 1.5
+          delta_threshold: 0.25
+          delta_keep_days: 16
+          delete_after_unseen_days: 30
+          show_click_decay_rate: 0.98
+          delete_threshold: 0.8
+        }
+        table_accessor_save_param {
+          param: 1
+          converter: "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
+          deconverter: "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
+        }
+        table_accessor_save_param {
+          param: 2
+          converter: "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
+          deconverter: "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
+        }
+      }
+      type: PS_SPARSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {
+      table_id: 1
+      table_class: "DownpourDenseTable"
+      accessor {
+        accessor_class: "DownpourDenseValueAccessor"
+        dense_sgd_param {
+          name: "adam"
+          adam {
+            learning_rate: 5e-06
+            avg_decay_rate: 0.999993
+            ada_decay_rate: 0.9999
+            ada_epsilon: 1e-08
+            mom_decay_rate: 0.99
+          }
+          naive {
+            learning_rate: 0.0002
+          }
+        }
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {
+      table_id: 2
+      table_class: "DownpourDenseDoubleTable"
+      accessor {
+        accessor_class: "DownpourDenseValueDoubleAccessor"
+        dense_sgd_param {
+          name: "summarydouble"
+          summary {
+            summary_decay_rate: 0.999999
+          }
+        }
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {
+      table_id: 3
+      table_class: "DownpourDenseTable"
+      accessor {
+        accessor_class: "DownpourDenseValueAccessor"
+        dense_sgd_param {
+          name: "adam"
+          adam {
+            learning_rate: 5e-06
+            avg_decay_rate: 0.999993
+            ada_decay_rate: 0.9999
+            ada_epsilon: 1e-08
+            mom_decay_rate: 0.99
+          }
+          naive {
+            learning_rate: 0.0002
+          }
+        }
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    service_param {
+      server_class: "DownpourBrpcPsServer"
+      client_class: "DownpourBrpcPsClient"
+      service_class: "DownpourPsService"
+      start_server_port: 0
+      server_thread_num: 12
+    }
+  }
+}
+trainer_param {
+  dense_table {
+    table_id: 1
+
+    dense_variable_name: "join_0.w_0"
+    dense_variable_name: "join_0.b_0"
+    dense_variable_name: "join_1.w_0"
+    dense_variable_name: "join_1.b_0"
+    dense_variable_name: "join_2.w_0"
+    dense_variable_name: "join_2.b_0"
+    dense_variable_name: "join_3.w_0"
+    dense_variable_name: "join_3.b_0"
+    dense_variable_name: "join_4.w_0"
+    dense_variable_name: "join_4.b_0"
+    dense_variable_name: "join_5.w_0"
+    dense_variable_name: "join_5.b_0"
+    dense_variable_name: "join_6.w_0"
+    dense_variable_name: "join_6.b_0"
+    dense_variable_name: "join_7.w_0"
+    dense_variable_name: "join_7.b_0"
+
+    dense_variable_name: "common_0.w_0"
+    dense_variable_name: "common_0.b_0"
+    dense_variable_name: "common_1.w_0"
+    dense_variable_name: "common_1.b_0"
+    dense_variable_name: "common_2.w_0"
+    dense_variable_name: "common_2.b_0"
+    dense_variable_name: "common_3.w_0"
+    dense_variable_name: "common_3.b_0"
+    dense_variable_name: "common_4.w_0"
+    dense_variable_name: "common_4.b_0"
+    dense_variable_name: "common_5.w_0"
+    dense_variable_name: "common_5.b_0"
+    dense_variable_name: "common_6.w_0"
+    dense_variable_name: "common_6.b_0"
+    dense_variable_name: "common_7.w_0"
+    dense_variable_name: "common_7.b_0"
+
+    dense_gradient_variable_name: "join_0.w_0@GRAD"
+    dense_gradient_variable_name: "join_0.b_0@GRAD"
+    dense_gradient_variable_name: "join_1.w_0@GRAD"
+    dense_gradient_variable_name: "join_1.b_0@GRAD"
+    dense_gradient_variable_name: "join_2.w_0@GRAD"
+    dense_gradient_variable_name: "join_2.b_0@GRAD"
+    dense_gradient_variable_name: "join_3.w_0@GRAD"
+    dense_gradient_variable_name: "join_3.b_0@GRAD"
+    dense_gradient_variable_name: "join_4.w_0@GRAD"
+    dense_gradient_variable_name: "join_4.b_0@GRAD"
+    dense_gradient_variable_name: "join_5.w_0@GRAD"
+    dense_gradient_variable_name: "join_5.b_0@GRAD"
+    dense_gradient_variable_name: "join_6.w_0@GRAD"
+    dense_gradient_variable_name: "join_6.b_0@GRAD"
+    dense_gradient_variable_name: "join_7.w_0@GRAD"
+    dense_gradient_variable_name: "join_7.b_0@GRAD"
+
+    dense_gradient_variable_name: "common_0.w_0@GRAD"
+    dense_gradient_variable_name: "common_0.b_0@GRAD"
+    dense_gradient_variable_name: "common_1.w_0@GRAD"
+    dense_gradient_variable_name: "common_1.b_0@GRAD"
+    dense_gradient_variable_name: "common_2.w_0@GRAD"
+    dense_gradient_variable_name: "common_2.b_0@GRAD"
+    dense_gradient_variable_name: "common_3.w_0@GRAD"
+    dense_gradient_variable_name: "common_3.b_0@GRAD"
+    dense_gradient_variable_name: "common_4.w_0@GRAD"
+    dense_gradient_variable_name: "common_4.b_0@GRAD"
+    dense_gradient_variable_name: "common_5.w_0@GRAD"
+    dense_gradient_variable_name: "common_5.b_0@GRAD"
+    dense_gradient_variable_name: "common_6.w_0@GRAD"
+    dense_gradient_variable_name: "common_6.b_0@GRAD"
+    dense_gradient_variable_name: "common_7.w_0@GRAD"
+    dense_gradient_variable_name: "common_7.b_0@GRAD"
+  }
+  dense_table {
+    table_id: 2
+    dense_variable_name: "join.batch_size"
+    dense_variable_name: "join.batch_sum"
+    dense_variable_name: "join.batch_square_sum"
+
+    dense_variable_name: "common.batch_size"
+    dense_variable_name: "common.batch_sum"
+    dense_variable_name: "common.batch_square_sum"
+
+    dense_gradient_variable_name: "join.batch_size@GRAD"
+    dense_gradient_variable_name: "join.batch_sum@GRAD"
+    dense_gradient_variable_name: "join.batch_square_sum@GRAD"
+
+    dense_gradient_variable_name: "common.batch_size@GRAD"
+    dense_gradient_variable_name: "common.batch_sum@GRAD"
+    dense_gradient_variable_name: "common.batch_square_sum@GRAD"
+  }
+  dense_table {
+    table_id: 3
+    dense_variable_name: "fc_0.w_0"
+    dense_variable_name: "fc_0.b_0"
+    dense_variable_name: "fc_1.w_0"
+    dense_variable_name: "fc_1.b_0"
+    dense_variable_name: "fc_2.w_0"
+    dense_variable_name: "fc_2.b_0"
+    dense_variable_name: "fc_3.w_0"
+    dense_variable_name: "fc_3.b_0"
+    dense_variable_name: "fc_4.w_0"
+    dense_variable_name: "fc_4.b_0"
+    dense_variable_name: "fc_5.w_0"
+    dense_variable_name: "fc_5.b_0"
+    dense_gradient_variable_name: "fc_0.w_0@GRAD"
+    dense_gradient_variable_name: "fc_0.b_0@GRAD"
+    dense_gradient_variable_name: "fc_1.w_0@GRAD"
+    dense_gradient_variable_name: "fc_1.b_0@GRAD"
+    dense_gradient_variable_name: "fc_2.w_0@GRAD"
+    dense_gradient_variable_name: "fc_2.b_0@GRAD"
+    dense_gradient_variable_name: "fc_3.w_0@GRAD"
+    dense_gradient_variable_name: "fc_3.b_0@GRAD"
+    dense_gradient_variable_name: "fc_4.w_0@GRAD"
+    dense_gradient_variable_name: "fc_4.b_0@GRAD"
+    dense_gradient_variable_name: "fc_5.w_0@GRAD"
+    dense_gradient_variable_name: "fc_5.b_0@GRAD"
+  }
+   sparse_table {
+     table_id: 0
+     slot_key: "6048"
+     slot_key: "6002"
+     slot_key: "6145"
+     slot_key: "6202"
+     slot_key: "6201"
+     slot_key: "6121"
+     slot_key: "6738"
+     slot_key: "6119"
+     slot_key: "6146"
+     slot_key: "6120"
+     slot_key: "6147"
+     slot_key: "6122"
+     slot_key: "6123"
+     slot_key: "6118"
+     slot_key: "6142"
+     slot_key: "6143"
+     slot_key: "6008"
+     slot_key: "6148"
+     slot_key: "6151"
+     slot_key: "6127"
+     slot_key: "6144"
+     slot_key: "6094"
+     slot_key: "6083"
+     slot_key: "6952"
+     slot_key: "6739"
+     slot_key: "6150"
+     slot_key: "6109"
+     slot_key: "6003"
+     slot_key: "6099"
+     slot_key: "6149"
+     slot_key: "6129"
+     slot_key: "6203"
+     slot_key: "6153"
+     slot_key: "6152"
+     slot_key: "6128"
+     slot_key: "6106"
+     slot_key: "6251"
+     slot_key: "7082"
+     slot_key: "7515"
+     slot_key: "6951"
+     slot_key: "6949"
+     slot_key: "7080"
+     slot_key: "6066"
+     slot_key: "7507"
+     slot_key: "6186"
+     slot_key: "6007"
+     slot_key: "7514"
+     slot_key: "6125"
+     slot_key: "7506"
+     slot_key: "10001"
+     slot_key: "6006"
+     slot_key: "7023"
+     slot_key: "6085"
+     slot_key: "10000"
+     slot_key: "6098"
+     slot_key: "6250"
+     slot_key: "6110"
+     slot_key: "6124"
+     slot_key: "6090"
+     slot_key: "6082"
+     slot_key: "6067"
+     slot_key: "6101"
+     slot_key: "6004"
+     slot_key: "6191"
+     slot_key: "7075"
+     slot_key: "6948"
+     slot_key: "6157"
+     slot_key: "6126"
+     slot_key: "6188"
+     slot_key: "7077"
+     slot_key: "6070"
+     slot_key: "6111"
+     slot_key: "6087"
+     slot_key: "6103"
+     slot_key: "6107"
+     slot_key: "6194"
+     slot_key: "6156"
+     slot_key: "6005"
+     slot_key: "6247"
+     slot_key: "6814"
+     slot_key: "6158"
+     slot_key: "7122"
+     slot_key: "6058"
+     slot_key: "6189"
+     slot_key: "7058"
+     slot_key: "6059"
+     slot_key: "6115"
+     slot_key: "7079"
+     slot_key: "7081"
+     slot_key: "6833"
+     slot_key: "7024"
+     slot_key: "6108"
+     slot_key: "13342"
+     slot_key: "13345"
+     slot_key: "13412"
+     slot_key: "13343"
+     slot_key: "13350"
+     slot_key: "13346"
+     slot_key: "13409"
+     slot_key: "6009"
+     slot_key: "6011"
+     slot_key: "6012"
+     slot_key: "6013"
+     slot_key: "6014"
+     slot_key: "6015"
+     slot_key: "6019"
+     slot_key: "6023"
+     slot_key: "6024"
+     slot_key: "6027"
+     slot_key: "6029"
+     slot_key: "6031"
+     slot_key: "6050"
+     slot_key: "6060"
+     slot_key: "6068"
+     slot_key: "6069"
+     slot_key: "6089"
+     slot_key: "6095"
+     slot_key: "6105"
+     slot_key: "6112"
+     slot_key: "6130"
+     slot_key: "6131"
+     slot_key: "6132"
+     slot_key: "6134"
+     slot_key: "6161"
+     slot_key: "6162"
+     slot_key: "6163"
+     slot_key: "6166"
+     slot_key: "6182"
+     slot_key: "6183"
+     slot_key: "6185"
+     slot_key: "6190"
+     slot_key: "6212"
+     slot_key: "6213"
+     slot_key: "6231"
+     slot_key: "6233"
+     slot_key: "6234"
+     slot_key: "6236"
+     slot_key: "6238"
+     slot_key: "6239"
+     slot_key: "6240"
+     slot_key: "6241"
+     slot_key: "6242"
+     slot_key: "6243"
+     slot_key: "6244"
+     slot_key: "6245"
+     slot_key: "6354"
+     slot_key: "7002"
+     slot_key: "7005"
+     slot_key: "7008"
+     slot_key: "7010"
+     slot_key: "7013"
+     slot_key: "7015"
+     slot_key: "7019"
+     slot_key: "7020"
+     slot_key: "7045"
+     slot_key: "7046"
+     slot_key: "7048"
+     slot_key: "7049"
+     slot_key: "7052"
+     slot_key: "7054"
+     slot_key: "7056"
+     slot_key: "7064"
+     slot_key: "7066"
+     slot_key: "7076"
+     slot_key: "7078"
+     slot_key: "7083"
+     slot_key: "7084"
+     slot_key: "7085"
+     slot_key: "7086"
+     slot_key: "7087"
+     slot_key: "7088"
+     slot_key: "7089"
+     slot_key: "7090"
+     slot_key: "7099"
+     slot_key: "7100"
+     slot_key: "7101"
+     slot_key: "7102"
+     slot_key: "7103"
+     slot_key: "7104"
+     slot_key: "7105"
+     slot_key: "7109"
+     slot_key: "7124"
+     slot_key: "7126"
+     slot_key: "7136"
+     slot_key: "7142"
+     slot_key: "7143"
+     slot_key: "7144"
+     slot_key: "7145"
+     slot_key: "7146"
+     slot_key: "7147"
+     slot_key: "7148"
+     slot_key: "7150"
+     slot_key: "7151"
+     slot_key: "7152"
+     slot_key: "7153"
+     slot_key: "7154"
+     slot_key: "7155"
+     slot_key: "7156"
+     slot_key: "7157"
+     slot_key: "7047"
+     slot_key: "7050"
+     slot_key: "6257"
+     slot_key: "6259"
+     slot_key: "6260"
+     slot_key: "6261"
+     slot_key: "7170"
+     slot_key: "7185"
+     slot_key: "7186"
+     slot_key: "6751"
+     slot_key: "6755"
+     slot_key: "6757"
+     slot_key: "6759"
+     slot_key: "6760"
+     slot_key: "6763"
+     slot_key: "6764"
+     slot_key: "6765"
+     slot_key: "6766"
+     slot_key: "6767"
+     slot_key: "6768"
+     slot_key: "6769"
+     slot_key: "6770"
+     slot_key: "7502"
+     slot_key: "7503"
+     slot_key: "7504"
+     slot_key: "7505"
+     slot_key: "7510"
+     slot_key: "7511"
+     slot_key: "7512"
+     slot_key: "7513"
+     slot_key: "6806"
+     slot_key: "6807"
+     slot_key: "6808"
+     slot_key: "6809"
+     slot_key: "6810"
+     slot_key: "6811"
+     slot_key: "6812"
+     slot_key: "6813"
+     slot_key: "6815"
+     slot_key: "6816"
+     slot_key: "6817"
+     slot_key: "6819"
+     slot_key: "6823"
+     slot_key: "6828"
+     slot_key: "6831"
+     slot_key: "6840"
+     slot_key: "6845"
+     slot_key: "6875"
+     slot_key: "6879"
+     slot_key: "6881"
+     slot_key: "6888"
+     slot_key: "6889"
+     slot_key: "6947"
+     slot_key: "6950"
+     slot_key: "6956"
+     slot_key: "6957"
+     slot_key: "6959"
+     slot_key: "10006"
+     slot_key: "10008"
+     slot_key: "10009"
+     slot_key: "10010"
+     slot_key: "10011"
+     slot_key: "10016"
+     slot_key: "10017"
+     slot_key: "10018"
+     slot_key: "10019"
+     slot_key: "10020"
+     slot_key: "10021"
+     slot_key: "10022"
+     slot_key: "10023"
+     slot_key: "10024"
+     slot_key: "10029"
+     slot_key: "10030"
+     slot_key: "10031"
+     slot_key: "10032"
+     slot_key: "10033"
+     slot_key: "10034"
+     slot_key: "10035"
+     slot_key: "10036"
+     slot_key: "10037"
+     slot_key: "10038"
+     slot_key: "10039"
+     slot_key: "10040"
+     slot_key: "10041"
+     slot_key: "10042"
+     slot_key: "10044"
+     slot_key: "10045"
+     slot_key: "10046"
+     slot_key: "10051"
+     slot_key: "10052"
+     slot_key: "10053"
+     slot_key: "10054"
+     slot_key: "10055"
+     slot_key: "10056"
+     slot_key: "10057"
+     slot_key: "10060"
+     slot_key: "10066"
+     slot_key: "10069"
+     slot_key: "6820"
+     slot_key: "6821"
+     slot_key: "6822"
+     slot_key: "13333"
+     slot_key: "13334"
+     slot_key: "13335"
+     slot_key: "13336"
+     slot_key: "13337"
+     slot_key: "13338"
+     slot_key: "13339"
+     slot_key: "13340"
+     slot_key: "13341"
+     slot_key: "13351"
+     slot_key: "13352"
+     slot_key: "13353"
+     slot_key: "13359"
+     slot_key: "13361"
+     slot_key: "13362"
+     slot_key: "13363"
+     slot_key: "13366"
+     slot_key: "13367"
+     slot_key: "13368"
+     slot_key: "13369"
+     slot_key: "13370"
+     slot_key: "13371"
+     slot_key: "13375"
+     slot_key: "13376"
+     slot_key: "5700"
+     slot_key: "5702"
+     slot_key: "13400"
+     slot_key: "13401"
+     slot_key: "13402"
+     slot_key: "13403"
+     slot_key: "13404"
+     slot_key: "13406"
+     slot_key: "13407"
+     slot_key: "13408"
+     slot_key: "13410"
+     slot_key: "13417"
+     slot_key: "13418"
+     slot_key: "13419"
+     slot_key: "13420"
+     slot_key: "13422"
+     slot_key: "13425"
+     slot_key: "13427"
+     slot_key: "13428"
+     slot_key: "13429"
+     slot_key: "13430"
+     slot_key: "13431"
+     slot_key: "13433"
+     slot_key: "13434"
+     slot_key: "13436"
+     slot_key: "13437"
+     slot_key: "13326"
+     slot_key: "13330"
+     slot_key: "13331"
+     slot_key: "5717"
+     slot_key: "13442"
+     slot_key: "13451"
+     slot_key: "13452"
+     slot_key: "13455"
+     slot_key: "13456"
+     slot_key: "13457"
+     slot_key: "13458"
+     slot_key: "13459"
+     slot_key: "13460"
+     slot_key: "13461"
+     slot_key: "13462"
+     slot_key: "13463"
+     slot_key: "13464"
+     slot_key: "13465"
+     slot_key: "13466"
+     slot_key: "13467"
+     slot_key: "13468"
+     slot_key: "1104"
+     slot_key: "1106"
+     slot_key: "1107"
+     slot_key: "1108"
+     slot_key: "1109"
+     slot_key: "1110"
+     slot_key: "1111"
+     slot_key: "1112"
+     slot_key: "1113"
+     slot_key: "1114"
+     slot_key: "1115"
+     slot_key: "1116"
+     slot_key: "1117"
+     slot_key: "1119"
+     slot_key: "1120"
+     slot_key: "1121"
+     slot_key: "1122"
+     slot_key: "1123"
+     slot_key: "1124"
+     slot_key: "1125"
+     slot_key: "1126"
+     slot_key: "1127"
+     slot_key: "1128"
+     slot_key: "1129"
+     slot_key: "13812"
+     slot_key: "13813"
+     slot_key: "6740"
+     slot_key: "1490"
+     slot_key: "32915"
+     slot_key: "32950"
+     slot_key: "32952"
+     slot_key: "32953"
+     slot_key: "32954"
+     slot_key: "33077"
+     slot_key: "33085"
+     slot_key: "33086"
+     slot_value: "embedding_0.tmp_0"
+     slot_value: "embedding_1.tmp_0"
+     slot_value: "embedding_2.tmp_0"
+     slot_value: "embedding_3.tmp_0"
+     slot_value: "embedding_4.tmp_0"
+     slot_value: "embedding_5.tmp_0"
+     slot_value: "embedding_6.tmp_0"
+     slot_value: "embedding_7.tmp_0"
+     slot_value: "embedding_8.tmp_0"
+     slot_value: "embedding_9.tmp_0"
+     slot_value: "embedding_10.tmp_0"
+     slot_value: "embedding_11.tmp_0"
+     slot_value: "embedding_12.tmp_0"
+     slot_value: "embedding_13.tmp_0"
+     slot_value: "embedding_14.tmp_0"
+     slot_value: "embedding_15.tmp_0"
+     slot_value: "embedding_16.tmp_0"
+     slot_value: "embedding_17.tmp_0"
+     slot_value: "embedding_18.tmp_0"
+     slot_value: "embedding_19.tmp_0"
+     slot_value: "embedding_20.tmp_0"
+     slot_value: "embedding_21.tmp_0"
+     slot_value: "embedding_22.tmp_0"
+     slot_value: "embedding_23.tmp_0"
+     slot_value: "embedding_24.tmp_0"
+     slot_value: "embedding_25.tmp_0"
+     slot_value: "embedding_26.tmp_0"
+     slot_value: "embedding_27.tmp_0"
+     slot_value: "embedding_28.tmp_0"
+     slot_value: "embedding_29.tmp_0"
+     slot_value: "embedding_30.tmp_0"
+     slot_value: "embedding_31.tmp_0"
+     slot_value: "embedding_32.tmp_0"
+     slot_value: "embedding_33.tmp_0"
+     slot_value: "embedding_34.tmp_0"
+     slot_value: "embedding_35.tmp_0"
+     slot_value: "embedding_36.tmp_0"
+     slot_value: "embedding_37.tmp_0"
+     slot_value: "embedding_38.tmp_0"
+     slot_value: "embedding_39.tmp_0"
+     slot_value: "embedding_40.tmp_0"
+     slot_value: "embedding_41.tmp_0"
+     slot_value: "embedding_42.tmp_0"
+     slot_value: "embedding_43.tmp_0"
+     slot_value: "embedding_44.tmp_0"
+     slot_value: "embedding_45.tmp_0"
+     slot_value: "embedding_46.tmp_0"
+     slot_value: "embedding_47.tmp_0"
+     slot_value: "embedding_48.tmp_0"
+     slot_value: "embedding_49.tmp_0"
+     slot_value: "embedding_50.tmp_0"
+     slot_value: "embedding_51.tmp_0"
+     slot_value: "embedding_52.tmp_0"
+     slot_value: "embedding_53.tmp_0"
+     slot_value: "embedding_54.tmp_0"
+     slot_value: "embedding_55.tmp_0"
+     slot_value: "embedding_56.tmp_0"
+     slot_value: "embedding_57.tmp_0"
+     slot_value: "embedding_58.tmp_0"
+     slot_value: "embedding_59.tmp_0"
+     slot_value: "embedding_60.tmp_0"
+     slot_value: "embedding_61.tmp_0"
+     slot_value: "embedding_62.tmp_0"
+     slot_value: "embedding_63.tmp_0"
+     slot_value: "embedding_64.tmp_0"
+     slot_value: "embedding_65.tmp_0"
+     slot_value: "embedding_66.tmp_0"
+     slot_value: "embedding_67.tmp_0"
+     slot_value: "embedding_68.tmp_0"
+     slot_value: "embedding_69.tmp_0"
+     slot_value: "embedding_70.tmp_0"
+     slot_value: "embedding_71.tmp_0"
+     slot_value: "embedding_72.tmp_0"
+     slot_value: "embedding_73.tmp_0"
+     slot_value: "embedding_74.tmp_0"
+     slot_value: "embedding_75.tmp_0"
+     slot_value: "embedding_76.tmp_0"
+     slot_value: "embedding_77.tmp_0"
+     slot_value: "embedding_78.tmp_0"
+     slot_value: "embedding_79.tmp_0"
+     slot_value: "embedding_80.tmp_0"
+     slot_value: "embedding_81.tmp_0"
+     slot_value: "embedding_82.tmp_0"
+     slot_value: "embedding_83.tmp_0"
+     slot_value: "embedding_84.tmp_0"
+     slot_value: "embedding_85.tmp_0"
+     slot_value: "embedding_86.tmp_0"
+     slot_value: "embedding_87.tmp_0"
+     slot_value: "embedding_88.tmp_0"
+     slot_value: "embedding_89.tmp_0"
+     slot_value: "embedding_90.tmp_0"
+     slot_value: "embedding_91.tmp_0"
+     slot_value: "embedding_92.tmp_0"
+     slot_value: "embedding_93.tmp_0"
+     slot_value: "embedding_94.tmp_0"
+     slot_value: "embedding_95.tmp_0"
+     slot_value: "embedding_96.tmp_0"
+     slot_value: "embedding_97.tmp_0"
+     slot_value: "embedding_98.tmp_0"
+     slot_value: "embedding_99.tmp_0"
+     slot_value: "embedding_100.tmp_0"
+     slot_value: "embedding_101.tmp_0"
+     slot_value: "embedding_102.tmp_0"
+     slot_value: "embedding_103.tmp_0"
+     slot_value: "embedding_104.tmp_0"
+     slot_value: "embedding_105.tmp_0"
+     slot_value: "embedding_106.tmp_0"
+     slot_value: "embedding_107.tmp_0"
+     slot_value: "embedding_108.tmp_0"
+     slot_value: "embedding_109.tmp_0"
+     slot_value: "embedding_110.tmp_0"
+     slot_value: "embedding_111.tmp_0"
+     slot_value: "embedding_112.tmp_0"
+     slot_value: "embedding_113.tmp_0"
+     slot_value: "embedding_114.tmp_0"
+     slot_value: "embedding_115.tmp_0"
+     slot_value: "embedding_116.tmp_0"
+     slot_value: "embedding_117.tmp_0"
+     slot_value: "embedding_118.tmp_0"
+     slot_value: "embedding_119.tmp_0"
+     slot_value: "embedding_120.tmp_0"
+     slot_value: "embedding_121.tmp_0"
+     slot_value: "embedding_122.tmp_0"
+     slot_value: "embedding_123.tmp_0"
+     slot_value: "embedding_124.tmp_0"
+     slot_value: "embedding_125.tmp_0"
+     slot_value: "embedding_126.tmp_0"
+     slot_value: "embedding_127.tmp_0"
+     slot_value: "embedding_128.tmp_0"
+     slot_value: "embedding_129.tmp_0"
+     slot_value: "embedding_130.tmp_0"
+     slot_value: "embedding_131.tmp_0"
+     slot_value: "embedding_132.tmp_0"
+     slot_value: "embedding_133.tmp_0"
+     slot_value: "embedding_134.tmp_0"
+     slot_value: "embedding_135.tmp_0"
+     slot_value: "embedding_136.tmp_0"
+     slot_value: "embedding_137.tmp_0"
+     slot_value: "embedding_138.tmp_0"
+     slot_value: "embedding_139.tmp_0"
+     slot_value: "embedding_140.tmp_0"
+     slot_value: "embedding_141.tmp_0"
+     slot_value: "embedding_142.tmp_0"
+     slot_value: "embedding_143.tmp_0"
+     slot_value: "embedding_144.tmp_0"
+     slot_value: "embedding_145.tmp_0"
+     slot_value: "embedding_146.tmp_0"
+     slot_value: "embedding_147.tmp_0"
+     slot_value: "embedding_148.tmp_0"
+     slot_value: "embedding_149.tmp_0"
+     slot_value: "embedding_150.tmp_0"
+     slot_value: "embedding_151.tmp_0"
+     slot_value: "embedding_152.tmp_0"
+     slot_value: "embedding_153.tmp_0"
+     slot_value: "embedding_154.tmp_0"
+     slot_value: "embedding_155.tmp_0"
+     slot_value: "embedding_156.tmp_0"
+     slot_value: "embedding_157.tmp_0"
+     slot_value: "embedding_158.tmp_0"
+     slot_value: "embedding_159.tmp_0"
+     slot_value: "embedding_160.tmp_0"
+     slot_value: "embedding_161.tmp_0"
+     slot_value: "embedding_162.tmp_0"
+     slot_value: "embedding_163.tmp_0"
+     slot_value: "embedding_164.tmp_0"
+     slot_value: "embedding_165.tmp_0"
+     slot_value: "embedding_166.tmp_0"
+     slot_value: "embedding_167.tmp_0"
+     slot_value: "embedding_168.tmp_0"
+     slot_value: "embedding_169.tmp_0"
+     slot_value: "embedding_170.tmp_0"
+     slot_value: "embedding_171.tmp_0"
+     slot_value: "embedding_172.tmp_0"
+     slot_value: "embedding_173.tmp_0"
+     slot_value: "embedding_174.tmp_0"
+     slot_value: "embedding_175.tmp_0"
+     slot_value: "embedding_176.tmp_0"
+     slot_value: "embedding_177.tmp_0"
+     slot_value: "embedding_178.tmp_0"
+     slot_value: "embedding_179.tmp_0"
+     slot_value: "embedding_180.tmp_0"
+     slot_value: "embedding_181.tmp_0"
+     slot_value: "embedding_182.tmp_0"
+     slot_value: "embedding_183.tmp_0"
+     slot_value: "embedding_184.tmp_0"
+     slot_value: "embedding_185.tmp_0"
+     slot_value: "embedding_186.tmp_0"
+     slot_value: "embedding_187.tmp_0"
+     slot_value: "embedding_188.tmp_0"
+     slot_value: "embedding_189.tmp_0"
+     slot_value: "embedding_190.tmp_0"
+     slot_value: "embedding_191.tmp_0"
+     slot_value: "embedding_192.tmp_0"
+     slot_value: "embedding_193.tmp_0"
+     slot_value: "embedding_194.tmp_0"
+     slot_value: "embedding_195.tmp_0"
+     slot_value: "embedding_196.tmp_0"
+     slot_value: "embedding_197.tmp_0"
+     slot_value: "embedding_198.tmp_0"
+     slot_value: "embedding_199.tmp_0"
+     slot_value: "embedding_200.tmp_0"
+     slot_value: "embedding_201.tmp_0"
+     slot_value: "embedding_202.tmp_0"
+     slot_value: "embedding_203.tmp_0"
+     slot_value: "embedding_204.tmp_0"
+     slot_value: "embedding_205.tmp_0"
+     slot_value: "embedding_206.tmp_0"
+     slot_value: "embedding_207.tmp_0"
+     slot_value: "embedding_208.tmp_0"
+     slot_value: "embedding_209.tmp_0"
+     slot_value: "embedding_210.tmp_0"
+     slot_value: "embedding_211.tmp_0"
+     slot_value: "embedding_212.tmp_0"
+     slot_value: "embedding_213.tmp_0"
+     slot_value: "embedding_214.tmp_0"
+     slot_value: "embedding_215.tmp_0"
+     slot_value: "embedding_216.tmp_0"
+     slot_value: "embedding_217.tmp_0"
+     slot_value: "embedding_218.tmp_0"
+     slot_value: "embedding_219.tmp_0"
+     slot_value: "embedding_220.tmp_0"
+     slot_value: "embedding_221.tmp_0"
+     slot_value: "embedding_222.tmp_0"
+     slot_value: "embedding_223.tmp_0"
+     slot_value: "embedding_224.tmp_0"
+     slot_value: "embedding_225.tmp_0"
+     slot_value: "embedding_226.tmp_0"
+     slot_value: "embedding_227.tmp_0"
+     slot_value: "embedding_228.tmp_0"
+     slot_value: "embedding_229.tmp_0"
+     slot_value: "embedding_230.tmp_0"
+     slot_value: "embedding_231.tmp_0"
+     slot_value: "embedding_232.tmp_0"
+     slot_value: "embedding_233.tmp_0"
+     slot_value: "embedding_234.tmp_0"
+     slot_value: "embedding_235.tmp_0"
+     slot_value: "embedding_236.tmp_0"
+     slot_value: "embedding_237.tmp_0"
+     slot_value: "embedding_238.tmp_0"
+     slot_value: "embedding_239.tmp_0"
+     slot_value: "embedding_240.tmp_0"
+     slot_value: "embedding_241.tmp_0"
+     slot_value: "embedding_242.tmp_0"
+     slot_value: "embedding_243.tmp_0"
+     slot_value: "embedding_244.tmp_0"
+     slot_value: "embedding_245.tmp_0"
+     slot_value: "embedding_246.tmp_0"
+     slot_value: "embedding_247.tmp_0"
+     slot_value: "embedding_248.tmp_0"
+     slot_value: "embedding_249.tmp_0"
+     slot_value: "embedding_250.tmp_0"
+     slot_value: "embedding_251.tmp_0"
+     slot_value: "embedding_252.tmp_0"
+     slot_value: "embedding_253.tmp_0"
+     slot_value: "embedding_254.tmp_0"
+     slot_value: "embedding_255.tmp_0"
+     slot_value: "embedding_256.tmp_0"
+     slot_value: "embedding_257.tmp_0"
+     slot_value: "embedding_258.tmp_0"
+     slot_value: "embedding_259.tmp_0"
+     slot_value: "embedding_260.tmp_0"
+     slot_value: "embedding_261.tmp_0"
+     slot_value: "embedding_262.tmp_0"
+     slot_value: "embedding_263.tmp_0"
+     slot_value: "embedding_264.tmp_0"
+     slot_value: "embedding_265.tmp_0"
+     slot_value: "embedding_266.tmp_0"
+     slot_value: "embedding_267.tmp_0"
+     slot_value: "embedding_268.tmp_0"
+     slot_value: "embedding_269.tmp_0"
+     slot_value: "embedding_270.tmp_0"
+     slot_value: "embedding_271.tmp_0"
+     slot_value: "embedding_272.tmp_0"
+     slot_value: "embedding_273.tmp_0"
+     slot_value: "embedding_274.tmp_0"
+     slot_value: "embedding_275.tmp_0"
+     slot_value: "embedding_276.tmp_0"
+     slot_value: "embedding_277.tmp_0"
+     slot_value: "embedding_278.tmp_0"
+     slot_value: "embedding_279.tmp_0"
+     slot_value: "embedding_280.tmp_0"
+     slot_value: "embedding_281.tmp_0"
+     slot_value: "embedding_282.tmp_0"
+     slot_value: "embedding_283.tmp_0"
+     slot_value: "embedding_284.tmp_0"
+     slot_value: "embedding_285.tmp_0"
+     slot_value: "embedding_286.tmp_0"
+     slot_value: "embedding_287.tmp_0"
+     slot_value: "embedding_288.tmp_0"
+     slot_value: "embedding_289.tmp_0"
+     slot_value: "embedding_290.tmp_0"
+     slot_value: "embedding_291.tmp_0"
+     slot_value: "embedding_292.tmp_0"
+     slot_value: "embedding_293.tmp_0"
+     slot_value: "embedding_294.tmp_0"
+     slot_value: "embedding_295.tmp_0"
+     slot_value: "embedding_296.tmp_0"
+     slot_value: "embedding_297.tmp_0"
+     slot_value: "embedding_298.tmp_0"
+     slot_value: "embedding_299.tmp_0"
+     slot_value: "embedding_300.tmp_0"
+     slot_value: "embedding_301.tmp_0"
+     slot_value: "embedding_302.tmp_0"
+     slot_value: "embedding_303.tmp_0"
+     slot_value: "embedding_304.tmp_0"
+     slot_value: "embedding_305.tmp_0"
+     slot_value: "embedding_306.tmp_0"
+     slot_value: "embedding_307.tmp_0"
+     slot_value: "embedding_308.tmp_0"
+     slot_value: "embedding_309.tmp_0"
+     slot_value: "embedding_310.tmp_0"
+     slot_value: "embedding_311.tmp_0"
+     slot_value: "embedding_312.tmp_0"
+     slot_value: "embedding_313.tmp_0"
+     slot_value: "embedding_314.tmp_0"
+     slot_value: "embedding_315.tmp_0"
+     slot_value: "embedding_316.tmp_0"
+     slot_value: "embedding_317.tmp_0"
+     slot_value: "embedding_318.tmp_0"
+     slot_value: "embedding_319.tmp_0"
+     slot_value: "embedding_320.tmp_0"
+     slot_value: "embedding_321.tmp_0"
+     slot_value: "embedding_322.tmp_0"
+     slot_value: "embedding_323.tmp_0"
+     slot_value: "embedding_324.tmp_0"
+     slot_value: "embedding_325.tmp_0"
+     slot_value: "embedding_326.tmp_0"
+     slot_value: "embedding_327.tmp_0"
+     slot_value: "embedding_328.tmp_0"
+     slot_value: "embedding_329.tmp_0"
+     slot_value: "embedding_330.tmp_0"
+     slot_value: "embedding_331.tmp_0"
+     slot_value: "embedding_332.tmp_0"
+     slot_value: "embedding_333.tmp_0"
+     slot_value: "embedding_334.tmp_0"
+     slot_value: "embedding_335.tmp_0"
+     slot_value: "embedding_336.tmp_0"
+     slot_value: "embedding_337.tmp_0"
+     slot_value: "embedding_338.tmp_0"
+     slot_value: "embedding_339.tmp_0"
+     slot_value: "embedding_340.tmp_0"
+     slot_value: "embedding_341.tmp_0"
+     slot_value: "embedding_342.tmp_0"
+     slot_value: "embedding_343.tmp_0"
+     slot_value: "embedding_344.tmp_0"
+     slot_value: "embedding_345.tmp_0"
+     slot_value: "embedding_346.tmp_0"
+     slot_value: "embedding_347.tmp_0"
+     slot_value: "embedding_348.tmp_0"
+     slot_value: "embedding_349.tmp_0"
+     slot_value: "embedding_350.tmp_0"
+     slot_value: "embedding_351.tmp_0"
+     slot_value: "embedding_352.tmp_0"
+     slot_value: "embedding_353.tmp_0"
+     slot_value: "embedding_354.tmp_0"
+     slot_value: "embedding_355.tmp_0"
+     slot_value: "embedding_356.tmp_0"
+     slot_value: "embedding_357.tmp_0"
+     slot_value: "embedding_358.tmp_0"
+     slot_value: "embedding_359.tmp_0"
+     slot_value: "embedding_360.tmp_0"
+     slot_value: "embedding_361.tmp_0"
+     slot_value: "embedding_362.tmp_0"
+     slot_value: "embedding_363.tmp_0"
+     slot_value: "embedding_364.tmp_0"
+     slot_value: "embedding_365.tmp_0"
+     slot_value: "embedding_366.tmp_0"
+     slot_value: "embedding_367.tmp_0"
+     slot_value: "embedding_368.tmp_0"
+     slot_value: "embedding_369.tmp_0"
+     slot_value: "embedding_370.tmp_0"
+     slot_value: "embedding_371.tmp_0"
+     slot_value: "embedding_372.tmp_0"
+     slot_value: "embedding_373.tmp_0"
+     slot_value: "embedding_374.tmp_0"
+     slot_value: "embedding_375.tmp_0"
+     slot_value: "embedding_376.tmp_0"
+     slot_value: "embedding_377.tmp_0"
+     slot_value: "embedding_378.tmp_0"
+     slot_value: "embedding_379.tmp_0"
+     slot_value: "embedding_380.tmp_0"
+     slot_value: "embedding_381.tmp_0"
+     slot_value: "embedding_382.tmp_0"
+     slot_value: "embedding_383.tmp_0"
+     slot_value: "embedding_384.tmp_0"
+     slot_value: "embedding_385.tmp_0"
+     slot_value: "embedding_386.tmp_0"
+     slot_value: "embedding_387.tmp_0"
+     slot_value: "embedding_388.tmp_0"
+     slot_value: "embedding_389.tmp_0"
+     slot_value: "embedding_390.tmp_0"
+     slot_value: "embedding_391.tmp_0"
+     slot_value: "embedding_392.tmp_0"
+     slot_value: "embedding_393.tmp_0"
+     slot_value: "embedding_394.tmp_0"
+     slot_value: "embedding_395.tmp_0"
+     slot_value: "embedding_396.tmp_0"
+     slot_value: "embedding_397.tmp_0"
+     slot_value: "embedding_398.tmp_0"
+     slot_value: "embedding_399.tmp_0"
+     slot_value: "embedding_400.tmp_0"
+     slot_value: "embedding_401.tmp_0"
+     slot_value: "embedding_402.tmp_0"
+     slot_value: "embedding_403.tmp_0"
+     slot_value: "embedding_404.tmp_0"
+     slot_value: "embedding_405.tmp_0"
+     slot_value: "embedding_406.tmp_0"
+     slot_gradient: "embedding_0.tmp_0@GRAD"
+     slot_gradient: "embedding_1.tmp_0@GRAD"
+     slot_gradient: "embedding_2.tmp_0@GRAD"
+     slot_gradient: "embedding_3.tmp_0@GRAD"
+     slot_gradient: "embedding_4.tmp_0@GRAD"
+     slot_gradient: "embedding_5.tmp_0@GRAD"
+     slot_gradient: "embedding_6.tmp_0@GRAD"
+     slot_gradient: "embedding_7.tmp_0@GRAD"
+     slot_gradient: "embedding_8.tmp_0@GRAD"
+     slot_gradient: "embedding_9.tmp_0@GRAD"
+     slot_gradient: "embedding_10.tmp_0@GRAD"
+     slot_gradient: "embedding_11.tmp_0@GRAD"
+     slot_gradient: "embedding_12.tmp_0@GRAD"
+     slot_gradient: "embedding_13.tmp_0@GRAD"
+     slot_gradient: "embedding_14.tmp_0@GRAD"
+     slot_gradient: "embedding_15.tmp_0@GRAD"
+     slot_gradient: "embedding_16.tmp_0@GRAD"
+     slot_gradient: "embedding_17.tmp_0@GRAD"
+     slot_gradient: "embedding_18.tmp_0@GRAD"
+     slot_gradient: "embedding_19.tmp_0@GRAD"
+     slot_gradient: "embedding_20.tmp_0@GRAD"
+     slot_gradient: "embedding_21.tmp_0@GRAD"
+     slot_gradient: "embedding_22.tmp_0@GRAD"
+     slot_gradient: "embedding_23.tmp_0@GRAD"
+     slot_gradient: "embedding_24.tmp_0@GRAD"
+     slot_gradient: "embedding_25.tmp_0@GRAD"
+     slot_gradient: "embedding_26.tmp_0@GRAD"
+     slot_gradient: "embedding_27.tmp_0@GRAD"
+     slot_gradient: "embedding_28.tmp_0@GRAD"
+     slot_gradient: "embedding_29.tmp_0@GRAD"
+     slot_gradient: "embedding_30.tmp_0@GRAD"
+     slot_gradient: "embedding_31.tmp_0@GRAD"
+     slot_gradient: "embedding_32.tmp_0@GRAD"
+     slot_gradient: "embedding_33.tmp_0@GRAD"
+     slot_gradient: "embedding_34.tmp_0@GRAD"
+     slot_gradient: "embedding_35.tmp_0@GRAD"
+     slot_gradient: "embedding_36.tmp_0@GRAD"
+     slot_gradient: "embedding_37.tmp_0@GRAD"
+     slot_gradient: "embedding_38.tmp_0@GRAD"
+     slot_gradient: "embedding_39.tmp_0@GRAD"
+     slot_gradient: "embedding_40.tmp_0@GRAD"
+     slot_gradient: "embedding_41.tmp_0@GRAD"
+     slot_gradient: "embedding_42.tmp_0@GRAD"
+     slot_gradient: "embedding_43.tmp_0@GRAD"
+     slot_gradient: "embedding_44.tmp_0@GRAD"
+     slot_gradient: "embedding_45.tmp_0@GRAD"
+     slot_gradient: "embedding_46.tmp_0@GRAD"
+     slot_gradient: "embedding_47.tmp_0@GRAD"
+     slot_gradient: "embedding_48.tmp_0@GRAD"
+     slot_gradient: "embedding_49.tmp_0@GRAD"
+     slot_gradient: "embedding_50.tmp_0@GRAD"
+     slot_gradient: "embedding_51.tmp_0@GRAD"
+     slot_gradient: "embedding_52.tmp_0@GRAD"
+     slot_gradient: "embedding_53.tmp_0@GRAD"
+     slot_gradient: "embedding_54.tmp_0@GRAD"
+     slot_gradient: "embedding_55.tmp_0@GRAD"
+     slot_gradient: "embedding_56.tmp_0@GRAD"
+     slot_gradient: "embedding_57.tmp_0@GRAD"
+     slot_gradient: "embedding_58.tmp_0@GRAD"
+     slot_gradient: "embedding_59.tmp_0@GRAD"
+     slot_gradient: "embedding_60.tmp_0@GRAD"
+     slot_gradient: "embedding_61.tmp_0@GRAD"
+     slot_gradient: "embedding_62.tmp_0@GRAD"
+     slot_gradient: "embedding_63.tmp_0@GRAD"
+     slot_gradient: "embedding_64.tmp_0@GRAD"
+     slot_gradient: "embedding_65.tmp_0@GRAD"
+     slot_gradient: "embedding_66.tmp_0@GRAD"
+     slot_gradient: "embedding_67.tmp_0@GRAD"
+     slot_gradient: "embedding_68.tmp_0@GRAD"
+     slot_gradient: "embedding_69.tmp_0@GRAD"
+     slot_gradient: "embedding_70.tmp_0@GRAD"
+     slot_gradient: "embedding_71.tmp_0@GRAD"
+     slot_gradient: "embedding_72.tmp_0@GRAD"
+     slot_gradient: "embedding_73.tmp_0@GRAD"
+     slot_gradient: "embedding_74.tmp_0@GRAD"
+     slot_gradient: "embedding_75.tmp_0@GRAD"
+     slot_gradient: "embedding_76.tmp_0@GRAD"
+     slot_gradient: "embedding_77.tmp_0@GRAD"
+     slot_gradient: "embedding_78.tmp_0@GRAD"
+     slot_gradient: "embedding_79.tmp_0@GRAD"
+     slot_gradient: "embedding_80.tmp_0@GRAD"
+     slot_gradient: "embedding_81.tmp_0@GRAD"
+     slot_gradient: "embedding_82.tmp_0@GRAD"
+     slot_gradient: "embedding_83.tmp_0@GRAD"
+     slot_gradient: "embedding_84.tmp_0@GRAD"
+     slot_gradient: "embedding_85.tmp_0@GRAD"
+     slot_gradient: "embedding_86.tmp_0@GRAD"
+     slot_gradient: "embedding_87.tmp_0@GRAD"
+     slot_gradient: "embedding_88.tmp_0@GRAD"
+     slot_gradient: "embedding_89.tmp_0@GRAD"
+     slot_gradient: "embedding_90.tmp_0@GRAD"
+     slot_gradient: "embedding_91.tmp_0@GRAD"
+     slot_gradient: "embedding_92.tmp_0@GRAD"
+     slot_gradient: "embedding_93.tmp_0@GRAD"
+     slot_gradient: "embedding_94.tmp_0@GRAD"
+     slot_gradient: "embedding_95.tmp_0@GRAD"
+     slot_gradient: "embedding_96.tmp_0@GRAD"
+     slot_gradient: "embedding_97.tmp_0@GRAD"
+     slot_gradient: "embedding_98.tmp_0@GRAD"
+     slot_gradient: "embedding_99.tmp_0@GRAD"
+     slot_gradient: "embedding_100.tmp_0@GRAD"
+     slot_gradient: "embedding_101.tmp_0@GRAD"
+     slot_gradient: "embedding_102.tmp_0@GRAD"
+     slot_gradient: "embedding_103.tmp_0@GRAD"
+     slot_gradient: "embedding_104.tmp_0@GRAD"
+     slot_gradient: "embedding_105.tmp_0@GRAD"
+     slot_gradient: "embedding_106.tmp_0@GRAD"
+     slot_gradient: "embedding_107.tmp_0@GRAD"
+     slot_gradient: "embedding_108.tmp_0@GRAD"
+     slot_gradient: "embedding_109.tmp_0@GRAD"
+     slot_gradient: "embedding_110.tmp_0@GRAD"
+     slot_gradient: "embedding_111.tmp_0@GRAD"
+     slot_gradient: "embedding_112.tmp_0@GRAD"
+     slot_gradient: "embedding_113.tmp_0@GRAD"
+     slot_gradient: "embedding_114.tmp_0@GRAD"
+     slot_gradient: "embedding_115.tmp_0@GRAD"
+     slot_gradient: "embedding_116.tmp_0@GRAD"
+     slot_gradient: "embedding_117.tmp_0@GRAD"
+     slot_gradient: "embedding_118.tmp_0@GRAD"
+     slot_gradient: "embedding_119.tmp_0@GRAD"
+     slot_gradient: "embedding_120.tmp_0@GRAD"
+     slot_gradient: "embedding_121.tmp_0@GRAD"
+     slot_gradient: "embedding_122.tmp_0@GRAD"
+     slot_gradient: "embedding_123.tmp_0@GRAD"
+     slot_gradient: "embedding_124.tmp_0@GRAD"
+     slot_gradient: "embedding_125.tmp_0@GRAD"
+     slot_gradient: "embedding_126.tmp_0@GRAD"
+     slot_gradient: "embedding_127.tmp_0@GRAD"
+     slot_gradient: "embedding_128.tmp_0@GRAD"
+     slot_gradient: "embedding_129.tmp_0@GRAD"
+     slot_gradient: "embedding_130.tmp_0@GRAD"
+     slot_gradient: "embedding_131.tmp_0@GRAD"
+     slot_gradient: "embedding_132.tmp_0@GRAD"
+     slot_gradient: "embedding_133.tmp_0@GRAD"
+     slot_gradient: "embedding_134.tmp_0@GRAD"
+     slot_gradient: "embedding_135.tmp_0@GRAD"
+     slot_gradient: "embedding_136.tmp_0@GRAD"
+     slot_gradient: "embedding_137.tmp_0@GRAD"
+     slot_gradient: "embedding_138.tmp_0@GRAD"
+     slot_gradient: "embedding_139.tmp_0@GRAD"
+     slot_gradient: "embedding_140.tmp_0@GRAD"
+     slot_gradient: "embedding_141.tmp_0@GRAD"
+     slot_gradient: "embedding_142.tmp_0@GRAD"
+     slot_gradient: "embedding_143.tmp_0@GRAD"
+     slot_gradient: "embedding_144.tmp_0@GRAD"
+     slot_gradient: "embedding_145.tmp_0@GRAD"
+     slot_gradient: "embedding_146.tmp_0@GRAD"
+     slot_gradient: "embedding_147.tmp_0@GRAD"
+     slot_gradient: "embedding_148.tmp_0@GRAD"
+     slot_gradient: "embedding_149.tmp_0@GRAD"
+     slot_gradient: "embedding_150.tmp_0@GRAD"
+     slot_gradient: "embedding_151.tmp_0@GRAD"
+     slot_gradient: "embedding_152.tmp_0@GRAD"
+     slot_gradient: "embedding_153.tmp_0@GRAD"
+     slot_gradient: "embedding_154.tmp_0@GRAD"
+     slot_gradient: "embedding_155.tmp_0@GRAD"
+     slot_gradient: "embedding_156.tmp_0@GRAD"
+     slot_gradient: "embedding_157.tmp_0@GRAD"
+     slot_gradient: "embedding_158.tmp_0@GRAD"
+     slot_gradient: "embedding_159.tmp_0@GRAD"
+     slot_gradient: "embedding_160.tmp_0@GRAD"
+     slot_gradient: "embedding_161.tmp_0@GRAD"
+     slot_gradient: "embedding_162.tmp_0@GRAD"
+     slot_gradient: "embedding_163.tmp_0@GRAD"
+     slot_gradient: "embedding_164.tmp_0@GRAD"
+     slot_gradient: "embedding_165.tmp_0@GRAD"
+     slot_gradient: "embedding_166.tmp_0@GRAD"
+     slot_gradient: "embedding_167.tmp_0@GRAD"
+     slot_gradient: "embedding_168.tmp_0@GRAD"
+     slot_gradient: "embedding_169.tmp_0@GRAD"
+     slot_gradient: "embedding_170.tmp_0@GRAD"
+     slot_gradient: "embedding_171.tmp_0@GRAD"
+     slot_gradient: "embedding_172.tmp_0@GRAD"
+     slot_gradient: "embedding_173.tmp_0@GRAD"
+     slot_gradient: "embedding_174.tmp_0@GRAD"
+     slot_gradient: "embedding_175.tmp_0@GRAD"
+     slot_gradient: "embedding_176.tmp_0@GRAD"
+     slot_gradient: "embedding_177.tmp_0@GRAD"
+     slot_gradient: "embedding_178.tmp_0@GRAD"
+     slot_gradient: "embedding_179.tmp_0@GRAD"
+     slot_gradient: "embedding_180.tmp_0@GRAD"
+     slot_gradient: "embedding_181.tmp_0@GRAD"
+     slot_gradient: "embedding_182.tmp_0@GRAD"
+     slot_gradient: "embedding_183.tmp_0@GRAD"
+     slot_gradient: "embedding_184.tmp_0@GRAD"
+     slot_gradient: "embedding_185.tmp_0@GRAD"
+     slot_gradient: "embedding_186.tmp_0@GRAD"
+     slot_gradient: "embedding_187.tmp_0@GRAD"
+     slot_gradient: "embedding_188.tmp_0@GRAD"
+     slot_gradient: "embedding_189.tmp_0@GRAD"
+     slot_gradient: "embedding_190.tmp_0@GRAD"
+     slot_gradient: "embedding_191.tmp_0@GRAD"
+     slot_gradient: "embedding_192.tmp_0@GRAD"
+     slot_gradient: "embedding_193.tmp_0@GRAD"
+     slot_gradient: "embedding_194.tmp_0@GRAD"
+     slot_gradient: "embedding_195.tmp_0@GRAD"
+     slot_gradient: "embedding_196.tmp_0@GRAD"
+     slot_gradient: "embedding_197.tmp_0@GRAD"
+     slot_gradient: "embedding_198.tmp_0@GRAD"
+     slot_gradient: "embedding_199.tmp_0@GRAD"
+     slot_gradient: "embedding_200.tmp_0@GRAD"
+     slot_gradient: "embedding_201.tmp_0@GRAD"
+     slot_gradient: "embedding_202.tmp_0@GRAD"
+     slot_gradient: "embedding_203.tmp_0@GRAD"
+     slot_gradient: "embedding_204.tmp_0@GRAD"
+     slot_gradient: "embedding_205.tmp_0@GRAD"
+     slot_gradient: "embedding_206.tmp_0@GRAD"
+     slot_gradient: "embedding_207.tmp_0@GRAD"
+     slot_gradient: "embedding_208.tmp_0@GRAD"
+     slot_gradient: "embedding_209.tmp_0@GRAD"
+     slot_gradient: "embedding_210.tmp_0@GRAD"
+     slot_gradient: "embedding_211.tmp_0@GRAD"
+     slot_gradient: "embedding_212.tmp_0@GRAD"
+     slot_gradient: "embedding_213.tmp_0@GRAD"
+     slot_gradient: "embedding_214.tmp_0@GRAD"
+     slot_gradient: "embedding_215.tmp_0@GRAD"
+     slot_gradient: "embedding_216.tmp_0@GRAD"
+     slot_gradient: "embedding_217.tmp_0@GRAD"
+     slot_gradient: "embedding_218.tmp_0@GRAD"
+     slot_gradient: "embedding_219.tmp_0@GRAD"
+     slot_gradient: "embedding_220.tmp_0@GRAD"
+     slot_gradient: "embedding_221.tmp_0@GRAD"
+     slot_gradient: "embedding_222.tmp_0@GRAD"
+     slot_gradient: "embedding_223.tmp_0@GRAD"
+     slot_gradient: "embedding_224.tmp_0@GRAD"
+     slot_gradient: "embedding_225.tmp_0@GRAD"
+     slot_gradient: "embedding_226.tmp_0@GRAD"
+     slot_gradient: "embedding_227.tmp_0@GRAD"
+     slot_gradient: "embedding_228.tmp_0@GRAD"
+     slot_gradient: "embedding_229.tmp_0@GRAD"
+     slot_gradient: "embedding_230.tmp_0@GRAD"
+     slot_gradient: "embedding_231.tmp_0@GRAD"
+     slot_gradient: "embedding_232.tmp_0@GRAD"
+     slot_gradient: "embedding_233.tmp_0@GRAD"
+     slot_gradient: "embedding_234.tmp_0@GRAD"
+     slot_gradient: "embedding_235.tmp_0@GRAD"
+     slot_gradient: "embedding_236.tmp_0@GRAD"
+     slot_gradient: "embedding_237.tmp_0@GRAD"
+     slot_gradient: "embedding_238.tmp_0@GRAD"
+     slot_gradient: "embedding_239.tmp_0@GRAD"
+     slot_gradient: "embedding_240.tmp_0@GRAD"
+     slot_gradient: "embedding_241.tmp_0@GRAD"
+     slot_gradient: "embedding_242.tmp_0@GRAD"
+     slot_gradient: "embedding_243.tmp_0@GRAD"
+     slot_gradient: "embedding_244.tmp_0@GRAD"
+     slot_gradient: "embedding_245.tmp_0@GRAD"
+     slot_gradient: "embedding_246.tmp_0@GRAD"
+     slot_gradient: "embedding_247.tmp_0@GRAD"
+     slot_gradient: "embedding_248.tmp_0@GRAD"
+     slot_gradient: "embedding_249.tmp_0@GRAD"
+     slot_gradient: "embedding_250.tmp_0@GRAD"
+     slot_gradient: "embedding_251.tmp_0@GRAD"
+     slot_gradient: "embedding_252.tmp_0@GRAD"
+     slot_gradient: "embedding_253.tmp_0@GRAD"
+     slot_gradient: "embedding_254.tmp_0@GRAD"
+     slot_gradient: "embedding_255.tmp_0@GRAD"
+     slot_gradient: "embedding_256.tmp_0@GRAD"
+     slot_gradient: "embedding_257.tmp_0@GRAD"
+     slot_gradient: "embedding_258.tmp_0@GRAD"
+     slot_gradient: "embedding_259.tmp_0@GRAD"
+     slot_gradient: "embedding_260.tmp_0@GRAD"
+     slot_gradient: "embedding_261.tmp_0@GRAD"
+     slot_gradient: "embedding_262.tmp_0@GRAD"
+     slot_gradient: "embedding_263.tmp_0@GRAD"
+     slot_gradient: "embedding_264.tmp_0@GRAD"
+     slot_gradient: "embedding_265.tmp_0@GRAD"
+     slot_gradient: "embedding_266.tmp_0@GRAD"
+     slot_gradient: "embedding_267.tmp_0@GRAD"
+     slot_gradient: "embedding_268.tmp_0@GRAD"
+     slot_gradient: "embedding_269.tmp_0@GRAD"
+     slot_gradient: "embedding_270.tmp_0@GRAD"
+     slot_gradient: "embedding_271.tmp_0@GRAD"
+     slot_gradient: "embedding_272.tmp_0@GRAD"
+     slot_gradient: "embedding_273.tmp_0@GRAD"
+     slot_gradient: "embedding_274.tmp_0@GRAD"
+     slot_gradient: "embedding_275.tmp_0@GRAD"
+     slot_gradient: "embedding_276.tmp_0@GRAD"
+     slot_gradient: "embedding_277.tmp_0@GRAD"
+     slot_gradient: "embedding_278.tmp_0@GRAD"
+     slot_gradient: "embedding_279.tmp_0@GRAD"
+     slot_gradient: "embedding_280.tmp_0@GRAD"
+     slot_gradient: "embedding_281.tmp_0@GRAD"
+     slot_gradient: "embedding_282.tmp_0@GRAD"
+     slot_gradient: "embedding_283.tmp_0@GRAD"
+     slot_gradient: "embedding_284.tmp_0@GRAD"
+     slot_gradient: "embedding_285.tmp_0@GRAD"
+     slot_gradient: "embedding_286.tmp_0@GRAD"
+     slot_gradient: "embedding_287.tmp_0@GRAD"
+     slot_gradient: "embedding_288.tmp_0@GRAD"
+     slot_gradient: "embedding_289.tmp_0@GRAD"
+     slot_gradient: "embedding_290.tmp_0@GRAD"
+     slot_gradient: "embedding_291.tmp_0@GRAD"
+     slot_gradient: "embedding_292.tmp_0@GRAD"
+     slot_gradient: "embedding_293.tmp_0@GRAD"
+     slot_gradient: "embedding_294.tmp_0@GRAD"
+     slot_gradient: "embedding_295.tmp_0@GRAD"
+     slot_gradient: "embedding_296.tmp_0@GRAD"
+     slot_gradient: "embedding_297.tmp_0@GRAD"
+     slot_gradient: "embedding_298.tmp_0@GRAD"
+     slot_gradient: "embedding_299.tmp_0@GRAD"
+     slot_gradient: "embedding_300.tmp_0@GRAD"
+     slot_gradient: "embedding_301.tmp_0@GRAD"
+     slot_gradient: "embedding_302.tmp_0@GRAD"
+     slot_gradient: "embedding_303.tmp_0@GRAD"
+     slot_gradient: "embedding_304.tmp_0@GRAD"
+     slot_gradient: "embedding_305.tmp_0@GRAD"
+     slot_gradient: "embedding_306.tmp_0@GRAD"
+     slot_gradient: "embedding_307.tmp_0@GRAD"
+     slot_gradient: "embedding_308.tmp_0@GRAD"
+     slot_gradient: "embedding_309.tmp_0@GRAD"
+     slot_gradient: "embedding_310.tmp_0@GRAD"
+     slot_gradient: "embedding_311.tmp_0@GRAD"
+     slot_gradient: "embedding_312.tmp_0@GRAD"
+     slot_gradient: "embedding_313.tmp_0@GRAD"
+     slot_gradient: "embedding_314.tmp_0@GRAD"
+     slot_gradient: "embedding_315.tmp_0@GRAD"
+     slot_gradient: "embedding_316.tmp_0@GRAD"
+     slot_gradient: "embedding_317.tmp_0@GRAD"
+     slot_gradient: "embedding_318.tmp_0@GRAD"
+     slot_gradient: "embedding_319.tmp_0@GRAD"
+     slot_gradient: "embedding_320.tmp_0@GRAD"
+     slot_gradient: "embedding_321.tmp_0@GRAD"
+     slot_gradient: "embedding_322.tmp_0@GRAD"
+     slot_gradient: "embedding_323.tmp_0@GRAD"
+     slot_gradient: "embedding_324.tmp_0@GRAD"
+     slot_gradient: "embedding_325.tmp_0@GRAD"
+     slot_gradient: "embedding_326.tmp_0@GRAD"
+     slot_gradient: "embedding_327.tmp_0@GRAD"
+     slot_gradient: "embedding_328.tmp_0@GRAD"
+     slot_gradient: "embedding_329.tmp_0@GRAD"
+     slot_gradient: "embedding_330.tmp_0@GRAD"
+     slot_gradient: "embedding_331.tmp_0@GRAD"
+     slot_gradient: "embedding_332.tmp_0@GRAD"
+     slot_gradient: "embedding_333.tmp_0@GRAD"
+     slot_gradient: "embedding_334.tmp_0@GRAD"
+     slot_gradient: "embedding_335.tmp_0@GRAD"
+     slot_gradient: "embedding_336.tmp_0@GRAD"
+     slot_gradient: "embedding_337.tmp_0@GRAD"
+     slot_gradient: "embedding_338.tmp_0@GRAD"
+     slot_gradient: "embedding_339.tmp_0@GRAD"
+     slot_gradient: "embedding_340.tmp_0@GRAD"
+     slot_gradient: "embedding_341.tmp_0@GRAD"
+     slot_gradient: "embedding_342.tmp_0@GRAD"
+     slot_gradient: "embedding_343.tmp_0@GRAD"
+     slot_gradient: "embedding_344.tmp_0@GRAD"
+     slot_gradient: "embedding_345.tmp_0@GRAD"
+     slot_gradient: "embedding_346.tmp_0@GRAD"
+     slot_gradient: "embedding_347.tmp_0@GRAD"
+     slot_gradient: "embedding_348.tmp_0@GRAD"
+     slot_gradient: "embedding_349.tmp_0@GRAD"
+     slot_gradient: "embedding_350.tmp_0@GRAD"
+     slot_gradient: "embedding_351.tmp_0@GRAD"
+     slot_gradient: "embedding_352.tmp_0@GRAD"
+     slot_gradient: "embedding_353.tmp_0@GRAD"
+     slot_gradient: "embedding_354.tmp_0@GRAD"
+     slot_gradient: "embedding_355.tmp_0@GRAD"
+     slot_gradient: "embedding_356.tmp_0@GRAD"
+     slot_gradient: "embedding_357.tmp_0@GRAD"
+     slot_gradient: "embedding_358.tmp_0@GRAD"
+     slot_gradient: "embedding_359.tmp_0@GRAD"
+     slot_gradient: "embedding_360.tmp_0@GRAD"
+     slot_gradient: "embedding_361.tmp_0@GRAD"
+     slot_gradient: "embedding_362.tmp_0@GRAD"
+     slot_gradient: "embedding_363.tmp_0@GRAD"
+     slot_gradient: "embedding_364.tmp_0@GRAD"
+     slot_gradient: "embedding_365.tmp_0@GRAD"
+     slot_gradient: "embedding_366.tmp_0@GRAD"
+     slot_gradient: "embedding_367.tmp_0@GRAD"
+     slot_gradient: "embedding_368.tmp_0@GRAD"
+     slot_gradient: "embedding_369.tmp_0@GRAD"
+     slot_gradient: "embedding_370.tmp_0@GRAD"
+     slot_gradient: "embedding_371.tmp_0@GRAD"
+     slot_gradient: "embedding_372.tmp_0@GRAD"
+     slot_gradient: "embedding_373.tmp_0@GRAD"
+     slot_gradient: "embedding_374.tmp_0@GRAD"
+     slot_gradient: "embedding_375.tmp_0@GRAD"
+     slot_gradient: "embedding_376.tmp_0@GRAD"
+     slot_gradient: "embedding_377.tmp_0@GRAD"
+     slot_gradient: "embedding_378.tmp_0@GRAD"
+     slot_gradient: "embedding_379.tmp_0@GRAD"
+     slot_gradient: "embedding_380.tmp_0@GRAD"
+     slot_gradient: "embedding_381.tmp_0@GRAD"
+     slot_gradient: "embedding_382.tmp_0@GRAD"
+     slot_gradient: "embedding_383.tmp_0@GRAD"
+     slot_gradient: "embedding_384.tmp_0@GRAD"
+     slot_gradient: "embedding_385.tmp_0@GRAD"
+     slot_gradient: "embedding_386.tmp_0@GRAD"
+     slot_gradient: "embedding_387.tmp_0@GRAD"
+     slot_gradient: "embedding_388.tmp_0@GRAD"
+     slot_gradient: "embedding_389.tmp_0@GRAD"
+     slot_gradient: "embedding_390.tmp_0@GRAD"
+     slot_gradient: "embedding_391.tmp_0@GRAD"
+     slot_gradient: "embedding_392.tmp_0@GRAD"
+     slot_gradient: "embedding_393.tmp_0@GRAD"
+     slot_gradient: "embedding_394.tmp_0@GRAD"
+     slot_gradient: "embedding_395.tmp_0@GRAD"
+     slot_gradient: "embedding_396.tmp_0@GRAD"
+     slot_gradient: "embedding_397.tmp_0@GRAD"
+     slot_gradient: "embedding_398.tmp_0@GRAD"
+     slot_gradient: "embedding_399.tmp_0@GRAD"
+     slot_gradient: "embedding_400.tmp_0@GRAD"
+     slot_gradient: "embedding_401.tmp_0@GRAD"
+     slot_gradient: "embedding_402.tmp_0@GRAD"
+     slot_gradient: "embedding_403.tmp_0@GRAD"
+     slot_gradient: "embedding_404.tmp_0@GRAD"
+     slot_gradient: "embedding_405.tmp_0@GRAD"
+     slot_gradient: "embedding_406.tmp_0@GRAD"
+   }
+  skip_op: "lookup_table"
+  skip_op: "lookup_table_grad"
+}
+fs_client_param {
+  uri: "afs://xingtian.afs.baidu.com:9902"
+  user: "mlarch_pro"
+  passwd: "proisvip"
+  hadoop_bin: "$HADOOP_HOME/bin/hadoop"
+}
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_compressor_mf.py b/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_compressor_mf.py
new file mode 100755
index 0000000000000000000000000000000000000000..b306ddfeb183515c7652b2f0d08cbe98f95033b4
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_compressor_mf.py
@@ -0,0 +1,162 @@
+#!/usr/bin/python
+"""
+xbox model compressor
+"""
+
+import sys
+import math
+import time
+import re
+
+#WISE
+#SHOW_COMPRESS_RATIO : 8192
+#CLICK_COMPRESS_RATIO : 8192
+#LR_COMPRESS_RATIO : 1048576
+#MIO_COMPRESS_RATIO:8192
+
+#PC
+#MIO_COMPRESS_RATIO : 1024
+#SHOW_COMPRESS_RATIO : 128
+#CLICK_COMPRESS_RATIO : 1024
+#LR_COMPRESS_RATIO : 8192
+
+#STAMP_COL = 2
+SHOW_COL = 3
+CLICK_COL = 4
+LR_W_COL = 5
+LR_G2SUM_COL = 6
+FM_COL = 9
+
+#DAY_SPAN = 300
+
+#show clk lr = float
+SHOW_RATIO = 1
+#SHOW_RATIO = 1024
+CLK_RATIO = 8
+#CLK_RATIO = 1024
+LR_RATIO = 1024
+MF_RATIO = 1024
+
+base_update_threshold=0.965
+base_xbox_clk_cof=1
+base_xbox_nonclk_cof=0.2
+
+def as_num(x):
+    y='{:.5f}'.format(x)
+    return(y)
+
+def compress_show(xx):
+    """
+    compress show
+    """
+    preci = SHOW_RATIO
+
+    x = float(xx)
+    return str(int(math.floor(x * preci + 0.5)))
+
+
+def compress_clk(xx):
+    """
+    compress clk
+    """
+    preci = CLK_RATIO
+
+    x = float(xx)
+    clk = int(math.floor(x * preci + 0.5))
+    if clk == 0:
+        return ""
+    return str(clk)
+
+
+def compress_lr(xx):
+    """
+    compress lr
+    """
+    preci = LR_RATIO
+
+    x = float(xx)
+    lr = int(math.floor(x * preci + 0.5))
+    if lr == 0:
+        return ""
+    return str(lr)
+
+def compress_mf(xx):
+    """
+    compress mf
+    """
+    preci = MF_RATIO
+
+    x = float(xx)
+    return int(math.floor(x * preci + 0.5))
+
+
+def show_clk_score(show, clk):
+    """
+    calculate show_clk score
+    """
+    return (show - clk) * 0.2 + clk
+
+
+for l in sys.stdin:  # one whitespace-separated record per input line
+    cols = re.split(r'\s+', l.strip())
+    key = cols[0].strip()
+
+    #day = int(cols[STAMP_COL].strip())
+    #cur_day = int(time.time()/3600/24)
+    #if (day + DAY_SPAN) <= cur_day:
+    #    continue
+
+    # cvm features: show / click values read from their fixed columns
+    show = cols[SHOW_COL]
+    click = cols[CLICK_COL]
+    pred = ""  # the pred output column is always written empty
+
+    f_show = float(show)
+    f_clk = float(click)
+    """
+    if f_show != 0:
+        show_log = math.log(f_show)
+    else:
+        show_log = 0
+
+    if f_clk != 0:
+        click_log =  math.log(f_clk) - show_log
+    else:
+        click_log = 0
+    """
+    show_log = f_show  # the log transform in the string above is disabled
+    click_log = f_clk
+    #print f_show, f_clk
+    #if show_clk_score(f_show, f_clk) < base_update_threshold:
+    #    continue
+
+    #show = compress_show(show)
+    show = compress_show(show_log)
+    #clk = compress_clk(click)
+    clk = compress_clk(click_log)
+
+    # personal lr weight (LR_G2SUM_COL is not read by this loop)
+    lr_w = cols[LR_W_COL].strip()
+    lr_wei = compress_lr(lr_w)
+
+    # fm weight: quantize every column from FM_COL on; fm_sum is their sum of squares
+    fm_wei = []
+    fm_sum = 0
+    if len(cols) > 7:  # NOTE(review): FM_COL is 9, so rows with 8-9 columns yield no weights
+    #fm_dim = int(cols[FM_COL].strip())
+    #if fm_dim != 0:
+        for v in xrange(FM_COL, len(cols), 1):
+            mf_v = compress_mf(cols[v])
+            #print mf_v
+            fm_wei.append(str(mf_v))
+            fm_sum += (mf_v * mf_v)
+
+    sys.stdout.write("%s\t%s\t%s\t%s" % (key, show, clk, pred))  # tab-separated output record
+    sys.stdout.write("\t")
+    sys.stdout.write("%s" % lr_wei)
+    if len(fm_wei) > 0 and fm_sum > 0:  # all-zero weight vectors fall through to the placeholder
+        sys.stdout.write("\t%s" % "\t".join(fm_wei))
+    else:
+        sys.stdout.write("\t[\t]")  # placeholder fields "[" and "]" mean "no mf weights"
+    sys.stdout.write("\n")
+
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_decompressor_mf.awk b/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_decompressor_mf.awk
new file mode 100755
index 0000000000000000000000000000000000000000..080e84419bc47675cb46a725b4e94480cd3da920
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/scripts/xbox_decompressor_mf.awk
@@ -0,0 +1,52 @@
+#!/bin/awk -f
+BEGIN {  # run once before any input instead of once per record
+    OFS="\t";
+    SHOW_RATIO = 1;   # the ratios mirror the constants in xbox_compressor_mf.py
+    CLK_RATIO = 8;
+    LR_RATIO = 1024;
+    MF_RATIO = 1024;
+}
+
+# Undo the show quantization: divide the stored integer by SHOW_RATIO.
+function decompress_show(x) {
+    return x * 1.0 / SHOW_RATIO;
+}
+
+# Undo the click quantization: divide the stored integer by CLK_RATIO.
+# An empty field (written when the click value rounds to zero) decodes to 0.
+function decompress_clk(x) {
+    if (x == "")
+        return 0;
+    return x * 1.0 / CLK_RATIO;
+}
+
+function decompress_lr(x) {  # undo the lr quantization: stored integer / LR_RATIO
+    return x * 1.0 / LR_RATIO;  # an empty field evaluates to 0 in numeric context
+}
+
+function decompress_mf(x) {  # undo the mf quantization: stored integer / MF_RATIO
+    return x * 1.0 / MF_RATIO;
+}
+
+function show_clk_sore(show, clk, nonclk_coeff, clk_coeff) {  # not called in this script; name ("sore") kept as-is
+    return (show - clk) * nonclk_coeff + clk * clk_coeff;  # weighted sum of non-clicked shows and clicks
+}
+
+# Input fields: key, show, clk, pred, lr_w, then mf weights or the "[", "]" placeholder.
+{
+    l=split($0, a, "\t");
+
+    show = decompress_show(a[2]);
+    click = decompress_clk(a[3]);
+    lr = decompress_lr(a[5]);
+    printf("%s\t0\t0\t%s\t%s\t%s\t0\t", a[1], show, click, lr);
+    if (l == 7 && a[6] == "[" && a[7] == "]") {  # placeholder only; two real mf weights also give l == 7
+        printf("\n");
+    } else {
+        printf("%d", l-5)  # number of mf weights that follow
+        for(i = 6; i <= l; i++) {
+            printf("\t%s", decompress_mf(a[i]));
+        }
+        printf("\n");
+    }
+}
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot b/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot
new file mode 100644
index 0000000000000000000000000000000000000000..dd6723ffb39ee17c44e0119c96d9481bd3ce98ef
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot
@@ -0,0 +1,407 @@
+6048
+6002
+6145
+6202
+6201
+6121
+6738
+6119
+6146
+6120
+6147
+6122
+6123
+6118
+6142
+6143
+6008
+6148
+6151
+6127
+6144
+6094
+6083
+6952
+6739
+6150
+6109
+6003
+6099
+6149
+6129
+6203
+6153
+6152
+6128
+6106
+6251
+7082
+7515
+6951
+6949
+7080
+6066
+7507
+6186
+6007
+7514
+6125
+7506
+10001
+6006
+7023
+6085
+10000
+6098
+6250
+6110
+6124
+6090
+6082
+6067
+6101
+6004
+6191
+7075
+6948
+6157
+6126
+6188
+7077
+6070
+6111
+6087
+6103
+6107
+6194
+6156
+6005
+6247
+6814
+6158
+7122
+6058
+6189
+7058
+6059
+6115
+7079
+7081
+6833
+7024
+6108
+13342
+13345
+13412
+13343
+13350
+13346
+13409
+6009
+6011
+6012
+6013
+6014
+6015
+6019
+6023
+6024
+6027
+6029
+6031
+6050
+6060
+6068
+6069
+6089
+6095
+6105
+6112
+6130
+6131
+6132
+6134
+6161
+6162
+6163
+6166
+6182
+6183
+6185
+6190
+6212
+6213
+6231
+6233
+6234
+6236
+6238
+6239
+6240
+6241
+6242
+6243
+6244
+6245
+6354
+7002
+7005
+7008
+7010
+7013
+7015
+7019
+7020
+7045
+7046
+7048
+7049
+7052
+7054
+7056
+7064
+7066
+7076
+7078
+7083
+7084
+7085
+7086
+7087
+7088
+7089
+7090
+7099
+7100
+7101
+7102
+7103
+7104
+7105
+7109
+7124
+7126
+7136
+7142
+7143
+7144
+7145
+7146
+7147
+7148
+7150
+7151
+7152
+7153
+7154
+7155
+7156
+7157
+7047
+7050
+6257
+6259
+6260
+6261
+7170
+7185
+7186
+6751
+6755
+6757
+6759
+6760
+6763
+6764
+6765
+6766
+6767
+6768
+6769
+6770
+7502
+7503
+7504
+7505
+7510
+7511
+7512
+7513
+6806
+6807
+6808
+6809
+6810
+6811
+6812
+6813
+6815
+6816
+6817
+6819
+6823
+6828
+6831
+6840
+6845
+6875
+6879
+6881
+6888
+6889
+6947
+6950
+6956
+6957
+6959
+10006
+10008
+10009
+10010
+10011
+10016
+10017
+10018
+10019
+10020
+10021
+10022
+10023
+10024
+10029
+10030
+10031
+10032
+10033
+10034
+10035
+10036
+10037
+10038
+10039
+10040
+10041
+10042
+10044
+10045
+10046
+10051
+10052
+10053
+10054
+10055
+10056
+10057
+10060
+10066
+10069
+6820
+6821
+6822
+13333
+13334
+13335
+13336
+13337
+13338
+13339
+13340
+13341
+13351
+13352
+13353
+13359
+13361
+13362
+13363
+13366
+13367
+13368
+13369
+13370
+13371
+13375
+13376
+5700
+5702
+13400
+13401
+13402
+13403
+13404
+13406
+13407
+13408
+13410
+13417
+13418
+13419
+13420
+13422
+13425
+13427
+13428
+13429
+13430
+13431
+13433
+13434
+13436
+13437
+13326
+13330
+13331
+5717
+13442
+13451
+13452
+13455
+13456
+13457
+13458
+13459
+13460
+13461
+13462
+13463
+13464
+13465
+13466
+13467
+13468
+1104
+1106
+1107
+1108
+1109
+1110
+1111
+1112
+1113
+1114
+1115
+1116
+1117
+1119
+1120
+1121
+1122
+1123
+1124
+1125
+1126
+1127
+1128
+1129
+13812
+13813
+6740
+1490
+32915
+32950
+32952
+32953
+32954
+33077
+33085
+33086
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot_common b/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot_common
new file mode 100644
index 0000000000000000000000000000000000000000..869fb695282eed4a69928e7af52dd49a62e0d4c6
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/slot/slot_common
@@ -0,0 +1,99 @@
+6048
+6002
+6145
+6202
+6201
+6121
+6738
+6119
+6146
+6120
+6147
+6122
+6123
+6118
+6142
+6143
+6008
+6148
+6151
+6127
+6144
+6094
+6083
+6952
+6739
+6150
+6109
+6003
+6099
+6149
+6129
+6203
+6153
+6152
+6128
+6106
+6251
+7082
+7515
+6951
+6949
+7080
+6066
+7507
+6186
+6007
+7514
+6125
+7506
+10001
+6006
+7023
+6085
+10000
+6098
+6250
+6110
+6124
+6090
+6082
+6067
+6101
+6004
+6191
+7075
+6948
+6157
+6126
+6188
+7077
+6070
+6111
+6087
+6103
+6107
+6194
+6156
+6005
+6247
+6814
+6158
+7122
+6058
+6189
+7058
+6059
+6115
+7079
+7081
+6833
+7024
+6108
+13342
+13345
+13412
+13343
+13350
+13346
+13409
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot b/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot
new file mode 100644
index 0000000000000000000000000000000000000000..3e91b42e36e3bef406efc31c50a997ea7dc58f86
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot
@@ -0,0 +1,408 @@
+6048
+6002
+6145
+6202
+6201
+6121
+6738
+6119
+6146
+6120
+6147
+6122
+6123
+6118
+6142
+6143
+6008
+6148
+6151
+6127
+6144
+6094
+6083
+6952
+6739
+6150
+6109
+6003
+6099
+6149
+6129
+6203
+6153
+6152
+6128
+6106
+6251
+7082
+7515
+6951
+6949
+7080
+6066
+7507
+6186
+6007
+7514
+6125
+7506
+10001
+6006
+7023
+6085
+10000
+6098
+6250
+6110
+6124
+6090
+6082
+6067
+6101
+6004
+6191
+7075
+6948
+6157
+6126
+6188
+7077
+6070
+6111
+6087
+6103
+6107
+6194
+6156
+6005
+6247
+6814
+6158
+7122
+6058
+6189
+7058
+6059
+6115
+7079
+7081
+6833
+7024
+6108
+13342
+13345
+13412
+13343
+13350
+13346
+13409
+6009
+6011
+6012
+6013
+6014
+6015
+6019
+6023
+6024
+6027
+6029
+6031
+6050
+6060
+6068
+6069
+6089
+6095
+6105
+6112
+6130
+6131
+6132
+6134
+6161
+6162
+6163
+6166
+6182
+6183
+6185
+6190
+6212
+6213
+6231
+6233
+6234
+6236
+6238
+6239
+6240
+6241
+6242
+6243
+6244
+6245
+6354
+7002
+7005
+7008
+7010
+7012
+7013
+7015
+7016
+7017
+7018
+7019
+7020
+7045
+7046
+7048
+7049
+7052
+7054
+7056
+7064
+7066
+7076
+7078
+7083
+7084
+7085
+7086
+7087
+7088
+7089
+7090
+7099
+7100
+7101
+7102
+7103
+7104
+7105
+7109
+7124
+7126
+7136
+7142
+7143
+7144
+7145
+7146
+7147
+7148
+7150
+7151
+7152
+7153
+7154
+7155
+7156
+7157
+7047
+7050
+6253
+6254
+6255
+6256
+6257
+6259
+6260
+6261
+7170
+7185
+7186
+6751
+6755
+6757
+6759
+6760
+6763
+6764
+6765
+6766
+6767
+6768
+6769
+6770
+7502
+7503
+7504
+7505
+7510
+7511
+7512
+7513
+6806
+6807
+6808
+6809
+6810
+6811
+6812
+6813
+6815
+6816
+6817
+6819
+6823
+6828
+6831
+6840
+6845
+6875
+6879
+6881
+6888
+6889
+6947
+6950
+6956
+6957
+6959
+10006
+10008
+10009
+10010
+10011
+10016
+10017
+10018
+10019
+10020
+10021
+10022
+10023
+10024
+10029
+10030
+10031
+10032
+10033
+10034
+10035
+10036
+10037
+10038
+10039
+10040
+10041
+10042
+10044
+10045
+10046
+10051
+10052
+10053
+10054
+10055
+10056
+10057
+10060
+10066
+10069
+6820
+6821
+6822
+13333
+13334
+13335
+13336
+13337
+13338
+13339
+13340
+13341
+13351
+13352
+13353
+13359
+13361
+13362
+13363
+13366
+13367
+13368
+13369
+13370
+13371
+13375
+13376
+5700
+5702
+13400
+13401
+13402
+13403
+13404
+13406
+13407
+13408
+13410
+13417
+13418
+13419
+13420
+13422
+13425
+13427
+13428
+13429
+13430
+13431
+13433
+13434
+13436
+13437
+13326
+13330
+13331
+5717
+13442
+13451
+13452
+13455
+13456
+13457
+13458
+13459
+13460
+13461
+13462
+13463
+13464
+13465
+13466
+13467
+13468
+1104
+1106
+1107
+1108
+1109
+1110
+1111
+1112
+1113
+1114
+1115
+1116
+1117
+1119
+1120
+1121
+1122
+1123
+1124
+1125
+1126
+1127
+1128
+1129
+13812
+13813
+6740
+1490
+1491
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot_common b/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot_common
new file mode 100644
index 0000000000000000000000000000000000000000..869fb695282eed4a69928e7af52dd49a62e0d4c6
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/slot_common
@@ -0,0 +1,99 @@
+6048
+6002
+6145
+6202
+6201
+6121
+6738
+6119
+6146
+6120
+6147
+6122
+6123
+6118
+6142
+6143
+6008
+6148
+6151
+6127
+6144
+6094
+6083
+6952
+6739
+6150
+6109
+6003
+6099
+6149
+6129
+6203
+6153
+6152
+6128
+6106
+6251
+7082
+7515
+6951
+6949
+7080
+6066
+7507
+6186
+6007
+7514
+6125
+7506
+10001
+6006
+7023
+6085
+10000
+6098
+6250
+6110
+6124
+6090
+6082
+6067
+6101
+6004
+6191
+7075
+6948
+6157
+6126
+6188
+7077
+6070
+6111
+6087
+6103
+6107
+6194
+6156
+6005
+6247
+6814
+6158
+7122
+6058
+6189
+7058
+6059
+6115
+7079
+7081
+6833
+7024
+6108
+13342
+13345
+13412
+13343
+13350
+13346
+13409
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/to.py b/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/to.py
new file mode 100644
index 0000000000000000000000000000000000000000..638c53647dc2adc1d502ed53630f07dbcfe8ffce
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/tmp/slot/to.py
@@ -0,0 +1,5 @@
+with open("session_slot", "r") as fin:
+    res = []
+    for i in fin:
+        res.append("\"" + i.strip() + "\"")
+    print ", ".join(res)
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online.py b/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f29b42cce434085b0d4e3a969d7d6657e19d109
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online.py
@@ -0,0 +1,593 @@
+import numpy as np
+import os
+import sys
+import paddle
+import paddle.fluid as fluid
+import threading
+import time
+import config
+from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from model_new import Model
+from model_new_jc import ModelJoinCommon
+import util
+from util import *
+
+fleet_util = FleetUtil()
+
+def time_prefix_str():
+    return "\n" + time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()) + "[0]:"
+
+auc_record = {}
+def check_auc_ok(auc_label, auc_log, auc_alarm):
+    auc_datas = auc_log.split(' AUC=')
+    if len(auc_datas) < 2:
+        return True
+    if auc_label not in auc_record:
+        auc_record[auc_label] = 0.0
+    auc = float(auc_datas[1].split(' ')[0])
+    if auc < auc_record[auc_label] and auc < auc_alarm:
+        fleet_util.rank0_print("label:%s, auc:%s, check bad" % (auc_label, auc)) 
+        return False
+    auc_record[auc_label] = auc
+    fleet_util.rank0_print("label:%s, auc:%s, check ok" % (auc_label, auc)) 
+    return True
+
+def create_model(slot_file, slot_common_file, all_slot_file):
+    join_common_model = ModelJoinCommon(slot_file, slot_common_file, all_slot_file, 20)
+    update_model = Model(slot_file, all_slot_file, False, 0, True)
+    with open("join_common_main_program.pbtxt", "w") as fout:
+        print >> fout, join_common_model._train_program
+    with open("join_common_startup_program.pbtxt", "w") as fout:
+        print >> fout, join_common_model._startup_program
+    with open("update_main_program.pbtxt", "w") as fout:
+        print >> fout, update_model._train_program
+    with open("update_startup_program.pbtxt", "w") as fout:
+        print >> fout, update_model._startup_program
+    return [join_common_model, update_model]
+
+def create_dataset(use_var_list, my_filelist):
+    dataset = fluid.DatasetFactory().create_dataset(config.dataset_type)
+    dataset.set_batch_size(config.batch_size)
+    dataset.set_thread(config.thread_num)
+    dataset.set_hdfs_config(config.fs_name, config.fs_ugi)
+    dataset.set_pipe_command(config.pipe_command)
+    dataset.set_filelist(my_filelist)
+    dataset.set_use_var(use_var_list)
+    #dataset.set_fleet_send_sleep_seconds(2)
+    #dataset.set_fleet_send_batch_size(80000)
+    return dataset
+
+def hdfs_ls(path):
+    configs = {
+	"fs.default.name": config.fs_name,
+	"hadoop.job.ugi": config.fs_ugi
+    }
+    hdfs_client = HDFSClient("$HADOOP_HOME", configs)
+    filelist = []
+    for i in path:
+        cur_path = hdfs_client.ls(i)
+        if config.fs_name.startswith("hdfs:"):
+            cur_path = ["hdfs:" + j for j in cur_path]
+        elif config.fs_name.startswith("afs:"):
+            cur_path = ["afs:" + j for j in cur_path]
+        filelist += cur_path
+    return filelist
+
+def get_avg_cost_mins(value):  # returns the Allreduce result / worker count / 60
+    t1 = time.time()
+    local_cost = np.array([value])
+    global_cost = np.copy(local_cost) * 0  # zeroed receive buffer, same shape and dtype
+    t2 = time.time()
+    fleet._role_maker._node_type_comm.Allreduce(local_cost, global_cost)  # default op; presumably SUM (mpi4py) - confirm
+    t3 = time.time()
+    avg_cost = float(global_cost[0]) / fleet.worker_num()
+    avg_cost /= 60.0  # presumably seconds -> minutes; verify against callers
+    t4 = time.time()
+    tc = (t2 - t1 + t4 - t3) / 60.0  # local computation time around the Allreduce
+    tb = (t3 - t2) / 60.0  # time spent inside the Allreduce call
+    fleet_util.rank0_print("get_avg_cost_mins calc time %s barrier time %s" % (tc, tb))
+    return avg_cost
+
+def get_max_cost_mins(value):
+    from mpi4py import MPI
+    local_cost = np.array([value])
+    global_cost = np.copy(local_cost) * 0
+    fleet._role_maker._node_type_comm.Allreduce(local_cost, global_cost, op=MPI.MAX)
+    fleet_util.rank0_print("max train time %s mins" % (float(global_cost[0]) / 60.0))
+
+def get_min_cost_mins(value):
+    from mpi4py import MPI
+    local_cost = np.array([value])
+    global_cost = np.copy(local_cost) * 0
+    fleet._role_maker._node_type_comm.Allreduce(local_cost, global_cost, op=MPI.MIN)
+    fleet_util.rank0_print("min train time %s mins" % (float(global_cost[0]) / 60.0))
+
+def get_data_max(value):
+    from mpi4py import MPI
+    local_cost = np.array([value])
+    global_cost = np.copy(local_cost) * 0
+    fleet._role_maker._node_type_comm.Allreduce(local_cost, global_cost, op=MPI.MAX)
+    fleet_util.rank0_print("data size max %s" % global_cost[0])
+
+def get_data_min(value):
+    from mpi4py import MPI
+    local_cost = np.array([value])
+    global_cost = np.copy(local_cost) * 0
+    fleet._role_maker._node_type_comm.Allreduce(local_cost, global_cost, op=MPI.MIN)
+    fleet_util.rank0_print("data size min %s" % global_cost[0])
+
+def clear_metrics(fleet_util, model, scope):
+    fleet_util.set_zero(model.stat_pos.name, scope)
+    fleet_util.set_zero(model.stat_neg.name, scope)
+    fleet_util.set_zero(model.batch_stat_pos.name, scope)
+    fleet_util.set_zero(model.batch_stat_neg.name, scope)
+    fleet_util.set_zero(model.abserr.name, scope, param_type="float32")
+    fleet_util.set_zero(model.sqrerr.name, scope, param_type="float32")
+    fleet_util.set_zero(model.prob.name, scope, param_type="float32")
+    fleet_util.set_zero(model.q.name, scope, param_type="float32")
+    fleet_util.set_zero(model.pos.name, scope, param_type="float32")
+    fleet_util.set_zero(model.total.name, scope, param_type="float32")
+
+def clear_metrics_2(fleet_util, model, scope):
+    fleet_util.set_zero(model.join_stat_pos.name, scope)
+    fleet_util.set_zero(model.join_stat_neg.name, scope)
+    fleet_util.set_zero(model.join_batch_stat_pos.name, scope)
+    fleet_util.set_zero(model.join_batch_stat_neg.name, scope)
+    fleet_util.set_zero(model.join_abserr.name, scope, param_type="float32")
+    fleet_util.set_zero(model.join_sqrerr.name, scope, param_type="float32")
+    fleet_util.set_zero(model.join_prob.name, scope, param_type="float32")
+    fleet_util.set_zero(model.join_q.name, scope, param_type="float32")
+    fleet_util.set_zero(model.join_pos.name, scope, param_type="float32")
+    fleet_util.set_zero(model.join_total.name, scope, param_type="float32")
+
+    fleet_util.set_zero(model.common_stat_pos.name, scope)
+    fleet_util.set_zero(model.common_stat_neg.name, scope)
+    fleet_util.set_zero(model.common_batch_stat_pos.name, scope)
+    fleet_util.set_zero(model.common_batch_stat_neg.name, scope)
+    fleet_util.set_zero(model.common_abserr.name, scope, param_type="float32")
+    fleet_util.set_zero(model.common_sqrerr.name, scope, param_type="float32")
+    fleet_util.set_zero(model.common_prob.name, scope, param_type="float32")
+    fleet_util.set_zero(model.common_q.name, scope, param_type="float32")
+    fleet_util.set_zero(model.common_pos.name, scope, param_type="float32")
+    fleet_util.set_zero(model.common_total.name, scope, param_type="float32")
+
+def save_delta(day, pass_index, xbox_base_key, cur_path, exe, scope_join, scope_common, scope_update, join_model,
+               join_common_model, update_model, join_save_params, common_save_params, update_save_params, monitor_data):
+    stdout_str = ""  # accumulates timestamped log lines; returned to the caller
+    fleet_util.rank0_print("begin save delta model")
+    begin = time.time()
+    if pass_index == -1:  # -1 selects the xbox base model instead of a per-pass delta
+        fleet_util.save_xbox_base_model(config.output_path, day)
+    else:
+        fleet_util.save_delta_model(config.output_path, day, pass_index)
+    end = time.time()  # taken before the dense-parameter saves below, so they are not in the logged cost
+    fleet_util.save_paddle_params(exe, scope_join, join_model._train_program, "paddle_dense.model.0",
+                                  config.output_path, day, pass_index, config.fs_name, config.fs_ugi,
+                                  var_names=join_save_params)
+    fleet_util.save_paddle_params(exe, scope_common, join_common_model._train_program, "paddle_dense.model.1",
+                                  config.output_path, day, pass_index, config.fs_name, config.fs_ugi,
+                                  var_names=common_save_params)
+    fleet_util.save_paddle_params(exe, scope_update, update_model._train_program, "paddle_dense.model.2",
+                                  config.output_path, day, pass_index, config.fs_name, config.fs_ugi,
+                                  var_names=update_save_params)
+    log_str = "end save delta cost %s min" % ((end - begin) / 60.0)
+    fleet_util.rank0_print(log_str)
+    stdout_str += time_prefix_str() + log_str
+    fleet_util.rank0_print("begin save cache")
+    begin = time.time()
+    if pass_index == -1:  # same base-vs-pass split as above, for the cache model
+        key_num = fleet_util.save_cache_base_model(config.output_path, day)
+    else:
+        key_num = fleet_util.save_cache_model(config.output_path, day, pass_index)
+    fleet_util.write_cache_donefile(config.output_path, day, pass_index, key_num, config.fs_name, config.fs_ugi)
+    end = time.time()  # this interval includes writing the cache donefile
+    log_str = "end save cache cost %s min, key_num=%s" % ((end - begin) / 60.0, key_num)
+    fleet_util.rank0_print(log_str)
+    stdout_str += time_prefix_str() + log_str
+    write_xbox_donefile(day, pass_index, xbox_base_key, ",".join(cur_path), monitor_data=monitor_data)  # not defined in this file; presumably from `util`
+    return stdout_str
+
+if __name__ == "__main__":
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    fleet.init(exe)
+
+    slot_file = "slot/slot"
+    slot_common_file = "slot/slot_common"
+    all_slot_file = "all_slot.dict"
+
+    join_common_model, update_model = create_model(slot_file, slot_common_file, all_slot_file)
+
+    scope2 = fluid.Scope()
+    scope3 = fluid.Scope()
+
+    adjust_ins_weight = { "need_adjust" : True, "nid_slot" : "6002", "nid_adjw_threshold" : 1000, "nid_adjw_ratio": 20,
+                          "ins_weight_slot": update_model.ins_weight.name }
+
+    thread_stat_var_names = []
+    thread_stat_var_names.append(join_common_model.join_stat_pos.name)
+    thread_stat_var_names.append(join_common_model.join_stat_neg.name)
+    thread_stat_var_names.append(join_common_model.join_sqrerr.name)
+    thread_stat_var_names.append(join_common_model.join_abserr.name)
+    thread_stat_var_names.append(join_common_model.join_prob.name)
+    thread_stat_var_names.append(join_common_model.join_q.name)
+    thread_stat_var_names.append(join_common_model.join_pos.name)
+    thread_stat_var_names.append(join_common_model.join_total.name)
+
+    thread_stat_var_names.append(join_common_model.common_stat_pos.name)
+    thread_stat_var_names.append(join_common_model.common_stat_neg.name)
+    thread_stat_var_names.append(join_common_model.common_sqrerr.name)
+    thread_stat_var_names.append(join_common_model.common_abserr.name)
+    thread_stat_var_names.append(join_common_model.common_prob.name)
+    thread_stat_var_names.append(join_common_model.common_q.name)
+    thread_stat_var_names.append(join_common_model.common_pos.name)
+    thread_stat_var_names.append(join_common_model.common_total.name)
+
+    thread_stat_var_names.append(update_model.stat_pos.name)
+    thread_stat_var_names.append(update_model.stat_neg.name)
+    thread_stat_var_names.append(update_model.sqrerr.name)
+    thread_stat_var_names.append(update_model.abserr.name)
+    thread_stat_var_names.append(update_model.prob.name)
+    thread_stat_var_names.append(update_model.q.name)
+    thread_stat_var_names.append(update_model.pos.name)
+    thread_stat_var_names.append(update_model.total.name)
+    
+    thread_stat_var_names = list(set(thread_stat_var_names))
+
+
+    adam = fluid.optimizer.Adam(learning_rate=0.000005)
+    adam = fleet.distributed_optimizer(adam, strategy={"use_cvm" : True, "adjust_ins_weight" : adjust_ins_weight, "scale_datanorm" : 1e-4, "dump_slot": True, "stat_var_names": thread_stat_var_names, "fleet_desc_file": "reqi_fleet_desc"})
+    adam.minimize([join_common_model.joint_cost, update_model.avg_cost], [scope2, scope3])
+
+    join_common_model._train_program._fleet_opt["program_configs"][str(id(join_common_model.joint_cost.block.program))]["push_sparse"] = []
+
+    join_save_params = ["join.batch_size", "join.batch_sum", "join.batch_square_sum",
+                        "join_0.w_0", "join_0.b_0", "join_1.w_0", "join_1.b_0", "join_2.w_0", "join_2.b_0",
+                        "join_3.w_0", "join_3.b_0", "join_4.w_0", "join_4.b_0", "join_5.w_0", "join_5.b_0",
+                        "join_6.w_0", "join_6.b_0", "join_7.w_0", "join_7.b_0"]
+    common_save_params = ["common.batch_size", "common.batch_sum", "common.batch_square_sum",
+                         "common_0.w_0", "common_0.b_0", "common_1.w_0", "common_1.b_0", "common_2.w_0", "common_2.b_0",
+                         "common_3.w_0", "common_3.b_0", "common_4.w_0", "common_4.b_0", "common_5.w_0", "common_5.b_0",
+                         "common_6.w_0", "common_6.b_0", "common_7.w_0", "common_7.b_0"]
+    update_save_params = ["fc_0.w_0", "fc_0.b_0", "fc_1.w_0", "fc_1.b_0",
+                           "fc_2.w_0", "fc_2.b_0", "fc_3.w_0", "fc_3.b_0",
+                           "fc_4.w_0", "fc_4.b_0", "fc_5.w_0", "fc_5.b_0"]
+
+    if fleet.is_server():
+        fleet.run_server()
+    elif fleet.is_worker():
+        with fluid.scope_guard(scope3):
+            exe.run(update_model._startup_program)
+        with fluid.scope_guard(scope2):
+            exe.run(join_common_model._startup_program)
+        
+        configs = {
+            "fs.default.name": config.fs_name,
+            "hadoop.job.ugi": config.fs_ugi
+        }
+        hdfs_client = HDFSClient("$HADOOP_HOME", configs)
+
+        save_first_base = config.save_first_base
+        path = config.train_data_path
+        online_pass_interval = fleet_util.get_online_pass_interval(config.days, config.hours, config.split_interval, config.split_per_pass, False) 
+        pass_per_day = len(online_pass_interval)
+        last_day, last_pass, last_path, xbox_base_key = fleet_util.get_last_save_model(config.output_path, config.fs_name, config.fs_ugi)
+        reqi = True if last_day != -1 else False
+
+        if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
+            util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3)
+        fleet.init_worker()
+        
+        dataset = None
+        next_dataset = None
+        cur_path = None
+        next_path = None
+        start_train = False
+        days = os.popen("echo -n " + config.days).read().split(" ")
+        hours = os.popen("echo -n " + config.hours).read().split(" ")
+        stdout_str = ""
+        begin_days = {}
+        for day_index in range(len(days)):
+            day = days[day_index]
+            if last_day != -1 and int(day) < last_day:
+                continue
+            for pass_index in range(1, pass_per_day + 1):
+                dataset = next_dataset
+                next_dataset = None
+                cur_path = next_path
+                next_path = None
+                if (last_day != -1 and int(day) == last_day) and (last_pass != -1 and int(pass_index) < last_pass):
+                    continue
+                if reqi:
+                    begin = time.time()
+                    log_str = "going to load model %s" % last_path
+                    fleet_util.rank0_print(log_str)
+                    if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
+                        fleet.load_one_table(0, last_path)
+                    else:
+                        fleet_util.load_fleet_model(last_path)
+
+                    end = time.time()
+                    log_str = "load model cost %s min" % ((end - begin) / 60.0)
+                    fleet_util.rank0_print(log_str)
+                    stdout_str += time_prefix_str() + log_str
+                    reqi = False
+                    if (last_day != -1 and int(day) == last_day) and (last_pass != -1 and int(pass_index) == last_pass):
+                        continue
+
+                #log_str = "===========going to train day/pass %s/%s===========" % (day, pass_index)
+                        
+                if begin_days.get(day) is None:
+                    log_str = "======== BEGIN DAY:%s ========" % day
+                    fleet_util.rank0_print(log_str)
+                    stdout_str += time_prefix_str() + log_str
+                    begin_days[day] = True
+
+                log_str = "    ==== begin delta:%s ========" % pass_index
+                fleet_util.rank0_print(log_str)
+                stdout_str += time_prefix_str() + log_str
+
+                if save_first_base:
+                    log_str = "save_first_base=True"
+                    fleet_util.rank0_print(log_str)
+                    save_first_base = False
+                    last_base_day, last_base_path, tmp_xbox_base_key = \
+                        fleet_util.get_last_save_xbox_base(config.output_path, config.fs_name, config.fs_ugi)
+                    if int(day) > last_base_day:
+                        log_str = "going to save xbox base model"
+                        fleet_util.rank0_print(log_str)
+                        stdout_str += time_prefix_str() + log_str
+                        xbox_base_key = int(time.time())
+                        cur = []
+                        for interval in online_pass_interval[pass_index - 1]:
+                            for p in path:
+                                cur.append(p + "/" + day + "/" + interval)
+                        stdout_str += save_delta(day, -1, xbox_base_key, cur, exe, scope2, scope2, scope3,
+                                                 join_common_model, join_common_model, update_model, 
+                                                 join_save_params, common_save_params, update_save_params, "")
+                    elif int(day) == last_base_day:
+                        xbox_base_key = tmp_xbox_base_key
+                        log_str = "xbox base model exists"
+                        fleet_util.rank0_print(log_str)
+                        stdout_str += time_prefix_str() + log_str
+                    else:
+                        log_str = "xbox base model exists"
+                        fleet_util.rank0_print(log_str)
+                        stdout_str += time_prefix_str() + log_str
+
+                start_train = True
+                train_begin = time.time()
+
+                if dataset is not None:
+                    begin = time.time()
+                    dataset.wait_preload_done()
+                    end = time.time()
+                    log_str = "wait data preload done cost %s min" % ((end - begin) / 60.0)
+                    fleet_util.rank0_print(log_str)
+                    stdout_str += time_prefix_str() + log_str
+
+                if dataset is None:
+                    cur_pass = online_pass_interval[pass_index - 1]
+                    cur_path = []
+                    for interval in cur_pass:
+                        for p in path:
+                            cur_path.append(p + "/" + day + "/" + interval)
+                    log_str = "data path: " + ",".join(cur_path)
+                    fleet_util.rank0_print(log_str)
+                    stdout_str += time_prefix_str() + log_str
+                    for i in cur_path:
+                        while not hdfs_client.is_exist(i + "/to.hadoop.done"):
+                            fleet_util.rank0_print("wait for data ready: %s" % i)
+                            time.sleep(config.check_exist_seconds)
+                    my_filelist = fleet.split_files(hdfs_ls(cur_path))
+
+                    dataset = create_dataset(join_common_model._all_slots, my_filelist)
+                    fleet_util.rank0_print("going to load into memory")
+                    begin = time.time()
+                    dataset.load_into_memory()
+                    end = time.time()
+                    log_str = "load into memory done, cost %s min" % ((end - begin) / 60.0)
+                    fleet_util.rank0_print(log_str)
+                    stdout_str += time_prefix_str() + log_str
+
+                fleet_util.rank0_print("going to global shuffle")
+                begin = time.time()
+                dataset.global_shuffle(fleet, config.shuffle_thread)
+                end = time.time()
+                log_str = "global shuffle done, cost %s min, data size %s" % ((end - begin) / 60.0, dataset.get_shuffle_data_size(fleet))
+                fleet_util.rank0_print(log_str)
+                stdout_str += time_prefix_str() + log_str
+                get_data_max(dataset.get_shuffle_data_size())
+                get_data_min(dataset.get_shuffle_data_size())
+
+                if config.prefetch and (pass_index < pass_per_day or pass_index == pass_per_day and day_index < len(days) - 1):
+                    if pass_index < pass_per_day:
+                        next_pass = online_pass_interval[pass_index]
+                        next_day = day
+                    else:
+                        next_pass = online_pass_interval[0]
+                        next_day = days[day_index + 1]
+                    next_path = []
+                    for interval in next_pass:
+                        for p in path:
+                            next_path.append(p + "/" + next_day + "/" + interval)
+                    next_data_ready = True
+                    for i in next_path:
+                        if not hdfs_client.is_exist(i + "/to.hadoop.done"):
+                            next_data_ready = False
+                            fleet_util.rank0_print("next data not ready: %s" % i)
+                    if not next_data_ready:
+                        next_dataset = None
+                    else:
+                        my_filelist = fleet.split_files(hdfs_ls(next_path))
+                        next_dataset = create_dataset(join_common_model._all_slots, my_filelist)
+                        log_str = "next pass data preload %s " % ",".join(next_path)
+                        fleet_util.rank0_print(log_str)
+                        stdout_str += time_prefix_str() + log_str
+                        next_dataset.preload_into_memory(config.preload_thread)
+
+
+                join_cost = 0
+                common_cost = 0
+                update_cost = 0
+                monitor_data = ""
+
+                with fluid.scope_guard(scope2):
+                    fleet_util.rank0_print("Begin join + common pass")
+                    begin = time.time()
+                    exe.train_from_dataset(join_common_model._train_program,
+                                           dataset,
+                                           scope2,
+                                           thread=config.join_common_thread,
+                                           debug=False)
+                    end = time.time()
+                    avg_cost = get_avg_cost_mins(end - begin)
+                    
+                    fleet_util.rank0_print("avg train time %s mins" % avg_cost)
+
+                    get_max_cost_mins(end - begin)
+                    get_min_cost_mins(end - begin)
+
+                    common_cost = avg_cost
+                   
+                    monitor_data = ""
+                    log_str = print_global_metrics(scope2, join_common_model.join_stat_pos.name, join_common_model.join_stat_neg.name,
+                                                   join_common_model.join_sqrerr.name, join_common_model.join_abserr.name,
+                                                   join_common_model.join_prob.name,
+                                                   join_common_model.join_q.name, join_common_model.join_pos.name,
+                                                   join_common_model.join_total.name, "joining pass:")#"join pass:")
+                    check_auc_ok("joining pass:", log_str, 0.79)
+                    monitor_data += log_str
+                    stdout_str += time_prefix_str() + "joining pass:"
+                    stdout_str += time_prefix_str() + log_str
+
+                    log_str = print_global_metrics(scope2, join_common_model.common_stat_pos.name, join_common_model.common_stat_neg.name,
+                                                   join_common_model.common_sqrerr.name, join_common_model.common_abserr.name,
+                                                   join_common_model.common_prob.name,
+                                                   join_common_model.common_q.name, join_common_model.common_pos.name,
+                                                   join_common_model.common_total.name, "common pass:")
+                    check_auc_ok("common pass:", log_str, 0.70)
+                    monitor_data += " " + log_str
+                    stdout_str += time_prefix_str() + "common pass:"
+                    stdout_str += time_prefix_str() + log_str
+                    fleet_util.rank0_print("End join+common pass")
+                    clear_metrics_2(fleet_util, join_common_model, scope2)
+
+                if config.save_xbox_before_update and pass_index % config.save_delta_frequency == 0:
+                    fleet_util.rank0_print("going to save delta model")
+                    last_xbox_day, last_xbox_pass, last_xbox_path, _ = fleet_util.get_last_save_xbox(config.output_path,  config.fs_name, config.fs_ugi)
+                    if int(day) < last_xbox_day or int(day) == last_xbox_day and int(pass_index) <= last_xbox_pass:
+                        log_str = "delta model exists"
+                        fleet_util.rank0_print(log_str)
+                        stdout_str += time_prefix_str() + log_str
+                    else:
+                        stdout_str += save_delta(day, pass_index, xbox_base_key, cur_path, exe, scope2, scope2, scope3,
+                                                 join_common_model, join_common_model, update_model,
+                                                 join_save_params, common_save_params, update_save_params, monitor_data)
+
+                with fluid.scope_guard(scope3):
+                    fleet_util.rank0_print("Begin update pass")
+                    begin = time.time()
+                    exe.train_from_dataset(update_model._train_program,
+                                           dataset,
+                                           scope3,
+                                           thread=config.update_thread,
+                                           debug=False)
+                    end = time.time()
+                    avg_cost = get_avg_cost_mins(end - begin)
+
+                    get_max_cost_mins(end - begin)
+                    get_min_cost_mins(end - begin)
+
+                    update_cost = avg_cost
+                    
+                    log_str = print_global_metrics(scope3, update_model.stat_pos.name, update_model.stat_neg.name,
+                                                   update_model.sqrerr.name, update_model.abserr.name, update_model.prob.name,
+                                                   update_model.q.name, update_model.pos.name, update_model.total.name,
+                                                   "updating pass:")#"update pass:")
+                    check_auc_ok("updating pass:", log_str, 0.79)
+                    stdout_str += time_prefix_str() + "updating pass:"
+                    stdout_str += time_prefix_str() + log_str
+                    fleet_util.rank0_print("End update pass")
+                    clear_metrics(fleet_util, update_model, scope3)
+
+                begin = time.time()
+                dataset.release_memory()
+                end = time.time()
+                fleet_util.rank0_print("release_memory cost %s min" % ((end - begin) / 60.0))
+
+                if (pass_index % config.checkpoint_per_pass) == 0 and pass_index != pass_per_day:
+                    begin = time.time()
+                    fleet_util.save_model(config.output_path, day, pass_index)
+                    fleet_util.write_model_donefile(config.output_path, day, pass_index, xbox_base_key, config.fs_name, config.fs_ugi)
+                    end = time.time()
+                    log_str = "save model cost %s min" % ((end - begin) / 60.0)
+                    fleet_util.rank0_print(log_str)
+                    stdout_str += time_prefix_str() + log_str
+                if not config.save_xbox_before_update and pass_index % config.save_delta_frequency == 0:
+                    fleet_util.rank0_print("going to save delta model")
+                    last_xbox_day, last_xbox_pass, last_xbox_path, _ = fleet_util.get_last_save_xbox(config.output_path,  config.fs_name, config.fs_ugi)
+                    if int(day) < last_xbox_day or int(day) == last_xbox_day and int(pass_index) <= last_xbox_pass:
+                        log_str = "delta model exists"
+                        fleet_util.rank0_print(log_str)
+                        stdout_str += time_prefix_str() + log_str
+                    else:
+                        stdout_str += save_delta(day, pass_index, xbox_base_key, cur_path, exe, scope2, scope2, scope3,
+                                                 join_common_model, join_common_model, update_model,
+                                                 join_save_params, common_save_params, update_save_params, monitor_data)
+
+                train_end = time.time()
+                train_cost = (train_end - train_begin) / 60.0
+                other_cost = train_cost - join_cost - common_cost - update_cost
+                log_str = "finished train day %s pass %s time cost:%s min job time cost" \
+                          ":[join:%s min][join_common:%s min][update:%s min][other:%s min]" \
+                          % (day, pass_index, train_cost, join_cost, common_cost, update_cost, other_cost)
+                fleet_util.rank0_print(log_str)
+                stdout_str += time_prefix_str() + log_str
+
+                if pass_index % config.write_stdout_frequency == 0:
+                    write_stdout(stdout_str)
+                    stdout_str = ""
+            
+            xbox_base_key = int(time.time())
+            if not start_train:
+                write_stdout(stdout_str)
+                stdout_str = ""
+                continue
+
+
+            fleet_util.rank0_print("going to save batch model/base xbox model")
+            last_base_day, last_base_path, _ = fleet_util.get_last_save_xbox_base(config.output_path, config.fs_name, config.fs_ugi)
+            nextday = int(days[day_index + 1])
+            if nextday <= last_base_day:
+                log_str = "batch model/base xbox model exists"
+                fleet_util.rank0_print(log_str)
+                stdout_str += time_prefix_str() + log_str
+            else:
+                stdout_str += save_delta(nextday, -1, xbox_base_key, cur_path, exe, scope2, scope2, scope3,
+                                         join_common_model, join_common_model, update_model,
+                                         join_save_params, common_save_params, update_save_params, monitor_data)
+                
+                fleet_util.rank0_print("shrink table")
+                begin = time.time()
+                fleet.shrink_sparse_table()
+                fleet.shrink_dense_table(0.98, scope=scope2, table_id=1)
+                fleet.shrink_dense_table(0.98, scope=scope2, table_id=2)
+                fleet.shrink_dense_table(0.98, scope=scope3, table_id=3)
+                end = time.time()
+                log_str = "shrink table done, cost %s min" % ((end - begin) / 60.0)
+                fleet_util.rank0_print(log_str)
+                stdout_str += time_prefix_str() + log_str
+
+                begin = time.time()
+                fleet_util.save_batch_model(config.output_path, nextday)
+                fleet_util.write_model_donefile(config.output_path, nextday, -1, xbox_base_key, config.fs_name, config.fs_ugi)
+                end = time.time()
+                log_str = "save batch model cost %s min" % ((end - begin) / 60.0)
+                fleet_util.rank0_print(log_str)
+                stdout_str += time_prefix_str() + log_str
+            write_stdout(stdout_str)
+            stdout_str = ""
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online_local.py b/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online_local.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7e1811e7ad6133bfe2f4aed209064ee42103358
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online_local.py
@@ -0,0 +1,500 @@
+import numpy as np
+import os
+import sys
+import paddle
+import paddle.fluid as fluid
+import threading
+import time
+import config
+from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from model_new import Model
+from model_new_jc import ModelJoinCommon
+
+fleet_util = FleetUtil()
+
+def create_model(slot_file, slot_common_file, all_slot_file):
+    """Build both networks and write the printed form of their programs to .pbtxt files."""
+    join_common_model = ModelJoinCommon(slot_file, slot_common_file, all_slot_file, 20)
+    update_model = Model(slot_file, all_slot_file, False, 0, True)
+    for dump_name, program in (
+            ("join_common_main_program.pbtxt", join_common_model._train_program),
+            ("join_common_startup_program.pbtxt", join_common_model._startup_program),
+            ("update_main_program.pbtxt", update_model._train_program),
+            ("update_startup_program.pbtxt", update_model._startup_program)):
+        with open(dump_name, "w") as fout:
+            print >> fout, program
+    return [join_common_model, update_model]
+
+def create_dataset(use_var_list, my_filelist):  # returns a new dataset configured from the `config` module
+    dataset = fluid.DatasetFactory().create_dataset(config.dataset_type)
+    dataset.set_batch_size(config.batch_size)
+    dataset.set_thread(config.thread_num)
+    dataset.set_hdfs_config(config.fs_name, config.fs_ugi)
+    dataset.set_pipe_command(config.pipe_command)
+    dataset.set_filelist(my_filelist)  # the only per-call input besides use_var_list
+    dataset.set_use_var(use_var_list)  # callers in this script pass a model's _all_slots
+    return dataset
+
+def hdfs_ls(path):
+    """Return the HDFSClient.ls entries of every directory in `path`, scheme-prefixed."""
+    client = HDFSClient("$HADOOP_HOME", {"fs.default.name": config.fs_name,
+                                         "hadoop.job.ugi": config.fs_ugi})
+    # the prefix depends only on config.fs_name, so it is chosen once, not per directory
+    if config.fs_name.startswith("hdfs:"):
+        scheme = "hdfs:"
+    elif config.fs_name.startswith("afs:"):
+        scheme = "afs:"
+    else:
+        scheme = ""
+    filelist = []
+    for directory in path:
+        filelist += [scheme + entry for entry in client.ls(directory)]
+    return filelist
+
+def get_avg_cost_mins(value):
+    """Allreduce `value` over the workers and return the result / worker_num / 60."""
+    calc_start = time.time()
+    send_buf = np.array([value])
+    recv_buf = np.copy(send_buf) * 0
+    reduce_start = time.time()
+    fleet._role_maker._node_type_comm.Allreduce(send_buf, recv_buf)
+    reduce_end = time.time()
+    avg_cost = float(recv_buf[0]) / fleet.worker_num() / 60.0
+    calc_end = time.time()
+    calc_mins = (reduce_start - calc_start + calc_end - reduce_end) / 60.0
+    barrier_mins = (reduce_end - reduce_start) / 60.0
+    fleet_util.rank0_print("get_avg_cost_mins calc time %s barrier time %s" % (calc_mins, barrier_mins))
+    return avg_cost
+
+def get_max_cost_mins(value):
+    """Print, via rank0_print, the MPI.MAX Allreduce of `value` divided by 60."""
+    from mpi4py import MPI
+    reduced = np.array([value]) * 0
+    fleet._role_maker._node_type_comm.Allreduce(np.array([value]), reduced, op=MPI.MAX)
+    fleet_util.rank0_print("max train time %s mins" % (float(reduced[0]) / 60.0))
+
+def get_min_cost_mins(value):
+    """Print, via rank0_print, the MPI.MIN Allreduce of `value` divided by 60."""
+    from mpi4py import MPI
+    reduced = np.array([value]) * 0
+    fleet._role_maker._node_type_comm.Allreduce(np.array([value]), reduced, op=MPI.MIN)
+    fleet_util.rank0_print("min train time %s mins" % (float(reduced[0]) / 60.0))
+
+def get_data_max(value):
+    """Print, via rank0_print, the MPI.MAX Allreduce of `value` over the workers."""
+    from mpi4py import MPI
+    reduced = np.array([value]) * 0
+    fleet._role_maker._node_type_comm.Allreduce(np.array([value]), reduced, op=MPI.MAX)
+    fleet_util.rank0_print("data size max %s" % reduced[0])
+
+def get_data_min(value):
+    """Print, via rank0_print, the MPI.MIN Allreduce of `value` over the workers."""
+    from mpi4py import MPI
+    reduced = np.array([value]) * 0
+    fleet._role_maker._node_type_comm.Allreduce(np.array([value]), reduced, op=MPI.MIN)
+    fleet_util.rank0_print("data size min %s" % reduced[0])
+
+def clear_metrics(fleet_util, model, scope):
+    """Reset the metric variables of `model` in `scope` through fleet_util.set_zero."""
+    # stat variables are zeroed with set_zero's default param_type
+    for counter in (model.stat_pos, model.stat_neg,
+                    model.batch_stat_pos, model.batch_stat_neg):
+        fleet_util.set_zero(counter.name, scope)
+    # the remaining accumulators are zeroed as float32
+    for accumulator in (model.abserr, model.sqrerr,
+                        model.prob, model.q,
+                        model.pos, model.total):
+        fleet_util.set_zero(accumulator.name, scope, param_type="float32")
+
+def clear_metrics_2(fleet_util, model, scope):
+    """Reset the join_* and common_* metric variables of `model` in `scope`.
+
+    Both families are zeroed through fleet_util.set_zero in the same order:
+    the four stat variables with set_zero's default param_type, then the six
+    remaining accumulators with param_type="float32".
+    """
+    default_typed = ("stat_pos", "stat_neg",
+                     "batch_stat_pos", "batch_stat_neg")
+    float32_typed = ("abserr", "sqrerr", "prob",
+                     "q", "pos", "total")
+
+    for family in ("join_", "common_"):
+        # e.g. model.join_stat_pos, model.common_batch_stat_neg
+        for metric in default_typed:
+            var = getattr(model, family + metric)
+            fleet_util.set_zero(var.name, scope)
+
+        # e.g. model.join_abserr, model.common_total
+        for metric in float32_typed:
+            var = getattr(model, family + metric)
+            fleet_util.set_zero(var.name, scope, param_type="float32")
+
+def save_delta(day, pass_index, xbox_base_key, cur_path, exe, scope_join, scope_common, scope_update, join_model,
+               join_common_model, update_model, join_save_params, common_save_params, update_save_params):
+    """Save sparse model, three dense parameter sets, cache model and xbox donefile (pass_index -1: base)."""
+    fleet_util.rank0_print("begin save delta model")
+    started = time.time()
+    if pass_index == -1:
+        fleet_util.save_xbox_base_model(config.output_path, day)
+    else:
+        fleet_util.save_delta_model(config.output_path, day, pass_index)
+    # NOTE(review): the timer stops here, so the dense saves below are not part of the logged cost
+    finished = time.time()
+    for dense_scope, dense_model, dense_name, dense_vars in (
+            (scope_join, join_model, "paddle_dense.model.0", join_save_params),
+            (scope_common, join_common_model, "paddle_dense.model.1", common_save_params),
+            (scope_update, update_model, "paddle_dense.model.2", update_save_params)):
+        fleet_util.save_paddle_params(exe, dense_scope, dense_model._train_program, dense_name,
+                                      config.output_path, day, pass_index, config.fs_name, config.fs_ugi,
+                                      var_names=dense_vars)
+    fleet_util.rank0_print("end save delta cost %s min" % ((finished - started) / 60.0))
+    fleet_util.rank0_print("begin save cache")
+    started = time.time()
+    if pass_index == -1:
+        key_num = fleet_util.save_cache_base_model(config.output_path, day)
+    else:
+        key_num = fleet_util.save_cache_model(config.output_path, day, pass_index)
+    fleet_util.write_cache_donefile(config.output_path, day, pass_index, key_num, config.fs_name, config.fs_ugi)
+    finished = time.time()
+    fleet_util.rank0_print("end save cache cost %s min, key_num=%s" % ((finished - started) / 60.0, key_num))
+    fleet_util.write_xbox_donefile(config.output_path, day, pass_index, xbox_base_key, ",".join(cur_path),
+                                   config.fs_name, config.fs_ugi)
+
+if __name__ == "__main__":
+    # Entry point: build both networks, hand them to fleet, then act as server or worker.
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    fleet.init(exe)
+
+    slot_file = "slot/slot"
+    slot_common_file = "slot/slot_common"
+    all_slot_file = "all_slot.dict"
+
+    join_common_model, update_model = create_model(slot_file, slot_common_file, all_slot_file)
+
+    scope2 = fluid.Scope()  # used for the join+common program throughout this script
+    scope3 = fluid.Scope()  # used for the update program throughout this script
+
+    adjust_ins_weight = { "need_adjust" : True, "nid_slot" : "6002", "nid_adjw_threshold" : 1000, "nid_adjw_ratio": 20,
+                          "ins_weight_slot": update_model.ins_weight.name }
+
+    thread_stat_var_names = []  # metric variable names of both models, passed below as "stat_var_names"
+    thread_stat_var_names.append(join_common_model.join_stat_pos.name)
+    thread_stat_var_names.append(join_common_model.join_stat_neg.name)
+    thread_stat_var_names.append(join_common_model.join_sqrerr.name)
+    thread_stat_var_names.append(join_common_model.join_abserr.name)
+    thread_stat_var_names.append(join_common_model.join_prob.name)
+    thread_stat_var_names.append(join_common_model.join_q.name)
+    thread_stat_var_names.append(join_common_model.join_pos.name)
+    thread_stat_var_names.append(join_common_model.join_total.name)
+
+    thread_stat_var_names.append(join_common_model.common_stat_pos.name)
+    thread_stat_var_names.append(join_common_model.common_stat_neg.name)
+    thread_stat_var_names.append(join_common_model.common_sqrerr.name)
+    thread_stat_var_names.append(join_common_model.common_abserr.name)
+    thread_stat_var_names.append(join_common_model.common_prob.name)
+    thread_stat_var_names.append(join_common_model.common_q.name)
+    thread_stat_var_names.append(join_common_model.common_pos.name)
+    thread_stat_var_names.append(join_common_model.common_total.name)
+
+    thread_stat_var_names.append(update_model.stat_pos.name)
+    thread_stat_var_names.append(update_model.stat_neg.name)
+    thread_stat_var_names.append(update_model.sqrerr.name)
+    thread_stat_var_names.append(update_model.abserr.name)
+    thread_stat_var_names.append(update_model.prob.name)
+    thread_stat_var_names.append(update_model.q.name)
+    thread_stat_var_names.append(update_model.pos.name)
+    thread_stat_var_names.append(update_model.total.name)
+    # set() drops duplicate names; the order of the resulting list is arbitrary.
+    thread_stat_var_names = list(set(thread_stat_var_names))
+
+
+    adam = fluid.optimizer.Adam(learning_rate=0.000005)
+    adam = fleet.distributed_optimizer(adam, strategy={"use_cvm" : True, "adjust_ins_weight" : adjust_ins_weight, "scale_datanorm" : 1e-4, "dump_slot": True, "stat_var_names": thread_stat_var_names, "fleet_desc_file": "fleet_desc_combinejoincommon.prototxt"})
+    adam.minimize([join_common_model.joint_cost, update_model.avg_cost], [scope2, scope3])
+    # Empty the "push_sparse" list of the fleet config entry keyed by the id of joint_cost's program.
+    join_common_model._train_program._fleet_opt["program_configs"][str(id(join_common_model.joint_cost.block.program))]["push_sparse"] = []
+    # Variable names that save_delta writes as paddle_dense.model.0 / .1 / .2 respectively.
+    join_save_params = ["join.batch_size", "join.batch_sum", "join.batch_square_sum",
+                        "join_0.w_0", "join_0.b_0", "join_1.w_0", "join_1.b_0", "join_2.w_0", "join_2.b_0",
+                        "join_3.w_0", "join_3.b_0", "join_4.w_0", "join_4.b_0", "join_5.w_0", "join_5.b_0",
+                        "join_6.w_0", "join_6.b_0", "join_7.w_0", "join_7.b_0"]
+    common_save_params = ["common.batch_size", "common.batch_sum", "common.batch_square_sum",
+                         "common_0.w_0", "common_0.b_0", "common_1.w_0", "common_1.b_0", "common_2.w_0", "common_2.b_0",
+                         "common_3.w_0", "common_3.b_0", "common_4.w_0", "common_4.b_0", "common_5.w_0", "common_5.b_0",
+                         "common_6.w_0", "common_6.b_0", "common_7.w_0", "common_7.b_0"]
+    update_save_params = ["fc_0.w_0", "fc_0.b_0", "fc_1.w_0", "fc_1.b_0",
+                           "fc_2.w_0", "fc_2.b_0", "fc_3.w_0", "fc_3.b_0",
+                           "fc_4.w_0", "fc_4.b_0", "fc_5.w_0", "fc_5.b_0"]
+
+    if fleet.is_server():
+        fleet.run_server()
+    elif fleet.is_worker():
+        with fluid.scope_guard(scope3):
+            exe.run(update_model._startup_program)
+        with fluid.scope_guard(scope2):
+            exe.run(join_common_model._startup_program)
+        fleet.init_worker()
+        # hdfs_client is only referenced by the commented-out data-readiness checks further down.
+        configs = {
+            "fs.default.name": config.fs_name,
+            "hadoop.job.ugi": config.fs_ugi
+        }
+        hdfs_client = HDFSClient("$HADOOP_HOME", configs)
+
+        save_first_base = config.save_first_base
+        path = config.train_data_path
+        online_pass_interval = fleet_util.get_online_pass_interval(config.days, config.hours, config.split_interval, config.split_per_pass, False)  # indexed by pass_index - 1 below
+        pass_per_day = len(online_pass_interval)
+        last_day, last_pass, last_path, xbox_base_key = fleet_util.get_last_save_model(config.output_path, config.fs_name, config.fs_ugi)
+        reqi = True if last_day != -1 else False  # True when get_last_save_model reported a saved day
+        # dataset/cur_path belong to the current pass, next_dataset/next_path to the prefetched one.
+        dataset = None
+        next_dataset = None
+        cur_path = None
+        next_path = None
+        start_train = False
+        days = os.popen("echo -n " + config.days).read().split(" ")  # config.days goes through the shell before the split
+        hours = os.popen("echo -n " + config.hours).read().split(" ")  # NOTE(review): hours is not used again in this script
+        for day_index in range(len(days)):
+            day = days[day_index]
+            if last_day != -1 and int(day) < last_day:
+                continue  # days before the last saved day are skipped
+            for pass_index in range(1, pass_per_day + 1):
+                dataset = next_dataset  # take over what the previous pass prefetched (None if nothing)
+                next_dataset = None
+                cur_path = next_path
+                next_path = None
+                if (last_day != -1 and int(day) == last_day) and (last_pass != -1 and int(pass_index) < last_pass):
+                    continue  # passes before the last saved pass of the last saved day are skipped
+                if reqi:
+                    begin = time.time()
+                    fleet_util.rank0_print("going to load model %s" % last_path)
+                #    fleet_util.load_fleet_model(last_path)
+                 #   fleet.load_one_table(0, last_path)
+                 #   tmppath = "afs:/user/feed/mlarch/sequence_generator/wuzhihua02/xujiaqi/test_combinejoincommon_0921_72/new_model"
+                    #"afs:/user/feed/mlarch/sequence_generator/wuzhihua02/xujiaqi/test_combinejoincommon_0920_108/new_model"
+                    #"afs:/user/feed/mlarch/sequence_generator/wuzhihua02/xujiaqi/test_combinejoincommon_0915/new_model"
+                #    fleet.load_one_table(1,tmppath)
+                #    fleet.load_one_table(2,tmppath)
+                #    fleet.load_one_table(3,tmppath)
+                    # NOTE(review): every load call above is commented out, so nothing is loaded; only the timing is logged.
+                    end = time.time()
+                    fleet_util.rank0_print("load model cost %s min" % ((end - begin) / 60.0))
+                    reqi = False
+                    if (last_day != -1 and int(day) == last_day) and (last_pass != -1 and int(pass_index) == last_pass):
+                        continue  # the last saved pass itself is not trained again
+
+                fleet_util.rank0_print("===========going to train day/pass %s/%s===========" % (day, pass_index))
+
+                if save_first_base:  # runs at most once: the flag is cleared right below
+                    fleet_util.rank0_print("save_first_base=True")
+                    save_first_base = False
+                    last_base_day, last_base_path, tmp_xbox_base_key = \
+                        fleet_util.get_last_save_xbox_base(config.output_path, config.fs_name, config.fs_ugi)
+                    if int(day) > last_base_day:
+                        fleet_util.rank0_print("going to save xbox base model")
+                        xbox_base_key = int(time.time())
+                        cur = []  # data paths of this pass, built the same way as cur_path further down
+                        for interval in online_pass_interval[pass_index - 1]:
+                            for p in path:
+                                cur.append(p + "/" + day + "/" + interval)
+                        save_delta(day, -1, xbox_base_key, cur, exe, scope2, scope2, scope3,
+                                   join_common_model, join_common_model, update_model, 
+                                   join_save_params, common_save_params, update_save_params)
+                    elif int(day) == last_base_day:
+                        xbox_base_key = tmp_xbox_base_key  # reuse the key of the base saved for this day
+                        fleet_util.rank0_print("xbox base model exists")
+                    else:
+                        fleet_util.rank0_print("xbox base model exists")
+
+                start_train = True
+                train_begin = time.time()
+                # A dataset handed over from the previous pass was preloading; wait for it to finish.
+                if dataset is not None:
+                    begin = time.time()
+                    dataset.wait_preload_done()
+                    end = time.time()
+                    fleet_util.rank0_print("wait data preload done cost %s min" % ((end - begin) / 60.0))
+                # No prefetched dataset: build the paths of this pass and load synchronously.
+                if dataset is None:
+                    cur_pass = online_pass_interval[pass_index - 1]
+                    cur_path = []
+                    for interval in cur_pass:
+                        for p in path:
+                            cur_path.append(p + "/" + day + "/" + interval)
+                    fleet_util.rank0_print("data path: " + ",".join(cur_path))
+                    #for i in cur_path:
+                    #    while not hdfs_client.is_exist(i + "/to.hadoop.done"):
+                    #        fleet_util.rank0_print("wait for data ready: %s" % i)
+                    #        time.sleep(config.check_exist_seconds)
+                    my_filelist = ["part-00000_1"]#fleet.split_files(hdfs_ls(cur_path))
+                    # NOTE(review): the file list is hard-coded; cur_path is only logged here and later passed to save_delta.
+                    dataset = create_dataset(join_common_model._all_slots, my_filelist)
+                    fleet_util.rank0_print("going to load into memory")
+                    begin = time.time()
+                    dataset.load_into_memory()
+                    end = time.time()
+                    fleet_util.rank0_print("load into memory done, cost %s min" % ((end - begin) / 60.0))
+                # Prefetch the next pass unless this is the last pass of the last day.
+                if config.prefetch and (pass_index < pass_per_day or pass_index == pass_per_day and day_index < len(days) - 1):
+                    if pass_index < pass_per_day:
+                        next_pass = online_pass_interval[pass_index]  # pass_index is 1-based, so this is the following pass
+                        next_day = day
+                    else:
+                        next_pass = online_pass_interval[0]
+                        next_day = days[day_index + 1]
+                    next_path = []
+                    for interval in next_pass:
+                        for p in path:
+                            next_path.append(p + "/" + next_day + "/" + interval)
+                    next_data_ready = True  # stays True because the readiness check below is commented out
+                    #for i in next_path:
+                    #    if not hdfs_client.is_exist(i + "/to.hadoop.done"):
+                    #        next_data_ready = False
+                    #        fleet_util.rank0_print("next data not ready: %s" % i)
+                    if not next_data_ready:
+                        next_dataset = None
+                    else:
+                        my_filelist = ["part-00000_1"]#fleet.split_files(hdfs_ls(next_path))
+                        next_dataset = create_dataset(join_common_model._all_slots, my_filelist)
+                        fleet_util.rank0_print("next pass data preload %s " % ",".join(next_path))
+                        next_dataset.preload_into_memory(config.preload_thread)
+
+                fleet_util.rank0_print("going to global shuffle")
+                begin = time.time()
+                dataset.global_shuffle(fleet, config.shuffle_thread)
+                end = time.time()
+                fleet_util.rank0_print("global shuffle done, cost %s min, data size %s" % ((end - begin) / 60.0, dataset.get_shuffle_data_size(fleet)))
+                # Called without fleet here; get_data_max/min reduce the value across workers themselves.
+                get_data_max(dataset.get_shuffle_data_size())
+                get_data_min(dataset.get_shuffle_data_size())
+               
+                join_cost = 0  # never reassigned below: the join+common time is booked under common_cost
+                common_cost = 0
+                update_cost = 0
+                # Pass 1 of 2: train the join+common program on the shuffled dataset in scope2.
+                with fluid.scope_guard(scope2):
+                    fleet_util.rank0_print("Begin join + common pass")
+                    begin = time.time()
+                    exe.train_from_dataset(join_common_model._train_program,
+                                           dataset,
+                                           scope2,
+                                           thread=config.join_common_thread,
+                                           debug=False)
+                    end = time.time()
+                    avg_cost = get_avg_cost_mins(end - begin)
+                    
+                    fleet_util.rank0_print("avg train time %s mins" % avg_cost)
+
+                    get_max_cost_mins(end - begin)
+                    get_min_cost_mins(end - begin)
+
+                    common_cost = avg_cost
+                    
+                    fleet_util.print_global_metrics(scope2, join_common_model.join_stat_pos.name, join_common_model.join_stat_neg.name,
+                                                    join_common_model.join_sqrerr.name, join_common_model.join_abserr.name,
+                                                    join_common_model.join_prob.name,
+                                                    join_common_model.join_q.name, join_common_model.join_pos.name,
+                                                    join_common_model.join_total.name,
+                                                    "join pass:")
+
+                    fleet_util.print_global_metrics(scope2, join_common_model.common_stat_pos.name, join_common_model.common_stat_neg.name,
+                                                    join_common_model.common_sqrerr.name, join_common_model.common_abserr.name,
+                                                    join_common_model.common_prob.name,
+                                                    join_common_model.common_q.name, join_common_model.common_pos.name,
+                                                    join_common_model.common_total.name,
+                                                    "common pass:")
+                    fleet_util.rank0_print("End join+common pass")
+                    clear_metrics_2(fleet_util, join_common_model, scope2)  # metrics are reset after every pass
+                # Optional delta save between the two training passes (config.save_xbox_before_update).
+                if config.save_xbox_before_update and pass_index % config.save_delta_frequency == 0:
+                    fleet_util.rank0_print("going to save delta model")
+                    last_xbox_day, last_xbox_pass, last_xbox_path, _ = fleet_util.get_last_save_xbox(config.output_path,  config.fs_name, config.fs_ugi)
+                    if int(day) < last_xbox_day or int(day) == last_xbox_day and int(pass_index) <= last_xbox_pass:
+                        fleet_util.rank0_print("delta model exists")
+                    else:
+                        save_delta(day, pass_index, xbox_base_key, cur_path, exe, scope2, scope2, scope3,
+                                   join_common_model, join_common_model, update_model,
+                                   join_save_params, common_save_params, update_save_params)
+                # Pass 2 of 2: train the update program on the same dataset in scope3.
+                with fluid.scope_guard(scope3):
+                    fleet_util.rank0_print("Begin update pass")
+                    begin = time.time()
+                    exe.train_from_dataset(update_model._train_program,
+                                           dataset,
+                                           scope3,
+                                           thread=config.update_thread,
+                                           debug=False)
+                    end = time.time()
+                    avg_cost = get_avg_cost_mins(end - begin)
+                    update_cost = avg_cost
+                    
+                    fleet_util.print_global_metrics(scope3, update_model.stat_pos.name, update_model.stat_neg.name,
+                                                    update_model.sqrerr.name, update_model.abserr.name, update_model.prob.name,
+                                                    update_model.q.name, update_model.pos.name, update_model.total.name,
+                                                    "update pass:")
+                    fleet_util.rank0_print("End update pass")
+                    clear_metrics(fleet_util, update_model, scope3)
+
+                begin = time.time()
+                dataset.release_memory()
+                end = time.time()  # NOTE(review): this begin/end pair is never logged
+                # NOTE(review): the bare print statements below are unconditional, unlike the rank0_print calls.
+                print pass_index
+                print config.checkpoint_per_pass
+                
+                if (pass_index % config.checkpoint_per_pass) == 0 and pass_index != pass_per_day:  # no checkpoint on the day's last pass
+                    print "save"
+                    begin = time.time()
+                    fleet_util.save_model(config.output_path, day, pass_index)
+                    fleet_util.write_model_donefile(config.output_path, day, pass_index, xbox_base_key, config.fs_name, config.fs_ugi)
+                    end = time.time()
+                    fleet_util.rank0_print("save model cost %s min" % ((end - begin) / 60.0))
+                if not config.save_xbox_before_update and pass_index % config.save_delta_frequency == 0:  # counterpart of the save done before the update pass
+                    fleet_util.rank0_print("going to save delta model")
+                    last_xbox_day, last_xbox_pass, last_xbox_path, _ = fleet_util.get_last_save_xbox(config.output_path,  config.fs_name, config.fs_ugi)
+                    if int(day) < last_xbox_day or int(day) == last_xbox_day and int(pass_index) <= last_xbox_pass:
+                        fleet_util.rank0_print("delta model exists")
+                    else:
+                        save_delta(day, pass_index, xbox_base_key, cur_path, exe, scope2, scope2, scope3,
+                                   join_common_model, join_common_model, update_model,
+                                   join_save_params, common_save_params, update_save_params)
+                # Summary: "other" is the wall time of the pass minus the averaged training times.
+                train_end = time.time()
+                train_cost = (train_end - train_begin) / 60.0
+                other_cost = train_cost - join_cost - common_cost - update_cost
+                fleet_util.rank0_print(\
+                    "finished train day %s pass %s time cost:%s min job time cost"
+                    ":[join:%s min][join_common:%s min][update:%s min][other:%s min]" \
+                    % (day, pass_index, train_cost, join_cost, common_cost, update_cost, other_cost))
+            
+            xbox_base_key = int(time.time())  # a new key per day, taken after all passes of the day
+            if not start_train:
+                continue  # nothing was trained so far (every pass was skipped), so nothing to shrink or save
+
+            fleet_util.rank0_print("shrink table")
+            begin = time.time()
+            fleet.shrink_sparse_table()
+            fleet.shrink_dense_table(0.98, scope=scope2, table_id=1)
+            fleet.shrink_dense_table(0.98, scope=scope2, table_id=2)
+            fleet.shrink_dense_table(0.98, scope=scope3, table_id=3)
+            end = time.time()
+            fleet_util.rank0_print("shrink table done, cost %s min" % ((end - begin) / 60.0))
+
+            fleet_util.rank0_print("going to save batch model/base xbox model")
+            last_base_day, last_base_path, _ = fleet_util.get_last_save_xbox_base(config.output_path, config.fs_name, config.fs_ugi)
+            nextday = int(days[day_index + 1])  # NOTE(review): out of range on the last entry of days (IndexError)
+            if nextday <= last_base_day:
+                fleet_util.rank0_print("batch model/base xbox model exists")
+            else:
+                save_delta(nextday, -1, xbox_base_key, cur_path, exe, scope2, scope2, scope3,
+                           join_common_model, join_common_model, update_model,
+                           join_save_params, common_save_params, update_save_params)  # saved under the next day's name
+                begin = time.time()
+                fleet_util.save_batch_model(config.output_path, nextday)
+                fleet_util.write_model_donefile(config.output_path, nextday, -1, xbox_base_key, config.fs_name, config.fs_ugi)
+                end = time.time()
+                fleet_util.rank0_print("save batch model cost %s min" % ((end - begin) / 60.0))
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/util.bak.py b/feed/feed_deploy/news_jingpai/package/my_nets/util.bak.py
new file mode 100644
index 0000000000000000000000000000000000000000..15e96c9e63bdee985be5bea396195d174c2cdf27
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/util.bak.py
@@ -0,0 +1,135 @@
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+import os
+import numpy as np
+import config
+
+def jingpai_load_paddle_model(old_startup_program_bin,
+                              old_train_program_bin,
+                              old_model_path,
+                              old_slot_list,
+                              new_slot_list,
+                              model_all_vars,
+                              new_scope,
+                              modify_layer_names):
+    """Copy dense parameters saved by an old program into new_scope, remapping per-slot chunks.
+
+    The serialized old programs are parsed, the old startup program is run in a
+    private scope and the variables named in model_all_vars are loaded from
+    old_model_path (a single file or a directory).
+
+    Each variable in modify_layer_names is split along its first axis into
+    len(new_slot_list) equal chunks; a chunk whose slot also appears in
+    old_slot_list is overwritten from the old tensor, the rest keep the value
+    already in new_scope. Every other name in model_all_vars is copied as is.
+    """
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    old_scope = fluid.Scope()
+    # with-blocks close the program files instead of leaking the handles
+    with open(old_train_program_bin, "rb") as fin:
+        old_program = fluid.Program().parse_from_string(fin.read())
+    with open(old_startup_program_bin, "rb") as fin:
+        old_startup_program = fluid.Program().parse_from_string(fin.read())
+    with fluid.scope_guard(old_scope):
+        exe.run(old_startup_program)
+        variables = [old_program.global_block().var(i) for i in model_all_vars]
+        if os.path.isfile(old_model_path):
+            # single-file checkpoint: pass directory and file name separately
+            path = os.path.dirname(old_model_path) or "./"
+            filename = os.path.basename(old_model_path)
+            fluid.io.load_vars(exe, path, old_program, vars=variables, filename=filename)
+        else:
+            fluid.io.load_vars(exe, old_model_path, old_program, vars=variables)
+
+    # slot -> index in the old layout (a repeated slot keeps its last index)
+    old_pos = {slot: pos for pos, slot in enumerate(old_slot_list)}
+
+    for i in modify_layer_names:
+        if old_scope.find_var(i) is None:
+            print("%s not found in old scope, skip" % i)
+            continue
+        elif new_scope.find_var(i) is None:
+            print("%s not found in new scope, skip" % i)
+            continue
+        old_param_array = np.array(old_scope.var(i).get_tensor()).astype("float32")
+        new_param = new_scope.var(i).get_tensor()
+        new_param_array = np.array(new_param).astype("float32")
+
+        # floor division keeps per_dim an int, so range() below also works on Python 3
+        per_dim = len(new_param_array) // len(new_slot_list)
+
+        idx = -per_dim
+        for s in new_slot_list:
+            idx += per_dim
+            if old_pos.get(s) is None:
+                continue
+            for j in range(0, per_dim):
+                # a row or a value
+                new_param_array[idx + j] = old_param_array[old_pos[s] * per_dim + j]
+
+        new_param.set(new_param_array, place)
+
+    for i in model_all_vars:
+        if i in modify_layer_names:
+            continue
+        old_param = old_scope.find_var(i).get_tensor()
+        old_param_array = np.array(old_param).astype("float32")
+        new_param = new_scope.find_var(i).get_tensor()
+        new_param.set(old_param_array, place)
+
+
+def _read_slot_file(slot_path):
+    """Return the whitespace-stripped lines of slot_path; the file is closed on return."""
+    with open(slot_path, 'r') as fin:
+        return [line.strip() for line in fin]
+
+
+def reqi_changeslot(hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3):
+    """Fetch saved dense models and load them into scope2/scope3 across a slot-list change.
+
+    Only worker 0 does anything. It removes the local dnn_plugin/ directory, downloads
+    hdfs_dnn_plugin_path into the current directory with `hadoop fs -get`, reads the
+    current slot lists from slot/ and the previous ones from old_slot/, and calls
+    jingpai_load_paddle_model once per dense model file: paddle_dense.model.0 and .1
+    into scope2, paddle_dense.model.2 into scope3.
+    """
+    if fleet.worker_index() != 0:
+        return
+
+    print("load paddle model %s" % hdfs_dnn_plugin_path)
+
+    os.system("rm -rf dnn_plugin/ ; hadoop fs -D hadoop.job.ugi=%s -D fs.default.name=%s -get %s ." % (config.fs_ugi, config.fs_name, hdfs_dnn_plugin_path))
+
+    new_join_slot = _read_slot_file("slot/slot")
+    old_join_slot = _read_slot_file("old_slot/slot")
+    new_common_slot = _read_slot_file("slot/slot_common")
+    old_common_slot = _read_slot_file("old_slot/slot_common")
+
+    jingpai_load_paddle_model("old_program/old_join_common_startup_program.bin",
+                              "old_program/old_join_common_train_program.bin",
+                              "dnn_plugin/paddle_dense.model.0",
+                              old_join_slot,
+                              new_join_slot,
+                              join_save_params,
+                              scope2,
+                              ["join.batch_size","join.batch_sum","join.batch_square_sum","join_0.w_0"])
+
+    jingpai_load_paddle_model("old_program/old_join_common_startup_program.bin",
+                              "old_program/old_join_common_train_program.bin",
+                              "dnn_plugin/paddle_dense.model.1",
+                              old_common_slot,
+                              new_common_slot,
+                              common_save_params,
+                              scope2,
+                              ["common.batch_size","common.batch_sum","common.batch_square_sum","common_0.w_0"])
+
+    jingpai_load_paddle_model("old_program/old_update_startup_program.bin",
+                              "old_program/old_update_main_program.bin",
+                              "dnn_plugin/paddle_dense.model.2",
+                              old_join_slot,
+                              new_join_slot,
+                              update_save_params,
+                              scope3,
+                              ["fc_0.w_0"])
diff --git a/feed/feed_deploy/news_jingpai/package/my_nets/util.py b/feed/feed_deploy/news_jingpai/package/my_nets/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..46de454f3e7ec05c8ddc07494cc4c255d28b1ec8
--- /dev/null
+++ b/feed/feed_deploy/news_jingpai/package/my_nets/util.py
@@ -0,0 +1,286 @@
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+import os
+import numpy as np
+import config
+from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+import collections
+import json
+import time
+
+fleet_util = FleetUtil()
+
+def print_global_metrics(scope, stat_pos_name, stat_neg_name, sqrerr_name,
+                         abserr_name, prob_name, q_name, pos_ins_num_name, 
+                         total_ins_num_name, print_prefix):
+        auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc,\
+            mean_predict_qvalue, total_ins_num = fleet_util.get_global_metrics(\
+            scope, stat_pos_name, stat_neg_name, sqrerr_name, abserr_name,\
+            prob_name, q_name, pos_ins_num_name, total_ins_num_name)
+        log_str = "AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f " \
+                  "RMSE=%.6f Actural_CTR=%.6f Predicted_CTR=%.6f " \
+                  "COPC=%.6f MEAN Q_VALUE=%.6f Ins number=%s" % \
+                  (auc, bucket_error, mae, rmse, \
+                  actual_ctr, predicted_ctr, copc, mean_predict_qvalue, \
+                  total_ins_num)
+        fleet_util.rank0_print(print_prefix + " " + log_str)
+        return print_prefix + " " + log_str #print_prefix + "\n " + log_str
+
+def write_stdout(stdout_str):
+    if fleet.worker_index() != 0:
+        fleet._role_maker._barrier_worker()
+        return
+    hadoop_home="$HADOOP_HOME"
+    configs = {"fs.default.name": config.fs_name, "hadoop.job.ugi": config.fs_ugi}
+    client = HDFSClient(hadoop_home, configs)
+    out_dir = config.output_path + "/stdout/"
+    if not client.is_exist(out_dir):
+        client.makedirs(out_dir)
+    job_id_with_host = os.popen("echo -n ${JOB_ID}").read().strip()
+    instance_id = os.popen("echo -n ${INSTANCE_ID}").read().strip()
+    start_pos = instance_id.find(job_id_with_host)
+    end_pos = instance_id.find("--")
+    if start_pos != -1 and end_pos != -1:
+        job_id_with_host = instance_id[start_pos:end_pos]
+    file_path = out_dir + job_id_with_host
+    if client.is_file(file_path):
+        pre_content = client.cat(file_path)
+        with open(job_id_with_host, "w") as f:
+            f.write(pre_content + "\n")
+            f.write(stdout_str + "\n")
+        client.delete(file_path)
+        client.upload(out_dir, job_id_with_host, multi_processes=1, overwrite=False)
+    else:
+        with open(job_id_with_host, "w") as f:
+            f.write(stdout_str + "\n")
+        client.upload(out_dir, job_id_with_host, multi_processes=1, overwrite=False)
+    fleet_util.rank0_info("write %s succeed" % file_path)
+    fleet._role_maker._barrier_worker()
+
+def _get_xbox_str(day, model_path, xbox_base_key, data_path, monitor_data, mode="patch"):
+    xbox_dict = collections.OrderedDict()
+    if mode == "base":
+        xbox_dict["id"] = str(xbox_base_key)
+    elif mode == "patch":
+        xbox_dict["id"] = str(int(time.time()))
+    else:
+        print("warning: unknown mode %s, set it to patch" % mode)
+        mode = "patch"
+        xbox_dict["id"] = str(int(time.time()))
+    xbox_dict["key"] = str(xbox_base_key)
+    if model_path.startswith("hdfs:") or model_path.startswith("afs:"):
+        model_path = model_path[model_path.find(":") + 1:]
+    xbox_dict["input"] = config.fs_name + model_path.rstrip("/") + "/000"
+    xbox_dict["record_count"] = "111111"
+    xbox_dict["partition_type"] = "2"
+    xbox_dict["job_name"] = "default_job_name"
+    xbox_dict["ins_tag"] = "feasign"
+    xbox_dict["ins_path"] = data_path
+    job_id_with_host = os.popen("echo -n ${JOB_ID}").read().strip()
+    instance_id = os.popen("echo -n ${INSTANCE_ID}").read().strip()
+    start_pos = instance_id.find(job_id_with_host)
+    end_pos = instance_id.find("--")
+    if start_pos != -1 and end_pos != -1:
+        job_id_with_host = instance_id[start_pos:end_pos]
+    xbox_dict["job_id"] = job_id_with_host
+    xbox_dict["monitor_data"] = monitor_data
+    xbox_dict["monitor_path"] = config.output_path.rstrip("/") + "/monitor/" \
+                                + day + ".txt"
+    xbox_dict["mpi_size"] = str(fleet.worker_num())
+    return json.dumps(xbox_dict)
+
+def write_xbox_donefile(day, pass_id, xbox_base_key, data_path, donefile_name=None, monitor_data=""):
+    if fleet.worker_index() != 0:
+        fleet._role_maker._barrier_worker()
+        return
+    day = str(day)
+    pass_id = str(pass_id)
+    xbox_base_key = int(xbox_base_key)
+    mode = None
+    if pass_id != "-1":
+        mode = "patch"
+        suffix_name = "/%s/delta-%s/" % (day, pass_id)
+        model_path = config.output_path.rstrip("/") + suffix_name
+        if donefile_name is None:
+            donefile_name = "xbox_patch_done.txt"
+    else:
+        mode = "base"
+        suffix_name = "/%s/base/" % day
+        model_path = config.output_path.rstrip("/") + suffix_name
+        if donefile_name is None:
+            donefile_name = "xbox_base_done.txt"
+    if isinstance(data_path, list):
+        data_path = ",".join(data_path)
+
+    if fleet.worker_index() == 0:
+        donefile_path = config.output_path + "/" + donefile_name
+        xbox_str = _get_xbox_str(day, model_path, xbox_base_key, data_path, monitor_data, mode)
+        configs = {"fs.default.name": config.fs_name, "hadoop.job.ugi": config.fs_ugi}
+        client = HDFSClient("$HADOOP_HOME", configs)
+        if client.is_file(donefile_path):
+            pre_content = client.cat(donefile_path)
+            last_dict = json.loads(pre_content.split("\n")[-1])
+            last_day = last_dict["input"].split("/")[-3]
+            last_pass = last_dict["input"].split("/")[-2].split("-")[-1]
+            exist = False
+            if int(day) < int(last_day) or \
+                    int(day) == int(last_day) and \
+                    int(pass_id) <= int(last_pass):
+                exist = True
+            if not exist:
+                with open(donefile_name, "w") as f:
+                    f.write(pre_content + "\n")
+                    f.write(xbox_str + "\n")
+                client.delete(donefile_path)
+                client.upload(
+                    config.output_path,
+                    donefile_name,
+                    multi_processes=1,
+                    overwrite=False)
+                fleet_util.rank0_info("write %s/%s %s succeed" % \
+                                       (day, pass_id, donefile_name))
+            else:
+                fleet_util.rank0_error("not write %s because %s/%s already "
+                                       "exists" % (donefile_name, day, pass_id))
+        else:
+            with open(donefile_name, "w") as f:
+                f.write(xbox_str + "\n")
+            client.upload(
+                config.output_path,
+                donefile_name,
+                multi_processes=1,
+                overwrite=False)
+            fleet_util.rank0_error("write %s/%s %s succeed" % \
+                                   (day, pass_id, donefile_name))
+    fleet._role_maker._barrier_worker()
+
+def jingpai_load_paddle_model(old_startup_program_bin,
+                              old_train_program_bin,
+                              old_model_path,
+                              old_slot_list,
+                              new_slot_list,
+                              model_all_vars,
+                              new_scope,
+                              modify_layer_names):
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    
+    old_scope = fluid.Scope()
+    old_program = fluid.Program()
+    old_program = old_program.parse_from_string(open(old_train_program_bin, "rb").read())
+    old_startup_program = fluid.Program()
+    old_startup_program = old_startup_program.parse_from_string(open(old_startup_program_bin, "rb").read())
+    with fluid.scope_guard(old_scope):
+        exe.run(old_startup_program)
+        variables =  [old_program.global_block().var(i) for i in model_all_vars]
+        if os.path.isfile(old_model_path):
+            path = os.path.dirname(old_model_path)
+            path = "./" if path == "" else path
+            filename = os.path.basename(old_model_path)
+            fluid.io.load_vars(exe, path, old_program, vars=variables, filename=filename)
+        else:
+            fluid.io.load_vars(exe, old_model_path, old_program, vars=variables)
+
+    old_pos = {}
+    idx = 0
+    for i in old_slot_list:
+        old_pos[i] = idx
+        idx += 1
+
+    for i in modify_layer_names:
+        if old_scope.find_var(i) is None:
+            print("%s not found in old scope, skip" % i)
+            continue
+        elif new_scope.find_var(i) is None:
+            print("%s not found in new scope, skip" % i)
+            continue
+        old_param = old_scope.var(i).get_tensor()
+        old_param_array =  np.array(old_param).astype("float32")
+        old_shape = old_param_array.shape
+        #print  i," old_shape ", old_shape
+
+        new_param = new_scope.var(i).get_tensor()
+        new_param_array = np.array(new_param).astype("float32")
+        new_shape = new_param_array.shape
+        #print i," new_shape ", new_shape
+
+        per_dim = len(new_param_array) / len(new_slot_list)
+        #print "len(new_param_array) ",len(new_param_array),\
+        #  "len(new_slot_list) ", len(new_slot_list)," per_dim ", per_dim
+
+        idx = -per_dim
+        for s in new_slot_list:
+            idx += per_dim
+            if old_pos.get(s) is None:
+                    continue                
+            for j in range(0, per_dim):
+                #print i," row/value ", idx + j, " copy from ", old_pos[s] * per_dim + j
+                # a row or a value
+                new_param_array[idx + j] = old_param_array[old_pos[s] * per_dim + j]
+
+        new_param.set(new_param_array, place)
+
+    for i in model_all_vars:
+        if i in modify_layer_names:
+            continue
+        old_param = old_scope.find_var(i).get_tensor()
+        old_param_array =  np.array(old_param).astype("float32")
+        new_param = new_scope.find_var(i).get_tensor()
+        new_param.set(old_param_array, place)
+
+
+def reqi_changeslot(hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3):
+    if fleet.worker_index() != 0:
+        return
+
+    print("load paddle model %s" % hdfs_dnn_plugin_path)
+
+    os.system("rm -rf dnn_plugin/ ; hadoop fs -D hadoop.job.ugi=%s -D fs.default.name=%s -get %s ." % (config.fs_ugi, config.fs_name, hdfs_dnn_plugin_path))
+
+    new_join_slot = []
+    for line in open("slot/slot", 'r'):
+        slot = line.strip()
+        new_join_slot.append(slot)
+    old_join_slot = []
+    for line in open("old_slot/slot", 'r'):
+        slot = line.strip()
+        old_join_slot.append(slot)
+
+    new_common_slot = []
+    for line in open("slot/slot_common", 'r'):
+        slot = line.strip()
+        new_common_slot.append(slot)
+    old_common_slot = []
+    for line in open("old_slot/slot_common", 'r'):
+        slot = line.strip()
+        old_common_slot.append(slot)
+
+
+    jingpai_load_paddle_model("old_program/old_join_common_startup_program.bin",
+                              "old_program/old_join_common_train_program.bin",
+                              "dnn_plugin/paddle_dense.model.0",
+                              old_join_slot,
+                              new_join_slot,
+                              join_save_params,
+                              scope2,
+                              ["join.batch_size","join.batch_sum","join.batch_square_sum","join_0.w_0"])
+
+    jingpai_load_paddle_model("old_program/old_join_common_startup_program.bin",
+                              "old_program/old_join_common_train_program.bin",
+                              "dnn_plugin/paddle_dense.model.1",
+                              old_common_slot,
+                              new_common_slot,
+                              common_save_params,
+                              scope2,
+                              ["common.batch_size","common.batch_sum","common.batch_square_sum","common_0.w_0"])
+
+    jingpai_load_paddle_model("old_program/old_update_startup_program.bin",
+                              "old_program/old_update_main_program.bin",
+                              "dnn_plugin/paddle_dense.model.2",
+                              old_join_slot,
+                              new_join_slot,
+                              update_save_params,
+                              scope3,
+                              ["fc_0.w_0"])
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/qsub_f.conf b/feed/feed_deploy/news_jingpai/qsub_f.conf
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/qsub_f.conf
rename to feed/feed_deploy/news_jingpai/qsub_f.conf
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/run.sh b/feed/feed_deploy/news_jingpai/run.sh
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/run.sh
rename to feed/feed_deploy/news_jingpai/run.sh
diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/submit.sh b/feed/feed_deploy/news_jingpai/submit.sh
similarity index 100%
rename from paddle/fluid/feed/feed_deploy/news_jingpai/submit.sh
rename to feed/feed_deploy/news_jingpai/submit.sh
diff --git a/paddle/fluid/feed/pybind/CMakeLists.txt b/feed/pybind/CMakeLists.txt
similarity index 100%
rename from paddle/fluid/feed/pybind/CMakeLists.txt
rename to feed/pybind/CMakeLists.txt
diff --git a/paddle/fluid/feed/pybind/expand_api.cc b/feed/pybind/expand_api.cc
similarity index 100%
rename from paddle/fluid/feed/pybind/expand_api.cc
rename to feed/pybind/expand_api.cc
diff --git a/paddle/fluid/feed/pybind/expand_api.h b/feed/pybind/expand_api.h
similarity index 100%
rename from paddle/fluid/feed/pybind/expand_api.h
rename to feed/pybind/expand_api.h
diff --git a/paddle/fluid/feed/src/CMakeLists.txt b/feed/src/CMakeLists.txt
similarity index 100%
rename from paddle/fluid/feed/src/CMakeLists.txt
rename to feed/src/CMakeLists.txt
diff --git a/paddle/fluid/feed/src/common/CMakeLists.txt b/feed/src/common/CMakeLists.txt
similarity index 100%
rename from paddle/fluid/feed/src/common/CMakeLists.txt
rename to feed/src/common/CMakeLists.txt
diff --git a/paddle/fluid/feed/src/common/bhopscotch_map.h b/feed/src/common/bhopscotch_map.h
similarity index 100%
rename from paddle/fluid/feed/src/common/bhopscotch_map.h
rename to feed/src/common/bhopscotch_map.h
diff --git a/paddle/fluid/feed/src/common/bhopscotch_set.h b/feed/src/common/bhopscotch_set.h
similarity index 100%
rename from paddle/fluid/feed/src/common/bhopscotch_set.h
rename to feed/src/common/bhopscotch_set.h
diff --git a/paddle/fluid/feed/src/common/dict_plugin.cc b/feed/src/common/dict_plugin.cc
similarity index 100%
rename from paddle/fluid/feed/src/common/dict_plugin.cc
rename to feed/src/common/dict_plugin.cc
diff --git a/paddle/fluid/feed/src/common/dict_plugin.h b/feed/src/common/dict_plugin.h
similarity index 100%
rename from paddle/fluid/feed/src/common/dict_plugin.h
rename to feed/src/common/dict_plugin.h
diff --git a/paddle/fluid/feed/src/common/hopscotch_growth_policy.h b/feed/src/common/hopscotch_growth_policy.h
similarity index 100%
rename from paddle/fluid/feed/src/common/hopscotch_growth_policy.h
rename to feed/src/common/hopscotch_growth_policy.h
diff --git a/paddle/fluid/feed/src/common/hopscotch_hash.h b/feed/src/common/hopscotch_hash.h
similarity index 100%
rename from paddle/fluid/feed/src/common/hopscotch_hash.h
rename to feed/src/common/hopscotch_hash.h
diff --git a/paddle/fluid/feed/src/common/hopscotch_map.h b/feed/src/common/hopscotch_map.h
similarity index 100%
rename from paddle/fluid/feed/src/common/hopscotch_map.h
rename to feed/src/common/hopscotch_map.h
diff --git a/paddle/fluid/feed/src/common/hopscotch_set.h b/feed/src/common/hopscotch_set.h
similarity index 100%
rename from paddle/fluid/feed/src/common/hopscotch_set.h
rename to feed/src/common/hopscotch_set.h
diff --git a/paddle/fluid/feed/src/data_reader/CMakeLists.txt b/feed/src/data_reader/CMakeLists.txt
similarity index 100%
rename from paddle/fluid/feed/src/data_reader/CMakeLists.txt
rename to feed/src/data_reader/CMakeLists.txt
diff --git a/paddle/fluid/feed/src/data_reader/data_set.cc b/feed/src/data_reader/data_set.cc
similarity index 100%
rename from paddle/fluid/feed/src/data_reader/data_set.cc
rename to feed/src/data_reader/data_set.cc
diff --git a/paddle/fluid/feed/src/data_reader/data_set.h b/feed/src/data_reader/data_set.h
similarity index 100%
rename from paddle/fluid/feed/src/data_reader/data_set.h
rename to feed/src/data_reader/data_set.h
diff --git a/paddle/fluid/feed/tool/CMakeLists.txt b/feed/tool/CMakeLists.txt
similarity index 100%
rename from paddle/fluid/feed/tool/CMakeLists.txt
rename to feed/tool/CMakeLists.txt
diff --git a/paddle/fluid/feed/tool/parse_feasign.cpp b/feed/tool/parse_feasign.cpp
similarity index 100%
rename from paddle/fluid/feed/tool/parse_feasign.cpp
rename to feed/tool/parse_feasign.cpp
diff --git a/paddle/.common_test_util.sh b/paddle/.common_test_util.sh
deleted file mode 100644
index 4681e49a0f53214b1e259c9e138d87756184b00e..0000000000000000000000000000000000000000
--- a/paddle/.common_test_util.sh
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-PORT_FILE=/tmp/paddle_test_ports
-PORT_LOCK_FILE=/tmp/paddle_test_ports.lock
-
-# Create flag file, all user can rw, ignore all error here
-touch $PORT_FILE $PORT_LOCK_FILE 2>/dev/null
-chmod a+rw $PORT_FILE $PORT_LOCK_FILE 2>/dev/null
-
-# acquire a range of ports that not used by other runtests.sh currentlly.
-# return 1 if ports is used by other, otherwise return 0.
-# NOTE: the acquire_ports/release_ports is interprocess mutexed.
-#
-# There are two parameter of this method
-# param 1: the begin of port range
-# param 2: the length of port range.
-# so, the port range is [param1, param1+param2)
-acquire_ports(){
-  (
-    flock -x 200
-    let "len=$1+$2"
-    for((i=$1; i<$len; i++))
-    do
-      grep -q $i $PORT_FILE
-      if [ $? -eq 0 ] ; then
-        return 1 # Port already write to $PORT_FILE
-      fi
-    done
-
-    for((i=$1; i<$len; i++))
-    do
-      echo $i >> $PORT_FILE # Write to $PORT_FILE
-    done
-    return 0
-  )200>$PORT_LOCK_FILE
-}
-
-# release a range of ports. Mark these ports is not used by runtests.sh.
-# NOTE: the acquire_ports/release_ports is interprocess mutexed.
-#
-# The parameter is same as acquire_ports, see acquire_ports' comments.
-release_ports(){
-  (
-    flock -x 200
-    let "len=$1+$2"
-    for((i=$1; i<$len; i++))
-    do
-      tmp=`sed "/$i/d" $PORT_FILE`  # remove port
-      echo $tmp > $PORT_FILE
-    done
-  )200>$PORT_LOCK_FILE
-}
-
-# use set_port  to get a random free port
-# such as    set_port -p port test_fuc   to run  test_fuc --port=random
-# use  -n to set_port test_fuc to get a continuous free port
-# such as    set_port  -n 10 -p port  test_fuc  to get ten continuous free port to run test_fuc --port=random
-set_port()
-{
-    num=1
-
-    port_type="port"
-    unset OPTIND
-    while   getopts  "n:p:"  opt
-    do
-        case  "$opt"   in
-            n)   echo  "get num ${OPTARG}"
-                 num=${OPTARG}
-                 ;;
-            p)   echo  "get port_type ${OPTARG}"
-                 port_type=${OPTARG}
-                 ;;
-        esac
-    done
-    shift $((OPTIND-1))
-    cmd=$@
-    for ((i=1;i<=10000;i++))
-    do
-        declare -i port=$RANDOM+10000
-        port_used_total=0
-        for((n=0;n<=num-1;n++))
-            do
-                declare -i port_check=$port+$n
-                port_used_num=`netstat -a |grep $port_check|wc -l`
-                declare -i port_used_total=$port_used_total+$port_used_num
-            done
-        if [ $port_used_total -ne 0 ]
-            then
-                continue
-        fi
-        # Lock Ports.
-        acquire_ports $port $num
-        if [ $? -ne 0 ]; then
-            continue
-        fi
-        $cmd --$port_type=$port
-        return_val=$?
-        release_ports $port $num
-        if [ $return_val -eq 0 ]; then
-            return 0
-        else
-            echo "$cmd run wrong"
-            return 1
-        fi
-    done
-
-}
diff --git a/paddle/.gitignore b/paddle/.gitignore
deleted file mode 100644
index 01904aa6ef2057afee95ddd6e30cde064b06c52e..0000000000000000000000000000000000000000
--- a/paddle/.gitignore
+++ /dev/null
@@ -1,43 +0,0 @@
-.timestamp
-*.o
-*.a
-.svn
-GPATH
-GRTAGS
-GTAGS
-.idl*
-*~
-*.pyc
-*.pb.cc
-*.pb.h
-*_pb2.py
-output/
-google/
-Makefile
-log/
-.pptool_config
-hf/
-build
-issue.info
-
-ar
-g++
-gcc
-ld
-ld-linux-x86-64.so.2
-x86_64-scm-linux-gnu/
-.lint.*.md5
-
-.idea/
-.test_env
-Paddle_wrap.cxx
-Paddle_wrap.h
-paddle.py
-py_paddle-*.whl
-py_paddle/paddle.py
-.py_paddle_extra_link_flags
-HPPL_ERROR_LOG
-unittest.list
-proto
-dist
-setup.py
diff --git a/paddle/.set_port.sh b/paddle/.set_port.sh
deleted file mode 100755
index 617ac79a24889eef23b66235ace20be80e1ff4dc..0000000000000000000000000000000000000000
--- a/paddle/.set_port.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-DIRNAME=`dirname $0`
-source $DIRNAME/.common_test_util.sh
-set_port $@
diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh
deleted file mode 100755
index 8fd58925ee4820269572176ff9496f42914652da..0000000000000000000000000000000000000000
--- a/paddle/.set_python_path.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#
-# A simple test driver for cmake. 
-# set PYTHONPATH before run command.
-# Usage:
-#    ./.set_python_pash.sh -p YOUR_PYTHON_PATH {exec...}
-# 
-# It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
-#
-PYPATH=""
-set -x
-while getopts "d:" opt; do
-  case $opt in
-    d)
-      PYPATH=$OPTARG
-      ;;
-  esac
-done
-shift $(($OPTIND - 1))
-export PYTHONPATH=$PYPATH:$PYTHONPATH
-$@
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
deleted file mode 100644
index c0c04d475959de2bfd6505b6ed30d5c18cbd99da..0000000000000000000000000000000000000000
--- a/paddle/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_subdirectory(scripts)
-add_subdirectory(testing)
-set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
-add_subdirectory(fluid)
diff --git a/paddle/contrib/float16/.gitignore b/paddle/contrib/float16/.gitignore
deleted file mode 100644
index dd28d354f4160b4be68b46a7bebcdf2097d5811a..0000000000000000000000000000000000000000
--- a/paddle/contrib/float16/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*.inference.model
diff --git a/paddle/contrib/float16/README.md b/paddle/contrib/float16/README.md
deleted file mode 100644
index a1f8cb42451dd5e84c97d6830216d284cc8bd819..0000000000000000000000000000000000000000
--- a/paddle/contrib/float16/README.md
+++ /dev/null
@@ -1,171 +0,0 @@
-# Float16 Inference in PaddlePaddle Fluid
-
-Kexin Zhao <zhaokexin01@baidu.com>
-
-## Introduction
-Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data.  The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32).  Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16).
-
-This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16.
-
-
-## What is float16?
-float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference.
-
-Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
-
-## Why float16?
-The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold:
-
-1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of the float16 data type. As a result, the whole memory footprint of float16 inference is roughly half of its float counterpart, which is especially useful when deploying inference on mobile devices with limited available memory. Also given the same available memory, the maximum batch size for float16 inference is about twice that for float inference.
-
-2. Because float16 occupies less memory than float, in theory, hardware devices can achieve much higher floating point operators per second (FLOPS) for float16 data than float data. Right now, NVIDIA's latest Volta GPUs, including Tesla V100 and Titan V, can deliver significantly higher FLOPS for float16 using Tensor Cores. Moreover, float16 takes less time to read from or write to memory, and hence float16 can make inference more efficient especially in memory-bound applications where the performance is mostly affected by how fast it is to read and write data.
-
-3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less than its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers.
-
-## Fluid implementation of float16 inference
-### Overview
-Fluid use [Program](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
-
-### Basic requirement
-When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs.
-
-If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type.
-
-The same principle applies if we want a program to run in float16 mode. We provide input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirements for float16 inference are to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
-
-### float16 transpiler
-Furthermore, we need a transpiler to write float16 inference code similar to the following:
-
-```python
-# Get the float32 inference program and load the associated float32 weights
-[inference_program, feed_target_names,
- fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-# Prepare the float input data
-batch_size = 1
-tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype(numpy.float32)
-
-# Running inference_program in float mode
-float_results = exe.run(inference_program,
-                        feed={feed_target_names[0]: tensor_img},
-                        fetch_list=fetch_targets)
-
-# Use float16 transpiler to speedup
-float16_inference_program = float_inference_program.clone()
-t = Float16Transpiler()
-t.transpile(float16_inference_program, GPUPlace)
-
-# Running float16_inference_program in float16 mode using the same input data
-float16_results = exe.run(float16_inference_program,
-                          feed={feed_target_names[0]: tensor_img},
-                          fetch_list=fetch_targets)
-
-# Do some tests to verify the correctness of float16 inference
-...
-np.testing.assert_almost_equal(float_results, float16_results, ...)
-...
-
-# Save the float16 inference program and float16 weights for future deployment
-fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
-                              fetch_targets, exe,
-                              float16_inference_program)
-```
-
-In this scenario, we already have a float32 inference program and some associated float32 weights. We can simply use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights.
-
-We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor.
-
-The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
-
-### Experiment results
-Simply running the following commands to reproduce the experiment results presented in this section:
-
-```bash
-git clone https://github.com/PaddlePaddle/Paddle.git
-cd Paddle
-# This line will generate a paddle development docker image with cuda 8 and cudnn 7
-# If you want test on cuda 9 instead, change the line 5 in Paddle/Dockerfile 
-# from `FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`
-# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
-nvidia-docker build -t paddle:float16 .
-# After running this, different results will be written to different log files in Paddle/contrib/float16/
-nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
-```
-
-#### Accuracy
-As is mentioned before, DNN inference has been found to be tolerant against the loss of precision and range incurred by float16, and we want to see how good this tolerance is.
-
-We train a resnet32 model using cifar10 data set, save it when test set accuracy is above 60%, and then test the inference accuracy on the 10000 examples of the cifar10 test set in float16 and float32 mode, respectively.
-
-We repeat the test ten times and get the following results:
-
-|        | float16 | float32  |
-|--------|--------:|--------: |
-| # 1    | 62.75%  | 62.72%   |
-| # 2    | 61.27%  | 61.28%   |
-| # 3    | 62.24%  | 62.23%   |
-| # 4    | 64.16%  | 64.17%   |
-| # 5    | 60.75%  | 60.77%   |
-| # 6    | 63.25%  | 63.24%   |
-| # 7    | 62.15%  | 62.13%   |
-| # 8    | 62.05%  | 62.02%   |
-| # 9    | 65.19%  | 65.20%   |
-| #10    | 62.53%  | 62.48%   |
-| average| 62.63%  | 62.62%   |
-
-We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests.
-
-#### Performance benchmark
-Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart. 
-
-NVIDIA started to support its native float16 data type (which has the same internal memory representation as Fluid's float16 class) on CUDA 7.5. Moreover, float16 speedups on computationally intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cuBLAS 7.5 and cuDNN 5.0.
-
-Recently, the introduction of [Tensor Core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in Volta architecture GPUs and the support of Tensor Core computation in CUDA 9.0 and cuDNN 7 make float16 genuinely superior to float in some deep learning applications.
-
-We thus benchmark the float16 inference performance on a single NVIDIA Tesla V100 GPU (Volta architecture and with Tensor Cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respective to different mini-batch(mb) sizes.
-
-Average inference time for one mini-batch on Vgg16 model tested on ImageNet dataset:
-
-| total | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
-|float32| 14.01 | 9.70  | 22.99 | 28.26 | 53.87  | 84.42 | 178.95 | 
-|float16|  3.32 | 4.11  |  5.88 |  9.41 | 16.54  | 30.47 |  60.23 |
-|Speedup|  4.22 | 2.36  |  3.91 |  3.00 |  3.26  |  2.77 |   2.97 |
-
-We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes.
-
-Convolution operation is ususally the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
-
-|conv op| mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | 
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
-|float32| 11.95 | 6.96  | 18.65 | 21.42 | 41.35  | 60.58 | 130.11 |
-|float16|  1.78 | 2.10  |  2.93 |  4.55 |  7.99  | 14.63 |  28.67 |
-|Speedup|  6.71 | 3.31  |  6.37 |  4.71 |  5.18  |  4.14 |   4.54 |
-
-Fluid convolution operator uses cuDNN 7 to implement the kernel, and we can see that with the help of Tensor Core, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better.
-
-Similarly, we also list the benchmark results of Resnet50 model tested on the ImageNet dataset:
-
-| total | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
-|float32| 7.03  | 7.41  | 9.16  | 12.55 | 21.13  | 38.27 | 67.93  | 127.02 | 
-|float16| 6.13  | 6.32  | 6.24  |  7.40 | 10.90  | 18.18 | 33.20  |  64.52 |
-|Speedup| 1.15  | 1.17  | 1.47  |  1.70 |  1.94  |  2.11 |  2.05  |   1.97 |
-
-|conv op| mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
-|float32| 5.43  | 5.46  | 6.50  | 8.36  | 13.80  | 24.45 | 41.21  | 73.44  |
-|float16| 4.19  | 4.30  | 3.96  | 4.21  |  5.63  |  8.77 | 15.24  | 28.40  |
-|Speedup| 1.30  | 1.27  | 1.64  | 1.99  |  2.45  |  2.79 |  2.70  |  2.59  |
-
-We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increases to about 2x for larger batch sizes. A similar trend can be found for the time spent on the convolution operator. Note that right now Tensor Cores will only be utilized in the convolution operation when the input data and filter meet specific dimensional requirements. The speedup by float16 inference for Resnet50 is smaller than the Vgg16 counterpart partially because the convolution operation in Resnet is much simpler than its Vgg counterpart and this makes the tensor core less utilized in Resnet than in Vgg.
-
-We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference.
-
-Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/float16/float16_benchmark.md) for complete benchmark results.
-
-### Summary
-1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode.
-2. The accuracy of float16 inference is verified to be almost identical to its float32 counterpart at least on CNN models.
-3. float16 inference provides a significant speedup on large and computationally intensive Vgg16 model on ImageNet dataset. For the much smaller and simpler Resnet50 model, the speedup provided by float16 inference is less significant than for Vgg16 model but still favorable, especially for large batch sizes.
-4. We cannot achieve the superior float16 inference performance without the help of the newly introduced Tensor Cores on NVIDIA Volta architecture GPUs.
diff --git a/paddle/contrib/float16/float16_benchmark.md b/paddle/contrib/float16/float16_benchmark.md
deleted file mode 100644
index b51d6bde92fa04d2268afa36b9c4bd18bc28fe73..0000000000000000000000000000000000000000
--- a/paddle/contrib/float16/float16_benchmark.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# float16 benchmark
-
-## Description
-We want to compare the inference benchmark of float16 vs float32 on the "image_classification" example on Nvidia Tesla V100 GPU, where we can enable the tensor core computation for float16 mode. We test Vgg16 and Resnet50 on the imagenet data set, and Vgg16 and Resnet32 on the cifar10 data set. For completeness, we also add the inference benchmark of Vgg16 and Resnet50 on imagenet data set tested on Nvidia GeForce GTX 1080 Ti GPU.
-
-For more details about tensor core, please refer to https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
-
-## Test environment
-- GPU: single Nvidia Tesla V100 or single Nvidia GeForce GTX 1080 Ti 
-- CUDNN: 7.1.1
-- CUDA: 9.0
-- Code: https://github.com/PaddlePaddle/Paddle/pull/10331 (Tensor core is enabled in float16 mode)
-
-## Benchmark on V100
-All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia V100 GPU with respective to different mini-batch(mb) sizes.
-
-### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]):
-
-Total inference time for one batch:
-
-|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
-|float32| 14.01 | 9.70  | 22.99 | 28.26 | 53.87  | 84.42 | 178.95 | 
-|float16|  3.32 | 4.11  |  5.88 |  9.41 | 16.54  | 30.47 |  60.23 |
-|Speedup|  4.22 | 2.36  |  3.91 |  3.00 |  3.26  |  2.77 |   2.97 |
-
-Total time spent on conv op for one batch:
-
-|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | 
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
-|float32| 11.95 | 6.96  | 18.65 | 21.42 | 41.35  | 60.58 | 130.11 |
-|float16|  1.78 | 2.10  |  2.93 |  4.55 |  7.99  | 14.63 |  28.67 |
-|Speedup|  6.71 | 3.31  |  6.37 |  4.71 |  5.18  |  4.14 |   4.54 |
-
-
-### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]):
-
-Total inference time for one batch:
-
-|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
-|float32| 7.03  | 7.41  | 9.16  | 12.55 | 21.13  | 38.27 | 67.93  | 127.02 | 
-|float16| 6.13  | 6.32  | 6.24  |  7.40 | 10.90  | 18.18 | 33.20  |  64.52 |
-|Speedup| 1.15  | 1.17  | 1.47  |  1.70 |  1.94  |  2.11 |  2.05  |   1.97 |
-
-Total time spent on conv op for one batch:
-
-|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
-|float32| 5.43  | 5.46  | 6.50  | 8.36  | 13.80  | 24.45 | 41.21  | 73.44  |
-|float16| 4.19  | 4.30  | 3.96  | 4.21  |  5.63  |  8.77 | 15.24  | 28.40  |
-|Speedup| 1.30  | 1.27  | 1.64  | 1.99  |  2.45  |  2.79 |  2.70  |  2.59  |
-
-
-### Vgg16 on cifar10 (image.shape = [3, 32, 32]):
-
-Total inference time for one batch:
-
-|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
-|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:| 
-|float32| 3.13 | 3.17 | 3.19 | 3.58 | 3.98  | 6.23  | 8.42  | 13.44  | 24.19  | 44.97  | 
-|float16| 2.72 | 2.77 | 2.76 | 2,88 | 2.96  | 3.24  | 4.01  |  5.78  |  9.65  | 17.37  |
-|Speedup| 1.15 | 1.14 | 1.16 | 1.24 | 1.34  | 1.92  | 2.10  |  2.33  |  2.51  |  2.59  |
-
-
-### Resnet32 on cifar10 (image.shape = [3, 32, 32]):
-
-Total inference time for one batch:
-
-|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
-|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:|
-|float32| 3.11 | 3.14 | 2.99 | 3.04 | 3.10  | 3.28  | 4.47  | 6.86   | 11.63  | 21.16  |
-|float16| 3.70 | 3.81 | 3.75 | 3.83 | 3.77  | 3.97  | 3.92  | 4.15   |  6.41  | 11.02  | 
-|Speedup|      |      |      |      |       |       | 1.14  | 1.65   |  1.81  |  1.92  |
-
-
-## Benchmark on 1080 Ti
-All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia GeForce GTX 1080 Ti GPU with respective to different mini-batch(mb) sizes.
-
-### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]):
-Total inference time for one batch:
-
-|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32  |
-|-------|-----: |-----: |-----: |-----: |------: |-------:|
-|float32| 5.60  | 9.38  | 15.86 | 29.79 | 57.60  | 117.73 |
-|float16| 4.99  | 7.79  | 13.47 | 26.02 | 52.30  | 102.34 |
-|Speedup| 1.12  | 1.20  |  1.18 |  1.15 |  1.10  |   1.15 |
-
-
-### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]):
-Total inference time for one batch:
-
-|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32  | mb=64  |
-|-------|-----: |-----: |-----: |-----: |------: |-------:|-------:|
-|float32| 5.63  | 6.23  | 8.85  | 14.71 | 26.07  | 52.86  | 108.95 |
-|float16| 5.89  | 6.44  | 7.94  | 12.57 | 22.03  | 45.06  |  92.68 |
-|Speedup|       |       | 1.12  |  1.17 |  1.18  |  1.17  |   1.18 |
diff --git a/paddle/contrib/float16/float16_inference_demo.py b/paddle/contrib/float16/float16_inference_demo.py
deleted file mode 100644
index 063227d5d2586d66ad4091133a8edf014da839f8..0000000000000000000000000000000000000000
--- a/paddle/contrib/float16/float16_inference_demo.py
+++ /dev/null
@@ -1,362 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from float16_transpiler import Float16Transpiler
-
-import argparse
-import paddle
-import paddle.fluid as fluid
-import contextlib
-import math
-import sys
-import numpy as np
-import os
-
-parser = argparse.ArgumentParser(
-    'Float16 inference accuracy test and benchmark.')
-parser.add_argument(
-    '--train_batch_size', type=int, default=16, help="Batch size for training.")
-parser.add_argument(
-    '--inf_batch_size', type=int, default=32, help="Batch size for inference.")
-parser.add_argument(
-    '--repeat', type=int, default=1, help="How many times to run the test.")
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'imagenet'],
-    help="Optional dataset for benchmark.")
-parser.add_argument(
-    '--model',
-    type=str,
-    default='vgg',
-    choices=['vgg', 'resnet'],
-    help="Optional model for benchmark.")
-parser.add_argument(
-    '--threshold',
-    type=float,
-    default=0.005,
-    help='Save inference model when test accuracy reach this threshold.')
-parser.add_argument('--learning_rate', type=float, default=0.001)
-args = parser.parse_args()
-
-
-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
-    conv1 = fluid.layers.conv2d(
-        input=input,
-        filter_size=filter_size,
-        num_filters=ch_out,
-        stride=stride,
-        padding=padding,
-        act=None,
-        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
-
-
-def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1]
-    if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-    else:
-        return input
-
-
-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
-    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
-
-
-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
-    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
-
-
-def layer_warp(block_func, input, ch_out, count, stride):
-    res_out = block_func(input, ch_out, stride)
-    for i in range(1, count):
-        res_out = block_func(res_out, ch_out, 1)
-    return res_out
-
-
-def resnet_imagenet(input, depth=50):
-    cfg = {
-        18: ([2, 2, 2, 1], basicblock),
-        34: ([3, 4, 6, 3], basicblock),
-        50: ([3, 4, 6, 3], bottleneck),
-        101: ([3, 4, 23, 3], bottleneck),
-        152: ([3, 8, 36, 3], bottleneck)
-    }
-    stages, block_func = cfg[depth]
-    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
-    pool1 = fluid.layers.pool2d(
-        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
-    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
-    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
-    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
-    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
-    pool2 = fluid.layers.pool2d(
-        input=res4,
-        pool_size=7,
-        pool_type='avg',
-        pool_stride=1,
-        global_pooling=True)
-    return pool2
-
-
-def resnet_cifar10(input, depth=32):
-    assert (depth - 2) % 6 == 0
-
-    n = (depth - 2) // 6
-
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    return pool
-
-
-def vgg16(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
-    return fc2
-
-
-def train(place, save_dirname):
-    if args.data_set == "cifar10":
-        class_dim = 10
-        data_shape = [3, 32, 32]
-    elif args.data_set == "imagenet":
-        class_dim = 102
-        data_shape = [3, 224, 224]
-    else:
-        raise ValueError("%s dataset is not supported" % data_set)
-
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.model == "vgg":
-        print("train vgg")
-        net = vgg16(images)
-    elif args.model == "resnet":
-        print("train resnet")
-        if args.data_set == "cifar10":
-            net = resnet_cifar10(images)
-        elif args.data_set == "imagenet":
-            net = resnet_imagenet(images)
-        else:
-            raise ValueError("%s dataset is not supported" % args.data_set)
-    else:
-        raise ValueError("%s network is not supported" % args.model)
-
-    predict = fluid.layers.fc(input=net, size=class_dim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=predict, label=label)
-
-    #Test program
-    test_program = fluid.default_main_program().clone(for_test=True)
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    optimizer.minimize(avg_cost)
-
-    BATCH_SIZE = args.train_batch_size
-    PASS_NUM = 100
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.flowers.train()
-            if args.data_set == 'imagenet' else paddle.dataset.cifar.train10(),
-            buf_size=128 * 10),
-        batch_size=args.train_batch_size)
-
-    test_reader = paddle.batch(
-        paddle.dataset.flowers.test()
-        if args.data_set == 'imagenet' else paddle.dataset.cifar.test10(),
-        batch_size=args.inf_batch_size)
-
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
-
-    exe.run(fluid.default_startup_program())
-    main_program = fluid.default_main_program()
-
-    for pass_id in range(PASS_NUM):
-        for batch_id, data in enumerate(train_reader()):
-            train_image = np.array(
-                map(lambda x: x[0].reshape(data_shape), data)).astype("float32")
-            train_label = np.array(map(lambda x: x[1], data)).astype("int64")
-            train_label = train_label.reshape([-1, 1])
-
-            exe.run(main_program,
-                    feed={'pixel': train_image,
-                          'label': train_label})
-
-            if (batch_id % 100) == 0:
-                acc_list = []
-                avg_loss_list = []
-                for tid, test_data in enumerate(test_reader()):
-                    test_image = np.array(
-                        map(lambda x: x[0].reshape(data_shape),
-                            test_data)).astype("float32")
-                    test_label = np.array(map(lambda x: x[1],
-                                              test_data)).astype("int64")
-                    test_label = test_label.reshape([-1, 1])
-
-                    loss_t, acc_t = exe.run(
-                        program=test_program,
-                        feed={"pixel": test_image,
-                              "label": test_label},
-                        fetch_list=[avg_cost, acc])
-                    if math.isnan(float(loss_t)):
-                        sys.exit("got NaN loss, training failed.")
-                    acc_list.append(float(acc_t))
-                    avg_loss_list.append(float(loss_t))
-
-                acc_value = np.array(acc_list).mean()
-                avg_loss_value = np.array(avg_loss_list).mean()
-
-                print(
-                    'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Accuracy {3:2.2}'.
-                    format(pass_id, batch_id + 1,
-                           float(avg_loss_value), float(acc_value)))
-
-                if acc_value > args.threshold:
-                    print(
-                        'Save inference model with test accuracy of {0} at {1}'.
-                        format(float(acc_value), save_dirname))
-                    fluid.io.save_inference_model(save_dirname, ["pixel"],
-                                                  [predict], exe)
-                    return
-
-
-def test_accuracy(executor, inference_program, feed_target_names,
-                  fetch_targets):
-    if args.data_set == "cifar10":
-        data_shape = [3, 32, 32]
-    elif args.data_set == "imagenet":
-        data_shape = [3, 224, 224]
-    else:
-        raise ValueError("%s dataset is not supported" % data_set)
-
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == "cifar10" else paddle.dataset.flowers.test(),
-        batch_size=args.inf_batch_size)
-
-    test_num = 0
-    correct_num = 0
-
-    for test_data in test_reader():
-        test_image = np.array(
-            map(lambda x: x[0].reshape(data_shape), test_data)).astype(
-                "float32")
-        test_label = np.array(map(lambda x: x[1], test_data)).astype("int64")
-        test_label = test_label.reshape([-1, 1])
-
-        results = executor.run(program=inference_program,
-                               feed={feed_target_names[0]: test_image},
-                               fetch_list=fetch_targets)
-
-        prediction = np.argmax(results[0], axis=1).reshape([-1, 1])
-        correct_num += np.sum(prediction == test_label)
-        test_num += test_label.size
-
-    print("{0} out of {1} predictions are correct.".format(correct_num,
-                                                           test_num))
-    print("Test accuray is {0}.".format(float(correct_num) / float(test_num)))
-
-
-def infer(place, save_dirname):
-    exe = fluid.Executor(place)
-    inference_scope = fluid.core.Scope()
-
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        print("Load inference model from {0}".format(save_dirname))
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        print("The test set accuracy of inference in float mode is:")
-        test_accuracy(exe, inference_program, feed_target_names, fetch_targets)
-
-        float16_inference_program = inference_program.clone()
-        t = Float16Transpiler()
-        t.transpile(float16_inference_program, place)
-
-        print("The test set accuracy of inference in float16 mode is:")
-        test_accuracy(exe, float16_inference_program, feed_target_names,
-                      fetch_targets)
-
-        fp16_save_dirname = "float16_" + save_dirname
-        fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
-                                      fetch_targets, exe,
-                                      float16_inference_program)
-
-
-@contextlib.contextmanager
-def scope_prog_guard():
-    prog = fluid.Program()
-    startup_prog = fluid.Program()
-    scope = fluid.core.Scope()
-    with fluid.scope_guard(scope):
-        with fluid.program_guard(prog, startup_prog):
-            yield
-
-
-if __name__ == "__main__":
-    if not fluid.core.is_compiled_with_cuda():
-        raise Exception("This test requires CUDA GPUs!")
-
-    place = fluid.CUDAPlace(0)
-    if not fluid.core.is_float16_supported(place):
-        raise Exception(
-            "This test requires compute capability of CUDA GPU >= 5.3!")
-
-    for i in range(args.repeat):
-        with scope_prog_guard():
-            save_dirname = "image_classification_" + args.data_set + "_" + args.model + ".inference.model"
-            train(place, save_dirname)
-            infer(place, save_dirname)
diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py
deleted file mode 100644
index 500f64bed9898fa874cbad2ea69aae05df58023e..0000000000000000000000000000000000000000
--- a/paddle/contrib/float16/float16_transpiler.py
+++ /dev/null
@@ -1,256 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.framework import Program
-from paddle.fluid.executor import global_scope
-
-
-class Float16Transpiler:
-    def transpile(self, program, place, scope=None):
-        '''
-        Transpile the program desc and cast the weights to float16 data type to
-        enable float16 inference.
-
-        Since the operator in a program desc will automatically choose the
-        right compute kernel to run based on the data type of the input tensor.
-        We actually don't need to change the program desc to run in float16 mode.
-
-        However, in this way, users who are used to feeding and fetching tensors 
-        of float32 data type when running typical inference may find it confusing
-        and difficult to run inference in float16 mode as they need to convert
-        input data to float16 dtype and then convert the results back to float32 
-        dtype to match the rest of code.
-
-        So this function appends cast ops to the program desc where necessary so 
-        that users are able to run inference in float16 mode while providing input 
-        tensor (feed_holder) of float data type and obtaining output tensor 
-        (fetch_holder) of float data type. 
-
-        Moreover, it is desired that when we have the scope and program desc to run
-        inference in float32 mode, we can use a single API to do the necessary 
-        modification and then user can run float16 inference on the fly. To make 
-        this happen, this function also create new parameters in the scope to have the 
-        converted float16 weights and change the operators in program desc to use 
-        these new parameters.
-
-        :param program: program to transpile 
-        :type program: Program
-        :param place: inference place 
-        :type place: Place
-        :param scope: inference scope 
-        :type scope: Scope         
-        '''
-        if not isinstance(program, Program):
-            raise TypeError("program should be as Program type")
-        if not isinstance(place, core.CPUPlace) and not isinstance(
-                place, core.CUDAPlace):
-            raise TypeError("place should be as CPUPlace/CUDAPlace type")
-        if scope is None:
-            scope = global_scope()
-        if not isinstance(scope, core._Scope):
-            raise TypeError("scope should be as Scope type or None")
-
-        self.scope = scope
-        self.place = place
-        self.block = program.block(0)
-        self.input_map = {}  # store the input names should be adjusted 
-
-        self._modify_feed_fetch()
-        self._convert_param_to_float16()
-        self._adjust_input(skip=True)
-        self._remove_unused_var()
-
-        # TODO(luotao): use clone() method to flush the program.desc in force, 
-        # since some large program.desc will not be flushed immediately. 
-        # And a better solution will be considered later.
-        program = program.clone()
-
-    # ====================== private transpiler functions =====================
-    def _adjust_input(self, skip=False):
-        '''
-        Change the input variable name in operators.
-
-        When we are in the process of modifying a program desc, we usually 
-        replace some variables with some other variables, where we create 
-        a dictionary input_map to record the one-to-one correspondence
-        between each old variable and the new one. 
-
-        After that, this function will search all the operators that use the 
-        old variables and change the info in op to use the new variables. There 
-        maybe some exceptions to this rule when we are using the float16 transpiler
-        and insert cast ops to cast float32 variable to float16 one. After we 
-        insert the cast op to cast var_1 to var_1_fp16, we don't want to change 
-        the input of cast op to var_1_fp16 after using this function.     
-        '''
-        skip_ops = {"cast"}
-        for i in range(len(self.block.ops)):
-            current_op = self.block.ops[i]
-            if skip and current_op.type in skip_ops:
-                continue
-            for input_arg in current_op.input_arg_names:
-                if input_arg in self.input_map:
-                    current_op._rename_input(input_arg,
-                                             self.input_map[input_arg])
-
-    def _remove_unused_var(self):
-        '''
-        remove unused varibles in program
-        '''
-        args = []
-        for i in range(len(self.block.ops)):
-            current_op = self.block.ops[i]
-            args += current_op.input_arg_names
-            args += current_op.output_arg_names
-        args = list(set(args))  # unique the input and output arguments
-
-        for var in self.block.vars.keys():
-            if var not in args:
-                self.block._remove_var(var)
-
-    def _modify_feed_fetch(self):
-        '''
-        Modify feed fetch op/vars for float16 inference.
-
-        For each feed op:
-        feed_op->feed_target_var
-        
-        Change it to:
-        feed_op->feed_target_var->cast_op(from other dtype to float16)->tmp_var
-
-        For each fetch op:
-        fetch_target_var->fetch_op
-
-        Change it to:
-        tmp_var->cast_op(from float16 to other dtype)->fetch_target_var->fetch_op
-
-        :return: None
-        '''
-
-        def find_op(var):
-            # It is possible that var.op is not up to date after some 
-            # modifications to program desc. Here we force to make it up to date.
-            var.op = None
-            for op in self.block.ops:
-                if var.name in op.output_arg_names:
-                    var.op = op
-                    break
-
-            if var.op is None:
-                raise ValueError("The target variable must have an "
-                                 "associated operator that generates it.")
-
-        i = 0
-        while i < len(self.block.ops):
-            cur_op = self.block.ops[i]
-            if cur_op.type == "feed":
-                var_name = cur_op.output("Out")[0]
-                tmp_var_name = var_name + ".fp16"
-                var = self.block.vars[var_name]
-                tmp_var = self.block.create_var(
-                    name=tmp_var_name.encode('ascii'),
-                    type=var.type,
-                    dtype=core.VarDesc.VarType.FP16,
-                    shape=var.shape,
-                    persistable=var.persistable)
-                self.block._insert_op(
-                    i + 1,
-                    type="cast",
-                    inputs={"X": var},
-                    outputs={"Out": tmp_var},
-                    attrs={
-                        'in_dtype': int(var.dtype),
-                        'out_dtype': int(tmp_var.dtype)
-                    })
-                self.input_map[var_name] = tmp_var_name
-                i = i + 1
-            elif cur_op.type == "fetch":
-                var_name = cur_op.input("X")[0]
-                tmp_var_name = var_name + ".fp16"
-                var = self.block.vars[var_name]
-                tmp_var = self.block.create_var(
-                    name=tmp_var_name.encode('ascii'),
-                    type=var.type,
-                    dtype=core.VarDesc.VarType.FP16,
-                    shape=var.shape,
-                    persistable=var.persistable)
-                find_op(var)
-                var.op._rename_output(var_name, tmp_var_name)
-                self.block._insert_op(
-                    i,
-                    type="cast",
-                    inputs={"X": tmp_var},
-                    outputs={"Out": var},
-                    attrs={
-                        'in_dtype': int(tmp_var.dtype),
-                        'out_dtype': int(var.dtype)
-                    })
-                i = i + 1
-            i = i + 1
-
-    def _convert_param_to_float16(self):
-        def _get_no_fp16_conversion_var_names():
-            '''
-            Get the set of input variable names that shouldn't be converted to float16.
-
-            When we want to run inference in float16 mode, most parameters need to be 
-            firstly converted to float16. However, there are some parameters that 
-            shouldn't be converted to float16 because the corresponding operator 
-            requires float32 parameters even in float16 mode (when the input data is 
-            of float16 data type). Currently, the only operator that has this exclusion 
-            is the batch norm op.
-
-            :return: set of input variable names 
-            :type var_names: set         
-            '''
-            op_names = {'batch_norm'}
-            var_names = []
-            for op in self.block.ops:
-                if op.type in op_names:
-                    var_names += op.input_arg_names
-            return set(var_names)
-
-        def _should_be_converted(var):
-            return var.persistable and \
-                   var.name not in self.no_conversion_vars and \
-                   var.type != core.VarDesc.VarType.FEED_MINIBATCH and \
-                   var.type != core.VarDesc.VarType.FETCH_LIST
-
-        self.no_conversion_vars = _get_no_fp16_conversion_var_names()
-        conversion_var_list = filter(_should_be_converted,
-                                     self.block.vars.values())
-        for var in conversion_var_list:
-            fp16_var_name = var.name + ".fp16"
-            fp16_var = self.block.create_parameter(
-                name=fp16_var_name.encode('ascii'),
-                type=var.type,
-                dtype=core.VarDesc.VarType.FP16,
-                shape=var.shape)
-
-            # cast the data in the tensor of the original var to float16
-            # data type and store it in the tensor of the new float16 var
-            self.scope.var(fp16_var_name)
-            fp16_tensor = self.scope.find_var(fp16_var_name).get_tensor()
-            tensor = np.array(self.scope.find_var(var.name).get_tensor())
-            # After the old tensor data is converted to np.float16, view(np.uint16)
-            # is used so that the internal memory of the numpy array will be 
-            # reinterpreted to be of np.uint16 data type, which is binded to fluid 
-            # float16 data type via the help of pybind in tensor_py.h. 
-            fp16_tensor.set(
-                tensor.astype(np.float16).view(np.uint16), self.place)
-
-            # old var will be replaced by the fp16 var in program desc
-            self.input_map[var.name] = fp16_var_name
-            self.block._remove_var(var.name)
diff --git a/paddle/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh
deleted file mode 100755
index 34cb7a12db171915f2bc7df8787dd62cd381de68..0000000000000000000000000000000000000000
--- a/paddle/contrib/float16/run_float16_demo.sh
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-
-BUILD_PATH=/paddle/fp16_build
-WHEEL_PATH=$BUILD_PATH/python/dist
-INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
-DEMO_PATH=/paddle/paddle/contrib/float16
-
-# Use the single most powerful CUDA GPU on your machine
-export CUDA_VISIBLE_DEVICES=0
-
-# Build the PaddlePaddle Fluid wheel package and install it.
-mkdir -p $BUILD_PATH && cd $BUILD_PATH
-cmake .. -DWITH_AVX=OFF \
-         -DWITH_MKL=OFF \
-         -DWITH_GPU=ON \
-         -DWITH_TESTING=ON \
-         -DWITH_PROFILER=ON \
-make -j `nproc`
-pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
-
-cd $DEMO_PATH
-# Clear previous log results
-rm -f *.log
-
-# Test the float16 inference accuracy of resnet32 on cifar10 data set
-stdbuf -oL python float16_inference_demo.py \
-       --data_set=cifar10 \
-       --model=resnet \
-       --threshold=0.6 \
-       --repeat=10 \
-       2>&1 | tee -a float16_inference_accuracy.log
-
-# Sleep to cool down the GPU for consistent benchmarking
-sleep 2m
-
-# benchmarking parameters
-REPEAT=1000
-MAXIMUM_BATCH_SIZE=512
-
-for ((batch_size = 1; batch_size <= MAXIMUM_BATCH_SIZE; batch_size *= 2)); 
-do
-
-  # Test inference benchmark of vgg16 on imagenet
-  stdbuf -oL python float16_inference_demo.py \
-         --data_set=imagenet \
-         --model=vgg \
-         --threshold=0.001 \
-         --repeat=1 \
-
-  $INFER_PATH/test_inference_image_classification_vgg \
-      --dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
-      --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
-      --repeat=$REPEAT \
-      --batch_size=$batch_size \
-      --skip_cpu=true \
-      2>&1 | tee -a imagenet_vgg16_benchmark.log
-
-  sleep 2m
-
-  # Test inference benchmark of resnet50 on imagenet
-  stdbuf -oL python float16_inference_demo.py \
-         --data_set=imagenet \
-         --model=resnet \
-         --threshold=0.001 \
-         --repeat=1 \
-
-  $INFER_PATH/test_inference_image_classification_resnet \
-      --dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
-      --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
-      --repeat=$REPEAT \
-      --batch_size=$batch_size \
-      --skip_cpu=true \
-      2>&1 | tee -a imagenet_resnet50_benchmark.log
-
-  sleep 2m
-
-  # Test inference benchmark of vgg16 on cifar10
-  stdbuf -oL python float16_inference_demo.py \
-         --data_set=cifar10 \
-         --model=vgg \
-         --threshold=0.001 \
-         --repeat=1 \
-
-  $INFER_PATH/test_inference_image_classification_vgg \
-      --dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
-      --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
-      --repeat=$REPEAT \
-      --batch_size=$batch_size \
-      --skip_cpu=true \
-      2>&1 | tee -a cifar10_vgg16_benchmark.log
-
-  sleep 1m
-
-  # Test inference benchmark of resnet32 on cifar10
-  stdbuf -oL python float16_inference_demo.py \
-         --data_set=cifar10 \
-         --model=resnet \
-         --threshold=0.001 \
-         --repeat=1 \
-
-  $INFER_PATH/test_inference_image_classification_vgg \
-      --dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
-      --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
-      --repeat=$REPEAT \
-      --batch_size=$batch_size \
-      --skip_cpu=true \
-      2>&1 | tee -a cifar10_resnet32_benchmark.log
-
-  sleep 1m
-
-done
diff --git a/paddle/fluid/.clang-format b/paddle/fluid/.clang-format
deleted file mode 100644
index 29282dc87e2c499988c17d90d47d44cd5cf7f115..0000000000000000000000000000000000000000
--- a/paddle/fluid/.clang-format
+++ /dev/null
@@ -1,5 +0,0 @@
----
-Language:        Cpp
-BasedOnStyle:  Google
-Standard:  Cpp11 
-...
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
deleted file mode 100644
index 77ad4f5415155a4bc90ba980514711edd67725f1..0000000000000000000000000000000000000000
--- a/paddle/fluid/API.spec
+++ /dev/null
@@ -1,1088 +0,0 @@
-paddle.fluid.Program ('paddle.fluid.framework.Program', ('document', '7364a01d7b9132a435e46162c7fbd6c6'))
-paddle.fluid.Program.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.Program.block (ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None), ('document', '86cd9499e226be661a3d686260ee1150'))
-paddle.fluid.Program.clone (ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)), ('document', '11777d4121a64566a746e55497a4b78c'))
-paddle.fluid.Program.current_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd601c7719e425e3d9cf862ea4ad194ca'))
-paddle.fluid.Program.global_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd64ea1dc96e9f674499ea3006d470aa4'))
-paddle.fluid.Program.list_vars (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '32c14b0f12baae4b352200fa09b5e789'))
-paddle.fluid.Program.parse_from_string (ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None), ('document', 'b6a7ffb239a30bf2ce58cfaca8d8b8d5'))
-paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)), ('document', '89acca639baf00f3ad08b9d827e81706'))
-paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'ba609cb02e4e55e8d626723567ef1778'))
-paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '853718df675e59aea7104f3d61bbf11d'))
-paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '78fb5c7f70ef76bcf4a1862c3f6b8191'))
-paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '917d313881ff990de5fb18d98a9c7b42'))
-paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '1f2bb6ece651e44117652d2d7bedecf5'))
-paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '956bab564ebc69ffd17195c08cc8ffa0'))
-paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c2562241744aabe3fff1b59af22dd281'))
-paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '301bae0d8e02cc9eec5be02f052f11c6'))
-paddle.fluid.is_compiled_with_cuda (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '60c7f107a5050aeb58bb74eb175672b5'))
-paddle.fluid.Executor ('paddle.fluid.executor.Executor', ('document', '34e8c1769313fbeff7817212dda6259e'))
-paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a584496aa1343f36eebf3c46b323a74'))
-paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', 'bedc29ad01c1b911e99032ee1e19ac59'))
-paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', '4cfcd9c15b766a51b584cc46d38f1ad8'))
-paddle.fluid.Executor.train_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '28f50904a0213f110947a30e0438529c'))
-paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f65788d9ead293ada47551339df12203'))
-paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'e6c073ed237001aaba7bff976b62b122'))
-paddle.fluid.DistributeTranspiler ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspiler', ('document', 'b2b19821c5dffcd11473d6a4eef089af'))
-paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'b1951949c6d21698290aa8ac69afee32'))
-paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'c89fc350f975ef827f5448d68af388cf'))
-paddle.fluid.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '90a40b80e0106f69262cc08b861c3e39'))
-paddle.fluid.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '0e47f020304e2b824e87ff03475c17cd'))
-paddle.fluid.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '418c7e8b268e9be4104f2809e654c2f7'))
-paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, True)), ('document', '2348247f684bfd5bb9466470f35be064'))
-paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd38c5b8b2b2e0bb19bcf1b581a80a7e4'))
-paddle.fluid.DistributeTranspilerConfig ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspilerConfig', ('document', '550b8c767a8ae1a2eb74b18924ddc975'))
-paddle.fluid.DistributeTranspilerConfig.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.ParallelExecutor ('paddle.fluid.parallel_executor.ParallelExecutor', ('document', '2b4d2e859f2e0c6161f4fed995f7956d'))
-paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.ParallelExecutor.drop_local_exe_scopes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '77c739744ea5708b80fb1b37cc89db40'))
-paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '0af092676e5b1320bb4232396154ce4b'))
-paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff'))
-paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb'))
-paddle.fluid.DataFeedDesc ('paddle.fluid.data_feed_desc.DataFeedDesc', ('document', '43877a0d9357db94d3dbc7359cbe8c73'))
-paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '9c6615854b61caa5f0d3e6ccc5e51338'))
-paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'a34790bff4a2891713ddd644db56418d'))
-paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'fdd07ce63e72bed57f2c0db5bec5720f'))
-paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'c23a79dfa04edd014b477bd4b183da06'))
-paddle.fluid.CompiledProgram ('paddle.fluid.compiler.CompiledProgram', ('document', '598d294107d44d7620bce76527a92c37'))
-paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph', 'build_strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '1c7c6171bbf6d77f2fce0166aa0ec43b'))
-paddle.fluid.ExecutionStrategy ('paddle.fluid.core_avx.ExecutionStrategy', ('document', '535ce28c4671176386e3cd283a764084'))
-paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.ExecutionStrategy) -> None
-paddle.fluid.BuildStrategy ('paddle.fluid.core_avx.BuildStrategy', ('document', 'eec64b9b7cba58b0a63687b4c34ffe56'))
-paddle.fluid.BuildStrategy.GradientScaleStrategy ('paddle.fluid.core_avx.GradientScaleStrategy', ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
-paddle.fluid.BuildStrategy.ReduceStrategy ('paddle.fluid.core_avx.ReduceStrategy', ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
-paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy) -> None
-paddle.fluid.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b'))
-paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '869104f47e6fd21d897c3fcc426aa942'))
-paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '046d7c43d67e08c2660bb3bd7e081015'))
-paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'ffcee38044975c29f2ab2fec0576f963'))
-paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '1bb9454cf09d71f190bb51550c5a3ac9'))
-paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '116a9ed169e7ff0226faccff3c29364c'))
-paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cfa84ef7c5435625bff4cc132cb8a0e3'))
-paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment', 'program_only'], varargs=None, keywords=None, defaults=(None, None, None, True, False)), ('document', 'fc82bfd137a9b1ab8ebd1651bd35b6e5'))
-paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '2f54d7c206b62f8c10f4f9d78c731cfd'))
-paddle.fluid.io.batch (ArgSpec(args=['reader', 'batch_size', 'drop_last'], varargs=None, keywords=None, defaults=(False,)), ('document', 'cf2869b408b39cadadd95206b4e03b39'))
-paddle.fluid.io.PyReader ('paddle.fluid.reader.PyReader', ('document', 'b03399246f69cd6fc03b43e87af8bd4e'))
-paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable', 'return_list'], varargs=None, keywords=None, defaults=(None, None, True, True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '4364e836e3cb8ab5e68e411b763c50c7'))
-paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', 'efa4c8b90fe6d99dcbda637b70351bb1'))
-paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '6c11980092720de304863de98074a64a'))
-paddle.fluid.io.PyReader.next (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '08b2fd1463f3ea99d79d17303988349b'))
-paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '7432197701fdaab1848063860dc0b97e'))
-paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'a0983fb21a0a51e6a31716009fe9a9c1'))
-paddle.fluid.io.DataLoader ('paddle.fluid.reader.DataLoader', ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.io.DataLoader.__init__ 
-paddle.fluid.io.DataLoader.from_dataset (ArgSpec(args=['dataset', 'places', 'drop_last'], varargs=None, keywords=None, defaults=(True,)), ('document', '58e8bffa033f26b00b256c8bb1daff11'))
-paddle.fluid.io.DataLoader.from_generator (ArgSpec(args=['feed_list', 'capacity', 'use_double_buffer', 'iterable', 'return_list'], varargs=None, keywords=None, defaults=(None, None, True, True, False)), ('document', '8034bdb488fa18d60c4ffb0ba9658337'))
-paddle.fluid.io.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
-paddle.fluid.io.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
-paddle.fluid.io.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
-paddle.fluid.io.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d'))
-paddle.fluid.io.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4'))
-paddle.fluid.io.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d'))
-paddle.fluid.io.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad'))
-paddle.fluid.io.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796'))
-paddle.fluid.io.PipeReader ('paddle.reader.decorator.PipeReader', ('document', 'd3c250618f98c1a5fb646f869016a98e'))
-paddle.fluid.io.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.io.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45'))
-paddle.fluid.io.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0'))
-paddle.fluid.io.Fake ('paddle.reader.decorator.Fake', ('document', '0d8f4847b99bed6d456ade0d903202e1'))
-paddle.fluid.io.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.ConstantInitializer ('paddle.fluid.initializer.ConstantInitializer', ('document', '798f1fd87cbe9798d001ffb6e616415d'))
-paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.UniformInitializer ('paddle.fluid.initializer.UniformInitializer', ('document', '587b7035cd1d56f76f2ded617b92521d'))
-paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed', 'diag_num', 'diag_step', 'diag_val'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0, 0, 0, 1.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.NormalInitializer ('paddle.fluid.initializer.NormalInitializer', ('document', '279a0d89bf01138fbf4c4ba14f22099b'))
-paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.TruncatedNormalInitializer ('paddle.fluid.initializer.TruncatedNormalInitializer', ('document', 'b8e90aad6ee5687cb5f2b6fd404370d1'))
-paddle.fluid.initializer.TruncatedNormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.XavierInitializer ('paddle.fluid.initializer.XavierInitializer', ('document', '3d5676f1a5414aa0c815d793a795ccb3'))
-paddle.fluid.initializer.XavierInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.BilinearInitializer ('paddle.fluid.initializer.BilinearInitializer', ('document', '8a40b54fe33c19c3edcf6624ffae5d03'))
-paddle.fluid.initializer.BilinearInitializer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd389912dc079cbef432335a00017cec0'))
-paddle.fluid.initializer.MSRAInitializer ('paddle.fluid.initializer.MSRAInitializer', ('document', 'b99e0ee95e2fd02640cb4b08a7ae80cc'))
-paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)), ('document', '53c757bed9345f2ad3361902531e7cf5'))
-paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5f55553caf939d270c7fe8dc418084b2'))
-paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'eaa04fd68661a3af59abd0e19b3b6eda'))
-paddle.fluid.initializer.NumpyArrayInitializer ('paddle.fluid.initializer.NumpyArrayInitializer', ('document', '064f134a27c16372967d450f499762ab'))
-paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'd4ac047e0d5e6b7b1c5ff6ef7d7cfff5'))
-paddle.fluid.one_hot (ArgSpec(args=['input', 'depth', 'allow_out_of_range'], varargs=None, keywords=None, defaults=(False,)), ('document', 'eef66730acc806088f9e8ba90252bda1'))
-paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, None)), ('document', '0dc8181f14a33f91fbae9385a9b3d9fd'))
-paddle.fluid.layers.center_loss (ArgSpec(args=['input', 'label', 'num_classes', 'alpha', 'param_attr', 'update_center'], varargs=None, keywords=None, defaults=(True,)), ('document', '7129819d94625c6104054e8187768589'))
-paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'd8e405486a1e4e189b51d6ee28d67b1e'))
-paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', '6d3ee14da70adfa36d85c40b18716ef2'))
-paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'c37d51aad655c8a9f9b045c64717320a'))
-paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3'))
-paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e'))
-paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9045b8971e4232132ec9952695f4c3ae'))
-paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '5ce117258e243be1c81539e254178d90'))
-paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '8e6ce424cf9e261ef32ee229c06a6e66'))
-paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'f43c659ca1749a3f0ff2231e6dfda07d'))
-paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6263dfdeb6c670fa0922c9cbc8fb1bf4'))
-paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'bbb9e708bab250359864fefbdf48e9d9'))
-paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b02844e0ad4bd713c5fe6802aa13219c'))
-paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'padding_start', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, True, None, None, None, None, None)), ('document', '2bf23e7884c380c3b27f2709aa322cb9'))
-paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '06de9adb5994f6f8cb806c75b55550af'))
-paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '71b09227709475fa178c1739dff64af6'))
-paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test', 'pad_value'], varargs=None, keywords=None, defaults=(False, 0.0)), ('document', 'e90a93251c52dc4e6fb34fb3991b3f82'))
-paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'eaa9d0bbd3d4e017c8bc4ecdac483711'))
-paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', 'cee673c79e3ff4582656a24e04f841e5'))
-paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'be7e530dcbd603962e25573a63eb145e'))
-paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '053b1a855f13a066d005759171724bc6'))
-paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '52343203de40afe29607397e13aaf0d2'))
-paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '55db6ae7275fb9678a6814aebab81a9c'))
-paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '9e5a9f4f6d82d34a33d9ca632379cbcc'))
-paddle.fluid.layers.instance_norm (ArgSpec(args=['input', 'epsilon', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None)), ('document', '02972097e089629efdb0ed9404fd36ae'))
-paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '2460b30fb87037555208fa8ac6fc1787'))
-paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0'))
-paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', 'ab58296b567bf0c686084add7f3280a4'))
-paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', 'fe15dbfb17d97d3d29b2fa7ee6390ee6'))
-paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '10e122eb755c2bd1f78ef2332b28f1a0'))
-paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '858c432e7cbd8bb952cc2eb555457d50'))
-paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'df08b9c499ab3a90f95d08ab5b6c6c62'))
-paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e478180d5bc010a84f35af958cafa62c'))
-paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'fe126c58e4339410e875ab1eba246d21'))
-paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'dd5f06fb7cf39ca06cbab4abd03e6893'))
-paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'a3024789eba11a70c2ef27c358173400'))
-paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '10023caec4d7f78c3b901f023a1feaa7'))
-paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '1a1c91625ce3c32646f69ca10d4d1da7'))
-paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'b386471f0476c80c61d8c8672278063d'))
-paddle.fluid.layers.reduce_all (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '8ab17ab51f68a6e76302b27f928cedf3'))
-paddle.fluid.layers.reduce_any (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '0483ac3b7a99e879ccde583ae8d7a60d'))
-paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'f2dfd65b859de9844e7261e7a4503f63'))
-paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '1af2e3a887e4f914f9d6650406186ab6'))
-paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '39fbc5437be389f6c0c769f82fc1fba2'))
-paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', '558d13133596209190df9a624264f28f'))
-paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '78cf3a7323d1a7697658242e13f63759'))
-paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'input_length', 'padding_value', 'name'], varargs=None, keywords=None, defaults=(None, 0, None)), ('document', '9abb7bb8d267e017620a39a146dc47ea'))
-paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(True, None, None, None)), ('document', '77cbfb28cd2fc589f589c7013c5086cd'))
-paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', 'c1df110ea65998984f564c5c10abc54a'))
-paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', '3720b4a386585094435993deb028b592'))
-paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e50940f3ce5a08cc477b72f517491bf3'))
-paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(0, False, None, None)), ('document', 'a5be881ada816e47ea7a6ee4396da357'))
-paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'f568714a876425004aca4ea2d4a27701'))
-paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8e72db173d4c082e27cb11f31d8c9bfa'))
-paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '33134416fc27dd65a767e5f15116ee16'))
-paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '83d4ca6dfb957912807f535756e76992'))
-paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', 'd4435a63d34203339831ee6a86ef9242'))
-paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50'))
-paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096'))
-paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '1d8a1c8b686b55631ba1b77805e4eacf'))
-paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23'))
-paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', '79797f827d89ae72c77960e9696883a9'))
-paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '65231cc8281815124934b1439fbb750c'))
-paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '9461e67095a6fc5d568fb2ce8fef66ff'))
-paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax', 'axis'], varargs=None, keywords=None, defaults=(False, -100, True, False, -1)), ('document', '54e1675aa0364f4a78fa72804ec0f413'))
-paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'ecb75c1b00c4c76c98b482f633b7a10c'))
-paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth', 'allow_out_of_range'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ec4115591be842868c86b2e5334245c6'))
-paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '98e7927f09ee2270535b29f048e481ec'))
-paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'ca73fdc4551c5765c92eb00f24874289'))
-paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbac07662a6e22e8e299ced880c7775'))
-paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b9bd3129d36a70e7c4385df51ff71c62'))
-paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', '74498d37dd622ac472cb36887fce09ea'))
-paddle.fluid.layers.lod_append (ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=None), ('document', '37663c7c179e920838a250ea0e28d909'))
-paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', '73d297256da8954617996958d26ee93d'))
-paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '36b6e58678956585e5b30aa3de123a60'))
-paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '95aa1972983f30fe9b5a3713e523e20f'))
-paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '214f1dfbe95a628600bbe99e836319cf'))
-paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', '49368d724023a66b41b0071be41c0ba5'))
-paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '9a7a3b88a4fae41d58d3ca9b10ba0591'))
-paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '7e8e4bf1f0f8612961ed113e8af8f0c5'))
-paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1, 'NCHW')), ('document', 'd29d829607b5ff12924197a3ba296c89'))
-paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', 'bd97ebfe4bdf5110a5fcb8ecb626a447'))
-paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1, 'NCHW')), ('document', '44da7890c8a362a83a1c0902a1dc1e4d'))
-paddle.fluid.layers.resize_trilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1, 'NCDHW')), ('document', '5b4d0f823f94c260fe5e6f7eec60a797'))
-paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 'NCHW')), ('document', '0107a5cbae1aef3f381d3d769a6068eb'))
-paddle.fluid.layers.gather (ArgSpec(args=['input', 'index', 'overwrite'], varargs=None, keywords=None, defaults=(True,)), ('document', 'f985c9b66e3aec96fa753a8eb44c991c'))
-paddle.fluid.layers.gather_nd (ArgSpec(args=['input', 'index', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3cc24f9cf135770aa6263dba25b457f9'))
-paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name', 'overwrite'], varargs=None, keywords=None, defaults=(None, True)), ('document', '69b22affd4a6326502af166f04c095ab'))
-paddle.fluid.layers.scatter_nd_add (ArgSpec(args=['ref', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c2fa5ee7484b52b95a28abf1d8827cd0'))
-paddle.fluid.layers.scatter_nd (ArgSpec(args=['index', 'updates', 'shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '14b5449ce42f8ff4ac4ce79b41c86cc5'))
-paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'abe3f714120117a5a3d3e639853932bf'))
-paddle.fluid.layers.random_crop (ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)), ('document', '042af0b8abea96b40c22f6e70d99e042'))
-paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', 'e714b4aa7993dfe9c1a38886875dbaac'))
-paddle.fluid.layers.relu (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0942c174f4f6fb274976d4357356f6a2'))
-paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'f93c61f5b0bf933cd425a64dca2c4fdd'))
-paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '02f668664e3bfc4df6c00d7363467140'))
-paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'ba3621917d5beffd3d022b88fbf6dc46'))
-paddle.fluid.layers.crop_tensor (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'cb855453e3506bf54c5c013616ffddfb'))
-paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8eb36596bb43d7a907d3397c7aedbdb3'))
-paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '6fc86ed23b420c8a0f6c043563cf3937'))
-paddle.fluid.layers.elu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '9af1926c06711eacef9e82d7a9e4d308'))
-paddle.fluid.layers.relu6 (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)), ('document', '538fc860b2a1734e118b94e4a1a3ee67'))
-paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'ca34f88ff61cf2a7f4c97a493d6000d0'))
-paddle.fluid.layers.stanh (ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None)), ('document', '1e1efad868714425da15c785dfb533a1'))
-paddle.fluid.layers.hard_sigmoid (ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)), ('document', '607d79ca873bee40eed1c79a96611591'))
-paddle.fluid.layers.swish (ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'e0dc7bc66cba939033bc028d7a62c5f4'))
-paddle.fluid.layers.prelu (ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '2da40e447716338affebfe058d05d9a9'))
-paddle.fluid.layers.brelu (ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None)), ('document', '49580538249a52c857fce75c94ad8af7'))
-paddle.fluid.layers.leaky_relu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None)), ('document', '1eb3009c69060299ec87949ee0d4b9ae'))
-paddle.fluid.layers.soft_relu (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None)), ('document', '6455afd2498b00198f53f83d63d6c6a4'))
-paddle.fluid.layers.flatten (ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b52306659a21e6b118eed49fe2c155a1'))
-paddle.fluid.layers.sequence_mask (ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)), ('document', '6c3f916921b24edaad220f1fcbf039de'))
-paddle.fluid.layers.stack (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', 'a76f347bf27ffe21b990340d5d9524d5'))
-paddle.fluid.layers.pad2d (ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)), ('document', '3f3abdb795a5c2aad8c2312249551ce5'))
-paddle.fluid.layers.unstack (ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b0c4ca08d4eb295189e1b107c920d093'))
-paddle.fluid.layers.sequence_enumerate (ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b870fed41abd2aecf929ece65f555fa1'))
-paddle.fluid.layers.unique (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=('int32',)), ('document', 'cab0b06e5683875f12f0efc62fa230a9'))
-paddle.fluid.layers.unique_with_counts (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=('int32',)), ('document', '1cb59c65b41766116944b8ed1e6ad345'))
-paddle.fluid.layers.expand (ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7b97042c3ba55fb5fec6a06308523b73'))
-paddle.fluid.layers.sequence_concat (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b992616c1afbd6b0c2a897ac23036381'))
-paddle.fluid.layers.scale (ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)), ('document', '463e4713806e5adaa4d20a41e2218453'))
-paddle.fluid.layers.elementwise_add (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '5c0fb7298aec32525f96d451ae4c2851'))
-paddle.fluid.layers.elementwise_div (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '1da49b7cda887dd84087ef8c060fcf6a'))
-paddle.fluid.layers.elementwise_sub (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '992559c8327c61babd2ed25fc9047fbf'))
-paddle.fluid.layers.elementwise_mul (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '213db11a61dcb0f31159d343cc35e2f5'))
-paddle.fluid.layers.elementwise_max (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '409167a1409ec31b0d3a2f8852a7943f'))
-paddle.fluid.layers.elementwise_min (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '4e1322836eb69473d5606bfe346c5375'))
-paddle.fluid.layers.elementwise_pow (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'b9e7e9fa1ca28d8b6f07cc59eadb4a02'))
-paddle.fluid.layers.elementwise_mod (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '614984304f810f3ddae6b489ec01296b'))
-paddle.fluid.layers.elementwise_floordiv (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'a8c4b26d899246378e878f169582c7a4'))
-paddle.fluid.layers.uniform_random_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)), ('document', 'cfa120e583cd4a5bfa120c8a26f98a28'))
-paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', 'ebbf399d4e03190ce5dc9488f05c92f4'))
-paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', 'c39b647b6cf08e058d96ee503d5284fe'))
-paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', 'b24d0b21361c4bb8ef2cec8c26fb12b2'))
-paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'f4b60847cb0f1ae00823ba6fb1b11310'))
-paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '315b4870f294e33a27ecbdf440bed3ff'))
-paddle.fluid.layers.strided_slice (ArgSpec(args=['input', 'axes', 'starts', 'ends', 'strides'], varargs=None, keywords=None, defaults=None), ('document', '340d8d656272ea396b441aab848429a2'))
-paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'bf61c8f79d795a8371bdb3b5468aa82b'))
-paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '096df0e0273145ab80ed119a4c294db3'))
-paddle.fluid.layers.size (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'cf2e156beae36378722666c4c33bebfe'))
-paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '12db97c6c459c0f240ec7006737174f2'))
-paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '15adbc561618b7db69671e02009bea67'))
-paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '77ccf37b710c507dd97e03f08ce8bb29'))
-paddle.fluid.layers.logical_not (ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6e2fe8a322ec69811f6507d22acf8f9f'))
-paddle.fluid.layers.clip (ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ce33756573c572da67302499455dbcd'))
-paddle.fluid.layers.clip_by_norm (ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '99a1b9012d9c4495efc89d69958c3be7'))
-paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '597257fb94d0597c404a6a5c91ab5258'))
-paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', '784b7e36cea88493f9e37a41b10fbf4d'))
-paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '7637c974f2d749d359acae9062c4d96f'))
-paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '22df6542f3f9aa3f34c0c2dab5dc1d80'))
-paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '26decdea9376b6b9a0d3432d82ca207b'))
-paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f85b263b7b6698d000977529a28f202b'))
-paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65c8362e48810b8226e311c5d046db51'))
-paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', '9f303c67538e468a36c5904a0a3aa110'))
-paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '18ec2e3afeb90e70c8b73d2b71c40fdb'))
-paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3'))
-paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5d16663e096d7f04954c70ce1cc5e195'))
-paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'e3993a477c94729526040ff65d95728e'))
-paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e399f9436fed5f7ff480d8532e42c937'))
-paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '45fc3652a8e1aeffbe4eba371c54f756'))
-paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2b0e5d5c155ce24bafc38b78cd0b164'))
-paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3e60aec040a6f740a130353323580bff'))
-paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'baa7327ed89df6b7bdd32f9ffdb62f63'))
-paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '276a1213dd431228cefa33c3146df34a'))
-paddle.fluid.layers.temporal_shift (ArgSpec(args=['x', 'seg_num', 'shift_ratio', 'name'], varargs=None, keywords=None, defaults=(0.25, None)), ('document', '13b1cdcb01f5ffdc26591ff9a2ec4669'))
-paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb'))
-paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '42d5155374f69786300d90d751956998'))
-paddle.fluid.layers.prroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(1.0, 1, 1, None)), ('document', '454c7ea8c73313dd41513929d7526303'))
-paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '07cb0d95a646dba1b9cc7cdce89e59f0'))
-paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '11bb8e62cc9256958eff3991fe4834da'))
-paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '18bc95c62d3300456c3c7da5278b47bb'))
-paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '6b6ee1170fe20a79cf0631a1f49b0df2'))
-paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '7e5cac851fd9bad344230e1044b6a565'))
-paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', '20992b20d19c2e5983f366150827b4a6'))
-paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_cvm'], varargs=None, keywords=None, defaults=(True,)), ('document', 'c03490ffaa1b78258747157c313db4cd'))
-paddle.fluid.layers.where (ArgSpec(args=['condition'], varargs=None, keywords=None, defaults=None), ('document', 'b1e1487760295e1ff55307b880a99e18'))
-paddle.fluid.layers.sign (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'fa2f457a81714430c5677c2d68744728'))
-paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'modulated', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, True, None)), ('document', '335193ac57d41d7199f8d26d30c069b1'))
-paddle.fluid.layers.unfold (ArgSpec(args=['x', 'kernel_sizes', 'strides', 'paddings', 'dilations', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None)), ('document', '3f884662ad443d9ecc2b3734b4f61ad6'))
-paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '47c5d1c890b36fa00ff3285c9398f613'))
-paddle.fluid.layers.filter_by_instag (ArgSpec(args=['ins', 'ins_tag', 'filter_tag', 'is_lod'], varargs=None, keywords=None, defaults=None), ('document', '7703a2088af8de4128b143ff1164ca4a'))
-paddle.fluid.layers.shard_index (ArgSpec(args=['input', 'index_num', 'nshards', 'shard_id', 'ignore_value'], varargs=None, keywords=None, defaults=(-1,)), ('document', 'c4969dd6bf164f9e6a90414ea4f4e5ad'))
-paddle.fluid.layers.hard_swish (ArgSpec(args=['x', 'threshold', 'scale', 'offset', 'name'], varargs=None, keywords=None, defaults=(6.0, 6.0, 3.0, None)), ('document', '6a5152a7015c62cb8278fc24cb456459'))
-paddle.fluid.layers.mse_loss (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'd9ede6469288636e1b3233b461a165c9'))
-paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '9d7806e31bdf727c1a23b8782a09b545'))
-paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '88367daf9a30c9ab83adc5d7221e23ef'))
-paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '44724c493f41a124abc7531c2740e2e3'))
-paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', 'd78a1c7344955c5caed8dc13adb7beb6'))
-paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '2edf37d57862b24a7a26aa19a3573f73'))
-paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d1a4bc97bbce9fa1d4f7a4200a771ff'))
-paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'aaf0176c743c43e9bc684dd7dfac25c5'))
-paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '021272f30e0cdf7503586815378abfb8'))
-paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', '47ea8b8c91879e50c9036e418b00ef4a'))
-paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '1e44a534cf7d26ab230aa9f5e4e0525a'))
-paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '764c095ba4562ae740f979e970152d6e'))
-paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b3f30feb5dec8f110d7393ffeb30dbd9'))
-paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '5df743d578638cd2bbb9369499b44af4'))
-paddle.fluid.layers.assign (ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)), ('document', '8bd94aef4e123986d9a8c29f67b5532b'))
-paddle.fluid.layers.fill_constant_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)), ('document', '3551aa494e88d0f271e40cd45d6e3020'))
-paddle.fluid.layers.fill_constant (ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'd6b76c7d2c7129f8d713ca74f1c2c287'))
-paddle.fluid.layers.argmin (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '3dd54487232d05df4d70fba94b7d0b79'))
-paddle.fluid.layers.argmax (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '7f47cc9aa7531b6bd37c5c96bc7f0469'))
-paddle.fluid.layers.argsort (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '9792371e3b66258531225a5551de8961'))
-paddle.fluid.layers.ones (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', '812c623ed52610b9773f9fc05413bc34'))
-paddle.fluid.layers.zeros (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', '95379f9288c2d05356ec0e2375c6bc57'))
-paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None), ('document', '628135603692137d52bcf5a8d8d6816d'))
-paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '51a0fa1cfaf2507c00a215adacdb8a63'))
-paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '129cf426e71452fe8276d616a6dc21ae'))
-paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'b9fff4ffc8d11934cde099f4c39bf841'))
-paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', 'a45b42f21bc5a4e84b60981a3d629ab3'))
-paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '3663d1148946eed4c1c34c81be586b9e'))
-paddle.fluid.layers.zeros_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd88a23bcdc443719b3953593f7cef14a'))
-paddle.fluid.layers.ones_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd18d42059c6b189cbd3fab2fcb206c15'))
-paddle.fluid.layers.diag (ArgSpec(args=['diagonal'], varargs=None, keywords=None, defaults=None), ('document', '88a15e15f0098d549f07a01eaebf9ce3'))
-paddle.fluid.layers.eye (ArgSpec(args=['num_rows', 'num_columns', 'batch_shape', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 'float32')), ('document', '25389d1e239a5d1cda66298f908ec549'))
-paddle.fluid.layers.While ('paddle.fluid.layers.control_flow.While', ('document', '50110155608a00f43d3d3fd1be41dcb4'))
-paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Switch ('paddle.fluid.layers.control_flow.Switch', ('document', 'a1c5ef8ff117d7d6ba8940ec104f02ce'))
-paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Switch.case (ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', 'f88b5787bb80ae6b8bf513a70dabbdc1'))
-paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '3f913b5069ad40bd85d89b33e4aa5939'))
-paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '556de793fdf24d515f3fc91260e2c048'))
-paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '04af32422c3a3d8f6040aeb406c82768'))
-paddle.fluid.layers.less_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '7b6d952a9f6340a044cfb91c16aad842'))
-paddle.fluid.layers.greater_than (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '55710e2fafeda70cd1b53d7509712499'))
-paddle.fluid.layers.greater_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '14bff27b2be5e60eaa30e41925265beb'))
-paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '788aa651e8b9fec79d16931ef3a33e90'))
-paddle.fluid.layers.not_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '57adebb8858ffab6be2d86d0522b85dc'))
-paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', 'caf0d94349cdc28e1bda3b8a19411ac0'))
-paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', '6f24a9b872027634ad758ea2826c9727'))
-paddle.fluid.layers.IfElse ('paddle.fluid.layers.control_flow.IfElse', ('document', 'a389f88e19c3b332c3afcbf7df4488a5'))
-paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.IfElse.false_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.IfElse.input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.IfElse.output (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.IfElse.true_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.DynamicRNN ('paddle.fluid.layers.control_flow.DynamicRNN', ('document', 'b71e87285dbd4a43a6cc4f8a473245e6'))
-paddle.fluid.layers.DynamicRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7'))
-paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', '57cdd0a63747f4c670cdb9d250ceb7e1'))
-paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a'))
-paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '55ab9c562edd7dabec0bd6fd6c1a28cc'))
-paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '4b300851b5201891d0e11c406e4c7d07'))
-paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f'))
-paddle.fluid.layers.StaticRNN ('paddle.fluid.layers.control_flow.StaticRNN', ('document', 'f73671cb98696a1962bf5deaf49dc2e9'))
-paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'f1b60dc4194d0bb714d6c6f5921b227f'))
-paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'df6ceab6e6c9bd31e97914d7e7538137'))
-paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7'))
-paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '903387ec11f3d0bf46821d31a68cffa5'))
-paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837'))
-paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f'))
-paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '5b552a1f0f7eb4dacb768a975ba15d08'))
-paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, 20, True, True, True, True, 'both')), ('document', '3130bed32922b9fd84ce2dea6250f635'))
-paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '3011dc695f490afdf504dc24f628319a'))
-paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bdc9a71908d3c9748532ff44c2f31034'))
-paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9a4c346630a042454f727ad5e0cffc11'))
-paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '92bec0a7fdec48ad78effdf30b02c6fa'))
-paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a4af62b6c6ce858c897f74a4f0f'))
-paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2dde114018cbcaff9b24c566bf6704a5'))
-paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '000a76652c8e59e21e7fb6d87cc7a668'))
-paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e3dce5e892ce63cc9c6ed87a7e6206d5'))
-paddle.fluid.layers.rsqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0b90c858d4d71a58896537c1bd7acb09'))
-paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '503f4d5723bbe1b6c9f24058078709ed'))
-paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5602b78da33c4b0ccaea0374411de423'))
-paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a0977ab14448ba472e5c2e152f42a818'))
-paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e94c8569179ffa3a0dca028a5b518dbf'))
-paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5c9a00178c5c28bb824f7d6c25060d3b'))
-paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '20d1d49fe4d13430a63c57fc4b29a677'))
-paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '4441e4e5e9934eb98760e31330e7a13c'))
-paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '40132ef34808ed621c63ed4fd886fd1c'))
-paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '578106495166d0fb65ade2bb51cdf926'))
-paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '728233aff902803f5f62e2d340c3bcbb'))
-paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '74c4e6dfbdfc3453301ea11d722ad3d6'))
-paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a70e9320b113ca33c1299bbc032f09d4'))
-paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '6de6775d9e9ed885056e764982130cfd'))
-paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'alpha'], varargs=None, keywords=None, defaults=(None,)), ('document', '958c7bfdfb0b5e92af6ca4a90d24e5ef'))
-paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '386a4103d2884b2f1312ebc1e8ee6486'))
-paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '5ab9d5721a6734fe127069e4314e1309'))
-paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '9a0464425426a9b9c1b7500ede2836c1'))
-paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '0fdf82762fd0a5acb2578a72771b5b44'))
-paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', '7a484a0da5e993a7734867a3dfa86571'))
-paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'fd58078fdfffd899b91f992ba224628f'))
-paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '080ce0d54d3f1950ad5a3a8e5ae529e9'))
-paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'e9685f32d21bec8c013626c0254502c5'))
-paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta', 'return_index'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0, False)), ('document', '5485bcaceb0cde2695565a2ffd5bbd40'))
-paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '14d1eeae0f41b6792be43c1c0be0589b'))
-paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '651d98d51879dfa1bc1cd40391786a41'))
-paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595'))
-paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', 'aeac6aae100173b3fc7f102cf3023a3d'))
-paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '0aaacaf9858b8270a8ab5b0aacdd94b7'))
-paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'a82016342789ba9d85737e405f824ff1'))
-paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', '69def376b42ef0681d0cc7f53a2dac4b'))
-paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', 'b7d707822b6af2a586bce608040235b1'))
-paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'b319b10ddaf17fb4ddf03518685a17ef'))
-paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '72fca4a39ccf82d5c746ae62d1868a99'))
-paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '4c6225fc1a1c0b84955a8f0013008243'))
-paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e308ce1661cb722b220a6f482f85b9e4'))
-paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '400403175718d5a632402cdae88b01b8'))
-paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ed56ff21536ca5c8ad418d0cfaf6a7b9'))
-paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9ddee76cb808db83768bf68010e39b2b'))
-paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'f6e333d76922c6e564413b4d216c245c'))
-paddle.fluid.layers.multiclass_nms2 (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'return_index', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, False, None)), ('document', 'be156186ee7a2ee56ab30b964acb15e5'))
-paddle.fluid.layers.retinanet_detection_output (ArgSpec(args=['bboxes', 'scores', 'anchors', 'im_info', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0.05, 1000, 100, 0.3, 1.0)), ('document', '078d28607ce261a0cba2b965a79f6bb8'))
-paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6c023b9401214ae387a8b2d92638e5e4'))
-paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3619a7847709f5868f5e929065947b38'))
-paddle.fluid.layers.collect_fpn_proposals (ArgSpec(args=['multi_rois', 'multi_scores', 'min_level', 'max_level', 'post_nms_top_n', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80a75103e001ca1ba056fbbe0c6a19f3'))
-paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', 'ef799022a6040597462ae2b3d2f1c407'))
-paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', '34b4575807f955f7e8698b8dead23858'))
-paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'eaf430c5a0380fb11bfe9a8922cd6295'))
-paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'aa3146f64d5d508e4e50687603aa7b15'))
-paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ea37a3a8a0b3ce2254e7bc49a0951dbe'))
-paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', 'a343254c36c2e89512cd8cd8a1960ead'))
-paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'd9f654117542c6b702963dda107a247f'))
-paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'fd57228fb76195e66bbcc8d8e42c494d'))
-paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '1062e487dd3b50a6e58b5703b4f594c9'))
-paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', 'dc7292c456847ba41cfd318e9f7f4363'))
-paddle.fluid.layers.Uniform ('paddle.fluid.layers.distributions.Uniform', ('document', 'af70e7003f437e7a8a9e28cded35c433'))
-paddle.fluid.layers.Uniform.__init__ (ArgSpec(args=['self', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Uniform.entropy (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ba59f9ce77af3c93e2b4c8af1801a24e'))
-paddle.fluid.layers.Uniform.kl_divergence (ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None), ('document', '3baee52abbed82d47e9588d9dfe2f42f'))
-paddle.fluid.layers.Uniform.log_prob (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', 'b79091014ceaffb6a7372a198a341c23'))
-paddle.fluid.layers.Uniform.sample (ArgSpec(args=['self', 'shape', 'seed'], varargs=None, keywords=None, defaults=(0,)), ('document', 'adac334af13f6984e991b3ecf12b8cb7'))
-paddle.fluid.layers.Normal ('paddle.fluid.layers.distributions.Normal', ('document', '3265262d0d8b3b32c6245979a5cdced9'))
-paddle.fluid.layers.Normal.__init__ (ArgSpec(args=['self', 'loc', 'scale'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Normal.entropy (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd2db47b1e62c037a2570fc526b93f518'))
-paddle.fluid.layers.Normal.kl_divergence (ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None), ('document', '2e8845cdf1129647e6fa6e816876cd3b'))
-paddle.fluid.layers.Normal.log_prob (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', 'b79091014ceaffb6a7372a198a341c23'))
-paddle.fluid.layers.Normal.sample (ArgSpec(args=['self', 'shape', 'seed'], varargs=None, keywords=None, defaults=(0,)), ('document', 'adac334af13f6984e991b3ecf12b8cb7'))
-paddle.fluid.layers.Categorical ('paddle.fluid.layers.distributions.Categorical', ('document', '865c9dac8af6190e05588486ba091ee8'))
-paddle.fluid.layers.Categorical.__init__ (ArgSpec(args=['self', 'logits'], varargs=None, keywords=None, defaults=None), ('document', '933b96c9ebab8e2c1f6007a50287311e'))
-paddle.fluid.layers.Categorical.entropy (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'b360a2a7a4da07c2d268b329e09c82c1'))
-paddle.fluid.layers.Categorical.kl_divergence (ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None), ('document', 'c2c4c37376584178025f0a4a61c4b862'))
-paddle.fluid.layers.Categorical.log_prob (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', 'c0edd2e2fc76711477b32dc4da9de768'))
-paddle.fluid.layers.Categorical.sample (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '08a2bbcaa20ee176ee7ec3d05737a0f6'))
-paddle.fluid.layers.MultivariateNormalDiag ('paddle.fluid.layers.distributions.MultivariateNormalDiag', ('document', 'f6ee0e8b2898796dcff2a68c9fda19f0'))
-paddle.fluid.layers.MultivariateNormalDiag.__init__ (ArgSpec(args=['self', 'loc', 'scale'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.MultivariateNormalDiag.entropy (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3c679b573ba975c5067c8ebfd4354b02'))
-paddle.fluid.layers.MultivariateNormalDiag.kl_divergence (ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None), ('document', 'd9190d29dbd54c81f747a6436c35f062'))
-paddle.fluid.layers.MultivariateNormalDiag.log_prob (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', 'c0edd2e2fc76711477b32dc4da9de768'))
-paddle.fluid.layers.MultivariateNormalDiag.sample (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '08a2bbcaa20ee176ee7ec3d05737a0f6'))
-paddle.fluid.contrib.InitState ('paddle.fluid.contrib.decoder.beam_search_decoder.InitState', ('document', '3afd1f84232718e628e9e566941c5f05'))
-paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.StateCell ('paddle.fluid.contrib.decoder.beam_search_decoder.StateCell', ('document', 'ecd0066c02867d445d7b461e28220c50'))
-paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.StateCell.compute_state (ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None), ('document', '92973b3f222081a1d17069c683cf4a99'))
-paddle.fluid.contrib.StateCell.get_input (ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None), ('document', '6f24a007cfa184e32f01a960703bfd70'))
-paddle.fluid.contrib.StateCell.get_state (ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None), ('document', '630a4945cfe659ea4f307598fbbce5d2'))
-paddle.fluid.contrib.StateCell.out_state (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '7ad681dff0393ddf13a724194e720f28'))
-paddle.fluid.contrib.StateCell.set_state (ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None), ('document', 'd4e0e08cd5d9d9a571cbc52d114f5ae9'))
-paddle.fluid.contrib.StateCell.state_updater (ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None), ('document', 'd5afe1b7665d94fb023b15cf913ca510'))
-paddle.fluid.contrib.StateCell.update_states (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'fe0b0f1338723516a35a30247899c81b'))
-paddle.fluid.contrib.TrainingDecoder ('paddle.fluid.contrib.decoder.beam_search_decoder.TrainingDecoder', ('document', 'cec7e190c2bdd3b17e8178bc3f177799'))
-paddle.fluid.contrib.TrainingDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.TrainingDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf'))
-paddle.fluid.contrib.TrainingDecoder.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'f0a457dee586559036202087ce2eff69'))
-paddle.fluid.contrib.TrainingDecoder.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'a024c72664fe815068423ba630b7658a'))
-paddle.fluid.contrib.TrainingDecoder.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '4659db7a888a2495e71c1838a0483909'))
-paddle.fluid.contrib.BeamSearchDecoder ('paddle.fluid.contrib.decoder.beam_search_decoder.BeamSearchDecoder', ('document', '102da4a2d2002fbb12d44b8ea36121ed'))
-paddle.fluid.contrib.BeamSearchDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BeamSearchDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf'))
-paddle.fluid.contrib.BeamSearchDecoder.decode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '1e47c60f080c1343ebb6ceaef89656b2'))
-paddle.fluid.contrib.BeamSearchDecoder.early_stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a84a7454ed6707f79b9e954d92a7575'))
-paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'aa89eb8fd5e4cabaf5cc1bcae14665a4'))
-paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7'))
-paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47'))
-paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa'))
-paddle.fluid.contrib.QuantizeTranspiler ('paddle.fluid.contrib.quantize.quantize_transpiler.QuantizeTranspiler', ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
-paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
-paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
-paddle.fluid.contrib.distributed_batch_reader (ArgSpec(args=['batch_reader'], varargs=None, keywords=None, defaults=None), ('document', 'b60796eb0a481484dd34e345f0eaa4d5'))
-paddle.fluid.contrib.Compressor ('paddle.fluid.contrib.slim.core.compressor.Compressor', ('document', 'a5417774a94aa9ae5560a42b96527e7d'))
-paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'eval_func', 'save_eval_model', 'prune_infer_model', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer', 'search_space'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, None, True, None, [], None, None, None, None)), ('document', '05119e0fa0fc07f5cf848ebf0a2cf070'))
-paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0'))
-paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9'))
-paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67'))
-paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f'))
-paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b'))
-paddle.fluid.contrib.HDFSClient ('paddle.fluid.contrib.utils.hdfs_utils.HDFSClient', ('document', '31207aa18424eab2249c54fe11724798'))
-paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e'))
-paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634'))
-paddle.fluid.contrib.HDFSClient.is_dir (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', '45bde1bae02605a205c8245b58b9156d'))
-paddle.fluid.contrib.HDFSClient.is_exist (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', 'be9c94bccff7ba0c1d95883ac62b5864'))
-paddle.fluid.contrib.HDFSClient.ls (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '808acac504870c7e46594b95674f8a86'))
-paddle.fluid.contrib.HDFSClient.lsr (ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True)), ('document', 'fae835aa3354eb6a0434c0f9ba3c2747'))
-paddle.fluid.contrib.HDFSClient.make_local_dirs (ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None), ('document', 'e76b89c8e7f019b5da576c0026fcf689'))
-paddle.fluid.contrib.HDFSClient.makedirs (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '44d9972aae390aedf40aaea731a37e4b'))
-paddle.fluid.contrib.HDFSClient.rename (ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,)), ('document', '0eb133644d9a9f4da45bb39261ff0955'))
-paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)), ('document', '7d053b4bfd6dcfdd2c9dda0e0dbd9665'))
-paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a'))
-paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a'))
-paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4'))
-paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, True)), ('document', '5f118631fc8632afb981b3a26daae731'))
-paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists ('paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists', ('document', 'c116ec6bb5d30998792daea8db21ee40'))
-paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists.__init__ (ArgSpec(args=['self', 'custom_white_list', 'custom_black_list'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.fused_elemwise_activation (ArgSpec(args=['x', 'y', 'functor_list', 'axis', 'scale', 'save_intermediate_out'], varargs=None, keywords=None, defaults=(-1, 0.0, True)), ('document', '1c4b247a2858cea8d9d8750693688270'))
-paddle.fluid.contrib.sequence_topk_avg_pooling (ArgSpec(args=['input', 'row', 'col', 'topks', 'channel_num'], varargs=None, keywords=None, defaults=None), ('document', '5218c85dd4122b626da9bb92f3b50042'))
-paddle.fluid.contrib.var_conv_2d (ArgSpec(args=['input', 'row', 'col', 'input_channel', 'output_channel', 'filter_size', 'stride', 'param_attr', 'act', 'dtype', 'name'], varargs=None, keywords=None, defaults=(1, None, None, 'float32', None)), ('document', 'f52a6edf6d3e970568788604da3329c2'))
-paddle.fluid.contrib.match_matrix_tensor (ArgSpec(args=['x', 'y', 'channel_num', 'act', 'param_attr', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, 'float32', None)), ('document', '3bdc4b2891c1460bc630fdcd22766b21'))
-paddle.fluid.contrib.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '7c727562ebdda38274106d1a9b338e5b'))
-paddle.fluid.contrib.BasicGRUUnit ('paddle.fluid.contrib.layers.rnn_impl.BasicGRUUnit', ('document', '2aed2540ed1540f081be9f4d08f2a65e'))
-paddle.fluid.contrib.BasicGRUUnit.__init__ (ArgSpec(args=['self', 'name_scope', 'hidden_size', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'dtype'], varargs=None, keywords=None, defaults=(None, None, None, None, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicGRUUnit.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.contrib.BasicGRUUnit.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.contrib.BasicGRUUnit.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicGRUUnit.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicGRUUnit.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.contrib.BasicGRUUnit.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.contrib.BasicGRUUnit.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicGRUUnit.forward (ArgSpec(args=['self', 'input', 'pre_hidden'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicGRUUnit.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.contrib.BasicGRUUnit.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicGRUUnit.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.contrib.BasicGRUUnit.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicGRUUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.contrib.BasicGRUUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.basic_gru (ArgSpec(args=['input', 'init_hidden', 'hidden_size', 'num_layers', 'sequence_length', 'dropout_prob', 'bidirectional', 'batch_first', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(1, None, 0.0, False, True, None, None, None, None, 'float32', 'basic_gru')), ('document', '0afcbe4fbe1b8c35eda58b4efe48f9fd'))
-paddle.fluid.contrib.BasicLSTMUnit ('paddle.fluid.contrib.layers.rnn_impl.BasicLSTMUnit', ('document', '3d0b2e3172ce58e1304199efee066c99'))
-paddle.fluid.contrib.BasicLSTMUnit.__init__ (ArgSpec(args=['self', 'name_scope', 'hidden_size', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'forget_bias', 'dtype'], varargs=None, keywords=None, defaults=(None, None, None, None, 1.0, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicLSTMUnit.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.contrib.BasicLSTMUnit.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.contrib.BasicLSTMUnit.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicLSTMUnit.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicLSTMUnit.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.contrib.BasicLSTMUnit.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.contrib.BasicLSTMUnit.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicLSTMUnit.forward (ArgSpec(args=['self', 'input', 'pre_hidden', 'pre_cell'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicLSTMUnit.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.contrib.BasicLSTMUnit.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicLSTMUnit.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.contrib.BasicLSTMUnit.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.BasicLSTMUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.contrib.BasicLSTMUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.basic_lstm (ArgSpec(args=['input', 'init_hidden', 'init_cell', 'hidden_size', 'num_layers', 'sequence_length', 'dropout_prob', 'bidirectional', 'batch_first', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'forget_bias', 'dtype', 'name'], varargs=None, keywords=None, defaults=(1, None, 0.0, False, True, None, None, None, None, 1.0, 'float32', 'basic_lstm')), ('document', 'fe4d0c3c55a162b8cfe10b05fabb7ce4'))
-paddle.fluid.contrib.ctr_metric_bundle (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'b68d12366896c41065fc3738393da2aa'))
-paddle.fluid.dygraph.Layer ('paddle.fluid.dygraph.layers.Layer', ('document', 'a889d5affd734ede273e94d4257163ab'))
-paddle.fluid.dygraph.Layer.__init__ (ArgSpec(args=['self', 'name_scope', 'dtype'], varargs=None, keywords=None, defaults=(VarType.FP32,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Layer.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.Layer.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.Layer.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Layer.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Layer.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.Layer.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.Layer.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Layer.forward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Layer.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.Layer.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Layer.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.Layer.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Layer.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.Layer.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.__impl__ (ArgSpec(args=['func'], varargs=None, keywords=None, defaults=()), ('document', 'fa71ad4e6c2b5bf2b5258bd1959f9b2a'))
-paddle.fluid.dygraph.guard (ArgSpec(args=['place'], varargs=None, keywords=None, defaults=(None,)), ('document', '7071320ffe2eec9aacdae574951278c6'))
-paddle.fluid.dygraph.to_variable (ArgSpec(args=['value', 'block', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0e69fa3666f15dd01b6e3e270b9371cd'))
-paddle.fluid.dygraph.Conv2D ('paddle.fluid.dygraph.nn.Conv2D', ('document', 'baafe7ae0d3a61ae79cf4c7443e2c37c'))
-paddle.fluid.dygraph.Conv2D.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'dtype'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2D.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.Conv2D.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.Conv2D.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2D.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2D.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.Conv2D.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.Conv2D.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2D.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2D.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.Conv2D.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2D.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.Conv2D.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2D.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.Conv2D.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3D ('paddle.fluid.dygraph.nn.Conv3D', ('document', '8b756aaca65af9594cc574d9a5d2b055'))
-paddle.fluid.dygraph.Conv3D.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3D.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.Conv3D.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.Conv3D.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3D.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3D.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.Conv3D.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.Conv3D.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3D.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3D.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.Conv3D.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3D.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.Conv3D.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3D.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.Conv3D.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Pool2D ('paddle.fluid.dygraph.nn.Pool2D', ('document', 'e9331666e47a38586c8809a23cbaf7de'))
-paddle.fluid.dygraph.Pool2D.__init__ (ArgSpec(args=['self', 'name_scope', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'exclusive', 'dtype'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, True, VarType.FP32)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Pool2D.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.Pool2D.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.Pool2D.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Pool2D.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Pool2D.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.Pool2D.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.Pool2D.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Pool2D.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Pool2D.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.Pool2D.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Pool2D.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.Pool2D.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Pool2D.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.Pool2D.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.FC ('paddle.fluid.dygraph.nn.FC', ('document', '1d64242f03f2aca2307e94590b552430'))
-paddle.fluid.dygraph.FC.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'dtype'], varargs=None, keywords=None, defaults=(1, None, None, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.FC.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.FC.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.FC.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.FC.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.FC.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.FC.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.FC.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.FC.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.FC.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.FC.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.FC.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.FC.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.FC.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.FC.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BatchNorm ('paddle.fluid.dygraph.nn.BatchNorm', ('document', '0b609e10e4d417c91d346f887d16771c'))
-paddle.fluid.dygraph.BatchNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'num_channels', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'dtype', 'data_layout', 'in_place', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats', 'trainable_statistics'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'float32', 'NCHW', False, None, None, False, False, False, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BatchNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.BatchNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.BatchNorm.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BatchNorm.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BatchNorm.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.BatchNorm.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.BatchNorm.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BatchNorm.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BatchNorm.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.BatchNorm.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BatchNorm.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.BatchNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BatchNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.BatchNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Embedding ('paddle.fluid.dygraph.nn.Embedding', ('document', 'ecf8dc4149f005cd30eddc0dd343454f'))
-paddle.fluid.dygraph.Embedding.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Embedding.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.Embedding.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.Embedding.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Embedding.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Embedding.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.Embedding.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.Embedding.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Embedding.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Embedding.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.Embedding.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Embedding.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.Embedding.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Embedding.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.Embedding.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GRUUnit ('paddle.fluid.dygraph.nn.GRUUnit', ('document', '5308e42b6a6db4681ce5ee9e94983986'))
-paddle.fluid.dygraph.GRUUnit.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GRUUnit.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.GRUUnit.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.GRUUnit.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GRUUnit.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GRUUnit.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.GRUUnit.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.GRUUnit.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GRUUnit.forward (ArgSpec(args=['self', 'input', 'hidden'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GRUUnit.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.GRUUnit.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GRUUnit.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.GRUUnit.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GRUUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.GRUUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.LayerNorm ('paddle.fluid.dygraph.nn.LayerNorm', ('document', 'b44f5d3d10386c460094e21f24ff272b'))
-paddle.fluid.dygraph.LayerNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.LayerNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.LayerNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.LayerNorm.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.LayerNorm.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.LayerNorm.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.LayerNorm.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.LayerNorm.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.LayerNorm.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.LayerNorm.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.LayerNorm.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.LayerNorm.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.LayerNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.LayerNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.LayerNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NCE ('paddle.fluid.dygraph.nn.NCE', ('document', '2d579e8d9ce31bb29e079e5f6108fc73'))
-paddle.fluid.dygraph.NCE.__init__ (ArgSpec(args=['self', 'name_scope', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, 'uniform', None, 0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NCE.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.NCE.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.NCE.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NCE.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NCE.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.NCE.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.NCE.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NCE.forward (ArgSpec(args=['self', 'input', 'label', 'sample_weight'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NCE.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.NCE.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NCE.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.NCE.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NCE.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.NCE.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PRelu ('paddle.fluid.dygraph.nn.PRelu', ('document', 'd395ed163b4cf86e7207968f27bc1c11'))
-paddle.fluid.dygraph.PRelu.__init__ (ArgSpec(args=['self', 'name_scope', 'mode', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PRelu.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.PRelu.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.PRelu.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PRelu.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PRelu.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.PRelu.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.PRelu.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PRelu.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PRelu.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.PRelu.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PRelu.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.PRelu.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PRelu.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.PRelu.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BilinearTensorProduct ('paddle.fluid.dygraph.nn.BilinearTensorProduct', ('document', '310140d784933928a27db9a7af4761e8'))
-paddle.fluid.dygraph.BilinearTensorProduct.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'name', 'act', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BilinearTensorProduct.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.BilinearTensorProduct.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.BilinearTensorProduct.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BilinearTensorProduct.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BilinearTensorProduct.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.BilinearTensorProduct.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.BilinearTensorProduct.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BilinearTensorProduct.forward (ArgSpec(args=['self', 'x', 'y'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BilinearTensorProduct.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.BilinearTensorProduct.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BilinearTensorProduct.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.BilinearTensorProduct.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BilinearTensorProduct.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.BilinearTensorProduct.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2DTranspose ('paddle.fluid.dygraph.nn.Conv2DTranspose', ('document', '918fa8ad8a62ff424c842fb8a840bf7a'))
-paddle.fluid.dygraph.Conv2DTranspose.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2DTranspose.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.Conv2DTranspose.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.Conv2DTranspose.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2DTranspose.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2DTranspose.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.Conv2DTranspose.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.Conv2DTranspose.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2DTranspose.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2DTranspose.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.Conv2DTranspose.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2DTranspose.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.Conv2DTranspose.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv2DTranspose.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.Conv2DTranspose.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3DTranspose ('paddle.fluid.dygraph.nn.Conv3DTranspose', ('document', 'cd99906d0813609ddea3fb6a2ac900dc'))
-paddle.fluid.dygraph.Conv3DTranspose.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3DTranspose.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.Conv3DTranspose.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.Conv3DTranspose.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3DTranspose.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3DTranspose.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.Conv3DTranspose.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.Conv3DTranspose.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3DTranspose.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3DTranspose.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.Conv3DTranspose.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3DTranspose.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.Conv3DTranspose.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3DTranspose.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.Conv3DTranspose.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GroupNorm ('paddle.fluid.dygraph.nn.GroupNorm', ('document', '4d65fc6b00970e3b5c5dd0abeacd47cb'))
-paddle.fluid.dygraph.GroupNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GroupNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.GroupNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.GroupNorm.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GroupNorm.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GroupNorm.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.GroupNorm.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.GroupNorm.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GroupNorm.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GroupNorm.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.GroupNorm.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GroupNorm.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.GroupNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GroupNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.GroupNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.SpectralNorm ('paddle.fluid.dygraph.nn.SpectralNorm', ('document', 'f400a29393aa95fff829b4a6111e2952'))
-paddle.fluid.dygraph.SpectralNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.SpectralNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.SpectralNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.SpectralNorm.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.SpectralNorm.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.SpectralNorm.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.SpectralNorm.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.SpectralNorm.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.SpectralNorm.forward (ArgSpec(args=['self', 'weight'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.SpectralNorm.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.SpectralNorm.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.SpectralNorm.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.SpectralNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.SpectralNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.SpectralNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.TreeConv ('paddle.fluid.dygraph.nn.TreeConv', ('document', '1e3104dea2482f6b79cf7a7ac9a343ab'))
-paddle.fluid.dygraph.TreeConv.__init__ (ArgSpec(args=['self', 'name_scope', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.TreeConv.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
-paddle.fluid.dygraph.TreeConv.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
-paddle.fluid.dygraph.TreeConv.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.TreeConv.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.TreeConv.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6'))
-paddle.fluid.dygraph.TreeConv.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8'))
-paddle.fluid.dygraph.TreeConv.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.TreeConv.forward (ArgSpec(args=['self', 'nodes_vector', 'edge_set'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.TreeConv.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2'))
-paddle.fluid.dygraph.TreeConv.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.TreeConv.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5'))
-paddle.fluid.dygraph.TreeConv.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.TreeConv.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
-paddle.fluid.dygraph.TreeConv.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Tracer ('paddle.fluid.dygraph.tracer.Tracer', ('document', '28d72409112111274c33e1f07229d5da'))
-paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Tracer.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Tracer.eval_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None  2. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None
-paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'type', 'inputs', 'outputs', 'attrs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Tracer.trace_var (ArgSpec(args=['self', 'name', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Tracer.train_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.prepare_context (ArgSpec(args=['strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.save_persistables (ArgSpec(args=['model_dict', 'dirname', 'optimizers'], varargs=None, keywords=None, defaults=('save_dir', None)), ('document', 'b0b2ec2a502214a737300fb648cb9dc7'))
-paddle.fluid.dygraph.load_persistables (ArgSpec(args=['dirname'], varargs=None, keywords=None, defaults=('save_dir',)), ('document', 'e0709f8259620fdcfd2c0c1b23348852'))
-paddle.fluid.dygraph.NoamDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NoamDecay', ('document', '9ccfea97dbf15134d406a23aae1e1fa2'))
-paddle.fluid.dygraph.NoamDecay.__init__ (ArgSpec(args=['self', 'd_model', 'warmup_steps', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(1, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NoamDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
-paddle.fluid.dygraph.NoamDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PiecewiseDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PiecewiseDecay', ('document', '8f4d37eaad4e2f5b12850f3663856758'))
-paddle.fluid.dygraph.PiecewiseDecay.__init__ (ArgSpec(args=['self', 'boundaries', 'values', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PiecewiseDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
-paddle.fluid.dygraph.PiecewiseDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NaturalExpDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NaturalExpDecay', ('document', '94bed58b392a5a71b6d1abd39eed7111'))
-paddle.fluid.dygraph.NaturalExpDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NaturalExpDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
-paddle.fluid.dygraph.NaturalExpDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.ExponentialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.ExponentialDecay', ('document', 'a259689c649c5f82636536386ce2ef19'))
-paddle.fluid.dygraph.ExponentialDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.ExponentialDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
-paddle.fluid.dygraph.ExponentialDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.InverseTimeDecay ('paddle.fluid.dygraph.learning_rate_scheduler.InverseTimeDecay', ('document', '6a868b2c7cc0f09f57ef71902bbc93ca'))
-paddle.fluid.dygraph.InverseTimeDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.InverseTimeDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
-paddle.fluid.dygraph.InverseTimeDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PolynomialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PolynomialDecay', ('document', 'bb90314cee58952f13522dcd571ca832'))
-paddle.fluid.dygraph.PolynomialDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PolynomialDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
-paddle.fluid.dygraph.PolynomialDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.CosineDecay ('paddle.fluid.dygraph.learning_rate_scheduler.CosineDecay', ('document', '46dadadee1a8a92d70bd277d9345bfb0'))
-paddle.fluid.dygraph.CosineDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'step_each_epoch', 'epochs', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.CosineDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
-paddle.fluid.dygraph.CosineDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BackwardStrategy ('paddle.fluid.core_avx.BackwardStrategy', ('document', '5d9496052ec793810c9f12ffad5c73ce'))
-paddle.fluid.dygraph.BackwardStrategy.__init__ __init__(self: paddle.fluid.core_avx.BackwardStrategy) -> None
-paddle.fluid.transpiler.DistributeTranspiler ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspiler', ('document', 'b2b19821c5dffcd11473d6a4eef089af'))
-paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'b1951949c6d21698290aa8ac69afee32'))
-paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'c89fc350f975ef827f5448d68af388cf'))
-paddle.fluid.transpiler.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '90a40b80e0106f69262cc08b861c3e39'))
-paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '0e47f020304e2b824e87ff03475c17cd'))
-paddle.fluid.transpiler.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '418c7e8b268e9be4104f2809e654c2f7'))
-paddle.fluid.transpiler.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, True)), ('document', '2348247f684bfd5bb9466470f35be064'))
-paddle.fluid.transpiler.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd38c5b8b2b2e0bb19bcf1b581a80a7e4'))
-paddle.fluid.transpiler.HashName ('paddle.fluid.transpiler.ps_dispatcher.HashName', ('document', '8190ddc66ee412441f5d97fd3f702bdd'))
-paddle.fluid.transpiler.HashName.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.transpiler.HashName.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.transpiler.HashName.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.transpiler.RoundRobin ('paddle.fluid.transpiler.ps_dispatcher.RoundRobin', ('document', 'c124359054923c614758c3fbbf666290'))
-paddle.fluid.transpiler.RoundRobin.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.transpiler.RoundRobin.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.transpiler.RoundRobin.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.transpiler.DistributeTranspilerConfig ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspilerConfig', ('document', '550b8c767a8ae1a2eb74b18924ddc975'))
-paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', '13f01ff80e8dfbd3427d90cf49bc62eb'))
-paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', 'd6a1e527b53f5cc15594fee307dfc5cf'))
-paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', 'b87bacfc70dd3477ed25ef14aa01389a'))
-paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', 'b1a07a0000eb9103e3a143ca8c13de5b'))
-paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', '4913d846264f17112bf7bc04273388cc'))
-paddle.fluid.optimizer.SGDOptimizer ('paddle.fluid.optimizer.SGDOptimizer', ('document', 'c3c8dd3193d991adf8bda505560371d6'))
-paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.SGDOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.SGDOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.MomentumOptimizer ('paddle.fluid.optimizer.MomentumOptimizer', ('document', 'a72bd02e5459e64596897d190413d449'))
-paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.MomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.MomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.AdagradOptimizer ('paddle.fluid.optimizer.AdagradOptimizer', ('document', 'a1d4f0682cde43ad34432b1338aadf04'))
-paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.AdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.AdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.AdamOptimizer ('paddle.fluid.optimizer.AdamOptimizer', ('document', '6fe871b955cab6e267422d5af666dafa'))
-paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.AdamOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.AdamOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.AdamaxOptimizer ('paddle.fluid.optimizer.AdamaxOptimizer', ('document', '883fc4541214e8343d3a89711936e15d'))
-paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.AdamaxOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.AdamaxOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.DpsgdOptimizer ('paddle.fluid.optimizer.DpsgdOptimizer', ('document', '71113c30b66c0f4035b10ebd8af8c5ad'))
-paddle.fluid.optimizer.DpsgdOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'clip', 'batch_size', 'sigma'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.DpsgdOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.DpsgdOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.DpsgdOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.DpsgdOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.DpsgdOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.DpsgdOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.DecayedAdagradOptimizer ('paddle.fluid.optimizer.DecayedAdagradOptimizer', ('document', 'e76838a8586bf2e58e6b5cdd2f67f780'))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.FtrlOptimizer ('paddle.fluid.optimizer.FtrlOptimizer', ('document', 'cba8aae0a267b9a4d8833ae79a00fc55'))
-paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.FtrlOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.FtrlOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.RMSPropOptimizer ('paddle.fluid.optimizer.RMSPropOptimizer', ('document', '5217bc4fc399010021d6b70541005780'))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.RMSPropOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.RMSPropOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.AdadeltaOptimizer ('paddle.fluid.optimizer.AdadeltaOptimizer', ('document', '3f1c5385519a3674c18c3a1ab34ac04f'))
-paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.AdadeltaOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.AdadeltaOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.ModelAverage ('paddle.fluid.optimizer.ModelAverage', ('document', '0a0adcd60230630e21fe1ef46362dbc0'))
-paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '648010d0ac1fa707dac0b89f74b0e35c'))
-paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.ModelAverage.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.ModelAverage.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '5f14ea4adda2791e1c3b37ff327f6a83'))
-paddle.fluid.optimizer.LarsMomentumOptimizer ('paddle.fluid.optimizer.LarsMomentumOptimizer', ('document', '030b9092a96a409b1bf5446bf45d0659'))
-paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.LarsMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.LarsMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.DGCMomentumOptimizer ('paddle.fluid.optimizer.DGCMomentumOptimizer', ('document', 'c0384e036f5c78c569f0e2b266812c0f'))
-paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.DGCMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.DGCMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.LambOptimizer ('paddle.fluid.optimizer.LambOptimizer', ('document', '7dd8b270156a52f1f6b4663336960893'))
-paddle.fluid.optimizer.LambOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'lamb_weight_decay', 'beta1', 'beta2', 'epsilon', 'regularization', 'exclude_from_weight_decay_fn', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.01, 0.9, 0.999, 1e-06, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.LambOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
-paddle.fluid.optimizer.LambOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
-paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
-paddle.fluid.optimizer.LambOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.LambOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
-paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.ExponentialMovingAverage ('paddle.fluid.optimizer.ExponentialMovingAverage', ('document', 'a38b7d5b9f17a295ed15d4c1b9ab4cd0'))
-paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'thres_steps', 'name'], varargs=None, keywords=None, defaults=(0.999, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.ExponentialMovingAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '30f494752ac8921dc5835a63637f453a'))
-paddle.fluid.optimizer.ExponentialMovingAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '8c8a1791608b02a1ede53d6dd3a4fcec'))
-paddle.fluid.optimizer.ExponentialMovingAverage.update (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ea10f08af6d7aac3b7974aa976e4085f'))
-paddle.fluid.optimizer.PipelineOptimizer ('paddle.fluid.optimizer.PipelineOptimizer', ('document', '6f85382abedb922387b08d98e8d0b69c'))
-paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'cut_list', 'place_list', 'concurrency_list', 'queue_size', 'sync_steps', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(None, None, None, 30, 1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.LookaheadOptimizer ('paddle.fluid.optimizer.LookaheadOptimizer', ('document', 'c291cadfa7452c7bf58b9e2f900a3511'))
-paddle.fluid.optimizer.LookaheadOptimizer.__init__ (ArgSpec(args=['self', 'inner_optimizer', 'alpha', 'k'], varargs=None, keywords=None, defaults=(0.5, 5)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.LookaheadOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.RecomputeOptimizer ('paddle.fluid.optimizer.RecomputeOptimizer', ('document', '05769ba1182270f808f85488a50c8caa'))
-paddle.fluid.optimizer.RecomputeOptimizer.__init__ (ArgSpec(args=['self', 'optimizer'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.RecomputeOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '7838e157ec5ff4f835f814adf3a2b9cc'))
-paddle.fluid.optimizer.RecomputeOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'ec8dfa14fcd958d7c196f3d1a0ce6fa7'))
-paddle.fluid.optimizer.RecomputeOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks', 'checkpoints'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a26b3dbb0f63ee81d847d92e9fb942dc'))
-paddle.fluid.optimizer.RecomputeOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.RecomputeOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '7b2b8ae72011bc4decb67e97623f2c56'))
-paddle.fluid.optimizer.RecomputeOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks', 'checkpoints'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '52488008103886c793843a3828bacd5e'))
-paddle.fluid.backward.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b'))
-paddle.fluid.regularizer.L1DecayRegularizer ('paddle.fluid.regularizer.L1DecayRegularizer', ('document', '34603757e70974d2fcc730643b382925'))
-paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.regularizer.L2DecayRegularizer ('paddle.fluid.regularizer.L2DecayRegularizer', ('document', 'b94371c3434d7f695bc5b2d6fb5531fd'))
-paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.LoDTensor ('paddle.fluid.core_avx.LoDTensor', ('document', '25e8432ed1b9a375868bc8911359aa0d'))
-paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core_avx.LoDTensor, arg0: List[List[int]]) -> None  2. __init__(self: paddle.fluid.core_avx.LoDTensor) -> None
-paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core_avx.LoDTensor) -> bool
-paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core_avx.LoDTensor) -> List[List[int]]
-paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core_avx.LoDTensor) -> List[List[int]]
-paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core_avx.LoDTensor, lod: List[List[int]]) -> None
-paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core_avx.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None
-paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core_avx.Tensor) -> List[int]
-paddle.fluid.LoDTensorArray ('paddle.fluid.core_avx.LoDTensorArray', ('document', 'e9895b67ba54438b9c0f7053e18966f5'))
-paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core_avx.LoDTensorArray) -> None
-paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core_avx.LoDTensorArray, tensor: paddle.fluid.core_avx.LoDTensor) -> None
-paddle.fluid.CPUPlace ('paddle.fluid.core_avx.CPUPlace', ('document', '6014005ef2649045b77d502aeb6cd7f9'))
-paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core_avx.CPUPlace) -> None
-paddle.fluid.CUDAPlace ('paddle.fluid.core_avx.CUDAPlace', ('document', '6a6cd8ed607beb951692c4b066d08c94'))
-paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPlace, arg0: int) -> None
-paddle.fluid.CUDAPinnedPlace ('paddle.fluid.core_avx.CUDAPinnedPlace', ('document', 'afd58ea5d390b5ea06ca70291a266d45'))
-paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPinnedPlace) -> None
-paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', 'cd667b4ee96d7d6fca40aa722d67d744'))
-paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', 'b5ae1698ea72d5a9428000b916a67379'))
-paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.DataFeeder ('paddle.fluid.data_feeder.DataFeeder', ('document', 'd9e64be617bd5f49dbb08ac2bc8665e6'))
-paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'a0ed5ce816b5d603cb595aacb922335a'))
-paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', 'ce65fe1d81dcd7067d5092a5667f35cc'))
-paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '334c6af750941a4397a2dd2ea8a4d76f'))
-paddle.fluid.clip.set_gradient_clip (ArgSpec(args=['clip', 'param_list', 'program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '77ca02bb37b70d226510df9cf5e45965'))
-paddle.fluid.clip.ErrorClipByValue ('paddle.fluid.clip.ErrorClipByValue', ('document', 'e6f815a03be88dee2537707d9e6b9209'))
-paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.clip.GradientClipByValue ('paddle.fluid.clip.GradientClipByValue', ('document', 'b7a22f687269cae0c338ef3866322db7'))
-paddle.fluid.clip.GradientClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.clip.GradientClipByNorm ('paddle.fluid.clip.GradientClipByNorm', ('document', 'a5c23d96a3d8c8c1183e9469a5d0d52e'))
-paddle.fluid.clip.GradientClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.clip.GradientClipByGlobalNorm ('paddle.fluid.clip.GradientClipByGlobalNorm', ('document', 'ef50acbe212101121d4b82f693ec1733'))
-paddle.fluid.clip.GradientClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph_grad_clip.GradClipByValue ('paddle.fluid.dygraph_grad_clip.GradClipByValue', ('document', '6971a42222de0387a7ee9c59671dd2e3'))
-paddle.fluid.dygraph_grad_clip.GradClipByValue.__init__ (ArgSpec(args=['self', 'min_value', 'max_value'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph_grad_clip.GradClipByNorm ('paddle.fluid.dygraph_grad_clip.GradClipByNorm', ('document', '2039274ea09987ba48eded67999dc280'))
-paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm ('paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm', ('document', 'd1872377e7d7a5fe0dd2e8c42e4c9656'))
-paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '4053b45953807a24e28027dc86829d6c'))
-paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'fd1f25a7a06516ca9a1f4ab0783a4d70'))
-paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a2be24e028dffa06ab28cc55a27c59e4'))
-paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '4c192ea399e6e80b1ab47a8265b022a5'))
-paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bc8628b859b04242200e48a458c971c4'))
-paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42'))
-paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be'))
-paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4'))
-paddle.fluid.Scope Scope() -> paddle.fluid.core_avx._Scope
-paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9'))
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
deleted file mode 100644
index 16457b564ffc82a4246776dc283261bed0351ec6..0000000000000000000000000000000000000000
--- a/paddle/fluid/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-add_subdirectory(memory)
-add_subdirectory(platform)
-add_subdirectory(framework)
-add_subdirectory(imperative)
-add_subdirectory(operators)
-add_subdirectory(string)
-add_subdirectory(pybind)
-
-# NOTE: please add subdirectory inference at last.
-add_subdirectory(inference)
-add_subdirectory(train)
diff --git a/paddle/fluid/framework/.gitignore b/paddle/fluid/framework/.gitignore
deleted file mode 100644
index 5132131e55e2feee8ae88b4c65ec102fbc9c5fe1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-.tensor_util.cu
-.data_type_transform.cu
\ No newline at end of file
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
deleted file mode 100644
index 51efe60c90121e3e3c50a31b5a3e8972ee5a0202..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/CMakeLists.txt
+++ /dev/null
@@ -1,248 +0,0 @@
-
-#windows treat symbolic file as a real file, which is different with unix
-#We create a hidden file and compile it instead of origin source file.
-function(windows_symbolic TARGET)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS PATH)
-  cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
-  foreach(src ${windows_symbolic_SRCS})
-    get_filename_component(src ${src} NAME_WE)
-    if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
-        message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
-    endif()
-
-    file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
-
-    add_custom_command(OUTPUT ${final_path}/.${src}.cu
-            COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
-            COMMENT "create hidden file of ${src}.cu")
-    add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
-  endforeach()
-endfunction()
-
-add_subdirectory(ir)
-add_subdirectory(details)
-add_subdirectory(fleet)
-add_subdirectory(io)
-#ddim lib
-proto_library(framework_proto SRCS framework.proto)
-proto_library(data_feed_proto SRCS data_feed.proto)
-proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
-  data_feed_proto)
-
-cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
-cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
-cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
-cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
-if(WITH_GPU)
-  if (WIN32)
-    windows_symbolic(tensor_util SRCS tensor_util.cu)
-    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
-    add_dependencies(tensor tensor_util)
-  else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
-  endif(WIN32)
-else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
-endif()
-
-cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
-if(WITH_GPU)
-  nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
-else()
-  cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
-endif()
-
-cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
-
-if(WITH_GPU)
-  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
-else()
-  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
-endif()
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
-
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
-
-cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
-
-cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
-cc_test(reader_test SRCS reader_test.cc DEPS reader)
-
-cc_library(threadpool SRCS threadpool.cc DEPS enforce)
-cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
-
-cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
-if (WITH_GPU)
-  target_link_libraries(var_type_traits dynload_cuda)
-endif()
-cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
-
-cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
-cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
-cc_test(scope_test SRCS scope_test.cc DEPS scope)
-cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
-
-cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
-nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry device_context math_function scope)
-
-if(WITH_GPU)
-  if (WIN32)
-#windows treat symbolic file as a real file, which is different with unix
-#We create a hidden file and compile it instead of origin source file.
-      windows_symbolic(hidden_file SRCS data_type_transform.cu)
-      nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
-      add_dependencies(data_type_transform hidden_file)
-  else()
-      nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
-  endif(WIN32)
-  nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
-else()
-  cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
-  cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
-endif()
-
-cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
-cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
-
-cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
-        framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
-
-cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
-cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
-device_context)
-cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
-cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
-cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
-cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
-
-cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
-cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack)
-
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
-
-cc_library(version SRCS version.cc)
-cc_test(version_test SRCS version_test.cc DEPS version)
-
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-
-cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
-
-cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
-
-nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
-
-py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
-py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
-#Generate an empty \
-    #__init__.py to make framework_py_proto as a valid python module.
-add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-add_dependencies(framework_py_proto framework_py_proto_init)
-if (NOT WIN32)
-  add_custom_command(TARGET framework_py_proto POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-      COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
-      COMMENT "Copy generated python proto into directory paddle/fluid/proto."
-      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-else(NOT WIN32)
-  string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
-  add_custom_command(TARGET framework_py_proto POST_BUILD
-          COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-          COMMAND copy /Y *.py ${proto_dstpath}
-          COMMENT "Copy generated python proto into directory paddle/fluid/proto."
-          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif(NOT WIN32)
-
-cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
-
-cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
-cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
-
-cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-
-if(WITH_NGRAPH)
-  set(NGRAPH_EXE_DEPS ngraph_engine)
-else()
-  set(NGRAPH_EXE_DEPS)
-endif()
-
-cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector)
-if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
-  dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-  data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc
-  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-  device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper lodtensor_printer
-  lod_rank_table feed_fetch_method sendrecvop_rpc collective_helper ${GLOB_DISTRIBUTE_DEPS}
-  graph_to_program_pass variable_helper data_feed_proto ${NGRAPH_EXE_DEPS} timer)
-set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-else()
-  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
-  dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-  data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc
-  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-  device_context scope framework_proto data_feed_proto trainer_desc_proto glog
-  lod_rank_table fs shell fleet_wrapper lodtensor_printer feed_fetch_method
-  graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS} timer)
-  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
-endif()
-
-target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)
-
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS
-        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor
-        graph build_strategy
-        fast_threaded_ssa_graph_executor variable_helper)
-
-cc_library(prune SRCS prune.cc DEPS framework_proto boost)
-cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
-cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
-        proto_desc)
-cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
-cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
-
-cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type)
-cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-
-cc_test(tuple_test SRCS tuple_test.cc )
-
-cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
-
-if (NOT WIN32)
-cc_test(rw_lock_test SRCS rw_lock_test.cc)
-endif (NOT WIN32)
-
-cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
-cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
-
-cc_library(op_compatible_info SRCS op_compatible_info DEPS string_helper)
-cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatible_info string_helper glog)
-
-# Get the current working branch
-execute_process(
-  COMMAND git rev-parse --abbrev-ref HEAD
-    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-      OUTPUT_VARIABLE PADDLE_BRANCH
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-	)
-
-# Get the latest abbreviated commit hash of the working branch
-execute_process(
-  COMMAND git log -1 --format=%h
-    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-      OUTPUT_VARIABLE PADDLE_COMMIT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-	)
-
-message(STATUS "commit: ${PADDLE_COMMIT}")
-message(STATUS "branch: ${PADDLE_BRANCH}")
-
-configure_file(commit.h.in commit.h)
diff --git a/paddle/fluid/framework/archive.h b/paddle/fluid/framework/archive.h
deleted file mode 100644
index 73fcc7424e43500d5efc005bf7fb206cbde626b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/archive.h
+++ /dev/null
@@ -1,621 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
-
-#include <glog/logging.h>
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <tuple>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <valarray>
-#include <vector>
-#include "paddle/fluid/framework/expect.h"
-
-namespace paddle {
-namespace framework {
-
-// not a virtual class
-class ArchiveBase {
- protected:
-  ArchiveBase() {}
-
-  // Archive is not copyable. But to allow move capture by function objects,
-  // check it at runtime rather than at compile time.
-  ArchiveBase(const ArchiveBase&) { LOG(FATAL) << "Not supported"; }
-
-  ArchiveBase(ArchiveBase&& other)
-      : buffer_(other.buffer_),
-        cursor_(other.cursor_),
-        finish_(other.finish_),
-        limit_(other.limit_),
-        deleter_(std::move(other.deleter_)) {
-    other.buffer_ = NULL;
-    other.cursor_ = NULL;
-    other.finish_ = NULL;
-    other.limit_ = NULL;
-    other.deleter_ = nullptr;
-  }
-
-  ~ArchiveBase() { FreeBuffer(); }
-
- public:
-  ArchiveBase& operator=(const ArchiveBase&) {
-    LOG(FATAL) << "Not supported";
-    return *this;
-  }
-
-  ArchiveBase& operator=(ArchiveBase&& other) {
-    if (this != &other) {
-      FreeBuffer();
-      buffer_ = other.buffer_;
-      cursor_ = other.cursor_;
-      finish_ = other.finish_;
-      limit_ = other.limit_;
-      deleter_ = std::move(other.deleter_);
-      other.buffer_ = NULL;
-      other.cursor_ = NULL;
-      other.finish_ = NULL;
-      other.limit_ = NULL;
-      other.deleter_ = nullptr;
-    }
-    return *this;
-  }
-
-  char* Buffer() { return buffer_; }
-
-  void SetReadBuffer(char* buffer, size_t length,
-                     std::function<void(char*)>&& deleter) {
-    SetBuffer(buffer, length, length, std::move(deleter));
-  }
-
-  void SetWriteBuffer(char* buffer, size_t capacity,
-                      std::function<void(char*)>&& deleter) {
-    SetBuffer(buffer, 0, capacity, std::move(deleter));
-  }
-
-  void SetBuffer(char* buffer, size_t length, size_t capacity,
-                 std::function<void(char*)>&& deleter) {
-    CHECK(length <= capacity);
-    FreeBuffer();
-    buffer_ = buffer;
-    cursor_ = buffer_;
-    finish_ = buffer + length;
-    limit_ = buffer + capacity;
-    deleter_ = std::move(deleter);
-  }
-
-  char* Cursor() { return cursor_; }
-
-  void SetCursor(char* cursor) {
-    CHECK(cursor >= buffer_ && cursor <= finish_);
-    cursor_ = cursor;
-  }
-
-  void AdvanceCursor(size_t offset) {
-    CHECK(offset <= size_t(finish_ - cursor_));
-    cursor_ += offset;
-  }
-
-  char* Finish() { return finish_; }
-
-  void SetFinish(char* finish) {
-    CHECK(finish >= cursor_ && finish <= limit_);
-    finish_ = finish;
-  }
-
-  void AdvanceFinish(size_t offset) {
-    CHECK(offset <= size_t(limit_ - finish_));
-    finish_ += offset;
-  }
-
-  char* Limit() { return limit_; }
-
-  size_t Position() { return cursor_ - buffer_; }
-
-  size_t Length() { return finish_ - buffer_; }
-
-  size_t Capacity() { return limit_ - buffer_; }
-
-  bool Empty() { return finish_ == buffer_; }
-
-  void Reset() {
-    FreeBuffer();
-    buffer_ = NULL;
-    cursor_ = NULL;
-    finish_ = NULL;
-    limit_ = NULL;
-  }
-
-  void Clear() {
-    cursor_ = buffer_;
-    finish_ = buffer_;
-  }
-
-  char* Release() {
-    char* buf = buffer_;
-    buffer_ = NULL;
-    cursor_ = NULL;
-    finish_ = NULL;
-    deleter_ = nullptr;
-    return buf;
-  }
-
-  void Resize(size_t newsize) {
-#ifdef _LINUX
-    if (unlikely(newsize > Capacity())) {
-#else
-    if (newsize > Capacity()) {
-#endif
-      Reserve((std::max)(Capacity() * 2, newsize));
-    }
-    finish_ = buffer_ + newsize;
-    cursor_ = (std::min)(cursor_, finish_);
-  }
-
-  void Reserve(size_t newcap) {
-    if (newcap > Capacity()) {
-      char* newbuf = NULL;
-      newbuf = new char[newcap];
-      CHECK(newbuf != nullptr) << "Reserve failed, out of memory";
-      if (Length() > 0) {
-        memcpy(newbuf, buffer_, Length());
-      }
-      cursor_ = newbuf + (cursor_ - buffer_);
-      finish_ = newbuf + (finish_ - buffer_);
-      limit_ = newbuf + newcap;
-      FreeBuffer();
-      buffer_ = newbuf;
-      deleter_ = std::default_delete<char[]>();
-    }
-  }
-
-  void PrepareRead(size_t size) {
-#ifdef _LINUX
-    if (unlikely(!(size <= size_t(finish_ - cursor_)))) {
-#else
-    if (!(size <= size_t(finish_ - cursor_))) {
-#endif
-      CHECK(size <= size_t(finish_ - cursor_));
-    }
-  }
-
-  void PrepareWrite(size_t size) {
-#ifdef _LINUX
-    if (unlikely(size > size_t(limit_ - finish_))) {
-#else
-    if (size > size_t(limit_ - finish_)) {
-#endif
-      Reserve((std::max)(Capacity() * 2, Length() + size));
-    }
-  }
-
-  void Read(void* data, size_t size) {
-    if (size > 0) {
-      PrepareRead(size);
-      memcpy(data, cursor_, size);
-      AdvanceCursor(size);
-    }
-  }
-
-  void ReadBack(void* data, size_t size) {
-    if (size > 0) {
-      CHECK(size <= size_t(finish_ - cursor_));
-      memcpy(data, finish_ - size, size);
-      finish_ -= size;
-    }
-  }
-
-  void Write(const void* data, size_t size) {
-    if (size > 0) {
-      PrepareWrite(size);
-      memcpy(finish_, data, size);
-      AdvanceFinish(size);
-    }
-  }
-
-  template <class T>
-  void GetRaw(T& x) {  // NOLINT
-    PrepareRead(sizeof(T));
-    memcpy(&x, cursor_, sizeof(T));
-    AdvanceCursor(sizeof(T));
-  }
-
-  template <class T>
-  T GetRaw() {
-    T x;
-    GetRaw<T>(x);
-    return x;
-  }
-
-  template <class T>
-  void PutRaw(const T& x) {
-    PrepareWrite(sizeof(T));
-    memcpy(finish_, &x, sizeof(T));
-    AdvanceFinish(sizeof(T));
-  }
-
- protected:
-  char* buffer_ = NULL;
-  char* cursor_ = NULL;
-  char* finish_ = NULL;
-  char* limit_ = NULL;
-  std::function<void(char*)> deleter_ = nullptr;
-
-  void FreeBuffer() {
-    if (deleter_) {
-      deleter_(buffer_);
-    }
-    deleter_ = nullptr;
-  }
-};  // NOLINT
-
-template <class Type>
-class Archive {};
-
-class BinaryArchiveType {};
-
-typedef Archive<BinaryArchiveType> BinaryArchive;
-
-template <>
-class Archive<BinaryArchiveType> : public ArchiveBase {
- public:
-#define ARCHIVE_REPEAT(T)                 \
-  BinaryArchive& operator>>(T& x) {       \
-    GetRaw(x);                            \
-    return *this;                         \
-  }                                       \
-  BinaryArchive& operator<<(const T& x) { \
-    PutRaw(x);                            \
-    return *this;                         \
-  }
-
-  ARCHIVE_REPEAT(int16_t)
-  ARCHIVE_REPEAT(uint16_t)
-  ARCHIVE_REPEAT(int32_t)
-  ARCHIVE_REPEAT(uint32_t)
-  ARCHIVE_REPEAT(int64_t)
-  ARCHIVE_REPEAT(uint64_t)
-  ARCHIVE_REPEAT(float)
-  ARCHIVE_REPEAT(double)
-  ARCHIVE_REPEAT(signed char)
-  ARCHIVE_REPEAT(unsigned char)
-  ARCHIVE_REPEAT(bool)
-
-#undef ARCHIVE_REPEAT
-
-  template <class T>
-  T Get() {
-    T x;
-    *this >> x;
-    return x;
-  }
-
-  template <class... ARGS>
-  void Printf(const char* fmt, ARGS&&... args) {
-    size_t temp = Limit() - Finish();
-    int len = snprintf(Finish(), temp, fmt, args...);
-    CHECK(len >= 0);  // NOLINT
-    if ((size_t)len >= temp) {
-      PrepareWrite(len + 1);
-      CHECK(snprintf(Finish(), (size_t)len + 1, fmt, args...) == len);
-    }
-    AdvanceFinish(len);
-  }
-};
-
-template <class AR, class T, size_t N>
-Archive<AR>& operator<<(Archive<AR>& ar, const T (&p)[N]) {
-  for (size_t i = 0; i < N; i++) {
-    ar << p[i];
-  }
-  return ar;
-}
-
-template <class AR, class T, size_t N>
-Archive<AR>& operator>>(Archive<AR>& ar, T (&p)[N]) {
-  for (size_t i = 0; i < N; i++) {
-    ar >> p[i];
-  }
-  return ar;
-}
-
-template <class AR, class T>
-Archive<AR>& operator<<(Archive<AR>& ar, const std::vector<T>& p) {
-#ifdef _LINUX
-  ar << (size_t)p.size();
-#else
-  ar << (uint64_t)p.size();
-#endif
-  for (const auto& x : p) {
-    ar << x;
-  }
-  return ar;
-}
-
-template <class AR, class T>
-Archive<AR>& operator>>(Archive<AR>& ar, std::vector<T>& p) {
-#ifdef _LINUX
-  p.resize(ar.template Get<size_t>());
-#else
-  p.resize(ar.template Get<uint64_t>());
-#endif
-  for (auto& x : p) {
-    ar >> x;
-  }
-  return ar;
-}
-
-template <class AR, class T>
-Archive<AR>& operator<<(Archive<AR>& ar, const std::valarray<T>& p) {
-#ifdef _LINUX
-  ar << (size_t)p.size();
-#else
-  ar << (uint64_t)p.size();
-#endif
-  for (const auto& x : p) {
-    ar << x;
-  }
-  return ar;
-}
-
-template <class AR, class T>
-Archive<AR>& operator>>(Archive<AR>& ar, std::valarray<T>& p) {
-#ifdef _LINUX
-  p.resize(ar.template Get<size_t>());
-#else
-  p.resize(ar.template Get<uint64_t>());
-#endif
-  for (auto& x : p) {
-    ar >> x;
-  }
-  return ar;
-}
-
-inline BinaryArchive& operator<<(BinaryArchive& ar, const std::string& s) {
-#ifdef _LINUX
-  ar << (size_t)s.length();
-#else
-  ar << (uint64_t)s.length();
-#endif
-  ar.Write(&s[0], s.length());
-  return ar;
-}
-
-inline BinaryArchive& operator>>(BinaryArchive& ar, std::string& s) {
-#ifdef _LINUX
-  size_t len = ar.template Get<size_t>();
-#else
-  size_t len = ar.template Get<uint64_t>();
-#endif
-  ar.PrepareRead(len);
-  s.assign(ar.Cursor(), len);
-  ar.AdvanceCursor(len);
-  return ar;
-}
-
-template <class AR, class T1, class T2>
-Archive<AR>& operator<<(Archive<AR>& ar, const std::pair<T1, T2>& x) {
-  return ar << x.first << x.second;
-}
-
-template <class AR, class T1, class T2>
-Archive<AR>& operator>>(Archive<AR>& ar, std::pair<T1, T2>& x) {  // NOLINT
-  return ar >> x.first >> x.second;
-}
-
-#ifdef _LINUX
-template <class AR, class... T>
-Archive<AR>& SerializeTuple(Archive<AR>& ar,                        // NOLINT
-                            const std::tuple<T...>& x,              // NOLINT
-                            std::integral_constant<size_t, 0> n) {  // NOLINT
-  return ar;
-}
-#else
-template <class AR, class... T>
-Archive<AR>& SerializeTuple(Archive<AR>& ar,                          // NOLINT
-                            const std::tuple<T...>& x,                // NOLINT
-                            std::integral_constant<uint64_t, 0> n) {  // NOLINT
-  return ar;
-}
-#endif
-
-#ifdef _LINUX
-template <class AR, class... T, size_t N>
-Archive<AR>& serialize_tuple(Archive<AR>& ar,                        // NOLINT
-                             const std::tuple<T...>& x,              // NOLINT
-                             std::integral_constant<size_t, N> n) {  // NOLINT
-  return SerializeTuple(ar, x, std::integral_constant<size_t, N - 1>())
-         << std::get<N - 1>(x);
-}
-#else
-template <class AR, class... T, uint64_t N>
-Archive<AR>& serialize_tuple(Archive<AR>& ar,                          // NOLINT
-                             const std::tuple<T...>& x,                // NOLINT
-                             std::integral_constant<uint64_t, N> n) {  // NOLINT
-  return SerializeTuple(ar, x, std::integral_constant<uint64_t, N - 1>())
-         << std::get<N - 1>(x);
-}
-#endif
-
-#ifdef _LINUX
-template <class AR, class... T>
-Archive<AR>& operator<<(Archive<AR>& ar, const std::tuple<T...>& x) {
-  const size_t size = std::tuple_size<std::tuple<T...>>::value;
-  return SerializeTuple(ar, x, std::integral_constant<size_t, size>());
-}
-#else
-template <class AR, class... T>
-Archive<AR>& operator<<(Archive<AR>& ar, const std::tuple<T...>& x) {
-  const uint64_t size = std::tuple_size<std::tuple<T...>>::value;
-  return SerializeTuple(ar, x, std::integral_constant<uint64_t, size>());
-}
-#endif
-
-#ifdef _LINUX
-template <class AR, class... T>
-Archive<AR>& DeserializeTuple(Archive<AR>& ar, std::tuple<T...>& x,  // NOLINT
-                              std::integral_constant<size_t, 0> n) {
-  return ar;
-}
-#else
-template <class AR, class... T>
-Archive<AR>& DeserializeTuple(Archive<AR>& ar, std::tuple<T...>& x,  // NOLINT
-                              std::integral_constant<uint64_t, 0> n) {
-  return ar;
-}
-#endif
-
-#ifdef _LINUX
-template <class AR, class... T, size_t N>
-Archive<AR>& DeserializeTuple(Archive<AR>& ar, std::tuple<T...>& x,  // NOLINT
-                              std::integral_constant<size_t, N> n) {
-  return DeserializeTuple(ar, x, std::integral_constant<size_t, N - 1>()) >>
-         std::get<N - 1>(x);
-}
-#else
-template <class AR, class... T, uint64_t N>
-Archive<AR>& DeserializeTuple(Archive<AR>& ar, std::tuple<T...>& x,  // NOLINT
-                              std::integral_constant<uint64_t, N> n) {
-  return DeserializeTuple(ar, x, std::integral_constant<uint64_t, N - 1>()) >>
-         std::get<N - 1>(x);
-}
-#endif
-
-#ifdef _LINUX
-template <class AR, class... T>
-Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) {
-  const size_t size = std::tuple_size<std::tuple<T...>>::value;
-  return DeserializeTuple(ar, x, std::integral_constant<size_t, size>());
-}
-#else
-template <class AR, class... T>
-Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) {
-  const uint64_t size = std::tuple_size<std::tuple<T...>>::value;
-  return DeserializeTuple(ar, x, std::integral_constant<uint64_t, size>());
-}
-#endif
-
-#ifdef _LINUX
-#define ARCHIVE_REPEAT(MAP_TYPE, RESERVE_STATEMENT)                            \
-  template <class AR, class KEY, class VALUE, class... ARGS>                   \
-  Archive<AR>& operator<<(Archive<AR>& ar,                                     \
-                          const MAP_TYPE<KEY, VALUE, ARGS...>& p) {            \
-    ar << (size_t)p.size();                                                    \
-    for (auto it = p.begin(); it != p.end(); ++it) {                           \
-      ar << *it;                                                               \
-    }                                                                          \
-    return ar;                                                                 \
-  }                                                                            \
-  template <class AR, class KEY, class VALUE, class... ARGS>                   \
-  Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \
-    size_t size = ar.template get<size_t>();                                   \
-    p.clear();                                                                 \
-    RESERVE_STATEMENT;                                                         \
-    for (size_t i = 0; i < size; i++) {                                        \
-      p.insert(ar.template get<std::pair<KEY, VALUE>>());                      \
-    }                                                                          \
-    return ar;                                                                 \
-  }
-#else
-#define ARCHIVE_REPEAT(MAP_TYPE, RESERVE_STATEMENT)                            \
-  template <class AR, class KEY, class VALUE, class... ARGS>                   \
-  Archive<AR>& operator<<(Archive<AR>& ar,                                     \
-                          const MAP_TYPE<KEY, VALUE, ARGS...>& p) {            \
-    ar << (uint64_t)p.size();                                                  \
-    for (auto it = p.begin(); it != p.end(); ++it) {                           \
-      ar << *it;                                                               \
-    }                                                                          \
-    return ar;                                                                 \
-  }                                                                            \
-  template <class AR, class KEY, class VALUE, class... ARGS>                   \
-  Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \
-    size_t size = ar.template get<uint64_t>();                                 \
-    p.clear();                                                                 \
-    RESERVE_STATEMENT;                                                         \
-    for (size_t i = 0; i < size; i++) {                                        \
-      p.insert(ar.template get<std::pair<KEY, VALUE>>());                      \
-    }                                                                          \
-    return ar;                                                                 \
-  }
-#endif
-
-ARCHIVE_REPEAT(std::map, )
-ARCHIVE_REPEAT(std::multimap, )
-ARCHIVE_REPEAT(std::unordered_map, p.reserve(size))
-ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size))
-
-#undef ARCHIVE_REPEAT
-
-#ifdef _LINUX
-#define ARCHIVE_REPEAT(SET_TYPE, RESERVE_STATEMENT)                           \
-  template <class AR, class KEY, class... ARGS>                               \
-  Archive<AR>& operator<<(Archive<AR>& ar, const SET_TYPE<KEY, ARGS...>& p) { \
-    ar << (size_t)p.size();                                                   \
-    for (auto it = p.begin(); it != p.end(); ++it) {                          \
-      ar << *it;                                                              \
-    }                                                                         \
-    return ar;                                                                \
-  }                                                                           \
-  template <class AR, class KEY, class... ARGS>                               \
-  Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) {       \
-    size_t size = ar.template get<size_t>();                                  \
-    p.clear();                                                                \
-    RESERVE_STATEMENT;                                                        \
-    for (size_t i = 0; i < size; i++) {                                       \
-      p.insert(ar.template get<KEY>());                                       \
-    }                                                                         \
-    return ar;                                                                \
-  }
-#else
-#define ARCHIVE_REPEAT(SET_TYPE, RESERVE_STATEMENT)                           \
-  template <class AR, class KEY, class... ARGS>                               \
-  Archive<AR>& operator<<(Archive<AR>& ar, const SET_TYPE<KEY, ARGS...>& p) { \
-    ar << (uint64_t)p.size();                                                 \
-    for (auto it = p.begin(); it != p.end(); ++it) {                          \
-      ar << *it;                                                              \
-    }                                                                         \
-    return ar;                                                                \
-  }                                                                           \
-  template <class AR, class KEY, class... ARGS>                               \
-  Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) {       \
-    size_t size = ar.template get<uint64_t>();                                \
-    p.clear();                                                                \
-    RESERVE_STATEMENT;                                                        \
-    for (size_t i = 0; i < size; i++) {                                       \
-      p.insert(ar.template get<KEY>());                                       \
-    }                                                                         \
-    return ar;                                                                \
-  }
-#endif
-
-ARCHIVE_REPEAT(std::set, )
-ARCHIVE_REPEAT(std::multiset, )
-ARCHIVE_REPEAT(std::unordered_set, p.reserve(size))
-ARCHIVE_REPEAT(std::unordered_multiset, p.reserve(size))
-
-#undef ARCHIVE_REPEAT
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h
deleted file mode 100644
index b53082986882c80a85826f10d5766525f72c0a97..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/array.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdint>
-#include "paddle/fluid/framework/unroll_array_ops.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T, size_t N>
-class Array {
- public:
-  static constexpr size_t kSize = N;
-
-  HOSTDEVICE inline Array() {}
-
-  template <typename... Args>
-  HOSTDEVICE inline explicit Array(const T &val, Args... args) {
-    static_assert(N == sizeof...(Args) + 1, "Invalid argument");
-    UnrollVarArgsAssign<T>::Run(data_, val, args...);
-  }
-
-  HOSTDEVICE inline void Fill(const T &val) {
-    UnrollFillConstant<N>::Run(data_, val);
-  }
-
-  HOSTDEVICE inline const T *Get() const { return data_; }
-
-  HOSTDEVICE inline T *GetMutable() { return data_; }
-
-  HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); }
-
-  // Writing "return data_[i]" would cause compilation warning/error:
-  // "array subscript is above array bound" in Python 35 CI.
-  // It seems that it is a false warning of GCC if we do not check the bounds
-  // of array index. But for better performance, we do not check in operator[]
-  // like what is in STL. If users want to check the bounds, use at() instead
-  HOSTDEVICE inline const T &operator[](size_t i) const {
-    return *advance(data_, i);
-  }
-
-  HOSTDEVICE inline T &at(size_t i) {
-#ifndef __CUDA_ARCH__
-    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
-#endif
-    return (*this)[i];
-  }
-
-  HOSTDEVICE inline const T &at(size_t i) const {
-#ifndef __CUDA_ARCH__
-    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
-#endif
-    return (*this)[i];
-  }
-
-  HOSTDEVICE constexpr size_t size() const { return N; }
-
-  HOSTDEVICE inline bool operator==(const Array<T, N> &other) const {
-    return UnrollCompare<N>::Run(data_, other.data_);
-  }
-
-  HOSTDEVICE inline bool operator!=(const Array<T, N> &other) const {
-    return !(*this == other);
-  }
-
- private:
-  template <typename U>
-  HOSTDEVICE static inline U *advance(U *ptr, size_t i) {
-    return ptr + i;
-  }
-
-  T data_[N];
-};
-
-template <typename T>
-class Array<T, 0> {
- public:
-  static constexpr size_t kSize = 0;
-
-  HOSTDEVICE inline Array() {}
-
-  HOSTDEVICE inline void Fill(const T &val) {}
-
-  HOSTDEVICE inline constexpr T *Get() const { return nullptr; }
-
-  // Add constexpr to GetMutable() cause warning in MAC
-  HOSTDEVICE inline T *GetMutable() { return nullptr; }
-
-  HOSTDEVICE inline T &operator[](size_t) {
-#ifdef __CUDA_ARCH__
-    static T obj();
-    return obj;
-#else
-    PADDLE_THROW("Array<T, 0> has no element");
-#endif
-  }
-
-  HOSTDEVICE inline const T &operator[](size_t) const {
-#ifdef __CUDA_ARCH__
-    static const T obj();
-    return obj;
-#else
-    PADDLE_THROW("Array<T, 0> has no element");
-#endif
-  }
-
-  HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; }
-
-  HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; }
-
-  HOSTDEVICE constexpr size_t size() const { return 0; }
-
-  HOSTDEVICE constexpr bool operator==(const Array<T, 0> &other) const {
-    return true;
-  }
-
-  HOSTDEVICE constexpr bool operator!=(const Array<T, 0> &other) const {
-    return false;
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
deleted file mode 100644
index 7eb80a4617ae547751a77449977ffeb245226bb0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/async_executor.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/async_executor.h"
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/message.h"
-#include "google/protobuf/text_format.h"
-
-#include "gflags/gflags.h"
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include "paddle/fluid/framework/executor_thread_worker.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/trainer.h"
-#include "paddle/fluid/framework/trainer_desc.pb.h"
-#include "paddle/fluid/framework/trainer_factory.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/pybind/pybind.h"
-
-namespace paddle {
-namespace framework {
-AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place)
-    : root_scope_(scope), place_(place) {}
-
-void AsyncExecutor::InitServer(const std::string& dist_desc, int index) {
-  fleet_ptr_ = FleetWrapper::GetInstance();
-  fleet_ptr_->InitServer(dist_desc, index);
-}
-
-void AsyncExecutor::InitWorker(const std::string& dist_desc,
-                               const std::vector<uint64_t>& host_sign_list,
-                               int node_num, int index) {
-  fleet_ptr_ = FleetWrapper::GetInstance();
-  fleet_ptr_->InitWorker(dist_desc, host_sign_list, node_num, index);
-}
-
-uint64_t AsyncExecutor::StartServer() { return fleet_ptr_->RunServer(); }
-
-void AsyncExecutor::StopServer() { fleet_ptr_->StopServer(); }
-
-void AsyncExecutor::GatherServers(const std::vector<uint64_t>& host_sign_list,
-                                  int node_num) {
-  fleet_ptr_->GatherServers(host_sign_list, node_num);
-}
-
-// todo InitModel
-void AsyncExecutor::InitModel() {}
-
-// todo SaveModel
-void AsyncExecutor::SaveModel(const std::string& path) {}
-
-void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
-                                const std::string& data_feed_desc_str,
-                                const std::vector<std::string>& filelist,
-                                const int thread_num,
-                                const std::vector<std::string>& fetch_var_names,
-                                const std::string& mode, const bool debug) {
-  std::vector<std::thread> threads;
-
-  auto& block = main_program.Block(0);
-  for (auto var_name : fetch_var_names) {
-    auto var_desc = block.FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var_desc, "%s is not found.", var_name);
-    auto shapes = var_desc->GetShape();
-    PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
-                   "var %s: Fetched var has wrong shape, "
-                   "only variables with the last dimension size 1 supported",
-                   var_name);
-  }
-
-  DataFeedDesc data_feed_desc;
-  bool success = data_feed_desc.ParseFromString(data_feed_desc_str);
-  PADDLE_ENFORCE(success, "Fail to parse DataFeedDesc from string:\n%s",
-                 data_feed_desc_str.c_str());
-
-  actual_thread_num_ = thread_num;
-  int file_cnt = filelist.size();
-  PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
-
-  if (actual_thread_num_ > file_cnt) {
-    VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt
-            << ". Changing thread_num = " << file_cnt;
-    actual_thread_num_ = file_cnt;
-  }
-
-  /*
-    readerDesc: protobuf description for reader initlization
-    argument: class_name, batch_size, use_slot, queue_size, buffer_size,
-    padding_index
-
-    reader:
-    1) each thread has a reader, reader will read input data and
-    put it into input queue
-    2) each reader has a Next() iterface, that can fetch an instance
-    from the input queue
-   */
-  // todo: should be factory method for creating datafeed
-  std::vector<std::shared_ptr<DataFeed>> readers;
-  /*
-  PrepareReaders(readers, actual_thread_num_, data_feed_desc, filelist);
-#ifdef PADDLE_WITH_PSLIB
-  PrepareDenseThread(mode);
-#endif
-  */
-  std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
-  workers.resize(actual_thread_num_);
-  for (auto& worker : workers) {
-#ifdef PADDLE_WITH_PSLIB
-    if (mode == "mpi") {
-      worker.reset(new AsyncExecutorThreadWorker);
-    } else {
-      worker.reset(new ExecutorThreadWorker);
-    }
-#else
-    worker.reset(new ExecutorThreadWorker);
-#endif
-  }
-
-  // prepare thread resource here
-  /*
-  for (int thidx = 0; thidx < actual_thread_num_; ++thidx) {
-    CreateThreads(workers[thidx].get(), main_program, readers[thidx],
-                  fetch_var_names, root_scope_, thidx, debug);
-  }
-  */
-
-  // start executing ops in multiple threads
-  for (int thidx = 0; thidx < actual_thread_num_; ++thidx) {
-    if (debug) {
-      threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
-                                    workers[thidx].get()));
-    } else {
-      threads.push_back(
-          std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
-    }
-  }
-
-  for (auto& th : threads) {
-    th.join();
-  }
-  // TODO(guru4elephant): we don't need this
-  /*
-#ifdef PADDLE_WITH_PSLIB
-  if (mode == "mpi") {
-    _pull_dense_thread->stop();
-  }
-#endif
-  */
-  VLOG(3) << "start to run from files in async_executor";
-  VLOG(3) << "Drop current scope kids";
-  root_scope_->DropKids();
-  return;
-}
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h
deleted file mode 100644
index 7b59e1b11ca577d4b03784db50d5fa6ed3d1f12b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/async_executor.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <time.h>
-#include <map>
-#include <memory>
-#include <mutex>   // NOLINT
-#include <random>  // local_random_engine
-#include <set>
-#include <string>
-#include <thread>  // NOLINT
-#include <typeinfo>
-#include <vector>
-#include "paddle/fluid/framework/data_feed.pb.h"
-#include "paddle/fluid/framework/data_set.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/executor_thread_worker.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-
-inline double current_realtime() {
-#if !defined(_WIN32)
-  struct timespec tp;
-  clock_gettime(CLOCK_REALTIME, &tp);
-  return tp.tv_sec + tp.tv_nsec * 1e-9;
-#else
-  return 0.0;
-#endif
-}
-
-inline std::default_random_engine& local_random_engine() {
-  struct engine_wrapper_t {
-    std::default_random_engine engine;
-    engine_wrapper_t() {
-      static std::atomic<uint64_t> x(0);
-      std::seed_seq sseq = {x++, x++, x++,
-                            static_cast<uint64_t>(current_realtime() * 1000)};
-      engine.seed(sseq);
-    }
-  };
-  thread_local engine_wrapper_t r;
-  return r.engine;
-}
-
-class AsyncExecutor {
- public:
-  AsyncExecutor(Scope* scope, const platform::Place& place);
-  virtual ~AsyncExecutor() {}
-  void RunFromFile(const ProgramDesc& main_program,
-                   const std::string& data_feed_desc_str,
-                   const std::vector<std::string>& filelist,
-                   const int thread_num,
-                   const std::vector<std::string>& fetch_var_names,
-                   const std::string& mode, const bool debug);
-
-  // TODO(guru4elephant): make init server decoupled from executor
-  void InitServer(const std::string& dist_desc, int index);
-  void InitWorker(const std::string& dist_desc,
-                  const std::vector<uint64_t>& host_sign_list, int node_num,
-                  int index);
-  uint64_t StartServer();
-  void StopServer();
-  void GatherServers(const std::vector<uint64_t>& host_sign_list, int node_num);
-  void InitModel();
-  void SaveModel(const std::string& path);
-
- public:
-  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
-  Scope* root_scope_;
-  platform::Place place_;
-
- private:
-  int actual_thread_num_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc
deleted file mode 100644
index fabf2abfc803b8838edb48aa01ab8896799c97ac..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/attribute.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/attribute.h"
-
-#include <vector>
-
-namespace paddle {
-namespace framework {
-
-Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
-  switch (attr_desc.type()) {
-    case proto::AttrType::BOOLEAN: {
-      return attr_desc.b();
-    }
-    case proto::AttrType::INT: {
-      return attr_desc.i();
-    }
-    case proto::AttrType::FLOAT: {
-      return attr_desc.f();
-    }
-    case proto::AttrType::STRING: {
-      return attr_desc.s();
-    }
-    case proto::AttrType::BOOLEANS: {
-      std::vector<bool> val(attr_desc.bools_size());
-      for (int i = 0; i < attr_desc.bools_size(); ++i) {
-        val[i] = attr_desc.bools(i);
-      }
-      return val;
-    }
-    case proto::AttrType::INTS: {
-      std::vector<int> val(attr_desc.ints_size());
-      for (int i = 0; i < attr_desc.ints_size(); ++i) {
-        val[i] = attr_desc.ints(i);
-      }
-      return val;
-    }
-    case proto::AttrType::FLOATS: {
-      std::vector<float> val(attr_desc.floats_size());
-      for (int i = 0; i < attr_desc.floats_size(); ++i) {
-        val[i] = attr_desc.floats(i);
-      }
-      return val;
-    }
-    case proto::AttrType::STRINGS: {
-      std::vector<std::string> val(attr_desc.strings_size());
-      for (int i = 0; i < attr_desc.strings_size(); ++i) {
-        val[i] = attr_desc.strings(i);
-      }
-      return val;
-    }
-    case proto::AttrType::LONG: {
-      return attr_desc.l();
-    }
-    case proto::AttrType::LONGS: {
-      std::vector<int64_t> val(attr_desc.longs_size());
-      for (int i = 0; i < attr_desc.longs_size(); ++i) {
-        val[i] = attr_desc.longs(i);
-      }
-      return val;
-    }
-    default:
-      PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
-  }
-  return boost::blank();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h
deleted file mode 100644
index aa452ac220ea63bbf7a79c09b90aadfd2764856b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/attribute.h
+++ /dev/null
@@ -1,345 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/type_defs.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-struct ExtractAttribute {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  T* operator()(Attribute& attr) const {
-    T* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<T>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
-                   attr_name_, paddle::platform::demangle(typeid(T).name()),
-                   paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
-// special handle bool
-// FIXME(yuyang18): Currently we cast bool into int in python binding. It is
-// hard to change the logic there. In another way, we should correct handle
-// if the user set `some_flag=1`.
-//
-// FIX ME anytime if there is a better solution.
-template <>
-struct ExtractAttribute<bool> {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  bool* operator()(Attribute& attr) const {
-    if (attr.type() == typeid(int)) {  // NOLINT
-      int val = boost::get<int>(attr);
-      attr = static_cast<bool>(val);
-    } else if (attr.type() == typeid(float)) {  // NOLINT
-      float val = boost::get<float>(attr);
-      attr = static_cast<bool>(val);
-    }
-    bool* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<bool>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
-                   attr_name_, paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
-template <>
-struct ExtractAttribute<int64_t> {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  int64_t* operator()(Attribute& attr) const {
-    if (attr.type() == typeid(int)) {  // NOLINT
-      int val = boost::get<int>(attr);
-      attr = static_cast<int64_t>(val);
-    } else if (attr.type() == typeid(float)) {  // NOLINT
-      int val = boost::get<float>(attr);
-      attr = static_cast<int64_t>(val);
-    }
-    int64_t* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<int64_t>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
-                   attr_name_, paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
-template <>
-struct ExtractAttribute<std::vector<int64_t>> {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  std::vector<int64_t>* operator()(Attribute& attr) const {
-    if (attr.type() == typeid(std::vector<int>)) {  // NOLINT
-      std::vector<int> val = boost::get<std::vector<int>>(attr);
-      std::vector<int64_t> vec(val.begin(), val.end());
-      attr = vec;
-    } else if (attr.type() == typeid(std::vector<float>)) {  // NOLINT
-      std::vector<float> val = boost::get<std::vector<float>>(attr);
-      std::vector<int64_t> vec(val.begin(), val.end());
-      attr = vec;
-    }
-    std::vector<int64_t>* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<std::vector<int64_t>>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
-                   attr_name_, paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
-template <>
-struct ExtractAttribute<float> {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  float* operator()(Attribute& attr) const {
-    if (attr.type() == typeid(int)) {  // NOLINT
-      int val = boost::get<int>(attr);
-      attr = static_cast<float>(val);
-    } else if (attr.type() == typeid(int64_t)) {  // NOLINT
-      int64_t val = boost::get<int64_t>(attr);
-      attr = static_cast<float>(val);
-    }
-    float* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<float>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type float, its type is %s",
-                   attr_name_, paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
-template <typename T>
-inline proto::AttrType AttrTypeID() {
-  Attribute tmp = T();
-  return static_cast<proto::AttrType>(tmp.which() - 1);
-}
-
-Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc);
-
-class AttrReader {
- public:
-  explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {}
-
-  template <typename T>
-  inline const T& Get(const std::string& name) const {
-    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
-                   name);
-
-    Attribute& attr = const_cast<Attribute&>(attrs_.at(name));
-    ExtractAttribute<T> extract_attr(name);
-    T* attr_value = extract_attr(attr);
-    return *attr_value;
-  }
-
- private:
-  const AttributeMap& attrs_;
-};
-
-// check whether a value(attribute) fit a certain limit
-template <typename T>
-class GreaterThanChecker {
- public:
-  explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
-  void operator()(const T& value) const {
-    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
-  }
-
- private:
-  T lower_bound_;
-};
-
-template <typename T>
-class EqualGreaterThanChecker {
- public:
-  explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
-  void operator()(const T& value) const {
-    PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
-  }
-
- private:
-  T lower_bound_;
-};
-
-// we can provide users more common Checker, like 'LessThanChecker',
-// 'BetweenChecker'...
-
-template <typename T>
-class DefaultValueSetter {
- public:
-  explicit DefaultValueSetter(T default_value)
-      : default_value_(default_value) {}
-  void operator()(T* value) const { *value = default_value_; }
-
- private:
-  T default_value_;
-};
-
-template <typename T>
-class EnumInContainer {
- public:
-  explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
-  void operator()(const T& val) const {
-    PADDLE_ENFORCE(container_.find(val) != container_.end(),
-                   "Value %s is not in enum container %s", val,
-                   ContainerDebugString());
-  }
-
- private:
-  std::string ContainerDebugString() const {
-    std::ostringstream sout;
-    sout << "[";
-    size_t cnt = 0;
-    for (auto& v : container_) {
-      sout << v;
-      ++cnt;
-      if (cnt != container_.size()) {
-        sout << " ,";
-      }
-    }
-    sout << "]";
-    return sout.str();
-  }
-
-  std::unordered_set<T> container_;
-};
-
-// check whether a certain attribute fit its limits
-// an attribute can have more than one limits
-template <typename T>
-class TypedAttrChecker {
-  typedef std::function<void(T*)> DefaultValueChecker;
-  typedef std::function<void(const T&)> ValueChecker;
-
- public:
-  explicit TypedAttrChecker(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  TypedAttrChecker& InEnum(const std::unordered_set<T>& range) {
-    value_checkers_.push_back(EnumInContainer<T>(range));
-    return *this;
-  }
-
-  TypedAttrChecker& GreaterThan(const T& lower_bound) {
-    value_checkers_.push_back(GreaterThanChecker<T>(lower_bound));
-    return *this;
-  }
-
-  TypedAttrChecker& EqualGreaterThan(const T& lower_bound) {
-    value_checkers_.push_back(EqualGreaterThanChecker<T>(lower_bound));
-    return *this;
-  }
-
-  // we can add more common limits, like LessThan(), Between()...
-
-  TypedAttrChecker& SetDefault(const T& default_value) {
-    PADDLE_ENFORCE(default_value_setter_.empty(),
-                   "%s can't have more than one default value!", attr_name_);
-    default_value_setter_.push_back(DefaultValueSetter<T>(default_value));
-    return *this;
-  }
-
-  // allow users provide their own checker
-  TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) {
-    value_checkers_.push_back(checker);
-    return *this;
-  }
-
-  void operator()(AttributeMap* attr_map) const {
-    if (!attr_map->count(attr_name_)) {
-      // user do not set this attr
-      PADDLE_ENFORCE(!default_value_setter_.empty(),
-                     "Attribute '%s' is required!", attr_name_);
-      // default_value_setter_ has no more than one element
-      T val;
-      (default_value_setter_[0])(&val);
-      (*attr_map)[attr_name_] = val;
-    }
-    Attribute& attr = attr_map->at(attr_name_);
-    ExtractAttribute<T> extract_attr(attr_name_);
-    T* attr_value = extract_attr(attr);
-    for (const auto& checker : value_checkers_) {
-      checker(*attr_value);
-    }
-  }
-
- private:
-  std::string attr_name_;
-  std::vector<ValueChecker> value_checkers_;
-  std::vector<DefaultValueChecker> default_value_setter_;
-};
-
-// check whether op's all attributes fit their own limits
-class OpAttrChecker {
-  typedef std::function<void(AttributeMap*)> AttrChecker;
-
- public:
-  template <typename T>
-  TypedAttrChecker<T>& AddAttrChecker(const std::string& attr_name) {
-    attr_checkers_.push_back(TypedAttrChecker<T>(attr_name));
-    AttrChecker& checker = attr_checkers_.back();
-    return *(checker.target<TypedAttrChecker<T>>());
-  }
-
-  void Check(AttributeMap* attr_map) const {
-    for (const auto& checker : attr_checkers_) {
-      checker(attr_map);
-    }
-  }
-
- private:
-  std::vector<AttrChecker> attr_checkers_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
deleted file mode 100644
index 0b7aaf11746d1931e10ad7e5368d9e053092500e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/block_desc.cc
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/block_desc.h"
-
-#include <queue>
-#include <unordered_set>
-#include <utility>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-
-VarDesc *BlockDesc::Var(const std::string &name) {
-  auto it = vars_.find(name);
-  if (it != vars_.end()) {
-    return it->second.get();
-  }
-  need_update_ = true;
-  auto *var = new VarDesc(name);
-  vars_[name].reset(var);
-  return var;
-}
-
-VarDesc *BlockDesc::FindVar(const std::string &name) const {
-  auto it = vars_.find(name);
-  if (it == vars_.end()) {
-    return nullptr;
-  }
-  return it->second.get();
-}
-
-bool BlockDesc::HasVar(const std::string &name) const {
-  return vars_.find(name) != vars_.end();
-}
-
-VarDesc *BlockDesc::RenameVar(const std::string &old_name,
-                              const std::string &new_name) {
-  if (!this->HasVar(old_name)) {
-    return nullptr;
-  }
-  need_update_ = true;
-  auto *var = this->Var(old_name);
-  VarDesc *new_var = new VarDesc(*(var->Proto()));
-  new_var->SetName(new_name);
-  vars_[new_name].reset(new_var);
-  // rename inputs and outputs
-  for (const auto &op : ops_) {
-    auto *it = op.get();
-    it->Rename(old_name, new_name);
-  }
-  vars_.erase(old_name);
-  return new_var;
-}
-
-VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const {
-  if (name == kEmptyVarName) return nullptr;
-
-  std::queue<const BlockDesc *> frontier;
-  std::unordered_set<const BlockDesc *> visited;
-
-  frontier.push(this);
-
-  while (!frontier.empty()) {  // BFS
-    auto cur = frontier.front();
-    frontier.pop();
-    if (visited.count(cur) != 0) {
-      continue;
-    }
-    auto var = cur->FindVar(name);
-    if (var != nullptr) {
-      return var;
-    }
-
-    auto fwd = cur->ForwardBlock();
-    auto parent = cur->ParentBlock();
-
-    if (fwd != nullptr) {
-      frontier.push(fwd);
-    }
-    if (parent != nullptr) {
-      frontier.push(parent);
-    }
-
-    visited.insert(cur);
-  }
-
-  return nullptr;
-}
-
-VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
-  VarDesc *res = FindVarRecursive(name_bytes);
-  if (res == nullptr) {
-    res = Var(name_bytes);
-  }
-  return *res;
-}
-
-bool BlockDesc::HasVarRecursive(const std::string &name) const {
-  return FindVarRecursive(name) != nullptr;
-}
-
-std::vector<VarDesc *> BlockDesc::AllVars() const {
-  std::vector<VarDesc *> res;
-  for (const auto &p : vars_) {
-    res.push_back(p.second.get());
-  }
-  return res;
-}
-
-OpDesc *BlockDesc::AppendOp() {
-  need_update_ = true;
-  ops_.emplace_back(new OpDesc(this));
-  return ops_.back().get();
-}
-
-void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
-  need_update_ = true;
-  ops_.emplace_back(std::move(op_desc));
-}
-
-OpDesc *BlockDesc::PrependOp() {
-  need_update_ = true;
-  ops_.emplace_front(new OpDesc(this));
-  return ops_.front().get();
-}
-
-void BlockDesc::PrependAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
-  need_update_ = true;
-  ops_.emplace_front(std::move(op_desc));
-}
-
-OpDesc *BlockDesc::InsertOp(size_t index) {
-  need_update_ = true;
-  auto it = ops_.begin() + index;
-  std::unique_ptr<OpDesc> new_op(new OpDesc(this));
-  it = ops_.insert(it, std::move(new_op));
-  return (*it).get();
-}
-
-void BlockDesc::RemoveOp(size_t s, size_t e) {
-  if (ops_.begin() + s >= ops_.end() || ops_.begin() + e > ops_.end()) {
-    return;
-  }
-  need_update_ = true;
-  ops_.erase(ops_.begin() + s, ops_.begin() + e);
-}
-
-void BlockDesc::RemoveOpInternal(const OpDesc *op_desc) {
-  // TODO(minqiyang): make this faster
-  for (auto it = ops_.begin(); it != ops_.end(); ++it) {
-    if (it->get() == op_desc) {
-      ops_.erase(it);
-      break;
-    }
-  }
-}
-
-std::vector<OpDesc *> BlockDesc::AllOps() const {
-  std::vector<OpDesc *> res;
-  for (const auto &op : ops_) {
-    res.push_back(op.get());
-  }
-  return res;
-}
-
-void BlockDesc::Flush() {
-  for (auto &op_desc : ops_) {
-    op_desc->Flush();
-  }
-
-  if (need_update_) {
-    this->desc_->mutable_ops()->Clear();
-    for (auto &op_desc : ops_) {
-      this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto());
-    }
-    this->desc_->mutable_vars()->Clear();
-    for (auto &var_desc : vars_) {
-      this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto());
-    }
-    need_update_ = false;
-  }
-}
-
-BlockDesc *BlockDesc::ParentBlock() const {
-  return prog_->MutableBlock(static_cast<size_t>(desc_->parent_idx()));
-}
-
-proto::BlockDesc *BlockDesc::Proto() {
-  Flush();
-  return desc_;
-}
-
-BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
-    : prog_(prog), desc_(desc), need_update_(false) {
-  for (const proto::VarDesc &var_desc : desc_->vars()) {
-    vars_[var_desc.name()].reset(new VarDesc(var_desc));
-  }
-  for (const proto::OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDesc(op_desc, this));
-  }
-}
-
-BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
-                     ProgramDesc *prog)
-    : prog_(prog), desc_(desc) {
-  need_update_ = true;
-  for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op, this));
-  }
-  for (auto &it : other.vars_) {
-    auto *var = new VarDesc(*it.second);
-    vars_[it.first].reset(var);
-  }
-}
-
-void BlockDesc::SetForwardBlockID(int32_t forward_block_id) {
-  PADDLE_ENFORCE(!desc_->has_forward_block_idx(),
-                 "Parent block ID has been set to %d. Cannot set to %d",
-                 desc_->forward_block_idx(), forward_block_id);
-  desc_->set_forward_block_idx(forward_block_id);
-}
-
-BlockDesc *BlockDesc::ForwardBlock() const {
-  return prog_->MutableBlock(static_cast<size_t>(desc_->forward_block_idx()));
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
deleted file mode 100644
index 5c6e421516269a9b9865605400efa772f944a96f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/block_desc.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <deque>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/proto_desc.h"
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace framework {
-
-class ProgramDesc;
-
-// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
-// read/write speed. Only when we want the protobuf message, the local changes
-// will be synchronized (by `Sync` method).
-
-class BlockDesc {
- public:
-  BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc);
-
-  BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
-
-  int32_t ID() const { return desc_->idx(); }
-
-  int32_t Parent() const { return desc_->parent_idx(); }
-
-  int32_t ForwardBlockID() const { return desc_->forward_block_idx(); }
-
-  VarDesc *Var(const std::string &name_bytes);
-
-  VarDesc *FindVar(const std::string &name_bytes) const;
-
-  bool HasVar(const std::string &var_name) const;
-
-  VarDesc *RenameVar(const std::string &old_name, const std::string &new_name);
-
-  VarDesc *FindVarRecursive(const std::string &name_bytes) const;
-
-  VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes);
-
-  bool HasVarRecursive(const std::string &var_name) const;
-
-  std::set<std::string> LocalVarNames() const {
-    std::set<std::string> var_names;
-    for (auto &var : vars_) {
-      var_names.insert(var.first);
-    }
-    return var_names;
-  }
-
-  std::vector<VarDesc *> AllVars() const;
-
-  BlockDesc *ParentBlock() const;
-
-  BlockDesc *ForwardBlock() const;
-
-  void SetForwardBlockID(int32_t forward_block_id);
-
-  OpDesc *AppendOp();
-
-  void AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc);
-
-  OpDesc *PrependOp();
-
-  void PrependAllocatedOp(std::unique_ptr<OpDesc> &&op_desc);
-
-  OpDesc *InsertOp(size_t index);
-
-  /*
-   * Only remove op itself,
-   * do nothing to its input and output variables
-   */
-  void RemoveOp(size_t s, size_t e);
-
-  void RemoveOpInternal(const OpDesc *op_desc);
-
-  void RemoveVar(const std::string &name) { vars_.erase(name); }
-
-  std::vector<OpDesc *> AllOps() const;
-
-  size_t OpSize() const { return ops_.size(); }
-
-  OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
-
-  void Flush();
-
-  proto::BlockDesc *Proto();
-
-  ProgramDesc *Program() const { return this->prog_; }
-
- private:
-  ProgramDesc *prog_;       // not_own
-  proto::BlockDesc *desc_;  // not_own
-  bool need_update_;
-
-  std::deque<std::unique_ptr<OpDesc>> ops_;
-  std::unordered_map<std::string, std::unique_ptr<VarDesc>> vars_;
-
-  DISABLE_COPY_AND_ASSIGN(BlockDesc);
-};
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/blocking_queue.h b/paddle/fluid/framework/blocking_queue.h
deleted file mode 100644
index 4f35da402f3ec2b0616c29085d01e8b7f3d0d472..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/blocking_queue.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <condition_variable>  // NOLINT
-#include <deque>
-#include <mutex>  // NOLINT
-#include <utility>
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-class BlockingQueue {
- public:
-  void Push(const T &item) {
-    {
-      std::lock_guard<std::mutex> g(mutex_);
-      q_.emplace_back(item);
-    }
-    cv_.notify_one();
-  }
-
-  void Push(T &&item) {
-    {
-      std::lock_guard<std::mutex> g(mutex_);
-      q_.emplace_back(std::move(item));
-    }
-    cv_.notify_one();
-  }
-
-  template <typename U>
-  void Extend(const U &items) {
-    {
-      std::lock_guard<std::mutex> g(mutex_);
-      for (auto &item : items) {
-        q_.emplace_back(item);
-      }
-    }
-    cv_.notify_all();
-  }
-
-  template <typename U>
-  void Extend(U &&items) {
-    {
-      std::lock_guard<std::mutex> g(mutex_);
-      for (auto &item : items) {
-        q_.emplace_back(std::move(item));
-      }
-    }
-    cv_.notify_all();
-  }
-
-  std::deque<T> PopAll(size_t ms, bool *timeout) {
-    auto time =
-        std::chrono::system_clock::now() + std::chrono::milliseconds(ms);
-    std::unique_lock<std::mutex> lock(mutex_);
-    *timeout = !cv_.wait_until(lock, time, [this] { return !q_.empty(); });
-    std::deque<T> ret;
-    if (!*timeout) {
-      std::swap(ret, q_);
-    }
-    return ret;
-  }
-
-  T Pop() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    cv_.wait(lock, [=] { return !q_.empty(); });
-    T rc(std::move(q_.front()));
-    q_.pop_front();
-    return rc;
-  }
-
-  void Pop(T *t) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    cv_.wait(lock, [=] { return !q_.empty(); });
-    *t = std::move(q_.front());
-    q_.pop_front();
-  }
-
-  size_t Size() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return q_.size();
-  }
-
-  void Clear() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    std::deque<T>().swap(q_);
-  }
-
- private:
-  std::mutex mutex_;
-  std::condition_variable cv_;
-  std::deque<T> q_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
deleted file mode 100644
index d186ef1274625827d8e7e0174c6ff8e9475d0dae..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/channel.h
+++ /dev/null
@@ -1,460 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
-
-#include <glog/logging.h>
-#include <algorithm>
-#include <condition_variable>  // NOLINT
-#include <deque>
-#include <limits>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/expect.h"
-
-namespace paddle {
-namespace framework {
-
-template <class T>
-class ChannelObject {
- public:
-  ChannelObject() {}
-
-  // capacity can be zero
-  explicit ChannelObject(size_t capacity) {
-    capacity_ = (std::min)(MaxCapacity(), capacity);
-  }
-
-  void Clear() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    data_.clear();
-    data_.shrink_to_fit();
-  }
-
-  size_t Capacity() {
-    return capacity_;  // atomic
-  }
-
-  void SetCapacity(size_t x) {  // capacity can be zero
-    std::lock_guard<std::mutex> lock(mutex_);
-    capacity_ = std::min(MaxCapacity(), x);
-    Notify();
-  }
-
-  size_t BlockSize() {
-    return block_size_;  // atomic
-  }
-
-  void SetBlockSize(size_t x) {
-    CHECK(x >= 1) << "block size must be >= 1";
-    std::lock_guard<std::mutex> lock(mutex_);
-    block_size_ = x;
-  }
-
-  template <class U>
-  void InheritFrom(const std::shared_ptr<ChannelObject<U>>& other) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    capacity_ = other->Capacity();
-    block_size_ = other->BlockSize();
-  }
-
-  bool Closed() {
-    return closed_;  // atomic
-  }
-
-  // open channel, then data can be write() to channel
-  void Open() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    closed_ = false;
-    Notify();
-  }
-
-  // close channel, then no more data can be write() to channel
-  void Close() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    closed_ = true;
-    Notify();
-  }
-
-  size_t Size() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return data_.size();
-  }
-
-  bool Empty() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return EmptyUnlocked();
-  }
-
-  // blocking operation
-  bool Get(T& val) { return Read(1, &val) != 0; }  // NOLINT
-
-  // blocking operation
-  // returns 0 if the channel is closed and empty
-  size_t Read(size_t n, T* p) {
-    if (n == 0) {
-      return 0;
-    }
-
-    std::unique_lock<std::mutex> lock(mutex_);
-    size_t finished = Read(n, p, lock);
-    Notify();
-    return finished;
-  }
-
-  // blocking operation
-  bool Put(T&& val) { return WriteMove(1, &val) != 0; }
-
-  // blocking operation
-  bool Put(const T& val) { return Write(1, &val) != 0; }
-
-  // blocking operation
-  // returns value less than n if the channel is closed
-  size_t Write(size_t n, const T* p) {
-    if (n == 0) {
-      return 0;
-    }
-    std::unique_lock<std::mutex> lock(mutex_);
-    size_t finished = Write(n, p, lock);
-    Notify();
-    return finished;
-  }
-
-  // WriteMove() will clear original contents of input array
-  size_t WriteMove(size_t n, T* p) {
-    if (n == 0) {
-      return 0;
-    }
-    std::unique_lock<std::mutex> lock(mutex_);
-    size_t finished = WriteMove(n, p, lock);
-    Notify();
-    return finished;
-  }
-
-  // read data of block size from channel to vector
-  size_t Read(std::vector<T>& p) {  // NOLINT
-    p.resize(block_size_);
-    size_t finished = Read(p.size(), &p[0]);
-    p.resize(finished);
-    return finished;
-  }
-
-  size_t ReadAll(std::vector<T>& p) {  // NOLINT
-    p.clear();
-    size_t finished = 0;
-    size_t n = 0;
-    do {
-      // _block_size may change anytime
-      n = block_size_;
-      p.resize(finished + n);
-      n = Read(n, &p[finished]);
-      finished += n;
-    } while (n != 0);
-    p.resize(finished);
-    return finished;
-  }
-
-  // write data from vector to channel
-  size_t Write(const std::vector<T>& p) { return Write(p.size(), &p[0]); }
-
-  // write data from vector to channel
-  size_t Write(std::vector<T>&& p) { return WriteMove(p.size(), &p[0]); }
-
- private:
-  size_t capacity_ = MaxCapacity();
-  size_t block_size_ = 1024;
-  bool closed_ = false;
-  std::mutex mutex_;
-  // use deque to store data
-  std::deque<T> data_;
-  size_t reading_count_ = 0;
-  int empty_waiters_ = 0;
-  int full_waiters_ = 0;
-  std::condition_variable empty_cond_;
-  std::condition_variable full_cond_;
-
-  static constexpr size_t MaxCapacity() {
-    return (std::numeric_limits<size_t>::max)() / 2;
-  }
-
-  void Notify() {
-    if (empty_waiters_ != 0 && (!EmptyUnlocked() || closed_)) {
-      empty_cond_.notify_one();
-    }
-    if (full_waiters_ != 0 && (!FullUnlocked() || closed_)) {
-      full_cond_.notify_one();
-    }
-  }
-
-  bool EmptyUnlocked() { return data_.empty(); }
-
-  bool FullUnlocked() { return data_.size() >= capacity_ + reading_count_; }
-
-  bool WaitForRead(std::unique_lock<std::mutex>& lock) {  // NOLINT
-#ifdef _LINUX
-    while (unlikely(EmptyUnlocked() && !closed_)) {
-#else
-    while (EmptyUnlocked() && !closed_) {
-#endif
-      if (full_waiters_ != 0) {
-        full_cond_.notify_one();
-      }
-      empty_waiters_++;
-      empty_cond_.wait(lock);
-      empty_waiters_--;
-    }
-    return !EmptyUnlocked();
-  }
-
-  bool WaitForWrite(std::unique_lock<std::mutex>& lock) {  // NOLINT
-#ifdef _LINUX
-    while (unlikely(FullUnlocked() && !closed_)) {
-#else
-    while (FullUnlocked() && !closed_) {
-#endif
-      if (empty_waiters_ != 0) {
-        empty_cond_.notify_one();
-      }
-      full_waiters_++;
-      full_cond_.wait(lock);
-      full_waiters_--;
-    }
-    return !closed_;
-  }
-
-  size_t Read(size_t n, T* p, std::unique_lock<std::mutex>& lock) {  // NOLINT
-    size_t finished = 0;
-    CHECK(n <= MaxCapacity() - reading_count_);
-    reading_count_ += n;
-    while (finished < n && WaitForRead(lock)) {
-      size_t m = std::min(n - finished, data_.size());
-      for (size_t i = 0; i < m; i++) {
-        p[finished++] = std::move(data_.front());
-        data_.pop_front();
-      }
-      reading_count_ -= m;
-    }
-    reading_count_ -= n - finished;
-    return finished;
-  }
-
-  size_t Write(size_t n,
-               const T* p,                            // NOLINT
-               std::unique_lock<std::mutex>& lock) {  // NOLINT
-    size_t finished = 0;
-    while (finished < n && WaitForWrite(lock)) {
-      size_t m =
-          std::min(n - finished, capacity_ + reading_count_ - data_.size());
-      for (size_t i = 0; i < m; i++) {
-        data_.push_back(p[finished++]);
-      }
-    }
-    return finished;
-  }
-
-  size_t WriteMove(size_t n,
-                   T* p,                                  // NOLINT
-                   std::unique_lock<std::mutex>& lock) {  // NOLINT
-    size_t finished = 0;
-    while (finished < n && WaitForWrite(lock)) {
-      size_t m =
-          std::min(n - finished, capacity_ + reading_count_ - data_.size());
-      for (size_t i = 0; i < m; i++) {
-        data_.push_back(std::move(p[finished++]));
-      }
-    }
-    return finished;
-  }
-};  // NOLINT
-
-template <class T>
-using Channel = std::shared_ptr<ChannelObject<T>>;
-
-template <class T>
-Channel<T> MakeChannel(size_t capacity = (std::numeric_limits<size_t>::max)()) {
-  return std::make_shared<ChannelObject<T>>(capacity);
-}
-
-template <class T, class U>
-Channel<T> MakeChannel(const Channel<U>& other) {
-  CHECK(other != nullptr) << "channel can not be NULL";
-  Channel<T> chan = std::make_shared<ChannelObject<T>>();
-  chan->InheritFrom(other);
-  return chan;
-}
-
-// NOTE: ChannelReader is a wrapper for quick read channel with a buffer. It
-// will read a block data from channel, but user can get data one by one. So it
-// is important to notice that user must call operator>> until false, or call
-// get_buffer_remain until false to make sure the buffered data all readed.
-template <class T>
-class ChannelReader {
- public:
-  explicit ChannelReader(ChannelObject<T>* channel = nullptr) {
-    Reset(channel);
-  }
-
-  ~ChannelReader() { CHECK(cursor_ == 0) << "Forgot to read buffer data"; }
-
-  ChannelObject<T>* channel() { return channel_; }
-
-  void Reset(ChannelObject<T>* channel) {
-    CHECK(channel != nullptr) << "Channel can not be nullptr";
-    channel_ = channel;
-    cursor_ = 0;
-    failed_ = !channel;
-  }
-
-  // whether there were read failed
-  operator bool() { return !failed_; }
-
-  ChannelReader<T>& operator>>(T& val) {
-    if (failed_) {
-      return *this;
-    }
-    if (cursor_ >= buffer_.size()) {
-      cursor_ = 0;
-      if (channel_->read(buffer_) == 0) {
-        failed_ = true;
-        return *this;
-      }
-    }
-    val = std::move(buffer_[cursor_++]);
-    return *this;
-  }
-
-  bool GetBufferRemain(T& val) {  // NOLINT
-    if (cursor_ >= buffer_.size()) {
-      cursor_ = 0;
-      return false;
-    }
-    val = std::move(buffer_[cursor_++]);
-    return true;
-  }
-
- private:
-  ChannelObject<T>* channel_ = nullptr;
-  std::vector<T> buffer_;
-  size_t cursor_ = 0;
-  bool failed_ = true;
-};  // NOLINT
-
-template <class T>
-class ChannelWriter {
- public:
-  explicit ChannelWriter(ChannelObject<T>* channel = nullptr) {
-    Reset(channel);
-  }
-
-  ~ChannelWriter() { CHECK(buffer_.empty()) << "Forgot to flush"; }
-
-  ChannelObject<T>* channel() { return channel_; }
-
-  void Reset(ChannelObject<T>* channel) {
-    CHECK(buffer_.empty()) << "Forgot to flush";
-    //    CHECK(channel != nullptr) << "Channel can not be nullptr";
-    channel_ = channel;
-    buffer_.clear();
-    failed_ = !channel;
-  }
-
-  // whether there were write failed
-  operator bool() { return !failed_; }
-
-  ChannelWriter<T>& operator<<(T&& val) {
-    if (failed_) {
-      return *this;
-    }
-    buffer_.push_back(std::move(val));
-    if (buffer_.size() >= channel_->BlockSize()) {
-      Flush();
-    }
-    return *this;
-  }
-
-  ChannelWriter<T>& operator<<(const T& val) {
-    if (failed_) {
-      return *this;
-    }
-    buffer_.push_back(val);
-    if (buffer_.size() >= channel_->BlockSize()) {
-      Flush();
-    }
-    return *this;
-  }
-
-  void Flush() {
-    if (failed_ || buffer_.empty()) {
-      buffer_.clear();
-      return;
-    }
-    failed_ |=
-        channel_->WriteMove(buffer_.size(), &buffer_[0]) != buffer_.size();
-    buffer_.clear();
-  }
-
- private:
-  ChannelObject<T>* channel_ = nullptr;
-  std::vector<T> buffer_;
-  bool failed_ = true;
-};  // NOLINT
-
-// only used for range-for loop
-// for (auto& x : chan) {...}
-template <class T>
-struct ChannelIterator {
-  std::shared_ptr<ChannelReader<T>> reader_;
-  T data_;
-
-  void operator++() {
-    CHECK(reader_ != nullptr) << "reader can not be NULL";
-    if (!(*reader_ >> data_)) {
-      reader_ = nullptr;
-    }
-  }
-
-  T& operator*() { return data_; }
-
-  friend bool operator==(const ChannelIterator<T>& a,
-                         const ChannelIterator<T>& b) {
-    return a.reader_ == b.reader_;
-  }
-
-  friend bool operator!=(const ChannelIterator<T>& a,
-                         const ChannelIterator<T>& b) {
-    return a.reader_ != b.reader_;
-  }
-};  // NOLINT
-
-template <class T>
-ChannelIterator<T> begin(ChannelObject<T>* chan) {
-  ChannelIterator<T> it{std::make_shared<ChannelReader<T>>(chan), T()};
-  ++it;
-  return it;
-}
-
-template <class T>
-ChannelIterator<T> end(ChannelObject<T>* chan) {
-  return {nullptr, T()};
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/commit.h.in b/paddle/fluid/framework/commit.h.in
deleted file mode 100644
index 3a33ece624443a99083ae29abb70254a5ac40a3d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/commit.h.in
+++ /dev/null
@@ -1,21 +0,0 @@
-#pragma once
-
-#include <string>
-
-namespace paddle {
-namespace framework {
-
-static std::string paddle_commit() {
-  return "@PADDLE_COMMIT@";
-}
-
-static std::string paddle_compile_branch() {
-  return "@PADDLE_BRANCH@";
-}
-
-static std::string paddle_version() {
-  return "@PADDLE_VERSION@";
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
deleted file mode 100644
index fee6ba40047053ed5662fe044eceb0c687bd4db9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_device_transform.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_device_transform.h"
-
-namespace paddle {
-namespace framework {
-
-void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
-                     Tensor *out) {
-  VLOG(3) << "DeviceTransform in, src_place " << in.place()
-          << " dst_place: " << dst_place;
-
-  PADDLE_ENFORCE_NE(
-      in.place().which(), dst_place.which(),
-      "Currently, model parallelism is only supported between CPU and CUDA");
-
-  // NOTE(yy): TransDataDevice should wait for computation of input.
-  platform::DeviceContextPool::Instance().Get(in.place())->Wait();
-  platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
-
-  // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and
-  // the enforced checkings have been done in GetDeviceContext, so the
-  // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program
-  // slow, especially when the number of elements is little, for example,
-  // the elements of learning rate are one and it's CPU side.
-  // One solution is to use a CUDA kernel to complete the copy operation when
-  // the transforming is from CPU to GPU and the number of elements is little.
-  // But the embarrassment is that this solution this solution makes training
-  // slower.
-  TensorCopySync(in, dst_place, out);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h
deleted file mode 100644
index 8ff97646cfce7979b1c9c570e6de4f1bd26916c3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_device_transform.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-
-void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
-                     Tensor* out);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
deleted file mode 100644
index 96a2f9250ff928fe58a5339a25c68c9db515522d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/init.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-struct AddFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
-};
-
-class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("input", "input1 of test op");
-    AddOutput("output", "output of test op");
-    AddAttr<bool>("use_gpu", "force to use gpu kernel").SetDefault(false);
-    AddComment("This is test op");
-  }
-};
-
-class TestOpWithKernel : public OperatorWithKernel {
- public:
-  using OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-  OpKernelType GetExpectedKernelType(
-      const ExecutionContext& ctx) const override {
-    if (Attr<bool>("use_gpu")) {
-      VLOG(3) << "force use gpu kernel";
-      return OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0));
-    } else {
-      VLOG(3) << "use default kernel";
-      return OpKernelType(proto::VarType::FP32,
-                          ctx.Input<Tensor>("input")->place());
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TestKernel : public OpKernel<float> {
- public:
-  void Compute(const ExecutionContext& ctx) const {
-    std::cout << ctx.op().DebugString() << std::endl;
-
-    const Tensor* input = ctx.Input<Tensor>("input");
-
-    std::cout << "input place:" << input->place() << std::endl;
-    auto* output = ctx.Output<framework::LoDTensor>("output");
-    output->Resize(input->dims());
-    output->mutable_data<T>(ctx.GetPlace());
-
-    operators::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
-        input, input, output, ctx.template device_context<DeviceContext>(),
-        AddFunctor<T>());
-    functor.Run();
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_OP_WITHOUT_GRADIENT(
-    test_op, paddle::framework::TestOpWithKernel,
-    paddle::framework::OpKernelTestProtoAndCheckerMaker);
-REGISTER_OP_CPU_KERNEL(
-    test_op,
-    paddle::framework::TestKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    test_op,
-    paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
-
-static void BuildVar(const std::string& param_name,
-                     std::initializer_list<const char*> arguments,
-                     paddle::framework::proto::OpDesc::Var* var) {
-  var->set_parameter(param_name);
-  for (auto& arg_name : arguments) {
-    *var->mutable_arguments()->Add() = arg_name;
-  }
-}
-
-TEST(Operator, CPUtoGPU) {
-  paddle::framework::InitDevices(true);
-
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace cpu_place;
-
-  // create an op to run on CPU
-  paddle::framework::proto::OpDesc cpu_op_desc;
-  cpu_op_desc.set_type("test_op");
-  BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs());
-  BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs());
-
-  auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
-  // prepare input
-  auto* in_t = scope.Var("IN1")->GetMutable<paddle::framework::LoDTensor>();
-  auto* src_ptr =
-      in_t->mutable_data<float>({2, 3}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 2 * 3; ++i) {
-    src_ptr[i] = static_cast<float>(i);
-  }
-
-  // get output
-  auto* output = scope.Var("OUT1");
-  cpu_op->Run(scope, cpu_place);
-
-  auto* output_ptr = output->Get<paddle::framework::LoDTensor>().data<float>();
-  for (int i = 0; i < 2 * 3; ++i) {
-    ASSERT_EQ(output_ptr[i], static_cast<float>(i) * 2);
-  }
-
-  // create an op to run on GPU
-  paddle::framework::proto::OpDesc gpu_op_desc;
-  gpu_op_desc.set_type("test_op");
-  BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs());
-  BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs());
-
-  auto attr = gpu_op_desc.mutable_attrs()->Add();
-  attr->set_name("use_gpu");
-  attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
-  attr->set_b(true);
-
-  auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc);
-
-  paddle::platform::CUDAPlace cuda_place(0);
-  // get output
-  auto* output2 = scope.Var("OUT2");
-  gpu_op->Run(scope, cuda_place);
-  VLOG(3) << "after gpu_op run";
-
-  // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
-  paddle::platform::DeviceContextPool& pool =
-      paddle::platform::DeviceContextPool::Instance();
-  auto dev_ctx = pool.Get(cuda_place);
-
-  paddle::framework::Tensor output_tensor;
-  paddle::framework::TensorCopy(output2->Get<paddle::framework::LoDTensor>(),
-                                paddle::platform::CPUPlace(), *dev_ctx,
-                                &output_tensor);
-
-  dev_ctx->Wait();
-  float* output2_ptr = output_tensor.data<float>();
-  for (int i = 0; i < 2 * 3; ++i) {
-    ASSERT_EQ(output2_ptr[i], static_cast<float>(i) * 4);
-  }
-}
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
deleted file mode 100644
index bfeb29778efd6811ebcd30ca099281b45d01005c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_feed.cc
+++ /dev/null
@@ -1,1182 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
-
-#include "paddle/fluid/framework/data_feed.h"
-#ifdef _LINUX
-#include <stdio_ext.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#endif
-#include <utility>
-#include "gflags/gflags.h"
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/message.h"
-#include "google/protobuf/text_format.h"
-#include "io/fs.h"
-#include "io/shell.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
-#include "paddle/fluid/platform/timer.h"
-
-namespace paddle {
-namespace framework {
-
-void RecordCandidateList::ReSize(size_t length) {
-  _mutex.lock();
-  _capacity = length;
-  CHECK(_capacity > 0);  // NOLINT
-  _candidate_list.clear();
-  _candidate_list.resize(_capacity);
-  _full = false;
-  _cur_size = 0;
-  _total_size = 0;
-  _mutex.unlock();
-}
-
-void RecordCandidateList::ReInit() {
-  _mutex.lock();
-  _full = false;
-  _cur_size = 0;
-  _total_size = 0;
-  _mutex.unlock();
-}
-
-void RecordCandidateList::AddAndGet(const Record& record,
-                                    RecordCandidate* result) {
-  _mutex.lock();
-  size_t index = 0;
-  ++_total_size;
-  auto fleet_ptr = FleetWrapper::GetInstance();
-  if (!_full) {
-    _candidate_list[_cur_size++] = record;
-    _full = (_cur_size == _capacity);
-  } else {
-    CHECK(_cur_size == _capacity);
-    index = fleet_ptr->LocalRandomEngine()() % _total_size;
-    if (index < _capacity) {
-      _candidate_list[index] = record;
-    }
-  }
-  index = fleet_ptr->LocalRandomEngine()() % _cur_size;
-  *result = _candidate_list[index];
-  _mutex.unlock();
-}
-
-void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
-  CheckInit();
-  for (size_t i = 0; i < use_slots_.size(); ++i) {
-    if (name == use_slots_[i]) {
-      if (var == nullptr) {
-        feed_vec_[i] = nullptr;
-      } else {
-        feed_vec_[i] = var->GetMutable<LoDTensor>();
-      }
-    }
-  }
-}
-
-bool DataFeed::SetFileList(const std::vector<std::string>& files) {
-  std::unique_lock<std::mutex> lock(*mutex_for_pick_file_);
-  CheckInit();
-  // Do not set finish_set_filelist_ flag,
-  // since a user may set file many times after init reader
-  filelist_.assign(files.begin(), files.end());
-
-  finish_set_filelist_ = true;
-  return true;
-}
-
-void DataFeed::SetBatchSize(int batch_size) {
-  PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size);
-  default_batch_size_ = batch_size;
-}
-
-bool DataFeed::PickOneFile(std::string* filename) {
-  PADDLE_ENFORCE(mutex_for_pick_file_ != nullptr,
-                 "should call SetFileListMutex before PickOneFile");
-  PADDLE_ENFORCE(file_idx_ != nullptr,
-                 "should call SetFileListIndex before PickOneFile");
-  std::unique_lock<std::mutex> lock(*mutex_for_pick_file_);
-  if (*file_idx_ == filelist_.size()) {
-    VLOG(3) << "DataFeed::PickOneFile no more file to pick";
-    return false;
-  }
-  VLOG(3) << "file_idx_=" << *file_idx_;
-  *filename = filelist_[(*file_idx_)++];
-  return true;
-}
-
-void DataFeed::CheckInit() {
-  PADDLE_ENFORCE(finish_init_, "Initialization did not succeed.");
-}
-
-void DataFeed::CheckSetFileList() {
-  PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed.");
-}
-
-void DataFeed::CheckStart() {
-  PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet.");
-}
-
-void DataFeed::AssignFeedVar(const Scope& scope) {
-  CheckInit();
-  for (size_t i = 0; i < use_slots_.size(); ++i) {
-    feed_vec_[i] = scope.FindVar(use_slots_[i])->GetMutable<LoDTensor>();
-  }
-}
-
-void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
-  if (platform::is_cpu_place(this->place_)) {
-    memcpy(dst, src, size);
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
-#else
-    PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
-#endif
-  }
-}
-
-template <typename T>
-void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
-  PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
-  queue_size_ = queue_size;
-  queue_ = paddle::framework::MakeChannel<T>();
-  queue_->SetCapacity(queue_size);
-}
-
-template <typename T>
-bool PrivateQueueDataFeed<T>::Start() {
-  CheckSetFileList();
-  read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
-  read_thread_.detach();
-
-  finish_start_ = true;
-  return true;
-}
-
-template <typename T>
-void PrivateQueueDataFeed<T>::ReadThread() {
-#ifdef _LINUX
-  std::string filename;
-  while (PickOneFile(&filename)) {
-    int err_no = 0;
-    fp_ = fs_open_read(filename, &err_no, pipe_command_);
-    __fsetlocking(&*fp_, FSETLOCKING_BYCALLER);
-    T instance;
-    while (ParseOneInstanceFromPipe(&instance)) {
-      queue_->Put(instance);
-    }
-  }
-  queue_->Close();
-#endif
-}
-
-template <typename T>
-int PrivateQueueDataFeed<T>::Next() {
-#ifdef _LINUX
-  CheckStart();
-  int index = 0;
-  T ins_vec;
-  while (index < default_batch_size_) {
-    T instance;
-    if (!queue_->Get(instance)) {
-      break;
-    }
-    AddInstanceToInsVec(&ins_vec, instance, index++);
-  }
-  batch_size_ = index;
-  if (batch_size_ != 0) {
-    PutToFeedVec(ins_vec);
-  }
-  return batch_size_;
-#else
-  return 0;
-#endif
-}
-
-// explicit instantiation
-template class PrivateQueueDataFeed<std::vector<MultiSlotType>>;
-
-template <typename T>
-InMemoryDataFeed<T>::InMemoryDataFeed() {
-  this->file_idx_ = nullptr;
-  this->mutex_for_pick_file_ = nullptr;
-  this->fp_ = nullptr;
-  this->thread_id_ = 0;
-  this->thread_num_ = 1;
-  this->parse_ins_id_ = false;
-  this->parse_content_ = false;
-  this->input_channel_ = nullptr;
-  this->output_channel_ = nullptr;
-  this->consume_channel_ = nullptr;
-}
-
-template <typename T>
-bool InMemoryDataFeed<T>::Start() {
-#ifdef _LINUX
-  this->CheckSetFileList();
-  if (output_channel_->Size() == 0 && input_channel_->Size() != 0) {
-    std::vector<T> data;
-    input_channel_->Read(data);
-    output_channel_->Write(std::move(data));
-  }
-#endif
-  this->finish_start_ = true;
-  return true;
-}
-
-template <typename T>
-int InMemoryDataFeed<T>::Next() {
-#ifdef _LINUX
-  this->CheckStart();
-  CHECK(output_channel_ != nullptr);
-  CHECK(consume_channel_ != nullptr);
-  VLOG(3) << "output_channel_ size=" << output_channel_->Size()
-          << ", consume_channel_ size=" << consume_channel_->Size()
-          << ", thread_id=" << thread_id_;
-  int index = 0;
-  T instance;
-  std::vector<T> ins_vec;
-  ins_vec.reserve(this->default_batch_size_);
-  while (index < this->default_batch_size_) {
-    if (output_channel_->Size() == 0) {
-      break;
-    }
-    output_channel_->Get(instance);
-    ins_vec.push_back(instance);
-    ++index;
-    consume_channel_->Put(std::move(instance));
-  }
-  this->batch_size_ = index;
-  VLOG(3) << "batch_size_=" << this->batch_size_
-          << ", thread_id=" << thread_id_;
-  if (this->batch_size_ != 0) {
-    PutToFeedVec(ins_vec);
-  } else {
-    VLOG(3) << "finish reading, output_channel_ size="
-            << output_channel_->Size()
-            << ", consume_channel_ size=" << consume_channel_->Size()
-            << ", thread_id=" << thread_id_;
-  }
-  return this->batch_size_;
-#else
-  return 0;
-#endif
-}
-
-template <typename T>
-void InMemoryDataFeed<T>::SetInputChannel(void* channel) {
-  input_channel_ = static_cast<paddle::framework::ChannelObject<T>*>(channel);
-}
-
-template <typename T>
-void InMemoryDataFeed<T>::SetOutputChannel(void* channel) {
-  output_channel_ = static_cast<paddle::framework::ChannelObject<T>*>(channel);
-}
-
-template <typename T>
-void InMemoryDataFeed<T>::SetConsumeChannel(void* channel) {
-  consume_channel_ = static_cast<paddle::framework::ChannelObject<T>*>(channel);
-}
-
-template <typename T>
-void InMemoryDataFeed<T>::SetThreadId(int thread_id) {
-  thread_id_ = thread_id;
-}
-
-template <typename T>
-void InMemoryDataFeed<T>::SetThreadNum(int thread_num) {
-  thread_num_ = thread_num;
-}
-
-template <typename T>
-void InMemoryDataFeed<T>::SetParseContent(bool parse_content) {
-  parse_content_ = parse_content;
-}
-
-template <typename T>
-void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
-  parse_ins_id_ = parse_ins_id;
-}
-
-template <typename T>
-void InMemoryDataFeed<T>::LoadIntoMemory() {
-#ifdef _LINUX
-  VLOG(3) << "LoadIntoMemory() begin, thread_id=" << thread_id_;
-  std::string filename;
-  while (this->PickOneFile(&filename)) {
-    VLOG(3) << "PickOneFile, filename=" << filename
-            << ", thread_id=" << thread_id_;
-    int err_no = 0;
-    this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_);
-    CHECK(this->fp_ != nullptr);
-    __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER);
-    paddle::framework::ChannelWriter<T> writer(input_channel_);
-    T instance;
-    platform::Timer timeline;
-    timeline.Start();
-    while (ParseOneInstanceFromPipe(&instance)) {
-      writer << std::move(instance);
-      instance = T();
-    }
-    writer.Flush();
-    timeline.Pause();
-    VLOG(3) << "LoadIntoMemory() read all lines, file=" << filename
-            << ", cost time=" << timeline.ElapsedSec()
-            << " seconds, thread_id=" << thread_id_;
-  }
-  VLOG(3) << "LoadIntoMemory() end, thread_id=" << thread_id_;
-#endif
-}
-
-// explicit instantiation
-template class InMemoryDataFeed<Record>;
-
-void MultiSlotDataFeed::Init(
-    const paddle::framework::DataFeedDesc& data_feed_desc) {
-  finish_init_ = false;
-  finish_set_filelist_ = false;
-  finish_start_ = false;
-
-  PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(),
-                 "Multi_slot_desc has not been set.");
-  paddle::framework::MultiSlotDesc multi_slot_desc =
-      data_feed_desc.multi_slot_desc();
-  SetBatchSize(data_feed_desc.batch_size());
-  // temporarily set queue size = batch size * 100
-  SetQueueSize(data_feed_desc.batch_size() * 100);
-  size_t all_slot_num = multi_slot_desc.slots_size();
-  all_slots_.resize(all_slot_num);
-  all_slots_type_.resize(all_slot_num);
-  use_slots_index_.resize(all_slot_num);
-  total_dims_without_inductive_.resize(all_slot_num);
-  inductive_shape_index_.resize(all_slot_num);
-  use_slots_.clear();
-  use_slots_is_dense_.clear();
-  for (size_t i = 0; i < all_slot_num; ++i) {
-    const auto& slot = multi_slot_desc.slots(i);
-    all_slots_[i] = slot.name();
-    all_slots_type_[i] = slot.type();
-    use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
-    total_dims_without_inductive_[i] = 1;
-    inductive_shape_index_[i] = -1;
-    if (slot.is_used()) {
-      use_slots_.push_back(all_slots_[i]);
-      use_slots_is_dense_.push_back(slot.is_dense());
-      std::vector<int> local_shape;
-      if (slot.is_dense()) {
-        for (size_t j = 0; j < slot.shape_size(); ++j) {
-          if (slot.shape(j) > 0) {
-            total_dims_without_inductive_[i] *= slot.shape(j);
-          }
-          if (slot.shape(j) == -1) {
-            inductive_shape_index_[i] = j;
-          }
-        }
-      }
-      for (size_t j = 0; j < slot.shape_size(); ++j) {
-        local_shape.push_back(slot.shape(j));
-      }
-      use_slots_shape_.push_back(local_shape);
-    }
-  }
-  feed_vec_.resize(use_slots_.size());
-  pipe_command_ = data_feed_desc.pipe_command();
-  finish_init_ = true;
-}
-
-void MultiSlotDataFeed::ReadThread() {
-#ifdef _LINUX
-  std::string filename;
-  while (PickOneFile(&filename)) {
-    int err_no = 0;
-    fp_ = fs_open_read(filename, &err_no, pipe_command_);
-    CHECK(fp_ != nullptr);
-    __fsetlocking(&*fp_, FSETLOCKING_BYCALLER);
-    std::vector<MultiSlotType> instance;
-    int ins_num = 0;
-    while (ParseOneInstanceFromPipe(&instance)) {
-      ins_num++;
-      queue_->Put(instance);
-    }
-    VLOG(3) << "filename: " << filename << " inst num: " << ins_num;
-  }
-  queue_->Close();
-#endif
-}
-
-bool MultiSlotDataFeed::CheckFile(const char* filename) {
-#ifdef _LINUX
-  CheckInit();  // get info of slots
-  std::ifstream fin(filename);
-  if (!fin.good()) {
-    VLOG(1) << "error: open file<" << filename << "> fail";
-    return false;
-  }
-  std::string line;
-  int instance_cout = 0;
-  std::string all_slots_alias = "";
-  for (const auto& alias : all_slots_) {
-    all_slots_alias += alias + " ";
-  }
-  std::string use_slots_alias = "";
-  for (const auto& alias : use_slots_) {
-    use_slots_alias += alias + " ";
-  }
-  VLOG(3) << "total slots num: " << all_slots_.size();
-  VLOG(3) << "total slots alias: " << all_slots_alias;
-  VLOG(3) << "used slots num: " << use_slots_.size();
-  VLOG(3) << "used slots alias: " << use_slots_alias;
-  while (getline(fin, line)) {
-    ++instance_cout;
-    const char* str = line.c_str();
-    char* endptr = const_cast<char*>(str);
-    int len = line.length();
-    for (size_t i = 0; i < all_slots_.size(); ++i) {
-      auto num = strtol(endptr, &endptr, 10);
-      if (num < 0) {
-        VLOG(0) << "error: the number of ids is a negative number: " << num;
-        VLOG(0) << "please check line<" << instance_cout << "> in file<"
-                << filename << ">";
-        return false;
-      } else if (num == 0) {
-        VLOG(0)
-            << "error: the number of ids can not be zero, you need "
-               "padding it in data generator; or if there is something wrong"
-               " with the data, please check if the data contains unresolvable "
-               "characters.";
-        VLOG(0) << "please check line<" << instance_cout << "> in file<"
-                << filename << ">";
-        return false;
-      } else if (errno == ERANGE || num > INT_MAX) {
-        VLOG(0) << "error: the number of ids greater than INT_MAX";
-        VLOG(0) << "please check line<" << instance_cout << "> in file<"
-                << filename << ">";
-        return false;
-      }
-      if (all_slots_type_[i] == "float") {
-        for (int i = 0; i < num; ++i) {
-          strtof(endptr, &endptr);
-          if (errno == ERANGE) {
-            VLOG(0) << "error: the value is out of the range of "
-                       "representable values for float";
-            VLOG(0) << "please check line<" << instance_cout << "> in file<"
-                    << filename << ">";
-            return false;
-          }
-          if (i + 1 != num && endptr - str == len) {
-            VLOG(0) << "error: there is a wrong with the number of ids.";
-            VLOG(0) << "please check line<" << instance_cout << "> in file<"
-                    << filename << ">";
-            return false;
-          }
-        }
-      } else if (all_slots_type_[i] == "uint64") {
-        for (int i = 0; i < num; ++i) {
-          strtoull(endptr, &endptr, 10);
-          if (errno == ERANGE) {
-            VLOG(0) << "error: the value is out of the range of "
-                       "representable values for uint64_t";
-            VLOG(0) << "please check line<" << instance_cout << "> in file<"
-                    << filename << ">";
-            return false;
-          }
-          if (i + 1 != num && endptr - str == len) {
-            VLOG(0) << "error: there is a wrong with the number of ids.";
-            VLOG(0) << "please check line<" << instance_cout << "> in file<"
-                    << filename << ">";
-            return false;
-          }
-        }
-      } else {
-        VLOG(0) << "error: this type<" << all_slots_type_[i]
-                << "> is not supported";
-        return false;
-      }
-    }
-    // It may be added '\t' character to the end of the output of reduce
-    // task when processes data by Hadoop(when the output of the reduce
-    // task of Hadoop has only one field, it will add a '\t' at the end
-    // of the line by default, and you can use this option to avoid it:
-    // `-D mapred.textoutputformat.ignoreseparator=true`), which does
-    // not affect the correctness of the data. Therefore, it should be
-    // judged that the data is not normal when the end of each line of
-    // data contains characters which are not spaces.
-    while (endptr - str != len) {
-      if (!isspace(*(endptr++))) {
-        VLOG(0)
-            << "error: there is some extra characters at the end of the line.";
-        VLOG(0) << "please check line<" << instance_cout << "> in file<"
-                << filename << ">";
-        return false;
-      }
-    }
-  }
-  VLOG(3) << "instances cout: " << instance_cout;
-  VLOG(3) << "The file format is correct";
-#endif
-  return true;
-}
-
-bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
-    std::vector<MultiSlotType>* instance) {
-#ifdef _LINUX
-  thread_local string::LineFileReader reader;
-
-  if (!reader.getline(&*(fp_.get()))) {
-    return false;
-  } else {
-    int use_slots_num = use_slots_.size();
-    instance->resize(use_slots_num);
-
-    const char* str = reader.get();
-    std::string line = std::string(str);
-    // VLOG(3) << line;
-    char* endptr = const_cast<char*>(str);
-    int pos = 0;
-    for (size_t i = 0; i < use_slots_index_.size(); ++i) {
-      int idx = use_slots_index_[i];
-      int num = strtol(&str[pos], &endptr, 10);
-      PADDLE_ENFORCE(
-          num,
-          "The number of ids can not be zero, you need padding "
-          "it in data generator; or if there is something wrong with "
-          "the data, please check if the data contains unresolvable "
-          "characters.\nplease check this error line: %s",
-          str);
-      if (idx != -1) {
-        (*instance)[idx].Init(all_slots_type_[i]);
-        if ((*instance)[idx].GetType()[0] == 'f') {  // float
-          for (int j = 0; j < num; ++j) {
-            float feasign = strtof(endptr, &endptr);
-            (*instance)[idx].AddValue(feasign);
-          }
-        } else if ((*instance)[idx].GetType()[0] == 'u') {  // uint64
-          for (int j = 0; j < num; ++j) {
-            uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
-            (*instance)[idx].AddValue(feasign);
-          }
-        }
-        pos = endptr - str;
-      } else {
-        for (int j = 0; j <= num; ++j) {
-          // pos = line.find_first_of(' ', pos + 1);
-          while (line[pos + 1] != ' ') {
-            pos++;
-          }
-        }
-      }
-    }
-    return true;
-  }
-#else
-  return true;
-#endif
-}
-
-bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
-#ifdef _LINUX
-  std::string line;
-  if (getline(file_, line)) {
-    int use_slots_num = use_slots_.size();
-    instance->resize(use_slots_num);
-    // parse line
-    const char* str = line.c_str();
-    char* endptr = const_cast<char*>(str);
-    int pos = 0;
-    for (size_t i = 0; i < use_slots_index_.size(); ++i) {
-      int idx = use_slots_index_[i];
-      int num = strtol(&str[pos], &endptr, 10);
-      PADDLE_ENFORCE(
-          num,
-          "The number of ids can not be zero, you need padding "
-          "it in data generator; or if there is something wrong with "
-          "the data, please check if the data contains unresolvable "
-          "characters.\nplease check this error line: %s",
-          str);
-
-      if (idx != -1) {
-        (*instance)[idx].Init(all_slots_type_[i]);
-        if ((*instance)[idx].GetType()[0] == 'f') {  // float
-          for (int j = 0; j < num; ++j) {
-            float feasign = strtof(endptr, &endptr);
-            (*instance)[idx].AddValue(feasign);
-          }
-        } else if ((*instance)[idx].GetType()[0] == 'u') {  // uint64
-          for (int j = 0; j < num; ++j) {
-            uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
-            (*instance)[idx].AddValue(feasign);
-          }
-        }
-        pos = endptr - str;
-      } else {
-        for (int j = 0; j <= num; ++j) {
-          pos = line.find_first_of(' ', pos + 1);
-        }
-      }
-    }
-  } else {
-    return false;
-  }
-#endif
-  return false;
-}
-
-void MultiSlotDataFeed::AddInstanceToInsVec(
-    std::vector<MultiSlotType>* ins_vec,
-    const std::vector<MultiSlotType>& instance, int index) {
-#ifdef _LINUX
-  if (index == 0) {
-    ins_vec->resize(instance.size());
-    for (size_t i = 0; i < instance.size(); ++i) {
-      (*ins_vec)[i].Init(instance[i].GetType());
-      (*ins_vec)[i].InitOffset();
-    }
-  }
-
-  for (size_t i = 0; i < instance.size(); ++i) {
-    (*ins_vec)[i].AddIns(instance[i]);
-  }
-#endif
-}
-
-void MultiSlotDataFeed::PutToFeedVec(
-    const std::vector<MultiSlotType>& ins_vec) {
-#ifdef _LINUX
-  for (size_t i = 0; i < use_slots_.size(); ++i) {
-    if (feed_vec_[i] == nullptr) {
-      continue;
-    }
-    const auto& type = ins_vec[i].GetType();
-    const auto& offset = ins_vec[i].GetOffset();
-    int total_instance = static_cast<int>(offset.back());
-
-    if (type[0] == 'f') {  // float
-      const auto& feasign = ins_vec[i].GetFloatData();
-      float* tensor_ptr =
-          feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
-    } else if (type[0] == 'u') {  // uint64
-      // no uint64_t type in paddlepaddle
-      const auto& feasign = ins_vec[i].GetUint64Data();
-      int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
-          {total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, &feasign[0],
-                       total_instance * sizeof(int64_t));
-    }
-
-    LoD data_lod{offset};
-    feed_vec_[i]->set_lod(data_lod);
-    if (use_slots_is_dense_[i]) {
-      if (inductive_shape_index_[i] != -1) {
-        use_slots_shape_[i][inductive_shape_index_[i]] =
-            total_instance / total_dims_without_inductive_[i];
-      }
-      feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
-    }
-  }
-#endif
-}
-
-void MultiSlotInMemoryDataFeed::Init(
-    const paddle::framework::DataFeedDesc& data_feed_desc) {
-  finish_init_ = false;
-  finish_set_filelist_ = false;
-  finish_start_ = false;
-
-  PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(),
-                 "Multi_slot_desc has not been set.");
-  paddle::framework::MultiSlotDesc multi_slot_desc =
-      data_feed_desc.multi_slot_desc();
-  SetBatchSize(data_feed_desc.batch_size());
-  size_t all_slot_num = multi_slot_desc.slots_size();
-  all_slots_.resize(all_slot_num);
-  all_slots_type_.resize(all_slot_num);
-  use_slots_index_.resize(all_slot_num);
-  total_dims_without_inductive_.resize(all_slot_num);
-  inductive_shape_index_.resize(all_slot_num);
-  use_slots_.clear();
-  use_slots_is_dense_.clear();
-  for (size_t i = 0; i < all_slot_num; ++i) {
-    const auto& slot = multi_slot_desc.slots(i);
-    all_slots_[i] = slot.name();
-    all_slots_type_[i] = slot.type();
-    use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
-    total_dims_without_inductive_[i] = 1;
-    inductive_shape_index_[i] = -1;
-    if (slot.is_used()) {
-      use_slots_.push_back(all_slots_[i]);
-      use_slots_is_dense_.push_back(slot.is_dense());
-      std::vector<int> local_shape;
-      if (slot.is_dense()) {
-        for (size_t j = 0; j < slot.shape_size(); ++j) {
-          if (slot.shape(j) > 0) {
-            total_dims_without_inductive_[i] *= slot.shape(j);
-          }
-          if (slot.shape(j) == -1) {
-            inductive_shape_index_[i] = j;
-          }
-        }
-      }
-      for (size_t j = 0; j < slot.shape_size(); ++j) {
-        local_shape.push_back(slot.shape(j));
-      }
-      use_slots_shape_.push_back(local_shape);
-    }
-  }
-  feed_vec_.resize(use_slots_.size());
-  pipe_command_ = data_feed_desc.pipe_command();
-  finish_init_ = true;
-}
-
-bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
-#ifdef _LINUX
-  thread_local string::LineFileReader reader;
-
-  if (!reader.getline(&*(fp_.get()))) {
-    return false;
-  } else {
-    const char* str = reader.get();
-    std::string line = std::string(str);
-    // VLOG(3) << line;
-    char* endptr = const_cast<char*>(str);
-    int pos = 0;
-    if (parse_ins_id_) {
-      int num = strtol(&str[pos], &endptr, 10);
-      CHECK(num == 1);  // NOLINT
-      pos = endptr - str + 1;
-      size_t len = 0;
-      while (str[pos + len] != ' ') {
-        ++len;
-      }
-      instance->ins_id_ = std::string(str + pos, len);
-      pos += len + 1;
-      VLOG(3) << "ins_id " << instance->ins_id_;
-    }
-    if (parse_content_) {
-      int num = strtol(&str[pos], &endptr, 10);
-      CHECK(num == 1);  // NOLINT
-      pos = endptr - str + 1;
-      size_t len = 0;
-      while (str[pos + len] != ' ') {
-        ++len;
-      }
-      instance->content_ = std::string(str + pos, len);
-      pos += len + 1;
-      VLOG(3) << "content " << instance->content_;
-    }
-    for (size_t i = 0; i < use_slots_index_.size(); ++i) {
-      int idx = use_slots_index_[i];
-      int num = strtol(&str[pos], &endptr, 10);
-      PADDLE_ENFORCE(
-          num,
-          "The number of ids can not be zero, you need padding "
-          "it in data generator; or if there is something wrong with "
-          "the data, please check if the data contains unresolvable "
-          "characters.\nplease check this error line: %s",
-          str);
-      if (idx != -1) {
-        if (all_slots_type_[i][0] == 'f') {  // float
-          for (int j = 0; j < num; ++j) {
-            float feasign = strtof(endptr, &endptr);
-            // if float feasign is equal to zero, ignore it
-            // except when slot is dense
-            if (fabs(feasign) < 1e-6 && !use_slots_is_dense_[i]) {
-              continue;
-            }
-            FeatureKey f;
-            f.float_feasign_ = feasign;
-            instance->float_feasigns_.push_back(FeatureItem(f, idx));
-          }
-        } else if (all_slots_type_[i][0] == 'u') {  // uint64
-          for (int j = 0; j < num; ++j) {
-            uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
-            // if uint64 feasign is equal to zero, ignore it
-            // except when slot is dense
-            if (feasign == 0 && !use_slots_is_dense_[i]) {
-              continue;
-            }
-            FeatureKey f;
-            f.uint64_feasign_ = feasign;
-            instance->uint64_feasigns_.push_back(FeatureItem(f, idx));
-          }
-        }
-        pos = endptr - str;
-      } else {
-        for (int j = 0; j <= num; ++j) {
-          // pos = line.find_first_of(' ', pos + 1);
-          while (line[pos + 1] != ' ') {
-            pos++;
-          }
-        }
-      }
-    }
-    instance->float_feasigns_.shrink_to_fit();
-    instance->uint64_feasigns_.shrink_to_fit();
-    return true;
-  }
-#else
-  return false;
-#endif
-}
-
-bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
-#ifdef _LINUX
-  std::string line;
-  if (getline(file_, line)) {
-    VLOG(3) << line;
-    // parse line
-    const char* str = line.c_str();
-    char* endptr = const_cast<char*>(str);
-    int pos = 0;
-    for (size_t i = 0; i < use_slots_index_.size(); ++i) {
-      int idx = use_slots_index_[i];
-      int num = strtol(&str[pos], &endptr, 10);
-      PADDLE_ENFORCE(
-          num,
-          "The number of ids can not be zero, you need padding "
-          "it in data generator; or if there is something wrong with "
-          "the data, please check if the data contains unresolvable "
-          "characters.\nplease check this error line: %s",
-          str);
-
-      if (idx != -1) {
-        if (all_slots_type_[i][0] == 'f') {  // float
-          for (int j = 0; j < num; ++j) {
-            float feasign = strtof(endptr, &endptr);
-            if (fabs(feasign) < 1e-6) {
-              continue;
-            }
-            FeatureKey f;
-            f.float_feasign_ = feasign;
-            instance->float_feasigns_.push_back(FeatureItem(f, idx));
-          }
-        } else if (all_slots_type_[i][0] == 'u') {  // uint64
-          for (int j = 0; j < num; ++j) {
-            uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
-            if (feasign == 0) {
-              continue;
-            }
-            FeatureKey f;
-            f.uint64_feasign_ = feasign;
-            instance->uint64_feasigns_.push_back(FeatureItem(f, idx));
-          }
-        }
-        pos = endptr - str;
-      } else {
-        for (int j = 0; j <= num; ++j) {
-          pos = line.find_first_of(' ', pos + 1);
-        }
-      }
-    }
-    instance->float_feasigns_.shrink_to_fit();
-    instance->uint64_feasigns_.shrink_to_fit();
-    return true;
-  } else {
-    return false;
-  }
-#endif
-  return false;
-}
-
-void MultiSlotInMemoryDataFeed::PutToFeedVec(
-    const std::vector<Record>& ins_vec) {
-#ifdef _LINUX
-  std::vector<std::vector<float>> batch_float_feasigns(use_slots_.size(),
-                                                       std::vector<float>());
-  std::vector<std::vector<uint64_t>> batch_uint64_feasigns(
-      use_slots_.size(), std::vector<uint64_t>());
-  std::vector<std::vector<size_t>> offset(use_slots_.size(),
-                                          std::vector<size_t>{0});
-  std::vector<bool> visit(use_slots_.size(), false);
-  ins_content_vec_.clear();
-  ins_content_vec_.reserve(ins_vec.size());
-  ins_id_vec_.clear();
-  ins_id_vec_.reserve(ins_vec.size());
-  for (size_t i = 0; i < ins_vec.size(); ++i) {
-    auto& r = ins_vec[i];
-    ins_id_vec_.push_back(r.ins_id_);
-    ins_content_vec_.push_back(r.content_);
-    for (auto& item : r.float_feasigns_) {
-      batch_float_feasigns[item.slot()].push_back(item.sign().float_feasign_);
-      visit[item.slot()] = true;
-    }
-    for (auto& item : r.uint64_feasigns_) {
-      batch_uint64_feasigns[item.slot()].push_back(item.sign().uint64_feasign_);
-      visit[item.slot()] = true;
-    }
-    for (size_t j = 0; j < use_slots_.size(); ++j) {
-      const auto& type = all_slots_type_[j];
-      if (visit[j]) {
-        visit[j] = false;
-      } else {
-        // fill slot value with default value 0
-        if (type[0] == 'f') {  // float
-          batch_float_feasigns[j].push_back(0.0);
-        } else if (type[0] == 'u') {  // uint64
-          batch_uint64_feasigns[j].push_back(0);
-        }
-      }
-      // get offset of this ins in this slot
-      if (type[0] == 'f') {  // float
-        offset[j].push_back(batch_float_feasigns[j].size());
-      } else if (type[0] == 'u') {  // uint64
-        offset[j].push_back(batch_uint64_feasigns[j].size());
-      }
-    }
-  }
-
-  for (size_t i = 0; i < use_slots_.size(); ++i) {
-    if (feed_vec_[i] == nullptr) {
-      continue;
-    }
-    int total_instance = offset[i].back();
-    const auto& type = all_slots_type_[i];
-    if (type[0] == 'f') {  // float
-      float* feasign = batch_float_feasigns[i].data();
-      float* tensor_ptr =
-          feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float));
-    } else if (type[0] == 'u') {  // uint64
-      // no uint64_t type in paddlepaddle
-      uint64_t* feasign = batch_uint64_feasigns[i].data();
-      int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
-          {total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t));
-    }
-    auto& slot_offset = offset[i];
-    LoD data_lod{slot_offset};
-    feed_vec_[i]->set_lod(data_lod);
-    if (use_slots_is_dense_[i]) {
-      if (inductive_shape_index_[i] != -1) {
-        use_slots_shape_[i][inductive_shape_index_[i]] =
-            total_instance / total_dims_without_inductive_[i];
-      }
-      feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
-    }
-  }
-#endif
-}
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-template <typename T>
-void PrivateInstantDataFeed<T>::PutToFeedVec() {
-  for (size_t i = 0; i < use_slots_.size(); ++i) {
-    const auto& type = ins_vec_[i].GetType();
-    const auto& offset = ins_vec_[i].GetOffset();
-    int total_instance = static_cast<int>(offset.back());
-
-    if (type[0] == 'f') {  // float
-      const auto& feasign = ins_vec_[i].GetFloatData();
-      float* tensor_ptr =
-          feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
-    } else if (type[0] == 'u') {  // uint64
-      // no uint64_t type in paddlepaddle
-      const auto& feasign = ins_vec_[i].GetUint64Data();
-      int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
-          {total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, &feasign[0],
-                       total_instance * sizeof(int64_t));
-    }
-
-    LoD data_lod{offset};
-    feed_vec_[i]->set_lod(data_lod);
-    if (use_slots_is_dense_[i]) {
-      int64_t total_dims = 1;
-      for (const auto e : use_slots_shape_[i]) {
-        total_dims *= e;
-      }
-      PADDLE_ENFORCE(
-          total_dims == total_instance,
-          "The actual data size of slot[%s] doesn't match its declaration",
-          use_slots_[i].c_str());
-      feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
-    }
-  }
-}
-
-template <typename T>
-int PrivateInstantDataFeed<T>::Next() {
-  if (ParseOneMiniBatch()) {
-    PutToFeedVec();
-    return ins_vec_[0].GetBatchSize();
-  }
-  Postprocess();
-
-  std::string filename;
-  if (!PickOneFile(&filename)) {
-    return -1;
-  }
-  if (!Preprocess(filename)) {
-    return -1;
-  }
-
-  PADDLE_ENFORCE(true == ParseOneMiniBatch(), "Fail to parse mini-batch data");
-  PutToFeedVec();
-  return ins_vec_[0].GetBatchSize();
-}
-
-template <typename T>
-void PrivateInstantDataFeed<T>::Init(const DataFeedDesc& data_feed_desc) {
-  finish_init_ = false;
-  finish_set_filelist_ = false;
-  finish_start_ = false;
-
-  PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(),
-                 "Multi_slot_desc has not been set.");
-  paddle::framework::MultiSlotDesc multi_slot_desc =
-      data_feed_desc.multi_slot_desc();
-  SetBatchSize(data_feed_desc.batch_size());
-  size_t all_slot_num = multi_slot_desc.slots_size();
-  all_slots_.resize(all_slot_num);
-  all_slots_type_.resize(all_slot_num);
-  use_slots_index_.resize(all_slot_num);
-  multi_inductive_shape_index_.resize(all_slot_num);
-  use_slots_.clear();
-  use_slots_is_dense_.clear();
-  for (size_t i = 0; i < all_slot_num; ++i) {
-    const auto& slot = multi_slot_desc.slots(i);
-    all_slots_[i] = slot.name();
-    all_slots_type_[i] = slot.type();
-    use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
-    if (slot.is_used()) {
-      use_slots_.push_back(all_slots_[i]);
-      use_slots_is_dense_.push_back(slot.is_dense());
-      std::vector<int> local_shape;
-      if (slot.is_dense()) {
-        for (size_t j = 0; j < slot.shape_size(); ++j) {
-          if (slot.shape(j) == -1) {
-            multi_inductive_shape_index_[i].push_back(j);
-          }
-        }
-      }
-      for (size_t j = 0; j < slot.shape_size(); ++j) {
-        local_shape.push_back(slot.shape(j));
-      }
-      use_slots_shape_.push_back(local_shape);
-    }
-  }
-  feed_vec_.resize(use_slots_.size());
-  ins_vec_.resize(use_slots_.size());
-
-  finish_init_ = true;
-}
-
-template class PrivateInstantDataFeed<std::vector<MultiSlotType>>;
-
-bool MultiSlotFileInstantDataFeed::Preprocess(const std::string& filename) {
-  fd_ = open(filename.c_str(), O_RDONLY);
-  PADDLE_ENFORCE(fd_ != -1, "Fail to open file: %s", filename.c_str());
-
-  struct stat sb;
-  fstat(fd_, &sb);
-  end_ = static_cast<size_t>(sb.st_size);
-
-  buffer_ =
-      reinterpret_cast<char*>(mmap(NULL, end_, PROT_READ, MAP_PRIVATE, fd_, 0));
-  PADDLE_ENFORCE(buffer_ != MAP_FAILED, strerror(errno));
-
-  offset_ = 0;
-  return true;
-}
-
-bool MultiSlotFileInstantDataFeed::Postprocess() {
-  if (buffer_ != nullptr) {
-    munmap(buffer_, end_);
-    buffer_ = nullptr;
-  }
-  if (fd_ != -1) {
-    close(fd_);
-    fd_ = -1;
-    end_ = 0;
-    offset_ = 0;
-  }
-  return true;
-}
-
-bool MultiSlotFileInstantDataFeed::ParseOneMiniBatch() {
-  if (offset_ == end_) {
-    return false;
-  }
-
-  batch_size_ = 0;
-  while (batch_size_ < default_batch_size_ && offset_ < end_) {
-    for (size_t i = 0; i < use_slots_index_.size(); ++i) {
-      int idx = use_slots_index_[i];
-      char type = all_slots_type_[i][0];
-
-      uint16_t num = *reinterpret_cast<uint16_t*>(buffer_ + offset_);
-      PADDLE_ENFORCE(
-          num,
-          "The number of ids can not be zero, you need padding "
-          "it in data generator; or if there is something wrong with "
-          "the data, please check if the data contains unresolvable "
-          "characters.");
-      offset_ += sizeof(uint16_t);
-
-      if (idx != -1) {
-        int inductive_size = multi_inductive_shape_index_[i].size();
-        if (UNLIKELY(batch_size_ == 0)) {
-          ins_vec_[idx].Init(all_slots_type_[i], default_batch_size_ * num);
-          ins_vec_[idx].InitOffset(default_batch_size_);
-          uint64_t* inductive_shape =
-              reinterpret_cast<uint64_t*>(buffer_ + offset_);
-          for (int inductive_id = 0; inductive_id < inductive_size;
-               ++inductive_id) {
-            use_slots_shape_[i][multi_inductive_shape_index_[i][inductive_id]] =
-                static_cast<int>(*(inductive_shape + inductive_id));
-          }
-        }
-        num -= inductive_size;
-        offset_ += sizeof(uint64_t) * inductive_size;
-
-        if (type == 'f') {
-          ins_vec_[idx].AppendValues(
-              reinterpret_cast<float*>(buffer_ + offset_), num);
-          offset_ += num * sizeof(float);
-        } else if (type == 'u') {
-          ins_vec_[idx].AppendValues(
-              reinterpret_cast<uint64_t*>(buffer_ + offset_), num);
-          offset_ += num * sizeof(uint64_t);
-        }
-      } else {
-        if (type == 'f') {
-          offset_ += num * sizeof(float);
-        } else if (type == 'u') {
-          offset_ += num * sizeof(uint64_t);
-        }
-      }
-    }
-    ++batch_size_;
-    // OPTIMIZE: It is better to insert check codes between instances for format
-    // checking
-  }
-
-  PADDLE_ENFORCE(batch_size_ == default_batch_size_ || offset_ == end_,
-                 "offset_ != end_");
-  return true;
-}
-#endif
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
deleted file mode 100644
index 9ea9be41999145f69a600598e42ee5cce2d64afa..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_feed.h
+++ /dev/null
@@ -1,612 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
-
-#include <fstream>
-#include <future>  // NOLINT
-#include <memory>
-#include <mutex>  // NOLINT
-#include <sstream>
-#include <string>
-#include <thread>  // NOLINT
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/archive.h"
-#include "paddle/fluid/framework/blocking_queue.h"
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/data_feed.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/string/string_helper.h"
-
-namespace paddle {
-namespace framework {
-
-// DataFeed is the base virtual class for all ohther DataFeeds.
-// It is used to read files and parse the data for subsequent trainer.
-// Example:
-//   DataFeed* reader =
-//   paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name);
-//   reader->Init(data_feed_desc); // data_feed_desc is a protobuf object
-//   reader->SetFileList(filelist);
-//   const std::vector<std::string> & use_slot_alias =
-//   reader->GetUseSlotAlias();
-//   for (auto name: use_slot_alias){ // for binding memory
-//     reader->AddFeedVar(scope->Var(name), name);
-//   }
-//   reader->Start();
-//   while (reader->Next()) {
-//      // trainer do something
-//   }
-class DataFeed {
- public:
-  DataFeed() {
-    mutex_for_pick_file_ = nullptr;
-    file_idx_ = nullptr;
-  }
-  virtual ~DataFeed() {}
-  virtual void Init(const DataFeedDesc& data_feed_desc) = 0;
-  virtual bool CheckFile(const char* filename) {
-    PADDLE_THROW("This function(CheckFile) is not implemented.");
-  }
-  // Set filelist for DataFeed.
-  // Pay attention that it must init all readers before call this function.
-  // Otherwise, Init() function will init finish_set_filelist_ flag.
-  virtual bool SetFileList(const std::vector<std::string>& files);
-  virtual bool Start() = 0;
-
-  // The trainer calls the Next() function, and the DataFeed will load a new
-  // batch to the feed_vec. The return value of this function is the batch
-  // size of the current batch.
-  virtual int Next() = 0;
-  // Get all slots' alias which defined in protofile
-  virtual const std::vector<std::string>& GetAllSlotAlias() {
-    return all_slots_;
-  }
-  // Get used slots' alias which defined in protofile
-  virtual const std::vector<std::string>& GetUseSlotAlias() {
-    return use_slots_;
-  }
-  // This function is used for binding feed_vec memory
-  virtual void AddFeedVar(Variable* var, const std::string& name);
-
-  // This function is used for binding feed_vec memory in a given scope
-  virtual void AssignFeedVar(const Scope& scope);
-
-  // This function will do nothing at default
-  virtual void SetInputChannel(void* channel) {}
-  // This function will do nothing at default
-  virtual void SetOutputChannel(void* channel) {}
-  // This function will do nothing at default
-  virtual void SetConsumeChannel(void* channel) {}
-  // This function will do nothing at default
-  virtual void SetThreadId(int thread_id) {}
-  // This function will do nothing at default
-  virtual void SetThreadNum(int thread_num) {}
-  // This function will do nothing at default
-  virtual void SetParseInsId(bool parse_ins_id) {}
-  virtual void SetParseContent(bool parse_content) {}
-  virtual void SetFileListMutex(std::mutex* mutex) {
-    mutex_for_pick_file_ = mutex;
-  }
-  virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; }
-  virtual const std::vector<std::string>& GetInsIdVec() const {
-    return ins_id_vec_;
-  }
-  virtual const std::vector<std::string>& GetInsContentVec() const {
-    return ins_content_vec_;
-  }
-  virtual int GetCurBatchSize() { return batch_size_; }
-  virtual void LoadIntoMemory() {
-    PADDLE_THROW("This function(LoadIntoMemory) is not implemented.");
-  }
-  virtual void SetPlace(const paddle::platform::Place& place) {
-    place_ = place;
-  }
-  virtual const paddle::platform::Place& GetPlace() const { return place_; }
-
- protected:
-  // The following three functions are used to check if it is executed in this
-  // order:
-  //   Init() -> SetFileList() -> Start() -> Next()
-  virtual void CheckInit();
-  virtual void CheckSetFileList();
-  virtual void CheckStart();
-  virtual void SetBatchSize(
-      int batch);  // batch size will be set in Init() function
-  // This function is used to pick one file from the global filelist(thread
-  // safe).
-  virtual bool PickOneFile(std::string* filename);
-  virtual void CopyToFeedTensor(void* dst, const void* src, size_t size);
-
-  std::vector<std::string> filelist_;
-  size_t* file_idx_;
-  std::mutex* mutex_for_pick_file_;
-
-  // the alias of used slots, and its order is determined by
-  // data_feed_desc(proto object)
-  std::vector<std::string> use_slots_;
-  std::vector<bool> use_slots_is_dense_;
-
-  // the alias of all slots, and its order is determined by data_feed_desc(proto
-  // object)
-  std::vector<std::string> all_slots_;
-  std::vector<std::string> all_slots_type_;
-  std::vector<std::vector<int>> use_slots_shape_;
-  std::vector<int> inductive_shape_index_;
-  std::vector<int> total_dims_without_inductive_;
-  // For the inductive shape passed within data
-  std::vector<std::vector<int>> multi_inductive_shape_index_;
-  std::vector<int>
-      use_slots_index_;  // -1: not used; >=0: the index of use_slots_
-
-  // The data read by DataFeed will be stored here
-  std::vector<LoDTensor*> feed_vec_;
-
-  // the batch size defined by user
-  int default_batch_size_;
-  // current batch size
-  int batch_size_;
-
-  bool finish_init_;
-  bool finish_set_filelist_;
-  bool finish_start_;
-  std::string pipe_command_;
-  std::vector<std::string> ins_id_vec_;
-  std::vector<std::string> ins_content_vec_;
-  platform::Place place_;
-};
-
-// PrivateQueueDataFeed is the base virtual class for ohther DataFeeds.
-// It use a read-thread to read file and parse data to a private-queue
-// (thread level), and get data from this queue when trainer call Next().
-template <typename T>
-class PrivateQueueDataFeed : public DataFeed {
- public:
-  PrivateQueueDataFeed() {}
-  virtual ~PrivateQueueDataFeed() {}
-  virtual bool Start();
-  virtual int Next();
-
- protected:
-  // The thread implementation function for reading file and parse.
-  virtual void ReadThread();
-  // This function is used to set private-queue size, and the most
-  // efficient when the queue size is close to the batch size.
-  virtual void SetQueueSize(int queue_size);
-  // The reading and parsing method called in the ReadThread.
-  virtual bool ParseOneInstance(T* instance) = 0;
-  virtual bool ParseOneInstanceFromPipe(T* instance) = 0;
-  // This function is used to put instance to vec_ins
-  virtual void AddInstanceToInsVec(T* vec_ins, const T& instance,
-                                   int index) = 0;
-  // This function is used to put ins_vec to feed_vec
-  virtual void PutToFeedVec(const T& ins_vec) = 0;
-
-  // The thread for read files
-  std::thread read_thread_;
-  // using ifstream one line and one line parse is faster
-  // than using fread one buffer and one buffer parse.
-  //   for a 601M real data:
-  //     ifstream one line and one line parse: 6034 ms
-  //     fread one buffer and one buffer parse: 7097 ms
-  std::ifstream file_;
-  std::shared_ptr<FILE> fp_;
-  size_t queue_size_;
-  string::LineFileReader reader_;
-  // The queue for store parsed data
-  std::shared_ptr<paddle::framework::ChannelObject<T>> queue_;
-};
-
-template <typename T>
-class InMemoryDataFeed : public DataFeed {
- public:
-  InMemoryDataFeed();
-  virtual ~InMemoryDataFeed() {}
-  virtual void Init(const DataFeedDesc& data_feed_desc) = 0;
-  virtual bool Start();
-  virtual int Next();
-  virtual void SetInputChannel(void* channel);
-  virtual void SetOutputChannel(void* channel);
-  virtual void SetConsumeChannel(void* channel);
-  virtual void SetThreadId(int thread_id);
-  virtual void SetThreadNum(int thread_num);
-  virtual void SetParseInsId(bool parse_ins_id);
-  virtual void SetParseContent(bool parse_content);
-  virtual void LoadIntoMemory();
-
- protected:
-  virtual bool ParseOneInstance(T* instance) = 0;
-  virtual bool ParseOneInstanceFromPipe(T* instance) = 0;
-  virtual void PutToFeedVec(const std::vector<T>& ins_vec) = 0;
-
-  int thread_id_;
-  int thread_num_;
-  bool parse_ins_id_;
-  bool parse_content_;
-  std::ifstream file_;
-  std::shared_ptr<FILE> fp_;
-  paddle::framework::ChannelObject<T>* input_channel_;
-  paddle::framework::ChannelObject<T>* output_channel_;
-  paddle::framework::ChannelObject<T>* consume_channel_;
-};
-
-// This class define the data type of instance(ins_vec) in MultiSlotDataFeed
-class MultiSlotType {
- public:
-  MultiSlotType() {}
-  ~MultiSlotType() {}
-  void Init(const std::string& type, size_t reserved_size = 0) {
-    CheckType(type);
-    if (type_[0] == 'f') {
-      float_feasign_.clear();
-      if (reserved_size) {
-        float_feasign_.reserve(reserved_size);
-      }
-    } else if (type_[0] == 'u') {
-      uint64_feasign_.clear();
-      if (reserved_size) {
-        uint64_feasign_.reserve(reserved_size);
-      }
-    }
-    type_ = type;
-  }
-  void InitOffset(size_t max_batch_size = 0) {
-    if (max_batch_size > 0) {
-      offset_.reserve(max_batch_size + 1);
-    }
-    offset_.resize(1);
-    // LoDTensor' lod is counted from 0, the size of lod
-    // is one size larger than the size of data.
-    offset_[0] = 0;
-  }
-  const std::vector<size_t>& GetOffset() const { return offset_; }
-  std::vector<size_t>& MutableOffset() { return offset_; }
-  void AddValue(const float v) {
-    CheckFloat();
-    float_feasign_.push_back(v);
-  }
-  void AddValue(const uint64_t v) {
-    CheckUint64();
-    uint64_feasign_.push_back(v);
-  }
-  void CopyValues(const float* input, size_t size) {
-    CheckFloat();
-    float_feasign_.resize(size);
-    memcpy(float_feasign_.data(), input, size * sizeof(float));
-  }
-  void CopyValues(const uint64_t* input, size_t size) {
-    CheckUint64();
-    uint64_feasign_.resize(size);
-    memcpy(uint64_feasign_.data(), input, size * sizeof(uint64_t));
-  }
-  void AddIns(const MultiSlotType& ins) {
-    if (ins.GetType()[0] == 'f') {  // float
-      CheckFloat();
-      auto& vec = ins.GetFloatData();
-      offset_.push_back(offset_.back() + vec.size());
-      float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end());
-    } else if (ins.GetType()[0] == 'u') {  // uint64
-      CheckUint64();
-      auto& vec = ins.GetUint64Data();
-      offset_.push_back(offset_.back() + vec.size());
-      uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end());
-    }
-  }
-  void AppendValues(const uint64_t* input, size_t size) {
-    CheckUint64();
-    offset_.push_back(offset_.back() + size);
-    uint64_feasign_.insert(uint64_feasign_.end(), input, input + size);
-  }
-  void AppendValues(const float* input, size_t size) {
-    CheckFloat();
-    offset_.push_back(offset_.back() + size);
-    float_feasign_.insert(float_feasign_.end(), input, input + size);
-  }
-  const std::vector<float>& GetFloatData() const { return float_feasign_; }
-  std::vector<float>& MutableFloatData() { return float_feasign_; }
-  const std::vector<uint64_t>& GetUint64Data() const { return uint64_feasign_; }
-  std::vector<uint64_t>& MutableUint64Data() { return uint64_feasign_; }
-  const std::string& GetType() const { return type_; }
-  size_t GetBatchSize() { return offset_.size() - 1; }
-  std::string& MutableType() { return type_; }
-
-  std::string DebugString() {
-    std::stringstream ss;
-    ss << "\ntype: " << type_ << "\n";
-    ss << "offset: ";
-    ss << "[";
-    for (const size_t& i : offset_) {
-      ss << offset_[i] << ",";
-    }
-    ss << "]\ndata: [";
-    if (type_[0] == 'f') {
-      for (const float& i : float_feasign_) {
-        ss << i << ",";
-      }
-    } else {
-      for (const uint64_t& i : uint64_feasign_) {
-        ss << i << ",";
-      }
-    }
-    ss << "]\n";
-    return ss.str();
-  }
-
- private:
-  void CheckType(const std::string& type) const {
-    PADDLE_ENFORCE((type == "uint64") || (type == "float"),
-                   "There is no this type<%s>.", type);
-  }
-  void CheckFloat() const {
-    PADDLE_ENFORCE(type_[0] == 'f', "Add %s value to float slot.", type_);
-  }
-  void CheckUint64() const {
-    PADDLE_ENFORCE(type_[0] == 'u', "Add %s value to uint64 slot.", type_);
-  }
-  std::vector<float> float_feasign_;
-  std::vector<uint64_t> uint64_feasign_;
-  std::string type_;
-  std::vector<size_t> offset_;
-};
-
-template <class AR>
-paddle::framework::Archive<AR>& operator<<(paddle::framework::Archive<AR>& ar,
-                                           const MultiSlotType& ins) {
-  ar << ins.GetType();
-#ifdef _LINUX
-  ar << ins.GetOffset();
-#else
-  const auto& offset = ins.GetOffset();
-  ar << (uint64_t)offset.size();
-  for (const size_t& x : offset) {
-    ar << (const uint64_t)x;
-  }
-#endif
-  ar << ins.GetFloatData();
-  ar << ins.GetUint64Data();
-  return ar;
-}
-
-template <class AR>
-paddle::framework::Archive<AR>& operator>>(paddle::framework::Archive<AR>& ar,
-                                           MultiSlotType& ins) {
-  ar >> ins.MutableType();
-#ifdef _LINUX
-  ar >> ins.MutableOffset();
-#else
-  auto& offset = ins.MutableOffset();
-  offset.resize(ar.template Get<uint64_t>());
-  for (size_t& x : offset) {
-    uint64_t t;
-    ar >> t;
-    x = (size_t)t;
-  }
-#endif
-  ar >> ins.MutableFloatData();
-  ar >> ins.MutableUint64Data();
-  return ar;
-}
-
-union FeatureKey {
-  uint64_t uint64_feasign_;
-  float float_feasign_;
-};
-
-struct FeatureItem {
-  FeatureItem() {}
-  FeatureItem(FeatureKey sign, uint16_t slot) {
-    this->sign() = sign;
-    this->slot() = slot;
-  }
-  FeatureKey& sign() { return *(reinterpret_cast<FeatureKey*>(sign_buffer())); }
-  const FeatureKey& sign() const {
-    const FeatureKey* ret = reinterpret_cast<FeatureKey*>(sign_buffer());
-    return *ret;
-  }
-  uint16_t& slot() { return slot_; }
-  const uint16_t& slot() const { return slot_; }
-
- private:
-  char* sign_buffer() const { return const_cast<char*>(sign_); }
-  char sign_[sizeof(FeatureKey)];
-  uint16_t slot_;
-};
-
-// sizeof Record is much less than std::vector<MultiSlotType>
-struct Record {
-  std::vector<FeatureItem> uint64_feasigns_;
-  std::vector<FeatureItem> float_feasigns_;
-  std::string ins_id_;
-  std::string content_;
-};
-
-struct RecordCandidate {
-  std::string ins_id_;
-  std::unordered_multimap<uint16_t, FeatureKey> feas;
-
-  RecordCandidate& operator=(const Record& rec) {
-    feas.clear();
-    ins_id_ = rec.ins_id_;
-    for (auto& fea : rec.uint64_feasigns_) {
-      feas.insert({fea.slot(), fea.sign()});
-    }
-    return *this;
-  }
-};
-
-class RecordCandidateList {
- public:
-  RecordCandidateList() = default;
-  RecordCandidateList(const RecordCandidateList&) = delete;
-  RecordCandidateList& operator=(const RecordCandidateList&) = delete;
-
-  void ReSize(size_t length);
-
-  void ReInit();
-
-  void AddAndGet(const Record& record, RecordCandidate* result);
-
- private:
-  size_t _capacity = 0;
-  std::mutex _mutex;
-  bool _full = false;
-  size_t _cur_size = 0;
-  size_t _total_size = 0;
-  std::vector<RecordCandidate> _candidate_list;
-};
-
-template <class AR>
-paddle::framework::Archive<AR>& operator<<(paddle::framework::Archive<AR>& ar,
-                                           const FeatureKey& fk) {
-  ar << fk.uint64_feasign_;
-  ar << fk.float_feasign_;
-  return ar;
-}
-
-template <class AR>
-paddle::framework::Archive<AR>& operator>>(paddle::framework::Archive<AR>& ar,
-                                           FeatureKey& fk) {
-  ar >> fk.uint64_feasign_;
-  ar >> fk.float_feasign_;
-  return ar;
-}
-
-template <class AR>
-paddle::framework::Archive<AR>& operator<<(paddle::framework::Archive<AR>& ar,
-                                           const FeatureItem& fi) {
-  ar << fi.sign();
-  ar << fi.slot();
-  return ar;
-}
-
-template <class AR>
-paddle::framework::Archive<AR>& operator>>(paddle::framework::Archive<AR>& ar,
-                                           FeatureItem& fi) {
-  ar >> fi.sign();
-  ar >> fi.slot();
-  return ar;
-}
-
-template <class AR>
-paddle::framework::Archive<AR>& operator<<(paddle::framework::Archive<AR>& ar,
-                                           const Record& r) {
-  ar << r.uint64_feasigns_;
-  ar << r.float_feasigns_;
-  ar << r.ins_id_;
-  return ar;
-}
-
-template <class AR>
-paddle::framework::Archive<AR>& operator>>(paddle::framework::Archive<AR>& ar,
-                                           Record& r) {
-  ar >> r.uint64_feasigns_;
-  ar >> r.float_feasigns_;
-  ar >> r.ins_id_;
-  return ar;
-}
-
-// This DataFeed is used to feed multi-slot type data.
-// The format of multi-slot type data:
-//   [n feasign_0 feasign_1 ... feasign_n]*
-class MultiSlotDataFeed
-    : public PrivateQueueDataFeed<std::vector<MultiSlotType>> {
- public:
-  MultiSlotDataFeed() {}
-  virtual ~MultiSlotDataFeed() {}
-  virtual void Init(const DataFeedDesc& data_feed_desc);
-  virtual bool CheckFile(const char* filename);
-
- protected:
-  virtual void ReadThread();
-  virtual void AddInstanceToInsVec(std::vector<MultiSlotType>* vec_ins,
-                                   const std::vector<MultiSlotType>& instance,
-                                   int index);
-  virtual bool ParseOneInstance(std::vector<MultiSlotType>* instance);
-  virtual bool ParseOneInstanceFromPipe(std::vector<MultiSlotType>* instance);
-  virtual void PutToFeedVec(const std::vector<MultiSlotType>& ins_vec);
-};
-
-class MultiSlotInMemoryDataFeed : public InMemoryDataFeed<Record> {
- public:
-  MultiSlotInMemoryDataFeed() {}
-  virtual ~MultiSlotInMemoryDataFeed() {}
-  virtual void Init(const DataFeedDesc& data_feed_desc);
-
- protected:
-  virtual bool ParseOneInstance(Record* instance);
-  virtual bool ParseOneInstanceFromPipe(Record* instance);
-  virtual void PutToFeedVec(const std::vector<Record>& ins_vec);
-};
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-template <typename T>
-class PrivateInstantDataFeed : public DataFeed {
- public:
-  PrivateInstantDataFeed() {}
-  virtual ~PrivateInstantDataFeed() {}
-  void Init(const DataFeedDesc& data_feed_desc) override;
-  bool Start() override { return true; }
-  int Next() override;
-
- protected:
-  // The batched data buffer
-  std::vector<MultiSlotType> ins_vec_;
-
-  // This function is used to preprocess with a given filename, e.g. open it or
-  // mmap
-  virtual bool Preprocess(const std::string& filename) = 0;
-
-  // This function is used to postprocess system resource such as closing file
-  // NOTICE: Ensure that it is safe to call before Preprocess
-  virtual bool Postprocess() = 0;
-
-  // The reading and parsing method.
-  virtual bool ParseOneMiniBatch() = 0;
-
-  // This function is used to put ins_vec to feed_vec
-  virtual void PutToFeedVec();
-};
-
-class MultiSlotFileInstantDataFeed
-    : public PrivateInstantDataFeed<std::vector<MultiSlotType>> {
- public:
-  MultiSlotFileInstantDataFeed() {}
-  virtual ~MultiSlotFileInstantDataFeed() {}
-
- protected:
-  int fd_{-1};
-  char* buffer_{nullptr};
-  size_t end_{0};
-  size_t offset_{0};
-
-  bool Preprocess(const std::string& filename) override;
-
-  bool Postprocess() override;
-
-  bool ParseOneMiniBatch() override;
-};
-#endif
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto
deleted file mode 100644
index 03996e0e20a1729ee300a5ad37abc325876930b7..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_feed.proto
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-package paddle.framework;
-
-message Slot {
-  required string name = 1;
-  required string type = 2;
-  optional bool is_dense = 3 [ default = false ];
-  optional bool is_used = 4 [ default = false ];
-  repeated int32 shape = 5; // we can define N-D Tensor
-}
-
-message MultiSlotDesc { repeated Slot slots = 1; }
-
-message DataFeedDesc {
-  optional string name = 1;
-  optional int32 batch_size = 2 [ default = 32 ];
-  optional MultiSlotDesc multi_slot_desc = 3;
-  optional string pipe_command = 4;
-  optional int32 thread_num = 5;
-}
diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc
deleted file mode 100644
index ec1acad99bc9b5e96fbe2433ba2bb9a62fb36966..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_feed_factory.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "paddle/fluid/framework/data_feed.h"
-
-namespace paddle {
-namespace framework {
-typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
-typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
-data_feedMap g_data_feed_map;
-
-#define REGISTER_DATAFEED_CLASS(data_feed_class)                      \
-  namespace {                                                         \
-  std::shared_ptr<DataFeed> Creator_##data_feed_class() {             \
-    return std::shared_ptr<DataFeed>(new data_feed_class);            \
-  }                                                                   \
-  class __Registerer_##data_feed_class {                              \
-   public:                                                            \
-    __Registerer_##data_feed_class() {                                \
-      g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \
-    }                                                                 \
-  };                                                                  \
-  __Registerer_##data_feed_class g_registerer_##data_feed_class;      \
-  }  // namespace
-
-std::string DataFeedFactory::DataFeedTypeList() {
-  std::string data_feed_types;
-  for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end();
-       ++iter) {
-    if (iter != g_data_feed_map.begin()) {
-      data_feed_types += ", ";
-    }
-    data_feed_types += iter->first;
-  }
-  return data_feed_types;
-}
-
-std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
-    std::string data_feed_class) {
-  if (g_data_feed_map.count(data_feed_class) < 1) {
-    LOG(WARNING) << "Your DataFeed " << data_feed_class
-                 << "is not supported currently";
-    LOG(WARNING) << "Supported DataFeed: " << DataFeedTypeList();
-    exit(-1);
-  }
-  return g_data_feed_map[data_feed_class]();
-}
-
-REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
-REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed);
-#endif
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed_factory.h b/paddle/fluid/framework/data_feed_factory.h
deleted file mode 100644
index 13678edb0b8d084a0b3016d93f6e1bc32ce0169a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_feed_factory.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/data_feed.h"
-
-namespace paddle {
-namespace framework {
-class DataFeedFactory {
- public:
-  static std::string DataFeedTypeList();
-  static std::shared_ptr<DataFeed> CreateDataFeed(std::string data_feed_class);
-};
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc
deleted file mode 100644
index e1d6246862155509569b25b1fd552c04dcf455df..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_feed_test.cc
+++ /dev/null
@@ -1,330 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/data_feed.h"
-#include <fcntl.h>
-#include <chrono>  // NOLINT
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <mutex>  // NOLINT
-#include <set>
-#include <thread>  // NOLINT
-#include <utility>
-#include <vector>
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/text_format.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-
-paddle::framework::DataFeedDesc load_datafeed_param_from_file(
-    const char* filename) {
-  paddle::framework::DataFeedDesc data_feed_desc;
-  int file_descriptor = open(filename, O_RDONLY);
-  PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename);
-  google::protobuf::io::FileInputStream fileInput(file_descriptor);
-  google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc);
-  close(file_descriptor);
-  return data_feed_desc;
-}
-
-const std::vector<std::string> load_filelist_from_file(const char* filename) {
-  std::vector<std::string> filelist;
-  std::ifstream fin(filename);
-  PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename);
-  std::string line;
-  while (getline(fin, line)) {
-    filelist.push_back(line);
-  }
-  fin.close();
-  return filelist;
-}
-
-void GenerateFileForTest(const char* protofile, const char* filelist) {
-  std::ofstream w_protofile(protofile);
-  w_protofile << "name: \"MultiSlotDataFeed\"\n"
-                 "batch_size: 2\n"
-                 "multi_slot_desc {\n"
-                 "    slots {\n"
-                 "        name: \"uint64_sparse_slot\"\n"
-                 "        type: \"uint64\"\n"
-                 "        is_dense: false\n"
-                 "        is_used: true\n"
-                 "    }\n"
-                 "    slots {\n"
-                 "        name: \"float_sparse_slot\"\n"
-                 "        type: \"float\"\n"
-                 "        is_dense: false\n"
-                 "        is_used: true\n"
-                 "    }\n"
-                 "    slots {\n"
-                 "        name: \"uint64_dense_slot\"\n"
-                 "        type: \"uint64\"\n"
-                 "        is_dense: true\n"
-                 "        is_used: true\n"
-                 "    }\n"
-                 "    slots {\n"
-                 "        name: \"float_dense_slot\"\n"
-                 "        type: \"float\"\n"
-                 "        is_dense: true\n"
-                 "        is_used: true\n"
-                 "    }\n"
-                 "    slots {\n"
-                 "        name: \"not_used_slot\"\n"
-                 "        type: \"uint64\"\n"
-                 "        is_dense: false\n"
-                 "        is_used: false\n"
-                 "    }\n"
-                 "}";
-  w_protofile.close();
-  std::ofstream w_filelist(filelist);
-  int total_file = 4;
-  for (int i = 0; i < total_file; ++i) {
-    std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i);
-    w_filelist << filename;
-    if (i + 1 != total_file) {
-      w_filelist << std::endl;
-    }
-    std::ofstream w_datafile(filename.c_str());
-    w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n"
-                  "2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n"
-                  "1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n";
-    w_datafile.close();
-  }
-  w_filelist.close();
-}
-
-class MultiTypeSet {
- public:
-  MultiTypeSet() {
-    uint64_set_.clear();
-    float_set_.clear();
-  }
-  ~MultiTypeSet() {}
-  void AddValue(uint64_t v) { uint64_set_.insert(v); }
-  void AddValue(float v) { float_set_.insert(v); }
-  const std::set<uint64_t>& GetUint64Set() const { return uint64_set_; }
-  const std::set<float>& GetFloatSet() const { return float_set_; }
-
- private:
-  std::set<uint64_t> uint64_set_;
-  std::set<float> float_set_;
-};
-
-void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
-                          const paddle::framework::DataFeedDesc& data_feed_desc,
-                          const std::vector<std::string>& filelist,
-                          const int thread_num) {
-  int used_slot_num = 0;
-  for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
-    if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
-      ++used_slot_num;
-    }
-  }
-  reader_elem_set->resize(used_slot_num);
-  std::vector<std::thread> threads;
-  std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers;
-  readers.resize(thread_num);
-  for (int i = 0; i < thread_num; ++i) {
-    readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed(
-        data_feed_desc.name());
-    readers[i]->Init(data_feed_desc);
-  }
-  readers[0]->SetFileList(filelist);
-  std::mutex mu;
-  for (int idx = 0; idx < thread_num; ++idx) {
-    threads.emplace_back(std::thread([&, idx] {
-      std::unique_ptr<paddle::framework::Scope> scope(
-          new paddle::framework::Scope());
-      const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
-      std::map<std::string, const paddle::framework::LoDTensor*>
-          lodtensor_targets;
-      for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
-        const auto& slot = multi_slot_desc.slots(i);
-        if (slot.is_used()) {
-          const auto& name = slot.name();
-          readers[idx]->AddFeedVar(scope->Var(name), name);
-          lodtensor_targets[name] =
-              &scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
-        }
-      }
-      readers[idx]->Start();
-      while (readers[idx]->Next()) {
-        int index = 0;
-        for (int k = 0; k < multi_slot_desc.slots_size(); ++k) {
-          const auto& slot = multi_slot_desc.slots(k);
-          if (!slot.is_used()) {
-            continue;
-          }
-          const paddle::framework::LoDTensor* tens =
-              lodtensor_targets[slot.name()];
-          if (slot.is_dense()) {  // dense branch
-            if (slot.type() == "uint64") {
-              const int64_t* data = tens->data<int64_t>();
-              int batch_size = tens->dims()[0];
-              int dim = tens->dims()[1];
-              for (int i = 0; i < batch_size; ++i) {
-                for (int j = 0; j < dim; ++j) {
-                  std::lock_guard<std::mutex> lock(mu);
-                  (*reader_elem_set)[index].AddValue(
-                      (uint64_t)data[i * dim + j]);
-                }
-              }
-            } else if (slot.type() == "float") {
-              const float* data = tens->data<float>();
-              int batch_size = tens->dims()[0];
-              int dim = tens->dims()[1];
-              for (int i = 0; i < batch_size; ++i) {
-                for (int j = 0; j < dim; ++j) {
-                  std::lock_guard<std::mutex> lock(mu);
-                  (*reader_elem_set)[index].AddValue(data[i * dim + j]);
-                }
-              }
-            } else {
-              PADDLE_THROW("Error type in proto file.");
-            }
-          } else {  // sparse branch
-            if (slot.type() == "uint64") {
-              const int64_t* data = tens->data<int64_t>();
-              for (size_t i = 0; i < tens->NumElements(); ++i) {
-                std::pair<size_t, size_t> element = tens->lod_element(0, i);
-                for (size_t j = element.first; j < element.second; ++j) {
-                  std::lock_guard<std::mutex> lock(mu);
-                  (*reader_elem_set)[index].AddValue((uint64_t)data[j]);
-                }
-              }
-            } else if (slot.type() == "float") {
-              const float* data = tens->data<float>();
-              for (size_t i = 0; i < tens->NumElements(); ++i) {
-                std::pair<size_t, size_t> element = tens->lod_element(0, i);
-                for (size_t j = element.first; j < element.second; ++j) {
-                  std::lock_guard<std::mutex> lock(mu);
-                  (*reader_elem_set)[index].AddValue(data[j]);
-                }
-              }
-            } else {
-              PADDLE_THROW("Error type in proto file.");
-            }
-          }  // end sparse branch
-          ++index;
-        }  // end slots loop
-      }    // end while Next()
-    }));   // end anonymous function
-  }
-  for (auto& th : threads) {
-    th.join();
-  }
-}
-
-void CheckIsUnorderedSame(const std::vector<MultiTypeSet>& s1,
-                          const std::vector<MultiTypeSet>& s2) {
-  EXPECT_EQ(s1.size(), s2.size());
-  for (size_t i = 0; i < s1.size(); ++i) {
-    // check for uint64
-    const std::set<uint64_t>& uint64_s1 = s1[i].GetUint64Set();
-    const std::set<uint64_t>& uint64_s2 = s2[i].GetUint64Set();
-    EXPECT_EQ(uint64_s1.size(), uint64_s2.size());
-    auto uint64_it1 = uint64_s1.begin();
-    auto uint64_it2 = uint64_s2.begin();
-    while (uint64_it1 != uint64_s1.end()) {
-      EXPECT_EQ(*uint64_it1, *uint64_it2);
-      ++uint64_it1;
-      ++uint64_it2;
-    }
-    // check for float
-    const std::set<float>& float_s1 = s1[i].GetFloatSet();
-    const std::set<float>& float_s2 = s2[i].GetFloatSet();
-    EXPECT_EQ(float_s1.size(), float_s2.size());
-    auto float_it1 = float_s1.begin();
-    auto float_it2 = float_s2.begin();
-    while (float_it1 != float_s1.end()) {
-      EXPECT_EQ(*float_it1, *float_it2);
-      ++float_it1;
-      ++float_it2;
-    }
-  }
-}
-
-void GetElemSetFromFile(std::vector<MultiTypeSet>* file_elem_set,
-                        const paddle::framework::DataFeedDesc& data_feed_desc,
-                        const std::vector<std::string>& filelist) {
-  int used_slot_num = 0;
-  for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
-    if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
-      ++used_slot_num;
-    }
-  }
-  file_elem_set->resize(used_slot_num);
-  for (const auto& file : filelist) {
-    std::ifstream fin(file.c_str());
-    PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str());
-    while (1) {
-      bool end_flag = false;
-      int index = 0;
-      for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
-        int num;
-        if (fin >> num) {
-          auto slot = data_feed_desc.multi_slot_desc().slots(i);
-          auto type = slot.type();
-          if (type == "uint64") {
-            while (num--) {
-              uint64_t feasign;
-              fin >> feasign;
-              if (slot.is_used()) {
-                (*file_elem_set)[index].AddValue(feasign);
-              }
-            }
-          } else if (type == "float") {
-            while (num--) {
-              float feasign;
-              fin >> feasign;
-              if (slot.is_used()) {
-                (*file_elem_set)[index].AddValue(feasign);
-              }
-            }
-          } else {
-            PADDLE_THROW("Error type in proto file.");
-          }
-          if (slot.is_used()) {
-            ++index;
-          }
-        } else {
-          end_flag = true;
-          break;
-        }
-      }
-      if (end_flag) {
-        break;
-      }
-    }
-    fin.close();
-  }
-}
-
-TEST(DataFeed, MultiSlotUnitTest) {
-  const char* protofile = "data_feed_desc.prototxt";
-  const char* filelist_name = "filelist.txt";
-  GenerateFileForTest(protofile, filelist_name);
-  const std::vector<std::string> filelist =
-      load_filelist_from_file(filelist_name);
-  paddle::framework::DataFeedDesc data_feed_desc =
-      load_datafeed_param_from_file(protofile);
-  std::vector<MultiTypeSet> reader_elem_set;
-  std::vector<MultiTypeSet> file_elem_set;
-  // GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4);
-  // GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist);
-  // CheckIsUnorderedSame(reader_elem_set, file_elem_set);
-}
diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h
deleted file mode 100644
index b611bb77b4e1ec05b8bd029ac37cefba346c6eb0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_layout.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cctype>
-#include <ostream>
-#include <string>
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-enum class DataLayout {
-  kNHWC = 0,
-  kNCHW = 1,
-  kAnyLayout = 2,
-  kMKLDNN = 3,  // all layouts supported by MKLDNN internally
-};
-
-inline DataLayout StringToDataLayout(const std::string& str) {
-  std::string s(str);
-  for (size_t i = 0; i < s.size(); ++i) {
-    s[i] = toupper(s[i]);
-  }
-
-  if (s == "NHWC") {
-    return DataLayout::kNHWC;
-  } else if (s == "NCHW") {
-    return DataLayout::kNCHW;
-  } else if (s == "ANYLAYOUT") {
-    return DataLayout::kAnyLayout;
-  } else if (s == "MKLDNNLAYOUT") {
-    return DataLayout::kMKLDNN;
-  } else {
-    PADDLE_THROW("Unknown storage order string: %s", s);
-  }
-}
-
-inline std::string DataLayoutToString(const DataLayout& data_layout) {
-  switch (data_layout) {
-    case DataLayout::kNHWC:
-      return "NHWC";
-    case DataLayout::kNCHW:
-      return "NCHW";
-    case DataLayout::kAnyLayout:
-      return "ANY_LAYOUT";
-    case DataLayout::kMKLDNN:
-      return "MKLDNNLAYOUT";
-    default:
-      PADDLE_THROW("unknown DataLayout %d", data_layout);
-  }
-}
-
-inline std::ostream& operator<<(std::ostream& out, const DataLayout& l) {
-  out << DataLayoutToString(l);
-  return out;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
deleted file mode 100644
index fe53c6f99d8e691689dcc6e719097fb0a77ee4e4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/math/math_function.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-#endif
-
-namespace paddle {
-namespace framework {
-
-std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to) {
-  PADDLE_ENFORCE_NE(from, to,
-                    "layout transform should transform different layout");
-  if (from == DataLayout::kNCHW && to == DataLayout::kNHWC) {
-    return {0, 2, 3, 1};
-  } else if (from == DataLayout::kNHWC && to == DataLayout::kNCHW) {
-    return {0, 3, 1, 2};
-  } else {
-    PADDLE_THROW("unsupported transform");
-  }
-}
-
-struct CastDataLayout {
-  CastDataLayout(const platform::DeviceContext* ctx,
-                 const std::vector<int>& axis, const framework::Tensor& in,
-                 framework::Tensor* out)
-      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
-  const framework::Tensor in_;
-  framework::Tensor* out_;
-  const platform::DeviceContext* ctx_;
-  const std::vector<int> axis_;
-
-  template <typename T>
-  void apply() {
-    auto place = ctx_->GetPlace();
-
-    if (platform::is_cpu_place(place)) {
-      operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
-      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
-      trans4(*context, in_, out_, axis_);
-    } else {
-      PADDLE_THROW("Unsupport CPU <-> GPU!");
-    }
-  }
-};
-
-void TransDataLayout(const OpKernelType& kernel_type_for_var,
-                     const OpKernelType& expected_kernel_type, const Tensor& in,
-                     Tensor* out) {
-  PADDLE_ENFORCE(
-      platform::places_are_same_class(kernel_type_for_var.place_,
-                                      expected_kernel_type.place_),
-      "TransDataLayout only support DataLayout transform on same place!");
-
-  PADDLE_ENFORCE(arity(in.dims()) == 4, "Input Arity only support 4!");
-
-  auto& pool = platform::DeviceContextPool::Instance();
-
-  auto src_dim = in.dims();
-  std::vector<int64_t> dst_dim;
-
-  auto axis = GetAxis(kernel_type_for_var.data_layout_,
-                      expected_kernel_type.data_layout_);
-  dst_dim.resize(axis.size());
-  for (size_t i = 0; i < axis.size(); i++) {
-    dst_dim[i] = src_dim[axis[i]];
-  }
-
-  out->Resize(make_ddim(dst_dim));
-  out->mutable_data(expected_kernel_type.place_, in.type());
-
-  framework::VisitDataType(
-      in.type(),
-      CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out));
-
-  out->set_layout(expected_kernel_type.data_layout_);
-}
-
-#ifdef PADDLE_WITH_MKLDNN
-using mkldnn::memory;
-using mkldnn::primitive;
-using mkldnn::reorder;
-
-void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
-  switch (type) {
-    case mkldnn::memory::data_type::f32:
-      return platform::to_void_cast(tensor.data<float>());
-    case mkldnn::memory::data_type::s8:
-      return platform::to_void_cast(tensor.data<int8_t>());
-    case mkldnn::memory::data_type::u8:
-      return platform::to_void_cast(tensor.data<unsigned char>());
-    case mkldnn::memory::data_type::s16:
-      return platform::to_void_cast(tensor.data<int16_t>());
-    case mkldnn::memory::data_type::s32:
-      return platform::to_void_cast(tensor.data<int32_t>());
-    default:
-      PADDLE_THROW("wrong mkldnn type provided");
-  }
-}
-#endif
-
-void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
-                               const OpKernelType& expected_kernel_type,
-                               const Tensor& in, Tensor* out) {
-  auto in_layout = kernel_type_for_var.data_layout_;
-  auto out_layout = expected_kernel_type.data_layout_;
-  auto place = expected_kernel_type.place_;
-
-  PADDLE_ENFORCE(
-      in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
-      "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
-      "non-MKLDNN");
-
-  innerTransDataLayoutFromMKLDNN(in_layout, out_layout, in, out, place);
-}
-
-void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
-                                    const Tensor& in, Tensor* out,
-                                    platform::Place place) {
-#ifdef PADDLE_WITH_MKLDNN
-  PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::format_undef,
-                    "Input tensor should have specified memory format");
-  PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::any,
-                    "Input tensor should have specified memory format");
-
-  // Set default as NCHW in case not specified
-  out_layout =
-      out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
-
-  auto& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
-  auto& cpu_engine = dev_ctx->GetEngine();
-
-  auto in_tz = paddle::framework::vectorize<int>(in.dims());
-  auto out_tz = in_tz;
-
-  memory::data_type in_type = ToMKLDNNDataType(in.type());
-  PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
-                 "Input tensor type is not supported: %s", in.type());
-
-  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
-  auto out_format =
-      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
-
-  // output tensor has the same dims as input. Reorder don't change dims
-  out->Resize(in.dims());
-
-  if (in_format != out_format) {
-    void* in_data = GetDataFromTensor(in, in_type);
-    const std::string key = platform::CreateKey(in_tz, in_format, out_format,
-                                                std::to_string(in_type));
-
-    platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx,
-                                           cpu_engine, key);
-
-    auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data);
-    auto reorder_dst_memory_p =
-        handler.AcquireDstMemory(out, out_format, place);
-    auto reorder_p =
-        handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
-
-    std::vector<mkldnn::primitive> pipeline;
-    pipeline.push_back(*reorder_p);
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-  } else {
-    out->ShareDataWith(in);
-  }
-  out->set_layout(out_layout);
-  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
-  out->set_format(MKLDNNMemoryFormat::format_undef);
-#endif
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
deleted file mode 100644
index d67ea1e5308ede12a1c6a4159bc92e0ee8a177a7..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_layout_transform.h
+++ /dev/null
@@ -1,86 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/op_kernel_type.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/variable.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-
-#ifdef PADDLE_WITH_MKLDNN
-using MKLDNNDataType = mkldnn::memory::data_type;
-
-inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) {
-  switch (layout) {
-    case DataLayout::kNHWC:
-      return MKLDNNMemoryFormat::nhwc;
-    case DataLayout::kNCHW:
-      return MKLDNNMemoryFormat::nchw;
-    default:
-      PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
-                   DataLayoutToString(layout));
-  }
-}
-
-inline DataLayout ToPaddleLayout(const MKLDNNMemoryFormat& format) {
-  switch (format) {
-    case MKLDNNMemoryFormat::nhwc:
-      return DataLayout::kNHWC;
-    case MKLDNNMemoryFormat::nchw:
-      return DataLayout::kNCHW;
-    default:
-      PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
-  }
-}
-
-inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) {
-  static std::unordered_map<int, MKLDNNDataType> dict{
-      {DataTypeTrait<float>::DataType(), MKLDNNDataType::f32},
-      {DataTypeTrait<int8_t>::DataType(), MKLDNNDataType::s8},
-      {DataTypeTrait<uint8_t>::DataType(), MKLDNNDataType::u8},
-      {DataTypeTrait<int16_t>::DataType(), MKLDNNDataType::s16},
-      {DataTypeTrait<int32_t>::DataType(), MKLDNNDataType::s32}};
-  auto iter = dict.find(static_cast<int>(type));
-  if (iter != dict.end()) return iter->second;
-  return MKLDNNDataType::data_undef;
-}
-
-#endif
-
-void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
-                               const OpKernelType& expected_kernel_type,
-                               const Tensor& in, Tensor* out);
-
-void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
-                                    const Tensor& in, Tensor* out,
-                                    platform::Place place);
-
-std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
-
-void TransDataLayout(const OpKernelType& kernel_type_for_var,
-                     const OpKernelType& expected_kernel_type, const Tensor& in,
-                     Tensor* out);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc
deleted file mode 100644
index a0d08826b854fea9256382f0e065fd59dda8c8b3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/data_layout_transform.h"
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/platform/device_context.h"
-
-TEST(DataTransform, DataLayoutFunction) {
-  auto place = paddle::platform::CPUPlace();
-  paddle::framework::Tensor in = paddle::framework::Tensor();
-  paddle::framework::Tensor out = paddle::framework::Tensor();
-  in.mutable_data<double>(paddle::framework::make_ddim({2, 3, 1, 2}), place);
-  in.set_layout(paddle::framework::DataLayout::kNHWC);
-
-  auto kernel_nhwc = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::FP32, place,
-      paddle::framework::DataLayout::kNHWC,
-      paddle::framework::LibraryType::kPlain);
-  auto kernel_ncwh = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::FP32, place,
-      paddle::framework::DataLayout::kNCHW,
-      paddle::framework::LibraryType::kPlain);
-
-  paddle::framework::TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
-
-  EXPECT_TRUE(out.layout() == paddle::framework::DataLayout::kNCHW);
-  EXPECT_TRUE(out.dims() == paddle::framework::make_ddim({2, 2, 3, 1}));
-
-  TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out);
-
-  EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC);
-  EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2}));
-}
diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc
deleted file mode 100755
index 471db585cefc0cf59463fd3d67e4c0415ff9ceff..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_set.cc
+++ /dev/null
@@ -1,1001 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- *     Unless required by applicable law or agreed to in writing, software
- *     distributed under the License is distributed on an "AS IS" BASIS,
- *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *     See the License for the specific language governing permissions and
- *     limitations under the License. */
-
-#include "paddle/fluid/framework/data_set.h"
-#include <algorithm>
-#include <random>
-#include <unordered_map>
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/message.h"
-#include "google/protobuf/text_format.h"
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
-#include "paddle/fluid/framework/io/fs.h"
-#include "paddle/fluid/platform/timer.h"
-#include "xxhash.h"  // NOLINT
-
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
-
-namespace paddle {
-namespace framework {
-
-// constructor
-template <typename T>
-DatasetImpl<T>::DatasetImpl() {
-  VLOG(3) << "DatasetImpl<T>::DatasetImpl() constructor";
-  thread_num_ = 1;
-  trainer_num_ = 1;
-  channel_num_ = 1;
-  file_idx_ = 0;
-  cur_channel_ = 0;
-  fleet_send_batch_size_ = 1024;
-  fleet_send_sleep_seconds_ = 0;
-  merge_by_insid_ = false;
-  erase_duplicate_feas_ = true;
-  keep_unmerged_ins_ = true;
-  min_merge_size_ = 2;
-  parse_ins_id_ = false;
-  parse_content_ = false;
-  preload_thread_num_ = 0;
-  global_index_ = 0;
-}
-
-// set filelist, file_idx_ will reset to zero.
-template <typename T>
-void DatasetImpl<T>::SetFileList(const std::vector<std::string>& filelist) {
-  VLOG(3) << "filelist size: " << filelist.size();
-  filelist_ = filelist;
-  file_idx_ = 0;
-}
-
-// set expect thread num. actually it may change
-template <typename T>
-void DatasetImpl<T>::SetThreadNum(int thread_num) {
-  VLOG(3) << "SetThreadNum thread_num=" << thread_num;
-  thread_num_ = thread_num;
-}
-
-// if you run distributed, and want to do global shuffle,
-// set this before global shuffle.
-// be sure you call CreateReaders before SetTrainerNum
-template <typename T>
-void DatasetImpl<T>::SetTrainerNum(int trainer_num) {
-  trainer_num_ = trainer_num;
-}
-
-// if you run distributed, and want to do global shuffle,
-// set this before global shuffle.
-// be sure you call CreateReaders before SetFleetSendBatchSize
-template <typename T>
-void DatasetImpl<T>::SetFleetSendBatchSize(int64_t size) {
-  fleet_send_batch_size_ = size;
-}
-
-template <typename T>
-void DatasetImpl<T>::SetHdfsConfig(const std::string& fs_name,
-                                   const std::string& fs_ugi) {
-  fs_name_ = fs_name;
-  fs_ugi_ = fs_ugi;
-  std::string cmd = std::string("hadoop fs");
-  cmd += " -D fs.default.name=" + fs_name;
-  cmd += " -D hadoop.job.ugi=" + fs_ugi;
-  paddle::framework::hdfs_set_command(cmd);
-}
-
-template <typename T>
-void DatasetImpl<T>::SetDataFeedDesc(const std::string& data_feed_desc_str) {
-  google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
-                                                &data_feed_desc_);
-}
-
-template <typename T>
-void DatasetImpl<T>::SetChannelNum(int channel_num) {
-  channel_num_ = channel_num;
-}
-
-template <typename T>
-void DatasetImpl<T>::SetParseInsId(bool parse_ins_id) {
-  parse_ins_id_ = parse_ins_id;
-}
-
-template <typename T>
-void DatasetImpl<T>::SetParseContent(bool parse_content) {
-  parse_content_ = parse_content;
-}
-
-template <typename T>
-void DatasetImpl<T>::SetMergeByInsId(
-    const std::vector<std::string>& merge_slot_list, bool erase_duplicate_feas,
-    int min_merge_size, bool keep_unmerged_ins) {
-  merge_by_insid_ = true;
-  parse_ins_id_ = true;
-  merge_slots_list_ = merge_slot_list;
-  erase_duplicate_feas_ = erase_duplicate_feas;
-  min_merge_size_ = min_merge_size;
-  keep_unmerged_ins_ = keep_unmerged_ins;
-}
-
-template <typename T>
-void DatasetImpl<T>::SetFeaEval(bool fea_eval, int record_candidate_size) {
-  slots_shuffle_fea_eval_ = fea_eval;
-  slots_shuffle_rclist_.ReSize(record_candidate_size);
-  VLOG(3) << "SetFeaEval fea eval mode: " << fea_eval
-          << " with record candidate size: " << record_candidate_size;
-}
-
-template <typename T>
-std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() {
-  std::vector<paddle::framework::DataFeed*> ret;
-  ret.reserve(readers_.size());
-  for (auto i : readers_) {
-    ret.push_back(i.get());
-  }
-  return ret;
-}
-
-template <typename T>
-void DatasetImpl<T>::CreateChannel() {
-  if (input_channel_ == nullptr) {
-    input_channel_ = paddle::framework::MakeChannel<T>();
-  }
-  if (multi_output_channel_.size() == 0) {
-    multi_output_channel_.reserve(channel_num_);
-    for (int i = 0; i < channel_num_; ++i) {
-      multi_output_channel_.push_back(paddle::framework::MakeChannel<T>());
-    }
-  }
-  if (multi_consume_channel_.size() == 0) {
-    multi_consume_channel_.reserve(channel_num_);
-    for (int i = 0; i < channel_num_; ++i) {
-      multi_consume_channel_.push_back(paddle::framework::MakeChannel<T>());
-    }
-  }
-}
-
-// if sent message between workers, should first call this function
-template <typename T>
-void DatasetImpl<T>::RegisterClientToClientMsgHandler() {
-  auto fleet_ptr = FleetWrapper::GetInstance();
-  VLOG(3) << "RegisterClientToClientMsgHandler";
-  fleet_ptr->RegisterClientToClientMsgHandler(
-      0, [this](int msg_type, int client_id, const std::string& msg) -> int {
-        return this->ReceiveFromClient(msg_type, client_id, msg);
-      });
-  VLOG(3) << "RegisterClientToClientMsgHandler done";
-}
-
-// load data into memory, Dataset hold this memory,
-// which will later be fed into readers' channel
-template <typename T>
-void DatasetImpl<T>::LoadIntoMemory() {
-  VLOG(3) << "DatasetImpl<T>::LoadIntoMemory() begin";
-  platform::Timer timeline;
-  timeline.Start();
-  std::vector<std::thread> load_threads;
-  for (int64_t i = 0; i < thread_num_; ++i) {
-    load_threads.push_back(std::thread(
-        &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get()));
-  }
-  for (std::thread& t : load_threads) {
-    t.join();
-  }
-  input_channel_->Close();
-  int64_t in_chan_size = input_channel_->Size();
-  input_channel_->SetBlockSize(in_chan_size / thread_num_ + 1);
-  timeline.Pause();
-  VLOG(3) << "DatasetImpl<T>::LoadIntoMemory() end"
-          << ", memory data size=" << input_channel_->Size()
-          << ", cost time=" << timeline.ElapsedSec() << " seconds";
-}
-
-template <typename T>
-void DatasetImpl<T>::PreLoadIntoMemory() {
-  VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() begin";
-  if (preload_thread_num_ != 0) {
-    CHECK(preload_thread_num_ == preload_readers_.size());
-    preload_threads_.clear();
-    for (int64_t i = 0; i < preload_thread_num_; ++i) {
-      preload_threads_.push_back(
-          std::thread(&paddle::framework::DataFeed::LoadIntoMemory,
-                      preload_readers_[i].get()));
-    }
-  } else {
-    CHECK(thread_num_ == readers_.size());
-    preload_threads_.clear();
-    for (int64_t i = 0; i < thread_num_; ++i) {
-      preload_threads_.push_back(std::thread(
-          &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get()));
-    }
-  }
-  VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() end";
-}
-
-template <typename T>
-void DatasetImpl<T>::WaitPreLoadDone() {
-  VLOG(3) << "DatasetImpl<T>::WaitPreLoadDone() begin";
-  for (std::thread& t : preload_threads_) {
-    t.join();
-  }
-  input_channel_->Close();
-  int64_t in_chan_size = input_channel_->Size();
-  input_channel_->SetBlockSize(in_chan_size / thread_num_ + 1);
-  VLOG(3) << "DatasetImpl<T>::WaitPreLoadDone() end";
-}
-
-// release memory data
-template <typename T>
-void DatasetImpl<T>::ReleaseMemory() {
-  VLOG(3) << "DatasetImpl<T>::ReleaseMemory() begin";
-  if (input_channel_) {
-    input_channel_->Clear();
-    input_channel_ = nullptr;
-  }
-  for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-    if (!multi_output_channel_[i]) {
-      continue;
-    }
-    multi_output_channel_[i]->Clear();
-    multi_output_channel_[i] = nullptr;
-  }
-  std::vector<paddle::framework::Channel<T>>().swap(multi_output_channel_);
-  for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
-    if (!multi_consume_channel_[i]) {
-      continue;
-    }
-    multi_consume_channel_[i]->Clear();
-    multi_consume_channel_[i] = nullptr;
-  }
-  std::vector<paddle::framework::Channel<T>>().swap(multi_consume_channel_);
-  std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(readers_);
-  VLOG(3) << "DatasetImpl<T>::ReleaseMemory() end";
-}
-
-// do local shuffle
-template <typename T>
-void DatasetImpl<T>::LocalShuffle() {
-  VLOG(3) << "DatasetImpl<T>::LocalShuffle() begin";
-  platform::Timer timeline;
-  timeline.Start();
-
-  if (!input_channel_ || input_channel_->Size() == 0) {
-    VLOG(3) << "DatasetImpl<T>::LocalShuffle() end, no data to shuffle";
-    return;
-  }
-  auto fleet_ptr = FleetWrapper::GetInstance();
-  input_channel_->Close();
-  std::vector<T> data;
-  input_channel_->ReadAll(data);
-  std::shuffle(data.begin(), data.end(), fleet_ptr->LocalRandomEngine());
-  input_channel_->Open();
-  input_channel_->Write(std::move(data));
-  data.clear();
-  data.shrink_to_fit();
-  input_channel_->Close();
-
-  timeline.Pause();
-  VLOG(3) << "DatasetImpl<T>::LocalShuffle() end, cost time="
-          << timeline.ElapsedSec() << " seconds";
-}
-
-template <typename T>
-void DatasetImpl<T>::GlobalShuffle(int thread_num) {
-  VLOG(3) << "DatasetImpl<T>::GlobalShuffle() begin";
-  platform::Timer timeline;
-  timeline.Start();
-  auto fleet_ptr = FleetWrapper::GetInstance();
-
-  if (!input_channel_ || input_channel_->Size() == 0) {
-    VLOG(3) << "DatasetImpl<T>::GlobalShuffle() end, no data to shuffle";
-    return;
-  }
-
-  // local shuffle
-  input_channel_->Close();
-  std::vector<T> data;
-  input_channel_->ReadAll(data);
-  std::shuffle(data.begin(), data.end(), fleet_ptr->LocalRandomEngine());
-  input_channel_->Open();
-  input_channel_->Write(std::move(data));
-  data.clear();
-  data.shrink_to_fit();
-
-  input_channel_->Close();
-  input_channel_->SetBlockSize(fleet_send_batch_size_);
-  VLOG(3) << "DatasetImpl<T>::GlobalShuffle() input_channel_ size "
-          << input_channel_->Size();
-
-  auto get_client_id = [this, fleet_ptr](const T& data) -> size_t {
-    if (!this->merge_by_insid_) {
-      return fleet_ptr->LocalRandomEngine()() % this->trainer_num_;
-    } else {
-      return XXH64(data.ins_id_.data(), data.ins_id_.length(), 0) %
-             this->trainer_num_;
-    }
-  };
-
-  auto global_shuffle_func = [this, get_client_id]() {
-    auto fleet_ptr = FleetWrapper::GetInstance();
-    std::vector<T> data;
-    while (this->input_channel_->Read(data)) {
-      std::vector<paddle::framework::BinaryArchive> ars(this->trainer_num_);
-      for (auto& t : data) {
-        auto client_id = get_client_id(t);
-        ars[client_id] << t;
-      }
-      std::vector<std::future<int32_t>> total_status;
-      std::vector<int> send_index(this->trainer_num_);
-      for (int i = 0; i < this->trainer_num_; ++i) {
-        send_index[i] = i;
-      }
-      std::shuffle(send_index.begin(), send_index.end(),
-                   fleet_ptr->LocalRandomEngine());
-      for (auto index = 0u; index < this->trainer_num_; ++index) {
-        int i = send_index[index];
-        if (ars[i].Length() == 0) {
-          continue;
-        }
-        std::string msg(ars[i].Buffer(), ars[i].Length());
-        auto ret = fleet_ptr->SendClientToClientMsg(0, i, msg);
-        total_status.push_back(std::move(ret));
-      }
-      for (auto& t : total_status) {
-        t.wait();
-      }
-      ars.clear();
-      ars.shrink_to_fit();
-      data.clear();
-      data.shrink_to_fit();
-      // currently we find bottleneck is server not able to handle large data
-      // in time, so we can remove this sleep and set fleet_send_batch_size to
-      // 1024, and set server thread to 24.
-      if (fleet_send_sleep_seconds_ != 0) {
-        sleep(this->fleet_send_sleep_seconds_);
-      }
-    }
-  };
-
-  std::vector<std::thread> global_shuffle_threads;
-  if (thread_num == -1) {
-    thread_num = thread_num_;
-  }
-  VLOG(3) << "start global shuffle threads, num = " << thread_num;
-  for (int i = 0; i < thread_num; ++i) {
-    global_shuffle_threads.push_back(std::thread(global_shuffle_func));
-  }
-  for (std::thread& t : global_shuffle_threads) {
-    t.join();
-  }
-  global_shuffle_threads.clear();
-  global_shuffle_threads.shrink_to_fit();
-  input_channel_->Clear();
-  timeline.Pause();
-  VLOG(3) << "DatasetImpl<T>::GlobalShuffle() end, cost time="
-          << timeline.ElapsedSec() << " seconds";
-}
-
-template <typename T>
-void DatasetImpl<T>::DynamicAdjustChannelNum(int channel_num) {
-  if (channel_num_ == channel_num) {
-    VLOG(3) << "DatasetImpl<T>::DynamicAdjustChannelNum channel_num_="
-            << channel_num_ << ", channel_num_=channel_num, no need to adjust";
-    return;
-  }
-  VLOG(3) << "adjust channel num from " << channel_num_ << " to "
-          << channel_num;
-  channel_num_ = channel_num;
-  std::vector<paddle::framework::Channel<T>>* origin_channels = nullptr;
-  std::vector<paddle::framework::Channel<T>>* other_channels = nullptr;
-  // find out which channel (output or consume) has data
-  int cur_channel = 0;
-  uint64_t output_channels_data_size = 0;
-  uint64_t consume_channels_data_size = 0;
-  CHECK(multi_output_channel_.size() == multi_consume_channel_.size());
-  for (int i = 0; i < multi_output_channel_.size(); ++i) {
-    output_channels_data_size += multi_output_channel_[i]->Size();
-    consume_channels_data_size += multi_consume_channel_[i]->Size();
-  }
-  if (output_channels_data_size != 0) {
-    CHECK(consume_channels_data_size == 0);  // NOLINT
-    cur_channel = 0;
-  } else {
-    CHECK(output_channels_data_size == 0);  // NOLINT
-    cur_channel = 1;
-  }
-  if (cur_channel == 0) {
-    origin_channels = &multi_output_channel_;
-    other_channels = &multi_consume_channel_;
-  } else {
-    origin_channels = &multi_consume_channel_;
-    other_channels = &multi_output_channel_;
-  }
-  CHECK(origin_channels != nullptr);  // NOLINT
-  CHECK(other_channels != nullptr);   // NOLINT
-
-  paddle::framework::Channel<T> total_data_channel =
-      paddle::framework::MakeChannel<T>();
-  std::vector<paddle::framework::Channel<T>> new_channels;
-  std::vector<paddle::framework::Channel<T>> new_other_channels;
-  std::vector<T> local_vec;
-  for (int i = 0; i < origin_channels->size(); ++i) {
-    local_vec.clear();
-    (*origin_channels)[i]->Close();
-    (*origin_channels)[i]->ReadAll(local_vec);
-    total_data_channel->Write(std::move(local_vec));
-  }
-  total_data_channel->Close();
-  total_data_channel->SetBlockSize(total_data_channel->Size() / channel_num +
-                                   1);
-
-  for (int i = 0; i < channel_num; ++i) {
-    local_vec.clear();
-    total_data_channel->Read(local_vec);
-    new_other_channels.push_back(paddle::framework::MakeChannel<T>());
-    new_channels.push_back(paddle::framework::MakeChannel<T>());
-    new_channels[i]->Write(std::move(local_vec));
-  }
-
-  total_data_channel->Clear();
-  origin_channels->clear();
-  other_channels->clear();
-  *origin_channels = new_channels;
-  *other_channels = new_other_channels;
-
-  new_channels.clear();
-  new_other_channels.clear();
-  std::vector<paddle::framework::Channel<T>>().swap(new_channels);
-  std::vector<paddle::framework::Channel<T>>().swap(new_other_channels);
-  local_vec.clear();
-  std::vector<T>().swap(local_vec);
-  VLOG(3) << "adjust channel num done";
-}
-
-template <typename T>
-void DatasetImpl<T>::DynamicAdjustReadersNum(int thread_num) {
-  if (thread_num_ == thread_num) {
-    VLOG(3) << "DatasetImpl<T>::DynamicAdjustReadersNum thread_num_="
-            << thread_num_ << ", thread_num_=thread_num, no need to adjust";
-    return;
-  }
-  VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num;
-  thread_num_ = thread_num;
-  std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(readers_);
-  CreateReaders();
-  VLOG(3) << "adjust readers num done";
-}
-
-template <typename T>
-void DatasetImpl<T>::SetFleetSendSleepSeconds(int seconds) {
-  fleet_send_sleep_seconds_ = seconds;
-}
-
-template <typename T>
-void DatasetImpl<T>::CreateReaders() {
-  VLOG(3) << "Calling CreateReaders()";
-  VLOG(3) << "thread num in Dataset: " << thread_num_;
-  VLOG(3) << "Filelist size in Dataset: " << filelist_.size();
-  VLOG(3) << "channel num in Dataset: " << channel_num_;
-  CHECK(thread_num_ > 0) << "thread num should > 0";
-  CHECK(channel_num_ > 0) << "channel num should > 0";
-  CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num";
-  VLOG(3) << "readers size: " << readers_.size();
-  if (readers_.size() != 0) {
-    VLOG(3) << "readers_.size() = " << readers_.size()
-            << ", will not create again";
-    return;
-  }
-  VLOG(3) << "data feed class name: " << data_feed_desc_.name();
-  int channel_idx = 0;
-  for (int i = 0; i < thread_num_; ++i) {
-    readers_.push_back(DataFeedFactory::CreateDataFeed(data_feed_desc_.name()));
-    readers_[i]->Init(data_feed_desc_);
-    readers_[i]->SetThreadId(i);
-    readers_[i]->SetThreadNum(thread_num_);
-    readers_[i]->SetFileListMutex(&mutex_for_pick_file_);
-    readers_[i]->SetFileListIndex(&file_idx_);
-    readers_[i]->SetFileList(filelist_);
-    readers_[i]->SetParseInsId(parse_ins_id_);
-    readers_[i]->SetParseContent(parse_content_);
-    if (input_channel_ != nullptr) {
-      readers_[i]->SetInputChannel(input_channel_.get());
-    }
-    if (cur_channel_ == 0 && channel_idx < multi_output_channel_.size()) {
-      readers_[i]->SetOutputChannel(multi_output_channel_[channel_idx].get());
-      readers_[i]->SetConsumeChannel(multi_consume_channel_[channel_idx].get());
-    } else if (channel_idx < multi_output_channel_.size()) {
-      readers_[i]->SetOutputChannel(multi_consume_channel_[channel_idx].get());
-      readers_[i]->SetConsumeChannel(multi_output_channel_[channel_idx].get());
-    }
-    ++channel_idx;
-    if (channel_idx >= channel_num_) {
-      channel_idx = 0;
-    }
-  }
-  VLOG(3) << "readers size: " << readers_.size();
-}
-
-template <typename T>
-void DatasetImpl<T>::DestroyReaders() {
-  VLOG(3) << "Calling DestroyReaders()";
-  VLOG(3) << "readers size1: " << readers_.size();
-  std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(readers_);
-  VLOG(3) << "readers size: " << readers_.size();
-  file_idx_ = 0;
-  cur_channel_ = 1 - cur_channel_;
-}
-
-template <typename T>
-void DatasetImpl<T>::SetPreLoadThreadNum(int thread_num) {
-  preload_thread_num_ = thread_num;
-}
-
-template <typename T>
-void DatasetImpl<T>::CreatePreLoadReaders() {
-  VLOG(3) << "Begin CreatePreLoadReaders";
-  if (preload_thread_num_ == 0) {
-    preload_thread_num_ = thread_num_;
-  }
-  CHECK(preload_thread_num_ > 0) << "thread num should > 0";
-  CHECK(input_channel_ != nullptr);
-  preload_readers_.clear();
-  for (int i = 0; i < preload_thread_num_; ++i) {
-    preload_readers_.push_back(
-        DataFeedFactory::CreateDataFeed(data_feed_desc_.name()));
-    preload_readers_[i]->Init(data_feed_desc_);
-    preload_readers_[i]->SetThreadId(i);
-    preload_readers_[i]->SetThreadNum(preload_thread_num_);
-    preload_readers_[i]->SetFileListMutex(&mutex_for_pick_file_);
-    preload_readers_[i]->SetFileListIndex(&file_idx_);
-    preload_readers_[i]->SetFileList(filelist_);
-    preload_readers_[i]->SetParseInsId(parse_ins_id_);
-    preload_readers_[i]->SetParseContent(parse_content_);
-    preload_readers_[i]->SetInputChannel(input_channel_.get());
-    preload_readers_[i]->SetOutputChannel(nullptr);
-    preload_readers_[i]->SetConsumeChannel(nullptr);
-  }
-  VLOG(3) << "End CreatePreLoadReaders";
-}
-
-template <typename T>
-void DatasetImpl<T>::DestroyPreLoadReaders() {
-  VLOG(3) << "Begin DestroyPreLoadReaders";
-  preload_readers_.clear();
-  std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(
-      preload_readers_);
-  file_idx_ = 0;
-  VLOG(3) << "End DestroyPreLoadReaders";
-}
-
-template <typename T>
-int64_t DatasetImpl<T>::GetMemoryDataSize() {
-  return input_channel_->Size();
-}
-
-template <typename T>
-int64_t DatasetImpl<T>::GetShuffleDataSize() {
-  int64_t sum = 0;
-  for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-    sum += multi_output_channel_[i]->Size() + multi_consume_channel_[i]->Size();
-  }
-  return sum;
-}
-
-template <typename T>
-int DatasetImpl<T>::ReceiveFromClient(int msg_type, int client_id,
-                                      const std::string& msg) {
-#ifdef _LINUX
-  VLOG(3) << "ReceiveFromClient msg_type=" << msg_type
-          << ", client_id=" << client_id << ", msg length=" << msg.length();
-  if (msg.length() == 0) {
-    return 0;
-  }
-  paddle::framework::BinaryArchive ar;
-  ar.SetReadBuffer(const_cast<char*>(msg.c_str()), msg.length(), nullptr);
-  if (ar.Cursor() == ar.Finish()) {
-    return 0;
-  }
-  std::vector<T> data;
-  while (ar.Cursor() < ar.Finish()) {
-    data.push_back(ar.Get<T>());
-  }
-  CHECK(ar.Cursor() == ar.Finish());
-
-  auto fleet_ptr = FleetWrapper::GetInstance();
-  // not use random because it doesn't perform well here.
-  // to make sure each channel get data equally, we just put data to
-  // channel one by one.
-  // int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_;
-  int64_t index = 0;
-  {
-    std::unique_lock<std::mutex> lk(global_index_mutex_);
-    index = global_index_++;
-  }
-  index = index % channel_num_;
-  VLOG(3) << "ramdom index=" << index;
-  multi_output_channel_[index]->Write(std::move(data));
-
-  data.clear();
-  data.shrink_to_fit();
-#endif
-  return 0;
-}
-
-// explicit instantiation
-template class DatasetImpl<Record>;
-
-void MultiSlotDataset::MergeByInsId() {
-  VLOG(3) << "MultiSlotDataset::MergeByInsId begin";
-  if (!merge_by_insid_) {
-    VLOG(3) << "merge_by_insid=false, will not MergeByInsId";
-    return;
-  }
-  auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
-  std::unordered_map<int, bool> merge_slots;
-  std::vector<std::string> use_slots;
-  std::vector<bool> use_slots_is_dense;
-  for (size_t i = 0; i < multi_slot_desc.slots_size(); ++i) {
-    const auto& slot = multi_slot_desc.slots(i);
-    if (slot.is_used()) {
-      use_slots.push_back(slot.name());
-      use_slots_is_dense.push_back(slot.is_dense());
-    }
-  }
-  for (size_t i = 0; i < use_slots.size(); ++i) {
-    // currently, we don't merge dense slots
-    if (std::find(merge_slots_list_.begin(), merge_slots_list_.end(),
-                  use_slots[i]) != merge_slots_list_.end() &&
-        !use_slots_is_dense[i]) {
-      merge_slots[i] = true;
-    }
-  }
-  CHECK(multi_output_channel_.size() != 0);  // NOLINT
-  auto channel_data = paddle::framework::MakeChannel<Record>();
-  VLOG(3) << "multi_output_channel_.size() " << multi_output_channel_.size();
-  for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-    std::vector<Record> vec_data;
-    multi_output_channel_[i]->Close();
-    multi_output_channel_[i]->ReadAll(vec_data);
-    channel_data->Write(std::move(vec_data));
-    vec_data.clear();
-    vec_data.shrink_to_fit();
-    multi_output_channel_[i]->Clear();
-  }
-  channel_data->Close();
-  std::vector<Record> recs;
-  recs.reserve(channel_data->Size());
-  channel_data->ReadAll(recs);
-  channel_data->Clear();
-  std::sort(recs.begin(), recs.end(), [](const Record& a, const Record& b) {
-    return a.ins_id_ < b.ins_id_;
-  });
-
-  auto sort_cmp_uint64 = [&merge_slots](const FeatureItem& a,
-                                        const FeatureItem& b) {
-    auto& a_sign = a.sign().uint64_feasign_;
-    auto& b_sign = b.sign().uint64_feasign_;
-    return a_sign < b_sign || (a_sign == b_sign && a.slot() < b.slot());
-  };
-  auto sort_cmp_float = [&merge_slots](const FeatureItem& a,
-                                       const FeatureItem& b) {
-    auto& a_sign = a.sign().float_feasign_;
-    auto& b_sign = b.sign().float_feasign_;
-    return a_sign < b_sign || (a_sign == b_sign && a.slot() < b.slot());
-  };
-  auto unique_eq_uint64 = [&merge_slots](const FeatureItem& a,
-                                         const FeatureItem& b) {
-    if (a.slot() == b.slot() &&
-        merge_slots.find(a.slot()) == merge_slots.end()) {
-      return true;
-    }
-    auto& a_sign = a.sign().uint64_feasign_;
-    auto& b_sign = b.sign().uint64_feasign_;
-    return a_sign == b_sign && a.slot() == b.slot();
-  };
-  auto unique_eq_float = [&merge_slots](const FeatureItem& a,
-                                        const FeatureItem& b) {
-    if (a.slot() == b.slot() &&
-        merge_slots.find(a.slot()) == merge_slots.end()) {
-      return true;
-    }
-    auto& a_sign = a.sign().float_feasign_;
-    auto& b_sign = b.sign().float_feasign_;
-    return a_sign == b_sign && a.slot() == b.slot();
-  };
-
-  std::vector<Record> results;
-  VLOG(3) << "recs.size() " << recs.size();
-  for (size_t i = 0; i < recs.size();) {
-    size_t j = i + 1;
-    while (j < recs.size() && recs[j].ins_id_ == recs[i].ins_id_) {
-      j++;
-    }
-    if (j - i < min_merge_size_) {
-      if (keep_unmerged_ins_) {
-        for (size_t k = i; k < j; ++k) {
-          results.push_back(std::move(recs[k]));
-        }
-      }
-      i = j;
-      continue;
-    }
-
-    std::vector<FeatureItem> merge_uint64_feasigns;
-    std::vector<FeatureItem> merge_float_feasigns;
-    Record rec = std::move(recs[i]);
-
-    for (size_t k = i + 1; k < j; k++) {
-      for (auto& feature : recs[k].uint64_feasigns_) {
-        if (merge_slots.find(feature.slot()) != merge_slots.end()) {
-          merge_uint64_feasigns.push_back(std::move(feature));
-        }
-      }
-      for (auto& feature : recs[k].float_feasigns_) {
-        if (merge_slots.find(feature.slot()) != merge_slots.end()) {
-          merge_float_feasigns.push_back(std::move(feature));
-        }
-      }
-      recs[k] = Record();
-    }
-    i = j;
-
-    if (!erase_duplicate_feas_) {
-      rec.uint64_feasigns_.insert(rec.uint64_feasigns_.end(),
-                                  merge_uint64_feasigns.begin(),
-                                  merge_uint64_feasigns.end());
-      rec.float_feasigns_.insert(rec.float_feasigns_.end(),
-                                 merge_float_feasigns.begin(),
-                                 merge_float_feasigns.end());
-    } else {
-      std::vector<FeatureItem> not_merge_uint64_feasigns;
-      std::vector<FeatureItem> not_merge_float_feasigns;
-
-      for (auto& feature : rec.uint64_feasigns_) {
-        if (merge_slots.find(feature.slot()) != merge_slots.end()) {
-          merge_uint64_feasigns.push_back(std::move(feature));
-        } else {
-          not_merge_uint64_feasigns.push_back(std::move(feature));
-        }
-      }
-      for (auto& feature : rec.float_feasigns_) {
-        if (merge_slots.find(feature.slot()) != merge_slots.end()) {
-          merge_float_feasigns.push_back(std::move(feature));
-        } else {
-          not_merge_float_feasigns.push_back(std::move(feature));
-        }
-      }
-      rec.uint64_feasigns_.clear();
-      rec.float_feasigns_.clear();
-
-      // erase duplicate uint64 feasigns
-      std::sort(merge_uint64_feasigns.begin(), merge_uint64_feasigns.end(),
-                sort_cmp_uint64);
-      merge_uint64_feasigns.erase(
-          std::unique(merge_uint64_feasigns.begin(),
-                      merge_uint64_feasigns.end(), unique_eq_uint64),
-          merge_uint64_feasigns.end());
-      rec.uint64_feasigns_.insert(rec.uint64_feasigns_.end(),
-                                  merge_uint64_feasigns.begin(),
-                                  merge_uint64_feasigns.end());
-      rec.uint64_feasigns_.insert(rec.uint64_feasigns_.end(),
-                                  not_merge_uint64_feasigns.begin(),
-                                  not_merge_uint64_feasigns.end());
-
-      // erase duplicate float feasigns
-      std::sort(merge_float_feasigns.begin(), merge_float_feasigns.end(),
-                sort_cmp_float);
-      merge_float_feasigns.erase(
-          std::unique(merge_float_feasigns.begin(), merge_float_feasigns.end(),
-                      unique_eq_float),
-          merge_float_feasigns.end());
-      rec.float_feasigns_.insert(rec.float_feasigns_.end(),
-                                 merge_float_feasigns.begin(),
-                                 merge_float_feasigns.end());
-      rec.float_feasigns_.insert(rec.float_feasigns_.end(),
-                                 not_merge_float_feasigns.begin(),
-                                 not_merge_float_feasigns.end());
-    }
-    results.push_back(rec);
-  }
-  VLOG(3) << "results size " << results.size();
-  results.shrink_to_fit();
-
-  auto fleet_ptr = FleetWrapper::GetInstance();
-  std::shuffle(results.begin(), results.end(), fleet_ptr->LocalRandomEngine());
-  channel_data->Open();
-  channel_data->Write(std::move(results));
-  channel_data->Close();
-  results.clear();
-  results.shrink_to_fit();
-  VLOG(3) << "channel data size " << channel_data->Size();
-  channel_data->SetBlockSize(channel_data->Size() / channel_num_ + 1);
-  VLOG(3) << "channel data block size " << channel_data->BlockSize();
-  for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-    std::vector<Record> vec_data;
-    channel_data->Read(vec_data);
-    multi_output_channel_[i]->Open();
-    multi_output_channel_[i]->Write(std::move(vec_data));
-    vec_data.clear();
-    vec_data.shrink_to_fit();
-  }
-  CHECK(channel_data->Size() == 0);  // NOLINT
-  channel_data->Clear();
-  VLOG(3) << "MultiSlotDataset::MergeByInsId end";
-}
-
-void MultiSlotDataset::GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                                     std::vector<Record>* result) {
-  int debug_erase_cnt = 0;
-  int debug_push_cnt = 0;
-  auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
-  slots_shuffle_rclist_.ReInit();
-  for (const auto& rec : slots_shuffle_original_data_) {
-    RecordCandidate rand_rec;
-    Record new_rec = rec;
-    slots_shuffle_rclist_.AddAndGet(rec, &rand_rec);
-    for (auto it = new_rec.uint64_feasigns_.begin();
-         it != new_rec.uint64_feasigns_.end();) {
-      if (slots_to_replace.find(it->slot()) != slots_to_replace.end()) {
-        it = new_rec.uint64_feasigns_.erase(it);
-        debug_erase_cnt += 1;
-      } else {
-        ++it;
-      }
-    }
-    for (auto slot : slots_to_replace) {
-      auto range = rand_rec.feas.equal_range(slot);
-      for (auto it = range.first; it != range.second; ++it) {
-        new_rec.uint64_feasigns_.push_back({it->second, it->first});
-        debug_push_cnt += 1;
-      }
-    }
-    result->push_back(std::move(new_rec));
-  }
-  VLOG(2) << "erase feasign num: " << debug_erase_cnt
-          << " repush feasign num: " << debug_push_cnt;
-}
-
-// slots shuffle to input_channel_ with needed-shuffle slots
-void MultiSlotDataset::SlotsShuffle(
-    const std::set<std::string>& slots_to_replace) {
-  int out_channel_size = 0;
-  if (cur_channel_ == 0) {
-    for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-      out_channel_size += multi_output_channel_[i]->Size();
-    }
-  } else {
-    for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
-      out_channel_size += multi_consume_channel_[i]->Size();
-    }
-  }
-  VLOG(2) << "DatasetImpl<T>::SlotsShuffle() begin with input channel size: "
-          << input_channel_->Size()
-          << " output channel size: " << out_channel_size;
-  if (!slots_shuffle_fea_eval_) {
-    VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end,"
-               "fea eval mode off, need to set on for slots shuffle";
-    return;
-  }
-  if ((!input_channel_ || input_channel_->Size() == 0) &&
-      slots_shuffle_original_data_.size() == 0 && out_channel_size == 0) {
-    VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end, no data to slots shuffle";
-    return;
-  }
-  platform::Timer timeline;
-  timeline.Start();
-  auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
-  std::set<uint16_t> index_slots;
-  for (size_t i = 0; i < multi_slot_desc.slots_size(); ++i) {
-    std::string cur_slot = multi_slot_desc.slots(i).name();
-    if (slots_to_replace.find(cur_slot) != slots_to_replace.end()) {
-      index_slots.insert(i);
-    }
-  }
-  if (slots_shuffle_original_data_.size() == 0) {
-    // before first slots shuffle, instances could be in
-    // input_channel, oupput_channel or consume_channel
-    if (input_channel_ && input_channel_->Size() != 0) {
-      slots_shuffle_original_data_.reserve(input_channel_->Size());
-      input_channel_->Close();
-      input_channel_->ReadAll(slots_shuffle_original_data_);
-    } else {
-      CHECK(out_channel_size > 0);  // NOLINT
-      if (cur_channel_ == 0) {
-        for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-          std::vector<Record> vec_data;
-          multi_output_channel_[i]->Close();
-          multi_output_channel_[i]->ReadAll(vec_data);
-          slots_shuffle_original_data_.reserve(
-              slots_shuffle_original_data_.size() + vec_data.size());
-          slots_shuffle_original_data_.insert(
-              slots_shuffle_original_data_.end(),
-              std::make_move_iterator(vec_data.begin()),
-              std::make_move_iterator(vec_data.end()));
-          vec_data.clear();
-          vec_data.shrink_to_fit();
-          multi_output_channel_[i]->Clear();
-        }
-      } else {
-        for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
-          std::vector<Record> vec_data;
-          multi_consume_channel_[i]->Close();
-          multi_consume_channel_[i]->ReadAll(vec_data);
-          slots_shuffle_original_data_.reserve(
-              slots_shuffle_original_data_.size() + vec_data.size());
-          slots_shuffle_original_data_.insert(
-              slots_shuffle_original_data_.end(),
-              std::make_move_iterator(vec_data.begin()),
-              std::make_move_iterator(vec_data.end()));
-          vec_data.clear();
-          vec_data.shrink_to_fit();
-          multi_consume_channel_[i]->Clear();
-        }
-      }
-    }
-  } else {
-    // if already have original data for slots shuffle, clear channel
-    input_channel_->Clear();
-    if (cur_channel_ == 0) {
-      for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-        if (!multi_output_channel_[i]) {
-          continue;
-        }
-        multi_output_channel_[i]->Clear();
-      }
-    } else {
-      for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
-        if (!multi_consume_channel_[i]) {
-          continue;
-        }
-        multi_consume_channel_[i]->Clear();
-      }
-    }
-  }
-  int end_size = 0;
-  if (cur_channel_ == 0) {
-    for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-      if (!multi_output_channel_[i]) {
-        continue;
-      }
-      end_size += multi_output_channel_[i]->Size();
-    }
-  } else {
-    for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
-      if (!multi_consume_channel_[i]) {
-        continue;
-      }
-      end_size += multi_consume_channel_[i]->Size();
-    }
-  }
-  CHECK(input_channel_->Size() == 0)
-      << "input channel should be empty before slots shuffle";
-  std::vector<Record> random_data;
-  random_data.clear();
-  // get slots shuffled random_data
-  GetRandomData(index_slots, &random_data);
-  input_channel_->Open();
-  input_channel_->Write(std::move(random_data));
-  random_data.clear();
-  random_data.shrink_to_fit();
-  input_channel_->Close();
-
-  timeline.Pause();
-  VLOG(2) << "DatasetImpl<T>::SlotsShuffle() end"
-          << ", memory data size for slots shuffle=" << input_channel_->Size()
-          << ", cost time=" << timeline.ElapsedSec() << " seconds";
-}
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h
deleted file mode 100644
index bcf344d23a4553d95018081c74f303123632f4c8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_set.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- *     Unless required by applicable law or agreed to in writing, software
- *     distributed under the License is distributed on an "AS IS" BASIS,
- *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *     See the License for the specific language governing permissions and
- *     limitations under the License. */
-
-#pragma once
-
-#include <fstream>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <set>
-#include <string>
-#include <thread>  // NOLINT
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/data_feed.h"
-
-namespace paddle {
-namespace framework {
-
-// Dataset is a abstract class, which defines user interfaces
-// Example Usage:
-//    Dataset* dataset = DatasetFactory::CreateDataset("InMemoryDataset")
-//    dataset->SetFileList(std::vector<std::string>{"a.txt", "b.txt"})
-//    dataset->SetThreadNum(1)
-//    dataset->CreateReaders();
-//    dataset->SetDataFeedDesc(your_data_feed_desc);
-//    dataset->LoadIntoMemory();
-//    dataset->SetTrainerNum(2);
-//    dataset->GlobalShuffle();
-class Dataset {
- public:
-  Dataset() {}
-  virtual ~Dataset() {}
-  // set file list
-  virtual void SetFileList(const std::vector<std::string>& filelist) = 0;
-  // set readers' num
-  virtual void SetThreadNum(int thread_num) = 0;
-  // set workers' num
-  virtual void SetTrainerNum(int trainer_num) = 0;
-  // set fleet send batch size
-  virtual void SetFleetSendBatchSize(int64_t size) = 0;
-  // set fs name and ugi
-  virtual void SetHdfsConfig(const std::string& fs_name,
-                             const std::string& fs_ugi) = 0;
-  // set data fedd desc, which contains:
-  //   data feed name, batch size, slots
-  virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0;
-  // set channel num
-  virtual void SetChannelNum(int channel_num) = 0;
-  // set parse ins id
-  virtual void SetParseInsId(bool parse_ins_id) = 0;
-  virtual void SetParseContent(bool parse_content) = 0;
-  // set merge by ins id
-  virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
-                               bool erase_duplicate_feas, int min_merge_size,
-                               bool keep_unmerged_ins) = 0;
-  // set fea eval mode
-  virtual void SetFeaEval(bool fea_eval, int record_candidate_size) = 0;
-  // get file list
-  virtual const std::vector<std::string>& GetFileList() = 0;
-  // get thread num
-  virtual int GetThreadNum() = 0;
-  // get worker num
-  virtual int GetTrainerNum() = 0;
-  // get fleet send batch size
-  virtual int64_t GetFleetSendBatchSize() = 0;
-  // get hdfs config
-  virtual std::pair<std::string, std::string> GetHdfsConfig() = 0;
-  // get data fedd desc
-  virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() = 0;
-  // get channel num
-  virtual int GetChannelNum() = 0;
-  // get readers, the reader num depend both on thread num
-  // and filelist size
-  virtual std::vector<paddle::framework::DataFeed*> GetReaders() = 0;
-  // create input channel and output channel
-  virtual void CreateChannel() = 0;
-  // register message handler between workers
-  virtual void RegisterClientToClientMsgHandler() = 0;
-  // load all data into memory
-  virtual void LoadIntoMemory() = 0;
-  // load all data into memory in async mode
-  virtual void PreLoadIntoMemory() = 0;
-  // wait async load done
-  virtual void WaitPreLoadDone() = 0;
-  // release all memory data
-  virtual void ReleaseMemory() = 0;
-  // local shuffle data
-  virtual void LocalShuffle() = 0;
-  // global shuffle data
-  virtual void GlobalShuffle(int thread_num = -1) = 0;
-  // for slots shuffle
-  virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) = 0;
-  virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                             std::vector<Record>* result) = 0;
-  // create readers
-  virtual void CreateReaders() = 0;
-  // destroy readers
-  virtual void DestroyReaders() = 0;
-  // get memory data size
-  virtual int64_t GetMemoryDataSize() = 0;
-  // get shuffle data size
-  virtual int64_t GetShuffleDataSize() = 0;
-  // merge by ins id
-  virtual void MergeByInsId() = 0;
-  // create preload readers
-  virtual void CreatePreLoadReaders() = 0;
-  // destroy preload readers after prelaod done
-  virtual void DestroyPreLoadReaders() = 0;
-  // set preload thread num
-  virtual void SetPreLoadThreadNum(int thread_num) = 0;
-  // seperate train thread and dataset thread
-  virtual void DynamicAdjustChannelNum(int channel_num) = 0;
-  virtual void DynamicAdjustReadersNum(int thread_num) = 0;
-  // set fleet send sleep seconds
-  virtual void SetFleetSendSleepSeconds(int seconds) = 0;
-
- protected:
-  virtual int ReceiveFromClient(int msg_type, int client_id,
-                                const std::string& msg) = 0;
-};
-
-// DatasetImpl is the implementation of Dataset,
-// it holds memory data if user calls load_into_memory
-template <typename T>
-class DatasetImpl : public Dataset {
- public:
-  DatasetImpl();
-  virtual ~DatasetImpl() {}
-
-  virtual void SetFileList(const std::vector<std::string>& filelist);
-  virtual void SetThreadNum(int thread_num);
-  virtual void SetTrainerNum(int trainer_num);
-  virtual void SetFleetSendBatchSize(int64_t size);
-  virtual void SetHdfsConfig(const std::string& fs_name,
-                             const std::string& fs_ugi);
-  virtual void SetDataFeedDesc(const std::string& data_feed_desc_str);
-  virtual void SetChannelNum(int channel_num);
-  virtual void SetParseInsId(bool parse_ins_id);
-  virtual void SetParseContent(bool parse_content);
-  virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
-                               bool erase_duplicate_feas, int min_merge_size,
-                               bool keep_unmerged_ins);
-
-  virtual void SetFeaEval(bool fea_eval, int record_candidate_size);
-  virtual const std::vector<std::string>& GetFileList() { return filelist_; }
-  virtual int GetThreadNum() { return thread_num_; }
-  virtual int GetTrainerNum() { return trainer_num_; }
-  virtual Channel<T> GetInputChannel() { return input_channel_; }
-  virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; }
-  virtual std::pair<std::string, std::string> GetHdfsConfig() {
-    return std::make_pair(fs_name_, fs_ugi_);
-  }
-  virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() {
-    return data_feed_desc_;
-  }
-  virtual int GetChannelNum() { return channel_num_; }
-  virtual std::vector<paddle::framework::DataFeed*> GetReaders();
-  virtual void CreateChannel();
-  virtual void RegisterClientToClientMsgHandler();
-  virtual void LoadIntoMemory();
-  virtual void PreLoadIntoMemory();
-  virtual void WaitPreLoadDone();
-  virtual void ReleaseMemory();
-  virtual void LocalShuffle();
-  virtual void GlobalShuffle(int thread_num = -1);
-  virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) {}
-  virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                             std::vector<Record>* result) {}
-  virtual void CreateReaders();
-  virtual void DestroyReaders();
-  virtual int64_t GetMemoryDataSize();
-  virtual int64_t GetShuffleDataSize();
-  virtual void MergeByInsId() {}
-  virtual void CreatePreLoadReaders();
-  virtual void DestroyPreLoadReaders();
-  virtual void SetPreLoadThreadNum(int thread_num);
-  virtual void DynamicAdjustChannelNum(int channel_num);
-  virtual void DynamicAdjustReadersNum(int thread_num);
-  virtual void SetFleetSendSleepSeconds(int seconds);
-
- protected:
-  virtual int ReceiveFromClient(int msg_type, int client_id,
-                                const std::string& msg);
-  std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers_;
-  std::vector<std::shared_ptr<paddle::framework::DataFeed>> preload_readers_;
-  paddle::framework::Channel<T> input_channel_;
-  int channel_num_;
-  std::vector<paddle::framework::Channel<T>> multi_output_channel_;
-  std::vector<paddle::framework::Channel<T>> multi_consume_channel_;
-  // when read ins, we put ins from one channel to the other,
-  // and when finish reading, we set cur_channel = 1 - cur_channel,
-  // so if cur_channel=0, all data are in output_channel, else consume_channel
-  int cur_channel_;
-  std::vector<T> slots_shuffle_original_data_;
-  RecordCandidateList slots_shuffle_rclist_;
-  int thread_num_;
-  paddle::framework::DataFeedDesc data_feed_desc_;
-  int trainer_num_;
-  std::vector<std::string> filelist_;
-  size_t file_idx_;
-  std::mutex mutex_for_pick_file_;
-  std::string fs_name_;
-  std::string fs_ugi_;
-  int64_t fleet_send_batch_size_;
-  int64_t fleet_send_sleep_seconds_;
-  std::vector<std::thread> preload_threads_;
-  bool merge_by_insid_;
-  bool parse_ins_id_;
-  bool parse_content_;
-  bool erase_duplicate_feas_;
-  bool keep_unmerged_ins_;
-  int min_merge_size_;
-  std::vector<std::string> merge_slots_list_;
-  bool slots_shuffle_fea_eval_ = false;
-  int preload_thread_num_;
-  std::mutex global_index_mutex_;
-  int64_t global_index_ = 0;
-};
-
-// use std::vector<MultiSlotType> or Record as data type
-class MultiSlotDataset : public DatasetImpl<Record> {
- public:
-  MultiSlotDataset() {}
-  virtual void MergeByInsId();
-  virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace);
-  virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                             std::vector<Record>* result);
-  virtual ~MultiSlotDataset() {}
-};
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
deleted file mode 100644
index 82872224501709080ff02a13464d58543a0abda8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_transform.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_transform.h"
-
-#include "paddle/fluid/framework/data_device_transform.h"
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-
-static void PassTensorData(Tensor *from, Tensor *to) {
-  to->ShareDataWith(*from);
-  *from = Tensor();
-}
-
-void TransformData(const OpKernelType &expected_kernel_type,
-                   const OpKernelType &kernel_type_for_var,
-                   const Tensor &input_tensor, Tensor *output_tensor) {
-  bool transformed = false;
-  Tensor in;
-  in.ShareDataWith(input_tensor);
-  Tensor out;
-  DataLayout lin = kernel_type_for_var.data_layout_;
-  DataLayout lout = expected_kernel_type.data_layout_;
-
-  // do layout transform
-  if (NeedTransformLayout(lout, lin)) {
-    if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) {
-      PADDLE_ENFORCE(
-          !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN),
-          "No layout transform needed between two MKLDNN OPKernels");
-
-      if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) {
-#ifdef PADDLE_WITH_MKLDNN
-        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
-        // Just set layout/format. No real transform occur
-
-        auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
-                                                        ToMKLDNNFormat(lin));
-
-        out.ShareDataWith(input_tensor);
-        out.set_layout(DataLayout::kMKLDNN);
-        out.set_format(out_format);
-#endif
-      } else {
-        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
-        // Do transform via MKLDNN lib
-        TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in,
-                                  &out);
-      }
-    } else {
-      // Case3 - transfrom between Non-MKLDNN OPKernels
-      TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
-    }
-    transformed = true;
-    PassTensorData(&out, &in);
-  }
-
-  // do data type transform
-  if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
-    TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
-    transformed = true;
-    PassTensorData(&out, &in);
-  }
-
-  // do device transform
-  if (!platform::is_same_place(kernel_type_for_var.place_,
-                               expected_kernel_type.place_)) {
-    TransDataDevice(in, expected_kernel_type.place_, &out);
-    transformed = true;
-    PassTensorData(&out, &in);
-  }
-
-  PADDLE_ENFORCE(transformed, "No transform is applied, please check!");
-  // get output data
-  output_tensor->ShareDataWith(in);
-}
-
-void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
-                         Variable *out_var) {
-  if (in_var.IsType<LoDTensor>()) {
-    auto &in_lod_tensor = in_var.Get<LoDTensor>();
-    auto *tran_lod_tensor = out_var->GetMutable<LoDTensor>();
-    tran_lod_tensor->set_lod(in_lod_tensor.lod());
-    tran_lod_tensor->set_layout(in_lod_tensor.layout());
-    tran_lod_tensor->ShareDataWith(tensor);
-  } else if (in_var.IsType<SelectedRows>()) {
-    auto &in_selected_rows = in_var.Get<SelectedRows>();
-    auto *trans_selected_rows = out_var->GetMutable<SelectedRows>();
-    trans_selected_rows->set_height(in_selected_rows.height());
-    trans_selected_rows->set_rows(in_selected_rows.rows());
-    trans_selected_rows->mutable_value()->ShareDataWith(tensor);
-  } else {
-    PADDLE_THROW("unknown var type");
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h
deleted file mode 100644
index ae3ab051bda2e698801cc6fe6e3ddddf039f5385..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_transform.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/op_kernel_type.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/macros.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace framework {
-
-void TransformData(const OpKernelType &expected_kernel_type,
-                   const OpKernelType &kernel_type_for_var,
-                   const Tensor &input_tensor, Tensor *out);
-
-/**
- * Set OutVar from InVar, except the tensor is shared with `tensor`
- */
-void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
-                         Variable *out_var);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
deleted file mode 100644
index a0248cf3c75690fb9ec3fcc22596af245d042d80..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_type.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/data_type.h"
-#include <stdint.h>
-#include <string>
-#include <unordered_map>
-
-using float16 = paddle::platform::float16;
-
-namespace paddle {
-namespace framework {
-
-struct DataTypeMap {
-  std::unordered_map<std::type_index, proto::VarType::Type> cpp_to_proto_;
-  std::unordered_map<int, std::type_index> proto_to_cpp_;
-  std::unordered_map<int, std::string> proto_to_str_;
-  std::unordered_map<int, size_t> proto_to_size_;
-};
-
-static DataTypeMap* InitDataTypeMap();
-// C++11 removes the need for manual locking. Concurrent execution shall wait if
-// a static local variable is already being initialized.
-// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
-static DataTypeMap& gDataTypeMap() {
-  static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
-  return *g_data_type_map_;
-}
-
-template <typename T>
-static inline void RegisterType(DataTypeMap* map,
-                                proto::VarType::Type proto_type,
-                                const std::string& name) {
-  map->proto_to_cpp_.emplace(static_cast<int>(proto_type), typeid(T));
-  map->cpp_to_proto_.emplace(typeid(T), proto_type);
-  map->proto_to_str_.emplace(static_cast<int>(proto_type), name);
-  map->proto_to_size_.emplace(static_cast<int>(proto_type), sizeof(T));
-}
-
-static DataTypeMap* InitDataTypeMap() {
-  auto retv = new DataTypeMap();
-
-#define RegType(cc_type, proto_type) \
-  RegisterType<cc_type>(retv, proto_type, #cc_type)
-
-  _ForEachDataType_(RegType);
-
-#undef RegType
-  return retv;
-}
-
-proto::VarType::Type ToDataType(std::type_index type) {
-  auto it = gDataTypeMap().cpp_to_proto_.find(type);
-  if (it != gDataTypeMap().cpp_to_proto_.end()) {
-    return it->second;
-  }
-  PADDLE_THROW("Not support %s as tensor type", type.name());
-}
-
-std::type_index ToTypeIndex(proto::VarType::Type type) {
-  auto it = gDataTypeMap().proto_to_cpp_.find(static_cast<int>(type));
-  if (it != gDataTypeMap().proto_to_cpp_.end()) {
-    return it->second;
-  }
-  PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type",
-               static_cast<int>(type));
-}
-
-std::string DataTypeToString(const proto::VarType::Type type) {
-  auto it = gDataTypeMap().proto_to_str_.find(static_cast<int>(type));
-  if (it != gDataTypeMap().proto_to_str_.end()) {
-    return it->second;
-  }
-  PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type",
-               static_cast<int>(type));
-}
-
-size_t SizeOfType(proto::VarType::Type type) {
-  auto it = gDataTypeMap().proto_to_size_.find(static_cast<int>(type));
-  if (it != gDataTypeMap().proto_to_size_.end()) {
-    return it->second;
-  }
-  PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type));
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
deleted file mode 100644
index 60644820df7cd4133c5fd8f24fe693245d68a5f3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_type.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <typeindex>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-struct DataTypeTrait {};
-
-// Stub handle for void
-template <>
-struct DataTypeTrait<void> {
-  constexpr static proto::VarType::Type DataType() {
-    return proto::VarType::RAW;
-  }
-};
-
-#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \
-  callback(cpp_type, ::paddle::framework::proto::VarType::proto_type);
-
-#define _ForEachDataType_(callback)                                     \
-  _ForEachDataTypeHelper_(callback, float, FP32);                       \
-  _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \
-  _ForEachDataTypeHelper_(callback, double, FP64);                      \
-  _ForEachDataTypeHelper_(callback, int, INT32);                        \
-  _ForEachDataTypeHelper_(callback, int64_t, INT64);                    \
-  _ForEachDataTypeHelper_(callback, bool, BOOL);                        \
-  _ForEachDataTypeHelper_(callback, uint8_t, UINT8);                    \
-  _ForEachDataTypeHelper_(callback, int16_t, INT16);                    \
-  _ForEachDataTypeHelper_(callback, int8_t, INT8)
-
-#define DefineDataTypeTrait(cpp_type, proto_type)                           \
-  template <>                                                               \
-  struct DataTypeTrait<cpp_type> {                                          \
-    constexpr static proto::VarType::Type DataType() { return proto_type; } \
-  }
-
-_ForEachDataType_(DefineDataTypeTrait);
-
-#undef DefineDataTypeTrait
-
-extern proto::VarType::Type ToDataType(std::type_index type);
-extern std::type_index ToTypeIndex(proto::VarType::Type type);
-
-template <typename Visitor>
-inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
-#define VisitDataTypeCallback(cpp_type, proto_type) \
-  do {                                              \
-    if (type == proto_type) {                       \
-      visitor.template apply<cpp_type>();           \
-      return;                                       \
-    }                                               \
-  } while (0)
-
-  _ForEachDataType_(VisitDataTypeCallback);
-#undef VisitDataTypeCallback
-  PADDLE_THROW("Not supported %d", type);
-}
-
-extern std::string DataTypeToString(const proto::VarType::Type type);
-extern size_t SizeOfType(proto::VarType::Type type);
-inline std::ostream& operator<<(std::ostream& out,
-                                const proto::VarType::Type& type) {
-  out << DataTypeToString(type);
-  return out;
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc
deleted file mode 100644
index 2a380201f297f42dd82a6809bef9a72660066819..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_type_test.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/data_type.h"
-
-#include <string>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/tensor.h"
-
-TEST(DataType, float16) {
-  using paddle::framework::Tensor;
-  using paddle::platform::CPUPlace;
-  using paddle::platform::float16;
-  namespace f = paddle::framework;
-  f::proto::VarType::Type dtype = f::proto::VarType::FP16;
-
-  Tensor tensor;
-  CPUPlace cpu;
-  tensor.mutable_data(cpu, dtype);
-
-  // test fp16 tensor
-  EXPECT_EQ(tensor.type(), f::ToDataType(typeid(float16)));
-
-  // test fp16 size
-  EXPECT_EQ(f::SizeOfType(dtype), 2u);
-
-  // test debug info
-  std::string type = "::paddle::platform::float16";
-  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
-}
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
deleted file mode 100644
index d79f8cacb5f4727defc77380371e57bcea65f068..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_type_transform.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_type_transform.h"
-
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename InType, typename OutType>
-struct CastDataTypeFunctor {
-  HOSTDEVICE inline OutType operator()(InType in) const {
-    return static_cast<OutType>(in);
-  }
-};
-
-template <typename InType>
-struct CastDataType {
-  CastDataType(const framework::Tensor& in, framework::Tensor* out,
-               const platform::DeviceContext* ctx)
-      : in_(in), out_(out), ctx_(ctx) {}
-  const framework::Tensor in_;
-  framework::Tensor* out_;
-  const platform::DeviceContext* ctx_;
-
-  template <typename OutType>
-  void apply() {
-    auto* in_begin = in_.data<InType>();
-    auto* in_end = in_begin + in_.numel();
-    auto* out_begin = out_->mutable_data<OutType>(in_.place());
-
-    if (platform::is_cpu_place(in_.place())) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
-      trans(*context, in_begin, in_end, out_begin,
-            CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
-    } else if (platform::is_gpu_place(in_.place())) {
-      platform::Transform<platform::CUDADeviceContext> trans;
-      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
-      trans(*context, in_begin, in_end, out_begin,
-            CastDataTypeFunctor<InType, OutType>());
-      context->Wait();
-#endif
-    } else {
-      PADDLE_THROW("Unsupported place!");
-    }
-  }
-};
-
-void TransDataType(const OpKernelType& kernel_type_for_var,
-                   const OpKernelType& expected_kernel_type, const Tensor& in,
-                   Tensor* out) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-
-  out->Resize(in.dims());
-  auto src_type = kernel_type_for_var.data_type_;
-  auto dst_type = expected_kernel_type.data_type_;
-  auto ctx = pool.Get(in.place());
-
-  switch (src_type) {
-    case proto::VarType::FP16:
-      framework::VisitDataType(dst_type,
-                               CastDataType<platform::float16>(in, out, ctx));
-      break;
-    case proto::VarType::FP32:
-      framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
-      break;
-    case proto::VarType::FP64:
-      framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
-      break;
-    case proto::VarType::INT32:
-      framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
-      break;
-    case proto::VarType::INT64:
-      framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
-      break;
-    case proto::VarType::BOOL:
-      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
-      break;
-    case proto::VarType::INT16:
-      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
-      break;
-    case proto::VarType::UINT8:
-      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
-      break;
-    default:
-      PADDLE_THROW("Not support type %d", src_type);
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu
deleted file mode 120000
index f46491293ef4ad688c1bce9327f5f28011dec809..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_type_transform.cu
+++ /dev/null
@@ -1 +0,0 @@
-data_type_transform.cc
\ No newline at end of file
diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h
deleted file mode 100644
index 1c281b03ed61ac70e16a43d75a79854bdafd8836..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_type_transform.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <utility>
-#include "paddle/fluid/framework/op_kernel_type.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-
-using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
-
-void TransDataType(const OpKernelType& kernel_type_for_var,
-                   const OpKernelType& expected_kernel_type, const Tensor& in,
-                   Tensor* out);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc
deleted file mode 100644
index bbebea9f13fd37469a0e9b7be9719aca128f5687..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_type_transform.h"
-
-#include "gtest/gtest.h"
-
-TEST(DataTypeTransform, CPUTransform) {
-  auto place = paddle::platform::CPUPlace();
-
-  auto kernel_fp16 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::FP16, place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_fp32 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::FP32, place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_fp64 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::FP64, place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_int32 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::INT32, place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_int64 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::INT64, place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_bool = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::BOOL, place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  // data type transform from float32
-  {
-    paddle::framework::Tensor in;
-    paddle::framework::Tensor out;
-
-    float* ptr =
-        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
-    int data_number = 2 * 3;
-
-    for (int i = 0; i < data_number; ++i) {
-      ptr[i] = i / 3;
-    }
-
-    paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in, &out);
-    double* out_data_double = out.data<double>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
-    }
-
-    paddle::framework::TransDataType(kernel_fp32, kernel_int32, in, &out);
-    int* out_data_int = out.data<int>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
-    }
-  }
-
-  // data type transform from/to float16
-  {
-    paddle::framework::Tensor in;
-    paddle::framework::Tensor out;
-
-    paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
-        paddle::framework::make_ddim({2, 3}), place);
-    int data_number = 2 * 3;
-
-    for (int i = 0; i < data_number; ++i) {
-      ptr[i] = i;
-    }
-
-    // transform from float16 to other data types
-    paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in, &out);
-    float* out_data_float = out.data<float>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
-    }
-
-    paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in, &out);
-    double* out_data_double = out.data<double>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
-    }
-
-    paddle::framework::TransDataType(kernel_fp16, kernel_int32, in, &out);
-    int* out_data_int = out.data<int>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
-    }
-
-    paddle::framework::TransDataType(kernel_fp16, kernel_int64, in, &out);
-    int64_t* out_data_int64 = out.data<int64_t>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
-    }
-
-    paddle::framework::TransDataType(kernel_fp16, kernel_bool, in, &out);
-    bool* out_data_bool = out.data<bool>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
-    }
-
-    // transform float to float16
-    float* in_data_float =
-        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_float[i] = i;
-    }
-
-    paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in, &out);
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_float[i]).x);
-    }
-
-    // transform double to float16
-    double* in_data_double =
-        in.mutable_data<double>(paddle::framework::make_ddim({2, 3}), place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_double[i] = i;
-    }
-
-    paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in, &out);
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_double[i]).x);
-    }
-
-    // transform int to float16
-    int* in_data_int =
-        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_int[i] = i;
-    }
-
-    paddle::framework::TransDataType(kernel_int32, kernel_fp16, in, &out);
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_int[i]).x);
-    }
-
-    // transform int64 to float16
-    int64_t* in_data_int64 =
-        in.mutable_data<int64_t>(paddle::framework::make_ddim({2, 3}), place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_int64[i] = i;
-    }
-
-    paddle::framework::TransDataType(kernel_int64, kernel_fp16, in, &out);
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_int64[i]).x);
-    }
-
-    // transform bool to float16
-    bool* in_data_bool =
-        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_bool[i] = i;
-    }
-
-    paddle::framework::TransDataType(kernel_bool, kernel_fp16, in, &out);
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_bool[i]).x);
-    }
-  }
-}
diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu
deleted file mode 100644
index 0874509a8797cd2ff1b1fcb347b4ef3b74a39047..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/data_type_transform_test.cu
+++ /dev/null
@@ -1,261 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/tensor_util.h"
-
-#include "gtest/gtest.h"
-
-TEST(DataTypeTransform, GPUTransform) {
-  auto cpu_place = paddle::platform::CPUPlace();
-  auto gpu_place = paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  auto kernel_fp16 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::FP16, gpu_place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_fp32 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::FP32, gpu_place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_fp64 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::FP64, gpu_place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_int32 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::INT32, gpu_place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_int64 = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::INT64, gpu_place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  auto kernel_bool = paddle::framework::OpKernelType(
-      paddle::framework::proto::VarType::BOOL, gpu_place,
-      paddle::framework::DataLayout::kAnyLayout,
-      paddle::framework::LibraryType::kPlain);
-
-  // data type transform from float32
-  {
-    paddle::framework::Tensor in;
-    paddle::framework::Tensor in_gpu;
-    paddle::framework::Tensor out_gpu;
-    paddle::framework::Tensor out;
-
-    float* in_ptr =
-        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
-    float arr[6] = {0, 1, 2, 3, 4, 5};
-    int data_number = sizeof(arr) / sizeof(arr[0]);
-    memcpy(in_ptr, arr, sizeof(arr));
-
-    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
-    context.Wait();
-    paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    double* out_data_double = out.data<double>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
-    }
-
-    paddle::framework::TransDataType(kernel_fp32, kernel_int32, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    int* out_data_int = out.data<int>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_int[i], static_cast<int>(arr[i]));
-    }
-  }
-
-  // data type transform from/to float16
-  {
-    paddle::framework::Tensor in;
-    paddle::framework::Tensor in_gpu;
-    paddle::framework::Tensor out_gpu;
-    paddle::framework::Tensor out;
-
-    paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
-        paddle::framework::make_ddim({2, 3}), cpu_place);
-    paddle::platform::float16 arr[6] = {
-        paddle::platform::float16(0), paddle::platform::float16(1),
-        paddle::platform::float16(2), paddle::platform::float16(3),
-        paddle::platform::float16(4), paddle::platform::float16(5)};
-
-    int data_number = sizeof(arr) / sizeof(arr[0]);
-    memcpy(ptr, arr, sizeof(arr));
-    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
-    context.Wait();
-
-    // transform from float16 to other data types
-    paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    float* out_data_float = out.data<float>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
-    }
-
-    paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    double* out_data_double = out.data<double>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
-    }
-
-    paddle::framework::TransDataType(kernel_fp16, kernel_int32, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    int* out_data_int = out.data<int>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
-    }
-
-    paddle::framework::TransDataType(kernel_fp16, kernel_int64, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    int64_t* out_data_int64 = out.data<int64_t>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
-    }
-
-    paddle::framework::TransDataType(kernel_fp16, kernel_bool, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    bool* out_data_bool = out.data<bool>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
-    }
-
-    // transform float to float16
-    float* in_data_float =
-        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_float[i] = i;
-    }
-
-    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
-    context.Wait();
-    paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_float[i]).x);
-    }
-
-    // transform double to float16
-    double* in_data_double = in.mutable_data<double>(
-        paddle::framework::make_ddim({2, 3}), cpu_place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_double[i] = i;
-    }
-
-    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
-    context.Wait();
-    paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_double[i]).x);
-    }
-
-    // transform int to float16
-    int* in_data_int =
-        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), cpu_place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_int[i] = i;
-    }
-
-    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
-    context.Wait();
-    paddle::framework::TransDataType(kernel_int32, kernel_fp16, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_int[i]).x);
-    }
-
-    // transform int64 to float16
-    int64_t* in_data_int64 = in.mutable_data<int64_t>(
-        paddle::framework::make_ddim({2, 3}), cpu_place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_int64[i] = i;
-    }
-
-    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
-    context.Wait();
-    paddle::framework::TransDataType(kernel_int64, kernel_fp16, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_int64[i]).x);
-    }
-
-    // transform bool to float16
-    bool* in_data_bool =
-        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), cpu_place);
-    for (int i = 0; i < data_number; ++i) {
-      in_data_bool[i] = i;
-    }
-
-    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
-    context.Wait();
-    paddle::framework::TransDataType(kernel_bool, kernel_fp16, in_gpu,
-                                     &out_gpu);
-    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
-    context.Wait();
-
-    ptr = out.data<paddle::platform::float16>();
-    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x,
-                static_cast<paddle::platform::float16>(in_data_bool[i]).x);
-    }
-  }
-}
diff --git a/paddle/fluid/framework/dataset_factory.cc b/paddle/fluid/framework/dataset_factory.cc
deleted file mode 100644
index 3a28c101d48342ef639956a974d59aee8ae42ed6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/dataset_factory.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/dataset_factory.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "paddle/fluid/framework/data_set.h"
-
-namespace paddle {
-namespace framework {
-typedef std::unique_ptr<Dataset> (*CreateDatasetFunction)();
-typedef std::unordered_map<std::string, CreateDatasetFunction> datasetMap;
-datasetMap g_dataset_map;
-
-#define REGISTER_DATASET_CLASS(dataset_class)                   \
-  namespace {                                                   \
-  std::unique_ptr<Dataset> Creator_##dataset_class() {          \
-    return std::unique_ptr<Dataset>(new dataset_class);         \
-  }                                                             \
-  class __Registerer_##dataset_class {                          \
-   public:                                                      \
-    __Registerer_##dataset_class() {                            \
-      g_dataset_map[#dataset_class] = &Creator_##dataset_class; \
-    }                                                           \
-  };                                                            \
-  __Registerer_##dataset_class g_registerer_##dataset_class;    \
-  }  // namespace
-
-std::string DatasetFactory::DatasetTypeList() {
-  std::string dataset_types;
-  for (auto iter = g_dataset_map.begin(); iter != g_dataset_map.end(); ++iter) {
-    if (iter != g_dataset_map.begin()) {
-      dataset_types += ", ";
-    }
-    dataset_types += iter->first;
-  }
-  return dataset_types;
-}
-
-std::unique_ptr<Dataset> DatasetFactory::CreateDataset(
-    std::string dataset_class) {
-  if (g_dataset_map.count(dataset_class) < 1) {
-    LOG(WARNING) << "Your Dataset " << dataset_class
-                 << "is not supported currently";
-    LOG(WARNING) << "Supported Dataset: " << DatasetTypeList();
-    exit(-1);
-  }
-  return g_dataset_map[dataset_class]();
-}
-
-REGISTER_DATASET_CLASS(MultiSlotDataset);
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/dataset_factory.h b/paddle/fluid/framework/dataset_factory.h
deleted file mode 100644
index d4a36cec22fc0af27a38ee7cd810a2eaa7988ea1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/dataset_factory.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/data_set.h"
-
-namespace paddle {
-namespace framework {
-class DatasetFactory {
- public:
-  static std::string DatasetTypeList();
-  static std::unique_ptr<Dataset> CreateDataset(std::string dataset_class);
-};
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc
deleted file mode 100644
index b3aaa01d53b9b8e27138e8946e39112d312c74a0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ddim.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-DDim make_ddim(std::initializer_list<int64_t> dims) {
-  return DDim(dims.begin(), dims.size());
-}
-
-DDim make_ddim(const std::vector<int64_t>& dims) {
-  return DDim(dims.data(), dims.size());
-}
-
-DDim make_ddim(const std::vector<int>& dims) {
-  return DDim(dims.data(), dims.size());
-}
-
-struct DDimEqualityVisitor {
-  explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {}
-
-  template <int D>
-  inline bool operator()(const Dim<D>& self) const {
-    return UnrollCompare<D>::Run(self.Get(), d_);
-  }
-
-  const int64_t* d_;
-};
-
-bool DDim::operator==(const DDim& d) const {
-  return size() == d.size() &&
-         this->apply_visitor(DDimEqualityVisitor(d.Get()));
-}
-
-bool DDim::operator!=(const DDim& d) const { return !(*this == d); }
-
-struct ProductVisitor {
-  template <int D>
-  inline int64_t operator()(const Dim<D>& dim) {
-    return product(dim);
-  }
-};
-
-int64_t product(const DDim& ddim) {
-  return ddim.apply_visitor(ProductVisitor());
-}
-
-bool contain_unknown_dim(const DDim& ddim) {
-  for (int i = 0; i < ddim.size(); ++i) {
-    if (ddim[i] < 0) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-DDim slice_ddim(const DDim& dim, int begin, int end) {
-  PADDLE_ENFORCE(begin >= 0 && end <= dim.size(),
-                 "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.",
-                 begin, end, dim.size());
-  // Constructor of DDim would check whether end - begin is valid
-  return DDim(dim.Get() + begin, end - begin);
-}
-
-int arity(const DDim& d) { return d.size(); }
-
-struct DDimPrinter {
-  std::ostream& os;
-  explicit DDimPrinter(std::ostream& os_) : os(os_) {}
-
-  template <int D>
-  void operator()(const Dim<D>& t) {
-    os << t;
-  }
-};
-
-std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
-  ddim.apply_visitor(DDimPrinter(os));
-  return os;
-}
-
-DDim flatten_to_2d(const DDim& src, int num_col_dims) {
-  return DDim({product(slice_ddim(src, 0, num_col_dims)),
-               product(slice_ddim(src, num_col_dims, src.size()))});
-}
-
-DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); }
-
-DDim stride(const DDim& ddim) {
-  DDim strides;
-  strides.rank_ = ddim.size();
-  strides[ddim.size() - 1] = 1;
-  for (int i = ddim.size() - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * ddim[i + 1];
-  }
-  return strides;
-}
-
-DDim stride_numel(const DDim& ddim) {
-  DDim strides;
-  strides.rank_ = ddim.size();
-  strides[ddim.size() - 1] = ddim[ddim.size() - 1];
-  for (int i = ddim.size() - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * ddim[i];
-  }
-  return strides;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h
deleted file mode 100644
index 14824afbea7f6b7d852100642d9d27b192c093e5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ddim.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <initializer_list>
-#include <stdexcept>
-#include <vector>
-#include "paddle/fluid/framework/dim.h"
-
-namespace paddle {
-namespace framework {
-
-#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
-  case (rank): {                               \
-    constexpr auto kRank = (rank);             \
-    return (callback);                         \
-  }
-
-#define PADDLE_VISIT_DDIM(rank, callback)    \
-  switch (rank) {                            \
-    PADDLE_VISIT_DDIM_BASE(0, callback);     \
-    PADDLE_VISIT_DDIM_BASE(1, callback);     \
-    PADDLE_VISIT_DDIM_BASE(2, callback);     \
-    PADDLE_VISIT_DDIM_BASE(3, callback);     \
-    PADDLE_VISIT_DDIM_BASE(4, callback);     \
-    PADDLE_VISIT_DDIM_BASE(5, callback);     \
-    PADDLE_VISIT_DDIM_BASE(6, callback);     \
-    PADDLE_VISIT_DDIM_BASE(7, callback);     \
-    PADDLE_VISIT_DDIM_BASE(8, callback);     \
-    PADDLE_VISIT_DDIM_BASE(9, callback);     \
-    default:                                 \
-      PADDLE_THROW("Invalid rank %d", rank); \
-  }
-
-template <typename T1, typename T2>
-inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
-  PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
-}
-
-/**
- * \brief A dynamically sized dimension.
- *
- * The number of dimensions must be between [1, 9].
- */
-class DDim {
- public:
-  constexpr static int kMaxRank = 9;
-
-  DDim() : rank_(1) { dim_[0] = 0; }
-
-  DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
-
-  DDim(const int* d, int n) : rank_(n) {
-    dynamic_dim_assign(d, dim_.GetMutable(), n);
-  }
-
-  DDim(const int64_t* d, int n) : rank_(n) {
-    dynamic_dim_assign(d, dim_.GetMutable(), n);
-  }
-
-  template <int D>
-  /*implicit*/ DDim(const Dim<D>& in) : rank_(D) {  // NOLINT
-    UnsafeCast<D>() = in;
-  }
-
-  /*implicit*/ DDim(std::initializer_list<int64_t> init_list)
-      : DDim(init_list.begin(), init_list.size()) {}
-
-  inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }
-
-  template <int D>
-  inline DDim& operator=(const Dim<D>& dim) {
-    rank_ = D;
-    UnsafeCast<D>() = dim;
-    return *this;
-  }
-
-  inline int64_t& operator[](int idx) { return dim_[idx]; }
-
-  inline int64_t operator[](int idx) const { return dim_[idx]; }
-
-  inline int64_t& at(int idx) {
-    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
-    return dim_[idx];
-  }
-
-  inline int64_t at(int idx) const {
-    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
-    return dim_[idx];
-  }
-
-  template <typename Visitor>
-  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
-      Visitor&& visitor) {
-    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
-  }
-
-  template <typename Visitor>
-  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
-      Visitor&& visitor) const {
-    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
-  }
-
-  bool operator==(const DDim& d) const;
-
-  bool operator!=(const DDim& d) const;
-
-  inline const int64_t* Get() const { return dim_.Get(); }
-
-  inline int64_t* GetMutable() { return dim_.GetMutable(); }
-
-  inline int size() const { return rank_; }
-
- private:
-  template <int D>
-  inline Dim<D>& UnsafeCast() {
-    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
-    auto* p = static_cast<void*>(&dim_);
-    return *reinterpret_cast<Dim<D>*>(p);
-  }
-
-  template <int D>
-  inline const Dim<D>& UnsafeCast() const {
-    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
-    auto* p = static_cast<const void*>(&dim_);
-    return *reinterpret_cast<const Dim<D>*>(p);
-  }
-
-  inline DDim& CopyFrom(const DDim& ddim) {
-    PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
-  }
-
-  friend DDim stride(const DDim& ddim);
-  friend DDim stride_numel(const DDim& ddim);
-
- private:
-  Dim<kMaxRank> dim_;
-  int rank_;
-};
-
-#undef PADDLE_VISIT_DDIM_BASE
-#undef PADDLE_VISIT_DDIM
-
-/**
- * \brief Make a DDim from std::vector<int64_t>
- *
- * \param dims An vector of ints. Must be sized between [1, 9]
- */
-DDim make_ddim(const std::vector<int64_t>& dims);
-
-DDim make_ddim(const std::vector<int>& dims);
-
-/**
- * \brief Make a DDim from an initializer list
- *
- * \param dims An initializer list of ints. Must be sized between [1, 9]
- *
- */
-DDim make_ddim(std::initializer_list<int64_t> dims);
-
-template <typename T = int64_t>
-std::vector<T> vectorize(const DDim& ddim) {
-  std::vector<T> result(DDim::kMaxRank);
-  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
-  result.resize(ddim.size());
-  return result;
-}
-
-int64_t product(const DDim& ddim);
-
-bool contain_unknown_dim(const DDim& ddim);
-
-/**
- * \brief Slice a ddim
- *
- * Slice dim with [begin, end).
- * e.g.  DDim d = make_ddim({1,2,3,4,5});
- *       slice_ddim(d, 1, 3); ====> {2,3}
- */
-DDim slice_ddim(const DDim& dim, int begin, int end);
-
-/**
- * \brief What is the length of this dimension?
- *
- * \param Dynamic dimension to inspect
- */
-
-int arity(const DDim& ddim);
-
-std::ostream& operator<<(std::ostream&, const DDim&);
-
-// Reshape a tensor to a matrix. The matrix's first dimension(column length)
-// will be the product of tensor's first `num_col_dims` dimensions.
-DDim flatten_to_2d(const DDim& src, int num_col_dims);
-
-DDim flatten_to_1d(const DDim& src);
-
-DDim stride(const DDim& ddim);
-
-DDim stride_numel(const DDim& ddim);
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc
deleted file mode 100644
index b7b42fa019f6b2243719ebdb1628ea83c220abf9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ddim_test.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <sstream>
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/ddim.h"
-
-TEST(DDim, Equality) {
-  // construct a DDim from an initialization list
-  paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5});
-  EXPECT_EQ(ddim[0], 9);
-  EXPECT_EQ(ddim[1], 1);
-  EXPECT_EQ(ddim[2], 5);
-
-  // construct a DDim from a vector
-  std::vector<int64_t> vec({9, 1, 5});
-  paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
-  EXPECT_EQ(ddim[0], 9);
-  EXPECT_EQ(ddim[1], 1);
-  EXPECT_EQ(ddim[2], 5);
-
-  // mutate a DDim
-  ddim[1] = 2;
-  EXPECT_EQ(ddim[1], 2);
-  ddim[0] = 6;
-  EXPECT_EQ(ddim[0], 6);
-
-  // vectorize a DDim
-  std::vector<int64_t> res_vec = paddle::framework::vectorize(vddim);
-  EXPECT_EQ(res_vec[0], 9);
-  EXPECT_EQ(res_vec[1], 1);
-  EXPECT_EQ(res_vec[2], 5);
-  paddle::framework::Dim<3> d(3, 2, 1);
-  res_vec = paddle::framework::vectorize(paddle::framework::DDim(d));
-  EXPECT_EQ(res_vec[0], 3);
-  EXPECT_EQ(res_vec[1], 2);
-  EXPECT_EQ(res_vec[2], 1);
-
-  // arity of a DDim
-  EXPECT_EQ(paddle::framework::arity(ddim), 3);
-  EXPECT_EQ(ddim.size(), 3);
-
-  // product of a DDim
-  EXPECT_EQ(paddle::framework::product(vddim), 45);
-  EXPECT_EQ(
-      paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})),
-      90);
-
-  // slice a DDim
-  paddle::framework::DDim ddim2 =
-      paddle::framework::make_ddim({1, 2, 3, 4, 5, 6});
-  paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
-  EXPECT_EQ(arity(ss), 3);
-  EXPECT_EQ(ss[0], 3);
-  EXPECT_EQ(ss[1], 4);
-  EXPECT_EQ(ss[2], 5);
-  paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
-  EXPECT_EQ(arity(ss2), 6);
-  EXPECT_EQ(ss2[0], 1);
-  EXPECT_EQ(ss2[1], 2);
-  EXPECT_EQ(ss2[2], 3);
-  EXPECT_EQ(ss2[3], 4);
-  EXPECT_EQ(ss2[4], 5);
-  EXPECT_EQ(ss2[5], 6);
-}
-
-TEST(DDim, Print) {
-  // print a DDim
-  std::stringstream ss;
-  paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4});
-  ss << ddim;
-  EXPECT_EQ("2, 3, 4", ss.str());
-}
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
deleted file mode 100644
index 1e87eabc083b994ccb1010f5640d3aef79ee6025..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ /dev/null
@@ -1,106 +0,0 @@
-cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node)
-cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
-
-cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-
-cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry) 
-cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor)
-cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
-
-cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
-
-if(WITH_DISTRIBUTE)
-    if(NOT WITH_GRPC)
-        set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-        set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    endif()
-endif()
-
-
-if(WITH_GPU)
-    nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda variable_visitor)
-    nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda variable_visitor)
-
-    if(WITH_DGC)
-        nv_library(sparse_all_reduce_op_handle SRCS sparse_all_reduce_op_handle.cc DEPS op_handle_base scope 
-            lod_tensor ddim memory dynload_cuda variable_visitor dgc all_reduce_op_handle)
-    endif()
-
-    if(WITH_DISTRIBUTE)
-        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
-    else()
-        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim dynload_cuda selected_rows_functor)
-    endif()
-    nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
-    nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
-
-else()
-    cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-             variable_visitor)
-    cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            variable_visitor)
-    if(WITH_DISTRIBUTE)
-        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim selected_rows_functor sendrecvop_rpc)
-    else()
-        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim selected_rows_functor)
-    endif()
-    cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
-    cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
-endif()
-
-cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
-
-cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
-
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass)
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
-
-cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
-        simple_threadpool device_context)
-
-cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor)
-
-set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor)
-if(WITH_DISTRIBUTE)
-    list(APPEND ASYNC_SSA_GRAPH_EXECUTOR_DEPS communicator)
-endif()
-cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS})
-
-cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-        device_context broadcast_op_handle)
-cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-        device_context gather_op_handle)
-
-cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows)
-cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor)
-#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-#        device_context reduce_op_handle )
-cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
-        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
-cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
-
-if(WITH_NGRAPH) 
-  set(NGRAPH_BS_DEPS ngraph)
-else()
-  set(NGRAPH_BS_DEPS)
-endif()
-
-cc_library(build_strategy SRCS build_strategy.cc DEPS
-        graph_viz_pass multi_devices_graph_pass
-        multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fuse_elewise_add_act_pass multi_batch_merge_pass 
-        fuse_relu_depthwise_conv_pass
-        lock_free_optimize_pass
-        coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
-        fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
-        ${NGRAPH_BS_DEPS})
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
deleted file mode 100644
index a367772aef844a46e8d2552c90a7814fee8c5f43..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ /dev/null
@@ -1,199 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
-#include <algorithm>
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/reduce_and_gather.h"
-#include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/profiler.h"
-
-#ifdef PADDLE_WITH_CUDA
-DECLARE_bool(sync_nccl_allreduce);
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
-                                     const std::vector<Scope *> &local_scopes,
-                                     const std::vector<platform::Place> &places,
-                                     const platform::NCCLCommunicator *ctxs)
-    : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-}
-#else
-AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
-                                     const std::vector<Scope *> &local_scopes,
-                                     const std::vector<platform::Place> &places)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-}
-#endif
-
-void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name());
-
-  WaitInputVarGenerated();
-  std::vector<VarHandleBase *> inputs = this->Inputs();
-  std::vector<VarHandleBase *> outputs = this->Outputs();
-  auto in_var_handles = DynamicCast<VarHandle>(inputs);
-  auto out_var_handles = DynamicCast<VarHandle>(outputs);
-  AllReduceImpl(in_var_handles, out_var_handles);
-}
-
-void AllReduceOpHandle::AllReduceImpl(
-    const std::vector<VarHandle *> &in_var_handles,
-    const std::vector<VarHandle *> &out_var_handles) {
-  size_t num_places = places_.size();
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), num_places,
-      "The NoDummyInputSize should be equal to the number of places.");
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-  PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
-
-  std::vector<const void *> lod_tensor_data;
-  std::vector<platform::Place> places;
-  lod_tensor_data.reserve(num_places);
-  places.reserve(num_places);
-  int64_t numel = -1;
-  bool is_gpu_place = false;
-  auto dtype = static_cast<framework::proto::VarType::Type>(0);
-  for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
-    auto &local_scope = local_exec_scopes_[i];
-    auto var = local_scope->FindVar(in_var_handles[i]->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.",
-                            in_var_handles[i]->name());
-    auto &lod_tensor = var->Get<LoDTensor>();
-
-    if (i == 0) {
-      numel = static_cast<int64_t>(lod_tensor.numel());
-      dtype = lod_tensor.type();
-      is_gpu_place = platform::is_gpu_place(lod_tensor.place());
-    }
-    PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
-    PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
-    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
-
-    lod_tensor_data.emplace_back(lod_tensor.data<void>());
-    places.emplace_back(lod_tensor.place());
-
-    VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
-             << ", out_name:" << out_var_handles[i]->name();
-
-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
-                      "The name of input and output should be equal.");
-  }
-
-  std::vector<std::string> grad_var_names;
-  grad_var_names.reserve(num_places);
-  for (auto &out_var : out_var_handles) {
-    grad_var_names.emplace_back(out_var->Name());
-  }
-
-  AllReduceFunc(lod_tensor_data, dtype, numel, places, grad_var_names);
-}
-
-void AllReduceOpHandle::AllReduceFunc(
-    std::vector<const void *> lod_tensor_data,
-    const framework::proto::VarType::Type &dtype, int64_t numel,
-    const std::vector<platform::Place> &places,
-    const std::vector<std::string> &out_var_names) {
-  if (is_gpu_place(places[0])) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
-    ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
-    std::vector<std::function<void()>> all_reduce_calls;
-    for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
-      auto &p = places[i];
-      void *buffer = const_cast<void *>(lod_tensor_data.at(i));
-      all_reduce_calls.emplace_back([=] {
-        NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, ncclSum);
-      });
-    }
-    NCCLAllReduceFunc(all_reduce_calls);
-#else
-    PADDLE_THROW("Not compiled with CUDA.");
-#endif
-  } else {  // Special handle CPU only Operator's gradient. Like CRF
-    auto &trg = *local_exec_scopes_[0]
-                     ->FindVar(out_var_names[0])
-                     ->GetMutable<LoDTensor>();
-
-    // Reduce All Tensor to trg in CPU
-    ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
-    VisitDataType(trg.type(), func);
-
-    for (size_t i = 1; i < local_exec_scopes_.size(); ++i) {
-      auto &scope = local_exec_scopes_[i];
-      auto &p = places[i];
-      auto *var = scope->FindVar(out_var_names[i]);
-
-      size_t size = numel * SizeOfType(trg.type());
-      RunAndRecordEvent(p, [&trg, var, p, size] {
-        auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
-        platform::CPUPlace cpu_place;
-        memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
-      });
-    }
-  }
-  VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
-}
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-void AllReduceOpHandle::NCCLAllReduceFunc(
-    const std::vector<std::function<void()>> &all_reduce_calls) {
-  this->RunAndRecordEvent([&] {
-    if (all_reduce_calls.size() == 1UL) {
-      // Do not use NCCLGroup when manage NCCL by per thread per device
-      all_reduce_calls[0]();
-    } else {
-      platform::NCCLGroupGuard guard;
-      for (auto &call : all_reduce_calls) {
-        call();
-      }
-    }
-  });
-
-  if (FLAGS_sync_nccl_allreduce) {
-    for (auto &p : places_) {
-      int dev_id = boost::get<platform::CUDAPlace>(p).device;
-      auto *nccl_ctxs =
-          nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
-      auto &nccl_ctx = nccl_ctxs->at(dev_id);
-      auto stream = nccl_ctx.stream();
-      cudaError_t e_sync = cudaStreamSynchronize(stream);
-      if (e_sync != 0) {
-        LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync);
-      }
-
-      cudaError_t e_get = cudaGetLastError();
-      if (e_get != 0) {
-        LOG(FATAL) << "cudaGetLastError  " << cudaGetErrorString(e_get)
-                   << " errno:" << e_get;
-      }
-    }
-  }
-}
-#endif
-
-std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
deleted file mode 100644
index c18b0ed9290609d52575df6fdbaf31a9c5a2bfb3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ /dev/null
@@ -1,79 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/framework/details/nccl_op_handle.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-class AllReduceOpHandle : public NCCLOpHandleBase {
- public:
-  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                    const std::vector<platform::Place> &places,
-                    const platform::NCCLCommunicator *ctxs);
-#else
-class AllReduceOpHandle : public OpHandleBase {
- public:
-  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                    const std::vector<platform::Place> &places);
-#endif
-  std::string Name() const override;
-
-  // Delay and buffer nccl_all_reduce together can significantly increase
-  // performance. Disable this feature by returning false.
-  bool IsMultiDeviceTransfer() override { return true; };
-
- protected:
-  void RunImpl() override;
-
-  std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
-
-  std::vector<Scope *> local_scopes_;
-
-#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
-  // NCCLOpHandleBase already have these attributes.
-  // Will polish it by class inheritance framework.
-  std::vector<platform::Place> places_;
-#endif
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  void NCCLAllReduceFunc(
-      const std::vector<std::function<void()>> &all_reduce_calls);
-#endif
-
-  void AllReduceImpl(const std::vector<VarHandle *> &in_var_handles,
-                     const std::vector<VarHandle *> &out_var_handles);
-
-  void AllReduceFunc(std::vector<const void *> lod_tensor_data,
-                     const framework::proto::VarType::Type &dtype,
-                     int64_t numel, const std::vector<platform::Place> &places,
-                     const std::vector<std::string> &out_var_handles);
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
deleted file mode 100644
index 2e247075395f6603922c96bbe69f598265ec7c75..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ /dev/null
@@ -1,210 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
-
-#include "paddle/fluid/framework/variable_helper.h"
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/communicator.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-inline void InitVarsInScope(const std::vector<VarInfo> &var_infos, Scope *scope,
-                            Scope *local_scope) {
-  VLOG(3) << "InitVarsInScope";
-  for (auto &info : var_infos) {
-    if (info.persistable_) {  // Persistable
-      auto *var = scope->FindVar(info.name_);
-      if (var != nullptr) {
-        VLOG(2) << info.name_
-                << " has been initialized beforehand in global scope, skipped";
-        continue;
-      }
-      InitializeVariable(scope->Var(info.name_), info.type_);
-    } else {
-      InitializeVariable(local_scope->Var(info.name_), info.type_);
-    }
-  }
-}
-
-// get RpcContext and remote send and recv op
-void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
-#ifdef PADDLE_WITH_DISTRIBUTE
-  using RpcCtxMap = operators::distributed::RpcCtxMap;
-  VLOG(3) << "ProcessGraph";
-  RpcCtxMap send_varname_to_ctx;
-  RpcCtxMap recv_varname_to_ctx;
-  for (auto &node : graphs[0]->Nodes()) {
-    VLOG(3) << "node name " << node->Name();
-    if (node && node->IsOp()) {
-      if (node->Name() == "send") {
-        auto send_var_name = node->Op()->Input("X")[0];
-        auto send_varnames = boost::get<std::vector<std::string>>(
-            node->Op()->GetNullableAttr("send_varnames"));
-        auto epmap = boost::get<std::vector<std::string>>(
-            node->Op()->GetNullableAttr("epmap"));
-        auto height_section = boost::get<std::vector<int64_t>>(
-            node->Op()->GetNullableAttr("sections"));
-        auto trainer_id =
-            boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
-        send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(
-            send_var_name, send_varnames, epmap, height_section, trainer_id);
-        VLOG(3) << "find and init an send op: "
-                << send_varname_to_ctx[send_var_name];
-      } else if (node->Name() == "recv") {
-        auto recv_var_name = node->Op()->Output("Out")[0];
-        auto recv_varnames = boost::get<std::vector<std::string>>(
-            node->Op()->GetNullableAttr("recv_varnames"));
-        auto epmap = boost::get<std::vector<std::string>>(
-            node->Op()->GetNullableAttr("epmap"));
-        auto trainer_id =
-            boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
-        recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(
-            recv_var_name, recv_varnames, epmap, {}, trainer_id);
-        VLOG(3) << "find and remove an recv op: "
-                << recv_varname_to_ctx[recv_var_name];
-      }
-    }
-  }
-
-  // init communicator here
-  if (send_varname_to_ctx.size() > 0) {
-    VLOG(3) << "this is distribute mode, will use communicator";
-
-    if (operators::distributed::Communicator::GetInstance() == nullptr) {
-      operators::distributed::Communicator::Init(send_varname_to_ctx,
-                                                 recv_varname_to_ctx, scope);
-      operators::distributed::Communicator::GetInstance()->Start();
-    } else {
-      VLOG(3) << "communicator has been initialized, skip";
-    }
-  }
-#endif
-}
-
-AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
-    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<Scope *> &local_exec_scopes,
-    const std::vector<platform::Place> &places, std::vector<ir::Graph *> graphs)
-    : strategy_(std::move(strategy)),
-      local_scopes_(std::move(local_scopes)),
-      local_exec_scopes_(local_exec_scopes),
-      pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
-      places_(std::move(places)),
-      graphs_(std::move(graphs)) {
-  VLOG(3) << "build AsyncSSAGraphExecutor";
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
-
-  // set the correct size of thread pool to each device.
-  strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
-                               ? 1UL
-                               : strategy_.num_threads_ / places_.size();
-  VLOG(1) << "set num_threads: " << strategy_.num_threads_
-          << " to run the operators of the graph on each device.";
-  for (size_t i = 0; i < places.size(); ++i) {
-    executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
-        strategy_, {local_scopes_[i]}, {local_exec_scopes_[i]}, {places_[i]},
-        graphs_[i]));
-  }
-
-  for (auto &node : graphs_[0]->Nodes()) {
-    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
-      var_infos_.emplace_back();
-      var_infos_.back().name_ = node->Var()->Name();
-      var_infos_.back().type_ = node->Var()->GetType();
-      var_infos_.back().persistable_ = node->Var()->Persistable();
-    }
-  }
-
-  for (size_t i = 0; i < local_scopes_.size(); ++i) {
-    InitVarsInScope(var_infos_, local_scopes_[i], local_exec_scopes_[i]);
-  }
-  ProcessGraph(graphs_, local_scopes_[0]);
-}
-
-void AsyncSSAGraphExecutor::StartOffPythonTrainLoop() {
-  VLOG(3) << "StartOffPythonTrainLoop size = " << places_.size();
-  for (size_t i = 1; i < places_.size(); ++i) {
-    auto call = [this, i]() -> void {
-      VLOG(3) << "start off python thread " << i;
-      try {
-        while (true) {
-          executors_[i]->Run({});
-        }
-      } catch (...) {
-        exception_holder_.Catch(std::current_exception());
-        VLOG(3) << "get exception type = " << exception_holder_.Type();
-      }
-      VLOG(3) << "thread " << i << " exited!";
-    };
-    run_futures_.emplace_back(pool_->enqueue(std::move(call)));
-  }
-}
-
-void AsyncSSAGraphExecutor::HandleException() {
-  if (exception_holder_.IsCaught()) {
-    for (auto &f : run_futures_) {
-      VLOG(3) << "wait future";
-      f.wait();
-    }
-    VLOG(3) << "caught exception " << exception_holder_.Type()
-            << ", rethrow it";
-    run_futures_.clear();
-    exception_holder_.ReThrow();
-  }
-}
-
-FeedFetchList AsyncSSAGraphExecutor::Run(
-    const std::vector<std::string> &fetch_tensors) {
-  // init once
-  if (run_futures_.size() == 0 && places_.size() > 1) {
-    exception_holder_.Clear();
-    StartOffPythonTrainLoop();
-  }
-
-  if (places_.size() == 1) {
-    exception_holder_.Clear();
-  } else {
-    HandleException();
-  }
-
-  FeedFetchList fetch_data;
-  fetch_data.reserve(fetch_tensors.size());
-
-  try {
-    fetch_data = executors_[0]->Run(fetch_tensors);
-  } catch (...) {
-    exception_holder_.Catch(std::current_exception());
-  }
-
-  HandleException();
-
-  FeedFetchList ret;
-  for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
-    std::vector<const LoDTensor *> lodtensor_ptrs;
-    lodtensor_ptrs.push_back(&fetch_data.at(fetch_idx));
-    ret.emplace_back();
-    ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
-  }
-  return ret;
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h
deleted file mode 100644
index 97472674fada8cc1c531b54be49816e76ebde3f8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "ThreadPool.h"
-#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct VarInfo {
-  std::string name_;
-  proto::VarType::Type type_;
-  bool persistable_;
-};
-
-class AsyncSSAGraphExecutor : public SSAGraphExecutor {
- public:
-  AsyncSSAGraphExecutor(const ExecutionStrategy &strategy,
-                        const std::vector<Scope *> &local_scopes,
-                        const std::vector<Scope *> &local_exec_scopes,
-                        const std::vector<platform::Place> &places,
-                        std::vector<ir::Graph *> graphs);
-  ~AsyncSSAGraphExecutor() final = default;
-  const ir::Graph &Graph() const override { return *graphs_[0]; }
-
-  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
-
- private:
-  void StartOffPythonTrainLoop();
-  void HandleException();
-
- private:
-  ExecutionStrategy strategy_;
-  std::vector<Scope *> local_scopes_;
-  std::vector<Scope *> local_exec_scopes_;
-  std::unique_ptr<::ThreadPool> pool_{nullptr};
-  std::vector<platform::Place> places_;
-  std::vector<ir::Graph *> graphs_;
-
-  std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
-  ExceptionHolder exception_holder_;
-  std::vector<std::future<void>> run_futures_;
-  std::vector<VarInfo> var_infos_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
deleted file mode 100644
index 3637625f7e04d1a12594e8fe93a681cb87636ef1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/broadcast_op_handle.h"
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-void BroadcastOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name());
-
-  if (places_.size() == 1) return;
-
-  // The input and output may have dummy vars.
-  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
-
-  PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
-                    "The number of input should be one.");
-  PADDLE_ENFORCE_EQ(
-      out_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
-
-  VarHandle *in_var_handle = in_var_handles[0];
-
-  BroadcastOneVar(*in_var_handle, out_var_handles, local_exec_scopes_);
-}
-
-void BroadcastOpHandle::BroadcastOneVar(
-    const VarHandle &in_var_handle,
-    const std::vector<VarHandle *> &out_var_handles,
-    const std::vector<Scope *> &var_scopes) {
-  auto *in_var =
-      var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
-  PADDLE_ENFORCE_NOT_NULL(in_var);
-  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
-  if (UNLIKELY(!in_tensor.IsInitialized())) {
-    VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
-    return;
-  }
-
-  InitOutputValue(in_var_handle, out_var_handles);
-
-  if (platform::is_cpu_place(in_tensor.place())) {
-    WaitInputVarGenerated();
-    for (auto *out_var_handle : out_var_handles) {
-      if (out_var_handle->IsTheSameVar(in_var_handle)) {
-        continue;
-      }
-      auto &out_p = out_var_handle->place();
-      auto *out_var = var_scopes.at(out_var_handle->scope_idx())
-                          ->FindVar(out_var_handle->name());
-
-      RunAndRecordEvent(out_p, [in_tensor, out_var] {
-        paddle::framework::TensorCopy(
-            in_tensor, platform::CPUPlace(),
-            &VariableVisitor::GetMutableTensor(out_var));
-      });
-    }
-  } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    VarHandle *out_handle = nullptr;
-    int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
-    std::vector<std::function<void()>> broadcast_calls;
-
-    int type = platform::ToNCCLDataType(in_tensor.type());
-    size_t numel = static_cast<size_t>(in_tensor.numel());
-
-    for (auto out_var_handle : out_var_handles) {
-      Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
-                              ->FindVar(out_var_handle->name());
-
-      int dst_id =
-          boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
-
-      auto &nccl_ctx = nccl_ctxs_->at(dst_id);
-
-      void *send_recv_buffer = nullptr;
-      if (root_id == dst_id) {
-        send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
-        out_handle = out_var_handle;
-      } else {
-        send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
-                               .Resize(in_tensor.dims())
-                               .mutable_data(out_var_handle->place());
-      }
-
-      broadcast_calls.emplace_back(
-          [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
-            PADDLE_ENFORCE(platform::dynload::ncclBcast(
-                send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
-                root_id, nccl_ctx.comm_, nccl_ctx.stream()));
-          });
-    }
-
-    WaitInputVarGenerated();
-    this->RunAndRecordEvent([&] {
-      {
-        platform::NCCLGroupGuard guard;
-        for (auto &call : broadcast_calls) {
-          call();
-        }
-      }
-
-      if (!out_handle->IsTheSameVar(in_var_handle)) {
-        auto out_var = var_scopes.at(in_var_handle.scope_idx())
-                           ->FindVar(out_var_handles[0]->name());
-        paddle::framework::TensorCopy(
-            in_tensor, in_var_handle.place(),
-            *(dev_ctxes_.at(in_var_handle.place())),
-            &VariableVisitor::GetMutableTensor(out_var));
-      }
-    });
-    for (auto &p : places_) {
-      nccl_ctxs_->DevCtx(p)->Wait();
-    }
-#else
-    PADDLE_THROW("CUDA is not enabled.");
-#endif
-  }
-}
-
-void BroadcastOpHandle::InitOutputValue(
-    const VarHandle &in_var_handle,
-    const std::vector<VarHandle *> &out_var_handles) const {
-  auto &var_scopes = local_exec_scopes_;
-  auto *in_var =
-      var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
-
-  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
-
-  // NOTE: The tensors' Place of input and output must be all on GPU or all on
-  // CPU.
-  for (auto *out_var_handle : out_var_handles) {
-    if (out_var_handle->IsTheSameVar(in_var_handle)) {
-      continue;
-    }
-    auto t_out_p = out_var_handle->place();
-    auto *out_var = var_scopes.at(out_var_handle->scope_idx())
-                        ->FindVar(out_var_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    if (is_gpu_place(in_tensor.place())) {
-      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                     "Places of input and output must be all on GPU.");
-    } else {
-      t_out_p = platform::CPUPlace();
-    }
-    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
-    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
-                                                            in_tensor.type());
-  }
-}
-
-std::string BroadcastOpHandle::Name() const { return "broadcast"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
deleted file mode 100644
index 45ccbb41e0b0efca495f1db8d05285b07ecff910..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/device_context.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct BroadcastOpHandle : public OpHandleBase {
- public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                    const std::vector<platform::Place> &places,
-                    const platform::NCCLContextMap *nccl_ctxs)
-      : OpHandleBase(node),
-        local_scopes_(local_scopes),
-        places_(places),
-        nccl_ctxs_(nccl_ctxs) {
-    if (nccl_ctxs_) {
-      for (auto &p_ctx : nccl_ctxs_->contexts_) {
-        this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
-                               p_ctx.second.ctx_.get());
-      }
-    }
-  }
-#else
-  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                    const std::vector<platform::Place> &places)
-      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-#endif
-
-  std::string Name() const override;
-
-  bool IsMultiDeviceTransfer() override { return true; };
-
- protected:
-  void RunImpl() override;
-
-  std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
-
-  void BroadcastOneVar(const VarHandle &in_var_handle,
-                       const std::vector<VarHandle *> &out_var_handles,
-                       const std::vector<Scope *> &var_scopes);
-
-  std::vector<Scope *> local_scopes_;
-  std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  const platform::NCCLContextMap *nccl_ctxs_;
-#endif
-
-  void InitOutputValue(const VarHandle &in_var_handle,
-                       const std::vector<VarHandle *> &out_var_handles) const;
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
deleted file mode 100644
index 650de5a48de6b1fdab120cdeda563a169fd1a1c1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
-  TestBroadcastOpHandle test_op;
-  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
-  test_op.InitBroadcastOp(input_scope_idx);
-  test_op.TestBroadcastLodTensor(input_scope_idx);
-}
-
-TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
-  TestBroadcastOpHandle test_op;
-  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
-  test_op.InitBroadcastOp(input_scope_idx);
-  test_op.TestBroadcastSelectedRows(input_scope_idx);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
-  TestBroadcastOpHandle test_op;
-  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(true);
-  test_op.InitBroadcastOp(input_scope_idx);
-  test_op.TestBroadcastLodTensor(input_scope_idx);
-}
-
-TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
-  TestBroadcastOpHandle test_op;
-  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(true);
-  test_op.InitBroadcastOp(input_scope_idx);
-  test_op.TestBroadcastSelectedRows(input_scope_idx);
-}
-#endif
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
deleted file mode 100644
index abc3f39e6867482dfa1d2c01cd97e96293acc9e5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ /dev/null
@@ -1,276 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/details/broadcast_op_handle.h"
-
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-// test data amount
-const f::DDim kDims = {20, 20};
-
-struct TestBroadcastOpHandle {
-  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
-  std::vector<Scope*> local_scopes_;
-  std::vector<Scope*> param_scopes_;
-  Scope g_scope_;
-  OpHandleBase* op_handle_;
-  std::vector<VarHandleBase*> vars_;
-  std::vector<std::unique_ptr<ir::Node>> nodes_;
-  std::vector<p::Place> place_list_;
-  bool use_gpu_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
-#endif
-
-  void WaitAll() {
-    for (size_t j = 0; j < ctxs_.size(); ++j) {
-      ctxs_[j]->Wait();
-    }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    if (nccl_ctxs_) {
-      nccl_ctxs_->WaitAll();
-    }
-#endif
-  }
-
-  void InitCtxOnGpu(bool use_gpu) {
-    use_gpu_ = use_gpu;
-    if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      int count = p::GetCUDADeviceCount();
-      if (count <= 1) {
-        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
-                        "device count is "
-                     << count;
-        exit(0);
-      }
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CUDAPlace(i);
-        place_list_.push_back(p);
-        ctxs_.emplace_back(new p::CUDADeviceContext(p));
-      }
-      nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
-#else
-      PADDLE_THROW("CUDA is not support.");
-#endif
-    } else {
-      int count = 8;
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CPUPlace();
-        place_list_.push_back(p);
-        ctxs_.emplace_back(new p::CPUDeviceContext(p));
-      }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      nccl_ctxs_.reset(nullptr);
-#endif
-    }
-  }
-
-  void InitBroadcastOp(size_t input_scope_idx) {
-    nodes_.clear();
-    std::unordered_map<Scope*, Scope*> scope_map;
-    for (size_t j = 0; j < place_list_.size(); ++j) {
-      local_scopes_.push_back(&(g_scope_.NewScope()));
-      Scope& local_scope = local_scopes_.back()->NewScope();
-      local_scope.Var("out");
-      param_scopes_.emplace_back(&local_scope);
-      scope_map.emplace(local_scopes_.back(), param_scopes_.back());
-    }
-    param_scopes_[input_scope_idx]->Var("input");
-
-    nodes_.emplace_back(
-        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
-    if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
-                                         place_list_, nccl_ctxs_.get());
-#else
-      PADDLE_THROW("CUDA is not support.");
-#endif
-    } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
-                                         place_list_, nccl_ctxs_.get());
-#else
-      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
-                                         place_list_);
-#endif
-    }
-
-    op_handle_->SetLocalExecScopes(scope_map);
-
-    nodes_.emplace_back(
-        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable));
-    auto* in_var_handle = new VarHandle(nodes_.back().get(), 1, input_scope_idx,
-                                        "input", place_list_[input_scope_idx]);
-    vars_.emplace_back(in_var_handle);
-    op_handle_->AddInput(in_var_handle);
-
-    // add dummy var
-
-    nodes_.emplace_back(
-        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable));
-    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
-    DummyVarHandle* dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back());
-    dummy_var_handle->ClearGeneratedOp();
-    op_handle_->AddInput(dummy_var_handle);
-
-    for (size_t j = 0; j < place_list_.size(); ++j) {
-      if (!use_gpu_) {
-        op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
-      }
-      nodes_.emplace_back(
-          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable));
-      VarHandle* out_var_handle =
-          new VarHandle(nodes_.back().get(), 2, j, "out", place_list_[j]);
-      vars_.emplace_back(out_var_handle);
-      op_handle_->AddOutput(out_var_handle);
-    }
-
-    // add dummy var
-    nodes_.emplace_back(
-        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable));
-    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
-    DummyVarHandle* out_dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back());
-    out_dummy_var_handle->ClearGeneratedOp();
-    op_handle_->AddOutput(out_dummy_var_handle);
-  }
-
-  std::vector<float> InitLoDTensor(const std::string& varname,
-                                   size_t input_scope_idx, const f::LoD& lod,
-                                   float val_scalar = 0.0) {
-    auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-
-    PADDLE_ENFORCE_NOT_NULL(var);
-    auto lod_tensor = var->GetMutable<f::LoDTensor>();
-    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
-    for (size_t k = 0; k < send_vector.size(); ++k) {
-      send_vector[k] = k + val_scalar;
-    }
-    paddle::framework::TensorFromVector<float>(
-        send_vector, *(ctxs_[input_scope_idx]), lod_tensor);
-    lod_tensor->set_lod(lod);
-    lod_tensor->Resize(kDims);
-    return send_vector;
-  }
-
-  std::vector<float> InitSelectedRows(const std::string& varname,
-                                      size_t input_scope_idx,
-                                      const std::vector<int64_t>& rows,
-                                      int height, float value_scalar = 0.0) {
-    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
-    for (size_t k = 0; k < send_vector.size(); ++k) {
-      send_vector[k] = k + value_scalar;
-    }
-
-    auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
-    auto selected_rows = var->GetMutable<f::SelectedRows>();
-    auto value = selected_rows->mutable_value();
-    value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
-    selected_rows->set_height(height);
-    selected_rows->set_rows(rows);
-
-    paddle::framework::TensorFromVector<float>(
-        send_vector, *(ctxs_[input_scope_idx]), value);
-
-    return send_vector;
-  }
-
-  void SelectedRowsEqual(const std::string& varname, int input_scope_idx,
-                         const std::vector<float>& send_vector,
-                         const std::vector<int64_t>& rows, int height) {
-    auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
-    auto& selected_rows = var->Get<f::SelectedRows>();
-    auto rt = selected_rows.value();
-    PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
-
-    for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
-    }
-
-    p::CPUPlace cpu_place;
-    f::Tensor result_tensor;
-    f::TensorCopySync(rt, cpu_place, &result_tensor);
-    float* ct = result_tensor.data<float>();
-
-    for (int64_t i = 0; i < f::product(kDims); ++i) {
-      ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
-    }
-  }
-
-  void LoDTensorEqual(const std::string& varname,
-                      const std::vector<float>& send_vec, const f::LoD& lod,
-                      framework::Scope* scope) {
-    p::CPUPlace cpu_place;
-    auto var = scope->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
-    auto tensor = var->Get<f::LoDTensor>();
-    PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
-    f::Tensor result_tensor;
-    f::TensorCopySync(tensor, cpu_place, &result_tensor);
-    float* ct = result_tensor.mutable_data<float>(cpu_place);
-    for (int64_t k = 0; k < f::product(kDims); ++k) {
-      ASSERT_NEAR(ct[k], send_vec[k], 1e-5);
-    }
-  }
-
-  void TestBroadcastLodTensor(size_t input_scope_idx) {
-    f::LoD lod{{0, 10, 20}};
-    auto send_vector = InitLoDTensor("input", input_scope_idx, lod);
-
-    op_handle_->Run(false);
-
-    WaitAll();
-    for (size_t j = 0; j < place_list_.size(); ++j) {
-      LoDTensorEqual("out", send_vector, lod, param_scopes_[j]);
-    }
-  }
-
-  void TestBroadcastSelectedRows(size_t input_scope_idx) {
-    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
-                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
-    int height = static_cast<int>(kDims[0] * 2);
-    auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height);
-
-    op_handle_->Run(false);
-
-    WaitAll();
-    for (size_t j = 0; j < place_list_.size(); ++j) {
-      SelectedRowsEqual("out", input_scope_idx, send_vector, rows, height);
-    }
-  }
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
deleted file mode 100644
index 1f5fd015b16dd053a4d2c5ab08a3d60d3ce7c3f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ /dev/null
@@ -1,420 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/details/build_strategy.h"
-
-#include <glog/logging.h>
-#include <memory>
-#include <unordered_set>
-#include <utility>
-#include "paddle/fluid/framework/details/reduce_op_handle.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_printer.h"
-#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
-
-DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_ngraph);
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
-  // Should fix the allreduce op order if scheduling
-  // them in multiple threads or processes to avoid hang.
-  // NOTE: ParallelGraph would execute this pass on each graph, so
-  // don't need to append it here.
-  return (!strategy.enable_sequential_execution_ &&
-          strategy.num_trainers_ > 1) &&
-         !strategy.enable_parallel_graph_;
-}
-
-static inline void ConvertDefaultValue(boost::optional<bool> *default_value) {
-  if (*default_value == boost::none) {
-    *default_value = true;
-  }
-}
-
-class ParallelExecutorPassBuilder : public ir::PassBuilder {
- public:
-  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
-      : ir::PassBuilder(), strategy_(strategy) {
-    ResolveOptionConfliction();
-
-    AppendPrintGraphPass("graph_viz_pass", "_original_graph");
-    AppendPassWithCheck(strategy_.enable_sequential_execution_,
-                        "sequential_execution_pass");
-    AppendPassWithCheck(strategy_.sync_batch_norm_, "sync_batch_norm_pass");
-
-    AppendPassToUseNgraph("ngraph_subgraph_pass");
-
-    AppendOpFusePasses();
-    AppendPrintGraphPass("graph_viz_pass", "_fused_graph");
-
-    AppendMultiDevPass();
-    AppendMultiGraphOptPasses();
-
-    AppendPassToSetMkldnnAttr("mkldnn_placement_pass");
-    // runtime_context_cache pass should be the last pass to enable the attr of
-    // all original and fused operators. But no operators can be enabled this
-    // attr if putting it after MultiDevPass.
-    AppendPassWithCheck(strategy_.cache_runtime_context_,
-                        "runtime_context_cache_pass");
-    AppendPassWithCheck(strategy_.remove_unnecessary_lock_,
-                        "modify_op_lock_and_record_event_pass");
-    // Note: This pass is used to check whether the multi_device_graph is right.
-    AppendPass("multi_devices_check_pass");
-
-    SetCollectiveContext();
-  }
-
-  void ResolveOptionConfliction() {
-    // Specifies the restrictions between different pass.
-    if (strategy_.enable_parallel_graph_) {
-      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
-          << "Currently, fuse_all_optimizer_ops doesn't work under "
-             "parallel_graph.";
-      strategy_.fuse_all_optimizer_ops_ = false;
-      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
-          << "fuse_all_reduce_ops doesn't work under "
-             "parallel_graph.";
-      strategy_.fuse_all_reduce_ops_ = false;
-    }
-    if (strategy_.is_distribution_) {
-      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
-          << "Currently, fuse_all_optimizer_ops only works under "
-             "Non-distributed mode.";
-      strategy_.fuse_all_optimizer_ops_ = false;
-      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
-          << "Currently, fuse_all_reduce_ops_ only works under "
-             "Non-distributed mode.";
-      strategy_.fuse_all_reduce_ops_ = false;
-    }
-    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
-      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
-          << "Currently, fuse_all_optimizer_ops only works under AllReduce "
-             "mode.";
-      strategy_.fuse_all_optimizer_ops_ = false;
-      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
-          << "fuse_all_optimizer_ops only works under AllReduce "
-             "mode.";
-      strategy_.fuse_all_reduce_ops_ = false;
-    }
-    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
-      LOG_IF(WARNING, strategy_.fuse_broadcast_ops_ == true)
-          << "Currently, fuse_broadcast_ops only works under Reduce "
-             "mode.";
-      strategy_.fuse_broadcast_ops_ = false;
-    }
-
-    ConvertDefaultValue(&strategy_.fuse_all_optimizer_ops_);
-    ConvertDefaultValue(&strategy_.fuse_all_reduce_ops_);
-    ConvertDefaultValue(&strategy_.fuse_broadcast_ops_);
-
-    if (strategy_.fuse_all_optimizer_ops_ == true) {
-      LOG_IF(WARNING, strategy_.async_mode_)
-          << "Currently, fuse_all_optimizer_ops doesn't work under "
-             "async mode.";
-      strategy_.fuse_all_optimizer_ops_ = !strategy_.async_mode_;
-    }
-    if (strategy_.fuse_all_reduce_ops_ == true) {
-      LOG_IF(WARNING, strategy_.async_mode_)
-          << "Currently, fuse_all_reduce_ops doesn't work under "
-             "async mode.";
-      strategy_.fuse_all_reduce_ops_ = !strategy_.async_mode_;
-    }
-  }
-
-  void AppendMultiGraphOptPasses() {
-    // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator
-    // first, if the number is zero, fuse_all_reduce_ops will do nothing.
-    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
-                        "fuse_all_reduce_op_pass");
-    AppendPrintGraphPass("multi_devices_print_pass", "_multi_devices_graph");
-
-    // experimental shows that the program will be faster if append
-    // all_reduce_deps_pass here.
-    bool append_all_reduce_deps_pass =
-        !strategy_.enable_parallel_graph_ &&
-        (SeqOnlyAllReduceOps(strategy_) ||
-         strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce);
-    AppendPassWithCheck(append_all_reduce_deps_pass, "all_reduce_deps_pass");
-
-    bool append_backward_optimizer_op_deps_pass =
-        strategy_.num_trainers_ > 1 && !strategy_.async_mode_ &&
-        !strategy_.is_distribution_ &&
-        strategy_.enable_backward_optimizer_op_deps_;
-    AppendPassWithCheck(append_backward_optimizer_op_deps_pass,
-                        "backward_optimizer_op_deps_pass");
-  }
-
-  void AppendOpFusePasses() {
-    AppendPassWithCheck(strategy_.fuse_relu_depthwise_conv_,
-                        "fuse_relu_depthwise_conv_pass");
-    AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
-                        "fuse_elewise_add_act_pass");
-    // for single card training, fuse_all_reduce_ops is unnecessary.
-    // coalesce_grad_tensor_pass should be before of MultiDevPass.
-    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
-                        "coalesce_grad_tensor_pass");
-    // Fuse all the optimization operators.
-    // NOTE: fuse_all_xx_ops will count the number of xx operator first,
-    // if the number is zero, fuse_all_reduce_ops will do nothing.
-    // Currently, only one type of optimization algorithm can be fused.
-    if (strategy_.fuse_all_optimizer_ops_ == true) {
-      AppendPass("fuse_adam_op_pass");
-      AppendPass("fuse_sgd_op_pass");
-      AppendPass("fuse_momentum_op_pass");
-    }
-  }
-
-  void SetCollectiveContext() const {
-    CollectiveContext *context = CollectiveContext::GetInstance();
-    context->endpoints_ = strategy_.trainers_endpoints_;
-    context->trainer_id_ = strategy_.trainer_id_;
-    PADDLE_ENFORCE_GE(strategy_.trainer_id_, 0, "trainer_id_ >= 0");
-    if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) {
-      PADDLE_ENFORCE_LT(static_cast<size_t>(strategy_.trainer_id_),
-                        strategy_.trainers_endpoints_.size(),
-                        "trainer_id_ < endpoints_ size");
-    }
-    VLOG(1) << "CollectiveContext:" << context->String();
-  }
-
-  // Convert graph to run on multi-devices.
-  void AppendMultiDevPass() {
-    ir::Pass *multi_devices_pass = nullptr;
-    if (strategy_.async_mode_) {
-      multi_devices_pass = AppendPass("async_multi_devices_pass").get();
-    } else if (strategy_.is_distribution_) {
-      multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
-    } else {
-      switch (strategy_.reduce_) {
-        case BuildStrategy::ReduceStrategy::kAllReduce:
-          multi_devices_pass =
-              AppendPass("all_reduce_mode_multi_devices_pass").get();
-          break;
-        case BuildStrategy::ReduceStrategy::kReduce:
-          multi_devices_pass =
-              AppendPass("reduce_mode_multi_devices_pass").get();
-          break;
-        default:
-          PADDLE_THROW("Unknown reduce strategy.");
-      }
-    }
-    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
-                                                         &strategy_);
-  }
-
-  void AppendPrintGraphPass(const std::string &pass_name,
-                            const std::string &debug_file_suffix) {
-    if (!strategy_.debug_graphviz_path_.empty()) {
-      auto viz_pass = AppendPass(pass_name);
-      const std::string graph_path = string::Sprintf(
-          "%s%s", strategy_.debug_graphviz_path_.c_str(), debug_file_suffix);
-      viz_pass->Set<std::string>(ir::kGraphvizPath,
-                                 new std::string(graph_path));
-    }
-  }
-
-  void AppendPassWithCheck(const boost::optional<bool> &append_pass,
-                           const std::string &pass_name) {
-    AppendPassWithCheck(append_pass == true, pass_name);
-  }
-
-  void AppendPassWithCheck(bool append_pass, const std::string &pass_name) {
-    if (append_pass) {
-      AppendPass(pass_name);
-    }
-  }
-
-  void AppendPassToSetMkldnnAttr(const std::string &pass_name) {
-#ifdef PADDLE_WITH_MKLDNN
-    if (FLAGS_use_mkldnn) {
-      AppendPass(pass_name);
-    } else if (!strategy_.mkldnn_enabled_op_types_.empty()) {
-      LOG(WARNING)
-          << "mkldnn_enabled_op_types specify the operator type list to "
-             "use MKLDNN acceleration. It is null in default, means "
-             "that all the operators supported by MKLDNN will be "
-             "accelerated. And it should not be set when "
-             "FLAGS_use_mkldnn=false.";
-    }
-#else
-    PADDLE_ENFORCE(!FLAGS_use_mkldnn,
-                   "Please compile with MKLDNN first to use MKLDNN");
-#endif
-  }
-
-  void AppendPassToUseNgraph(const std::string &pass_name) {
-#ifdef PADDLE_WITH_NGRAPH
-    if (FLAGS_use_ngraph) {
-      if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kAllReduce) {
-        LOG(WARNING) << "Currently ngraph_subgraph_pass works under AllReduce,"
-                        "please set FLAGS_use_ngraph=false.";
-      } else {
-        AppendPass(pass_name);
-      }
-    }
-#else
-    PADDLE_ENFORCE_NE(FLAGS_use_ngraph, true,
-                      "Please compile with NGRAPH first to use NGRAPH");
-#endif
-  }
-
- private:
-  BuildStrategy strategy_;
-};
-
-std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
-    bool finalize_strategy) const {
-  if (is_finalized_) {
-    return pass_builder_;
-  }
-  pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
-  if (finalize_strategy) {
-    is_finalized_ = true;
-  }
-  return pass_builder_;
-}
-
-bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
-  return framework::ir::MultiDevSSAGraphBuilder().count(pass_name) > 0;
-}
-
-ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
-                                const std::vector<platform::Place> &places,
-                                const std::string &loss_var_name,
-                                const std::vector<Scope *> &local_scopes,
-                                const size_t &nranks,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-                                const bool use_cuda,
-                                platform::NCCLCommunicator *nccl_ctxs) const {
-#else
-                                const bool use_cuda) const {
-#endif
-  VLOG(1) << "apply all passes";
-  // Create a default one if not finalized by user.
-  CreatePassesFromStrategy(false);
-
-  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
-    VLOG(1) << "BuildStrategy::Apply pass:" << pass->Type();
-    if (IsMultiDevPass(pass->Type())) {
-      pass->Erase(kPlaces);
-      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
-      pass->Erase(ir::kLossVarName);
-      pass->SetNotOwned<const std::string>(ir::kLossVarName, &loss_var_name);
-      pass->Erase(kLocalScopes);
-      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
-                                                    &local_scopes);
-      pass->Erase(kNRanks);
-      pass->Set<size_t>(kNRanks, new size_t(nranks));
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
-      pass->Erase(kNCCLCtxs);
-      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
-#endif
-    } else if (pass->Type() == "fuse_all_reduce_op_pass") {
-      pass->Erase(kNRanks);
-      pass->Set<size_t>(kNRanks, new size_t(nranks));
-      pass->Erase(kPlaces);
-      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
-      pass->Erase(kLocalScopes);
-      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
-                                                    &local_scopes);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
-      pass->Erase(kNCCLCtxs);
-      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
-      pass->Erase(kUseHierarchicalAllReduce);
-      pass->Set<bool>(kUseHierarchicalAllReduce,
-                      new bool(use_hierarchical_allreduce_));
-#endif
-    } else if (pass->Type() == "coalesce_grad_tensor_pass") {
-      pass->Erase(kNRanks);
-      pass->Set<size_t>(kNRanks, new size_t(nranks));
-    } else if (pass->Type() == "sequential_execution_pass") {
-      LOG(INFO) << "set enable_sequential_execution:"
-                << enable_sequential_execution_;
-    } else if (pass->Type() == "all_reduce_deps_pass") {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
-      pass->Erase(kNCCLCtxs);
-      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
-      pass->Erase(kUseHierarchicalAllReduce);
-      pass->Set<bool>(kUseHierarchicalAllReduce,
-                      new bool(use_hierarchical_allreduce_));
-#endif
-      LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
-                << ", num_trainers:" << num_trainers_;
-    } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
-      if (!use_cuda) {
-        LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
-                        "GPU, skipped.";
-        continue;
-      }
-    } else if (pass->Type() == "mkldnn_placement_pass") {
-      pass->Set("mkldnn_enabled_op_types",
-                new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
-    } else if (pass->Type() == "backward_optimizer_op_deps_pass") {
-      if (!use_cuda) {
-        VLOG(1) << "backward_optimizer_op_deps_pass is only supported on "
-                   "GPU, skipped.";
-        continue;
-      }
-    }
-    VLOG(1) << "Start Apply Pass " << pass->Type();
-    graph = pass->Apply(graph);
-    VLOG(1) << "Finish Apply Pass " << pass->Type();
-  }
-  VLOG(1) << "All Passes Applied";
-  return graph;
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(sync_batch_norm_pass);
-USE_PASS(fuse_relu_depthwise_conv_pass);
-USE_PASS(fuse_elewise_add_act_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(multi_batch_merge_pass);
-USE_PASS(reduce_mode_multi_devices_pass);
-USE_PASS(all_reduce_mode_multi_devices_pass);
-USE_PASS(dist_multi_devices_pass);
-USE_PASS(multi_devices_check_pass);
-USE_PASS(multi_devices_print_pass);
-USE_PASS(sequential_execution_pass);
-USE_PASS(all_reduce_deps_pass);
-USE_PASS(backward_optimizer_op_deps_pass);
-USE_PASS(modify_op_lock_and_record_event_pass);
-USE_PASS(lock_free_optimize_pass);
-USE_PASS(coalesce_grad_tensor_pass);
-USE_PASS(graph_to_program_pass);
-USE_PASS(fuse_adam_op_pass);
-USE_PASS(fuse_sgd_op_pass);
-USE_PASS(fuse_momentum_op_pass);
-USE_PASS(fuse_all_reduce_op_pass);
-USE_PASS(runtime_context_cache_pass);
-#ifdef PADDLE_WITH_MKLDNN
-USE_PASS(mkldnn_placement_pass);
-#endif
-#ifdef PADDLE_WITH_NGRAPH
-USE_PASS(ngraph_subgraph_pass);
-#endif
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
deleted file mode 100644
index 5f0cc4b215c0f52de1ff529db41bc4aaae89c696..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/build_strategy.h
+++ /dev/null
@@ -1,185 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "boost/optional.hpp"
-#include "paddle/fluid/framework/ir/pass_builder.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct BuildStrategy {
-  // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
-  // kReduce, for CPU and GPU. If you use kAllReduce, different threads
-  // optimize their parameters separately. If you use kReduce, the optimizations
-  // of parameters are distributed to different threads.
-  // For example, a model has 100 parameters and is running with four threads,
-  // if you choose kAllReduce, every thread is to optimize 100 parameters
-  // separately, if you choose kReduce, every thread is to optimize 25
-  // parameters.
-  // Of particular note is, if you use kReduce when using CPU training,
-  // all the parameters are shared between different threads. This feature will
-  // save memory.
-  // FIXME(zcd): The result of the two modes(kAllReduce and kReduce) maybe not
-  // equal for GPU. Because, the result of the different order of summing maybe
-  // different, for example, the result of `a+b+c+d` may be different with the
-  // result of `c+a+b+d`.
-  // For GPU, the implementation of kAllReduce and kReduce is adopted NCCL,
-  // so the result of kAllReduce and kReduce maybe not equal.
-  // For CPU, if you want to fix the order of summing to make the result
-  // of kAllReduce and kReduce no diff, you can add
-  // `FLAGS_cpu_deterministic=true` to env.
-  enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
-
-  enum class GradientScaleStrategy {
-    kCoeffNumDevice = 0,
-    kOne = 1,
-    // user can customize gradient scale to use, and just feed
-    // it into exe.run().
-    kCustomized = 2,
-  };
-
-  ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
-  GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
-
-  std::string debug_graphviz_path_{""};
-
-  // Add dependency between backward ops and optimization ops, make sure that
-  // all the backward ops are finished before running the optimization ops.
-  // It might make the training speed of data parallelism faster.
-  bool enable_backward_optimizer_op_deps_{true};
-  // TODO(dev-paddle): enable_sequential_execution depends on
-  // kStaleProgramOpDescs, it is not appropriate, because kStaleProgramOpDescs
-  // will be removed in the near future.
-  bool enable_sequential_execution_{false};
-  bool remove_unnecessary_lock_{true};
-  // TODO(dev-paddle): cache_runtime_context may cause some models to hang up
-  // while running.
-  bool cache_runtime_context_{false};
-
-  // Operator fusion
-  // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have
-  // cycle.
-  bool fuse_elewise_add_act_ops_{false};
-  // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
-  // should not be sparse types
-  boost::optional<bool> fuse_all_optimizer_ops_{boost::none};
-  boost::optional<bool> fuse_all_reduce_ops_{boost::none};
-  // fuse_relu_depthwise_conv can fuse the `relu ->
-  // depthwise_conv`
-  bool fuse_relu_depthwise_conv_{false};
-  // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
-  // faster. Because fusing broadcast OP equals delaying the execution of all
-  // broadcast Ops, in this case, all nccl streams are used only for reduce
-  // operations for a period of time.
-  boost::optional<bool> fuse_broadcast_ops_{boost::none};
-  // replace batch_norm with sync_batch_norm.
-  bool sync_batch_norm_{false};
-
-  // mkldnn_enabled_op_types specify the operator type list to
-  // use MKLDNN acceleration. It is null in default, means
-  // that all the operators supported by MKLDNN will be
-  // accelerated. And it should not be set when
-  // FLAGS_use_mkldnn=false
-  std::unordered_set<std::string> mkldnn_enabled_op_types_;
-
-  // By default, memory_optimize would be opened if gc is disabled, and
-  // be closed if gc is enabled.
-  // Users can forcely enable/disable memory_optimize by setting True/False.
-  boost::optional<bool> memory_optimize_{boost::none};
-
-  // Turn on inplace by default.
-  bool enable_inplace_{true};
-
-  // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
-  // num_trainers is 1, so the current fields of build_strategy doesn't tell if
-  // it's distributed model.
-  bool is_distribution_{false};
-  bool async_mode_{false};
-  int num_trainers_{1};
-  int trainer_id_{0};
-  std::vector<std::string> trainers_endpoints_;
-
-  // NCCL config
-  size_t nccl_comm_num_{1};
-  // The picture is here:
-  // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
-  bool use_hierarchical_allreduce_{false};
-  // Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu
-  // cards' number in most cases.
-  size_t hierarchical_allreduce_inter_nranks_{0};
-  // Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to
-  // nodes number.
-  size_t hierarchical_allreduce_exter_nranks_{0};
-
-  // NOTE:
-  // Before you add new options, think if it's a general strategy that works
-  // with other strategy. If not, the strategy should be created through
-  // CreatePassesFromStrategy and the pass can be managed separately.
-
-  // User normally doesn't need to call this API.
-  // The PassBuilder allows for more customized insert, remove of passes
-  // from python side.
-  // A new PassBuilder is created based on configs defined above and
-  // passes are owned by the PassBuilder.
-  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy(
-      bool finalize_strategy) const;
-
-  bool IsFinalized() const { return is_finalized_; }
-
-  bool IsMultiDevPass(const std::string &pass_name) const;
-
-  // Apply the passes built by the pass_builder_. The passes will be
-  // applied to the Program and output an ir::Graph.
-  ir::Graph *Apply(ir::Graph *graph, const std::vector<platform::Place> &places,
-                   const std::string &loss_var_name,
-                   const std::vector<Scope *> &local_scopes,
-                   const size_t &nranks,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-                   const bool use_cuda,
-                   platform::NCCLCommunicator *nccl_ctxs) const;
-#else
-                   const bool use_cuda) const;
-#endif
-
-  // If set true, ParallelExecutor would build the main_program into multiple
-  // graphs,
-  // each of the graphs would run with one device. This approach can achieve
-  // better performance
-  // on some scenarios.
-  mutable bool enable_parallel_graph_ = false;
-
- private:
-  mutable bool is_finalized_ = false;
-  mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
deleted file mode 100644
index 0b653e57f6d48f9d919ee4f09db5b6ab6b2451b7..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-
-#include <string>
-
-namespace paddle {
-namespace framework {
-namespace details {
-ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
-                                         platform::Place place,
-                                         size_t scope_idx)
-    : OpHandleBase(node),
-      op_(framework::OpRegistry::CreateOp(*node->Op())),
-      scope_(scope),
-      place_(place),
-      scope_idx_(scope_idx) {}
-
-void ComputationOpHandle::RunImpl() {
-  WaitInputVarGenerated(place_);
-
-  auto run_func = [this]() { op_->Run(*local_exec_scopes_[0], place_); };
-
-  if (is_lock_and_record_event_free_) {
-    run_func();
-  } else {
-    this->RunAndRecordEvent(run_func);
-  }
-}
-
-bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
-  bool need_wait =
-      in_var && in_var->GeneratedOp() &&
-      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
-  return need_wait;
-}
-
-std::string ComputationOpHandle::Name() const { return op_->Type(); }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
deleted file mode 100644
index 5a65aaf0d2012f6a42f57b47e9e1c7b0167c8b35..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ /dev/null
@@ -1,65 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-class ComputationOpHandle : public OpHandleBase {
- public:
-  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
-                      size_t scope_idx);
-
-  OperatorBase *GetOp() { return op_.get(); }
-
-  std::string Name() const override;
-
-  const Scope *GetScope() const { return scope_; }
-
-  Scope *GetScope() { return scope_; }
-
-  const platform::Place &GetPlace() const { return place_; }
-
-  void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }
-
-  size_t GetScopeIdx() const { return scope_idx_; }
-
- protected:
-  void RunImpl() override;
-
-  bool NeedWait(VarHandleBase *in_var) override;
-
-  std::vector<Scope *> GetLocalScopes() override { return {scope_}; }
-
- private:
-  std::unique_ptr<OperatorBase> op_;
-  Scope *scope_;
-  platform::Place place_;
-  size_t scope_idx_;
-  bool is_lock_and_record_event_free_{false};
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/container_cast.h b/paddle/fluid/framework/details/container_cast.h
deleted file mode 100644
index a42ae78dc45c2a885f98315a21f1d5558725bca3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/container_cast.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <type_traits>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-template <typename ResultType, typename ElemType>
-std::vector<ResultType*> DynamicCast(const std::vector<ElemType*>& container) {
-  static_assert(std::is_base_of<ElemType, ResultType>::value,
-                "ElementType must be a base class of ResultType");
-  std::vector<ResultType*> res;
-  for (auto* ptr : container) {
-    auto* derived = dynamic_cast<ResultType*>(ptr);
-    if (derived) {
-      res.emplace_back(derived);
-    }
-  }
-  return res;
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h
deleted file mode 100644
index 090517ff3c1822c2e62e61fad05d49e1c8db8573..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <memory>
-#include <thread>  // NOLINT
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-template <class T>
-class COWPtr {
- public:
-  typedef std::shared_ptr<T> RefPtr;
-
- private:
-  RefPtr m_sp;
-
- public:
-  COWPtr() : m_sp(nullptr) {}
-  explicit COWPtr(T* t) : m_sp(t) {}
-
-  const T& Data() const { return *m_sp; }
-
-  T* MutableData() {
-    DetachIfNotUnique();
-    return m_sp.get();
-  }
-
-  void DetachIfNotUnique() {
-    T* tmp = m_sp.get();
-    if (!(tmp == nullptr || m_sp.unique())) {
-      Detach();
-    }
-  }
-
-  void Detach() {
-    T* tmp = m_sp.get();
-    m_sp = RefPtr(new T(*tmp));
-  }
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc
deleted file mode 100644
index 5b055d7cb4d127dc20f2cf70869134f24a93d429..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/framework/details/cow_ptr.h"
-#include "gtest/gtest.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-TEST(COWPtr, all) {
-  COWPtr<int> ptr(new int{0});
-  ASSERT_EQ(ptr.Data(), 0);
-  COWPtr<int> ptr2 = ptr;
-  ASSERT_EQ(ptr2.Data(), 0);
-  ASSERT_EQ(&ptr2.Data(), &ptr.Data());
-  *ptr2.MutableData() = 10;
-  ASSERT_EQ(ptr.Data(), 0);
-  ASSERT_EQ(ptr2.Data(), 10);
-}
-
-TEST(COWPtr, change_old) {
-  COWPtr<int> ptr(new int{0});
-  COWPtr<int> ptr2 = ptr;
-  *ptr.MutableData() = 10;
-  ASSERT_EQ(ptr2.Data(), 0);
-  ASSERT_EQ(ptr.Data(), 10);
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/dgc_const_values.h b/paddle/fluid/framework/details/dgc_const_values.h
deleted file mode 100644
index fbe50dc91160e1d7d5175daa150ec9c45aa60a6f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/dgc_const_values.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
-constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
-constexpr char g_dgc_u[] = "__dgc_u__";
-constexpr char g_dgc_v[] = "__dgc_v__";
-constexpr char g_dgc_k[] = "__dgc_k__";
-constexpr char g_dgc_encoded[] = "__dgc_encoded__";
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
deleted file mode 100644
index 2e64f9d4fb3f3673efe466e030038afc043046b4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <unordered_set>
-#include <utility>
-
-#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-EagerDeletionOpHandle::EagerDeletionOpHandle(
-    ir::Node *node, Scope *scope, const platform::Place &place,
-    const std::unordered_set<ir::MemOptVarInfo *> &vars, GarbageCollector *gc)
-    : OpHandleBase(node),
-      scope_(scope),
-      place_(place),
-      var_infos_(vars.begin(), vars.end()),
-      gc_(gc) {
-#ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(place)) {
-    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
-        platform::DeviceContextPool::Instance().Get(place));
-    if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
-      platform::CUDADeviceGuard guard(
-          boost::get<platform::CUDAPlace>(place).device);
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-      PADDLE_ENFORCE_NOT_NULL(event_);
-    }
-  }
-#endif
-  PADDLE_ENFORCE(!vars.empty(), "Var names cannot be empty");
-  for (auto *var : var_infos_) {
-    PADDLE_ENFORCE_NOT_NULL(var);
-  }
-}
-
-EagerDeletionOpHandle::~EagerDeletionOpHandle() {
-#ifdef PADDLE_WITH_CUDA
-  if (event_) {
-    auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
-    platform::CUDADeviceGuard guard(gpu_place.device);
-    PADDLE_ENFORCE(cudaEventDestroy(event_));
-  }
-#endif
-}
-
-void EagerDeletionOpHandle::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
-  int dev_id =
-      boost::get<platform::CUDAPlace>(dev_ctxes_.begin()->first).device;
-  events_[dev_id] = nullptr;
-#endif
-}
-
-void EagerDeletionOpHandle::CallOnce() {
-  PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here");
-  Scope *exec_scope = local_exec_scopes_[0];
-  for (auto *var_info : var_infos_) {
-    auto *var = exec_scope->FindVar(var_info->Name());
-    PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr",
-                            var_info->Name());
-    vars_.emplace_back(var);
-  }
-}
-
-std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
-
-void EagerDeletionOpHandle::RunImpl() {
-  if (vars_.size() != var_infos_.size()) {
-    CallOnce();
-  }
-
-  platform::RecordEvent record_event(Name());
-  std::deque<std::shared_ptr<memory::Allocation>> garbages;
-  for (size_t i = 0; i < var_infos_.size(); ++i) {
-    auto *var_info = var_infos_[i];
-    if (var_info->IsSkippedAllMemoryOptimization() ||
-        !var_info->DecreaseRefCnt()) {
-      continue;
-    }
-
-    VLOG(2) << "Erase variable " << var_info->Name() << " on " << place_;
-
-    Variable *var = vars_[i];
-
-    if (var->IsType<LoDTensor>()) {
-      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
-    } else if (var->IsType<SelectedRows>()) {
-      garbages.emplace_back(
-          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
-    } else if (var->IsType<LoDTensorArray>()) {
-      auto *tensor_arr = var->GetMutable<LoDTensorArray>();
-      for (auto &t : *tensor_arr) {
-        garbages.emplace_back(t.MoveMemoryHolder());
-      }
-    } else {
-      PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                   framework::ToTypeName(var->Type()), var_info->Name());
-    }
-  }
-
-  if (!garbages.empty()) {
-    ClearGarbages(&garbages);
-  }
-}
-
-void EagerDeletionOpHandle::ClearGarbages(
-    std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
-#ifdef PADDLE_WITH_CUDA
-  if (event_) {
-    auto compute_stream = dev_ctx_->stream();
-    auto callback_stream =
-        reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
-    auto callback_func = [=]() {
-      PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
-      PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
-    };
-    gc_->Add(std::move(*garbages), callback_func);
-  } else {
-#endif
-    gc_->Add(std::move(*garbages));
-#ifdef PADDLE_WITH_CUDA
-  }
-#endif
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
deleted file mode 100644
index 4b2d4a83a6a88b05c8b7710f3d7a114c73a4f1d4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <deque>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-
-namespace paddle {
-namespace framework {
-class Scope;
-
-namespace ir {
-class MemOptVarInfo;
-}  // namespace ir
-
-namespace details {
-
-class EagerDeletionOpHandle : public OpHandleBase {
- public:
-  EagerDeletionOpHandle(ir::Node *node, Scope *scope,
-                        const platform::Place &place,
-                        const std::unordered_set<ir::MemOptVarInfo *> &vars,
-                        GarbageCollector *gc);
-
-  ~EagerDeletionOpHandle();
-
-  std::string Name() const override;
-
-  /**
-   * Currently, EagerDeletionOpHandle has the highest priority.
-   * This priority settings speed up gc 15% in Transformer
-   * V100 8-GPU model.
-   */
-  Priority GetPriority() const override { return kHighest; }
-
- protected:
-  void RunImpl() override;
-
-  void InitCUDA() override;
-
-  std::vector<Scope *> GetLocalScopes() override { return {scope_}; }
-
- private:
-  void ClearGarbages(std::deque<std::shared_ptr<memory::Allocation>> *garbages);
-
-  void CallOnce();
-
-  Scope *scope_;
-  platform::Place place_;
-  std::vector<ir::MemOptVarInfo *> var_infos_;  // not own
-  GarbageCollector *gc_;                        // not own
-  std::vector<Variable *> vars_;
-#ifdef PADDLE_WITH_CUDA
-  platform::CUDADeviceContext *dev_ctx_{nullptr};
-  cudaEvent_t event_{nullptr};
-#endif
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h
deleted file mode 100644
index f8fd395bd9cc1e569bf7789e6a3adc63b00716ac..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/exception_holder.h
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-
-#include "glog/logging.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-class ExceptionHolder {
- public:
-  void Catch(std::exception_ptr eptr) {
-    try {
-      std::rethrow_exception(eptr);
-    } catch (platform::EOFException exp) {
-      Catch(exp);
-    } catch (platform::EnforceNotMet exp) {
-      Catch(exp);
-    } catch (std::exception& ex) {
-      LOG(FATAL) << "std::exception caught, " << ex.what();
-    } catch (...) {
-      LOG(FATAL) << "Unknown exception caught";
-    }
-  }
-
-  bool IsCaught() const {
-    std::lock_guard<std::mutex> lock(mu_);
-    return exception_.get() != nullptr;
-  }
-
-  void ReThrow() {
-    std::lock_guard<std::mutex> lock(mu_);
-    switch (type_) {
-      case kNone:
-        break;
-      case kEnforceNotMet: {
-        auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
-        throw e;
-      }
-      case kEOF: {
-        auto e = *static_cast<platform::EOFException*>(exception_.get());
-        throw e;
-      }
-    }
-    ClearImpl();
-  }
-
-  void Clear() {
-    std::lock_guard<std::mutex> lock(mu_);
-    ClearImpl();
-  }
-
-  std::string Type() {
-    std::lock_guard<std::mutex> lock(mu_);
-    switch (type_) {
-      case kNone:
-        return "None";
-      case kEnforceNotMet: {
-        return "EnforceNotMet";
-      }
-      case kEOF: {
-        return "EOF";
-      }
-    }
-    return "unknown";
-  }
-
- private:
-  void ClearImpl() {
-    exception_.reset();
-    type_ = kNone;
-  }
-
-  void Catch(const platform::EnforceNotMet& exp) {
-    std::lock_guard<std::mutex> lock(mu_);
-    exception_.reset(new platform::EnforceNotMet(exp));
-    type_ = kEnforceNotMet;
-  }
-
-  void Catch(const platform::EOFException& exp) {
-    std::lock_guard<std::mutex> lock(mu_);
-    // EOFException will not cover up existing EnforceNotMet.
-    if (exception_.get() == nullptr) {
-      exception_.reset(new platform::EOFException(exp));
-      type_ = kEOF;
-    }
-  }
-
-  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
-  ExceptionType type_{kNone};
-
-  std::unique_ptr<std::exception> exception_;
-  mutable std::mutex mu_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
deleted file mode 100644
index b44e6b6a75a6f0375fe0c3e1eb47c5e4e6456d68..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <cstddef>  // for size_t
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct ExecutionStrategy {
-  enum ExecutorType { kDefault = 0, kExperimental = 1 };
-
-  // num_threads indicates the size of thread pool.
-  size_t num_threads_{0};
-  bool use_cuda_{true};
-  // Note that allow_op_delay is invalid now.
-  bool allow_op_delay_{false};
-  // num_iteration_per_drop_scope indicates how many
-  // iterations the framework cleans up a local execution scope.
-  // In some models, the value of this parameter has a great
-  // influence on the performance(about 15%) of the program.
-  size_t num_iteration_per_drop_scope_{100};
-  // At present, the kExperimental executor is the fastest in most models.
-  ExecutorType type_{kExperimental};
-  // This debug option.
-  bool dry_run_{false};
-
-  // only use with async_ssa_graph_executor
-  // and pyreader with data queue
-  size_t num_iteration_per_run_{1};
-};
-
-}  //  namespace details
-}  //  namespace framework
-}  //  namespace paddle
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
deleted file mode 100644
index 97557d2b14a7eacbfe3338a8c09bb6065b68f81f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ /dev/null
@@ -1,288 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
-#include <deque>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/details/fetch_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<Scope *> &local_exec_scopes,
-    const std::vector<platform::Place> &places, ir::Graph *graph)
-    : strategy_(strategy),
-      local_scopes_(local_scopes),
-      local_exec_scopes_(local_exec_scopes),
-      places_(places),
-      graph_(graph),
-      fetch_ctxs_(places),
-      pool_(strategy.num_threads_),
-      // add one more thread for generate op_deps
-      prepare_pool_(1) {
-  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
-    int dep = static_cast<int>(op->NotReadyInputSize());
-    op_deps_.emplace(op, dep);
-    if (dep == 0) {
-      bootstrap_ops_.emplace_back(op);
-    }
-  }
-  PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators.");
-  PrepareAtomicOpDeps();
-}
-
-FeedFetchList FastThreadedSSAGraphExecutor::Run(
-    const std::vector<std::string> &fetch_tensors) {
-  VLOG(3) << "enter FastThreadedSSAGraphExecutor Run";
-  std::unique_ptr<platform::RecordEvent> event(
-      new platform::RecordEvent("FastThreadedSSAGraphExecutorPrepare"));
-  std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>
-      op_deps = atomic_op_deps_.get();
-  PrepareAtomicOpDeps();
-  size_t num_ops = op_deps->size();
-
-  paddle::framework::FeedFetchList fetches;
-  fetches.resize(fetch_tensors.size());
-  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
-  std::vector<OpHandleBase *> fetch_ops;
-  std::vector<OpHandleBase *> ready_fetch_ops;
-  exception_.Clear();
-
-  InsertFetchOps(fetch_tensors, &fetches, &fetched_vars, op_deps.get(),
-                 &fetch_ops, &ready_fetch_ops);
-  event.reset(nullptr);
-  if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) {
-    // If the num_threads is 1, we can record the order of operator's
-    // execution in the first iteration, and in subsequent iterations,
-    // run the recorded operators directly. This strategy could make the
-    // execution faster.
-    VLOG(3) << "Run the traced ops.";
-    RunTracedOps(traced_ops_);
-    RunTracedOps(fetch_ops);
-    if (exception_.IsCaught()) {
-      ExecutionFinal(&fetch_ops);
-    }
-  } else {
-    traced_ops_.clear();
-    remaining_ = 0;
-    auto complete_q = std::make_shared<BlockingQueue<size_t>>();
-    for (auto op : bootstrap_ops_) {
-      RunOpAsync(op_deps.get(), op, complete_q);
-    }
-    for (auto op : ready_fetch_ops) {
-      RunOpAsync(op_deps.get(), op, complete_q);
-    }
-
-    size_t num_complete = 0;
-    while (num_complete != op_deps->size()) {
-      size_t num_comp = complete_q->Pop();
-      if (num_comp == -1UL) {
-        int remaining = 0;
-        while (true) {
-          remaining = remaining_;
-          if (remaining == 0) {
-            break;
-          }
-          for (int i = 0; i < remaining; ++i) {
-            complete_q->Pop();
-          }
-        }
-        if (exception_.IsCaught()) {
-          ExecutionFinal(&fetch_ops);
-        }
-      }
-      num_complete += num_comp;
-    }
-  }
-  // Wait FetchOps.
-  ClearFetchOp(graph_, &fetch_ops);
-  return fetches;
-}
-
-void FastThreadedSSAGraphExecutor::InsertFetchOps(
-    const std::vector<std::string> &fetch_tensors, FeedFetchList *fetches,
-    std::unordered_map<std::string, std::vector<VarHandleBase *>> *fetched_vars,
-    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-    std::vector<OpHandleBase *> *fetch_ops,
-    std::vector<OpHandleBase *> *ready_fetch_ops) {
-  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
-                                                   fetch_tensors.end());
-  for (auto &fetch_var_name : fetch_tensor_set) {
-    for (auto &var_map : graph_->Get<GraphVars>(kGraphVars)) {
-      auto it = var_map.find(fetch_var_name);
-      if (it != var_map.end()) {
-        (*fetched_vars)[fetch_var_name].push_back(*it->second.rbegin());
-      }
-    }
-  }
-
-  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
-    auto &var_name = fetch_tensors.at(i);
-    auto fetched_var_it = fetched_vars->find(var_name);
-    PADDLE_ENFORCE(fetched_var_it != fetched_vars->end(),
-                   "Cannot find fetched variable(%s).(Perhaps the main_program "
-                   "is not set to ParallelExecutor)",
-                   var_name);
-
-    auto &vars = fetched_var_it->second;
-
-    ir::Node *fetch_node =
-        graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation);
-    auto *op = new FetchOpHandle(fetch_node, fetches, i, &local_scopes_,
-                                 &local_exec_scopes_);
-    fetch_ops->emplace_back(op);
-
-    for (auto &p : places_) {
-      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
-    }
-
-    for (auto *var : vars) {
-      op->AddInput(var);
-    }
-
-    int dep = static_cast<int>(op->NotReadyInputSize());
-    (*op_deps)[op] = dep;
-    if (dep == 0) {
-      ready_fetch_ops->emplace_back(op);
-    }
-  }
-}
-
-bool FastThreadedSSAGraphExecutor::RunOp(
-    OpHandleBase *op, const std::shared_ptr<BlockingQueue<size_t>> &complete_q,
-    size_t *complete) {
-  RunOpSync(op);
-  if (LIKELY(!exception_.IsCaught())) {
-    if (LIKELY(!strategy_.dry_run_)) {
-      RecordOps(op);
-    }
-    ++(*complete);
-    return true;
-  } else {
-    --remaining_;
-    complete_q->Push(-1UL);
-    return false;
-  }
-}
-
-void FastThreadedSSAGraphExecutor::RunOpAsync(
-    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-    OpHandleBase *op,
-    const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
-  ++remaining_;
-  this->pool_.enqueue([=] {
-    std::deque<OpHandleBase *> op_queue;
-    op_queue.push_front(op);
-
-    size_t complete = 0;
-    while (!op_queue.empty()) {
-      OpHandleBase *op_to_run = op_queue.back();
-      op_queue.pop_back();
-
-      if (!RunOp(op_to_run, complete_q, &complete)) {
-        return;
-      }
-
-      auto &outputs = op_to_run->Outputs();
-      op_to_run = nullptr;
-      for (auto &output : outputs) {
-        for (auto &pending_op : output->PendingOps()) {
-          std::atomic<int> &deps = op_deps->at(pending_op);
-          if (deps.fetch_sub(1) != 1) continue;
-
-          // NOTE(zjl): op with highest priority should run
-          // first without switching to another thread.
-          if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) {
-            op_queue.push_back(pending_op);
-          } else {
-            if (op_to_run == nullptr) {
-              op_to_run = pending_op;
-            } else {
-              RunOpAsync(op_deps, pending_op, complete_q);
-            }
-          }
-        }
-      }
-
-      if (op_to_run != nullptr) {
-        op_queue.push_front(op_to_run);
-      }
-    }
-    --remaining_;
-    complete_q->Push(complete);
-  });
-}
-
-void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
-  atomic_op_deps_ = prepare_pool_.enqueue([&] {
-    auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>;
-    for (auto &pair : op_deps_) {
-      (*op_deps)[pair.first] = pair.second;
-    }
-    return std::unique_ptr<
-        std::unordered_map<OpHandleBase *, std::atomic<int>>>(op_deps);
-  });
-}
-
-const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }
-
-void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) {
-  if (strategy_.num_threads_ == 1 && !dynamic_cast<FetchOpHandle *>(op)) {
-    traced_ops_.emplace_back(op);
-  }
-}
-
-void FastThreadedSSAGraphExecutor::ExecutionFinal(
-    std::vector<OpHandleBase *> *fetch_ops) {
-  VLOG(3) << "caught exception " << exception_.Type() << ", rethrow it";
-  ClearFetchOp(graph_, fetch_ops);
-  exception_.ReThrow();
-}
-
-void FastThreadedSSAGraphExecutor::RunTracedOps(
-    const std::vector<OpHandleBase *> &traced_ops) {
-  for (auto &op : traced_ops) {
-    if (exception_.IsCaught()) {
-      return;
-    }
-    RunOpSync(op);
-  }
-}
-
-void FastThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) {
-  try {
-    if (VLOG_IS_ON(10)) {
-      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
-    }
-    if (LIKELY(!strategy_.dry_run_)) {
-      op->Run(strategy_.use_cuda_);
-    }
-    VLOG(10) << op << " " << op->Name() << " Done ";
-  } catch (...) {
-    exception_.Catch(std::current_exception());
-  }
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
deleted file mode 100644
index 5d11c2cfd9ed6a8b49aa6ee01c89969dc75c21a6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <ThreadPool.h>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/blocking_queue.h"
-#include "paddle/fluid/framework/details/exception_holder.h"
-#include "paddle/fluid/framework/details/execution_strategy.h"
-#include "paddle/fluid/framework/details/ssa_graph_executor.h"
-
-namespace paddle {
-namespace framework {
-class Scope;
-namespace details {
-
-class OpHandleBase;
-class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
- public:
-  FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
-                               const std::vector<Scope *> &local_scopes,
-                               const std::vector<Scope *> &local_exec_scopes,
-                               const std::vector<platform::Place> &places,
-                               ir::Graph *graph);
-  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
-  const ir::Graph &Graph() const override;
-
- private:
-  // Note(zcd): the ThreadPool should be placed last so that ThreadPool should
-  // be destroyed first.
-  ExecutionStrategy strategy_;
-  std::vector<Scope *> local_scopes_;
-  std::vector<Scope *> local_exec_scopes_;
-  std::vector<platform::Place> places_;
-  ir::Graph *graph_;
-
-  std::unordered_map<OpHandleBase *, int> op_deps_;
-  std::vector<OpHandleBase *> bootstrap_ops_;
-
-  platform::DeviceContextPool fetch_ctxs_;
-  std::atomic<int> remaining_;
-
-  std::future<
-      std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
-      atomic_op_deps_;
-  ExceptionHolder exception_;
-
-  ::ThreadPool pool_;
-  ::ThreadPool prepare_pool_;
-
-  std::vector<OpHandleBase *> traced_ops_;
-
-  bool RunOp(OpHandleBase *op,
-             const std::shared_ptr<BlockingQueue<size_t>> &complete_q,
-             size_t *complete);
-
-  void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-                  OpHandleBase *op,
-                  const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
-
-  void PrepareAtomicOpDeps();
-
-  inline void RecordOps(OpHandleBase *op);
-
-  inline void ExecutionFinal(std::vector<OpHandleBase *> *fetch_ops);
-
-  inline void RunOpSync(OpHandleBase *op);
-
-  void RunTracedOps(const std::vector<OpHandleBase *> &traced_ops);
-
-  void InsertFetchOps(
-      const std::vector<std::string> &fetch_tensors, FeedFetchList *fetches,
-      std::unordered_map<std::string, std::vector<VarHandleBase *>>
-          *fetched_vars,
-      std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-      std::vector<OpHandleBase *> *fetch_ops,
-      std::vector<OpHandleBase *> *ready_fetch_ops);
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
deleted file mode 100644
index 127183a32e938de57ce4f7cb5aed4e72f8f09682..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
-
-#include <string>
-
-namespace paddle {
-namespace framework {
-namespace details {
-FetchBarrierOpHandle::FetchBarrierOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places)
-    // fetch_barrier op always run on place0, but output on all places.
-    : OpHandleBase(node),
-      op_(framework::OpRegistry::CreateOp(*node->Op())),
-      local_scopes_(local_scopes),
-      places_(places),
-      run_scope_(local_scopes[0]),
-      place_(places[0]) {
-  for (auto &p : places) {
-    this->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p));
-  }
-}
-
-bool FetchBarrierOpHandle::IsMultiDeviceTransfer() {
-  // override IsMultiDeviceTransfer to return true
-  return true;
-}
-
-void FetchBarrierOpHandle::RunImpl() {
-  WaitInputVarGenerated(place_);
-
-  auto run_func = [this]() { op_->Run(*local_exec_scopes_[0], place_); };
-
-  if (is_lock_and_record_event_free_) {
-    run_func();
-  } else {
-    this->RunAndRecordEvent(run_func);
-  }
-}
-
-bool FetchBarrierOpHandle::NeedWait(VarHandleBase *in_var) {
-  bool need_wait =
-      in_var && in_var->GeneratedOp() &&
-      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
-  return need_wait;
-}
-
-std::string FetchBarrierOpHandle::Name() const { return op_->Type(); }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.h b/paddle/fluid/framework/details/fetch_barrier_op_handle.h
deleted file mode 100644
index d1f7e08b28e7d8291c11bd61588c978f591060c2..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fetch_barrier_op_handle.h
+++ /dev/null
@@ -1,63 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-// **NOTE**: fetch_barrier op is special it outputs all recved variables on
-// all places if there are multiple places, must init with
-// multiple dev_ctxes_ !!!!
-
-struct FetchBarrierOpHandle : public OpHandleBase {
- public:
-  FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                       const std::vector<platform::Place> &places);
-
-  bool IsMultiDeviceTransfer() override;
-
-  std::string Name() const override;
-
- protected:
-  void RunImpl() override;
-
-  std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
-
-  bool NeedWait(VarHandleBase *in_var) override;
-
- private:
-  std::unique_ptr<OperatorBase> op_;
-  std::vector<Scope *> local_scopes_;
-  std::vector<platform::Place> places_;
-  Scope *run_scope_;
-  platform::Place place_;
-
-  bool is_lock_and_record_event_free_{false};
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
deleted file mode 100644
index 221dec7235322f2a6fb6a1ce2e7563f8cdeeeba5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/fetch_op_handle.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                             std::vector<Scope *> *local_scopes,
-                             std::vector<Scope *> *local_exec_scopes)
-    : OpHandleBase(node),
-      data_(data),
-      offset_(offset),
-      local_scopes_(local_scopes),
-      local_exec_scopes_(local_exec_scopes) {}
-
-FetchOpHandle::~FetchOpHandle() {}
-
-void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
-  PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
-}
-
-void FetchOpHandle::WaitAndMergeCPUTensors() const {
-  std::vector<const LoDTensor *> tensors_ptr;
-  tensors_ptr.reserve(tensors_.size());
-  for (auto &t : tensors_) {
-    tensors_ptr.emplace_back(&t);
-  }
-  data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
-}
-
-void FetchOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name());
-  WaitInputVarGenerated(platform::CPUPlace());
-
-  tensors_.resize(inputs_.size());
-  platform::CPUPlace cpu;
-  auto &scopes = *local_exec_scopes_;
-
-  for (size_t i = 0; i < inputs_.size(); ++i) {
-    auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
-    auto &scope = scopes.at(var_handle->scope_idx());
-    auto *var = scope->FindVar(var_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                            var_handle->name());
-
-    auto &t = var->Get<framework::LoDTensor>();
-    if (t.IsInitialized() && t.numel() > 0) {
-      if (platform::is_gpu_place(t.place())) {
-#ifdef PADDLE_WITH_CUDA
-        TensorCopy(t, cpu, &tensors_[i]);
-#endif
-      } else {
-        tensors_[i].ShareDataWith(t);
-      }
-    } else {
-      tensors_[i].clear();
-      tensors_[i].Resize({0});
-    }
-    tensors_[i].set_lod(t.lod());
-  }
-
-  this->WaitAndMergeCPUTensors();
-}
-
-void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
-  auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place);
-  for (auto *input : inputs_) {
-    if (input->GeneratedOp()) {
-      input->GeneratedOp()->RecordWaitEventOnCtx(cpu_ctx);
-    }
-  }
-}
-
-bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
-
-std::string FetchOpHandle::Name() const { return "Fetch"; }
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
deleted file mode 100644
index f3af4e61e2ba7664275eaed5f34c05940d0ec582..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ /dev/null
@@ -1,62 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct FetchOpHandle : public OpHandleBase {
- public:
-  FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                std::vector<Scope *> *local_scopes,
-                std::vector<Scope *> *local_exec_scopes);
-
-  ~FetchOpHandle();
-
-  void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
-
-  void WaitAndMergeCPUTensors() const;
-
-  std::string Name() const override;
-
-  bool IsMultiDeviceTransfer() override;
-
- protected:
-  void RunImpl() override;
-
-  std::vector<Scope *> GetLocalScopes() override { return *local_scopes_; }
-
-  void WaitInputVarGenerated(const platform::Place &place) override;
-
- private:
-  FeedFetchList *data_;
-  size_t offset_;
-  std::vector<Scope *> *local_scopes_;
-  std::vector<Scope *> *local_exec_scopes_;
-  std::vector<LoDTensor> tensors_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
deleted file mode 100644
index dce4e36e02a4d22724be63b8774c593463dd4567..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ /dev/null
@@ -1,223 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
-#include <algorithm>
-#include <utility>
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/reduce_and_gather.h"
-#include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_bool(skip_fused_all_reduce_check, false, "");
-namespace paddle {
-namespace framework {
-namespace details {
-
-typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
-    GradientAndLoDTensor;
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-FusedAllReduceOpHandle::FusedAllReduceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
-    const platform::NCCLCommunicator *ctxs)
-    : AllReduceOpHandle(node, local_scopes, places, ctxs),
-      num_of_all_reduce_(num_of_all_reduce) {}
-#else
-FusedAllReduceOpHandle::FusedAllReduceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
-    : AllReduceOpHandle(node, local_scopes, places),
-      num_of_all_reduce_(num_of_all_reduce) {}
-#endif
-
-void FusedAllReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name());
-  VLOG(4) << this->DebugString();
-
-  WaitInputVarGenerated();
-  // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
-  // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-
-  size_t place_num = places_.size();
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), place_num * num_of_all_reduce_,
-      "The NoDummyInputSize should be equal to the number of places.");
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-
-  // Note: some gradient op doesn't have CUDAKernel, so the gradients of
-  // those op are in CPUPlace, in this case, the all reduce should not be fused.
-  if (InputIsInDifferentPlace(in_var_handles)) {
-    for (size_t j = 0; j < num_of_all_reduce_; ++j) {
-      std::vector<VarHandle *> dev_inputs;
-      std::vector<VarHandle *> dev_outputs;
-      dev_inputs.reserve(place_num);
-      dev_outputs.reserve(place_num);
-      for (size_t idx = 0; idx < place_num; ++idx) {
-        dev_inputs.emplace_back(in_var_handles.at(j * place_num + idx));
-        dev_outputs.emplace_back(out_var_handles.at(j * place_num + idx));
-      }
-      AllReduceImpl(dev_inputs, dev_outputs);
-    }
-  } else {
-    FusedAllReduceFunc(in_var_handles, out_var_handles);
-  }
-}
-
-void FusedAllReduceOpHandle::FusedAllReduceFunc(
-    const std::vector<VarHandle *> &in_var_handles,
-    const std::vector<VarHandle *> &out_var_handles) {
-  size_t place_num = places_.size();
-
-  GradientAndLoDTensor grads_tensor;
-  grads_tensor.resize(place_num);
-
-  int64_t numel = -1;
-  auto dtype = static_cast<framework::proto::VarType::Type>(0);
-  for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
-    auto &g_tensor = grads_tensor.at(scope_idx);
-    g_tensor.reserve(num_of_all_reduce_);
-
-    GetGradLoDTensor(scope_idx, in_var_handles, out_var_handles, &g_tensor);
-
-    int64_t element_num = 0;
-    framework::proto::VarType::Type ele_dtype =
-        static_cast<framework::proto::VarType::Type>(0);
-    GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);
-
-    if (scope_idx == 0) {
-      numel = element_num;
-      dtype = ele_dtype;
-    }
-
-    PADDLE_ENFORCE_EQ(ele_dtype, dtype);
-
-    // Check whether the address space is contiguous.
-    std::sort(
-        g_tensor.begin(), g_tensor.end(),
-        [](const std::pair<std::string, const LoDTensor *> &grad1,
-           const std::pair<std::string, const LoDTensor *> &grad2) -> bool {
-          return grad1.second->data<void>() < grad2.second->data<void>();
-        });
-
-    size_t size_of_dtype = framework::SizeOfType(dtype);
-    for (size_t k = 1; k < g_tensor.size(); ++k) {
-      const void *cur_address = g_tensor.at(k - 1).second->data<void>();
-      int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
-      void *infer_next_address = reinterpret_cast<void *>(
-          reinterpret_cast<uintptr_t>(cur_address) + offset);
-      const void *next_address = g_tensor.at(k).second->data<void>();
-
-      VLOG(10) << string::Sprintf(
-          "Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer "
-          "input[%d] address: 0X%02x. The offset: %d",
-          k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
-          next_address, k, infer_next_address, offset);
-      PADDLE_ENFORCE_EQ(infer_next_address, next_address,
-                        "The address is not consistent.");
-    }
-  }
-
-  if (!FLAGS_skip_fused_all_reduce_check) {
-    for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
-      for (size_t j = 1; j < num_of_all_reduce_; ++j) {
-        PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
-                          grads_tensor.at(scope_idx).at(j).first);
-      }
-    }
-  }
-
-  std::vector<const void *> lod_tensor_data;
-  lod_tensor_data.reserve(place_num);
-  for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
-    auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
-    lod_tensor_data.emplace_back(data);
-  }
-  std::vector<std::string> grad_var_names;
-  grad_var_names.reserve(place_num);
-  for (auto &grad_t : grads_tensor) {
-    grad_var_names.emplace_back(grad_t.at(0).first);
-  }
-
-  AllReduceFunc(lod_tensor_data, dtype, numel, this->places_, grad_var_names);
-}
-
-bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
-    const std::vector<VarHandle *> &in_var_handles) const {
-  for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
-    auto *local_scope = local_exec_scopes_[scope_idx];
-    size_t place_num = places_.size();
-    for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
-      auto var_name = in_var_handles[j]->name();
-      auto var = local_scope->FindVar(var_name);
-      PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
-      auto &lod_tensor = var->Get<LoDTensor>();
-      if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-void FusedAllReduceOpHandle::GetGradLoDTensor(
-    const size_t &scope_idx, const std::vector<VarHandle *> &in_var_handles,
-    const std::vector<VarHandle *> &out_var_handles,
-    std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
-  auto *local_scope = local_exec_scopes_[scope_idx];
-  size_t place_num = places_.size();
-  for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
-    auto var_name = in_var_handles[j]->name();
-    PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
-    auto var = local_scope->FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
-    auto &lod_tensor = var->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx),
-                      "%s(%d) is not in the right place.", var_name, scope_idx);
-    grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
-  }
-}
-
-void FusedAllReduceOpHandle::GetDTypeAndNumel(
-    const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
-    proto::VarType::Type *dtype, int64_t *numel) const {
-  *numel = 0;
-  size_t size_of_dtype = 0;
-  for (size_t i = 0; i < grad_tensor.size(); ++i) {
-    // Get dtype
-    auto ele_type = grad_tensor.at(i).second->type();
-    if (i == 0) {
-      *dtype = ele_type;
-      size_of_dtype = framework::SizeOfType(ele_type);
-    }
-    PADDLE_ENFORCE_EQ(ele_type, *dtype);
-
-    // Get element number
-    int64_t len = grad_tensor.at(i).second->numel();
-    PADDLE_ENFORCE_GT(len, 0);
-    *numel +=
-        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
-  }
-}
-
-std::string FusedAllReduceOpHandle::Name() const { return "fused_all_reduce"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
deleted file mode 100644
index f6a11c4e504a7144807f02cedac612a837465058..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+++ /dev/null
@@ -1,76 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/framework/details/nccl_op_handle.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-struct FusedAllReduceOpHandle : public AllReduceOpHandle {
-  FusedAllReduceOpHandle(ir::Node *node,
-                         const std::vector<Scope *> &local_scopes,
-                         const std::vector<platform::Place> &places,
-                         const size_t num_of_all_reduce,
-                         const platform::NCCLCommunicator *ctxs);
-#else
-struct FusedAllReduceOpHandle : public AllReduceOpHandle {
-  FusedAllReduceOpHandle(ir::Node *node,
-                         const std::vector<Scope *> &local_scopes,
-                         const std::vector<platform::Place> &places,
-                         const size_t num_of_all_reduce);
-#endif
-  std::string Name() const override;
-
- protected:
-  void RunImpl() override;
-
- private:
-  size_t num_of_all_reduce_;
-
-  // Check the dtype of the input
-  void GetDTypeAndNumel(
-      const std::vector<std::pair<std::string, const LoDTensor *>> &g_tensor,
-      proto::VarType::Type *dtype, int64_t *total_num) const;
-
-  // Get gradient's name and LoDTensor
-  void GetGradLoDTensor(const size_t &scope_idx,
-                        const std::vector<VarHandle *> &in_var_handles,
-                        const std::vector<VarHandle *> &out_var_handles,
-                        std::vector<std::pair<std::string, const LoDTensor *>>
-                            *grad_tensor) const;
-
-  bool InputIsInDifferentPlace(
-      const std::vector<VarHandle *> &in_var_handles) const;
-
-  void FusedAllReduceFunc(const std::vector<VarHandle *> &in_var_handles,
-                          const std::vector<VarHandle *> &out_var_handles);
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
deleted file mode 100644
index 59c5da0de8c114823a1cad3e6d65c92081b5a2b6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-void FusedBroadcastOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name());
-
-  if (places_.size() == 1UL) return;
-
-  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
-
-  WaitInputVarGenerated();
-
-  size_t place_num = places_.size();
-  PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
-
-  for (size_t i = 0; i < in_var_handles.size(); ++i) {
-    BroadcastOneVar(
-        *in_var_handles[i],
-        std::vector<VarHandle *>(out_var_handles.begin() + i * place_num,
-                                 out_var_handles.begin() + (i + 1) * place_num),
-        local_exec_scopes_);
-  }
-}
-
-std::string FusedBroadcastOpHandle::Name() const { return "fused_broadcast"; }
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
deleted file mode 100644
index e43d545c9c0d062a8814a2f5d27f9ca20290b9dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ /dev/null
@@ -1,57 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/broadcast_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/device_context.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct FusedBroadcastOpHandle : public BroadcastOpHandle {
- public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  FusedBroadcastOpHandle(ir::Node *node,
-                         const std::vector<Scope *> local_scopes,
-                         const std::vector<platform::Place> &places,
-                         const platform::NCCLContextMap *nccl_ctx)
-      : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {}
-#else
-  FusedBroadcastOpHandle(ir::Node* node, const std::vector<Scope*> local_scopes,
-                         const std::vector<platform::Place>& places)
-      : BroadcastOpHandle(node, local_scopes, places) {}
-#endif
-  std::string Name() const override;
-
- protected:
-  void RunImpl() override;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
deleted file mode 100644
index 49404509a6fba0a6568c5db39a7bc744418f07a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
-#include <memory>
-#include <unordered_map>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
-  std::vector<std::string> out_varnames_;
-  std::vector<std::unique_ptr<ir::Node>> nodes_;
-
-  void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) {
-    nodes_.clear();
-    // initialize scope and var
-    std::unordered_map<Scope*, Scope*> scope_map;
-    for (size_t i = 0; i < place_list_.size(); ++i) {
-      local_scopes_.push_back(&(g_scope_.NewScope()));
-      Scope& local_scope = local_scopes_.back()->NewScope();
-      for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
-        local_scope.Var("out_var" + std::to_string(j));
-        if (i == j) local_scope.Var("in_var" + std::to_string(j));
-      }
-      param_scopes_.emplace_back(&local_scope);
-      scope_map.emplace(local_scopes_.back(), param_scopes_.back());
-    }
-
-    // create op handle node
-    nodes_.emplace_back(
-        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
-    if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      op_handle_ = new FusedBroadcastOpHandle(
-          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
-#else
-      PADDLE_THROW("CUDA is not supported.");
-#endif
-    } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      op_handle_ = new FusedBroadcastOpHandle(
-          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
-#else
-      op_handle_ = new FusedBroadcastOpHandle(nodes_.back().get(),
-                                              local_scopes_, place_list_);
-#endif
-    }
-
-    op_handle_->SetLocalExecScopes(scope_map);
-
-    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      // add input var handle
-      nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i),
-                                                ir::Node::Type::kVariable));
-      VarHandle* in_var_handle = new VarHandle(
-          nodes_.back().get(), 1, input_scope_idxes[i],
-          "in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]);
-      vars_.emplace_back(in_var_handle);
-      op_handle_->AddInput(in_var_handle);
-
-      // add output var handle
-      for (size_t j = 0; j < place_list_.size(); ++j) {
-        nodes_.emplace_back(ir::CreateNodeForTest(
-            "out_node" + std::to_string(i), ir::Node::Type::kVariable));
-        VarHandle* out_var_handle =
-            new VarHandle(nodes_.back().get(), 2, j,
-                          "out_var" + std::to_string(i), place_list_[j]);
-        vars_.emplace_back(out_var_handle);
-        op_handle_->AddOutput(out_var_handle);
-      }
-    }
-  }
-
-  void TestFusedBroadcastLoDTensor(std::vector<size_t> input_scope_idxes) {
-    std::vector<std::vector<float>> send_vec;
-    f::LoD lod{{0, 10, 20}};
-    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string varname("in_var" + std::to_string(i));
-      float val_scalar = static_cast<float>(i);
-      send_vec.push_back(
-          InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
-    }
-
-    op_handle_->Run(false);
-
-    WaitAll();
-    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string& varname("out_var" + std::to_string(i));
-      for (size_t j = 0; j < place_list_.size(); ++j) {
-        LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
-      }
-    }
-  }
-
-  void TestFusedBroadcastSelectedRows(std::vector<size_t> input_scope_idxes) {
-    std::vector<std::vector<float>> send_vector;
-    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
-                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
-    int height = static_cast<int>(kDims[0] * 2);
-    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string varname("in_var" + std::to_string(i));
-      float val_scalar = static_cast<float>(i);
-      send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
-                                             rows, height, val_scalar));
-    }
-
-    op_handle_->Run(false);
-
-    WaitAll();
-    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string& varname("out_var" + std::to_string(i));
-      for (size_t j = 0; j < place_list_.size(); ++j) {
-        SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
-                          height);
-      }
-    }
-  }
-};
-
-TEST(FusedBroadcastTester, CPULodTensor) {
-  TestFusedBroadcastOpHandle test_op;
-  std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(false);
-  test_op.InitFusedBroadcastOp(input_scope_idxes);
-  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
-}
-
-TEST(FusedBroadcastTester, CPUSelectedRows) {
-  TestFusedBroadcastOpHandle test_op;
-  std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(false);
-  test_op.InitFusedBroadcastOp(input_scope_idxes);
-  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(FusedBroadcastTester, GPULodTensor) {
-  TestFusedBroadcastOpHandle test_op;
-  std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(true);
-  test_op.InitFusedBroadcastOp(input_scope_idxes);
-  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
-}
-
-TEST(FusedBroadcastTester, GPUSelectedRows) {
-  TestFusedBroadcastOpHandle test_op;
-  std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(true);
-  test_op.InitFusedBroadcastOp(input_scope_idxes);
-  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
-}
-#endif
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
deleted file mode 100644
index a039c6200e394eebf6c44846ce2b0bf5d773e764..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/gather_op_handle.h"
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/variable_visitor.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-GatherOpHandle::GatherOpHandle(ir::Node *node,
-                               const std::vector<Scope *> &local_scopes,
-                               const std::vector<platform::Place> &places)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-
-void GatherOpHandle::RunImpl() {
-  if (places_.size() == 1) return;
-  // the input and output may have dummy var.
-  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
-
-  VarHandle *out_var_handle;
-  {
-    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
-                      "The number of output should be one.");
-    out_var_handle = out_var_handles.front();
-  }
-
-  auto &var_scopes = local_exec_scopes_;
-
-  auto in_0_handle = in_var_handles[0];
-  auto pre_in_var =
-      var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
-
-  PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
-                 "Currently, gather_op only can gather SelectedRows.");
-
-  // Wait input done, this Wait is asynchronous operation
-  WaitInputVarGenerated();
-
-  auto &pre_in_value = pre_in_var->Get<framework::SelectedRows>();
-  std::vector<int64_t> out_rows;
-  std::vector<Tensor> in_tensors;
-
-  // Gather the inputs
-  for (auto *in_handle : in_var_handles) {
-    auto *in_var =
-        var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
-
-    auto &in_sr_value = in_var->Get<framework::SelectedRows>();
-
-    auto &in_sr_rows = in_sr_value.rows();
-    out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
-    in_tensors.emplace_back(in_sr_value.value());
-  }
-
-  // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
-  platform::Place t_out_p = out_var_handle->place();
-  if (platform::is_gpu_place(pre_in_value.place())) {
-    PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                   "Places of input and output must be all on GPU.");
-  } else {
-    t_out_p = platform::CPUPlace();
-  }
-
-  auto out_var = var_scopes.at(out_var_handle->scope_idx())
-                     ->FindVar(out_var_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(out_var);
-  auto out_value = out_var->GetMutable<framework::SelectedRows>();
-  out_value->set_height(pre_in_value.height());
-  out_value->set_rows(out_rows);
-  size_t rows = out_rows.size();
-  DDim out_dim = pre_in_value.GetCompleteDims();
-  out_dim[0] = static_cast<int64_t>(rows);
-  out_value->mutable_value()->Resize(out_dim).mutable_data(
-      t_out_p, pre_in_value.value().type());
-  Tensor *out_tensor = out_value->mutable_value();
-
-  // copy
-  auto dev_ctx = dev_ctxes_.at(out_var_handle->place());
-  RunAndRecordEvent(out_var_handle->place(), [in_tensors, out_tensor, &dev_ctx,
-                                              t_out_p] {
-    int s = 0, e = 0;
-    for (size_t j = 0; j < in_tensors.size(); ++j) {
-      e += in_tensors[j].dims()[0];
-      auto sub_out = out_tensor->Slice(s, e);
-      paddle::framework::TensorCopy(in_tensors[j], t_out_p, *dev_ctx, &sub_out);
-      s = e;
-    }
-  });
-}
-
-std::string GatherOpHandle::Name() const { return "gather"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
deleted file mode 100644
index ac87b246b50f8e0df1d0cc082087d4128a79384b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ /dev/null
@@ -1,52 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct GatherOpHandle : public OpHandleBase {
- public:
-  GatherOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                 const std::vector<platform::Place> &places);
-
-  std::string Name() const override;
-
-  bool IsMultiDeviceTransfer() override { return false; };
-
- protected:
-  void RunImpl() override;
-
-  std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
-
- private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
deleted file mode 100644
index 5d8562e7046fd2f1609ba34ce2dd71b9fa28be77..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ /dev/null
@@ -1,212 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/gather_op_handle.h"
-#include <memory>
-#include <unordered_map>
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-// test data amount
-const f::DDim kDims = {20, 20};
-
-struct TestGatherOpHandle {
-  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
-  std::vector<Scope*> local_scopes_;
-  std::vector<Scope*> param_scopes_;
-  Scope g_scope_;
-  OpHandleBase* op_handle_;
-  std::vector<VarHandleBase*> vars_;
-  std::vector<p::Place> gpu_list_;
-  std::vector<std::unique_ptr<ir::Node>> nodes_;
-
-  void WaitAll() {
-    for (size_t j = 0; j < ctxs_.size(); ++j) {
-      ctxs_[j]->Wait();
-    }
-  }
-
-  void InitCtxOnGpu(bool use_gpu) {
-    if (use_gpu) {
-#ifdef PADDLE_WITH_CUDA
-      int count = p::GetCUDADeviceCount();
-      if (count <= 1) {
-        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
-                        "device count is "
-                     << count;
-        exit(0);
-      }
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CUDAPlace(i);
-        gpu_list_.push_back(p);
-        ctxs_.emplace_back(new p::CUDADeviceContext(p));
-      }
-#else
-      PADDLE_THROW("CUDA is not support.");
-#endif
-    } else {
-      int count = 8;
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CPUPlace();
-        gpu_list_.push_back(p);
-        ctxs_.emplace_back(new p::CPUDeviceContext(p));
-      }
-    }
-  }
-
-  void InitGatherOp(size_t input_scope_idx) {
-    nodes_.clear();
-    std::unordered_map<Scope*, Scope*> scope_map;
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      local_scopes_.push_back(&(g_scope_.NewScope()));
-      Scope& local_scope = local_scopes_.back()->NewScope();
-      local_scope.Var("input");
-      param_scopes_.emplace_back(&local_scope);
-      scope_map.emplace(local_scopes_.back(), param_scopes_.back());
-    }
-    param_scopes_[input_scope_idx]->Var("out");
-
-    nodes_.emplace_back(
-        ir::CreateNodeForTest("node", ir::Node::Type::kOperation).release());
-    op_handle_ =
-        new GatherOpHandle(nodes_.back().get(), local_scopes_, gpu_list_);
-
-    op_handle_->SetLocalExecScopes(scope_map);
-
-    // add input
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
-      nodes_.emplace_back(
-          ir::CreateNodeForTest("node1", ir::Node::Type::kVariable).release());
-      auto* in_var_handle =
-          new VarHandle(nodes_.back().get(), 1, j, "input", gpu_list_[j]);
-      vars_.emplace_back(in_var_handle);
-      op_handle_->AddInput(in_var_handle);
-    }
-
-    // add dummy var
-    nodes_.emplace_back(
-        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable).release());
-    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
-    DummyVarHandle* in_dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back());
-    in_dummy_var_handle->ClearGeneratedOp();
-    op_handle_->AddInput(in_dummy_var_handle);
-
-    // add output
-    nodes_.emplace_back(
-        ir::CreateNodeForTest("node3", ir::Node::Type::kVariable).release());
-    auto* out_var_handle =
-        new VarHandle(nodes_.back().get(), 2, input_scope_idx, "out",
-                      gpu_list_[input_scope_idx]);
-    vars_.emplace_back(out_var_handle);
-    op_handle_->AddOutput(out_var_handle);
-
-    // add dummy var
-    nodes_.emplace_back(
-        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable).release());
-    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
-    DummyVarHandle* dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back());
-    op_handle_->AddOutput(dummy_var_handle);
-  }
-
-  void TestGatherSelectedRows(size_t output_scope_idx) {
-    int height = kDims[0] * 2;
-    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
-                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
-    std::vector<float> send_vector(f::product(kDims));
-    for (size_t k = 0; k < send_vector.size(); ++k) {
-      send_vector[k] = k;
-    }
-
-    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
-         ++input_scope_idx) {
-      auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
-      PADDLE_ENFORCE_NOT_NULL(in_var);
-      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
-      auto value = in_selected_rows->mutable_value();
-      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
-
-      in_selected_rows->set_height(height);
-      in_selected_rows->set_rows(rows);
-
-      paddle::framework::TensorFromVector<float>(
-          send_vector, *(ctxs_[input_scope_idx]), value);
-      value->Resize(kDims);
-    }
-
-    auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
-
-    auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
-    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
-
-    out_selected_rows->mutable_value()->ShareDataWith(
-        in_selected_rows->value());
-
-    op_handle_->Run(false);
-
-    WaitAll();
-
-    p::CPUPlace cpu_place;
-
-    auto& out_select_rows = out_var->Get<f::SelectedRows>();
-    auto rt = out_select_rows.value();
-
-    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
-    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
-    }
-
-    f::Tensor result_tensor;
-    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
-    float* ct = result_tensor.data<float>();
-
-    for (int64_t j = 0;
-         j < f::product(kDims) * static_cast<int64_t>(gpu_list_.size()); ++j) {
-      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
-    }
-  }
-};
-
-TEST(GatherTester, TestCPUGatherTestSelectedRows) {
-  TestGatherOpHandle test_op;
-  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
-  test_op.InitGatherOp(input_scope_idx);
-  test_op.TestGatherSelectedRows(input_scope_idx);
-}
-
-#ifdef PADDLE_WITH_CUDA
-
-TEST(GatherTester, TestGPUGatherTestSelectedRows) {
-  TestGatherOpHandle test_op;
-  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
-  test_op.InitGatherOp(input_scope_idx);
-  test_op.TestGatherSelectedRows(input_scope_idx);
-}
-#endif
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h
deleted file mode 100644
index d139f8488309eecf89c924a346ab0e574edc86dc..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/graph_test_base.h
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <iostream>
-#include <iterator>
-#include <string>
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-
-class DummyOp : public OperatorBase {
- public:
-  DummyOp(const std::string& type, const VariableNameMap& inputs,
-          const VariableNameMap& outputs, const AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const Scope& scope,
-               const platform::Place& place) const override {}
-};
-
-class SumOpMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "").AsDuplicable();
-    AddOutput("Out", "");
-    AddComment("");
-  }
-};
-
-class AssignOpMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "").AsDuplicable();
-    AddOutput("Out", "");
-    AddComment("");
-  }
-};
-
-class SplitOpMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "");
-    AddOutput("Out", "").AsDuplicable();
-    AddComment("");
-  }
-};
-
-class DummyVarTypeInference : public VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto& inputs = ctx->Input("X");
-    auto type = ctx->GetType(inputs.front());
-    auto out_var_name = ctx->Output("Out").front();
-    ctx->SetType(out_var_name, type);
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_helper.cc b/paddle/fluid/framework/details/multi_devices_helper.cc
deleted file mode 100644
index 0242274a16c50508f2c0294264c175515c7293ef..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/multi_devices_helper.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace details {}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
deleted file mode 100644
index 49bc85dbfb820224dd1a39fbaeaadb752f0a1664..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/details/var_handle.h"
-
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/platform/place.h"
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-// all variable in each devices.
-// The outside vector is the device vector. Each element of this vector is a
-// map from variable name to variables. The variables, who have the same name,
-// will have a differsent version. The offset in the
-// `std::vector<VarHandle*>` is the version of varaibles.
-typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
-    GraphVars;
-constexpr char kGraphVars[] = "vars";
-
-constexpr char kNRanks[] = "nranks";
-
-constexpr char kPlaces[] = "places";
-constexpr char kLocalScopes[] = "local_scopes";
-constexpr char kNCCLCtxs[] = "nccl_ctxs";
-constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce";
-
-// aux variables to represent dependency. Useful to resolve data hazard.
-typedef std::unordered_set<VarHandleBase *> GraphDepVars;
-constexpr char kGraphDepVars[] = "dep_vars";
-
-typedef std::unordered_set<std::string> FusedVars;
-constexpr char kFusedVars[] = "fused_vars";
-constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
-
-typedef std::string FusedOptType;
-constexpr char kFusedOptType[] = "fused_opt_type";
-
-typedef std::vector<std::string> FusedGrads;
-constexpr char kFusedGrads[] = "fused_gradients";
-
-typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
-constexpr char kParamsAndDenseGrads[] = "params_and_dense_grads";
-constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";
-
-typedef std::vector<ProgramDesc> ProgramDescs;
-constexpr char kProgramDescs[] = "program_descs";
-
-typedef std::unordered_set<std::string> PinnedVars;
-constexpr char kPinnedVars[] = "pinned_vars";
-
-typedef std::vector<std::vector<std::pair<std::string, std::string>>>
-    GroupParamsAndGrads;
-constexpr char kGroupParamsAndDenseGrads[] = "group_params_dense_grads";
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h
deleted file mode 100644
index 56dacccafaace8b094e35e4a6a85bbd78c82e10a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/nccl_op_handle.h
+++ /dev/null
@@ -1,235 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/dynload/nccl.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-
-DECLARE_bool(sync_nccl_allreduce);
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-class NCCLOpHandleBase : public OpHandleBase {
- public:
-  NCCLOpHandleBase(ir::Node* node, const std::vector<platform::Place>& places,
-                   const platform::NCCLCommunicator* nccl_ctxs)
-      : OpHandleBase(node), places_(places), nccl_ctxs_(nccl_ctxs) {
-    if (nccl_ctxs == nullptr) {
-      return;
-    }
-    // init device context
-    auto default_nccl_ctxs = nccl_ctxs_->DefaultFlatCtx();
-    for (auto& p : places_) {
-      this->SetDeviceContext(p, default_nccl_ctxs->DevCtx(p));
-    }
-  }
-  virtual ~NCCLOpHandleBase() {
-    for (auto& ev : inter_events_) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
-    }
-    for (auto& ev : exter_events_) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
-    }
-  }
-  void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
-    PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0");
-    run_order_ = run_order;
-    use_hierarchical_allreduce_ = use_hierarchical_allreduce;
-
-    VLOG(10) << "SetRunEnv "
-             << " run_order:" << run_order
-             << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce
-             << ", nccl_ctx_:" << nccl_ctxs_;
-
-    if (nccl_ctxs_ == nullptr) {
-      return;
-    }
-
-    if (!use_hierarchical_allreduce_) {
-      auto ctxs = nccl_ctxs_->GetFlatCtx(run_order);
-      for (auto& p : places_) {
-        this->SetDeviceContext(p, ctxs->DevCtx(p));
-      }
-      return;
-    }
-
-    PADDLE_ENFORCE(places_.size() == 1,
-                   "HierarchicalAllReduce run one proc with one card mode.");
-
-    for (auto& p : places_) {
-      auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order);
-      this->SetDeviceContext(p, ctxs->DevCtx(p));
-    }
-
-    for (auto& p : dev_ctxes_) {
-      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
-      if (inter_events_.find(dev_id) != inter_events_.end()) {
-        continue;
-      }
-
-      PADDLE_ENFORCE(cudaSetDevice(dev_id));
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id],
-                                              cudaEventDisableTiming));
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id],
-                                              cudaEventDisableTiming));
-      VLOG(10) << "Create events on dev_id:" << dev_id
-               << ", inter_event:" << &inter_events_[dev_id]
-               << ", exter_event:" << &exter_events_[dev_id];
-    }
-  }
-
-  void FlatNCCLAllReduce(platform::Place place, const void* sendbuff,
-                         void* recvbuff, size_t count, ncclDataType_t datatype,
-                         ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
-    auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
-    int dev_id = boost::get<platform::CUDAPlace>(place).device;
-    auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
-    auto stream = nccl_ctx.stream();
-    auto comm = nccl_ctx.comm_;
-
-    VLOG(10) << "before all reduce buffer:" << sendbuff << ", numel:" << count
-             << ", dev_id:" << dev_id << ", dtype:" << datatype
-             << ", place:" << place;
-
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-        sendbuff, recvbuff, count, datatype, op, comm, stream));
-  }
-
-  void NCCLAllReduce(platform::Place place, const void* sendbuff,
-                     void* recvbuff, size_t count, ncclDataType_t datatype,
-                     ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
-    if (!use_hierarchical_allreduce_) {
-      FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
-      return;
-    }
-
-    HierarchicalAllReduce(place, sendbuff, recvbuff, count, datatype, op);
-  }
-
-  void HierarchicalAllReduce(platform::Place place, const void* sendbuff,
-                             void* recvbuff, size_t count,
-                             ncclDataType_t datatype, ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
-    InterReduce(place, sendbuff, recvbuff, count, datatype, op);
-    // When a trainer is not in exter allreduce ring
-    // they need not to call this.
-    if (nccl_ctxs_->NeedExterAllReduce()) {
-      ExterAllReduce(place, recvbuff, recvbuff, count, datatype, op);
-    }
-    InterBroadCast(place, recvbuff, count, datatype, op);
-  }
-
- protected:
-  void InterReduce(platform::Place place, const void* sendbuff, void* recvbuff,
-                   size_t count, ncclDataType_t datatype, ncclRedOp_t op) {
-    auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_);
-    int dev_id = boost::get<platform::CUDAPlace>(place).device;
-    auto& nccl_ctx = nccl_ctxs->at(dev_id);
-    auto stream = nccl_ctx.stream();
-    auto comm = nccl_ctx.comm_;
-
-    VLOG(10) << "before all reduce"
-             << " run_order:" << run_order_ << ", buffer:" << sendbuff
-             << ", numel:" << count << ", dev_id:" << dev_id
-             << ", dtype:" << datatype << ", place:" << place
-             << ", stream:" << stream;
-
-    PADDLE_ENFORCE(platform::dynload::ncclReduce(
-        sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
-
-    cudaEventRecord(inter_events_.at(dev_id), stream);
-
-    if (FLAGS_sync_nccl_allreduce) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream),
-                     "sync HierarchicalAllReduce inter stream error");
-    }
-  }
-
-  void ExterAllReduce(platform::Place place, const void* sendbuff,
-                      void* recvbuff, size_t count, ncclDataType_t datatype,
-                      ncclRedOp_t op) {
-    auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_);
-    PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_);
-    int dev_id = boost::get<platform::CUDAPlace>(place).device;
-    auto& nccl_ctx = nccl_ctxs->at(dev_id);
-    auto stream = nccl_ctx.stream();
-    auto comm = nccl_ctx.comm_;
-
-    VLOG(10) << "before all reduce run_order:" << run_order_
-             << "buffer:" << sendbuff << ", numel:" << count
-             << ", dev_id:" << dev_id << ", dtype:" << datatype
-             << ", place:" << place << ", stream:" << stream;
-
-    cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
-
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-        sendbuff, recvbuff, count, datatype, op, comm, stream));
-
-    cudaEventRecord(exter_events_.at(dev_id), stream);
-
-    if (FLAGS_sync_nccl_allreduce) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream),
-                     "sync HierarchicalAllReduce exter stream error");
-    }
-  }
-
-  void InterBroadCast(platform::Place place, void* sendbuff, size_t count,
-                      ncclDataType_t datatype, ncclRedOp_t op) {
-    auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_);
-    int dev_id = boost::get<platform::CUDAPlace>(place).device;
-    auto& nccl_ctx = nccl_ctxs->at(dev_id);
-    auto stream = nccl_ctx.stream();
-    auto comm = nccl_ctx.comm_;
-
-    VLOG(10) << "before InterBroadCast buffer:" << sendbuff
-             << ", numel:" << count << ", dev_id:" << dev_id
-             << ", dtype:" << datatype << ", place:" << place
-             << ", stream:" << stream;
-
-    cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
-    PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0,
-                                                comm, stream));
-  }
-
- protected:
-  std::vector<platform::Place> places_;
-  const platform::NCCLCommunicator* nccl_ctxs_{nullptr};
-  // When multi trainer call collective function, they need run the same order.
-  // Or the program will hang.So we use allreduce_deps_pass to set this
-  // run_order_.
-  int run_order_{0};
-  // Use 2d allreduce or not.
-  bool use_hierarchical_allreduce_{false};
-
- private:
-  // hierarchical needed events
-  std::unordered_map<int, cudaEvent_t> inter_events_;
-  std::unordered_map<int, cudaEvent_t> exter_events_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
deleted file mode 100644
index b2fa31f73b9d96ef7fe56dd59ca9b4b18f114c95..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ /dev/null
@@ -1,254 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include <map>
-#include <unordered_set>
-
-namespace paddle {
-namespace framework {
-namespace details {
-std::string OpHandleBase::DebugString() const {
-  std::stringstream ss;
-  ss << Name() << "(";
-  for (auto *var : inputs_) {
-    ss << var->DebugString() << ", ";
-  }
-  ss << ") --> (";
-  for (auto *var : outputs_) {
-    ss << var->DebugString() << ", ";
-  }
-  ss << ")\n";
-  return ss.str();
-}
-
-OpHandleBase::~OpHandleBase() {
-#ifdef PADDLE_WITH_CUDA
-  for (auto &ev : events_) {
-    if (ev.second) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
-    }
-  }
-#endif
-}
-
-void OpHandleBase::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
-  for (auto &p : dev_ctxes_) {
-    int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
-    PADDLE_ENFORCE(cudaSetDevice(dev_id));
-    PADDLE_ENFORCE(
-        cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
-  }
-  if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
-    for (auto &out_var : outputs_) {
-      auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
-      if (out_var_handle) {
-        int dev_id =
-            boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
-        out_var_handle->SetGenerateEvent(events_.at(dev_id));
-      }
-    }
-  } else {
-    PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
-                      "%s should have only one dev_ctx.", Name());
-    auto &place = dev_ctxes_.begin()->first;
-    int dev_id = boost::get<platform::CUDAPlace>(place).device;
-    for (auto &out_var : outputs_) {
-      auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
-      if (out_var_handle) {
-        PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()),
-                       "The place of output(%s) is not consistent with the "
-                       "place of current op(%s).",
-                       out_var_handle->Name(), Name());
-        out_var_handle->SetGenerateEvent(events_.at(dev_id));
-      }
-    }
-  }
-#endif
-}
-
-void OpHandleBase::Run(bool use_cuda) {
-#ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) {
-    InitCUDA();
-  }
-#else
-  PADDLE_ENFORCE(!use_cuda);
-#endif
-
-  RunImpl();
-}
-
-void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
-#ifdef PADDLE_WITH_CUDA
-  PADDLE_ENFORCE_NOT_NULL(waited_ctx);
-  if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
-    for (auto &dev_ctx : dev_ctxes_) {
-      PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
-      dev_ctx.second->Wait();
-    }
-  } else {
-    auto stream =
-        static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
-    for (auto &ev : events_) {
-      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
-    }
-  }
-#else
-  for (auto &dev_ctx : dev_ctxes_) {
-    dev_ctx.second->Wait();
-  }
-#endif
-}
-
-void OpHandleBase::AddInput(VarHandleBase *in) {
-  this->inputs_.emplace_back(in);
-  node_->inputs.push_back(in->Node());
-  in->AddOutput(this, this->Node());
-}
-
-void OpHandleBase::AddOutput(VarHandleBase *out) {
-  outputs_.emplace_back(out);
-  node_->outputs.push_back(out->Node());
-  out->AddInput(this, this->Node());
-}
-
-void OpHandleBase::WaitInputVarGenerated() {
-  for (auto in_var : inputs_) {
-    if (NeedWait(in_var)) {
-      // Dummy Variable is used to represent dependencies between operators, so
-      // there doesn't add event for it.
-      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
-      if (in_var_handle) {
-        auto &place = in_var_handle->place();
-        if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
-          auto stream =
-              static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
-                  ->stream();
-          PADDLE_ENFORCE(
-              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
-#else
-          PADDLE_THROW("Doesn't compile the GPU.");
-#endif
-        }
-        // There are nothing to do when the place is CPUPlace.
-      }
-    }
-  }
-}
-
-void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
-  for (auto in_var : inputs_) {
-    if (NeedWait(in_var)) {
-      // Dummy Variable is used to represent dependencies between operators, so
-      // there doesn't add event for it.
-      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
-      if (in_var_handle) {
-        if (platform::is_gpu_place(in_var_handle->place())) {
-#ifdef PADDLE_WITH_CUDA
-          auto stream = static_cast<platform::CUDADeviceContext *>(
-                            dev_ctxes_.at(in_var_handle->place()))
-                            ->stream();
-          PADDLE_ENFORCE(
-              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
-#else
-          PADDLE_THROW("Doesn't compile the GPU.");
-#endif
-        }
-        // There are nothing to do when the place is CPUPlace.
-      }
-    }
-  }
-}
-
-size_t OpHandleBase::NoDummyInputSize() const {
-  size_t cnt = 0;
-  for (auto *in : inputs_) {
-    if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
-      ++cnt;
-    }
-  }
-  return cnt;
-}
-
-bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
-  return in_var && in_var->GeneratedOp();
-}
-
-void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
-#ifdef PADDLE_WITH_CUDA
-  if (!events_.empty()) {  // Use event
-    std::function<void()> method = callback;
-    for (auto &p : dev_ctxes_) {
-      method = [method, p, this]() {
-        VLOG(10) << "cudadevicecontext:"
-                 << static_cast<platform::CUDADeviceContext *>(p.second)
-                 << ", dev_id:"
-                 << boost::get<platform::CUDAPlace>(p.first).device;
-
-        static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
-            events_.at(boost::get<platform::CUDAPlace>(p.first).device),
-            method);
-      };
-    }
-    method();
-  } else {
-#endif
-    callback();
-#ifdef PADDLE_WITH_CUDA
-  }
-#endif
-}
-
-void OpHandleBase::RunAndRecordEvent(platform::Place p,
-                                     const std::function<void()> &callback) {
-#ifdef PADDLE_WITH_CUDA
-  if (platform::is_cpu_place(p) || events_.empty()) {
-    callback();
-  } else {
-    auto *ctx = dev_ctxes_.at(p);
-    auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
-    cuda_ctx->RecordEvent(events_.at(boost::get<platform::CUDAPlace>(p).device),
-                          callback);
-  }
-#else
-  callback();
-#endif
-}
-
-size_t OpHandleBase::NotReadyInputSize() const {
-  std::unordered_set<VarHandleBase *> res;
-  for (auto *var : inputs_) {
-    if (var->GeneratedOp() != nullptr) {
-      res.emplace(var);
-    }
-  }
-  return res.size();
-}
-
-void OpHandleBase::SetLocalExecScopes(
-    const std::unordered_map<Scope *, Scope *> &scope_map) {
-  local_exec_scopes_.clear();
-  auto scopes = GetLocalScopes();
-  for (auto *scope : scopes) {
-    auto iter = scope_map.find(scope);
-    PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found");
-    local_exec_scopes_.emplace_back(iter->second);
-  }
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
deleted file mode 100644
index 16016dd3cfa8f8b2c668db01bdbf7ff7518c7ee9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ /dev/null
@@ -1,144 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/details/var_handle.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace framework {
-
-class Scope;
-
-namespace details {
-
-// Wraps ir::Node and provide helper utilities.
-// It's responsible for populating necessary fields of ir::Node.
-class OpHandleBase {
- public:
-  /**
-   * NOTE(zjl): Some op should have higher priority than others.
-   * The higher priority op would run first without switching
-   * threads in Executor.
-   */
-  enum Priority { kHighest = 0, kNormal = 1 };
-
-  // Owned by `node`. No need to be deleted explicitly.
-  explicit OpHandleBase(ir::Node *node) : node_(node) {
-    node_->WrappedBy(this);
-  }
-
-  virtual ~OpHandleBase();
-
-  std::string DebugString() const;
-
-  virtual Priority GetPriority() const { return kNormal; }
-
-  virtual std::string Name() const = 0;
-
-  void Run(bool use_cuda);
-
-  virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);
-
-  void AddInput(VarHandleBase *in);
-
-  void AddOutput(VarHandleBase *out);
-
-  // This method adds the wait events of all the input on all the device
-  // context.
-  // NODE: This Wait is asynchronous operation.
-  virtual void WaitInputVarGenerated();
-
-  // This method adds the wait events of all the input on the specified device
-  // context.
-  // NODE: This Wait is asynchronous operation.
-  virtual void WaitInputVarGenerated(const platform::Place &place);
-
-  virtual bool NeedWait(VarHandleBase *in_var);
-
-  // If the Op involves data transfer of multiple devices that
-  // will likely block other computations.
-  virtual bool IsMultiDeviceTransfer() { return false; }
-
-  const platform::DeviceContext *DeviceContext(platform::Place place) {
-    auto it = dev_ctxes_.find(place);
-    return it != dev_ctxes_.end() ? it->second : nullptr;
-  }
-  const std::map<platform::Place, platform::DeviceContext *> &DeviceContext() {
-    return dev_ctxes_;
-  }
-
-  void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
-    dev_ctxes_[place] = ctx_;
-  }
-
-  const std::vector<VarHandleBase *> &Inputs() const { return inputs_; }
-
-  size_t NoDupInputSize() const {
-    std::unordered_set<VarHandleBase *> res;
-    for (auto *var : inputs_) {
-      res.emplace(var);
-    }
-    return res.size();
-  }
-
-  size_t NotReadyInputSize() const;
-
-  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
-
-  size_t NoDummyInputSize() const;
-
-  ir::Node *Node() { return node_; }
-
-  const ir::Node *Node() const { return node_; }
-
-  void SetLocalExecScopes(
-      const std::unordered_map<Scope *, Scope *> &scope_map);
-
- protected:
-  virtual std::vector<Scope *> GetLocalScopes() = 0;
-
-  void RunAndRecordEvent(const std::function<void()> &callback);
-
-  void RunAndRecordEvent(platform::Place p,
-                         const std::function<void()> &callback);
-
-  virtual void RunImpl() = 0;
-
-  virtual void InitCUDA();
-
-  ir::Node *node_;
-  std::vector<VarHandleBase *> inputs_;
-  std::vector<VarHandleBase *> outputs_;
-  std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
-
-  std::vector<Scope *> local_exec_scopes_;
-
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<int, cudaEvent_t> events_;
-#endif
-
-  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
deleted file mode 100644
index 0f03ca51da778d4ce8aefa493d2227e789614679..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/op_registry.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <tuple>
-#include <type_traits>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/grad_op_desc_maker.h"
-#include "paddle/fluid/framework/inplace_op_inference.h"
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-enum OpInfoFillType {
-  kOperator = 0,
-  kOpProtoAndCheckerMaker = 1,
-  kGradOpDescMaker = 2,
-  kVarTypeInference = 3,
-  kShapeInference = 4,
-  kInplaceOpInference = 5,
-  kNoNeedBufferVarsInference = 6,
-  kUnknown = -1
-};
-
-namespace internal {
-template <typename T, OpInfoFillType kType>
-struct TypePair {
-  using Type = T;
-  static constexpr OpInfoFillType kFillType = kType;
-};
-
-using OpRegistryClasses = std::tuple<                                // NOLINT
-    TypePair<OperatorBase, kOperator>,                               // NOLINT
-    TypePair<OpProtoAndCheckerMaker, kOpProtoAndCheckerMaker>,       // NOLINT
-    TypePair<GradOpDescMakerBase, kGradOpDescMaker>,                 // NOLINT
-    TypePair<VarTypeInference, kVarTypeInference>,                   // NOLINT
-    TypePair<InferShapeBase, kShapeInference>,                       // NOLINT
-    TypePair<InplaceOpInference, kInplaceOpInference>,               // NOLINT
-    TypePair<NoNeedBufferVarsInference, kNoNeedBufferVarsInference>  // NOLINT
-    >;
-
-static constexpr int kOpRegistryClassNumber =
-    std::tuple_size<OpRegistryClasses>::value;
-
-template <typename T, int kPos, bool kIsBounded /* = true*/>
-struct IsMatchedBaseTypeImpl {
-  using PairType = typename std::tuple_element<kPos, OpRegistryClasses>::type;
-  static constexpr bool kValue =
-      std::is_base_of<typename PairType::Type, T>::value;
-};
-
-template <typename T, int kPos>
-struct IsMatchedBaseTypeImpl<T, kPos, false> {
-  static constexpr bool kValue = false;
-};
-
-template <typename T, int kPos>
-static inline constexpr bool IsMatchedBaseType() {
-  return IsMatchedBaseTypeImpl<
-      T, kPos, (kPos >= 0 && kPos < kOpRegistryClassNumber)>::kValue;
-}
-
-template <typename T, int kStart, int kEnd, bool kIsEnd, bool kIsMatched>
-struct OpInfoFillTypeGetterImpl {};
-
-// This case should not happen
-template <typename T, int kStart, int kEnd>
-struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, true> {};
-
-template <typename T, int kStart, int kEnd>
-struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, false> {
-  static constexpr OpInfoFillType kType = kUnknown;
-};
-
-template <typename T, int kStart, int kEnd>
-struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, false> {
-  static constexpr OpInfoFillType kType =
-      OpInfoFillTypeGetterImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd,
-                               IsMatchedBaseType<T, kStart + 1>()>::kType;
-};
-
-template <typename T, int kStart, int kEnd>
-struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, true> {
-  using PairType = typename std::tuple_element<kStart, OpRegistryClasses>::type;
-  static constexpr OpInfoFillType kType = PairType::kFillType;
-};
-
-template <typename T>
-using OpInfoFillTypeGetter =
-    OpInfoFillTypeGetterImpl<T, 0, kOpRegistryClassNumber,
-                             kOpRegistryClassNumber == 0,
-                             IsMatchedBaseType<T, 0>()>;
-
-}  // namespace internal
-
-template <typename T>
-struct OpInfoFillTypeID {
-  static constexpr OpInfoFillType ID() {
-    return internal::OpInfoFillTypeGetter<T>::kType;
-  }
-};
-
-template <typename T, OpInfoFillType = OpInfoFillTypeID<T>::ID()>
-struct OpInfoFiller;
-
-template <size_t I, bool at_end, typename... ARGS>
-class OperatorRegistrarRecursive;
-
-template <size_t I, typename... ARGS>
-class OperatorRegistrarRecursive<I, false, ARGS...> {
- public:
-  using T = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
-  OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {
-    OpInfoFiller<T> fill;
-    fill(op_type, info);
-    constexpr auto size = sizeof...(ARGS);
-    OperatorRegistrarRecursive<I + 1, I + 1 == size, ARGS...> reg(op_type,
-                                                                  info);
-    (void)(reg);
-  }
-};
-
-template <size_t I, typename... ARGS>
-class OperatorRegistrarRecursive<I, true, ARGS...> {
- public:
-  OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {}
-};
-
-template <typename T>
-struct OpInfoFiller<T, kOperator> {
-  void operator()(const char* op_type, OpInfo* info) const {
-    info->creator_ = [](const std::string& type, const VariableNameMap& inputs,
-                        const VariableNameMap& outputs,
-                        const AttributeMap& attrs) {
-      return new T(type, inputs, outputs, attrs);
-    };
-  }
-};
-
-template <typename T>
-struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
-  void operator()(const char* op_type, OpInfo* info) const {
-    info->proto_ = new proto::OpProto;
-    info->checker_ = new OpAttrChecker();
-    T maker;
-    maker(info->proto_, info->checker_);
-    info->proto_->set_type(op_type);
-    PADDLE_ENFORCE(
-        info->proto_->IsInitialized(),
-        "Fail to initialize %s's OpProto, because %s is not initialized",
-        op_type, info->proto_->InitializationErrorString());
-  }
-};
-
-template <typename T>
-struct OpInfoFiller<T, kGradOpDescMaker> {
-  void operator()(const char* op_type, OpInfo* info) const {
-    info->grad_op_maker_ = [](
-        const OpDesc& fwd_op,
-        const std::unordered_set<std::string>& no_grad_set,
-        std::unordered_map<std::string, std::string>* grad_to_var,
-        const std::vector<BlockDesc*>& grad_block) {
-      T maker(fwd_op, no_grad_set, grad_to_var, grad_block);
-      return maker();
-    };
-
-    info->use_default_grad_op_desc_maker_ =
-        std::is_base_of<DefaultGradOpDescMaker<true>, T>::value ||
-        std::is_base_of<DefaultGradOpDescMaker<false>, T>::value;
-  }
-};
-
-template <typename T>
-struct OpInfoFiller<T, kVarTypeInference> {
-  void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_var_type_ = [](InferVarTypeContext* context) {
-      T inference;
-      inference(context);
-    };
-  }
-};
-
-template <typename T>
-struct OpInfoFiller<T, kShapeInference> {
-  void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_shape_ = [](InferShapeContext* ctx) {
-      T inference;
-      inference(ctx);
-    };
-  }
-};
-
-template <typename T>
-struct OpInfoFiller<T, kInplaceOpInference> {
-  void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_inplace_ = [](const OpDesc& op_desc, bool use_cuda) {
-      T infer;
-      return infer(op_desc, use_cuda);
-    };
-  }
-};
-
-template <typename T>
-struct OpInfoFiller<T, kNoNeedBufferVarsInference> {
-  void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_no_need_buffer_vars_ = [](const VariableNameMap& inputs,
-                                          const VariableNameMap& outputs,
-                                          const AttributeMap& attrs) {
-      T infer(inputs, outputs, attrs);
-      return infer();
-    };
-  }
-};
-
-// A fake OpInfoFiller of void
-template <>
-struct OpInfoFiller<void, kUnknown> {
-  void operator()(const char* op_type, OpInfo* info) const {}
-};
-
-}  // namespace details
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
deleted file mode 100644
index 1a3c753e7d2b075eba9af98f7b206e42b51b650c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
-#include <memory>
-#include <utility>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-std::vector<std::unique_ptr<ir::Graph>>
-ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
-  std::vector<std::unique_ptr<ir::Graph>> graphs;
-  graphs.reserve(places_.size());
-  for (size_t i = 0; i < places_.size(); ++i) {
-    ProgramDesc empty;
-    graphs.emplace_back(std::unique_ptr<ir::Graph>(new ir::Graph(empty)));
-    auto &g = graphs.back();
-    g->Set(kGraphVars, new GraphVars(1UL));
-    g->Set(kGraphDepVars, new GraphDepVars);
-    auto &stale_ops =
-        graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);
-    g->Erase(details::kStaleProgramOpDescs);
-    g->Set<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs,
-                                        new std::vector<OpDesc *>(stale_ops));
-  }
-  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
-
-  for (auto &op : op_handles) {
-    auto &dev_ctx = op->DeviceContext();
-    auto &p = dev_ctx.begin()->first;
-    int dev_id = boost::get<platform::CUDAPlace>(p).device;
-    auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
-    graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
-
-    for (auto &var : op->Inputs()) {
-      auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
-      if (dummy_ptr) {
-        dev_dummys.insert(var);
-        if (graph->Nodes().count(var->Node()))
-          graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
-      }
-    }
-    for (auto &var : op->Outputs()) {
-      auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
-      if (dummy_ptr) {
-        dev_dummys.insert(var);
-        if (graph->Nodes().count(var->Node()))
-          graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
-      }
-    }
-  }
-
-  for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) {
-    auto &dev_vars = graphs[dev_id]->Get<GraphVars>(kGraphVars)[0];
-    auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_id];
-    for (auto &name_pair : origin_vars) {
-      dev_vars.emplace(name_pair.first, name_pair.second);
-      for (auto &version_pair : name_pair.second) {
-        if (graph->Nodes().count(version_pair->Node())) {
-          graphs[dev_id]->AddNode(
-              graph->RemoveNode(version_pair->Node()).release());
-        }
-      }
-    }
-  }
-
-  return graphs;
-}
-
-ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
-    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<Scope *> &local_exec_scopes,
-    const std::vector<platform::Place> &places, ir::Graph *graph)
-    : strategy_(std::move(strategy)),
-      local_scopes_(std::move(local_scopes)),
-      pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
-      places_(std::move(places)),
-      // TODO(Yancey1989): Copying graphs is not safely since it deleted the
-      // attrs.
-      graphs_(SeparateMultiDevicesGraph(graph)) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-
-  auto seq_allreduce_pass =
-      ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
-  seq_allreduce_pass->Set<bool>(kUseHierarchicalAllReduce, new bool(false));
-  for (size_t i = 0; i < graphs_.size(); ++i) {
-    graphs_[i].reset(seq_allreduce_pass->Apply(graphs_[i].release()));
-  }
-
-  // set the correct size of thread pool to each device.
-  strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
-                               ? 1UL
-                               : strategy_.num_threads_ / places_.size();
-  VLOG(1) << "set num_threads: " << strategy_.num_threads_
-          << " to run the operators of the graph on each device.";
-  for (size_t i = 0; i < places.size(); ++i) {
-    executors_.emplace_back(new details::FastThreadedSSAGraphExecutor(
-        strategy_, local_scopes_, local_exec_scopes, {places_[i]},
-        graphs_.at(i).get()));
-  }
-}
-
-std::vector<ir::Graph *> ParallelSSAGraphExecutor::Graphs() {
-  std::vector<ir::Graph *> result;
-  result.reserve(graphs_.size());
-  for (auto &g : graphs_) {
-    result.emplace_back(g.get());
-  }
-  return result;
-}
-
-FeedFetchList ParallelSSAGraphExecutor::Run(
-    const std::vector<std::string> &fetch_tensors) {
-  std::vector<std::future<FeedFetchList>> run_futures;
-
-  std::vector<FeedFetchList> fetch_data;
-  FeedFetchList ret;
-
-  fetch_data.reserve(places_.size());
-  ret.reserve(fetch_tensors.size());
-  exception_holder_.Clear();
-
-  for (size_t i = 0; i < places_.size(); ++i) {
-    auto call = [this, i, &fetch_tensors]() -> FeedFetchList {
-      try {
-        return executors_[i]->Run(fetch_tensors);
-      } catch (...) {
-        exception_holder_.Catch(std::current_exception());
-      }
-      return FeedFetchList();
-    };
-
-    if (pool_) {
-      run_futures.emplace_back(pool_->enqueue(std::move(call)));
-    } else {
-      fetch_data.emplace_back(call());
-    }
-  }
-
-  if (pool_) {
-    for (auto &f : run_futures) {
-      if (exception_holder_.IsCaught()) {
-        f.wait();
-      } else {
-        fetch_data.emplace_back(f.get());
-      }
-    }
-  }
-  if (exception_holder_.IsCaught()) {
-    exception_holder_.ReThrow();
-  }
-
-  for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
-    std::vector<const LoDTensor *> lodtensor_ptrs;
-    lodtensor_ptrs.reserve(local_scopes_.size());
-    for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
-      lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx));
-    }
-    ret.emplace_back();
-    ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
-  }
-  return ret;
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
deleted file mode 100644
index 6889c54dd4c6906b179036386f8d38dad04f5c9f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-#include "ThreadPool.h"
-#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-class ParallelSSAGraphExecutor : public SSAGraphExecutor {
- public:
-  ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
-                           const std::vector<Scope *> &local_scopes,
-                           const std::vector<Scope *> &local_exec_scopes,
-                           const std::vector<platform::Place> &places,
-                           ir::Graph *graph);
-  ~ParallelSSAGraphExecutor() final = default;
-
-  const ir::Graph &Graph() const override { return *graphs_[0]; }
-
-  std::vector<ir::Graph *> Graphs();
-
-  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
-
- private:
-  std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
-      ir::Graph *graph);
-
-  ExecutionStrategy strategy_;
-  std::vector<Scope *> local_scopes_;
-  std::unique_ptr<::ThreadPool> pool_{nullptr};
-  std::vector<platform::Place> places_;
-  std::vector<std::unique_ptr<ir::Graph>> graphs_;
-
-  std::vector<std::unique_ptr<details::FastThreadedSSAGraphExecutor>>
-      executors_;
-  ExceptionHolder exception_holder_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
deleted file mode 100644
index 11c4621fde394057144462bb513aab63187512e3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ /dev/null
@@ -1,137 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include <map>
-#include <vector>
-#include "paddle/fluid/framework/details/reduce_and_gather.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct ReduceLoDTensor {
-  const std::vector<const LoDTensor *> &src_tensors_;
-  LoDTensor &dst_tensor_;
-
-  ReduceLoDTensor(const std::vector<const LoDTensor *> &src, LoDTensor *dst)
-      : src_tensors_(src), dst_tensor_(*dst) {}
-
-  template <typename T>
-  void apply() const {
-    PADDLE_ENFORCE(!src_tensors_.empty());
-    auto &t0 = *src_tensors_[0];
-    PADDLE_ENFORCE_NE(t0.numel(), 0);
-
-    dst_tensor_.Resize(t0.dims());
-    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-
-    for (size_t i = 0; i < src_tensors_.size(); ++i) {
-      auto &t = *src_tensors_[i];
-      if (dst == t.data<T>()) {
-        continue;
-      }
-
-      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
-      PADDLE_ENFORCE_EQ(t.type(), t0.type());
-      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
-                     [](T a, T b) -> T { return a + b; });
-    }
-  }
-};
-
-struct ReduceBufferData {
-  const std::vector<const void *> &src_data_;
-  void *dst_data_;
-  int64_t numel_;
-
-  ReduceBufferData(const std::vector<const void *> &src, void *dst,
-                   int64_t numel)
-      : src_data_(src), dst_data_(dst), numel_(numel) {}
-
-  template <typename T>
-  void apply() const {
-    T *dst_data = reinterpret_cast<T *>(dst_data_);
-    for (size_t i = 0; i < src_data_.size(); ++i) {
-      auto srd_data = reinterpret_cast<const T *>(src_data_[i]);
-      VLOG(10) << "dst: " << dst_data_ << ", " << srd_data;
-      if (srd_data == dst_data_) {
-        continue;
-      }
-
-      std::transform(srd_data, srd_data + numel_, dst_data, dst_data,
-                     [](T a, T b) -> T { return a + b; });
-    }
-  }
-};
-
-struct GatherLocalSelectedRowsFunctor {
-  GatherLocalSelectedRowsFunctor(
-      const std::vector<const SelectedRows *> &src_selected_rows,
-      const std::vector<platform::Place> &in_places,
-      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
-      const platform::Place &out_place, SelectedRows *dst_selected_rows)
-      : dev_ctxes_(dev_ctxes),
-        in_places_(in_places),
-        out_place_(out_place),
-        dst_selected_rows_(dst_selected_rows) {
-    PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false);
-
-    std::vector<int64_t> out_rows;
-
-    for (auto in_sr_ptr : src_selected_rows) {
-      auto &in_sr = *in_sr_ptr;
-      in_tensors_.emplace_back(in_sr.value());
-      out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
-    }
-
-    auto &pre_in = src_selected_rows[0];
-
-    auto &dst_tensor = *dst_selected_rows_;
-    dst_tensor.set_height(pre_in->height());
-    dst_tensor.set_rows(out_rows);
-    size_t rows = out_rows.size();
-    DDim out_dim = pre_in->GetCompleteDims();
-    out_dim[0] = static_cast<int64_t>(rows);
-    dst_tensor.mutable_value()->Resize(out_dim);
-    dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
-  }
-
-  void operator()() {
-    auto *out_tensor = dst_selected_rows_->mutable_value();
-    // copy
-    int s = 0, e = 0;
-    for (size_t j = 0; j < in_tensors_.size(); ++j) {
-      e += in_tensors_[j].dims()[0];
-      auto sub_out = out_tensor->Slice(s, e);
-      paddle::framework::TensorCopy(in_tensors_[j], out_place_,
-                                    *(dev_ctxes_.at(in_places_[j])), &sub_out);
-      s = e;
-    }
-  }
-
- private:
-  const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes_;
-  std::vector<platform::Place> in_places_;
-  std::vector<Tensor> in_tensors_;
-
-  platform::Place out_place_;
-  SelectedRows *dst_selected_rows_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
deleted file mode 100644
index f5245713671c951be7791a67bec7345ed6393faf..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ /dev/null
@@ -1,334 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/reduce_op_handle.h"
-#include <memory>
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/reduce_and_gather.h"
-#include "paddle/fluid/framework/details/variable_visitor.h"
-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/collective_client.h"
-#include "paddle/fluid/operators/distributed/collective_server.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#endif
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_bool(
-    cpu_deterministic, false,
-    "Whether to make the result of computation deterministic in CPU side.");
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-std::once_flag CollectiveContext::init_flag_;
-std::unique_ptr<CollectiveContext> CollectiveContext::context_;
-
-static inline std::string GetRemoteVarName(const std::string &var_name,
-                                           int trainer_id) {
-  return string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id);
-}
-
-void ReduceOpHandle::Wait(
-    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes) {
-  // TODO(gongwb): use event wait?
-  for (auto &dev_ctx : dev_ctxes) {
-    dev_ctx.second->Wait();
-  }
-}
-
-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
-template <typename DevCtx, typename DataType>
-void ReduceOpHandle::GatherSelectedRows(
-    const std::vector<const SelectedRows *> &src_selected_rows,
-    const std::vector<platform::Place> &in_places,
-    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
-    VarHandle *out_var_handle, const platform::Place &out_place,
-    SelectedRows *dst_selected_rows) {
-  const CollectiveContext &collective_context =
-      *CollectiveContext::GetInstance();
-
-  // 1. gather local selected rows, merge them
-  std::string gathered_var_name = out_var_handle->name() + "_gathered_tmp";
-  auto scope = local_scopes_.at(out_var_handle->scope_idx());
-  auto gathered_var_mid = scope->Var(gathered_var_name);
-  auto gathered_select_rows =
-      gathered_var_mid->GetMutable<framework::SelectedRows>();
-  GatherLocalSelectedRowsFunctor functor(
-      src_selected_rows, in_places, dev_ctxes, out_place, gathered_select_rows);
-  WaitInputVarGenerated();
-  functor();
-
-  // FIXME(gongwb): remove this Wait.
-  Wait(dev_ctxes);
-
-  // merge them
-  auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
-  std::string merged_var_name =
-      GetRemoteVarName(out_var_handle->name(), collective_context.trainer_id_);
-  auto merged_select_rows =
-      scope->Var(merged_var_name)->GetMutable<SelectedRows>();
-  operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
-  merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows);
-
-  // 2. start collective server if it doesn't exist
-  operators::distributed::CollectiveServer *server =
-      operators::distributed::CollectiveServer::GetInstance(
-          collective_context.endpoints_[collective_context.trainer_id_],
-          collective_context.endpoints_.size() - 1);
-
-  auto rpc_server = server->GetRPCServer();
-  rpc_server->RegisterVar(merged_var_name,
-                          operators::distributed::kRequestGetMonomerVariable,
-                          scope, merged_dev_ctx);
-
-  // 3. gather them from all remote nodes.
-  std::vector<const SelectedRows *> remote;
-  operators::distributed::CollectiveClient *client =
-      operators::distributed::CollectiveClient::GetInstance();
-
-  std::vector<operators::distributed::RemoteVar> vars;
-  for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) {
-    if (i == (unsigned)collective_context.trainer_id_) continue;
-
-    operators::distributed::RemoteVar var;
-    var.trainer_id_ = i;
-    var.var_name_ = GetRemoteVarName(out_var_handle->name(), i);
-    var.ep_ = collective_context.endpoints_[i];
-
-    vars.push_back(var);
-    VLOG(4) << "gather from:" << var.String();
-  }
-
-  // erase gathered vars
-  merged_dev_ctx->Wait();
-  scope->EraseVars(std::vector<std::string>{gathered_var_name});
-
-  PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
-  PADDLE_ENFORCE(remote.size() == vars.size());
-
-  // 4. merged local selected rows.
-  std::vector<const SelectedRows *> all;
-  all.resize(collective_context.endpoints_.size());
-  for (auto v : vars) {
-    all[v.trainer_id_] =
-        scope->FindVar(v.var_name_)->GetMutable<SelectedRows>();
-  }
-  all[collective_context.trainer_id_] = merged_select_rows;
-
-  merge_func(*merged_dev_ctx, all, dst_selected_rows);
-
-  rpc_server->WaitVarBarrier(merged_var_name);
-  rpc_server->ClearVar(merged_var_name);
-
-  // 5. clear mid vars
-  std::vector<std::string> tmp_vars{merged_var_name};
-  for (auto r : vars) {
-    tmp_vars.push_back(r.var_name_);
-  }
-  scope->EraseVars(tmp_vars);
-}
-#endif
-
-void ReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name());
-
-  if (places_.size() == 1) return;
-  // the input and output may have dummy var.
-  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
-
-  VarHandle *out_var_handle;
-  {
-    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
-
-    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
-                      "The number of output should be one.");
-    out_var_handle = out_var_handles.front();
-  }
-
-  auto in_0_handle = in_var_handles[0];
-
-  auto &var_scopes = local_exec_scopes_;
-
-  auto pre_in_var =
-      var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
-
-  // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
-  std::vector<platform::Place> in_places;  // used to get dev_ctx
-  for (auto *in_handle : in_var_handles) {
-    in_places.emplace_back(in_handle->place());
-    auto in_var =
-        var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
-  }
-
-  auto out_var = var_scopes.at(out_var_handle->scope_idx())
-                     ->FindVar(out_var_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(out_var);
-
-  // NOTE: The tensors' Place of input and output must be all on GPU or all on
-  // CPU.
-  auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
-  platform::Place t_out_p;
-  if (platform::is_gpu_place(in_p)) {
-    PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()),
-                   "Places of input and output must be all on GPU.");
-    t_out_p = out_var_handle->place();
-  } else {
-    t_out_p = platform::CPUPlace();
-  }
-
-  if (pre_in_var->IsType<framework::SelectedRows>()) {
-    this->RunAndRecordEvent([&] {
-      std::vector<const SelectedRows *> in_selected_rows =
-          GetInputValues<SelectedRows>(in_var_handles, var_scopes);
-
-      const CollectiveContext &collective_context =
-          *CollectiveContext::GetInstance();
-      VLOG(10) << "GatherSelectedRows CollectiveContext:"
-               << collective_context.String();
-
-      // TODO(gongwb): add cpu support
-      if (collective_context.endpoints_.size() <= 1 ||
-          is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
-        GatherLocalSelectedRowsFunctor functor(
-            in_selected_rows, in_places, dev_ctxes_, t_out_p,
-            out_var->GetMutable<framework::SelectedRows>());
-        WaitInputVarGenerated();
-        functor();
-        return;
-      }
-
-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
-      if (in_selected_rows[0]->value().type() ==
-          framework::proto::VarType::FP32) {
-        GatherSelectedRows<platform::CUDADeviceContext, float>(
-            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
-            out_var->GetMutable<framework::SelectedRows>());
-      } else if (in_selected_rows[0]->value().type() ==
-                 framework::proto::VarType::FP64) {
-        GatherSelectedRows<platform::CUDADeviceContext, double>(
-            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
-            out_var->GetMutable<framework::SelectedRows>());
-      } else {
-        PADDLE_THROW("only support double or float when gather SelectedRows");
-      }
-#endif
-    });
-  } else {
-    std::vector<const LoDTensor *> lod_tensors =
-        GetInputValues<LoDTensor>(in_var_handles, var_scopes);
-
-    if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
-      WaitInputVarGenerated();
-      this->RunAndRecordEvent([&] {
-        // FIXME(zcd): The order of summing is important,
-        // especially when the type of data is float or double.
-        // For example, the result of `a+b+c+d` may be different
-        // with the result of `c+a+b+d`, so the summing order should be fixed.
-        if (!FLAGS_cpu_deterministic) {
-          ReduceLoDTensor func(lod_tensors,
-                               out_var->GetMutable<framework::LoDTensor>());
-          VisitDataType(lod_tensors[0]->type(), func);
-        } else {
-          // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
-          // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
-          auto &reduce_sum_trg = *this->local_exec_scopes_[0]
-                                      ->FindVar(out_var_handle->name())
-                                      ->GetMutable<framework::LoDTensor>();
-          ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
-          VisitDataType(lod_tensors[0]->type(), func);
-
-          auto trg = out_var->GetMutable<framework::LoDTensor>();
-          if (reduce_sum_trg.data<void>() != trg->data<void>()) {
-            TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
-          }
-        }
-      });
-    } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      auto pre_in = pre_in_var->Get<framework::LoDTensor>();
-      VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
-      VariableVisitor::GetMutableTensor(out_var).mutable_data(
-          out_var_handle->place(), pre_in.type());
-
-      auto out_p = out_var_handle->place();
-      int root_id = boost::get<platform::CUDAPlace>(out_p).device;
-      std::vector<std::function<void()>> all_reduce_calls;
-      for (size_t i = 0; i < var_scopes.size(); ++i) {
-        auto &p = in_places[i];
-        auto &lod_tensor = *lod_tensors[i];
-
-        int dev_id = boost::get<platform::CUDAPlace>(p).device;
-        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
-
-        void *buffer = const_cast<void *>(lod_tensor.data<void>());
-        void *recvbuffer = nullptr;
-        if (root_id == dev_id) {
-          recvbuffer =
-              out_var->GetMutable<framework::LoDTensor>()->mutable_data(
-                  out_var_handle->place());
-        }
-
-        int type = platform::ToNCCLDataType(lod_tensor.type());
-        size_t numel = static_cast<size_t>(lod_tensor.numel());
-        all_reduce_calls.emplace_back(
-            [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
-              PADDLE_ENFORCE(platform::dynload::ncclReduce(
-                  buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
-                  ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
-            });
-      }
-
-      WaitInputVarGenerated();
-      this->RunAndRecordEvent([&] {
-        platform::NCCLGroupGuard guard;
-        for (auto &call : all_reduce_calls) {
-          call();
-        }
-      });
-#else
-      PADDLE_THROW("CUDA is not enabled.");
-#endif
-    } else {
-      PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
-    }
-  }
-}
-
-template <typename T>
-std::vector<const T *> ReduceOpHandle::GetInputValues(
-    const std::vector<VarHandle *> &in_var_handles,
-    const std::vector<Scope *> &var_scopes) const {
-  std::vector<const T *> in_selected_rows;
-  for (auto *in_handle : in_var_handles) {
-    auto &in_sr = var_scopes.at(in_handle->scope_idx())
-                      ->FindVar(in_handle->name())
-                      ->Get<T>();
-    in_selected_rows.emplace_back(&in_sr);
-  }
-  return in_selected_rows;
-}
-
-std::string ReduceOpHandle::Name() const { return "reduce"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
deleted file mode 100644
index 15064a108e79fe5ed307e46b03f90b1d74742203..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ /dev/null
@@ -1,117 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-struct CollectiveContext {
-  std::vector<std::string> endpoints_;
-  int trainer_id_{0};
-
-  std::string String() const {
-    std::stringstream ss;
-    ss << "endpoints_:";
-    for (auto e : endpoints_) {
-      ss << e << ",";
-    }
-
-    ss << "trainer_id_:" << trainer_id_;
-
-    return ss.str();
-  }
-
-  static CollectiveContext *GetInstance() {
-    std::call_once(init_flag_,
-                   [&]() { context_.reset(new CollectiveContext()); });
-    return context_.get();
-  }
-
- private:
-  static std::once_flag init_flag_;
-  static std::unique_ptr<CollectiveContext> context_;
-};
-
-struct ReduceOpHandle : public OpHandleBase {
-  std::vector<Scope *> local_scopes_;
-  std::vector<platform::Place> places_;
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  const platform::NCCLContextMap *nccl_ctxs_;
-  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                 const std::vector<platform::Place> &places,
-                 const platform::NCCLContextMap *nccl_ctxs)
-      : OpHandleBase(node),
-        local_scopes_(local_scopes),
-        places_(places),
-        nccl_ctxs_(nccl_ctxs) {
-    if (nccl_ctxs_) {
-      for (auto &p_ctx : nccl_ctxs_->contexts_) {
-        this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
-                               p_ctx.second.ctx_.get());
-      }
-    }
-  }
-#else
-  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                 const std::vector<platform::Place> &places)
-      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-#endif
-
-  std::string Name() const override;
-
-  bool IsMultiDeviceTransfer() override { return true; };
-
- protected:
-  void RunImpl() override;
-
-  std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
-
-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
-  template <typename DevCtx, typename DataType>
-  void GatherSelectedRows(
-      const std::vector<const SelectedRows *> &src_selecte_rows_,
-      const std::vector<platform::Place> &in_places,
-      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
-      VarHandle *out_var_handle, const platform::Place &out_place,
-      SelectedRows *dst_selecte_rows);
-#endif
-
-  void Wait(
-      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes);
-
-  template <typename T>
-  std::vector<const T *> GetInputValues(
-      const std::vector<VarHandle *> &in_var_handles,
-      const std::vector<Scope *> &var_scopes) const;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
deleted file mode 100644
index 664bd00fb68fc37c6d4e7624ed42a2a905f1bd25..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ /dev/null
@@ -1,294 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/reduce_op_handle.h"
-#include <unordered_map>
-#include "gtest/gtest.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-// test data amount
-const f::DDim kDims = {20, 20};
-
-struct TestReduceOpHandle {
-  bool use_gpu_;
-  Scope g_scope_;
-  std::vector<Scope *> local_scopes_;
-  std::vector<Scope *> param_scopes_;
-  OpHandleBase *op_handle_;
-  std::vector<VarHandleBase *> vars_;
-  std::vector<p::Place> gpu_list_;
-  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
-#endif
-
-  void WaitAll() {
-    for (size_t j = 0; j < ctxs_.size(); ++j) {
-      ctxs_[j]->Wait();
-    }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    if (nccl_ctxs_) {
-      nccl_ctxs_->WaitAll();
-    }
-#endif
-  }
-
-  void InitCtxOnGpu(bool use_gpu) {
-    use_gpu_ = use_gpu;
-    if (use_gpu) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      int count = p::GetCUDADeviceCount();
-      if (count <= 1) {
-        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
-                        "device count is "
-                     << count;
-        exit(0);
-      }
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CUDAPlace(i);
-        gpu_list_.push_back(p);
-        ctxs_.emplace_back(new p::CUDADeviceContext(p));
-      }
-      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
-#else
-      PADDLE_THROW("CUDA is not support.");
-#endif
-    } else {
-      int count = 8;
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CPUPlace();
-        gpu_list_.push_back(p);
-        ctxs_.emplace_back(new p::CPUDeviceContext(p));
-      }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      nccl_ctxs_.reset(nullptr);
-#endif
-    }
-  }
-
-  void InitReduceOp(size_t out_scope_idx) {
-    std::vector<std::unique_ptr<ir::Node>> nodes;
-    // init scope
-    std::unordered_map<Scope *, Scope *> scope_map;
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      local_scopes_.push_back(&(g_scope_.NewScope()));
-      Scope &local_scope = local_scopes_.back()->NewScope();
-      local_scope.Var("input");
-      param_scopes_.emplace_back(&local_scope);
-      scope_map.emplace(local_scopes_.back(), param_scopes_.back());
-    }
-    param_scopes_[out_scope_idx]->Var("out");
-
-    nodes.emplace_back(new ir::Node("node"));
-    if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
-                                          gpu_list_, nccl_ctxs_.get()));
-#else
-      PADDLE_THROW("CUDA is not support.");
-#endif
-    } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
-                                          gpu_list_, nccl_ctxs_.get()));
-#else
-      op_handle_.reset(
-          new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
-#endif
-    }
-
-    op_handle_->SetLocalExecScopes(scope_map);
-
-    // init op handle
-    // add input
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      if (!use_gpu_) {
-        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
-      }
-      nodes.emplace_back(new ir::Node("node1"));
-      auto *in_var_handle =
-          new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
-      in_var_handle->ClearGeneratedOp();
-      vars_.emplace_back(in_var_handle);
-      op_handle_->AddInput(in_var_handle);
-    }
-
-    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
-    DummyVarHandle *in_dummy_var_handle =
-        static_cast<DummyVarHandle *>(vars_.back().get());
-    in_dummy_var_handle->ClearGeneratedOp();
-    op_handle_->AddInput(in_dummy_var_handle);
-
-    // add output
-    nodes.emplace_back(new ir::Node("node2"));
-    auto *out_var_handle = new VarHandle(nodes.back().get(), 2, out_scope_idx,
-                                         "out", gpu_list_[out_scope_idx]);
-    vars_.emplace_back(out_var_handle);
-    op_handle_->AddOutput(out_var_handle);
-
-    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
-    DummyVarHandle *dummy_var_handle =
-        static_cast<DummyVarHandle *>(vars_.back().get());
-    op_handle_->AddOutput(dummy_var_handle);
-  }
-
-  void TestReduceSelectedRows(size_t output_scope_idx) {
-    int height = kDims[0] * 2;
-    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
-                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
-    std::vector<float> send_vector(f::product(kDims));
-    for (size_t k = 0; k < send_vector.size(); ++k) {
-      send_vector[k] = k;
-    }
-
-    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
-         ++input_scope_idx) {
-      auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
-      PADDLE_ENFORCE_NOT_NULL(in_var);
-      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
-      auto value = in_selected_rows->mutable_value();
-      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
-
-      in_selected_rows->set_height(height);
-      in_selected_rows->set_rows(rows);
-
-      paddle::framework::TensorFromVector<float>(
-          send_vector, *(ctxs_[input_scope_idx]), value);
-      value->Resize(kDims);
-    }
-
-    auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
-
-    auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
-    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
-
-    out_selected_rows->mutable_value()->ShareDataWith(
-        in_selected_rows->value());
-
-    op_handle_->Run(false);
-
-    WaitAll();
-
-    p::CPUPlace cpu_place;
-
-    auto &out_select_rows = out_var->Get<f::SelectedRows>();
-    auto rt = out_select_rows.value();
-
-    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
-    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
-    }
-
-    f::Tensor result_tensor;
-    f::TensorCopySync(rt, cpu_place, &result_tensor);
-    float *ct = result_tensor.data<float>();
-
-    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
-      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
-    }
-  }
-
-  void TestReduceLodTensors(size_t output_scope_idx) {
-    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
-    for (size_t k = 0; k < send_vector.size(); ++k) {
-      send_vector[k] = k;
-    }
-    f::LoD lod{{0, 10, 20}};
-
-    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
-         ++input_scope_idx) {
-      auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
-      PADDLE_ENFORCE_NOT_NULL(in_var);
-      auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
-      in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
-      in_lod_tensor->set_lod(lod);
-
-      paddle::framework::TensorFromVector<float>(
-          send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
-    }
-
-    auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
-
-    auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
-    auto in_lodtensor = in_var->Get<f::LoDTensor>();
-
-    out_lodtensor->ShareDataWith(in_lodtensor);
-
-    op_handle_->Run(false);
-
-    WaitAll();
-
-    p::CPUPlace cpu_place;
-
-    auto &rt = out_var->Get<f::LoDTensor>();
-
-    f::Tensor result_tensor;
-    f::TensorCopySync(rt, cpu_place, &result_tensor);
-    float *ct = result_tensor.data<float>();
-
-    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
-      ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
-    }
-  }
-};
-
-TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
-  TestReduceOpHandle test_op;
-  size_t out_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
-  test_op.InitReduceOp(out_scope_idx);
-  test_op.TestReduceSelectedRows(out_scope_idx);
-}
-TEST(ReduceTester, TestCPUReduceTestLodTensor) {
-  TestReduceOpHandle test_op;
-  size_t out_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
-  test_op.InitReduceOp(out_scope_idx);
-  test_op.TestReduceLodTensors(out_scope_idx);
-}
-#ifdef PADDLE_WITH_CUDA
-
-TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
-  TestReduceOpHandle test_op;
-  size_t out_scope_idx = 0;
-  test_op.InitCtxOnGpu(true);
-  test_op.InitReduceOp(out_scope_idx);
-  test_op.TestReduceSelectedRows(out_scope_idx);
-}
-
-TEST(ReduceTester, TestGPUReduceTestLodTensor) {
-  TestReduceOpHandle test_op;
-  size_t out_scope_idx = 0;
-  test_op.InitCtxOnGpu(true);
-  test_op.InitReduceOp(out_scope_idx);
-  test_op.TestReduceLodTensors(out_scope_idx);
-}
-#endif
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
deleted file mode 100644
index 8d61a103f98be81309d890f25b8ab6f41d5c3f02..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/rpc_op_handle.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
-                         Scope *local_scope, const std::string &name,
-                         const platform::Place &place)
-    : OpHandleBase(node),
-      op_(framework::OpRegistry::CreateOp(op_desc)),
-      local_scope_(local_scope),
-      name_(name),
-      place_(place) {}
-
-void RPCOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name());
-
-  for (auto *in : inputs_) {
-    auto &p = static_cast<VarHandle *>(in)->place();
-    if (ir::IsControlDepVar(*in->Node())) {
-      continue;
-    }
-    if (in->GeneratedOp()) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
-    }
-  }
-  this->RunAndRecordEvent([this] { op_->Run(*local_exec_scopes_[0], place_); });
-}
-
-std::string RPCOpHandle::Name() const { return name_; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h
deleted file mode 100644
index d86d33dd676ca066b8772ac2afbab05cf0d98b38..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/rpc_op_handle.h
+++ /dev/null
@@ -1,56 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct RPCOpHandle : public OpHandleBase {
-  RPCOpHandle(ir::Node* node, const framework::OpDesc& op_desc,
-              Scope* local_scope, const std::string& name,
-              const platform::Place& place);
-
-  std::string Name() const override;
-
-  // Delay and buffer nccl_all_reduce together can significantly increase
-  // performance. Disable this feature by returning false.
-  bool IsMultiDeviceTransfer() override { return false; };
-
- protected:
-  void RunImpl() override;
-
-  std::vector<Scope*> GetLocalScopes() override { return {local_scope_}; }
-
- private:
-  std::unique_ptr<OperatorBase> op_;
-  Scope* local_scope_;
-  const std::string name_;
-  platform::Place place_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
deleted file mode 100644
index 517dd5ee264d96a11d8b54913a1e388edc95c034..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
-#include <string>
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
-                                             Scope *scope,
-                                             platform::Place place,
-                                             platform::DeviceContext *dev_ctx,
-                                             proto::VarType::Type dtype)
-    : OpHandleBase(node),
-      coeff_(static_cast<float>(1.0 / num_dev)),
-      scope_(scope),
-      place_(place),
-      out_dtype_(dtype) {
-  this->SetDeviceContext(place_, dev_ctx);
-}
-
-ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
-
-struct ScaleLossGradFunctor {
-  float coeff_;
-  Tensor *out_;
-  platform::Place place_;
-  proto::VarType::Type out_dtype_;
-  platform::DeviceContext *ctx_;
-
-  ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
-                       proto::VarType::Type dtype, platform::DeviceContext *ctx)
-      : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
-
-  template <typename OutT>
-  void apply() const {
-    auto *out_data = out_->mutable_data<OutT>(place_);
-    if (platform::is_cpu_place(place_)) {
-      *out_data = static_cast<OutT>(coeff_);
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      OutT cast_coeff = static_cast<OutT>(coeff_);
-      auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
-      memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
-                   platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
-                   stream);
-      VLOG(10) << place_ << "RUN Scale loss grad op";
-
-#endif
-    }
-  }
-};
-
-void ScaleLossGradOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name());
-  // Doesn't wait any event
-  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name();
-
-  auto *tensor =
-      local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
-  tensor->Resize(make_ddim({1}));
-
-#ifdef PADDLE_WITH_CUDA
-  ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
-                            this->dev_ctxes_.at(place_));
-  this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
-#else
-  ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr);
-  framework::VisitDataType(out_dtype_, func);
-#endif
-}
-
-std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
deleted file mode 100644
index d4f28dbe2b261be9c5d48aa50e38edfe36bfcfd3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ /dev/null
@@ -1,51 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct ScaleLossGradOpHandle : public OpHandleBase {
-  ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
-                        platform::Place place, platform::DeviceContext *context,
-                        proto::VarType::Type dtype);
-
-  ~ScaleLossGradOpHandle() final;
-
-  std::string Name() const override;
-
- protected:
-  void RunImpl() override;
-
-  std::vector<Scope *> GetLocalScopes() override { return {scope_}; }
-
- private:
-  float coeff_;
-  Scope *scope_;
-  platform::Place place_;
-  proto::VarType::Type out_dtype_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc
deleted file mode 100644
index ecbfa17a0df444540318077d913a5be34a0f3606..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/scope_buffered_monitor.cc
+++ /dev/null
@@ -1,202 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/scope_buffered_monitor.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DECLARE_double(local_exe_sub_scope_limit);
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-static constexpr double kMB = 1 / (1024 * 1024);
-
-static void GetTensors(Variable *var,
-                       std::unordered_set<Tensor *> *tensor_set) {
-  if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) {
-    tensor_set->insert(var->GetMutable<LoDTensor>());
-  } else if (var->IsType<SelectedRows>() &&
-             var->Get<SelectedRows>().value().IsInitialized()) {
-    tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value());
-  } else if (var->IsType<LoDTensorArray>()) {
-    auto *tensor_arr = var->GetMutable<LoDTensorArray>();
-    for (auto &t : *tensor_arr) {
-      if (t.IsInitialized()) {
-        tensor_set->insert(&t);
-      }
-    }
-  }
-}
-
-static void GetTensors(Scope *scope, std::unordered_set<Tensor *> *tensor_set) {
-  for (auto &var_name : scope->LocalVarNames()) {
-    GetTensors(scope->FindVar(var_name), tensor_set);
-  }
-
-  for (auto *kid : scope->kids()) {
-    GetTensors(kid, tensor_set);
-  }
-}
-
-static size_t GetTensorMemorySize(Scope *scope, bool clear_cpu_tensor) {
-  std::unordered_set<Tensor *> tensor_set;
-  GetTensors(scope, &tensor_set);
-  size_t memory_size = 0;
-  std::unordered_set<memory::Allocation *> allocation_set;
-  for (auto *tensor : tensor_set) {
-    if (clear_cpu_tensor && platform::is_cpu_place(tensor->place())) {
-      tensor->clear();
-    } else {
-      auto allocation = tensor->Holder().get();
-      if (!allocation_set.count(allocation)) {
-        memory_size += allocation->size();
-        allocation_set.insert(allocation);
-      }
-    }
-  }
-  return memory_size;
-}
-
-size_t GetScopeVarMemorySize(Scope *scope) {
-  return GetTensorMemorySize(scope, false /*clear_cpu_tensor*/);
-}
-
-ScopeBufferedMonitor::ScopeBufferedMonitor(
-    const std::vector<platform::Place> &places,
-    const std::vector<Scope *> &local_exec_scopes)
-    : places_(places), local_exec_scopes_(local_exec_scopes) {
-  pre_local_exec_scopes_.resize(local_exec_scopes_.size());
-  post_local_exec_scopes_.resize(local_exec_scopes_.size());
-}
-
-void ScopeBufferedMonitor::Apply(const std::function<void()> &callback,
-                                 bool has_fetch) {
-  std::unique_ptr<platform::RecordEvent> pre_local_exec_scopes_event(
-      new platform::RecordEvent(
-          "ScopeBufferedMonitor::pre_local_exec_scopes_process"));
-  for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
-    pre_local_exec_scopes_.at(scope_id).clear();
-    auto scopes = local_exec_scopes_.at(scope_id)->kids();
-    VLOG(10) << "pre_local_exec_scopes[" << scope_id
-             << "] sub-scope: " << scopes.size();
-    pre_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end());
-  }
-  pre_local_exec_scopes_event.reset();
-
-  callback();
-
-  std::unique_ptr<platform::RecordEvent> post_local_exec_scopes_event(
-      new platform::RecordEvent(
-          "ScopeBufferedMonitor::post_local_exec_scopes_process"));
-  for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
-    post_local_exec_scopes_.at(scope_id).clear();
-    auto scopes = local_exec_scopes_.at(scope_id)->kids();
-    VLOG(10) << "post_local_exec_scopes[" << scope_id
-             << "] sub-scope: " << scopes.size();
-    post_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end());
-  }
-
-  history_local_exec_scopes_.emplace_back();
-  auto &incr_local_exec_scopes = history_local_exec_scopes_.back();
-  incr_local_exec_scopes.resize(local_exec_scopes_.size());
-  for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
-    for (auto &scope : post_local_exec_scopes_.at(scope_id)) {
-      if (!pre_local_exec_scopes_.at(scope_id).count(scope)) {
-        incr_local_exec_scopes.at(scope_id).insert(scope);
-      }
-    }
-
-    if (VLOG_IS_ON(10)) {
-      if (incr_local_exec_scopes.at(scope_id).size() &&
-          FLAGS_local_exe_sub_scope_limit > 0) {
-        VLOG(10)
-            << "FLAGS_local_exe_sub_scope_limit is "
-            << FLAGS_local_exe_sub_scope_limit
-            << " MBytes now. If you don't need to limit the memory of local "
-               "execution scope, you should set "
-               "FLAGS_local_exe_sub_scope_limit=-1.";
-      }
-      std::stringstream out;
-      out << scope_id << " kids: ";
-      for (auto &scope : incr_local_exec_scopes.at(scope_id)) {
-        out << scope << ", ";
-      }
-      VLOG(10) << out.str();
-    }
-  }
-
-  size_t history_step = history_local_exec_scopes_.size();
-  if (has_fetch && history_step >= 2) {
-    ClearHistoryLocalExecScopes(history_step - 1);
-  }
-
-  // Delete CPU Memory
-  std::vector<size_t> gpu_memory_size_per_gpu(places_.size());
-  for (auto &scope_vec : history_local_exec_scopes_) {
-    for (size_t idx = 0; idx < scope_vec.size(); ++idx) {
-      for (auto &scope : scope_vec.at(idx)) {
-        gpu_memory_size_per_gpu.at(idx) +=
-            GetTensorMemorySize(scope, true /*clear_cpu_tensor*/);
-      }
-    }
-  }
-  if (VLOG_IS_ON(8)) {
-    for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) {
-      VLOG(8) << "history local exec scopes contains "
-              << string::HumanReadableSize(gpu_memory_size_per_gpu.at(idx))
-              << " in " << places_.at(idx);
-    }
-  }
-
-  if (FLAGS_local_exe_sub_scope_limit > 0) {
-    for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) {
-      if (gpu_memory_size_per_gpu.at(idx) / kMB >=
-          FLAGS_local_exe_sub_scope_limit) {
-        platform::DeviceContextPool::Instance().Get(places_.at(idx))->Wait();
-        local_exec_scopes_.at(idx)->DropKids();
-      }
-      for (auto &scope_vec : history_local_exec_scopes_) {
-        scope_vec.at(idx).clear();
-      }
-    }
-  }
-}
-
-void ScopeBufferedMonitor::ClearHistoryLocalExecScopes(size_t history_step) {
-  VLOG(10) << "delete pre_incr_local_exec_scopes.";
-  for (size_t i = 0; i < history_step; ++i) {
-    auto &pre_incr_local_exec_scopes = history_local_exec_scopes_.front();
-    for (size_t scope_idx = 0; scope_idx < pre_incr_local_exec_scopes.size();
-         ++scope_idx) {
-      for (auto scope : pre_incr_local_exec_scopes[scope_idx]) {
-        local_exec_scopes_.at(scope_idx)->DeleteScope(scope);
-      }
-    }
-    history_local_exec_scopes_.pop_front();
-  }
-}
-
-void ScopeBufferedMonitor::ClearHistoryLocalExecScopes() {
-  history_local_exec_scopes_.clear();
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.h b/paddle/fluid/framework/details/scope_buffered_monitor.h
deleted file mode 100644
index 1246c35af6aedab16c2370aa881e85be5a773ddc..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/scope_buffered_monitor.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <deque>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/scope.h"
-namespace paddle {
-namespace framework {
-namespace details {
-
-class ScopeBufferedMonitor {
- public:
-  ScopeBufferedMonitor(const std::vector<platform::Place> &places,
-                       const std::vector<Scope *> &local_exec_scopes);
-
-  void Apply(const std::function<void()> &callback, bool has_fetch);
-
-  void ClearHistoryLocalExecScopes();
-
-  void ClearHistoryLocalExecScopes(size_t history_step);
-
- private:
-  std::vector<platform::Place> places_;
-  std::vector<Scope *> local_exec_scopes_;
-  std::vector<std::unordered_set<Scope *>> pre_local_exec_scopes_;
-  std::vector<std::unordered_set<Scope *>> post_local_exec_scopes_;
-  std::deque<std::vector<std::unordered_set<Scope *>>>
-      history_local_exec_scopes_;
-};
-
-size_t GetScopeVarMemorySize(Scope *scope);
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
deleted file mode 100644
index 3640e9f7dbfa5fac3c09b455ece6f98603a832b2..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
-#include <stdexcept>
-#include <string>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/platform/profiler.h"
-namespace paddle {
-namespace framework {
-namespace details {
-
-ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
-    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
-    std::vector<Scope *> local_exec_scopes, std::vector<VariableInfo> var_infos,
-    std::vector<platform::Place> places,
-    std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
-    : strategy_(std::move(strategy)),
-      underlying_executor_(std::move(underlying_executor)),
-      local_scopes_(std::move(local_scopes)),
-      local_exec_scopes_(std::move(local_exec_scopes)),
-      var_infos_(std::move(var_infos)),
-      places_(std::move(places)),
-      scope_monitor_(places_, local_exec_scopes_) {
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
-  PrepareLocalExeScopes();
-}
-
-FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
-    const std::vector<std::string> &fetch_tensors) {
-  if (drop_scope_counter_ == 0) {
-    platform::RecordEvent e("InitLocalVars");
-    InitVariables();
-  }
-
-  std::vector<framework::LoDTensor> fetch_data;
-  std::exception_ptr eptr = nullptr;
-
-  auto exe_run_func = [&]() {
-    try {
-      fetch_data = underlying_executor_->Run(fetch_tensors);
-    } catch (...) {
-      eptr = std::current_exception();
-    }
-  };
-
-  if (strategy_.num_iteration_per_drop_scope_ == 1) {
-    exe_run_func();
-  } else {
-    scope_monitor_.Apply(exe_run_func, fetch_tensors.size() > 0);
-  }
-
-  if (VLOG_IS_ON(5)) {
-    for (auto *scope : local_exec_scopes_) {
-      VLOG(5) << "Left "
-              << string::HumanReadableSize(GetScopeVarMemorySize(scope))
-              << " on scope " << scope << " before deleting";
-    }
-  }
-
-  ++drop_scope_counter_;
-  if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ ||
-      DropScopeOrNot()) {
-    DropLocalExeScopes();
-  }
-
-  if (VLOG_IS_ON(5)) {
-    for (auto *scope : local_exec_scopes_) {
-      VLOG(5) << "Left "
-              << string::HumanReadableSize(GetScopeVarMemorySize(scope))
-              << " on scope " << scope << " after deleting";
-    }
-  }
-
-  if (eptr) {
-    std::rethrow_exception(eptr);
-  } else {
-    return fetch_data;
-  }
-}
-
-bool ScopeBufferedSSAGraphExecutor::DropScopeOrNot() const {
-  for (auto &var : tensor_array_vars_) {
-    auto tensor_array = var->GetMutable<LoDTensorArray>();
-    for (LoDTensor &tensor : *tensor_array) {
-      if (tensor.IsInitialized()) {
-        return true;
-      }
-    }
-    tensor_array->clear();
-  }
-  return false;
-}
-
-void ScopeBufferedSSAGraphExecutor::InitVariables() {
-  for (auto &info : tmp_var_infos_) {
-    for (auto &pair : info) {
-      InitializeVariable(pair.first, pair.second);
-    }
-  }
-
-  const ir::Graph &graph = Graph();
-  if (graph.Has(details::kProgramDescs)) {
-    auto &program_descs =
-        graph.Get<details::ProgramDescs>(details::kProgramDescs);
-    // Init vars
-    auto &fused_grad_vars = graph.Get<details::FusedVars>(details::kFusedVars);
-    for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
-      for (auto &var_name : fused_grad_vars) {
-        auto var = local_exec_scopes_[i]->Var(var_name);
-        var->GetMutable<LoDTensor>();
-      }
-    }
-
-    for (auto &program_desc : program_descs) {
-      for (auto &op_desc : program_desc.Block(0).AllOps()) {
-        for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
-          auto op = OpRegistry::CreateOp(*op_desc);
-          op->Run(*local_exec_scopes_[i], places_[i]);
-        }
-      }
-    }
-  }
-}
-
-void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
-  platform::RecordEvent drop_scope_event("DropLocalExeScopes");
-  drop_scope_counter_ = 0;
-  for (auto &p : places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
-  scope_monitor_.ClearHistoryLocalExecScopes();
-  for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
-    local_exec_scopes_[i]->EraseVarsExcept(preserve_vars_[i]);
-    local_exec_scopes_[i]->DropKids();
-    for (auto &preserve_var : preserve_vars_[i]) {
-      preserve_var->Clear();
-    }
-    VLOG(3) << "Drop local execution scope: " << local_scopes_[i];
-  }
-}
-
-void ScopeBufferedSSAGraphExecutor::PrepareLocalExeScopes() {
-  // Create local scopes.
-  preserve_vars_.resize(local_scopes_.size());
-  tmp_var_infos_.resize(local_scopes_.size());
-
-  for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) {
-    size_t idx = local_scopes_.size() - 1 - (it - local_scopes_.rbegin());
-    auto *scope = local_scopes_[idx];
-    auto *local_scope = local_exec_scopes_[idx];
-
-    for (auto &info : var_infos_) {
-      if (info.persistable_) {  // Persistable
-        auto var = scope->FindVar(info.name_);
-        if (var != nullptr) {
-          VLOG(2)
-              << info.name_
-              << " has been initialized beforehand in global scope, skipped";
-          continue;
-        }
-        InitializeVariable(scope->Var(info.name_), info.type_);
-      } else {
-        Variable *tmp_var = local_scope->Var(info.name_);
-        preserve_vars_[idx].emplace(tmp_var);
-        tmp_var_infos_[idx].emplace_back(tmp_var, info.type_);
-        if (info.type_ == proto::VarType::LOD_TENSOR_ARRAY) {
-          tensor_array_vars_.emplace_back(tmp_var);
-        }
-      }
-    }
-  }
-}
-
-bool ScopeBufferedSSAGraphExecutor::NeedCreateLocalExeScope() {
-  return drop_scope_counter_ == 0;
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
deleted file mode 100644
index 17493a89a660588b0e0f8f8da42518961b008773..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <ThreadPool.h>
-#include <deque>
-#include <list>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/execution_strategy.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/details/scope_buffered_monitor.h"
-#include "paddle/fluid/framework/details/ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/var_handle.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/place.h"
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct VariableInfo {
-  std::string name_;
-  proto::VarType::Type type_;
-  bool persistable_;
-};
-
-class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
- public:
-  ScopeBufferedSSAGraphExecutor(
-      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
-      std::vector<Scope*> local_exec_scopes,
-      std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
-      std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
-
-  const ir::Graph& Graph() const override {
-    return underlying_executor_->Graph();
-  }
-
-  FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
-
-  void DropLocalExeScopes();
-
-  bool NeedCreateLocalExeScope();
-
-  void PrepareLocalExeScopes();
-
- private:
-  void InitVariables();
-
-  bool DropScopeOrNot() const;
-
-  size_t drop_scope_counter_{0};
-  ExecutionStrategy strategy_;
-  std::unique_ptr<SSAGraphExecutor> underlying_executor_;
-  std::vector<Scope*> local_scopes_;
-
-  std::vector<Scope*> local_exec_scopes_;
-  std::vector<std::unordered_set<Variable*>> preserve_vars_;
-  std::vector<std::vector<std::pair<Variable*, proto::VarType::Type>>>
-      tmp_var_infos_;
-
-  std::vector<Variable*> tensor_array_vars_;
-
-  std::vector<VariableInfo> var_infos_;
-  std::vector<platform::Place> places_;
-
-  ScopeBufferedMonitor scope_monitor_;
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
deleted file mode 100644
index fb43bfbf342ea282b517694305e26990069dbf07..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
+++ /dev/null
@@ -1,126 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-// TODO(zjl): support SelectedRows
-static inline const Tensor &GetTensorFromVar(const Variable *var) {
-  if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>();
-  } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
-  }
-}
-
-static inline Tensor *GetMutableTensorFromVar(Variable *var) {
-  if (var->IsType<LoDTensor>()) {
-    return var->GetMutable<LoDTensor>();
-  } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
-  }
-}
-
-ShareTensorBufferFunctor::ShareTensorBufferFunctor(
-    Scope *scope, size_t scope_idx, const std::string &op_type,
-    const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
-    : scope_(scope),
-      scope_idx_(scope_idx),
-      op_type_(op_type),
-      in_var_infos_(in_var_infos),
-      out_var_names_(out_var_names) {
-  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
-  for (size_t i = 0; i < in_var_infos_.size(); ++i) {
-    AddReuseVarPair(in_var_infos_[i], out_var_names_[i]);
-  }
-}
-
-std::unordered_map<std::string, std::string>
-ShareTensorBufferFunctor::ReusedVars() const {
-  std::unordered_map<std::string, std::string> result;
-  for (size_t i = 0; i < in_var_infos_.size(); ++i) {
-    result.insert({in_var_infos_[i]->Name(), out_var_names_[i]});
-  }
-  return result;
-}
-
-void ShareTensorBufferFunctor::AddReuseVarPair(
-    const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
-  PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
-  PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
-                    "in/out cannot have same name: %s", out_var_name);
-  in_var_infos_.emplace_back(in_var_info);
-  out_var_names_.emplace_back(out_var_name);
-}
-
-void ShareTensorBufferFunctor::CallOnce() {
-  PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here");
-  for (size_t i = 0; i < in_var_infos_.size(); ++i) {
-    auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name());
-    auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    PADDLE_ENFORCE_NE(in_var, out_var);
-    in_out_vars_.emplace_back(in_var, out_var);
-  }
-}
-
-void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
-  if (!exec_scope_) {
-    PADDLE_ENFORCE_NOT_NULL(exec_scope);
-    exec_scope_ = exec_scope;
-    CallOnce();
-  } else {
-    PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same");
-  }
-
-  for (size_t i = 0; i < in_var_infos_.size(); ++i) {
-    const auto &in_tensor = GetTensorFromVar(in_out_vars_[i].first);
-    auto *out_tensor = GetMutableTensorFromVar(in_out_vars_[i].second);
-    auto *in_var_info = in_var_infos_[i];
-
-    if (UNLIKELY(in_var_info->IsSkippedMemoryReuse())) {
-      // If in_var is inplaced in the previous batch and we want to fetch
-      // in_var in the current batch, we have to reset memory of out_var
-      // to avoid wrong calculation result.
-      if (in_tensor.Holder() == out_tensor->Holder()) {
-        VLOG(1) << "Clear " << out_var_names_[i]
-                << " because you may want to fetch an inplaced variable "
-                << in_var_info->Name()
-                << " in previous batch: " << in_var_info->Name() << " -> "
-                << out_var_names_[i];
-        out_tensor->clear();
-      }
-    } else {
-      out_tensor->ShareBufferWith(in_tensor);
-
-      VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
-              << in_var_info->Name() << " -> " << out_var_names_[i];
-    }
-  }
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h
deleted file mode 100644
index 774dcd056e59bc8f090a5ceb916e73843c8c9df6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-// NOTE(paddle-dev): ShareTensorBufferFunctor is responsible for
-// performing memory reuse in run-time. ShareTensorBufferOpHandle
-// is only a wrapper of ShareTensorBufferFunctor.
-// Once we find the run-time memory reuse strategy is time-consuming in
-// scheduling, we should need a pass to move ShareTensorBufferFunctor into
-// each ComputationOpHandle. ShareTensorBufferFunctor is preserved for
-// this probable movement.
-class ShareTensorBufferFunctor {
- public:
-  ShareTensorBufferFunctor(
-      Scope *scope, size_t scope_idx, const std::string &op_type,
-      const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-      const std::vector<std::string> &out_var_names);
-
-  void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
-                       const std::string &out_var_name);
-
-  void operator()(Scope *exec_scope);
-
-  std::unordered_map<std::string, std::string> ReusedVars() const;
-
-  size_t GetScopeIdx() const { return scope_idx_; }
-
-  Scope *GetScope() { return scope_; }
-
- private:
-  void CallOnce();
-
- private:
-  Scope *scope_;
-  Scope *exec_scope_{nullptr};
-
-  size_t scope_idx_;
-  std::string op_type_;
-  std::vector<const ir::MemOptVarInfo *> in_var_infos_;
-  std::vector<std::string> out_var_names_;
-
-  std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
deleted file mode 100644
index 01c4dc9733c2b0fd4236b471b0dfb69a45f26691..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-ComputationOpHandle *GetUniquePendingComputationOpHandle(
-    ShareTensorBufferOpHandle *share_tensor_op) {
-  ComputationOpHandle *result_op = nullptr;
-  for (ir::Node *out_var : share_tensor_op->Node()->outputs) {
-    for (ir::Node *pending_op : out_var->outputs) {
-      auto &op = pending_op->Wrapper<OpHandleBase>();
-      auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
-      PADDLE_ENFORCE_NOT_NULL(compute_op);
-
-      if (result_op == nullptr) {
-        result_op = compute_op;
-      } else {
-        PADDLE_ENFORCE_EQ(result_op, compute_op);
-      }
-    }
-  }
-
-  PADDLE_ENFORCE_NOT_NULL(result_op);
-  return result_op;
-}
-
-ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
-    ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
-    const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
-    : OpHandleBase(node),
-      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {}
-
-std::unordered_map<std::string, std::string>
-ShareTensorBufferOpHandle::ReusedVars() const {
-  return functor_.ReusedVars();
-}
-
-void ShareTensorBufferOpHandle::AddReuseVarPair(
-    const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
-  functor_.AddReuseVarPair(in_var_info, out_var_name);
-}
-
-void ShareTensorBufferOpHandle::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
-  int dev_id =
-      boost::get<platform::CUDAPlace>(dev_ctxes_.begin()->first).device;
-  events_[dev_id] = nullptr;
-#endif
-}
-
-void ShareTensorBufferOpHandle::RunImpl() { functor_(local_exec_scopes_[0]); }
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
deleted file mode 100644
index b22f5621fe44d887d70d82ce4dc9e26596d23f4e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-class ShareTensorBufferOpHandle : public OpHandleBase {
- public:
-  ShareTensorBufferOpHandle(
-      ir::Node *node, Scope *scope, size_t scope_idx,
-      const std::string &op_type,
-      const std::vector<const ir::MemOptVarInfo *> &in_vars_infos,
-      const std::vector<std::string> &out_var_names);
-
-  std::unordered_map<std::string, std::string> ReusedVars() const;
-
-  Priority GetPriority() const override { return Priority::kHighest; }
-
-  size_t GetScopeIdx() const { return functor_.GetScopeIdx(); }
-
-  void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
-                       const std::string &out_var_name);
-
-  const ShareTensorBufferFunctor &Functor() const { return functor_; }
-
- protected:
-  std::string Name() const override { return "buffer_share"; }
-
-  void RunImpl() final;
-
-  void InitCUDA() override;
-
-  std::vector<Scope *> GetLocalScopes() override {
-    return {functor_.GetScope()};
-  }
-
- private:
-  ShareTensorBufferFunctor functor_;
-};
-
-ComputationOpHandle *GetUniquePendingComputationOpHandle(
-    ShareTensorBufferOpHandle *share_tensor_op);
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
deleted file mode 100644
index 070a17a9de591a2a2130338d7f82bc5d534fa066..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
-#include <algorithm>
-#include "dgc/dgc.h"
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/reduce_and_gather.h"
-#include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DECLARE_bool(sync_nccl_allreduce);
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-SparseAllReduceOpHandle::SparseAllReduceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    const platform::NCCLCommunicator *ctxs, bool is_encoded, int nranks)
-    : AllReduceOpHandle(node, local_scopes, places, ctxs),
-      is_encoded_(is_encoded),
-      nranks_(nranks) {
-  // TODO(gongwb) :polish them!
-  if (is_encoded) {
-    VLOG(1) << "Use dgc allreduce mode";
-  }
-}
-
-void SparseAllReduceOpHandle::RunImplEncoded() {
-  platform::RecordEvent record_event(Name());
-
-  WaitInputVarGenerated();
-
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), places_.size(),
-      "The NoDummyInputSize should be equal to the number of places.");
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-
-  std::vector<const LoDTensor *> ins;
-  std::vector<LoDTensor *> outs;
-  int k = -1;
-  for (size_t i = 0; i < local_scopes_.size(); ++i) {
-    auto *local_scope = local_exec_scopes_[i];
-    auto original_name =
-        paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
-    auto encode_var_name = original_name + g_dgc_encoded;
-    auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
-    auto &in = in_var->Get<LoDTensor>();
-    ins.emplace_back(&in);
-
-    auto *out = local_scope->FindVar(out_var_handles[i]->name())
-                    ->GetMutable<LoDTensor>();
-    outs.emplace_back(out);
-
-    if (k < 0) {
-      k = GetKValue(in_var_handles[i]->name());
-    }
-  }
-
-  PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
-  PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
-  PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
-
-  int dtype = -1;
-  size_t in_numel = 0;
-  size_t out_numel = 0;
-  PADDLE_ENFORCE(nranks_ > 1);
-  std::vector<std::function<void()>> all_reduce_calls;
-
-  for (size_t i = 0; i < local_scopes_.size(); ++i) {
-    auto &place = places_[i];
-    auto &in = *ins[i];
-    void *in_tensor_buf = const_cast<void *>(in.data<void>());
-
-    auto &out = *outs[i];
-    float *out_tensor_buf = out.data<float>();
-
-    dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
-    in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
-    PADDLE_ENFORCE(in_numel % 2 == 0);
-    PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
-    out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
-
-    int dev_id = boost::get<platform::CUDAPlace>(place).device;
-    auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false);
-    auto &nccl_ctx = nccl_ctxs->at(dev_id);
-    auto *dev_ctx = nccl_ctxs->DevCtx(dev_id);
-    auto stream = nccl_ctx.stream();
-    auto comm = nccl_ctx.comm_;
-
-    int encode_size = 2 * k * sizeof(int);
-    // dgc use ncclAllGather to get all the encoded data
-    // so the buffer need nranks.
-    int buf_size = nranks_ * encode_size;
-    auto tmp_ious_data = memory::Alloc(*dev_ctx, buf_size);
-    void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
-
-    VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
-             << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
-             << ", k:" << k << ", place:" << place << ", dtype:" << dtype;
-
-    all_reduce_calls.emplace_back([=] {
-      PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
-          in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
-          stream));
-    });
-  }
-
-  NCCLAllReduceFunc(all_reduce_calls);
-}
-
-int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
-  auto original_name = paddle::framework::GradOriginalVarName(grad_name);
-  auto var_name = original_name + g_dgc_k;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
-
-  auto *scope = local_exec_scopes_[0];
-  auto var = scope->FindVar(var_name);
-  PADDLE_ENFORCE_NOT_NULL(var);
-  auto tensor = var->Get<LoDTensor>().data<float>();
-  return *tensor;
-}
-
-bool SparseAllReduceOpHandle::IsEncoded() {
-  if (!is_encoded_) {
-    return false;
-  }
-  auto counter_name = g_dgc_counter_name;
-  auto step_name = g_dgc_rampup_begin_step;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
-
-  auto *local_scope = local_exec_scopes_[0];
-  auto count_var = local_scope->FindVar(counter_name);
-  auto step_var = local_scope->FindVar(step_name);
-  if (count_var == nullptr || step_var == nullptr) {
-    PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
-                 step_var);
-  }
-
-  float count = *count_var->Get<LoDTensor>().data<float>();
-  float step = *step_var->Get<LoDTensor>().data<float>();
-  if (static_cast<int>(count) < static_cast<int>(step)) {
-    VLOG(10) << "in all_reduce currentstep:" << count
-             << " < rampup_begin_step:" << step
-             << " so not use sparse all reduce";
-    return false;
-  }
-
-  return true;
-}
-
-void SparseAllReduceOpHandle::RunImpl() {
-  if (!IsEncoded()) {
-    AllReduceOpHandle::RunImpl();
-    return;
-  }
-
-  RunImplEncoded();
-}
-
-std::string SparseAllReduceOpHandle::Name() const {
-  return "sparse_all_reduce";
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
deleted file mode 100644
index 9802f8dba7e05aec424f48d50992d065015179c9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
+++ /dev/null
@@ -1,52 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
-#include "paddle/fluid/framework/details/dgc_const_values.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-class SparseAllReduceOpHandle : public AllReduceOpHandle {
- public:
-  SparseAllReduceOpHandle(ir::Node *node,
-                          const std::vector<Scope *> &local_scopes,
-                          const std::vector<platform::Place> &places,
-                          const platform::NCCLCommunicator *ctxs,
-                          bool is_encoded = false, int nranks = -1);
-  std::string Name() const override;
-
- protected:
-  void RunImpl() override;
-  int GetKValue(const std::string &grad_name);
-  bool IsEncoded();
-  void RunImplEncoded();
-
- private:
-  bool is_encoded_{false};
-  int nranks_{-1};
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
deleted file mode 100644
index 4f1e44ca26cb65468da6eded74653f34dbf00336..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/ssa_graph_executor.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-SSAGraphExecutor::~SSAGraphExecutor() {}
-
-void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
-  if (fetch_ops->empty()) return;
-
-  for (auto& op : *fetch_ops) {
-    PADDLE_ENFORCE_NOT_NULL(
-        dynamic_cast<FetchOpHandle*>(op),
-        "The input ops of ClearFetchOp function should be FetchOpHandle.");
-    for (auto& out_var : op->Node()->outputs) {
-      graph->RemoveNode(out_var);
-    }
-    for (auto& in_var : op->Inputs()) {
-      in_var->RemoveOutput(op, op->Node());
-    }
-    graph->RemoveNode(op->Node());
-  }
-  fetch_ops->clear();
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
deleted file mode 100644
index 2454ec2b27d9d2060f28b8d6cea0ce49fe347433..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ /dev/null
@@ -1,44 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/fetch_op_handle.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/ir/graph.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-class SSAGraphExecutor {
-  DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
-
- public:
-  SSAGraphExecutor() {}
-
-  virtual ~SSAGraphExecutor();
-
-  virtual const ir::Graph& Graph() const = 0;
-
-  virtual FeedFetchList Run(const std::vector<std::string>& fetch_tensors) = 0;
-};
-
-void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops);
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
deleted file mode 100644
index db28e1fe202116f49e0266a7bc24ddfb351c8bb4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ /dev/null
@@ -1,349 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<Scope *> &local_exec_scopes,
-    const std::vector<platform::Place> &places, ir::Graph *graph)
-    : graph_(graph),
-      local_scopes_(local_scopes),
-      local_exec_scopes_(local_exec_scopes),
-      places_(places),
-      fetch_ctxs_(places),
-      strategy_(strategy),
-      prepare_pool_(1),
-      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
-                                       : nullptr) {
-  if (strategy_.num_iteration_per_run_ > 1) {
-    int read_op_num = 0;
-    for (auto *node : graph_->Nodes()) {
-      if (node->IsOp() && node->Name() == "read") {
-        read_op_num++;
-      }
-    }
-    if (read_op_num == 0) {
-      LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model "
-                      "should use pyreader to feed data!";
-    }
-  }
-  PrepareOpDeps();
-  CopyOpDeps();
-}
-
-inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
-    const std::vector<std::string> &fetch_tensors) {
-  std::unique_ptr<platform::RecordEvent> event(
-      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
-  std::unique_ptr<OpDependentData> op_deps = op_deps_futures_.get();
-  CopyOpDeps();
-
-  VLOG(10) << "ThreadedSSAGraphExecutor::Run";
-  std::shared_ptr<BlockingQueue<VarHandleBase *>> ready_vars(
-      new BlockingQueue<VarHandleBase *>);
-  auto &pending_ops = op_deps->pending_ops_;
-  auto &pending_vars = op_deps->pending_vars_;
-  auto &ready_ops = op_deps->ready_ops_;
-  size_t num_ops = op_deps->num_ops_;
-
-  // Step 2. Insert FetchOps
-  std::vector<OpHandleBase *> fetch_ops;
-  std::unordered_set<VarHandleBase *> fetch_dependencies;
-  FeedFetchList fetch_data(fetch_tensors.size());
-
-  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops,
-                 &pending_ops, &pending_vars, &fetch_data);
-
-  exception_holder_.Clear();
-  event.reset(nullptr);
-
-  // Step 3. Execution
-  if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) {
-    // If the num_threads is 1, we can record the order of operator's
-    // execution in the first iteration, and in subsequent iterations,
-    // run the recorded operators directly. This strategy could make the
-    // execution faster.
-    VLOG(3) << "Run the traced ops.";
-    RunTracedOps(traced_ops_);
-    RunTracedOps(fetch_ops);
-    if (exception_holder_.IsCaught()) {
-      ExecutionFinal(&fetch_ops);
-    }
-  } else {
-    traced_ops_.clear();
-    auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
-      for (auto *op : set) {
-        RunOp(ready_vars, op);
-      }
-      set.clear();
-    };
-    // Clean run context
-    run_op_futures_.clear();
-
-    while (!pending_vars.empty()) {
-      // 1. Run All Ready ops
-      // Keep loop until all vars are ready.
-      run_all_ops(ready_ops);
-
-      // 2. Find ready variable
-      bool timeout;
-      auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
-      if (timeout) {
-        for (auto &run_op_future : run_op_futures_) {
-          run_op_future.wait();
-        }
-        if (exception_holder_.IsCaught()) {
-          ExecutionFinal(&fetch_ops);
-        } else {
-          continue;
-        }
-      }
-
-      // 3. Remove the dependency of ready_var.
-      // Find the ready_ops after the ready_var.
-      for (auto ready_var : cur_ready_vars) {
-        pending_vars.erase(ready_var);
-        for (auto *op : ready_var->PendingOps()) {
-          auto &deps = pending_ops[op];
-          --deps;
-          if (deps == 0) {
-            ready_ops.insert(op);
-          }
-        }
-      }
-    }
-    PADDLE_ENFORCE(ready_ops.empty());
-  }
-
-  // Wait FetchOps.
-  ClearFetchOp(graph_, &fetch_ops);
-
-  return fetch_data;
-}
-
-FeedFetchList ThreadedSSAGraphExecutor::Run(
-    const std::vector<std::string> &fetch_tensors) {
-  for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) {
-    RunImpl({});
-  }
-  return RunImpl(fetch_tensors);
-}
-
-void ThreadedSSAGraphExecutor::InsertFetchOps(
-    const std::vector<std::string> &fetch_tensors,
-    std::vector<OpHandleBase *> *fetch_ops,
-    std::unordered_set<VarHandleBase *> *fetch_dependencies,
-    std::unordered_set<OpHandleBase *> *ready_ops,
-    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
-    std::unordered_set<VarHandleBase *> *pending_vars,
-    FeedFetchList *fetch_data) {
-  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
-  std::unordered_set<VarHandleBase *> local_ready_vars;
-  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
-                                                   fetch_tensors.end());
-  for (auto &fetch_var_name : fetch_tensor_set) {
-    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
-      auto it = var_map.find(fetch_var_name);
-      if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin());
-      }
-    }
-  }
-
-  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
-    auto &var_name = fetch_tensors[i];
-    auto fetched_var_it = fetched_vars.find(var_name);
-    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable(%s).(Perhaps the main_program "
-                   "is not set to ParallelExecutor)",
-                   var_name);
-
-    auto &vars = fetched_var_it->second;
-
-    ir::Node *fetch_node =
-        graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation);
-    auto *op = new FetchOpHandle(fetch_node, fetch_data, i, &local_scopes_,
-                                 &local_exec_scopes_);
-    fetch_ops->emplace_back(op);
-
-    for (auto &p : places_) {
-      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
-    }
-
-    for (auto *var : vars) {
-      op->AddInput(var);
-    }
-
-    ir::Node *fetch_var =
-        graph_->CreateEmptyNode("fetch", ir::Node::Type::kVariable);
-    auto *fetch_dummy = new DummyVarHandle(fetch_var);
-    op->AddOutput(fetch_dummy);
-    fetch_dependencies->emplace(fetch_dummy);
-
-    this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy);
-
-    size_t wait_input_num = 0;
-    std::unordered_set<VarHandleBase *> input_set(vars.begin(), vars.end());
-    for (auto *var : input_set) {
-      if (pending_vars->count(var)) {
-        ++wait_input_num;
-      }
-    }
-    if (wait_input_num) {
-      pending_ops->insert({op, wait_input_num});
-    } else {
-      ready_ops->insert(static_cast<OpHandleBase *>(op));
-    }
-  }
-  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
-}
-
-void ThreadedSSAGraphExecutor::InsertPendingOp(
-    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
-    OpHandleBase *op_instance) const {
-  pending_ops->insert({op_instance, op_instance->NoDupInputSize()});
-}
-
-void ThreadedSSAGraphExecutor::InsertPendingVar(
-    std::unordered_set<VarHandleBase *> *pending_vars,
-    std::unordered_set<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
-  pending_vars->insert(var);
-  if (var->GeneratedOp() == nullptr) {
-    ready_vars->insert(var);
-  }
-}
-
-void ThreadedSSAGraphExecutor::PrepareOpDeps() {
-  op_deps_.reset(new OpDependentData());
-  std::unordered_map<OpHandleBase *, size_t> &pending_ops =
-      op_deps_->pending_ops_;
-  std::unordered_set<VarHandleBase *> &pending_vars = op_deps_->pending_vars_;
-  std::unordered_set<OpHandleBase *> &ready_ops = op_deps_->ready_ops_;
-  std::unordered_set<VarHandleBase *> ready_vars;
-
-  // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
-    for (auto &name_pair : var_map) {
-      for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(&pending_vars, &ready_vars, version_pair);
-      }
-    }
-  }
-  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    InsertPendingVar(&pending_vars, &ready_vars, var);
-  }
-
-  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
-    if (op->Inputs().empty()) {  // Special case, Op has no input.
-      ready_ops.insert(op);
-    } else {
-      InsertPendingOp(&pending_ops, op);
-    }
-  }
-  op_deps_->num_ops_ = ready_ops.size() + pending_ops.size();
-  PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators.");
-
-  for (auto ready_var : ready_vars) {
-    pending_vars.erase(ready_var);
-    for (auto *op : ready_var->PendingOps()) {
-      auto &deps = pending_ops[op];
-      --deps;
-      if (deps == 0) {
-        ready_ops.insert(op);
-      }
-    }
-  }
-}
-
-void ThreadedSSAGraphExecutor::CopyOpDeps() {
-  op_deps_futures_ = prepare_pool_.enqueue([&] {
-    auto *op_deps = new OpDependentData();
-    op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(),
-                                 op_deps_->pending_ops_.end());
-    op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(),
-                                  op_deps_->pending_vars_.end());
-    op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(),
-                               op_deps_->ready_ops_.end());
-    op_deps->num_ops_ = op_deps_->num_ops_;
-    return std::unique_ptr<OpDependentData>(op_deps);
-  });
-}
-
-void ThreadedSSAGraphExecutor::RunOp(
-    const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
-    details::OpHandleBase *op) {
-  auto op_run = [ready_var_q, op, this] {
-    RunOpSync(op);
-    try {
-      ready_var_q->Extend(op->Outputs());
-      VLOG(10) << op << " " << op->Name() << " Signal posted";
-    } catch (...) {
-      exception_holder_.Catch(std::current_exception());
-    }
-  };
-
-  if (pool_) {
-    run_op_futures_.emplace_back(pool_->enqueue(op_run));
-  } else {
-    op_run();
-  }
-
-  RecordOps(op);
-}
-
-void ThreadedSSAGraphExecutor::RunTracedOps(
-    const std::vector<OpHandleBase *> &traced_ops) {
-  for (auto &op : traced_ops) {
-    if (exception_holder_.IsCaught()) {
-      return;
-    }
-    RunOpSync(op);
-  }
-}
-
-void ThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) {
-  try {
-    if (VLOG_IS_ON(10)) {
-      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
-    }
-    if (LIKELY(!strategy_.dry_run_)) {
-      op->Run(strategy_.use_cuda_);
-    }
-    VLOG(10) << op << " " << op->Name() << " Done ";
-  } catch (...) {
-    exception_holder_.Catch(std::current_exception());
-  }
-}
-
-void ThreadedSSAGraphExecutor::ExecutionFinal(
-    std::vector<OpHandleBase *> *fetch_ops) {
-  VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it";
-  ClearFetchOp(graph_, fetch_ops);
-  exception_holder_.ReThrow();
-}
-
-void ThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) {
-  if (strategy_.num_threads_ == 1 && !dynamic_cast<FetchOpHandle *>(op)) {
-    traced_ops_.emplace_back(op);
-  }
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
deleted file mode 100644
index fe6ef95a135417c0c73cfb3c9a20af66dc5047e6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ /dev/null
@@ -1,119 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <deque>
-#include <functional>
-#include <list>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include <ThreadPool.h>  // ThreadPool in thrird party
-
-#include "paddle/fluid/framework/blocking_queue.h"
-#include "paddle/fluid/framework/details/exception_holder.h"
-#include "paddle/fluid/framework/details/execution_strategy.h"
-#include "paddle/fluid/framework/details/fetch_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/ssa_graph_executor.h"
-#include "paddle/fluid/framework/ir/graph.h"
-
-namespace paddle {
-namespace framework {
-class Scope;
-
-namespace details {
-
-struct OpDependentData {
-  std::unordered_map<OpHandleBase *, size_t> pending_ops_;
-  std::unordered_set<VarHandleBase *> pending_vars_;
-  std::unordered_set<OpHandleBase *> ready_ops_;
-  size_t num_ops_{0};
-};
-
-class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
- public:
-  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
-                           const std::vector<Scope *> &local_scopes,
-                           const std::vector<Scope *> &local_exec_scopes,
-                           const std::vector<platform::Place> &places,
-                           ir::Graph *graph);
-
-  const ir::Graph &Graph() const override { return *graph_; }
-  // Run a SSAGraph by a thread pool
-  // Use topological sort algorithm
-  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
-
-  ~ThreadedSSAGraphExecutor() final = default;
-
- private:
-  inline FeedFetchList RunImpl(const std::vector<std::string> &fetch_tensors);
-  void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
-             details::OpHandleBase *op);
-
- private:
-  // Note(zcd): the ThreadPool should be placed last so that ThreadPool should
-  // be destroyed first.
-  ir::Graph *graph_;
-  std::vector<Scope *> local_scopes_;
-  std::vector<Scope *> local_exec_scopes_;
-
-  std::vector<platform::Place> places_;
-  platform::DeviceContextPool fetch_ctxs_;
-  ExceptionHolder exception_holder_;
-  std::unique_ptr<OpDependentData> op_deps_;
-  std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
-  ExecutionStrategy strategy_;
-  // use std::list because clear(), push_back, and for_each are O(1)
-  std::list<std::future<void>> run_op_futures_;
-  ::ThreadPool prepare_pool_;
-  std::unique_ptr<::ThreadPool> pool_;
-  std::vector<OpHandleBase *> traced_ops_;
-
-  void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
-                       OpHandleBase *op_instance) const;
-
-  void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars,
-                        std::unordered_set<VarHandleBase *> *ready_vars,
-                        VarHandleBase *var) const;
-
-  void InsertFetchOps(const std::vector<std::string> &fetch_tensors,
-                      std::vector<OpHandleBase *> *fetch_ops,
-                      std::unordered_set<VarHandleBase *> *fetch_dependencies,
-                      std::unordered_set<OpHandleBase *> *ready_ops,
-                      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
-                      std::unordered_set<VarHandleBase *> *pending_vars,
-                      FeedFetchList *fetch_data);
-
-  void PrepareOpDeps();
-
-  void CopyOpDeps();
-
-  inline void RecordOps(OpHandleBase *op);
-
-  inline void ExecutionFinal(std::vector<OpHandleBase *> *fetch_ops);
-
-  inline void RunOpSync(OpHandleBase *op);
-
-  void RunTracedOps(const std::vector<OpHandleBase *> &traced_ops);
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc
deleted file mode 100644
index 95d62e66415e7879144d35f858ef04a8a936cd66..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/var_handle.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/var_handle.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-VarHandleBase::~VarHandleBase() {}
-
-VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); }
-
-std::string VarHandle::DebugString() const {
-  std::stringstream ss;
-  ss << "name:" << name_ << ", place:" << place_ << ", version:" << version_
-     << ", scope_idx:" << scope_idx_;
-  return ss.str();
-}
-
-std::string DummyVarHandle::DebugString() const { return node_->Name(); }
-
-DummyVarHandle::~DummyVarHandle() {
-  VLOG(4) << "deleting dummy var handle " << DebugString();
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
deleted file mode 100644
index 86428f8b7613760f59a1166189c61f3217d8017d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/var_handle.h
+++ /dev/null
@@ -1,174 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <sstream>
-#include <string>
-#include <unordered_set>
-#include <utility>
-
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/platform/macros.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-class OpHandleBase;
-
-// Wraps ir::Node and provide helper utilities.
-// It's responsible for populating necessary fields of ir::Node.
-//
-// VarHandleBase is the var node in the dependency graph.
-// A variable can only be generated by a single operator. i.e.
-// This is a single assignment graph.
-struct VarHandleBase {
-  // Owned by `node`. No need to be deleted explicitly.
-  explicit VarHandleBase(ir::Node* node) : node_(node) {
-    node_->WrappedBy(this);
-  }
-
-  virtual ~VarHandleBase();
-
-  virtual std::string DebugString() const = 0;
-  virtual const std::string& Name() const = 0;
-
-  void AddInput(OpHandleBase* in, ir::Node* node) {
-    node_->inputs.clear();
-    node_->inputs.push_back(node);
-    generated_op_ = in;
-  }
-
-  void AddOutput(OpHandleBase* out, ir::Node* node) {
-    if (pending_ops_.find(out) == pending_ops_.end()) {
-      PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
-                     this->Node()->Name());
-      pending_ops_.insert(out);
-      node_->outputs.push_back(node);
-    }
-  }
-
-  void RemoveOutput(OpHandleBase* out, ir::Node* node) {
-    pending_ops_.erase(out);
-    node_->outputs.erase(
-        std::remove(node_->outputs.begin(), node_->outputs.end(), node),
-        node_->outputs.end());
-  }
-
-  void ClearGeneratedOp() {
-    generated_op_ = nullptr;
-    node_->inputs.clear();
-  }
-
-  OpHandleBase* GeneratedOp() { return generated_op_; }
-
-  const OpHandleBase* GeneratedOp() const { return generated_op_; }
-
-  const std::unordered_set<OpHandleBase*>& PendingOps() const {
-    return pending_ops_;
-  }
-
-  ir::Node* Node() { return node_; }
-
-  const ir::Node* Node() const { return node_; }
-
- protected:
-  // The operator who generate this variable. nullptr if the variable
-  // is a root node.
-  OpHandleBase* generated_op_{nullptr};
-
-  // Operators which depend on this variable ready.
-  std::unordered_set<OpHandleBase*> pending_ops_;
-  ir::Node* node_;
-};
-
-// VarHandle is actually a single version of Runtime Variable.
-// Variable in Runtime mapped to many VarHandles in Graph.
-// Each assignment will generate a new var handle with newer version.
-//
-// NOTE: runtime variables have place.
-struct VarHandle : public VarHandleBase {
-  DISABLE_COPY_AND_ASSIGN(VarHandle);
-
- public:
-  virtual ~VarHandle();
-
-  std::string DebugString() const override;
-
-  VarHandle(ir::Node* node, size_t version, size_t scope_index,
-            std::string name, platform::Place place)
-      : VarHandleBase(node),
-        version_(version),
-        scope_idx_(scope_index),
-        name_(std::move(name)),
-        place_(std::move(place)) {}
-
-#ifdef PADDLE_WITH_CUDA
-  bool HasEvent() { return has_event_; }
-
-  const cudaEvent_t& GetEvent() {
-    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
-    return event_;
-  }
-
-  void SetGenerateEvent(const cudaEvent_t& event) {
-    has_event_ = true;
-    event_ = event;
-  }
-#endif
-
-  // version field currently is not used, however, just store the version to
-  // debug easily.
- private:
-  size_t version_;
-  size_t scope_idx_;
-  std::string name_;
-  platform::Place place_;
-#ifdef PADDLE_WITH_CUDA
-  // Only when this event is triggered, var is generated.
-  cudaEvent_t event_;
-  bool has_event_{false};
-#endif
-
- public:
-  bool IsTheSameVar(const VarHandle& o) const {
-    return o.generated_op_ == generated_op_ && o.name_ == name_ &&
-           o.scope_idx_ == scope_idx_;
-  }
-
-  size_t version() const { return version_; }
-  size_t scope_idx() const { return scope_idx_; }
-  const std::string& Name() const override { return name_; }
-  const std::string& name() const { return name_; }
-  const platform::Place& place() const { return place_; }
-};
-
-// Dummy Variable. It is used to represent dependencies between operators
-struct DummyVarHandle : public VarHandleBase {
-  explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {}
-
-  virtual ~DummyVarHandle();
-
-  std::string DebugString() const override;
-
- public:
-  const std::string& Name() const override { return name_; }
-  std::string name_{"DummyVar"};
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
deleted file mode 100644
index 134f759081a0778194c20785e215420d6e2bb622..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/framework/selected_rows.h"
-namespace paddle {
-namespace framework {
-namespace details {
-template <typename Func>
-static void VisitVariable(Variable* var, Func* func) {
-  if (var->IsType<LoDTensor>()) {
-    (*func)(var->GetMutable<LoDTensor>());
-  } else if (var->IsType<SelectedRows>()) {
-    (*func)(var->GetMutable<SelectedRows>());
-  } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
-  }
-}
-
-template <typename Func>
-static void VisitVariable(const Variable& var, Func* func) {
-  if (var.IsType<LoDTensor>()) {
-    (*func)(var.Get<LoDTensor>());
-  } else if (var.IsType<SelectedRows>()) {
-    (*func)(var.Get<SelectedRows>());
-  } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
-  }
-}
-
-struct TensorVisitor {
-  Tensor* result_{nullptr};
-
-  void operator()(LoDTensor* tensor) { result_ = tensor; }
-
-  void operator()(SelectedRows* selected_rows) {
-    result_ = selected_rows->mutable_value();
-  }
-
-  template <typename T>
-  void operator()() {
-    PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
-  }
-};
-
-Tensor& VariableVisitor::GetMutableTensor(Variable* var) {
-  TensorVisitor vistor;
-  VisitVariable(var, &vistor);
-  return *vistor.result_;
-}
-
-struct ShareDimsAndLoDVisitor {
-  Variable* trg_;
-  void operator()(const LoDTensor& val) {
-    auto* tensor = trg_->GetMutable<LoDTensor>();
-    tensor->set_layout(val.layout());
-    tensor->set_lod(val.lod());
-    tensor->Resize(val.dims());
-  }
-
-  void operator()(const SelectedRows& val) {
-    auto* selected_rows = trg_->GetMutable<SelectedRows>();
-    selected_rows->set_rows(val.rows());
-    selected_rows->set_height(val.height());
-    selected_rows->mutable_value()->Resize(val.value().dims());
-  }
-
-  template <typename T>
-  void operator()(const T&) {
-    PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
-                   typeid(T).name());
-  }
-};
-
-void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
-  ShareDimsAndLoDVisitor visitor{trg};
-  VisitVariable(src, &visitor);
-}
-
-struct EnforceShapeAndDTypeEQVisitor {
-  const Variable* trg_;
-
-  void operator()(const LoDTensor& src) {
-    auto& tensor = trg_->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), tensor.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
-    PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
-                      "The dims of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
-                      "The lod of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
-                      "The layout of the two Variable's tensor is not equal.");
-  }
-
-  void operator()(const SelectedRows& src) {
-    auto& selected_rows = trg_->Get<SelectedRows>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), selected_rows.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
-    PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
-                      "The layout of the two Variable's tensor is not equal.");
-    PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
-                      "The height of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
-                      "The dims of the two Variable is not equal.");
-  }
-
-  template <typename T>
-  void operator()(const T&) {
-    PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
-                   typeid(T).name());
-  }
-};
-
-void VariableVisitor::EnforceShapeAndDTypeEQ(const Variable& var1,
-                                             const Variable& var2) {
-  EnforceShapeAndDTypeEQVisitor visitor{&var1};
-  VisitVariable(var2, &visitor);
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h
deleted file mode 100644
index ca9a19bdcf1be7bf0e1d2b0de560a38f528a2d2c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/variable_visitor.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-class VariableVisitor {
- public:
-  static Tensor &GetMutableTensor(Variable *var);
-
-  static void ShareDimsAndLoD(const Variable &src, Variable *trg);
-
-  static void EnforceShapeAndDTypeEQ(const Variable &var1,
-                                     const Variable &var2);
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc
deleted file mode 100644
index 7fe60b4446a1c888b4f0a1b3ad1897eea4829bb9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/device_worker.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/device_worker.h"
-
-namespace paddle {
-namespace framework {
-
-void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; }
-
-void DeviceWorker::SetDataFeed(DataFeed* data_feed) {
-  device_reader_ = data_feed;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
deleted file mode 100644
index 6164953083bf6ee1e1612829f6f89ac23dc02392..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/device_worker.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-#include <fstream>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/trainer_desc.pb.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/reader/blocking_queue.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/platform/timer.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-
-#define SEC_LOG                                                              \
-  VLOG(3) << "[s" << section_id_ << "p" << pipeline_id_ << "t" << thread_id_ \
-          << "]: "
-
-class PullDenseWorker {
- public:
-  virtual ~PullDenseWorker() {}
-  virtual void Initialize(const TrainerDesc& param);
-  int Start();
-  void Stop();
-  void SetRootScope(Scope* scope) { root_scope_ = scope; }
-  void IncreaseThreadVersion(int thread_id, uint64_t table_id);
-  void ResetThreadVersion(uint64_t table_id);
-  void Wait(std::vector<::std::future<int32_t>>* status_vec);
-  void PullDense(bool force_update = false);
-  static std::shared_ptr<PullDenseWorker> GetInstance() {
-    if (NULL == s_instance_) {
-      s_instance_.reset(new paddle::framework::PullDenseWorker());
-    }
-    return s_instance_;
-  }
-
- private:
-  PullDenseWorker() : root_scope_(NULL) {}
-  void Run();
-  bool CheckUpdateParam(uint64_t table_id);
-
- private:
-  static std::shared_ptr<PullDenseWorker> s_instance_;
-  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
-  PullDenseWorkerParameter param_;
-  DownpourWorkerParameter dwp_param_;
-  Scope* root_scope_;
-  bool running_;
-
-  static std::map<uint64_t, uint64_t> last_versions_;
-  static std::map<uint64_t, uint64_t> current_version_;
-  static std::mutex mutex_for_version_;
-  static std::map<uint64_t, std::vector<uint64_t>> training_versions_;
-  static std::map<uint64_t, std::vector<std::string>> dense_value_names_;
-
-  std::thread t_;
-  int thread_num_;
-  int sleep_time_ms_;
-  int threshold_;
-
-  std::vector<::std::future<int32_t>> pull_dense_status_;
-  uint32_t pull_dense_fail_times_ = 0;
-  std::vector<float> base_norm_param_;
-  std::vector<float> mean_;
-  std::vector<float> scale_;
-  float squared_sum_epsilon_ = 1e-4;
-  std::mutex mutex_for_mean_scale_;
-  float total_batch_num_ = 0;
-};
-
-// should incorporate different type of device
-class DeviceWorker {
- public:
-  DeviceWorker() { use_cvm_ = false; }
-  virtual ~DeviceWorker() {}
-  virtual void Initialize(const TrainerDesc& desc) = 0;
-  virtual void SetDeviceIndex(int tid) = 0;
-  virtual void TrainFiles() = 0;
-  virtual void PrintFetchVars() = 0;
-  virtual void TrainFilesWithProfiler() = 0;
-  virtual void CreateDeviceResource(const ProgramDesc& main_prog) = 0;
-  // will make this zero copy in the future
-  virtual void BindingDataFeedMemory() = 0;
-  virtual void SetRootScope(Scope* root_scope);
-  virtual void SetDataFeed(DataFeed* data_feed);
-  virtual void SetNeedDump(bool need_dump_field) {}
-  virtual void SetChannelWriter(ChannelObject<std::string>* queue) {}
-  virtual void SetPlace(const paddle::platform::Place& place) {
-    place_ = place;
-  }
-  virtual void SetReaderPlace(const paddle::platform::Place& place) {
-    device_reader_->SetPlace(place);
-  }
-  virtual Scope* GetThreadScope() { return thread_scope_; }
-
- protected:
-  Scope* root_scope_ = nullptr;
-  Scope* thread_scope_;
-  paddle::platform::Place place_;
-  DataFeed* device_reader_ = nullptr;
-  int64_t batch_num_;
-  FetchConfig fetch_config_;
-  bool use_cvm_;
-};
-
-class CPUWorkerBase : public DeviceWorker {
- public:
-  CPUWorkerBase() {}
-  virtual ~CPUWorkerBase() {}
-  virtual void SetDeviceIndex(int tid) { thread_id_ = tid; }
-  virtual void TrainFiles() = 0;
-  virtual void TrainFilesWithProfiler() {}
-  virtual void PrintFetchVars() {}
-  virtual void CreateDeviceResource(const ProgramDesc& main_prog) {}
-
- protected:
-  int thread_id_;
-};
-
-class HogwildWorker : public CPUWorkerBase {
- public:
-  HogwildWorker() {}
-  virtual ~HogwildWorker() {
-    for (OperatorBase* op : ops_) {
-      delete op;
-    }
-    std::vector<OperatorBase*>().swap(ops_);
-  }
-  virtual void Initialize(const TrainerDesc& desc);
-  virtual void TrainFiles();
-  virtual void TrainFilesWithProfiler();
-  virtual void PrintFetchVars();
-  virtual void CreateDeviceResource(const ProgramDesc& main_prog);
-  virtual void BindingDataFeedMemory();
-  template <typename T>
-  void SetZero(LoDTensor* tensor, LoDTensor* root_tensor, int tensor_dim);
-
- protected:
-  void CreateThreadOperators(const ProgramDesc& program);
-  void CreateThreadScope(const ProgramDesc& program);
-  std::vector<std::string> op_names_;
-  std::vector<OperatorBase*> ops_;
-  // Scope* thread_scope_;
-  HogwildWorkerParameter param_;
-  std::vector<std::string> skip_ops_;
-  std::map<std::string, int> stat_var_name_map_;
-};
-
-class DownpourWorker : public HogwildWorker {
- public:
-  DownpourWorker() {}
-  virtual ~DownpourWorker() {}
-  virtual void Initialize(const TrainerDesc& desc);
-  virtual void TrainFiles();
-  virtual void TrainFilesWithProfiler();
-  virtual void SetNeedDump(bool need_dump_field);
-  virtual void SetChannelWriter(ChannelObject<std::string>* queue);
-
- protected:
-  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
-  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
-  void FillSparseValue(size_t table_id);
-  void PushGradients();
-  void CollectLabelInfo(size_t table_id);
-  void AdjustInsWeight();
-
- private:
-  bool need_to_push_dense_;
-  bool need_dump_field_;
-  bool dump_slot_;
-  bool need_to_push_sparse_;
-  std::vector<std::string> dump_fields_;
-  ChannelWriter<std::string> writer_;
-  DownpourWorkerParameter param_;
-  float scale_datanorm_;
-  // just save the value in param_ for easy access
-  std::map<uint64_t, std::string> label_var_name_;
-  std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
-  std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
-  std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
-  std::map<uint64_t, std::vector<std::string>> dense_value_names_;
-  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
-
-  // feasign
-  std::map<uint64_t, std::vector<uint64_t>> features_;
-  // feasign stats
-  std::map<uint64_t, std::vector<float>> feature_labels_;
-  // feasign embedding
-  std::map<uint64_t, std::vector<std::vector<float>>> feature_values_;
-  // feasign embedding gradient
-  std::map<uint64_t, std::vector<std::vector<float>>> feature_grads_;
-  // skipped ops
-  std::vector<std::string> skip_ops_;
-
-  std::shared_ptr<PullDenseWorker> _pull_dense_worker;
-  std::vector<::std::future<int32_t>> push_sparse_status_;
-  std::vector<::std::future<int32_t>> push_dense_status_;
-
-  // adjust ins weight
-  AdjustInsWeightConfig adjust_ins_weight_config_;
-  std::vector<float> nid_show_;
-};
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-using ScopeQueue = operators::reader::BlockingQueue<Scope*>;
-
-class SyncFunctor {
- public:
-  SyncFunctor(int rank_id, int rank_num, int sync_steps);
-  virtual ~SyncFunctor() {}
-
-  void SetSyncParam(const std::vector<std::string>& sync_param) {
-    sync_param_ = &sync_param;
-  }
-  void SetNcclCtxMap(platform::NCCLContextMap* nccl_ctx_map) {
-    nccl_ctx_map_ = nccl_ctx_map;
-  }
-
-  int operator()(Scope* scope);
-  static std::vector<Scope*> pipeline_scopes_;
-  static uint64_t sync_flag_;
-
- protected:
-  const int rank_id_;
-  const int rank_num_;
-  const std::vector<std::string>* sync_param_ = nullptr;
-  platform::NCCLContextMap* nccl_ctx_map_ = nullptr;
-
-  uint64_t sync_signal_;
-  const int sync_steps_;
-  int counter_;
-
-  void Synchronize();
-};
-
-class SectionWorker : public DeviceWorker {
- public:
-  SectionWorker() {}
-  ~SectionWorker() override {}
-
-  void Initialize(const TrainerDesc& desc) override;
-
-  void BindingDataFeedMemory() override {}
-  void CreateDeviceResource(const ProgramDesc& main_prog) override{};
-
-  void TrainFiles() override;
-  void TrainFilesWithProfiler() override;
-
-  void PrintFetchVars() override {}
-
-  const platform::Place& place() const { return place_; }
-
-  void SetSectionIndex(int section_id) { section_id_ = section_id; }
-  void SetDeviceIndex(int tid) override { pipeline_id_ = tid; }
-  void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
-  void SetVarNames(const std::vector<std::string>& in_var_names,
-                   const std::vector<std::string>& out_var_names) {
-    in_var_names_ = &in_var_names;
-    out_var_names_ = &out_var_names;
-  }
-  void SetScopeQueue(ScopeQueue* in_scope_queue, ScopeQueue* out_scope_queue) {
-    in_scope_queue_ = in_scope_queue;
-    out_scope_queue_ = out_scope_queue;
-  }
-  void SetCountMutex(std::mutex* mutex) { worker_count_mutex_ = mutex; }
-  void SetWorkerCount(int* worker_count) { worker_count_ = worker_count; }
-  void SetSectionNum(int section_num) { section_num_ = section_num; }
-  void SetPipelineNum(int pipeline_num) { pipeline_num_ = pipeline_num; }
-  void SetNextSectionPlace(const paddle::platform::Place& place) {
-    next_section_place_ = place;
-  }
-  SyncFunctor* sync_func_ = nullptr;
-  void SetSyncFunctor(SyncFunctor* sync_func) { sync_func_ = sync_func; }
-
-  static std::atomic<int> cpu_id_;
-
- protected:
-  void AutoSetCPUAffinity(bool reuse);
-  int section_id_;
-  int pipeline_id_;
-  int section_num_;
-  int pipeline_num_;
-  int thread_id_;
-  // This worker will consume scope from in_scope_queue_
-  // and produce scope to out_scope_queue_
-  ScopeQueue* in_scope_queue_ = nullptr;
-  ScopeQueue* out_scope_queue_ = nullptr;
-  const std::vector<std::string>* in_var_names_ = nullptr;
-  const std::vector<std::string>* out_var_names_ = nullptr;
-  std::mutex* worker_count_mutex_ = nullptr;
-  int* worker_count_ = nullptr;
-  paddle::platform::Place next_section_place_;
-
-  std::vector<std::unique_ptr<OperatorBase>> ops_;
-
-  platform::DeviceContext* dev_ctx_ = nullptr;
-};
-#endif
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
deleted file mode 100644
index dc85941f57d172b79c06f5ab91933fe0fa50465e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-namespace paddle {
-namespace framework {
-
-typedef std::shared_ptr<DeviceWorker> (*Createdevice_workerFunction)();
-typedef std::unordered_map<std::string, Createdevice_workerFunction>
-    device_workerMap;
-device_workerMap g_device_worker_map;
-#define REGISTER_DEVICE_WORKER_CLASS(device_worker_class)                \
-  namespace {                                                            \
-  std::shared_ptr<DeviceWorker> Creator_##device_worker_class() {        \
-    return std::shared_ptr<DeviceWorker>(new device_worker_class);       \
-  }                                                                      \
-  class __Registerer_##device_worker_class {                             \
-   public:                                                               \
-    __Registerer_##device_worker_class() {                               \
-      g_device_worker_map[#device_worker_class] =                        \
-          &Creator_##device_worker_class;                                \
-    }                                                                    \
-  };                                                                     \
-  __Registerer_##device_worker_class g_registerer_##device_worker_class; \
-  }  // namespace
-
-std::string DeviceWorkerFactory::DeviceWorkerTypeList() {
-  std::string device_worker_types;
-  for (auto iter = g_device_worker_map.begin();
-       iter != g_device_worker_map.end(); ++iter) {
-    if (iter != g_device_worker_map.begin()) {
-      device_worker_types += ", ";
-    }
-    device_worker_types += iter->first;
-  }
-  return device_worker_types;
-}
-
-std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
-    std::string device_worker_class) {
-  if (g_device_worker_map.count(device_worker_class) < 1) {
-    exit(-1);
-  }
-  return g_device_worker_map[device_worker_class]();
-}
-
-REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
-REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
-#endif
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/device_worker_factory.h b/paddle/fluid/framework/device_worker_factory.h
deleted file mode 100644
index 9d0613385e78c9f482840677c71f621e53ed85b5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/device_worker_factory.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/device_worker.h"
-
-namespace paddle {
-namespace framework {
-
-class DeviceWorkerFactory {
- public:
-  static std::string DeviceWorkerTypeList();
-  static std::shared_ptr<DeviceWorker> CreateDeviceWorker(
-      std::string device_worker_class);
-};
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/device_worker_test.cc b/paddle/fluid/framework/device_worker_test.cc
deleted file mode 100644
index faa648ab35d2b4d7a553344c2261d2aa07d0829a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/device_worker_test.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/trainer.h"
-
-namespace paddle {
-namespace framework {
-TEST() {
-  // create hogwild device worker
-}
-}
-}
diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h
deleted file mode 100644
index 66214b265fdf9078aeda4efa37c7ad1f2bbef62b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/dim.h
+++ /dev/null
@@ -1,100 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-
-#include "paddle/fluid/framework/array.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace framework {
-
-// Statically sized, statically indexed dimension
-template <int D>
-class Dim : public Array<int64_t, D> {
- public:
-  static_assert(D >= 0, "D must be not less than 0");
-
-  static constexpr int kRank = D;
-  using BaseClass = Array<int64_t, D>;
-
-  inline Dim(int64_t head, const Dim<D - 1>& tail) {
-    (*this)[0] = head;
-    new (this->GetMutable() + 1) Dim<D - 1>(tail);
-  }
-
-  template <typename... Args>
-  HOSTDEVICE explicit Dim(int64_t head, Args... args)
-      : BaseClass(head, args...) {}
-
-  /** Construct a Dim with each dimension set to the given index */
-  HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); }
-
-  HOSTDEVICE Dim() = default;
-
-  HOST std::string to_string() const;
-};
-
-// Product of a Dim
-template <int D>
-HOSTDEVICE inline int64_t product(const Dim<D>& a) {
-  return UnrollProduct<D>::Run(a.Get());
-}
-
-/**
- * Helper function to create a Dim
- *
- * \param idxes The type of Dim constructed depends on the number of params
- *
- */
-
-template <typename... Args>
-HOSTDEVICE inline Dim<sizeof...(Args)> make_dim(Args... idxes) {
-  return Dim<sizeof...(Args)>(idxes...);
-}
-
-// Allows us to output a Dim
-template <int D>
-inline std::ostream& operator<<(std::ostream& os, const Dim<D>& d) {
-  os << d[0];
-  for (int i = 1; i < D; ++i) {
-    os << ", " << d[i];
-  }
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
-  return os;
-}
-
-template <int D>
-HOST std::string Dim<D>::to_string() const {
-  std::stringstream stream;
-  stream << *this;
-  return stream.str();
-}
-
-template <int D, typename T1, typename T2>
-inline void static_dim_assign(const T1* in, T2* out) {
-  UnrollAssign<D>::Run(in, out);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/dim_test.cu b/paddle/fluid/framework/dim_test.cu
deleted file mode 100644
index 7add6d140c7e0942fca22df0c118e0f15460fb07..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/dim_test.cu
+++ /dev/null
@@ -1,86 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <thrust/device_vector.h>
-#include <sstream>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/dim.h"
-
-__global__ void test(paddle::framework::Dim<2>* o) {
-  o[0] = paddle::framework::make_dim(5, 6);
-}
-
-__global__ void dyn_idx_gpu(int64_t* o) {
-  auto d = paddle::framework::make_dim(5, 6);
-  o[0] = d[1];
-}
-
-TEST(Dim, Equality) {
-  // construct a Dim on the CPU
-  auto a = paddle::framework::make_dim(3, 4);
-  EXPECT_EQ(a[0], 3);
-  EXPECT_EQ(a[1], 4);
-
-  // construct a Dim on the GPU
-  thrust::device_vector<paddle::framework::Dim<2>> t(2);
-  test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
-  a = t[0];
-  EXPECT_EQ(a[0], 5);
-  EXPECT_EQ(a[1], 6);
-
-  // product
-  EXPECT_EQ(paddle::framework::product(a), 30);
-
-  // mutate a Dim
-  auto b = paddle::framework::make_dim(7, 8);
-  b[1] = 10;
-  EXPECT_EQ(b[0], 7);
-  EXPECT_EQ(b[1], 10);
-
-  b[0] = 8;
-  b[1] = 11;
-  EXPECT_EQ(b[0], 8);
-  EXPECT_EQ(b[1], 11);
-
-  // dynamic access on GPU
-  thrust::device_vector<int64_t> r(1);
-  dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
-  int64_t res = r[0];
-  EXPECT_EQ(res, 6);
-}
-
-TEST(Dim, Bool) {
-  auto a = paddle::framework::make_dim(3, 4);
-  auto b = paddle::framework::make_dim(5, 6);
-  auto c = paddle::framework::make_dim(3, 4);
-
-  // comparison
-  EXPECT_TRUE(a == a);
-  EXPECT_FALSE(a == b);
-  EXPECT_TRUE(a == c);
-}
-
-TEST(Dim, Print) {
-  {
-    std::stringstream ss;
-    auto a = paddle::framework::make_dim(2, 3);
-    ss << a;
-    EXPECT_EQ(ss.str(), "2, 3");
-  }
-  {
-    std::stringstream ss;
-    ss << paddle::framework::make_dim(8);
-    EXPECT_EQ(ss.str(), "8");
-  }
-}
diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc
deleted file mode 100755
index 97fd055345e49747892b73328bc10f6ed37f1b94..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/dist_multi_trainer.cc
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-#include "io/fs.h"
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include "paddle/fluid/framework/data_set.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/trainer.h"
-
-namespace paddle {
-namespace framework {
-
-void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
-                                  Dataset *dataset) {
-  thread_num_ = trainer_desc.thread_num();
-  SetDataset(dataset);
-
-  dump_fields_path_ = trainer_desc.dump_fields_path();
-  dump_converter_ = trainer_desc.dump_converter();
-  need_dump_field_ = false;
-  if (trainer_desc.dump_fields_size() != 0 && dump_fields_path_ != "") {
-    need_dump_field_ = true;
-  }
-  user_define_dump_filename_ = trainer_desc.user_define_dump_filename();
-  if (need_dump_field_) {
-    auto &file_list = dataset->GetFileList();
-    if (file_list.size() == 0) {
-      need_dump_field_ = false;
-    }
-  }
-  mpi_rank_ = trainer_desc.mpi_rank() / 2;
-  const std::vector<paddle::framework::DataFeed *> readers =
-      dataset->GetReaders();
-
-  thread_num_ = readers.size();
-  workers_.resize(thread_num_);
-  for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
-       i++) {
-    need_merge_var_names_.push_back(
-        trainer_desc.downpour_param().stat_var_names(i));
-  }
-
-  for (int i = 0; i < thread_num_; ++i) {
-    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
-        trainer_desc.device_worker_name());
-    workers_[i]->SetDeviceIndex(i);
-    workers_[i]->SetDataFeed(readers[i]);
-    workers_[i]->Initialize(trainer_desc);
-    workers_[i]->SetNeedDump(need_dump_field_);
-  }
-
-  VLOG(3) << "going to initialize pull dense worker";
-  pull_dense_worker_ = PullDenseWorker::GetInstance();
-  pull_dense_worker_->Initialize(trainer_desc);
-  VLOG(3) << "initialize pull dense worker";
-  SetDebug(trainer_desc.debug());
-}
-
-void DistMultiTrainer::DumpWork() {
-#ifdef _LINUX
-  while (1) {
-    std::string out_str;
-    if (!queue_->Get(out_str)) {
-      break;
-    }
-    size_t write_count =
-        fwrite_unlocked(out_str.data(), 1, out_str.length(), fp_.get());
-    if (write_count != out_str.length()) {
-      VLOG(3) << "dump text failed";
-      continue;
-    }
-    write_count = fwrite_unlocked("\n", 1, 1, fp_.get());
-    if (write_count != 1) {
-      VLOG(3) << "dump text failed";
-      continue;
-    }
-  }
-#endif
-}
-
-void DistMultiTrainer::InitDumpEnv() {
-  queue_ = paddle::framework::MakeChannel<std::string>();
-  int err_no = 0;
-  std::string path = string::format_string(
-      "%s/part-%03d", dump_fields_path_.c_str(), mpi_rank_);
-
-  if (user_define_dump_filename_ != "") {
-    path = string::format_string("%s/part-%s", dump_fields_path_.c_str(),
-                                 user_define_dump_filename_.c_str());
-  }
-
-  fp_ = fs_open_write(path, &err_no, dump_converter_);
-  for (int i = 0; i < thread_num_; ++i) {
-    workers_[i]->SetChannelWriter(queue_.get());
-  }
-  dump_thread_ = std::thread(&DistMultiTrainer::DumpWork, this);
-}
-
-void DistMultiTrainer::FinalizeDumpEnv() {
-  queue_->Close();
-  dump_thread_.join();
-  queue_.reset();
-}
-
-void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
-  if (need_dump_field_) {
-    InitDumpEnv();
-  }
-  pull_dense_worker_->SetRootScope(root_scope_);
-  pull_dense_worker_->Start();
-  VLOG(3) << "init other env done.";
-}
-
-void DistMultiTrainer::Run() {
-  for (int thidx = 0; thidx < thread_num_; ++thidx) {
-    if (!debug_) {
-      threads_.push_back(
-          std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get()));
-    } else {
-      threads_.push_back(std::thread(&DeviceWorker::TrainFilesWithProfiler,
-                                     workers_[thidx].get()));
-    }
-  }
-}
-
-void DistMultiTrainer::Finalize() {
-  for (auto &th : threads_) {
-    th.join();
-  }
-  for (int i = 0; i < need_merge_var_names_.size(); i++) {
-    Variable *root_var = root_scope_->FindVar(need_merge_var_names_[i]);
-    if (root_var == nullptr) {
-      continue;
-    }
-    LoDTensor *root_tensor = root_var->GetMutable<LoDTensor>();
-    for (int j = 1; j < thread_num_; j++) {
-      Scope *cur_thread_scope = workers_[j]->GetThreadScope();
-      Variable *thread_var =
-          cur_thread_scope->FindVar(need_merge_var_names_[i]);
-      LoDTensor *thread_tensor = thread_var->GetMutable<LoDTensor>();
-      if (root_tensor->numel() != thread_tensor->numel()) {
-        continue;
-      }
-#define MergeCallback(cpp_type, proto_type)                                    \
-  do {                                                                         \
-    if (root_tensor->type() == proto_type) {                                   \
-      if (thread_tensor->type() != proto_type) {                               \
-        VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \
-                << "] " << need_merge_var_names_[i]                            \
-                << ", root tensor type=" << root_tensor->type()                \
-                << ", thread tensor type=" << thread_tensor->type();           \
-        exit(-1);                                                              \
-      }                                                                        \
-      MergeToRootScope<cpp_type>(root_tensor, thread_tensor);                  \
-    }                                                                          \
-  } while (0)
-      _ForEachDataType_(MergeCallback);
-    }
-  }
-
-  if (need_dump_field_) {
-    FinalizeDumpEnv();
-  }
-  pull_dense_worker_->Stop();
-  root_scope_->DropKids();
-
-  // flush local client push queue
-  auto fleet_ptr_ = FleetWrapper::GetInstance();
-  fleet_ptr_->ClientFlush();
-}
-
-template <typename T>
-void DistMultiTrainer::MergeToRootScope(LoDTensor *root_tensor,
-                                        LoDTensor *tensor) {
-  T *root_data = root_tensor->data<T>();
-  T *data = tensor->data<T>();
-  for (int i = 0; i < tensor->numel(); i++) {
-    root_data[i] += data[i];
-  }
-}
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
deleted file mode 100644
index 39652706c43fb51da99170b361b3e1a6e04c6fc9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/dlpack_tensor.h"
-#include "paddle/fluid/framework/data_type.h"
-namespace paddle {
-namespace framework {
-
-namespace internal {
-template <typename T>
-static ::DLDataType GetDLDataTypeCode() {
-  ::DLDataType dtype;
-  if (std::is_same<T, platform::float16>::value ||
-      std::is_floating_point<T>::value) {
-    dtype.code = kDLFloat;
-  } else if (std::is_unsigned<T>::value) {
-    dtype.code = kDLUInt;
-  } else if (std::is_integral<T>::value) {
-    dtype.code = kDLInt;
-  } else {
-    PADDLE_THROW("Unsupported data type %s", typeid(T).name());
-  }
-  dtype.bits = 8 * sizeof(T);
-  dtype.lanes = 1;
-  return dtype;
-}
-
-static std::unordered_map<int, ::DLDataType> CreateDLDataTypeMap() {
-  static std::unordered_map<int, ::DLDataType> result;
-
-#define REG_DL_DATA_TYPE(cpp_type, proto_type) \
-  result[static_cast<int>(proto_type)] = GetDLDataTypeCode<cpp_type>()
-
-  _ForEachDataType_(REG_DL_DATA_TYPE);
-#undef REG_DL_DATA_TYPE
-  return result;
-}
-
-static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) {
-  static auto type_to_dtype_map = CreateDLDataTypeMap();
-  static auto type_to_dtype_map_end_it = type_to_dtype_map.end();
-  auto it = type_to_dtype_map.find(static_cast<int>(type));
-  PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %d",
-                 type);
-  return it->second;
-#undef REG_DL_DATA_TYPE
-}
-
-struct DLContextVisitor : public boost::static_visitor<::DLContext> {
-  inline ::DLContext operator()(const platform::CPUPlace &place) const {
-    ::DLContext ctx;
-    ctx.device_type = kDLCPU;
-    ctx.device_id = 0;
-    return ctx;
-  }
-
-  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
-#ifdef PADDLE_WITH_CUDA
-    ::DLContext ctx;
-    ctx.device_type = kDLGPU;
-    ctx.device_id = place.device;
-    return ctx;
-#else
-    PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version");
-#endif
-  }
-
-  inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
-#ifdef PADDLE_WITH_CUDA
-    ::DLContext ctx;
-    ctx.device_type = kDLCPUPinned;
-    ctx.device_id = 0;
-    return ctx;
-#else
-    PADDLE_THROW(
-        "platform::CUDAPinnedPlace is not supported in CPU only version");
-#endif
-  }
-};
-}  // namespace internal
-
-DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) {
-  // init data, data buffer
-  t_.data = const_cast<void *>(tensor.data<void>());
-
-  // init ctx, DLContext type with device_type and device_id
-  auto place = tensor.place();
-  t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place);
-
-  // init dtype
-  t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type());
-  t_.dtype.lanes = lanes;
-
-  // init ndim, tensor rank
-  auto &dims = tensor.dims();
-  using DimType = decltype(t_.ndim);  // int
-  t_.ndim = static_cast<DimType>(dims.size());
-
-  // init shape, tensor dims
-  t_.shape = shape_;
-  for (DimType i = 0; i < t_.ndim; ++i) {
-    t_.shape[i] = dims[i];
-  }
-
-  // init strides, nullptr means the tensor is compact
-  t_.strides = nullptr;
-
-  // init byte_offset
-  t_.byte_offset = 0;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h
deleted file mode 100644
index e48b0d5c88fecf797a61283b004735fdcbabb329..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/dlpack_tensor.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <dlpack/dlpack.h>
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace framework {
-
-class DLPackTensor {
- public:
-  using LaneType = decltype(::DLTensor::dtype.lanes);  // uint16_t
-  using ShapeType =
-      std::remove_reference<decltype(::DLTensor::shape[0])>::type;  // int64_t
-
-  // lanes is only used in CPU to enable vectorization
-  explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1);
-
-  inline operator const ::DLTensor&() const { return t_; }
-
-  inline operator ::DLTensor&() { return t_; }
-
- private:
-  ::DLTensor t_;
-
-  // The shape in DLTensor is defined as int64_t*
-  // Add this member to make TVMTensor init without heap allocation
-  ShapeType shape_[DDim::kMaxRank];
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc
deleted file mode 100644
index c0a8e1bcdfa3a54aea061f1a0815fc1405c76d9c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/dlpack_tensor_test.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/dlpack_tensor.h"
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-
-namespace {  // NOLINT
-template <typename T>
-constexpr uint8_t GetDLDataTypeCode() {
-  return std::is_same<platform::float16, T>::value ||
-                 std::is_floating_point<T>::value
-             ? static_cast<uint8_t>(kDLFloat)
-             : (std::is_unsigned<T>::value
-                    ? static_cast<uint8_t>(kDLUInt)
-                    : (std::is_integral<T>::value ? static_cast<uint8_t>(kDLInt)
-                                                  : static_cast<uint8_t>(-1)));
-}
-}  // NOLINT
-
-template <typename T>
-void TestMain(const platform::Place &place, uint16_t lanes) {
-  DDim dims{4, 5, 6, 7};
-  Tensor tensor;
-  tensor.Resize(dims);
-  void *p = tensor.mutable_data<T>(place);
-
-  DLPackTensor dlpack_tensor(tensor, lanes);
-  ::DLTensor &dl_tensor = dlpack_tensor;
-
-  CHECK_EQ(p, dl_tensor.data);
-  if (platform::is_cpu_place(place)) {
-    CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type);
-    CHECK_EQ(0, dl_tensor.ctx.device_id);
-  } else if (platform::is_gpu_place(place)) {
-    CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type);
-    CHECK_EQ(boost::get<platform::CUDAPlace>(place).device,
-             dl_tensor.ctx.device_id);
-  } else if (platform::is_cuda_pinned_place(place)) {
-    CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type);
-    CHECK_EQ(0, dl_tensor.ctx.device_id);
-  } else {
-    CHECK_EQ(false, true);
-  }
-
-  CHECK_EQ(dims.size(), dl_tensor.ndim);
-  for (auto i = 0; i < dims.size(); ++i) {
-    CHECK_EQ(dims[i], dl_tensor.shape[i]);
-  }
-
-  CHECK_EQ(dl_tensor.strides == nullptr, true);
-  CHECK_EQ(static_cast<uint64_t>(0), dl_tensor.byte_offset);
-
-  CHECK_EQ(lanes, dl_tensor.dtype.lanes);
-  CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits);
-
-  CHECK_EQ(GetDLDataTypeCode<T>(), dl_tensor.dtype.code);
-}
-
-template <typename T>
-void TestMainLoop() {
-#ifdef PADDLE_WITH_CUDA
-  std::vector<platform::Place> places{platform::CPUPlace(),
-                                      platform::CUDAPlace(0),
-                                      platform::CUDAPinnedPlace()};
-  if (platform::GetCUDADeviceCount() > 1) {
-    places.emplace_back(platform::CUDAPlace(1));
-  }
-#else
-  std::vector<platform::Place> places{platform::CPUPlace()};
-#endif
-  std::vector<uint16_t> lanes{1, 2};
-  for (auto &p : places) {
-    for (auto &l : lanes) {
-      TestMain<T>(p, l);
-    }
-  }
-}
-TEST(dlpack, test_all) {
-#define TestCallback(cpp_type, proto_type) TestMainLoop<cpp_type>()
-
-  _ForEachDataType_(TestCallback);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
deleted file mode 100644
index e7dbf3b1131740748a5258455fdc76e2a50f1fc9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/downpour_worker.cc
+++ /dev/null
@@ -1,782 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/string/string_helper.h"
-
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
-
-namespace paddle {
-namespace framework {
-
-void DownpourWorker::Initialize(const TrainerDesc& desc) {
-  param_ = desc.downpour_param();
-  for (int i = 0; i < param_.sparse_table_size(); ++i) {
-    uint64_t table_id =
-        static_cast<uint64_t>(param_.sparse_table(i).table_id());
-    TableParameter table = param_.sparse_table(i);
-    sparse_key_names_[table_id].resize(table.sparse_key_name_size());
-    for (int j = 0; j < table.sparse_key_name_size(); ++j) {
-      sparse_key_names_[table_id][j] = table.sparse_key_name(j);
-    }
-    sparse_value_names_[table_id].resize(table.sparse_value_name_size());
-    for (int j = 0; j < table.sparse_value_name_size(); ++j) {
-      sparse_value_names_[table_id][j] = table.sparse_value_name(j);
-    }
-    sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
-    for (int j = 0; j < table.sparse_grad_name_size(); ++j) {
-      sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
-    }
-    label_var_name_[table_id] = table.label_var_name();
-  }
-
-  for (int i = 0; i < param_.dense_table_size(); ++i) {
-    uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
-    auto table = param_.dense_table(i);
-    dense_value_names_[table_id].resize(table.dense_value_name_size());
-    for (int j = 0; j < table.dense_value_name_size(); ++j) {
-      dense_value_names_[table_id][j] = table.dense_value_name(j);
-    }
-    dense_grad_names_[table_id].resize(table.dense_grad_name_size());
-    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
-      dense_grad_names_[table_id][j] = table.dense_grad_name(j);
-    }
-  }
-
-  skip_ops_.resize(param_.skip_ops_size());
-  for (int i = 0; i < param_.skip_ops_size(); ++i) {
-    skip_ops_[i] = param_.skip_ops(i);
-  }
-
-  for (int i = 0; i < param_.stat_var_names_size(); ++i) {
-    stat_var_name_map_[param_.stat_var_names(i)] = 1;
-  }
-
-  need_to_push_sparse_ = param_.push_sparse();
-  need_to_push_dense_ = param_.push_dense();
-
-  fleet_ptr_ = FleetWrapper::GetInstance();
-  fetch_config_ = desc.fetch_config();
-  use_cvm_ = desc.use_cvm();
-  scale_datanorm_ = desc.scale_datanorm();
-  dump_slot_ = desc.dump_slot();
-  dump_fields_.resize(desc.dump_fields_size());
-  for (int i = 0; i < desc.dump_fields_size(); ++i) {
-    dump_fields_[i] = desc.dump_fields(i);
-  }
-  adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
-}
-
-void DownpourWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
-  writer_.Reset(queue);
-}
-
-void DownpourWorker::SetNeedDump(bool need_dump_field) {
-  need_dump_field_ = need_dump_field;
-}
-
-template <typename T>
-std::string PrintLodTensorType(LoDTensor* tensor, int64_t start, int64_t end) {
-  auto count = tensor->numel();
-  if (start < 0 || end > count) {
-    VLOG(3) << "access violation";
-    return "access violation";
-  }
-  std::ostringstream os;
-  for (int64_t i = start; i < end; i++) {
-    os << ":" << tensor->data<T>()[i];
-  }
-  return os.str();
-}
-
-std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start,
-                                  int64_t end) {
-  auto count = tensor->numel();
-  if (start < 0 || end > count) {
-    VLOG(3) << "access violation";
-    return "access violation";
-  }
-  std::ostringstream os;
-  for (int64_t i = start; i < end; i++) {
-    os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
-  }
-  return os.str();
-}
-
-std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end) {
-  std::string out_val;
-  if (tensor->type() == proto::VarType::FP32) {
-    out_val = PrintLodTensorType<float>(tensor, start, end);
-  } else if (tensor->type() == proto::VarType::INT64) {
-    out_val = PrintLodTensorIntType(tensor, start, end);
-  } else if (tensor->type() == proto::VarType::FP64) {
-    out_val = PrintLodTensorType<double>(tensor, start, end);
-  } else {
-    out_val = "unsupported type";
-  }
-  return out_val;
-}
-
-std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index) {
-  auto& dims = tensor->dims();
-  if (tensor->lod().size() != 0) {
-    auto& lod = tensor->lod()[0];
-    return {lod[index] * dims[1], lod[index + 1] * dims[1]};
-  } else {
-    return {index * dims[1], (index + 1) * dims[1]};
-  }
-}
-
-bool CheckValidOutput(LoDTensor* tensor, int batch_size) {
-  auto& dims = tensor->dims();
-  if (dims.size() != 2) return false;
-  if (tensor->lod().size() != 0) {
-    auto& lod = tensor->lod()[0];
-    if (lod.size() != batch_size + 1) {
-      return false;
-    }
-  } else {
-    if (dims[0] != batch_size) {
-      return false;
-    }
-  }
-  return true;
-}
-
-void DownpourWorker::CollectLabelInfo(size_t table_idx) {
-  uint64_t table_id = static_cast<uint64_t>(
-      param_.program_config(0).pull_sparse_table_id(table_idx));
-
-  TableParameter table;
-  for (auto i : param_.sparse_table()) {
-    if (i.table_id() == table_id) {
-      table = i;
-      break;
-    }
-  }
-  auto& feature = features_[table_id];
-  auto& feature_label = feature_labels_[table_id];
-  feature_label.resize(feature.size());
-  Variable* var = thread_scope_->FindVar(label_var_name_[table_id]);
-  LoDTensor* tensor = var->GetMutable<LoDTensor>();
-  int64_t* label_ptr = tensor->data<int64_t>();
-
-  size_t global_index = 0;
-  for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
-    VLOG(3) << "sparse_key_names_[" << i
-            << "]: " << sparse_key_names_[table_id][i];
-    Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]);
-    if (fea_var == nullptr) {
-      continue;
-    }
-    LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
-    CHECK(tensor != nullptr) << "tensor of var "
-                             << sparse_key_names_[table_id][i] << " is null";
-    int64_t* ids = tensor->data<int64_t>();
-    size_t fea_idx = 0;
-    // tensor->lod()[0].size() == batch_size + 1
-    for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) {
-      for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) {
-        // should be skipped feasign defined in protobuf
-        if (ids[fea_idx] == 0u) {
-          continue;
-        }
-        feature_label[global_index++] =
-            static_cast<float>(label_ptr[lod_idx - 1]);
-      }
-    }
-  }
-  CHECK(global_index == feature.size())
-      << "expect fea info size:" << feature.size() << " real:" << global_index;
-}
-
-void DownpourWorker::FillSparseValue(size_t table_idx) {
-  uint64_t table_id = static_cast<uint64_t>(
-      param_.program_config(0).pull_sparse_table_id(table_idx));
-
-  TableParameter table;
-  for (auto i : param_.sparse_table()) {
-    if (i.table_id() == table_id) {
-      table = i;
-      break;
-    }
-  }
-
-  auto& fea_value = feature_values_[table_id];
-  auto fea_idx = 0u;
-
-  std::vector<float> init_value(table.fea_dim());
-  for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
-    std::string slot_name = sparse_key_names_[table_id][i];
-    std::string emb_slot_name = sparse_value_names_[table_id][i];
-    Variable* var = thread_scope_->FindVar(slot_name);
-    if (var == nullptr) {
-      continue;
-    }
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    CHECK(tensor != nullptr) << "tensor of var " << slot_name << " is null";
-    int64_t* ids = tensor->data<int64_t>();
-    int len = tensor->numel();
-    Variable* var_emb = thread_scope_->FindVar(emb_slot_name);
-    LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
-    float* ptr = tensor_emb->mutable_data<float>({len, table.emb_dim()},
-                                                 platform::CPUPlace());
-    memset(ptr, 0, sizeof(float) * len * table.emb_dim());
-    auto& tensor_lod = tensor->lod()[0];
-    LoD data_lod{tensor_lod};
-    tensor_emb->set_lod(data_lod);
-
-    bool is_nid = (adjust_ins_weight_config_.need_adjust() &&
-                   adjust_ins_weight_config_.nid_slot() == emb_slot_name);
-    if (is_nid) {
-      nid_show_.clear();
-    }
-    int nid_ins_index = 0;
-
-    for (int index = 0; index < len; ++index) {
-      if (use_cvm_) {
-        if (ids[index] == 0u) {
-          memcpy(ptr + table.emb_dim() * index, init_value.data(),
-                 sizeof(float) * table.emb_dim());
-          if (is_nid) {
-            nid_show_.push_back(-1);
-            ++nid_ins_index;
-          }
-          continue;
-        }
-        memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(),
-               sizeof(float) * table.emb_dim());
-        if (is_nid && index == tensor->lod()[0][nid_ins_index]) {
-          nid_show_.push_back(fea_value[fea_idx][0]);
-          ++nid_ins_index;
-        }
-        fea_idx++;
-      } else {
-        if (ids[index] == 0u) {
-          memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
-                 sizeof(float) * table.emb_dim());
-          if (is_nid) {
-            nid_show_.push_back(-1);
-            ++nid_ins_index;
-          }
-          continue;
-        }
-        memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2,
-               sizeof(float) * table.emb_dim());
-        if (is_nid && index == tensor->lod()[0][nid_ins_index]) {
-          nid_show_.push_back(fea_value[fea_idx][0]);
-          ++nid_ins_index;
-        }
-        fea_idx++;
-      }
-    }
-  }
-}
-
-void DownpourWorker::AdjustInsWeight() {
-#ifdef _LINUX
-  // check var and tensor not null
-  if (!adjust_ins_weight_config_.need_adjust()) {
-    VLOG(0) << "need_adjust=false, skip adjust ins weight";
-    return;
-  }
-  Variable* nid_var =
-      thread_scope_->FindVar(adjust_ins_weight_config_.nid_slot());
-  if (nid_var == nullptr) {
-    VLOG(0) << "nid slot var " << adjust_ins_weight_config_.nid_slot()
-            << " is nullptr, skip adjust ins weight";
-    return;
-  }
-  LoDTensor* nid_tensor = nid_var->GetMutable<LoDTensor>();
-  if (nid_tensor == nullptr) {
-    VLOG(0) << "tensor of nid slot var " << adjust_ins_weight_config_.nid_slot()
-            << " is nullptr, skip adjust ins weight";
-    return;
-  }
-  Variable* ins_weight_var =
-      thread_scope_->FindVar(adjust_ins_weight_config_.ins_weight_slot());
-  if (ins_weight_var == nullptr) {
-    VLOG(0) << "ins weight var " << adjust_ins_weight_config_.ins_weight_slot()
-            << " is nullptr, skip adjust ins weight";
-    return;
-  }
-  LoDTensor* ins_weight_tensor = ins_weight_var->GetMutable<LoDTensor>();
-  if (ins_weight_tensor == nullptr) {
-    VLOG(0) << "tensor of ins weight tensor "
-            << adjust_ins_weight_config_.ins_weight_slot()
-            << " is nullptr, skip adjust ins weight";
-    return;
-  }
-
-  float* ins_weights = ins_weight_tensor->data<float>();
-  size_t len = ins_weight_tensor->numel();  // len = batch size
-  // here we assume nid_show slot only has one feasign in each instance
-  CHECK(len == nid_show_.size()) << "ins_weight size should be equal to "
-                                 << "nid_show size, " << len << " vs "
-                                 << nid_show_.size();
-  float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold();
-  float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio();
-  int64_t nid_adjw_num = 0;
-  double nid_adjw_weight = 0.0;
-  size_t ins_index = 0;
-  for (int i = 0; i < len; ++i) {
-    float nid_show = nid_show_[i];
-    VLOG(3) << "nid_show " << nid_show;
-    if (nid_show < 0) {
-      VLOG(3) << "nid_show < 0, continue";
-      continue;
-    }
-    float ins_weight = 1.0;
-    if (nid_show >= 0 && nid_show < nid_adjw_threshold) {
-      ins_weight = log(M_E +
-                       (nid_adjw_threshold - nid_show) / nid_adjw_threshold *
-                           nid_adjw_ratio);
-      // count nid adjw insnum and weight
-      ++nid_adjw_num;
-      nid_adjw_weight += ins_weight;
-      // choose large ins weight
-      VLOG(3) << "ins weight new " << ins_weight << ", ins weight origin "
-              << ins_weights[ins_index];
-      if (ins_weight > ins_weights[ins_index]) {
-        VLOG(3) << "ins " << ins_index << " weight changes to " << ins_weight;
-        ins_weights[ins_index] = ins_weight;
-      }
-      ++ins_index;
-    }
-  }
-  VLOG(3) << "nid adjw info: total_adjw_num: " << nid_adjw_num
-          << ", avg_adjw_weight: " << nid_adjw_weight;
-#endif
-}
-
-void DownpourWorker::TrainFilesWithProfiler() {
-  VLOG(3) << "Begin to train files with profiler";
-  platform::SetNumThreads(1);
-  device_reader_->Start();
-  std::vector<double> op_total_time;
-  std::vector<std::string> op_name;
-  for (auto& op : ops_) {
-    bool need_skip = false;
-    for (auto t = 0u; t < skip_ops_.size(); ++t) {
-      if (op->Type().find(skip_ops_[t]) != std::string::npos) {
-        need_skip = true;
-        break;
-      }
-    }
-    if (!need_skip) {
-      op_name.push_back(op->Type());
-    }
-  }
-
-  VLOG(3) << "op name size: " << op_name.size();
-  op_total_time.resize(op_name.size());
-  for (size_t i = 0; i < op_total_time.size(); ++i) {
-    op_total_time[i] = 0.0;
-  }
-  platform::Timer timeline;
-  double total_time = 0.0;
-  double read_time = 0.0;
-  double pull_sparse_time = 0.0;
-  double adjust_ins_weight_time = 0.0;
-  double collect_label_time = 0.0;
-  double fill_sparse_time = 0.0;
-  double push_sparse_time = 0.0;
-  double push_dense_time = 0.0;
-  int cur_batch;
-  int batch_cnt = 0;
-  uint64_t total_inst = 0;
-  timeline.Start();
-  while ((cur_batch = device_reader_->Next()) > 0) {
-    timeline.Pause();
-    read_time += timeline.ElapsedSec();
-    total_time += timeline.ElapsedSec();
-    VLOG(3) << "program config size: " << param_.program_config_size();
-    for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
-         ++i) {
-      uint64_t tid = static_cast<uint64_t>(
-          param_.program_config(0).pull_sparse_table_id(i));
-      TableParameter table;
-      for (auto j : param_.sparse_table()) {
-        if (j.table_id() == tid) {
-          table = j;
-          break;
-        }
-      }
-      timeline.Start();
-      fleet_ptr_->PullSparseVarsSync(*thread_scope_, tid,
-                                     sparse_key_names_[tid], &features_[tid],
-                                     &feature_values_[tid], table.fea_dim());
-      timeline.Pause();
-      pull_sparse_time += timeline.ElapsedSec();
-      total_time += timeline.ElapsedSec();
-      timeline.Start();
-      CollectLabelInfo(i);
-      timeline.Pause();
-      collect_label_time += timeline.ElapsedSec();
-      total_time += timeline.ElapsedSec();
-      timeline.Start();
-      FillSparseValue(i);
-      timeline.Pause();
-      fill_sparse_time += timeline.ElapsedSec();
-      total_time += timeline.ElapsedSec();
-      timeline.Start();
-      auto nid_iter = std::find(sparse_value_names_[tid].begin(),
-                                sparse_value_names_[tid].end(),
-                                adjust_ins_weight_config_.nid_slot());
-      if (nid_iter != sparse_value_names_[tid].end()) {
-        AdjustInsWeight();
-      }
-      timeline.Pause();
-      adjust_ins_weight_time += timeline.ElapsedSec();
-      total_time += timeline.ElapsedSec();
-    }
-    VLOG(3) << "Fill sparse value for all sparse table done.";
-
-    int run_op_idx = 0;
-    for (auto& op : ops_) {
-      bool need_skip = false;
-      for (auto t = 0u; t < skip_ops_.size(); ++t) {
-        if (op->Type().find(skip_ops_[t]) != std::string::npos) {
-          need_skip = true;
-          break;
-        }
-      }
-      if (!need_skip) {
-        timeline.Start();
-        VLOG(3) << "Going to run op " << op_name[run_op_idx];
-        op->Run(*thread_scope_, place_);
-        VLOG(3) << "Op " << op_name[run_op_idx] << " Finished";
-        timeline.Pause();
-        op_total_time[run_op_idx++] += timeline.ElapsedSec();
-        total_time += timeline.ElapsedSec();
-      }
-    }
-
-    if (need_to_push_sparse_) {
-      for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
-           ++i) {
-        uint64_t tid = static_cast<uint64_t>(
-            param_.program_config(0).push_sparse_table_id(i));
-        TableParameter table;
-        for (auto i : param_.sparse_table()) {
-          if (i.table_id() == tid) {
-            table = i;
-            break;
-          }
-        }
-        timeline.Start();
-        fleet_ptr_->PushSparseVarsWithLabelAsync(
-            *thread_scope_, tid, features_[tid], feature_labels_[tid],
-            sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
-            &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-            dump_slot_);
-        timeline.Pause();
-        push_sparse_time += timeline.ElapsedSec();
-        total_time += timeline.ElapsedSec();
-      }
-    }
-
-    if (need_to_push_dense_) {
-      timeline.Start();
-      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
-           ++i) {
-        uint64_t tid = static_cast<uint64_t>(
-            param_.program_config(0).push_dense_table_id(i));
-        fleet_ptr_->PushDenseVarsAsync(
-            *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_,
-            scale_datanorm_, cur_batch);
-      }
-      timeline.Pause();
-      push_dense_time += timeline.ElapsedSec();
-      total_time += timeline.ElapsedSec();
-      VLOG(3) << "push sparse and dense gradient done.";
-      int32_t tmp_push_dense_wait_times = -1;
-      static uint32_t push_dense_wait_times =
-          static_cast<uint32_t>(tmp_push_dense_wait_times);
-      if (push_dense_status_.size() >= push_dense_wait_times) {
-        for (auto& t : push_dense_status_) {
-          t.wait();
-        }
-        push_dense_status_.resize(0);
-      }
-
-      if (tmp_push_dense_wait_times == -1) {
-        push_dense_status_.resize(0);
-      }
-    }
-
-    if (need_to_push_sparse_) {
-      int32_t tmp_push_sparse_wait_times = -1;
-      static uint32_t push_sparse_wait_times =
-          static_cast<uint32_t>(tmp_push_sparse_wait_times);
-      if (push_sparse_status_.size() >= push_sparse_wait_times) {
-        for (auto& t : push_sparse_status_) {
-          t.wait();
-        }
-        push_sparse_status_.resize(0);
-      }
-
-      if (tmp_push_sparse_wait_times == -1) {
-        push_sparse_status_.resize(0);
-      }
-
-      VLOG(3) << "going to increase thread version";
-      VLOG(3) << "push dense table id size: "
-              << param_.program_config(0).push_dense_table_id_size();
-    }
-
-    if (need_to_push_dense_) {
-      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
-           ++i) {
-        uint64_t tid = static_cast<uint64_t>(
-            param_.program_config(0).push_dense_table_id(i));
-        pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
-      }
-    }
-
-    PrintFetchVars();
-    thread_scope_->DropKids();
-    total_inst += cur_batch;
-    ++batch_cnt;
-
-    if (thread_id_ == 0) {
-      // should be configured here
-      if (batch_cnt > 0 && batch_cnt % 100 == 0) {
-        double op_sum_time = 0;
-        std::unordered_map<std::string, double> op_to_time;
-        for (size_t i = 0; i < op_total_time.size(); ++i) {
-          fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
-                  op_name[i].c_str(), op_total_time[i] / batch_cnt);
-          if (op_to_time.find(op_name[i]) == op_to_time.end()) {
-            op_to_time[op_name[i]] = 0.0;
-          }
-          op_to_time[op_name[i]] += op_total_time[i];
-          op_sum_time += op_total_time[i];
-        }
-        for (auto& i : op_to_time) {
-          fprintf(stderr, "op [%s] run total time: [%f]ms\n", i.first.c_str(),
-                  i.second / batch_cnt);
-        }
-        fprintf(stderr, "op run total time: %fs\n", op_sum_time / batch_cnt);
-        fprintf(stderr, "train total time: %fs\n", total_time / batch_cnt);
-        fprintf(stderr, "pull sparse time: %fs\n",
-                pull_sparse_time / batch_cnt);
-        fprintf(stderr, "fill sparse time: %fs\n",
-                fill_sparse_time / batch_cnt);
-        fprintf(stderr, "push sparse time: %fs\n",
-                push_sparse_time / batch_cnt);
-        fprintf(stderr, "push dense time: %fs\n", push_dense_time / batch_cnt);
-        fprintf(stderr, "collect label time: %fs\n",
-                collect_label_time / batch_cnt);
-        fprintf(stderr, "adjust ins weight time: %fs\n",
-                adjust_ins_weight_time / batch_cnt);
-        fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
-        fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100);
-        fprintf(stderr, "op run percent: %f\n", op_sum_time / total_time * 100);
-        fprintf(stderr, "pull sparse time percent: %f\n",
-                pull_sparse_time / total_time * 100);
-        fprintf(stderr, "adjust ins weight time percent: %f\n",
-                adjust_ins_weight_time / total_time * 100);
-        fprintf(stderr, "collect label time percent: %f\n",
-                collect_label_time / total_time * 100);
-        fprintf(stderr, "fill sparse time percent: %f\n",
-                fill_sparse_time / total_time * 100);
-        fprintf(stderr, "push sparse time percent: %f\n",
-                push_sparse_time / total_time * 100);
-        fprintf(stderr, "push dense time percent: %f\n",
-                push_dense_time / total_time * 100);
-        fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time);
-      }
-    }
-    timeline.Start();
-  }
-}
-
-void DownpourWorker::TrainFiles() {
-  VLOG(3) << "Begin to train files";
-  platform::SetNumThreads(1);
-  device_reader_->Start();
-  int batch_cnt = 0;
-  int cur_batch;
-  while ((cur_batch = device_reader_->Next()) > 0) {
-    // pull sparse here
-    for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
-         ++i) {
-      uint64_t tid = static_cast<uint64_t>(
-          param_.program_config(0).pull_sparse_table_id(i));
-      TableParameter table;
-      for (auto j : param_.sparse_table()) {
-        if (j.table_id() == tid) {
-          table = j;
-          break;
-        }
-      }
-      fleet_ptr_->PullSparseVarsSync(*thread_scope_, tid,
-                                     sparse_key_names_[tid], &features_[tid],
-                                     &feature_values_[tid], table.fea_dim());
-      CollectLabelInfo(i);
-      FillSparseValue(i);
-      auto nid_iter = std::find(sparse_value_names_[tid].begin(),
-                                sparse_value_names_[tid].end(),
-                                adjust_ins_weight_config_.nid_slot());
-      if (nid_iter != sparse_value_names_[tid].end()) {
-        AdjustInsWeight();
-      }
-    }
-    VLOG(3) << "fill sparse value for all sparse table done.";
-
-    // do computation here
-    for (auto& op : ops_) {
-      bool need_skip = false;
-      for (auto t = 0u; t < skip_ops_.size(); ++t) {
-        if (op->Type().find(skip_ops_[t]) != std::string::npos) {
-          need_skip = true;
-          break;
-        }
-      }
-      if (!need_skip) {
-        op->Run(*thread_scope_, place_);
-      }
-    }
-
-    if (need_to_push_sparse_) {
-      // push gradients here
-      for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
-           ++i) {
-        uint64_t tid = static_cast<uint64_t>(
-            param_.program_config(0).push_sparse_table_id(i));
-        TableParameter table;
-        for (auto i : param_.sparse_table()) {
-          if (i.table_id() == tid) {
-            table = i;
-            break;
-          }
-        }
-        fleet_ptr_->PushSparseVarsWithLabelAsync(
-            *thread_scope_, tid, features_[tid], feature_labels_[tid],
-            sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
-            &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-            dump_slot_);
-      }
-    }
-
-    if (need_to_push_dense_) {
-      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
-           ++i) {
-        uint64_t tid = static_cast<uint64_t>(
-            param_.program_config(0).push_dense_table_id(i));
-        fleet_ptr_->PushDenseVarsAsync(
-            *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_,
-            scale_datanorm_, cur_batch);
-      }
-      VLOG(3) << "push dense gradient done.";
-
-      // the following code should be more precise and clean
-      // TODO(guru4elephant)
-      int32_t tmp_push_dense_wait_times = -1;
-      static uint32_t push_dense_wait_times =
-          static_cast<uint32_t>(tmp_push_dense_wait_times);
-
-      if (push_dense_status_.size() >= push_dense_wait_times) {
-        for (auto& t : push_dense_status_) {
-          t.wait();
-        }
-        push_dense_status_.resize(0);
-      }
-
-      if (tmp_push_dense_wait_times == -1) {
-        push_dense_status_.resize(0);
-      }
-    }
-
-    if (need_to_push_sparse_) {
-      VLOG(3) << "push sparse gradient done.";
-      int32_t tmp_push_sparse_wait_times = -1;
-      static uint32_t push_sparse_wait_times =
-          static_cast<uint32_t>(tmp_push_sparse_wait_times);
-      if (push_sparse_status_.size() >= push_sparse_wait_times) {
-        for (auto& t : push_sparse_status_) {
-          t.wait();
-        }
-        push_sparse_status_.resize(0);
-      }
-
-      if (tmp_push_sparse_wait_times == -1) {
-        push_sparse_status_.resize(0);
-      }
-    }
-
-    if (need_to_push_dense_) {
-      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
-           ++i) {
-        uint64_t tid = static_cast<uint64_t>(
-            param_.program_config(0).push_dense_table_id(i));
-        pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
-      }
-    }
-    if (need_dump_field_) {
-      int batch_size = device_reader_->GetCurBatchSize();
-      std::vector<std::string> ars(batch_size);
-      for (auto& ar : ars) {
-        ar.clear();
-      }
-      auto& ins_id_vec = device_reader_->GetInsIdVec();
-      auto& ins_content_vec = device_reader_->GetInsContentVec();
-      for (size_t i = 0; i < ins_id_vec.size(); i++) {
-        ars[i] += ins_id_vec[i];
-        ars[i] = ars[i] + "\t" + ins_content_vec[i];
-      }
-      for (auto& field : dump_fields_) {
-        Variable* var = thread_scope_->FindVar(field);
-        if (var == nullptr) {
-          continue;
-        }
-        LoDTensor* tensor = var->GetMutable<LoDTensor>();
-        if (!CheckValidOutput(tensor, batch_size)) {
-          continue;
-        }
-        for (int i = 0; i < batch_size; ++i) {
-          auto output_dim = tensor->dims()[1];
-          std::string output_dimstr =
-              boost::lexical_cast<std::string>(output_dim);
-          ars[i] = ars[i] + "\t" + field + ":" + output_dimstr;
-          auto bound = GetTensorBound(tensor, i);
-          ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
-        }
-      }
-      // #pragma omp parallel for
-      for (size_t i = 0; i < ars.size(); i++) {
-        if (ars[i].length() == 0) {
-          continue;
-        }
-        writer_ << ars[i];
-      }
-    }
-
-    PrintFetchVars();
-    thread_scope_->DropKids();
-    ++batch_cnt;
-  }
-  if (need_dump_field_) {
-    writer_.Flush();
-  }
-}
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
deleted file mode 100644
index 5bafa4345f42a1f6209b5ee31ae6ba2ded6a899c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/eigen.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/tensor.h"
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace paddle {
-namespace framework {
-
-// EigenDim converts paddle::platform::DDim into Eigen::DSizes.
-template <int D>
-struct EigenDim {
-  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
-
-  static Type From(const DDim& dims) {
-    PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)");
-    Type ret;
-    for (int64_t d = 0; d < arity(dims); d++) {
-      ret[d] = dims[d];
-    }
-    return ret;
-  }
-};
-
-// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-struct EigenTensor {
-  // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on
-  // the speed of aligned and unaligned version in future.
-  using Type = Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, IndexType>>;
-
-  using ConstType =
-      Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
-
-  static Type From(Tensor& tensor, DDim dims) {  // NOLINT
-    return Type(tensor.data<T>(), EigenDim<D>::From(dims));
-  }
-
-  static Type From(Tensor& tensor) {  // NOLINT
-    return From(tensor, tensor.dims_);
-  }  // NOLINT
-
-  static ConstType From(const Tensor& tensor, DDim dims) {
-    return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
-  }
-
-  static ConstType From(const Tensor& tensor) {
-    return From(tensor, tensor.dims_);
-  }
-};
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
-  static typename EigenMatrix::Type Reshape(Tensor& tensor,  // NOLINT
-                                            int num_col_dims) {
-    int rank = tensor.dims_.size();
-    PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
-                   "`num_col_dims` must be between (0, rank_of_tensor).");
-    return EigenMatrix::From(tensor,
-                             flatten_to_2d(tensor.dims(), num_col_dims));
-  }
-
-  static typename EigenMatrix::ConstType Reshape(const Tensor& tensor,
-                                                 int num_col_dims) {
-    int rank = tensor.dims_.size();
-    PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
-                   "`num_col_dims` must be between (0, rank_of_tensor).");
-    return EigenMatrix::From(tensor,
-                             flatten_to_2d(tensor.dims(), num_col_dims));
-  }
-};
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
-  // Flatten reshapes a Tensor into an EigenVector.
-  static typename EigenVector::Type Flatten(Tensor& tensor) {  // NOLINT
-    return EigenVector::From(tensor, {product(tensor.dims_)});
-  }
-
-  static typename EigenVector::ConstType Flatten(
-      const Tensor& tensor) {  // NOLINT
-    return EigenVector::From(tensor, {product(tensor.dims_)});
-  }
-};
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-struct EigenScalar {
-  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
-  using Type = Eigen::TensorMap<
-      Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
-  using ConstType = Eigen::TensorMap<
-      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
-
-  static Type From(Tensor& tensor) { return Type(tensor.data<T>()); }  // NOLINT
-
-  static ConstType From(const Tensor& tensor) {
-    return ConstType(tensor.data<T>());
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc
deleted file mode 100644
index bdc526d86f8fb3bc19e9252c38d63465e1805078..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/eigen_test.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/eigen.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace framework {
-
-TEST(EigenDim, From) {
-  EigenDim<3>::Type ed = EigenDim<3>::From(make_ddim({1, 2, 3}));
-  ASSERT_EQ(1, ed[0]);
-  ASSERT_EQ(2, ed[1]);
-  ASSERT_EQ(3, ed[2]);
-}
-
-TEST(Eigen, Tensor) {
-  Tensor t;
-  float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
-  for (int i = 0; i < 1 * 2 * 3; i++) {
-    p[i] = static_cast<float>(i);
-  }
-
-  EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
-
-  ASSERT_EQ(1, et.dimension(0));
-  ASSERT_EQ(2, et.dimension(1));
-  ASSERT_EQ(3, et.dimension(2));
-
-  for (int i = 0; i < 1; i++) {
-    for (int j = 0; j < 2; j++) {
-      for (int k = 0; k < 3; k++) {
-        ASSERT_NEAR((i * 2 + j) * 3 + k, et(i, j, k), 1e-6f);
-      }
-    }
-  }
-}
-
-TEST(Eigen, ScalarFrom) {
-  Tensor t;
-  int* p = t.mutable_data<int>(make_ddim({1}), platform::CPUPlace());
-  *p = static_cast<int>(100);
-
-  EigenScalar<int>::Type es = EigenScalar<int>::From(t);
-
-  ASSERT_EQ(0, es.dimension(0));
-  ASSERT_EQ(100, es(0));
-}
-
-TEST(Eigen, VectorFrom) {
-  Tensor t;
-  float* p = t.mutable_data<float>(make_ddim({6}), platform::CPUPlace());
-  for (int i = 0; i < 6; i++) {
-    p[i] = static_cast<float>(i);
-  }
-
-  EigenVector<float>::Type ev = EigenVector<float>::From(t);
-
-  ASSERT_EQ(6, ev.dimension(0));
-
-  for (int i = 0; i < 6; i++) {
-    ASSERT_NEAR(i, ev(i), 1e-6f);
-  }
-}
-
-TEST(Eigen, VectorFlatten) {
-  Tensor t;
-  float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
-  for (int i = 0; i < 1 * 2 * 3; i++) {
-    p[i] = static_cast<float>(i);
-  }
-
-  EigenVector<float>::Type ev = EigenVector<float>::Flatten(t);
-
-  ASSERT_EQ(1 * 2 * 3, ev.dimension(0));
-
-  for (int i = 0; i < 1 * 2 * 3; i++) {
-    ASSERT_NEAR(i, ev(i), 1e-6f);
-  }
-}
-
-TEST(Eigen, Matrix) {
-  Tensor t;
-  float* p = t.mutable_data<float>(make_ddim({2, 3}), platform::CPUPlace());
-  for (int i = 0; i < 2 * 3; i++) {
-    p[i] = static_cast<float>(i);
-  }
-
-  EigenMatrix<float>::Type em = EigenMatrix<float>::From(t);
-
-  ASSERT_EQ(2, em.dimension(0));
-  ASSERT_EQ(3, em.dimension(1));
-
-  for (int i = 0; i < 2; i++) {
-    for (int j = 0; j < 3; j++) {
-      ASSERT_NEAR(i * 3 + j, em(i, j), 1e-6f);
-    }
-  }
-}
-
-TEST(Eigen, MatrixReshape) {
-  Tensor t;
-  float* p = t.mutable_data<float>({2, 3, 6, 4}, platform::CPUPlace());
-  for (int i = 0; i < 2 * 3 * 6 * 4; ++i) {
-    p[i] = static_cast<float>(i);
-  }
-
-  EigenMatrix<float>::Type em = EigenMatrix<float>::Reshape(t, 2);
-
-  ASSERT_EQ(2 * 3, em.dimension(0));
-  ASSERT_EQ(6 * 4, em.dimension(1));
-
-  for (int i = 0; i < 2 * 3; i++) {
-    for (int j = 0; j < 6 * 4; j++) {
-      ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f);
-    }
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
deleted file mode 100644
index df9b53d6a4045489e6f402fdca91ec0d758af0ea..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/executor.cc
+++ /dev/null
@@ -1,511 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/executor.h"
-#include <deque>
-#include <memory>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/message.h"
-#include "google/protobuf/text_format.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/trainer_desc.pb.h"
-#include "paddle/fluid/framework/trainer_factory.h"
-#include "paddle/fluid/framework/transfer_scope_cache.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
-#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
-#include "paddle/fluid/operators/controlflow/while_op_helper.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-
-#ifdef PADDLE_WITH_NGRAPH
-#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
-#endif
-
-DECLARE_bool(benchmark);
-DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
-DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
-
-namespace paddle {
-namespace framework {
-namespace {
-// block id starts from 0. This id is used to represent the codeblock
-// wrapping the first block 0.
-int kProgramId = -1;
-}  // namespace
-
-ExecutorPrepareContext::ExecutorPrepareContext(
-    const framework::ProgramDesc& prog, size_t block_id)
-    : prog_(prog), block_id_(block_id) {}
-
-void ExecutorPrepareContext::PrepareUnusedVars(
-    const std::vector<std::string>& keep_vars, bool force_disable_gc) {
-#ifdef PADDLE_WITH_NGRAPH
-  if (FLAGS_use_ngraph) {
-    // FIXME(zjl): There is difference when ngraph and gc are both enabled
-    // in unittests. I do not know why it happens. Maybe ngraph engine
-    // would cache some variables?
-    LOG_FIRST_N(WARNING, 1)
-        << "FLAGS_use_ngraph=True, garbage collection strategy is "
-           "disabled in Executor";
-    force_disable_gc = true;
-  }
-#endif
-  force_disable_gc_ = force_disable_gc;
-  if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) {
-    return;
-  }
-
-  // If gc is enabled and block size > 1
-  if (prog_.Size() > 1) {
-    operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-        prog_, block_id_, ops_);
-    operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(prog_, block_id_,
-                                                               ops_);
-    operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
-        prog_, block_id_, ops_);
-  }
-  unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars);
-}
-
-ExecutorPrepareContext::~ExecutorPrepareContext() {
-  VLOG(5) << "destroy ExecutorPrepareContext";
-}
-
-Executor::Executor(const platform::Place& place) : place_(place) {}
-
-void Executor::Close() {
-#ifdef PADDLE_WITH_DISTRIBUTE
-  // TODO(typhoonzero): complete message will need to use real trainer_id,
-  // except 0.
-  auto client =
-      paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
-  client->SendComplete();
-#endif
-}
-
-void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
-                               int block_id) {
-  auto& global_block = pdesc.Block(block_id);
-
-  const Scope* ancestor_scope = scope;
-  while (ancestor_scope->parent()) {
-    ancestor_scope = ancestor_scope->parent();
-  }
-
-  if (ancestor_scope != scope) {
-    for (auto& var : global_block.AllVars()) {
-      if (var->Name() == framework::kEmptyVarName) {
-        continue;
-      }
-
-      if (var->Persistable()) {
-        auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
-      } else {
-        auto* ptr = scope->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
-      }
-    }
-  } else {
-    for (auto& var : global_block.AllVars()) {
-      auto* ptr = scope->Var(var->Name());
-      InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
-    }
-  }
-}
-
-void Executor::RunFromDataset(const ProgramDesc& main_program, Scope* scope,
-                              Dataset* dataset,
-                              const std::string& trainer_desc_str) {
-  VLOG(3) << "Start to RunFromDataset in executor";
-  TrainerDesc trainer_desc;
-  bool success = trainer_desc.ParseFromString(trainer_desc_str);
-  PADDLE_ENFORCE(success, "Fail to parse TrainerDesc from string:\n%s",
-                 trainer_desc_str.c_str());
-  VLOG(3) << "Going to create trainer, trainer class is "
-          << trainer_desc.class_name();
-  std::shared_ptr<TrainerBase> trainer;
-  trainer = TrainerFactory::CreateTrainer(trainer_desc.class_name());
-  // initialize trainer
-  VLOG(3) << "Going to initialize trainer";
-  trainer->Initialize(trainer_desc, dataset);
-  VLOG(3) << "Set root scope here";
-  trainer->SetScope(scope);
-  // prepare training environment and helper environment
-  VLOG(3) << "Try to init train environment";
-  trainer->InitTrainerEnv(main_program, place_);
-  VLOG(3) << "Try to init other environment";
-  trainer->InitOtherEnv(main_program);
-  // training and finalize training
-  VLOG(3) << "Trainer starts to run";
-  trainer->Run();
-  VLOG(3) << "Trainer going to finalize";
-  trainer->Finalize();
-  return;
-}
-
-void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
-                   bool create_local_scope, bool create_vars,
-                   const std::vector<std::string>& skip_ref_cnt_vars,
-                   bool force_disable_gc) {
-  platform::RecordBlock b(block_id);
-  if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
-  auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
-  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
-}
-
-// Check whether the block already has feed operators and feed_holder.
-// Return false if the block does not have any feed operators.
-// If some feed operators have been prepended to the block, check that
-// the info contained in these feed operators matches the feed_targets
-// and feed_holder_name. Raise exception when any mismatch is found.
-// Return true if the block has feed operators and holder of matching info.
-static bool has_feed_operators(
-    const BlockDesc& block,
-    const std::map<std::string, const LoDTensor*>& feed_targets,
-    const std::string& feed_holder_name) {
-  size_t feed_count = 0;
-  for (auto* op : block.AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      feed_count++;
-      // The input variable's name of feed_op should be feed_holder_name.
-      PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
-                        "Input to feed op should be '%s'", feed_holder_name);
-      std::string feed_target_name = op->Output("Out")[0];
-      PADDLE_ENFORCE(
-          feed_targets.find(feed_target_name) != feed_targets.end(),
-          "Feed operator output name '%s' cannot be found in 'feed_targets'",
-          feed_target_name);
-    }
-  }
-
-  if (feed_count > 0) {
-    PADDLE_ENFORCE_EQ(
-        feed_count, feed_targets.size(),
-        "The number of feed operators should match 'feed_targets'");
-
-    if (!feed_holder_name.empty()) {
-      // When feed operator are present, so should be feed_holder.
-      auto var = block.FindVar(feed_holder_name);
-      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                              feed_holder_name);
-      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
-                        "'%s' variable should be 'FEED_MINIBATCH' type",
-                        feed_holder_name);
-    }
-  }
-
-  return feed_count > 0;
-}
-
-// Check whether the block already has fetch operators and fetch_holder.
-// Return false if the block does not have any fetch operators.
-// If some fetch operators have been appended to the block, check that
-// the info contained in these fetch operators matches the fetch_targets
-// and fetch_holder_name. Raise exception when any mismatch is found.
-// Return true if the block has fetch operators and holder of matching info.
-static bool has_fetch_operators(
-    const BlockDesc& block,
-    const std::map<std::string, LoDTensor*>& fetch_targets,
-    const std::string& fetch_holder_name) {
-  size_t fetch_count = 0;
-  for (auto* op : block.AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      fetch_count++;
-      // The output variable's name of fetch_op should be fetch_holder_name.
-      PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
-                        "Output of fetch op should be '%s'", fetch_holder_name);
-      std::string fetch_target_name = op->Input("X")[0];
-      PADDLE_ENFORCE(
-          fetch_targets.find(fetch_target_name) != fetch_targets.end(),
-          "Fetch operator input name '%s' cannot be found in 'fetch_targets'",
-          fetch_target_name);
-    }
-  }
-
-  if (fetch_count > 0) {
-    PADDLE_ENFORCE_EQ(
-        fetch_count, fetch_targets.size(),
-        "The number of fetch operators should match 'fetch_targets'");
-
-    if (!fetch_holder_name.empty()) {
-      // When fetch operator are present, so should be fetch_holder.
-      auto var = block.FindVar(fetch_holder_name);
-      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                              fetch_holder_name);
-      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
-                        "'%s' variable should be 'FETCH_LIST' type",
-                        fetch_holder_name);
-    }
-  }
-
-  return fetch_count > 0;
-}
-
-std::unique_ptr<ExecutorPrepareContext> Executor::PrepareCtxCache(
-    const ProgramDesc& program, int block_id,
-    const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
-  return Prepare(program, block_id, skip_ref_cnt_vars, force_disable_gc);
-}
-
-void Executor::Run(const ProgramDesc& program, Scope* scope,
-                   std::map<std::string, const LoDTensor*>* feed_targets,
-                   std::map<std::string, LoDTensor*>* fetch_targets,
-                   bool create_local_scope, bool create_vars,
-                   const std::string& feed_holder_name,
-                   const std::string& fetch_holder_name) {
-  platform::RecordBlock b(kProgramId);
-  if (FLAGS_use_mkldnn) EnableMKLDNN(program);
-  bool has_feed_ops =
-      has_feed_operators(program.Block(0), *feed_targets, feed_holder_name);
-  bool has_fetch_ops =
-      has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name);
-
-  ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
-  std::unique_ptr<ProgramDesc> unique_ptr_of_copy_program;
-  if (!has_feed_ops || !has_fetch_ops) {
-    unique_ptr_of_copy_program.reset(new ProgramDesc(program));
-    copy_program = unique_ptr_of_copy_program.get();
-  }
-  auto* global_block = copy_program->MutableBlock(0);
-
-  if (!has_feed_ops) {
-    // create feed_holder variable
-    auto* feed_holder = global_block->Var(feed_holder_name);
-    feed_holder->SetType(proto::VarType::FEED_MINIBATCH);
-    feed_holder->SetPersistable(true);
-
-    int i = 0;
-    for (auto& feed_target : (*feed_targets)) {
-      std::string var_name = feed_target.first;
-      VLOG(3) << "feed target's name: " << var_name;
-
-      // prepend feed op
-      auto* op = global_block->PrependOp();
-      op->SetType(kFeedOpType);
-      op->SetInput("X", {feed_holder_name});
-      op->SetOutput("Out", {var_name});
-      op->SetAttr("col", {static_cast<int>(i)});
-      op->CheckAttrs();
-
-      i++;
-    }
-  }
-
-  if (!has_fetch_ops) {
-    // create fetch_holder variable
-    auto* fetch_holder = global_block->Var(fetch_holder_name);
-    fetch_holder->SetType(proto::VarType::FETCH_LIST);
-    fetch_holder->SetPersistable(true);
-
-    int i = 0;
-    for (auto& fetch_target : (*fetch_targets)) {
-      std::string var_name = fetch_target.first;
-      VLOG(3) << "fetch target's name: " << var_name;
-
-      // append fetch op
-      auto* op = global_block->AppendOp();
-      op->SetType(kFetchOpType);
-      op->SetInput("X", {var_name});
-      op->SetOutput("Out", {fetch_holder_name});
-      op->SetAttr("col", {static_cast<int>(i)});
-      op->CheckAttrs();
-
-      i++;
-    }
-  }
-
-  auto ctx = Prepare(*copy_program, 0);
-  RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets,
-                     create_local_scope, create_vars, feed_holder_name,
-                     fetch_holder_name);
-}
-
-std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
-    const ProgramDesc& program, int block_id,
-    const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
-  std::unique_ptr<ExecutorPrepareContext> ctx(
-      new ExecutorPrepareContext(program, block_id));
-  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
-  auto& block = program.Block(block_id);
-  for (auto& op_desc : block.AllOps()) {
-    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-  }
-#ifdef PADDLE_WITH_NGRAPH
-  if (FLAGS_use_ngraph && ctx->block_id_ == 0) {
-    paddle::operators::NgraphEngine::FuseNgraphOps(
-        ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
-  }
-#endif
-  ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc);
-  return ctx;
-}
-
-std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
-    const ProgramDesc& program, const std::vector<int>& block_ids,
-    const std::vector<std::vector<std::string>>& skip_ref_cnt_vars,
-    bool force_disable_gc) {
-  PADDLE_ENFORCE(
-      skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
-      "skip_ref_cnt_vars should be either empty or equals to block number %d",
-      block_ids.size());
-  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
-  size_t idx = 0;
-  for (auto& bid : block_ids) {
-    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
-    auto* ctx = new ExecutorPrepareContext(program, bid);
-    auto& block = program.Block(bid);
-    for (auto& op_desc : block.AllOps()) {
-      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-    }
-    if (skip_ref_cnt_vars.empty()) {
-      ctx->PrepareUnusedVars(std::vector<std::string>(), force_disable_gc);
-    } else {
-      ctx->PrepareUnusedVars(skip_ref_cnt_vars[idx], force_disable_gc);
-    }
-    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
-    ++idx;
-  }
-  return result;
-}
-
-void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
-                                  bool create_local_scope, bool create_vars,
-                                  bool keep_kids) {
-  platform::RecordBlock b(kProgramId);
-  PADDLE_ENFORCE_NOT_NULL(scope);
-  Scope* local_scope = scope;
-  if (create_vars) {
-    if (create_local_scope) {
-      local_scope = &scope->NewScope();
-    }
-    CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
-  }
-
-  int64_t max_memory_size = GetEagerDeletionThreshold();
-  std::unique_ptr<GarbageCollector> gc;
-  if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
-#ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(place_)) {
-      if (IsFastEagerDeletionModeEnabled()) {
-        gc.reset(new UnsafeFastGPUGarbageCollector(
-            boost::get<platform::CUDAPlace>(place_), max_memory_size));
-      } else {
-        gc.reset(new DefaultStreamGarbageCollector(
-            boost::get<platform::CUDAPlace>(place_), max_memory_size));
-      }
-    } else if (platform::is_cpu_place(place_)) {
-#endif
-      gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place_),
-                                       max_memory_size));
-#ifdef PADDLE_WITH_CUDA
-    }
-#endif
-  }
-
-  for (auto& op : ctx->ops_) {
-    op->Run(*local_scope, place_);
-    if (gc) {
-      DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get());
-    }
-  }
-
-  platform::DeviceContextPool::Instance().Get(place_)->Wait();
-
-  if (local_scope != scope) {
-    scope->DeleteScope(local_scope);
-  } else {
-    if (!keep_kids) {
-      // By default, we should delete all kid scopes after run executor because
-      // some operators may create local scope when running, such as while_op.
-      // But when while_op also create a local executor to run it's sub block,
-      // the sub scopes it created should not be dropped immediately, because
-      // while_grad_op will use some variables created during while_op run, so
-      // we need to keep the kids and wait for the outer executor to drop them.
-      scope->DropKids();
-    }
-  }
-}
-
-void Executor::RunPreparedContext(
-    ExecutorPrepareContext* ctx, Scope* scope,
-    std::map<std::string, const LoDTensor*>* feed_targets,
-    std::map<std::string, LoDTensor*>* fetch_targets, bool create_local_scope,
-    bool create_vars, const std::string& feed_holder_name,
-    const std::string& fetch_holder_name) {
-  auto& global_block = ctx->prog_.Block(ctx->block_id_);
-
-  PADDLE_ENFORCE(
-      has_feed_operators(global_block, *feed_targets, feed_holder_name),
-      "Program in ExecutorPrepareContext should has feed_ops.");
-  PADDLE_ENFORCE(
-      has_fetch_operators(global_block, *fetch_targets, fetch_holder_name),
-      "Program in the prepared context should has fetch_ops.");
-
-  // map the data of feed_targets to feed_holder
-  for (auto* op : global_block.AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      std::string feed_target_name = op->Output("Out")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      SetFeedVariable(scope, *(*feed_targets)[feed_target_name],
-                      feed_holder_name, idx);
-    }
-  }
-
-  RunPreparedContext(ctx, scope, create_local_scope, create_vars);
-
-  // obtain the data of fetch_targets from fetch_holder
-  for (auto* op : global_block.AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      std::string fetch_target_name = op->Input("X")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      *(*fetch_targets)[fetch_target_name] =
-          GetFetchVariable(*scope, fetch_holder_name, idx);
-    }
-  }
-}
-
-void Executor::EnableMKLDNN(const ProgramDesc& program) {
-#ifdef PADDLE_WITH_MKLDNN
-  VLOG(3) << "use_mkldnn=True";
-  for (size_t bid = 0; bid < program.Size(); ++bid) {
-    auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
-    for (auto* op : block->AllOps()) {
-      if (op->HasAttr("use_mkldnn")) {
-        op->SetAttr("use_mkldnn", true);
-      }
-    }
-  }
-#else
-  LOG(WARNING)
-      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
-#endif
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
deleted file mode 100644
index a6db5c8d4136f726106f0ce4debd145d6d14fb45..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/executor.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/data_set.h"
-#include "paddle/fluid/framework/executor_gc_helper.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-
-struct ExecutorPrepareContext {
-  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
-
-  ~ExecutorPrepareContext();
-
-  void PrepareUnusedVars(const std::vector<std::string>& keep_vars,
-                         bool force_disable_gc = false);
-
-  const framework::ProgramDesc& prog_;
-  const size_t block_id_;
-
-  std::vector<std::unique_ptr<OperatorBase>> ops_;
-
-  std::unordered_map<const OperatorBase*, std::vector<std::string>>
-      unused_vars_;
-  bool force_disable_gc_{false};
-};
-
-class Executor {
- public:
-  // TODO(dzhwinter) : Do not rely on this function, it will be removed
-  explicit Executor(const platform::DeviceContext& device)
-      : Executor(device.GetPlace()) {}
-
-  explicit Executor(const platform::Place& place);
-
-  /*
-   * Close this Executor.
-   * Calling this method will send complete messages to all pserver instances.
-   */
-  void Close();
-
-  /* @Brief
-   * Runtime evaluation of the given ProgramDesc under certain Scope
-   *
-   * @param
-   *  ProgramDesc
-   *  Scope
-   */
-  void Run(const ProgramDesc& prog, Scope* scope, int block_id,
-           bool create_local_scope = true, bool create_vars = true,
-           const std::vector<std::string>& skip_ref_cnt_vars =
-               std::vector<std::string>(),
-           bool force_disable_gc = false);
-
-  // This API is very slow.
-  void Run(const ProgramDesc& program, Scope* scope,
-           std::map<std::string, const LoDTensor*>* feed_targets,
-           std::map<std::string, LoDTensor*>* fetch_targets,
-           bool create_local_scope = true, bool create_vars = true,
-           const std::string& feed_holder_name = "feed",
-           const std::string& fetch_holder_name = "fetch");
-
-  // This API is very slow.
-  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
-                          std::map<std::string, const LoDTensor*>* feed_targets,
-                          std::map<std::string, LoDTensor*>* fetch_targets,
-                          bool create_local_scope = true,
-                          bool create_vars = true,
-                          const std::string& feed_holder_name = "feed",
-                          const std::string& fetch_holder_name = "fetch");
-
-  std::unique_ptr<ExecutorPrepareContext> PrepareCtxCache(
-      const ProgramDesc& program, int block_id,
-      const std::vector<std::string>& skip_ref_cnt_vars =
-          std::vector<std::string>(),
-      bool force_disable_gc = false);
-
-  static std::unique_ptr<ExecutorPrepareContext> Prepare(
-      const ProgramDesc& program, int block_id,
-      const std::vector<std::string>& skip_ref_cnt_vars =
-          std::vector<std::string>(),
-      bool force_disable_gc = false);
-
-  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
-      const ProgramDesc& program, const std::vector<int>& block_ids,
-      const std::vector<std::vector<std::string>>& skip_ref_cnt_vars =
-          std::vector<std::vector<std::string>>(),
-      bool force_disable_gc = false);
-
-  void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
-
-  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
-                          bool create_local_scope = true,
-                          bool create_vars = true, bool keep_kids = false);
-
-  void EnableMKLDNN(const ProgramDesc& program);
-
-  void RunFromDataset(const ProgramDesc& main_program, Scope* scope,
-                      Dataset* dataset, const std::string& trainer_desc_str);
-
- private:
-  const platform::Place place_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc
deleted file mode 100644
index 1712d66cf4c99f0c01bf2ba2431bf41f457390db..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/executor_gc_helper.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/executor_gc_helper.h"
-#include <deque>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-struct OpInOutInfo {
- public:
-  void Build(const OperatorBase *op) {
-    is_built_ = true;
-    auto &inferer = op->Info().NoNeedBufferVarsInferer();
-    if (inferer) {
-      no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs());
-
-      if (no_need_buffer_ins_.empty()) return;
-
-      for (auto &in_name_pair : op->Inputs()) {
-        if (no_need_buffer_ins_.count(in_name_pair.first) != 0) {
-          continue;
-        }
-
-        for (auto &in_arg_name : in_name_pair.second) {
-          other_args_set_.insert(in_arg_name);
-        }
-      }
-
-      for (auto &out_name_pair : op->Outputs()) {
-        for (auto &out_arg_name : out_name_pair.second) {
-          other_args_set_.insert(out_arg_name);
-        }
-      }
-    }
-  }
-
-  bool IsBuilt() const { return is_built_; }
-
-  bool IsInArgBufferNeeded(const std::string &in_arg_name) const {
-    return no_need_buffer_ins_.empty() ||
-           other_args_set_.count(in_arg_name) != 0;
-  }
-
- private:
-  // A set to record unused buffer input vars of op
-  std::unordered_set<std::string> no_need_buffer_ins_;
-  // A set to record other args of op (including in, out)
-  std::unordered_set<std::string> other_args_set_;
-  bool is_built_{false};
-};
-
-static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block,
-                            const std::unordered_set<std::string> &skip_vars) {
-  if (skip_vars.count(name) != 0) {
-    return false;
-  }
-
-  auto *var_desc = block.FindVar(name);
-  if (var_desc == nullptr || var_desc->Persistable()) {
-    return false;
-  }
-
-  auto type = var_desc->Proto()->type().type();
-
-  return type == proto::VarType::LOD_TENSOR ||
-         type == proto::VarType::SELECTED_ROWS ||
-         type == proto::VarType::LOD_TENSOR_ARRAY;
-}
-
-std::unordered_map<const OperatorBase *, std::vector<std::string>>
-GetUnusedVars(const BlockDesc &block,
-              const std::vector<std::unique_ptr<OperatorBase>> &ops,
-              const std::vector<std::string> &skip_var_list) {
-  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
-                                            skip_var_list.end());
-
-  std::unordered_map<std::string, size_t> var_op_idx_map;
-
-  for (size_t i = 0; i < ops.size(); ++i) {
-    auto *op = ops[i].get();
-
-    OpInOutInfo info;
-    for (auto &name_pair : op->Inputs()) {
-      for (auto &name : name_pair.second) {
-        if (!VarCanBeDeleted(name, block, skip_vars)) {
-          continue;
-        }
-
-        // var can be gc-ed
-        if (!info.IsBuilt()) {
-          info.Build(op);
-        }
-
-        if (info.IsInArgBufferNeeded(name)) {
-          // Update the last living op of variable to current op
-          var_op_idx_map[name] = i;
-        } else {
-          VLOG(10) << "Skip reference count computing of variable "
-                   << name_pair.first << "(" << name << ") in Operator "
-                   << op->Type();
-        }
-      }
-    }
-
-    for (auto &name_pair : op->Outputs()) {
-      for (auto &name : name_pair.second) {
-        if (VarCanBeDeleted(name, block, skip_vars)) {
-          // Update the last living op of variable to current op
-          var_op_idx_map[name] = i;
-        }
-      }
-    }
-  }
-
-  std::unordered_map<const OperatorBase *, std::vector<std::string>> result;
-  for (auto &name_op_idx_pair : var_op_idx_map) {
-    auto &name = name_op_idx_pair.first;
-    size_t op_idx = name_op_idx_pair.second;
-    result[ops[op_idx].get()].emplace_back(name);
-  }
-  return result;
-}
-
-void DeleteUnusedTensors(
-    const Scope &scope, const OperatorBase *op,
-    const std::unordered_map<const OperatorBase *, std::vector<std::string>>
-        &delete_vars_map,
-    GarbageCollector *gc) {
-  auto iter = delete_vars_map.find(op);
-  if (iter == delete_vars_map.end()) {
-    return;
-  }
-
-  auto &delete_vars = iter->second;
-
-  std::deque<std::shared_ptr<memory::Allocation>> garbages;
-
-  for (auto &var_name : delete_vars) {
-    auto *var = scope.FindVar(var_name);
-    if (var == nullptr) {
-      continue;
-    }
-
-    VLOG(2) << "Erase variable " << var_name;
-    if (var->IsType<LoDTensor>()) {
-      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
-    } else if (var->IsType<SelectedRows>()) {
-      garbages.emplace_back(
-          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
-    } else if (var->IsType<LoDTensorArray>()) {
-      auto *lod_tensor_arr = var->GetMutable<LoDTensorArray>();
-      for (auto &t : *lod_tensor_arr) {
-        garbages.emplace_back(t.MoveMemoryHolder());
-      }
-    } else {
-      PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                   framework::ToTypeName(var->Type()), var_name);
-    }
-  }
-
-  if (!garbages.empty()) {
-    gc->Add(std::move(garbages));
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h
deleted file mode 100644
index a4c71c5304e05e3d0dca6ca08d955f39b779556b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/executor_gc_helper.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-
-// Result map: op -> variable names that can be deleted after op runs
-std::unordered_map<const OperatorBase *, std::vector<std::string>>
-GetUnusedVars(const BlockDesc &block,
-              const std::vector<std::unique_ptr<OperatorBase>> &ops,
-              const std::vector<std::string> &skip_vars);
-
-// Collect unused tensors after op runs
-void DeleteUnusedTensors(
-    const Scope &scope, const OperatorBase *op,
-    const std::unordered_map<const OperatorBase *, std::vector<std::string>>
-        &delete_vars_map,
-    GarbageCollector *gc);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
deleted file mode 100644
index 005d98c6e8fda92ff6c6b3412f89c75760bf0498..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ /dev/null
@@ -1,698 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/executor_thread_worker.h"
-#include <algorithm>
-#include <utility>
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/message.h"
-#include "google/protobuf/text_format.h"
-
-#include "gflags/gflags.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/timer.h"
-#include "paddle/fluid/pybind/pybind.h"
-namespace paddle {
-namespace framework {
-
-#ifdef PADDLE_WITH_PSLIB
-int DensePullThread::start() {
-  _running = true;
-  _t = std::thread(&DensePullThread::run, this);
-  return 0;
-}
-
-void DensePullThread::run() {
-  while (_running) {
-    _pull_dense_status.resize(0);
-    for (auto& t : _dense_variable_name) {
-      if (check_update_param(t.first)) {
-        auto status = pull_dense(t.first);
-        _pull_dense_status.emplace_back(std::move(status));
-        reset_thread_version(t.first);
-      }
-    }
-    if (_pull_dense_status.size() != 0) {
-      wait_all();
-    }
-
-    usleep(_sleep_time_ms * 1000);
-  }
-}
-bool DensePullThread::check_update_param(uint64_t table_id) {
-  {
-    std::lock_guard<std::mutex> lock(_mutex_for_version);
-    auto& version = _training_versions[table_id];
-    _current_version[table_id] =
-        *(std::min_element(version.begin(), version.end()));
-  }
-  if (_current_version[table_id] - _last_versions[table_id] < _threshold) {
-    return false;
-  }
-  return true;
-}
-
-void DensePullThread::reset_thread_version(uint64_t table_id) {
-  std::lock_guard<std::mutex> lock(_mutex_for_version);
-  _last_versions[table_id] = _current_version[table_id];
-}
-std::future<int32_t> DensePullThread::pull_dense(uint64_t table_id) {
-  auto& regions = _regions[table_id];
-  regions.clear();
-  auto& variables = _dense_variable_name[table_id];
-  regions.resize(variables.size());
-
-  for (auto i = 0u; i < variables.size(); ++i) {
-    auto& t = variables[i];
-    Variable* var = _root_scope->FindVar(t);
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-
-    float* w = tensor->data<float>();
-    paddle::ps::Region reg(w, tensor->numel());
-    regions[i] = std::move(reg);
-  }
-  return _ps_client->pull_dense(regions.data(), regions.size(), table_id);
-}
-
-void DensePullThread::wait_all() {
-  for (auto& t : _pull_dense_status) {
-    t.wait();
-    auto status = t.get();
-    if (status != 0) {
-      LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times;
-    }
-  }
-
-  if (_pull_dense_fail_times > 20) {
-    LOG(FATAL) << "pull dense failed times more than 20 times";
-    exit(-1);
-  }
-
-  _pull_dense_status.resize(0);
-}
-
-void DensePullThread::increase_thread_version(int thread_id,
-                                              uint64_t table_id) {
-  std::lock_guard<std::mutex> lock(_mutex_for_version);
-  _training_versions[table_id][thread_id]++;
-}
-#endif
-
-void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
-  auto& block = program.Block(0);
-  op_names_.clear();
-  for (auto& op_desc : block.AllOps()) {
-    std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
-    op_names_.push_back(op_desc->Type());
-    OperatorBase* local_op_ptr = local_op.release();
-    ops_.push_back(local_op_ptr);
-    continue;
-  }
-}
-
-void ExecutorThreadWorker::CreateThreadResource(
-    const framework::ProgramDesc& program,
-    const paddle::platform::Place& place) {
-  CreateThreadScope(program);
-  CreateThreadOperators(program);
-  SetMainProgram(program);
-  SetPlace(place);
-}
-
-void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
-  auto& block = program.Block(0);
-
-  PADDLE_ENFORCE_NOT_NULL(
-      root_scope_, "root_scope should be set before creating thread scope");
-
-  thread_scope_ = &root_scope_->NewScope();
-  for (auto& var : block.AllVars()) {
-    if (var->Persistable()) {
-      auto* ptr = root_scope_->Var(var->Name());
-      InitializeVariable(ptr, var->GetType());
-    } else {
-      auto* ptr = thread_scope_->Var(var->Name());
-      InitializeVariable(ptr, var->GetType());
-    }
-  }
-}
-
-void ExecutorThreadWorker::SetDataFeed(
-    const std::shared_ptr<DataFeed>& datafeed) {
-  thread_reader_ = datafeed;
-}
-
-void ExecutorThreadWorker::BindingDataFeedMemory() {
-  const std::vector<std::string>& input_feed =
-      thread_reader_->GetUseSlotAlias();
-  for (auto name : input_feed) {
-    thread_reader_->AddFeedVar(thread_scope_->Var(name), name);
-  }
-}
-
-void ExecutorThreadWorker::SetFetchVarNames(
-    const std::vector<std::string>& fetch_var_names) {
-  fetch_var_names_.clear();
-  fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(),
-                          fetch_var_names.end());
-}
-
-void ExecutorThreadWorker::SetDevice() {
-#if defined _WIN32 || defined __APPLE__
-  return;
-#else
-  static unsigned concurrency_cap = std::thread::hardware_concurrency();
-  LOG(WARNING) << "concurrency capacity " << concurrency_cap;
-  int thread_id = this->thread_id_;
-
-  if (static_cast<unsigned>(thread_id) < concurrency_cap) {
-    unsigned proc = thread_id;
-
-    cpu_set_t mask;
-    CPU_ZERO(&mask);
-    CPU_SET(proc, &mask);
-
-    if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) {
-      VLOG(1) << "WARNING: Failed to set thread affinity for thread "
-              << thread_id;
-    } else {
-      CPU_ZERO(&mask);
-      if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) ||
-          (CPU_ISSET(proc, &mask) == 0)) {
-        VLOG(3) << "WARNING: Failed to set thread affinity for thread "
-                << thread_id;
-      }
-    }
-  } else {
-    VLOG(1) << "WARNING: Failed to set thread affinity for thread "
-            << thread_id;
-  }
-#endif
-}
-
-template <typename T>
-void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) {
-  auto inspect = lod_tensor.data<T>();
-  auto element_num = lod_tensor.numel();
-
-  std::ostringstream sstream;
-  sstream << var_name << " (element num " << element_num << "): [";
-  sstream << inspect[0];
-  for (int j = 1; j < element_num; ++j) {
-    sstream << " " << inspect[j];
-  }
-  sstream << "]";
-
-  std::cout << sstream.str() << std::endl;
-}
-
-static void print_fetch_var(Scope* scope, const std::string& var_name) {
-  auto& tensor = scope->FindVar(var_name)->Get<LoDTensor>();
-
-#define PrintLoDTensorCallback(cpp_type, proto_type) \
-  do {                                               \
-    if (tensor.type() == proto_type) {               \
-      print_lod_tensor<cpp_type>(var_name, tensor);  \
-      return;                                        \
-    }                                                \
-  } while (0)
-
-  _ForEachDataType_(PrintLoDTensorCallback);
-  VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type();
-}
-
-void ExecutorThreadWorker::TrainFilesWithTimer() {
-  platform::SetNumThreads(1);
-  SetDevice();
-  thread_reader_->Start();
-
-  std::vector<double> op_total_time;
-  std::vector<std::string> op_name;
-  for (auto& op : ops_) {
-    op_name.push_back(op->Type());
-  }
-  op_total_time.resize(ops_.size());
-  for (size_t i = 0; i < op_total_time.size(); ++i) {
-    op_total_time[i] = 0.0;
-  }
-  platform::Timer timeline;
-  double total_time = 0.0;
-  double read_time = 0.0;
-  int cur_batch;
-  int batch_cnt = 0;
-  timeline.Start();
-  while ((cur_batch = thread_reader_->Next()) > 0) {
-    timeline.Pause();
-    read_time += timeline.ElapsedSec();
-    total_time += timeline.ElapsedSec();
-    for (size_t i = 0; i < ops_.size(); ++i) {
-      timeline.Start();
-      ops_[i]->Run(*thread_scope_, place_);
-      timeline.Pause();
-      op_total_time[i] += timeline.ElapsedSec();
-      total_time += timeline.ElapsedSec();
-    }
-    ++batch_cnt;
-    thread_scope_->DropKids();
-    if (thread_id_ == 0) {
-      if (batch_cnt > 0 && batch_cnt % 100 == 0) {
-        for (size_t i = 0; i < ops_.size(); ++i) {
-          fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
-                  op_name[i].c_str(), op_total_time[i] / batch_cnt);
-        }
-        fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
-        int fetch_var_num = fetch_var_names_.size();
-        for (int i = 0; i < fetch_var_num; ++i) {
-          print_fetch_var(thread_scope_, fetch_var_names_[i]);
-        }
-        fprintf(stderr, "IO percent: %f\n", read_time / total_time);
-      }
-    }
-    timeline.Start();
-  }
-}
-
-void ExecutorThreadWorker::TrainFiles() {
-  platform::SetNumThreads(1);
-
-  // todo: configurable
-  // SetDevice();
-
-  int fetch_var_num = fetch_var_names_.size();
-  fetch_values_.clear();
-  fetch_values_.resize(fetch_var_num);
-
-  thread_reader_->Start();
-
-  int cur_batch;
-  int batch_cnt = 0;
-  while ((cur_batch = thread_reader_->Next()) > 0) {
-    // executor run here
-    for (auto& op : ops_) {
-      op->Run(*thread_scope_, place_);
-    }
-
-    ++batch_cnt;
-    thread_scope_->DropKids();
-
-    if (debug_ == false || thread_id_ != 0) {
-      continue;
-    }
-
-    for (int i = 0; i < fetch_var_num; ++i) {
-      print_fetch_var(thread_scope_, fetch_var_names_[i]);
-    }  // end for (int i = 0...)
-  }    // end while ()
-}
-
-void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; }
-
-void ExecutorThreadWorker::SetPlace(const platform::Place& place) {
-  place_ = place;
-}
-
-void ExecutorThreadWorker::SetMainProgram(
-    const ProgramDesc& main_program_desc) {
-  main_program_.reset(new ProgramDesc(main_program_desc));
-}
-
-void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
-  root_scope_ = g_scope;
-}
-
-#ifdef PADDLE_WITH_PSLIB
-//  AsyncExecutor
-void AsyncExecutorThreadWorker::TrainFiles() {
-  SetDevice();
-
-  int fetch_var_num = fetch_var_names_.size();
-  fetch_values_.clear();
-  fetch_values_.resize(fetch_var_num);
-
-  thread_reader_->Start();
-
-  int cur_batch;
-  int batch_cnt = 0;
-  while ((cur_batch = thread_reader_->Next()) > 0) {
-    // executor run here
-    TrainOneNetwork();
-
-    ++batch_cnt;
-    thread_scope_->DropKids();
-
-    if (debug_ == false || thread_id_ != 0) {
-      continue;
-    }
-
-    for (int i = 0; i < fetch_var_num; ++i) {
-      print_fetch_var(thread_scope_, fetch_var_names_[i]);
-    }  // end for (int i = 0...)
-  }    // end while ()
-}
-
-void AsyncExecutorThreadWorker::SetPSlibPtr(
-    std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
-  _pslib_ptr = pslib_ptr;
-}
-
-void AsyncExecutorThreadWorker::SetPullDenseThread(
-    std::shared_ptr<DensePullThread> dpt) {
-  _pull_dense_thread = dpt;
-}
-
-void AsyncExecutorThreadWorker::TrainOneNetwork() {
-  PrepareParams();
-
-  for (auto& op : ops_) {
-    if (op->Type().find("sgd") != std::string::npos) {
-      continue;
-    }
-    bool need_skip = false;
-    for (auto t = 0u; t < _param_config->skip_op.size(); ++t) {
-      if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) {
-        need_skip = true;
-        break;
-      }
-    }
-    if (!need_skip) {
-      op->Run(*thread_scope_, place_);
-    }
-  }
-  UpdateParams();
-}
-
-void AsyncExecutorThreadWorker::SetParamConfig(
-    AsyncWorkerParamConfig* param_config) {
-  _param_config = param_config;
-}
-
-void AsyncExecutorThreadWorker::PrepareParams() {
-  for (auto table_id : _param_config->sparse_table_id) {
-    PullSparse(table_id);
-    for (auto& t : _pull_sparse_status) {
-      t.wait();
-      auto status = t.get();
-      if (status != 0) {
-        LOG(ERROR) << "pull sparse failed, status[" << status << "]";
-        exit(-1);
-      }
-    }
-  }
-  _pull_sparse_status.resize(0);
-
-  for (auto table_id : _param_config->sparse_table_id) {
-    FillSparse(table_id);
-  }
-}
-
-void AsyncExecutorThreadWorker::UpdateParams() {
-  for (auto i : _param_config->sparse_table_id) {
-    PushSparse(i);
-  }
-  for (auto i : _param_config->dense_table_id) {
-    PushDense(i);
-  }
-  int32_t tmp_push_dense_wait_times = -1;
-  int32_t tmp_push_sparse_wait_times = -1;
-  static uint32_t push_dense_wait_times =
-      static_cast<uint32_t>(tmp_push_dense_wait_times);
-  static uint32_t push_sparse_wait_times =
-      static_cast<uint32_t>(tmp_push_sparse_wait_times);
-
-  if (_push_dense_status.size() >= push_dense_wait_times) {
-    for (auto& t : _push_dense_status) {
-      t.wait();
-    }
-    _push_dense_status.resize(0);
-  }
-  if (tmp_push_dense_wait_times == -1) {
-    _push_dense_status.resize(0);
-  }
-  if (_push_sparse_status.size() >= push_sparse_wait_times) {
-    for (auto& t : _push_sparse_status) {
-      t.wait();
-    }
-    _push_sparse_status.resize(0);
-  }
-  if (tmp_push_sparse_wait_times == -1) {
-    _push_sparse_status.resize(0);
-  }
-  for (auto dense_table_id : _param_config->dense_table_id) {
-    _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id);
-  }
-}
-
-void AsyncExecutorThreadWorker::PushDense(int table_id) {
-  std::vector<paddle::ps::Region> regions;
-  for (auto& t : _param_config->dense_gradient_variable_name[table_id]) {
-    Variable* var = thread_scope_->FindVar(t);
-    CHECK(var != nullptr) << "var[" << t << "] not found";
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    int count = tensor->numel();
-    float* g = tensor->data<float>();
-    paddle::ps::Region reg(g, count);
-    regions.emplace_back(std::move(reg));
-  }
-
-  auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(),
-                                                    regions.size(), table_id);
-  _push_dense_status.push_back(std::move(status));
-}
-
-void AsyncExecutorThreadWorker::PullSparse(int table_id) {
-  auto& features = _features[table_id];
-  auto& feature_value = _feature_value[table_id];
-  auto fea_dim = _param_config->fea_dim;
-  // slot id starts from 1
-  features.clear();
-  features.resize(0);
-  features.reserve(MAX_FEASIGN_NUM);
-  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
-  // slot_idx = 0 is label TODO
-  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
-    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    int64_t* ids = tensor->data<int64_t>();
-    int len = tensor->numel();
-    for (auto i = 0u; i < len; ++i) {
-      // todo(colourful-tree): current trick - filter feasign=use_slot_mod(
-      // bug: datafeed fill use_slot_mod for empty slot)
-      if (ids[i] == 0u) {
-        continue;
-      }
-      features.push_back(static_cast<uint64_t>(ids[i]));
-    }
-  }
-  check_pull_push_memory(features, &feature_value, fea_dim);
-
-  std::vector<float*> pull_feature_value;
-  for (auto i = 0u; i < features.size(); ++i) {
-    pull_feature_value.push_back(feature_value[i].data());
-  }
-
-  auto status = _pslib_ptr->_worker_ptr->pull_sparse(
-      pull_feature_value.data(), table_id, features.data(), features.size());
-  _pull_sparse_status.push_back(std::move(status));
-
-  auto& push_g = _feature_push_value[table_id];
-  check_pull_push_memory(features, &push_g, fea_dim);
-  collect_feasign_info(table_id);
-}
-
-void AsyncExecutorThreadWorker::FillSparse(int table_id) {
-  auto slot_dim = _param_config->slot_dim;
-  auto fea_dim = _param_config->fea_dim;
-  auto& features = _features[table_id];
-  auto& fea_value = _feature_value[table_id];
-
-  CHECK(features.size() > 0) << "feature size check failed";
-
-  auto fea_idx = 0u;
-
-  std::vector<float> init_value(fea_dim);
-
-  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
-  // slot_idx = 0 is label TODO
-  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
-    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    int64_t* ids = tensor->data<int64_t>();
-    int len = tensor->numel();
-    Variable* var_emb = thread_scope_->FindVar(
-        _param_config->slot_input_vec[table_id][slot_idx - 1]);
-    LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
-    float* ptr =
-        tensor_emb->mutable_data<float>({len, slot_dim}, platform::CPUPlace());
-    memset(ptr, 0, sizeof(float) * len * slot_dim);
-    auto& tensor_lod = tensor->lod()[0];
-
-    LoD data_lod{tensor_lod};
-    tensor_emb->set_lod(data_lod);
-
-    for (auto index = 0u; index < len; ++index) {
-      if (ids[index] == 0u) {
-        memcpy(ptr + slot_dim * index, init_value.data() + 2,
-               sizeof(float) * slot_dim);
-        continue;
-      }
-      memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2,
-             sizeof(float) * slot_dim);
-      fea_idx++;
-    }
-  }
-}
-
-// Packs the gradient of every non-zero feasign owned by `table_id` into the
-// per-table push buffer, then issues push_sparse and stores its status.
-void AsyncExecutorThreadWorker::PushSparse(int table_id) {
-  // Row layout written below: [0] = 1.0f, [1] = label, [2...] = gradient.
-  const int grad_offset = 2;
-  auto dim = _param_config->slot_dim;
-  auto row_width = _param_config->fea_dim;
-  auto& keys = _features[table_id];
-  auto& rows = _feature_push_value[table_id];
-  check_pull_push_memory(keys, &rows, row_width);
-  CHECK(rows.size() == keys.size() + 1)
-      << "push_g size:" << rows.size() << " features size:" << keys.size();
-  uint64_t used = 0u;
-  auto& infos = _fea_info[table_id];
-  const std::vector<std::string>& slots = thread_reader_->GetUseSlotAlias();
-  auto& alias_to_table = _param_config->slot_alias_to_table;
-  // Slot 0 is the label, so gradient slots start at index 1.
-  for (auto s = 1u; s < slots.size(); ++s) {
-    const std::string& alias = slots[s];
-    auto owner = alias_to_table.find(alias);
-    if (owner == alias_to_table.end()) {
-      // Only logged; processing of this slot continues below.
-      LOG(ERROR) << "ERROR slot_idx:" << s << " name:" << alias;
-    } else if (owner->second != table_id) {
-      continue;
-    }
-    const std::string& grad_name =
-        _param_config->gradient_var[table_id][s - 1];
-    Variable* grad_var = thread_scope_->FindVar(grad_name);
-    CHECK(grad_var != nullptr) << "var[" << grad_name << "] not found";
-    LoDTensor* grad_tensor = grad_var->GetMutable<LoDTensor>();
-    if (grad_tensor == NULL) {
-      LOG(ERROR) << "var[" << grad_name << "] not found";
-      exit(-1);
-    }
-    float* grad = grad_tensor->data<float>();
-
-    Variable* id_var = thread_scope_->FindVar(alias);
-    CHECK(id_var != nullptr) << "var[" << alias << "] not found";
-    LoDTensor* id_tensor = id_var->GetMutable<LoDTensor>();
-    if (id_tensor == NULL) {
-      LOG(ERROR) << "var[" << alias << "] not found";
-      exit(-1);
-    }
-    int len = id_tensor->numel();
-    CHECK(dim * len == grad_tensor->numel())
-        << "len:" << len << " g_numel:" << grad_tensor->numel();
-    CHECK(len == id_tensor->numel()) << "len:" << len
-                                     << "t_numel:" << id_tensor->numel();
-    int64_t* ids = id_tensor->data<int64_t>();
-    // `grad` moves one embedding per id, including the skipped zero ids.
-    for (auto i = 0u; i < len; ++i, grad += dim) {
-      if (ids[i] == 0) {
-        continue;
-      }
-      memcpy(rows[used].data() + grad_offset, grad, sizeof(float) * dim);
-      rows[used][0] = 1.0f;
-      CHECK(used < infos.size()) << "fea_idx:" << used
-                                 << " size:" << infos.size();
-      rows[used][1] = static_cast<float>(infos[used].label);
-      ++used;
-    }
-  }
-  CHECK(used == keys.size()) << "fea_idx:" << used
-                             << " features size:" << keys.size();
-  CHECK_GT(keys.size(), 0);
-
-  // push_sparse receives one raw row pointer per feature; the extra trailing
-  // row of the buffer is not sent.
-  std::vector<float*> row_ptrs;
-  for (auto k = 0u; k < keys.size(); ++k) {
-    row_ptrs.push_back(rows[k].data());
-  }
-  auto status = _pslib_ptr->_worker_ptr->push_sparse(
-      table_id, keys.data(), (const float**)row_ptrs.data(), keys.size());
-  _push_sparse_status.push_back(std::move(status));
-}
-
-// Fills _fea_info[table_id] with (slot, instance, label) per non-zero feasign.
-void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) {
-  auto& fea_info = _fea_info[table_id];
-  auto& feature = _features[table_id];
-  fea_info.resize(feature.size());
-  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
-  CHECK(!feed_vec.empty()) << "no slot alias in data feed";
-  Variable* label_var = thread_scope_->FindVar(feed_vec[0]);  // slot 0: label
-  CHECK(label_var != nullptr) << "var[" << feed_vec[0] << "] not found";
-  int64_t* label = label_var->GetMutable<LoDTensor>()->data<int64_t>();
-
-  size_t global_index = 0;
-  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
-    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
-    CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found";
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    int64_t* ids = tensor->data<int64_t>();
-    const auto& offsets = tensor->lod()[0];  // level-0 LoD offsets
-    size_t fea_idx = 0;
-    for (auto ins_idx = 1u; ins_idx < offsets.size(); ++ins_idx) {
-      for (; fea_idx < offsets[ins_idx]; ++fea_idx) {
-        if (ids[fea_idx] == 0) continue;
-        CHECK(global_index < fea_info.size()) << "more feasigns than features";
-        fea_info[global_index++] = {slot_idx, ins_idx, label[ins_idx - 1]};
-      }
-    }
-  }
-  CHECK(global_index == feature.size())
-      << "expect fea info size:" << feature.size() << " real:" << global_index;
-}
-
-// Sizes *push_g to features.size() + 1 rows, each holding `dim` floats.
-void AsyncExecutorThreadWorker::check_pull_push_memory(
-    const std::vector<uint64_t>& features,
-    std::vector<std::vector<float>>* push_g, int dim) {
-  const size_t rows = features.size() + 1;
-  push_g->resize(rows);
-  for (size_t r = 0; r < rows; ++r) (*push_g)[r].resize(dim);
-}
-
-// Appends float[dim] buffers until *push_g holds features.size() + 1 entries;
-// a no-op when it already holds at least features.size() entries.
-void AsyncExecutorThreadWorker::check_pull_push_memory(
-    const std::vector<uint64_t>& features, std::vector<float*>* push_g,
-    int dim) {
-  if (push_g->size() >= features.size()) return;
-  const size_t wanted = features.size() + 1;
-  push_g->reserve(wanted);
-  while (push_g->size() < wanted) {
-    push_g->push_back(new float[dim]);
-  }
-}
-#endif
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h
deleted file mode 100644
index 524922b0322e538d46f93011fbca3223b02d8849..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/executor_thread_worker.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <set>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#ifdef PADDLE_WITH_PSLIB
-#include <pslib.h>
-#endif
-
-namespace paddle {
-namespace framework {
-
-void CreateTensor(Variable* var, proto::VarType::Type var_type);
-#ifdef PADDLE_WITH_PSLIB
-static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
-
-struct AsyncWorkerParamConfig {
-  int slot_dim;  // floats copied per feasign (see PushSparse)
-  int fea_dim;   // width of each row in the PushSparse buffer
-  int32_t tmp_push_dense_wait_times;
-  int32_t tmp_push_sparse_wait_times;
-
-  std::vector<std::string> skip_op;
-
-  std::map<uint64_t, std::vector<std::string>> dense_variable_name;
-  std::map<uint64_t, std::vector<std::string>> dense_gradient_variable_name;
-  std::vector<int> dense_table_id;
-  // fea_dim for each dense table
-  std::vector<uint32_t> dense_table_size;
-  std::vector<int> sparse_table_id;
-  std::map<uint64_t, std::vector<std::string>> slot_input_vec;  // by table id
-  std::map<uint64_t, std::vector<std::string>> gradient_var;  // by table id
-  std::map<std::string, uint64_t> slot_alias_to_table;  // alias -> table id
-};
-
-struct DensePullThreadParam {
-  std::shared_ptr<paddle::ps::PSClient> ps_client;
-  int threshold;
-  int training_thread_num;
-  Scope* root_scope;
-  std::map<uint64_t, std::vector<std::string>>* dense_params;
-  int sleep_time_ms = 2;
-};
-
-// Owns the dense-pull worker thread `_t` and per-table version counters.
-class DensePullThread {
- public:
-  explicit DensePullThread(const DensePullThreadParam& param)
-      : _ps_client(param.ps_client),
-        _thread_num(param.training_thread_num),
-        _threshold(param.threshold),
-        _sleep_time_ms(param.sleep_time_ms),
-        _root_scope(param.root_scope),
-        _running(false) {
-    for (auto& t : *param.dense_params) {
-      _dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(),
-                                           t.second.begin(), t.second.end());
-      _training_versions[t.first].resize(_thread_num, 0);
-      _last_versions[t.first] = 0;
-      _current_version[t.first] = 0;
-    }
-  }
-
-  // Join the worker: destroying a joinable std::thread calls std::terminate.
-  ~DensePullThread() { stop(); }
-  int start();
-
-  // Clears the running flag, then joins the worker thread if one is joinable.
-  void stop() {
-    _running = false;
-    if (_t.joinable()) _t.join();
-  }
-
-  void increase_thread_version(int thread_id, uint64_t table_id);
-  void reset_thread_version(uint64_t table_id);
-  std::future<int32_t> pull_dense(uint64_t table_id);
-  void pull_dense2(uint64_t table_id);
-  void wait_all();
-
- private:
-  void run();
-  bool check_update_param(uint64_t table_id);
-
-  std::shared_ptr<paddle::ps::PSClient> _ps_client;
-  int _thread_num;
-  int _threshold;
-  int _sleep_time_ms;
-  Scope* _root_scope;
-  // NOTE(review): plain bool; make it std::atomic<bool> if run() polls it.
-  bool _running;
-
-  std::map<uint64_t, uint64_t> _last_versions;
-  std::map<uint64_t, uint64_t> _current_version;
-  std::mutex _mutex_for_version;
-  std::map<uint64_t, std::vector<uint64_t>> _training_versions;
-  std::map<uint64_t, std::vector<std::string>> _dense_variable_name;
-
-  std::thread _t;
-  std::vector<::std::future<int32_t>> _pull_dense_status;
-
-  std::map<uint64_t, std::vector<paddle::ps::Region>> _regions;
-  uint32_t _pull_dense_fail_times = 0;
-
-  std::vector<float> _base_norm_param;
-  std::vector<float> _mean;
-  std::vector<float> _scale;
-  float _squared_sum_epsilon = 1e-4;
-  std::mutex _mutex_for_mean_scale;
-
-  float _total_batch_num = 0;
-};
-#endif
-
-class ExecutorThreadWorker {
- public:
-  ExecutorThreadWorker()
-      : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
-  virtual ~ExecutorThreadWorker() {}
-
-  void CreateThreadResource(const framework::ProgramDesc& program,
-                            const paddle::platform::Place& place);
-  void SetThreadId(int tid);
-  void SetDebug(const bool debug) { debug_ = debug; }
-  void SetRootScope(Scope* g_scope);
-  // Sets the CPU device for this worker.
-  // CPU binding is used by default.
-  void SetDevice();
-  // Data is read into memory that the program cannot access directly, so the
-  // data memory has to be bound to the corresponding program variables.
-  // Call this only after the data feed has been set.
-  void BindingDataFeedMemory();
-  // Sets the data feed declared in the executor.
-  void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
-  // Multi-threaded training entry point.
-  virtual void TrainFiles();
-  // Training entry point with timer logging.
-  virtual void TrainFilesWithTimer();
-  // Sets the fetch variable names that users assign through the Python API.
-  void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
-#ifdef PADDLE_WITH_PSLIB
-  virtual void SetPSlibPtr(
-      std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {}
-  virtual void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt) {}
-  virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}
-#endif
-
- private:
-  void CreateThreadScope(const framework::ProgramDesc& program);
-  void CreateThreadOperators(const framework::ProgramDesc& program);
-  void SetMainProgram(const ProgramDesc& main_program_desc);
-  void SetPlace(const paddle::platform::Place& place);
-
- protected:
-  // data feed used by this worker thread
-  std::shared_ptr<DataFeed> thread_reader_;  // shared queue, thread buffer
-  int thread_id_;  // thread index
-  // operator names
-  std::vector<std::string> op_names_;
-  // thread-level, local operators for forward and backward
-  std::vector<OperatorBase*> ops_;
-  // main program for training
-  std::unique_ptr<framework::ProgramDesc> main_program_;
-  // execution place
-  platform::Place place_;
-  // root scope for model parameters
-  Scope* root_scope_;
-  // thread-local scope; its parent is the shared global scope
-  Scope* thread_scope_;
-  std::vector<std::string> fetch_var_names_;
-  std::vector<std::vector<float>> fetch_values_;
-  bool debug_;
-};
-
-#ifdef PADDLE_WITH_PSLIB
-// Worker that reaches the parameter server through `_pslib_ptr`.
-class AsyncExecutorThreadWorker : public ExecutorThreadWorker {
- public:
-  AsyncExecutorThreadWorker() {}
-  virtual ~AsyncExecutorThreadWorker() {}
-  void SetPSlibPtr(
-      std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) override;
-  void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt) override;
-  void SetParamConfig(AsyncWorkerParamConfig* param_config) override;
-  void TrainFiles() override;
-  void TrainOneNetwork();
-  void PrepareParams();
-  void UpdateParams();
-  void PullSparse(int table_id);
-  void FillSparse(int table_id);
-  void PushSparse(int table_id);
-  void PushDense(int table_id);
-
-  void check_pull_push_memory(const std::vector<uint64_t>& features,
-                              std::vector<float*>* push_g, int dim);
-  void check_pull_push_memory(const std::vector<uint64_t>& features,
-                              std::vector<std::vector<float>>* push_g, int dim);
-  void collect_feasign_info(int table_id);
-
- private:
-  struct FeasignInfo {
-    uint32_t slot;  // slot index in the feed list
-    uint32_t ins;   // instance index, 1-based
-    int64_t label;
-  };
-
-  std::map<uint64_t, std::vector<uint64_t>> _features;
-  std::map<uint64_t, std::vector<FeasignInfo>> _fea_info;
-  std::map<uint64_t, std::vector<std::vector<float>>> _feature_value;
-  std::map<uint64_t, std::vector<std::vector<float>>> _feature_push_value;
-
-  std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
-  std::shared_ptr<DensePullThread> _pull_dense_thread;
-  std::vector<::std::future<int32_t>> _pull_sparse_status;
-  std::vector<::std::future<int32_t>> _pull_dense_status;
-  std::vector<::std::future<int32_t>> _push_sparse_status;
-  std::vector<::std::future<int32_t>> _push_dense_status;
-  // Starts null like the base-class pointers; PushSparse uses it unchecked.
-  AsyncWorkerParamConfig* _param_config = nullptr;
-};
-#endif
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/expect.h b/paddle/fluid/framework/expect.h
deleted file mode 100644
index 146f4de9382a687686d5f7fdd6f4fa2300cb043b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/expect.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
-// _LINUX is defined on every platform except Windows and Apple platforms.
-#ifdef _LINUX
-#ifndef likely
-#define likely(x) __builtin_expect((x), 1)
-#endif
-#endif
-// likely()/unlikely() are left undefined here when _LINUX is not set.
-#ifdef _LINUX
-#ifndef unlikely
-#define unlikely(x) __builtin_expect((x), 0)
-#endif
-#endif
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
deleted file mode 100644
index 96530b2a3f9cfd9462627a42b2bb0fea98758f92..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include <string>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-// Stores `input` at position `index` of the FeedFetchList held by variable
-// `var_name`: the entry shares data with `input` and takes over its LoD.
-void SetFeedVariable(Scope* scope, const LoDTensor& input,
-                     const std::string& var_name, size_t index) {
-  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
-  // The variable is created when it is not found in the scope.
-  FeedFetchList* feed_list = scope->Var(var_name)->GetMutable<FeedFetchList>();
-  // Grow the list so that `index` is a valid position.
-  if (feed_list->size() <= index) {
-    feed_list->resize(index + 1);
-  }
-  FeedFetchType& slot = (*feed_list)[index];
-  slot.ShareDataWith(input);
-  slot.set_lod(input.lod());
-}
-
-// Returns the tensor at position `index` of the FeedFetchList stored in
-// variable `var_name`, which must already exist in `scope`.
-LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
-                            size_t index) {
-  Variable* g_fetch_value = scope.FindVar(var_name);
-  PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
-  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
-                 "Only %s can be invoked by GetFetchVariable",
-                 typeid(FeedFetchList).name());
-  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
-  PADDLE_ENFORCE_LT(index, fetch_outputs.size());  // before indexing
-  auto& tensor = fetch_outputs[index];
-  VLOG(3) << "Fetch " << var_name << " with index " << index
-          << " shape= " << tensor.dims();
-  return tensor;
-}
-// Returns the LoDTensor held by `var_name`; enforces that the variable exists.
-LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
-  Variable* found = scope.FindVar(var_name);
-  PADDLE_ENFORCE(found, "%s no in scope", var_name);
-  PADDLE_ENFORCE(found->IsType<LoDTensor>(), "Only support lod tensor now.");
-  return *found->GetMutable<LoDTensor>();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
deleted file mode 100644
index 031f8e01aa6128b803dcbfb990778e87d4fafc13..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-// Stores `input` at `index` of the FeedFetchList variable `var_name`.
-void SetFeedVariable(Scope* scope, const LoDTensor& input,
-                     const std::string& var_name, size_t index);
-// Returns element `index` of the FeedFetchList variable `var_name`.
-LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
-                            size_t index);
-// Returns the LoDTensor held by the variable `var_name` in `scope`.
-LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h
deleted file mode 100644
index fae792ad9fa766f456ed706cc9adeb4e34d20123..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/feed_fetch_type.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-using FeedFetchType = LoDTensor;
-using FeedFetchList = std::vector<FeedFetchType>;
-
-static const char kFeedOpType[] = "feed";
-static const char kFetchOpType[] = "fetch";
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
deleted file mode 100644
index 424063970b7e394ca8142fc698b3936586246014..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-if(WITH_PSLIB)
-    cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib)
-else()
-    cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
-endif(WITH_PSLIB)
-
-cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
-if(WITH_BOX_PS)
-    cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor box_ps)
-else()
-    cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor)
-endif(WITH_BOX_PS)
diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc
deleted file mode 100644
index 935bcc722a3f8b762c480a46c24d8b9574150c89..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/fleet/box_wrapper.cc
+++ /dev/null
@@ -1,247 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/fleet/box_wrapper.h"
-#include <ctime>
-#include <memory>
-#include <numeric>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace framework {
-
-std::shared_ptr<BoxWrapper> BoxWrapper::s_instance_ = nullptr;
-#ifdef PADDLE_WITH_BOX_PS
-std::shared_ptr<paddle::boxps::BoxPSBase> BoxWrapper::boxps_ptr_ = nullptr;
-#endif
-
-// Returns the current local date encoded as the decimal integer YYYYMMDD.
-int BoxWrapper::GetDate() const {
-  time_t now = time(nullptr);
-  tm t;
-#ifdef _WIN32
-  localtime_s(&t, &now);
-#else
-  localtime_r(&now, &t);
-#endif
-  // tm_year counts from 1900 and tm_mon from 0. Computing the value directly
-  // avoids the fixed-size text buffer and the snprintf/atoi round trip.
-  return (1900 + t.tm_year) * 10000 + (1 + t.tm_mon) * 100 + t.tm_mday;
-}
-// Hands the feasign keys to BoxPS together with GetDate(); enforces ret == 0.
-void BoxWrapper::FeedPass(const std::vector<uint64_t>& feasgin_to_box) const {
-#ifdef PADDLE_WITH_BOX_PS
-  int ret = boxps_ptr_->FeedPass(GetDate(), feasgin_to_box);
-  PADDLE_ENFORCE_EQ(ret, 0, "FeedPass failed in BoxPS.");
-#endif
-}
-// Calls BoxPS BeginPass and enforces a zero return; empty without BOX_PS.
-void BoxWrapper::BeginPass() const {
-#ifdef PADDLE_WITH_BOX_PS
-  int ret = boxps_ptr_->BeginPass();
-  PADDLE_ENFORCE_EQ(ret, 0, "BeginPass failed in BoxPS.");
-#endif
-}
-// Calls BoxPS EndPass and enforces a zero return; empty without BOX_PS.
-void BoxWrapper::EndPass() const {
-#ifdef PADDLE_WITH_BOX_PS
-  int ret = boxps_ptr_->EndPass();
-  PADDLE_ENFORCE_EQ(ret, 0, "EndPass failed in BoxPS.");
-#endif
-}
-// Pulls the values of all slots' keys from BoxPS and copies them to `values`.
-void BoxWrapper::PullSparse(const paddle::platform::Place& place,
-                            const std::vector<const uint64_t*>& keys,
-                            const std::vector<float*>& values,
-                            const std::vector<int64_t>& slot_lengths,
-                            const int hidden_size) {
-#ifdef PADDLE_WITH_BOX_PS
-  if (platform::is_cpu_place(place) || platform::is_gpu_place(place)) {
-    int64_t total_length =
-        std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-    LoDTensor total_keys_tensor;
-    int64_t* total_keys =
-        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place);
-    int64_t offset = 0;
-    for (size_t i = 0; i < keys.size(); ++i) {
-      if (platform::is_cpu_place(place)) {
-        memory::Copy(boost::get<platform::CPUPlace>(place), total_keys + offset,
-                     boost::get<platform::CPUPlace>(place), keys[i],
-                     slot_lengths[i] * sizeof(uint64_t));
-      } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-        memory::Copy(boost::get<platform::CUDAPlace>(place),
-                     total_keys + offset,
-                     boost::get<platform::CUDAPlace>(place), keys[i],
-                     slot_lengths[i] * sizeof(uint64_t), nullptr);
-#else
-        PADDLE_THROW(
-            "Please compile WITH_GPU option, and NCCL doesn't support "
-            "windows.");
-#endif
-      }
-      offset += slot_lengths[i];
-    }
-    PADDLE_ENFORCE_EQ(offset, total_length,
-                      "BoxWrapper::PullSparse: total feasign keys length "
-                      "should be equal to the sum of length of all input "
-                      "tensors.");
-    // A single BoxPS call serves the concatenated keys of every slot.
-    // Space allocation for FeatureValue is left for boxps
-    paddle::boxps::FeatureValue* total_values;
-    if (platform::is_cpu_place(place)) {
-      int ret = boxps_ptr_->PullSparseCPU(
-          reinterpret_cast<uint64_t*>(total_keys), &total_values,
-          static_cast<int>(total_length));
-      PADDLE_ENFORCE_EQ(ret, 0, "PullSparseCPU failed in BoxPS.");
-    } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      int ret = boxps_ptr_->PullSparseGPU(
-          reinterpret_cast<uint64_t*>(total_keys), &total_values,
-          static_cast<int>(total_length),
-          boost::get<platform::CUDAPlace>(place).GetDeviceId());
-      PADDLE_ENFORCE_EQ(ret, 0, "PullSparseGPU failed in BoxPS.");
-#endif
-    }
-    // Copy `hidden_size` floats per feature into values[i], in slot order.
-    offset = 0;
-    for (size_t i = 0; i < values.size(); ++i) {
-      int64_t fea_num = slot_lengths[i];
-      for (auto j = 0; j < fea_num; ++j) {
-        // Copy the emb from BoxPS to paddle tensor. Since 'show','click','emb'
-        // are continuous in memory, so we copy here using the 'show' address
-        if (platform::is_cpu_place(place)) {
-          memory::Copy(
-              boost::get<platform::CPUPlace>(place),
-              values[i] + j * hidden_size,
-              boost::get<platform::CPUPlace>(place),
-              reinterpret_cast<float*>(&((total_values + offset)->show)),
-              sizeof(float) * hidden_size);
-        } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-          memory::Copy(
-              boost::get<platform::CUDAPlace>(place),
-              values[i] + j * hidden_size,
-              boost::get<platform::CUDAPlace>(place),
-              reinterpret_cast<float*>(&((total_values + offset)->show)),
-              sizeof(float) * hidden_size, nullptr);
-#endif
-        }
-        ++offset;
-      }
-    }
-    PADDLE_ENFORCE_EQ(offset, total_length,
-                      "BoxWrapper::PullSparse: total emb values length should "
-                      "be equal to the sum of length of all input tensors.");
-
-  } else {
-    PADDLE_THROW(
-        "PaddleBox: PullSparse Only Support CPUPlace and CUDAPlace Now.");
-  }
-#endif
-}
-// Packs the per-slot gradients into FeaturePushValue records for BoxPS.
-void BoxWrapper::PushSparseGrad(const paddle::platform::Place& place,
-                                const std::vector<const uint64_t*>& keys,
-                                const std::vector<const float*>& grad_values,
-                                const std::vector<int64_t>& slot_lengths,
-                                const int hidden_size) {
-#ifdef PADDLE_WITH_BOX_PS
-  if (platform::is_cpu_place(place) || platform::is_gpu_place(place)) {
-    int64_t total_length =
-        std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-    LoDTensor total_keys_tensor;
-    int64_t* total_keys =
-        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place);
-    int64_t offset = 0;
-    for (size_t i = 0; i < keys.size(); ++i) {
-      if (platform::is_cpu_place(place)) {
-        memory::Copy(boost::get<platform::CPUPlace>(place), total_keys + offset,
-                     boost::get<platform::CPUPlace>(place), keys[i],
-                     slot_lengths[i] * sizeof(uint64_t));
-      } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-        memory::Copy(boost::get<platform::CUDAPlace>(place),
-                     total_keys + offset,
-                     boost::get<platform::CUDAPlace>(place), keys[i],
-                     slot_lengths[i] * sizeof(uint64_t), nullptr);
-#else
-        PADDLE_THROW(
-            "Please compile WITH_GPU option, and for now NCCL doesn't support "
-            "windows.");
-#endif
-      }
-      offset += slot_lengths[i];
-    }
-    PADDLE_ENFORCE_EQ(offset, total_length,
-                      "BoxWrapper::PushSparseGrad: total feasign keys length "
-                      "should be equal to the sum of length of all input "
-                      "tensors.");
-    auto buf = memory::AllocShared(
-        place, total_length * sizeof(paddle::boxps::FeaturePushValue));
-    paddle::boxps::FeaturePushValue* total_grad_values =
-        reinterpret_cast<paddle::boxps::FeaturePushValue*>(buf->ptr());
-    offset = 0;
-    for (size_t i = 0; i < grad_values.size(); ++i) {
-      int64_t fea_num = slot_lengths[i];
-      for (auto j = 0; j < fea_num; ++j) {
-        // Copy the emb grad from paddle tensor to BoxPS. Since
-        // 'show','click','emb' are continuous in memory, so we copy here using
-        // the 'show' address
-        if (platform::is_cpu_place(place)) {
-          memory::Copy(
-              boost::get<platform::CPUPlace>(place),
-              reinterpret_cast<float*>(&((total_grad_values + offset)->show)),
-              boost::get<platform::CPUPlace>(place),
-              grad_values[i] + j * hidden_size, sizeof(float) * hidden_size);
-        } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-          memory::Copy(
-              boost::get<platform::CUDAPlace>(place),
-              reinterpret_cast<float*>(&((total_grad_values + offset)->show)),
-              boost::get<platform::CUDAPlace>(place),
-              grad_values[i] + j * hidden_size, sizeof(float) * hidden_size,
-              nullptr);
-#endif
-        }
-        ++offset;
-      }
-    }
-    PADDLE_ENFORCE_EQ(offset, total_length,
-                      "BoxWrapper::PushSparseGrad: total emb grad values "
-                      "length should be equal to the sum of length of all "
-                      "input tensors.");
-    if (platform::is_cpu_place(place)) {
-      int ret = boxps_ptr_->PushSparseCPU(
-          reinterpret_cast<uint64_t*>(total_keys), total_grad_values,
-          static_cast<int>(total_length));
-      PADDLE_ENFORCE_EQ(ret, 0, "PushSparseCPU failed in BoxPS.");
-    } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      int ret = boxps_ptr_->PushSparseGPU(
-          reinterpret_cast<uint64_t*>(total_keys), total_grad_values,
-          static_cast<int>(total_length),
-          boost::get<platform::CUDAPlace>(place).GetDeviceId());
-      PADDLE_ENFORCE_EQ(ret, 0, "PushSparseGPU failed in BoxPS.");
-#endif
-    }
-  } else {
-    PADDLE_THROW(
-        "PaddleBox: PushSparse Only Support CPUPlace and CUDAPlace Now.");
-  }
-#endif
-}
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h
deleted file mode 100644
index c650d9cb7a63242d9b8d42c41049545d534a0975..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/fleet/box_wrapper.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/data_set.h"
-#ifdef PADDLE_WITH_BOX_PS
-#include <boxps.h>
-#endif
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-class BoxWrapper {
- public:
-  virtual ~BoxWrapper() {}
-  BoxWrapper() {}
-
-  void FeedPass(const std::vector<uint64_t>& feasgin_to_box) const;
-  void BeginPass() const;
-  void EndPass() const;
-  void PullSparse(const paddle::platform::Place& place,
-                  const std::vector<const uint64_t*>& keys,
-                  const std::vector<float*>& values,
-                  const std::vector<int64_t>& slot_lengths,
-                  const int hidden_size);
-  void PushSparseGrad(const paddle::platform::Place& place,
-                      const std::vector<const uint64_t*>& keys,
-                      const std::vector<const float*>& grad_values,
-                      const std::vector<int64_t>& slot_lengths,
-                      const int hidden_size);
-
-  static std::shared_ptr<BoxWrapper> GetInstance() {
-    if (nullptr == s_instance_) {
-      // If main thread is guaranteed to init this, this lock can be removed
-      static std::mutex mutex;
-      std::lock_guard<std::mutex> lock(mutex);
-      if (nullptr == s_instance_) {
-        s_instance_.reset(new paddle::framework::BoxWrapper());
-#ifdef PADDLE_WITH_BOX_PS
-        s_instance_->boxps_ptr_.reset(new paddle::boxps::FakeBoxPS());
-#endif
-      }
-    }
-    return s_instance_;
-  }
-
- private:
-#ifdef PADDLE_WITH_BOX_PS
-  static std::shared_ptr<paddle::boxps::BoxPSBase> boxps_ptr_;
-#endif
-  static std::shared_ptr<BoxWrapper> s_instance_;
-  int GetDate() const;
-};
-
-class BoxHelper {
- public:
-  explicit BoxHelper(paddle::framework::Dataset* dataset) : dataset_(dataset) {}
-  virtual ~BoxHelper() {}
-
-  void BeginPass() {
-    auto box_ptr = BoxWrapper::GetInstance();
-    box_ptr->BeginPass();
-  }
-
-  void EndPass() {
-    auto box_ptr = BoxWrapper::GetInstance();
-    box_ptr->EndPass();
-  }
-  void LoadIntoMemory() {
-    dataset_->LoadIntoMemory();
-    FeedPass();
-  }
-  void PreLoadIntoMemory() {
-    dataset_->PreLoadIntoMemory();
-    feed_data_thread_.reset(new std::thread([&]() {
-      dataset_->WaitPreLoadDone();
-      FeedPass();
-    }));
-  }
-  void WaitFeedPassDone() { feed_data_thread_->join(); }
-
- private:
-  Dataset* dataset_;
-  std::shared_ptr<std::thread> feed_data_thread_;
-  // notify boxps to feed this pass feasigns from SSD to memory
-  void FeedPass() {
-    auto box_ptr = BoxWrapper::GetInstance();
-    auto input_channel_ =
-        dynamic_cast<MultiSlotDataset*>(dataset_)->GetInputChannel();
-    std::vector<Record> pass_data;
-    std::vector<uint64_t> feasign_to_box;
-    input_channel_->ReadAll(pass_data);
-    for (const auto& ins : pass_data) {
-      const auto& feasign_v = ins.uint64_feasigns_;
-      for (const auto feasign : feasign_v) {
-        feasign_to_box.push_back(feasign.sign().uint64_feasign_);
-      }
-    }
-    input_channel_->Open();
-    input_channel_->Write(pass_data);
-    input_channel_->Close();
-    box_ptr->FeedPass(feasign_to_box);
-  }
-};
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
deleted file mode 100644
index 22a9b79d7fcba6894073052c3c211358dece96ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ /dev/null
@@ -1,749 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
-#include <algorithm>
-#include <utility>
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-
-const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100;
-std::shared_ptr<FleetWrapper> FleetWrapper::s_instance_ = NULL;
-bool FleetWrapper::is_initialized_ = false;
-
-#ifdef PADDLE_WITH_PSLIB
-template <class AR>
-paddle::ps::Archive<AR>& operator<<(paddle::ps::Archive<AR>& ar,
-                                    const MultiSlotType& ins) {
-  ar << ins.GetType();
-  ar << ins.GetOffset();
-  ar << ins.GetFloatData();
-  ar << ins.GetUint64Data();
-  return ar;
-}
-
-template <class AR>
-paddle::ps::Archive<AR>& operator>>(paddle::ps::Archive<AR>& ar,
-                                    MultiSlotType& ins) {
-  ar >> ins.MutableType();
-  ar >> ins.MutableOffset();
-  ar >> ins.MutableFloatData();
-  ar >> ins.MutableUint64Data();
-  return ar;
-}
-#endif
-
-#ifdef PADDLE_WITH_PSLIB
-std::shared_ptr<paddle::distributed::PSlib> FleetWrapper::pslib_ptr_ = NULL;
-#endif
-
-void FleetWrapper::SetClient2ClientConfig(int request_timeout_ms,
-                                          int connect_timeout_ms,
-                                          int max_retry) {
-  client2client_request_timeout_ms_ = request_timeout_ms;
-  client2client_connect_timeout_ms_ = connect_timeout_ms;
-  client2client_max_retry_ = max_retry;
-}
-
-void FleetWrapper::InitServer(const std::string& dist_desc, int index) {
-#ifdef PADDLE_WITH_PSLIB
-  if (!is_initialized_) {
-    VLOG(3) << "Going to init server";
-    pslib_ptr_ = std::shared_ptr<paddle::distributed::PSlib>(
-        new paddle::distributed::PSlib());
-    pslib_ptr_->init_server(dist_desc, index);
-    is_initialized_ = true;
-  } else {
-    VLOG(3) << "Server can be initialized only once";
-  }
-#endif
-}
-
-void FleetWrapper::InitWorker(const std::string& dist_desc,
-                              const std::vector<uint64_t>& host_sign_list,
-                              int node_num, int index) {
-#ifdef PADDLE_WITH_PSLIB
-  if (!is_initialized_) {
-    VLOG(3) << "Going to init worker";
-    pslib_ptr_ = std::shared_ptr<paddle::distributed::PSlib>(
-        new paddle::distributed::PSlib());
-    pslib_ptr_->init_worker(dist_desc,
-                            const_cast<uint64_t*>(host_sign_list.data()),
-                            node_num, index);
-    is_initialized_ = true;
-  } else {
-    VLOG(3) << "Worker can be initialized only once";
-  }
-#endif
-}
-
-void FleetWrapper::StopServer() {
-#ifdef PADDLE_WITH_PSLIB
-  VLOG(3) << "Going to stop server";
-  pslib_ptr_->stop_server();
-#endif
-}
-
-uint64_t FleetWrapper::RunServer() {
-#ifdef PADDLE_WITH_PSLIB
-  VLOG(3) << "Going to run server";
-  return pslib_ptr_->run_server();
-#else
-  return 0;
-#endif
-}
-
-void FleetWrapper::GatherServers(const std::vector<uint64_t>& host_sign_list,
-                                 int node_num) {
-#ifdef PADDLE_WITH_PSLIB
-  VLOG(3) << "Going to gather server ips";
-  pslib_ptr_->gather_servers(const_cast<uint64_t*>(host_sign_list.data()),
-                             node_num);
-#endif
-}
-
-void FleetWrapper::GatherClients(const std::vector<uint64_t>& host_sign_list) {
-#ifdef PADDLE_WITH_PSLIB
-  VLOG(3) << "Going to gather client ips";
-  size_t len = host_sign_list.size();
-  pslib_ptr_->gather_clients(const_cast<uint64_t*>(host_sign_list.data()), len);
-#endif
-}
-
-std::vector<uint64_t> FleetWrapper::GetClientsInfo() {
-#ifdef PADDLE_WITH_PSLIB
-  VLOG(3) << "Going to get client info";
-  return pslib_ptr_->get_client_info();
-#endif
-  return std::vector<uint64_t>();
-}
-
-void FleetWrapper::CreateClient2ClientConnection() {
-#ifdef PADDLE_WITH_PSLIB
-  VLOG(3) << "Going to create client2client connection";
-  pslib_ptr_->create_client2client_connection(client2client_request_timeout_ms_,
-                                              client2client_connect_timeout_ms_,
-                                              client2client_max_retry_);
-#endif
-}
-
-void FleetWrapper::PullSparseVarsSync(
-    const Scope& scope, const uint64_t table_id,
-    const std::vector<std::string>& var_names, std::vector<uint64_t>* fea_keys,
-    std::vector<std::vector<float>>* fea_values, int fea_value_dim) {
-#ifdef PADDLE_WITH_PSLIB
-  std::vector<::std::future<int32_t>> pull_sparse_status;
-  pull_sparse_status.resize(0);
-  fea_keys->clear();
-  fea_keys->resize(0);
-  fea_keys->reserve(MAX_FEASIGN_NUM);
-  for (auto name : var_names) {
-    Variable* var = scope.FindVar(name);
-    if (var == nullptr) {
-      continue;
-    }
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
-    int64_t* ids = tensor->data<int64_t>();
-    int len = tensor->numel();
-    for (auto i = 0u; i < len; ++i) {
-      if (ids[i] == 0u) {
-        continue;
-      }
-      fea_keys->push_back(static_cast<uint64_t>(ids[i]));
-    }
-  }
-  fea_values->resize(fea_keys->size() + 1);
-  for (auto& t : *fea_values) {
-    t.resize(fea_value_dim);
-  }
-  std::vector<float*> pull_result_ptr;
-  for (auto& t : *fea_values) {
-    pull_result_ptr.push_back(t.data());
-  }
-  auto status = pslib_ptr_->_worker_ptr->pull_sparse(
-      pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size());
-  pull_sparse_status.push_back(std::move(status));
-  for (auto& t : pull_sparse_status) {
-    t.wait();
-    auto status = t.get();
-    if (status != 0) {
-      LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
-      sleep(sleep_seconds_before_fail_exit_);
-      exit(-1);
-    }
-  }
-#endif
-}
-
-void FleetWrapper::PullDenseVarsAsync(
-    const Scope& scope, const uint64_t tid,
-    const std::vector<std::string>& var_names,
-    std::vector<::std::future<int32_t>>* pull_dense_status) {
-#ifdef PADDLE_WITH_PSLIB
-  auto& regions = _regions[tid];
-  regions.clear();
-  regions.resize(var_names.size());
-  for (auto i = 0u; i < var_names.size(); ++i) {
-    Variable* var = scope.FindVar(var_names[i]);
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    float* w = tensor->data<float>();
-    paddle::ps::Region reg(w, tensor->numel());
-    regions[i] = std::move(reg);
-  }
-  auto status =
-      pslib_ptr_->_worker_ptr->pull_dense(regions.data(), regions.size(), tid);
-  pull_dense_status->push_back(std::move(status));
-#endif
-}
-
-void FleetWrapper::PullDenseVarsSync(
-    const Scope& scope, const uint64_t tid,
-    const std::vector<std::string>& var_names) {
-#ifdef PADDLE_WITH_PSLIB
-  auto& regions = _regions[tid];
-  regions.clear();
-  regions.reserve(var_names.size());
-  for (auto& t : var_names) {
-    Variable* var = scope.FindVar(t);
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    float* w = tensor->data<float>();
-    paddle::ps::Region reg(w, tensor->numel());
-    regions.emplace_back(std::move(reg));
-  }
-  auto status =
-      pslib_ptr_->_worker_ptr->pull_dense(regions.data(), regions.size(), tid);
-  status.wait();
-#endif
-}
-
-void FleetWrapper::PushDenseParamSync(
-    const Scope& scope, const uint64_t table_id,
-    const std::vector<std::string>& var_names) {
-#ifdef PADDLE_WITH_PSLIB
-  auto place = platform::CPUPlace();
-  std::vector<paddle::ps::Region> regions;
-  for (auto& t : var_names) {
-    Variable* var = scope.FindVar(t);
-    CHECK(var != nullptr) << "var[" << t << "] not found";
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    float* g = tensor->mutable_data<float>(place);
-    paddle::ps::Region reg(g, tensor->numel());
-    regions.emplace_back(std::move(reg));
-  }
-  auto push_status = pslib_ptr_->_worker_ptr->push_dense_param(
-      regions.data(), regions.size(), table_id);
-  push_status.wait();
-  auto status = push_status.get();
-  CHECK(status == 0) << "push dense param failed, status[" << status << "]";
-#endif
-}
-
-void FleetWrapper::PushDenseVarsSync(
-    Scope* scope, const uint64_t table_id,
-    const std::vector<std::string>& var_names) {}
-
-void FleetWrapper::PushDenseVarsAsync(
-    const Scope& scope, const uint64_t table_id,
-    const std::vector<std::string>& var_names,
-    std::vector<::std::future<int32_t>>* push_sparse_status,
-    float scale_datanorm, int batch_size) {
-#ifdef PADDLE_WITH_PSLIB
-  std::vector<paddle::ps::Region> regions;
-  for (auto& t : var_names) {
-    Variable* var = scope.FindVar(t);
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    int count = tensor->numel();
-    float* g = tensor->data<float>();
-    if (scale_datanorm >= 0) {
-      if (t.find(".batch_size@GRAD") != std::string::npos ||
-          t.find(".batch_sum@GRAD") != std::string::npos) {
-        Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
-        float scale = 1.0 / batch_size;
-        mat *= scale;
-      } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
-        VLOG(3) << "epsilon: " << scale_datanorm;
-        for (int i = 0; i < count; ++i) {
-          g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
-                 batch_size * scale_datanorm;
-        }
-      }
-    }
-    paddle::ps::Region reg(g, count);
-    regions.emplace_back(std::move(reg));
-  }
-  auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
-                                                    regions.size(), table_id);
-  push_sparse_status->push_back(std::move(status));
-#endif
-}
-
-void FleetWrapper::PushSparseVarsWithLabelAsync(
-    const Scope& scope, const uint64_t table_id,
-    const std::vector<uint64_t>& fea_keys, const std::vector<float>& fea_labels,
-    const std::vector<std::string>& sparse_key_names,
-    const std::vector<std::string>& sparse_grad_names, const int emb_dim,
-    std::vector<std::vector<float>>* push_values,
-    std::vector<::std::future<int32_t>>* push_sparse_status,
-    const int batch_size, const bool use_cvm, const bool dump_slot) {
-#ifdef PADDLE_WITH_PSLIB
-  int offset = 2;
-  int slot_offset = 0;
-  int grad_dim = emb_dim;
-  int show_index = 0;
-  int click_index = 1;
-  if (use_cvm) {
-    offset = 0;
-    grad_dim = emb_dim - 2;
-  }
-  if (dump_slot) {
-    slot_offset = 1;
-    show_index = 1;
-    click_index = 2;
-  }
-  CHECK_GE(grad_dim, 0);
-
-  push_values->resize(fea_keys.size() + 1);
-  for (auto& t : *push_values) {
-    t.resize(emb_dim + offset + slot_offset);
-  }
-  uint64_t fea_idx = 0u;
-  for (size_t i = 0; i < sparse_key_names.size(); ++i) {
-    Variable* var = scope.FindVar(sparse_key_names[i]);
-    if (var == nullptr) {
-      continue;
-    }
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    if (tensor == nullptr) {
-      LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
-      exit(-1);
-    }
-    int len = tensor->numel();
-    int64_t* ids = tensor->data<int64_t>();
-    int slot = 0;
-    if (dump_slot) {
-      slot = boost::lexical_cast<int>(sparse_key_names[i]);
-    }
-    Variable* g_var = scope.FindVar(sparse_grad_names[i]);
-    if (g_var == nullptr) {
-      continue;
-    }
-    LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
-    if (g_tensor == nullptr) {
-      LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
-      exit(-1);
-    }
-    float* g = g_tensor->data<float>();
-
-    if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
-      int dim = emb_dim + offset;
-      Eigen::Map<
-          Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-          g_mat(g, g_tensor->numel() / dim, dim);
-      g_mat.rightCols(grad_dim) *= batch_size;
-    }
-    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
-      if (ids[id_idx] == 0) {
-        g += emb_dim;
-        continue;
-      }
-      CHECK(fea_idx < (*push_values).size());
-      CHECK(fea_idx < fea_labels.size());
-
-      if (use_cvm) {
-        memcpy((*push_values)[fea_idx].data() + offset + slot_offset, g,
-               sizeof(float) * emb_dim);
-      } else {
-        memcpy((*push_values)[fea_idx].data() + offset + slot_offset, g,
-               sizeof(float) * emb_dim);
-        (*push_values)[fea_idx][show_index] = 1.0f;
-        (*push_values)[fea_idx][click_index] =
-            static_cast<float>(fea_labels[fea_idx]);
-      }
-      if (dump_slot) {
-        (*push_values)[fea_idx][0] = static_cast<float>(slot);
-      }
-      g += emb_dim;
-      fea_idx++;
-    }
-  }
-  CHECK(fea_idx == fea_keys.size()) << "fea_idx: " << fea_idx
-                                    << "features size: " << fea_keys.size();
-  std::vector<float*> push_g_vec;
-  for (auto i = 0u; i < fea_keys.size(); ++i) {
-    push_g_vec.push_back((*push_values)[i].data());
-  }
-  auto status = pslib_ptr_->_worker_ptr->push_sparse(
-      table_id, fea_keys.data(), (const float**)push_g_vec.data(),
-      fea_keys.size());
-  push_sparse_status->push_back(std::move(status));
-
-#endif
-}
-
-void FleetWrapper::LoadFromPaddleModel(Scope& scope, const uint64_t table_id,
-                                       std::vector<std::string> var_list,
-                                       std::string model_path,
-                                       std::string model_proto_file,
-                                       std::vector<std::string> table_var_list,
-                                       bool load_combine) {
-#ifdef PADDLE_WITH_PSLIB
-  // load ProgramDesc from model file
-  auto read_proto_func = [](const std::string& filename) -> ProgramDesc {
-    std::string contents;
-    std::ifstream fin(filename, std::ios::in | std::ios::binary);
-    fin.seekg(0, std::ios::end);
-    contents.resize(fin.tellg());
-    fin.seekg(0, std::ios::beg);
-    fin.read(&contents[0], contents.size());
-    fin.close();
-    ProgramDesc program_desc(contents);
-    return program_desc;
-  };
-  const ProgramDesc old_program = read_proto_func(model_proto_file);
-  Scope* old_scope = new Scope();
-  auto& old_block = old_program.Block(0);
-  auto place = platform::CPUPlace();
-  std::vector<std::string> old_param_list;
-
-  for (auto& t : var_list) {
-    VarDesc* old_var_desc = old_block.FindVar(t);
-    if (old_var_desc == nullptr) {
-      continue;
-    }
-    // init variable in scope
-    Variable* old_var = old_scope->Var(old_var_desc->Name());
-    InitializeVariable(old_var, old_var_desc->GetType());
-    old_param_list.push_back(t);
-    if (load_combine) {
-      continue;
-    }
-    // load variable from model
-    paddle::framework::AttributeMap attrs;
-    attrs.insert({"file_path", model_path + "/" + old_var_desc->Name()});
-    auto load_op = paddle::framework::OpRegistry::CreateOp(
-        "load", {}, {{"Out", {old_var_desc->Name()}}}, attrs);
-    load_op->Run(*old_scope, place);
-  }
-
-  if (load_combine) {
-    std::sort(old_param_list.begin(), old_param_list.end());
-    paddle::framework::AttributeMap attrs;
-    attrs.insert({"file_path", model_path});
-    auto load_op = paddle::framework::OpRegistry::CreateOp(
-        "load_combine", {}, {{"Out", old_param_list}}, attrs);
-    load_op->Run(*old_scope, place);
-  }
-
-  for (auto& t : old_param_list) {
-    Variable* old_var = old_scope->Var(t);
-    // old model data, here we assume data type is float
-    LoDTensor* old_tensor = old_var->GetMutable<LoDTensor>();
-    float* old_data = old_tensor->data<float>();
-    // new model data, here we assume data type is float
-    Variable* var = scope.FindVar(t);
-    CHECK(var != nullptr) << "var[" << t << "] not found";
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    float* data = tensor->data<float>();
-    // copy from old data to new data
-    if (old_tensor->numel() > tensor->numel()) {
-      memcpy(data, old_data, tensor->numel() * sizeof(float));
-    } else {
-      memcpy(data, old_data, old_tensor->numel() * sizeof(float));
-    }
-  }
-  delete old_scope;
-  PushDenseParamSync(scope, table_id, table_var_list);
-#endif
-}
-
-void FleetWrapper::LoadModel(const std::string& path, const int mode) {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->load(path, std::to_string(mode));
-  ret.wait();
-  if (ret.get() != 0) {
-    LOG(ERROR) << "load model from path:" << path << " failed";
-    sleep(sleep_seconds_before_fail_exit_);
-    exit(-1);
-  }
-#else
-  VLOG(0) << "FleetWrapper::LoadModel does nothing when no pslib";
-#endif
-}
-
-void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
-                                     const std::string& path, const int mode) {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret =
-      pslib_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode));
-  ret.wait();
-  if (ret.get() != 0) {
-    LOG(ERROR) << "load model of table id: " << table_id
-               << ", from path: " << path << " failed";
-  }
-#else
-  VLOG(0) << "FleetWrapper::LoadModel does nothing when no pslib";
-#endif
-}
-
-void FleetWrapper::SaveModel(const std::string& path, const int mode) {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->save(path, std::to_string(mode));
-  ret.wait();
-  int32_t feasign_cnt = ret.get();
-  if (feasign_cnt == -1) {
-    LOG(ERROR) << "save model failed";
-    sleep(sleep_seconds_before_fail_exit_);
-    exit(-1);
-  }
-#else
-  VLOG(0) << "FleetWrapper::SaveModel does nothing when no pslib";
-#endif
-}
-
-double FleetWrapper::GetCacheThreshold() {
-#ifdef PADDLE_WITH_PSLIB
-  double cache_threshold = 0.0;
-  auto ret = pslib_ptr_->_worker_ptr->flush();
-  ret.wait();
-  ret = pslib_ptr_->_worker_ptr->get_cache_threshold(0, cache_threshold);
-  ret.wait();
-  if (cache_threshold < 0) {
-    LOG(ERROR) << "get cache threshold failed";
-    sleep(sleep_seconds_before_fail_exit_);
-    exit(-1);
-  }
-  return cache_threshold;
-#else
-  VLOG(0) << "FleetWrapper::GetCacheThreshold does nothing when no pslib";
-  return 0.0;
-#endif
-}
-
-void FleetWrapper::CacheShuffle(int table_id, const std::string& path,
-                                const int mode, const double cache_threshold) {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->cache_shuffle(
-      0, path, std::to_string(mode), std::to_string(cache_threshold));
-  ret.wait();
-  int32_t feasign_cnt = ret.get();
-  if (feasign_cnt == -1) {
-    LOG(ERROR) << "cache shuffle failed";
-    sleep(sleep_seconds_before_fail_exit_);
-    exit(-1);
-  }
-#else
-  VLOG(0) << "FleetWrapper::CacheShuffle does nothing when no pslib";
-#endif
-}
-
-int32_t FleetWrapper::SaveCache(int table_id, const std::string& path,
-                                const int mode) {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->save_cache(0, path, std::to_string(mode));
-  ret.wait();
-  int32_t feasign_cnt = ret.get();
-  if (feasign_cnt == -1) {
-    LOG(ERROR) << "table save cache failed";
-    sleep(sleep_seconds_before_fail_exit_);
-    exit(-1);
-  }
-  return feasign_cnt;
-#else
-  VLOG(0) << "FleetWrapper::SaveCache does nothing when no pslib";
-  return -1;
-#endif
-}
-
-void FleetWrapper::ShrinkSparseTable(int table_id) {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->shrink(table_id);
-  ret.wait();
-#else
-  VLOG(0) << "FleetWrapper::ShrinkSparseTable does nothing when no pslib";
-#endif
-}
-
-void FleetWrapper::ClearModel() {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->clear();
-  ret.wait();
-#else
-  VLOG(0) << "FleetWrapper::ClearModel does nothing when no pslib";
-#endif
-}
-
-void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope,
-                                    std::vector<std::string> var_list,
-                                    float decay, int emb_dim) {
-#ifdef PADDLE_WITH_PSLIB
-  std::vector<paddle::ps::Region> regions;
-  for (std::string& name : var_list) {
-    if (name.find("batch_sum") != std::string::npos) {
-      Variable* var = scope->FindVar(name);
-      CHECK(var != nullptr) << "var[" << name << "] not found";
-      VLOG(0) << "prepare shrink dense batch_sum";
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      float* g = tensor->data<float>();
-
-      // show_batch_sum += N * log(decay)
-      std::string size_name = name;
-      size_name.replace(size_name.find("batch_sum"), size_name.length(),
-                        "batch_size");
-      Variable* var_size = scope->FindVar(size_name);
-      CHECK(var_size != nullptr) << "var[" << size_name << "] not found";
-      VLOG(3) << "shrink dense batch_sum: " << name << ", " << size_name;
-      float* g_size = var_size->GetMutable<LoDTensor>()->data<float>();
-
-      for (int k = 0; k < tensor->numel(); k += emb_dim) {
-        g[k] = g[k] + g_size[k] * log(decay);
-      }
-      paddle::ps::Region reg(g, tensor->numel());
-      regions.emplace_back(std::move(reg));
-    } else {
-      Variable* var = scope->FindVar(name);
-      CHECK(var != nullptr) << "var[" << name << "] not found";
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      float* g = tensor->data<float>();
-      paddle::ps::Region reg(g, tensor->numel());
-      regions.emplace_back(std::move(reg));
-    }
-  }
-  auto push_status = pslib_ptr_->_worker_ptr->push_dense_param(
-      regions.data(), regions.size(), table_id);
-  push_status.wait();
-  auto status = push_status.get();
-  if (status != 0) {
-    LOG(FATAL) << "push shrink dense param failed, status[" << status << "]";
-    sleep(sleep_seconds_before_fail_exit_);
-    exit(-1);
-  }
-#else
-  VLOG(0) << "FleetWrapper::ShrinkSparseTable does nothing when no pslib";
-#endif
-}
-
-void FleetWrapper::ClientFlush() {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->flush();
-  ret.wait();
-#else
-  VLOG(0) << "FleetWrapper::ServerFlush does nothing when no pslib";
-#endif
-}
-
-int FleetWrapper::RegisterClientToClientMsgHandler(int msg_type,
-                                                   MsgHandlerFunc handler) {
-#ifdef PADDLE_WITH_PSLIB
-  VLOG(3) << "calling FleetWrapper::RegisterClientToClientMsgHandler";
-  VLOG(3) << "pslib_ptr_=" << pslib_ptr_;
-  VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr;
-  return pslib_ptr_->_worker_ptr->registe_client2client_msg_handler(msg_type,
-                                                                    handler);
-#else
-  VLOG(0) << "FleetWrapper::RegisterClientToClientMsgHandler"
-          << " does nothing when no pslib";
-#endif
-  return 0;
-}
-
-std::future<int32_t> FleetWrapper::SendClientToClientMsg(
-    int msg_type, int to_client_id, const std::string& msg) {
-#ifdef PADDLE_WITH_PSLIB
-  return pslib_ptr_->_worker_ptr->send_client2client_msg(msg_type, to_client_id,
-                                                         msg);
-#else
-  VLOG(0) << "FleetWrapper::SendClientToClientMsg"
-          << " does nothing when no pslib";
-#endif
-  return std::future<int32_t>();
-}
-
-template <typename T>
-void FleetWrapper::Serialize(const std::vector<T*>& t, std::string* str) {
-#ifdef PADDLE_WITH_PSLIB
-  paddle::ps::BinaryArchive ar;
-  for (size_t i = 0; i < t.size(); ++i) {
-    ar << *(t[i]);
-  }
-  *str = std::string(ar.buffer(), ar.length());
-#else
-  VLOG(0) << "FleetWrapper::Serialize does nothing when no pslib";
-#endif
-}
-
-template <typename T>
-void FleetWrapper::Deserialize(std::vector<T>* t, const std::string& str) {
-#ifdef PADDLE_WITH_PSLIB
-  if (str.length() == 0) {
-    return;
-  }
-  paddle::ps::BinaryArchive ar;
-  ar.set_read_buffer(const_cast<char*>(str.c_str()), str.length(), nullptr);
-  if (ar.cursor() == ar.finish()) {
-    return;
-  }
-  while (ar.cursor() < ar.finish()) {
-    t->push_back(ar.get<T>());
-  }
-  CHECK(ar.cursor() == ar.finish());
-  VLOG(3) << "Deserialize size " << t->size();
-#else
-  VLOG(0) << "FleetWrapper::Deserialize does nothing when no pslib";
-#endif
-}
-
-std::default_random_engine& FleetWrapper::LocalRandomEngine() {
-  struct engine_wrapper_t {
-    std::default_random_engine engine;
-#ifdef PADDLE_WITH_PSLIB
-    engine_wrapper_t() {
-      struct timespec tp;
-      clock_gettime(CLOCK_REALTIME, &tp);
-      double cur_time = tp.tv_sec + tp.tv_nsec * 1e-9;
-      static std::atomic<uint64_t> x(0);
-      std::seed_seq sseq = {x++, x++, x++, (uint64_t)(cur_time * 1000)};
-      engine.seed(sseq);
-    }
-#endif
-  };
-  thread_local engine_wrapper_t r;
-  return r.engine;
-}
-
-template void FleetWrapper::Serialize<std::vector<MultiSlotType>>(
-    const std::vector<std::vector<MultiSlotType>*>&, std::string*);
-template void FleetWrapper::Deserialize<std::vector<MultiSlotType>>(
-    std::vector<std::vector<MultiSlotType>>*, const std::string&);
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
deleted file mode 100644
index 4aa626340d4af44f95f6d15850f47d9d55fffb79..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#ifdef PADDLE_WITH_PSLIB
-#include <archive.h>
-#include <pslib.h>
-#endif
-#include <atomic>
-#include <ctime>
-#include <map>
-#include <random>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
-
-namespace paddle {
-namespace framework {
-
-// A wrapper class for pslib.h, this class follows Singleton pattern
-// i.e. only initialized once in the current process
-// Example:
-//    std::shared_ptr<FleetWrapper> fleet_ptr =
-//         FleetWrapper::GetInstance();
-//    string dist_desc;
-//    fleet_ptr->InitServer(dist_desc, 0);
-// interface design principles:
-// Pull
-//   Sync: PullSparseVarsSync
-//   Async: PullSparseVarsAsync(not implemented currently)
-// Push
-//   Sync: PushSparseVarsSync
-//   Async: PushSparseVarsAsync(not implemented currently)
-//   Async: PushSparseVarsWithLabelAsync(with special usage)
-// Push dense variables to server in Async mode
-// Param<in>: scope, table_id, var_names
-// Param<out>: push_sparse_status
-
-class FleetWrapper {
- public:
-  virtual ~FleetWrapper() {}
-  FleetWrapper() {
-    scale_sparse_gradient_with_batch_size_ = true;
-    // trainer sleep some time for pslib core dump
-    sleep_seconds_before_fail_exit_ = 300;
-    // pslib request server timeout ms
-    client2client_request_timeout_ms_ = 500000;
-    // pslib connect server timeout_ms
-    client2client_connect_timeout_ms_ = 10000;
-    // pslib request max retry
-    client2client_max_retry_ = 3;
-  }
-
-  void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms,
-                              int max_retry);
-
-  // Pull sparse variables from server in Sync mode
-  // Param<in>: scope, table_id, var_names, fea_keys
-  // Param<out>: fea_values
-  void PullSparseVarsSync(const Scope& scope, const uint64_t table_id,
-                          const std::vector<std::string>& var_names,
-                          std::vector<uint64_t>* fea_keys,
-                          std::vector<std::vector<float>>* fea_values,
-                          int fea_dim);
-
-  void PullDenseVarsSync(const Scope& scope, const uint64_t table_id,
-                         const std::vector<std::string>& var_names);
-
-  void PullDenseVarsAsync(
-      const Scope& scope, const uint64_t table_id,
-      const std::vector<std::string>& var_names,
-      std::vector<::std::future<int32_t>>* pull_dense_status);
-
-  void PushDenseParamSync(const Scope& scope, const uint64_t table_id,
-                          const std::vector<std::string>& var_names);
-
-  // Push dense variables to server in async mode
-  // Param<in>: scope, table_id, var_names,
-  // Param<out>: push_sparse_status
-  void PushDenseVarsAsync(
-      const Scope& scope, const uint64_t table_id,
-      const std::vector<std::string>& var_names,
-      std::vector<::std::future<int32_t>>* push_sparse_status,
-      float scale_datanorm, int batch_size);
-
-  void PushDenseVarsSync(Scope* scope, const uint64_t table_id,
-                         const std::vector<std::string>& var_names);
-
-  // Push sparse variables with labels to server in Async mode
-  // This is specially designed for click/show stats in server
-  // Param<in>: scope, table_id, var_grad_names,
-  //            fea_keys, fea_labels, sparse_grad_names
-  // Param<out>: push_values, push_sparse_status
-  void PushSparseVarsWithLabelAsync(
-      const Scope& scope, const uint64_t table_id,
-      const std::vector<uint64_t>& fea_keys,
-      const std::vector<float>& fea_labels,
-      const std::vector<std::string>& sparse_key_names,
-      const std::vector<std::string>& sparse_grad_names, const int emb_dim,
-      std::vector<std::vector<float>>* push_values,
-      std::vector<::std::future<int32_t>>* push_sparse_status,
-      const int batch_size, const bool use_cvm, const bool dump_slot);
-
-  // Push sparse variables to server in Async mode
-  // Param<In>: scope, table_id, fea_keys, sparse_grad_names
-  // Param<Out>: push_values, push_sparse_status
-  /*
-  void PushSparseVarsAsync(
-          const Scope& scope,
-          const uint64_t table_id,
-          const std::vector<uint64_t>& fea_keys,
-          const std::vector<std::string>& sparse_grad_names,
-          std::vector<std::vector<float>>* push_values,
-          std::vector<::std::future<int32_t>>* push_sparse_status);
-  */
-
-  void InitServer(const std::string& dist_desc, int index);
-  void InitWorker(const std::string& dist_desc,
-                  const std::vector<uint64_t>& host_sign_list, int node_num,
-                  int index);
-  void StopServer();
-  uint64_t RunServer();
-  void GatherServers(const std::vector<uint64_t>& host_sign_list, int node_num);
-  // gather client ip
-  void GatherClients(const std::vector<uint64_t>& host_sign_list);
-  // get client info
-  std::vector<uint64_t> GetClientsInfo();
-  // create client to client connection
-  void CreateClient2ClientConnection();
-
-  // flush all push requests
-  void ClientFlush();
-  // load from paddle model
-  void LoadFromPaddleModel(Scope& scope, const uint64_t table_id,  // NOLINT
-                           std::vector<std::string> var_list,
-                           std::string model_path, std::string model_proto_file,
-                           std::vector<std::string> table_var_list,
-                           bool load_combine);
-  // mode = 0, load all feature
-  // mode = 1, laod delta feature, which means load diff
-  void LoadModel(const std::string& path, const int mode);
-  // mode = 0, load all feature
-  // mode = 1, laod delta feature, which means load diff
-  void LoadModelOneTable(const uint64_t table_id, const std::string& path,
-                         const int mode);
-  // mode = 0, save all feature
-  // mode = 1, save delta feature, which means save diff
-  void SaveModel(const std::string& path, const int mode);
-
-  double GetCacheThreshold();
-  void CacheShuffle(int table_id, const std::string& path, const int mode,
-                    const double cache_threshold);
-  int32_t SaveCache(int table_id, const std::string& path, const int mode);
-
-  void ClearModel();
-
-  void ShrinkSparseTable(int table_id);
-  void ShrinkDenseTable(int table_id, Scope* scope,
-                        std::vector<std::string> var_list, float decay,
-                        int emb_dim);
-
-  // register client to client communication
-  typedef std::function<int32_t(int, int, const std::string&)> MsgHandlerFunc;
-  int RegisterClientToClientMsgHandler(int msg_type, MsgHandlerFunc handler);
-  // send client to client message
-  std::future<int32_t> SendClientToClientMsg(int msg_type, int to_client_id,
-                                             const std::string& msg);
-
-  template <typename T>
-  void Serialize(const std::vector<T*>& t, std::string* str);
-  template <typename T>
-  void Deserialize(std::vector<T>* t, const std::string& str);
-  static std::shared_ptr<FleetWrapper> GetInstance() {
-    if (NULL == s_instance_) {
-      s_instance_.reset(new paddle::framework::FleetWrapper());
-    }
-    return s_instance_;
-  }
-
-  // this performs better than rand_r, especially large data
-  std::default_random_engine& LocalRandomEngine();
-
-#ifdef PADDLE_WITH_PSLIB
-  static std::shared_ptr<paddle::distributed::PSlib> pslib_ptr_;
-#endif
-
- private:
-  static std::shared_ptr<FleetWrapper> s_instance_;
-#ifdef PADDLE_WITH_PSLIB
-  std::map<uint64_t, std::vector<paddle::ps::Region>> _regions;
-#endif
-
- protected:
-  static bool is_initialized_;
-  bool scale_sparse_gradient_with_batch_size_;
-  int32_t sleep_seconds_before_fail_exit_;
-  int client2client_request_timeout_ms_;
-  int client2client_connect_timeout_ms_;
-  int client2client_max_retry_;
-  DISABLE_COPY_AND_ASSIGN(FleetWrapper);
-};
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc
deleted file mode 100644
index 38c75b1df5a79bdd1a866480c3f12f953d26ad76..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/fleet/nccl_wrapper.h"
-#include <utility>
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-
-std::shared_ptr<NCCLWrapper> NCCLWrapper::s_instance_ = NULL;
-bool NCCLWrapper::is_initialized_ = false;
-
-void NCCLWrapper::InitNCCL() {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
-      &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
-      nccl_info_.my_global_rank_));
-#endif
-  return;
-}
-
-void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  nccl_info_.nccl_id_ = nccl_info.nccl_id_;
-#endif
-  return;
-}
-
-NCCLInfo NCCLWrapper::GetNCCLId() {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
-#endif
-  return nccl_info_;
-}
-
-void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
-                              const int ranks) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  nccl_info_.local_rank_ = local_rank;
-  nccl_info_.my_global_rank_ = global_rank;
-  nccl_info_.global_ranks_ = ranks;
-  PADDLE_ENFORCE(cudaSetDevice(local_rank));
-  PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_)));
-#endif
-  return;
-}
-
-void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
-                          const std::vector<std::string>& var_names) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  for (auto& name : var_names) {
-    auto var = scope.FindVar(name);
-    LoDTensor* tensor = var->GetMutable<LoDTensor>();
-    int32_t total_size = tensor->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclBcast(
-        reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
-        root_rank, nccl_info_.comm_, nccl_info_.stream_));
-    cudaStreamSynchronize(nccl_info_.stream_);
-  }
-#endif
-  return;
-}
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h
deleted file mode 100644
index 84354308ea31a0ede9d16a95033346aefe587aa2..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/fleet/nccl_wrapper.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-#include <ctime>
-#include <map>
-#include <memory>
-#include <random>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/dynload/nccl.h"
-#endif
-#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
-
-namespace paddle {
-namespace framework {
-
-class NCCLInfo {
- public:
-  NCCLInfo() {}
-  virtual ~NCCLInfo() {}
-
- public:
-  int local_rank_;
-  int global_ranks_;
-  int my_global_rank_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  ncclUniqueId nccl_id_;
-  ncclComm_t comm_;
-  cudaStream_t stream_;
-#endif
-};
-
-class NCCLWrapper {
- public:
-  virtual ~NCCLWrapper() {}
-  NCCLWrapper() {}
-
-  void InitNCCL();
-  void SetNCCLId(const NCCLInfo& nccl_info);
-  NCCLInfo GetNCCLId();
-  void SetRankInfo(const int local_rank, const int global_rank,
-                   const int ranks);
-  void SyncVar(const int root_rank, const Scope& scope,
-               const std::vector<std::string>& var_names);
-
-  static std::shared_ptr<NCCLWrapper> GetInstance() {
-    if (NULL == s_instance_) {
-      s_instance_.reset(new paddle::framework::NCCLWrapper());
-    }
-    return s_instance_;
-  }
-
- public:
-  NCCLInfo nccl_info_;
-
- private:
-  static std::shared_ptr<NCCLWrapper> s_instance_;
-
- protected:
-  static bool is_initialized_;
-  DISABLE_COPY_AND_ASSIGN(NCCLWrapper);
-};
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
deleted file mode 100644
index efdabffb9b33ddf007c13008d0f3afb7a3961eda..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/framework.proto
+++ /dev/null
@@ -1,188 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto2";
-option optimize_for = LITE_RUNTIME;
-package paddle.framework.proto;
-
-// Any incompatible changes to ProgramDesc and its dependencies should
-// raise the version defined version.h.
-//
-// Serailization and Deserialization codes should be modified in a way
-// that supports old versions following the version and compatibility policy.
-message Version { optional int64 version = 1 [ default = 0 ]; }
-
-enum AttrType {
-  INT = 0;
-  FLOAT = 1;
-  STRING = 2;
-  INTS = 3;
-  FLOATS = 4;
-  STRINGS = 5;
-  BOOLEAN = 6;
-  BOOLEANS = 7;
-  BLOCK = 8;
-  LONG = 9;
-  BLOCKS = 10;
-  LONGS = 11;
-}
-
-// OpDesc describes an instance of a C++ framework::OperatorBase
-// derived class type.
-message OpDesc {
-
-  message Attr {
-    required string name = 1;
-    required AttrType type = 2;
-    optional int32 i = 3;
-    optional float f = 4;
-    optional string s = 5;
-    repeated int32 ints = 6;
-    repeated float floats = 7;
-    repeated string strings = 8;
-    optional bool b = 10;
-    repeated bool bools = 11;
-    optional int32 block_idx = 12;
-    optional int64 l = 13;
-    repeated int32 blocks_idx = 14;
-    repeated int64 longs = 15;
-  };
-
-  message Var {
-    required string parameter = 1;
-    repeated string arguments = 2;
-  };
-
-  required string type = 3;
-  repeated Var inputs = 1;
-  repeated Var outputs = 2;
-  repeated Attr attrs = 4;
-  optional bool is_target = 5 [ default = false ];
-};
-
-// OpProto describes a C++ framework::OperatorBase derived class.
-message OpProto {
-
-  // VarProto describes the C++ type framework::Variable.
-  message Var {
-    required string name = 1;
-    required string comment = 2;
-
-    optional bool duplicable = 3 [ default = false ];
-    optional bool intermediate = 4 [ default = false ];
-    optional bool dispensable = 5 [ default = false ];
-  }
-
-  // AttrProto describes the C++ type Attribute.
-  message Attr {
-    required string name = 1;
-    required AttrType type = 2;
-    required string comment = 3;
-    // If that attribute is generated, it means the Paddle third
-    // language binding has responsibility to fill that
-    // attribute. End-User should not set that attribute.
-    optional bool generated = 4 [ default = false ];
-  }
-
-  required string type = 1;
-  repeated Var inputs = 2;
-  repeated Var outputs = 3;
-  repeated Attr attrs = 4;
-  required string comment = 5;
-}
-
-message VarType {
-  enum Type {
-    // Pod Types
-    BOOL = 0;
-    INT16 = 1;
-    INT32 = 2;
-    INT64 = 3;
-    FP16 = 4;
-    FP32 = 5;
-    FP64 = 6;
-    // Tensor<size_t> is used in C++.
-    SIZE_T = 19;
-    UINT8 = 20;
-    INT8 = 21;
-
-    // Other types that may need additional descriptions
-    LOD_TENSOR = 7;
-    SELECTED_ROWS = 8;
-    FEED_MINIBATCH = 9;
-    FETCH_LIST = 10;
-    STEP_SCOPES = 11;
-    LOD_RANK_TABLE = 12;
-    LOD_TENSOR_ARRAY = 13;
-    PLACE_LIST = 14;
-    READER = 15;
-    // Any runtime decided variable type is raw
-    // raw variables should manage their own allocations
-    // in operators like nccl_op
-    RAW = 17;
-    TUPLE = 18;
-  }
-
-  required Type type = 1;
-
-  message TensorDesc {
-    // Should only be PODType. Is enforced in C++
-    required Type data_type = 1;
-    repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-  }
-  optional TensorDesc selected_rows = 2;
-
-  message LoDTensorDesc {
-    required TensorDesc tensor = 1;
-    optional int32 lod_level = 2 [ default = 0 ];
-  }
-  optional LoDTensorDesc lod_tensor = 3;
-
-  message LoDTensorArrayDesc {
-    required TensorDesc tensor = 1;
-    optional int32 lod_level = 2 [ default = 0 ];
-  }
-  optional LoDTensorArrayDesc tensor_array = 4;
-
-  message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
-  optional ReaderDesc reader = 5;
-
-  message Tuple { repeated Type element_type = 1; }
-  optional Tuple tuple = 7;
-}
-
-message VarDesc {
-  required string name = 1;
-  required VarType type = 2;
-  optional bool persistable = 3 [ default = false ];
-}
-
-message BlockDesc {
-  required int32 idx = 1;
-  required int32 parent_idx = 2;
-  repeated VarDesc vars = 3;
-  repeated OpDesc ops = 4;
-  optional int32 forward_block_idx = 5 [ default = -1 ];
-}
-
-// Please refer to
-// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
-// for more details.
-// TODO(panyx0718): A model can have multiple programs. Need a
-// way to distinguish them. Maybe ID or name?
-message ProgramDesc {
-  repeated BlockDesc blocks = 1;
-
-  optional Version version = 2;
-}
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
deleted file mode 100644
index f100dc6349f58260ed6c501da6148efe50437fee..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/garbage_collector.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <deque>
-#include <functional>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <utility>
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#endif
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-
-DECLARE_double(eager_delete_tensor_gb);
-DECLARE_double(memory_fraction_of_eager_deletion);
-DECLARE_bool(fast_eager_deletion_mode);
-
-namespace paddle {
-namespace framework {
-
-GarbageCollector::GarbageCollector(const platform::Place &place,
-                                   size_t max_memory_size)
-    : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
-  garbages_.reset(new GarbageQueue());
-  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
-  if (max_memory_size_ > 1) {
-    mutex_.reset(new std::mutex());
-  }
-}
-
-CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
-                                         size_t max_memory_size)
-    : GarbageCollector(place, max_memory_size) {}
-
-void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
-  callback();
-}
-
-#ifdef PADDLE_WITH_CUDA
-UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
-    const platform::CUDAPlace &place, size_t max_memory_size)
-    : GarbageCollector(place, max_memory_size) {}
-
-void UnsafeFastGPUGarbageCollector::ClearCallback(
-    const std::function<void()> &callback) {
-  callback();
-}
-
-DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
-    const platform::CUDAPlace &place, size_t max_memory_size)
-    : GarbageCollector(place, max_memory_size) {}
-
-void DefaultStreamGarbageCollector::Wait() const {
-  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-      ->WaitStreamCallback();
-}
-
-void DefaultStreamGarbageCollector::ClearCallback(
-    const std::function<void()> &callback) {
-  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-      ->AddStreamCallback(callback);
-}
-
-StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
-                                               size_t max_memory_size)
-    : GarbageCollector(place, max_memory_size) {
-  platform::CUDADeviceGuard guard(place.device);
-  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
-  callback_manager_.reset(new platform::StreamCallbackManager(stream_));
-}
-
-StreamGarbageCollector::~StreamGarbageCollector() {
-  auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
-  platform::CUDADeviceGuard guard(place.device);
-  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
-}
-
-cudaStream_t StreamGarbageCollector::stream() const { return stream_; }
-
-void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
-
-void StreamGarbageCollector::ClearCallback(
-    const std::function<void()> &callback) {
-  callback_manager_->AddCallback(callback);
-}
-#endif
-
-int64_t GetEagerDeletionThreshold() {
-  return FLAGS_eager_delete_tensor_gb < 0
-             ? -1
-             : static_cast<int64_t>(FLAGS_eager_delete_tensor_gb *
-                                    (static_cast<int64_t>(1) << 30));
-}
-
-bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
-
-void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode) {
-  FLAGS_eager_delete_tensor_gb = threshold;
-  FLAGS_memory_fraction_of_eager_deletion = fraction;
-  FLAGS_fast_eager_deletion_mode = fast_mode;
-}
-
-double GetEagerDeletionMemoryFraction() {
-  return FLAGS_memory_fraction_of_eager_deletion;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
deleted file mode 100644
index 610339520db540f5b6ca6caf9d37634b0a236e5f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/garbage_collector.h
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <deque>
-#include <functional>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <utility>
-#include "gflags/gflags.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-
-class GarbageCollector {
- public:
-  using GarbageQueue = std::deque<std::shared_ptr<memory::Allocation>>;
-
-  GarbageCollector(const platform::Place &place, size_t max_memory_size);
-
-  virtual ~GarbageCollector() = default;
-
-  virtual void Wait() const {}
-
-  template <typename Container>
-  void Add(Container &&objs);
-
-  template <typename Container, typename Callback>
-  void Add(Container &&objs, Callback &&callback);
-
- protected:
-  virtual void ClearCallback(const std::function<void()> &callback) = 0;
-
-  platform::DeviceContext *dev_ctx_;
-  std::unique_ptr<GarbageQueue> garbages_;
-  mutable std::unique_ptr<std::mutex> mutex_;
-  const size_t max_memory_size_;
-  size_t cur_memory_size_{0};
-};
-
-class CPUGarbageCollector : public GarbageCollector {
- public:
-  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size);
-
- protected:
-  void ClearCallback(const std::function<void()> &callback) override;
-};
-
-#ifdef PADDLE_WITH_CUDA
-class UnsafeFastGPUGarbageCollector : public GarbageCollector {
- public:
-  UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
-                                size_t max_memory_size);
-
- protected:
-  void ClearCallback(const std::function<void()> &callback) override;
-};
-
-class DefaultStreamGarbageCollector : public GarbageCollector {
- public:
-  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
-                                size_t max_memory_size);
-
-  void Wait() const override;
-
- protected:
-  void ClearCallback(const std::function<void()> &callback) override;
-};
-
-class StreamGarbageCollector : public GarbageCollector {
- public:
-  StreamGarbageCollector(const platform::CUDAPlace &place,
-                         size_t max_memory_size);
-
-  ~StreamGarbageCollector();
-
-  void Wait() const override;
-
-  cudaStream_t stream() const;
-
- protected:
-  void ClearCallback(const std::function<void()> &callback) override;
-
- private:
-  cudaStream_t stream_;
-  std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
-};
-#endif
-
-template <typename Container>
-void GarbageCollector::Add(Container &&objs) {
-  Add(std::forward<Container>(objs), []() {});
-}
-
-template <typename Container, typename Callback>
-void GarbageCollector::Add(Container &&objs, Callback &&callback) {
-  // Special case when FLAGS_eager_delete_tensor_gb=0.0
-  // It speeds up GC about 2~3%.
-  if (max_memory_size_ <= 1) {
-    callback();
-    auto *container = new Container(std::move(objs));
-    ClearCallback([container] { delete container; });
-    return;
-  }
-
-  GarbageQueue *garbage_queue = nullptr;
-  {
-    std::lock_guard<std::mutex> guard(*mutex_);
-    for (auto &obj : objs) {
-      if (!obj) continue;
-      cur_memory_size_ += obj->size();
-      garbages_->push_back(std::move(obj));
-    }
-    if (cur_memory_size_ >= max_memory_size_) {
-      cur_memory_size_ = 0;
-      garbage_queue = garbages_.release();
-      garbages_.reset(new GarbageQueue());
-    }
-  }
-
-  if (garbage_queue) {
-    callback();
-    ClearCallback([garbage_queue]() { delete garbage_queue; });
-  }
-}
-
-int64_t GetEagerDeletionThreshold();
-bool IsFastEagerDeletionModeEnabled();
-
-void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode);
-
-double GetEagerDeletionMemoryFraction();
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
deleted file mode 100644
index 25a64b69ae8b459d6daefb502e9fba84b5bcf3ba..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace framework {
-
-/*
-  This functor class is responsible for creating the gradient ops for the given
-  operator fwd_op. After it is called (through operator()), the pairs of
-  (gradient variable, corresponding input variable of fwd_op) will be added to
-  grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its
-  gradient varialbe will be ignored or kEmptyVarName depending on the template
-  argument DropEmptyIG in the derived classes.
- */
-class GradOpDescMakerBase {
- public:
-  explicit GradOpDescMakerBase(
-      const OpDesc& fwd_op, const std::unordered_set<std::string>& no_grad_set,
-      std::unordered_map<std::string, std::string>* grad_to_var,
-      const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>())
-      : fwd_op_(fwd_op),
-        no_grad_set_(no_grad_set),
-        grad_to_var_(grad_to_var),
-        grad_block_(grad_block) {}
-
-  virtual ~GradOpDescMakerBase() = default;
-  virtual std::vector<std::unique_ptr<OpDesc>> operator()() const = 0;
-
- protected:
-  std::vector<std::string> InputGrad(const std::string& name,
-                                     bool drop_empty_grad = true) const {
-    std::vector<std::string> ret_val;
-    auto var_names = this->Input(name);
-    ret_val.reserve(var_names.size());
-    std::transform(var_names.begin(), var_names.end(),
-                   std::back_inserter(ret_val),
-                   [this](const std::string& fwd_var_name) -> std::string {
-                     auto g_name = GradVarName(fwd_var_name);
-                     if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) {
-                       (*this->grad_to_var_)[g_name] = fwd_var_name;
-                       return g_name;
-                     } else {
-                       return kEmptyVarName;
-                     }
-                   });
-    if (!drop_empty_grad) {
-      return ret_val;
-    }
-    PADDLE_ENFORCE_LE(var_names.size(), 1UL,
-                      "BUG from operator developer:"
-                      " for input argument with a list of variables, "
-                      " drop_empty_grad is not allowed because it makes"
-                      " the correspondence bewteen a variable and its gradient"
-                      " ambiguous."
-                      " Op type %s",
-                      fwd_op_.Type());
-
-    std::vector<std::string> dropped_ret_val;
-    dropped_ret_val.reserve(ret_val.size());
-    std::copy_if(ret_val.begin(), ret_val.end(),
-                 std::back_inserter(dropped_ret_val),
-                 [](const std::string& str) { return str != kEmptyVarName; });
-    return dropped_ret_val;
-  }
-
-  std::vector<std::string> OutputGrad(const std::string& name) const {
-    std::vector<std::string> ret_val;
-    auto onames = this->Output(name);
-    ret_val.reserve(onames.size());
-    std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val),
-                   [this](const std::string& fwd_var_name) -> std::string {
-                     auto g_name = GradVarName(fwd_var_name);
-                     (*this->grad_to_var_)[g_name] = fwd_var_name;
-                     return g_name;
-                   });
-    return ret_val;
-  }
-
-  std::vector<std::string> InputNames() const {
-    return this->fwd_op_.InputNames();
-  }
-
-  std::vector<std::string> OutputNames() const {
-    return this->fwd_op_.OutputNames();
-  }
-
-  std::vector<std::string> Input(const std::string& name) const {
-    return fwd_op_.Input(name);
-  }
-
-  std::vector<std::string> Output(const std::string& name) const {
-    return fwd_op_.Output(name);
-  }
-
-  const std::unordered_map<std::string, Attribute>& Attrs() const {
-    return fwd_op_.GetAttrMap();
-  }
-
-  const Attribute& GetAttr(const std::string& name) const {
-    auto& map = fwd_op_.GetAttrMap();
-    auto it = map.find(name);
-    PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name);
-    return it->second;
-  }
-
-  template <typename T>
-  inline const T& Attr(const std::string& name) const {
-    return boost::get<T>(GetAttr(name));
-  }
-
-  std::string ForwardOpType() const { return this->fwd_op_.Type(); }
-
- protected:
-  const OpDesc& ForwardOp() const { return fwd_op_; }
-
- private:
-  const OpDesc& fwd_op_;
-  const std::unordered_set<std::string>& no_grad_set_;
-  std::unordered_map<std::string, std::string>* grad_to_var_;
-
- protected:
-  std::vector<BlockDesc*> grad_block_;
-};
-
-class SingleGradOpDescMaker : public GradOpDescMakerBase {
- public:
-  using GradOpDescMakerBase::GradOpDescMakerBase;
-
-  std::vector<std::unique_ptr<OpDesc>> operator()() const final {
-    std::vector<std::unique_ptr<OpDesc>> retv;
-    retv.emplace_back(this->Apply());
-    return retv;
-  }
-
- protected:
-  virtual std::unique_ptr<OpDesc> Apply() const = 0;
-};
-
-template <bool DropEmptyIG = true>
-class DefaultGradOpDescMaker final : public SingleGradOpDescMaker {
- public:
-  using SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<OpDesc> Apply() const final {
-    auto* grad = new OpDesc();
-    grad->SetType(this->ForwardOpType() + "_grad");
-
-    for (auto& input_param : this->InputNames()) {
-      grad->SetInput(input_param, this->Input(input_param));
-      grad->SetOutput(GradVarName(input_param),
-                      this->InputGrad(input_param, DropEmptyIG));
-    }
-
-    for (auto& output_param : this->OutputNames()) {
-      grad->SetInput(output_param, this->Output(output_param));
-      grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param));
-    }
-
-    grad->SetAttrMap(this->Attrs());
-
-    return std::unique_ptr<OpDesc>(grad);
-  }
-};
-
-class EmptyGradOpMaker final : public GradOpDescMakerBase {
- public:
-  using GradOpDescMakerBase::GradOpDescMakerBase;
-  std::vector<std::unique_ptr<OpDesc>> operator()() const final { return {}; }
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc
deleted file mode 100644
index 4aaf2569eb4eed72fc521d3861077d0b3653e625..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/hogwild_worker.cc
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/lodtensor_printer.h"
-
-namespace paddle {
-namespace framework {
-
-void HogwildWorker::Initialize(const TrainerDesc &desc) {
-  fetch_config_ = desc.fetch_config();
-  param_ = desc.hogwild_param();
-  skip_ops_.resize(param_.skip_ops_size());
-  for (int i = 0; i < param_.skip_ops_size(); ++i) {
-    skip_ops_[i] = param_.skip_ops(i);
-  }
-  use_cvm_ = desc.use_cvm();
-}
-
-void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) {
-  auto &block = program.Block(0);
-  op_names_.clear();
-  for (auto &op_desc : block.AllOps()) {
-    std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
-    op_names_.push_back(op_desc->Type());
-    OperatorBase *local_op_ptr = local_op.release();
-    ops_.push_back(local_op_ptr);
-    continue;
-  }
-}
-
-void HogwildWorker::CreateThreadScope(const ProgramDesc &program) {
-  auto &block = program.Block(0);
-
-  PADDLE_ENFORCE_NOT_NULL(
-      root_scope_, "root_scope should be set before creating thread scope");
-
-  thread_scope_ = &root_scope_->NewScope();
-
-  for (auto &var : block.AllVars()) {
-    if (var->Persistable()) {
-      auto *ptr = root_scope_->Var(var->Name());
-      InitializeVariable(ptr, var->GetType());
-      if (stat_var_name_map_.find(var->Name()) != stat_var_name_map_.end() &&
-          thread_id_ != 0) {
-        int tensor_dim =
-            root_scope_->FindVar(var->Name())->GetMutable<LoDTensor>()->numel();
-        auto *ptr1 = thread_scope_->Var(var->Name());
-        InitializeVariable(ptr1, var->GetType());
-        LoDTensor *thread_tensor = ptr1->GetMutable<LoDTensor>();
-        LoDTensor *root_tensor =
-            root_scope_->FindVar(var->Name())->GetMutable<LoDTensor>();
-#define MemsetCallback(cpp_type, proto_type)                     \
-  do {                                                           \
-    if (root_tensor->type() == proto_type) {                     \
-      SetZero<cpp_type>(thread_tensor, root_tensor, tensor_dim); \
-    }                                                            \
-  } while (0)
-        _ForEachDataType_(MemsetCallback);
-      }
-    } else {
-      auto *ptr = thread_scope_->Var(var->Name());
-      InitializeVariable(ptr, var->GetType());
-    }
-  }
-}
-
-template <typename T>
-void HogwildWorker::SetZero(LoDTensor *tensor, LoDTensor *root_tensor,
-                            int tensor_dim) {
-  T *ptr = tensor->mutable_data<T>(root_tensor->dims(), platform::CPUPlace());
-  memset(ptr, 0, sizeof(T) * tensor_dim);
-}
-
-void HogwildWorker::BindingDataFeedMemory() {
-  const std::vector<std::string> &input_feed =
-      device_reader_->GetUseSlotAlias();
-  for (auto name : input_feed) {
-    device_reader_->AddFeedVar(thread_scope_->FindVar(name), name);
-  }
-}
-
-void HogwildWorker::CreateDeviceResource(const ProgramDesc &main_prog) {
-  CreateThreadScope(main_prog);
-  CreateThreadOperators(main_prog);
-}
-
-void HogwildWorker::TrainFilesWithProfiler() {
-  platform::SetNumThreads(1);
-  device_reader_->Start();
-  std::vector<double> op_total_time;
-  std::vector<std::string> op_name;
-  for (auto &op : ops_) {
-    op_name.push_back(op->Type());
-  }
-  op_total_time.resize(ops_.size());
-  for (size_t i = 0; i < op_total_time.size(); ++i) {
-    op_total_time[i] = 0.0;
-  }
-  platform::Timer timeline;
-  double total_time = 0.0;
-  double read_time = 0.0;
-  int cur_batch;
-  int batch_cnt = 0;
-  timeline.Start();
-  uint64_t total_inst = 0;
-  while ((cur_batch = device_reader_->Next()) > 0) {
-    VLOG(3) << "read a batch in thread " << thread_id_;
-    timeline.Pause();
-    read_time += timeline.ElapsedSec();
-    total_time += timeline.ElapsedSec();
-    for (size_t i = 0; i < ops_.size(); ++i) {
-      bool need_skip = false;
-      for (auto t = 0u; t < skip_ops_.size(); ++t) {
-        if (ops_[i]->Type().find(skip_ops_[t]) != std::string::npos) {
-          need_skip = true;
-          break;
-        }
-      }
-      timeline.Start();
-      VLOG(3) << "Going to run op " << op_name[i];
-      if (!need_skip) {
-        ops_[i]->Run(*thread_scope_, place_);
-      }
-      VLOG(3) << "Op " << op_name[i] << " Finished";
-      timeline.Pause();
-      op_total_time[i] += timeline.ElapsedSec();
-      total_time += timeline.ElapsedSec();
-    }
-    total_inst += cur_batch;
-    ++batch_cnt;
-    PrintFetchVars();
-    if (thread_id_ == 0) {
-      if (batch_cnt > 0 && batch_cnt % 100 == 0) {
-        for (size_t i = 0; i < ops_.size(); ++i) {
-          fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
-                  op_name[i].c_str(), op_total_time[i] / batch_cnt);
-        }
-        fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
-        fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100);
-        fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time);
-      }
-    }
-    thread_scope_->DropKids();
-    timeline.Start();
-  }
-}
-
-void HogwildWorker::TrainFiles() {
-  platform::SetNumThreads(1);
-
-  // how to accumulate fetched values here
-  device_reader_->Start();
-  int cur_batch;
-  while ((cur_batch = device_reader_->Next()) > 0) {
-    for (auto &op : ops_) {
-      bool need_skip = false;
-      for (auto t = 0u; t < skip_ops_.size(); ++t) {
-        if (op->Type().find(skip_ops_[t]) != std::string::npos) {
-          need_skip = true;
-          break;
-        }
-      }
-      if (!need_skip) {
-        op->Run(*thread_scope_, place_);
-      }
-    }
-
-    PrintFetchVars();
-    thread_scope_->DropKids();
-  }
-}
-
-void HogwildWorker::PrintFetchVars() {
-  // call count
-  batch_num_++;
-  int batch_per_print = fetch_config_.print_period();
-  if (thread_id_ == 0) {
-    if (batch_num_ % batch_per_print == 0) {
-      int fetch_var_num = fetch_config_.fetch_var_names_size();
-      for (int i = 0; i < fetch_var_num; ++i) {
-        platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i),
-                           fetch_config_.fetch_var_str_format(i));
-      }
-    }
-  }
-}
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/inlined_vector.h b/paddle/fluid/framework/inlined_vector.h
deleted file mode 100644
index 2a7f26b9f9661ecc79112307bac2780e357a55b4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/inlined_vector.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <cstdint>
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T, size_t N>
-class InlinedVector {
-  static_assert(N > 0, "N must be larger than 0");
-
- public:
-  inline InlinedVector() { len_ = 0; }
-
-  inline size_t size() const { return len_; }
-
-  inline T& operator[](size_t i) { return i < N ? head_[i] : tail_[i - N]; }
-
-  inline const T& operator[](size_t i) const {
-    return i < N ? head_[i] : tail_[i - N];
-  }
-
-  inline void emplace_back(const T& item) {
-    if (LIKELY(len_ < N)) {
-      head_[len_++] = item;
-    } else {
-      tail_.emplace_back(item);
-      ++len_;
-    }
-  }
-
-  inline void pop_back() {
-    if (UNLIKELY(len_ > N)) {
-      tail_.pop_back();
-    }
-    --len_;
-  }
-
-  inline T& back() {
-    if (LIKELY(len_ <= N)) {
-      return head_[len_ - 1];
-    } else {
-      return tail_.back();
-    }
-  }
-
- private:
-  T head_[N];
-  size_t len_;
-  std::vector<T> tail_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/inlined_vector_test.cc b/paddle/fluid/framework/inlined_vector_test.cc
deleted file mode 100644
index 003c0d7bbeac2b3d8ed62766fee09912c7a07bb2..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/inlined_vector_test.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/inlined_vector.h"
-#include <cstdlib>
-#include <ctime>
-#include <iostream>
-#include <vector>
-#include "gtest/gtest.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T, size_t N>
-static std::vector<T> ToStdVector(const framework::InlinedVector<T, N> &vec) {
-  std::vector<T> std_vec;
-  std_vec.reserve(vec.size());
-  for (size_t i = 0; i < vec.size(); ++i) {
-    std_vec.emplace_back(vec[i]);
-  }
-  return std_vec;
-}
-
-template <size_t N>
-void InlinedVectorCheck(size_t n) {
-  std::srand(std::time(nullptr));
-
-  std::vector<int> std_vec;
-  framework::InlinedVector<int, N> vec;
-
-  for (size_t i = 0; i < n; ++i) {
-    int value = rand();  // NOLINT
-
-    std_vec.emplace_back(value);
-    vec.emplace_back(value);
-
-    CHECK_EQ(std_vec.size(), vec.size());
-    CHECK_EQ(std_vec.back(), vec.back());
-
-    CHECK_EQ(vec.back(), value);
-  }
-
-  bool is_equal = (std_vec == ToStdVector(vec));
-
-  CHECK_EQ(is_equal, true);
-
-  for (size_t i = 0; i < n; ++i) {
-    CHECK_EQ(std_vec.size(), vec.size());
-    CHECK_EQ(std_vec.back(), vec.back());
-    std_vec.pop_back();
-    vec.pop_back();
-    CHECK_EQ(std_vec.size(), vec.size());
-  }
-
-  CHECK_EQ(std_vec.size(), static_cast<size_t>(0));
-  CHECK_EQ(vec.size(), static_cast<size_t>(0));
-}
-
-TEST(inlined_vector, inlined_vector) {
-  for (size_t i = 0; i < 20; ++i) {
-    InlinedVectorCheck<1>(i);
-    InlinedVectorCheck<10>(i);
-    InlinedVectorCheck<15>(i);
-    InlinedVectorCheck<20>(i);
-    InlinedVectorCheck<21>(i);
-    InlinedVectorCheck<25>(i);
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h
deleted file mode 100644
index 40026eaca9a92e6acdb60e03578ad41f137e8502..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/inplace_op_inference.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/type_defs.h"
-
-namespace paddle {
-namespace framework {
-
-/*
-  Inplace Inference for create In->Out pairs for inplaced operator.
-  If we specify a pair of corresponding names. For example, X->Out.
-  then Out will inplaced use X's memory. The base class will do
-  legality validation for both variables.
-*/
-
-class InplaceOpInference {
- public:
-  virtual ~InplaceOpInference() {}
-  virtual std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc, bool use_cuda) const = 0;
-};
-
-/*
-  Inplace In and Out for operator only have an Input and an Output.
-  For example, activation op.
- */
-class SingleOpInplaceInToOut : public InplaceOpInference {
- public:
-  std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc, bool use_cuda) const override {
-    PADDLE_ENFORCE_EQ(op_desc.InputNames().size(), 1,
-                      "Op inputs must be unique");
-    PADDLE_ENFORCE_EQ(op_desc.OutputNames().size(), 1,
-                      "Op outputs must be unique");
-    auto x_name = op_desc.InputNames().at(0);
-    auto out_name = op_desc.OutputNames().at(0);
-    return std::unordered_map<std::string, std::string>{{x_name, out_name}};
-  }
-};
-
-#define DECLARE_INPLACE_OP_INFERER(class_name, ...)                         \
-  class class_name final : public ::paddle::framework::InplaceOpInference { \
-   public:                                                                  \
-    std::unordered_map<std::string, std::string> operator()(                \
-        const ::paddle::framework::OpDesc& op_desc,                         \
-        bool use_cuda) const final {                                        \
-      return {__VA_ARGS__};                                                 \
-    }                                                                       \
-  }
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt
deleted file mode 100644
index 2baef77b9ce32ce616e7781b971665d3d885066c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/io/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-cc_library(fs SRCS fs.cc DEPS string_helper glog boost)
-cc_library(shell SRCS shell.cc DEPS string_helper glog)
diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc
deleted file mode 100644
index d5bc5df2565b0f25bc29f2fce37c1bd8626a0dbc..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/io/fs.cc
+++ /dev/null
@@ -1,456 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/io/fs.h"
-#include <memory>
-
-namespace paddle {
-namespace framework {
-
-static void fs_add_read_converter_internal(std::string& path,  // NOLINT
-                                           bool& is_pipe,      // NOLINT
-                                           const std::string& converter) {
-  if (converter == "") {
-    return;
-  }
-
-  if (!is_pipe) {
-    path = string::format_string("( %s ) < \"%s\"", converter.c_str(),
-                                 path.c_str());
-    is_pipe = true;
-  } else {
-    path = string::format_string("%s | %s", path.c_str(), converter.c_str());
-  }
-}
-
-static void fs_add_write_converter_internal(std::string& path,  // NOLINT
-                                            bool& is_pipe,      // NOLINT
-                                            const std::string& converter) {
-  if (converter == "") {
-    return;
-  }
-
-  if (!is_pipe) {
-    path = string::format_string("( %s ) > \"%s\"", converter.c_str(),
-                                 path.c_str());
-    is_pipe = true;
-  } else {
-    path = string::format_string("%s | %s", converter.c_str(), path.c_str());
-  }
-}
-
-static std::shared_ptr<FILE> fs_open_internal(const std::string& path,
-                                              bool is_pipe,
-                                              const std::string& mode,
-                                              size_t buffer_size,
-                                              int* err_no = 0) {
-  std::shared_ptr<FILE> fp = nullptr;
-
-  if (!is_pipe) {
-    fp = shell_fopen(path, mode);
-  } else {
-    fp = shell_popen(path, mode, err_no);
-  }
-
-  if (buffer_size > 0) {
-    char* buffer = new char[buffer_size];
-    CHECK_EQ(0, setvbuf(&*fp, buffer, _IOFBF, buffer_size));
-    fp = {&*fp, [fp, buffer](FILE*) mutable {  // NOLINT
-            CHECK(fp.unique());                // NOLINT
-            fp = nullptr;
-            delete[] buffer;
-          }};
-  }
-
-  return fp;
-}
-
-static bool fs_begin_with_internal(const std::string& path,
-                                   const std::string& str) {
-  return strncmp(path.c_str(), str.c_str(), str.length()) == 0;
-}
-
-static bool fs_end_with_internal(const std::string& path,
-                                 const std::string& str) {
-  return path.length() >= str.length() &&
-         strncmp(&path[path.length() - str.length()], str.c_str(),
-                 str.length()) == 0;
-}
-
-static size_t& localfs_buffer_size_internal() {
-  static size_t x = 0;
-  return x;
-}
-
-size_t localfs_buffer_size() { return localfs_buffer_size_internal(); }
-
-void localfs_set_buffer_size(size_t x) { localfs_buffer_size_internal() = x; }
-
-std::shared_ptr<FILE> localfs_open_read(std::string path,
-                                        const std::string& converter) {
-  bool is_pipe = false;
-
-  if (fs_end_with_internal(path, ".gz")) {
-    fs_add_read_converter_internal(path, is_pipe, "zcat");
-  }
-
-  fs_add_read_converter_internal(path, is_pipe, converter);
-  return fs_open_internal(path, is_pipe, "r", localfs_buffer_size());
-}
-
-std::shared_ptr<FILE> localfs_open_write(std::string path,
-                                         const std::string& converter) {
-  shell_execute(
-      string::format_string("mkdir -p $(dirname \"%s\")", path.c_str()));
-
-  bool is_pipe = false;
-
-  if (fs_end_with_internal(path, ".gz")) {
-    fs_add_write_converter_internal(path, is_pipe, "gzip");
-  }
-
-  fs_add_write_converter_internal(path, is_pipe, converter);
-  return fs_open_internal(path, is_pipe, "w", localfs_buffer_size());
-}
-
-int64_t localfs_file_size(const std::string& path) {
-  struct stat buf;
-  if (0 != stat(path.c_str(), &buf)) {
-    LOG(FATAL) << "file stat not zero";
-    return -1;
-  }
-  return (int64_t)buf.st_size;
-}
-
-void localfs_remove(const std::string& path) {
-  if (path == "") {
-    return;
-  }
-
-  shell_execute(string::format_string("rm -rf %s", path.c_str()));
-}
-
-std::vector<std::string> localfs_list(const std::string& path) {
-  if (path == "") {
-    return {};
-  }
-
-  std::shared_ptr<FILE> pipe;
-  int err_no = 0;
-  pipe = shell_popen(
-      string::format_string("find %s -type f -maxdepth 1", path.c_str()), "r",
-      &err_no);
-  string::LineFileReader reader;
-  std::vector<std::string> list;
-
-  while (reader.getline(&*pipe)) {
-    list.push_back(reader.get());
-  }
-
-  return list;
-}
-
-std::string localfs_tail(const std::string& path) {
-  if (path == "") {
-    return "";
-  }
-
-  return shell_get_command_output(
-      string::format_string("tail -1 %s ", path.c_str()));
-}
-
-bool localfs_exists(const std::string& path) {
-  std::string test_f = shell_get_command_output(
-      string::format_string("[ -f %s ] ; echo $?", path.c_str()));
-
-  if (string::trim_spaces(test_f) == "0") {
-    return true;
-  }
-
-  std::string test_d = shell_get_command_output(
-      string::format_string("[ -d %s ] ; echo $?", path.c_str()));
-
-  if (string::trim_spaces(test_d) == "0") {
-    return true;
-  }
-
-  return false;
-}
-
-void localfs_mkdir(const std::string& path) {
-  if (path == "") {
-    return;
-  }
-
-  shell_execute(string::format_string("mkdir -p %s", path.c_str()));
-}
-
-static size_t& hdfs_buffer_size_internal() {
-  static size_t x = 0;
-  return x;
-}
-
-size_t hdfs_buffer_size() { return hdfs_buffer_size_internal(); }
-
-void hdfs_set_buffer_size(size_t x) { hdfs_buffer_size_internal() = x; }
-
-static std::string& hdfs_command_internal() {
-  static std::string x = "hadoop fs";
-  return x;
-}
-
-const std::string& hdfs_command() { return hdfs_command_internal(); }
-
-void hdfs_set_command(const std::string& x) { hdfs_command_internal() = x; }
-
-std::shared_ptr<FILE> hdfs_open_read(std::string path, int* err_no,
-                                     const std::string& converter) {
-  if (fs_end_with_internal(path, ".gz")) {
-    path = string::format_string("%s -text \"%s\"", hdfs_command().c_str(),
-                                 path.c_str());
-  } else {
-    path = string::format_string("%s -cat \"%s\"", hdfs_command().c_str(),
-                                 path.c_str());
-  }
-
-  bool is_pipe = true;
-  fs_add_read_converter_internal(path, is_pipe, converter);
-  return fs_open_internal(path, is_pipe, "r", hdfs_buffer_size(), err_no);
-}
-
-std::shared_ptr<FILE> hdfs_open_write(std::string path, int* err_no,
-                                      const std::string& converter) {
-  path = string::format_string("%s -put - \"%s\"", hdfs_command().c_str(),
-                               path.c_str());
-  bool is_pipe = true;
-
-  if (fs_end_with_internal(path, ".gz\"")) {
-    fs_add_write_converter_internal(path, is_pipe, "gzip");
-  }
-
-  fs_add_write_converter_internal(path, is_pipe, converter);
-  return fs_open_internal(path, is_pipe, "w", hdfs_buffer_size(), err_no);
-}
-
-void hdfs_remove(const std::string& path) {
-  if (path == "") {
-    return;
-  }
-
-  shell_execute(string::format_string("%s -rmr %s &>/dev/null; true",
-                                      hdfs_command().c_str(), path.c_str()));
-}
-
-std::vector<std::string> hdfs_list(const std::string& path) {
-  if (path == "") {
-    return {};
-  }
-
-  std::string prefix = "hdfs:";
-
-  if (fs_begin_with_internal(path, "afs:")) {
-    prefix = "afs:";
-  }
-  int err_no = 0;
-  std::vector<std::string> list;
-  do {
-    err_no = 0;
-    std::shared_ptr<FILE> pipe;
-    pipe = shell_popen(
-        string::format_string("%s -ls %s | ( grep ^- ; [ $? != 2 ] )",
-                              hdfs_command().c_str(), path.c_str()),
-        "r", &err_no);
-    string::LineFileReader reader;
-    list.clear();
-
-    while (reader.getline(&*pipe)) {
-      std::vector<std::string> line = string::split_string(reader.get());
-      if (line.size() != 8) {
-        continue;
-      }
-      list.push_back(prefix + line[7]);
-    }
-  } while (err_no == -1);
-  return list;
-}
-
-std::string hdfs_tail(const std::string& path) {
-  if (path == "") {
-    return "";
-  }
-
-  return shell_get_command_output(string::format_string(
-      "%s -text %s | tail -1 ", hdfs_command().c_str(), path.c_str()));
-}
-
-bool hdfs_exists(const std::string& path) {
-  std::string test = shell_get_command_output(string::format_string(
-      "%s -test -e %s ; echo $?", hdfs_command().c_str(), path.c_str()));
-
-  if (string::trim_spaces(test) == "0") {
-    return true;
-  }
-
-  return false;
-}
-
-void hdfs_mkdir(const std::string& path) {
-  if (path == "") {
-    return;
-  }
-
-  shell_execute(string::format_string("%s -mkdir %s; true",
-                                      hdfs_command().c_str(), path.c_str()));
-}
-
-int fs_select_internal(const std::string& path) {
-  if (fs_begin_with_internal(path, "hdfs:")) {
-    return 1;
-  } else if (fs_begin_with_internal(path, "afs:")) {
-    return 1;
-  }
-
-  return 0;
-}
-
-std::shared_ptr<FILE> fs_open_read(const std::string& path, int* err_no,
-                                   const std::string& converter) {
-  switch (fs_select_internal(path)) {
-    case 0:
-      return localfs_open_read(path, converter);
-
-    case 1:
-      return hdfs_open_read(path, err_no, converter);
-
-    default:
-      LOG(FATAL) << "Not supported";
-  }
-
-  return {};
-}
-
-std::shared_ptr<FILE> fs_open_write(const std::string& path, int* err_no,
-                                    const std::string& converter) {
-  switch (fs_select_internal(path)) {
-    case 0:
-      return localfs_open_write(path, converter);
-
-    case 1:
-      return hdfs_open_write(path, err_no, converter);
-
-    default:
-      LOG(FATAL) << "Not supported";
-  }
-
-  return {};
-}
-
-std::shared_ptr<FILE> fs_open(const std::string& path, const std::string& mode,
-                              int* err_no, const std::string& converter) {
-  if (mode == "r" || mode == "rb") {
-    return fs_open_read(path, err_no, converter);
-  }
-
-  if (mode == "w" || mode == "wb") {
-    return fs_open_write(path, err_no, converter);
-  }
-
-  LOG(FATAL) << "Unknown mode: " << mode;
-  return {};
-}
-
-int64_t fs_file_size(const std::string& path) {
-  switch (fs_select_internal(path)) {
-    case 0:
-      return localfs_file_size(path);
-
-    default:
-      LOG(FATAL) << "Not supported";
-  }
-
-  return 0;
-}
-
-void fs_remove(const std::string& path) {
-  switch (fs_select_internal(path)) {
-    case 0:
-      return localfs_remove(path);
-
-    case 1:
-      return hdfs_remove(path);
-
-    default:
-      LOG(FATAL) << "Not supported";
-  }
-}
-
-std::vector<std::string> fs_list(const std::string& path) {
-  switch (fs_select_internal(path)) {
-    case 0:
-      return localfs_list(path);
-
-    case 1:
-      return hdfs_list(path);
-
-    default:
-      LOG(FATAL) << "Not supported";
-  }
-
-  return {};
-}
-
-std::string fs_tail(const std::string& path) {
-  switch (fs_select_internal(path)) {
-    case 0:
-      return localfs_tail(path);
-
-    case 1:
-      return hdfs_tail(path);
-
-    default:
-      LOG(FATAL) << "Not supported";
-  }
-
-  return "";
-}
-
-bool fs_exists(const std::string& path) {
-  switch (fs_select_internal(path)) {
-    case 0:
-      return localfs_exists(path);
-
-    case 1:
-      return hdfs_exists(path);
-
-    default:
-      LOG(FATAL) << "Not supported";
-  }
-
-  return false;
-}
-
-void fs_mkdir(const std::string& path) {
-  switch (fs_select_internal(path)) {
-    case 0:
-      return localfs_mkdir(path);
-
-    case 1:
-      return hdfs_mkdir(path);
-
-    default:
-      LOG(FATAL) << "Not supported";
-  }
-}
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/io/fs.h b/paddle/fluid/framework/io/fs.h
deleted file mode 100644
index 3f0174701c24cc5a3eac38d12792650bdbd9463b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/io/fs.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <stdio.h>
-#include <memory>
-#include <string>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/io/shell.h"
-#include "paddle/fluid/string/string_helper.h"
-
-namespace paddle {
-namespace framework {
-
-int fs_select_internal(const std::string& path);
-
-// localfs
-extern size_t localfs_buffer_size();
-
-extern void localfs_set_buffer_size(size_t x);
-
-extern std::shared_ptr<FILE> localfs_open_read(std::string path,
-                                               const std::string& converter);
-
-extern std::shared_ptr<FILE> localfs_open_write(std::string path,
-                                                const std::string& converter);
-
-extern int64_t localfs_file_size(const std::string& path);
-
-extern void localfs_remove(const std::string& path);
-
-extern std::vector<std::string> localfs_list(const std::string& path);
-
-extern std::string localfs_tail(const std::string& path);
-
-extern bool localfs_exists(const std::string& path);
-
-extern void localfs_mkdir(const std::string& path);
-
-// hdfs
-extern size_t hdfs_buffer_size();
-
-extern void hdfs_set_buffer_size(size_t x);
-
-extern const std::string& hdfs_command();
-
-extern void hdfs_set_command(const std::string& x);
-
-extern std::shared_ptr<FILE> hdfs_open_read(std::string path, int* err_no,
-                                            const std::string& converter);
-
-extern std::shared_ptr<FILE> hdfs_open_write(std::string path, int* err_no,
-                                             const std::string& converter);
-
-extern void hdfs_remove(const std::string& path);
-
-extern std::vector<std::string> hdfs_list(const std::string& path);
-
-extern std::string hdfs_tail(const std::string& path);
-
-extern bool hdfs_exists(const std::string& path);
-
-extern void hdfs_mkdir(const std::string& path);
-
-// aut-detect fs
-extern std::shared_ptr<FILE> fs_open_read(const std::string& path, int* err_no,
-                                          const std::string& converter);
-
-extern std::shared_ptr<FILE> fs_open_write(const std::string& path, int* err_no,
-                                           const std::string& converter);
-
-extern std::shared_ptr<FILE> fs_open(const std::string& path,
-                                     const std::string& mode, int* err_no,
-                                     const std::string& converter = "");
-
-extern int64_t fs_file_size(const std::string& path);
-
-extern void fs_remove(const std::string& path);
-
-extern std::vector<std::string> fs_list(const std::string& path);
-
-extern std::string fs_tail(const std::string& path);
-
-extern bool fs_exists(const std::string& path);
-
-extern void fs_mkdir(const std::string& path);
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc
deleted file mode 100644
index ce0c3a767da3ca1331579e8f7d6a61ae3c71053d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/io/shell.cc
+++ /dev/null
@@ -1,321 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/io/shell.h"
-
-namespace paddle {
-namespace framework {
-
-std::shared_ptr<FILE> shell_fopen(const std::string& path,
-                                  const std::string& mode) {
-#if defined _WIN32 || defined __APPLE__
-  return nullptr;
-#else
-  if (shell_verbose()) {
-    LOG(INFO) << "Opening file[" << path << "] with mode[" << mode << "]";
-  }
-  FILE* fp;
-  if (!(fp = fopen(path.c_str(), mode.c_str()))) {
-    LOG(FATAL) << "fopen fail, path[" << path << "], mode[" << mode << "]";
-  }
-  return {fp, [path](FILE* fp) {
-            if (shell_verbose()) {
-              LOG(INFO) << "Closing file[" << path << "]";
-            }
-            if (0 != fclose(fp)) {
-              LOG(FATAL) << "fclose fail, path[" << path << "]";
-            }
-          }};
-#endif
-}
-
-// Close all open file descriptors
-// The implementation is async signal safe
-// Mostly copy from CPython code
-static int close_open_fds_internal() {
-#if defined _WIN32 || defined __APPLE__
-  return 0;
-#else
-  struct linux_dirent {
-    long d_ino = 0;  // NOLINT
-    off_t d_off;
-    unsigned short d_reclen = 0;  // NOLINT
-    char d_name[256];
-  };
-
-  int dir_fd = -1;
-  if ((dir_fd = open("/proc/self/fd", O_RDONLY)) < 0) {
-    LOG(FATAL) << "proc/self/fd open fail";
-    return -1;
-  }
-  char buffer[sizeof(linux_dirent)];
-
-  for (;;) {
-    int bytes = 0;
-    if ((bytes = syscall(SYS_getdents, dir_fd,
-                         reinterpret_cast<linux_dirent*>(buffer),
-                         sizeof(buffer))) < 0) {
-      LOG(FATAL) << "syscall fail";
-      return -1;
-    }
-
-    if (bytes == 0) {
-      break;
-    }
-
-    linux_dirent* entry = NULL;
-
-    for (int offset = 0; offset < bytes; offset += entry->d_reclen) {
-      entry = reinterpret_cast<linux_dirent*>(buffer + offset);
-      int fd = 0;
-      const char* s = entry->d_name;
-
-      while (*s >= '0' && *s <= '9') {
-        fd = fd * 10 + (*s - '0');
-        s++;
-      }
-
-      if (s != entry->d_name && fd != dir_fd && fd >= 3) {
-        close(fd);
-      }
-    }
-  }
-
-  close(dir_fd);
-  return 0;
-#endif
-}
-
-static int shell_popen_fork_internal(const char* real_cmd, bool do_read,
-                                     int parent_end, int child_end) {
-#if defined _WIN32 || defined __APPLE__
-  return 0;
-#else
-  int child_pid = -1;
-  // Too frequent calls to fork() makes openmpi very slow. Use vfork() instead.
-  // But vfork() is very dangerous. Be careful.
-  if ((child_pid = vfork()) < 0) {
-    return -1;
-  }
-
-  // The following code is async signal safe (No memory allocation, no access to
-  // global data, etc.)
-  if (child_pid != 0) {
-    return child_pid;
-  }
-
-  int child_std_end = do_read ? 1 : 0;
-  close(parent_end);
-
-  if (child_end != child_std_end) {
-    PCHECK(dup2(child_end, child_std_end) == child_std_end);
-    close(child_end);
-  }
-
-  close_open_fds_internal();
-  PCHECK(execl("/bin/bash", "bash", "-c", real_cmd, NULL) >= 0);
-  exit(127);
-#endif
-}
-
-std::shared_ptr<FILE> shell_popen(const std::string& cmd,
-                                  const std::string& mode, int* err_no) {
-#if defined _WIN32 || defined __APPLE__
-  return nullptr;
-#else
-  bool do_read = mode == "r";
-  bool do_write = mode == "w";
-  if (!(do_read || do_write)) {
-    *err_no = -1;
-    return NULL;
-  }
-
-  if (shell_verbose()) {
-    LOG(INFO) << "Opening pipe[" << cmd << "] with mode[" << mode << "]";
-  }
-
-  std::string real_cmd = "set -o pipefail; " + cmd;
-
-  int pipe_fds[2];
-  if (pipe(pipe_fds) != 0) {
-    *err_no = -1;
-    return NULL;
-  }
-  int parent_end = 0;
-  int child_end = 0;
-
-  if (do_read) {
-    parent_end = pipe_fds[0];
-    child_end = pipe_fds[1];
-  } else if (do_write) {
-    parent_end = pipe_fds[1];
-    child_end = pipe_fds[0];
-  }
-
-  int child_pid = shell_popen_fork_internal(real_cmd.c_str(), do_read,
-                                            parent_end, child_end);
-  close(child_end);
-  fcntl(parent_end, F_SETFD, FD_CLOEXEC);
-  FILE* fp;
-  if ((fp = fdopen(parent_end, mode.c_str())) == NULL) {
-    *err_no = -1;
-    return NULL;
-  }
-  return {fp, [child_pid, cmd, err_no](FILE* fp) {
-            if (shell_verbose()) {
-              LOG(INFO) << "Closing pipe[" << cmd << "]";
-            }
-
-            if (fclose(fp) != 0) {
-              *err_no = -1;
-            }
-            int wstatus = -1;
-            waitpid(child_pid, &wstatus, 0);
-            if (wstatus == 0 || wstatus == (128 + SIGPIPE) * 256 ||
-                (wstatus == -1 && errno == ECHILD)) {
-            } else {
-              *err_no = -1;
-              LOG(WARNING) << "status[" << wstatus << "], cmd[" << cmd << "]"
-                           << ", err_no[" << *err_no << "]";
-            }
-            if (wstatus == -1 && errno == ECHILD) {
-              // temporarily remove this warning
-              // LOG(WARNING) << "errno is ECHILD";
-            }
-          }};
-#endif
-}
-
-static int shell_p2open_fork_internal(const char* real_cmd, int pipein_fds[2],
-                                      int pipeout_fds[2]) {
-#if defined _WIN32 || defined __APPLE__
-  return 0;
-#else
-  int child_pid = -1;
-  if ((child_pid = fork()) < 0) {
-    return -1;
-  }
-
-  if (child_pid != 0) {
-    return child_pid;
-  }
-
-  close(pipein_fds[0]);
-  close(pipeout_fds[1]);
-
-  if (pipein_fds[1] != 1) {
-    if (dup2(pipein_fds[1], 1) != 1) {
-      return -1;
-    }
-    close(pipein_fds[1]);
-  }
-
-  if (pipeout_fds[0] != 0) {
-    if (dup2(pipeout_fds[0], 0) != 0) {
-      return -1;
-    }
-    close(pipeout_fds[0]);
-  }
-
-  close_open_fds_internal();
-  if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) {
-    return -1;
-  }
-  exit(127);
-#endif
-}
-
-std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
-    const std::string& cmd) {
-#if defined _WIN32 || defined __APPLE__
-  return {};
-#else
-  if (shell_verbose()) {
-    LOG(INFO) << "Opening bidirectional pipe[" << cmd << "]";
-  }
-
-  std::string real_cmd = "set -o pipefail; " + cmd;
-
-  int pipein_fds[2];
-  int pipeout_fds[2];
-  if (pipe(pipein_fds) != 0) {
-    return {NULL, NULL};
-  }
-  if (pipe(pipeout_fds) != 0) {
-    return {NULL, NULL};
-  }
-
-  int child_pid =
-      shell_p2open_fork_internal(real_cmd.c_str(), pipein_fds, pipeout_fds);
-
-  close(pipein_fds[1]);
-  close(pipeout_fds[0]);
-  fcntl(pipein_fds[0], F_SETFD, FD_CLOEXEC);
-  fcntl(pipeout_fds[1], F_SETFD, FD_CLOEXEC);
-
-  std::shared_ptr<int> child_life = {
-      NULL, [child_pid, cmd](void*) {
-        if (shell_verbose()) {
-          LOG(INFO) << "Closing bidirectional pipe[" << cmd << "]";
-        }
-
-        int wstatus, ret;
-
-        do {
-          PCHECK((ret = waitpid(child_pid, &wstatus, 0)) >= 0 ||
-                 (ret == -1 && errno == EINTR));
-        } while (ret == -1 && errno == EINTR);
-
-        PCHECK(wstatus == 0 || wstatus == (128 + SIGPIPE) * 256 ||
-               (wstatus == -1 && errno == ECHILD))
-            << "status[" << wstatus << "], cmd[" << cmd << "]";
-
-        if (wstatus == -1 && errno == ECHILD) {
-          // temporarily remove this warning
-          // LOG(WARNING) << "errno is ECHILD";
-        }
-      }};
-
-  FILE* in_fp;
-  PCHECK((in_fp = fdopen(pipein_fds[0], "r")) != NULL);
-  FILE* out_fp;
-  PCHECK((out_fp = fdopen(pipeout_fds[1], "w")) != NULL);
-  return {{in_fp, [child_life](FILE* fp) { PCHECK(fclose(fp) == 0); }},
-          {out_fp, [child_life](FILE* fp) { PCHECK(fclose(fp) == 0); }}};
-#endif
-}
-
-std::string shell_get_command_output(const std::string& cmd) {
-#if defined _WIN32 || defined __APPLE__
-  return "";
-#else
-  int err_no = 0;
-  do {
-    err_no = 0;
-    std::shared_ptr<FILE> pipe = shell_popen(cmd, "r", &err_no);
-    string::LineFileReader reader;
-
-    if (reader.getdelim(&*pipe, 0)) {
-      pipe = nullptr;
-      if (err_no == 0) {
-        return reader.get();
-      }
-    }
-  } while (err_no == -1);
-  return "";
-#endif
-}
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/io/shell.h b/paddle/fluid/framework/io/shell.h
deleted file mode 100644
index 46fcc92bafa84e4c1b89e4603fe0db364572b73e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/io/shell.h
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <fcntl.h>
-#include <sys/stat.h>
-#ifdef _WIN32
-#include <windows.h>
-#else
-#include <sys/syscall.h>
-#endif
-#include <sys/types.h>
-#ifndef _WIN32
-#include <sys/wait.h>
-#endif
-#include <memory>
-#include <string>
-#include <utility>
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/string/string_helper.h"
-
-namespace paddle {
-namespace framework {
-
-inline bool& shell_verbose_internal() {
-  static bool x = false;
-  return x;
-}
-
-inline bool shell_verbose() { return shell_verbose_internal(); }
-
-inline void shell_set_verbose(bool x) { shell_verbose_internal() = x; }
-
-extern std::shared_ptr<FILE> shell_fopen(const std::string& path,
-                                         const std::string& mode);
-
-extern std::shared_ptr<FILE> shell_popen(const std::string& cmd,
-                                         const std::string& mode, int* err_no);
-
-extern std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
-    const std::string& cmd);
-
-inline void shell_execute(const std::string& cmd) {
-  int err_no = 0;
-  do {
-    err_no = 0;
-    shell_popen(cmd, "w", &err_no);
-  } while (err_no == -1);
-}
-
-extern std::string shell_get_command_output(const std::string& cmd);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
deleted file mode 100644
index 9476256b0f0e5ac2290a814e73374fb1552ff5c2..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ /dev/null
@@ -1,143 +0,0 @@
-set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
-file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n")
-file(APPEND ${pass_file} "\#pragma once\n")
-file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
-
-add_subdirectory(fuse_optimizer_ops_pass)
-add_subdirectory(memory_optimize_pass)
-add_subdirectory(multi_devices_graph_pass)
-
-# Usage: pass_library(target inference) will append to paddle_inference_pass.h
-unset(INFER_IR_PASSES CACHE) # clear the global variable
-function(pass_library TARGET DEST)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS DIR)
-    set(targetPrefix "")
-
-    cmake_parse_arguments(pass_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    if(pass_library_DIR)
-        cc_library(${TARGET} SRCS ${pass_library_DIR}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${pass_library_DEPS})
-    else()
-        cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${pass_library_DEPS})
-    endif()
-
-    # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
-    if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
-        message(STATUS "add pass ${TARGET} ${DEST}")
-        file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
-        set(INFER_IR_PASSES ${INFER_IR_PASSES} ${TARGET} CACHE INTERNAL "")
-    endif()
-endfunction()
-
-cc_library(codegen SRCS codegen.cc DEPS codegen_helper)
-cc_library(codegen_helper SRCS codegen_helper.cc DEPS graph node graph_helper)
-cc_library(node SRCS node.cc DEPS proto_desc)
-cc_library(graph SRCS graph.cc DEPS node pretty_log)
-cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
-cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
-cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
-cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
-cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
-cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass)
-
-cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper)
-
-pass_library(graph_to_program_pass base)
-pass_library(graph_viz_pass base)
-pass_library(lock_free_optimize_pass base)
-pass_library(fc_fuse_pass inference)
-pass_library(attention_lstm_fuse_pass inference)
-pass_library(fc_lstm_fuse_pass inference)
-pass_library(embedding_fc_lstm_fuse_pass inference)
-pass_library(fc_gru_fuse_pass inference)
-pass_library(seq_concat_fc_fuse_pass inference)
-pass_library(multi_batch_merge_pass base)
-pass_library(conv_bn_fuse_pass inference)
-pass_library(seqconv_eltadd_relu_fuse_pass inference)
-pass_library(seqpool_concat_fuse_pass inference)
-pass_library(seqpool_cvm_concat_fuse_pass inference)
-pass_library(repeated_fc_relu_fuse_pass inference)
-pass_library(squared_mat_sub_fuse_pass inference)
-pass_library(is_test_pass base)
-pass_library(conv_elementwise_add_act_fuse_pass inference)
-pass_library(conv_elementwise_add2_act_fuse_pass inference)
-pass_library(conv_elementwise_add_fuse_pass inference)
-pass_library(conv_affine_channel_fuse_pass inference)
-pass_library(transpose_flatten_concat_fuse_pass inference)
-pass_library(identity_scale_op_clean_pass base)
-pass_library(sync_batch_norm_pass base)
-pass_library(runtime_context_cache_pass base)
-pass_library(quant_conv2d_dequant_fuse_pass inference)
-pass_library(fillconstant_elementwisemul_fuse inference)
-pass_library(shuffle_channel_detect_pass inference)
-pass_library(delete_quant_dequant_op_pass inference)
-pass_library(simplify_with_basic_ops_pass base)
-pass_library(fc_elementwise_layernorm_fuse_pass base)
-if(WITH_GPU)
-    pass_library(cudnn_placement_pass base DEPS placement_pass_base)
-endif()
-
-if(ANAKIN_SUBGRAPH)
-pass_library(simplify_anakin_priorbox_detection_out_pass inference)
-endif()
-
-if(WITH_MKLDNN)
-    pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn)
-    pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn)
-    pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(fc_mkldnn_pass inference DIR mkldnn)
-    pass_library(cpu_quantize_placement_pass base DIR mkldnn)
-    pass_library(cpu_quantize_pass inference DIR mkldnn)
-    pass_library(cpu_quantize_squash_pass inference DIR mkldnn)
-endif()
-
-if(WITH_NGRAPH)
-    cc_library(ngraph_subgraph_pass SRCS ngraph_subgraph_pass.cc DEPS ngraph_bridge
-      analysis_helper subgraph_detector graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
-    set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
-    file(APPEND ${pass_file} "USE_PASS(ngraph_subgraph_pass);\n")
-    set(INFER_IR_PASSES ${INFER_IR_PASSES} ngraph_subgraph_pass CACHE INTERNAL "")
-endif()
-
-cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
-cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector )
-
-set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
-
-cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
-
-cc_test(codegen_test SRCS codegen_test.cc DEPS codegen_helper codegen)
-cc_test(node_test SRCS node_test.cc DEPS node)
-cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
-cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
-cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
-cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
-cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
-cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
-cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
-cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_tester.cc DEPS seqpool_cvm_concat_fuse_pass framework_proto)
-cc_test(test_repeated_fc_relu_fuse_pass SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto)
-cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
-cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass)
-cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
-if(WITH_GPU)
-    cc_test(test_cudnn_placement_pass SRCS cudnn_placement_pass_tester.cc DEPS cudnn_placement_pass)
-endif()
-if(NOT WIN32)
-    cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
-endif()
-if (WITH_MKLDNN)
-    cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
-    cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
-    cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass)
-    cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass)
-    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
-    cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
-    cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
-    cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
-    cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
-endif ()
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
deleted file mode 100644
index c4ffb2a9de4970abd147ce2fd709977e26eb626b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-struct Param {
-  std::string X = "concat_0.tmp_0";
-  std::string C0 = "cell_init";
-  std::string H0 = "hidden_init";
-  std::string AttentionWeight = "attention_fc.w_0";
-  std::string AttentionBias = "attention_fc.b_0";
-  std::string AttentionScalar = "attention_output.w_0";
-  std::string AttentionScalarBias = "attention_output.b_0";
-  std::string LSTMWeight = "attention_w.new";
-  std::string LSTMBias = "attention_b.new";
-  std::string Hidden = "array_to_lod_tensor_0.tmp_0";
-  std::string Cell = "at.cell.new";
-  std::string AttentionedX = "at.x.new";
-  std::string AttentionFCOut = "at.fc.new";
-  std::string LSTMX = "at.lstmx.new";
-  std::string LSTMOUT = "at.lstmout.new";
-};
-
-void PrepareParameters(Graph* graph, const Param& param);
-
-void FindWhileOp(Graph* graph) {
-  GraphPatternDetector gpd;
-  std::unordered_set<int> fused_external_ops(
-      {35, 36, 37, 38, 43, 44, 49, 45, 46, 47, 41, 42, 53, 54, 48,
-       57, 55, 56, 52, 74, 80, 77, 78, 79, 50, 77, 39, 40, 51});
-
-  gpd.mutable_pattern()->NewNode(
-      [&](Node* n) { return fused_external_ops.count(n->id()); }, "while");
-
-  if (!graph->Has(kGraphvizMarkedNodeAttr)) {
-    graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
-  }
-  auto& marked_nodes =
-      graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
-
-  auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                    Graph* g) {
-    auto* while_pat_node = gpd.pattern().RetrieveNode("while");
-    auto* while_node = subgraph.at(while_pat_node);
-    marked_nodes.insert(while_node);
-  };
-  gpd(graph, handle);
-
-  Param param;
-  // Add AttentionLSTM node
-  OpDesc op_desc;
-  op_desc.SetType("attention_lstm");
-
-#define OP_SET_IN(x) op_desc.SetInput(#x, {param.x});
-#define OP_SET_OUT(x) op_desc.SetOutput(#x, {param.x});
-  OP_SET_IN(X);
-  OP_SET_IN(C0);
-  OP_SET_IN(H0);
-  OP_SET_IN(AttentionWeight);
-  OP_SET_IN(AttentionBias);
-  OP_SET_IN(AttentionScalar);
-  OP_SET_IN(AttentionScalarBias);
-  OP_SET_IN(LSTMWeight);
-  OP_SET_IN(LSTMBias);
-
-  OP_SET_OUT(Hidden);
-  OP_SET_OUT(Cell);
-  OP_SET_OUT(AttentionedX);
-  OP_SET_OUT(AttentionFCOut);
-  OP_SET_OUT(LSTMX);
-  OP_SET_OUT(LSTMOUT);
-#undef OP_SET_IN
-#undef OP_SET_OUT
-
-  auto* X = graph->RetrieveNode(34);
-  auto* LSTMOUT = graph->RetrieveNode(81);
-  auto* cell_init = graph->RetrieveNode(6);
-  auto* hidden_init = graph->RetrieveNode(8);
-
-  auto* lstm_op = graph->CreateOpNode(&op_desc);
-  PrepareParameters(graph, param);
-
-  IR_NODE_LINK_TO(X, lstm_op);
-  IR_NODE_LINK_TO(cell_init, lstm_op);
-  IR_NODE_LINK_TO(hidden_init, lstm_op);
-  IR_NODE_LINK_TO(lstm_op, LSTMOUT);
-
-  GraphSafeRemoveNodes(graph, marked_nodes);
-}
-
-#define CHECK_P1(x) PADDLE_ENFORCE_NOT_NULL(x);
-#define CHECK_P2(x0, x1) \
-  CHECK_P1(x0);          \
-  CHECK_P1(x1);
-#define CHECK_P3(x0, x1, x2) \
-  CHECK_P2(x0, x1);          \
-  CHECK_P1(x2);
-#define CHECK_P4(x0, x1, x2, x3) \
-  CHECK_P3(x0, x1, x2);          \
-  CHECK_P1(x3);
-#define CHECK_P5(x0, x1, x2, x3, x4) \
-  CHECK_P4(x0, x1, x2, x3);          \
-  CHECK_P1(x4);
-
-void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
-                       const LoDTensor& W_forget_w1,
-                       const LoDTensor& W_input_w0, const LoDTensor& W_input_w1,
-                       const LoDTensor& W_output_w0,
-                       const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0,
-                       const LoDTensor& W_cell_w1, LoDTensor* out);
-
-void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
-                     const LoDTensor& B_output, const LoDTensor& B_cell,
-                     LoDTensor* out);
-
-void PrepareParameters(Graph* graph, const Param& param) {
-  // Check parameters
-  PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-  auto& scope = graph->Get<Scope>(kParamScopeAttr);
-
-  // Create new parameters.
-  scope.Var(param.LSTMWeight)->GetMutable<LoDTensor>();
-  scope.Var(param.LSTMBias)->GetMutable<LoDTensor>();
-  scope.Var(param.Hidden)->GetMutable<LoDTensor>();
-  scope.Var(param.Cell)->GetMutable<LoDTensor>();
-  scope.Var(param.AttentionedX)->GetMutable<LoDTensor>();
-  scope.Var(param.AttentionFCOut)->GetMutable<LoDTensor>();
-  scope.Var(param.LSTMX)->GetMutable<LoDTensor>();
-  scope.Var(param.LSTMOUT)->GetMutable<LoDTensor>();
-
-#define GATE_W(name__)                                               \
-  auto* W_##name__##_w0 = scope.FindVar(#name__ ".w_0");             \
-  auto* W_##name__##_w1 = scope.FindVar(#name__ ".w_1");             \
-  auto* W_##name__##_b0 = scope.FindVar(#name__ ".b_0");             \
-  CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);       \
-  VLOG(4) << #name__ "_w0"                                           \
-          << " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
-  VLOG(4) << #name__ "_w1"                                           \
-          << " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
-  VLOG(4) << #name__ "_b0"                                           \
-          << " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
-  auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>();       \
-  auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>();       \
-  auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();
-
-  GATE_W(forget);
-  GATE_W(input);
-  GATE_W(output);
-  GATE_W(c);
-#undef GATE_W
-
-  auto* attention_fc_w = scope.FindVar("attention_fc.w_0");
-  auto* attention_fc_b = scope.FindVar("attention_fc.b_0");
-  auto* attention_output_w = scope.FindVar("attention_output.w_0");
-  auto* attention_output_b = scope.FindVar("attention_output.b_0");
-  CHECK_P4(attention_fc_w, attention_fc_b, attention_output_w,
-           attention_output_b);
-
-  auto* lstm_weight = scope.Var(param.LSTMWeight);
-  auto* lstm_weight_t = lstm_weight->GetMutable<LoDTensor>();
-  auto* lstm_bias = scope.Var(param.LSTMBias);
-  auto* lstm_bias_t = lstm_bias->GetMutable<LoDTensor>();
-
-  // reshape attention_bias
-  auto* attention_bias_t =
-      scope.FindVar(param.AttentionBias)->GetMutable<LoDTensor>();
-  PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1);
-  attention_bias_t->Resize(make_ddim({1, attention_bias_t->dims()[0]}));
-
-  auto* attention_scalar_bias_t =
-      scope.FindVar(param.AttentionScalarBias)->GetMutable<LoDTensor>();
-  attention_scalar_bias_t->Resize(
-      make_ddim({1, attention_scalar_bias_t->dims()[0]}));
-
-  PrepareLSTMWeight(W_forget_w0_t, W_forget_w1_t, W_input_w0_t, W_input_w1_t,
-                    W_output_w0_t, W_output_w1_t, W_c_w0_t, W_c_w1_t,
-                    lstm_weight_t);
-  PrepareLSTMBias(W_forget_b0_t, W_input_b0_t, W_output_b0_t, W_c_b0_t,
-                  lstm_bias_t);
-}
-
-// Prepare parameters
-void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
-                       const LoDTensor& W_forget_w1,
-                       const LoDTensor& W_input_w0, const LoDTensor& W_input_w1,
-                       const LoDTensor& W_output_w0,
-                       const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0,
-                       const LoDTensor& W_cell_w1, LoDTensor* out) {
-  int D = W_forget_w0.dims()[0];
-  int M = W_forget_w1.dims()[0];
-  out->Resize(make_ddim({D + M, 4 * D}));
-  VLOG(3) << "LSTMWeight resized to " << out->dims();
-
-  float* out_data = out->mutable_data<float>(platform::CPUPlace());
-  std::array<const float*, 4> tensors{
-      W_forget_w0.data<float>(), W_input_w0.data<float>(),
-      W_output_w0.data<float>(), W_cell_w0.data<float>()};
-  std::array<const float*, 4> tensors1{
-      W_forget_w1.data<float>(), W_input_w1.data<float>(),
-      W_output_w1.data<float>(), W_cell_w1.data<float>()};
-
-  for (int row = 0; row < D; row++) {
-    for (int col = 0; col < 4; col++) {
-      float* dst = out_data + 4 * D * row + D * col;
-      const float* src = tensors[col] + D * row;
-      memcpy(dst, src, D * sizeof(float));
-    }
-  }
-
-  for (int row = 0; row < M; row++) {
-    for (int col = 0; col < 4; col++) {
-      float* dst = out_data + 4 * D * (D + row) + D * col;
-      const float* src = tensors1[col] + D * row;
-      memcpy(dst, src, D * sizeof(float));
-    }
-  }
-}
-
-void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
-                     const LoDTensor& B_output, const LoDTensor& B_cell,
-                     LoDTensor* out) {
-  std::array<const float*, 4> tensors{
-      B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
-      B_cell.data<float>()};
-
-  PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
-  int D = B_forget.dims()[0];
-  out->Resize(make_ddim({1, 4 * D}));
-  auto* out_data = out->mutable_data<float>(platform::CPUPlace());
-  for (size_t i = 0; i < tensors.size(); i++) {
-    memcpy(out_data + D * i, tensors[i], D * sizeof(float));
-  }
-}
-
-// Parameters
-
-void AttentionLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
-  PDPattern external_pattern, subblock_pattern;
-
-  // Use the following variables to tell whether this model is RNN1.
-  // This fuse can only works on the RNN1 model.
-  std::unordered_set<std::string> specified_vars({"data_lod_attention",
-                                                  "cell_init", "hidden_init",
-                                                  "data", "week", "minute"});
-  size_t count = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsVar() && specified_vars.count(node->Name())) {
-      ++count;
-    }
-  }
-  if (count < specified_vars.size()) {
-    return;
-  }
-
-  // Continue to fuse.
-  FindWhileOp(graph);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(attention_lstm_fuse_pass,
-              paddle::framework::ir::AttentionLSTMFusePass);
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
deleted file mode 100644
index 47ed9f0393fb222e612ed3bce1afbc879edb410d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class AttentionLSTMFusePass : public FusePassBase {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
deleted file mode 100644
index 5b9742f4b33070e205bbe4de56d81c01fb17476b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
+++ /dev/null
@@ -1,498 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
-#include <algorithm>
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/build_strategy.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-DEFINE_double(fuse_parameter_memory_size, -1.0,  // MBytes
-              "fuse_parameter_memory_size is up limited memory size(MB)"
-              "of one group parameters' gradient which is the input "
-              "of communication calling(e.g NCCLAllReduce). "
-              "The default value is 0, it means that "
-              "not set group according to memory_size.");
-DEFINE_int32(
-    fuse_parameter_groups_size, 1,
-    "fuse_parameter_groups_size is the up limited size of one group "
-    "parameters' gradient. "
-    "The default value is a experimental result. If the "
-    "fuse_parameter_groups_size is 1, it means that the groups size is "
-    "the number of parameters' gradient. If the fuse_parameter_groups_size is "
-    "-1, it means that there are only one group. The default value is 3, it is "
-    "an experimental value.");
-
-namespace paddle {
-namespace framework {
-namespace ir {
-// unit of the FLAGS_fuse_parameter_memory_size.
-static constexpr double kMB = 1048576.0;
-
-// SetFuseParameterGroupsSize and SetFuseParameterMemorySize are used in unit
-// test, because it is invalid that seting 'FLAGS_fuse_parameter_memory_size'
-// and 'FLAGS_fuse_parameter_groups_size' in unit test.
-void SetFuseParameterGroupsSize(int group_size) {
-  FLAGS_fuse_parameter_groups_size = group_size;
-}
-
-int GetFuseParameterGroupsSize() { return FLAGS_fuse_parameter_groups_size; }
-
-void SetFuseParameterMemorySize(double memory_size) {
-  FLAGS_fuse_parameter_memory_size = memory_size;
-}
-
-double GetFuseParameterMemorySize() { return FLAGS_fuse_parameter_memory_size; }
-
-class CoalesceGradTensorPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const {
-    if (Get<size_t>(details::kNRanks) <= 1) {
-      VLOG(6) << "The number of place is" << Get<size_t>(details::kNRanks)
-              << ", there doesn't need apply FuseAllReduceOpPass.";
-      return;
-    }
-    ir::Graph &result = *graph;
-    details::ParamsAndGrads params_grads;
-    RecordParamsAndGrads(result, &params_grads);
-
-    ResetAttribute<details::ParamsAndGrads>(details::kParamsAndDenseGrads,
-                                            &result);
-    ResetAttribute<details::ParamsAndGrads>(details::kParamsAndSparseGrads,
-                                            &result);
-    ResetAttribute<details::GroupParamsAndGrads>(
-        details::kGroupParamsAndDenseGrads, &result);
-
-    VLOG(10) << "The number of params and grads is:" << params_grads.size();
-    if (params_grads.size() == 0) {
-      return;
-    }
-
-    auto &p_g_dense_grad =
-        result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
-    auto &p_g_sparse_grad =
-        result.Get<details::ParamsAndGrads>(details::kParamsAndSparseGrads);
-
-    auto vars_info = GetVarInfo(result);
-    for (auto &param_grad : params_grads) {
-      if (IsLoDTensorType(GetTypeOfVar(vars_info, param_grad.second))) {
-        p_g_dense_grad.emplace_back(param_grad);
-      } else {
-        p_g_sparse_grad.emplace_back(param_grad);
-      }
-    }
-
-    VLOG(10) << "Dense grads: " << p_g_dense_grad.size()
-             << ", Sparse grads: " << p_g_sparse_grad.size();
-    if (p_g_dense_grad.size() == 0) {
-      return;
-    }
-
-    auto num_of_p_g_dense_grad = p_g_dense_grad.size();
-    auto &group_params_grads = result.Get<details::GroupParamsAndGrads>(
-        details::kGroupParamsAndDenseGrads);
-    // Note: the order of p_g_dense_grad may be changed by
-    // SetGroupParamsAndGrads.
-    SetGroupParamsAndGrads(vars_info, p_g_dense_grad, &group_params_grads);
-
-    p_g_dense_grad.clear();
-    p_g_dense_grad.reserve(num_of_p_g_dense_grad);
-    for (auto &group_p_g : group_params_grads) {
-      p_g_dense_grad.insert(p_g_dense_grad.end(), group_p_g.begin(),
-                            group_p_g.end());
-    }
-    PADDLE_ENFORCE_EQ(
-        p_g_dense_grad.size(), num_of_p_g_dense_grad,
-        "The number of p_g_dense_grad is not consistent with before.");
-
-    auto &pinned_var_set =
-        graph->GetOrInit<details::PinnedVars>(details::kPinnedVars);
-    if (IsUnifiedDtype(p_g_dense_grad, vars_info)) {
-      RecordGradients(p_g_dense_grad, vars_info, &pinned_var_set);
-      CoalesceTensors(vars_info, p_g_dense_grad, &result);
-    } else {
-      for (auto &sub_param_grad : group_params_grads) {
-        RecordGradients(p_g_dense_grad, vars_info, &pinned_var_set);
-        PADDLE_ENFORCE_EQ(IsUnifiedDtype(sub_param_grad, vars_info), true,
-                          "The data type of the same group is not consistent.");
-        CoalesceTensors(vars_info, sub_param_grad, &result);
-      }
-    }
-  }
-
-  void RecordGradients(
-      const std::vector<std::pair<std::string, std::string>> &sub_param_grad,
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      std::unordered_set<std::string> *pinned_var_set) const {
-    // The Gradients should not be reused during memory optimization.
-    for (auto &p_g : sub_param_grad) {
-      auto iter = vars_info.find(p_g.second);
-      PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.",
-                        p_g.second);
-      PADDLE_ENFORCE_EQ(!iter->second.empty(), true);
-      for (auto it : iter->second) {
-        PADDLE_ENFORCE_NOT_NULL(it->Var());
-        pinned_var_set->insert(it->Var()->Name());
-      }
-      PADDLE_ENFORCE_EQ(IsLoDTensorType(GetTypeOfVar(vars_info, p_g.second)),
-                        true);
-    }
-  }
-
-  bool IsUnifiedDtype(
-      const details::ParamsAndGrads &params_grads,
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info)
-      const {
-    if (params_grads.empty()) return true;
-    auto dtype = GetDtypeOfVar(vars_info, params_grads.front().second);
-    for (auto p_g : params_grads) {
-      auto next_dtype = GetDtypeOfVar(vars_info, p_g.second);
-      if (next_dtype != dtype) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  void CoalesceTensors(
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      const details::ParamsAndGrads &params_grads, Graph *result) const {
-    // Create a FusedVarsSet to avoid duplicating names for fused_var in other
-    // pass.
-    if (!result->Has(details::kFusedVars)) {
-      result->Set(details::kFusedVars, new details::FusedVars);
-    }
-    // the kFusedGrads is used be fuse_optimizer_op_pass.
-    if (!result->Has(details::kFusedGrads)) {
-      result->Set(details::kFusedGrads, new details::FusedGrads);
-    }
-    if (!result->Has(details::kProgramDescs)) {
-      result->Set(details::kProgramDescs, new details::ProgramDescs);
-    }
-    // the fused_var_name should be unique, so it appends
-    // params_grads.begin()->second.
-    auto fused_grad_var_name = std::string(details::kFusedVarNamePrefix) +
-                               "@GRAD@" + params_grads.begin()->second;
-    auto &fused_var_set = result->Get<details::FusedVars>(details::kFusedVars);
-    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_grad_var_name), 0,
-                      "%s is duplicate in FusedVars.", fused_grad_var_name);
-    fused_var_set.insert(fused_grad_var_name);
-    result->Get<details::FusedGrads>(details::kFusedGrads)
-        .emplace_back(fused_grad_var_name);
-
-    InitFusedVarsAndAllocSpaceForVars(vars_info, fused_grad_var_name,
-                                      params_grads, result);
-  }
-
-  template <typename AttrType>
-  void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const {
-    if (graph->Has(attr_name)) {
-      VLOG(10) << attr_name << " is reset.";
-      graph->Erase(attr_name);
-    }
-    graph->Set(attr_name, new AttrType);
-  }
-
-  void SetGroupParamsAndGrads(
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      const details::ParamsAndGrads &params_grads,
-      details::GroupParamsAndGrads *group_params_grads) const {
-    SetGroupAccordingToLayers(vars_info, params_grads, group_params_grads);
-    SetGroupAccordingToMemorySize(vars_info, group_params_grads);
-    if (!IsUnifiedDtype(params_grads, vars_info)) {
-      ReGroupByDtype(vars_info, group_params_grads);
-    }
-  }
-
-  void SetGroupAccordingToLayers(
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      const details::ParamsAndGrads &params_grads,
-      details::GroupParamsAndGrads *group_params_grads) const {
-    std::map<std::string, size_t> var_idx;
-
-    for (size_t i = 0; i < params_grads.size(); ++i) {
-      auto pos = params_grads[i].first.find_first_of(".");
-
-      std::string var_key;
-      if (pos == std::string::npos) {
-        var_key = params_grads[i].first;
-      } else {
-        var_key = params_grads[i].first.substr(0, pos);
-      }
-
-      size_t idx = 0;
-      auto var_idx_iter = var_idx.find(var_key);
-      if (var_idx_iter != var_idx.end()) {
-        idx = var_idx_iter->second;
-      } else {
-        group_params_grads->emplace_back();
-        idx = group_params_grads->size() - 1;
-        var_idx[var_key] = idx;
-      }
-      auto &local_group_params_grads = group_params_grads->at(idx);
-      local_group_params_grads.emplace_back(
-          std::make_pair(params_grads[i].first, params_grads[i].second));
-    }
-
-    if (VLOG_IS_ON(10)) {
-      VLOG(10) << "SetGroupAccordingToLayers: ";
-      PrintGroupInfo(vars_info, group_params_grads);
-    }
-  }
-
-  void PrintGroupInfo(
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      details::GroupParamsAndGrads *group_params_grads) const {
-    for (size_t i = 0; i < group_params_grads->size(); ++i) {
-      VLOG(10) << "group " << i;
-      std::stringstream out;
-      size_t gps_size = 0;
-      for (auto &p_g : group_params_grads->at(i)) {
-        auto var_desc = GetVarDescFromVarsInfo(vars_info, p_g.first);
-        auto shape = var_desc->GetShape();
-        size_t size = framework::SizeOfType(var_desc->GetDataType());
-        std::for_each(shape.begin(), shape.end(),
-                      [&size](const int64_t &n) { size *= n; });
-        gps_size += size;
-        out << string::Sprintf("(%s(%d), %s)", p_g.first, size, p_g.second);
-      }
-
-      auto dtype =
-          GetDtypeOfVar(vars_info, group_params_grads->at(i).front().first);
-      VLOG(10) << out.str()
-               << ", group size:" << group_params_grads->at(i).size()
-               << ", group memory size:" << static_cast<double>(gps_size) / kMB
-               << "(MB), dtype:" << dtype;
-    }
-  }
-
-  void SetGroupAccordingToMemorySize(
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      details::GroupParamsAndGrads *group_params_grads) const {
-    const double group_memory_size = GetFuseParameterMemorySize();
-    if (group_memory_size <= 0.0) {
-      return;
-    }
-    details::GroupParamsAndGrads local_group_params_grads;
-
-    size_t j = 0;
-    while (j < group_params_grads->size()) {
-      local_group_params_grads.emplace_back();
-      auto &group_p_g = local_group_params_grads.back();
-
-      size_t local_group_memory_size = 0;
-      while (j < group_params_grads->size()) {
-        for (auto &p_g_iter : group_params_grads->at(j)) {
-          auto var_desc = GetVarDescFromVarsInfo(vars_info, p_g_iter.second);
-          size_t size = framework::SizeOfType(var_desc->GetDataType());
-          auto shape = var_desc->GetShape();
-          std::for_each(shape.begin(), shape.end(),
-                        [&size](const int64_t &n) { size *= n; });
-          local_group_memory_size += size;
-        }
-
-        group_p_g.insert(group_p_g.end(), group_params_grads->at(j).begin(),
-                         group_params_grads->at(j).end());
-
-        ++j;
-        if (j >= group_params_grads->size()) {
-          break;
-        }
-
-        if (GetFuseParameterGroupsSize() > 1 &&
-            group_p_g.size() >
-                static_cast<size_t>(GetFuseParameterGroupsSize())) {
-          break;
-        }
-
-        if (static_cast<double>(local_group_memory_size) / kMB >=
-            group_memory_size) {
-          break;
-        }
-      }
-    }
-
-    std::swap(*group_params_grads, local_group_params_grads);
-
-    if (VLOG_IS_ON(10)) {
-      VLOG(10) << string::Sprintf(
-          "SetGroupAccordingToMemorySize(memory_size: %f MB):",
-          GetFuseParameterMemorySize());
-      PrintGroupInfo(vars_info, group_params_grads);
-    }
-  }
-
-  void ReGroupByDtype(
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      details::GroupParamsAndGrads *group_params_grads) const {
-    details::GroupParamsAndGrads new_group_params_grads;
-
-    for (auto &group_p_g : *group_params_grads) {
-      std::map<proto::VarType::Type, size_t> type_idx;
-      details::GroupParamsAndGrads local_group_params_grads;
-
-      for (auto &p_g : group_p_g) {
-        auto dtype = GetDtypeOfVar(vars_info, p_g.second);
-
-        size_t idx = 0;
-        auto var_idx_iter = type_idx.find(dtype);
-        if (var_idx_iter != type_idx.end()) {
-          idx = var_idx_iter->second;
-        } else {
-          local_group_params_grads.emplace_back();
-          idx = local_group_params_grads.size() - 1;
-          type_idx[dtype] = idx;
-        }
-
-        auto &local = local_group_params_grads.at(idx);
-        local.emplace_back(p_g);
-      }
-
-      VLOG(10) << "local_group_params_grads size:"
-               << local_group_params_grads.size();
-      new_group_params_grads.insert(new_group_params_grads.end(),
-                                    local_group_params_grads.begin(),
-                                    local_group_params_grads.end());
-    }
-
-    std::swap(*group_params_grads, new_group_params_grads);
-
-    if (VLOG_IS_ON(10)) {
-      VLOG(10) << string::Sprintf("ReGroupByDtype(memory_size: %f MB, %u):",
-                                  GetFuseParameterMemorySize(),
-                                  GetFuseParameterGroupsSize());
-      PrintGroupInfo(vars_info, group_params_grads);
-    }
-  }
-
-  proto::VarType::Type GetDtypeOfVar(
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      const std::string &name) const {
-    auto var_desc = GetVarDescFromVarsInfo(vars_info, name);
-    return var_desc->GetDataType();
-  }
-
-  proto::VarType::Type GetTypeOfVar(
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      const std::string &name) const {
-    auto var_desc = GetVarDescFromVarsInfo(vars_info, name);
-    return var_desc->GetType();
-  }
-
- private:
-  bool IsLoDTensorType(const proto::VarType::Type &type) const {
-    // Current only support LOD_TENSOR.
-    return type == proto::VarType::LOD_TENSOR;
-  }
-
-  std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
-      const Graph &result) const {
-    std::unordered_map<std::string, std::vector<Node *>> vars;
-    for (Node *node : result.Nodes()) {
-      if (node->IsVar() && node->Var()) {
-        // Note: The graph may have the same name node. For example, parameter
-        // is the input of operator and it also is the output of optimizer;
-        vars[node->Var()->Name()].emplace_back(node);
-      }
-    }
-    return vars;
-  }
-
-  const VarDesc *GetVarDescFromVarsInfo(
-      const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
-      const std::string &var_name) const {
-    auto grad_iter = vars_info.find(var_name);
-    PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, "%s is not found.",
-                      var_name);
-    PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "%s is not found.",
-                      var_name);
-    PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
-    return grad_iter->second.front()->Var();
-  }
-
-  void RecordParamsAndGrads(const ir::Graph &graph,
-                            details::ParamsAndGrads *params_grads) const {
-    std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(graph);
-    for (auto &node : topo_nodes) {
-      try {
-        bool is_bk_op =
-            static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
-                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                              static_cast<int>(OpRole::kBackward));
-        if (!is_bk_op) continue;
-        // Currently, we assume that once gradient is generated, it can be
-        // broadcast, and each gradient is only broadcast once.
-        auto backward_vars =
-            boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
-                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
-        for (size_t i = 0; i < backward_vars.size(); i += 2) {
-          VLOG(10) << "Trainable parameter: " << backward_vars[i]
-                   << ", gradient: " << backward_vars[i + 1];
-
-          params_grads->emplace_back(std::make_pair(
-              backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/));
-        }
-      } catch (boost::bad_get e) {
-      }
-    }
-  }
-
-  void InitFusedVarsAndAllocSpaceForVars(
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
-      const std::string &fused_var_name,
-      const details::ParamsAndGrads &params_grads, ir::Graph *result) const {
-    // Alloc continuous space for vars.
-    std::vector<std::string> grads_name;
-    std::vector<std::string> params_name;
-    grads_name.reserve(params_grads.size());
-    params_name.reserve(params_grads.size());
-    for (auto &p_g : params_grads) {
-      params_name.emplace_back(p_g.first);
-      grads_name.emplace_back(p_g.second);
-    }
-
-    result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
-    ProgramDesc &program_desc =
-        result->Get<details::ProgramDescs>(details::kProgramDescs).back();
-    auto *global_block = program_desc.MutableBlock(0);
-    AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
-                              global_block);
-  }
-
-  void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
-                                 const std::vector<std::string> &grads_name,
-                                 const std::string &fused_var_name,
-                                 BlockDesc *global_block) const {
-    auto op_desc = global_block->AppendOp();
-    op_desc->SetType("coalesce_tensor");
-    op_desc->SetInput("Input", params_name);
-    op_desc->SetOutput("Output", grads_name);
-    op_desc->SetOutput("FusedOutput", {fused_var_name});
-  }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(coalesce_grad_tensor_pass,
-              paddle::framework::ir::CoalesceGradTensorPass)
-    .RequirePassAttr(paddle::framework::details::kNRanks);
diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h
deleted file mode 100644
index 38dc4c99fc27f03d64704b479478065b636af63a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <algorithm>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetFuseParameterGroupsSize(int group_size);
-int GetFuseParameterGroupsSize();
-
-void SetFuseParameterMemorySize(double memory_size);
-double GetFuseParameterMemorySize();
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/codegen.cc b/paddle/fluid/framework/ir/codegen.cc
deleted file mode 100644
index c3e5efccba570192453d4336ea36a9a550e5be4d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/codegen.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/framework/ir/codegen.h"
-#include <set>
-#include <sstream>
-#include "paddle/fluid/framework/ir/codegen_helper.h"
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// we get the parameter list code for the expression information
-std::string CodeGen::GetDeclarationCode(
-    std::vector<OperationExpression> expression) {
-  std::stringstream ret;
-  ret << "fuse_kernel";
-  ret << R"((int N )";
-  std::set<int> input_ids;
-  std::set<int> output_ids;
-  std::vector<int> last_output_idis;
-
-  for (size_t i = 0; i < expression.size(); i++) {
-    std::vector<int> tmp_input = expression[i].GetInputIds();
-    for (size_t j = 0; j < tmp_input.size(); j++) {
-      int id = tmp_input[j];
-      input_ids.insert(id);
-    }
-    int tmp_output = expression[i].GetOutputId();
-    output_ids.insert(tmp_output);
-  }
-
-  std::set<int>::iterator it = input_ids.begin();
-  while (it != input_ids.end()) {
-    int var_index = *it;
-    if (output_ids.find(var_index) != output_ids.end()) {
-      input_ids.erase(it++);
-    } else {
-      it++;
-    }
-  }
-
-  for (it = input_ids.begin(); it != input_ids.end(); it++) {
-    int var_index = *it;
-    ret << R"(, const T* var)" << var_index;
-  }
-
-  for (it = output_ids.begin(); it != output_ids.end(); it++) {
-    int var_index = *it;
-    ret << R"(, T* var)" << var_index;
-  }
-
-  ret << R"())";
-
-  return ret.str();
-}
-
-std::string CodeGen::GetOffsetCode() {
-  std::stringstream ret;
-  ret << indentation << "int offset = idx;" << std::endl;
-  return ret.str();
-}
-
-std::string CodeGen::GetComputeCode(
-    std::vector<OperationExpression> expression) {
-  // get the right experssion code using suffix expression
-  std::stringstream ret;
-  for (size_t i = 0; i < expression.size(); i++) {
-    ret << expression[i].GetExpression();
-  }
-  return ret.str();
-}
-// in order to get the right result of expression, we need to calculate, we
-// store the expression as
-// suffix Expressions using vector
-std::string CodeGen::GetKernelCode(
-    std::vector<OperationExpression> expression) {
-  auto declaration_code = GetDeclarationCode(expression);
-  auto offset_code = GetOffsetCode();
-  auto compute_code = GetComputeCode(expression);
-  auto cuda_kernel = const_kernel_start + declaration_code + const_kernel_mid +
-                     offset_code + compute_code + const_kernel_end;
-  return cuda_kernel;
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/codegen.h b/paddle/fluid/framework/ir/codegen.h
deleted file mode 100644
index 975d48885e72a3b6f6aa5cf89fa943118593834e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/codegen.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ir/codegen_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class CodeGen {
- public:
-  std::string GetKernelCode(std::vector<OperationExpression> expression);
-
- private:
-  std::string GetDeclarationCode(
-      std::vector<paddle::framework::ir::OperationExpression> expression);
-  std::string GetOffsetCode();
-  std::string GetComputeCode(
-      std::vector<paddle::framework::ir::OperationExpression> expression);
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/codegen_helper.cc b/paddle/fluid/framework/ir/codegen_helper.cc
deleted file mode 100644
index 8f14549eb717835063bba66503c269729ca2773d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/codegen_helper.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-#include "paddle/fluid/framework/ir/codegen_helper.h"
-#include <algorithm>
-#include <sstream>
-#include <string>
-#include <vector>
-namespace paddle {
-namespace framework {
-namespace ir {
-
-OperationExpression::OperationExpression(std::vector<int> input_ids,
-                                         int output_id,
-                                         std::string search_operation) {
-  input_ids_ = input_ids;
-  output_id_ = output_id;
-  search_operation_ = search_operation;
-}
-
-// we Traverse the graph and get the group , all input id and output id is
-// unique for the node which belong the group
-std::string OperationExpression::GetExpression() {
-  std::stringstream ret;
-  if (operator_cuda_table.find(search_operation_) ==
-      operator_cuda_table.end()) {
-    std::cerr << "Not supportted operation, " << search_operation_ << std::endl;
-  } else {
-    auto rhs = operator_cuda_table[search_operation_];
-    std::string replaced_str = "$";
-    int count = 0;
-    auto pos = rhs.find(replaced_str);
-    while (pos != -1) {
-      auto index = input_ids_[count];
-      rhs.replace(pos, replaced_str.length(),
-                  std::to_string(index) + R"([offset])");
-      pos = rhs.find(replaced_str);
-      count++;
-    }
-    auto lhs = std::string(indentation) + "var" + std::to_string(output_id_) +
-               R"([offset])";
-    auto equal_split = R"( = )";
-    auto semicolon = R"(;)";
-    ret << lhs << equal_split << rhs << semicolon << std::endl;
-  }
-
-  return ret.str();
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/codegen_helper.h b/paddle/fluid/framework/ir/codegen_helper.h
deleted file mode 100644
index be8d3c8ac26fcde9e8964475709d604822c70688..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/codegen_helper.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-static std::unordered_map<std::string, std::string> operator_cuda_table = {
-    {"elementwise_add", "var$ + var$"},
-    {"elementwise_sub", "var$ - var$"},
-    {"elementwise_mul", "var$ * var$"},
-    {"elementwise_div", "var$ / var$"},
-    {"elementwise_min", "real_min(var$, var$)"},
-    {"elementwise_max", "real_max(var$, var$)"},
-    {"relu", "real_max(var$, 0)"},
-    {"sigmoid", "1.0 / (1.0 + real_exp(-var$))"}};
-
-// op computation is composed by single or many operation
-class OperationExpression {
- public:
-  OperationExpression(std::vector<int> input_ids, int output_id,
-                      std::string search_oprtation);
-  std::string GetExpression();
-  std::vector<int> GetInputIds() { return input_ids_; }
-  int GetOutputId() { return output_id_; }
-
- private:
-  std::vector<int> input_ids_;
-  int output_id_;
-  std::string search_operation_;
-};
-
-static const char indentation[] = R"(    )";
-
-static const char const_kernel_start[] = R"(
-template <typename T>
-extern "C" __global__ void
-)";
-
-static const char const_kernel_mid[] = R"(
-{
-  for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
-      idx < N;
-      idx += gridDim.x * blockDim.x) {
-
-)";
-
-static const char const_kernel_end[] = R"(
-}
-}
-)";
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/codegen_test.cc b/paddle/fluid/framework/ir/codegen_test.cc
deleted file mode 100644
index 8fd5fde3df2c1a1876b346f747f9158a3d40499b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/codegen_test.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/codegen.h"
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ir/codegen_helper.h"
-#ifdef PADDLE_WITH_CUDA
-TEST(codegen, cuda) {
-  std::vector<int> mul_input{1, 2};
-  std::vector<int> add_input{3, 4};
-  std::vector<int> sigmod_input{5};
-  int mul_out = 3;
-  int add_out = 5;
-  int sigmod_out = 6;
-
-  std::string op1 = "elementwise_mul";
-  std::string op2 = "elementwise_add";
-  std::string op3 = "sigmoid";
-  paddle::framework::ir::OperationExpression opexp1(mul_input, mul_out, op1);
-  paddle::framework::ir::OperationExpression opexp2(add_input, add_out, op2);
-  paddle::framework::ir::OperationExpression opexp3(sigmod_input, sigmod_out,
-                                                    op3);
-
-  std::vector<paddle::framework::ir::OperationExpression> fused_op = {
-      opexp1, opexp2, opexp3};
-  paddle::framework::ir::CodeGen codegen;
-  std::string result = codegen.GetKernelCode(fused_op);
-  std::cout << result << std::endl;
-}
-#endif
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
deleted file mode 100644
index fecc159adef1992a90b6ee88b3b7ffceea116243..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ /dev/null
@@ -1,218 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
-#include <functional>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-#define GET_CONV_BN_NODES(pattern_name)                                    \
-  /* OPERATORS */                                                          \
-  GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name);                     \
-  GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
-  /* CONV inputs */                                                        \
-  GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name);       \
-  /* CONV outputs */                                                       \
-  GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name);             \
-  /* Affine Channel inputs */                                              \
-  GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name);             \
-  GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name);               \
-  /* Affine channel outputs */                                             \
-  GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
-
-void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
-                                const ir::Node& ac_scale,
-                                const LoDTensor& ac_bias_tensor,
-                                LoDTensor* eltwise_y_in_tensor) {
-  using EigenVectorArrayMap =
-      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
-  using ConstEigenVectorArrayMap =
-      Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
-  using EigenMatrixArrayMap = Eigen::Map<
-      Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-
-  // Re-compute bias of conv2d from AffineChannel
-  PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims());
-
-  auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
-
-  ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
-                                       scale_tensor->numel(), 1);
-  ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
-                                         ac_bias_tensor.numel(), 1);
-
-  EigenVectorArrayMap eltwise_y_in_array(
-      eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
-      eltwise_y_in_tensor->numel(), 1);
-
-  eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
-
-  // Re-compute weight of conv2d from AffineChannel
-  auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
-  auto weights_shape = weights->dims();
-  auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
-
-  EigenMatrixArrayMap weights_array_2d(
-      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
-      weights_shape_2d[1]);
-
-  weights_array_2d.colwise() *= scale_array;
-}
-
-void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init(name_scope_, graph);
-
-  auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
-
-  GraphPatternDetector gpd;
-  auto* conv_input =
-      gpd.mutable_pattern()
-          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
-          ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
-  patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
-                                              name_scope_);
-  conv_ac_pattern(conv_input, false /*with_eltwise_add*/);
-
-  int found_conv_ac_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle ConvAffineChannel fuse";
-
-    GET_CONV_BN_NODES(conv_ac_pattern);
-
-    // check if fuse can be done and if MKL-DNN should be used
-    FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel);
-    if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+affinechannel fuse";
-      return;
-    }
-
-    // Create eltwise_y (conv bias) variable
-    VarDesc eltwise_y_in_desc(
-        patterns::PDNodeName(name_scope_, "eltwise_y_in"));
-    eltwise_y_in_desc.SetPersistable(true);
-    auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
-    auto* eltwise_y_in_tensor =
-        scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
-
-    // Get affine_channel bias
-    auto* ac_bias_tensor =
-        scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
-
-    // Initialize eltwise_y
-    eltwise_y_in_tensor->Resize(ac_bias_tensor->dims());
-    std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
-                eltwise_y_in_tensor->numel(), 0.0f);
-
-    // update weights and biases
-    recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
-                               eltwise_y_in_tensor);
-
-    // create an elementwise add node.
-    OpDesc desc;
-    desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
-    desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
-    desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
-    desc.SetType("elementwise_add");
-    desc.SetAttr("axis", 1);
-    auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
-
-    GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
-
-    IR_NODE_LINK_TO(conv_out, eltwise_op);
-    IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
-    IR_NODE_LINK_TO(eltwise_op, ac_out);
-    found_conv_ac_count++;
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_conv_ac_count);
-}
-
-void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init(name_scope_, graph);
-
-  auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
-
-  GraphPatternDetector gpd;
-  auto* conv_input =
-      gpd.mutable_pattern()
-          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
-          ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
-  patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
-                                              name_scope_);
-  conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
-
-  int found_conv_ac_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle ConvBN fuse";
-
-    GET_CONV_BN_NODES(conv_ac_pattern);
-    // OPERATORS
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
-    // BIAS inputs
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
-    // BIAS outputs
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
-
-    // Get eltwise_y (conv bias) variable
-    auto* eltwise_y_in_tensor =
-        scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
-
-    // Get batch norm bias
-    auto* ac_bias_tensor =
-        scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
-
-    recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
-                               eltwise_y_in_tensor);
-
-    // Update the elementwise_add node
-    eltwise->Op()->SetAttr("axis", 1);
-    eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
-
-    GraphSafeRemoveNodes(graph,
-                         {ac_scale, ac_bias, affine_channel, eltwise_out});
-
-    IR_NODE_LINK_TO(eltwise, ac_out);
-
-    found_conv_ac_count++;
-  };
-
-  gpd(graph, handler);
-  AddStatis(found_conv_ac_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_affine_channel_fuse_pass,
-              paddle::framework::ir::ConvAffineChannelFusePass);
-REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
-              paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
deleted file mode 100644
index d607020a47b8c589775ac763f04e64272dfec4e0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Fuse the Conv and ConvAffineChannel.
- */
-class ConvAffineChannelFusePass : public FusePassBase {
- public:
-  virtual ~ConvAffineChannelFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph*) const override;
-  const std::string name_scope_{"conv_affine_channel_fuse"};
-};
-
-class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
- public:
-  virtual ~ConvEltwiseAddAffineChannelFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph*) const override;
-  const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
deleted file mode 100644
index 4fe3fb4f3dc5e1258f34cefe4c1f642b37e05936..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h"
-#include <functional>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-#define GET_CONV_BN_NODES(pattern_name)                                      \
-  /* OPERATORS */                                                            \
-  GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name);                       \
-  GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, pattern_name);           \
-  /* CONV inputs */                                                          \
-  GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name);         \
-  /* CONV outputs */                                                         \
-  GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name);               \
-  /* BN inputs */                                                            \
-  GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, pattern_name);               \
-  GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, pattern_name);                 \
-  GET_IR_NODE_FROM_SUBGRAPH(bn_mean, bn_mean, pattern_name);                 \
-  GET_IR_NODE_FROM_SUBGRAPH(bn_variance, bn_variance, pattern_name);         \
-  /* BN outputs */                                                           \
-  GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, pattern_name); /* Out */         \
-  GET_IR_NODE_FROM_SUBGRAPH(bn_mean_out, bn_mean_out, pattern_name);         \
-  GET_IR_NODE_FROM_SUBGRAPH(bn_variance_out, bn_variance_out, pattern_name); \
-  GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name);     \
-  GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
-
-void recompute_bias_and_weights(const Scope* scope,
-                                ir::Node* conv_weight,            //
-                                const ir::Node& bn_scale,         //
-                                const LoDTensor& bn_bias_tensor,  //
-                                const ir::Node& bn_mean,          //
-                                const ir::Node& bn_variance,      //
-                                LoDTensor* eltwise_y_in_tensor,   //
-                                float epsilon) {
-  using EigenVectorArrayMap =
-      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
-  using ConstEigenVectorArrayMap =
-      Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
-  using EigenMatrixArrayMap = Eigen::Map<
-      Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-
-  // Re-compute bias of conv2d from BN
-  PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims());
-
-  auto* scale_tensor = scope->FindVar(bn_scale.Name())->GetMutable<LoDTensor>();
-  auto* variance_tensor =
-      scope->FindVar(bn_variance.Name())->GetMutable<LoDTensor>();
-  auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable<LoDTensor>();
-
-  ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
-                                       scale_tensor->numel(), 1);
-  EigenVectorArrayMap variance_array(
-      variance_tensor->mutable_data<float>(platform::CPUPlace()),
-      variance_tensor->numel(), 1);
-  ConstEigenVectorArrayMap mean_array(mean_tensor->data<float>(),
-                                      mean_tensor->numel(), 1);
-  ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data<float>(),
-                                         bn_bias_tensor.numel(), 1);
-
-  // variance will not be used anymore, so make it std_array and then tmp_array
-  variance_array += epsilon;
-  variance_array = variance_array.sqrt();
-  variance_array = scale_array / variance_array;
-
-  EigenVectorArrayMap eltwise_y_in_array(
-      eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
-      eltwise_y_in_tensor->numel(), 1);
-
-  eltwise_y_in_array =
-      ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array;
-
-  // Re-compute weight of conv2d from BN
-  auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
-  auto weights_shape = weights->dims();
-  auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
-
-  EigenMatrixArrayMap weights_array_2d(
-      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
-      weights_shape_2d[1]);
-
-  weights_array_2d.colwise() *= variance_array;
-}
-
-void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init(name_scope_, graph);
-
-  auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
-
-  GraphPatternDetector gpd;
-  auto* conv_input =
-      gpd.mutable_pattern()
-          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
-          ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
-  patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_);
-  conv_bn_pattern(conv_input, false /*with_eltwise_add*/);
-
-  int found_conv_bn_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle ConvBN fuse";
-
-    // conv, batch_norm,
-    // conv_weight, conv_out,
-    // bn_scale, bn_bias, bn_mean, bn_variance,
-    // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,
-    // bn_saved_variance
-    GET_CONV_BN_NODES(conv_bn_pattern);
-
-    // check if fuse can be done and if MKL-DNN should be used
-    FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
-    if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+bn fuse";
-      return;
-    }
-
-    // Get batch norm bias
-    auto* bn_bias_tensor =
-        scope->FindVar(bn_bias->Name())->GetMutable<LoDTensor>();
-
-    // Create eltwise_y (conv bias) variable
-    VarDesc eltwise_y_in_desc(
-        patterns::PDNodeName(name_scope_, "eltwise_y_in"));
-    eltwise_y_in_desc.SetShape(framework::vectorize(bn_bias_tensor->dims()));
-    eltwise_y_in_desc.SetDataType(bn_bias_tensor->type());
-    eltwise_y_in_desc.SetLoDLevel(bn_bias->Var()->GetLoDLevel());
-    eltwise_y_in_desc.SetPersistable(true);
-    auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
-    auto* eltwise_y_in_tensor =
-        scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
-
-    // Initialize eltwise_y
-    eltwise_y_in_tensor->Resize(bn_bias_tensor->dims());
-    std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
-                eltwise_y_in_tensor->numel(), 0.0f);
-
-    // update weights and biases
-    float epsilon = boost::get<float>(batch_norm->Op()->GetAttr("epsilon"));
-    recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor,
-                               *bn_mean, *bn_variance, eltwise_y_in_tensor,
-                               epsilon);
-
-    // with MKL-DNN fuse conv+bn into conv with bias
-    // without MKL-DNN fuse conv+bn into conv+elementwise_add
-    if (fuse_option == FUSE_MKLDNN) {
-      auto input_names = conv->Op()->InputNames();
-      bool has_bias = std::find(input_names.begin(), input_names.end(),
-                                "Bias") != input_names.end();
-      if (has_bias && conv->Op()->Input("Bias").size() > 0) {
-        // reuse existing conv bias node
-        auto conv_bias_names = conv->Op()->Input("Bias");
-        PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL);
-        auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
-        auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
-        PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
-                          eltwise_y_in_tensor->dims());
-
-        auto eigen_conv_bias = EigenVector<float>::From(*conv_bias_tensor);
-        eigen_conv_bias += EigenVector<float>::From(*eltwise_y_in_tensor);
-      } else {
-        // add new conv_bias node
-        conv->Op()->SetInput(
-            "Bias", std::vector<std::string>({eltwise_y_in_node->Name()}));
-        IR_NODE_LINK_TO(eltwise_y_in_node, conv);
-      }
-      conv->Op()->SetOutput("Output",
-                            std::vector<std::string>({bn_out->Name()}));
-
-      GraphSafeRemoveNodes(
-          graph,
-          {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm,
-           bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance});
-
-      IR_NODE_LINK_TO(conv, bn_out);
-      found_conv_bn_count++;
-    } else {  // fuse_option == FUSE_NATIVE
-      // create an elementwise add node.
-      OpDesc desc;
-      desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
-      desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
-      desc.SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
-      desc.SetType("elementwise_add");
-      desc.SetAttr("axis", 1);
-      auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
-
-      GraphSafeRemoveNodes(graph, {bn_scale, bn_bias, bn_mean, bn_variance,
-                                   batch_norm, bn_mean_out, bn_variance_out,
-                                   bn_saved_mean, bn_saved_variance});
-
-      IR_NODE_LINK_TO(conv_out, eltwise_op);
-      IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
-      IR_NODE_LINK_TO(eltwise_op, bn_out);
-      found_conv_bn_count++;
-    }
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_conv_bn_count);
-}
-
-void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init(name_scope_, graph);
-
-  auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
-
-  GraphPatternDetector gpd;
-  auto* conv_input =
-      gpd.mutable_pattern()
-          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
-          ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
-  patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_);
-  conv_bn_pattern(conv_input, true /*with_eltwise_add*/);
-
-  int found_conv_bn_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle ConvBN fuse";
-
-    // conv, batch_norm,
-    // conv_weight, conv_out,
-    // bn_scale, bn_bias, bn_mean, bn_variance,
-    // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,bn_saved_variance
-    GET_CONV_BN_NODES(conv_bn_pattern);
-    // OPERATORS
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bn_pattern);
-    // BIAS inputs
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_bn_pattern);
-    // BIAS outputs
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bn_pattern);
-
-    // Get eltwise_y (conv bias) variable
-    auto* eltwise_y_in_tensor =
-        scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
-
-    // Get batch norm bias
-    auto* bn_bias_tensor =
-        scope->FindVar(bn_bias->Name())->GetMutable<LoDTensor>();
-
-    // update weights and biases
-    float epsilon = boost::get<float>(batch_norm->Op()->GetAttr("epsilon"));
-    recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor,
-                               *bn_mean, *bn_variance, eltwise_y_in_tensor,
-                               epsilon);
-
-    // Update the elementwise_add node
-    eltwise->Op()->SetAttr("axis", 1);
-    eltwise->Op()->SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
-
-    GraphSafeRemoveNodes(
-        graph,
-        {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
-         bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out});
-
-    IR_NODE_LINK_TO(eltwise, bn_out);
-
-    found_conv_bn_count++;
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_conv_bn_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_bn_fuse_pass, paddle::framework::ir::ConvBNFusePass);
-REGISTER_PASS(conv_eltwiseadd_bn_fuse_pass,
-              paddle::framework::ir::ConvEltwiseAddBNFusePass);
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
deleted file mode 100644
index 837a48ed7305f4176fc709ab2cb4edf68aeb9fa1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Fuse the Conv and BatchNorm to a ConvBNMKLDNNOp.
- */
-class ConvBNFusePass : public FusePassBase {
- public:
-  virtual ~ConvBNFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-  const std::string name_scope_{"conv_bn_fuse"};
-};
-
-class ConvEltwiseAddBNFusePass : public FusePassBase {
- public:
-  virtual ~ConvEltwiseAddBNFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-  const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
deleted file mode 100644
index b4d6f683ce747a35aea7b431165911d942bcf092..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
-#include <string>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
-#define GET_NODES                      \
-  GET_IR_NODE(conv_op);                \
-  GET_IR_NODE(conv_out);               \
-  GET_IR_NODE(conv_filter);            \
-  GET_IR_NODE(elementwise_add_op);     \
-  GET_IR_NODE(elementwise_add_in_y);   \
-  GET_IR_NODE(elementwise_add_out);    \
-  GET_IR_NODE(elementwise_add_op_1);   \
-  GET_IR_NODE(elementwise_add_in_y_1); \
-  GET_IR_NODE(elementwise_add_out_1);  \
-  GET_IR_NODE(act_op);                 \
-  GET_IR_NODE(act_out);
-
-// Inherient the basic infomation from `base_desc`, and modify some fields.
-framework::proto::OpDesc PrepareOpDesc(
-    const framework::proto::OpDesc& base_desc, const std::string& bias,
-    const std::string& bias1, const std::string& activation,
-    const std::string& output) {
-  auto proto = base_desc;
-  framework::OpDesc desc(proto, nullptr);
-  desc.SetType("conv2d_fusion");
-  desc.SetInput("Bias", {bias});
-  desc.SetInput("ResidualData", {bias1});
-  desc.SetAttr("activation", activation);
-  desc.SetOutput("Output", {output});
-  desc.SetAttr("is_test", true);
-  desc.SetAttr("use_cudnn", false);
-  desc.Flush();
-  return *desc.Proto();
-}
-
-void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
-  const std::string pattern_name = "conv_elementwise_add2_act_fuse";
-  FusePassBase::Init(pattern_name, graph);
-
-  GraphPatternDetector gpd;
-  auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
-      "conv2d", "Input");
-
-  patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(x);
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_NODES;
-
-    auto base_op_desc = *conv_op->Op()->Proto();
-    std::string bias_name = elementwise_add_in_y->Name();
-    std::string bias1_name = elementwise_add_in_y_1->Name();
-    std::string act_op_type = act_op->Op()->Type();
-    std::string act_op_out = act_out->Name();
-
-    auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name,
-                                      act_op_type, act_op_out);
-    framework::OpDesc new_op_desc(new_op_proto, nullptr);
-
-    // Create a new node for the fused op.
-    auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
-
-    // Link inputs and outputs.
-    PADDLE_ENFORCE(subgraph.count(x));
-    auto* conv_in_node = subgraph.at(x);
-
-    IR_NODE_LINK_TO(conv_in_node, new_conv_op);            // Input
-    IR_NODE_LINK_TO(conv_filter, new_conv_op);             // Filter
-    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);    // Bias
-    IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op);  // Bias
-    IR_NODE_LINK_TO(new_conv_op, act_out);                 // Output
-
-    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(
-        graph, {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
-                elementwise_add_out, elementwise_add_out_1, act_op});
-  };
-  gpd(graph, handler);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
-              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
deleted file mode 100644
index ea9e465d8d765a298215db29c77aa58e727fd15e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class ConvElementwiseAdd2ActFusePass : public FusePassBase {
- public:
-  virtual ~ConvElementwiseAdd2ActFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
deleted file mode 100644
index ba0a2fb96458bd70105fa4d97114b609657b62f6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
-#include <string>
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
-#define GET_NODES                    \
-  GET_IR_NODE(conv_op);              \
-  GET_IR_NODE(conv_out);             \
-  GET_IR_NODE(conv_filter);          \
-  GET_IR_NODE(elementwise_add_op);   \
-  GET_IR_NODE(elementwise_add_in_y); \
-  GET_IR_NODE(elementwise_add_out);  \
-  GET_IR_NODE(act_op);               \
-  GET_IR_NODE(act_out);
-
-// Inherient the basic infomation from `base_desc`, and modify some fields.
-framework::proto::OpDesc PrepareOpDesc(
-    const framework::proto::OpDesc& base_desc, const std::string& bias,
-    const std::string& activation, const std::string& output) {
-  auto proto = base_desc;
-  framework::OpDesc desc(proto, nullptr);
-  desc.SetType("conv2d_fusion");
-  desc.SetInput("Bias", {bias});
-  desc.SetInput("ResidualData", {});
-  desc.SetAttr("activation", activation);
-  desc.SetOutput("Output", {output});
-  desc.SetAttr("is_test", true);
-  desc.SetAttr("use_cudnn", false);
-  desc.Flush();
-  return *desc.Proto();
-}
-
-void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
-  const std::string pattern_name = "conv_elementwise_add_act_fuse";
-  FusePassBase::Init(pattern_name, graph);
-
-  GraphPatternDetector gpd;
-  auto* x = gpd.mutable_pattern()
-                ->NewNode("x")
-                ->assert_is_op_input("conv2d", "Input")
-                ->AsInput();
-
-  patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(x);
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_NODES;
-
-    auto base_op_desc = *conv_op->Op()->Proto();
-    std::string bias_name = elementwise_add_in_y->Name();
-    std::string act_op_type = act_op->Op()->Type();
-    std::string act_op_out = act_out->Name();
-
-    auto new_op_proto =
-        PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out);
-    framework::OpDesc new_op_desc(new_op_proto, nullptr);
-
-    // Create a new node for the fused op.
-    auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
-
-    // Link inputs and outputs.
-    PADDLE_ENFORCE(subgraph.count(x));
-    auto* conv_in_node = subgraph.at(x);
-
-    IR_NODE_LINK_TO(conv_in_node, new_conv_op);          // Input
-    IR_NODE_LINK_TO(conv_filter, new_conv_op);           // Filter
-    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);  // Bias
-    IR_NODE_LINK_TO(new_conv_op, act_out);               // Output
-
-    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op,
-                                 elementwise_add_out, act_op});
-  };
-
-  gpd(graph, handler);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
-              paddle::framework::ir::ConvElementwiseAddActFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
deleted file mode 100644
index 8b34c3551d8f9b54f01e52cc0fc896901cd7df99..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class ConvElementwiseAddActFusePass : public FusePassBase {
- public:
-  virtual ~ConvElementwiseAddActFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
deleted file mode 100644
index 8c491d4f58b4d3a1d93fe075fd0d118feeb6f8c2..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-
-#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
-#define GET_NODES                    \
-  GET_IR_NODE(conv_op);              \
-  GET_IR_NODE(conv_out);             \
-  GET_IR_NODE(conv_filter);          \
-  GET_IR_NODE(elementwise_add_op);   \
-  GET_IR_NODE(elementwise_add_in_y); \
-  GET_IR_NODE(elementwise_add_out);
-
-void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
-  const std::string pattern_name = "conv_elementwise_add_fuse";
-  FusePassBase::Init(pattern_name, graph);
-
-  GraphPatternDetector gpd;
-  auto* x = gpd.mutable_pattern()
-                ->NewNode("x")
-                ->assert_is_op_input("conv2d", "Input")
-                ->AsInput();
-
-  patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(x);
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_NODES;
-
-    auto base_op_desc = *conv_op->Op()->Proto();
-    std::string bias_name = elementwise_add_in_y->Name();
-    std::string output_name = elementwise_add_out->Name();
-
-    std::string act_type = "identity";
-    framework::OpDesc new_op_desc(base_op_desc, nullptr);
-    new_op_desc.SetType("conv2d_fusion");
-    new_op_desc.SetInput("Bias", {bias_name});
-    new_op_desc.SetInput("ResidualData", {});
-    new_op_desc.SetAttr("activation", act_type);
-    new_op_desc.SetOutput("Output", {output_name});
-    new_op_desc.SetAttr("is_test", true);
-    new_op_desc.SetAttr("use_cudnn", false);
-    new_op_desc.Flush();
-
-    // Create a new node for the fused op.
-    auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
-
-    // Link inputs and outputs.
-    PADDLE_ENFORCE(subgraph.count(x));
-    auto* conv_in_node = subgraph.at(x);
-
-    IR_NODE_LINK_TO(conv_in_node, new_conv_op);          // Input
-    IR_NODE_LINK_TO(conv_filter, new_conv_op);           // Filter
-    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);  // Bias
-    IR_NODE_LINK_TO(new_conv_op, elementwise_add_out);   // Output
-
-    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op});
-  };
-
-  gpd(graph, handler);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_elementwise_add_fuse_pass,
-              paddle::framework::ir::ConvElementwiseAddFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
deleted file mode 100644
index 66a562cdd1948980a6792a53713cac947d72e7d6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class ConvElementwiseAddFusePass : public FusePassBase {
- public:
-  virtual ~ConvElementwiseAddFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass.cc b/paddle/fluid/framework/ir/cudnn_placement_pass.cc
deleted file mode 100644
index 420e8ee83adbc2935d84c009cfb88589d02bc29c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/cudnn_placement_pass.cc
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/cudnn_placement_pass.h"
-
-REGISTER_PASS(cudnn_placement_pass, paddle::framework::ir::CUDNNPlacementPass)
-    .RequirePassAttr("cudnn_enabled_op_types");
diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass.h b/paddle/fluid/framework/ir/cudnn_placement_pass.h
deleted file mode 100644
index d3f5858307f7141864fb238f70ee76f4f4e755c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/cudnn_placement_pass.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/placement_pass_base.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Specifies which operators should use cuDNN.
- */
-class CUDNNPlacementPass : public PlacementPassBase {
- private:
-  const std::string GetPlacementName() const { return "cuDNN"; }
-
-  const std::string GetAttrName() const { return "use_cudnn"; }
-
-  const std::unordered_set<std::string> GetOpTypesList() const {
-    return Get<std::unordered_set<std::string>>("cudnn_enabled_op_types");
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc
deleted file mode 100644
index b4a563615d55afc8ed200b55c77425d66f0adbac..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/cudnn_placement_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void RegisterOpKernel() {
-  static bool is_registered = false;
-  if (!is_registered) {
-    auto& all_kernels = OperatorWithKernel::AllOpKernels();
-
-    platform::CUDAPlace place = platform::CUDAPlace(0);
-    OpKernelType plain_kernel_type =
-        OpKernelType(proto::VarType::FP32, place, DataLayout::kAnyLayout,
-                     LibraryType::kPlain);
-    OpKernelType cudnn_kernel_type =
-        OpKernelType(proto::VarType::FP32, place, DataLayout::kAnyLayout,
-                     LibraryType::kCUDNN);
-
-    auto fake_kernel_func = [](const ExecutionContext&) -> void {
-      static int num_calls = 0;
-      num_calls++;
-    };
-
-    all_kernels["conv2d"][cudnn_kernel_type] = fake_kernel_func;
-    all_kernels["pool2d"][cudnn_kernel_type] = fake_kernel_func;
-    all_kernels["depthwise_conv2d"][plain_kernel_type] = fake_kernel_func;
-    all_kernels["relu"][plain_kernel_type] = fake_kernel_func;
-
-    is_registered = true;
-  }
-}
-
-void MainTest(std::initializer_list<std::string> cudnn_enabled_op_types,
-              unsigned expected_use_cudnn_true_count) {
-  // operator                                 use_cudnn
-  // --------------------------------------------------
-  // (a,b)->concat->c                         -
-  // (c,weights,bias)->conv2d->f              false
-  // f->relu->g                               -
-  // g->pool2d->h                             false
-  // (h,weights2,bias2)->depthwise_conv2d->k  false
-  // k->relu->l                               -
-  Layers layers;
-  VarDesc* a = layers.data("a");
-  VarDesc* b = layers.data("b");
-  VarDesc* c = layers.concat(std::vector<VarDesc*>({a, b}));
-  VarDesc* weights_0 = layers.data("weights_0");
-  VarDesc* bias_0 = layers.data("bias_0");
-  VarDesc* f = layers.conv2d(c, weights_0, bias_0, false);
-  VarDesc* g = layers.relu(f);
-  VarDesc* h = layers.pool2d(g, false);
-  VarDesc* weights_1 = layers.data("weights_1");
-  VarDesc* bias_1 = layers.data("bias_1");
-  VarDesc* k = layers.depthwise_conv2d(h, weights_1, bias_1, false);
-  layers.relu(k);
-
-  RegisterOpKernel();
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
-  auto pass = PassRegistry::Instance().Get("cudnn_placement_pass");
-  pass->Set("cudnn_enabled_op_types",
-            new std::unordered_set<std::string>(cudnn_enabled_op_types));
-
-  graph.reset(pass->Apply(graph.release()));
-
-  unsigned use_cudnn_true_count = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()) {
-      auto* op = node->Op();
-      if (op->HasAttr("use_cudnn") &&
-          boost::get<bool>(op->GetAttr("use_cudnn"))) {
-        ++use_cudnn_true_count;
-      }
-    }
-  }
-
-  EXPECT_EQ(use_cudnn_true_count, expected_use_cudnn_true_count);
-}
-
-TEST(CUDNNPlacementPass, enable_conv2d) {
-  // 1 conv2d
-  MainTest({"conv2d"}, 1);
-}
-
-TEST(CUDNNPlacementPass, enable_relu_pool) {
-  // 1 conv2d + 1 pool2d
-  MainTest({"conv2d", "pool2d"}, 2);
-}
-
-TEST(CUDNNPlacementPass, enable_all) {
-  // 1 conv2d + 1 pool2d
-  // depthwise_conv2d doesnot have CUDNN kernel.
-  MainTest({}, 2);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(cudnn_placement_pass);
diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
deleted file mode 100644
index 3d4df87ab7e3587ec4a8d0b8f3218a3272fd40e4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-
-#include "paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
-#define GET_NODES                         \
-  GET_IR_NODE(any_op_out);                \
-  GET_IR_NODE(quant_dequant_op_inscale);  \
-  GET_IR_NODE(quant_dequant_op);          \
-  GET_IR_NODE(quant_dequant_op_outscale); \
-  GET_IR_NODE(quant_dequant_op_out);      \
-  GET_IR_NODE(any_op2);
-
-void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const {
-  const std::string pattern_name = "delete_quantdequant_op_pattern";
-  FusePassBase::Init(pattern_name, graph);
-
-  GraphPatternDetector gpd;
-
-  patterns::DeleteQuantDequantOpPattern pattern(gpd.mutable_pattern(),
-                                                pattern_name);
-  pattern();
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_NODES;
-    IR_NODE_LINK_TO(any_op_out, any_op2);
-    std::string any_op_out_name = any_op_out->Var()->Name();
-    std::string quant_dequant_op_out_name = quant_dequant_op_out->Var()->Name();
-
-    auto* any_op2_desc = any_op2->Op();
-    // auto input_args_names = any_op2_desc->InputArgumentNames();
-    auto var_map = any_op2_desc->Inputs();
-
-    for (auto& name_m : var_map) {
-      if (std::find(name_m.second.begin(), name_m.second.end(),
-                    quant_dequant_op_out_name) != name_m.second.end()) {
-        std::vector<std::string> new_inputs;
-        for (auto& i_n : name_m.second) {
-          if (i_n != quant_dequant_op_out_name) {
-            new_inputs.push_back(i_n);
-          }
-        }
-        new_inputs.push_back(any_op_out_name);
-        any_op2_desc->SetInput(name_m.first, new_inputs);
-        any_op2_desc->Flush();
-      }
-    }
-    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph,
-                         {quant_dequant_op, quant_dequant_op_out,
-                          quant_dequant_op_inscale, quant_dequant_op_outscale});
-  };
-
-  gpd(graph, handler);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(delete_quant_dequant_op_pass,
-              paddle::framework::ir::DeleteQuantDequantOpPass);
diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h
deleted file mode 100644
index 938ada6453edf4e3ce324aa787e9d23905869d93..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class DeleteQuantDequantOpPass : public FusePassBase {
- public:
-  virtual ~DeleteQuantDequantOpPass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
deleted file mode 100644
index 21ceec7927e4a9f5f9e29aeffbf31e473cf0237e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h"
-#include <algorithm>
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-static int BuildFusion(Graph* graph, const std::string& name_scope,
-                       Scope* scope, bool with_fc_bias) {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-
-  // Build pattern
-  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
-                  ->assert_is_op_input("lookup_table")
-                  ->assert_var_not_persistable();
-  patterns::Embedding embedding_pattern(pattern, name_scope);
-  // TODO(jczaja): Intermediate can only be for val that are not used anywhere
-  //               but lookup table output may go into other LSTM (for reverse
-  //               direction)
-  auto* embedding_out = embedding_pattern(x);
-  patterns::FC fc_pattern(pattern, name_scope);
-
-  // fc_out is a tmp var, will be removed after fuse, so marked as intermediate.
-  auto* fc_out = fc_pattern(embedding_out, with_fc_bias, /* with_relu */ false)
-                     ->AsIntermediate();
-  patterns::LSTM lstm_pattern(pattern, name_scope);
-  lstm_pattern(fc_out);
-
-  // Create New OpDesc
-  auto embedding_lstm_creator = [&](Node* embedding, Node* W, Node* lstm,
-                                    Node* input, Node* weight_x, Node* weight_h,
-                                    Node* bias, Node* hidden, Node* cell,
-                                    Node* xx, Node* fc_bias) {
-    OpDesc op_desc;
-    op_desc.SetType("fused_embedding_fc_lstm");
-#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
-    SET_IN(Ids, input);
-    SET_IN(WeightH, weight_h);
-    // Neet to have this passed as We need Wc data for peephole connections
-    SET_IN(Bias, bias);
-#undef SET_IN
-
-    // Multiply embeddings with Weights
-    PADDLE_ENFORCE(scope);
-    const std::string& embeddings = patterns::UniqueKey("Embeddings");
-    auto* embeddings_var = scope->Var(embeddings);
-    PADDLE_ENFORCE(embeddings_var);
-    auto* embeddings_tensor =
-        embeddings_var->GetMutable<framework::LoDTensor>();
-    // Get WeightX size: [single_embedding, fc_size]
-    // and embedding size: [dict_size, single_embedding]
-    // and create new size of embeddings eg. [dict_size , hidden_size]
-    auto* embedding_var = scope->FindVar(W->Name());
-    PADDLE_ENFORCE(embedding_var);
-    const auto& embedding_tensor = embedding_var->Get<framework::LoDTensor>();
-
-    const auto& weightx_tensor =
-        scope->FindVar(weight_x->Name())->Get<framework::LoDTensor>();
-    embeddings_tensor->Resize(
-        {embedding_tensor.dims()[0], weightx_tensor.dims()[1]});
-
-    // Multiplie embeddings via WeightsX and add bias
-    auto embedding_data = embedding_tensor.data<float>();
-    auto weightx_data = weightx_tensor.data<float>();
-    auto embeddings_data =
-        embeddings_tensor->mutable_data<float>(platform::CPUPlace());
-
-    // Adding biases to GEMM result to be
-    auto* lstm_bias_var = scope->FindVar(bias->Name());
-    PADDLE_ENFORCE(lstm_bias_var);
-    const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
-
-    auto alpha = 1.0f;
-    auto beta = 1.0f;
-    int m = embedding_tensor.dims()[0];
-    int n = weightx_tensor.dims()[1];
-    int k = embedding_tensor.dims()[1];
-
-    // Copy only gate biases values (only actual bias data, not peephole
-    // weights)
-    std::vector<float> combined_biases;
-    combined_biases.reserve(n);
-    std::copy_n(lstm_bias_tensor.data<float>(), n,
-                std::back_inserter(combined_biases));
-
-    if (with_fc_bias) {
-      // Add FC-bias with LSTM-bias (into GEMM result to be)
-      auto* fc_bias_var = scope->FindVar(fc_bias->Name());
-      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
-      for (int i = 0; i < fc_bias_tensor.numel(); i++) {
-        combined_biases[i] += fc_bias_tensor.data<float>()[i];
-      }
-    }
-
-    // broadcast biases
-    std::vector<float> ones(m, 1.0f);
-    paddle::operators::math::CBlas<float>::GEMM(
-        CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1,
-        &combined_biases[0], n, 0.0f, embeddings_data, n);
-
-    // Wx*embeddings + biases
-    paddle::operators::math::CBlas<float>::GEMM(
-        CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
-        embedding_data, k, weightx_data, n, beta, embeddings_data, n);
-    op_desc.SetInput("Embeddings", {embeddings});
-
-    // Create temp variables.
-    const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
-    const std::string BatchedCellPreAct =
-        patterns::UniqueKey("BatchedCellPreAct");
-    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
-
-    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
-    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
-    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
-
-    op_desc.SetInput("H0", {});
-    op_desc.SetInput("C0", {});
-    op_desc.SetOutput("Hidden", {hidden->Name()});
-    op_desc.SetOutput("Cell", {cell->Name()});
-    op_desc.SetOutput("XX", {xx->Name()});
-    op_desc.SetOutput("BatchedGate", {BatchedGate});
-    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
-    op_desc.SetOutput("BatchedInput", {BatchedInput});
-    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
-    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
-    // TODO(TJ): get from attr
-    op_desc.SetAttr("use_seq", true);
-
-    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    auto& scope = graph->Get<Scope>(kParamScopeAttr);
-#define OP_SET_OUT(x)                            \
-  const std::string x = patterns::UniqueKey(#x); \
-  op_desc.SetOutput(#x, {x});                    \
-  scope.Var(x)->GetMutable<LoDTensor>()
-    OP_SET_OUT(BatchedCell);
-    OP_SET_OUT(BatchedHidden);
-    OP_SET_OUT(ReorderedH0);
-    OP_SET_OUT(ReorderedC0);
-#undef OP_SET_OUT
-
-    auto* op = graph->CreateOpNode(&op_desc);
-    IR_NODE_LINK_TO(input, op);
-    IR_NODE_LINK_TO(weight_x, op);
-    IR_NODE_LINK_TO(weight_h, op);
-    IR_NODE_LINK_TO(bias, op);
-    IR_NODE_LINK_TO(op, hidden);
-    return op;
-  };
-
-  int fusion_count{0};
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, embedding_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(W, W, embedding_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
-
-    // TODO(jczaja): Add support for is_sparse / is_distributed
-    auto is_sparse = boost::get<bool>(lookup_table->Op()->GetAttr("is_sparse"));
-    auto is_distributed =
-        boost::get<bool>(lookup_table->Op()->GetAttr("is_distributed"));
-
-    if (is_sparse == true || is_distributed == true) {
-      return;
-    }
-
-    if (with_fc_bias) {
-      GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
-      GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
-      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
-      embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
-                             Bias, Hidden, Cell, fc_out, fc_bias);
-      // Remove unneeded nodes.
-      // TODO(jczaja): Proper removing of lookup table
-      std::unordered_set<const Node*> marked_nodes(
-          // {lookup_table, mul, lstm, elementwise_add, fc_bias, W});
-          {mul, lstm, elementwise_add, fc_bias});
-      GraphSafeRemoveNodes(graph, marked_nodes);
-    } else {
-      GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
-      embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
-                             Bias, Hidden, Cell, fc_out, nullptr);
-      // Remove unneeded nodes.
-      // TODO(jczaja): Proper removing of lookup table
-      // std::unordered_set<const Node*> marked_nodes({lookup_table, W, mul,
-      // lstm});
-      std::unordered_set<const Node*> marked_nodes({mul, lstm});
-      GraphSafeRemoveNodes(graph, marked_nodes);
-    }
-
-    ++fusion_count;
-  };
-
-  gpd(graph, handler);
-
-  return fusion_count;
-}
-
-void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init(name_scope_, graph);
-
-  int fusion_count =
-      BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);
-
-  AddStatis(fusion_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(embedding_fc_lstm_fuse_pass,
-              paddle::framework::ir::EmbeddingFCLSTMFusePass);
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
deleted file mode 100644
index 65cb4439727b466506af35df1bed609b18c06ee0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// Fusing of Embedding , FC and LSTM op
-
-// Just FC without bias
-class EmbeddingFCLSTMFusePass : public FusePassBase {
- public:
-  virtual ~EmbeddingFCLSTMFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  const std::string name_scope_{"embedding_fc_lstm_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc
deleted file mode 100644
index e2c7606c30836f735844b8c6ef81c265ee295606..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc
+++ /dev/null
@@ -1,259 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h"
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
-
-struct FCElementwiseLayerNorm : public PatternBase {
-  FCElementwiseLayerNorm(PDPattern *pattern, const std::string &name_scope)
-      : PatternBase(pattern, name_scope, "fc_elementwise_layernorm") {}
-
-  PDNode *operator()(PDNode *x);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(fused_fc_elementwise_layernorm);
-  PATTERN_DECL_NODE(fc);
-  PATTERN_DECL_NODE(elementwise);
-  PATTERN_DECL_NODE(layer_norm);
-  // declare variable node's name
-  PATTERN_DECL_NODE(fc_w);
-  PATTERN_DECL_NODE(fc_bias);
-  PATTERN_DECL_NODE(fc_out);  // (x,fc_w,fc_bias) -> fc_out
-  PATTERN_DECL_NODE(elementwise_input);
-  PATTERN_DECL_NODE(
-      elementwise_out);  // (fc_out,elementwise_input) -> elementwise_out
-  PATTERN_DECL_NODE(layer_norm_bias);
-  PATTERN_DECL_NODE(layer_norm_scale);
-  PATTERN_DECL_NODE(layer_norm_out);
-  PATTERN_DECL_NODE(layer_norm_mean);
-  PATTERN_DECL_NODE(layer_norm_variance);
-};
-
-PDNode *FCElementwiseLayerNorm::operator()(PDNode *x) {
-  // Create nodes for fc op.
-  x->assert_is_op_input("fc", "Input");
-  auto *fc = pattern->NewNode(fc_repr())->assert_is_op("fc");
-  auto *fc_w_var = pattern->NewNode(fc_w_repr())
-                       ->AsInput()
-                       ->assert_is_persistable_var()
-                       ->assert_is_op_input("fc", "W");
-  auto *fc_bias_var = pattern->NewNode(fc_bias_repr())
-                          ->AsInput()
-                          ->assert_is_persistable_var()
-                          ->assert_is_op_input("fc", "Bias");
-  auto *fc_out_var = pattern->NewNode(fc_out_repr())->assert_is_op_output("fc");
-
-  // Add links for fc op.
-  fc->LinksFrom({x, fc_w_var, fc_bias_var}).LinksTo({fc_out_var});
-
-  // Create nodes for elementwise_add op.
-  fc_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
-  auto *elementwise =
-      pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add");
-  auto *elementwise_input_var = pattern->NewNode(elementwise_input_repr())
-                                    ->assert_is_op_input("elementwise_add");
-
-  auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr())
-                                  ->AsOutput()
-                                  ->assert_is_op_output("elementwise_add");
-
-  // Add links for elementwise_add op.
-  elementwise->LinksFrom({fc_out_var, elementwise_input_var})
-      .LinksTo({elementwise_out_var});
-
-  // Create nodes for layer_norm op.
-  elementwise_out_var->AsIntermediate()->assert_is_op_input("layer_norm");
-  auto *layer_norm =
-      pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm");
-  auto *layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr())
-                                  ->AsInput()
-                                  ->assert_is_persistable_var()
-                                  ->assert_is_op_input("layer_norm", "Bias");
-  auto *layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr())
-                                   ->AsInput()
-                                   ->assert_is_persistable_var()
-                                   ->assert_is_op_input("layer_norm", "Scale");
-
-  auto *layer_norm_out_var = pattern->NewNode(layer_norm_out_repr())
-                                 ->AsOutput()
-                                 ->assert_is_op_output("layer_norm", "Y");
-  auto *layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr())
-                                  ->AsOutput()
-                                  ->assert_is_op_output("layer_norm", "Mean");
-  auto *layer_norm_variance_var =
-      pattern->NewNode(layer_norm_variance_repr())
-          ->AsOutput()
-          ->assert_is_op_output("layer_norm", "Variance");
-
-  // Add links for layer_norm op.
-  layer_norm
-      ->LinksFrom(
-          {elementwise_out_var, layer_norm_bias_var, layer_norm_scale_var})
-      .LinksTo(
-          {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var});
-  return layer_norm_out_var;
-}
-
-}  // namespace patterns
-
-template <typename T>
-static bool IsEqual(const std::vector<T> &x, const std::vector<T> &y) {
-  if (!(x.size() > 0U && y.size() > 0U) || x.size() != y.size()) {
-    return false;
-  }
-  for (size_t i = 0; i < x.size(); ++i) {
-    if (x[i] != y[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
-void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
-  PADDLE_ENFORCE_NOT_NULL(graph);
-  FusePassBase::Init("fc_elementwise_layernorm_fuse", graph);
-  int found_subgraph_count = 0;
-
-  GraphPatternDetector gpd;
-  auto *x = gpd.mutable_pattern()
-                ->NewNode("fc_elementwise_layernorm_fuse/x")
-                ->AsInput()
-                ->assert_is_op_input("fc", "Input");
-  patterns::FCElementwiseLayerNorm fused_pattern(
-      gpd.mutable_pattern(), "fc_elementwise_layernorm_fuse");
-  fused_pattern(x);
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
-                     Graph *graph) {
-    if (subgraph.count(x) <= 0) {
-      LOG(WARNING) << "The subgraph is empty.";
-      return;
-    }
-
-    VLOG(4) << "handle FCElementwiseLayerNorm fuse";
-    GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(fc_w, fc_w, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(fc_bias, fc_bias, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_input, elementwise_input,
-                              fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale,
-                              fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, fused_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance,
-                              fused_pattern);
-
-    if (!IsEqual(fc_out->Var()->GetShape(),
-                 elementwise_input->Var()->GetShape())) {
-      return;
-    }
-
-    int begin_norm_axis =
-        boost::get<int>(layer_norm->Op()->GetAttr("begin_norm_axis"));
-    auto layer_norm_x_dims = fc_out->Var()->GetShape();
-    auto layer_norm_x_mat_dims = framework::flatten_to_2d(
-        framework::make_ddim(layer_norm_x_dims), begin_norm_axis);
-    if (fc_w->Var()->GetShape()[1] != layer_norm_x_mat_dims[1]) {
-      return;
-    }
-
-    if (fc_out->outputs.size() > 1U || elementwise_out->outputs.size() > 1U) {
-      // When fc_out or elementwise_out are used as input of other operators, we
-      // cannon fuse.
-      return;
-    }
-
-    std::unordered_set<const Node *> del_node_set;
-
-    // Create an FusedFCElementwiseLayerNorm op node
-    OpDesc new_desc;
-    new_desc.SetType("fused_fc_elementwise_layernorm");
-
-    // inputs
-    new_desc.SetInput("X", {subgraph.at(x)->Name()});
-    new_desc.SetInput("W", {fc_w->Name()});
-    new_desc.SetInput("Bias0", {fc_bias->Name()});
-    new_desc.SetInput("Y", {elementwise_input->Name()});
-    new_desc.SetInput("Scale", {layer_norm_scale->Name()});
-    new_desc.SetInput("Bias1", {layer_norm_bias->Name()});
-
-    // outputs
-    new_desc.SetOutput("Out", {layer_norm_out->Name()});
-    if (layer_norm_mean->outputs.size() > 0U) {
-      new_desc.SetOutput("Mean", {layer_norm_mean->Name()});
-    } else {
-      del_node_set.insert(layer_norm_mean);
-    }
-    if (layer_norm_variance->outputs.size() > 0U) {
-      new_desc.SetOutput("Variance", {layer_norm_variance->Name()});
-    } else {
-      del_node_set.insert(layer_norm_variance);
-    }
-
-    // attrs
-    new_desc.SetAttr("x_num_col_dims", fc->Op()->GetAttr("in_num_col_dims"));
-    new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon"));
-    new_desc.SetAttr("begin_norm_axis",
-                     layer_norm->Op()->GetAttr("begin_norm_axis"));
-    new_desc.SetAttr("activation_type", fc->Op()->GetAttr("activation_type"));
-
-    auto fused_node = graph->CreateOpNode(&new_desc);  // OpDesc will be copied.
-
-    del_node_set.insert(fc);
-    del_node_set.insert(elementwise);
-    del_node_set.insert(layer_norm);
-    del_node_set.insert(fc_out);
-    del_node_set.insert(elementwise_out);
-    GraphSafeRemoveNodes(graph, del_node_set);
-
-    IR_NODE_LINK_TO(subgraph.at(x), fused_node);
-    IR_NODE_LINK_TO(fc_w, fused_node);
-    IR_NODE_LINK_TO(fc_bias, fused_node);
-    IR_NODE_LINK_TO(elementwise_input, fused_node);
-    IR_NODE_LINK_TO(layer_norm_scale, fused_node);
-    IR_NODE_LINK_TO(layer_norm_bias, fused_node);
-    IR_NODE_LINK_TO(fused_node, layer_norm_out);
-    if (layer_norm_mean->outputs.size() > 0U) {
-      IR_NODE_LINK_TO(fused_node, layer_norm_mean);
-    }
-    if (layer_norm_variance->outputs.size() > 0U) {
-      IR_NODE_LINK_TO(fused_node, layer_norm_variance);
-    }
-
-    found_subgraph_count++;
-  };
-
-  gpd(graph, handler);
-  AddStatis(found_subgraph_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fc_elementwise_layernorm_fuse_pass,
-              paddle::framework::ir::FCElementwiseLayerNormFusePass);
diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h
deleted file mode 100644
index ac4d0b39ee267c724636954263aa2dce9d9ec47f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class FCElementwiseLayerNormFusePass : public FusePassBase {
- public:
-  virtual ~FCElementwiseLayerNormFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
deleted file mode 100644
index c1f822d7ca5cdc0a1bba1dbb5c646c61be244810..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-TEST(FCElementwiseLayerNormFusePass, basic) {
-  // inputs                           operator            output
-  // --------------------------------------------------------------------
-  // (x, weights_0, bias_0)           fc               -> fc_out_0
-  // (fc_out_0, weights_1, bias_1)    fc               -> fc_out_1
-  // (fc_out_1, y)                    elementwise_add  -> elementwise_out
-  // (elementwise_out, scale, bias_2) layer_norm       ->
-  Layers layers;
-  auto* x = layers.data("x", {128, 768});
-  auto* weights_0 = layers.data("weights_0", {768, 3072}, true);
-  auto* bias_0 = layers.data("bias_0", {3072}, true);
-  auto* fc_out_0 = layers.fc(x, weights_0, bias_0);  // {128, 3072}
-  auto* weights_1 = layers.data("weights_1", {3072, 768}, true);
-  auto* bias_1 = layers.data("bias_1", {768}, true);
-  auto* fc_out_1 =
-      layers.fc(fc_out_0, weights_1, bias_1, 1, "relu");  // {128, 768}
-  fc_out_1->SetShape({128, 768});
-  auto* y = layers.data("y", {128, 768});
-  auto* elementwise_out = layers.elementwise_add(fc_out_1, y);
-  auto* scale = layers.data("scale", {768}, true);
-  auto* bias_2 = layers.data("bias_2", {768}, true);
-  layers.layer_norm(elementwise_out, scale, bias_2);
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
-  auto pass =
-      PassRegistry::Instance().Get("fc_elementwise_layernorm_fuse_pass");
-  int num_nodes_before = graph->Nodes().size();
-  VLOG(3) << DebugString(graph);
-
-  graph.reset(pass->Apply(graph.release()));
-  int num_nodes_after = graph->Nodes().size();
-  int num_fused_nodes_after =
-      GetNumOpNodes(graph, "fused_fc_elementwise_layernorm");
-  VLOG(3) << DebugString(graph);
-
-  PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6);
-  PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(fc_elementwise_layernorm_fuse_pass);
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
deleted file mode 100644
index b53e6a250ced5ae17b52591811b52c37593a1145..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void FCFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE_NOT_NULL(graph);
-  FusePassBase::Init("fc_fuse", graph);
-
-  int found_fc_count = 0;
-  for (bool with_relu : {true, false}) {
-    found_fc_count += ApplyFCPattern(graph, with_relu);
-  }
-
-  AddStatis(found_fc_count);
-}
-
-int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
-  GraphPatternDetector gpd;
-  auto* x = gpd.mutable_pattern()
-                ->NewNode("fc_fuse/x")
-                ->AsInput()
-                ->assert_is_op_input("mul", "X");
-  patterns::FC fc_pattern(gpd.mutable_pattern(), "fc_fuse");
-  fc_pattern(x, true /*with bias*/, with_relu);
-
-  int found_fc_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    if (subgraph.count(x) <= 0) {
-      LOG(WARNING) << "The subgraph is empty.";
-      return;
-    }
-
-    VLOG(4) << "handle FC fuse";
-    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(bias, bias, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                              fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
-    Node* relu = nullptr;
-    Node* relu_out = nullptr;
-    if (with_relu) {
-      GET_IR_NODE_FROM_SUBGRAPH(tmp_relu, relu, fc_pattern);
-      GET_IR_NODE_FROM_SUBGRAPH(tmp_relu_out, relu_out, fc_pattern);
-      relu = tmp_relu;
-      relu_out = tmp_relu_out;
-    }
-
-    // Create an FC Node.
-    OpDesc desc;
-    desc.SetType("fc");
-
-    // Set inputs of fc
-    desc.SetInput("Input", {subgraph.at(x)->Name()});
-    desc.SetInput("W", {w->Name()});
-    desc.SetInput("Bias", {bias->Name()});
-
-    // Set output of fc
-    std::string fc_out_name =
-        with_relu ? relu_out->Name() : elementwise_add_out->Name();
-    desc.SetOutput("Out", std::vector<std::string>({fc_out_name}));
-
-    // Set attrs of fc
-    desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
-    std::string activation_type = with_relu ? "relu" : "";
-    desc.SetAttr("activation_type", activation_type);
-
-    // For anakin subgraph int8
-    // When in anakin subgraph int8 mode, the pattern like "fake_quant + mul +
-    // fake_dequant" can be detected by the quant_dequant_fuse_pass. This pass
-    // will add "input_scale", "weight_scale" which are extracted from
-    // fake_quant op and fake_dequant op to mul op, and then delete the
-    // fake_quant op and fake_dequant op in the graph. If the mul op has the
-    // scale info, we should add those to the fused fc.
-    auto* mul_op_desc = mul->Op();
-    if (mul_op_desc->HasAttr("enable_int8")) {
-      desc.SetAttr("enable_int8", mul_op_desc->GetAttr("enable_int8"));
-      desc.SetAttr("input_scale", mul_op_desc->GetAttr("input_scale"));
-      desc.SetAttr("weight_scale", mul_op_desc->GetAttr("weight_scale"));
-      if (mul_op_desc->HasAttr("out_scale"))
-        desc.SetAttr("out_scale", mul_op_desc->GetAttr("out_scale"));
-      auto elementwise_desc = elementwise_add->Op();
-      if (elementwise_desc->HasAttr("out_scale"))
-        desc.SetAttr("out_scale", elementwise_desc->GetAttr("out_scale"));
-    }
-
-    auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
-    if (with_relu) {
-      GraphSafeRemoveNodes(
-          graph, {mul, elementwise_add, mul_out, elementwise_add_out, relu});
-    } else {
-      GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out});
-    }
-
-    IR_NODE_LINK_TO(subgraph.at(x), fc_node);
-    IR_NODE_LINK_TO(w, fc_node);
-    IR_NODE_LINK_TO(bias, fc_node);
-    if (with_relu) {
-      IR_NODE_LINK_TO(fc_node, relu_out);
-    } else {
-      IR_NODE_LINK_TO(fc_node, elementwise_add_out);
-    }
-
-    found_fc_count++;
-  };
-  gpd(graph, handler);
-  return found_fc_count;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass);
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h
deleted file mode 100644
index ef6636d109a69d32aa50044b9686254e5d7769a5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_fuse_pass.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Fuse the MUL and ELEMENTWISE_ADD to a FCOp.
- */
-class FCFusePass : public FusePassBase {
- public:
-  virtual ~FCFusePass() {}
-
- protected:
-  void ApplyImpl(Graph* graph) const override;
-
-  int ApplyFCPattern(Graph* graph, bool with_relu) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
deleted file mode 100644
index 320d28f131f03ed118614b5f97baa4397db0fcaa..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-TEST(FCFusePass, basic) {
-  // inputs                     operator            output
-  // --------------------------------------------------------
-  // (a, filters_0 bias_0)      conv2d           -> conv2d_out
-  // conv2d_out                 relu             -> relu_out_0
-  // (relu_out_0, weights_0)    mul              -> mul_out_0
-  // (mul_out_0, bias_1)        elementwise_add  -> add_out_0
-  // add_out_0                  relu             -> relu_out_1
-  // (relu_out_1, weights_1)    mul              -> mul_out_1
-  // (mul_out_1, bias_2)        elementwise_add  -> add_out_1
-  Layers layers;
-  auto* a = layers.data("a");
-  auto* filters_0 = layers.data("conv2d_filters_0", {}, true);
-  auto* bias_0 = layers.data("conv2d_bias_0", {}, true);
-  auto* conv2d_out = layers.conv2d(a, filters_0, bias_0, false);
-  auto* relu_out_0 = layers.relu(conv2d_out);
-  auto* weights_0 = layers.data("weights_0", {}, true);
-  auto* mul_out_0 = layers.mul(relu_out_0, weights_0);
-  auto* bias_1 = layers.data("bias_1", {}, true);
-  auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1);
-  auto* relu_out_1 = layers.relu(add_out_0);
-  auto* weights_1 = layers.data("weights_1", {}, true);
-  auto* mul_out_1 = layers.mul(relu_out_1, weights_1);
-  auto* bias_2 = layers.data("bias_2", {}, true);
-  auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2);
-  VLOG(4) << add_out_1;
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
-  auto pass = PassRegistry::Instance().Get("fc_fuse_pass");
-  int num_nodes_before = graph->Nodes().size();
-  int num_mul_nodes_before = GetNumOpNodes(graph, "mul");
-  VLOG(3) << DebugString(graph);
-
-  graph.reset(pass->Apply(graph.release()));
-  int num_nodes_after = graph->Nodes().size();
-  int num_fc_nodes_after = GetNumOpNodes(graph, "fc");
-  VLOG(3) << DebugString(graph);
-
-  PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6);
-  PADDLE_ENFORCE_EQ(num_fc_nodes_after, 2);
-  PADDLE_ENFORCE_EQ(num_mul_nodes_before, num_fc_nodes_after);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(fc_fuse_pass);
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
deleted file mode 100644
index 287c6dc407297415f473ee505937331982a6d54a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+++ /dev/null
@@ -1,180 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-static int BuildFusion(Graph* graph, const std::string& name_scope,
-                       Scope* scope, bool with_fc_bias) {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-
-  // Create pattern.
-  patterns::FC fc_pattern(pattern, name_scope);
-  patterns::GRU gru_pattern(pattern, name_scope);
-
-  PDNode* x =
-      pattern->NewNode(patterns::UniqueKey("x"))->assert_var_not_persistable();
-
-  auto* fc_out = fc_pattern(x, with_fc_bias, /* with_relu */ false);
-  fc_out->AsIntermediate();  // fc_out is a tmp var, will be removed after fuse.
-  gru_pattern(fc_out);
-
-  // Create New OpDesc
-  auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h,
-                         Node* bias, Node* hidden, Node* fc_bias) {
-    OpDesc op_desc;
-    op_desc.SetType("fusion_gru");
-
-#define NEW_NAME(x) name_scope + "/at." #x ".new"
-#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
-    SET_IN(X, x);
-    SET_IN(WeightX, weight_x);
-    SET_IN(WeightH, weight_h);
-    if (with_fc_bias) {
-      op_desc.SetInput("Bias", {NEW_NAME(bias) + bias->Name()});
-    } else {
-      SET_IN(Bias, bias);
-    }
-#undef SET_IN
-    op_desc.SetInput("H0", {});
-    op_desc.SetOutput("Hidden", {hidden->Name()});
-    op_desc.SetAttr("is_reverse", gru->Op()->GetAttr("is_reverse"));
-    // TODO(TJ): This should be a option for infer
-    op_desc.SetAttr("use_seq", true);
-
-#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)})
-    SET_IMTERMEDIATE_OUT(ReorderedH0);
-    SET_IMTERMEDIATE_OUT(XX);
-    SET_IMTERMEDIATE_OUT(BatchedInput);
-    SET_IMTERMEDIATE_OUT(BatchedOut);
-#undef SET_IMTERMEDIATE_OUT
-
-    auto* op = graph->CreateOpNode(&op_desc);
-    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    auto& scope = graph->Get<Scope>(kParamScopeAttr);
-    if (with_fc_bias) {
-      // Fusion GRU bias = fcbias + grubias
-      auto* fusion_bias_var = scope.Var(NEW_NAME(bias) + bias->Name());
-      auto* out_bias_tensor =
-          fusion_bias_var->GetMutable<framework::LoDTensor>();
-      PADDLE_ENFORCE(fusion_bias_var);
-      auto* gru_bias_var = scope.FindVar(bias->Name());
-      auto* fc_bias_var = scope.FindVar(fc_bias->Name());
-      PADDLE_ENFORCE(gru_bias_var);
-      PADDLE_ENFORCE(fc_bias_var);
-      const auto& gru_bias_tenosr = gru_bias_var->Get<framework::LoDTensor>();
-      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
-      // new bias = fc bias + gru bias
-      out_bias_tensor->Resize(gru_bias_tenosr.dims());
-      auto* data = out_bias_tensor->mutable_data<float>(platform::CPUPlace());
-      for (int i = 0; i < out_bias_tensor->numel(); i++) {
-        data[i] =
-            fc_bias_tensor.data<float>()[i] + gru_bias_tenosr.data<float>()[i];
-      }
-    }
-#undef GET_NODE
-
-#define NEW_IMTERMEDIATE_OUT(key) \
-  scope.Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
-    NEW_IMTERMEDIATE_OUT(ReorderedH0);
-    NEW_IMTERMEDIATE_OUT(XX);
-    NEW_IMTERMEDIATE_OUT(BatchedInput);
-    NEW_IMTERMEDIATE_OUT(BatchedOut);
-#undef NEW_NAME
-#undef NEW_IMTERMEDIATE_OUT
-
-    IR_NODE_LINK_TO(x, op);
-    IR_NODE_LINK_TO(weight_x, op);
-    IR_NODE_LINK_TO(weight_h, op);
-    IR_NODE_LINK_TO(bias, op);  // actually should link to new bias if have
-    IR_NODE_LINK_TO(op, hidden);
-    // h0?
-    return op;
-  };
-
-  int fusion_count{0};
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    auto* x_n = subgraph.at(x);
-    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, gru_pattern);
-    // nodes need be removed
-    GET_IR_NODE_FROM_SUBGRAPH(BatchGate, BatchGate, gru_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(BatchResetHiddenPrev, BatchGate, gru_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchGate, gru_pattern);
-
-    if (with_fc_bias) {
-      GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
-      GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
-      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
-
-      gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
-      // Remove unneeded nodes.
-      std::unordered_set<const Node*> marked_nodes(
-          {mul, gru, elementwise_add, fc_bias, fc_out, mul_out, BatchGate,
-           BatchResetHiddenPrev, BatchHidden});
-      GraphSafeRemoveNodes(graph, marked_nodes);
-    } else {
-      gru_creater(gru, x_n, w, Weight, Bias, Hidden, nullptr);
-      // Remove unneeded nodes.
-      std::unordered_set<const Node*> marked_nodes(
-          {mul, gru, BatchGate, BatchResetHiddenPrev, BatchHidden});
-      GraphSafeRemoveNodes(graph, marked_nodes);
-    }
-#undef GET_NODE
-
-    ++fusion_count;
-  };
-
-  gpd(graph, handler);
-
-  return fusion_count;
-}
-
-void MulGRUFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init(name_scope_, graph);
-
-  int fusion_count =
-      BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/);
-
-  AddStatis(fusion_count);
-}
-
-void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init(name_scope_, graph);
-
-  int fusion_count =
-      BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);
-
-  AddStatis(fusion_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
-REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
deleted file mode 100644
index e11cdac7ea95219444c35bb8deef630fe29d3734..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op.
-
-class FCGRUFusePass : public FusePassBase {
- public:
-  virtual ~FCGRUFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  const std::string name_scope_{"fc_gru_fuse"};
-};
-
-// Just FC without bias
-class MulGRUFusePass : public FusePassBase {
- public:
-  virtual ~MulGRUFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-  const std::string name_scope_{"fc_nobias_gru_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
deleted file mode 100644
index a5a72e875e49a732ae27f2f4e949ef893011a2a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ /dev/null
@@ -1,185 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
-                bool with_fc_bias) {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-
-  // Build pattern
-  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
-                  ->assert_is_op_input("mul")
-                  ->assert_var_not_persistable();
-  patterns::FC fc_pattern(pattern, name_scope);
-
-  // fc_out is a tmp var, will be removed after fuse, so marked as intermediate.
-  auto* fc_out =
-      fc_pattern(x, with_fc_bias, /* with_relu */ false)->AsIntermediate();
-  patterns::LSTM lstm_pattern(pattern, name_scope);
-  lstm_pattern(fc_out);
-
-  // Create New OpDesc
-  auto lstm_creator = [&](Node* lstm, Node* input, Node* weight_x,
-                          Node* weight_h, Node* bias, Node* hidden, Node* cell,
-                          Node* xx, Node* fc_bias) {
-    OpDesc op_desc;
-    op_desc.SetType("fusion_lstm");
-#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
-    SET_IN(X, input);
-    SET_IN(WeightX, weight_x);
-    SET_IN(WeightH, weight_h);
-    SET_IN(Bias, bias);
-#undef SET_IN
-    if (with_fc_bias) {
-      // Add FC-bias with LSTM-bias and create a new weight
-      PADDLE_ENFORCE(scope);
-      const std::string& new_bias_var = patterns::UniqueKey("NewBias");
-      auto* bias_var = scope->Var(new_bias_var);
-      PADDLE_ENFORCE(bias_var);
-      auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
-      auto* lstm_bias_var = scope->FindVar(bias->Name());
-      PADDLE_ENFORCE(lstm_bias_var);
-      const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
-      bias_tensor->Resize(lstm_bias_tensor.dims());
-
-      auto* fc_bias_var = scope->FindVar(fc_bias->Name());
-      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
-
-      auto* data = bias_tensor->mutable_data<float>(platform::CPUPlace());
-
-      for (int i = 0; i < bias_tensor->numel(); i++) {
-        data[i] =
-            fc_bias_tensor.data<float>()[i] + lstm_bias_tensor.data<float>()[i];
-      }
-      op_desc.SetInput("Bias", {new_bias_var});
-    }
-
-    // Create temp variables.
-    const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
-    const std::string BatchedCellPreAct =
-        patterns::UniqueKey("BatchedCellPreAct");
-    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
-    const std::string CheckedCell = patterns::UniqueKey("CheckedCell");
-
-    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
-    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
-    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
-    scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();
-
-    op_desc.SetInput("H0", {});
-    op_desc.SetInput("C0", {});
-    op_desc.SetOutput("Hidden", {hidden->Name()});
-    op_desc.SetOutput("Cell", {cell->Name()});
-    op_desc.SetOutput("XX", {xx->Name()});
-    op_desc.SetOutput("BatchedGate", {BatchedGate});
-    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
-    op_desc.SetOutput("BatchedInput", {BatchedInput});
-    op_desc.SetOutput("CheckedCell", {CheckedCell});
-    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
-    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
-    // TODO(TJ): get from attr
-    op_desc.SetAttr("use_seq", true);
-
-    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    auto& scope = graph->Get<Scope>(kParamScopeAttr);
-#define OP_SET_OUT(x)                            \
-  const std::string x = patterns::UniqueKey(#x); \
-  op_desc.SetOutput(#x, {x});                    \
-  scope.Var(x)->GetMutable<LoDTensor>()
-    OP_SET_OUT(BatchedCell);
-    OP_SET_OUT(BatchedHidden);
-    OP_SET_OUT(ReorderedH0);
-    OP_SET_OUT(ReorderedC0);
-#undef OP_SET_OUT
-
-    auto* op = graph->CreateOpNode(&op_desc);
-    IR_NODE_LINK_TO(input, op);
-    IR_NODE_LINK_TO(weight_x, op);
-    IR_NODE_LINK_TO(weight_h, op);
-    IR_NODE_LINK_TO(bias, op);
-    IR_NODE_LINK_TO(op, hidden);
-    return op;
-  };
-
-  int fusion_count{0};
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
-    if (with_fc_bias) {
-      GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
-      GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
-      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
-      lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
-                   fc_bias);
-      // Remove unneeded nodes.
-      std::unordered_set<const Node*> marked_nodes(
-          {mul, lstm, elementwise_add, fc_bias});
-      GraphSafeRemoveNodes(graph, marked_nodes);
-    } else {
-      GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
-      lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
-                   nullptr);
-      // Remove unneeded nodes.
-      std::unordered_set<const Node*> marked_nodes({mul, lstm});
-      GraphSafeRemoveNodes(graph, marked_nodes);
-    }
-
-    ++fusion_count;
-  };
-
-  gpd(graph, handler);
-
-  return fusion_count;
-}
-
-void MulLstmFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init(name_scope_, graph);
-
-  int fusion_count =
-      BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/);
-
-  AddStatis(fusion_count);
-}
-
-void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init(name_scope_, graph);
-
-  int fusion_count =
-      BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);
-
-  AddStatis(fusion_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
-REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
deleted file mode 100644
index 5dea7c91a860f0b9622610f12f195eafb9849555..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// The MulLstmFusePass and MulLstmFusePass will fuse to the same FusionLstm op.
-
-// Just FC without bias
-class FCLstmFusePass : public FusePassBase {
- public:
-  virtual ~FCLstmFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  const std::string name_scope_{"fc_lstm_fuse"};
-};
-
-class MulLstmFusePass : public FusePassBase {
- public:
-  virtual ~MulLstmFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-  const std::string name_scope_{"fc_nobias_lstm_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc
deleted file mode 100644
index 915a2f62bafa2baf98b7407cd87d3e69f20b44d2..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
-#define GET_NODES                 \
-  GET_IR_NODE(fill_constant);     \
-  GET_IR_NODE(fill_constant_out); \
-  GET_IR_NODE(elementwise_mul);   \
-  GET_IR_NODE(elementwise_mul_out);
-
-void FillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
-  const std::string pattern_name = "fillconstant_elementwisemul_fuse";
-  FusePassBase::Init(pattern_name, graph);
-
-  GraphPatternDetector gpd;
-  auto* x = gpd.mutable_pattern()
-                ->NewNode("x")
-                ->assert_is_op_input("elementwise_mul", "X")
-                ->AsInput();
-
-  patterns::FillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
-                                                   pattern_name);
-  pattern(x);
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_NODES;
-
-    PADDLE_ENFORCE(subgraph.count(x));
-    auto* elementwise_in = subgraph.at(x);
-    float constant_value =
-        boost::get<float>(fill_constant->Op()->GetAttr("value"));
-
-    framework::OpDesc new_op_desc;
-    new_op_desc.SetType("scale");
-    new_op_desc.SetInput("X", {elementwise_in->Name()});
-    new_op_desc.SetAttr("scale", constant_value);
-    new_op_desc.SetAttr("bias", static_cast<float>(0.0));
-    new_op_desc.SetAttr("bias_after_scale", true);
-    new_op_desc.SetOutput("Out", {elementwise_mul_out->Name()});
-    new_op_desc.Flush();
-
-    // Create a new node for the fused op.
-    auto* scale_op = graph->CreateOpNode(&new_op_desc);
-
-    IR_NODE_LINK_TO(elementwise_in, scale_op);       // Input
-    IR_NODE_LINK_TO(scale_op, elementwise_mul_out);  // Output
-
-    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph,
-                         {fill_constant, fill_constant_out, elementwise_mul});
-  };
-
-  gpd(graph, handler);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fillconstant_elementwisemul_fuse,
-              paddle::framework::ir::FillconstantElementwisemulFuse);
diff --git a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h
deleted file mode 100644
index ab66fb4a46a8a5b60b3bf95e27ae24c7217a5a3a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class FillconstantElementwisemulFuse : public FusePassBase {
- public:
-  virtual ~FillconstantElementwisemulFuse() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
deleted file mode 100644
index 7f9eccf2fdd4ee7955b90fe20b91250e5b498f32..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+++ /dev/null
@@ -1,370 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h"
-#include <algorithm>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void FuseElewiseAddActPass::ApplyImpl(ir::Graph *graph) const {
-  std::unordered_set<std::string> act_types = {"relu", "scale", "tanh"};
-  graph = FuseActElewiseAdd(graph, act_types);
-  graph = FuseElewiseAddAct(graph, act_types);
-  // backward
-  {
-    std::unordered_set<std::string> in_place_act_types = {"relu_grad"};
-    graph = FuseElewiseAddActInplaceGrad(graph, in_place_act_types);
-  }
-
-  // Remove the removable intermediate_out.
-  RemoveIntermediateOut(graph);
-}
-
-// ele_add(x, act(y))
-ir::Graph *FuseElewiseAddActPass::FuseElewiseAddAct(
-    ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init("elewise_add_act", graph);
-
-  GraphPatternDetector gpd;
-  auto *x = gpd.mutable_pattern()
-                ->NewNode("elewise_add_act/x")
-                ->AsInput()
-                ->assert_is_op_input("elementwise_add", "X");
-  patterns::ElewiseAddAct elewise_add_act_pattern(gpd.mutable_pattern(),
-                                                  "elementwise_add");
-
-  elewise_add_act_pattern(x, act_types);
-
-  int found_elewise_add_act_count = 0;
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
-                     Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddAct fuse";
-    GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
-                              elewise_add_act_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(act, act, elewise_add_act_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, elewise_add_act_pattern);
-
-    std::string ele_x_n = subgraph.at(x)->Name();
-    std::string ele_y_n = ele_y->Name();
-    std::string ele_out_n = ele_out->Name();
-    std::string act_out_n = act_out->Name();
-
-    Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
-        g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n);
-
-    VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
-            << ele_add->Name() << " -> " << ele_out_n << "\n"
-            << "\t " << ele_out_n << " -> " << act->Name() << " -> "
-            << act_out_n;
-
-    ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node);
-    found_elewise_add_act_count++;
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_elewise_add_act_count);
-  return graph;
-}
-
-// act(ele_add(x,y))
-ir::Graph *FuseElewiseAddActPass::FuseActElewiseAdd(
-    ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init("act_elewise_add", graph);
-
-  GraphPatternDetector gpd;
-  auto *x = gpd.mutable_pattern()
-                ->NewNode("act_elewise_add/x")
-                ->AsInput()
-                ->assert_is_ops_input(act_types, "X");
-  patterns::ActElewiseAdd act_elewise_add_pattern(gpd.mutable_pattern(),
-                                                  "act_elewise_add");
-
-  act_elewise_add_pattern(x, act_types);
-
-  int found_elewise_add_act_count = 0;
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
-                     Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddAct fuse";
-    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
-                              act_elewise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(act, act, act_elewise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, act_elewise_add_pattern);
-
-    std::string act_i_n = subgraph.at(x)->Name();
-    std::string act_o_n = act_out->Name();
-    std::string elewise_add_x_n = ele_x->Name();
-    std::string elewise_add_out_n = ele_out->Name();
-
-    Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
-        g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n);
-
-    VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
-            << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
-            << ele_add->Name() << " -> " << elewise_add_out_n;
-
-    ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node);
-    found_elewise_add_act_count++;
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_elewise_add_act_count);
-  return graph;
-}
-
-// the backward of act(ele_add(x,y))
-// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
-// ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
-ir::Graph *FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
-    ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init("elewise_add_act_grad", graph);
-
-  GraphPatternDetector gpd;
-  auto *d_act_out = gpd.mutable_pattern()
-                        ->NewNode("elewise_add_act_grad_inplace/x")
-                        ->AsInput()
-                        ->assert_is_ops_input(act_types, GradVarName("Out"));
-  patterns::ElewiseAddActInplaceGrad elewise_add_act_grad_pattern(
-      gpd.mutable_pattern(), "elewise_add_act_grad_inplace");
-  elewise_add_act_grad_pattern(d_act_out, act_types);
-
-  int found_elewise_add_act_count = 0;
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
-                     Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddActGrad1 fuse";
-    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out,
-                              elewise_add_act_grad_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_grad_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad, ele_add_grad,
-                              elewise_add_act_grad_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(d_ele_x, d_ele_x, elewise_add_act_grad_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(d_ele_y, d_ele_y, elewise_add_act_grad_pattern);
-
-    std::string d_act_out_n = subgraph.at(d_act_out)->Name();
-    std::string act_out_n = act_out->Name();
-    std::string d_itermediate_out_n = d_itermediate_out->Name();
-    std::string ele_y_n = ele_y->Name();
-    std::string d_ele_x_n = d_ele_x->Name();
-    std::string d_ele_y_n = d_ele_y->Name();
-
-    OpDesc desc;
-    desc.SetType("fused_elemwise_activation_grad");
-    desc.SetInput("IntermediateOut", {});
-    desc.SetInput("X", {});
-    desc.SetInput("Y", std::vector<std::string>({ele_y_n}));
-    desc.SetInput("Out", std::vector<std::string>({act_out_n}));
-    desc.SetInput(GradVarName("Out"), std::vector<std::string>({d_act_out_n}));
-    desc.SetOutput(GradVarName("X"), std::vector<std::string>({d_ele_x_n}));
-    desc.SetOutput(GradVarName("Y"), std::vector<std::string>({d_ele_y_n}));
-    desc.SetOutput(GradVarName("IntermediateOut"),
-                   std::vector<std::string>({d_itermediate_out_n}));
-
-    desc.SetAttr("save_intermediate_out", false);
-    desc.SetAttr("functor_list",
-                 std::vector<std::string>(
-                     {act_grad->Op()->Type(), ele_add_grad->Op()->Type()}));
-
-    for (auto &n : {act_grad->Op(), ele_add_grad->Op()}) {
-      for (auto &m_ele : n->GetAttrMap()) {
-        desc.SetAttr(m_ele.first, m_ele.second);
-      }
-    }
-
-    auto fused_node = g->CreateOpNode(&desc);
-
-    VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
-            << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
-            << d_itermediate_out_n << " and " << act_out_n << " -> "
-            << ele_add_grad->Name() << " -> " << d_itermediate_out_n;
-
-    ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node);
-    found_elewise_add_act_count++;
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_elewise_add_act_count);
-  return graph;
-}
-
-Node *FuseElewiseAddActPass::CreateFuseElewiseAddActNode(
-    Graph *g, const Node *op_1, const Node *op_2, const std::string &ele_x_n,
-    const std::string &ele_y_n, const std::string &ele_out_n,
-    const std::string &act_out_n) const {
-  OpDesc desc;
-  desc.SetInput("X", std::vector<std::string>({ele_x_n}));
-  desc.SetInput("Y", std::vector<std::string>({ele_y_n}));
-  desc.SetOutput("Out", std::vector<std::string>({act_out_n}));
-  desc.SetOutput("IntermediateOut", std::vector<std::string>({ele_out_n}));
-  desc.SetType("fused_elemwise_activation");
-  desc.SetAttr("save_intermediate_out", true);
-  desc.SetAttr("functor_list", std::vector<std::string>(
-                                   {op_1->Op()->Type(), op_2->Op()->Type()}));
-
-  // Set attrs
-  for (auto &n : {op_1->Op(), op_2->Op()}) {
-    for (auto &m_ele : n->GetAttrMap()) {
-      desc.SetAttr(m_ele.first, m_ele.second);
-    }
-  }
-
-  auto elewise_add_act_node = g->CreateOpNode(&desc);
-  return elewise_add_act_node;
-}
-
-void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const {
-  std::unordered_set<const Node *> need_removed_nodes;
-  for (auto &cur_node : graph->Nodes()) {
-    if (cur_node->IsVar()) continue;
-    if (cur_node->Name() == "fused_elemwise_activation") {
-      bool save_intermediate_out =
-          boost::get<bool>(cur_node->Op()->GetAttr("save_intermediate_out"));
-      auto intermediate_out_args = cur_node->Op()->Output("IntermediateOut");
-      PADDLE_ENFORCE(
-          save_intermediate_out && !intermediate_out_args.empty(),
-          "The %s should save the intermediate_out in the fusing stage.",
-          cur_node->Name());
-
-      // If the intermediate_out's output is empty, it should be removed.
-      auto cur_node_outputs = cur_node->outputs;
-      for (auto &out : cur_node_outputs) {
-        if (out->Name() == intermediate_out_args[0]) {
-          if (out->outputs.size() == 0) {
-            cur_node->outputs = this->RemoveNode(out, cur_node->outputs);
-            need_removed_nodes.insert(std::move(out));
-            cur_node->Op()->SetAttr("save_intermediate_out", false);
-          }
-        }
-      }
-    } else if (cur_node->Name() == "fused_elemwise_activation_grad") {
-      auto intermediate_out_grad_args =
-          cur_node->Op()->Output(GradVarName("IntermediateOut"));
-      PADDLE_ENFORCE(
-          !intermediate_out_grad_args.empty(),
-          "The %s should save the intermediate_out in the fusing stage.",
-          cur_node->Name());
-      auto cur_node_outputs = cur_node->outputs;
-      // If the intermediate_out_g's output is empty, it should be removed.
-      for (auto &out : cur_node_outputs) {
-        if (out->Name() == intermediate_out_grad_args[0] &&
-            out->outputs.empty()) {
-          cur_node->Op()->SetOutput(GradVarName("IntermediateOut"), {});
-          cur_node->outputs = this->RemoveNode(out, cur_node->outputs);
-          need_removed_nodes.insert(std::move(out));
-        }
-      }
-    }
-  }
-  GraphSafeRemoveNodes(graph, need_removed_nodes);
-}
-
-void FuseElewiseAddActPass::ReLinkNodes(Graph *graph,
-                                        const Node *intermediate_out,
-                                        Node *op_1, Node *op_2,
-                                        Node *fused_op) const {  // delete act
-  for (auto &in : op_1->inputs) {
-    fused_op->inputs.emplace_back(in);
-    in->outputs = this->ReplaceNode(op_1, fused_op, in->outputs);
-  }
-
-  std::unordered_set<const Node *> nodes2delete;
-  for (auto &out : op_1->outputs) {
-    if (out->IsCtrlVar()) {
-      auto result_iter = std::find_if(
-          op_2->inputs.begin(), op_2->inputs.end(),
-          [&out](const Node *node) -> bool { return node == out; });
-
-      if (result_iter == op_2->inputs.end()) {
-        IR_OP_VAR_LINK(fused_op, out);
-      } else {
-        nodes2delete.emplace(out);
-      }
-    } else {
-      PADDLE_ENFORCE(out == intermediate_out);
-      IR_OP_VAR_LINK(fused_op, out);
-    }
-  }
-
-  for (auto &in : op_2->inputs) {
-    if (in == intermediate_out || nodes2delete.count(in)) {
-      continue;
-    }
-    fused_op->inputs.emplace_back(in);
-    in->outputs = this->ReplaceNode(op_2, fused_op, in->outputs);
-  }
-
-  for (auto &out : op_2->outputs) {
-    IR_OP_VAR_LINK(fused_op, out);
-  }
-
-  nodes2delete.insert(std::move(op_1));
-  nodes2delete.insert(std::move(op_2));
-
-  GraphSafeRemoveNodes(graph, nodes2delete);
-}
-
-std::vector<Node *> FuseElewiseAddActPass::ReplaceNode(
-    Node *cur_node, Node *new_node, const std::vector<Node *> &nodes) const {
-  std::vector<Node *> new_list(nodes.size());
-  bool has_replaced = false;
-  std::transform(nodes.begin(), nodes.end(), new_list.begin(),
-                 [&](Node *node) -> Node * {
-                   if (node == cur_node) {
-                     has_replaced = true;
-                     return new_node;
-                   }
-                   return node;
-                 });
-  PADDLE_ENFORCE(has_replaced, "Not find %s in the node list.",
-                 cur_node->Name());
-  return new_list;
-}
-
-std::vector<Node *> FuseElewiseAddActPass::RemoveNode(
-    Node *trg_node, const std::vector<Node *> &nodes) const {
-  std::vector<Node *> new_list(nodes.size());
-  auto end_iter =
-      std::copy_if(nodes.begin(), nodes.end(), new_list.begin(),
-                   [&](Node *node) -> bool { return node != trg_node; });
-  new_list.resize(
-      static_cast<uint64_t>(std::distance(new_list.begin(), end_iter)));
-  return new_list;
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fuse_elewise_add_act_pass,
-              paddle::framework::ir::FuseElewiseAddActPass);
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
deleted file mode 100644
index dc73f1fda03e130c6876819d91897b497b8b321e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Fuse the ElewiseAdd and activation
- */
-class FuseElewiseAddActPass : public FusePassBase {
- public:
-  virtual ~FuseElewiseAddActPass() {}
-
- protected:
-  void ApplyImpl(ir::Graph *graph) const override;
-
-  ir::Graph *FuseElewiseAddAct(
-      ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
-
-  ir::Graph *FuseActElewiseAdd(
-      ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
-
-  ir::Graph *FuseElewiseAddActInplaceGrad(
-      ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
-
-  /**
-   * Remove the removable intermediate_out.
-   *   - If the intermediate_out is only used by the backward op, but the
-   *     backward op doesn't use intermediate_out.
-   *   - If the intermediate_out_grad is not used by any op.
-   */
-  void RemoveIntermediateOut(Graph *graph) const;
-
-  std::vector<Node *> ReplaceNode(Node *cur_node, Node *new_node,
-                                  const std::vector<Node *> &nodes) const;
-
-  std::vector<Node *> RemoveNode(Node *trg_node,
-                                 const std::vector<Node *> &nodes) const;
-
-  void ReLinkNodes(Graph *graph, const Node *intermediate_out, Node *op_1,
-                   Node *op_2, Node *fused_op) const;
-  Node *CreateFuseElewiseAddActNode(Graph *g, const Node *op_1,
-                                    const Node *op_2,
-                                    const std::string &ele_x_n,
-                                    const std::string &ele_y_n,
-                                    const std::string &ele_out_n,
-                                    const std::string &act_out_n) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt
deleted file mode 100644
index 22876e962a033f391e54e396701c06fe826f7821..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-cc_library(fuse_optimizer_op_pass SRCS fuse_optimizer_op_pass.cc DEPS graph graph_helper)
-cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc DEPS fuse_optimizer_op_pass)
-cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc DEPS fuse_optimizer_op_pass)
-cc_library(fuse_momentum_op_pass SRCS fuse_momentum_op_pass.cc DEPS fuse_optimizer_op_pass)
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
deleted file mode 100644
index 8aec098720bb1ed6988da6ef2d213713c539a053..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
+++ /dev/null
@@ -1,231 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <algorithm>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class FuseAdamOpPass : public FuseOptimizerOpPass {
- private:
-  const std::string GetOpType() const { return "adam"; }
-
-  const std::vector<std::string> GetAuxiliaryVarNames() const {
-    return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
-  }
-
-  ir::Node *FuseOptimizerOps(
-      const std::unordered_map<std::string, std::vector<std::string>>
-          &aux_var_set,
-      const std::unordered_map<std::string, std::string> &fused_vars_name,
-      const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
-    auto fused_adam_node =
-        FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph);
-    auto fused_scale1 =
-        FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"),
-                     adam_ops, graph);
-    auto fused_scale2 =
-        FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"),
-                     adam_ops, graph);
-    RemoveCycleDepsBetweenOpNodes(graph, fused_scale1, fused_scale2);
-    return fused_adam_node;
-  }
-
-  void RemoveCycleDepsBetweenOpNodes(Graph *graph, const Node *fused_scale1,
-                                     const Node *fused_scale2) const {
-    std::unordered_set<Node *> not_need_ctrl_var_nodes;
-    std::unordered_set<Node *> fused_scale2_in_nodes;
-    fused_scale2_in_nodes.insert(fused_scale2->inputs.begin(),
-                                 fused_scale2->inputs.end());
-    for (auto &out_node : fused_scale1->outputs) {
-      if (fused_scale2_in_nodes.count(out_node)) {
-        PADDLE_ENFORCE(out_node->IsCtrlVar(),
-                       "The dependency var only should be ctrl var.");
-        not_need_ctrl_var_nodes.insert(out_node);
-      }
-    }
-
-    for (auto &node : not_need_ctrl_var_nodes) {
-      // remove this node from the input op node.
-      PADDLE_ENFORCE(!node->inputs.empty(),
-                     "The input should not be empty here.");
-      auto op_node = node->inputs.front();
-      PADDLE_ENFORCE(op_node->IsOp());
-      op_node->outputs.erase(
-          remove_if(
-              op_node->outputs.begin(), op_node->outputs.end(),
-              [&node](const Node *op_out_node) { return op_out_node == node; }),
-          op_node->outputs.end());
-
-      // remove this node from the output op nodes.
-      for (auto &out_op_node : node->outputs) {
-        out_op_node->inputs.erase(
-            remove_if(
-                out_op_node->inputs.begin(), out_op_node->inputs.end(),
-                [&node](const Node *op_in_node) { return op_in_node == node; }),
-            out_op_node->inputs.end());
-      }
-
-      graph->RemoveNode(node);
-    }
-  }
-
-  ir::Node *FuseAdamOps(
-      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
-      const std::unordered_map<std::string, std::string> &fused_vars_name,
-      const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
-    PADDLE_ENFORCE_GT(adam_ops.size(), static_cast<size_t>(0));
-
-    // Check attributions
-    // NOTE: If new attribution is added, the following code maybe need change.
-    int op_role = boost::get<int>(
-        adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
-    float beta1 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta1"));
-    float beta2 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta2"));
-    float epsilon = boost::get<float>(adam_ops[0]->Op()->GetAttr("epsilon"));
-    bool lazy_mode = boost::get<bool>(adam_ops[0]->Op()->GetAttr("lazy_mode"));
-    int64_t min_row_size_to_use_multithread = boost::get<int64_t>(
-        adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread"));
-    for (auto &adam_op : adam_ops) {
-      PADDLE_ENFORCE_EQ(beta1,
-                        boost::get<float>(adam_op->Op()->GetAttr("beta1")));
-      PADDLE_ENFORCE_EQ(beta2,
-                        boost::get<float>(adam_op->Op()->GetAttr("beta2")));
-      PADDLE_ENFORCE_EQ(epsilon,
-                        boost::get<float>(adam_op->Op()->GetAttr("epsilon")));
-      PADDLE_ENFORCE_EQ(lazy_mode,
-                        boost::get<bool>(adam_op->Op()->GetAttr("lazy_mode")));
-      PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread,
-                        boost::get<int64_t>(adam_op->Op()->GetAttr(
-                            "min_row_size_to_use_multithread")));
-      PADDLE_ENFORCE_EQ(op_role,
-                        boost::get<int>(adam_op->Op()->GetAttr(
-                            OpProtoAndCheckerMaker::OpRoleAttrName())));
-    }
-
-    // NOTE: fused_var is only exist in scope, so the graph doesn't have
-    // fused_var node.
-
-    VLOG(6) << "Insert adam to graph ";
-    OpDesc adam_desc(adam_ops[0]->Op()->Block());
-    adam_desc.SetType("adam");
-    adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
-    adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
-    adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
-    adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});
-    // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
-    adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate));
-    adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
-    adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));
-
-    adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
-    adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
-    adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
-    adam_desc.SetAttr("beta1", beta1);
-    adam_desc.SetAttr("beta2", beta2);
-    adam_desc.SetAttr("epsilon", epsilon);
-    adam_desc.SetAttr("lazy_mode", lazy_mode);
-    adam_desc.SetAttr("min_row_size_to_use_multithread",
-                      min_row_size_to_use_multithread);
-    adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
-    return graph->CreateOpNode(&adam_desc);
-  }
-
-  ir::Node *FuseScaleOps(const std::vector<std::string> &beta_name,
-                         const std::string &fused_var_name,
-                         const std::vector<ir::Node *> &adam_ops,
-                         ir::Graph *graph) const {
-    PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size());
-    const std::string scale_op_name = "scale";
-
-    // Get the scale_ops of dealing the adam's beta var.
-    std::vector<ir::Node *> scale_ops;
-    scale_ops.reserve(beta_name.size());
-    for (size_t i = 0; i < adam_ops.size(); ++i) {
-      auto &beta_1_pow_name = beta_name[i];
-      auto beta_pow_iter = std::find_if(
-          adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(),
-          [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool {
-            return var_node->Var() &&
-                   var_node->Var()->Name() == beta_1_pow_name;
-          });
-      PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end());
-
-      auto beta_pow_node = *beta_pow_iter;
-      auto scale_op_iter = std::find_if(
-          beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(),
-          [&scale_op_name](ir::Node *op_node) -> bool {
-            return op_node->Op() && op_node->Op()->Type() == scale_op_name;
-          });
-      PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end());
-
-      scale_ops.emplace_back(*scale_op_iter);
-    }
-    PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size());
-    VLOG(6) << "The number of scale op is " << scale_ops.size() << ".";
-    // Check attributions
-    // NOTE: If new attribution is added, the following code maybe need change.
-    int op_role = boost::get<int>(
-        scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
-    float scale = boost::get<float>(scale_ops[0]->Op()->GetAttr("scale"));
-    float bias = boost::get<float>(scale_ops[0]->Op()->GetAttr("bias"));
-    bool bias_after_scale =
-        boost::get<bool>(scale_ops[0]->Op()->GetAttr("bias_after_scale"));
-    for (auto &scale_op : scale_ops) {
-      PADDLE_ENFORCE_EQ(scale,
-                        boost::get<float>(scale_op->Op()->GetAttr("scale")));
-      PADDLE_ENFORCE_EQ(bias,
-                        boost::get<float>(scale_op->Op()->GetAttr("bias")));
-      PADDLE_ENFORCE_EQ(
-          bias_after_scale,
-          boost::get<bool>(scale_op->Op()->GetAttr("bias_after_scale")));
-      PADDLE_ENFORCE_EQ(op_role,
-                        boost::get<int>(scale_op->Op()->GetAttr(
-                            OpProtoAndCheckerMaker::OpRoleAttrName())));
-    }
-
-    // NOTE: fused_var is only exist in scope, so the graph doesn't have
-    // fused_var node.
-
-    VLOG(6) << "Insert fused scale to graph.";
-    OpDesc scale_desc(scale_ops[0]->Op()->Block());
-    scale_desc.SetType("scale");
-    scale_desc.SetInput("X", {fused_var_name});
-    scale_desc.SetOutput("Out", {fused_var_name});
-    scale_desc.SetAttr("scale", scale);
-    scale_desc.SetAttr("bias", bias);
-    scale_desc.SetAttr("bias_after_scale", bias_after_scale);
-    scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
-    auto scale_node = graph->CreateOpNode(&scale_desc);
-
-    InsertInputAndOutputForFusedOpNode(scale_ops, graph, scale_node);
-    // Delete scale_ops
-    for (auto &scale_op : scale_ops) {
-      graph->RemoveNode(scale_op);
-    }
-    return scale_node;
-  }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fuse_adam_op_pass, paddle::framework::ir::FuseAdamOpPass);
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
deleted file mode 100644
index 8f3a623a9883b6b4e638da7c39b0d1f9d78c0488..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class FuseMomentumOpPass : public FuseOptimizerOpPass {
- private:
-  virtual const std::string GetOpType() const { return "momentum"; }
-
-  virtual const std::vector<std::string> GetAuxiliaryVarNames() const {
-    return {"Velocity"};
-  }
-
-  // Fuse Momentum Ops
-  virtual ir::Node *FuseOptimizerOps(
-      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
-      const std::unordered_map<std::string, std::string> &fused_vars_name,
-      const std::vector<ir::Node *> &momentum_ops, ir::Graph *graph) const {
-    PADDLE_ENFORCE_GT(momentum_ops.size(), static_cast<size_t>(0));
-
-    // Check attributions
-    // NOTE: If new attribution is added, the following code maybe need change.
-    int op_role = boost::get<int>(momentum_ops[0]->Op()->GetAttr(
-        OpProtoAndCheckerMaker::OpRoleAttrName()));
-    float mu = boost::get<float>(momentum_ops[0]->Op()->GetAttr("mu"));
-    bool use_nesterov =
-        boost::get<bool>(momentum_ops[0]->Op()->GetAttr("use_nesterov"));
-
-    for (auto &momentum_op : momentum_ops) {
-      PADDLE_ENFORCE_EQ(mu,
-                        boost::get<float>(momentum_op->Op()->GetAttr("mu")));
-      PADDLE_ENFORCE_EQ(
-          use_nesterov,
-          boost::get<bool>(momentum_op->Op()->GetAttr("use_nesterov")));
-      PADDLE_ENFORCE_EQ(op_role,
-                        boost::get<int>(momentum_op->Op()->GetAttr(
-                            OpProtoAndCheckerMaker::OpRoleAttrName())));
-    }
-
-    // NOTE: fused_var is only exist in scope, so the graph doesn't have
-    // fused_var node.
-
-    VLOG(6) << "Insert momentum to graph ";
-    OpDesc momentum_desc(momentum_ops[0]->Op()->Block());
-    momentum_desc.SetType("momentum");
-    momentum_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
-    momentum_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
-    momentum_desc.SetInput("Velocity", {fused_vars_name.at("Velocity")});
-    // TODO(zcd): The LearningRate should be equal.
-    momentum_desc.SetInput(kLearningRate,
-                           momentum_ops[0]->Op()->Input(kLearningRate));
-
-    momentum_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
-    momentum_desc.SetOutput("VelocityOut", {fused_vars_name.at("Velocity")});
-    momentum_desc.SetAttr("mu", mu);
-    momentum_desc.SetAttr("use_nesterov", use_nesterov);
-    momentum_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
-
-    return graph->CreateOpNode(&momentum_desc);
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fuse_momentum_op_pass, paddle::framework::ir::FuseMomentumOpPass);
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
deleted file mode 100644
index fcb5604a07c8140e1dbc02634b06d65641e5bf4b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
+++ /dev/null
@@ -1,487 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
-#include <algorithm>
-#include <set>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
-  ir::Graph &result = *graph;
-
-  const std::string fuse_op_type = GetOpType();
-  std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
-  aux_var_names.emplace_back(kParam);
-  aux_var_names.emplace_back(kGrad);
-
-  // Step 1: Get the specified op and auxiliary variables.
-  std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
-  auto vars_info = GetVarInfo(result);
-  std::vector<ir::Node *> opt_nodes;
-  size_t opt_ops_num = 0;
-  // Note: Only take care about the dense gradients.
-  for (auto &node : topo_nodes) {
-    if (node->Op()->Type() == fuse_op_type) {
-      auto grad_name = node->Op()->Input(kGrad);
-      PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1));
-      if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) {
-        opt_nodes.emplace_back(node);
-      }
-      ++opt_ops_num;
-    }
-  }
-
-  VLOG(6) << "Find " << fuse_op_type << " operators : " << opt_ops_num
-          << ", and " << opt_nodes.size() << " for dense gradients.";
-  if (opt_nodes.size() == 0 || result.Has(details::kFusedOptType)) {
-    if (result.Has(details::kFusedOptType)) {
-      auto &opt_type =
-          result.Get<details::FusedOptType>(details::kFusedOptType);
-      VLOG(6) << "Currently only support fusing one type optimizer op. "
-                 "Has fused "
-              << opt_type;
-    }
-    return;
-  }
-
-  // There should not have no-ctr-var between the op_nodes that link the op_node
-  // of op_nodes.
-  if (HasVarDepsBetweenOps(topo_nodes, opt_nodes)) {
-    VLOG(6) << "There are interdependent variables among these optimization "
-               "operators, which can not be handled well at present.";
-    return;
-  }
-
-  LOG(WARNING) << "Find " << fuse_op_type << " operators : " << opt_ops_num
-               << ", and " << opt_nodes.size() << " for dense gradients. "
-               << "To make the speed faster, those optimization are fused "
-                  "during training.";
-
-  result.Set(details::kFusedOptType, new details::FusedOptType);
-  result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type;
-  if (!result.Has(details::kProgramDescs)) {
-    result.Set(details::kProgramDescs, new details::ProgramDescs);
-  }
-
-  // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be
-  // initialized in scopes before execution.
-  if (!result.Has(details::kFusedVars)) {
-    result.Set(details::kFusedVars, new details::FusedVars);
-  }
-  std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
-  GetSpecifiedOpsAndVars(aux_var_names, opt_nodes, &aux_var_set);
-  std::unordered_map<std::string, std::string> fused_vars_name;
-  fused_vars_name.reserve(aux_var_names.size());
-  auto &fused_var_set = result.Get<details::FusedVars>(details::kFusedVars);
-  const std::string prefix(details::kFusedVarNamePrefix);
-  for (auto &var_name : aux_var_names) {
-    // NOTE: the fused_var_name should be unique.
-    auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
-                          aux_var_set[var_name][0];
-    VLOG(6) << var_name << ": " << fused_var_name;
-    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
-    fused_var_set.insert(fused_var_name);
-    fused_vars_name.emplace(var_name, fused_var_name);
-  }
-
-  // Step 3: Get the fused Gradient's name
-  bool grad_fused = false;
-  if (result.Has(details::kParamsAndDenseGrads)) {
-    // NOTE: kParamsAndDenseGrads is generated by
-    // alloc_continue_space_for_grad_pass
-    auto &params_and_dense_grads =
-        result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
-    PADDLE_ENFORCE_LE(
-        params_and_dense_grads.size(), aux_var_set.at(kGrad).size(),
-        "The number of dense gradients should be little than optimizer ops.");
-
-    std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).size());
-    for (auto &p_g : params_and_dense_grads) {
-      opt_grad_set.insert(p_g.second);
-    }
-    std::vector<size_t> new_grad_idx;
-    for (size_t idx = 0; idx < aux_var_set.at(kGrad).size(); ++idx) {
-      auto &grad = aux_var_set.at(kGrad).at(idx);
-      if (!opt_grad_set.count(grad)) {
-        new_grad_idx.emplace_back(idx);
-      }
-    }
-
-    // NOTE(zcd): the gradient of kParamsAndDenseGrads may be different
-    // with the kGrad. The gradients of kParamsAndDenseGrads is
-    // collected during backward stage, but in optimization state, the
-    // some gradient's name maybe changed.
-    if (new_grad_idx.size() == 0) {
-      if (!result.Has(details::kFusedGrads)) {
-        PADDLE_THROW(
-            "The coalesce_grad_tensor_pass should "
-            "be called before this pass.");
-      }
-      auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads);
-      PADDLE_ENFORCE_NE(fused_grad.size(), 0,
-                        "The fused gradient should not be empty.");
-      PADDLE_ENFORCE_EQ(fused_grad.size(), 1,
-                        "Because the dtype of those gradients "
-                        "is not unified, so the number of fused gradients is "
-                        "more than one, but it is not supported currently.");
-      auto &fused_vars = result.Get<details::FusedVars>(details::kFusedVars);
-      auto iter =
-          std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front());
-      PADDLE_ENFORCE_EQ(iter != fused_vars.end(), true,
-                        "Not find the fused_grad.");
-      fused_vars_name[kGrad] = fused_grad.front();
-
-      // Sort the parameters and auxiliary variables according
-      // to parameters' name to make variables' name correspond correctly.
-      SortParametersAndAuxVars(params_and_dense_grads, &aux_var_set,
-                               &opt_nodes);
-      grad_fused = true;
-    } else {
-      VLOG(6) << "The number of new gradients is " << new_grad_idx.size();
-      if (new_grad_idx.size() == 1) return;
-      // NOTE(zcd): If the gradients of backward stage and optimization stage
-      // have diff, Only take care of the the gradient of optimization stage.
-      GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_set);
-    }
-  }
-
-  // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
-  // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops
-  // separately.
-  if (!grad_fused) {
-    InitFusedGradsAndAllocSpaceForGrads(aux_var_set.at(kParam),
-                                        aux_var_set.at(kGrad),
-                                        fused_vars_name.at(kGrad), &result);
-  }
-  aux_var_names.pop_back();
-  InitFusedVarsAndAllocSpaceForVars(aux_var_names, aux_var_set, fused_vars_name,
-                                    &result);
-
-  // Step 5: Fuse optimizer Ops and Scale Ops
-  auto *fused_opt_node =
-      FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result);
-
-  InsertInputAndOutputForFusedOpNode(opt_nodes, graph, fused_opt_node);
-  // Step 6: Remove optimizer Ops
-  for (auto &opt_op : opt_nodes) {
-    graph->RemoveNode(opt_op);
-  }
-}
-
-bool FuseOptimizerOpPass::HasVarDepsBetweenOps(
-    const std::vector<Node *> &topo_nodes,
-    const std::vector<Node *> &opt_nodes) const {
-  std::unordered_map<Node *, std::unordered_set<Node *>> preceding_ops;
-  std::unordered_map<Node *, std::unordered_set<Node *>> pending_ops;
-  for (auto &op : topo_nodes) {
-    preceding_ops[op];
-    pending_ops[op];
-    for (auto &var : op->outputs) {
-      if (var->IsCtrlVar()) continue;
-      for (auto &pending_op : var->outputs) {
-        preceding_ops[pending_op].insert(op);
-        pending_ops[op].insert(pending_op);
-      }
-    }
-  }
-
-  std::unordered_set<Node *> opt_node_set(opt_nodes.begin(), opt_nodes.end());
-  auto has_var_deps = [](const std::unordered_set<Node *> &op_set1,
-                         const std::unordered_set<Node *> &op_set2) -> bool {
-    std::set<Node *> intersect_ops;
-    set_intersection(op_set1.begin(), op_set1.end(), op_set2.begin(),
-                     op_set2.end(),
-                     inserter(intersect_ops, intersect_ops.begin()));
-    return !intersect_ops.empty();
-  };
-
-  for (auto opt_node : opt_node_set) {
-    if (has_var_deps(preceding_ops.at(opt_node), opt_node_set)) {
-      return true;
-    }
-    if (has_var_deps(pending_ops.at(opt_node), opt_node_set)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-void FuseOptimizerOpPass::GradientsFilter(
-    const std::vector<size_t> &new_grad_idx, std::vector<Node *> *opt_nodes,
-    std::unordered_map<std::string, std::vector<std::string>> *aux_var_set)
-    const {
-  for (auto &aux_vars : *aux_var_set) {
-    std::vector<std::string> sorted_vars;
-    sorted_vars.reserve(aux_vars.second.size());
-    for (size_t i : new_grad_idx) {
-      sorted_vars.emplace_back(aux_vars.second.at(i));
-    }
-    std::swap(aux_vars.second, sorted_vars);
-    if (VLOG_IS_ON(6)) {
-      std::stringstream out;
-      for (auto &var_name : aux_vars.second) {
-        out << var_name << " ";
-      }
-      VLOG(6) << aux_vars.first << ": " << out.str();
-    }
-  }
-  std::vector<Node *> sorted_ops;
-  for (size_t i : new_grad_idx) {
-    sorted_ops.emplace_back(opt_nodes->at(i));
-  }
-  std::swap(*opt_nodes, sorted_ops);
-}
-
-void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
-    const std::vector<std::string> &params,
-    const std::vector<std::string> &grads, const std::string &fused_grad_name,
-    ir::Graph *result) const {
-  auto &pinned_var_set =
-      result->GetOrInit<details::PinnedVars>(details::kPinnedVars);
-
-  auto vars_info = GetVarInfo(*result);
-  // The Gradients should not be reused during memory optimization.
-  for (auto &grad_var_name : grads) {
-    auto iter = vars_info.find(grad_var_name);
-    PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.",
-                      grad_var_name);
-    PADDLE_ENFORCE_EQ(!iter->second.empty(), true, "%s is not found.",
-                      grad_var_name);
-    PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var());
-    PADDLE_ENFORCE_EQ(
-        IsLoDTensorType(iter->second.front()->Var()->GetType()), true,
-        "Currently the gradient type only should be LoDTensor when "
-        "fusing optimizer ops.");
-    for (auto var : iter->second) {
-      pinned_var_set.insert(var->Var()->Name());
-    }
-  }
-
-  // Define Ops
-  result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
-  ProgramDesc &program_desc =
-      result->Get<details::ProgramDescs>(details::kProgramDescs).back();
-  auto *global_block = program_desc.MutableBlock(0);
-  AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
-                             false, false);
-}
-
-std::unordered_map<std::string, std::vector<Node *>>
-FuseOptimizerOpPass::GetVarInfo(const Graph &result) const {
-  std::unordered_map<std::string, std::vector<Node *>> vars;
-  for (Node *node : result.Nodes()) {
-    if (node->IsVar() && node->Var()) {
-      // Note: The graph may have the same name node. For example, parameter
-      // is the input of operator and it also is the output of optimizer;
-      vars[node->Var()->Name()].emplace_back(node);
-    }
-  }
-  return vars;
-}
-
-bool FuseOptimizerOpPass::IsLoDTensorType(
-    const proto::VarType::Type &type) const {
-  // Current only support LOD_TENSOR.
-  return type == proto::VarType::LOD_TENSOR;
-}
-
-proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
-    const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
-    const std::string &name) const {
-  auto grad_iter = var_nodes.find(name);
-  PADDLE_ENFORCE_EQ(grad_iter != var_nodes.end(), true, "%s is not found.",
-                    name);
-  PADDLE_ENFORCE_GT(grad_iter->second.size(), 0);
-  PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
-  return grad_iter->second.front()->Var()->GetType();
-}
-
-void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
-    const std::vector<std::string> &aux_var_names,
-    const std::unordered_map<std::string, std::vector<std::string>>
-        &aux_var_set,
-    const std::unordered_map<std::string, std::string> &fused_vars_name,
-    ir::Graph *result) const {
-  // Define Ops
-  result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
-  ProgramDesc &program_desc =
-      result->Get<details::ProgramDescs>(details::kProgramDescs).back();
-  auto *global_block = program_desc.MutableBlock(0);
-  for (auto &var_name : aux_var_names) {
-    AppendAllocContinuousSpace(
-        aux_var_set.at(var_name), aux_var_set.at(var_name),
-        fused_vars_name.at(var_name), global_block, true);
-  }
-}
-
-void FuseOptimizerOpPass::SortParametersAndAuxVars(
-    const std::vector<std::pair<std::string, std::string>> &params_grads,
-    std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
-    std::vector<ir::Node *> *ops) const {
-  PADDLE_ENFORCE_NE(aux_vars_set->count(kGrad), static_cast<size_t>(0));
-  auto &grad_vec = aux_vars_set->at(kGrad);
-
-  std::vector<size_t> grad_sort_idx;
-  grad_sort_idx.reserve(grad_vec.size());
-
-  for (auto &p_g : params_grads) {
-    auto iter = std::find(grad_vec.begin(), grad_vec.end(), p_g.second);
-    PADDLE_ENFORCE_EQ(iter != grad_vec.end(), true,
-                      "%s is not found in grad_vec", p_g.second);
-    auto idx = std::distance(grad_vec.begin(), iter);
-    grad_sort_idx.emplace_back(idx);
-  }
-
-  for (auto &aux_vars : *aux_vars_set) {
-    std::vector<std::string> sorted_vars;
-    sorted_vars.reserve(aux_vars.second.size());
-    for (size_t i = 0; i < aux_vars.second.size(); ++i) {
-      sorted_vars.emplace_back(aux_vars.second.at(grad_sort_idx[i]));
-    }
-    std::swap(aux_vars.second, sorted_vars);
-
-    if (VLOG_IS_ON(6)) {
-      std::stringstream out;
-      for (auto &var_name : aux_vars.second) {
-        out << var_name << " ";
-      }
-      VLOG(6) << aux_vars.first << ": " << out.str();
-    }
-  }
-
-  std::vector<ir::Node *> sorted_ops;
-  sorted_ops.reserve(ops->size());
-  for (size_t i = 0; i < ops->size(); ++i) {
-    sorted_ops.emplace_back(ops->at(grad_sort_idx[i]));
-  }
-  std::swap(*ops, sorted_ops);
-}
-
-void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
-    const std::vector<std::string> &aux_vars_name,
-    const std::vector<ir::Node *> &opt_nodes,
-    std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
-    const {
-  for (auto &node : opt_nodes) {
-    std::stringstream out;
-    for (auto &var_n : aux_vars_name) {
-      auto arg_names = node->Op()->Input(var_n);
-      PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1));
-      (*aux_args_name)[var_n].emplace_back(arg_names[0]);
-      out << var_n << ", " << arg_names[0] << "; ";
-    }
-  }
-}
-
-void FuseOptimizerOpPass::AppendAllocContinuousSpace(
-    const std::vector<std::string> &in_args,
-    const std::vector<std::string> &out_args, const std::string &fused_out_arg,
-    BlockDesc *global_block, bool copy_data, bool check_name) const {
-  auto op_desc = global_block->AppendOp();
-  op_desc->SetType("coalesce_tensor");
-  op_desc->SetInput("Input", in_args);
-  op_desc->SetOutput("Output", out_args);
-  op_desc->SetOutput("FusedOutput", {fused_out_arg});
-  op_desc->SetAttr("copy_data", copy_data);
-  op_desc->SetAttr("check_name", check_name);
-}
-
-void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode(
-    const std::vector<ir::Node *> &op_nodes, ir::Graph *graph,
-    ir::Node *fused_opt_node) const {
-  std::unordered_set<ir::Node *> inputs;
-  std::unordered_set<ir::Node *> outputs;
-  for (auto opt_op : op_nodes) {
-    inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end());
-    for (auto &input : opt_op->inputs) {
-      replace(input->outputs.begin(), input->outputs.end(), opt_op,
-              fused_opt_node);
-    }
-    outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end());
-    for (auto &output : opt_op->outputs) {
-      replace(output->inputs.begin(), output->inputs.end(), opt_op,
-              fused_opt_node);
-    }
-  }
-
-  // Remove the dependence vars between op_nodes.
-  std::unordered_set<ir::Node *> out_dep_vars;
-  std::unordered_set<ir::Node *> not_useful_vars;
-
-  auto deal_with_ctrl_vars = [&out_dep_vars, &not_useful_vars,
-                              &fused_opt_node](ir::Node *ctr_var_node) {
-    PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1);
-    if (ctr_var_node->inputs.front() == fused_opt_node) {
-      PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0);
-      auto output_ops = ctr_var_node->outputs;
-      output_ops.erase(std::remove_if(output_ops.begin(), output_ops.end(),
-                                      [&fused_opt_node](const ir::Node *node) {
-                                        return node == fused_opt_node;
-                                      }),
-                       output_ops.end());
-      if (!output_ops.empty()) {
-        out_dep_vars.insert(ctr_var_node);
-      }
-      not_useful_vars.insert(ctr_var_node);
-    }
-  };
-
-  for (auto *in_node : inputs) {
-    if (in_node->IsCtrlVar()) {
-      deal_with_ctrl_vars(in_node);
-    }
-  }
-
-  for (auto *out_node : outputs) {
-    if (out_node->IsCtrlVar()) {
-      deal_with_ctrl_vars(out_node);
-    }
-  }
-
-  for (auto &node : not_useful_vars) {
-    if (inputs.count(node)) {
-      inputs.erase(node);
-    }
-    if (outputs.count(node)) {
-      outputs.erase(node);
-    }
-  }
-
-  for (auto &dep_var : out_dep_vars) {
-    if (not_useful_vars.count(dep_var)) {
-      not_useful_vars.erase(dep_var);
-    }
-    dep_var->inputs.clear();
-    dep_var->inputs.emplace_back(fused_opt_node);
-  }
-
-  outputs.insert(out_dep_vars.begin(), out_dep_vars.end());
-  fused_opt_node->inputs.insert(fused_opt_node->inputs.begin(), inputs.begin(),
-                                inputs.end());
-  fused_opt_node->outputs.insert(fused_opt_node->outputs.begin(),
-                                 outputs.begin(), outputs.end());
-
-  for (auto &ctrl_var_node : not_useful_vars) {
-    graph->RemoveNode(ctrl_var_node);
-  }
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
deleted file mode 100644
index 149bd20d38cfa148e3bfa3456cfb0c94833a9e33..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
+++ /dev/null
@@ -1,102 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/build_strategy.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-constexpr char kGrad[] = "Grad";
-constexpr char kParam[] = "Param";
-constexpr char kLearningRate[] = "LearningRate";
-
-class FuseOptimizerOpPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override;
-
- protected:
-  virtual void SortParametersAndAuxVars(
-      const std::vector<std::pair<std::string, std::string>> &params_grads,
-      std::unordered_map<std::string, std::vector<std::string>> *aux_var_set,
-      std::vector<ir::Node *> *ops) const;
-
-  void InsertInputAndOutputForFusedOpNode(
-      const std::vector<ir::Node *> &opt_ops, ir::Graph *graph,
-      ir::Node *opt_node) const;
-
- private:
-  virtual const std::string GetOpType() const = 0;
-
-  virtual const std::vector<std::string> GetAuxiliaryVarNames() const = 0;
-
-  virtual ir::Node *FuseOptimizerOps(
-      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
-      const std::unordered_map<std::string, std::string> &fused_vars_name,
-      const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
-
-  void GetSpecifiedOpsAndVars(
-      const std::vector<std::string> &aux_vars_name,
-      const std::vector<ir::Node *> &opt_nodes,
-      std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
-      const;
-
-  void AppendAllocContinuousSpace(const std::vector<std::string> &in_args,
-                                  const std::vector<std::string> &out_args,
-                                  const std::string &fused_out_arg,
-                                  BlockDesc *global_block, bool copy_data,
-                                  bool check_name = true) const;
-
-  void InitFusedGradsAndAllocSpaceForGrads(
-      const std::vector<std::string> &params,
-      const std::vector<std::string> &grads, const std::string &fused_grad_name,
-      ir::Graph *result) const;
-
-  void InitFusedVarsAndAllocSpaceForVars(
-      const std::vector<std::string> &aux_var_names,
-      const std::unordered_map<std::string, std::vector<std::string>>
-          &aux_var_set,
-      const std::unordered_map<std::string, std::string> &fused_vars_name,
-      ir::Graph *result) const;
-
-  std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
-      const Graph &result) const;
-
-  proto::VarType::Type GetTypeOfVar(
-      const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
-      const std::string &name) const;
-
-  void GradientsFilter(const std::vector<size_t> &new_grad_idx,
-                       std::vector<Node *> *opt_nodes,
-                       std::unordered_map<std::string, std::vector<std::string>>
-                           *aux_var_set) const;
-
-  bool IsLoDTensorType(const proto::VarType::Type &type) const;
-
-  bool HasVarDepsBetweenOps(const std::vector<Node *> &topo_nodes,
-                            const std::vector<Node *> &opt_nodes) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
deleted file mode 100644
index 3dd54cbc3c983e26072c09d6af48688965098611..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <algorithm>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
-#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class FuseSgdOpPass : public FuseOptimizerOpPass {
- private:
-  virtual const std::string GetOpType() const { return "sgd"; }
-
-  virtual const std::vector<std::string> GetAuxiliaryVarNames() const {
-    return {};
-  }
-
-  // Fuse Sgd Ops
-  virtual ir::Node *FuseOptimizerOps(
-      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
-      const std::unordered_map<std::string, std::string> &fused_vars_name,
-      const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
-    PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast<size_t>(0));
-
-    // NOTE: fused_var is only exist in scope, so the graph doesn't have
-    // fused_var node.
-
-    int op_role = boost::get<int>(
-        sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
-    VLOG(6) << "Insert sgd to graph.";
-    // Add fused scale
-    OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
-    Sgd_desc.SetType("sgd");
-    Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
-    Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
-    Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
-
-    // TODO(zcd): The LearningRate should be equal.
-    Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate));
-
-    // NOTE: multi_devices_pass requires that every op should have a role.
-    Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
-
-    return graph->CreateOpNode(&Sgd_desc);
-  }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::ir::FuseSgdOpPass);
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.cc b/paddle/fluid/framework/ir/fuse_pass_base.cc
deleted file mode 100644
index 5e2523607d6973b53e7fd68394ae887c5e14b09d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_pass_base.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include <unordered_map>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void FusePassBase::Init(const std::string& repr, Graph* graph) const {
-  repr_ = repr;
-  graph_ = graph;
-}
-
-Scope* FusePassBase::param_scope() const {
-  PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
-  auto& scope = graph_->Get<framework::Scope>(kParamScopeAttr);
-  return &scope;
-}
-
-void FusePassBase::AddStatis(int count_of_fused) const {
-  PADDLE_ENFORCE(graph_);
-  PADDLE_ENFORCE(!repr_.empty());
-  if (!graph_->Has(kFuseStatisAttr)) {
-    graph_->Set(kFuseStatisAttr, new std::unordered_map<std::string, int>);
-  }
-  auto& info =
-      graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
-  info[repr_] = count_of_fused;
-}
-
-FuseOptions FusePassBase::FindFuseOption(const Node& node1,
-                                         const Node& node2) const {
-#ifdef PADDLE_WITH_MKLDNN
-  bool node1_mkldnn = node1.Op()->HasAttr("use_mkldnn") &&
-                      boost::get<bool>(node1.Op()->GetAttr("use_mkldnn"));
-  bool node2_mkldnn = node2.Op()->HasAttr("use_mkldnn") &&
-                      boost::get<bool>(node2.Op()->GetAttr("use_mkldnn"));
-  if (node1_mkldnn && node2_mkldnn)
-    return FUSE_MKLDNN;
-  else if (!node1_mkldnn && !node2_mkldnn)
-    return FUSE_NATIVE;
-  else
-    return DO_NOT_FUSE;
-#else
-  return FUSE_NATIVE;
-#endif
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h
deleted file mode 100644
index 3a1022bbcbd671391fb034bdff7c3cf97952f84d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-static const char kParamScopeAttr[] = "__param_scope__";
-static const char kFuseStatisAttr[] = "__fuse_statis__";
-// When we use trt or other third_party lib, the parameters are managed by
-// the lib, but not the fluid. So we need to record them to avoid duplicate
-// allocation.
-static const char kRepetitiveParamAttr[] = "__repetitive_param__";
-
-enum FuseOptions {
-  DO_NOT_FUSE,  // fusing will not be done
-  FUSE_NATIVE,  // fusing will be done without MKL-DNN
-  FUSE_MKLDNN   // fusing will be done with MKL-DNN
-};
-
-class FusePassBase : public Pass {
- public:
-  void Init(const std::string& repr, Graph* graph) const;
-  Scope* param_scope() const;
-  void AddStatis(int count_of_fused) const;
-
-  virtual ~FusePassBase() {}
-
- protected:
-  virtual FuseOptions FindFuseOption(const Node& node1,
-                                     const Node& node2) const;
-
-  mutable Graph* graph_;
-  mutable std::string repr_;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
deleted file mode 100644
index c4e6b6e6a52ec77c85c7c6162c4cbd006e47c502..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h"
-#include <algorithm>
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void FuseReluDepthwiseConvPass::ApplyImpl(ir::Graph *graph) const {
-  graph = FuseReluDepthwiseConv(graph, true);
-  graph = FuseReluDepthwiseConv(graph, false);
-}
-
-ir::Graph *FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
-    ir::Graph *graph, bool only_forward) const {
-  PADDLE_ENFORCE(graph);
-  if (only_forward)
-    FusePassBase::Init("relu_depthwise_conv_only_forward", graph);
-  else
-    FusePassBase::Init("relu_depthwise_conv", graph);
-  /*
-           x ---act--> y ---layer-> z
-            +----------+
-            ↓          ↓
-    x' <--act'--- y' <-layer'--- z'
-
-    fuse to:
-
-           x ---act-layer-> z
-           |
-           ↓
-    x' <--act-layer'--- z'
-
-  */
-
-  GraphPatternDetector gpd;
-  auto *pattern = gpd.mutable_pattern();
-  std::string act_type = "relu";
-  std::string layer_type = "depthwise_conv2d";
-  auto *x = pattern->NewNode("x")->AsInput();
-  auto *y = pattern->NewNode("y")->AsIntermediate();
-  auto *z = pattern->NewNode("z")->AsOutput();
-  PDNode *xg = nullptr;
-  PDNode *yg = nullptr;
-  PDNode *zg = nullptr;
-  if (!only_forward) {
-    xg = pattern->NewNode("xg")->AsOutput();
-    yg = pattern->NewNode("yg")->AsIntermediate();
-    zg = pattern->NewNode("zg")->AsInput();
-  }
-
-  PDNode *act_g = nullptr;
-  PDNode *layer_g = nullptr;
-  auto *act = pattern->NewNode("act")->assert_is_op(act_type);
-  auto *layer = pattern->NewNode("layer")->assert_is_op(layer_type);
-  if (!only_forward) {
-    act_g = pattern->NewNode("act_g")->assert_is_op(act_type + "_grad");
-    layer_g = pattern->NewNode("layer_g")->assert_is_op(layer_type + "_grad");
-  }
-
-  act->LinksFrom({x}).LinksTo({y});
-  layer->LinksFrom({y}).LinksTo({z});
-  if (!only_forward) {
-    layer_g->LinksFrom({y, zg}).LinksTo({yg});
-    act_g->LinksFrom({y, yg}).LinksTo({xg});
-  }
-
-  int count = 0;
-  std::unordered_set<const Node *> need_removed_nodes;
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
-                     Graph *g) {
-    VLOG(4) << "handle FuseReluDepthwiseConv fuse";
-    // 1. turn on fuse option
-    auto *layer_op = subgraph.at(layer)->Op();
-    layer_op->SetAttr("use_cudnn", false);
-    layer_op->SetAttr("fuse_relu_before_depthwise_conv", true);
-
-    OpDesc *layer_g_op = nullptr;
-    if (!only_forward) {
-      layer_g_op = subgraph.at(layer_g)->Op();
-      layer_g_op->SetAttr("use_cudnn", false);
-      layer_g_op->SetAttr("fuse_relu_before_depthwise_conv", true);
-    }
-    // 2. connect x to layer and layer_g, layer_g to xg
-    auto *y_var = subgraph.at(y)->Var();
-    auto *x_var = subgraph.at(x)->Var();
-    VarDesc *yg_var = nullptr;
-    VarDesc *xg_var = nullptr;
-    if (!only_forward) {
-      yg_var = subgraph.at(yg)->Var();
-      xg_var = subgraph.at(xg)->Var();
-    }
-
-    PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL);
-    PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name());
-    layer_op->SetInput("Input", {x_var->Name()});
-    subgraph.at(layer)->inputs.push_back(subgraph.at(x));
-    subgraph.at(x)->outputs.push_back(subgraph.at(layer));
-    VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name();
-
-    if (!only_forward) {
-      PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1UL);
-      PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name());
-      layer_g_op->SetInput("Input", {x_var->Name()});
-      subgraph.at(layer_g)->inputs.push_back(subgraph.at(x));
-      subgraph.at(x)->outputs.push_back(subgraph.at(layer_g));
-
-      PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1UL);
-      PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0],
-                        yg_var->Name());
-      layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()});
-      subgraph.at(layer_g)->outputs.push_back(subgraph.at(xg));
-      subgraph.at(xg)->inputs.push_back(subgraph.at(layer_g));
-      VLOG(4) << "replace " << yg_var->Name() << " -> " << xg_var->Name();
-    }
-
-    // 3. delete y, yg, act, act_g
-
-    if (only_forward) {
-      need_removed_nodes.insert({subgraph.at(y), subgraph.at(act)});
-    } else {
-      need_removed_nodes.insert({subgraph.at(y), subgraph.at(yg),
-                                 subgraph.at(act), subgraph.at(act_g)});
-    }
-    count++;
-  };
-  gpd(graph, handler);
-  GraphSafeRemoveNodes(graph, need_removed_nodes);
-  AddStatis(count);
-  return graph;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fuse_relu_depthwise_conv_pass,
-              paddle::framework::ir::FuseReluDepthwiseConvPass);
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
deleted file mode 100644
index d37c153dd2a05ecfc8f0626626bbc3ed2f85968b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Fuse the relu and depthwise conv
- */
-class FuseReluDepthwiseConvPass : public FusePassBase {
- public:
-  virtual ~FuseReluDepthwiseConvPass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-  ir::Graph* FuseReluDepthwiseConv(ir::Graph* graph, bool only_forward) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
deleted file mode 100644
index 8ba0e8b80b1c69cad8f8796974828575da343ce8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph.cc
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/var_desc.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-Graph::Graph(const ProgramDesc &program) : program_(program) {
-  auto var_nodes = InitFromProgram(program_);
-  ResolveHazard(var_nodes);
-}
-
-std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
-    const ProgramDesc &program) {
-  VLOG(3) << "block in program:" << program_.Size();
-  std::unordered_map<std::string, VarDesc *> all_vars;
-  // var nodes for each var name, will have multiple versions in SSA
-  std::map<std::string, std::vector<ir::Node *>> var_nodes;
-  for (auto *var : program.Block(0).AllVars()) {
-    all_vars.emplace(var->Name(), var);
-  }
-
-  for (auto *op : program.Block(0).AllOps()) {
-    ir::Node *node = CreateOpNode(op);
-    // For input args, reuse the same var name if it was created before.
-    // Otherwise, create a new one.
-    for (auto &each_var_name : op->InputArgumentNames()) {
-      ir::Node *var = nullptr;
-      if (var_nodes.find(each_var_name) != var_nodes.end()) {
-        var = var_nodes.at(each_var_name).back();
-      } else if (all_vars.count(each_var_name) != 0) {
-        var = CreateVarNode(all_vars.at(each_var_name));
-        var_nodes[each_var_name].push_back(var);
-      } else {
-        // Operation input var can be optional (dispensable). Which means
-        // the operation doesn't really need the var at runtime. In this
-        // case, the no-existed var is ready at the beginning.
-        var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
-        var_nodes[each_var_name].push_back(var);
-      }
-      node->inputs.push_back(var);
-      var->outputs.push_back(node);
-    }
-    // For output args, always create a new var.
-    std::unordered_set<std::string> out_arg_set;
-    for (auto &each_var_name : op->OutputArgumentNames()) {
-      if (each_var_name != kEmptyVarName) {
-        PADDLE_ENFORCE(out_arg_set.count(each_var_name) == 0,
-                       "Program is wrong. %s occurs in output of %s several "
-                       "times.",
-                       each_var_name, op->Type());
-        out_arg_set.insert(each_var_name);
-      }
-
-      ir::Node *var = nullptr;
-      if (all_vars.count(each_var_name) != 0) {
-        var = CreateVarNode(all_vars.at(each_var_name));
-      } else {
-        // Operation output vars can be @EMPTY@. For example, while_grad
-        // can have multi @EMPTY@ outputs with no VarDesc.
-        // TODO(panyx0718): Add a test.
-        var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
-      }
-      var_nodes[each_var_name].push_back(var);
-      node->outputs.push_back(var);
-      var->inputs.push_back(node);
-    }
-  }
-  Set<const std::vector<OpDesc *>>(
-      details::kStaleProgramOpDescs,
-      new std::vector<OpDesc *>(program.Block(0).AllOps()));
-  return var_nodes;
-}
-
-void Graph::ResolveHazard(
-    const std::map<std::string, std::vector<ir::Node *>> &var_nodes) {
-  /**
-   * We should handle write after read(WAR) and write after write(WAW) here.
-   * Because some of the operators of the program can be executed parallelly.
-   * So, to make the program running in the right order, we should add the
-   * dependence of WAR and WAW.
-   *
-   *
-   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
-   */
-
-  for (auto &var : var_nodes) {
-    auto &versions = var.second;
-    if (versions.size() <= 1) continue;
-
-    auto it_new = versions.rbegin();
-    auto it_old = versions.rbegin();
-    ++it_old;
-    for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
-      VLOG(3) << "deal with var: " << (*it_new)->Name();
-      ir::Node *write_op =
-          (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
-      const auto &read_ops = (*it_old)->outputs;
-
-      PADDLE_ENFORCE(
-          write_op,
-          string::Sprintf("The write_op of var %s should not be empty.",
-                          (*it_new)->Name()));
-
-      // Add write after write dependence
-      ir::Node *upstream_op =
-          (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0];
-      // TODO(zcd): Add a test.
-      if (upstream_op && upstream_op != write_op) {
-        ir::Node *dep_var = CreateControlDepVar();
-        write_op->inputs.push_back(dep_var);
-        upstream_op->outputs.push_back(dep_var);
-        VLOG(10) << "add dep_var:" << dep_var->Name();
-        dep_var->outputs.push_back(write_op);
-        dep_var->inputs.push_back(upstream_op);
-      }
-
-      for (auto *read_op : read_ops) {
-        // Manually add a dependency var from read_op to write_op;
-        if (read_op == write_op) {
-          // Read Write is the same op.
-          continue;
-        }
-        // 2 ops might have been connected via other vars.
-        bool has_dep = false;
-        for (ir::Node *r_out : read_op->outputs) {
-          for (ir::Node *w_in : write_op->inputs) {
-            if (r_out == w_in) {
-              has_dep = true;
-              break;
-            }
-          }
-        }
-        if (has_dep) continue;
-
-        ir::Node *dep_var = CreateControlDepVar();
-        VLOG(10) << "add dep_var:" << dep_var->Name();
-        read_op->outputs.push_back(dep_var);
-        dep_var->inputs.push_back(read_op);
-        write_op->inputs.push_back(dep_var);
-        dep_var->outputs.push_back(write_op);
-      }
-    }
-  }
-}
-
-std::shared_ptr<Graph> Graph::Clone() {
-  auto cloned_graph = std::make_shared<Graph>(this->program_);
-  cloned_graph->ReleaseNodes();
-  cloned_graph->num_node_created_ = 0;
-  std::unordered_map<ir::Node *, ir::Node *> origin_to_cloned;
-  for (auto *n : this->node_set_) {
-    ir::Node *cloned_node = nullptr;
-    if (n->IsCtrlVar()) {
-      cloned_node = cloned_graph->CreateControlDepVar();
-    } else if (!n->var_desc_ && !n->op_desc_) {  // empty node
-      cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType());
-    } else if (n->IsVar()) {
-      cloned_node = cloned_graph->CreateVarNode(n->Var());
-    } else if (n->IsOp()) {
-      cloned_node = cloned_graph->CreateOpNode(n->Op());
-    }
-    if (cloned_node) {
-      origin_to_cloned[n] = cloned_node;
-    } else {
-      PADDLE_THROW("The cloned node's type is not supported!");
-    }
-  }
-  for (auto *n : this->node_set_) {
-    for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) {
-      origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]);
-    }
-    for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) {
-      origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]);
-    }
-  }
-  return cloned_graph;
-}
-
-bool IsControlDepVar(const ir::Node &var) {
-  return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
deleted file mode 100644
index 23030905bbadbbbb69f24a852b3cdd09b73db089..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace framework {
-
-namespace details {
-
-// This attr is not recommended, because the graph should not dependence
-// the program once it is built.
-constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs";
-}  //  namespace details
-
-namespace ir {
-
-/*
- * The graph is a Directed Acyclic Single Static Assignment Graph.
- *
- * In more detail, the following properties must hold:
- *
- *   The graph shouldn't contain cycle. Each node is a black-box to the graph
- *   so the node itself could be a loop operator.
- *
- *   Each Variable-type node has only one input (thus single static assignment).
- *
- *   The output/input of operator is variable and the output/input of variable
- *   is operator.
- *
- * The following data harzards in Program are addressed in the Graph:
- *
- *   Write-After-Read
- *     a = op1(x)
- *     x = op2(b)
- *     A control-dependency connection is created bettwen op1 and op2 such that
- *     op1->op2, so as to ensure correct order.
- *
- *   Write-After-Write
- *     x = op1(a)
- *     x = op2(b)
- *     A control-dependency connection is created between op1 and op2 such that
- *     op1->op2, so as to ensure correct order.
- *
- * Other properties currently hold, but is not enforced yet:
- *
- *   Variable-type node (not control dep) with the same variable name share
- *   the same underlying VarDesc.
- */
-class Graph {
- public:
-  explicit Graph(const ProgramDesc &program);
-
-  virtual ~Graph() {
-    for (auto &attr : attrs_) {
-      attr_dels_[attr.first]();
-    }
-    attrs_.clear();
-    attr_dels_.clear();
-  }
-
-  bool Has(const std::string &attr_name) const {
-    return attrs_.count(attr_name) > 0;
-  }
-
-  template <typename AttrType>
-  AttrType &GetOrInit(const std::string &attr_name) {
-    if (!Has(attr_name)) {
-      Set(attr_name, new AttrType);
-    }
-    return Get<AttrType>(attr_name);
-  }
-
-  template <typename AttrType>
-  AttrType &Get(const std::string &attr_name) const {
-    PADDLE_ENFORCE_EQ(Has(attr_name), true, "%s attr not registered for graph.",
-                      attr_name);
-    try {
-      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
-    } catch (boost::bad_any_cast &) {
-      PADDLE_THROW(
-          "Invalid attribute type of %s error, expected: %s, actual: %s",
-          attr_name, typeid(AttrType *).name(),
-          attrs_.at(attr_name).type().name());
-    }
-  }
-
-  template <typename AttrType>
-  void Set(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE_EQ(attrs_.count(attr_name), 0, "%s already set in the graph",
-                      attr_name);
-    attrs_[attr_name] = attr;
-    attr_dels_[attr_name] = [attr, attr_name]() {
-      VLOG(3) << "deleting " << attr_name;
-      delete attr;
-    };
-  }
-
-  template <typename AttrType>
-  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE_EQ(attrs_.count(attr_name), 0, "%s already set in the graph",
-                      attr_name);
-    attrs_[attr_name] = attr;
-    attr_dels_[attr_name] = []() {};
-  }
-
-  void Erase(const std::string &attr_name) {
-    PADDLE_ENFORCE_NE(attrs_.count(attr_name), 0, "%s not set in the graph",
-                      attr_name);
-    attr_dels_[attr_name]();
-    attrs_.erase(attr_name);
-    attr_dels_.erase(attr_name);
-  }
-
-  const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; }
-
-  // Create a normal variable with non-null VarDesc.
-  ir::Node *CreateVarNode(VarDesc *var_desc) {
-    PADDLE_ENFORCE_NOT_NULL(var_desc);
-    auto *x = AddNode(new ir::Node(var_desc));
-    x->SetId(num_node_created_++);
-    return x;
-  }
-
-  // Create a normal runnable operator with OpDesc.
-  ir::Node *CreateOpNode(OpDesc *op_desc) {
-    PADDLE_ENFORCE_NOT_NULL(op_desc);
-    auto *x = AddNode(new ir::Node(op_desc));
-    x->SetId(num_node_created_++);
-    return x;
-  }
-
-  // Create a control dependency var that connects 2 operations. The
-  // var doesn't hold any data. Other than that, it's no different from
-  // other var, considering dependency analysis.
-  ir::Node *CreateControlDepVar() {
-    // TODO(panyx0718): control var name should be really unique.
-    const std::string name = string::Sprintf(
-        "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
-        num_node_created_);
-    auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
-    x->SetId(num_node_created_++);
-    return x;
-  }
-
-  // A more free style way of creating a graph node. Mostly use for test
-  // or "copy" from another node. Avoid using it if possible.
-  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
-    auto *x = AddNode(new ir::Node(name, type));
-    x->SetId(num_node_created_++);
-    return x;
-  }
-
-  // Clear all node information of the graph and return the ownership of the
-  // nodes.
-  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes() {
-    std::vector<std::unique_ptr<ir::Node>> ret;
-    for (auto &n : nodes_) {
-      ret.emplace_back(n.second.release());
-    }
-    nodes_.clear();
-    node_set_.clear();
-    return ret;
-  }
-
-  std::unique_ptr<ir::Node> RemoveNode(ir::Node *node) {
-    PADDLE_ENFORCE_EQ(node_set_.find(node) != node_set_.end(), true);
-    std::unique_ptr<ir::Node> ret;
-    ret.reset(nodes_.at(node).release());
-    nodes_.erase(node);
-    node_set_.erase(node);
-    return ret;
-  }
-
-  // NOTE low performance, but simple and secure.
-  Node *RetrieveNode(int id) {
-    for (auto &node : nodes_) {
-      if (node.second->id() == id) {
-        return node.second.get();
-      }
-    }
-    return nullptr;
-  }
-
-  // Returns reference to the original program.
-  // WARN: After a series of passes, the current graph can be quite
-  // different from OriginProgram. Caller shouldn't assume much from
-  // the returned OriginProgram.
-  const ProgramDesc &OriginProgram() const { return program_; }
-
-  // This method takes ownership of `node`.
-  ir::Node *AddNode(ir::Node *node) {
-    PADDLE_ENFORCE_EQ(node_set_.find(node) == node_set_.end(), true);
-    nodes_[node].reset(node);
-    node_set_.insert(node);
-    return node;
-  }
-
-  void ResolveHazard(
-      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
-
-  // Create a new and duplicated graph.
-  // WARN: The method only clones the graph structure, not its attributes.
-  std::shared_ptr<Graph> Clone();
-
- private:
-  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
-      const ProgramDesc &program);
-
-  // NOTE: program_ shouldn't be exposed to user.
-  const ProgramDesc program_;
-  std::map<std::string, boost::any> attrs_;
-  std::map<std::string, std::function<void(void)>> attr_dels_;
-  std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
-  std::unordered_set<ir::Node *> node_set_;
-  size_t num_node_created_{0};  // help to generate a unique node id.
-};
-
-bool IsControlDepVar(const ir::Node &var);
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
deleted file mode 100644
index b397216f0b4d15b0e71a3c3c7814439d75d59aee..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ /dev/null
@@ -1,395 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include <algorithm>
-#include <deque>
-#include <fstream>
-#include <iosfwd>
-#include <ostream>
-#include <stack>
-#include <unordered_map>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/graph_traits.h"
-
-DEFINE_string(print_sub_graph_dir, "",
-              "FLAGS_print_sub_graph_dir is used "
-              "to print the nodes of sub_graphs.");
-
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace {
-void SortHelper(const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>,
-                               ir::NodeComp> &adj_list,
-                ir::Node *node, std::unordered_set<ir::Node *> *visited,
-                std::vector<ir::Node *> *ret) {
-  visited->insert(node);
-
-  for (auto adj : adj_list.at(node)) {
-    if (visited->find(adj) == visited->end()) {
-      SortHelper(adj_list, adj, visited, ret);
-    }
-  }
-
-  VLOG(5) << "topology sort insert: " << node->Name() << " "
-          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
-  ret->push_back(node);
-}
-
-bool HasCircleHelper(
-    ir::Node *node,
-    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
-        &adj_list,
-    std::unordered_set<ir::Node *> *visited,
-    std::unordered_set<ir::Node *> *in_trace,
-    std::vector<std::vector<ir::Node *>> *circles) {
-  if (visited->find(node) == visited->end()) {
-    visited->insert(node);
-    in_trace->insert(node);
-
-    for (ir::Node *in : adj_list.at(node)) {
-      if (visited->find(in) == visited->end() &&
-          HasCircleHelper(in, adj_list, visited, in_trace, circles)) {
-        return true;
-      } else if (in_trace->find(in) != in_trace->end()) {
-        if (circles != nullptr) {
-          std::vector<ir::Node *> circle;
-          circle.emplace_back(in);
-          ir::Node *p = in;
-          for (auto &adj : adj_list.at(p)) {
-            if (in_trace->count(adj)) {
-              circle.emplace_back(adj);
-              p = adj;
-            }
-          }
-          circles->emplace_back(circle);
-        }
-        return true;
-      }
-    }
-  }
-  in_trace->erase(node);
-  return false;
-}
-
-bool HasCircleInternal(
-    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
-        &adj_list,
-    std::vector<std::vector<ir::Node *>> *circles) {
-  std::unordered_set<ir::Node *> visited;
-  std::unordered_set<ir::Node *> in_trace;
-  for (auto &adj : adj_list) {
-    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace, circles)) {
-      return true;
-    }
-  }
-  return false;
-}
-}  // namespace
-
-bool HasCircle(const Graph &graph) {
-  return HasCircleInternal(BuildOperationAdjList(graph), nullptr);
-}
-
-bool VarDescIsConsistency(const Graph &graph) {
-  std::unordered_map<std::string, std::unordered_set<ir::Node *>>
-      var_name2node_set;
-  for (ir::Node *node : graph.Nodes()) {
-    if (node->IsVar() && node->Var()) {
-      var_name2node_set[node->Var()->Name()].emplace(node);
-    }
-  }
-  for (auto &iter : var_name2node_set) {
-    auto &first_node = *iter.second.begin();
-    bool is_persistable = std::any_of(iter.second.begin(), iter.second.end(),
-                                      [&first_node](const ir::Node *node) {
-                                        return node->Var()->Persistable();
-                                      });
-    if (is_persistable) {
-      bool is_consistency =
-          std::all_of(iter.second.begin(), iter.second.end(),
-                      [&first_node](const ir::Node *node) {
-                        return *node->Var() == *first_node->Var();
-                      });
-      if (!is_consistency) return false;
-    }
-  }
-  return true;
-}
-bool FindCircleSubGraph(const Graph &graph,
-                        std::vector<std::vector<ir::Node *>> *circles) {
-  return HasCircleInternal(BuildOperationAdjList(graph), circles);
-}
-
-std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
-  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
-      adj_list = BuildOperationAdjList(graph);
-  PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr));
-  std::unordered_set<ir::Node *> visited;
-  std::vector<ir::Node *> ret;
-  for (auto adj : adj_list) {
-    if (visited.find(adj.first) == visited.end()) {
-      SortHelper(adj_list, adj.first, &visited, &ret);
-    }
-  }
-
-  return ret;
-}
-
-// Build operator inlink edge table.
-std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
-BuildOperationAdjList(const Graph &graph) {
-  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
-      adj_list;
-
-  for (auto &n : graph.Nodes()) {
-    if (!n->IsOp()) continue;
-    if (adj_list.find(n) == adj_list.end()) {
-      adj_list[n] = std::set<ir::Node *, ir::NodeComp>();
-    }
-    for (auto &var : n->inputs) {
-      for (auto &adj_n : var->inputs) {
-        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
-                << " -> " << n->Name() << reinterpret_cast<void *>(n)
-                << "  via " << var->Name() << reinterpret_cast<void *>(var);
-        adj_list[n].insert(adj_n);
-      }
-    }
-  }
-  return adj_list;
-}
-
-// Build operator outlink edge table.
-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationOutAdjList(
-    const Graph &graph) {
-  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
-
-  for (auto &n : graph.Nodes()) {
-    if (!n->IsOp()) continue;
-    if (adj_list.find(n) == adj_list.end()) {
-      adj_list[n] = std::unordered_set<ir::Node *>();
-    }
-    for (auto &var : n->outputs) {
-      for (auto &adj_n : var->outputs) {
-        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
-                 << " -> " << n->Name() << reinterpret_cast<void *>(n)
-                 << "  via " << var->Name() << reinterpret_cast<void *>(var);
-        adj_list[n].insert(adj_n);
-      }
-    }
-  }
-  return adj_list;
-}
-
-std::vector<ir::Node *> OpDFSSort(const Graph &graph) {
-  auto edge_table = BuildOperationOutAdjList(graph);
-  std::stack<Node *> stack;
-  for (auto &ele : edge_table) {
-    if (ele.first->inputs.empty()) {
-      // find the input ops (those without input vars)
-      stack.push(ele.first);
-    } else {
-      // find the ops with only persistable vars as inputs.
-      bool all_persistable = true;
-      for (auto *input : ele.first->inputs) {
-        if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) {
-          all_persistable = false;
-        }
-      }
-      if (all_persistable) {
-        stack.push(ele.first);
-      }
-    }
-  }
-
-  std::vector<Node *> res;
-  // start from the feed op and DFS
-  std::unordered_set<Node *> unique_set;
-  while (!stack.empty()) {
-    // will start from the last feed by default.
-    auto cur = stack.top();
-    stack.pop();
-    unique_set.insert(cur);
-    res.push_back(cur);
-
-    for (auto *op : edge_table[cur]) {
-      if (!unique_set.count(op)) {
-        stack.push(op);
-      }
-    }
-  }
-  return res;
-}
-
-std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph) {
-  std::vector<ir::Node *> nodes;
-  std::unordered_map<Node *, int> in_degree;
-
-  auto set_out_ops_ready = [&](Node *var) {
-    for (auto *op : var->outputs) {
-      --in_degree[op];
-    }
-  };
-  // build in_degree
-  for (auto *node : graph.Nodes()) {
-    if (node->IsOp()) {
-      in_degree[node] += node->inputs.size();
-    } else if (node->IsVar() && node->inputs.empty()) {
-      // put all the inputs of the whole graph ready.
-      set_out_ops_ready(node);
-    }
-  }
-
-  std::deque<Node *> op_queue;
-  // first visit
-  for (auto &node : OpDFSSort(graph)) {
-    if (node->IsOp()) {
-      op_queue.push_back(node);
-    }
-  }
-
-  // traverse the graph
-  int num_ops = op_queue.size();
-  while (num_ops) {
-    for (auto it = op_queue.begin(); it != op_queue.end(); it++) {
-      auto *&cur_op = *it;
-      if (!cur_op || in_degree[cur_op] > 0) continue;
-      // visit this node
-      // put all the output var of this op valid.
-      for (auto *out_var : cur_op->outputs) {
-        if (!out_var) continue;
-        set_out_ops_ready(out_var);
-      }
-      VLOG(8) << "visit " << cur_op->Name();
-      nodes.push_back(cur_op);
-
-      cur_op = nullptr;
-      num_ops--;
-    }
-  }
-
-  return nodes;
-}
-
-size_t GraphNum(const Graph &graph) {
-  std::unordered_set<ir::Node *> nodes(graph.Nodes());
-  std::unordered_set<ir::Node *> visited_nodes;
-  visited_nodes.reserve(nodes.size());
-  std::deque<ir::Node *> q_nodes;
-  std::vector<std::unordered_set<ir::Node *>> graph_nodes;
-  std::unordered_set<ir::Node *> g_nodes;
-  // q_set used to record records in the queue.
-  std::unordered_set<ir::Node *> q_set;
-  size_t graph_count = 0;
-
-  auto traverse_nodes = [&visited_nodes, &q_nodes,
-                         &q_set](const std::vector<ir::Node *> &nodes) {
-    for (auto n : nodes) {
-      if (visited_nodes.count(n) == 0 && q_set.count(n) == 0) {
-        q_nodes.push_back(n);
-        q_set.insert(n);
-      }
-    }
-  };
-
-  while (visited_nodes.size() != nodes.size()) {
-    if (!q_nodes.empty()) {
-      auto cur_node = q_nodes.front();
-      q_nodes.pop_front();
-      q_set.erase(cur_node);
-      visited_nodes.insert(cur_node);
-      g_nodes.insert(cur_node);
-      traverse_nodes(cur_node->inputs);
-      traverse_nodes(cur_node->outputs);
-    } else {
-      ++graph_count;
-      if (g_nodes.size()) {
-        graph_nodes.emplace_back(g_nodes);
-      }
-      g_nodes.clear();
-      for (auto &n : nodes) {
-        if (visited_nodes.count(n) == 0) {
-          q_nodes.push_back(n);
-          q_set.insert(n);
-          break;
-        }
-      }
-    }
-  }
-
-  if (g_nodes.size()) {
-    graph_nodes.emplace_back(g_nodes);
-  }
-
-  if (FLAGS_print_sub_graph_dir.size()) {
-    if (graph_nodes.size() > 1) {
-      std::stringstream out;
-      for (auto &g_n : graph_nodes) {
-        out << "graph_nodes: " << g_n.size() << "\n";
-      }
-      out << "\n\n";
-      for (auto &g_n : graph_nodes) {
-        out << "graph_nodes: " << g_n.size();
-        for (auto &node : g_n) {
-          out << "\nNode: " << node->Name() << " in [";
-          for (auto &n : node->inputs) {
-            out << n->Name() << ", ";
-          }
-          out << "], out[";
-          for (auto &n : node->outputs) {
-            out << n->Name() << ", ";
-          }
-          out << "]";
-        }
-        out << "\n\n\n";
-      }
-      std::unique_ptr<std::ostream> fout(
-          new std::ofstream(FLAGS_print_sub_graph_dir));
-      PADDLE_ENFORCE(fout->good());
-      *fout << out.str();
-    }
-  }
-
-  return graph_count;
-}
-
-void CleanIndividualNodes(Graph *graph) {
-  std::unordered_set<Node *> nodes2rm;
-  for (auto *node : graph->Nodes()) {
-    if (node->inputs.empty() && node->outputs.empty()) {
-      nodes2rm.insert(node);
-    }
-  }
-
-  for (auto *node : nodes2rm) {
-    graph->RemoveNode(node);
-  }
-}
-
-std::vector<Node *> TopologyVarientSort(const Graph &graph,
-                                        SortKind sort_kind) {
-  switch (sort_kind) {
-    case SortKind::TS:
-      return framework::ir::TopologySortOperations(graph);
-    default:
-      return framework::ir::TopologyDfsSortOperations(graph);
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
deleted file mode 100644
index 074ad320fb122bd49fe144f803b47ec9768b3504..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/node.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// Compare nodes via node id.
-struct NodeComp {
-  bool operator()(ir::Node *const &node1, ir::Node *const &node2) const {
-    return node1->id() < node2->id();
-  }
-};
-
-// Test if the graph contains circle.
-bool HasCircle(const Graph &graph);
-
-// Check if the var desc of node is consistency.
-// The graph may have the same name node, for example, parameter
-// is the input of operator and it also is the output of optimizer.
-// For the persistable variable, the var_desc of the nodes with
-// the same node name should be equal.
-bool VarDescIsConsistency(const Graph &graph);
-
-// Find All Circles for debugging,
-// store all subgraph in circles.
-bool FindCircleSubGraph(const Graph &graph,
-                        std::vector<std::vector<ir::Node *>> *circles);
-
-size_t GraphNum(const Graph &graph);
-
-// Topology Sort the operations in the graph from inputs to outputs.
-// `graph` cannot contain circle.
-std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
-
-// Topological sort, but try to DFS.
-std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph);
-
-// Different kinds to sort the operators in a graph to a sequence.
-enum class SortKind {
-  // Topological Search
-  TS = 0,
-  // Topological and Depth First Search
-  TDFS
-};
-
-// Several kinds of topological sort.
-std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
-
-// Clean the nodes that doesn't connect to others.
-void CleanIndividualNodes(Graph *graph);
-
-// Build an adjacency list of operations for the `graph`.
-std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
-BuildOperationAdjList(const Graph &graph);
-
-template <typename T>
-std::vector<T *> FilterByNodeWrapper(const Graph &graph) {
-  std::vector<T *> ret;
-  for (ir::Node *n : graph.Nodes()) {
-    if (n->IsWrappedBy<T>()) ret.push_back(&n->Wrapper<T>());
-  }
-  return ret;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
deleted file mode 100644
index d8973d5aeda1a2e0650a506b4c916b4346f01e2d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include <string>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void BuildCircleGraph(Graph* g) {
-  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
-  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
-
-  o1->outputs.push_back(v1);
-  o1->inputs.push_back(v1);
-  v1->inputs.push_back(o1);
-  v1->outputs.push_back(o1);
-}
-
-void BuildCircleGraph2(Graph* g) {
-  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
-  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
-  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
-  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
-
-  o1->outputs.push_back(v1);
-  o2->inputs.push_back(v1);
-  v1->inputs.push_back(o1);
-  v1->outputs.push_back(o2);
-
-  o2->outputs.push_back(v2);
-  o1->inputs.push_back(v2);
-  v2->inputs.push_back(o2);
-  v2->outputs.push_back(o1);
-}
-
-void BuildNoCircleGraph(Graph* g) {
-  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
-  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
-  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
-  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
-  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
-  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
-  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
-  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
-  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
-
-  // o1->v1->o2
-  o1->outputs.push_back(v1);
-  o2->inputs.push_back(v1);
-  v1->inputs.push_back(o1);
-  v1->outputs.push_back(o2);
-  // o2->v2->o3
-  // o2->v2->o4
-  o2->outputs.push_back(v2);
-  o3->inputs.push_back(v2);
-  o4->inputs.push_back(v2);
-  v2->inputs.push_back(o2);
-  v2->outputs.push_back(o3);
-  v2->outputs.push_back(o4);
-  // o2->v3->o5
-  o2->outputs.push_back(v3);
-  o5->inputs.push_back(v3);
-  v3->inputs.push_back(o2);
-  v3->outputs.push_back(o5);
-  // o3-v4->o5
-  o3->outputs.push_back(v4);
-  o5->inputs.push_back(v4);
-  v4->inputs.push_back(o3);
-  v4->outputs.push_back(o5);
-}
-
-TEST(GraphHelperTest, Basic) {
-  ProgramDesc prog;
-
-  Graph g(prog);
-  BuildCircleGraph(&g);
-  ASSERT_TRUE(HasCircle(g));
-
-  Graph g2(prog);
-  BuildCircleGraph2(&g2);
-  ASSERT_TRUE(HasCircle(g2));
-
-  auto adj_list = BuildOperationAdjList(g2);
-  for (auto& adj : adj_list) {
-    auto& adj_set = adj.second;
-    if (adj.first->Name() == "op1") {
-      ASSERT_EQ((*adj_set.begin())->Name(), "op2");
-    } else if (adj.first->Name() == "op2") {
-      ASSERT_EQ((*adj_set.begin())->Name(), "op1");
-    } else {
-      ASSERT_TRUE(false);
-    }
-  }
-
-  Graph g3(prog);
-  BuildNoCircleGraph(&g3);
-  ASSERT_FALSE(HasCircle(g3));
-  auto sorted = TopologySortOperations(g3);
-  std::map<std::string, size_t> node_map;
-  for (size_t i = 0; i < sorted.size(); ++i) {
-    node_map[sorted[i]->Name()] = i;
-  }
-  ASSERT_EQ(node_map.at("op1"), 0UL);
-  ASSERT_EQ(node_map.at("op2"), 1UL);
-  ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
-}
-
-void BuildZeroGraph(Graph* g) {}
-
-void BuildOneGraph(Graph* g) {
-  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
-  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
-  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
-  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
-  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
-  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
-  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
-  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
-  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
-
-  // o1->v1->o2
-  o1->outputs.push_back(v1);
-  o2->inputs.push_back(v1);
-  v1->inputs.push_back(o1);
-  v1->outputs.push_back(o2);
-  // o2->v2->o3
-  // o2->v2->o4
-  o2->outputs.push_back(v2);
-  o3->inputs.push_back(v2);
-  o4->inputs.push_back(v2);
-  v2->inputs.push_back(o2);
-  v2->outputs.push_back(o3);
-  v2->outputs.push_back(o4);
-  // o2->v3->o5
-  o2->outputs.push_back(v3);
-  o5->inputs.push_back(v3);
-  v3->inputs.push_back(o2);
-  v3->outputs.push_back(o5);
-  // o3-v4->o5
-  o3->outputs.push_back(v4);
-  o5->inputs.push_back(v4);
-  v4->inputs.push_back(o3);
-  v4->outputs.push_back(o5);
-}
-
-void BuildTwoGraphs(Graph* g) {
-  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
-  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
-  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
-  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
-  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
-  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
-  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
-  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
-  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
-
-  // o1->v1->o2
-  o1->outputs.push_back(v1);
-  o2->inputs.push_back(v1);
-  v1->inputs.push_back(o1);
-  v1->outputs.push_back(o2);
-  // o2->v2->o3
-  // o2->v2->o4
-  o2->outputs.push_back(v2);
-  o3->inputs.push_back(v2);
-  o4->inputs.push_back(v2);
-  v2->inputs.push_back(o2);
-  v2->outputs.push_back(o3);
-  v2->outputs.push_back(o4);
-  // o2->v3->o5
-  //  o2->outputs.push_back(v3);
-  o5->inputs.push_back(v3);
-  //  v3->inputs.push_back(o2);
-  v3->outputs.push_back(o5);
-  // o3-v4->o5
-  o3->outputs.push_back(v4);
-  //  o5->inputs.push_back(v4);
-  v4->inputs.push_back(o3);
-  //  v4->outputs.push_back(o5);
-}
-
-TEST(GraphHelperTest, Circles) {
-  ProgramDesc prog;
-
-  Graph g(prog);
-  BuildCircleGraph(&g);
-
-  std::vector<std::vector<ir::Node*>> circles;
-  ASSERT_TRUE(FindCircleSubGraph(g, &circles));
-  ASSERT_EQ(circles.size(), 1UL);
-}
-
-TEST(GraphHelperTest, GraphNum) {
-  ProgramDesc prog;
-
-  Graph g(prog);
-  BuildZeroGraph(&g);
-  ASSERT_EQ(GraphNum(g), 0UL);
-
-  Graph g2(prog);
-  BuildOneGraph(&g2);
-  ASSERT_EQ(GraphNum(g2), 1UL);
-
-  Graph g3(prog);
-  BuildTwoGraphs(&g3);
-  ASSERT_EQ(GraphNum(g3), 2UL);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
deleted file mode 100644
index bbb2ee2f56a85fd6e30c02f9b0416adcc130b954..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ /dev/null
@@ -1,1974 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <array>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/graph_traits.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/string/pretty_log.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-using string::PrettyLogEndl;
-using string::PrettyLog;
-using string::Style;
-
-size_t PDPattern::id_ = 0UL;
-
-PDNode *PDPattern::NewNode(const std::string &name) {
-  if (!name.empty()) {
-    PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
-                      "PDNode's name should be unique, get duplicate [%s]",
-                      name);
-  }
-
-  nodes_.emplace_back(new PDNode(this, name));
-  auto *cur = nodes_.back().get();
-  node_map_[name] = cur;
-  return cur;
-}
-
-PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) {
-  if (!name.empty()) {
-    PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
-                      "PDNode's name should be unique, get duplicate [%s]",
-                      name);
-  }
-
-  nodes_.emplace_back(new PDNode(std::move(teller), this, name));
-  auto *cur = nodes_.back().get();
-  node_map_[name] = cur;
-  return cur;
-}
-
-PDNode *PDPattern::RetrieveNode(const std::string &id) const {
-  auto it = node_map_.find(id);
-  if (it == node_map_.end()) {
-    return nullptr;
-  }
-
-  return it->second;
-}
-
-void PDPattern::AddEdge(PDNode *a, PDNode *b) {
-  PADDLE_ENFORCE(a);
-  PADDLE_ENFORCE(b);
-  PADDLE_ENFORCE(a != b, "can't connect to the same nodes.");
-  edges_.emplace_back(a, b);
-}
-
-void GraphPatternDetector::operator()(Graph *graph,
-                                      GraphPatternDetector::handle_t handler) {
-  if (!MarkPDNodesInGraph(*graph)) {
-    return;
-  }
-
-  auto subgraphs = DetectPatterns();
-  UniquePatterns(&subgraphs);
-  RemoveOverlappedMatch(&subgraphs);
-  ValidateByNodeRole(&subgraphs);
-
-  if (subgraphs.empty()) return;
-  PrettyLogEndl(Style::detail(), "---  detected %d subgraphs",
-                subgraphs.size());
-  int id = 0;
-  for (auto &g : subgraphs) {
-    VLOG(3) << "optimizing #" << id++ << " subgraph";
-    handler(g, graph);
-  }
-}
-
-bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
-  VLOG(3) << "mark pdnodes in graph";
-  if (graph.Nodes().empty()) return false;
-
-  for (auto &node : GraphTraits::DFS(graph)) {
-    for (const auto &pdnode : pattern_.nodes()) {
-      if (pdnode->Tell(&node)) {
-        VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name();
-        pdnodes2nodes_[pdnode.get()].insert(&node);
-      }
-    }
-  }
-  // Check to early stop if some PDNode can't find matched Node.
-  for (auto &pdnode : pattern_.nodes()) {
-    if (!pdnodes2nodes_.count(pdnode.get())) {
-      VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
-      // return false;
-    }
-  }
-  VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
-
-  return !pdnodes2nodes_.empty();
-}
-
-// The intermediate Nodes can only link to the nodes inside the pattern, or this
-// subgraph will be droped.
-void GraphPatternDetector::ValidateByNodeRole(
-    std::vector<GraphPatternDetector::subgraph_t> *subgraphs) {
-  std::vector<GraphPatternDetector::subgraph_t> result;
-
-  subgraphs->erase(
-      std::remove_if(
-          subgraphs->begin(), subgraphs->end(),
-          [](const GraphPatternDetector::subgraph_t &subgraph) -> bool {
-            // Collect the inputs and outputs.
-            std::unordered_set<Node *> ios;
-            for (auto &item : subgraph) {
-              if (!item.first->IsIntermediate()) {
-                ios.insert(item.second);
-              }
-            }
-            for (auto &item : subgraph) {
-              if (item.first->IsIntermediate()) {
-                for (auto *x : item.second->inputs) {
-                  if (!ios.count(x)) {
-                    return true;
-                  }
-                }
-                for (auto *x : item.second->outputs) {
-                  if (!ios.count(x)) {
-                    return true;
-                  }
-                }
-              }
-            }
-            return false;
-          }),
-      subgraphs->end());
-}
-
-struct HitGroup {
-  std::unordered_map<PDNode *, Node *> roles;
-
-  bool Match(Node *node, PDNode *pat) {
-    if (nodes_.count(node)) {
-      if (roles.count(pat) && roles[pat] == node) return true;
-      return false;
-    } else {
-      if (roles.count(pat) && roles[pat] != node) return false;
-      return true;
-    }
-  }
-
-  void Register(Node *node, PDNode *pat) {
-    roles[pat] = node;
-    nodes_.insert(node);
-  }
-
- private:
-  std::unordered_set<Node *> nodes_;
-};
-
-// Tell whether Node a links to b.
-bool IsNodesLink(Node *a, Node *b) {
-  for (auto *node : a->outputs) {
-    if (b == node) {
-      return true;
-    }
-  }
-  return false;
-}
-
-std::vector<GraphPatternDetector::subgraph_t>
-GraphPatternDetector::DetectPatterns() {
-  // Init empty subgraphs.
-  std::vector<GraphPatternDetector::subgraph_t> result;
-  std::vector<HitGroup> init_groups;
-  std::array<std::vector<HitGroup>, 2> bi_records;
-  auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
-                                               : pattern_.edges().front().first;
-  if (!pdnodes2nodes_.count(first_pnode)) return result;
-  for (auto *node : pdnodes2nodes_[first_pnode]) {
-    HitGroup group;
-    group.roles[first_pnode] = node;
-    init_groups.emplace_back(group);
-  }
-
-  int step = 0;
-  bi_records[0] = std::move(init_groups);
-
-  // Extend a PDNode to subgraphs by deducing the connection relations defined
-  // in edges of PDNodes.
-  for (const auto &edge : pattern_.edges()) {
-    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
-    // TODO(Superjomn) Fix bug here, the groups might be duplicate here.
-    // Each role has two PDNodes, which indicates two roles.
-    // Detect two Nodes that can match these two roles and they are connected.
-    auto &pre_groups = bi_records[step % 2];
-    auto &cur_groups = bi_records[1 - (step++ % 2)];
-    cur_groups.clear();
-    if (pre_groups.empty()) break;
-    // source -> target
-    for (Node *source : pdnodes2nodes_[edge.first]) {
-      for (Node *target : pdnodes2nodes_[edge.second]) {
-        VLOG(8) << "check " << source->id() << " -- " << target->id();
-        // TODO(Superjomn) add some prune strategies.
-        for (const auto &group : pre_groups) {
-          if (IsNodesLink(source, target)) {
-            HitGroup new_group = group;
-            bool flag = new_group.Match(source, edge.first) &&
-                        new_group.Match(target, edge.second);
-            if (flag) {
-              new_group.Register(source, edge.first);
-              new_group.Register(target, edge.second);
-              cur_groups.push_back(new_group);
-              // TODO(Superjomn) need to unique
-            }
-          }
-        }
-      }
-    }
-    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
-    for (auto &group : cur_groups) {
-      for (auto &item : group.roles) {
-        VLOG(4) << "node " << item.second->id() << " as " << item.first->name();
-      }
-      VLOG(4) << "=========================================================";
-    }
-  }
-
-  for (auto &group : bi_records[step % 2]) {
-    GraphPatternDetector::subgraph_t subgraph;
-    for (auto &role : group.roles) {
-      subgraph.emplace(role.first, role.second);
-    }
-    result.emplace_back(subgraph);
-  }
-  return result;
-}
-
-struct GraphItemLessThan {
-  bool operator()(const std::pair<PDNode *, Node *> &a,
-                  const std::pair<PDNode *, Node *> &b) {
-    if (a.first != b.first) {
-      return a.first < b.first;
-    } else {
-      return a.second < b.second;
-    }
-  }
-};
-
-// TODO(Superjomn) enhance the function as it marks unique unique as duplicates
-// see https://github.com/PaddlePaddle/Paddle/issues/13550
-void GraphPatternDetector::UniquePatterns(
-    std::vector<GraphPatternDetector::subgraph_t> *subgraphs) {
-  if (subgraphs->empty()) return;
-  std::vector<GraphPatternDetector::subgraph_t> result;
-
-  std::unordered_set<size_t> set;
-  std::hash<std::string> hasher;
-  for (auto &g : *subgraphs) {
-    // Sort the items in the sub-graph, and transform to a string key.
-    std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
-    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
-    std::stringstream ss;
-    for (auto &item : sorted_keys) {
-      ss << item.first << ":" << item.second;
-    }
-    auto key = hasher(ss.str());
-    if (!set.count(key)) {
-      result.emplace_back(g);
-      set.insert(key);
-    }
-  }
-  *subgraphs = result;
-}
-
-void GraphPatternDetector::RemoveOverlappedMatch(
-    std::vector<subgraph_t> *subgraphs) {
-  std::vector<subgraph_t> result;
-  std::unordered_set<Node *> node_set;
-
-  for (const auto &subgraph : *subgraphs) {
-    bool valid = true;
-    for (auto &item : subgraph) {
-      if (item.first->IsIntermediate() && node_set.count(item.second)) {
-        valid = false;
-        break;
-      }
-    }
-    if (valid) {
-      for (auto &item : subgraph) {
-        node_set.insert(item.second);
-      }
-      result.push_back(subgraph);
-    }
-  }
-  *subgraphs = result;
-}
-
-std::string PDPattern::DotString() const {
-  using inference::analysis::Dot;
-  Dot dot;
-  int id = 0;
-  // Create Nodes
-  std::unordered_map<PDNode *, std::string> node2dot;
-  for (const auto &node : nodes()) {
-    std::string node_id = "Node" + std::to_string(id++);
-    dot.AddNode(node_id, {}, node->name());
-    node2dot[node.get()] = node_id;
-  }
-  // Create Edges
-  for (const auto &edge : edges()) {
-    if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) {
-      LOG(ERROR) << "no node " << edge.first << " " << edge.second;
-      continue;
-    }
-    auto &src = node2dot.at(edge.first);
-    auto &trg = node2dot.at(edge.second);
-    dot.AddEdge(src, trg, {});
-  }
-  return dot.Build();
-}
-
-PDNode &PDNode::LinksTo(const std::vector<PDNode *> &others) {
-  // extend outlinks.
-  for (PDNode *x : others) {
-    pattern_->AddEdge(this, x);
-  }
-  return *this;
-}
-
-PDNode &PDNode::LinksFrom(const std::vector<PDNode *> &others) {
-  // extend outlinks.
-  for (PDNode *x : others) {
-    pattern_->AddEdge(x, this);
-  }
-  return *this;
-}
-
-PDNode *PDNode::assert_is_op() {
-  asserts_.emplace_back([](Node *x) { return x && x->IsOp(); });
-  return this;
-}
-
-PDNode *PDNode::assert_is_op(const std::string &op_type) {
-  asserts_.emplace_back([op_type](Node *x) {
-    return x && x->IsOp() && x->Op()->Type() == op_type;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_var() {
-  asserts_.emplace_back([](Node *x) { return x && x->IsVar(); });
-  return this;
-}
-
-PDNode *PDNode::assert_is_not_ctrl_var() {
-  asserts_.emplace_back([](Node *x) { return x && !x->IsCtrlVar(); });
-  return this;
-}
-
-PDNode *PDNode::assert_var_not_persistable() {
-  assert_is_var();
-  asserts_.emplace_back([](Node *x) { return !x->Var()->Persistable(); });
-  return this;
-}
-
-PDNode *PDNode::assert_is_persistable_var() {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) { return x->Var()->Persistable(); });
-  return this;
-}
-
-PDNode *PDNode::assert_is_op_nth_input(const std::string &op_type,
-                                       const std::string &argument, int nth) {
-  assert_is_var();
-  assert_is_op_input(op_type);
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->outputs) {
-      if (op->IsOp() && op->Op()->Type() == op_type &&
-          IsNthInput(x, op, argument, nth))
-        return true;
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_op_nth_output(const std::string &op_type,
-                                        const std::string &argument, int nth) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->inputs) {
-      if (op->IsOp() && op->Op()->Type() == op_type &&
-          IsNthOutput(x, op, argument, nth))
-        return true;
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_only_input_of_op(const std::string &op_type) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->outputs) {
-      if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type &&
-          op->inputs.size() == 1) {
-        return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_only_output_of_op(const std::string &op_type) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->inputs) {
-      if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type &&
-          op->outputs.size() == 1) {
-        return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_op_output(const std::string &op_type) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->inputs) {
-      if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) {
-        return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_op_output(const std::string &op_type,
-                                    const std::string &argument) {
-  assert_is_var();
-  assert_is_op_nth_output(op_type, argument, 0);
-  return this;
-}
-PDNode *PDNode::assert_is_op_input(const std::string &op_type) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->outputs) {
-      if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) {
-        return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_op_input(const std::string &op_type,
-                                   const std::string &argument) {
-  assert_is_var();
-  assert_is_op_nth_input(op_type, argument, 0);
-  return this;
-}
-
-PDNode *PDNode::assert_op_has_n_inputs(const std::string &op_type, size_t n) {
-  assert_is_op(op_type);
-  asserts_.emplace_back([=](Node *x) { return x->inputs.size() == n; });
-  return this;
-}
-
-PDNode *PDNode::assert_op_has_n_outputs(const std::string &op_type, size_t n) {
-  assert_is_op(op_type);
-  asserts_.emplace_back([=](Node *x) { return x->outputs.size() == n; });
-  return this;
-}
-
-PDNode *PDNode::assert_has_n_inputs(size_t n) {
-  asserts_.emplace_back([=](Node *x) { return x->inputs.size() == n; });
-  return this;
-}
-
-PDNode *PDNode::assert_has_n_outputs(size_t n) {
-  asserts_.emplace_back([=](Node *x) { return x->outputs.size() == n; });
-  return this;
-}
-
-PDNode *PDNode::assert_more(PDNode::teller_t &&teller) {
-  asserts_.emplace_back(std::move(teller));
-  return this;
-}
-
-PDNode *PDNode::assert_is_ops(const std::unordered_set<std::string> &op_types) {
-  asserts_.emplace_back([op_types](Node *x) {
-    return x && x->IsOp() && op_types.count(x->Op()->Type());
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_ops_nth_input(
-    const std::unordered_set<std::string> &op_types,
-    const std::string &argument, int nth) {
-  assert_is_var();
-  assert_is_ops_input(op_types);
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->outputs) {
-      if (op->IsOp() && op_types.count(op->Op()->Type()) &&
-          IsNthInput(x, op, argument, nth))
-        return true;
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_ops_nth_output(
-    const std::unordered_set<std::string> &op_types,
-    const std::string &argument, int nth) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->inputs) {
-      if (op->IsOp() && op_types.count(op->Op()->Type()) &&
-          IsNthOutput(x, op, argument, nth))
-        return true;
-    }
-    return false;
-  });
-  return this;
-}
-PDNode *PDNode::assert_is_ops_output(
-    const std::unordered_set<std::string> &op_types) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->inputs) {
-      if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type())) {
-        return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_ops_output(
-    const std::unordered_set<std::string> &op_types,
-    const std::string &argument) {
-  assert_is_var();
-  assert_is_ops_nth_output(op_types, argument, 0);
-  return this;
-}
-
-PDNode *PDNode::assert_is_ops_input(
-    const std::unordered_set<std::string> &op_types) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->outputs) {
-      if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type())) {
-        return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_ops_input(
-    const std::unordered_set<std::string> &op_types,
-    const std::string &argument) {
-  assert_is_var();
-  assert_is_ops_nth_input(op_types, argument, 0);
-  return this;
-}
-
-bool VarLinksToOp(Node *node, const std::string &op_type) {
-  for (auto *out : node->outputs) {
-    if (out->IsOp() && out->Op()->Type() == op_type) {
-      return true;
-    }
-  }
-  return false;
-}
-
-bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) {
-  PADDLE_ENFORCE(var->IsVar());
-  PADDLE_ENFORCE(op->IsOp());
-  if (!HasInput(op, argument) || op->Op()->Input(argument).size() <= nth)
-    return false;
-  return var->Name() == op->Op()->Input(argument)[nth];
-}
-
-bool HasInput(Node *op, const std::string &argument) {
-  PADDLE_ENFORCE(op->IsOp());
-  auto const &names = op->Op()->InputNames();
-  if (std::find(names.begin(), names.end(), argument) == names.end())
-    return false;
-  return true;
-}
-
-bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) {
-  PADDLE_ENFORCE(var->IsVar());
-  PADDLE_ENFORCE(op->IsOp());
-  if (op->Op()->Output(argument).size() <= nth) return false;
-  return var->Name() == op->Op()->Output(argument)[nth];
-}
-
-void GraphSafeRemoveNodes(Graph *graph,
-                          const std::unordered_set<const Node *> &nodes) {
-  for (auto *node : nodes) {
-    graph->RemoveNode(const_cast<Node *>(node));
-  }
-
-  for (auto *node : graph->Nodes()) {
-    for (auto it = node->inputs.begin(); it != node->inputs.end();) {
-      if (nodes.count(*it)) {
-        it = const_cast<Node *>(node)->inputs.erase(it);
-      } else {
-        it++;
-      }
-    }
-    for (auto it = node->outputs.begin(); it != node->outputs.end();) {
-      if (nodes.count(*it)) {
-        it = const_cast<Node *>(node)->outputs.erase(it);
-      } else {
-        it++;
-      }
-    }
-  }
-}
-
-bool VarLinksFromOp(Node *node, const std::string &op_type) {
-  for (auto *out : node->inputs) {
-    if (out->IsOp() && out->Op()->Type() == op_type) {
-      return true;
-    }
-  }
-  return false;
-}
-
-PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input,
-                                     bool with_eltwise_add) {
-  // Create Operators
-  conv_input->assert_is_op_input("conv2d", "Input");
-  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
-
-  PDNode *eltwise_op = nullptr;
-  if (with_eltwise_add) {
-    eltwise_op =
-        pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
-  }
-  auto *batch_norm_op =
-      pattern->NewNode(batch_norm_repr())->assert_is_op("batch_norm");
-  // Create variables
-  // Conv Filter
-  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
-                              ->AsInput()
-                              ->assert_is_persistable_var()
-                              ->assert_is_op_input("conv2d", "Filter");
-
-  auto *conv_out_var = pattern->NewNode(conv_out_repr())
-                           ->AsIntermediate()
-                           ->assert_is_only_output_of_op("conv2d");
-
-  PDNode *eltwise_y_in_var = nullptr;
-  PDNode *eltwise_out_var = nullptr;
-  if (with_eltwise_add) {
-    // Conv output as Bias input
-    conv_out_var->assert_is_op_input("elementwise_add", "X");
-    // Bias
-    eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr())
-                           ->assert_is_op_input("elementwise_add", "Y")
-                           ->AsInput();
-    eltwise_out_var = pattern->NewNode(eltwise_out_repr())
-                          ->AsIntermediate()
-                          ->assert_is_only_output_of_op("elementwise_add");
-  } else {
-    // Conv output as BN input
-    conv_out_var->assert_is_op_input("batch_norm", "X");
-  }
-
-  // BN Scale
-  auto *bn_scale_var = pattern->NewNode(bn_scale_repr())
-                           ->AsInput()
-                           ->assert_is_persistable_var()
-                           ->assert_is_op_input("batch_norm", "Scale");
-  // BN Bias
-  auto *bn_bias_var = pattern->NewNode(bn_bias_repr())
-                          ->AsInput()
-                          ->assert_is_persistable_var()
-                          ->assert_is_op_input("batch_norm", "Bias");
-  // BN Mean
-  auto *bn_mean_var = pattern->NewNode(bn_mean_repr())
-                          ->AsInput()
-                          ->assert_is_persistable_var()
-                          ->assert_is_op_input("batch_norm", "Mean");
-  // BN Variance
-  auto *bn_variance_var = pattern->NewNode(bn_variance_repr())
-                              ->AsInput()
-                              ->assert_is_persistable_var()
-                              ->assert_is_op_input("batch_norm", "Variance");
-
-  // BN output
-  auto *bn_out_var = pattern->NewNode(bn_out_repr())
-                         ->AsOutput()
-                         ->assert_is_op_output("batch_norm");
-
-  auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr())
-                              ->AsOutput()
-                              ->assert_is_op_output("batch_norm", "MeanOut");
-
-  auto *bn_variance_out_var =
-      pattern->NewNode(bn_variance_out_repr())
-          ->AsOutput()
-          ->assert_is_op_output("batch_norm", "VarianceOut");
-
-  auto *bn_saved_mean_var =
-      pattern->NewNode(bn_saved_mean_repr())
-          ->AsOutput()
-          ->assert_is_op_output("batch_norm", "SavedMean");
-
-  auto *bn_saved_variance_var =
-      pattern->NewNode(bn_saved_variance_repr())
-          ->AsOutput()
-          ->assert_is_op_output("batch_norm", "SavedVariance");
-
-  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
-
-  if (with_eltwise_add) {
-    eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var})
-        .LinksTo({eltwise_out_var});
-    batch_norm_op
-        ->LinksFrom({eltwise_out_var, bn_scale_var, bn_bias_var, bn_mean_var,
-                     bn_variance_var})
-        .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var,
-                  bn_saved_mean_var, bn_saved_variance_var});
-  } else {
-    batch_norm_op
-        ->LinksFrom({conv_out_var, bn_scale_var, bn_bias_var, bn_mean_var,
-                     bn_variance_var})
-        .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var,
-                  bn_saved_mean_var, bn_saved_variance_var});
-  }
-  return bn_out_var;
-}
-
-PDNode *patterns::ConvActivation::operator()(
-    paddle::framework::ir::PDNode *conv_input, std::string conv_type,
-    std::string activation_type) {
-  // Create Operators
-  conv_input->assert_is_op_input(conv_type, "Input");
-  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(conv_type);
-  auto *activation_op =
-      pattern->NewNode(activation_repr())->assert_is_op(activation_type);
-  // Create variables
-  // Filter
-  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
-                              ->AsInput()
-                              ->assert_is_persistable_var()
-                              ->assert_is_op_input(conv_type, "Filter");
-  // intermediate variable, will be removed in the IR after fuse.
-  auto *conv_out_var = pattern->NewNode(conv_out_repr())
-                           ->AsIntermediate()
-                           ->assert_is_only_output_of_op(conv_type)
-                           ->assert_is_op_input(activation_type);
-  // output
-  auto *activation_out_var = pattern->NewNode(activation_out_repr())
-                                 ->AsOutput()
-                                 ->assert_is_op_output(activation_type);
-
-  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
-  activation_op->LinksFrom({conv_out_var}).LinksTo({activation_out_var});
-  return activation_out_var;
-}
-
-PDNode *patterns::SeqConvEltAddRelu::operator()(
-    paddle::framework::ir::PDNode *seqconv_input) {
-  // Create Operators
-  seqconv_input->assert_is_op_input("sequence_conv", "X");
-  auto *seqconv_op = pattern->NewNode(seqconv_repr())
-                         ->assert_is_op("sequence_conv")
-                         ->assert_op_attr<bool>("paddingTrainable", false)
-                         ->assert_op_attr<int>("contextStride", 1);
-
-  auto *eltadd_op =
-      pattern->NewNode(eltadd_repr())->assert_is_op("elementwise_add");
-  auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
-  // Create variables
-  // Filter
-  auto *seqconv_weight_var =
-      pattern->NewNode(seqconv_weight_repr())
-          ->AsInput()
-          ->assert_is_persistable_var()
-          ->assert_is_op_input("sequence_conv", "Filter");
-  // Bias
-  auto *eltadd_bias_var = pattern->NewNode(eltadd_bias_repr())
-                              ->AsInput()
-                              ->assert_is_op_input("elementwise_add");
-  // intermediate variable, will be removed in the IR after fuse.
-  auto *seqconv_out_var = pattern->NewNode(seqconv_out_repr())
-                              ->AsIntermediate()
-                              ->assert_is_only_output_of_op("sequence_conv")
-                              ->assert_is_op_input("elementwise_add");
-  auto *eltadd_out_var = pattern->NewNode(eltadd_out_repr())
-                             ->AsIntermediate()
-                             ->assert_is_only_output_of_op("elementwise_add")
-                             ->assert_is_only_input_of_op("relu");
-  // output
-  auto *relu_out_var = pattern->NewNode(relu_out_repr())
-                           ->AsOutput()
-                           ->assert_is_op_output("relu");
-
-  seqconv_op->LinksFrom({seqconv_input, seqconv_weight_var})
-      .LinksTo({seqconv_out_var});
-  eltadd_op->LinksFrom({seqconv_out_var, eltadd_bias_var})
-      .LinksTo({eltadd_out_var});
-  relu_op->LinksFrom({eltadd_out_var}).LinksTo({relu_out_var});
-  return relu_out_var;
-}
-
-PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x,
-                                 bool with_bias, bool with_relu) {
-  // Create shared nodes.
-  x->assert_is_op_input("mul", "X");
-  auto *mul = pattern->NewNode(mul_repr())->assert_is_op("mul");
-
-  auto *mul_w_var = pattern->NewNode(w_repr())
-                        ->AsInput()
-                        ->assert_is_persistable_var()
-                        ->assert_is_op_input("mul", "Y");
-
-  auto *mul_out_var =
-      pattern->NewNode(mul_out_repr())->assert_is_op_output("mul");
-
-  // Add links.
-  mul->LinksFrom({x, mul_w_var}).LinksTo({mul_out_var});
-  if (!with_bias) {  // not with bias
-    return mul_out_var;
-  } else {  // with bias
-    mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
-    // Create operators.
-    auto *elementwise_add = pattern->NewNode(elementwise_add_repr())
-                                ->assert_is_op("elementwise_add");
-    // Create variables.
-    auto *bias = pattern->NewNode(bias_repr())
-                     ->assert_is_op_input("elementwise_add")
-                     ->assert_is_persistable_var()
-                     ->AsInput();
-
-    auto *elementwise_add_out_var =
-        pattern->NewNode(elementwise_add_out_repr())
-            ->AsOutput()
-            ->assert_is_op_output("elementwise_add");
-
-    elementwise_add->LinksFrom({mul_out_var, bias})
-        .LinksTo({elementwise_add_out_var});
-    if (!with_relu) {
-      return elementwise_add_out_var;
-    } else {
-      elementwise_add_out_var->AsIntermediate()->assert_is_op_input("relu");
-      // Create operators.
-      auto *relu = pattern->NewNode(relu_repr())->assert_is_op("relu");
-      auto *relu_out_var = pattern->NewNode(relu_out_repr())
-                               ->AsOutput()
-                               ->assert_is_op_output("relu");
-
-      relu->LinksFrom({elementwise_add_out_var}).LinksTo({relu_out_var});
-      return relu_out_var;
-    }
-  }
-}
-
-PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x,
-                                       bool with_bias) {
-  // Create shared nodes.
-  x->assert_is_op_input("fc", "Input");
-
-  auto *fc_op = pattern->NewNode(fc_repr())->assert_is_op("fc");
-  // Create variables
-  // Filter
-  auto *fc_weight_var = pattern->NewNode(weights_repr())
-                            ->AsInput()
-                            ->assert_is_persistable_var()
-                            ->assert_is_op_input("fc", "W");
-  // Bias
-  auto *fc_bias_var = pattern->NewNode(bias_repr())
-                          ->AsInput()
-                          ->assert_is_persistable_var()
-                          ->assert_is_op_input("fc", "Bias");
-  // Output
-  auto *fc_out_var = pattern->NewNode(output_repr())
-                         ->AsOutput()
-                         ->assert_is_op_output("fc", "Out")
-                         ->assert_is_only_output_of_op("fc");
-
-  fc_op->LinksFrom({x, fc_weight_var, fc_bias_var}).LinksTo({fc_out_var});
-  return fc_out_var;
-}
-
-PDNode *patterns::Embedding::operator()(PDNode *x) {
-  x->assert_is_op_input("lookup_table", "Ids");
-  auto *lookup_table_op =
-      pattern->NewNode(lookup_table_repr())->assert_is_op("lookup_table");
-#define NEW_NODE(arg__, io__)                    \
-  auto *arg__ = pattern->NewNode(arg__##_repr()) \
-                    ->assert_is_op_##io__("lookup_table", #arg__);
-
-  NEW_NODE(W, input);
-
-  NEW_NODE(Out, output);
-#undef NEW_NODE
-
-  lookup_table_op->LinksFrom({x, W});
-  lookup_table_op->LinksTo({Out});
-  return Out;
-}
-
-PDNode *patterns::LSTM::operator()(PDNode *x) {
-  x->assert_is_op_input("lstm", "Input");
-  auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm");
-#define NEW_NODE(arg__, io__) \
-  auto *arg__ =               \
-      pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__);
-
-  // Currently, the H0 and C0 are optional
-  // TODO(Superjomn) upgrade the fuse framework to support optional.
-  // NEW_NODE(H0, input);
-  // NEW_NODE(C0, input);
-  NEW_NODE(Weight, input);
-  NEW_NODE(Bias, input);
-
-  NEW_NODE(Hidden, output);
-  NEW_NODE(Cell, output);
-  NEW_NODE(BatchGate, output);
-  NEW_NODE(BatchCellPreAct, output);
-#undef NEW_NODE
-
-  lstm_op->LinksFrom({x, Weight, Bias});
-  lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
-  return Hidden;
-}
-
-PDNode *patterns::GRU::operator()(PDNode *x) {
-  x->assert_is_op_input("gru", "Input");
-  auto *gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru");
-#define NEW_NODE(arg__, io__) \
-  auto *arg__ =               \
-      pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__);
-
-  NEW_NODE(Weight, input);
-  // TODO(Superjomn): upgrade the fuse framework to support optional.
-  // H0 and bias are optional
-  NEW_NODE(Bias, input);  // also optional
-  // NEW_NODE(H0, input);
-
-  NEW_NODE(Hidden, output);
-  // below are intermediate
-  NEW_NODE(BatchGate, output);
-  NEW_NODE(BatchResetHiddenPrev, output);
-  NEW_NODE(BatchHidden, output);
-#undef NEW_NODE
-
-  BatchGate->AsIntermediate();
-  BatchResetHiddenPrev->AsIntermediate();
-  BatchHidden->AsIntermediate();
-
-  gru_op->LinksFrom({x, Weight, Bias});
-  gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
-  return Hidden;
-}
-
-PDNode *patterns::ActElewiseAdd::operator()(
-    paddle::framework::ir::PDNode *in_var,
-    std::unordered_set<std::string> act_types) {
-  in_var->assert_is_ops_input(act_types, "X");
-
-  auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types);
-  auto *act_out_var = pattern->NewNode(act_out_repr())
-                          ->assert_is_not_ctrl_var()
-                          ->assert_is_ops_output(act_types);
-  act_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
-
-  auto *ele_x_var = pattern->NewNode(ele_x_repr())
-                        ->assert_is_not_ctrl_var()
-                        ->assert_is_op_input("elementwise_add")
-                        ->AsInput();
-  auto *elementwise_add =
-      pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add");
-
-  auto *elewise_add_out = pattern->NewNode(elewise_add_out_repr())
-                              ->AsOutput()
-                              ->assert_is_op_output("elementwise_add", "Out");
-
-  act->LinksFrom({in_var}).LinksTo({act_out_var});
-  elementwise_add->LinksFrom({act_out_var, ele_x_var})
-      .LinksTo({elewise_add_out});
-
-  return elewise_add_out;
-}
-
-PDNode *patterns::ElewiseAddAct::operator()(
-    paddle::framework::ir::PDNode *ele_x_var,
-    std::unordered_set<std::string> act_types) {
-  auto *ele_y_var = pattern->NewNode(ele_y_repr())
-                        ->assert_is_op_input("elementwise_add", "Y");
-
-  auto *ele_add =
-      pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add");
-
-  auto *ele_out_var = pattern->NewNode(elewise_add_out_repr())
-                          ->assert_is_op_output("elementwise_add", "Out");
-
-  ele_out_var->AsIntermediate()->assert_is_ops_input(act_types);
-
-  auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types);
-
-  auto *act_out_var =
-      pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out");
-
-  ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var});
-  act->LinksFrom({ele_out_var}).LinksTo({act_out_var});
-
-  return act_out_var;
-}
-
-PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
-    paddle::framework::ir::PDNode *d_act_out_var,
-    std::unordered_set<std::string> act_types) {
-  // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
-  // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
-  auto *act_grad = pattern->NewNode(act_grad_repr())->assert_is_ops(act_types);
-
-  auto *act_out_var =
-      pattern->NewNode(act_out_repr())->assert_is_ops_input(act_types, "Out");
-
-  auto *d_intermediate_var =
-      pattern->NewNode(d_itermediate_out_repr())
-          ->assert_is_ops_output(act_types, GradVarName("X"));
-
-  act_grad->LinksFrom({d_act_out_var, act_out_var})
-      .LinksTo({d_intermediate_var});
-
-  auto *ele_y_var = pattern->NewNode(ele_y_repr())
-                        ->assert_is_not_ctrl_var()
-                        ->assert_is_op_input("elementwise_add_grad", "Y");
-
-  auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr())
-                           ->assert_is_op("elementwise_add_grad");
-
-  auto *d_ele_x_var =
-      pattern->NewNode(d_ele_x_repr())
-          ->assert_is_not_ctrl_var()
-          ->assert_is_op_output("elementwise_add_grad", GradVarName("X"));
-
-  auto *d_ele_y_var =
-      pattern->NewNode(d_ele_y_repr())
-          ->assert_is_not_ctrl_var()
-          ->assert_is_op_output("elementwise_add_grad", GradVarName("Y"));
-
-  ele_add_grad->LinksFrom({d_intermediate_var, ele_y_var})
-      .LinksTo({d_ele_x_var, d_ele_y_var});
-
-  return ele_add_grad;
-}
-
-// conv_type: conv2d, conv3d, conv2d_transpose
-PDNode *patterns::ConvBias::operator()(
-    paddle::framework::ir::PDNode *conv_input, std::string conv_type) {
-  // Create Operators
-  conv_input->assert_is_op_input(conv_type, "Input");
-  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(conv_type);
-  auto *eltiwse_op =
-      pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
-  // Create variables
-  // Filter
-  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
-                              ->AsInput()
-                              ->assert_is_persistable_var()
-                              ->assert_is_op_input(conv_type, "Filter");
-  // intermediate variable, will be removed in the IR after fuse.
-  auto *conv_out_var = pattern->NewNode(conv_out_repr())
-                           ->AsIntermediate()
-                           ->assert_is_only_output_of_op(conv_type)
-                           ->assert_is_op_input("elementwise_add");
-  // Bias stored in elementwise_add
-  auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())
-                               ->AsInput()
-                               ->assert_is_persistable_var()
-                               ->assert_is_op_input("elementwise_add", "Y");
-  // output
-  auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr())
-                              ->AsOutput()
-                              ->assert_is_op_output("elementwise_add");
-  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
-  eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var})
-      .LinksTo({eltwise_out_var});
-  return eltwise_out_var;
-}
-
-PDNode *patterns::Conv::operator()() {
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-
-  auto input_var = pattern->NewNode(conv_input_repr())
-                       ->AsInput()
-                       ->assert_is_op_input("conv2d", "Input");
-
-  auto filter_var = pattern->NewNode(conv_filter_repr())
-                        ->AsInput()
-                        ->assert_is_op_input("conv2d", "Filter");
-
-  auto output_var = pattern->NewNode(conv_output_repr())
-                        ->AsOutput()
-                        ->assert_is_op_output("conv2d", "Output");
-
-  conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var});
-  return output_var;
-}
-
-PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-
-  if (!with_residual_data) {
-    conv_op->assert_more([&](Node *x) {
-      auto node_names = x->Op()->InputNames();
-      if (!HasInput(x, "ResidualData") ||
-          x->Op()->Input("ResidualData").size() == 0)
-        return true;
-      return false;
-    });
-  }
-
-  auto input_var = pattern->NewNode(conv_input_repr())
-                       ->AsInput()
-                       ->assert_is_op_input("conv2d", "Input");
-
-  auto filter_var = pattern->NewNode(conv_filter_repr())
-                        ->AsInput()
-                        ->assert_is_op_input("conv2d", "Filter");
-
-  auto output_var = pattern->NewNode(conv_output_repr())
-                        ->AsOutput()
-                        ->assert_is_op_output("conv2d", "Output");
-
-  std::vector<PDNode *> links_from{input_var, filter_var};
-
-  if (with_residual_data) {
-    auto res_conn_var = pattern->NewNode(conv_residual_data_repr())
-                            ->AsInput()
-                            ->assert_is_op_input("conv2d", "ResidualData");
-    links_from.push_back(res_conn_var);
-  }
-
-  conv_op->LinksFrom(links_from).LinksTo({output_var});
-  return output_var;
-}
-
-PDNode *patterns::Pool::operator()() {
-  auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d");
-
-  auto input_var = pattern->NewNode(pool_input_repr())
-                       ->AsInput()
-                       ->assert_is_op_input("pool2d", "X");
-
-  auto output_var = pattern->NewNode(pool_output_repr())
-                        ->AsOutput()
-                        ->assert_is_op_output("pool2d", "Out");
-
-  pool_op->LinksFrom({input_var}).LinksTo({output_var});
-  return output_var;
-}
-
-PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
-  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
-                                ->assert_is_op("elementwise_add");
-
-  x_var->AsInput()->assert_is_op_input("elementwise_add", "X");
-  y_var->AsInput()->assert_is_op_input("elementwise_add", "Y");
-  auto out_var = pattern->NewNode(elementwise_add_out_repr())
-                     ->AsOutput()
-                     ->assert_is_op_output("elementwise_add", "Out");
-
-  elementwise_add_op->LinksFrom({x_var, y_var});
-  elementwise_add_op->LinksTo({out_var});
-
-  return out_var;
-}
-
-PDNode *patterns::Concat::operator()() {
-  auto concat_op = pattern->NewNode(concat_op_repr())->assert_is_op("concat");
-
-  auto output_var = pattern->NewNode(concat_out_repr())
-                        ->AsOutput()
-                        ->assert_is_op_output("concat", "Out");
-
-  concat_op->LinksTo({output_var});
-  return output_var;
-}
-
-PDNode *patterns::ConcatReLU::operator()() {
-  auto concat_op = pattern->NewNode(concat_op_repr())->assert_is_op("concat");
-  auto relu_op = pattern->NewNode(relu_op_repr())->assert_is_op("relu");
-
-  auto concat_out =
-      pattern->NewNode(concat_out_repr())->assert_is_op_output("concat", "Out");
-
-  auto relu_out = pattern->NewNode(relu_out_repr())
-                      ->AsOutput()
-                      ->assert_is_op_output("relu", "Out");
-
-  concat_op->LinksTo({concat_out});
-  relu_op->LinksFrom({concat_out}).LinksTo({relu_out});
-
-  return relu_out;
-}
-
-PDNode *patterns::ConvConcatReLU::operator()() {
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-  auto concat_op = pattern->NewNode(concat_op_repr())->assert_is_op("concat");
-  auto relu_op = pattern->NewNode(relu_op_repr())->assert_is_op("relu");
-
-  auto conv_out = pattern->NewNode(conv_out_repr())
-                      ->assert_is_op_output("conv2d", "Output");
-
-  auto concat_out = pattern->NewNode(concat_out_repr())
-                        ->assert_is_op_output("concat", "Out")
-                        ->assert_is_op_input("relu", "X");
-
-  auto relu_out = pattern->NewNode(relu_out_repr())
-                      ->AsOutput()
-                      ->assert_is_op_output("relu", "Out");
-
-  conv_op->LinksTo({conv_out});
-  concat_op->LinksFrom({conv_out}).LinksTo({concat_out});
-  relu_op->LinksFrom({concat_out}).LinksTo({relu_out});
-
-  return relu_out;
-}
-
-PDNode *patterns::ConvRequant::operator()() {
-  // Create Operators
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-  auto requant_op =
-      pattern->NewNode(requant_op_repr())->assert_is_op("requantize");
-  auto conv_out = pattern->NewNode(conv_out_repr())
-                      ->assert_is_op_output("conv2d", "Output");
-  auto requant_out = pattern->NewNode(requant_out_repr())
-                         ->AsOutput()
-                         ->assert_is_op_output("requantize", "Output");
-
-  conv_op->LinksTo({conv_out});
-  requant_op->LinksFrom({conv_out}).LinksTo({requant_out});
-
-  return requant_out;
-}
-
-PDNode *patterns::ConvDequant::operator()() {
-  // Create Operators
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-  auto dequant_op =
-      pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
-
-  auto conv_out = pattern->NewNode(conv_out_repr())
-                      ->assert_is_op_output("conv2d", "Output");
-  auto dequant_out = pattern->NewNode(dequant_out_repr())
-                         ->AsOutput()
-                         ->assert_is_op_output("dequantize", "Output");
-
-  conv_op->LinksTo({conv_out});
-  dequant_op->LinksFrom({conv_out}).LinksTo({dequant_out});
-
-  return dequant_out;
-}
-
-PDNode *patterns::PriorBox::operator()() {
-  auto prior_box_op =
-      pattern->NewNode(prior_box_op_repr())->assert_is_op("prior_box");
-
-  auto input_var = pattern->NewNode(prior_box_input_repr())
-                       ->AsInput()
-                       ->assert_is_op_input("prior_box", "Input");
-
-  auto image_var = pattern->NewNode(prior_box_image_repr())
-                       ->AsInput()
-                       ->assert_is_op_input("prior_box", "Image");
-
-  auto boxes_var = pattern->NewNode(prior_box_boxes_repr())
-                       ->AsOutput()
-                       ->assert_is_op_output("prior_box", "Boxes");
-
-  auto variances_var = pattern->NewNode(prior_box_variances_repr())
-                           ->AsOutput()
-                           ->assert_is_op_output("prior_box", "Variances");
-
-  prior_box_op->LinksFrom({input_var, image_var})
-      .LinksTo({boxes_var, variances_var});
-  return boxes_var;
-}
-
-std::unordered_set<std::string> conv_act_set({"identity", "relu"});
-
-PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
-  conv_in->AsInput();
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-  auto conv_out = pattern->NewNode(conv_out_repr())
-                      ->assert_is_op_output("conv2d")
-                      ->assert_is_op_input("elementwise_add", "X")
-                      ->AsIntermediate();
-  auto conv_filter = pattern->NewNode(conv_filter_repr())
-                         ->assert_is_op_input("conv2d", "Filter")
-                         ->AsInput();
-  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
-                                ->assert_is_op("elementwise_add");
-  auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
-                                  ->assert_is_op_input("elementwise_add", "Y")
-                                  ->AsInput();
-  auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
-                                 ->assert_is_op_output("elementwise_add")
-                                 ->AsIntermediate();
-
-  auto act_op = pattern->NewNode(act_op_repr())
-                    ->assert_is_op()
-                    ->assert_more([&](Node *node) {
-                      auto op_type = node->Name();
-                      return conv_act_set.count(op_type);
-                    });
-
-  auto act_out = pattern->NewNode(act_out_repr())
-                     ->assert_is_var()
-                     // is activation op's output.
-                     ->assert_more([&](Node *node) {
-                       for (auto *in_op : node->inputs) {
-                         if (conv_act_set.count(in_op->Name())) {
-                           return true;
-                         }
-                       }
-                       return false;
-                     })
-                     ->AsOutput();
-
-  conv_op->LinksFrom({conv_in, conv_filter});
-  conv_out->LinksFrom({conv_op});
-  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
-      .LinksTo({elementwise_add_out});
-  act_op->LinksFrom({elementwise_add_out}).LinksTo({act_out});
-
-  return act_out;
-}
-
-PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-  auto conv_filter = pattern->NewNode(conv_filter_repr())
-                         ->assert_is_op_input("conv2d", "Filter")
-                         ->AsInput();
-  auto conv_out = pattern->NewNode(conv_out_repr())
-                      ->assert_is_op_output("conv2d")
-                      ->assert_is_op_input("elementwise_add", "X")
-                      ->AsIntermediate();
-  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
-                                ->assert_is_op("elementwise_add");
-  auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
-                                  ->assert_is_op_input("elementwise_add", "Y")
-                                  ->AsInput();
-  auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
-                                 ->assert_is_op_output("elementwise_add")
-                                 ->assert_is_op_input("elementwise_add", "Y")
-                                 ->AsIntermediate();
-
-  auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
-                                  ->assert_is_op("elementwise_add");
-  auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
-                                    ->assert_is_op_input("elementwise_add", "X")
-                                    ->AsInput();
-  auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
-                                   ->assert_is_op_output("elementwise_add")
-                                   ->AsIntermediate();
-
-  auto act_op = pattern->NewNode(act_op_repr())
-                    ->assert_is_op()
-                    ->assert_more([&](Node *node) {
-                      auto op_type = node->Name();
-                      return conv_act_set.count(op_type);
-                    });
-  auto act_out = pattern->NewNode(act_out_repr())
-                     ->assert_is_var()
-                     // is activation op's output.
-                     ->assert_more([&](Node *node) {
-                       for (auto *in_op : node->inputs) {
-                         if (conv_act_set.count(in_op->Name())) {
-                           return true;
-                         }
-                       }
-                       return false;
-                     })
-                     ->AsOutput();
-
-  conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
-  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
-      .LinksTo({elementwise_add_out});
-  elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1})
-      .LinksTo({elementwise_add_out_1});
-  act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
-  return act_out;
-}
-
-PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
-  conv_in->AsInput();
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-  auto conv_out = pattern->NewNode(conv_out_repr())
-                      ->assert_is_op_output("conv2d")
-                      ->assert_is_op_input("elementwise_add", "X")
-                      ->AsIntermediate();
-  auto conv_filter = pattern->NewNode(conv_filter_repr())
-                         ->assert_is_op_input("conv2d", "Filter")
-                         ->AsInput();
-  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
-                                ->assert_is_op("elementwise_add");
-  auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
-                                  ->assert_is_op_input("elementwise_add", "Y")
-                                  ->AsInput();
-  auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
-                                 ->assert_is_op_output("elementwise_add")
-                                 ->AsOutput();
-
-  conv_op->LinksFrom({conv_in, conv_filter});
-  conv_out->LinksFrom({conv_op});
-  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
-      .LinksTo({elementwise_add_out});
-
-  return elementwise_add_out;
-}
-
-PDNode *patterns::ConvAffineChannel::operator()(
-    paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) {
-  // Create Operators
-  conv_input->assert_is_op_input("conv2d", "Input");
-  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
-
-  PDNode *eltwise_op = nullptr;
-  if (with_eltwise_add) {
-    eltwise_op =
-        pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
-  }
-
-  auto *affine_channel_op =
-      pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel");
-  // Create variables
-  // Conv Filter
-  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
-                              ->AsInput()
-                              ->assert_is_persistable_var()
-                              ->assert_is_op_input("conv2d", "Filter");
-
-  auto *conv_out_var = pattern->NewNode(conv_out_repr())
-                           ->AsIntermediate()
-                           ->assert_is_only_output_of_op("conv2d");
-
-  PDNode *eltwise_y_in_var = nullptr;
-  PDNode *eltwise_out_var = nullptr;
-  if (with_eltwise_add) {
-    // Conv output as Bias input
-    conv_out_var->assert_is_op_input("elementwise_add", "X");
-    // Bias
-    eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr())
-                           ->assert_is_op_input("elementwise_add", "Y")
-                           ->AsInput();
-    eltwise_out_var = pattern->NewNode(eltwise_out_repr())
-                          ->AsIntermediate()
-                          ->assert_is_only_output_of_op("elementwise_add");
-  } else {
-    // Conv output as AffineChannel input
-    conv_out_var->assert_is_op_input("affine_channel", "X");
-  }
-
-  // AC Scale
-  auto *ac_scale_var = pattern->NewNode(ac_scale_repr())
-                           ->AsInput()
-                           ->assert_is_persistable_var()
-                           ->assert_has_n_outputs(1)
-                           ->assert_is_op_input("affine_channel", "Scale");
-  // AC Bias
-  auto *ac_bias_var = pattern->NewNode(ac_bias_repr())
-                          ->AsInput()
-                          ->assert_is_persistable_var()
-                          ->assert_has_n_outputs(1)
-                          ->assert_is_op_input("affine_channel", "Bias");
-
-  // AC output
-  auto *ac_out_var = pattern->NewNode(ac_out_repr())
-                         ->AsOutput()
-                         ->assert_is_op_output("affine_channel");
-
-  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
-
-  if (with_eltwise_add) {
-    eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var})
-        .LinksTo({eltwise_out_var});
-    affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var})
-        .LinksTo({ac_out_var});
-  } else {
-    affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var})
-        .LinksTo({ac_out_var});
-  }
-  return ac_out_var;
-}
-
-PDNode *patterns::DequantQuantAny::operator()() {
-  auto *dequant_in = pattern->NewNode(dequant_in_repr())
-                         ->AsInput()
-                         ->assert_is_op_input("dequantize", "Input");
-
-  auto *dequant_op =
-      pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
-
-  auto *dequant_out = pattern->NewNode(dequant_out_repr())
-                          ->AsOutput()
-                          ->assert_is_op_output("dequantize", "Output");
-
-  auto *quant_op = pattern->NewNode(quant_op_repr())
-                       ->assert_is_op("quantize")
-                       ->AsIntermediate();
-
-  auto *quant_out = pattern->NewNode(quant_out_repr())
-                        ->AsOutput()
-                        ->assert_is_op_output("quantize");
-
-  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
-
-  dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out});
-  quant_op->LinksFrom({dequant_out}).LinksTo({quant_out});
-  next_op->LinksFrom({quant_out});
-
-  return quant_out;
-}
-
-PDNode *patterns::DequantAny::operator()() {
-  auto *dequant_op =
-      pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
-
-  auto *dequant_out = pattern->NewNode(dequant_out_repr())
-                          ->AsOutput()
-                          ->assert_is_op_output("dequantize", "Output");
-
-  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
-
-  dequant_op->LinksTo({dequant_out});
-  next_op->LinksFrom({dequant_out});
-
-  return dequant_out;
-}
-
-// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
-// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
-// ...
-// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z
-// flatten_out_a -> concat_op  flatten_out_b -> concat_op ... flatten_out_z ->
-// concat_op
-PDNode *patterns::TransposeFlattenConcat::operator()(
-    std::vector<PDNode *> conv_in, int times) {
-  // The times represents the repeat times of the
-  // {trans, trans_out, flatten, flatten_out}
-  const int kNumFields = 4;
-  const int kTransOutOffset = 1;
-  const int kFlattenOffset = 2;
-  const int kFlattenOutOffset = 3;
-
-  std::vector<PDNode *> nodes;
-
-  for (int i = 0; i < times; i++) {
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("transpose" + std::to_string(i)))
-            ->assert_is_op("transpose2"));
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i)))
-            ->assert_is_op_output("transpose2")
-            ->assert_is_op_input("flatten2", "X")
-            ->AsIntermediate());
-    nodes.push_back(pattern->NewNode(GetNodeName("flatten" + std::to_string(i)))
-                        ->assert_is_op("flatten2"));
-
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i)))
-            ->assert_is_op_output("flatten2")
-            ->assert_is_op_nth_input("concat", "X", i)
-            ->AsIntermediate());
-  }
-
-  auto concat_op = pattern->NewNode(GetNodeName("concat"))
-                       ->assert_is_op("concat")
-                       ->assert_op_has_n_inputs("concat", times);
-  auto concat_out = pattern->NewNode(GetNodeName("concat_out"))
-                        ->assert_is_op_output("concat")
-                        ->AsOutput();
-
-  std::vector<PDNode *> flatten_outs;
-  for (int i = 0; i < times; i++) {
-    conv_in[i]->AsInput();
-    // trans
-    nodes[i * kNumFields]->LinksFrom({conv_in[i]});
-    // trans_out
-    nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]});
-    // flatten
-    nodes[i * kNumFields + kFlattenOffset]->LinksFrom(
-        {nodes[i * kNumFields + kTransOutOffset]});
-    // flatten_out
-    nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom(
-        {nodes[i * kNumFields + kFlattenOffset]});
-    flatten_outs.push_back(nodes[i * kNumFields + kFlattenOutOffset]);
-  }
-
-  concat_op->LinksFrom(flatten_outs).LinksTo({concat_out});
-  return concat_out;
-}
-
-PDNode *patterns::AnakinDetectionPattern::operator()(
-    std::vector<PDNode *> conv_in, int times, std::string priorbox_type,
-    bool is_reshape) {
-  // The times represents the repeat times of the
-  // {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape}
-  const int kNumFields = 7;
-  const int kPriorBoxLocOffset = 1;
-  const int kReshape1Offset = 2;
-  const int kReshape1OutOffset = 3;
-  const int kPriorBoxVarOffset = 4;
-  const int kReshape2Offset = 5;
-  const int kReshape2OutOffset = 6;
-
-  const int kBoxCoderThirdInputOffset = times;
-  const int kMultiClassSecondInputNmsOffset = times + 1;
-
-  std::vector<PDNode *> nodes;
-  std::string op_after_priorbox = is_reshape ? "reshape2" : "flatten2";
-
-  for (int i = 0; i < times; i++) {
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
-            ->assert_is_op(priorbox_type));
-    nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
-                        ->assert_is_op_output(priorbox_type, "Boxes")
-                        ->assert_is_op_input(op_after_priorbox, "X")
-                        ->AsIntermediate());
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
-            ->assert_is_op(op_after_priorbox));
-
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
-            ->assert_is_op_output(op_after_priorbox)
-            ->assert_is_op_nth_input("concat", "X", i)
-            ->AsIntermediate());
-
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
-            ->assert_is_op_output(priorbox_type, "Variances")
-            ->assert_is_op_input(op_after_priorbox, "X")
-            ->AsIntermediate());
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
-            ->assert_is_op(op_after_priorbox));
-
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
-            ->assert_is_op_output(op_after_priorbox)
-            ->assert_is_op_nth_input("concat", "X", i)
-            ->AsIntermediate());
-  }
-
-  auto concat_op1 = pattern->NewNode(GetNodeName("concat1"))
-                        ->assert_is_op("concat")
-                        ->assert_op_has_n_inputs("concat", times);
-  auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out"))
-                         ->assert_is_op_output("concat")
-                         ->AsIntermediate();
-
-  auto concat_op2 = pattern->NewNode(GetNodeName("concat2"))
-                        ->assert_is_op("concat")
-                        ->assert_op_has_n_inputs("concat", times);
-  auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out"))
-                         ->assert_is_op_output("concat")
-                         ->AsIntermediate();
-
-  auto box_coder_op = pattern->NewNode(GetNodeName("box_coder"))
-                          ->assert_is_op("box_coder")
-                          ->assert_op_has_n_inputs("box_coder", 3);
-
-  auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out"))
-                           ->assert_is_op_output("box_coder")
-                           ->AsIntermediate();
-
-  auto transpose_before_nms =
-      pattern->NewNode(GetNodeName("transpose_before_nms"))
-          ->assert_is_op("transpose2");
-
-  auto transpose_before_nms_out =
-      pattern->NewNode(GetNodeName("transpose_before_nms_out"))
-          ->assert_is_op_output("transpose2")
-          ->assert_is_op_input("multiclass_nms", "Scores")
-          ->AsIntermediate();
-
-  auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms"))
-                               ->assert_is_op("multiclass_nms")
-                               ->assert_op_has_n_inputs("multiclass_nms", 2);
-
-  auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out"))
-                                ->assert_is_op_output("multiclass_nms")
-                                ->AsOutput();
-
-  std::vector<PDNode *> reshape1_outs;
-  std::vector<PDNode *> reshape2_outs;
-
-  for (int i = 0; i < times; i++) {
-    conv_in[i]->AsInput();
-    // prior_box
-    nodes[i * kNumFields]->LinksFrom({conv_in[i]});
-    // prior_box box out
-    nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom(
-        {nodes[i * kNumFields]});
-    // reshape
-    nodes[i * kNumFields + kReshape1Offset]->LinksFrom(
-        {nodes[i * kNumFields + kPriorBoxLocOffset]});
-    // reshape_out
-    nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom(
-        {nodes[i * kNumFields + kReshape1Offset]});
-
-    nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom(
-        {nodes[i * kNumFields]});
-    // reshape
-    nodes[i * kNumFields + kReshape2Offset]->LinksFrom(
-        {nodes[i * kNumFields + kPriorBoxVarOffset]});
-    // reshape_out
-    nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom(
-        {nodes[i * kNumFields + kReshape2Offset]});
-
-    reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]);
-    reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]);
-  }
-
-  concat_op1->LinksFrom(reshape1_outs);
-  concat_op2->LinksFrom(reshape2_outs);
-  concat_out1->LinksFrom({concat_op1});
-  concat_out2->LinksFrom({concat_op2});
-
-  conv_in[kBoxCoderThirdInputOffset]->AsInput();
-  conv_in[kMultiClassSecondInputNmsOffset]->AsInput();
-
-  box_coder_op->LinksFrom(
-      {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
-  box_coder_out->LinksFrom({box_coder_op});
-
-  transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]});
-  transpose_before_nms_out->LinksFrom({transpose_before_nms});
-
-  multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out})
-      .LinksTo({multiclass_nms_out});
-
-  return multiclass_nms_out;
-}
-
-PDNode *patterns::FillConstantElementWiseMulFuse::operator()(
-    PDNode *elementwise_op_input) {
-  auto fill_constant =
-      pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
-
-  auto fill_constant_out = pattern->NewNode(fill_constant_out_repr())
-                               ->assert_is_op_output("fill_constant")
-                               ->assert_is_op_input("elementwise_mul", "Y")
-                               ->AsIntermediate();
-
-  auto elementwise_mul_op =
-      pattern->NewNode(elementwise_mul_repr())->assert_is_op("elementwise_mul");
-
-  auto elementwise_mul_out = pattern->NewNode(elementwise_mul_out_repr())
-                                 ->assert_is_op_output("elementwise_mul")
-                                 ->AsOutput();
-
-  fill_constant_out->LinksFrom({fill_constant});
-  elementwise_mul_op->LinksFrom({elementwise_op_input, fill_constant_out});
-  elementwise_mul_out->LinksFrom({elementwise_mul_op});
-  return elementwise_mul_out;
-}
-
-void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
-                                              const std::string &op_type,
-                                              const std::string &weight_name,
-                                              int times,
-                                              const std::string &quant_type,
-                                              const std::string &dequant_type) {
-  int kNumFields = 5;
-  const int kQuantizedWeightOffset = 0;
-  const int kQuantizedOpOffset = 1;
-  const int kQuantizedOpOutOffset = 2;
-  const int kDequantOpOffset = 3;
-  const int kDequantOpOutOffset = 4;
-  const int kDequantOpWeightScaleOffset = 5;
-
-  // the quant op always be one.
-  auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale"))
-                               ->assert_is_op_input(quant_type, "InScale")
-                               ->AsInput();
-  auto quant_op =
-      pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type);
-
-  PDNode *quant_op_out_scale = nullptr;
-  if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
-    kNumFields += 1;
-    quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale"))
-                             ->assert_is_op_output(quant_type, "OutScale")
-                             ->assert_is_op_nth_input(dequant_type, "Scales", 1)
-                             ->AsIntermediate();
-  } else {
-    quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale"))
-                             ->assert_is_op_output(quant_type, "OutScale")
-                             ->assert_is_op_input(dequant_type, "Scale")
-                             ->AsIntermediate();
-  }
-
-  auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out"))
-                          ->assert_is_op_output(quant_type, "Out")
-                          ->assert_is_op_input(op_type)
-                          ->AsIntermediate();
-
-  // there are 'times' quantized and dequant op
-  std::vector<PDNode *> nodes;
-  for (int i = 0; i < times; i++) {
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("quantized_op_weight") + std::to_string(i))
-            ->assert_is_op_input(op_type, weight_name)
-            ->AsInput());
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("quantized_op") + std::to_string(i))
-            ->assert_is_op(op_type));
-
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i))
-            ->assert_is_op_output(op_type)
-            ->assert_is_op_input(dequant_type, "X")
-            ->AsIntermediate());
-
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i))
-            ->assert_is_op(dequant_type));
-
-    nodes.push_back(
-        pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i))
-            ->assert_is_op_output(dequant_type, "Out")
-            ->AsOutput());
-
-    if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
-      nodes.push_back(pattern
-                          ->NewNode(GetNodeName("dequant_channel_scale") +
-                                    std::to_string(i))
-                          ->assert_is_op_nth_input(dequant_type, "Scales", 0)
-                          ->AsInput());
-    }
-  }
-
-  quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
-  quant_op_out->LinksFrom({quant_op});
-  for (int i = 0; i < times; i++) {
-    nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
-        {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
-    nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
-        {nodes[i * kNumFields + kQuantizedOpOffset]});
-    if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
-      nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
-          {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale,
-           nodes[i * kNumFields + kDequantOpWeightScaleOffset]});
-    } else {
-      nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
-          {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
-    }
-    nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
-        {nodes[i * kNumFields + kDequantOpOffset]});
-  }
-}
-
-void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) {
-  auto reshape1_op =
-      pattern->NewNode(reshape1_op_repr())->assert_is_op("reshape2");
-  reshape1_op->assert_more([&](Node *x) {
-    return boost::get<std::vector<int>>(x->Op()->GetAttr("shape")).size() == 5;
-  });
-
-  auto reshape1_out = pattern->NewNode(reshape1_out_repr())
-                          ->assert_is_op_output("reshape2", "Out")
-                          ->assert_is_op_input("transpose2")
-                          ->AsIntermediate();
-
-  auto transpose_op =
-      pattern->NewNode(transpose_op_repr())->assert_is_op("transpose2");
-
-  auto transpose_out = pattern->NewNode(transpose_out_repr())
-                           ->assert_is_op_output("transpose2", "Out")
-                           ->assert_is_op_input("reshape2")
-                           ->AsIntermediate();
-
-  auto reshape2_op =
-      pattern->NewNode(reshape2_op_repr())->assert_is_op("reshape2");
-  auto reshape2_out = pattern->NewNode(reshape2_out_repr())
-                          ->assert_is_op_output("reshape2", "Out")
-                          ->AsOutput();
-
-  reshape1_op->LinksFrom({reshape1_in});
-  reshape1_out->LinksFrom({reshape1_op});
-  transpose_op->LinksFrom({reshape1_out});
-  transpose_out->LinksFrom({transpose_op});
-  reshape2_op->LinksFrom({transpose_out});
-  reshape2_out->LinksFrom({reshape2_op});
-}
-
-void patterns::DeleteQuantDequantOpPattern::operator()() {
-  auto any_op_out =
-      pattern->NewNode(any_op_out_repr())
-          ->assert_is_op_input(
-              "fake_quantize_dequantize_moving_average_abs_max", "X")
-          ->AsInput();
-
-  auto quant_dequant_op_inscale =
-      pattern->NewNode(quant_dequant_op_inscale_repr())
-          ->assert_is_op_input(
-              "fake_quantize_dequantize_moving_average_abs_max", "InScale")
-          ->AsInput();
-  auto quant_dequant_op =
-      pattern->NewNode(quant_dequant_op_repr())
-          ->assert_is_op("fake_quantize_dequantize_moving_average_abs_max");
-
-  auto quant_dequant_out =
-      pattern->NewNode(quant_dequant_op_out_repr())
-          ->assert_is_op_output(
-              "fake_quantize_dequantize_moving_average_abs_max", "Out")
-          ->AsIntermediate();
-
-  auto quant_dequant_op_outscale =
-      pattern->NewNode(quant_dequant_op_outscale_repr())
-          ->assert_is_op_output(
-              "fake_quantize_dequantize_moving_average_abs_max", "OutScale")
-          ->AsOutput();
-  auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput();
-
-  quant_dequant_op->LinksFrom({any_op_out, quant_dequant_op_inscale});
-  quant_dequant_op_outscale->LinksFrom({quant_dequant_op});
-  quant_dequant_out->LinksFrom({quant_dequant_op});
-  any_op2->LinksFrom({quant_dequant_out});
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
deleted file mode 100644
index 0d7d56cabf30a1e984044f1720e01935b341cf4f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ /dev/null
@@ -1,1062 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef PADDLE_WITH_TESTING
-#include <gtest/gtest_prod.h>
-#endif
-
-#include <memory>
-#include <numeric>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/inference/analysis/dot.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-class PDPattern;
-
-// Some basic terminologies:
-//   - PDPattern: a pattern defined as a data flow graph.
-//   - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
-//     that meets some conditions defined in `PDNode.teller`.
-//   - A pattern is defined with PDNodes with edges.
-
-// Pattern detector node. This node helps to build a pattern.
-struct PDNode {
-  // tell whether an ir::Node* is a candidation for a PDNode.
-  using teller_t = std::function<bool(Node*)>;
-  enum class Type { kOp, kVar };
-  enum class Role {
-    kUnknown,      // No role,
-    kInput,        // an input and will be retained,
-    kOutput,       // an output and will be retained,
-    kIntermediate  // will be removed after handler.
-  };
-
-  // this link to others
-  PDNode& LinksTo(const std::vector<PDNode*>& others);
-  PDNode& LinksFrom(const std::vector<PDNode*>& others);
-
-  bool Tell(Node* node) const {
-    if (teller_) return teller_(node);
-
-    for (auto& asrt : asserts_) {
-      if (!asrt(node)) return false;
-    }
-    return true;
-  }
-
-  bool IsOp() const { return type_ == Type::kOp; }
-  bool IsVar() const { return type_ == Type::kVar; }
-
-  const std::string& name() const { return name_; }
-
-  PDNode& operator=(const PDNode&) = delete;
-  PDNode(const PDNode&) = delete;
-
-  // Mark this node is an Input of a subgraph and will be retained.
-  PDNode* AsInput() {
-    role_ = Role::kInput;
-    return this;
-  }
-  // Mark this node is an Output of a subgraph and will be retained.
-  PDNode* AsOutput() {
-    role_ = Role::kOutput;
-    return this;
-  }
-  // Mark this node will be removed, so all the links should be inside a matched
-  // sub-graph.
-  PDNode* AsIntermediate() {
-    role_ = Role::kIntermediate;
-    return this;
-  }
-
-  bool IsIntermediate() const { return role_ == Role::kIntermediate; }
-  bool IsInput() const { return role_ == Role::kInput; }
-  bool IsOutput() const { return role_ == Role::kOutput; }
-
-  // Assertions, helper functions to simplify the pattern definition.
-  PDNode* assert_is_op();
-  PDNode* assert_is_op(const std::string& op_type);
-  PDNode* assert_is_var();
-  PDNode* assert_is_not_ctrl_var();
-  PDNode* assert_var_not_persistable();
-  PDNode* assert_is_persistable_var();
-  PDNode* assert_is_op_output(const std::string& op_type);
-  PDNode* assert_is_op_output(const std::string& op_type,
-                              const std::string& argument);
-  PDNode* assert_is_op_input(const std::string& op_type);
-  PDNode* assert_is_op_input(const std::string& op_type,
-                             const std::string& argument);
-  PDNode* assert_is_op_nth_input(const std::string& op_type,
-                                 const std::string& argument, int nth);
-  PDNode* assert_is_op_nth_output(const std::string& op_type,
-                                  const std::string& argument, int nth);
-  PDNode* assert_is_only_input_of_op(const std::string& op_type);
-  PDNode* assert_is_only_output_of_op(const std::string& op_type);
-  PDNode* assert_op_has_n_inputs(const std::string& op_type, size_t n);
-  PDNode* assert_op_has_n_outputs(const std::string& op_type, size_t n);
-  PDNode* assert_more(teller_t&& teller);
-
-  PDNode* assert_is_ops_output(const std::unordered_set<std::string>& op_types);
-  PDNode* assert_is_ops(const std::unordered_set<std::string>& op_types);
-  PDNode* assert_is_ops_output(const std::unordered_set<std::string>& op_types,
-                               const std::string& argument);
-  PDNode* assert_is_ops_nth_input(
-      const std::unordered_set<std::string>& op_types,
-      const std::string& argument, int nth);
-  PDNode* assert_is_ops_input(const std::unordered_set<std::string>& op_types);
-  PDNode* assert_is_ops_input(const std::unordered_set<std::string>& op_types,
-                              const std::string& argument);
-  PDNode* assert_is_ops_nth_output(
-      const std::unordered_set<std::string>& op_types,
-      const std::string& argument, int nth);
-
-  PDNode* assert_has_n_inputs(size_t n);
-  PDNode* assert_has_n_outputs(size_t n);
-
-  template <typename T>
-  PDNode* assert_op_attr(const std::string& attr_name, const T& attr) {
-    asserts_.emplace_back([=](Node* x) {
-      return x && x->IsOp() && x->Op()->HasAttr(attr_name) &&
-             boost::get<T>(x->Op()->GetAttr(attr_name)) == attr;
-    });
-    return this;
-  }
-
- private:
-  PDNode(PDPattern* pattern, const std::string& name = "",
-         Type type = Type::kVar)
-      : pattern_(pattern), name_(name), type_(type) {}
-  PDNode(teller_t&& teller, PDPattern* pattern, const std::string& name = "",
-         Type type = Type::kVar)
-      : teller_(std::move(teller)),
-        pattern_(pattern),
-        name_(name),
-        type_(type) {
-    PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set.");
-  }
-
-  PDNode(PDNode&& other) = default;
-
-  friend class PDPattern;
-
-  // Will removed latter.
-  teller_t teller_;
-  std::vector<teller_t> asserts_;
-  PDPattern* pattern_;
-  std::string name_;
-  Type type_;
-  Role role_{Role::kUnknown};
-};
-
-/*
- * A pattern in a graph, which defined with PDNode and edges. Most graph
- * patterns can be divided into PDNodes and link relations between them.
- *
- * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD
- * operators from the computation graph, the MUL's output should have only one
- * consumer which is the ELEMENTWISE_ADD.
- * This pattern can be defined as with the following pseudo codes
- *
- *     // Create two operator PDNodes.
- *     MUL = PDPattern.NewNode().assert_is_op("mul");
- *     ELE = PDPattern.NewNode().assert_is_op("elementwise_add");
- *     // Create the variable PDNodes.
- *     MUL_out = PDPattern.NewNode().assert_is_op_output("mul") \
- *                                  .assert_is_op_input("elementwise_add") \
- *                                  .AsIntermediate();
- *     // Add relations.
- *     MUL->LinksTo({MUL_out});
- *     MUL_out->LinksTo({ELE});
- *
- * One can add more specific asserts for PDNodes or edges, both the Operator
- * and Variable Nodes can be ruled in PDNode.assert_more(...).
- *
- * PDPattern can record the general patterns, such as the pattern represents
- *   - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place.
- *   - Ops whose inputs and outputs share the same variables
- */
-class PDPattern {
- public:
-  using edge_t = std::pair<PDNode*, PDNode*>;
-
-  void AddEdge(PDNode* a, PDNode* b);
-
-  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
-  PDNode* NewNode(const std::string& name = NewID());
-  PDNode* NewNode(const std::string& prefix, const std::string& name) {
-    return NewNode(prefix + "/" + name);
-  }
-  PDNode* RetrieveNode(const std::string& id) const;
-
-  const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
-  const std::vector<edge_t>& edges() const { return edges_; }
-
-  std::string DotString() const;
-
- private:
-#ifdef PADDLE_WITH_TESTING
-  FRIEND_TEST(PDPattern, AddEdge);
-  FRIEND_TEST(PDPattern, NewNode);
-#endif
-
-  static std::string NewID() { return "pdnode-" + std::to_string(id_++); }
-
-  std::vector<std::unique_ptr<PDNode>> nodes_;
-  std::vector<edge_t> edges_;
-  std::unordered_map<std::string, PDNode*> node_map_;
-  static size_t id_;
-};
-
-/*
- * GraphPatternDetector helps to detect the specific patterns in the graph.
- * Input a pattern, output a list of the matched subgraphs/nodes.
- * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.).
- *
- * The algorithm has three phases:
- *   1. Mark the nodes that match the defined PDNodes in a PDPattern,
- *   2. Extend a PDNode to subgraphs by deducing the connection relation defined
- *      in PAPattern(the edges),
- *   3. Get the filtered subgraphs and treat them with a pre-defined handler.
- *
- * Usage:
- *    // Create a detector
- *    GraphPatternDetector detector;
- *    // Define the detector's pattern, by adding PDNode and define the edges.
- *    auto* node0 = detector.mutable_pattern().AddNode(...)
- *    auto* node1 = detector.mutable_pattern().AddNode(...)
- *    node0->teller = some lambda.
- *    node1->teller = some lambda.
- *    detector.mutable_pattern().AddEdge(node0, node1);
- *    // Create an handler, to define the behavior of treating the filtered
- *    // subgraphs that comply with the patterns.
- *    GraphPatternDetector::handle_t handler = some labmda
- *    // Execute the detector.
- *    detector(&graph, handler);
- */
-class GraphPatternDetector {
- public:
-  using subgraph_t = std::unordered_map<PDNode*, Node*>;
-
-  // Operate on the detected pattern.
-  using handle_t =
-      std::function<void(const subgraph_t& /*hitted pattern*/, Graph*)>;
-
-  void operator()(Graph* graph, handle_t handler);
-
-  const PDPattern& pattern() const { return pattern_; }
-  PDPattern* mutable_pattern() { return &pattern_; }
-
- private:
-  // Mark the nodes that fits the pattern.
-  bool MarkPDNodesInGraph(const ir::Graph& graph);
-
-  // Detect all the pattern and output the hit records.
-  std::vector<subgraph_t> DetectPatterns();
-
-  // Remove duplicate patterns.
-  void UniquePatterns(std::vector<subgraph_t>* subgraphs);
-
-  // Remove overlapped match subgraphs, when overlapped, keep the previous one.
-  // The intermediate PDNodes will be removed, so can't shared by multiple
-  // patterns.
-  void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
-
-  // Validate whether the intermediate nodes are linked by external nodes.
-  void ValidateByNodeRole(std::vector<subgraph_t>* subgraphs);
-
-#ifdef PADDLE_WITH_TESTING
-  FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph);
-  FRIEND_TEST(GraphPatternDetecter, DetectPatterns);
-#endif
-
- private:
-  using hit_rcd_t =
-      std::pair<Node* /*node in graph*/, PDNode* /*node in pattern*/>;
-  PDPattern pattern_;
-  std::unordered_map<const PDNode*, std::unordered_set<Node*>> pdnodes2nodes_;
-};
-
-// some helper methods.
-
-// Tell if a var links to an Op
-bool VarLinksToOp(Node* node, const std::string& op_type);
-
-// Tell if an op links to a var
-bool VarLinksFromOp(Node* node, const std::string& op_type);
-
-// Check whether a var node is a op node's nth input.
-bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);
-
-// Check whether the op node has input of given name.
-bool HasInput(Node* op, const std::string& argument);
-
-// Tell whether a var node is a op node's nth output.
-bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
-
-// Graph safely remove some nodes, will automatically clean up the edges.
-void GraphSafeRemoveNodes(Graph* graph,
-                          const std::unordered_set<const Node*>& nodes);
-
-// Some pre-defined patterns those can be reused in multiple passes.
-// The related Fluid Layer or Op should be one pattern here for better re-usage
-// across different fusion.
-namespace patterns {
-
-struct KeyCounter {
-  static KeyCounter& Instance() {
-    static KeyCounter x;
-    return x;
-  }
-
-  int IncCounter(const std::string& key) { return dic_[key]++; }
-
- private:
-  std::unordered_map<std::string, size_t> dic_;
-};
-
-// Generate a unique PDNode's name with name_scope and id.
-// The format is {name_scope}/{repr}/{id}/{name}
-static std::string PDNodeName(const std::string& name_scope,
-                              const std::string& repr, size_t id,
-                              const std::string& name) {
-  return string::Sprintf("%s/%s/%d/%s", name_scope, repr, id, name);
-}
-// Generate a unique PDNode's name.
-// The format is {name_scope}/{repr}/{id}
-static std::string PDNodeName(const std::string& name_scope,
-                              const std::string& repr) {
-  return string::Sprintf("%s/%s/%d", name_scope, repr,
-                         KeyCounter::Instance().IncCounter(repr));
-}
-// Generate a unique key. It can be used for a universally unique temporary
-// name.
-// The format is {repr}/{id}
-static std::string UniqueKey(const std::string& repr) {
-  return string::Sprintf("%s/%d", repr,
-                         KeyCounter::Instance().IncCounter(repr));
-}
-
-// Declare a PDNode in a pattern, will create two methods:
-// std::string xxx_repr(); return this PDNode's string id.
-// PDNode* xxx_n(); return the corresponding PDNode.
-#define PATTERN_DECL_NODE(name__)                        \
-  std::string name__##_repr() const {                    \
-    return PDNodeName(name_scope_, repr_, id_, #name__); \
-  }                                                      \
-  PDNode* name__##_n() const { return pattern->RetrieveNode(name__##_repr()); }
-
-// Get an ir::Node* from the matched subgraph.
-// var: variable.
-// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition.
-// pat: the pattern object.
-#define GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat)                    \
-  PADDLE_ENFORCE(subgraph.count(pat.arg##_n()),                     \
-                 "Node not found for PDNode %s", pat.arg##_repr()); \
-  Node* var = subgraph.at(pat.arg##_n());                           \
-  PADDLE_ENFORCE(var, "node %s not exists in the sub-graph", #arg)
-
-// The base class of all the patterns.
-struct PatternBase {
-  PatternBase(PDPattern* pattern, const std::string& name_scope,
-              const std::string& repr)
-      : pattern(pattern),
-        name_scope_(name_scope),
-        repr_(repr),
-        id_(KeyCounter::Instance().IncCounter(repr)) {}
-
-  PDPattern* pattern;
-
- protected:
-  std::string name_scope_;
-  std::string repr_;
-  size_t id_;
-};
-
-// Conv with batch norm
-// op: conv + (elementwise_add +) batch_norm
-// named nodes:
-// conv_weight, conv_out, conv,
-// bn_x, bn_scale, bn_bias, bn_mean,  bn_variance,
-// bn_batch_norm, bn_y, bn_mean_out, bn_variance_out,
-// bn_saved_mean, bn_saved_variance
-struct ConvBN : public PatternBase {
-  ConvBN(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_bn") {}
-
-  PDNode* operator()(PDNode* conv_input, bool with_eltwise_add);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(conv);
-  PATTERN_DECL_NODE(batch_norm);
-  PATTERN_DECL_NODE(eltwise);  // ELEMENTWISE_ADD
-  // CONV inputs
-  PATTERN_DECL_NODE(conv_weight);  // Filter
-  // CONV outputs
-  PATTERN_DECL_NODE(conv_out);  // tmp
-  // ELTWISE inputs
-  PATTERN_DECL_NODE(eltwise_y_in);
-  // ELTWISE outputs
-  PATTERN_DECL_NODE(eltwise_out);  // tmp
-  // BN inputs
-  PATTERN_DECL_NODE(bn_scale);
-  PATTERN_DECL_NODE(bn_bias);
-  PATTERN_DECL_NODE(bn_mean);
-  PATTERN_DECL_NODE(bn_variance);
-  // BN outputs
-  PATTERN_DECL_NODE(bn_out);  // Out
-  PATTERN_DECL_NODE(bn_mean_out);
-  PATTERN_DECL_NODE(bn_variance_out);
-  PATTERN_DECL_NODE(bn_saved_mean);
-  PATTERN_DECL_NODE(bn_saved_variance);
-};
-
-// Conv with Activation
-// op: conv + activation
-// named nodes:
-// conv_input, conv_weight,
-// conv_out, conv,
-// activation_out, activation
-struct ConvActivation : public PatternBase {
-  ConvActivation(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_activation") {}
-
-  PDNode* operator()(PDNode* conv_input, std::string conv_type = "conv2d",
-                     std::string activation_type = "relu");
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(conv);
-  PATTERN_DECL_NODE(activation);
-  // declare variable node's name
-  PATTERN_DECL_NODE(conv_weight);
-  PATTERN_DECL_NODE(conv_out);
-  PATTERN_DECL_NODE(activation_out);
-};
-
-// SEQCONV with Elementwise_Add ReLU
-// op: seqconv + elementwise_add + relu
-// named nodes:
-// seqconv_input, seqconv_weight,
-// seqconv_out, seqconv,
-// elementwise_add_bias, elementwise_add_out, elementwise_add
-// relu_out, relu
-struct SeqConvEltAddRelu : public PatternBase {
-  SeqConvEltAddRelu(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "seqconv_eltadd_relu") {}
-
-  PDNode* operator()(PDNode* seqconv_input);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(seqconv);
-  PATTERN_DECL_NODE(eltadd);
-  PATTERN_DECL_NODE(relu);
-  // declare variable node's name
-  PATTERN_DECL_NODE(seqconv_weight);
-  PATTERN_DECL_NODE(seqconv_out);
-  PATTERN_DECL_NODE(eltadd_bias);
-  PATTERN_DECL_NODE(eltadd_out);
-  PATTERN_DECL_NODE(relu_out);
-};
-
-// FC with bias
-// op: mul + elementwise_add
-// named nodes:
-// mul, elementwise_add
-// w, mul_out, bias, fc_out
-struct FC : public PatternBase {
-  FC(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "fc") {}
-
-  PDNode* operator()(PDNode* x, bool with_bias, bool with_relu);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(fc);
-  PATTERN_DECL_NODE(mul);
-  PATTERN_DECL_NODE(elementwise_add);
-  PATTERN_DECL_NODE(relu);
-  // declare variable node's name
-  PATTERN_DECL_NODE(w);
-  PATTERN_DECL_NODE(mul_out);  // (x,w) -> mul_out
-  PATTERN_DECL_NODE(bias);
-  PATTERN_DECL_NODE(elementwise_add_out);
-  PATTERN_DECL_NODE(relu_out);
-};
-
-// MKL-DNN's FC with bias
-// op: fc
-// named node:
-// fc
-// w, bias, output
-struct FCMKLDNN : public PatternBase {
-  FCMKLDNN(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "fc_mkldnn") {}
-
-  PDNode* operator()(PDNode* x, bool with_bias);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(fc);
-  // declare variable node's name
-  PATTERN_DECL_NODE(weights);
-  PATTERN_DECL_NODE(bias);
-  PATTERN_DECL_NODE(output);
-};
-
-// Embedding
-struct Embedding : public PatternBase {
-  Embedding(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "embedding") {}
-
-  PDNode* operator()(PDNode* x);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(lookup_table);
-  // Inputs
-  //
-  PATTERN_DECL_NODE(Ids);
-  PATTERN_DECL_NODE(W);  // embeddings
-  // Outputs
-  PATTERN_DECL_NODE(Out);
-};
-
-struct LSTM : public PatternBase {
-  LSTM(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "lstm") {}
-
-  PDNode* operator()(PDNode* x);
-
-  // Operators
-  PATTERN_DECL_NODE(lstm);
-
-  // Inputs
-  PATTERN_DECL_NODE(Input);
-  PATTERN_DECL_NODE(H0);
-  PATTERN_DECL_NODE(C0);
-  PATTERN_DECL_NODE(Weight);
-  PATTERN_DECL_NODE(Bias);
-
-  // Outputs
-  PATTERN_DECL_NODE(Hidden);
-  PATTERN_DECL_NODE(Cell);
-  PATTERN_DECL_NODE(BatchGate);
-  PATTERN_DECL_NODE(BatchCellPreAct);
-};
-
-struct GRU : public PatternBase {
-  GRU(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "gru") {}
-
-  PDNode* operator()(PDNode* x);
-
-  // Operators
-  PATTERN_DECL_NODE(gru);
-
-  // Inputs
-  PATTERN_DECL_NODE(Bias);
-  PATTERN_DECL_NODE(Weight);
-
-  // Outputs
-  PATTERN_DECL_NODE(BatchGate);
-  PATTERN_DECL_NODE(BatchResetHiddenPrev);
-  PATTERN_DECL_NODE(BatchHidden);
-  PATTERN_DECL_NODE(Hidden);
-};
-
-// The following patterns are used to fuse elewise_add and act
-// formula: act(ele_add(x, y))
-// op: elementwise_add + act
-// named nodes: elementwise_add, act
-//              ele_x, ele_y, elewise_add_out, act_out
-struct ElewiseAddAct : public PatternBase {
-  ElewiseAddAct(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "elewise_add_act") {}
-
-  PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(ele_add);
-  PATTERN_DECL_NODE(act);
-  // declare variable node's name
-  PATTERN_DECL_NODE(elewise_add_out);
-  PATTERN_DECL_NODE(ele_y);
-  PATTERN_DECL_NODE(act_out);
-};
-
-// formula: ele_add(x, act(y))
-// op: elementwise_add + act
-// named nodes: elementwise_add, act
-//              act_in, act_out, ele_x, elewise_add_out
-struct ActElewiseAdd : public PatternBase {
-  ActElewiseAdd(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "act_elewise_add") {}
-
-  PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(act);
-  PATTERN_DECL_NODE(ele_add);
-  // declare variable node's name
-  PATTERN_DECL_NODE(act_out);
-  PATTERN_DECL_NODE(ele_x);
-  PATTERN_DECL_NODE(elewise_add_out);
-};
-
-// the backward of act(ele_add(x, y))
-// the act is inplace.
-// op: elementwise_add_grad + act_grad
-// named nodes: elementwise_add_grad, act_grad
-//              act_out, act_out_g, ele_y, d_itermediate_out, d_ele_x, d_ele_y
-struct ElewiseAddActInplaceGrad : public PatternBase {
-  ElewiseAddActInplaceGrad(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "elewise_add_act_grad1") {}
-
-  // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
-  // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
-  PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(act_grad);
-  PATTERN_DECL_NODE(ele_add_grad);
-  // declare variable node's name
-  PATTERN_DECL_NODE(act_out);
-  PATTERN_DECL_NODE(d_itermediate_out);
-  PATTERN_DECL_NODE(d_ele_x);
-  PATTERN_DECL_NODE(d_ele_y);
-  PATTERN_DECL_NODE(ele_y);
-};
-
-// Conv with Elementwise_add as bias
-// op: conv + elementwise_add
-// named nodes:
-// conv_input, conv_weight,
-// conv_out, conv,
-// eltwise_bias, eltwise_out,
-// elementwise_add
-struct ConvBias : public PatternBase {
-  ConvBias(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_bias") {}
-  PDNode* operator()(PDNode* conv_input, std::string conv_type = "conv2d");
-  // declare operator node's name
-  PATTERN_DECL_NODE(conv);
-  PATTERN_DECL_NODE(eltwise);
-  // declare variable node's name
-  PATTERN_DECL_NODE(conv_weight);
-  PATTERN_DECL_NODE(conv_out);
-  PATTERN_DECL_NODE(eltwise_bias);
-  PATTERN_DECL_NODE(eltwise_out);
-};
-
-// Convolution op
-// Forward pass for convolution.
-// conv_input, conv_bias and conv_filter are inputs.
-// conv_output is a result of the operator.
-// residual_data is data used by skip connection.
-// If residual connection fusion is on, the formula is:
-// conv_output = conv_op(conv_filter, conv_input, conv_bias)
-//             + conv_residual_data
-// If the fusion is off, conv_residual_data is not added.
-struct Conv : public PatternBase {
-  Conv(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "convolution") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_input);
-  PATTERN_DECL_NODE(conv_filter);
-  PATTERN_DECL_NODE(conv_residual_data);
-  PATTERN_DECL_NODE(conv_output);
-};
-
-// Convolution op with residual data
-struct ConvResidual : public PatternBase {
-  ConvResidual(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_residual") {}
-
-  PDNode* operator()(bool with_residual_data);
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_input);
-  PATTERN_DECL_NODE(conv_filter);
-  PATTERN_DECL_NODE(conv_residual_data);
-  PATTERN_DECL_NODE(conv_output);
-};
-
-// Pool op
-// Forward pass for pooling.
-// pool_input is the input.
-// pool_output is a result of the operator.
-struct Pool : public PatternBase {
-  Pool(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "pooling") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(pool_op);
-  PATTERN_DECL_NODE(pool_input);
-  PATTERN_DECL_NODE(pool_output);
-};
-
-// ElementwiseAdd used in residual connections.
-// y_var is used and convolution output.
-// The operator is removed, when residual
-// connection fusion is on.
-struct ElementwiseAdd : public PatternBase {
-  ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "elementwise_add") {}
-
-  PDNode* operator()(PDNode* x_var, PDNode* y_var);
-
-  PATTERN_DECL_NODE(elementwise_add_op);
-  PATTERN_DECL_NODE(elementwise_add_x);
-  PATTERN_DECL_NODE(elementwise_add_y);
-  PATTERN_DECL_NODE(elementwise_add_out);
-};
-
-// Concat op
-// Forward pass for concat.
-// concat_out is a result of the operator.
-struct Concat : public PatternBase {
-  Concat(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "concat") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(concat_op);
-  PATTERN_DECL_NODE(concat_out);
-};
-
-// Concat + ReLU
-// named nodes:
-// concat_op, concat_out, relu_op, relu_out
-struct ConcatReLU : public PatternBase {
-  ConcatReLU(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "concat_relu") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(concat_op);
-  PATTERN_DECL_NODE(concat_out);
-  PATTERN_DECL_NODE(relu_op);
-  PATTERN_DECL_NODE(relu_out);
-};
-
-// Conv + Concat + ReLU
-// named nodes:
-// conv_op, conv_out
-// concat_op, concat_out, relu_op, relu_out
-struct ConvConcatReLU : public PatternBase {
-  ConvConcatReLU(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_concat_relu") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_out);
-  PATTERN_DECL_NODE(concat_op);
-  PATTERN_DECL_NODE(concat_out);
-  PATTERN_DECL_NODE(relu_op);
-  PATTERN_DECL_NODE(relu_out);
-};
-
-// Conv + Requant
-// named nodes:
-// conv_op, conv_out
-// requant_op, requant_out
-struct ConvRequant : public PatternBase {
-  ConvRequant(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_requant") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_out);
-
-  PATTERN_DECL_NODE(requant_op);
-  PATTERN_DECL_NODE(requant_out);
-};
-
-// Conv + Dequant
-// named nodes:
-// conv_op, conv_out
-// dequant_op, dequant_out
-struct ConvDequant : public PatternBase {
-  ConvDequant(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_dequant") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_out);
-
-  PATTERN_DECL_NODE(dequant_op);
-  PATTERN_DECL_NODE(dequant_out);
-};
-
-// PriorBox operator
-// operator: prior_box_op
-// inputs: prior_box_input, prior_box_image
-// outputs: prior_box_boxes, prior_box_variances
-struct PriorBox : public PatternBase {
-  PriorBox(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "PriorBox") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(prior_box_op);
-  PATTERN_DECL_NODE(prior_box_input);
-  PATTERN_DECL_NODE(prior_box_image);
-  PATTERN_DECL_NODE(prior_box_boxes);
-  PATTERN_DECL_NODE(prior_box_variances);
-};
-
-// Conv + ElementwiseAdd + an activation
-// This pattern can futher fuse the conv related ops after the conv+bn fusion.
-struct ConvElementwiseaddAct : public PatternBase {
-  ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {}
-
-  PDNode* operator()(PDNode* conv_in);
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_out);
-  PATTERN_DECL_NODE(conv_filter);
-
-  PATTERN_DECL_NODE(elementwise_add_op);
-  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
-  PATTERN_DECL_NODE(elementwise_add_out);
-
-  PATTERN_DECL_NODE(act_op);
-  PATTERN_DECL_NODE(act_out);
-};
-
-// Conv + ElementwiseAdd + ElementwiseAdd + Activation
-struct ConvElementwiseadd2Act : public PatternBase {
-  ConvElementwiseadd2Act(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope,
-                    "conv_elementwiseadd2_elementwiseadd_act") {}
-
-  PDNode* operator()(PDNode* conv_in);
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_filter);
-  PATTERN_DECL_NODE(conv_out);
-
-  PATTERN_DECL_NODE(elementwise_add_op);
-  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
-  PATTERN_DECL_NODE(elementwise_add_out);
-
-  PATTERN_DECL_NODE(elementwise_add_op_1);
-  PATTERN_DECL_NODE(elementwise_add_in_y_1);  // input
-  PATTERN_DECL_NODE(elementwise_add_out_1);
-
-  PATTERN_DECL_NODE(act_op);
-  PATTERN_DECL_NODE(act_out);
-};
-
-// Conv + ElementwiseAdd
-// This pattern should be used after ConvElementwiseadd2Act or
-// ConvElementwiseadd pass
-struct ConvElementwiseadd : public PatternBase {
-  ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_elementwiseadd") {}
-
-  PDNode* operator()(PDNode* conv_in);
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_out);
-  PATTERN_DECL_NODE(conv_filter);
-
-  PATTERN_DECL_NODE(elementwise_add_op);
-  PATTERN_DECL_NODE(elementwise_add_in_y);
-  PATTERN_DECL_NODE(elementwise_add_out);
-};
-
-// Conv with affine_channel
-// op: conv + (elementwise_add +) affine_channel
-// named nodes:
-// conv_weight, conv_out, conv,
-// ac_x, ac_scale, ac_bias
-// affine_channel, ac_out
-struct ConvAffineChannel : public PatternBase {
-  ConvAffineChannel(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_affine_channel") {}
-
-  PDNode* operator()(PDNode* conv_input, bool with_eltwise_add);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(conv);
-  PATTERN_DECL_NODE(affine_channel);
-  PATTERN_DECL_NODE(eltwise);  // ELEMENTWISE_ADD
-  // CONV inputs
-  PATTERN_DECL_NODE(conv_weight);  // Filter
-  // CONV outputs
-  PATTERN_DECL_NODE(conv_out);  // tmp
-  // ELTWISE inputs
-  PATTERN_DECL_NODE(eltwise_y_in);
-  // ELTWISE outputs
-  PATTERN_DECL_NODE(eltwise_out);  // tmp
-
-  // AC(Affine_Channel) inputs
-  PATTERN_DECL_NODE(ac_scale);
-  PATTERN_DECL_NODE(ac_bias);
-  // AC outputs
-  PATTERN_DECL_NODE(ac_out);  // Out
-};
-
-// Dequantize + Quantize + anyOP
-// This pattern is used for squashing the dequantize-quantize pairs.
-struct DequantQuantAny : public PatternBase {
-  DequantQuantAny(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "dequant_quant_any") {}
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(dequant_in);
-  PATTERN_DECL_NODE(dequant_op);
-  PATTERN_DECL_NODE(dequant_out);
-  PATTERN_DECL_NODE(quant_op);
-  PATTERN_DECL_NODE(quant_out);
-  PATTERN_DECL_NODE(next_op);
-};
-
-// Dequantize + anyOP
-// This quantize is used for getting number of ops the Dequantize's
-// output is an input to.
-struct DequantAny : public PatternBase {
-  DequantAny(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "dequant_any") {}
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(dequant_op);
-  PATTERN_DECL_NODE(dequant_out);
-  PATTERN_DECL_NODE(next_op);
-};
-
-struct TransposeFlattenConcat : public PatternBase {
-  TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
-
-  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
-
-  std::string GetNodeName(const std::string& op_type) {
-    return PDNodeName(name_scope_, repr_, id_, op_type);
-  }
-
-  PDNode* GetPDNode(const std::string& op_type) {
-    return pattern->RetrieveNode(GetNodeName(op_type));
-  }
-};
-
-struct AnakinDetectionPattern : public PatternBase {
-  AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
-
-  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times,
-                     std::string priorbox_type, bool is_reshape);
-
-  std::string GetNodeName(const std::string& op_type) {
-    return PDNodeName(name_scope_, repr_, id_, op_type);
-  }
-
-  PDNode* GetPDNode(const std::string& op_type) {
-    return pattern->RetrieveNode(GetNodeName(op_type));
-  }
-};
-
-struct FillConstantElementWiseMulFuse : public PatternBase {
-  FillConstantElementWiseMulFuse(PDPattern* pattern,
-                                 const std::string& name_scope)
-      : PatternBase(pattern, name_scope,
-                    "anakin_fillconstant_elementwisemul_fuse") {}
-
-  PDNode* operator()(PDNode* elementwise_op_input);
-
-  // declare operator node's name
-  PATTERN_DECL_NODE(fill_constant);
-  PATTERN_DECL_NODE(fill_constant_out);
-  PATTERN_DECL_NODE(elementwise_mul);
-  PATTERN_DECL_NODE(elementwise_mul_out);
-};
-
-struct QuantDequantOpFuse : public PatternBase {
-  QuantDequantOpFuse(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "quant_dequant_fuse") {}
-
-  void operator()(PDNode* quant_op_input, const std::string& op_name,
-                  const std::string& weight_name, int times,
-                  const std::string& quant_type,
-                  const std::string& dequant_type);
-
-  std::string GetNodeName(const std::string& op_type) {
-    return PDNodeName(name_scope_, repr_, id_, op_type);
-  }
-
-  PDNode* GetPDNode(const std::string& op_type) {
-    return pattern->RetrieveNode(GetNodeName(op_type));
-  }
-};
-
-struct ShuffleChannelPattern : public PatternBase {
-  ShuffleChannelPattern(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "shufflechannel_pattern") {}
-
-  void operator()(PDNode* reshape1_in);
-
-  PATTERN_DECL_NODE(reshape1_op);
-  PATTERN_DECL_NODE(reshape1_out);
-
-  PATTERN_DECL_NODE(transpose_op);
-  PATTERN_DECL_NODE(transpose_out);
-  PATTERN_DECL_NODE(reshape2_op);
-  PATTERN_DECL_NODE(reshape2_out);
-};
-
-struct DeleteQuantDequantOpPattern : public PatternBase {
-  DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {}
-
-  void operator()();
-
-  PATTERN_DECL_NODE(any_op_out);
-  PATTERN_DECL_NODE(quant_dequant_op_inscale);
-  PATTERN_DECL_NODE(quant_dequant_op);
-  PATTERN_DECL_NODE(quant_dequant_op_outscale);
-  PATTERN_DECL_NODE(quant_dequant_op_out);
-  PATTERN_DECL_NODE(any_op2);
-};
-
-}  // namespace patterns
-
-// Link two ir::Nodes from each other.
-#define IR_NODE_LINK_TO(a, b) \
-  a->outputs.push_back(b);    \
-  b->inputs.push_back(a);
-
-// Set the out_var as the output of the op
-#define IR_OP_VAR_LINK(op, out_var) \
-  op->outputs.push_back(out_var);   \
-  out_var->inputs.clear();          \
-  out_var->inputs.push_back(op);
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
deleted file mode 100644
index 6c466fb21fb46e09961dc874e9e39655f83d17c6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+++ /dev/null
@@ -1,206 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void BuildGraph(Graph* g) {
-  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
-  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
-  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
-  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
-  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
-  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
-  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
-  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
-  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
-
-  // o1->v1->o2
-  o1->outputs.push_back(v1);
-  o2->inputs.push_back(v1);
-  v1->inputs.push_back(o1);
-  v1->outputs.push_back(o2);
-  // o2->v2->o3
-  // o2->v2->o4
-  o2->outputs.push_back(v2);
-  o3->inputs.push_back(v2);
-  o4->inputs.push_back(v2);
-  v2->inputs.push_back(o2);
-  v2->outputs.push_back(o3);
-  v2->outputs.push_back(o4);
-  // o2->v3->o5
-  o2->outputs.push_back(v3);
-  o5->inputs.push_back(v3);
-  v3->inputs.push_back(o2);
-  v3->outputs.push_back(o5);
-  // o3-v4->o5
-  o3->outputs.push_back(v4);
-  o5->inputs.push_back(v4);
-  v4->inputs.push_back(o3);
-  v4->outputs.push_back(o5);
-}
-
-TEST(PDPattern, NewNode) {
-  PDPattern x;
-  auto* n = x.NewNode([](Node* x) { return true; });
-  ASSERT_TRUE(n);
-  ASSERT_EQ(x.nodes_.size(), 1UL);
-}
-
-TEST(PDPattern, AddEdge) {
-  PDPattern x;
-  auto* a = x.NewNode([](Node* x) { return true; });
-  auto* b = x.NewNode([](Node* x) { return true; });
-  ASSERT_TRUE(a);
-  ASSERT_TRUE(b);
-  x.AddEdge(a, b);
-  ASSERT_EQ(x.nodes_.size(), 2UL);
-  ASSERT_EQ(x.edges_.size(), 1UL);
-  ASSERT_EQ(x.edges_.front().first, a);
-  ASSERT_EQ(x.edges_.front().second, b);
-
-  ASSERT_EQ(x.nodes().size(), 2UL);
-  ASSERT_EQ(x.edges().size(), 1UL);
-  ASSERT_EQ(x.edges().front().first, a);
-  ASSERT_EQ(x.edges().front().second, b);
-}
-
-TEST(GraphPatternDetecter, MarkPDNodesInGraph) {
-  GraphPatternDetector x;
-  // mark o2, o3, v2
-
-  // The pattern is a graph:
-  //   o2(a node named o2) -> v2(a node named v2)
-  //   v2 -> o3(a node named o3)
-  auto* o2 = x.pattern_.NewNode([](Node* node) {
-    // The teller can be any condition, such as op type, or variable's shape.
-    return node && node->Name() == "op2" && node->IsOp();
-  });
-  auto* o3 = x.pattern_.NewNode([](Node* node) {
-    // The teller can be any condition, such as op type, or variable's shape.
-    return node && node->Name() == "op3" && node->IsOp();
-  });
-  auto* v2 = x.pattern_.NewNode([](Node* node) {
-    // The teller can be any condition, such as op type, or variable's shape.
-    return node && node->Name() == "var2" && node->IsVar();
-  });
-
-  ASSERT_FALSE(o2->Tell(nullptr));
-  ASSERT_FALSE(o3->Tell(nullptr));
-  ASSERT_FALSE(v2->Tell(nullptr));
-
-  x.pattern_.AddEdge(o2, v2);
-  x.pattern_.AddEdge(v2, o3);
-
-  ASSERT_EQ(x.pattern_.edges().size(), 2UL);
-  ASSERT_EQ(x.pattern_.edges()[0].first, o2);
-  ASSERT_EQ(x.pattern_.edges()[0].second, v2);
-  ASSERT_EQ(x.pattern_.edges()[1].first, v2);
-  ASSERT_EQ(x.pattern_.edges()[1].second, o3);
-
-  ProgramDesc program;
-  Graph graph(program);
-  BuildGraph(&graph);
-
-  x.MarkPDNodesInGraph(graph);
-
-  ASSERT_EQ(x.pdnodes2nodes_.size(), 3UL);
-
-  auto subgraphs = x.DetectPatterns();
-  ASSERT_EQ(subgraphs.size(), 1UL);
-}
-
-TEST(GraphPatternDetecter, MultiSubgraph) {
-  ProgramDesc program;
-  Graph graph(program);
-  BuildGraph(&graph);
-
-  GraphPatternDetector x;
-
-  // The pattern is a graph:
-  //   op -> var
-  auto* any_op = x.mutable_pattern()->NewNode(
-      [](Node* node) {
-        return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
-      },
-      "OP0");
-  auto* any_var = x.mutable_pattern()
-                      ->NewNode([](Node* node) { return node->IsVar(); }, "VAR")
-                      ->AsIntermediate();
-  auto* any_op1 = x.mutable_pattern()->NewNode(
-      [](Node* node) { return node->IsOp(); }, "OP1");
-
-  x.mutable_pattern()->AddEdge(any_op, any_var);
-  x.mutable_pattern()->AddEdge(any_var, any_op1);
-
-  int count = 0;
-  GraphPatternDetector::handle_t handle = [&](
-      const GraphPatternDetector::subgraph_t& s, Graph* g) {
-    LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
-              << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
-    count++;
-  };
-
-  x(&graph, handle);
-
-  // 1. Detect op3 -> var4 -> op5
-  // 2. Detect op2 -> var2 -> op3
-  // 3. Detect op2 -> var2 -> op4
-  // 4. Detect op2 -> var3 -> op5
-  // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
-  ASSERT_GE(count, 1);
-  ASSERT_LE(count, 2);
-}
-
-TEST(GraphPatternDetector, IntermediateCheck) {
-  ProgramDesc program;
-  Graph graph(program);
-  BuildGraph(&graph);
-
-  // o2->v2->o3
-  // o2->v2->o4
-  // check o2+o3 fuse, should fail because v2 also link to o4.
-  GraphPatternDetector detector;
-  auto* op2 = detector.mutable_pattern()->NewNode(
-      [](Node* x) { return x && x->IsOp() && x->Name() == "op2"; }, "op2");
-  auto* op3 = detector.mutable_pattern()->NewNode(
-      [](Node* x) { return x && x->IsOp() && x->Name() == "op3"; }, "op3");
-  auto* v2 =
-      detector.mutable_pattern()
-          ->NewNode(
-              [](Node* x) { return x && x->IsVar() && x->Name() == "var2"; },
-              "var2")
-          ->AsIntermediate();
-  v2->LinksFrom({op2}).LinksTo({op3});
-
-  int count = 0;
-  detector(&graph, [&](const GraphPatternDetector::subgraph_t& g,
-                       Graph* graph) { ++count; });
-  EXPECT_EQ(count, 0);
-
-  count = 0;
-  v2->AsInput();
-  detector(&graph, [&](const GraphPatternDetector::subgraph_t& g,
-                       Graph* graph) { ++count; });
-  ASSERT_EQ(count, 1);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_printer.h b/paddle/fluid/framework/ir/graph_printer.h
deleted file mode 100644
index 76b07f0d6530907e7b20253d6a2a744fd2e11362..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_printer.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <glog/logging.h>
-#include <fstream>
-#include <iosfwd>
-#include <memory>
-#include <ostream>
-#include <string>
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-constexpr char kGraphvizPath[] = "graph_viz_path";
-
-class SSAGraphPrinter {
- public:
-  virtual ~SSAGraphPrinter() {}
-  virtual void Print(const ir::Graph& graph, std::ostream& sout) const = 0;
-};
-
-class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
- public:
-  void Print(const ir::Graph& graph, std::ostream& sout) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
deleted file mode 100644
index 23a61b282c3d4ce5aa8b0a9d9ae106b34988ecdc..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-
-class NOP : public OperatorBase {
- public:
-  NOP(const std::string &type, const VariableNameMap &inputs,
-      const VariableNameMap &outputs, const AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const Scope &scope,
-               const platform::Place &place) const override {}
-};
-
-class SumOpMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "").AsDuplicable();
-    AddOutput("Out", "").AsDuplicable();
-    AddComment("");
-  }
-};
-
-class SumOpVarTypeInference : public VarTypeInference {
- public:
-  void operator()(InferVarTypeContext *ctx) const override {
-    auto &inputs = ctx->Input("X");
-    auto default_var_type = proto::VarType::SELECTED_ROWS;
-
-    bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
-          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
-        });
-    if (any_input_is_lod_tensor) {
-      default_var_type = proto::VarType::LOD_TENSOR;
-    }
-
-    auto out_var_name = ctx->Output("Out").front();
-    ctx->SetType(out_var_name, default_var_type);
-  }
-};
-
-class DummyOpMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "").AsDuplicable();
-    AddOutput("Out", "").AsDuplicable();
-    AddComment("");
-  }
-};
-
-class DummyOpVarTypeInference : public VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {}
-};
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
-                  paddle::framework::SumOpVarTypeInference);
-REGISTER_OPERATOR(dummy, paddle::framework::NOP, paddle::framework::SumOpMaker,
-                  paddle::framework::SumOpVarTypeInference);
-REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
-                  paddle::framework::SumOpMaker);
-
-namespace paddle {
-namespace framework {
-
-TEST(GraphTest, Basic) {
-  ProgramDesc prog;
-  auto *op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("sum");
-  op->SetInput("X", {"test_a", "test_b", "test_c"});
-  op->SetOutput("Out", {"test_out"});
-  op->SetAttr("op_role", 1);
-
-  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_out");
-
-  op->InferVarType(prog.MutableBlock(0));
-
-  ASSERT_EQ(proto::VarType::SELECTED_ROWS,
-            prog.MutableBlock(0)->Var("test_out")->GetType());
-
-  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR);
-  op->InferVarType(prog.MutableBlock(0));
-  ASSERT_EQ(proto::VarType::LOD_TENSOR,
-            prog.MutableBlock(0)->Var("test_out")->GetType());
-
-  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
-  std::vector<ir::Node *> nodes(g->Nodes().begin(), g->Nodes().end());
-  for (ir::Node *n : nodes) {
-    if (n->Name() == "sum") {
-      ASSERT_EQ(n->inputs.size(), 3UL);
-      ASSERT_EQ(n->outputs.size(), 1UL);
-    } else if (n->Name() == "test_a" || n->Name() == "test_b" ||
-               n->Name() == "test_c") {
-      ASSERT_EQ(n->inputs.size(), 0UL);
-      ASSERT_EQ(n->outputs.size(), 1UL);
-    } else if (n->Name() == "test_out") {
-      ASSERT_EQ(n->inputs.size(), 1UL);
-      ASSERT_EQ(n->outputs.size(), 0UL);
-    }
-  }
-  ASSERT_EQ(nodes.size(), 5UL);
-}
-
-TEST(GraphTest, WriteAfterRead) {
-  // void Test() {
-  ProgramDesc prog;
-  auto *op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("sum");
-  op->SetInput("X", {"a"});
-  op->SetOutput("Out", {"b"});
-  op->SetAttr("op_role", 1);
-
-  op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("dummy");
-  op->SetInput("X", {"c"});
-  op->SetOutput("Out", {"a"});
-  op->SetAttr("op_role", 1);
-
-  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
-
-  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
-  ir::Node *control_dep1 = nullptr;
-  ir::Node *control_dep2 = nullptr;
-  for (ir::Node *n : g->Nodes()) {
-    if (n->Name() == "sum") {
-      ASSERT_EQ(n->outputs[0]->Name(), "b");
-      ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
-      control_dep1 = n->outputs[1];
-      ASSERT_EQ(n->outputs.size(), 2);
-    }
-    if (n->Name() == "dummy") {
-      ASSERT_EQ(n->inputs[0]->Name(), "c");
-      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
-      control_dep2 = n->inputs[1];
-      ASSERT_EQ(n->inputs.size(), 2);
-    }
-  }
-  ASSERT_EQ(control_dep1, control_dep2);
-}
-
-TEST(GraphTest, WriteAfterWrite) {
-  // void Test() {
-  ProgramDesc prog;
-  auto *op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("sum");
-  op->SetInput("X", {"a"});
-  op->SetOutput("Out", {"b"});
-  op->SetAttr("op_role", 1);
-
-  op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("dummy");
-  op->SetInput("X", {"c"});
-  op->SetOutput("Out", {"b"});
-  op->SetAttr("op_role", 1);
-
-  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
-
-  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
-  ir::Node *control_dep1 = nullptr;
-  ir::Node *control_dep2 = nullptr;
-  for (ir::Node *n : g->Nodes()) {
-    if (n->Name() == "sum") {
-      ASSERT_EQ(n->outputs[0]->Name(), "b");
-      ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
-      ASSERT_EQ(n->outputs.size(), 2);
-      control_dep1 = n->outputs[1];
-    }
-    if (n->Name() == "dummy") {
-      ASSERT_EQ(n->inputs[0]->Name(), "c");
-      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
-      control_dep2 = n->inputs[1];
-      ASSERT_EQ(n->inputs.size(), 2);
-    }
-  }
-  ASSERT_NE(control_dep1, nullptr);
-  ASSERT_NE(control_dep2, nullptr);
-  ASSERT_EQ(control_dep1, control_dep2);
-}
-
-TEST(GraphTest, TestException) {
-  ProgramDesc prog;
-  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
-
-  bool not_met_exception = false;
-  try {
-    g->Erase("no_attr");
-  } catch (const platform::EnforceNotMet &e) {
-    not_met_exception = true;
-  }
-  ASSERT_TRUE(not_met_exception);
-
-  not_met_exception = false;
-  try {
-    g->CreateVarNode(nullptr);
-  } catch (const platform::EnforceNotMet &e) {
-    not_met_exception = true;
-  }
-  ASSERT_TRUE(not_met_exception);
-
-  not_met_exception = false;
-  try {
-    g->CreateOpNode(nullptr);
-  } catch (const platform::EnforceNotMet &e) {
-    not_met_exception = true;
-  }
-  ASSERT_TRUE(not_met_exception);
-
-  not_met_exception = false;
-  try {
-    g->RemoveNode(nullptr);
-  } catch (const platform::EnforceNotMet &e) {
-    not_met_exception = true;
-  }
-  ASSERT_TRUE(not_met_exception);
-
-  not_met_exception = false;
-  try {
-    g->AddNode(nullptr);
-    g->AddNode(nullptr);
-  } catch (const platform::EnforceNotMet &e) {
-    not_met_exception = true;
-  }
-  ASSERT_TRUE(not_met_exception);
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc
deleted file mode 100644
index b0d056f2c0f8286caadfbfed3b55b19fcef34402..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_to_program_pass.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
-
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void GraphToProgramPass::ApplyImpl(ir::Graph* graph) const {
-  // Remove the unneeded variables after memory optimization.
-  std::unordered_set<std::string> vars2remove;
-  if (graph->Has(kGraphToProgramVarsToRemove)) {
-    vars2remove = graph->Get<std::unordered_set<std::string>>(
-        kGraphToProgramVarsToRemove);
-    VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes";
-  }
-
-  ProgramDesc& program = Get<ProgramDesc>("program");
-
-  std::unique_ptr<proto::ProgramDesc> program_pb(
-      new proto::ProgramDesc(*program.Proto()));
-
-  auto block = program_pb->mutable_blocks(kRootBlockIndex);
-  block->set_idx(kRootBlockIndex);
-  block->clear_vars();
-  std::unordered_set<std::string> visited_vars;
-  for (ir::Node* n : graph->Nodes()) {
-    if (n->IsVar()) {
-      if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 &&
-          !vars2remove.count(n->Var()->Name())) {
-        visited_vars.insert(n->Var()->Name());
-        block->add_vars()->MergeFrom(*n->Var()->Proto());
-      }
-    }
-  }
-  block->clear_ops();
-
-  std::vector<ir::Node*> nodes;
-  if (Has(kGraphToProgramSortKind)) {
-    // Inference Memory Optimize relays on this branch.
-    int sort_kind = Get<int>(kGraphToProgramSortKind);
-    nodes = TopologyVarientSort(
-        *graph, static_cast<framework::ir::SortKind>(sort_kind));
-  } else {
-    nodes = TopologySortOperations(*graph);
-  }
-
-  for (ir::Node* n : nodes) {
-    if (!n->Op()) continue;
-
-    block->add_ops()->MergeFrom(*n->Op()->Proto());
-  }
-
-  program.CopyFrom(*program_pb);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(graph_to_program_pass, paddle::framework::ir::GraphToProgramPass);
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h
deleted file mode 100644
index 52c8f4e0fcafcd42647b323a20fee7c7cf167b3a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_to_program_pass.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-const char kGraphToProgramVarsToRemove[] =
-    "__graph_to_program_vars_to_remove__";
-const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
-
-class GraphToProgramPass : public Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
deleted file mode 100644
index 5ee6b8a5f1e4e7415adfac6b51e9d3ae8e3062a9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
-
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void BuildNoCircleGraph(Graph* g) {
-  OpDesc op1;
-  op1.SetType("op1");
-  OpDesc op2;
-  op2.SetType("op2");
-  OpDesc op3;
-  op3.SetType("op3");
-  OpDesc op4;
-  op4.SetType("op4");
-  OpDesc op5;
-  op5.SetType("op5");
-  VarDesc var1("var1");
-  VarDesc var2("var2");
-  VarDesc var3("var3");
-  VarDesc var4("var4");
-
-  ir::Node* o1 = g->CreateOpNode(&op1);
-  ir::Node* o2 = g->CreateOpNode(&op2);
-  ir::Node* o3 = g->CreateOpNode(&op3);
-  ir::Node* o4 = g->CreateOpNode(&op4);
-  ir::Node* o5 = g->CreateOpNode(&op5);
-  ir::Node* v1 = g->CreateVarNode(&var1);
-  ir::Node* v2 = g->CreateVarNode(&var2);
-  ir::Node* v3 = g->CreateVarNode(&var3);
-  ir::Node* v4 = g->CreateVarNode(&var4);
-
-  // o1->v1->o2
-  o1->outputs.push_back(v1);
-  o2->inputs.push_back(v1);
-  v1->inputs.push_back(o1);
-  v1->outputs.push_back(o2);
-  // o2->v2->o3
-  // o2->v2->o4
-  o2->outputs.push_back(v2);
-  o3->inputs.push_back(v2);
-  o4->inputs.push_back(v2);
-  v2->outputs.push_back(o3);
-  v2->outputs.push_back(o4);
-  v2->inputs.push_back(o2);
-  // o4->v3->o5
-  o4->outputs.push_back(v3);
-  o5->inputs.push_back(v3);
-  v3->inputs.push_back(o4);
-  v3->outputs.push_back(o5);
-  // o3-v4->o5
-  o3->outputs.push_back(v4);
-  o5->inputs.push_back(v4);
-  v4->inputs.push_back(o3);
-  v4->outputs.push_back(o5);
-}
-
-TEST(GraphToProgramPass, Basic) {
-  ProgramDesc prog;
-  std::unique_ptr<Graph> g(new Graph(prog));
-  BuildNoCircleGraph(g.get());
-
-  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
-      "graph_to_program_pass");
-
-  ProgramDesc compiled_prog;
-  pass->SetNotOwned<paddle::framework::ProgramDesc>("program", &compiled_prog);
-  pass->Apply(g.get());
-  std::vector<OpDesc*> ops = compiled_prog.Block(0).AllOps();
-  EXPECT_EQ(ops[0]->Type(), "op1");
-  EXPECT_EQ(ops[1]->Type(), "op2");
-  if (ops[2]->Type() == "op3") {
-    EXPECT_EQ(ops[3]->Type(), "op4");
-  } else if (ops[2]->Type() == "op4") {
-    EXPECT_EQ(ops[3]->Type(), "op3");
-  }
-  EXPECT_EQ(ops[4]->Type(), "op5");
-
-  std::unordered_set<std::string> vars;
-  for (VarDesc* v : compiled_prog.Block(0).AllVars()) {
-    vars.insert(v->Name());
-  }
-  EXPECT_TRUE(vars.find("var1") != vars.end());
-  EXPECT_TRUE(vars.find("var2") != vars.end());
-  EXPECT_TRUE(vars.find("var3") != vars.end());
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(graph_to_program_pass);
diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc
deleted file mode 100644
index 929d9edc34ffb92f468d5b7af54a0b8da4121543..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/graph_traits.h"
-
-#include <set>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-//
-// NodesDFSIterator
-//
-NodesDFSIterator::NodesDFSIterator(const std::vector<Node *> &source) {
-  for (auto *x : source) stack_.push(x);
-}
-
-NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept
-    : stack_(std::move(other.stack_)),
-      visited_(std::move(other.visited_)) {}
-
-NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other)
-    : stack_(other.stack_), visited_(other.visited_) {}
-
-Node &NodesDFSIterator::operator*() {
-  PADDLE_ENFORCE(!stack_.empty());
-  return *stack_.top();
-}
-
-NodesDFSIterator &NodesDFSIterator::operator++() {
-  PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range");
-  visited_.insert(stack_.top());
-  auto *cur = stack_.top();
-  stack_.pop();
-  for (auto *x : cur->outputs) {
-    if (!visited_.count(x)) {
-      stack_.push(x);
-    }
-  }
-  return *this;
-}
-bool NodesDFSIterator::operator==(const NodesDFSIterator &other) {
-  if (stack_.empty()) return other.stack_.empty();
-  if ((!stack_.empty()) && (!other.stack_.empty())) {
-    return stack_.top() == other.stack_.top();
-  }
-  return false;
-}
-
-NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
-  stack_ = other.stack_;
-  visited_ = other.visited_;
-  return *this;
-}
-Node *NodesDFSIterator::operator->() { return stack_.top(); }
-
-inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
-  return node.inputs.size() == n;
-}
-
-NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
-  PADDLE_ENFORCE(!source.empty(),
-                 "Start points of topological sorting should not be empty!");
-  // CHECK all the inputs' in-degree is 0
-  for (auto *node : source) {
-    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
-  }
-
-  std::unordered_set<Node *> visited;
-  std::set<Node *> to_visit{source.begin(), source.end()};
-
-  std::vector<Node *> inlink_visited;
-  while (!to_visit.empty()) {
-    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
-    for (auto *p : queue) {
-      inlink_visited.clear();
-
-      std::copy_if(p->inputs.begin(), p->inputs.end(),
-                   std::back_inserter(inlink_visited),
-                   [&](Node *x) -> bool { return visited.count(x) != 0; });
-
-      if (inlink_visited.size() == p->inputs.size()) {
-        sorted_.push_back(p);
-        for (auto *_ : p->outputs) {
-          if (!visited.count(_)) {
-            to_visit.insert(_);
-          }
-        }
-
-        to_visit.erase(p);
-        visited.insert(p);
-      }
-    }
-  }
-}
-
-NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
-    : sorted_(other.sorted_), cursor_(other.cursor_) {}
-
-Node &NodesTSIterator::operator*() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return *sorted_[cursor_];
-}
-
-NodesTSIterator &NodesTSIterator::operator++() {
-  if (++cursor_ >= sorted_.size()) {
-    sorted_.clear();
-    cursor_ = 0;
-  }
-  return *this;
-}
-NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
-  cursor_ = other.cursor_;
-  sorted_ = other.sorted_;
-  return *this;
-}
-
-bool NodesTSIterator::operator==(const NodesTSIterator &other) {
-  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
-}
-
-Node *NodesTSIterator::operator->() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return sorted_[cursor_];
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h
deleted file mode 100644
index f6772f9a37567c83c49bd44d551481edda1a74ae..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_traits.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <stack>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/node.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-template <typename IteratorT>
-class iterator_range {
-  IteratorT begin_, end_;
-
- public:
-  template <typename Container>
-  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
-
-  iterator_range(const IteratorT &begin, const IteratorT &end)
-      : begin_(begin), end_(end) {}
-
-  const IteratorT &begin() const { return begin_; }
-  const IteratorT &end() const { return end_; }
-};
-
-// DFS iterator on nodes.
-struct NodesDFSIterator
-    : public std::iterator<std::forward_iterator_tag, Node *> {
-  NodesDFSIterator() = default;
-  explicit NodesDFSIterator(const std::vector<Node *> &source);
-  NodesDFSIterator(NodesDFSIterator &&other) noexcept;
-  NodesDFSIterator(const NodesDFSIterator &other);
-
-  Node &operator*();
-  NodesDFSIterator &operator++();
-  // TODO(Superjomn) current implementation just compare the first
-  // element, need to compare the graph and all the elements in the queue and
-  // set.
-  NodesDFSIterator &operator=(const NodesDFSIterator &other);
-  bool operator==(const NodesDFSIterator &other);
-  bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
-  Node *operator->();
-
- private:
-  std::stack<Node *> stack_;
-  std::unordered_set<Node *> visited_;
-};
-
-// Topological sorting iterator on nodes.
-struct NodesTSIterator
-    : public std::iterator<std::forward_iterator_tag, Node *> {
-  NodesTSIterator() = default;
-  NodesTSIterator(const std::vector<Node *> &source);
-  NodesTSIterator(NodesTSIterator &&other)
-      : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
-    other.cursor_ = 0;
-  }
-  NodesTSIterator(const NodesTSIterator &other);
-
-  Node &operator*();
-  NodesTSIterator &operator++();
-  // TODO(Superjomn) current implementation just compare the first
-  // element, need to compare the graph and all the elements in the queue and
-  // set.
-  NodesTSIterator &operator=(const NodesTSIterator &other);
-  bool operator==(const NodesTSIterator &other);
-  bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
-  Node *operator->();
-
- private:
-  std::vector<Node *> sorted_;
-  size_t cursor_{0};
-};
-
-/*
- * GraphTraits contains some graph traversal algorithms.
- *
- * Usage:
- *
- */
-struct GraphTraits {
-  static iterator_range<NodesDFSIterator> DFS(const Graph &g) {
-    auto start_points = ExtractStartPoints(g);
-    NodesDFSIterator x(start_points);
-    return iterator_range<NodesDFSIterator>(NodesDFSIterator(start_points),
-                                            NodesDFSIterator());
-  }
-
-  static iterator_range<NodesTSIterator> TS(const Graph &g) {
-    auto start_points = ExtractStartPoints(g);
-    PADDLE_ENFORCE(!start_points.empty());
-    NodesTSIterator x(start_points);
-    return iterator_range<NodesTSIterator>(NodesTSIterator(start_points),
-                                           NodesTSIterator());
-  }
-
- private:
-  // The nodes those have no input will be treated as start points.
-  static std::vector<Node *> ExtractStartPoints(const Graph &g) {
-    std::vector<Node *> result;
-    for (auto *node : g.Nodes()) {
-      if (node->inputs.empty()) {
-        result.push_back(node);
-      }
-    }
-    return result;
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
deleted file mode 100644
index fa7263b7e71d537284a836eef84c1e7c17cdf409..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include <algorithm>
-#include <unordered_map>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/graph_printer.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/inference/analysis/dot.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-using inference::analysis::Dot;
-namespace {
-std::string FormatName(const Node* node) {
-  if (!node->IsOp() || !node->Op() ||
-      !node->Op()->HasAttr(OpProtoAndCheckerMaker::OpNamescopeAttrName())) {
-    return node->Name();
-  }
-  const std::string full_scope = boost::get<std::string>(
-      node->Op()->GetAttr(OpProtoAndCheckerMaker::OpNamescopeAttrName()));
-  return string::Sprintf("%s%s", full_scope.c_str(), node->Name().c_str());
-}
-}  // namespace
-
-void GraphVizPass::ApplyImpl(ir::Graph* graph) const {
-  const std::string& graph_viz_path = Get<std::string>(kGraphvizPath);
-  VLOG(3) << "draw IR graph viz to " << graph_viz_path;
-  std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
-  PADDLE_ENFORCE(fout->good());
-  std::ostream& sout = *fout;
-
-  std::unordered_map<const ir::Node*, std::string> node2dot;
-
-  Dot dot;
-
-  const std::vector<Dot::Attr> op_attrs({
-      Dot::Attr("style", "rounded,filled,bold"),  //
-      Dot::Attr("shape", "box"),                  //
-      Dot::Attr("color", "#303A3A"),              //
-      Dot::Attr("fontcolor", "#ffffff"),          //
-      Dot::Attr("width", "1.3"),                  //
-      Dot::Attr("height", "0.84"),                //
-      Dot::Attr("fontname", "Arial"),             //
-  });
-  const std::vector<Dot::Attr> arg_attrs({
-      Dot::Attr("shape", "box"),                  //
-      Dot::Attr("style", "rounded,filled,bold"),  //
-      Dot::Attr("fontname", "Arial"),             //
-      Dot::Attr("fillcolor", "#999999"),          //
-      Dot::Attr("color", "#dddddd"),              //
-  });
-
-  const std::vector<Dot::Attr> param_attrs({
-      Dot::Attr("shape", "box"),                  //
-      Dot::Attr("style", "rounded,filled,bold"),  //
-      Dot::Attr("fontname", "Arial"),             //
-      Dot::Attr("color", "#148b97"),              //
-      Dot::Attr("fontcolor", "#ffffff"),          //
-  });
-
-  const std::vector<Dot::Attr> marked_op_attrs(
-      {Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
-       Dot::Attr("fillcolor", "yellow")});
-  const std::vector<Dot::Attr> marked_var_attrs(
-      {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
-       Dot::Attr("fillcolor", "yellow")});
-
-  auto marked_nodes = ConsumeMarkedNodes(graph);
-  // Create nodes
-  for (const Node* n : graph->Nodes()) {
-    std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")";
-    if (n->IsOp()) {
-      decltype(op_attrs) attr =
-          marked_nodes.count(n) ? marked_op_attrs : op_attrs;
-      dot.AddNode(node_id, attr, node_id);
-    } else if (n->IsVar()) {
-      if (n->Var() && n->Var()->GetType() == proto::VarType::LOD_TENSOR) {
-        bool is_first = true;
-        for (int64_t length : n->Var()->GetShape()) {
-          if (is_first) {
-            node_id += "\n" + std::to_string(length);
-            is_first = false;
-          } else {
-            node_id += "," + std::to_string(length);
-          }
-        }
-      }
-      decltype(op_attrs)* attr;
-      if (marked_nodes.count(n)) {
-        attr = &marked_var_attrs;
-      } else if (const_cast<Node*>(n)->Var() &&
-                 const_cast<Node*>(n)->Var()->Persistable()) {
-        attr = &param_attrs;
-      } else {
-        attr = &arg_attrs;
-      }
-
-      dot.AddNode(node_id, *attr, node_id);
-    }
-    node2dot[n] = node_id;
-  }
-  // Create edges
-  for (const Node* n : graph->Nodes()) {
-    const auto& src_id = node2dot.at(n);
-    for (auto* out : n->outputs) {
-      const auto& trg_id = node2dot.at(out);
-      dot.AddEdge(src_id, trg_id, {});
-    }
-  }
-
-  sout << dot.Build();
-}
-
-GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes(
-    Graph* graph) const {
-  marked_nodes_t res;
-  if (graph->Has(kGraphvizMarkedNodeAttr)) {
-    auto& attr = graph->Get<marked_nodes_t>(kGraphvizMarkedNodeAttr);
-    res = attr;
-    attr.clear();
-  }
-  return res;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass)
-    .RequirePassAttr(paddle::framework::ir::kGraphvizPath);
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
deleted file mode 100644
index 7091aa6a95bd9ebde10bfbd45c98f8757b9d06c4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <fstream>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-const char kGraphvizMarkedNodeAttr[] = "__graphviz__marked_node__";
-
-class GraphVizPass : public Pass {
- public:
-  using marked_nodes_t = std::unordered_set<const Node*>;
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  // Tell whether there are any marked nodes in the graph. Consume the
-  // corresponding attribute.
-  marked_nodes_t ConsumeMarkedNodes(Graph* graph) const;
-};
-
-static GraphVizPass::marked_nodes_t& GetMarkedNodes(Graph* graph) {
-  if (!graph->Has(kGraphvizMarkedNodeAttr)) {
-    graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
-  }
-  return graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
deleted file mode 100644
index a39901e63bf65f7c314595a5fb2cc31d00959bd5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h"
-#include <string>
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init("identity_scale_op_clean", graph);
-
-  // pre_op -> scale_in -> scale_op -> scale_out
-  // ->
-  // pre_op -> scale_out
-  GraphPatternDetector detector;
-  auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op();
-  auto scale_in = detector.mutable_pattern()
-                      ->NewNode("scale_in")
-                      ->assert_is_op_input("scale")
-                      ->AsIntermediate();
-  auto scale_op = detector.mutable_pattern()
-                      ->NewNode("scale_fuse")
-                      ->assert_is_op("scale")
-                      ->assert_op_attr<float>("scale", 1.)
-                      ->assert_op_attr<float>("bias", 0.);
-  auto scale_out =
-      detector.mutable_pattern()
-          ->NewNode("scale_out")
-          ->assert_is_op_output("scale")
-          // scale's output var should has only one consumer, or it can't be
-          // removed.
-          ->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
-
-  pre_op->LinksTo({scale_in});
-  scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
-
-  GraphPatternDetector::handle_t handler = [&](
-      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-    Node* scale_op_var = subgraph.at(scale_op);
-    Node* scale_in_var = subgraph.at(scale_in);
-    Node* scale_out_var = subgraph.at(scale_out);
-    Node* pre_op_var = subgraph.at(pre_op);
-    // Link pre_op directly to scale_out
-    const std::string scale_in_name = scale_in_var->Name();
-    const std::string scale_out_name = scale_out_var->Name();
-    // Remove links in graph
-    GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var});
-    // Modify proto message
-    auto* pre_op_desc = pre_op_var->Op();
-    for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) {
-      auto* arguments = parameter.mutable_arguments();
-      auto it = std::find(arguments->begin(), arguments->end(), scale_in_name);
-      PADDLE_ENFORCE(it != arguments->end());
-      *it = scale_out_name;
-    }
-
-    IR_NODE_LINK_TO(pre_op_var, scale_out_var);
-  };
-
-  detector(graph, handler);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(identity_scale_op_clean_pass,
-              paddle::framework::ir::IdentityScaleOpCleanPass);
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
deleted file mode 100644
index d66b411257e530fa5188091702b0b309652ffaa4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class IdentityScaleOpCleanPass : public FusePassBase {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
- private:
-  virtual ~IdentityScaleOpCleanPass() = default;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc
deleted file mode 100644
index bf6fe999c1e68c35bc2c19fe38646da93bb1e204..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/is_test_pass.h"
-#include <string>
-#include <utility>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void IsTestPass::ApplyImpl(ir::Graph* graph) const {
-  VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it "
-             "for activations and pooling.";
-  auto op_list = {"pool2d",      "sigmoid",      "logsigmoid",
-                  "softshrink",  "exp",          "brelu",
-                  "pow",         "leaky_relu",   "stanh",
-                  "relu",        "tanh",         "tanh_shrink",
-                  "sqrt",        "abs",          "ceil",
-                  "elu",         "floor",        "cos",
-                  "sin",         "round",        "reciprocal",
-                  "hard_shrink", "hard_sigmoid", "relu6",
-                  "soft_relu",   "swish",        "thresholded_relu",
-                  "log",         "square",       "softplus",
-                  "softsign"};
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
-      auto* op = n->Op();
-      if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) {
-        op->SetAttr("is_test", true);
-      } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
-                 end(op_list)) {
-        op->MutableAttrMap()->insert(
-            std::pair<std::string, Attribute>("is_test", true));
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass);
diff --git a/paddle/fluid/framework/ir/is_test_pass.h b/paddle/fluid/framework/ir/is_test_pass.h
deleted file mode 100644
index 80cedbf9f850f6fe31c9f2898264e19ebf931c72..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/is_test_pass.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class IsTestPass : public Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc
deleted file mode 100644
index 3fa543c6221ae6ada8afddcf4563c1174127c221..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/is_test_pass.h"
-
-#include <gtest/gtest.h>
-#ifdef _WIN32
-#undef FALSE
-#undef TRUE
-#endif
-namespace paddle {
-namespace framework {
-namespace ir {
-
-enum class ISTEST_STATE { FALSE, TRUE, UNSET };
-
-void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs, bool use_mkldnn = false,
-           ISTEST_STATE is_test = ISTEST_STATE::UNSET) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetAttr("name", name);
-  op->SetInput("X", inputs);
-  op->SetOutput("Out", outputs);
-  op->SetAttr("use_mkldnn", use_mkldnn);
-  if (is_test == ISTEST_STATE::UNSET)
-    op->MutableAttrMap()->erase("is_test");
-  else if (is_test == ISTEST_STATE::FALSE)
-    op->SetAttr("is_test", false);
-  else
-    op->SetAttr("is_test", true);
-}
-
-// a->pool2d->b
-// b->relu->c
-// c,weights1)->conv2d->d
-//
-// d->pool2d->e
-// e->hard_sigmoid->f
-// (f,weights2)->conv2d->g
-//
-// g->pool2d->h
-// h->tanh->i
-// (i,weights3)->conv2d->j
-ProgramDesc BuildProgramDesc() {
-  ProgramDesc prog;
-  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
-                                 "j", "weights1", "weights2", "weights3"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::SELECTED_ROWS);
-    if (v == "weights1" || v == "weights2" || v == "weights3") {
-      var->SetPersistable(true);
-    }
-  }
-
-  SetOp(&prog, "pool2d", "pooling1", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"b"}), true, ISTEST_STATE::TRUE);
-  SetOp(&prog, "relu", "activation1", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"c"}), true, ISTEST_STATE::TRUE);
-  SetOp(&prog, "conv2d", "conv1", std::vector<std::string>({"c", "weights1"}),
-        std::vector<std::string>({"d"}), true, ISTEST_STATE::TRUE);
-
-  SetOp(&prog, "pool2d", "pooling2", std::vector<std::string>({"d"}),
-        std::vector<std::string>({"e"}), false, ISTEST_STATE::FALSE);
-  SetOp(&prog, "hard_sigmoid", "activation2", std::vector<std::string>({"e"}),
-        std::vector<std::string>({"f"}), false, ISTEST_STATE::FALSE);
-  SetOp(&prog, "conv2d", "conv2", std::vector<std::string>({"f", "weights2"}),
-        std::vector<std::string>({"g"}), false, ISTEST_STATE::FALSE);
-
-  SetOp(&prog, "pool2d", "pooling3", std::vector<std::string>({"g"}),
-        std::vector<std::string>({"h"}), false, ISTEST_STATE::UNSET);
-  SetOp(&prog, "tanh", "activation3", std::vector<std::string>({"h"}),
-        std::vector<std::string>({"i"}), true, ISTEST_STATE::UNSET);
-  SetOp(&prog, "conv2d", "conv3", std::vector<std::string>({"i", "weights3"}),
-        std::vector<std::string>({"j"}), false, ISTEST_STATE::UNSET);
-
-  return prog;
-}
-
-TEST(IsTestPass, basic) {
-  auto prog = BuildProgramDesc();
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  auto pass = PassRegistry::Instance().Get("is_test_pass");
-
-  graph.reset(pass->Apply(graph.release()));
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto* op = node->Op();
-      auto op_name = boost::get<std::string>(op->GetAttr("name"));
-      if (op_name == "conv3") {
-        ASSERT_FALSE(op->HasAttr("is_test"));
-      } else {
-        ASSERT_TRUE(op->HasAttr("is_test"));
-        EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(is_test_pass);
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc
deleted file mode 100644
index 05d23961a8b180381eef6372f7049bed2b530db7..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc
+++ /dev/null
@@ -1,355 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h"
-
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-const char kSumGradOpName[] = "sum";
-// TODO(minqiyang): only support sgd at current time, please add
-// other optimizers later.
-const char kOptimizerType[] = "sgd";
-
-void LockFreeOptimizePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-
-  // We could collect all weights' name from SGD, where
-  // W1 <- SGD(W0, Grad0)
-  std::unordered_set<std::string> weight_var_set;
-  for (auto* node : graph->Nodes()) {
-    if (IsOpNamed(node, kOptimizerType)) {
-      auto& param_out_vars = node->Op()->Output("ParamOut");
-      PADDLE_ENFORCE(param_out_vars.size() == 1u);
-      weight_var_set.insert(param_out_vars[0]);
-    }
-  }
-
-  // find all grad's merge op via weight name, where
-  // Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
-  std::unordered_set<ir::Node*> grad_sum_op_set;
-  for (ir::Node* node : graph->Nodes()) {
-    if (IsOpNamed(node, kSumGradOpName)) {
-      for (ir::Node* output : node->outputs) {
-        // strip the last grad suffix @GRAD
-        std::string var_name = output->Name();
-        const std::string suffix(kGradVarSuffix);
-        if (var_name != suffix && var_name.size() > suffix.size() &&
-            var_name.substr(var_name.size() - suffix.size()) == suffix) {
-          // if so then strip them off
-          var_name = var_name.substr(0, var_name.size() - suffix.size());
-          if (weight_var_set.find(var_name) != weight_var_set.end()) {
-            grad_sum_op_set.insert(node);
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  // get the forward op and backward op pairs, where
-  // out <- forward(X, W)
-  // Grad1 <- backward(out, X')
-  // Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
-  // W0 <- SGD(W1, Grad0)
-  for (ir::Node* node : grad_sum_op_set) {
-    for (ir::Node* merged_grad_var : node->outputs) {
-      // find the optimizers connected with sum op
-      if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) &&
-          merged_grad_var->outputs.size() == 1u) {
-        ir::Node* opt_node = merged_grad_var->outputs[0];
-        VLOG(3) << "Found opt node " << opt_node->Name();
-
-        // find the backward op connected with sum op
-        for (ir::Node* unmerged_grad_var : node->inputs) {
-          if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) &&
-              unmerged_grad_var->inputs.size() == 1u) {
-            ir::Node* backward_op = unmerged_grad_var->inputs[0];
-
-            VLOG(3) << "Found backward_op " << backward_op->Name();
-
-            // find the forward op related to the backward op
-            ir::Node* forward_op =
-                FindForwardOpViaBackwardOp(graph, backward_op);
-
-            VLOG(3) << "Found forward_op " << forward_op->Name();
-
-            PADDLE_ENFORCE(forward_op);
-
-            Node* new_optimizer_node = CreateNewSGDNode(
-                graph, forward_op, backward_op, node, opt_node);
-
-            PADDLE_ENFORCE(new_optimizer_node);
-          }
-        }
-      }
-    }
-  }
-
-  // Remove the sum_op and its' outputs and connected Optimizers
-  for (Node* sum_op : grad_sum_op_set) {
-    for (Node* sum_op_output : sum_op->outputs) {
-      for (Node* optimize_op : sum_op_output->outputs) {
-        if (optimize_op->NodeType() == Node::Type::kOperation &&
-            optimize_op->Name() == kOptimizerType) {
-          VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_"
-                  << optimize_op->id();
-          graph->RemoveNode(optimize_op);
-        }
-      }
-      VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_"
-              << sum_op_output->id();
-      graph->RemoveNode(sum_op_output);
-    }
-    VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id();
-    graph->RemoveNode(sum_op);
-  }
-
-  for (auto* node : graph->Nodes()) {
-    for (Node* output_node : node->outputs) {
-      if (output_node->Name() == "sgd") {
-        VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id()
-                << " --> " << output_node->Name() << "_" << output_node->id();
-        for (Node* input_node : node->inputs) {
-          VLOG(3) << "SGD Input link: " << input_node->Name() << "_"
-                  << input_node->id() << " --> " << node->Name() << "_"
-                  << node->id();
-        }
-      }
-    }
-  }
-}
-
-ir::Node* LockFreeOptimizePass::CreateNewSGDNode(
-    ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node,
-    ir::Node* grad_sum_node, ir::Node* optimize_node) const {
-  PADDLE_ENFORCE(graph);
-  PADDLE_ENFORCE(forward_node);
-  PADDLE_ENFORCE(backward_node);
-  PADDLE_ENFORCE(grad_sum_node);
-  PADDLE_ENFORCE(optimize_node);
-
-  // find the grad var node between the grad sum node and backward_node
-  std::vector<ir::Node*> grad_vars =
-      FindConnectedNode(backward_node, grad_sum_node);
-  ir::Node* grad_node = nullptr;
-  for (ir::Node* node : grad_vars) {
-    if (!ir::IsControlDepVar(*node)) {
-      grad_node = node;
-    }
-  }
-  PADDLE_ENFORCE(grad_node);
-
-  // create a new SGD node
-  OpDesc* old_desc = optimize_node->Op();
-  // keep with the same block between new optimizer and the old one
-  OpDesc new_desc(*old_desc, old_desc->Block());
-  new_desc.SetInput("Param", old_desc->Input("Param"));
-  new_desc.SetInput("LearningRate", old_desc->Input("LearningRate"));
-  new_desc.SetInput("Grad", std::vector<std::string>({grad_node->Name()}));
-  new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut"));
-
-  std::vector<std::string> op_role_vars = boost::get<std::vector<std::string>>(
-      new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-  // replace the second op role var, because the grad name was
-  // changed in new optimizer
-  op_role_vars.pop_back();
-  op_role_vars.push_back(grad_node->Name());
-  new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
-                   op_role_vars);
-  new_desc.SetType(kOptimizerType);
-
-  // set backward op's op role var, this will be used to
-  // set device_id in multi_device_pass
-  backward_node->Op()->SetAttr(
-      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars);
-  // backward_node->Op()->SetAttr(
-  // framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {});
-
-  // keep with the same output nodes between new optimizer and the
-  // old one
-  Node* sgd_node = graph->CreateOpNode(&new_desc);
-
-  // change all outputs of the optimize_node to the new one
-  ReplaceAllDownstreamNode(optimize_node, sgd_node);
-
-  // find connected node between forward node and optimize node
-  // and replace the optimize node to new sgd node
-  std::vector<ir::Node*> forward_opt_connected_nodes =
-      FindConnectedNode(forward_node, optimize_node);
-  for (ir::Node* node : forward_opt_connected_nodes) {
-    ReplaceUpstreamNode(node, optimize_node, sgd_node);
-  }
-
-  // find connected node between backward node and optimize node
-  // and replace the optimize node to new sgd node
-  std::vector<ir::Node*> backward_opt_connected_nodes =
-      FindConnectedNode(backward_node, optimize_node);
-  for (ir::Node* node : backward_opt_connected_nodes) {
-    ReplaceUpstreamNode(node, optimize_node, sgd_node);
-  }
-
-  // SGD must have only one param and LR in
-  PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u);
-  PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u);
-
-  // LR and weight nodes should be copied
-  for (Node* upstream_node : optimize_node->inputs) {
-    if (upstream_node->Name() == old_desc->Input("LearningRate")[0] ||
-        upstream_node->Name() == old_desc->Input("Param")[0]) {
-      ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node);
-    }
-  }
-
-  VLOG(3) << "Create new opt node" << sgd_node->Name() << "_" << sgd_node->id();
-
-  return sgd_node;
-}
-
-std::vector<ir::Node*> LockFreeOptimizePass::FindConnectedNode(
-    ir::Node* upstream_node, ir::Node* downstream_node) const {
-  std::vector<ir::Node*> result;
-  for (ir::Node* out_node : upstream_node->outputs) {
-    for (ir::Node* in_node : downstream_node->inputs) {
-      if (in_node == out_node) {
-        result.push_back(in_node);
-      }
-    }
-  }
-
-  return result;
-}
-
-void LockFreeOptimizePass::ReplaceUpstreamNode(
-    ir::Node* upstream_node, ir::Node* old_optimizer_node,
-    ir::Node* new_optimizer_node) const {
-  PADDLE_ENFORCE(upstream_node);
-  PADDLE_ENFORCE(old_optimizer_node);
-  PADDLE_ENFORCE(new_optimizer_node);
-
-  // Remove the old_optimizer_node from upstream_node's outputs vector
-  auto& output_node_vec = upstream_node->outputs;
-  for (auto output_node_iter = output_node_vec.begin();
-       output_node_iter != output_node_vec.end();) {
-    if (*output_node_iter == old_optimizer_node) {
-      output_node_vec.erase(output_node_iter);
-      break;
-    } else {
-      ++output_node_iter;
-    }
-  }
-
-  // Add the new_optimizer_node to upstream_node's outputs vector
-  output_node_vec.emplace_back(new_optimizer_node);
-  new_optimizer_node->inputs.emplace_back(upstream_node);
-}
-
-void LockFreeOptimizePass::ReplaceAllDownstreamNode(
-    ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const {
-  PADDLE_ENFORCE(old_optimizer_node);
-  PADDLE_ENFORCE(new_optimizer_node);
-
-  for (ir::Node* downstream_node : old_optimizer_node->outputs) {
-    // Remove the old_optimizer_node from downstream_node's inputs vector
-    auto& input_node_vec = downstream_node->inputs;
-    for (auto input_node_iter = input_node_vec.begin();
-         input_node_iter != input_node_vec.end();) {
-      if (*input_node_iter == old_optimizer_node) {
-        input_node_vec.erase(input_node_iter);
-        break;
-      } else {
-        ++input_node_iter;
-      }
-    }
-
-    // Add the new_optimizer_node to downstream_node's inputs vector
-    input_node_vec.emplace_back(new_optimizer_node);
-    new_optimizer_node->outputs.emplace_back(downstream_node);
-  }
-}
-
-ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp(
-    ir::Graph* graph, ir::Node* backward_node) const {
-  PADDLE_ENFORCE(graph);
-  PADDLE_ENFORCE(backward_node);
-
-  // strip the suffix _grad of backward_node's name
-  std::string forward_op_name = backward_node->Name();
-  const std::string suffix("_grad");
-  if (forward_op_name != suffix && forward_op_name.size() > suffix.size() &&
-      forward_op_name.substr(forward_op_name.size() - suffix.size()) ==
-          suffix) {
-    // if so then strip them off
-    forward_op_name =
-        forward_op_name.substr(0, forward_op_name.size() - suffix.size());
-  } else {
-    LOG(WARNING) << "Illegal backward node's name " << backward_node->Name()
-                 << " id " << backward_node->id();
-
-    return nullptr;
-  }
-
-  for (ir::Node* node : graph->Nodes()) {
-    if (node->Name() == forward_op_name) {
-      if (node->outputs.size() == 0u) {
-        // if forward_node has no output, then it has NO grad op
-        continue;
-      }
-
-      // check whether all inputs of the backward_op that ends_with @GRAD
-      // comes from the output of forward_op is the input of the backward_op
-      bool is_related_forward_node = true;
-      for (ir::Node* backward_input : backward_node->inputs) {
-        if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) {
-          bool meets_correct_output = false;
-          for (ir::Node* forward_output : node->outputs) {
-            if (forward_output->Name() + kGradVarSuffix ==
-                backward_input->Name()) {
-              meets_correct_output = true;
-              break;
-            }
-          }
-
-          if (!meets_correct_output) {
-            is_related_forward_node = false;
-            break;
-          }
-        }
-      }
-
-      if (is_related_forward_node) {
-        return node;
-      }
-    }
-  }
-
-  return nullptr;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(lock_free_optimize_pass,
-              paddle::framework::ir::LockFreeOptimizePass);
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
deleted file mode 100644
index 9c923480bac26fb8c68768c8365b0f899959ec64..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ /dev/null
@@ -1,127 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include <boost/algorithm/string/predicate.hpp>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class Node;
-
-/*
-* Remove the sum op of all gradients of the backward op.
-* And remove the dependecies of the optimizer related to the
-* same backward op.
-*
-* Before this pass:
-*
-* forward_op1 forward_op2
-*     |            |
-*  grad_op1    grad_op2
-*        \      /
-*          \  /
-*         sum_op
-*           |
-*         sgd_op
-*
-* After this pass:
-* forward_op1 forward_op2
-*     |            |
-*  grad_op1    grad_op2
-*     |            |
-*  sgd_op1      sgd_op2
-*
-* sgd_op1 and sgd_op2 will update the same weight which holds the same
-* memory, so we could benefits from the acceleration
-*/
-class LockFreeOptimizePass : public Pass {
- public:
-  virtual ~LockFreeOptimizePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
- private:
-  // Create a new sgd node via current optimizer node
-  ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
-                             ir::Node* backward_node, ir::Node* grad_sum_node,
-                             ir::Node* optimize_node) const;
-
-  // Replace the input weight's optimizers
-  void ReplaceUpstreamNode(ir::Node* upstream_node,
-                           ir::Node* old_optimizer_node,
-                           ir::Node* new_optimizer_node) const;
-
-  // Replace the output weight's optimizers
-  void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
-                                ir::Node* new_optimizer_node) const;
-
-  // Find all weight variables in graph
-  bool FindAllWeightVars(ir::Graph* graph) const;
-
-  // Find the forward_op node via the backward_op node
-  ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
-                                       ir::Node* backward_node) const;
-
-  std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
-                                           ir::Node* downstream_node) const;
-
-  inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
-    PADDLE_ENFORCE(node);
-
-    return node->NodeType() == Node::Type::kOperation && node->Name() == name;
-  }
-
-  inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
-    PADDLE_ENFORCE(node);
-
-    return node->NodeType() == Node::Type::kVariable && node->Name() == name;
-  }
-
-  inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const {
-    PADDLE_ENFORCE(node);
-
-    return node->NodeType() == Node::Type::kVariable &&
-           boost::algorithm::ends_with(node->Name(), name);
-  }
-
-  inline bool IsVarNameContains(ir::Node* node, const std::string& name) const {
-    PADDLE_ENFORCE(node);
-
-    return node->NodeType() == Node::Type::kVariable &&
-           node->Name().find(name) != std::string::npos;
-  }
-
-  inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
-    PADDLE_ENFORCE(ctrl_dep_node);
-    PADDLE_ENFORCE(node);
-
-    return IsControlDepVar(*ctrl_dep_node) &&
-           ctrl_dep_node->inputs.size() >= 1u &&
-           ctrl_dep_node->inputs[0] == node;
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
deleted file mode 100644
index 37993d3f0d96170c3926c91654cf321cabb2539f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
-cc_library(conditional_block_op_eager_deletion_pass SRCS conditional_block_op_eager_deletion_pass.cc DEPS conditional_block_op_helper graph_helper pass computation_op_handle)
-cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
-cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pass.cc DEPS recurrent_op_helper graph_helper pass computation_op_handle)
-cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle)
-cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
-
-cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle
-    eager_deletion_op_handle graph graph_helper pass conditional_block_op_eager_deletion_pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper)
-
-cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle multi_devices_helper graph pass) 
-
-cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass)
-cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) 
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
deleted file mode 100644
index 6ce14203629e0af20701fee1e589c898992d6cda..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
+++ /dev/null
@@ -1,422 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-using OpHandleBase = details::OpHandleBase;
-using ComputationOpHandle = details::ComputationOpHandle;
-using VarHandle = details::VarHandle;
-using VarHandleBase = details::VarHandleBase;
-using DummyVarHandle = details::DummyVarHandle;
-
-enum NodeDependency { kSame = 0, kNoDep = 1, kBefore = 2, kAfter = 3 };
-
-static NodeDependency ReverseNodeDependency(NodeDependency dep) {
-  return dep == NodeDependency::kBefore
-             ? NodeDependency::kAfter
-             : (dep == NodeDependency::kAfter ? NodeDependency::kBefore : dep);
-}
-
-class BufferSharedCrossOpMemoryReusePass : public MemoryReusePass {
- protected:
-  std::string ReuseType() const override { return "cross_op_memory_reuse"; }
-
-  void Run(Graph *graph) const override;
-
- private:
-  void RunOnScopeIdx(size_t idx) const;
-
-  // Toposort ops. Different strategies can be used in the future.
-  std::vector<OpHandleBase *> SortOp(const OpGraphView &graph_view) const;
-
-  // Build the initial dependency matrix, and initializing all fields,
-  // including `ops_`, `op_to_idx_`, `deps_`
-  void BuildOpDependencyMap() const;
-
-  // Get op index inside `ops_`, used to find dependency inside `deps_`
-  size_t OpIndex(const ComputationOpHandle *op) const;
-
-  size_t ResolveDependencyBetween(
-      ComputationOpHandle *op,
-      const std::unordered_set<ComputationOpHandle *> &prev_ops) const;
-
-  // Get dependency relationship between op1 and op2
-  // Notice: GetOpDep(op1, op2) == ReverseNodeDependency(GetOpDep(op2, op1))
-  NodeDependency GetOpDep(const ComputationOpHandle *op1,
-                          const ComputationOpHandle *op2) const;
-
-  void SetOpDep(const ComputationOpHandle *op1, const ComputationOpHandle *op2,
-                NodeDependency dep) const;
-
- private:
-  mutable Graph *graph_;
-
-  // All ops in the graph, grouped by scope index
-  mutable std::vector<std::vector<ComputationOpHandle *>> ops_;
-
-  // Index of each op in `ops_`, grouped by scope index.
-  // Index of each op is the index inside `deps_`.
-  mutable std::vector<std::unordered_map<const ComputationOpHandle *, size_t>>
-      op_to_idx_;
-
-  // Dependency matrix of between any 2 ops
-  // If deps_[scope_idx][i][j] is equal to:
-  //  1. kSame, Op(i) and Op(j) are the same ops, only when i == j.
-  //  2. kNoDep, Op(i) and Op(j) have no dependency between each other.
-  //  3. kBefore, Op(i) is the preceding op of Op(j).
-  //  4. kAfter, Op(i) is the pending op of Op(j).
-  mutable std::vector<std::vector<std::vector<NodeDependency>>> deps_;
-};
-
-void BufferSharedCrossOpMemoryReusePass::Run(Graph *graph) const {
-  graph_ = graph;
-  BuildOpDependencyMap();
-  for (size_t i = 0; i < ScopeNum(); ++i) {
-    RunOnScopeIdx(i);
-  }
-}
-
-// Note(zjl): The reason why I separate SortOp from BuildOpDependencyMap()
-// is that we can use different sorting strategies in the future to
-// evaluate the effects of different sorting strategies.
-// Currently, I use BFS, but we can use other kinds of sorting strategy
-// in the future, as long as the new strategy reaches higher memory reuse
-// ratio.
-std::vector<OpHandleBase *> BufferSharedCrossOpMemoryReusePass::SortOp(
-    const OpGraphView &graph_view) const {
-  std::vector<OpHandleBase *> sorted_ops;
-  sorted_ops.reserve(graph_view.OpNumber());
-  graph_view.BreadthFirstVisit(
-      [&](OpHandleBase *cur_op) { sorted_ops.emplace_back(cur_op); });
-  PADDLE_ENFORCE_EQ(sorted_ops.size(), graph_view.OpNumber(),
-                    "There are unvisited ops");
-  return sorted_ops;
-}
-
-/**
- * Try to reuse unlived vars.
- *
- * What we do is: transverse all outputs of each op, and find a suitable
- * unused var, and then reuse its memory as output.
- *
- * How to determine unused vars?
- *
- * Case 1: unlived vars after all preceding ops run. In this case, no extra
- *   edge would be added to the graph.
- *
- * Case 2: unlived vars after all preceding ops and all no-dep ops run. In
- *   this case, the reused var is from no-dep ops, so that we have to add
- *   extra edge to resolve data hazard.
- *
- *
- * If Case 2 occurs, what we should do to resolve data hazard?
- *
- *  - Step 1: add a dep var between reused_op and share_tensor_buffer_op,
- *            that is: reused_op -> dep_var -> share_tensor_buffer_op.
- *
- *  - Step 2: Update deps_, all preceding ops of reused_op should be
- *            preceding ops of op.
- */
-void BufferSharedCrossOpMemoryReusePass::RunOnScopeIdx(size_t idx) const {
-  auto &ops = ops_[idx];
-
-  auto &last_live_ops_of_vars =
-      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars)[idx];
-
-  // Build a reverse map of `last_live_ops_of_vars`,
-  // i.e., VarHandle -> last lived ops of VarHandle
-  std::unordered_map<VarHandle *, std::unordered_set<ComputationOpHandle *>>
-      var_to_ops;
-  for (auto &pair : last_live_ops_of_vars) {
-    for (auto *op : pair.second.ops()) {
-      var_to_ops[pair.second.var()].insert(op);
-    }
-  }
-
-  // Deep copy of `var_to_ops`, used to get last lived ops of each unlived var
-  auto original_var_to_ops = var_to_ops;
-
-  // Memory size of VarHandle -> list<VarHandle>
-  std::map<int64_t, std::list<VarHandle *>> unlived_var_pool;
-  size_t reuse_num = 0;
-
-  for (auto *op : ops) {
-    // Transverse all output args of op, find whether there is unlived var
-    // can be reused.
-    auto out_args = op->Node()->Op()->OutputArgumentNames();
-    for (auto &out_arg : out_args) {
-      auto out_nodes = this->FindNodesByName(out_arg, op->Node()->outputs);
-      // If out_arg is kEmptyVarName, it may not be found in output nodes.
-      if (out_nodes.size() != 1) {
-        continue;
-      }
-
-      auto *out_node = *(out_nodes.begin());
-      auto *out_var =
-          dynamic_cast<VarHandle *>(&(out_node->Wrapper<VarHandleBase>()));
-      PADDLE_ENFORCE_NOT_NULL(out_var);
-
-      // If out_arg is not reusable, skip it
-      if (!IsOutVarReusable(*out_var)) {
-        continue;
-      }
-
-      auto mem_size = GetMemorySize(*out_var);
-      // Special case: if memory size of out_var is 0, skip it
-      if (mem_size == 0) {
-        continue;
-      }
-
-      // Find a suitable unlived var from `unlived_var_pool`
-      // Here, we use `find`, but we can perform `lower_bound` if
-      // it is better in the future.
-      auto iter = unlived_var_pool.find(std::abs(mem_size));
-      if (iter == unlived_var_pool.end()) {
-        continue;
-      }
-
-      // Obtain candidate_vars that can be reused.
-      auto &candidate_vars = iter->second;
-      for (auto var_iter = candidate_vars.begin();
-           var_iter != candidate_vars.end(); ++var_iter) {
-        bool success = this->TryReuseVar(*var_iter, out_var);
-        if (!success) continue;
-
-        // If memory reuse is successful, we should do some post-processing.
-        ++reuse_num;
-        auto &prev_ops = original_var_to_ops.at(*var_iter);
-
-        // Add extra dependencies between `op` and last lived ops of reused var
-        // (i.e. prev_ops) if needed.
-        // All `prev_ops` must be preceding ops of op to avoid data hazard.
-        size_t new_added_dep_num = ResolveDependencyBetween(op, prev_ops);
-        VLOG(3) << "Variable can be reused between: " << (*var_iter)->Name()
-                << " -> " << out_var->Name() << " when running op "
-                << op->Name() << ", add extra dependency " << new_added_dep_num
-                << "/" << prev_ops.size();
-
-        // erase reused var from ``original_var_to_ops`
-        original_var_to_ops.erase(*var_iter);
-
-        // erase reused var from `candidate_vars`
-        candidate_vars.erase(var_iter);
-        if (candidate_vars.empty()) {
-          // erase reused var from `unlived_var_pool` if there is no other vars
-          // which has same size with reused var.
-          unlived_var_pool.erase(iter);
-        }
-        break;
-      }
-    }
-
-    // After all output args have been transversed, we should check whether
-    // there is new unlived var after `op` runs.
-    for (auto op_iter = var_to_ops.begin(); op_iter != var_to_ops.end();) {
-      // erase op from `var_to_ops` first
-      op_iter->second.erase(op);
-      if (op_iter->second.empty()) {
-        // there is a unlived var, since all lived ops have run
-        VarHandle *unlived_var = op_iter->first;
-        var_to_ops.erase(op_iter++);
-        if (IsInVarReusable(*unlived_var)) {
-          auto mem_size = GetMemorySize(*unlived_var);
-          if (mem_size != 0) {
-            unlived_var_pool[std::abs(mem_size)].push_front(unlived_var);
-          }
-        }
-      } else {
-        ++op_iter;
-      }
-    }
-  }
-  VLOG(4) << "Reuse " << reuse_num << " variable(s) in Scope " << idx;
-}
-
-size_t BufferSharedCrossOpMemoryReusePass::ResolveDependencyBetween(
-    ComputationOpHandle *op,
-    const std::unordered_set<ComputationOpHandle *> &prev_ops) const {
-  size_t new_added_dep_num = 0;
-  size_t op_idx = OpIndex(op);
-  auto &deps = deps_[op->GetScopeIdx()];
-  for (auto *prev_op : prev_ops) {
-    auto op_dep = GetOpDep(prev_op, op);
-    if (op_dep == NodeDependency::kBefore) continue;
-    PADDLE_ENFORCE_EQ(op_dep, NodeDependency::kNoDep,
-                      "The graph has circle, this may be a bug");
-
-    auto iter =
-        std::find_if(prev_op->Outputs().begin(), prev_op->Outputs().end(),
-                     [](VarHandleBase *var) {
-                       return dynamic_cast<DummyVarHandle *>(var) != nullptr;
-                     });
-
-    if (iter != prev_op->Outputs().end()) {
-      op->AddInput(*iter);
-    } else {
-      auto *dep_var = new DummyVarHandle(graph_->CreateControlDepVar());
-      graph_->Get<details::GraphDepVars>(details::kGraphDepVars)
-          .emplace(dep_var);
-      prev_op->AddOutput(dep_var);
-      op->AddInput(dep_var);
-    }
-
-    // All preceding ops of `prev_op` should be preceding ops of `op`
-    size_t prev_op_idx = OpIndex(prev_op);
-    for (size_t i = 0; i < deps[prev_op_idx].size(); ++i) {
-      if (deps[prev_op_idx][i] != NodeDependency::kAfter) {
-        continue;
-      }
-
-      deps[i][op_idx] = NodeDependency::kBefore;
-      deps[op_idx][i] = NodeDependency::kAfter;
-    }
-
-    // All pending ops of `op` should be pending ops of `prev_op`.
-    for (size_t i = 0; i < deps[op_idx].size(); ++i) {
-      if (deps[op_idx][i] != NodeDependency::kBefore) {
-        continue;
-      }
-
-      deps[i][prev_op_idx] = NodeDependency::kAfter;
-      deps[prev_op_idx][i] = NodeDependency::kBefore;
-    }
-
-    // `prev_op` is one of preceding op of `op`
-    SetOpDep(prev_op, op, NodeDependency::kBefore);
-    ++new_added_dep_num;
-  }
-  return new_added_dep_num;
-}
-
-void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const {
-  PADDLE_ENFORCE(ops_.empty(), "ops_ must be initialized here");
-  PADDLE_ENFORCE(op_to_idx_.empty(), "op_to_idx_ must be initialized here");
-  PADDLE_ENFORCE(deps_.empty(), "deps_ must be initialized here");
-
-  // Toposort ops
-  OpGraphView graph_view(ir::FilterByNodeWrapper<OpHandleBase>(*graph_));
-  auto ops = SortOp(graph_view);
-
-  size_t scope_num = this->ScopeNum();
-  size_t op_num = ops.size();
-
-  // A map to record all preceding ops of each op
-  std::unordered_map<OpHandleBase *, std::unordered_set<OpHandleBase *>>
-      preceding_ops;
-
-  // BFS to fill `preceding_ops`
-  graph_view.BreadthFirstVisit([&](OpHandleBase *cur_op) {
-    // All preceding ops of cur_op should be:
-    //  - preceding ops of cur_op, that is connected to cur_op directely
-    //  - all preceding ops of `direct preceding ops of cur_op`
-    auto &all_preceding_ops_of_cur_op = preceding_ops[cur_op];
-    for (auto &preceding_op : graph_view.PrecedingOps(cur_op)) {
-      all_preceding_ops_of_cur_op.insert(preceding_op);
-      auto &prev_preceding_ops = preceding_ops[preceding_op];
-      all_preceding_ops_of_cur_op.insert(prev_preceding_ops.begin(),
-                                         prev_preceding_ops.end());
-    }
-  });
-  PADDLE_ENFORCE_EQ(preceding_ops.size(), op_num);
-
-  // Find out ComputationOpHandles only
-  ops_.resize(scope_num);
-  op_to_idx_.resize(scope_num);
-  for (auto *op : ops) {
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
-    if (compute_op == nullptr) continue;
-    size_t scope_idx = compute_op->GetScopeIdx();
-    ops_[scope_idx].emplace_back(compute_op);
-    op_to_idx_[scope_idx].emplace(compute_op, op_to_idx_[scope_idx].size());
-  }
-
-  // Fill deps_ according to `preceding_ops`
-  deps_.resize(scope_num);
-  for (size_t i = 0; i < deps_.size(); ++i) {
-    deps_[i].resize(ops_[i].size());
-    for (auto &item : deps_[i]) {
-      item.assign(ops_[i].size(), NodeDependency::kNoDep);
-    }
-  }
-
-  for (auto &ops_on_each_device : ops_) {
-    for (auto *op : ops_on_each_device) {
-      SetOpDep(op, op, NodeDependency::kSame);
-      for (auto *preceding_op : preceding_ops[op]) {
-        auto *compute_preceding_op =
-            dynamic_cast<ComputationOpHandle *>(preceding_op);
-        if (compute_preceding_op != nullptr &&
-            compute_preceding_op->GetScopeIdx() == op->GetScopeIdx()) {
-          SetOpDep(compute_preceding_op, op, NodeDependency::kBefore);
-        }
-      }
-    }
-  }
-}
-
-size_t BufferSharedCrossOpMemoryReusePass::OpIndex(
-    const ComputationOpHandle *op) const {
-  auto iter = op_to_idx_[op->GetScopeIdx()].find(op);
-  PADDLE_ENFORCE(iter != op_to_idx_[op->GetScopeIdx()].end());
-  return iter->second;
-}
-
-NodeDependency BufferSharedCrossOpMemoryReusePass::GetOpDep(
-    const ComputationOpHandle *op1, const ComputationOpHandle *op2) const {
-  PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx());
-  return deps_[op1->GetScopeIdx()][OpIndex(op1)][OpIndex(op2)];
-}
-
-void BufferSharedCrossOpMemoryReusePass::SetOpDep(
-    const ComputationOpHandle *op1, const ComputationOpHandle *op2,
-    NodeDependency dep) const {
-  PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx());
-  if (op1 == op2) {
-    PADDLE_ENFORCE(dep == NodeDependency::kSame);
-    auto idx = OpIndex(op1);
-    deps_[op1->GetScopeIdx()][idx][idx] = NodeDependency::kSame;
-  } else {
-    auto idx1 = OpIndex(op1);
-    auto idx2 = OpIndex(op2);
-    PADDLE_ENFORCE(dep != NodeDependency::kSame && idx1 != idx2);
-    deps_[op1->GetScopeIdx()][idx1][idx2] = dep;
-    deps_[op1->GetScopeIdx()][idx2][idx1] = ReverseNodeDependency(dep);
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(buffer_shared_cross_op_memory_reuse_pass,
-              paddle::framework::ir::BufferSharedCrossOpMemoryReusePass)
-    .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
-    .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars)
-    .RequirePassAttr(paddle::framework::ir::kUseCuda);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
deleted file mode 100644
index 006e79c92dc0b3ad70491b494af4383cdac96e41..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
+++ /dev/null
@@ -1,160 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class BufferSharedInplaceOpPass : public MemoryReusePass {
- protected:
-  std::string ReuseType() const override { return "inplace"; }
-
-  void Run(Graph *graph) const override;
-};
-
-void BufferSharedInplaceOpPass::Run(Graph *graph) const {
-  const auto &last_live_ops =
-      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
-
-  bool use_cuda = Get<bool>(kUseCuda);
-
-  // Step 1: Build a reverse map of last_live_ops
-  // i.e.: op -> vars
-  std::unordered_map<details::ComputationOpHandle *,
-                     std::unordered_map<std::string, ir::Node *>>
-      candidate_ops;
-  for (auto &each_scope_ops : last_live_ops) {
-    for (auto &pair : each_scope_ops) {
-      // If variable has more than 1 last lived ops, this variable cannot
-      // be inplaced.
-      if (pair.second.ops().size() != 1) {
-        continue;
-      }
-
-      auto *op = *(pair.second.ops().begin());
-      const std::string &op_type = op->GetOp()->Type();
-      const framework::OpDesc *op_desc = op->Node()->Op();
-      PADDLE_ENFORCE_NOT_NULL(op_desc);
-
-      auto &infer_inplace = OpInfoMap::Instance().Get(op_type).infer_inplace_;
-      if (!infer_inplace) {
-        continue;
-      }
-
-      const std::string &var_name = pair.first;
-      auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs);
-      if (in_nodes.size() == 1) {
-        candidate_ops[op][var_name] = *in_nodes.begin();
-      }
-    }
-  }
-
-  // Step 2: Check which vars can be inplaced indeed
-  for (auto &op_vars_pair : candidate_ops) {
-    auto *op = op_vars_pair.first;
-    auto &vars = op_vars_pair.second;
-
-    const std::string &op_type = op->GetOp()->Type();
-    auto *op_desc = op->Node()->Op();
-
-    auto in_to_outs =
-        OpInfoMap::Instance().Get(op_type).infer_inplace_(*op_desc, use_cuda);
-    for (auto &pair : in_to_outs) {
-      auto &in_param = pair.first;
-      auto &in_args = op_desc->Input(in_param);
-      if (in_args.empty()) {
-        VLOG(4) << "Cannot inplace because Input(" << in_param
-                << ") is empty in " << op_type;
-        continue;
-      }
-
-      auto &in_arg = in_args[0];
-      auto iter = vars.find(in_arg);
-      if (iter == vars.end()) {
-        VLOG(4) << "Cannot inplace maybe because Input(" << in_param
-                << ")=" << in_arg << " is not lastly used in op " << op_type
-                << ", or it occurs multiple times in input or occurs in output";
-        continue;
-      }
-
-      ir::Node *in_node = iter->second;
-
-      auto &out_param = pair.second;
-      auto &out_args = op_desc->Output(out_param);
-
-      if (out_args.empty()) {
-        VLOG(4) << "Cannot inplace because Output(" << out_param
-                << ") is empty in " << op_type;
-        continue;
-      }
-
-      auto &out_arg = out_args[0];
-      auto out_nodes = this->FindNodesByName(out_arg, op->Node()->outputs);
-      if (out_nodes.size() != 1) {
-        VLOG(4) << "Cannot inplace because Output(" << out_param
-                << ")=" << out_arg << " occurs " << out_nodes.size()
-                << " time(s) in output of op " << op_type;
-        continue;
-      }
-
-      auto *out_node = *out_nodes.begin();
-
-      auto &in_var_handle = in_node->Wrapper<details::VarHandleBase>();
-      auto &out_var_handle = out_node->Wrapper<details::VarHandleBase>();
-
-      auto *in_var_handle_ptr =
-          dynamic_cast<details::VarHandle *>(&in_var_handle);
-      auto *out_var_handle_ptr =
-          dynamic_cast<details::VarHandle *>(&out_var_handle);
-
-      if (in_var_handle_ptr == nullptr || out_var_handle_ptr == nullptr) {
-        continue;
-      }
-
-      bool success = this->TryReuseVar(in_var_handle_ptr, out_var_handle_ptr);
-      if (success) {
-        VLOG(4) << "Inplace performed in op " << op_type << ": "
-                << in_var_handle_ptr->Name() << " -> "
-                << out_var_handle_ptr->Name()
-                << ". Debug String is: " << op->GetOp()->DebugString();
-      } else {
-        VLOG(3) << "Inplace failed in op " << op_type << ": "
-                << in_var_handle_ptr->Name() << " -> "
-                << out_var_handle_ptr->Name();
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(buffer_shared_inplace_pass,
-              paddle::framework::ir::BufferSharedInplaceOpPass)
-    .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
-    .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars)
-    .RequirePassAttr(paddle::framework::ir::kUseCuda);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc
deleted file mode 100644
index 56a658d4220add287f95f7b596c6a013ee64d229..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
-#include "paddle/fluid/operators/controlflow/op_variant.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class ConditionalOpEagerDeletionPass : public Pass {
- protected:
-  void ApplyImpl(Graph *graph) const override {
-    auto all_ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*graph);
-
-    // Find all conditional_op and conditional_grad_op
-    std::unordered_map<size_t, std::pair<std::vector<OperatorBase *>,
-                                         std::vector<OperatorBase *>>>
-        target_ops;
-    for (auto *op : all_ops) {
-      auto compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
-      if (compute_op == nullptr) continue;
-
-      if (compute_op->Name() == "conditional_block") {
-        target_ops[compute_op->GetScopeIdx()].first.emplace_back(
-            compute_op->GetOp());
-      } else if (compute_op->Name() == "conditional_block_grad") {
-        target_ops[compute_op->GetScopeIdx()].second.emplace_back(
-            compute_op->GetOp());
-      }
-    }
-
-    for (auto &ops_pair : target_ops) {
-      auto &ifelse_ops = ops_pair.second.first;
-      auto &ifelse_grad_ops = ops_pair.second.second;
-      operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-          graph->OriginProgram(), ifelse_ops, ifelse_grad_ops);
-    }
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conditional_block_op_eager_deletion_pass,
-              paddle::framework::ir::ConditionalOpEagerDeletionPass);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
deleted file mode 100644
index 962401a672d44939f4aa908ccbda4a42d1ef040a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
+++ /dev/null
@@ -1,298 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <functional>
-#include <queue>
-#include <string>
-#include <tuple>
-#include <vector>
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// op -> variables which can be deleted after op runs
-using OpToVarNameSetMap = std::unordered_map<details::ComputationOpHandle *,
-                                             std::unordered_set<std::string>>;
-
-static std::map<size_t, std::unordered_set<std::string>> VarsGroupByScopeIdx(
-    const OpToVarNameSetMap &map) {
-  std::map<size_t, std::unordered_set<std::string>> result;
-  for (auto &pair : map) {
-    size_t scope_idx = pair.first->GetScopeIdx();
-    auto &var_set = result[scope_idx];
-    for (auto &var : pair.second) {
-      var_set.insert(var);
-    }
-  }
-  return result;
-}
-
-// Check whether the variable is LoDTensor based on static VarDesc info
-static bool IsLoDTensor(VarDesc *var) {
-  return var->Proto()->type().type() == proto::VarType::LOD_TENSOR;
-}
-
-// Get memory size of LoDTensor
-static int64_t GetMemorySize(
-    const std::unordered_map<std::string, std::vector<details::VarHandle *>>
-        &vars,
-    const std::string &var_name) {
-  auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
-  PADDLE_ENFORCE_NOT_NULL(var_desc);
-  PADDLE_ENFORCE(IsLoDTensor(var_desc));
-  auto dims = var_desc->GetShape();
-  return SizeOfType(var_desc->GetDataType()) *
-         std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
-                         std::multiplies<int64_t>());
-}
-
-// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g.
-// SelectedRows, LoDTensorArray)
-// Since partial GC is based on static analysis of memory size of each variable
-// So we should skip SelectedRows and LoDTensorArray here
-static void SplitIntoLoDTensorAndNonLoDTensorVars(
-    const OpToVarNameSetMap &m, const details::GraphVars &vars,
-    OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) {
-  lod_tensors->clear();
-  other_vars->clear();
-
-  for (auto &op_vars_pair : m) {
-    for (auto var_name : op_vars_pair.second) {
-      auto *var_desc = TryGetLatestVarDesc(
-          vars[op_vars_pair.first->GetScopeIdx()].at(var_name));
-      if (IsLoDTensor(var_desc)) {
-        (*lod_tensors)[op_vars_pair.first].insert(var_name);
-      } else {
-        (*other_vars)[op_vars_pair.first].insert(var_name);
-      }
-    }
-  }
-}
-
-struct GCVarInfo {
-  GCVarInfo(const std::string &name, int64_t memory_size,
-            details::ComputationOpHandle *op, size_t scope_idx)
-      : name_(name),
-        memory_size_(memory_size),
-        op_(op),
-        scope_idx_(scope_idx) {}
-
-  std::string name_;     // variable name
-  int64_t memory_size_;  // memory size
-  details::ComputationOpHandle
-      *op_;           // op after which the variable could be deleted
-  size_t scope_idx_;  // scope index where the variable locates
-
-  int64_t AbsMemorySize() const { return std::abs(memory_size_); }
-};
-
-// Delete delete_lod_tensor_only is not used currently
-static OpToVarNameSetMap ShrinkGCVars(
-    const OpToVarNameSetMap &m, const details::GraphVars &vars,
-    const std::vector<platform::Place> &places, double fraction_of_memory_size,
-    bool delete_lod_tensor_only = false) {
-  // Do not perform gc when fraction_of_memory_size = 0
-  if (fraction_of_memory_size <= 0.0) return {};
-
-  /**
-   * Step 1: Split all variables into LoDTensor and Non-LoDTensor.
-   * We can only calculate memory size of LoDTensors
-   */
-  OpToVarNameSetMap lod_tensors, other_vars;
-  SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
-
-  // Perform complete gc when fraction_of_memory_size >= 1
-  if (fraction_of_memory_size >= 1.0) {
-    return delete_lod_tensor_only ? lod_tensors : m;
-  }
-
-  /**
-   * Step 2: build GCVarInfos, and calculate total memory sizes of each device
-   */
-
-  // place -> variable info (name, memory size, place, scope_idx)
-  std::map<platform::Place, std::vector<GCVarInfo>> place_to_vars;
-
-  // place -> total memory sizes
-  std::map<platform::Place, int64_t> place_to_size;
-  for (auto &op_vars_pair : lod_tensors) {
-    auto *op = op_vars_pair.first;
-    auto &var_names = op_vars_pair.second;
-    auto scope_idx = op->GetScopeIdx();
-    auto &place = places[scope_idx];
-
-    for (auto &var_name : var_names) {
-      auto var_size = GetMemorySize(vars[scope_idx], var_name);
-      GCVarInfo var_info(var_name, var_size, op, scope_idx);
-      place_to_size[place] += var_info.AbsMemorySize();
-      place_to_vars[place].emplace_back(std::move(var_info));
-    }
-  }
-
-  /**
-   * Step 3: sort GCVarInfos, and only delete the largest variables.
-   */
-  OpToVarNameSetMap partial_vars;
-  for (auto &place_to_var_pair : place_to_vars) {
-    auto &place = place_to_var_pair.first;
-    auto &gc_vars = place_to_var_pair.second;
-    std::sort(gc_vars.begin(), gc_vars.end(),
-              [](const GCVarInfo &var1, const GCVarInfo &var2) {
-                return var1.AbsMemorySize() > var2.AbsMemorySize();
-              });
-
-    int64_t accumulated_size = 0;
-    int64_t size_threshold =
-        static_cast<int64_t>(fraction_of_memory_size * place_to_size[place]);
-    for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold;
-         ++i) {
-      partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_);
-      accumulated_size += gc_vars[i].AbsMemorySize();
-    }
-  }
-
-  /**
-   * Step 4: Combine other vars (SelectedRows, LoDTensorArray)
-   */
-  if (!delete_lod_tensor_only) {
-    for (auto &op_vars_pair : other_vars) {
-      partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(),
-                                              op_vars_pair.second.end());
-    }
-  }
-
-  return partial_vars;
-}
-
-class EagerDeletionPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override;
-};
-
-void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
-  auto &var_infos = Get<MemOptVarInfoMapList>(kMemOptVarInfoMapList);
-
-  const auto &vars = graph->Get<details::GraphVars>(details::kGraphVars);
-
-  const auto &last_live_ops =
-      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
-  const auto &gcs = Get<GarbageCollectorMap>(kGarbageCollector);
-  const auto &places = Get<std::vector<platform::Place>>(kAllPlaces);
-
-  // a reverse map of last_live_ops
-  //   i.e., last op --> variable names which can be deleted.
-  OpToVarNameSetMap op_vars_map;
-  for (auto &var_ops_map : last_live_ops) {
-    for (auto &var_ops_pair : var_ops_map) {
-      const std::string &var_name = var_ops_pair.first;
-      for (auto *op : var_ops_pair.second.ops()) {
-        op_vars_map[op].insert(var_name);
-      }
-    }
-  }
-
-  double memory_fraction = framework::GetEagerDeletionMemoryFraction();
-
-  op_vars_map = ShrinkGCVars(op_vars_map, vars, places, memory_fraction);
-
-  for (auto &pair : op_vars_map) {
-    auto *op = pair.first;
-    auto &var_names = pair.second;
-
-    auto *eager_deletion_node =
-        graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation);
-
-    std::unordered_set<MemOptVarInfo *> var_info;
-    for (auto &var_name : var_names) {
-      var_info.insert(var_infos[op->GetScopeIdx()].at(var_name).get());
-    }
-
-    auto *eager_deletion_op = new details::EagerDeletionOpHandle(
-        eager_deletion_node, op->GetScope(), op->GetPlace(),
-        std::move(var_info), gcs.at(places[op->GetScopeIdx()]).get());
-
-    auto it = std::find_if(
-        op->Outputs().begin(), op->Outputs().end(),
-        [](details::VarHandleBase *var) {
-          return dynamic_cast<details::DummyVarHandle *>(var) != nullptr;
-        });
-
-    if (it != op->Outputs().end()) {
-      eager_deletion_op->AddInput(*it);
-    } else {
-      auto *dep_var = new details::DummyVarHandle(graph->CreateControlDepVar());
-      graph->Get<details::GraphDepVars>(details::kGraphDepVars)
-          .emplace(dep_var);
-      op->AddOutput(dep_var);
-      eager_deletion_op->AddInput(dep_var);
-    }
-
-    auto *dummy_leaf =
-        new details::DummyVarHandle(graph->CreateControlDepVar());
-    graph->Get<details::GraphDepVars>(details::kGraphDepVars)
-        .emplace(dummy_leaf);
-    eager_deletion_op->AddOutput(dummy_leaf);
-
-    eager_deletion_op->SetDeviceContext(
-        places[op->GetScopeIdx()],
-        platform::DeviceContextPool::Instance().Get(places[op->GetScopeIdx()]));
-  }
-
-  VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction;
-  VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
-
-  if (VLOG_IS_ON(10)) {
-    auto vars_group_by_scope_idx = VarsGroupByScopeIdx(op_vars_map);
-    for (auto &pair : vars_group_by_scope_idx) {
-      VLOG(10) << "Scope " << pair.first << " has " << pair.second.size()
-               << " vars";
-    }
-  }
-
-  auto conditional_block_op_eager_deletion_pass =
-      ir::PassRegistry::Instance().Get(
-          "conditional_block_op_eager_deletion_pass");
-  conditional_block_op_eager_deletion_pass->Apply(graph);
-
-  auto while_op_eager_deletion_pass =
-      ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
-  while_op_eager_deletion_pass->Apply(graph);
-
-  auto recurrent_op_eager_deletion_pass =
-      ir::PassRegistry::Instance().Get("recurrent_op_eager_deletion_pass");
-  recurrent_op_eager_deletion_pass->Apply(graph);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(eager_deletion_pass, paddle::framework::ir::EagerDeletionPass)
-    .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
-    .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars)
-    .RequirePassAttr(paddle::framework::ir::kAllPlaces)
-    .RequirePassAttr(paddle::framework::ir::kGarbageCollector);
-
-USE_PASS(conditional_block_op_eager_deletion_pass);
-USE_PASS(while_op_eager_deletion_pass);
-USE_PASS(recurrent_op_eager_deletion_pass);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
deleted file mode 100644
index 4f6bacecab4aac39b6f4cb01138560ca8378c13a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <atomic>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class MemOptVarInfo {
- public:
-  MemOptVarInfo(const std::string &name, size_t ref_cnt) : name_(name) {
-    SetRefCnt(ref_cnt);
-  }
-
-  bool DecreaseRefCnt() {
-    return ref_cnt_ == 1 || (runtime_ref_cnt_.fetch_sub(1) == 1);
-  }
-
-  void ResetRuntimeRefCnt() {
-    if (ref_cnt_ != 1) {
-      runtime_ref_cnt_ = ref_cnt_;
-    }
-  }
-
-  void SetRefCnt(size_t ref_cnt) {
-    PADDLE_ENFORCE_GE(ref_cnt, 1,
-                      "Reference count must be larger than or equal to 1");
-    ref_cnt_ = ref_cnt;
-    runtime_ref_cnt_ = ref_cnt;
-  }
-
-  // Skip all memory optimization, including memory reuse and garbage collection
-  void SetSkipAllMemoryOptimization(bool is_skipped) {
-    skip_all_memory_optimization_ = is_skipped;
-  }
-
-  bool IsSkippedAllMemoryOptimization() const {
-    return skip_all_memory_optimization_;
-  }
-
-  // Skip all memory reuse, including inplace and cross op memory reuse
-  void SetSkipMemoryReuse(bool is_skipped) { skip_memory_reuse_ = is_skipped; }
-
-  bool IsSkippedMemoryReuse() const {
-    return skip_memory_reuse_ || skip_all_memory_optimization_;
-  }
-
-  const std::string &Name() const { return name_; }
-
- private:
-  std::string name_;
-
-  /**
-   * ref_cnt_ is the total number of last-lived ops of variable. It would not
-   * be changed during iterations.
-   *
-   * runtime_ref_cnt_ is the runtime reference count of variable, which would
-   * decrease 1 when each EagerDeletionOpHandle runs. As a result, it should
-   * be reset to ref_cnt_ after each iteration ends. Since operators are
-   * scheduled in many threads inside ParallelExecutor, runtime_ref_cnt_
-   * must be an atomic integer to guarantee the thread safety and visibility.
-   *
-   * Speciallly, if ref_cnt_ is 1, we do not need to reset runtime_ref_cnt_
-   * after iteration ends.
-   */
-  size_t ref_cnt_;
-  std::atomic<size_t> runtime_ref_cnt_;
-  bool skip_memory_reuse_{false};
-  bool skip_all_memory_optimization_{false};
-};
-
-using MemOptVarInfoMapList = std::vector<
-    std::unordered_map<std::string, std::shared_ptr<MemOptVarInfo>>>;
-
-class SkipMemOptVarsGuard {
- public:
-  SkipMemOptVarsGuard(MemOptVarInfoMapList *list,
-                      const std::vector<std::string> &vars,
-                      bool need_reset_ref_cnt)
-      : list_(list), need_reset_ref_cnt_(need_reset_ref_cnt) {
-    if (!list_) return;
-
-    skip_vars_.reserve(vars.size() * list->size());
-    for (auto &var : vars) {
-      for (auto &map : *list_) {
-        auto iter = map.find(var);
-        if (iter != map.end() &&
-            !iter->second->IsSkippedAllMemoryOptimization()) {
-          iter->second->SetSkipAllMemoryOptimization(true);
-          skip_vars_.emplace_back(iter->second.get());
-        }
-      }
-    }
-  }
-
-  ~SkipMemOptVarsGuard() {
-    for (auto *var : skip_vars_) {
-      var->SetSkipAllMemoryOptimization(false);
-    }
-
-    if (list_ && need_reset_ref_cnt_) {
-      for (auto &map : *list_) {
-        for (auto &pair : map) {
-          pair.second->ResetRuntimeRefCnt();
-        }
-      }
-    }
-  }
-
- private:
-  MemOptVarInfoMapList *list_;
-  bool need_reset_ref_cnt_;
-  std::vector<MemOptVarInfo *> skip_vars_;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
deleted file mode 100644
index 20c7968d6ac56054e31c4f6f51e72e7ae02bea57..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
+++ /dev/null
@@ -1,370 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
-#include <functional>
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void MemoryReusePass::ApplyImpl(Graph *graph) const {
-  graph_ = graph;
-  use_cuda_ = Get<bool>(kUseCuda);
-  all_vars_ = &(graph_->Get<details::GraphVars>(details::kGraphVars));
-  var_infos_ = &(Get<MemOptVarInfoMapList>(kMemOptVarInfoMapList));
-  last_live_ops_of_vars_ =
-      &(Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars));
-
-  reused_in_var_names_.resize(all_vars_->size());
-  reused_out_var_names_.resize(all_vars_->size());
-  var_descs_.resize(all_vars_->size());
-
-  pinned_var_set_ = nullptr;
-  if (graph->Has(details::kPinnedVars)) {
-    pinned_var_set_ = &graph->Get<details::PinnedVars>(details::kPinnedVars);
-  }
-
-  // Collect the existing ShareTensorBufferOpHandles.
-  // This is because (1) we want to reuse the existing
-  // ShareTensorBufferOpHandles to avoid inserting too many ops;
-  // (2) what is more important, a variable cannot be reused
-  // by two different variables, which may cause wrong calculation
-  // results. We have to know which variables have been reused.
-  CollectShareTensorBufferOpHandles();
-  CollectReusedVars();
-  Run(graph);
-
-  std::map<size_t, size_t> op_num;
-  for (auto &pair : ops_) {
-    ++op_num[pair.first->GetScopeIdx()];
-  }
-
-  for (auto &pair : op_num) {
-    VLOG(2) << "Create " << pair.second
-            << " ShareTensorBufferOpHandles in Scope " << pair.first;
-  }
-}
-
-bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var,
-                                  details::VarHandle *out_var) const {
-  auto *op =
-      dynamic_cast<details::ComputationOpHandle *>(out_var->GeneratedOp());
-  PADDLE_ENFORCE_NOT_NULL(op);
-  if (IsVarPairReusable(*in_var, *out_var)) {
-    AddReuseVar(op, in_var, out_var);
-    return true;
-  } else {
-    return false;
-  }
-}
-
-std::unordered_set<Node *> MemoryReusePass::FindNodesByName(
-    const std::string &name, const std::vector<Node *> &nodes) const {
-  std::unordered_set<ir::Node *> ret;
-  for (auto *node : nodes) {
-    if (node->Name() == name) {
-      ret.insert(node);
-    }
-  }
-  return ret;
-}
-
-VarDesc *MemoryReusePass::GetVarDesc(const details::VarHandle &var) const {
-  const auto var_name = var.Name();
-  size_t scope_idx = var.scope_idx();
-  auto iter = var_descs_[scope_idx].find(var_name);
-  if (iter == var_descs_[scope_idx].end()) {
-    PADDLE_ENFORCE((*all_vars_)[scope_idx].count(var_name),
-                   "Variable %s not found", var_name);
-    auto *desc = TryGetLatestVarDesc((*all_vars_)[scope_idx].at(var_name));
-    PADDLE_ENFORCE_NOT_NULL(desc);
-    var_descs_[scope_idx].emplace(var_name, desc);
-    return desc;
-  } else {
-    return iter->second;
-  }
-}
-
-int64_t MemoryReusePass::GetMemorySize(const details::VarHandle &var) const {
-  auto *var_desc = GetVarDesc(var);
-  auto shapes = var_desc->GetShape();
-  auto sizeof_dtype = static_cast<int64_t>(SizeOfType(var_desc->GetDataType()));
-  return std::accumulate(shapes.begin(), shapes.end(), static_cast<int64_t>(1),
-                         std::multiplies<int64_t>()) *
-         sizeof_dtype;
-}
-
-void MemoryReusePass::CollectShareTensorBufferOpHandles() const {
-  auto all_ops = FilterByNodeWrapper<details::OpHandleBase>(*graph_);
-  for (auto *op : all_ops) {
-    auto *share_buffer_op =
-        dynamic_cast<details::ShareTensorBufferOpHandle *>(op);
-    if (share_buffer_op != nullptr) {
-      auto *compute_op =
-          details::GetUniquePendingComputationOpHandle(share_buffer_op);
-      PADDLE_ENFORCE(ops_.count(compute_op) == 0);
-      ops_.emplace(compute_op, share_buffer_op);
-    }
-  }
-}
-
-void MemoryReusePass::CollectReusedVars() const {
-  for (auto &pair : ops_) {
-    auto reused_vars = pair.second->ReusedVars();
-    for (auto &reused_var_pair : reused_vars) {
-      reused_in_var_names_[pair.first->GetScopeIdx()].insert(
-          reused_var_pair.first);
-      reused_out_var_names_[pair.first->GetScopeIdx()].insert(
-          reused_var_pair.second);
-    }
-  }
-}
-
-bool MemoryReusePass::IsInVarAlreadyReused(
-    const details::VarHandle &in_var) const {
-  const auto var_name = in_var.Name();
-  size_t scope_idx = in_var.scope_idx();
-  return reused_in_var_names_[scope_idx].count(var_name) > 0;
-}
-
-bool MemoryReusePass::IsOutVarAlreadyReused(
-    const details::VarHandle &out_var) const {
-  const auto var_name = out_var.Name();
-  size_t scope_idx = out_var.scope_idx();
-  return reused_out_var_names_[scope_idx].count(var_name) > 0;
-}
-
-details::ShareTensorBufferOpHandle *
-MemoryReusePass::InsertShareTensorBufferOpHandleToGraph(
-    details::ComputationOpHandle *op) const {
-  auto *buffer_share_node =
-      graph_->CreateEmptyNode("buffer_share", ir::Node::Type::kOperation);
-
-  auto *buffer_share_op = new details::ShareTensorBufferOpHandle(
-      buffer_share_node, op->GetScope(), op->GetScopeIdx(), op->GetOp()->Type(),
-      {}, {});
-
-  buffer_share_op->SetDeviceContext(
-      op->GetPlace(),
-      platform::DeviceContextPool::Instance().Get(op->GetPlace()));
-
-  // Inputs of `buffer_share_op` should be all inputs of `op`
-  for (auto *in_var : op->Inputs()) {
-    buffer_share_op->AddInput(in_var);
-  }
-
-  // Add a dep_var to resolve write-after-write data hazard between
-  // `buffer_share_op` and `op`.
-  auto *dep_var = new details::DummyVarHandle(graph_->CreateControlDepVar());
-  graph_->Get<details::GraphDepVars>(details::kGraphDepVars).emplace(dep_var);
-  op->AddInput(dep_var);
-  buffer_share_op->AddOutput(dep_var);
-
-  ops_.emplace(op, buffer_share_op);
-  return buffer_share_op;
-}
-
-/**
- * Input var is reusable only when:
- *  - it is not an empty var.
- *  - it has not been reused. If an input var is reused twice or more,
- *    the calculation result may be wrong.
- *  - it is not a persistable var.
- *  - it is LoDTensor. We can support SelectedRows in the future.
- */
-bool MemoryReusePass::IsInVarReusable(const details::VarHandle &in_var) const {
-  if (in_var.Name() == kEmptyVarName) {
-    return false;
-  }
-
-  if (IsInVarAlreadyReused(in_var)) {
-    return false;
-  }
-
-  const VarDesc *in_var_desc = GetVarDesc(in_var);
-
-  if (IsPinnedVar(*in_var_desc)) {
-    return false;
-  }
-
-  if (in_var_desc->GetType() != proto::VarType::LOD_TENSOR) {
-    return false;
-  }
-
-  return true;
-}
-
-/**
- * Output var is reusable only when:
- *  - it is not an empty var.
- *  - it is the first version var. Otherwise, the var may be overwritten
- *    in the second batch, which results in wrong calculation result.
- *    It is critical especially when
- *    ExecutionStrategy::num_iteration_per_drop_scope_ > 1.
- *  - it has not reused other var's memory. It is not necessary to do memory
- *    reuse twice for the same var.
- *  - it is not a persistable var.
- *  - it is LoDTensor. We can support SelectedRows in the future.
- *  - it does not occur in inputs of the generated op. It would happen when
- *    op has the same var as both input and output.
- */
-bool MemoryReusePass::IsOutVarReusable(
-    const details::VarHandle &out_var) const {
-  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<const details::ComputationOpHandle *>(
-      out_var.GeneratedOp()));
-  const auto out_name = out_var.Name();
-  if (out_name == kEmptyVarName) {
-    return false;
-  }
-
-  // out_var must be the first version!!!
-  auto out_var_iter = (*all_vars_)[out_var.scope_idx()].find(out_name);
-  PADDLE_ENFORCE(out_var_iter != (*all_vars_)[out_var.scope_idx()].end() &&
-                     !out_var_iter->second.empty(),
-                 "Cannot find variable %s", out_name);
-
-  if (out_var_iter->second[0] != &out_var) {
-    return false;
-  }
-
-  if (IsOutVarAlreadyReused(out_var)) {
-    return false;
-  }
-
-  const VarDesc *out_var_desc = GetVarDesc(out_var);
-  if (IsPinnedVar(*out_var_desc)) {
-    return false;
-  }
-
-  if (out_var_desc->GetType() != proto::VarType::LOD_TENSOR) {
-    return false;
-  }
-
-  // If out_name occurs in input of the generated op, it cannot reuse others.
-  if (!FindNodesByName(out_name, out_var.GeneratedOp()->Node()->inputs)
-           .empty()) {
-    return false;
-  }
-
-  return true;
-}
-
-bool MemoryReusePass::IsPinnedVar(const VarDesc &var_desc) const {
-  return var_desc.Persistable() ||
-         (pinned_var_set_ && pinned_var_set_->count(var_desc.Name()));
-}
-
-/**
- * Input-Output pair can be reused only when:
- *  - they are not the same var.
- *  - they are both reusable.
- *  - input var does not occur in output of op.
- *  - input var does not occur in input of op for multiple times.
- */
-bool MemoryReusePass::IsVarPairReusable(
-    const details::VarHandle &in_var, const details::VarHandle &out_var) const {
-  auto *op =
-      dynamic_cast<const details::ComputationOpHandle *>(out_var.GeneratedOp());
-  PADDLE_ENFORCE_NOT_NULL(op);
-
-  const auto in_name = in_var.Name();
-  if (in_name == out_var.Name()) {
-    return false;
-  }
-
-  if (!IsInVarReusable(in_var) || !IsOutVarReusable(out_var)) {
-    return false;
-  }
-
-  if (!FindNodesByName(in_name, op->Node()->outputs).empty()) {
-    return false;
-  }
-
-  auto all_input_args = op->Node()->Op()->InputArgumentNames();
-  if (std::count(all_input_args.begin(), all_input_args.end(), in_name) > 1) {
-    return false;
-  }
-
-  return true;
-}
-
-void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
-                                  details::VarHandle *in_var,
-                                  details::VarHandle *out_var) const {
-  PADDLE_ENFORCE((*var_infos_)[op->GetScopeIdx()].count(in_var->Name()) > 0,
-                 "%s does not in mem-opt var infos", in_var->Name());
-
-  if (ops_.count(op) == 0) {
-    InsertShareTensorBufferOpHandleToGraph(op);
-  }
-
-  auto *share_buffer_op = ops_[op];
-
-  auto &all_input_vars = share_buffer_op->Inputs();
-  bool has_input = std::find(all_input_vars.begin(), all_input_vars.end(),
-                             in_var) != all_input_vars.end();
-
-  if (!has_input) {
-    share_buffer_op->AddInput(in_var);
-  }
-
-  share_buffer_op->AddReuseVarPair(
-      (*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(),
-      out_var->Name());
-  reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name());
-  reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name());
-
-  UpdateLastLiveOpOfVar(op, in_var, out_var);
-}
-
-// 1. Set last living op of in_var to be any last living op of out_var
-// 2. Set reference count of in_var to be 1
-void MemoryReusePass::UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
-                                            details::VarHandle *in_var,
-                                            details::VarHandle *out_var) const {
-  size_t scope_idx = op->GetScopeIdx();
-  auto out_var_op_iter =
-      (*last_live_ops_of_vars_)[scope_idx].find(out_var->Name());
-
-  // In Reduce mode, some output variable(gradient of parameter) does not have
-  // last live ops
-  details::ComputationOpHandle *last_live_op_of_in_var = nullptr;
-  if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) {
-    last_live_op_of_in_var = op;
-  } else {
-    PADDLE_ENFORCE(!out_var_op_iter->second.ops().empty());
-    last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin());
-  }
-
-  auto *last_live_ops_of_in_var =
-      (*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops();
-  last_live_ops_of_in_var->clear();
-  last_live_ops_of_in_var->insert(last_live_op_of_in_var);
-
-  auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name());
-  PADDLE_ENFORCE(in_var_info_iter != (*var_infos_)[scope_idx].end(),
-                 "Cannot find variable %s", in_var->Name());
-
-  in_var_info_iter->second->SetRefCnt(1);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
deleted file mode 100644
index 822744191847586dc429b6896ff6f490381c5901..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
+++ /dev/null
@@ -1,143 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * MemoryReusePass is the base class of InplacePass and MemoryOptimizePass.
- *
- * Unlike the legacy Python API fluid.memory_optimize() which changes
- * variable names in the program/graph, MemoryReusePass inserts
- * ShareTensorBufferOpHandle into the graph. It is because if we use the
- * way of changing variable names:
- *
- * 1. There are so many corner cases we should skip. For example, (1) variables
- *    that relates to send/recv ops cannot be renamed (otherwise, pserver
- *    and trainer cannot find the matching variables), (2) ins/outs of ops
- *    containing sub-blocks cannot be optimized, (3) variables inside
- *    op_role_vars cannot be renamed.
- *
- * 2. It is very difficult to avoid reusing variables that users want to fetch.
- *    This is because the memory-optimize passes/transpiler runs before users
- *    fetch, i.e., exe.run(...). We cannot know what users want to fetch in the
- *    future. As a result, we have to set var.persistable = True before
- *    applying memory-optimize passes/transpiler, which is rather ugly and not
- *    friendly to users.
- *
- * 3. Dim and LoD of the reused variable would be changed, which may result
- *    in potential errors in InferShape stage of the following ops. What's
- *    more, it makes that we cannot use the information from
- *    NoNeedBufferVarsInference.
- *
- * Considering the drawbacks of the former renaming strategy, we design a
- * novel memory-optimize pass to fix these issues. Whether in-place is
- * performed can be decided during run-time. ShareTensorBufferOpHandle
- * would only share tensor buffers between in/out, never rename variable,
- * and not change dim and LoD of variable. If users want to fetch a certain
- * variable, we can skip in-place during run-time.
- *
- * The only concern on speed performance may be: there are too many
- * ShareTensorBufferOpHandles in the graph. This can be avoided by moving
- * tensor buffer sharing in each ComputationOpHandle::Run() method. We need
- * a pass to clean all ShareTensorBufferOpHandles and move sharing to
- * ComputationOpHandle::Run() in the future.
- */
-class MemoryReusePass : public Pass {
- protected:
-  void ApplyImpl(Graph *graph) const final;
-
-  virtual void Run(Graph *graph) const = 0;
-
-  virtual std::string ReuseType() const = 0;
-
-  bool TryReuseVar(details::VarHandle *in_var,
-                   details::VarHandle *out_var) const;
-
-  bool IsInVarReusable(const details::VarHandle &in_var) const;
-
-  bool IsOutVarReusable(const details::VarHandle &out_var) const;
-
-  std::unordered_set<Node *> FindNodesByName(
-      const std::string &name, const std::vector<Node *> &nodes) const;
-
-  size_t ScopeNum() const { return all_vars_->size(); }
-
-  int64_t GetMemorySize(const details::VarHandle &var) const;
-
- private:
-  VarDesc *GetVarDesc(const details::VarHandle &var) const;
-
-  bool IsVarPairReusable(const details::VarHandle &in_var,
-                         const details::VarHandle &out_var) const;
-
-  bool IsInVarAlreadyReused(const details::VarHandle &in_var) const;
-
-  bool IsOutVarAlreadyReused(const details::VarHandle &out_var) const;
-
-  details::ShareTensorBufferOpHandle *InsertShareTensorBufferOpHandleToGraph(
-      details::ComputationOpHandle *op) const;
-
-  void CollectShareTensorBufferOpHandles() const;
-
-  void CollectReusedVars() const;
-
-  void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
-                   details::VarHandle *out_var) const;
-
-  void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
-                             details::VarHandle *in_var,
-                             details::VarHandle *out_var) const;
-
- private:
-  mutable Graph *graph_;
-  mutable bool use_cuda_;
-
-  mutable details::GraphVars *all_vars_;
-  mutable MemOptVarInfoMapList *var_infos_;
-
-  mutable std::vector<LastLiveOpsOfVars> *last_live_ops_of_vars_;
-
-  mutable std::unordered_map<details::ComputationOpHandle *,
-                             details::ShareTensorBufferOpHandle *>
-      ops_;
-
-  mutable std::vector<std::unordered_set<std::string>> reused_in_var_names_;
-  mutable std::vector<std::unordered_set<std::string>> reused_out_var_names_;
-
-  mutable std::vector<std::unordered_map<std::string, VarDesc *>> var_descs_;
-  mutable details::PinnedVars *pinned_var_set_;
-
-  bool IsPinnedVar(const VarDesc &out_var_desc) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
deleted file mode 100644
index d2cc89a2b49d8a6cace230e79ccb2e5f096dc53c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
-#include <queue>
-#include <utility>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-OpGraphView::OpGraphView(const std::vector<details::OpHandleBase *> &ops) {
-  Build(ops);
-}
-
-void OpGraphView::Build(const std::vector<details::OpHandleBase *> &ops) {
-  preceding_ops_.clear();
-  pending_ops_.clear();
-  for (auto &op : ops) {
-    preceding_ops_[op];
-    pending_ops_[op];
-    for (auto &var : op->Outputs()) {
-      for (auto &pending_op : var->PendingOps()) {
-        preceding_ops_[pending_op].insert(op);
-        pending_ops_[op].insert(pending_op);
-      }
-    }
-  }
-  PADDLE_ENFORCE(
-      preceding_ops_.size() == ops.size() && pending_ops_.size() == ops.size(),
-      "There are duplicate ops in graph.");
-}
-
-std::unordered_set<details::OpHandleBase *> OpGraphView::AllOps() const {
-  std::unordered_set<details::OpHandleBase *> ret;
-  ret.reserve(preceding_ops_.size());
-  for (auto &pair : preceding_ops_) {
-    ret.insert(pair.first);
-  }
-  return ret;
-}
-
-bool OpGraphView::HasOp(details::OpHandleBase *op) const {
-  return preceding_ops_.count(op) != 0;
-}
-
-void OpGraphView::EnforceHasOp(details::OpHandleBase *op) const {
-  PADDLE_ENFORCE(HasOp(op), "Cannot find op %s in OpGraphView",
-                 op == nullptr ? "nullptr" : op->DebugString());
-}
-
-const std::unordered_set<details::OpHandleBase *> &OpGraphView::PendingOps(
-    details::OpHandleBase *op) const {
-  EnforceHasOp(op);
-  return pending_ops_.at(op);
-}
-
-const std::unordered_set<details::OpHandleBase *> &OpGraphView::PrecedingOps(
-    details::OpHandleBase *op) const {
-  EnforceHasOp(op);
-  return preceding_ops_.at(op);
-}
-
-std::unordered_map<details::OpHandleBase *, size_t>
-OpGraphView::GetPrecedingDepNum() const {
-  std::unordered_map<details::OpHandleBase *, size_t> result;
-  result.reserve(preceding_ops_.size());
-  for (auto &pair : preceding_ops_) {
-    result.emplace(pair.first, pair.second.size());
-  }
-  return result;
-}
-
-size_t OpGraphView::OpNumber() const { return preceding_ops_.size(); }
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
deleted file mode 100644
index 86b25c13959a7934b9838085a0a92a62e4ac821c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <queue>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/details/op_handle_base.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class OpGraphView {
- public:
-  explicit OpGraphView(const std::vector<details::OpHandleBase *> &ops);
-
-  std::unordered_set<details::OpHandleBase *> AllOps() const;
-
-  const std::unordered_set<details::OpHandleBase *> &PendingOps(
-      details::OpHandleBase *op) const;
-
-  const std::unordered_set<details::OpHandleBase *> &PrecedingOps(
-      details::OpHandleBase *op) const;
-
-  std::unordered_map<details::OpHandleBase *, size_t> GetPrecedingDepNum()
-      const;
-
-  bool HasOp(details::OpHandleBase *op) const;
-
-  size_t OpNumber() const;
-
-  // Use a visitor to visit all pending ops of op
-  // Stop when callback returns false
-  template <typename Callback>
-  bool VisitAllPendingOps(details::OpHandleBase *op, Callback &&callback) const;
-
-  template <typename Callback>
-  void BreadthFirstVisit(Callback &&callback) const;
-
- private:
-  void Build(const std::vector<details::OpHandleBase *> &ops);
-  void EnforceHasOp(details::OpHandleBase *op) const;
-
-  std::unordered_map<details::OpHandleBase *,
-                     std::unordered_set<details::OpHandleBase *>>
-      preceding_ops_;
-  std::unordered_map<details::OpHandleBase *,
-                     std::unordered_set<details::OpHandleBase *>>
-      pending_ops_;
-};
-
-template <typename Callback>
-bool OpGraphView::VisitAllPendingOps(details::OpHandleBase *op,
-                                     Callback &&callback) const {
-  EnforceHasOp(op);
-  std::unordered_set<details::OpHandleBase *> visited;
-  std::queue<details::OpHandleBase *> q;
-  q.push(op);
-  while (!q.empty()) {
-    op = q.front();
-    q.pop();
-    for (auto &pending_op : pending_ops_.at(op)) {
-      if (visited.count(pending_op) == 0) {
-        visited.insert(pending_op);
-        if (!callback(pending_op)) {
-          return false;
-        }
-        q.push(pending_op);
-      }
-    }
-  }
-  return true;
-}
-
-template <typename Callback>
-void OpGraphView::BreadthFirstVisit(Callback &&callback) const {
-  auto op_deps = GetPrecedingDepNum();
-  size_t op_num = op_deps.size();
-
-  std::unordered_set<details::OpHandleBase *> visited_ops;
-  std::queue<details::OpHandleBase *> ready_ops;
-  size_t num_calls = 0;
-  for (auto iter = op_deps.begin(); iter != op_deps.end();) {
-    if (iter->second != 0) {
-      ++iter;
-      continue;
-    }
-
-    visited_ops.insert(iter->first);
-    ready_ops.push(iter->first);
-    callback(iter->first);
-    ++num_calls;
-    op_deps.erase(iter++);
-  }
-
-  while (!ready_ops.empty()) {
-    auto *cur_op = ready_ops.front();
-    ready_ops.pop();
-
-    auto &pending_ops = PendingOps(cur_op);
-    for (auto *pending_op : pending_ops) {
-      if (visited_ops.count(pending_op) > 0) {
-        continue;
-      }
-
-      if (--op_deps.at(pending_op) == 0) {
-        visited_ops.insert(pending_op);
-        op_deps.erase(pending_op);
-        ready_ops.push(pending_op);
-        callback(pending_op);
-        ++num_calls;
-      }
-    }
-  }
-
-  PADDLE_ENFORCE_EQ(num_calls, op_num, "There are unvisited ops");
-  PADDLE_ENFORCE_EQ(visited_ops.size(), op_num, "There are unvisited ops");
-  PADDLE_ENFORCE(op_deps.empty(), "There are unvisited ops");
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
deleted file mode 100644
index 6077069ea747a60b5989c5da373536e6654b2b74..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h"
-
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/string/string_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-using paddle::operators::OpVariant;
-using paddle::operators::OpVariantSet;
-using paddle::operators::OpAndGradOpPair;
-
-void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const {
-  // Find all recurrent_op and recurrent_grad_op in graph
-  // Note the graph only contains ops and block 0
-  std::unordered_map<size_t, OpAndGradOpPair> target_ops =
-      DeviceIdToRecurrentAndRecurrentGradOp(*graph);
-
-  for (auto &entry : target_ops) {
-    // Prepare safe eager deletion on different devices because the garbage
-    // collection may be different across devices
-    OpAndGradOpPair &op_pair = entry.second;
-    PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
-        graph->OriginProgram(), &op_pair);
-  }
-}
-
-// Returns a std::unordered_map mapping from the device id to recurrent op and
-// grad op pair
-std::unordered_map<size_t, OpAndGradOpPair>
-RecurrentOpEagerDeletionPass::DeviceIdToRecurrentAndRecurrentGradOp(
-    const Graph &graph) const {
-  std::unordered_map<size_t, OpAndGradOpPair> ret;
-  std::vector<details::OpHandleBase *> all_ops =
-      FilterByNodeWrapper<details::OpHandleBase>(graph);
-
-  for (auto *op : all_ops) {
-    auto compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
-    if (compute_op == nullptr) continue;
-
-    if (compute_op->Name() == "recurrent") {
-      // GetScopeIdx() returns device/place id
-      ret[compute_op->GetScopeIdx()].first.emplace(compute_op->GetOp());
-    } else if (compute_op->Name() == "recurrent_grad") {
-      // GetScopeIdx() returns device/place id
-      ret[compute_op->GetScopeIdx()].second.emplace(compute_op->GetOp());
-    }
-  }
-  return ret;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(recurrent_op_eager_deletion_pass,
-              paddle::framework::ir::RecurrentOpEagerDeletionPass);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h
deleted file mode 100644
index 9c39a9faf23ae7014eedc087222b27ed0c02165a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <unordered_map>
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/operators/controlflow/op_variant.h"
-#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// Pass class set skip eager deletion vars for recurrent ops
-class RecurrentOpEagerDeletionPass : public Pass {
- protected:
-  void ApplyImpl(Graph *graph) const override;
-
- private:
-  // Returns a std::unordered_map mapping from the device id to recurrent op and
-  // grad op pair
-  std::unordered_map<size_t, paddle::operators::OpAndGradOpPair>
-  DeviceIdToRecurrentAndRecurrentGradOp(const Graph &graph) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc
deleted file mode 100644
index cc26f7f96b278fb75625f71bae75dbf44639671f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc
+++ /dev/null
@@ -1,398 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <queue>
-#include <string>
-#include <type_traits>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class ReferenceCountPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override;
-};
-
-// A functor to shrink/remove operators who depend on other operators in a set
-class ShrinkDepsOpFunctor {
- private:
-  enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 };
-
- public:
-  explicit ShrinkDepsOpFunctor(
-      const std::vector<details::OpHandleBase *> &all_ops)
-      : graph_(all_ops) {}
-
-  template <typename OpSet>
-  OpSet operator()(const OpSet &op_set) const {
-    using KeyType = typename OpSet::key_type;
-    static_assert(
-        std::is_base_of<details::OpHandleBase,
-                        typename std::remove_pointer<KeyType>::type>::value,
-        "Key type of OpSet must be details::OpHandleBase, or derived of "
-        "details::OpHandleBase");
-
-    if (op_set.size() <= 1) return op_set;
-    std::vector<details::OpHandleBase *> ops(op_set.begin(), op_set.end());
-    OpSet ret;
-    auto rels = GetRelations(ops);
-    auto not_before = [](RelationShip r) { return r != kBefore; };
-    for (size_t i = 0; i < rels.size(); ++i) {
-      if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) {
-        ret.emplace(static_cast<KeyType>(ops[i]));
-      }
-    }
-    return ret;
-  }
-
- private:
-  std::vector<std::vector<RelationShip>> GetRelations(
-      const std::vector<details::OpHandleBase *> &ops) const {
-    std::unordered_map<details::OpHandleBase *, size_t> op_to_idx;
-    for (size_t i = 0; i < ops.size(); ++i) {
-      PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph");
-      op_to_idx[ops[i]] = i;
-    }
-
-    PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops");
-
-    std::vector<std::vector<RelationShip>> ret(ops.size());
-    for (auto &e : ret) {
-      e.assign(ops.size(), kSame);
-    }
-
-    size_t found_num = ops.size();
-    size_t total_num = ops.size() * ops.size();
-    auto visitor = [&](details::OpHandleBase *op, size_t i) {
-      auto it = op_to_idx.find(op);
-      if (it != op_to_idx.end()) {
-        size_t j = it->second;
-        if (i != j && ret[i][j] == kSame) {
-          ret[i][j] = kBefore;
-          ret[j][i] = kAfter;
-          found_num += 2;
-          if (found_num == total_num) {
-            return false;
-          }
-        }
-      }
-      return true;
-    };
-
-    for (size_t i = 0; i < ops.size(); ++i) {
-      auto sub_visitor = [&, i](details::OpHandleBase *op) {
-        return visitor(op, i);
-      };
-      if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) {
-        break;
-      }
-    }
-
-    for (size_t i = 0; i < ops.size(); ++i) {
-      for (size_t j = i + 1; j < ops.size(); ++j) {
-        if (ret[i][j] != kSame) continue;
-        ret[i][j] = kNoDeps;
-        ret[j][i] = kNoDeps;
-      }
-    }
-
-    return ret;
-  }
-
-  const OpGraphView graph_;
-};
-
-/**
- * Shrink op dependencies according to no need buffer vars.
- *
- * If some ops do not need Tensor buffer of any input,
- * just remove the dependency of this op, i.e, decrease reference count.
- *
- * For example, input Y of elementwise_add_grad op is only used to infer shape
- * and lod of Y@GRAD, we do not need the buffer of input Y. Data buffer of
- * input Y can be collected before elementwise_add_grad op runs.
- *
- * This method returns whether the dependency count decreases to 0, and
- * shrinks op dependency if possible.
- */
-static bool ShrinkNoNeedBufferVarOpDependency(
-    const std::string &var_name,
-    std::unordered_set<details::ComputationOpHandle *> *op_handles) {
-  std::vector<details::ComputationOpHandle *> skip_ops;
-  for (auto *op_handle : *op_handles) {
-    auto *op_base = op_handle->GetOp();
-    auto &inferer = op_base->Info().NoNeedBufferVarsInferer();
-    if (!inferer) {
-      continue;
-    }
-
-    std::unordered_set<std::string> no_need_buffer_vars =
-        inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs());
-
-    // Check whether var_name occurs in other inputs or outputs of the op
-    // If it occurs, we cannot decrease the dependency number.
-    bool occurred_in_other_vars = false;
-    for (auto &in_pair : op_base->Inputs()) {
-      if (no_need_buffer_vars.count(in_pair.first) > 0) {
-        continue;
-      }
-
-      auto &args = in_pair.second;
-      auto iter = std::find(args.begin(), args.end(), var_name);
-      if (iter != args.end()) {
-        occurred_in_other_vars = true;
-        break;
-      }
-    }
-
-    if (occurred_in_other_vars) {
-      continue;
-    }
-
-    for (auto &out_pair : op_base->Outputs()) {
-      auto &args = out_pair.second;
-      auto iter = std::find(args.begin(), args.end(), var_name);
-      if (iter != args.end()) {
-        occurred_in_other_vars = true;
-        break;
-      }
-    }
-
-    if (!occurred_in_other_vars) {
-      VLOG(2) << "Shrink var " << var_name << " in op " << op_handle->Name();
-      skip_ops.emplace_back(op_handle);
-    }
-  }
-
-  if (skip_ops.size() == op_handles->size()) {
-    op_handles->clear();
-    return true;
-  } else {
-    for (auto *skip_op : skip_ops) {
-      op_handles->erase(skip_op);
-    }
-    return false;
-  }
-}
-
-/**
- * Find the nearest downstream computation op handle. If the op is a
- * computation op, just return itself.
- */
-static details::ComputationOpHandle *FindNextComputationOpHandleOrReturnItself(
-    details::OpHandleBase *op, size_t scope_idx) {
-  std::queue<details::OpHandleBase *> q;
-  std::unordered_set<details::OpHandleBase *> visited;
-  q.push(op);
-  while (!q.empty()) {
-    auto *op = q.front();
-    q.pop();
-    auto *compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
-    if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) {
-      return compute_op;
-    }
-    for (auto *out_var : op->Outputs()) {
-      for (auto *pending_op : out_var->PendingOps()) {
-        if (visited.count(pending_op)) continue;
-        visited.insert(pending_op);
-        q.push(pending_op);
-      }
-    }
-  }
-  return nullptr;
-}
-
-enum LastLiveOpSearchStatus { kSuccess, kFailure, kShouldPrecede };
-
-static std::unordered_set<details::ComputationOpHandle *>
-ExtractComputationOpFromLastLivedVar(details::VarHandle *var, size_t scope_idx,
-                                     const std::string &var_name,
-                                     const ShrinkDepsOpFunctor &shrink_func,
-                                     LastLiveOpSearchStatus *status) {
-  // stage one. Get last op for variable.
-  std::unordered_set<details::OpHandleBase *> candidates;
-  {
-    if (var->PendingOps().empty() && var->GeneratedOp()) {
-      // No operator depends on this variable. So the last operator is the op
-      // who generates this variable.
-      candidates.emplace(var->GeneratedOp());
-    } else {
-      candidates = var->PendingOps();
-    }
-
-    // No pending ops or generated op is nullptr
-    if (candidates.empty()) {
-      *status = LastLiveOpSearchStatus::kFailure;
-      return {};
-    }
-  }
-
-  // stage two. Try to cast them to computation op.
-  // return (*status=kFailure) when failed.
-  //
-  // The reason why we cannot make any types of op handle to be the last lived
-  // op is:
-  //    some op handle may operate on many DeviceContext, however, our garbage
-  //    collector can only wait one DeviceContext for now. So currently, we wait
-  //    the nearest compute op.
-  std::unordered_set<details::ComputationOpHandle *> computation_op;
-  {
-    for (auto *op : candidates) {
-      auto *compute_op =
-          FindNextComputationOpHandleOrReturnItself(op, scope_idx);
-      if (compute_op == nullptr) {
-        *status = LastLiveOpSearchStatus::kFailure;
-        return {};
-      }
-      computation_op.emplace(compute_op);
-    }
-  }
-
-  // stage three. Try to shrink computation op if any of them does
-  // not need the buffer of var_name.
-  // If all computation ops do not need the buffer of var_name,
-  // return empty computation op set, and mark the status as kShouldPrecede,
-  // which means that the last living ops of var_name should be
-  // found in the previous version of var_name.
-  if (ShrinkNoNeedBufferVarOpDependency(var_name, &computation_op)) {
-    *status = LastLiveOpSearchStatus::kShouldPrecede;
-    return {};
-  }
-
-  PADDLE_ENFORCE(!computation_op.empty(),
-                 "Computation ops should not be empty");
-
-  // stage four. Try to shrink computation op if they depend on each other.
-  // Get the smallest set of the most ops.
-  *status = LastLiveOpSearchStatus::kSuccess;
-  return shrink_func(computation_op);
-}
-
-void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
-  auto &var_infos = Get<MemOptVarInfoMapList>(kMemOptVarInfoMapList);
-  auto &last_live_ops_of_vars =
-      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
-
-  PADDLE_ENFORCE(last_live_ops_of_vars.empty() && var_infos.empty(),
-                 "Last Live Ops and Reference Counts of vars should be "
-                 "initialized at here.");
-
-  const auto &vars = graph->Get<details::GraphVars>(details::kGraphVars);
-
-  last_live_ops_of_vars.resize(vars.size());
-  var_infos.resize(vars.size());
-
-  ShrinkDepsOpFunctor shrink_func(
-      ir::FilterByNodeWrapper<details::OpHandleBase>(*graph));
-
-  details::PinnedVars *pinned_var_set = nullptr;
-  if (graph->Has(details::kPinnedVars)) {
-    pinned_var_set = &graph->Get<details::PinnedVars>(details::kPinnedVars);
-  }
-  auto is_pinned_var = [&pinned_var_set](const VarDesc &var_desc) {
-    return var_desc.Persistable() ||
-           (pinned_var_set && pinned_var_set->count(var_desc.Name()));
-  };
-
-  VLOG(1) << "Place number: " << vars.size();
-  for (size_t i = 0; i < vars.size(); ++i) {
-    for (auto &name_var_pair : vars[i]) {
-      // Whether this variable can be reused or deleted? If not, we do not
-      // compute reference counts and dependencies.
-      VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second);
-      if (var_desc == nullptr || is_pinned_var(*var_desc)) {
-        continue;
-      }
-
-      auto var_type = var_desc->Proto()->type().type();
-      if (var_type != proto::VarType::LOD_TENSOR &&
-          var_type != proto::VarType::SELECTED_ROWS &&
-          var_type != proto::VarType::LOD_TENSOR_ARRAY) {
-        // Var type cannot be deleted
-        continue;
-      }
-
-      auto &var_name = name_var_pair.first;
-      auto &var_handles = name_var_pair.second;
-
-      PADDLE_ENFORCE_EQ(var_desc->Name(), var_name);
-
-      for (auto iter = var_handles.rbegin(); iter != var_handles.rend();
-           ++iter) {
-        if ((*iter)->Node()->IsCtrlVar()) {
-          break;
-        }
-
-        VLOG(10) << "Try to find last living ops of " << var_name << " "
-                 << (iter - var_handles.rbegin()) << " time";
-        LastLiveOpSearchStatus status = LastLiveOpSearchStatus::kFailure;
-        auto result = ExtractComputationOpFromLastLivedVar(
-            *iter, i, var_name, shrink_func, &status);
-
-        // Seldomly, some vars may have no pending or preceding computation ops
-        // Just break;
-        if (status == LastLiveOpSearchStatus::kFailure) {
-          VLOG(1) << "Cannot find last live ops of variable " << var_name
-                  << " in scope " << (*iter)->scope_idx();
-          break;
-        }
-
-        if (status == LastLiveOpSearchStatus::kShouldPrecede) {
-          VLOG(10) << "Try to precede reference count computing at var "
-                   << var_name;
-          continue;
-        }
-
-        PADDLE_ENFORCE_EQ(status, LastLiveOpSearchStatus::kSuccess);
-        PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
-                       var_name);
-
-        VLOG(10) << "Extract " << result.size() << " ops of var " << var_name;
-        var_infos[i][var_name].reset(
-            new MemOptVarInfo(var_name, result.size()));
-        auto &last_live_ops_of_var = last_live_ops_of_vars[i][var_name];
-        last_live_ops_of_var.set_var(*iter);
-        *(last_live_ops_of_var.mutable_ops()) = std::move(result);
-        break;
-      }
-
-      // Seldomly, all preceding trying failed.
-      // Just skip this corner case
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(reference_count_pass, paddle::framework::ir::ReferenceCountPass)
-    .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
-    .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc
deleted file mode 100644
index ed87f73adf137fdf545209f36f996417031fcda4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-#include "paddle/fluid/framework/details/var_handle.h"
-#include "paddle/fluid/framework/var_desc.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-VarDesc *TryGetLatestVarDesc(const std::vector<details::VarHandle *> &vars) {
-  VarDesc *var_desc = nullptr;
-  std::find_if(vars.rbegin(), vars.rend(),
-               [&](details::VarHandle *var_handle) -> bool {
-                 var_desc = var_handle->Node()->Var();
-                 return var_desc != nullptr;
-               });
-  return var_desc;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h
deleted file mode 100644
index 0e8f4e78d22db0aad8a8120b10ee916ade21829d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <atomic>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/var_handle.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-
-namespace paddle {
-namespace framework {
-
-class VarDesc;
-
-namespace ir {
-
-using GarbageCollectorMap =
-    std::map<platform::Place, std::unique_ptr<GarbageCollector>>;
-
-const char kMemOptVarInfoMapList[] = "mem_opt_var_info_map_list";
-const char kGarbageCollector[] = "garbage_collector";
-const char kAllPlaces[] = "all_places";
-const char kUseCuda[] = "use_cuda";
-
-class LastLiveOpOfVarInfo {
- public:
-  details::VarHandle *var() { return var_; }
-
-  void set_var(details::VarHandle *var) { var_ = var; }
-
-  const std::unordered_set<details::ComputationOpHandle *> &ops() const {
-    return ops_;
-  }
-
-  std::unordered_set<details::ComputationOpHandle *> *mutable_ops() {
-    return &ops_;
-  }
-
- private:
-  details::VarHandle *var_{nullptr};
-  std::unordered_set<details::ComputationOpHandle *> ops_;
-};
-
-using LastLiveOpsOfVars = std::unordered_map<std::string, LastLiveOpOfVarInfo>;
-const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
-
-VarDesc *TryGetLatestVarDesc(const std::vector<details::VarHandle *> &vars);
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc
deleted file mode 100644
index da0da4c7125953d386fbd4d14bc2607837616cc3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/operators/controlflow/while_op_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class WhileOpEagerDeletionPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override {
-    auto all_ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*graph);
-
-    // Find all while_op and while_grad_op
-    std::unordered_map<size_t, std::pair<std::vector<OperatorBase *>,
-                                         std::vector<OperatorBase *>>>
-        target_ops;
-    for (auto *op : all_ops) {
-      auto compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
-      if (compute_op == nullptr) continue;
-
-      if (compute_op->Name() == "while") {
-        target_ops[compute_op->GetScopeIdx()].first.emplace_back(
-            compute_op->GetOp());
-      } else if (compute_op->Name() == "while_grad") {
-        target_ops[compute_op->GetScopeIdx()].second.emplace_back(
-            compute_op->GetOp());
-      }
-    }
-
-    for (auto &ops_pair : target_ops) {
-      auto &while_ops = ops_pair.second.first;
-      auto &while_grad_ops = ops_pair.second.second;
-      operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
-          graph->OriginProgram(), while_ops, while_grad_ops);
-    }
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(while_op_eager_deletion_pass,
-              paddle::framework::ir::WhileOpEagerDeletionPass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
deleted file mode 100644
index 2226169e65b03ce3a0d37c026f38f8031828c0ac..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE_NOT_NULL(graph, "graph cannot be nullptr.");
-  FusePassBase::Init("conv_activation_mkldnn_fuse", graph);
-
-  GraphPatternDetector gpd;
-  auto* conv_input = gpd.mutable_pattern()
-                         ->NewNode("conv_activation_mkldnn_fuse/conv_input")
-                         ->AsInput()
-                         ->assert_is_op_input(conv_type(), "Input");
-  patterns::ConvActivation conv_activation_pattern(
-      gpd.mutable_pattern(), "conv_activation_mkldnn_fuse");
-  conv_activation_pattern(conv_input, conv_type(), activation_type());
-
-  int found_conv_activation_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle " + conv_type() + "+" + activation_type() + " fuse";
-    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
-                              conv_activation_pattern);  // Filter
-    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out,
-                              conv_activation_pattern);              // tmp
-    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_activation_pattern);  // CONV op
-    GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out,
-                              conv_activation_pattern);  // Out
-    GET_IR_NODE_FROM_SUBGRAPH(activation, activation,
-                              conv_activation_pattern);  // Activation op
-
-    // Transform Conv node into ConvActivation node.
-    OpDesc* desc = conv->Op();
-    desc->SetOutput("Output",
-                    std::vector<std::string>({activation_out->Name()}));
-
-    desc->SetAttr("fuse_activation", activation_type());
-
-    // MKLDNN ops use alpha and beta as activation parameters but paddle ops are
-    // not generalized
-    if (activation_type() == "relu6") {
-      desc->SetAttr("fuse_alpha",
-                    boost::get<float>(activation->Op()->GetAttr("threshold")));
-    } else {
-      desc->SetAttr("fuse_alpha",
-                    activation->Op()->GetAttrIfExists<float>("alpha"));
-    }
-    desc->SetAttr("fuse_beta",
-                  activation->Op()->GetAttrIfExists<float>("beta"));
-
-    GraphSafeRemoveNodes(graph, {activation, conv_out});
-
-    PADDLE_ENFORCE_GT(subgraph.count(conv_input), 0UL,
-                      "subgraph has to contain conv_input node.");
-    IR_NODE_LINK_TO(conv, activation_out);
-    found_conv_activation_count++;
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_conv_activation_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_activation_mkldnn_fuse_pass,
-              paddle::framework::ir::ConvActivationFusePass);
-
-REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
-              paddle::framework::ir::ConvActivationFusePass);
-
-REGISTER_PASS(conv_leaky_relu_mkldnn_fuse_pass,
-              paddle::framework::ir::Conv2DLeakyReLUFusePass);
-
-REGISTER_PASS(conv_relu6_mkldnn_fuse_pass,
-              paddle::framework::ir::Conv2DReLU6FusePass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
deleted file mode 100644
index 7c6dc238a55af2cf54aee587091fdda2c03cc8aa..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-/*
- * Fuse Conv and Activation base class.
- */
-class ConvActivationFusePass : public FusePassBase {
- public:
-  virtual ~ConvActivationFusePass() {}
-  virtual std::string conv_type() const { return "conv2d"; }
-  virtual std::string activation_type() const { return "relu"; }
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-  const std::string name_scope_{"conv_activation_mkldnn_fuse"};
-};
-/*
- * Fuse Conv and LeakyReLU class
- */
-class Conv2DLeakyReLUFusePass : public ConvActivationFusePass {
- public:
-  std::string activation_type() const { return "leaky_relu"; }
-};
-/*
- * Fuse Conv and BoundedReLU class
- */
-class Conv2DReLU6FusePass : public ConvActivationFusePass {
- public:
-  std::string activation_type() const { return "relu6"; }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
deleted file mode 100644
index ec38788bb4bf59f97c1a7bbbf63d8e389457d7eb..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs, bool is_activation = false,
-           bool use_mkldnn = false) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetAttr("name", name);
-  if (type == "conv2d") {
-    op->SetAttr("use_mkldnn", use_mkldnn);
-    op->SetInput("Input", {inputs[0]});
-    op->SetInput("Filter", {inputs[1]});
-    op->SetInput("Bias", {inputs[2]});
-  } else if (is_activation) {
-    op->SetAttr("use_mkldnn", use_mkldnn);
-    op->SetInput("X", inputs);
-    if (type == "leaky_relu") {
-      op->SetAttr("alpha", 0.02f);
-    } else if (type == "relu6") {
-      op->SetAttr("threshold", 6.0f);
-    }
-  }
-  op->SetOutput("Out", outputs);
-  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-              static_cast<int>(OpRole::kForward));
-}
-
-// a->OP0->b
-// b->OP1->c
-// (c, weights, bias)->conv->f
-// (f)->activation->g
-ProgramDesc BuildProgramDesc(std::string activation) {
-  ProgramDesc prog;
-  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
-                                 "h", "weights2", "bias2", "k", "l", "m"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::SELECTED_ROWS);
-    if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2") {
-      var->SetPersistable(true);
-    }
-  }
-
-  SetOp(&prog, "OP0", "op0", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"b"}));
-  SetOp(&prog, "OP1", "op1", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"c"}));
-  // conv+activation, both with MKL-DNN
-  SetOp(&prog, "conv2d", "conv1",
-        std::vector<std::string>({"c", "weights", "bias"}),
-        std::vector<std::string>({"f"}), false, true);
-  SetOp(&prog, activation, "activation1", std::vector<std::string>({"f"}),
-        std::vector<std::string>({"g"}), true, true);
-  SetOp(&prog, "OP3", "op3", std::vector<std::string>({"g"}),
-        std::vector<std::string>({"h"}));
-  // conv+activation, only one with MKL-DNN
-  SetOp(&prog, "conv2d", "conv2",
-        std::vector<std::string>({"h", "weights2", "bias2"}),
-        std::vector<std::string>({"k"}), false, true);
-  SetOp(&prog, "activation", "activation2", std::vector<std::string>({"k"}),
-        std::vector<std::string>({"l"}), true, false);
-  SetOp(&prog, "OP4", "op4", std::vector<std::string>({"l"}),
-        std::vector<std::string>({"m"}));
-
-  return prog;
-}
-
-void MainTest(std::string activation) {
-  auto prog = BuildProgramDesc(activation);
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  auto pass =
-      PassRegistry::Instance().Get("conv_" + activation + "_mkldnn_fuse_pass");
-
-  int original_nodes_num = graph->Nodes().size();
-
-  graph.reset(pass->Apply(graph.release()));
-
-  int current_nodes_num = graph->Nodes().size();
-
-  // Remove 3 Nodes: CONV, activation, conv_out
-  // Add 1 Node: ConvActivation
-  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
-
-  // Assert conv_activation op in newly generated graph
-  int conv_activation_count = 0;
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()->Type() == "conv2d") {
-      auto* op = node->Op();
-      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
-      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
-      auto op_name = boost::get<std::string>(op->GetAttr("name"));
-      if (op->GetAttrIfExists<std::string>("fuse_activation") == activation) {
-        ++conv_activation_count;
-      }
-      // check if only "conv1" convolution is fused
-      if (op_name == "conv1") {
-        ASSERT_TRUE(op->HasAttr("fuse_activation"));
-      } else if (op_name == "conv2") {
-        ASSERT_FALSE(op->HasAttr("fuse_activation"));
-      }
-    }
-  }
-  EXPECT_EQ(conv_activation_count, 1);
-}
-
-TEST(ConvActivationFusePass, conv_relu_fuse_pass) { MainTest("relu"); }
-TEST(ConvActivationFusePass, conv_leaky_relu_fuse_pass) {
-  MainTest("leaky_relu");
-}
-TEST(ConvActivationFusePass, conv_relu6_fuse_pass) { MainTest("relu6"); }
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(conv_activation_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
deleted file mode 100644
index bbfc8c005580bb949b498e4474c4059cd09f56b3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
-#include <functional>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-template <typename BinaryOperation>
-LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
-                               BinaryOperation f) {
-  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());
-  LoDTensor vec_y;
-  vec_y.Resize(vec_a.dims());
-  const float* a = vec_a.data<float>();
-  const float* b = vec_b.data<float>();
-  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
-  for (int i = 0; i < vec_a.numel(); i++) {
-    y[i] = f(a[i], b[i]);
-  }
-  return vec_y;
-}
-
-void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init(name_scope_, graph);
-
-  auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
-
-  GraphPatternDetector gpd;
-  auto* conv_input =
-      gpd.mutable_pattern()
-          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
-          ->AsInput()
-          ->assert_is_op_input(type(), "Input");
-  patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
-  conv_bias_pattern(conv_input, type());
-  int found_conv_bias_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle ConvBias fuse";
-    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
-                              conv_bias_pattern);                      // Filter
-    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern);  // tmp
-    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern);  // CONV op
-    // bias
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern);
-    // output
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern);
-    // elementwise_add op
-    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern);
-
-    PADDLE_ENFORCE(subgraph.count(conv_input));
-
-    // check if fuse can be done and if MKL-DNN should be used
-    FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
-    if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
-      VLOG(3) << "do not perform " + type() + "+bias fuse";
-      return;
-    }
-
-    auto* eltwise_bias_tensor =
-        scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
-
-    auto input_names = conv->Op()->InputNames();
-    bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") !=
-                    input_names.end();
-    if (has_bias && conv->Op()->Input("Bias").size() > 0) {
-      auto conv_bias_names = conv->Op()->Input("Bias");
-      // add eltwise bias to existing conv bias
-      PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
-      auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
-      auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
-      PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), eltwise_bias_tensor->dims());
-      *conv_bias_tensor = tensor_apply_eltwise(
-          *conv_bias_tensor, *eltwise_bias_tensor, std::plus<float>());
-
-      conv->Op()->SetOutput("Output",
-                            std::vector<std::string>({eltwise_out->Name()}));
-
-      GraphSafeRemoveNodes(graph, {eltwise, conv_out});
-
-      IR_NODE_LINK_TO(conv, eltwise_out);
-    } else {
-      // take eltwise bias as conv bias
-      OpDesc desc;
-
-      desc.SetInput(
-          "Input", std::vector<std::string>({subgraph.at(conv_input)->Name()}));
-      desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
-      desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
-      desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
-      desc.SetType(type());
-
-      for (auto& attr : conv->Op()->GetAttrMap()) {
-        desc.SetAttr(attr.first, attr.second);
-      }
-      auto conv_bias_node = g->CreateOpNode(&desc);
-
-      IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node);
-      IR_NODE_LINK_TO(conv_weight, conv_bias_node);
-      IR_NODE_LINK_TO(eltwise_bias, conv_bias_node);
-      IR_NODE_LINK_TO(conv_bias_node, eltwise_out);
-
-      GraphSafeRemoveNodes(graph, {conv, eltwise, conv_out});
-    }
-
-    found_conv_bias_count++;
-  };
-  gpd(graph, handler);
-  AddStatis(found_conv_bias_count);
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
-              paddle::framework::ir::ConvBiasFusePass);
-REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass,
-              paddle::framework::ir::Conv2DTransposeBiasFusePass);
-REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
-              paddle::framework::ir::Conv3DBiasFusePass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
deleted file mode 100644
index 833fbc748ebd03377ebaa6a5fa72d334ff8b7d37..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
-namespace paddle {
-namespace framework {
-namespace ir {
-/*
-* Fuse the Conv and Elementwise_add to a ConvBiasOp.
-*/
-class ConvBiasFusePass : public FusePassBase {
- public:
-  virtual ~ConvBiasFusePass() {}
-  virtual std::string type() const { return "conv2d"; }
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-  const std::string name_scope_{"conv_bias_mkldnn_fuse"};
-};
-/*
-* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp.
-*/
-class Conv2DTransposeBiasFusePass : public ConvBiasFusePass {
- public:
-  std::string type() const override { return "conv2d_transpose"; }
-};
-
-class Conv3DBiasFusePass : public ConvBiasFusePass {
- public:
-  std::string type() const override { return "conv3d"; }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
deleted file mode 100644
index 427d7bc9aeb15f4adb4a486c511630836bf2bb73..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/platform/place.h"
-
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  if (type == "conv2d") {
-    op->SetAttr("use_mkldnn", true);
-    op->SetAttr("name", name);
-    op->SetInput("Input", {inputs[0]});
-    op->SetInput("Filter", {inputs[1]});
-    if (inputs.size() > 2)
-      op->SetInput("Bias", {inputs[2]});
-    else
-      op->SetInput("Bias", {});
-  } else if (type == "elementwise_add") {
-    op->SetAttr("use_mkldnn", true);
-    op->SetInput("X", {inputs[0]});
-    op->SetInput("Y", {inputs[1]});
-  }
-  op->SetOutput("Out", outputs);
-  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-              static_cast<int>(OpRole::kForward));
-}
-
-// (c, weights)->conv->f
-// (f)->elementwise_add->g
-ProgramDesc BuildProgramDesc(bool convWithExistingBias) {
-  ProgramDesc prog;
-  std::vector<std::string> nodes{"c", "weights", "f", "eltwise_bias", "g"};
-  if (convWithExistingBias) nodes.push_back("conv_bias");
-  for (auto& v : nodes) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::LOD_TENSOR);
-    if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") {
-      var->SetPersistable(true);
-    }
-  }
-
-  // conv+bias, both with MKL-DNN
-  if (convWithExistingBias) {
-    SetOp(&prog, "conv2d", "conv",
-          std::vector<std::string>({"c", "weights", "conv_bias"}),
-          std::vector<std::string>({"f"}));
-  } else {
-    SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"c", "weights"}),
-          std::vector<std::string>({"f"}));
-  }
-  SetOp(&prog, "elementwise_add", "eltwise",
-        std::vector<std::string>({"f", "eltwise_bias"}),
-        std::vector<std::string>({"g"}));
-
-  return prog;
-}
-
-void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
-                      const char* var_name) {
-  auto x = scope->Var(var_name);
-  auto tensor = x->GetMutable<LoDTensor>();
-  tensor->mutable_data(place, proto::VarType::FP32, 1);
-}
-
-void MainTest(bool convWithExistingBias) {
-  auto prog = BuildProgramDesc(convWithExistingBias);
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  auto place = paddle::platform::CPUPlace();
-  NaiveExecutor exe{place};
-  Scope scope;
-  // Init scope, as it is used in pass
-  exe.CreateVariables(prog, 0, true, &scope);
-  if (convWithExistingBias) {
-    InitTensorHolder(&scope, place, "conv_bias");
-    InitTensorHolder(&scope, place, "eltwise_bias");
-  }
-  graph->SetNotOwned(kParamScopeAttr, &scope);
-
-  auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");
-
-  int original_nodes_num = graph->Nodes().size();
-
-  graph.reset(pass->Apply(graph.release()));
-
-  int current_nodes_num = graph->Nodes().size();
-
-  // Remove 3 Nodes: Conv, Bias, conv_out
-  // Add 1 Node: ConvBias
-  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
-
-  // Assert conv_bias op in newly generated graph
-  int conv_bias_count = 0;
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()->Type() == "conv2d") {
-      auto* op = node->Op();
-      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
-      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
-      // check if "conv" convolution is fused
-      auto op_name = boost::get<std::string>(op->GetAttr("name"));
-      if (op_name == "conv") {
-        auto input_names = op->InputNames();
-        ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") !=
-                    input_names.end());
-        auto bias = boost::get<std::vector<std::string>>(op->Input("Bias"));
-        if (bias.size()) {
-          ++conv_bias_count;
-        }
-      }
-    }
-  }
-  EXPECT_EQ(conv_bias_count, 1);
-}
-
-TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); }
-
-TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); }
-
-TEST(ConvBiasFusePass, conv3d) {
-  Conv3DBiasFusePass pass;
-  ASSERT_EQ(pass.type(), std::string("conv3d"));
-}
-
-TEST(ConvBiasFusePass, conv2d_transpose) {
-  Conv2DTransposeBiasFusePass pass;
-  ASSERT_EQ(pass.type(), std::string("conv2d_transpose"));
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(conv_bias_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
deleted file mode 100644
index 9e8f0f0c46cee250e4e425cc636467d89171fa84..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void ConvConcatReLUFusePass::FindConcatWithConvs(
-    ir::Graph* graph,
-    std::unordered_map<const Node*, int>* concat_with_convs_counter) const {
-  GraphPatternDetector gpd;
-  patterns::ConcatReLU concat_relu_pattern{gpd.mutable_pattern(),
-                                           "concat_relu"};
-  concat_relu_pattern();
-
-  int found_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "Find Concats with Convs";
-    GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, concat_relu_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(relu_op, relu_op, concat_relu_pattern);
-
-    auto concat_inputs = concat_op->inputs;
-
-    for (auto node : concat_inputs) {
-      auto prev_op_node = node->inputs;
-      PADDLE_ENFORCE_EQ(prev_op_node.size(), 1);
-      auto* conv_op = prev_op_node[0];
-      if (conv_op->Op()->Type() != "conv2d") return;
-
-      FuseOptions fuse_option = FindFuseOption(*conv_op, *relu_op);
-      if (fuse_option == DO_NOT_FUSE) {
-        return;
-      }
-    }
-
-    (*concat_with_convs_counter)[concat_op] = concat_inputs.size();
-    found_count++;
-  };
-  gpd(graph, handler);
-  AddStatis(found_count);
-}
-
-void ConvConcatReLUFusePass::FuseConvConcatReLU(
-    ir::Graph* graph,
-    std::unordered_map<const Node*, int>* concat_with_convs_counter) const {
-  GraphPatternDetector gpd;
-  auto pattern = gpd.mutable_pattern();
-  patterns::ConvConcatReLU conv_concat_relu(pattern, name_scope_);
-  conv_concat_relu();
-
-  int found_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle ConvConcatReLU fuse";
-
-    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_concat_relu);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_concat_relu);
-    GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, conv_concat_relu);
-    GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, conv_concat_relu);
-    GET_IR_NODE_FROM_SUBGRAPH(relu_op, relu_op, conv_concat_relu);
-    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_concat_relu);
-
-    if (!concat_with_convs_counter->count(concat_op)) {
-      VLOG(4) << "this concat has input from non-conv2d operator";
-      return;
-    }
-
-    // Transform Conv node into ConvReLU node.
-    OpDesc* conv_desc = conv_op->Op();
-    conv_desc->SetAttr("fuse_activation", std::string("relu"));
-
-    // Remove ReLU when all Convs were transformed.
-    auto number_of_unfused_convs_left =
-        --(*concat_with_convs_counter)[concat_op];
-    if (number_of_unfused_convs_left == 0) {
-      OpDesc* concat_desc = concat_op->Op();
-      concat_desc->SetOutput("Out",
-                             std::vector<std::string>({relu_out->Name()}));
-      GraphSafeRemoveNodes(graph, {relu_op, concat_out});
-      IR_NODE_LINK_TO(concat_op, relu_out);
-    }
-
-    found_count++;
-  };
-  gpd(graph, handler);
-  AddStatis(found_count);
-}
-
-void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init(name_scope_, graph);
-
-  std::unordered_map<const Node*, int> concat_with_convs_counter;
-  FindConcatWithConvs(graph, &concat_with_convs_counter);
-  FuseConvConcatReLU(graph, &concat_with_convs_counter);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_concat_relu_mkldnn_fuse_pass,
-              paddle::framework::ir::ConvConcatReLUFusePass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h
deleted file mode 100644
index 91ff0760f0483c41cb5be5507426290c90142b13..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Fuse the (multi conv) -> Concat -> ReLU -> next_op
- * to a:
- * (multi ConvReLU) -> Concat -> next_op.
- */
-class ConvConcatReLUFusePass : public FusePassBase {
- public:
-  virtual ~ConvConcatReLUFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  void FindConcatWithConvs(
-      Graph* graph,
-      std::unordered_map<const Node*, int>* concat_with_convs_counter) const;
-
-  void FuseConvConcatReLU(
-      Graph* graph,
-      std::unordered_map<const Node*, int>* concat_with_convs_counter) const;
-
-  const std::string name_scope_{"conv_concat_relu_mkldnn_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc
deleted file mode 100644
index ee00a39596a4cc76606284127f51108c71056e95..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs, bool use_mkldnn = true) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  if (type == "conv2d") {
-    op->SetAttr("use_mkldnn", use_mkldnn);
-    op->SetAttr("fuse_activation", std::string(""));
-    op->SetInput("Input", {inputs[0]});
-    op->SetInput("Filter", {inputs[1]});
-    if (inputs.size() > 2) {
-      op->SetInput("Bias", {inputs[2]});
-    }
-    op->SetOutput("Output", outputs);
-  } else if (type == "relu") {
-    op->SetAttr("use_mkldnn", use_mkldnn);
-    op->SetInput("X", inputs);
-    op->SetOutput("Out", outputs);
-  } else if (type == "pool2d") {
-    op->SetAttr("use_mkldnn", use_mkldnn);
-    op->SetInput("X", inputs);
-    op->SetOutput("Out", outputs);
-  } else if (type == "concat") {
-    op->SetAttr("use_mkldnn", use_mkldnn);
-    op->SetInput("X", inputs);
-    op->SetOutput("Out", outputs);
-  }
-  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-              static_cast<int>(OpRole::kForward));
-}
-
-// (a1,w1)->conv1->c1
-// (a2,w2,b2)->conv2->c2
-// if put_only_convs_before_concat=true
-//   (a3,w3)->conv3->c3
-// else
-//   a3->pool1->c3
-//
-// (c1,c2,c3)->concat1->d
-// d->relu1->e
-ProgramDesc BuildProgramDesc(bool put_only_convs_before_concat,
-                             bool all_convs_use_mkldnn) {
-  ProgramDesc prog;
-  for (auto& v :
-       std::initializer_list<std::string>({"a1", "w1", "c1", "a2", "w2", "b2",
-                                           "c2", "a3", "w3", "c3", "d", "e"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::SELECTED_ROWS);
-    if (v.find("w") == 0 || v.find("b") == 0) {
-      var->SetPersistable(true);
-    }
-  }
-
-  SetOp(&prog, "conv2d", {"a1", "w1", "b1"}, {"c1"}, all_convs_use_mkldnn);
-  SetOp(&prog, "conv2d", {"a2", "w2", "b2"}, {"c2"});
-  if (put_only_convs_before_concat) {
-    SetOp(&prog, "conv2d", {"a3", "w3", "b3"}, {"c3"});
-  } else {
-    SetOp(&prog, "pool2d", {"a3"}, {"c3"});
-  }
-  SetOp(&prog, "concat", {"c1", "c2", "c3"}, {"d"});
-  SetOp(&prog, "relu", {"d"}, {"e"});
-
-  return prog;
-}
-
-void MainTest(const ProgramDesc& prog, bool fuse_relu) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  int original_nodes_num = graph->Nodes().size();
-
-  auto pass = PassRegistry::Instance().Get("conv_concat_relu_mkldnn_fuse_pass");
-  graph.reset(pass->Apply(graph.release()));
-
-  int current_nodes_num = graph->Nodes().size();
-
-  if (fuse_relu) {
-    // Remove 2 nodes: concat_out, relu
-    EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
-  } else {
-    EXPECT_EQ(original_nodes_num, current_nodes_num);
-  }
-
-  int relu_count = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->Type() == "conv2d") {
-        ASSERT_TRUE(op->HasAttr("fuse_activation"));
-        bool fuse_relu_attr =
-            (boost::get<std::string>(op->GetAttr("fuse_activation")) == "relu");
-        EXPECT_EQ(fuse_relu, fuse_relu_attr);
-      } else if (op->Type() == "relu") {
-        relu_count++;
-      }
-    }
-  }
-  EXPECT_EQ(relu_count, fuse_relu ? 0 : 1);
-}
-
-TEST(ConvConcatReLUFusePass, only_convs_before_concat) {
-  bool all_convs_use_mkldnn = true;
-  bool put_only_convs_before_concat = true;
-  auto prog =
-      BuildProgramDesc(put_only_convs_before_concat, all_convs_use_mkldnn);
-
-  bool expect_relu_fuse = true;
-  MainTest(prog, expect_relu_fuse);
-}
-
-TEST(ConvConcatReLUFusePass, only_convs_before_concat_but_one_non_mkldnn) {
-  bool all_convs_use_mkldnn = false;
-  bool put_only_convs_before_concat = true;
-  auto prog =
-      BuildProgramDesc(put_only_convs_before_concat, all_convs_use_mkldnn);
-
-  bool expect_relu_fuse = false;
-  MainTest(prog, expect_relu_fuse);
-}
-
-TEST(ConvConcatReLUFusePass, convs_and_pool_before_concat) {
-  bool all_convs_use_mkldnn = true;
-  bool put_only_convs_before_concat = false;
-  auto prog =
-      BuildProgramDesc(put_only_convs_before_concat, all_convs_use_mkldnn);
-
-  bool expect_relu_fuse = false;
-  MainTest(prog, expect_relu_fuse);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(conv_concat_relu_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
deleted file mode 100644
index 1263ddd147e86a47b8e5952f6a8cdfd40d1ee305..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ /dev/null
@@ -1,343 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
-#include <functional>
-#include <list>
-#include <map>
-#include <memory>
-#include <tuple>
-#include "paddle/fluid/framework/ir/graph_traits.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
-  auto find_node = [](ir::Graph* graph, const Node* node) -> Node* {
-    for (auto n : graph->Nodes()) {
-      if (n == node) {
-        return n;
-      }
-    }
-
-    return nullptr;
-  };
-
-  if (from == to) {
-    return true;
-  }
-
-  std::map<Node*, bool> visited;
-
-  for (auto& node : GraphTraits::DFS(*graph)) {
-    visited[&node] = false;
-  }
-
-  visited[from] = true;
-
-  std::list<Node*> queue;
-  queue.push_back(from);
-
-  while (!queue.empty()) {
-    auto cur = find_node(graph, queue.front());
-    queue.pop_front();
-
-    if (!cur) return false;
-
-    for (auto n : cur->outputs) {
-      if (n == to) {
-        return true;
-      }
-
-      if (!visited[n]) {
-        visited[n] = true;
-        queue.push_back(n);
-      }
-    }
-  }
-  return false;
-}
-
-template <typename T>
-boost::optional<T> HasAttribute(const Node& op, const std::string& attr) {
-  if (op.Op()->HasAttr(attr))
-    return boost::get<T>(op.Op()->GetAttr(attr));
-  else
-    return boost::none;
-}
-
-ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle(
-    const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
-    const ResidualConnectionMKLDNNFusePass::IdentityConvFunc&
-        get_node_from_conv_op,
-    const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc&
-        get_node_from_elementwise_add_op)
-    : fusion_stats{std::make_shared<int>(0)},
-      can_fuse_func{can_fuse_func},
-      get_node_from_conv_op{get_node_from_conv_op},
-      get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {}
-
-void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()(
-    const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-  Node* conv_op;
-  Node* conv_input;
-  Node* conv_filter;
-  Node* conv_output;
-
-  Node* elementwise_add_op;
-  Node* elementwise_add_identity;
-  Node* elementwise_add_out;
-
-  std::tie(conv_op, conv_input, conv_filter, conv_output) =
-      get_node_from_conv_op(subgraph);
-  std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) =
-      get_node_from_elementwise_add_op(subgraph);
-
-  if (!can_fuse_func(conv_op, elementwise_add_op)) return;
-
-  if (!IsReachable(graph, elementwise_add_identity, conv_output)) return;
-
-  if (HasFusedActivation(conv_op)) return;
-
-  conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()});
-  conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
-  conv_op->Op()->SetAttr("fuse_residual_connection", true);
-
-  GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op});
-
-  IR_NODE_LINK_TO(elementwise_add_identity, conv_op);
-  IR_NODE_LINK_TO(conv_op, elementwise_add_out);
-
-  (*fusion_stats)++;
-}
-
-ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle(
-    const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
-    const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
-        get_node_from_conv_x_op,
-    const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
-        get_node_from_conv_y_op,
-    const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc&
-        get_node_from_elementwise_add_op)
-    : fusion_stats{std::make_shared<int>(0)},
-      can_fuse_func{can_fuse_func},
-      get_node_from_conv_x_op{get_node_from_conv_x_op},
-      get_node_from_conv_y_op{get_node_from_conv_y_op},
-      get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {}
-
-void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()(
-    const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-  Node* conv_x_op;
-  Node* conv_x_input;
-  Node* conv_x_filter;
-  Node* conv_x_output;
-
-  Node* conv_y_op;
-  Node* conv_y_input;
-  Node* conv_y_filter;
-  Node* conv_y_output;
-
-  Node* elementwise_add_op;
-  Node* elementwise_add_out;
-
-  std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) =
-      get_node_from_conv_x_op(subgraph);
-  std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) =
-      get_node_from_conv_y_op(subgraph);
-  std::tie(elementwise_add_op, elementwise_add_out) =
-      get_node_from_elementwise_add_op(subgraph);
-
-  if (!can_fuse_func(conv_x_op, elementwise_add_op)) return;
-  if (!can_fuse_func(conv_y_op, elementwise_add_op)) return;
-
-  Node* projection_node;
-  Node* residual_conv_op;
-  Node* residual_conv_output;
-
-  if (IsReachable(graph, conv_x_input, conv_y_output)) {
-    projection_node = conv_x_output;
-    residual_conv_op = conv_y_op;
-    residual_conv_output = conv_y_output;
-  } else if (IsReachable(graph, conv_y_input, conv_x_output)) {
-    projection_node = conv_y_output;
-    residual_conv_op = conv_x_op;
-    residual_conv_output = conv_x_output;
-  } else {
-    return;
-  }
-
-  if (HasFusedActivation(residual_conv_op)) return;
-
-  residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()});
-  residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
-
-  residual_conv_op->Op()->SetAttr("fuse_residual_connection", true);
-
-  GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op});
-
-  IR_NODE_LINK_TO(projection_node, residual_conv_op);
-  IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out);
-
-  (*fusion_stats)++;
-}
-
-std::tuple<Node*, Node*, Node*, Node*>
-ResidualConnectionMKLDNNFusePass::GetNodesFromConv(
-    const patterns::Conv& conv_pattern,
-    const GraphPatternDetector::subgraph_t& subgraph) const {
-  GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
-  GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
-  GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
-  GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
-
-  return std::make_tuple(conv_op, conv_input, conv_filter, conv_output);
-}
-
-GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
-    const std::string& name_scope,
-    const GraphWithStats& graph_with_stats) const {
-  ir::Graph* graph;
-  int stats;
-
-  std::tie(graph, stats) = graph_with_stats;
-
-  GraphPatternDetector gpd;
-  auto pattern = gpd.mutable_pattern();
-
-  patterns::Conv conv_pattern{pattern, name_scope};
-  auto conv_output = conv_pattern();
-
-  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
-  elementwise_add_pattern(
-      conv_output,
-      pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
-  conv_output->AsIntermediate();
-
-  auto get_node_from_elementwise_add = [&elementwise_add_pattern](
-      const GraphPatternDetector::subgraph_t& subgraph)
-      -> std::tuple<Node*, Node*, Node*> {
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                                  elementwise_add_pattern);
-
-        return std::make_tuple(elementwise_add_op, elementwise_add_y,
-                               elementwise_add_out);
-      };
-
-  return ExecuteHandleOnGraph<IdentityFuseHandle>(
-      &gpd, graph_with_stats,
-      [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
-        return GetNodesFromConv(conv_pattern, subgraph);
-      },
-      get_node_from_elementwise_add);
-}
-
-GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
-    const std::string& name_scope,
-    const GraphWithStats& graph_with_stats) const {
-  GraphPatternDetector gpd;
-  auto pattern = gpd.mutable_pattern();
-
-  patterns::Conv conv_pattern{pattern, name_scope};
-  auto conv_output = conv_pattern();
-
-  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
-  elementwise_add_pattern(
-      pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()),
-      conv_output);
-  conv_output->AsIntermediate();
-
-  auto get_node_from_elementwise_add = [&elementwise_add_pattern](
-      const GraphPatternDetector::subgraph_t& subgraph)
-      -> std::tuple<Node*, Node*, Node*> {
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                                  elementwise_add_pattern);
-
-        return std::make_tuple(elementwise_add_op, elementwise_add_x,
-                               elementwise_add_out);
-      };
-
-  return ExecuteHandleOnGraph<IdentityFuseHandle>(
-      &gpd, graph_with_stats,
-      [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
-        return GetNodesFromConv(conv_pattern, subgraph);
-      },
-      get_node_from_elementwise_add);
-}
-
-GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
-    const std::string& name_scope,
-    const GraphWithStats& graph_with_stats) const {
-  GraphPatternDetector gpd;
-  auto pattern = gpd.mutable_pattern();
-
-  patterns::Conv conv_x_pattern{pattern, name_scope};
-  auto conv_x_output = conv_x_pattern();
-
-  patterns::Conv conv_y_pattern{pattern, name_scope};
-  auto conv_y_output = conv_y_pattern();
-
-  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
-  elementwise_add_pattern(conv_x_output, conv_y_output);
-  conv_x_output->AsIntermediate();
-  conv_y_output->AsIntermediate();
-
-  auto get_node_from_elementwise_add = [&elementwise_add_pattern](
-      const GraphPatternDetector::subgraph_t& subgraph)
-      -> std::tuple<Node*, Node*> {
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                                  elementwise_add_pattern);
-
-        return std::make_tuple(elementwise_add_op, elementwise_add_out);
-      };
-
-  return ExecuteHandleOnGraph<ProjectionFuseHandle>(
-      &gpd, graph_with_stats,
-      [this,
-       &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
-        return GetNodesFromConv(conv_x_pattern, subgraph);
-      },
-      [this,
-       &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
-        return GetNodesFromConv(conv_y_pattern, subgraph);
-      },
-      get_node_from_elementwise_add);
-}
-
-void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
-  FusePassBase::Init(name_scope_, graph);
-  auto fused_graph_with_stats = FuseConvAsY(
-      name_scope_,
-      FuseConvAsX(name_scope_,
-                  FuseProjectionConv(name_scope_, std::make_pair(graph, 0))));
-
-  std::cout << "Fused graph " << fused_graph_with_stats.second << std::endl;
-  AddStatis(fused_graph_with_stats.second);
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
-              paddle::framework::ir::ResidualConnectionMKLDNNFusePass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
deleted file mode 100644
index b95aec34d30745d99f6066e36f19c883927e2b53..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <tuple>
-#include <utility>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-#include <boost/optional.hpp>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-using graph_ptr = ir::Graph*;
-using GraphWithStats = std::pair<ir::Graph*, int>;
-
-void CorrectGraphEdges(Graph* graph, Node* from, Node* to);
-bool IsReachable(ir::Graph* graph, Node* from, Node* to);
-boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name);
-
-class ResidualConnectionMKLDNNFusePass : public FusePassBase {
- private:
-  GraphWithStats FuseConvAsX(const std::string& name_scope,
-                             const GraphWithStats& graph_with_stats) const;
-  GraphWithStats FuseConvAsY(const std::string& name_scope,
-                             const GraphWithStats& graph_with_stats) const;
-  GraphWithStats FuseProjectionConv(
-      const std::string& name_scope,
-      const GraphWithStats& graph_with_stats) const;
-
-  template <typename RetType>
-  using GetNodeFunc =
-      std::function<RetType(const GraphPatternDetector::subgraph_t& subgraph)>;
-  using IdentityConvFunc = GetNodeFunc<std::tuple<Node*, Node*, Node*, Node*>>;
-  using IdentityElementwiseAddFunc =
-      GetNodeFunc<std::tuple<Node*, Node*, Node*>>;
-
-  using ProjectionConvFunc = IdentityConvFunc;
-  using ProjectionElementwiseAddFunc = GetNodeFunc<std::tuple<Node*, Node*>>;
-
-  using CanFuseFunc = std::function<bool(Node*, Node*)>;
-
-  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromConv(
-      const patterns::Conv& conv_pattern,
-      const GraphPatternDetector::subgraph_t& subgraph) const;
-
-  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromProjectionConv(
-      const patterns::Conv& conv_pattern,
-      const GraphPatternDetector::subgraph_t& subgraph) const;
-
-  template <typename HandleType, typename... OpFuncs>
-  GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd,
-                                      const GraphWithStats& graph_with_stats,
-                                      OpFuncs&&... op_funcs) const {
-    ir::Graph* graph;
-    int stats;
-
-    std::tie(graph, stats) = graph_with_stats;
-
-    auto can_fuse = [this](Node* op1, Node* op2) -> bool {
-      return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN;
-    };
-
-    auto fuse_handle = HandleType{can_fuse, std::forward<OpFuncs>(op_funcs)...};
-
-    (*gpd)(graph, fuse_handle);
-
-    return std::make_pair(graph, stats + fuse_handle.get_stats());
-  }
-
-  struct IdentityFuseHandle {
-    IdentityFuseHandle(
-        const CanFuseFunc& can_fuse_func,
-        const IdentityConvFunc& get_node_from_conv_op,
-        const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op);
-
-    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
-                    Graph* graph);
-    int get_stats() const { return *fusion_stats; }
-
-   private:
-    std::shared_ptr<int> fusion_stats;
-    CanFuseFunc can_fuse_func;
-    IdentityConvFunc get_node_from_conv_op;
-    IdentityElementwiseAddFunc get_node_from_elementwise_add_op;
-  };
-
-  struct ProjectionFuseHandle {
-    ProjectionFuseHandle(
-        const CanFuseFunc& can_fuse_func,
-        const ProjectionConvFunc& get_node_from_conv_x_op,
-        const ProjectionConvFunc& get_node_from_conv_y_op,
-        const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op);
-
-    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
-                    Graph* graph);
-    int get_stats() const { return *fusion_stats; }
-
-   private:
-    std::shared_ptr<int> fusion_stats;
-    CanFuseFunc can_fuse_func;
-    ProjectionConvFunc get_node_from_conv_x_op;
-    ProjectionConvFunc get_node_from_conv_y_op;
-    ProjectionElementwiseAddFunc get_node_from_elementwise_add_op;
-  };
-
- public:
-  virtual ~ResidualConnectionMKLDNNFusePass() {}
-
- protected:
-  void ApplyImpl(graph_ptr graph) const;
-  static bool HasFusedActivation(Node* conv_node) {
-    return !(conv_node->Op()
-                 ->GetAttrIfExists<std::string>("fuse_activation")
-                 .empty());
-  }
-
-  const std::string name_scope_{"residual_connection_fuse_pass"};
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
deleted file mode 100644
index 8a13596cd50087475bf12b6cfa5920b82e24de31..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ /dev/null
@@ -1,274 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include <string>
-
-#include "paddle/fluid/framework/ir/graph_traits.h"
-#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-namespace {
-constexpr int nodes_removed = 3;
-constexpr int nodes_added = 1;
-
-void SetOp(ProgramDesc* prog, const std::string& type,
-           const std::vector<std::pair<std::string, std::string>>& inputs,
-           const std::pair<std::string, std::string>& output) {
-  auto op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetAttr("use_mkldnn", true);
-
-  for (const auto& input : inputs) {
-    op->SetInput(input.first, {input.second});
-  }
-
-  op->SetOutput(output.first, {output.second});
-}
-
-struct TestIsReachable {
-  using func = std::function<bool(const std::string&, const std::string&)>;
-
-  auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
-    auto hash = [](const Node* node) -> std::string {
-      return node->Name() + std::to_string(node->id());
-    };
-
-    auto find_node = [&](const std::unique_ptr<ir::Graph>& graph,
-                         const std::string& name) -> Node* {
-      for (auto& node : GraphTraits::DFS(*graph)) {
-        if (name == hash(&node)) {
-          return &node;
-        }
-      }
-
-      return nullptr;
-    };
-
-    // update the from and to strings to hashed equivs in loop from graph traits
-    return [&](std::string from, std::string to) -> bool {
-      if (from == to) return true;
-
-      std::map<std::string, bool> visited;
-
-      for (auto& node : GraphTraits::DFS(*graph)) {
-        auto hashed = hash(&node);
-        if (node.Name() == from) from = hashed;
-        if (node.Name() == to) to = hashed;
-        visited[hashed] = false;
-      }
-
-      visited[from] = true;
-
-      std::list<std::string> queue;
-      queue.push_back(from);
-
-      while (!queue.empty()) {
-        auto cur = find_node(graph, queue.front());
-        queue.pop_front();
-        if (cur == nullptr) return false;
-
-        for (auto n : cur->outputs) {
-          auto hashed_name = hash(n);
-          if (hashed_name == to) return true;
-
-          if (!visited[hashed_name]) {
-            visited[hashed_name] = true;
-            queue.push_back(hashed_name);
-          }
-        }
-      }
-      return false;
-    };
-  }
-};
-
-void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph,
-                    int expected_conv_count,
-                    int expected_elementwise_add_count = 0) {
-  int conv_count = 0;
-  int elementwise_add_count = 0;
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()->Type() == "conv2d") {
-      ++conv_count;
-    }
-    if (node->IsOp() && node->Op()->Type() == "elementwise_add") {
-      ++elementwise_add_count;
-    }
-  }
-  EXPECT_EQ(conv_count, expected_conv_count);
-  EXPECT_EQ(elementwise_add_count, expected_elementwise_add_count);
-}
-
-ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
-                             const std::vector<std::string>& persistent_vars) {
-  ProgramDesc prog;
-
-  auto add_var_to_prog = [&prog](const std::string& var_name) -> VarDesc* {
-    auto var = prog.MutableBlock(0)->Var(var_name);
-    var->SetType(proto::VarType::LOD_TENSOR);
-
-    return var;
-  };
-
-  for (const auto& v : transient_vars) {
-    add_var_to_prog(v);
-  }
-
-  for (const auto& v : persistent_vars) {
-    auto var = add_var_to_prog(v);
-    var->SetPersistable(true);
-  }
-
-  return prog;
-}
-
-void RunPassAndAssert(ProgramDesc* prog, const std::string& from,
-                      const std::string& to, int expected_conv_num) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(*prog));
-
-  TestIsReachable is_reachable;
-  EXPECT_TRUE(is_reachable(graph)(from, to));
-
-  auto pass =
-      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
-  int original_nodes_num = graph->Nodes().size();
-  graph.reset(pass->Apply(graph.release()));
-  int current_nodes_num = graph->Nodes().size();
-
-  EXPECT_TRUE(is_reachable(graph)(from, to));
-
-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
-            current_nodes_num);
-
-  AssertOpsCount(graph, expected_conv_num);
-}
-}  // namespace
-
-TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
-
-  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-  SetOp(&prog, "conv2d",
-        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
-        {"Output", "c"});
-
-  SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
-  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-
-  RunPassAndAssert(&prog, "a", "relu", 1);
-}
-
-TEST(ConvElementwiseAddMKLDNNFusePass,
-     ConvolutionProjectionAsYWithElementwiseAddRelu) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e", "f"},
-                               {"bias", "weights", "bias2", "weights2"});
-
-  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-  // right branch
-  SetOp(&prog, "conv2d",
-        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
-        {"Output", "c"});
-
-  // left branch
-  SetOp(&prog, "conv2d",
-        {{"Input", "a"}, {"Bias", "bias2"}, {"Filter", "weights2"}},
-        {"Output", "f"});
-
-  SetOp(&prog, "elementwise_add", {{"X", "f"}, {"Y", "c"}}, {"Out", "d"});
-  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-
-  RunPassAndAssert(&prog, "a", "relu", 2);
-}
-
-TEST(ConvElementwiseAddMKLDNNFusePass,
-     ConvolutionAsYWithElementwiseAddReluNoBias) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
-
-  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
-        {"Output", "c"});
-  SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
-  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-
-  RunPassAndAssert(&prog, "a", "relu", 1);
-}
-
-TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
-
-  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-  SetOp(&prog, "conv2d",
-        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
-        {"Output", "c"});
-
-  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
-  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-
-  RunPassAndAssert(&prog, "a", "relu", 1);
-}
-
-TEST(ConvElementwiseAddMKLDNNFusePass,
-     ConvolutionAsXWithElementwiseAddReluNoBias) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
-
-  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
-        {"Output", "c"});
-  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
-  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-
-  RunPassAndAssert(&prog, "a", "relu", 1);
-}
-
-TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
-  auto prog =
-      BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"});
-
-  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
-        {"Output", "c"});
-
-  SetOp(&prog, "conv2d", {{"Input", "d"}, {"Filter", "weights"}},
-        {"Output", "e"});
-
-  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, {"Out", "f"});
-  SetOp(&prog, "relu", {{"X", "f"}}, {"Out", "g"});
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  TestIsReachable is_reachable;
-  EXPECT_TRUE(is_reachable(graph)("a", "g"));
-
-  auto pass =
-      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
-  int original_nodes_num = graph->Nodes().size();
-  graph.reset(pass->Apply(graph.release()));
-  int current_nodes_num = graph->Nodes().size();
-
-  EXPECT_TRUE(is_reachable(graph)("a", "g"));
-  EXPECT_EQ(original_nodes_num, current_nodes_num);
-
-  AssertOpsCount(graph, 2, 1);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(conv_elementwise_add_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
deleted file mode 100644
index 9cf55ee3254f4f1eacd717dd0c8d4497b7c559de..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ /dev/null
@@ -1,373 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
-#include <limits>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-namespace {
-
-void UnlinkNodes(ir::Node* a, ir::Node* b) {
-  a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b),
-                   a->outputs.end());
-  b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a),
-                  b->inputs.end());
-}
-
-}  // namespace
-
-enum { U8_MAX = 255, S8_MAX = 127 };
-
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
-using string::PrettyLogDetail;
-
-void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
-                                    std::string input_name, double scale_to_one,
-                                    bool is_unsigned,
-                                    std::string scale_attr_name) const {
-  unsigned max = is_unsigned ? U8_MAX : S8_MAX;
-  float scale = scale_to_one * max;
-
-  // Create quantize output variable
-  VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
-  auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc);
-
-  // create a quantize op node
-  OpDesc q_desc;
-  q_desc.SetType("quantize");
-  q_desc.SetInput("Input", std::vector<std::string>({input->Name()}));
-  q_desc.SetOutput("Output",
-                   std::vector<std::string>({quantize_out_node->Name()}));
-  q_desc.SetAttr("Scale", scale);
-  q_desc.SetAttr("is_negative_input", !is_unsigned);
-  auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
-
-  // update op's input
-  op->Op()->SetInput(input_name,
-                     std::vector<std::string>({quantize_out_node->Name()}));
-
-  // link quantize op
-  UnlinkNodes(input, op);
-  IR_NODE_LINK_TO(input, quantize_op);
-  IR_NODE_LINK_TO(quantize_op, quantize_out_node);
-  IR_NODE_LINK_TO(quantize_out_node, op);
-
-  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
-}
-
-void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
-                                     VarQuantScale* scales, bool are_unsigned,
-                                     std::string scale_attr_name) const {
-  auto inputs = op->inputs;
-  auto output = op->outputs[0];
-  PADDLE_ENFORCE_GE(inputs.size(), 1);
-  PADDLE_ENFORCE_EQ(op->outputs.size(), 1);
-
-  // create a quantize op desc prototype
-  OpDesc q_desc;
-  q_desc.SetType("quantize");
-
-  std::vector<Node*> quantize_out_nodes(inputs.size());
-  std::vector<std::string> quantize_out_node_names(inputs.size());
-
-  double scale_out = (*scales)[output->Name()].second.data<double>()[0];
-  unsigned max = are_unsigned ? U8_MAX : S8_MAX;
-  float scale = scale_out * max;
-
-  for (size_t i = 0; i < inputs.size(); i++) {
-    // Create quantize output variable
-    VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
-    quantize_out_nodes[i] = g->CreateVarNode(&quantize_out_desc);
-    quantize_out_node_names[i] = quantize_out_nodes[i]->Name();
-
-    q_desc.SetAttr("Scale", scale);
-    q_desc.SetInput("Input", std::vector<std::string>({inputs[i]->Name()}));
-    q_desc.SetOutput("Output",
-                     std::vector<std::string>({quantize_out_node_names[i]}));
-    q_desc.SetAttr("is_negative_input", !are_unsigned);
-    auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
-
-    // link quantize op
-    UnlinkNodes(inputs[i], op);
-    IR_NODE_LINK_TO(inputs[i], quantize_op);
-    IR_NODE_LINK_TO(quantize_op, quantize_out_nodes[i]);
-    IR_NODE_LINK_TO(quantize_out_nodes[i], op);
-  }
-
-  // update op's input
-  op->Op()->SetInput(input_name, quantize_out_node_names);
-
-  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
-}
-
-void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
-                                       std::string output_name,
-                                       double scale_to_one, bool is_unsigned,
-                                       std::string scale_attr_name) const {
-  unsigned max = is_unsigned ? U8_MAX : S8_MAX;
-  float scale = scale_to_one * max;
-
-  // Create dequantize input variable
-  VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
-  auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc);
-
-  // create a dequantize op node for output.
-  OpDesc deq_desc;
-  deq_desc.SetType("dequantize");
-  deq_desc.SetInput("Input",
-                    std::vector<std::string>({dequantize_in_node->Name()}));
-  deq_desc.SetOutput("Output", std::vector<std::string>({output->Name()}));
-  deq_desc.SetAttr("Scale", scale);
-  auto dequantize_op = g->CreateOpNode(&deq_desc);  // OpDesc will be copied.
-
-  // update op's output
-  op->Op()->SetOutput(output_name,
-                      std::vector<std::string>({dequantize_in_node->Name()}));
-
-  // link dequantize op
-  UnlinkNodes(op, output);
-  IR_NODE_LINK_TO(op, dequantize_in_node);
-  IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
-  IR_NODE_LINK_TO(dequantize_op, output);
-
-  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
-}
-
-void CPUQuantizePass::QuantizeConv(Graph* graph,
-                                   bool with_residual_data) const {
-  GraphPatternDetector gpd;
-  auto pattern = gpd.mutable_pattern();
-  patterns::ConvResidual conv_pattern{pattern, name_scope_};
-  conv_pattern(with_residual_data);
-
-  int quantize_conv_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "Quantize conv2d op";
-    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
-    auto* conv_op_desc = conv_op->Op();
-
-    // skip if should not be quantized
-    if (!conv_op_desc->HasAttr("use_quantizer") ||
-        !boost::get<bool>(conv_op_desc->GetAttr("use_quantizer")))
-      return;
-
-    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
-
-    // get scales calculated after warmup, they scale variables to MAX=1.0
-    auto scales = Get<VarQuantScale>("quant_var_scales");
-
-    auto input_scale = scales[conv_input->Name()].second.data<double>()[0];
-    bool is_input_unsigned = scales[conv_input->Name()].first;
-    QuantizeInput(g, conv_op, conv_input, "Input", input_scale,
-                  is_input_unsigned, "Scale_in");
-
-    auto filter_scale_tensor = scales[conv_filter->Name()].second;
-    EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data<double>(),
-                                     filter_scale_tensor.numel(), 1};
-    eigen_tensor *= static_cast<double>(S8_MAX);
-    std::vector<float> filter_scale{
-        filter_scale_tensor.data<double>(),
-        filter_scale_tensor.data<double>() + filter_scale_tensor.numel()};
-
-    conv_op->Op()->SetAttr("Scale_weights", filter_scale);
-
-    if (with_residual_data) {
-      GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
-                                conv_pattern);
-      auto residual_scale =
-          scales[conv_residual_data->Name()].second.data<double>()[0];
-      bool is_residual_unsigned = scales[conv_residual_data->Name()].first;
-
-      QuantizeInput(g, conv_op, conv_residual_data, "ResidualData",
-                    residual_scale, is_residual_unsigned, "Scale_in_eltwise");
-    }
-
-    auto output_scale = scales[conv_output->Name()].second.data<double>()[0];
-    bool is_output_unsigned = scales[conv_output->Name()].first;
-    DequantizeOutput(g, conv_op, conv_output, "Output", output_scale,
-                     is_output_unsigned, "Scale_out");
-
-    // change threshold in bounded ReLu
-    if (conv_op->Op()->GetAttrIfExists<std::string>("fuse_activation") ==
-        "relu6") {
-      float scale_out = boost::get<float>(conv_op->Op()->GetAttr("Scale_out"));
-      float threshold = boost::get<float>(conv_op->Op()->GetAttr("fuse_alpha"));
-      conv_op->Op()->SetAttr("fuse_alpha", scale_out * threshold);
-    }
-
-    ++quantize_conv_count;
-  };
-
-  gpd(graph, handler);
-  AddStatis(quantize_conv_count);
-
-  std::stringstream msg_ss;
-  msg_ss << "---    quantized " << quantize_conv_count << " conv2d ops";
-  if (with_residual_data) msg_ss << " with residual connection";
-  PrettyLogDetail(msg_ss.str().c_str());
-}
-
-void CPUQuantizePass::QuantizePool(Graph* graph) const {
-  GraphPatternDetector gpd;
-  auto pattern = gpd.mutable_pattern();
-  patterns::Pool pool_pattern{pattern, name_scope_};
-  pool_pattern();
-
-  int quantize_pool_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "Quantize pool2d op";
-    GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
-    auto* pool_op_desc = pool_op->Op();
-
-    // skip if should not be quantized
-    if (!pool_op_desc->HasAttr("use_quantizer") ||
-        !boost::get<bool>(pool_op_desc->GetAttr("use_quantizer")))
-      return;
-
-    GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
-
-    // get scales calculated after warmup, they scale variables to MAX=1.0
-    auto scales = Get<VarQuantScale>("quant_var_scales");
-
-    auto input_scale = scales[pool_input->Name()].second.data<double>()[0];
-    bool is_input_unsigned = scales[pool_input->Name()].first;
-    QuantizeInput(g, pool_op, pool_input, "X", input_scale, is_input_unsigned);
-
-    auto output_scale = scales[pool_output->Name()].second.data<double>()[0];
-    bool is_output_unsigned = scales[pool_output->Name()].first;
-    DequantizeOutput(g, pool_op, pool_output, "Out", output_scale,
-                     is_output_unsigned);
-
-    ++quantize_pool_count;
-  };
-
-  gpd(graph, handler);
-  AddStatis(quantize_pool_count);
-
-  PrettyLogDetail("---    quantized %d pool2d ops", quantize_pool_count);
-}
-
-void CPUQuantizePass::QuantizeConcat(Graph* graph) const {
-  GraphPatternDetector gpd;
-  auto pattern = gpd.mutable_pattern();
-  patterns::Concat concat_pattern{pattern, name_scope_};
-  concat_pattern();
-
-  int quantize_concat_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "Quantize concat op";
-    GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, concat_pattern);
-    auto* concat_op_desc = concat_op->Op();
-
-    // skip if should not be quantized
-    if (!concat_op_desc->HasAttr("use_quantizer") ||
-        !boost::get<bool>(concat_op_desc->GetAttr("use_quantizer")))
-      return;
-
-    GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern);
-
-    // get scales calculated after warmup, they scale variables to MAX=1.0
-    auto scales = Get<VarQuantScale>("quant_var_scales");
-
-    // if all inputs were unsigned, then the output was set to unsigned
-    // during the scale calculation step
-    bool are_all_inputs_unsigned = scales[concat_out->Name()].first;
-    QuantizeInputs(g, concat_op, "X", &scales, are_all_inputs_unsigned);
-
-    auto output_scale = scales[concat_out->Name()].second.data<double>()[0];
-
-    DequantizeOutput(g, concat_op, concat_out, "Out", output_scale,
-                     are_all_inputs_unsigned);
-
-    ++quantize_concat_count;
-  };
-
-  gpd(graph, handler);
-  AddStatis(quantize_concat_count);
-
-  PrettyLogDetail("---    quantized %d concat ops", quantize_concat_count);
-}
-
-void CPUQuantizePass::QuantizePriorBox(Graph* graph) const {
-  GraphPatternDetector gpd;
-  auto pattern = gpd.mutable_pattern();
-  patterns::PriorBox prior_box_pattern{pattern, name_scope_};
-  prior_box_pattern();
-
-  int quantize_prior_box_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "Quantize prior_box op";
-    GET_IR_NODE_FROM_SUBGRAPH(prior_box_op, prior_box_op, prior_box_pattern);
-    auto* prior_box_op_desc = prior_box_op->Op();
-
-    // skip if should not be quantized
-    if (!prior_box_op_desc->HasAttr("use_quantizer") ||
-        !boost::get<bool>(prior_box_op_desc->GetAttr("use_quantizer")))
-      return;
-
-    GET_IR_NODE_FROM_SUBGRAPH(prior_box_input, prior_box_input,
-                              prior_box_pattern);
-
-    // get scales calculated after warmup, they scale variables to MAX=1.0
-    auto scales = Get<VarQuantScale>("quant_var_scales");
-
-    auto input_scale = scales[prior_box_input->Name()].second.data<double>()[0];
-    bool is_input_unsigned = scales[prior_box_input->Name()].first;
-    QuantizeInput(g, prior_box_op, prior_box_input, "Input", input_scale,
-                  is_input_unsigned);
-
-    ++quantize_prior_box_count;
-  };
-
-  gpd(graph, handler);
-  AddStatis(quantize_prior_box_count);
-
-  PrettyLogDetail("---    quantized %d prior_box ops",
-                  quantize_prior_box_count);
-}
-
-void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
-  VLOG(3) << "Quantizing the graph.";
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init(name_scope_, graph);
-
-  PADDLE_ENFORCE(param_scope());
-
-  QuantizeConv(graph, false /* with_residual_data */);
-  QuantizeConv(graph, true /* with_residual_data */);
-  QuantizePool(graph);
-  QuantizeConcat(graph);
-  QuantizePriorBox(graph);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass)
-    .RequirePassAttr("quant_var_scales");
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
deleted file mode 100644
index ec4db66240c6c7de8b6e6c66d056a6386b4907e3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Map variable name to tensor of scaling factors scaling it to MAX=1.0.
- * bool denotes whether quantization of the variable should be done to unsigned
- * type.
- */
-using VarQuantScale =
-    std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
-
-/*
- * Quantize all supported operators.
- */
-class CPUQuantizePass : public FusePassBase {
- public:
-  virtual ~CPUQuantizePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
-
-  void QuantizePool(Graph* graph) const;
-
-  void QuantizeConcat(Graph* graph) const;
-
-  void QuantizePriorBox(Graph* graph) const;
-
-  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
-                     double scale_to_one, bool is_unsigned,
-                     std::string scale_attr_name = "") const;
-
-  // quantize all inputs of given name with the same (minimum) scale
-  void QuantizeInputs(Graph* g, Node* op, std::string input_name,
-                      VarQuantScale* scales, bool are_unsigned,
-                      std::string scale_attr_name = "") const;
-
-  void DequantizeOutput(Graph* g, Node* op, Node* output,
-                        std::string output_name, double scale_to_one,
-                        bool is_unsigned,
-                        std::string scale_attr_name = "") const;
-
-  const std::string name_scope_{"quantize"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
deleted file mode 100644
index 0a68944186773f84f734d81cf29dc5214d16e173..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ /dev/null
@@ -1,310 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs, bool use_mkldnn,
-           bool use_quantizer = false) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetAttr("use_mkldnn", use_mkldnn);
-  op->SetAttr("name", name);
-  if (type == "conv2d") {
-    op->SetInput("Input", {inputs[0]});
-    op->SetInput("Filter", {inputs[1]});
-    if (inputs.size() > 2)
-      op->SetInput("Bias", {inputs[2]});
-    else
-      op->SetInput("Bias", {});
-    if (inputs.size() > 3) {
-      op->SetInput("ResidualData", {inputs[3]});
-      op->SetAttr("fuse_residual_connection", true);
-    } else {
-      op->SetInput("ResidualData", {});
-      op->SetAttr("fuse_residual_connection", false);
-    }
-    op->SetOutput("Output", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
-    op->SetAttr("Scale_in", 1.0f);
-    op->SetAttr("Scale_out", 1.0f);
-    op->SetAttr("Scale_weights", std::vector<float>{1.0f});
-  } else if (type == "pool2d") {
-    op->SetInput("X", {inputs[0]});
-    op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
-  } else if (type == "dropout") {
-    op->SetInput("X", {inputs[0]});
-    op->SetOutput("Out", {outputs[0]});
-  } else if (type == "fc") {
-    op->SetInput("Input", {inputs[0]});
-    if (inputs.size() > 1) op->SetInput("W", {inputs[1]});
-    if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
-    op->SetOutput("Out", {outputs[0]});
-  } else if (type == "concat") {
-    op->SetInput("X", inputs);
-    op->SetOutput("Out", outputs);
-    op->SetAttr("use_quantizer", use_quantizer);
-  }
-}
-
-namespace {
-static const std::initializer_list<std::string> variable_names{
-    "a", "w1", "c",  "d", "w2", "e",  "f", "g",
-    "h", "w3", "b1", "i", "j",  "w4", "b2"};
-// (a,w1)->Conv1->c and c->Pool1->d
-//
-// (d,w2)->Conv2->e and e->Pool2->f
-//
-// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j
-//
-// (d,w4, b2)->Conv4->i
-ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    if (v.find("w") == 0 || v.find("b") == 0) {
-      var->SetPersistable(true);
-    }
-  }
-
-  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn,
-        use_quantizer);
-  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer);
-
-  SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn,
-        use_quantizer);
-  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);
-
-  SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
-  SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn);
-  SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
-        use_quantizer);
-
-  SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn,
-        use_quantizer);
-
-  return prog;
-}
-
-void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
-                      const char* var_name) {
-  auto x = scope->Var(var_name);
-  auto tensor = x->GetMutable<LoDTensor>();
-  tensor->mutable_data(place, proto::VarType::FP32, 1);
-}
-
-void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
-              int quant_count, int dequant_count, int added_nodes_count,
-              float scale) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  // Init scope, as it is used in pass
-  auto place = paddle::platform::CPUPlace();
-  NaiveExecutor exe{place};
-  Scope scope;
-  exe.CreateVariables(prog, 0, true, &scope);
-
-  auto* scales = new VarQuantScale();
-
-  for (auto& v : variable_names) {
-    InitTensorHolder(&scope, place, v.c_str());
-    LoDTensor tensor;
-    tensor.Resize({1});
-    auto* ptr = tensor.mutable_data<double>(place);
-    ptr[0] = 2.0;
-
-    (*scales)[v] = std::make_pair(false, std::move(tensor));
-  }
-
-  graph->SetNotOwned(kParamScopeAttr, &scope);
-
-  auto pass = PassRegistry::Instance().Get("cpu_quantize_pass");
-  pass->Set("quant_var_scales", scales);
-
-  int original_nodes_num = graph->Nodes().size();
-
-  graph.reset(pass->Apply(graph.release()));
-
-  int current_nodes_num = graph->Nodes().size();
-
-  int quantize_nodes_count = 0;
-  int dequantize_nodes_count = 0;
-  int conv2d_nodes_count = 0;
-  int pool2d_nodes_count = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->Type() == "conv2d") {
-        conv2d_nodes_count++;
-        auto op_name = boost::get<std::string>(op->GetAttr("name"));
-        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_in")), scale)
-            << "Scale_in for node '" + op_name + "'.";
-        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
-            << "Scale_out for node '" + op_name + "'.";
-        EXPECT_EQ(
-            boost::get<std::vector<float>>(op->GetAttr("Scale_weights"))[0],
-            scale)
-            << "Scale_weights for node '" + op_name + "'.";
-      } else if (op->Type() == "pool2d") {
-        pool2d_nodes_count++;
-      } else if (op->Type() == "quantize") {
-        quantize_nodes_count++;
-      } else if (op->Type() == "dequantize") {
-        dequantize_nodes_count++;
-      }
-    }
-  }
-  EXPECT_EQ(conv2d_nodes_count, conv_count);
-  EXPECT_EQ(pool2d_nodes_count, pool_count);
-  EXPECT_EQ(quantize_nodes_count, quant_count);
-  EXPECT_EQ(dequantize_nodes_count, dequant_count);
-  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
-}
-
-TEST(CpuQuantizePass, quantize) {
-  bool use_mkldnn = true;
-  bool use_quantizer = true;
-  // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and
-  // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d
-  //
-  // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and
-  // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f
-  //
-  // d->Dropout1->g and g->Fc1->h and
-  // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j
-  //
-  // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i
-  // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT
-  int added_nodes = 7 + 7 + 6 + 6;
-  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes,
-           2.0f * 127);
-}
-
-TEST(CpuQuantizePass, do_not_quantize) {
-  bool use_mkldnn = true;
-  bool use_quantizer = false;
-  int added_nodes = 0;
-  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes,
-           1.0f);
-}
-
-}  // namespace
-
-namespace {
-static const std::initializer_list<std::string> variable_names_concat = {
-    "a1", "b1", "a2", "b2", "c", "d"};
-
-// a1->Pool1->b1
-// a2->Pool2->b2
-// (b1,b2)->Concat->c
-// c->Pool3->d
-ProgramDesc BuildProgramDescConcat() {
-  ProgramDesc prog;
-
-  SetOp(&prog, "pool2d", "Pool1", {"a1"}, {"b1"}, true, false);
-  SetOp(&prog, "pool2d", "Pool2", {"a2"}, {"b2"}, true, false);
-  SetOp(&prog, "concat", "Concat", {"b1", "b2"}, {"c"}, true, true);
-  SetOp(&prog, "pool2d", "Pool3", {"c"}, {"d"}, true, false);
-
-  return prog;
-}
-
-void MainTestConcat(const ProgramDesc& prog, int pool_count, int concat_count,
-                    int quant_count, int dequant_count, int added_nodes_count) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  // Init scope, as it is used in pass
-  auto place = paddle::platform::CPUPlace();
-  NaiveExecutor exe{place};
-  Scope scope;
-  exe.CreateVariables(prog, 0, true, &scope);
-
-  auto* scales = new VarQuantScale();
-
-  for (auto& v : variable_names_concat) {
-    InitTensorHolder(&scope, place, v.c_str());
-    LoDTensor tensor;
-    tensor.Resize({1});
-    auto* ptr = tensor.mutable_data<double>(place);
-    ptr[0] = 2.0;
-
-    (*scales)[v] = std::make_pair(false, std::move(tensor));
-  }
-
-  graph->SetNotOwned(kParamScopeAttr, &scope);
-
-  auto pass = PassRegistry::Instance().Get("cpu_quantize_pass");
-  pass->Set("quant_var_scales", scales);
-
-  int original_nodes_num = graph->Nodes().size();
-
-  graph.reset(pass->Apply(graph.release()));
-
-  int current_nodes_num = graph->Nodes().size();
-
-  int quantize_nodes_count = 0;
-  int dequantize_nodes_count = 0;
-  int concat_nodes_count = 0;
-  int pool2d_nodes_count = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->Type() == "concat") {
-        concat_nodes_count++;
-      } else if (op->Type() == "pool2d") {
-        pool2d_nodes_count++;
-      } else if (op->Type() == "quantize") {
-        quantize_nodes_count++;
-      } else if (op->Type() == "dequantize") {
-        dequantize_nodes_count++;
-      }
-    }
-  }
-  EXPECT_EQ(concat_nodes_count, concat_count);
-  EXPECT_EQ(pool2d_nodes_count, pool_count);
-  EXPECT_EQ(quantize_nodes_count, quant_count);
-  EXPECT_EQ(dequantize_nodes_count, dequant_count);
-  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
-}
-
-TEST(CpuQuantizePass, concat) {
-  // a1->Pool1->b1
-  // a2->Pool2->b2
-  // (b1->QUANT1->IN1, b2->QUANT2->IN2)->Concat->c
-  // c->OUT1->DEQUANT1->Pool3->d
-  int pool_count = 3;
-  int concat_count = 1;
-  int quant_count = 2;
-  int dequant_count = 1;
-  int added_nodes_count = 6;
-  MainTestConcat(BuildProgramDescConcat(), pool_count, concat_count,
-                 quant_count, dequant_count, added_nodes_count);
-}
-
-}  // namespace
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(cpu_quantize_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
deleted file mode 100644
index 79a8ac68b82fc79ec91c18ec96a04e1e676c8ba0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
-#include <string>
-#include <unordered_set>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
-  VLOG(3) << "Marks operators which are to be quantized.";
-  const auto& excluded_ids_list =
-      Get<std::unordered_set<int>>("quantize_excluded_op_ids");
-  const auto& op_types_list =
-      Get<std::unordered_set<std::string>>("quantize_enabled_op_types");
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
-      if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(),
-                    n->id()) != excluded_ids_list.end())
-        continue;
-      auto* op = n->Op();
-      if (op->HasAttr("use_quantizer") || op->HasProtoAttr("use_quantizer")) {
-        if (op_types_list.empty()) {
-          op->SetAttr("use_quantizer", true);
-        } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                             n->Name()) != op_types_list.end()) {
-          op->SetAttr("use_quantizer", true);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(cpu_quantize_placement_pass,
-              paddle::framework::ir::CPUQuantizePlacementPass)
-    // a vector of operator type names to be quantized ("conv2d" etc.)
-    .RequirePassAttr("quantize_enabled_op_types")
-    // a vector of operator ids that are to be excluded from quantization
-    .RequirePassAttr("quantize_excluded_op_ids");
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
deleted file mode 100644
index 008a462dc414c04f53315a8f262de15ab8fb7fb5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-/*
- * Specifies which operators should be quantized.
- */
-class CPUQuantizePlacementPass : public Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
deleted file mode 100644
index ba4d281f818bb752570e7b500013f5f58001307c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
-
-#include <gtest/gtest.h>
-#include <boost/logic/tribool.hpp>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs,
-           boost::tribool use_quantizer) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-
-  op->SetType(type);
-
-  if (!boost::indeterminate(use_quantizer))
-    op->SetAttr("use_quantizer", use_quantizer);
-
-  if (type == "conv2d") {
-    op->SetAttr("name", name);
-    op->SetInput("Input", {inputs[0]});
-    op->SetInput("Filter", {inputs[1]});
-    op->SetInput("Bias", {inputs[2]});
-  } else if (type == "relu") {
-    op->SetInput("X", inputs);
-  } else if (type == "concat") {
-    op->SetAttr("axis", 1);
-    op->SetInput("X", {inputs[0], inputs[1]});
-  } else if (type == "pool2d") {
-    op->SetInput("X", {inputs[0]});
-  } else {
-    FAIL() << "Unexpected operator type.";
-  }
-  op->SetOutput("Out", {outputs[0]});
-}
-
-// operator                      use_quantizer
-// ---------------------------------------
-// (a,b)->concat->c              none
-// (c,weights,bias)->conv->f     false
-// f->relu->g                    none
-// g->pool->h                    false
-// (h,weights2,bias2)->conv->k   false
-// k->pool->l                    false
-ProgramDesc BuildProgramDesc() {
-  ProgramDesc prog;
-
-  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
-                                 "h", "weights2", "bias2", "k", "l"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::SELECTED_ROWS);
-    if (v == "weights" || v == "bias") {
-      var->SetPersistable(true);
-    }
-  }
-
-  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate);
-  SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false);
-  SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate);
-  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false);
-  SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false);
-  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false);
-
-  return prog;
-}
-
-void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,
-              std::initializer_list<int> quantize_excluded_op_ids,
-              unsigned expected_use_quantizer_true_count) {
-  auto prog = BuildProgramDesc();
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass");
-  pass->Set("quantize_enabled_op_types",
-            new std::unordered_set<std::string>(quantize_enabled_op_types));
-  pass->Set("quantize_excluded_op_ids",
-            new std::unordered_set<int>(quantize_excluded_op_ids));
-
-  graph.reset(pass->Apply(graph.release()));
-
-  unsigned use_quantizer_true_count = 0;
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->HasAttr("use_quantizer") &&
-          boost::get<bool>(op->GetAttr("use_quantizer"))) {
-        ++use_quantizer_true_count;
-      }
-    }
-  }
-
-  EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count);
-}
-
-TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); }
-
-TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
-  MainTest({"conv2d"}, {4}, 1);
-}
-
-TEST(QuantizerPlacementPass, excluded_none) {
-  // 2 conv + 2 pool
-  MainTest({}, {}, 4);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(cpu_quantize_placement_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
deleted file mode 100644
index 1e23539c80fb9b1e89a7f5ffc38adddfabbab67d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file eint8_outcept in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either eint8_outpress or
-// implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-using string::PrettyLogDetail;
-
-void CPUQuantizeSquashPass::FindNodesToKeep(
-    Graph* graph,
-    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
-  GraphPatternDetector gpd;
-  patterns::DequantAny deq_any_pattern{gpd.mutable_pattern(), "deqant_any"};
-  deq_any_pattern();
-
-  int found_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern);
-
-    if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end())
-      (*nodes_keep_counter)[dequant_out] = 1;
-    else
-      (*nodes_keep_counter)[dequant_out] += 1;
-
-    found_count++;
-  };
-  gpd(graph, handler);
-  AddStatis(found_count);
-}
-
-void CPUQuantizeSquashPass::DequantQuantSquash(
-    Graph* graph,
-    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
-  GraphPatternDetector gpd;
-  patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"};
-  squash_pattern();
-
-  int found_dequant_quant_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "squash requantize-quantize ops pair";
-
-    GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern);
-
-    auto* next_op_desc = next_op->Op();
-    float dequant_scale = boost::get<float>(dequant_op->Op()->GetAttr("Scale"));
-    float quant_scale = boost::get<float>(quant_op->Op()->GetAttr("Scale"));
-    PADDLE_ENFORCE(nodes_keep_counter->find(dequant_out) !=
-                   nodes_keep_counter->end());
-
-    // check if dequantize op should be kept or removed, decrease the counter
-    bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;
-
-    if (dequant_scale == quant_scale) {
-      // squash dequantize-quantize to nothing
-      auto quant_out_var_name = quant_out->Name();
-      auto next_op_inputs = next_op_desc->InputNames();
-      for (const auto& name : next_op_inputs) {
-        auto input_names = next_op_desc->Input(name);
-        std::replace(input_names.begin(), input_names.end(), quant_out_var_name,
-                     dequant_in->Name());
-        next_op_desc->SetInput(name, input_names);
-      }
-
-      if (keep_dequant)
-        GraphSafeRemoveNodes(graph, {quant_op, quant_out});
-      else
-        GraphSafeRemoveNodes(graph,
-                             {dequant_op, quant_op, dequant_out, quant_out});
-
-      IR_NODE_LINK_TO(dequant_in, next_op);
-
-      found_dequant_quant_count++;
-    } else {
-      // squash dequantize-quantize to requantize op
-      OpDesc desc;
-      desc.SetType("requantize");
-      desc.SetInput("Input", std::vector<std::string>({dequant_in->Name()}));
-      desc.SetOutput("Output", std::vector<std::string>({quant_out->Name()}));
-      desc.SetAttr("Scale_in", dequant_scale);
-      desc.SetAttr("Scale_out", quant_scale);
-
-      auto requant_op = g->CreateOpNode(&desc);
-
-      if (keep_dequant)
-        GraphSafeRemoveNodes(graph, {quant_op});
-      else
-        GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out});
-
-      IR_NODE_LINK_TO(dequant_in, requant_op);
-      IR_NODE_LINK_TO(requant_op, quant_out);
-
-      found_dequant_quant_count++;
-    }
-  };
-  gpd(graph, handler);
-  AddStatis(found_dequant_quant_count);
-  PrettyLogDetail("---    squashed %d dequantize-quantize pairs",
-                  found_dequant_quant_count);
-}
-
-void CPUQuantizeSquashPass::ConvRequantSquash(Graph* graph) const {
-  GraphPatternDetector gpd;
-  patterns::ConvRequant conv_requant_pattern{gpd.mutable_pattern(),
-                                             "conv_requant"};
-  conv_requant_pattern();
-
-  int found_requant_squash_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "squash conv-requantize ops pair";
-
-    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_requant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_requant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(requant_op, requant_op, conv_requant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(requant_out, requant_out, conv_requant_pattern);
-
-    // if conv2d has one output squash
-    if (conv_out->outputs.size() == 1) {
-      float requant_scale_out =
-          boost::get<float>(requant_op->Op()->GetAttr("Scale_out"));
-      conv_op->Op()->SetAttr("Scale_out", requant_scale_out);
-      conv_op->Op()->SetOutput("Output",
-                               std::vector<std::string>({requant_out->Name()}));
-      IR_NODE_LINK_TO(conv_op, requant_out);
-      GraphSafeRemoveNodes(graph, {conv_out, requant_op});
-
-      found_requant_squash_count++;
-    }
-  };
-  gpd(graph, handler);
-  AddStatis(found_requant_squash_count);
-  PrettyLogDetail("---    squashed %d requantize with convs",
-                  found_requant_squash_count);
-}
-
-void CPUQuantizeSquashPass::ConvDequantSquash(Graph* graph) const {
-  GraphPatternDetector gpd;
-  patterns::ConvDequant conv_dequant_pattern{gpd.mutable_pattern(),
-                                             "conv_dequant"};
-  conv_dequant_pattern();
-
-  int found_conv_dequant_squash_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "squash conv-dequant ops pair";
-
-    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_dequant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_dequant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, conv_dequant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, conv_dequant_pattern);
-
-    // if conv2d has one output
-    // and there is no fuse residual connection
-    // because residual fusion does not support force output with fp32
-    if (conv_out->outputs.size() == 1 &&
-        !(conv_op->Op()->GetAttrIfExists<bool>("fuse_residual_connection"))) {
-      conv_op->Op()->SetAttr("force_fp32_output", true);
-      conv_op->Op()->SetOutput("Output",
-                               std::vector<std::string>({dequant_out->Name()}));
-      IR_NODE_LINK_TO(conv_op, dequant_out);
-      GraphSafeRemoveNodes(graph, {conv_out, dequant_op});
-      found_conv_dequant_squash_count++;
-    }
-  };
-  gpd(graph, handler);
-  AddStatis(found_conv_dequant_squash_count);
-  PrettyLogDetail("---    squashed %d dequant with convs",
-                  found_conv_dequant_squash_count);
-}
-
-void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init("cpu_quantize_squash_pass", graph);
-
-  std::unordered_map<const Node*, int> nodes_keep_counter;
-  FindNodesToKeep(graph, &nodes_keep_counter);
-  DequantQuantSquash(graph, &nodes_keep_counter);
-  ConvRequantSquash(graph);
-  ConvDequantSquash(graph);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(cpu_quantize_squash_pass,
-              paddle::framework::ir::CPUQuantizeSquashPass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
deleted file mode 100644
index 7e9e92e3dacd7dc71ed4902133c7da00eb595faf..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Squash dequantize->quantize pair pattern into requantize op
- */
-class CPUQuantizeSquashPass : public FusePassBase {
- public:
-  virtual ~CPUQuantizeSquashPass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  /*
-   * For each dequantize's output find the number of operators it is an input to
-   */
-  void FindNodesToKeep(
-      Graph* graph,
-      std::unordered_map<const Node*, int>* nodes_keep_counter) const;
-
-  /*
-   * Squash dequantize-quantize ops pairs into requantize or nothing
-   */
-  void DequantQuantSquash(
-      Graph* graph,
-      std::unordered_map<const Node*, int>* nodes_keep_counter) const;
-
-  /*
-   * Squash requantize op into conv with scale_out like requantize scale_out
-   */
-  void ConvRequantSquash(Graph* graph) const;
-
-  /*
-  *  Squash conv2d with dequant when dequant is the only op after conv2d
-  */
-  void ConvDequantSquash(Graph* graph) const;
-
-  const std::string name_scope_{"squash"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
deleted file mode 100644
index 08b605a713b92e296069030a5c7c439433098b06..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
+++ /dev/null
@@ -1,408 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs, bool use_mkldnn,
-           float scale = 0) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetAttr("use_mkldnn", use_mkldnn);
-  op->SetAttr("name", name);
-  if (type == "conv2d") {
-    op->SetAttr("Scale_out", scale);
-    op->SetInput("Input", {inputs[0]});
-    if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]});
-    if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
-    op->SetOutput("Output", {outputs[0]});
-  } else if (type == "quantize") {
-    op->SetInput("Input", {inputs[0]});
-    op->SetOutput("Output", {outputs[0]});
-    op->SetAttr("Scale", scale);
-  } else if (type == "dequantize") {
-    op->SetInput("Input", {inputs[0]});
-    op->SetOutput("Output", {outputs[0]});
-    op->SetAttr("Scale", scale);
-  } else if (type == "requantize") {
-    op->SetInput("Input", {inputs[0]});
-    op->SetOutput("Output", {outputs[0]});
-    op->SetAttr("Scale_out", scale);
-  } else if (type == "concat") {
-    op->SetInput("X", inputs);
-    op->SetOutput("Out", outputs);
-  }
-}
-
-// (a,w1,b1)->Conv1->d
-// d->Dequant(scale1)->e
-// e->Quant(scale2)->f
-// (f,w2,b2)->Conv2->i
-ProgramDesc BuildConvRequantProgramDesc(bool use_mkldnn, float scale_out,
-                                        float scale1, float scale2) {
-  ProgramDesc prog;
-  for (auto& v : std::initializer_list<std::string>(
-           {"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    if (v.find("w") == 0 || v.find("b") == 0) {
-      var->SetPersistable(true);
-    }
-  }
-
-  SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn,
-        scale_out);
-  SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1);
-  SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2);
-  SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn,
-        scale_out);
-  return prog;
-}
-
-static const std::initializer_list<std::string> variable_names{
-    "a", "b", "c", "d", "e", "f", "g", "h"};
-
-// a->Conv1->b
-// b->Dequant(scale1)->c
-// c->Quant1(scale2)->d and d->Conv2->e
-// c->Conv3->f
-// c->Quant2(scale3)->g and g->Conv4->h
-ProgramDesc BuildConvMultiOutputProgramDesc(bool use_mkldnn, float scale_out,
-                                            float scale1, float scale2,
-                                            float scale3) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
-  SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1);
-
-  SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2);
-  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, scale_out);
-
-  SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn, scale_out);
-
-  SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3);
-  SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn, scale_out);
-
-  return prog;
-}
-
-//  a->Conv1->b->Requant(scale1)->c
-//  d->Conv2->e->Requant(scale2)->f
-//  {c,f}->Concat
-ProgramDesc BuildConvsRequantConcatProgramDesc(bool use_mkldnn, float scale_out,
-                                               float scale1, float scale2) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
-  SetOp(&prog, "requantize", "Requant1", {"b"}, {"c"}, use_mkldnn, scale1);
-
-  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, scale_out);
-  SetOp(&prog, "requantize", "Requant2", {"e"}, {"f"}, use_mkldnn, scale2);
-
-  SetOp(&prog, "concat", "Concat", {"c"}, {"f"}, use_mkldnn);
-
-  return prog;
-}
-
-// a->Concat->b
-// b->Dequant(scale1)->c
-// c->Quant(scale2)->d
-// d->Conv->e
-ProgramDesc BuildConcatDequantQuantProgramDesc(bool use_mkldnn, float scale_out,
-                                               float scale1, float scale2) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-
-  SetOp(&prog, "concat", "Concat", {"a"}, {"b"}, use_mkldnn);
-  SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1);
-  SetOp(&prog, "quantize", "Quant", {"c"}, {"d"}, use_mkldnn, scale2);
-  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, scale_out);
-  return prog;
-}
-
-// a->Conv1->b
-// b->Requant1(Scale1)->c
-// b->Requant2(Scale2)->d
-ProgramDesc BuildConvMultiRequantProgramDesc(bool use_mkldnn, float scale_out,
-                                             float scale1, float scale2) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
-  SetOp(&prog, "requantize", "Requant1", {"b"}, {"c"}, use_mkldnn, scale1);
-  SetOp(&prog, "requantize", "Requant2", {"b"}, {"d"}, use_mkldnn, scale2);
-  return prog;
-}
-
-// a->Conv1->b
-// b->Dequant1(Scale1)->c
-// c->Concat
-ProgramDesc BuildConvDequantConcatProgramDesc(bool use_mkldnn, float scale_out,
-                                              float scale) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
-  SetOp(&prog, "dequantize", "Dequant1", {"b"}, {"c"}, use_mkldnn, scale);
-  SetOp(&prog, "concat", "Concat1", {"c"}, {"d"}, use_mkldnn);
-  return prog;
-}
-
-// a->Conv1->b
-// b->Dequant1(Scale1)->c
-// b->Conv2->d
-ProgramDesc BuildConvDequantConvProgramDesc(bool use_mkldnn, float scale_out,
-                                            float scale) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
-  SetOp(&prog, "dequantize", "Dequant1", {"b"}, {"c"}, use_mkldnn, scale);
-  SetOp(&prog, "conv2d", "Conv2", {"b"}, {"d"}, use_mkldnn);
-  return prog;
-}
-
-void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
-                      const char* var_name) {
-  auto x = scope->Var(var_name);
-  auto tensor = x->GetMutable<LoDTensor>();
-  tensor->mutable_data(place, proto::VarType::FP32, 1);
-}
-
-void PrepareGraph(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog) {
-  auto place = paddle::platform::CPUPlace();
-  NaiveExecutor exe{place};
-  Scope scope;
-  exe.CreateVariables(prog, 0, true, &scope);
-
-  for (auto& v : variable_names) {
-    InitTensorHolder(&scope, place, v.c_str());
-  }
-  (*graph)->SetNotOwned(kParamScopeAttr, &scope);
-}
-
-void RegisterPass(std::unique_ptr<ir::Graph>* graph) {
-  auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass");
-  graph->reset(pass->Apply(graph->release()));
-}
-
-// check number of nodes
-void CountNodeTest(const ProgramDesc& prog, int removed_nodes_num) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  PrepareGraph(&graph, prog);
-
-  int original_nodes_num = graph->Nodes().size();
-  RegisterPass(&graph);
-  int current_nodes_num = graph->Nodes().size();
-
-  EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num);
-}
-
-// check op->scale_out
-void EqualScaleOutTest(const ProgramDesc& prog, const std::string& name,
-                       float scale) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  PrepareGraph(&graph, prog);
-  RegisterPass(&graph);
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() &&
-        boost::get<std::string>(node->Op()->GetAttr("name")) == name) {
-      float scale_out = boost::get<float>(node->Op()->GetAttr("Scale_out"));
-      EXPECT_EQ(scale_out, scale);
-    }
-  }
-}
-
-// check requant_op scales
-void CheckRequantScalesTest(const ProgramDesc& prog, float scale_in,
-                            float scale_out) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  PrepareGraph(&graph, prog);
-  RegisterPass(&graph);
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()->Type() == "requantize") {
-      float op_scale_in = boost::get<float>(node->Op()->GetAttr("Scale_in"));
-      EXPECT_EQ(op_scale_in, scale_in);
-      float op_scale_out = boost::get<float>(node->Op()->GetAttr("Scale_out"));
-      EXPECT_EQ(op_scale_out, scale_out);
-    }
-  }
-}
-
-// From Conv1->d->Dequant->e->Quant->f->Conv2
-// To Conv1->d->Conv2
-TEST(CpuQuantizeSquashPass, equal_scales) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto use_mkldnn = true;
-  // Remove 4 nodes: Dequant, Quant, e, f
-  auto remove_nodes = 4;
-
-  CountNodeTest(
-      BuildConvRequantProgramDesc(use_mkldnn, scale_out, scale, scale),
-      remove_nodes);
-}
-
-// From Conv1->d->Dequant->e->Quant->f->Conv2
-// First change to Conv1->d->Requant->f->Conv2
-// Then Conv1->f->Conv2
-TEST(CpuQuantizeSquashPass, unequal_scales) {
-  auto scale_out = 1.0f;
-  auto scale1 = 1.2345f;
-  auto scale2 = 21.0f;
-  auto use_mkldnn = true;
-  // Remove 4 nodes: Dequant, Quant, e, d
-  auto remove_nodes = 4;
-
-  CountNodeTest(
-      BuildConvRequantProgramDesc(use_mkldnn, scale_out, scale1, scale2),
-      remove_nodes);
-
-  EqualScaleOutTest(
-      BuildConvRequantProgramDesc(use_mkldnn, scale_out, scale1, scale2),
-      "Conv1", scale2);
-}
-
-// from
-// a->Conv1->b->Dequant(Scale1)->c
-// c->Quant1(Scale1)->d and d->Conv2->e
-// c->Quant2(Scale2)->g and g->Conv4->h
-// c->Conv3->f
-// to
-// a->Conv1->b
-// b->Conv2->e
-// b->Requant(Scale_in = Scale1; Scale_out = Scale2)->g->Conv4->h
-// b->Dequant(Scale1)->c->Conv3->f
-TEST(CpuQuantizeSquashPass, branch_to_equal_unequal_and_fp32) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto scale2 = 21.0f;
-  auto use_mkldnn = true;
-  // Remove 3 nodes: Quant1, c, Quant2,
-  // Insert 1 node: Requant
-  auto remove_nodes = 2;
-
-  CountNodeTest(BuildConvMultiOutputProgramDesc(use_mkldnn, scale_out, scale,
-                                                scale, scale2),
-                remove_nodes);
-  CheckRequantScalesTest(BuildConvMultiOutputProgramDesc(use_mkldnn, scale_out,
-                                                         scale, scale, scale2),
-                         scale, scale2);
-}
-
-//  a->Conv1->b->Requant->c
-//  d->Conv2->e->Requant->f
-//  {c,f}->Concat
-TEST(CpuQuantizeSquashPass, equal_scales_squash_requantize) {
-  // Delete both requantize op
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto use_mkldnn = true;
-  // Remove 4 nodes: b, Requant1, e, Requant2
-  auto remove_nodes = 4;
-  CountNodeTest(
-      BuildConvsRequantConcatProgramDesc(use_mkldnn, scale_out, scale, scale),
-      remove_nodes);
-
-  // check equal scale conv->scale_out and requant->scale_out
-  EqualScaleOutTest(
-      BuildConvsRequantConcatProgramDesc(use_mkldnn, scale_out, scale, scale),
-      "Conv1", scale);
-  EqualScaleOutTest(
-      BuildConvsRequantConcatProgramDesc(use_mkldnn, scale_out, scale, scale),
-      "Conv2", scale);
-}
-
-// a->Concat->b->Dequant->c->Quant->d->Conv->e
-// to a->Concat->b->Requant->d->Conv->e
-TEST(CpuQuantizeSquashPass,
-     unequal_scales_squash_dequantize_quantize_into_requantize) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto scale2 = 21.0f;
-  auto use_mkldnn = true;
-  // Remove 3 nodes: Dequant1, c, Quant
-  // Insert 1 node: Requant
-  auto remove_nodes = 2;
-
-  CountNodeTest(
-      BuildConcatDequantQuantProgramDesc(use_mkldnn, scale_out, scale, scale2),
-      remove_nodes);
-  CheckRequantScalesTest(
-      BuildConcatDequantQuantProgramDesc(use_mkldnn, scale_out, scale, scale2),
-      scale, scale2);
-}
-
-// a->Conv1->b
-// b->Requant1(Scale1)->c
-// b->Requant2(Scale2)->d
-TEST(CpuQuantizeSquashPass, more_than_one_conv_out_outputs) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto scale2 = 21.0f;
-  auto use_mkldnn = true;
-  // nothing change
-  auto remove_nodes = 0;
-  CountNodeTest(
-      BuildConvMultiRequantProgramDesc(use_mkldnn, scale_out, scale, scale2),
-      remove_nodes);
-}
-
-// a->Conv1->c->Concat
-TEST(CpuQuantizeSquashPass, conv_dequant_only_one_output) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto use_mkldnn = true;
-  // remove 2 nodes: Dequant1, c
-  auto remove_nodes = 2;
-  CountNodeTest(BuildConvDequantConcatProgramDesc(use_mkldnn, scale_out, scale),
-                remove_nodes);
-}
-
-TEST(CpuQuantizeSquashPass, conv_dequant_more_than_one_op_after_conv) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto use_mkldnn = true;
-  // nothing change
-  auto remove_nodes = 0;
-  CountNodeTest(BuildConvDequantConvProgramDesc(use_mkldnn, scale_out, scale),
-                remove_nodes);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(cpu_quantize_squash_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
deleted file mode 100644
index e854559ae7a8765da604c2043e8e4e8cedbbcf88..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-#define GET_NODE(id, pattern)                               \
-  PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
-                 "pattern has no Node called %s", #id);     \
-  auto* id = subgraph.at(pattern.RetrieveNode(#id));        \
-  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
-
-void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-  FusePassBase::Init("depthwise_conv_mkldnn_pass", graph);
-  GraphPatternDetector gpd;
-
-  auto* pattern = gpd.mutable_pattern();
-  pattern->NewNode("depthwise_conv")
-      ->assert_is_op("depthwise_conv2d")
-      ->assert_op_attr("use_mkldnn", true);
-
-  int found_depthwise_conv_mkldnn_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
-    GET_NODE(depthwise_conv, (*pattern));
-    depthwise_conv->Op()->SetType("conv2d");
-    found_depthwise_conv_mkldnn_count++;
-  };
-
-  gpd(graph, handler);
-  AddStatis(found_depthwise_conv_mkldnn_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(depthwise_conv_mkldnn_pass,
-              paddle::framework::ir::DepthwiseConvMKLDNNPass);
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
deleted file mode 100644
index ca314afde57bbc5a339b2016a2540309b31f0598..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class DepthwiseConvMKLDNNPass : public FusePassBase {
- public:
-  virtual ~DepthwiseConvMKLDNNPass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
deleted file mode 100644
index f2dfbc84a5a5a7feac2514731445eb191bd6f784..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
-
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetAttr("use_mkldnn", use_mkldnn);
-  op->SetAttr("name", name);
-  op->SetInput("Input", {inputs[0]});
-  op->SetInput("Filter", {inputs[1]});
-  op->SetInput("Bias", {inputs[2]});
-  op->SetOutput("Out", outputs);
-}
-
-// (a, weights, bias)->depthwise conv mkldnn->b
-// (b, weights2, bias2)->depthwise conv no mkldnn->c
-// (c, weights3, bias3)->conv mkldnn->d
-// (d, weights3, bias3)->conv no mkldnn->e
-ProgramDesc BuildProgramDesc() {
-  ProgramDesc prog;
-  for (auto& v : std::vector<std::string>(
-           {"a", "b", "c", "d", "e", "weights", "bias", "weights2", "bias2",
-            "weights3", "bias3", "weights4", "bias4"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::SELECTED_ROWS);
-    if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2" ||
-        v == "weights3" || v == "bias3" || v == "weights4" || v == "bias4") {
-      var->SetPersistable(true);
-    }
-  }
-
-  // depthwise conv with MKL-DNN
-  SetOp(&prog, "depthwise_conv2d", "conv1",
-        std::vector<std::string>({"a", "weights", "bias"}),
-        std::vector<std::string>({"b"}), true);
-  // depthwise conv without MKL-DNN
-  SetOp(&prog, "depthwise_conv2d", "conv2",
-        std::vector<std::string>({"b", "weights2", "bias2"}),
-        std::vector<std::string>({"c"}), false);
-  // conv with MKL-DNN
-  SetOp(&prog, "conv2d", "conv3",
-        std::vector<std::string>({"c", "weights3", "bias3"}),
-        std::vector<std::string>({"d"}), true);
-  // conv without MKL-dNN
-  SetOp(&prog, "conv2d", "conv4",
-        std::vector<std::string>({"d", "weights4", "bias4"}),
-        std::vector<std::string>({"e"}), false);
-
-  return prog;
-}
-
-TEST(DepthwiseConvMKLDNNPass, basic) {
-  auto prog = BuildProgramDesc();
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");
-
-  struct counters {
-    int mkldnn_depthwise_conv_nodes;
-    int other_depthwise_conv_nodes;
-    int mkldnn_conv_nodes;
-    int other_conv_nodes;
-  };
-
-  counters before{1, 1, 1, 1};
-
-  graph.reset(pass->Apply(graph.release()));
-
-  // initialize counters before loop
-  counters after{0, 0, 0, 0};
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->Type() == "conv2d") {
-        if (boost::get<bool>(op->GetAttr("use_mkldnn")))
-          after.mkldnn_conv_nodes++;
-        else
-          after.other_conv_nodes++;
-      } else if (op->Type() == "depthwise_conv2d") {
-        if (boost::get<bool>(op->GetAttr("use_mkldnn")))
-          after.mkldnn_depthwise_conv_nodes++;
-        else
-          after.other_depthwise_conv_nodes++;
-      }
-    }
-  }
-
-  EXPECT_EQ(after.other_depthwise_conv_nodes,
-            before.other_depthwise_conv_nodes);
-  EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes);
-  EXPECT_EQ(after.mkldnn_depthwise_conv_nodes,
-            before.mkldnn_depthwise_conv_nodes - 1);
-  EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(depthwise_conv_mkldnn_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
deleted file mode 100644
index 9cc2d3da3fceea06a7a753a88577fdff59a4a136..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h"
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
-  Init("fc_mkldnn_pass", graph);
-
-  auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
-
-  GraphPatternDetector gpd;
-  auto* x = gpd.mutable_pattern()
-                ->NewNode("fc_mkldnn_pass/x")
-                ->AsInput()
-                ->assert_is_op_input("fc", "Input");
-  patterns::FCMKLDNN fc_pattern(gpd.mutable_pattern(), "fc_mkldnn_pass");
-  fc_pattern(x, true /*with bias*/);
-
-  int found_fc_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "Handle FC MKL-DNN pass";
-    if (!(graph->Has("use_mkldnn") && graph->Get<bool>("use_mkldnn"))) {
-      VLOG(3) << "do not perform fc fuse";
-      return;
-    }
-    GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(weights, weights, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(bias, bias, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern);
-
-    OpDesc* desc = fc->Op();
-    auto in_size = fc->inputs[0]->Var()->GetShape().size();
-    if (in_size != 2 && in_size != 4) {
-      VLOG(3) << "Do not enable FC MKL-DNN for dimensions different than 2 & 4";
-      return;
-    }
-    desc->SetAttr("use_mkldnn", true);
-    PADDLE_ENFORCE(subgraph.count(x));
-
-    found_fc_count++;
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_fc_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fc_mkldnn_pass, paddle::framework::ir::FCMKLDNNPass);
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h
deleted file mode 100644
index 97c6b242989591ccf24e52a969bfcedc4f377c3f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <memory>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Transpose weights of FC to comply with MKL-DNN interface
- */
-class FCMKLDNNPass : public FusePassBase {
- public:
-  virtual ~FCMKLDNNPass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
deleted file mode 100644
index 6032f38b0cffd8627c547a08e5f5b657decf89df..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
-
-REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass)
-    .RequirePassAttr("mkldnn_enabled_op_types");
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
deleted file mode 100644
index 98bd2d0aa0280a77ee274aa4f53b1eed99fdf7fe..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/placement_pass_base.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Specifies which operators should use MKLDNN.
- */
-class MKLDNNPlacementPass : public PlacementPassBase {
- private:
-  const std::string GetPlacementName() const { return "MKLDNN"; }
-
-  const std::string GetAttrName() const { return "use_mkldnn"; }
-
-  const std::unordered_set<std::string> GetOpTypesList() const {
-    return Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
deleted file mode 100644
index 5885f327e610a5c3d931a00b36066194dac8994a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
-
-#include <gtest/gtest.h>
-#include <boost/logic/tribool.hpp>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs, boost::tribool use_mkldnn) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-
-  op->SetType(type);
-
-  if (!boost::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn);
-
-  if (type == "conv2d") {
-    op->SetAttr("name", name);
-    op->SetInput("Input", {inputs[0]});
-    op->SetInput("Filter", {inputs[1]});
-    op->SetInput("Bias", {inputs[2]});
-  } else if (type == "relu") {
-    op->SetInput("X", inputs);
-  } else if (type == "concat") {
-    op->SetAttr("axis", 1);
-    op->SetInput("X", {inputs[0], inputs[1]});
-  } else if (type == "pool2d") {
-    op->SetInput("X", {inputs[0]});
-  } else {
-    FAIL() << "Unexpected operator type.";
-  }
-  op->SetOutput("Out", {outputs[0]});
-}
-
-// operator                      use_mkldnn
-// ---------------------------------------
-// (a,b)->concat->c              none
-// (c,weights,bias)->conv->f     none
-// f->relu->g                    false
-// g->pool->h                    false
-// (h,weights2,bias2)->conv->k   true
-// k->relu->l                    true
-ProgramDesc BuildProgramDesc() {
-  ProgramDesc prog;
-
-  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
-                                 "h", "weights2", "bias2", "k", "l"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::SELECTED_ROWS);
-    if (v == "weights" || v == "bias") {
-      var->SetPersistable(true);
-    }
-  }
-
-  SetOp(&prog, "concat", "concat1", std::vector<std::string>({"a", "b"}),
-        std::vector<std::string>({"c"}), boost::indeterminate);
-  SetOp(&prog, "conv2d", "conv1",
-        std::vector<std::string>({"c", "weights", "bias"}),
-        std::vector<std::string>({"f"}), boost::indeterminate);
-  SetOp(&prog, "relu", "relu1", std::vector<std::string>({"f"}),
-        std::vector<std::string>({"g"}), false);
-  SetOp(&prog, "pool2d", "pool1", std::vector<std::string>({"g"}),
-        std::vector<std::string>({"h"}), false);
-  SetOp(&prog, "conv2d", "conv2",
-        std::vector<std::string>({"h", "weights2", "bias2"}),
-        std::vector<std::string>({"k"}), true);
-  SetOp(&prog, "relu", "relu2", std::vector<std::string>({"k"}),
-        std::vector<std::string>({"l"}), true);
-
-  return prog;
-}
-
-void MainTest(std::initializer_list<std::string> mkldnn_enabled_op_types,
-              unsigned expected_use_mkldnn_true_count) {
-  auto prog = BuildProgramDesc();
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  auto pass = PassRegistry::Instance().Get("mkldnn_placement_pass");
-  pass->Set("mkldnn_enabled_op_types",
-            new std::unordered_set<std::string>(mkldnn_enabled_op_types));
-
-  graph.reset(pass->Apply(graph.release()));
-
-  unsigned use_mkldnn_true_count = 0;
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->HasAttr("use_mkldnn") &&
-          boost::get<bool>(op->GetAttr("use_mkldnn"))) {
-        ++use_mkldnn_true_count;
-      }
-    }
-  }
-
-  EXPECT_EQ(use_mkldnn_true_count, expected_use_mkldnn_true_count);
-}
-
-TEST(MKLDNNPlacementPass, enable_conv_relu) {
-  // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 0 pool
-  MainTest({"conv2d", "relu"}, 3);
-}
-
-TEST(MKLDNNPlacementPass, enable_relu_pool) {
-  // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
-  MainTest({"relu", "pool2d"}, 4);
-}
-
-TEST(MKLDNNPlacementPass, enable_all) {
-  // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
-  MainTest({}, 4);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(mkldnn_placement_pass);
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
deleted file mode 100644
index a8720ff4bfb5c7fa7aee6d23949b030c328b90e6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ /dev/null
@@ -1,336 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/multi_batch_merge_pass.h"
-
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-static const char kNumRepeats[] = "num_repeats";
-typedef std::unordered_map<std::string, std::vector<ir::Node*>> SSAVarList;
-
-ir::Node* SameNameVar(std::unordered_set<ir::Node*> all, ir::Node* target) {
-  for (auto n : all) {
-    if (target->IsVar() && target->Name() == n->Name()) {
-      return n;
-    }
-  }
-  return nullptr;
-}
-
-VarDesc CopyVarDesc(VarDesc* var_desc) {
-  VarDesc repeated_var(var_desc->Name());
-  // copy other variable attributes
-  if (var_desc->GetType() != proto::VarType::READER) {
-    repeated_var.SetType(var_desc->GetType());
-    repeated_var.SetShape(var_desc->GetShape());
-    repeated_var.SetDataType(var_desc->GetDataType());
-    repeated_var.SetLoDLevel(var_desc->GetLoDLevel());
-    repeated_var.SetPersistable(var_desc->Persistable());
-  } else {
-    // TODO(typhoonzero): copy reader var
-  }
-  return repeated_var;
-}
-
-VarDesc UpdateGradVarDesc(
-    VarDesc* var_desc, int repeat,
-    const std::unordered_set<std::string>& grad_names,
-    const std::unordered_set<std::string>& bn_vars_need_rename) {
-  if (grad_names.find(var_desc->Name()) != grad_names.end() ||
-      bn_vars_need_rename.find(var_desc->Name()) != bn_vars_need_rename.end()) {
-    std::string new_gname =
-        string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat);
-    VarDesc repeated_var = CopyVarDesc(var_desc);
-    repeated_var.SetName(new_gname);
-    VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat;
-    return repeated_var;
-  }
-  return *var_desc;
-}
-
-void BatchMergePass::ApplyImpl(ir::Graph* graph) const {
-  int num_repeats = Get<const int>(kNumRepeats);
-  std::vector<Node*> forward_backward_ops;
-  std::vector<Node*> optimize_ops;
-  std::vector<Node*> lr_ops;  // ops other than forward/backward/optimize
-  std::unordered_set<std::string> grad_names;
-  std::unordered_map<std::string, std::string> gradname2paramname;
-
-  std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
-  auto origin_nodes = graph->ReleaseNodes();
-  VLOG(3) << "origin nodes count: " << origin_nodes.size();
-  ir::Graph& result = *graph;
-
-  // 1. record op nodes of different roles
-  for (auto node : nodes) {
-    if (!node->IsOp()) continue;
-    PADDLE_ENFORCE(node->Op(), "must find opdesc");
-    int op_role = boost::get<int>(node->Op()->GetAttr(
-        framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
-    if ((op_role == static_cast<int>(framework::OpRole::kForward)) ||
-        (op_role & static_cast<int>(framework::OpRole::kBackward)) ||
-        (op_role & static_cast<int>(framework::OpRole::kLoss))) {
-      forward_backward_ops.push_back(node);
-    } else if ((op_role & static_cast<int>(framework::OpRole::kOptimize)) ||
-               (op_role & static_cast<int>(framework::OpRole::kDist)) ||
-               (op_role & static_cast<int>(framework::OpRole::kRPC))) {
-      optimize_ops.push_back(node);
-      auto op_role_var = node->Op()->GetNullableAttr(
-          OpProtoAndCheckerMaker::OpRoleVarAttrName());
-      auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
-      for (size_t i = 0; i < op_role_vars.size(); i += 2) {
-        grad_names.insert(op_role_vars[i + 1]);
-        gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i];
-      }
-    } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
-      lr_ops.push_back(node);
-    } else {  // NOLINT
-      PADDLE_THROW("Invalid op_role: %d", static_cast<int>(op_role));
-    }
-  }
-
-  // 2. copy forward backward
-  ir::Node* prev_repeat_last_op_node = nullptr;
-  // record origin_grad -> repeated_grad_list map.
-  std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
-  std::map<std::string, std::vector<ir::Node*>> created;
-  std::unordered_set<std::string> bn_vars_need_rename;
-  for (int i = 0; i < num_repeats; ++i) {
-    std::unordered_set<ir::Node*> copied;
-    for (size_t node_idx = 0; node_idx < forward_backward_ops.size();
-         ++node_idx) {
-      auto node = forward_backward_ops[node_idx];
-      OpDesc repeated_op(*(node->Op()), node->Op()->Block());
-      // 3. rename grad outputs to current repeat.
-      for (auto outname : repeated_op.OutputArgumentNames()) {
-        if (grad_names.find(outname) != grad_names.end()) {
-          std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
-          repeated_op.RenameOutput(outname, new_gname);
-          // remove op_role_var for backward ops that outputs grad for a
-          // parameter.
-          repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
-                              std::vector<std::string>());
-        }
-      }
-      // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
-      // not need this update, because only moving mean and variance should be
-      // differ, trainable parameter scale and bias is the same as other
-      // parameters.
-      if (node->Name() == "batch_norm") {
-        // NOTE: assume bn op created by layers use save var as output mean and
-        // variance
-        std::string new_mean_name =
-            string::Sprintf("%s.repeat.%d", repeated_op.Input("Mean")[0], i);
-        std::string new_var_name = string::Sprintf(
-            "%s.repeat.%d", repeated_op.Input("Variance")[0], i);
-        bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]);
-        bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]);
-        VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to "
-                << new_mean_name;
-        repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name);
-        repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name);
-        repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0],
-                                 new_mean_name);
-        repeated_op.RenameOutput(repeated_op.Output("VarianceOut")[0],
-                                 new_var_name);
-      }
-
-      // 3.9 do copy
-      auto repeated_node = result.CreateOpNode(&repeated_op);
-      copied.insert(node);
-
-      // 4. add deps between repeats
-      if (node_idx == forward_backward_ops.size() - 1) {
-        prev_repeat_last_op_node = repeated_node;
-      }
-      if (node_idx == 0 && prev_repeat_last_op_node) {
-        auto* depvar = result.CreateControlDepVar();
-        prev_repeat_last_op_node->outputs.push_back(depvar);
-        depvar->inputs.push_back(prev_repeat_last_op_node);
-        repeated_node->inputs.push_back(depvar);
-        depvar->outputs.push_back(repeated_node);
-      }
-
-      for (auto in_node : node->inputs) {
-        if (in_node->IsCtrlVar()) {
-          continue;
-        }
-        ir::Node* var = nullptr;
-        auto updated_var = UpdateGradVarDesc(in_node->Var(), i, grad_names,
-                                             bn_vars_need_rename);
-        // should be initialized by startup, how to initilize tensor in the
-        // scope?
-        if (node->Name() == "batch_norm" &&
-            bn_vars_need_rename.find(in_node->Name()) !=
-                bn_vars_need_rename.end()) {
-          // Create bn mean/variance for each repeat
-          var = result.CreateVarNode(&updated_var);
-          created[updated_var.Name()].push_back(var);
-          copied.insert(in_node);
-          repeated_node->inputs.push_back(var);
-          var->outputs.push_back(repeated_node);
-          continue;
-        }
-
-        // for other ops
-        if (in_node->inputs.empty() && i > 0) {
-          // do not copy head vars (inputs, params) in repeats > 0
-          var = created.at(in_node->Name()).back();
-        } else {
-          if (copied.find(in_node) == copied.end()) {
-            var = result.CreateVarNode(&updated_var);
-            if (grad_names.find(in_node->Var()->Name()) != grad_names.end()) {
-              grad_repeated_map[in_node].push_back(var);
-            }
-            copied.insert(in_node);
-            created[updated_var.Name()].push_back(var);
-          } else {
-            var = created.at(updated_var.Name()).back();
-          }
-        }
-        repeated_node->inputs.push_back(var);
-        var->outputs.push_back(repeated_node);
-      }
-      for (auto out_node : node->outputs) {
-        if (out_node->IsCtrlVar()) {
-          continue;
-        }
-        ir::Node* var = nullptr;
-        auto updated_var = UpdateGradVarDesc(out_node->Var(), i, grad_names,
-                                             bn_vars_need_rename);
-        if (copied.find(out_node) == copied.end()) {
-          var = result.CreateVarNode(&updated_var);
-          if (grad_names.find(out_node->Var()->Name()) != grad_names.end()) {
-            grad_repeated_map[out_node].push_back(var);
-          }
-          copied.insert(out_node);
-          created[updated_var.Name()].push_back(var);
-        } else {
-          var = created.at(updated_var.Name()).back();
-        }
-        repeated_node->outputs.push_back(var);
-        var->inputs.push_back(repeated_node);
-      }
-    }
-  }  // end copy forward backward
-
-  // 5. create GRAD merge op node: sum(repeat.0...repeat.n) ->
-  // scale(1/num_repeats)
-  for (auto kv : grad_repeated_map) {
-    OpDesc sum_op;
-    sum_op.SetType("sum");
-    std::vector<std::string> repeated_grad_names;
-    std::vector<std::string> param_grad_op_role_var;
-    for (auto r : kv.second) {
-      repeated_grad_names.push_back(r->Var()->Name());
-    }
-    // NOTE: use op_role_var to control allreduce op appending in
-    //       multi_devices_graph_pass, we want to append op_role_var
-    //       only once for the merged gradient, so break after first call.
-    param_grad_op_role_var.push_back(
-        gradname2paramname.at(kv.first->Var()->Name()));        // param
-    param_grad_op_role_var.push_back(kv.first->Var()->Name());  // grad
-
-    sum_op.SetInput("X", repeated_grad_names);
-    sum_op.SetOutput("Out", {kv.first->Var()->Name()});
-    sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                   static_cast<int>(OpRole::kBackward));
-    auto sum_op_node = result.CreateOpNode(&sum_op);
-    for (auto r : kv.second) {
-      sum_op_node->inputs.push_back(r);
-      r->outputs.push_back(sum_op_node);
-    }
-    auto sum_out_var_node = result.CreateVarNode(kv.first->Var());
-    sum_op_node->outputs.push_back(sum_out_var_node);
-    sum_out_var_node->inputs.push_back(sum_op_node);
-    created[sum_out_var_node->Name()].push_back(sum_out_var_node);
-
-    OpDesc scale_op;
-    scale_op.SetType("scale");
-    scale_op.SetInput("X", {sum_out_var_node->Var()->Name()});
-    // NOTE: inplace scale.
-    scale_op.SetOutput("Out", {sum_out_var_node->Var()->Name()});
-    scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
-    scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                     static_cast<int>(OpRole::kBackward));
-
-    scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
-                     param_grad_op_role_var);
-
-    auto scale_op_node = result.CreateOpNode(&scale_op);
-    scale_op_node->inputs.push_back(sum_out_var_node);
-    sum_out_var_node->outputs.push_back(scale_op_node);
-    auto scale_out_var_node = result.CreateVarNode(sum_out_var_node->Var());
-    scale_op_node->outputs.push_back(scale_out_var_node);
-    scale_out_var_node->inputs.push_back(scale_op_node);
-    created[scale_out_var_node->Name()].push_back(scale_out_var_node);
-  }
-  // 6. add optimize ops
-  {
-    auto copy_node = [&result, &created](ir::Node* node) {
-      auto op_node = result.CreateOpNode(node->Op());
-      // copy op ins/outs
-      // NOTE: for send/recv ops, the OpDesc uses ctrldepvar to describe
-      // dependencies, so create those depvars if OpDesc have in/outs.
-      for (auto in_node : node->inputs) {
-        if (in_node->IsCtrlVar() && !in_node->Var()) {
-          continue;
-        }
-        ir::Node* var = nullptr;
-        if (created.find(in_node->Name()) == created.end()) {
-          var = result.CreateVarNode(in_node->Var());
-          created[in_node->Name()].push_back(var);
-        } else {
-          var = created.at(in_node->Name()).back();
-        }
-        op_node->inputs.push_back(var);
-        var->outputs.push_back(op_node);
-      }
-      for (auto out_node : node->outputs) {
-        if (out_node->IsCtrlVar() && !out_node->Var()) {
-          continue;
-        }
-        auto var = result.CreateVarNode(out_node->Var());
-        created[out_node->Name()].push_back(var);
-        op_node->outputs.push_back(var);
-        var->inputs.push_back(op_node);
-      }
-    };
-    for (auto node : lr_ops) {
-      copy_node(node);
-    }
-    for (auto node : optimize_ops) {
-      copy_node(node);
-    }
-  }
-
-  result.ResolveHazard(created);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(multi_batch_merge_pass, paddle::framework::ir::BatchMergePass)
-    .RequirePassAttr(paddle::framework::ir::kNumRepeats);
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
deleted file mode 100644
index a89616683d9c625111272fd8c1de237a5c9dbe8f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// BatchMergePass is used to copy forward and backward ops for several
-// times to run several batches to simulate large batch size training
-// as if we have more than 1 GPUs.
-// User can define how many batches to run, gradients will be merged
-// through those repeats, and then do optimization using merged gradients.
-// This pass is extremely useful when doing large batch-size distributed
-// sync training, we can simulate even large batch size as if we have more
-// GPUs.
-
-class BatchMergePass : public Pass {
- public:
-  virtual ~BatchMergePass() {}
-
- protected:
-  void ApplyImpl(Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt
deleted file mode 100644
index 4cdb6a7d30882d095a2666ccc45ed7716954c37c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
-
-cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
-cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
-
-set(ALL_REDUCE_OP_HANDLES all_reduce_op_handle)
-if(WITH_GPU AND WITH_DGC)
-  list(APPEND ALL_REDUCE_OP_HANDLES sparse_all_reduce_op_handle)
-endif()
-
-cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle ${ALL_REDUCE_OP_HANDLES} reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
-cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
-
-cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
-cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS all_reduce_op_handle graph graph_helper pass)
-cc_library(backward_optimizer_op_deps_pass SRCS backward_optimizer_op_deps_pass.cc DEPS graph graph_helper pass)
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
deleted file mode 100644
index fccc36bae15fe66d2fc02353b1e821eee22c6424..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
+++ /dev/null
@@ -1,219 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class AllReduceDepsPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override {
-    std::vector<details::OpHandleBase*> all_reduce_op_handles =
-        GetSortedAllReduceOps(*graph);
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto use_hierarchical_allreduce =
-        Get<bool>(details::kUseHierarchicalAllReduce);
-    for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) {
-      auto op_handle =
-          dynamic_cast<details::NCCLOpHandleBase*>(all_reduce_op_handles[i]);
-      PADDLE_ENFORCE(op_handle, "op_handle must be NCCLOpHandleBase");
-      op_handle->SetRunEnv(i, use_hierarchical_allreduce);
-    }
-#endif
-
-    for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) {
-      auto* dep_var = new details::DummyVarHandle(graph->CreateControlDepVar());
-      graph->Get<details::GraphDepVars>(details::kGraphDepVars)
-          .emplace(dep_var);
-      all_reduce_op_handles[i - 1]->AddOutput(dep_var);
-      all_reduce_op_handles[i]->AddInput(dep_var);
-    }
-
-    if (VLOG_IS_ON(10)) {
-      DebugString(*graph, all_reduce_op_handles);
-    }
-  }
-
-  std::vector<details::OpHandleBase*> GetSortedAllReduceOps(
-      const ir::Graph& graph) const {
-    std::vector<details::OpHandleBase*> all_reduce_op_handles;
-    std::unordered_map<details::OpHandleBase*, size_t> pending_ops;
-    std::unordered_set<details::OpHandleBase*> ready_ops;
-    std::unordered_set<details::OpHandleBase*> next_ready_ops;
-    auto op_handles = ir::FilterByNodeWrapper<details::OpHandleBase>(graph);
-    size_t num_of_ops = op_handles.size();
-    for (details::OpHandleBase* op : op_handles) {
-      size_t not_ready_vars = op->NotReadyInputSize();
-      if (not_ready_vars) {
-        pending_ops.insert({op, not_ready_vars});
-      } else {
-        ready_ops.insert(op);
-      }
-    }
-
-    GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
-
-    size_t has_run_ops = ready_ops.size();
-    while (has_run_ops != num_of_ops) {
-      for (auto* op : ready_ops) {
-        for (auto& ready_var : op->Outputs()) {
-          for (auto* pend_op : ready_var->PendingOps()) {
-            auto& deps = --pending_ops[pend_op];
-            if (deps == 0) {
-              next_ready_ops.insert(pend_op);
-            }
-          }
-        }
-      }
-
-      PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There maybe have a cycle.");
-      ready_ops.clear();
-      std::swap(ready_ops, next_ready_ops);
-      GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
-      has_run_ops += ready_ops.size();
-    }
-    return all_reduce_op_handles;
-  }
-
-  void GetSortedAllReduceOps(
-      const std::unordered_set<details::OpHandleBase*>& ready_ops,
-      std::vector<details::OpHandleBase*>* all_reduce_op_handles) const {
-    std::vector<details::OpHandleBase*> current_all_reduce_op_handles;
-    for (auto& op_handle : ready_ops) {
-      auto all_reduce_op_handle =
-          dynamic_cast<details::AllReduceOpHandle*>(op_handle);
-      auto fused_all_reduce_op_handle =
-          dynamic_cast<details::FusedAllReduceOpHandle*>(op_handle);
-
-      if (all_reduce_op_handle || fused_all_reduce_op_handle) {
-        current_all_reduce_op_handles.emplace_back(op_handle);
-      }
-    }
-
-    // NOTE(zcd): For distributed training, it is important to keep the order of
-    // allReduce on each node consistent. Otherwise, hang may occur.
-    // Sort the current_all_reduce_op_handles according to the name of input.
-    sort(current_all_reduce_op_handles.begin(),
-         current_all_reduce_op_handles.end(),
-         [](const details::OpHandleBase* left,
-            const details::OpHandleBase* right) -> bool {
-           auto left_in_vars =
-               details::DynamicCast<details::VarHandle>(left->Inputs());
-           auto right_in_vars =
-               details::DynamicCast<details::VarHandle>(right->Inputs());
-           PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
-           PADDLE_ENFORCE_GT(right_in_vars.size(), 0);
-           return left_in_vars[0]->Name() > right_in_vars[0]->Name();
-         });
-
-    all_reduce_op_handles->insert(all_reduce_op_handles->end(),
-                                  current_all_reduce_op_handles.begin(),
-                                  current_all_reduce_op_handles.end());
-  }
-
-  void DebugString(
-      const ir::Graph& graph,
-      const std::vector<details::OpHandleBase*>& all_reduce_op_handles) const {
-    // get vars order
-    std::map<int, std::vector<std::string>> vars =
-        GetSoredGradientsFromStaleProgram(graph);
-    std::stringstream out;
-    size_t grads_of_stale_program = 0;
-    out << "Get Order From details::kStaleProgramOpDescs: ";
-    for (auto& var : vars) {
-      out << "Order " << var.first << " [";
-      for (auto& var_name : var.second) {
-        out << var_name << ", ";
-        ++grads_of_stale_program;
-      }
-      out << "], ";
-    }
-    VLOG(10) << out.str();
-
-    std::stringstream out2;
-    out2 << "Get Order From Topological order: ";
-    for (auto& op : all_reduce_op_handles) {
-      bool find_valid_input = false;
-      for (auto& in_var : op->Inputs()) {
-        if (dynamic_cast<details::VarHandle*>(in_var)) {
-          out2 << in_var->Name() << ", ";
-          find_valid_input = true;
-          break;
-        }
-      }
-      PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input.");
-    }
-    VLOG(10) << out2.str();
-    if (grads_of_stale_program != all_reduce_op_handles.size()) {
-      VLOG(10)
-          << "The gradients number of stale program and graph is not equal.";
-    }
-  }
-
-  std::map<int, std::vector<std::string>> GetSoredGradientsFromStaleProgram(
-      const ir::Graph& graph) const {
-    std::map<int, std::vector<std::string>> vars;
-    auto ops =
-        graph.Get<const std::vector<OpDesc*>>(details::kStaleProgramOpDescs);
-    int order = 0;
-    for (auto* op_desc : ops) {
-      try {
-        bool is_bk_op =
-            static_cast<bool>(boost::get<int>(op_desc->GetAttr(
-                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                              static_cast<int>(OpRole::kBackward));
-        if (!is_bk_op) continue;
-
-        auto backward_vars =
-            boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
-                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-        if (backward_vars.empty()) continue;
-
-        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-        for (size_t i = 1; i < backward_vars.size(); i += 2) {
-          vars[order].emplace_back(backward_vars[i]);
-          VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1]
-                  << ", " << backward_vars[i];
-        }
-        order++;
-      } catch (boost::bad_get e) {
-      }
-    }
-    return vars;
-  }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(all_reduce_deps_pass, paddle::framework::ir::AllReduceDepsPass)
-    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
deleted file mode 100644
index c7ab32a2c1a91216c6ffc9c2d8e8dc812bd38cd4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
+++ /dev/null
@@ -1,223 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class BackWardOpDepsPass : public ir::Pass {
- protected:
-  void AddDep(ir::Graph* graph, details::OpHandleBase* l,
-              details::OpHandleBase* r) const {
-    auto* dep_var = new details::DummyVarHandle(graph->CreateControlDepVar());
-    graph->Get<details::GraphDepVars>(details::kGraphDepVars).emplace(dep_var);
-    l->AddOutput(dep_var);
-    r->AddInput(dep_var);
-    VLOG(10) << "add deps:" << l->DebugString() << " and " << r->DebugString();
-  }
-
-  void ApplyImpl(ir::Graph* graph) const override {
-    // NOTE: The operator nodes should be in topology order.
-    std::vector<details::OpHandleBase*> backward_op_handles;
-    std::vector<details::OpHandleBase*> all_opt_handles;
-    details::ParamsAndGrads params_grads;
-    std::vector<ir::Node*> topo_nodes = ir::TopologySortOperations(*graph);
-    for (auto& node : topo_nodes) {
-      if (!node->Op()) continue;
-
-      GetBackWardOpHandles(node, &backward_op_handles, &params_grads);
-      GetOptimizerOpHandles(node, &all_opt_handles);
-    }
-
-    VLOG(10) << "backward_op_handles size:" << backward_op_handles.size()
-             << ", opt_handles size:" << all_opt_handles.size();
-
-    if (backward_op_handles.size() <= 1 || all_opt_handles.size() <= 1) {
-      VLOG(10) << "need not backward_op_deps_pass";
-      return;
-    }
-
-    std::vector<details::OpHandleBase*> opt_handles;
-    GetOptimizerHandlesRoot(all_opt_handles, &opt_handles, params_grads);
-
-    if (opt_handles.size() <= 1) {
-      VLOG(10) << "need not backward_op_deps_pass";
-      return;
-    }
-
-    VLOG(10) << "add optimize deps";
-    for (size_t i = 1; i < opt_handles.size(); ++i) {
-      AddDep(graph, opt_handles[i - 1], opt_handles[i]);
-    }
-
-    VLOG(10) << "add deps between backward and optimze:";
-    AddDep(graph, backward_op_handles[backward_op_handles.size() - 1],
-           opt_handles[0]);
-  }
-
-  /*
-   * When the backward ophandles complete, the optimizer ophandle's inputs var
-   * are ready.Since the optimizer ophandles can be seen as graphs which each of
-   * them doesn't connect to each other, they can run parallelly or by a
-   * specified order, such as by the grads generated order. This function will
-   * get these graphs' root.
-   */
-  void GetOptimizerHandlesRoot(
-      const std::vector<details::OpHandleBase*>& ops,
-      std::vector<details::OpHandleBase*>* result,
-      const details::ParamsAndGrads& params_grads) const {
-    std::unordered_set<details::OpHandleBase*> visit;
-    for (auto op : ops) {
-      if (visit.find(op) != visit.end()) {
-        continue;
-      }
-
-      VLOG(10) << "visiting all_opt_handles:" << op->DebugString();
-
-      result->emplace_back(op);
-      visit.insert(op);
-      VisitChildrens(op, &visit);
-    }
-
-    for (size_t i = 0; i < result->size(); i++) {
-      VLOG(10) << "get potential head op:" << (*result)[i]->DebugString();
-    }
-
-    // sort by param_grad order
-    std::unordered_map<std::string, int> pg_order;
-    int order = 0;
-    for (auto& p_g : params_grads) {
-      pg_order[p_g.second] = order++;
-    }
-
-    std::vector<std::pair<details::OpHandleBase*, int>> op_handles;
-    for (auto op : *result) {
-      int order = 0;
-      for (auto input : op->Inputs()) {
-        if (dynamic_cast<details::VarHandle*>(input) == nullptr) continue;
-
-        if (pg_order.find(input->Name()) == pg_order.end()) {
-          VLOG(10) << "not find input " << input->Name() << " in grad";
-          continue;
-        }
-
-        if (order < pg_order.at(input->Name())) {
-          order = pg_order.at(input->Name());
-        }
-      }
-      op_handles.emplace_back(std::make_pair(op, order));
-    }
-
-    sort(op_handles.begin(), op_handles.end(),
-         [](const std::pair<details::OpHandleBase*, int>& left,
-            const std::pair<details::OpHandleBase*, int>& right) -> bool {
-           return left.second < right.second;
-         });
-
-    result->clear();
-    for (auto p : op_handles) {
-      result->emplace_back(p.first);
-    }
-
-    for (size_t i = 0; i < result->size(); i++) {
-      VLOG(10) << "get head op:" << (*result)[i]->DebugString();
-    }
-  }
-
-  void VisitChildrens(details::OpHandleBase* op,
-                      std::unordered_set<details::OpHandleBase*>* visit) const {
-    for (auto out : op->Outputs()) {
-      for (auto* pending_op : out->PendingOps()) {
-        if (visit->find(pending_op) != visit->end()) {
-          continue;
-        }
-
-        VLOG(10) << "visiting:" << pending_op->DebugString();
-
-        visit->insert(pending_op);
-        VisitChildrens(pending_op, visit);
-      }
-    }
-  }
-
-  void GetBackWardOpHandles(
-      ir::Node* node, std::vector<details::OpHandleBase*>* backward_op_handles,
-      details::ParamsAndGrads* params_grads) const {
-    try {
-      bool is_bk_op =
-          static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
-                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                            static_cast<int>(OpRole::kBackward));
-      if (!is_bk_op) return;
-
-      // Currently, we assume that once gradient is generated, it can be
-      // broadcast, and each gradient is only broadcast once.
-      auto backward_vars =
-          boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
-              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
-      PADDLE_ENFORCE(node->IsWrappedBy<details::OpHandleBase>());
-
-      backward_op_handles->emplace_back(
-          &node->Wrapper<details::OpHandleBase>());
-
-      for (size_t i = 0; i < backward_vars.size(); i += 2) {
-        VLOG(10) << "Trainable parameter: " << backward_vars[i]
-                 << ", gradient: " << backward_vars[i + 1];
-
-        params_grads->emplace_back(std::make_pair(
-            backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/));
-      }
-    } catch (boost::bad_get e) {
-    }
-  }
-
-  void GetOptimizerOpHandles(
-      ir::Node* node, std::vector<details::OpHandleBase*>* opt_handles) const {
-    try {
-      bool is_opt_op =
-          static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
-                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                            static_cast<int>(OpRole::kOptimize));
-      if (!is_opt_op) return;
-
-      opt_handles->emplace_back(&node->Wrapper<details::OpHandleBase>());
-    } catch (boost::bad_get e) {
-    }
-  }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(backward_optimizer_op_deps_pass,
-              paddle::framework::ir::BackWardOpDepsPass);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
deleted file mode 100644
index 73d7bf6dba0f01cc53ed0e9010c2da88afd6e384..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+++ /dev/null
@@ -1,219 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class FuseAllReduceOpPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override {
-    if (Get<size_t>(details::kNRanks) <= 1) {
-      VLOG(6) << "The number of place is" << Get<size_t>(details::kNRanks)
-              << ", there doesn't need apply FuseAllReduceOpPass.";
-      return;
-    }
-
-    auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
-    auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto *multi_nccl_ctxs =
-        &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
-#endif
-
-    ir::Graph &result = *graph;
-    auto &params_grads =
-        result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
-    size_t num_of_all_reduce = params_grads.size();
-    std::unordered_set<std::string> grads;
-    grads.reserve(num_of_all_reduce);
-    for (auto p_g : params_grads) {
-      grads.insert(p_g.second);
-    }
-
-    std::unordered_map<std::string, Node *> all_reduce_ops =
-        GetAllReduceOps(result, places, grads);
-
-    VLOG(6) << "Find all_reduce_ops: " << all_reduce_ops.size();
-    if (all_reduce_ops.size() == 0) {
-      return;
-    }
-
-    PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(),
-                      "The number of all_reduce OpHandle is not equal to the "
-                      "number of grads. Maybe some gradients are sparse type, "
-                      "it is not supported currently.");
-
-    auto &group_params_grads = graph->Get<details::GroupParamsAndGrads>(
-        details::kGroupParamsAndDenseGrads);
-
-    LOG(WARNING) << string::Sprintf(
-        "Find all_reduce operators: %d. To make the speed faster, some "
-        "all_reduce ops are fused during training, after fusion, "
-        "the number of all_reduce ops is %d.",
-        all_reduce_ops.size(), group_params_grads.size());
-
-    for (auto &group_p_g : group_params_grads) {
-      size_t group_size = group_p_g.size();
-      PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
-      std::vector<ir::Node *> group_all_reduce_ops;
-      group_all_reduce_ops.reserve(group_size);
-      for (auto &p_g : group_p_g) {
-        group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second));
-      }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      InsertFusedAllReduce(places, local_scopes, group_size,
-                           group_all_reduce_ops, multi_nccl_ctxs, &result);
-#else
-      InsertFusedAllReduce(places, local_scopes, group_size,
-                           group_all_reduce_ops, &result);
-#endif
-    }
-  }
-
-  std::unordered_map<std::string, Node *> GetAllReduceOps(
-      const Graph &result, const std::vector<platform::Place> &places,
-      const std::unordered_set<std::string> &grads) const {
-    size_t num_place = places.size();
-    std::unordered_map<std::string, Node *> all_reduce_ops;
-    all_reduce_ops.reserve(grads.size());
-    for (auto &node : result.Nodes()) {
-      if (node->IsOp()) {
-        PADDLE_ENFORCE(node->IsWrappedBy<details::OpHandleBase>());
-        auto *all_reduce_op_handle = dynamic_cast<details::AllReduceOpHandle *>(
-            &node->Wrapper<details::OpHandleBase>());
-        if (all_reduce_op_handle) {
-          auto inputs = details::DynamicCast<details::VarHandle>(
-              all_reduce_op_handle->Inputs());
-          PADDLE_ENFORCE_EQ(inputs.size(), num_place);
-          // The inputs' name should be the same.
-          auto &grad_name = inputs[0]->name();
-          for (size_t i = 1; i < inputs.size(); ++i) {
-            PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
-                              "The input name should be the same.");
-          }
-          PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
-          all_reduce_ops.emplace(grad_name, node);
-        }
-      }
-    }
-    return all_reduce_ops;
-  }
-
-  void InsertFusedAllReduce(const std::vector<platform::Place> &places,
-                            const std::vector<Scope *> &local_scopes,
-                            const size_t num_of_all_reduce,
-                            const std::vector<ir::Node *> &all_reduce_ops,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-                            const platform::NCCLCommunicator *multi_nccl_ctxs,
-#endif
-                            ir::Graph *result) const {
-    std::vector<details::VarHandleBase *> inputs;
-    std::vector<details::VarHandleBase *> outputs;
-    for (auto &op : all_reduce_ops) {
-      auto &op_handle = op->Wrapper<details::OpHandleBase>();
-      inputs.insert(inputs.end(), op_handle.Inputs().begin(),
-                    op_handle.Inputs().end());
-      // Remove output
-      for_each(op_handle.Inputs().begin(), op_handle.Inputs().end(),
-               [&op_handle](details::VarHandleBase *var_handle) {
-                 var_handle->RemoveOutput(&op_handle, op_handle.Node());
-               });
-
-      outputs.insert(outputs.end(), op_handle.Outputs().begin(),
-                     op_handle.Outputs().end());
-      // Remove Input
-      for_each(op_handle.Outputs().begin(), op_handle.Outputs().end(),
-               [](details::VarHandleBase *var_handle) {
-                 var_handle->ClearGeneratedOp();
-               });
-
-      result->RemoveNode(op_handle.Node());
-    }
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
-                           local_scopes, multi_nccl_ctxs, result);
-#else
-    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
-                           local_scopes, result);
-#endif
-  }
-
- private:
-  void CreateFusedAllReduceOp(
-      const std::vector<details::VarHandleBase *> &inputs,
-      const std::vector<details::VarHandleBase *> &outputs,
-      const size_t num_of_all_reduce,
-      const std::vector<platform::Place> &places,
-      const std::vector<Scope *> &local_scopes,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      const platform::NCCLCommunicator *multi_nccl_ctxs,
-#endif
-      ir::Graph *result) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto *op_handle = new details::FusedAllReduceOpHandle(
-        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
-        local_scopes, places, num_of_all_reduce, multi_nccl_ctxs);
-#else
-    auto *op_handle = new details::FusedAllReduceOpHandle(
-        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
-        local_scopes, places, num_of_all_reduce);
-#endif
-
-    for (auto in : inputs) {
-      op_handle->AddInput(in);
-    }
-
-    for (auto out : outputs) {
-      op_handle->AddOutput(out);
-    }
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    if (!multi_nccl_ctxs) {
-      SetCommunicationContext(places, op_handle);
-    }
-#else
-    SetCommunicationContext(places, op_handle);
-#endif
-  }
-
-  void SetCommunicationContext(
-      const std::vector<platform::Place> &places,
-      details::FusedAllReduceOpHandle *op_handle) const {
-    for (size_t i = 0; i < places.size(); ++i) {
-      op_handle->SetDeviceContext(
-          places[i], platform::DeviceContextPool::Instance().Get(places[i]));
-    }
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(fuse_all_reduce_op_pass,
-              paddle::framework::ir::FuseAllReduceOpPass)
-    .RequirePassAttr(paddle::framework::details::kNRanks);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc
deleted file mode 100644
index e9b35aefc94e8544455e9559746990cdb4362ebb..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-static bool IsLockAndRecordEventFreeComputationOpHandle(
-    details::ComputationOpHandle *op, const OpGraphView &graph_view) {
-  if (!platform::is_gpu_place(op->GetPlace())) return false;
-  for (auto &pending_op : graph_view.PendingOps(op)) {
-    auto *tmp = dynamic_cast<details::ComputationOpHandle *>(pending_op);
-    if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) {
-      return false;
-    }
-  }
-  return true;
-}
-
-class ModifyOpLockAndRecordEventPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override {
-    auto all_ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*graph);
-    OpGraphView graph_view(all_ops);
-    for (auto &op : all_ops) {
-      auto *compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
-      if (compute_op == nullptr) continue;
-      bool is_lock_and_record_event_free =
-          IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view);
-      compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free);
-      if (is_lock_and_record_event_free) {
-        VLOG(10) << "Set is_lock_and_record_event_free be true in op "
-                 << compute_op->DebugString();
-      }
-    }
-  }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(modify_op_lock_and_record_event_pass,
-              paddle::framework::ir::ModifyOpLockAndRecordEventPass);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc
deleted file mode 100644
index 8cc33a6ceb9f14d6360f03625a83bee23a577c9f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class SSAGraghBuilderWithChecker : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override {
-    PADDLE_ENFORCE(IsValidGraph(graph));
-  }
-
-  bool IsValidGraph(const ir::Graph *graph) const {
-    std::unordered_map<details::OpHandleBase *, size_t> pending_ops;
-    std::unordered_set<details::VarHandleBase *> pending_vars;
-    std::unordered_set<details::VarHandleBase *> ready_vars;
-    std::unordered_set<details::OpHandleBase *> ready_ops;
-
-    auto insert_pending_var = [&](details::VarHandleBase *var) {
-      pending_vars.insert(var);
-      if (var->GeneratedOp() == nullptr) {
-        ready_vars.emplace(var);
-      }
-    };
-
-    for (auto &var_map : graph->Get<details::GraphVars>(details::kGraphVars)) {
-      for (auto &name_pair : var_map) {
-        for (auto &version_pair : name_pair.second) {
-          insert_pending_var(version_pair);
-        }
-      }
-    }
-
-    for (auto &var :
-         graph->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-      insert_pending_var(var);
-    }
-
-    for (auto *op : ir::FilterByNodeWrapper<details::OpHandleBase>(*graph)) {
-      if (op->Inputs().empty()) {
-        ready_ops.insert(op);
-      } else {
-        pending_ops.insert({op, op->NoDupInputSize()});
-      }
-    }
-
-    auto run_all_ops = [&](std::unordered_set<details::OpHandleBase *> &set) {
-      for (auto *op : set) {
-        for (auto out : op->Outputs()) {
-          ready_vars.emplace(out);
-        }
-      }
-      set.clear();
-    };
-
-    while (!pending_vars.empty()) {
-      run_all_ops(ready_ops);
-
-      if (ready_vars.empty()) {
-        return false;
-      }
-
-      for (auto ready_var : ready_vars) {
-        pending_vars.erase(ready_var);
-        for (auto *op : ready_var->PendingOps()) {
-          auto &deps = --pending_ops[op];
-          if (deps == 0) {
-            ready_ops.insert(op);
-          }
-        }
-      }
-      ready_vars.clear();
-    }
-    return true;
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(multi_devices_check_pass,
-              paddle::framework::ir::SSAGraghBuilderWithChecker)
-    .RequireGraphAttr(paddle::framework::details::kGraphVars)
-    .RequireGraphAttr(paddle::framework::details::kGraphDepVars);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
deleted file mode 100644
index 224ab21b4788f99b91e343f06afa55dcb2a69a82..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+++ /dev/null
@@ -1,1118 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
-#include <algorithm>
-#include <fstream>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
-#include "paddle/fluid/framework/details/broadcast_op_handle.h"
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
-#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
-#include "paddle/fluid/framework/details/reduce_op_handle.h"
-#include "paddle/fluid/framework/details/rpc_op_handle.h"
-#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-#if defined(PADDLE_WITH_DGC)
-#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-namespace {
-// TODO(panyx0718): Clean this up as well.
-// all operators. NOTE that even we use a vector here, the operators is
-// unordered.
-typedef std::vector<details::OpHandleBase *> GraphOps;
-const char kGraphOps[] = "ops";
-
-bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) {
-  return boost::get<int>(
-             node.Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-         static_cast<int>(role);
-}
-
-void PolishGraphToSupportDataHazards(ir::Graph *graph) {
-  for (auto &var_map : graph->Get<details::GraphVars>(details::kGraphVars)) {
-    for (auto &name_pair : var_map) {
-      if (name_pair.second.size() <= 1) {
-        continue;
-      }
-      auto it_new = name_pair.second.rbegin();
-      auto it_old = name_pair.second.rbegin();
-      ++it_old;
-      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        details::OpHandleBase *write_op = (*it_new)->GeneratedOp();
-        const auto &read_ops = (*it_old)->PendingOps();
-
-        for (auto *read_op : read_ops) {
-          // Manually add a dependency var from read_op to write_op;
-          if (read_op == write_op) {
-            // Read Write is the same op.
-            continue;
-          }
-          bool has_dep = false;
-          for (auto *r_out : read_op->Outputs()) {
-            for (auto *w_in : write_op->Inputs()) {
-              if (r_out->Node() == w_in->Node()) {
-                has_dep = true;
-                break;
-              }
-            }
-          }
-          if (has_dep) continue;
-
-          auto *dep_var =
-              new details::DummyVarHandle(graph->CreateControlDepVar());
-          read_op->AddOutput(dep_var);
-          write_op->AddInput(dep_var);
-          graph->Get<details::GraphDepVars>(details::kGraphDepVars)
-              .emplace(dep_var);
-        }
-      }
-    }
-  }
-}
-
-details::VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
-                                               const platform::Place &place,
-                                               size_t place_offset) {
-  auto &var_holders =
-      graph->Get<details::GraphVars>(details::kGraphVars)[place_offset];
-  auto &var_holder = var_holders[node->Name()];
-  details::VarHandle *var = nullptr;
-  if (var_holder.empty()) {
-    if (node->Var()) {
-      var = new details::VarHandle(graph->CreateVarNode(node->Var()), 0,
-                                   place_offset, node->Name(), place);
-    } else {
-      var = new details::VarHandle(
-          graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
-          place_offset, node->Name(), place);
-    }
-    var_holder.emplace_back(var);
-  } else {
-    var = *var_holder.rbegin();
-  }
-  return var;
-}
-
-void CreateOpOutput(ir::Graph *graph, details::OpHandleBase *op_handle,
-                    ir::Node *new_node, const platform::Place &place,
-                    size_t place_offset) {
-  auto &vars = graph->Get<details::GraphVars>(
-      details::kGraphVars)[place_offset][new_node->Name()];
-  size_t version = vars.size();
-  auto var = new details::VarHandle(new_node, version, place_offset,
-                                    new_node->Name(), place);
-  vars.emplace_back(var);
-  op_handle->AddOutput(var);
-}
-
-void AddOutputToLeafOps(ir::Graph *graph) {
-  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
-    if (!op->Outputs().empty()) {
-      continue;
-    }
-    auto *dummy_leaf =
-        new details::DummyVarHandle(graph->CreateControlDepVar());
-    graph->Get<details::GraphDepVars>(details::kGraphDepVars)
-        .emplace(dummy_leaf);
-    op->AddOutput(dummy_leaf);
-  }
-}
-}  // namespace
-
-void MultiDevSSAGraphBuilderBase::CheckGraph(const ir::Graph &graph) const {}
-
-void MultiDevSSAGraphBuilderBase::Init() const {
-  all_vars_.clear();
-
-  loss_var_name_ = Get<const std::string>(kLossVarName);
-  VLOG(10) << "Init MultiDevSSAGraphBuilder, loss name: " << loss_var_name_;
-  places_ = Get<const std::vector<platform::Place>>(details::kPlaces);
-  local_scopes_ = Get<const std::vector<Scope *>>(details::kLocalScopes);
-  strategy_ = Get<const details::BuildStrategy>(kStrategy);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  multi_nccl_ctxs_ = &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
-  nccl_ctxs_ = nullptr;
-  if (multi_nccl_ctxs_) {
-    nccl_ctxs_ = multi_nccl_ctxs_->DefaultFlatCtx();
-  }
-#endif
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-}
-
-void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const {
-  Init();
-  CheckGraph(*graph);
-  std::vector<ir::Node *> sorted_ops = SortOperations(*graph);
-
-  auto nodes = graph->ReleaseNodes();
-  ir::Graph &result = *graph;
-
-  for (auto &node : nodes) {
-    if (node->IsVar() && node->Var()) {
-      all_vars_.emplace(node->Name(), node->Var());
-    }
-  }
-
-  // We cannot invoke resize. It is a bug of GCC 4.8
-  result.Set(details::kGraphVars, new details::GraphVars(places_.size()));
-  result.Set(details::kGraphDepVars, new details::GraphDepVars);
-  result.Set(kGraphOps, new GraphOps);
-
-  bool is_forwarding = true;
-
-  for (ir::Node *node : sorted_ops) {
-    if (DealWithSpecialOp(&result, node)) {
-      continue;
-    } else {
-      // This op runs on all devices
-      if (IsScaleLossOp(node)) {
-        // user can customize loss@grad if not use_default_grad_scale_
-        InsertScaleLossGradOp(&result, node);
-        // This assumes the backward generating code will ensure IsScaleLossOp
-        // is true only for the op that scale the final scalar loss.
-        // It also assumes backward op will always follow the forward op in
-        // the block.
-        is_forwarding = false;
-      } else {
-        CreateComputationalOps(&result, node, places_.size());
-      }
-
-      // Insert collective ops if nranks > 1
-      if (!is_forwarding && Get<size_t>(details::kNRanks) > 1) {
-        try {
-          bool is_bk_op =
-              static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
-                                    OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                                static_cast<int>(OpRole::kBackward));
-          // optimize op is already processed in DealWithSpecialOp,
-          // here we only consider backward op
-          if (!is_bk_op) continue;
-
-          /*
-           * the op that will generate the gradient of on parameter will have
-           one attr op_role_var
-           * to record the parameter and gradient, like:
-            attrs {
-              name: "op_role_var"
-              type: STRINGS
-              strings: "fc_1.b_0"
-              strings: "fc_1.b_0@GRAD"
-            }
-           */
-
-          // Currently, we assume that once gradient is generated, it can be
-          // broadcast, and each gradient is only broadcast once.
-          auto backward_vars =
-              boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
-                  OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-          PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-          for (size_t i = 0; i < backward_vars.size(); i += 2) {
-            auto &p_name = backward_vars[i];
-            auto &g_name = backward_vars[i + 1];
-            VLOG(10) << "Bcast " << g_name << " for parameter " << p_name
-                     << " op_type " << node->Op()->Type();
-            if (NeedCollectiveForGrad(g_name, sorted_ops)) {
-              InsertCollectiveOp(&result, p_name, g_name);
-            }
-          }
-        } catch (boost::bad_get e) {
-        }
-      }
-    }
-  }
-
-  InsertPostprocessOps(&result);
-
-  /*
-  Dependency graph has been constructed. However, there are still data
-  hazards need to be handled.
-  */
-  PolishGraphToSupportDataHazards(&result);
-
-  /*
-   * Only variables should be the leaves of graph.
-   */
-  AddOutputToLeafOps(&result);
-
-  result.Erase(kGraphOps);
-}
-
-void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
-    ir::Graph *result, const ir::Node *node) const {
-  // user can customize loss@grad if not use_default_grad_scale_
-  size_t loss_scale = 0;
-  switch (this->strategy_.gradient_scale_) {
-    case details::BuildStrategy::GradientScaleStrategy::kOne:
-      loss_scale = 1;
-      break;
-    case details::BuildStrategy::GradientScaleStrategy::kCoeffNumDevice:
-      loss_scale = Get<size_t>(details::kNRanks);
-      break;
-    case details::BuildStrategy::GradientScaleStrategy::kCustomized:
-      loss_scale = 0;
-      break;
-    default:
-      LOG(FATAL) << "Unknown gradient scale strategy.";
-      break;
-  }
-
-  VLOG(3) << "loss_scale: " << loss_scale;
-
-  if (loss_scale) {
-    // TODO(paddle-dev): Why is there no input for this op_handle?
-    auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
-    auto out_dtype = this->all_vars_.at(loss_grad_name)->GetDataType();
-    this->CreateScaleLossGradOp(result, loss_grad_name, node->outputs[0],
-                                loss_scale, out_dtype);
-  }
-}
-
-bool MultiDevSSAGraphBuilderBase::DealWithSpecialOp(ir::Graph *result,
-                                                    ir::Node *node) const {
-  return false;
-}
-
-std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(
-    const ir::Graph &graph) const {
-  return ir::TopologySortOperations(graph);
-}
-
-bool MultiDevSSAGraphBuilderBase::UseGPU() const {
-  bool use_gpu = false;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  use_gpu = nccl_ctxs_ != nullptr;
-#endif
-  return use_gpu;
-}
-
-bool MultiDevSSAGraphBuilderBase::NeedCollectiveForGrad(
-    const std::string &grad_name, std::vector<ir::Node *> ops) const {
-  // if we have allreduce_op for current gradient variable in the graph,
-  // then we don't need to add allreduce_op_handle for this gradient
-  // NOTE: This is for the case that all gradients should add collective ops
-  for (auto *node : ops) {
-    if (node->Op()->Type() != "allreduce") continue;
-    for (auto in_name : node->Op()->InputArgumentNames()) {
-      if (in_name == grad_name) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,
-                                                    ir::Node *node,
-                                                    size_t place_id) const {
-  auto p = places_[place_id];
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
-  op_handle->SetDeviceContext(p,
-                              platform::DeviceContextPool::Instance().Get(p));
-
-  for (ir::Node *input : node->inputs) {
-    details::VarHandle *var =
-        CreateOrGetLatestVarHandle(result, input, p, place_id);
-    op_handle->AddInput(var);
-  }
-
-  for (ir::Node *output : node->outputs) {
-    ir::Node *new_node = nullptr;
-    if (output->Var()) {
-      new_node = result->CreateVarNode(output->Var());
-    } else {
-      new_node =
-          result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
-    }
-    CreateOpOutput(result, op_handle, new_node, p, place_id);
-  }
-}
-
-void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
-    details::OpHandleBase *op_handle, const platform::Place &p) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  if (nccl_ctxs_ == nullptr) {
-    op_handle->SetDeviceContext(p,
-                                platform::DeviceContextPool::Instance().Get(p));
-  }
-#else
-  op_handle->SetDeviceContext(p,
-                              platform::DeviceContextPool::Instance().Get(p));
-#endif
-}
-
-void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
-                                                    const std::string &p_name,
-                                                    size_t src_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  auto *op_handle = new details::BroadcastOpHandle(
-      result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
-      local_scopes_, places_, nccl_ctxs_);
-#else
-  auto *op_handle = new details::BroadcastOpHandle(
-      result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
-      local_scopes_, places_);
-#endif
-  result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
-
-  auto *in = result->Get<details::GraphVars>(details::kGraphVars)
-                 .at(src_dev_id)
-                 .at(p_name)
-                 .back();
-  op_handle->AddInput(in);
-
-  for (size_t i = 0; i < places_.size(); ++i) {
-    auto &p = places_[i];
-    SetCommunicationContext(op_handle, p);
-    auto &vars =
-        result->Get<details::GraphVars>(details::kGraphVars).at(i).at(p_name);
-    auto *out_var = new details::VarHandle(
-        result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(),
-        i, p_name, p);
-    vars.emplace_back(out_var);
-    op_handle->AddOutput(out_var);
-  }
-}
-
-void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp(
-    ir::Graph *result,
-    const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  auto *op_handle = new details::FusedBroadcastOpHandle(
-      result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
-      local_scopes_, places_, nccl_ctxs_);
-#else
-  auto *op_handle = new details::FusedBroadcastOpHandle(
-      result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
-      local_scopes_, places_);
-#endif
-  result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
-
-  for (size_t i = 0; i < places_.size(); ++i) {
-    auto &p = places_[i];
-    SetCommunicationContext(op_handle, p);
-  }
-
-  for (size_t dev_id = 0; dev_id < bcast_varnames.size(); ++dev_id) {
-    for (auto &p_name : bcast_varnames[dev_id]) {
-      auto *in = result->Get<details::GraphVars>(details::kGraphVars)
-                     .at(dev_id)
-                     .at(p_name)
-                     .back();
-      op_handle->AddInput(in);
-      for (size_t out_dev_id = 0; out_dev_id < places_.size(); ++out_dev_id) {
-        auto &p = places_[out_dev_id];
-        auto &vars = result->Get<details::GraphVars>(details::kGraphVars)
-                         .at(out_dev_id)
-                         .at(p_name);
-        auto *out_var = new details::VarHandle(
-            result->CreateEmptyNode(p_name, ir::Node::Type::kVariable),
-            vars.size(), out_dev_id, p_name, p);
-        vars.emplace_back(out_var);
-        op_handle->AddOutput(out_var);
-      }
-    }
-  }
-}
-
-void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
-                                                        ir::Node *node,
-                                                        size_t dev_id) const {
-  result->Get<GraphOps>(kGraphOps).emplace_back(
-      new details::ComputationOpHandle(result->CreateOpNode(node->Op()),
-                                       local_scopes_[dev_id], places_[dev_id],
-                                       dev_id));
-  CreateOpHandleIOs(result, node, dev_id);
-}
-
-void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
-                                                    const std::string &og,
-                                                    bool is_encoded) const {
-  details::OpHandleBase *op_handle = nullptr;
-
-  auto append_allreduce_op = [&](
-      const std::vector<Scope *> &scopes,
-      const std::vector<platform::Place> &places) -> details::OpHandleBase * {
-#if defined(PADDLE_WITH_DGC)
-    if (is_encoded) {
-      result->Get<GraphOps>(kGraphOps).emplace_back(
-          new details::SparseAllReduceOpHandle(
-              result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-              scopes, places, multi_nccl_ctxs_, is_encoded,
-              static_cast<int>(strategy_.trainers_endpoints_.size()) *
-                  places_.size()));
-    } else {
-      result->Get<GraphOps>(kGraphOps).emplace_back(
-          new details::AllReduceOpHandle(
-              result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-              scopes, places, multi_nccl_ctxs_));
-    }
-#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    result->Get<GraphOps>(kGraphOps).emplace_back(
-        new details::AllReduceOpHandle(
-            result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-            scopes, places, multi_nccl_ctxs_));
-#else
-    result->Get<GraphOps>(kGraphOps).emplace_back(
-        new details::AllReduceOpHandle(
-            result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-            scopes, places));
-#endif
-    return result->Get<GraphOps>(kGraphOps).back();
-  };
-
-  if (!strategy_.enable_parallel_graph_)
-    op_handle = append_allreduce_op(local_scopes_, places_);
-
-  for (size_t i = 0; i < places_.size(); ++i) {
-    if (strategy_.enable_parallel_graph_) {
-      op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]});
-    }
-
-    SetCommunicationContext(op_handle, places_[i]);
-    auto &vars = result->Get<details::GraphVars>(details::kGraphVars)[i][og];
-    PADDLE_ENFORCE(!vars.empty());
-    auto &prev_grad = vars.back();
-    op_handle->AddInput(prev_grad);
-    VLOG(10) << "all_reduce_op_handle add input " << prev_grad->DebugString();
-
-    auto var = new details::VarHandle(
-        result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(), i,
-        og, places_[i]);
-    vars.emplace_back(var);
-    op_handle->AddOutput(var);
-    VLOG(10) << "all_reduce_op_handle add output " << og
-             << ", handle:" << var->DebugString();
-  }
-}
-
-void MultiDevSSAGraphBuilderBase::CreateScaleLossGradOp(
-    ir::Graph *result, const std::string &loss_grad_name,
-    ir::Node *out_var_node, size_t loss_scale,
-    proto::VarType::Type dtype) const {
-  for (size_t i = 0; i < places_.size(); ++i) {
-    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
-    auto *op_handle = new details::ScaleLossGradOpHandle(
-        result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
-        loss_scale, local_scopes_[i], places_[i], dev_ctx, dtype);
-    result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
-
-    // FIXME: Currently ScaleLossGradOp only use device_count as scale
-    // factor. So it does not depend on any other operators.
-    // VarHandle *loss = GetVarHandle(loss_var_name, place);
-    // loss->pending_ops_.emplace_back(op_handle);
-    // op_handle->inputs_.emplace_back(loss);
-
-    CreateOpOutput(result, op_handle,
-                   result->CreateVarNode(out_var_node->Var()), places_[i], i);
-  }
-}
-
-void MultiDevSSAGraphBuilderBase::CreateComputationalOps(
-    ir::Graph *result, ir::Node *node, size_t num_places) const {
-  for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
-    auto p = places_[scope_idx];
-    auto s = local_scopes_[scope_idx];
-    result->Get<GraphOps>(kGraphOps).emplace_back(
-        new details::ComputationOpHandle(result->CreateOpNode(node->Op()), s, p,
-                                         scope_idx));
-    CreateOpHandleIOs(result, node, scope_idx);
-  }
-}
-
-details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(
-    ir::Graph *result, const std::string &og, size_t dst_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  result->Get<GraphOps>(kGraphOps).emplace_back(new details::ReduceOpHandle(
-      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
-      local_scopes_, places_, nccl_ctxs_));
-#else
-  result->Get<GraphOps>(kGraphOps).emplace_back(new details::ReduceOpHandle(
-      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
-      local_scopes_, places_));
-#endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
-
-  for (size_t i = 0; i < places_.size(); ++i) {
-    auto &p = places_[i];
-    SetCommunicationContext(op_handle, p);
-    auto &vars = result->Get<details::GraphVars>(details::kGraphVars)[i][og];
-    PADDLE_ENFORCE(!vars.empty());
-    auto &prev_grad = vars.back();
-    op_handle->AddInput(prev_grad);
-  }
-  auto &vars =
-      result->Get<details::GraphVars>(details::kGraphVars)[dst_dev_id][og];
-  auto var = new details::VarHandle(
-      result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(),
-      dst_dev_id, og, places_[dst_dev_id]);
-  vars.emplace_back(var);
-  op_handle->AddOutput(var);
-  return var;
-}
-
-bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const {
-  return !loss_var_name_.empty() && node->Op() &&
-         boost::get<int>(
-             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-             (static_cast<int>(OpRole::kBackward) |
-              static_cast<int>(OpRole::kLoss));
-}
-
-bool MultiDevSSAGraphBuilderBase::IsSparseGradient(
-    const std::string &og) const {
-  PADDLE_ENFORCE(all_vars_.count(og) != 0);
-  return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS;
-}
-
-void AllReduceSSAGraphBuilder::InsertCollectiveOp(
-    ir::Graph *result, const std::string &p_name,
-    const std::string &g_name) const {
-  if (IsSparseGradient(g_name)) {
-    CreateReduceOp(result, g_name, 0);
-    CreateBroadcastOp(result, g_name, 0);
-  } else {
-#if defined(PADDLE_WITH_DGC)
-    CreateAllReduceOp(result, g_name, IsEncoded(p_name));
-#else
-    CreateAllReduceOp(result, g_name);
-#endif
-  }
-}
-
-int BalanceVarSSAGraphBuilder::GetVarDeviceID(
-    const std::string &varname) const {
-  auto got = sharded_var_device_.find(varname);
-  if (got == sharded_var_device_.end()) {
-    auto pos = varname.find(framework::kNewGradSuffix);
-    if (pos != std::string::npos) {
-      got = sharded_var_device_.find(varname.substr(0, pos));
-    }
-  }
-  return got == sharded_var_device_.end() ? -1 : got->second;
-}
-
-int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
-  if (strategy_.reduce_ != details::BuildStrategy::ReduceStrategy::kReduce) {
-    return -1;
-  }
-  if (!OpHaveRole(*node, framework::OpRole::kOptimize)) {
-    return -1;
-  }
-  auto param_grad = boost::get<std::vector<std::string>>(
-      node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-
-  PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
-  int dev_id = GetVarDeviceID(param_grad[1]);
-  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
-                    node->Op()->Type(), param_grad[0], param_grad[1]);
-  return dev_id;
-}
-
-size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID(
-    const std::vector<std::string> &var_names) const {
-  int64_t numel_sum = 0;
-  for (auto var_name : var_names) {
-    if (all_vars_.find(var_name) == all_vars_.end()) continue;
-    auto var_desc = all_vars_.at(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var_desc);
-    auto dim = framework::make_ddim(var_desc->GetShape());
-    int64_t numel = framework::product(dim);
-    PADDLE_ENFORCE_GT(numel, 0);
-    numel_sum += numel;
-  }
-
-  auto smallest =
-      std::min_element(std::begin(balance_vars_), std::end(balance_vars_));
-  size_t dev_id =
-      static_cast<size_t>(std::distance(std::begin(balance_vars_), smallest));
-  balance_vars_[dev_id] += numel_sum;
-  return dev_id;
-}
-
-void BalanceVarSSAGraphBuilder::ResetState() const {
-  balance_vars_.clear();
-  sharded_var_device_.clear();
-
-  balance_vars_.resize(places_.size(), 0);
-}
-
-void ReduceSSAGraphBuilder::Init() const {
-  MultiDevSSAGraphBuilderBase::Init();
-  ResetState();
-}
-
-void ReduceSSAGraphBuilder::ResetState() const {
-  BalanceVarSSAGraphBuilder::ResetState();
-  bcast_var_name_set_.clear();
-  bcast_var_name_set_.resize(places_.size());
-}
-
-void ReduceSSAGraphBuilder::InsertCollectiveOp(
-    ir::Graph *result, const std::string &p_name,
-    const std::string &g_name) const {
-  size_t cur_device_id = GetAppropriateDeviceID({g_name});
-  CreateReduceOp(result, g_name, cur_device_id);
-  sharded_var_device_.emplace(g_name, cur_device_id);
-  bcast_var_name_set_[cur_device_id].emplace(p_name);
-}
-
-bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
-                                              ir::Node *node) const {
-  int op_dev_id = BalanceVarSSAGraphBuilder::GetOpDeviceID(node);
-  if (op_dev_id != -1) {
-    // This op only runs on one specific device.
-    CreateComputationalOp(result, node, op_dev_id);
-    for (ir::Node *n : node->outputs) {
-      sharded_var_device_.emplace(n->Name(), op_dev_id);
-    }
-    return true;
-  }
-  return false;
-}
-
-void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
-  if (UseGPU()) {
-    if (strategy_.fuse_broadcast_ops_ == true) {
-      CreateFusedBroadcastOp(result, bcast_var_name_set_);
-    } else {
-      for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
-        auto &to_bcast_set = bcast_var_name_set_[dev_id];
-        for (auto &bcast_name : to_bcast_set) {
-          CreateBroadcastOp(result, bcast_name, dev_id);
-        }
-      }
-    }
-  }
-}
-
-int ReduceSSAGraphBuilder::GetOpDeviceID(
-    ir::Node *node,
-    std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops) const {
-  if (!OpHaveRole(*node, framework::OpRole::kOptimize)) {
-    return -1;
-  }
-
-  auto param_grad = boost::get<std::vector<std::string>>(
-      node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-
-  PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
-  int dev_id = GetVarDeviceID(param_grad[1]);
-
-  if (dev_id == -1) {
-    (*delay_ops)[param_grad[1]].push_back(node);
-    return -2;
-  }
-  return dev_id;
-}
-
-std::vector<ir::Node *> ReduceSSAGraphBuilder::SortOperations(
-    const ir::Graph &graph) const {
-  std::vector<ir::Node *> sorted_ops = ir::TopologySortOperations(graph);
-  return SortForReduceMode(sorted_ops);
-}
-
-std::vector<ir::Node *> ReduceSSAGraphBuilder::SortForReduceMode(
-    const std::vector<ir::Node *> &topo_ops) const {
-  std::vector<ir::Node *> sorted_ops;
-  std::unordered_map<std::string, std::vector<ir::Node *>> delayed_op;
-  sorted_ops.reserve(topo_ops.size());
-  ResetState();
-
-  auto insert_delayed_op = [&](const std::string &var_name, int dev_id) {
-    sharded_var_device_.emplace(var_name, dev_id);
-    if (delayed_op.count(var_name)) {
-      auto &ops = delayed_op.at(var_name);
-      sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end());
-      delayed_op.at(var_name).clear();
-    }
-  };
-
-  for (ir::Node *node : topo_ops) {
-    int op_dev_id = GetOpDeviceID(node, &delayed_op);
-    if (op_dev_id > -1) {
-      // This op only runs on one specific device.
-      sorted_ops.emplace_back(node);
-      for (ir::Node *n : node->outputs) {
-        insert_delayed_op(n->Name(), op_dev_id);
-      }
-    } else if (op_dev_id == -1) {
-      // This op runs on all devices, and its output may have parameter's
-      // gradients.
-      sorted_ops.emplace_back(node);
-      bool is_bk_op =
-          static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
-                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                            static_cast<int>(OpRole::kBackward));
-      if (!is_bk_op) continue;
-      // Currently, we assume that once gradient is generated, it can be
-      // broadcast, and each gradient is only broadcast once.
-      std::vector<std::string> backward_vars;
-      try {
-        backward_vars =
-            boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
-                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      } catch (boost::bad_get e) {
-      }
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-
-      for (size_t i = 0; i < backward_vars.size(); i += 2) {
-        auto &g_name = backward_vars[i + 1];
-        size_t cur_device_id = GetAppropriateDeviceID({g_name});
-        insert_delayed_op(g_name, static_cast<int>(cur_device_id));
-      }
-    } else if (op_dev_id == -2) {
-      // The Op on which the Op depends has not yet been generated.
-    }
-  }
-
-  PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size());
-
-  ResetState();
-  return sorted_ops;
-}
-
-void DistSSAGraphBuilder::Init() const {
-  MultiDevSSAGraphBuilderBase::Init();
-  ResetState();
-}
-
-void DistSSAGraphBuilder::ResetState() const {
-  BalanceVarSSAGraphBuilder::ResetState();
-  bcast_var_name_set_.clear();
-  bcast_var_name_set_.resize(places_.size());
-}
-
-bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
-                                            ir::Node *node) const {
-  bool insert_op = false;
-  if (OpHaveRole(*node, OpRole::kRPC)) {
-    int op_dev_id = CreateRPCOp(result, node);
-    PADDLE_ENFORCE(op_dev_id != -1,
-                   "Can not schedule the RPC operator to the right place.");
-    if (node->Op()->Type() == "recv") {
-      auto recv_vars_attr =
-          boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
-              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE(recv_vars_attr.size() == 2UL);  // [parameter, gradient]
-      if (recv_vars_attr[0].find(".block") == std::string::npos) {
-        bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]);
-      }
-    }
-    insert_op = true;
-    need_broadcast_var_ = true;
-  } else if (OpHaveRole(*node, OpRole::kDist)) {
-    int op_dev_id = CreateDistTrainOp(result, node);
-    if (node->Op()->Type() == "concat") {
-      // the input(block of parameter) of concat is on different device,
-      // the output(parameter) will on one device.
-      auto origin_param_name = node->Op()->OutputArgumentNames()[0];
-      bcast_var_name_set_[op_dev_id].emplace(origin_param_name);
-    }
-    insert_op = true;
-  } else {
-    int op_dev_id = GetOpDeviceID(node);
-    if (op_dev_id != -1) {  // This op only runs on one specific device.
-      // optimize op will be processed here.
-      CreateComputationalOp(result, node, op_dev_id);
-      for (ir::Node *n : node->outputs) {
-        sharded_var_device_.emplace(n->Name(), op_dev_id);
-      }
-      insert_op = true;
-    }
-  }
-  return insert_op;
-}
-
-void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
-  for (ir::Node *input : node->inputs) {
-    details::VarHandle *var = nullptr;
-    for (int place_offset = 0; place_offset < num_places; ++place_offset) {
-      auto &var_holders =
-          result->Get<details::GraphVars>(details::kGraphVars)[place_offset];
-      auto &var_holder = var_holders[input->Name()];
-      if (!var_holder.empty()) {
-        var = *var_holder.rbegin();
-        op_handle->AddInput(var);
-      }
-    }
-  }
-}
-
-// Create RPC related op handles that connects its in ops and out ops.
-int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
-  int op_dev_id = -1;
-  if (node->Op()->Type() == "send") {
-    // TODO(paddle-dev): getting the first var is not safe.
-    op_dev_id = GetVarDeviceID(node->inputs[0]->Name());
-    PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]),
-                   "This hack no longer holds, please fix.");
-    // the variable name which contains .block means it was splited by
-    // split_byref op
-    if (strategy_.reduce_ ==
-            details::BuildStrategy::ReduceStrategy::kAllReduce &&
-        node->inputs[0]->Name().find(".block") == std::string::npos) {
-      std::vector<std::string> input_var_names;
-      for (ir::Node *n : node->inputs) {
-        input_var_names.push_back(n->Name());
-      }
-      auto send_param_grad = boost::get<std::vector<std::string>>(
-          node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
-      op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
-      VLOG(10) << "send grad " << input_var_names[0] << " origin "
-               << send_param_grad[1] << " place: " << op_dev_id;
-      for (auto &varname : input_var_names) {
-        sharded_var_device_.emplace(varname, op_dev_id);
-      }
-      sharded_var_device_.emplace(send_param_grad[1], op_dev_id);
-    }
-  } else if (node->Op()->Type() == "recv") {
-    std::vector<std::string> output_var_names;
-    for (ir::Node *n : node->outputs) {
-      output_var_names.push_back(n->Name());
-    }
-    auto recv_param_grad = boost::get<std::vector<std::string>>(
-        node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-    if (recv_param_grad.size() == 2U) {
-      op_dev_id = GetVarDeviceID(recv_param_grad[1]);
-      VLOG(10) << "recv param " << recv_param_grad[0]
-               << " get grad place: " << recv_param_grad[1]
-               << " place: " << op_dev_id;
-    } else {
-      op_dev_id = GetAppropriateDeviceID(output_var_names);
-    }
-    for (auto &varname : output_var_names) {
-      sharded_var_device_.emplace(varname, op_dev_id);
-    }
-  } else {
-    // send_barrier, fetch_barrier will run on place 0;
-    op_dev_id = 0;
-  }
-
-  PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
-                 node->Op()->Type());
-
-  // Create fetch_barrier op handle to enable output on all devices.
-  // **NOTE** fetch_barrier should output variables list same as recv op does.
-  if (node->Op()->Type() == "fetch_barrier") {
-    result->Get<GraphOps>(kGraphOps).emplace_back(
-        new details::FetchBarrierOpHandle(result->CreateOpNode(node->Op()),
-                                          local_scopes_, places_));
-  } else {
-    result->Get<GraphOps>(kGraphOps).emplace_back(new details::RPCOpHandle(
-        result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
-        node->Op()->Type(), places_[op_dev_id]));
-  }
-
-  if (node->Op()->Type() == "send") {
-    CreateOpHandleIOs(result, node, op_dev_id);
-  } else {
-    // send_barrier, recv, fetch_barrier's inputs are deps var, get them from
-    // all places
-    auto p = places_[op_dev_id];
-    auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
-    op_handle->SetDeviceContext(p,
-                                platform::DeviceContextPool::Instance().Get(p));
-
-    SetOpInputsAllPlaces(result, node, places_.size());
-    for (ir::Node *output : node->outputs) {
-      int outvar_dev_id = op_dev_id;
-      if (node->Op()->Type() == "fetch_barrier") {
-        outvar_dev_id = GetVarDeviceID(output->Name());
-        PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name());
-      }
-      p = places_[outvar_dev_id];
-      ir::Node *new_node = nullptr;
-      if (output->Var()) {
-        new_node = result->CreateVarNode(output->Var());
-      } else {
-        new_node =
-            result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
-      }
-      CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
-    }
-  }
-  return op_dev_id;
-}
-
-int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
-                                           ir::Node *node) const {
-  int op_dev_id = -1;
-  std::vector<std::string> input_var_names;
-  std::vector<std::string> output_var_names;
-  for (ir::Node *input : node->inputs) {
-    input_var_names.push_back(input->Name());
-  }
-  for (ir::Node *output : node->outputs) {
-    output_var_names.push_back(output->Name());
-  }
-
-  if (node->Op()->Type() == "split_byref" ||
-      node->Op()->Type() == "split_selected_rows" ||
-      node->Op()->Type() == "split_ids") {
-    // TODO(paddle-dev): getting the first var is not safe.
-    op_dev_id = GetVarDeviceID(input_var_names[0]);
-    if (strategy_.reduce_ ==
-        details::BuildStrategy::ReduceStrategy::kAllReduce) {
-      op_dev_id = GetAppropriateDeviceID(input_var_names);
-      for (auto &varname : input_var_names) {
-        sharded_var_device_.emplace(varname, op_dev_id);
-      }
-    }
-    for (auto &varname : output_var_names) {
-      sharded_var_device_.emplace(varname, op_dev_id);
-    }
-  } else if (node->Op()->Type() == "concat") {
-    op_dev_id = GetVarDeviceID(input_var_names[0]);
-    for (auto &varname : output_var_names) {
-      sharded_var_device_.emplace(varname, op_dev_id);
-    }
-  } else {
-    LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
-    PADDLE_THROW(
-        "the distribute training related op should be in [split_byref, "
-        "concat].");
-  }
-
-  PADDLE_ENFORCE(op_dev_id != -1,
-                 "can not find right place for distributed op: %s",
-                 node->Op()->Type());
-
-  CreateComputationalOp(result, node, op_dev_id);
-  return op_dev_id;
-}
-
-#if defined(PADDLE_WITH_DGC)
-bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
-  auto u_name = p_name + details::g_dgc_u;
-  auto it = all_vars_.find(u_name);
-  if (it == all_vars_.end()) {
-    VLOG(10) << "can't find u_name, so it's not encoded:" << u_name;
-    return false;
-  }
-
-  return true;
-}
-#else
-bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
-  return false;
-}
-#endif
-
-void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
-                                             const std::string &p_name,
-                                             const std::string &g_name) const {
-  // collective gradient to each device
-  size_t cur_device_id = 0;
-  switch (strategy_.reduce_) {
-    case details::BuildStrategy::ReduceStrategy::kReduce:
-      cur_device_id = GetAppropriateDeviceID({g_name});
-      CreateReduceOp(result, g_name, cur_device_id);
-      sharded_var_device_.emplace(g_name, cur_device_id);
-      break;
-    case details::BuildStrategy::ReduceStrategy::kAllReduce:
-      if (IsSparseGradient(g_name)) {
-        CreateReduceOp(result, g_name, 0);
-        CreateBroadcastOp(result, g_name, 0);
-      } else {
-        CreateAllReduceOp(result, g_name);
-      }
-      break;
-    default:
-      LOG(FATAL) << "Unknown reduce strategy.";
-      break;
-  }
-}
-
-void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
-  // broad cast received parameters when training in parameter server mode.
-  if (need_broadcast_var_) {
-    // There are 4 conditions:
-    // 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS.
-    // Need to broadcast received parameters to other GPU.
-    // 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to
-    // broadcast received parameters to other GPU.
-    // 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to
-    // broadcast received parameters to other scope.
-    // 4. CPU && Reduce: because all parameters share the same memory, did not
-    // broadcast received parameters.
-    if (!UseGPU() &&
-        strategy_.reduce_ == details::BuildStrategy::ReduceStrategy::kReduce) {
-      return;
-    }
-    if (strategy_.fuse_broadcast_ops_ == true) {
-      CreateFusedBroadcastOp(result, bcast_var_name_set_);
-    } else {
-      for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
-        auto &to_bcast_set = bcast_var_name_set_[dev_id];
-        for (auto &bcast_name : to_bcast_set) {
-          CreateBroadcastOp(result, bcast_name, dev_id);
-        }
-      }
-    }
-  }
-}
-
-std::unordered_set<std::string> &MultiDevSSAGraphBuilder() {
-  static std::unordered_set<std::string> regs;
-  return regs;
-}
-
-static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
-  MultiDevSSAGraphBuilder().insert(builder_mode);
-  return 0;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-#define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class)                \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      _reg_ssa_graph_builder_##pass_name,                                 \
-      "REGISTER_MULTI_DEVICES_PASS must be called in global namespace."); \
-  int _reg_ssa_graph_builder_entry_##pass_name =                          \
-      paddle::framework::ir::MultiDevSSAGraphBuilderRegister(#pass_name); \
-  REGISTER_PASS(pass_name, pass_class)                                    \
-      .RequirePassAttr(paddle::framework::ir::kLossVarName)               \
-      .RequirePassAttr(paddle::framework::details::kPlaces)               \
-      .RequirePassAttr(paddle::framework::details::kLocalScopes)          \
-      .RequirePassAttr(paddle::framework::ir::kStrategy)                  \
-      .RequirePassAttr(paddle::framework::details::kNRanks)
-
-REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass,
-                            paddle::framework::ir::ReduceSSAGraphBuilder);
-REGISTER_MULTI_DEVICES_PASS(all_reduce_mode_multi_devices_pass,
-                            paddle::framework::ir::AllReduceSSAGraphBuilder);
-REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass,
-                            paddle::framework::ir::DistSSAGraphBuilder);
-REGISTER_MULTI_DEVICES_PASS(async_multi_devices_pass,
-                            paddle::framework::ir::AsyncSSAGraphBuilder);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
deleted file mode 100644
index ea0455b6a8b16f1bf3370fc75bbf3b7b7f7545a8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ /dev/null
@@ -1,207 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/build_strategy.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-
-namespace paddle {
-namespace platform {
-class NCCLContextMap;
-}
-
-namespace framework {
-class Scope;
-namespace ir {
-
-constexpr char kLossVarName[] = "loss_var_name";
-constexpr char kStrategy[] = "strategy";
-
-class MultiDevSSAGraphBuilderBase : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override;
-
-  virtual void Init() const;
-
-  virtual void CheckGraph(const ir::Graph &graph) const;
-
-  virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
-
-  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
-                                  const std::string &g_name) const = 0;
-
-  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
-
-  virtual void InsertPostprocessOps(ir::Graph *result) const = 0;
-
-  bool UseGPU() const;
-
-  virtual bool NeedCollectiveForGrad(const std::string &grad_name,
-                                     std::vector<ir::Node *> ops) const;
-
-  bool IsScaleLossOp(ir::Node *node) const;
-
-  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
-                              size_t num_places) const;
-
-  void CreateScaleLossGradOp(ir::Graph *result,
-                             const std::string &loss_grad_name,
-                             ir::Node *out_var_node, size_t loss_scale,
-                             proto::VarType::Type dtype) const;
-
-  details::VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
-                                     size_t dst_dev_id) const;
-
-  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
-                             size_t dev_id) const;
-
-  bool IsSparseGradient(const std::string &og) const;
-
-  void CreateAllReduceOp(ir::Graph *result, const std::string &og,
-                         bool is_encoded = false) const;
-
-  void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
-                         size_t src_dev_id) const;
-
-  void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const;
-
-  void CreateFusedBroadcastOp(
-      ir::Graph *result,
-      const std::vector<std::unordered_set<std::string>> &bcast_varnames) const;
-
-  void SetCommunicationContext(details::OpHandleBase *op_handle,
-                               const platform::Place &p) const;
-
-  void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
-                         size_t device_id) const;
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  mutable platform::NCCLContextMap *nccl_ctxs_{nullptr};
-  mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr};
-#endif
-
-  mutable std::string loss_var_name_;
-  mutable std::vector<platform::Place> places_;
-  mutable std::vector<Scope *> local_scopes_;
-
-  mutable details::BuildStrategy strategy_;
-  mutable std::unordered_map<std::string, VarDesc *> all_vars_;
-};
-
-class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
- protected:
-  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
-                                  const std::string &g_name) const;
-
-  virtual void InsertPostprocessOps(ir::Graph *result) const {}
-
-  bool IsEncoded(const std::string &p_name) const;
-};
-
-class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
- protected:
-  void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
-                          const std::string &g_name) const override {}
-
-  bool NeedCollectiveForGrad(const std::string &grad_name,
-                             std::vector<ir::Node *> ops) const override {
-    return false;
-  }
-
-  bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override {
-    if (node->Op()->Type() == "recv") {
-      VLOG(1) << "set recv op do_not_run to true";
-      node->Op()->SetAttr("do_not_run", 1);
-      node->Op()->Flush();
-    }
-    return false;
-  }
-
-  void InsertPostprocessOps(ir::Graph *result) const override {}
-};
-
-class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
- protected:
-  int GetVarDeviceID(const std::string &varname) const;
-
-  int GetOpDeviceID(ir::Node *node) const;
-
-  size_t GetAppropriateDeviceID(
-      const std::vector<std::string> &var_names) const;
-
-  virtual void ResetState() const;
-
-  mutable std::unordered_map<std::string, int> sharded_var_device_;
-  mutable std::vector<int64_t> balance_vars_;
-};
-
-class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
- protected:
-  virtual void Init() const;
-
-  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
-                                  const std::string &g_name) const;
-
-  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
-
-  virtual void InsertPostprocessOps(ir::Graph *result) const;
-
-  virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
-
-  virtual void ResetState() const;
-
-  int GetOpDeviceID(ir::Node *node,
-                    std::unordered_map<std::string, std::vector<ir::Node *>>
-                        *delay_ops) const;
-
-  std::vector<ir::Node *> SortForReduceMode(
-      const std::vector<ir::Node *> &topo_ops) const;
-
-  mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
-};
-
-class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
- protected:
-  virtual void Init() const;
-
-  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
-
-  virtual void InsertPostprocessOps(ir::Graph *result) const;
-
-  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
-                                  const std::string &g_name) const;
-
-  virtual void ResetState() const;
-
-  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
-
-  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
-
-  mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
-  mutable bool need_broadcast_var_{false};
-};
-
-std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc
deleted file mode 100644
index efd549e79d0ef2ff31a3d1253201f1c2656adf84..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_printer.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class SSAGraghBuilderWithPrinterPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override {
-    std::unique_ptr<std::ostream> fout(
-        new std::ofstream(Get<std::string>(kGraphvizPath)));
-    PADDLE_ENFORCE(fout->good());
-    if (Has("graph_printer")) {
-      Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
-    } else {
-      GraphvizSSAGraphPrinter printer;
-      printer.Print(*graph, *fout);
-    }
-  }
-};
-
-template <typename Callback>
-static inline void IterAllVar(const ir::Graph &graph, Callback callback) {
-  for (auto &each : graph.Get<details::GraphVars>(details::kGraphVars)) {
-    for (auto &pair1 : each) {
-      for (auto &pair2 : pair1.second) {
-        callback(*pair2);
-      }
-    }
-  }
-
-  for (auto &var : graph.Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    callback(*var);
-  }
-}
-
-void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
-                                    std::ostream &sout) const {
-  size_t var_id = 0;
-  std::unordered_map<const details::VarHandleBase *, size_t> vars;
-
-  sout << "digraph G {\n";
-
-  IterAllVar(graph, [&](const details::VarHandleBase &var) {
-    auto *var_ptr = &var;
-    auto *var_handle_ptr = dynamic_cast<const details::VarHandle *>(var_ptr);
-    auto *dummy_ptr = dynamic_cast<const details::DummyVarHandle *>(var_ptr);
-
-    size_t cur_var_id = var_id++;
-    vars[var_ptr] = cur_var_id;
-
-    if (var_handle_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name()
-           << "\\n"
-           << var_handle_ptr->place() << "\\n"
-           << "scope: " << var_handle_ptr->scope_idx() << "\\n"
-           << "v" << var_handle_ptr->version() << "\"]" << std::endl;
-    } else if (dummy_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
-    }
-  });
-
-  size_t op_id = 0;
-  for (auto &op : ir::FilterByNodeWrapper<details::OpHandleBase>(graph)) {
-    std::string op_name = "op_" + std::to_string(op_id++);
-    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
-         << std::endl;
-    for (auto in : op->Inputs()) {
-      std::string var_name = "var_" + std::to_string(vars[in]);
-      sout << var_name << " -> " << op_name << std::endl;
-    }
-
-    for (auto out : op->Outputs()) {
-      std::string var_name = "var_" + std::to_string(vars[out]);
-      sout << op_name << " -> " << var_name << std::endl;
-    }
-  }
-
-  sout << "}\n";
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(multi_devices_print_pass,
-              paddle::framework::ir::SSAGraghBuilderWithPrinterPass)
-    .RequirePassAttr(paddle::framework::ir::kGraphvizPath);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
deleted file mode 100644
index 7de3b7c6054183d9a9cb80e66bee571f29ed68eb..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
-  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
-         op1->Outputs() == op2->Outputs();
-}
-
-class SequentialExecutionPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override {
-    // FIXME(zjl): Insert dependencies between some distributed ops may cause
-    // the multi_devices_graph_pass fails. So we skip these ops here.
-    // Indeed, maybe we should not insert dependencies between these ops
-    // casually, which may cause deadlock easily.
-    // We should add more skipped distributed ops when found errors in
-    // multi_devices_graph_pass
-    static std::unordered_set<std::string> skip_dist_ops{
-        "send", "recv", "send_barrier", "fetch_barrier"};
-
-    auto &ops =
-        graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);
-    std::vector<ir::Node *> op_node_list;
-    op_node_list.reserve(ops.size());
-
-    std::unordered_map<ir::Node *, size_t> op_deps;
-    std::unordered_map<ir::Node *, std::unordered_set<ir::Node *>> pending_ops;
-    std::unordered_set<ir::Node *> ready_ops;
-
-    for (ir::Node *node : graph->Nodes()) {
-      if (!node->IsOp()) continue;
-      std::unordered_set<ir::Node *> preceding_ops;
-      for (auto *in : node->inputs) {
-        PADDLE_ENFORCE(in->IsVar(),
-                       "Preceding Node of Op Nodes must be Var Node");
-        if (in->inputs.empty()) continue;
-        PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp(),
-                       "Preceding Op Node of Var Node must be unique");
-        preceding_ops.insert(in->inputs[0]);
-        pending_ops[in->inputs[0]].insert(node);
-      }
-      op_deps[node] = preceding_ops.size();
-      if (preceding_ops.empty()) {
-        ready_ops.insert(node);
-      }
-    }
-
-    for (auto *op_desc : ops) {
-      ir::Node *found_node = nullptr;
-      for (auto *node : ready_ops) {
-        if (IsSameOpDesc(op_desc, node->Op())) {
-          PADDLE_ENFORCE(found_node == nullptr,
-                         "Found multiple op_desc in graph: %s",
-                         op_desc->Type());
-          found_node = node;
-        }
-      }
-
-      PADDLE_ENFORCE_NOT_NULL(found_node, "Cannot find op_desc in graph: %s",
-                              op_desc->Type());
-      for (auto *pending_op : pending_ops[found_node]) {
-        if (--op_deps.at(pending_op) == 0) {
-          ready_ops.insert(pending_op);
-        }
-      }
-      ready_ops.erase(found_node);
-      if (skip_dist_ops.count(op_desc->Type()) == 0) {
-        op_node_list.push_back(found_node);
-      }
-    }
-
-    for (size_t i = 1; i < op_node_list.size(); ++i) {
-      auto *dep_var = graph->CreateControlDepVar();
-      op_node_list[i]->inputs.push_back(dep_var);
-      op_node_list[i - 1]->outputs.push_back(dep_var);
-      dep_var->outputs.push_back(op_node_list[i]);
-      dep_var->inputs.push_back(op_node_list[i - 1]);
-      VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
-               << " and " << op_node_list[i]->Name();
-    }
-  }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(sequential_execution_pass,
-              paddle::framework::ir::SequentialExecutionPass)
-    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
diff --git a/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc b/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc
deleted file mode 100644
index 6198fab7dcaf7cce229532e50c34e516c1697ba4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <set>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/ngraph_subgraph_pass.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
-#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-namespace ANAT = paddle::inference::analysis;
-
-std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
-                              const std::set<std::string> &engine_outputs,
-                              const std::string &size) {
-  std::string engine_hash_key = "";
-  for (auto name : engine_inputs) {
-    engine_hash_key += name;
-  }
-  for (auto name : engine_outputs) {
-    engine_hash_key += name;
-  }
-  engine_hash_key += size;
-  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
-  return engine_key;
-}
-
-void NgraphSubgraphPass::ApplyImpl(Graph *graph) const {
-  PADDLE_ENFORCE_NOT_NULL(graph);
-  FusePassBase::Init("ngraph_subgraph_pass", graph);
-
-  std::unordered_set<Node *> nodes2delete;
-
-  auto teller = [](const Node *node) {
-    if (!node->IsOp() || !node->Op()) return false;
-    auto op_type = node->Op()->Type();
-    return !paddle::operators::NgraphBridge::isRegister(op_type);
-  };
-
-  ANAT::SubGraphFuser fuser(graph, teller, 0, "ngraph_engine");
-  fuser();
-
-  for (auto *node : graph->Nodes()) {
-    if (node->IsOp() && !ANAT::Agent(node).subgraph()->empty()) {
-      OpDesc *op_desc = node->Op();
-      op_desc->SetType("ngraph_engine");
-
-      CreateNgraphEngineOp(node, graph);
-
-      std::unordered_set<const Node *> nodes2remove(
-          ANAT::Agent(node).subgraph()->begin(),
-          ANAT::Agent(node).subgraph()->end());
-
-      GraphSafeRemoveNodes(graph, nodes2remove);
-    }
-  }
-
-  std::unordered_set<const Node *> nodes2remove;
-  for (auto *node : graph->Nodes()) {
-    if (node->IsOp() && ANAT::Agent(node).deleted()) {
-      nodes2remove.insert(node);
-    }
-  }
-
-  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
-  // std::vector<ir::Node *> nodes = ir::TopologySortOperations(*graph);
-}
-
-bool IsValid(std::string name) {
-  return name.find(Node::kControlDepVarName) == std::string::npos;
-}
-
-void UpdateNgraphIO(Node *node, Graph *graph,
-                    std::vector<std::string> *input_names,
-                    std::vector<std::string> *output_names) {
-  bool is_test = true, has_fetch = false;
-  for (Node *node : graph->Nodes()) {
-    if (node->IsOp() && node->Name().find("_grad") != std::string::npos) {
-      is_test = false;
-    }
-    if (node->IsVar() && node->Var()) {
-      for (auto out : node->outputs) {
-        if (out->Name() == "fetch") has_fetch = true;
-      }
-    }
-  }
-  if (is_test && has_fetch) {
-    for (auto *x : node->inputs) {
-      (*input_names).emplace_back(x->Name());
-    }
-    for (auto *x : node->outputs) {
-      (*output_names).emplace_back(x->Name());
-    }
-    return;
-  }
-
-  auto &subgraph = *ANAT::Agent(node).subgraph();
-  std::unordered_set<std::string> inputs;
-  std::unordered_set<std::string> outputs;
-  for (auto *node : subgraph) {
-    for (auto in : node->inputs) {
-      auto name = in->Name();
-      if (!IsValid(name)) continue;
-      if (!outputs.count(name) && !inputs.count(name)) {
-        (*input_names).emplace_back(name);
-        inputs.insert(name);
-      }
-    }
-    for (auto out : node->outputs) {
-      auto name = out->Name();
-      if (!IsValid(name)) continue;
-      outputs.insert(name);
-      (*output_names).emplace_back(name);
-    }
-  }
-}
-
-void NgraphSubgraphPass::CreateNgraphEngineOp(Node *node, Graph *graph) const {
-  auto &subgraph = *ANAT::Agent(node).subgraph();
-  PADDLE_ENFORCE_NE(subgraph.empty(), true, "subgraph cannot be empty");
-
-  framework::proto::BlockDesc block_proto;
-  framework::BlockDesc block_desc(nullptr, &block_proto);
-  block_desc.Proto()->set_parent_idx(-1);
-  block_desc.Proto()->set_idx(0);
-  for (auto *node : subgraph) {
-    auto *op = block_desc.AppendOp();
-    *op->Proto() = *node->Op()->Proto();
-  }
-  auto *vars = block_desc.Proto()->mutable_vars();
-  for (Node *node : graph->Nodes()) {
-    if (node->IsVar() && node->Var()) {
-      *vars->Add() = *node->Var()->Proto();
-    }
-  }
-  PADDLE_ENFORCE_NE(block_desc.Proto()->vars().empty(), true,
-                    "the block has no var-desc");
-
-  std::vector<std::string> input_names;
-  std::vector<std::string> output_names;
-  UpdateNgraphIO(node, graph, &input_names, &output_names);
-  auto *op_desc = node->Op();
-  op_desc->SetInput(
-      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
-  op_desc->SetOutput(
-      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-
-  int sgs = subgraph.size();
-  std::string subgraph_str = block_desc.Proto()->SerializeAsString();
-  std::string engine_key =
-      std::to_string(std::hash<std::string>()(subgraph_str));
-  std::vector<int> interval{0, sgs};
-  op_desc->SetType("ngraph_engine");
-  op_desc->SetAttr("interval", interval);
-  op_desc->SetAttr("graph", subgraph_str);
-  op_desc->SetAttr("engine_key", engine_key);
-  op_desc->SetAttr("op_role", 0);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(ngraph_subgraph_pass, paddle::framework::ir::NgraphSubgraphPass);
diff --git a/paddle/fluid/framework/ir/ngraph_subgraph_pass.h b/paddle/fluid/framework/ir/ngraph_subgraph_pass.h
deleted file mode 100644
index 09f062671c795fbd8421b46ab26dafb39d1e3852..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/ngraph_subgraph_pass.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Fuse supported ops to a NgraphEngineOp.
- */
-class NgraphSubgraphPass : public FusePassBase {
- public:
-  void ApplyImpl(ir::Graph *graph) const override;
-
-  virtual ~NgraphSubgraphPass() {}
-
- private:
-  void CreateNgraphEngineOp(framework::ir::Node *x,
-                            framework::ir::Graph *graph) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
deleted file mode 100644
index 45d81b937392244f678fbd01395b3ffffd07f710..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/node.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/op_info.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-// msvc15 don't support constexpr in correct way.
-#if !defined(_WIN32)
-constexpr char Node::kControlDepVarName[];
-#else
-const char Node::kControlDepVarName[] = "__control_var";
-#endif
-
-std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
-                                        Node::Type type) {
-  return std::unique_ptr<Node>(new Node(name, type));
-}
-
-std::unique_ptr<Node> CreateNodeForTest(VarDesc *var_desc) {
-  return std::unique_ptr<Node>(new Node(var_desc));
-}
-
-std::unique_ptr<Node> CreateNodeForTest(OpDesc *op_desc) {
-  return std::unique_ptr<Node>(new Node(op_desc));
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
deleted file mode 100644
index fbc0d7599eae12d32ccb6d7ea9546ce044037824..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/node.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <typeindex>
-#include <typeinfo>
-#include <vector>
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// Node should only created by Graph::CreateXXXNode().
-// 1. Every Node should be part of a graph. No dangling Node exists.
-// 2. Node only contains members necessary for building graph structure.
-//    It doesn't contain other unrelated members, such as device, etc.
-//
-// Sometimes, for specific usages, Node needs to have additional members,
-// such as device_placement, version in order to be executed. It is suggested
-// to use composition pattern.
-//
-// class RunnableOp {
-//    RunnableOp(ir::Node* n) : n_(n) { n_.WrappedBy(this); }
-//
-//    int any_thing_;
-// }
-//
-// RunnableOp is owned by the ir::Node that composes it. In other words.
-// ir::Node will be responsible for deleting RunnableOp, say, when ir::Node
-// is deleted from the graph.
-class Node {
- public:
-  virtual ~Node() {
-    if (!wrapper_.empty()) {
-      VLOG(10) << "ir::Node deleting a wrapper node " << Name();
-      wrapper_deleter_();
-    }
-  }
-
-  enum class Type { kOperation, kVariable };
-#if !defined(_WIN32)  // msvc not support constexpr correctly.
-  static constexpr char kControlDepVarName[] = "__control_var";
-#else
-  static const char kControlDepVarName[];
-#endif
-
-  Type NodeType() const { return type_; }
-
-  std::string Name() const { return name_; }
-
-  VarDesc* Var() const {
-    PADDLE_ENFORCE_EQ(IsVar(), true);
-    return var_desc_.get();
-  }
-
-  OpDesc* Op() const {
-    PADDLE_ENFORCE_EQ(IsOp(), true);
-    return op_desc_.get();
-  }
-
-  // Set the `wrapper` that wraps the Node. `wrapper` is owned by Node.
-  template <typename T>
-  void WrappedBy(T* wrapper) {
-    if (!wrapper_.empty()) {
-      wrapper_deleter_();
-    }
-    wrapper_ = wrapper;
-    wrapper_deleter_ = [wrapper]() { delete wrapper; };
-    wrapper_type_ = std::type_index(typeid(T));
-  }
-
-  // Return a reference to the `wrapper`.
-  template <typename T>
-  T& Wrapper() {
-    try {
-      return *boost::any_cast<T*>(wrapper_);
-    } catch (boost::bad_any_cast&) {
-      PADDLE_THROW("Invalid wrapper type error, expected %s, actual %s",
-                   typeid(T).name(), wrapper_type_.name());
-    }
-  }
-
-  // Test if the Node is wrapped by type T.
-  template <typename T>
-  bool IsWrappedBy() const {
-    return std::type_index(typeid(T)) == wrapper_type_;
-  }
-
-  // Please don't use this API!
-  int id() const { return id_; }
-
-  bool IsOp() const { return type_ == Type::kOperation; }
-  bool IsVar() const { return type_ == Type::kVariable; }
-  bool IsCtrlVar() const {
-    return type_ == Type::kVariable &&
-           Name().find(ir::Node::kControlDepVarName) != std::string::npos;
-  }
-
-  void RenameVar(const std::string& new_name) {
-    PADDLE_ENFORCE(type_ == Type::kVariable && var_desc_,
-                   "Must be type of variable");
-    name_ = new_name;
-    var_desc_->SetName(new_name);
-  }
-
-  std::vector<Node*> inputs;
-  std::vector<Node*> outputs;
-
- protected:
-  std::string name_;
-  std::unique_ptr<VarDesc> var_desc_;
-  std::unique_ptr<OpDesc> op_desc_;
-  Type type_;
-  int id_;
-
- private:
-  // ID can only set by a Graph.
-  void SetId(int id) { id_ = id; }
-
-  friend class Graph;
-  friend std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
-                                                 Node::Type type);
-  friend std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
-  friend std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
-
-  explicit Node(const std::string& name, Type type)
-      : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
-
-  explicit Node(VarDesc* var_desc)
-      : name_(var_desc->Name()),
-        var_desc_(new VarDesc(*var_desc)),
-        op_desc_(nullptr),
-        type_(Type::kVariable) {}
-
-  explicit Node(OpDesc* op_desc)
-      : name_(op_desc->Type()),
-        var_desc_(nullptr),
-        op_desc_(new OpDesc(*op_desc, op_desc->Block())),
-        type_(Type::kOperation) {}
-
-  Node() = delete;
-
-  boost::any wrapper_;
-  std::function<void(void)> wrapper_deleter_;
-  std::type_index wrapper_type_ = std::type_index(typeid(void));
-
-  DISABLE_COPY_AND_ASSIGN(Node);
-};
-
-std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
-                                        Node::Type type);
-std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
-
-std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node_test.cc b/paddle/fluid/framework/ir/node_test.cc
deleted file mode 100644
index 694efadda078169c993457181c00f7b357a09e87..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/node_test.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class RunnableOp {
- public:
-  RunnableOp(Node* node, bool* alive) : node_(node), alive_(alive) {
-    node_->WrappedBy(this);
-  }
-
-  virtual ~RunnableOp() { *alive_ = false; }
-
- private:
-  Node* node_;
-  bool* alive_;
-};
-
-class RunnableOp2 {
- public:
-  RunnableOp2(Node* node, bool* alive) : node_(node), alive_(alive) {
-    node_->WrappedBy(this);
-  }
-
-  virtual ~RunnableOp2() { *alive_ = false; }
-
- private:
-  Node* node_;
-  bool* alive_;
-};
-
-TEST(NodeTest, Basic) {
-  bool alive1 = true;
-  bool alive2 = true;
-  std::unique_ptr<Node> n1(CreateNodeForTest("n1", Node::Type::kVariable));
-  std::unique_ptr<Node> n2(CreateNodeForTest("n2", Node::Type::kVariable));
-
-  EXPECT_FALSE(n1->IsWrappedBy<RunnableOp>());
-  EXPECT_FALSE(n1->IsWrappedBy<RunnableOp2>());
-  EXPECT_FALSE(n2->IsWrappedBy<RunnableOp>());
-  EXPECT_FALSE(n2->IsWrappedBy<RunnableOp2>());
-
-  new RunnableOp(n1.get(), &alive1);
-  new RunnableOp2(n2.get(), &alive2);
-
-  EXPECT_TRUE(n1->IsWrappedBy<RunnableOp>());
-  EXPECT_FALSE(n1->IsWrappedBy<RunnableOp2>());
-  EXPECT_FALSE(n2->IsWrappedBy<RunnableOp>());
-  EXPECT_TRUE(n2->IsWrappedBy<RunnableOp2>());
-
-  EXPECT_TRUE(alive1);
-  EXPECT_TRUE(alive2);
-
-  n1.reset(nullptr);
-  n2.reset(nullptr);
-  EXPECT_FALSE(alive1);
-  EXPECT_FALSE(alive2);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
deleted file mode 100644
index b4cfda919ce346c60ef9f4e24de705b51488e4dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/pass.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/pass.h"
-
-#include <memory>
-#include <utility>
-
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-Graph* Pass::Apply(Graph* graph) const {
-  CheckPrevPass();
-  PADDLE_ENFORCE(graph, "graph passed to Pass::Apply() cannot be empty.");
-  for (const std::string& attr : required_pass_attrs_) {
-    PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
-                   "Required pass atrribute %s not set.", attr);
-  }
-  for (const std::string& attr : required_graph_attrs_) {
-    PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.",
-                   attr);
-  }
-  ApplyImpl(graph);
-  // TODO(panyx0718): Add more verifications.
-  PADDLE_ENFORCE(!HasCircle(*graph),
-                 "Illegal Pass %s. Generated graph shouldn't have cycle.",
-                 Type());
-  PADDLE_ENFORCE(VarDescIsConsistency(*graph),
-                 "The VarDescs of persistable variable are not consistency.");
-  applied_ = true;
-  if (!graph->Has(kPassRecorder)) {
-    graph->Set<PassRecorder>(kPassRecorder, new PassRecorder);
-  }
-  graph->Get<PassRecorder>(kPassRecorder).insert(Type());
-  return graph;
-}
-
-PassRegistry& PassRegistry::Instance() {
-  static PassRegistry g_pass_info_map;
-  return g_pass_info_map;
-}
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
deleted file mode 100644
index cf6b8d1338e20a67d332c2ddec562f662d8ff0a9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/pass.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-template <typename PassType>
-struct PassRegistrar;
-
-typedef std::unordered_set<std::string> PassRecorder;
-constexpr char kPassRecorder[] = "pass_recorder";
-
-class Pass {
- public:
-  Pass() = default;
-  virtual ~Pass() {
-    for (auto &attr : attrs_) {
-      if (attr_dels_.find(attr.first) != attr_dels_.end()) {
-        attr_dels_[attr.first]();
-      }
-    }
-    attrs_.clear();
-    attr_dels_.clear();
-  }
-
-  std::string Type() const { return type_; }
-
-  Graph *Apply(Graph *graph) const;
-
-  // Get a reference to the attributed previously set.
-  template <typename AttrType>
-  AttrType &Get(const std::string &attr_name) const {
-    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
-                   "%s attr not registered for pass.", attr_name);
-    try {
-      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
-    } catch (boost::bad_any_cast &) {
-      PADDLE_THROW(
-          "Invalid attribute type of %s error, expected: %s, actual: %s",
-          attr_name, typeid(AttrType *).name(),
-          attrs_.at(attr_name).type().name());
-    }
-  }
-
-  bool Has(const std::string &attr_name) const {
-    return attrs_.count(attr_name) > 0;
-  }
-
-  void Erase(const std::string &attr_name) {
-    if (!Has(attr_name)) {
-      return;
-    }
-    if (attr_dels_.find(attr_name) != attr_dels_.end()) {
-      attr_dels_[attr_name]();
-      attr_dels_.erase(attr_name);
-    }
-    attrs_.erase(attr_name);
-  }
-
-  // Set a pointer to the attribute. Pass takes ownership of the attribute.
-  template <typename AttrType>
-  void Set(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
-                   attr_name);
-    attrs_[attr_name] = attr;
-    attr_dels_[attr_name] = [attr, attr_name]() {
-      VLOG(3) << "deleting " << attr_name;
-      delete attr;
-    };
-  }
-
-  // Set a pointer to the attribute. Pass doesn't take ownership. Caller
-  // should delete the attribute.
-  template <typename AttrType>
-  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
-                   attr_name);
-    attrs_[attr_name] = attr;
-  }
-
- protected:
-  virtual void ApplyImpl(Graph *graph) const {
-    LOG(FATAL) << "Calling virtual Pass not implemented.";
-  }
-
-  // Some Pass must be placed before this Pass, and some
-  // Pass must be placed after this Pass.
-  virtual void CheckPrevPass() const {}
-
- private:
-  template <typename PassType>
-  friend struct PassRegistrar;
-
-  void RegisterRequiredPassAttrs(const std::unordered_set<std::string> &attrs) {
-    required_pass_attrs_.insert(attrs.begin(), attrs.end());
-  }
-
-  void RegisterRequiredGraphAttrs(
-      const std::unordered_set<std::string> &attrs) {
-    required_graph_attrs_.insert(attrs.begin(), attrs.end());
-  }
-
-  void RegisterType(const std::string &type) { type_ = type; }
-
-  mutable bool applied_{false};
-  std::string type_;
-  std::unordered_set<std::string> required_pass_attrs_;
-  std::unordered_set<std::string> required_graph_attrs_;
-  std::map<std::string, boost::any> attrs_;
-  std::map<std::string, std::function<void(void)>> attr_dels_;
-};
-
-using PassCreator = std::function<std::unique_ptr<Pass>()>;
-
-class Registrar {
- public:
-  // In our design, various kinds of passes,
-  // have their corresponding registry and registrar. The action of
-  // registration is in the constructor of a global registrar variable, which
-  // are not used in the code that calls package framework, and would
-  // be removed from the generated binary file by the linker. To avoid such
-  // removal, we add Touch to all registrar classes and make USE_PASS macros to
-  // call this method. So, as long as the callee code calls USE_PASS, the global
-  // registrar variable won't be removed by the linker.
-  void Touch() {}
-};
-
-class PassRegistry {
- public:
-  static PassRegistry &Instance();
-
-  bool Has(const std::string &pass_type) const {
-    return map_.find(pass_type) != map_.end();
-  }
-
-  void Insert(const std::string &pass_type, const PassCreator &pass_creator) {
-    PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type);
-    map_.insert({pass_type, pass_creator});
-  }
-
-  std::unique_ptr<Pass> Get(const std::string &pass_type) const {
-    PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered",
-                   pass_type);
-    return map_.at(pass_type)();
-  }
-
- private:
-  PassRegistry() = default;
-  std::unordered_map<std::string, PassCreator> map_;
-
-  DISABLE_COPY_AND_ASSIGN(PassRegistry);
-};
-
-template <typename PassType>
-struct PassRegistrar : public Registrar {
-  explicit PassRegistrar(const char *pass_type) {
-    PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
-                   "'%s' is registered more than once.", pass_type);
-    PassRegistry::Instance().Insert(
-        pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
-          std::unique_ptr<Pass> pass(new PassType());
-          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
-          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
-          pass->RegisterType(pass_type);
-          return pass;
-        });
-  }
-
-  PassRegistrar<PassType> &RequirePassAttr(const std::string &attr) {
-    required_pass_attrs_.insert(attr);
-    return *this;
-  }
-
-  PassRegistrar<PassType> &RequireGraphAttr(const std::string &attr) {
-    required_graph_attrs_.insert(attr);
-    return *this;
-  }
-
- private:
-  std::unordered_set<std::string> required_pass_attrs_;
-  std::unordered_set<std::string> required_graph_attrs_;
-};
-
-#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg)                   \
-  struct __test_global_namespace_##uniq_name##__ {};                          \
-  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
-                             __test_global_namespace_##uniq_name##__>::value, \
-                msg)
-
-// Register a new pass that can be applied on the IR.
-#define REGISTER_PASS(pass_type, pass_class)                \
-  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                      \
-      __reg_pass__##pass_type,                              \
-      "REGISTER_PASS must be called in global namespace");  \
-  static ::paddle::framework::ir::PassRegistrar<pass_class> \
-      __pass_registrar_##pass_type##__(#pass_type);         \
-  int TouchPassRegistrar_##pass_type() {                    \
-    __pass_registrar_##pass_type##__.Touch();               \
-    return 0;                                               \
-  }                                                         \
-  static ::paddle::framework::ir::PassRegistrar<pass_class> \
-      &__pass_tmp_registrar_##pass_type##__ UNUSED =        \
-          __pass_registrar_##pass_type##__
-
-#define USE_PASS(pass_type)                           \
-  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                \
-      __use_pass_itself_##pass_type,                  \
-      "USE_PASS must be called in global namespace"); \
-  extern int TouchPassRegistrar_##pass_type();        \
-  static int use_pass_itself_##pass_type##_ UNUSED =  \
-      TouchPassRegistrar_##pass_type()
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc
deleted file mode 100644
index 8355764aa6c983ace203906190e6cc6d86b500dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/pass_builder.h"
-#include <memory>
-#include <utility>
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
-  VLOG(1) << "Append " << pass_type;
-  auto pass = ir::PassRegistry::Instance().Get(pass_type);
-  passes_.emplace_back(pass.release());
-  return passes_.back();
-}
-
-void PassBuilder::RemovePass(size_t idx) {
-  PADDLE_ENFORCE(passes_.size() > idx);
-  passes_.erase(passes_.begin() + idx);
-}
-
-std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
-                                              const std::string& pass_type) {
-  PADDLE_ENFORCE(passes_.size() >= idx);
-  std::shared_ptr<Pass> pass(
-      ir::PassRegistry::Instance().Get(pass_type).release());
-  passes_.insert(passes_.begin() + idx, std::move(pass));
-  return passes_[idx];
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass_builder.h b/paddle/fluid/framework/ir/pass_builder.h
deleted file mode 100644
index 733d3a3ad1ab8989ea30fe45cd7e1ffe9432de13..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/pass_builder.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class PassBuilder {
- public:
-  PassBuilder() {}
-
-  virtual ~PassBuilder() {}
-
-  // Append a new pass to the end.
-  std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
-
-  // Insert a new pass after `idx`.
-  std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
-
-  // Remove a new pass at `idx`.
-  void RemovePass(size_t idx);
-
-  // Returns a list of all passes.
-  std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
-
- protected:
-  std::vector<std::shared_ptr<Pass>> passes_;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
deleted file mode 100644
index 44fddd80c19ee4d5c5618b66d3fef0cd5c39047b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/pass.h"
-#include <memory>
-#include <string>
-#include <utility>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/ir/graph.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-void BuildCircleGraph(Graph* g) {
-  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
-  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
-  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
-  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
-
-  o1->outputs.push_back(v1);
-  o2->inputs.push_back(v1);
-  v1->inputs.push_back(o1);
-  v1->outputs.push_back(o2);
-
-  o2->outputs.push_back(v2);
-  o1->inputs.push_back(v2);
-  v2->inputs.push_back(o2);
-  v2->outputs.push_back(o1);
-}
-
-class TestPass : public Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const {
-    graph->Set<int>("copy_test_pass_attr", new int);
-    graph->Set<int>("copy_test_graph_attr", new int);
-
-    int test_pass_attr = this->Get<int>("test_pass_attr");
-    graph->Get<int>("copy_test_pass_attr") = test_pass_attr + 1;
-
-    int test_graph_attr = graph->Get<int>("test_graph_attr");
-    graph->Get<int>("copy_test_graph_attr") = test_graph_attr + 1;
-  }
-};
-
-TEST(PassTest, TestPassAttrCheck) {
-  ProgramDesc prog;
-  auto pass = PassRegistry::Instance().Get("test_pass");
-  std::unique_ptr<Graph> graph(new Graph(prog));
-  std::string exception;
-  try {
-    graph.reset(pass->Apply(graph.release()));
-  } catch (paddle::platform::EnforceNotMet e) {
-    exception = std::string(e.what());
-  }
-  ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos);
-
-  int val = 1;
-  graph.reset(new Graph(prog));
-  pass->SetNotOwned<int>("test_pass_attr", &val);
-
-  try {
-    graph.reset(pass->Apply(graph.release()));
-  } catch (paddle::platform::EnforceNotMet e) {
-    exception = std::string(e.what());
-  }
-  ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos);
-
-  graph.reset(new Graph(prog));
-  graph->Set<int>("test_graph_attr", new int);
-  graph->Get<int>("test_graph_attr") = 1;
-  graph.reset(pass->Apply(graph.release()));
-  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
-  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
-
-  // Allow apply more than once.
-  graph.reset(new Graph(prog));
-  graph->Set<int>("test_graph_attr", new int);
-  graph.reset(pass->Apply(graph.release()));
-
-  pass = PassRegistry::Instance().Get("test_pass");
-  pass->SetNotOwned<int>("test_pass_attr", &val);
-  graph.reset(new Graph(prog));
-  BuildCircleGraph(graph.get());
-  graph->Set<int>("test_graph_attr", new int);
-  graph->Get<int>("test_graph_attr") = 2;
-  try {
-    pass->Apply(graph.release());
-  } catch (paddle::platform::EnforceNotMet e) {
-    exception = std::string(e.what());
-  }
-  ASSERT_TRUE(exception.find("shouldn't have cycle") != exception.npos);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(test_pass, paddle::framework::ir::TestPass)
-    .RequirePassAttr("test_pass_attr")
-    .RequireGraphAttr("test_graph_attr");
diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h
deleted file mode 100644
index 8df292b483b2842628de8aa7e92f9fb0d38373ff..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/pass_tester_helper.h
+++ /dev/null
@@ -1,338 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <sstream>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-struct Layers {
- public:
-  const ProgramDesc& main_program() { return program_; }
-
-  VarDesc* data(std::string name, std::vector<int64_t> shape = {},
-                bool is_persistable = false) {
-    return lod_tensor(name, shape, is_persistable);
-  }
-
-  VarDesc* conv2d(VarDesc* input, VarDesc* filter, VarDesc* bias,
-                  bool use_cudnn = false) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("conv2d");
-    op->SetInput("Input", {input->Name()});
-    op->SetInput("Filter", {filter->Name()});
-    op->SetInput("Bias", {bias->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("use_cudnn", use_cudnn);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* depthwise_conv2d(VarDesc* input, VarDesc* filter, VarDesc* bias,
-                            bool use_cudnn) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("depthwise_conv2d");
-    op->SetInput("Input", {input->Name()});
-    op->SetInput("Filter", {filter->Name()});
-    op->SetInput("Bias", {bias->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("use_cudnn", use_cudnn);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* pool2d(VarDesc* x, bool use_cudnn) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("pool2d");
-    op->SetInput("X", {x->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("use_cudnn", use_cudnn);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* relu(VarDesc* x, VarDesc* out = nullptr) {
-    return unary_op("relu", x, out);
-  }
-
-  VarDesc* fc(VarDesc* input, VarDesc* w, VarDesc* bias,
-              int in_num_col_dims = 1, std::string activation_type = "") {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("fc");
-    op->SetInput("Input", {input->Name()});
-    op->SetInput("W", {w->Name()});
-    op->SetInput("Bias", {bias->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("in_num_col_dims", in_num_col_dims);
-    op->SetAttr("activation_type", activation_type);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr,
-               int x_num_col_dims = 1) {
-    AttributeMap attrs;
-    attrs["x_num_col_dims"] = 1;
-    return binary_op("mul", x, y, out, &attrs);
-  }
-
-  VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr) {
-    return binary_op("elementwise_add", x, y, out);
-  }
-
-  VarDesc* dropout(VarDesc* x, float dropout_prob,
-                   std::string dropout_implementation) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("dropout");
-    op->SetInput("X", {x->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("is_test", true);
-    op->SetAttr("dropout_prob", dropout_prob);
-    op->SetAttr("dropout_implementation", dropout_implementation);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* concat(std::vector<VarDesc*> inputs, int axis = -1) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("concat");
-    std::vector<std::string> input_names(inputs.size());
-    for (size_t i = 0; i < inputs.size(); ++i) {
-      input_names[i] = inputs[i]->Name();
-    }
-    op->SetInput("X", input_names);
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("axis", axis);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  std::vector<VarDesc*> layer_norm(VarDesc* x, VarDesc* scale = nullptr,
-                                   VarDesc* bias = nullptr) {
-    VarDesc* y = lod_tensor(unique_name());
-    VarDesc* mean = lod_tensor(unique_name());
-    VarDesc* variance = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("layer_norm");
-    op->SetInput("X", {x->Name()});
-    if (scale) {
-      op->SetInput("Scale", {scale->Name()});
-    }
-    if (bias) {
-      op->SetInput("Bias", {bias->Name()});
-    }
-    op->SetOutput("Y", {y->Name()});
-    op->SetOutput("Mean", {mean->Name()});
-    op->SetOutput("Variance", {variance->Name()});
-    op->SetAttr("epsilon", static_cast<float>(1E-05));
-    op->SetAttr("begin_norm_axis", static_cast<int>(1));
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    std::vector<VarDesc*> outs = {y, mean, variance};
-    return outs;
-  }
-
- private:
-  VarDesc* lod_tensor(std::string name, std::vector<int64_t> shape = {},
-                      bool is_persistable = false) {
-    auto* var = program_.MutableBlock(0)->Var(name);
-    var->SetType(proto::VarType::LOD_TENSOR);
-    var->SetShape(shape);
-    var->SetPersistable(is_persistable);
-    return var;
-  }
-
-  VarDesc* unary_op(std::string type, VarDesc* x, VarDesc* out = nullptr) {
-    if (!out) {
-      out = lod_tensor(unique_name());
-    }
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType(type);
-    op->SetInput("X", {x->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* binary_op(std::string type, VarDesc* x, VarDesc* y,
-                     VarDesc* out = nullptr,
-                     const AttributeMap* attrs = nullptr) {
-    if (!out) {
-      out = lod_tensor(unique_name());
-    }
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType(type);
-    op->SetInput("X", {x->Name()});
-    op->SetInput("Y", {y->Name()});
-    op->SetOutput("Out", {out->Name()});
-    if (attrs) {
-      for (auto& iter : *attrs) {
-        op->SetAttr(iter.first, iter.second);
-      }
-    }
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  std::string unique_name() { return "tmp_" + std::to_string(idx_++); }
-
- private:
-  ProgramDesc program_;
-  int idx_{0};
-};
-
-static std::string DebugString(OpDesc* op) {
-  std::ostringstream os;
-  os << "Op(" << op->Type() << "), inputs:{";
-  bool is_first = true;
-  for (auto& name : op->InputNames()) {
-    if (!is_first) {
-      os << ", ";
-    }
-    os << name << "[";
-    bool is_first_var_name = true;
-    for (auto& var_name : op->Input(name)) {
-      if (!is_first_var_name) {
-        os << ", ";
-      }
-      os << var_name;
-      is_first_var_name = false;
-    }
-    os << "]";
-    is_first = false;
-  }
-
-  os << "}, outputs:{";
-  is_first = true;
-  for (auto& name : op->OutputNames()) {
-    if (!is_first) {
-      os << ", ";
-    }
-    os << name << "[";
-    bool is_first_var_name = true;
-    for (auto& var_name : op->Output(name)) {
-      if (!is_first_var_name) {
-        os << ", ";
-      }
-      os << var_name;
-      is_first_var_name = false;
-    }
-    os << "]";
-    is_first = false;
-  }
-  os << "}";
-  return os.str();
-}
-
-static std::string DebugString(Node* node) {
-  std::ostringstream os;
-  if (node->IsOp() && node->Op()) {
-    OpDesc* op = node->Op();
-    os << "Node(" << DebugString(op) << "), inputs:{";
-    bool is_first = true;
-    for (auto* in : node->inputs) {
-      if (!is_first) {
-        os << ", ";
-      }
-      os << in->Name();
-      is_first = false;
-    }
-    os << "}, outputs:{";
-    is_first = true;
-    for (auto* out : node->outputs) {
-      if (!is_first) {
-        os << ", ";
-      }
-      os << out->Name();
-      is_first = false;
-    }
-    os << "}.";
-  } else if (node->IsVar() && node->Var()) {
-    os << "Node(" << node->Name() << "), inputs:{";
-    bool is_first = true;
-    for (auto* in : node->inputs) {
-      if (!is_first) {
-        os << ", ";
-      }
-      if (in->IsOp() && in->Op()) {
-        os << in->Op()->Type();
-      }
-      is_first = false;
-    }
-    os << "}, outputs:{";
-    is_first = true;
-    for (auto* out : node->outputs) {
-      if (!is_first) {
-        os << ", ";
-      }
-      if (out->IsOp() && out->Op()) {
-        os << out->Op()->Type();
-      }
-      is_first = false;
-    }
-    os << "}";
-  }
-  return os.str();
-}
-
-static std::string DebugString(const std::unique_ptr<Graph>& graph) {
-  std::ostringstream os;
-  os << "Graph: {\n";
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()) {
-      os << "  ";
-    } else if (node->IsVar() && node->Var()) {
-      os << "    ";
-    }
-    os << DebugString(node) << "\n";
-  }
-  os << "}\n";
-  return os.str();
-}
-
-static int GetNumOpNodes(const std::unique_ptr<Graph>& graph,
-                         std::string op_type) {
-  int num_nodes = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op() && node->Op()->Type() == op_type) {
-      num_nodes++;
-    }
-  }
-  return num_nodes;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc
deleted file mode 100644
index 1ac7e4d6a11385dc8082083aacab4d276399907c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/placement_pass_base.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/placement_pass_base.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void PlacementPassBase::ApplyImpl(ir::Graph* graph) const {
-  VLOG(3) << "Applies " << GetPlacementName() << " placement strategy.";
-  std::string attr_name = GetAttrName();
-  const auto& op_types_list = GetOpTypesList();
-  if (!graph->Has(attr_name)) {
-    graph->Set<bool>(attr_name, new bool(true));
-  }
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
-      auto* op = n->Op();
-      if ((op->HasAttr(attr_name) || op->HasProtoAttr(attr_name)) &&
-          IsSupport(op->Type())) {
-        if (op_types_list.empty()) {
-          op->SetAttr(attr_name, true);
-        } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                             n->Name()) != op_types_list.end()) {
-          op->SetAttr(attr_name, true);
-        }
-      }
-    }
-  }
-}
-
-bool PlacementPassBase::IsSupport(const std::string& op_type) const {
-  if (GetAttrName() == "use_cudnn") {
-    auto& all_kernels = OperatorWithKernel::AllOpKernels();
-    auto it = all_kernels.find(op_type);
-    if (it == all_kernels.end()) {
-      // All control operators don't have kernel.
-      return false;
-    }
-    for (auto& kernel_pair : it->second) {
-      if (platform::is_gpu_place(kernel_pair.first.place_) &&
-          (kernel_pair.first.library_type_ == LibraryType::kCUDNN)) {
-        return true;
-      }
-    }
-  } else if (GetAttrName() == "use_mkldnn") {
-    return true;
-  }
-  return false;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/placement_pass_base.h b/paddle/fluid/framework/ir/placement_pass_base.h
deleted file mode 100644
index 91693e7bed598000ba18de48046681e3485301e0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/placement_pass_base.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Specifies which operators should use cuDNN.
- */
-class PlacementPassBase : public Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  virtual const std::string GetPlacementName() const = 0;
-  virtual const std::string GetAttrName() const = 0;
-  virtual const std::unordered_set<std::string> GetOpTypesList() const = 0;
-
- private:
-  bool IsSupport(const std::string& op_type) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
deleted file mode 100644
index 62fba440ed4c5ca0cf57e3377bc1c5d5d79d8f3f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
-                     const std::string& op_type, const std::string& quant_type,
-                     const std::string& dequant_type) {
-  const std::string pattern_name = "quant_dequant_fuse";
-  int kNumFields = 5;
-  const int kQuantizedWeightOffset = 0;
-  const int kQuantizedOpOffset = 1;
-  const int kQuantizedOpOutOffset = 2;
-  const int kDequantOpOffset = 3;
-  const int kDequantOpOutOffset = 4;
-  const int kDequantOpWeightScaleOffset = 5;
-
-  if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
-    kNumFields += 1;
-  }
-
-  GraphPatternDetector gpd;
-  auto* x = gpd.mutable_pattern()
-                ->NewNode("x")
-                ->assert_is_op_input(quant_type, "X")
-                ->AsInput();
-
-  std::string quantized_op_type = op_type;
-  std::string weight_name = "";
-  if (op_type == "conv2d" || op_type == "depthwise_conv2d" ||
-      op_type == "conv2d_fusion") {
-    weight_name = "Filter";
-  } else if (op_type == "mul") {
-    weight_name = "Y";
-  } else if (op_type == "fc") {
-    weight_name = "W";
-  } else {
-    PADDLE_ENFORCE(
-        "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for "
-        "now.");
-  }
-
-  patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(x, quantized_op_type, weight_name, times, quant_type, dequant_type);
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    PADDLE_ENFORCE(subgraph.count(x));
-    auto* input_node = subgraph.at(x);
-    Node* quant_op_in_scale =
-        subgraph.at(pattern.GetPDNode("quant_op_in_scale"));
-    Node* quant_op = subgraph.at(pattern.GetPDNode("quant_op"));
-    Node* quant_op_out_scale =
-        subgraph.at(pattern.GetPDNode("quant_op_out_scale"));
-    Node* quant_op_out = subgraph.at(pattern.GetPDNode("quant_op_out"));
-
-    std::vector<Node*> nodes;
-    for (int i = 0; i < times; i++) {
-      nodes.push_back(subgraph.at(
-          pattern.GetPDNode("quantized_op_weight" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("quantized_op" + std::to_string(i))));
-      nodes.push_back(subgraph.at(
-          pattern.GetPDNode("quantized_op_out" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i))));
-      if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
-        nodes.push_back(subgraph.at(
-            pattern.GetPDNode("dequant_channel_scale" + std::to_string(i))));
-      }
-    }
-
-    int bit_length = boost::get<int>(quant_op->Op()->GetAttr("bit_length"));
-    int range = ((1 << (bit_length - 1)) - 1);
-    // Prepare input scale
-    std::string input_scale_var_name = quant_op->Op()->Input("InScale").front();
-    PADDLE_ENFORCE(scope);
-    const LoDTensor& input_scale_tensor =
-        scope->FindVar(input_scale_var_name)->Get<LoDTensor>();
-
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(input_scale_tensor.place()));
-    const float* input_scale_data = input_scale_tensor.data<float>();
-    float input_scale = input_scale_data[0];
-    std::unordered_set<const Node*> delete_nodes;
-
-    for (int i = 0; i < times; i++) {
-      std::vector<float> weight_scale;
-
-      // Get weight scale from dequant op.
-      if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
-        auto scales_name =
-            nodes[i * kNumFields + kDequantOpOffset]->Op()->Input("Scales");
-        PADDLE_ENFORCE(scales_name.size() == 2);
-        const LoDTensor& channel_scale_tensor =
-            scope->FindVar(scales_name[0])->Get<LoDTensor>();
-        PADDLE_ENFORCE(
-            paddle::platform::is_cpu_place(channel_scale_tensor.place()));
-        const float* channel_scale_data = channel_scale_tensor.data<float>();
-        for (int i = 0; i < channel_scale_tensor.numel(); i++) {
-          weight_scale.push_back(channel_scale_data[i]);
-        }
-        delete_nodes.insert(
-            nodes[i * kNumFields + kDequantOpWeightScaleOffset]);
-      } else {
-        float max_range = boost::get<float>(
-            nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr(
-                "max_range"));
-        weight_scale.push_back((range * range) / max_range);
-      }
-
-      // create new op_desc
-      auto base_op_desc =
-          *nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto();
-      std::string new_input = input_node->Name();
-      std::string new_output =
-          nodes[i * kNumFields + kDequantOpOutOffset]->Name();
-
-      framework::OpDesc new_op_desc(base_op_desc, nullptr);
-      new_op_desc.SetType(quantized_op_type);
-
-      if (quantized_op_type == "conv2d" ||
-          quantized_op_type == "conv2d_fusion" ||
-          quantized_op_type == "depthwise_conv2d") {
-        new_op_desc.SetInput("Input", {new_input});
-        new_op_desc.SetOutput("Output", {new_output});
-      } else if (quantized_op_type == "fc") {
-        new_op_desc.SetInput("Input", {new_input});
-        new_op_desc.SetOutput("Out", {new_output});
-      } else if (quantized_op_type == "mul") {
-        new_op_desc.SetInput("X", {new_input});
-        new_op_desc.SetOutput("Out", {new_output});
-      }
-
-      new_op_desc.SetAttr("enable_int8", true);
-      new_op_desc.SetAttr("input_scale", input_scale);
-      new_op_desc.SetAttr("weight_scale", weight_scale);
-      new_op_desc.Flush();
-      auto* new_op = graph->CreateOpNode(&new_op_desc);
-      IR_NODE_LINK_TO(input_node, new_op);
-      IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op);
-      IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]);
-
-      delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]);
-      delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]);
-      delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]);
-    }
-
-    delete_nodes.insert(quant_op_in_scale);
-    delete_nodes.insert(quant_op);
-    delete_nodes.insert(quant_op_out);
-    delete_nodes.insert(quant_op_out_scale);
-    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph, delete_nodes);
-  };
-  gpd(graph, handler);
-}
-
-void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
-  const std::string pattern_name = "quant_dequant_fuse";
-  FusePassBase::Init(pattern_name, graph);
-
-  std::unordered_set<std::string> dequant_types = {
-      "fake_dequantize_max_abs", "fake_channel_wise_dequantize_max_abs"};
-  std::unordered_set<std::string> quant_types = {
-      "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
-  std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul",
-                                                        "depthwise_conv2d"};
-  auto* scope = param_scope();
-  for (auto& dequant_type : dequant_types) {
-    for (auto& quant_type : quant_types) {
-      for (auto& op_type : quantized_op_types) {
-        for (int i = 6; i >= 1; i--) {
-          RunQuantDequant(graph, scope, i, op_type, quant_type, dequant_type);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(quant_conv2d_dequant_fuse_pass,
-              paddle::framework::ir::QuantDequantFusePass);
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h
deleted file mode 100644
index a61b34563acc4cbcee778509a097587222579295..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class QuantDequantFusePass : public FusePassBase {
- public:
-  virtual ~QuantDequantFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
deleted file mode 100644
index 45157ca18be7217dca014ccb78161474df81709d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
+++ /dev/null
@@ -1,383 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h"
-#include <algorithm>  // for max
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-#define MAX_NUM_FC 10
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-static bool IsInputOfFC(Node* n) {
-  if (n && n->IsVar() && VarLinksToOp(n, "fc")) {
-    return true;
-  }
-  return false;
-}
-
-static bool IsOutputOfFC(Node* n) {
-  if (n && n->IsVar() && VarLinksFromOp(n, "fc") && n->inputs.size() == 1U) {
-    return true;
-  }
-  return false;
-}
-
-static bool IsFCWithAct(Node* n, const std::string& act_type = "relu") {
-  if (n && n->IsOp() && n->Op() && n->Op()->Type() == "fc" &&
-      n->inputs.size() == 3U && n->outputs.size() == 1U) {
-    return boost::get<std::string>(n->Op()->GetAttr("activation_type")) ==
-           act_type;
-  }
-  return false;
-}
-
-static bool IsParamOfFC(Node* n, const std::string& param_name) {
-  if (IsInputOfFC(n) && n->inputs.empty() &&
-      (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) {
-    return true;
-  }
-  return false;
-}
-
-static int FindFCIdx(Node* x, const std::string& act_type = "relu") {
-  if (!IsInputOfFC(x)) {
-    return -1;
-  }
-  for (size_t k = 0; k < x->outputs.size(); ++k) {
-    auto* out_op = x->outputs[k];
-    if (IsFCWithAct(out_op, act_type) && out_op->outputs.size() == 1U) {
-      return k;
-    }
-  }
-  return -1;
-}
-
-static int FindInputIdx(Node* n, const std::string& name,
-                        const std::string& act_type = "relu") {
-  if (!IsFCWithAct(n, act_type)) {
-    return -1;
-  }
-  for (size_t i = 0; i < n->inputs.size(); ++i) {
-    if (n->inputs[i]->Name() == n->Op()->Input(name)[0]) {
-      return i;
-    }
-  }
-  return -1;
-}
-
-void BuildRepeatedFCReluPattern(PDPattern* pattern,
-                                const std::string& name_scope, int num_fc) {
-  auto var_next_is_fc_act = [=](Node* x, const std::string& act_type = "relu",
-                                bool check_in_has_only_one_out = true,
-                                int fc_idx = 0) -> bool {
-    if (!IsInputOfFC(x)) {
-      return false;
-    }
-    if (check_in_has_only_one_out && x->outputs.size() != 1U) {
-      return false;
-    }
-    auto* fc_op = x->outputs[fc_idx];
-    return IsFCWithAct(fc_op, act_type) && fc_op->outputs.size() == 1U;
-  };
-
-  // in -> fc -> out
-  // Current x is in, return fc's out which is next fc's input.
-  auto next_var_of_part = [=](Node* x, int fc_idx = 0) -> Node* {
-    return x->outputs[fc_idx]->outputs[0];
-  };
-
-  auto var_next_is_fc_act_repeated_n_times = [=](
-      Node* x, int repeated_times, const std::string& act_type = "relu",
-      bool check_in_has_only_one_out = true) -> bool {
-    for (int i = 0; i < repeated_times; ++i) {
-      if (!var_next_is_fc_act(x, act_type,
-                              i == 0 && check_in_has_only_one_out)) {
-        return false;
-      }
-      x = next_var_of_part(x);
-    }
-    return true;
-  };
-
-  // x is output of fc
-  auto var_before_is_fc_act = [=](Node* x, const std::string& act_type = "relu",
-                                  bool at_top = false) -> bool {
-    if (!IsOutputOfFC(x)) {
-      return false;
-    }
-    auto* fc_op = x->inputs[0];
-    if (!IsFCWithAct(fc_op, act_type) || fc_op->inputs.size() != 3U) {
-      return false;
-    }
-    for (auto* fc_i : fc_op->inputs) {
-      if (!fc_i->inputs.empty()) {
-        if (at_top) {
-          return true;
-        } else {
-          return VarLinksFromOp(fc_i, "fc");
-        }
-      }
-    }
-    return false;
-  };
-
-  auto before_var_of_part = [=](Node* x) -> Node* {
-    auto* fc_op = x->inputs[0];
-    for (auto* in : fc_op->inputs) {
-      if (!in->inputs.empty()) {
-        // w and bias has no input.
-        return in;
-      }
-    }
-    return nullptr;
-  };
-
-  auto var_before_is_fc_act_repeated_n_times = [=](
-      Node* x, int repeated_times,
-      const std::string& act_type = "relu") -> bool {
-    for (int i = 0; i < repeated_times; ++i) {
-      if (!var_before_is_fc_act(x, act_type, i == repeated_times - 1)) {
-        return false;
-      }
-      x = before_var_of_part(x);
-    }
-    return true;
-  };
-
-  PDNode* fc_input_var_0 = nullptr;
-  std::vector<PDNode*> fc_output_var(num_fc);
-  std::vector<PDNode*> fc_weight_var(num_fc);
-  std::vector<PDNode*> fc_bias_var(num_fc);
-  std::vector<PDNode*> fc_ops(num_fc);
-
-  for (int i = 0; i < num_fc; ++i) {
-    if (i == 0) {
-      fc_input_var_0 = pattern->NewNode(
-          [=](Node* x) {
-            if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) {
-              return false;
-            }
-            int fc_idx = FindFCIdx(x);
-            if (fc_idx < 0) {
-              return false;
-            } else if (fc_idx == 0) {
-              return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu");
-            } else {
-              x = next_var_of_part(x, fc_idx);
-              return var_next_is_fc_act_repeated_n_times(
-                  x, std::max(1, num_fc - i - 1), "relu");
-            }
-          },
-          name_scope + "/fc_in_0");
-    }
-
-    fc_weight_var[i] = pattern->NewNode(
-        [=](Node* x) {
-          if (!IsParamOfFC(x, "W")) {
-            return false;
-          }
-          auto* fc_op = x->outputs[0];
-          int input_idx = FindInputIdx(fc_op, "Input", "relu");
-          return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") &&
-                 var_before_is_fc_act_repeated_n_times(fc_op->inputs[input_idx],
-                                                       i, "relu");
-        },
-        name_scope + "/fc_weight_" + std::to_string(i));
-
-    fc_bias_var[i] = pattern->NewNode(
-        [=](Node* x) {
-          if (!IsParamOfFC(x, "Bias")) {
-            return false;
-          }
-          auto* fc_op = x->outputs[0];
-          int input_idx = FindInputIdx(fc_op, "Input", "relu");
-          return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") &&
-                 var_before_is_fc_act_repeated_n_times(fc_op->inputs[input_idx],
-                                                       i, "relu");
-        },
-        name_scope + "/fc_bias_" + std::to_string(i));
-
-    fc_output_var[i] = pattern->NewNode(
-        [=](Node* x) {
-          if (!IsOutputOfFC(x)) {
-            return false;
-          }
-          x = before_var_of_part(x);
-          if (i == 0 && x->outputs.size() > 0U) {
-            if (x->inputs.size() <= 0U) {
-              return false;
-            }
-            int fc_idx = FindFCIdx(x);
-            if (fc_idx < 0) {
-              return false;
-            } else if (fc_idx == 0) {
-              return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu");
-            } else {
-              x = next_var_of_part(x, fc_idx);
-              return var_next_is_fc_act_repeated_n_times(
-                  x, std::max(1, num_fc - i - 1), "relu");
-            }
-          } else {
-            return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") &&
-                   x->inputs.size() > 0 &&
-                   var_before_is_fc_act_repeated_n_times(x, i, "relu");
-          }
-        },
-        name_scope + "/fc_out_" + std::to_string(i));
-
-    fc_ops[i] = pattern->NewNode(
-        [=](Node* x) {
-          if (!IsFCWithAct(x, "relu")) {
-            return false;
-          }
-          auto* fc_out_var = x->outputs[0];
-          return fc_out_var && fc_out_var->IsVar() &&
-                 fc_out_var->outputs.size() == 1 &&
-                 var_next_is_fc_act_repeated_n_times(fc_out_var, num_fc - i - 1,
-                                                     "relu") &&
-                 var_before_is_fc_act_repeated_n_times(fc_out_var, i + 1,
-                                                       "relu");
-        },
-        name_scope + "/fc_op_" + std::to_string(i));
-
-    if (i == 0) {
-      fc_ops[i]
-          ->LinksFrom({fc_input_var_0, fc_weight_var[i], fc_bias_var[i]})
-          .LinksTo({fc_output_var[i]});
-    } else {
-      fc_ops[i]
-          ->LinksFrom({fc_output_var[i - 1], fc_weight_var[i], fc_bias_var[i]})
-          .LinksTo({fc_output_var[i]});
-    }
-  }
-}
-
-static int BuildFusion(Graph* graph, const std::string& name_scope,
-                       int num_fc) {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-  BuildRepeatedFCReluPattern(pattern, name_scope, num_fc);
-
-  auto retrieve_node = [](const std::string& name,
-                          const GraphPatternDetector::subgraph_t& subgraph,
-                          const PDPattern& pat) -> Node* {
-    PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)),
-                   "pattern has no Node called %s", name.c_str());
-    Node* p = subgraph.at(pat.RetrieveNode(name));
-    PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str());
-    return p;
-  };
-
-  int fusion_count{0};
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    LOG(INFO) << "handle Repeated FC Act fuse";
-    std::vector<Node*> weights_vars(num_fc);
-    std::vector<Node*> bias_vars(num_fc);
-    std::vector<Node*> relu_vars(num_fc - 1);
-
-    std::vector<std::string> weight_names(num_fc);
-    std::vector<std::string> bias_names(num_fc);
-    std::vector<std::string> relu_names(num_fc - 1);
-
-    auto& fused_pattern = gpd.pattern();
-    for (int i = 0; i < num_fc; ++i) {
-      if (i < num_fc - 1) {
-        relu_vars[i] =
-            retrieve_node(name_scope + "/fc_out_" + std::to_string(i), subgraph,
-                          fused_pattern);
-        relu_names[i] = relu_vars[i]->Name();
-      }
-
-      weights_vars[i] =
-          retrieve_node(name_scope + "/fc_weight_" + std::to_string(i),
-                        subgraph, fused_pattern);
-      weight_names[i] = weights_vars[i]->Name();
-
-      bias_vars[i] = retrieve_node(name_scope + "/fc_bias_" + std::to_string(i),
-                                   subgraph, fused_pattern);
-      bias_names[i] = bias_vars[i]->Name();
-    }
-
-    auto* input_var =
-        retrieve_node(name_scope + "/fc_in_0", subgraph, fused_pattern);
-    auto* last_out_var =
-        retrieve_node(name_scope + "/fc_out_" + std::to_string(num_fc - 1),
-                      subgraph, fused_pattern);
-
-    // Create New OpDesc
-    OpDesc op_desc;
-    op_desc.SetType("fusion_repeated_fc_relu");
-    op_desc.SetInput("X", {input_var->Name()});
-    op_desc.SetInput("W", weight_names);
-    op_desc.SetInput("Bias", bias_names);
-    op_desc.SetOutput("ReluOut", relu_names);
-    op_desc.SetOutput("Out", {last_out_var->Name()});
-
-    auto* op = graph->CreateOpNode(&op_desc);
-    IR_NODE_LINK_TO(input_var, op);
-    for (size_t i = 0; i < weights_vars.size(); ++i) {
-      IR_NODE_LINK_TO(weights_vars[i], op);
-      IR_NODE_LINK_TO(bias_vars[i], op);
-    }
-    for (size_t i = 0; i < relu_vars.size(); ++i) {
-      IR_NODE_LINK_TO(op, relu_vars[i]);
-    }
-    IR_NODE_LINK_TO(op, last_out_var);
-
-    std::unordered_set<const Node*> marked_nodes;
-    for (auto& item : subgraph) {
-      marked_nodes.insert(item.second);
-    }
-    for (size_t i = 0; i < weights_vars.size(); ++i) {
-      marked_nodes.erase(weights_vars[i]);
-      marked_nodes.erase(bias_vars[i]);
-    }
-    for (size_t i = 0; i < relu_vars.size(); ++i) {
-      marked_nodes.erase(relu_vars[i]);
-    }
-    marked_nodes.erase(input_var);
-    marked_nodes.erase(last_out_var);
-    GraphSafeRemoveNodes(graph, marked_nodes);
-    ++fusion_count;
-  };
-
-  gpd(graph, handler);
-  return fusion_count;
-}
-
-void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE_NOT_NULL(graph);
-  FusePassBase::Init(name_scope_, graph);
-
-  int fusion_count = 0;
-  for (int i = MAX_NUM_FC; i > 1; --i) {
-    fusion_count +=
-        BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i);
-  }
-  AddStatis(fusion_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(repeated_fc_relu_fuse_pass,
-              paddle::framework::ir::RepeatedFCReluFusePass);
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
deleted file mode 100644
index ae777bccebec9f99b4752fe495f96d3da38aac23..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/**
- * Fuse Repeated FC Relu
- */
-class RepeatedFCReluFusePass : public FusePassBase {
- public:
-  virtual ~RepeatedFCReluFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  const std::string name_scope_{"repeated_fc_relu_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc
deleted file mode 100644
index 81d9476d409d9472518b14390492c3d9d1ab391c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void TestMain(int num_fc) {
-  // inputs                                 operator    output
-  // -------------------------------------------------------------
-  // (x, filters, bias_0)                   conv2d   -> conv2d_out
-  // (conv2d_out, fc_weights_0, fc_bias_0)  fc       -> fc_out_0
-  // (fc_out_0, fc_weights_1, fc_bias_1)    fc       -> fc_out_1
-  // ...
-  Layers layers;
-  VarDesc* x = layers.data("x");
-  VarDesc* filters = layers.data("filters", {}, true);
-  VarDesc* bias_0 = layers.data("bias_0", {}, true);
-  VarDesc* conv2d_out = layers.conv2d(x, filters, bias_0);
-  VarDesc* fc_in = conv2d_out;
-  for (int i = 0; i < num_fc; ++i) {
-    VarDesc* weights_i =
-        layers.data("fc_weights_" + std::to_string(i), {}, true);
-    VarDesc* bias_i = layers.data("fc_bias_" + std::to_string(i), {}, true);
-    std::string activation_type = i < (num_fc - 1) ? "relu" : "";
-    VarDesc* fc_out = layers.fc(fc_in, weights_i, bias_i, 1, activation_type);
-    fc_in = fc_out;
-  }
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
-  auto pass = PassRegistry::Instance().Get("repeated_fc_relu_fuse_pass");
-  int num_nodes_before = graph->Nodes().size();
-  int num_fc_nodes_before = GetNumOpNodes(graph, "fc");
-  VLOG(3) << DebugString(graph);
-
-  graph.reset(pass->Apply(graph.release()));
-  int num_nodes_after = graph->Nodes().size();
-  int num_fused_nodes_after = GetNumOpNodes(graph, "fusion_repeated_fc_relu");
-  VLOG(3) << DebugString(graph);
-
-  // Delete (num_fc_nodes_before - 1) fc ops
-  PADDLE_ENFORCE_EQ(num_nodes_before - (num_fc_nodes_before - 1) + 1,
-                    num_nodes_after);
-  PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1);
-}
-
-TEST(RepeatedFCReluFusePass, basic_3) { TestMain(3); }
-
-TEST(RepeatedFCReluFusePass, basic_9) { TestMain(9); }
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(repeated_fc_relu_fuse_pass);
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
deleted file mode 100644
index 566b654f237cbd71e1983c971374ee13d7b36805..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h"
-#include <memory>
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const {
-  VLOG(3) << "Applies Runtime Context Cache strategy.";
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->Op()) {
-      n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(runtime_context_cache_pass,
-              paddle::framework::ir::RuntimeContextCachePass);
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
deleted file mode 100644
index e4783166e0cbde0be9037df5afe3e903a40a2065..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class RuntimeContextCachePass : public Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
deleted file mode 100644
index b230c50167136d2616068078ce619e8362c38fde..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ /dev/null
@@ -1,255 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
-#include <set>
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-struct FuseExpr {};
-
-// sequence expand, concat fuse pattern, return concat's output
-PDNode* BuildSeqExpandConcatPattern(PDPattern* pattern) {
-  // The following operators will be fused:
-  // concat
-  // sequence_expand
-  // sequence_expand
-
-  // The following variables will be treat as inputs:
-  // concat mid input, 0th input for fused op
-  // sequence_expand input, 1th input for fused op
-  // sequence_expand input, 2th input for fused op
-
-  // The following variables will be treat as outputs:
-  // concat output
-
-  // So the following variables will be removed:
-  // sequence-expand output
-  // sequence-expand output
-
-  // Three operators
-  auto* sequence_expand0 = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "sequence_expand";
-      },
-      "sequence_expand0");
-
-  auto* sequence_expand1 = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "sequence_expand";
-      },
-      "sequence_expand1");
-
-  auto* concat = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "concat" &&  // basic check
-               x->Op()->Input("X").size() == 3;                  // Special case
-      },
-      "concat");
-
-  auto* sequence_expand0_in = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsVar() && VarLinksToOp(x, "sequence_expand");
-      },
-      "sequence_expand0_in");
-  auto* sequence_expand1_in = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsVar() && VarLinksToOp(x, "sequence_expand");
-      },
-      "sequence_expand1_in");
-
-  // The variables
-  auto* sequence_expand0_out = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsVar() &&
-               VarLinksFromOp(x, "sequence_expand") &&  // basic check
-               VarLinksToOp(x, "concat") &&             // is concat's input
-               IsNthInput(x, x->outputs[0], "X", 1);    // X[0]
-      },
-      "sequence_expand0_out");
-
-  auto* sequence_expand1_out = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsVar() &&
-               VarLinksFromOp(x, "sequence_expand") &&  // basic check
-               VarLinksToOp(x, "concat") &&             // is concat's input
-               IsNthInput(x, x->outputs[0], "X", 2);    // x[2]
-      },
-      "sequence_expand1_out");
-
-  auto* concat_in0 = pattern->NewNode(
-      [](Node* x) { return x && x->IsVar() && VarLinksToOp(x, "concat"); },
-      "concat_in0");
-
-  auto* concat_out = pattern->NewNode(
-      [](Node* x) { return x && x->IsVar() && VarLinksFromOp(x, "concat"); },
-      "concat_out");
-
-  // Links
-  sequence_expand0->LinksFrom({sequence_expand0_in})
-      .LinksTo({sequence_expand0_out});
-  sequence_expand1->LinksFrom({sequence_expand1_in})
-      .LinksTo({sequence_expand1_out});
-  concat->LinksFrom({sequence_expand0_out, sequence_expand1_out, concat_in0})
-      .LinksTo({concat_out});
-  return concat_out;
-}
-
-PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) {
-  PDNode* fc_w = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsVar() &&                 // basic
-               VarLinksToOp(x, "mul") &&          // link
-               x->Var()->Proto()->persistable();  // is a parameter
-      },
-      "fc_w");
-
-  PDNode* mul_out = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsVar() &&                     // basic
-               VarLinksFromOp(x, "mul") &&            // link
-               VarLinksToOp(x, "elementwise_add") &&  //
-               !x->Var()->Proto()->persistable();     // is a parameter
-      },
-      "mul_out");
-
-  PDNode* fc_mul = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "mul";  // basic
-      },
-      "fc_mul");
-
-  PDNode* fc_bias = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsVar() &&                     // basic
-               VarLinksToOp(x, "elementwise_add") &&  // link
-               x->Var()->Proto()->persistable();      // is a parameter
-      },
-      "fc_bias");
-
-  PDNode* elementwise_add = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "elementwise_add";
-      },
-      "elementwise_add");
-
-  PDNode* add_out = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsVar() &&                       // basic
-               VarLinksFromOp(x, "elementwise_add") &&  // link
-               !x->Var()->Proto()->persistable();       // is a parameter
-      },
-      "add_out");
-
-  std::set<std::string> acts({"sigmoid", "tanh", "relu", "identity"});
-  PDNode* act = pattern->NewNode(
-      [=](Node* x) { return x && x->IsOp() && acts.count(x->Op()->Type()); },
-      "act");
-
-  PDNode* fc_out = pattern->NewNode(
-      [](Node* x) {
-        return x && x->IsVar() &&                  // basic
-               !x->Var()->Proto()->persistable();  // is a parameter
-      },
-      "fc_out");
-
-  fc_mul->LinksFrom({fc_w, fc_x}).LinksTo({mul_out});
-  elementwise_add->LinksFrom({mul_out, fc_bias}).LinksTo({add_out});
-  act->LinksFrom({add_out}).LinksTo({fc_out});
-  return fc_out;
-}
-
-void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init("seq_concat_fc_fuse", graph);
-  GraphPatternDetector detector;
-  auto* pattern = detector.mutable_pattern();
-  auto* concat_out = BuildSeqExpandConcatPattern(pattern);
-  BuildFCPattern(pattern, concat_out);
-
-#define GET_NODE(id, pattern)                               \
-  PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
-                 "pattern has no Node called %s", #id);     \
-  auto* id = subgraph.at(pattern.RetrieveNode(#id));        \
-  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
-
-  int fuse_count{0};
-
-  detector(graph, [&](const GraphPatternDetector::subgraph_t& subgraph,
-                      Graph* graph) {
-    VLOG(4) << "get one concat pattern";
-    // fc
-    GET_NODE(fc_w, detector.pattern());
-    GET_NODE(fc_bias, detector.pattern());
-    GET_NODE(act, detector.pattern());
-    GET_NODE(fc_out, detector.pattern());
-
-    // concat
-    GET_NODE(concat_in0, detector.pattern());
-    GET_NODE(sequence_expand0_in, detector.pattern());
-    GET_NODE(sequence_expand1_in, detector.pattern());
-
-    OpDesc op_desc;
-    op_desc.SetType("fusion_seqexpand_concat_fc");
-    op_desc.SetInput("X", {concat_in0->Name(), sequence_expand0_in->Name(),
-                           sequence_expand1_in->Name()});
-    op_desc.SetInput("FCWeight", {fc_w->Name()});
-    op_desc.SetInput("FCBias", {fc_bias->Name()});
-    const std::string fc_out_tmp = fc_out->Name() + ".tmp";
-    param_scope()->Var(fc_out_tmp)->GetMutable<framework::LoDTensor>();
-    op_desc.SetOutput("FCOut", {fc_out_tmp});
-    op_desc.SetOutput("Out", {fc_out->Name()});
-    op_desc.SetAttr("fc_activation", act->Op()->Type());
-
-    auto* op_node = graph->CreateOpNode(&op_desc);
-    // Add links
-    IR_NODE_LINK_TO(fc_w, op_node);
-    IR_NODE_LINK_TO(fc_bias, op_node);
-    IR_NODE_LINK_TO(concat_in0, op_node);
-    IR_NODE_LINK_TO(sequence_expand0_in, op_node);
-    IR_NODE_LINK_TO(sequence_expand1_in, op_node);
-    IR_NODE_LINK_TO(op_node, fc_out);
-
-    // Clean nodes.
-    std::unordered_set<const Node*> marked_nodes;
-    for (auto& item : subgraph) {
-      marked_nodes.insert(item.second);
-    }
-    marked_nodes.erase(fc_w);
-    marked_nodes.erase(fc_bias);
-    marked_nodes.erase(concat_in0);
-    marked_nodes.erase(sequence_expand0_in);
-    marked_nodes.erase(sequence_expand1_in);
-    marked_nodes.erase(fc_out);
-    GraphSafeRemoveNodes(graph, marked_nodes);
-
-    ++fuse_count;
-  });
-
-  AddStatis(fuse_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(seq_concat_fc_fuse_pass,
-              paddle::framework::ir::SeqConcatFcFusePass);
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
deleted file mode 100644
index d68840a554777e64082f7f9e467221bc0948d9dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class SeqConcatFcFusePass : public FusePassBase {
- public:
-  virtual ~SeqConcatFcFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
deleted file mode 100644
index 556d28a42ae8d664712417add43732cb57f67355..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h"
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-
-  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "X"))
-                  ->assert_is_op_input("sequence_conv")
-                  ->assert_var_not_persistable();
-  patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope);
-  fuse_pattern(x);
-
-  // Create New OpDesc
-  auto fuse_creator = [&](Node* seqconv, Node* input, Node* seqconv_weight,
-                          Node* eltadd_bias, Node* relu_out) {
-    OpDesc op_desc;
-    op_desc.SetType("fusion_seqconv_eltadd_relu");
-    op_desc.SetInput("X", {input->Name()});
-    op_desc.SetInput("Filter", {seqconv_weight->Name()});
-    op_desc.SetInput("Bias", {eltadd_bias->Name()});
-    op_desc.SetAttr("contextLength", seqconv->Op()->GetAttr("contextLength"));
-    op_desc.SetAttr("contextStart", seqconv->Op()->GetAttr("contextStart"));
-    op_desc.SetAttr("contextStride", seqconv->Op()->GetAttr("contextStride"));
-    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    auto& scope = graph->Get<Scope>(kParamScopeAttr);
-    const std::string ColMat = patterns::UniqueKey("SeqConvColMat");
-    op_desc.SetOutput("ColMat", {ColMat});
-    op_desc.SetOutput("Out", {relu_out->Name()});
-    scope.Var(ColMat)->GetMutable<LoDTensor>();
-
-    auto* op = graph->CreateOpNode(&op_desc);
-    IR_NODE_LINK_TO(input, op);
-    IR_NODE_LINK_TO(seqconv_weight, op);
-    IR_NODE_LINK_TO(eltadd_bias, op);
-    IR_NODE_LINK_TO(op, relu_out);
-    return op;
-  };
-
-  int fusion_count{0};
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle SeqConv EltAdd Relu fuse";
-    GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(eltadd, eltadd, fuse_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(eltadd_bias, eltadd_bias, fuse_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(eltadd_out, eltadd_out, fuse_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, fuse_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, fuse_pattern);
-
-    fuse_creator(seqconv, subgraph.at(x), seqconv_weight, eltadd_bias,
-                 relu_out);
-    std::unordered_set<const Node*> marked_nodes(
-        {seqconv, seqconv_out, eltadd, eltadd_out, relu});
-    GraphSafeRemoveNodes(graph, marked_nodes);
-    ++fusion_count;
-  };
-
-  gpd(graph, handler);
-
-  return fusion_count;
-}
-
-void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init(name_scope_, graph);
-
-  int fusion_count = BuildFusion(graph, name_scope_, param_scope());
-  AddStatis(fusion_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(seqconv_eltadd_relu_fuse_pass,
-              paddle::framework::ir::SeqConvEltAddReluFusePass);
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
deleted file mode 100644
index fde9b586c85712b14d285cec49f9e09efad78fc7..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class SeqConvEltAddReluFusePass : public FusePassBase {
- public:
-  virtual ~SeqConvEltAddReluFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
deleted file mode 100644
index 4ac379eb0471ea1a8a72c393dad405be90b2fa33..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-#define MAX_CONCAT_INPUTS 200
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
-                                  const std::string& name_scope,
-                                  int num_inputs) {
-  auto is_concat_op_with_inputs = [](Node* x, int num) -> bool {
-    return x && x->IsOp() && x->Op()->Type() == "concat" &&
-           x->Op()->Input("X").size() == static_cast<size_t>(num);
-  };
-
-  auto is_nth_input_var_of_concat = [=](Node* x, int idx) -> bool {
-    return x && x->IsVar() && VarLinksToOp(x, "concat") &&
-           x->outputs.size() == 1 && IsNthInput(x, x->outputs[0], "X", idx) &&
-           is_concat_op_with_inputs(x->outputs[0], num_inputs);
-  };
-
-  auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=](
-      Node* x, const std::string& type, int idx) -> bool {
-    bool this_is_seqpool_op =
-        x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
-        x->Op()->HasAttr("pooltype") &&
-        boost::get<std::string>(x->Op()->GetAttr("pooltype")) == type &&
-        x->outputs.size() == 2;  // seqpool should only have 2 outputs
-    bool satisfied_all = this_is_seqpool_op;
-    if (this_is_seqpool_op) {
-      // Only one output of seqpool_op is nth_input_var of concat,
-      // the other one should be unused empty var.
-      if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
-        satisfied_all = satisfied_all && x->outputs[1]->IsVar() &&
-                        x->outputs[1]->outputs.empty();
-      } else {
-        satisfied_all =
-            satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) &&
-            x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0;
-      }
-    }
-    return satisfied_all;
-  };
-
-  auto* concat_op = pattern->NewNode(
-      [=](Node* x) { return is_concat_op_with_inputs(x, num_inputs); },
-      name_scope + "/concat_op");
-  concat_op->assert_op_attr<int>("axis", 1);
-
-  auto* concat_out_var = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsVar() && VarLinksFromOp(x, "concat") &&
-               x->inputs.size() == 1 &&
-               is_concat_op_with_inputs(x->inputs[0], num_inputs);
-      },
-      name_scope + "/concat_out_var");
-  concat_out_var->assert_is_only_output_of_op("concat");
-
-  std::vector<PDNode*> seqpool_ops_input_var(num_inputs);
-  std::vector<PDNode*> seqpool_ops_output_var(num_inputs);
-  std::vector<PDNode*> seqpool_ops_output_unused_var(num_inputs);
-  std::vector<PDNode*> seqpool_ops(num_inputs);
-
-  for (int i = 0; i < num_inputs; ++i) {
-    seqpool_ops_output_var[i] = pattern->NewNode(
-        [=](Node* x) {
-          return x && x->IsVar() && is_nth_input_var_of_concat(x, i) &&
-                 x->inputs.size() == 1 &&
-                 is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0],
-                                                                   "SUM", i);
-        },
-        name_scope + "/sequence_pool_out_" + std::to_string(i));
-
-    seqpool_ops_output_unused_var[i] = pattern->NewNode(
-        [=](Node* x) {
-          return x && x->IsVar() && x->inputs.size() == 1 &&
-                 x->outputs.size() == 0 &&
-                 is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0],
-                                                                   "SUM", i);
-        },
-        name_scope + "/sequence_pool_unused_out_" + std::to_string(i));
-
-    seqpool_ops[i] = pattern->NewNode(
-        [=](Node* x) {
-          return x && x->IsOp() &&
-                 is_seqpool_op_with_pootype_of_nth_input_of_concat(x, "SUM", i);
-        },
-        name_scope + "/sequence_pool_op_" + std::to_string(i));
-
-    seqpool_ops_input_var[i] = pattern->NewNode(
-        [=](Node* x) {
-          bool basic = x && x->IsVar() && x->outputs.size() >= 1;
-          bool next_is_fine = false;
-          for (auto* o : x->outputs) {
-            if (is_seqpool_op_with_pootype_of_nth_input_of_concat(o, "SUM",
-                                                                  i)) {
-              next_is_fine = true;
-              break;
-            }
-          }
-          return basic && next_is_fine;
-        },
-        name_scope + "/sequence_pool_in_" + std::to_string(i));
-
-    // Links
-    seqpool_ops[i]
-        ->LinksFrom({seqpool_ops_input_var[i]})
-        .LinksTo({seqpool_ops_output_var[i], seqpool_ops_output_unused_var[i]});
-  }
-  concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var});
-  return concat_out_var;
-}
-
-static int BuildFusion(Graph* graph, const std::string& name_scope,
-                       int num_inputs) {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-  BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs);
-
-  auto retrieve_node = [](const std::string& name,
-                          const GraphPatternDetector::subgraph_t& subgraph,
-                          const PDPattern& pat) -> Node* {
-    PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)),
-                   "pattern has no Node called %s", name.c_str());
-    Node* p = subgraph.at(pat.RetrieveNode(name));
-    PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str());
-    return p;
-  };
-
-  int fusion_count{0};
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle SeqPool Concat fuse";
-    std::vector<std::string> input_names(num_inputs);
-    std::vector<Node*> input_vars(num_inputs);
-    auto& fused_pattern = gpd.pattern();
-    for (int i = 0; i < num_inputs; ++i) {
-      input_vars[i] =
-          retrieve_node(name_scope + "/sequence_pool_in_" + std::to_string(i),
-                        subgraph, fused_pattern);
-      input_names[i] = input_vars[i]->Name();
-    }
-    auto* concat_op =
-        retrieve_node(name_scope + "/concat_op", subgraph, fused_pattern);
-    auto* concat_out_var =
-        retrieve_node(name_scope + "/concat_out_var", subgraph, fused_pattern);
-    auto* seqpool_op0 = retrieve_node(name_scope + "/sequence_pool_op_0",
-                                      subgraph, fused_pattern);
-
-    // Create New OpDesc
-    OpDesc op_desc;
-    op_desc.SetType("fusion_seqpool_concat");
-    op_desc.SetInput("X", input_names);
-    op_desc.SetAttr("pooltype", seqpool_op0->Op()->GetAttr("pooltype"));
-    op_desc.SetAttr("axis", concat_op->Op()->GetAttr("axis"));
-    op_desc.SetOutput("Out", {concat_out_var->Name()});
-    auto* op = graph->CreateOpNode(&op_desc);
-    for (size_t i = 0; i < input_vars.size(); ++i) {
-      IR_NODE_LINK_TO(input_vars[i], op);
-    }
-    IR_NODE_LINK_TO(op, concat_out_var);
-
-    std::unordered_set<const Node*> marked_nodes;
-    for (auto& item : subgraph) {
-      marked_nodes.insert(item.second);
-    }
-    for (size_t i = 0; i < input_vars.size(); ++i) {
-      marked_nodes.erase(input_vars[i]);
-    }
-    marked_nodes.erase(concat_out_var);
-    GraphSafeRemoveNodes(graph, marked_nodes);
-    ++fusion_count;
-  };
-
-  gpd(graph, handler);
-  return fusion_count;
-}
-
-void SeqPoolConcatFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init(name_scope_, graph);
-  int fusion_count = 0;
-  for (int i = MAX_CONCAT_INPUTS; i > 0; --i) {
-    fusion_count +=
-        BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i);
-  }
-  AddStatis(fusion_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(seqpool_concat_fuse_pass,
-              paddle::framework::ir::SeqPoolConcatFusePass);
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
deleted file mode 100644
index 40a9edc5e642320996f5bd3451479fe347f24081..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/**
- * Fuse SequencePool(with sum pooltype yet) and Concat;
- *
- * Before fuse:
- *    |         |             |
- * seq_pool, seq_pool, ... seq_pool
- *    \         |      ...   /
- *            concat
- *              |
- * After fuse:
- *    \      |       /
- *   FusionSeqPoolConcat
- *           |
- */
-class SeqPoolConcatFusePass : public FusePassBase {
- public:
-  virtual ~SeqPoolConcatFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  const std::string name_scope_{"seqpool_concat_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
deleted file mode 100644
index d3668038518429ee04b6abba5b1f7f09eea1c9f3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  if (type == "sequence_pool") {
-    op->SetInput("X", {inputs[0]});
-    std::string pooltype = "SUM";
-    op->SetAttr("pooltype", pooltype);
-    op->SetOutput("MaxIndex", {outputs[0]});
-    op->SetOutput("Out", {outputs[1]});
-  } else if (type == "concat") {
-    op->SetInput("X", inputs);
-    op->SetAttr("axis", 1);
-    op->SetOutput("Out", {outputs[0]});
-  } else {
-    op->SetInput("X", inputs);
-    op->SetOutput("Out", outputs);
-  }
-  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-              static_cast<int>(OpRole::kForward));
-}
-
-int CountOpType(const ir::Graph* graph,
-                const std::string& op_type = "fusion_seqpool_concat") {
-  int count = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()->Type() == op_type) {
-      ++count;
-    }
-  }
-  return count;
-}
-
-std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
-    std::unique_ptr<ir::Graph> graph, int* before, int* after,
-    const std::string& pass_type = "seqpool_concat_fuse_pass") {
-  auto pass = PassRegistry::Instance().Get(pass_type);
-  *before = graph->Nodes().size();
-  graph.reset(pass->Apply(graph.release()));
-  *after = graph->Nodes().size();
-  return graph;
-}
-
-/*
- * Before fuse:
- *    a         b         c
- *    |         |         |
- *   op1       op2       op3
- *   / \       / \       / \
- *  d  e      f   g     h   i
- *      \         |        /
- *            concat
- *              |
- *              j
- * Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr
- *
- * After fuse:
- *    a         b         c
- *    \         |        /
- *    fusion_seqpool_concat
- *              |
- *              j
- */
-TEST(SeqPoolConcatFusePass, basic) {
-  ProgramDesc prog;
-  for (auto& v : std::vector<std::string>(
-           {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::LOD_TENSOR);
-  }
-
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"d", "e"}));
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"f", "g"}));
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"c"}),
-        std::vector<std::string>({"h", "i"}));
-  SetOp(&prog, "concat", std::vector<std::string>({"e", "g", "i"}),
-        std::vector<std::string>({"j"}));
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  int before, after;
-  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
-  // Remove 10 Nodes: op1, op2, op3, d, e, f, g, h, i, concat_op
-  // Add 1 Node: fusion_seqpool_concat
-  EXPECT_EQ(after, before - 9);
-  EXPECT_EQ(CountOpType(graph.get()), 1);
-}
-
-/*
- * Before fuse:
- *    a            b
- *    |           /  \
- *   op1        op2  op3
- *   / \        / \    \
- *  c  d       e   f    g
- *      \         /
- *        concat
- *          |
- *          h
- * Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr
- *
- * After fuse:
- *   a                         b
- *    \                     /     \
- *    fusion_seqpool_concat       op3
- *              |                  |
- *              h                  g
- */
-TEST(SeqPoolConcatFusePass, advanced) {
-  ProgramDesc prog;
-  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::LOD_TENSOR);
-  }
-
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"c", "d"}));
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"e", "f"}));
-  SetOp(&prog, "op3", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"g"}));
-  SetOp(&prog, "concat", std::vector<std::string>({"d", "f"}),
-        std::vector<std::string>({"h"}));
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  int before, after;
-  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
-  // Remove 7 Nodes: op1, op2, c, d, e, f concat_op
-  // Add 1 Node: fusion_seqpool_concat
-  EXPECT_EQ(after, before - 6);
-  EXPECT_EQ(CountOpType(graph.get()), 1);
-}
-
-ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
-  ProgramDesc prog;
-  auto new_var = [&](const std::string& name) {
-    auto* var = prog.MutableBlock(0)->Var(name);
-    var->SetType(proto::VarType::LOD_TENSOR);
-  };
-  std::vector<std::string> concat_inputs;
-  for (int i = 0; i < num_inputs_of_concat; ++i) {
-    std::string prefix = "seqpool_op_" + std::to_string(i);
-    new_var(prefix + "in");
-    new_var(prefix + "out");
-    new_var(prefix + "out_unused");
-    SetOp(&prog, "sequence_pool", std::vector<std::string>({prefix + "in"}),
-          std::vector<std::string>({prefix + "out", prefix + "out_unused"}));
-    concat_inputs.push_back(prefix + "out");
-  }
-  SetOp(&prog, "concat", concat_inputs,
-        std::vector<std::string>({"concat_out"}));
-  return prog;
-}
-
-// test more inputs of concat
-TEST(SeqPoolConcatFusePass, more_inputs) {
-  for (int num : {1, 2, 10}) {
-    ProgramDesc prog = BuildProgramDesc(num);
-    std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-    int before, after;
-    graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
-    // Remove Nodes: n * (seqpool_op, out, out_unused), and concat_op
-    // Add Node: fusion_seqpool_concat op
-    EXPECT_EQ(after, before - num * 3);
-    EXPECT_EQ(CountOpType(graph.get()), 1);
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(seqpool_concat_fuse_pass);
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
deleted file mode 100644
index 8261bfc15348f90af4ed7acb9e5b68373dc5e715..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h"
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-namespace {
-static PDNode* BuildCVMConcatPattern(PDPattern* pattern) {
-  auto cvm_behind_x = [](Node* x) -> bool {
-    Node* adj = x->inputs[0];
-    Node* alt = x->inputs[0]->inputs[0];
-    return x && adj && adj->IsVar() && alt->IsOp() &&
-           alt->Op()->Type() == "cvm";
-  };
-  auto* concat_op_node = pattern->NewNode("concat_op")
-                             ->assert_is_op("concat")
-                             ->assert_op_attr<int>("axis", 1)
-                             ->assert_more(cvm_behind_x);
-  return concat_op_node;
-}
-
-static void GetConcatNodes(ir::Graph* graph, std::vector<Node*>* concat_nodes) {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-  auto concat_op_node = BuildCVMConcatPattern(pattern);
-  GraphPatternDetector::handle_t handler = [&](
-      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-    Node* concat_op = subgraph.at(concat_op_node);
-    concat_nodes->push_back(concat_op);
-  };
-  gpd(graph, handler);
-}
-}  // anonymous namespace
-
-void SeqPoolCVMConcatFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init("seqpool_cvm_concat_fuse", graph);
-  std::vector<Node*> concat_nodes;
-  GetConcatNodes(graph, &concat_nodes);
-
-  int count = 0;
-  for (auto* concat_node : concat_nodes) {
-    GraphPatternDetector gpd;
-    auto* pattern = gpd.mutable_pattern();
-    auto concat_before_x = [=](Node* x) -> bool {
-      return x && x->outputs[0] == concat_node;
-    };
-    PDNode* seqpool_in_var_node =
-        pattern->NewNode("seqpool_in_var")
-            ->assert_is_only_input_of_op("sequence_pool");
-    PDNode* seqpool_op_node =
-        pattern->NewNode("seqpool_op")
-            ->assert_is_op("sequence_pool")
-            ->assert_op_attr<std::string>("pooltype", "SUM");
-    PDNode* seqpool_out_var_node =
-        pattern->NewNode("seqpool_out_var")
-            ->assert_is_op_nth_output("sequence_pool", "Out", 0)
-            ->assert_is_op_nth_input("cvm", "X", 0);
-    PDNode* seqpool_idx_out_var_node =
-        pattern->NewNode("seqpool_idx_out_var")
-            ->assert_is_op_nth_output("sequence_pool", "MaxIndex", 0);
-    PDNode* cvm_op_node =
-        pattern->NewNode("cvm_op")->assert_is_op("cvm")->assert_op_attr<bool>(
-            "use_cvm", true);
-    PDNode* cvm_out_var_node = pattern->NewNode("cvm_op_out_var")
-                                   ->assert_is_op_nth_output("cvm", "Y", 0)
-                                   ->assert_more(concat_before_x);
-    PDNode* cvm_cvm_in_var_node = pattern->NewNode("cvm_cvm_in_var")
-                                      ->assert_is_op_nth_input("cvm", "CVM", 0);
-
-    seqpool_op_node->LinksFrom({seqpool_in_var_node})
-        .LinksTo({seqpool_out_var_node, seqpool_idx_out_var_node});
-    seqpool_out_var_node->LinksFrom({seqpool_op_node}).LinksTo({cvm_op_node});
-    cvm_op_node->LinksTo({cvm_out_var_node})
-        .LinksFrom({cvm_cvm_in_var_node, seqpool_out_var_node});
-
-    std::unordered_map<std::string, Node*> ins_to_concat;
-    std::vector<Node*> subgraph_ins;
-    std::vector<std::string> subgraph_ins_name;
-    std::unordered_set<const Node*> marked_nodes;
-
-    Node* cvm_input_of_cvm;
-    Node* concat_out_var = concat_node->outputs[0];
-
-    GraphPatternDetector::handle_t handler = [&](
-        const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-      Node* seqpool_in_var = subgraph.at(seqpool_in_var_node);
-      Node* seqpool_op = subgraph.at(seqpool_op_node);
-      Node* seqpool_out_var = subgraph.at(seqpool_out_var_node);
-      Node* seqpool_idx_out_var = subgraph.at(seqpool_idx_out_var_node);
-      Node* cvm_op = subgraph.at(cvm_op_node);
-      Node* cvm_out_var = subgraph.at(cvm_out_var_node);
-      cvm_input_of_cvm = subgraph.at(cvm_cvm_in_var_node);
-      marked_nodes.insert({seqpool_op, seqpool_out_var, seqpool_idx_out_var,
-                           cvm_op, cvm_out_var, concat_node});
-      ins_to_concat[cvm_out_var->Name()] = seqpool_in_var;
-    };
-    gpd(graph, handler);
-
-    if (!ins_to_concat.empty()) {
-      for (const auto* in : concat_node->inputs) {
-        subgraph_ins.push_back(ins_to_concat.at(in->Name()));
-        subgraph_ins_name.push_back(ins_to_concat.at(in->Name())->Name());
-      }
-
-      // Create New OpDesc
-      OpDesc op_desc;
-      op_desc.SetType("fusion_seqpool_cvm_concat");
-      op_desc.SetInput("X", subgraph_ins_name);
-      op_desc.SetInput("CVM", {cvm_input_of_cvm->Name()});
-      op_desc.SetAttr("pooltype", std::string("SUM"));
-      op_desc.SetAttr("use_cvm", true);
-      op_desc.SetAttr("axis", concat_node->Op()->GetAttr("axis"));
-      op_desc.SetOutput("Out", {concat_out_var->Name()});
-      auto* op = graph->CreateOpNode(&op_desc);
-
-      for (size_t i = 0; i < subgraph_ins.size(); ++i) {
-        IR_NODE_LINK_TO(subgraph_ins[i], op);
-      }
-      IR_NODE_LINK_TO(cvm_input_of_cvm, op);
-      IR_NODE_LINK_TO(op, concat_out_var);
-
-      GraphSafeRemoveNodes(graph, marked_nodes);
-      count++;
-    }
-  }
-  AddStatis(count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(seqpool_cvm_concat_fuse_pass,
-              paddle::framework::ir::SeqPoolCVMConcatFusePass);
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
deleted file mode 100644
index 88a41983c6bf7b4e76d7912dbb3821b2c2ed533b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/**
- * Fuse SequencePool(with sum pooltype yet) and Concat;
- *
- * Before fuse:
- *    |         |             |
- * seq_pool, seq_pool, ... seq_pool
- *    |         |             |
- *   cvm       cvm           cvm
- *    \         |      ...   /
- *            concat
- *              |
- * After fuse:
- *    \      |       /
- * FusionSeqPoolCVMConcat
- *           |
- */
-class SeqPoolCVMConcatFusePass : public FusePassBase {
- public:
-  virtual ~SeqPoolCVMConcatFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  const std::string name_scope_{"seqpool_cvm_concat_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc
deleted file mode 100644
index bba640cf148d1ebfc2583b420c3ffd8ff1d110f1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc
+++ /dev/null
@@ -1,239 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  if (type == "sequence_pool") {
-    op->SetInput("X", {inputs[0]});
-    std::string pooltype = "SUM";
-    op->SetAttr("pooltype", pooltype);
-    op->SetOutput("MaxIndex", {outputs[0]});
-    op->SetOutput("Out", {outputs[1]});
-  } else if (type == "concat") {
-    op->SetInput("X", inputs);
-    op->SetAttr("axis", 1);
-    op->SetOutput("Out", {outputs[0]});
-  } else if (type == "cvm") {
-    op->SetInput("X", {inputs[0]});
-    op->SetInput("CVM", {inputs[1]});
-    op->SetOutput("Y", {outputs[0]});
-    op->SetAttr("use_cvm", true);
-  } else {
-    op->SetInput("X", inputs);
-    op->SetOutput("Out", outputs);
-  }
-  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-              static_cast<int>(OpRole::kForward));
-}
-
-int CountOpType(const ir::Graph* graph,
-                const std::string& op_type = "fusion_seqpool_cvm_concat") {
-  int count = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()->Type() == op_type) {
-      ++count;
-    }
-  }
-  return count;
-}
-
-std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
-    std::unique_ptr<ir::Graph> graph, int* before, int* after,
-    const std::string& pass_type = "seqpool_cvm_concat_fuse_pass") {
-  auto pass = PassRegistry::Instance().Get(pass_type);
-  *before = graph->Nodes().size();
-  graph.reset(pass->Apply(graph.release()));
-  *after = graph->Nodes().size();
-  return graph;
-}
-
-/*
- * Before fuse:
- *
- *
- *    a          b          c
- *    |          |          |
- *   op1        op2        op3
- *   / \        / \        / \
- *  d  e  n    f   g   n   h  i   n
- *     |  /        |  /       |  /
- *    op4         op5        op6
- *     |           |          |
-       j           k          l
- *     \           |         /
- *               concat
- *                 |
- *                 m
- *
- * Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr.
- * Type of op4, op5 and op6 are cvm, with use_cvm is true.
- *
- * After fuse:
- *    a      b      c      n
- *    \      |      |     /
- *  fusion_seqpool_cvm_concat
- *              |
- *              m
- */
-TEST(SeqPoolCVMConcatFusePass, basic) {
-  ProgramDesc prog;
-  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
-                                 "j", "k", "l", "m", "n"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::LOD_TENSOR);
-  }
-
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"d", "e"}));
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"f", "g"}));
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"c"}),
-        std::vector<std::string>({"h", "i"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"e", "n"}),
-        std::vector<std::string>({"j"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"g", "n"}),
-        std::vector<std::string>({"k"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"i", "n"}),
-        std::vector<std::string>({"l"}));
-  SetOp(&prog, "concat", std::vector<std::string>({"j", "k", "l"}),
-        std::vector<std::string>({"m"}));
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  int before, after;
-  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
-  // Remove 16 Nodes: op1, op2, op3, op4, op5, op6, d, e, f, g, h, i, j, k, l,
-  // concat_op
-  // Add 1 Node: fusion_seqpool_cvm_concat
-  EXPECT_EQ(after, before - 15);
-  EXPECT_EQ(CountOpType(graph.get()), 1);
-}
-
-/*
- * Before fuse:
- *    a               b
- *    |           /       \
- *   op1  k     op2   k   op3
- *   / \ /      / \  /      \
- *  c  d       e   f         g
- *     |           |
- *    op4         op5
- *     |           |
- *     h           i
- *      \         /
- *        concat
- *          |
- *          j
- * Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr.
- * Type of op4 and op5 are cvm, with use_cvm is true.
- *
- * After fuse:
- *   a          k              b
- *    \         |           /     \
- *   fusion_seqpool_cvm_concat    op3
- *              |                  |
- *              j                  g
- */
-TEST(SeqPoolCVMConcatFusePass, advanced) {
-  ProgramDesc prog;
-  for (auto& v : std::vector<std::string>(
-           {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::LOD_TENSOR);
-  }
-
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"c", "d"}));
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"e", "f"}));
-  SetOp(&prog, "op3", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"g"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"d", "k"}),
-        std::vector<std::string>({"h"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"f", "k"}),
-        std::vector<std::string>({"i"}));
-  SetOp(&prog, "concat", std::vector<std::string>({"h", "i"}),
-        std::vector<std::string>({"j"}));
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  int before, after;
-  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
-  // Remove 11 Nodes: op1, op2, op4, op5, c, d, e, f, h, i, concat_op
-  // Add 1 Node: fusion_seqpool_cvm_concat
-  EXPECT_EQ(after, before - 10);
-  EXPECT_EQ(CountOpType(graph.get()), 1);
-}
-
-ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
-  ProgramDesc prog;
-  auto new_var = [&](const std::string& name) {
-    auto* var = prog.MutableBlock(0)->Var(name);
-    var->SetType(proto::VarType::LOD_TENSOR);
-  };
-  std::vector<std::string> concat_inputs;
-  new_var("cvm_in");
-  for (int i = 0; i < num_inputs_of_concat; ++i) {
-    std::string seqpool_prefix = "seqpool_op_" + std::to_string(i);
-    new_var(seqpool_prefix + "in");
-    new_var(seqpool_prefix + "out");
-    new_var(seqpool_prefix + "out_unused");
-    SetOp(&prog, "sequence_pool",
-          std::vector<std::string>({seqpool_prefix + "in"}),
-          std::vector<std::string>(
-              {seqpool_prefix + "out_unused", seqpool_prefix + "out"}));
-
-    std::string cvm_prefix = "cvm_op_" + std::to_string(i);
-    new_var(cvm_prefix + "out");
-    SetOp(&prog, "cvm",
-          std::vector<std::string>({seqpool_prefix + "out", "cvm_in"}),
-          std::vector<std::string>({cvm_prefix + "out"}));
-
-    concat_inputs.push_back(cvm_prefix + "out");
-  }
-  SetOp(&prog, "concat", concat_inputs,
-        std::vector<std::string>({"concat_out"}));
-  return prog;
-}
-
-// test more inputs of concat
-TEST(SeqPoolCVMConcatFusePass, more_inputs) {
-  for (int num : {1, 2, 10}) {
-    ProgramDesc prog = BuildProgramDesc(num);
-    std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-    int before, after;
-    graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
-    // Remove Nodes: n * (seqpool_op, seqpool_out, out_unused, cvm_op, cvm_out),
-    // and concat_op
-    // Add Node: fusion_seqpool_cvm_concat op
-    EXPECT_EQ(after, before - num * 5);
-    EXPECT_EQ(CountOpType(graph.get()), 1);
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(seqpool_cvm_concat_fuse_pass);
diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
deleted file mode 100644
index e55783637a6e08578ef7717ba9768f7eece7ca8f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
-#define GET_NODES             \
-  GET_IR_NODE(reshape1_op);   \
-  GET_IR_NODE(reshape1_out);  \
-  GET_IR_NODE(transpose_op);  \
-  GET_IR_NODE(transpose_out); \
-  GET_IR_NODE(reshape2_op);   \
-  GET_IR_NODE(reshape2_out);
-
-void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
-  const std::string pattern_name = "shufflechannel_pattern";
-  FusePassBase::Init(pattern_name, graph);
-
-  GraphPatternDetector gpd;
-  auto* x = gpd.mutable_pattern()
-                ->NewNode("x")
-                ->assert_is_op_input("reshape2", "X")
-                ->AsInput();
-
-  patterns::ShuffleChannelPattern pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(x);
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_NODES;
-
-    PADDLE_ENFORCE(subgraph.count(x));
-    auto* input_node = subgraph.at(x);
-    auto reshape1_desc = reshape1_op->Op();
-    auto reshape2_desc = reshape2_op->Op();
-    std::string input_name = input_node->Name();
-    std::string output_name = reshape2_out->Name();
-
-    auto reshape1_shape =
-        boost::get<std::vector<int>>(reshape1_desc->GetAttr("shape"));
-    auto reshape2_shape =
-        boost::get<std::vector<int>>(reshape2_desc->GetAttr("shape"));
-
-    int i_c = reshape1_shape[2];
-    int o_c = reshape2_shape[1];
-    int group = o_c / i_c;
-
-    framework::OpDesc new_op_desc;
-    new_op_desc.SetType("shuffle_channel");
-    new_op_desc.SetInput("X", {input_name});
-    new_op_desc.SetOutput("Out", {output_name});
-
-    new_op_desc.SetAttr("group", group);
-    new_op_desc.Flush();
-
-    // Create a new node for the fused op.
-    auto* new_op = graph->CreateOpNode(&new_op_desc);
-
-    IR_NODE_LINK_TO(input_node, new_op);
-    IR_NODE_LINK_TO(new_op, reshape2_out);
-
-    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph, {reshape1_op, reshape1_out, transpose_op,
-                                 transpose_out, reshape2_op});
-  };
-
-  gpd(graph, handler);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(shuffle_channel_detect_pass,
-              paddle::framework::ir::ShuffleChannelDetectPass);
diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h
deleted file mode 100644
index 008f8013efd28b3cdc5a846662653e07e45e3985..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class ShuffleChannelDetectPass : public FusePassBase {
- public:
-  virtual ~ShuffleChannelDetectPass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc
deleted file mode 100644
index b3606e4d922cc8f59dca90904466a889f83f6094..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc
+++ /dev/null
@@ -1,233 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void RunSimplifyAnakinDetection(ir::Graph *graph, int times, bool is_density,
-                                bool is_reshape) {
-  const std::string pattern_name =
-      "simplify_anakin_detection_pattern_pass" + std::to_string(times);
-  std::string priorbox_type = is_density ? "density_prior_box" : "prior_box";
-
-  GraphPatternDetector gpd;
-  std::vector<PDNode *> input_nodes;
-  for (int i = 0; i < times; i++) {
-    input_nodes.push_back(gpd.mutable_pattern()
-                              ->NewNode("x" + std::to_string(i))
-                              ->assert_is_op_input(priorbox_type, "Input")
-                              ->AsInput());
-  }
-  input_nodes.push_back(gpd.mutable_pattern()
-                            ->NewNode("x" + std::to_string(times))
-                            ->assert_is_op_input("box_coder", "TargetBox")
-                            ->AsInput());
-
-  input_nodes.push_back(gpd.mutable_pattern()
-                            ->NewNode("x" + std::to_string(times + 1))
-                            ->assert_is_op_input("transpose2")
-                            ->AsInput());
-
-  patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(input_nodes, times, priorbox_type, is_reshape);
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
-                     Graph *g) {
-    const int kNumFields = 7;
-    const int kPriorBoxLocOffset = 1;
-    const int kReshape1Offset = 2;
-    const int kReshape1OutOffset = 3;
-    const int kPriorBoxVarOffset = 4;
-    const int kReshape2Offset = 5;
-    const int kReshape2OutOffset = 6;
-    std::vector<Node *> nodes;
-
-    for (int i = 0; i < times; i++) {
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
-
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
-
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
-    }
-
-    Node *concat_op1 = subgraph.at(pattern.GetPDNode("concat1"));
-    Node *concat_out1 = subgraph.at(pattern.GetPDNode("concat1_out"));
-
-    Node *concat_op2 = subgraph.at(pattern.GetPDNode("concat2"));
-    Node *concat_out2 = subgraph.at(pattern.GetPDNode("concat2_out"));
-
-    Node *box_coder_third_input = subgraph.at(input_nodes[times]);
-    Node *box_coder_op = subgraph.at(pattern.GetPDNode("box_coder"));
-    Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out"));
-
-    Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
-    Node *transpose_before_nms =
-        subgraph.at(pattern.GetPDNode("transpose_before_nms"));
-    Node *transpose_before_nms_out =
-        subgraph.at(pattern.GetPDNode("transpose_before_nms_out"));
-
-    Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms"));
-    Node *multiclass_nms_out =
-        subgraph.at(pattern.GetPDNode("multiclass_nms_out"));
-
-    std::string code_type =
-        boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
-    bool box_normalized =
-        boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
-
-    int background_label =
-        boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
-    float score_threshold =
-        boost::get<float>(multiclass_nms->Op()->GetAttr("score_threshold"));
-    int nms_top_k = boost::get<int>(multiclass_nms->Op()->GetAttr("nms_top_k"));
-    float nms_threshold =
-        boost::get<float>(multiclass_nms->Op()->GetAttr("nms_threshold"));
-    float nms_eta = boost::get<float>(multiclass_nms->Op()->GetAttr("nms_eta"));
-    int keep_top_k =
-        boost::get<int>(multiclass_nms->Op()->GetAttr("keep_top_k"));
-
-    std::vector<std::string> concat1_input_names;
-    for (int i = 0; i < times; i++) {
-      concat1_input_names.push_back(
-          nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
-    }
-
-    framework::OpDesc concat1_desc;
-    concat1_desc.SetType("concat");
-    concat1_desc.SetInput("X", concat1_input_names);
-    concat1_desc.SetAttr("axis", 2);
-    concat1_desc.SetOutput("Out", {concat_out1->Name()});
-
-    auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc);
-
-    for (int i = 0; i < times; i++) {
-      nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(
-          new_add_concat_op);
-      new_add_concat_op->inputs.push_back(
-          nodes[i * kNumFields + kPriorBoxLocOffset]);
-    }
-
-    framework::OpDesc new_op_desc;
-    new_op_desc.SetType("detection_out");
-    new_op_desc.SetInput("PriorBox", {concat_out1->Name()});
-    new_op_desc.SetInput("TargetBox", {box_coder_third_input->Name()});
-    new_op_desc.SetInput("Scores", {multiclass_nms_second_input->Name()});
-    new_op_desc.SetAttr("code_type", code_type);
-    new_op_desc.SetAttr("box_normalized", box_normalized);
-    new_op_desc.SetAttr("background_label", background_label);
-    new_op_desc.SetAttr("score_threshold", score_threshold);
-    new_op_desc.SetAttr("nms_top_k", nms_top_k);
-    new_op_desc.SetAttr("nms_threshold", nms_threshold);
-    new_op_desc.SetAttr("nms_eta", nms_eta);
-    new_op_desc.SetAttr("keep_top_k", keep_top_k);
-    new_op_desc.SetOutput("Out", {multiclass_nms_out->Name()});
-    new_op_desc.Flush();
-
-    // Create a new node for the fused op.
-    auto *detection_out_op = graph->CreateOpNode(&new_op_desc);
-
-    std::unordered_set<const Node *> delete_nodes;
-
-    for (int i = 0; i < times; i++) {
-      nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(concat_op1);
-      delete_nodes.insert(nodes[i * kNumFields + kReshape1Offset]);
-      delete_nodes.insert(nodes[i * kNumFields + kReshape1OutOffset]);
-      delete_nodes.insert(nodes[i * kNumFields + kPriorBoxVarOffset]);
-      delete_nodes.insert(nodes[i * kNumFields + kReshape2Offset]);
-      delete_nodes.insert(nodes[i * kNumFields + kReshape2OutOffset]);
-    }
-
-    delete_nodes.insert(concat_op1);
-    delete_nodes.insert(concat_op2);
-    delete_nodes.insert(concat_out2);
-    delete_nodes.insert(box_coder_op);
-    delete_nodes.insert(box_coder_out);
-    delete_nodes.insert(transpose_before_nms);
-    delete_nodes.insert(transpose_before_nms_out);
-    delete_nodes.insert(multiclass_nms);
-
-    new_add_concat_op->outputs.push_back(concat_out1);
-    concat_out1->inputs.push_back(new_add_concat_op);
-
-    detection_out_op->inputs.push_back(concat_out1);
-    detection_out_op->inputs.push_back(box_coder_third_input);
-    detection_out_op->inputs.push_back(multiclass_nms_second_input);
-    detection_out_op->outputs.push_back(multiclass_nms_out);
-
-    concat_out1->outputs.push_back(detection_out_op);
-    box_coder_third_input->outputs.push_back(detection_out_op);
-    multiclass_nms_second_input->outputs.push_back(detection_out_op);
-    multiclass_nms_out->inputs.push_back(detection_out_op);
-
-    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph, delete_nodes);
-  };
-
-  gpd(graph, handler);
-}
-
-void SimplifyAnakinDetectionPatternPass::ApplyImpl(ir::Graph *graph) const {
-  const int pattern_nums = 6;
-  const std::string pattern_name = "simplify_anakin_detection_pattern_pass";
-  FusePassBase::Init(pattern_name, graph);
-  std::vector<bool> options = {true, false};
-  for (const auto &is_density : options) {
-    for (const auto &is_reshape : options) {
-      for (int i = 1; i <= pattern_nums; i++) {
-        RunSimplifyAnakinDetection(graph, i, is_density, is_reshape);
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-typedef paddle::framework::ir::SimplifyAnakinDetectionPatternPass
-    priorbox_pattern;
-REGISTER_PASS(simplify_anakin_priorbox_detection_out_pass, priorbox_pattern);
diff --git a/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h
deleted file mode 100644
index e882b9dc252e61a2e9e4e3666de49b7eee6d714a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// There may be many transpose-flatten structures in a model, and the output of
-// these structures will be used as inputs to the concat Op. This pattern will
-// be detected by our pass. The times here represents the repeat times of this
-// structure.
-class SimplifyAnakinDetectionPatternPass : public FusePassBase {
- public:
-  virtual ~SimplifyAnakinDetectionPatternPass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
deleted file mode 100644
index 61784f8c6656e4afd2ce3cbce1cc778079c845f4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h"
-
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * This pass is to simplify the Grpah, it may contains:
- * - replace comlicated op with basic op
- * - remove some unnecessary op
- *
- * In the current implementation, it supports:
- * - remove dropout_op (upscale_in_train) or
- *   replace dropout_op with scale_op (downgrade_in_infer) when is_test is true
- */
-void SimplifyWithBasicOpsPass::ApplyImpl(Graph* graph) const {
-  VLOG(3) << "Simplify the Graph with basic ops.";
-  std::unordered_set<const Node*> del_node_set;
-  for (Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->Op()) {
-      if (n->Op()->Type() == "dropout") {
-        SimplifyDropout(graph, n, &del_node_set);
-      }
-    }
-  }
-
-  GraphSafeRemoveNodes(graph, del_node_set);
-}
-
-bool SimplifyWithBasicOpsPass::SimplifyDropout(
-    Graph* graph, Node* n,
-    std::unordered_set<const Node*>* del_node_set) const {
-  OpDesc* dropout_op_desc = n->Op();
-  bool is_test = false;
-  // In the model used in test_analyzer_bert, the is_test's AttrType of
-  // dropout_op is INT.
-  if (dropout_op_desc->HasAttr("is_test")) {
-    if (dropout_op_desc->GetAttrType("is_test") == proto::AttrType::BOOLEAN) {
-      is_test = boost::get<bool>(dropout_op_desc->GetAttr("is_test"));
-    } else if (dropout_op_desc->GetAttrType("is_test") ==
-               proto::AttrType::INT) {
-      is_test = boost::get<int>(dropout_op_desc->GetAttr("is_test")) == 0
-                    ? false
-                    : true;
-    }
-  }
-
-  if (!is_test) {
-    return false;
-  }
-
-  Node* dropout_x = GetInputVar(n, dropout_op_desc->Input("X")[0]);
-  Node* dropout_out = GetOutputVar(n, dropout_op_desc->Output("Out")[0]);
-
-  bool upscale_in_train = false;
-  // Once the dropout_implementation's AttrType is BOOLEAN, but now is STRING.
-  if (dropout_op_desc->HasAttr("dropout_implementation")) {
-    if (dropout_op_desc->GetAttrType("dropout_implementation") ==
-        proto::AttrType::BOOLEAN) {
-      upscale_in_train =
-          boost::get<bool>(dropout_op_desc->GetAttr("dropout_implementation"));
-    } else if (dropout_op_desc->GetAttrType("dropout_implementation") ==
-               proto::AttrType::STRING) {
-      upscale_in_train = boost::get<std::string>(dropout_op_desc->GetAttr(
-                             "dropout_implementation")) == "upscale_in_train";
-    }
-  }
-
-  if (upscale_in_train) {
-    // dropout_op can be deleted.
-    // dropout_x -> dropout_op -> dropout_out -> next_op -> next_out
-    //   |
-    //  \|/
-    // dropout_x -> next_op -> next_out
-    // Check whether dropout_x is some next_op's output
-    bool dropout_x_is_reused_as_output = false;
-    for (auto* next_op : dropout_out->outputs) {
-      for (auto* next_out : next_op->outputs) {
-        if (next_out == dropout_x ||
-            next_out->Var()->Name() == dropout_x->Var()->Name()) {
-          dropout_x_is_reused_as_output = true;
-          break;
-        }
-      }
-      if (dropout_x_is_reused_as_output) {
-        break;
-      }
-    }
-    if (dropout_x_is_reused_as_output) {
-      VarDesc new_var_desc(*dropout_x->Var());
-      new_var_desc.SetName("simplify_with_basic_ops_" + dropout_x->Name());
-      auto* new_var_node = graph->CreateVarNode(&new_var_desc);
-      for (auto* out_op : dropout_x->outputs) {
-        if (out_op != n) {
-          ReplaceInputVar(out_op, dropout_x, new_var_node);
-        }
-      }
-      for (auto* in_op : dropout_x->inputs) {
-        ReplaceOutputVar(in_op, dropout_x, new_var_node);
-      }
-      dropout_x = new_var_node;
-    }
-    for (auto* next_op : dropout_out->outputs) {
-      ReplaceInputVar(next_op, dropout_out, dropout_x);
-    }
-
-    del_node_set->insert(dropout_out);
-  } else {
-    // Use a scale_op replaces the dropout_op
-    // dropout_x -> dropout_op -> dropout_out -> next_op -> next_out
-    //   |
-    //  \|/
-    // dropout_x -> scale_op -> dropout_out -> next_op -> next_out
-    float scale =
-        1.0f - boost::get<float>(dropout_op_desc->GetAttr("dropout_prob"));
-
-    framework::OpDesc new_op_desc;
-    new_op_desc.SetType("scale");
-    new_op_desc.SetInput("X", {dropout_x->Name()});
-    new_op_desc.SetOutput("Out", {dropout_out->Name()});
-    new_op_desc.SetAttr("scale", scale);
-    new_op_desc.SetAttr("bias", static_cast<float>(0));
-    new_op_desc.SetAttr("bias_after_scale", true);
-
-    auto* scale_op_node = graph->CreateOpNode(&new_op_desc);
-    IR_NODE_LINK_TO(dropout_x, scale_op_node);
-    IR_NODE_LINK_TO(scale_op_node, dropout_out);
-  }
-
-  del_node_set->insert(n);
-  return true;
-}
-
-Node* SimplifyWithBasicOpsPass::GetInputVar(Node* n,
-                                            const std::string& name) const {
-  for (auto* in : n->inputs) {
-    if (in->Name() == name) {
-      return in;
-    }
-  }
-  return nullptr;
-}
-
-Node* SimplifyWithBasicOpsPass::GetOutputVar(Node* n,
-                                             const std::string& name) const {
-  for (auto* out : n->outputs) {
-    if (out->Name() == name) {
-      return out;
-    }
-  }
-  return nullptr;
-}
-
-void SimplifyWithBasicOpsPass::ReplaceInputVar(Node* op, Node* old_var,
-                                               Node* new_var) const {
-  if (op->IsOp() && op->Op()) {
-    new_var->outputs.push_back(op);
-    for (size_t i = 0; i < op->inputs.size(); ++i) {
-      if (op->inputs[i] == old_var) {
-        op->inputs[i] = new_var;
-        op->Op()->RenameInput(old_var->Name(), new_var->Name());
-      }
-    }
-  }
-}
-
-void SimplifyWithBasicOpsPass::ReplaceOutputVar(Node* op, Node* old_var,
-                                                Node* new_var) const {
-  if (op->IsOp() && op->Op()) {
-    new_var->inputs.push_back(op);
-    for (size_t i = 0; i < op->outputs.size(); ++i) {
-      if (op->outputs[i] == old_var) {
-        op->outputs[i] = new_var;
-        op->Op()->RenameOutput(old_var->Name(), new_var->Name());
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(simplify_with_basic_ops_pass,
-              paddle::framework::ir::SimplifyWithBasicOpsPass);
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h
deleted file mode 100644
index f5185622468055939103876387662f6402a45bfe..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class SimplifyWithBasicOpsPass : public Pass {
- protected:
-  void ApplyImpl(Graph* graph) const override;
-
- private:
-  bool SimplifyDropout(Graph* graph, Node* n,
-                       std::unordered_set<const Node*>* del_node_set) const;
-
-  Node* GetInputVar(Node* n, const std::string& name) const;
-  Node* GetOutputVar(Node* n, const std::string& name) const;
-
-  void ReplaceInputVar(Node* op, Node* old_var, Node* new_var) const;
-  void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc
deleted file mode 100644
index 7fb67df495f1dfe8d20e015a75aa9b510b3cfe8d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-TEST(SimplifyWithBasicOpsPass, dropout) {
-  for (std::string dropout_implementation :
-       {"downgrade_in_infer", "upscale_in_train"}) {
-    for (auto inplace : {false, true}) {
-      if (dropout_implementation == "downgrade_in_infer" && inplace == true) {
-        continue;
-      }
-
-      LOG(INFO) << "dropout_implementation: " << dropout_implementation
-                << ", inplace: " << inplace;
-      Layers layers;
-      // (x, y) -> mul -> tmp_0
-      // (tmp_0) -> dropout -> (tmp_1)
-      // (tmp_1, z) -> elementwise_add -> (tmp_2)
-      // or
-      // (tmp_1, z) -> elementwise_add -> (tmp_0)
-      auto* x = layers.data("x");
-      auto* y = layers.data("y");
-      auto* z = layers.data("z");
-      auto* mul_out = layers.mul(x, y);
-      auto* dropout_out = layers.dropout(mul_out, 0.5f, dropout_implementation);
-      if (inplace) {
-        layers.elementwise_add(dropout_out, z, mul_out);
-      } else {
-        layers.elementwise_add(dropout_out, z);
-      }
-
-      std::unique_ptr<Graph> graph(new Graph(layers.main_program()));
-      auto pass = PassRegistry::Instance().Get("simplify_with_basic_ops_pass");
-      int num_dropout_nodes_before = GetNumOpNodes(graph, "dropout");
-      int num_scale_nodes_before = GetNumOpNodes(graph, "scale");
-      VLOG(3) << DebugString(graph);
-
-      graph.reset(pass->Apply(graph.release()));
-      int num_dropout_nodes_after = GetNumOpNodes(graph, "dropout");
-      int num_scale_nodes_after = GetNumOpNodes(graph, "scale");
-      VLOG(3) << DebugString(graph);
-
-      PADDLE_ENFORCE_EQ(num_dropout_nodes_after, 0UL);
-      if (dropout_implementation == "downgrade_in_infer") {
-        PADDLE_ENFORCE_EQ(num_dropout_nodes_before,
-                          num_scale_nodes_after - num_scale_nodes_before);
-      } else {
-        PADDLE_ENFORCE_EQ(num_scale_nodes_after - num_scale_nodes_before, 0UL);
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(simplify_with_basic_ops_pass);
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
deleted file mode 100644
index 42f4a91a6f421c28826d62bf30cbd4b2cb73805a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
+++ /dev/null
@@ -1,377 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h"
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
-                                  const std::string& name_scope) {
-  auto var_is_op_input = [=](Node* x, const std::string& op_type,
-                             const std::string& arg_name = "") -> bool {
-    if (!(x && x->IsVar())) {
-      return false;
-    }
-    for (auto* op : x->outputs) {
-      if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) {
-        if (arg_name.empty()) {
-          return true;
-        }
-        for (auto& name : op->Op()->Input(arg_name)) {
-          if (name == x->Name()) {
-            return true;
-          }
-        }
-      }
-    }
-    return false;
-  };
-
-  auto var_is_op_only_output = [](Node* x, const std::string& op_type) -> bool {
-    return x && x->IsVar() && x->inputs.size() == 1 && x->inputs[0] &&
-           x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == op_type &&
-           x->inputs[0]->outputs.size() == 1;
-  };
-
-  auto next_op = [=](Node* x, const std::string& op_type) -> Node* {
-    if (!(x && x->IsVar())) {
-      return nullptr;
-    }
-    for (auto* op : x->outputs) {
-      if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) {
-        return op;
-      }
-    }
-    return nullptr;
-  };
-
-  auto get_op_input_var = [=](Node* x, const std::string& arg_name) -> Node* {
-    if (!(x && x->IsOp())) {
-      return nullptr;
-    }
-    for (auto* var : x->inputs) {
-      for (auto name : x->Op()->Input(arg_name)) {
-        if (var->Name() == name) {
-          return var;
-        }
-      }
-    }
-    return nullptr;
-  };
-
-  auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) {
-    bool basic = var_is_op_input(x, "matmul", arg_name) &&
-                 var_is_op_input(x, "square", "X");
-    if (!basic) {
-      return false;
-    }
-    auto* squared_x_op = next_op(x, "square");
-    if (!(squared_x_op && squared_x_op->outputs.size() == 1)) {
-      return false;
-    }
-    auto* squared_x = squared_x_op->outputs[0];
-    bool next_is_matmul_from_arg =
-        var_is_op_input(squared_x, "matmul", arg_name) &&
-        squared_x->outputs.size() == 1 &&
-        squared_x->outputs[0]->outputs.size() == 1;
-    if (!next_is_matmul_from_arg) {
-      return false;
-    }
-    auto* sub_y_in = squared_x->outputs[0]->outputs[0];
-    return var_is_op_input(sub_y_in, "elementwise_sub", "Y") &&
-           sub_y_in->outputs[0]->outputs.size() == 1 &&
-           var_is_op_input(sub_y_in->outputs[0]->outputs[0], "elementwise_mul");
-  };
-
-  auto is_fusion_first_mul_out = [=](Node* x) -> bool {
-    bool input_is_matmul_op = x && x->inputs.size() == 1 &&
-                              x->inputs[0]->IsOp() &&
-                              x->inputs[0]->Op()->Type() == "matmul";
-    if (!input_is_matmul_op) {
-      return false;
-    }
-    auto* mat_x = get_op_input_var(x->inputs[0], "X");
-    auto* mat_y = get_op_input_var(x->inputs[0], "Y");
-    bool input_mul_is_valid = mat_x && is_fusion_input_var(mat_x, "X") &&
-                              mat_y && is_fusion_input_var(mat_y, "Y");
-    if (!input_mul_is_valid) {
-      return false;
-    }
-
-    bool next_is_square = var_is_op_input(x, "square", "X") &&
-                          x->outputs.size() == 1 &&
-                          x->outputs[0]->outputs.size() == 1;
-    if (!next_is_square) {
-      return false;
-    }
-    auto* sub_x_in = x->outputs[0]->outputs[0];
-    return var_is_op_input(sub_x_in, "elementwise_sub", "X") &&
-           sub_x_in->outputs[0]->outputs.size() == 1 &&
-           var_is_op_input(sub_x_in->outputs[0]->outputs[0], "elementwise_mul");
-  };
-
-  auto* x = pattern->NewNode(
-      [=](Node* x) { return is_fusion_input_var(x, "X"); }, name_scope + "/x");
-
-  auto* y = pattern->NewNode(
-      [=](Node* x) { return is_fusion_input_var(x, "Y"); }, name_scope + "/y");
-
-  auto* square_x_op = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "square" &&
-               is_fusion_input_var(x->inputs[0], "X");
-      },
-      name_scope + "/squared_x_op");
-
-  auto* square_y_op = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "square" &&
-               is_fusion_input_var(x->inputs[0], "Y");
-      },
-      name_scope + "/squared_y_op");
-
-  auto* squared_x = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->inputs.size() == 1 && x->inputs[0]->inputs.size() == 1 &&
-               is_fusion_input_var(x->inputs[0]->inputs[0], "X");
-      },
-      name_scope + "/squared_x");
-
-  auto* squared_y = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->inputs.size() == 1 && x->inputs[0]->inputs.size() == 1 &&
-               is_fusion_input_var(x->inputs[0]->inputs[0], "Y");
-      },
-      name_scope + "/squared_y");
-
-  auto* matmuled_xy =
-      pattern->NewNode([=](Node* x) { return is_fusion_first_mul_out(x); },
-                       name_scope + "/matmuled_xy");
-
-  auto* matmul_xy_op = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "matmul" &&
-               is_fusion_first_mul_out(x->outputs[0]);
-      },
-      name_scope + "/matmul_xy_op");
-
-  auto* square_matmuled_xy_op = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "square" &&
-               is_fusion_first_mul_out(x->inputs[0]);
-      },
-      name_scope + "/square_matmuled_xy_op");
-
-  auto* squared_xmuly = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsVar() && x->inputs.size() == 1 &&
-               x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "square" &&
-               is_fusion_first_mul_out(x->inputs[0]->inputs[0]);
-      },
-      name_scope + "/squared_xmuly");
-
-  auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool {
-    bool basic = x && x->IsVar() && x->inputs.size() == 1 &&
-                 x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "matmul";
-    if (!basic) {
-      return false;
-    }
-    auto* sqx = get_op_input_var(x->inputs[0], "X");
-    auto* sqy = get_op_input_var(x->inputs[0], "Y");
-
-    return var_is_op_only_output(sqx, "square") &&
-           var_is_op_only_output(sqy, "square") && sqx->inputs[0] &&
-           sqx->inputs[0]->inputs.size() == 1 &&
-           is_fusion_input_var(sqx->inputs[0]->inputs[0], "X") &&
-           sqy->inputs[0] && sqy->inputs[0]->inputs.size() == 1 &&
-           is_fusion_input_var(sqy->inputs[0]->inputs[0], "Y");
-  };
-
-  auto* matmul_squared_x_y_op = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "matmul" &&
-               is_fusion_mat_squared_x_y_op_out(x->outputs[0]);
-      },
-      name_scope + "/matmul_squared_x_y_op");
-
-  auto* mat_squared_x_y_op_out = pattern->NewNode(
-      [=](Node* x) { return is_fusion_mat_squared_x_y_op_out(x); },
-      name_scope + "/mat_squared_x_y_op_out");
-
-  auto is_fusion_sub_op = [=](Node* x) -> bool {
-    bool is_sub_op = x && x->IsOp() && x->Op()->Type() == "elementwise_sub";
-    if (!is_sub_op) {
-      return false;
-    }
-    auto* matmul_sqx_sqy_var = get_op_input_var(x, "Y");
-    return is_fusion_mat_squared_x_y_op_out(matmul_sqx_sqy_var);
-  };
-
-  auto* sub_op = pattern->NewNode([=](Node* x) { return is_fusion_sub_op(x); },
-                                  name_scope + "/sub_op");
-
-  auto* sub_op_out = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsVar() && x->inputs.size() == 1 &&
-               is_fusion_sub_op(x->inputs[0]);
-      },
-      name_scope + "/sub_op_out");
-
-  auto is_fusion_element_op = [=](Node* x) -> bool {
-    bool is_elemul_op = x && x->IsOp() && x->Op()->Type() == "elementwise_mul";
-    if (!is_elemul_op) {
-      return false;
-    }
-    for (auto* in : x->inputs) {
-      if (in && in->inputs[0] && is_fusion_sub_op(in->inputs[0])) {
-        return true;
-      }
-    }
-    return false;
-  };
-
-  auto* elementmul_op =
-      pattern->NewNode([=](Node* x) { return is_fusion_element_op(x); },
-                       name_scope + "/elementmul_op");
-
-  auto* constant_op = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "fill_constant" &&
-               x->outputs.size() == 1 &&
-               is_fusion_element_op(x->outputs[0]->outputs[0]);
-      },
-      name_scope + "/fill_constant_op");
-
-  auto* constant_op_out = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsVar() && var_is_op_input(x, "elementwise_mul") &&
-               x->inputs[0] && x->inputs[0]->IsOp() &&
-               x->inputs[0]->Op()->Type() == "fill_constant" && x->outputs[0] &&
-               is_fusion_element_op(x->outputs[0]);
-      },
-      name_scope + "/constant_op_out");
-
-  auto* last_out_var = pattern->NewNode(
-      [=](Node* x) {
-        return var_is_op_only_output(x, "elementwise_mul") &&
-               is_fusion_element_op(x->inputs[0]);
-      },
-      name_scope + "/out");
-
-  square_x_op->LinksFrom({x}).LinksTo({squared_x});
-  square_y_op->LinksFrom({y}).LinksTo({squared_y});
-  matmul_xy_op->LinksFrom({x, y}).LinksTo({matmuled_xy});
-  matmul_squared_x_y_op->LinksFrom({squared_x, squared_y})
-      .LinksTo({mat_squared_x_y_op_out});
-  square_matmuled_xy_op->LinksFrom({matmuled_xy}).LinksTo({squared_xmuly});
-  sub_op->LinksFrom({squared_xmuly, mat_squared_x_y_op_out})
-      .LinksTo({sub_op_out});
-  constant_op->LinksFrom({}).LinksTo({constant_op_out});
-  elementmul_op->LinksFrom({constant_op_out, sub_op_out})
-      .LinksTo({last_out_var});
-
-  return last_out_var;
-}
-
-static int BuildFusion(Graph* graph, const std::string& name_scope) {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-
-  BuildSquaredMatSubPattern(pattern, name_scope);
-
-  auto retrieve_node = [](const std::string& name,
-                          const GraphPatternDetector::subgraph_t& subgraph,
-                          const PDPattern& pat) -> Node* {
-    PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)),
-                   "pattern has no Node called %s", name.c_str());
-    Node* p = subgraph.at(pat.RetrieveNode(name));
-    PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str());
-    return p;
-  };
-
-  int fusion_count{0};
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    LOG(INFO) << "handle sqaure mat sub fuse";
-    auto& fused_pattern = gpd.pattern();
-
-    auto* matx = retrieve_node(name_scope + "/x", subgraph, fused_pattern);
-    auto* maty = retrieve_node(name_scope + "/y", subgraph, fused_pattern);
-    auto* squaredx =
-        retrieve_node(name_scope + "/squared_x", subgraph, fused_pattern);
-    auto* squaredy =
-        retrieve_node(name_scope + "/squared_y", subgraph, fused_pattern);
-    auto* squaredxy =
-        retrieve_node(name_scope + "/squared_xmuly", subgraph, fused_pattern);
-    auto* last_out_var =
-        retrieve_node(name_scope + "/out", subgraph, fused_pattern);
-    auto* fill_constant_op = retrieve_node(name_scope + "/fill_constant_op",
-                                           subgraph, fused_pattern);
-
-    // Create New OpDesc
-    OpDesc op_desc;
-    op_desc.SetType("fusion_squared_mat_sub");
-    op_desc.SetInput("X", {matx->Name()});
-    op_desc.SetInput("Y", {maty->Name()});
-    op_desc.SetOutput("SquaredX", {squaredx->Name()});
-    op_desc.SetOutput("SquaredY", {squaredy->Name()});
-    op_desc.SetOutput("SquaredXY", {squaredxy->Name()});
-    op_desc.SetOutput("Out", {last_out_var->Name()});
-    op_desc.SetAttr("scalar", fill_constant_op->Op()->GetAttr("value"));
-
-    auto* op = graph->CreateOpNode(&op_desc);
-    IR_NODE_LINK_TO(matx, op);
-    IR_NODE_LINK_TO(maty, op);
-    IR_NODE_LINK_TO(op, squaredx);
-    IR_NODE_LINK_TO(op, squaredy);
-    IR_NODE_LINK_TO(op, squaredxy);
-    IR_NODE_LINK_TO(op, last_out_var);
-
-    std::unordered_set<const Node*> marked_nodes;
-    for (auto& item : subgraph) {
-      marked_nodes.insert(item.second);
-    }
-
-    marked_nodes.erase(matx);
-    marked_nodes.erase(maty);
-    marked_nodes.erase(squaredx);
-    marked_nodes.erase(squaredy);
-    marked_nodes.erase(squaredxy);
-    marked_nodes.erase(last_out_var);
-    GraphSafeRemoveNodes(graph, marked_nodes);
-    ++fusion_count;
-  };
-
-  gpd(graph, handler);
-  return fusion_count;
-}
-
-void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init(name_scope_, graph);
-  int fusion_count = BuildFusion(graph, name_scope_);
-  AddStatis(fusion_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(squared_mat_sub_fuse_pass,
-              paddle::framework::ir::SquaredMatSubFusePass);
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
deleted file mode 100644
index b6165a512acdb9b6e3bdbf49196692ef83edb58f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/**
- * Fuse ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar
- */
-class SquaredMatSubFusePass : public FusePassBase {
- public:
-  virtual ~SquaredMatSubFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  const std::string name_scope_{"squared_mat_sub_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
deleted file mode 100644
index 2077304b9693b41448720a72cd47804b1fe2d60d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-#include <utility>
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class SyncBatchNormPass : public Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override {
-    VLOG(3) << "Use synchronous batch norm";
-    for (const Node *n : graph->Nodes()) {
-      if (n->IsOp() && n->Op()) {
-        auto *op = n->Op();
-        if (op->Type() == "batch_norm") {
-          op->SetType("sync_batch_norm");
-        }
-        if (op->Type() == "batch_norm_grad") {
-          op->SetType("sync_batch_norm_grad");
-        }
-      }
-    }
-  }
-};
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass);
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
deleted file mode 100644
index 90d214116d73554040e339fa01a24eed9255696a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/program_desc.h"
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetAttr("name", name);
-  op->SetInput("X", inputs);
-  op->SetOutput("Out", outputs);
-}
-
-// (a, conv_w)->conv2d->b
-// (b, bn_scale, bn_bias, mean, var)->batch_norm
-//     ->(c, mean, var, save_mean, save_inv_var)
-ProgramDesc BuildProgramDesc() {
-  ProgramDesc prog;
-  for (auto& v : std::vector<std::string>({"a", "conv_w", "b", "bn_scale",
-                                           "bn_bias", "mean", "var", "c",
-                                           "save_mean", "save_inv_var"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    if (v == "conv_w" || v == "bn_scale" || v == "bn_bias" || v == "mean" ||
-        v == "var") {
-      var->SetPersistable(true);
-    }
-  }
-
-  SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"a", "conv_w"}),
-        std::vector<std::string>({"b"}));
-  SetOp(&prog, "batch_norm", "bn",
-        std::vector<std::string>({"b", "bn_scale", "bn_bias", "mean", "var"}),
-        std::vector<std::string>(
-            {"c", "mean", "var", "save_mean", "save_inv_var"}));
-  return prog;
-}
-
-TEST(IsTestPass, basic) {
-  auto prog = BuildProgramDesc();
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-
-  auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass");
-
-  graph.reset(pass->Apply(graph.release()));
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto* op = node->Op();
-      auto op_name = boost::get<std::string>(op->GetAttr("name"));
-      if (op_name == "bn") {
-        ASSERT_EQ(op->Type(), "sync_batch_norm");
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(sync_batch_norm_pass);
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
deleted file mode 100644
index a984a4942b374c3e2c5f148f8147c55d0f5deb24..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) {
-  const std::string pattern_name =
-      "transpose_flatten" + std::to_string(times) + "_concat_fuse";
-
-  GraphPatternDetector gpd;
-  std::vector<PDNode *> input_nodes;
-  for (int i = 0; i < times; i++) {
-    input_nodes.push_back(gpd.mutable_pattern()
-                              ->NewNode("x" + std::to_string(i))
-                              ->assert_is_op_input("transpose2", "X")
-                              ->AsInput());
-  }
-
-  patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(input_nodes, times);
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
-                     Graph *g) {
-    const int kNumFields = 5;
-    const int kTransOffset = 1;
-    const int kTransOutOffset = 2;
-    const int kFlattenOffset = 3;
-    const int kFlattenOutOffset = 4;
-    std::vector<Node *> nodes;
-
-    for (int i = 0; i < times; i++) {
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
-      PADDLE_ENFORCE(
-          subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
-      PADDLE_ENFORCE(subgraph.at(input_nodes[i]));
-
-      nodes.push_back(subgraph.at(input_nodes[i]));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
-      nodes.push_back(
-          subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
-    }
-
-    Node *concat_op = subgraph.at(pattern.GetPDNode("concat"));
-    Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out"));
-    std::vector<std::string> input_names;
-    std::vector<int> trans_axis = boost::get<std::vector<int>>(
-        nodes[kTransOffset]->Op()->GetAttr("axis"));
-    int flatten_axis =
-        boost::get<int>(nodes[kFlattenOffset]->Op()->GetAttr("axis"));
-    int concat_axis = boost::get<int>(concat_op->Op()->GetAttr("axis"));
-    std::string output_name = concat_out->Name();
-
-    for (int i = 0; i < times; i++) {
-      input_names.push_back(nodes[i * kNumFields]->Name());
-    }
-
-    framework::OpDesc new_op_desc;
-    new_op_desc.SetType("fusion_transpose_flatten_concat");
-    new_op_desc.SetInput("X", input_names);
-    new_op_desc.SetAttr("trans_axis", trans_axis);
-    new_op_desc.SetAttr("flatten_axis", flatten_axis);
-    new_op_desc.SetAttr("concat_axis", concat_axis);
-    new_op_desc.SetOutput("Out", {output_name});
-    new_op_desc.Flush();
-
-    // Create a new node for the fused op.
-    auto *new_conv_op = graph->CreateOpNode(&new_op_desc);
-
-    std::unordered_set<const Node *> delete_nodes;
-
-    for (int i = 0; i < times; i++) {
-      nodes[i * kNumFields]->outputs.push_back(new_conv_op);
-      new_conv_op->inputs.push_back(nodes[i * kNumFields]);
-      delete_nodes.insert(nodes[i * kNumFields + kTransOffset]);
-      delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]);
-      delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]);
-      delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]);
-    }
-    delete_nodes.insert(concat_op);
-
-    new_conv_op->outputs.push_back(concat_out);
-    concat_out->inputs.push_back(new_conv_op);
-
-    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph, delete_nodes);
-  };
-
-  gpd(graph, handler);
-}
-
-void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const {
-  const int pattern_nums = 6;
-  const std::string pattern_name = "transpose_flatten_concat_fuse";
-  FusePassBase::Init(pattern_name, graph);
-  for (int i = 1; i <= pattern_nums; i++) {
-    RunTransposeFlattenConcatFuse(graph, i);
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(transpose_flatten_concat_fuse_pass,
-              paddle::framework::ir::TransposeFlattenConcatFusePass);
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
deleted file mode 100644
index 939a8c31e5501e23968f9b44b4fe09e78280fd07..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-// There may be many transpose-flatten structures in a model, and the output of
-// these structures will be used as inputs to the concat Op. This pattern will
-// be detected by our pass. The times here represents the repeat times of this
-// structure.
-class TransposeFlattenConcatFusePass : public FusePassBase {
- public:
-  virtual ~TransposeFlattenConcatFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h
deleted file mode 100644
index 904cc013012b9c3ea8054816446844f6d2cda26b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/library_type.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cctype>
-#include <string>
-
-namespace paddle {
-namespace framework {
-
-// For more details about the design of LibraryType, Please refer to
-// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library
-
-enum class LibraryType {
-  kPlain = 0,
-  kMKLDNN = 1,
-  kCUDNN = 2,
-};
-
-inline std::string LibraryTypeToString(const LibraryType& library_type) {
-  switch (library_type) {
-    case LibraryType::kPlain:
-      return "PLAIN";
-    case LibraryType::kMKLDNN:
-      return "MKLDNN";
-    case LibraryType::kCUDNN:
-      return "CUDNN";
-    default:
-      PADDLE_THROW("unknown LibraryType %d", static_cast<int>(library_type));
-  }
-}
-
-inline LibraryType StringToLibraryType(const char* ctype) {
-  std::string s(ctype);
-  for (size_t i = 0; i < s.size(); ++i) {
-    s[i] = toupper(s[i]);
-  }
-  if (s == std::string("PLAIN")) {
-    return LibraryType::kPlain;
-  } else if (s == std::string("MKLDNN")) {
-    return LibraryType::kMKLDNN;
-  } else if (s == std::string("CUDNN")) {
-    return LibraryType::kCUDNN;
-    // To be compatible with register macro.
-    // CPU, CUDA, PLAIN are same library type.
-  } else if (s == std::string("CPU")) {
-    return LibraryType::kPlain;
-  } else if (s == std::string("CUDA")) {
-    return LibraryType::kPlain;
-  } else {
-    PADDLE_THROW("Unknown LibraryType %s", s.c_str());
-  }
-}
-
-inline std::ostream& operator<<(std::ostream& out, LibraryType l) {
-  out << LibraryTypeToString(l);
-  return out;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/lod_rank_table.cc b/paddle/fluid/framework/lod_rank_table.cc
deleted file mode 100644
index 6bc795b642bf79b7556869c5ebe9b0323d3cc5fc..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/lod_rank_table.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/lod_rank_table.h"
-
-namespace paddle {
-namespace framework {
-void LoDRankTable::Reset(const LoD& lod, size_t level) {
-  this->coarse_lod_.clear();
-  this->items_.clear();
-  PADDLE_ENFORCE(level < lod.size(),
-                 "Cannot rank lod since the level %d is less than lod size %d",
-                 level, lod.size());
-  coarse_lod_.reserve(level);
-  for (size_t i = 0; i < level; ++i) {
-    coarse_lod_.push_back(lod[i]);
-  }
-  auto& vec = lod[level];
-  for (size_t i = 0; i < vec.size() - 1; ++i) {
-    TableItem item;
-    item.index = i;
-    item.length = vec[i + 1] - vec[i];
-    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
-    items_.emplace_back(item);
-  }
-  // NOTE(yuyang18):
-  //
-  // The time complexity of stable_sort is O(N*log(N)) if additional memory is
-  // available. It is easy to debug and unit test when using `stable_sort`
-  // instead of `sort`. Also, the items of a rank table will not be too large.
-  std::stable_sort(items_.begin(), items_.end(),
-                   [](const TableItem& a, const TableItem& b) {
-                     return a.length > b.length;
-                   });
-}
-
-}  // namespace framework
-
-std::ostream& operator<<(std::ostream& out,
-                         const framework::LoDRankTable& table) {
-  out << "NumOfSequence " << table.items().size() << "\n";
-  for (auto& each_item : table.items()) {
-    out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n";
-  }
-  return out;
-}
-}  // namespace paddle
diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h
deleted file mode 100644
index 8c6e8b0c66ead96f0e53b56ee951887730b0d77f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/lod_rank_table.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <iosfwd>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-
-// LoD Rank Table stores the `level` of `lod` which is ordered by sequence
-// length in descending order. It is useful when implement dynamic RNN and is
-// shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice
-// output operators.
-//
-// The table item contains two element. The length of sequence and the index of
-// sequence in that level.
-//
-// LoDRankTable also stores the coarse_lod, which is the lod information whose
-// level is less than input level, in order to restore the output LoD
-// information.
-class LoDRankTable {
- public:
-  struct TableItem {
-    size_t index;
-    size_t length;
-  };
-
-  LoDRankTable() {}
-
-  void Reset(const LoD& lod, size_t level);
-
-  const std::vector<TableItem>& items() const { return this->items_; }
-
-  const LoD& coarse_lod() const { return this->coarse_lod_; }
-
-  size_t level() const { return coarse_lod_.size(); }
-
- private:
-  LoD coarse_lod_;
-  std::vector<TableItem> items_;
-};
-
-}  // namespace framework
-
-std::ostream& operator<<(std::ostream& out,
-                         const framework::LoDRankTable& table);
-
-}  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
deleted file mode 100644
index ca820068c4d2f89b76306df81bac757918195ec1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/lod_tensor.cc
+++ /dev/null
@@ -1,428 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdint.h>
-#include <string.h>
-#include <algorithm>
-#include <iterator>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/framework/version.h"
-
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/memory/memory.h"
-
-namespace paddle {
-namespace framework {
-
-std::ostream &operator<<(std::ostream &os, const LoD &lod) {
-  os << "{";
-  for (auto &v : lod) {
-    os << "{";
-    bool is_first = true;
-    for (auto &i : v) {
-      if (is_first) {
-        os << i;
-        is_first = false;
-      } else {
-        os << ", " << i;
-      }
-    }
-    os << "}";
-  }
-  os << "}";
-
-  return os;
-}
-
-std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
-  os << "\tlod: " << t.lod() << "\n";
-  os << static_cast<Tensor>(t) << "\n";
-
-  return os;
-}
-
-std::string LoDToString(const LoD &lod) {
-  std::ostringstream stream;
-  stream << lod;
-  return stream.str();
-}
-
-LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
-                 size_t elem_end) {
-  PADDLE_ENFORCE_LT(level, in.size());
-  PADDLE_ENFORCE_LT(elem_begin, elem_end);
-  PADDLE_ENFORCE_LT(elem_end, in[level].size());
-
-  LoD res;
-  res.resize(in.size() - level);
-  // copy the first level
-  res[0].assign(in[level].begin() + elem_begin,
-                in[level].begin() + elem_end + 1);
-  for (size_t lvl = 1; lvl < res.size(); lvl++) {
-    const auto &in_level = in[level + lvl];
-    const auto &above_level = res[lvl - 1];
-    auto &out_level = res[lvl];
-    out_level.assign(in_level.begin() + above_level.front(),
-                     in_level.begin() + above_level.back() + 1);
-  }
-  for (size_t lvl = 0; lvl < res.size(); lvl++) {
-    // to make the first offset equals 0, all the elements minus the first
-    // element
-    size_t front = res[lvl].front();
-    for (auto &ele : res[lvl]) {
-      ele -= front;
-    }
-  }
-  return res;
-}
-
-LoD ToAbsOffset(const LoD &in) {
-  // the lowest level stores relative offsets
-  if (in.empty() || in.size() == 1) return in;
-  LoD result = in;
-  for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
-    for (size_t i = 0; i < in[level].size(); ++i) {
-      size_t index = in[level][i];
-      result[level][i] = result[level + 1][index];
-    }
-  }
-  return result;
-}
-
-bool operator==(const LoD &a, const LoD &b) {
-  if (a.size() != b.size()) {
-    return false;
-  }
-
-  for (size_t i = 0; i < a.size(); i++) {
-    const auto &a_level = a[i];
-    const auto &b_level = b[i];
-    if (a_level.size() != b_level.size()) {
-      return false;
-    }
-    for (size_t j = 0; j < a_level.size(); j++) {
-      if (a_level[j] != b_level[j]) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-bool CheckLoD(const LoD &in, int tensor_height) {
-  if (in.empty()) return true;
-  for (const auto &level : in) {
-    // check: there should be more than 2 offsets existing in each level.
-    if (level.size() < 2) return false;
-    // check: the first offset(the begin offset) of each level should be 0.
-    if (level.front() != 0) return false;
-    // check: all the offsets in a level should be non-descending
-    if (!std::is_sorted(level.begin(), level.end())) {
-      return false;
-    }
-  }
-  // check: the lowest level's last offset should equals `tensor_height` if
-  //        tensor_height>0.
-  if (tensor_height > 0 && (size_t)tensor_height != in.back().back())
-    return false;
-
-  // check: the higher level's last offset should equals the lower level's
-  // size-1.
-  // NOTE LoD store the levels from top to bottom, so the higher level goes
-  // first.
-  for (size_t level = 0; level < in.size() - 1; level++) {
-    if (in[level].back() != in[level + 1].size() - 1) return false;
-  }
-  return true;
-}
-
-bool CheckAbsLoD(const LoD &in, int tensor_height) {
-  if (in.empty()) return true;
-  for (const auto &level : in) {
-    // check: all the offsets in a level should be ascending(no same items
-    // allowed).
-    if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
-          if (a < b) return true;
-          return false;
-        })) {
-      return false;
-    }
-
-    // check: there should be more than 2 offsets existing in each level.
-    if (level.size() < 2) return false;
-
-    // check: the first offset of each level should be 0, and the last should be
-    // the same(the height of underlying tensor).
-    if (level.front() != 0) return false;
-    if (tensor_height < 0) {
-      tensor_height = level.back();
-    } else if ((size_t)tensor_height != level.back()) {
-      return false;
-    }
-  }
-  return true;
-}
-
-using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
-LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
-                                        size_t end_idx, size_t start_level) {
-  LoD sub_lod;
-
-  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
-    PADDLE_ENFORCE_LE(start_idx, end_idx);
-    PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
-    std::vector<size_t> level_lens;
-    for (size_t i = start_idx; i < end_idx; ++i) {
-      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
-    }
-    sub_lod.emplace_back(level_lens);
-    start_idx = lod[level_idx][start_idx];
-    end_idx = lod[level_idx][end_idx];
-  }
-
-  return LoDAndOffset{sub_lod, {start_idx, end_idx}};
-}
-
-void AppendLoD(LoD *lod, const LoD &lod_length) {
-  PADDLE_ENFORCE(
-      lod->empty() || lod->size() == lod_length.size(),
-      "The lod_length should has the same size with the appended lod.");
-  if (lod->empty()) {
-    for (size_t i = 0; i < lod_length.size(); ++i) {
-      lod->emplace_back(1, 0);  // size = 1, value = 0;
-    }
-    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
-  }
-  for (size_t i = 0; i < lod->size(); ++i) {
-    auto &level = (*lod)[i];
-    for (size_t len : lod_length[i]) {
-      level.push_back(level.back() + len);
-    }
-  }
-}
-
-void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
-                       const platform::DeviceContext &dev_ctx) {
-  {  // the 1st field, uint32_t version for LoDTensor
-    os.write(reinterpret_cast<const char *>(&kCurTensorVersion),
-             sizeof(kCurTensorVersion));
-  }
-  {
-    // the 2st field, LoD information
-    // uint64_t lod_level
-    // uint64_t lod_level_1 size in byte.
-    // int*     lod_level_1 data
-    // ...
-    auto lod = tensor.lod();
-    uint64_t size = lod.size();
-    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-
-    for (auto &each : lod) {
-      size = each.size() * sizeof(framework::LoD::value_type::value_type);
-      os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-      os.write(reinterpret_cast<const char *>(each.data()),
-               static_cast<std::streamsize>(size));
-    }
-  }
-  // the 3st field, Tensor
-  TensorToStream(os, static_cast<Tensor>(tensor), dev_ctx);
-}
-
-void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
-                           const platform::DeviceContext &dev_ctx) {
-  {
-    // the 1st field, unit32_t version for LoDTensor
-    uint32_t version;
-    is.read(reinterpret_cast<char *>(&version), sizeof(version));
-    PADDLE_ENFORCE(framework::IsTensorVersionSupported(version),
-                   "tensor version %u is not supported.", version);
-    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  }
-  {
-    // the 2st field, LoD information
-    uint64_t lod_level;
-    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
-    auto &lod = *tensor->mutable_lod();
-    lod.resize(lod_level);
-    for (uint64_t i = 0; i < lod_level; ++i) {
-      uint64_t size;
-      is.read(reinterpret_cast<char *>(&size), sizeof(size));
-      std::vector<size_t> tmp(size / sizeof(size_t));
-      is.read(reinterpret_cast<char *>(tmp.data()),
-              static_cast<std::streamsize>(size));
-      lod[i] = tmp;
-    }
-  }
-  // the 3st filed, Tensor
-  TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
-}
-
-std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
-    const std::vector<platform::Place> places) const {
-  check_memory_size();
-  int batch_size =
-      lod().empty() ? dims()[0] : static_cast<int>(lod()[0].size()) - 1;
-  size_t result_size = std::min(static_cast<size_t>(batch_size), places.size());
-  size_t remainder = batch_size % places.size();
-
-  std::vector<LoDTensor> results;
-  results.reserve(result_size);
-
-  // if result_size(batch_size) is 0, just return #places.size() copys of empty
-  // tensors.
-  if (result_size == 0) {
-    for (size_t i = 0; i < places.size(); ++i) {
-      LoDTensor dst;
-      dst.Resize(dims());
-      dst.mutable_data(places[i], type());
-      if (!lod().empty()) {
-        dst.set_lod(lod());
-      }
-      results.emplace_back(dst);
-    }
-    return results;
-  }
-
-  int step_width = static_cast<int>(batch_size / result_size);
-  for (size_t i = 0; i < result_size; ++i) {
-    int begin = static_cast<int>(i * step_width);
-    int end = static_cast<int>((i + 1) * step_width);
-    if (i + 1 == places.size()) {  // last
-      end += remainder;
-    }
-
-    LoDTensor dst;
-    if (lod().empty()) {
-      auto src = Slice(begin, end);
-      auto &dst_place = places[i];
-      framework::TensorCopy(src, dst_place, &dst);
-    } else {
-      auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
-
-      auto &offset = lod_and_offset.second;
-      auto src = Slice(offset.first, offset.second);
-      auto &dst_place = places[i];
-      framework::TensorCopy(src, dst_place, &dst);
-
-      LoD my_lod;
-      for (auto &l : lod_and_offset.first) {
-        std::vector<size_t> v{0};
-        for (auto &ll : l) {
-          v.push_back(ll + v.back());
-        }
-        my_lod.emplace_back(v);
-      }
-      dst.set_lod(my_lod);
-    }
-    results.emplace_back(dst);
-  }
-
-  return results;
-}
-
-void LoDTensor::MergeLoDTensor(
-    const std::vector<const LoDTensor *> &lod_tensors,
-    platform::Place dst_place) {
-  PADDLE_ENFORCE(!lod_tensors.empty());
-
-  framework::DDim new_dim = lod_tensors[0]->dims();
-  proto::VarType::Type new_type = proto::VarType::FP32;
-  framework::DataLayout new_layout = lod_tensors[0]->layout();
-  for (auto *t : lod_tensors) {
-    if (t->numel() && t->IsInitialized()) {
-      new_dim = t->dims();
-      new_type = t->type();
-      new_layout = t->layout();
-      break;
-    }
-  }
-
-  LoD new_lod = lod_tensors[0]->lod();
-
-  for (size_t i = 1; i < lod_tensors.size(); ++i) {
-    auto *t = lod_tensors[i];
-    if (t->numel() && t->IsInitialized()) {
-      PADDLE_ENFORCE_EQ(new_type, t->type());
-      PADDLE_ENFORCE_EQ(new_layout, t->layout());
-      PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0],
-                        framework::product(t->dims()) / t->dims()[0]);
-      new_dim[0] += t->dims()[0];
-    }
-
-    auto &lod = t->lod();
-    PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
-    for (size_t j = 0; j < lod.size(); ++j) {
-      auto &sub_lod = new_lod[j];
-      size_t offset = sub_lod.back();
-      for (size_t k = 1; k < lod[j].size(); ++k) {
-        sub_lod.push_back(lod[j][k] + offset);
-      }
-    }
-  }
-  Resize(new_dim);
-  set_layout(new_layout);
-  set_lod(new_lod);
-  mutable_data(dst_place, new_type);
-
-  int begin = 0;
-  for (auto *src : lod_tensors) {
-    int end = begin + src->dims()[0];
-    if (end == begin) {
-      continue;
-    }
-    auto dst = Slice(begin, end);
-    framework::TensorCopy(*src, dst_place, &dst);
-    begin = end;
-  }
-}
-
-LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
-  LoD length_lod;
-  length_lod.reserve(offset_lod.size());
-  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
-    std::vector<size_t> level;
-    if (offset_lod[lvl].size() > 0) {
-      level.reserve(offset_lod[lvl].size() - 1);
-    }
-    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
-      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
-    }
-    length_lod.push_back(level);
-  }
-  return length_lod;
-}
-
-LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
-  LoD offset_lod;
-  offset_lod.reserve(length_lod.size());
-  for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
-    std::vector<size_t> level;
-    level.reserve(length_lod[lvl].size() + 1);
-    size_t tmp = 0;
-    level.push_back(tmp);
-    for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
-      tmp += length_lod[lvl][idx];
-      level.push_back(tmp);
-    }
-    offset_lod.push_back(level);
-  }
-  return offset_lod;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
deleted file mode 100644
index ef48753349ec7b07d2c1c0ee68d133145e4e4047..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/lod_tensor.h
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#ifdef PADDLE_WITH_CUDA
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
-#endif
-
-#include <glog/logging.h>
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-/*
- * LoD is short for Level of Details.
- *
- * - in a level, each element indicates relative offset of the lower level
- * - the first element should be 0 and that indicates that this sequence start
- * from 0
- * - each sequence's begin and end(no-inclusive) is level[id, id+1]
- *
- * For example:
- *    3-level LoD stores
- *
- *    0 2 3
- *    0 2 4 7
- *    0 2 5 7 10 12 15 20
- */
-using LoD = std::vector<Vector<size_t>>;
-
-std::ostream& operator<<(std::ostream& os, const LoD& lod);
-std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
-
-std::string LoDToString(const LoD& lod);
-
-LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
-                 size_t elem_end);
-/*
- * Transform an LoD from relative offsets to absolute offsets.
- */
-LoD ToAbsOffset(const LoD& in);
-
-bool operator==(const LoD& a, const LoD& b);
-
-/*
- * Check whether this lod's format is valid.
- *
- * ATTENTION:
- *   - Empty lod is treated as valid.
- *
- * It will check two things:
- *
- *  1. all the offsets in a level should be non-descending.
- *  2. there should be more than 2 offsets existing in each level.
- *  3. the higher level's last offset should equals the lower level's size-1.
- *  4. the first offset(the begin offset) of each level should be 0.
- *  5. the lowest level's last offset should equals `tensor_height` if
- * tensor_height>0.
- */
-
-bool CheckLoD(const LoD& in, int tensor_height = -1);
-/*
- * Check whether this absolute lod's format is valid.
- *
- * ATTENTION:
- *   - Empty lod is treated as valid.
- *
- * It will check two things:
- *  1. all the offsets in a level should be ascending(no same items allowed).
- *  2. there should be more than 2 offsets existing in each level.
- *  3. the first offset of each level should be 0, and the last should be the
- *     same(the height of underlying tensor) or `tensor_height` if
- *     tensor_height>0.
- */
-bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
-
-/*
- * LoDTensor (Level of details Tensor)
- * see https://en.wikipedia.org/wiki/Level_of_details for reference.
- */
-class LoDTensor : public Tensor {
- public:
-  LoDTensor() : Tensor() {}
-
-  explicit LoDTensor(const LoD& lod) : lod_(lod) {}
-
-  void set_lod(const LoD& lod) { lod_ = lod; }
-
-  const LoD& lod() const { return lod_; }
-
-  LoD* mutable_lod() { return &lod_; }
-
-  /*
-   * Get the start offset and end offset of an  element from LoD.
-   */
-  std::pair<size_t, size_t> lod_element(size_t level, size_t elem) const {
-    PADDLE_ENFORCE_LT(level, NumLevels());
-    PADDLE_ENFORCE_LT(elem, NumElements(level));
-    return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]);
-  }
-
-  /*
-   * Number of LoDTensor's levels, each level has units of data, for example,
-   * in the sentence's view, article, paragraph, sentence are 3 levels.
-   */
-  size_t NumLevels() const { return lod_.size(); }
-  /*
-   * Number of elements in a level.
-   */
-  size_t NumElements(size_t level = 0) const {
-    PADDLE_ENFORCE_LT(level, NumLevels());
-    // the last offset is the end of last element
-    return (lod_)[level].size() - 1;
-  }
-
-  // Split LoDTensor and copy to each place specified in places.
-  std::vector<LoDTensor> SplitLoDTensor(
-      const std::vector<platform::Place> places) const;
-
-  void MergeLoDTensor(const std::vector<const LoDTensor*>& lod_tensors,
-                      platform::Place place);
-
- private:
-  LoD lod_;
-};
-
-/*
- * Expand the `source` to fit the LoD of `lod`. For example, a `source`
- * LoDTensor is
- *  - LoD: [0, 2]
- *  - tensor: [a0, a1]
- * a `lod` is
- *  - LoD: [0 3 5]
- * returns a new LoDTensor
- *  - [a0 a0 a0 a1 a1]
- */
-template <typename T>
-LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
-                    const platform::Place& place) {
-  LoD abs_lod = ToAbsOffset(lod);
-  const auto& lod_level = lod[level];
-  size_t num_instances = source.dims()[0];
-
-  // new tensor
-  LoDTensor tensor;
-  tensor.set_lod(lod);
-  auto dims = source.dims();
-  dims[0] = lod_level.back();
-  tensor.Resize(dims);
-  tensor.mutable_data<T>(place);
-
-  PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1);
-  for (size_t ins = 0; ins < num_instances; ins++) {
-    for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
-      auto slice = tensor.Slice(elem, elem + 1);
-      TensorCopy(source.Slice(ins, ins + 1), platform::CPUPlace(),
-                 platform::CPUDeviceContext(), &slice);
-    }
-  }
-  return tensor;
-}
-
-// Get the absolute offset of a lod[start_level][start_idx:end_idx] and
-// relative length of details for every levels(i.e., [start_level: ]).
-//
-// For example,
-//   lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]]
-//   start_level = 0
-//   start_idx = 1
-//   end_idx = 3
-//
-// Returns:
-//  LoD = [[1, 4], [2, 4, 2, 3, 2]]
-//  pair<size_t, size_t> = {11, 24}
-std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
-    const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
-
-void AppendLoD(LoD* lod, const LoD& lod_length);
-
-/*
- * Serialize/Desiralize LoDTensor to std::ostream
- * You can pass ofstream or ostringstream to serilize to file
- * or to a in memory string. GPU tensor will be copied to CPU.
- */
-void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
-                       const platform::DeviceContext& dev_ctx);
-void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
-                           const platform::DeviceContext& dev_ctx);
-
-/*
- * Convert between length-based LoD and offset-based LoD.
- * The implementation of LoDTensor class use offset-based LoD.
- * However, we want to expose the more user-friendly length-based
- * LoD to the Python side instead.
- *
- * Example:
- * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]]
- * then length_lod = [[2, 1], [3, 2, 4]]
- */
-LoD ConvertToLengthBasedLoD(const LoD& offset_lod);
-
-LoD ConvertToOffsetBasedLoD(const LoD& length_lod);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h
deleted file mode 100644
index 36a5c3c5d601390beedaf37ceb98ee2c63ecf5a6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-
-using LoDTensorArray = std::vector<LoDTensor>;
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
deleted file mode 100644
index c93c3f2673b1d80ef1e1a9dd68ad50501ba16f42..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ /dev/null
@@ -1,310 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <memory>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-
-TEST(LoD, PrintLoDTensor) {
-  LoDTensor tensor1;
-  tensor1.Resize({2});
-  tensor1.mutable_data<float>(platform::CPUPlace());
-  tensor1.data<float>()[0] = 0.2;
-  tensor1.data<float>()[1] = 0.5;
-  LOG(INFO) << tensor1;
-
-  LoDTensor tensor2;
-  tensor2.Resize({2});
-  tensor2.mutable_data<int64_t>(platform::CPUPlace());
-  tensor2.data<int64_t>()[0] = 1;
-  tensor2.data<int64_t>()[1] = 2;
-  LOG(INFO) << tensor2;
-}
-
-TEST(LoD, data) {
-  LoD lod{{0, 1, 2}};
-  lod.push_back({0, 2, 4, 5});
-  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
-
-  auto& v = lod[0];
-  for (size_t i = 0; i < v.size(); ++i) {
-    EXPECT_EQ(v[i], i);
-  }
-}
-
-TEST(LoD, ExpandLoD) {
-  LoD lod{{0, 2}};
-  LoDTensor tensor;
-  tensor.set_lod(lod);
-  tensor.Resize({2, 1});
-  tensor.mutable_data<float>(platform::CPUPlace());
-  tensor.data<float>()[0] = 0;
-  tensor.data<float>()[1] = 1;
-
-  LoD target;
-  target.emplace_back(std::vector<size_t>{0, 3, 5});
-  auto new_tensor = LodExpand<float>(tensor, target, 0UL, platform::CPUPlace());
-  std::vector<int> result{{0, 0, 0, 1, 1}};
-  for (size_t i = 0; i < 5; i++) {
-    ASSERT_EQ(new_tensor.data<float>()[i], result[i]);
-  }
-}
-
-TEST(LoD, GetFineGrainedLoDLength) {
-  LoD lod;
-  lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
-  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
-  lod.push_back(
-      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}));
-
-  auto lod_and_offset =
-      paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
-  LoD lod_length = lod_and_offset.first;
-  size_t start_offset = lod_and_offset.second.first;
-  size_t end_offset = lod_and_offset.second.second;
-
-  LoD expected;
-  expected.push_back(std::vector<size_t>{2});
-  expected.push_back(std::vector<size_t>{2, 2});
-  expected.push_back(std::vector<size_t>{2, 3, 4, 2});
-  EXPECT_EQ(lod_length, expected);
-  EXPECT_EQ(start_offset, 15UL);
-  EXPECT_EQ(end_offset, 26UL);
-}
-
-TEST(LoD, AppendLoD) {
-  LoD lod_lens;
-  lod_lens.push_back(std::vector<size_t>({2}));
-  lod_lens.push_back(std::vector<size_t>({2, 2}));
-  lod_lens.push_back(std::vector<size_t>({2, 3, 4, 2}));
-
-  LoD origin;
-  origin.push_back(std::vector<size_t>({0, 2}));
-  origin.push_back(std::vector<size_t>({0, 1, 6}));
-  origin.push_back(std::vector<size_t>({0, 2, 5, 7, 10, 12, 15}));
-
-  paddle::framework::AppendLoD(&origin, lod_lens);
-
-  LoD expected;
-  expected.push_back(std::vector<size_t>({0, 2, 4}));
-  expected.push_back(std::vector<size_t>({0, 1, 6, 8, 10}));
-  expected.push_back(
-      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26}));
-  EXPECT_EQ(origin, expected);
-}
-
-TEST(LoD, ToAbsOffset) {
-  LoD relative_lod;
-  relative_lod.push_back(std::vector<size_t>({0, 2}));
-  relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
-  relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
-
-  LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod);
-
-  LoD expected;
-  expected.push_back(std::vector<size_t>({0, 5}));
-  expected.push_back(std::vector<size_t>({0, 2, 5}));
-  expected.push_back(std::vector<size_t>({0, 2, 4, 5}));
-
-  EXPECT_EQ(abs_lod, expected);
-}
-
-TEST(LoD, SplitLoDTensor) {
-  LoD lod;
-  lod.push_back(std::vector<size_t>({0, 2, 4, 5, 6}));
-  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 13, 15, 20}));
-
-  platform::CPUPlace place;
-  LoDTensor lod_tensor;
-  lod_tensor.Resize({20, 1});
-  float* dst_ptr = lod_tensor.mutable_data<float>(place);
-  for (int i = 0; i < lod_tensor.numel(); ++i) {
-    dst_ptr[i] = i;
-  }
-  lod_tensor.set_lod(lod);
-
-  std::vector<platform::Place> places{platform::CPUPlace(),
-                                      platform::CPUPlace()};
-  LoD lod0;
-  lod0.push_back(std::vector<size_t>({0, 2, 4}));
-  lod0.push_back(std::vector<size_t>({0, 1, 6, 8, 13}));
-  LoD lod1;
-  lod1.push_back(std::vector<size_t>({0, 1, 2}));
-  lod1.push_back(std::vector<size_t>({0, 2, 7}));
-
-  auto lods = lod_tensor.SplitLoDTensor(places);
-  EXPECT_EQ(lods[0].lod(), lod0);
-  EXPECT_EQ(lods[1].lod(), lod1);
-}
-
-TEST(LoD, SplitLoDTensorWithZeroBatchSize) {
-  LoD lod;
-  lod.push_back(std::vector<size_t>({0}));
-
-  platform::CPUPlace place;
-  LoDTensor lod_tensor;
-  lod_tensor.Resize({0, 5});
-  lod_tensor.mutable_data<float>(place);
-  lod_tensor.set_lod(lod);
-
-  std::vector<platform::Place> places{platform::CPUPlace(),
-                                      platform::CPUPlace()};
-  LoD lod_res;
-  lod_res.push_back(std::vector<size_t>({0}));
-
-  auto lods = lod_tensor.SplitLoDTensor(places);
-  EXPECT_EQ(lods[0].lod(), lod_res);
-  EXPECT_EQ(lods[1].lod(), lod_res);
-}
-
-TEST(LoD, MergeLoDTensor) {
-  LoD lod;
-  lod.push_back(std::vector<size_t>({0, 2, 4, 5, 6}));
-  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 13, 15, 20}));
-
-  platform::CPUPlace place;
-
-  LoDTensor lod_tensor0;
-  LoD lod0;
-  lod0.push_back(std::vector<size_t>({0, 2, 4}));
-  lod0.push_back(std::vector<size_t>({0, 1, 6, 8, 13}));
-  lod_tensor0.set_lod(lod0);
-
-  lod_tensor0.Resize({13, 1});
-  float* dst_ptr = lod_tensor0.mutable_data<float>(place);
-  for (int i = 0; i < lod_tensor0.numel(); ++i) {
-    dst_ptr[i] = i;
-  }
-
-  LoDTensor lod_tensor1;
-  LoD lod1;
-  lod1.push_back(std::vector<size_t>({0, 1, 2}));
-  lod1.push_back(std::vector<size_t>({0, 2, 7}));
-  lod_tensor1.set_lod(lod1);
-  lod_tensor1.Resize({7, 1});
-  dst_ptr = lod_tensor1.mutable_data<float>(place);
-  for (int i = 0; i < lod_tensor1.numel(); ++i) {
-    dst_ptr[i] = i;
-  }
-
-  LoDTensor lod_tensor2;
-  LoD lod2;
-  lod2.push_back(std::vector<size_t>({0}));
-  lod2.push_back(std::vector<size_t>({0}));
-  lod_tensor2.set_lod(lod2);
-  lod_tensor2.Resize({0});
-  dst_ptr = lod_tensor2.mutable_data<float>(place);
-
-  std::vector<const LoDTensor*> lods{&lod_tensor0, &lod_tensor1, &lod_tensor2};
-
-  LoDTensor lod_tensor;
-  lod_tensor.MergeLoDTensor(lods, place);
-  EXPECT_EQ(lod_tensor.lod(), lod);
-}
-
-TEST(LoD, CheckLoD) {
-  LoD relative_lod;
-  relative_lod.push_back(std::vector<size_t>({0, 2}));
-  relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
-  relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
-
-  // check compatible
-  ASSERT_TRUE(CheckLoD(relative_lod));
-  relative_lod[1].back()++;
-  ASSERT_FALSE(CheckLoD(relative_lod));
-  relative_lod[1].back()--;  // recover it
-
-  // check empty
-  LoD empty_lod;
-  ASSERT_TRUE(CheckLoD(empty_lod));
-
-  // check less than 2 offsets in a level
-  LoD some_lod0;
-  some_lod0.push_back(std::vector<size_t>({0}));
-  ASSERT_FALSE(CheckLoD(some_lod0));
-
-  // check with underlying tensor storage.
-  ASSERT_TRUE(CheckLoD(relative_lod, 5));
-  ASSERT_FALSE(CheckLoD(relative_lod, 9));
-
-  // check whether lod is ascending-sorted (allow same items)
-  ASSERT_TRUE(CheckLoD({{0, 1, 2, 3, 4, 5}}, 5));
-  ASSERT_TRUE(CheckLoD({{0, 1, 3, 3, 4, 5}}, 5));
-  ASSERT_FALSE(CheckLoD({{0, 1, 3, 2, 5}}, 5));
-}
-
-TEST(LoD, CheckAbsLoD) {
-  LoD relative_lod;
-  relative_lod.push_back(std::vector<size_t>({0, 2}));
-  relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
-  relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
-
-  auto abs_lod = ToAbsOffset(relative_lod);
-
-  ASSERT_TRUE(CheckAbsLoD(abs_lod));
-
-  // check less than 2 offsets in a level.
-
-  // check the last item should be compatible with tensor height.
-  abs_lod.back().back()++;
-  ASSERT_FALSE(CheckAbsLoD(abs_lod));
-  abs_lod.back().back()--;  // restore
-
-  // check less than 2 offsets in a lod.
-  LoD abs_lod0;
-  abs_lod0.push_back(std::vector<size_t>({0}));
-  ASSERT_FALSE(CheckAbsLoD(abs_lod0));
-}
-
-TEST(LoD, ConvertToLengthBasedLoD) {
-  LoD offset_lod;
-  offset_lod.push_back(std::vector<size_t>({0, 2}));
-  offset_lod.push_back(std::vector<size_t>({0, 1, 3}));
-  offset_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
-
-  LoD length_lod = ConvertToLengthBasedLoD(offset_lod);
-
-  LoD expected;
-  expected.push_back(std::vector<size_t>({2}));
-  expected.push_back(std::vector<size_t>({1, 2}));
-  expected.push_back(std::vector<size_t>({2, 2, 1}));
-
-  EXPECT_EQ(length_lod, expected);
-}
-
-TEST(LoD, ConvertToOffsetBasedLoD) {
-  LoD length_lod;
-  length_lod.push_back(std::vector<size_t>({2}));
-  length_lod.push_back(std::vector<size_t>({1, 2}));
-  length_lod.push_back(std::vector<size_t>({2, 2, 1}));
-
-  LoD offset_lod = ConvertToOffsetBasedLoD(length_lod);
-
-  LoD expected;
-  expected.push_back(std::vector<size_t>({0, 2}));
-  expected.push_back(std::vector<size_t>({0, 1, 3}));
-  expected.push_back(std::vector<size_t>({0, 2, 4, 5}));
-
-  EXPECT_EQ(offset_lod, expected);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu
deleted file mode 100644
index 7d6ba984f6fe0385b81e320c8a5a162210e33e83..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ /dev/null
@@ -1,71 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <stdio.h>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-
-__global__ void test(size_t* a, int size) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
-       i += blockDim.x * gridDim.x) {
-    a[i] *= 2;
-  }
-}
-
-TEST(LoD, data) {
-  paddle::framework::InitDevices(true);
-
-  paddle::framework::LoD lod{{0, 1, 2}};
-  lod.push_back({0, 2, 4, 5});
-  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
-
-  auto& v = lod[0];
-  paddle::platform::CUDAPlace gpu(0);
-  test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size());
-  cudaDeviceSynchronize();
-  for (size_t i = 0; i < v.size(); ++i) {
-    EXPECT_EQ(v[i], i * 2);
-  }
-}
-
-TEST(LoDTensor, LoDInGPU) {
-  paddle::framework::InitDevices(true);
-
-  paddle::framework::LoDTensor lod_tensor;
-  paddle::platform::CUDAPlace place(0);
-
-  paddle::framework::LoD src_lod;
-  src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
-
-  lod_tensor.Resize({14, 16});
-  lod_tensor.mutable_data<float>(place);
-
-  lod_tensor.set_lod(src_lod);
-  EXPECT_EQ(lod_tensor.lod_element(0, 2).first, 4UL);
-  EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL);
-
-  auto lod = lod_tensor.lod();
-
-  test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size());
-  cudaDeviceSynchronize();
-
-  for (size_t i = 0; i < src_lod[0].size(); ++i) {
-    EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
-  }
-}
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
deleted file mode 100644
index 01ba743b06dd57157cceb3fc544d11ba1068dea1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/mixed_vector.h
+++ /dev/null
@@ -1,537 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <initializer_list>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/cow_ptr.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/memory/memcpy.h"
-
-#include "glog/logging.h"
-
-namespace paddle {
-namespace framework {
-
-#if defined(PADDLE_WITH_CUDA)
-// Vector<T> implements the std::vector interface, and can get Data or
-// MutableData from any place. The data will be synced implicitly inside.
-template <typename T>
-class Vector {
- public:
-  using value_type = T;
-  using iterator = typename std::vector<T>::iterator;
-  using const_iterator = typename std::vector<T>::const_iterator;
-
- private:
-  // The actual class to implement vector logic
-  class VectorData {
-   public:
-    VectorData() : flag_(kDataInCPU) {}
-    VectorData(size_t count, const T &value)
-        : cpu_(count, value), flag_(kDataInCPU) {}
-    VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
-    template <typename U>
-    explicit VectorData(const std::vector<U> &dat)
-        : cpu_(dat), flag_(kDataInCPU) {}
-    ~VectorData() {}
-
-    VectorData(const VectorData &o) {
-      o.ImmutableCPU();
-      cpu_ = o.cpu_;
-      flag_ = kDataInCPU;
-    }
-
-    VectorData &operator=(const VectorData &o) {
-      o.ImmutableCPU();
-      cpu_ = o.cpu_;
-      flag_ = kDataInCPU;
-      return *this;
-    }
-
-    T &operator[](size_t i) {
-      MutableCPU();
-      return cpu_[i];
-    }
-
-    const T &operator[](size_t i) const {
-      ImmutableCPU();
-      return cpu_[i];
-    }
-
-    size_t size() const { return cpu_.size(); }
-
-    iterator begin() {
-      MutableCPU();
-      return cpu_.begin();
-    }
-
-    iterator end() {
-      MutableCPU();
-      return cpu_.end();
-    }
-
-    T &front() {
-      MutableCPU();
-      return cpu_.front();
-    }
-
-    T &back() {
-      MutableCPU();
-      return cpu_.back();
-    }
-
-    const_iterator begin() const {
-      ImmutableCPU();
-      return cpu_.begin();
-    }
-
-    const_iterator end() const {
-      ImmutableCPU();
-      return cpu_.end();
-    }
-
-    const T &back() const {
-      ImmutableCPU();
-      return cpu_.back();
-    }
-
-    T *data() { return &(*this)[0]; }
-
-    const T *data() const { return &(*this)[0]; }
-
-    const T &front() const {
-      ImmutableCPU();
-      return cpu_.front();
-    }
-
-    // assign this from iterator.
-    // NOTE: the iterator must support `end-begin`
-    template <typename Iter>
-    void assign(Iter begin, Iter end) {
-      MutableCPU();
-      cpu_.assign(begin, end);
-    }
-
-    // push_back. If the previous capacity is not enough, the memory will
-    // double.
-    void push_back(T elem) {
-      MutableCPU();
-      cpu_.push_back(elem);
-    }
-
-    // extend a vector by iterator.
-    // NOTE: the iterator must support end-begin
-    template <typename It>
-    void Extend(It begin, It end) {
-      MutableCPU();
-      auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
-      std::copy(begin, end, out_it);
-    }
-
-    // resize the vector
-    void resize(size_t size) {
-      MutableCPU();
-      cpu_.resize(size);
-    }
-
-    // get cuda ptr. immutable
-    const T *CUDAData(platform::Place place) const {
-      PADDLE_ENFORCE(platform::is_gpu_place(place),
-                     "CUDA Data must on CUDA place");
-      ImmutableCUDA(place);
-      return reinterpret_cast<T *>(gpu_->ptr());
-    }
-
-    // get cuda ptr. mutable
-    T *CUDAMutableData(platform::Place place) {
-      const T *ptr = CUDAData(place);
-      flag_ = kDirty | kDataInCUDA;
-      return const_cast<T *>(ptr);
-    }
-
-    // clear
-    void clear() {
-      cpu_.clear();
-      flag_ = kDirty | kDataInCPU;
-    }
-
-    size_t capacity() const { return cpu_.capacity(); }
-
-    // reserve data
-    void reserve(size_t size) const { cpu_.reserve(size); }
-
-    // implicit cast operator. Vector can be cast to std::vector implicitly.
-    operator std::vector<T>() const {
-      ImmutableCPU();
-      return cpu_;
-    }
-
-    bool operator==(const VectorData &other) const {
-      ImmutableCPU();
-      other.ImmutableCPU();
-      return cpu_ == other.cpu_;
-    }
-
-    std::mutex &Mutex() const { return mtx_; }
-
-    boost::optional<platform::CUDAPlace> CUDAPlace() const {
-      return gpu_ == nullptr
-                 ? boost::none
-                 : boost::optional<platform::CUDAPlace>(
-                       boost::get<platform::CUDAPlace>(gpu_->place()));
-    }
-
-   private:
-    enum DataFlag {
-      kDataInCPU = 0x01,
-      kDataInCUDA = 0x02,
-      // kDirty means the data has been changed in one device.
-      kDirty = 0x10
-    };
-
-    void CopyToCPU() const {
-      // COPY GPU Data To CPU
-      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(gpu_->place()));
-      auto stream = dev_ctx->stream();
-      void *src = gpu_->ptr();
-      void *dst = cpu_.data();
-      paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
-                           gpu_memory_size_, stream);
-      dev_ctx->Wait();
-    }
-
-    void MutableCPU() {
-      if (IsInCUDA() && IsDirty()) {
-        CopyToCPU();
-      }
-      flag_ = kDirty | kDataInCPU;
-    }
-
-    void ImmutableCUDA(platform::Place place) const {
-      if (IsDirty()) {
-        if (IsInCPU()) {
-          CopyCPUDataToCUDA(place);
-          UnsetFlag(kDirty);
-          SetFlag(kDataInCUDA);
-        } else if (IsInCUDA() && !(place == gpu_->place())) {
-          PADDLE_THROW("This situation should not happen");
-          // Still dirty
-        } else {
-          // Dirty && DataInCUDA && Device is same
-          // Do nothing
-        }
-      } else {
-        if (!IsInCUDA()) {
-          // Even data is not dirty. However, data is not in CUDA. Copy data.
-          CopyCPUDataToCUDA(place);
-          SetFlag(kDataInCUDA);
-        } else if (!(place == gpu_->place())) {
-          PADDLE_THROW("This situation should not happen.");
-        } else {
-          // Not Dirty && DataInCUDA && Device is same
-          // Do nothing.
-        }
-      }
-    }
-
-    void CopyCPUDataToCUDA(const platform::Place &place) const {
-      void *src = cpu_.data();
-      gpu_memory_size_ = cpu_.size() * sizeof(T);
-      gpu_ = memory::Alloc(place, gpu_memory_size_);
-      void *dst = gpu_->ptr();
-      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(place));
-      auto stream = dev_ctx->stream();
-      paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
-                           gpu_memory_size_, stream);
-    }
-
-    void ImmutableCPU() const {
-      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
-                                      // CPU has no data.
-        CopyToCPU();
-        UnsetFlag(kDirty);
-      }
-      SetFlag(kDataInCPU);
-    }
-
-    void UnsetFlag(int flag) const { flag_ &= ~flag; }
-    void SetFlag(int flag) const { flag_ |= flag; }
-
-    bool IsDirty() const { return flag_ & kDirty; }
-
-    bool IsInCUDA() const { return flag_ & kDataInCUDA; }
-
-    bool IsInCPU() const { return flag_ & kDataInCPU; }
-
-    mutable std::vector<T> cpu_;
-    mutable paddle::memory::AllocationPtr gpu_;
-    mutable size_t gpu_memory_size_{0};
-    mutable int flag_;
-
-    mutable std::mutex mtx_;
-  };
-
- public:
-  // Default ctor. Create empty Vector
-  Vector() : m_(new VectorData()) {}
-
-  // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T &value = T())
-      : m_(new VectorData(count, value)) {}
-
-  // Ctor with init_list
-  Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
-
-  // implicit cast from std::vector.
-  template <typename U>
-  Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) {  // NOLINT
-  }
-
-  // Copy ctor
-  Vector(const Vector<T> &other) { m_ = other.m_; }
-
-  // Copy operator
-  Vector<T> &operator=(const Vector<T> &other) {
-    m_ = other.m_;
-    return *this;
-  }
-
-  // Move ctor
-  Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
-
-  // CPU data access method. Mutable.
-  T &operator[](size_t i) { return (*m_.MutableData())[i]; }
-
-  // CPU data access method. Immutable.
-  const T &operator[](size_t i) const { return m_.Data()[i]; }
-
-  // std::vector iterator methods. Based on CPU data access method
-  size_t size() const { return m_.Data().size(); }
-
-  iterator begin() { return m_.MutableData()->begin(); }
-
-  iterator end() { return m_.MutableData()->end(); }
-
-  T &front() { return m_.MutableData()->front(); }
-
-  T &back() { return m_.MutableData()->back(); }
-
-  const_iterator begin() const { return m_.Data().begin(); }
-
-  const_iterator end() const { return m_.Data().end(); }
-
-  const_iterator cbegin() const { return begin(); }
-
-  const_iterator cend() const { return end(); }
-
-  const T &back() const { return m_.Data().back(); }
-
-  T *data() { return m_.MutableData()->data(); }
-
-  const T *data() const { return m_.Data().data(); }
-
-  const T &front() const { return m_.Data().front(); }
-  // end of std::vector iterator methods
-
-  // assign this from iterator.
-  // NOTE: the iterator must support `end-begin`
-  template <typename Iter>
-  void assign(Iter begin, Iter end) {
-    m_.MutableData()->assign(begin, end);
-  }
-
-  // push_back. If the previous capacity is not enough, the memory will
-  // double.
-  void push_back(T elem) { m_.MutableData()->push_back(elem); }
-
-  // extend a vector by iterator.
-  // NOTE: the iterator must support end-begin
-  template <typename It>
-  void Extend(It begin, It end) {
-    m_.MutableData()->Extend(begin, end);
-  }
-
-  // resize the vector
-  void resize(size_t size) {
-    if (m_.Data().size() != size) {
-      m_.MutableData()->resize(size);
-    }
-  }
-
-  // get cuda ptr. immutable
-  const T *CUDAData(platform::Place place) const {
-    {
-      auto &mtx = m_.Data().Mutex();
-      std::lock_guard<std::mutex> guard(mtx);
-      auto cuda_place = m_.Data().CUDAPlace();
-      if (cuda_place == boost::none ||
-          cuda_place == boost::get<platform::CUDAPlace>(place)) {
-        return m_.Data().CUDAData(place);
-      }
-    }
-    // If m_ contains CUDAData in a different place. Detach manually.
-    m_.Detach();
-    return CUDAData(place);
-  }
-
-  // get cuda ptr. mutable
-  T *CUDAMutableData(platform::Place place) {
-    {
-      auto &mtx = m_.Data().Mutex();
-      std::lock_guard<std::mutex> guard(mtx);
-      auto cuda_place = m_.Data().CUDAPlace();
-      if (cuda_place == boost::none ||
-          cuda_place == boost::get<platform::CUDAPlace>(place)) {
-        return m_.MutableData()->CUDAMutableData(place);
-      }
-    }
-    // If m_ contains CUDAData in a different place. Detach manually.
-    m_.Detach();
-    return CUDAMutableData(place);
-  }
-
-  // clear
-  void clear() { m_.MutableData()->clear(); }
-
-  size_t capacity() const { return m_.Data().capacity(); }
-
-  // reserve data
-  void reserve(size_t size) { m_.Data().reserve(size); }
-
-  // the unify method to access CPU or CUDA data. immutable.
-  const T *Data(platform::Place place) const {
-    if (platform::is_gpu_place(place)) {
-      return CUDAData(place);
-    } else {
-      return data();
-    }
-  }
-
-  // the unify method to access CPU or CUDA data. mutable.
-  T *MutableData(platform::Place place) {
-    if (platform::is_gpu_place(place)) {
-      return CUDAMutableData(place);
-    } else {
-      return data();
-    }
-  }
-
-  // implicit cast operator. Vector can be cast to std::vector implicitly.
-  operator std::vector<T>() const { return m_.Data(); }
-
-  bool operator==(const Vector<T> &other) const {
-    if (size() != other.size()) return false;
-    auto it1 = cbegin();
-    auto it2 = other.cbegin();
-    for (; it1 < cend(); ++it1, ++it2) {
-      if (*it1 != *it2) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  const void *Handle() const { return &m_.Data(); }
-
- private:
-  // Vector is an COW object.
-  mutable details::COWPtr<VectorData> m_;
-};
-
-#else  // PADDLE_WITH_CUDA
-
-template <typename T>
-class CPUVector : public std::vector<T, std::allocator<T>> {
- public:
-  CPUVector() : std::vector<T>() {}
-  CPUVector(size_t count, const T &value = T())  // NOLINT
-      : std::vector<T>(count, value) {}
-  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
-  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}  // NOLINT
-  CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
-  CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
-  CPUVector(std::vector<T> &&other)  // NOLINT
-      : std::vector<T>(std::move(other)) {}
-  CPUVector &operator=(const CPUVector &other) {
-    this->assign(other.begin(), other.end());
-    return *this;
-  }
-  CPUVector &operator=(const std::vector<T> &other) {
-    this->assign(other.begin(), other.end());
-    return *this;
-  }
-
-  friend std::ostream &operator<<(std::ostream &os, const CPUVector<T> &other) {
-    std::stringstream ss;
-    for (auto v : other) {
-      os << v << " ";
-    }
-    return os;
-  }
-
-  T &operator[](size_t id) { return this->at(id); }
-
-  const T &operator[](size_t id) const { return this->at(id); }
-
-  template <typename D>
-  void Extend(const D &begin, const D &end) {
-    this->reserve(this->size() + size_t(end - begin));
-    this->insert(this->end(), begin, end);
-  }
-
-  const T *CUDAData(platform::Place place) const {
-    PADDLE_THROW(
-        "Vector::CUDAData() method is not supported in CPU-only version");
-  }
-
-  T *CUDAMutableData(platform::Place place) {
-    PADDLE_THROW(
-        "Vector::CUDAMutableData() method is not supported in CPU-only "
-        "version");
-  }
-
-  const T *Data(platform::Place place) const {
-    PADDLE_ENFORCE(
-        platform::is_cpu_place(place),
-        "Vector::Data() method is not supported when not in CPUPlace");
-    return this->data();
-  }
-
-  T *MutableData(platform::Place place) {
-    PADDLE_ENFORCE(
-        platform::is_cpu_place(place),
-        "Vector::MutableData() method is not supported when not in CPUPlace");
-    return this->data();
-  }
-
-  const void *Handle() const { return static_cast<const void *>(this); }
-};
-
-template <typename T>
-using Vector = CPUVector<T>;
-
-#endif  // PADDLE_WITH_CUDA
-
-};  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc
deleted file mode 100644
index 0599c8d384641606b0a5ebb5ba1781b56f539e63..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/mixed_vector_test.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <memory>
-
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/mixed_vector.h"
-
-template <typename T>
-using vec = paddle::framework::Vector<T>;
-
-TEST(mixed_vector, CPU_VECTOR) {
-  vec<int> tmp;
-  for (int i = 0; i < 10; ++i) {
-    tmp.push_back(i);
-  }
-  ASSERT_EQ(tmp.size(), 10UL);
-  vec<int> tmp2;
-  tmp2 = tmp;
-  ASSERT_EQ(tmp2.size(), 10UL);
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(tmp2[i], i);
-    ASSERT_EQ(tmp2[i], tmp[i]);
-  }
-  int cnt = 0;
-  for (auto& t : tmp2) {
-    ASSERT_EQ(t, cnt);
-    ++cnt;
-  }
-}
-
-TEST(mixed_vector, InitWithCount) {
-  paddle::framework::Vector<int> vec(10, 10);
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(vec[i], 10);
-  }
-}
-
-TEST(mixed_vector, ForEach) {
-  vec<int> tmp;
-  for (auto& v : tmp) {
-    VLOG(3) << v;
-  }
-}
-
-TEST(mixed_vector, Reserve) {
-  paddle::framework::Vector<int> vec;
-  vec.reserve(1);
-  vec.push_back(0);
-  vec.push_back(0);
-  vec.push_back(0);
-}
-
-TEST(mixed_vector, Resize) {
-  paddle::framework::Vector<int> vec;
-  vec.resize(1);
-  vec.push_back(0);
-  vec.push_back(0);
-  vec.push_back(0);
-}
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
deleted file mode 100644
index 4b0caa8d350dde0462e5fdcca743df919358a364..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <cuda_runtime.h>
-#include <memory>
-
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-template <typename T>
-using vec = paddle::framework::Vector<T>;
-
-static __global__ void multiply_10(int* ptr) {
-  for (int i = 0; i < 10; ++i) {
-    ptr[i] *= 10;
-  }
-}
-
-cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
-  return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
-             paddle::platform::DeviceContextPool::Instance().Get(place))
-      ->stream();
-}
-
-TEST(mixed_vector, GPU_VECTOR) {
-  vec<int> tmp;
-  for (int i = 0; i < 10; ++i) {
-    tmp.push_back(i);
-  }
-  ASSERT_EQ(tmp.size(), 10UL);
-  paddle::platform::CUDAPlace gpu(0);
-
-  multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
-
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(tmp[i], i * 10);
-  }
-}
-
-TEST(mixed_vector, MultiGPU) {
-  if (paddle::platform::GetCUDADeviceCount() < 2) {
-    LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple "
-                    "GPUs in your machine.";
-    return;
-  }
-
-  vec<int> tmp;
-  for (int i = 0; i < 10; ++i) {
-    tmp.push_back(i);
-  }
-  ASSERT_EQ(tmp.size(), 10UL);
-  paddle::platform::CUDAPlace gpu0(0);
-  paddle::platform::SetDeviceId(0);
-  multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
-  paddle::platform::CUDAPlace gpu1(1);
-  auto* gpu1_ptr = tmp.MutableData(gpu1);
-  paddle::platform::SetDeviceId(1);
-  multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr);
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(tmp[i], i * 100);
-  }
-}
diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc
deleted file mode 100644
index be25672b4c7d29bc3bb7eca039a3c735994f0777..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/multi_trainer.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/trainer.h"
-
-namespace paddle {
-namespace framework {
-
-void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
-                              Dataset* dataset) {
-  thread_num_ = trainer_desc.thread_num();
-  for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
-       i++) {
-    need_merge_var_names_.push_back(
-        trainer_desc.downpour_param().stat_var_names(i));
-  }
-  SetDataset(dataset);
-  // get filelist from trainer_desc here
-  const std::vector<paddle::framework::DataFeed*> readers =
-      dataset->GetReaders();
-  VLOG(3) << "readers num: " << readers.size();
-  // change thread num to readers num
-  thread_num_ = readers.size();
-  VLOG(3) << "worker thread num: " << thread_num_;
-  workers_.resize(thread_num_);
-  for (int i = 0; i < thread_num_; ++i) {
-    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
-        trainer_desc.device_worker_name());
-    workers_[i]->Initialize(trainer_desc);
-    workers_[i]->SetDeviceIndex(i);
-    workers_[i]->SetDataFeed(readers[i]);
-  }
-
-  // set debug here
-  SetDebug(trainer_desc.debug());
-}
-
-// call only after all resources are set in current trainer
-void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
-                                  const platform::Place& place) {
-  for (int i = 0; i < thread_num_; ++i) {
-    workers_[i]->SetPlace(place);
-    workers_[i]->SetReaderPlace(place);
-    workers_[i]->SetRootScope(root_scope_);
-    workers_[i]->CreateDeviceResource(main_program);  // Program
-    workers_[i]->BindingDataFeedMemory();
-  }
-}
-
-void MultiTrainer::Run() {
-  VLOG(3) << "Going to run";
-  for (int thidx = 0; thidx < thread_num_; ++thidx) {
-    if (!debug_) {
-      threads_.push_back(
-          std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get()));
-    } else {
-      threads_.push_back(std::thread(&DeviceWorker::TrainFilesWithProfiler,
-                                     workers_[thidx].get()));
-    }
-  }
-}
-
-void MultiTrainer::Finalize() {
-  for (auto& th : threads_) {
-    th.join();
-  }
-  root_scope_->DropKids();
-}
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
deleted file mode 100644
index a37bb6f4da1fc3baffad36c13c690c6410ac4270..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/naive_executor.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace framework {
-void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
-                            int block_id, bool with_feed_fetch_ops) {
-  if (!scope) {
-    scope_ = new framework::Scope;
-  } else {
-    scope_ = scope;
-  }
-
-  VLOG(3) << "NaiveExecutor init with scope " << scope;
-  CreateOps(program_desc, block_id, with_feed_fetch_ops);
-}
-
-void NaiveExecutor::Run() {
-#ifndef PADDLE_ON_INFERENCE
-  LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the "
-                             "cmake flag ON_INFER is not set.";
-  LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
-                             "variables will be reused to save the allocation "
-                             "overhead.";
-  LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
-                             "setting the cmake flag ON_INFER=ON if you are "
-                             "running Paddle Inference";
-#endif  // PADDLE_ON_INFERENCE
-  for (auto &op : ops_) {
-    VLOG(4) << std::this_thread::get_id() << " run "
-            << op->DebugStringEx(scope_) << " on scope " << scope_;
-    op->SetIsCalledByExecutor(false);
-    op->Run(*scope_, place_);
-  }
-}
-
-void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
-                                    bool persistable, Scope *scope) {
-  PADDLE_ENFORCE_NOT_NULL(scope);
-
-  auto &global_block = desc.Block(block_id);
-
-  const auto *anc = scope;
-  PADDLE_ENFORCE(anc->parent() != anc);
-  while (anc->parent()) {
-    anc = anc->parent();
-  }
-
-  int num_vars = 0;
-  for (auto &var : global_block.AllVars()) {
-    if (var->Name() == framework::kEmptyVarName) {
-      continue;
-    }
-    num_vars++;
-
-    if (persistable == var->Persistable()) {
-      if (persistable) {
-        if (!anc->FindVar(var->Name())) {
-          auto *ptr = const_cast<Scope *>(anc)->Var(var->Name());
-          VLOG(3) << scope << " Create persistable variable " << var->Name()
-                  << ", which pointer is " << ptr;
-          InitializeVariable(ptr, var->GetType());
-        }
-      } else {
-        auto *ptr = const_cast<Scope *>(scope)->Var(var->Name());
-        VLOG(3) << scope << " Create variable " << var->Name()
-                << ", which pointer is " << ptr;
-        InitializeVariable(ptr, var->GetType());
-      }
-    }
-  }
-  VLOG(4) << "naive executor create " << num_vars << " vars";
-}
-
-void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
-                              bool with_feed_fetch_ops) {
-  for (const auto &op_desc : desc.Block(block_id).AllOps()) {
-    if (!with_feed_fetch_ops &&
-        (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
-      string::PrettyLogEndl(string::Style::detail(), "---  skip [%s], %s -> %s",
-                            op_desc->Input("X")[0], op_desc->Type(),
-                            op_desc->Output("Out")[0]);
-      continue;
-    }
-    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
-  }
-}
-
-LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
-  PADDLE_ENFORCE(scope_, "Need to init scope first");
-  auto *var = scope_->FindVar(name);
-  PADDLE_ENFORCE(var, "No variable [%s] in the scope");
-  auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
-  return tensor;
-}
-
-void NaiveExecutor::CleanFeedFetchOps() {
-  std::vector<std::unique_ptr<OperatorBase>> ops;
-  for (auto &op : ops_) {
-    if (op->Type() != "feed" && op->Type() != "fetch") {
-      ops.emplace_back(std::move(op));
-    }
-  }
-  ops_.swap(ops);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
deleted file mode 100644
index 5e673f68574c4ddaa4c9260367d09e9f62f6b751..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/naive_executor.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-
-/*
- * Simple, intuitive and effective. Only single thread is supported, and
- * currently designed for inference.
- */
-class NaiveExecutor {
- public:
-  explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
-
-  // Create child scope.
-  // Create variables.
-  // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
-  void Prepare(Scope* scope, const ProgramDesc& program_desc, int block_id,
-               bool with_feed_fetch_ops);
-
-  // Create variables before head.
-  // Create parameters if persistable is ture, or create the temporary variables
-  // instead.
-  void CreateVariables(const ProgramDesc& desc, int block_id, bool persistable,
-                       Scope* scope);
-
-  // Run all the operators.
-  void Run();
-
-  // Get an tensor to operating directly, without the need for feed_ops.
-  LoDTensor* FindTensor(const std::string& name);
-
-  Scope* scope() { return scope_; }
-
-  void CleanFeedFetchOps();
-
- protected:
-  void CreateOps(const ProgramDesc& desc, int block_id,
-                 bool with_feed_fetch_ops);
-
- private:
-  const platform::Place place_;
-  // Catch the required resource to avoid recreate.
-  std::vector<std::unique_ptr<OperatorBase>> ops_;
-  Scope* scope_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc
deleted file mode 100644
index c917630666b082ab7148550707f9f1f720aa25d3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/naive_executor_test.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/naive_executor.h"
-#include <gtest/gtest.h>
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-
-TEST(NaiveExecutor, Basic) {
-  ProgramDesc program;
-  auto* main_block = program.MutableBlock(0);
-  auto* a = main_block->Var("a");  // input
-  auto* b = main_block->Var("b");  // input
-  auto* c = main_block->Var("c");  // input
-  a->SetType(proto::VarType::LOD_TENSOR);
-  b->SetType(proto::VarType::LOD_TENSOR);
-  c->SetType(proto::VarType::LOD_TENSOR);
-
-  auto* add = main_block->AppendOp();
-  add->SetType("elementwise_add");
-  add->SetInput("X", {"a"});
-  add->SetInput("Y", {"b"});
-  add->SetOutput("Out", {"c"});
-
-  auto place = platform::CPUPlace();
-  NaiveExecutor exe(place);
-  exe.Prepare(nullptr, program, 0, false);
-  auto* a_tensor = exe.FindTensor("a");
-  auto* b_tensor = exe.FindTensor("b");
-  auto* c_tensor = exe.FindTensor("c");
-
-  a_tensor->Resize({1, 4});
-  b_tensor->Resize({1, 4});
-  c_tensor->Resize({1, 4});
-  b_tensor->mutable_data<float>(place);
-  a_tensor->mutable_data<float>(place);
-
-  float a_arr[] = {0, 1, 2, 3};
-  float b_arr[] = {0.0, .1, .2, .3};
-
-  std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
-  std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));
-
-  exe.Run();
-
-  auto* c_data = c_tensor->mutable_data<float>(place);
-  for (int i = 0; i < 4; i++) {
-    EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
-
-USE_OP(elementwise_add);
diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.h b/paddle/fluid/framework/no_need_buffer_vars_inference.h
deleted file mode 100644
index a63575611bced370d04b3847c7eb3500cb37bad8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/no_need_buffer_vars_inference.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/op_desc.h"
-
-namespace paddle {
-namespace framework {
-
-class NoNeedBufferVarsInference {
- public:
-  NoNeedBufferVarsInference(const VariableNameMap &inputs,
-                            const VariableNameMap &outputs,
-                            const AttributeMap &attrs)
-      : inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
-
-  virtual ~NoNeedBufferVarsInference() = default;
-
-  const VariableNameMap &Inputs() const { return inputs_; }
-
-  const VariableNameMap &Outputs() const { return outputs_; }
-
-  const AttributeMap &Attrs() const { return attrs_; }
-
-  virtual std::unordered_set<std::string> operator()() const = 0;
-
- private:
-  const VariableNameMap &inputs_;
-  const VariableNameMap &outputs_;
-  const AttributeMap &attrs_;
-};
-
-#define DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(class_type, ...)  \
-  class class_type final                                        \
-      : public ::paddle::framework::NoNeedBufferVarsInference { \
-   public:                                                      \
-    using ::paddle::framework::NoNeedBufferVarsInference::      \
-        NoNeedBufferVarsInference;                              \
-                                                                \
-    std::unordered_set<std::string> operator()() const final {  \
-      return {__VA_ARGS__};                                     \
-    }                                                           \
-  }
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_call_stack.cc b/paddle/fluid/framework/op_call_stack.cc
deleted file mode 100644
index cf3b7188acb38b991297f52ddee652e79bc2d779..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_call_stack.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_call_stack.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-
-void InsertCallStackInfo(const std::string &type, const AttributeMap &attrs,
-                         platform::EnforceNotMet *exception) {
-  if (attrs.count("sub_block") != 0) {
-    return;
-  }
-  auto &callstack = boost::get<std::vector<std::string>>(
-      attrs.at(OpProtoAndCheckerMaker::OpCreationCallstackAttrName()));
-
-  if (callstack.empty()) {
-    return;
-  }
-  std::ostringstream sout;
-  sout << "Invoke operator " << type << " error.\n";
-  sout << "Python Call stacks: \n";
-  for (auto &line : callstack) {
-    sout << line;
-  }
-  sout << "C++ Call stacks: \n";
-  sout << exception->err_str_;
-  exception->err_str_ = sout.str();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_call_stack.h b/paddle/fluid/framework/op_call_stack.h
deleted file mode 100644
index 4408601abf0b3542c9850f9264d162faaa6a50ce..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_call_stack.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/type_defs.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-void InsertCallStackInfo(const std::string &type, const AttributeMap &attrs,
-                         platform::EnforceNotMet *exception);
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc
deleted file mode 100644
index bf2f85e6f817100fb405f14ceafcffebd63e462e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_compatible_info.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_compatible_info.h"
-#include <iostream>
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/string/string_helper.h"
-
-namespace paddle {
-namespace framework {
-
-inline std::vector<int> ConvertStr2Int(const std::string& str_text) {
-  auto vec_text = string::split_string<std::string>(str_text, ".");
-  PADDLE_ENFORCE((vec_text.size() == 2 || vec_text.size() == 3),
-                 "Input[%s] is not a right version format [1.6 or 1.6.0]",
-                 str_text);
-
-  std::vector<int> vec_res;
-  vec_res.reserve(3);
-  for (auto& val : vec_text) {
-    vec_res.emplace_back(atoi(val.c_str()));
-  }
-
-  if (vec_res.size() == 2) {
-    vec_res.emplace_back(0);
-  }
-
-  return vec_res;
-}
-
-/* first version >= second version return true */
-
-inline bool CompareVersion(const std::string& str_first,
-                           const std::string& str_second) {
-  auto vec_first_version = ConvertStr2Int(str_first);
-  auto vec_second_version = ConvertStr2Int(str_second);
-
-  // first version id
-  PADDLE_ENFORCE_EQ(
-      vec_first_version.size(), vec_second_version.size(),
-      "version information size not equal, first is [%d] second is [%d]",
-      vec_first_version.size(), vec_second_version.size());
-
-  for (size_t i = 0; i < vec_first_version.size() - 1; ++i) {
-    if (vec_first_version[i] != vec_second_version[i]) {
-      return vec_first_version[i] > vec_second_version[i];
-    }
-  }
-  return vec_first_version[2] >= vec_second_version[2];
-}
-
-void OpCompatibleMap::InitOpCompatibleMap() {
-  op_compatible_map_["sequence_pad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
-  op_compatible_map_["sequence_unpad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
-
-  op_compatible_map_["reshape2"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
-  op_compatible_map_["slice"] = {"1.6.0", OpCompatibleType::possible};
-  op_compatible_map_["expand"] = {"1.6.0", OpCompatibleType::possible};
-
-  op_compatible_map_["layer_norm"] = {"1.6.0", OpCompatibleType::bug_fix};
-}
-
-CompatibleInfo OpCompatibleMap::GetOpCompatibleInfo(std::string op_name) {
-  auto it = op_compatible_map_.find(op_name);
-  if (it != op_compatible_map_.end()) {
-    return it->second;
-  } else {
-    return {default_required_version_, OpCompatibleType::DEFIN_NOT};
-  }
-}
-
-OpCompatibleType OpCompatibleMap::IsRequireMiniVersion(
-    std::string op_name, std::string str_current_version) {
-  auto it = op_compatible_map_.find(op_name);
-  if (it != op_compatible_map_.end()) {
-    if (CompareVersion(str_current_version, it->second.required_version_)) {
-      return OpCompatibleType::compatible;
-    } else {
-      return it->second.compatible_type_;
-    }
-
-  } else {
-    if (CompareVersion(str_current_version, default_required_version_)) {
-      return OpCompatibleType::compatible;
-    } else {
-      return OpCompatibleType::DEFIN_NOT;
-    }
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_compatible_info.h b/paddle/fluid/framework/op_compatible_info.h
deleted file mode 100644
index 03d47c82d9c75ec330bb22a2bb48db9acfe93f9a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_compatible_info.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <map>
-#include <string>
-
-#pragma once
-
-namespace paddle {
-namespace framework {
-
-enum class OpCompatibleType {
-  compatible = 0,       //   support previous version
-  DEFIN_NOT = 1,        //   definitely can't support previous version
-  possible = 2,         //   possible can support previous version, not sure
-  bug_fix = 3,          //   bug fix, can't support previous version
-  precision_change = 4  //   precision change, may cause difference
-};
-
-struct CompatibleInfo {
-  CompatibleInfo(std::string required_version, OpCompatibleType compatible_type)
-      : required_version_(required_version),
-        compatible_type_(compatible_type) {}
-  CompatibleInfo() {}
-
-  // op required version, previous version not support
-  std::string required_version_;
-  OpCompatibleType compatible_type_;
-};
-
-class OpCompatibleMap {
- public:
-  OpCompatibleMap() : default_required_version_("1.5.0") {}
-  void InitOpCompatibleMap();
-
-  CompatibleInfo GetOpCompatibleInfo(std::string op_name);
-
-  /* IsRequireMiniVersion
-   *  return type OpCompatibleType */
-
-  OpCompatibleType IsRequireMiniVersion(std::string op_name,
-                                        std::string current_version);
-
-  void SerializeToStr(std::string& str) {} /* NOLINT */
-  void UnSerialize(const std::string& str) {}
-
-  const std::string& GetDefaultRequiredVersion() {
-    return default_required_version_;
-  }
-
- private:
-  std::map<std::string, CompatibleInfo> op_compatible_map_;
-
-  std::string default_required_version_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_compatible_info_test.cc b/paddle/fluid/framework/op_compatible_info_test.cc
deleted file mode 100644
index 2a50a8302b384694cf6fbbdfeb1d8c5ee846863a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_compatible_info_test.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_compatible_info.h"
-#include <iostream>
-#include "gtest/gtest.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace framework {
-TEST(test_op_compatible_info, test_op_compatible) {
-  auto comp_map = OpCompatibleMap();
-  comp_map.InitOpCompatibleMap();
-
-  auto default_req_version = comp_map.GetDefaultRequiredVersion();
-
-  auto seq_pad = comp_map.GetOpCompatibleInfo("sequence_pad");
-  auto reshape = comp_map.GetOpCompatibleInfo("reshape");
-  auto layer_norm = comp_map.GetOpCompatibleInfo("layer_norm");
-
-  auto deafult_info = comp_map.GetOpCompatibleInfo("layer_xx");
-
-  auto comp_1 = comp_map.IsRequireMiniVersion("sequence_pad", "1.5.0");
-  ASSERT_EQ(comp_1, OpCompatibleType::DEFIN_NOT);
-  auto comp_2 = comp_map.IsRequireMiniVersion("sequence_pad", "1.6.0");
-  ASSERT_EQ(comp_2, OpCompatibleType::compatible);
-  auto comp_3 = comp_map.IsRequireMiniVersion("sequence_pad", "1.6.1");
-  ASSERT_EQ(comp_3, OpCompatibleType::compatible);
-  auto comp_6 = comp_map.IsRequireMiniVersion("sequence_pad", "1.7.0");
-  ASSERT_EQ(comp_6, OpCompatibleType::compatible);
-  auto comp_7 = comp_map.IsRequireMiniVersion("sequence_pad", "0.7.0");
-  ASSERT_EQ(comp_7, OpCompatibleType::DEFIN_NOT);
-  auto comp_8 = comp_map.IsRequireMiniVersion("sequence_pad", "2.0.0");
-  ASSERT_EQ(comp_8, OpCompatibleType::compatible);
-
-  ASSERT_EQ(comp_map.IsRequireMiniVersion("unkop", "2.0.0"),
-            OpCompatibleType::compatible);
-  ASSERT_EQ(comp_map.IsRequireMiniVersion("unkop", "0.7.0"),
-            OpCompatibleType::DEFIN_NOT);
-
-  ASSERT_EQ(comp_map.IsRequireMiniVersion("slice", "0.7.0"),
-            OpCompatibleType::possible);
-  ASSERT_EQ(comp_map.IsRequireMiniVersion("slice", "1.6.0"),
-            OpCompatibleType::compatible);
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
deleted file mode 100644
index a36e3605270dae7dfb7039891d5bbc410edefd2a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_desc.cc
+++ /dev/null
@@ -1,831 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_desc.h"
-#include <algorithm>
-#include <functional>
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_call_stack.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/shape_inference.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-
-namespace paddle {
-namespace framework {
-
-class OpDesc;
-class BlockDesc;
-class CompileTimeInferShapeContext : public InferShapeContext {
- public:
-  CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block);
-
-  bool HasInput(const std::string &name) const override;
-
-  bool HasOutput(const std::string &name) const override;
-
-  bool HasInputs(const std::string &name) const override;
-
-  bool HasOutputs(const std::string &name) const override;
-
-  AttrReader Attrs() const override;
-
-  const std::vector<std::string> &Inputs(
-      const std::string &name) const override;
-
-  const std::vector<std::string> &Outputs(
-      const std::string &name) const override;
-
-  void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
-                size_t j = 0) override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    const std::string &input_n = Inputs(in)[i];
-    const std::string &output_n = Outputs(out)[j];
-
-    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
-                   in, i);
-    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
-
-    auto *in_var = block_.FindVarRecursive(input_n);
-    auto *out_var = block_.FindVarRecursive(output_n);
-
-    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
-                   "The type of %s and %s is not the same.", input_n, output_n);
-
-    SetDim(output_n, GetDim(input_n));
-  }
-
-  void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
-                size_t j = 0) const override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", in, i);
-    PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
-    auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
-    auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
-    if (in_var->GetType() != proto::VarType::LOD_TENSOR &&
-        in_var->GetType() != proto::VarType::LOD_TENSOR_ARRAY) {
-      VLOG(3) << "input " << in << " is not LodTensor or LodTensorArray.";
-      return;
-    }
-    out_var->SetLoDLevel(in_var->GetLoDLevel());
-  }
-
-  void DecreaseLoDLevel(const std::string &in, const std::string &out,
-                        size_t i = 0, size_t j = 0) const override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", in, i);
-    PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
-    auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
-    auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
-    PADDLE_ENFORCE(out_var->GetType() == proto::VarType::LOD_TENSOR_ARRAY ||
-                       out_var->GetType() == proto::VarType::LOD_TENSOR,
-                   "The input %s should be LodTensorArray or LodTensor.",
-                   out_var->Name());
-    PADDLE_ENFORCE(in_var->GetType() == proto::VarType::LOD_TENSOR,
-                   "The input %s should be LodTensor.", in_var->Name());
-    if (in_var->GetLoDLevel() > 0) {
-      out_var->SetLoDLevel(in_var->GetLoDLevel() - 1);
-    }
-  }
-
-  std::vector<InferShapeVarPtr> GetInputVarPtrs(
-      const std::string &name) override {
-    const std::vector<std::string> arg_names = Inputs(name);
-    std::vector<InferShapeVarPtr> res;
-    res.reserve(arg_names.size());
-    std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res),
-                   [this](const std::string &name) {
-                     return block_.FindVarRecursive(name);
-                   });
-    return res;
-  }
-
-  std::vector<InferShapeVarPtr> GetOutputVarPtrs(
-      const std::string &name) override {
-    const std::vector<std::string> arg_names = Outputs(name);
-    std::vector<InferShapeVarPtr> res;
-    res.reserve(arg_names.size());
-    std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res),
-                   [this](const std::string &name) {
-                     return block_.FindVarRecursive(name);
-                   });
-    return res;
-  }
-
-  DDim GetInputDim(const std::string &name) const override {
-    const std::vector<std::string> &arg_names = Inputs(name);
-    PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Input(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
-    return this->GetDim(arg_names[0]);
-  }
-
-  std::vector<DDim> GetInputsDim(const std::string &name) const override {
-    const std::vector<std::string> &arg_names = Inputs(name);
-    return GetDims(arg_names);
-  }
-
-  bool IsRuntime() const override;
-
-  std::vector<proto::VarType::Type> GetInputsVarType(
-      const std::string &name) const override {
-    return GetVarTypes(Inputs(name));
-  }
-
-  std::vector<proto::VarType::Type> GetOutputsVarType(
-      const std::string &name) const override {
-    return GetVarTypes(Outputs(name));
-  }
-
-  void SetOutputDim(const std::string &name, const DDim &dim) override {
-    auto &arg_names = Outputs(name);
-    PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Output(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
-    SetDim(arg_names[0], dim);
-  }
-
-  void SetOutputsDim(const std::string &name,
-                     const std::vector<DDim> &dims) override {
-    auto &names = Outputs(name);
-    SetDims(names, dims);
-  }
-
- protected:
-  std::vector<proto::VarType::Type> GetVarTypes(
-      const std::vector<std::string> &names) const {
-    std::vector<proto::VarType::Type> retv;
-    retv.resize(names.size());
-    std::transform(
-        names.begin(), names.end(), retv.begin(),
-        std::bind(std::mem_fn(&CompileTimeInferShapeContext::GetVarType), this,
-                  std::placeholders::_1));
-    return retv;
-  }
-
-  proto::VarType::Type GetVarType(const std::string &name) const;
-
-  DDim GetDim(const std::string &name) const {
-    auto var = block_.FindVarRecursive(name);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
-    DDim res;
-    try {
-      auto shape = var->GetShape();
-      res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
-    } catch (...) {
-      VLOG(5) << "GetDim of variable " << name << " error";
-      std::rethrow_exception(std::current_exception());
-    }
-    return res;
-  }
-
-  std::vector<DDim> GetDims(const std::vector<std::string> &names) const {
-    std::vector<DDim> ret;
-    ret.reserve(names.size());
-    std::transform(
-        names.begin(), names.end(), std::back_inserter(ret),
-        [this](const std::string &name) { return this->GetDim(name); });
-    return ret;
-  }
-
-  void SetDim(const std::string &name, const DDim &dim);
-
-  void SetDims(const std::vector<std::string> &names,
-               const std::vector<DDim> &dims) {
-    size_t length = names.size();
-    PADDLE_ENFORCE_EQ(length, dims.size());
-    for (size_t i = 0; i < length; ++i) {
-      if (names[i] == framework::kEmptyVarName) {
-        continue;
-      }
-      SetDim(names[i], dims[i]);
-    }
-  }
-
-  std::vector<DDim> GetRepeatedDims(const std::string &name) const override;
-
-  void SetRepeatedDims(const std::string &name,
-                       const std::vector<DDim> &dims) override;
-
-  const OpDesc &op_;
-  const BlockDesc &block_;
-};
-
-OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
-               const VariableNameMap &outputs, const AttributeMap &attrs) {
-  desc_.set_type(type);
-  inputs_ = inputs;
-  outputs_ = outputs;
-  attrs_ = attrs;
-  need_update_ = true;
-  block_ = nullptr;
-}
-
-OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) {
-  CopyFrom(other);
-  block_ = block;
-  need_update_ = true;
-}
-
-void OpDesc::CopyFrom(const OpDesc &op_desc) {
-  desc_.set_type(op_desc.Type());
-  inputs_ = op_desc.inputs_;
-  outputs_ = op_desc.outputs_;
-  attrs_ = op_desc.attrs_;
-  need_update_ = true;
-}
-
-OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
-    : desc_(desc), need_update_(false) {
-  // restore inputs_
-  int input_size = desc_.inputs_size();
-  for (int i = 0; i < input_size; ++i) {
-    const proto::OpDesc::Var &var = desc_.inputs(i);
-    std::vector<std::string> &args = inputs_[var.parameter()];
-    int argu_size = var.arguments_size();
-    args.reserve(argu_size);
-    for (int j = 0; j < argu_size; ++j) {
-      args.push_back(var.arguments(j));
-    }
-  }
-  // restore outputs_
-  int output_size = desc_.outputs_size();
-  for (int i = 0; i < output_size; ++i) {
-    const proto::OpDesc::Var &var = desc_.outputs(i);
-    std::vector<std::string> &args = outputs_[var.parameter()];
-    int argu_size = var.arguments_size();
-    args.reserve(argu_size);
-    for (int j = 0; j < argu_size; ++j) {
-      args.push_back(var.arguments(j));
-    }
-  }
-  // restore attrs_
-  for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
-    std::string attr_name = attr.name();
-    // The sub_block referred to by the BLOCK attr hasn't been added
-    // to ProgramDesc class yet, we skip setting BLOCK/BLOCKS attr here.
-    if (attr.type() != proto::AttrType::BLOCK &&
-        attr.type() != proto::AttrType::BLOCKS) {
-      attrs_[attr_name] = GetAttrValue(attr);
-    }
-  }
-  this->block_ = block;
-}
-
-proto::OpDesc *OpDesc::Proto() {
-  Flush();
-  return &desc_;
-}
-
-const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
-  auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
-                 Type());
-  return it->second;
-}
-
-std::vector<std::string> OpDesc::InputArgumentNames() const {
-  std::vector<std::string> retv;
-  for (auto &ipt : this->inputs_) {
-    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
-  }
-  return retv;
-}
-
-void OpDesc::SetInput(const std::string &param_name,
-                      const std::vector<std::string> &args) {
-  need_update_ = true;
-  inputs_[param_name] = args;
-}
-
-const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
-  auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
-                 name, Type());
-  return it->second;
-}
-
-std::vector<std::string> OpDesc::OutputArgumentNames() const {
-  std::vector<std::string> retv;
-  for (auto &ipt : this->outputs_) {
-    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
-  }
-  return retv;
-}
-
-void OpDesc::SetOutput(const std::string &param_name,
-                       const std::vector<std::string> &args) {
-  need_update_ = true;
-  this->outputs_[param_name] = args;
-}
-
-bool OpDesc::HasProtoAttr(const std::string &name) const {
-  auto &op_info = OpInfoMap::Instance();
-  if (op_info.Has(desc_.type())) {
-    auto op_info_ptr = op_info.Get(desc_.type());
-    if (op_info_ptr.HasOpProtoAndChecker()) {
-      const proto::OpProto &proto = op_info_ptr.Proto();
-      for (int i = 0; i != proto.attrs_size(); ++i) {
-        const proto::OpProto::Attr &attr = proto.attrs(i);
-        if (attr.name() == name) {
-          return true;
-        }
-      }
-    }
-  }
-  return false;
-}
-
-proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
-  auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-  return static_cast<proto::AttrType>(it->second.which() - 1);
-}
-
-std::vector<std::string> OpDesc::AttrNames() const {
-  std::vector<std::string> retv;
-  retv.reserve(attrs_.size());
-  for (auto &attr : attrs_) {
-    retv.push_back(attr.first);
-  }
-  return retv;
-}
-
-void OpDesc::RemoveAttr(const std::string &name) {
-  attrs_.erase(name);
-  need_update_ = true;
-}
-
-void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
-  // NOTICE(minqiyang): pybind11 will take the empty list in python as
-  // the std::vector<int> type in C++; so we have to change the attr's type
-  // here if we meet this issue
-  proto::AttrType attr_type = static_cast<proto::AttrType>(v.which() - 1);
-  if (attr_type == proto::AttrType::INTS &&
-      boost::get<std::vector<int>>(v).size() == 0u) {
-    // Find current attr via attr name and set the correct attribute value
-    const proto::OpProto::Attr &attr = GetProtoAttr(name);
-    switch (attr.type()) {
-      case proto::AttrType::BOOLEANS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to BOOLEANS";
-        this->attrs_[name] = std::vector<bool>();
-        break;
-      }
-      case proto::AttrType::INTS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to INTS";
-        this->attrs_[name] = std::vector<int>();
-        break;
-      }
-      case proto::AttrType::LONGS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from LONGS to LONGS";
-        this->attrs_[name] = std::vector<int64_t>();
-        break;
-      }
-      case proto::AttrType::FLOATS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to FLOATS";
-        this->attrs_[name] = std::vector<float>();
-        break;
-      }
-      case proto::AttrType::STRINGS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to STRINGS";
-        this->attrs_[name] = std::vector<std::string>();
-        break;
-      }
-      case proto::AttrType::BLOCKS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to BLOCKS";
-        this->SetBlocksAttr(name, std::vector<BlockDesc *>());
-        return;
-      }
-      default:
-        PADDLE_THROW("Wrong attr type %d", attr.type());
-    }
-    need_update_ = true;
-    return;
-  }
-
-  this->attrs_[name] = v;
-  need_update_ = true;
-}
-
-void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) {
-  this->attrs_[name] = block;
-  need_update_ = true;
-}
-
-void OpDesc::SetBlocksAttr(const std::string &name,
-                           std::vector<BlockDesc *> blocks) {
-  this->attrs_[name] = blocks;
-  need_update_ = true;
-}
-
-void OpDesc::SetAttrMap(
-    const std::unordered_map<std::string, Attribute> &attr_map) {
-  attrs_ = attr_map;
-  need_update_ = true;
-}
-
-Attribute OpDesc::GetAttr(const std::string &name) const {
-  auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-  return it->second;
-}
-
-const proto::OpProto::Attr &OpDesc::GetProtoAttr(
-    const std::string &name) const {
-  const proto::OpProto &proto = OpInfoMap::Instance().Get(Type()).Proto();
-  for (int i = 0; i != proto.attrs_size(); ++i) {
-    const proto::OpProto::Attr &attr = proto.attrs(i);
-    if (attr.name() == name) {
-      return attr;
-    }
-  }
-
-  PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
-}
-
-Attribute OpDesc::GetNullableAttr(const std::string &name) const {
-  auto it = attrs_.find(name);
-  if (it != attrs_.end()) {
-    return it->second;
-  } else {
-    return Attribute();
-  }
-}
-
-std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
-  auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-  auto blocks = boost::get<std::vector<BlockDesc *>>(it->second);
-
-  std::vector<int> ids;
-  for (auto n : blocks) {
-    ids.push_back(n->ID());
-  }
-
-  return ids;
-}
-
-int OpDesc::GetBlockAttrId(const std::string &name) const {
-  auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-  return boost::get<BlockDesc *>(it->second)->ID();
-}
-
-const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
-  return attrs_;
-}
-
-void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
-  RenameInput(old_name, new_name);
-  RenameOutput(old_name, new_name);
-  need_update_ = true;
-}
-
-void OpDesc::RenameOutput(const std::string &old_name,
-                          const std::string &new_name) {
-  for (auto &output : outputs_) {
-    std::replace(output.second.begin(), output.second.end(), old_name,
-                 new_name);
-  }
-
-  auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
-  if (it != attrs_.end()) {
-    auto &op_vars = boost::get<std::vector<std::string>>(it->second);
-    std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
-  }
-
-  need_update_ = true;
-}
-
-void OpDesc::RenameInput(const std::string &old_name,
-                         const std::string &new_name) {
-  for (auto &input : inputs_) {
-    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
-  }
-
-  auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
-  if (it != attrs_.end()) {
-    auto &op_vars = boost::get<std::vector<std::string>>(it->second);
-    std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
-  }
-
-  need_update_ = true;
-}
-
-struct SetAttrDescVisitor : public boost::static_visitor<void> {
-  explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {}
-  mutable proto::OpDesc::Attr *attr_;
-  void operator()(int v) const { attr_->set_i(v); }
-  void operator()(float v) const { attr_->set_f(v); }
-  void operator()(const std::string &v) const { attr_->set_s(v); }
-
-  // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162
-  template <class T,
-            class = typename std::enable_if<std::is_same<bool, T>::value>::type>
-  void operator()(T b) const {
-    attr_->set_b(b);
-  }
-
-  void operator()(const std::vector<int> &v) const {
-    VectorToRepeated(v, attr_->mutable_ints());
-  }
-  void operator()(const std::vector<float> &v) const {
-    VectorToRepeated(v, attr_->mutable_floats());
-  }
-  void operator()(const std::vector<std::string> &v) const {
-    VectorToRepeated(v, attr_->mutable_strings());
-  }
-  void operator()(const std::vector<bool> &v) const {
-    VectorToRepeated(v, attr_->mutable_bools());
-  }
-  void operator()(const std::vector<BlockDesc *> &v) const {
-    std::vector<int> blocks_idx;
-    for (auto blk : v) {
-      blocks_idx.push_back(blk->ID());
-    }
-    VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx());
-  }
-
-  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
-
-  void operator()(int64_t v) const { attr_->set_l(v); }
-
-  void operator()(const std::vector<int64_t> &v) const {
-    VectorToRepeated(v, attr_->mutable_longs());
-  }
-
-  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
-};
-
-void OpDesc::Flush() {
-  if (need_update_) {
-    this->desc_.mutable_inputs()->Clear();
-    for (auto &ipt : inputs_) {
-      auto *input = desc_.add_inputs();
-      input->set_parameter(ipt.first);
-      VectorToRepeated(ipt.second, input->mutable_arguments());
-    }
-
-    this->desc_.mutable_outputs()->Clear();
-    for (auto &opt : outputs_) {
-      auto *output = desc_.add_outputs();
-      output->set_parameter(opt.first);
-      VectorToRepeated(opt.second, output->mutable_arguments());
-    }
-
-    this->desc_.mutable_attrs()->Clear();
-    for (auto &attr : attrs_) {
-      auto *attr_desc = desc_.add_attrs();
-      attr_desc->set_name(attr.first);
-      attr_desc->set_type(
-          static_cast<proto::AttrType>(attr.second.which() - 1));
-      SetAttrDescVisitor visitor(attr_desc);
-      boost::apply_visitor(visitor, attr.second);
-    }
-
-    need_update_ = false;
-  }
-}
-
-static std::once_flag init_infer_shape_funcs;
-
-/**
- * NOTE(paddle-dev): Very tricky code here. Maybe we should find a
- * better way to register compile-time infershape method gentlely.
- *
- * Normally, we can register a class derived from InferShapeBase, so that
- * we can set the field of `infer_shape_` inside OpInfo when registering op.
- *
- * However, there is another way we can set the field of `infer_shape_` inside
- * OpInfo. Usually, we overload InferShape method of OperatorWithKernel. After
- * running the following method InitInferShapeFuncs, `infer_shape_` would be set
- * to be the InferShape method of OperatorWithKernel. That is to say, we borrow
- * the run-time InferShape method of OperatorWithKernel to be the compile-time
- * InferShape method.
- *
- * However, during compiling time, we may not know inputs, outputs and attrs of
- * run-time OperatorWithKernel. So the following code creates a fake
- * OperatorWithKernel object. That is why the field info_ of OperatorBase
- * would be null.
- */
-static void InitInferShapeFuncs() {
-  std::call_once(init_infer_shape_funcs, [] {
-    auto &map = OpInfoMap::Instance();
-    auto &info_map = *map.mutable_map();
-
-    for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) {
-      auto op_type = kern_pair.first;
-      auto it = info_map.find(op_type);
-      PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered",
-                     op_type);
-      auto &op_info = it->second;
-      if (op_info.infer_shape_) {  // infer_shape has been registered.
-        continue;
-      }
-
-      auto op = dynamic_cast<OperatorWithKernel *>(op_info.Creator()(
-          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
-
-      PADDLE_ENFORCE_NOT_NULL(
-          op, "InferShapeBase is not registered to Operator %s", op_type);
-
-      op_info.infer_shape_ = [op](InferShapeContext *ctx) {
-        op->InferShape(ctx);
-      };
-    }
-  });
-}
-
-void OpDesc::CheckAttrs() {
-  PADDLE_ENFORCE(!Type().empty(),
-                 "CheckAttr() can not be called before type is setted.");
-  auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
-  if (checker == nullptr) {
-    // checker is not configured. That operator could be generated by Paddle,
-    // not by users.
-    return;
-  }
-  VLOG(10) << "begin to check attribute of " << Type();
-  checker->Check(&attrs_);
-}
-
-void OpDesc::InferShape(const BlockDesc &block) const {
-  try {
-    VLOG(3) << "CompileTime infer shape on " << Type();
-    InitInferShapeFuncs();
-    auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
-    PADDLE_ENFORCE(static_cast<bool>(infer_shape),
-                   "%s's infer_shape has not been registered", this->Type());
-    CompileTimeInferShapeContext ctx(*this, block);
-    if (VLOG_IS_ON(10)) {
-      std::ostringstream sout;
-      auto inames = this->InputArgumentNames();
-      sout << " From [";
-      std::copy(inames.begin(), inames.end(),
-                std::ostream_iterator<std::string>(sout, ", "));
-      sout << "] to [";
-      auto onames = this->OutputArgumentNames();
-      std::copy(onames.begin(), onames.end(),
-                std::ostream_iterator<std::string>(sout, ", "));
-      sout << "]";
-      VLOG(10) << sout.str();
-    }
-    infer_shape(&ctx);
-  } catch (platform::EnforceNotMet exception) {
-    framework::InsertCallStackInfo(Type(), attrs_, &exception);
-    throw std::move(exception);
-  } catch (...) {
-    std::rethrow_exception(std::current_exception());
-  }
-}
-
-void OpDesc::InferVarType(BlockDesc *block) const {
-  // There are a few places that var type can be set.
-  // When VarDesc is created, default set to LOD_TENSOR.
-  // When output variable is created, default is defaut set to LOD_TENSOR.
-  // We limit here to be the only place that operator defines its customized
-  // var type inference. Hence, we don't do any "default" setting here.
-  auto &info = OpInfoMap::Instance().Get(this->Type());
-  if (info.infer_var_type_) {
-    InferVarTypeContext context(this, block);
-    info.infer_var_type_(&context);
-  }
-}
-
-CompileTimeInferShapeContext::CompileTimeInferShapeContext(
-    const OpDesc &op, const BlockDesc &block)
-    : op_(op), block_(block) {}
-
-bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
-  const std::vector<std::string> &input_names = op_.Input(name);
-  auto length = input_names.size();
-  if (length == 0) {
-    return false;
-  }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
-  return block_.HasVarRecursive(input_names[0]);
-}
-
-bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const {
-  const std::vector<std::string> &output_names = op_.Output(name);
-  auto length = output_names.size();
-  if (length == 0) {
-    return false;
-  }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Output(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
-  return block_.HasVarRecursive(output_names[0]);
-}
-
-bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const {
-  const std::vector<std::string> &input_names = op_.Input(name);
-  if (input_names.empty()) {
-    return false;
-  }
-  for (auto &input : input_names) {
-    if (!block_.HasVarRecursive(input)) return false;
-  }
-  return true;
-}
-
-bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
-  const std::vector<std::string> &output_names = op_.Output(name);
-  if (output_names.empty()) {
-    return false;
-  }
-  for (auto &output : output_names) {
-    if (!block_.HasVarRecursive(output)) return false;
-  }
-  return true;
-}
-
-AttrReader CompileTimeInferShapeContext::Attrs() const {
-  return AttrReader(op_.GetAttrMap());
-}
-
-const std::vector<std::string> &CompileTimeInferShapeContext::Inputs(
-    const std::string &name) const {
-  return op_.Input(name);
-}
-
-const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
-    const std::string &name) const {
-  return op_.Output(name);
-}
-
-std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
-    const std::string &name) const {
-  auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
-  std::vector<DDim> res;
-  try {
-    auto shapes = var->GetShapes();
-    for (const auto &s : shapes) {
-      res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
-    }
-  } catch (...) {
-    VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
-    std::rethrow_exception(std::current_exception());
-  }
-  return res;
-}
-
-void CompileTimeInferShapeContext::SetDim(const std::string &name,
-                                          const DDim &dim) {
-  block_.FindVarRecursive(name)->SetShape(vectorize(dim));
-}
-
-void CompileTimeInferShapeContext::SetRepeatedDims(
-    const std::string &name, const std::vector<DDim> &dims) {
-  auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
-  std::vector<std::vector<int64_t>> dim_vec(dims.size());
-  std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize<>);
-  var->SetShapes(dim_vec);
-}
-
-bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
-
-proto::VarType::Type CompileTimeInferShapeContext::GetVarType(
-    const std::string &name) const {
-  return block_.FindVarRecursive(name)->GetType();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
deleted file mode 100644
index 2f6fb9e298440e0aaac79d0dc5ad1e7d1aed6990..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_desc.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/type_defs.h"
-#include "paddle/fluid/framework/var_desc.h"
-
-namespace paddle {
-namespace framework {
-
-class BlockDesc;
-class ProgramDesc;
-class OpDesc {
- public:
-  OpDesc() {}
-
-  OpDesc(const std::string &type, const VariableNameMap &inputs,
-         const VariableNameMap &outputs, const AttributeMap &attrs);
-
-  OpDesc(const proto::OpDesc &desc, BlockDesc *block);
-
-  explicit OpDesc(BlockDesc *block) : block_(block) {}
-
-  OpDesc(const OpDesc &other, BlockDesc *block);
-
-  void CopyFrom(const OpDesc &op_desc);
-
-  proto::OpDesc *Proto();
-
-  std::string Type() const { return desc_.type(); }
-
-  void SetType(const std::string &type) { desc_.set_type(type); }
-
-  const std::vector<std::string> &Input(const std::string &name) const;
-
-  std::vector<std::string> InputArgumentNames() const;
-
-  void SetInput(const std::string &param_name,
-                const std::vector<std::string> &args);
-
-  const std::vector<std::string> &Output(const std::string &name) const;
-
-  std::vector<std::string> OutputArgumentNames() const;
-
-  void SetOutput(const std::string &param_name,
-                 const std::vector<std::string> &args);
-
-  bool HasAttr(const std::string &name) const {
-    return attrs_.find(name) != attrs_.end();
-  }
-
-  bool HasProtoAttr(const std::string &name) const;
-
-  proto::AttrType GetAttrType(const std::string &name) const;
-
-  std::vector<std::string> AttrNames() const;
-
-  void SetAttr(const std::string &name, const Attribute &v);
-  void RemoveAttr(const std::string &name);
-
-  void SetBlockAttr(const std::string &name, BlockDesc *block);
-
-  void SetBlocksAttr(const std::string &name, std::vector<BlockDesc *> blocks);
-
-  Attribute GetAttr(const std::string &name) const;
-
-  template <typename T>
-  T GetAttrIfExists(const std::string &name) const {
-    T result{};
-    if (HasAttr(name)) {
-      result = boost::get<T>(GetAttr(name));
-    }
-    return result;
-  }
-
-  const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const;
-
-  Attribute GetNullableAttr(const std::string &name) const;
-
-  int GetBlockAttrId(const std::string &name) const;
-
-  std::vector<int> GetBlocksAttrIds(const std::string &name) const;
-
-  void Rename(const std::string &old_name, const std::string &new_name);
-
-  void RenameOutput(const std::string &old_name, const std::string &new_name);
-
-  void RenameInput(const std::string &old_name, const std::string &new_name);
-
-  // Only be used in C++
-  const AttributeMap &GetAttrMap() const;
-
-  // Only be used in C++
-  void SetAttrMap(const AttributeMap &attr_map);
-
-  std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
-  std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
-
-  const VariableNameMap &Inputs() const { return inputs_; }
-
-  const VariableNameMap &Outputs() const { return outputs_; }
-
-  AttributeMap *MutableAttrMap() {
-    this->need_update_ = true;
-    return &this->attrs_;
-  }
-
-  void CheckAttrs();
-
-  void InferShape(const BlockDesc &block) const;
-
-  void InferVarType(BlockDesc *block) const;
-
-  void SetIsTarget(bool is_target) { desc_.set_is_target(is_target); }
-
-  void Flush();
-
-  BlockDesc *Block() { return this->block_; }
-
-  const BlockDesc *Block() const { return this->block_; }
-
- private:
-  template <typename MapType>
-  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
-    std::vector<typename MapType::key_type> ret_val;
-    ret_val.reserve(map.size());
-    std::transform(
-        map.begin(), map.end(), std::back_inserter(ret_val),
-        [](const typename MapType::value_type &pair) { return pair.first; });
-    return ret_val;
-  }
-
-  proto::OpDesc desc_;
-  BlockDesc *block_;  // not_own
-  // input arg name => input variable names
-  VariableNameMap inputs_;
-  // output arg name => output variable names
-  VariableNameMap outputs_;
-  AttributeMap attrs_;
-
-  // need_update_ indicate there some local changes not be synchronized. If
-  // local changes should be synchronized, need_update_ should be set to true.
-  bool need_update_{false};
-};
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc
deleted file mode 100644
index c815e194d43e149f9efe0daec820c42e87f81d0c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_info.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_info.h"
-#include <set>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-
-// C++11 removes the need for manual locking. Concurrent execution shall wait if
-// a static local variable is already being initialized.
-// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
-OpInfoMap& OpInfoMap::Instance() {
-  static OpInfoMap g_op_info_map;
-  return g_op_info_map;
-}
-
-std::vector<std::string> OpInfoMap::GetUseDefaultGradOpDescMakerOps() const {
-  // Use set to sort op names
-  std::set<std::string> result_ops;
-  for (auto& pair : map_) {
-    if (pair.second.use_default_grad_op_desc_maker_) {
-      result_ops.insert(pair.first);
-    }
-  }
-  return std::vector<std::string>(result_ops.begin(), result_ops.end());
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
deleted file mode 100644
index 765ca361f61f78de73003e22e38796c39e12d2e5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_info.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <functional>
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/framework/type_defs.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace framework {
-
-class InferShapeBase {
- public:
-  virtual ~InferShapeBase() = default;
-  virtual void operator()(InferShapeContext*) const = 0;
-};
-
-struct OpInfo {
-  OpCreator creator_;
-  GradOpMakerFN grad_op_maker_;
-  proto::OpProto* proto_{nullptr};
-  OpAttrChecker* checker_{nullptr};
-  InferVarTypeFN infer_var_type_;
-  InferShapeFN infer_shape_;
-  InferInplaceOpFN infer_inplace_;
-  InferNoNeedBufferVarsFN infer_no_need_buffer_vars_;
-
-  // NOTE(zjl): this flag is added to check whether
-  // the grad maker is the default one.
-  bool use_default_grad_op_desc_maker_{false};
-
-  bool HasOpProtoAndChecker() const {
-    return proto_ != nullptr && checker_ != nullptr;
-  }
-
-  const proto::OpProto& Proto() const {
-    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator's Proto has not been registered");
-    PADDLE_ENFORCE(proto_->IsInitialized(),
-                   "Operator's Proto must be initialized in op info");
-    return *proto_;
-  }
-
-  const OpCreator& Creator() const {
-    PADDLE_ENFORCE_NOT_NULL(creator_,
-                            "Operator's Creator has not been registered");
-    return creator_;
-  }
-
-  const GradOpMakerFN& GradOpMaker() const {
-    // Normally, proto_ should not be null, except some special operators, such
-    // as LeaklyReluDoubleGrad op.
-    std::string type = proto_ ? proto_->type() : "unknown";
-    PADDLE_ENFORCE_NOT_NULL(
-        grad_op_maker_,
-        "Operator %s's GradOpMaker has not been "
-        "registered.\nPlease check whether %s_op has "
-        "grad_op.\nIf not, please set stop_gradient to True "
-        "for its input and output variables using var.stop_gradient=True.",
-        type.c_str(), type.c_str());
-    return grad_op_maker_;
-  }
-
-  // some op has no grad_op_maker, add check before use GradOpMaker()
-  bool HasGradOpMaker() const {
-    return grad_op_maker_ != nullptr ? true : false;
-  }
-
-  bool HasInferInplace() const {
-    return infer_inplace_ != nullptr ? true : false;
-  }
-
-  const OpAttrChecker* Checker() const { return checker_; }
-
-  const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const {
-    return infer_no_need_buffer_vars_;
-  }
-};
-
-class OpInfoMap {
- public:
-  static OpInfoMap& Instance();
-
-  bool Has(const std::string& op_type) const {
-    return map_.find(op_type) != map_.end();
-  }
-
-  void Insert(const std::string& type, const OpInfo& info) {
-    PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
-    map_.insert({type, info});
-  }
-
-  const OpInfo& Get(const std::string& type) const {
-    auto op_info_ptr = GetNullable(type);
-    PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered",
-                            type);
-    return *op_info_ptr;
-  }
-
-  const OpInfo* GetNullable(const std::string& type) const {
-    auto it = map_.find(type);
-    if (it == map_.end()) {
-      return nullptr;
-    } else {
-      return &it->second;
-    }
-  }
-
-  const std::unordered_map<std::string, OpInfo>& map() const { return map_; }
-
-  std::unordered_map<std::string, OpInfo>* mutable_map() { return &map_; }
-
-  std::vector<std::string> GetUseDefaultGradOpDescMakerOps() const;
-
- private:
-  OpInfoMap() = default;
-  std::unordered_map<std::string, OpInfo> map_;
-
-  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc
deleted file mode 100644
index 6d4801e4a0eed7083e671e1d49b8628dfb280cf9..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_kernel_type.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_kernel_type.h"
-
-namespace paddle {
-namespace framework {
-
-size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
-  int cur_loc = 0;
-
-  int place = key.place_.which();
-  cur_loc += OpKernelType::kPlaceBits;
-
-  int data_type = static_cast<int>(key.data_type_) << cur_loc;
-  cur_loc += OpKernelType::kPrimaryDTypeBits;
-
-  int data_layout = static_cast<int>(key.data_layout_) << cur_loc;
-  cur_loc += OpKernelType::kLayoutBits;
-
-  int library_type = static_cast<int>(key.library_type_) << cur_loc;
-  cur_loc += OpKernelType::kLibBits;
-
-  int customized_value = key.customized_type_value_;
-  PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits));
-  customized_value = customized_value << cur_loc;
-  cur_loc += OpKernelType::kCustomizeBits;
-  PADDLE_ENFORCE(cur_loc < 64);
-
-  std::hash<int> hasher;
-  return hasher(place + data_type + data_layout + library_type +
-                customized_value);
-}
-
-bool OpKernelType::operator==(const OpKernelType& o) const {
-  return platform::places_are_same_class(place_, o.place_) &&
-         data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
-         library_type_ == o.library_type_ &&
-         customized_type_value_ == o.customized_type_value_;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
deleted file mode 100644
index 9edc1a3e150027b5a3dbd8483dc8b58d1d4ab918..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_kernel_type.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/library_type.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-class OpKernelType {
- public:
-  constexpr static int kDefaultCustomizedTypeValue = 0;
-
-  // In total should be smaller than 64.
-  constexpr static int kPlaceBits = 4;
-  constexpr static int kPrimaryDTypeBits = 8;
-  constexpr static int kLayoutBits = 4;
-  constexpr static int kLibBits = 4;
-  constexpr static int kCustomizeBits = 4;
-
-  OpKernelType(proto::VarType::Type data_type, platform::Place place,
-               DataLayout data_layout = DataLayout::kAnyLayout,
-               LibraryType library_type = LibraryType::kPlain,
-               int customized_type_value = kDefaultCustomizedTypeValue)
-      : data_type_(data_type),
-        data_layout_(data_layout),
-        place_(place),
-        library_type_(library_type),
-        customized_type_value_(customized_type_value) {}
-
-  OpKernelType(proto::VarType::Type data_type,
-               const platform::DeviceContext& dev_ctx,
-               DataLayout data_layout = DataLayout::kAnyLayout,
-               LibraryType library_type = LibraryType::kPlain,
-               int customized_type_value = kDefaultCustomizedTypeValue)
-      : data_type_(data_type),
-        data_layout_(data_layout),
-        place_(dev_ctx.GetPlace()),
-        library_type_(library_type),
-        customized_type_value_(customized_type_value) {}
-
-  virtual ~OpKernelType() {}
-
-  struct Hash {
-    size_t operator()(const OpKernelType& key) const;
-  };
-
-  size_t hash_key() const { return Hash()(*this); }
-
-  bool operator==(const OpKernelType& o) const;
-
-  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
-
-  proto::VarType::Type data_type_;
-  DataLayout data_layout_;
-  platform::Place place_;
-  LibraryType library_type_;
-  int customized_type_value_;
-};
-
-inline std::ostream& operator<<(std::ostream& os,
-                                const OpKernelType& kernel_key) {
-  os << "data_type[" << kernel_key.data_type_ << "]:data_layout["
-     << kernel_key.data_layout_ << "]:place[" << kernel_key.place_
-     << "]:library_type[" << kernel_key.library_type_ << "]";
-  return os;
-}
-
-inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
-  std::ostringstream stream;
-  stream << kernel_key;
-  return stream.str();
-}
-
-inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
-  bool ret =
-      (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r);
-#ifdef PADDLE_WITH_MKLDNN
-  // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa
-  ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN);
-  ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN);
-#endif
-  return ret;
-}
-
-inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) {
-  return (!platform::places_are_same_class(l.place_, r.place_)) ||
-         (l.data_type_ != r.data_type_) ||
-         NeedTransformLayout(l.data_layout_, r.data_layout_);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc
deleted file mode 100644
index 40db85400d2c8776b82ce0fa2fb4deed993b0255..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_kernel_type.h"
-#include <gtest/gtest.h>
-#include <iostream>
-
-TEST(OpKernelType, ToString) {
-  using OpKernelType = paddle::framework::OpKernelType;
-  using DataType = paddle::framework::proto::VarType;
-  using CPUPlace = paddle::platform::CPUPlace;
-  using DataLayout = paddle::framework::DataLayout;
-  using LibraryType = paddle::framework::LibraryType;
-
-  OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
-                              LibraryType::kCUDNN);
-
-  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
-            "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
-            "CUDNN]");
-
-  using CUDAPlace = paddle::platform::CUDAPlace;
-  OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
-                               LibraryType::kCUDNN);
-  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
-            "data_type[::paddle::platform::float16]:data_layout[NCHW]:place["
-            "CUDAPlace(0)]:library_"
-            "type[CUDNN]");
-}
-
-TEST(OpKernelType, Hash) {
-  using OpKernelType = paddle::framework::OpKernelType;
-  using DataType = paddle::framework::proto::VarType;
-  using CPUPlace = paddle::platform::CPUPlace;
-  using CUDAPlace = paddle::platform::CUDAPlace;
-  using DataLayout = paddle::framework::DataLayout;
-  using LibraryType = paddle::framework::LibraryType;
-
-  OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
-                                LibraryType::kCUDNN);
-  OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW,
-                                LibraryType::kCUDNN);
-
-  OpKernelType::Hash hasher;
-  ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2));
-}
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
deleted file mode 100644
index b502ef7a7c61b2114248bab9ab2bbb2d1a90dd16..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-
-void OpProtoAndCheckerMaker::Validate() {
-  validated_ = true;
-  CheckNoDuplicatedInOutAttrs();
-}
-
-OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
-    const std::string& name, const std::string& comment) {
-  auto* input = proto_->add_inputs();
-  input->set_name(name);
-  input->set_comment(comment);
-  return OpProtoAndCheckerMaker::VariableBuilder{input};
-}
-
-OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
-    const std::string& name, const std::string& comment) {
-  auto* output = proto_->add_outputs();
-  output->set_name(name);
-  output->set_comment(comment);
-  return OpProtoAndCheckerMaker::VariableBuilder{output};
-}
-
-void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
-  std::unordered_set<std::string> names;
-  auto checker = [&](const std::string& name) {
-    PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
-    names.insert(name);
-  };
-  for (auto& attr : proto_->attrs()) {
-    checker(attr.name());
-  }
-  for (auto& input : proto_->inputs()) {
-    checker(input.name());
-  }
-  for (auto& output : proto_->outputs()) {
-    checker(output.name());
-  }
-}
-
-void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
-                                        OpAttrChecker* attr_checker) {
-  proto_ = proto;
-  op_checker_ = attr_checker;
-  Make();
-
-  AddAttr<int>(OpRoleAttrName(), "The role of this operator")
-      .InEnum(
-          {static_cast<int>(OpRole::kForward),
-           static_cast<int>(OpRole::kBackward),
-           static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
-           static_cast<int>(OpRole::kDist), static_cast<int>(OpRole::kLRSched),
-           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
-           static_cast<int>(OpRole::kLoss) |
-               static_cast<int>(OpRole::kBackward),
-           static_cast<int>(OpRole::kOptimize) |
-               static_cast<int>(OpRole::kLRSched),
-           static_cast<int>(OpRole::kNotSpecified)})
-      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
-  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
-                                    "Optimized for variable")
-      .SetDefault({});
-
-  AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.")
-      .SetDefault("");
-
-  AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
-                                    "Callstack for Op Creatation.")
-      .SetDefault({});
-
-  Validate();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
deleted file mode 100644
index 5f3ce60e1d94ea4078cf0b709df362bad317f621..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_proto_maker.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/framework.pb.h"
-namespace paddle {
-namespace framework {
-
-//////////////////////////
-// Don't add more roles to make this too complicated!
-//////////////////////////
-enum class OpRole {
-  kForward = 0x0000,
-  kBackward = 0x0001,
-  kOptimize = 0x0002,
-  // RPC role is for send/recv related op
-  kRPC = 0x0004,
-  // Dist role is for split_byref/split_selected_rows/concat
-  // used for distributed training.
-  kDist = 0x0008,
-  // Tag all learning rate scheduler operators.
-  kLRSched = 0x0010,
-
-  kLoss = 0x0100,
-  // The default value of op's role. This should be only used for unittests and
-  // CreateOp inside a operator.
-  kNotSpecified = 0x1000,
-};
-
-// this class not only make proto but also init attribute checkers.
-class OpProtoAndCheckerMaker {
- public:
-  static const char *OpRoleAttrName() { return "op_role"; }
-  static const char *OpRoleVarAttrName() { return "op_role_var"; }
-  static const char *OpNamescopeAttrName() { return "op_namescope"; }
-  static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
-
-  void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
-
-  virtual void Make() = 0;
-
-  virtual ~OpProtoAndCheckerMaker() {
-    CHECK(validated_) << "should call Validate after build";
-  }
-
- protected:
-  struct VariableBuilder {
-    proto::OpProto::Var *var_;
-
-    VariableBuilder &AsDuplicable() {
-      var_->set_duplicable(true);
-      return *this;
-    }
-
-    VariableBuilder &AsIntermediate() {
-      var_->set_intermediate(true);
-      return *this;
-    }
-
-    VariableBuilder &AsDispensable() {
-      var_->set_dispensable(true);
-      return *this;
-    }
-  };
-
-  VariableBuilder AddInput(const std::string &name, const std::string &comment);
-
-  VariableBuilder AddOutput(const std::string &name,
-                            const std::string &comment);
-
-  template <typename T>
-  TypedAttrChecker<T> &AddAttr(const std::string &name,
-                               const std::string &comment,
-                               bool generated = false) {
-    auto *attr = proto_->add_attrs();
-    attr->set_name(name);
-    attr->set_comment(comment);
-    attr->set_generated(generated);
-    attr->set_type(AttrTypeID<T>());
-    return op_checker_->AddAttrChecker<T>(name);
-  }
-
-  void AddComment(const std::string &comment) { proto_->set_comment(comment); }
-
- private:
-  void CheckNoDuplicatedInOutAttrs();
-  void Validate();
-
-  proto::OpProto *proto_;
-  OpAttrChecker *op_checker_;
-  bool validated_{false};
-};
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
deleted file mode 100644
index a8030d377fdb4d4aef74b315e21792dad10fac96..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-#include "gtest/gtest.h"
-
-class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddAttr<float>("scale", "scale of test op");
-    AddAttr<float>("scale", "scale of test op");
-  }
-};
-
-TEST(ProtoMaker, DuplicatedAttr) {
-  paddle::framework::proto::OpProto op_proto;
-  paddle::framework::OpAttrChecker op_checker;
-  TestAttrProtoMaker proto_maker;
-  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
-               paddle::platform::EnforceNotMet);
-}
-
-class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("input", "input of test op");
-    AddInput("input", "input of test op");
-  }
-};
-
-TEST(ProtoMaker, DuplicatedInOut) {
-  paddle::framework::proto::OpProto op_proto;
-  paddle::framework::OpAttrChecker op_checker;
-  TestAttrProtoMaker proto_maker;
-  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
-               paddle::platform::EnforceNotMet);
-}
diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc
deleted file mode 100644
index 346d14d408ea1ed2cfbdbed5f48e56902e6e95b2..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_registry.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-#include <vector>
-
-namespace paddle {
-namespace framework {
-
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
-    const std::string& type, const VariableNameMap& inputs,
-    const VariableNameMap& outputs, AttributeMap attrs) {
-  auto& info = OpInfoMap::Instance().Get(type);
-  if (info.Checker() != nullptr) {
-    info.Checker()->Check(&attrs);
-  }
-  auto op = info.Creator()(type, inputs, outputs, attrs);
-  return std::unique_ptr<OperatorBase>(op);
-}
-
-static VariableNameMap ConvertOpDescVarsToVarNameMap(
-    const google::protobuf::RepeatedPtrField<proto::OpDesc::Var>&
-        op_desc_vars) {
-  VariableNameMap ret_val;
-  for (auto& var : op_desc_vars) {
-    auto& var_names = ret_val[var.parameter()];
-    auto& var_names_in_proto = var.arguments();
-    var_names.reserve(static_cast<size_t>(var_names_in_proto.size()));
-    std::copy(var_names_in_proto.begin(), var_names_in_proto.end(),
-              std::back_inserter(var_names));
-  }
-  return ret_val;
-}
-
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
-    const proto::OpDesc& op_desc) {
-  VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
-             "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
-             "instead.";
-  VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
-  VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
-  AttributeMap attrs;
-  for (auto& attr : op_desc.attrs()) {
-    attrs[attr.name()] = GetAttrValue(attr);
-  }
-
-  return CreateOp(op_desc.type(), inputs, outputs, attrs);
-}
-
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
-  return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
-                  op_desc.GetAttrMap());
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
deleted file mode 100644
index 3f14f47f0dddc0f203d03fcdcdb3213291ab6bdb..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_registry.h
+++ /dev/null
@@ -1,329 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <atomic>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <type_traits>
-#include <typeinfo>
-#include <unordered_map>
-#include <unordered_set>
-
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#include "glog/logging.h"               // For VLOG()
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/details/op_registry.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/grad_op_desc_maker.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/shape_inference.h"
-
-namespace paddle {
-namespace framework {
-
-class Registrar {
- public:
-  // In our design, various kinds of classes, e.g., operators and kernels,
-  // have their corresponding registry and registrar. The action of
-  // registration is in the constructor of a global registrar variable, which
-  // are not used in the code that calls package framework, and would
-  // be removed from the generated binary file by the linker. To avoid such
-  // removal, we add Touch to all registrar classes and make USE_OP macros to
-  // call this method. So, as long as the callee code calls USE_OP, the global
-  // registrar variable won't be removed by the linker.
-  void Touch() {}
-};
-
-template <typename... ARGS>
-struct OperatorRegistrar : public Registrar {
-  explicit OperatorRegistrar(const char* op_type) {
-    if (OpInfoMap::Instance().Has(op_type)) {
-      PADDLE_THROW("'%s' is registered more than once.", op_type);
-    }
-    static_assert(sizeof...(ARGS) != 0,
-                  "OperatorRegistrar should be invoked at least by OpClass");
-    OpInfo info;
-    details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info);
-    OpInfoMap::Instance().Insert(op_type, info);
-  }
-};
-
-class OpRegistry {
- public:
-  static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
-                                                const VariableNameMap& inputs,
-                                                const VariableNameMap& outputs,
-                                                AttributeMap attrs);
-
-  static std::unique_ptr<OperatorBase> CreateOp(const proto::OpDesc& op_desc);
-
-  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
-};
-
-template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
-struct OpKernelRegistrarFunctor;
-
-template <typename PlaceType, typename T, typename Func>
-inline void RegisterKernelClass(const char* op_type, const char* library_type,
-                                int customized_type_value, Func func) {
-  std::string library(library_type);
-  std::string data_layout = "ANYLAYOUT";
-  if (library == "MKLDNN") {
-    data_layout = "MKLDNNLAYOUT";
-  }
-  OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
-                   StringToDataLayout(data_layout),
-                   StringToLibraryType(library_type), customized_type_value);
-  OperatorWithKernel::AllOpKernels()[op_type][key] = func;
-}
-
-template <typename PlaceType, size_t I, typename... KernelTypes>
-struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
-  using KERNEL_TYPE =
-      typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
-
-  void operator()(const char* op_type, const char* library_type,
-                  int customized_type_value) const {
-    using T = typename KERNEL_TYPE::ELEMENT_TYPE;
-    RegisterKernelClass<PlaceType, T>(
-        op_type, library_type, customized_type_value,
-
-        [](const framework::ExecutionContext& ctx) {
-          KERNEL_TYPE().Compute(ctx);
-        });
-    constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
-    OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
-        func;
-    func(op_type, library_type, customized_type_value);
-  }
-};
-
-template <typename PlaceType, size_t I, typename... KernelType>
-struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
-  void operator()(const char* op_type, const char* library_type,
-                  int customized_type_value) const {}
-};
-
-// User can register many kernel in one place. The data type could be
-// different.
-template <typename PlaceType, typename... KernelType>
-class OpKernelRegistrar : public Registrar {
- public:
-  explicit OpKernelRegistrar(const char* op_type, const char* library_type,
-                             int customized_type_value) {
-    OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
-    func(op_type, library_type, customized_type_value);
-  }
-};
-
-template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
-struct OpKernelRegistrarFunctorEx;
-
-template <typename PlaceType, typename... DataTypeAndKernelType>
-class OpKernelRegistrarEx : public Registrar {
- public:
-  explicit OpKernelRegistrarEx(const char* op_type, const char* library_type,
-                               int customized_type_value) {
-    OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...>
-        func;
-    func(op_type, library_type, customized_type_value);
-  }
-};
-
-template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
-struct OpKernelRegistrarFunctorEx<PlaceType, true, I,
-                                  DataTypeAndKernelType...> {
-  void operator()(const char* op_type, const char* library_type,
-                  int customized_type_value) const {}
-};
-
-template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
-struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
-                                  DataTypeAndKernelType...> {
-  using Functor =
-      typename std::tuple_element<I + 1,
-                                  std::tuple<DataTypeAndKernelType...>>::type;
-  using T =
-      typename std::tuple_element<I,
-                                  std::tuple<DataTypeAndKernelType...>>::type;
-
-  void operator()(const char* op_type, const char* library_type,
-                  int customized_type_value) const {
-    RegisterKernelClass<PlaceType, T>(op_type, library_type,
-                                      customized_type_value, Functor());
-
-    constexpr auto size =
-        std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
-    OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2,
-                               DataTypeAndKernelType...>
-        func;
-    func(op_type, library_type, customized_type_value);
-  }
-};
-
-// clang-format off
-/**
- * check if MACRO is used in GLOBAL NAMESPACE.
- */
-#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                        \
-  struct __test_global_namespace_##uniq_name##__ {};                          \
-  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
-                             __test_global_namespace_##uniq_name##__>::value, \
-                msg)
-
-/*
-  The variadic arguments should be class types derived from one of the
-  following classes:
-    OpProtoAndCheckerMaker
-    GradOpDescMakerBase
-    VarTypeInference
-    InferShapeBase
-*/
-#define REGISTER_OPERATOR(op_type, op_class, ...)                        \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
-      __reg_op__##op_type,                                               \
-      "REGISTER_OPERATOR must be called in global namespace");           \
-  static ::paddle::framework::OperatorRegistrar<op_class, ##__VA_ARGS__> \
-      __op_registrar_##op_type##__(#op_type);                            \
-  int TouchOpRegistrar_##op_type() {                                     \
-    __op_registrar_##op_type##__.Touch();                                \
-    return 0;                                                            \
-  }
-
-#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
-  REGISTER_OPERATOR(op_type, op_class, op_maker_class, \
-                    paddle::framework::EmptyGradOpMaker)
-
-/**
- * Macro to register OperatorKernel.
- */
-#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type,             \
-                                            place_class, customized_name,      \
-                                            customized_type_value, ...)        \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                              \
-      __reg_op_kernel_##op_type##_##library_type##_##customized_name##__,      \
-                                 "REGISTER_OP_KERNEL must be called in "       \
-                                 "global namespace");                          \
-  static ::paddle::framework::OpKernelRegistrar<place_class,                   \
-                                                __VA_ARGS__>                   \
-      __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
-          #op_type, #library_type, customized_type_value);                     \
-  int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
-    __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__   \
-        .Touch();                                                              \
-    return 0;                                                                  \
-  }
-
-#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)   \
-  REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(                                \
-      op_type, library_type, place_class, DEFAULT_TYPE,               \
-      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
-      __VA_ARGS__)
-
-#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
-  REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
-
-#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
-  REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
-
-#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class,  \
-                              customized_name,                     \
-                              customized_type_value,               \
-                              ...)                                 \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                  \
-      __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \
-                                 "REGISTER_OP_KERNEL_EX must be called in "  \
-                                 "global namespace");  \
-  static ::paddle::framework::OpKernelRegistrarEx<place_class,  \
-                                                  __VA_ARGS__>  \
-      __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
-          #op_type, #library_type, customized_type_value);  \
-  int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
-    __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__   \
-        .Touch();                                                              \
-    return 0;                                                                  \
-  }
-
-#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...)                 \
-  REGISTER_OP_KERNEL_EX(                                              \
-      op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE,     \
-      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
-      __VA_ARGS__)
-
-#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...)                  \
-  REGISTER_OP_KERNEL_EX(                                              \
-      op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE,       \
-      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
-      __VA_ARGS__)
-
-/**
- * Macro to mark what Operator and Kernel
- * we will use and tell the compiler to
- * link them into target.
- */
-#define USE_OP_ITSELF(op_type)                             \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                          \
-      __use_op_itself_##op_type,                           \
-      "USE_OP_ITSELF must be called in global namespace"); \
-  extern int TouchOpRegistrar_##op_type();                 \
-  UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
-
-#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type,                     \
-                                              LIBRARY_TYPE,                \
-                                              customized_name)             \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##__,  \
-      "USE_OP_DEVICE_KERNEL must be in global namespace");                 \
-  extern int                                                               \
-      TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
-  UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
-      TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
-
-#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
-  USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, DEFAULT_TYPE)
-
-// TODO(fengjiayi): The following macros
-// seems ugly, do we have better method?
-
-#ifndef PADDLE_WITH_CUDA
-#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
-#else
-#define USE_OP_KERNEL(op_type)        \
-  USE_OP_DEVICE_KERNEL(op_type, CPU); \
-  USE_OP_DEVICE_KERNEL(op_type, CUDA)
-#endif
-
-#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
-
-#define USE_CPU_ONLY_OP(op_type) \
-  USE_OP_ITSELF(op_type);        \
-  USE_OP_DEVICE_KERNEL(op_type, CPU);
-
-#define USE_CUDA_ONLY_OP(op_type) \
-  USE_OP_ITSELF(op_type);         \
-  USE_OP_DEVICE_KERNEL(op_type, CUDA)
-
-#define USE_OP(op_type)   \
-  USE_OP_ITSELF(op_type); \
-  USE_OP_KERNEL(op_type)
-// clang-format on
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
deleted file mode 100644
index 04996d7b09cecc3c330a47153c9b10310f1792f4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_registry_test.cc
+++ /dev/null
@@ -1,370 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace pd = paddle::framework;
-
-namespace paddle {
-namespace framework {
-
-class CosineOp : public OperatorBase {
- public:
-  using OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const Scope& scope,
-               const platform::Place& place) const override {}
-};
-
-class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .GreaterThan(0.0);
-    AddComment("This is cos op");
-  }
-};
-
-class MyTestOp : public OperatorBase {
- public:
-  using OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const Scope& scope,
-               const platform::Place& place) const override {}
-};
-
-class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("input", "input of cosine op").AsDuplicable();
-    AddOutput("output", "output of cosine op").AsIntermediate();
-    auto my_checker = [](int i) {
-      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
-    };
-    AddAttr<int>("test_attr", "a simple test attribute")
-        .AddCustomChecker(my_checker);
-    AddComment("This is my_test op");
-  }
-};
-}  // namespace framework
-}  // namespace paddle
-
-static void BuildVar(const std::string& param_name,
-                     std::initializer_list<const char*> arguments,
-                     paddle::framework::proto::OpDesc::Var* var) {
-  var->set_parameter(param_name);
-  for (auto& arg_name : arguments) {
-    var->add_arguments(arg_name);
-  }
-}
-REGISTER_OP_WITHOUT_GRADIENT(cos_sim, paddle::framework::CosineOp,
-                             paddle::framework::CosineOpProtoAndCheckerMaker);
-REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp,
-                             paddle::framework::MyTestOpProtoAndCheckerMaker);
-
-TEST(OpRegistry, CreateOp) {
-  paddle::framework::proto::OpDesc op_desc;
-  op_desc.set_type("cos_sim");
-  BuildVar("input", {"aa"}, op_desc.add_inputs());
-  BuildVar("output", {"bb"}, op_desc.add_outputs());
-
-  float scale = 3.3;
-  auto attr = op_desc.mutable_attrs()->Add();
-  attr->set_name("scale");
-  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
-  attr->set_f(scale);
-
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace cpu_place;
-  op->Run(scope, cpu_place);
-  float scale_get = op->Attr<float>("scale");
-  ASSERT_EQ(scale_get, scale);
-}
-
-TEST(OpRegistry, IllegalAttr) {
-  paddle::framework::proto::OpDesc op_desc;
-  op_desc.set_type("cos_sim");
-  BuildVar("input", {"aa"}, op_desc.add_inputs());
-  BuildVar("output", {"bb"}, op_desc.add_outputs());
-
-  auto attr = op_desc.mutable_attrs()->Add();
-  attr->set_name("scale");
-  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
-  attr->set_f(-2.0);
-
-  bool caught = false;
-  try {
-    paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (paddle::platform::EnforceNotMet err) {
-    caught = true;
-    std::string msg = "larger_than check fail";
-    const char* err_msg = err.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(err_msg[i], msg[i]);
-    }
-  }
-  ASSERT_TRUE(caught);
-}
-
-TEST(OpRegistry, DefaultValue) {
-  paddle::framework::proto::OpDesc op_desc;
-  op_desc.set_type("cos_sim");
-  BuildVar("input", {"aa"}, op_desc.add_inputs());
-  BuildVar("output", {"bb"}, op_desc.add_outputs());
-
-  ASSERT_TRUE(op_desc.IsInitialized());
-
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace cpu_place;
-  op->Run(scope, cpu_place);
-  ASSERT_EQ(op->Attr<float>("scale"), 1.0);
-}
-
-TEST(OpRegistry, CustomChecker) {
-  paddle::framework::proto::OpDesc op_desc;
-  op_desc.set_type("my_test_op");
-  BuildVar("input", {"ii"}, op_desc.add_inputs());
-  BuildVar("output", {"oo"}, op_desc.add_outputs());
-
-  // attr 'test_attr' is not set
-  bool caught = false;
-  try {
-    paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (paddle::platform::EnforceNotMet err) {
-    caught = true;
-    std::string msg = "Attribute 'test_attr' is required!";
-    const char* err_msg = err.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(err_msg[i], msg[i]);
-    }
-  }
-  ASSERT_TRUE(caught);
-
-  // set 'test_attr' set to an illegal value
-  auto attr = op_desc.mutable_attrs()->Add();
-  attr->set_name("test_attr");
-  attr->set_type(paddle::framework::proto::AttrType::INT);
-  attr->set_i(3);
-  caught = false;
-  try {
-    paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (paddle::platform::EnforceNotMet err) {
-    caught = true;
-    std::string msg = "'test_attr' must be even!";
-    const char* err_msg = err.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(err_msg[i], msg[i]);
-    }
-  }
-  ASSERT_TRUE(caught);
-
-  // set 'test_attr' set to a legal value
-  op_desc.mutable_attrs()->Clear();
-  attr = op_desc.mutable_attrs()->Add();
-  attr->set_name("test_attr");
-  attr->set_type(paddle::framework::proto::AttrType::INT);
-  attr->set_i(4);
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  paddle::platform::CPUPlace cpu_place;
-  paddle::framework::Scope scope;
-  op->Run(scope, cpu_place);
-  int test_attr = op->Attr<int>("test_attr");
-  ASSERT_EQ(test_attr, 4);
-}
-
-TEST(OperatorRegistrar, Test) {
-  paddle::framework::OperatorRegistrar<
-      paddle::framework::CosineOp,
-      paddle::framework::CosineOpProtoAndCheckerMaker>
-      reg("cos");
-}
-
-namespace paddle {
-namespace framework {
-
-class OpKernelTestMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() { AddComment("NoGradOp, same input output. no Grad"); }
-};
-
-class OpWithKernelTest : public OperatorWithKernel {
- public:
-  using OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(InferShapeContext* ctx) const override {}
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(proto::VarType::FP32, ctx.device_context());
-  }
-};
-
-template <typename DeviceContext, typename T>
-class OpKernelTest : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const {}
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel,
-                             paddle::framework::OpWithKernelTest,
-                             paddle::framework::OpKernelTestMaker);
-REGISTER_OP_CPU_KERNEL(
-    op_with_kernel,
-    paddle::framework::OpKernelTest<paddle::platform::CPUDeviceContext, float>);
-
-REGISTER_OP_CUDA_KERNEL(op_with_kernel,
-                        paddle::framework::OpKernelTest<
-                            paddle::platform::CUDADeviceContext, float>);
-
-TEST(OperatorRegistrar, CPU) {
-  paddle::framework::proto::OpDesc op_desc;
-  paddle::platform::CPUPlace cpu_place;
-  paddle::framework::Scope scope;
-
-  op_desc.set_type("op_with_kernel");
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-
-  op->Run(scope, cpu_place);
-}
-
-TEST(OperatorRegistrar, CUDA) {
-  paddle::framework::proto::OpDesc op_desc;
-  paddle::platform::CUDAPlace cuda_place(0);
-  paddle::framework::Scope scope;
-
-  op_desc.set_type("op_with_kernel");
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-
-  op->Run(scope, cuda_place);
-}
-
-static int op_test_value = 0;
-
-using paddle::platform::CPUDeviceContext;
-using paddle::platform::CUDADeviceContext;
-using paddle::platform::DeviceContext;
-
-namespace paddle {
-namespace framework {
-
-class OpWithMultiKernelTest : public OperatorWithKernel {
- public:
-  using OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(InferShapeContext* ctx) const override {}
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0),
-                                   DataLayout::kAnyLayout,
-                                   framework::LibraryType::kCUDNN);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class OpMultiKernelTest : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const;
-};
-
-template <typename T>
-class OpMultiKernelTest<CPUDeviceContext, T>
-    : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const {
-    ++op_test_value;
-  }
-};
-
-template <typename T>
-class OpMultiKernelTest<CUDADeviceContext, T>
-    : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const {
-    --op_test_value;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class OpMultiKernelTest2 : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const;
-};
-
-template <typename T>
-class OpMultiKernelTest2<CPUDeviceContext, T>
-    : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const {
-    op_test_value += 10;
-  }
-};
-
-template <typename T>
-class OpMultiKernelTest2<CUDADeviceContext, T>
-    : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const {
-    op_test_value -= 10;
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel,
-                             paddle::framework::OpWithMultiKernelTest,
-                             paddle::framework::OpKernelTestMaker);
-REGISTER_OP_KERNEL(
-    op_with_multi_kernel, CPU, paddle::platform::CPUPlace,
-    paddle::framework::OpMultiKernelTest<CPUDeviceContext, float>);
-REGISTER_OP_KERNEL(
-    op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace,
-    paddle::framework::OpMultiKernelTest2<CPUDeviceContext, float>);
-REGISTER_OP_KERNEL(
-    op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace,
-    paddle::framework::OpMultiKernelTest<CUDADeviceContext, float>);
-REGISTER_OP_KERNEL(
-    op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace,
-    paddle::framework::OpMultiKernelTest2<CUDADeviceContext, float>);
-
-TEST(OperatorRegistrar, OpWithMultiKernel) {
-  paddle::framework::proto::OpDesc op_desc;
-  paddle::platform::CUDAPlace cuda_place(0);
-  paddle::platform::CPUPlace cpu_place;
-  paddle::framework::Scope scope;
-
-  op_desc.set_type("op_with_multi_kernel");
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-
-  // TODO(qiao) add priority back
-  // use all available kernels
-  op->Run(scope, cuda_place);
-  EXPECT_EQ(op_test_value, -10);
-}
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
deleted file mode 100644
index 42e70d9cb0d9b4a8a99c88f23eeb75c9fac937e6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/operator.cc
+++ /dev/null
@@ -1,1191 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-
-#include <algorithm>
-#include <sstream>
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/data_transform.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_call_stack.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/shape_inference.h"
-#include "paddle/fluid/framework/transfer_scope_cache.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DECLARE_bool(benchmark);
-DECLARE_bool(check_nan_inf);
-DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
-DEFINE_bool(fast_check_nan_inf, false,
-            "Fast checking NAN/INF after each operation. It will be a little"
-            "bit slow, much faster than check_nan_inf");
-
-namespace paddle {
-namespace framework {
-
-std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
-    std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
-    std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
-    std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN),
-    std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
-};
-
-proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
-  if (var->IsType<framework::LoDTensor>()) {
-    return var->Get<framework::LoDTensor>().type();
-  } else if (var->IsType<framework::SelectedRows>()) {
-    return var->Get<framework::SelectedRows>().value().type();
-  } else {
-    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
-  }
-}
-
-static DDim GetDimsDebug(const Scope& scope, const std::string& name,
-                         bool get_actual_dim = false) {
-  Variable* var = scope.FindVar(name);
-  if (var == nullptr) {
-    return DDim({-1});
-  }
-
-  if (var->IsType<LoDTensor>()) {
-    const LoDTensor& tensor = var->Get<LoDTensor>();
-    return tensor.dims();
-  } else if (var->IsType<SelectedRows>()) {
-    if (get_actual_dim) {
-      return var->Get<SelectedRows>().value().dims();
-    } else {
-      return var->Get<SelectedRows>().GetCompleteDims();
-    }
-  } else {
-    return DDim({-1});
-  }
-}
-
-static bool VarInited(const Scope& scope, const std::string& name) {
-  Variable* var = scope.FindVar(name);
-  if (var == nullptr) return false;
-  return var->IsInitialized();
-}
-
-static std::string GetDtype(const Scope& scope, const std::string& name) {
-  Variable* var = scope.FindVar(name);
-  if (var == nullptr) {
-    return "";
-  }
-
-  if (var->IsType<LoDTensor>()) {
-    const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (UNLIKELY(!tensor.IsInitialized())) {
-      return "";
-    }
-    return DataTypeToString(tensor.type());
-  } else if (var->IsType<SelectedRows>()) {
-    auto tensor = var->Get<SelectedRows>().value();
-    if (UNLIKELY(!tensor.IsInitialized())) {
-      return "uninited";
-    } else {
-      return DataTypeToString(tensor.type());
-    }
-  } else {
-    return "";
-  }
-}
-
-static int GetRowSize(const Scope& scope, const std::string& name) {
-  Variable* var = scope.FindVar(name);
-  if (var == nullptr) {
-    return -1;
-  }
-
-  if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().rows().size();
-  }
-
-  return -1;
-}
-
-static LoD GetLoDDebug(const Scope& scope, const std::string& name) {
-  Variable* var = scope.FindVar(name);
-  auto default_lod = LoD({{}});
-
-  if (var == nullptr) {
-    return default_lod;
-  }
-
-  if (var->IsType<LoDTensor>()) {
-    const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (UNLIKELY(!tensor.IsInitialized())) {
-      return default_lod;
-    }
-    return tensor.lod();
-  } else {
-    return default_lod;
-  }
-}
-
-RuntimeContext::RuntimeContext(const VariableNameMap& innames,
-                               const VariableNameMap& outnames,
-                               const Scope& scope) {
-  for (auto& var_name_item : innames) {
-    std::vector<Variable*>& input_vars = inputs[var_name_item.first];
-    input_vars.reserve(var_name_item.second.size());
-    for (auto& var_name : var_name_item.second) {
-      input_vars.push_back(scope.FindVar(var_name));
-    }
-  }
-  for (auto& var_name_item : outnames) {
-    std::vector<Variable*>& output_vars = outputs[var_name_item.first];
-    output_vars.reserve(var_name_item.second.size());
-    for (auto& var_name : var_name_item.second) {
-      output_vars.push_back(scope.FindVar(var_name));
-    }
-  }
-}
-
-void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  try {
-    VLOG(4) << place << " " << DebugStringEx(&scope);
-    if (platform::is_gpu_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW("Cannot run operator on place %s", place);
-#else
-      auto dev_id = boost::get<platform::CUDAPlace>(place).device;
-      platform::SetDeviceId(dev_id);
-#endif
-    }
-
-    // The profile has a process-wide mutex, results in serious performance
-    // issue
-    // in concurrency scenerio. Here use an `if` to fix this issue.
-    // Please not remove the `if`, ask @Superjomn if there are any concern.
-    if (platform::IsProfileEnabled()) {
-      platform::RecordEvent record_event(Type());
-      RunImpl(scope, place);
-    } else {
-      RunImpl(scope, place);
-    }
-    VLOG(3) << place << " " << DebugStringEx(&scope);
-  } catch (platform::EnforceNotMet exception) {
-    framework::InsertCallStackInfo(Type(), Attrs(), &exception);
-    throw std::move(exception);
-  } catch (...) {
-    std::rethrow_exception(std::current_exception());
-  }
-}
-
-bool OperatorBase::HasInputs(const std::string& name) const {
-  return inputs_.find(name) != inputs_.end();
-}
-
-std::string OperatorBase::Input(const std::string& name) const {
-  auto& ins = Inputs(name);
-  PADDLE_ENFORCE_LE(ins.size(), 1UL,
-                    "Operator %s's input %s should contain only one variable.",
-                    type_, name);
-  return ins.empty() ? kEmptyVarName : ins[0];
-}
-
-const std::vector<std::string>& OperatorBase::Inputs(
-    const std::string& name) const {
-  auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.",
-                 type_, name);
-  return it->second;
-}
-
-bool OperatorBase::HasOutputs(const std::string& name) const {
-  if (outputs_.find(name) != outputs_.end()) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-std::string OperatorBase::Output(const std::string& name) const {
-  auto& outs = Outputs(name);
-  PADDLE_ENFORCE_LE(outs.size(), 1UL,
-                    "Operator %s's output %s should contain only one variable.",
-                    type_, name);
-  return outs.empty() ? kEmptyVarName : outs[0];
-}
-
-const std::vector<std::string>& OperatorBase::Outputs(
-    const std::string& name) const {
-  auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(),
-                 "Operator %s does not have an output called %s.", type_, name);
-  return it->second;
-}
-
-std::string OperatorBase::DebugStringEx(const Scope* scope) const {
-  std::stringstream ss;
-  ss << "Op(" << type_ << "), inputs:{";
-  for (auto it = inputs_.begin(); it != inputs_.end();) {
-    auto& input = *it;
-    ss << input.first << "[";
-    for (size_t i = 0; i < input.second.size(); ++i) {
-      auto var_name = input.second[i];
-      ss << var_name;
-      if (scope) {
-        if (!VarInited(*scope, var_name)) {
-          ss << "[uninited]";
-        } else {
-          int row_size = GetRowSize(*scope, var_name);
-          if (row_size >= 0) {
-            ss << "[row_size=" << row_size << "]";
-          }
-          std::string dtype = GetDtype(*scope, var_name);
-          ss << ":" << dtype;
-          ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
-          ss << "(" << GetLoDDebug(*scope, var_name) << ")";
-        }
-      }
-      if (i != input.second.size() - 1) {
-        ss << ", ";
-      }
-    }
-    ss << "]";
-    ++it;
-    if (it != inputs_.end()) {
-      ss << ", ";
-    }
-  }
-  ss << "}, outputs:{";
-  for (auto it = outputs_.begin(); it != outputs_.end();) {
-    auto& output = *it;
-    ss << output.first << "[";
-    for (size_t i = 0; i < output.second.size(); ++i) {
-      auto var_name = output.second[i];
-      ss << var_name;
-      if (scope) {
-        if (!VarInited(*scope, var_name)) {
-          ss << "[uninited]";
-        } else {
-          int row_size = GetRowSize(*scope, output.second[i]);
-          if (row_size >= 0) {
-            ss << "[row_size=" << row_size << "]";
-          }
-          std::string dtype = GetDtype(*scope, output.second[i]);
-          ss << ":" << dtype;
-          ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
-          ss << "(" << GetLoDDebug(*scope, var_name) << ")";
-        }
-      }
-      if (i != output.second.size() - 1) {
-        ss << ", ";
-      }
-    }
-    ss << "]";
-    ++it;
-    if (it != outputs_.end()) {
-      ss << ", ";
-    }
-  }
-  ss << "}.";
-  return ss.str();
-}
-
-OperatorBase::OperatorBase(const std::string& type,
-                           const VariableNameMap& inputs,
-                           const VariableNameMap& outputs,
-                           const AttributeMap& attrs)
-    : type_(type),
-      inputs_(inputs),
-      outputs_(outputs),
-      attrs_(attrs),
-      // NOTE(zjl): why op_info may be nullptr?
-      info_(OpInfoMap::Instance().GetNullable(type)) {
-  GenerateTemporaryNames();
-  CheckAllInputOutputSet();
-}
-
-std::vector<std::string> OperatorBase::InputVars() const {
-  std::vector<std::string> ret_val;
-  for (auto& o : inputs_) {
-    ret_val.reserve(ret_val.size() + o.second.size());
-    ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
-  }
-  return ret_val;
-}
-
-std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
-  std::vector<std::string> ret_val;
-  if (has_intermediate) {
-    // push all outputs into ret_val
-    for (auto& o : outputs_) {
-      ret_val.reserve(ret_val.size() + o.second.size());
-      ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
-    }
-    return ret_val;
-  }
-  auto& info = Info();
-
-  // get all OpProto::Var for outputs
-  for (auto& o : info.Proto().outputs()) {
-    // ignore all intermediate output
-    if (o.intermediate()) continue;
-    auto out = outputs_.find(o.name());
-    if (out != outputs_.end()) {
-      ret_val.reserve(ret_val.size() + out->second.size());
-      ret_val.insert(ret_val.end(), out->second.begin(), out->second.end());
-    }
-  }
-  return ret_val;
-}
-
-void OperatorBase::CheckAllInputOutputSet() const {
-  if (info_ == nullptr || info_->proto_ == nullptr) return;
-
-  for (auto& in : info_->Proto().inputs()) {
-    if (!in.dispensable()) {
-      PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
-                     "Operator %s's input, %s, is not set", Type(), in.name());
-    }
-  }
-
-  for (auto& out : info_->Proto().outputs()) {
-    if (!out.dispensable()) {
-      PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
-                     "Operator %s's output, %s, is not set", Type(),
-                     out.name());
-    }
-  }
-}
-
-void OperatorBase::GenerateTemporaryNames() {
-  static std::atomic<size_t> gUniqId(0UL);
-  for (auto& output : outputs_) {
-    for (auto& output_name : output.second) {
-      if (output_name == kTempVarName) {
-        output_name += type_;
-        output_name += "@";
-        output_name += std::to_string(gUniqId.fetch_add(1));
-      }
-    }
-  }
-}
-
-static bool VarIsTensor(const Variable& var) {
-  return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
-}
-
-const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) {
-  if (var.IsType<LoDTensor>()) {
-    return static_cast<const Tensor*>(&(var.Get<LoDTensor>()));
-  } else if (var.IsType<SelectedRows>()) {
-    return &(var.Get<SelectedRows>().value());
-  } else {
-    PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                 ToTypeName(var.Type()));
-  }
-}
-
-Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
-  if (var->IsType<LoDTensor>()) {
-    return var->GetMutable<LoDTensor>();
-  } else if (var->IsType<SelectedRows>()) {
-    return var->GetMutable<SelectedRows>()->mutable_value();
-  } else {
-    PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                 ToTypeName(var->Type()));
-  }
-}
-
-bool ExecutionContext::HasInput(const std::string& name) const {
-  if (!op_.HasInputs(name)) {
-    return false;
-  }
-  auto& ins = Inputs(name);
-  size_t length = ins.size();
-  if (length == 0) {
-    return false;
-  }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input %s should not have more than one inputs", name);
-  auto arg = ins[0];
-  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
-  return var != nullptr;
-}
-
-bool ExecutionContext::HasOutput(const std::string& name) const {
-  if (!op_.HasOutputs(name)) {
-    return false;
-  }
-  auto& outs = Outputs(name);
-  size_t length = outs.size();
-  if (length == 0) {
-    return false;
-  }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Output %s should not have more than one inputs", name);
-  auto arg = outs[0];
-  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
-  return var != nullptr;
-}
-
-const Variable* ExecutionContext::InputVar(const std::string& name) const {
-  auto it = ctx_.inputs.find(name);
-  if (it == ctx_.inputs.end()) return nullptr;
-
-  PADDLE_ENFORCE_LE(it->second.size(), 1UL,
-                    "Operator %s's input %s should contain only one variable.",
-                    op_.Type(), name);
-  return it->second.empty() ? nullptr : it->second[0];
-}
-
-Variable* ExecutionContext::OutputVar(const std::string& name) const {
-  auto it = ctx_.outputs.find(name);
-  if (it == ctx_.outputs.end()) return nullptr;
-
-  PADDLE_ENFORCE_LE(it->second.size(), 1UL,
-                    "Operator %s's output %s should contain only one variable.",
-                    op_.Type(), name);
-  return it->second.empty() ? nullptr : it->second[0];
-}
-
-template <>
-const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
-  return Input<LoDTensor>(name);
-}
-
-template <>
-const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
-    const std::string& name) const {
-  auto it = ctx_.inputs.find(name);
-  if (it == ctx_.inputs.end()) {
-    return {};
-  }
-  const std::vector<Variable*>& vars = it->second;
-  std::vector<const Tensor*> res;
-  res.reserve(vars.size());
-  std::transform(vars.begin(), vars.end(), std::back_inserter(res),
-                 [&](Variable* var) -> const Tensor* {
-                   if (var == nullptr) return nullptr;
-                   PADDLE_ENFORCE(
-                       var->IsType<LoDTensor>(),
-                       "should be LoDTensor, but the received type is %s",
-                       ToTypeName(var->Type()));
-                   return &(var->Get<LoDTensor>());
-                 });
-  return res;
-}
-
-template <>
-Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
-  return Output<LoDTensor>(name);
-}
-
-template <>
-std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
-    const std::string& name) const {
-  auto it = ctx_.outputs.find(name);
-  if (it == ctx_.outputs.end()) {
-    return {};
-  }
-  const std::vector<Variable*>& vars = it->second;
-  std::vector<Tensor*> res;
-  res.reserve(vars.size());
-  std::transform(vars.begin(), vars.end(), std::back_inserter(res),
-                 [&](Variable* var) -> Tensor* {
-                   return var == nullptr ? nullptr
-                                         : var->GetMutable<LoDTensor>();
-                 });
-  return res;
-}
-
-bool OpSupportGPU(const std::string& op_type) {
-  auto& all_kernels = OperatorWithKernel::AllOpKernels();
-  auto it = all_kernels.find(op_type);
-  if (it == all_kernels.end()) {
-    // All control operator must support GPU
-    return true;
-  }
-  for (auto& kern_pair : it->second) {
-    if (platform::is_gpu_place(kern_pair.first.place_)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-class RuntimeInferShapeContext : public InferShapeContext {
- public:
-  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
-                           const RuntimeContext& ctx)
-      : op_(op), ctx_(ctx) {}
-
-  bool HasInput(const std::string& name) const override {
-    // has only one input
-    const auto& ins = ctx_.inputs;
-    auto it = ins.find(name);
-    if (it == ins.end()) {
-      return false;
-    }
-    const auto& in = it->second;
-    if (in.size() == 0) return false;
-    PADDLE_ENFORCE_EQ(in.size(), 1UL,
-                      "Input %s should not have more than one inputs", name);
-    return in[0] != nullptr;
-  }
-
-  bool HasOutput(const std::string& name) const override {
-    // has only one output
-    const auto& outs = ctx_.outputs;
-    auto it = outs.find(name);
-    if (it == outs.end()) {
-      return false;
-    }
-    const auto& out = it->second;
-    if (out.size() == 0) {
-      return false;
-    }
-    PADDLE_ENFORCE_EQ(out.size(), 1UL,
-                      "Output %s should not have more than one outputs", name);
-    return out[0] != nullptr;
-  }
-
-  bool HasInputs(const std::string& name) const override {
-    const auto& ins = ctx_.inputs;
-    auto it = ins.find(name);
-    if (it == ins.end() || it->second.empty()) {
-      return false;
-    }
-    for (auto& input : it->second) {
-      if (input == nullptr) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  bool HasOutputs(const std::string& name) const override {
-    const auto& outs = ctx_.outputs;
-    auto it = outs.find(name);
-    if (it == outs.end() || it->second.empty()) {
-      return false;
-    }
-    for (auto& output : it->second) {
-      if (output == nullptr) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
-
-  const std::vector<std::string>& Inputs(
-      const std::string& name) const override {
-    return op_.Inputs(name);
-  }
-
-  const std::vector<std::string>& Outputs(
-      const std::string& name) const override {
-    return op_.Outputs(name);
-  }
-
-  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) override {
-    auto in_it = ctx_.inputs.find(in);
-    auto out_it = ctx_.outputs.find(out);
-    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
-                   "Inputs %s should have %llu argument", in, i);
-    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
-                   "Outputs %s should have %llu argument", out, j);
-
-    Variable* in_var = in_it->second[i];
-    Variable* out_var = out_it->second[j];
-
-    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
-                   "The type of %s and %s is not the same.", in, out);
-
-    if (in_var->IsType<framework::SelectedRows>()) {
-      auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
-      auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
-      out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
-      out_sele_rows->set_rows(in_sele_rows.rows());
-      out_sele_rows->set_height(in_sele_rows.height());
-    } else if (in_var->IsType<framework::LoDTensor>()) {
-      auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
-      auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
-      out_lod_tensor->Resize(in_lod_tensor.dims());
-    } else {
-      PADDLE_THROW(
-          "Currently, the input type of ShareDim only can be LoDTensor "
-          "or SelectedRows.");
-    }
-  }
-
-  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) const override {
-    auto in_it = ctx_.inputs.find(in);
-    auto out_it = ctx_.outputs.find(out);
-    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
-                   "Inputs %s should have %llu argument", in, i);
-    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
-                   "Outputs %s should have %llu argument", out, j);
-
-    Variable* in_var = in_it->second.at(i);
-    if (!in_var->IsType<LoDTensor>()) return;
-    Variable* out_var = out_it->second.at(j);
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto& in_tensor = in_var->Get<LoDTensor>();
-    auto* out_tensor = out_var->GetMutable<LoDTensor>();
-    out_tensor->set_lod(in_tensor.lod());
-
-// TODO(dzhwinter) : reuse ShareLoD in most operators.
-// Need to call ShareLayout explicitly in sequence related ops.
-// Shall we have a better method to shared info between in/out Tensor?
-#ifdef PADDLE_WITH_MKLDNN
-    // Fix me: ugly workaround below
-    // Correct solution:
-    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
-    //    layout of output tensor should be set "manually" in Compute()
-    //    of each OPKernel. The reason layout should NOT be shared between
-    //    input and output "automatically" (now by InferShape()->ShareLoD())
-    //    is that layout transform may occur after InferShape().
-    // Workaround:
-    //    Skip set_layout() when input layout is kMKLDNN
-    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
-    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
-    //    in Compute()
-    if (in_tensor.layout() != DataLayout::kMKLDNN)
-#endif
-      out_tensor->set_layout(in_tensor.layout());
-  }
-
-  void DecreaseLoDLevel(const std::string& in, const std::string& out,
-                        size_t i = 0, size_t j = 0) const override {
-    PADDLE_THROW("DecreaseLoDLevel is only used in compile time.");
-  }
-
-  bool IsRuntime() const override { return true; }
-
-  // TODO(paddle-dev): Can this be template?
-  std::vector<InferShapeVarPtr> GetInputVarPtrs(
-      const std::string& name) override {
-    const std::vector<Variable*>& vars = InputVars(name);
-    std::vector<InferShapeVarPtr> res;
-    res.reserve(vars.size());
-    res.insert(res.begin(), vars.begin(), vars.end());
-    return res;
-  }
-
-  std::vector<InferShapeVarPtr> GetOutputVarPtrs(
-      const std::string& name) override {
-    const std::vector<Variable*>& vars = OutputVars(name);
-    std::vector<InferShapeVarPtr> res;
-    res.reserve(vars.size());
-    res.insert(res.begin(), vars.begin(), vars.end());
-    return res;
-  }
-
-  DDim GetInputDim(const std::string& name) const override {
-    const std::vector<Variable*>& vars = InputVars(name);
-    PADDLE_ENFORCE_EQ(vars.size(), 1UL,
-                      "Input(%s) should hold one element, but now it holds %d",
-                      name, vars.size());
-    return this->GetDim(vars[0]);
-  }
-
-  std::vector<DDim> GetInputsDim(const std::string& name) const override {
-    const std::vector<Variable*>& vars = InputVars(name);
-    return GetDims(vars);
-  }
-
-  std::vector<proto::VarType::Type> GetInputsVarType(
-      const std::string& name) const override {
-    return GetVarTypes(InputVars(name));
-  }
-
-  std::vector<proto::VarType::Type> GetOutputsVarType(
-      const std::string& name) const override {
-    return GetVarTypes(OutputVars(name));
-  }
-
-  void SetOutputDim(const std::string& name, const DDim& dim) override {
-    auto& vars = OutputVars(name);
-    PADDLE_ENFORCE_EQ(vars.size(), 1UL,
-                      "Output(%s) should hold one element, but now it holds %d",
-                      name, vars.size());
-    SetDim(vars[0], dim);
-  }
-
-  void SetOutputsDim(const std::string& name,
-                     const std::vector<DDim>& dims) override {
-    auto& vars = OutputVars(name);
-    SetDims(vars, dims);
-  }
-
- protected:
-  DDim GetDim(Variable* var) const {
-    PADDLE_ENFORCE_NOT_NULL(var);
-    if (var->IsType<LoDTensor>()) {
-      return var->Get<LoDTensor>().dims();
-    } else if (var->IsType<SelectedRows>()) {
-      return var->Get<SelectedRows>().GetCompleteDims();
-    } else {
-      PADDLE_THROW(
-          "Only LoDTensor/SelectedRows support 'GetDim', but Variables "
-          "type_id is %s.",
-          ToTypeName(var->Type()));
-    }
-  }
-
-  std::vector<DDim> GetDims(const std::vector<Variable*>& vars) const {
-    std::vector<DDim> ret;
-    ret.reserve(vars.size());
-    std::transform(vars.begin(), vars.end(), std::back_inserter(ret),
-                   [this](Variable* var) { return this->GetDim(var); });
-    return ret;
-  }
-
-  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
-    PADDLE_THROW("Only compile time support this method");
-  }
-
-  void SetDim(Variable* var, const DDim& dim) {
-    if (var->IsType<LoDTensor>()) {
-      var->GetMutable<LoDTensor>()->Resize(dim);
-    } else if (var->IsType<SelectedRows>()) {
-      var->GetMutable<SelectedRows>()->set_height(dim[0]);
-    } else {
-      PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                   ToTypeName(var->Type()));
-    }
-  }
-
-  void SetDims(const std::vector<Variable*>& vars,
-               const std::vector<DDim>& dims) {
-    size_t length = vars.size();
-    PADDLE_ENFORCE_EQ(length, dims.size());
-    for (size_t i = 0; i < length; ++i) {
-      if (vars[i] == nullptr) {
-        continue;
-      }
-      SetDim(vars[i], dims[i]);
-    }
-  }
-
-  void SetRepeatedDims(const std::string& name,
-                       const std::vector<DDim>& dims) override {
-    PADDLE_THROW("Only compile time support this method");
-  }
-
-  std::vector<proto::VarType::Type> GetVarTypes(
-      const std::vector<Variable*>& vars) const {
-    std::vector<proto::VarType::Type> retv;
-    retv.resize(vars.size());
-    std::transform(vars.begin(), vars.end(), retv.begin(),
-                   std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType),
-                             this, std::placeholders::_1));
-    return retv;
-  }
-
-  proto::VarType::Type GetVarType(Variable* var) const {
-    return ToVarType(var->Type());
-  }
-
- private:
-  const std::vector<Variable*>& InputVars(const std::string& name) const {
-    auto it = ctx_.inputs.find(name);
-    PADDLE_ENFORCE(it != ctx_.inputs.end(),
-                   "Operator %s does not have the input %s.", op_.Type(), name);
-    return it->second;
-  }
-
-  const std::vector<Variable*>& OutputVars(const std::string& name) const {
-    auto it = ctx_.outputs.find(name);
-    PADDLE_ENFORCE(it != ctx_.outputs.end(),
-                   "Operator %s does not have the outputs %s.", op_.Type(),
-                   name);
-    return it->second;
-  }
-
-  const OperatorBase& op_;
-  const RuntimeContext& ctx_;
-};
-
-static void CheckTensorNANOrInf(const std::string& op_type,
-                                const std::string& name,
-                                const framework::Tensor& tensor) {
-  if (tensor.memory_size() == 0) {
-    return;
-  }
-  if (tensor.type() != proto::VarType::FP32 &&
-      tensor.type() != proto::VarType::FP64) {
-    return;
-  }
-  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
-                 "Operator %s output Tensor %s contains Inf", op_type, name);
-  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
-                 "Operator %s output Tensor %s contains NAN", op_type, name);
-}
-
-void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
-                                           const platform::Place& place,
-                                           const RuntimeContext& ctx) const {
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx);
-  this->InferShape(&infer_shape_ctx);
-}
-
-std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
-    const OpKernelType& key) const {
-  auto config_iter = kernel_configs_map_.find(key);
-  std::vector<KernelConfig>* kernel_configs = nullptr;
-  if (config_iter != kernel_configs_map_.end()) {
-    kernel_configs = &(config_iter->second);
-  }
-  return kernel_configs;
-}
-
-void OperatorWithKernel::RunImpl(const Scope& scope,
-                                 const platform::Place& place) const {
-  // To reduce the elapsed time of HasAttr, we use bool variable to record the
-  // result of HasAttr.
-  if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
-    enable_cache_runtime_context_ = true;
-  if (!all_kernels_must_compute_runtime_shape_ &&
-      HasAttr(kAllKernelsMustComputeRuntimeShape))
-    all_kernels_must_compute_runtime_shape_ = true;
-  if (!enable_cache_runtime_context_) {
-    RuntimeContext ctx(Inputs(), Outputs(), scope);
-    RunImpl(scope, place, &ctx);
-  } else {
-    const Scope* cur_scope = &scope;
-    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
-      std::lock_guard<std::mutex> lock(cache_update_mutex_);
-      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
-        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
-        pre_scope_ = cur_scope;
-      }
-    }
-    RunImpl(scope, place, runtime_ctx_.get());
-  }
-}
-
-void OperatorWithKernel::RunImpl(const Scope& scope,
-                                 const platform::Place& place,
-                                 RuntimeContext* runtime_ctx) const {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(place);
-
-  if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
-    ChooseKernel(*runtime_ctx, scope, place);
-  }
-
-  std::vector<KernelConfig>* kernel_configs = GetKernelConfig(*kernel_type_);
-
-  // do data transformScope &transfer_scope;
-  std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
-
-  // exec scope is the scope that kernel actually executed on.
-  const Scope& exec_scope =
-      (transfer_scope == nullptr ? scope : *transfer_scope);
-
-  if (!(kernel_type_->place_ == dev_ctx->GetPlace())) {
-    dev_ctx = pool.Get(kernel_type_->place_);
-  }
-
-  if (!all_kernels_must_compute_runtime_shape_) {
-    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
-    this->InferShape(&infer_shape_ctx);
-  }
-  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
-  // not Scope. Imperative mode only pass inputs and get outputs.
-  (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
-                                   kernel_configs));
-
-  if (!transfered_inplace_vars.empty()) {
-    // there is inplace variable has been transfered.
-    TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
-  }
-
-  /*For profiling/benchmark only*/
-  if (FLAGS_benchmark) {
-    dev_ctx->Wait();
-  }
-
-  if (FLAGS_fast_check_nan_inf) {
-    for (auto& vname : OutputVars(true)) {
-      // only check inserted vars,
-      // please see executor.py for details of fast_check_nan_inf
-      if (vname.rfind("debug_var") == 0) {
-        VLOG(3) << "debugging nan/inf in var " << vname;
-
-        auto* var = exec_scope.FindVar(vname);
-        if (var == nullptr) continue;
-        if (var->IsType<framework::LoDTensor>()) {
-          CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
-        } else if (var->IsType<framework::SelectedRows>()) {
-          CheckTensorNANOrInf(type_, vname,
-                              var->Get<framework::SelectedRows>().value());
-        }
-      }
-    }
-  }
-
-  if (FLAGS_check_nan_inf) {
-    for (auto& vname : OutputVars(true)) {
-      auto* var = exec_scope.FindVar(vname);
-      if (var == nullptr) continue;
-      if (var->IsType<framework::LoDTensor>()) {
-        CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
-      } else if (var->IsType<framework::SelectedRows>()) {
-        CheckTensorNANOrInf(type_, vname,
-                            var->Get<framework::SelectedRows>().value());
-      }
-    }
-  }
-
-  // To solve issue #15032, have a discussion with @Luotao for cpu inference,
-  // do not cache transfer scope, hence in this case delete transfer scope
-  // after run to avoid memory leak
-  if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) {
-    scope.DeleteScope(transfer_scope);
-  }
-}
-
-void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
-                                      const Scope& scope,
-                                      const platform::Place& place) const {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(place);
-
-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(type_);
-  if (kernels_iter == all_op_kernels.end()) {
-    PADDLE_THROW(
-        "There are no kernels which are registered in the %s operator.", type_);
-  }
-
-  OpKernelMap& kernels = kernels_iter->second;
-
-  auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-
-  auto kernel_iter = kernels.find(expected_kernel_key);
-#ifdef PADDLE_WITH_MKLDNN
-  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
-  if (kernel_iter == kernels.end() &&
-      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
-    expected_kernel_key.library_type_ = LibraryType::kPlain;
-    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
-#endif
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("op %s does not have kernel for %s", type_,
-                 KernelTypeToString(expected_kernel_key));
-  }
-
-  std::lock_guard<std::mutex> lock(cache_update_mutex_);
-  if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
-    kernel_type_.reset(new OpKernelType(expected_kernel_key));
-    kernel_func_.reset(new OpKernelFunc(kernel_iter->second));
-  }
-}
-
-void OperatorWithKernel::TransferInplaceVarsBack(
-    const Scope& scope, const std::vector<std::string>& inplace_vars,
-    const Scope& transfer_scope) const {
-  for (auto& var_name : inplace_vars) {
-    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
-    auto* origin_var = scope.FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.",
-                            var_name);
-    auto* original_tensor =
-        GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
-    auto* var = transfer_scope.FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.",
-                            var_name);
-    auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-    original_tensor->ShareDataWith(*transformed_tensor);
-  }
-}
-
-Scope* OperatorWithKernel::PrepareData(
-    const Scope& scope, const OpKernelType& expected_kernel_key,
-    std::vector<std::string>* transfered_inplace_vars,
-    RuntimeContext* ctx) const {
-  Scope* new_scope = nullptr;
-
-  std::unordered_set<std::string> no_buffer_ins;
-  if (info_) {
-    auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer();
-    // Some op may not register NoNeedBufferVarsInferer
-    if (no_buffer_inferer) {
-      no_buffer_ins = no_buffer_inferer(Inputs(), Outputs(), Attrs());
-    }
-  }
-
-  for (auto& var_name_item : Inputs()) {
-    // NOTE(zjl): STL does not guarantee fast std::unordered_set::count when set
-    // is empty. At least STL implemented on my mac does calculate hash code
-    // of search key even though the set is empty.
-    if (!no_buffer_ins.empty() &&
-        no_buffer_ins.count(var_name_item.first) > 0) {
-      VLOG(7) << "Skip scanning input " << var_name_item.first
-              << " in Operator " << type_;
-      continue;
-    }
-
-    std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
-
-    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
-      auto& var_name = var_name_item.second[i];
-      auto* var = input_vars[i];
-
-      // Only tensor can be tranfer to another device.
-      if (var == nullptr || !VarIsTensor(*var)) {
-        continue;
-      }
-
-      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-      if (!tensor_in->IsInitialized()) {
-        continue;
-      }
-
-      auto kernel_type_for_var = GetKernelTypeForVar(
-          var_name_item.first, *tensor_in, expected_kernel_key);
-
-      if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
-        continue;
-      }
-
-      auto out_var_names = OutputVars(true);
-      if (std::find(out_var_names.begin(), out_var_names.end(), var_name) !=
-          out_var_names.end()) {
-        transfered_inplace_vars->emplace_back(var_name);
-      }
-
-      VLOG(3) << "Transform Variable " << var_name << " from "
-              << kernel_type_for_var << " to " << expected_kernel_key;
-
-      // In the inference scenerio, the scopes will be reused across the
-      // batches, so the `new_scope` here will result in GPU memroy explosion
-      // over the  running of operators.
-      // We use a thread_local cache to fix that issue, the key in the cache is
-      // the combination of the `scope` argument, from_kernel_type,
-      // target_kernel_type.
-      // Have a discussion with @Superjomn or the inference developers if some
-      // changes on this logic for this macro might not tested on the other
-      // scenerios.
-      // If this op is not called by an Executor or ParallelExecutor, it should
-      // called by a NaiveExecutor, the NaiveExecutor will cache the scopes and
-      // variables, that behavior a lot different.
-      //
-      // To solve issue #15032, have a discussion with @Luotao for cpu
-      // inference, for all cpu kernels cases without GPU participation, here
-      // not do transfer scope caching, and cpu inference performance is not
-      // impacted by test.
-      enable_cache_transfer_scope_ = false;
-      if (!run_by_executor_ &&
-          (platform::is_gpu_place(kernel_type_for_var.place_) ||
-           platform::is_gpu_place(expected_kernel_key.place_))) {
-        new_scope = TryCreateTransferScope(kernel_type_for_var,
-                                           expected_kernel_key, &scope);
-        enable_cache_transfer_scope_ = true;
-      }
-      if (!new_scope) {
-        new_scope = &scope.NewScope();
-      }
-      // For inference, if a gpu model has an op which could only run on CPU,
-      // each result of different input will be the same with the first one.
-      // The reason is that if a gpu tensor is the input of a cpu kernel,
-      // we will create a new cpu tensor in new scope.
-      // However, if enable_cache_runtime_context_, we get the cpu tensor each
-      // time, not the gpu tensor.
-      // Thus, we set pre_scope_ = nullptr to trigger `new RuntimeContext()` in
-      // RunImpl().
-      if (enable_cache_runtime_context_) {
-        pre_scope_ = nullptr;
-      }
-
-      auto* trans_var = new_scope->Var(var_name);
-      input_vars[i] = trans_var;
-
-      Tensor out;
-      TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
-      SetTensorToVariable(*var, out, trans_var);
-    }
-  }
-
-  return new_scope;
-}
-
-proto::VarType::Type OperatorWithKernel::IndicateDataType(
-    const ExecutionContext& ctx) const {
-  proto::VarType::Type dafault_data_type =
-      static_cast<proto::VarType::Type>(-1);
-  proto::VarType::Type data_type = dafault_data_type;
-  for (auto& input : this->inputs_) {
-    const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
-    for (size_t i = 0; i < vars.size(); ++i) {
-      const Variable* var = vars[i];
-      if (var != nullptr) {
-        const Tensor* t = nullptr;
-        if (var->IsType<Tensor>()) {
-          t = &var->Get<Tensor>();
-        } else if (var->IsType<LoDTensor>()) {
-          t = &var->Get<LoDTensor>();
-        } else if (var->IsType<SelectedRows>()) {
-          t = &(var->Get<SelectedRows>().value());
-        }
-        if (t != nullptr) {
-          PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu) is not initialized",
-                         input.first, i);
-          proto::VarType::Type tmp = t->type();
-          PADDLE_ENFORCE(
-              tmp == data_type || data_type == dafault_data_type,
-              "DataType of Paddle Op %s %s must be the same. Get (%s) != (%s)",
-              Type(), input.first, DataTypeToString(data_type),
-              DataTypeToString(tmp));
-          data_type = tmp;
-        }
-      }
-    }
-  }
-  PADDLE_ENFORCE(data_type != dafault_data_type,
-                 "DataType should be indicated by input");
-  return data_type;
-}
-
-OpKernelType OperatorWithKernel::GetExpectedKernelType(
-    const ExecutionContext& ctx) const {
-  return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
-}
-
-OpKernelType OperatorWithKernel::GetKernelTypeForVar(
-    const std::string& var_name, const Tensor& tensor,
-    const OpKernelType& expected_kernel_type) const {
-  return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
-                      tensor.layout());
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
deleted file mode 100644
index 5899a14f503fffe603803bfe56533aa40425a252..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/operator.h
+++ /dev/null
@@ -1,513 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <atomic>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <tuple>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "glog/logging.h"  // For VLOG
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_kernel_type.h"
-#include "paddle/fluid/framework/operator_kernel_configs.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/variant.h"
-
-DECLARE_int32(inner_op_parallelism);
-
-namespace paddle {
-namespace framework {
-
-/// If a variable is a empty variable, that name will be used.
-constexpr char kEmptyVarName[] = "@EMPTY@";
-
-/// If a variable is a temporary variable, that name will be set in Python,
-/// but it will be convert to a unique name in scope after OpCreator.
-constexpr char kTempVarName[] = "@TEMP@";
-
-/// If a variable's name has a certain suffix, it means that the
-/// variable is the gradient of another varibale.
-/// e.g. Variable "x@GRAD" is the gradient of varibale "x".
-constexpr char kGradVarSuffix[] = "@GRAD";
-
-constexpr size_t kGradVarSuffixSize = 5U;
-
-/// Variables with this suffix are supposed to be filled up with zeros.
-constexpr char kZeroVarSuffix[] = "@ZERO";
-
-/// Variables with this suffix are the new Gradient.
-constexpr char kNewGradSuffix[] = "@NEWGRAD@";
-
-/// RuntimeContext is used to relate input/output names of Operator with
-/// the corresponding variables in name scope.
-/// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same
-/// name scope, since the input/output names of this Op do not change in the
-/// execution, RuntimeContext could be created only at the first iteration of
-/// this Op's execution to save the elapsed time.
-constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
-
-/// If an Op has this attribute, all its kernels should calculate output
-/// variable's shape in the corresponding Compute() function. And
-/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
-/// function in its runtime for speedup.
-/// TODO(luotao): Note that this temporal attribute would be deleted after all
-/// ops contain it.
-constexpr char kAllKernelsMustComputeRuntimeShape[] =
-    "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
-
-// define some kernel priority
-/* Define multiple kernel type fallback order*/
-extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
-
-inline std::string GradVarName(const std::string& var_name) {
-  std::string result;
-  result.reserve(var_name.size() + kGradVarSuffixSize);
-  result += var_name;
-  result += kGradVarSuffix;
-  return result;
-}
-
-inline std::string GradOriginalVarName(const std::string& grad_var_name) {
-  std::size_t pos = grad_var_name.rfind(kGradVarSuffix);
-  if (pos == std::string::npos) {
-    return grad_var_name;
-  } else {
-    return grad_var_name.substr(0, pos);
-  }
-}
-
-proto::VarType::Type GetDataTypeOfVar(const Variable* var);
-const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var);
-Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
-
-class OperatorBase;
-class ExecutionContext;
-
-class RuntimeContext {
- public:
-  RuntimeContext(const VariableNameMap& innames,
-                 const VariableNameMap& outnames, const Scope& scope);
-
-  RuntimeContext(const VariableValueMap& invars,
-                 const VariableValueMap& outvars)
-      : inputs(invars), outputs(outvars) {}
-
-  VariableValueMap inputs;
-  VariableValueMap outputs;
-};
-
-/**
- * OperatorBase has the basic elements that Net will call to do computation.
- * Only CreateOperator from OpRegistry will new Operator directly. User
- * should always construct a proto message OpDesc and call
- * OpRegistry::CreateOp(op_desc) to get an Operator instance.
- */
-class OperatorBase {
- public:
-  OperatorBase(const std::string& type, const VariableNameMap& inputs,
-               const VariableNameMap& outputs, const AttributeMap& attrs);
-
-  virtual ~OperatorBase() {}
-
-  /// Executor will call this interface function to Run an op.
-  //  The implementation should be written at RunImpl
-  void Run(const Scope& scope, const platform::Place& place);
-
-  // FIXME(typhoonzero): this is only used for recv_op to stop event_loop.
-  virtual void Stop() {}
-
-  /// if scope is not null, also show dimensions of arguments
-  virtual std::string DebugStringEx(const Scope* scope) const;
-  std::string DebugString() const { return DebugStringEx(nullptr); }
-
-  virtual bool SupportGPU() const { return false; }
-
-  const std::string& Type() const { return type_; }
-
-  bool HasAttr(const std::string& name) const { return attrs_.count(name); }
-  template <typename T>
-  inline const T& Attr(const std::string& name) const {
-    PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(),
-                   "%s should be in AttributeMap", name);
-    return boost::get<T>(attrs_.at(name));
-  }
-  const AttributeMap& Attrs() const { return attrs_; }
-
-  const VariableNameMap& Inputs() const { return inputs_; }
-  const VariableNameMap& Outputs() const { return outputs_; }
-
-  const OpInfo& Info() const {
-    PADDLE_ENFORCE_NOT_NULL(info_, "OpInfo of %s is not found", type_);
-    return *info_;
-  }
-
-  bool HasInputs(const std::string& name) const;
-  //! Get a input with argument's name described in `op_proto`
-  std::string Input(const std::string& name) const;
-  //! Get a input which has multiple variables.
-  const std::vector<std::string>& Inputs(const std::string& name) const;
-  //! Get all inputs variable names
-  std::vector<std::string> InputVars() const;
-
-  bool HasOutputs(const std::string& name) const;
-  //! Get a output with argument's name described in `op_proto`
-  std::string Output(const std::string& name) const;
-  //! Get an output which has multiple variables.
-  //! TODO add a vector_view to prevent memory copy.
-  const std::vector<std::string>& Outputs(const std::string& name) const;
-  //! Get all outputs variable names
-  virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
-
-  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
-  virtual void RuntimeInferShape(const Scope& scope,
-                                 const platform::Place& place,
-                                 const RuntimeContext& ctx) const {}
-
- protected:
-  std::string type_;
-  // NOTE: in case of OpGrad, inputs_ contains:
-  // I (Inputs)
-  // O (Outputs)
-  // OG (Output Gradients)
-  VariableNameMap inputs_;
-
-  // NOTE: in case of OpGrad, outputs_ contains
-  // IG (Inputs Gradients)
-  VariableNameMap outputs_;
-  AttributeMap attrs_;
-
-  // OpInfo
-  const OpInfo* info_;
-
-  // Whether this operator executes in an Executor.
-  bool run_by_executor_{true};
-
- private:
-  void GenerateTemporaryNames();
-  void CheckAllInputOutputSet() const;
-  virtual void RunImpl(const Scope& scope,
-                       const platform::Place& place) const = 0;
-};
-
-#ifdef PADDLE_WITH_CUDA
-using KernelConfig = boost::variant<
-    std::shared_ptr<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>,
-    std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>,
-    std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>>;
-#else
-using KernelConfig = boost::variant<boost::blank>;
-#endif
-
-using OpKernelConfigsMap =
-    std::unordered_map<OpKernelType, std::vector<KernelConfig>,
-                       OpKernelType::Hash>;
-
-class ExecutionContext {
- public:
-  ExecutionContext(const OperatorBase& op, const Scope& scope,
-                   const platform::DeviceContext& device_context,
-                   const RuntimeContext& ctx,
-                   std::vector<KernelConfig>* configs)
-      : op_(op),
-        scope_(scope),
-        device_context_(device_context),
-        ctx_(ctx),
-        kernel_configs_(configs) {}
-
-  const OperatorBase& op() const { return op_; }
-
-  const Scope& scope() const { return scope_; }
-
-  template <typename T>
-  inline const T& Attr(const std::string& name) const {
-    return op_.Attr<T>(name);
-  }
-
-  bool HasAttr(const std::string& name) const { return op_.HasAttr(name); }
-
-  bool HasInput(const std::string& name) const;
-
-  bool HasOutput(const std::string& name) const;
-
-  size_t InputSize(const std::string& name) const {
-    return op_.Inputs(name).size();
-  }
-
-  size_t OutputSize(const std::string& name) const {
-    return op_.Outputs(name).size();
-  }
-
-  const Variable* InputVar(const std::string& name) const;
-
-  Variable* OutputVar(const std::string& name) const;
-
-  const std::vector<const Variable*> MultiInputVar(
-      const std::string& name) const {
-    auto it = ctx_.inputs.find(name);
-    if (it == ctx_.inputs.end()) {
-      return {};
-    }
-    return {it->second.begin(), it->second.end()};
-  }
-
-  std::vector<Variable*> MultiOutputVar(const std::string& name) const {
-    auto names = op_.Outputs(name);
-    auto it = ctx_.outputs.find(name);
-    if (it == ctx_.outputs.end()) {
-      return {};
-    }
-    return it->second;
-  }
-
-  template <typename T>
-  const T* Input(const std::string& name) const {
-    auto* var = InputVar(name);
-    return var == nullptr ? nullptr : &var->Get<T>();
-  }
-
-  template <typename T>
-  T* Output(const std::string& name) const {
-    auto var = OutputVar(name);
-    return var == nullptr ? nullptr : var->GetMutable<T>();
-  }
-
-  template <typename T>
-  const std::vector<const T*> MultiInput(const std::string& name) const {
-    auto it = ctx_.inputs.find(name);
-    if (it == ctx_.inputs.end()) {
-      return {};
-    }
-    const std::vector<Variable*>& vars = it->second;
-    std::vector<const T*> res;
-    res.reserve(vars.size());
-    std::transform(vars.begin(), vars.end(), std::back_inserter(res),
-                   [&](Variable* var) -> const T* {
-                     return var == nullptr ? nullptr : &var->Get<T>();
-                   });
-    return res;
-  }
-
-  template <typename T>
-  std::vector<T*> MultiOutput(const std::string& name) const {
-    auto it = ctx_.outputs.find(name);
-    if (it == ctx_.outputs.end()) {
-      return {};
-    }
-    const std::vector<Variable*>& vars = it->second;
-    std::vector<T*> res;
-    res.reserve(vars.size());
-    std::transform(vars.begin(), vars.end(), std::back_inserter(res),
-                   [&](Variable* var) -> T* {
-                     return var == nullptr ? nullptr : var->GetMutable<T>();
-                   });
-    return res;
-  }
-
-  platform::Place GetPlace() const { return device_context_.GetPlace(); }
-
-  template <typename DeviceContextType>
-  const DeviceContextType& device_context() const {
-    return *reinterpret_cast<const DeviceContextType*>(&device_context_);
-  }
-
-  const platform::DeviceContext& device_context() const {
-    return device_context_;
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  const inline platform::CUDADeviceContext& cuda_device_context() const {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true);
-    return *reinterpret_cast<const platform::CUDADeviceContext*>(
-        &device_context_);
-  }
-#endif
-
-  //! Get actual name vector for this input.
-  const std::vector<std::string>& Inputs(const std::string& name) const {
-    return op_.Inputs(name);
-  }
-
-  //! Get actual name vector for this output.
-  const std::vector<std::string>& Outputs(const std::string& name) const {
-    return op_.Outputs(name);
-  }
-
-  template <typename T, typename DevContext>
-  Tensor AllocateTmpTensor(const framework::DDim& dim,
-                           const DevContext& dev_ctx) const {
-    auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T));
-    auto& deleter = tmp_allocation_ptr.get_deleter();
-    auto* allocation_ptr = tmp_allocation_ptr.release();
-    auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
-        allocation_ptr, deleter);
-
-    PADDLE_ENFORCE_GE(allocation_ptr->size(),
-                      framework::product(dim) * sizeof(T));
-
-    paddle::framework::Tensor temp_tensor(
-        framework::ToDataType(std::type_index(typeid(T))));
-    temp_tensor.Resize(dim);
-    temp_tensor.ResetHolder(std::move(shared_allocation));
-    return temp_tensor;
-  }
-
-  template <typename T>
-  T& GetKernelConfig(size_t idx) const {
-    PADDLE_ENFORCE(
-        kernel_configs_ && kernel_configs_->size() > static_cast<size_t>(idx),
-        "%s selected kernel doesn't have kernel config %lu <= %lu",
-        op_.Type().c_str(), kernel_configs_->size(), idx);
-    return *boost::get<std::shared_ptr<T>>((*kernel_configs_)[idx]);
-  }
-
- private:
-  const OperatorBase& op_;
-  const Scope& scope_;
-  const platform::DeviceContext& device_context_;
-  const RuntimeContext& ctx_;
-  mutable std::vector<KernelConfig>* kernel_configs_;
-};
-
-template <>
-const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
-
-template <>
-const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
-    const std::string& name) const;
-
-template <>
-Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
-
-template <>
-std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
-    const std::string& name) const;
-
-class OpKernelBase {
- public:
-  /**
-   * ExecutionContext is the only parameter of Kernel Run function.
-   * Run will get input/output variables, state such as momentum and
-   * device resource such as CUDA stream, cublas handle, etc. from
-   * ExecutionContext. User should construct it before run the Operator.
-   */
-
-  virtual void Compute(const ExecutionContext& context) const = 0;
-
-  virtual ~OpKernelBase() = default;
-};
-
-template <typename T>
-class OpKernel : public OpKernelBase {
- public:
-  using ELEMENT_TYPE = T;
-};
-
-class OperatorWithKernel : public OperatorBase {
- public:
-  using OpKernelFunc = std::function<void(const ExecutionContext&)>;
-  using OpKernelMap =
-      std::unordered_map<OpKernelType, OpKernelFunc, OpKernelType::Hash>;
-
-  OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
-                     const VariableNameMap& outputs, const AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
-  AllOpKernels() {
-    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
-    return g_all_op_kernels;
-  }
-
-  bool SupportGPU() const override {
-    auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
-    return std::any_of(op_kernels.begin(), op_kernels.end(),
-                       [](OpKernelMap::const_reference kern_pair) {
-                         return platform::is_gpu_place(kern_pair.first.place_);
-                       });
-  }
-
-  virtual void InferShape(InferShapeContext* ctx) const {
-    Info().infer_shape_(ctx);
-  }
-
-  void RuntimeInferShape(const Scope& scope, const platform::Place& place,
-                         const RuntimeContext& ctx) const override;
-
-  virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
-
-  std::vector<KernelConfig>* GetKernelConfig(const OpKernelType& key) const;
-
-  // change this to public so that in dygraph mode we can call it to check if we
-  // need transform data
-  virtual OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const OpKernelType& expected_kernel_type) const;
-
- private:
-  // indicate kernel DataType by input data. By default all input data must be
-  // same.
-  proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
-  void RunImpl(const Scope& scope, const platform::Place& place) const final;
-  void RunImpl(const Scope& scope, const platform::Place& place,
-               RuntimeContext* runtime_ctx) const;
-
-  /**
-   * Transfer data from scope to a transfered scope. If there is no data need to
-   * be tranfered, it returns nullptr.
-   *
-   * * transfered_inplace_vars is a output vector.
-   */
-  Scope* PrepareData(const Scope& scope,
-                     const OpKernelType& expected_kernel_key,
-                     std::vector<std::string>* transfered_inplace_vars,
-                     RuntimeContext* ctx) const;
-
-  void TransferInplaceVarsBack(const Scope& scope,
-                               const std::vector<std::string>& inplace_vars,
-                               const Scope& exec_scope) const;
-
-  void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
-                    const platform::Place& place) const;
-
- protected:
-  mutable OpKernelConfigsMap kernel_configs_map_;
-  mutable std::unique_ptr<OpKernelType> kernel_type_;
-  mutable std::unique_ptr<OpKernelFunc> kernel_func_;
-  mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
-  mutable const Scope* pre_scope_ = nullptr;
-  mutable bool enable_cache_runtime_context_ = false;
-  mutable bool all_kernels_must_compute_runtime_shape_ = false;
-  mutable std::mutex cache_update_mutex_;
-  mutable bool enable_cache_transfer_scope_ = false;
-};
-
-extern bool OpSupportGPU(const std::string& op_type);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h
deleted file mode 100644
index 5c5a7423832ae3c0b16df8a98aa3faa8b2983f84..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/operator_kernel_configs.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <unordered_map>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-
-// Not thread-safe. Should be owned per-kernel.
-template <typename TAlgorithm>
-class AlgorithmsCache {
- public:
-  AlgorithmsCache() : search_times_(0) { hash_.clear(); }
-  // Caches the best algorithm for a given
-  // combination of tensor dimensions & compute data type.
-  TAlgorithm GetAlgorithm(
-      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
-      const std::vector<int>& strides, const std::vector<int>& paddings,
-      const std::vector<int>& dilations,
-      int algorithmFlags,  // can set for different data type
-      std::function<TAlgorithm()> gen_func);
-
-  TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
-                          std::function<TAlgorithm()> gen_func);
-
- private:
-  std::unordered_map<int64_t, TAlgorithm> hash_;
-  int search_times_;
-};
-
-template <typename TAlgorithm>
-TAlgorithm framework::AlgorithmsCache<TAlgorithm>::GetAlgorithm(
-    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
-    const std::vector<int>& strides, const std::vector<int>& paddings,
-    const std::vector<int>& dilations, int algorithmFlags,
-    std::function<TAlgorithm()> gen_func) {
-  int64_t seed = 0;
-  // Hash all of the inputs, use to try and look up a previously
-  // discovered algorithm, or fall back to generating a new one.
-  std::hash<int64_t> hashFn;
-  // do hash like boost
-  // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
-  for (const auto num : dims1) {
-    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-  }
-
-  for (const auto num : dims2) {
-    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
-  }
-
-  for (const auto num : strides) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 2;
-  }
-
-  for (const auto num : paddings) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 3;
-  }
-
-  for (const auto num : dilations) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 4;
-  }
-
-  seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
-          (seed << 6) + (seed >> 2) + 5;
-
-  VLOG(10) << "seed:" << seed << ", hash_.size:" << hash_.size();
-
-  if (seed == 0) return gen_func();
-
-  if (hash_.find(seed) == hash_.end()) {
-    TAlgorithm value = gen_func();
-    hash_[seed] = value;
-  }
-  return hash_[seed];
-}
-
-template <typename TAlgorithm>
-TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
-    int64_t area, int search_times, int algorithmFlags,
-    std::function<TAlgorithm()> gen_func) {
-  if (hash_.find(area) != hash_.end()) {
-    return hash_[area];
-  }
-  if (search_times_ < search_times) {
-    auto algo = gen_func();
-    hash_[area] = algo;
-    ++search_times_;
-    return algo;
-  }
-  TAlgorithm algo{};
-  int64_t min = static_cast<uint64_t>(INT_MAX);
-  for (const auto& m : hash_) {
-    if (m.first < min) {
-      min = m.first;
-      algo = m.second;
-    }
-  }
-  return algo;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
deleted file mode 100644
index fe4804ac253925c112cf7b508efc42c45868a2fa..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/operator_test.cc
+++ /dev/null
@@ -1,317 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/init.h"
-
-namespace paddle {
-namespace framework {
-
-static int op_run_num = 0;
-
-class OpWithoutKernelTest : public OperatorBase {
- public:
-  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
-                      const VariableNameMap& outputs, const AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-
- private:
-  void RunImpl(const Scope& scope,
-               const platform::Place& place) const override {
-    ++op_run_num;
-    ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
-    ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
-    ASSERT_EQ(scope.FindVar(inputs_.at("input")[0]), nullptr);
-    ASSERT_EQ(x, 1);
-    ASSERT_NE(scope.FindVar(outputs_.at("output")[0]), nullptr);
-  }
-
- public:
-  int x{0};
-};
-
-class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("input", "input of test op");
-    AddOutput("output", "output of test op");
-    AddAttr<float>("scale", "scale of cosine op");
-    AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
-        .SetDefault(0);
-    AddComment("This is test op");
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-static void BuildVar(const std::string& param_name,
-                     std::initializer_list<const char*> arguments,
-                     paddle::framework::proto::OpDesc::Var* var) {
-  var->set_parameter(param_name);
-  for (auto& arg_name : arguments) {
-    *var->mutable_arguments()->Add() = arg_name;
-  }
-}
-
-REGISTER_OP_WITHOUT_GRADIENT(test_operator,
-                             paddle::framework::OpWithoutKernelTest,
-                             paddle::framework::OpWithoutKernelCheckerMaker);
-
-TEST(OperatorBase, all) {
-  paddle::framework::InitDevices(true);
-  paddle::framework::proto::OpDesc op_desc;
-  op_desc.set_type("test_operator");
-  BuildVar("input", {"IN1"}, op_desc.add_inputs());
-  BuildVar("output", {"OUT1"}, op_desc.add_outputs());
-
-  auto attr = op_desc.mutable_attrs()->Add();
-  attr->set_name("scale");
-  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
-  attr->set_f(3.14);
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::framework::Scope scope;
-
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  scope.Var("OUT1");
-  ASSERT_EQ(paddle::framework::op_run_num, 0);
-  op->Run(scope, cpu_place);
-  ASSERT_EQ(paddle::framework::op_run_num, 1);
-}
-
-namespace paddle {
-namespace framework {
-
-static int special_type_value = 1;
-
-class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("x", "input of test op");
-    AddOutput("y", "output of test op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .GreaterThan(0.0);
-    AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
-        .SetDefault(0);
-    AddComment("This is test op");
-  }
-};
-
-static int cpu_kernel_run_num = 0;
-static int cpu_kernel2_run_num = 0;
-
-class OpWithKernelTest : public OperatorWithKernel {
- public:
-  using OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-  OpKernelType GetExpectedKernelType(
-      const ExecutionContext& ctx) const override {
-    int sub_type = ctx.Attr<int>("kernel_sub_type");
-    return OpKernelType(proto::VarType::FP32, ctx.GetPlace(),
-                        framework::DataLayout::kAnyLayout,
-                        framework::LibraryType::kPlain, sub_type);
-  }
-};
-
-template <typename T1, typename T2>
-class CPUKernelTest : public OpKernel<float> {
- public:
-  void Compute(const ExecutionContext& ctx) const {
-    std::cout << ctx.op().DebugString() << std::endl;
-    cpu_kernel_run_num++;
-    ASSERT_EQ(ctx.op().Input("x"), "IN1");
-    ASSERT_EQ(ctx.op().Output("y"), "OUT1");
-  }
-};
-
-template <typename T1, typename T2>
-class CPUKernel2Test : public OpKernel<float> {
- public:
-  void Compute(const ExecutionContext& ctx) const {
-    std::cout << ctx.op().DebugString() << std::endl;
-    cpu_kernel2_run_num++;
-    ASSERT_EQ(ctx.op().Input("x"), "IN1");
-    ASSERT_EQ(ctx.op().Output("y"), "OUT1");
-  }
-};
-
-class OpKernelTestMultiInputsProtoAndCheckerMaker
-    : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("xs", "inputs of test op").AsDuplicable();
-    AddInput("k", "input of test op");
-    AddOutput("ys", "outputs of test op").AsDuplicable();
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .GreaterThan(0.0);
-    AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
-        .SetDefault(0);
-    AddComment("This is test op");
-  }
-};
-
-class CPUKernalMultiInputsTest : public OpKernel<float> {
- public:
-  void Compute(const ExecutionContext& ctx) const {
-    auto xs = ctx.op().Inputs("xs");
-    ASSERT_EQ(xs.size(), 3UL);
-    ASSERT_EQ(xs[0], "x0");
-    ASSERT_EQ(xs[1], "x1");
-    ASSERT_EQ(xs[2], "x2");
-
-    auto inVar0 = ctx.MultiInputVar("xs");
-    ASSERT_EQ(inVar0.size(), 3U);
-
-    auto intVar1 = ctx.InputVar("k");
-    ASSERT_NE(intVar1, nullptr);
-
-    auto outVar0 = ctx.MultiOutputVar("ys");
-    ASSERT_EQ(outVar0.size(), 2U);
-
-    auto inTensor0 = ctx.MultiInput<Tensor>("xs");
-    ASSERT_EQ(inTensor0.size(), 3U);
-
-    auto intTensor1 = ctx.Input<Tensor>("k");
-    ASSERT_NE(intTensor1, nullptr);
-
-    auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
-    ASSERT_EQ(outTensor0.size(), 2U);
-
-    auto k = ctx.op().Input("k");
-    ASSERT_EQ(k, "k0");
-
-    auto ys = ctx.op().Outputs("ys");
-    ASSERT_EQ(ys.size(), 2UL);
-    ASSERT_EQ(ys[0], "y0");
-    ASSERT_EQ(ys[1], "y1");
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_OP_WITHOUT_GRADIENT(
-    op_with_kernel, paddle::framework::OpWithKernelTest,
-    paddle::framework::OpKernelTestProtoAndCheckerMaker);
-
-REGISTER_OP_CPU_KERNEL(op_with_kernel,
-                       paddle::framework::CPUKernelTest<float, float>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
-    op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME,
-    paddle::framework::special_type_value,
-    paddle::framework::CPUKernel2Test<float, float>);
-
-// test with single input
-TEST(OpKernel, all) {
-  paddle::framework::InitDevices(true);
-  paddle::framework::proto::OpDesc op_desc;
-  op_desc.set_type("op_with_kernel");
-  BuildVar("x", {"IN1"}, op_desc.add_inputs());
-  BuildVar("y", {"OUT1"}, op_desc.add_outputs());
-
-  auto attr = op_desc.mutable_attrs()->Add();
-  attr->set_name("scale");
-  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
-  attr->set_f(3.14);
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::framework::Scope scope;
-
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
-  op->Run(scope, cpu_place);
-  // kerne_sub_type = 0, hence cpu_kernel is called, cpu_kernel2 is not called.
-  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
-  ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0);
-
-  attr = op_desc.mutable_attrs()->Add();
-  attr->set_name("kernel_sub_type");
-  attr->set_type(paddle::framework::proto::AttrType::INT);
-  attr->set_i(1);
-  auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc);
-  op2->Run(scope, cpu_place);
-  // kerne_sub_type = 1, hence cpu_kernel2 is called, cpu_kernel is not called.
-  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
-  ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1);
-}
-
-REGISTER_OP_WITHOUT_GRADIENT(
-    op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest,
-    paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker);
-REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
-                       paddle::framework::CPUKernalMultiInputsTest);
-
-// test with multi inputs
-TEST(OpKernel, multi_inputs) {
-  paddle::framework::InitDevices(true);
-  paddle::framework::proto::OpDesc op_desc;
-
-  op_desc.set_type("op_multi_inputs_with_kernel");
-  BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
-  BuildVar("k", {"k0"}, op_desc.add_inputs());
-  BuildVar("ys", {"y0", "y1"}, op_desc.add_outputs());
-
-  auto attr = op_desc.mutable_attrs()->Add();
-  attr->set_name("scale");
-  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
-  attr->set_f(3.14);
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::framework::Scope scope;
-  scope.Var("x0")->GetMutable<paddle::framework::LoDTensor>();
-  scope.Var("x1")->GetMutable<paddle::framework::LoDTensor>();
-  scope.Var("x2")->GetMutable<paddle::framework::LoDTensor>();
-  scope.Var("k0")->GetMutable<paddle::framework::LoDTensor>();
-  scope.Var("y0")->GetMutable<paddle::framework::LoDTensor>();
-  scope.Var("y1")->GetMutable<paddle::framework::LoDTensor>();
-
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  op->Run(scope, cpu_place);
-}
-
-TEST(VarNameTest, all) {
-  std::string var_name("X");
-  std::string grad_var_name = paddle::framework::GradVarName(var_name);
-  ASSERT_EQ(grad_var_name, "X@GRAD");
-  std::string original_var_name =
-      paddle::framework::GradOriginalVarName(grad_var_name);
-  ASSERT_EQ(original_var_name, "X");
-  original_var_name = paddle::framework::GradOriginalVarName(original_var_name);
-  ASSERT_EQ(original_var_name, "X");
-
-  std::string var_name_2("XYZ");
-  grad_var_name = paddle::framework::GradVarName(var_name_2);
-  ASSERT_EQ(grad_var_name, "XYZ@GRAD");
-  original_var_name = paddle::framework::GradOriginalVarName(grad_var_name);
-  ASSERT_EQ(original_var_name, "XYZ");
-  original_var_name = paddle::framework::GradOriginalVarName(original_var_name);
-  ASSERT_EQ(original_var_name, "XYZ");
-
-  std::string var_name_3("");
-  grad_var_name = paddle::framework::GradVarName(var_name_3);
-  ASSERT_EQ(grad_var_name, "@GRAD");
-  original_var_name = paddle::framework::GradOriginalVarName(grad_var_name);
-  ASSERT_EQ(original_var_name, "");
-  original_var_name = paddle::framework::GradOriginalVarName(original_var_name);
-  ASSERT_EQ(original_var_name, "");
-}
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
deleted file mode 100644
index a12b4c87665dcd2ce2da1c287eac9bc9345aa724..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/parallel_executor.cc
+++ /dev/null
@@ -1,865 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/parallel_executor.h"
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DECLARE_bool(use_ngraph);
-
-#ifdef WITH_GPERFTOOLS
-#include "gperftools/profiler.h"
-#endif
-DEFINE_string(pe_profile_fname, "",
-              "Profiler filename for PE, which generated by gperftools."
-              "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable.");
-DEFINE_bool(enable_parallel_graph, false,
-            "Force disable parallel graph execution mode if set false.");
-
-namespace paddle {
-namespace framework {
-
-static std::once_flag gProfileOnce;
-#ifdef WITH_GPERFTOOLS
-static bool gProfileStarted = false;
-#endif
-
-class ParallelExecutorPrivate {
- public:
-  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
-      : places_(places) {
-    if (!FLAGS_pe_profile_fname.empty()) {
-      std::call_once(gProfileOnce, [] {
-#ifdef WITH_GPERFTOOLS
-        ProfilerStart(FLAGS_pe_profile_fname.c_str());
-        gProfileStarted = true;
-#else
-        LOG(WARNING) << "Paddle is not compiled with gperftools. "
-          "FLAGS_pe_profile_fname will be ignored";
-#endif
-      });
-    }
-  }
-
-  ~ParallelExecutorPrivate() {
-    if (own_local_scope_) {
-      for (size_t i = 1; i < local_scopes_.size(); ++i) {
-        // Skip the first scope, since it is the global scope.
-        Scope *local_scope = local_scopes_[i];
-        if (global_scope_->HasKid(local_scope)) {
-          global_scope_->DeleteScope(local_scope);
-        }
-      }
-    }
-  }
-
-  ir::Graph *ApplyMemoryOptimizePass(ir::Graph *graph);
-
-  inline bool HasGarbageCollectors() const { return !gcs_.empty(); }
-
-  /**
-   * NOTE(zengjinle): the feeded variables of users should not be reused,
-   * because users may feed them into another network. Changing the feeded
-   * variables that users can visit may cause calculation wrong, which is
-   * a very subtle bug when traning networks. However, these variables
-   * can be garbage collected.
-   *
-   * ParallelExecutor provides 2 methods to feed variables:
-   *
-   *  - FeedTensorsIntoLocalScopes: this method would share memory of feeded
-   *                                variables, so we have to skip these.
-   *
-   *  - FeedAndSplitTensorIntoLocalScopes: this method would copy data of feeded
-   *                                       variables, so we do not need to skip
-   *                                       them.
-   */
-  inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) {
-    auto iter = mem_opt_var_infos_[scope_idx].find(name);
-    if (iter != mem_opt_var_infos_[scope_idx].end()) {
-      iter->second->SetSkipMemoryReuse(true);
-    }
-  }
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
-    VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_
-            << ", num_trainers:" << bst.num_trainers_
-            << ", trainer_id:" << bst.trainer_id_;
-
-    if (bst.use_hierarchical_allreduce_) {
-      VLOG(1) << ", use_hierarchical_allreduce:"
-              << bst.use_hierarchical_allreduce_ << ", inter_trainers_num:"
-              << bst.hierarchical_allreduce_inter_nranks_
-              << ", exter_trainers_num:"
-              << bst.hierarchical_allreduce_exter_nranks_;
-    }
-
-    std::vector<ncclUniqueId *> flat_nccl_ids;
-    if (nranks_ == 1) {
-      // FIXME(gongwb): need not to create ncclid when nranks==1
-      nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
-                               bst.trainer_id_);
-      return;
-    }
-
-    if (bst.enable_parallel_graph_) {
-      VLOG(1) << "use only one ncclid in pg model";
-
-      ncclUniqueId *nccl_id = nullptr;
-
-      std::string var_name = platform::GetFlatNCCLVarName(0);
-      auto nccl_id_var = scope->FindVar(var_name);
-      if (nccl_id_var) {
-        nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
-        VLOG(10) << "find nccl_id_var:" << var_name << ", nccl_id:" << nccl_id;
-      } else {
-        nccl_id = new ncclUniqueId();
-        PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id));
-        VLOG(10) << "can't find nccl_id_var:" << var_name
-                 << ", nccl_id:" << nccl_id;
-      }
-
-      flat_nccl_ids.push_back(nccl_id);
-
-      nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
-                               bst.trainer_id_);
-      VLOG(1) << "init bst nccl context complete!";
-      return;
-    }
-
-    // num_trainers ==1 && places > 1
-    if (bst.num_trainers_ == 1) {
-      nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
-                               bst.trainer_id_);
-      return;
-    }
-
-    for (int i = 0; i < static_cast<int>(bst.nccl_comm_num_); i++) {
-      std::string var_name = platform::GetFlatNCCLVarName(i);
-      auto nccl_id_var = scope->FindVar(var_name);
-      PADDLE_ENFORCE(nccl_id_var, "can't find %s nccl_id_var", var_name);
-      auto nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
-      flat_nccl_ids.push_back(nccl_id);
-    }
-
-    nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
-                             bst.trainer_id_);
-
-    if (bst.use_hierarchical_allreduce_) {
-      std::vector<ncclUniqueId *> inter_nccl_ids;
-      for (int i = 0; i < static_cast<int>(bst.nccl_comm_num_); i++) {
-        std::string var_name = platform::GetHierarchicalInterNCCLVarName(i);
-        auto nccl_id_var = scope->FindVar(var_name);
-        PADDLE_ENFORCE(nccl_id_var, "can't find %s nccl_id_var", var_name);
-        auto inter_nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
-        inter_nccl_ids.push_back(inter_nccl_id);
-      }
-
-      std::vector<ncclUniqueId *> exter_nccl_ids;
-      for (int i = 0; i < static_cast<int>(bst.nccl_comm_num_); i++) {
-        std::string var_name = platform::GetHierarchicalExterNCCLVarName(i);
-        auto nccl_id_var = scope->FindVar(var_name);
-        PADDLE_ENFORCE(nccl_id_var, "can't find %s nccl_id_var", var_name);
-        auto nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
-        exter_nccl_ids.push_back(nccl_id);
-      }
-
-      nccl_ctxs_->InitHierarchicalCtxs(
-          places_, inter_nccl_ids, exter_nccl_ids, bst.num_trainers_,
-          bst.trainer_id_, bst.hierarchical_allreduce_inter_nranks_,
-          bst.hierarchical_allreduce_exter_nranks_);
-    }
-  }
-
-  void InitOrGetNCCLCommunicator(framework::Scope *scope, BuildStrategy *bst) {
-    const std::string var_name = "NCCLCommunicator";
-    auto var = scope->FindVar(var_name);
-    if (var != nullptr) {
-      PADDLE_ENFORCE(var->IsInitialized(),
-                     "if %s exists, it must be initialized", var_name);
-      VLOG(1) << "find " << var_name
-              << " in scope, so use it and does not recreate!";
-      nccl_ctxs_ = var->GetMutable<platform::NCCLCommunicator>();
-      return;
-    }
-
-    if (bst->use_hierarchical_allreduce_) {
-      PADDLE_ENFORCE(bst->num_trainers_ > 1, "num_trainers:%llu < 1",
-                     bst->num_trainers_);
-      PADDLE_ENFORCE(bst->hierarchical_allreduce_inter_nranks_ > 1,
-                     "inter_nranks:%d < 1",
-                     bst->hierarchical_allreduce_inter_nranks_);
-      PADDLE_ENFORCE(
-          (bst->num_trainers_ % bst->hierarchical_allreduce_inter_nranks_ == 0),
-          "num_trainers:%llu mod inter_nranks:%d != 0", bst->num_trainers_,
-          bst->hierarchical_allreduce_inter_nranks_);
-
-      bst->hierarchical_allreduce_exter_nranks_ =
-          bst->num_trainers_ / bst->hierarchical_allreduce_inter_nranks_;
-    }
-
-    VLOG(1) << "not find " << var_name << " in scope, so recreate it!";
-    nccl_ctxs_ = scope->Var(var_name)->GetMutable<platform::NCCLCommunicator>();
-    InitNCCLCtxs(scope, *bst);
-  }
-#endif
-
-  inline bool IsPersistable(const std::string &name) const {
-    auto iter = is_persistable_.find(name);
-    return iter != is_persistable_.end() && iter->second;
-  }
-
-  BuildStrategy build_strategy_;
-  std::vector<platform::Place> places_;
-  std::vector<Scope *> local_scopes_;
-  std::vector<Scope *> local_exec_scopes_;
-  Scope *global_scope_;  // not owned
-  std::unique_ptr<details::SSAGraphExecutor> executor_;
-
-  std::unordered_map<std::string, bool> is_persistable_;
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  platform::NCCLCommunicator *nccl_ctxs_{nullptr};
-#endif
-  bool own_local_scope_;
-  bool use_cuda_;
-  bool use_all_reduce_;
-  size_t nranks_;
-
-  ir::MemOptVarInfoMapList mem_opt_var_infos_;
-  ir::GarbageCollectorMap gcs_;
-};
-
-ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
-  if (FLAGS_use_ngraph) {
-    LOG_FIRST_N(WARNING, 1)
-        << "FLAGS_use_ngraph=True, memory optimization strategy is "
-           "disabled in ParallelExecutor";
-    return graph;
-  }
-
-  std::vector<ir::LastLiveOpsOfVars> last_live_ops_of_vars;
-
-  auto ref_cnt_pass = ir::PassRegistry::Instance().Get("reference_count_pass");
-  ref_cnt_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
-  ref_cnt_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
-  graph = ref_cnt_pass->Apply(graph);
-  VLOG(10) << "ReferenceCountPass Applied";
-
-  if (build_strategy_.enable_inplace_) {
-    auto inplace_pass =
-        ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
-    inplace_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
-    inplace_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
-    inplace_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
-    VLOG(10) << "Start to apply buffer_shared_inplace_pass";
-    graph = inplace_pass->Apply(graph);
-    VLOG(10) << "buffer_shared_inplace_pass Applied";
-    LOG(INFO) << "Inplace strategy is enabled, when "
-                 "build_strategy.enable_inplace = True";
-  }
-
-  /**
-   * NOTE(zengjinle): If BuildStrategy.memory_optimize = None in Python,
-   * set BuildStrategy.memory_optimize according to whether gc is enabled.
-   * If gc is enabled, BuildStrategy.memory_optimize = False.
-   * If gc is disabled, BuildStrategy.memory_optimize = True.
-   * This is because gc+memory_optimize is worse than gc only.
-   *
-   * As an option, users can enable BuildStrategy.memory_optimize forcely
-   * by setting True, and disable it forcely by setting False.
-   */
-  bool is_gc_enabled = (GetEagerDeletionThreshold() >= 0);
-  if (!build_strategy_.memory_optimize_) {
-    build_strategy_.memory_optimize_ = !is_gc_enabled;
-  }
-
-  if (build_strategy_.memory_optimize_.get()) {
-    auto cross_op_memory_reuse_pass = ir::PassRegistry::Instance().Get(
-        "buffer_shared_cross_op_memory_reuse_pass");
-    cross_op_memory_reuse_pass->SetNotOwned(ir::kMemOptVarInfoMapList,
-                                            &mem_opt_var_infos_);
-    cross_op_memory_reuse_pass->SetNotOwned(ir::kLastLiveOpsOfVars,
-                                            &last_live_ops_of_vars);
-    cross_op_memory_reuse_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
-    VLOG(10) << "Start to apply buffer_shared_cross_op_memory_reuse_pass";
-    graph = cross_op_memory_reuse_pass->Apply(graph);
-    VLOG(10) << "buffer_shared_cross_op_memory_reuse_pass Applied";
-    LOG(INFO) << "Cross op memory reuse strategy is enabled, when "
-                 "build_strategy.memory_optimize = True or garbage collection "
-                 "strategy is disabled, which is not recommended";
-  }
-
-  if (!is_gc_enabled) {
-    return graph;
-  }
-  size_t max_memory_size = static_cast<size_t>(GetEagerDeletionThreshold());
-
-  for (size_t i = 0; i < places_.size(); ++i) {
-    auto &place = places_[i];
-    if (gcs_.count(place) > 0) {
-      continue;
-    }
-    std::unique_ptr<GarbageCollector> gc;
-#ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(place)) {
-      if (IsFastEagerDeletionModeEnabled()) {
-        gc.reset(new UnsafeFastGPUGarbageCollector(
-            boost::get<platform::CUDAPlace>(place), max_memory_size));
-      } else {
-        gc.reset(new StreamGarbageCollector(
-            boost::get<platform::CUDAPlace>(place), max_memory_size));
-      }
-      VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
-    } else {
-#endif
-      if (platform::is_cpu_place(place)) {
-        gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place),
-                                         max_memory_size));
-        VLOG(10) << "Created GarbageCollector at " << place;
-      } else {
-        PADDLE_THROW("Unsupported place for garbage collection");
-      }
-#ifdef PADDLE_WITH_CUDA
-    }
-#endif
-
-    gcs_.emplace(place, std::move(gc));
-  }
-
-  if (!gcs_.empty()) {
-    auto eager_deletion_pass =
-        ir::PassRegistry::Instance().Get("eager_deletion_pass");
-    eager_deletion_pass->SetNotOwned(ir::kMemOptVarInfoMapList,
-                                     &mem_opt_var_infos_);
-    eager_deletion_pass->SetNotOwned(ir::kGarbageCollector, &gcs_);
-    eager_deletion_pass->SetNotOwned(ir::kLastLiveOpsOfVars,
-                                     &last_live_ops_of_vars);
-    eager_deletion_pass->SetNotOwned(ir::kAllPlaces, &places_);
-    graph = eager_deletion_pass->Apply(graph);
-    VLOG(10) << "EagerDeletionPass Applied";
-    LOG(INFO) << "Garbage collection strategy is enabled, when "
-              << "FLAGS_eager_delete_tensor_gb = "
-              << (static_cast<double>(GetEagerDeletionThreshold()) / (1 << 30));
-  }
-  return graph;
-}
-
-std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
-  return member_->local_scopes_;
-}
-
-void ParallelExecutor::DropLocalExeScopes() {
-  auto executor = dynamic_cast<details::ScopeBufferedSSAGraphExecutor *>(
-      member_->executor_.get());
-  if (executor) {
-    executor->DropLocalExeScopes();
-  }
-}
-
-bool ParallelExecutor::NeedCreateLocalExeScope() {
-  auto executor = dynamic_cast<details::ScopeBufferedSSAGraphExecutor *>(
-      member_->executor_.get());
-  return executor && executor->NeedCreateLocalExeScope();
-}
-
-ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
-                                   const std::vector<std::string> &bcast_vars,
-                                   const std::string &loss_var_name,
-                                   Scope *scope,
-                                   const std::vector<Scope *> &local_scopes,
-                                   const ExecutionStrategy &exec_strategy,
-                                   const BuildStrategy &build_strategy,
-                                   ir::Graph *graph)
-    : member_(new ParallelExecutorPrivate(places)) {
-  member_->global_scope_ = scope;
-  member_->use_cuda_ = exec_strategy.use_cuda_;
-  member_->build_strategy_ = build_strategy;
-  member_->use_all_reduce_ = member_->build_strategy_.reduce_ ==
-                             BuildStrategy::ReduceStrategy::kAllReduce;
-  member_->nranks_ = build_strategy.num_trainers_ * places.size();
-  if (!member_->use_all_reduce_ && member_->nranks_ == 1) {
-    LOG(INFO) << "If you set build_strategy.reduce with 'Reduce',"
-                 "the number of places should be greater than 1.";
-    member_->build_strategy_.reduce_ =
-        BuildStrategy::ReduceStrategy::kAllReduce;
-    member_->use_all_reduce_ = true;
-  }
-#if defined(PADDLE_WITH_CUDA) && defined(_WIN32)
-  if (member_->use_cuda_) {
-    PADDLE_ENFORCE(places.size() == 1, "Windows can support Single GPU only.");
-  }
-#endif
-
-  LOG(INFO) << string::Sprintf(
-      "The number of %s, which is used in ParallelExecutor, is %lu. And "
-      "the Program will be copied %lu copies",
-      (member_->use_cuda_ ? "CUDAPlace" : "CPUPlace"), places.size(),
-      places.size());
-
-  // Step 1. Bcast the bcast_vars to devs.
-  // Create local scopes
-  if (local_scopes.empty()) {
-    member_->own_local_scope_ = true;
-    member_->local_scopes_.emplace_back(member_->global_scope_);
-    for (size_t i = 1; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(&scope->NewScope());
-    }
-  } else {
-    member_->own_local_scope_ = false;
-    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
-    for (size_t i = 0; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
-    }
-  }
-
-  std::vector<ir::Graph *> graphs;
-  if (member_->build_strategy_.async_mode_) {
-    PADDLE_ENFORCE(!member_->use_cuda_,
-                   "gpu mode does not support async_mode_ now!");
-    graphs.push_back(graph);
-    for (size_t i = 1; i < places.size(); ++i) {
-      auto *tmp_graph = new ir::Graph(graph->OriginProgram());
-      async_graphs_.emplace_back(tmp_graph);
-      graphs.push_back(tmp_graph);
-    }
-  }
-
-  // FIXME(Yancey1989): parallel graph mode get better performance
-  // in GPU allreduce distributed training. Need an elegant way to
-  // choice the execution strategy.
-  member_->build_strategy_.enable_parallel_graph_ =
-      EnableParallelGraphExecution(*graph, exec_strategy,
-                                   member_->build_strategy_);
-  if (member_->build_strategy_.enable_parallel_graph_) {
-    LOG(INFO) << "The Executor would execute the graph by ParallelGraph "
-                 "Execution which can get better performance,"
-              << "you can force it off by env FLAGS_enable_parallel_graph=0";
-  }
-
-  if (member_->use_cuda_ && member_->nranks_ > 1) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);
-
-    // Initialize device context's nccl comm, will be used by normal
-    // Operators like sync_batch_norm, and collective ops.
-    // NOTE: more than one ParallelExecutor with same place, the nccl comm will
-    // be rewrite and there will be some problem.
-    // NOTE: NCCL group-calls and non-group-calls can not use the same
-    // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use
-    // same communicators.
-    auto *nccl_ctxs =
-        member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_);
-    auto &pool = platform::DeviceContextPool::Instance();
-    for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
-      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          pool.Get(member_->places_[dev_id]));
-      auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]);
-      dev_ctx->set_nccl_comm(nccl_ctx.comm());
-    }
-#endif
-  }
-  // broadcast parameters from the 0th device to others:
-  auto need_broadcast = [&]() -> bool {
-    if (member_->build_strategy_.num_trainers_ > 1) {
-      // 1. num_tariners would be grater than 1 for nccl distributed training.
-      return true;
-    } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-      // 2. Only one trainer process, but ParallelExecutor hold multiple
-      // devices.
-      return true;
-    }
-    return false;
-  };
-  // Bcast Parameters to all GPUs
-  if (need_broadcast()) {
-    BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_);
-  }
-
-  // Startup Program has been run. All local scopes has correct parameters.
-
-  // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
-  // ncclOp
-  std::vector<ir::Graph *> async_graphs(places.size());
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  if (member_->build_strategy_.async_mode_) {
-    VLOG(3) << "use local async mode";
-    graph = member_->build_strategy_.Apply(
-        graph, {member_->places_[0]}, loss_var_name,
-        {member_->local_scopes_[0]}, 1, member_->use_cuda_,
-        member_->nccl_ctxs_);
-    for (size_t i = 1; i < member_->places_.size(); ++i) {
-      graphs[i] = member_->build_strategy_.Apply(
-          graphs[i], {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, 1, member_->use_cuda_,
-          member_->nccl_ctxs_);
-      async_graphs[i] = graphs[i];
-    }
-  } else {
-    graph = member_->build_strategy_.Apply(
-        graph, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_);
-  }
-#else
-  if (member_->build_strategy_.async_mode_) {
-    VLOG(3) << "use local async mode";
-    graph = member_->build_strategy_.Apply(
-        graph, {member_->places_[0]}, loss_var_name,
-        {member_->local_scopes_[0]}, 1, member_->use_cuda_);
-    for (size_t i = 1; i < member_->places_.size(); ++i) {
-      graphs[i] = member_->build_strategy_.Apply(
-          graphs[i], {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, 1, member_->use_cuda_);
-      async_graphs[i] = graphs[i];
-    }
-  } else {
-    graph = member_->build_strategy_.Apply(
-        graph, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_);
-  }
-#endif
-
-  graph = member_->ApplyMemoryOptimizePass(graph);
-
-  async_graphs[0] = graph;
-
-  // Step 3. Create vars in each scope. Passes may also create new vars.
-  //         skip control vars and empty vars
-  std::vector<details::VariableInfo> var_infos;
-  for (auto &node : graph->Nodes()) {
-    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
-      var_infos.emplace_back();
-      var_infos.back().name_ = node->Var()->Name();
-      var_infos.back().type_ = node->Var()->GetType();
-      var_infos.back().persistable_ = node->Var()->Persistable();
-
-      member_->is_persistable_.emplace(node->Var()->Name(),
-                                       node->Var()->Persistable());
-    }
-  }
-
-  std::unordered_map<Scope *, Scope *> scope_map;
-  for (auto *scope : member_->local_scopes_) {
-    auto &local_exec_scope = scope->NewScope();
-    member_->local_exec_scopes_.emplace_back(&local_exec_scope);
-    scope_map.emplace(scope, &local_exec_scope);
-  }
-
-  PADDLE_ENFORCE_EQ(member_->local_scopes_.size(),
-                    member_->local_exec_scopes_.size());
-
-  std::vector<ir::Graph *> final_graphs;
-
-  if (member_->build_strategy_.async_mode_) {
-    VLOG(3) << "use AsyncSSAGraphExecutor";
-    member_->executor_.reset(new details::AsyncSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->local_exec_scopes_,
-        member_->places_, async_graphs));
-    final_graphs = async_graphs;
-  } else if (member_->build_strategy_.enable_parallel_graph_) {
-    VLOG(3) << "use ParallelSSAGraphExecutor";
-#ifdef PADDLE_WITH_CUDA
-    // TODO(Yancey1989): Remove passing in the main_program when
-    // allreduce_seq_pass doesn't need it as the attr.
-    auto *pg_exe = new details::ParallelSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->local_exec_scopes_,
-        member_->places_, graph);
-    final_graphs = pg_exe->Graphs();
-    member_->executor_.reset(pg_exe);
-#else
-    PADDLE_THROW(
-        "Paddle should be compiled with CUDA for ParallelGraph Execution.");
-#endif
-  } else {
-    if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
-      VLOG(3) << "use ThreadedSSAGraphExecutor";
-      member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-          exec_strategy, member_->local_scopes_, member_->local_exec_scopes_,
-          member_->places_, graph));
-    } else {
-      VLOG(3) << "use FastThreadedSSAGraphExecutor";
-      member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
-          exec_strategy, member_->local_scopes_, member_->local_exec_scopes_,
-          member_->places_, graph));
-    }
-    final_graphs.emplace_back(graph);
-  }
-
-  VLOG(3) << "use ScopeBufferedSSAGraphExecutor";
-  if (!member_->build_strategy_.async_mode_) {
-    member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->local_exec_scopes_,
-        std::move(var_infos), member_->places_, std::move(member_->executor_)));
-  }
-
-  for (auto *g : final_graphs) {
-    auto ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*g);
-    for (auto *op : ops) {
-      op->SetLocalExecScopes(scope_map);
-    }
-  }
-}
-
-void ParallelExecutor::BCastParamsToDevices(
-    const std::vector<std::string> &vars, int trainer_id) const {
-  VLOG(3) << "BCastParamsToDevices";
-  // the initializing bcast, all vars would be bcast from device(0).
-  for (auto &var : vars) {
-    framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
-    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
-      continue;
-    }
-
-    auto &main_tensor = main_var->Get<LoDTensor>();
-    if (!main_tensor.IsInitialized()) {
-      VLOG(3) << "one in var not inited, return!";
-      continue;
-    }
-    auto &dims = main_tensor.dims();
-    if (paddle::platform::is_gpu_place(main_tensor.place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      std::vector<void *> buffers;
-      buffers.reserve(member_->places_.size());
-      size_t numel = main_tensor.numel();
-      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-      for (size_t i = 0; i < member_->places_.size(); ++i) {
-        auto place = member_->places_[i];
-        void *buffer;
-
-        if (i == 0 && trainer_id == 0) {
-          buffer = const_cast<void *>(main_tensor.data<void>());
-        } else {
-          auto local_scope = member_->local_scopes_[i];
-          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
-          t->Resize(dims);
-          buffer = t->mutable_data(place, main_tensor.type());
-        }
-        buffers.push_back(buffer);
-      }
-
-      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
-                        "variables' buffer size to bcast NOT equal to places");
-      {
-        auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx();
-        platform::NCCLGroupGuard guard;
-        for (size_t i = 0; i < member_->places_.size(); ++i) {
-          auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]);
-          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
-                                       nccl_ctx.comm_, nccl_ctx.stream());
-        }
-        nccl_ctxs->WaitAll();
-      }
-#endif
-    } else {
-      platform::CPUPlace cpu;
-      for (size_t i = 1; i < member_->places_.size(); ++i) {
-        auto local_scope = member_->local_scopes_[i];
-        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
-
-        auto copy_memory = [&] {
-          t->Resize(dims);
-          t->mutable_data(cpu, main_tensor.type());
-          paddle::framework::TensorCopy(main_tensor, cpu, t);
-        };
-
-        auto share_memory = [&] { t->ShareDataWith(main_tensor); };
-
-        // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
-        if (member_->build_strategy_.async_mode_) {
-          share_memory();
-        } else if (member_->use_all_reduce_ || member_->use_cuda_ ||
-                   var == "@LR_DECAY_COUNTER@") {
-          copy_memory();
-        } else {
-          share_memory();
-        }
-      }
-    }
-  }
-}
-
-FeedFetchList ParallelExecutor::Run(
-    const std::vector<std::string> &fetch_tensors) {
-  VLOG(3) << "enter ParallelExecutor Run";
-#ifdef WITH_GPERFTOOLS
-  if (gProfileStarted) {
-    ProfilerFlush();
-  }
-#endif
-
-  platform::RecordBlock b(0);
-
-  ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), fetch_tensors,
-                                member_->HasGarbageCollectors());
-
-  VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run";
-  auto fetch_data = member_->executor_->Run(fetch_tensors);
-  return fetch_data;
-}
-
-void ParallelExecutor::FeedTensorsIntoLocalScopes(
-    const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
-  PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());
-
-  for (size_t i = 0; i < tensors.size(); ++i) {
-    auto &map = tensors[i];
-    for (auto &pair : map) {
-      bool is_persistable = member_->IsPersistable(pair.first);
-      if (!is_persistable) {
-        member_->SetSkipMemoryReuse(i, pair.first);
-      }
-      auto *feed_scope = is_persistable ? member_->local_scopes_[i]
-                                        : member_->local_exec_scopes_[i];
-      auto *feed_var = feed_scope->Var(pair.first);
-
-      auto *trg = feed_var->GetMutable<LoDTensor>();
-      trg->ShareDataWith(pair.second);
-      trg->set_lod(pair.second.lod());
-    }
-  }
-}
-
-void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
-    const std::unordered_map<std::string, LoDTensor> &tensors) {
-  size_t num_places = member_->places_.size();
-  for (auto &pair : tensors) {
-    bool is_persistable = member_->IsPersistable(pair.first);
-    VLOG(3) << "Split " << (is_persistable ? "persistable" : "no persistable")
-            << " data (" << pair.first << "), dim:" << pair.second.dims()
-            << ", place: " << pair.second.place();
-    auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
-    bool is_cpu_place = platform::is_cpu_place(member_->places_.front());
-    if (!is_persistable && num_places != lod_tensors.size()) {
-      auto error_info = string::Sprintf(
-          "The number(%d) of samples[%s] of current batch is less than the "
-          "count(%d) of devices(%s), currently, it is not allowed. ",
-          lod_tensors.size(), pair.first, num_places,
-          (is_cpu_place ? "CPU" : "GPU"));
-      if (is_cpu_place) {
-        error_info +=
-            "You should set the environment variable CPU_NUM in the system "
-            "to determine the number of devices you need.";
-      }
-      PADDLE_THROW(error_info);
-    } else if (is_persistable) {
-      if (lod_tensors.size() == 1) {
-        lod_tensors.reserve(num_places);
-        auto &tensor = lod_tensors.front();
-        PADDLE_ENFORCE_EQ(tensor.dims(), pair.second.dims(),
-                          "The dim doesn't match.");
-        PADDLE_ENFORCE_EQ(tensor.place(), member_->places_.at(0),
-                          "The place doesn't match.");
-        for (size_t i = 1; i < num_places; ++i) {
-          lod_tensors.emplace_back();
-          auto &tmp = lod_tensors.back();
-          framework::TensorCopy(pair.second, member_->places_.at(i), &tmp);
-        }
-      }
-      if (lod_tensors.size() != num_places) {
-        auto error_info = string::Sprintf(
-            "The number(%d) of samples[%s] of the current batch does not match "
-            "the count(%d) of devices(%s). Because that %s is a persistable "
-            "variable, you can feed just one sample, in that case, the input "
-            "sample will be copied in %d copies and be sent to different "
-            "places separately. If you need that different place has different "
-            "value, you should feed %d samples.",
-            lod_tensors.size(), pair.first, num_places,
-            (is_cpu_place ? "CPU" : "GPU"), pair.first, num_places, num_places);
-        PADDLE_THROW(error_info);
-      }
-    }
-
-    for (size_t j = 0; j < num_places; ++j) {
-      auto *feed_scope = is_persistable ? member_->local_scopes_[j]
-                                        : member_->local_exec_scopes_[j];
-      auto *feed_var = feed_scope->Var(pair.first);
-
-      auto t = feed_var->GetMutable<LoDTensor>();
-      t->ShareDataWith(lod_tensors[j]);
-      t->set_lod(lod_tensors[j].lod());
-    }
-  }
-}
-
-ParallelExecutor::~ParallelExecutor() {
-  for (auto &p : member_->places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
-  delete member_;
-}
-
-bool ParallelExecutor::EnableParallelGraphExecution(
-    const ir::Graph &graph, const ExecutionStrategy &exec_strategy,
-    const BuildStrategy &build_strategy) const {
-  if (!FLAGS_enable_parallel_graph) {
-    return false;
-  }
-
-  bool enable_parallel_graph = true;
-
-  for (ir::Node *node : graph.Nodes()) {
-    if (node->IsVar() && node->Var()) {
-      // TODO(Yancey1989): support sparse update in ParallelGraph mode.
-      if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) {
-        enable_parallel_graph = false;
-        break;
-      }
-    } else if (node->IsOp() && node->Op()) {
-      // TODO(Yancey1989): support pserver mode
-      if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") {
-        enable_parallel_graph = false;
-        break;
-      }
-    }
-  }
-
-  if (!member_->use_all_reduce_ || !member_->use_cuda_) {
-    if (build_strategy.enable_sequential_execution_ ||
-        exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) {
-      enable_parallel_graph = false;
-    }
-  }
-
-#ifdef WIN32
-  VLOG(1) << "Windows has no support to parallel graph, enable_parallel_graph "
-             "would be forced to false.";
-  enable_parallel_graph = false;
-#endif
-
-  return enable_parallel_graph;
-}
-
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(reference_count_pass);
-USE_PASS(eager_deletion_pass);
-USE_PASS(buffer_shared_inplace_pass);
-USE_PASS(buffer_shared_cross_op_memory_reuse_pass);
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
deleted file mode 100644
index 00ac5e134db91836d499cac765d606a19fe0f954..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/parallel_executor.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/details/build_strategy.h"
-#include "paddle/fluid/framework/details/execution_strategy.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-
-class ParallelExecutorPrivate;
-
-using details::BuildStrategy;
-using details::ExecutionStrategy;
-
-class ParallelExecutor {
-  DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
-
- public:
-  explicit ParallelExecutor(const std::vector<platform::Place> &places,
-                            const std::vector<std::string> &bcast_vars,
-                            const std::string &loss_var_name, Scope *scope,
-                            const std::vector<Scope *> &local_scopes,
-                            const ExecutionStrategy &exec_strategy,
-                            const BuildStrategy &build_strategy,
-                            ir::Graph *graph);
-
-  ~ParallelExecutor();
-
-  std::vector<Scope *> &GetLocalScopes();
-
-  void DropLocalExeScopes();
-
-  // This API is used to check whether DropLocalExeScopes work.
-  bool NeedCreateLocalExeScope();
-
-  /**
-   * Feed tensors to local scopes. The size of tensors should be equal to the
-   * size of local scopes.
-   */
-  void FeedTensorsIntoLocalScopes(
-      const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors);
-
-  void FeedAndSplitTensorIntoLocalScopes(
-      const std::unordered_map<std::string, LoDTensor> &tensors);
-
-  FeedFetchList Run(const std::vector<std::string> &fetch_tensors);
-
- private:
-  // broadcast the parameters from the 0th device.
-  // trainer_id the trainer index in nccl distributed training.
-  void BCastParamsToDevices(const std::vector<std::string> &vars,
-                            int trainer_id = 0) const;
-  bool EnableParallelGraphExecution(const ir::Graph &graph,
-                                    const ExecutionStrategy &exec_strategy,
-                                    const BuildStrategy &build_strategy) const;
-
-  ParallelExecutorPrivate *member_;
-  std::vector<std::unique_ptr<ir::Graph>> async_graphs_;
-};
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
deleted file mode 100644
index 3617a8f18865729e5fac0d6340d436cef2158ee8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ /dev/null
@@ -1,266 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/trainer.h"
-#include "paddle/fluid/framework/trainer_desc.pb.h"
-
-namespace paddle {
-namespace framework {
-
-void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
-                                 Dataset* dataset) {
-  pipeline_num_ = trainer_desc.thread_num();
-  VLOG(3) << "pipeline num: " << pipeline_num_;
-
-  SetDataset(dataset);
-  // get filelist from trainer_desc here
-  const std::vector<paddle::framework::DataFeed*> readers =
-      dataset->GetReaders();
-  VLOG(3) << "readers num: " << readers.size();
-
-  pipeline_config_ = trainer_desc.section_param();
-  scope_queue_size_ = pipeline_config_.queue_size();
-  sync_steps_ = pipeline_config_.sync_steps();
-  section_num_ = pipeline_config_.section_config_size();
-
-  VLOG(3) << "scope_queue_size: " << scope_queue_size_;
-  VLOG(3) << "section num: " << section_num_;
-  VLOG(3) << "sync_steps: " << sync_steps_;
-
-  workers_.resize(section_num_);
-  in_var_names_.resize(section_num_);
-  out_var_names_.resize(section_num_);
-  worker_count_.resize(section_num_);
-  worker_count_mutex_.resize(section_num_);
-  param_need_sync_.reset(new std::vector<std::string>);
-
-  int reader_index = 0;
-  for (int i = 0; i < section_num_; ++i) {
-    const auto& section_config = pipeline_config_.section_config(i);
-    int concurrency = section_config.concurrency();
-    VLOG(3) << "the thread num of each pipeline in section " << i
-            << " is: " << concurrency;
-    in_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_in_var_names().begin(),
-        section_config.section_in_var_names().end()));
-    out_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_out_var_names().begin(),
-        section_config.section_out_var_names().end()));
-    worker_count_[i].resize(pipeline_num_);
-    worker_count_mutex_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      worker_count_[i][j] = new int(concurrency);
-      worker_count_mutex_[i][j].reset(new std::mutex);
-    }
-
-    platform::Place place;
-    workers_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      workers_[i][j].resize(concurrency);
-
-      switch (section_config.place()) {
-        case SectionConfig::CPUPlace:
-          place = platform::CPUPlace();
-          break;
-        case SectionConfig::CUDAPlace:
-          // Note that one section has at most one GPU place in one pipeline
-          place = platform::CUDAPlace(j);
-          break;
-        case SectionConfig::CUDAPinnedPlace:
-          place = platform::CUDAPinnedPlace();
-          break;
-        default:
-          PADDLE_ENFORCE(false, "Unkown place type in SectionConfig: %d",
-                         section_config.place());
-      }
-
-      for (int k = 0; k < concurrency; ++k) {
-        workers_[i][j][k] = DeviceWorkerFactory::CreateDeviceWorker(
-            trainer_desc.device_worker_name());
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetSectionIndex(i);
-        this_worker->SetDeviceIndex(j);
-        this_worker->SetThreadIndex(k);
-        this_worker->SetSectionNum(section_num_);
-        this_worker->SetPipelineNum(pipeline_num_);
-        if (i == 0) {
-          this_worker->SetDataFeed(readers[reader_index++]);
-          this_worker->SetReaderPlace(place);
-        }
-        this_worker->SetPlace(place);
-        this_worker->Initialize(trainer_desc);
-      }
-    }
-  }
-  param_need_sync_.reset(
-      new std::vector<std::string>(pipeline_config_.param_need_sync().begin(),
-                                   pipeline_config_.param_need_sync().end()));
-  VLOG(3) << "param_need_sync_ have: ";
-  for (const std::string& name : *param_need_sync_) {
-    VLOG(3) << name;
-  }
-  // set debug here
-  SetDebug(trainer_desc.debug());
-}
-
-void PipelineTrainer::InitFirstScopeQueue(ScopeQueue* scope_queue,
-                                          int pipeline_id,
-                                          const ProgramDesc& main_program) {
-  for (int i = 0; i < scope_queue_size_; ++i) {
-    Scope* scope = &pipeline_scopes_[pipeline_id]->NewScope();
-    for (auto& var : main_program.Block(0).AllVars()) {
-      if (!var->Persistable()) {
-        auto* ptr = scope->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
-      }
-    }
-    scope_queue->Send(scope);
-  }
-}
-
-void PipelineTrainer::CopyParameters(const Scope& root_scope, int pipeline_id) {
-  for (const std::string& name : *param_need_sync_) {
-    const LoDTensor& root_tensor = root_scope.FindVar(name)->Get<LoDTensor>();
-
-    // TODO(hutxian): check a new var of the same name is created in
-    // pipeline_scope
-    LoDTensor* gpu_tensor =
-        pipeline_scopes_[pipeline_id]->Var(name)->GetMutable<LoDTensor>();
-    platform::Place place = platform::CUDAPlace(pipeline_id);
-    TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-               static_cast<Tensor*>(gpu_tensor));
-  }
-}
-
-void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
-                                     const platform::Place& place) {
-  PADDLE_ENFORCE(root_scope_, "Null root_scope pointer");
-  SectionWorker::cpu_id_.store(pipeline_config_.start_cpu_core_id());
-  scope_queues_.resize(section_num_);
-  pipeline_scopes_.resize(pipeline_num_);
-
-  VLOG(3) << "Init ScopeQueues and create all scopes";
-  for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      scope_queues_[i].emplace_back(new ScopeQueue(scope_queue_size_));
-      if (i == 0) {
-        pipeline_scopes_[j] = &root_scope_->NewScope();
-        CopyParameters(*root_scope_, j);
-        InitFirstScopeQueue(scope_queues_[0].back().get(), j, main_program);
-      }
-    }
-  }
-
-  for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetRootScope(root_scope_);
-        this_worker->SetCountMutex(worker_count_mutex_[i][j].get());
-        this_worker->SetWorkerCount(worker_count_[i][j]);
-        this_worker->SetScopeQueue(scope_queues_[i][j].get(),
-                                   (i == section_num_ - 1)
-                                       ? scope_queues_[0][j].get()
-                                       : scope_queues_[i + 1][j].get());
-        this_worker->SetVarNames(*in_var_names_[i], *out_var_names_[i]);
-        if (i != section_num_ - 1) {
-          // For data copy in adjacent different place
-          this_worker->SetNextSectionPlace(
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i + 1][j][0])
-                  ->place());
-        }
-      }
-    }
-  }
-
-  if (pipeline_num_ > 1) {
-    construct_sync_functor();
-  }
-}
-
-void PipelineTrainer::construct_sync_functor() {
-  std::vector<platform::Place> cuda_places;
-  for (int i = 0; i < pipeline_num_; ++i) {
-    cuda_places.emplace_back(platform::CUDAPlace(i));
-  }
-  nccl_ctx_map_.reset(new platform::NCCLContextMap(cuda_places));
-  sync_functors_.resize(pipeline_num_);
-  SyncFunctor::sync_flag_ = 0;
-  SyncFunctor::pipeline_scopes_.resize(0);
-
-  for (int j = 0; j < pipeline_num_; ++j) {
-    SyncFunctor* sync_function = new SyncFunctor(j, pipeline_num_, sync_steps_);
-    sync_function->SetSyncParam(*param_need_sync_);
-    sync_function->SetNcclCtxMap(nccl_ctx_map_.get());
-    SyncFunctor::pipeline_scopes_.push_back(this->pipeline_scopes_[j]);
-    sync_functors_[j].reset(sync_function);
-  }
-  for (int i = section_num_ - 1; i >= 0; --i) {
-    if (SectionConfig::CUDAPlace ==
-        pipeline_config_.section_config(i).place()) {
-      for (int j = 0; j < pipeline_num_; ++j) {
-        for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-          auto this_worker =
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i][j][k]);
-          this_worker->SetSyncFunctor(sync_functors_[j].get());
-        }
-      }
-      break;
-    }
-  }
-}
-
-void PipelineTrainer::Run() {
-  VLOG(3) << "Going to run";
-  for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        if (!debug_) {
-          section_threads_.push_back(
-              std::thread(&DeviceWorker::TrainFiles, workers_[i][j][k].get()));
-        } else {
-          section_threads_.push_back(std::thread(
-              &DeviceWorker::TrainFilesWithProfiler, workers_[i][j][k].get()));
-        }
-      }
-    }
-  }
-}
-
-void PipelineTrainer::Finalize() {
-  for (auto& th : section_threads_) {
-    th.join();
-  }
-  for (const auto& var : *param_need_sync_) {
-    auto* root_tensor = root_scope_->Var(var)->GetMutable<LoDTensor>();
-    // TODO(hutuxian): Add a final all-reduce?
-    const auto& thread_tensor =
-        pipeline_scopes_[0]->FindVar(var)->Get<LoDTensor>();
-    TensorCopySync(thread_tensor, platform::CPUPlace(), root_tensor);
-  }
-  root_scope_->DropKids();
-}
-
-}  // end namespace framework
-}  // end namespace paddle
-#endif
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
deleted file mode 100644
index 4b9667113bc7918c1323f0213213a6ffdb7eed8e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/program_desc.cc
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/version.h"
-
-namespace paddle {
-namespace framework {
-
-BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
-  auto *b = desc_.add_blocks();
-  b->set_parent_idx(parent.ID());
-  b->set_idx(desc_.blocks_size() - 1);
-  blocks_.emplace_back(new BlockDesc(this, b));
-  return blocks_.back().get();
-}
-
-void ProgramDesc::Flush() {
-  for (auto &block : blocks_) {
-    block->Flush();
-  }
-}
-
-proto::ProgramDesc *ProgramDesc::Proto() {
-  Flush();
-  return &desc_;
-}
-
-int64_t ProgramDesc::Version() const { return desc_.version().version(); }
-
-ProgramDesc::ProgramDesc() {
-  desc_.mutable_version()->set_version(kCurProgramVersion);
-  auto *block = desc_.mutable_blocks()->Add();
-  block->set_idx(kRootBlockIndex);
-  block->set_parent_idx(kNoneBlockIndex);
-  blocks_.emplace_back(new BlockDesc(this, block));
-}
-
-ProgramDesc::ProgramDesc(const ProgramDesc &o) {
-  desc_ = o.desc_;
-  for (int i = 0; i < desc_.blocks_size(); ++i) {
-    auto *block = desc_.mutable_blocks(i);
-    blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
-  }
-  for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) {
-    auto all_ops = blocks_[block_id]->AllOps();
-    for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
-      auto &op = all_ops[op_id];
-
-      for (const std::string &attr_name : op->AttrNames()) {
-        if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
-          int sub_block_id =
-              o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
-          op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
-        } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) {
-          std::vector<int> sub_block_ids =
-              o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name);
-          std::vector<BlockDesc *> block_descs;
-          for (int block_id : sub_block_ids) {
-            block_descs.push_back(MutableBlock(block_id));
-          }
-          op->SetBlocksAttr(attr_name, block_descs);
-        }
-      }
-    }
-  }
-}
-
-ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
-  desc_ = desc;
-  InitFromProto();
-}
-
-void ProgramDesc::CopyFrom(const proto::ProgramDesc &desc) {
-  blocks_.clear();
-  desc_ = desc;
-  InitFromProto();
-}
-
-ProgramDesc::ProgramDesc(const std::string &binary_str) {
-  PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
-                 "Fail to parse program_desc from binary string.");
-  InitFromProto();
-}
-
-void ProgramDesc::InitFromProto() {
-  for (auto &block_desc : *desc_.mutable_blocks()) {
-    blocks_.emplace_back(new BlockDesc(this, &block_desc));
-  }
-  for (auto &block : blocks_) {
-    for (auto *op : block->AllOps()) {
-      for (const auto &attr : op->Proto()->attrs()) {
-        if (attr.type() == proto::AttrType::BLOCK) {
-          size_t blk_idx = attr.block_idx();
-          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
-        } else if (attr.type() == proto::AttrType::BLOCKS) {
-          auto blks_idx = attr.blocks_idx();
-          std::vector<BlockDesc *> block_descs;
-          for (int blk_idx : blks_idx) {
-            block_descs.push_back(this->MutableBlock(blk_idx));
-          }
-          op->SetBlocksAttr(attr.name(), block_descs);
-        }
-      }
-    }
-  }
-}
-
-const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
-  auto &global_block = Block(0);
-  // The order of feed_target_names must follow the index specified in `col`.
-  // since feed operator's order doesn't necessary follow 'col'.
-  std::vector<std::string> feed_target_names;
-  for (auto *op : global_block.AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      size_t col = boost::get<int>(op->GetAttr("col"));
-      if (col >= feed_target_names.size()) {
-        feed_target_names.resize(col + 1);
-      }
-      feed_target_names[col] = op->Output("Out")[0];
-    }
-  }
-  return feed_target_names;
-}
-
-const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
-  auto &global_block = Block(0);
-  // The order of fetch_target_names must follow the index specified in `col`.
-  // since fetch operator's order doesn't necessary follow 'col'.
-  std::vector<std::string> fetch_target_names;
-  for (auto *op : global_block.AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      size_t col = boost::get<int>(op->GetAttr("col"));
-      if (col >= fetch_target_names.size()) {
-        fetch_target_names.resize(col + 1);
-      }
-      fetch_target_names[col] = op->Input("X")[0];
-    }
-  }
-  return fetch_target_names;
-}
-
-void ProgramDesc::SetFeedHolderName(const std::string &feed_holder_name) {
-  auto *global_block = MutableBlock(0);
-  int index = 0;
-  for (auto *op : global_block->AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      // Unify the input's name of all feed_ops to feed_holder_name
-      global_block->RemoveVar(op->Input("X")[0]);
-      op->SetInput("X", {feed_holder_name});
-      op->SetAttr("col", {index});
-      op->CheckAttrs();
-      index++;
-    }
-  }
-
-  auto *feed_holder = global_block->Var(feed_holder_name);
-  feed_holder->SetType(proto::VarType::FEED_MINIBATCH);
-  feed_holder->SetPersistable(true);
-}
-
-void ProgramDesc::SetFetchHolderName(const std::string &fetch_holder_name) {
-  auto *global_block = MutableBlock(0);
-  int index = 0;
-  for (auto *op : global_block->AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      // Unify the output's name of all fetch_ops to fetch_holder_name
-      global_block->RemoveVar(op->Output("Out")[0]);
-      op->SetOutput("Out", {fetch_holder_name});
-      op->SetAttr("col", {index});
-      op->CheckAttrs();
-      index++;
-    }
-  }
-
-  auto *fetch_holder = global_block->Var(fetch_holder_name);
-  fetch_holder->SetType(proto::VarType::FETCH_LIST);
-  fetch_holder->SetPersistable(true);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
deleted file mode 100644
index 2ec0e9d7a0969d44f88c7407bfb8cd4646530147..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/program_desc.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/proto_desc.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace framework {
-
-class BlockDesc;
-
-class ProgramDesc {
- public:
-  ProgramDesc();
-
-  explicit ProgramDesc(const proto::ProgramDesc &desc);
-
-  ProgramDesc(const ProgramDesc &o);
-
-  explicit ProgramDesc(const std::string &binary_str);
-
-  BlockDesc *AppendBlock(const BlockDesc &parent);
-
-  BlockDesc *MutableBlock(size_t idx) {
-    if (idx == static_cast<size_t>(kNoneBlockIndex)) {
-      return nullptr;
-    } else {
-      return blocks_[idx].get();
-    }
-  }
-
-  const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; }
-
-  size_t Size() const { return blocks_.size(); }
-
-  void Flush();
-
-  void CopyFrom(const proto::ProgramDesc &desc);
-
-  proto::ProgramDesc *Proto();
-
-  int64_t Version() const;
-
-  // The output variable of feed_op is referenced as feed_target.
-  // This function is used to collect the output variable's name of all
-  // feed_ops.
-  const std::vector<std::string> GetFeedTargetNames();
-
-  // The input variable of fetch_op is referenced as fetch_target.
-  // This function is used to collect the input variable's name of all
-  // fetch_ops.
-  const std::vector<std::string> GetFetchTargetNames();
-
-  // The input variable of feed_op that holds input Tensor provided by users is
-  // referenced as feed_holder.
-  // This function is used to change or unify the feed_holder variables' name.
-  void SetFeedHolderName(const std::string &feed_holder_name);
-
-  // The output variable of fetch_op that holds output Tensor needed by users is
-  // referenced as fetch_holder.
-  // This function is used to change or unify the fetch_holder variables' name.
-  void SetFetchHolderName(const std::string &fetch_holder_name);
-
- private:
-  void InitFromProto();
-
-  proto::ProgramDesc desc_;
-
-  std::vector<std::unique_ptr<BlockDesc>> blocks_;
-};
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
deleted file mode 100644
index 48bde2785e6a51afc0d2905ac31fe20a3c3019b6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/program_desc_test.cc
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/program_desc.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/block_desc.h"
-
-namespace paddle {
-namespace framework {
-TEST(ProgramDesc, copy_ctor) {
-  ProgramDesc program;
-  auto* global_block = program.MutableBlock(0);
-  auto* x = global_block->Var("X");
-  x->SetType(proto::VarType::LOD_TENSOR);
-  x->SetLoDLevel(0);
-  x->SetDataType(proto::VarType::FP32);
-  x->SetShape({1000, 784});
-
-  auto* y = global_block->Var("Y");
-  y->SetType(proto::VarType::LOD_TENSOR);
-  y->SetLoDLevel(0);
-  y->SetDataType(proto::VarType::FP32);
-  y->SetShape({784, 100});
-
-  auto* op = global_block->AppendOp();
-  op->SetType("mul");
-  op->SetInput("X", {x->Name()});
-  op->SetInput("Y", {y->Name()});
-
-  auto* out = global_block->Var("Out");
-  out->SetType(proto::VarType::LOD_TENSOR);
-  op->SetOutput("Y", {out->Name()});
-
-  BlockDesc* new_block = program.AppendBlock(*global_block);
-  op = new_block->AppendOp();
-  op->SetType("mul");
-
-  op = global_block->AppendOp();
-  op->SetType("op_with_subblock");
-  op->SetAttr("sub_block", new_block);
-
-  std::vector<BlockDesc*> sub_blocks;
-  sub_blocks.push_back(program.AppendBlock(*global_block));
-  sub_blocks.push_back(program.AppendBlock(*global_block));
-  op->SetAttr("sub_blocks", sub_blocks);
-
-  ProgramDesc program_copy(program);
-
-  auto* global_block_copy = program_copy.MutableBlock(0);
-  ASSERT_NE(global_block, global_block_copy);
-
-  auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
-    ASSERT_TRUE(global_block_copy->HasVar(name));
-    auto* copy = global_block_copy->Var(name);
-    ASSERT_NE(copy, var_before);
-    ASSERT_EQ(copy->Name(), var_before->Name());
-    ASSERT_EQ(copy->GetType(), var_before->GetType());
-    ASSERT_EQ(copy->GetShape(), var_before->GetShape());
-    ASSERT_EQ(copy->Proto()->SerializeAsString(),
-              var_before->Proto()->SerializeAsString());
-  };
-
-  ASSERT_EQ(global_block->LocalVarNames(), global_block_copy->LocalVarNames());
-  ASSERT_EQ(3UL, global_block_copy->LocalVarNames().size());
-  assert_same_var("X", x);
-  assert_same_var("Y", y);
-  assert_same_var("Out", out);
-
-  bool found_sub_block = false;
-  bool found_sub_blocks = false;
-  for (size_t i = 0; i < global_block->OpSize(); ++i) {
-    auto op_origin = global_block->Op(i);
-    auto op_copy = global_block_copy->Op(i);
-
-    ASSERT_EQ(op_origin->Type(), op_copy->Type());
-    ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
-    ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs());
-
-    ASSERT_EQ(op_origin->Proto()->attrs().size(),
-              op_copy->Proto()->attrs().size());
-    for (auto it = op_origin->Proto()->attrs().begin();
-         it != op_origin->Proto()->attrs().end(); ++it) {
-      for (auto it_2 = op_copy->Proto()->attrs().begin();
-           it_2 != op_copy->Proto()->attrs().end(); ++it_2) {
-        if (it->name() == it_2->name()) {
-          ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString());
-        }
-      }
-    }
-
-    if (op->Type() == "op_with_subblock") {
-      ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
-      found_sub_block = true;
-
-      ASSERT_EQ(2UL, op->GetBlocksAttrIds("sub_blocks").size());
-      found_sub_blocks = true;
-    }
-  }
-  ASSERT_TRUE(found_sub_block);
-  ASSERT_TRUE(found_sub_blocks);
-  // Not check block's protostr are same it because the order of vars could be
-  // different and it is correct.
-}
-
-TEST(ProgramDescBind, serialize_and_deserialize) {
-  ProgramDesc program_origin;
-  auto* global_block = program_origin.MutableBlock(0);
-  auto* x = global_block->Var("X");
-  x->SetType(proto::VarType::LOD_TENSOR);
-  x->SetLoDLevel(0);
-  x->SetDataType(proto::VarType::FP32);
-  x->SetShape({1000, 784});
-
-  auto* y = global_block->Var("Y");
-  y->SetType(proto::VarType::LOD_TENSOR);
-  y->SetLoDLevel(0);
-  y->SetDataType(proto::VarType::FP32);
-  y->SetShape({784, 100});
-
-  auto* op = global_block->AppendOp();
-  op->SetType("mul");
-  op->SetInput("X", {x->Name()});
-  op->SetInput("Y", {y->Name()});
-
-  auto* out = global_block->Var("Out");
-  out->SetType(proto::VarType::LOD_TENSOR);
-  op->SetOutput("Y", {out->Name()});
-
-  std::string binary_str;
-  program_origin.Proto()->SerializeToString(&binary_str);
-
-  ProgramDesc program_restored(binary_str);
-  auto* global_block_restored = program_restored.MutableBlock(0);
-  ASSERT_NE(global_block, global_block_restored);
-
-  auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
-    ASSERT_TRUE(global_block_restored->HasVar(name));
-    auto* restored = global_block_restored->Var(name);
-    ASSERT_NE(restored, var_before);
-    ASSERT_EQ(restored->Name(), var_before->Name());
-    ASSERT_EQ(restored->GetType(), var_before->GetType());
-    ASSERT_EQ(restored->GetShape(), var_before->GetShape());
-    ASSERT_EQ(restored->Proto()->SerializeAsString(),
-              var_before->Proto()->SerializeAsString());
-  };
-
-  ASSERT_EQ(global_block->LocalVarNames(),
-            global_block_restored->LocalVarNames());
-  ASSERT_EQ(3UL, global_block_restored->LocalVarNames().size());
-  assert_same_var("X", x);
-  assert_same_var("Y", y);
-  assert_same_var("Out", out);
-
-  for (size_t i = 0; i < global_block->OpSize(); ++i) {
-    auto op_origin = global_block->Op(i);
-    auto op_restored = global_block_restored->Op(i);
-
-    ASSERT_EQ(op_origin->Type(), op_restored->Type());
-    ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs());
-    ASSERT_EQ(op_origin->Outputs(), op_restored->Outputs());
-
-    ASSERT_EQ(op_restored->Proto()->SerializeAsString(),
-              op_origin->Proto()->SerializeAsString());
-  }
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/proto_desc.h b/paddle/fluid/framework/proto_desc.h
deleted file mode 100644
index 40521c07829336313c32ccbf9152ca8198b99688..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/proto_desc.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-namespace framework {
-
-// The Index of first Block in Program. also called root block.
-constexpr int kRootBlockIndex = 0;
-// The Parent Index of root Block, this block does not exist.
-constexpr int kNoneBlockIndex = -1;
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc
deleted file mode 100644
index c58cb8ad2ace9927d85a22cb400e2b91af331cbd..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/prune.cc
+++ /dev/null
@@ -1,392 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/prune.h"
-
-#include <glog/logging.h>
-
-#include <algorithm>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-
-const char kFeedOpType[] = "feed";
-const char kFetchOpType[] = "fetch";
-
-const char kRecurrent[] = "recurrent";
-const char kStates[] = "states";
-const char kExStates[] = "ex_states";
-
-bool HasDependentInputVar(
-    const proto::OpDesc& op_desc,
-    const std::unordered_set<std::string>& dependent_vars) {
-  for (auto& var : op_desc.inputs()) {
-    for (auto& argu : var.arguments()) {
-      if (dependent_vars.count(argu) != 0) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-bool HasDependentOutputVar(
-    const proto::OpDesc& op_desc,
-    const std::unordered_set<std::string>& dependent_vars) {
-  for (auto& var : op_desc.outputs()) {
-    for (auto& argu : var.arguments()) {
-      if (dependent_vars.count(argu) != 0) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-bool IsTarget(const proto::OpDesc& op_desc) {
-  if (op_desc.has_is_target()) {
-    return op_desc.is_target();
-  }
-  return false;
-}
-
-bool HasTrueTarget(const proto::OpDesc& op_desc) {
-  return op_desc.has_is_target() && op_desc.is_target();
-}
-
-bool HasFalseTarget(const proto::OpDesc& op_desc) {
-  return op_desc.has_is_target() && !op_desc.is_target();
-}
-
-int GetSubBlockIndex(const proto::OpDesc& op_desc) {
-  for (auto& attr : op_desc.attrs()) {
-    if (attr.type() == proto::AttrType::BLOCK) {
-      PADDLE_ENFORCE(attr.has_block_idx());
-      return attr.block_idx();
-    }
-  }
-  return -1;
-}
-
-bool HasSubBlock(const proto::OpDesc& op_desc) {
-  return GetSubBlockIndex(op_desc) > 0;
-}
-
-void AppendOpInputVarNames(const proto::OpDesc& op_desc,
-                           std::unordered_set<std::string>* vars_set) {
-  for (auto& var : op_desc.inputs()) {
-    for (auto& arg : var.arguments()) {
-      vars_set->emplace(arg);
-    }
-  }
-}
-
-void AppendOpOutputVarNames(const proto::OpDesc& op_desc,
-                            std::unordered_set<std::string>* vars_set) {
-  for (auto& var : op_desc.outputs()) {
-    for (auto& arg : var.arguments()) {
-      vars_set->emplace(arg);
-    }
-  }
-}
-
-// block_id is the idx of the current block in the input desc
-// parent_block_id is the idx of the parent of the current block
-// in the output desc, -1 means the current block is global block
-// dependent_vars is passed recursively from the parent block to
-// the child block to help pruning
-void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
-                int block_id, int parent_block_id,
-                std::unordered_set<std::string>* dependent_vars,
-                const std::set<std::string> feed_var_names) {
-  auto& block = input.blocks(block_id);
-  auto& ops = block.ops();
-
-  bool expect_feed = true;
-  for (auto& op_desc : ops) {
-    PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed,
-                   "All FeedOps are at the beginning of the ProgramDesc");
-    expect_feed = (op_desc.type() == kFeedOpType);
-  }
-
-  bool expect_fetch = true;
-  for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
-    auto& op_desc = *op_iter;
-    PADDLE_ENFORCE(op_desc.type() != kFetchOpType || expect_fetch,
-                   "All FetchOps must at the end of the ProgramDesc");
-    expect_fetch = (op_desc.type() == kFetchOpType);
-  }
-
-  std::vector<bool> should_run;
-  for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
-    auto& op_desc = *op_iter;
-    if (IsTarget(op_desc) || HasDependentOutputVar(op_desc, *dependent_vars)) {
-      // insert its input to the dependency graph
-      for (auto& var : op_desc.inputs()) {
-        for (auto& argu : var.arguments()) {
-          if (feed_var_names.count(argu) == 0) {
-            dependent_vars->insert(argu);
-          }
-        }
-      }
-      should_run.push_back(true);
-    } else {
-      should_run.push_back(false);
-    }
-  }
-
-  // since we are traversing the ProgramDesc in reverse order
-  // we reverse the should_run vector
-  std::reverse(should_run.begin(), should_run.end());
-
-  // copy the current block from input to output
-  auto* block_field = output->mutable_blocks();
-  *block_field->Add() = input.blocks(block_id);
-
-  int output_block_id = output->blocks_size() - 1;
-  auto* output_block = output->mutable_blocks(output_block_id);
-  output_block->set_idx(output_block_id);
-  output_block->set_parent_idx(parent_block_id);
-
-  auto* op_field = output_block->mutable_ops();
-  op_field->Clear();
-  for (size_t i = 0; i < should_run.size(); ++i) {
-    if (should_run[i]) {
-      auto* op = op_field->Add();
-      *op = input.blocks(block_id).ops(i);
-      if (HasSubBlock(*op)) {
-        VLOG(2) << "Pruning op which has sub block: " << op->type();
-        // create sub_block_dependent_vars here to help prune the sub block
-        std::unordered_set<std::string> sub_block_dependent_vars;
-        for (auto& var : op->inputs()) {
-          for (auto& argu : var.arguments()) {
-            if (feed_var_names.count(argu) == 0) {
-              sub_block_dependent_vars.insert(argu);
-            }
-          }
-        }
-        for (auto& var : op->outputs()) {
-          for (auto& argu : var.arguments()) {
-            if (feed_var_names.count(argu) == 0) {
-              sub_block_dependent_vars.insert(argu);
-            }
-          }
-        }
-
-        // Recurrent op's states are also dependent vars
-        if (op->type() == kRecurrent) {
-          auto& attributes = op->attrs();
-          for (auto& attr : attributes) {
-            if (attr.name() == kStates || attr.name() == kExStates) {
-              for (auto& argu : attr.strings()) {
-                if (feed_var_names.count(argu) == 0) {
-                  sub_block_dependent_vars.insert(argu);
-                }
-              }
-            }
-          }
-        }
-        // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc
-        // output_block_id is the idx of the current block in the output desc
-        prune_impl(input, output, GetSubBlockIndex(*op), output_block_id,
-                   &sub_block_dependent_vars, feed_var_names);
-      }
-    }
-  }
-
-  // remove the VarDescs in BlockDesc that are not referenced in
-  // the pruned OpDescs
-  std::unordered_map<std::string, proto::VarDesc> var_map;
-  auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars();
-  for (const auto& var : *var_field) {
-    var_map[var.name()] = var;
-  }
-
-  std::set<std::string> var_names;
-  for (const auto& op : *op_field) {
-    auto& input_field = op.inputs();
-    for (auto& input_var : input_field) {
-      for (auto& arg : input_var.arguments()) {
-        if (var_map.count(arg) != 0) {
-          var_names.insert(arg);
-        }
-      }
-    }
-    auto& output_field = op.outputs();
-    for (auto& output_var : output_field) {
-      for (auto& arg : output_var.arguments()) {
-        if (var_map.count(arg) != 0) {
-          var_names.insert(arg);
-        }
-      }
-    }
-  }
-
-  var_field->Clear();
-  for (const auto& name : var_names) {
-    *var_field->Add() = var_map[name];
-  }
-}
-
-// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
-void Prune(const proto::ProgramDesc& input,
-           const std::set<std::string>& feed_var_names,
-           proto::ProgramDesc* output) {
-  std::unordered_set<std::string> dependent_vars;
-  output->clear_blocks();
-  prune_impl(input, output, 0, -1, &dependent_vars, feed_var_names);
-}
-
-void CloneWholeBlock(proto::ProgramDesc* input, proto::ProgramDesc* output,
-                     int block_id, int parent_block_id) {
-  auto* block_field = output->mutable_blocks();
-  *block_field->Add() = input->blocks(block_id);
-  int output_block_id = output->blocks_size() - 1;
-  auto* output_block = output->mutable_blocks(output_block_id);
-  output_block->set_idx(output_block_id);
-  output_block->set_parent_idx(parent_block_id);
-}
-
-void PruneBackwardImpl(proto::ProgramDesc* input, proto::ProgramDesc* output,
-                       int block_id, int parent_block_id) {
-  // Step 1. Copy the current input block to output
-  CloneWholeBlock(input, output, block_id, parent_block_id);
-  int output_block_id = output->blocks_size() - 1;
-  auto* output_block = output->mutable_blocks(output_block_id);
-
-  // Step 2. Mark forward ops on main branch
-  auto* ops = input->mutable_blocks(block_id)->mutable_ops();
-  std::unordered_set<std::string> op_input_vars;
-  std::unordered_set<std::string> op_output_vars;
-  for (auto op_iter = ops->rbegin(); op_iter != ops->rend(); ++op_iter) {
-    auto& op_desc = *op_iter;
-    if (HasTrueTarget(op_desc) ||
-        HasDependentOutputVar(op_desc, op_input_vars)) {
-      op_desc.set_is_target(true);
-      AppendOpInputVarNames(op_desc, &op_input_vars);
-      AppendOpOutputVarNames(op_desc, &op_output_vars);
-    }
-  }
-
-  // Step 3. Mark backward & optimize ops on main branch
-  std::unordered_set<std::string> gradop_input_vars;
-  std::unordered_set<std::string> gradop_output_vars;
-  for (auto op_iter = ops->begin(); op_iter != ops->end(); ++op_iter) {
-    auto& op_desc = *op_iter;
-    if (HasFalseTarget(op_desc) ||
-        HasDependentInputVar(op_desc, gradop_output_vars)) {
-      op_desc.set_is_target(false);
-      AppendOpInputVarNames(op_desc, &gradop_input_vars);
-      AppendOpOutputVarNames(op_desc, &gradop_output_vars);
-    }
-  }
-
-  // Step 4. Mark ops need to be reserved on sub-branch
-  for (auto op_iter = ops->rbegin(); op_iter != ops->rend(); ++op_iter) {
-    auto& op_desc = *op_iter;
-    if (!op_desc.has_is_target()) {
-      if (HasDependentOutputVar(op_desc, gradop_input_vars)) {
-        op_desc.set_is_target(false);
-        AppendOpInputVarNames(op_desc, &gradop_input_vars);
-      } else {
-        op_desc.set_is_target(true);
-        AppendOpInputVarNames(op_desc, &op_input_vars);
-        AppendOpOutputVarNames(op_desc, &op_output_vars);
-      }
-    }
-  }
-
-  // Step 5. Copy the forward ops to new ProgramDesc
-  //   Note: The proto::ProgramDesc doesn't have interface
-  //         to remove op and var
-  auto* op_field = output_block->mutable_ops();
-  op_field->Clear();
-  for (auto op_iter = ops->begin(); op_iter != ops->end(); ++op_iter) {
-    if (IsTarget(*op_iter)) {
-      auto* op = op_field->Add();
-      *op = *op_iter;
-      if (HasSubBlock(*op)) {
-        CloneWholeBlock(input, output, GetSubBlockIndex(*op), output_block_id);
-      }
-    }
-  }
-
-  // Step 6. Copy the forward vars to new ProgramDesc
-  // construct all var's map before clear
-  auto* var_field = output_block->mutable_vars();
-  std::unordered_map<std::string, proto::VarDesc> var_map;
-  for (const auto& var : *var_field) {
-    var_map[var.name()] = var;
-  }
-  std::unordered_set<std::string> var_names;
-  var_names.insert(op_input_vars.begin(), op_input_vars.end());
-  var_names.insert(op_output_vars.begin(), op_output_vars.end());
-  var_field->Clear();
-  for (const auto& name : var_names) {
-    *var_field->Add() = var_map[name];
-  }
-}
-
-std::unique_ptr<framework::ProgramDesc> PruneBackward(
-    const framework::ProgramDesc& origin) {
-  // Copy original ProgramDesc, origin can't be change
-  framework::ProgramDesc origin_clone(origin);
-
-  // Step 1. Update loss op's role & set loss op to be target
-  //   The loss op's op_role is (kForward | kLoss)
-  //   The input ProgramDesc should have loss operator.
-  auto ops = origin_clone.Block(0).AllOps();
-  bool has_loss_op = false;
-  for (auto op : ops) {
-    int op_role =
-        boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
-    if (op_role == (static_cast<int>(OpRole::kForward) |
-                    static_cast<int>(OpRole::kLoss))) {
-      op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                  static_cast<int>(OpRole::kForward));
-      op->SetIsTarget(true);
-      has_loss_op = true;
-    } else if (op_role == (static_cast<int>(OpRole::kBackward) |
-                           static_cast<int>(OpRole::kLoss))) {
-      op->SetIsTarget(false);
-      break;
-    }
-  }
-  PADDLE_ENFORCE_EQ(has_loss_op, true,
-                    "The Program need to be pruned its backward part"
-                    "should have loss operator.");
-
-  // Step 2. Prune backward
-  proto::ProgramDesc pruned_desc;
-  pruned_desc.clear_blocks();
-  PruneBackwardImpl(origin_clone.Proto(), &pruned_desc, 0, -1);
-
-  // Step 3. Contruct new framework::ProgramDesc
-  return std::unique_ptr<framework::ProgramDesc>(
-      new framework::ProgramDesc(pruned_desc));
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h
deleted file mode 100644
index f710106a263a4d4350007c1580aaf83560faaa7e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/prune.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <set>
-#include <string>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-void Prune(const proto::ProgramDesc& input,
-           const std::set<std::string>& feed_var_names,
-           proto::ProgramDesc* output);
-
-std::unique_ptr<framework::ProgramDesc> PruneBackward(
-    const framework::ProgramDesc& origin);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc
deleted file mode 100644
index eb5c241a8372a460483c70e38f962168b1cdbbc0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/prune_test.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/prune.h"
-
-#include <gtest/gtest.h>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/operator.h"
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace f = paddle::framework;
-
-void AddOp(const std::string &type, const f::VariableNameMap &inputs,
-           const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           paddle::framework::BlockDesc *block) {
-  // insert output
-  for (auto kv : outputs) {
-    for (auto v : kv.second) {
-      auto var = block->Var(v);
-      var->SetDataType(paddle::framework::proto::VarType::FP32);
-    }
-  }
-
-  // insert op
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-
-TEST(Prune, one_operator) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-
-  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
-        block);
-
-  f::proto::ProgramDesc *pdesc = program.Proto();
-  f::proto::ProgramDesc pruned;
-  std::set<std::string> feed_var_names = {};
-  f::Prune(*pdesc, feed_var_names, &pruned);
-  EXPECT_EQ(pruned.blocks(0).ops_size(), 0);
-
-  feed_var_names.insert("a");
-  pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true);
-  f::Prune(*pdesc, feed_var_names, &pruned);
-  EXPECT_EQ(pruned.blocks(0).ops_size(), 1);
-}
-
-TEST(Prune, forward) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-
-  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
-        block);
-  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{},
-        block);
-  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{},
-        block);
-  AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{},
-        block);
-
-  f::proto::ProgramDesc *pdesc = program.Proto();
-  std::set<std::string> feed_var_names = {"a"};
-  for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) {
-    f::proto::ProgramDesc pruned;
-    pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true);
-    f::Prune(*pdesc, feed_var_names, &pruned);
-    EXPECT_EQ(pruned.blocks(0).ops_size(), i + 1);
-  }
-}
-
-TEST(Prune, multi_input_op) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-
-  AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{},
-        block);
-  AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
-        block);
-  AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{},
-        block);
-  AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}},
-        f::AttributeMap{}, block);
-
-  f::proto::ProgramDesc *pdesc = program.Proto();
-  pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true);
-
-  f::proto::ProgramDesc pruned;
-  std::set<std::string> feed_var_names = {"a0", "a1", "a2"};
-  f::Prune(*pdesc, feed_var_names, &pruned);
-  EXPECT_EQ(pruned.blocks(0).ops_size(), 4);
-}
-
-TEST(Prune, multi_output_op) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-
-  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
-        f::AttributeMap{}, block);
-  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
-        block);
-  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
-        block);
-
-  f::proto::ProgramDesc *pdesc = program.Proto();
-  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
-
-  f::proto::ProgramDesc pruned;
-  std::set<std::string> feed_var_names = {"a"};
-  f::Prune(*pdesc, feed_var_names, &pruned);
-  EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
-}
-
-TEST(Prune, multi_target) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-
-  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
-        f::AttributeMap{}, block);
-  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
-        block);
-  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
-        block);
-
-  f::proto::ProgramDesc *pdesc = program.Proto();
-  pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
-  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
-
-  f::proto::ProgramDesc pruned;
-  std::set<std::string> feed_var_names = {"a"};
-  f::Prune(*pdesc, feed_var_names, &pruned);
-  EXPECT_EQ(pruned.blocks(0).ops_size(), 3);
-}
-
-TEST(Prune, recurrrent_op) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::BlockDesc *sub_block = program.AppendBlock(*block);
-  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
-        f::AttributeMap{}, block);
-
-  std::vector<std::string> state_var_name(1, "y");
-  AddOp("recurrent", {{"input", {"b", "c"}}}, {{"output", {"b1, c1"}}},
-        {{"ex_states", state_var_name},
-         {"states", state_var_name},
-         {"sub_block", sub_block}},
-        block);
-
-  EXPECT_TRUE(sub_block != nullptr);
-  AddOp("rnn_memory_helper", {{"input", {"x"}}}, {{"output", {"y"}}},
-        f::AttributeMap{}, sub_block);
-
-  f::proto::ProgramDesc *pdesc = program.Proto();
-  pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
-
-  f::proto::ProgramDesc pruned;
-  std::set<std::string> feed_var_names = {"a"};
-
-  f::Prune(*pdesc, feed_var_names, &pruned);
-  EXPECT_EQ(pruned.blocks_size(), 2);
-  EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
-  EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
-}
diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc
deleted file mode 100644
index 20d7f98e93695107637107c60f5ef42b8ce9293d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/pull_dense_worker.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <time.h>
-#include "paddle/fluid/framework/device_worker.h"
-
-namespace paddle {
-namespace framework {
-
-std::shared_ptr<PullDenseWorker> PullDenseWorker::s_instance_ = NULL;
-std::mutex PullDenseWorker::mutex_for_version_;
-std::map<uint64_t, uint64_t> PullDenseWorker::last_versions_;
-std::map<uint64_t, uint64_t> PullDenseWorker::current_version_;
-std::map<uint64_t, std::vector<uint64_t>> PullDenseWorker::training_versions_;
-std::map<uint64_t, std::vector<std::string>>
-    PullDenseWorker::dense_value_names_;
-
-void PullDenseWorker::Initialize(const TrainerDesc& param) {
-  running_ = false;
-  param_ = param.pull_dense_param();
-  dwp_param_ = param.downpour_param();
-  threshold_ = param_.threshold();
-  thread_num_ = param_.device_num();
-  sleep_time_ms_ = param_.sleep_time_ms();
-  for (size_t i = 0;
-       i < dwp_param_.program_config(0).pull_dense_table_id_size(); ++i) {
-    uint64_t tid = static_cast<uint64_t>(
-        dwp_param_.program_config(0).pull_dense_table_id(i));
-    TableParameter table;
-    for (auto i : param_.dense_table()) {
-      if (i.table_id() == tid) {
-        table = i;
-        break;
-      }
-    }
-    // setup dense variables for each table
-    int var_num = table.dense_value_name_size();
-    dense_value_names_[tid].resize(var_num);
-    for (int j = 0; j < var_num; ++j) {
-      dense_value_names_[tid][j] = table.dense_value_name(j);
-    }
-    // setup training version for each table
-    training_versions_[tid].resize(thread_num_, 0);
-    last_versions_[tid] = 0;
-    current_version_[tid] = 0;
-  }
-  fleet_ptr_ = FleetWrapper::GetInstance();
-}
-
-void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
-  for (auto& t : *status_vec) {
-    t.wait();
-    auto status = t.get();
-    if (status != 0) {
-      LOG(WARNING) << "Current Pull Dense Thread Failed Times"
-                   << ++pull_dense_fail_times_;
-    }
-  }
-
-  int MAX_FAIL_NUM = 20;
-  if (pull_dense_fail_times_ > MAX_FAIL_NUM) {
-    LOG(FATAL) << "Pull Dense Failed Times More Than " << MAX_FAIL_NUM
-               << " Times";
-    exit(-1);
-  }
-  status_vec->resize(0);
-}
-
-void PullDenseWorker::Stop() {
-  if (running_) {
-    running_ = false;
-    t_.join();
-  }
-}
-
-void PullDenseWorker::PullDense(bool force_update) {
-  pull_dense_status_.resize(0);
-  for (size_t i = 0;
-       i < dwp_param_.program_config(0).pull_dense_table_id_size(); ++i) {
-    uint64_t tid = static_cast<uint64_t>(
-        dwp_param_.program_config(0).pull_dense_table_id(i));
-    if (force_update || CheckUpdateParam(tid)) {
-      fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
-                                     &pull_dense_status_);
-      ResetThreadVersion(tid);
-    }
-  }
-  if (pull_dense_status_.size() != 0) {
-    Wait(&pull_dense_status_);
-  }
-}
-
-int PullDenseWorker::Start() {
-  running_ = true;
-  // before training, we can pull dense from pserver first.
-  PullDense(true);
-  t_ = std::thread(&PullDenseWorker::Run, this);
-  return 0;
-}
-
-void PullDenseWorker::Run() {
-  while (running_) {
-    PullDense(false);
-#ifndef _WIN32
-    usleep(sleep_time_ms_ * 1000);
-#endif
-  }
-}
-
-void PullDenseWorker::IncreaseThreadVersion(int thread_id, uint64_t table_id) {
-  std::lock_guard<std::mutex> lock(mutex_for_version_);
-  training_versions_[table_id][thread_id]++;
-}
-
-bool PullDenseWorker::CheckUpdateParam(uint64_t table_id) {
-  std::lock_guard<std::mutex> lock(mutex_for_version_);
-  auto& version = training_versions_[table_id];
-  current_version_[table_id] =
-      *(std::min_element(version.begin(), version.end()));
-  if (current_version_[table_id] - last_versions_[table_id] < threshold_) {
-    return false;
-  }
-  return true;
-}
-
-void PullDenseWorker::ResetThreadVersion(uint64_t table_id) {
-  std::lock_guard<std::mutex> lock(mutex_for_version_);
-  last_versions_[table_id] = current_version_[table_id];
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/python_headers.h b/paddle/fluid/framework/python_headers.h
deleted file mode 100644
index 8f9e3fad57f7bb87e78e334e741be23751417a78..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/python_headers.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// workaround for Python 2 issue: https://bugs.python.org/issue17120
-#pragma push_macro("_XOPEN_SOURCE")
-#pragma push_macro("_POSIX_C_SOURCE")
-#undef _XOPEN_SOURCE
-#undef _POSIX_C_SOURCE
-
-#include "pybind11/pybind11.h"
-
-#pragma pop_macro("_XOPEN_SOURCE")
-#pragma pop_macro("_POSIX_C_SOURCE")
-
-#if !defined(PYBIND11_HIDDEN)
-#ifdef _WIN32
-#define PYBIND11_HIDDEN __declspec(dllexport)
-#else
-#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
-#endif
-#endif
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
deleted file mode 100644
index d3513fb7dbed0413e61796d8a843c38fbbcf93dc..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/reader.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/reader.h"
-#include <deque>
-
-namespace paddle {
-namespace framework {
-
-void ReaderBase::ReadNext(std::vector<LoDTensor> *out) {
-  std::lock_guard<std::mutex> lock(mu_);
-  PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning);
-  ReadNextImpl(out);
-}
-
-void ReaderBase::InsertDecoratedReader(
-    const std::shared_ptr<ReaderBase> &decorated_reader) {
-  std::lock_guard<std::mutex> guard(mu_);
-  decorated_readers_.emplace_back(decorated_reader);
-}
-
-std::unordered_set<ReaderBase *> ReaderBase::GetEndPoints() {
-  std::unordered_set<ReaderBase *> result;
-  std::deque<ReaderBase *> queue;
-  queue.emplace_back(this);
-  while (!queue.empty()) {  // BFS search
-    auto *front = queue.front();
-    queue.pop_front();
-    if (front->decorated_readers_.empty()) {
-      result.emplace(front);
-    } else {
-      for (auto &reader : front->decorated_readers_) {
-        if (auto *reader_ptr = reader.lock().get()) {
-          queue.emplace_back(reader_ptr);
-        }
-      }
-    }
-  }
-
-  return result;
-}
-
-void ReaderBase::Shutdown() {
-  std::lock_guard<std::mutex> lock(mu_);
-  if (status_ != ReaderStatus::kStopped) {
-    ShutdownImpl();
-    status_ = ReaderStatus::kStopped;
-  }
-}
-
-void ReaderBase::Start() {
-  std::lock_guard<std::mutex> lock(mu_);
-  if (status_ != ReaderStatus::kRunning) {
-    StartImpl();
-    status_ = ReaderStatus::kRunning;
-  }
-}
-
-ReaderBase::~ReaderBase() {}
-
-DecoratedReader::~DecoratedReader() {
-  VLOG(1) << "~DecoratedReader";
-  reader_->Shutdown();
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
deleted file mode 100644
index 4b400e72a4cacd3848b57ac3ba2b3ef5f9a9a9c4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/reader.h
+++ /dev/null
@@ -1,151 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-class ReaderBase {
- public:
-  virtual void ReadNext(std::vector<LoDTensor>* out);
-
-  virtual void Shutdown();
-
-  virtual void Start();
-
-  // Return the readers which are the end of decorating chain. Basically
-  // they are readers just before read op.
-  std::unordered_set<ReaderBase*> GetEndPoints();
-
-  virtual ~ReaderBase();
-
- protected:
-  virtual void ReadNextImpl(std::vector<LoDTensor>* out) {}
-
-  virtual void ShutdownImpl() {}
-
-  virtual void StartImpl() {}
-
-  enum ReaderStatus { kRunning, kStopped };
-
-  ReaderStatus status_{kRunning};
-
-  mutable std::mutex mu_;
-
- private:
-  friend class DecoratedReader;
-  // These methods can be only invoked inside DecoratedReader to record the
-  // decorating chain.
-  void InsertDecoratedReader(
-      const std::shared_ptr<ReaderBase>& decorated_reader);
-  // A set of which readers that decorated this reader.
-  std::vector<std::weak_ptr<ReaderBase>> decorated_readers_;
-};
-
-class DecoratedReader : public ReaderBase,
-                        public std::enable_shared_from_this<DecoratedReader> {
- public:
-  explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
-      : ReaderBase(), reader_(reader) {
-    PADDLE_ENFORCE_NOT_NULL(reader_);
-  }
-
-  void RegisterDecorateChain() {
-    reader_->InsertDecoratedReader(shared_from_this());
-  }
-
-  ~DecoratedReader();
-
- protected:
-  void ShutdownImpl() override {
-    VLOG(1) << "ShutdownImpl";
-    reader_->Shutdown();
-  }
-
-  void StartImpl() override { reader_->Start(); }
-
-  std::shared_ptr<ReaderBase> reader_;
-};
-
-// FileReader is just a conceptual class.
-class FileReader : public ReaderBase {};
-
-// The ReaderHolder is used as reader' unified wrapper,
-// making it easier to access different type reader in Variables.
-class ReaderHolder {
- public:
-  template <typename T>
-  void Reset(const std::shared_ptr<T>& reader) {
-    auto reader_base = std::dynamic_pointer_cast<ReaderBase>(reader);
-    PADDLE_ENFORCE_NOT_NULL(reader_base);
-    reader_ = reader_base;
-  }
-
-  ~ReaderHolder() { VLOG(1) << "~ReaderHolder"; }
-
-  const std::shared_ptr<ReaderBase>& Get() const { return reader_; }
-
-  void ReadNext(std::vector<LoDTensor>* out) {
-    PADDLE_ENFORCE_NOT_NULL(reader_);
-    reader_->ReadNext(out);
-  }
-
-  void ResetAll() {
-    VLOG(1) << "ResetAll";
-    auto end_readers = reader_->GetEndPoints();
-    for (auto* reader : end_readers) {
-      reader->Shutdown();
-    }
-    for (auto* reader : end_readers) {
-      reader->Start();
-    }
-  }
-
-  void Shutdown() {
-    VLOG(1) << "Shutdown";
-    PADDLE_ENFORCE_NOT_NULL(reader_);
-    reader_->Shutdown();
-  }
-
-  void Start() {
-    VLOG(1) << "start";
-    PADDLE_ENFORCE_NOT_NULL(reader_);
-    reader_->Start();
-  }
-
-  operator const std::shared_ptr<ReaderBase>&() const { return this->reader_; }
-
- private:
-  std::shared_ptr<ReaderBase> reader_;
-};
-
-template <typename T, typename... ARGS>
-inline std::shared_ptr<DecoratedReader> MakeDecoratedReader(ARGS&&... args) {
-  std::shared_ptr<DecoratedReader> reader(new T(std::forward<ARGS>(args)...));
-  reader->RegisterDecorateChain();
-  return reader;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/reader_test.cc b/paddle/fluid/framework/reader_test.cc
deleted file mode 100644
index d812417a38200bcfdbdeac78800190647510a144..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/reader_test.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/reader.h"
-#include <memory>
-#include "gtest/gtest.h"
-
-class StubDecoratedReader : public paddle::framework::DecoratedReader {
- public:
-  explicit StubDecoratedReader(const std::shared_ptr<ReaderBase> &reader)
-      : DecoratedReader(reader) {}
-
-  void ReadNextImpl(std::vector<paddle::framework::LoDTensor> *out) override {}
-};
-
-class StubRootReader : public paddle::framework::ReaderBase {
- public:
-  void ReadNextImpl(std::vector<paddle::framework::LoDTensor> *out) override {}
-};
-
-TEST(READER, decorate_chain) {
-  auto root = std::make_shared<StubRootReader>();
-  auto end_point1 =
-      paddle::framework::MakeDecoratedReader<StubDecoratedReader>(root);
-  auto end_point2 =
-      paddle::framework::MakeDecoratedReader<StubDecoratedReader>(root);
-
-  {
-    auto endpoints = root->GetEndPoints();
-    ASSERT_EQ(endpoints.size(), 2U);
-    ASSERT_NE(endpoints.count(end_point1.get()), 0UL);
-    ASSERT_NE(endpoints.count(end_point2.get()), 0UL);
-  }
-
-  {
-    auto end_point3 =
-        paddle::framework::MakeDecoratedReader<StubDecoratedReader>(root);
-    ASSERT_EQ(root->GetEndPoints().size(), 3U);
-  }
-  { ASSERT_EQ(root->GetEndPoints().size(), 2U); }
-}
diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h
deleted file mode 100644
index f8aa87519a2fc1a14765887e95c96883d7b4589f..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/rw_lock.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#if !defined(_WIN32)
-#include <pthread.h>
-#else
-#include <mutex>  // NOLINT
-#endif            // !_WIN32
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-#if !defined(_WIN32)
-struct RWLock {
-  RWLock() { pthread_rwlock_init(&lock_, nullptr); }
-
-  ~RWLock() { pthread_rwlock_destroy(&lock_); }
-
-  inline void RDLock() {
-    PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
-                      "acquire read lock failed");
-  }
-
-  inline void WRLock() {
-    PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
-                      "acquire write lock failed");
-  }
-
-  inline void UNLock() {
-    PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
-  }
-
- private:
-  pthread_rwlock_t lock_;
-};
-// TODO(paddle-dev): Support RWLock for WIN32 for correctness.
-#else
-// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
-// In windows, rw_lock seems like a hack. Use empty object and do nothing.
-struct RWLock {
-  // FIXME(minqiyang): use mutex here to do fake lock
-  inline void RDLock() { mutex_.lock(); }
-
-  inline void WRLock() { mutex_.lock(); }
-
-  inline void UNLock() { mutex_.unlock(); }
-
- private:
-  std::mutex mutex_;
-};
-#endif
-
-class AutoWRLock {
- public:
-  explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
-
-  ~AutoWRLock() { UnLock(); }
-
- private:
-  inline void Lock() { lock_->WRLock(); }
-
-  inline void UnLock() { lock_->UNLock(); }
-
- private:
-  RWLock* lock_;
-};
-
-class AutoRDLock {
- public:
-  explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
-
-  ~AutoRDLock() { UnLock(); }
-
- private:
-  inline void Lock() { lock_->RDLock(); }
-
-  inline void UnLock() { lock_->UNLock(); }
-
- private:
-  RWLock* lock_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc
deleted file mode 100644
index 16f9cbb65229f10912ee90436c3557aaaca169b8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/rw_lock_test.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/rw_lock.h"
-#include <gtest/gtest.h>
-#include <chrono>  // NOLINT
-#include <thread>  // NOLINT
-#include <vector>
-
-namespace f = paddle::framework;
-
-void f1(f::RWLock *lock) {
-  lock->RDLock();
-  lock->UNLock();
-}
-
-TEST(RWLOCK, read_read) {
-  f::RWLock lock;
-  lock.RDLock();
-  std::thread t1(f1, &lock);
-  std::thread t2(f1, &lock);
-  t1.join();
-  t2.join();
-  lock.UNLock();
-}
-
-void f2(f::RWLock *lock, std::vector<int> *result) {
-  lock->RDLock();
-  ASSERT_EQ(result->size(), 0UL);
-  lock->UNLock();
-}
-
-void f3(f::RWLock *lock, std::vector<int> *result) {
-  lock->WRLock();
-  result->push_back(1);
-  lock->UNLock();
-}
-
-TEST(RWLOCK, read_write) {
-  f::RWLock lock;
-  std::vector<int> result;
-
-  lock.RDLock();
-  std::thread t1(f2, &lock, &result);
-  t1.join();
-  std::thread t2(f3, &lock, &result);
-  std::this_thread::sleep_for(std::chrono::seconds(1));
-  ASSERT_EQ(result.size(), 0UL);
-  lock.UNLock();
-  t2.join();
-  ASSERT_EQ(result.size(), 1UL);
-}
-
-void f4(f::RWLock *lock, std::vector<int> *result) {
-  lock->RDLock();
-  ASSERT_EQ(result->size(), 1UL);
-  lock->UNLock();
-}
-
-TEST(RWLOCK, write_read) {
-  f::RWLock lock;
-  std::vector<int> result;
-
-  lock.WRLock();
-  std::thread t1(f4, &lock, &result);
-  std::this_thread::sleep_for(std::chrono::seconds(1));
-  result.push_back(1);
-  lock.UNLock();
-  t1.join();
-}
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
deleted file mode 100644
index afafff5218ccf95fdc4baf7282d4f2757a74ac9c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/scope.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/scope.h"
-
-#include <memory>  // for unique_ptr
-#include <queue>
-#include <set>
-#include <unordered_set>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/string/printf.h"
-
-DECLARE_bool(benchmark);
-
-DEFINE_bool(
-    eager_delete_scope, true,
-    "Delete local scope eagerly. It will reduce GPU memory usage but "
-    "slow down the destruction of variables.(around 1% performance harm)");
-
-// When in inference scenario, the scopes will not be written by two threads in
-// a mean time, but a scope may be read by multiple threads concurrently, and
-// the mutex will cause serious performance issue.
-// So the mutex is disabled when `ON_INFER`.
-#ifdef PADDLE_ON_INFERENCE
-#define SCOPE_KIDS_READER_LOCK
-#define SCOPE_KIDS_WRITER_LOCK
-#define SCOPE_VARS_READER_LOCK
-#define SCOPE_VARS_WRITER_LOCK
-#else
-#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
-#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
-#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
-#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
-#endif
-
-namespace paddle {
-namespace framework {
-
-Scope::~Scope() { DropKids(); }
-
-Scope& Scope::NewScope() const {
-  Scope* child = new Scope(this);
-  {
-    SCOPE_KIDS_WRITER_LOCK
-    kids_.push_back(child);
-  }
-  return *child;
-}
-
-std::unique_ptr<Scope> Scope::NewTmpScope() const {
-  return std::unique_ptr<Scope>(new Scope(this));
-}
-
-Variable* Scope::Var(const std::string& name) {
-  SCOPE_VARS_WRITER_LOCK
-  return VarInternal(name);
-}
-
-Variable* Scope::Var(std::string* name) {
-  SCOPE_VARS_WRITER_LOCK
-  auto new_name = std::to_string(reinterpret_cast<uintptr_t>(this)) + "." +
-                  std::to_string(vars_.size());
-  if (name != nullptr) {
-    *name = new_name;
-  }
-  return VarInternal(new_name);
-}
-
-Variable* Scope::FindVar(const std::string& name) const {
-  SCOPE_VARS_READER_LOCK
-  return FindVarInternal(name);
-}
-
-Variable* Scope::FindLocalVar(const std::string& name) const {
-  SCOPE_VARS_READER_LOCK
-  return FindVarLocally(name);
-}
-
-const Scope* Scope::FindScope(const Variable* var) const {
-  SCOPE_VARS_READER_LOCK
-  return FindScopeInternal(var);
-}
-
-void Scope::DropKids() {
-  SCOPE_KIDS_WRITER_LOCK
-  for (Scope* s : kids_) delete s;
-  kids_.clear();
-}
-
-bool Scope::HasKid(const Scope* scope) const {
-  SCOPE_KIDS_READER_LOCK
-  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
-  return it != this->kids_.end();
-}
-
-std::vector<std::string> Scope::LocalVarNames() const {
-  std::vector<std::string> known_vars;
-  {
-    SCOPE_VARS_READER_LOCK
-    known_vars.reserve(this->vars_.size());
-    for (auto& p : vars_) {
-      known_vars.emplace_back(p.first);
-    }
-  }
-  return known_vars;
-}
-
-void Scope::DeleteScope(Scope* scope) const {
-  SCOPE_KIDS_WRITER_LOCK
-  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
-  PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
-                 this, scope);
-  this->kids_.erase(it);
-  // When making memory benchmark on Fluid, we have to delete scope sync.
-  if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
-    delete scope;
-  } else {
-    Async([scope] { delete scope; });
-  }
-}
-
-void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  std::set<std::string> var_set(var_names.begin(), var_names.end());
-  SCOPE_VARS_WRITER_LOCK
-  for (auto it = vars_.begin(); it != vars_.end();) {
-    if (var_set.find(it->first) != var_set.end()) {
-      it = vars_.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-
-void Scope::Rename(const std::string& origin_name,
-                   const std::string& new_name) const {
-  SCOPE_VARS_WRITER_LOCK
-  RenameInternal(origin_name, new_name);
-}
-
-std::string Scope::Rename(const std::string& origin_name) const {
-  SCOPE_VARS_WRITER_LOCK
-  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
-  RenameInternal(origin_name, new_name);
-  return new_name;
-}
-
-Variable* Scope::VarInternal(const std::string& name) {
-  auto* v = FindVarLocally(name);
-  if (v != nullptr) return v;
-  v = new Variable();
-  vars_.emplace(name, std::unique_ptr<Variable>(v));
-  VLOG(3) << "Create variable " << name;
-  return v;
-}
-
-const Scope* Scope::FindScopeInternal(const Variable* var) const {
-  for (auto& kv : vars_) {
-    if (kv.second.get() == var) {
-      return this;
-    }
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
-}
-
-void Scope::RenameInternal(const std::string& origin_name,
-                           const std::string& new_name) const {
-  auto origin_it = vars_.find(origin_name);
-  PADDLE_ENFORCE(origin_it != vars_.end(),
-                 "Cannot find original variable with name %s", origin_name);
-  auto new_it = vars_.find(new_name);
-  PADDLE_ENFORCE(new_it == vars_.end(),
-                 "The variable with name %s is already in the scope", new_name);
-  vars_[new_name].reset(origin_it->second.release());
-  vars_.erase(origin_it);
-}
-
-Variable* Scope::FindVarInternal(const std::string& name) const {
-  auto var = FindVarLocally(name);
-  if (var != nullptr) {
-    return var;
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
-}
-
-Variable* Scope::FindVarLocally(const std::string& name) const {
-  auto it = vars_.find(name);
-  if (it != vars_.end()) return it->second.get();
-  return nullptr;
-}
-
-void Scope::EraseVarsExcept(const std::unordered_set<Variable*>& vars) {
-  SCOPE_VARS_WRITER_LOCK
-  for (auto iter = vars_.begin(); iter != vars_.end();) {
-    if (vars.count(iter->second.get()) != 0) {
-      ++iter;
-    } else {
-      vars_.erase(iter++);
-    }
-  }
-}
-
-std::string GenScopeTreeDebugInfo(Scope* root) {
-  std::stringstream os;
-
-  if (!root) return "";
-
-  // level traversal
-  std::queue<Scope*> queue;
-  queue.push(root);
-
-  std::vector<Scope*> scopes;
-
-  while (!queue.empty()) {
-    auto* end = queue.back();
-    Scope* q = nullptr;
-    while (q != end) {
-      q = queue.front();
-      queue.pop();
-      os << q << " ";
-      scopes.push_back(q);
-
-      for (auto* c : q->kids()) {
-        queue.push(c);
-      }
-    }
-    // end of a level
-    os << "\n------------------------------------------\n";
-  }
-
-  os << "\nDetails:\n\n";
-
-  for (Scope* q : scopes) {
-    os << "====\n";
-    os << q << ":\n";
-    for (auto& var : q->LocalVarNames()) {
-      os << "  - " << var << "\n";
-    }
-  }
-
-  return os.str();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
deleted file mode 100644
index d3e2f33d2e3788c7ed1ff9a77d2936ca0d32c767..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/scope.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-extern "C" {
-#include <xxhash.h>
-}
-
-#include <list>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/rw_lock.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace framework {
-
-class Scope;
-
-/**
- * @brief Scope that manage all variables.
- *
- * Scope is an association of a name to Variable. All variables belong to
- * Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`.
- * One net can run in different scopes and update different variable in the
- * scope.
- */
-class Scope {
- public:
-  Scope() {}
-  ~Scope();
-
-  /// Create a sub-scope. Returns a reference other than a pointer so
-  /// to prevent from manual deletion.
-  /// Mark it to const because that new kid scope cannot change parent scope.
-  Scope& NewScope() const;
-
-  /// Create a sub-scope for current scope but do not record it in the kids to
-  /// avoid performance problems.
-  std::unique_ptr<Scope> NewTmpScope() const;
-
-  /// Create a variable with given name if it doesn't exist.
-  /// Caller doesn't own the returned Variable.
-  Variable* Var(const std::string& name);
-
-  /// Create a variable with a scope-unique name.
-  /// Caller doesn't own the returned Variable.
-  Variable* Var(std::string* name = nullptr);
-
-  void EraseVars(const std::vector<std::string>& var_names);
-
-  // Erase all variables except the given `vars`
-  void EraseVarsExcept(const std::unordered_set<Variable*>& vars);
-
-  /// Find a variable in the scope or any of its ancestors.  Returns
-  /// nullptr if cannot find.
-  /// Caller doesn't own the returned Variable.
-  Variable* FindVar(const std::string& name) const;
-
-  /// Find a variable in the current scope.
-  /// Return nullptr if cannot find.
-  /// Caller doesn't own the returned Variable.
-  Variable* FindLocalVar(const std::string& name) const;
-
-  const Scope* parent() const { return parent_; }
-
-  /// Find the scope or an ancestor scope that contains the given variable.
-  const Scope* FindScope(const Variable* var) const;
-
-  void DeleteScope(Scope* scope) const;
-
-  /// Drop all kids scopes belonged to this scope.
-  void DropKids();
-
-  /// Find if a scope exists in the kid scopes
-  bool HasKid(const Scope* scope) const;
-
-  const std::list<Scope*>& kids() const { return kids_; }
-
-  // enumerate all the variables current contains.
-  std::vector<std::string> LocalVarNames() const;
-
-  // Rename variable to a new name
-  void Rename(const std::string& origin_name,
-              const std::string& new_name) const;
-
-  // Rename variable to a new name and return the new name
-  std::string Rename(const std::string& origin_name) const;
-
- protected:
-  struct KeyHasher {
-    std::size_t operator()(const std::string& key) const {
-      return XXH32(key.c_str(), key.size(), 1);
-    }
-  };
-
-  mutable std::unordered_map<std::string, std::unique_ptr<Variable>, KeyHasher>
-      vars_;
-
- private:
-  // Call Scope::NewScope for a sub-scope.
-  explicit Scope(Scope const* parent) : parent_(parent) {}
-
-  // Called by Var.
-  Variable* VarInternal(const std::string& name);
-
-  // Called by FindScope.
-  const Scope* FindScopeInternal(const Variable* var) const;
-
-  // Called by Rename.
-  void RenameInternal(const std::string& origin_name,
-                      const std::string& new_name) const;
-
-  // Called by FindVar recursively.
-  Variable* FindVarInternal(const std::string& name) const;
-
-  // Called by FindVarInternal and Var.
-  Variable* FindVarLocally(const std::string& name) const;
-
-  // Scope in `kids_` are owned by this class.
-  mutable std::list<Scope*> kids_;
-  const Scope* parent_{nullptr};
-
-  DISABLE_COPY_AND_ASSIGN(Scope);
-
-#ifndef PADDLE_ON_INFERENCE
-
- private:
-  mutable RWLock kids_lock_;
-  mutable RWLock vars_lock_;
-#endif
-};
-
-// Generate some debug string about the inherience structure of scope, quite
-// naive.
-std::string GenScopeTreeDebugInfo(Scope*);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc
deleted file mode 100644
index 5cb241a7a341d793d8450f0c9cde3929acef8965..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/scope_pool.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/scope_pool.h"
-#include "paddle/fluid/framework/threadpool.h"
-
-namespace paddle {
-namespace framework {
-
-ScopePool &ScopePool::Instance() {  // NOLINT
-  static ScopePool pool;
-  return pool;
-}
-
-void ScopePool::DeleteScope(Scope *scope) { delete scope; }
-
-void ScopePool::Insert(std::unique_ptr<Scope> &&s) {
-  std::lock_guard<std::mutex> guard(mtx_);
-  scopes_.insert(s.release());
-}
-
-void ScopePool::Remove(Scope *s) {
-  size_t has_scope;
-  {
-    std::lock_guard<std::mutex> guard(mtx_);
-    has_scope = scopes_.erase(s);
-  }
-  PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope");
-  DeleteScope(s);
-}
-
-ScopePool::~ScopePool() { Clear(); }
-
-void ScopePool::Clear() {
-  std::lock_guard<std::mutex> guard(mtx_);
-  for (auto *s : scopes_) {
-    DeleteScope(s);
-  }
-  scopes_.clear();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/scope_pool.h b/paddle/fluid/framework/scope_pool.h
deleted file mode 100644
index a8b468699abe148d44a395cf888158aefab4380b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/scope_pool.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <mutex>  // NOLINT
-#include <unordered_set>
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-
-class ScopePool {
- public:
-  static ScopePool &Instance();  // NOLINT
-
-  void Insert(std::unique_ptr<Scope> &&s);
-
-  void Remove(Scope *s);
-
-  void Clear();
-
-  ~ScopePool();
-
- private:
-  ScopePool() = default;
-
-  static void DeleteScope(Scope *scope);
-
-  std::unordered_set<Scope *> scopes_;
-  std::mutex mtx_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/scope_test.cc b/paddle/fluid/framework/scope_test.cc
deleted file mode 100644
index ebf8178a8319cd33f2cc5eacb95b163043c986b5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/scope_test.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/scope.h"
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-
-using paddle::framework::Scope;
-using paddle::framework::Variable;
-
-TEST(Scope, VarsShadowing) {
-  Scope s;
-  Scope& ss1 = s.NewScope();
-  Scope& ss2 = s.NewScope();
-
-  Variable* v0 = s.Var("a");
-  Variable* v1 = ss1.Var("a");
-
-  EXPECT_NE(v0, v1);
-
-  EXPECT_EQ(v0, s.FindVar("a"));
-  EXPECT_EQ(v1, ss1.FindVar("a"));
-  EXPECT_EQ(v0, ss2.FindVar("a"));
-}
-
-TEST(Scope, FindVar) {
-  Scope s;
-  Scope& ss = s.NewScope();
-
-  EXPECT_EQ(nullptr, s.FindVar("a"));
-  EXPECT_EQ(nullptr, ss.FindVar("a"));
-
-  ss.Var("a");
-
-  EXPECT_EQ(nullptr, s.FindVar("a"));
-  EXPECT_NE(nullptr, ss.FindVar("a"));
-}
-
-TEST(Scope, FindScope) {
-  Scope s;
-  Scope& ss = s.NewScope();
-  Variable* v = s.Var("a");
-
-  EXPECT_EQ(&s, s.FindScope(v));
-  EXPECT_EQ(&s, ss.FindScope(v));
-}
-
-TEST(Scope, GetAllNames) {
-  Scope s;
-  Variable* v = s.Var("a");
-  EXPECT_EQ(&s, s.FindScope(v));
-
-  std::vector<std::string> ans = s.LocalVarNames();
-  std::string str;
-  for (auto& var : ans) {
-    str += var;
-  }
-
-  EXPECT_STREQ("a", str.c_str());
-}
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
deleted file mode 100644
index c1a404c1cb268df047a4a8b117251b53b4e3f607..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/section_worker.cc
+++ /dev/null
@@ -1,411 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/message.h"
-#include "google/protobuf/text_format.h"
-
-#include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/trainer_desc.pb.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/lodtensor_printer.h"
-
-namespace paddle {
-namespace framework {
-
-uint64_t SyncFunctor::sync_flag_ = 0;
-std::vector<Scope*> SyncFunctor::pipeline_scopes_;
-
-SyncFunctor::SyncFunctor(int rank_id, int rank_num, int sync_steps)
-    : rank_id_(rank_id), rank_num_(rank_num), sync_steps_(sync_steps) {
-  PADDLE_ENFORCE(rank_num > 1, "rank_num should larger than 1");
-  counter_ = 0;
-  sync_signal_ = 0;
-  uint8_t* ptr = reinterpret_cast<uint8_t*>(&sync_signal_);
-  for (int i = 0; i < rank_num_; ++i) {
-    ptr[i] = 0xFF;
-  }
-}
-
-int SyncFunctor::operator()(Scope* scope) {
-  ++counter_;
-  if (counter_ < sync_steps_) {
-    return 0;
-  }
-  if (counter_ == sync_steps_) {
-    reinterpret_cast<uint8_t*>(&sync_flag_)[rank_id_] = 0xFF;
-  }
-
-  if (sync_flag_ == sync_signal_) {
-    static std::mutex mutex;
-    if (mutex.try_lock()) {
-      if (sync_flag_ == sync_signal_) {
-        Synchronize();
-        sync_flag_ = 0;
-      }
-      mutex.unlock();
-    }
-  }
-
-  if (sync_flag_ == 0) {
-    counter_ = 0;
-  }
-  return 0;
-}
-
-void SyncFunctor::Synchronize() {
-  for (const std::string& name : *sync_param_) {
-    platform::NCCLGroupGuard guard;
-    for (int i = 0; i < rank_num_; ++i) {
-      const platform::NCCLContext& nccl_ctx = nccl_ctx_map_->at(i);
-      LoDTensor* tensor =
-          pipeline_scopes_[i]->Var(name)->GetMutable<LoDTensor>();
-      // TODO(hutuxian): do not depend on data type explicitly
-      float* data =
-          tensor->mutable_data<float>(nccl_ctx_map_->DevCtx(i)->GetPlace());
-      const int numel = tensor->numel();
-
-      paddle::framework::AttributeMap attrs;
-      attrs.insert({"scale", static_cast<float>(1. / rank_num_)});
-      auto scale_op = framework::OpRegistry::CreateOp("scale", {{"X", {name}}},
-                                                      {{"Out", {name}}}, attrs);
-      scale_op->Run(*(pipeline_scopes_[i]),
-                    nccl_ctx_map_->DevCtx(i)->GetPlace());
-      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-          data, data, numel, ncclFloat, ncclSum, nccl_ctx.comm(),
-          dynamic_cast<platform::CUDADeviceContext*>(
-              platform::DeviceContextPool::Instance().Get(
-                  platform::CUDAPlace(i)))
-              ->stream()));
-    }
-  }
-  nccl_ctx_map_->WaitAll();
-}
-
-std::atomic<int> SectionWorker::cpu_id_(0);
-void SectionWorker::Initialize(const TrainerDesc& trainer_desc) {
-  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);
-  std::shared_ptr<framework::ProgramDesc> program;
-  program.reset(new ProgramDesc(
-      trainer_desc.section_param().section_config(section_id_).program_desc()));
-  for (auto& op_desc : program->Block(0).AllOps()) {
-    ops_.push_back(OpRegistry::CreateOp(*op_desc));
-  }
-}
-
-void SectionWorker::AutoSetCPUAffinity(bool reuse) {
-  int thread_cpu_id = cpu_id_.fetch_add(1);
-
-  unsigned concurrency_cap = std::thread::hardware_concurrency();
-  unsigned proc = thread_cpu_id;
-
-  if (proc >= concurrency_cap) {
-    if (reuse) {
-      proc %= concurrency_cap;
-    } else {
-      LOG(INFO) << "All " << concurrency_cap
-                << " CPUs have been set affinities. Fail to set "
-                << thread_cpu_id << "th thread";
-      return;
-    }
-  }
-
-  cpu_set_t mask;
-  CPU_ZERO(&mask);
-  CPU_SET(proc, &mask);
-
-  if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) {
-    LOG(WARNING) << "Fail to set thread affinity to CPU " << proc;
-    return;
-  }
-
-  CPU_ZERO(&mask);
-  if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) ||
-      (0 == CPU_ISSET(proc, &mask))) {
-    LOG(WARNING) << "Fail to set thread affinity to CPU " << proc;
-  }
-  SEC_LOG << "Set " << thread_cpu_id << "th thread affinity to CPU " << proc;
-}
-
-void SectionWorker::TrainFiles() {
-  SEC_LOG << "begin section_worker TrainFiles";
-  AutoSetCPUAffinity(true);
-
-  int64_t step_cnt = 0;
-  int64_t accum_num = 0;
-  int batch_size = 0;
-  Scope* scope = nullptr;
-  while (in_scope_queue_->Receive(&scope)) {
-    if (device_reader_ != nullptr) {
-      device_reader_->AssignFeedVar(*scope);
-      batch_size = device_reader_->Next();
-      if (batch_size <= 0) {
-        break;
-      }
-      SEC_LOG << "read batch size: " << batch_size;
-    } else {
-      // TODO(hutuxian): Keep batch_size in scope? Or is there a better way to
-      // fetch batch_size? Some variables may not have batch_size.
-      PADDLE_ENFORCE(
-          in_var_names_->size(),
-          "Section without a reader or in variable is not supported by now");
-      const LoDTensor& tensor =
-          scope->FindVar(in_var_names_->at(0))->Get<LoDTensor>();
-      batch_size =
-          tensor.lod().size() ? tensor.lod()[0].size() - 1 : tensor.dims()[0];
-      SEC_LOG << "input batch size: " << batch_size;
-    }
-
-    Scope* exe_scope = scope;
-    if (section_id_ > 0 && platform::is_gpu_place(place_)) {
-      SEC_LOG << "CPU2GPU memory copy";
-
-      if (scope->kids().empty()) {
-        exe_scope = &scope->NewScope();
-      } else {
-        exe_scope = scope->kids().front();
-        PADDLE_ENFORCE(scope->kids().size() == 1, "scope->kids().size(): %zu",
-                       scope->kids().size());
-      }
-
-      for (const std::string& name : *in_var_names_) {
-        const LoDTensor& src_tensor = scope->FindVar(name)->Get<LoDTensor>();
-        if (platform::is_gpu_place(src_tensor.place())) {
-          continue;
-        }
-        LoDTensor* gpu_tensor = exe_scope->Var(name)->GetMutable<LoDTensor>();
-        gpu_tensor->set_lod(src_tensor.lod());
-        TensorCopy(*static_cast<const Tensor*>(&src_tensor), place_, *dev_ctx_,
-                   static_cast<Tensor*>(gpu_tensor));
-      }
-    }
-
-    SEC_LOG << "begin running ops";
-
-    for (auto& op : ops_) {
-      op->Run(*exe_scope, place_);
-    }
-    exe_scope->DropKids();
-    // Wait for GPU calc finising, as the cudaMemcpy and GPU calc may be in
-    // different streams
-    // No effect when it is a CPUDeviceContext
-    dev_ctx_->Wait();
-
-    if (section_id_ != section_num_ - 1 && platform::is_gpu_place(place_)) {
-      // FIXME: Temporarily we assume two adjacent sections are in different
-      // places,
-      // and we do data transformation only in sections in GPU place, so the
-      // data is
-      // transform from GPU to CPU
-      // A better way to handle such a data transformation is to record each
-      // place of
-      // joint-out variables, and do transform as required
-
-      SEC_LOG << "GPU2CPU memory copy";
-
-      for (const std::string& name : *out_var_names_) {
-        const LoDTensor& src_tensor =
-            exe_scope->FindVar(name)->Get<LoDTensor>();
-        LoDTensor* dst_tensor = scope->Var(name)->GetMutable<LoDTensor>();
-        dst_tensor->set_lod(src_tensor.lod());
-        TensorCopy(*static_cast<const Tensor*>(&src_tensor),
-                   next_section_place_, *dev_ctx_,
-                   static_cast<Tensor*>(dst_tensor));
-      }
-    }
-
-    out_scope_queue_->Send(scope);
-
-    if (sync_func_) {
-      (*sync_func_)(scope);
-    }
-
-    ++step_cnt;
-    accum_num += batch_size;
-  }
-
-  worker_count_mutex_->lock();
-  --(*worker_count_);
-  worker_count_mutex_->unlock();
-
-  if (*worker_count_ <= 0) {
-    while (section_id_ < section_num_ - 1 && out_scope_queue_->Size()) {
-      sleep(1);
-    }
-    out_scope_queue_->Close();
-  }
-}
-
-void SectionWorker::TrainFilesWithProfiler() {
-  SEC_LOG << "begin section_worker TrainFiles with profiler";
-  AutoSetCPUAffinity(true);
-
-  int64_t step_cnt = 0;
-  int64_t accum_num = 0;
-  int batch_size = 0;
-  Scope* scope = nullptr;
-
-  platform::Timer reader_timer;
-  platform::Timer cal_timer;
-  platform::Timer trans_timer;
-  platform::Timer sync_timer;
-  platform::Timer main_timer;
-  platform::Timer outer_timer;
-
-  std::vector<double> op_total_time;
-  std::vector<std::string> op_name;
-  for (auto& op : ops_) {
-    op_name.push_back(op->Type());
-  }
-  op_total_time.resize(ops_.size());
-  for (size_t i = 0; i < op_total_time.size(); ++i) {
-    op_total_time[i] = 0.0;
-  }
-  platform::Timer timeline;
-
-  bool started = false;
-  while (in_scope_queue_->Receive(&scope)) {
-    if (UNLIKELY(!started)) {
-      outer_timer.Start();
-      started = true;
-    }
-    main_timer.Resume();
-
-    if (device_reader_ != nullptr) {
-      reader_timer.Resume();
-      device_reader_->AssignFeedVar(*scope);
-      batch_size = device_reader_->Next();
-      reader_timer.Pause();
-      if (batch_size <= 0) {
-        break;
-      }
-      SEC_LOG << "read batch size: " << batch_size;
-    } else {
-      PADDLE_ENFORCE(
-          in_var_names_->size(),
-          "Section without a reader or in variable is not supported by now");
-      const LoDTensor& tensor =
-          scope->FindVar(in_var_names_->at(0))->Get<LoDTensor>();
-      batch_size =
-          tensor.lod().size() ? tensor.lod()[0].size() - 1 : tensor.dims()[0];
-      SEC_LOG << "input batch size: " << batch_size;
-    }
-
-    Scope* exe_scope = scope;
-    if (section_id_ > 0 && platform::is_gpu_place(place_)) {
-      SEC_LOG << "CPU2GPU memory copy";
-      trans_timer.Resume();
-      if (scope->kids().empty()) {
-        exe_scope = &scope->NewScope();
-      } else {
-        exe_scope = scope->kids().front();
-        PADDLE_ENFORCE(scope->kids().size() == 1, "scope->kids().size(): %zu",
-                       scope->kids().size());
-      }
-
-      for (const std::string& name : *in_var_names_) {
-        const LoDTensor& src_tensor = scope->FindVar(name)->Get<LoDTensor>();
-        if (platform::is_gpu_place(src_tensor.place())) {
-          continue;
-        }
-        LoDTensor* gpu_tensor = exe_scope->Var(name)->GetMutable<LoDTensor>();
-        gpu_tensor->set_lod(src_tensor.lod());
-        TensorCopy(*static_cast<const Tensor*>(&src_tensor), place_, *dev_ctx_,
-                   static_cast<Tensor*>(gpu_tensor));
-      }
-      trans_timer.Pause();
-    }
-
-    SEC_LOG << "begin running ops";
-    cal_timer.Resume();
-    int op_id = 0;
-    for (auto& op : ops_) {
-      timeline.Start();
-      op->Run(*exe_scope, place_);
-      timeline.Pause();
-      op_total_time[op_id++] += timeline.ElapsedUS();
-    }
-    exe_scope->DropKids();
-    // Wait for GPU calc finising, as the cudaMemcpy and GPU calc may be in
-    // different streams
-    // No effect when it is a CPUDeviceContext
-    dev_ctx_->Wait();
-    cal_timer.Pause();
-
-    if (section_id_ != section_num_ - 1 && platform::is_gpu_place(place_)) {
-      // FIXME: Temporarily we assume two adjacent sections are in different
-      // places,
-      // and we do data transformation only in sections in GPU place, so the
-      // data is
-      // transform from GPU to CPU
-      // A better way to handle such a data transformation is to record each
-      // place of
-      // joint-out variables, and do transform as required
-
-      SEC_LOG << "GPU2CPU memory copy";
-      trans_timer.Resume();
-      for (const std::string& name : *out_var_names_) {
-        const LoDTensor& src_tensor =
-            exe_scope->FindVar(name)->Get<LoDTensor>();
-        LoDTensor* dst_tensor = scope->Var(name)->GetMutable<LoDTensor>();
-        dst_tensor->set_lod(src_tensor.lod());
-        TensorCopy(*static_cast<const Tensor*>(&src_tensor),
-                   next_section_place_, *dev_ctx_,
-                   static_cast<Tensor*>(dst_tensor));
-      }
-      trans_timer.Pause();
-    }
-
-    out_scope_queue_->Send(scope);
-
-    if (sync_func_) {
-      sync_timer.Resume();
-      (*sync_func_)(scope);
-      sync_timer.Pause();
-    }
-
-    ++step_cnt;
-    accum_num += batch_size;
-    main_timer.Pause();
-  }
-  outer_timer.Pause();
-
-  worker_count_mutex_->lock();
-  --(*worker_count_);
-  worker_count_mutex_->unlock();
-
-  if (*worker_count_ <= 0) {
-    while (section_id_ < section_num_ - 1 && out_scope_queue_->Size()) {
-      sleep(1);
-    }
-    out_scope_queue_->Close();
-  }
-  LOG(ERROR) << "log_for_profile"
-             << " card:" << pipeline_id_ << " thread:" << thread_id_
-             << " section:" << section_id_ << " step_count:" << step_cnt
-             << " batch_count:" << accum_num
-             << " read_time:" << reader_timer.ElapsedUS()
-             << " trans_time:" << trans_timer.ElapsedUS()
-             << " cal_time:" << cal_timer.ElapsedUS()
-             << " sync_time:" << sync_timer.ElapsedUS()
-             << " main_time:" << main_timer.ElapsedUS()
-             << " outer_time:" << outer_timer.ElapsedUS();
-  for (size_t i = 0; i < ops_.size(); ++i) {
-    LOG(ERROR) << "op: " << op_name[i]
-               << ", mean time: " << op_total_time[i] / accum_num;
-  }
-}
-}  // namespace framework
-}  // namespace paddle
-#endif
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
deleted file mode 100644
index 54a818250b45e593de4110f56e42a04a9ea65e00..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/selected_rows.cc
+++ /dev/null
@@ -1,234 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/selected_rows.h"
-
-namespace paddle {
-namespace framework {
-
-struct ReAllocateVisitor {
-  ReAllocateVisitor(const framework::DDim& dims, framework::Tensor* tensor)
-      : dims_(dims), tensor_(tensor) {}
-
-  template <typename T>
-  void operator()() const {
-    framework::Tensor cpu_tensor;
-    platform::CPUPlace cpu;
-    T* ptr = cpu_tensor.mutable_data<T>(dims_, cpu);
-    const T* old_ptr =
-        tensor_->memory_size() == 0 ? nullptr : tensor_->data<T>();
-    if (old_ptr != nullptr) {
-      std::copy(old_ptr, old_ptr + tensor_->numel(), ptr);
-    }
-    tensor_->ShareDataWith(cpu_tensor);
-  }
-
-  framework::DDim dims_;
-  framework::Tensor* tensor_;
-};
-
-struct TensorCopyVisitor {
-  TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset,
-                    const framework::Tensor src, int64_t src_offset,
-                    int64_t size)
-      : dst_(dst),
-        dst_offset_(dst_offset),
-        src_(src),
-        src_offset_(src_offset),
-        size_(size) {}
-
-  template <typename T>
-  void apply() const {
-    // TODO(Yancey1989): support other place
-    platform::CPUPlace cpu;
-    memory::Copy(cpu, dst_->mutable_data<T>(cpu) + dst_offset_, cpu,
-                 src_.data<T>() + src_offset_, size_ * sizeof(T));
-  }
-
-  framework::Tensor* dst_;
-  int64_t dst_offset_;
-  framework::Tensor src_;
-  int64_t src_offset_;
-  int64_t size_;
-};
-
-struct TensorFillVisitor {
-  TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size,
-                    float value)
-      : dst_(dst), dst_offset_(dst_offset), size_(size) {}
-
-  template <typename T>
-  void apply() const {
-    // TODO(qiao): support other place
-    platform::CPUPlace cpu;
-    auto* tensor_data = dst_->mutable_data<T>(cpu);
-    auto* start = tensor_data + dst_offset_;
-    auto* end = start + size_;
-    std::fill(start, end, static_cast<T>(0.0));
-  }
-
-  framework::Tensor* dst_;
-  int64_t dst_offset_;
-  int64_t size_;
-};
-
-void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
-                       const platform::DeviceContext& dev_ctx) {
-  {  // the 1st field, uint32_t version
-    constexpr uint32_t version = 0;
-    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
-  }
-  {
-    // the 2st field, rows information
-    auto& rows = selected_rows.rows();
-    uint64_t size = rows.size();
-    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
-    for (uint64_t i = 0; i < size; ++i) {
-      os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
-    }
-  }
-  {
-    // the 3st field, the height of SelectedRows
-    int64_t height = selected_rows.height();
-    os.write(reinterpret_cast<const char*>(&height), sizeof(height));
-  }
-  // the 4st field, Tensor data
-  TensorToStream(os, selected_rows.value(), dev_ctx);
-}
-
-void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
-                           const platform::DeviceContext& dev_ctx) {
-  {
-    // the 1st field, unit32_t version for SelectedRows
-    uint32_t version;
-    is.read(reinterpret_cast<char*>(&version), sizeof(version));
-    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  }
-  {
-    // the 2st field, rows information
-    uint64_t size;
-    is.read(reinterpret_cast<char*>(&size), sizeof(size));
-    auto& rows = *selected_rows->mutable_rows();
-    rows.resize(size);
-    for (uint64_t i = 0; i < size; ++i) {
-      is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
-    }
-  }
-  {
-    // the 3st field, the height of the SelectedRows
-    int64_t height;
-    is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
-    selected_rows->set_height(height);
-  }
-  // the 4st field, tensor which contains the data
-  TensorFromStream(is, selected_rows->mutable_value(), dev_ctx);
-}
-
-bool SelectedRows::HasKey(int64_t key) const {
-  return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false
-                                                                   : true;
-}
-
-int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown,
-                                     bool is_test) {
-  if (is_test) {
-    auto iter = id_to_index_.find(key);
-    if (iter == id_to_index_.end()) {
-      return -1;
-    } else {
-      return iter->second;
-    }
-  }
-
-  rwlock_->RDLock();
-  auto iter = id_to_index_.find(key);
-  if (iter == id_to_index_.end()) {
-    rwlock_->UNLock();
-    if (!auto_grown) {
-      PADDLE_THROW("key %d not found", key);
-    }
-    rwlock_->WRLock();
-    auto map_size = id_to_index_.size();
-    auto vector_size = rows_.size();
-    if (map_size != vector_size) {
-      rwlock_->UNLock();
-      PADDLE_THROW(
-          "id_to_index_ size %d should have the same size with rows_ %d",
-          map_size, vector_size);
-    }
-    auto write_iter = id_to_index_.find(key);
-    if (write_iter == id_to_index_.end()) {
-      int row_num = rows_.size();
-      if (row_num == value_->dims()[0]) {
-        rwlock_->UNLock();
-        PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
-      }
-      // key logic to put a key into id_to_index_
-      rows_.push_back(key);
-      auto index = static_cast<int64_t>(rows_.size() - 1);
-      id_to_index_[key] = index;
-      rwlock_->UNLock();
-      return index;
-    } else {
-      auto index = write_iter->second;
-      rwlock_->UNLock();
-      return index;
-    }
-  } else {
-    auto index = iter->second;
-    rwlock_->UNLock();
-    return index;
-  }
-}
-
-void SelectedRows::SyncIndex() {
-  rwlock_->WRLock();
-  id_to_index_.clear();
-  for (size_t i = 0; i < rows_.size(); ++i) {
-    id_to_index_[rows_[i]] = i;
-  }
-  rwlock_->UNLock();
-}
-
-void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
-                       bool auto_grown, bool is_test) {
-  PADDLE_ENFORCE(value->IsInitialized(),
-                 "The value tensor should be initialized.");
-  if (ids.numel() == 0) {
-    VLOG(3) << "keys is empty, please check data!";
-  } else {
-    int64_t value_width = value_->numel() / value_->dims()[0];
-    PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
-                      "output tensor should have the same shape with table "
-                      "except the dims[0].");
-    for (int i = 0; i < ids.numel(); ++i) {
-      auto id = ids.data<int64_t>()[i];
-      int64_t index = AutoGrownIndex(id, auto_grown, is_test);
-      if (index < 0) {
-        VLOG(5) << "id " << id << " not in the table, return 0";
-        framework::VisitDataType(
-            value_->type(),
-            TensorFillVisitor(value, i * value_width, value_width, 0.0));
-      } else {
-        framework::VisitDataType(
-            value_->type(),
-            TensorCopyVisitor(value, i * value_width, *value_.get(),
-                              index * value_width, value_width));
-      }
-    }
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
deleted file mode 100644
index e1bdba9b46a4cbdb664b70c7419f567ef95bdf31..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/selected_rows.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/rw_lock.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/memory/memcpy.h"
-
-namespace paddle {
-namespace framework {
-
-class SelectedRows {
-  /*
-   * @brief We can use the SelectedRows structure to reproduce a sparse table.
-   *  A sparse table is a key-value structure that the key is an `int64_t`,
-   *  and the value is a Tensor which the first dimension is 0.
-   *  You can use the following interface to operate the sparse table, and you
-   * can find
-   *  some detail information from the comments of each interface:
-   *
-   *  HasKey(key), whether the sparse table has the specified key.
-   *  Set(key, value), set a key-value pair into the sparse table.
-   *  Get(keys, value*), get value by given key list and apply it to the given
-   * value pointer
-   *    with the specified offset.
-   *
-   */
- public:
-  SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
-      : rows_(rows), height_(height) {
-    value_.reset(new Tensor());
-    rwlock_.reset(new RWLock);
-  }
-
-  SelectedRows() {
-    height_ = 0;
-    value_.reset(new Tensor());
-    rwlock_.reset(new RWLock);
-  }
-
-  platform::Place place() const { return value_->place(); }
-
-  const Tensor& value() const { return *value_; }
-
-  Tensor* mutable_value() { return value_.get(); }
-
-  int64_t height() const { return height_; }
-
-  void set_height(int64_t height) { height_ = height; }
-
-  const Vector<int64_t>& rows() const { return rows_; }
-
-  Vector<int64_t>* mutable_rows() { return &rows_; }
-
-  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
-
-  /*
-   * @brief Get the index of key in rows
-   *
-   * @return -1 if the key does not exists.
-   */
-  int64_t Index(int64_t key) const {
-    auto it = std::find(rows_.begin(), rows_.end(), key);
-    if (it == rows_.end()) {
-      PADDLE_THROW("id %s not in table", key);
-    }
-    return static_cast<int64_t>(std::distance(rows_.begin(), it));
-  }
-
-  /*
-   * @brief whether has the specified key in the table.
-   *
-   * @return true if the key is exists.
-   */
-  bool HasKey(int64_t key) const;
-
-  /*
-   * @brief Get value by the key list.
-   * Note!!! this interface is only used when selected_rows is used as
-   * parameters
-   * for distribute lookup table.
-   *
-   * @return a list of pair which contains the non-exists key and the index in
-   * the value
-   */
-  void Get(const framework::Tensor& ids, framework::Tensor* value,
-           bool auto_grown = false, bool is_test = false);
-
-  /*
-   * @brief Get the index of the key from id_to_index_ map. If the key not
-   * exist,
-   * add the key into id_to_index_.
-   *
-   * Note!!! this interface is only used when selected_rows is used as
-   * parameters
-   * for distribute lookup table.
-   *
-   * @return index of the key.
-   */
-  int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);
-
-  /*
-   * @brief Get the index of the key from id_to_index_ map.
-   */
-  inline int64_t GetIndexFromId(int64_t key) {
-    auto iter = id_to_index_.find(key);
-    if (iter == id_to_index_.end()) {
-      return -1;
-    } else {
-      return iter->second;
-    }
-  }
-
-  void SyncIndex();
-  /*
-   * @brief Get complete Dims before
-   */
-  DDim GetCompleteDims() const {
-    std::vector<int64_t> dims = vectorize(value_->dims());
-    dims[0] = height_;
-    return make_ddim(dims);
-  }
-
- private:
-  // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
-  // SelectedRows are simply concated when adding together. Until a
-  // SelectedRows add a Tensor, will the duplicate rows be handled.
-  Vector<int64_t> rows_;
-  std::unordered_map<int64_t, int64_t>
-      id_to_index_;  // should not be used when rows_ has duplicate member
-  std::unique_ptr<Tensor> value_{nullptr};
-  int64_t height_;  // height indicates the underline tensor's height
-  std::unique_ptr<RWLock> rwlock_{nullptr};
-};
-
-/*
- * Serialize/Desiralize SelectedRows to std::ostream
- * You can pass ofstream or ostringstream to serilize to file
- * or to a in memory string. GPU tensor will be copied to CPU.
- */
-void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
-                       const platform::DeviceContext& dev_ctx);
-void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
-                           const platform::DeviceContext& dev_ctx);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc
deleted file mode 100644
index 3b0509e0344efedf08ab21cac0a075049617ca97..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <time.h>
-#include <thread>  // NOLINT
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/selected_rows.h"
-
-namespace paddle {
-namespace framework {
-
-class SelectedRowsTester : public ::testing::Test {
- public:
-  void SetUp() override {
-    std::vector<int64_t> rows{0, 4, 7};
-    int64_t height = 10;
-    int64_t row_numel = 100;
-    selected_rows_.reset(new SelectedRows(rows, height));
-
-    Tensor* value = selected_rows_->mutable_value();
-    auto* data = value->mutable_data<float>(
-        make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
-    for (int64_t i = 0; i < value->numel(); ++i) {
-      data[i] = static_cast<float>(i);
-    }
-  }
-
- protected:
-  platform::CPUPlace place_;
-  std::unique_ptr<SelectedRows> selected_rows_{nullptr};
-};
-
-TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); }
-
-TEST_F(SelectedRowsTester, dims) {
-  ASSERT_EQ(selected_rows_->value().dims(), make_ddim({3, 100}));
-}
-
-TEST_F(SelectedRowsTester, complete_dims) {
-  ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
-}
-
-TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
-  SelectedRows dst_tensor;
-  platform::CPUDeviceContext cpu_ctx(place_);
-  std::ostringstream oss;
-
-  SerializeToStream(oss, *selected_rows_, cpu_ctx);
-
-  std::istringstream iss(oss.str());
-  DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
-
-  ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
-  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
-  ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
-  ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
-  auto* dst_data = dst_tensor.value().data<float>();
-  for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
-    ASSERT_EQ(dst_data[i], static_cast<float>(i));
-  }
-}
-
-TEST(SelectedRows, SparseTable) {
-  platform::CPUPlace cpu;
-  SelectedRows table;
-
-  int64_t table_size = 100;
-  int64_t embedding_width = 8;
-  // initialize a sparse table
-  table.mutable_value()->Resize(
-      framework::make_ddim({table_size, embedding_width}));
-  auto* data = table.mutable_value()->mutable_data<float>(cpu);
-  for (int64_t i = 0; i < table_size; ++i) {
-    for (int64_t j = 0; j < embedding_width; ++j) {
-      data[i * embedding_width + j] = static_cast<float>(i);
-    }
-  }
-  ASSERT_EQ(table.AutoGrownIndex(10, true, false), 0);
-  ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1);
-  ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1);
-  ASSERT_EQ(table.AutoGrownIndex(6, true, false), 2);
-  for (int64_t i = 11; i < 20; i++) {
-    ASSERT_EQ(table.AutoGrownIndex(i, true, true), -1);
-    ASSERT_TRUE(!table.HasKey(i));
-  }
-  ASSERT_TRUE(table.HasKey(10));
-  ASSERT_TRUE(table.HasKey(8));
-  ASSERT_TRUE(table.HasKey(6));
-  ASSERT_EQ(table.rows().size(), 3UL);
-
-  framework::Tensor ids;
-  ids.Resize(framework::make_ddim({4}));
-  auto* ids_data = ids.mutable_data<int64_t>(cpu);
-  ids_data[0] = static_cast<int64_t>(6);
-  ids_data[1] = static_cast<int64_t>(6);
-  ids_data[2] = static_cast<int64_t>(8);
-  ids_data[3] = static_cast<int64_t>(10);
-
-  framework::Tensor get_value;
-  auto* value_data = get_value.mutable_data<float>(
-      framework::make_ddim({4, embedding_width}), cpu);
-  table.Get(ids, &get_value);
-
-  for (int j = 0; j < embedding_width; ++j) {
-    ASSERT_EQ(value_data[0 * embedding_width + j], 2);
-  }
-  for (int j = 0; j < embedding_width; ++j) {
-    ASSERT_EQ(value_data[1 * embedding_width + j], 2);
-  }
-  for (int j = 0; j < embedding_width; ++j) {
-    ASSERT_EQ(value_data[2 * embedding_width + j], 1);
-  }
-  for (int j = 0; j < embedding_width; ++j) {
-    ASSERT_EQ(value_data[3 * embedding_width + j], 0);
-  }
-}
-
-void f1(SelectedRows* table, int table_size) {
-  for (int i = 1000000; i > 0; --i) {
-    auto id = i % table_size;
-    int64_t index1 = table->AutoGrownIndex(id, true);
-    int64_t index2 = table->AutoGrownIndex(id, false);
-    int64_t index3 = table->AutoGrownIndex(id, true);
-    ASSERT_EQ(index1, index2);
-    ASSERT_EQ(index2, index3);
-  }
-}
-
-void f2(SelectedRows* table, int table_size) {
-  for (int i = 0; i < 1000000; ++i) {
-    auto id = i % table_size;
-    int64_t index1 = table->AutoGrownIndex(id, true);
-    int64_t index2 = table->AutoGrownIndex(id, false);
-    int64_t index3 = table->AutoGrownIndex(id, true);
-    ASSERT_EQ(index1, index2);
-    ASSERT_EQ(index2, index3);
-  }
-}
-
-void f3(SelectedRows* table, int table_size) {
-  clock_t t1 = clock();
-  for (int i = 100000; i > 0; --i) {
-    auto id1 = table->AutoGrownIndex(i % table_size, true);
-    auto id2 = table->Index(i % table_size);
-    ASSERT_EQ(id1, id2);
-  }
-  clock_t t2 = clock();
-  std::cout << "f3 run time:" << t2 - t1 << std::endl;
-}
-
-void f4(SelectedRows* table, int table_size) {
-  clock_t t1 = clock();
-  for (int i = 0; i < 100000; ++i) {
-    auto id1 = table->AutoGrownIndex(i % table_size, true);
-    auto id2 = table->Index(i % table_size);
-    ASSERT_EQ(id1, id2);
-  }
-  clock_t t2 = clock();
-  std::cout << "f4 run time:" << t2 - t1 << std::endl;
-}
-
-TEST(SelectedRows, MultiThreadAutoIndex) {
-  platform::CPUPlace cpu;
-  SelectedRows table;
-
-  int64_t table_size = 100000;
-  int64_t embedding_width = 8;
-  // initialize a sparse table
-  table.mutable_value()->Resize(
-      framework::make_ddim({table_size, embedding_width}));
-  auto* data = table.mutable_value()->mutable_data<float>(cpu);
-  for (int64_t i = 0; i < table_size; ++i) {
-    for (int64_t j = 0; j < embedding_width; ++j) {
-      data[i * embedding_width + j] = static_cast<float>(i);
-    }
-  }
-
-  std::thread t1(f1, &table, table_size);
-  std::thread t11(f1, &table, table_size);
-  std::thread t2(f2, &table, table_size);
-  std::thread t22(f2, &table, table_size);
-  t1.join();
-  t11.join();
-  t2.join();
-  t22.join();
-  std::thread t3(f3, &table, table_size);
-  std::thread t4(f4, &table, table_size);
-  t3.join();
-  t4.join();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc
deleted file mode 100644
index 4ac872ac3d3bf918678f5294a4c35097c3fb18ab..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/shape_inference.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/shape_inference.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/grad_op_desc_maker.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace framework {
-
-std::vector<DDim> InferShapeContext::GetReaderDims(
-    const std::string &name) const {
-  const std::vector<std::string> &arg_names = Inputs(name);
-  PADDLE_ENFORCE_EQ(
-      arg_names.size(), 1UL,
-      "Reader input '%s' should hold one element, but now it holds %d", name,
-      arg_names.size());
-  return this->GetRepeatedDims(arg_names[0]);
-}
-
-void InferShapeContext::SetReaderDims(const std::string &name,
-                                      const std::vector<DDim> &dims) {
-  const std::vector<std::string> &arg_names = Outputs(name);
-  PADDLE_ENFORCE_EQ(
-      arg_names.size(), 1UL,
-      "Reader output '%s' should hold one element, but now it holds %d", name,
-      arg_names.size());
-  return this->SetRepeatedDims(arg_names[0], dims);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
deleted file mode 100644
index e0a848273b8d6b50eb1706998e368141a0d1f7f3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/shape_inference.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace framework {
-
-class OperatorBase;
-
-using InferShapeVarPtr = boost::variant<VarDesc *, Variable *>;
-
-class InferShapeContext {
- public:
-  virtual ~InferShapeContext() = default;
-  virtual bool HasInput(const std::string &name) const = 0;
-  virtual bool HasOutput(const std::string &name) const = 0;
-
-  virtual std::vector<proto::VarType::Type> GetInputsVarType(
-      const std::string &name) const = 0;
-  virtual std::vector<proto::VarType::Type> GetOutputsVarType(
-      const std::string &name) const = 0;
-
-  virtual bool HasInputs(const std::string &name) const = 0;
-  virtual bool HasOutputs(const std::string &name) const = 0;
-
-  virtual DDim GetInputDim(const std::string &name) const = 0;
-  virtual std::vector<DDim> GetInputsDim(const std::string &name) const = 0;
-  virtual std::vector<DDim> GetReaderDims(const std::string &name) const;
-
-  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
-  virtual void SetOutputsDim(const std::string &name,
-                             const std::vector<DDim> &dims) = 0;
-  virtual void SetReaderDims(const std::string &name,
-                             const std::vector<DDim> &dims);
-
-  virtual AttrReader Attrs() const = 0;
-  virtual const std::vector<std::string> &Inputs(
-      const std::string &name) const = 0;
-  virtual const std::vector<std::string> &Outputs(
-      const std::string &name) const = 0;
-
-  virtual void ShareDim(const std::string &in, const std::string &out,
-                        size_t i = 0, size_t j = 0) = 0;
-
-  virtual void ShareLoD(const std::string &in, const std::string &out,
-                        size_t i = 0, size_t j = 0) const = 0;
-
-  virtual void DecreaseLoDLevel(const std::string &in, const std::string &out,
-                                size_t i = 0, size_t j = 0) const = 0;
-
-  virtual bool IsRuntime() const = 0;
-
-  virtual std::vector<InferShapeVarPtr> GetInputVarPtrs(
-      const std::string &name) = 0;
-  virtual std::vector<InferShapeVarPtr> GetOutputVarPtrs(
-      const std::string &name) = 0;
-
- protected:
-  virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0;
-  virtual void SetRepeatedDims(const std::string &name,
-                               const std::vector<DDim> &dims) = 0;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
deleted file mode 100644
index 7b39c5359e8f266d9217f6f9c2db8ba80b08d6b3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tensor.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/var_type.h"
-
-namespace paddle {
-namespace framework {
-extern size_t SizeOfType(proto::VarType::Type type);
-void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_LE(
-      numel() * SizeOfType(type()), memory_size(),
-      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-      "first to re-allocate memory.\n"
-      "or maybe the required data-type mismatches the data already stored.");
-}
-
-Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
-
-size_t Tensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
-}
-
-void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type,
-                           size_t requested_size) {
-  type_ = type;
-  PADDLE_ENFORCE_GE(numel(), 0,
-                    "When calling this method, the Tensor's numel must be "
-                    "equal or larger than zero. "
-                    "Please check Tensor::dims, or Tensor::Resize has been "
-                    "called first. The Tensor's shape is [",
-                    dims(), "] now");
-  size_t size = numel() * SizeOfType(type);
-  if (requested_size) {
-    PADDLE_ENFORCE_GE(requested_size, size);
-    size = requested_size;
-  }
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + offset_) {
-    // Reset holder first before re-allocate to save memory
-    holder_.reset();
-    holder_ = memory::AllocShared(place, size);
-    offset_ = 0;
-  }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                 offset_);
-}
-
-void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
-  PADDLE_ENFORCE_NOT_NULL(
-      this->holder_, "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, type_, requested_size);
-}
-
-Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size();
-  *this = src;
-  return *this;
-}
-
-Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx, 0,
-                    "The start row index must be greater than 0.");
-  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
-  PADDLE_ENFORCE_LT(
-      begin_idx, end_idx,
-      "The start row index must be lesser than the end row index.");
-
-  if (dims_[0] == 1) {
-    return *this;
-  } else {
-    size_t base = numel() / dims_[0];
-    Tensor dst;
-    dst.holder_ = holder_;
-    dst.set_layout(layout_);
-    dst.type_ = type_;
-    DDim dst_dims = dims_;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
-    return dst;
-  }
-}
-
-Tensor& Tensor::Resize(const DDim& dims) {
-  dims_ = dims;
-  return *this;
-}
-
-const DDim& Tensor::dims() const { return dims_; }
-
-int64_t Tensor::numel() const { return product(dims_); }
-
-void Tensor::ResetHolder(std::shared_ptr<memory::Allocation> holder) {
-  if (holder_) {
-    PADDLE_ENFORCE_EQ(numel() * SizeOfType(type()), holder->size());
-  }
-  holder_ = holder;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
deleted file mode 100644
index 8fffecfa0e157768a00db893595bb6df4dc51a9d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tensor.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <typeindex>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-
-namespace framework {
-
-class LoDTensor;
-
-class Tensor {
-#ifdef PADDLE_WITH_MKLDNN
-
- public:
-  inline mkldnn::memory::format format() const { return format_; }
-
-  inline void set_format(const mkldnn::memory::format format) {
-    format_ = format;
-  }
-
- protected:
-  /**
-   * @brief the detail format of memory block which have layout as kMKLDNN
-   *
-   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
-   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
-   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
-   *       this field.
-   */
-
-  mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
-#endif
-
- public:
-  template <typename T, size_t D, int MajorType, typename IndexType>
-  friend struct EigenTensor;
-
-  template <typename T, int MajorType, typename IndexType>
-  friend struct EigenMatrix;
-
-  template <typename T, int MajorType, typename IndexType>
-  friend struct EigenVector;
-
- public:
-  Tensor() : type_(proto::VarType::FP32), offset_(0) {}
-
-  explicit Tensor(const proto::VarType::Type&);
-
-  /*! Return a pointer to mutable memory block. */
-  template <typename T>
-  T* data();
-
-  /*! Return a pointer to constant memory block. */
-  template <typename T>
-  const T* data() const;
-
-  inline bool IsInitialized() const;
-
-  /**
-   * @brief   Return a pointer to mutable memory block.
-   * @note    If not exist, then allocation.
-   */
-  template <typename T>
-  T* mutable_data(platform::Place place, size_t requested_size = 0);
-
-  void* mutable_data(platform::Place place, proto::VarType::Type type,
-                     size_t requested_size = 0);
-
-  void* mutable_data(platform::Place place, size_t requested_size = 0);
-
-  /**
-   * @brief     Return a pointer to mutable memory block.
-   *
-   * @param[in] dims           The dimensions of the memory block.
-   * @param[in] place          The place of the memory block.
-   * @param[in] requested_size The size of the block in bytes.
-   *
-   * @note      If not exist, then allocation.
-   */
-  template <typename T>
-  T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);
-
-  /*! Return the dimensions of the memory block. */
-  const DDim& dims() const;
-
-  /*! Return the numel of the memory block. */
-  int64_t numel() const;
-
-  /*! Resize the dimensions of the memory block. */
-  Tensor& Resize(const DDim& dims);
-
-  /*! The internal of two tensors share the same memory block. */
-  Tensor& ShareDataWith(const Tensor& src);
-
-  /**
-   * @brief  Return a sub-tensor of the given tensor.
-   *
-   * @param[in] begin_idx   The index of the start row(inclusive) to slice.
-   *                        The index number begins from 0.
-   * @param[in] end_idx     The index of the end row(exclusive) to slice.
-   *                        The index number begins from 0.
-   */
-  Tensor Slice(int64_t begin_idx, int64_t end_idx) const;
-
-  platform::Place place() const {
-    PADDLE_ENFORCE_NOT_NULL(
-        holder_, "Tensor not initialized yet when Tensor::place() is called.");
-    return holder_->place();
-  }
-
-  proto::VarType::Type type() const {
-    PADDLE_ENFORCE_NOT_NULL(
-        holder_, "Tensor not initialized yet when Tensor::type() is called.");
-    return type_;
-  }
-
-  // memory size returns the holding memory size in byte.
-  size_t memory_size() const;
-
-  void check_memory_size() const;
-
-  DataLayout layout() const { return layout_; }
-
-  void set_layout(const DataLayout layout) { layout_ = layout; }
-
-  void clear() {
-    holder_ = nullptr;
-    offset_ = 0;
-  }
-
-  void ShareBufferWith(const Tensor& tensor) {
-    holder_ = tensor.holder_;
-    offset_ = tensor.offset_;
-  }
-
-  const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
-  size_t offset() const { return offset_; }
-
-  std::shared_ptr<memory::Allocation> MoveMemoryHolder() {
-    return std::move(holder_);
-  }
-
-  void ResetHolder(std::shared_ptr<memory::Allocation> holder);
-
- private:
-  /*! holds the memory block if allocated. */
-  std::shared_ptr<memory::Allocation> holder_;
-  proto::VarType::Type type_;
-  /**
-   * @brief points to elements dimensions.
-   *
-   * @note dims_ do not indicate the memory block size.
-   */
-
-  DDim dims_;
-
-  /**
-   * @brief the layout of memory block, default is NHWC.
-   *
-   * @note the memory allocation order, describe how weight/data is stored
-   *       For example, in 4-D Tensor(rank=4), there are three commonly
-   *       used layout. They are
-   *            NCHW, NHWC, CHWN.
-   *       N,C,H,W for respectively the batch size, the number of
-   *       feature maps, the height.
-   */
-  // Fix me: here just change the default layout to kNCHW
-  // it doesn't fix the real issue, i.e. feeder should set up tensor layout
-  // according to actual input data
-  DataLayout layout_ = DataLayout::kNCHW;
-
-  /**
-   * @brief   A PlaceHolder may be shared by more than one tensor.
-   *
-   * @note    Some of them may be slices of the others. So the offset_
-   *          is introduced here to indicate the byte offset between
-   *          PlaceHolder::ptr_ and where the tensor data really begins.
-   */
-  size_t offset_;
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-#include "paddle/fluid/framework/tensor_impl.h"
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
deleted file mode 100644
index a5c39b7e923e24e82996402489ea537df08a7d5d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tensor_impl.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace framework {
-template <typename T>
-inline const T* Tensor::data() const {
-  check_memory_size();
-  bool valid =
-      std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType();
-  PADDLE_ENFORCE(
-      valid, "Tensor holds the wrong type, it holds %s, but desires to be %s",
-      DataTypeToString(type_), DataTypeToString(DataTypeTrait<T>::DataType()));
-
-  return reinterpret_cast<const T*>(
-      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-}
-
-inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
-
-template <typename T>
-inline T* Tensor::data() {
-  check_memory_size();
-  bool valid =
-      std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType();
-  PADDLE_ENFORCE(
-      valid, "Tensor holds the wrong type, it holds %s, but desires to be %s",
-      DataTypeToString(type_), DataTypeToString(DataTypeTrait<T>::DataType()));
-  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                              offset_);
-}
-
-template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place,
-                               size_t requested_size) {
-  static_assert(std::is_pod<T>::value, "T must be POD");
-  Resize(dims);
-  return mutable_data<T>(place, requested_size);
-}
-
-template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
-  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(
-      mutable_data(place, DataTypeTrait<T>::DataType(), requested_size));
-}
-
-inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
-  int rank = src.dims().size();
-  PADDLE_ENFORCE_GE(
-      rank, 2,
-      "'ReshapeToMatrix()' is only used for flatten high rank "
-      "tensors to matrixs. Can not be used in reshaping vectors.");
-  if (rank == 2) {
-    return src;
-  }
-  Tensor res;
-  res.ShareDataWith(src);
-  res.Resize(flatten_to_2d(src.dims(), num_col_dims));
-  return res;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
deleted file mode 100644
index f94c0c532bd1cb3b266af339eef62a13097133a7..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tensor_test.cc
+++ /dev/null
@@ -1,270 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/tensor.h"
-#include <gtest/gtest.h>
-#include <string>
-#include "paddle/fluid/platform/float16.h"
-
-namespace framework = paddle::framework;
-namespace platform = paddle::platform;
-
-TEST(Tensor, Dims) {
-  framework::Tensor tt;
-  tt.Resize({2, 3, 4});
-  framework::DDim dims = tt.dims();
-  ASSERT_EQ(arity(dims), 3);
-  for (int i = 0; i < 3; ++i) {
-    EXPECT_EQ(i + 2, dims[i]);
-  }
-}
-
-TEST(Tensor, DataAssert) {
-  framework::Tensor src_tensor;
-
-  bool caught = false;
-  try {
-    src_tensor.data<double>();
-  } catch (platform::EnforceNotMet err) {
-    caught = true;
-    std::string msg =
-        "holder_ should not be null\nTensor holds no memory. Call "
-        "Tensor::mutable_data first.";
-    const char* what = err.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
-  }
-  ASSERT_TRUE(caught);
-}
-
-TEST(Tensor, MutableData) {
-  {
-    framework::Tensor src_tensor;
-    float* p1 = nullptr;
-    float* p2 = nullptr;
-    // initialization
-    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
-                                        platform::CPUPlace());
-    auto p1_holder = src_tensor.Holder();
-    EXPECT_NE(p1, nullptr);
-    // set src_tensor a new dim with large size
-    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
-                                        platform::CPUPlace());
-    EXPECT_NE(p2, nullptr);
-    auto p2_holder1 = src_tensor.Holder();
-    EXPECT_NE(p1_holder.get(), p2_holder1.get());
-    // set src_tensor a new dim with same size
-    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
-                                        platform::CPUPlace());
-    auto p2_holder2 = src_tensor.Holder();
-    EXPECT_EQ(p2_holder1.get(), p2_holder2.get());
-    // set src_tensor a new dim with smaller size
-    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
-                                        platform::CPUPlace());
-    auto p2_holder3 = src_tensor.Holder();
-    EXPECT_EQ(p1, p2);
-    EXPECT_EQ(p2_holder2.get(), p2_holder3.get());
-
-    float* p3 = nullptr;
-    float* p4 = nullptr;
-    // set src_tensor a different type but smaller size.
-    // memory block is supposed to be unchanged.
-    auto* tmp = src_tensor.mutable_data<uint8_t>(framework::make_ddim({2, 2}),
-                                                 platform::CPUPlace());
-    p3 = reinterpret_cast<float*>(tmp);
-    auto p3_holder1 = src_tensor.Holder();
-    EXPECT_EQ(p1, p3);
-    EXPECT_EQ(p2_holder3.get(), p3_holder1.get());
-
-    // set src_tensor a different type but bigger size.
-    // memory block is supposed to be changed.
-    auto* tmp2 = src_tensor.mutable_data<double>(
-        framework::make_ddim({2, 2, 3}), platform::CPUPlace());
-    auto p3_holder2 = src_tensor.Holder();
-    p4 = reinterpret_cast<float*>(tmp2);
-    EXPECT_NE(p1, p4);
-    EXPECT_NE(p3_holder1.get(), p3_holder2.get());
-  }
-  // Not sure if it's desired, but currently, Tensor type can be changed.
-  {
-    framework::Tensor src_tensor;
-    int8_t* p1 = src_tensor.mutable_data<int8_t>(framework::make_ddim({1}),
-                                                 platform::CPUPlace());
-    EXPECT_NE(p1, nullptr);
-    *p1 = 1;
-
-    uint8_t* p2 = src_tensor.mutable_data<uint8_t>(framework::make_ddim({1}),
-                                                   platform::CPUPlace());
-    EXPECT_NE(p2, nullptr);
-    EXPECT_EQ(static_cast<int>(p2[0]), 1);
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    framework::Tensor src_tensor;
-    float* p1 = nullptr;
-    float* p2 = nullptr;
-    // initialization
-    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
-                                        platform::CUDAPlace());
-    auto p1_holder = src_tensor.Holder();
-    EXPECT_NE(p1, nullptr);
-    // set src_tensor a new dim with large size
-    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 1024}),
-                                        platform::CUDAPlace());
-    auto p2_holder = src_tensor.Holder();
-    EXPECT_NE(p2, nullptr);
-    EXPECT_NE(p1_holder.get(), p2_holder.get());
-    // set src_tensor a new dim with same size
-    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
-                                        platform::CUDAPlace());
-    EXPECT_EQ(p1, p2);
-    // set src_tensor a new dim with smaller size
-    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
-                                        platform::CUDAPlace());
-    EXPECT_EQ(p1, p2);
-  }
-#endif
-}
-
-TEST(Tensor, ShareDataWith) {
-  {
-    framework::Tensor src_tensor;
-    framework::Tensor dst_tensor;
-    // Try to share data form uninitialized tensor
-    bool caught = false;
-    try {
-      dst_tensor.ShareDataWith(src_tensor);
-    } catch (paddle::platform::EnforceNotMet err) {
-      caught = true;
-      std::string msg =
-          "holder_ should not be null\nTensor holds no memory. Call "
-          "Tensor::mutable_data first.";
-      const char* what = err.what();
-      for (size_t i = 0; i < msg.length(); ++i) {
-        ASSERT_EQ(what[i], msg[i]);
-      }
-    }
-    ASSERT_TRUE(caught);
-
-    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
-                                 platform::CPUPlace());
-    dst_tensor.ShareDataWith(src_tensor);
-    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    framework::Tensor src_tensor;
-    framework::Tensor dst_tensor;
-    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
-                                 platform::CUDAPlace());
-    dst_tensor.ShareDataWith(src_tensor);
-    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
-  }
-#endif
-}
-
-TEST(Tensor, Slice) {
-  {
-    framework::Tensor src_tensor;
-    src_tensor.mutable_data<int>(framework::make_ddim({5, 3, 4}),
-                                 platform::CPUPlace());
-    framework::Tensor slice_tensor = src_tensor.Slice(1, 3);
-    framework::DDim slice_dims = slice_tensor.dims();
-    ASSERT_EQ(arity(slice_dims), 3);
-    EXPECT_EQ(slice_dims[0], 2);
-    EXPECT_EQ(slice_dims[1], 3);
-    EXPECT_EQ(slice_dims[2], 4);
-
-    uintptr_t src_data_address =
-        reinterpret_cast<uintptr_t>(src_tensor.data<int>());
-    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<int>(src_tensor.dims(), platform::CPUPlace()));
-    uintptr_t slice_data_address =
-        reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
-    uintptr_t slice_mutable_data_address =
-        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<int>(
-            slice_tensor.dims(), platform::CPUPlace()));
-    EXPECT_EQ(src_data_address, src_mutable_data_address);
-    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
-    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    framework::Tensor src_tensor;
-    src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
-                                    platform::CUDAPlace());
-    framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
-    framework::DDim slice_dims = slice_tensor.dims();
-    ASSERT_EQ(arity(slice_dims), 2);
-    EXPECT_EQ(slice_dims[0], 4);
-    EXPECT_EQ(slice_dims[1], 9);
-
-    uintptr_t src_data_address =
-        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
-    uintptr_t src_mutable_data_address =
-        reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
-            src_tensor.dims(), platform::CUDAPlace()));
-    uintptr_t slice_data_address =
-        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
-    uintptr_t slice_mutable_data_address =
-        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
-            slice_tensor.dims(), platform::CUDAPlace()));
-    EXPECT_EQ(src_data_address, src_mutable_data_address);
-    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
-    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
-  }
-#endif
-}
-
-TEST(Tensor, ReshapeToMatrix) {
-  framework::Tensor src;
-  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, platform::CPUPlace());
-  for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
-    src_ptr[i] = i;
-  }
-  framework::Tensor res = framework::ReshapeToMatrix(src, 2);
-  ASSERT_EQ(res.dims()[0], 2 * 3);
-  ASSERT_EQ(res.dims()[1], 4 * 9);
-}
-
-TEST(Tensor, Layout) {
-  framework::Tensor src;
-  ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW);
-  src.set_layout(framework::DataLayout::kAnyLayout);
-  ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
-}
-
-TEST(Tensor, FP16) {
-  using platform::float16;
-  framework::Tensor src;
-  float16* src_ptr = src.mutable_data<float16>({2, 3}, platform::CPUPlace());
-  for (int i = 0; i < 2 * 3; ++i) {
-    src_ptr[i] = static_cast<float16>(i);
-  }
-  EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16));
-  // EXPECT a human readable error message
-  // src.data<uint8_t>();
-  // Tensor holds the wrong type, it holds N6paddle8platform7float16E at
-  // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43]
-}
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
deleted file mode 100644
index fb6cc1f210b29f783f2f2a04fff01a3c6f92ea3e..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tensor_util.cc
+++ /dev/null
@@ -1,545 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include "paddle/fluid/framework/tensor_util.h"
-#include <algorithm>
-#include <limits>
-#include <memory>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace framework {
-
-void TensorCopy(const Tensor& src, const platform::Place& dst_place,
-                const platform::DeviceContext& ctx, Tensor* dst) {
-  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
-          << dst_place;
-  src.check_memory_size();
-
-  dst->Resize(src.dims());
-  dst->set_layout(src.layout());
-  auto src_place = src.place();
-  auto src_ptr = src.data<void>();
-
-  auto dst_ptr = dst->mutable_data(dst_place, src.type());
-
-  auto size = src.numel() * SizeOfType(src.type());
-
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-              << dst_place;
-      return;
-    }
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(src_place) &&  // NOLINT
-           platform::is_cpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    if (platform::is_same_place(src_place, dst_place)) {
-      if (src_ptr == dst_ptr) {
-        VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-                << dst_place;
-        return;
-      }
-      memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-                   stream);
-    } else {
-      if (platform::is_same_place(ctx_place, src_place)) {
-        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-                     stream);
-        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
-      } else if (platform::is_same_place(ctx_place, dst_place)) {
-        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
-        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-                     stream);
-      } else {
-        PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
-      }
-    }
-  } else {
-    PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
-  }
-#endif
-}
-
-void TensorCopy(const Tensor& src, const platform::Place& dst_place,
-                Tensor* dst) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  const platform::DeviceContext* dev_ctx;
-  if (platform::is_gpu_place(dst_place)) {
-    dev_ctx = pool.Get(dst_place);
-  } else {
-    dev_ctx = pool.Get(src.place());
-  }
-  TensorCopy(src, dst_place, *dev_ctx, dst);
-}
-
-void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
-                    Tensor* dst) {
-  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
-          << " to " << dst_place;
-  src.check_memory_size();
-  dst->Resize(src.dims());
-  dst->set_layout(src.layout());
-  auto src_place = src.place();
-  auto src_ptr = src.data<void>();
-  auto dst_ptr = dst->mutable_data(dst_place, src.type());
-  auto size = src.numel() * SizeOfType(src.type());
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
-      return;
-    }
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(src_place) &&  // NOLINT
-           platform::is_cpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:GPU->CPU");
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
-    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:CPU->GPU");
-    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:GPU->GPU");
-    if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
-      return;
-    }
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_cuda_pinned_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
-    auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
-                 nullptr);
-  } else {
-    PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
-  }
-#endif
-}
-
-template <typename Predicate, typename DevCtx>
-struct AnyDTypeVisitor {
-  Predicate predicate_;
-  const Tensor& tensor_;
-  const DevCtx& ctx_;
-  Tensor* out_;
-
-  AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
-                  Tensor* out)
-      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
-
-  template <typename T>
-  void apply() const {
-    auto t = EigenVector<T>::Flatten(tensor_);
-    auto o = EigenScalar<bool>::From(*out_);
-    // return any of predicate_(t) is true.
-    o.device(*ctx_.eigen_device()) = predicate_(t).any();
-  }
-};
-
-template <typename Predicate, typename DevCtx>
-inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
-                    const DevCtx& ctx, framework::Tensor* out) {
-  VisitDataType(tensor.type(), AnyDTypeVisitor<Predicate, DevCtx>(
-                                   predicate, tensor, ctx, out));
-}
-
-template <typename Predicate>
-class AnyVisitor : public boost::static_visitor<bool> {
- private:
-  const framework::Tensor& tensor_;
-  Predicate predicate_;
-
- public:
-  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
-      : tensor_(tensor), predicate_(std::move(predicate)) {}
-
-  template <typename Place>
-  bool operator()(const Place& place) const {
-    framework::Tensor out;
-    out.Resize({1});
-    out.mutable_data<bool>(place);
-    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
-    AnyImpl(predicate_, tensor_, *ctx, &out);
-    return this->GetResult(out, place);
-  }
-
-  bool GetResult(const framework::Tensor& out,
-                 const platform::CUDAPlace& gpu) const {
-    platform::CPUPlace cpu;
-    framework::Tensor tmp;
-    tmp.Resize({1});
-    tmp.mutable_data<bool>(cpu);
-    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
-    gpuctx->Wait();
-    TensorCopy(out, cpu, *gpuctx, &tmp);
-    gpuctx->Wait();
-    return GetResult(tmp, cpu);
-  }
-
-  bool GetResult(const framework::Tensor& out,
-                 const platform::CPUPlace& cpu) const {
-    return *out.data<bool>();
-  }
-
-  bool GetResult(const framework::Tensor& out,
-                 const platform::CUDAPinnedPlace& cpu) const {
-    return *out.data<bool>();
-  }
-};
-
-template <typename Predicate>
-class AnyOutVisitor : public boost::static_visitor<> {
- private:
-  const framework::Tensor& tensor_;
-  mutable framework::Tensor* out_;
-  Predicate predicate_;
-
- public:
-  AnyOutVisitor(const framework::Tensor& tensor, Predicate predicate,
-                framework::Tensor* out)
-      : tensor_(tensor), out_(out), predicate_(std::move(predicate)) {}
-
-  template <typename Place>
-  void operator()(const Place& place) const {
-    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
-    out_->Resize({1});
-    out_->mutable_data<bool>(place);
-    AnyImpl(predicate_, tensor_, *ctx, out_);
-  }
-};
-
-template <typename Predicate>
-inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
-  AnyVisitor<Predicate> visitor(tensor, predicate);
-  auto place = tensor.place();
-  return platform::VisitPlace(place, visitor);
-}
-
-template <typename Predicate>
-inline void Any(const framework::Tensor& tensor, Predicate predicate,
-                framework::Tensor* out) {
-  AnyOutVisitor<Predicate> visitor(tensor, predicate, out);
-  auto place = tensor.place();
-  platform::VisitPlace(place, visitor);
-}
-
-struct ContainsNANPredicate {
-  template <typename T>
-  auto operator()(const T& eigen_vec) const
-      -> decltype(std::declval<T>().isnan()) {
-    // Cast eigen_vector to vector of bool. true if is inf.
-    return eigen_vec.isnan();
-  }
-};
-
-bool TensorContainsNAN(const framework::Tensor& tensor) {
-  ContainsNANPredicate predicate;
-  return Any(tensor, predicate);
-}
-
-void TensorContainsNAN(const framework::Tensor& tensor,
-                       framework::Tensor* out) {
-  ContainsNANPredicate predicate;
-  Any(tensor, predicate, out);
-}
-
-struct ContainsInfPredicate {
-  template <typename T>
-  auto operator()(const T& eigen_vec) const
-      -> decltype(std::declval<T>().isinf()) {
-    // Cast eigen_vector to vector of bool. true if is inf.
-    return eigen_vec.isinf();
-  }
-};
-
-bool TensorContainsInf(const framework::Tensor& tensor) {
-  ContainsInfPredicate predicate;
-  return Any(tensor, predicate);
-}
-
-void TensorContainsInf(const framework::Tensor& tensor,
-                       framework::Tensor* out) {
-  ContainsInfPredicate predicate;
-  Any(tensor, predicate, out);
-}
-
-// NOTE(dzhwinter):
-// Isfinite need a AllVisitor to loop through all the elements.
-// We choose two cuda call instead of one allvisitor. The AllVisitor
-// should be implemented if the performance hurts.
-bool TensorIsfinite(const framework::Tensor& tensor) {
-  ContainsInfPredicate pred_inf;
-  ContainsNANPredicate pred_nan;
-  return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
-}
-
-#ifdef PADDLE_WITH_CUDA
-template <typename T>
-static inline void __global__ BothFalse(const T* cmp, T* out) {
-  out[0] = (!cmp[0]) && (!out[0]);
-}
-#endif
-
-struct BothFalseVisitor : public boost::static_visitor<> {
-  const framework::Tensor& in_;
-  mutable framework::Tensor* out_;
-  BothFalseVisitor(const framework::Tensor& in, framework::Tensor* out)
-      : in_(in), out_(out) {}
-
-  template <typename Place>
-  void operator()(const Place& place) const {
-    VisitorImpl(place);
-  }
-
-  void VisitorImpl(const platform::CUDAPlace& gpu) const {
-#ifdef PADDLE_WITH_CUDA
-    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
-    BothFalse<bool><<<1, 1, 0, ctx->stream()>>>(in_.data<bool>(),
-                                                out_->mutable_data<bool>(gpu));
-#endif
-  }
-
-  void VisitorImpl(const platform::CPUPlace& cpu) const {
-    bool lhs = !in_.data<bool>()[0];
-    bool rhs = !out_->mutable_data<bool>(cpu)[0];
-    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
-  }
-
-  void VisitorImpl(
-      const platform::CUDAPinnedPlace& cpu /* equals to cpu*/) const {
-    bool lhs = !in_.data<bool>()[0];
-    bool rhs = !out_->mutable_data<bool>(cpu)[0];
-    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
-  }
-};
-
-void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
-  framework::Tensor tmp;
-  TensorContainsInf(tensor, &tmp);
-  TensorContainsNAN(tensor, out);
-  BothFalseVisitor visitor(tmp, out);
-  auto place = tensor.place();
-  platform::VisitPlace(place, visitor);
-}
-
-void TensorToStream(std::ostream& os, const Tensor& tensor,
-                    const platform::DeviceContext& dev_ctx) {
-  {  // the 1st field, uint32_t version
-    constexpr uint32_t version = 0;
-    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
-  }
-  {  // the 2nd field, tensor description
-     // int32_t  size
-     // void*    protobuf message
-    proto::VarType::TensorDesc desc;
-    desc.set_data_type(tensor.type());
-    auto dims = framework::vectorize(tensor.dims());
-    auto* pb_dims = desc.mutable_dims();
-    pb_dims->Resize(static_cast<int>(dims.size()), 0);
-    std::copy(dims.begin(), dims.end(), pb_dims->begin());
-    int32_t size = desc.ByteSize();
-    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
-    auto out = desc.SerializeAsString();
-    os.write(out.data(), size);
-  }
-  {  // the 3rd field, tensor data
-    uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
-
-    auto* data_ptr = tensor.data<void>();
-    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                   "Index overflow when writing tensor");
-    if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-      std::unique_ptr<char[]> buf(new char[kBufSize]);
-      auto& gpu_dev_ctx =
-          static_cast<const platform::CUDADeviceContext&>(dev_ctx);
-      platform::CPUPlace cpu;
-      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-      while (size != 0) {
-        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-        memory::Copy(cpu, buf.get(),
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void*>(data), size_to_write,
-                     gpu_dev_ctx.stream());
-        gpu_dev_ctx.Wait();
-        os.write(buf.get(), size_to_write);
-        data += size_to_write;
-        size -= size_to_write;
-      }
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      os.write(static_cast<const char*>(data_ptr),
-               static_cast<std::streamsize>(size));
-    }
-  }
-}
-
-struct DeserializedDataFunctor {
-  DeserializedDataFunctor(void** buf, Tensor* tensor,
-                          const platform::Place& place)
-      : buf_(buf), tensor_(tensor), place_(place) {}
-
-  template <typename T>
-  void apply() {
-    *buf_ = tensor_->mutable_data<T>(place_);
-  }
-
-  void** buf_;
-  Tensor* tensor_;
-  platform::Place place_;
-};
-
-void TensorFromStream(std::istream& is, Tensor* tensor,
-                      const platform::DeviceContext& dev_ctx) {
-  uint32_t version;
-  is.read(reinterpret_cast<char*>(&version), sizeof(version));
-  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  proto::VarType::TensorDesc desc;
-  {  // int32_t size
-     // proto buffer
-    int32_t size;
-    is.read(reinterpret_cast<char*>(&size), sizeof(size));
-    std::unique_ptr<char[]> buf(new char[size]);
-    is.read(reinterpret_cast<char*>(buf.get()), size);
-    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                   "Cannot parse tensor desc");
-  }
-  {  // read tensor
-    std::vector<int64_t> dims;
-    dims.reserve(static_cast<size_t>(desc.dims().size()));
-    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-    tensor->Resize(framework::make_ddim(dims));
-    void* buf;
-    auto ctx = platform::CPUDeviceContext();
-    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
-    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      Tensor cpu_tensor;
-      cpu_tensor.Resize(framework::make_ddim(dims));
-      framework::VisitDataType(
-          desc.data_type(),
-          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), size);
-      auto dst_place = dev_ctx.GetPlace();
-      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      framework::VisitDataType(
-          desc.data_type(),
-          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), size);
-    }
-  }
-}
-
-template <typename T>
-std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) {
-  auto inspect = tensor.data<T>();
-  auto element_num = tensor.numel();
-
-  os << "\tdata: [";
-  if (element_num > 0) {
-    os << inspect[0];
-    for (int j = 1; j < element_num; ++j) {
-      os << " " << inspect[j];
-    }
-  }
-  os << "]";
-  return os;
-}
-
-std::ostream& operator<<(std::ostream& os, const Tensor& t) {
-  os << "\tdim: " << t.dims() << "\n";
-  os << "\tlayout: " << DataLayoutToString(t.layout()) << "\n";
-
-  Tensor tensor;
-  tensor.Resize(t.dims());
-  if (platform::is_cpu_place(t.place())) {
-    tensor.ShareDataWith(t);
-  } else {
-    platform::CPUPlace place;
-    framework::TensorCopy(t, place, &tensor);
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& dev_ctx = *pool.Get(t.place());
-    dev_ctx.Wait();
-  }
-
-#define PrintTensorCallback(cpp_type, proto_type) \
-  do {                                            \
-    if (tensor.type() == proto_type) {            \
-      os << "\tdtype: " << proto_type << "\n";    \
-      print_tensor<cpp_type>(os, tensor);         \
-      return os;                                  \
-    }                                             \
-  } while (0)
-
-  _ForEachDataType_(PrintTensorCallback);
-  VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
-  return os;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu
deleted file mode 120000
index edd88c4e547016c7ec0e0c95547b3ede4b6f3c60..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tensor_util.cu
+++ /dev/null
@@ -1 +0,0 @@
-tensor_util.cc
\ No newline at end of file
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
deleted file mode 100644
index cab72e294f6c2b07da8d5db9bf38de8732c0e5d8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tensor_util.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-
-// NOTE(zcd): Because TensorCopy is an async operation, when the src_place
-// and dst_place are two different GPU, to ensure that the operation can
-// be carried out correctly, there is a src_ctx wait operation in TensorCopy.
-// If ctx_place and src_place are the same, src_ctx.Wait() is added
-// after memory::Copy; if ctx_place and dst_place are the same,
-// src_ctx.Wait() is added before memory::Copy.
-void TensorCopy(const Tensor& src, const platform::Place& dst_place,
-                const platform::DeviceContext& ctx, Tensor* dst);
-
-// NOTE(zcd): If the src.place() and dst_place are two different GPU,
-// the copy operation is carried out on the dst_place's stream. This is
-// very important, because TensorCopy is an async operator, and in most
-// case, once this copy operator returns, dst is to be used in dst_place's
-// stream, if this copy operation is carried out on the src_place's stream,
-// when dst is used in dst_place's stream the copy operation may be
-// not completed.
-void TensorCopy(const Tensor& src, const platform::Place& dst_place,
-                Tensor* dst);
-
-void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
-                    Tensor* dst);
-
-template <typename T>
-void TensorFromVector(const std::vector<T>& src,
-                      const platform::DeviceContext& ctx, Tensor* dst);
-template <typename T>
-void TensorFromVector(const std::vector<T>& src, Tensor* dst);
-
-template <typename T>
-void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
-                    std::vector<T>* dst);
-template <typename T>
-void TesnorToVector(const Tensor& src, std::vector<T>* dst);
-
-// copy the result bool to cpu
-bool TensorContainsNAN(const framework::Tensor& tensor);
-bool TensorContainsInf(const framework::Tensor& tensor);
-bool TensorIsfinite(const framework::Tensor& tensor);
-
-// store the result bool in gpu tensor, async operation. Faster than above ones.
-void TensorContainsNAN(const framework::Tensor& tensor, framework::Tensor* out);
-void TensorContainsInf(const framework::Tensor& tensor, framework::Tensor* out);
-void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out);
-
-void TensorToStream(std::ostream& os, const Tensor& tensor,
-                    const platform::DeviceContext& dev_ctx);
-void TensorFromStream(std::istream& is, Tensor* tensor,
-                      const platform::DeviceContext& dev_ctx);
-
-//
-// The implementation of template functions.
-//
-
-template <typename T>
-void TensorFromVector(const std::vector<T>& src,
-                      const platform::DeviceContext& ctx, Tensor* dst) {
-  auto dst_place = ctx.GetPlace();
-  auto src_ptr = static_cast<const void*>(src.data());
-  platform::CPUPlace src_place;
-  dst->Resize({static_cast<int64_t>(src.size())});
-  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
-  auto size = src.size() * sizeof(T);
-
-  if (platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
-                 src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(dst_place)) {  // NOLINT
-    memory::Copy(
-        boost::get<platform::CUDAPlace>(dst_place), dst_ptr, src_place, src_ptr,
-        size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  }
-#endif
-}
-
-template <typename T>
-void TensorFromVector(const std::vector<T>& src, Tensor* dst) {
-  platform::CPUPlace dst_place = platform::CPUPlace();
-  auto src_ptr = static_cast<const void*>(src.data());
-  platform::CPUPlace src_place;
-  dst->Resize({static_cast<int64_t>(src.size())});
-  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
-  auto size = src.size() * sizeof(T);
-
-  memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-}
-
-template <typename T>
-void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
-                    std::vector<T>* dst) {
-  auto src_ptr = static_cast<const void*>(src.data<T>());
-  auto size = src.numel() * sizeof(T);
-
-  platform::CPUPlace dst_place;
-  dst->resize(src.numel());
-  auto dst_ptr = static_cast<void*>(dst->data());
-
-  if (platform::is_cpu_place(src.place())) {
-    memory::Copy(dst_place, dst_ptr,
-                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(src.place())) {  // NOLINT
-    memory::Copy(
-        dst_place, dst_ptr, boost::get<platform::CUDAPlace>(src.place()),
-        src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  }
-#endif
-}
-
-template <typename T>
-void TensorToVector(const Tensor& src, std::vector<T>* dst) {
-  auto src_ptr = static_cast<const void*>(src.data<T>());
-  auto size = src.numel() * sizeof(T);
-
-  platform::CPUPlace dst_place;
-  dst->resize(src.numel());
-  auto dst_ptr = static_cast<void*>(dst->data());
-
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(src.place()), true);
-
-  memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
-               src_ptr, size);
-}
-
-std::ostream& operator<<(std::ostream& os, const Tensor& t);
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
deleted file mode 100644
index 17c55378178325b40e394f4b422c22c1c10bd130..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ /dev/null
@@ -1,396 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/tensor_util.h"
-#include <gtest/gtest.h>
-#include <cmath>
-#include <string>
-
-namespace paddle {
-namespace framework {
-
-TEST(TensorCopy, Tensor) {
-  Tensor src_tensor;
-  Tensor dst_tensor;
-  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
-
-  int* src_ptr =
-      src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
-
-  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-  memcpy(src_ptr, arr, 9 * sizeof(int));
-  src_tensor.set_layout(DataLayout::kAnyLayout);
-
-  auto cpu_place = new platform::CPUPlace();
-  TensorCopy(src_tensor, *cpu_place, &dst_tensor);
-
-  const int* dst_ptr = dst_tensor.data<int>();
-  EXPECT_NE(src_ptr, dst_ptr);
-  for (size_t i = 0; i < 9; ++i) {
-    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-  }
-
-  TensorCopy(dst_tensor, *cpu_place, &dst_tensor);
-  for (size_t i = 0; i < 9; ++i) {
-    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-  }
-
-  EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
-
-  Tensor slice_tensor = src_tensor.Slice(1, 2);
-  TensorCopy(slice_tensor, *cpu_place, &dst_tensor);
-  const int* slice_ptr = slice_tensor.data<int>();
-  dst_ptr = dst_tensor.data<int>();
-  EXPECT_NE(dst_ptr, slice_ptr);
-  for (size_t i = 0; i < 3; ++i) {
-    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
-  }
-  EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    Tensor src_tensor;
-    Tensor gpu_tensor;
-    Tensor dst_tensor;
-
-    int* src_ptr =
-        src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
-
-    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    memcpy(src_ptr, arr, 9 * sizeof(int));
-
-    // CPU Tensor to GPU Tensor
-    auto gpu_place = new platform::CUDAPlace(0);
-    platform::CUDADeviceContext gpu_ctx(*gpu_place);
-    TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
-
-    // GPU Tensor to CPU Tensor
-    auto cpu_place = new platform::CPUPlace();
-    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
-
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    const int* dst_ptr = dst_tensor.data<int>();
-    EXPECT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    // Copy the same tensor
-    TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
-    gpu_ctx.Wait();
-    const int* dst_ptr_tmp = dst_tensor.data<int>();
-    EXPECT_NE(src_ptr, dst_ptr_tmp);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
-    }
-
-    Tensor slice_tensor = src_tensor.Slice(1, 2);
-
-    // CPU Slice Tensor to GPU Tensor
-    TensorCopy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
-
-    // GPU Tensor to CPU Tensor
-    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
-
-    // Sync before Compare Slice Tensors
-    gpu_ctx.Wait();
-    const int* slice_ptr = slice_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    EXPECT_NE(dst_ptr, slice_ptr);
-    for (size_t i = 0; i < 3; ++i) {
-      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
-    }
-
-    EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
-  }
-#endif
-}
-
-TEST(TensorFromVector, Tensor) {
-  {
-    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    paddle::framework::Tensor cpu_tensor;
-
-    // Copy to CPU Tensor
-    cpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
-    auto cpu_place = new paddle::platform::CPUPlace();
-    paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor);
-
-    // Compare Tensors
-    const int* cpu_ptr = cpu_tensor.data<int>();
-    const int* src_ptr = src_vec.data();
-    EXPECT_NE(src_ptr, cpu_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-    }
-
-    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-    cpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));
-    paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor);
-    cpu_ptr = cpu_tensor.data<int>();
-    src_ptr = src_vec.data();
-    EXPECT_NE(src_ptr, cpu_ptr);
-    for (size_t i = 0; i < 5; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-    }
-
-    delete cpu_place;
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    paddle::framework::Tensor cpu_tensor;
-    paddle::framework::Tensor gpu_tensor;
-    paddle::framework::Tensor dst_tensor;
-
-    // Copy to CPU Tensor
-    cpu_tensor.Resize(make_ddim({3, 3}));
-    auto cpu_place = new paddle::platform::CPUPlace();
-    paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
-    paddle::framework::TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
-
-    // Copy to GPUTensor
-    gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
-    auto gpu_place = new paddle::platform::CUDAPlace();
-    paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
-    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
-    // Copy from GPU to CPU tensor for comparison
-    paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
-
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    const int* src_ptr = src_vec.data();
-    const int* cpu_ptr = cpu_tensor.data<int>();
-    const int* dst_ptr = dst_tensor.data<int>();
-    EXPECT_NE(src_ptr, cpu_ptr);
-    EXPECT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-
-    cpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));
-    paddle::framework::TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
-    gpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));
-    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
-    paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
-
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    src_ptr = src_vec.data();
-    cpu_ptr = cpu_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    EXPECT_NE(src_ptr, cpu_ptr);
-    EXPECT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 5; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    delete cpu_place;
-    delete gpu_place;
-  }
-#endif
-}
-
-TEST(TensorToVector, Tensor) {
-  {
-    paddle::framework::Tensor src;
-    int* src_ptr = src.mutable_data<int>({3, 3}, paddle::platform::CPUPlace());
-    for (int i = 0; i < 3 * 3; ++i) {
-      src_ptr[i] = i;
-    }
-
-    paddle::platform::CPUPlace place;
-    std::vector<int> dst;
-    paddle::framework::TensorToVector<int>(src, &dst);
-
-    for (int i = 0; i < 3 * 3; ++i) {
-      EXPECT_EQ(src_ptr[i], dst[i]);
-    }
-  }
-#ifdef PADDLE_WITH_CUDA
-  {
-    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    paddle::framework::Tensor gpu_tensor;
-    paddle::platform::CUDAPlace place;
-    paddle::platform::CUDADeviceContext gpu_ctx(place);
-    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
-
-    std::vector<int> dst;
-    paddle::framework::TensorToVector<int>(gpu_tensor, gpu_ctx, &dst);
-
-    for (int i = 0; i < 3 * 3; ++i) {
-      EXPECT_EQ(src_vec[i], dst[i]);
-    }
-  }
-#endif
-}
-
-TEST(TensorContainsNAN, CPU) {
-  {
-    paddle::framework::Tensor src;
-    float* buf = src.mutable_data<float>({3}, paddle::platform::CPUPlace());
-    buf[0] = 0.0;
-    buf[1] = NAN;
-    buf[2] = 0.0;
-    EXPECT_TRUE(paddle::framework::TensorContainsNAN(src));
-    buf[1] = 0.0;
-    EXPECT_FALSE(paddle::framework::TensorContainsNAN(src));
-  }
-
-  {
-    paddle::framework::Tensor src;
-    paddle::platform::float16* buf =
-        src.mutable_data<paddle::platform::float16>(
-            {3}, paddle::platform::CPUPlace());
-    buf[0] = 0.0;
-    buf[1].x = 0x7fff;
-    buf[2] = 0.0;
-    EXPECT_TRUE(paddle::framework::TensorContainsNAN(src));
-    buf[1] = 0.0;
-    EXPECT_FALSE(paddle::framework::TensorContainsNAN(src));
-  }
-}
-
-TEST(TensorContainsInf, CPU) {
-  {
-    paddle::framework::Tensor src;
-    double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
-    buf[0] = 1.0;
-    buf[1] = INFINITY;
-    buf[2] = 0.0;
-    EXPECT_TRUE(paddle::framework::TensorContainsInf(src));
-    buf[1] = 1.0;
-    EXPECT_FALSE(paddle::framework::TensorContainsInf(src));
-  }
-
-  {
-    paddle::framework::Tensor src;
-    paddle::platform::float16* buf =
-        src.mutable_data<paddle::platform::float16>(
-            {3}, paddle::platform::CPUPlace());
-    buf[0] = 1.0;
-    buf[1].x = 0x7c00;
-    buf[2] = 0.0;
-    EXPECT_TRUE(paddle::framework::TensorContainsInf(src));
-    buf[1] = 1.0;
-    EXPECT_FALSE(paddle::framework::TensorContainsInf(src));
-  }
-}
-
-TEST(TensorIsfinite, CPU) {
-  {
-    paddle::framework::Tensor src, out;
-    double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
-    buf[0] = 1.0;
-    buf[1] = INFINITY;
-    buf[2] = 0.0;
-    paddle::framework::TensorIsfinite(src, &out);
-    EXPECT_EQ(out.data<bool>()[0], false);
-    buf[1] = 1.0;
-    paddle::framework::TensorIsfinite(src, &out);
-    EXPECT_EQ(out.data<bool>()[0], true);
-  }
-
-  {
-    paddle::framework::Tensor src, out;
-    double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
-    buf[0] = 1.0;
-    buf[1] = NAN;
-    buf[2] = 0.0;
-    paddle::framework::TensorIsfinite(src, &out);
-    EXPECT_EQ(out.data<bool>()[0], false);
-    buf[1] = 1.0;
-    paddle::framework::TensorIsfinite(src, &out);
-    EXPECT_EQ(out.data<bool>()[0], true);
-  }
-
-  {
-    paddle::framework::Tensor src, out;
-    paddle::platform::float16* buf =
-        src.mutable_data<paddle::platform::float16>(
-            {3}, paddle::platform::CPUPlace());
-    buf[0] = 1.0;
-    buf[1].x = 0x7c00;
-    buf[2] = 0.0;
-    paddle::framework::TensorIsfinite(src, &out);
-    EXPECT_EQ(out.data<bool>()[0], false);
-    buf[1] = 1.0;
-    paddle::framework::TensorIsfinite(src, &out);
-    EXPECT_EQ(out.data<bool>()[0], true);
-    buf[1].x = 0x7fff;
-    paddle::framework::TensorIsfinite(src, &out);
-    EXPECT_EQ(out.data<bool>()[0], false);
-  }
-}
-
-TEST(Tensor, FromAndToStream) {
-  framework::Tensor src_tensor;
-  int array[6] = {1, 2, 3, 4, 5, 6};
-  src_tensor.Resize({2, 3});
-  int* src_ptr = src_tensor.mutable_data<int>(platform::CPUPlace());
-  for (int i = 0; i < 6; ++i) {
-    src_ptr[i] = array[i];
-  }
-  {
-    framework::Tensor dst_tensor;
-    auto place = new platform::CPUPlace();
-    platform::CPUDeviceContext cpu_ctx(*place);
-    std::ostringstream oss;
-    TensorToStream(oss, src_tensor, cpu_ctx);
-
-    std::istringstream iss(oss.str());
-    TensorFromStream(iss, &dst_tensor, cpu_ctx);
-    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
-    for (int i = 0; i < 5; ++i) {
-      EXPECT_EQ(dst_ptr[i], array[i]);
-    }
-    EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
-    delete place;
-  }
-#ifdef PADDLE_WITH_CUDA
-  {
-    Tensor gpu_tensor;
-    gpu_tensor.Resize({2, 3});
-    Tensor dst_tensor;
-
-    auto gpu_place = new platform::CUDAPlace();
-    platform::CUDADeviceContext gpu_ctx(*gpu_place);
-
-    TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
-
-    std::ostringstream oss;
-    TensorToStream(oss, gpu_tensor, gpu_ctx);
-
-    std::istringstream iss(oss.str());
-    TensorFromStream(
-        iss, &dst_tensor,
-        *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()));
-
-    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
-    for (int i = 0; i < 6; ++i) {
-      EXPECT_EQ(dst_ptr[i], array[i]);
-    }
-    delete gpu_place;
-  }
-#endif
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu
deleted file mode 100644
index a51f74199e714b8606c9766c57bc6b1dc4c73c65..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ /dev/null
@@ -1,260 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-static __global__ void FillNAN(float* buf) {
-  buf[0] = 0.0;
-  buf[1] = 0.1;
-  buf[2] = NAN;
-}
-
-static __global__ void FillInf(float* buf) {
-  buf[0] = INFINITY;
-  buf[1] = 0.1;
-  buf[2] = 0.2;
-}
-
-static __global__ void FillNAN(platform::float16* buf) {
-  buf[0] = 0.0;
-  buf[1] = 0.1;
-  buf[2].x = 0x7fff;
-}
-
-static __global__ void FillInf(platform::float16* buf) {
-  buf[0] = 0.0;
-  buf[1].x = 0x7c00;
-  buf[2] = 0.5;
-}
-
-static __global__ void FillFinite(float* buf) {
-  buf[0] = 0.0;
-  buf[1] = 0.1;
-  buf[2] = 0.2;
-}
-
-static __global__ void FillFinite(platform::float16* buf) {
-  buf[0] = 0.0;
-  buf[1] = 0.1;
-  buf[2] = 0.2;
-}
-
-TEST(TensorContainsNAN, GPU) {
-  paddle::platform::CUDAPlace gpu(0);
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto* cuda_ctx = pool.GetByPlace(gpu);
-  {
-    Tensor tensor;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    ASSERT_TRUE(TensorContainsNAN(tensor));
-  }
-  {
-    Tensor tensor;
-    paddle::platform::float16* buf =
-        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
-    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    ASSERT_TRUE(TensorContainsNAN(tensor));
-  }
-}
-
-TEST(TensorContainsInf, GPU) {
-  paddle::platform::CUDAPlace gpu(0);
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto* cuda_ctx = pool.GetByPlace(gpu);
-  {
-    Tensor tensor;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    ASSERT_TRUE(TensorContainsInf(tensor));
-  }
-  {
-    Tensor tensor;
-    paddle::platform::float16* buf =
-        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
-    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    ASSERT_TRUE(TensorContainsInf(tensor));
-  }
-}
-
-TEST(TensorIsfinite, GPU) {
-  paddle::platform::CUDAPlace gpu(0);
-  using paddle::platform::float16;
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto* cuda_ctx = pool.GetByPlace(gpu);
-  // contains inf
-  {
-    Tensor tensor;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    EXPECT_TRUE(!TensorIsfinite(tensor));
-  }
-  {
-    Tensor tensor;
-    float16* buf = tensor.mutable_data<float16>({3}, gpu);
-    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    EXPECT_TRUE(!TensorIsfinite(tensor));
-  }
-
-  // contains nan
-  {
-    Tensor tensor;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    EXPECT_TRUE(!TensorIsfinite(tensor));
-  }
-  {
-    Tensor tensor;
-    float16* buf = tensor.mutable_data<float16>({3}, gpu);
-    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    EXPECT_TRUE(!TensorIsfinite(tensor));
-  }
-
-  // all element are finite
-  {
-    Tensor tensor;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    EXPECT_TRUE(TensorIsfinite(tensor));
-  }
-  {
-    Tensor tensor;
-    float16* buf = tensor.mutable_data<float16>({3}, gpu);
-    FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    EXPECT_TRUE(TensorIsfinite(tensor));
-  }
-}
-
-TEST(TensorContainsInf, GPUWithoutWait) {
-  paddle::platform::CUDAPlace gpu(0);
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto* cuda_ctx = pool.GetByPlace(gpu);
-  {
-    Tensor tensor, out;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    TensorContainsInf(tensor, &out);
-    platform::CPUPlace cpu;
-    Tensor tmp;
-    TensorCopy(out, cpu, *cuda_ctx, &tmp);
-    cuda_ctx->Wait();
-    ASSERT_EQ(tmp.data<bool>()[0], true);
-  }
-  {
-    Tensor tensor, out;
-    paddle::platform::float16* buf =
-        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
-    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    TensorContainsInf(tensor, &out);
-    platform::CPUPlace cpu;
-    Tensor tmp;
-    TensorCopy(out, cpu, *cuda_ctx, &tmp);
-    cuda_ctx->Wait();
-    ASSERT_EQ(tmp.data<bool>()[0], true);
-  }
-}
-
-TEST(TensorContainsNAN, GPUWithoutWait) {
-  paddle::platform::CUDAPlace gpu(0);
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto* cuda_ctx = pool.GetByPlace(gpu);
-  {
-    Tensor tensor, out;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    TensorContainsNAN(tensor, &out);
-    platform::CPUPlace cpu;
-    Tensor tmp;
-    TensorCopy(out, cpu, *cuda_ctx, &tmp);
-    cuda_ctx->Wait();
-    ASSERT_EQ(tmp.data<bool>()[0], true);
-  }
-  {
-    Tensor tensor, out;
-    paddle::platform::float16* buf =
-        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
-    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    TensorContainsNAN(tensor, &out);
-    platform::CPUPlace cpu;
-    Tensor tmp;
-    TensorCopy(out, cpu, *cuda_ctx, &tmp);
-    cuda_ctx->Wait();
-    ASSERT_EQ(tmp.data<bool>()[0], true);
-  }
-}
-
-TEST(TensorIsfinite, GPUWithoutWait) {
-  paddle::platform::CUDAPlace gpu(0);
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto* cuda_ctx = pool.GetByPlace(gpu);
-  {
-    Tensor tensor, out;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    TensorIsfinite(tensor, &out);
-    platform::CPUPlace cpu;
-    Tensor tmp;
-    TensorCopy(out, cpu, *cuda_ctx, &tmp);
-    cuda_ctx->Wait();
-    EXPECT_EQ(tmp.data<bool>()[0], false);
-  }
-  {
-    Tensor tensor, out;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    TensorIsfinite(tensor, &out);
-    platform::CPUPlace cpu;
-    Tensor tmp;
-    TensorCopy(out, cpu, *cuda_ctx, &tmp);
-    cuda_ctx->Wait();
-    EXPECT_EQ(tmp.data<bool>()[0], false);
-  }
-  {
-    Tensor tensor, out;
-    float* buf = tensor.mutable_data<float>({3}, gpu);
-    FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
-    cuda_ctx->Wait();
-    TensorIsfinite(tensor, &out);
-    platform::CPUPlace cpu;
-    Tensor tmp;
-    TensorCopy(out, cpu, *cuda_ctx, &tmp);
-    cuda_ctx->Wait();
-    EXPECT_EQ(tmp.data<bool>()[0], true);
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
deleted file mode 100644
index 7f7f426d0e28224932fc96a3fefa0df1279e6475..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/threadpool.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/framework/threadpool.h"
-#include <memory>
-#include <utility>
-
-#include "gflags/gflags.h"
-#include "paddle/fluid/platform/enforce.h"
-
-DEFINE_int32(io_threadpool_size, 100,
-             "number of threads used for doing IO, default 100");
-
-DECLARE_int32(dist_threadpool_size);
-
-namespace paddle {
-namespace framework {
-std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
-std::once_flag ThreadPool::init_flag_;
-
-ThreadPool* ThreadPool::GetInstance() {
-  std::call_once(init_flag_, &ThreadPool::Init);
-  return threadpool_.get();
-}
-
-void ThreadPool::Init() {
-  if (threadpool_.get() == nullptr) {
-    // TODO(Yancey1989): specify the max threads number
-    int num_threads = std::thread::hardware_concurrency();
-    if (FLAGS_dist_threadpool_size > 0) {
-      num_threads = FLAGS_dist_threadpool_size;
-      VLOG(1) << "set dist_threadpool_size to " << num_threads;
-    }
-    PADDLE_ENFORCE_GT(num_threads, 0);
-    threadpool_.reset(new ThreadPool(num_threads));
-  }
-}
-
-ThreadPool::ThreadPool(int num_threads) : running_(true) {
-  threads_.resize(num_threads);
-  for (auto& thread : threads_) {
-    // TODO(Yancey1989): binding the thread on the specify CPU number
-    thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
-  }
-}
-
-ThreadPool::~ThreadPool() {
-  {
-    // notify all threads to stop running
-    std::unique_lock<std::mutex> l(mutex_);
-    running_ = false;
-  }
-  scheduled_.notify_all();
-
-  for (auto& t : threads_) {
-    t->join();
-    t.reset(nullptr);
-  }
-}
-
-void ThreadPool::TaskLoop() {
-  while (true) {
-    Task task;
-
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      scheduled_.wait(
-          lock, [this] { return !this->tasks_.empty() || !this->running_; });
-
-      if (!running_ && tasks_.empty()) {
-        return;
-      }
-
-      if (tasks_.empty()) {
-        PADDLE_THROW("This thread has no task to Run");
-      }
-
-      // pop a task from the task queue
-      task = std::move(tasks_.front());
-      tasks_.pop();
-    }
-    // run the task
-    task();
-  }
-}
-
-std::unique_ptr<ThreadPool> ThreadPoolIO::io_threadpool_(nullptr);
-std::once_flag ThreadPoolIO::io_init_flag_;
-
-ThreadPool* ThreadPoolIO::GetInstanceIO() {
-  std::call_once(io_init_flag_, &ThreadPoolIO::InitIO);
-  return io_threadpool_.get();
-}
-
-void ThreadPoolIO::InitIO() {
-  if (io_threadpool_.get() == nullptr) {
-    // TODO(typhoonzero1986): make this configurable
-    io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size));
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
deleted file mode 100644
index 7a51d18fbbf65f68725aa86a6a0ce4d15dff5673..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/threadpool.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <condition_variable>  // NOLINT
-#include <functional>
-#include <future>  // NOLINT
-#include <mutex>   // NOLINT
-#include <queue>
-#include <thread>  // NOLINT
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
-
-namespace paddle {
-namespace framework {
-
-struct ExceptionHandler {
-  mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
-  explicit ExceptionHandler(
-      std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
-      : future_(std::move(f)) {}
-  void operator()() const {
-    auto ex = this->future_.get();
-    if (ex != nullptr) {
-      LOG(FATAL) << "The exception is thrown inside the thread pool. You "
-                    "should use RunAndGetException to handle the exception.\n"
-                    "The default exception handler is LOG(FATAL)."
-                 << ex->what();
-    }
-  }
-};
-
-// ThreadPool maintains a queue of tasks, and runs them using a fixed
-// number of threads.
-class ThreadPool {
- public:
-  explicit ThreadPool(int num_threads);
-
-  using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>;
-
-  // Returns the singleton of ThreadPool.
-  static ThreadPool* GetInstance();
-
-  ~ThreadPool();
-
-  // Run pushes a function to the task queue and returns a std::future
-  // object. To wait for the completion of the task, call
-  // std::future::wait().
-  template <typename Callback>
-  std::future<void> Run(Callback fn) {
-    auto f = this->RunAndGetException(fn);
-    return std::async(std::launch::deferred, ExceptionHandler(std::move(f)));
-  }
-
-  template <typename Callback>
-  std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
-      Callback fn) {
-    Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
-      try {
-        fn();
-      } catch (platform::EnforceNotMet ex) {
-        return std::unique_ptr<platform::EnforceNotMet>(
-            new platform::EnforceNotMet(ex));
-      } catch (const std::exception& e) {
-        LOG(FATAL) << "Unexpected exception is catched in thread pool. All "
-                      "throwable exception in Fluid should be an EnforceNotMet."
-                   << e.what();
-      }
-      return nullptr;
-    });
-    std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      if (!running_) {
-        PADDLE_THROW("enqueue on stopped ThreadPool");
-      }
-      tasks_.push(std::move(task));
-    }
-    scheduled_.notify_one();
-    return f;
-  }
-
- private:
-  DISABLE_COPY_AND_ASSIGN(ThreadPool);
-
-  // The constructor starts threads to run TaskLoop, which retrieves
-  // and runs tasks from the queue.
-  void TaskLoop();
-
-  // Init is called by GetInstance.
-  static void Init();
-
- private:
-  static std::unique_ptr<ThreadPool> threadpool_;
-  static std::once_flag init_flag_;
-
-  std::vector<std::unique_ptr<std::thread>> threads_;
-
-  std::queue<Task> tasks_;
-  std::mutex mutex_;
-  bool running_;
-  std::condition_variable scheduled_;
-};
-
-class ThreadPoolIO : ThreadPool {
- public:
-  static ThreadPool* GetInstanceIO();
-  static void InitIO();
-
- private:
-  // NOTE: threadpool in base will be inhereted here.
-  static std::unique_ptr<ThreadPool> io_threadpool_;
-  static std::once_flag io_init_flag_;
-};
-
-// Run a function asynchronously.
-// NOTE: The function must return void. If the function need to return a value,
-// you can use lambda to capture a value pointer.
-template <typename Callback>
-std::future<void> Async(Callback callback) {
-  return ThreadPool::GetInstance()->Run(callback);
-}
-
-template <typename Callback>
-std::future<void> AsyncIO(Callback callback) {
-  return ThreadPoolIO::GetInstanceIO()->Run(callback);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc
deleted file mode 100644
index 884d61e23428a0ad758946295ca9c470767e93ef..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/threadpool_test.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <atomic>
-
-#include "paddle/fluid/framework/threadpool.h"
-
-namespace framework = paddle::framework;
-
-void do_sum(std::vector<std::future<void>>* fs, std::mutex* mu,
-            std::atomic<int>* sum, int cnt) {
-  for (int i = 0; i < cnt; ++i) {
-    std::lock_guard<std::mutex> l(*mu);
-    fs->push_back(framework::Async([sum]() { sum->fetch_add(1); }));
-  }
-}
-
-TEST(ThreadPool, ConcurrentInit) {
-  framework::ThreadPool* pool;
-  int n = 50;
-  std::vector<std::thread> threads;
-  for (int i = 0; i < n; ++i) {
-    std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
-    threads.push_back(std::move(t));
-  }
-  for (auto& t : threads) {
-    t.join();
-  }
-}
-
-TEST(ThreadPool, ConcurrentRun) {
-  std::atomic<int> sum(0);
-  std::vector<std::thread> threads;
-  std::vector<std::future<void>> fs;
-  std::mutex fs_mu;
-  int n = 50;
-  // sum = (n * (n + 1)) / 2
-  for (int i = 1; i <= n; ++i) {
-    std::thread t(do_sum, &fs, &fs_mu, &sum, i);
-    threads.push_back(std::move(t));
-  }
-  for (auto& t : threads) {
-    t.join();
-  }
-  for (auto& t : fs) {
-    t.wait();
-  }
-  EXPECT_EQ(sum, ((n + 1) * n) / 2);
-}
diff --git a/paddle/fluid/framework/trainer.cc b/paddle/fluid/framework/trainer.cc
deleted file mode 100644
index 644bd33a1420aa0ff54e34005eedd10c28342665..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/trainer.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/trainer.h"
-
-namespace paddle {
-namespace framework {
-
-void TrainerBase::SetScope(Scope* root_scope) { root_scope_ = root_scope; }
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
deleted file mode 100755
index 4ca303ceecd6d9211ca824a14322d0f8ef0d4c88..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/trainer.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <fstream>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/data_set.h"
-#include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/trainer_desc.pb.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/reader/blocking_queue.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace framework {
-
-class TrainerBase {
- public:
-  TrainerBase() {}
-  virtual ~TrainerBase() {}
-  // model memory are hosted in root_scope
-  void SetScope(Scope* root_scope);
-  void SetDebug(const bool debug) { debug_ = debug; }
-  void SetDataset(Dataset* dataset_ptr) { dataset_ptr_ = dataset_ptr; }
-  virtual void Initialize(const TrainerDesc& trainer_desc,
-                          Dataset* data_set) = 0;
-  virtual void InitTrainerEnv(const ProgramDesc& main_program,
-                              const platform::Place& place) = 0;
-  virtual void InitOtherEnv(const ProgramDesc& main_program) = 0;
-  virtual void Run() = 0;
-  virtual void Finalize() = 0;
-
- protected:
-  Scope* root_scope_;
-  bool debug_;
-  Dataset* dataset_ptr_;
-};
-
-// general trainer for async execution
-// local trainer and distributed trainer are supported
-// depends on the assigned device_worker
-class MultiTrainer : public TrainerBase {
- public:
-  MultiTrainer() {}
-  virtual ~MultiTrainer() {}
-  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
-  virtual void InitTrainerEnv(const ProgramDesc& main_program,
-                              const platform::Place& place);
-  virtual void InitOtherEnv(const ProgramDesc& main_program) {}
-  virtual void Run();
-  virtual void Finalize();
-
- protected:
-  int thread_num_;
-  std::vector<std::thread> threads_;
-  std::vector<DataFeed*> readers_;
-  std::vector<std::shared_ptr<DeviceWorker>> workers_;
-  std::vector<std::string> need_merge_var_names_;
-};
-
-class DistMultiTrainer : public MultiTrainer {
- public:
-  DistMultiTrainer() {}
-  virtual ~DistMultiTrainer() {}
-  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
-  virtual void InitOtherEnv(const ProgramDesc& main_program);
-  virtual void Run();
-  virtual void Finalize();
-  template <typename T>
-  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
-  virtual void FinalizeDumpEnv();
-  virtual void InitDumpEnv();
-  virtual void DumpWork();
-
- protected:
-  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
-  std::thread dump_thread_;
-  std::shared_ptr<FILE> fp_;
-  std::shared_ptr<paddle::framework::ChannelObject<std::string>> queue_;
-
-  bool need_dump_field_;
-  std::string dump_fields_path_;
-  std::string user_define_dump_filename_;
-  std::string dump_converter_;
-  std::vector<std::string> dump_fields_;
-  int mpi_rank_;
-};
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-class PipelineTrainer : public TrainerBase {
- public:
-  PipelineTrainer() {}
-  ~PipelineTrainer() override {}
-  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
-  void InitTrainerEnv(const ProgramDesc& main_program,
-                      const platform::Place& place) override;
-  void InitOtherEnv(const ProgramDesc& main_program) override {}
-  void Run() override;
-  void Finalize() override;
-
- protected:
-  int section_num_;
-  int pipeline_num_;
-  int scope_queue_size_;
-  int sync_steps_;
-
-  SectionWorkerParameter pipeline_config_;
-
-  // The in/output var names for each section
-  std::vector<std::unique_ptr<std::vector<std::string>>> in_var_names_;
-  std::vector<std::unique_ptr<std::vector<std::string>>> out_var_names_;
-
-  // Counter for the running thread
-  std::vector<std::vector<int*>> worker_count_;
-  std::vector<std::vector<std::unique_ptr<std::mutex>>> worker_count_mutex_;
-
-  // worker: [section_id][pipeline_id][thread_id]
-  std::vector<std::vector<
-      std::vector<std::shared_ptr<paddle::framework::DeviceWorker>>>>
-      workers_;
-  std::vector<std::thread> section_threads_;
-
-  // We use scope to maintain context info, and scopes
-  // will be deliverd between different sections.
-  std::vector<std::vector<std::unique_ptr<ScopeQueue>>> scope_queues_;
-  std::vector<Scope*> pipeline_scopes_;
-
-  // The parameters that should be syncronized between different cards using
-  // nccl all-reduce
-  std::shared_ptr<std::vector<std::string>> param_need_sync_;
-  std::vector<std::unique_ptr<SyncFunctor>> sync_functors_;
-  std::shared_ptr<platform::NCCLContextMap> nccl_ctx_map_;
-
-  std::vector<DataFeed*> readers_;
-
-  void InitFirstScopeQueue(ScopeQueue* scope_queue, int pipeline_id,
-                           const ProgramDesc& main_program);
-  void CopyParameters(const Scope& root_scope, int pipeline_id);
-  void construct_sync_functor();
-};
-#endif
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
deleted file mode 100755
index 284c78ba0f3ff80f3fadbe1c14a6a3d5ddc48bcf..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/trainer_desc.proto
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto2";
-option optimize_for = LITE_RUNTIME;
-import "data_feed.proto";
-import "framework.proto";
-package paddle.framework;
-
-message TrainerDesc {
-  // class name for create trainer desc
-  // the matchness of trainer name and device worker name
-  // will be checked in python API
-  optional string class_name = 1;
-  // class name for creating device worker
-  optional string device_worker_name = 2;
-  // thread number
-  optional int32 thread_num = 3;
-  // if we need to binding cpu
-  optional bool binding_cpu = 4 [ default = false ];
-  repeated string filelist = 5;
-  optional bool debug = 6 [ default = false ];
-  optional FetchConfig fetch_config = 7;
-  optional bool use_cvm = 8 [ default = false ];
-  optional bool dump_slot = 9 [ default = false ];
-  optional float scale_datanorm = 10 [ default = -1 ];
-  optional int32 mpi_rank = 11 [ default = -1 ];
-  optional string dump_fields_path = 12;
-  repeated string dump_fields = 13;
-  optional string dump_converter = 14;
-  optional string user_define_dump_filename = 15;
-  
-  // device worker parameters
-  optional HogwildWorkerParameter hogwild_param = 101;
-  optional DownpourWorkerParameter downpour_param = 103;
-  optional PullDenseWorkerParameter pull_dense_param = 102;
-  optional SectionWorkerParameter section_param = 104;
-  // datafeed desc
-  optional DataFeedDesc data_desc = 201;
-  // adjust ins weight
-  optional AdjustInsWeightConfig adjust_ins_weight_config = 301;
-}
-
-message HogwildWorkerParameter { repeated string skip_ops = 1; }
-
-message DownpourWorkerParameter {
-  repeated TableParameter sparse_table = 1;
-  repeated TableParameter dense_table = 2;
-  repeated string skip_ops = 3;
-  repeated ProgramConfig program_config = 4;
-  optional bool push_sparse = 5 [ default = true ];
-  optional bool push_dense = 6 [ default = true ];
-  repeated string stat_var_names = 7;
-}
-
-message SectionWorkerParameter {
-  repeated SectionConfig section_config = 1;
-  optional int32 queue_size = 2 [ default = 1 ];
-  optional int64 sync_steps = 3 [ default = 1 ];
-  optional int32 start_cpu_core_id = 4 [ default = 1 ];
-  repeated string param_need_sync = 5;
-}
-
-message SectionConfig {
-  enum Place {
-    CPUPlace = 0;
-    CUDAPlace = 1;
-    CUDAPinnedPlace = 2;
-  }
-
-  // FIXME: How to use proto::ProgramDesc
-  // required string program_desc_str = 1;
-  optional proto.ProgramDesc program_desc = 1;
-  optional Place place = 2;
-  optional int32 concurrency = 3 [ default = 1 ];
-  repeated string section_in_var_names = 4;
-  repeated string section_out_var_names = 5;
-}
-
-message FetchConfig {
-  enum Method { PRINT = 0; }
-  repeated string fetch_var_names = 1;
-  repeated string fetch_var_str_format = 2;
-  optional int32 print_period = 3 [ default = 100 ];
-  optional Method method = 4 [ default = PRINT ];
-}
-
-message AdjustInsWeightConfig {
-  optional bool need_adjust = 1 [ default = false ];
-  optional string nid_slot = 2 [ default = "" ];
-  optional float nid_adjw_threshold = 3 [ default = 0.0 ];
-  optional float nid_adjw_ratio = 4 [ default = 0.0 ];
-  optional string ins_weight_slot = 5 [ default = "" ];
-}
-
-message ProgramConfig {
-  required string program_id = 1;
-  repeated int32 push_sparse_table_id = 2;
-  repeated int32 push_dense_table_id = 3;
-  repeated int32 pull_sparse_table_id = 4;
-  repeated int32 pull_dense_table_id = 5;
-}
-
-message PullDenseWorkerParameter {
-  // dense table only and specialized usage
-  optional int32 threshold = 1 [ default = 1 ];
-  optional int32 device_num = 2;
-  optional int32 sleep_time_ms = 3 [ default = 2 ];
-  repeated TableParameter dense_table = 4;
-}
-
-message TableParameter {
-  // dense table only
-  optional uint64 table_id = 1;
-  repeated string dense_value_name = 2;
-  repeated string dense_grad_name = 3;
-  repeated int32 push_dense_wait_times = 5;
-  // sparse table only
-  repeated string sparse_key_name = 6;
-  repeated string sparse_value_name = 7;
-  repeated string sparse_grad_name = 8;
-  repeated int32 push_sparse_wait_times = 9;
-  // sparse table only and specialized usage
-  optional int32 emb_dim = 10;
-  optional int32 fea_dim = 11;
-  optional string label_var_name = 12;
-}
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
deleted file mode 100644
index ce0eb5ec30c55d757a44a6cc2c374267c52c4adc..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/trainer_factory.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/trainer_factory.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "paddle/fluid/framework/trainer.h"
-
-namespace paddle {
-namespace framework {
-
-typedef std::shared_ptr<TrainerBase> (*CreatetrainerFunction)();
-typedef std::unordered_map<std::string, CreatetrainerFunction> trainerMap;
-trainerMap g_trainer_map;
-
-#define REGISTER_TRAINER_CLASS(trainer_class)                   \
-  namespace {                                                   \
-  std::shared_ptr<TrainerBase> Creator_##trainer_class() {      \
-    return std::shared_ptr<TrainerBase>(new trainer_class);     \
-  }                                                             \
-  class __Registerer_##trainer_class {                          \
-   public:                                                      \
-    __Registerer_##trainer_class() {                            \
-      g_trainer_map[#trainer_class] = &Creator_##trainer_class; \
-    }                                                           \
-  };                                                            \
-  __Registerer_##trainer_class g_registerer_##trainer_class;    \
-  }  // namespace
-
-std::string TrainerFactory::TrainerTypeList() {
-  std::string trainer_types;
-  for (auto iter = g_trainer_map.begin(); iter != g_trainer_map.end(); ++iter) {
-    if (iter != g_trainer_map.begin()) {
-      trainer_types += ", ";
-    }
-    trainer_types += iter->first;
-  }
-  return trainer_types;
-}
-
-std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
-    std::string trainer_class) {
-  if (g_trainer_map.count(trainer_class) < 1) {
-    LOG(WARNING) << "Trainer class: " << trainer_class << " not defined";
-    LOG(WARNING) << TrainerTypeList();
-    exit(-1);
-  }
-  return g_trainer_map[trainer_class]();
-}
-
-REGISTER_TRAINER_CLASS(MultiTrainer);
-REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-REGISTER_TRAINER_CLASS(PipelineTrainer);
-#endif
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/trainer_factory.h b/paddle/fluid/framework/trainer_factory.h
deleted file mode 100644
index 9c772a4f19ed9ba50f704ed62ef361555b1285fb..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/trainer_factory.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/trainer.h"
-
-namespace paddle {
-namespace framework {
-
-class TrainerFactory {
- public:
-  static std::string TrainerTypeList();
-  static std::shared_ptr<TrainerBase> CreateTrainer(std::string trainer_class);
-};
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/trainer_test.cc b/paddle/fluid/framework/trainer_test.cc
deleted file mode 100644
index f689679d48696ced2ff1fe5c2d3706e3ed2190a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/trainer_test.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/trainer.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace framework {
-TEST() {
-  // create multi trainer
-  // create hogwild device worker
-  // create dataset
-  // train for a while
-}
-}
-}
diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc
deleted file mode 100644
index 2b138280fb535307755b5e93bf4e12f356e1d956..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/transfer_scope_cache.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/transfer_scope_cache.h"
-
-namespace paddle {
-namespace framework {
-
-std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
-  thread_local auto* x = new std::unordered_map<size_t, Scope*>;
-  return *x;
-}
-
-std::unordered_set<Scope*>& global_transfer_scope_cache() {
-  thread_local auto* x = new std::unordered_set<Scope*>;
-  return *x;
-}
-
-Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
-                              const Scope* scope) {
-  Scope* new_scope{nullptr};
-  size_t infer_cache_key =
-      CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1));
-  infer_cache_key =
-      CombineHash(infer_cache_key, std::hash<const Scope*>()(scope));
-
-  auto it = global_transfer_data_cache().find(infer_cache_key);
-  if (it != global_transfer_data_cache().end()) {
-    new_scope = global_transfer_data_cache()[infer_cache_key];
-  } else {
-    new_scope = &scope->NewScope();
-    global_transfer_data_cache()[infer_cache_key] = new_scope;
-  }
-  global_transfer_scope_cache().insert(new_scope);
-  return new_scope;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/transfer_scope_cache.h b/paddle/fluid/framework/transfer_scope_cache.h
deleted file mode 100644
index 9a5d45263219adacddb9e9280feec40cfb1ff903..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/transfer_scope_cache.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <thread>  // NOLINT
-#include <unordered_map>
-#include <unordered_set>
-#include "paddle/fluid/framework/op_kernel_type.h"
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace framework {
-
-std::unordered_map<size_t, Scope*>& global_transfer_data_cache();
-
-std::unordered_set<Scope*>& global_transfer_scope_cache();
-
-// Combine two hash values to a single hash.
-static size_t CombineHash(size_t seed, size_t a) {
-  return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-}
-
-Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
-                              const Scope* scope);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h
deleted file mode 100644
index 508ee931c6ed7f66e09abd8f0e4b33c3d3c135fd..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tuple.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdexcept>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace framework {
-
-typedef boost::variant<int, int64_t, float, double, std::string, Tensor,
-                       LoDTensor /*, ChannelHolder*/>
-    ElementVar;
-
-class Tuple {
- public:
-  using ElementVars = std::vector<ElementVar>;
-
-  Tuple(const std::vector<ElementVar>& var,
-        const std::vector<VarDesc>& var_desc)
-      : var_(var), var_desc_(var_desc) {}
-  explicit Tuple(std::vector<ElementVar>& var) : var_(var) {}
-
-  ElementVar get(int idx) const { return var_[idx]; }
-
-  ElementVar& get(int idx) { return var_[idx]; }
-
-  bool isSameType(const Tuple& t) const;
-
-  size_t getSize() const { return var_.size(); }
-
- private:
-  ElementVars var_;
-  std::vector<VarDesc> var_desc_;
-};
-
-bool Tuple::isSameType(const Tuple& t) const {
-  size_t tuple_size = getSize();
-  if (tuple_size != t.getSize()) {
-    return false;
-  }
-  for (size_t j = 0; j < tuple_size; ++j) {
-    auto type1 = get(j).which();
-    auto type2 = t.get(j).which();
-    if (type1 != type2) return false;
-  }
-  return true;
-}
-
-Tuple* make_tuple(std::vector<ElementVar> tuple) { return new Tuple(tuple); }
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/tuple_test.cc b/paddle/fluid/framework/tuple_test.cc
deleted file mode 100644
index 810900f161ccc08234e28b982bdd962e4cded9ae..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/tuple_test.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <sstream>
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/tuple.h"
-
-TEST(Tuple, Make) {
-  std::vector<paddle::framework::ElementVar> element_type;
-  element_type.push_back(12);
-  element_type.push_back(12.0f);
-  element_type.push_back("ElementVar");
-
-  paddle::framework::Tuple* tuple = paddle::framework::make_tuple(element_type);
-
-  EXPECT_EQ(boost::get<int>(tuple->get(0)), 12);
-  EXPECT_EQ(boost::get<float>(tuple->get(1)), 12.0f);
-  EXPECT_EQ(boost::get<std::string>(tuple->get(2)), "ElementVar");
-
-  delete tuple;
-}
-
-TEST(Tuple, IsTheSameType) {
-  std::vector<paddle::framework::ElementVar> element_type1;
-  std::vector<paddle::framework::ElementVar> element_type2;
-  std::vector<paddle::framework::ElementVar> element_type3;
-
-  element_type1.push_back(12);
-  element_type1.push_back(12.0f);
-  element_type1.push_back("Tuple1");
-
-  element_type2.push_back(13);
-  element_type2.push_back(13.0f);
-  element_type2.push_back("Tuple2");
-
-  element_type3.push_back(14.0f);
-  element_type3.push_back(14);
-  element_type3.push_back("Tuple3");
-
-  paddle::framework::Tuple* tuple1 =
-      paddle::framework::make_tuple(element_type1);
-  paddle::framework::Tuple* tuple2 =
-      paddle::framework::make_tuple(element_type2);
-  paddle::framework::Tuple* tuple3 =
-      paddle::framework::make_tuple(element_type3);
-
-  EXPECT_TRUE(tuple1->isSameType(*tuple2));
-  EXPECT_FALSE(tuple1->isSameType(*tuple3));
-
-  delete tuple1;
-  delete tuple2;
-  delete tuple3;
-}
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
deleted file mode 100644
index 7f1bfb5d9a81d45ab7840ab18e62374cc6554f12..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/type_defs.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <functional>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace framework {
-class OperatorBase;
-class OpDesc;
-class InferShapeContext;
-class InferVarTypeContext;
-class BlockDesc;
-class Variable;
-class NoNeedBufferVarsInference;
-
-using VariableNameMap = std::map<std::string, std::vector<std::string>>;
-// TODO(panyx0718): Replace vector with something like gtl::Vector.
-using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
-
-// The order should be as same as framework.proto
-using Attribute =
-    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                   std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDesc*, int64_t,
-                   std::vector<BlockDesc*>, std::vector<int64_t>>;
-
-using AttributeMap = std::unordered_map<std::string, Attribute>;
-
-using OpCreator = std::function<OperatorBase*(
-    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
-    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
-
-using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
-    const OpDesc&, const std::unordered_set<std::string>& /*no_grad_set*/,
-    std::unordered_map<std::string, std::string>* /*grad_to_var*/,
-    const std::vector<BlockDesc*>& grad_block)>;
-
-using InferVarTypeFN =
-    std::function<void(framework::InferVarTypeContext* /*context*/)>;
-
-using InferShapeFN = std::function<void(InferShapeContext*)>;
-
-using InplacePair = std::unordered_map<std::string, std::string>;
-using InferInplaceOpFN = std::function<InplacePair(const OpDesc&, bool)>;
-
-using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>(
-    const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/,
-    const AttributeMap& /*attrs*/)>;
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h
deleted file mode 100644
index ab17641080551166f246c69d42ed85df3892a3e0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/unroll_array_ops.h
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <cstddef>
-#include <type_traits>
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace framework {
-
-namespace detail {
-
-template <size_t kStart, size_t kEnd, bool kStop>
-struct UnrollFillConstant {
-  template <typename T>
-  HOSTDEVICE inline static void Run(T *data, T val) {
-    data[kStart] = val;
-    UnrollFillConstant<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(data, val);
-  }
-};
-
-template <size_t kStart, size_t kEnd>
-struct UnrollFillConstant<kStart, kEnd, true> {
-  template <typename T>
-  HOSTDEVICE inline static void Run(T *data, T val) {}
-};
-
-template <size_t kStart, size_t kEnd, bool kStop>
-struct UnrollAssign {
-  template <typename Tin, typename Tout>
-  HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {
-    d2[kStart] = static_cast<Tout>(d1[kStart]);
-    UnrollAssign<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
-  }
-};
-
-template <size_t kStart, size_t kEnd>
-struct UnrollAssign<kStart, kEnd, true> {
-  template <typename Tin, typename Tout>
-  HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {}
-};
-
-template <typename T, size_t kStart, size_t kEnd, bool kStop>
-struct UnrollVarArgsAssignImpl {
-  template <typename... Args>
-  HOSTDEVICE inline static void Run(T *d, T val, Args... args) {
-    static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument");
-    d[kStart] = val;
-    UnrollVarArgsAssignImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
-        d, args...);
-  }
-};
-
-template <typename T, size_t kStart, size_t kEnd>
-struct UnrollVarArgsAssignImpl<T, kStart, kEnd, true> {
-  HOSTDEVICE inline static void Run(T *d) {}
-};
-
-template <typename T>
-struct UnrollVarArgsAssign {
-  template <typename... Args>
-  HOSTDEVICE inline static void Run(T *d, Args... args) {
-    UnrollVarArgsAssignImpl<T, 0, sizeof...(Args), sizeof...(Args) == 0>::Run(
-        d, args...);
-  }
-};
-
-template <size_t kStart, size_t kEnd, bool kStop>
-struct UnrollCompare {
-  template <typename T>
-  HOSTDEVICE inline static bool Run(const T *d1, const T *d2) {
-    return d1[kStart] == d2[kStart] &&
-           UnrollCompare<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
-  }
-};
-
-template <size_t kStart, size_t kEnd>
-struct UnrollCompare<kStart, kEnd, true> {
-  template <typename T>
-  HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) {
-    return true;
-  }
-};
-
-template <size_t kStart, size_t kEnd, bool kStop>
-struct UnrollProduct {
-  template <typename T>
-  HOSTDEVICE inline static T Run(const T *d) {
-    return d[kStart] *
-           UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
-  }
-};
-
-template <size_t kStart, size_t kEnd>
-struct UnrollProduct<kStart, kEnd, true> {
-  template <typename T>
-  HOSTDEVICE inline constexpr static T Run(const T *d) {
-    return 1;
-  }
-};
-
-}  // namespace detail
-
-template <size_t N>
-using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>;
-
-template <size_t N>
-using UnrollAssign = detail::UnrollAssign<0, N, N == 0>;
-
-template <typename T>
-using UnrollVarArgsAssign = detail::UnrollVarArgsAssign<T>;
-
-template <size_t N>
-using UnrollCompare = detail::UnrollCompare<0, N, N == 0>;
-
-template <size_t N>
-using UnrollProduct = detail::UnrollProduct<0, N, N == 0>;
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/fluid/framework/unroll_array_ops_test.cc
deleted file mode 100644
index be811478eec17e0986ae7579ff323d94dea3155a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/unroll_array_ops_test.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/unroll_array_ops.h"
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <array>
-#include <cstdint>
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-bool CheckEquality(const T* p, size_t n, T val) {
-  return std::all_of(p, p + n, [val](const T& v) { return v == val; });
-}
-
-template <int D1, int D2>
-bool FillConstantTestMain() {
-  static_assert(D1 >= D2, "");
-  std::array<int, D1> arr;
-  arr.fill(0);
-
-  UnrollFillConstant<D2>::Run(arr.data(), 1);
-  return CheckEquality(arr.data(), D2, 1) &&
-         CheckEquality(arr.data() + D2, arr.size() - D2, 0);
-}
-
-TEST(unroll_ops, fill_constant) {
-  EXPECT_TRUE((FillConstantTestMain<9, 0>()));
-  EXPECT_TRUE((FillConstantTestMain<9, 1>()));
-  EXPECT_TRUE((FillConstantTestMain<9, 4>()));
-  EXPECT_TRUE((FillConstantTestMain<9, 9>()));
-}
-
-TEST(unroll_ops, assign) {
-  const int a[] = {1, 2, 3, 4, 5};
-  int b[] = {0, 0, 0, 0, 0};
-  UnrollAssign<3>::Run(a, b);
-  EXPECT_EQ(b[0], 1);
-  EXPECT_EQ(b[1], 2);
-  EXPECT_EQ(b[2], 3);
-  EXPECT_EQ(b[3], 0);
-  EXPECT_EQ(b[4], 0);
-}
-
-TEST(unroll_ops, var_args_assign) {
-  int a[] = {0, 0, 0};
-  UnrollVarArgsAssign<int>::Run(a, 1, 2);
-  EXPECT_EQ(a[0], 1);
-  EXPECT_EQ(a[1], 2);
-  EXPECT_EQ(a[2], 0);
-}
-
-TEST(unroll_ops, compare) {
-  int a[] = {1, 2, 3};
-  int b[] = {1, 2, 4};
-  EXPECT_TRUE(UnrollCompare<2>::Run(a, b));
-  EXPECT_FALSE(UnrollCompare<3>::Run(a, b));
-
-  b[0] = -1;
-  EXPECT_TRUE(UnrollCompare<0>::Run(a, b));
-  EXPECT_FALSE(UnrollCompare<1>::Run(a, b));
-}
-
-TEST(unroll_ops, product) {
-  int a[] = {2, 3, 4};
-  EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc
deleted file mode 100644
index f3ea1f624ee836a483c37c2addb4d9766e87c107..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/var_desc.cc
+++ /dev/null
@@ -1,275 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <google/protobuf/util/message_differencer.h>
-
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-proto::VarType::Type VarDesc::GetType() const { return desc_.type().type(); }
-
-void VarDesc::SetType(proto::VarType::Type type) {
-  desc_.mutable_type()->set_type(type);
-}
-
-void VarDesc::SetShape(const std::vector<int64_t> &dims) {
-  VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
-}
-
-void VarDesc::SetTensorDescNum(size_t num) {
-  switch (desc_.type().type()) {
-    case proto::VarType::READER: {
-      auto *lod_tensors_ptr =
-          desc_.mutable_type()->mutable_reader()->mutable_lod_tensor();
-      lod_tensors_ptr->Clear();
-      for (size_t i = 0; i < num; ++i) {
-        lod_tensors_ptr->Add();
-      }
-      return;
-    } break;
-    default:
-      PADDLE_THROW(
-          "Setting 'sub_tensor_number' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
-
-size_t VarDesc::GetTensorDescNum() const {
-  switch (desc_.type().type()) {
-    case proto::VarType::READER:
-      return desc_.type().reader().lod_tensor_size();
-      break;
-    default:
-      PADDLE_THROW(
-          "Getting 'sub_tensor_number' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
-
-void VarDesc::SetShapes(
-    const std::vector<std::vector<int64_t>> &multiple_dims) {
-  if (multiple_dims.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
-    SetTensorDescNum(multiple_dims.size());
-  }
-  std::vector<proto::VarType::TensorDesc *> tensors = mutable_tensor_descs();
-  for (size_t i = 0; i < multiple_dims.size(); ++i) {
-    VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims());
-  }
-}
-
-std::vector<int64_t> VarDesc::GetShape() const {
-  return RepeatedToVector(tensor_desc().dims());
-}
-
-std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
-  std::vector<proto::VarType::TensorDesc> descs = tensor_descs();
-  std::vector<std::vector<int64_t>> res;
-  res.reserve(descs.size());
-  for (const auto &tensor_desc : descs) {
-    res.push_back(RepeatedToVector(tensor_desc.dims()));
-  }
-  return res;
-}
-
-void VarDesc::SetDataType(proto::VarType::Type data_type) {
-  mutable_tensor_desc()->set_data_type(data_type);
-}
-
-void VarDesc::SetDataTypes(
-    const std::vector<proto::VarType::Type> &multiple_data_type) {
-  if (multiple_data_type.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given data types("
-            << multiple_data_type.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
-    SetTensorDescNum(multiple_data_type.size());
-  }
-  std::vector<proto::VarType::TensorDesc *> tensor_descs =
-      mutable_tensor_descs();
-  for (size_t i = 0; i < multiple_data_type.size(); ++i) {
-    tensor_descs[i]->set_data_type(multiple_data_type[i]);
-  }
-}
-
-proto::VarType::Type VarDesc::GetDataType() const {
-  return tensor_desc().data_type();
-}
-
-std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const {
-  std::vector<proto::VarType::TensorDesc> descs = tensor_descs();
-  std::vector<proto::VarType::Type> res;
-  res.reserve(descs.size());
-  for (const auto &tensor_desc : descs) {
-    res.push_back(tensor_desc.data_type());
-  }
-  return res;
-}
-
-void VarDesc::SetLoDLevel(int32_t lod_level) {
-  switch (desc_.type().type()) {
-    case proto::VarType::LOD_TENSOR:
-      desc_.mutable_type()->mutable_lod_tensor()->set_lod_level(lod_level);
-      break;
-    case proto::VarType::LOD_TENSOR_ARRAY:
-      desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level);
-      break;
-    default:
-      PADDLE_THROW(
-          "Setting 'lod_level' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
-
-void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
-  if (multiple_lod_level.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given lod_levels("
-            << multiple_lod_level.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
-    SetTensorDescNum(multiple_lod_level.size());
-  }
-  switch (desc_.type().type()) {
-    case proto::VarType::READER: {
-      size_t i = 0;
-      for (auto &lod_tensor :
-           *desc_.mutable_type()->mutable_reader()->mutable_lod_tensor()) {
-        lod_tensor.set_lod_level(multiple_lod_level[i++]);
-      }
-    } break;
-    default:
-      PADDLE_THROW(
-          "Setting 'lod_levels' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
-
-int32_t VarDesc::GetLoDLevel() const {
-  switch (desc_.type().type()) {
-    case proto::VarType::LOD_TENSOR:
-      return desc_.type().lod_tensor().lod_level();
-    case proto::VarType::LOD_TENSOR_ARRAY:
-      return desc_.type().tensor_array().lod_level();
-    default:
-      PADDLE_THROW(
-          "Getting 'lod_level' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
-
-std::vector<int32_t> VarDesc::GetLoDLevels() const {
-  std::vector<int32_t> res;
-  switch (desc_.type().type()) {
-    case proto::VarType::READER:
-      res.reserve(desc_.type().reader().lod_tensor_size());
-      for (auto &lod_tensor : desc_.type().reader().lod_tensor()) {
-        res.push_back(lod_tensor.lod_level());
-      }
-      return res;
-      break;
-    default:
-      PADDLE_THROW(
-          "Getting 'lod_levels' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
-
-const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
-  switch (desc_.type().type()) {
-    case proto::VarType::SELECTED_ROWS:
-      return desc_.type().selected_rows();
-    case proto::VarType::LOD_TENSOR:
-      return desc_.type().lod_tensor().tensor();
-    case proto::VarType::LOD_TENSOR_ARRAY:
-      return desc_.type().tensor_array().tensor();
-    default:
-      PADDLE_THROW(
-          "Getting 'tensor_desc' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
-
-std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  std::vector<proto::VarType::TensorDesc> res;
-  res.reserve(GetTensorDescNum());
-  switch (desc_.type().type()) {
-    case proto::VarType::READER:
-      for (const auto &lod_tensor : desc_.type().reader().lod_tensor()) {
-        res.push_back(lod_tensor.tensor());
-      }
-      return res;
-    default:
-      PADDLE_THROW(
-          "Getting 'tensor_descs' is not supported by the type of var "
-          "%s.",
-          this->Name());
-  }
-}
-
-proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
-  switch (desc_.type().type()) {
-    case proto::VarType::SELECTED_ROWS:
-      return desc_.mutable_type()->mutable_selected_rows();
-    case proto::VarType::LOD_TENSOR:
-      return desc_.mutable_type()->mutable_lod_tensor()->mutable_tensor();
-    case proto::VarType::LOD_TENSOR_ARRAY:
-      return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor();
-    default:
-      PADDLE_THROW(
-          "Getting 'mutable_tensor_desc' is not supported by the type of var "
-          "%s.",
-          this->Name());
-  }
-}
-
-std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
-  std::vector<proto::VarType::TensorDesc *> res;
-  res.reserve(GetTensorDescNum());
-  switch (desc_.type().type()) {
-    case proto::VarType::READER:
-      for (auto &lod_tensor :
-           *desc_.mutable_type()->mutable_reader()->mutable_lod_tensor()) {
-        res.push_back(lod_tensor.mutable_tensor());
-      }
-      return res;
-    default:
-      PADDLE_THROW(
-          "Getting 'tensor_descs' is not supported by the type of var "
-          "%s.",
-          this->Name());
-  }
-}
-
-bool operator==(const VarDesc &left, const VarDesc &right) {
-  return left.Proto()->SerializeAsString() ==
-         right.Proto()->SerializeAsString();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h
deleted file mode 100644
index 7c82e1d68f21d81b20885f4f62683d16ec3c3975..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/var_desc.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/framework.pb.h"
-
-namespace paddle {
-namespace framework {
-
-// convert between std::vector and protobuf repeated.
-template <typename T>
-inline std::vector<T> RepeatedToVector(
-    const google::protobuf::RepeatedField<T> &repeated_field) {
-  std::vector<T> ret;
-  ret.reserve(repeated_field.size());
-  std::copy(repeated_field.begin(), repeated_field.end(),
-            std::back_inserter(ret));
-  return ret;
-}
-
-template <typename T, typename RepeatedField>
-inline void VectorToRepeated(const std::vector<T> &vec,
-                             RepeatedField *repeated_field) {
-  repeated_field->Clear();
-  repeated_field->Reserve(vec.size());
-  for (const auto &elem : vec) {
-    *repeated_field->Add() = elem;
-  }
-}
-
-// Specialize vector<bool>.
-template <typename RepeatedField>
-inline void VectorToRepeated(const std::vector<bool> &vec,
-                             RepeatedField *repeated_field) {
-  repeated_field->Clear();
-  repeated_field->Reserve(vec.size());
-  for (auto elem : vec) {
-    *repeated_field->Add() = elem;
-  }
-}
-
-class VarDesc {
- public:
-  explicit VarDesc(const std::string &name) {
-    desc_.set_name(name);
-    // TODO(paddle-dev): Why default to lodtensor.
-    desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR);
-  }
-
-  explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {}
-
-  proto::VarDesc *Proto() { return &desc_; }
-
-  const proto::VarDesc *Proto() const { return &desc_; }
-
-  std::string Name() const { return desc_.name(); }
-
-  void SetName(std::string name) { desc_.set_name(name); }
-
-  void SetTensorDescNum(size_t num);
-
-  size_t GetTensorDescNum() const;
-
-  void SetShape(const std::vector<int64_t> &dims);
-
-  void SetShapes(const std::vector<std::vector<int64_t>> &multiple_dims);
-
-  std::vector<int64_t> GetShape() const;
-
-  std::vector<std::vector<int64_t>> GetShapes() const;
-
-  void SetDataType(proto::VarType::Type data_type);
-
-  void SetDataTypes(
-      const std::vector<proto::VarType::Type> &multiple_data_type);
-
-  proto::VarType::Type GetDataType() const;
-
-  std::vector<proto::VarType::Type> GetDataTypes() const;
-
-  void SetLoDLevel(int32_t lod_level);
-
-  void SetLoDLevels(const std::vector<int32_t> &multiple_lod_level);
-
-  int32_t GetLoDLevel() const;
-
-  std::vector<int32_t> GetLoDLevels() const;
-
-  proto::VarType::Type GetType() const;
-
-  void SetType(proto::VarType::Type type);
-
-  bool Persistable() const { return desc_.persistable(); }
-
-  void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }
-
- private:
-  const proto::VarType::TensorDesc &tensor_desc() const;
-  std::vector<proto::VarType::TensorDesc> tensor_descs() const;
-  proto::VarType::TensorDesc *mutable_tensor_desc();
-  std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs();
-
-  proto::VarDesc desc_;
-};
-
-bool operator==(const VarDesc &left, const VarDesc &right);
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h
deleted file mode 100644
index 73be446f71f193bea203c986b482e6b98a9826c5..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/var_type.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type_traits.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-inline bool IsType(const std::type_index& type) {
-  return type == typeid(T);
-}
-
-inline proto::VarType::Type ToVarType(int type) {
-  switch (type) {
-    case proto::VarType::LOD_TENSOR:
-    case proto::VarType::SELECTED_ROWS:
-    case proto::VarType::LOD_RANK_TABLE:
-    case proto::VarType::LOD_TENSOR_ARRAY:
-    case proto::VarType::READER:
-      return static_cast<proto::VarType::Type>(type);
-    default:
-      PADDLE_THROW("ToVarType:Unsupported type %d", type);
-  }
-}
-
-template <typename Visitor>
-inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
-  switch (var.Type()) {
-    case proto::VarType::LOD_TENSOR:
-      visitor(var.Get<LoDTensor>());
-      return;
-    case proto::VarType::LOD_RANK_TABLE:
-      visitor(var.Get<LoDRankTable>());
-      return;
-    case proto::VarType::LOD_TENSOR_ARRAY:
-      visitor(var.Get<LoDTensorArray>());
-      return;
-    case proto::VarType::SELECTED_ROWS:
-      visitor(var.Get<SelectedRows>());
-      return;
-    case proto::VarType::READER:
-      visitor(var.Get<ReaderHolder>());
-      return;
-    default:
-      PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
deleted file mode 100644
index 66e6ac81623a1cd1c79981c1e4a97d974e9c2426..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/var_type_inference.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/type_defs.h"
-
-namespace paddle {
-namespace framework {
-
-class OpDesc;
-class BlockDesc;
-// default infer var type context
-class InferVarTypeContext {
- public:
-  InferVarTypeContext(const OpDesc* op, BlockDesc* block)
-      : op_(op), block_(block) {}
-
-  virtual ~InferVarTypeContext() {}
-
-  virtual Attribute GetAttr(const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(op_);
-    return op_->GetAttr(name);
-  }
-
-  virtual bool HasVar(const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    return block_->FindVarRecursive(name) != nullptr;
-  }
-
-  virtual bool HasInput(const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(op_);
-    auto& inputs = op_->Inputs();
-    auto input = inputs.find(name);
-    return input != inputs.end() && !input->second.empty();
-  }
-
-  virtual bool HasOutput(const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(op_);
-    auto& outputs = op_->Outputs();
-    auto output = outputs.find(name);
-    return output != outputs.end() && !output->second.empty();
-  }
-
-  virtual const std::vector<std::string>& Input(const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(op_);
-    return op_->Input(name);
-  }
-
-  virtual const std::vector<std::string>& Output(
-      const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(op_);
-    return op_->Output(name);
-  }
-
-  virtual proto::VarType::Type GetType(const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    return block_->FindRecursiveOrCreateVar(name).GetType();
-  }
-
-  virtual void SetType(const std::string& name, proto::VarType::Type type) {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    block_->FindRecursiveOrCreateVar(name).SetType(type);
-  }
-
-  virtual proto::VarType::Type GetDataType(const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    return block_->FindRecursiveOrCreateVar(name).GetDataType();
-  }
-
-  virtual void SetDataType(const std::string& name, proto::VarType::Type type) {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    block_->FindRecursiveOrCreateVar(name).SetDataType(type);
-  }
-
-  virtual std::vector<proto::VarType::Type> GetDataTypes(
-      const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    return block_->FindRecursiveOrCreateVar(name).GetDataTypes();
-  }
-
-  virtual void SetDataTypes(
-      const std::string& name,
-      const std::vector<proto::VarType::Type>& multiple_data_type) {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    block_->FindRecursiveOrCreateVar(name).SetDataTypes(multiple_data_type);
-  }
-
-  virtual std::vector<int64_t> GetShape(const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    return block_->FindRecursiveOrCreateVar(name).GetShape();
-  }
-
-  virtual void SetShape(const std::string& name,
-                        const std::vector<int64_t>& dims) {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    block_->FindRecursiveOrCreateVar(name).SetShape(dims);
-  }
-
-  virtual int32_t GetLoDLevel(const std::string& name) const {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    return block_->FindRecursiveOrCreateVar(name).GetLoDLevel();
-  }
-
-  virtual void SetLoDLevel(const std::string& name, int32_t lod_level) {
-    PADDLE_ENFORCE_NOT_NULL(block_);
-    block_->FindRecursiveOrCreateVar(name).SetLoDLevel(lod_level);
-  }
-
- protected:
-  const OpDesc* op_;
-  BlockDesc* block_;
-};
-
-class VarTypeInference {
- public:
-  virtual ~VarTypeInference() {}
-  virtual void operator()(InferVarTypeContext* context) const = 0;  // NOLINT
-};
-
-class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const final {  // NOLINT
-    auto in_out_var_names = this->GetInputOutputWithSameType();
-
-    for (auto& i_o_n : in_out_var_names) {
-      auto& x_name = ctx->Input(i_o_n.first).at(0);
-      auto& out_name = ctx->Output(i_o_n.second).at(0);
-
-      ctx->SetType(out_name, ctx->GetType(x_name));
-      ctx->SetDataType(out_name, ctx->GetDataType(x_name));
-    }
-  }
-
- protected:
-  virtual std::unordered_map<std::string, std::string>
-  GetInputOutputWithSameType() const = 0;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
deleted file mode 100644
index 6bbb25a573d076d5ec6d6fd960a304639e9e3d49..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/var_type_inference.h"
-#include <string>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-
-class NOP : public OperatorBase {
- public:
-  NOP(const std::string &type, const VariableNameMap &inputs,
-      const VariableNameMap &outputs, const AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const Scope &scope,
-               const platform::Place &place) const override {}
-};
-
-class SumOpMaker : public OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "").AsDuplicable();
-    AddOutput("Out", "");
-    AddComment("");
-  }
-};
-
-class SumOpVarTypeInference : public VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto &inputs = ctx->Input("X");
-    auto default_var_type = proto::VarType::SELECTED_ROWS;
-
-    bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
-          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
-        });
-    if (any_input_is_lod_tensor) {
-      default_var_type = proto::VarType::LOD_TENSOR;
-    }
-
-    auto out_var_name = ctx->Output("Out").front();
-    ctx->SetType(out_var_name, default_var_type);
-  }
-};
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
-                  paddle::framework::SumOpVarTypeInference);
-REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
-                  paddle::framework::SumOpMaker);
-
-namespace paddle {
-namespace framework {
-
-TEST(InferVarType, sum_op) {
-  ProgramDesc prog;
-  auto *op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("sum");
-  op->SetInput("X", {"test_a", "test_b", "test_c"});
-  op->SetOutput("Out", {"test_out"});
-
-  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_out");
-
-  op->InferVarType(prog.MutableBlock(0));
-
-  ASSERT_EQ(proto::VarType::SELECTED_ROWS,
-            prog.MutableBlock(0)->Var("test_out")->GetType());
-
-  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR);
-  op->InferVarType(prog.MutableBlock(0));
-  ASSERT_EQ(proto::VarType::LOD_TENSOR,
-            prog.MutableBlock(0)->Var("test_out")->GetType());
-}
-
-TEST(InferVarType, sum_op_without_infer_var_type) {
-  ProgramDesc prog;
-  auto *op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("sum_without_infer_var_type");
-  op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
-  op->SetOutput("Out", {"test2_out"});
-
-  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test2_out");
-
-  op->InferVarType(prog.MutableBlock(0));
-
-  ASSERT_EQ(proto::VarType::LOD_TENSOR,
-            prog.MutableBlock(0)->Var("test2_out")->GetType());
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc
deleted file mode 100644
index 7cc2b3b42258942e6016486f7cf7ecfcae92b91c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/var_type_traits.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/var_type_traits.h"
-#include <unordered_map>
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
-#include "paddle/fluid/platform/macros.h"
-#ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-#include <cudnn.h>
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
-#include "paddle/fluid/operators/cudnn_rnn_cache.h"
-#endif
-
-namespace paddle {
-namespace framework {
-
-// Besides registering variable type id, it is helpful to register a
-// var_id -> std::type_index map (for example, get type names according to id)
-namespace detail {
-
-template <int kStart, int kEnd, bool kStop>
-struct VarIdToTypeIndexMapInitializerImpl {
-  template <typename MapType1, typename MapType2>
-  static void Init(MapType1 *id_to_type, MapType2 *type_to_id) {
-    using Type =
-        typename std::tuple_element<kStart, VarTypeRegistry::ArgTuple>::type;
-    static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
-    constexpr int kId = VarTypeTrait<Type>::kId;
-    auto type = std::type_index(typeid(Type));
-    PADDLE_ENFORCE(id_to_type->count(kId) == 0,
-                   "Registered duplicate type id %d for type %s", kId,
-                   type.name());
-    PADDLE_ENFORCE(type_to_id->count(type) == 0,
-                   "Registered duplicate type_index %s for id %d", type.name(),
-                   kId);
-    id_to_type->emplace(kId, type);
-    type_to_id->emplace(type, kId);
-    VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
-                                       kStart + 1 == kEnd>::Init(id_to_type,
-                                                                 type_to_id);
-  }
-};
-
-template <int kStart, int kEnd>
-struct VarIdToTypeIndexMapInitializerImpl<kStart, kEnd, true> {
-  template <typename MapType1, typename MapType2>
-  static void Init(MapType1 *, MapType2 *) {}
-};
-
-// VarIdToTypeIndexMapInitializer is designed to initialize var_id ->
-// std::type_index map and std::type_index -> var_id map
-using VarIdToTypeIndexMapInitializer =
-    VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum,
-                                       VarTypeRegistry::kRegisteredTypeNum ==
-                                           0>;
-
-struct VarIdToTypeIndexMapHolder {
-  DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder);
-
- public:
-  static const std::type_index &ToTypeIndex(int var_id) {
-    auto it = Instance().id_to_type_map_.find(var_id);
-    PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
-                   "VarId %d is not registered.", var_id);
-    return it->second;
-  }
-
-  static int ToTypeId(const std::type_index &type) {
-    auto it = Instance().type_to_id_map_.find(type);
-    PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
-                   "VarType %s is not registered.", type.name());
-    return it->second;
-  }
-
- private:
-  VarIdToTypeIndexMapHolder() {
-    VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_);
-  }
-
-  static const VarIdToTypeIndexMapHolder &Instance() {
-    static const VarIdToTypeIndexMapHolder instance;
-    return instance;
-  }
-
-  std::unordered_map<int, std::type_index> id_to_type_map_;
-  std::unordered_map<std::type_index, int> type_to_id_map_;
-};
-
-}  // namespace detail
-
-const std::type_index &VarTraitIdToTypeIndex(int var_id) {
-  return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
-}
-
-const char *ToTypeName(int var_id) {
-  return VarTraitIdToTypeIndex(var_id).name();
-}
-
-int TypeIndexToVarTraitId(const std::type_index &type) {
-  return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
deleted file mode 100644
index 7147f06233cb9d435d8be62814df0a3891b729fb..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/var_type_traits.h
+++ /dev/null
@@ -1,191 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <tuple>
-#include <typeindex>
-#include <vector>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#include <cudnn.h>
-#ifndef _WIN32
-#include <nccl.h>
-#endif
-#endif
-
-// Users should add forward declarations here
-namespace paddle {
-
-namespace platform {
-#ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
-class Communicator;
-class NCCLCommunicator;
-#endif
-#endif
-}  // namespace platform
-
-namespace framework {
-class Tensor;
-class LoDTensor;
-class SelectedRows;
-class LoDRankTable;
-class ReaderHolder;
-class Scope;
-}  // namespace framework
-
-namespace operators {
-
-class CudnnRNNCache;
-
-namespace reader {
-class LoDTensorBlockingQueueHolder;
-}  // namespace reader
-}  // namespace operators
-
-}  // namespace paddle
-
-namespace paddle {
-namespace framework {
-
-const char *ToTypeName(int var_id);
-const std::type_index &VarTraitIdToTypeIndex(int var_id);
-int TypeIndexToVarTraitId(const std::type_index &type);
-
-namespace detail {
-
-template <bool kStop, int kStart, int kEnd, typename T1, typename T2,
-          typename... Args>
-struct TypePosFinderImpl {
-  static constexpr int kPos =
-      std::is_same<T1, T2>::value
-          ? kStart
-          : TypePosFinderImpl<kStart + 2 == kEnd, kStart + 1, kEnd, T1,
-                              Args...>::kPos;
-};
-
-template <int kStart, int kEnd, typename T1, typename T2>
-struct TypePosFinderImpl<true, kStart, kEnd, T1, T2> {
-  static constexpr int kPos = std::is_same<T1, T2>::value ? kStart : -1;
-};
-
-// TypePosFinder helps to find the position in which T is inside Args...
-// If T is not inside Args..., kPos would be -1
-template <typename T, typename... Args>
-struct TypePosFinder {
-  static constexpr int kPos =
-      TypePosFinderImpl<sizeof...(Args) == 1, 0, sizeof...(Args), T,
-                        Args...>::kPos;
-};
-
-template <typename... Args>
-struct VarTypeRegistryImpl {
-  static constexpr size_t kRegisteredTypeNum = sizeof...(Args);
-  using ArgTuple = std::tuple<Args...>;
-
-  // TypePos() returns the position in which T is inside Args...
-  // If T is not inside Args..., return -1
-  template <typename T>
-  static constexpr int TypePos() {
-    return TypePosFinder<T, Args...>::kPos;
-  }
-
-  // IsRegistered() returns whether T is registered inside RegistryImpl
-  template <typename T>
-  static constexpr bool IsRegistered() {
-    return TypePos<T>() >= 0;
-  }
-};
-
-}  // namespace detail
-
-#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id)           \
-  template <>                                              \
-  struct VarTypeTrait<type> {                              \
-    static_assert(VarTypeRegistry::IsRegistered<type>(),   \
-                  "Must be registered type");              \
-    using Type = type;                                     \
-    static constexpr int kId = static_cast<int>(proto_id); \
-  }
-
-/**
- * The following codes are designed to register variable types.
- * Only registered types can be stored in Variable.
- * This registry mechanism is designed to speed up Variable.
- *
- * Caution: If you want to add more var types, please consider carefully
- * whether you really need to add it.
- */
-
-// Users should add other variable types below.
-// Paddle would generate unique Ids for each registered variable types.
-using VarTypeRegistry = detail::VarTypeRegistryImpl<
-    Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
-    LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
-    std::map<size_t, Tensor>, operators::reader::LoDTensorBlockingQueueHolder,
-#ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
-    ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
-#endif
-    operators::CudnnRNNCache,
-#endif
-    int, float>;
-
-template <typename T>
-struct VarTypeTrait {
-  static_assert(VarTypeRegistry::IsRegistered<T>(), "Must be registered type");
-  using Type = T;
-  /**
-   * Unique VarType Id generation.
-   *
-   * The auto-generated id should not be the same as any protobuf id defined in
-   * framework.proto. Therefore, we generate id by adding the type pos and
-   * maximum protobuf id (i.e., proto::VarType::TUPLE).
-   *
-   * However, we may need more protobuf id in the future.
-   * To avoid changing this auto id generation algorithm frequently, we
-   * generate id by adding the type pos and twice of maximum protobuf id (i.e.,
-   * proto::VarType::TUPLE).
-   */
-  static constexpr int kId = VarTypeRegistry::TypePos<T>() +
-                             static_cast<int>(proto::VarType::TUPLE) * 2;
-};
-
-// Users should set some of variable type ids to be what is defined in
-// framework.proto below
-REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR);
-REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS);
-REG_PROTO_VAR_TYPE_TRAIT(std::vector<Scope *>, proto::VarType::STEP_SCOPES);
-REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
-REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
-REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
-REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
-REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
-REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
-
-/** End of variable type registration */
-
-template <typename T>
-inline constexpr bool IsRegisteredVarType() {
-  return VarTypeRegistry::IsRegistered<T>();
-}
-
-#undef REG_PROTO_VAR_TYPE_TRAIT
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc
deleted file mode 100644
index 67dbfd740ed9b71fa06b684c14720ae2814fe11c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include <cstdint>
-#include <iostream>
-#include <unordered_set>
-
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type_traits.h"
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
-#ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
-#include "paddle/fluid/operators/cudnn_rnn_cache.h"
-#endif
-
-namespace paddle {
-namespace framework {
-
-template <int kPos, int kEnd, bool kStop>
-struct TypeIndexChecker {
-  template <typename SetType1, typename SetType2>
-  static void Check(SetType1 *var_id_set, SetType2 *type_index_set) {
-    using Type =
-        typename std::tuple_element<kPos, VarTypeRegistry::ArgTuple>::type;
-    static_assert(std::is_same<typename VarTypeTrait<Type>::Type, Type>::value,
-                  "Type must be the same");
-    constexpr auto kId = VarTypeTrait<Type>::kId;
-    std::type_index actual_type(typeid(Type));
-    EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
-    EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type);
-    EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId);
-    EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)),
-              actual_type);
-    EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId);
-
-    EXPECT_TRUE(var_id_set->count(kId) == 0);              // NOLINT
-    EXPECT_TRUE(type_index_set->count(actual_type) == 0);  // NOLINT
-    var_id_set->insert(kId);
-    type_index_set->insert(std::type_index(typeid(Type)));
-    TypeIndexChecker<kPos + 1, kEnd, kPos + 1 == kEnd>::Check(var_id_set,
-                                                              type_index_set);
-  }
-};
-
-template <int kPos, int kEnd>
-struct TypeIndexChecker<kPos, kEnd, true> {
-  template <typename SetType1, typename SetType2>
-  static void Check(SetType1 *, SetType2 *) {}
-};
-
-TEST(var_type_traits, check_no_duplicate_registry) {
-  constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum;
-  std::unordered_set<int> var_id_set;
-  std::unordered_set<std::type_index> type_index_set;
-  TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check(
-      &var_id_set, &type_index_set);
-}
-
-template <typename T>
-bool CheckVarId(int proto_id) {
-  static_assert(std::is_same<typename VarTypeTrait<T>::Type, T>::value,
-                "Type must be the same");
-  return VarTypeTrait<T>::kId == proto_id;
-}
-
-TEST(var_type_traits, check_proto_type_id) {
-  ASSERT_TRUE(CheckVarId<LoDTensor>(proto::VarType::LOD_TENSOR));
-  ASSERT_TRUE(CheckVarId<SelectedRows>(proto::VarType::SELECTED_ROWS));
-  ASSERT_TRUE(CheckVarId<std::vector<Scope *>>(proto::VarType::STEP_SCOPES));
-  ASSERT_TRUE(CheckVarId<LoDRankTable>(proto::VarType::LOD_RANK_TABLE));
-  ASSERT_TRUE(CheckVarId<LoDTensorArray>(proto::VarType::LOD_TENSOR_ARRAY));
-  ASSERT_TRUE(CheckVarId<platform::PlaceList>(proto::VarType::PLACE_LIST));
-  ASSERT_TRUE(CheckVarId<ReaderHolder>(proto::VarType::READER));
-  ASSERT_TRUE(CheckVarId<int>(proto::VarType::INT32));
-  ASSERT_TRUE(CheckVarId<float>(proto::VarType::FP32));
-
-  ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR);
-  ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS);
-  ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES);
-  ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE);
-  ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY,
-            proto::VarType::LOD_TENSOR_ARRAY);
-  ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST);
-  ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER);
-  ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH);
-  ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST);
-  ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW);
-  ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE);
-  ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32);
-  ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32);
-}
-
-TEST(var_type_traits, test_registry) {
-  using Registry = detail::VarTypeRegistryImpl<int8_t, int32_t, size_t, double>;
-  ASSERT_TRUE(Registry::TypePos<int8_t>() == 0);
-  ASSERT_TRUE(Registry::TypePos<int32_t>() == 1);
-  ASSERT_TRUE(Registry::TypePos<size_t>() == 2);
-  ASSERT_TRUE(Registry::TypePos<double>() == 3);
-  ASSERT_TRUE(Registry::TypePos<float>() == -1);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h
deleted file mode 100644
index b9d07da822cf1eb42859e1d7d84437582fada8ff..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/variable.h
+++ /dev/null
@@ -1,104 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <memory>
-#include <string>
-#include <typeindex>
-#include <typeinfo>
-
-#include "paddle/fluid/framework/var_type_traits.h"
-
-namespace paddle {
-namespace framework {
-
-class Variable {
- public:
-  template <typename T>
-  const T& Get() const {
-    static_assert(
-        IsRegisteredVarType<T>(),
-        "Not registered type. Please register T inside var_type_traits.h");
-    PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing");
-    PADDLE_ENFORCE(holder_->Type() == VarTypeTrait<T>::kId,
-                   "Variable must be type %s, the holding type is %s",
-                   ToTypeName(VarTypeTrait<T>::kId),
-                   ToTypeName(holder_->Type()));
-    return *static_cast<const T*>(holder_->Ptr());
-  }
-
-  bool IsInitialized() const { return holder_ != nullptr; }
-
-  template <typename T>
-  T* GetMutable() {
-    if (!holder_) {
-      holder_.reset(new PlaceholderImpl<T>());
-    } else {
-      PADDLE_ENFORCE(holder_->Type() == VarTypeTrait<T>::kId,
-                     "Variable must be type %s, the holding type is %s",
-                     ToTypeName(VarTypeTrait<T>::kId),
-                     ToTypeName(holder_->Type()));
-    }
-    return static_cast<T*>(holder_->Ptr());
-  }
-
-  template <typename T>
-  bool IsType() const {
-    return holder_ && holder_->Type() == VarTypeTrait<T>::kId;
-  }
-
-  void Clear() { holder_.reset(); }
-
-  int Type() const {
-    PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
-    return holder_->Type();
-  }
-
- private:
-  struct Placeholder {
-    virtual ~Placeholder() = default;
-
-    inline int Type() const { return type_; }
-    inline const void* Ptr() const { return ptr_; }
-    inline void* Ptr() { return ptr_; }
-
-   protected:
-    inline void Init(void* p, int type) {
-      ptr_ = p;
-      type_ = type;
-    }
-
-    void* ptr_;
-    int type_;
-  };
-
-  // Placeholder hides type T, so it doesn't appear as a template
-  // parameter of Variable.
-  template <typename T>
-  struct PlaceholderImpl : public Placeholder {
-    static_assert(
-        IsRegisteredVarType<T>(),
-        "Not registered type. Please register T inside var_type_traits.h");
-    PlaceholderImpl() { this->Init(&obj_, VarTypeTrait<T>::kId); }
-
-   private:
-    T obj_;
-  };
-
-  // pointers to a PlaceholderImpl object indeed.
-  std::unique_ptr<Placeholder> holder_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc
deleted file mode 100644
index 65c939af173a8a2a22d69c636de355293f95dec6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/variable_helper.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/variable_helper.h"
-
-#include <vector>
-
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
-  if (var_type == proto::VarType::LOD_TENSOR) {
-    var->GetMutable<LoDTensor>();
-  } else if (var_type == proto::VarType::SELECTED_ROWS) {
-    var->GetMutable<SelectedRows>();
-  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope *>>();
-  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
-    var->GetMutable<LoDRankTable>();
-  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
-    var->GetMutable<LoDTensorArray>();
-  } else if (var_type == proto::VarType::PLACE_LIST) {
-    var->GetMutable<platform::PlaceList>();
-  } else if (var_type == proto::VarType::READER) {
-    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::RAW) {
-    // GetMutable will be called in operator
-  } else {
-    PADDLE_THROW(
-        "Variable type %d is not in "
-        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
-        var_type);
-  }
-}
-
-void CopyVariable(const Variable &src_var, Variable *dst_var) {
-  // only support cpu now
-  auto cpu_place = platform::CPUPlace();
-
-  if (src_var.IsType<framework::LoDTensor>()) {
-    auto *tmp_grad_tensor = dst_var->GetMutable<framework::LoDTensor>();
-    auto &src_tensor = src_var.Get<framework::LoDTensor>();
-    tmp_grad_tensor->set_lod(src_tensor.lod());
-    framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor);
-  } else if (src_var.IsType<framework::SelectedRows>()) {
-    auto &src_slr = src_var.Get<framework::SelectedRows>();
-    auto *tmp_grad_slr = dst_var->GetMutable<framework::SelectedRows>();
-    tmp_grad_slr->set_rows(src_slr.rows());
-    tmp_grad_slr->set_height(src_slr.height());
-    auto &src_t = src_slr.value();
-    auto *dst_t = tmp_grad_slr->mutable_value();
-    framework::TensorCopy(src_t, cpu_place, dst_t);
-  } else {
-    PADDLE_THROW("unknown var type to copy");
-  }
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h
deleted file mode 100644
index 5a2c267b7388f6c2de89054dc480fd74b4544bed..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/variable_helper.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/variable.h"
-namespace paddle {
-namespace framework {
-
-void InitializeVariable(Variable* var, proto::VarType::Type var_type);
-void CopyVariable(const Variable& src_var, Variable* dst_var);
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc
deleted file mode 100644
index 511c9c52146ece4b90905cc9d49565103589c1ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/variable_test.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <string>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace framework {
-
-TEST(Variable, GetMutable) {
-  std::unique_ptr<Variable> v(new Variable());
-
-  auto* t = v->GetMutable<std::string>();
-  *t = "1234";
-
-  const auto& tt = v->Get<std::string>();
-  EXPECT_EQ("1234", tt);
-
-  try {
-    v->GetMutable<Tensor>();
-  } catch (std::exception& e) {
-    return;
-  }
-  EXPECT_TRUE(false);
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc
deleted file mode 100644
index 81c0392bf3cc7378cec06a9de3ae81f2b221ecec..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/version.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/version.h"
-#include <algorithm>
-
-namespace paddle {
-namespace framework {
-bool IsProgramVersionSupported(int64_t version) {
-  static int num_supported =
-      sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]);
-  return std::find(kSupportedProgramVersion,
-                   kSupportedProgramVersion + num_supported,
-                   version) != kSupportedProgramVersion + num_supported;
-}
-
-bool IsTensorVersionSupported(uint32_t version) {
-  static int num_supported =
-      sizeof(kSupportedTensorVersion) / sizeof(kSupportedTensorVersion[0]);
-  return std::find(kSupportedTensorVersion,
-                   kSupportedTensorVersion + num_supported,
-                   version) != kSupportedTensorVersion + num_supported;
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h
deleted file mode 100644
index 9945bc58c69df8456ff3d1aa0c777970bdbdbf98..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/version.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cstdint>
-
-#pragma once
-
-namespace paddle {
-namespace framework {
-
-// Note:
-// Program and Tensor that pass the IsXXXVersionSupported should
-// be supported by the current codes. Otherwise, it's a compatibility
-// bug.
-
-// The program version the current codes generate.
-constexpr int64_t kCurProgramVersion = 0;
-
-// The program version that was generated by previous or current codes
-// and supported by current codes.
-constexpr int64_t kSupportedProgramVersion[] = {0};
-
-// Due to historical reasons, tensor version use uint32_t.
-// The tensor version the current codes generate.
-constexpr uint32_t kCurTensorVersion = 0;
-
-// The tensor version that was generated by previous or current codes
-// and supported by current codes.
-constexpr uint32_t kSupportedTensorVersion[] = {0};
-
-bool IsProgramVersionSupported(int64_t version);
-
-bool IsTensorVersionSupported(uint32_t version);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc
deleted file mode 100644
index e8c5f256000522af976bbf487741a586f1abc439..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/version_test.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/version.h"
-#include "gtest/gtest.h"
-
-namespace paddle {
-namespace framework {
-TEST(Version, Basic) {
-  EXPECT_TRUE(IsProgramVersionSupported(0));
-  EXPECT_FALSE(IsProgramVersionSupported(1));
-  EXPECT_FALSE(IsProgramVersionSupported(-1));
-
-  EXPECT_TRUE(IsTensorVersionSupported(0));
-  EXPECT_FALSE(IsTensorVersionSupported(1));
-  EXPECT_FALSE(IsTensorVersionSupported(-1));
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
deleted file mode 100644
index 5ba7c32d01fce2c75007ac1026278f4a7689ef55..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-cc_library(imperative_flag SRCS flags.cc DEPS gflags) 
-
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
-cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows var_type_traits layer)
-cc_library(tracer SRCS tracer.cc DEPS layer engine)
-cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator)
-cc_library(imperative_profiler SRCS profiler.cc)
-cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
-
-add_subdirectory(tests)
diff --git a/paddle/fluid/imperative/README.md b/paddle/fluid/imperative/README.md
deleted file mode 100644
index 4c4d619b35a9fd67231071ecca791c9df670fea1..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/README.md
+++ /dev/null
@@ -1,212 +0,0 @@
-# Overview
-
-Imperative Programming is easier to learn, debug and try new ideas.
-
-# Related Works
-
-## Pytorch
-https://pytorch.org/
-
-## TensorFlow Eager
-https://www.tensorflow.org/guide/eager
-
-# Design
-
-## API
-```python
-class Layer(object):
-
-  def __call__(inputs):
-    # build some parameter once.
-    # ...
-    return self.apply(inputs):
-
-  def forward(inputs):
-    # forward logic with paddle operators. backward auto-generated.
-
-
-class PyLayer(core.PyLayer):
-
-  def __call__(cls, inputs):
-    # trace the logic.
-
-  @staticmethod
-  def forward(inputs):
-    # any forward logic implemented with numpy io.
-
-  @staticmethod
-  def backward(inputs):
-    # any backward logic implemented with numpy io.
-
-```
-
-
-## Tracer
-
-Current: Python Variable -> C++ VarBase -> C++ Variable -> C++ Tensor
-
-Longer term.
-```python
-
-# Parent class.
-class PyVarBase(object):
-  pass
-
-# Current python variable.
-class Variable(PyVarBase):
-  pass
-
-class IVariable(PyVarBase):
-  def __init__(self):
-    self._ivar = core.VarBase()
-
-  # Move var to a device.
-  def to(device): pass
-  # Get var value.
-  def value(): pass
-  # Trigger backward.
-  def backward(): pass
-  # Get var's gradient value.
-  def gradient_value(): pass
-  # operators to override.
-```
-
-
-
-```cpp
-class Tracer {
- public:
-  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
-
-  virtual ~Tracer() {}
-
-  void Trace(OpBase* op,
-             const std::map<std::string, std::vector<VarBase*>>& inputs,
-             const std::map<std::string, std::vector<VarBase*>>& outputs,
-             framework::BlockDesc* block, const bool stop_gradient = false);
-
-  std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
-                                bool stop_gradient = false);
-};
-```
-
-* Trace forward operations
-* Perform quick shape/type infer, push kernel execution engine and return to user.
-* Perform autograd to generate gradients.
-* Clear trace.
-* Apply gradients with optimizers
-
-## Autodiff
-
-Lots of research already.
-https://autodiff-workshop.github.io/
-https://en.wikipedia.org/wiki/Automatic_differentiation
-
-Basically, trace the forward execution, and perform autodiff
-when needed.
-
-* Can be triggered by `backward()`.
-* Can select a block of code to trace and autodiff.
-* Use `require_grad` to drop some forward subgraph that doesn't need autodiff.
-
-## Execution Engine
-
-Lazy execution of pushed C++ operations.
-
-## Device Placement
-
-* Operator executes on the inputs' device.
-* All inputs should live on the same device.
-* use `Var.to()` to explicitly move var to a device.
-
-## Save/Load Models
-
-TODO
-
-## I/O
-
-TODO
-
-## Refactor
-
-* All function layers with parameters converted to class Layers.
-* Existing models converted to imperative mode.
-* All op tests run once in static graph, once in imperative mode.
-
-# Examples
-
-```python
-class MyLayer(fluid.imperative.Layer):
-    def __init__(self):
-        super(MyLayer, self).__init__()
-
-    def forward(self, inputs):
-        x = fluid.layers.relu(inputs)
-        x = fluid.layers.elementwise_mul(x, x)
-        x = fluid.layers.reduce_sum(x)
-        return [x]
-
-
-class MyPyLayer(fluid.imperative.PyLayer):
-    def __init__(self):
-        super(MyPyLayer, self).__init__()
-
-    @staticmethod
-    def forward(inputs):
-        return np.tanh(inputs[0])
-
-    @staticmethod
-    def backward(inputs):
-        return np.array(dout) * (1 - np.square(np.array(out)))
-
-
-np_inp = np.ones([2, 2], np.float32)
-with fluid.imperative.guard():
-    my_py_layer = MyPyLayer()
-    outs = my_py_layer(np_inp)
-    dy_out = np.sum(outs[0]._numpy())
-    outs[0]._backward()
-    dy_grad = var_inp._gradient()
-
-
-class MLP(fluid.imperative.Layer):
-    def __init__(self):
-        super(MLP, self).__init__()
-        self._fc1 = FC(3,
-                       fluid.ParamAttr(
-                           initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = FC(4,
-                       fluid.ParamAttr(
-                           initializer=fluid.initializer.Constant(value=0.1)))
-
-    def forward(self, inputs):
-        x = self._fc1(inputs)
-        x = self._fc2(x)
-        x = fluid.layers.reduce_sum(x)
-        return x
-
-
- np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
- with fluid.imperative.guard():
-     var_inp = fluid.imperative.base.to_variable(np_inp)
-     mlp = MLP()
-     out = mlp(var_inp)
-     dy_out = out._numpy()
-     out._backward()
-```
-
-# Plan
-
-2.1，3 fulltime, Can run a few simple models. (Currently, 2 20% engs)
-
-4.1, 4 fulltime, Can run 6 models, Performance 70% Pytorch. Release alpha.
-
-6.1, 5 fulltime, Performance close to Pytorch, can run multi-devices. Release Beta.
-
-8.1, 5 fulltime, Works in general. Update existing models. Can compile to static graph, support more optimizations.
-
-12.1 Done.
-
-# Discussion
-
-TODO.
diff --git a/paddle/fluid/imperative/backward_strategy.h b/paddle/fluid/imperative/backward_strategy.h
deleted file mode 100644
index 0f04d6db8e63d5d069745ed1895df774e69d60d0..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/backward_strategy.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Created by Jiabin on 2019-04-25.
-//
-#pragma once
-
-namespace paddle {
-namespace imperative {
-namespace detail {
-
-struct BackwardStrategy {
-  /* DyGraph now support two kinds of backward strategy, one is sorted sum
-   * gradient, another is sum gradient once they are created */
-  // TODO(jiabin): add more Strategy when we support
-  bool sorted_sum_gradient_{false};
-};
-
-}  // namespace detail
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc
deleted file mode 100644
index 3a41bafbfc4c81d0fba3f07db23b3e7f2b670f79..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/engine.cc
+++ /dev/null
@@ -1,254 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/engine.h"
-
-#include <algorithm>
-#include <memory>
-#include <queue>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/imperative/gradient_accumulator.h"
-#include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/imperative/tracer.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace imperative {
-
-void Engine::RunOp(paddle::imperative::OpBase* op,
-                   const paddle::imperative::NameVarBaseMap& ins,
-                   const paddle::imperative::NameVarBaseMap& outs,
-                   const paddle::platform::Place& place) {
-  platform::RecordEvent event(op->Type());
-
-  op->Run(ins, outs);
-}
-
-void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) {
-  backward_strategy_ = strategy;
-  const std::vector<OpBase*> ops = var->GradVarBase()->GradOps();
-  var->ClearGradOps();
-
-  if (ops.empty()) {
-    VLOG(3) << "Skip auto grad since there is no grad op for var: "
-            << var->Name();
-    return;
-  } else {
-    bool valid = false;
-    for (const auto& op : ops) {
-      if (op) {
-        valid = true;
-      }
-    }
-    if (!valid) {
-      VLOG(3) << "Skip auto grad since all grad op of start VarBase is nullptr";
-      return;
-    }
-  }
-  init_ops_ = ops;
-  platform::RecordEvent record_event("Imperative Backward");
-  VLOG(3) << "start backward";
-
-  PADDLE_ENFORCE_EQ(var->HasGradVar(), true,
-                    "Grad variable not exist for variable %s", var->Name());
-
-  auto& fwd_var = var->Var().Get<framework::LoDTensor>();
-  auto* grad_var =
-      var->GradVarBase()->MutableVar()->GetMutable<framework::LoDTensor>();
-  VLOG(6) << "init loss grad:" << var->GradVarBase()->Name()
-          << " as stop_gradient false";
-  var->GradVarBase()->InnerSetOverridedStopGradient(false);
-  var->GradVarBase()->SetGradGenerated(true);
-  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place());
-  grad_var->Resize(fwd_var.dims());
-  grad_var->mutable_data(fwd_var.place(), fwd_var.type());
-  operators::math::set_constant(*dev_ctx, grad_var, 1.0);
-}
-
-void BasicEngine::CheckBackwardInputs(OpBase* op) {
-  for (auto& pair : op->GetInsMap()) {
-    for (auto& var : pair.second) {
-      if (var && IsGrad(var.get())) {
-        // if grad var has OverridedStopGradient skip this Op
-        if (!var->GradGenerated()) {
-          VLOG(6) << "Set ungenerated Grad: " << var->Name() << " as zero";
-          auto* dev_ctx =
-              platform::DeviceContextPool::Instance().Get(op->place());
-          auto* tensor = var->MutableVar()->GetMutable<framework::LoDTensor>();
-          tensor->mutable_data(op->place(), var->DataType());
-          operators::math::set_constant(*dev_ctx, tensor, 0.0);
-        } else {
-          continue;
-        }
-      }
-    }
-  }
-}
-
-void BasicEngine::SetBackwardOutputs(paddle::imperative::OpBase* op) {
-  for (auto& pair : op->GetOutsMap()) {
-    for (auto& var : pair.second) {
-      if (var) {
-        // Set Backward outputs's generate_grad as true
-        var->SetGradGenerated(true);
-        VLOG(6) << "Set backward output: " << var->Name()
-                << "'s SetGeneratedGrad as True";
-      }
-    }
-  }
-}
-void BasicEngine::PrepareGradAccumulators(OpBase* op) {
-  for (const auto& pair : op->GetOutsMap()) {
-    for (const auto& var : pair.second) {
-      if (!var) continue;
-
-      auto& accumulator = accumulators_[var.get()];
-      if (!accumulator) {
-        if (backward_strategy_.sorted_sum_gradient_) {
-          accumulator.reset(new SortedGradientAccumulator(var.get()));
-        } else {
-          accumulator.reset(new EagerGradientAccumulator(var.get()));
-        }
-      }
-
-      accumulator->IncreaseRefCnt();
-
-      VLOG(3) << "Prepare to acccumulate variable grad " << var->Name()
-              << "with reference count " << accumulator->RefCnt();
-    }
-  }
-}
-
-void BasicEngine::PrepareDeps() {
-  PADDLE_ENFORCE_EQ(op_deps_.empty(), true, "Op deps must be initialized here");
-  PADDLE_ENFORCE_EQ(accumulators_.empty(), true,
-                    "Accumulators must be initialized here");
-
-  std::queue<OpBase*> q;
-  std::unordered_set<OpBase*> visited;
-  for (const auto& init_op : init_ops_) {
-    q.push(init_op);
-    visited.insert(init_op);
-  }
-
-  while (!q.empty()) {
-    auto* cur_op = q.front();
-    q.pop();
-    VLOG(3) << "Checking grads of op " << cur_op->Type();
-
-    CheckBackwardInputs(cur_op);
-
-    SetBackwardOutputs(cur_op);
-
-    PrepareGradAccumulators(cur_op);
-
-    auto& grad_pending_ops = cur_op->GradPendingOps();
-    for (auto* grad_pending_op : grad_pending_ops) {
-      PADDLE_ENFORCE_NOT_NULL(grad_pending_op);
-      ++op_deps_[grad_pending_op];
-      if (visited.count(grad_pending_op) == 0) {
-        visited.insert(grad_pending_op);
-        q.push(grad_pending_op);
-      }
-    }
-  }
-}
-
-void BasicEngine::SumGradient(OpBase* op, std::shared_ptr<VarBase> src,
-                              VarBase* dst) {
-  auto iter = accumulators_.find(dst);
-  PADDLE_ENFORCE_EQ(iter != accumulators_.end(), true,
-                    "Cannot find gradient of variable %s", dst->Name());
-  iter->second->Add(std::move(src), op->id());
-}
-void BasicEngine::Execute() {
-  PrepareDeps();
-  // Start execute Computation graph
-  std::queue<OpBase*> q;
-  for (const auto& init_op : init_ops_) {
-    q.push(init_op);
-  }
-  while (!q.empty()) {
-    OpBase* cur_op = q.front();
-    q.pop();
-
-    // Step 1: Run Backward
-    auto& bwd_ins = cur_op->GetInsMap();
-    auto& bwd_outs = cur_op->GetOutsMap();
-
-    NameVarBaseMap tmp_outs;
-    // A var may be coresponding to several grad var in one op
-    std::unordered_map<VarBase*, std::vector<std::shared_ptr<VarBase>>> var_map;
-    size_t counter = 0;
-    for (auto& bwd_out : bwd_outs) {
-      auto& tmp_var_list = tmp_outs[bwd_out.first];
-      tmp_var_list.reserve(bwd_out.second.size());
-      for (auto& var : bwd_out.second) {
-        auto tmp_var = std::make_shared<VarBase>(
-            false, "Gtmp@" + std::to_string(counter++));  // Do not need grad
-        tmp_var_list.emplace_back(tmp_var);
-        if (var) {
-          var_map[var.get()].emplace_back(std::move(tmp_var));
-          var->ClearGradOps();
-        }
-      }
-    }
-
-    VLOG(3) << "Start to execute grad op " << cur_op->Type();
-    RunOp(cur_op, bwd_ins, tmp_outs, cur_op->place());
-    // Step 2: Sum Gradient
-    {
-      platform::RecordEvent record_event("merge_grads");
-      for (auto& var_pair : var_map) {
-        auto* dst_var = var_pair.first;
-        if (dst_var == nullptr) continue;
-        for (auto& src_var : var_pair.second) {
-          VLOG(3) << "Sum gradient of variable " << dst_var->Name()
-                  << " after op " << cur_op->Type();
-          SumGradient(cur_op, std::move(src_var), dst_var);
-        }
-      }
-    }
-
-    // Step 3: Collect ready ops
-    for (auto* grad_pending_op : cur_op->GradPendingOps()) {
-      PADDLE_ENFORCE_NOT_NULL(grad_pending_op);
-      auto iter = op_deps_.find(grad_pending_op);
-      if (iter == op_deps_.end()) {
-        continue;
-      }
-
-      VLOG(3) << "Found grad_pending op of " << cur_op->Type();
-      // An Op is ready to go while its deps comes to zero
-
-      if (--(iter->second) == 0) {
-        q.push(grad_pending_op);
-        VLOG(3) << "Push grad_pending op " << grad_pending_op->Type()
-                << " into queue";
-      }
-    }
-
-    // Step 4: Delete op to collect unused variables
-    VLOG(3) << "Remove op after op " << cur_op->Type() << " runs";
-    RemoveOp(cur_op);
-  }
-  VLOG(3) << "Clean properties of BasicEngine";
-  CleanEngine();
-}
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h
deleted file mode 100644
index a26800455291a6912121b7a95da3550400297856..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/engine.h
+++ /dev/null
@@ -1,107 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
-#include "paddle/fluid/imperative/gradient_accumulator.h"
-#include "paddle/fluid/imperative/layer.h"
-
-namespace paddle {
-namespace imperative {
-
-// It seems there is no need for Engine to be an
-// singleton, we can have multi-engine to run
-// mutil-graoh. For future use we may expose a interface
-// to Python to support
-class Engine {
- public:
-  virtual ~Engine() = default;
-  virtual void Execute() = 0;
-  virtual void Init(VarBase* var, const detail::BackwardStrategy& strategy) = 0;
-  virtual void RunOp(imperative::OpBase* op, const NameVarBaseMap& ins,
-                     const NameVarBaseMap& outs, const platform::Place& place);
-
-  virtual void RemoveOp(OpBase* op) {
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot remove null op");
-    auto iter = grad_ops_.find(op);
-    PADDLE_ENFORCE_EQ(iter != grad_ops_.end(), true, "Op is not inside tracer");
-    grad_ops_.erase(iter);
-  }
-
-  void InsertOp(OpBase* op, std::shared_ptr<OpBase> op_shared) {
-    grad_ops_[op] = std::move(op_shared);
-  }
-
-  void InsertGradVar(VarBase* grad) { grad_vars_.emplace(grad); }
-
-  bool IsGrad(VarBase* var) { return grad_vars_.count(var) > 0; }
-
-  void Clear() {
-    grad_ops_.clear();
-    grad_vars_.clear();
-  }
-
- private:
-  std::unordered_map<OpBase*, std::shared_ptr<OpBase>>
-      grad_ops_;  // opBase for remove - grad_op
-  std::unordered_set<VarBase*> grad_vars_;
-};
-
-class BasicEngine : public Engine {
- public:
-  BasicEngine() = default;
-
-  void Init(VarBase* var, const detail::BackwardStrategy& strategy) override;
-
-  ~BasicEngine() override = default;
-
-  void Execute() override;
-
- private:
-  void PrepareDeps();
-
-  void CheckBackwardInputs(OpBase* op);
-
-  void SetBackwardOutputs(OpBase* op);
-
-  void PrepareGradAccumulators(OpBase* op);
-
-  void SumGradient(OpBase* op, std::shared_ptr<VarBase> src, VarBase* dst);
-
-  // TODO(jiabin): maybe we can optimize the performance of engine by cache the
-  // result
-  void CleanEngine() {
-    init_ops_.clear();
-    op_deps_.clear();
-    accumulators_.clear();
-    Clear();
-  }
-
-  std::vector<OpBase*> init_ops_;
-  detail::BackwardStrategy backward_strategy_;
-  std::unordered_map<OpBase*, size_t> op_deps_;
-  std::unordered_map<VarBase*, std::unique_ptr<GradientAccumulator>>
-      accumulators_;
-};
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc
deleted file mode 100644
index 57656d64ab78868a5c1c5eb73520523cd0f5d0b5..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/flags.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/flags.h"
-#include "gflags/gflags.h"
-
-DEFINE_uint64(dygraph_debug, 0,
-              "Debug level of dygraph. This flag is not "
-              "open to users");
-
-namespace paddle {
-namespace imperative {
-
-bool IsDebugEnabled() { return FLAGS_dygraph_debug != 0; }
-
-uint64_t GetDebugLevel() { return FLAGS_dygraph_debug; }
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/flags.h b/paddle/fluid/imperative/flags.h
deleted file mode 100644
index 094bce831c4d505ecf9ba72aa0cb2d65f486ba27..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/flags.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdint>
-
-namespace paddle {
-namespace imperative {
-
-extern bool IsDebugEnabled();
-extern uint64_t GetDebugLevel();
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
deleted file mode 100644
index 509415a367284d0e92f8d45c011695ad727bc8ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ /dev/null
@@ -1,175 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/gradient_accumulator.h"
-#include <algorithm>
-#include <memory>
-#include <utility>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace imperative {
-
-template <typename T>
-class TensorAddFunctor : public boost::static_visitor<> {
- public:
-  TensorAddFunctor(int64_t numel, const T* x, T* y)
-      : numel_(numel), x_(x), y_(y) {}
-
-  void operator()(const platform::CPUPlace& place) {
-    platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
-        platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
-    blas.AXPY(numel_, 1., x_, y_);
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  void operator()(const platform::CUDAPlace& place) {
-    platform::CUDADeviceContext* ctx =
-        dynamic_cast<platform::CUDADeviceContext*>(
-            platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
-    blas.AXPY(numel_, 1., x_, y_);
-  }
-#else
-  void operator()(const platform::CUDAPlace& place) {
-    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
-  }
-#endif
-
-  // there is NO blas in CUDAPinnedPlace
-  void operator()(const platform::CUDAPinnedPlace& place) {
-    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
-  }
-
- private:
-  int64_t numel_;
-  const T* x_;
-  T* y_;
-};
-
-void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
-  auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
-  auto& src_tensor = src.Get<framework::LoDTensor>();
-
-  auto numel = src_tensor.numel();
-
-  // FIXME(minqiyang): loss_grad op will pass a zero grad of label
-  // ugly fix for it
-  if (numel == 0) {
-    return;
-  }
-
-  PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true,
-                    "dst_numel %d vs. src_numel %d", dst_tensor->numel(),
-                    numel);
-
-  auto data_type = src_tensor.type();
-  auto place = src_tensor.place();
-
-#define PADDLE_TENSOR_ADD_MACRO(cpp_type)                            \
-  if (data_type == framework::DataTypeTrait<cpp_type>::DataType()) { \
-    TensorAddFunctor<cpp_type> func(                                 \
-        numel, src_tensor.data<cpp_type>(),                          \
-        dst_tensor->mutable_data<cpp_type>(place));                  \
-    boost::apply_visitor(func, place);                               \
-    return;                                                          \
-  }
-
-  PADDLE_TENSOR_ADD_MACRO(float);
-  PADDLE_TENSOR_ADD_MACRO(double);
-
-#undef PADDLE_TENSOR_ADD_MACRO
-
-  PADDLE_THROW("Not supported data type %s for AddTo",
-               framework::DataTypeToString(data_type));
-}
-
-void EagerGradientAccumulator::Add(std::shared_ptr<VarBase> var,
-                                   size_t trace_id) {
-  auto* dst_var = var_->MutableVar();
-  auto place = var->Var().Get<framework::LoDTensor>().place();
-  if (!var_->OverridedStopGradient()) {
-    VLOG(3) << "Sum Gradient for: " << var_->Name();
-    if (cur_cnt_ == 0) {
-      *dst_var = std::move(*(var->MutableVar()));
-    } else {
-      TensorAdd(var->Var(), dst_var);
-    }
-  } else {
-    if (!var_->Var().IsInitialized() ||
-        !var_->Var().Get<framework::LoDTensor>().IsInitialized()) {
-      VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero";
-      auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
-      tensor->mutable_data(place, var->DataType());
-      operators::math::set_constant(*dev_ctx, tensor, 0.0);
-    }
-  }
-  ++cur_cnt_;
-}
-
-void SortedGradientAccumulator::Add(std::shared_ptr<VarBase> var,
-                                    size_t trace_id) {
-  auto* dst_var = var_->MutableVar();
-  auto place = var->Var().Get<framework::LoDTensor>().place();
-  if (!var_->OverridedStopGradient()) {
-    if (ref_cnt_ == 1) {
-      *dst_var = std::move(*(var->MutableVar()));
-    } else {
-      if (tmp_grad_vars_.empty()) {
-        tmp_grad_vars_.reserve(ref_cnt_);
-      }
-
-      tmp_grad_vars_.emplace_back(std::move(var), trace_id);
-
-      if (tmp_grad_vars_.size() != ref_cnt_) {
-        return;
-      }
-
-      std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(),
-                [](const std::pair<std::shared_ptr<VarBase>, size_t>& p1,
-                   const std::pair<std::shared_ptr<VarBase>, size_t>& p2) {
-                  return p1.second > p2.second;
-                });
-
-      *dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar()));
-      for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) {
-        TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var);
-      }
-
-      tmp_grad_vars_.clear();
-    }
-  } else {
-    if (!var_->Var().IsInitialized() ||
-        !var_->Var().Get<framework::LoDTensor>().IsInitialized()) {
-      VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero";
-      auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
-      tensor->mutable_data(place, var->DataType());
-      operators::math::set_constant(*dev_ctx, tensor, 0.0);
-    }
-    // looks like tmp_grad_vars will not have any member but just in case
-    tmp_grad_vars_.clear();
-  }
-}
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h
deleted file mode 100644
index d4980496b266f08273108c5f98be7d4520678b29..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/gradient_accumulator.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/imperative/layer.h"
-
-namespace paddle {
-namespace imperative {
-
-class GradientAccumulator {
- public:
-  explicit GradientAccumulator(VarBase* var) : var_(var) {}
-
-  virtual void Add(std::shared_ptr<VarBase> var, size_t trace_id) = 0;
-
-  virtual ~GradientAccumulator() = default;
-
-  inline void IncreaseRefCnt() { ++ref_cnt_; }
-
-  inline size_t RefCnt() const { return ref_cnt_; }
-
- protected:
-  VarBase* var_;
-  size_t ref_cnt_{0};
-};
-
-class EagerGradientAccumulator : public GradientAccumulator {
- public:
-  using GradientAccumulator::GradientAccumulator;
-
-  void Add(std::shared_ptr<VarBase> var, size_t trace_id) override;
-
- private:
-  size_t cur_cnt_{0};
-};
-
-class SortedGradientAccumulator : public GradientAccumulator {
- public:
-  using GradientAccumulator::GradientAccumulator;
-
-  void Add(std::shared_ptr<VarBase> var, size_t trace_id) override;
-
- private:
-  std::vector<std::pair<std::shared_ptr<VarBase>, size_t>> tmp_grad_vars_;
-};
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
deleted file mode 100644
index 8a5db26d7d7f158c3f436e3ad339dd29b8132735..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/layer.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/layer.h"
-#include <algorithm>
-#include <queue>
-#include <utility>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/imperative/prepared_operator.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace imperative {
-
-using framework::Variable;
-void ThreadSafeNameSet::Insert(const std::string& name) {
-  std::lock_guard<std::mutex> guard(mtx_);
-  set_.insert(name);
-}
-
-void ThreadSafeNameSet::Remove(const std::string& name) {
-  std::lock_guard<std::mutex> guard(mtx_);
-  auto iter = set_.find(name);
-  PADDLE_ENFORCE_EQ(iter != set_.end(), true, "%s does not exist", name);
-  set_.erase(iter);
-}
-
-std::vector<std::string> ThreadSafeNameSet::Names() const {
-  std::lock_guard<std::mutex> guard(mtx_);
-  return std::vector<std::string>(set_.begin(), set_.end());
-}
-
-ThreadSafeNameSet VarBase::name_set_;
-
-std::vector<std::string> VarBase::AliveVarNames() { return name_set_.Names(); }
-
-static framework::VariableNameMap CreateVarNameMap(
-    const framework::OpInfo& op_info, const std::string& op_type,
-    const NameVarBaseMap& varbase_map, bool is_input) {
-  if (op_info.proto_ == nullptr) {
-    return {};
-  }
-
-  framework::VariableNameMap result;
-
-  for (auto& var :
-       is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) {
-    auto it = varbase_map.find(var.name());
-    if (it == varbase_map.end()) {
-      PADDLE_ENFORCE_EQ(
-          var.dispensable(), true,
-          "Var: %s not dispensable and there are no such var in inputs",
-          var.name());
-      result[var.name()] = {};
-    } else {
-      auto& var_vector = it->second;
-      std::vector<std::string> args;
-      args.reserve(var_vector.size());
-      for (auto& var_base : var_vector) {
-        args.emplace_back(var_base->Name());
-      }
-      result[var.name()] = std::move(args);
-    }
-  }
-  return result;
-}
-
-static framework::RuntimeContext PrepareRuntimeContext(
-    const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
-  framework::VariableValueMap inputs, outputs;
-  for (auto& in_pair : ins) {
-    auto& in_ctx = inputs[in_pair.first];
-    in_ctx.reserve(in_pair.second.size());
-    for (auto& in_var : in_pair.second) {
-      in_ctx.emplace_back(in_var->MutableVar());
-    }
-  }
-
-  for (auto& out_pair : outs) {
-    auto& out_ctx = outputs[out_pair.first];
-    out_ctx.reserve(out_pair.second.size());
-    for (auto& out_var : out_pair.second) {
-      out_ctx.emplace_back(out_var->MutableVar());
-    }
-  }
-  return framework::RuntimeContext(std::move(inputs), std::move(outputs));
-}
-
-static std::string DebugString(
-    const std::string& name,
-    const std::vector<std::shared_ptr<VarBase>>& vars) {
-  std::stringstream ss;
-  ss << name << "{";
-
-  for (size_t i = 0; i < vars.size(); ++i) {
-    if (i > 0) ss << ", ";
-
-    if (vars[i] == nullptr) {
-      ss << "NULL";
-      continue;
-    }
-    ss << vars[i]->Name() << "[";
-    auto& var = vars[i]->Var();
-    if (!var.IsInitialized()) {
-      ss << "NOT_INITED_VAR";
-    } else if (var.IsType<framework::LoDTensor>()) {
-      auto& tensor = var.Get<framework::LoDTensor>();
-      ss << "LoDTensor<";
-      if (tensor.IsInitialized()) {
-        ss << framework::DataTypeToString(tensor.type()) << ", ";
-        ss << tensor.place() << ", ";
-        ss << "(" << tensor.dims() << ")";
-      } else {
-        ss << "NOT_INITED";
-      }
-      ss << ">";
-    } else {
-      ss << "UNRESOLVED_TYPE";
-    }
-    ss << "]";
-  }
-
-  ss << "}";
-  return ss.str();
-}
-
-std::string LayerDebugString(const std::string& op_type,
-                             const NameVarBaseMap& ins,
-                             const NameVarBaseMap& outs) {
-  std::stringstream ss;
-  ss << "Op(" << op_type << "): ";
-
-  ss << "Inputs: ";
-
-  size_t i = 0;
-  for (auto& pair : ins) {
-    if (i > 0) ss << ", ";
-    ss << DebugString(pair.first, pair.second);
-    ++i;
-  }
-
-  ss << ",   Outputs: ";
-  i = 0;
-  for (auto& pair : outs) {
-    if (i > 0) ss << ", ";
-    ss << DebugString(pair.first, pair.second);
-    ++i;
-  }
-  return ss.str();
-}
-
-void VarBase::AddGradOps(const std::weak_ptr<OpBase>& op) {
-  if (op.lock() == nullptr) {
-    return;
-  }
-  for (const auto& cur_op : grad_ops_) {
-    if (cur_op.lock() == op.lock()) {
-      return;
-    }
-  }
-  grad_ops_.emplace_back(op);
-}
-
-void VarBase::ClearGradient() {
-  if (grad_var_) {
-    auto* grad_t = grad_var_->var_.GetMutable<framework::LoDTensor>();
-    if (grad_t->IsInitialized()) {
-      auto* dev_ctx =
-          platform::DeviceContextPool::Instance().Get(grad_t->place());
-      operators::math::set_constant(*dev_ctx, grad_t, 0.0);
-    }
-  }
-}
-
-std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
-                                             const bool blocking) const {
-  PADDLE_ENFORCE_EQ(var_.IsInitialized() && var_.IsType<framework::LoDTensor>(),
-                    true,
-                    "Variable must be initialized and type of LoDTensor when "
-                    "getting numpy tensor");
-
-  auto& src_tensor = var_.Get<framework::LoDTensor>();
-
-  // TODO(Jiabin): change this after move unique_name generator to CXX
-  auto new_var = std::make_shared<VarBase>(
-      false, "Itmp" + std::to_string(copied_counter_++));
-
-  auto* dst_tensor = new_var->var_.GetMutable<framework::LoDTensor>();
-  dst_tensor->set_lod(src_tensor.lod());
-
-  framework::TensorCopy(src_tensor, dst_place, dst_tensor);
-  if (blocking) {
-    platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
-    auto src_place = src_tensor.place();
-    if (!(src_place == dst_place)) {
-      platform::DeviceContextPool::Instance().Get(src_place)->Wait();
-    }
-  }
-
-  if (platform::is_gpu_place(dst_place)) {
-    VLOG(3) << "copy tensor " << Name() << " from gpu";
-  }
-
-  return new_var;
-}
-// create OpBase from optype
-OpBase::OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins,
-               const NameVarBaseMap& outs, framework::AttributeMap attrs,
-               const platform::Place& place)
-    : id_(id), place_(place) {
-  const auto& info = framework::OpInfoMap::Instance().Get(type);
-
-  // Step 1: Run forward
-  if (info.Checker() != nullptr) {
-    info.Checker()->Check(&attrs);
-  }
-
-  auto input_name_map = CreateVarNameMap(info, type, ins, true);
-  auto output_name_map = CreateVarNameMap(info, type, outs, false);
-  op_ = framework::OpRegistry::CreateOp(type, std::move(input_name_map),
-                                        std::move(output_name_map),
-                                        std::move(attrs));
-  VLOG(3) << "Construct Op: " << type << std::endl;
-}
-
-// create OpBase from opdesc
-OpBase::OpBase(size_t id, const framework::OpDesc& op_desc,
-               const platform::Place& place)
-    : id_(id), op_(framework::OpRegistry::CreateOp(op_desc)), place_(place) {
-  VLOG(3) << "Construct Op: " << op_desc.Type() << std::endl;
-}
-
-void OpBase::Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
-  auto* op_kernel = dynamic_cast<framework::OperatorWithKernel*>(op_.get());
-  PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
-  auto& info = op_->Info();
-  if (info.infer_var_type_) {
-    RuntimeInferVarTypeContext infer_var_type_ctx(ins, &outs, op_->Attrs());
-    info.infer_var_type_(&infer_var_type_ctx);
-  }
-
-  // Initialize output var type
-  for (auto& var_pair : outs) {
-    for (auto& var : var_pair.second) {
-      InitializeVariable(var->MutableVar(), var->Type());
-    }
-  }
-
-  VLOG(3) << "Running Op " << Type();
-  VLOG(5) << LayerDebugString(Type(), ins, outs);
-  auto runtime_ctx = PrepareRuntimeContext(ins, outs);
-
-  VLOG(6) << "start preparing op: " << Type();
-  auto prepared_op = PreparedOp::Prepare(runtime_ctx, *op_kernel, place(), ins);
-
-  VLOG(6) << "finish preparing op: " << Type();
-  prepared_op.Run();
-
-  VLOG(4) << LayerDebugString(Type(), ins, outs);
-}
-
-void OpBase::ClearBackwardTrace() {
-  grad_pending_ops_.clear();
-  ins_.clear();
-  outs_.clear();
-}
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
deleted file mode 100644
index 4ef22c97d0d4c940dac24215906eec14d398994b..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/layer.h
+++ /dev/null
@@ -1,432 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include <atomic>
-#include <cstdint>
-#include <list>
-#include <map>     // NOLINT
-#include <memory>  // NOLINT
-#include <mutex>   // NOLINT
-#include <set>
-#include <string>         // NOLINT
-#include <unordered_map>  // NOLINT
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/imperative/flags.h"
-#include "paddle/fluid/imperative/type_defs.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace imperative {
-
-class OpBase;
-
-class ThreadSafeNameSet {
- public:
-  void Insert(const std::string& name);
-
-  void Remove(const std::string& name);
-
-  std::vector<std::string> Names() const;
-
- private:
-  std::multiset<std::string> set_;
-  mutable std::mutex mtx_;
-};
-
-class VarBase {
-  DISABLE_COPY_AND_ASSIGN(VarBase);
-
- public:
-  static std::vector<std::string> AliveVarNames();
-  explicit VarBase(bool has_grad, const std::string& name)
-      : name_(name),
-        grad_var_(has_grad ? new VarBase(false, GradVarName()) : nullptr) {
-    if (IsDebugEnabled()) {
-      VLOG(10) << "Construct VarBase: " << name;
-      name_set_.Insert(name_);
-    }
-  }
-
-  explicit VarBase(const std::string& name) : VarBase(true, name) {}
-
-  ~VarBase() {
-    VLOG(10) << "Destruct VarBase: " << name_;
-    if (IsDebugEnabled()) {
-      name_set_.Remove(name_);
-    }
-  }
-
-  const framework::Variable& Var() const { return var_; }
-
-  framework::Variable* MutableVar() { return &var_; }
-
-  bool HasGradVar() const { return grad_var_ != nullptr; }
-
-  const std::shared_ptr<VarBase>& GradVarBase() const { return grad_var_; }
-
-  const framework::Variable& GradVar() const {
-    PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_);
-    return grad_var_->var_;
-  }
-
-  framework::Variable* MutableGradVar() {
-    PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_);
-    return &(grad_var_->var_);
-  }
-
-  // This is used for python api
-  void SetOverridedStopGradient(bool stop_gradient) {
-    if (stop_gradient) {
-      overrided_stop_gradient_ = 1;
-    } else {
-      overrided_stop_gradient_ = 0;
-    }
-    if (grad_var_) {
-      grad_var_->SetOverridedStopGradient(stop_gradient);
-    }
-  }
-  // This is used for python api
-  bool OverridedStopGradient() const {
-    if (overrided_stop_gradient_ == 0) {
-      return false;
-    } else {
-      return true;
-    }
-  }
-
-  // This is used inside C++
-  int InnerOverridedStopGradient() const { return overrided_stop_gradient_; }
-
-  bool GradGenerated() const { return grad_generated_; }
-
-  void SetGradGenerated(bool generated) { grad_generated_ = generated; }
-  // This is used inside C++
-  void InnerSetOverridedStopGradient(bool stop_gradient) {
-    if (overrided_stop_gradient_ == -1) {
-      overrided_stop_gradient_ = static_cast<int>(stop_gradient);
-      if (grad_var_) {
-        grad_var_->InnerSetOverridedStopGradient(stop_gradient);
-      }
-    } else {
-      VLOG(6) << "Ignore Stop gradient conversion for Var: " << Name()
-              << "Set value is: " << overrided_stop_gradient_;
-    }
-  }
-
-  void SetPersistable(bool persistable) { persistable_ = persistable; }
-
-  bool Persistable() const { return persistable_; }
-
-  void AddGradOps(const std::weak_ptr<OpBase>& op);
-
-  std::vector<OpBase*> GradOps() {
-    std::vector<OpBase*> rlt;
-    // TODO(jiabin): use better data structure to remove nullptr when we find it
-    for (const auto& wk_ptr : grad_ops_) {
-      OpBase* tmp_op = wk_ptr.lock().get();
-      if (tmp_op) rlt.emplace_back(tmp_op);
-    }
-    return rlt;
-  }
-  void ClearGradOps() { grad_ops_.clear(); }
-
-  const std::string& Name() const { return name_; }
-
-  void SetName(const std::string& name) {
-    name_ = name;
-    if (grad_var_) {
-      grad_var_->SetName(GradVarName());
-    }
-  }
-
-  std::string GradVarName() { return framework::GradVarName(name_); }
-
-  void SetType(framework::proto::VarType::Type type) { type_ = type; }
-
-  framework::proto::VarType::Type Type() const { return type_; }
-
-  void SetDataType(framework::proto::VarType::Type data_type) {
-    data_type_ = data_type;
-    if (grad_var_) {
-      grad_var_->SetDataType(data_type_);
-    }
-  }
-
-  framework::proto::VarType::Type DataType() const { return data_type_; }
-
-  void ClearGradient();
-
-  std::shared_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
-                                      const bool blocking) const;
-
- private:
-  framework::Variable var_;
-  std::string name_;
-  std::shared_ptr<VarBase> grad_var_;
-  mutable size_t copied_counter_ = 0;
-
-  // grad_op indicates which grad_op will this var be used as input
-  std::vector<std::weak_ptr<OpBase>> grad_ops_;
-  // add this property for users may set stop_gradient themselves and this
-  // should override the
-  // frameworks setting (-1) unset, (1) true, (0) false
-  int overrided_stop_gradient_{-1};
-  bool grad_generated_{false};
-  bool persistable_{false};
-
-  framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR};
-  framework::proto::VarType::Type data_type_{framework::proto::VarType::FP32};
-  static ThreadSafeNameSet name_set_;
-};
-
-class Layer {
- public:
-  virtual ~Layer() {}
-
-  virtual std::vector<std::shared_ptr<VarBase>> Forward(
-      const std::vector<std::shared_ptr<VarBase>>& inputs) {
-    return {};
-  }
-};
-
-// infer var type context for imperative mode
-class RuntimeInferVarTypeContext : public framework::InferVarTypeContext {
- public:
-  RuntimeInferVarTypeContext(const NameVarBaseMap& inputs,
-                             const NameVarBaseMap* outputs,
-                             const framework::AttributeMap& attrs_map)
-      : InferVarTypeContext(nullptr, nullptr),
-        inputs_(inputs),
-        outputs_(outputs),
-        attrs_(attrs_map),
-        input_names_(),
-        output_names_(),
-        var_set_() {
-    input_names_.reserve(inputs_.size());
-    for (auto& it : inputs_) {
-      for (auto& var : it.second) {
-        input_names_[it.first].emplace_back(var->Name());
-        var_set_[var->Name()] = var.get();
-      }
-    }
-
-    output_names_.reserve(outputs_->size());
-    for (auto& it : *outputs_) {
-      for (auto& var : it.second) {
-        output_names_[it.first].emplace_back(var->Name());
-        var_set_[var->Name()] = var.get();
-      }
-    }
-  }
-
-  virtual ~RuntimeInferVarTypeContext() {}
-
-  framework::Attribute GetAttr(const std::string& name) const override {
-    auto iter = attrs_.find(name);
-    PADDLE_ENFORCE_EQ(iter != attrs_.end(), true, "Cannot find attribute %s",
-                      name);
-    return iter->second;
-  }
-
-  bool HasVar(const std::string& name) const override {
-    return var_set_.count(name) > 0;
-  }
-
-  bool HasInput(const std::string& name) const override {
-    return inputs_.count(name) > 0;
-  }
-
-  bool HasOutput(const std::string& name) const override {
-    PADDLE_ENFORCE_NOT_NULL(outputs_);
-    return outputs_->count(name) > 0;
-  }
-
-  const std::vector<std::string>& Input(
-      const std::string& name) const override {
-    auto iter = input_names_.find(name);
-    PADDLE_ENFORCE_EQ(iter != input_names_.end(), true, "Cannot find input %s",
-                      name);
-    return iter->second;
-  }
-
-  const std::vector<std::string>& Output(
-      const std::string& name) const override {
-    auto iter = output_names_.find(name);
-    PADDLE_ENFORCE_EQ(iter != output_names_.end(), true,
-                      "Cannot find output %s", name);
-    return iter->second;
-  }
-
-  framework::proto::VarType::Type GetType(
-      const std::string& name) const override {
-    auto iter = var_set_.find(name);
-    PADDLE_ENFORCE_EQ(iter != var_set_.end(), true,
-                      "Cannot find var %s in GetType", name);
-    return iter->second->Type();
-  }
-
-  void SetType(const std::string& name,
-               framework::proto::VarType::Type type) override {
-    if (name == "kLookupTablePath") {
-      VLOG(2) << "SUPER UGLY FIX, remove this when move imperative mode in C++";
-    } else {
-      var_set_[name]->SetType(type);
-    }
-  }
-
-  framework::proto::VarType::Type GetDataType(
-      const std::string& name) const override {
-    auto iter = var_set_.find(name);
-    PADDLE_ENFORCE_EQ(iter != var_set_.end(), true,
-                      "Cannot find var %s in GetDataType", name);
-    return iter->second->DataType();
-  }
-
-  void SetDataType(const std::string& name,
-                   framework::proto::VarType::Type type) override {
-    var_set_[name]->SetDataType(type);
-  }
-
-  std::vector<framework::proto::VarType::Type> GetDataTypes(
-      const std::string& name) const override {
-    PADDLE_THROW("GetDataTypes is not supported in runtime InferVarType");
-  }
-
-  void SetDataTypes(const std::string& name,
-                    const std::vector<framework::proto::VarType::Type>&
-                        multiple_data_type) override {
-    PADDLE_THROW("SetDataTypes is not supported in runtime InferVarType");
-  }
-
-  std::vector<int64_t> GetShape(const std::string& name) const override {
-    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
-  }
-
-  void SetShape(const std::string& name,
-                const std::vector<int64_t>& dims) override {
-    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
-  }
-
-  int32_t GetLoDLevel(const std::string& name) const override {
-    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
-  }
-
-  void SetLoDLevel(const std::string& name, int32_t lod_level) override {
-    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
-  }
-
- private:
-  const NameVarBaseMap& inputs_;
-  const NameVarBaseMap* outputs_;
-  const framework::AttributeMap& attrs_;
-  std::unordered_map<std::string, std::vector<std::string>> input_names_;
-  std::unordered_map<std::string, std::vector<std::string>> output_names_;
-  std::unordered_map<std::string, VarBase*> var_set_;
-};
-
-// TODO(zjl): to support py_func layer
-class OpBase : public std::enable_shared_from_this<OpBase> {
-  DISABLE_COPY_AND_ASSIGN(OpBase);
-
- public:
-  ~OpBase() { VLOG(3) << "Destruct Op: " << Type() << std::endl; }
-
-  // Developer should not rely on this method to create OpBase.
-  // OpBase should be created in Tracer and managed by Tracer totally.
-  template <typename... Args>
-  static std::shared_ptr<OpBase> Create(Args&&... args) {
-    return std::shared_ptr<OpBase>(new OpBase(std::forward<Args>(args)...));
-  }
-
-  size_t id() const { return id_; }
-
-  const std::string& Type() const { return op_->Type(); }
-
-  void Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs);
-
-  const framework::VariableNameMap& InputNameMap() const {
-    return op_->Inputs();
-  }
-
-  const framework::VariableNameMap& OutputNameMap() const {
-    return op_->Outputs();
-  }
-
-  const framework::AttributeMap& Attrs() const { return op_->Attrs(); }
-  const framework::OpInfo& Info() const { return op_->Info(); }
-
-  void ClearBackwardTrace();
-
-  const std::vector<OpBase*>& GradPendingOps() const {
-    return grad_pending_ops_;
-  }
-
-  void InsertGradPendingOps(OpBase* op) { grad_pending_ops_.emplace_back(op); }
-
-  void SortGradPendingOps() {
-    std::sort(grad_pending_ops_.begin(), grad_pending_ops_.end(),
-              [](OpBase* op1, OpBase* op2) { return op1->id() > op2->id(); });
-  }
-  NameVarBaseMap* GetMutableOutsMap() { return &outs_; }
-  NameVarBaseMap* GetMutableInsMap() { return &ins_; }
-  const NameVarBaseMap& GetInsMap() { return ins_; }
-  const NameVarBaseMap& GetOutsMap() { return outs_; }
-  const platform::Place& place() const { return place_; }
-
-  // TODO(jiabin) prepare for backward hook
-  void RegisterBackwardHooks(const std::function<void()>& func) {
-    backward_hooks_.emplace_back(func);
-  }
-
-  void InvokeBackwardHooks() {
-    for (const auto& func : backward_hooks_) {
-      func();
-      VLOG(5) << "Invoke Backward Hook for: " << Type() << std::endl;
-    }
-  }
-
- private:
-  OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins,
-         const NameVarBaseMap& outs, framework::AttributeMap attrs,
-         const platform::Place& place);
-
-  OpBase(size_t id, const framework::OpDesc& op_desc,
-         const platform::Place& place);
-
-  size_t id_;
-
-  std::unique_ptr<framework::OperatorBase> op_;
-
-  std::vector<std::function<void()>> backward_hooks_;
-  platform::Place place_;
-
-  // Not need to be std::weak_ptr, because op is binded to a certain Tracer,
-  // and would not be used by a Tracer that does not create itself.
-  std::vector<OpBase*> grad_pending_ops_;
-
-  // This part is only used for backward
-  NameVarBaseMap ins_;
-  NameVarBaseMap outs_;
-};
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc
deleted file mode 100644
index ab612b2f152d0a0c1ec59f133dc0f33d27bd525e..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/nccl_context.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/nccl_context.h"
-
-namespace paddle {
-namespace imperative {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-void NCCLParallelContext::RecvNCCLID(const std::string &ep,
-                                     ncclUniqueId *nccl_id) {
-  auto addr = paddle::string::Split(ep, ':');
-  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
-                    "The endpoint should contain host and port: %s", ep);
-  std::string host = addr[0];
-  int port = std::stoi(addr[1]);
-
-  int server_fd, new_socket;
-  struct sockaddr_in address;
-  int addrlen = sizeof(address);
-  char buffer[1024] = {0};
-  int opt = 0;
-  // creating socket fd
-  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0)
-    PADDLE_THROW("create server fd failed");
-  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)))
-    PADDLE_THROW("set socket opt failed");
-
-  address.sin_family = AF_INET;
-  address.sin_addr.s_addr = INADDR_ANY;
-  address.sin_port = htons(port);
-
-  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0)
-    PADDLE_THROW("binding failed on ep: %s", ep);
-  VLOG(3) << "listening on: " << ep;
-  if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed");
-
-  if ((new_socket =
-           accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
-                  reinterpret_cast<socklen_t *>(&addrlen))) < 0)
-    PADDLE_THROW("accept the new socket fd failed");
-
-  if (read(new_socket, buffer, 1024) < 0)
-    PADDLE_THROW("reading the ncclUniqueId from socket failed");
-  VLOG(3) << "recevived the ncclUniqueId";
-  memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
-
-  VLOG(3) << "closing the socket server: " << ep;
-  close(server_fd);
-}
-
-void NCCLParallelContext::SendNCCLID(const std::string &ep,
-                                     ncclUniqueId *nccl_id) {
-  auto addr = paddle::string::Split(ep, ':');
-  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
-                    "The endpoint should contain host and port: %s", ep);
-  std::string host = addr[0];
-  int port = std::stoi(addr[1]);
-  // struct sockaddr_in address;
-  int sock = 0;
-  struct sockaddr_in serv_addr;
-  char buffer[1024] = {0};
-
-  memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
-  if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
-    PADDLE_THROW("create socket failed");
-
-  memset(&serv_addr, '0', sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_port = htons(port);
-
-  if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
-    PADDLE_THROW("invalied address: %s", ep);
-
-  int try_times = 0;
-  while (true) {
-    if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
-      VLOG(0) << "worker: " << ep
-              << (try_times < 5 ? " is not ready, will retry after 3 seconds..."
-                                : " is not ready. Maybe that some process "
-                                  "is occupied the GPUs of this node now, "
-                                  "and you should kill those process manually. "
-                                  "Will retry after 3 seconds...");
-
-      std::this_thread::sleep_for(std::chrono::seconds(3));
-      ++try_times;
-      continue;
-    }
-    VLOG(3) << "sending the ncclUniqueId to " << ep;
-    send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0);
-    break;
-  }
-  close(sock);
-}
-
-void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) {
-  if (strategy_.local_rank_ == root) {
-    for (auto ep : strategy_.trainer_endpoints_) {
-      if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id);
-    }
-  } else {
-    RecvNCCLID(strategy_.current_endpoint_, nccl_id);
-  }
-}
-
-void NCCLParallelContext::Init() {
-  ncclUniqueId nccl_id;
-  ncclComm_t comm;
-  if (strategy_.local_rank_ == 0) {
-    // generate the unique ncclid on the root worker
-    platform::dynload::ncclGetUniqueId(&nccl_id);
-    BcastNCCLId(&nccl_id, 0);
-  } else {
-    BcastNCCLId(&nccl_id, 0);
-  }
-  int gpu_id = boost::get<platform::CUDAPlace>(place_).device;
-  VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
-          << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id;
-
-  PADDLE_ENFORCE(cudaSetDevice(gpu_id));
-  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
-      &comm, strategy_.nranks_, nccl_id, strategy_.local_rank_));
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(pool.Get(place_));
-  dev_ctx->set_nccl_comm(comm);
-}
-#endif
-
-}  //  namespace imperative
-}  //  namespace paddle
diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h
deleted file mode 100644
index b4f44e56405a51082e60afd69fb6f011dab44b86..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/nccl_context.h
+++ /dev/null
@@ -1,81 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-// network header files
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include <arpa/inet.h>
-#include <netinet/in.h>
-#include <stdlib.h>
-#include <sys/socket.h>
-#endif
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/dynload/nccl.h"
-#endif
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/string/split.h"
-
-namespace paddle {
-namespace imperative {
-
-struct ParallelStrategy {
-  int nranks_{1};
-  int local_rank_{0};
-  std::vector<std::string> trainer_endpoints_{};
-  std::string current_endpoint_{""};
-};
-
-class ParallelContext {
- public:
-  explicit ParallelContext(const ParallelStrategy& strategy,
-                           const platform::Place& place)
-      : strategy_(strategy), place_(place) {}
-
-  virtual ~ParallelContext() {}
-
-  virtual void Init() = 0;
-
- protected:
-  ParallelStrategy strategy_;
-  platform::Place place_;
-};
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-class NCCLParallelContext : ParallelContext {
- public:
-  explicit NCCLParallelContext(const ParallelStrategy& strategy,
-                               const platform::Place& place)
-      : ParallelContext(strategy, place) {}
-
-  ~NCCLParallelContext() {}
-
-  void BcastNCCLId(ncclUniqueId* nccl_id, int root);
-
-  void Init() override;
-
- protected:
-  void RecvNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
-
-  void SendNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
-};
-#endif
-
-}  //  namespace imperative
-}  //  namespace paddle
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
deleted file mode 100644
index 6f8ee92bdfc7ba9c68f8e567f3f1bad0a2cbabeb..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/prepared_operator.h"
-#include <sstream>
-
-namespace paddle {
-namespace imperative {
-
-const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
-  if (var.IsType<framework::LoDTensor>()) {
-    return &(var.Get<framework::LoDTensor>());
-  } else if (var.IsType<framework::SelectedRows>()) {
-    return &(var.Get<framework::SelectedRows>().value());
-  } else {
-    return nullptr;
-  }
-}
-
-void PreparedOp::PrepareData(
-    const platform::Place& place, const NameVarBaseMap& ins,
-    const framework::OperatorWithKernel& op,
-    const framework::OpKernelType& expected_kernel_key) {
-  for (const auto& name_pair : ins) {
-    for (const auto& var_base : name_pair.second) {
-      const auto* tensor = GetTensorFromVar(var_base->Var());
-      if (tensor && tensor->IsInitialized()) {
-        auto tmp_place = tensor->place();
-        // TODO(jiabin): Support transform data layout when we Verify it on more
-        // tests
-        if (!(tmp_place == place)) {
-          auto kernel_type_for_var = op.GetKernelTypeForVar(
-              name_pair.first, *tensor, expected_kernel_key);
-          if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
-            continue;
-          } else {
-            VLOG(3) << "Transform Variable " << var_base->Name() << " from "
-                    << kernel_type_for_var << " to " << expected_kernel_key;
-            framework::Tensor out;
-            TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
-                          &out);
-            SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
-          }
-        }
-      }
-    }
-  }
-}
-
-PreparedOp::PreparedOp(const framework::OperatorBase& op,
-                       const framework::RuntimeContext& ctx,
-                       framework::OperatorWithKernel::OpKernelFunc func,
-                       platform::DeviceContext* dev_ctx,
-                       std::vector<framework::KernelConfig>* kernel_configs)
-    : op_(op),
-      ctx_(ctx),
-      func_(std::move(func)),
-      dev_ctx_(dev_ctx),
-      kernel_configs_(kernel_configs) {}
-
-PreparedOp PreparedOp::Prepare(const framework::RuntimeContext& ctx,
-                               const framework::OperatorWithKernel& op,
-                               platform::Place place,
-                               const NameVarBaseMap& ins) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(place);
-
-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = op.AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(op.Type());
-  if (kernels_iter == all_op_kernels.end()) {
-    PADDLE_THROW(
-        "There are no kernels which are registered in the %s operator.",
-        op.Type());
-  }
-
-  auto& kernels = kernels_iter->second;
-
-  auto expected_kernel_key =
-      op.GetExpectedKernelType(framework::ExecutionContext(
-          op, framework::Scope(), *dev_ctx, ctx, nullptr));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-
-  auto kernel_iter = kernels.find(expected_kernel_key);
-  // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
-                 KernelTypeToString(expected_kernel_key));
-  }
-  std::vector<framework::KernelConfig>* kernel_configs =
-      op.GetKernelConfig(expected_kernel_key);
-
-  if (!(expected_kernel_key.place_ == place)) {
-    dev_ctx = pool.Get(expected_kernel_key.place_);
-    place = dev_ctx->GetPlace();
-  }
-
-  PrepareData(place, ins, op, expected_kernel_key);
-  return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
-}
-
-void PreparedOp::Run() {
-  // TODO(zjl): remove scope in dygraph
-  framework::Scope scope;
-  op_.RuntimeInferShape(scope, dev_ctx_->GetPlace(), ctx_);
-  VLOG(6) << "Finish Runtime infer shape";
-  func_(framework::ExecutionContext(op_, scope, *dev_ctx_, ctx_,
-                                    kernel_configs_));
-}
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
deleted file mode 100644
index 886311f8c82fce4b3b1cd46bbe2ac6e5f22c50e5..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/prepared_operator.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/data_transform.h"
-#include "paddle/fluid/framework/op_kernel_type.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/imperative/type_defs.h"
-
-namespace paddle {
-namespace imperative {
-
-const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
-
-class PreparedOp {
- public:
-  static PreparedOp Prepare(const framework::RuntimeContext& ctx,
-                            const framework::OperatorWithKernel& op,
-                            platform::Place place, const NameVarBaseMap& ins);
-
-  inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx_; }
-
-  void Run();
-
-  static void PrepareData(const platform::Place& place,
-                          const NameVarBaseMap& ins,
-                          const framework::OperatorWithKernel& op,
-                          const framework::OpKernelType& expected_kernel_key);
-
- private:
-  PreparedOp(const framework::OperatorBase& op,
-             const framework::RuntimeContext& ctx,
-             framework::OperatorWithKernel::OpKernelFunc func,
-             platform::DeviceContext* dev_ctx,
-             std::vector<framework::KernelConfig>* kernel_configs);
-
- private:
-  const framework::OperatorBase& op_;
-  const framework::RuntimeContext& ctx_;
-  framework::OperatorWithKernel::OpKernelFunc func_;
-  platform::DeviceContext* dev_ctx_;
-  std::vector<framework::KernelConfig>* kernel_configs_;
-};
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc
deleted file mode 100644
index 34570b3a60ec83fdeb1577789271942125b16eb1..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/profiler.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/profiler.h"
-
-#ifdef WITH_GPERFTOOLS
-#include "gperftools/profiler.h"
-#endif
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <mutex>   // NOLINT
-#include <thread>  // NOLINT
-
-DEFINE_string(
-    tracer_profile_fname, "xxgperf",
-    "Profiler filename for imperative tracer, which generated by gperftools."
-    "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
-
-namespace paddle {
-namespace imperative {
-
-static std::once_flag gTracerProfileOnce;
-#ifdef WITH_GPERFTOOLS
-static bool gTracerProfilerStarted = false;
-#endif
-
-void StartProfile() {
-  if (!FLAGS_tracer_profile_fname.empty()) {
-    std::call_once(gTracerProfileOnce, [] {
-#ifdef WITH_GPERFTOOLS
-      ProfilerStart(FLAGS_tracer_profile_fname.c_str());
-      gTracerProfilerStarted = true;
-#else
-      LOG(WARNING) << "Paddle is not compiled with gperftools. "
-                      "FLAGS_tracer_profile_fname will be ignored";
-#endif
-    });
-  }
-}
-
-void StopProfile() {
-#ifdef WITH_GPERFTOOLS
-  ProfilerFlush();
-#else
-  LOG(WARNING) << "Paddle is not compiled with gperftools. "
-                  "FLAGS_tracer_profile_fname will be ignored";
-#endif
-}
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/profiler.h b/paddle/fluid/imperative/profiler.h
deleted file mode 100644
index d52aeed4e81755cfa285616d7b0a7e79061c6af8..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/profiler.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace imperative {
-
-extern void StartProfile();
-
-extern void StopProfile();
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt
deleted file mode 100644
index f32f0a1726fc07bab5fdbb971fa258a97e3c8f7f..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/tests/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
-cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS gradient_accumulator memcpy)
-cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy)
-cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split assign_op place)
-cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op memcpy)
diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc
deleted file mode 100644
index 74a74ebe921378e2994a6a4cb2087d0acde950b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/tests/nccl_context_test.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/nccl_context.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace imperative = paddle::imperative;
-namespace platform = paddle::platform;
-
-imperative::ParallelStrategy GetStrategy(int local_rank) {
-  std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
-  imperative::ParallelStrategy strategy;
-  strategy.trainer_endpoints_ = eps;
-  strategy.current_endpoint_ = eps[local_rank];
-  strategy.nranks_ = 2;
-  strategy.local_rank_ = local_rank;
-  return strategy;
-}
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
-  auto strategy = GetStrategy(local_rank);
-  platform::CUDAPlace gpu(local_rank);
-  imperative::NCCLParallelContext ctx(strategy, gpu);
-  ctx.BcastNCCLId(nccl_id, 0);
-}
-
-TEST(BcastNCCLId, Run) {
-  ncclUniqueId nccl_id;
-  platform::dynload::ncclGetUniqueId(&nccl_id);
-  std::thread t(BcastNCCLId, 0, &nccl_id);
-
-  ncclUniqueId recv_nccl_id;
-  BcastNCCLId(1, &recv_nccl_id);
-
-  t.join();
-  EXPECT_EQ(0, std::memcmp(nccl_id.internal, recv_nccl_id.internal,
-                           NCCL_UNIQUE_ID_BYTES));
-}
-#endif
diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
deleted file mode 100644
index 29a51733c9316efed585741e0440c07886491ab5..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/imperative/gradient_accumulator.h"
-#include "paddle/fluid/memory/memcpy.h"
-
-namespace imperative = paddle::imperative;
-namespace platform = paddle::platform;
-namespace framework = paddle::framework;
-namespace paddle {
-namespace imperative {
-
-void TensorAdd(const framework::Variable& src, framework::Variable* dst);
-
-#if defined(PADDLE_WITH_CUDA)
-template <typename T>
-int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
-  framework::Variable var1;
-  framework::Variable var2;
-  std::vector<T> src_data(10, t1);
-  std::vector<T> dst_data(10, t2);
-  std::vector<T> result;
-  platform::CPUPlace src_place;
-  for (unsigned int i = 0; i < 10; i++) {
-    result.emplace_back(src_data[i] + dst_data[i]);
-  }
-  std::vector<int64_t> dims = {2, 5};
-  auto* src = var1.GetMutable<framework::LoDTensor>();
-  auto* dst = var2.GetMutable<framework::LoDTensor>();
-  src->Resize(framework::make_ddim(dims));
-  dst->Resize(framework::make_ddim(dims));
-  auto* src_mutable = src->mutable_data<T>(place);
-  auto* dst_mutable = dst->mutable_data<T>(place);
-  paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
-                       sizeof(T) * src_data.size(), 0);
-  paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
-                       sizeof(T) * dst_data.size(), 0);
-  imperative::TensorAdd(var1, &var2);
-  framework::LoDTensor rlt;
-  platform::CPUPlace rlt_place;
-  framework::TensorCopySync(*dst, rlt_place, &rlt);
-
-  for (unsigned int i = 0; i < rlt.numel(); i++) {
-    if (rlt.data<T>()[i] != result[i]) return 1;
-  }
-  return 0;
-}
-#endif
-
-template <typename T>
-int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
-  framework::Variable var1;
-  framework::Variable var2;
-  std::vector<T> src_data(10, t1);
-  std::vector<T> dst_data(10, t2);
-  std::vector<T> result;
-  platform::CPUPlace src_place;
-  for (unsigned int i = 0; i < 10; i++) {
-    result.emplace_back(src_data[i] + dst_data[i]);
-  }
-  std::vector<int64_t> dims = {2, 5};
-  auto* src = var1.GetMutable<framework::LoDTensor>();
-  auto* dst = var2.GetMutable<framework::LoDTensor>();
-  src->Resize(framework::make_ddim(dims));
-  dst->Resize(framework::make_ddim(dims));
-  auto* src_mutable = src->mutable_data<T>(place);
-  auto* dst_mutable = dst->mutable_data<T>(place);
-  paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
-                       sizeof(T) * src_data.size());
-  paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
-                       sizeof(T) * dst_data.size());
-  imperative::TensorAdd(var1, &var2);
-  framework::LoDTensor rlt;
-  platform::CPUPlace rlt_place;
-  framework::TensorCopySync(*dst, rlt_place, &rlt);
-
-  for (unsigned int i = 0; i < rlt.numel(); i++) {
-    if (rlt.data<T>()[i] != result[i]) return 1;
-  }
-  return 0;
-}
-
-TEST(test_add_functor, add_functor) {
-#if defined(PADDLE_WITH_CUDA)
-  platform::CUDAPlace gpu_place(0);
-#endif
-  platform::CPUPlace cpu_place;
-
-  int cpu_res = 1;
-  cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0);
-  EXPECT_EQ(cpu_res, 0);
-  cpu_res = TensorCPUAddTest(cpu_place, static_cast<double>(1.0),
-                             static_cast<double>(2.0));
-  EXPECT_EQ(cpu_res, 0);
-#if defined(PADDLE_WITH_CUDA)
-  int gpu_res = 1;
-  gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0);
-  EXPECT_EQ(gpu_res, 0);
-  gpu_res = TensorGPUAddTest(gpu_place, static_cast<double>(1.0),
-                             static_cast<double>(2.0));
-  EXPECT_EQ(gpu_res, 0);
-#endif
-}
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc
deleted file mode 100644
index c92d0fd67c9c055d5b37a959ce0997c734b29a82..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/tests/test_layer.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Created by Jiabin on 2019-08-16.
-//
-
-#include <paddle/fluid/framework/op_registry.h>
-#include <memory>
-#include <string>
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/imperative/layer.h"
-
-namespace imperative = paddle::imperative;
-namespace platform = paddle::platform;
-namespace framework = paddle::framework;
-
-namespace paddle {
-namespace imperative {
-
-using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
-
-using var_pair = std::pair<std::string, vb_vector>;
-
-TEST(test_layer, test_runtime_context) {
-  std::shared_ptr<imperative::VarBase> vin(
-      new imperative::VarBase(false, "vin"));
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(false, "vout"));
-  var_pair in_pair = var_pair("X", vb_vector(1, vin));
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  imperative::NameVarBaseMap ins = {in_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap attrs;
-  auto* ctx = new imperative::RuntimeInferVarTypeContext(ins, &outs, attrs);
-  ASSERT_TRUE(ctx->HasVar("vin"));
-  ASSERT_TRUE(ctx->HasInput("X"));
-  ASSERT_TRUE(ctx->HasOutput("Out"));
-
-  ASSERT_ANY_THROW(ctx->GetDataTypes("vin"));
-  std::vector<framework::proto::VarType::Type> NullType;
-  ASSERT_ANY_THROW(ctx->SetDataTypes("vin", NullType));
-  ASSERT_ANY_THROW(ctx->GetShape("vin"));
-  ASSERT_ANY_THROW(ctx->GetLoDLevel("vin"));
-  ASSERT_ANY_THROW(ctx->SetLoDLevel("vin", 2));
-}
-
-std::string LayerDebugString(const std::string& op_type,
-                             const NameVarBaseMap& ins,
-                             const NameVarBaseMap& outs);
-
-TEST(test_layer, test_debug_string_test_debug_Test) {
-  std::shared_ptr<imperative::VarBase> vin(
-      new imperative::VarBase(false, "vin"));
-  std::shared_ptr<imperative::VarBase> vin_error(
-      new imperative::VarBase(false, "vin_error"));
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(false, "vout"));
-  std::shared_ptr<imperative::VarBase> vout_error(
-      new imperative::VarBase(false, "vout_error"));
-  vin_error->MutableVar()->GetMutable<framework::LoDTensor>();
-  vout->MutableVar()->GetMutable<framework::LoDTensor>();
-  vout_error->MutableVar()->GetMutable<framework::SelectedRows>();
-  var_pair in_pair = var_pair("X", vb_vector(1, vin));
-  vb_vector vb_in_error = {vin_error, nullptr};
-  var_pair vin_error_pair = var_pair("X", vb_in_error);
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  var_pair vout_error_pair = var_pair("Out2", vb_vector(1, vout_error));
-  imperative::NameVarBaseMap ins = {in_pair};
-  imperative::NameVarBaseMap ins_error = {vin_error_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  imperative::NameVarBaseMap outs_error = {vout_error_pair};
-  ASSERT_NO_FATAL_FAILURE(LayerDebugString("test_op", ins, outs));
-  std::string res = LayerDebugString("test_op", ins, outs_error);
-  ASSERT_TRUE(res.find("UNRESOLVED_TYPE") != std::string::npos);
-  std::string res2 = LayerDebugString("test_op", ins_error, outs_error);
-  VLOG(3) << res2;
-  ASSERT_TRUE(res2.find("NOT_INITED") != std::string::npos);
-  ASSERT_TRUE(res2.find("NULL") != std::string::npos);
-}
-
-TEST(test_layer, test_clear_backward_info) {
-  std::shared_ptr<imperative::VarBase> vin(
-      new imperative::VarBase(false, "vin"));
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(false, "vout"));
-  framework::OpDesc desc;
-  platform::CPUPlace place;
-  var_pair x_pair = var_pair("X", vb_vector(1, vin));
-  var_pair y_pair = var_pair("Y", vb_vector(1, vin));
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  imperative::NameVarBaseMap ins = {x_pair, y_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap concat_att_map;
-  concat_att_map["axis"] = 1;
-  std::shared_ptr<imperative::OpBase> op(
-      OpBase::Create(0, "mul", ins, outs, concat_att_map, place));
-  std::shared_ptr<imperative::OpBase> preceding_op(
-      OpBase::Create(0, "mul", ins, outs, concat_att_map, place));
-  op->InsertGradPendingOps(preceding_op.get());
-  *(op->GetMutableInsMap()) = ins;
-  *(op->GetMutableOutsMap()) = outs;
-  ASSERT_GT(op->GetInsMap().size(), 0);
-  ASSERT_GT(op->GetOutsMap().size(), 0);
-  ASSERT_GT(op->GradPendingOps().size(), 0);
-
-  op->ClearBackwardTrace();
-
-  ASSERT_EQ(op->GetInsMap().size(), 0);
-  ASSERT_EQ(op->GetOutsMap().size(), 0);
-  ASSERT_EQ(op->GradPendingOps().size(), 0);
-}
-
-TEST(test_layer, test_varbase_basic) {
-  platform::CPUPlace place;
-  std::shared_ptr<imperative::VarBase> vin(
-      new imperative::VarBase(false, "vin"));
-  vin->MutableVar()->GetMutable<framework::LoDTensor>()->mutable_data<float>(
-      place);
-  std::shared_ptr<imperative::VarBase> vout(vin->NewVarBase(place, false));
-  ASSERT_EQ(vout->Name(), "Itmp0");
-
-  std::shared_ptr<imperative::VarBase> vin_with_grad(
-      new imperative::VarBase(true, "vin"));
-  ASSERT_ANY_THROW(vin->MutableGradVar());
-  ASSERT_NO_THROW(ASSERT_TRUE(dynamic_cast<framework::Variable*>(
-                                  vin_with_grad->MutableGradVar()) != 0));
-  ASSERT_TRUE(
-      dynamic_cast<framework::Variable*>(vin_with_grad->MutableGradVar()) != 0);
-  vin_with_grad->SetOverridedStopGradient(false);
-  ASSERT_FALSE(vin_with_grad->OverridedStopGradient());
-  ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetPersistable(true));
-  ASSERT_FALSE(vin_with_grad->OverridedStopGradient());
-  ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetName("new_name"));
-  ASSERT_EQ(vin_with_grad->Name(), "new_name");
-}
-// TODO(jiabin): Add more ut here for layer
-
-}  // namespace imperative
-}  // namespace paddle
-
-USE_OP(mul);
diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc
deleted file mode 100644
index 1a30868da041eb0c7dc2d7ed9308871f231f5ab9..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/tests/test_prepare_op.cc
+++ /dev/null
@@ -1,216 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Created by Jiabin on 2019-08-19.
-//
-
-#include <paddle/fluid/framework/op_registry.h>
-#include <memory>
-#include <string>
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/imperative/prepared_operator.h"
-#include "paddle/fluid/imperative/type_defs.h"
-
-namespace imperative = paddle::imperative;
-namespace platform = paddle::platform;
-namespace framework = paddle::framework;
-
-namespace paddle {
-namespace imperative {
-
-static framework::RuntimeContext PrepareRuntimeContext(
-    const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
-  framework::VariableValueMap inputs, outputs;
-  for (auto& in_pair : ins) {
-    auto& in_ctx = inputs[in_pair.first];
-    in_ctx.reserve(in_pair.second.size());
-    for (auto& in_var : in_pair.second) {
-      in_ctx.emplace_back(in_var->MutableVar());
-    }
-  }
-
-  for (auto& out_pair : outs) {
-    auto& out_ctx = outputs[out_pair.first];
-    out_ctx.reserve(out_pair.second.size());
-    for (auto& out_var : out_pair.second) {
-      out_ctx.emplace_back(out_var->MutableVar());
-    }
-  }
-  return framework::RuntimeContext(std::move(inputs), std::move(outputs));
-}
-
-static framework::VariableNameMap CreateVarNameMap(
-    const framework::OpInfo& op_info, const std::string& op_type,
-    const NameVarBaseMap& varbase_map, bool is_input) {
-  if (op_info.proto_ == nullptr) {
-    return {};
-  }
-
-  framework::VariableNameMap result;
-
-  for (auto& var :
-       is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) {
-    auto it = varbase_map.find(var.name());
-    if (it == varbase_map.end()) {
-      PADDLE_ENFORCE_EQ(
-          var.dispensable(), true,
-          "Var: %s not dispensable and there are no such var in inputs",
-          var.name());
-      result[var.name()] = {};
-    } else {
-      auto& var_vector = it->second;
-      std::vector<std::string> args;
-      args.reserve(var_vector.size());
-      for (auto& var_base : var_vector) {
-        args.emplace_back(var_base->Name());
-      }
-      result[var.name()] = std::move(args);
-    }
-  }
-  return result;
-}
-
-using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
-
-using var_pair = std::pair<std::string, vb_vector>;
-
-TEST(test_prepare_op, test_prepare_op) {
-  std::shared_ptr<imperative::VarBase> vin(
-      new imperative::VarBase(false, "vin"));
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(false, "vout"));
-  framework::OpDesc desc;
-  platform::CPUPlace place;
-  vin->MutableVar()->GetMutable<framework::LoDTensor>()->mutable_data<float>(
-      place);
-  var_pair x_pair = var_pair("X", vb_vector(1, vin));
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  imperative::NameVarBaseMap ins = {x_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap split_attr_map;
-  const auto& info = framework::OpInfoMap::Instance().Get("split");
-  framework::VariableNameMap var_in_map =
-      CreateVarNameMap(info, "split", ins, true);
-  framework::VariableNameMap var_out_map =
-      CreateVarNameMap(info, "split", outs, false);
-  framework::OperatorWithKernel op("split", var_in_map, var_out_map,
-                                   split_attr_map);
-  framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
-  ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp =
-                              PreparedOp::Prepare(ctx, op, place, ins));
-}
-
-const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
-
-TEST(test_prepare_op, test_get_tensor_from_var) {
-  std::shared_ptr<imperative::VarBase> vout_error(
-      new imperative::VarBase(false, "vout_error"));
-  vout_error->MutableVar()->GetMutable<framework::SelectedRows>();
-  auto* ts = GetTensorFromVar(*vout_error->MutableVar());
-  ASSERT_TRUE(ts != nullptr);
-}
-#if defined(PADDLE_WITH_CUDA)
-TEST(test_prepare_op, test_prepare_data) {
-  std::shared_ptr<imperative::VarBase> vin(
-      new imperative::VarBase(false, "vin"));
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(false, "vout"));
-
-  framework::OpDesc desc;
-  platform::CPUPlace cpu_place;
-  platform::CUDAPlace gpu_place(0);
-  std::vector<float> src_data(10, 2.0);
-  std::vector<int64_t> dims = {2, 5};
-
-  // prepare an cpu only input
-  auto* vin_tensor = vin->MutableVar()->GetMutable<framework::LoDTensor>();
-  vin_tensor->Resize(framework::make_ddim(dims));
-  auto* vin_mutable_tensor = vin_tensor->mutable_data<float>(cpu_place);
-  paddle::memory::Copy(cpu_place, vin_mutable_tensor, cpu_place,
-                       src_data.data(), sizeof(float) * src_data.size());
-
-  var_pair x_pair = var_pair("X", vb_vector(1, vin));
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  imperative::NameVarBaseMap ins = {x_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap assign_attr_map;
-  const auto& info = framework::OpInfoMap::Instance().Get("assign");
-  framework::VariableNameMap var_in_map =
-      CreateVarNameMap(info, "assign", ins, true);
-  framework::VariableNameMap var_out_map =
-      CreateVarNameMap(info, "assign", outs, false);
-  framework::OperatorWithKernel assign_op("assign", var_in_map, var_out_map,
-                                          assign_attr_map);
-  framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
-
-  // test if it can be transformed to GPU place
-  PreparedOp prepared_op = PreparedOp::Prepare(ctx, assign_op, gpu_place, ins);
-  for (const auto& name_pair : ins) {
-    for (const auto& vb : name_pair.second) {
-      ASSERT_TRUE(platform::is_same_place(
-          vb->Var().Get<framework::LoDTensor>().place(), gpu_place));
-    }
-  }
-}
-#endif
-
-TEST(test_prepare_op, test_prepare_data_same_place) {
-  std::shared_ptr<imperative::VarBase> vin(
-      new imperative::VarBase(false, "vin"));
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(false, "vout"));
-
-  framework::OpDesc desc;
-  platform::CPUPlace cpu_place;
-  std::vector<float> src_data(10, 2.0);
-  std::vector<int64_t> dims = {2, 5};
-
-  // prepare an cpu only input
-  auto* vin_tensor = vin->MutableVar()->GetMutable<framework::LoDTensor>();
-  vin_tensor->Resize(framework::make_ddim(dims));
-  auto* vin_mutable_tensor = vin_tensor->mutable_data<float>(cpu_place);
-  paddle::memory::Copy(cpu_place, vin_mutable_tensor, cpu_place,
-                       src_data.data(), sizeof(float) * src_data.size());
-
-  var_pair x_pair = var_pair("X", vb_vector(1, vin));
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  imperative::NameVarBaseMap ins = {x_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap assign_attr_map;
-  const auto& info = framework::OpInfoMap::Instance().Get("assign");
-  framework::VariableNameMap var_in_map =
-      CreateVarNameMap(info, "assign", ins, true);
-  framework::VariableNameMap var_out_map =
-      CreateVarNameMap(info, "assign", outs, false);
-  framework::OperatorWithKernel assign_op("assign", var_in_map, var_out_map,
-                                          assign_attr_map);
-  framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
-
-  // test if it never transfered on GPU place
-  PreparedOp prepared_op = PreparedOp::Prepare(ctx, assign_op, cpu_place, ins);
-  for (const auto& name_pair : ins) {
-    for (const auto& vb : name_pair.second) {
-      ASSERT_TRUE(platform::is_same_place(
-          vb->Var().Get<framework::LoDTensor>().place(), cpu_place));
-    }
-  }
-}
-}  // namespace imperative
-}  // namespace paddle
-
-USE_OP(split);
-USE_OP(assign);
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
deleted file mode 100644
index f112b9fc1e90bf06dc78cd73b9315d279f4cc723..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ /dev/null
@@ -1,193 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Created by Jiabin on 2019-08-16.
-//
-
-#include <paddle/fluid/framework/op_registry.h>
-#include <memory>
-#include <string>
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/imperative/tracer.h"
-#include "paddle/fluid/memory/memcpy.h"
-
-namespace imperative = paddle::imperative;
-namespace platform = paddle::platform;
-namespace framework = paddle::framework;
-
-namespace paddle {
-namespace imperative {
-
-using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
-
-using var_pair = std::pair<std::string, vb_vector>;
-
-TEST(test_tracer, test_trace_op) {
-  // Doing an mul
-  imperative::Tracer tracer;
-  std::shared_ptr<imperative::VarBase> x_in(
-      new imperative::VarBase(true, "x_in"));
-  std::shared_ptr<imperative::VarBase> y_in(
-      new imperative::VarBase(true, "y_in"));
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(true, "vout"));
-  platform::CPUPlace place;
-  std::vector<float> src_data(10, 2.0);
-  std::vector<int64_t> dims1 = {2, 5};
-  std::vector<int64_t> dims2 = {5, 2};
-
-  auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
-  auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
-  x_in_tensor->Resize(framework::make_ddim(dims1));
-  auto* mutable_x = x_in_tensor->mutable_data<float>(place);
-  paddle::memory::Copy(place, mutable_x, place, src_data.data(),
-                       sizeof(float) * src_data.size());
-  y_in_tensor->Resize(framework::make_ddim(dims2));
-  auto* mutable_y = y_in_tensor->mutable_data<float>(place);
-  paddle::memory::Copy(place, mutable_y, place, src_data.data(),
-                       sizeof(float) * src_data.size());
-
-  var_pair x_pair = var_pair("X", vb_vector(1, x_in));
-  var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  imperative::NameVarBaseMap ins = {x_pair, y_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap mul_attr_map;
-  mul_attr_map["use_mkldnn"] = false;
-  tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true);
-  const auto& out_tensor = vout->Var().Get<framework::LoDTensor>();
-  for (size_t i = 0; i < vout->Var().Get<framework::LoDTensor>().numel(); i++) {
-    ASSERT_EQ(out_tensor.data<float>()[i], 20.0);
-  }
-}
-
-TEST(test_tracer, test_track_backward_output) {
-  // Doing an mul
-  imperative::Tracer tracer;
-  std::shared_ptr<imperative::VarBase> x_in(
-      new imperative::VarBase(true, "x_in"));
-  std::shared_ptr<imperative::VarBase> y_in(
-      new imperative::VarBase(false, "y_in"));
-  x_in->SetOverridedStopGradient(false);
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(true, "vout"));
-  platform::CPUPlace place;
-  std::vector<float> src_data(10, 2.0);
-  std::vector<int64_t> dims1 = {2, 5};
-  std::vector<int64_t> dims2 = {5, 2};
-
-  auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
-  auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
-  x_in_tensor->Resize(framework::make_ddim(dims1));
-  auto* mutable_x = x_in_tensor->mutable_data<float>(place);
-  paddle::memory::Copy(place, mutable_x, place, src_data.data(),
-                       sizeof(float) * src_data.size());
-  y_in_tensor->Resize(framework::make_ddim(dims2));
-  auto* mutable_y = y_in_tensor->mutable_data<float>(place);
-  paddle::memory::Copy(place, mutable_y, place, src_data.data(),
-                       sizeof(float) * src_data.size());
-
-  var_pair x_pair = var_pair("X", vb_vector(1, x_in));
-  var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  imperative::NameVarBaseMap ins = {x_pair, y_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap mul_attr_map;
-  mul_attr_map["use_mkldnn"] = false;
-  ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true));
-}
-
-TEST(test_tracer, test_track_backward_input) {
-  // Doing an mul
-  imperative::Tracer tracer;
-  std::shared_ptr<imperative::VarBase> x_in(
-      new imperative::VarBase(true, "x_in"));
-  std::shared_ptr<imperative::VarBase> y_in(
-      new imperative::VarBase(true, "y_in"));
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(false, "vout"));
-  platform::CPUPlace place;
-  x_in->SetOverridedStopGradient(false);
-  std::vector<float> src_data(10, 2.0);
-  std::vector<int64_t> dims1 = {2, 5};
-  std::vector<int64_t> dims2 = {5, 2};
-
-  auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
-  auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
-  x_in_tensor->Resize(framework::make_ddim(dims1));
-  auto* mutable_x = x_in_tensor->mutable_data<float>(place);
-  paddle::memory::Copy(place, mutable_x, place, src_data.data(),
-                       sizeof(float) * src_data.size());
-  y_in_tensor->Resize(framework::make_ddim(dims2));
-  auto* mutable_y = y_in_tensor->mutable_data<float>(place);
-  paddle::memory::Copy(place, mutable_y, place, src_data.data(),
-                       sizeof(float) * src_data.size());
-
-  var_pair x_pair = var_pair("X", vb_vector(1, x_in));
-  var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  imperative::NameVarBaseMap ins = {x_pair, y_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap mul_attr_map;
-  mul_attr_map["use_mkldnn"] = false;
-  ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true));
-}
-#if defined(PADDLE_WITH_CUDA)
-TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
-  // Doing an mul
-  imperative::Tracer tracer;
-  std::shared_ptr<imperative::VarBase> x_in(
-      new imperative::VarBase(true, "x_in"));
-  std::shared_ptr<imperative::VarBase> y_in(
-      new imperative::VarBase(true, "y_in"));
-  std::shared_ptr<imperative::VarBase> vout(
-      new imperative::VarBase(true, "vout"));
-  platform::CPUPlace place;
-  platform::CUDAPlace gpu_place(0);
-  std::vector<float> src_data(10, 2.0);
-  std::vector<int64_t> dims1 = {2, 5};
-  std::vector<int64_t> dims2 = {5, 2};
-
-  auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
-  auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
-  x_in_tensor->Resize(framework::make_ddim(dims1));
-  auto* mutable_x = x_in_tensor->mutable_data<float>(place);
-  paddle::memory::Copy(place, mutable_x, place, src_data.data(),
-                       sizeof(float) * src_data.size());
-  y_in_tensor->Resize(framework::make_ddim(dims2));
-  auto* mutable_y = y_in_tensor->mutable_data<float>(gpu_place);
-  paddle::memory::Copy(gpu_place, mutable_y, place, src_data.data(),
-                       sizeof(float) * src_data.size(), 0);
-  var_pair x_pair = var_pair("X", vb_vector(1, x_in));
-  var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
-  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
-  imperative::NameVarBaseMap ins = {x_pair, y_pair};
-  imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap mul_attr_map;
-  mul_attr_map["use_mkldnn"] = false;
-  tracer.TraceOp("mul", ins, outs, mul_attr_map, gpu_place, true);
-  framework::LoDTensor rlt;
-  framework::TensorCopySync(vout->Var().Get<framework::LoDTensor>(), place,
-                            &rlt);
-  for (size_t i = 0; i < rlt.numel(); i++) {
-    ASSERT_EQ(rlt.data<float>()[i], 20.0);
-  }
-}
-#endif
-}  // namespace imperative
-}  // namespace paddle
-
-USE_OP(mul);
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
deleted file mode 100644
index 0fff6b8cda3f022d96402f17b3436e103f2903bd..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/tracer.cc
+++ /dev/null
@@ -1,254 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/imperative/tracer.h"
-#include <unordered_set>
-#include <utility>
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace imperative {
-
-static std::vector<std::unique_ptr<framework::OpDesc>> CreateGradOpDescs(
-    const framework::OpInfo& op_info, const framework::OpDesc& op_desc,
-    const std::unordered_set<std::string>& no_grad_set,
-    const std::vector<framework::BlockDesc*>& grad_sub_block,
-    std::unordered_map<std::string, std::string>* grad_to_var) {
-  if (op_info.grad_op_maker_) {
-    return op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var,
-                                  grad_sub_block);
-  } else {
-    return {};
-  }
-}
-
-static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) {
-  for (const auto& name_pair : outs) {
-    for (const auto& vb : name_pair.second) {
-      VLOG(6) << "Set output: " << vb->Name() << "'s OverridedStopGradient as "
-              << generate_grad;
-      vb->InnerSetOverridedStopGradient(generate_grad);
-    }
-  }
-}
-
-void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
-                     const NameVarBaseMap& outs, framework::AttributeMap attrs,
-                     const platform::Place& place, bool trace_backward) {
-  platform::RecordEvent event(type);
-  VLOG(1) << "Trace Op: " << type;
-  size_t op_id = GenerateUniqueId();
-  auto op = OpBase::Create(op_id, type, ins, outs, std::move(attrs), place);
-  op->Run(ins, outs);
-
-  if (ComputeRequiredGrad(ins, outs, trace_backward)) {
-    TraceBackward(op, framework::OpDesc(op->Type(), op->InputNameMap(),
-                                        op->OutputNameMap(), op->Attrs()),
-                  ins, outs);
-  } else {
-    VLOG(3) << "No Grad to track for Op: " << type;
-  }
-}
-
-bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins,
-                                 const NameVarBaseMap& outs,
-                                 bool trace_backward) {
-  if (!trace_backward) return false;
-
-  for (const auto& name_pair : ins) {
-    for (const auto& var_base : name_pair.second) {
-      if (!var_base->OverridedStopGradient()) {
-        VLOG(6) << "Find out input: " << var_base->Name()
-                << "'s GeneratedGrad is True";
-        PassStopGradient(outs, var_base->OverridedStopGradient());
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-void Tracer::TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
-                           const framework::OpDesc& fwd_op_desc,
-                           const NameVarBaseMap& ins,
-                           const NameVarBaseMap& outs) {
-  // grad_to_var is a map of framework::GradVarName(in_var_name/out_var_name) ->
-  // in_var_name/out_var_name
-  std::unordered_map<std::string, std::string> grad_to_var;
-
-  // Get grad_op_desc using fwd_op_desc
-  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs_ =
-      CreateGradOpDescs(fwd_op->Info(), fwd_op_desc, {}, {}, &grad_to_var);
-
-  // Create grad_ops using grad_op_descs
-
-  size_t grad_op_num = grad_op_descs_.size();
-
-  VLOG(3) << "Create " << grad_op_num << " grad op desc(s) to op "
-          << fwd_op->Type();
-
-  if (grad_op_num == 0) {
-    return;
-  }
-  // Build a map to record var_name -> std::shared_ptr<VarBase>*,
-  // so that we can find suitable var in grad op descs
-  std::unordered_map<std::string, const std::shared_ptr<VarBase>*> name_to_var;
-  for (auto& pair : ins) {
-    for (auto& var : pair.second) {
-      auto& var_ptr = name_to_var[var->Name()];
-      PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(), true,
-                        "There are different variables with same name %s",
-                        var->Name());
-      var_ptr = &var;
-    }
-  }
-
-  for (auto& pair : outs) {
-    for (auto& var : pair.second) {
-      auto& var_ptr = name_to_var[var->Name()];
-      PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(), true,
-                        "There are different variables with same name %s",
-                        var->Name());
-      var_ptr = &var;
-    }
-  }
-
-  // Build backward ins and outs
-
-  for (size_t i = 0; i < grad_op_num; i++) {
-    // Step1: build grad op and add them to engine
-
-    // Use trace id to decide the order of gradient sum in sorted sum mode
-    size_t trace_id = fwd_op->id();
-    std::shared_ptr<OpBase> grad_op =
-        OpBase::Create(trace_id, (*(grad_op_descs_[i].get())), fwd_op->place());
-
-    // this OpBase* is just used to manage op's life time
-    engine_->InsertOp(grad_op.get(), grad_op);
-
-    std::unordered_set<OpBase*> visited_preceding_ops;
-    // Step2 : prepare grad_in vars and bind them with grad_op,
-    // set inputs' grad_op as current grad_op
-    for (const auto& grad_ins : grad_op_descs_[i]->Inputs()) {
-      if (grad_ins.second.empty()) continue;
-      auto& bwd_in = (*grad_op->GetMutableInsMap())[grad_ins.first];
-      bwd_in.reserve(grad_ins.second.size());
-
-      for (auto& grad_in_var_name : grad_ins.second) {
-        auto iter = grad_to_var.find(grad_in_var_name);
-
-        if (iter != grad_to_var.end()) {
-          // If it is a grad var, find its coresponding forward var
-          auto& fwd_var_name = iter->second;
-          auto fwd_var_iter = name_to_var.find(fwd_var_name);
-          PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
-                            "Cannot find forward variable named %s",
-                            fwd_var_name);
-          const auto& tmp = (*(fwd_var_iter->second))->GradVarBase();
-          PADDLE_ENFORCE_NOT_NULL(
-              tmp.get(),
-              "Grad of %s should "
-              "not be NULL when we Track_Backward Input of %s",
-              (*(fwd_var_iter->second))->Name(), grad_op->Type());
-          // Create grad_in's dim in tensor for Grad Dependency compute
-          auto* tensor = tmp->MutableVar()->GetMutable<framework::LoDTensor>();
-          tensor->Resize((*(fwd_var_iter->second))
-                             ->Var()
-                             .Get<framework::LoDTensor>()
-                             .dims());
-          // Add Grad Op for grad_in
-          tmp->AddGradOps(grad_op);
-          VLOG(3) << "Add Grad Op " << grad_op->Type() << " for :"
-                  << (*(fwd_var_iter->second))->GradVarBase()->Name();
-          // Add Grad var input to engine set
-          engine_->InsertGradVar(tmp.get());
-          VLOG(3) << "Add Grad: " << tmp->Name() << " in to Engine";
-          bwd_in.emplace_back((*(fwd_var_iter->second))->GradVarBase());
-        } else {
-          // If it is a forward var, just add it
-          auto fwd_var_iter = name_to_var.find(grad_in_var_name);
-          PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
-                            "Cannot find forward variable named %s",
-                            grad_in_var_name);
-          bwd_in.emplace_back(*(fwd_var_iter->second));
-        }
-        VLOG(3) << "Set backward input from fwd var" << grad_ins.first << " of "
-                << grad_op->Type() << " to be "
-                << (bwd_in.back() ? bwd_in.back()->Name() : "nullptr");
-      }
-    }
-
-    // Step3: prepare grad_out vars and using their grad_ops to set current
-    // grad_op's preceding op
-    for (auto& grad_outs : grad_op_descs_[i]->Outputs()) {
-      if (grad_outs.second.empty()) continue;
-      auto& bwd_out = (*grad_op->GetMutableOutsMap())[grad_outs.first];
-      bwd_out.reserve(grad_outs.second.size());
-
-      for (auto& grad_out_var_name : grad_outs.second) {
-        auto iter = grad_to_var.find(grad_out_var_name);
-        PADDLE_ENFORCE_EQ(iter != grad_to_var.end(), true,
-                          "Cannot find output of input grad %s in op %s",
-                          grad_out_var_name, fwd_op->Type());
-        auto fwd_var_iter = name_to_var.find(iter->second);
-        PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
-                          "Cannot find forward variable named %s",
-                          iter->second);
-        const auto& tmp = (*(fwd_var_iter->second))->GradVarBase();
-
-        PADDLE_ENFORCE_NOT_NULL(tmp.get(),
-                                "Grad output: %s of op: %s should not be NULL",
-                                (tmp->Name(), grad_op->Type()));
-
-        if ((!tmp->OverridedStopGradient()) || (grad_outs.second.size() > 1)) {
-          VLOG(3) << "Set backward output " << grad_outs.first << " of "
-                  << grad_op->Type() << " to be " << tmp->Name()
-                  << ". Its Overrided Stop_Gradient is: False";
-          bwd_out.emplace_back(tmp);
-          auto grad_pending_ops =
-              (*(fwd_var_iter->second))->GradVarBase()->GradOps();
-          if (VLOG_IS_ON(3) && !grad_pending_ops.empty()) {
-            VLOG(3) << "Add grad_pending Op of :"
-                    << (*(fwd_var_iter->second))->GradVarBase()->Name()
-                    << " It's grad_pending Op are: ";
-            for (const auto& op : grad_pending_ops) {
-              VLOG(3) << op->Type();
-            }
-          }
-          if (!grad_pending_ops.empty()) {
-            for (const auto& op : grad_pending_ops) {
-              PADDLE_ENFORCE_NOT_NULL(op,
-                                      "No nullptr should be grad_pending op");
-              if (visited_preceding_ops.count(op) == 0) {
-                visited_preceding_ops.insert(op);
-                grad_op->InsertGradPendingOps(op);
-              }
-            }
-          } else {
-            VLOG(5) << "Hit leaf VarBase"
-                    << (*(fwd_var_iter->second))->GradVarBase()->Name();
-          }
-        } else {
-          VLOG(3) << "Skip backward output " << grad_outs.first << " of "
-                  << grad_op->Type() << " Named: " << tmp->Name()
-                  << ", since its Overrided Stop_Gradient is: True";
-        }
-      }
-    }
-    // To ensure numeric stability as static graph
-    grad_op->SortGradPendingOps();
-  }
-}
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
deleted file mode 100644
index 9c24b65ee1603d41cc038c28560358d7c3c27bb0..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/tracer.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <atomic>
-#include <future>  // NOLINT
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ThreadPool.h"
-#include "paddle/fluid/imperative/engine.h"
-#include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace imperative {
-
-class Tracer {
-  DISABLE_COPY_AND_ASSIGN(Tracer);
-
- public:
-  Tracer() : engine_(new BasicEngine()) {}
-
-  ~Tracer() = default;
-
-  void TraceOp(const std::string& type, const NameVarBaseMap& ins,
-               const NameVarBaseMap& outs, framework::AttributeMap attrs,
-               const platform::Place& place, bool trace_bacward);
-
-  bool ComputeRequiredGrad(const NameVarBaseMap& ins,
-                           const NameVarBaseMap& outs, bool trace_backward);
-
-  void TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
-                     const framework::OpDesc& fwd_op_desc,
-                     const NameVarBaseMap& ins, const NameVarBaseMap& outs);
-  Engine* GetDefaultEngine() const { return engine_.get(); }
-
- private:
-  static size_t GenerateUniqueId() {
-    static std::atomic<size_t> id{0};
-    return id.fetch_add(1);
-  }
-
- private:
-  std::unique_ptr<Engine> engine_;
-};
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h
deleted file mode 100644
index 615b1b082d2e653dcf5e6cc22637ebb4a8fb495e..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/type_defs.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace imperative {
-
-class VarBase;
-class OpBase;
-class Tracer;
-
-using NameVarBaseMap =
-    std::map<std::string, std::vector<std::shared_ptr<VarBase>>>;
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
deleted file mode 100644
index d1db924e6b2161d7797dad1c3425188469ad573f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/CMakeLists.txt
+++ /dev/null
@@ -1,108 +0,0 @@
-if(WITH_TESTING)
-  include(tests/test.cmake) # some generic cmake funtion for inference
-endif()
-
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor data_feed_proto)
-
-# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
-cc_library(paddle_fluid_api
-    SRCS io.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-
-# analysis and tensorrt must be added before creating static library,
-# otherwise, there would be undefined reference to them in static library.
-add_subdirectory(analysis)
-add_subdirectory(utils)
-if (TENSORRT_FOUND)
-  add_subdirectory(tensorrt)
-endif()
-
-if (ANAKIN_SUBGRAPH)
-  add_subdirectory(anakin)
-endif()
-
-get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
-get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS)
-if (WIN32)
-list(APPEND fluid_third_partys gflags glog protobuf cblas)
-endif(WIN32)
-
-# paddle_fluid_origin exclude inference api interface
-if(WIN32)
-  sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
-else(WIN32)
-  cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
-endif(WIN32)
-
-add_subdirectory(api)
-
-if(WITH_MKLDNN)
-	set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/api/mkldnn_quantizer.cc)
-	set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
-endif()
-
-set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor)
-if (ANAKIN_FOUND)
-    set(ANAKIN_SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/api_anakin_engine.cc)
-endif()
-set(SHARED_INFERENCE_SRCS
-    io.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/data_feed.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/data_set.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/data_feed_factory.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/dataset_factory.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
-    ${mkldnn_quantizer_src}
-    ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
-    ${ANAKIN_SHARED_INFERENCE_SRCS})
-
-if(WIN32)
-  sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
-              analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
-else(WIN32)
-  cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS}
-             zero_copy_tensor reset_tensor_array analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
-endif(WIN32)
-
-if(NOT APPLE)
-  # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
-  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
-  set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-endif()
-
-# Create shared library
-if(WIN32)
-  sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-              DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-                   analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
-else(WIN32)
-  cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-             DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-                  analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
-endif()
-get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
-
-set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
-if(NOT APPLE AND NOT WIN32)
-  # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
-  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
-  set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  # check symbol hidden
-  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
-    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
-    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n"
-    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
-    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
-    "endif()\n")
-  add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
-    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
-    DEPENDS paddle_fluid_shared)
-  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
-endif()
-
-if(WITH_TESTING)
-    # tests/book depends the models that generated by python/paddle/fluid/tests/book
-  add_subdirectory(tests/book)
-  if(WITH_INFERENCE_API_TEST)
-    add_subdirectory(tests/api)
-  endif()
-endif()
diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt
deleted file mode 100644
index 8292af2225bbfd7ba8262bb1e56783c83d287e50..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-cc_library(anakin_engine SRCS engine.cc DEPS framework_proto boost)
-cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost)
-target_link_libraries(anakin_engine anakin anakin_saber_common)
-cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
-add_subdirectory(convert)
diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
deleted file mode 100644
index 67194c9ff2411bb51bf3db8c8d0d38d8be3d576b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
-elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc
-batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc
-detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc
-roi_align.cc shuffle_channel.cc helper.cc DEPS anakin_engine framework_proto
-scope op_registry gtest)
-
-cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
-cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv)
-cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter)
-cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling)
-cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split)
-cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split)
-cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op)
-cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter)
-cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax)
-cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op)
-cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op)
-cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op)
-cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op)
-cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op)
-cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS  anakin_op_converter sum_op selected_rows_functor)
-cc_test(test_anakin_affine_channel SRCS test_affine_channel_op.cc DEPS anakin_op_converter affine_channel_op)
diff --git a/paddle/fluid/inference/anakin/convert/activation.cc b/paddle/fluid/inference/anakin/convert/activation.cc
deleted file mode 100644
index 523571f1aa8b5a9d17f97a1fd765fe9f1ac95b22..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/activation.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/activation.h"
-#include <algorithm>
-#include <map>
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-ActivationOpConverter<TargetT, PrecisionT>::ActivationOpConverter(
-    const std::string &op_type)
-    : op_type_(op_type) {
-  auto it = anakin_op_types_.find(op_type_);
-  PADDLE_ENFORCE(it != anakin_op_types_.end(),
-                 "activation op type is not support");
-  anakin_op_type_ = it->second;
-}
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void ActivationOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  auto input_name = op_desc.Input("X").front();
-  auto output_name = op_desc.Output("Out").front();
-  this->engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
-  this->engine_->AddOpAttr(op_name, "type", anakin_op_type_);
-
-  if (op_type_ == "swish") {
-    float beta = boost::get<float>(op_desc.GetAttr("beta"));
-    this->engine_->AddOpAttr(op_name, "clip_relu_num", beta);
-  }
-  if (op_type_ == "relu6") {
-    float threshold = boost::get<float>(op_desc.GetAttr("threshold"));
-    this->engine_->AddOpAttr(op_name, "clip_relu_num", threshold);
-  }
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(swish, SwishOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(relu6, Relu6OpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/activation.h b/paddle/fluid/inference/anakin/convert/activation.h
deleted file mode 100644
index a2475e492c408008fd7b22815a03aedfd3d04650..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/activation.h
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class ActivationOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  explicit ActivationOpConverter(const std::string &op_type);
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~ActivationOpConverter() {}
-
- private:
-  std::string op_type_;
-  std::string anakin_op_type_;
-  std::map<std::string, std::string> anakin_op_types_{{"tanh", "TanH"},
-                                                      {"sigmoid", "Sigmoid"},
-                                                      {"relu6", "ClippedRelu"},
-                                                      {"swish", "Swish"}};
-};
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class TanhOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
- public:
-  TanhOpConverter() : ActivationOpConverter<TargetT, PrecisionT>("tanh") {}
-};
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class SigmoidOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
- public:
-  SigmoidOpConverter()
-      : ActivationOpConverter<TargetT, PrecisionT>("sigmoid") {}
-};
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class Relu6OpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
- public:
-  Relu6OpConverter() : ActivationOpConverter<TargetT, PrecisionT>("relu6") {}
-};
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class SwishOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
- public:
-  SwishOpConverter() : ActivationOpConverter<TargetT, PrecisionT>("swish") {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/affine_channel.cc b/paddle/fluid/inference/anakin/convert/affine_channel.cc
deleted file mode 100644
index 534e7dca81db959115283d65018ec33cc7a0924c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/affine_channel.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/affine_channel.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/inference/anakin/convert/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void AffineChannelOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  auto input_name = op_desc.Input("X").front();
-  auto output_name = op_desc.Output("Out").front();
-  this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name});
-
-  // Copy the Scale to CPUPlace and get the pointer.
-  auto *scale_v = scope.FindVar(op_desc.Input("Scale").front());
-  PADDLE_ENFORCE_NOT_NULL(scale_v);
-  auto weight1 = pblock_from_var<TargetT, PrecisionT>(*scale_v, this->engine_);
-  this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
-
-  // Copy the Bias to CPUPlace and get the pointer.
-  auto *bias_v = scope.FindVar(op_desc.Input("Bias").front());
-  PADDLE_ENFORCE_NOT_NULL(bias_v);
-  auto weight2 = pblock_from_var<TargetT, PrecisionT>(*bias_v, this->engine_);
-  this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(affine_channel, AffineChannelOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/affine_channel.h b/paddle/fluid/inference/anakin/convert/affine_channel.h
deleted file mode 100644
index 443f6101288af4ef6b82a9370f83b7b0c07e23c5..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/affine_channel.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class AffineChannelOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  AffineChannelOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~AffineChannelOpConverter() {}
-
- private:
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc
deleted file mode 100644
index b41f5dc925208d38ae233f0b3d1ca425537b9b47..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/batch_norm.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/batch_norm.h"
-#include <math.h>
-#include <algorithm>
-#include <map>
-#include <string>
-#include <vector>
-#include "paddle/fluid/inference/anakin/convert/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void BatchNormOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1);
-  std::map<std::string, std::string> inputs;
-  for (auto k : {"X", "Scale", "Bias", "Mean", "Variance"}) {
-    PADDLE_ENFORCE_EQ(op_desc.Input(k).size(), 1UL);
-  }
-
-  auto input = op_desc.Input("X").front();
-  auto output = op_desc.Output("Y").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
-  auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
-
-  auto bn_op_name = op_name + ":bn";
-  auto bn_output = bn_op_name + "_output";
-  this->engine_->AddOp(bn_op_name, "BatchNorm", {input}, {bn_output});
-  this->engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
-  this->engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
-
-  auto scale_op_name = op_name + ":scale";
-  this->engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
-  this->engine_->AddOpAttr(scale_op_name, "axis", 1);
-  this->engine_->AddOpAttr(scale_op_name, "num_axes", 1);
-  this->engine_->AddOpAttr(scale_op_name, "bias_term", true);
-
-  auto *mean_v = scope.FindVar(op_desc.Input("Mean").front());
-  PADDLE_ENFORCE_NOT_NULL(mean_v);
-  auto weight1 = pblock_from_var<TargetT, PrecisionT>(*mean_v, this->engine_);
-  this->engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
-
-  auto *variance_v = scope.FindVar(op_desc.Input("Variance").front());
-  PADDLE_ENFORCE_NOT_NULL(variance_v);
-  auto weight2 =
-      pblock_from_var<TargetT, PrecisionT>(*variance_v, this->engine_);
-  this->engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
-
-  auto *weight3 = pblock_from_vector<TargetT, PrecisionT>(
-      std::vector<float>({1}), this->engine_);
-  this->engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
-
-  auto *scale_v = scope.FindVar(op_desc.Input("Scale").front());
-  PADDLE_ENFORCE_NOT_NULL(scale_v);
-  auto scale = pblock_from_var<TargetT, PrecisionT>(*scale_v, this->engine_);
-  this->engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
-
-  auto *bias_v = scope.FindVar(op_desc.Input("Bias").front());
-  PADDLE_ENFORCE_NOT_NULL(bias_v);
-  auto bias = pblock_from_var<TargetT, PrecisionT>(*bias_v, this->engine_);
-  this->engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(batch_norm, BatchNormOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.h b/paddle/fluid/inference/anakin/convert/batch_norm.h
deleted file mode 100644
index 52156aeb0283af9419c77490bbaded5bb5f45f4b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/batch_norm.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class BatchNormOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  BatchNormOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~BatchNormOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc
deleted file mode 100644
index 584a82ead43fa75f0117cf524151bbd75cf54ba6..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/concat.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/concat.h"
-#include <algorithm>
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void ConcatOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  int axis = boost::get<int>(op_desc.GetAttr("axis"));
-  auto input_names = op_desc.Input("X");
-
-  auto y_name = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  this->engine_->AddOp(op_name, "Concat", input_names, {y_name});
-  this->engine_->AddOpAttr(op_name, "axis", axis);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(concat, ConcatOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/concat.h b/paddle/fluid/inference/anakin/convert/concat.h
deleted file mode 100644
index fb5514affa78d254476faf41bd09e21f41d2090d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/concat.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class ConcatOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  ConcatOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~ConcatOpConverter() {}
-
- private:
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc
deleted file mode 100644
index 7904d407c36b4c66f675c5dd9fd62f8fe91e1908..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/conv2d.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/conv2d.h"
-#include <algorithm>
-#include <memory>
-#include <vector>
-#include "paddle/fluid/inference/anakin/convert/helper.h"
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void Conv2dOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
-
-  auto input_name = op_desc.Input("Input").front();
-  auto output_name = op_desc.Output("Output").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
-  this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
-
-  auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
-  PADDLE_ENFORCE_NOT_NULL(filter_v);
-  auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace());
-  auto weight_shape = framework::vectorize<int>(weight_tensor->dims());
-
-  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
-
-  const int filter_h = weight_tensor->dims()[2];
-  const int filter_w = weight_tensor->dims()[3];
-
-  auto filter_num = weight_tensor->dims()[0];
-  this->engine_->template AddOpAttr<int>(op_name, "filter_num", filter_num);
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "kernel_size",
-                                                 {filter_h, filter_w});
-  auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
-  auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
-  auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dilation_rate",
-                                                 dilations);
-  const int groups = boost::get<int>(op_desc.GetAttr("groups"));
-  this->engine_->AddOpAttr(op_name, "group", groups);
-  this->engine_->AddOpAttr(op_name, "axis", 1);
-  this->engine_->AddOpAttr(op_name, "bias_term", false);
-
-  ::anakin::saber::Shape anakin_shape(weight_shape);
-  bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
-
-  if (enable_int8) {
-    const float int8_range = 127.;
-    float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
-    auto weight_scale =
-        boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
-    PBlock<TargetT> *weight1 =
-        new PBlock<TargetT>(anakin_shape, ::anakin::AK_INT8);
-    this->engine_->RegistBlock(weight1);
-    float *weight_data = weight_tensor->data<float>();
-    std::vector<char> weight_int8;
-    int weight_num = weight_tensor->numel();
-    for (int i = 0; i < weight_tensor->numel(); i++) {
-      bool is_valid_int8 =
-          ((weight_data[i] >= -128) && (weight_data[i] <= 127));
-      PADDLE_ENFORCE(is_valid_int8,
-                     "We are in anakin subgraph int8 mode, the weight of conv "
-                     "should be in range [-128, 127]");
-      weight_int8.push_back(static_cast<char>(weight_data[i]));
-    }
-    memcpy(static_cast<void *>(weight1->h_tensor().mutable_data()),
-           static_cast<void *>(weight_int8.data()), sizeof(char) * weight_num);
-    weight1->d_tensor().set_shape(anakin_shape);
-    weight1->d_tensor().copy_from(weight1->h_tensor());
-    this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
-    this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8);
-    this->engine_->Graph()->SetWeightsScale(
-        op_name, {weight_scale[0] / int8_range}, false);
-    this->engine_->AddTensorScale(input_name, in_scale / int8_range);
-  } else {
-    auto *weight1 = pblock_from_tensor<TargetT, PrecisionT>(
-        *weight_tensor, weight_shape, this->engine_);
-    this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
-  }
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(conv2d, Conv2dOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/conv2d.h b/paddle/fluid/inference/anakin/convert/conv2d.h
deleted file mode 100644
index b22cb8ea9318cf75deacc681e3c1e7b271d1f86b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/conv2d.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class Conv2dOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  Conv2dOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~Conv2dOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
deleted file mode 100644
index 4d7a502dee341b44236749813bb982be8a003af7..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/conv2d_fusion.h"
-#include <algorithm>
-#include <memory>
-#include <vector>
-#include "paddle/fluid/inference/anakin/convert/helper.h"
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void Conv2dFusionOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1UL);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
-
-  auto input_name = op_desc.Input("Input").front();
-  auto output_name = op_desc.Output("Output").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
-  this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
-
-  auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
-  PADDLE_ENFORCE_NOT_NULL(filter_v);
-
-  auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace());
-  auto weight_shape = framework::vectorize<int>(weight_tensor->dims());
-
-  auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
-  PADDLE_ENFORCE_NOT_NULL(b_v);
-
-  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
-  const int filter_h = weight_tensor->dims()[2];
-  const int filter_w = weight_tensor->dims()[3];
-  auto filter_num = weight_tensor->dims()[0];
-  this->engine_->template AddOpAttr<int>(op_name, "filter_num", filter_num);
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "kernel_size",
-                                                 {filter_h, filter_w});
-  auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
-  auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
-  auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dilation_rate",
-                                                 dilations);
-  const int groups = boost::get<int>(op_desc.GetAttr("groups"));
-  this->engine_->AddOpAttr(op_name, "group", groups);
-  this->engine_->AddOpAttr(op_name, "axis", 1);
-  this->engine_->AddOpAttr(op_name, "bias_term", true);
-
-  ::anakin::saber::Shape anakin_shape(weight_shape);
-  bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
-  if (enable_int8) {
-    const float int8_range = 127.;
-    float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
-    auto weight_scale =
-        boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
-    PBlock<TargetT> *weight1 =
-        new PBlock<TargetT>(anakin_shape, ::anakin::AK_INT8);
-    this->engine_->RegistBlock(weight1);
-    float *weight_data = weight_tensor->data<float>();
-    std::vector<char> weight_int8;
-    int weight_num = weight_tensor->numel();
-    for (int i = 0; i < weight_tensor->numel(); i++) {
-      bool is_valid_int8 =
-          ((weight_data[i] >= -128) && (weight_data[i] <= 127));
-      PADDLE_ENFORCE(is_valid_int8,
-                     "We are in anakin subgraph int8 mode, the weight of conv "
-                     "should be in range [-128, 127]");
-      weight_int8.push_back(static_cast<char>(weight_data[i]));
-    }
-    memcpy(static_cast<void *>(weight1->h_tensor().mutable_data()),
-           static_cast<void *>(weight_int8.data()), sizeof(char) * weight_num);
-    weight1->d_tensor().set_shape(anakin_shape);
-    weight1->d_tensor().copy_from(weight1->h_tensor());
-    this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
-    this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8);
-    this->engine_->Graph()->SetWeightsScale(
-        op_name, {weight_scale[0] / int8_range}, false);
-    this->engine_->AddTensorScale(input_name, in_scale / int8_range);
-  } else {
-    auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace());
-    auto weight_shape = framework::vectorize<int>(weight_tensor->dims());
-    auto *weight1 = pblock_from_tensor<TargetT, PrecisionT>(
-        *weight_tensor, weight_shape, this->engine_);
-    this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
-    auto weight2 = pblock_from_var<TargetT, PrecisionT>(*b_v, this->engine_);
-    this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
-  }
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
deleted file mode 100644
index 768814d3f996dd5c7224d5aebcbed9d430439ab5..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class Conv2dFusionOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  Conv2dFusionOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~Conv2dFusionOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
deleted file mode 100644
index 5bbaeb57a7da46adfaa47fb696b4b73c8e33c7f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/density_prior_box.h"
-#include <algorithm>
-#include <map>
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void DensityPriorBoxOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc,
-    const framework::Scope& scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  auto input_name = op_desc.Input("Input").front();
-  auto image_name = op_desc.Input("Image").front();
-  auto output_name = op_desc.Output("Boxes").front();
-  auto op_type = op_desc.Type();
-  auto op_name = op_type + ":" + op_desc.Output("Boxes").front();
-
-  // only for density_prior_box
-  std::vector<float> fixed_sizes = {};
-  std::vector<float> fixed_ratios = {};
-  std::vector<int> densities = {};
-
-  std::vector<float> min_sizes = {};
-  std::vector<float> max_sizes = {};
-  std::vector<float> aspect_ratios = {};
-  bool is_clip = false;
-  bool is_flip = false;
-
-  if (op_type == "density_prior_box") {
-    fixed_sizes =
-        boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
-    fixed_ratios =
-        boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
-    densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
-    is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
-  } else if (op_type == "prior_box") {
-    min_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("min_sizes"));
-    max_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("max_sizes"));
-    aspect_ratios =
-        boost::get<std::vector<float>>(op_desc.GetAttr("aspect_ratios"));
-    is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
-    is_flip = boost::get<bool>(op_desc.GetAttr("flip"));
-  }
-  std::vector<float> dens;
-  for (auto& ele : densities) {
-    dens.push_back(static_cast<float>(ele));
-  }
-
-  auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
-
-  // lack img_h, img_w
-  auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
-  auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
-  auto offset = boost::get<float>(op_desc.GetAttr("offset"));
-  PTuple<std::string> t_order;
-  t_order.push_back("MIN");
-  t_order.push_back("COM");
-  t_order.push_back("MAX");
-
-  std::vector<float> temp_v = {};
-
-  this->engine_->AddOp(op_name, "PriorBox", {input_name, image_name},
-                       {output_name});
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "min_size",
-                                                   min_sizes);
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "max_size",
-                                                   max_sizes);
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "aspect_ratio",
-                                                   aspect_ratios);
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "fixed_size",
-                                                   fixed_sizes);
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "fixed_ratio",
-                                                   fixed_ratios);
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "density", dens);
-  this->engine_->AddOpAttr(op_name, "is_flip", is_flip);
-  this->engine_->AddOpAttr(op_name, "is_clip", is_clip);
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "variance",
-                                                   variances);
-  this->engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
-  this->engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
-  this->engine_->AddOpAttr(op_name, "step_h", step_h);
-  this->engine_->AddOpAttr(op_name, "step_w", step_w);
-  this->engine_->AddOpAttr(op_name, "offset", offset);
-  this->engine_->template AddOpAttr<PTuple<std::string>>(op_name, "order",
-                                                         t_order);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(prior_box, DensityPriorBoxOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.h b/paddle/fluid/inference/anakin/convert/density_prior_box.h
deleted file mode 100644
index 5714f57a04b7b34581f4deb5cbdd2eb4318ba72c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class DensityPriorBoxOpConverter
-    : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  DensityPriorBoxOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~DensityPriorBoxOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc
deleted file mode 100644
index 73dd6f2832541ecda119702f1779363e2950e413..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/detection_out.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/detection_out.h"
-#include <algorithm>
-#include <map>
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void DetectionOutOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  auto target_name = op_desc.Input("TargetBox").front();
-  auto prior_box_name = op_desc.Input("PriorBox").front();
-  auto scores_name = op_desc.Input("Scores").front();
-  auto output_name = op_desc.Output("Out").front();
-
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  auto code_type = boost::get<std::string>(op_desc.GetAttr("code_type"));
-  auto background_label = boost::get<int>(op_desc.GetAttr("background_label"));
-  auto score_threshold = boost::get<float>(op_desc.GetAttr("score_threshold"));
-  auto nms_top_k = boost::get<int>(op_desc.GetAttr("nms_top_k"));
-  auto nms_threshold = boost::get<float>(op_desc.GetAttr("nms_threshold"));
-  auto nms_eta = boost::get<float>(op_desc.GetAttr("nms_eta"));
-  auto keep_top_k = boost::get<int>(op_desc.GetAttr("keep_top_k"));
-  std::string anakin_code_type;
-  if (code_type == "decode_center_size") {
-    anakin_code_type = "CENTER_SIZE";
-  } else if (code_type == "encode_center_size") {
-    PADDLE_THROW(
-        "Not support encode_center_size code_type in DetectionOut of anakin");
-  }
-
-  this->engine_->AddOp(op_name, "DetectionOutput",
-                       {target_name, scores_name, prior_box_name},
-                       {output_name});
-  this->engine_->AddOpAttr(op_name, "share_location", true);
-  this->engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
-  this->engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
-  this->engine_->AddOpAttr(op_name, "background_id", background_label);
-  this->engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
-  this->engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
-  this->engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
-  this->engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
-  this->engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
-  this->engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/detection_out.h b/paddle/fluid/inference/anakin/convert/detection_out.h
deleted file mode 100644
index c34342a66c1c6c42585c4cb92d64ed3964f7f427..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/detection_out.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class DetectionOutOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  DetectionOutOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~DetectionOutOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/dropout.cc b/paddle/fluid/inference/anakin/convert/dropout.cc
deleted file mode 100644
index 6c5f80b5f8e07fd501348632ff3b4cda58de248c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/dropout.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/dropout.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/inference/anakin/convert/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void DropoutOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Mask").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto x_name = op_desc.Input("X").front();
-  auto out_name = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  this->engine_->AddOp(op_name, "Scale", {x_name}, {out_name});
-
-  auto dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
-  auto factor = 1 - dropout_prob;
-  auto *weight1 = pblock_from_vector<TargetT, PrecisionT>(
-      std::vector<float>({factor}), this->engine_);
-
-  this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
-  this->engine_->AddOpAttr(op_name, "axis", 0);
-  this->engine_->AddOpAttr(op_name, "num_axes", 0);
-  this->engine_->AddOpAttr(op_name, "bias_term", false);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(dropout, DropoutOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/dropout.h b/paddle/fluid/inference/anakin/convert/dropout.h
deleted file mode 100644
index 801aa3dd16f8504360c75e251896f6bd4718925b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/dropout.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class DropoutOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  DropoutOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~DropoutOpConverter() {}
-
- private:
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc
deleted file mode 100644
index d221f26e11934d6851972d304c6f859346fe3b61..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/elementwise.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/elementwise.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void ElementwiseAddOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto x_name = op_desc.Input("X").front();
-  auto y_name = op_desc.Input("Y").front();
-  auto out_name = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
-  std::string elementwise_type = "Add";
-  this->engine_->template AddOpAttr<std::string>(op_name, "type",
-                                                 elementwise_type);
-  std::vector<float> coeff = {1.0, 1.0};
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
-}
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void ElementwiseMulOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto x_name = op_desc.Input("X").front();
-  auto y_name = op_desc.Input("Y").front();
-  auto out_name = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
-  std::string elementwise_type = "Mul";
-  this->engine_->template AddOpAttr<std::string>(op_name, "type",
-                                                 elementwise_type);
-  std::vector<float> coeff = {1.0, 1.0};
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(elementwise_mul, ElementwiseMulOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/elementwise.h b/paddle/fluid/inference/anakin/convert/elementwise.h
deleted file mode 100644
index 190a8b55f0e3c29e9e9c8b254d4b4df824c3330b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/elementwise.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class ElementwiseAddOpConverter
-    : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  ElementwiseAddOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~ElementwiseAddOpConverter() {}
-
- private:
-};
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class ElementwiseMulOpConverter
-    : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  ElementwiseMulOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~ElementwiseMulOpConverter() {}
-
- private:
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc
deleted file mode 100644
index 265d318aaee20d14350da3fb32a02ea66373c7fc..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/fc.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/fc.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/inference/anakin/convert/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void FcBaseOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  auto input_names = op_desc.InputNames();
-  bool with_bias = input_names.size() >= 3;
-
-  std::string w_name = "Y";
-  std::string i_name = "X";
-  if (with_bias) {
-    w_name = "W";
-    i_name = "Input";
-  }
-
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  // get weights
-  auto *y_v = scope.FindVar(op_desc.Input(w_name).front());
-  PADDLE_ENFORCE_NOT_NULL(y_v);
-  auto weight_tensor = tensor_from_var(*y_v, platform::CPUPlace());
-  auto weight_shape = framework::vectorize<int>(weight_tensor->dims());
-
-  int out_dim = weight_shape[1];
-  const int w_m = weight_shape[0];
-  const int w_k = weight_shape[1];
-
-  auto input_name = op_desc.Input(i_name).front();
-  auto output_name = op_desc.Output("Out").front();
-
-  this->engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
-  this->engine_->AddOpAttr(op_name, "bias_term", with_bias);
-  this->engine_->AddOpAttr(op_name, "axis", 1);
-  this->engine_->AddOpAttr(op_name, "out_dim", out_dim);
-
-  auto *weight_data = weight_tensor->data<float>();
-  PADDLE_ENFORCE(w_m * w_k == weight_tensor->numel());
-
-  std::vector<float> trans_weight_data(weight_tensor->numel());
-  for (int i = 0; i < w_m; i++) {
-    for (int j = 0; j < w_k; j++) {
-      trans_weight_data[i + j * w_m] = weight_data[i * w_k + j];
-    }
-  }
-
-  int weight_num = weight_tensor->numel();
-  bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
-  if (enable_int8) {
-    if (weight_shape.size() < 4UL) {
-      weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1);
-    }
-    ::anakin::saber::Shape anakin_shape(weight_shape);
-    const float int8_range = 127.;
-    float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
-    auto weight_scale =
-        boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
-    PBlock<TargetT> *weight1 =
-        new PBlock<TargetT>(anakin_shape, ::anakin::AK_INT8);
-    this->engine_->RegistBlock(weight1);
-    std::vector<char> weight_int8;
-    for (int i = 0; i < weight_num; i++) {
-      bool is_valid_int8 =
-          ((trans_weight_data[i] >= -128) && (trans_weight_data[i] <= 127));
-      PADDLE_ENFORCE(is_valid_int8,
-                     "We are in anakin subgraph int8 mode, the weight of fc "
-                     "should be in range [-128, 127]");
-      weight_int8.push_back(static_cast<char>(trans_weight_data[i]));
-    }
-    memcpy(static_cast<void *>(weight1->h_tensor().mutable_data()),
-           static_cast<void *>(weight_int8.data()), sizeof(char) * weight_num);
-    weight1->d_tensor().set_shape(anakin_shape);
-    weight1->d_tensor().copy_from(weight1->h_tensor());
-    this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
-    this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8);
-    this->engine_->Graph()->SetWeightsScale(
-        op_name, {weight_scale[0] / int8_range}, false);
-    this->engine_->AddTensorScale(input_name, in_scale / int8_range);
-  } else {
-    auto *weight1 = pblock_from_vector<TargetT, PrecisionT>(trans_weight_data,
-                                                            this->engine_);
-    this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
-  }
-
-  // get bias
-  if (with_bias) {
-    auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
-    PADDLE_ENFORCE_NOT_NULL(b_v);
-    auto weight2 = pblock_from_var<TargetT, PrecisionT>(*b_v, this->engine_);
-    this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
-  }
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h
deleted file mode 100644
index 6fe65e3ecd4ec42b6a1b5d874d0306cfba86c8b2..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/fc.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class FcBaseOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  FcBaseOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~FcBaseOpConverter() {}
-};
-
-// with bias
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class FcOpConverter : public FcBaseOpConverter<TargetT, PrecisionT> {
- public:
-  FcOpConverter() = default;
-};
-
-// without bias
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class MulOpConverter : public FcBaseOpConverter<TargetT, PrecisionT> {
- public:
-  MulOpConverter() = default;
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc
deleted file mode 100644
index 7ce519a4de36c950bef1b4e856452828398aa57e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/flatten.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/flatten.h"
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void FlattenOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
-
-  auto input = op_desc.Input("X").front();
-  auto output = op_desc.Output("Out").front();
-  int axis = boost::get<int>(op_desc.GetAttr("axis"));
-  PADDLE_ENFORCE(axis == 1,
-                 "the anakin flatten op converter now only support aixs == 1.");
-
-  std::vector<int> out_dims = {0, -1, 1, 1};
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  this->engine_->AddOp(op_name, "Reshape", {input}, {output});
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/flatten.h b/paddle/fluid/inference/anakin/convert/flatten.h
deleted file mode 100644
index 6e5e059927d4d35cb28e383041a1c6ce1d59b282..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/flatten.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class FlattenOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  FlattenOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~FlattenOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/helper.cc b/paddle/fluid/inference/anakin/convert/helper.cc
deleted file mode 100644
index 7804619bf836d93beae5dba9b561da273936c381..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/helper.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-std::unique_ptr<framework::LoDTensor> tensor_from_var(
-    const framework::Variable& var, const platform::Place& place) {
-  auto& src = var.Get<framework::LoDTensor>();
-  std::unique_ptr<framework::LoDTensor> dst(new framework::LoDTensor());
-  dst->Resize(src.dims());
-  TensorCopySync((src), place, dst.get());
-  return dst;
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/helper.h b/paddle/fluid/inference/anakin/convert/helper.h
deleted file mode 100644
index 6ba8fd6ceb5e7850263da224e07e08d88ec2992b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/helper.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/inference/anakin/engine.h"
-
-#include "framework/core/net/net.h"
-#include "framework/core/types.h"
-#include "framework/graph/graph.h"
-#include "framework/graph/graph_global_mem.h"
-#include "saber/saber_types.h"
-
-using anakin::saber::Shape;
-using anakin::AK_FLOAT;
-using anakin::AK_INT8;
-using anakin::PBlock;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-std::unique_ptr<framework::LoDTensor> tensor_from_var(
-    const framework::Variable& var, const platform::Place& place);
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-PBlock<TargetT>* pblock_from_tensor(const framework::LoDTensor& tensor,
-                                    std::vector<int> shape_vec,
-                                    AnakinEngine<TargetT, PrecisionT>* engine) {
-  while (shape_vec.size() < 4) {
-    shape_vec.insert(shape_vec.begin(), 1);
-  }
-  Shape shape(shape_vec);
-  PBlock<TargetT>* weight = new PBlock<TargetT>(shape, AK_FLOAT);
-  engine->RegistBlock(weight);
-  float* cpu_data = static_cast<float*>(weight->h_tensor().mutable_data());
-  std::copy_n(tensor.data<float>(), tensor.numel(), cpu_data);
-  weight->d_tensor().set_shape(shape);
-  weight->d_tensor().copy_from(weight->h_tensor());
-  return weight;
-}
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-PBlock<TargetT>* pblock_from_vector(const std::vector<float>& vec,
-                                    std::vector<int> shape_vec,
-                                    AnakinEngine<TargetT, PrecisionT>* engine) {
-  while (shape_vec.size() < 4) {
-    shape_vec.insert(shape_vec.begin(), 1);
-  }
-  Shape shape(shape_vec);
-  PBlock<TargetT>* weight = new PBlock<TargetT>(shape, AK_FLOAT);
-  engine->RegistBlock(weight);
-  auto* weight_data = static_cast<float*>(weight->h_tensor().mutable_data());
-  std::copy(std::begin(vec), std::end(vec), weight_data);
-  weight->d_tensor().set_shape(shape);
-  weight->d_tensor().copy_from(weight->h_tensor());
-  return weight;
-}
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-PBlock<TargetT>* pblock_from_vector(const std::vector<float>& vec,
-                                    AnakinEngine<TargetT, PrecisionT>* engine) {
-  int size = vec.size();
-  return pblock_from_vector<TargetT, PrecisionT>(
-      vec, std::vector<int>({1, 1, 1, size}), engine);
-}
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-PBlock<TargetT>* pblock_from_var(const framework::Variable& var,
-                                 AnakinEngine<TargetT, PrecisionT>* engine) {
-  auto tensor = tensor_from_var(var, platform::CPUPlace());
-  auto shape = framework::vectorize<int>(tensor->dims());
-  return pblock_from_tensor<TargetT, PrecisionT>(*tensor, shape, engine);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.cc b/paddle/fluid/inference/anakin/convert/im2sequence.cc
deleted file mode 100644
index 5a4e3e61c5e4e40d8fe98fba8e098d89d916dde1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/im2sequence.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/im2sequence.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void Im2SequenceConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 0);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto x_name = op_desc.Input("X").front();
-  auto out_name = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  this->engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name});
-
-  std::vector<int> dilations = {1, 1};
-  auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-  auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-  auto kernels = boost::get<std::vector<int>>(op_desc.GetAttr("kernels"));
-
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "paddings", paddings);
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "window_size",
-                                                 kernels);
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dilations",
-                                                 dilations);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(im2sequence, Im2SequenceConverter);
diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.h b/paddle/fluid/inference/anakin/convert/im2sequence.h
deleted file mode 100644
index 8241d4d6f9ce78f57753bb7300e2fe968da4a927..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/im2sequence.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class Im2SequenceConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  Im2SequenceConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~Im2SequenceConverter() {}
-
- private:
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
deleted file mode 100644
index 9692f42779c8f23a4918761e859bb3e28f9a09e9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "framework/core/types.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/anakin/engine.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-#include "saber/saber_types.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class AnakinOpConverter {
-  using AnakinEngineT = AnakinEngine<TargetT, PrecisionT>;
-
- public:
-  AnakinOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope, bool test_mode) {}
-  void ConvertOp(const framework::proto::OpDesc &op,
-                 const framework::BlockDesc &block_desc,
-                 const std::unordered_set<std::string> &parameters,
-                 const framework::Scope &scope, AnakinEngineT *engine,
-                 bool test_mode = false) {
-    framework::OpDesc op_desc(op, nullptr);
-    std::string op_type = op_desc.Type();
-    AnakinOpConverter *it = nullptr;
-    if (op_type == "depthwise_conv2d") op_type = "conv2d";
-    if (op_type == "reshape2") op_type = "reshape";
-    if (op_type == "transpose2") op_type = "transpose";
-    if (op_type == "flatten2") op_type = "flatten";
-
-    if (!it) {
-      it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
-    }
-    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type);
-    it->SetEngine(engine);
-    (*it)(op, block_desc, scope, test_mode);
-  }
-
-  void ConvertBlock(framework::BlockDesc *block_desc,
-                    const std::unordered_set<std::string> &parameters,
-                    const framework::Scope &scope, AnakinEngineT *engine) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    framework::proto::BlockDesc *block = block_desc->Proto();
-    for (auto i = 0; i < block->ops_size(); i++) {
-      auto &op = block->ops(i);
-      ConvertOp(op, *block_desc, parameters, scope, engine);
-    }
-  }
-
-  // The scope  here should be inited with the parameter vars.
-  void ConvertBlockToAnakinEngine(
-      framework::BlockDesc *block_desc, framework::Scope *scope,
-      const std::vector<std::string> &inputs,
-      const std::unordered_set<std::string> &parameters,
-      const std::vector<std::string> &outputs, AnakinEngineT *engine) {
-    ConvertBlock(block_desc, parameters, *scope, engine);
-    // if the max_batch size
-    int max_batch_size = engine->GetMaxBatchSize();
-    PADDLE_ENFORCE(max_batch_size > 0,
-                   "the max_batch_size setted from config->EnableAnakinEngine "
-                   "must largger than 0");
-    // If the user does not specify this variable, we use the input shape from
-    // the block_desc.
-    auto max_input_shape = engine->GetMaxInputShape();
-    std::map<std::string, std::vector<int>> temp_max_input_shape;
-    // Register outputs with anakin using the RegistVar interface before Freeze.
-    // Note that RegistVar's parameters can only be outputs, not inputs.
-    for (auto &output : outputs) {
-      engine->Graph()->RegistVar(output);
-    }
-    engine->Freeze();
-    // Add scale for tensor in int8 mode.
-    auto tensor_scales = engine->GetTensorScales();
-
-    for (auto &item : tensor_scales) {
-      engine->Graph()->SetVarScale(item.first, item.second);
-    }
-
-    for (auto &input : inputs) {
-      if (parameters.count(input)) continue;
-      std::vector<int> input_shape;
-      input_shape.resize(4);
-      input_shape[0] = max_batch_size;
-      if (max_input_shape.count(input)) {
-        PADDLE_ENFORCE(max_input_shape[input].size() == 4,
-                       "the dimensions of max_input_shape setted from "
-                       "config->EnableAnakinEngine must be 4");
-        for (int i = 1; i < 4; i++) {
-          input_shape[i] = max_input_shape[input][i];
-        }
-      } else {
-        auto *var = block_desc->FindVar(input);
-        PADDLE_ENFORCE(var, "no variable called %s", input);
-
-        auto var_shape = var->GetShape();
-        std::cout << "input :" << input << std::endl;
-        PADDLE_ENFORCE(var_shape.size() == 4);
-
-        for (size_t i = 1; i < var_shape.size(); i++) {
-          input_shape[i] = var_shape[i];
-        }
-      }
-      temp_max_input_shape[input] = input_shape;
-      engine->SetInputShape(input, input_shape);
-    }
-    engine->SetMaxInputShape(temp_max_input_shape);
-    engine->Optimize();
-    engine->InitNet();
-  }
-
-  void SetEngine(AnakinEngineT *engine) { engine_ = engine; }
-  virtual ~AnakinOpConverter() {}
-
- protected:
-  bool test_mode_;
-  AnakinEngineT *engine_{nullptr};
-
- private:
-  std::unordered_map<std::string, AnakinOpConverter<TargetT, PrecisionT> *>
-      converters_;
-  framework::Scope *scope_{nullptr};
-  std::mutex mutex_;
-};
-
-template class AnakinOpConverter<::anakin::saber::NV,
-                                 ::anakin::Precision::FP32>;
-template class AnakinOpConverter<::anakin::saber::NV,
-                                 ::anakin::Precision::INT8>;
-#ifdef ANAKIN_X86_PLACE
-template class AnakinOpConverter<::anakin::saber::X86,
-                                 ::anakin::Precision::FP32>;
-template class AnakinOpConverter<::anakin::saber::X86,
-                                 ::anakin::Precision::INT8>;
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-#define REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__,              \
-                                          place_type__, place_class__,         \
-                                          precision_type__, precision_class__) \
-  struct anakin_##op_type__##_##place_type__##_##precision_type__##_converter  \
-      : public ::paddle::framework::Registrar {                                \
-    anakin_##op_type__##_##place_type__##_##precision_type__##_converter() {   \
-      LOG(INFO) << "register convert " << #op_type__ << " ";                   \
-      ::paddle::inference::Registry<                                           \
-          ::paddle::inference::anakin::AnakinOpConverter<                      \
-              place_class__, precision_class__>>::Global()                     \
-          .Register<Converter__>(#op_type__);                                  \
-    }                                                                          \
-  };                                                                           \
-  anakin_##op_type__##_##place_type__##_##precision_type__##_converter         \
-      anakin_##op_type__##_##place_type__##_##precision_type__##_converter__;  \
-  int Touch_anakin_##op_type__##_##place_type__##_##precision_type__() {       \
-    anakin_##op_type__##_##place_type__##_##precision_type__##_converter__     \
-        .Touch();                                                              \
-    return 0;                                                                  \
-  }
-
-#define WRAP(...) __VA_ARGS__
-
-#define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__,       \
-                                          precision_type__)             \
-  REGISTER_ANAKIN_OP_CONVERTER_BASE(                                    \
-      op_type__,                                                        \
-      ::paddle::inference::anakin::Converter__<WRAP(                    \
-          ::anakin::saber::NV, ::anakin::Precision::precision_type__)>, \
-      CUDA, ::anakin::saber::NV, precision_type__,                      \
-      ::anakin::Precision::precision_type__)
-
-#define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__,         \
-                                         precision_type__)               \
-  REGISTER_ANAKIN_OP_CONVERTER_BASE(                                     \
-      op_type__,                                                         \
-      ::paddle::inference::anakin::Converter__<WRAP(                     \
-          ::anakin::saber::X86, ::anakin::Precision::precision_type__)>, \
-      CPU, ::anakin::saber::X86, precision_type__,                       \
-      ::anakin::Precision::precision_type__)
-
-#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE)
-#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__)       \
-  REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
-  REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8); \
-  REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32);  \
-  REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
-#elif defined(PADDLE_WITH_CUDA)
-#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__)       \
-  REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
-  REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
-#endif
-
-#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__)   \
-  extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \
-  int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__   \
-      UNUSED =                                                                 \
-          Touch_anakin_##op_type__##_##place_type__##_##precision_type__();
-
-#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE)
-#define USE_ANAKIN_CONVERTER(op_type__)            \
-  USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) \
-  USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32)
-#define USE_INT8_ANAKIN_CONVERTER(op_type__)       \
-  USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) \
-  USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8)
-#elif defined(PADDLE_WITH_CUDA)
-#define USE_ANAKIN_CONVERTER(op_type__) \
-  USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32)
-#define USE_INT8_ANAKIN_CONVERTER(op_type__) \
-  USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8)
-#endif
diff --git a/paddle/fluid/inference/anakin/convert/pool2d.cc b/paddle/fluid/inference/anakin/convert/pool2d.cc
deleted file mode 100644
index 11e7c717fd689b27a619a33bcac4037b30f97af8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/pool2d.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/pool2d.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void Pool2dOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto x_name = op_desc.Input("X").front();
-  auto y_name = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
-  std::string pool_type =
-      boost::get<std::string>(op_desc.GetAttr("pooling_type"));
-  std::vector<int> ksize =
-      boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
-  std::vector<int> strides =
-      boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-  std::vector<int> paddings =
-      boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-  bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
-  std::string anakin_pool_type;
-  if (pool_type == "max") {
-    anakin_pool_type = "MAX";
-  } else if (pool_type == "avg") {
-    if (paddings[0] || paddings[1]) {
-      anakin_pool_type = "AVGEXC";
-    } else {
-      anakin_pool_type = "AVG";
-    }
-  } else {
-    PADDLE_THROW("TensorRT unsupported pooling type!");
-  }
-
-  this->engine_->AddOp(op_name, "Pooling", {x_name}, {y_name});
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "pool_size", ksize);
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
-  this->engine_->AddOpAttr(op_name, "method", anakin_pool_type);
-  this->engine_->AddOpAttr(op_name, "global_pooling", global_pooling);
-  this->engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(pool2d, Pool2dOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/pool2d.h b/paddle/fluid/inference/anakin/convert/pool2d.h
deleted file mode 100644
index 7a06ff1b660a4ccd28570337b67aff68e7bce6a7..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/pool2d.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class Pool2dOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  Pool2dOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~Pool2dOpConverter() {}
-
- private:
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/relu.cc b/paddle/fluid/inference/anakin/convert/relu.cc
deleted file mode 100644
index 00853406634bdf5c488d21aca8289826f3a93a16..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/relu.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/relu.h"
-#include <algorithm>
-#include <map>
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void ReluOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  auto input_name = op_desc.Input("X").front();
-  auto output_name = op_desc.Output("Out").front();
-
-  this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
-  this->engine_->AddOpAttr(op_name, "alpha", 0);
-}
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void LeakyReluOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  auto input_name = op_desc.Input("X").front();
-  auto output_name = op_desc.Output("Out").front();
-
-  float alpha = boost::get<float>(op_desc.GetAttr("alpha"));
-  this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
-  this->engine_->AddOpAttr(op_name, "alpha", alpha);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(leaky_relu, LeakyReluOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/relu.h b/paddle/fluid/inference/anakin/convert/relu.h
deleted file mode 100644
index f366f05a94ae937c98c72e179a7bf14015a695ea..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/relu.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class ReluOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  ReluOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~ReluOpConverter() {}
-};
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class LeakyReluOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  LeakyReluOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~LeakyReluOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc
deleted file mode 100644
index d73736b7fecc758a2965f2d2afff9a808d6e2adc..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/reshape.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/reshape.h"
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void ReshapeOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
-
-  auto input = op_desc.Input("X").front();
-  auto output = op_desc.Output("Out").front();
-
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  this->engine_->AddOp(op_name, "Reshape", {input}, {output});
-
-  auto shape = boost::get<std::vector<int>>(op_desc.GetAttr("shape"));
-  if (shape.size() < 4) {
-    shape.insert(shape.end(), 4 - shape.size(), 1);
-  }
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", shape);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/reshape.h b/paddle/fluid/inference/anakin/convert/reshape.h
deleted file mode 100644
index 88de2641e60f1a08cf11b1206be28eb516c575f1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/reshape.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class ReshapeOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  ReshapeOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~ReshapeOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/roi_align.cc b/paddle/fluid/inference/anakin/convert/roi_align.cc
deleted file mode 100644
index 8702f638e10bbf72fa43d45e0042c16ffae447f1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/roi_align.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/roi_align.h"
-#include <algorithm>
-#include <map>
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void RoiAlignOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Input("ROIs").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  auto input_x_name = op_desc.Input("X").front();
-  auto input_rois_name = op_desc.Input("ROIs").front();
-  auto output_name = op_desc.Output("Out").front();
-
-  auto spatial_scale = boost::get<float>(op_desc.GetAttr("spatial_scale"));
-  auto pooled_height = boost::get<int>(op_desc.GetAttr("pooled_height"));
-  auto pooled_width = boost::get<int>(op_desc.GetAttr("pooled_width"));
-  auto sampling_ratio = boost::get<int>(op_desc.GetAttr("sampling_ratio"));
-
-  this->engine_->AddOp(op_name, "RoiAlign", {input_x_name, input_rois_name},
-                       {output_name});
-  this->engine_->AddOpAttr(op_name, "spatial_scale", spatial_scale);
-  this->engine_->AddOpAttr(op_name, "pooled_height", pooled_height);
-  this->engine_->AddOpAttr(op_name, "pooled_width", pooled_width);
-  this->engine_->AddOpAttr(op_name, "sampling_ratio", sampling_ratio);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(roi_align, RoiAlignOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/roi_align.h b/paddle/fluid/inference/anakin/convert/roi_align.h
deleted file mode 100644
index 8b5d23a01676f035174aa6fd2d2a79582fc1e2e0..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/roi_align.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class RoiAlignOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  RoiAlignOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~RoiAlignOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/scale.cc b/paddle/fluid/inference/anakin/convert/scale.cc
deleted file mode 100644
index 2559ec498c8ba423bf894b1ec67e24bd2567ff2b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/scale.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/scale.h"
-#include <algorithm>
-#include <map>
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void ScaleOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  auto input_name = op_desc.Input("X").front();
-  auto output_name = op_desc.Output("Out").front();
-  float scale = boost::get<float>(op_desc.GetAttr("scale"));
-  float bias = boost::get<float>(op_desc.GetAttr("bias"));
-  float bias_after_scale =
-      boost::get<bool>(op_desc.GetAttr("bias_after_scale"));
-  PADDLE_ENFORCE(bias_after_scale,
-                 "The anakin scale layer only support bias after scale now.");
-
-  this->engine_->AddOp(op_name, "Power", {input_name}, {output_name});
-  this->engine_->AddOpAttr(op_name, "shift", bias);
-  this->engine_->AddOpAttr(op_name, "scale", scale);
-  this->engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/scale.h b/paddle/fluid/inference/anakin/convert/scale.h
deleted file mode 100644
index f19a92019349718ccd961d2dc2968ad479ff1a3c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/scale.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class ScaleOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  ScaleOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~ScaleOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/shuffle_channel.cc b/paddle/fluid/inference/anakin/convert/shuffle_channel.cc
deleted file mode 100644
index fdd2e3182e34992205d7707b83efbc3c6421076c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/shuffle_channel.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/shuffle_channel.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void ShuffleChannelOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto input = op_desc.Input("X").front();
-  auto output = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  this->engine_->AddOp(op_name, "ShuffleChannel", {input}, {output});
-
-  auto group = boost::get<int>(op_desc.GetAttr("group"));
-  this->engine_->AddOpAttr(op_name, "group", group);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(shuffle_channel, ShuffleChannelOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/shuffle_channel.h b/paddle/fluid/inference/anakin/convert/shuffle_channel.h
deleted file mode 100644
index 457a14865a91bd6cfa763513f01cda72e34186e8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/shuffle_channel.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class ShuffleChannelOpConverter
-    : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  ShuffleChannelOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~ShuffleChannelOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/softmax.cc b/paddle/fluid/inference/anakin/convert/softmax.cc
deleted file mode 100644
index a4dc5a9156b8f54cf8915e2a8829ada22d442ace..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/softmax.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/softmax.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void SoftMaxOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
-
-  auto input = op_desc.Input("X").front();
-  auto output = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  auto input_var_desc = block_desc.FindVar(input);
-  PADDLE_ENFORCE(input_var_desc,
-                 "Cant find %s variable When runing Anakin Softmax converter.",
-                 input);
-  auto input_shape_in_fluid = input_var_desc->GetShape();
-  size_t input_dims = input_shape_in_fluid.size();
-
-  this->engine_->AddOp(op_name, "Softmax", {input}, {output});
-  this->engine_->AddOpAttr(op_name, "axis", static_cast<int>(input_dims - 1));
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(softmax, SoftMaxOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/softmax.h b/paddle/fluid/inference/anakin/convert/softmax.h
deleted file mode 100644
index dc431b5b867a2679050fa5b0128640678f36d210..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/softmax.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class SoftMaxOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  SoftMaxOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~SoftMaxOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/split.cc b/paddle/fluid/inference/anakin/convert/split.cc
deleted file mode 100644
index e63edea94ae010f3bd2240fd21147642f647581e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/split.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/split.h"
-#include <algorithm>
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void SplitOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  auto input_name = op_desc.Input("X").front();
-  auto y_names = op_desc.Output("Out");
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  int axis = boost::get<int>(op_desc.GetAttr("axis"));
-
-  std::vector<int> output_lengths =
-      boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
-
-  int split_num = output_lengths.size();
-  PADDLE_ENFORCE(split_num > 1,
-                 "anakin split op converter: the split num should > 1");
-  int num_sum = 0;
-  std::vector<int> slice_point;
-  for (int i = 0; i < split_num - 1; i++) {
-    num_sum += output_lengths[i];
-    slice_point.push_back(num_sum);
-  }
-  this->engine_->AddOp(op_name, "Slice", {input_name}, y_names);
-  this->engine_->AddOpAttr(op_name, "axis", axis);
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "slice_point",
-                                                 slice_point);
-  // slice_dim is useless in anakin
-  this->engine_->AddOpAttr(op_name, "slice_dim", 4);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(split, SplitOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/split.h b/paddle/fluid/inference/anakin/convert/split.h
deleted file mode 100644
index 819915315d90a500772756d1a21a5454694b9c0a..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/split.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class SplitOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  SplitOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~SplitOpConverter() {}
-
- private:
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/sum.cc b/paddle/fluid/inference/anakin/convert/sum.cc
deleted file mode 100644
index 870c07934090370a05ad5e8a2e68af8f314e25ae..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/sum.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/sum.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void SumOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto input_names = op_desc.Input("X");
-  auto out_name = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-
-  std::vector<float> coeff = {1, 1};
-  std::string elementwise_type = "Add";
-  this->engine_->AddOp(op_name, "Eltwise", input_names, {out_name});
-  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
-  this->engine_->template AddOpAttr<std::string>(op_name, "type",
-                                                 elementwise_type);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(sum, SumOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/sum.h b/paddle/fluid/inference/anakin/convert/sum.h
deleted file mode 100644
index aefc64c623e916ee42604fed771d6985c4dcfd1d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/sum.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class SumOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  SumOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~SumOpConverter() {}
-
- private:
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc b/paddle/fluid/inference/anakin/convert/test_activation_op.cc
deleted file mode 100644
index 5ac8b45882f5175f90db6c5ddb2f41a67ca145e2..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/activation.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-static void test_activation_op(const std::string& op_type,
-                               const platform::DeviceContext& context,
-                               bool use_gpu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("act-X", {10, 6, 1, 1});
-  validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
-  framework::OpDesc desc;
-  desc.SetType(op_type);
-  desc.SetInput("X", {"act-X"});
-  desc.SetOutput("Out", {"act-Out"});
-
-  if (op_type == "swish") {
-    desc.SetAttr("beta", 1.0f);
-  }
-
-  if (op_type == "relu6") {
-    desc.SetAttr("threshold", 6.0f);
-  }
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(5);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(sigm_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_activation_op<::anakin::saber::NV>("sigmoid", ctx, true);
-}
-
-TEST(tanh_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_activation_op<::anakin::saber::NV>("tanh", ctx, true);
-}
-
-TEST(relu6_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_activation_op<::anakin::saber::NV>("relu6", ctx, true);
-}
-
-TEST(swish_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_activation_op<::anakin::saber::NV>("swish", ctx, true);
-}
-#endif
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(sigmoid);
-USE_OP(tanh);
-USE_OP(relu6);
-USE_OP(swish);
-
-USE_ANAKIN_CONVERTER(sigmoid);
-USE_ANAKIN_CONVERTER(tanh);
-USE_ANAKIN_CONVERTER(relu6);
-USE_ANAKIN_CONVERTER(swish);
diff --git a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc
deleted file mode 100644
index 008537dc8a5a82326f243e73fc33ce1dbeb730ef..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/affine_channel.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_affine_channel_op(const platform::DeviceContext& context,
-                            bool use_gpu) {
-  // Declare the difference between the inputs.
-  std::unordered_set<std::string> parameters({"scale", "bias"});
-
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("x", {1, 3, 5, 2});
-  validator.DeclOutputVar("out", {1, 3, 5, 2});
-  validator.DeclParamVar("scale", {3});
-  validator.DeclParamVar("bias", {3});
-
-  // Prepare Op descriptions.
-  framework::OpDesc desc;
-  desc.SetType("affine_channel");
-  desc.SetInput("X", {"x"});
-  desc.SetInput("Bias", {"bias"});
-  desc.SetInput("Scale", {"scale"});
-  desc.SetOutput("Out", {"out"});
-
-  // Layout must be explicitly specified here as NCHW.
-  desc.SetAttr("data_layout", std::string("NCHW"));
-
-  validator.SetOp(*desc.Proto());
-  validator.Execute(1);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(affine_channel_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_affine_channel_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(affine_channel_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_affine_channel_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(affine_channel);
-USE_ANAKIN_CONVERTER(affine_channel);
diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
deleted file mode 100644
index edba90235fac023a1c9712f308b535da9ba39e3a..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_batchnorm_op(const platform::DeviceContext& context, bool use_gpu) {
-  std::unordered_set<std::string> parameters(
-      {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
-       "batch_norm_variance"});
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  std::vector<int> param_shape{2};
-
-  validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5});
-  validator.DeclParamVar("batch_norm_scale", param_shape);
-  validator.DeclParamVar("batch_norm_bias", param_shape);
-  validator.DeclParamVar("batch_norm_mean", param_shape);
-  validator.DeclParamVar("batch_norm_variance", param_shape);
-  validator.DeclOutputVar("batch_norm_Y", {1, 2, 5, 5});
-  validator.DeclOutputVar("batch_norm_save_mean", param_shape);
-  validator.DeclOutputVar("batch_norm_save_variance", param_shape);
-
-  // Prepare Op description
-  framework::OpDesc desc;
-
-  desc.SetType("batch_norm");
-  desc.SetInput("X", {"batch_norm_X"});
-  desc.SetInput("Scale", {"batch_norm_scale"});
-  desc.SetInput("Bias", {"batch_norm_bias"});
-  desc.SetInput("Mean", {"batch_norm_mean"});
-  desc.SetInput("Variance", {"batch_norm_variance"});
-  desc.SetOutput("Y", {"batch_norm_Y"});
-  desc.SetOutput("MeanOut", {"batch_norm_mean"});
-  desc.SetOutput("VarianceOut", {"batch_norm_variance"});
-  desc.SetOutput("SavedMean", {"batch_norm_save_mean"});
-  desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});
-
-  float eps = 1e-5f;
-  bool is_test = true;
-  desc.SetAttr("epsilon", eps);
-  desc.SetAttr("is_test", is_test);
-
-  validator.SetOp(*desc.Proto());
-
-  std::unordered_set<std::string> neglected_output = {
-      "batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean",
-      "batch_norm_variance"};
-  validator.Execute(1, neglected_output);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(batch_norm_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_batchnorm_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(batch_norm_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_batchnorm_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-USE_OP(batch_norm);
-USE_ANAKIN_CONVERTER(batch_norm);
diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc
deleted file mode 100644
index 6870260c865873874072e25721edafeba8dab234..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/concat.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_concat_op(const platform::DeviceContext& context, bool use_gpu) {
-  std::unordered_set<std::string> parameters({""});
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
-  validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
-  validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
-  validator.DeclOutputVar("concat_out", {1, 6, 1, 1});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("concat");
-  desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
-  desc.SetOutput("Out", {"concat_out"});
-
-  int axis = 1;
-  desc.SetAttr("axis", axis);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(1);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(concat_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_concat_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(concat_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_concat_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-USE_OP(concat);
-USE_ANAKIN_CONVERTER(concat);
diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
deleted file mode 100644
index 723a348b12e3b451b047514838a68e56238956a2..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/conv2d.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_conv2d_op(const platform::DeviceContext& context, bool use_gpu) {
-  std::unordered_set<std::string> parameters({"conv2d-Y"});
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
-  validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
-  validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("conv2d");
-  desc.SetInput("Input", {"conv2d-X"});
-  desc.SetInput("Filter", {"conv2d-Y"});
-  desc.SetOutput("Output", {"conv2d-Out"});
-
-  const std::vector<int> strides({1, 1});
-  const std::vector<int> paddings({0, 0});
-  const std::vector<int> dilations({1, 1});
-  const int groups = 1;
-
-  desc.SetAttr("strides", strides);
-  desc.SetAttr("paddings", paddings);
-  desc.SetAttr("dilations", dilations);
-  desc.SetAttr("groups", groups);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(3);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(conv2d_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_conv2d_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(conv2d_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_conv2d_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(conv2d);
-USE_ANAKIN_CONVERTER(conv2d);
diff --git a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc
deleted file mode 100644
index 83792676a00440257d836c3fb50e7d685f5d110a..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/dropout.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_dropout_op(const platform::DeviceContext& context, bool use_gpu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("x", {1, 1, 2, 2});
-  validator.DeclOutputVar("out", {1, 1, 2, 2});
-  validator.DeclOutputVar("mask", {1, 1, 2, 2});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("dropout");
-  desc.SetInput("X", {"x"});
-  desc.SetOutput("Out", {"out"});
-  desc.SetOutput("Mask", {"mask"});
-
-  float dropout_prob = 0.5;
-  desc.SetAttr("dropout_prob", dropout_prob);
-  desc.SetAttr("is_test", true);
-
-  validator.SetOp(*desc.Proto());
-  std::unordered_set<std::string> neglected_output = {"mask"};
-  validator.Execute(1, neglected_output);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(dropout_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_dropout_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(dropout_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_dropout_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(dropout);
-USE_ANAKIN_CONVERTER(dropout);
diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
deleted file mode 100644
index ee128c1ec9ad62998310e7faaef962fa251cca7f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/elementwise.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-static void test_elementwise_op(const std::string& op_type,
-                                const platform::DeviceContext& context,
-                                bool use_gpu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("x", {1, 1, 2, 2});
-  validator.DeclInputVar("y", {1, 1, 2, 2});
-  validator.DeclOutputVar("out", {1, 1, 2, 2});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType(op_type);
-  desc.SetInput("X", {"x"});
-  desc.SetInput("Y", {"y"});
-  desc.SetOutput("Out", {"out"});
-
-  int axis = -1;
-  desc.SetAttr("axis", axis);
-
-  validator.SetOp(*desc.Proto());
-  validator.Execute(1);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(elementwise_op, native_add_gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_elementwise_op<::anakin::saber::NV>("elementwise_add", ctx, true);
-}
-TEST(elementwise_op, native_mul_gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_elementwise_op<::anakin::saber::NV>("elementwise_mul", ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(elementwise_op, native_add_cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_elementwise_op<::anakin::saber::X86>("elementwise_add", ctx, false);
-}
-TEST(elementwise_op, native_mul_cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_elementwise_op<::anakin::saber::X86>("elementwise_mul", ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(elementwise_add);
-USE_OP(elementwise_mul);
-USE_ANAKIN_CONVERTER(elementwise_add);
-USE_ANAKIN_CONVERTER(elementwise_mul);
diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc
deleted file mode 100644
index 3e68d8fed6a66423d5fc4c271445a41207417253..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_mul_op(const platform::DeviceContext& context, bool use_gpu) {
-  std::unordered_set<std::string> parameters({"mul_y"});
-  framework::Scope scope;
-
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("mul_x", {1, 1, 2, 2});
-  validator.DeclParamVar("mul_y", {4, 2});
-  validator.DeclOutputVar("mul_out", {1, 2});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("mul");
-  desc.SetInput("X", {"mul_x"});
-  desc.SetInput("Y", {"mul_y"});
-  desc.SetOutput("Out", {"mul_out"});
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(10);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(mul_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_mul_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(mul_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_mul_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(mul);
-USE_ANAKIN_CONVERTER(fc);
diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
deleted file mode 100644
index 5e4cfdabfd7ca4dfc865ba3722030c5dbd44d036..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_flatten_op(const platform::DeviceContext& context, bool use_gpu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
-  validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
-  framework::OpDesc desc;
-  desc.SetType("flatten");
-  desc.SetInput("X", {"flatten-X"});
-  desc.SetOutput("Out", {"flatten-Out"});
-  desc.SetAttr("axis", 1);
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(5);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(flatten_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_flatten_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(flatten_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_flatten_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(reshape);
-USE_OP_ITSELF(flatten);
-USE_ANAKIN_CONVERTER(flatten);
diff --git a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc
deleted file mode 100644
index 5e5764633125c867e27b0b52e0e6ef18714653b2..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/im2sequence.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-TEST(im2sequence_op, native) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
-
-  std::vector<int> kernels = {6, 1};
-  std::vector<int> strides = {1, 1};
-  std::vector<int> paddings = {0, 0, 0, 0};
-
-  validator.DeclInputVar("x", {1, 1, 2, 2});
-  validator.DeclOutputVar("out", {1, 1 * kernels[0] * kernels[1]});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("im2sequence");
-  desc.SetInput("X", {"x"});
-  desc.SetOutput("Out", {"out"});
-
-  desc.SetAttr("kernels", kernels);
-  desc.SetAttr("strides", strides);
-  desc.SetAttr("paddings", paddings);
-
-  validator.SetOp(*desc.Proto());
-  validator.Execute(1);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(im2sequence);
-USE_ANAKIN_CONVERTER(im2sequence);
diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
deleted file mode 100644
index 9b23b5b93df16ae833fda891dc89c8dfe98cddcb..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_pool2d(const platform::DeviceContext& context, bool use_gpu,
-                 bool global_pooling, bool ceil_mode,
-                 std::string pool_type = "max") {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-
-  // The ITensor's Dims should not contain the batch size.
-  // So, the ITensor's Dims of input and output should be C * H * W.
-  validator.DeclInputVar("pool2d_x", {1, 3, 6, 7});
-  if (global_pooling)
-    validator.DeclOutputVar("pool2d_out", {1, 3, 1, 1});
-  else if (ceil_mode)
-    validator.DeclOutputVar("pool2d_out", {1, 3, 3, 4});
-  else
-    validator.DeclOutputVar("pool2d_out", {1, 3, 3, 3});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("pool2d");
-  desc.SetInput("X", {"pool2d_x"});
-  desc.SetOutput("Out", {"pool2d_out"});
-
-  std::vector<int> ksize({2, 2});
-  std::vector<int> strides({2, 2});
-  std::vector<int> paddings({0, 0});
-  std::string pooling_t = pool_type;
-
-  desc.SetAttr("pooling_type", pooling_t);
-  desc.SetAttr("ksize", ksize);
-  desc.SetAttr("strides", strides);
-  desc.SetAttr("paddings", paddings);
-  desc.SetAttr("global_pooling", global_pooling);
-  desc.SetAttr("ceil_mode", ceil_mode);
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(1);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(Pool2dOpConverter, normal) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_pool2d<::anakin::saber::NV>(ctx, true, false, false);
-}
-TEST(Pool2dOpConverter, test_global_pooling) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_pool2d<::anakin::saber::NV>(ctx, true, true, false);
-}
-
-TEST(Pool2dOpConverter, max_ceil_test) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_pool2d<::anakin::saber::NV>(ctx, true, false, true);
-}
-
-TEST(Pool2dOpConverter, avg_ceil_test) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_pool2d<::anakin::saber::NV>(ctx, true, false, true, "avg");
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(Pool2dOpConverter, normal_cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_pool2d<::anakin::saber::X86>(ctx, false, false, false);
-}
-TEST(Pool2dOpConverter, test_global_pooling_cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_pool2d<::anakin::saber::X86>(ctx, false, true, false);
-}
-
-TEST(Pool2dOpConverter, max_ceil_test_cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_pool2d<::anakin::saber::X86>(ctx, false, false, true);
-}
-
-TEST(Pool2dOpConverter, avg_ceil_test_cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_pool2d<::anakin::saber::X86>(ctx, false, false, true, "avg");
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(pool2d);
-USE_ANAKIN_CONVERTER(pool2d);
diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc
deleted file mode 100644
index eb6429f3383d2848a8b512009ada78d578dab919..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/relu.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-static void test_activation_op(const std::string& op_type,
-                               const platform::DeviceContext& context,
-                               bool use_gpu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("act-X", {10, 6, 1, 1});
-  validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
-  framework::OpDesc desc;
-  desc.SetType(op_type);
-  desc.SetInput("X", {"act-X"});
-  desc.SetOutput("Out", {"act-Out"});
-  if (op_type == "leaky_relu") {
-    desc.SetAttr("alpha", 0.1f);
-  }
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(5);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(relu_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_activation_op<::anakin::saber::NV>("relu", ctx, true);
-}
-
-TEST(leaky_relu_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_activation_op<::anakin::saber::NV>("leaky_relu", ctx, true);
-}
-#endif
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(relu);
-USE_OP(leaky_relu);
-USE_ANAKIN_CONVERTER(relu);
-USE_ANAKIN_CONVERTER(leaky_relu);
diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
deleted file mode 100644
index b1be42e542ce06cb1ea88af8db71f2dfcec8123b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_reshape1_op(const platform::DeviceContext& context, bool use_gpu) {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-
-  // validator.DeclInputVar("reshape-X", {2, 3, 3, 1});
-  // validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3});
-  validator.DeclInputVar("reshape-X", {1, 2, 4, 1});
-  validator.DeclOutputVar("reshape-Out", {1, 8, 1, 1});
-
-  framework::OpDesc desc;
-  desc.SetType("reshape");
-  desc.SetInput("X", {"reshape-X"});
-  desc.SetOutput("Out", {"reshape-Out"});
-  // desc.SetAttr("shape", std::vector<int>({3, 2, 1, 3}));
-  desc.SetAttr("shape", std::vector<int>({1, 8, 1, 1}));
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-  validator.Execute(1);
-}
-
-template <typename TargetT>
-void test_reshape2_op(const platform::DeviceContext& context, bool use_gpu) {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-
-  validator.DeclInputVar("reshape-X", {1, 2, 4});
-  validator.DeclOutputVar("reshape-Out", {1, 4, 2});
-
-  framework::OpDesc desc;
-  desc.SetType("reshape");
-  desc.SetInput("X", {"reshape-X"});
-  desc.SetOutput("Out", {"reshape-Out"});
-  // desc.SetAttr("shape", std::vector<int>({3, 2, 1, 3}));
-  desc.SetAttr("shape", std::vector<int>({0, -1, 2}));
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-  validator.Execute(1);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(reshape1_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_reshape1_op<::anakin::saber::NV>(ctx, true);
-}
-
-TEST(reshape2_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_reshape2_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(reshape1_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_reshape2_op<::anakin::saber::X86>(ctx, false);
-}
-
-TEST(reshape2_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_reshape2_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(reshape);
-USE_ANAKIN_CONVERTER(reshape);
diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
deleted file mode 100644
index 1a324739d98534d3b5443cd5f2c2f57f7045543e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_softmax_op(const platform::DeviceContext& context, bool use_gpu) {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-
-  validator.DeclInputVar("softmax-X", {1, 10, 2});
-  validator.DeclOutputVar("softmax-Out", {1, 10, 2});
-
-  framework::OpDesc desc;
-  desc.SetType("softmax");
-  desc.SetInput("X", {"softmax-X"});
-  desc.SetOutput("Out", {"softmax-Out"});
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-  validator.Execute(1);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(softmax_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_softmax_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(relu_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_softmax_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(softmax);
-USE_ANAKIN_CONVERTER(softmax);
diff --git a/paddle/fluid/inference/anakin/convert/test_split_op.cc b/paddle/fluid/inference/anakin/convert/test_split_op.cc
deleted file mode 100644
index f9ef54fdcacecd7540becb5b8aff997d40c5872d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_split_op.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/split.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, int Axis>
-void AnakinSliceTest(const platform::DeviceContext &context, bool use_gpu,
-                     const std::vector<int> &in_shape,
-                     const std::vector<int> &sections) {
-  std::unordered_set<std::string> parameters({""});
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-
-  validator.DeclInputVar("split_input", in_shape);
-  std::vector<std::string> output_vars;
-  for (size_t i = 0; i < sections.size(); ++i) {
-    auto out_shape = in_shape;
-    out_shape[Axis] = sections[i];
-    std::string output_name = "split_out" + std::to_string(i);
-    validator.DeclOutputVar(output_name, out_shape);
-    output_vars.push_back(output_name);
-  }
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("split");
-  desc.SetInput("X", {"split_input"});
-  desc.SetOutput("Out", output_vars);
-
-  desc.SetAttr("axis", Axis);
-  desc.SetAttr("num", 0);
-  desc.SetAttr("sections", sections);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(1);
-}
-
-// batch = 0, axis = 1, same shape
-TEST(split_op, test_same_shape_axis1_batch1) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 4, 2, 2}, {2, 2});
-}
-// batch = 0, axis = 1, different shape
-TEST(split_op, test_different_shape_axis1_batch1) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 3, 2, 2}, {2, 1});
-}
-// batch = 0, axis = 2, same shape
-TEST(split_op, test_same_shape_axis2_batch1) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 4, 2}, {2, 2});
-}
-// batch = 0, axis = 2, different shape
-TEST(split_op, test_different_shape_axis2_batch1) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 3, 2}, {2, 1});
-}
-
-// batch = 0, axis = 3, same shape
-TEST(split_op, test_same_shape_axis3_batch1) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 4}, {2, 2});
-}
-// batch = 0, axis = 3, different shape
-TEST(split_op, test_different_shape_axis3_batch1) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 3}, {2, 1});
-}
-#ifdef ANAKIN_X86_PLACE
-TEST(split_op, test_different_shape_axis1_batch1_cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  AnakinSliceTest<::anakin::saber::X86, 1>(ctx, false, {1, 3, 2, 3}, {2, 1});
-}
-
-TEST(split_op, test_different_shape_axis2_batch1_cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  AnakinSliceTest<::anakin::saber::X86, 2>(ctx, false, {1, 3, 4, 2}, {2, 2});
-}
-
-TEST(split_op, test_different_shape_axis3_batch1_cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  AnakinSliceTest<::anakin::saber::X86, 3>(ctx, false, {1, 3, 2, 4}, {2, 2});
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(split);
-USE_ANAKIN_CONVERTER(split);
diff --git a/paddle/fluid/inference/anakin/convert/test_sum_op.cc b/paddle/fluid/inference/anakin/convert/test_sum_op.cc
deleted file mode 100644
index 9d26430ea68c5b818b96365e36381a088c3725f6..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_sum_op.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/sum.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-#include "paddle/fluid/operators/sum_op.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-static void test_sum_op(const platform::DeviceContext& context, bool use_gpu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("sum_x1", {1, 2, 1, 2});
-  validator.DeclInputVar("sum_x2", {1, 2, 1, 2});
-  validator.DeclOutputVar("sum_out", {1, 2, 1, 2});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("sum");
-  desc.SetInput("X", {"sum_x1", "sum_x2"});
-  desc.SetOutput("Out", {"sum_out"});
-
-  validator.SetOp(*desc.Proto());
-  validator.Execute(1);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(sum_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_sum_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(sum_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_sum_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(sum);
-USE_ANAKIN_CONVERTER(sum);
diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
deleted file mode 100644
index 466e2f1a49f21b30973553ae6cd7bd4f0864def3..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT>
-void test_transpose1_op(const platform::DeviceContext& context, bool use_gpu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("transpose-X", {2, 3, 4, 5});
-  validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("transpose");
-  desc.SetInput("X", {"transpose-X"});
-  desc.SetOutput("Out", {"transpose-Out"});
-  desc.SetAttr("axis", std::vector<int>({2, 0, 3, 1}));
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(3);
-}
-
-template <typename TargetT>
-void test_transpose2_op(const platform::DeviceContext& context, bool use_gpu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
-      parameters, &scope, context, use_gpu);
-  validator.DeclInputVar("transpose-X", {3, 4, 5});
-  validator.DeclOutputVar("transpose-Out", {3, 5, 4});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("transpose");
-  desc.SetInput("X", {"transpose-X"});
-  desc.SetOutput("Out", {"transpose-Out"});
-  desc.SetAttr("axis", std::vector<int>({0, 2, 1}));
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(1);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(transpose1_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_transpose1_op<::anakin::saber::NV>(ctx, true);
-}
-
-TEST(transpose2_op, gpu) {
-  platform::CUDAPlace gpu_place(0);
-  platform::CUDADeviceContext ctx(gpu_place);
-  test_transpose2_op<::anakin::saber::NV>(ctx, true);
-}
-#endif
-#ifdef ANAKIN_X86_PLACE
-TEST(transpose1_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_transpose2_op<::anakin::saber::X86>(ctx, false);
-}
-
-TEST(transpose2_op, cpu) {
-  platform::CPUPlace cpu_place;
-  platform::CPUDeviceContext ctx(cpu_place);
-  test_transpose2_op<::anakin::saber::X86>(ctx, false);
-}
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(transpose);
-USE_ANAKIN_CONVERTER(transpose);
diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc
deleted file mode 100644
index 28071ca8449cdd61799011308a992eacd51dfd38..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/transpose.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/convert/transpose.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-
-using anakin::PTuple;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-void TransposeOpConverter<TargetT, PrecisionT>::operator()(
-    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
-    const framework::Scope &scope, bool test_mode) {
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-  auto input = op_desc.Input("X").front();
-  auto output = op_desc.Output("Out").front();
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  this->engine_->AddOp(op_name, "Permute", {input}, {output});
-
-  auto axis = boost::get<std::vector<int>>(op_desc.GetAttr("axis"));
-  size_t axis_size = axis.size();
-  while (axis.size() < 4) {
-    axis.push_back(axis_size);
-    axis_size += 1;
-  }
-  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", axis);
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/transpose.h b/paddle/fluid/inference/anakin/convert/transpose.h
deleted file mode 100644
index b7b0a0f209e7d6402ad7d5e30d23cf0523d3cf90..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/transpose.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class TransposeOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
- public:
-  TransposeOpConverter() = default;
-
-  virtual void operator()(const framework::proto::OpDesc &op,
-                          const framework::BlockDesc &block_desc,
-                          const framework::Scope &scope,
-                          bool test_mode) override;
-  virtual ~TransposeOpConverter() {}
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h
deleted file mode 100644
index f2b56a2569f584ea8751c4fedcf28b02775147f9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <gtest/gtest.h>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/engine.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/enforce.h"
-
-using anakin::Precision;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-/*
- * Get a random float value between [low, high]
- */
-float random(float low, float high) {
-  static std::random_device rd;
-  static std::mt19937 mt(rd());
-  std::uniform_real_distribution<double> dist(low, high);
-  return dist(mt);
-}
-
-void RandomizeTensor(framework::LoDTensor* tensor,
-                     const platform::Place& place) {
-  auto dims = tensor->dims();
-  size_t num_elements = analysis::AccuDims(dims, dims.size());
-  PADDLE_ENFORCE_GT(num_elements, 0);
-
-  platform::CPUPlace cpu_place;
-  framework::LoDTensor temp_tensor;
-  temp_tensor.Resize(dims);
-  auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
-
-  for (size_t i = 0; i < num_elements; i++) {
-    *(temp_data + i) = random(0., 1.);
-  }
-
-  TensorCopySync(temp_tensor, place, tensor);
-}
-
-/*
- * Help to validate the correctness between Fluid Op and the corresponding
- * anakin
- * layer.
- */
-template <typename TargetT, ::anakin::Precision PrecisionT>
-class AnakinConvertValidation {
-  using AnakinNvEngineT = AnakinEngine<TargetT, PrecisionT>;
-
- public:
-  AnakinConvertValidation() = delete;
-
-  AnakinConvertValidation(const std::unordered_set<std::string>& parameters,
-                          framework::Scope* scope,
-                          const platform::DeviceContext& ctx,
-                          bool use_gpu = true)
-      : parameters_(parameters), scope_(scope), ctx_(ctx), use_gpu_(use_gpu) {
-    engine_.reset(new AnakinEngine<TargetT, PrecisionT>(true));
-  }
-
-  // Declare a Variable as input with random initialization.
-  void DeclInputVar(const std::string& name,
-                    const std::vector<int> tensor_dims) {
-    DeclVar(name, tensor_dims);
-    // should decalre anakin input here.
-  }
-
-  void DeclParamVar(const std::string& name, const std::vector<int> dim_vec) {
-    DeclVar(name, dim_vec);
-  }
-
-  void DeclOutputVar(const std::string& name, const std::vector<int> dim_vec) {
-    DeclVar(name, dim_vec);
-    // should declare anakin output here.
-  }
-
-  void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
-    auto* x = scope_->Var(name);
-    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
-    x_tensor->Resize(framework::make_ddim(dim_vec));
-    RandomizeTensor(x_tensor, ctx_.GetPlace());
-
-    std::vector<int64_t> dim_vec_int64;
-    for (auto& ele : dim_vec) {
-      dim_vec_int64.push_back(static_cast<int64_t>(ele));
-    }
-
-    // Add var_desc to block_desc
-    auto* block_desc = program_desc_.MutableBlock(framework::kRootBlockIndex);
-
-    auto* var_desc = block_desc->Var(name);
-    var_desc->SetShape(dim_vec_int64);
-  }
-
-  void SetOp(const framework::proto::OpDesc& desc) {
-    op_ = framework::OpRegistry::CreateOp(desc);
-    op_desc_.reset(new framework::OpDesc(desc, nullptr));
-    // should init anakin engine here.
-
-    auto& block_desc = program_desc_.Block(framework::kRootBlockIndex);
-    Singleton<AnakinOpConverter<TargetT, PrecisionT>>::Global().ConvertOp(
-        desc, block_desc, parameters_, *scope_, engine_.get(),
-        true /*test_mode*/);
-    engine_->Freeze();
-
-    std::map<std::string, std::vector<int>> temp_max_input_shape;
-    for (const auto& input : op_desc_->InputArgumentNames()) {
-      if (parameters_.count(input)) continue;
-      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(*scope_,
-                                                                        input);
-      auto t_shape = framework::vectorize<int>(t.dims());
-      while (t_shape.size() < 4) {
-        t_shape.push_back(1);
-      }
-      engine_->SetInputShape(input, t_shape);
-      temp_max_input_shape[input] = t_shape;
-    }
-    engine_->SetMaxInputShape(temp_max_input_shape);
-    engine_->Optimize();
-    engine_->InitNet();
-  }
-
-  // We use the set 'neglected_output' here, because some Ops like batch norm,
-  // the outputs specified in the op des are only used during training,
-  // so we should neglect those output during inference.
-  void Execute(int batch_size,
-               std::unordered_set<std::string> neglected_output = {}) {
-    // Execute Fluid Op
-    op_->Run(*scope_, ctx_.GetPlace());
-
-    std::map<std::string, framework::LoDTensor*> inputs;
-    for (const auto& input : op_desc_->InputArgumentNames()) {
-      if (parameters_.count(input)) continue;
-      auto* var = scope_->FindVar(input);
-      auto tensor = var->GetMutable<framework::LoDTensor>();
-      inputs.insert({input, tensor});
-    }
-
-    std::map<std::string, framework::LoDTensor*> outputs;
-    std::vector<std::vector<float>> fluid_outputs;
-    for (const auto& output : op_desc_->OutputArgumentNames()) {
-      if (neglected_output.count(output)) continue;
-      std::vector<float> fluid_out;
-      auto* var = scope_->FindVar(output);
-      auto tensor = var->GetMutable<framework::LoDTensor>();
-      framework::TensorToVector(*tensor, ctx_, &fluid_out);
-      fluid_outputs.push_back(fluid_out);
-
-      outputs.insert({output, tensor});
-    }
-
-    if (!use_gpu_) {
-      engine_->Execute(inputs, outputs);
-    } else {
-      cudaStream_t stream;
-      PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream), 0);
-      engine_->Execute(inputs, outputs, stream);
-    }
-
-    int i_output = 0;
-    for (const auto& output : op_desc_->OutputArgumentNames()) {
-      if (neglected_output.count(output)) continue;
-      std::vector<float> anakin_out;
-      auto* var = scope_->FindVar(output);
-      auto tensor = var->GetMutable<framework::LoDTensor>();
-      framework::TensorToVector(*tensor, ctx_, &anakin_out);
-
-      size_t anakin_out_size = anakin_out.size();
-      auto fluid_out = fluid_outputs[i_output++];
-      for (size_t i = 0; i < anakin_out_size; i++) {
-        EXPECT_LT(std::abs(fluid_out[i] - anakin_out[i]), 1e-3);
-      }
-    }
-  }
-
- private:
-  std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
-  std::unique_ptr<framework::OperatorBase> op_;
-  std::unique_ptr<framework::OpDesc> op_desc_;
-  framework::ProgramDesc program_desc_;
-  const std::unordered_set<std::string>& parameters_;
-  framework::Scope* scope_;
-  const platform::DeviceContext& ctx_;
-  bool use_gpu_{true};
-};
-
-template class AnakinConvertValidation<::anakin::saber::NV,
-                                       ::anakin::Precision::FP32>;
-template class AnakinConvertValidation<::anakin::saber::NV,
-                                       ::anakin::Precision::INT8>;
-#ifdef ANAKIN_X86_PLACE
-template class AnakinConvertValidation<::anakin::saber::X86,
-                                       ::anakin::Precision::FP32>;
-template class AnakinConvertValidation<::anakin::saber::X86,
-                                       ::anakin::Precision::INT8>;
-#endif
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc
deleted file mode 100644
index d57952db0d36051a219391868f19a55079b8d7bf..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/engine.cc
+++ /dev/null
@@ -1,207 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/engine.h"
-#include <algorithm>
-#include <cstring>
-#include <map>
-#include <utility>
-#include "paddle/fluid/framework/ddim.h"
-
-using anakin::Precision;
-using anakin::OpRunType;
-using paddle::framework::LoDTensor;
-template <typename T, Precision P, OpRunType O>
-using AnakinNetT = anakin::Net<T, P, O>;
-
-template <typename T, Precision P>
-using AnakinGraphT = anakin::graph::Graph<T, P>;
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-extern std::once_flag
-    AnakinEngine<TargetT, PrecisionType, RunType>::init_anakin_;
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(
-    bool need_summary, int device, int max_batch_size,
-    std::map<std::string, std::vector<int>> max_input_shape,
-    std::vector<std::string> program_inputs, bool auto_config_layout)
-    : device_(device),
-      max_batch_size_(max_batch_size),
-      max_input_shape_(max_input_shape),
-      program_inputs_(program_inputs),
-      auto_config_layout_(auto_config_layout) {
-  ::anakin::TargetWrapper<TargetT>::set_device(device_);
-  std::call_once(init_anakin_,
-                 [this]() { ::anakin::Env<TargetT>::env_init(); });
-  graph_.reset(new AnakinGraphT<TargetT, PrecisionType>());
-  net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary));
-}
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-AnakinEngine<TargetT, PrecisionType, RunType>::~AnakinEngine() {}
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::SetInputShape(
-    const std::string &name, std::vector<int> shape) {
-  graph_->AddOpAttr<::anakin::PTuple<int>>(name, "input_shape",
-                                           std::move(shape));
-}
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::InitNet() {
-  net_->init(*graph_, auto_config_layout_);
-}
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
-    const std::string &name, const std::string &type,
-    const std::vector<std::string> &inputs,
-    const std::vector<std::string> &outputs) {
-  PADDLE_ENFORCE(graph_->AddOp(name, type, inputs, outputs), "Add operation.");
-}
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::BindInput(
-    const std::map<std::string, framework::LoDTensor *> &inputs) {
-#ifdef PADDLE_WITH_CUDA
-  cudaDeviceSynchronize();
-#endif
-  for (const auto &input : inputs) {
-    auto *tensor = input.second;
-    auto *data = tensor->data<float>();
-
-    auto fluid_input_shape = framework::vectorize<int>(tensor->dims());
-    while (fluid_input_shape.size() < 4) {
-      fluid_input_shape.push_back(1);
-    }
-    auto *anakin_input = net_->get_in(input.first);
-    std::vector<int> max_input_shape = max_input_shape_[input.first];
-    int max_shape_sum =
-        std::accumulate(max_input_shape.begin(), max_input_shape.end(), 1,
-                        std::multiplies<int>());
-    if (tensor->numel() > max_shape_sum) {
-      PADDLE_ENFORCE(std::find(program_inputs_.begin(), program_inputs_.end(),
-                               input.first) == program_inputs_.end(),
-                     "The anakin input max shape should be greater than"
-                     " or equal to the real input shape, Please set the max "
-                     "input shape using EnableAnakinEngine");
-      VLOG(3) << "Anakin Net will be reset because of the inputs out of range: "
-              << input.first;
-      graph_->Reshape(input.first, fluid_input_shape);
-      net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(true));
-      net_->init(*graph_);
-      anakin_input = net_->get_in(input.first);
-    }
-    anakin_input->reshape(fluid_input_shape);
-    ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), device_,
-                                                       fluid_input_shape);
-    anakin_input->copy_from(tmp_anakin_tensor);
-  }
-}
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
-    const std::map<std::string, framework::LoDTensor *> &inputs,
-    const std::map<std::string, framework::LoDTensor *> &outputs) {
-  BindInput(inputs);
-  net_->prediction();
-  for (const auto &output : outputs) {
-    platform::CPUPlace cpu_place;
-    auto *tensor = output.second;
-    auto *anakin_output = net_->get_out(output.first);
-    auto *anakin_data = anakin_output->data();
-    auto anakin_output_shape = anakin_output->valid_shape();
-    tensor->Resize(framework::make_ddim(anakin_output_shape));
-    auto *fluid_data = tensor->mutable_data<float>(cpu_place);
-    memory::Copy(cpu_place, static_cast<void *>(fluid_data), cpu_place,
-                 static_cast<void *>(anakin_data),
-                 tensor->numel() * sizeof(float));
-  }
-}
-
-#ifdef PADDLE_WITH_CUDA
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
-    const std::map<std::string, framework::LoDTensor *> &inputs,
-    const std::map<std::string, framework::LoDTensor *> &outputs,
-    cudaStream_t stream) {
-  BindInput(inputs);
-  net_->prediction();
-  cudaDeviceSynchronize();
-  for (const auto &output : outputs) {
-    platform::CUDAPlace gpu_place(device_);
-    auto *tensor = output.second;
-    auto *anakin_output = net_->get_out(output.first);
-    auto *anakin_data = anakin_output->data();
-    auto anakin_output_shape = anakin_output->valid_shape();
-    tensor->Resize(framework::make_ddim(anakin_output_shape));
-    auto *fluid_data = tensor->mutable_data<float>(gpu_place);
-    memory::Copy(gpu_place, static_cast<void *>(fluid_data), gpu_place,
-                 static_cast<void *>(anakin_data),
-                 tensor->numel() * sizeof(float), stream);
-  }
-  cudaDeviceSynchronize();
-}
-#endif
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() {
-  PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph.");
-}
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::Optimize() {
-  PADDLE_ENFORCE(graph_->Optimize(), "Graph optimization.");
-}
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::RegistBlock(
-    ::anakin::PBlock<TargetT> *block_p) {
-  PADDLE_ENFORCE(graph_->RegistBlock(block_p), "Block register.");
-}
-
-template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-std::unique_ptr<AnakinEngine<TargetT, PrecisionType, RunType>>
-AnakinEngine<TargetT, PrecisionType, RunType>::Clone() {
-  auto *engine = new AnakinEngine();
-  engine->net_ = std::move(net_->Clone());
-  return std::unique_ptr<AnakinEngine>(engine);
-}
-
-#ifdef PADDLE_WITH_CUDA
-template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>;
-template class AnakinEngineManager<::anakin::saber::NV,
-                                   ::anakin::Precision::FP32>;
-
-template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>;
-template class AnakinEngineManager<::anakin::saber::NV,
-                                   ::anakin::Precision::INT8>;
-#endif
-#ifdef ANAKIN_X86_PLACE
-template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
-template class AnakinEngineManager<::anakin::saber::X86,
-                                   ::anakin::Precision::FP32>;
-template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>;
-template class AnakinEngineManager<::anakin::saber::X86,
-                                   ::anakin::Precision::INT8>;
-#endif
-// template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h
deleted file mode 100644
index e62bb82fd12405fcb93b16310f9197e7c5fd63b5..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/engine.h
+++ /dev/null
@@ -1,168 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <functional>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/engine.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-#ifdef EXIT  // NOLINT
-#undef EXIT  // NOLINT
-#endif       // NOLINT
-#include "framework/core/net/net.h"
-#include "framework/core/types.h"
-#include "framework/graph/graph.h"
-#include "framework/graph/graph_global_mem.h"
-#include "saber/saber_types.h"
-
-using anakin::Precision;
-
-namespace anakin {
-
-template <typename, Precision, OpRunType>
-class Net;
-
-namespace graph {
-template <typename, Precision>
-class Graph;
-}  // namespace graph
-}  // namespace anakin
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-template <typename TargetT, ::anakin::Precision PrecisionType,
-          ::anakin::OpRunType RunType = ::anakin::OpRunType::ASYNC>
-class AnakinEngine {
-  using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>;
-  using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
-
- public:
-  explicit AnakinEngine(
-      bool need_summary = false, int device = 0, int max_batch_size = 1,
-      std::map<std::string, std::vector<int>> max_input_shape = {},
-      std::vector<std::string> program_inputs = {},
-      bool auto_config_layout = false);
-  ~AnakinEngine();
-  void InitNet();
-  void SetInputShape(const std::string &name, std::vector<int> shape);
-  void AddOp(const std::string &name, const std::string &type,
-             const std::vector<std::string> &inputs,
-             const std::vector<std::string> &outputs);
-
-  template <typename T>
-  void AddOpAttr(const std::string &op_name, const std::string &attr_name,
-                 const T &attr_value) {
-    PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value),
-                   "Add operation's attribution.");
-  }
-  NetT *Net() { return net_.get(); }
-  GraphT *Graph() { return graph_.get(); }
-  std::unique_ptr<AnakinEngine> Clone();
-  const std::map<std::string, std::vector<int>> &GetMaxInputShape() {
-    return max_input_shape_;
-  }
-  void SetMaxInputShape(std::map<std::string, std::vector<int>> shape) {
-    max_input_shape_ = shape;
-  }
-  const std::vector<std::string> &GetScalableInputs() {
-    return program_inputs_;
-  }
-  void SetScalableInputs(std::vector<std::string> program_inputs) {
-    program_inputs_ = program_inputs;
-  }
-  int GetMaxBatchSize() { return max_batch_size_; }
-  void Freeze();
-  void Optimize();
-  void RegistBlock(::anakin::PBlock<TargetT> *block_p);
-  void Save(std::string path) { graph_->save(path); }
-  bool IsInit() { return initialized_; }
-  int GetDevice() { return device_; }
-  void AddTensorScale(const std::string &tensor_name, float scale) {
-    tensor_scales_[tensor_name] = scale;
-  }
-  std::unordered_map<std::string, float> GetTensorScales() {
-    return tensor_scales_;
-  }
-  void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
-               const std::map<std::string, framework::LoDTensor *> &outputs);
-#ifdef PADDLE_WITH_CUDA
-  void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
-               const std::map<std::string, framework::LoDTensor *> &outputs,
-               cudaStream_t stream);
-#endif
-
- private:
-  void BindInput(const std::map<std::string, framework::LoDTensor *> &inputs);
-
- private:
-  bool initialized_{false};
-  int device_;
-  int max_batch_size_;
-  std::map<std::string, std::vector<int>> max_input_shape_;
-  std::vector<std::string> program_inputs_;
-  std::unique_ptr<GraphT> graph_;
-  std::unique_ptr<NetT> net_;
-  static std::once_flag init_anakin_;
-  std::unordered_map<std::string, float> tensor_scales_;
-  // Always be false in gpu mode but true in most cpu cases.
-  bool auto_config_layout_;
-};
-
-template <typename TargetT, ::anakin::Precision PrecisionType>
-class AnakinEngineManager {
-  using AnakinEngineT = AnakinEngine<TargetT, PrecisionType>;
-
- public:
-  bool HasEngine(const std::string &name) const {
-    if (engines_.count(name) == 0) return false;
-    return engines_.at(name).get() != nullptr;
-  }
-  AnakinEngineT *Get(const std::string &name) const {
-    return engines_.at(name).get();
-  }
-
-  AnakinEngineT *Create(bool need_summary, int device, int max_batch_size,
-                        std::map<std::string, std::vector<int>> max_input_shape,
-                        std::vector<std::string> program_inputs,
-                        bool auto_config_layout, std::string engine_name) {
-    std::unique_lock<std::mutex> lk(mut_);
-    auto *p = new AnakinEngine<TargetT, PrecisionType>(
-        need_summary, device, max_batch_size, max_input_shape, program_inputs,
-        auto_config_layout);
-    engines_[engine_name].reset(p);
-    return p;
-  }
-
-  void DeleteALL() {
-    for (auto &item : engines_) {
-      item.second.reset(nullptr);
-    }
-  }
-
- private:
-  std::unordered_map<std::string, std::unique_ptr<AnakinEngineT>> engines_;
-  std::mutex mut_;
-};
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc
deleted file mode 100644
index 67b771226c4999a361a818e32e8caedd81723c03..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/op_teller.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/anakin/op_teller.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-// Just tell by the op_types.
-struct SimpleOpTypeSetTeller : public Teller {
-  SimpleOpTypeSetTeller() {
-    teller_set.insert("mul");
-    teller_set.insert("fc");
-    teller_set.insert("conv2d_fusion");
-    teller_set.insert("split");
-    teller_set.insert("relu");
-    teller_set.insert("pool2d");
-    teller_set.insert("elementwise_add");
-    teller_set.insert("elementwise_mul");
-    teller_set.insert("concat");
-    teller_set.insert("tanh");
-    teller_set.insert("conv2d");
-    teller_set.insert("batch_norm");
-    teller_set.insert("softmax");
-    teller_set.insert("flatten2");
-    teller_set.insert("reshape2");
-    teller_set.insert("transpose2");
-    teller_set.insert("density_prior_box");
-    teller_set.insert("detection_out");
-    teller_set.insert("dropout");
-    teller_set.insert("sigmoid");
-    teller_set.insert("sum");
-    teller_set.insert("depthwise_conv2d");
-    teller_set.insert("prior_box");
-    teller_set.insert("leaky_relu");
-    teller_set.insert("affine_channel");
-    teller_set.insert("relu6");
-    teller_set.insert("swish");
-    teller_set.insert("shuffle_channel");
-  }
-
-  bool operator()(const std::string& op_type,
-                  const framework::OpDesc& desc) override {
-    return teller_set.count(op_type);
-  }
-
- private:
-  std::unordered_set<std::string> teller_set;
-};
-
-bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
-  for (auto& teller : tellers_) {
-    if ((*teller)(op_type, desc)) return true;
-  }
-  return false;
-}
-
-OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/op_teller.h b/paddle/fluid/inference/anakin/op_teller.h
deleted file mode 100644
index 15a42067b8438e60851a50e454abde95782d90ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/op_teller.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/op_desc.h"
-
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-/*
- * Single Op teller definition.
- * One can override this and define a more complex tell logic, considerring more
- * issues such as op_desc.
- */
-struct Teller {
-  virtual bool operator()(const std::string& op_type,
-                          const framework::OpDesc& desc) = 0;
-
-  virtual ~Teller() = default;
-};
-/*
- * A real example:
- *
- * struct SomeTeller : public Teller {
- * bool operator()(const std::string& op_type,
- *                const framework::OpDesc& desc) override {
- *  return op_type == "fc" && desc.Inputs().size() == 2;
- * }
- *};
- */
-
-/*
- * class OpTeller helps to tell whether a fluid
- * operator can be transformed to a TensorRT layer.
- */
-class OpTeller {
- public:
-  static OpTeller& Global() {
-    static std::unique_ptr<OpTeller> x(new OpTeller);
-    return *x;
-  }
-
-  bool Tell(const std::string& op_type, const framework::OpDesc& desc);
-
- private:
-  OpTeller();
-
- private:
-  std::vector<std::unique_ptr<Teller>> tellers_;
-};
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc b/paddle/fluid/inference/anakin/test_anakin_engine.cc
deleted file mode 100644
index 3c8a33ec60f8aa04e4b40eb260f4107281332a7d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/anakin/test_anakin_engine.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include <map>
-
-#include "paddle/fluid/inference/anakin/engine.h"
-
-using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::Shape;
-using anakin::PBlock;
-using anakin::PTuple;
-namespace paddle {
-namespace inference {
-namespace anakin {
-
-class TestAnakinEngine : public ::testing::Test {
- protected:
-  void SetUp() override;
-  void TearDown() override {}
-
- protected:
-  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
-  std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
-};
-
-void TestAnakinEngine::SetUp() {
-  engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
-}
-
-TEST_F(TestAnakinEngine, Execute) {
-  engine_->AddOp("op1", "Dense", {"x"}, {"y"});
-  engine_->AddOpAttr("op1", "out_dim", 2);
-  engine_->AddOpAttr("op1", "bias_term", false);
-  engine_->AddOpAttr("op1", "axis", 1);
-  std::vector<int> shape = {1, 1, 1, 2};
-  Shape tmp_shape(shape);
-
-  PBlock<NV> *weight1 = new PBlock<NV>(tmp_shape, AK_FLOAT);
-  engine_->RegistBlock(weight1);
-  float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
-  cpu_data[0] = 2.;
-  weight1->d_tensor().set_shape(tmp_shape);
-  weight1->d_tensor().copy_from(weight1->h_tensor());
-  engine_->AddOpAttr("op1", "weight_1", *weight1);
-
-  engine_->Freeze();
-  // PTuple<int> input_shape = {1};
-  // engine_->AddOpAttr("x", "input_shape", input_shape);
-  engine_->SetInputShape("x", {1, 1, 1, 1});
-  engine_->Optimize();
-  engine_->InitNet();
-  framework::LoDTensor x;
-  framework::LoDTensor y;
-  x.Resize({1, 1, 1, 1});
-  y.Resize({1, 1, 1, 2});
-  auto *x_data = x.mutable_data<float>(platform::CUDAPlace());
-  float x_data_cpu[] = {1.};
-  cudaMemcpy(x_data, x_data_cpu, sizeof(float), cudaMemcpyHostToDevice);
-
-  std::map<std::string, framework::LoDTensor *> inputs = {{"x", &x}};
-  auto *y_data = y.mutable_data<float>(platform::CUDAPlace());
-  std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}};
-
-  cudaStream_t stream;
-
-  engine_->Execute(inputs, outputs, stream);
-  auto *y_data_gpu = y_data;
-  float y_data_cpu[2];
-  cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost);
-  LOG(INFO) << "output value: " << y_data_cpu[0] << ", " << y_data_cpu[1];
-}
-
-}  // namespace anakin
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
deleted file mode 100644
index d79fb529092ded176a4ab17ffb7cf828edce07a1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-unset(analysis_deps CACHE)
-set(analysis_deps # analysis_deps can be extended accross the project
-        framework_proto proto_desc graph pass paddle_fluid_api executor pretty_log
-        ir_pass_manager
-        CACHE INTERNAL "")
-
-add_subdirectory(ir_passes)
-add_subdirectory(passes)
-
-cc_library(analysis_helper SRCS helper.cc DEPS framework_proto proto_desc graph paddle_fluid_api)
-
-cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES} analysis_helper)
-
-cc_library(argument SRCS argument.cc DEPS scope proto_desc)
-cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc)
-
-cc_library(analysis SRCS
-  analyzer.cc
-  analysis_pass
-  DEPS ${analysis_deps} analysis_helper
-  ${INFER_IR_PASSES}
-  )
-
-cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
-
-function(inference_analysis_test_build TARGET)
-  if(WITH_TESTING)
-     set(options "")
-     set(oneValueArgs "")
-     set(multiValueArgs SRCS EXTRA_DEPS)
-     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-     inference_base_test_build(${TARGET}
-             SRCS ${analysis_test_SRCS}
-             DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS})
-  endif()
-endfunction()
-
-function(inference_analysis_test_run TARGET)
-  if(WITH_TESTING)
-     set(options "")
-     set(oneValueArgs "")
-     set(multiValueArgs COMMAND ARGS)
-     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-     inference_base_test_run(${TARGET}
-	     COMMAND ${analysis_test_COMMAND}
-             ARGS ${analysis_test_ARGS})
-  endif()
-endfunction()
-
-function(inference_analysis_test TARGET)
-  if(WITH_TESTING)
-     set(options "")
-     set(oneValueArgs "")
-     set(multiValueArgs SRCS ARGS EXTRA_DEPS)
-     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-     inference_base_test_build(${TARGET}
-             SRCS ${analysis_test_SRCS}
-             DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS})
-     inference_base_test_run(${TARGET}
-	     COMMAND ${TARGET}
-             ARGS ${analysis_test_ARGS})
-  endif()
-endfunction(inference_analysis_test)
-
-inference_analysis_test(test_analyzer
-	SRCS analyzer_tester.cc
-	EXTRA_DEPS reset_tensor_array paddle_inference_api
-	ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md
deleted file mode 100644
index 70adb4a974cc5f9911cb302840bbef7ec2591505..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/README.md
+++ /dev/null
@@ -1,58 +0,0 @@
-# Inference Analysis
-
-The `inference/analysis` module is used to analyze and optimize the inference program,
-it references some philosophy from `LLVM/analysis`, 
-and make the various optimization features be pluggable and co-exist in a pipeline.
-
-We borrowed some concepts from LLVM, such as
-
-- [Pass](./pass.h)es to implement optimization that traverse the inference program,
-- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
-- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
-
-There are some other basic concepts here
-
-- [Node](./node.h), the node in a `DataFlowGraph`,
-  - `Function`, the Operator in Fluid,
-  - `Value`, the Variable in Fluid;
-- [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline,
-
-## How it works
-
-The `inference/analysis` module make all the passes in a pipeline, and works in such way:
-
-1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
-2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes,
-3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
-
-The new optimization features can be added as an independent `Pass` and controlled by gflags,
-each pass will generate unified debug information or visualization for better debugging.
-
-## Supported Passes
-
-### `FluidToDataFlowGraphPass`
-Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes, 
-this should be the first pass of the pipeline.
-
-### `DataFlowGraphToFluidPass`
-Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline.
-
-### `TensorRTSubgraphNodeMarkPass`
-Mark the `Node` that are supported by TensorRT, 
-this pass will generate a visualization file which can be used for debugging.
-
-### `TensorRTSubGraphPass`
-Split the sub-graph that are can be accelerated by TensorRT.
-
-### `DFG_GraphvizDrawPass`
-This pass is just for debug, it will visualize the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool.
-
-It can be used as a helper class that draws the modified graph after each pass.
-
-## Utilities
-
-There is some helper legacy/function/class for analysis.
-
-- [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes,
-- [graph_traits.h](./graph_traits.h) contains the interfaces of the graph traversal algorithms, it uses `iterator`to make the algorithms easy to share across different passes,
-there are some implementations in  [data_flow_graph.cc](./data_flow_graph.cc) , such as BFS and DFS..
diff --git a/paddle/fluid/inference/analysis/analysis_pass.cc b/paddle/fluid/inference/analysis/analysis_pass.cc
deleted file mode 100644
index 9be9f755b9ed7273d842f8c0e2046f0ca0ce2247..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/analysis_pass.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
deleted file mode 100644
index d5a972fab3beae4d4e2e512d1ccda3f0b8356682..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <iosfwd>
-#include <string>
-
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/inference/analysis/argument.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * AnalysisPass is a pass used to control the IR passes.
- */
-class AnalysisPass {
- public:
-  AnalysisPass() = default;
-  virtual ~AnalysisPass() = default;
-
-  // Run on a single Graph.
-  void Run(Argument* argument) { RunImpl(argument); }
-
-  // Human-readable short representation.
-  virtual std::string repr() const = 0;
-  // Human-readable long description.
-  virtual std::string description() const { return "No DOC"; }
-
- protected:
-  // User should implement these.
-  virtual void RunImpl(Argument* argument) = 0;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
deleted file mode 100644
index 71fdb5570c7c6fca56a302b5d2deee4bd1a8f9f8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/inference/analysis/passes/passes.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-Analyzer::Analyzer() {}
-
-void Analyzer::Run(Argument *argument) { RunAnalysis(argument); }
-
-void Analyzer::RunAnalysis(Argument *argument) {
-  PADDLE_ENFORCE(argument->analysis_passes_valid(),
-                 "analsis_passes is not valid in the argument.");
-  for (auto &pass : argument->analysis_passes()) {
-    string::PrettyLogH1("--- Running analysis [%s]", pass);
-    if (!argument->enable_analysis_optim() && pass == "ir_analysis_pass")
-      continue;
-
-    auto *ptr = PassRegistry::Global().Retreive(pass);
-    PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
-    ptr->Run(argument);
-  }
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
deleted file mode 100644
index a6de18db60072fc2a5310893a885994d675dd8b6..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-/*
- * This file contains Analyzer, an class that exposed as a library that analyze
- * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has
- * multiple flags to
- * control whether an process is applied on the program.
- *
- * The processes are called Passes in analysis, the Passes are placed in a
- * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a
- * Fluid ProgramDesc to
- * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms
- * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the
- * pipeline can be any Passes
- * which take a node or data flow graph as input.
- *
- * The Analyzer can be used in two methods, the first is a executable file which
- * can be used to pre-process the inference model and can be controlled by
- * passing difference command flags;
- * the other way is to compose inside the inference API as a runtime pre-process
- * phase in the inference service.
- */
-
-#include <gflags/gflags.h>
-#include <string>
-#include <vector>
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/flags.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-class Analyzer final {
- public:
-  Analyzer();
-
-  void Run(Argument* argument);
-
-  DISABLE_COPY_AND_ASSIGN(Analyzer);
-
- protected:
-  void RunAnalysis(Argument* argument);
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
deleted file mode 100644
index 489345da49a232e7fb21bd44c1ecf34cf1e4fe8f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/analyzer.h"
-
-#include <google/protobuf/text_format.h>
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-using namespace framework;  // NOLINT
-
-TEST(Analyzer, analysis_without_tensorrt) {
-  Argument argument;
-  argument.SetModelDir(FLAGS_inference_model_dir);
-  argument.SetEnableAnalysisOptim(false);
-  argument.SetUseGPU(false);
-  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
-                              "ir_params_sync_among_devices_pass"});
-
-  Analyzer analyser;
-  analyser.Run(&argument);
-}
-
-TEST(Analyzer, analysis_with_tensorrt) {
-  Argument argument;
-  argument.SetEnableAnalysisOptim(false);
-  argument.SetTensorRtMaxBatchSize(3);
-  argument.SetTensorRtWorkspaceSize(1 << 20);
-  argument.SetModelDir(FLAGS_inference_model_dir);
-  argument.SetUseGPU(false);
-  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
-                              "ir_params_sync_among_devices_pass"});
-
-  Analyzer analyser;
-  analyser.Run(&argument);
-}
-
-void TestWord2vecPrediction(const std::string& model_path) {
-  NativeConfig config;
-  config.model_dir = model_path;
-  config.use_gpu = false;
-  config.device = 0;
-  auto predictor = ::paddle::CreatePaddlePredictor<NativeConfig>(config);
-
-  // One single batch
-
-  int64_t data[4] = {1, 2, 3, 4};
-  PaddleTensor tensor;
-  tensor.shape = std::vector<int>({4, 1});
-  tensor.data = PaddleBuf(data, sizeof(data));
-  tensor.dtype = PaddleDType::INT64;
-
-  // For simplicity, we set all the slots with the same data.
-  std::vector<PaddleTensor> slots(4, tensor);
-  std::vector<PaddleTensor> outputs;
-  CHECK(predictor->Run(slots, &outputs));
-
-  PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-  // Check the output buffer size and result of each tid.
-  PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
-  float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
-                     0.000932706};
-  const size_t num_elements = outputs.front().data.length() / sizeof(float);
-  // The outputs' buffers are in CPU memory.
-  for (size_t i = 0; i < std::min(static_cast<size_t>(5UL), num_elements);
-       i++) {
-    LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i]
-              << " result: " << result[i];
-    EXPECT_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
-                1e-3);
-  }
-}
-
-TEST(Analyzer, word2vec_without_analysis) {
-  TestWord2vecPrediction(FLAGS_inference_model_dir);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/argument.cc b/paddle/fluid/inference/analysis/argument.cc
deleted file mode 100644
index cb0263d5d98e86b612696ebde66d17fb2543809b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/argument.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/argument.h"
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
deleted file mode 100644
index 42858655aaa306bb3f212fac14e1c262fad2cd4f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/argument.h
+++ /dev/null
@@ -1,220 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * This file defines the class Argument, which is the input and output of the
- * analysis module. All the fields that needed either by Passes or PassManagers
- * are contained in Argument.
- *
- * TODO(Superjomn) Find some way better to contain the fields when it grow too
- * big.
- */
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-using framework::ir::Graph;
-
-#ifdef PADDLE_WITH_MKLDNN
-using VarQuantScale =
-    std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
-#endif
-
-/*
- * The argument definition of both Pass and PassManagers.
- *
- * All the fields should be registered here for clearness.
- */
-struct Argument {
-  Argument() = default;
-  explicit Argument(const std::string& model_dir) { SetModelDir(model_dir); }
-
-  using unique_ptr_t = std::unique_ptr<void, std::function<void(void*)>>;
-  using fusion_statis_t = std::unordered_map<std::string, int>;
-  using anakin_max_shape_t = std::map<std::string, std::vector<int>>;
-
-  bool Has(const std::string& key) const { return valid_fields_.count(key); }
-  // If we set the model using config.SetModelBuffer,
-  // the model and parameter will occupy additional CPU resources.
-  // Use this interface to release these resources.
-  void PartiallyRelease() {
-    if (Has("model_program_path")) {
-      if (Has("model_from_memory") && model_from_memory()) {
-        model_program_path().clear();
-        model_program_path().shrink_to_fit();
-        model_params_path().clear();
-        model_params_path().shrink_to_fit();
-      }
-    }
-  }
-
-#define DECL_ARGUMENT_FIELD(field__, Field, type__)          \
- public:                                                     \
-  type__& field__() {                                        \
-    PADDLE_ENFORCE(Has(#field__), "There is no such field"); \
-    return field__##_;                                       \
-  }                                                          \
-  void Set##Field(const type__& x) {                         \
-    field__##_ = x;                                          \
-    valid_fields_.insert(#field__);                          \
-  }                                                          \
-  DECL_ARGUMENT_FIELD_VALID(field__);                        \
-  type__* field__##_ptr() { return &field__##_; }            \
-                                                             \
- private:                                                    \
-  type__ field__##_;
-
-#define DECL_ARGUMENT_FIELD_VALID(field__) \
-  bool field__##_valid() { return Has(#field__); }
-
-#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__)                \
- public:                                                                  \
-  type__& field__() {                                                     \
-    PADDLE_ENFORCE_NOT_NULL(field__##_);                                  \
-    PADDLE_ENFORCE(Has(#field__));                                        \
-    return *static_cast<type__*>(field__##_.get());                       \
-  }                                                                       \
-  void Set##Field(type__* x) {                                            \
-    field__##_ =                                                          \
-        unique_ptr_t(x, [](void* x) { delete static_cast<type__*>(x); }); \
-    valid_fields_.insert(#field__);                                       \
-  }                                                                       \
-  void Set##Field##NotOwned(type__* x) {                                  \
-    valid_fields_.insert(#field__);                                       \
-    field__##_ = unique_ptr_t(x, [](void* x) {});                         \
-  }                                                                       \
-  DECL_ARGUMENT_FIELD_VALID(field__);                                     \
-  type__* field__##_ptr() {                                               \
-    PADDLE_ENFORCE(Has(#field__));                                        \
-    return static_cast<type__*>(field__##_.get());                        \
-  }                                                                       \
-  type__* Release##Field() {                                              \
-    PADDLE_ENFORCE(Has(#field__));                                        \
-    valid_fields_.erase(#field__);                                        \
-    return static_cast<type__*>(field__##_.release());                    \
-  }                                                                       \
-                                                                          \
- private:                                                                 \
-  unique_ptr_t field__##_;
-
-  DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
-  // Model path
-  DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
-  // Model specified with program and parameters files.
-  DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
-  DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
-  DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
-  DECL_ARGUMENT_FIELD(optim_cache_dir, OptimCacheDir, std::string);
-  DECL_ARGUMENT_FIELD(enable_analysis_optim, EnableAnalysisOptim, bool);
-
-  // The overall graph to work on.
-  DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
-  // The overall Scope to work on.
-  DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope);
-
-  // The default program, loaded from disk.
-  DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);
-
-  // The ir passes to perform in analysis phase.
-  DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
-                      std::vector<std::string>);
-  DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses,
-                      std::vector<std::string>);
-
-  // Pass a set of op types to enable its mkldnn kernel
-  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
-                      std::unordered_set<std::string>);
-  // The cache capacity of different input shapes for mkldnn.
-  DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int);
-
-#ifdef PADDLE_WITH_MKLDNN
-  // A set of op types to enable their quantized kernels
-  DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
-                      std::unordered_set<std::string>);
-
-  // A set of op IDs to exclude from enabling their quantized kernels
-  DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds,
-                      std::unordered_set<int>);
-
-  // Scales for variables to be quantized
-  DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
-#endif
-
-  // Passed from config.
-  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
-  DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
-  DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
-  DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
-  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
-  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
-  DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
-                      AnalysisConfig::Precision);
-  DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
-                      bool);
-  DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
-
-  DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape,
-                      anakin_max_shape_t);
-  DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
-  DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int);
-  DECL_ARGUMENT_FIELD(anakin_precision_mode, AnakinPrecisionMode,
-                      AnalysisConfig::Precision);
-  DECL_ARGUMENT_FIELD(anakin_auto_config_layout, AnakinAutoConfigLayout, bool);
-  DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
-  DECL_ARGUMENT_FIELD(anakin_passes_filter, AnakinPassesFilter,
-                      std::vector<std::string>);
-  DECL_ARGUMENT_FIELD(anakin_ops_filter, AnakinOpsFilter,
-                      std::vector<std::string>);
-
-  // Memory optimized related.
-  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
-
-  // Indicate which kind of sort algorithm is used for operators, the memory
-  // optimization relays on the sort algorithm.
-  DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
-
-  // The program transformed by IR analysis phase.
-  DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
-                             framework::proto::ProgramDesc);
-
-  DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
-
- private:
-  std::unordered_set<std::string> valid_fields_;
-};
-
-#define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \
-  PADDLE_ENFORCE(argument__->Has(#fieldname__),       \
-                 "the argument field [%s] should be set", #fieldname__);
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/device.h b/paddle/fluid/inference/analysis/device.h
deleted file mode 100644
index 585c9923291e5f9cb6e50dbc4bcd28c374191048..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/device.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-enum class Device { CPU, GPU };
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h
deleted file mode 100644
index 4693729cb43d7a9df96b11c4bf3064a70d1db4c3..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/dot.h
+++ /dev/null
@@ -1,161 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * This file implements some helper classes and methods for DOT programming
- * support. It will give a visualization of the graph and that helps to debug
- * the logics of each Pass.
- */
-#pragma once
-
-#include <glog/logging.h>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-static size_t dot_node_counter{0};
-
-/*
- * A Dot template that helps to build a DOT graph definition.
- */
-class Dot {
- public:
-  struct Attr {
-    std::string key;
-    std::string value;
-
-    Attr(const std::string& key, const std::string& value)
-        : key(key), value(value) {}
-
-    std::string repr() const {
-      std::stringstream ss;
-      ss << key << "=" << '"' << value << '"';
-      return ss.str();
-    }
-  };
-
-  struct Node {
-    std::string name;
-    std::vector<Attr> attrs;
-
-    Node(const std::string& name, const std::vector<Attr>& attrs)
-        : name(name),
-          attrs(attrs),
-          id_("node_" + std::to_string(dot_node_counter++)) {}
-
-    std::string id() const { return id_; }
-
-    std::string repr() const {
-      std::stringstream ss;
-      CHECK(!name.empty());
-      ss << id_;
-      if (attrs.empty()) {
-        ss << "[label=" << '"' << name << '"' << "]";
-        return ss.str();
-      }
-      for (size_t i = 0; i < attrs.size(); i++) {
-        if (i == 0) {
-          ss << "[label=" << '"' << name << '"' << " ";
-        }
-        ss << attrs[i].repr();
-        ss << ((i < attrs.size() - 1) ? " " : "]");
-      }
-      return ss.str();
-    }
-
-   private:
-    std::string id_;
-  };
-
-  struct Edge {
-    std::string source;
-    std::string target;
-    std::vector<Attr> attrs;
-
-    Edge(const std::string& source, const std::string& target,
-         const std::vector<Attr>& attrs)
-        : source(source), target(target), attrs(attrs) {}
-
-    std::string repr() const {
-      std::stringstream ss;
-      CHECK(!source.empty());
-      CHECK(!target.empty());
-      ss << source << "->" << target;
-      for (size_t i = 0; i < attrs.size(); i++) {
-        if (i == 0) {
-          ss << "[";
-        }
-        ss << attrs[i].repr();
-        ss << ((i < attrs.size() - 1) ? " " : "]");
-      }
-      return ss.str();
-    }
-  };
-
-  Dot() = default;
-
-  explicit Dot(const std::vector<Attr>& attrs) : attrs_(attrs) {}
-
-  void AddNode(const std::string& id, const std::vector<Attr>& attrs,
-               std::string label = "") {
-    CHECK(!nodes_.count(id)) << "duplicate Node '" << id << "'";
-    if (label.empty()) label = id;
-    nodes_.emplace(id, Node{label, attrs});
-  }
-
-  void AddEdge(const std::string& source, const std::string& target,
-               const std::vector<Attr>& attrs) {
-    CHECK(!source.empty());
-    CHECK(!target.empty());
-    auto sid = nodes_.at(source).id();
-    auto tid = nodes_.at(target).id();
-    edges_.emplace_back(sid, tid, attrs);
-  }
-
-  // Compile to DOT language codes.
-  std::string Build() const {
-    std::stringstream ss;
-    const std::string indent = "   ";
-    ss << "digraph G {" << '\n';
-
-    // Add graph attrs
-    for (const auto& attr : attrs_) {
-      ss << indent << attr.repr() << '\n';
-    }
-    // add nodes
-    for (auto& item : nodes_) {
-      ss << indent << item.second.repr() << '\n';
-    }
-    // add edges
-    for (auto& edge : edges_) {
-      ss << indent << edge.repr() << '\n';
-    }
-    ss << "} // end G";
-    return ss.str();
-  }
-
- private:
-  std::unordered_map<std::string, Node> nodes_;
-  std::vector<Edge> edges_;
-  std::vector<Attr> attrs_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc
deleted file mode 100644
index c785a312bf96c3586ea990fd9028cfd3b930d577..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/dot_tester.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/dot.h"
-
-#include <gtest/gtest.h>
-#include <memory>
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-class DotTester : public ::testing::Test {
- protected:
-  void SetUp() override {
-    std::vector<Dot::Attr> attrs({{"title", "hello"}});
-    dot.reset(new Dot(attrs));
-    dot->AddNode("a", {Dot::Attr{"shape", "box"}, Dot::Attr("color", "blue")});
-    dot->AddNode("b", {});
-    dot->AddNode("c", {});
-    dot->AddEdge("a", "b", {});
-    dot->AddEdge("b", "c", {});
-    dot->AddEdge("a", "c", {});
-  }
-
-  std::unique_ptr<Dot> dot;
-};
-
-TEST_F(DotTester, Build) {
-  auto codes = dot->Build();
-  // Output the DOT language code, the generated codes are too long to compare
-  // the string.
-  //
-  // The output is
-  //
-  // digraph G {
-  //   title="hello"
-  //   node_1
-  //   node_2
-  //   node_0[label="a" shape="box" color="blue"]
-  //   node_0->node_1
-  //   node_1->node_2
-  //   node_0->node_2
-  // } // end G
-  LOG(INFO) << '\n' << codes;
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/flags.h b/paddle/fluid/inference/analysis/flags.h
deleted file mode 100644
index 717e543f01dfa071865a5c14c0b7679e65239daf..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/flags.h
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-
-// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
-// flag if not available.
-DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
-DECLARE_string(IA_graphviz_log_root);
-DECLARE_string(IA_output_storage_path);
-DECLARE_bool(IA_enable_ir);
diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc
deleted file mode 100644
index 368ef2e5583fe2f6fcb24c98ded02f4e5325f7a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/helper.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/framework/framework.pb.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-template <>
-void SetAttr<std::string>(framework::proto::OpDesc *op, const std::string &name,
-                          const std::string &data) {
-  auto *attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::STRING);
-  attr->set_s(data);
-}
-template <>
-void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
-                  const int &data) {
-  auto *attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::INT);
-  attr->set_i(data);
-}
-template <>
-void SetAttr<bool>(framework::proto::OpDesc *op, const std::string &name,
-                   const bool &data) {
-  auto *attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
-  attr->set_b(data);
-}
-template <>
-void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
-                      const int64_t &data) {
-  auto *attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::LONG);
-  attr->set_l(data);
-}
-template <>
-void SetAttr<std::vector<std::string>>(framework::proto::OpDesc *op,
-                                       const std::string &name,
-                                       const std::vector<std::string> &data) {
-  auto *attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::STRINGS);
-  for (const auto &s : data) {
-    attr->add_strings(s.c_str());
-  }
-}
-
-template <>
-void SetAttr<std::vector<int>>(framework::proto::OpDesc *op,
-                               const std::string &name,
-                               const std::vector<int> &data) {
-  auto *attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::INTS);
-  for (const auto i : data) {
-    attr->add_ints(i);
-  }
-}
-
-template <>
-void SetAttr<std::vector<int64_t>>(framework::proto::OpDesc *op,
-                                   const std::string &name,
-                                   const std::vector<int64_t> &data) {
-  auto *attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::LONGS);
-  for (const auto i : data) {
-    attr->add_longs(i);
-  }
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
deleted file mode 100644
index a48058400241b030f17557156a4d973fca92fd8d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/helper.h
+++ /dev/null
@@ -1,257 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <sys/stat.h>
-#include <cstdio>
-#include <fstream>
-#include <memory>
-#include <set>
-#include <string>
-#include <typeindex>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/port.h"
-
-#ifdef _WIN32
-#include <direct.h>
-#include <io.h>
-#define GCC_ATTRIBUTE(attr__)
-#define MKDIR(path) _mkdir(path)
-#else
-#include <unistd.h>
-#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
-#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)
-#endif
-#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-template <typename T>
-void SetAttr(framework::proto::OpDesc *op, const std::string &name,
-             const T &data);
-
-template <typename Vec>
-int AccuDims(Vec &&vec, int size) {
-  int res = 1;
-  for (int i = 0; i < size; i++) {
-    res *= std::forward<Vec>(vec)[i];
-  }
-  return res;
-}
-
-#define SET_TYPE(type__) dic_[std::type_index(typeid(type__))] = #type__;
-/*
- * Map typeid to representation.
- */
-struct DataTypeNamer {
-  static const DataTypeNamer &Global() {
-    static auto *x = new DataTypeNamer();
-    return *x;
-  }
-
-  template <typename T>
-  const std::string &repr() const {
-    auto x = std::type_index(typeid(T));
-    PADDLE_ENFORCE(dic_.count(x), "unknown type for representation");
-    return dic_.at(x);
-  }
-
-  const std::string &repr(const std::type_index &type) const {  // NOLINT
-    PADDLE_ENFORCE(dic_.count(type), "unknown type for representation");
-    return dic_.at(type);
-  }
-
- private:
-  DataTypeNamer() {
-    SET_TYPE(int);
-    SET_TYPE(bool);
-    SET_TYPE(float);
-    SET_TYPE(void *);
-  }
-
-  std::unordered_map<std::type_index, std::string> dic_;
-};
-#undef SET_TYPE
-
-template <typename IteratorT>
-class iterator_range {
-  IteratorT begin_, end_;
-
- public:
-  template <typename Container>
-  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
-
-  iterator_range(const IteratorT &begin, const IteratorT &end)
-      : begin_(begin), end_(end) {}
-
-  const IteratorT &begin() const { return begin_; }
-  const IteratorT &end() const { return end_; }
-};
-
-/*
- * An registry helper class, with its records keeps the order they registers.
- */
-template <typename T>
-class OrderedRegistry {
- public:
-  T *Register(const std::string &name, T *x) {
-    PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name);
-    dic_[name] = elements_.size();
-    elements_.emplace_back(std::unique_ptr<T>(x));
-    return elements_.back().get();
-  }
-
-  T *Lookup(const std::string &name) {
-    auto it = dic_.find(name);
-    if (it == dic_.end()) return nullptr;
-    return elements_[it->second].get();
-  }
-
- protected:
-  std::unordered_map<std::string, int> dic_;
-  std::vector<std::unique_ptr<T>> elements_;
-};
-
-template <typename T>
-T &GetFromScope(const framework::Scope &scope, const std::string &name) {
-  framework::Variable *var = scope.FindVar(name);
-  PADDLE_ENFORCE(var != nullptr);
-  return *var->GetMutable<T>();
-}
-
-static framework::proto::ProgramDesc LoadProgramDesc(
-    const std::string &model_path) {
-  std::ifstream fin(model_path, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path);
-  fin.seekg(0, std::ios::end);
-  std::string buffer(fin.tellg(), ' ');
-  fin.seekg(0, std::ios::beg);
-  fin.read(&buffer[0], buffer.size());
-  fin.close();
-  framework::proto::ProgramDesc program_desc;
-  program_desc.ParseFromString(buffer);
-  return program_desc;
-}
-
-static bool FileExists(const std::string &filepath) {
-  std::ifstream file(filepath);
-  bool exists = file.is_open();
-  file.close();
-  return exists;
-}
-
-static bool PathExists(const std::string &path) {
-  struct stat statbuf;
-  if (stat(path.c_str(), &statbuf) != -1) {
-    if (S_ISDIR(statbuf.st_mode)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-static std::string GetDirRoot(const std::string &path) {
-  char sep = '/';
-
-#ifdef _WIN32
-  sep = '\\';
-#endif
-
-  size_t i = path.rfind(sep, path.length());
-  if (i != std::string::npos) {
-    return (path.substr(0, i));
-  }
-  return path;
-}
-
-static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) {
-  std::string opt_cache_dir = model_root + "/_opt_cache/";
-  if (!PathExists(opt_cache_dir)) {
-    PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1,
-                   "Can not create optimize cache directory: %s, Make sure you "
-                   "have permission to write",
-                   opt_cache_dir);
-  }
-  return opt_cache_dir;
-}
-
-static std::string GetTrtCalibPath(const std::string &model_root,
-                                   const std::string &engine_key) {
-  return model_root + "/trt_calib_" + engine_key;
-}
-
-// If there is no calib table data file in model_opt_cache_dir, return "".
-static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
-                                        const std::string &engine_key,
-                                        bool enable_int8) {
-  std::string trt_calib_table_path =
-      GetTrtCalibPath(model_opt_cache_dir, engine_key);
-  if (enable_int8 && FileExists(trt_calib_table_path)) {
-    VLOG(3) << "Calibration table file: " << trt_calib_table_path
-            << "is found here";
-    std::ifstream infile(trt_calib_table_path, std::ios::in);
-    std::stringstream buffer;
-    buffer << infile.rdbuf();
-    std::string calibration_data(buffer.str());
-    return calibration_data;
-  }
-  return "";
-}
-
-static std::string GetTrtEngineSerializedPath(const std::string &model_root,
-                                              const std::string &engine_key) {
-  return model_root + "/trt_serialized_" + engine_key;
-}
-
-static std::string GetTrtEngineSerializedData(
-    const std::string &model_opt_cache_dir, const std::string &engine_key) {
-  std::string trt_serialized_path =
-      GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
-  if (FileExists(trt_serialized_path)) {
-    VLOG(3) << "Trt serialized file: " << trt_serialized_path
-            << "is found here";
-    std::ifstream infile(trt_serialized_path, std::ios::in);
-    std::stringstream buffer;
-    buffer << infile.rdbuf();
-    std::string trt_engine_serialized_data(buffer.str());
-    return trt_engine_serialized_data;
-  }
-  return "";
-}
-
-static void SaveTrtEngineSerializedDataToFile(
-    const std::string &trt_serialized_path,
-    const std::string &engine_serialized_data) {
-  std::ofstream outfile(trt_serialized_path);
-  outfile << engine_serialized_data;
-  outfile.close();
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
-
-#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \
-  type__(const type__ &) = delete;              \
-  void operator=(const type__ &) = delete;
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
deleted file mode 100644
index 3fa907b418cfc6982ac6eb6c5c7077b32c050676..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/analysis/argument.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-using string::PrettyLogEndl;
-using string::PrettyLog;
-using string::Style;
-
-IRPassManager::IRPassManager(Argument *argument) {
-  ARGUMENT_CHECK_FIELD(argument, main_program);
-  graph_ = std::unique_ptr<Graph>(new Graph(argument->main_program()));
-  if (argument->Has("scope")) {
-    auto *scope_ptr = argument->scope_ptr();
-    PADDLE_ENFORCE(scope_ptr);
-    graph_->SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
-  }
-
-  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
-  CreatePasses(argument, argument->ir_analysis_passes());
-}
-
-void IRPassManager::CreatePasses(Argument *argument,
-                                 const std::vector<std::string> &passes) {
-  std::string pre_pass;
-  int pass_num = 0;
-  for (const std::string &pass_name : passes) {
-    auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
-
-    if (pass_name == "graph_viz_pass") {
-      std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
-                                  (pre_pass.empty() ? "origin" : pre_pass) +
-                                  ".dot";
-      pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
-      pass_num++;
-    } else if (pass_name == "mkldnn_placement_pass") {
-      pass->Set("mkldnn_enabled_op_types",
-                new std::unordered_set<std::string>(
-                    argument->mkldnn_enabled_op_types()));
-    } else if (pass_name == "cudnn_placement_pass") {
-      pass->Set("cudnn_enabled_op_types",
-                new std::unordered_set<std::string>());
-#ifdef PADDLE_WITH_MKLDNN
-    } else if (pass_name == "cpu_quantize_placement_pass") {
-      pass->Set("quantize_enabled_op_types",
-                new std::unordered_set<std::string>(
-                    argument->quantize_enabled_op_types()));
-      pass->Set(
-          "quantize_excluded_op_ids",
-          new std::unordered_set<int>(argument->quantize_excluded_op_ids()));
-    } else if (pass_name == "cpu_quantize_pass") {
-      pass->Set("quant_var_scales",
-                new VarQuantScale(argument->quant_var_scales()));
-#endif
-    } else if (pass_name == "tensorrt_subgraph_pass") {
-      pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
-      pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
-      pass->Set("min_subgraph_size",
-                new int(argument->tensorrt_min_subgraph_size()));
-      pass->Set("program",
-                new framework::ProgramDesc *(&argument->main_program()));
-
-      auto precision_mode = argument->tensorrt_precision_mode();
-      bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8;
-
-      pass->Set("predictor_id", new int(argument->predictor_id()));
-      bool use_calib_mode = argument->tensorrt_use_calib_mode();
-      pass->Set("enable_int8", new bool(enable_int8));
-      pass->Set("use_calib_mode", new bool(use_calib_mode));
-      pass->Set("precision_mode",
-                new AnalysisConfig::Precision(precision_mode));
-
-      bool use_static_engine = argument->tensorrt_use_static_engine();
-      bool model_from_memory = argument->model_from_memory();
-      std::string optim_cache_dir = argument->optim_cache_dir();
-      bool int8_valid =
-          !(model_from_memory && optim_cache_dir.empty() && enable_int8);
-      PADDLE_ENFORCE(int8_valid,
-                     "When you are in TRT INT8 mode, and load model from "
-                     "memory, you should set optim_cache_dir using "
-                     "config.SetOptimCacheDir()");
-      PADDLE_ENFORCE(!(model_from_memory && use_static_engine),
-                     "When you are using Paddle-TRT, and also using load model "
-                     "from memory, you should set the use_static to false.");
-
-      if (!optim_cache_dir.empty()) {
-        pass->Set("model_opt_cache_dir", new std::string(optim_cache_dir));
-      } else if (use_static_engine || enable_int8) {
-        std::string model_opt_cache_dir =
-            argument->Has("model_dir")
-                ? argument->model_dir()
-                : GetDirRoot(argument->model_program_path());
-        pass->Set(
-            "model_opt_cache_dir",
-            new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
-      }
-      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
-      pass->Set("use_static_engine", new bool(use_static_engine));
-      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
-    }
-    if (pass_name == "ngraph_subgraph_pass") {
-      pass->Set("program",
-                new framework::ProgramDesc *(&argument->main_program()));
-    }
-    if (pass_name == "anakin_subgraph_pass") {
-      pass->Set("program",
-                new framework::ProgramDesc *(&argument->main_program()));
-      pass->Set("use_gpu", new bool(argument->use_gpu()));
-      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
-      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
-      pass->Set("predictor_id", new int(argument->predictor_id()));
-      pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
-                                       argument->anakin_max_input_shape()));
-      pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
-      bool enable_int8 =
-          argument->anakin_precision_mode() == AnalysisConfig::Precision::kInt8;
-      pass->Set("enable_int8", new bool(enable_int8));
-      pass->Set("anakin_ops_filter",
-                new std::vector<std::string>(argument->anakin_ops_filter()));
-      pass->Set("auto_config_layout",
-                new bool(argument->anakin_auto_config_layout()));
-    }
-
-    pre_pass = pass_name;
-
-    passes_.emplace_back(std::move(pass));
-  }
-}
-
-std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
-  if (passes_.empty()) {
-    return graph;
-  }
-  PADDLE_ENFORCE(graph.get());
-  // Apply all the passes
-  for (const auto &pass : passes_) {
-    if (pass->Type() != "graph_viz_pass") {
-      PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
-    }
-    graph.reset(pass->Apply(graph.release()));
-  }
-  return graph;
-}
-
-framework::proto::ProgramDesc IRPassManager::AcquireProgram(
-    std::unique_ptr<Graph> *graph, ProgramDesc *program) const {
-  auto pass =
-      framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
-
-  // Direct using ProgramDesc desc(argument->main_program()) may cause
-  // incomplete copies of information.
-  ProgramDesc desc;
-  desc.CopyFrom(*program->Proto());
-  pass->SetNotOwned("program", &desc);
-  auto *the_graph = graph->release();
-  graph->reset(pass->Apply(the_graph));
-  return *desc.Proto();
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h
deleted file mode 100644
index 2d120679eedd392d78b4da66276297ff7280792b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_pass_manager.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * This file defines IRPassManager, it helps control the passes in IR. Inference
- * phrase will load the model program and parameters from disk, that is quite
- * different from the training phase.
- * This manager will control the Passes and make the passes in IR work smoothly
- * for inference.
- */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/analysis/argument.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-using framework::ProgramDesc;
-using framework::ir::Graph;
-
-class IRPassManager final {
- public:
-  explicit IRPassManager(Argument *argument);
-
-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph);
-
-  framework::proto::ProgramDesc AcquireProgram(std::unique_ptr<Graph> *graph,
-                                               ProgramDesc *program) const;
-
-  framework::ir::Graph &graph() const { return *graph_; }
-
- private:
-  void CreatePasses(Argument *argument, const std::vector<std::string> &passes);
-
-  std::unique_ptr<Graph> graph_;
-  std::vector<std::unique_ptr<framework::ir::Pass>> passes_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
deleted file mode 100644
index ddadbc6df4aa3f95b271b011edb85a8d7077796f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-cc_library(subgraph_detector SRCS subgraph_detector.cc subgraph_util.cc DEPS proto_desc)
-if(WITH_TESTING)
-  add_dependencies(subgraph_detector gtest)
-endif()
-
-if (WITH_GPU AND TENSORRT_FOUND)
-  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
-
-  set(analysis_deps ${analysis_deps}
-          subgraph_detector tensorrt_subgraph_pass
-          CACHE INTERNAL "")
-
-  set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
-  file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
-  set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
-endif()
-
-if (ANAKIN_SUBGRAPH) 
-  cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller)
-
-  set(analysis_deps ${analysis_deps}
-          subgraph_detector anakin_subgraph_pass
-          CACHE INTERNAL "")
-
-  set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
-  file(APPEND ${pass_file} "USE_PASS(anakin_subgraph_pass);\n")
-  set(INFER_IR_PASSES ${INFER_IR_PASSES} anakin_subgraph_pass CACHE INTERNAL "")
-endif()
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
deleted file mode 100644
index a6c6f33cf779f6117d0dda9a9eca279bd846ac84..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ /dev/null
@@ -1,283 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/op_teller.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-using framework::ir::Node;
-
-void analysis::AnakinSubgraphPass::ApplyImpl(
-    framework::ir::Graph *graph) const {
-  framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph);
-
-  auto &anakin_ops_filter = Get<std::vector<std::string>>("anakin_ops_filter");
-
-  auto teller = [&anakin_ops_filter](const framework::ir::Node *node) {
-    if (!node->IsOp() || !node->Op())
-      return false;
-    else if (std::find(anakin_ops_filter.begin(), anakin_ops_filter.end(),
-                       node->Op()->Type()) != anakin_ops_filter.end())
-      return false;
-    return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
-  };
-
-  SubGraphFuser fuser(graph, teller, 6 /* min_subgraph_size */);
-  fuser();
-
-  std::vector<std::string> graph_param_names =
-      ExtractParameters(graph->Nodes());
-
-  // those parameter already exist in anakin, and should not have another copy
-  // in fluid.
-  std::vector<std::string> repetitive_params;
-
-  for (auto *node : graph->Nodes()) {
-    if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateAnakinOp(node, graph, graph_param_names, &repetitive_params);
-      std::unordered_set<const Node *> nodes2remove(
-          Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
-      framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
-    }
-  }
-
-  std::unordered_set<const Node *> nodes2remove;
-  for (auto *node : graph->Nodes()) {
-    if (node->IsOp() && Agent(node).deleted()) {
-      nodes2remove.insert(node);
-    }
-  }
-  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
-  graph->Set(framework::ir::kRepetitiveParamAttr,
-             new std::vector<std::string>(repetitive_params));
-}
-
-std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs,
-                                    const std::set<std::string> &engine_outputs,
-                                    std::string id) {
-  std::string engine_hash_key = "";
-  for (auto name : engine_inputs) {
-    engine_hash_key += name;
-  }
-  for (auto name : engine_outputs) {
-    engine_hash_key += name;
-  }
-  engine_hash_key += id;
-  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
-  return engine_key;
-}
-
-void AnakinSubgraphPass::CreateAnakinOp(
-    framework::ir::Node *node, Graph *graph,
-    const std::vector<std::string> &graph_params,
-    std::vector<std::string> *repetitive_params) const {
-  auto *op_desc = node->Op();
-  auto &subgraph = *Agent(node).subgraph();
-  PADDLE_ENFORCE(!subgraph.empty());
-
-  framework::ProgramDesc *program_desc =
-      Get<framework::ProgramDesc *>("program");
-  // Add new block for TensorRTEngineOP
-  const framework::BlockDesc &main_block =
-      program_desc->Block(framework::kRootBlockIndex);
-  // const framework::BlockDesc& main_block = program_desc->Block(0);
-  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
-
-  // An fake block desc.
-  framework::proto::BlockDesc block_proto;
-  framework::BlockDesc block_desc(nullptr, &block_proto);
-  block_desc.Proto()->set_parent_idx(-1);
-  block_desc.Proto()->set_idx(0);
-  string::PrettyLogDetail("---  detect a sub-graph with %d nodes",
-                          subgraph.size());
-
-  for (auto *node : subgraph) {
-    auto *new_block_op = new_block->AppendOp();
-    auto *op = block_desc.AppendOp();
-    *new_block_op->Proto() = *node->Op()->Proto();
-    *op->Proto() = *node->Op()->Proto();
-  }
-
-  // Then, we will use the input_names_with_id and output_names_with_id to
-  // generate the eigine key.
-  // So, We use set instead of unordered_set here to ensure that the engine key
-  // is unique.
-  std::set<std::string> input_names;
-  std::set<std::string> input_names_with_id;
-  std::vector<std::string> params;
-  for (auto *x : node->inputs) {
-    input_names.insert(x->Name());
-    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
-    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
-      params.push_back(x->Name());
-    }
-  }
-  std::copy(params.begin(), params.end(),
-            std::back_inserter(*repetitive_params));
-  op_desc->SetInput(
-      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
-
-  std::set<std::string> output_names;
-  std::set<std::string> output_names_with_id;
-  for (auto *x : node->outputs) {
-    output_names.insert(x->Name());
-    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
-  }
-
-  op_desc->SetOutput(
-      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-  op_desc->SetType("anakin_engine");
-
-  std::unordered_map<std::string, std::string> output_name_map;
-  std::unordered_map<std::string, framework::ir::Node *> graph_var_map;
-
-  for (framework::ir::Node *node : graph->Nodes()) {
-    if (node->IsVar() && node->Var()) {
-      graph_var_map[node->Name()] = node;
-    }
-  }
-  auto &subgraph_nodes = *Agent(node).subgraph();
-
-  // The following procedure is used to rename all the intermediate
-  // variables and the output variables of the subgraph.
-  RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
-                      &output_names_with_id, &output_names, &output_name_map,
-                      graph_var_map, false);
-
-  // When anakin engine runs at the end of the operation,
-  // output_mapping help us copy the data from the renamed ITensor
-  // to Tensor.
-  std::vector<std::string> output_mapping;
-  for (auto name : output_names) {
-    PADDLE_ENFORCE(output_name_map.count(name) != 0);
-    output_mapping.push_back(output_name_map[name]);
-  }
-
-  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
-                 "the block has no var-desc");
-  PADDLE_ENFORCE(!output_mapping.empty());
-  op_desc->SetBlockAttr("sub_block", new_block);
-  SetAttr(op_desc->Proto(), "subgraph",
-          block_desc.Proto()->SerializeAsString());
-  // Set attrs
-  SetAttr(op_desc->Proto(), "parameters", params);
-  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
-  int predictor_id = Get<int>("predictor_id");
-  auto engine_key = GenerateAnakinEngineKey(
-      input_names_with_id, output_names_with_id, std::to_string(predictor_id));
-
-  SetAttr(op_desc->Proto(), "engine_key", engine_key);
-  auto max_input_shape =
-      Get<std::map<std::string, std::vector<int>>>("max_input_shape");
-  auto program_inputs = program_desc->GetFeedTargetNames();
-
-  bool use_gpu = Get<bool>("use_gpu");
-  SetAttr(op_desc->Proto(), "use_gpu", use_gpu);
-  bool enable_int8 = Get<bool>("enable_int8");
-  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
-  if (enable_int8) {
-    CreateAnakinEngine<::anakin::Precision::INT8>(&block_desc, params,
-                                                  input_names, output_mapping,
-                                                  program_inputs, engine_key);
-  } else {
-    CreateAnakinEngine<::anakin::Precision::FP32>(&block_desc, params,
-                                                  input_names, output_mapping,
-                                                  program_inputs, engine_key);
-  }
-}
-
-template <::anakin::Precision PrecisionT>
-void AnakinSubgraphPass::CreateAnakinEngine(
-    framework::BlockDesc *block_desc, const std::vector<std::string> &params,
-    const std::set<std::string> &input_names,
-    const std::vector<std::string> &output_mapping,
-    const std::vector<std::string> &program_inputs,
-    const std::string &engine_key) const {
-  framework::BlockDesc block_desc_temp(nullptr, block_desc->Proto());
-  bool use_gpu = Get<bool>("use_gpu");
-  auto max_batch_size = Get<int>("max_batch_size");
-  auto max_input_shape =
-      Get<std::map<std::string, std::vector<int>>>("max_input_shape");
-  if (use_gpu) {
-#ifdef PADDLE_WITH_CUDA
-    inference::Singleton<
-        anakin::AnakinEngineManager<::anakin::saber::NV, PrecisionT>>::Global()
-        .Create(true, Get<int>("gpu_device_id"), max_batch_size,
-                max_input_shape, program_inputs, false, engine_key);
-#endif
-  } else {
-#ifdef ANAKIN_X86_PLACE
-    bool auto_config_layout = Get<bool>("auto_config_layout");
-    inference::Singleton<
-        anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global()
-        .Create(true, Get<int>("gpu_device_id"), max_batch_size,
-                max_input_shape, program_inputs, auto_config_layout,
-                engine_key);
-#endif
-  }
-
-  auto *scope = param_scope();
-  std::unordered_set<std::string> param_set(params.begin(), params.end());
-  if (use_gpu) {
-#ifdef PADDLE_WITH_CUDA
-    auto *anakin_engine =
-        inference::Singleton<inference::anakin::AnakinEngineManager<
-            ::anakin::saber::NV, PrecisionT>>::Global()
-            .Get(engine_key);
-    inference::Singleton<inference::anakin::AnakinOpConverter<
-        ::anakin::saber::NV, PrecisionT>>::Global()
-        .ConvertBlockToAnakinEngine(
-            &block_desc_temp, scope,
-            std::vector<std::string>(input_names.begin(), input_names.end()),
-            param_set, output_mapping, anakin_engine);
-#endif
-  } else {
-#ifdef ANAKIN_X86_PLACE
-    auto *anakin_engine =
-        inference::Singleton<inference::anakin::AnakinEngineManager<
-            ::anakin::saber::X86, PrecisionT>>::Global()
-            .Get(engine_key);
-    inference::Singleton<inference::anakin::AnakinOpConverter<
-        ::anakin::saber::X86, PrecisionT>>::Global()
-        .ConvertBlockToAnakinEngine(
-            &block_desc_temp, scope,
-            std::vector<std::string>(input_names.begin(), input_names.end()),
-            param_set, output_mapping, anakin_engine);
-#endif
-  }
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_PASS(anakin_subgraph_pass,
-              paddle::inference::analysis::AnakinSubgraphPass);
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
deleted file mode 100644
index 4ab2297b2d48876a95f41deb715188b2476b6b38..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <paddle/fluid/framework/ir/fuse_pass_base.h>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/anakin/engine.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
-
-using anakin::Precision;
-using anakin::saber::NV;
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-class AnakinSubgraphPass : public framework::ir::FusePassBase {
- public:
-  void ApplyImpl(framework::ir::Graph *graph) const override;
-
- private:
-  void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph,
-                      const std::vector<std::string> &graph_params,
-                      std::vector<std::string> *repetitive_params) const;
-  void CleanIntermediateOutputs(framework::ir::Node *node);
-  template <::anakin::Precision PrecisionT>
-  void CreateAnakinEngine(framework::BlockDesc *block_desc,
-                          const std::vector<std::string> &params,
-                          const std::set<std::string> &input_names,
-                          const std::vector<std::string> &output_mapping,
-                          const std::vector<std::string> &program_inputs,
-                          const std::string &engine_key) const;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
deleted file mode 100644
index 064f947aaa7ca75c6497ddf76d4d78c5557fdeb8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ /dev/null
@@ -1,474 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/node.h"
-
-DECLARE_bool(use_ngraph);
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-using framework::ir::Node;
-
-std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
-  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
-  std::unordered_set<Node *> inputs;
-  std::unordered_set<Node *> outputs;
-  // Input a Value, check whether its inlink is in the subgraph.
-  auto inlink_in_subgraph = [&](Node *n) {
-    for (auto *in : n->inputs) {
-      if (nodes.count(in)) return true;
-    }
-    return false;
-  };
-
-  for (auto &node : graph) {
-    for (auto *in : node->inputs) {
-      // The Value that is written by nodes inside a sub-graph shouldn't be the
-      // input of the sub-graph.
-      if (!nodes.count(in) && in->IsVar() && !inlink_in_subgraph(in)) {
-        inputs.insert(in);
-      }
-    }
-    for (auto *out : node->outputs) {
-      if (!nodes.count(out) && out->IsVar()) {
-        outputs.insert(out);
-      }
-    }
-  }
-  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
-                        std::vector<Node *>(outputs.begin(), outputs.end()));
-}
-
-// Filter the Intermediate results of the subgraph node.
-void FilterRedundantOutputOfSubGraph(Graph *graph) {
-  std::vector<Node *> op_nodes;
-  for (auto &node : TopologicalSort(*graph)) {
-    if (node.IsVar() || Agent(&node).deleted()) {
-      continue;
-    }
-    op_nodes.push_back(&node);
-  }
-  size_t op_num = op_nodes.size();
-  for (size_t i = 0; i < op_num; i++) {
-    if (op_nodes[i]->IsOp()) continue;
-    std::unordered_set<std::string> follow_up_input_names;
-    for (size_t j = i + 1; j < op_num; j++) {
-      for (auto *in : op_nodes[j]->inputs) {
-        follow_up_input_names.insert(in->Name());
-      }
-    }
-    std::vector<Node *> filtered_subgraph_outlinks;
-    for (auto *out : op_nodes[i]->outputs) {
-      if (follow_up_input_names.count(out->Name())) {
-        filtered_subgraph_outlinks.push_back(out);
-      } else {
-        Agent(out).set_deleted(true);
-      }
-    }
-    // The filtered_subgraph_outlinks may be empty.
-    op_nodes[i]->outputs = filtered_subgraph_outlinks;
-  }
-}
-
-std::vector<std::vector<Node *>> SubgraphDetector::operator()() {
-  MarkNodesInsideSubGraph();
-  return ExtractSubGraphs();
-}
-
-// Mark the output variables inside a subgraph with the func.
-inline void MarkOutLinksInSubGraph(const Node *func) {
-  for (auto *var : func->outputs) {
-    Agent(var).set_marked(true);
-  }
-}
-
-void SubgraphDetector::MarkNodesInsideSubGraph() {
-  for (auto &node : framework::ir::GraphTraits::DFS(*graph_)) {
-    if (node_inside_subgraph_teller_(&node)) {
-      Agent(&node).set_marked(true);
-      if (node.IsOp()) {
-        // If a function is inside the sub-graph, mark all the output variables
-        // to be inside too, so that two marked functions will be inside a same
-        // sub-graph, lets take a example:  A_function->var->B_function, if
-        // A_function is marked, var should also be marked, so that B_function
-        // will be in the same sub-graph with A_function if B_function is
-        // marked.
-        MarkOutLinksInSubGraph(&node);
-      }
-    }
-  }
-}
-
-// Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node
-// a's output is node b, that is a and b is in the same sub-graph. The UF
-// algorithm will group them to the same cluster.
-using node_map_t = std::unordered_map<int, Node *>;
-// Find the ancestor id of a node.
-int UnionFindGetAncestor(const node_map_t &node_map, size_t id) {
-  int tmp = id;
-  do {
-    tmp = Agent(node_map.at(tmp)).union_find_parent();
-  } while (Agent(node_map.at(tmp)).union_find_parent() != tmp);
-  return tmp;
-}
-// Make this two node share the same ancestor.
-// TODO(Superjom) bad performance, make a balanced tree latter.
-void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
-  int a_ancestor = UnionFindGetAncestor(node_map, a);
-  int b_ancestor = UnionFindGetAncestor(node_map, b);
-  Agent(node_map.at(b_ancestor)).set_union_find_parent(a_ancestor);
-  Agent(node_map.at(a)).set_union_find_parent(a_ancestor);
-  Agent(node_map.at(b)).set_union_find_parent(a_ancestor);
-}
-
-// This is a simple representation of a graph.
-// The BriefNode hold the pointer of the Node.
-// This is to avoid changing the original graph
-// in the process of trt graph analysis.
-struct BriefNode {
-  explicit BriefNode(Node *n) { node = n; }
-  Node *node;
-  std::vector<BriefNode *> inlinks;
-  std::vector<BriefNode *> outlinks;
-};
-
-// Union two adjacent BriefNode.
-// Suppose we have two adjacent nodes src and dst.
-// We will perform the following operations:
-// 1. add all inputs(except src) of dst to src inlinks.
-// 2. add all outputs of dst to src outlinks.
-// 3. change all the dst's inputs and outputs
-// corresponding inlinks and outlinks to src node.
-// 4. delete all dst's inlinks and outlinks.
-void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
-                          int src_id, int dst_id) {
-  // merge the two adjacent nodes into one node.
-  BriefNode *src_node = node_map.at(src_id);
-  BriefNode *dst_node = node_map.at(dst_id);
-
-  std::unordered_set<BriefNode *> inputs(src_node->inlinks.begin(),
-                                         src_node->inlinks.end());
-  std::unordered_set<BriefNode *> outputs;
-
-  for (auto *n : src_node->outlinks) {
-    if (n != dst_node) outputs.insert(n);
-  }
-
-  // Add the inlinks and outlinks of dst node to src node.
-  std::vector<BriefNode *> dst_in_nodes = dst_node->inlinks;
-  for (BriefNode *node : dst_in_nodes) {
-    if (node != src_node) {
-      inputs.insert(node);
-    }
-  }
-
-  std::vector<BriefNode *> dst_out_nodes = dst_node->outlinks;
-  for (BriefNode *node : dst_out_nodes) {
-    outputs.insert(node);
-  }
-
-// update the dst and src node's inlinks and outlinks.
-#ifdef __clang__
-  src_node->inlinks = std::vector<BriefNode *>(inputs.begin(), inputs.end());
-  src_node->outlinks = std::vector<BriefNode *>(outputs.begin(), outputs.end());
-  dst_node->inlinks.clear();
-  dst_node->outlinks.clear();
-#else
-  src_node->inlinks =
-      std::move(std::vector<BriefNode *>(inputs.begin(), inputs.end()));
-  src_node->outlinks =
-      std::move(std::vector<BriefNode *>(outputs.begin(), outputs.end()));
-  dst_node->inlinks.clear();
-  dst_node->outlinks.clear();
-#endif
-
-  auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) {
-    for (auto *&n : nodes) {
-      if (n == src_node || n == dst_node) {
-        n = src_node;
-      }
-    }
-  };
-  // Change all the dst inputs and outputs corresponding inlink and
-  // outlink to the src node.
-  for (auto *node : src_node->inlinks) {
-    inlink_or_outlink_cleaner(node->outlinks);
-  }
-
-  for (auto *node : src_node->outlinks) {
-    inlink_or_outlink_cleaner(node->inlinks);
-  }
-}
-
-// FlexibleDFS
-// If reverse is true, do reverse dfs.
-// If enter func is not nullptr, calls enter(node) before visiting any children
-// of node.
-// If leave func not nullptr, calls leave(node) after visiting all parents of
-// node.
-void FlexibleDFS(const std::vector<BriefNode *> &source, bool reverse,
-                 const std::function<bool(const BriefNode *)> &enter,
-                 const std::function<bool(const BriefNode *)> &leave) {
-  typedef struct {
-    const BriefNode *node;
-    bool leave;
-  } FNode;
-
-  std::vector<FNode> stack;
-  for (auto &node : source) {
-    stack.push_back(FNode{node, false});
-  }
-  std::unordered_set<const BriefNode *> visited;
-  while (!stack.empty()) {
-    auto fnode = stack.back();
-    stack.pop_back();
-
-    if (fnode.leave) {
-      if (leave && !leave(fnode.node)) return;
-    }
-    if (visited.count(fnode.node)) continue;
-    visited.insert(fnode.node);
-
-    if (enter && !enter(fnode.node)) return;
-
-    if (leave) stack.push_back(FNode{fnode.node, true});
-    const std::vector<BriefNode *> iter_nodes =
-        reverse == true ? fnode.node->inlinks : fnode.node->outlinks;
-    for (const BriefNode *node : iter_nodes) {
-      if (!visited.count(node)) {
-        stack.push_back(FNode{node, false});
-      }
-    }
-  }
-}
-
-std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubGraphs() {
-  // Run the Extract algorithm to find all subgraphs.
-  std::vector<Node *> marked_nodes;
-  //  We use brief_node_map to represent the original graph in order to avoid
-  //  changing the original graph.
-  std::unordered_map<int, BriefNode *> brief_node_map;
-
-  std::unordered_set<int32_t> valid_node_ids;
-  for (auto *node : graph_->Nodes()) {
-    valid_node_ids.insert(node->id());
-  }
-
-  for (auto &node : framework::ir::GraphTraits::TS(*graph_)) {
-    brief_node_map[node.id()] = new BriefNode(&node);
-    if (Agent(&node).marked()) {
-      marked_nodes.push_back(&node);
-    }
-  }
-
-  // extract sub-graphs in the marked node set, use Union Find algorithm.
-  node_map_t node_map;  // id to ptr
-  for (auto *n : marked_nodes) {
-    // n's parent == n.id means it is the ancestor
-    Agent(n).set_union_find_parent(n->id());
-    node_map[n->id()] = n;
-  }
-
-  // create breif node map
-  for (auto &itr : brief_node_map) {
-    for (Node *node : itr.second->node->inputs) {
-      if (!valid_node_ids.count(node->id())) {
-        LOG(INFO) << "invalid node id " << node->id();
-        continue;
-      }
-      itr.second->inlinks.push_back(brief_node_map.at(node->id()));
-    }
-
-    for (Node *node : itr.second->node->outputs) {
-      if (!valid_node_ids.count(node->id())) {
-        LOG(INFO) << "invalid node id " << node->id();
-        continue;
-      }
-      itr.second->outlinks.push_back(brief_node_map.at(node->id()));
-    }
-  }
-
-  for (auto &itr : brief_node_map) {
-    BriefNode *brief_node = itr.second;
-
-    if (!Agent(brief_node->node).marked()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candidate.";
-      continue;
-    }
-
-    //  Our algorithm must guarantee that:
-    //  1. The graph is always directed acyclic graph（DAG）.
-    //  2. If there is a path in the subgraph from X to Y (X and Y are both
-    //  nodes in the subgraph), then all paths from X to Y are in the
-    //  subgraph.
-    //
-    //  In order to achieve the above guarantee.
-    //  For adjacent nodes src -> dst.
-    //  1. Get all dst input nodes except src.
-    //  2. Reverse DFS from those input nodes
-    //  3. If there is a path from input nodes to src,
-    //  then the src and dst nodes can not be fused into one node,
-    //  otherwise it can be done.
-
-    while (true) {
-      std::unordered_set<BriefNode *> contract_nodes;
-      for (auto *out : brief_node->outlinks) {
-        // must be an trt candidate
-        if (!Agent(out->node).marked()) continue;
-        // get all dst input nodes except src.
-        std::vector<BriefNode *> source_nodes;
-        for (auto *n : out->inlinks) {
-          if (n != brief_node) {
-            source_nodes.push_back(n);
-          }
-        }
-
-        // Reverse DFS from the source_nodes.
-        bool have_excess_path = false;
-        FlexibleDFS(source_nodes, true, nullptr,
-                    [&have_excess_path, brief_node](const BriefNode *n) {
-                      if (n == brief_node) {
-                        have_excess_path = true;
-                        return false;
-                      }
-                      return true;
-                    });
-        if (have_excess_path) continue;
-        contract_nodes.insert(out);
-      }
-      if (contract_nodes.empty()) break;
-
-      for (auto dst_node : contract_nodes) {
-        UnionFindCombine(node_map, brief_node->node->id(),
-                         dst_node->node->id());
-        UnionContractedNodes(brief_node_map, brief_node->node->id(),
-                             dst_node->node->id());
-      }
-    }
-  }
-
-  std::unordered_map<int /*ancestor*/, std::vector<Node *>> clusters;
-  for (auto *n : marked_nodes) {
-    if (n->IsOp()) {
-      clusters[UnionFindGetAncestor(node_map, Agent(n).union_find_parent())]
-          .push_back(n);
-    }
-  }
-  std::vector<std::vector<Node *>> result;
-  std::for_each(clusters.begin(), clusters.end(),
-                [&](const decltype(clusters)::value_type &it) {
-                  result.push_back(it.second);
-                });
-
-  return result;
-}
-
-void SubGraphFuser::operator()() { ReplaceNodesWithSubGraphs(); }
-
-void RemoveIntermediateOutputInSubgraph(const std::vector<Node *> &subgraph,
-                                        Graph *graph,
-                                        std::vector<Node *> *outputs) {
-  std::unordered_set<Node *> subgraph_set(subgraph.begin(), subgraph.end());
-  std::unordered_set<Node *> valid_output;
-
-  for (auto *output : *outputs) {
-    int num_used = 0;
-    for (auto *node : output->outputs) {
-      if (!subgraph_set.count(node)) ++num_used;
-      if (num_used > 0) valid_output.insert(output);
-    }
-  }
-
-  // In use for ngraph subgraph pass for parallel executor,
-  // this will remove all nodes, bypass this and let ngraph
-  // subgraph pass to process outputs
-  if (FLAGS_use_ngraph && valid_output.size() == 0) return;
-
-  outputs->assign(valid_output.begin(), valid_output.end());
-}
-
-void DetachDeletedNodes(framework::ir::Graph *graph) {
-  std::unordered_set<const Node *> nodes;
-  for (auto *node : graph->Nodes()) {
-    if (Agent(node).deleted()) {
-      node->inputs.clear();
-      node->outputs.clear();
-    }
-  }
-}
-
-void SubGraphFuser::ReplaceNodesWithSubGraphs() {
-  auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
-  for (auto &subgraph : subgraphs) {
-    if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
-    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
-    // replace this sub-graph with the first node. Two steps: 1. Create a Block
-    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
-    // as deleted. 3. Replace the deleted node with the new Block Node.
-    framework::OpDesc empty_desc;
-    empty_desc.SetType(name_);
-    auto *block_node = graph_->CreateOpNode(&empty_desc);
-    Agent(block_node).set_subgraph({});
-    auto io = ExtractInputAndOutputOfSubGraph(subgraph);
-    block_node->inputs = std::move(io.first);
-    block_node->outputs = std::move(io.second);
-
-    RemoveIntermediateOutputInSubgraph(subgraph, graph_, &block_node->outputs);
-
-    for (auto *node : subgraph) {
-      // TODO(Superjomn) need a unified mechanism to treat deleted node in each
-      // pass.
-      Agent(node).set_deleted(true);
-      Agent(block_node).subgraph()->push_back(node);
-    }
-
-    // Change all the sub-graph's inputs and outputs corresponding inlink and
-    // outlink to this sub-graph node.
-    auto inlink_or_outlink_cleaner = [&](std::vector<Node *> &nodes) {
-      for (auto *&n : nodes) {
-        if (subgraph_uniq.count(n)) {
-          n = block_node;
-        }
-      }
-      std::unordered_set<Node *> uniq(nodes.begin(), nodes.end());
-      nodes.assign(uniq.begin(), uniq.end());
-    };
-    for (auto *i : block_node->inputs) {
-      inlink_or_outlink_cleaner(i->outputs);
-    }
-    for (auto *&o : block_node->outputs) {
-      inlink_or_outlink_cleaner(o->inputs);
-    }
-  }
-  // DetachDeletedNodes(graph_);
-  FilterRedundantOutputOfSubGraph(graph_);
-}
-
-inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
-  return node.inputs.size() == n;
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
deleted file mode 100644
index 26201541f67e3bf8546bc38dbf6823a3dc05a3ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file defines the the class to partition a graph.
- */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_traits.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/inference/analysis/argument.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-using framework::ir::Graph;
-using framework::ir::NodesTSIterator;
-
-const char kIsFunctionNode[] = "__is_function_node__";
-const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
-const char kSubgraphSplitterMarkerAttrName[] =
-    "_sub_graph_splitter_inside_sub_graph";
-
-/*
- * Detect the nodes in a sub-graph that meet some conditions. This class doesn't
- * modify the graph.
- */
-class SubgraphDetector {
- public:
-  // Tell whether a node is inside a sub-graph.
-  using NodeInsideSubgraphTeller =
-      std::function<bool(const framework::ir::Node *)>;
-
-  SubgraphDetector(Graph *graph, const NodeInsideSubgraphTeller &teller)
-      : graph_(graph), node_inside_subgraph_teller_(teller) {}
-
-  std::vector<std::vector<framework::ir::Node *>> operator()();
-
- protected:
-  // Mark the nodes inside the accepted sub-graph using
-  // node_inside_subgraph_teller.
-  void MarkNodesInsideSubGraph();
-
-  // Merge the marked nodes into sub-graphs and return the sub-graphs.
-  std::vector<std::vector<framework::ir::Node *>> ExtractSubGraphs();
-
- private:
-  Graph *graph_;
-  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
-};
-
-/*
- * SubGraphFuser - Replace some nodes with the sub-graph node they are inside.
- * To some extent, the TensorRT engine is just a fusion op for a model.
- */
-class SubGraphFuser {
- public:
-  using NodeInsideSubgraphTeller = SubgraphDetector::NodeInsideSubgraphTeller;
-
-  SubGraphFuser(Graph *graph, const NodeInsideSubgraphTeller &teller,
-                int min_subgraph_size, std::string name = "anakin_engine")
-      : graph_(graph),
-        node_inside_subgraph_teller_(teller),
-        min_subgraph_size_{min_subgraph_size},
-        name_{name} {}
-
-  // The main method which run all the logic.
-  void operator()();
-
- protected:
-  // Remove the nodes inside sub-graphs and replace with the SubGraphNode.
-  void ReplaceNodesWithSubGraphs();
-
- private:
-  Graph *graph_;
-  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
-  int min_subgraph_size_;
-  const std::string name_;
-};
-
-struct NodeWrapper {
-  bool deleted{false};
-  bool marked{false};
-  int union_find_parent{-1};
-  std::vector<framework::ir::Node *> subgraph;
-};
-
-/*
- * ir::Node agent for subgraph detector.
- */
-struct Agent {
-  explicit Agent(framework::ir::Node *x) : x_(x) {}
-
-  NodeWrapper &wrapper() {
-    if (!x_->IsWrappedBy<NodeWrapper>()) {
-      x_->WrappedBy<NodeWrapper>(new NodeWrapper);
-    }
-    return x_->template Wrapper<NodeWrapper>();
-  }
-
-  bool deleted() { return wrapper().deleted; }
-  void set_deleted(bool x) { wrapper().deleted = x; }
-
-  bool marked() { return wrapper().marked; }
-  void set_marked(bool x) { wrapper().marked = x; }
-
-  void set_subgraph(const std::vector<framework::ir::Node *> &x) {
-    wrapper().subgraph = x;
-  }
-
-  int union_find_parent() { return wrapper().union_find_parent; }
-  void set_union_find_parent(int v) { wrapper().union_find_parent = v; }
-
-  std::vector<framework::ir::Node *> *subgraph() { return &wrapper().subgraph; }
-  std::vector<framework::ir::Node *> &inputs() { return x_->inputs; }
-  std::vector<framework::ir::Node *> &outputs() { return x_->outputs; }
-
- private:
-  framework::ir::Node *x_;
-};
-
-// The nodes those have no input will be treated as start points.
-static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
-  std::vector<framework::ir::Node *> result;
-  for (auto *node : g.Nodes()) {
-    if (node->inputs.empty()) {
-      result.push_back(node);
-    }
-  }
-  return result;
-}
-
-static iterator_range<NodesTSIterator> TopologicalSort(const Graph &g) {
-  auto start_points = ExtractStartPoints(g);
-  PADDLE_ENFORCE(!start_points.empty());
-  NodesTSIterator x(start_points);
-  return iterator_range<NodesTSIterator>(NodesTSIterator(start_points),
-                                         NodesTSIterator());
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
deleted file mode 100644
index e16cce54c24c2412d3df71e86b23a24329cb61b7..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file defines the the class to partition a graph.
- */
-
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
-#include <algorithm>
-#include <string>
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-using framework::ir::Node;
-
-std::vector<std::string> ExtractParameters(
-    const std::unordered_set<Node *> &nodes) {
-  // We can judge whether a variable is a parameter by
-  // its presistable property, but sometimes the presistable
-  // of the feed op output is true, so we have to identify it.
-  std::vector<std::string> feed_outputs;
-  for (const auto &node : nodes) {
-    if (!node->IsOp()) continue;
-    std::string op_type = node->Op()->Type();
-    if (op_type == "feed" || op_type == "fetch") {
-      std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
-      std::copy(output_names.begin(), output_names.end(),
-                std::back_inserter(feed_outputs));
-    }
-  }
-
-  std::vector<std::string> parameters;
-  for (const auto &node : nodes) {
-    if (!node->IsVar()) continue;
-    if (node->Var()->Persistable() &&
-        std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
-            feed_outputs.end()) {
-      parameters.push_back(node->Name());
-    }
-  }
-  return parameters;
-}
-
-void RenameAndGetOutputs(
-    const std::vector<framework::ir::Node *> &subgraph_nodes,
-    framework::BlockDesc *block_desc,
-    const std::set<std::string> &input_names_with_id,
-    std::set<std::string> *output_names_with_id,
-    std::set<std::string> *output_names,
-    std::unordered_map<std::string, std::string> *output_name_map,
-    const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
-    bool trt_and_not_int8) {
-  //// In the normal case, the paddle-trt exists bug when runing the googlenet.
-  // When there are more than two convolutions of 1 * 1 with the same input, the
-  // paddle-tensorrt will do the merging optimization, which fuse those conv
-  // into one conv, and then trigger bug. So,  We should use strategy to avoid
-  // this optimization for the time being. This bug will be fixed in the future.
-  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
-      same_hierarchy_conv2d_num_map;
-
-  auto add_block_var = [&](const std::string &graph_arg,
-                           const std::string &block_arg) {
-    auto arg_var_node = graph_var_map.find(graph_arg);
-    PADDLE_ENFORCE(arg_var_node != graph_var_map.end());
-    auto *var_t = block_desc->Var(block_arg);
-    var_t->SetShape(arg_var_node->second->Var()->GetShape());
-    var_t->SetDataType(arg_var_node->second->Var()->GetDataType());
-  };
-
-  for (size_t index = 0; index < block_desc->OpSize(); ++index) {
-    framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
-    framework::OpDesc op_desc(*op, nullptr);
-    auto correspond_node = subgraph_nodes[index];
-    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
-
-    std::unordered_map<std::string, size_t> var2id;
-    std::unordered_map<std::string, framework::ir::Node *> in_vars;
-    for (auto *in_var : correspond_node->inputs) {
-      var2id[in_var->Name()] = in_var->id();
-      in_vars[in_var->Name()] = in_var;
-    }
-    // rename for the input variables of op inside subgraph
-    for (int i = 0; i < op->inputs_size(); i++) {
-      // one input
-      auto *in_var = op->mutable_inputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
-        const std::string arg_value = in_var->arguments(k);
-        const std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (input_names_with_id.count(arg_value_with_id)) {
-          replaced_names.push_back(arg_value);
-          if (graph_var_map.count(arg_value)) {
-            add_block_var(arg_value, arg_value);
-          }
-        } else {
-          replaced_names.push_back(arg_value_with_id);
-          if (graph_var_map.count(arg_value)) {
-            add_block_var(arg_value, arg_value_with_id);
-          }
-        }
-      }
-      in_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        in_var->add_arguments(replaced_names[k]);
-      }
-    }
-    var2id.clear();
-    for (auto out_var : correspond_node->outputs) {
-      var2id[out_var->Name()] = out_var->id();
-    }
-    if (op_desc.Type() == "conv2d" && trt_and_not_int8) {
-      auto input_var_name = op_desc.Input("Input").front();
-      auto filter_var_name = op_desc.Input("Filter").front();
-      auto out_var_name = op_desc.Output("Output").front();
-      auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
-      const std::vector<int> strides =
-          boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-      const std::vector<int> paddings =
-          boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-      if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
-        (*output_names_with_id)
-            .insert(out_var_name + std::to_string(var2id[out_var_name]));
-        (*output_names).insert(out_var_name);
-      } else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
-                 strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
-                 paddings[1] == 0) {
-        same_hierarchy_conv2d_num_map[input_var_name] += 1;
-      }
-    }
-    // rename for the output variables of op inside subgraph
-    for (int i = 0; i < op->outputs_size(); i++) {
-      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < out_var->arguments_size(); k++) {
-        const std::string arg_value = out_var->arguments(k);
-        const std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (graph_var_map.count(arg_value)) {
-          add_block_var(arg_value, arg_value_with_id);
-        }
-        if (output_names_with_id->count(arg_value_with_id)) {
-          (*output_name_map)[arg_value] = arg_value_with_id;
-        }
-        replaced_names.push_back(arg_value_with_id);
-      }
-      out_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        out_var->add_arguments(replaced_names[k]);
-      }
-    }
-  }
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
deleted file mode 100644
index 444e1984cf8ee52a84e918874e2279b92f78f88e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file defines the the class to partition a graph.
- */
-
-#pragma once
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_traits.h"
-#include "paddle/fluid/framework/ir/node.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-using framework::ir::Node;
-
-std::vector<std::string> ExtractParameters(
-    const std::unordered_set<Node *> &nodes);
-
-void RenameAndGetOutputs(
-    const std::vector<framework::ir::Node *> &subgraph_nodes,
-    framework::BlockDesc *block_desc,
-    const std::set<std::string> &input_names_with_id,
-    std::set<std::string> *output_names_with_id,
-    std::set<std::string> *output_names,
-    std::unordered_map<std::string, std::string> *output_name_map,
-    const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
-    bool trt_and_not_int8 = false);
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
deleted file mode 100644
index bd2f79a12aa6640c127b17e6f8c82fb23c2fedc0..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ /dev/null
@@ -1,318 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <map>
-#include <set>
-
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
-#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/inference/tensorrt/op_teller.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-using framework::ir::Node;
-
-void analysis::TensorRtSubgraphPass::ApplyImpl(
-    framework::ir::Graph *graph) const {
-  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph);
-
-  auto teller = [](const framework::ir::Node *node) {
-    if (!node->IsOp() || !node->Op()) return false;
-    return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
-  };
-
-  SubGraphFuser fuser(graph, teller,
-                      Get<int>("min_subgraph_size") /*min subgraph size*/,
-                      "tensorrt_engine");
-  fuser();
-
-  std::vector<std::string> graph_param_names =
-      ExtractParameters(graph->Nodes());
-  // those parameter already exist in trt, and should not have another copy in
-  // fluid.
-  std::vector<std::string> repetitive_params;
-
-  for (auto *node : graph->Nodes()) {
-    if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params);
-
-      std::unordered_set<const Node *> nodes2remove(
-          Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
-      framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
-    }
-  }
-
-  std::unordered_set<const Node *> nodes2remove;
-  for (auto *node : graph->Nodes()) {
-    if (node->IsOp() && Agent(node).deleted()) {
-      nodes2remove.insert(node);
-    }
-  }
-  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
-  graph->Set(framework::ir::kRepetitiveParamAttr,
-             new std::vector<std::string>(repetitive_params));
-}
-
-std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
-                              const std::set<std::string> &engine_outputs,
-                              const std::string &predictor_id) {
-  std::string engine_hash_key = "";
-  for (auto name : engine_inputs) {
-    engine_hash_key += name;
-  }
-  for (auto name : engine_outputs) {
-    engine_hash_key += name;
-  }
-  engine_hash_key += predictor_id;
-  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
-  return engine_key;
-}
-
-void TensorRtSubgraphPass::CreateTensorRTOp(
-    framework::ir::Node *node, Graph *graph,
-    const std::vector<std::string> &graph_params,
-    std::vector<std::string> *repetitive_params) const {
-  auto *op_desc = node->Op();
-  auto &subgraph = *Agent(node).subgraph();
-  PADDLE_ENFORCE(!subgraph.empty());
-
-  framework::ProgramDesc *program_desc =
-      Get<framework::ProgramDesc *>("program");
-  // Add new block for TensorRTEngineOP
-  const framework::BlockDesc &main_block =
-      program_desc->Block(framework::kRootBlockIndex);
-  // const framework::BlockDesc& main_block = program_desc->Block(0);
-  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
-
-  // A fake block desc.
-  framework::proto::BlockDesc block_proto;
-  framework::BlockDesc block_desc(nullptr, &block_proto);
-  block_desc.Proto()->set_parent_idx(-1);
-  block_desc.Proto()->set_idx(0);
-  string::PrettyLogDetail("---  detect a sub-graph with %d nodes",
-                          subgraph.size());
-
-  for (auto *node : subgraph) {
-    auto *new_block_op = new_block->AppendOp();
-    auto *op = block_desc.AppendOp();
-    *new_block_op->Proto() = *node->Op()->Proto();
-    *op->Proto() = *node->Op()->Proto();
-  }
-
-  // Then, we will use the input_names_with_id and output_names_with_id to
-  // generate the engine key.
-  // So, We use set instead of unordered_set here to ensure that the engine key
-  // is unique.
-  std::set<std::string> input_names;
-  std::set<std::string> input_names_with_id;
-  std::vector<std::string> params;
-  // if we delete fluid copy of params shared by more than 1 ops, there will be
-  // problem, so we filter them out.
-  std::vector<std::string> params_not_shared;
-
-  // The node->inputs contains input tensors and parameters.
-  for (auto *x : node->inputs) {
-    input_names.insert(x->Name());
-    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
-    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
-      params.push_back(x->Name());
-    }
-    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 &&
-        x->outputs.size() <= 1) {
-      params_not_shared.push_back(x->Name());
-    }
-  }
-
-  std::set<std::string> output_names;
-  std::set<std::string> output_names_with_id;
-  for (auto *x : node->outputs) {
-    output_names.insert(x->Name());
-    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
-  }
-
-  std::unordered_map<std::string, std::string> output_name_map;
-  std::unordered_map<std::string, framework::ir::Node *> graph_var_map;
-
-  for (framework::ir::Node *node : graph->Nodes()) {
-    if (node->IsVar() && node->Var()) {
-      graph_var_map[node->Name()] = node;
-    }
-  }
-  auto precision_mode = Get<AnalysisConfig::Precision>("precision_mode");
-  bool enable_fp16 = false;
-  if (precision_mode == AnalysisConfig::Precision::kHalf) enable_fp16 = true;
-  auto enable_int8 = Get<bool>("enable_int8");
-  auto use_calib_mode = Get<bool>("use_calib_mode");
-  auto &subgraph_nodes = *Agent(node).subgraph();
-
-  // The following procedure is used to rename all the intermediate
-  // variables and the output variables of the subgraph.
-  // Why we do this?
-  // During the transition from fluid OP to tensorrt OP, we map
-  // the input and output Tensor(fluid data structure) of fluid OP
-  // to the corresponding ITensor (trt data structure) through the
-  // Tensor name. When we set up ITensor for an variable, we must
-  // ensure that it has not been set before.
-  // If there is variable in the fluid graph, which is not only the
-  // input of a OP, but also the output of a Op, there will be problems.
-  // So we have to rename the variable in the subgraph to make sure
-  // it is either an OP's input or an OP's output.
-  RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
-                      &output_names_with_id, &output_names, &output_name_map,
-                      graph_var_map, !enable_int8);
-
-  // When tensorrt engine runs at the end of the operation,
-  // output_mapping help us copy the data from the renamed ITensor
-  // to Tensor.
-  std::vector<std::string> output_mapping;
-  for (auto name : output_names) {
-    PADDLE_ENFORCE(output_name_map.count(name) != 0);
-    output_mapping.push_back(output_name_map[name]);
-  }
-  PADDLE_ENFORCE(!output_mapping.empty());
-  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
-                 "the block has no var-desc");
-
-  // Set attrs
-  op_desc->SetType("tensorrt_engine");
-  op_desc->SetInput(
-      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
-
-  op_desc->SetOutput(
-      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-
-  op_desc->SetBlockAttr("sub_block", new_block);
-  op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
-  op_desc->SetAttr("max_batch_size", Get<int>("max_batch_size"));
-  op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
-  op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
-  op_desc->SetAttr("output_name_mapping", output_mapping);
-  op_desc->SetAttr("parameters", params);
-
-  // we record all inputs' shapes in attr to check if they are consistent
-  // with the real inputs' shapes retrieved from scope when trt runs.
-  for (auto *x : node->inputs) {
-    if (x->IsVar() && x->Var()) {
-      framework::VarDesc *var = x->Var();
-      SetAttr(op_desc->Proto(), var->Name() + "_shape", var->GetShape());
-    }
-  }
-
-  auto use_static_engine = Get<bool>("use_static_engine");
-  // TODO(NHZlX)
-  // There are models with the same structure but the different parameters,
-  // when runing in the 'use_serialize' mode, there is a bug.
-  auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
-                                      std::to_string(0));
-  auto predictor_id = Get<int>("predictor_id");
-
-  // Get "" when there is no cached calibration table data.
-  bool load_from_memory = Get<bool>("model_from_memory");
-  std::string calibration_data = "";
-  if (enable_int8 && use_calib_mode) {
-    calibration_data = GetTrtCalibTableData(
-        Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
-  }
-  op_desc->SetAttr("calibration_data", calibration_data);
-  op_desc->SetAttr("enable_int8", enable_int8);
-  op_desc->SetAttr("enable_fp16", enable_fp16);
-  op_desc->SetAttr("use_calib_mode", use_calib_mode);
-  op_desc->SetAttr("engine_key", engine_key);
-  op_desc->SetAttr("predictor_id", predictor_id);
-
-  std::string trt_engine_serialized_data = "";
-  op_desc->SetAttr("engine_serialized_data", trt_engine_serialized_data);
-  op_desc->Flush();
-
-  std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
-  if (enable_int8 && calibration_data.size() != 0) {
-    calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
-    LOG(INFO) << "RUN Paddle TRT int8 calibration mode...";
-  }
-  // When in int8 mode and calibration_mode, the program just produce the
-  // calibration table data.
-  bool calibration_mode =
-      (enable_int8 && calibration_data.size() == 0 && use_calib_mode);
-  if (calibration_mode) {
-    // calibraion mode means generate int8 calibration table data process.
-    return;
-  }
-
-  std::copy(params_not_shared.begin(), params_not_shared.end(),
-            std::back_inserter(*repetitive_params));
-
-  tensorrt::TensorRTEngine *trt_engine =
-      inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-          .Create(engine_key + std::to_string(predictor_id),
-                  Get<int>("max_batch_size"), Get<int>("workspace_size"),
-                  precision_mode, calibrator.get(), Get<int>("gpu_device_id"));
-
-  bool need_serialize = (use_static_engine && !load_from_memory);
-  if (need_serialize) {
-    trt_engine_serialized_data = GetTrtEngineSerializedData(
-        Get<std::string>("model_opt_cache_dir"), engine_key);
-    // we can load the engine info serialized before from the disk.
-    if (!trt_engine_serialized_data.empty()) {
-      trt_engine->Deserialize(trt_engine_serialized_data);
-      LOG(INFO) << "Load TRT Optimized Info from "
-                << GetTrtEngineSerializedPath(
-                       Get<std::string>("model_opt_cache_dir"), engine_key);
-      return;
-    }
-  }
-
-  // the following code will NOT run in following situation:
-  // 1. calibraion mode (generate trt int8 calibraiton table data)
-  // 2. already load serialized trt engine info.
-  LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
-               "kernel etc). This process may cost a lot of time.";
-
-  auto *scope = param_scope();
-  framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
-  std::unordered_set<std::string> param_set(params.begin(), params.end());
-  inference::Singleton<inference::tensorrt::OpConverter>::Global()
-      .ConvertBlockToTRTEngine(
-          &block_desc_temp, *scope,
-          std::vector<std::string>(input_names.begin(), input_names.end()),
-          param_set, output_mapping, trt_engine);
-
-  if (need_serialize) {
-    nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
-    trt_engine_serialized_data =
-        std::string((const char *)serialized_engine_data->data(),
-                    serialized_engine_data->size());
-    SaveTrtEngineSerializedDataToFile(
-        GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
-                                   engine_key),
-        trt_engine_serialized_data);
-  }
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_PASS(tensorrt_subgraph_pass,
-              paddle::inference::analysis::TensorRtSubgraphPass)
-    .RequirePassAttr("max_batch_size")
-    .RequirePassAttr("workspace_size")
-    .RequirePassAttr("min_subgraph_size");
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
deleted file mode 100644
index b6b67ce8eceb40cbf0aa98fe56684d76ce6c9602..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
-#include "paddle/fluid/inference/api/paddle_analysis_config.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-class TensorRtSubgraphPass : public framework::ir::FusePassBase {
- public:
-  void ApplyImpl(framework::ir::Graph *graph) const override;
-
- private:
-  void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
-                        const std::vector<std::string> &graph_params,
-                        std::vector<std::string> *repetitive_params) const;
-  void CleanIntermediateOutputs(framework::ir::Node *node);
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
deleted file mode 100644
index 1c878d66ba97a13e14d341d08943dfe8c78228a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor)
-cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
-cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass)
-cc_library(inference_op_replace_pass SRCS inference_op_replace_pass.cc DEPS analysis_pass graph_to_program_pass)
-cc_library(ir_graph_clean_pass SRCS ir_graph_clean_pass.cc DEPS analysis_pass)
-
-cc_library(analysis_passes SRCS passes.cc DEPS
-  ir_graph_build_pass
-  ir_analysis_pass
-  ir_params_sync_among_devices_pass
-  adjust_cudnn_workspace_size_pass
-  memory_optim_pass
-  inference_op_replace_pass
-  ir_graph_to_program_pass
-  ir_graph_clean_pass
-)
-
-set(analysis_deps ${analysis_deps}
-        analysis_passes
-        subgraph_detector
-        CACHE INTERNAL "")
diff --git a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc
deleted file mode 100644
index 0470e0d5a247163ecd7e7dd1e8f88e6b71ae93d7..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void AdjustCudnnWorkSpacePass::RunImpl(Argument* argument) {
-  if (!argument->use_gpu()) return;
-  auto& graph = argument->main_graph();
-  auto nodes = graph.Nodes();
-  const int cudnn_workspace_size_MB = 64;
-  const std::string attr_name = "workspace_size_MB";
-
-  for (auto& node : nodes) {
-    if (!node->IsOp()) continue;
-    auto* op_desc = node->Op();
-    if (!op_desc->HasAttr(attr_name)) continue;
-    op_desc->SetAttr(attr_name, cudnn_workspace_size_MB);
-    op_desc->Flush();
-  }
-}
-
-std::string AdjustCudnnWorkSpacePass::repr() const {
-  return "adjust-cudnn-work-space-pass";
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h
deleted file mode 100644
index 65d1c545313e110028a92776e73a070d32010420..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * The default cudnn workspace is 4G, we set it to 64M in this pass, which
- * is applicable for most inference tasks.
- */
-class AdjustCudnnWorkSpacePass : public AnalysisPass {
- public:
-  void RunImpl(Argument *argument) override;
-  std::string repr() const override;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
deleted file mode 100644
index 86ced982d34d80e38e24650c0d687152ab5e3dcb..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h"
-#include <unordered_map>
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void InferenceOpReplacePass::RunImpl(Argument* argument) {
-  std::unordered_map<std::string, std::string> replaced_map{
-      {"conditional_block", "conditional_block_infer"},
-      {"merge_lod_tensor", "merge_lod_tensor_infer"},
-  };
-
-  auto& graph = argument->main_graph();
-  auto nodes = graph.Nodes();
-
-  for (auto& node : nodes) {
-    if (!node->IsOp()) continue;
-    auto* op_desc = node->Op();
-    std::string op_type = op_desc->Type();
-    if (!replaced_map.count(op_type)) continue;
-    op_desc->SetType(replaced_map[op_type]);
-    op_desc->Flush();
-  }
-}
-
-std::string InferenceOpReplacePass::repr() const {
-  return "inference-op-replace-pass";
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h
deleted file mode 100644
index 7fbdd88e014ba83277e45798b98ab90af3191f99..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * There are some ops (while, conditional_block_op etc) which have different
- * optimization points under predicion and training conditions.
- * So, We added the corresponding inference impl to these ops separately.
- * This pass replaces these ops with corresponding inference ops.
- */
-class InferenceOpReplacePass : public AnalysisPass {
- public:
-  void RunImpl(Argument *argument) override;
-  std::string repr() const override;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
deleted file mode 100644
index d986811a827b6ed477b30bc43d26f52a71e8f178..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void IrAnalysisPass::RunImpl(Argument* argument) {
-  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
-  ARGUMENT_CHECK_FIELD(argument, main_program);
-  ARGUMENT_CHECK_FIELD(argument, scope);
-
-  auto* the_graph = argument->ReleaseMainGraph();
-  auto graph = std::unique_ptr<Graph>(the_graph);
-
-  // Apply passes.
-  IRPassManager the_ir_manager(argument);
-  graph = the_ir_manager.Apply(std::move(graph));
-  PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
-  argument->SetMainGraph(graph.release());
-  CollectFusionStatis(argument);
-}
-
-void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
-  if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
-    LOG(INFO) << "argument has no fuse statis";
-    return;
-  }
-  argument->SetFusionStatis(
-      argument->main_graph().Get<Argument::fusion_statis_t>(
-          framework::ir::kFuseStatisAttr));
-}
-
-std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; }
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
deleted file mode 100644
index 2c2113c06d917b5473e68935889e4a7b16b6cfc1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * Perform IR analysis passes.
- *
- * It is used to fuse some
- */
-class IrAnalysisPass : public AnalysisPass {
- public:
-  void RunImpl(Argument* argument) override;
-
-  void CollectFusionStatis(Argument* argument);
-
-  std::string repr() const override;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
deleted file mode 100644
index 970ecdbbeb0c4c12ce6ba928a74a14ca1ae183ca..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-
-extern void ReadBinaryFile(const std::string &filename, std::string *contents);
-
-namespace analysis {
-
-void IrGraphBuildPass::RunImpl(Argument *argument) {
-  if (!argument->scope_valid()) {
-    argument->SetScope(new framework::Scope);
-  }
-  PADDLE_ENFORCE(argument->use_gpu_valid());
-
-  // The load program should run on the same device with the inference program,
-  // so that the parameters will on the same device, or they will keep copying
-  // between difference devices.
-  platform::Place place;
-  place = platform::CPUPlace();
-
-  if (argument->model_dir_valid()) {
-    auto program =
-        LoadModel(argument->model_dir(), argument->scope_ptr(), place);
-    argument->SetMainProgram(program.release());
-  } else if (argument->model_program_path_valid() &&
-             argument->model_params_path_valid()) {
-    auto program = LoadModel(
-        argument->model_program_path(), argument->model_params_path(),
-        argument->scope_ptr(), place,
-        argument->model_from_memory_valid() && argument->model_from_memory());
-    argument->SetMainProgram(program.release());
-  } else {
-    PADDLE_THROW(
-        "either model_dir or (program path and parameter path) should be set.");
-  }
-
-  auto graph = std::unique_ptr<Graph>(new Graph(argument->main_program()));
-  argument->SetMainGraph(graph.release());
-  auto *scope_ptr = argument->scope_ptr();
-  PADDLE_ENFORCE(scope_ptr);
-  argument->main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
-}
-
-std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
-    const std::string &path, framework::Scope *scope,
-    const platform::Place &place) {
-  framework::Executor exe(place);
-  return Load(&exe, scope, path);
-}
-
-std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
-    const std::string &program_path, const std::string &params_path,
-    framework::Scope *scope, const platform::Place &place,
-    bool model_from_memory) {
-  framework::Executor exe(place);
-  if (!model_from_memory) {
-    return Load(&exe, scope, program_path, params_path);
-  } else {
-    return LoadFromMemory(&exe, scope, program_path, params_path);
-  }
-}
-
-std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; }
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
deleted file mode 100644
index adbde0433fad28b006b18b47c8fd0a8946d21a98..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * Load program and parameter to memory from the disk or directly from memory.
- */
-class IrGraphBuildPass : public AnalysisPass {
- public:
-  void RunImpl(Argument *argument) override;
-
-  std::string repr() const override;
-
- private:
-  std::unique_ptr<framework::ProgramDesc> LoadModel(
-      const std::string &path, framework::Scope *scope,
-      const platform::Place &place);
-  std::unique_ptr<framework::ProgramDesc> LoadModel(
-      const std::string &program_path, const std::string &params_path,
-      framework::Scope *scope, const platform::Place &place,
-      bool model_from_memory);
-
-  std::string model_binary_str_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc
deleted file mode 100644
index 1f888a28da0416b41a87b551208fbe109f54d844..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h"
-#include <algorithm>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/node.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void IrInferCleanGraphPass::RunImpl(Argument* argument) {
-  auto& graph = argument->main_graph();
-  auto is_valid_node = [](framework::ir::Node* x) {
-    return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
-  };
-
-  std::unordered_set<const framework::ir::Node*> invalid_nodes;
-  int valid_op = 0;
-  for (auto* node : graph.Nodes()) {
-    PADDLE_ENFORCE_NOT_NULL(node);
-    if (is_valid_node(node)) {
-      invalid_nodes.insert(node);
-    } else if (node->IsOp()) {
-      ++valid_op;
-    }
-  }
-
-  GraphSafeRemoveNodes(&graph, invalid_nodes);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h
deleted file mode 100644
index a9d58aa2f4cbb5d135221b0d02c633f6f78c8190..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-class IrInferCleanGraphPass : public AnalysisPass {
- public:
-  void RunImpl(Argument *argument) override;
-
-  std::string repr() const override { return "ir_graph_clean_pass"; }
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
deleted file mode 100644
index 35df396fe89eb23317b8f086c668396fdb3a4559..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
-#include <memory>
-#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void IrGraphToProgramPass::RunImpl(Argument *argument) {
-  auto pass =
-      framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
-
-  if (argument->memory_optim_sort_kind_valid()) {
-    pass->Set(framework::ir::kGraphToProgramSortKind,
-              new int(argument->memory_optim_sort_kind()));
-  }
-
-  std::unique_ptr<Graph> graph(argument->main_graph_ptr());
-
-  // Direct using ProgramDesc desc(argument->main_program()) may cause
-  // incomplete copies of information.
-  framework::ProgramDesc desc;
-  desc.CopyFrom(*argument->main_program().Proto());
-  pass->SetNotOwned("program", &desc);
-  pass->Apply(graph.release());  // the argument still own the graph.
-
-  argument->SetIrAnalyzedProgram(
-      new framework::proto::ProgramDesc(*desc.Proto()));
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
deleted file mode 100644
index 838ebdbc9d71eb3a73882e4c4c3e4bf6078150e4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-class IrGraphToProgramPass : public AnalysisPass {
- public:
-  void RunImpl(Argument *argument) override;
-
-  std::string repr() const override { return "ir-graph-to-param-pass"; }
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
deleted file mode 100644
index fedee3ff95f0ffe7af730c7113dbe6ea33c118e5..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
-  PADDLE_ENFORCE(argument->scope_valid());
-  PADDLE_ENFORCE(argument->use_gpu_valid());
-
-  platform::Place place;
-
-  // The parameters are on the cpu, therefore, synchronization is not necessary.
-  if (!argument->use_gpu()) return;
-
-  auto &graph = argument->main_graph();
-  std::vector<std::string> repetitive_params;
-
-  if (graph.Has(framework::ir::kRepetitiveParamAttr))
-    repetitive_params = graph.Get<std::vector<std::string>>(
-        framework::ir::kRepetitiveParamAttr);
-
-  LOG(INFO) << "Sync params from CPU to GPU";
-
-  PADDLE_ENFORCE(argument->gpu_device_id_valid());
-  place = platform::CUDAPlace(argument->gpu_device_id());
-
-  auto *scope = argument->scope_ptr();
-  std::vector<std::string> all_vars = scope->LocalVarNames();
-
-  // We get all the vars from local_scope instead of the ProgramDesc.
-  // Because there exists the case that new parameter variables are not added to
-  // the program in the analysis pass.
-  for (auto &var_name : all_vars) {
-    if (std::count(repetitive_params.begin(), repetitive_params.end(),
-                   var_name)) {
-      scope->EraseVars({var_name});
-      continue;
-    }
-    auto *var = scope->FindLocalVar(var_name);
-    PADDLE_ENFORCE(var != nullptr);
-    if (var->IsType<framework::LoDTensor>() ||
-        var->IsType<framework::Tensor>()) {
-      auto *t = var->GetMutable<framework::LoDTensor>();
-
-      platform::CPUPlace cpu_place;
-      framework::LoDTensor temp_tensor;
-      temp_tensor.Resize(t->dims());
-      temp_tensor.mutable_data<float>(cpu_place);
-
-      // Copy the parameter data to a tmp tensor.
-      TensorCopySync(*t, cpu_place, &temp_tensor);
-      // Reallocation the space on GPU
-      t->clear();
-
-      // Copy parameter data to newly allocated GPU space.
-      TensorCopySync(temp_tensor, place, t);
-    }
-  }
-}
-
-std::string IrParamsSyncAmongDevicesPass::repr() const {
-  return "ir-params-sync-among-devices-pass";
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
deleted file mode 100644
index 61990150a30db147418c4301359428cf3c6db541..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * Sync parameter from CPU to GPU.
- */
-class IrParamsSyncAmongDevicesPass : public AnalysisPass {
- public:
-  void RunImpl(Argument *argument) override;
-  std::string repr() const override;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
deleted file mode 100644
index 6fbf880356c541e72cae6f3b03efe017042254ff..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ /dev/null
@@ -1,327 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
-#include <algorithm>
-#include <fstream>
-#include <functional>
-#include <limits>
-#include <map>
-#include <set>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
-#include "paddle/fluid/framework/ir/graph_traits.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-using framework::ir::Graph;
-using framework::ir::Node;
-using framework::ir::TopologyVarientSort;
-using space_table_t = MemoryOptimizePass::space_table_t;
-
-typedef struct {
-  std::string name;
-  size_t size;
-  int cluster;
-  std::pair<int, int> lifetime;
-  std::unordered_set<std::string> adj;
-} MemNode;
-
-// Collect the lifecycles of the tensors.
-// Traverse the graph in topological order.
-// The traversal order also affect the lifecycles, so different sort_kind is
-// used.
-void MemoryOptimizePass::CollectLifeCycle(
-    std::unordered_map<std::string, lifecycle_t>* lifecycles,
-    int sort_kind) const {
-  max_lifecycle_ = 0;
-  for (auto* op_node : framework::ir::TopologyVarientSort(
-           *graph_, static_cast<framework::ir::SortKind>(sort_kind))) {
-    if (!op_node->IsOp()) continue;
-    auto reads = op_node->inputs;
-    auto writes = op_node->outputs;
-
-    std::vector<Node*> requires(reads.begin(), reads.end());
-    requires.insert(requires.end(), writes.begin(), writes.end());
-
-    // Disable reuse of feed variables.
-    if (op_node->Name() == "feed") {
-      for (auto* node : op_node->outputs) {
-        auto var = node->Name();
-        lifecycles->emplace(var,
-                            std::make_pair(0, std::numeric_limits<int>::max()));
-      }
-    } else {
-      // Normal operators.
-      for (const Node* node : requires) {
-        if (node->Var()->Persistable()) continue;
-        std::string var = node->Name();
-        if (!lifecycles->count(var)) {
-          (*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_);
-        } else {
-          (*lifecycles)[var].second =
-              std::max(max_lifecycle_, lifecycles->at(var).second);  // max()
-        }
-      }
-    }
-
-    ++max_lifecycle_;
-  }
-}
-
-// TODO(Superjomn) Make this a general help method.
-int DataTypeToSpace(framework::proto::VarType_Type type) {
-  switch (type) {
-    case framework::proto::VarType_Type_BOOL:
-      return sizeof(bool);
-    case framework::proto::VarType_Type_FP32:
-      return sizeof(float);
-    case framework::proto::VarType_Type_INT32:
-      return sizeof(int32_t);
-    case framework::proto::VarType_Type_INT64:
-      return sizeof(int64_t);
-    case framework::proto::VarType_Type_INT16:
-      return sizeof(int16_t);
-    case framework::proto::VarType_Type_FP16:
-      return sizeof(int16_t);
-    case framework::proto::VarType_Type_FP64:
-      return sizeof(double);
-    case framework::proto::VarType_Type_UINT8:
-      return sizeof(unsigned char);
-    case framework::proto::VarType_Type_INT8:
-      return sizeof(int8_t);
-    default:
-      PADDLE_THROW("Unknown data type");
-  }
-}
-
-void MemoryOptimizePass::CollectVarMemorySize(
-    space_table_t* space_table) const {
-  const int fake_batch_size = 1;
-
-  auto valid_var = [&](framework::ir::Node* node) -> bool {
-    std::set<std::string> invalid_op = {"while",
-                                        "conditional_block",
-                                        "tensorrt_engine",
-                                        "conditional_block_infer",
-                                        "merge_lod_tensor_infer",
-                                        "merge_lod_tensor",
-                                        "equal",
-                                        "lod_reset"};
-    for (auto* tmp : node->inputs) {
-      CHECK(tmp->IsOp());
-      std::string op_type = tmp->Op()->Type();
-      if (std::find(invalid_op.begin(), invalid_op.end(), op_type) !=
-          invalid_op.end()) {
-        return false;
-      }
-    }
-    for (auto* tmp : node->outputs) {
-      CHECK(tmp->IsOp());
-      std::string op_type = tmp->Op()->Type();
-      if (std::find(invalid_op.begin(), invalid_op.end(), op_type) !=
-          invalid_op.end()) {
-        return false;
-      }
-    }
-    return true;
-  };
-  // Collect tensors from graph.
-  for (auto* node : graph_->Nodes()) {
-    if (node->IsVar() &&
-        node->Var()->GetType() ==
-            framework::proto::VarType::Type::VarType_Type_LOD_TENSOR &&
-        valid_var(node)) {
-      // Parameters will not be reused.
-      if (node->Var()->Persistable()) continue;
-      auto shape = node->Var()->GetShape();
-      for (auto& v : shape) {
-        if (v < 0) v = fake_batch_size;
-      }
-
-      int size = std::accumulate(shape.begin(), shape.end(), 1,
-                                 std::multiplies<int>());
-      (*space_table)[node->Var()->Name()] =
-          size * DataTypeToSpace(node->Var()->GetDataType());
-    }
-  }
-}
-
-void MakeSimpleReusePlan(
-    const std::unordered_map<std::string, std::pair<int, int>>& lifecycles,
-    const std::unordered_map<std::string, size_t>& space_table,
-    std::unordered_map<std::string, std::string>* node2cluster,
-    std::unordered_map<std::string, int>* cluster_size) {
-  std::vector<MemNode> mem_nodes;
-  for (auto& data : lifecycles) {
-    if (!space_table.count(data.first)) continue;
-    MemNode temp_node;
-    temp_node.name = data.first;
-    temp_node.size = space_table.at(data.first);
-    temp_node.cluster = -1;
-    temp_node.lifetime = data.second;
-    mem_nodes.push_back(temp_node);
-  }
-  auto overlap = [](std::pair<int, int> a, std::pair<int, int> b) -> bool {
-    return b.second >= a.first && a.second >= b.first;
-  };
-  // If the lifetime of two nodes is overwritten, we set them as adjacent nodes.
-  for (size_t i = 0; i < mem_nodes.size(); i++) {
-    for (size_t j = i + 1; j < mem_nodes.size(); j++) {
-      if (overlap(mem_nodes[i].lifetime, mem_nodes[j].lifetime)) {
-        mem_nodes[i].adj.insert(mem_nodes[j].name);
-        mem_nodes[j].adj.insert(mem_nodes[i].name);
-      }
-    }
-  }
-
-  // Sort the nodes according to the node memory size.
-  auto sort_func = [](MemNode a, MemNode b) { return a.size > b.size; };
-  std::sort(mem_nodes.begin(), mem_nodes.end(), sort_func);
-
-  // Generating Memory Reuse Strategy Based on Greedy Way
-  for (size_t i = 0; i < mem_nodes.size(); i++) {
-    if (mem_nodes[i].cluster >= 0) continue;
-    int cluster_index = cluster_size->size();
-    mem_nodes[i].cluster = cluster_index;
-    (*cluster_size)[mem_nodes[i].name] = mem_nodes[i].size;
-    (*node2cluster)[mem_nodes[i].name] = mem_nodes[i].name;
-    std::unordered_set<std::string> cluster_adj = mem_nodes[i].adj;
-    for (size_t j = i + 1; j < mem_nodes.size(); j++) {
-      if (mem_nodes[j].cluster < 0 &&
-          (cluster_adj.find(mem_nodes[j].name) == cluster_adj.end())) {
-        (*node2cluster)[mem_nodes[j].name] = mem_nodes[i].name;
-        mem_nodes[j].cluster = cluster_index;
-        for (auto& n : mem_nodes[j].adj) {
-          cluster_adj.insert(n);
-        }
-      }
-    }
-  }
-  for (auto& cluster : *cluster_size) {
-    LOG(INFO) << "Cluster name : " << cluster.first
-              << "  size: " << cluster.second;
-  }
-}
-
-// NOTE The optimized opdesc doesn't match ir::Graph.
-void UpdateOpDescsByReuse(
-    Graph* graph,
-    const std::unordered_map<std::string, std::string>& reuse_table,
-    int sort_kind) {
-  // TODO(Superjomn) change here to be compatible with the runtime order.
-  for (auto* node : TopologyVarientSort(
-           *graph, static_cast<framework::ir::SortKind>(sort_kind))) {
-    if (node->IsOp()) {
-      // Replace the original inputs/outputs with the reused tensors.
-      std::unordered_map<std::string, std::vector<std::string>> in_args,
-          out_args;
-      for (auto argument : node->Op()->Inputs()) {
-        for (const auto& x : argument.second) {
-          auto name = x;
-          if (reuse_table.count(x) && reuse_table.at(x) != x) {
-            name = reuse_table.at(x);
-          }
-          in_args[argument.first].push_back(name);
-          VLOG(4) << node->Name() << " input " << x << " -> " << name;
-        }
-      }
-
-      // modify the graph
-      for (auto input_node : node->inputs) {
-        PADDLE_ENFORCE(input_node->IsVar());
-        std::string input_node_name = input_node->Name();
-        if (reuse_table.count(input_node_name) &&
-            reuse_table.at(input_node_name) != input_node_name) {
-          auto name = reuse_table.at(input_node_name);
-          input_node->RenameVar(name);
-        }
-      }
-
-      for (auto argument : node->Op()->Outputs()) {
-        for (const auto& x : argument.second) {
-          auto name = x;
-          if (reuse_table.count(x) && reuse_table.at(x) != x) {
-            name = reuse_table.at(x);
-          }
-          out_args[argument.first].push_back(name);
-          VLOG(4) << node->Name() << " output " << x << " -> " << name;
-        }
-      }
-
-      // modify the graph
-      for (auto out_node : node->outputs) {
-        PADDLE_ENFORCE(out_node->IsVar());
-        std::string out_node_name = out_node->Name();
-        if (reuse_table.count(out_node_name) &&
-            reuse_table.at(out_node_name) != out_node_name) {
-          auto name = reuse_table.at(out_node_name);
-          out_node->RenameVar(name);
-        }
-      }
-
-      // Update arguments.
-      for (auto& arg : in_args) {
-        node->Op()->SetInput(arg.first, arg.second);
-      }
-      for (auto& arg : out_args) {
-        node->Op()->SetOutput(arg.first, arg.second);
-      }
-      node->Op()->Flush();
-    }
-  }
-}
-
-std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; }
-
-void MemoryOptimizePass::RunImpl(Argument* argument) {
-  // Memory optimization.
-  // We will perform the following operation:
-  // 1. Collect all var's lifetime.
-  // 2. Make reuse plan: the vars can be reused if there is no overlap(on
-  // lifetime) between
-  // them.
-  // The final plan is a mapping table in which the key represents the original
-  // name of var and the value in the table represents the current name of var.
-  // 3. Perform reuse plan: Replace all var's name in the model according to the
-  // mapping table.
-  if (!argument->enable_memory_optim()) return;
-  graph_ = argument->main_graph_ptr();
-
-  int sort_kind = 0;
-  std::unordered_map<std::string, lifecycle_t> lifecycles;
-  space_table_t space_table;
-  std::unordered_map<std::string, std::string> node2cluster;
-  std::unordered_map<std::string, int> cluster_size;
-
-  CollectLifeCycle(&lifecycles, sort_kind);
-  CollectVarMemorySize(&space_table);
-  MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size);
-  UpdateOpDescsByReuse(graph_, node2cluster, sort_kind);
-  return;
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
deleted file mode 100644
index 77da5d40d8dea96e6d7ae833501626894c6d7b37..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/* Memory optimization.
-* We will perform the following operation:
-* 1. Collect all var's lifetime.
-* 2. Make reuse plan: the vars can be reused if there is no overlap(on lifetime)
-* between
-* them.
-* The final plan is a mapping table in which the key represents the original
-* name of var and the value in the table represents the current name of var.
-* 3. Perform reuse plan: Replace all var's name in the model according to the
-* mapping table.
-*/
-class MemoryOptimizePass : public AnalysisPass {
- public:
-  using space_table_t = std::unordered_map<std::string, size_t>;
-  using lifecycle_t = std::pair<int, int>;
-
-  virtual ~MemoryOptimizePass() = default;
-
- protected:
-  void RunImpl(Argument *argument) override;
-
- private:
-  void CollectLifeCycle(
-      std::unordered_map<std::string, lifecycle_t> *lifecycles,
-      int sort_kind) const;
-
-  void CollectVarMemorySize(space_table_t *space_table) const;
-
- public:
-  std::string repr() const override;
-
- private:
-  mutable framework::ir::Graph *graph_{nullptr};
-  mutable int max_lifecycle_{-1};
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc
deleted file mode 100644
index ca0b25c29d495dc0e71e69a6d7d2a10f0f8c2254..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/passes.h"
-#include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h"
-#include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h"
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
-#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
-#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h"
-#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
-#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
-#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-PassRegistry::PassRegistry() {
-  // Register manually to avoid the trivial `USE_OP` like macro for easier use
-  // and link.
-  passes_.emplace("ir_analysis_pass",
-                  std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
-  passes_.emplace("ir_graph_build_pass",
-                  std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
-  passes_.emplace("ir_graph_clean_pass",
-                  std::unique_ptr<AnalysisPass>(new IrInferCleanGraphPass));
-  passes_.emplace("memory_optimize_pass",
-                  std::unique_ptr<AnalysisPass>(new MemoryOptimizePass));
-  passes_.emplace(
-      "ir_params_sync_among_devices_pass",
-      std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
-  passes_.emplace("adjust_cudnn_workspace_size_pass",
-                  std::unique_ptr<AnalysisPass>(new AdjustCudnnWorkSpacePass));
-  passes_.emplace("inference_op_replace_pass",
-                  std::unique_ptr<AnalysisPass>(new InferenceOpReplacePass));
-  passes_.emplace(
-      "ir_graph_to_program_pass",
-      std::unique_ptr<IrGraphToProgramPass>(new IrGraphToProgramPass));
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/passes.h b/paddle/fluid/inference/analysis/passes/passes.h
deleted file mode 100644
index 8a13091d083e51ecc84e6790f973ffa39ba5a6b9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/passes.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-struct PassRegistry {
-  PassRegistry();
-
-  AnalysisPass* Retreive(const std::string& pass_type) {
-    return passes_[pass_type].get();
-  }
-
-  static PassRegistry& Global() {
-    static auto* x = new PassRegistry;
-    return *x;
-  }
-
- private:
-  std::unordered_map<std::string, std::unique_ptr<AnalysisPass>> passes_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
deleted file mode 100644
index d599099a8050eaeabb8e0544b1bfe3b6b46b17ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-#include <fstream>
-#include <string>
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-
-namespace paddle {
-namespace inference {
-
-// Read ProgramDesc from a __model__ file, defined in io.cc
-extern void ReadBinaryFile(const std::string& filename, std::string* contents);
-
-namespace analysis {
-
-DEFINE_string(inference_model_dir, "", "inference test model dir");
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
deleted file mode 100755
index 344d12dd0b12a0378f68b3de91c84ca11253c502..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-if(APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
-endif(APPLE)
-
-
-set(inference_deps ${analysis_deps}
-  paddle_inference_api paddle_fluid_api
-  analysis pass naive_executor
-  ${GLOB_PASS_LIB})
-
-if(WITH_GPU AND TENSORRT_FOUND)
-    set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
-endif()
-
-if (ANAKIN_SUBGRAPH)
-    set(inference_deps ${inference_deps} anakin_op_converter anakin_engine)
-endif()
-
-if(WITH_NGRAPH)
-    set(inference_deps ${inference_deps} ngraph)
-endif()
-
-add_subdirectory(details)
-
-if(WITH_MKLDNN)
-  set(mkldnn_quantizer_src mkldnn_quantizer.cc)
-  set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
-  cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
-endif()
-
-cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder)
-if(WITH_NGRAPH)
-  cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc DEPS ngraph)
-else(WITH_NGRAPH)
-  cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-endif(WITH_NGRAPH)
-cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS paddle_inference_api zero_copy_tensor
-  reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
-           lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
-           paddle_pass_builder zero_copy_tensor
-           reset_tensor_array)
-
-cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)
-
-if(WITH_TESTING)
-  inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
-                      ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
-  set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
-  set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
-endif()
-cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
-        ARGS --dirname=${WORD2VEC_MODEL_DIR})
-
-if(ANAKIN_FOUND)
-  # Do not turn warnings into errors.
-  set_source_files_properties(api.cc api_anakin_engine.cc PROPERTIES COMPILE_FLAGS "-Wno-error")
-  cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS boost xxhash framework_proto eigen3)
-  target_link_libraries(inference_anakin_api anakin anakin_saber_common)
-  cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS boost xxhash framework_proto eigen3)
-  target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
-  function(anakin_target target_name)
-    target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
-  endfunction()
-  anakin_target(inference_anakin_api)
-  anakin_target(inference_anakin_api_shared)
-endif()
diff --git a/paddle/fluid/inference/api/README.md b/paddle/fluid/inference/api/README.md
deleted file mode 100644
index a2d685d723bd9ab2b84969adb86e177a8754328d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/README.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Embed Paddle Inference in Your Application
-
-Paddle inference offers the APIs in `C` and `C++` languages.
-
-You can easily deploy a model trained by Paddle following the steps as below:
-
-1. Optimize the native model;
-2. Write some codes for deployment.
-
-## The APIs
-
-All the released APIs are located in the `paddle_inference_api.h` header file. 
-The stable APIs are wrapped by `namespace paddle`, the unstable APIs are protected by `namespace paddle::contrib`.
-
-## Write some codes
-
-Read `paddle_inference_api.h` for more information.
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
deleted file mode 100644
index ace260c7cdb1c3c2ad8ad970b40748559798f24a..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ /dev/null
@@ -1,484 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_pass_builder.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-extern const std::vector<std::string> kTRTSubgraphPasses;
-extern const std::vector<std::string> kAnakinSubgraphPasses;
-
-PassStrategy *AnalysisConfig::pass_builder() const {
-  if (!pass_builder_.get()) {
-    if (use_gpu_) {
-      LOG(INFO) << "Create GPU IR passes";
-      pass_builder_.reset(new GpuPassStrategy);
-    } else {
-      LOG(INFO) << "Create CPU IR passes";
-      pass_builder_.reset(new CpuPassStrategy);
-    }
-  } else if (pass_builder_->use_gpu() ^ use_gpu()) {
-    LOG(WARNING) << "The use_gpu flag is not compatible between Config and "
-                    "PassBuilder, the flags are "
-                 << use_gpu() << " " << pass_builder_->use_gpu();
-    LOG(WARNING) << "Please make them compatible, still use the existing "
-                    "PassBuilder.";
-  }
-
-  return pass_builder_.get();
-}
-
-AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
-  model_dir_ = model_dir;
-
-  Update();
-}
-AnalysisConfig::AnalysisConfig(const std::string &prog_file,
-                               const std::string &params_file) {
-  prog_file_ = prog_file;
-  params_file_ = params_file;
-
-  Update();
-}
-void AnalysisConfig::SetModel(const std::string &prog_file_path,
-                              const std::string &params_file_path) {
-  prog_file_ = prog_file_path;
-  params_file_ = params_file_path;
-
-  Update();
-}
-void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
-                                  int device_id) {
-#ifdef PADDLE_WITH_CUDA
-  use_gpu_ = true;
-  memory_pool_init_size_mb_ = memory_pool_init_size_mb;
-  device_id_ = device_id;
-#else
-  LOG(ERROR) << "Please compile with gpu to EnableGpu()";
-  use_gpu_ = false;
-#endif
-
-  Update();
-}
-void AnalysisConfig::DisableGpu() {
-  use_gpu_ = false;
-
-  Update();
-}
-
-AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
-#define CP_MEMBER(member__) member__ = other.member__;
-
-  // Model related.
-  CP_MEMBER(model_dir_);
-  CP_MEMBER(model_from_memory_);  // the memory model reuses prog_file_ and
-                                  // params_file_ fields.
-
-  CP_MEMBER(opt_cache_dir_);
-  prog_file_ = std::move(other.prog_file_);
-  params_file_ = std::move(other.params_file_);
-
-  // GPU related.
-  CP_MEMBER(use_gpu_);
-  CP_MEMBER(use_cudnn_);
-  CP_MEMBER(device_id_);
-  CP_MEMBER(memory_pool_init_size_mb_);
-
-  CP_MEMBER(enable_memory_optim_);
-  // TensorRT related.
-  CP_MEMBER(use_tensorrt_);
-  CP_MEMBER(tensorrt_workspace_size_);
-  CP_MEMBER(tensorrt_max_batchsize_);
-  CP_MEMBER(tensorrt_min_subgraph_size_);
-  CP_MEMBER(tensorrt_precision_mode_);
-  CP_MEMBER(trt_use_static_engine_);
-  CP_MEMBER(trt_use_calib_mode_);
-  // NGRAPH related.
-  CP_MEMBER(use_ngraph_);
-  // MKLDNN related.
-  CP_MEMBER(use_mkldnn_);
-  CP_MEMBER(mkldnn_enabled_op_types_);
-  CP_MEMBER(mkldnn_cache_capacity_);
-  // Quantization related.
-  CP_MEMBER(use_mkldnn_quantizer_);
-  CP_MEMBER(mkldnn_quantizer_config_);
-
-  CP_MEMBER(use_anakin_);
-  CP_MEMBER(anakin_max_batchsize_);
-  CP_MEMBER(anakin_max_input_shape_);
-  CP_MEMBER(anakin_min_subgraph_size_);
-  CP_MEMBER(anakin_precision_mode_);
-  CP_MEMBER(anakin_auto_config_layout_);
-  CP_MEMBER(anakin_passes_filter_);
-  CP_MEMBER(anakin_ops_filter_);
-
-  // profile related.
-  CP_MEMBER(with_profile_);
-
-  // Ir related.
-  CP_MEMBER(enable_ir_optim_);
-  CP_MEMBER(use_feed_fetch_ops_);
-  CP_MEMBER(ir_debug_);
-  CP_MEMBER(specify_input_name_);
-
-  CP_MEMBER(cpu_math_library_num_threads_);
-
-  CP_MEMBER(serialized_info_cache_);
-
-  if (use_gpu_) {
-    pass_builder_.reset(new GpuPassStrategy(
-        *static_cast<GpuPassStrategy *>(other.pass_builder())));
-  } else {
-    pass_builder_.reset(new CpuPassStrategy(
-        *static_cast<CpuPassStrategy *>(other.pass_builder())));
-  }
-
-#undef CP_MEMBER
-
-  Update();
-}
-
-void AnalysisConfig::EnableCUDNN() {
-#ifdef PADDLE_WITH_CUDA
-  use_cudnn_ = use_gpu_;
-#else
-  LOG(ERROR) << "Please compile with CUDA first to use cuDNN";
-  use_cudnn_ = false;
-#endif
-
-  Update();
-}
-
-void AnalysisConfig::EnableMKLDNN() {
-#ifdef PADDLE_WITH_MKLDNN
-  use_mkldnn_ = true;
-#else
-  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
-  use_mkldnn_ = false;
-#endif
-
-  Update();
-}
-
-void AnalysisConfig::SetMkldnnCacheCapacity(int capacity) {
-#ifdef PADDLE_WITH_MKLDNN
-  mkldnn_cache_capacity_ = capacity;
-#else
-  LOG(ERROR) << "Please compile with MKLDNN first to set MKLDNN Thread Id";
-  mkldnn_cache_capacity_ = 0;
-#endif
-}
-
-void AnalysisConfig::EnableMkldnnQuantizer() {
-#ifdef PADDLE_WITH_MKLDNN
-  if (!mkldnn_quantizer_config_)
-    mkldnn_quantizer_config_.reset(new MkldnnQuantizerConfig());
-  use_mkldnn_quantizer_ = true;
-#else
-  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
-  use_mkldnn_quantizer_ = false;
-#endif
-
-  Update();
-}
-
-void AnalysisConfig::EnableNgraph() {
-#ifdef PADDLE_WITH_NGRAPH
-  pass_builder()->EnableNgraph();
-  use_ngraph_ = true;
-#else
-  LOG(ERROR) << "Please compile with NGRAPH first to use NGRAPH";
-  use_ngraph_ = false;
-#endif
-}
-
-MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
-  PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
-                          "MkldnnQuantizer was not enabled yet.");
-  return mkldnn_quantizer_config_.get();
-}
-
-void AnalysisConfig::EnableTensorRtEngine(
-    int workspace_size, int max_batch_size, int min_subgraph_size,
-    AnalysisConfig::Precision precision_mode, bool use_static,
-    bool use_calib_mode) {
-#ifdef PADDLE_WITH_CUDA
-  if (!use_gpu()) {
-    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
-    return;
-  }
-
-  use_tensorrt_ = true;
-  tensorrt_workspace_size_ = workspace_size;
-  tensorrt_max_batchsize_ = max_batch_size;
-  tensorrt_min_subgraph_size_ = min_subgraph_size;
-  tensorrt_precision_mode_ = precision_mode;
-  trt_use_static_engine_ = use_static;
-  trt_use_calib_mode_ = use_calib_mode;
-
-  Update();
-#else
-  LOG(ERROR)
-      << "To use TensorRT engine, please compile inference lib with GPU first.";
-#endif
-}
-
-// TODO(Superjomn) refactor this, buggy.
-void AnalysisConfig::Update() {
-  auto info = SerializeInfoCache();
-  if (info == serialized_info_cache_) return;
-
-  // Transfer pass_builder and copy the existing compatible passes.
-  if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) {
-    if (use_gpu()) {
-      pass_builder_.reset(new GpuPassStrategy);
-
-      if (use_tensorrt_) {
-        // Append after the Affine_channel_conv_fuse pass.
-        pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
-      }
-    } else {
-      pass_builder_.reset(new CpuPassStrategy);
-    }
-
-  } else {
-    if (use_gpu()) {
-      pass_builder_.reset(new GpuPassStrategy(
-          *static_cast<GpuPassStrategy *>(pass_builder_.get())));
-
-    } else {
-      pass_builder_.reset(new CpuPassStrategy(
-          *static_cast<CpuPassStrategy *>(pass_builder_.get())));
-    }
-  }
-
-  if (use_tensorrt_) {
-    pass_builder()->ClearPasses();
-    for (const auto &pass : kTRTSubgraphPasses) {
-      pass_builder()->AppendPass(pass);
-    }
-  }
-  if (use_gpu() && use_cudnn_) {
-#ifdef PADDLE_WITH_CUDA
-    if (!enable_ir_optim_) {
-      LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled.";
-    } else {
-      pass_builder()->EnableCUDNN();
-    }
-#endif
-  }
-
-  if (use_ngraph_) {
-    if (!enable_ir_optim_) {
-      LOG(ERROR)
-          << "EnableNgraph() only works when IR optimization is enabled.";
-    }
-#ifdef PADDLE_WITH_NGRAPH
-    pass_builder()->EnableNgraph();
-    use_ngraph_ = true;
-#else
-    LOG(ERROR) << "Please compile with NGRAPH first to use NGRAPH";
-    use_ngraph_ = false;
-#endif
-  }
-
-  if (use_mkldnn_) {
-#ifdef PADDLE_WITH_MKLDNN
-    if (!enable_ir_optim_) {
-      LOG(ERROR)
-          << "EnableMKLDNN() only works when IR optimization is enabled.";
-    } else {
-      pass_builder()->EnableMKLDNN();
-    }
-#endif
-  }
-
-  // Quantization passes must come after all other optimization passes
-  if (use_mkldnn_quantizer_) {
-    if (!enable_ir_optim_) {
-      LOG(ERROR) << "EnableMkldnnQuantizer() only works when IR optimization "
-                    "is enabled.";
-    }
-#ifdef PADDLE_WITH_MKLDNN
-    pass_builder()->EnableMkldnnQuantizer();
-#endif
-  }
-
-#ifdef PADDLE_WITH_MKLDNN
-  // Do not optimize before quantization
-  if (enable_memory_optim_ && !use_mkldnn_quantizer_) {
-#else
-  if (enable_memory_optim_) {
-#endif
-    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
-  }
-
-  if (use_anakin_) {
-    PADDLE_ENFORCE(!use_tensorrt_,
-                   "Anakin sub-graph and TensorRT sub-graph are not allowed to "
-                   "run at the same time!");
-    if (use_gpu_) {
-      LOG(INFO) << "Run Anakin GPU mode";
-    } else {
-      LOG(INFO) << "Run Anakin CPU mode";
-    }
-
-    pass_builder()->ClearPasses();
-    for (const auto &pass : kAnakinSubgraphPasses) {
-      if (std::find(anakin_passes_filter_.begin(), anakin_passes_filter_.end(),
-                    pass) == anakin_passes_filter_.end()) {
-        pass_builder()->AppendPass(pass);
-      }
-    }
-  }
-
-  if (ir_debug_) {
-    pass_builder()->TurnOnDebug();
-  }
-}
-
-std::string AnalysisConfig::SerializeInfoCache() {
-  std::stringstream ss;
-  ss << model_dir_;
-  ss << prog_file_;
-  ss << params_file_;
-
-  ss << use_gpu_;
-  ss << device_id_;
-  ss << memory_pool_init_size_mb_;
-
-  ss << use_tensorrt_;
-  ss << tensorrt_workspace_size_;
-  ss << tensorrt_max_batchsize_;
-  ss << tensorrt_min_subgraph_size_;
-
-  ss << enable_memory_optim_;
-
-  ss << use_ngraph_;
-
-  ss << use_mkldnn_;
-  ss << mkldnn_cache_capacity_;
-  for (auto &item : mkldnn_enabled_op_types_) ss << item;
-  ss << ";";
-
-  ss << use_mkldnn_quantizer_;
-  ss << model_from_memory_;
-
-  ss << with_profile_;
-
-  ss << enable_ir_optim_;
-  ss << use_feed_fetch_ops_;
-  ss << ir_debug_;
-
-  ss << specify_input_name_;
-  ss << cpu_math_library_num_threads_;
-  ss << use_anakin_;
-  ss << anakin_min_subgraph_size_;
-  return ss.str();
-}
-
-void AnalysisConfig::SetCpuMathLibraryNumThreads(
-    int cpu_math_library_num_threads) {
-  cpu_math_library_num_threads_ = cpu_math_library_num_threads;
-
-  Update();
-}
-
-float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
-#ifdef PADDLE_WITH_CUDA
-  // Get the GPU memory details and calculate the fraction of memory for the
-  // GPU memory pool.
-  size_t gpu_used, gpu_available;
-  platform::SetDeviceId(device_id_);
-  platform::GpuMemoryUsage(&gpu_used, &gpu_available);
-  double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.;
-  float fraction_of_gpu_memory =
-      static_cast<double>(memory_pool_init_size_mb()) / total_gpu_memory;
-  return fraction_of_gpu_memory;
-#else
-  return 0.;
-#endif
-}
-
-void AnalysisConfig::EnableMemoryOptim() {
-  enable_memory_optim_ = true;
-  Update();
-}
-
-bool AnalysisConfig::enable_memory_optim() const {
-  return enable_memory_optim_;
-}
-
-void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
-                                    size_t prog_buffer_size,
-                                    const char *param_buffer,
-                                    size_t param_buffer_size) {
-  prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
-  params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
-  model_from_memory_ = true;
-
-  Update();
-}
-
-NativeConfig AnalysisConfig::ToNativeConfig() const {
-  NativeConfig config;
-  config.model_dir = model_dir_;
-  config.prog_file = prog_file_;
-  config.param_file = params_file_;
-  config.use_gpu = use_gpu_;
-  config.device = device_id_;
-  config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
-  config.specify_input_name = specify_input_name_;
-  return config;
-}
-
-void AnalysisConfig::SwitchIrDebug(int x) {
-  ir_debug_ = x;
-  Update();
-}
-
-void AnalysisConfig::EnableProfile() {
-  with_profile_ = true;
-  Update();
-}
-
-void AnalysisConfig::EnableAnakinEngine(
-    int max_batch_size, std::map<std::string, std::vector<int>> max_input_shape,
-    int min_subgraph_size, AnalysisConfig::Precision precision_mode,
-    bool auto_config_layout, std::vector<std::string> passes_filter,
-    std::vector<std::string> ops_filter) {
-  anakin_max_batchsize_ = max_batch_size;
-  anakin_max_input_shape_ = max_input_shape;
-  anakin_min_subgraph_size_ = min_subgraph_size;
-  anakin_passes_filter_ = passes_filter;
-  anakin_ops_filter_ = ops_filter;
-  use_anakin_ = true;
-  anakin_precision_mode_ = precision_mode;
-  anakin_auto_config_layout_ = auto_config_layout;
-  Update();
-}
-
-void AnalysisConfig::PartiallyRelease() {
-  prog_file_.clear();
-  prog_file_.shrink_to_fit();
-  params_file_.clear();
-  params_file_.shrink_to_fit();
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
deleted file mode 100644
index d47bde32de65d7db60f7dd308a67f9e443289bb5..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ /dev/null
@@ -1,931 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include <glog/logging.h>
-#include <algorithm>
-#include <fstream>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/var_type_traits.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
-#endif
-
-#if PADDLE_WITH_TENSORRT
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
-#endif
-
-#if PADDLE_WITH_ANAKIN
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#endif
-
-namespace paddle {
-
-using inference::Singleton;
-#if PADDLE_WITH_TENSORRT
-using inference::tensorrt::TRTInt8Calibrator;
-using inference::tensorrt::TRTCalibratorEngine;
-using inference::tensorrt::TRTCalibratorEngineManager;
-#endif
-
-namespace {
-bool IsPersistable(const framework::VarDesc *var) {
-  if (var->Persistable() &&
-      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST &&
-      var->GetType() != framework::proto::VarType::RAW) {
-    return true;
-  }
-  return false;
-}
-}  // namespace
-
-bool AnalysisPredictor::Init(
-    const std::shared_ptr<framework::Scope> &parent_scope,
-    const std::shared_ptr<framework::ProgramDesc> &program) {
-  VLOG(3) << "Predictor::init()";
-  if (config_.with_profile_) {
-    LOG(WARNING) << "Profiler is activated, which might affect the performance";
-    auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll
-                                             : platform::ProfilerState::kCPU;
-    platform::EnableProfiler(tracking_device);
-  } else {
-    LOG(INFO) << "Profiler is deactivated, and no profiling report will be "
-                 "generated.";
-  }
-
-  // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
-
-  if (!PrepareScope(parent_scope)) {
-    return false;
-  }
-  if (!CreateExecutor()) {
-    return false;
-  }
-  if (!PrepareProgram(program)) {
-    return false;
-  }
-
-  // Prepare executor, create local variables.
-  if (!PrepareExecutor()) {
-    return true;
-  }
-
-  // Get the feed_target_names and fetch_target_names
-  PrepareFeedFetch();
-
-  return true;
-}
-
-bool AnalysisPredictor::PrepareScope(
-    const std::shared_ptr<framework::Scope> &parent_scope) {
-  if (parent_scope) {
-    PADDLE_ENFORCE_NOT_NULL(
-        parent_scope,
-        "Both program and parent_scope should be set in Clone mode.");
-    scope_ = parent_scope;
-    status_is_cloned_ = true;
-  } else {
-    if (config_.use_gpu_) {
-      paddle::framework::InitDevices(false, {config_.device_id_});
-    } else {
-      paddle::framework::InitDevices(false, {});
-    }
-    scope_.reset(new paddle::framework::Scope());
-    status_is_cloned_ = false;
-  }
-  sub_scope_ = &scope_->NewScope();
-  return true;
-}
-bool AnalysisPredictor::PrepareProgram(
-    const std::shared_ptr<framework::ProgramDesc> &program) {
-  if (!program) {
-    if (!LoadProgramDesc()) return false;
-    // If not cloned, the parameters should be loaded.
-    // If config_.ir_optim() is True, parameters is loaded in
-    // OptimizeInferenceProgram(), but other persistable variables
-    // (like RAW type var) are not created in scope.
-    // If config_.ir_optim() is False, parameters is loaded in LoadParameters(),
-    // still need to create other persistable variables.
-    // So in both case, create persistable variables at first.
-    executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
-
-    // if enable_ir_optim_ is false,
-    // the analysis pass(op fuse, graph analysis, trt subgraph, mkldnn etc) will
-    // not be executed.
-    OptimizeInferenceProgram();
-  } else {
-    // If the program is passed from external, no need to optimize it, this
-    // logic is used in the clone scenario.
-    inference_program_ = program;
-  }
-
-  executor_->CreateVariables(*inference_program_, 0, false, sub_scope_);
-
-  return true;
-}
-bool AnalysisPredictor::CreateExecutor() {
-  if (config_.use_gpu_) {
-    status_use_gpu_ = true;
-    place_ = paddle::platform::CUDAPlace(config_.device_id_);
-  } else {
-    place_ = paddle::platform::CPUPlace();
-  }
-  executor_.reset(new paddle::framework::NaiveExecutor(place_));
-  return true;
-}
-bool AnalysisPredictor::PrepareExecutor() {
-  executor_->Prepare(sub_scope_, *inference_program_, 0,
-                     config_.use_feed_fetch_ops_);
-
-  PADDLE_ENFORCE_NOT_NULL(sub_scope_);
-
-  return true;
-}
-
-void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) {
-#ifdef PADDLE_WITH_MKLDNN
-  VLOG(2) << "AnalysisPredictor::Run get_cur_mkldnn_session_id="
-          << platform::get_cur_mkldnn_session_id();
-  // In cache clearing mode.
-  if (config_.mkldnn_cache_capacity_ > 0) {
-    VLOG(2) << "In mkldnn cache clear mode.";
-    platform::set_cur_mkldnn_session_id(
-        platform::kMKLDNNSessionID_CacheClearing);
-    platform::set_cur_input_shape_cache_capacity(
-        config_.mkldnn_cache_capacity_);
-    // Set current_input_shape for caching dynamic shape.
-    std::stringstream ss;
-    for (size_t i = 0; i < inputs.size(); ++i) {
-      for (size_t j = 0; j < inputs[i].shape.size(); ++j) {
-        ss << inputs[i].shape[j] << "-";
-      }
-    }
-    VLOG(2) << "Set input shape=" << ss.str();
-    platform::set_cur_input_shape_str(ss.str());
-  }
-#endif
-}
-
-void AnalysisPredictor::MkldnnPostReset() {
-#ifdef PADDLE_WITH_MKLDNN
-  // In cache clearing mode.
-  if (config_.mkldnn_cache_capacity_ > 0) {
-    paddle::platform::set_cur_mkldnn_session_id(
-        platform::kMKLDNNSessionID_Default);
-    platform::set_cur_input_shape_cache_capacity(0);
-    platform::set_cur_input_shape_str("");
-  }
-#endif
-}
-
-bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
-                            std::vector<PaddleTensor> *output_data,
-                            int batch_size) {
-  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
-#ifdef PADDLE_WITH_MKLDNN
-  if (config_.use_mkldnn_) MkldnnPreSet(inputs);
-#endif
-  VLOG(3) << "Predictor::predict";
-  inference::Timer timer;
-  timer.tic();
-  // set feed variable
-  framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
-  PADDLE_ENFORCE_NOT_NULL(scope, "The scope should not be nullptr.");
-  if (!SetFeed(inputs, scope)) {
-    LOG(ERROR) << "fail to set feed";
-    return false;
-  }
-
-  // Run the inference program
-  // if share variables, we need not create variables
-  executor_->Run();
-
-  // get fetch variable
-  if (!GetFetch(output_data, scope)) {
-    LOG(ERROR) << "fail to get fetches";
-    return false;
-  }
-
-  VLOG(3) << "predict cost: " << timer.toc() << "ms";
-
-  // All the containers in the scope will be hold in inference, but the
-  // operators assume that the container will be reset after each batch.
-  // Here is a bugfix, collect all the container variables, and reset then to a
-  // bool; the next time, the operator will call MutableData and construct a new
-  // container again, so that the container will be empty for each batch.
-  if (sub_scope_) {
-    tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
-  }
-  tensor_array_batch_cleaner_.ResetNoTensorVars();
-
-  // recover the cpu_math_library_num_threads to 1, in order to avoid thread
-  // conflict when integrating it into deployment service.
-  paddle::platform::SetNumThreads(1);
-#ifdef PADDLE_WITH_MKLDNN
-  if (config_.use_mkldnn_) MkldnnPostReset();
-#endif
-  return true;
-}
-
-bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
-                                framework::Scope *scope) {
-  VLOG(3) << "Predictor::set_feed";
-  if (inputs.size() != feeds_.size()) {
-    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
-               << inputs.size();
-    return false;
-  }
-
-  // Cache the inputs memory for better concurrency performance.
-  feed_tensors_.resize(inputs.size());
-
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    auto &input = feed_tensors_[i];
-    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
-    void *input_ptr;
-    if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, place_);
-    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, place_);
-    } else if (inputs[i].dtype == PaddleDType::INT32) {
-      input_ptr = input.mutable_data<int32_t>(ddim, place_);
-    } else {
-      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
-      return false;
-    }
-
-    PADDLE_ENFORCE_NOT_NULL(input_ptr);
-    PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
-
-    if (platform::is_cpu_place(place_)) {
-      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                  inputs[i].data.length());
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      platform::DeviceContextPool &pool =
-          platform::DeviceContextPool::Instance();
-      auto *dev_ctx =
-          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
-      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
-      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
-                   platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(), dev_ctx->stream());
-#else
-      PADDLE_THROW("Not compile with CUDA, should not reach here.");
-#endif
-    }
-    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
-    framework::LoD lod;
-    for (auto &level : inputs[i].lod) {
-      lod.emplace_back(level);
-    }
-    input.set_lod(lod);
-    int idx = -1;
-    if (config_.specify_input_name_) {
-      auto name = inputs[i].name;
-      if (feed_names_.find(name) == feed_names_.end()) {
-        LOG(ERROR) << "feed names from program do not have name: [" << name
-                   << "] from specified input";
-      }
-      idx = feed_names_[name];
-    } else {
-      idx = boost::get<int>(feeds_[i]->GetAttr("col"));
-    }
-    framework::SetFeedVariable(scope, input, "feed", idx);
-  }
-  return true;
-}
-
-template <typename T>
-void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
-                                    PaddleTensor *output) {
-  // set shape.
-  auto shape = framework::vectorize(fetch.dims());
-  output->shape.assign(shape.begin(), shape.end());
-  // set data.
-  const T *data = fetch.data<T>();
-  int num_elems = inference::VecReduceToInt(shape);
-  output->data.Resize(num_elems * sizeof(T));
-  // The fetched tensor output by fetch op, should always in CPU memory, so just
-  // copy.
-  memcpy(output->data.data(), data, num_elems * sizeof(T));
-  // set lod
-  output->lod.clear();
-  for (auto &level : fetch.lod()) {
-    output->lod.emplace_back(level.begin(), level.end());
-  }
-}
-
-bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
-                                 framework::Scope *scope) {
-  VLOG(3) << "Predictor::get_fetch";
-  outputs->resize(fetches_.size());
-  for (size_t i = 0; i < fetches_.size(); ++i) {
-    int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
-    PADDLE_ENFORCE((size_t)idx == i);
-    framework::LoDTensor &fetch =
-        framework::GetFetchVariable(*scope, "fetch", idx);
-    auto type = fetch.type();
-    auto output = &(outputs->at(i));
-    output->name = fetches_[idx]->Input("X")[0];
-    if (type == framework::proto::VarType::FP32) {
-      GetFetchOne<float>(fetch, output);
-      output->dtype = PaddleDType::FLOAT32;
-    } else if (type == framework::proto::VarType::INT64) {
-      GetFetchOne<int64_t>(fetch, output);
-      output->dtype = PaddleDType::INT64;
-    } else if (type == framework::proto::VarType::INT32) {
-      GetFetchOne<int32_t>(fetch, output);
-      output->dtype = PaddleDType::INT32;
-    } else {
-      LOG(ERROR) << "unknown type, only support float32, int64 and int32 now.";
-    }
-  }
-  return true;
-}
-
-void AnalysisPredictor::PrepareArgument() {
-  argument_.SetUseGPU(config_.use_gpu());
-  argument_.SetGPUDeviceId(config_.gpu_device_id());
-  argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_);
-  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
-  argument_.SetModelFromMemory(config_.model_from_memory_);
-  // Analyze inference_program
-  argument_.SetUseAnakin(config_.anakin_engine_enabled());
-  argument_.SetPredictorID(predictor_id_);
-  argument_.SetOptimCacheDir(config_.opt_cache_dir_);
-  if (!config_.model_dir().empty()) {
-    argument_.SetModelDir(config_.model_dir());
-  } else {
-    PADDLE_ENFORCE(
-        !config_.params_file().empty(),
-        "Either model_dir or (param_file, prog_file) should be set.");
-    PADDLE_ENFORCE(!config_.prog_file().empty());
-    std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
-
-    argument_.SetModelProgramPath(config_.prog_file());
-    argument_.SetModelParamsPath(config_.params_file());
-  }
-
-  if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
-    LOG(INFO) << "TensorRT subgraph engine is enabled";
-    argument_.SetUseTensorRT(true);
-    argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
-    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
-    argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
-    argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
-    argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
-    argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_);
-  }
-
-  if (config_.anakin_engine_enabled()) {
-    argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
-    argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
-    argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_);
-    argument_.SetAnakinPrecisionMode(config_.anakin_precision_mode_);
-    argument_.SetAnakinAutoConfigLayout(config_.anakin_auto_config_layout_);
-    argument_.SetAnakinPassesFilter(config_.anakin_passes_filter_);
-    argument_.SetAnakinOpsFilter(config_.anakin_ops_filter_);
-    LOG(INFO) << "Anakin subgraph engine is enabled";
-  }
-
-  if (config_.use_mkldnn_) {
-    LOG(INFO) << "MKLDNN is enabled";
-    argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
-  }
-
-#ifdef PADDLE_WITH_MKLDNN
-  if (config_.mkldnn_quantizer_enabled()) {
-    LOG(INFO) << "Quantization is enabled";
-    argument_.SetQuantizeEnabledOpTypes(
-        config_.mkldnn_quantizer_config()->enabled_op_types());
-    argument_.SetQuantizeExcludedOpIds(
-        config_.mkldnn_quantizer_config()->excluded_op_ids());
-  }
-#endif
-
-  auto passes = config_.pass_builder()->AllPasses();
-  if (!config_.ir_optim()) {
-    passes.clear();
-    LOG(INFO) << "ir_optim is turned off, no IR pass will be executed";
-  }
-  argument_.SetIrAnalysisPasses(passes);
-  argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
-  argument_.SetScopeNotOwned(scope_.get());
-}
-
-// NOTE All the members in AnalysisConfig should be copied to Argument.
-void AnalysisPredictor::OptimizeInferenceProgram() {
-  PrepareArgument();
-  Analyzer().Run(&argument_);
-
-  PADDLE_ENFORCE(argument_.scope_valid());
-  VLOG(5) << "to prepare executor";
-  ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program);
-  inference_program_.reset(
-      new framework::ProgramDesc(argument_.ir_analyzed_program()));
-  // The config and argument take a lot of storage,
-  // when the predictor settings are complete, we release these stores.
-  argument_.PartiallyRelease();
-  config_.PartiallyRelease();
-  LOG(INFO) << "======= optimize end =======";
-}
-
-template <>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
-    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
-  VLOG(3) << "create AnalysisConfig";
-  PADDLE_ENFORCE(config.is_valid(),
-                 "Note: Each config can only be used for one predictor.");
-  if (config.use_gpu()) {
-    // 1. GPU memory
-    PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
-    PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
-                      config.gpu_device_id());
-    std::vector<std::string> flags;
-
-    float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
-    if (fraction_of_gpu_memory > 0.95f) {
-      LOG(ERROR)
-          << "Allocate too much memory for the GPU memory pool, assigned "
-          << config.memory_pool_init_size_mb() << " MB";
-      LOG(ERROR)
-          << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
-    }
-
-    if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
-      flags.push_back("dummy");
-      std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         std::to_string(fraction_of_gpu_memory);
-      flags.push_back(flag);
-      flags.push_back("--selected_gpus=" +
-                      std::to_string(config.gpu_device_id()));
-      VLOG(3) << "set flag: " << flag;
-      framework::InitGflags(flags);
-    }
-  }
-
-  std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
-  // Each config can only be used for one predictor.
-  config.SetInValid();
-  auto predictor_p = dynamic_cast<AnalysisPredictor *>(predictor.get());
-
-  if (!predictor_p->Init(nullptr)) {
-    return nullptr;
-  }
-
-  if (config.mkldnn_quantizer_enabled() && !predictor_p->MkldnnQuantize()) {
-    return nullptr;
-  }
-
-  return predictor;
-}
-
-bool AnalysisPredictor::MkldnnQuantize() {
-#if PADDLE_WITH_MKLDNN
-  if (!mkldnn_quantizer_)
-    mkldnn_quantizer_ = new AnalysisPredictor::MkldnnQuantizer(
-        *this, config_.mkldnn_quantizer_config());
-  return mkldnn_quantizer_->Quantize();
-#else
-  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
-  return false;
-#endif
-}
-
-void AnalysisPredictor::PrepareFeedFetch() {
-  PADDLE_ENFORCE_NOT_NULL(sub_scope_);
-  CreateFeedFetchVar(sub_scope_);
-  for (auto *op : inference_program_->Block(0).AllOps()) {
-    if (op->Type() == "feed") {
-      int idx = boost::get<int>(op->GetAttr("col"));
-      if (feeds_.size() <= static_cast<size_t>(idx)) {
-        feeds_.resize(idx + 1);
-      }
-      feeds_[idx] = op;
-      feed_names_[op->Output("Out")[0]] = idx;
-      idx2feeds_[idx] = op->Output("Out")[0];
-    } else if (op->Type() == "fetch") {
-      int idx = boost::get<int>(op->GetAttr("col"));
-      if (fetches_.size() <= static_cast<size_t>(idx)) {
-        fetches_.resize(idx + 1);
-      }
-      fetches_[idx] = op;
-      idx2fetches_[idx] = op->Input("X")[0];
-    }
-  }
-}
-
-void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
-  PADDLE_ENFORCE_NOT_NULL(scope);
-  auto *var = scope->Var("feed");
-  var->GetMutable<framework::FeedFetchList>();
-  var = scope->Var("fetch");
-  var->GetMutable<framework::FeedFetchList>();
-}
-
-std::vector<std::string> AnalysisPredictor::GetInputNames() {
-  std::vector<std::string> input_names;
-  for (auto &item : idx2feeds_) {
-    input_names.push_back(item.second);
-  }
-  return input_names;
-}
-
-std::map<std::string, std::vector<int64_t>>
-AnalysisPredictor::GetInputTensorShape() {
-  std::map<std::string, std::vector<int64_t>> input_shapes;
-  std::vector<std::string> names = GetInputNames();
-  for (std::string name : names) {
-    auto *var = inference_program_->Block(0).FindVar(name);
-    PADDLE_ENFORCE_NOT_NULL(var, "input %s does not exist.", name);
-    input_shapes[name] = var->GetShape();
-  }
-  return input_shapes;
-}
-
-std::vector<std::string> AnalysisPredictor::GetOutputNames() {
-  std::vector<std::string> output_names;
-  for (auto &item : idx2fetches_) {
-    output_names.push_back(item.second);
-  }
-  return output_names;
-}
-
-std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
-    const std::string &name) {
-  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
-  std::unique_ptr<ZeroCopyTensor> res(
-      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
-  res->input_or_output_ = true;
-  res->SetName(name);
-  if (platform::is_cpu_place(place_)) {
-    res->SetPlace(PaddlePlace::kCPU);
-  } else {
-    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
-    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
-  }
-
-  return res;
-}
-
-std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
-    const std::string &name) {
-  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
-  std::unique_ptr<ZeroCopyTensor> res(
-      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
-  res->input_or_output_ = false;
-  res->SetName(name);
-  if (platform::is_cpu_place(place_)) {
-    res->SetPlace(PaddlePlace::kCPU);
-  } else {
-    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
-    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
-  }
-  return res;
-}
-
-bool AnalysisPredictor::ZeroCopyRun() {
-  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
-  executor_->Run();
-  // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
-  tensor_array_batch_cleaner_.ResetTensorArray();
-
-  // recover the cpu_math_library_num_threads to 1, in order to avoid thread
-  // conflict when integrating it into deployment service.
-  paddle::platform::SetNumThreads(1);
-  return true;
-}
-
-bool AnalysisPredictor::LoadProgramDesc() {
-  // Initialize the inference program
-  std::string filename;
-  if (!config_.model_dir().empty()) {
-    filename = config_.model_dir() + "/__model__";
-  } else if (!config_.prog_file().empty() && !config_.params_file().empty()) {
-    // All parameters are saved in a single file.
-    // The file names should be consistent with that used
-    // in Python API `fluid.io.save_inference_model`.
-    filename = config_.prog_file();
-  } else {
-    if (config_.model_dir().empty() && config_.prog_file().empty()) {
-      LOG(ERROR)
-          << "Either model_dir or (prog_file, param_file) should be set.";
-      return false;
-    }
-    LOG(ERROR) << string::Sprintf(
-        "not valid model path '%s' or program path '%s'.", config_.model_dir(),
-        config_.params_file());
-    return false;
-  }
-
-  // Create ProgramDesc
-  framework::proto::ProgramDesc proto;
-  if (!config_.model_from_memory()) {
-    std::string pb_content;
-    // Read binary
-    std::ifstream fin(filename, std::ios::in | std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
-                   filename);
-    fin.seekg(0, std::ios::end);
-    pb_content.resize(fin.tellg());
-    fin.seekg(0, std::ios::beg);
-    fin.read(&(pb_content.at(0)), pb_content.size());
-    fin.close();
-
-    proto.ParseFromString(pb_content);
-  } else {
-    proto.ParseFromString(config_.prog_file());
-  }
-  inference_program_.reset(new framework::ProgramDesc(proto));
-  return true;
-}
-
-bool AnalysisPredictor::LoadParameters() {
-  PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
-                          "The inference program should be loaded first.");
-
-  const auto &global_block = inference_program_->MutableBlock(0);
-
-  // create a temporary program to load parameters.
-
-  std::unique_ptr<framework::ProgramDesc> load_program(
-      new framework::ProgramDesc());
-  framework::BlockDesc *load_block = load_program->MutableBlock(0);
-  std::vector<std::string> params;
-
-  for (auto *var : global_block->AllVars()) {
-    if (IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
-
-      framework::VarDesc *new_var = load_block->Var(var->Name());
-      new_var->SetShape(var->GetShape());
-      new_var->SetDataType(var->GetDataType());
-      new_var->SetType(var->GetType());
-      new_var->SetLoDLevel(var->GetLoDLevel());
-      new_var->SetPersistable(true);
-
-      if (!config_.params_file().empty()) {
-        params.push_back(new_var->Name());
-      } else {
-        // append_op
-        framework::OpDesc *op = load_block->AppendOp();
-        op->SetType("load");
-        op->SetOutput("Out", {new_var->Name()});
-        op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()});
-        op->CheckAttrs();
-      }
-    }
-  }
-
-  if (!config_.params_file().empty()) {
-    // sort paramlist to have consistent ordering
-    std::sort(params.begin(), params.end());
-    // append just the load_combine op
-    framework::OpDesc *op = load_block->AppendOp();
-    op->SetType("load_combine");
-    op->SetOutput("Out", params);
-    op->SetAttr("file_path", {config_.params_file()});
-    op->CheckAttrs();
-  }
-
-  // Use NaiveExecutor to Load parameters.
-  framework::NaiveExecutor e(place_);
-  e.Prepare(scope_.get(), *load_program, 0, false);
-  e.Run();
-  VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load";
-
-  return true;
-}
-
-#if PADDLE_WITH_TENSORRT
-bool AnalysisPredictor::SaveTrtCalibToDisk() {
-  PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
-                 "This func can be invoked only in trt mode");
-  auto &block = inference_program_->Block(0);
-  for (auto &op_desc : block.AllOps()) {
-    if (op_desc->Type() == "tensorrt_engine") {
-      std::string engine_name =
-          boost::get<std::string>(op_desc->GetAttr("engine_key"));
-      if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_name)) {
-        LOG(ERROR) << "You should run the predictor(with trt) on the real data "
-                      "to generate calibration info";
-        return false;
-      }
-      TRTCalibratorEngine *calib_engine =
-          Singleton<TRTCalibratorEngineManager>::Global().Get(engine_name);
-      LOG(INFO) << "Wait for calib threads done.";
-      calib_engine->calib_->waitAndSetDone();
-      LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot "
-                   "of time...";
-      calib_engine->thr_->join();
-      std::string calibration_table_data =
-          calib_engine->calib_->getCalibrationTableAsString();
-
-      if (calibration_table_data.empty()) {
-        LOG(ERROR) << "the calibration table is empty.";
-        return false;
-      }
-
-      std::string model_opt_cache_dir =
-          argument_.Has("model_dir")
-              ? argument_.model_dir()
-              : inference::analysis::GetDirRoot(argument_.model_program_path());
-
-      std::string calibration_table_data_path =
-          inference::analysis::GetTrtCalibPath(
-              inference::analysis::GetOrCreateModelOptCacheDir(
-                  model_opt_cache_dir),
-              engine_name);
-
-      std::ofstream ofile(calibration_table_data_path, std::ios::out);
-      LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file "
-                << calibration_table_data_path;
-      ofile << calibration_table_data;
-      ofile.close();
-    }
-  }
-  // Free all calibrator resources.
-  Singleton<TRTCalibratorEngineManager>::Global().DeleteALL();
-  return true;
-}
-#endif
-
-AnalysisPredictor::~AnalysisPredictor() {
-#if PADDLE_WITH_TENSORRT
-  if (config_.tensorrt_engine_enabled() &&
-      config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
-      Singleton<TRTCalibratorEngineManager>::Global().Has()) {
-    SaveTrtCalibToDisk();
-  }
-#endif
-  if (config_.with_profile_) {
-    platform::DisableProfiler(platform::EventSortingKey::kTotal,
-                              "./profile.log");
-  }
-  if (sub_scope_) {
-    scope_->DeleteScope(sub_scope_);
-  }
-
-#if PADDLE_WITH_MKLDNN
-  if (mkldnn_quantizer_) {
-    delete mkldnn_quantizer_;
-    mkldnn_quantizer_ = nullptr;
-  }
-#endif
-}
-
-std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
-  std::lock_guard<std::mutex> lk(clone_mutex_);
-  auto *x = new AnalysisPredictor(config_);
-  x->Init(scope_, inference_program_);
-  return std::unique_ptr<PaddlePredictor>(x);
-}
-
-std::string AnalysisPredictor::GetSerializedProgram() const {
-  return inference_program_->Proto()->SerializeAsString();
-}
-
-// Add SaveOptimModel
-void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
-  // save model
-  std::string model_name = dir + "/model";
-  std::ofstream outfile;
-  outfile.open(model_name, std::ios::out | std::ios::binary);
-  std::string inference_prog_desc = GetSerializedProgram();
-  outfile << inference_prog_desc;
-  // save params
-  framework::ProgramDesc save_program;
-  auto *save_block = save_program.MutableBlock(0);
-
-  const framework::ProgramDesc &main_program = program();
-  const framework::BlockDesc &global_block = main_program.Block(0);
-  std::vector<std::string> save_var_list;
-  for (framework::VarDesc *var : global_block.AllVars()) {
-    if (IsPersistable(var)) {
-      framework::VarDesc *new_var = save_block->Var(var->Name());
-      new_var->SetShape(var->GetShape());
-      new_var->SetDataType(var->GetDataType());
-      new_var->SetType(var->GetType());
-      new_var->SetLoDLevel(var->GetLoDLevel());
-      new_var->SetPersistable(true);
-
-      save_var_list.push_back(new_var->Name());
-    }
-  }
-  std::sort(save_var_list.begin(), save_var_list.end());
-  auto *op = save_block->AppendOp();
-  op->SetType("save_combine");
-  op->SetInput("X", save_var_list);
-  op->SetAttr("file_path", dir + "/params");
-  op->CheckAttrs();
-
-  platform::CPUPlace place;
-  framework::Executor exe(place);
-  exe.Run(save_program, scope(), 0, true, true);
-}
-
-template <>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
-    const AnalysisConfig &config) {
-  return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-      config);
-}
-
-}  // namespace paddle
-
-#if PADDLE_WITH_TENSORRT
-USE_TRT_CONVERTER(elementwise_add_weight);
-USE_TRT_CONVERTER(elementwise_add_tensor);
-USE_TRT_CONVERTER(elementwise_sub_tensor);
-USE_TRT_CONVERTER(elementwise_div_tensor);
-USE_TRT_CONVERTER(elementwise_mul_tensor);
-USE_TRT_CONVERTER(elementwise_max_tensor);
-USE_TRT_CONVERTER(elementwise_min_tensor);
-USE_TRT_CONVERTER(elementwise_pow_tensor);
-USE_TRT_CONVERTER(mul);
-USE_TRT_CONVERTER(conv2d);
-USE_TRT_CONVERTER(relu);
-USE_TRT_CONVERTER(sigmoid);
-USE_TRT_CONVERTER(tanh);
-USE_TRT_CONVERTER(fc);
-USE_TRT_CONVERTER(pool2d);
-USE_TRT_CONVERTER(softmax);
-USE_TRT_CONVERTER(batch_norm);
-USE_TRT_CONVERTER(concat);
-USE_TRT_CONVERTER(dropout);
-USE_TRT_CONVERTER(pad);
-USE_TRT_CONVERTER(split);
-USE_TRT_CONVERTER(prelu);
-USE_TRT_CONVERTER(conv2d_transpose);
-USE_TRT_CONVERTER(leaky_relu);
-USE_TRT_CONVERTER(shuffle_channel);
-USE_TRT_CONVERTER(swish);
-#endif
-
-#if PADDLE_WITH_ANAKIN
-USE_ANAKIN_CONVERTER(mul);
-USE_ANAKIN_CONVERTER(fc);
-USE_ANAKIN_CONVERTER(conv2d);
-USE_ANAKIN_CONVERTER(conv2d_fusion);
-USE_ANAKIN_CONVERTER(concat);
-USE_ANAKIN_CONVERTER(split);
-USE_ANAKIN_CONVERTER(relu);
-USE_ANAKIN_CONVERTER(sigmoid);
-USE_ANAKIN_CONVERTER(tanh);
-USE_ANAKIN_CONVERTER(pool2d);
-USE_ANAKIN_CONVERTER(elementwise_add);
-USE_ANAKIN_CONVERTER(elementwise_mul);
-USE_ANAKIN_CONVERTER(batch_norm);
-USE_ANAKIN_CONVERTER(flatten);
-USE_ANAKIN_CONVERTER(reshape);
-USE_ANAKIN_CONVERTER(transpose);
-USE_ANAKIN_CONVERTER(softmax);
-USE_ANAKIN_CONVERTER(detection_out);
-USE_ANAKIN_CONVERTER(density_prior_box);
-USE_ANAKIN_CONVERTER(dropout);
-USE_ANAKIN_CONVERTER(sum);
-USE_ANAKIN_CONVERTER(prior_box);
-USE_ANAKIN_CONVERTER(leaky_relu);
-USE_ANAKIN_CONVERTER(affine_channel);
-USE_ANAKIN_CONVERTER(relu6);
-USE_ANAKIN_CONVERTER(swish);
-USE_ANAKIN_CONVERTER(shuffle_channel);
-#endif
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
deleted file mode 100644
index 33a2e62303af776f608c3d9e272e9cd73b0d12b4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ /dev/null
@@ -1,182 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/api/api_impl.h"
-#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/string/printf.h"
-#ifdef PADDLE_WITH_TESTING
-#include <gtest/gtest.h>
-#include <gtest/gtest_prod.h>
-#endif
-namespace paddle {
-
-using inference::analysis::Argument;
-using inference::analysis::Analyzer;
-using framework::proto::ProgramDesc;
-using framework::NaiveExecutor;
-
-/** \brief This predictor is based on the original native predictor with IR and
- * Analysis support.
- *
- * It will optimize IR and Parameters in the runtime.
- *
- * TODO(Superjomn) Replace the Navive predictor?
- */
-class AnalysisPredictor : public PaddlePredictor {
- public:
-  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
-    predictor_id_ = inference::GetUniqueId();
-  }
-  ~AnalysisPredictor();
-
-  bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
-            const std::shared_ptr<framework::ProgramDesc> &program = nullptr);
-
-  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data,
-           int batch_size = -1) override;
-
-  std::vector<std::string> GetInputNames();
-  std::vector<std::string> GetOutputNames();
-
-  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
-      const std::string &name) override;
-  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
-      const std::string &name) override;
-
-  std::map<std::string, std::vector<int64_t>> GetInputTensorShape() override;
-
-  bool ZeroCopyRun() override;
-
-  void CreateFeedFetchVar(framework::Scope *scope);
-  void PrepareFeedFetch();
-
-  void PrepareArgument();
-  void OptimizeInferenceProgram();
-
-  Argument &analysis_argument() { return argument_; }
-
-  std::unique_ptr<PaddlePredictor> Clone() override;
-
-  framework::Scope *scope() { return scope_.get(); }
-  framework::ProgramDesc &program() { return *inference_program_; }
-
-  std::string GetSerializedProgram() const override;
-
-  bool MkldnnQuantize();
-
-  // save program to  model
-  // save parameters to params
-  void SaveOptimModel(const std::string &dir);
-
- protected:
-  bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
-  bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
-  bool CreateExecutor();
-  bool PrepareExecutor();
-
-  bool LoadProgramDesc();
-  bool LoadParameters();
-
-  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
-               framework::Scope *scope);
-  bool GetFetch(std::vector<PaddleTensor> *output_data,
-                framework::Scope *scope);
-  template <typename T>
-  void GetFetchOne(const framework::LoDTensor &fetchs,
-                   PaddleTensor *output_data);
-  // PreSet and PostReset for Mkldnn multi-thread and dynamic shape input.
-  // Used in AnalysisPredictor::Run(), do not support
-  // AnalysisPredictor::ZeroRun() now.
-  void MkldnnPreSet(const std::vector<PaddleTensor> &inputs);
-  void MkldnnPostReset();
-
-#if PADDLE_WITH_TENSORRT
-  // When we use Paddle-TRT INT8 engine, we need to generate calibration table
-  // data first,
-  // the calibration table contains the range for each op's input and output,
-  // this whole process can be divided into several steps:
-  //
-  // 1. Builds a 32-bit engine, runs it on the calibration set, and records a
-  // histogram for each
-  // tensor of the distribution of activation values.
-  // 2. Builds a calibration table from the histograms.
-  //
-  // After step 2, we need to store the calibration table on disk
-  bool SaveTrtCalibToDisk();
-#endif
-
-// Some more detailed tests, they are made the friends of the predictor, so that
-// the all the details can be tested.
-#if PADDLE_WITH_TESTING
-  FRIEND_TEST(AnalysisPredictor, analysis_off);
-  FRIEND_TEST(AnalysisPredictor, analysis_on);
-  FRIEND_TEST(AnalysisPredictor, with_gpu);
-#endif
-
- private:
-  AnalysisConfig config_;
-  Argument argument_;
-  std::unique_ptr<NaiveExecutor> executor_;
-  platform::Place place_;
-  std::shared_ptr<framework::Scope> scope_;
-  framework::Scope *sub_scope_{nullptr};
-  std::shared_ptr<framework::ProgramDesc> inference_program_;
-  std::vector<framework::OpDesc *> feeds_;
-  std::map<std::string, size_t> feed_names_;
-  // Sorted according to the idx.
-  std::map<size_t, std::string> idx2feeds_;
-  std::vector<framework::OpDesc *> fetches_;
-  std::map<size_t, std::string> idx2fetches_;
-
-#if PADDLE_WITH_MKLDNN
-  // Helper class to perform quantization
-  class MkldnnQuantizer;
-  MkldnnQuantizer *mkldnn_quantizer_{nullptr};
-
-#if PADDLE_WITH_TESTING
-  friend class MkldnnQuantizerTest;
-#endif
-#endif
-
-  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
-  // concurrency problems, wrong results and memory leak, so cache them.
-  std::vector<framework::LoDTensor> feed_tensors_;
-  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
-  // A mutex help to make Clone thread safe.
-  std::mutex clone_mutex_;
-
-  // For memory optimization.
-  const size_t max_shape_collect_count_{1000};
-  int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
-  std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
-  int predictor_id_;
-
- private:
-  // Some status here that help to determine the status inside the predictor.
-  bool status_is_cloned_{false};
-  bool status_use_gpu_{false};
-};
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
deleted file mode 100644
index e990b2c7736ae51a1ac2ba2fd15362012288b9bb..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ /dev/null
@@ -1,488 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
-#endif
-
-DEFINE_string(dirname, "", "dirname to tests.");
-
-namespace paddle {
-
-TEST(AnalysisPredictor, analysis_off) {
-  AnalysisConfig config;
-  config.SetModel(FLAGS_dirname);
-  config.SwitchIrOptim(false);
-
-  auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-  auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
-
-  // Without analysis, the scope_ and sub_scope_ are created by predictor
-  // itself.
-  ASSERT_TRUE(predictor->scope_);
-  ASSERT_TRUE(predictor->sub_scope_);
-  ASSERT_EQ(predictor->scope_->parent(), nullptr);
-  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
-  // ir is turned off, so program shouldn't be optimized.
-  LOG(INFO) << "scope parameters " << predictor->scope_->LocalVarNames().size();
-
-  // 2. Dummy Input Data
-  int64_t data[4] = {1, 2, 3, 4};
-  PaddleTensor tensor;
-  tensor.shape = std::vector<int>({4, 1});
-  tensor.data.Reset(data, sizeof(data));
-  tensor.dtype = PaddleDType::INT64;
-
-  std::vector<PaddleTensor> inputs(4, tensor);
-  std::vector<PaddleTensor> outputs;
-  ASSERT_TRUE(predictor->Run(inputs, &outputs));
-}
-
-TEST(AnalysisPredictor, analysis_on) {
-  AnalysisConfig config;
-  config.SetModel(FLAGS_dirname);
-  config.SwitchIrOptim(true);
-#ifdef PADDLE_WITH_CUDA
-  config.EnableUseGpu(100, 0);
-#else
-  config.DisableGpu();
-#endif
-
-  auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-  auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
-
-  ASSERT_TRUE(predictor->scope_);
-  ASSERT_TRUE(predictor->sub_scope_);
-  ASSERT_EQ(predictor->scope_->parent(), nullptr);
-  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
-  // 2. Dummy Input Data
-  int64_t data[4] = {1, 2, 3, 4};
-  PaddleTensor tensor;
-  tensor.shape = std::vector<int>({4, 1});
-  tensor.data.Reset(data, sizeof(data));
-  tensor.dtype = PaddleDType::INT64;
-
-  std::vector<PaddleTensor> inputs(4, tensor);
-  std::vector<PaddleTensor> outputs;
-  ASSERT_TRUE(predictor->Run(inputs, &outputs));
-
-  for (auto& output : outputs) {
-    LOG(INFO) << inference::DescribeTensor(output);
-  }
-
-  // compare with NativePredictor
-  auto naive_predictor =
-      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
-  std::vector<PaddleTensor> naive_outputs;
-  ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs));
-  ASSERT_EQ(naive_outputs.size(), 1UL);
-  inference::CompareTensor(outputs.front(), naive_outputs.front());
-}
-
-TEST(AnalysisPredictor, ZeroCopy) {
-  AnalysisConfig config;
-  config.SetModel(FLAGS_dirname);
-  config.SwitchUseFeedFetchOps(false);
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-
-  auto w0 = predictor->GetInputTensor("firstw");
-  auto w1 = predictor->GetInputTensor("secondw");
-  auto w2 = predictor->GetInputTensor("thirdw");
-  auto w3 = predictor->GetInputTensor("forthw");
-
-  w0->Reshape({4, 1});
-  w1->Reshape({4, 1});
-  w2->Reshape({4, 1});
-  w3->Reshape({4, 1});
-
-  auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
-  auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
-  auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
-  auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);
-
-  for (int i = 0; i < 4; i++) {
-    w0_data[i] = i;
-    w1_data[i] = i;
-    w2_data[i] = i;
-    w3_data[i] = i;
-  }
-
-  predictor->ZeroCopyRun();
-
-  auto out = predictor->GetOutputTensor("fc_1.tmp_2");
-  PaddlePlace place;
-  int size = 0;
-  auto* out_data = out->data<float>(&place, &size);
-  LOG(INFO) << "output size: " << size / sizeof(float);
-  LOG(INFO) << "output_data: " << out_data;
-}
-
-TEST(AnalysisPredictor, Clone) {
-  AnalysisConfig config;
-  config.SetModel(FLAGS_dirname);
-  config.SwitchUseFeedFetchOps(true);
-  config.SwitchIrOptim(true);
-
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreatePaddlePredictor(config));
-
-  LOG(INFO) << "************** to clone ************************";
-  const int num_threads = 3;
-  for (int i = 1; i < num_threads; i++) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
-
-  auto* root_scope =
-      static_cast<AnalysisPredictor*>(predictors[0].get())->scope();
-  ASSERT_FALSE(root_scope->kids().empty());
-  LOG(INFO) << "***** scope ******\n"
-            << framework::GenScopeTreeDebugInfo(root_scope);
-
-  // 2. Dummy Input Data
-  int64_t data[4] = {1, 2, 3, 4};
-  PaddleTensor tensor;
-  tensor.shape = std::vector<int>({4, 1});
-  tensor.data.Reset(data, sizeof(data));
-  tensor.dtype = PaddleDType::INT64;
-
-  std::vector<PaddleTensor> inputs(4, tensor);
-  std::vector<PaddleTensor> outputs;
-  predictors[0]->Run(inputs, &outputs);
-
-  LOG(INFO) << "Run with single thread";
-  for (int i = 0; i < num_threads; i++) {
-    LOG(INFO) << "run predictor " << i;
-    ASSERT_TRUE(predictors[i]->Run(inputs, &outputs));
-  }
-
-  LOG(INFO) << "Run with multiple threads";
-  std::vector<std::thread> threads;
-  for (int i = 0; i < num_threads; i++) {
-    threads.emplace_back([&predictors, &inputs, i] {
-      LOG(INFO) << "thread #" << i << " running";
-      std::vector<PaddleTensor> outputs;
-      auto predictor = predictors.front()->Clone();
-      for (int j = 0; j < 10; j++) {
-        ASSERT_TRUE(predictor->Run(inputs, &outputs));
-      }
-    });
-  }
-
-  for (auto& t : threads) {
-    t.join();
-  }
-}
-
-// This function is not released yet, will fail on some machine.
-// TODO(Superjomn) Turn on it latter.
-/*
-TEST(AnalysisPredictor, memory_optim) {
-  AnalysisConfig config(FLAGS_dirname);
-  config.DisableGpu();
-  config.EnableMemoryOptim(true);
-  config.SwitchIrDebug();
-
-  auto native_predictor =
-      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
-
-  // 2. Dummy Input Data
-  int64_t data[4] = {1, 2, 3, 4};
-  PaddleTensor tensor;
-  tensor.shape = std::vector<int>({4, 1});
-  tensor.data.Reset(data, sizeof(data));
-  tensor.dtype = PaddleDType::INT64;
-
-  std::vector<PaddleTensor> inputs(4, tensor);
-  std::vector<PaddleTensor> output, output1;
-
-  {
-    // The first predictor help to cache the memory optimize strategy.
-    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-    LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram();
-    ASSERT_FALSE(predictor->GetSerializedProgram().empty());
-
-    // Run several times to check the parameters are not reused by mistake.
-    for (int i = 0; i < 5; i++) {
-      ASSERT_TRUE(predictor->Run(inputs, &output));
-    }
-  }
-
-  {
-    output.clear();
-    // The second predictor to perform memory optimization.
-    config.EnableMemoryOptim(false);
-    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-
-    // Run with memory optimization
-    ASSERT_TRUE(predictor->Run(inputs, &output));
-  }
-
-  // Run native
-  ASSERT_TRUE(native_predictor->Run(inputs, &output1));
-
-  LOG(INFO) << "the output " << inference::DescribeTensor(output.front());
-  LOG(INFO) << "the native output "
-            << inference::DescribeTensor(output1.front());
-
-  inference::CompareResult(output, output1);
-}
-*/
-
-#ifdef PADDLE_WITH_MKLDNN
-class MkldnnQuantizerTest : public testing::Test {
- public:
-  MkldnnQuantizerTest() {
-    AnalysisConfig config(FLAGS_dirname);
-
-    predictor.reset(new AnalysisPredictor(config));
-    auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get());
-
-    auto qconfig = new MkldnnQuantizerConfig();
-
-    mkldnn_quantizer.reset(
-        new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig));
-  }
-
-  std::pair<std::vector<int>, float> Histogram(
-      const framework::LoDTensor& var_tensor, float min_val, float max_val,
-      int num_bins) const {
-    return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins);
-  }
-
-  std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
-      const framework::LoDTensor& var_tensor, bool is_unsigned) const {
-    return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned);
-  }
-
-  std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
-      const framework::LoDTensor& var_tensor, bool is_unsigned) const {
-    return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned);
-  }
-
-  std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
-      const framework::LoDTensor& var_tensor, bool is_unsigned) const {
-    return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned);
-  }
-
- protected:
-  std::unique_ptr<PaddlePredictor> predictor;
-  std::unique_ptr<AnalysisPredictor::MkldnnQuantizer> mkldnn_quantizer;
-  float abs_error = 1e-6;
-  static const std::array<float, 10> non_negative_values;
-  static const std::array<float, 10> positive_and_negative_values;
-};
-
-const std::array<float, 10> MkldnnQuantizerTest::non_negative_values = {
-    0.0158671, 0.026459,   0.0280772,  0.00962479, 0.0131628,
-    0.016704,  0.00118407, 0.00765726, 0.0123213,  0.00944741};
-const std::array<float, 10> MkldnnQuantizerTest::positive_and_negative_values =
-    {-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586,
-     -0.0495346, 0.0629528,  -0.00531285, -0.0230353,  0.0269089};
-
-TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) {
-  const auto& values = non_negative_values;
-  auto min_val = *std::min_element(values.begin(), values.end());
-  auto max_val = *std::max_element(values.begin(), values.end());
-
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize(framework::make_dim(values.size()));
-  std::copy(begin(values), end(values),
-            var_tensor.mutable_data<float>(platform::CPUPlace()));
-
-  ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3),
-               platform::EnforceNotMet);
-}
-
-TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) {
-  // all non-negative values
-  const auto& values = non_negative_values;
-  auto min_val = *std::min_element(values.begin(), values.end());
-  auto max_val = *std::max_element(values.begin(), values.end());
-
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize(framework::make_dim(values.size()));
-  std::copy(begin(values), end(values),
-            var_tensor.mutable_data<float>(platform::CPUPlace()));
-
-  std::vector<int> histogram;
-  float bin_width;
-
-  std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
-
-  ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error)
-      << "Improperly calculated bin_width.";
-
-  ASSERT_EQ(histogram[0], 4);
-  ASSERT_EQ(histogram[1], 4);
-  ASSERT_EQ(histogram[2], 2);
-}
-
-TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) {
-  const auto& values = positive_and_negative_values;
-  auto min_val = *std::min_element(values.begin(), values.end());
-  auto max_val = *std::max_element(values.begin(), values.end());
-
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize(framework::make_dim(values.size()));
-  std::copy(begin(values), end(values),
-            var_tensor.mutable_data<float>(platform::CPUPlace()));
-
-  std::vector<int> histogram;
-  float bin_width;
-
-  std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
-
-  ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error)
-      << "Improperly calculated bin_width.";
-
-  ASSERT_EQ(histogram[0], 3);
-  ASSERT_EQ(histogram[1], 5);
-  ASSERT_EQ(histogram[2], 2);
-}
-
-TEST_F(MkldnnQuantizerTest, histogram_zero_bins) {
-  const auto& values = non_negative_values;
-  auto min_val = *std::min_element(values.begin(), values.end());
-  auto max_val = *std::max_element(values.begin(), values.end());
-
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize(framework::make_dim(values.size()));
-  std::copy(begin(values), end(values),
-            var_tensor.mutable_data<float>(platform::CPUPlace()));
-
-  ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0),
-               platform::EnforceNotMet);
-}
-
-TEST_F(MkldnnQuantizerTest, histogram_empty) {
-  // empty tensor
-  ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet);
-
-  // zero tensor
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize({0});
-  var_tensor.mutable_data<double>(platform::CPUPlace());
-
-  ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet);
-}
-
-TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) {
-  const auto& values = positive_and_negative_values;
-
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize(framework::make_dim(values.size()));
-  std::copy(begin(values), end(values),
-            var_tensor.mutable_data<float>(platform::CPUPlace()));
-
-  bool is_unsigned;
-  framework::LoDTensor lod_tensor;
-
-  std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false);
-
-  ASSERT_EQ(is_unsigned, false);
-  ASSERT_EQ(lod_tensor.numel(), 1);
-  ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0899106152344, abs_error);
-}
-
-TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) {
-  const auto& values = positive_and_negative_values;
-  auto max_val = *std::max_element(values.begin(), values.end());
-
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize(framework::make_dim(values.size()));
-  std::copy(begin(values), end(values),
-            var_tensor.mutable_data<float>(platform::CPUPlace()));
-
-  bool is_unsigned;
-  framework::LoDTensor lod_tensor;
-
-  std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false);
-
-  ASSERT_EQ(is_unsigned, false);
-  ASSERT_EQ(lod_tensor.numel(), 1);
-  ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
-}
-
-TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) {
-  const auto& values = non_negative_values;
-  auto max_val = *std::max_element(values.begin(), values.end());
-
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize(framework::make_dim(values.size()));
-  std::copy(begin(values), end(values),
-            var_tensor.mutable_data<float>(platform::CPUPlace()));
-
-  bool is_unsigned;
-  framework::LoDTensor lod_tensor;
-
-  std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true);
-
-  ASSERT_EQ(is_unsigned, true);
-  ASSERT_EQ(lod_tensor.numel(), 1);
-  ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
-}
-
-TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) {
-  const auto& values = non_negative_values;
-  auto max_val = *std::max_element(values.begin(), values.end());
-  int channels = 3;
-
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size()));
-  for (int i = 0; i < channels; i++)
-    std::copy(begin(values), end(values),
-              var_tensor.mutable_data<float>(platform::CPUPlace()) +
-                  i * values.size());
-
-  bool is_unsigned;
-  framework::LoDTensor lod_tensor;
-
-  std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true);
-
-  ASSERT_EQ(is_unsigned, true);
-  ASSERT_EQ(lod_tensor.numel(), channels);
-  for (int i = 0; i < channels; i++) {
-    ASSERT_NEAR(lod_tensor.data<double>()[i], 1.0 / max_val, abs_error);
-  }
-}
-
-TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
-  const auto& values = non_negative_values;
-
-  framework::LoDTensor var_tensor;
-  var_tensor.Resize(framework::make_dim(values.size()));
-  std::copy(begin(values), end(values),
-            var_tensor.mutable_data<float>(platform::CPUPlace()));
-
-  bool is_unsigned;
-  framework::LoDTensor lod_tensor;
-
-  std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true);
-
-  ASSERT_EQ(is_unsigned, true);
-  ASSERT_EQ(lod_tensor.numel(), 1);
-  ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0252845321362, abs_error);
-}
-#endif
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
deleted file mode 100644
index ec659f1cfc62be4ee45c006467a1b238a27846e9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <sstream>
-#include "paddle/fluid/framework/commit.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_pass_builder.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-
-int PaddleDtypeSize(PaddleDType dtype) {
-  switch (dtype) {
-    case PaddleDType::FLOAT32:
-      return sizeof(float);
-    case PaddleDType::INT64:
-      return sizeof(int64_t);
-    case PaddleDType::INT32:
-      return sizeof(int32_t);
-    default:
-      assert(false);
-      return -1;
-  }
-}
-
-PaddleBuf::PaddleBuf(PaddleBuf &&other)
-    : data_(other.data_),
-      length_(other.length_),
-      memory_owned_(other.memory_owned_) {
-  other.memory_owned_ = false;
-  other.data_ = nullptr;
-  other.length_ = 0;
-}
-
-PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; }
-
-PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
-  if (!other.memory_owned_) {
-    data_ = other.data_;
-    length_ = other.length_;
-    memory_owned_ = other.memory_owned_;
-  } else {
-    Resize(other.length());
-    // if other.length() == 0 or other.data() == nullptr, then the memcpy
-    // behavior is undefined
-    if (other.length() && other.data())
-      memcpy(data_, other.data(), other.length());
-    else if (other.length())
-      PADDLE_THROW(
-          "Invalid argument, null pointer data with length %u is passed",
-          other.length());
-
-    length_ = other.length();
-    memory_owned_ = true;
-  }
-  return *this;
-}
-
-PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) {
-  // only the buffer with external memory can be copied
-  data_ = other.data_;
-  length_ = other.length_;
-  memory_owned_ = other.memory_owned_;
-  other.data_ = nullptr;
-  other.length_ = 0;
-  other.memory_owned_ = false;
-  return *this;
-}
-
-void PaddleBuf::Resize(size_t length) {
-  // Only the owned memory can be reset, the external memory can't be changed.
-  if (length_ >= length) return;
-  if (memory_owned_) {
-    Free();
-    data_ = malloc(length);
-    length_ = length;
-    memory_owned_ = true;
-  } else {
-    PADDLE_THROW("The memory is allocated externally, can not Resized");
-  }
-}
-
-void PaddleBuf::Reset(void *data, size_t length) {
-  Free();
-  memory_owned_ = false;
-  data_ = data;
-  length_ = length;
-}
-
-void PaddleBuf::Free() {
-  if (memory_owned_ && data_) {
-    PADDLE_ENFORCE_GT(length_, 0UL);
-    free(static_cast<char *>(data_));
-    data_ = nullptr;
-    length_ = 0;
-  }
-}
-
-std::string get_version() {
-  std::stringstream ss;
-  ss << "version: " << framework::paddle_version() << "\n";
-  ss << "commit: " << framework::paddle_commit() << "\n";
-  ss << "branch: " << framework::paddle_compile_branch() << "\n";
-  return ss.str();
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc
deleted file mode 100644
index 4c51c239f6d4449795fa38665495ab260277c84d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ /dev/null
@@ -1,461 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/inference/api/api_anakin_engine.h"
-#include "paddle/fluid/inference/api/paddle_api.h"
-
-#include "framework/core/net/net.h"
-#include "framework/operators/ops.h"
-#include "saber/funcs/timer.h"
-
-namespace paddle {
-
-using paddle::contrib::AnakinConfig;
-template <typename T, Precision P, OpRunType R>
-extern std::mutex PaddleInferenceAnakinPredictor<T, P, R>::mutex_;
-template <typename T, Precision P, OpRunType R>
-extern std::once_flag PaddleInferenceAnakinPredictor<T, P, R>::init_anakin_;
-
-template <typename T, Precision P, OpRunType R>
-void PaddleInferenceAnakinPredictor<T, P, R>::InitEnv() {
-  std::call_once(this->init_anakin_, [this]() {
-    anakin::Env<T>::env_init(this->config_.max_stream);
-  });
-  anakin::TargetWrapper<T>::set_device(this->config_.device_id);
-}
-template <typename T, Precision P, OpRunType R>
-void PaddleInferenceAnakinPredictor<T, P, R>::InitNet() {
-  std::unique_lock<std::mutex> lock(this->mutex_);
-  delete this->executor_p_;
-  this->executor_p_ = new anakin::Net<T, P, R>(*this->graph_p_, true);
-}
-template <typename T, Precision P, OpRunType R>
-void PaddleInferenceAnakinPredictor<T, P, R>::SetContext() {
-  this->ctx_p_ = std::make_shared<anakin::Context<T>>(
-      this->config_.device_id, this->config_.data_stream_id,
-      this->config_.compute_stream_id);
-}
-template <typename T, Precision P, OpRunType R>
-void PaddleInferenceAnakinPredictor<T, P, R>::InitGraph() {
-  this->graph_p_ =
-      std::make_shared<anakin::graph::Graph<T, anakin::Precision::FP32>>();
-  if (!this->config_.model_file.empty()) {
-    this->graph_p_->load(this->config_.model_file);
-  } else if (this->config_.model_buf_p) {
-    this->graph_p_->load(this->config_.model_buf_p,
-                         this->config_.model_buf_len);
-  } else {
-    LOG(FATAL) << "Model load error.";
-  }
-  this->input_names_ = this->graph_p_->get_ins();
-  this->output_names_ = this->graph_p_->get_outs();
-  for (auto &input_str : this->input_names_) {
-    if (this->config_.init_inputs_shape.find(input_str) ==
-        this->config_.init_inputs_shape.end()) {
-      LOG(FATAL) << input_str << " should be set in init_inputs_shape.";
-    }
-    std::vector<int> shape =
-        this->config_.init_inputs_shape.find(input_str)->second;
-    this->graph_p_->Reshape(input_str, shape);
-  }
-}
-template <typename T, Precision P, OpRunType R>
-void PaddleInferenceAnakinPredictor<T, P, R>::OptimizeGraph() {
-  if (!this->graph_p_->Optimize()) {
-    LOG(FATAL) << "Graph optimization error.";
-  }
-}
-template <typename T, Precision P, OpRunType R>
-void PaddleInferenceAnakinPredictor<T, P, R>::InitPredictor() {
-  this->InitEnv();
-  this->SetContext();
-  this->InitGraph();
-  this->OptimizeGraph();
-  this->InitNet();
-}
-template <typename T, Precision P, OpRunType R>
-void PaddleInferenceAnakinPredictor<T, P, R>::Predict(int batch_size) {
-  anakin::TargetWrapper<T>::device_sync();
-  this->executor_p_->prediction();
-  anakin::TargetWrapper<T>::device_sync();
-}
-template <typename T, Precision P, OpRunType R>
-bool PaddleInferenceAnakinPredictor<T, P, R>::Run(
-    const std::vector<PaddleTensor> &inputs,
-    std::vector<PaddleTensor> *output_data, int batch_size) {
-  if (this->config_.re_allocable) {
-    return this->RunImpl(inputs, output_data, batch_size);
-  } else {
-    // Run inputs data that exceeds batch size in batches.
-    // 1. Reassign the batch size.
-    if (batch_size == -1) {
-      if (!inputs[0].lod.empty()) {
-        batch_size = inputs[0].lod[0].size() - 1;
-      } else {
-        batch_size = inputs[0].shape[0];
-      }
-    }
-    // 2. If the data don't need to be batched, run it directly.
-    if (batch_size <= this->config_.init_batch_size) {
-      return this->RunImpl(inputs, output_data);
-    }
-    // 3. Check the batch size and define temporary variables.
-    std::vector<PaddleTensor> cur_inputs;
-    std::vector<PaddleTensor> outputs_master;
-    std::vector<std::vector<paddle::PaddleTensor>> outputs_vec;
-    for (const auto &input : inputs) {
-      if (!input.lod.empty()) {
-        if (input.lod.size() != 1) {
-          return false;
-        }
-        if (input.lod[0].size() - 1 != batch_size) {
-          return false;
-        }
-      } else {
-        LOG(INFO) << "Non-lod mode to be implemented.";
-        return false;
-      }
-      PaddleTensor tensor;
-      tensor.name = input.name;
-      tensor.dtype = PaddleDType::FLOAT32;
-      cur_inputs.push_back(tensor);
-    }
-    for (auto output : *output_data) {
-      PaddleTensor tensor;
-      tensor.name = output.name;
-      outputs_master.push_back(tensor);
-    }
-    // 4. Batch execution.
-    for (size_t start_batch = 0; start_batch < batch_size;) {
-      auto end_batch = start_batch + this->config_.init_batch_size;
-      if (end_batch > batch_size) {
-        end_batch = batch_size;
-      }
-      auto cur_outputs = outputs_master;
-      for (size_t i = 0; i < inputs.size(); i++) {
-        auto start = inputs[i].lod[0][start_batch];
-        auto end = inputs[i].lod[0][end_batch];
-        std::vector<size_t> offsets;
-        for (size_t j = start_batch; j <= end_batch; j++) {
-          offsets.push_back(inputs[i].lod[0][j] -
-                            inputs[i].lod[0][start_batch]);
-        }
-        auto mem_start = static_cast<float *>(inputs[i].data.data()) + start;
-        cur_inputs[i].data =
-            PaddleBuf(mem_start, (end - start) * sizeof(float));
-        cur_inputs[i].lod = std::vector<std::vector<size_t>>({offsets});
-        cur_inputs[i].shape =
-            std::vector<int>({static_cast<int>(end - start), 1, 1, 1});
-      }
-      if (!this->RunImpl(cur_inputs, &cur_outputs)) {
-        return false;
-      }
-      outputs_vec.push_back(cur_outputs);
-      start_batch = end_batch;
-    }
-    // 5. Copy the results to contiguous memory.
-    // Assume that each batch has the same final outputs size.
-    auto count = [](const std::vector<int> &v) {
-      int cnt = 1;
-      for_each(v.begin(), v.end(), [&cnt](int n) { cnt *= n; });
-      return cnt;
-    };
-    for (size_t i = 0; i < output_data->size(); i++) {
-      std::vector<int> shape = outputs_vec[i][0].shape;
-      shape[0] = batch_size;
-      int total_cnt = count(shape);
-      (*output_data)[i].shape = shape;
-      (*output_data)[i].data.Resize(total_cnt * sizeof(float));
-      float *addr = static_cast<float *>((*output_data)[i].data.data());
-      for (const auto &single_out : outputs_vec) {
-        int cnt = count(single_out[i].shape);
-        memcpy(addr, single_out[i].data.data(), cnt * sizeof(float));
-        addr += cnt;
-      }
-    }
-  }
-  return true;
-}
-template <typename T, Precision P, OpRunType R>
-bool PaddleInferenceAnakinPredictor<T, P, R>::RunImpl(
-    const std::vector<PaddleTensor> &inputs,
-    std::vector<PaddleTensor> *output_data, int batch_size) {
-  anakin::TargetWrapper<T>::set_device(this->config_.device_id);
-  for (const auto &input : inputs) {
-    if (input.dtype != PaddleDType::FLOAT32) {
-      LOG(FATAL) << "Only support float type inputs. " << input.name
-                 << "'s type is not float";
-    }
-    auto d_tensor_p = this->executor_p_->get_in(input.name);
-    auto net_shape = d_tensor_p->valid_shape();
-    if (net_shape.size() != input.shape.size()) {
-      LOG(FATAL) << " input  " << input.name
-                 << "'s shape size should be equal to that of net";
-    }
-#ifndef ANAKIN_MLU_PLACE
-    int sum = 1;
-    for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
-    if (sum > net_shape.count()) {
-      if (this->config_.re_allocable) {
-        this->graph_p_->Reshape(input.name, input.shape);
-        this->InitNet();
-        d_tensor_p = this->executor_p_->get_in(input.name);
-      } else {
-        LOG(FATAL)
-            << "Run failed because Anakin was expected not to reallocate "
-               "memory.";
-      }
-    }
-#endif
-    std::vector<int> tmp_shape;
-    for (auto s : input.shape) {
-      tmp_shape.push_back(s);
-    }
-    auto *data = static_cast<float *>(input.data.data());
-    anakin::saber::Tensor<typename anakin::DefaultHostType<T>::Host_type>
-        h_tensor(data, typename anakin::DefaultHostType<T>::Host_type(), 0,
-                 tmp_shape);
-#ifndef ANAKIN_MLU_PLACE
-    d_tensor_p->reshape(tmp_shape);
-#endif
-    if (input.lod.size() > 0) {
-      if (input.lod.size() > 1) {
-        LOG(FATAL) << " input lod first dim should <=1, but you set "
-                   << input.lod.size();
-      }
-      std::vector<int> lod(input.lod[0].begin(), input.lod[0].end());
-      std::vector<std::vector<int>> offset({lod});
-      d_tensor_p->set_seq_offset(offset);
-      VLOG(3) << "offset.size(): " << offset[0].size();
-      for (int i = 0; i < offset[0].size(); i++) {
-        VLOG(3) << offset[0][i];
-      }
-    }
-    d_tensor_p->copy_from(h_tensor);
-  }
-  this->Predict(batch_size);
-  if (output_data->empty()) {
-    LOG(FATAL) << "The output param in the Run function is incorrect.";
-  }
-  for (auto &output : *output_data) {
-    if (std::find(this->output_names_.begin(), this->output_names_.end(),
-                  output.name) == this->output_names_.end()) {
-      LOG(FATAL) << output.name << " is not in the outputs of the graph.";
-    }
-    auto *d_tensor_p = this->executor_p_->get_out(output.name);
-    auto tmp_shape = d_tensor_p->valid_shape();
-#ifdef ANAKIN_MLU_PLACE
-    tmp_shape.set_num(batch_size);
-#endif
-    output.shape = tmp_shape;
-    if (output.data.length() < tmp_shape.count() * sizeof(float)) {
-      output.data.Resize(tmp_shape.count() * sizeof(float));
-    }
-    auto *data = static_cast<float *>(output.data.data());
-    anakin::saber::Tensor<typename anakin::DefaultHostType<T>::Host_type>
-        h_tensor(data, typename anakin::DefaultHostType<T>::Host_type(), 0,
-                 tmp_shape);
-    h_tensor.copy_from(*d_tensor_p);
-  }
-  return true;
-}
-template <typename T, Precision P, OpRunType R>
-bool PaddleInferenceAnakinPredictor<T, P, R>::Reset(
-    PaddleInferenceAnakinPredictor<T, P, R> *predictor) {
-  this->config_ = predictor->GetConfig();
-  this->graph_p_ = predictor->GetGraph();
-  this->input_names_ = predictor->GetInputNames();
-  this->output_names_ = predictor->GetOutputNames();
-  this->ctx_p_ = std::make_shared<anakin::Context<T>>(
-      this->config_.device_id, this->config_.data_stream_id,
-      this->config_.compute_stream_id);
-  this->InitNet();
-  return true;
-}
-template <typename T, Precision P, OpRunType R>
-std::unique_ptr<PaddlePredictor>
-PaddleInferenceAnakinPredictor<T, P, R>::New() {
-  return std::unique_ptr<PaddlePredictor>(
-      new PaddleInferenceAnakinPredictor<T, P, R>());
-}
-// the cloned new Predictor of anakin share the same net weights from original
-// Predictor
-template <typename T, Precision P, OpRunType R>
-std::unique_ptr<PaddlePredictor>
-PaddleInferenceAnakinPredictor<T, P, R>::Clone() {
-  VLOG(3) << "Anakin Predictor::clone";
-  std::unique_ptr<PaddlePredictor> cls = std::move(this->New());
-  auto anakin_predictor_p =
-      dynamic_cast<PaddleInferenceAnakinPredictor<T, P, R> *>(cls.get());
-  if (!anakin_predictor_p) {
-    LOG(FATAL) << "fail to call Init";
-  }
-  anakin_predictor_p->Reset(this);
-  return cls;
-}
-
-#ifdef ANAKIN_MLU_PLACE
-template <Precision P, OpRunType R>
-std::unique_ptr<PaddlePredictor>
-PaddleInferenceAnakinMLUPredictor<P, R>::New() {
-  return std::unique_ptr<PaddlePredictor>(
-      new PaddleInferenceAnakinMLUPredictor<P, R>());
-}
-template <Precision P, OpRunType R>
-void PaddleInferenceAnakinMLUPredictor<P, R>::SetContext() {
-  this->ctx_p_ = std::make_shared<anakin::Context<anakin::MLU>>(
-      this->config_.device_id, this->config_.data_stream_id,
-      this->config_.compute_stream_id);
-  this->ctx_p_->set_model_parallel(this->config_.model_parallel);
-  this->ctx_p_->set_fusion(this->config_.op_fuse);
-  this->ctx_p_->enable_batch_changable();
-  this->ctx_p_->enable_channel_duplicate();
-}
-template <Precision P, OpRunType R>
-void PaddleInferenceAnakinMLUPredictor<P, R>::OptimizeGraph() {
-  if (!this->graph_p_->fusion_optimize(this->config_.op_fuse)) {
-    LOG(FATAL) << "Graph optimization error.";
-  }
-}
-template <Precision P, OpRunType R>
-void PaddleInferenceAnakinMLUPredictor<P, R>::InitNet() {
-  std::unique_lock<std::mutex> lock(this->mutex_);
-  delete this->executor_p_;
-  this->executor_p_ = new anakin::Net<anakin::MLU, P, R>();
-  this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true);
-}
-template <Precision P, OpRunType R>
-void PaddleInferenceAnakinMLUPredictor<P, R>::Predict(int batch_size) {
-  this->executor_p_->fusion_prediction(batch_size);
-}
-#endif
-
-#ifdef ANAKIN_BM_PLACE
-template <Precision P, OpRunType R>
-std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinBMPredictor<P, R>::New() {
-  return std::unique_ptr<PaddlePredictor>(
-      new PaddleInferenceAnakinBMPredictor<P, R>());
-}
-template <Precision P, OpRunType R>
-void PaddleInferenceAnakinBMPredictor<P, R>::OptimizeGraph() {
-  if (!this->graph_p_->fusion_optimize()) {
-    LOG(FATAL) << "Graph optimization error.";
-  }
-}
-template <Precision P, OpRunType R>
-void PaddleInferenceAnakinBMPredictor<P, R>::InitNet() {
-  std::unique_lock<std::mutex> lock(this->mutex_);
-  delete this->executor_p_;
-  this->executor_p_ = new anakin::Net<anakin::BM, P, R>();
-  this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true);
-}
-template <Precision P, OpRunType R>
-void PaddleInferenceAnakinBMPredictor<P, R>::Predict(int batch_size) {
-  this->executor_p_->fusion_prediction();
-}
-#endif
-
-#ifdef PADDLE_WITH_CUDA
-template class PaddleInferenceAnakinPredictor<
-    anakin::NV, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>;
-#endif
-#ifdef ANAKIN_X86_PLACE
-template class PaddleInferenceAnakinPredictor<
-    anakin::X86, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>;
-#endif
-#ifdef ANAKIN_MLU_PLACE
-template class PaddleInferenceAnakinMLUPredictor<anakin::Precision::FP32,
-                                                 ::anakin::OpRunType::SYNC>;
-#endif
-#ifdef ANAKIN_BM_PLACE
-template class PaddleInferenceAnakinBMPredictor<anakin::Precision::FP32,
-                                                ::anakin::OpRunType::ASYNC>;
-#endif
-
-// A factory to help create difference predictor.
-template <>
-std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
-    const contrib::AnakinConfig &config) {
-#ifdef PADDLE_WITH_CUDA
-  if (config.target_type == contrib::AnakinConfig::NVGPU) {
-    return std::unique_ptr<PaddlePredictor>(
-        new PaddleInferenceAnakinPredictor<anakin::NV, anakin::Precision::FP32,
-                                           ::anakin::OpRunType::ASYNC>(config));
-  }
-#endif
-#ifdef ANAKIN_X86_PLACE
-  if (config.target_type == contrib::AnakinConfig::X86) {
-    return std::unique_ptr<PaddlePredictor>(
-        new PaddleInferenceAnakinPredictor<anakin::X86, anakin::Precision::FP32,
-                                           ::anakin::OpRunType::ASYNC>(config));
-  }
-#endif
-#ifdef ANAKIN_MLU_PLACE
-  if (config.target_type == contrib::AnakinConfig::MLU) {
-    return std::unique_ptr<PaddlePredictor>(
-        new PaddleInferenceAnakinMLUPredictor<anakin::Precision::FP32,
-                                              ::anakin::OpRunType::SYNC>(
-            config));
-  }
-#endif
-#ifdef ANAKIN_BM_PLACE
-  if (config.target_type == contrib::AnakinConfig::BM) {
-    return std::unique_ptr<PaddlePredictor>(
-        new PaddleInferenceAnakinBMPredictor<anakin::Precision::FP32,
-                                             ::anakin::OpRunType::ASYNC>(
-            config));
-  }
-#endif
-  LOG(FATAL) << "Anakin Predictor create on unknown platform: "
-             << config.target_type;
-  return nullptr;
-}
-template <typename T, Precision P, OpRunType R>
-void DisplayOpTimer(anakin::Net<T, P, R> *net_executor, int epoch) {
-#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER
-  std::vector<float> op_time = net_executor->get_op_time();
-  auto exec_funcs = net_executor->get_exec_funcs();
-  auto op_param = net_executor->get_op_param();
-  for (int i = 0; i < op_time.size(); i++) {
-    LOG(INFO) << "name: " << exec_funcs[i].name
-              << " op_type: " << exec_funcs[i].op_name
-              << " op_param: " << op_param[i] << " time " << op_time[i] / epoch;
-  }
-  std::map<std::string, float> op_map;
-  for (int i = 0; i < op_time.size(); i++) {
-    auto it = op_map.find(op_param[i]);
-    if (it != op_map.end())
-      op_map[op_param[i]] += op_time[i];
-    else
-      op_map.insert(std::pair<std::string, float>(op_param[i], op_time[i]));
-  }
-  for (auto it = op_map.begin(); it != op_map.end(); ++it) {
-    LOG(INFO) << it->first << "  " << (it->second) / epoch << " ms";
-  }
-#endif
-}
-template <typename T, Precision P, OpRunType R>
-PaddleInferenceAnakinPredictor<T, P, R>::~PaddleInferenceAnakinPredictor() {
-  DisplayOpTimer<T, P, R>(this->executor_p_, this->config_.init_batch_size);
-  delete this->executor_p_;
-  this->executor_p_ = nullptr;
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h
deleted file mode 100644
index 97fc00610e05d4362d705a13a45ee6a3e5d39ffe..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file contains the implementation of inference API with Anakin engine
- * embeded, this API can only support Anakin models.
- */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "framework/core/net/net.h"
-#include "framework/graph/graph.h"
-#include "paddle/fluid/inference/api/paddle_anakin_config.h"
-#include "saber/core/shape.h"
-#include "saber/saber_types.h"
-
-namespace paddle {
-
-using contrib::AnakinConfig;
-using anakin::Precision;
-using anakin::OpRunType;
-
-template <typename T, Precision P, OpRunType R>
-class PaddleInferenceAnakinPredictor : public PaddlePredictor {
- public:
-  PaddleInferenceAnakinPredictor() = default;
-
-  explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config)
-      : config_(config) {
-    this->InitPredictor();
-  }
-
-  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
-  // should be allocated first.
-  bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data,
-           int batch_size = -1) override;
-
-  std::unique_ptr<PaddlePredictor> Clone() override;
-  bool Reset(PaddleInferenceAnakinPredictor<T, P, R>* predictor);
-  void InitPredictor();
-  std::shared_ptr<anakin::graph::Graph<T, P>> GetGraph() {
-    return this->graph_p_;
-  }
-  std::vector<std::string> GetInputNames() override {
-    return this->input_names_;
-  }
-  std::vector<std::string> GetOutputNames() override {
-    return this->output_names_;
-  }
-  const AnakinConfig& GetConfig() const { return this->config_; }
-
-  ~PaddleInferenceAnakinPredictor() override;
-
- protected:
-  void InitEnv();
-  void InitGraph();
-  virtual void OptimizeGraph();
-  virtual void InitNet();
-  virtual void SetContext();
-  virtual void Predict(int batch_size);
-  virtual std::unique_ptr<PaddlePredictor> New();
-  static std::mutex mutex_;
-  AnakinConfig config_;
-  std::shared_ptr<anakin::Context<T>> ctx_p_;
-  std::shared_ptr<anakin::graph::Graph<T, P>> graph_p_;
-  anakin::Net<T, P, R>* executor_p_{nullptr};
-  std::vector<std::string> input_names_;
-  std::vector<std::string> output_names_;
-
- private:
-  bool RunImpl(const std::vector<PaddleTensor>& inputs,
-               std::vector<PaddleTensor>* output_data, int batch_size = -1);
-  static std::once_flag init_anakin_;
-};
-
-#ifdef ANAKIN_MLU_PLACE
-template <Precision P, OpRunType R>
-class PaddleInferenceAnakinMLUPredictor final
-    : public PaddleInferenceAnakinPredictor<anakin::MLU, P, R> {
- public:
-  PaddleInferenceAnakinMLUPredictor() = default;
-  explicit PaddleInferenceAnakinMLUPredictor(const AnakinConfig& config) {
-    this->config_ = config;
-    this->InitPredictor();
-  }
-  std::unique_ptr<PaddlePredictor> New() override;
-  void SetContext() override;
-  void OptimizeGraph() override;
-  void InitNet() override;
-  void Predict(int batch_size) override;
-};
-#endif
-
-#ifdef ANAKIN_BM_PLACE
-template <Precision P, OpRunType R>
-class PaddleInferenceAnakinBMPredictor final
-    : public PaddleInferenceAnakinPredictor<anakin::BM, P, R> {
- public:
-  PaddleInferenceAnakinBMPredictor() = default;
-  explicit PaddleInferenceAnakinBMPredictor(const AnakinConfig& config) {
-    this->config_ = config;
-    this->InitPredictor();
-  }
-  std::unique_ptr<PaddlePredictor> New() override;
-  void OptimizeGraph() override;
-  void InitNet() override;
-  void Predict(int batch_size) override;
-};
-#endif
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
deleted file mode 100644
index 628817c6f4614026566f74510426efb65f740ea5..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api_impl.cc
+++ /dev/null
@@ -1,341 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <map>
-#include <set>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/inference/api/api_impl.h"
-#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_bool(profile, false, "Turn on profiler for fluid");
-
-namespace paddle {
-namespace {
-using paddle::inference::Timer;
-
-template <class T>
-std::string num2str(T a) {
-  std::stringstream istr;
-  istr << a;
-  return istr.str();
-}
-}  // namespace
-
-void NativePaddlePredictor::PrepareFeedFetch() {
-  for (auto *op : inference_program_->Block(0).AllOps()) {
-    if (op->Type() == "feed") {
-      int idx = boost::get<int>(op->GetAttr("col"));
-      if (feeds_.size() <= static_cast<size_t>(idx)) {
-        feeds_.resize(idx + 1);
-      }
-      feeds_[idx] = op;
-      feed_names_[op->Output("Out")[0]] = idx;
-    } else if (op->Type() == "fetch") {
-      int idx = boost::get<int>(op->GetAttr("col"));
-      if (fetchs_.size() <= static_cast<size_t>(idx)) {
-        fetchs_.resize(idx + 1);
-      }
-      fetchs_[idx] = op;
-    }
-  }
-}
-
-bool NativePaddlePredictor::Init(
-    std::shared_ptr<framework::Scope> parent_scope) {
-  VLOG(3) << "Predictor::init()";
-  if (FLAGS_profile) {
-    LOG(WARNING) << "Profiler is actived, might affect the performance";
-    LOG(INFO) << "You can turn off by set gflags '-profile false'";
-
-    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
-                                           : platform::ProfilerState::kCPU;
-    platform::EnableProfiler(tracking_device);
-  }
-
-  // no matter with or without MKLDNN
-  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
-
-  if (config_.use_gpu) {
-    place_ = paddle::platform::CUDAPlace(config_.device);
-  } else {
-    place_ = paddle::platform::CPUPlace();
-  }
-  if (parent_scope) {
-    scope_ = parent_scope;
-    sub_scope_ = &(parent_scope->NewScope());
-    PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail");
-  } else {
-    paddle::framework::InitDevices(false);
-    scope_.reset(new paddle::framework::Scope());
-  }
-
-  executor_.reset(new paddle::framework::Executor(place_));
-
-  // Initialize the inference program
-  if (!config_.model_dir.empty()) {
-    // Parameters are saved in separate files sited in
-    // the specified `dirname`.
-    inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
-                                                 config_.model_dir);
-  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
-    // All parameters are saved in a single file.
-    // The file names should be consistent with that used
-    // in Python API `fluid.io.save_inference_model`.
-    inference_program_ = paddle::inference::Load(
-        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
-  } else {
-    LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
-    return false;
-  }
-
-  ctx_ = executor_->Prepare(*inference_program_, 0);
-  executor_->CreateVariables(*inference_program_,
-                             sub_scope_ ? sub_scope_ : scope_.get(), 0);
-
-  // Get the feed_target_names and fetch_target_names
-  PrepareFeedFetch();
-  return true;
-}
-
-NativePaddlePredictor::~NativePaddlePredictor() {
-  if (FLAGS_profile) {
-    platform::DisableProfiler(platform::EventSortingKey::kTotal,
-                              "./profile.log");
-  }
-  if (sub_scope_) {
-    scope_->DeleteScope(sub_scope_);
-  }
-}
-
-bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
-                                std::vector<PaddleTensor> *output_data,
-                                int batch_size) {
-  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
-    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
-  }
-  VLOG(3) << "Predictor::predict";
-  Timer timer;
-  timer.tic();
-  // set feed variable
-  framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
-  if (!SetFeed(inputs, scope)) {
-    LOG(ERROR) << "fail to set feed";
-    return false;
-  }
-  // Run the inference program
-  // if share variables, we need not create variables
-  VLOG(4) << "Run prepared context";
-  executor_->RunPreparedContext(ctx_.get(), scope,
-                                false, /* don't create local scope each time*/
-                                false /* don't create variable each time */);
-  VLOG(4) << "Finish prepared context";
-  // get fetch variable
-  if (!GetFetch(output_data, scope)) {
-    LOG(ERROR) << "fail to get fetches";
-    return false;
-  }
-  VLOG(3) << "predict cost: " << timer.toc() << "ms";
-
-  // For some other vector like containers not cleaned after each batch.
-  tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
-  tensor_array_batch_cleaner_.ResetNoTensorVars();
-  return true;
-}
-
-std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
-  std::lock_guard<std::mutex> lk(clone_mutex_);
-  VLOG(3) << "Predictor::clone";
-  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
-  // Hot fix the bug that result diff in multi-thread.
-  // TODO(Superjomn) re-implement a real clone here.
-  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<NativePaddlePredictor *>(cls.get()));
-  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
-    LOG(ERROR) << "fail to call Init";
-    return nullptr;
-  }
-
-#ifdef __clang__
-  // fix clang compile error
-  return cls;
-#else
-  // fix manylinux compile error.
-  return std::move(cls);
-#endif
-}
-
-bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
-                                    framework::Scope *scope) {
-  VLOG(3) << "Predictor::set_feed";
-  if (inputs.size() != feeds_.size()) {
-    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
-               << inputs.size();
-    return false;
-  }
-
-  // Cache the inputs memory for better concurrency performance.
-  feed_tensors_.resize(inputs.size());
-
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    auto &input = feed_tensors_[i];
-    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
-    void *input_ptr;
-    if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, place_);
-    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, place_);
-    } else if (inputs[i].dtype == PaddleDType::INT32) {
-      input_ptr = input.mutable_data<int32_t>(ddim, place_);
-    } else {
-      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
-      return false;
-    }
-
-    PADDLE_ENFORCE_NOT_NULL(input_ptr);
-    PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
-    if (platform::is_cpu_place(place_)) {
-      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                  inputs[i].data.length());
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      platform::DeviceContextPool &pool =
-          platform::DeviceContextPool::Instance();
-      auto *dev_ctx =
-          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
-      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
-      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
-                   platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(), dev_ctx->stream());
-#else
-      PADDLE_THROW("Not compile with CUDA, should not reach here.");
-#endif
-    }
-
-    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
-    framework::LoD lod;
-    for (auto &level : inputs[i].lod) {
-      lod.emplace_back(level);
-    }
-    input.set_lod(lod);
-    int idx = -1;
-    if (config_.specify_input_name) {
-      idx = feed_names_[inputs[i].name];
-    } else {
-      idx = boost::get<int>(feeds_[i]->GetAttr("col"));
-    }
-    framework::SetFeedVariable(scope, input, "feed", idx);
-  }
-  return true;
-}
-template <typename T>
-void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
-                                        PaddleTensor *output) {
-  // set shape.
-  auto shape = framework::vectorize(fetch.dims());
-  output->shape.assign(shape.begin(), shape.end());
-  // set data.
-  const T *data = fetch.data<T>();
-  int num_elems = inference::VecReduceToInt(shape);
-  output->data.Resize(num_elems * sizeof(T));
-  // The fetched tensor output by fetch op, should always in CPU memory, so just
-  // copy.
-  memcpy(output->data.data(), data, num_elems * sizeof(T));
-  // set lod
-  output->lod.clear();
-  for (auto &level : fetch.lod()) {
-    output->lod.emplace_back(level.begin(), level.end());
-  }
-}
-
-bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
-                                     framework::Scope *scope) {
-  VLOG(3) << "Predictor::get_fetch";
-  outputs->resize(fetchs_.size());
-  for (size_t i = 0; i < fetchs_.size(); ++i) {
-    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
-    PADDLE_ENFORCE((size_t)idx == i);
-    framework::LoDTensor &fetch =
-        framework::GetFetchVariable(*scope, "fetch", idx);
-    auto type = fetch.type();
-    auto output = &(outputs->at(i));
-    output->name = fetchs_[idx]->Input("X")[0];
-    if (type == framework::DataTypeTrait<float>::DataType()) {
-      GetFetchOne<float>(fetch, output);
-      output->dtype = PaddleDType::FLOAT32;
-    } else if (type == framework::DataTypeTrait<int64_t>::DataType()) {
-      GetFetchOne<int64_t>(fetch, output);
-      output->dtype = PaddleDType::INT64;
-    } else if (type == framework::DataTypeTrait<int32_t>::DataType()) {
-      GetFetchOne<int32_t>(fetch, output);
-      output->dtype = PaddleDType::INT32;
-    } else {
-      LOG(ERROR) << "unknown type, only support float32, int64 and int32 now.";
-    }
-  }
-  return true;
-}
-
-template <>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
-    NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
-  VLOG(3) << "create NativePaddlePredictor";
-  if (config.use_gpu) {
-    // 1. GPU memory
-    PADDLE_ENFORCE_GE(
-        config.fraction_of_gpu_memory, 0.f,
-        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
-    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
-    std::vector<std::string> flags;
-    if (config.fraction_of_gpu_memory >= 0.0f ||
-        config.fraction_of_gpu_memory <= 0.95f) {
-      flags.push_back("dummpy");
-      std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         num2str<float>(config.fraction_of_gpu_memory);
-      flags.push_back(flag);
-      VLOG(3) << "set flag: " << flag;
-      framework::InitGflags(flags);
-    }
-  }
-
-  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
-  PADDLE_ENFORCE_NOT_NULL(
-      dynamic_cast<NativePaddlePredictor *>(predictor.get()));
-  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
-    return nullptr;
-  }
-#ifdef __clang__
-  // fix clang compile error
-  return predictor;
-#else
-  return std::move(predictor);
-#endif
-}
-
-template <>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
-    const NativeConfig &config) {
-  return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
deleted file mode 100644
index 96b94777304382a9d4be115a84f80ead69249863..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api_impl.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-
-class NativePaddlePredictor : public PaddlePredictor {
- public:
-  explicit NativePaddlePredictor(const NativeConfig &config)
-      : config_(config) {}
-
-  // will only create sub scope if have global scope
-  bool Init(std::shared_ptr<framework::Scope> parent_scope);
-
-  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data,
-           int batch_size = -1) override;
-
-  std::unique_ptr<PaddlePredictor> Clone() override;
-
-  ~NativePaddlePredictor() override;
-
-  framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); }
-
- protected:
-  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
-               framework::Scope *scope);
-  bool GetFetch(std::vector<PaddleTensor> *output_data,
-                framework::Scope *scope);
-  template <typename T>
-  void GetFetchOne(const framework::LoDTensor &fetchs,
-                   PaddleTensor *output_data);
-  void PrepareFeedFetch();
-
-  NativeConfig config_;
-  platform::Place place_;
-  std::unique_ptr<framework::Executor> executor_;
-  std::shared_ptr<framework::Scope> scope_;
-  std::unique_ptr<framework::ExecutorPrepareContext> ctx_;
-  std::unique_ptr<framework::ProgramDesc> inference_program_;
-  std::vector<framework::OpDesc *> feeds_;
-  std::map<std::string, size_t> feed_names_;
-  std::vector<framework::OpDesc *> fetchs_;
-  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
-  // concurrency problems, wrong results and memory leak, so cache them.
-  std::vector<framework::LoDTensor> feed_tensors_;
-  // Do not use unique_ptr, use parent scope to delete
-  framework::Scope *sub_scope_{nullptr};
-  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
-  // A mutex to make Clone thread safe.
-  std::mutex clone_mutex_;
-};
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
deleted file mode 100644
index c80187adfa721fb0a81652ae59556ad4ad9a3e88..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include <thread>  // NOLINT
-
-#include "gflags/gflags.h"
-#include "paddle/fluid/inference/api/api_impl.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-
-#ifdef __clang__
-#define ACC_DIFF 4e-3
-#else
-#define ACC_DIFF 1e-3
-#endif
-
-DEFINE_string(word2vec_dirname, "",
-              "Directory of the word2vec inference model.");
-DEFINE_string(book_dirname, "", "Directory of the book inference model.");
-
-namespace paddle {
-
-PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
-  PaddleTensor pt;
-
-  if (t->type() == framework::proto::VarType::INT64) {
-    pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
-    pt.dtype = PaddleDType::INT64;
-  } else if (t->type() == framework::proto::VarType::FP32) {
-    pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
-    pt.dtype = PaddleDType::FLOAT32;
-  } else if (t->type() == framework::proto::VarType::INT32) {
-    pt.data.Reset(t->data<void>(), t->numel() * sizeof(int32_t));
-    pt.dtype = PaddleDType::INT32;
-  } else {
-    LOG(FATAL) << "unsupported type.";
-  }
-  pt.shape = framework::vectorize<int>(t->dims());
-  return pt;
-}
-
-NativeConfig GetConfig() {
-  NativeConfig config;
-  config.model_dir = FLAGS_word2vec_dirname;
-  LOG(INFO) << "dirname  " << config.model_dir;
-  config.fraction_of_gpu_memory = 0.15;
-#ifdef PADDLE_WITH_CUDA
-  config.use_gpu = true;
-#else
-  config.use_gpu = false;
-#endif
-  config.device = 0;
-  return config;
-}
-
-void MainWord2Vec(bool use_gpu) {
-  NativeConfig config = GetConfig();
-  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
-  config.use_gpu = use_gpu;
-
-  framework::LoDTensor first_word, second_word, third_word, fourth_word;
-  framework::LoD lod{{0, 1}};
-  int64_t dict_size = 2073;  // The size of dictionary
-
-  SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
-
-  std::vector<PaddleTensor> paddle_tensor_feeds;
-  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&first_word));
-  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&second_word));
-  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&third_word));
-  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&fourth_word));
-
-  std::vector<PaddleTensor> outputs;
-  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
-  ASSERT_EQ(outputs.size(), 1UL);
-  size_t len = outputs[0].data.length();
-  float* data = static_cast<float*>(outputs[0].data.data());
-  for (size_t j = 0; j < len / sizeof(float); ++j) {
-    ASSERT_LT(data[j], 1.0);
-    ASSERT_GT(data[j], -1.0);
-  }
-
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&first_word);
-  cpu_feeds.push_back(&second_word);
-  cpu_feeds.push_back(&third_word);
-  cpu_feeds.push_back(&fourth_word);
-
-  framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
-
-  float* lod_data = output1.data<float>();
-  for (int i = 0; i < output1.numel(); ++i) {
-    EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
-    EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
-  }
-}
-
-void MainImageClassification(bool use_gpu) {
-  int batch_size = 2;
-  bool repeat = false;
-  NativeConfig config = GetConfig();
-  config.use_gpu = use_gpu;
-  config.model_dir =
-      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
-
-  const bool is_combined = false;
-  std::vector<std::vector<int64_t>> feed_target_shapes =
-      GetFeedTargetShapes(config.model_dir, is_combined);
-
-  framework::LoDTensor input;
-  // Use normilized image pixels as input data,
-  // which should be in the range [0.0, 1.0].
-  feed_target_shapes[0][0] = batch_size;
-  framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
-  SetupTensor<float>(&input, input_dims, static_cast<float>(0),
-                     static_cast<float>(1));
-  std::vector<framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
-
-  framework::LoDTensor output1;
-  std::vector<framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  TestInference<platform::CPUPlace, false, true>(
-      config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined);
-
-  auto predictor = CreatePaddlePredictor(config);
-  std::vector<PaddleTensor> paddle_tensor_feeds;
-  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input));
-
-  std::vector<PaddleTensor> outputs;
-  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
-  ASSERT_EQ(outputs.size(), 1UL);
-  size_t len = outputs[0].data.length();
-  float* data = static_cast<float*>(outputs[0].data.data());
-  float* lod_data = output1.data<float>();
-  for (size_t j = 0; j < len / sizeof(float); ++j) {
-    EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
-  }
-}
-
-void MainThreadsWord2Vec(bool use_gpu) {
-  NativeConfig config = GetConfig();
-  config.use_gpu = use_gpu;
-  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
-
-  // prepare inputs data and reference results
-  constexpr int num_jobs = 3;
-  std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
-  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
-  std::vector<framework::LoDTensor> refs(num_jobs);
-  for (size_t i = 0; i < jobs.size(); ++i) {
-    // each job has 4 words
-    jobs[i].resize(4);
-    for (size_t j = 0; j < 4; ++j) {
-      framework::LoD lod{{0, 1}};
-      int64_t dict_size = 2073;  // The size of dictionary
-      SetupLoDTensor(&jobs[i][j], lod, static_cast<int64_t>(0), dict_size - 1);
-      paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j]));
-    }
-
-    // get reference result of each job
-    std::vector<paddle::framework::LoDTensor*> ref_feeds;
-    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
-    for (auto& word : jobs[i]) {
-      ref_feeds.push_back(&word);
-    }
-    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
-  }
-
-  // create threads and each thread run 1 job
-  std::vector<std::thread> threads;
-  for (int tid = 0; tid < num_jobs; ++tid) {
-    threads.emplace_back([&, tid]() {
-      auto predictor = CreatePaddlePredictor(config);
-      auto& local_inputs = paddle_tensor_feeds[tid];
-      std::vector<PaddleTensor> local_outputs;
-      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
-
-      // check outputs range
-      ASSERT_EQ(local_outputs.size(), 1UL);
-      const size_t len = local_outputs[0].data.length();
-      float* data = static_cast<float*>(local_outputs[0].data.data());
-      for (size_t j = 0; j < len / sizeof(float); ++j) {
-        ASSERT_LT(data[j], 1.0);
-        ASSERT_GT(data[j], -1.0);
-      }
-
-      // check outputs correctness
-      float* ref_data = refs[tid].data<float>();
-      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
-      for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], 2e-3);
-      }
-    });
-  }
-  for (int i = 0; i < num_jobs; ++i) {
-    threads[i].join();
-  }
-}
-
-void MainThreadsImageClassification(bool use_gpu) {
-  constexpr int num_jobs = 4;  // each job run 1 batch
-  constexpr int batch_size = 1;
-  NativeConfig config = GetConfig();
-  config.use_gpu = use_gpu;
-  config.model_dir =
-      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
-
-  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
-  std::vector<framework::LoDTensor> jobs(num_jobs);
-  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
-  std::vector<framework::LoDTensor> refs(num_jobs);
-  for (size_t i = 0; i < jobs.size(); ++i) {
-    // prepare inputs
-    std::vector<std::vector<int64_t>> feed_target_shapes =
-        GetFeedTargetShapes(config.model_dir, /*is_combined*/ false);
-    feed_target_shapes[0][0] = batch_size;
-    framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
-    SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f);
-    paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i]));
-
-    // get reference result of each job
-    std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
-    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
-    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
-  }
-
-  // create threads and each thread run 1 job
-  std::vector<std::thread> threads;
-  for (int tid = 0; tid < num_jobs; ++tid) {
-    threads.emplace_back([&, tid]() {
-      auto predictor = CreatePaddlePredictor(config);
-      auto& local_inputs = paddle_tensor_feeds[tid];
-      std::vector<PaddleTensor> local_outputs;
-      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
-
-      // check outputs correctness
-      ASSERT_EQ(local_outputs.size(), 1UL);
-      const size_t len = local_outputs[0].data.length();
-      float* data = static_cast<float*>(local_outputs[0].data.data());
-      float* ref_data = refs[tid].data<float>();
-      EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
-      for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
-      }
-    });
-  }
-  for (int i = 0; i < num_jobs; ++i) {
-    threads[i].join();
-  }
-}
-
-TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
-TEST(inference_api_native, word2vec_cpu_threads) {
-  MainThreadsWord2Vec(false /*use_gpu*/);
-}
-TEST(inference_api_native, image_classification_cpu) {
-  MainImageClassification(false /*use_gpu*/);
-}
-TEST(inference_api_native, image_classification_cpu_threads) {
-  MainThreadsImageClassification(false /*use_gpu*/);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
-// Turn off temporarily for the unstable result.
-// TEST(inference_api_native, word2vec_gpu_threads) {
-//   MainThreadsWord2Vec(true /*use_gpu*/);
-// }
-TEST(inference_api_native, image_classification_gpu) {
-  MainImageClassification(true /*use_gpu*/);
-}
-// Turn off temporarily for the unstable result.
-// TEST(inference_api_native, image_classification_gpu_threads) {
-//   MainThreadsImageClassification(true /*use_gpu*/);
-// }
-#endif
-
-TEST(PassBuilder, Delete) {
-  AnalysisConfig config;
-  config.DisableGpu();
-  config.pass_builder()->DeletePass("attention_lstm_fuse_pass");
-  const auto& passes = config.pass_builder()->AllPasses();
-  auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass");
-  ASSERT_EQ(it, passes.end());
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc
deleted file mode 100644
index 2c450ef7cead4d5c3870d5e9186eb221e5dc19a0..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api_tester.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
-namespace paddle {
-
-/*
- * Do not use this, just a demo indicating how to customize a config for a
- * specific predictor.
- */
-struct DemoConfig : public PaddlePredictor::Config {
-  float other_config;
-};
-
-/*
- * Do not use this, just a demo indicating how to customize a Predictor.
- */
-class DemoPredictor : public PaddlePredictor {
- public:
-  explicit DemoPredictor(const DemoConfig &config) {
-    LOG(INFO) << "I get other_config " << config.other_config;
-  }
-  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data,
-           int batch_size = 0) override {
-    LOG(INFO) << "Run";
-    return false;
-  }
-
-  std::unique_ptr<PaddlePredictor> Clone() override { return nullptr; }
-
-  ~DemoPredictor() override {}
-};
-
-template <>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<DemoConfig>(
-    const DemoConfig &config) {
-  std::unique_ptr<PaddlePredictor> x(new DemoPredictor(config));
-  return x;
-}
-
-TEST(paddle_inference_api, demo) {
-  DemoConfig config;
-  config.other_config = 1.7;
-  auto predictor = CreatePaddlePredictor(config);
-  std::vector<PaddleTensor> outputs;
-  predictor->Run({}, &outputs);
-}
-
-TEST(paddle_inference_api, get_version) {
-  LOG(INFO) << "paddle version:\n" << get_version();
-  auto version = get_version();
-  ASSERT_FALSE(version.empty());
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/demo_ci/.gitignore b/paddle/fluid/inference/api/demo_ci/.gitignore
deleted file mode 100644
index 1269488f7fb1f4b56a8c0e5eb48cecbfadfa9219..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
deleted file mode 100644
index 113302b7e2adf4c79b20b2a2fe8e12f06dd3488f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ /dev/null
@@ -1,176 +0,0 @@
-cmake_minimum_required(VERSION 3.0)
-project(cpp_inference_demo CXX C)
-option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL."       ON)
-option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
-option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   ON)
-option(USE_TENSORRT "Compile demo with TensorRT."   OFF)
-if(NOT WITH_STATIC_LIB)
-  add_definitions("-DPADDLE_WITH_SHARED_LIB")
-endif()
-
-macro(safe_set_static_flag)
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-endmacro()
-
-if(NOT DEFINED PADDLE_LIB)
-  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
-endif()
-if(NOT DEFINED DEMO_NAME)
-  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
-endif()
-
-include_directories("${PADDLE_LIB}/")
-set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/")
-include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include")
-include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include")
-include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include")
-include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include")
-include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}zlib/include")
-include_directories("${PADDLE_LIB}/third_party/boost")
-include_directories("${PADDLE_LIB}/third_party/eigen3")
-
-link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}zlib/lib")
-link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib")
-link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib")
-link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
-link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
-link_directories("${PADDLE_LIB}/paddle/lib")
-
-if (WIN32)
-  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-  if (MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
-    if (WITH_STATIC_LIB)
-      safe_set_static_flag()
-      add_definitions(-DSTATIC_LIB)
-    endif()
-  endif()
-else()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-endif()
-message("flags" ${CMAKE_CXX_FLAGS})
-
-if(WITH_GPU)
-  if(NOT WIN32)
-    set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
-  else()
-    if(CUDA_LIB STREQUAL "")
-      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
-    endif()
-  endif(NOT WIN32)
-endif()
-
-if (NOT WIN32)
-  if (USE_TENSORRT AND WITH_GPU)
-      include_directories("${TENSORRT_INCLUDE_DIR}")
-      link_directories("${TENSORRT_LIB_DIR}")
-  endif()
-endif(NOT WIN32)
-
-if (NOT WIN32)
-    set(NGRAPH_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}ngraph")
-    if(EXISTS ${NGRAPH_PATH})
-        include(GNUInstallDirs)
-        include_directories("${NGRAPH_PATH}/include")
-        link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
-        set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
-    endif()
-endif()
-
-if(WITH_MKL)
-  set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml")
-  include_directories("${MATH_LIB_PATH}/include")
-  if(WIN32)
-    set(MATH_DLL ${MATH_LIB_PATH}/lib/mklml${CMAKE_SHARED_LIBRARY_SUFFIX}
-            ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX}
-            ${MATH_LIB_PATH}/lib/msvcr120${CMAKE_SHARED_LIBRARY_SUFFIX}
-            )
-  else()
-    set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
-                 ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
-  endif()
-  set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn")
-  if(EXISTS ${MKLDNN_PATH})
-    include_directories("${MKLDNN_PATH}/include")
-    if(WIN32)
-      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
-    else(WIN32)
-      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
-    endif(WIN32)
-  endif()
-else()
-  set(MATH_LIB ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
-  if(WIN32)
-    set(MATH_DLL ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
-  endif()
-endif()
-
-# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
-if(WITH_STATIC_LIB)
-  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
-else()
-  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
-endif()
-
-if (NOT WIN32)
-  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
-  set(DEPS ${DEPS}
-      ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
-      glog gflags protobuf z xxhash
-      ${EXTERNAL_LIB})
-else()
-  set(DEPS ${DEPS}
-      ${MATH_LIB} ${MKLDNN_LIB}
-      glog gflags_static libprotobuf zlibstatic xxhash ${EXTERNAL_LIB})
-  set(DEPS ${DEPS} libcmt shlwapi.lib)
-endif(NOT WIN32)
-
-if(WITH_GPU)
-  if(NOT WIN32)
-    if (USE_TENSORRT)
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
-    endif()
-    set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
-  else()
-    if (USE_TENSORRT)
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
-    endif()
-    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
-    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
-    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
-  endif()
-endif()
-
-add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
-target_link_libraries(${DEMO_NAME} ${DEPS})
-if(WIN32)
-  if(USE_TENSORRT)
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
-              ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
-            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
-              ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
-            )
-  endif()
-  if(WITH_MKL)
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-          COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
-          COMMAND ${CMAKE_COMMAND} -E copy ${MATH_DLL} ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
-          )
-    else()
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-            COMMAND ${CMAKE_COMMAND} -E copy ${MATH_DLL} ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
-            )
-    endif()
-endif()
diff --git a/paddle/fluid/inference/api/demo_ci/README.md b/paddle/fluid/inference/api/demo_ci/README.md
deleted file mode 100644
index 7f013da7f30acd84ec484773f4ea716a08efa0ff..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Inference Demos
-
-There are several demos:
-
-- simple_on_word2vec: 
-  - Follow the C++ codes is in `simple_on_word2vec.cc`. 
-  - It is suitable for word2vec model.
-- vis_demo: 
-  - Follow the C++ codes is in `vis_demo.cc`. 
-  - It is suitable for mobilenet, se_resnext50 and ocr three models.
-  - Input data format:
-    - Each line contains a single record
-    - Each record's format is
-    ```
-    <space splitted floats as data>\t<space splitted ints as shape>
-    ```
-
-To build and execute the demos, simply run 
-```
-./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU
-```
-- It will build and execute the demos in both static and shared library.
-- `$PADDLE_ROOT`: paddle library path
-- `$TURN_ON_MKL`: use MKL or Openblas
-- `$TEST_GPU_CPU`: test both GPU/CPU mode or only CPU mode
-- NOTE: for simple_on_word2vec, must run `ctest -R test_word2vec -R` to obtain word2vec model at first.
diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh
deleted file mode 100755
index 0d9f3d2aa237acaf3bd7adb031b1f2a73c555352..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/clean.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-set -x
-cd `dirname $0`
-rm -rf build/ data/
-set +x
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
deleted file mode 100755
index 054f9de3d7e51097c9a8597d2e337dbc71c4ef7b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/bin/bash
-set -x
-PADDLE_ROOT=$1
-TURN_ON_MKL=$2 # use MKL or Openblas
-TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
-DATA_DIR=$4 # dataset
-TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include
-TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
-inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir
-
-cd `dirname $0`
-current_dir=`pwd`
-if [ $2 == ON ]; then
-  # You can export yourself if move the install path
-  MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
-  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
-fi
-if [ $3 == ON ]; then
-  use_gpu_list='true false'
-else
-  use_gpu_list='false'
-fi
-
-USE_TENSORRT=OFF
-if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
-  USE_TENSORRT=ON
-fi
-
-PREFIX=inference-vis-demos%2F
-URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
-
-# download vis_demo data
-function download() {
-  dir_name=$1
-  mkdir -p $dir_name
-  cd $dir_name
-  if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then
-    echo "${PREFIX}{dir_name}.tar.gz has been downloaded."
-  else
-      wget -q ${URL_ROOT}$dir_name.tar.gz
-      tar xzf *.tar.gz
-  fi
-  cd ..
-}
-mkdir -p $DATA_DIR
-cd $DATA_DIR
-vis_demo_list='se_resnext50 ocr mobilenet'
-for vis_demo_name in $vis_demo_list; do
-  download $vis_demo_name
-done
-
-# compile and test the demo
-cd $current_dir
-mkdir -p build
-cd build
-
-for WITH_STATIC_LIB in ON OFF; do
-# TODO(Superjomn) reopen this
-# something wrong with the TensorArray reset.
-:<<D
-  # -----simple_on_word2vec-----
-  rm -rf *
-  cmake .. -DPADDLE_LIB=${inference_install_dir} \
-    -DWITH_MKL=$TURN_ON_MKL \
-    -DDEMO_NAME=simple_on_word2vec \
-    -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
-  make -j
-  word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
-  if [ -d $word2vec_model ]; then
-    for use_gpu in $use_gpu_list; do
-      ./simple_on_word2vec \
-        --dirname=$word2vec_model \
-        --use_gpu=$use_gpu
-      if [ $? -ne 0 ]; then
-        echo "simple_on_word2vec demo runs fail."
-        exit 1
-      fi
-    done
-  fi
-D
-  # ---------vis_demo---------
-  rm -rf *
-  cmake .. -DPADDLE_LIB=${inference_install_dir} \
-    -DWITH_MKL=$TURN_ON_MKL \
-    -DDEMO_NAME=vis_demo \
-    -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
-  make -j
-  for use_gpu in $use_gpu_list; do
-    for vis_demo_name in $vis_demo_list; do
-      ./vis_demo \
-        --modeldir=$DATA_DIR/$vis_demo_name/model \
-        --data=$DATA_DIR/$vis_demo_name/data.txt \
-        --refer=$DATA_DIR/$vis_demo_name/result.txt \
-        --use_gpu=$use_gpu
-      if [ $? -ne 0 ]; then
-        echo "vis demo $vis_demo_name runs fail."
-        exit 1
-      fi
-    done
-  done
-
-  # --------tensorrt mobilenet------
-  if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
-    rm -rf *
-    cmake .. -DPADDLE_LIB=${inference_install_dir} \
-      -DWITH_MKL=$TURN_ON_MKL \
-      -DDEMO_NAME=trt_mobilenet_demo \
-      -DWITH_GPU=$TEST_GPU_CPU \
-      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-      -DUSE_TENSORRT=$USE_TENSORRT \
-      -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \
-      -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR
-    make -j
-    ./trt_mobilenet_demo \
-      --modeldir=$DATA_DIR/mobilenet/model \
-      --data=$DATA_DIR/mobilenet/data.txt \
-      --refer=$DATA_DIR/mobilenet/result.txt 
-    if [ $? -ne 0 ]; then
-      echo "trt demo trt_mobilenet_demo runs fail."
-      exit 1
-    fi
-  fi
-done
-set +x
diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
deleted file mode 100644
index 3dd1d3c838c4b1bcdefdadff16b02dbfb4a02ee9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file contains a simple demo for how to take a model for inference.
- */
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-
-#include <algorithm>
-#include <memory>
-#include <thread>  //NOLINT
-
-#include "utils.h"  // NOLINT
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-DEFINE_bool(use_gpu, false, "Whether use gpu.");
-
-namespace paddle {
-namespace demo {
-
-void Main(bool use_gpu) {
-  //# 1. Create PaddlePredictor with a config.
-  NativeConfig config;
-  if (FLAGS_dirname.empty()) {
-    LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model";
-    exit(1);
-  }
-  config.model_dir = FLAGS_dirname;
-  config.use_gpu = use_gpu;
-  config.fraction_of_gpu_memory = 0.15;
-  config.device = 0;
-  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
-
-  for (int batch_id = 0; batch_id < 3; batch_id++) {
-    //# 2. Prepare input.
-    int64_t data[4] = {1, 2, 3, 4};
-
-    PaddleTensor tensor;
-    tensor.shape = std::vector<int>({4, 1});
-    tensor.data = PaddleBuf(data, sizeof(data));
-    tensor.dtype = PaddleDType::INT64;
-
-    // For simplicity, we set all the slots with the same data.
-    std::vector<PaddleTensor> slots(4, tensor);
-
-    //# 3. Run
-    std::vector<PaddleTensor> outputs;
-    CHECK(predictor->Run(slots, &outputs));
-
-    //# 4. Get output.
-    CHECK_EQ(outputs.size(), 1UL);
-    // Check the output buffer size and result of each tid.
-    CHECK_EQ(outputs.front().data.length(), 33168UL);
-    float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
-                       0.000932706};
-    const size_t num_elements = outputs.front().data.length() / sizeof(float);
-    // The outputs' buffers are in CPU memory.
-    for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
-         i++) {
-      CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
-                 0.001);
-    }
-  }
-}
-
-void MainThreads(int num_threads, bool use_gpu) {
-  // Multi-threads only support on CPU
-  // 0. Create PaddlePredictor with a config.
-  NativeConfig config;
-  config.model_dir = FLAGS_dirname;
-  config.use_gpu = use_gpu;
-  config.fraction_of_gpu_memory = 0.15;
-  config.device = 0;
-  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
-
-  std::vector<std::thread> threads;
-  for (int tid = 0; tid < num_threads; ++tid) {
-    threads.emplace_back([&, tid]() {
-      // 1. clone a predictor which shares the same parameters
-      auto predictor = main_predictor->Clone();
-      constexpr int num_batches = 3;
-      for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
-        // 2. Dummy Input Data
-        int64_t data[4] = {1, 2, 3, 4};
-        PaddleTensor tensor;
-        tensor.shape = std::vector<int>({4, 1});
-        tensor.data = PaddleBuf(data, sizeof(data));
-        tensor.dtype = PaddleDType::INT64;
-
-        std::vector<PaddleTensor> inputs(4, tensor);
-        std::vector<PaddleTensor> outputs;
-        // 3. Run
-        CHECK(predictor->Run(inputs, &outputs));
-
-        // 4. Get output.
-        CHECK_EQ(outputs.size(), 1UL);
-        // Check the output buffer size and result of each tid.
-        CHECK_EQ(outputs.front().data.length(), 33168UL);
-        float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
-                           0.000932706};
-        const size_t num_elements =
-            outputs.front().data.length() / sizeof(float);
-        // The outputs' buffers are in CPU memory.
-        for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
-             i++) {
-          CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i],
-                     result[i], 0.001);
-        }
-      }
-    });
-  }
-  for (int i = 0; i < num_threads; ++i) {
-    threads[i].join();
-  }
-}
-
-}  // namespace demo
-}  // namespace paddle
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  paddle::demo::Main(false /* use_gpu*/);
-  paddle::demo::MainThreads(1, false /* use_gpu*/);
-  paddle::demo::MainThreads(4, false /* use_gpu*/);
-  if (FLAGS_use_gpu) {
-    paddle::demo::Main(true /*use_gpu*/);
-    paddle::demo::MainThreads(1, true /*use_gpu*/);
-    paddle::demo::MainThreads(4, true /*use_gpu*/);
-  }
-  return 0;
-}
diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
deleted file mode 100644
index f7da55c9ae368763786c1b1fd3e86d942c5e9fe8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file contains demo of mobilenet for tensorrt.
- */
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
-#include "utils.h"  // NOLINT
-
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DEFINE_string(modeldir, "", "Directory of the inference model.");
-DEFINE_string(refer, "", "path to reference result for comparison.");
-DEFINE_string(
-    data, "",
-    "path of data; each line is a record, format is "
-    "'<space splitted floats as data>\t<space splitted ints as shape'");
-
-namespace paddle {
-namespace demo {
-
-/*
- * Use the tensorrt fluid engine to inference the demo.
- */
-void Main() {
-  std::unique_ptr<PaddlePredictor> predictor;
-  paddle::AnalysisConfig config;
-  config.EnableUseGpu(100, 0);
-  config.SetModel(FLAGS_modeldir + "/__model__",
-                  FLAGS_modeldir + "/__params__");
-  config.EnableTensorRtEngine();
-  predictor = CreatePaddlePredictor(config);
-
-  VLOG(3) << "begin to process data";
-  // Just a single batch of data.
-  std::string line;
-  std::ifstream file(FLAGS_data);
-  std::getline(file, line);
-  auto record = ProcessALine(line);
-  file.close();
-
-  // Inference.
-  PaddleTensor input;
-  input.shape = record.shape;
-  input.data =
-      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
-  input.dtype = PaddleDType::FLOAT32;
-
-  VLOG(3) << "run executor";
-  std::vector<PaddleTensor> output;
-  predictor->Run({input}, &output, 1);
-
-  VLOG(3) << "output.size " << output.size();
-  auto& tensor = output.front();
-  VLOG(3) << "output: " << SummaryTensor(tensor);
-
-  // compare with reference result
-  CheckOutput(FLAGS_refer, tensor);
-}
-
-}  // namespace demo
-}  // namespace paddle
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  paddle::demo::Main();
-  return 0;
-}
diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h
deleted file mode 100644
index 1505a898c5bba285b377203c1503b8615666b196..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-#include "paddle/include/paddle_inference_api.h"
-
-namespace paddle {
-namespace demo {
-
-struct Record {
-  std::vector<float> data;
-  std::vector<int32_t> shape;
-};
-
-static void split(const std::string& str, char sep,
-                  std::vector<std::string>* pieces) {
-  pieces->clear();
-  if (str.empty()) {
-    return;
-  }
-  size_t pos = 0;
-  size_t next = str.find(sep, pos);
-  while (next != std::string::npos) {
-    pieces->push_back(str.substr(pos, next - pos));
-    pos = next + 1;
-    next = str.find(sep, pos);
-  }
-  if (!str.substr(pos).empty()) {
-    pieces->push_back(str.substr(pos));
-  }
-}
-
-Record ProcessALine(const std::string& line) {
-  VLOG(3) << "process a line";
-  std::vector<std::string> columns;
-  split(line, '\t', &columns);
-  CHECK_EQ(columns.size(), 2UL)
-      << "data format error, should be <data>\t<shape>";
-
-  Record record;
-  std::vector<std::string> data_strs;
-  split(columns[0], ' ', &data_strs);
-  for (auto& d : data_strs) {
-    record.data.push_back(std::stof(d));
-  }
-
-  std::vector<std::string> shape_strs;
-  split(columns[1], ' ', &shape_strs);
-  for (auto& s : shape_strs) {
-    record.shape.push_back(std::stoi(s));
-  }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
-  return record;
-}
-
-void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
-  std::string line;
-  std::ifstream file(referfile);
-  std::getline(file, line);
-  auto refer = ProcessALine(line);
-  file.close();
-
-  size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-  VLOG(3) << "predictor output numel " << numel;
-  VLOG(3) << "reference output numel " << refer.data.size();
-  CHECK_EQ(numel, refer.data.size());
-  switch (output.dtype) {
-    case PaddleDType::INT64: {
-      for (size_t i = 0; i < numel; ++i) {
-        CHECK_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
-      }
-      break;
-    }
-    case PaddleDType::FLOAT32: {
-      for (size_t i = 0; i < numel; ++i) {
-        CHECK_LT(
-            fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
-            1e-5);
-      }
-      break;
-    }
-    case PaddleDType::INT32: {
-      for (size_t i = 0; i < numel; ++i) {
-        CHECK_EQ(static_cast<int32_t*>(output.data.data())[i], refer.data[i]);
-      }
-      break;
-    }
-  }
-}
-
-/*
- * Get a summary of a PaddleTensor content.
- */
-static std::string SummaryTensor(const PaddleTensor& tensor) {
-  std::stringstream ss;
-  int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype);
-
-  ss << "data[:10]\t";
-  switch (tensor.dtype) {
-    case PaddleDType::INT64: {
-      for (int i = 0; i < std::min(num_elems, 10); i++) {
-        ss << static_cast<int64_t*>(tensor.data.data())[i] << " ";
-      }
-      break;
-    }
-    case PaddleDType::FLOAT32: {
-      for (int i = 0; i < std::min(num_elems, 10); i++) {
-        ss << static_cast<float*>(tensor.data.data())[i] << " ";
-      }
-      break;
-    }
-    case PaddleDType::INT32: {
-      for (int i = 0; i < std::min(num_elems, 10); i++) {
-        ss << static_cast<int32_t*>(tensor.data.data())[i] << " ";
-      }
-      break;
-    }
-  }
-  return ss.str();
-}
-
-}  // namespace demo
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
deleted file mode 100644
index b63e8e62a11dcf7eb22eafdfc16bdd4fcb9fa5a5..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file contains demo for mobilenet, se-resnext50 and ocr.
- */
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include "utils.h"  // NOLINT
-
-#ifdef PADDLE_WITH_CUDA
-DECLARE_double(fraction_of_gpu_memory_to_use);
-#endif
-DEFINE_string(modeldir, "", "Directory of the inference model.");
-DEFINE_string(refer, "", "path to reference result for comparison.");
-DEFINE_string(
-    data, "",
-    "path of data; each line is a record, format is "
-    "'<space splitted floats as data>\t<space splitted ints as shape'");
-DEFINE_bool(use_gpu, false, "Whether use gpu.");
-#ifdef PADDLE_WITH_SHARED_LIB
-DEFINE_bool(profile, false, "Whether use profile.");
-#endif
-
-namespace paddle {
-namespace demo {
-
-/*
- * Use the native and analysis fluid engine to inference the demo.
- */
-void Main(bool use_gpu) {
-  std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
-  AnalysisConfig config;
-  if (use_gpu) {
-    config.EnableUseGpu(100, 0);
-  }
-  config.SetModel(FLAGS_modeldir + "/__model__",
-                  FLAGS_modeldir + "/__params__");
-
-  predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
-  analysis_predictor = CreatePaddlePredictor(config);
-
-  // Just a single batch of data.
-  std::string line;
-  std::ifstream file(FLAGS_data);
-  std::getline(file, line);
-  auto record = ProcessALine(line);
-  file.close();
-
-  // Inference.
-  PaddleTensor input;
-  input.shape = record.shape;
-  input.data =
-      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
-  input.dtype = PaddleDType::FLOAT32;
-
-  std::vector<PaddleTensor> output, analysis_output;
-  predictor->Run({input}, &output, 1);
-
-  auto& tensor = output.front();
-
-  // compare with reference result
-  CheckOutput(FLAGS_refer, tensor);
-
-  // the analysis_output has some diff with native_output,
-  // TODO(luotao): add CheckOutput for analysis_output later.
-  analysis_predictor->Run({input}, &analysis_output, 1);
-}
-
-}  // namespace demo
-}  // namespace paddle
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  if (FLAGS_use_gpu) {
-    paddle::demo::Main(true /*use_gpu*/);
-  } else {
-    paddle::demo::Main(false /*use_gpu*/);
-  }
-  return 0;
-}
diff --git a/paddle/fluid/inference/api/demo_ci/windows_inference.md b/paddle/fluid/inference/api/demo_ci/windows_inference.md
deleted file mode 100644
index 44b2586ad6d33ce7cbd2bb3080acc96b5e27f660..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/demo_ci/windows_inference.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# windows inference
-本文介绍windows inference，目前只提供了静态编译，编译出paddle_fluid.lib，包含了除openblas.dll之外的所有第三方依赖库。
-
-1. 下载最新的paddle_fluid.lib和openblas.dll，并把它们放在同一个目录下。
-
-2. 准备预训练好的模型文件，例如models中的模型，可以将模型用safe_inference_model接口保存下来。将模型文件放到该目录下
-
-3. 进入Paddle/paddle/fluid/inference/api/demo_ci目录，新建build目录，然后使用cmake生成vs2015的solution文件。
-其中PADDLE_LIB是前面的paddle_fluid.lib对应文件夹, CUDA_LIB指定为x64格式下的cuda系统库目录文件夹。
-```shell
- cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_fluid.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64
-```
-然后用vs2015打开对应的项目文件，注意使用静态链接 "/MT"，生成对应的exe。将openblas.dll放到exe所在目录。
-
-4. 该exe即为项目生成文件，可绑定运行。
-
-## FAQ
-1. cmake需要您手动下载，并添加到系统路径里
-2. 路径中的不要包含空格，例如发现CUDA_LIB路径是Program Files(x86)可能会出错。可以将CUDA拷贝到一个新位置。
diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt
deleted file mode 100644
index 80b53b32a8607b4e67f42ba30bd1a283c93ebed1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/details/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope)
-cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce)
-cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc)
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc
deleted file mode 100644
index 03c2aa3fb8094ce2996f513b90589de0ef903ae8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
-
-namespace paddle {
-namespace details {
-
-// Should be called after the parameters are loaded.
-void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
-  if (flag_) {
-    for (auto &var_name : scope->LocalVarNames()) {
-      auto *var = scope->FindVar(var_name);
-      // TODO(Superjomn) should avoid the case when a TensorArray is a
-      // parameter.
-      if (var_name == "feed" || var_name == "fetch") continue;
-      if (var->IsType<framework::LoDTensorArray>()) {
-        VLOG(4) << "collect " << var_name;
-        arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
-      }
-    }
-    for (auto *kid : scope->kids()) {
-      CollectTensorArrays(kid);
-    }
-
-    VLOG(3) << "Collect " << arrays_.size() << " arrays";
-    flag_ = false;
-  }
-}
-
-// Should be called when `Run` finished.
-void TensorArrayBatchCleaner::ResetTensorArray() {
-  for (auto *arr : arrays_) {
-    arr->clear();
-  }
-}
-
-void TensorArrayBatchCleaner::CollectNoTensorVars(framework::Scope *scope) {
-  if (no_tensor_flag_) {
-    for (auto &var_name : scope->LocalVarNames()) {
-      auto *var = scope->FindVar(var_name);
-      if (!var->IsInitialized()) continue;
-      if (!valid_types_.count(var->Type())) {
-        no_tensor_vars_.insert(var);
-      }
-    }
-
-    for (auto *kid : scope->kids()) {
-      CollectTensorArrays(kid);
-    }
-    no_tensor_flag_ = false;  // Only collect one time.
-  }
-}
-
-void TensorArrayBatchCleaner::ResetNoTensorVars() {
-  for (auto *var : no_tensor_vars_) {
-    var->Clear();
-  }
-}
-
-}  // namespace details
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
deleted file mode 100644
index 213c6891d0e2320689c8c69266d40611f295edc8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace details {
-
-// Clean the TensorArray each batch to make the behavior the same with the
-// training phase.
-struct TensorArrayBatchCleaner {
-  TensorArrayBatchCleaner() {
-    constexpr auto kTensorId = framework::VarTypeTrait<framework::Tensor>::kId;
-    constexpr auto kLoDTensorId =
-        framework::VarTypeTrait<framework::LoDTensor>::kId;
-    valid_types_.insert(kTensorId);
-    valid_types_.insert(kLoDTensorId);
-  }
-  // Collect the variables that are not Tensor or LoDTensor, and reset them to a
-  // bool(trick), because some of them are containers, and some operators just
-  // keep inserting new items without clearing the containers first; So the
-  // memory grow larger and larger in inference service deployed online.
-  void CollectNoTensorVars(framework::Scope *scope);
-  void ResetNoTensorVars();
-
-  // Fix the tensor array not clear in the inference scenarios.
-  void CollectTensorArrays(framework::Scope *scope);
-  void ResetTensorArray();
-
- private:
-  bool flag_{true};
-  bool no_tensor_flag_{true};
-  std::vector<framework::LoDTensorArray *> arrays_;
-
-  std::unordered_set<int> valid_types_;
-  std::unordered_set<framework::Variable *> no_tensor_vars_;
-};
-
-}  // namespace details
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
deleted file mode 100644
index 59ad2c09c0f94d9657c91879956810ccfacbcb35..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ /dev/null
@@ -1,205 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-
-void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
-  PADDLE_ENFORCE(!name_.empty(),
-                 "Need to SetName first, so that the corresponding tensor can "
-                 "be retrieved.");
-  PADDLE_ENFORCE(input_or_output_,
-                 "Can't reshape the output tensor, it is readonly");
-  PADDLE_ENFORCE(scope_);
-  auto *scope = static_cast<framework::Scope *>(scope_);
-  auto *var = scope->FindVar(name_);
-  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
-  auto *tensor = var->GetMutable<framework::LoDTensor>();
-  tensor->Resize(framework::make_ddim(shape));
-}
-
-#define EAGER_GET_TENSOR    \
-  if (!tensor_) {           \
-    tensor_ = FindTensor(); \
-  }                         \
-  auto *tensor = static_cast<framework::LoDTensor *>(tensor_);
-
-template <typename T>
-T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
-  EAGER_GET_TENSOR;
-  PADDLE_ENFORCE_GT(
-      tensor->numel(), 0,
-      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
-      "function before retrieving mutable_data from input tensor.");
-  switch (static_cast<int>(place)) {
-    case static_cast<int>(PaddlePlace::kCPU): {
-      return tensor->mutable_data<T>(platform::CPUPlace());
-    }
-    case static_cast<int>(PaddlePlace::kGPU): {
-      return tensor->mutable_data<T>(platform::CUDAPlace());
-    }
-    default:
-      PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
-      break;
-  }
-  return nullptr;
-}
-
-template <typename T>
-T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
-  EAGER_GET_TENSOR;
-  auto *res = tensor->data<T>();
-
-  if (platform::is_cpu_place(tensor->place())) {
-    *place = PaddlePlace::kCPU;
-  } else if (platform::is_gpu_place(tensor->place())) {
-    *place = PaddlePlace::kGPU;
-  } else {
-    *place = PaddlePlace::kUNK;
-  }
-
-  *size = tensor->numel();
-  return res;
-}
-
-PaddleDType ZeroCopyTensor::type() const {
-  EAGER_GET_TENSOR;
-  auto type = tensor->type();
-  if (type == framework::proto::VarType::FP32) {
-    return PaddleDType::FLOAT32;
-  } else if (type == framework::proto::VarType::INT64) {
-    return PaddleDType::INT64;
-  } else if (type == framework::proto::VarType::INT32) {
-    return PaddleDType::INT32;
-  } else if (type == framework::proto::VarType::UINT8) {
-    return PaddleDType::UINT8;
-  }
-  return PaddleDType::FLOAT32;
-}
-
-template <typename T>
-void ZeroCopyTensor::copy_from_cpu(const T *data) {
-  EAGER_GET_TENSOR;
-  PADDLE_ENFORCE_GE(
-      tensor->numel(), 0,
-      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
-      "function before copying data from cpu.");
-  size_t ele_size = tensor->numel() * sizeof(T);
-
-  if (place_ == PaddlePlace::kCPU) {
-    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
-    std::memcpy(static_cast<void *>(t_data), data, ele_size);
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    platform::CUDAPlace gpu_place(device_);
-    auto *t_data = tensor->mutable_data<T>(gpu_place);
-    auto *dev_ctx =
-        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
-
-    memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
-                 data, ele_size, dev_ctx->stream());
-#else
-    PADDLE_THROW("Not compiled with CUDA, should not reach here.");
-#endif
-  }
-}
-
-template <typename T>
-void ZeroCopyTensor::copy_to_cpu(T *data) {
-  EAGER_GET_TENSOR;
-  auto ele_num = tensor->numel();
-  auto *t_data = tensor->data<T>();
-  auto t_place = tensor->place();
-
-  if (platform::is_cpu_place(t_place)) {
-    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
-    auto *dev_ctx =
-        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
-    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
-                 t_data, ele_num * sizeof(T), dev_ctx->stream());
-    cudaDeviceSynchronize();
-#else
-    PADDLE_THROW("Not compile with CUDA, should not reach here.");
-#endif
-  }
-}
-template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
-template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
-template void ZeroCopyTensor::copy_from_cpu<int32_t>(const int32_t *data);
-template void ZeroCopyTensor::copy_from_cpu<uint8_t>(const uint8_t *data);
-template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
-template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
-template void ZeroCopyTensor::copy_to_cpu<int32_t>(int32_t *data);
-template void ZeroCopyTensor::copy_to_cpu<uint8_t>(uint8_t *data);
-
-template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
-                                            int *size) const;
-template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
-                                                int *size) const;
-template int32_t *ZeroCopyTensor::data<int32_t>(PaddlePlace *place,
-                                                int *size) const;
-template uint8_t *ZeroCopyTensor::data<uint8_t>(PaddlePlace *place,
-                                                int *size) const;
-template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
-template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
-template int32_t *ZeroCopyTensor::mutable_data<int32_t>(PaddlePlace place);
-template uint8_t *ZeroCopyTensor::mutable_data<uint8_t>(PaddlePlace place);
-
-void *ZeroCopyTensor::FindTensor() const {
-  PADDLE_ENFORCE(!name_.empty(),
-                 "Need to SetName first, so that the corresponding tensor can "
-                 "be retrieved.");
-  PADDLE_ENFORCE(scope_);
-  auto *scope = static_cast<framework::Scope *>(scope_);
-  auto *var = scope->FindVar(name_);
-  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
-  auto *tensor = var->GetMutable<framework::LoDTensor>();
-  return tensor;
-}
-
-std::vector<int> ZeroCopyTensor::shape() const {
-  EAGER_GET_TENSOR;
-  PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
-  return framework::vectorize<int>(tensor->dims());
-}
-
-void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
-  EAGER_GET_TENSOR;
-  framework::LoD lod;
-  for (auto &level : x) {
-    lod.emplace_back(level);
-  }
-  tensor->set_lod(lod);
-}
-
-std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
-  EAGER_GET_TENSOR;
-  std::vector<std::vector<size_t>> res;
-  for (auto &level : tensor->lod()) {
-    res.emplace_back(level);
-  }
-  return res;
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
deleted file mode 100644
index cbbb3ea2d1395acdf4c460bea4b7868c31a20e53..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
-namespace paddle {
-
-void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {}
-
-template <typename T>
-T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
-  return nullptr;
-}
-
-template <typename T>
-T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
-  return nullptr;
-}
-
-template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
-                                            int *size) const;
-template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
-                                                int *size) const;
-template float *ZeroCopyTensor::mutable_data(PaddlePlace place);
-template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
-
-void *ZeroCopyTensor::FindTensor() const { return nullptr; }
-
-std::vector<int> ZeroCopyTensor::shape() const { return {}; }
-
-void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
-
-std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
-  return std::vector<std::vector<size_t>>();
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc
deleted file mode 100644
index 9cc491e10d691a206dd903b78c0ea570741da44c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/helper.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/api/helper.h"
-
-namespace paddle {
-namespace inference {
-
-template <>
-std::string to_string<std::vector<float>>(
-    const std::vector<std::vector<float>> &vec) {
-  std::stringstream ss;
-  for (const auto &piece : vec) {
-    ss << to_string(piece) << "\n";
-  }
-  return ss.str();
-}
-
-template <>
-std::string to_string<std::vector<std::vector<float>>>(
-    const std::vector<std::vector<std::vector<float>>> &vec) {
-  std::stringstream ss;
-  for (const auto &line : vec) {
-    for (const auto &rcd : line) {
-      ss << to_string(rcd) << ";\t";
-    }
-    ss << '\n';
-  }
-  return ss.str();
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
deleted file mode 100644
index 907d35b298c5bff872afe5cbfe12201b087c6d97..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/helper.h
+++ /dev/null
@@ -1,341 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <glog/logging.h>
-#include <fstream>
-#if !defined(_WIN32)
-#include <sys/time.h>
-#endif
-#include <algorithm>
-#include <chrono>  // NOLINT
-#include <functional>
-#include <iterator>
-#include <numeric>
-#include <sstream>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/string/printf.h"
-
-extern std::string paddle::framework::DataTypeToString(
-    const framework::proto::VarType::Type type);
-
-namespace paddle {
-namespace inference {
-
-using paddle::framework::DataTypeToString;
-
-// Timer for timer
-class Timer {
- public:
-  std::chrono::high_resolution_clock::time_point start;
-  std::chrono::high_resolution_clock::time_point startu;
-
-  void tic() { start = std::chrono::high_resolution_clock::now(); }
-  double toc() {
-    startu = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> time_span =
-        std::chrono::duration_cast<std::chrono::duration<double>>(startu -
-                                                                  start);
-    double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
-    return used_time_ms;
-  }
-};
-
-static int GetUniqueId() {
-  static int id = 0;
-  return id++;
-}
-
-static void split(const std::string &str, char sep,
-                  std::vector<std::string> *pieces, bool ignore_null = true) {
-  pieces->clear();
-  if (str.empty()) {
-    if (!ignore_null) {
-      pieces->push_back(str);
-    }
-    return;
-  }
-  size_t pos = 0;
-  size_t next = str.find(sep, pos);
-  while (next != std::string::npos) {
-    pieces->push_back(str.substr(pos, next - pos));
-    pos = next + 1;
-    next = str.find(sep, pos);
-  }
-  if (!str.substr(pos).empty()) {
-    pieces->push_back(str.substr(pos));
-  }
-}
-
-template <typename T>
-static T convert(const std::string &item,
-                 std::function<T(const std::string &item)> func) {
-  T res;
-  try {
-    res = func(item);
-  } catch (std::invalid_argument &e) {
-    std::string message =
-        "invalid_argument exception when try to convert : " + item;
-    LOG(ERROR) << message;
-    PADDLE_THROW(message);
-  } catch (std::out_of_range &e) {
-    std::string message =
-        "out_of_range exception when try to convert : " + item;
-    LOG(ERROR) << message;
-    PADDLE_THROW(message);
-  } catch (...) {
-    std::string message = "unexpected exception when try to convert " + item;
-    LOG(ERROR) << message;
-    PADDLE_THROW(message);
-  }
-  return res;
-}
-
-static void split_to_float(const std::string &str, char sep,
-                           std::vector<float> *fs) {
-  std::vector<std::string> pieces;
-  split(str, sep, &pieces);
-  std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs),
-                 [](const std::string &v) {
-                   return convert<float>(v, [](const std::string &item) {
-                     return std::stof(item);
-                   });
-                 });
-}
-static void split_to_int64(const std::string &str, char sep,
-                           std::vector<int64_t> *is) {
-  std::vector<std::string> pieces;
-  split(str, sep, &pieces);
-  std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is),
-                 [](const std::string &v) {
-                   return convert<int64_t>(v, [](const std::string &item) {
-                     return std::stoll(item);
-                   });
-                 });
-}
-static void split_to_int(const std::string &str, char sep,
-                         std::vector<int> *is) {
-  std::vector<std::string> pieces;
-  split(str, sep, &pieces);
-  std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is),
-                 [](const std::string &v) {
-                   return convert<int>(v, [](const std::string &item) {
-                     return std::stoi(item);
-                   });
-                 });
-}
-template <typename T>
-std::string to_string(const std::vector<T> &vec) {
-  std::stringstream ss;
-  for (const auto &c : vec) {
-    ss << c << " ";
-  }
-  return ss.str();
-}
-template <>
-std::string to_string<std::vector<float>>(
-    const std::vector<std::vector<float>> &vec);
-
-template <>
-std::string to_string<std::vector<std::vector<float>>>(
-    const std::vector<std::vector<std::vector<float>>> &vec);
-
-template <typename T>
-int VecReduceToInt(const std::vector<T> &v) {
-  return std::accumulate(v.begin(), v.end(), 1, [](T a, T b) { return a * b; });
-}
-
-template <typename T>
-static void TensorAssignData(PaddleTensor *tensor,
-                             const std::vector<std::vector<T>> &data) {
-  // Assign buffer
-  int num_elems = VecReduceToInt(tensor->shape);
-  tensor->data.Resize(sizeof(T) * num_elems);
-  int c = 0;
-  for (const auto &f : data) {
-    for (T v : f) {
-      static_cast<T *>(tensor->data.data())[c++] = v;
-    }
-  }
-}
-
-template <typename T>
-static void TensorAssignData(PaddleTensor *tensor,
-                             const std::vector<std::vector<T>> &data,
-                             const std::vector<size_t> &lod) {
-  int size = lod[lod.size() - 1];
-  tensor->shape.assign({size, 1});
-  tensor->lod.assign({lod});
-  TensorAssignData(tensor, data);
-}
-
-template <typename T>
-static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
-                                     const std::vector<std::vector<T>> &data) {
-  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
-  int c = 0;
-  for (const auto &f : data) {
-    for (T v : f) {
-      ptr[c++] = v;
-    }
-  }
-}
-
-template <typename T>
-static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
-                                     const PaddleBuf &data) {
-  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
-  for (size_t i = 0; i < data.length() / sizeof(T); i++) {
-    ptr[i] = *(reinterpret_cast<T *>(data.data()) + i);
-  }
-}
-
-static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
-  if (a.dtype != b.dtype) {
-    LOG(ERROR) << "dtype not match";
-    return false;
-  }
-
-  if (a.lod.size() != b.lod.size()) {
-    LOG(ERROR) << "lod not match";
-    return false;
-  }
-  for (size_t i = 0; i < a.lod.size(); i++) {
-    if (a.lod[i].size() != b.lod[i].size()) {
-      LOG(ERROR) << "lod not match";
-      return false;
-    }
-    for (size_t j = 0; j < a.lod[i].size(); j++) {
-      if (a.lod[i][j] != b.lod[i][j]) {
-        LOG(ERROR) << "lod not match";
-        return false;
-      }
-    }
-  }
-
-  if (a.shape.size() != b.shape.size()) {
-    LOG(INFO) << "shape not match";
-    return false;
-  }
-  for (size_t i = 0; i < a.shape.size(); i++) {
-    if (a.shape[i] != b.shape[i]) {
-      LOG(ERROR) << "shape not match";
-      return false;
-    }
-  }
-
-  auto *adata = static_cast<float *>(a.data.data());
-  auto *bdata = static_cast<float *>(b.data.data());
-  for (int i = 0; i < VecReduceToInt(a.shape); i++) {
-    if (adata[i] != bdata[i]) {
-      LOG(ERROR) << "data not match";
-      return false;
-    }
-  }
-  return true;
-}
-
-static std::string DescribeTensor(const PaddleTensor &tensor,
-                                  int max_num_of_data = 15) {
-  std::stringstream os;
-  os << "Tensor [" << tensor.name << "]\n";
-  os << " - type: ";
-  switch (tensor.dtype) {
-    case PaddleDType::FLOAT32:
-      os << "float32";
-      break;
-    case PaddleDType::INT64:
-      os << "int64";
-      break;
-    case PaddleDType::INT32:
-      os << "int32";
-      break;
-    default:
-      os << "unset";
-  }
-  os << '\n';
-
-  os << " - shape: " << to_string(tensor.shape) << '\n';
-  os << " - lod: ";
-  for (auto &l : tensor.lod) {
-    os << to_string(l) << "; ";
-  }
-  os << "\n";
-  os << " - memory length: " << tensor.data.length();
-  os << "\n";
-
-  os << " - data: ";
-  int dim = VecReduceToInt(tensor.shape);
-  float *pdata = static_cast<float *>(tensor.data.data());
-  for (int i = 0; i < dim; i++) {
-    os << pdata[i] << " ";
-  }
-  os << '\n';
-  return os.str();
-}
-
-static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
-  std::stringstream os;
-  os << "Tensor [" << tensor.name() << "]\n";
-
-  os << " - shape: " << to_string(tensor.shape()) << '\n';
-  os << " - lod: ";
-  for (auto &l : tensor.lod()) {
-    os << to_string(l) << "; ";
-  }
-  os << "\n";
-  PaddlePlace place;
-  int size;
-  const auto *data = tensor.data<float>(&place, &size);
-  os << " - numel: " << size;
-  os << "\n";
-  os << " - data: ";
-  for (int i = 0; i < size; i++) {
-    os << data[i] << " ";
-  }
-  return os.str();
-}
-
-static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-                      double batch_latency, int epoch = 1,
-                      const framework::proto::VarType::Type data_type =
-                          framework::proto::VarType::FP32) {
-  PADDLE_ENFORCE_GT(batch_size, 0, "Non-positive batch size.");
-  double sample_latency = batch_latency / batch_size;
-  LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
-            << " ======";
-  LOG(INFO) << "====== batch size: " << batch_size << ", iterations: " << epoch
-            << ", repetitions: " << repeat << " ======";
-  LOG(INFO) << "====== batch latency: " << batch_latency
-            << "ms, number of samples: " << batch_size * epoch
-            << ", sample latency: " << sample_latency
-            << "ms, fps: " << 1000.f / sample_latency
-            << ", data type: " << DataTypeToString(data_type) << " ======";
-}
-
-static bool IsFileExists(const std::string &path) {
-  std::ifstream file(path);
-  bool exists = file.is_open();
-  file.close();
-  return exists;
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/high_level_api.md b/paddle/fluid/inference/api/high_level_api.md
deleted file mode 100644
index 8b8b6916d7e2b1a2f9fd09e9dfd2fe5a332461f5..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/high_level_api.md
+++ /dev/null
@@ -1,60 +0,0 @@
-# Inference High-level APIs
-This document describes the high-level inference APIs, one can use them to deploy a Paddle model for an application quickly.
-
-The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed for a deployment.
-
-## PaddleTensor
-We provide the `PaddleTensor` data structure to give a general tensor interface.
-
-The definition is 
-
-```c++
-struct PaddleTensor {
-  std::string name;  // variable name.
-  std::vector<int> shape;
-  PaddleBuf data;  // blob of data.
-  PaddleDType dtype;
-};
-```
-
-The data is stored in a continuous memory `PaddleBuf,` and a `PaddleDType` specifies tensor's data type. 
-The `name` field is used to specify the name of an input variable, 
-that is important when there are multiple inputs and need to distinguish which variable to set.
-
-## engine
-The inference APIs has two different underlying engines
-
-- the native engine, which is consists of the native operators and framework,
-- the Anakin engine, which has an Anakin library embedded.
-
-The native engine takes a native Paddle model as input, and supports any model that trained by Paddle, 
-the Anakin engine is faster for some model, 
-but it can only take the Anakin model as input(user need to transform the format first manually) and currently not all Paddle models are supported.
-
-```c++
-enum class PaddleEngineKind {
-  kNative = 0,  // Use the native Fluid facility.
-  kAnakin,      // Use Anakin for inference.
-};
-```
-
-## PaddlePredictor and how to create one
-The main interface is `PaddlePredictor,` there are following methods 
-
-- `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)`
-  - take inputs and output `output_data.`
-- `Clone` to clone a predictor from an existing one, with model parameter shared.
-
-There is a factory method to help create a predictor, and the user takes the ownership of this object.
-
-```c++
-template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
-```
-
-By specifying the engine kind and config, one can get a specific implementation.
-
-## Reference
-
-- [paddle_inference_api.h](./paddle_inference_api.h)
-- [some demos](./demo_ci)
diff --git a/paddle/fluid/inference/api/high_level_api_cn.md b/paddle/fluid/inference/api/high_level_api_cn.md
deleted file mode 100644
index 442c598978c700f4c438b365b8900db5b65bc5ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/high_level_api_cn.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Paddle 预测 API
-
-为了更简单方便的预测部署，Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。
-
-预测库包含:
-
-- 头文件 `paddle_inference_api.h` 定义了所有的接口
-- 库文件`libpaddle_fluid.so` 或 `libpaddle_fluid.a`
-- 库文件 `libpaddle_inference_api.so` 或 `libpaddle_inference_api.a`
-
-下面是详细的一些 API 概念介绍
-
-## PaddleTensor
-
-PaddleTensor 定义了预测最基本的输入输出的数据格式，其定义是
-
-```c++
-struct PaddleTensor {
-  std::string name;  // variable name.
-  std::vector<int> shape;
-  PaddleBuf data;  // blob of data.
-  PaddleDType dtype;
-};
-```
-
-- `name` 用于指定输入数据对应的 模型中variable 的名字 （暂时没有用，但会在后续支持任意 target 时启用）
-- `shape` 表示一个 Tensor 的 shape
-- `data`  数据以连续内存的方式存储在`PaddleBuf` 中，`PaddleBuf` 可以接收外面的数据或者独立`malloc`内存，详细可以参考头文件中相关定义。
-- `dtype` 表示 Tensor 的数据类型
-
-## engine
-
-高层 API 底层有多种优化实现，我们称之为 engine，目前有三种 engine
-
-- 原生 engine，由 paddle 原生的 forward operator 组成，可以天然支持所有paddle 训练出的模型，
-- Anakin engine，封装了 [Anakin](https://github.com/PaddlePaddle/Anakin) ，在某些模型上性能不错，但只能接受自带模型格式，无法支持所有 paddle 模型，
-- TensorRT mixed engine，用子图的方式支持了 [TensorRT](https://developer.nvidia.com/tensorrt) ，支持所有paddle 模型，并自动切割部分计算子图到 TensorRT 上加速（WIP）
-
-其实现为
-
-```c++
-enum class PaddleEngineKind {
-  kNative = 0,       // Use the native Fluid facility.
-  kAnakin,           // Use Anakin for inference.
-  kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
-};
-```
-
-## 预测部署过程
-
-总体上分为以下步骤
-
-1. 用合适的配置创建 `PaddlePredictor`
-2. 创建输入用的 `PaddleTensor`，传入到 `PaddlePredictor` 中
-3. 获取输出的 `PaddleTensor` ，将结果取出
-
-下面完整演示一个简单的模型，部分细节代码隐去
-
-```c++
-#include "paddle_inference_api.h"
-
-// 创建一个 config，并修改相关设置
-paddle::NativeConfig config;
-config.model_dir = "xxx";
-config.use_gpu = false;
-// 创建一个原生的 PaddlePredictor
-auto predictor =
-      paddle::CreatePaddlePredictor<paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);
-// 创建输入 tensor
-int64_t data[4] = {1, 2, 3, 4};
-paddle::PaddleTensor tensor{.name = "",
-                            .shape = std::vector<int>({4, 1}),
-                            .data = paddle::PaddleBuf(data, sizeof(data)),
-                            .dtype = paddle::PaddleDType::INT64};
-// 创建输出 tensor，输出 tensor 的内存可以复用
-std::vector<paddle::PaddleTensor> outputs;
-// 执行预测
-CHECK(predictor->Run(slots, &outputs));
-// 获取 outputs ...
-```
-
-编译时，联编 `libpaddle_fluid.a/.so` 和 `libpaddle_inference_api.a/.so` 便可。 
-
-## 详细代码参考
-
-- [inference demos](./demo_ci)
-- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc)
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
deleted file mode 100644
index 94c556ce52d61258475e4e9cc497b23b073938fc..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ /dev/null
@@ -1,478 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
-#include <algorithm>
-#include <limits>
-#include <map>
-#include <numeric>
-#include <unordered_map>
-#include <utility>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/type_defs.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-
-using platform::CPUPlace;
-using framework::LoDTensor;
-using framework::ir::Graph;
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
-using string::PrettyLogH1;
-static LoDTensor CreateScaleTensor(int64_t channels_num = 1);
-
-bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
-  PrettyLogH1("--- Calculating scales for quantization");
-  using VariableNameMap = std::map<std::string, std::vector<std::string>>;
-  std::map<std::string, std::map<std::string, LoDTensor>> gathered_data;
-  for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) {
-    if (op->HasAttr("use_quantizer") &&
-        boost::get<bool>(op->GetAttr("use_quantizer"))) {
-      const VariableNameMap& connections_in = op->Inputs();
-      const VariableNameMap& connections_out = op->Outputs();
-
-      auto glambda = [&](const VariableNameMap& connections, bool is_output) {
-        for (auto const& conn : connections) {
-          for (const auto& var_name : conn.second) {
-            // skip if scale already computed
-            if (scales_.find(var_name) != scales_.end()) continue;
-
-            auto* var = predictor_.sub_scope_->FindVar(var_name);
-            PADDLE_ENFORCE(var, "%s is not in the scope", var_name);
-            PADDLE_ENFORCE(var->IsType<LoDTensor>(),
-                           "Only support lod tensor now.");
-            LoDTensor* var_tensor = var->GetMutable<LoDTensor>();
-
-            // force unsigned type if already know it
-            bool is_unsigned = false;
-            bool compute_scale = true;
-            if (is_output) {
-              if (op->Type() == "conv2d") {
-                // output of conv2d with relu must be unsigned
-                std::string fuse_activation =
-                    op->GetAttrIfExists<std::string>("fuse_activation");
-                is_unsigned =
-                    (fuse_activation == "relu" || fuse_activation == "relu6");
-              } else if (op->Type() == "relu") {
-                is_unsigned = true;
-              } else if (op->Type() == "transpose2" ||
-                         op->Type() == "reshape2" || op->Type() == "pool2d") {
-                auto input_var_name = op->Input("X")[0];
-                PADDLE_ENFORCE(scales_.find(input_var_name) != scales_.end(),
-                               "Input scales must be calculated before the "
-                               "output scales to infer if output is unsigned.");
-                if (scales_.find(input_var_name) != scales_.end()) {
-                  scales_[var_name] = scales_[input_var_name];
-                }
-                compute_scale = false;
-              } else if (op->Type() == "concat") {
-                // output of ops with unsigned input must be unsigned
-                is_unsigned = true;
-                double min_scale = std::numeric_limits<double>::max();
-                for (auto input_var_name : op->Input("X")) {
-                  PADDLE_ENFORCE(
-                      scales_.find(input_var_name) != scales_.end(),
-                      "Input scales must be calculated before the "
-                      "output scales to infer if output is unsigned.");
-                  is_unsigned = is_unsigned && scales_[input_var_name].first;
-                  min_scale = std::min(
-                      min_scale,
-                      scales_[input_var_name].second.data<double>()[0]);
-                }
-                auto scale_tensor = CreateScaleTensor();
-                scale_tensor.data<double>()[0] = min_scale;
-                scales_[var_name] = {is_unsigned, scale_tensor};
-                compute_scale = false;
-              }
-            }
-            if (compute_scale)
-              CalculateSingleScale(op->Type(), conn.first, var_name,
-                                   *var_tensor, is_unsigned);
-          }
-        }
-      };
-
-      // handle inputs first to let is_unsigned be inferred for the outputs
-      glambda(connections_in, false /* is_output */);
-      glambda(connections_out, true /* is_output */);
-    }
-  }
-
-  return true;
-}
-
-void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale(
-    const std::string& op_type_name, const std::string& conn_name,
-    const std::string& var_name, const LoDTensor& var_tensor,
-    bool is_unsigned) {
-  auto rule = qconfig_->scale_algo(op_type_name, conn_name);
-  if (rule == ScaleAlgo::NONE) return;
-
-  PADDLE_ENFORCE(
-      var_tensor.numel() > 0,
-      "MkldnnQuantizer: LoDTensor of variable %s for quantization of op "
-      "%s of connection %s should not be empty.",
-      var_name, op_type_name, conn_name);
-
-  switch (rule) {
-    case ScaleAlgo::MAX:
-      scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned);
-      break;
-    case ScaleAlgo::MAX_CH:
-      scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned);
-      break;
-    case ScaleAlgo::KL:
-      scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned);
-      break;
-    default:
-      throw std::runtime_error(
-          "MkldnnQuantizer: Unexpected ScaleAlgo specified.");
-  }
-}
-
-static LoDTensor CreateScaleTensor(int64_t channels_num) {
-  LoDTensor scale_tensor;
-  scale_tensor.Resize({channels_num});
-  scale_tensor.mutable_data<double>(CPUPlace());
-  return scale_tensor;
-}
-
-std::vector<int> AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins(
-    std::vector<int> quantized_bins, std::vector<int> reference_bins) const {
-  std::vector<int> expanded_quantized_bins(reference_bins.size(), 0);
-  int num_merged_bins = reference_bins.size() / quantized_bins.size();
-  int j_start = 0;
-  int j_end = num_merged_bins;
-  for (size_t idx = 0; idx < quantized_bins.size(); idx++) {
-    int zero_count =
-        std::count(&reference_bins[j_start], &reference_bins[j_end], 0);
-    num_merged_bins = j_end - j_start;
-    int avg_bin_ele;
-    if (zero_count == num_merged_bins) {
-      avg_bin_ele = 0;
-    } else {
-      avg_bin_ele = quantized_bins[idx] / (num_merged_bins - zero_count + 0.0);
-    }
-    for (int idx1 = j_start; idx1 < j_end; idx1++) {
-      expanded_quantized_bins[idx1] =
-          (reference_bins[idx1] == 0) ? 0 : avg_bin_ele;
-    }
-    j_start += num_merged_bins;
-    j_end += num_merged_bins;
-    if ((idx + 1) == quantized_bins.size() - 1) {
-      j_end = reference_bins.size();
-    }
-  }
-  return expanded_quantized_bins;
-}
-
-std::pair<bool, LoDTensor>
-AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor(
-    const LoDTensor& var_tensor, bool is_unsigned) const {
-  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
-                                        var_tensor.numel(), 1};
-  int precision_hist_num_bins = 2048;
-  float max_val = eigen_tensor.maxCoeff();
-  float min_val = eigen_tensor.minCoeff();
-  bool is_positive = min_val >= 0.0f;
-  if (is_unsigned)
-    PADDLE_ENFORCE(
-        is_positive,
-        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
-        min_val);
-
-  int num_quantized_bins = 255;
-
-  std::vector<int> hist;
-  float bin_width;
-  int starting_iter;
-  int ending_iter = precision_hist_num_bins - 1;
-  if (is_positive) {
-    std::tie(hist, bin_width) =
-        Histogram(var_tensor, min_val, max_val, precision_hist_num_bins);
-    starting_iter = static_cast<int>(ending_iter * 0.7);
-  } else {
-    float th = std::max(std::abs(max_val), std::abs(min_val));
-    std::tie(hist, bin_width) =
-        Histogram(var_tensor, -th, th, precision_hist_num_bins);
-    starting_iter = 0;
-    if (std::abs(max_val) > std::abs(min_val)) {
-      while (starting_iter < ending_iter) {
-        if (hist[starting_iter] == 0) {
-          ++starting_iter;
-          continue;
-        } else {
-          break;
-        }
-      }
-      starting_iter += static_cast<int>((ending_iter - starting_iter) * 0.6);
-    } else {
-      while (ending_iter > 0) {
-        if (hist[ending_iter] == 0) {
-          --ending_iter;
-          continue;
-        } else {
-          break;
-        }
-      }
-      starting_iter = static_cast<int>(0.6 * ending_iter);
-    }
-  }
-  auto P_sum = eigen_tensor.size();
-  int min_kl_divergence = 0;
-  int min_kl_index = 0;
-  bool kl_inited = false;
-  for (int i = starting_iter; i <= ending_iter; i++) {
-    std::vector<int> reference_distr_P(&hist[0], &hist[i]);
-    auto outliers_count =
-        std::accumulate(&hist[i], &hist[precision_hist_num_bins], 0);
-    if (reference_distr_P[i - 1] == 0) {
-      continue;
-    }
-    reference_distr_P[i - 1] += outliers_count;
-    auto reference_distr_bins = reference_distr_P;
-    std::vector<int> candidate_distr_Q(&hist[0], &hist[i]);
-    int num_merged_bins = i / num_quantized_bins;
-    std::vector<int> candidate_distr_Q_quantized(num_quantized_bins, 0);
-    int j_start = 0;
-    int j_end = num_merged_bins;
-    for (int idx = 0; idx < num_quantized_bins; idx++) {
-      candidate_distr_Q_quantized[idx] = std::accumulate(
-          &candidate_distr_Q[j_start], &candidate_distr_Q[j_end], 0);
-      j_start += num_merged_bins;
-      j_end += num_merged_bins;
-      if ((idx + 1) == num_quantized_bins - 1) {
-        j_end = i;
-      }
-    }
-    candidate_distr_Q =
-        ExpandQuantizedBins(candidate_distr_Q_quantized, reference_distr_bins);
-    int Q_sum =
-        std::accumulate(candidate_distr_Q.begin(), candidate_distr_Q.end(), 0);
-    auto kl_divergence =
-        SafeEntropy(reference_distr_P, P_sum, candidate_distr_Q, Q_sum);
-    if (!kl_inited) {
-      min_kl_divergence = kl_divergence;
-      min_kl_index = i;
-      kl_inited = true;
-    } else if (kl_divergence < min_kl_divergence) {
-      min_kl_divergence = kl_divergence;
-      min_kl_index = i;
-    } else {
-    }
-  }
-  if (min_kl_index == 0) {
-    while (starting_iter > 0) {
-      if (hist[starting_iter] == 0) {
-        starting_iter -= 1;
-        continue;
-      } else {
-        break;
-      }
-    }
-    min_kl_index = starting_iter;
-  }
-
-  LoDTensor scale_tensor = CreateScaleTensor();
-  scale_tensor.data<double>()[0] = 1.0 / ((min_kl_index + 0.5) * bin_width);
-
-  return std::make_pair(is_unsigned, scale_tensor);
-}
-
-std::pair<bool, LoDTensor>
-AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
-    const LoDTensor& var_tensor, bool is_unsigned) const {
-  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
-                                        var_tensor.numel(), 1};
-  float max_abs = eigen_tensor.abs().maxCoeff();
-  float min_val = eigen_tensor.minCoeff();
-  if (is_unsigned)
-    PADDLE_ENFORCE(
-        min_val >= 0.0f,
-        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
-        min_val);
-
-  LoDTensor scale_tensor = CreateScaleTensor();
-  scale_tensor.data<double>()[0] = 1.0 / max_abs;
-
-  return std::make_pair(is_unsigned, scale_tensor);
-}
-
-std::pair<bool, LoDTensor>
-AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
-    const LoDTensor& var_tensor, bool is_unsigned) const {
-  PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty.");
-
-  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
-                                        var_tensor.numel(), 1};
-  float min_val = eigen_tensor.minCoeff();
-  if (is_unsigned)
-    PADDLE_ENFORCE(
-        min_val >= 0.0f,
-        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
-        min_val);
-
-  int channels = var_tensor.dims()[0];
-  LoDTensor scale_tensor = CreateScaleTensor(channels);
-  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
-
-  for (int i = 0; i < channels; ++i) {
-    const auto tensor = var_tensor.Slice(i, i + 1);
-
-    ConstEigenVectorArrayMap eigen_tensor{tensor.data<float>(), tensor.numel(),
-                                          1};
-    float max_abs = eigen_tensor.abs().maxCoeff();
-    scale_ptr[i] = 1.0 / max_abs;
-  }
-
-  return std::make_pair(is_unsigned, scale_tensor);
-}
-
-std::pair<std::vector<int>, float>
-AnalysisPredictor::MkldnnQuantizer::Histogram(
-    const framework::LoDTensor& var_tensor, float min_val, float max_val,
-    size_t num_bins) const {
-  PADDLE_ENFORCE_GT(num_bins, 0,
-                    "MkldnnQuantizer: To calculate Histogram, num_bins (" +
-                        std::to_string(num_bins) + ") must be positive.");
-  PADDLE_ENFORCE_GT(
-      var_tensor.numel(), 0,
-      "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty.");
-  PADDLE_ENFORCE(max_val >= min_val,
-                 "MkldnnQuantizer: To calculate Histogram, max_val (" +
-                     std::to_string(max_val) +
-                     ") must be greater or equal"
-                     "to min_val (" +
-                     std::to_string(min_val) + ").");
-  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
-                                        var_tensor.numel(), 1};
-  auto bin_width = std::abs(max_val - min_val) / num_bins;
-  std::vector<int> hist(num_bins);
-
-  for (int i = 0; i < eigen_tensor.size(); i++) {
-    int bin = std::min(
-        num_bins - 1,
-        static_cast<size_t>(floor((eigen_tensor[i] - min_val) / bin_width)));
-    ++hist[bin];
-  }
-
-  return std::make_pair(std::move(hist), std::move(bin_width));
-}
-
-void AnalysisPredictor::MkldnnQuantizer::ClearDeviceContext() const {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  platform::MKLDNNDeviceContext* dev_ctx =
-      (platform::MKLDNNDeviceContext*)pool.Get(predictor_.place_);
-  dev_ctx->ResetBlobMap();
-}
-
-void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
-  auto& arg = predictor_.argument_;
-  if (!arg.scope_valid()) arg.SetScope(new framework::Scope);
-  arg.SetMainProgramNotOwned(predictor_.inference_program_.get());
-  auto graph = std::unique_ptr<Graph>(new Graph(arg.main_program()));
-  arg.SetMainGraph(graph.release());
-  auto* scope_ptr = arg.scope_ptr();
-  PADDLE_ENFORCE(scope_ptr);
-  arg.main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
-
-  auto* builder = predictor_.config_.pass_builder();
-  builder->SetPasses({
-      "cpu_quantize_pass", "cpu_quantize_squash_pass",
-  });
-  if (predictor_.config_.ir_debug_) builder->TurnOnDebug();
-  auto passes = builder->AllPasses();
-  predictor_.argument_.SetIrAnalysisPasses(passes);
-  predictor_.argument_.SetAnalysisPasses(
-      {"ir_graph_clean_pass", "ir_analysis_pass", "memory_optimize_pass",
-       "ir_graph_to_program_pass"});
-  predictor_.argument_.SetQuantVarScales(scales_);
-}
-
-bool AnalysisPredictor::MkldnnQuantizer::Quantize() {
-  if (!RunWarmup()) return false;
-  if (!CalculateScales()) return false;
-  ClearDeviceContext();
-  predictor_.PrepareScope(predictor_.scope_);
-  predictor_.CreateExecutor();
-  if (!RunQuantizePasses()) return false;
-  predictor_.PrepareExecutor();
-  predictor_.PrepareFeedFetch();
-  return true;
-}
-
-bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const {
-  predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, true,
-                                        predictor_.sub_scope_);
-  PrepareArgument();
-  auto& arg = predictor_.argument_;
-  Analyzer().Run(&arg);
-  PADDLE_ENFORCE(arg.scope_valid());
-  VLOG(5) << "to prepare executor";
-  ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program);
-  predictor_.inference_program_.reset(
-      new framework::ProgramDesc(arg.ir_analyzed_program()));
-  LOG(INFO) << "== optimize 2 end ==";
-  predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0,
-                                        false, predictor_.sub_scope_);
-  return true;
-}
-
-bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const {
-  VLOG(3) << "Predictor: run a quantization warmup iteration";
-  auto warmup_data = qconfig_->warmup_data();
-  PADDLE_ENFORCE_NOT_NULL(warmup_data,
-                          "Warmup data cannot be NULL in the config.");
-  PrettyLogH1("--- Running warmup iteration for quantization");
-
-  // Run the inference program
-  std::vector<PaddleTensor> output_slots;
-  predictor_.Run(*warmup_data, &output_slots, qconfig_->warmup_batch_size());
-
-  return true;
-}
-
-float AnalysisPredictor::MkldnnQuantizer::SafeEntropy(
-    std::vector<int> reference_distr_P, int P_sum,
-    std::vector<int> candidate_distr_Q, int Q_sum) const {
-  PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size());
-  float tmp_sum1 = 0;
-  float tmp_sum2 = 0;
-  for (size_t idx = 0; idx < reference_distr_P.size(); idx++) {
-    int p_idx = reference_distr_P[idx];
-    int q_idx = candidate_distr_Q[idx];
-    if (p_idx == 0) {
-      tmp_sum1 += 0;
-      tmp_sum2 += 0;
-    } else {
-      PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " +
-                                     std::to_string(idx) +
-                                     " qindex = 0! p_idx = " +
-                                     std::to_string(p_idx));
-    }
-    tmp_sum1 += p_idx * (log(Q_sum * p_idx));
-    tmp_sum2 += p_idx * (log(P_sum * q_idx));
-  }
-  return (tmp_sum1 - tmp_sum2) / P_sum;
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h
deleted file mode 100644
index 6c438265f0b8e2a65c0475f0b11064042549269e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/mkldnn_quantizer.h
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/api_impl.h"
-#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/string/printf.h"
-#ifdef PADDLE_WITH_TESTING
-#include <gtest/gtest.h>
-#include <gtest/gtest_prod.h>
-#endif
-
-namespace paddle {
-
-/*
- * Map variable name to tensor of scaling factors scaling it to MAX=1.0.
- * bool denotes whether quantization of the variable should be done to unsigned
- * type.
- */
-using VarQuantScale =
-    std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
-
-class AnalysisPredictor::MkldnnQuantizer {
- public:
-  explicit MkldnnQuantizer(AnalysisPredictor& predictor,  // NOLINT
-                           const MkldnnQuantizerConfig* qconfig)
-      : predictor_(predictor), qconfig_(qconfig) {}
-
-  // Execute full quantization procedure.
-  bool Quantize();
-
-#if PADDLE_WITH_TESTING
-  friend class MkldnnQuantizerTest;
-#endif
-
- private:
-  // Run single warmup iteration
-  bool RunWarmup() const;
-  // Gather data from variables and calculate scales for them.
-  bool CalculateScales();
-  // Calculate a scale for tensor based on ScaleAlgo rules.
-  void CalculateSingleScale(const std::string& op_name,
-                            const std::string& conn_name,
-                            const std::string& var_name,
-                            const framework::LoDTensor& var_tensor,
-                            bool is_unsigned);
-  void PrepareArgument() const;
-  void ClearDeviceContext() const;
-  bool RunQuantizePasses() const;
-
-  std::vector<int> ExpandQuantizedBins(std::vector<int> quantized_bins,
-                                       std::vector<int> reference_bins) const;
-
-  // Using the KL-divergence method get the most precise scaling factor.
-  std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
-      const framework::LoDTensor& var_tensor, bool is_unsigned) const;
-
-  std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
-      const framework::LoDTensor& var_tensor, bool is_unsigned) const;
-
-  std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
-      const framework::LoDTensor& var_tensor, bool is_unsigned) const;
-
-  // Returns histogram and bin width
-  std::pair<std::vector<int>, float> Histogram(
-      const framework::LoDTensor& var_tensor, float min_val, float max_val,
-      size_t num_bins = 2048) const;
-
-  // Calculate the entropy.
-  float SafeEntropy(std::vector<int> reference_distr_P, int P_sum,
-                    std::vector<int> candidate_distr_Q, int Q_sum) const;
-
- private:
-  AnalysisPredictor& predictor_;
-  const MkldnnQuantizerConfig* qconfig_;
-
-  // A map: variable name -> scale
-  VarQuantScale scales_;
-};
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
deleted file mode 100644
index c2b2ba0b60a2346729d63c8473109fe6b4293874..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h"
-
-namespace paddle {
-
-MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
-  // The default configuration of scale computing algorightms
-  rules_["conv2d"]["Input"] = ScaleAlgo::KL;
-  rules_["conv2d"]["Filter"] = ScaleAlgo::MAX_CH;
-  rules_["conv2d"]["Bias"] = ScaleAlgo::NONE;  // do not compute scale
-  rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL;
-  rules_["conv2d"]["Output"] = ScaleAlgo::KL;
-
-  rules_["pool2d"]["X"] = ScaleAlgo::KL;
-  rules_["pool2d"]["Out"] = ScaleAlgo::KL;
-
-  rules_["concat"]["X"] = ScaleAlgo::KL;
-  rules_["concat"]["Out"] = ScaleAlgo::KL;
-
-  rules_["prior_box"]["Input"] = ScaleAlgo::KL;
-  rules_["prior_box"]["Image"] = ScaleAlgo::NONE;
-  rules_["prior_box"]["Boxes"] = ScaleAlgo::NONE;
-  rules_["prior_box"]["Variances"] = ScaleAlgo::NONE;
-}
-
-ScaleAlgo MkldnnQuantizerConfig::scale_algo(
-    const std::string& op_type_name, const std::string& conn_name) const {
-  if (rules_.find(op_type_name) != rules_.end()) {
-    auto op_rule = rules_.at(op_type_name);
-    if (op_rule.find(conn_name) != op_rule.end()) return op_rule.at(conn_name);
-  }
-  return default_scale_algo_;
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_anakin_config.h b/paddle/fluid/inference/api/paddle_anakin_config.h
deleted file mode 100644
index e9af13f526a2341a6e02c6979493e995c72dfa98..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/paddle_anakin_config.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <cassert>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle_api.h"  // NOLINT
-
-namespace paddle {
-namespace contrib {
-// Configurations for Anakin engine.
-struct AnakinConfig : public PaddlePredictor::Config {
-  enum TargetType { NVGPU = 0, X86, MLU, BM };
-  int device_id{0};
-  std::string model_file;
-  std::map<std::string, std::vector<int>> init_inputs_shape;
-  int init_batch_size{-1};
-  bool re_allocable{true};
-  int max_stream{4};
-  int data_stream_id{0};
-  int compute_stream_id{0};
-  char* model_buf_p{nullptr};
-  size_t model_buf_len{0};
-  TargetType target_type;
-#ifdef ANAKIN_MLU_PLACE
-  int model_parallel{8};
-  int data_parallel{1};
-  bool op_fuse{false};
-  bool sparse{false};
-#endif
-};
-
-}  // namespace contrib
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
deleted file mode 100644
index 7764a4986955c9cebb2d66e4e06c9fefd4ce0e4c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ /dev/null
@@ -1,355 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <cassert>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-/*! \file */
-
-// Here we include some header files with relative paths, for that in deploy,
-// the abstract path of this header file will be changed.
-#include "paddle_api.h"           // NOLINT
-#include "paddle_pass_builder.h"  // NOLINT
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle_mkldnn_quantizer_config.h"  // NOLINT
-#endif
-
-namespace paddle {
-
-class AnalysisPredictor;
-struct MkldnnQuantizerConfig;
-
-// NOTE WIP, not stable yet.
-struct AnalysisConfig {
-  AnalysisConfig() = default;
-  explicit AnalysisConfig(const AnalysisConfig& other);
-  explicit AnalysisConfig(const std::string& model_dir);
-  explicit AnalysisConfig(const std::string& prog_file,
-                          const std::string& params_file);
-  enum class Precision {
-    kFloat32 = 0,
-    kInt8,
-    kHalf,
-  };
-
-  /** Set model with a directory.
-   */
-  void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
-  /** Set model with two specific pathes for program and parameters.
-   */
-  void SetModel(const std::string& prog_file_path,
-                const std::string& params_file_path);
-  /** Set program file path.
-   */
-  void SetProgFile(const std::string& x) { prog_file_ = x; }
-  /** Set parameter composed file path.
-   */
-  void SetParamsFile(const std::string& x) { params_file_ = x; }
-  /** Set opt cache dir.
-   */
-  void SetOptimCacheDir(const std::string& opt_cache_dir) {
-    opt_cache_dir_ = opt_cache_dir;
-  }
-  /** Get the model directory path.
-   */
-  const std::string& model_dir() const { return model_dir_; }
-  /** Get the program file path.
-   */
-  const std::string& prog_file() const { return prog_file_; }
-  /** Get the composed parameters file.
-   */
-  const std::string& params_file() const { return params_file_; }
-
-  // GPU related.
-
-  /**
-   * \brief Turn on GPU.
-   * @param memory_pool_init_size_mb initial size of the GPU memory pool in MB.
-   * @param device_id the GPU card to use (default is 0).
-   */
-  void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
-  /** Turn off the GPU.
-   */
-  void DisableGpu();
-  /** A bool state telling whether the GPU is turned on.
-   */
-  bool use_gpu() const { return use_gpu_; }
-  /** Get the GPU device id.
-   */
-  int gpu_device_id() const { return device_id_; }
-  /** Get the initial size in MB of the GPU memory pool.
-   */
-  int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
-  /** Get the proportion of the initial memory pool size compared to the device.
-   */
-  float fraction_of_gpu_memory_for_pool() const;
-
-  /** Turn on CUDNN
-   */
-  void EnableCUDNN();
-  /** A boolean state telling whether to use cuDNN.
-   */
-  bool cudnn_enabled() const { return use_cudnn_; }
-
-  /** \brief Control whether to perform IR graph optimization.
-   *
-   * If turned off, the AnalysisConfig will act just like a NativeConfig.
-   */
-  void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
-  /** A boolean state tell whether the ir graph optimization is actived.
-   */
-  bool ir_optim() const { return enable_ir_optim_; }
-
-  /** \brief INTERNAL Determine whether to use the feed and fetch operators.
-   * Just for internal development, not stable yet.
-   * When ZeroCopyTensor is used, this should turned off.
-   */
-  void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
-  /** A boolean state telling whether to use the feed and fetch operators.
-   */
-  bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }
-
-  /** \brief Control whether to specify the inputs' names.
-   *
-   * The PaddleTensor type has a `name` member, assign it with the corresponding
-   * variable name. This is used only when the input PaddleTensors passed to the
-   * `PaddlePredictor.Run(...)` cannot follow the order in the training phase.
-   */
-  void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
-
-  /** A boolean state tell whether the input PaddleTensor names specified should
-   * be used to reorder the inputs in `PaddlePredictor.Run(...)`.
-   */
-  bool specify_input_name() const { return specify_input_name_; }
-
-  /**
-   * \brief Turn on the TensorRT engine.
-   *
-   * The TensorRT engine will accelerate some subgraphes in the original Fluid
-   * computation graph. In some models such as TensorRT50, GoogleNet and so on,
-   * it gains significant performance acceleration.
-   *
-   * @param workspace_size the memory size(in byte) used for TensorRT workspace.
-   * @param max_batch_size the maximum batch size of this prediction task,
-   * better set as small as possible, or performance loss.
-   * @param min_subgrpah_size the minimum TensorRT subgraph size needed, if a
-   * subgraph is less than this, it will not transfer to TensorRT engine.
-   */
-  void EnableTensorRtEngine(int workspace_size = 1 << 20,
-                            int max_batch_size = 1, int min_subgraph_size = 3,
-                            Precision precision = Precision::kFloat32,
-                            bool use_static = false,
-                            bool use_calib_mode = true);
-  /** A boolean state telling whether the TensorRT engine is used.
-   */
-  bool tensorrt_engine_enabled() const { return use_tensorrt_; }
-  /**
-   *  \brief Turn on the usage of Anakin sub-graph engine.
-   */
-  void EnableAnakinEngine(
-      int max_batch_size = 1,
-      std::map<std::string, std::vector<int>> max_input_shape = {},
-      int min_subgraph_size = 6, Precision precision = Precision::kFloat32,
-      bool auto_config_layout = false,
-      std::vector<std::string> passes_filter = {},
-      std::vector<std::string> ops_filter = {});
-
-  /** A boolean state indicating whether the Anakin sub-graph engine is used.
-  */
-  bool anakin_engine_enabled() const { return use_anakin_; }
-
-  /** \brief Control whether to debug IR graph analysis phase.
-   *
-   * This will generate DOT files for visualizing the computation graph after
-   * each analysis pass applied.
-   */
-  void SwitchIrDebug(int x = true);
-
-  /** Turn on NGRAPH.
-   */
-  void EnableNgraph();
-  /** A boolean state telling whether to use the NGRAPH.
-   */
-  bool ngraph_enabled() const { return use_ngraph_; }
-
-  /** Turn on MKLDNN.
-   */
-  void EnableMKLDNN();
-  /** set the cache capacity of different input shapes for MKLDNN.
-   *  Default 0 means don't cache any shape.
-   */
-  void SetMkldnnCacheCapacity(int capacity);
-  /** A boolean state telling whether to use the MKLDNN.
-   */
-  bool mkldnn_enabled() const { return use_mkldnn_; }
-
-  /** Set and get the number of cpu math library threads.
-   */
-  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
-  /** An int state telling how many threads are used in the CPU math library.
-   */
-  int cpu_math_library_num_threads() const {
-    return cpu_math_library_num_threads_;
-  }
-
-  /** Transform the AnalysisConfig to NativeConfig.
-   */
-  NativeConfig ToNativeConfig() const;
-  /** Specify the operator type list to use MKLDNN acceleration.
-   * @param op_list the operator type list.
-   */
-  void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
-    mkldnn_enabled_op_types_ = op_list;
-  }
-
-  /** Turn on quantization.
-   */
-  void EnableMkldnnQuantizer();
-
-  /** A boolean state telling whether the quantization is enabled.
-  */
-  bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; }
-
-  MkldnnQuantizerConfig* mkldnn_quantizer_config() const;
-
-  /** Specify the memory buffer of program and parameter
-   * @param prog_buffer the memory buffer of program.
-   * @param prog_buffer_size the size of the data.
-   * @param params_buffer the memory buffer of the composed parameters file.
-   * @param params_buffer_size the size of the commposed parameters data.
-   */
-  void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
-                      const char* params_buffer, size_t params_buffer_size);
-  /** A boolean state telling whether the model is set from the CPU memory.
-   */
-  bool model_from_memory() const { return model_from_memory_; }
-
-  /** Turn on memory optimize
-   * NOTE still in development, will release latter.
-   */
-  void EnableMemoryOptim();
-  /** Tell whether the memory optimization is activated. */
-  bool enable_memory_optim() const;
-
-  /** \brief Turn on profiling report.
-   *
-   * If not turned on, no profiling report will be generateed.
-   */
-  void EnableProfile();
-  /** A boolean state telling whether the profiler is activated.
-   */
-  bool profile_enabled() const { return with_profile_; }
-
-  void SetInValid() const { is_valid_ = false; }
-  bool is_valid() const { return is_valid_; }
-
-  friend class ::paddle::AnalysisPredictor;
-
-  /** NOTE just for developer, not an official API, easily to be broken.
-   * Get a pass builder for customize the passes in IR analysis phase.
-   */
-  PassStrategy* pass_builder() const;
-  void PartiallyRelease();
-
- protected:
-  // Update the config.
-  void Update();
-
-  std::string SerializeInfoCache();
-
- protected:
-  // Model pathes.
-  std::string model_dir_;
-  mutable std::string prog_file_;
-  mutable std::string params_file_;
-
-  // GPU related.
-  bool use_gpu_{false};
-  int device_id_{0};
-  uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
-
-  bool use_cudnn_{false};
-
-  // TensorRT related.
-  bool use_tensorrt_{false};
-  // For workspace_size, refer it from here:
-  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
-  int tensorrt_workspace_size_;
-  // While TensorRT allows an engine optimized for a given max batch size
-  // to run at any smaller size, the performance for those smaller
-  // sizes may not be as well-optimized. Therefore, Max batch is best
-  // equivalent to the runtime batch size.
-  int tensorrt_max_batchsize_;
-  //  We transform the Ops that can be converted into TRT layer in the model,
-  //  and aggregate these Ops into subgraphs for TRT execution.
-  //  We set this variable to control the minimum number of nodes in the
-  //  subgraph, 3 as default value.
-  int tensorrt_min_subgraph_size_{3};
-  Precision tensorrt_precision_mode_;
-  bool trt_use_static_engine_;
-  bool trt_use_calib_mode_;
-
-  // memory reuse related.
-  bool enable_memory_optim_{false};
-
-  bool use_ngraph_{false};
-  bool use_mkldnn_{false};
-  std::unordered_set<std::string> mkldnn_enabled_op_types_;
-
-  bool model_from_memory_{false};
-
-  bool enable_ir_optim_{true};
-  bool use_feed_fetch_ops_{true};
-  bool ir_debug_{false};
-
-  bool specify_input_name_{false};
-
-  int cpu_math_library_num_threads_{1};
-
-  bool with_profile_{false};
-
-  // A runtime cache, shouldn't be transferred to others.
-  std::string serialized_info_cache_;
-
-  mutable std::unique_ptr<PassStrategy> pass_builder_;
-
-  bool use_anakin_{false};
-  int anakin_max_batchsize_;
-  int anakin_min_subgraph_size_{6};
-  std::map<std::string, std::vector<int>> anakin_max_input_shape_;
-  Precision anakin_precision_mode_;
-  bool anakin_auto_config_layout_{false};
-  std::vector<std::string> anakin_passes_filter_;
-  std::vector<std::string> anakin_ops_filter_;
-
-  // mkldnn related.
-  int mkldnn_cache_capacity_{0};
-  bool use_mkldnn_quantizer_{false};
-  std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
-
-  // If the config is already used on a predictor, it becomes invalid.
-  // Any config can only be used with one predictor.
-  // Variables held by config can take up a lot of memory in some cases.
-  // So we release the memory when the predictor is set up.
-  mutable bool is_valid_{true};
-  std::string opt_cache_dir_;
-};
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
deleted file mode 100644
index 8c0adfcb0688920163bd8a2f960fa5332ff206e1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/paddle_api.h
+++ /dev/null
@@ -1,365 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-/*! \file paddle_api.h
- */
-
-/*! \mainpage Paddle Inference APIs
- * \section intro_sec Introduction
- * The Paddle inference library aims to offer an high performance inference SDK
- * for Paddle users.
- */
-
-#include <cassert>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-/*! \namespace paddle
- */
-namespace paddle {
-
-/** paddle data type.
- */
-enum PaddleDType {
-  FLOAT32,
-  INT64,
-  INT32,
-  UINT8,
-  // TODO(Superjomn) support more data types if needed.
-};
-
-/**
- * \brief Memory manager for `PaddleTensor`.
- *
- * The PaddleBuf holds a buffer for data input or output. The memory can be
- * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
- * should be reused for better performance.
- *
- * For user allocated memory, the following API can be used:
- * - PaddleBuf(void* data, size_t length) to set an external memory by
- * specifying the memory address and length.
- * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
- *memory.
- * ATTENTION, for user allocated memory, deallocation should be done by users
- *externally after the program finished. The PaddleBuf won't do any allocation
- *or deallocation.
- *
- * To have the PaddleBuf allocate and manage the memory:
- * - PaddleBuf(size_t length) will allocate a memory of size `length`.
- * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
- *  if the allocated memory is larger than `length`, nothing will done.
- *
- * Usage:
- *
- * Let PaddleBuf manage the memory internally.
- * \code{cpp}
- * const int num_elements = 128;
- * PaddleBuf buf(num_elements * sizeof(float));
- * \endcode
- *
- * Or
- * \code{cpp}
- * PaddleBuf buf;
- * buf.Resize(num_elements * sizeof(float));
- * \endcode
- * Works the exactly the same.
- *
- * One can also make the `PaddleBuf` use the external memory.
- * \code{cpp}
- * PaddleBuf buf;
- * void* external_memory = new float[num_elements];
- * buf.Reset(external_memory, num_elements*sizeof(float));
- * ...
- * delete[] external_memory; // manage the memory lifetime outside.
- * \endcode
- */
-class PaddleBuf {
- public:
-  /** PaddleBuf allocate memory internally, and manage it.
-   */
-  explicit PaddleBuf(size_t length)
-      : data_(new char[length]), length_(length), memory_owned_(true) {}
-  /** Set external memory, the PaddleBuf won't manage it.
-   */
-  PaddleBuf(void* data, size_t length)
-      : data_(data), length_(length), memory_owned_{false} {}
-  /** Copy only available when memory is managed externally.
-   */
-  explicit PaddleBuf(const PaddleBuf&);
-
-  /** Resize the memory.
-   */
-  void Resize(size_t length);
-  /** Reset to external memory, with address and length set.
-   */
-  void Reset(void* data, size_t length);
-  /** Tell whether the buffer is empty.
-   */
-  bool empty() const { return length_ == 0; }
-  /** Get the data's memory address.
-   */
-  void* data() const { return data_; }
-  /** Get the memory length.
-   */
-  size_t length() const { return length_; }
-
-  ~PaddleBuf() { Free(); }
-  PaddleBuf& operator=(const PaddleBuf&);
-  PaddleBuf& operator=(PaddleBuf&&);
-  PaddleBuf() = default;
-  PaddleBuf(PaddleBuf&& other);
-
- private:
-  void Free();
-  void* data_{nullptr};  // pointer to the data memory.
-  size_t length_{0};     // number of memory bytes.
-  bool memory_owned_{true};
-};
-
-/** Basic input and output data structure for PaddlePredictor.
- */
-struct PaddleTensor {
-  PaddleTensor() = default;
-  std::string name;  // variable name.
-  std::vector<int> shape;
-  PaddleBuf data;  // blob of data.
-  PaddleDType dtype;
-  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
-};
-
-enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
-
-/** Tensor without copy, currently only supports `AnalysisPredictor`.
- */
-class ZeroCopyTensor {
- public:
-  void Reshape(const std::vector<int>& shape);
-
-  /** Get the memory in CPU or GPU with specific data type, should Reshape first
-   * to tell the data size.
-   * One can directly call this data to feed the data.
-   * This is for writing the input tensor.
-   */
-  template <typename T>
-  T* mutable_data(PaddlePlace place);
-  /** Get the memory directly, will return the place and element size by
-   * pointer.
-   * This is for reading the output tensor.
-   */
-  template <typename T>
-  T* data(PaddlePlace* place, int* size) const;
-
-  template <typename T>
-  void copy_from_cpu(const T* data);
-
-  template <typename T>
-  void copy_to_cpu(T* data);
-
-  std::vector<int> shape() const;
-
-  void SetLoD(const std::vector<std::vector<size_t>>& x);
-  std::vector<std::vector<size_t>> lod() const;
-  const std::string& name() const { return name_; }
-  void SetPlace(PaddlePlace place, int device = -1) {
-    place_ = place;
-    device_ = device;
-  }
-
-  PaddleDType type() const;
-
- protected:
-  explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
-  void SetName(const std::string& name) { name_ = name; }
-  void* FindTensor() const;
-
- private:
-  std::string name_;
-  bool input_or_output_;
-  friend class AnalysisPredictor;
-  void* scope_{nullptr};
-  // The corresponding tensor pointer inside Paddle workspace is cached for
-  // performance.
-  mutable void* tensor_{nullptr};
-  PaddlePlace place_;
-  PaddleDType dtype_;
-  int device_;
-};
-
-/** A simple Inference API for Paddle.
- */
-class PaddlePredictor {
- public:
-  struct Config;
-  PaddlePredictor() = default;
-  PaddlePredictor(const PaddlePredictor&) = delete;
-  PaddlePredictor& operator=(const PaddlePredictor&) = delete;
-
-  /** Predict an record.
-   * The caller should be responsible for allocating and releasing the memory of
-   * `inputs`. `inputs` should be available until Run returns. Caller should be
-   * responsible for the output tensor's buffer, either allocated or passed from
-   * outside.
-   */
-  virtual bool Run(const std::vector<PaddleTensor>& inputs,
-                   std::vector<PaddleTensor>* output_data,
-                   int batch_size = -1) = 0;
-
-  /** \brief Get input names of the model
-   */
-  virtual std::vector<std::string> GetInputNames() { return {}; }
-
-  /** \brief Get input shapes of the model
-   */
-  virtual std::map<std::string, std::vector<int64_t>> GetInputTensorShape() {
-    return {};
-  }
-
-  /** \brief Get output names of the model
-   */
-  virtual std::vector<std::string> GetOutputNames() { return {}; }
-
-  /** \brief Get a mutable tensor directly.
-   *
-   * NOTE Only works in AnalysisPredictor.
-   *
-   * One can also use this to modify any temporary variable related tensors in
-   * the predictor.
-   *
-   */
-  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
-      const std::string& name) {
-    return nullptr;
-  }
-  /**
-   * \brief Get an immutable tensor without copy.
-   *
-   * NOTE Only works in AnalysisPredictor.
-   * One can use this API to get any temporary tensors in the predictor and
-   * read it.
-   */
-  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
-      const std::string& name) {
-    return nullptr;
-  }
-  /**
-   * \brief Run the predictor with zero-copied inputs and outputs.
-   *
-   * NOTE Only works in AnalysisPredictor.
-   *
-   * This will save the IO copy for transfering inputs and outputs to predictor
-   * workspace and get some performance improvement.
-   * To use it, one should call the `AnalysisConfig.SwitchUseFeedFetchOp(true)`
-   * and then use the `GetInputTensor` and `GetOutputTensor` to directly write
-   * or read the input/output tensors.
-   */
-  virtual bool ZeroCopyRun() { return false; }
-
-  /** Clone a predictor that share the model weights, the Cloned predictor
-   * should be thread-safe.
-   */
-  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
-
-  /** Destroy the Predictor.
-   */
-  virtual ~PaddlePredictor() = default;
-
-  /** \brief Get the serialized model program that executes in inference phase.
-   * Its data type is ProgramDesc, which is a protobuf message.
-   */
-  virtual std::string GetSerializedProgram() const {
-    assert(false);  // Force raise error.
-    return "NotImplemented";
-  }
-
-  /** The common configs for all the predictors.
-   */
-  struct Config {
-    std::string model_dir; /*!< path to the model directory. */
-  };
-};
-
-struct NativeConfig : public PaddlePredictor::Config {
-  // GPU related fields.
-  bool use_gpu{false};
-  int device{0};
-  float fraction_of_gpu_memory{
-      -1.f}; /*!< Change to a float in (0,1] if needed. */
-
-  // Specify the exact path of program and parameter files.
-  std::string prog_file;
-  std::string param_file;
-
-  /** Specify the variable's name of each input if input tensors don't follow
-   * the
-   * `feeds` and `fetches` of the phase `save_inference_model`.
-   */
-  bool specify_input_name{false};
-
-  /** Set and get the number of cpu math library threads.
-   */
-  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
-    cpu_math_library_num_threads_ = cpu_math_library_num_threads;
-  }
-  int cpu_math_library_num_threads() const {
-    return cpu_math_library_num_threads_;
-  }
-
- protected:
-  // number of cpu math library (such as MKL, OpenBlas) threads for each
-  // instance.
-  int cpu_math_library_num_threads_{1};
-};
-
-/*! \fn std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&
- * config);
- *
- * \brief A factory to help create different predictors.
- *
- * Usage:
- *
- * \code{.cpp}
- * NativeConfig config;
- * ... // change the configs.
- * auto native_predictor = CreatePaddlePredictor(config);
- * \endcode
- *
- * FOR EXTENSION DEVELOPER:
- * Different predictors are designated by config type. Similar configs can be
- * merged, but there shouldn't be a huge config containing different fields for
- * more than one kind of predictors.
- */
-template <typename ConfigT>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
-
-/** NOTE The following APIs are too trivial, we will discard it in the following
- * versions.
- */
-enum class PaddleEngineKind {
-  kNative = 0,        /*!< Use the native Fluid facility. */
-  kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */
-  kAnalysis,          /*!< More optimization. */
-  kAnakin             /*!< Use Anakin for inference, not mature yet. */
-};
-
-template <typename ConfigT, PaddleEngineKind engine>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
-
-int PaddleDtypeSize(PaddleDType dtype);
-
-std::string get_version();
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
deleted file mode 100644
index feb5373c1dc6206b657c325bd34fb1450eb197c8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file contains the definition of a simple Inference API for Paddle.
- *
- * ATTENTION: It requires some C++11 features, for lower version C++ or C, we
- * might release another API.
- */
-
-#pragma once
-
-#include <cassert>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle_analysis_config.h"  // NOLINT
-#include "paddle_api.h"              // NOLINT
-#if (defined PADDLE_WITH_ANAKIN)
-#include "paddle_anakin_config.h"  // NOLINT
-#endif
diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
deleted file mode 100644
index d46f842de7a2277ee5d00672386b12af7ba28deb..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <cassert>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle_api.h"  // NOLINT
-
-namespace paddle {
-
-// Algorithms for finding scale of quantized Tensors.
-enum class ScaleAlgo {
-  NONE,    // Do not compute scale
-  MAX,     // Find scale based on the maximum absolute value
-  MAX_CH,  // Find scale based on the maximum absolute value per channel
-  KL,      // Find scale based on KL Divergence
-};
-
-struct MkldnnQuantizerConfig {
-  MkldnnQuantizerConfig();
-
-  /** Specify a quantization algorithm for a connection (input/output) of the
-   * operator type.
-   * @param op_type_name the operator's name.
-   * @param conn_name name of the connection (input/output) of the operator.
-   * @param algo the algorithm for computing scale.
-   */
-  void SetScaleAlgo(std::string op_type_name, std::string conn_name,
-                    ScaleAlgo algo) {
-    rules_[op_type_name][conn_name] = algo;
-  }
-
-  /** Get the quantization algorithm for a connection (input/output) of the
-   * operator type.
-   * @param op_type_name the operator's name.
-   * @param conn_name name of the connection (input/output) of the operator.
-   * @return the algorithm for computing scale.
-   */
-  ScaleAlgo scale_algo(const std::string& op_type_name,
-                       const std::string& conn_name) const;
-
-  /** Set the batch of data to be used for warm-up iteration.
-   * @param data batch of data.
-   */
-  void SetWarmupData(std::shared_ptr<std::vector<PaddleTensor>> data) {
-    warmup_data_ = data;
-  }
-
-  /** Get the batch of data used for warm-up iteration.
-   * @return batch of data.
-   */
-  std::shared_ptr<std::vector<PaddleTensor>> warmup_data() const {
-    return warmup_data_;
-  }
-
-  void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; }
-
-  int warmup_batch_size() const { return warmup_bs_; }
-
-  void SetEnabledOpTypes(std::unordered_set<std::string> op_list) {
-    enabled_op_types_ = op_list;
-  }
-
-  const std::unordered_set<std::string>& enabled_op_types() const {
-    return enabled_op_types_;
-  }
-
-  void SetExcludedOpIds(std::unordered_set<int> op_ids_list) {
-    excluded_op_ids_ = op_ids_list;
-  }
-
-  const std::unordered_set<int>& excluded_op_ids() const {
-    return excluded_op_ids_;
-  }
-
-  void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; }
-
-  ScaleAlgo default_scale_algo() const { return default_scale_algo_; }
-
- protected:
-  std::map<std::string, std::map<std::string, ScaleAlgo>> rules_;
-  std::unordered_set<std::string> enabled_op_types_;
-  std::unordered_set<int> excluded_op_ids_;
-  std::shared_ptr<std::vector<PaddleTensor>> warmup_data_;
-  int warmup_bs_{1};
-  ScaleAlgo default_scale_algo_{ScaleAlgo::MAX};
-};
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
deleted file mode 100644
index e81a842814a64890e68bcccacf65a7b975aa7de9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/api/paddle_pass_builder.h"
-#ifdef PADDLE_WITH_CUDA
-#include <cudnn.h>
-#endif
-#include <glog/logging.h>
-
-namespace paddle {
-
-void PaddlePassBuilder::AppendPass(const std::string &pass_type) {
-  passes_.push_back(pass_type);
-}
-
-void PaddlePassBuilder::TurnOnDebug() {
-  std::vector<std::string> passes;
-  auto it = std::begin(passes_);
-  while (it != std::end(passes_)) {
-    if (*it != "graph_viz_pass") {
-      it = passes_.insert(it + 1, "graph_viz_pass");
-    } else {
-      ++it;
-    }
-  }
-}
-
-std::string PaddlePassBuilder::DebugString() {
-  std::stringstream ss;
-  ss << "Passes to apply:\n";
-  for (auto &pass : passes_) {
-    ss << "  - " << pass << '\n';
-  }
-  return ss.str();
-}
-
-void PaddlePassBuilder::DeletePass(const std::string &pass_type) {
-  auto it = std::begin(passes_);
-  while (it != std::end(passes_)) {
-    if (*it == pass_type) {
-      it = passes_.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-
-void PaddlePassBuilder::InsertPass(size_t idx, const std::string &pass_type) {
-  passes_.insert(std::begin(passes_) + idx, pass_type);
-}
-
-void PaddlePassBuilder::DeletePass(size_t idx) {
-  passes_.erase(std::begin(passes_) + idx);
-}
-
-void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
-  analysis_passes_.push_back(pass);
-}
-
-void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
-
-const std::vector<std::string> kTRTSubgraphPasses({
-  "conv_affine_channel_fuse_pass",                 //
-      "conv_eltwiseadd_affine_channel_fuse_pass",  //
-      "shuffle_channel_detect_pass",               //
-      "quant_conv2d_dequant_fuse_pass",            //
-      "delete_quant_dequant_op_pass",              //
-      // "fc_fuse_pass",                                 //
-      "tensorrt_subgraph_pass",  //
-      "conv_bn_fuse_pass",       //
-#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
-                           // guaranteed at least v7
-      "conv_elementwise_add_act_fuse_pass",   //
-      "conv_elementwise_add2_act_fuse_pass",  //
-      "conv_elementwise_add_fuse_pass",       //
-#endif                                        //
-      "transpose_flatten_concat_fuse_pass",
-});
-
-// The following passes works for Anakin sub-graph engine.
-const std::vector<std::string> kAnakinSubgraphPasses({
-    "quant_conv2d_dequant_fuse_pass",               //
-    "simplify_anakin_priorbox_detection_out_pass",  //
-    "fillconstant_elementwisemul_fuse",             //
-    "fc_fuse_pass",                                 //
-    "conv_elementwise_add_fuse_pass",               //
-    "fc_gru_fuse_pass",                             //
-    "shuffle_channel_detect_pass",                  //
-    "anakin_subgraph_pass",                         //
-    "fc_gru_fuse_pass",                             //
-});
-
-GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
-  passes_.assign({
-    //   "identity_scale_op_clean_pass",             //
-    "is_test_pass",                                  //
-        "simplify_with_basic_ops_pass",              //
-        "fc_fuse_pass",                              //
-        "fc_elementwise_layernorm_fuse_pass",        //
-        "conv_affine_channel_fuse_pass",             //
-        "conv_eltwiseadd_affine_channel_fuse_pass",  //
-        "conv_bn_fuse_pass",                         //
-        "conv_eltwiseadd_bn_fuse_pass",              //
-#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
-                           // guaranteed at least v7
-        "conv_elementwise_add_act_fuse_pass",   //
-        "conv_elementwise_add2_act_fuse_pass",  //
-        "conv_elementwise_add_fuse_pass",       //
-#endif                                          //
-        "transpose_flatten_concat_fuse_pass",
-        // following pass should be located in the last, since it will
-        // work on all fused ops.
-        "runtime_context_cache_pass"
-  });
-
-  use_gpu_ = true;
-}
-
-void GpuPassStrategy::EnableCUDNN() {
-  if (!use_cudnn_) {
-    passes_.insert(passes_.begin(), "cudnn_placement_pass");
-  }
-  use_cudnn_ = true;
-}
-
-void GpuPassStrategy::EnableMKLDNN() {
-  LOG(ERROR) << "GPU not support MKLDNN yet";
-}
-
-void GpuPassStrategy::EnableMkldnnQuantizer() {
-  LOG(ERROR) << "GPU not support MKL-DNN quantization";
-}
-
-void GpuPassStrategy::EnableNgraph() {
-  LOG(ERROR) << "GPU not support Ngraph yet";
-}
-
-CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
-  // NOTE the large fusions should be located in the front, so that they will
-  // not be damaged by smaller ones.
-  passes_.assign({"simplify_with_basic_ops_pass",   //
-                  "attention_lstm_fuse_pass",       //
-                  "seqconv_eltadd_relu_fuse_pass",  //
-                  // "seqpool_concat_fuse_pass",    //
-                  "seqpool_cvm_concat_fuse_pass",  //
-                  // "embedding_fc_lstm_fuse_pass", //
-                  "fc_lstm_fuse_pass",             //
-                  "mul_lstm_fuse_pass",            //
-                  "fc_gru_fuse_pass",              //
-                  "mul_gru_fuse_pass",             //
-                  "seq_concat_fc_fuse_pass",       //
-                  "fc_fuse_pass",                  //
-                  "repeated_fc_relu_fuse_pass",    //
-                  "squared_mat_sub_fuse_pass",     //
-                  "conv_bn_fuse_pass",             //
-                  "conv_eltwiseadd_bn_fuse_pass",  //
-                  "is_test_pass",                  //
-                  // following pass should be located in the last, since
-                  // it will work on all fused ops.
-                  "runtime_context_cache_pass"});
-
-  use_gpu_ = false;
-}
-
-void CpuPassStrategy::EnableCUDNN() { LOG(ERROR) << "CPU not support cuDNN"; }
-
-void CpuPassStrategy::EnableMKLDNN() {
-// TODO(Superjomn) Consider the way to mix CPU with GPU.
-#ifdef PADDLE_WITH_MKLDNN
-  if (!use_mkldnn_) {
-    passes_.insert(passes_.begin(), "mkldnn_placement_pass");
-
-    for (auto &pass : std::vector<std::string>({
-             "depthwise_conv_mkldnn_pass",    //
-             "conv_bn_fuse_pass",             // Execute BN passes again to
-             "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
-             "conv_bias_mkldnn_fuse_pass",    //
-             "conv_transpose_bias_mkldnn_fuse_pass",
-             "conv3d_bias_mkldnn_fuse_pass",  //
-             "conv_elementwise_add_mkldnn_fuse_pass",
-             "conv_concat_relu_mkldnn_fuse_pass",
-             "conv_relu_mkldnn_fuse_pass",        //
-             "conv_leaky_relu_mkldnn_fuse_pass",  //
-             "conv_relu6_mkldnn_fuse_pass",       //
-             // Disabled due to topology-dependent speed-up
-             // "fc_mkldnn_pass"
-         })) {
-      passes_.push_back(pass);
-    }
-  }
-  use_mkldnn_ = true;
-#else
-  use_mkldnn_ = false;
-#endif
-}
-
-void CpuPassStrategy::EnableMkldnnQuantizer() {
-#ifdef PADDLE_WITH_MKLDNN
-  if (!use_mkldnn_quantizer_) {
-    passes_.push_back("cpu_quantize_placement_pass");
-  }
-  use_mkldnn_quantizer_ = true;
-#else
-  use_mkldnn_quantizer_ = false;
-#endif
-}
-
-void CpuPassStrategy::EnableNgraph() {
-#ifdef PADDLE_WITH_NGRAPH
-  if (!use_ngraph_) {
-    passes_.insert(passes_.begin(), "ngraph_subgraph_pass");
-  }
-  use_ngraph_ = true;
-#else
-  use_ngraph_ = false;
-#endif
-}
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
deleted file mode 100644
index 69bc5cd774a8df288ceda5cc4d1b1fb9bdcba296..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <sstream>
-#include <string>
-#include <vector>
-
-/*! \file */
-
-/*! \namespace paddle */
-namespace paddle {
-
-/** This is a pass builder based on string. It is part of inference API.
- */
-class PaddlePassBuilder {
- public:
-  explicit PaddlePassBuilder(const std::vector<std::string> &passes)
-      : passes_(passes) {}
-
-  void SetPasses(std::initializer_list<std::string> passes) {
-    passes_ = passes;
-  }
-
-  /** Append a pass to the end of the passes. */
-  void AppendPass(const std::string &pass_type);
-
-  /** Insert a pass to a specific position.
-   * @param idx the position to insert.
-   * @param pass_type the pass key.
-   */
-  void InsertPass(size_t idx, const std::string &pass_type);
-
-  /** Delete the `idx`-th pass. */
-  void DeletePass(size_t idx);
-
-  /** Delete all the passes that has type `pass_type`. */
-  void DeletePass(const std::string &pass_type);
-
-  void ClearPasses();
-  /** Append an analysis pass. */
-  void AppendAnalysisPass(const std::string &pass);
-
-  /** Visualize the computation graph after each pass by generating a DOT
-   * language file, one can draw them with the Graphviz toolkit.
-   */
-  void TurnOnDebug();
-
-  /** Human-readible information. */
-  std::string DebugString();
-
-  const std::vector<std::string> &AllPasses() const { return passes_; }
-  std::vector<std::string> AnalysisPasses() const {
-    auto passes = analysis_passes_;
-    // To make sure the ir_graph_to_program should be the last pass so any
-    // modication of IR will persist to the program.
-    passes.push_back("ir_graph_to_program_pass");
-    return passes;
-  }
-
- protected:
-  std::vector<std::string> analysis_passes_{
-      {"ir_graph_build_pass", "ir_graph_clean_pass", "ir_analysis_pass",
-       "ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass",
-       "inference_op_replace_pass"}};
-  std::vector<std::string> passes_;
-};
-
-/**Pass strategy to help control the IR passes.
- */
-class PassStrategy : public PaddlePassBuilder {
- public:
-  explicit PassStrategy(const std::vector<std::string> &passes)
-      : PaddlePassBuilder(passes) {}
-
-  /** Enable the use of cuDNN kernel
-   */
-  virtual void EnableCUDNN() {}
-
-  /** The MKLDNN control exists in both CPU and GPU mode, because there can be
-   * still some CPU kernels running in CPU mode.
-   */
-  virtual void EnableMKLDNN() {}
-
-  /** Enable NGRAPH optimization
-   */
-  virtual void EnableNgraph() {}
-
-  /** Enable MKLDNN quantize optimization
-   */
-  virtual void EnableMkldnnQuantizer() {}
-
-  bool use_gpu() const { return use_gpu_; }
-
-  virtual ~PassStrategy() = default;
-
- protected:
-  bool use_ngraph_{false};
-  bool use_gpu_{false};
-  bool use_mkldnn_{false};
-};
-
-/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
- */
-class CpuPassStrategy : public PassStrategy {
- public:
-  CpuPassStrategy();
-
-  explicit CpuPassStrategy(const CpuPassStrategy &other)
-      : PassStrategy(other.AllPasses()) {
-    use_gpu_ = other.use_gpu_;
-    use_ngraph_ = other.use_ngraph_;
-    use_mkldnn_ = other.use_mkldnn_;
-    use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
-  }
-
-  virtual ~CpuPassStrategy() = default;
-
-  void EnableCUDNN() override;
-  void EnableNgraph() override;
-  void EnableMKLDNN() override;
-  void EnableMkldnnQuantizer() override;
-
- protected:
-  bool use_ngraph_{false};
-  bool use_mkldnn_quantizer_{false};
-};
-
-/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
- */
-class GpuPassStrategy : public PassStrategy {
- public:
-  GpuPassStrategy();
-
-  explicit GpuPassStrategy(const GpuPassStrategy &other)
-      : PassStrategy(other.AllPasses()) {
-    use_gpu_ = true;
-    use_cudnn_ = other.use_cudnn_;
-  }
-
-  void EnableCUDNN() override;
-  void EnableNgraph() override;
-  void EnableMKLDNN() override;
-  void EnableMkldnnQuantizer() override;
-
-  virtual ~GpuPassStrategy() = default;
-
- protected:
-  bool use_cudnn_{false};
-};
-
-extern const std::vector<std::string> kTRTSubgraphPasses;
-extern const std::vector<std::string> kAnakinSubgraphPasses;
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh
deleted file mode 100755
index b6b7d1f20baf77c89ecbc149668da2ff2d2e3b5e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/check_symbol.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-lib=$1
-if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
-
-num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l)
-num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep "T " | wc -l)
-
-if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
-if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
-
-exit 0
diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h
deleted file mode 100644
index 1a13ba510384c010e476bf0ba0ad5b0ba84d3240..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/engine.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/framework.pb.h"
-
-namespace paddle {
-namespace inference {
-
-struct Buffer;
-enum class DeviceType { UNK = -1, CPU, GPU };
-
-/*
- * EngineBase is the base class of all inference engines. An inference engine
- * takes a paddle program as input, and outputs the result in fluid Tensor
- * format. It can be used to optimize performance of computation sub-blocks, for
- * example, break down the original block into sub-blocks and execute each
- * sub-blocks in different engines.
- *
- * For example:
- *   When inference, the resnet50 model can put most of the model into subgraph
- * and run it on a TensorRT engine.
- *
- * There are several engines such as TensorRT and other frameworks, so an
- * EngineBase is put forward to give an unified interface for all the
- * different engine implemention.
- */
-class EngineBase {
- public:
-  using DescType = ::paddle::framework::proto::BlockDesc;
-
-  // Build the model and do some preparation, for example, in TensorRT, run
-  // createInferBuilder, buildCudaEngine.
-  virtual void Build(const DescType& paddle_model) = 0;
-
-  // Execute the engine, that will run the inference network.
-  virtual void Execute(int batch_size) = 0;
-
-  virtual ~EngineBase() {}
-};  // class EngineBase
-
-struct Buffer {
-  void* buffer{nullptr};               // buffer should be allocated only once.
-  size_t max_size;                     // buffer allocated space.
-  size_t size;                         // data size.
-  DeviceType device{DeviceType::UNK};  // tells which device this buffer is on.
-};
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
deleted file mode 100644
index 8b379457a2d031dbe859562c1a8dade0badc56c2..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/io.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/io.h"
-
-#include <algorithm>
-#include <fstream>
-#include <vector>
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/version.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/pybind/pybind.h"
-
-DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
-DEFINE_bool(init_p2p, false, "Whether to init p2p.");
-DEFINE_int32(math_num_threads, 1,
-             "Number of threads used to run math functions.");
-
-namespace paddle {
-namespace inference {
-
-void Init(const std::vector<std::string> argv) {
-  framework::InitGflags(argv);
-  platform::SetNumThreads(FLAGS_math_num_threads);
-  // init devices
-  std::vector<int> devices;
-  std::string token;
-  std::istringstream tokenStream(FLAGS_devices);
-  while (std::getline(tokenStream, token, ',')) {
-    devices.push_back(std::stoi(token));
-  }
-  framework::InitDevices(FLAGS_init_p2p, devices);
-}
-
-void ReadBinaryFile(const std::string& filename, std::string* contents) {
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
-  fin.seekg(0, std::ios::end);
-  contents->clear();
-  contents->resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(contents->at(0)), contents->size());
-  fin.close();
-}
-
-bool IsPersistable(const framework::VarDesc* var) {
-  if (var->Persistable() &&
-      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST &&
-      var->GetType() != framework::proto::VarType::RAW) {
-    return true;
-  }
-  return false;
-}
-
-void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
-                      const framework::ProgramDesc& main_program,
-                      const std::string& dirname,
-                      const std::string& param_filename,
-                      bool model_from_memory = false) {
-  const framework::BlockDesc& global_block = main_program.Block(0);
-
-  framework::ProgramDesc* load_program = new framework::ProgramDesc();
-  framework::BlockDesc* load_block = load_program->MutableBlock(0);
-  std::vector<std::string> paramlist;
-
-  for (auto* var : global_block.AllVars()) {
-    if (IsPersistable(var)) {
-      VLOG(4) << "persistable variable's name: " << var->Name();
-
-      framework::VarDesc* new_var = load_block->Var(var->Name());
-      new_var->SetShape(var->GetShape());
-      new_var->SetDataType(var->GetDataType());
-      new_var->SetType(var->GetType());
-
-      if (var->GetType() !=
-          framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) {
-        new_var->SetLoDLevel(var->GetLoDLevel());
-      }
-
-      new_var->SetPersistable(true);
-
-      if (!param_filename.empty()) {
-        paramlist.push_back(new_var->Name());
-      } else {
-        // append_op
-        framework::OpDesc* op = load_block->AppendOp();
-        op->SetType("load");
-        op->SetOutput("Out", {new_var->Name()});
-        op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
-        op->CheckAttrs();
-      }
-    }
-  }
-
-  if (!param_filename.empty()) {
-    // sort paramlist to have consistent ordering
-    std::sort(paramlist.begin(), paramlist.end());
-    // append just the load_combine op
-    framework::OpDesc* op = load_block->AppendOp();
-    op->SetType("load_combine");
-    op->SetOutput("Out", paramlist);
-    op->SetAttr("file_path", {param_filename});
-    op->SetAttr("model_from_memory", {model_from_memory});
-    op->CheckAttrs();
-  }
-
-  executor->Run(*load_program, scope, 0, true, true);
-
-  delete load_program;
-}
-
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
-                                             framework::Scope* scope,
-                                             const std::string& dirname) {
-  std::string model_filename = dirname + "/__model__";
-  std::string program_desc_str;
-  VLOG(3) << "loading model from " << model_filename;
-  ReadBinaryFile(model_filename, &program_desc_str);
-
-  std::unique_ptr<framework::ProgramDesc> main_program(
-      new framework::ProgramDesc(program_desc_str));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %ld is not supported.",
-                 main_program->Version());
-
-  // model_from_memory is false in seperate parameters.
-  LoadPersistables(executor, scope, *main_program, dirname, "",
-                   false /* model_from_memory */);
-  return main_program;
-}
-
-std::unique_ptr<framework::ProgramDesc> Load(
-    framework::Executor* executor, framework::Scope* scope,
-    const std::string& prog_filename, const std::string& param_filename) {
-  std::string program_desc_str;
-  ReadBinaryFile(prog_filename, &program_desc_str);
-
-  std::unique_ptr<framework::ProgramDesc> main_program(
-      new framework::ProgramDesc(program_desc_str));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %ld is not supported.",
-                 main_program->Version());
-
-  LoadPersistables(executor, scope, *main_program, "", param_filename,
-                   false /* model_from_memory */);
-  return main_program;
-}
-
-std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
-    framework::Executor* executor, framework::Scope* scope,
-    const std::string& prog_buffer, const std::string& param_buffer) {
-  std::unique_ptr<framework::ProgramDesc> main_program(
-      new framework::ProgramDesc(prog_buffer));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %ld is not supported.",
-                 main_program->Version());
-
-  LoadPersistables(executor, scope, *main_program, "", param_buffer,
-                   true /* model_filename */);
-  return main_program;
-}
-
-void SaveVars(const framework::Scope& scope,
-              const std::vector<std::string>& vars, const std::string& dirname,
-              bool predicate) {
-  framework::ProgramDesc prog;
-  auto* block = prog.MutableBlock(0);
-  auto* op = block->AppendOp();
-  op->SetType("save_combine");
-  op->SetInput("X", vars);
-  op->SetAttr("file_path", dirname + "/param");
-  op->CheckAttrs();
-
-  platform::CPUPlace place;
-  framework::Executor exe(place);
-  exe.Run(prog, const_cast<framework::Scope*>(&scope), 0, true, true);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
deleted file mode 100644
index 317ef9d93acf3af174cb44da6099425fff1418eb..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/io.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/init.h"
-
-namespace paddle {
-namespace inference {
-
-void Init(const std::vector<std::string> argv);
-
-void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
-                      const framework::ProgramDesc& main_program,
-                      const std::string& dirname,
-                      const std::string& param_filename,
-                      bool model_from_memory);
-
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
-                                             framework::Scope* scope,
-                                             const std::string& dirname);
-
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
-                                             framework::Scope* scope,
-                                             const std::string& prog_filename,
-                                             const std::string& param_filename);
-
-std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
-    framework::Executor* executor, framework::Scope* scope,
-    const std::string& prog_buffer, const std::string& param_buffer);
-
-// Save the variables from a scope to disk.
-void SaveVars(const framework::Scope& scope,
-              const std::vector<std::string>& vars, const std::string& dirname,
-              bool predicate = true);
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map
deleted file mode 100644
index 05935701635d9ca3199c767243d492f1a1868822..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/paddle_fluid.map
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-	global:
-		*paddle*;
-		*Pass*;
-		*profile*;
-	local:
-		*;
-};
diff --git a/paddle/fluid/inference/paddle_fluid.sym b/paddle/fluid/inference/paddle_fluid.sym
deleted file mode 100644
index ef2a04d788aa86b7f6a61c4af479d70d1137f374..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/paddle_fluid.sym
+++ /dev/null
@@ -1 +0,0 @@
-*paddle*
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
deleted file mode 100644
index 4f3da10f6eb133659a6577719b404164fa6c166f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
-nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost)
-nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
-nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
-add_subdirectory(plugin)
-add_subdirectory(convert)
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
deleted file mode 100644
index b63b75f78901d3f3df38aea911417b697f540dd4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-# Add TRT tests
-nv_library(tensorrt_converter
-           SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-                batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
-                pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc
-shuffle_channel_op.cc swish_op.cc
-           DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
-
-nv_test(test_op_converter SRCS test_op_converter.cc DEPS
-  ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
-
-# TODO(xingzhaolong): fix the the following ci ut error.
-
-#nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
-#nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op)
-#nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op)
-#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op)
-#nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op)
-#nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op tensorrt_plugin)
-#nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
-#             elementwise_add_op elementwise_mul_op)
-#nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op)
-#nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine batch_norm_op)
-#nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine concat_op)
-#nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine dropout_op)
-#nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op)
-#nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
-#             split_op concat_op)
-#nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
-#        prelu_op)
-#nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op)
-
-#nv_test(test_shuffle_channel_op SRCS test_shuffle_channel_op.cc shuffle_channel_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine shuffle_channel_op)
-
-#nv_test(test_swish_op SRCS test_swish_op.cc swish_op.cc
-#        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op tensorrt_plugin)
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
deleted file mode 100644
index 18de448690534656cdfe851c74a2b390264b1b6b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class ActivationOpConverter : public OpConverter {
- public:
-  ActivationOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    // Here the two nullptr looks strange, that's because the
-    // framework::OpDesc's constructor is strange.
-    framework::OpDesc op_desc(op, nullptr);
-    VLOG(3)
-        << "convert a fluid Activation op to tensorrt activation layer whose "
-           "type is "
-        << op_type_;
-    const nvinfer1::ITensor* input_tensor =
-        engine_->GetITensor(op_desc.Input("X")[0]);
-
-    auto op_pair = ops.find(op_type_);
-    if (op_pair == ops.end()) {
-      PADDLE_THROW("Wrong activation op type!");
-    }
-
-    nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
-        op_pair->second);
-
-#if IS_TRT_VERSION_GE(5130)
-    // max(alpha, min(beta, x))
-    if (op_type_ == "relu6") {
-      layer->setAlpha(0.);
-      layer->setBeta(6.);
-    }
-#endif
-
-    auto output_name = op_desc.Output("Out")[0];
-
-    RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode);
-    if (op_desc.HasAttr("out_scale")) {
-#if IS_TRT_VERSION_GE(5130)
-      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
-      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
-#endif
-    }
-  }
-
- protected:
-  std::string op_type_;
-  static const std::unordered_map<std::string, nvinfer1::ActivationType> ops;
-};
-
-const std::unordered_map<std::string, nvinfer1::ActivationType>
-    ActivationOpConverter::ops = {
-        {"relu", nvinfer1::ActivationType::kRELU},
-        {"sigmoid", nvinfer1::ActivationType::kSIGMOID},
-        {"tanh", nvinfer1::ActivationType::kTANH},
-#if IS_TRT_VERSION_GE(5130)
-        {"relu6", nvinfer1::ActivationType::kCLIP},
-#endif
-};
-
-class ReluOpConverter : public ActivationOpConverter {
- public:
-  ReluOpConverter() { op_type_ = "relu"; }
-};
-
-class SigmoidOpConverter : public ActivationOpConverter {
- public:
-  SigmoidOpConverter() { op_type_ = "sigmoid"; }
-};
-
-class TanhOpConverter : public ActivationOpConverter {
- public:
-  TanhOpConverter() { op_type_ = "tanh"; }
-};
-
-class Relu6OpConverter : public ActivationOpConverter {
- public:
-  Relu6OpConverter() { op_type_ = "relu6"; }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
-REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter);
-REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter);
-REGISTER_TRT_OP_CONVERTER(relu6, Relu6OpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
deleted file mode 100644
index 25f0d866dcdc7b0ab1074fe132fa037e78908622..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <math.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class BatchNormOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm";
-
-    framework::OpDesc op_desc(op, nullptr);
-    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1);   // Bias is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1);   // Mean is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1);  // Scale is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Input("Variance").size(),
-                      1);  // Variance is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1);
-
-    auto* X = engine_->GetITensor(op_desc.Input("X").front());
-    // Declare weights
-    auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front());
-    auto* Mean_v = scope.FindVar(op_desc.Input("Mean").front());
-    auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front());
-    auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front());
-    const float eps = boost::get<float>(op_desc.GetAttr("epsilon"));
-
-    PADDLE_ENFORCE_NOT_NULL(Bias_v);
-    PADDLE_ENFORCE_NOT_NULL(Mean_v);
-    PADDLE_ENFORCE_NOT_NULL(Scale_v);
-    PADDLE_ENFORCE_NOT_NULL(Variance_v);
-
-    // get tensor
-    auto* Bias_t = Bias_v->GetMutable<framework::LoDTensor>();
-    auto* Mean_t = Mean_v->GetMutable<framework::LoDTensor>();
-    auto* Scale_t = Scale_v->GetMutable<framework::LoDTensor>();
-    auto* Variance_t = Variance_v->GetMutable<framework::LoDTensor>();
-
-    // create temp tensor for weights
-    framework::LoDTensor bias_tensor;
-    framework::LoDTensor mean_tensor;
-    framework::LoDTensor scale_tensor;
-    framework::LoDTensor variance_tensor;
-
-    bias_tensor.Resize(Bias_t->dims());
-    mean_tensor.Resize(Mean_t->dims());
-    scale_tensor.Resize(Scale_t->dims());
-    variance_tensor.Resize(Variance_t->dims());
-
-    platform::CPUPlace cpu_place;
-    // copy data from gpu to cpu
-    TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
-    TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
-    TensorCopySync((*Scale_t), cpu_place, &scale_tensor);
-    TensorCopySync((*Variance_t), cpu_place, &variance_tensor);
-
-    auto* bias_data = bias_tensor.mutable_data<float>(platform::CPUPlace());
-    auto* mean_data = mean_tensor.mutable_data<float>(platform::CPUPlace());
-    auto* scale_data = scale_tensor.mutable_data<float>(platform::CPUPlace());
-    auto* variance_data =
-        variance_tensor.mutable_data<float>(platform::CPUPlace());
-
-    std::unique_ptr<framework::LoDTensor> combile_scale_tensor(
-        new framework::LoDTensor());
-    std::unique_ptr<framework::LoDTensor> combile_bias_tensor(
-        new framework::LoDTensor());
-
-    combile_scale_tensor->Resize(scale_tensor.dims());
-    combile_bias_tensor->Resize(bias_tensor.dims());
-
-    auto* combile_scale_data =
-        combile_scale_tensor->mutable_data<float>(platform::CPUPlace());
-    auto* combile_bias_data =
-        combile_bias_tensor->mutable_data<float>(platform::CPUPlace());
-
-    size_t ele_num = combile_scale_tensor->memory_size() / sizeof(float);
-
-    for (size_t i = 0; i < ele_num; i++) {
-      float scale = scale_data[i];
-      float bias = bias_data[i];
-      float mean = mean_data[i];
-      float variance = variance_data[i];
-      combile_scale_data[i] = scale / sqrtf(variance + eps);
-      combile_bias_data[i] = bias - mean * combile_scale_data[i];
-    }
-
-    TensorRTEngine::Weight scale_weights{
-        nvinfer1::DataType::kFLOAT, static_cast<void*>(combile_scale_data),
-        combile_scale_tensor->memory_size() / sizeof(float)};
-    TensorRTEngine::Weight shift_weights{
-        nvinfer1::DataType::kFLOAT, static_cast<void*>(combile_bias_data),
-        combile_bias_tensor->memory_size() / sizeof(float)};
-    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
-                                         0};
-
-    nvinfer1::IScaleLayer* layer =
-        TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast<nvinfer1::ITensor*>(X),
-                             nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(),
-                             scale_weights.get(), power_weights.get());
-
-    auto output_name = op_desc.Output("Y").front();
-    engine_->SetWeights(op_desc.Input("Bias").front(),
-                        std::move(combile_bias_tensor));
-    engine_->SetWeights(op_desc.Input("Scale").front(),
-                        std::move(combile_scale_tensor));
-    RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode);
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(batch_norm, BatchNormOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
deleted file mode 100644
index ec771850edf5f4f0207fb664e26b2d9b98a7a128..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-/*
- * ConcatOp
- */
-class ConcatOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias";
-
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    std::vector<nvinfer1::ITensor*> itensors;
-    for (auto& input_name : op_desc.Input("X")) {
-      itensors.push_back(engine_->GetITensor(input_name));
-    }
-    int axis = boost::get<int>(op_desc.GetAttr("axis"));
-    PADDLE_ENFORCE(axis > 0,
-                   "The axis attr of Concat op should be large than 0 for trt");
-
-    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
-                                       itensors.size());
-    axis = axis - 1;  // Remove batch dim
-    layer->setAxis(axis);
-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(layer, "concat", {output_name}, test_mode);
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
deleted file mode 100644
index 73bfa800f0900d79394863fb9eb730c9e3c5c560..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-template <typename RegistFunc, typename SetDilationFunc>
-void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
-                   const framework::Scope& scope, bool test_mode,
-                   RegistFunc fadd_layer, SetDilationFunc fset_dilation,
-                   const std::string& name) {
-  VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias";
-
-  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
-  PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
-
-  PADDLE_ENFORCE(engine != nullptr);
-  auto* X = engine->GetITensor(op_desc.Input("Input").front());
-  auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
-  PADDLE_ENFORCE_NOT_NULL(Y_v);
-  auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-  float* weight_data = nullptr;
-  bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
-
-  if (enable_int8) {
-#if IS_TRT_VERSION_GE(5000)
-    float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
-    auto weight_scale =
-        boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
-    weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t,
-                                           true, weight_scale);
-    engine->SetTensorDynamicRange(X, in_scale);
-#endif
-  } else {
-    weight_data =
-        engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, false);
-  }
-
-  PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
-  const int n_output = Y_t->dims()[0];
-  const int n_input = Y_t->dims()[1];
-  const int filter_h = Y_t->dims()[2];
-  const int filter_w = Y_t->dims()[3];
-  const int groups = boost::get<int>(op_desc.GetAttr("groups"));
-  const std::vector<int> dilations =
-      boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
-  const std::vector<int> strides =
-      boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-  const std::vector<int> paddings =
-      boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-
-  nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
-  nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
-  nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-  nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
-
-  TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
-                                static_cast<void*>(weight_data),
-                                static_cast<size_t>(Y_t->numel())};
-
-  TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
-  auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
-                           nv_ksize, weight, bias);
-  PADDLE_ENFORCE(layer != nullptr);
-  layer->setStride(nv_strides);
-  layer->setPadding(nv_paddings);
-  layer->setNbGroups(groups);
-  // set dilations
-  fset_dilation(layer, nv_dilations);
-
-  auto output_name = op_desc.Output("Output").front();
-  layer->setName((name + " (Output: " + output_name + ")").c_str());
-  layer->getOutput(0)->setName(output_name.c_str());
-  engine->SetITensor(output_name, layer->getOutput(0));
-
-#if IS_TRT_VERSION_GE(5000)
-  if (enable_int8) {
-    float output_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
-    engine->SetTensorDynamicRange(layer->getOutput(0), output_scale);
-  }
-#endif
-
-  if (test_mode) {
-    engine->DeclareOutput(output_name);
-  }
-}
-
-class Conv2dOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    ConvertConv2d(
-        engine_, op, scope, test_mode,
-        [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
-            int n_input,                             /* Conv input maps */
-            nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
-            TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
-          auto* layer =
-              TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
-                                   ksize, weight.get(), bias.get());
-          return layer;
-        },
-        [](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
-          layer->setDilation(dilations);
-        },
-        "conv2d");
-  }
-};
-
-class Deconv2dOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    ConvertConv2d(
-        engine_, op, scope, test_mode,
-        [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
-            int n_input,                             /* Deconv output maps */
-            nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
-            TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
-          auto* layer =
-              TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input,
-                                   ksize, weight.get(), bias.get());
-          return layer;
-        },
-        [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
-          PADDLE_ENFORCE(
-              dilations.d[0] == 1 && dilations.d[1] == 1,
-              "Dilations must be (1, 1) for tensorRT, but given (%d, %d)",
-              dilations.d[0], dilations.d[1]);
-        },
-        "conv2d_transpose");
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
-REGISTER_TRT_OP_CONVERTER(conv2d_transpose, Deconv2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
deleted file mode 100644
index cd28c6d98a05aa6e48b4cb0b01849d33fcfd4c2c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-/*
- * DropoutOp. This Layer doesn't has weights.
- */
-class DropoutOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3) << "convert a fluid dropout op to tensorrt dropout layer";
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
-    float dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
-
-    std::string downgrade_in_infer = "";
-    if (op_desc.HasAttr("dropout_implementation")) {
-      downgrade_in_infer =
-          boost::get<std::string>(op_desc.GetAttr("dropout_implementation"));
-    }
-
-    if (!downgrade_in_infer.empty() &&
-        downgrade_in_infer == "upscale_in_train") {
-      auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
-      auto output_name = op_desc.Output("Out")[0];
-      RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode);
-      return;
-    }
-
-    platform::CPUPlace cpu_place;
-    std::unique_ptr<framework::LoDTensor> weight_tensor(
-        new framework::LoDTensor());
-    weight_tensor->Resize(framework::make_ddim({1}));
-    auto* weight_data =
-        weight_tensor->mutable_data<float>(platform::CPUPlace());
-    weight_data[0] = 1 - dropout_prob;
-
-    TensorRTEngine::Weight scale_weights{
-        nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data),
-        weight_tensor->memory_size() / sizeof(float)};
-    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, nullptr,
-                                         0};
-    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
-                                         0};
-
-    auto* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Scale, *const_cast<nvinfer1::ITensor*>(input1),
-        nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(),
-        power_weights.get());
-
-    engine_->SetWeights(op_desc.Output("Out").front() + "_dropout",
-                        std::move(weight_tensor));
-    auto output_name = op_desc.Output("Out")[0];
-
-    RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode);
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(dropout);
-REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
deleted file mode 100644
index c61dd753a33e2eca1c62f13c1d310faa89ccce32..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ /dev/null
@@ -1,264 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-static bool CheckDims(const nvinfer1::Dims& dims_x,
-                      const nvinfer1::Dims& dims_y) {
-  if (dims_x.nbDims != dims_y.nbDims) {
-    return false;
-  }
-  for (int i = 0; i < dims_x.nbDims; i++) {
-    if (dims_x.d[i] != dims_y.d[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
-class ElementwiseWeightOpConverter : public OpConverter {
- public:
-  ElementwiseWeightOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    // Here the two nullptr looks strange, that's because the
-    // framework::OpDesc's constructor is strange.
-    nvinfer1::ILayer* layer = nullptr;
-    framework::OpDesc op_desc(op, nullptr);
-    VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
-
-    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-    auto* X = engine_->GetITensor(op_desc.Input("X").front());
-    nvinfer1::Dims dims_x = X->getDimensions();
-    PADDLE_ENFORCE(dims_x.nbDims >= 3, "x dims experts 3, but %d is given.",
-                   dims_x.nbDims);
-
-    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
-    PADDLE_ENFORCE_NOT_NULL(Y_v);
-    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-    float* weight_data = nullptr;
-    weight_data =
-        engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false);
-
-    auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-
-    std::vector<int> dims_y = framework::vectorize<int>(Y_t->dims());
-    if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
-      if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
-    }
-
-    if (static_cast<int>(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) {
-      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-    } else if (static_cast<int>(dims_y.size()) == dims_x.nbDims &&
-               dims_y[0] == dims_x.d[0]) {
-      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      for (int i = 1; i < dims_x.nbDims; i++) {
-        if (dims_y[i] != dims_x.d[i]) {
-          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-          break;
-        }
-      }
-      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
-        for (int i = 1; i < dims_x.nbDims; i++) {
-          if (dims_y[i] != 1)
-            PADDLE_THROW(
-                "TensorRT unsupported weight shape for Elementwise op!");
-        }
-      }
-    } else {
-      PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
-    }
-
-    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
-                                         static_cast<void*>(weight_data),
-                                         static_cast<size_t>(Y_t->numel())};
-    TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
-                                         0};
-    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
-                                         0};
-    if (op_type_ == "add") {
-      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
-          engine_, Scale, *X, scale_mode, shift_weights.get(),
-          scale_weights.get(), power_weights.get());
-      layer = scale_layer;
-    } else if (op_type_ == "mul") {
-      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
-          engine_, Scale, *X, scale_mode, scale_weights.get(),
-          shift_weights.get(), power_weights.get());
-      layer = scale_layer;
-    }
-
-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name},
-                             test_mode);
-    if (op_desc.HasAttr("out_scale")) {
-#if IS_TRT_VERSION_GE(5000)
-      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
-      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
-#endif
-    }
-  }
-
- protected:
-  std::string op_type_;
-};
-
-class ElementwiseTensorOpConverter : public OpConverter {
- public:
-  ElementwiseTensorOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    auto op_pair = ops.find(op_type_);
-    PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!");
-
-    // Here the two nullptr looks strange, that's because the
-    // framework::OpDesc's constructor is strange.
-    framework::OpDesc op_desc(op, nullptr);
-    nvinfer1::ILayer* layer = nullptr;
-
-    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-
-    auto* X = engine_->GetITensor(op_desc.Input("X").front());
-    auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
-    nvinfer1::Dims dims_x = X->getDimensions();
-    nvinfer1::Dims dims_y = Y->getDimensions();
-
-    int axis = boost::get<int>(op_desc.GetAttr("axis"));
-    auto output_name = op_desc.Output("Out")[0];
-    if (CheckDims(dims_x, dims_y)) {
-      // The two input tensor should have the same dims
-      VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
-      nvinfer1::IElementWiseLayer* elet_layer = TRT_ENGINE_ADD_LAYER(
-          engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
-          *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
-
-      layer = elet_layer;
-    } else {
-      VLOG(3) << "Convert a fluid elementwise op to TensorRT "
-                 "ElementWisePluginLayer";
-
-      plugin::ElementWisePlugin* plugin =
-          new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
-      plugin->AddInput(X);
-      plugin->AddInput(Y);
-      nvinfer1::IPluginLayer* plugin_layer = engine_->AddPlugin(
-          const_cast<nvinfer1::ITensor* const*>(plugin->GetInputs().data()), 2,
-          reinterpret_cast<plugin::PluginTensorRT*>(plugin));
-
-      layer = plugin_layer;
-    }
-    RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode);
-    if (op_desc.HasAttr("out_scale")) {
-#if IS_TRT_VERSION_GE(5000)
-      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
-      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
-#endif
-    }
-  }
-
- protected:
-  static const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
-      ops;
-  std::string op_type_;
-};
-
-const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
-    ElementwiseTensorOpConverter::ops = {
-        {"add", nvinfer1::ElementWiseOperation::kSUM},
-        {"mul", nvinfer1::ElementWiseOperation::kPROD},
-        {"sub", nvinfer1::ElementWiseOperation::kSUB},
-        {"div", nvinfer1::ElementWiseOperation::kDIV},
-        {"min", nvinfer1::ElementWiseOperation::kMIN},
-        {"pow", nvinfer1::ElementWiseOperation::kPOW},
-        {"max", nvinfer1::ElementWiseOperation::kMAX},
-};
-
-class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
- public:
-  ElementwiseWeightAddOpConverter() { op_type_ = "add"; }
-};
-
-class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
- public:
-  ElementwiseWeightMulOpConverter() { op_type_ = "mul"; }
-};
-
-class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
- public:
-  ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
-};
-
-class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter {
- public:
-  ElementwiseTensorMulOpConverter() { op_type_ = "mul"; }
-};
-
-class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter {
- public:
-  ElementwiseTensorSubOpConverter() { op_type_ = "sub"; }
-};
-
-class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter {
- public:
-  ElementwiseTensorDivOpConverter() { op_type_ = "div"; }
-};
-
-class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter {
- public:
-  ElementwiseTensorMinOpConverter() { op_type_ = "min"; }
-};
-
-class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter {
- public:
-  ElementwiseTensorMaxOpConverter() { op_type_ = "max"; }
-};
-
-class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
- public:
-  ElementwiseTensorPowOpConverter() { op_type_ = "pow"; }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(elementwise_add_weight,
-                          ElementwiseWeightAddOpConverter);
-REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight,
-                          ElementwiseWeightMulOpConverter);
-
-REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
-                          ElementwiseTensorAddOpConverter);
-REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor,
-                          ElementwiseTensorSubOpConverter);
-REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor,
-                          ElementwiseTensorDivOpConverter);
-REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor,
-                          ElementwiseTensorMulOpConverter);
-REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor,
-                          ElementwiseTensorMaxOpConverter);
-REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor,
-                          ElementwiseTensorMinOpConverter);
-REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor,
-                          ElementwiseTensorPowOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
deleted file mode 100644
index ea108d6a07e32132002594c0f9a6819ad56243a0..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
-// tensorflow.
-// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
-template <typename T>
-void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
-              T* odata, nvinfer1::DimsHW ostrides) {
-  for (int h = 0; h < shape.h(); ++h) {
-    for (int w = 0; w < shape.w(); ++w) {
-      odata[h * ostrides.h() + w * ostrides.w()] =
-          idata[h * istrides.h() + w * istrides.w()];
-    }
-  }
-}
-// indata c * k
-// Reorder the data layout from CK to KC.
-void ReorderCKtoKC(TensorRTEngine::Weight& iweights,  // NOLINT
-                   TensorRTEngine::Weight* oweights) {
-  int c = iweights.dims[0];
-  int k = iweights.dims[1];
-  oweights->dims.assign({k, c});
-  nvinfer1::DimsHW istrides = {1, k};
-  nvinfer1::DimsHW ostrides = {c, 1};
-  Reorder2({k, c}, static_cast<float const*>(iweights.get().values), istrides,
-           static_cast<float*>(const_cast<void*>(oweights->get().values)),
-           ostrides);
-}
-
-/*
- * FC converter convert a MUL op in Fluid to a FC layer in TRT.
- */
-class FcOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias";
-    framework::OpDesc op_desc(op, nullptr);
-
-    auto input_names = op_desc.InputNames();
-    bool with_bias = input_names.size() >= 3;
-    std::string w_name = "Y";
-    std::string i_name = "X";
-    if (with_bias) {
-      w_name = "W";
-      i_name = "Input";
-    }
-
-    // Declare inputs
-    auto* X = engine_->GetITensor(op_desc.Input(i_name).front());
-
-    // Declare weights
-    auto* Y_v = scope.FindVar(op_desc.Input(w_name).front());
-    PADDLE_ENFORCE_NOT_NULL(Y_v);
-    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-    // This may trigger a GPU->CPU copy, because TRT's weight can only be
-    // assigned from CPU memory, which can't be avoided.
-    float* weight_data = nullptr;
-    bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
-    if (enable_int8) {
-#if IS_TRT_VERSION_GE(5000)
-      float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
-      auto weight_scale =
-          boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
-      weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(),
-                                              Y_t, true, weight_scale);
-      engine_->SetTensorDynamicRange(X, in_scale);
-#endif
-    } else {
-      weight_data =
-          engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t, false);
-    }
-
-    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
-    size_t n_output = Y_t->dims()[1];
-
-    std::unique_ptr<framework::Tensor> tmp(new framework::LoDTensor());
-    tmp->Resize(Y_t->dims());
-
-    memcpy(tmp->mutable_data<float>(platform::CPUPlace()), weight_data,
-           Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
-    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
-                                  static_cast<void*>(weight_data),
-                                  static_cast<size_t>(Y_t->numel())};
-    TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
-                                      static_cast<void*>(tmp->data<float>()),
-                                      static_cast<size_t>(Y_t->numel()));
-    weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
-    tmp_weight.dims = weight.dims;
-
-    // The data layout of TRT FC layer's weight is different from fluid's FC,
-    // need to reorder the elements.
-    ReorderCKtoKC(weight, &tmp_weight);
-
-    // Currently, the framework can only handle one fluid op -> one TRT layer,
-    // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
-    // handle `mul`, leave `add` as another layer.
-    // DEBUG
-    float* bias_data = nullptr;
-    int bias_num = 0;
-    if (with_bias) {
-      auto* b_v = scope.FindVar(op_desc.Input("Bias").front());
-      auto* b_t = b_v->GetMutable<framework::LoDTensor>();
-      bias_data =
-          engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t, false);
-      bias_num = b_t->numel();
-    }
-    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
-                                static_cast<void*>(bias_data),
-                                static_cast<size_t>(bias_num)};
-
-    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
-                                       *const_cast<nvinfer1::ITensor*>(X),
-                                       n_output, tmp_weight.get(), bias.get());
-
-    engine_->SetWeights(op_desc.Input(w_name).front(), std::move(tmp));
-    auto output_name = op_desc.Output("Out").front();
-
-    RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode);
-    if (enable_int8) {
-#if IS_TRT_VERSION_GE(5000)
-      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
-      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
-#endif
-    }
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
deleted file mode 100644
index 854f434d93e81237dc85c5df62debcf3b3824b78..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
-#include <cuda.h>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-using platform::is_gpu_place;
-using platform::is_cpu_place;
-
-class DefaultIOConverter : public EngineIOConverter {
- public:
-  DefaultIOConverter() {}
-  // NOTE out is GPU memory.
-  virtual void operator()(const LoDTensor& in, void* out,
-                          size_t max_size) override {
-    PADDLE_ENFORCE(out != nullptr);
-    PADDLE_ENFORCE(stream_ != nullptr);
-    const auto& place = in.place();
-    size_t size = in.memory_size();
-    PADDLE_ENFORCE_LE(size, max_size);
-    if (is_cpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
-                                           cudaMemcpyHostToDevice, *stream_));
-    } else if (is_gpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
-                                           cudaMemcpyDeviceToDevice, *stream_));
-    } else {
-      PADDLE_THROW("Unknown device for converter");
-    }
-    cudaStreamSynchronize(*stream_);
-  }
-  // NOTE in is GPU memory.
-  virtual void operator()(const void* in, LoDTensor* out,
-                          size_t max_size) override {
-    PADDLE_ENFORCE(in != nullptr);
-    PADDLE_ENFORCE(stream_ != nullptr);
-    const auto& place = out->place();
-    size_t size = out->memory_size();
-    PADDLE_ENFORCE_LE(size, max_size);
-    if (is_cpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
-                                           cudaMemcpyDeviceToHost, *stream_));
-    } else if (is_gpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
-                                           cudaMemcpyDeviceToDevice, *stream_));
-    } else {
-      PADDLE_THROW("Unknown device for converter");
-    }
-    cudaStreamSynchronize(*stream_);
-  }
-};
-
-// fluid LodTensor <-> tensorrt ITensor
-REGISTER_TENSORRT_IO_CONVERTER(default, DefaultIOConverter);
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h
deleted file mode 100644
index 5daa242f6ab802a50fa6105f0102b817b700f461..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-using framework::LoDTensor;
-
-/*
- * Convert Input from Fluid to TensorRT Engine.
- * Convert Output from TensorRT Engine to Fluid.
- *
- * Note that TensorRT's ITensor follows row major, NCHW. Fluid is also row
- * major,
- * so in the default case just need to copy the data.
- */
-class EngineIOConverter {
- public:
-  EngineIOConverter() {}
-
-  virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
-  virtual void operator()(const void* in, LoDTensor* out, size_t max_size) {}
-
-  void SetStream(cudaStream_t* stream) { stream_ = stream; }
-
-  static void ConvertInput(const std::string& op_type, const LoDTensor& in,
-                           void* out, size_t max_size, cudaStream_t* stream) {
-    PADDLE_ENFORCE(stream != nullptr);
-    auto* converter = Registry<EngineIOConverter>::Global().Lookup(
-        op_type, "default" /* default_type */);
-    PADDLE_ENFORCE_NOT_NULL(converter);
-    converter->SetStream(stream);
-    (*converter)(in, out, max_size);
-  }
-
-  static void ConvertOutput(const std::string& op_type, const void* in,
-                            LoDTensor* out, size_t max_size,
-                            cudaStream_t* stream) {
-    PADDLE_ENFORCE(stream != nullptr);
-    auto* converter = Registry<EngineIOConverter>::Global().Lookup(
-        op_type, "default" /* default_type */);
-    PADDLE_ENFORCE_NOT_NULL(converter);
-    converter->SetStream(stream);
-    (*converter)(in, out, max_size);
-  }
-
-  virtual ~EngineIOConverter() {}
-
- protected:
-  cudaStream_t* stream_{nullptr};
-};
-
-#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__)                 \
-  struct trt_io_##op_type__##_converter {                                      \
-    trt_io_##op_type__##_converter() {                                         \
-      Registry<EngineIOConverter>::Global().Register<Converter__>(#op_type__); \
-    }                                                                          \
-  };                                                                           \
-  trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
deleted file mode 100644
index f3c714009f82a698560604c70512968a9d833f78..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-// LeakyRelu converter from fluid to tensorRT
-class LeakyReluOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert fluid leaky_relu op to tensorrt layer";
-
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    int input_num = op_desc.Input("X").size();
-    PADDLE_ENFORCE(input_num == 1);
-    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
-    // Get output
-    size_t output_num = op_desc.Output("Out").size();
-    PADDLE_ENFORCE(output_num == 1);
-    // Get attrs
-    float alpha = boost::get<float>(op_desc.GetAttr("alpha"));
-    nvinfer1::ILayer* output_layer = nullptr;
-
-#if IS_TRT_VERSION_GE(5100)
-    nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Activation, *input, nvinfer1::ActivationType::kLEAKY_RELU);
-    layer->setAlpha(alpha);
-    output_layer = layer;
-#else
-    platform::CPUPlace place;
-    std::unique_ptr<framework::LoDTensor> alpha_tensor(
-        new framework::LoDTensor());
-    alpha_tensor->Resize(framework::make_ddim({2}));
-    float* alpha_data = alpha_tensor->mutable_data<float>(place);
-    alpha_data[0] = alpha;
-    alpha_data[1] = 1.f - alpha;
-    // the leaky relu formula y = (x > 0) ? x : alpha * x is equal to
-    // y = alpha * x + (x > 0) ? (1 - alpha) * x : 0
-    TensorRTEngine::Weight scale{nvinfer1::DataType::kFLOAT, &alpha_data[0], 1};
-    TensorRTEngine::Weight shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    TensorRTEngine::Weight power{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    // y_scale = alpha * x
-    auto* scale_layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, shift.get(),
-        scale.get(), power.get());
-    PADDLE_ENFORCE(nullptr != scale_layer);
-    // y_relu = (x > 0) : x : 0
-    auto* relu_layer = TRT_ENGINE_ADD_LAYER(engine_, Activation, *input,
-                                            nvinfer1::ActivationType::kRELU);
-    PADDLE_ENFORCE(nullptr != relu_layer);
-    //
-    TensorRTEngine::Weight sub_scale{nvinfer1::DataType::kFLOAT, &alpha_data[1],
-                                     1};
-    auto* scale_relu_layer =
-        TRT_ENGINE_ADD_LAYER(engine_, Scale, *(relu_layer->getOutput(0)),
-                             nvinfer1::ScaleMode::kUNIFORM, shift.get(),
-                             sub_scale.get(), power.get());
-    PADDLE_ENFORCE(nullptr != scale_relu_layer);
-    output_layer =
-        TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(scale_layer->getOutput(0)),
-                             *(scale_relu_layer->getOutput(0)),
-                             nvinfer1::ElementWiseOperation::kSUM);
-    PADDLE_ENFORCE(nullptr != output_layer);
-    // keep alpha tensor to avoid release it's memory
-    std::string alpha_name = op_desc.Output("Out")[0] + "_alpha";
-    PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) ==
-                   engine_->weight_map.end());
-    engine_->SetWeights(alpha_name, std::move(alpha_tensor));
-#endif
-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name},
-                             test_mode);
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(leaky_relu, LeakyReluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
deleted file mode 100644
index 5b6aaad49833cedbd8d1ee0ec5d24c7f983190e6..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-/*
- * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
- */
-class MulOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias";
-
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
-    auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
-    // Both the input1 and input2 do not need transpose.
-    auto* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, MatrixMultiply, *const_cast<nvinfer1::ITensor*>(input1), false,
-        *const_cast<nvinfer1::ITensor*>(input2), false);
-
-    auto output_name = op_desc.Output("Out")[0];
-    engine_->SetITensor(output_name, layer->getOutput(0));
-    if (test_mode) {  // the test framework can not determine which is the
-                      // output, so place the declaration inside.
-      engine_->DeclareOutput(output_name);
-    }
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
deleted file mode 100644
index 3a2deae360605f0e6a98d672098bb22359fa9ac6..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-using FluidDT = framework::proto::VarType_Type;
-using TRT_DT = nvinfer1::DataType;
-
-namespace {  // NOLINT
-
-TRT_DT FluidDataType2TRT(FluidDT type) {
-  switch (type) {
-    case FluidDT::VarType_Type_FP32:
-      return TRT_DT::kFLOAT;
-    case FluidDT::VarType_Type_INT32:
-      return TRT_DT::kINT32;
-    default:
-      return TRT_DT::kINT32;
-  }
-  PADDLE_THROW("unkown type");
-  return TRT_DT::kINT32;
-}
-
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
-  PADDLE_ENFORCE_GT(shape.size(), 1UL,
-                    "TensorRT' tensor input requires at least 2 dimensions");
-  PADDLE_ENFORCE_LE(shape.size(), 4UL,
-                    "TensorRT' tensor input requires at most 4 dimensions");
-  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
-  if (shape.size() == 4UL)
-    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
-  return nvinfer1::DimsCHW(shape[1], 1, 1);
-}
-
-}  // namespace // NOLINT
-
-/*
- * Convert Op from Fluid to TensorRT Engine.
- */
-class OpConverter {
- public:
-  OpConverter() {}
-
-  // Converter logic for an op.
-  virtual void operator()(const framework::proto::OpDesc& op,
-                          const framework::Scope& scope,
-                          bool test_mode = false) {}
-
-  // Convert a single fluid operator and add the corresponding layer to TRT.
-  // test_mode: whether the instance executes in an unit test.
-  void ConvertOp(const framework::proto::OpDesc& op,
-                 const std::unordered_set<std::string>& parameters,
-                 const framework::Scope& scope, TensorRTEngine* engine,
-                 bool test_mode = false) {
-    framework::OpDesc op_desc(op, nullptr);
-
-    OpConverter* it{nullptr};
-
-    if (op_desc.Type() == "mul") {
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
-      std::string Y = op_desc.Input("Y")[0];
-      if (parameters.count(Y)) {
-        it = Registry<OpConverter>::Global().Lookup("fc");
-      }
-    }
-    if (op_desc.Type().find("elementwise") != std::string::npos) {
-      static std::unordered_set<std::string> add_tensor_op_set{
-          "add", "mul", "sub", "div", "max", "min", "pow"};
-      // TODO(xingzhaolong): all mul, sub, div
-      // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
-      // "sub", "div"};
-      static std::unordered_set<std::string> add_weight_op_set{"add", "mul"};
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
-      int op_type_len = op_desc.Type().size();
-      std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
-      std::string Y = op_desc.Input("Y")[0];
-      if (parameters.count(Y)) {
-        PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
-                       "Unsupported elementwise type" + op_type);
-        it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
-                                                    "_weight");
-        PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
-                                op_desc.Type());
-      } else {
-        PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
-                       "Unsupported elementwise type" + op_type);
-        it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
-                                                    "_tensor");
-      }
-      PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
-                              op_desc.Type());
-    }
-
-    if (op_desc.Type() == "depthwise_conv2d") {
-      it = Registry<OpConverter>::Global().Lookup("conv2d");
-      PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
-                              op_desc.Type());
-    }
-
-    if (!it) {
-      it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
-    }
-    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
-                            op_desc.Type());
-    it->SetEngine(engine);
-    (*it)(op, scope, test_mode);
-  }
-
-  // Convert a fluid block to tensorrt network, NOTE it just convert operators,
-  // the INetwork's inputs and outputs should specified in some other modules.
-  void ConvertBlock(const framework::proto::BlockDesc& block,
-                    const std::unordered_set<std::string>& parameters,
-                    const framework::Scope& scope, TensorRTEngine* engine) {
-    std::unique_lock<std::mutex> lk(mut_);
-    for (int i = 0; i < block.ops_size(); i++) {
-      const auto& op = block.ops(i);
-      ConvertOp(op, parameters, scope, engine);
-    }
-  }
-
-  // The scope  here should be inited with the parameter vars.
-  void ConvertBlockToTRTEngine(
-      framework::BlockDesc* block_desc, const framework::Scope& scope,
-      const std::vector<std::string>& inputs,
-      const std::unordered_set<std::string>& parameters,
-      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
-    engine->InitNetwork();
-    for (auto& input : inputs) {
-      if (parameters.count(input)) continue;
-      auto* var = block_desc->FindVar(input);
-      PADDLE_ENFORCE(var, "no variable called %s", input);
-      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
-                        "TensorRT engine only takes LoDTensor as input");
-      auto var_shape = var->GetShape();
-
-      engine->DeclareInput(
-          input, FluidDataType2TRT(
-                     var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(var_shape));
-    }
-    framework::proto::BlockDesc* block_proto = block_desc->Proto();
-    ConvertBlock(*block_proto, parameters, scope, engine);
-    for (auto& output : outputs) {
-      engine->DeclareOutput(output);
-    }
-    engine->FreezeNetwork();
-    engine->ClearWeights();
-  }
-
-  void RreplenishLayerAndOutput(
-      nvinfer1::ILayer* layer, const std::string& layer_type,
-      const std::vector<std::string>& output_tensor_names,
-      bool test_mode = false) {
-    size_t num_out = output_tensor_names.size();
-    for (size_t i = 0; i < num_out; i++) {
-      layer->getOutput(i)->setName(output_tensor_names[i].c_str());
-      engine_->SetITensor(output_tensor_names[i], layer->getOutput(i));
-      if (test_mode) {
-        engine_->DeclareOutput(output_tensor_names[i]);
-      }
-    }
-    layer->setName(
-        (layer_type + " (Output: " + output_tensor_names[0] + ")").c_str());
-  }
-  void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
-
-  virtual ~OpConverter() {}
-
-  // TensorRT engine
-  TensorRTEngine* engine_{nullptr};
-
- protected:
-  bool test_mode_;
-
- private:
-  // registered op converter map, whose key is the fluid op type, and value is
-  // the pointer position of corresponding OpConverter class.
-  std::unordered_map<std::string, OpConverter*> converters_;
-  // fluid inference scope
-  framework::Scope* scope_{nullptr};
-  std::mutex mut_;
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
-  struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
-    trt_##op_type__##_converter() {                                            \
-      ::paddle::inference::Registry<                                           \
-          paddle::inference::tensorrt::OpConverter>::Global()                  \
-          .Register<::paddle::inference::tensorrt::Converter__>(#op_type__);   \
-    }                                                                          \
-  };                                                                           \
-  trt_##op_type__##_converter trt_##op_type__##_converter__;                   \
-  int TouchConverterRegister_##op_type__() {                                   \
-    trt_##op_type__##_converter__.Touch();                                     \
-    return 0;                                                                  \
-  }
-
-#define USE_TRT_CONVERTER(op_type__)                   \
-  extern int TouchConverterRegister_##op_type__();     \
-  static int use_op_converter_trt_##op_type__ UNUSED = \
-      TouchConverterRegister_##op_type__();
diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
deleted file mode 100644
index bcd2166728b312dd551917bd7c70eb7764a8479c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-/*
- * PadOp.
- */
-class PadOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3) << "convert a fluid transpose op to tensorrt tranpose layer";
-
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
-
-    const std::vector<int> paddings =
-        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-    const float pad_value = boost::get<float>(op_desc.GetAttr("pad_value"));
-
-    nvinfer1::Dims input_shape = input->getDimensions();
-    int nbDims = input_shape.nbDims;
-    int pad_size = static_cast<int>(paddings.size());
-    PADDLE_ENFORCE_GE(nbDims, 2);
-    PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size);
-    PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only support zero.");
-
-    nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]);
-    nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]);
-
-    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Padding,
-                                       *const_cast<nvinfer1::ITensor*>(input),
-                                       pre_pad, post_pad);
-
-    PADDLE_ENFORCE(layer != nullptr);
-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode);
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(pad, PadOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
deleted file mode 100644
index 1752c52c3f55abfbb808903bd19418de26788d88..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ /dev/null
@@ -1,167 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-void DealCeilMode(const nvinfer1::Dims &input_shape, std::vector<int> ksize,
-                  std::vector<int> strides, std::vector<int> paddings,
-                  nvinfer1::DimsHW *pre_pad, nvinfer1::DimsHW *post_pad,
-                  int input_dims) {
-  int input_height = input_shape.d[input_dims - 2];
-  int input_width = input_shape.d[input_dims - 1];
-  int floor_h_output_size =
-      (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-  int ceil_h_output_size =
-      (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
-          strides[0] +
-      1;
-
-  int floor_w_output_size =
-      (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-  int ceil_w_output_size =
-      (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) / strides[1] +
-      1;
-  if (floor_h_output_size != ceil_h_output_size) {
-    post_pad->h() = strides[0] - 1;
-  }
-
-  if (floor_w_output_size != ceil_w_output_size) {
-    post_pad->w() = strides[1] - 1;
-  }
-}
-
-/*
- * Pool2dOp, IPoolingLayer in TRT. This Layer doesn't has weights.
- */
-class Pool2dOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc &op,
-                  const framework::Scope &scope, bool test_mode) override {
-    VLOG(4)
-        << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-    auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]);
-    nvinfer1::Dims input_shape = input1->getDimensions();
-    int input_dims = input_shape.nbDims;
-
-    PADDLE_ENFORCE_EQ(input_dims, 3UL);
-
-    bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
-    std::string pool_type =
-        boost::get<std::string>(op_desc.GetAttr("pooling_type"));
-    std::vector<int> ksize =
-        boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
-    std::vector<int> strides =
-        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-    std::vector<int> paddings =
-        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-    bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
-
-    nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX;
-    if (pool_type == "max") {
-      nv_pool_type = nvinfer1::PoolingType::kMAX;
-    } else if (pool_type == "avg") {
-      nv_pool_type = nvinfer1::PoolingType::kAVERAGE;
-    } else {
-      PADDLE_THROW("TensorRT unsupported pooling type!");
-    }
-
-    nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
-    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
-
-    nvinfer1::ILayer *layer = nullptr;
-
-    if (global_pooling == true) {
-      nv_ksize.d[0] = input_shape.d[input_dims - 2];
-      nv_ksize.d[1] = input_shape.d[input_dims - 1];
-      auto *layer = TRT_ENGINE_ADD_LAYER(
-          engine_, Pooling, *const_cast<nvinfer1::ITensor *>(input1),
-          nv_pool_type, nv_ksize);
-      PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created.");
-      auto output_name = op_desc.Output("Out")[0];
-      layer->setName(("pool2d (Output: " + output_name + ")").c_str());
-      layer->getOutput(0)->setName(output_name.c_str());
-      engine_->SetITensor(output_name, layer->getOutput(0));
-      if (test_mode) {
-        engine_->DeclareOutput(output_name);
-      }
-      return;
-    }
-
-    if (pool_type == "max") {
-      // Under ceil mode, the pre_pad and post_pad are used to
-      // record the the padding size. In some ceil mode cases,
-      // we do not need padding, so we initialize the two vars to 0.
-
-      nvinfer1::DimsHW pre_pad(0, 0);
-      nvinfer1::DimsHW post_pad(0, 0);
-      if (ceil_mode) {
-        // If ceil mode is true, we will pad the appropriate size to the input.
-        DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad,
-                     input_dims);
-        auto *pad_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, Padding, *const_cast<nvinfer1::ITensor *>(input1), pre_pad,
-            post_pad);
-        PADDLE_ENFORCE_NOT_NULL(
-            pad_layer, "pad layer in poolOp converter could not be created.");
-        input1 = pad_layer->getOutput(0);
-      }
-      auto *pool_layer = TRT_ENGINE_ADD_LAYER(
-          engine_, Pooling, *const_cast<nvinfer1::ITensor *>(input1),
-          nv_pool_type, nv_ksize);
-      PADDLE_ENFORCE_NOT_NULL(pool_layer, "pool layer could not be created.");
-      pool_layer->setStride(nv_strides);
-      pool_layer->setPadding(nv_paddings);
-      layer = pool_layer;
-    } else {
-      // Average pooling needs to exclude the padding pixels from the average
-      // mean.
-      // It is not supported well by TRT, we use a plugin here.
-      std::vector<int> input_shape_v;
-      for (int i = 0; i < input_dims; i++) {
-        input_shape_v.push_back(input_shape.d[i]);
-      }
-      plugin::AvgPoolPlugin *plugin = new plugin::AvgPoolPlugin(
-          ceil_mode, ksize, strides, paddings, input_shape_v);
-      auto *avg_pool_layer = engine_->AddPlugin(&input1, 1, plugin);
-      layer = avg_pool_layer;
-    }
-
-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode);
-
-    if (op_desc.HasAttr("out_scale")) {
-#if IS_TRT_VERSION_GE(5000)
-      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
-      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
-#endif
-    }
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(pool2d);
-REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
deleted file mode 100644
index d327a743662aa5169901846e40232d593a158499..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-/*
- * PRelu converter from fluid to tensorRT.
- */
-class PReluOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert fluid prelu op to tensorrt prelu layer";
-
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    int input_num = op_desc.Input("X").size();
-    PADDLE_ENFORCE(input_num == 1);
-    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
-    // Get output
-    size_t output_num = op_desc.Output("Out").size();
-    PADDLE_ENFORCE(output_num == 1);
-    // Get attrs
-    std::string mode = boost::get<std::string>(op_desc.GetAttr("mode"));
-    //
-    auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]);
-    PADDLE_ENFORCE_NOT_NULL(alpha_var);
-    auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
-
-    platform::CPUPlace cpu_place;
-    std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
-        new framework::LoDTensor());
-    alpha_tensor_temp->Resize(alpha_tensor->dims());
-    TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
-    float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
-
-    plugin::PReluPlugin* plugin =
-        new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
-    nvinfer1::IPluginLayer* layer =
-        engine_->AddPlugin(&input, input_num, plugin);
-    // keep alpha tensor to avoid release it's memory
-    engine_->SetWeights(op_desc.Input("Alpha")[0],
-                        std::move(alpha_tensor_temp));
-
-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode);
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(prelu, PReluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
deleted file mode 100644
index 0f891e0f9f4e6731199e4a6884ec74a1265b3fef..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-/*
- * ConcatOp
- */
-class ShuffleChannelOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
-    auto input_dims = input->getDimensions();
-    PADDLE_ENFORCE(input_dims.nbDims == 3);
-    int c = input_dims.d[0];
-    int h = input_dims.d[1];
-    int w = input_dims.d[2];
-    int group = boost::get<int>(op_desc.GetAttr("group"));
-
-    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-    nvinfer1::Dims4 reshape_dim(group, c / group, h, w);
-    layer->setReshapeDimensions(reshape_dim);
-    layer->setSecondTranspose({1, 0, 2, 3});
-    auto* output = layer->getOutput(0);
-
-    auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *output);
-    nvinfer1::DimsCHW reshape_dim2(c, h, w);
-    reshape_layer->setReshapeDimensions(reshape_dim2);
-
-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(reshape_layer, "concat", {output_name}, test_mode);
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(shuffle_channel, ShuffleChannelOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
deleted file mode 100644
index b0ae1694127ca942b0d1cc222389357a6cd67874..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-/*
- * SoftMaxOp, ISoftMaxLayer in TRT. This Layer doesn't has weights.
- */
-class SoftMaxOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3)
-        << "convert a fluid softmax op to tensorrt softmax layer without bias";
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
-    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax,
-                                       *const_cast<nvinfer1::ITensor*>(input1));
-
-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(layer, "softmax", {output_name}, test_mode);
-
-    if (op_desc.HasAttr("out_scale")) {
-#if IS_TRT_VERSION_GE(5000)
-      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
-      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
-#endif
-    }
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(softmax);
-REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc
deleted file mode 100644
index ae5b1b98060a4e73b2d1761d4edafb152f364070..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/split_op.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class SplitOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid split op to tensorrt split layer";
-
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
-    auto input_dims = input->getDimensions();
-    int input_num = op_desc.Input("X").size();
-    size_t output_num = op_desc.Output("Out").size();
-
-    // Get Attrs
-    PADDLE_ENFORCE(input_num == 1);
-    int axis = boost::get<int>(op_desc.GetAttr("axis"));
-    std::vector<int> output_lengths =
-        boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
-    // split on batch is not supported in TensorRT
-    PADDLE_ENFORCE(axis != 0);
-    axis += (axis < 0) ? input_dims.nbDims : -1;
-
-    PADDLE_ENFORCE(output_lengths.size() == output_num);
-    plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
-    nvinfer1::IPluginLayer* layer =
-        engine_->AddPlugin(&input, input_num, plugin);
-
-    std::string layer_name = "split (Output: ";
-    for (size_t i = 0; i < output_num; i++) {
-      auto output_name = op_desc.Output("Out")[i];
-      layer->getOutput(i)->setName(output_name.c_str());
-      engine_->SetITensor(output_name, layer->getOutput(i));
-      layer_name += output_name;
-      if (test_mode) {
-        engine_->DeclareOutput(output_name);
-      }
-    }
-    layer->setName((layer_name + ")").c_str());
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(split, SplitOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc
deleted file mode 100644
index 42f2008afa16c305561db9b27f472819fe4cec17..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class SwishOpConverter : public OpConverter {
- public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert fluid swish op to tensorrt layer";
-
-    framework::OpDesc op_desc(op, nullptr);
-    // Declare inputs
-    int input_num = op_desc.Input("X").size();
-    PADDLE_ENFORCE(input_num == 1);
-    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
-    // Get output
-    size_t output_num = op_desc.Output("Out").size();
-    PADDLE_ENFORCE(output_num == 1);
-    // Get attrs
-    float beta = boost::get<float>(op_desc.GetAttr("beta"));
-
-    plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta);
-
-    nvinfer1::IPluginLayer* layer =
-        engine_->AddPlugin(&input, input_num, plugin);
-
-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(layer, "swish", {output_name}, test_mode);
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-REGISTER_TRT_OP_CONVERTER(swish, SwishOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
deleted file mode 100644
index f2dc5ba1c7c2c832e0239f6a30760c354aaf4699..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-void test_activation(std::string act_type) {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("act-X", nvinfer1::Dims2(10, 6));
-  validator.DeclOutputVar("act-Out", nvinfer1::Dims2(10, 6));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType(act_type);
-  desc.SetInput("X", {"act-X"});
-  desc.SetOutput("Out", {"act-Out"});
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(5);
-}
-
-TEST(ReluOpConverter, main) { test_activation("relu"); }
-
-TEST(SigmoidOpConverter, main) { test_activation("sigmoid"); }
-
-TEST(TanhOpConverter, main) { test_activation("tanh"); }
-
-TEST(Relu6OpConverter, main) { test_activation("relu6"); }
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(relu);
-USE_OP(sigmoid);
-USE_OP(tanh);
-USE_OP(relu6);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc
deleted file mode 100644
index 41412cb079540da72760558379b158b6538aa6a8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(batch_norm_op, test) {
-  std::unordered_set<std::string> parameters(
-      {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
-       "batch_norm_variance"});
-  framework::Scope scope;
-  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
-  std::vector<int> param_shape{2};
-
-  validator.DeclInputVar("batch_norm_X", nvinfer1::DimsCHW(2, 5, 5));
-  validator.DeclParamVar("batch_norm_scale", param_shape);
-  validator.DeclParamVar("batch_norm_bias", param_shape);
-  validator.DeclParamVar("batch_norm_mean", param_shape);
-  validator.DeclParamVar("batch_norm_variance", param_shape);
-  validator.DeclOutputVar("batch_norm_Y", nvinfer1::DimsCHW(2, 5, 5));
-  validator.DeclOutputVar("batch_norm_save_mean", param_shape);
-  validator.DeclOutputVar("batch_norm_save_variance", param_shape);
-
-  // Prepare Op description
-  framework::OpDesc desc;
-
-  desc.SetType("batch_norm");
-  desc.SetInput("X", {"batch_norm_X"});
-  desc.SetInput("Scale", {"batch_norm_scale"});
-  desc.SetInput("Bias", {"batch_norm_bias"});
-  desc.SetInput("Mean", {"batch_norm_mean"});
-  desc.SetInput("Variance", {"batch_norm_variance"});
-  desc.SetOutput("Y", {"batch_norm_Y"});
-  desc.SetOutput("MeanOut", {"batch_norm_mean"});
-  desc.SetOutput("VarianceOut", {"batch_norm_variance"});
-  desc.SetOutput("SavedMean", {"batch_norm_save_mean"});
-  desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});
-
-  float eps = 1e-5f;
-  bool is_test = true;
-  desc.SetAttr("epsilon", eps);
-  desc.SetAttr("is_test", is_test);
-
-  validator.SetOp(*desc.Proto());
-
-  std::unordered_set<std::string> neglected_output = {
-      "batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean",
-      "batch_norm_variance"};
-  validator.Execute(3, neglected_output);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-USE_OP(batch_norm);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
deleted file mode 100644
index 4f284a4db5758e072915d7fd0f16115b8a36ba8b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(concat_op, test) {
-  std::unordered_set<std::string> parameters({""});
-  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1));
-  validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1));
-  validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1));
-  validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("concat");
-  desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
-  desc.SetOutput("Out", {"concat_out"});
-
-  int axis = 1;
-  desc.SetAttr("axis", axis);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(5);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-USE_OP(concat);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
deleted file mode 100644
index 95916746d6fcb528d26a8f8bb39980b55c4f3704..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-USE_OP(conv2d);
-USE_OP(conv2d_transpose);
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(conv2d_op, test) {
-  std::unordered_set<std::string> parameters({"conv2d-Y"});
-  framework::Scope scope;
-  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
-
-  validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5));
-  validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
-  validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("conv2d");
-  desc.SetInput("Input", {"conv2d-X"});
-  desc.SetInput("Filter", {"conv2d-Y"});
-  desc.SetOutput("Output", {"conv2d-Out"});
-
-  const std::vector<int> strides({1, 1});
-  const std::vector<int> paddings({1, 1});
-  const std::vector<int> dilations({1, 1});
-  const int groups = 1;
-
-  desc.SetAttr("strides", strides);
-  desc.SetAttr("paddings", paddings);
-  desc.SetAttr("dilations", dilations);
-  desc.SetAttr("groups", groups);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(3);
-}
-
-TEST(conv2d_transpose_op, test) {
-  std::unordered_set<std::string> parameters({"deconv2d-Y"});
-  framework::Scope scope;
-  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
-
-  validator.DeclInputVar("deconv2d-X", nvinfer1::Dims3(3, 5, 5));
-  validator.DeclParamVar("deconv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
-  validator.DeclOutputVar("deconv2d-Out", nvinfer1::Dims3(2, 5, 5));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("conv2d_transpose");
-  desc.SetInput("Input", {"deconv2d-X"});
-  desc.SetInput("Filter", {"deconv2d-Y"});
-  desc.SetOutput("Output", {"deconv2d-Out"});
-
-  const std::vector<int> strides({1, 1});
-  const std::vector<int> paddings({1, 1});
-  const std::vector<int> dilations({1, 1});
-  const int groups = 1;
-
-  desc.SetAttr("strides", strides);
-  desc.SetAttr("paddings", paddings);
-  desc.SetAttr("dilations", dilations);
-  desc.SetAttr("groups", groups);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(3);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
deleted file mode 100644
index 81e905b975327125fddc8a33d871cc97290e4ac1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(DropoutOpConverter, main) {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  TRTConvertValidation validator(8, parameters, scope, 1000);
-
-  std::vector<int> tensor_shape{8, 10};
-  validator.DeclInputVar("dropout-X", tensor_shape,
-                         nvinfer1::DimsCHW(10, 1, 1));
-  validator.DeclOutputVar("dropout-Out", nvinfer1::DimsCHW(10, 1, 1));
-  validator.DeclOutputVar("mask-Out", nvinfer1::DimsCHW(10, 1, 1));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  int is_test = 1;
-  float dropout_prob = 0.4;
-  std::string dropout_implementation = "upscale_in_train";
-
-  desc.SetType("dropout");
-  desc.SetInput("X", {"dropout-X"});
-  desc.SetOutput("Mask", {"mask-Out"});
-  desc.SetOutput("Out", {"dropout-Out"});
-  desc.SetAttr("is_test", is_test);
-  desc.SetAttr("dropout_prob", dropout_prob);
-
-  desc.SetAttr("dropout_implementation", dropout_implementation);
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  std::unordered_set<std::string> neglected_output = {"mask-Out"};
-
-  validator.Execute(8, neglected_output);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(dropout);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
deleted file mode 100644
index cc967464a5f29151a061e99cda6870f9f370ec1b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(elementwise_op, add_weight) {
-  std::unordered_set<std::string> parameters({"elementwise_add-Y"});
-  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1 << 15);
-  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
-  validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
-  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("elementwise_add");
-  desc.SetInput("X", {"elementwise_add-X"});
-  desc.SetInput("Y", {"elementwise_add-Y"});
-  desc.SetOutput("Out", {"elementwise_add-Out"});
-
-  int axis = 1;
-  desc.SetAttr("axis", axis);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(8);
-}
-
-TEST(elementwise_op, native) {
-  for (std::string type : {"add", "mul"}) {
-    int batch_size = 8;
-    std::unordered_set<std::string> parameters;
-    framework::Scope scope;
-    TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
-    validator.DeclInputVar("elementwise_" + type + "-X",
-                           nvinfer1::DimsCHW(10, 3, 3));
-    validator.DeclInputVar("elementwise_" + type + "-Y",
-                           nvinfer1::Dims3(10, 3, 3));
-    validator.DeclOutputVar("elementwise_" + type + "-Out",
-                            nvinfer1::DimsCHW(10, 3, 3));
-
-    // Prepare Op description
-    framework::OpDesc desc;
-    desc.SetType("elementwise_" + type);
-    desc.SetInput("X", {"elementwise_" + type + "-X"});
-    desc.SetInput("Y", {"elementwise_" + type + "-Y"});
-    desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
-
-    int axis = -1;
-    desc.SetAttr("axis", axis);
-
-    validator.SetOp(*desc.Proto());
-    validator.Execute(batch_size);
-  }
-}
-
-TEST(elementwise_op, plugin) {
-  for (std::string type : {"add", "mul"}) {
-    int batch_size = 8;
-    std::unordered_set<std::string> parameters;
-    framework::Scope scope;
-    TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
-    validator.DeclInputVar("elementwise_" + type + "-X",
-                           nvinfer1::DimsCHW(10, 3, 3));
-    validator.DeclInputVar("elementwise_" + type + "-Y",
-                           nvinfer1::Dims3(10, 1, 1));
-    validator.DeclOutputVar("elementwise_" + type + "-Out",
-                            nvinfer1::DimsCHW(10, 3, 3));
-
-    // Prepare Op description
-    framework::OpDesc desc;
-    desc.SetType("elementwise_" + type);
-    desc.SetInput("X", {"elementwise_" + type + "-X"});
-    desc.SetInput("Y", {"elementwise_" + type + "-Y"});
-    desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
-
-    int axis = -1;
-    desc.SetAttr("axis", axis);
-
-    validator.SetOp(*desc.Proto());
-    validator.Execute(batch_size);
-  }
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(elementwise_add);
-USE_OP(elementwise_mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
deleted file mode 100644
index 1ae2668e733aad23241c63b9985e708396d0b1bc..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(fc_op, test) {
-  std::unordered_set<std::string> parameters({"mul-Y"});
-  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1));
-  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2));
-  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("mul");
-  desc.SetInput("X", {"mul-X"});
-  desc.SetInput("Y", {"mul-Y"});
-  desc.SetOutput("Out", {"mul-Out"});
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(10);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
deleted file mode 100644
index 8f91309a0a00d5131268f026c319e25ba3cb964a..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-void IOConverterTester(const platform::DeviceContext& ctx) {
-  cudaStream_t stream;
-  ASSERT_EQ(0, cudaStreamCreate(&stream));
-
-  // init fluid in_tensor
-  framework::LoDTensor in_tensor;
-  in_tensor.Resize({10, 10});
-  auto place = ctx.GetPlace();
-  in_tensor.mutable_data<float>(place);
-  std::vector<float> init;
-  for (int64_t i = 0; i < 10 * 10; ++i) {
-    init.push_back(i);
-  }
-  framework::TensorFromVector(init, ctx, &in_tensor);
-
-  // init tensorrt buffer
-  void* buffer;
-  size_t size = in_tensor.memory_size();
-  ASSERT_EQ(cudaMalloc(&buffer, size), 0);
-
-  // convert fluid in_tensor to tensorrt buffer
-  EngineIOConverter::ConvertInput("test", in_tensor, buffer, size, &stream);
-
-  // convert tensorrt buffer to fluid out_tensor
-  framework::LoDTensor out_tensor;
-  out_tensor.Resize({10, 10});
-  out_tensor.mutable_data<float>(place);
-  EngineIOConverter::ConvertOutput("test", buffer, &out_tensor, size, &stream);
-
-  // compare in_tensor and out_tensor
-  std::vector<float> result;
-  framework::TensorToVector(out_tensor, ctx, &result);
-  EXPECT_EQ(init.size(), result.size());
-  for (size_t i = 0; i < init.size(); i++) {
-    EXPECT_EQ(init[i], result[i]);
-  }
-  cudaStreamDestroy(stream);
-}
-
-TEST(EngineIOConverterTester, DefaultCPU) {
-  platform::CPUPlace place;
-  platform::CPUDeviceContext ctx(place);
-  IOConverterTester(ctx);
-}
-
-TEST(EngineIOConverterTester, DefaultGPU) {
-  platform::CUDAPlace place;
-  platform::CUDADeviceContext ctx(place);
-  IOConverterTester(ctx);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
deleted file mode 100644
index d00826af075159004d3727a7519e7c319dbddb02..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(leaky_relu_op, test_leaky_relu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("leaky_relu_input", nvinfer1::DimsCHW(3, 2, 2));
-  validator.DeclOutputVar("leaky_relu_out", nvinfer1::DimsCHW(3, 2, 2));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("leaky_relu");
-  desc.SetInput("X", {"leaky_relu_input"});
-  desc.SetOutput("Out", {"leaky_relu_out"});
-
-  desc.SetAttr("alpha", 0.1f);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(1);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-// USE_OP(leaky_relu);
-USE_OP(leaky_relu);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
deleted file mode 100644
index 282f53559aa75b2c7c252450e392e1996f9b1d81..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(MulOpConverter, main) {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  TRTConvertValidation validator(10, parameters, scope, 1000, false);
-  validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
-  validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
-  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("mul");
-  desc.SetInput("X", {"mul-X"});
-  desc.SetInput("Y", {"mul-Y"});
-  desc.SetOutput("Out", {"mul-Out"});
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(2);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
deleted file mode 100644
index 52655663706d7d00a5e8561fa1d319d7ccf774e3..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
-#include <gtest/gtest.h>  // NOLINT
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(OpConverter, ConvertBlock) {
-  framework::ProgramDesc prog;
-  auto* block = prog.MutableBlock(0);
-  auto* conv2d_op = block->AppendOp();
-
-  // init trt engine
-  std::unique_ptr<TensorRTEngine> engine_;
-  engine_.reset(new TensorRTEngine(5, 1 << 15));
-  engine_->InitNetwork();
-
-  engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
-                        nvinfer1::Dims3(2, 5, 5));
-
-  conv2d_op->SetType("conv2d");
-  conv2d_op->SetInput("Input", {"conv2d-X"});
-  conv2d_op->SetInput("Filter", {"conv2d-Y"});
-  conv2d_op->SetOutput("Output", {"conv2d-Out"});
-
-  const std::vector<int> strides({1, 1});
-  const std::vector<int> paddings({1, 1});
-  const std::vector<int> dilations({1, 1});
-  const int groups = 1;
-
-  conv2d_op->SetAttr("strides", strides);
-  conv2d_op->SetAttr("paddings", paddings);
-  conv2d_op->SetAttr("dilations", dilations);
-  conv2d_op->SetAttr("groups", groups);
-
-  // init scope
-  framework::Scope scope;
-  std::vector<int> dim_vec = {3, 2, 3, 3};
-  auto* x = scope.Var("conv2d-Y");
-  auto* x_tensor = x->GetMutable<framework::LoDTensor>();
-  x_tensor->Resize(framework::make_ddim(dim_vec));
-  x_tensor->mutable_data<float>(platform::CUDAPlace(0));
-
-  OpConverter converter;
-  converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope,
-                         engine_.get() /*TensorRTEngine*/);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_TRT_CONVERTER(conv2d)
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
deleted file mode 100644
index ba35d7ddbb2f4e6062713bd82be277e7ad0cb341..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(PadConverter, main) {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("pad-X", nvinfer1::Dims3(3, 2, 2));
-  validator.DeclOutputVar("pad-Out", nvinfer1::Dims3(3, 3, 5));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("pad");
-  desc.SetInput("X", {"pad-X"});
-  desc.SetOutput("Out", {"pad-Out"});
-
-  std::vector<int> paddings = {0, 0, 0, 0, 0, 1, 1, 2};
-  float pad_value = 0.0;
-  desc.SetAttr("paddings", paddings);
-  desc.SetAttr("pad_value", pad_value);
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(2);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(pad);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
deleted file mode 100644
index bded833505cd25352adc4123de415613d1fc926d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include <gtest/gtest.h>
-#include <fstream>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-void test_pool2d(bool global_pooling, bool ceil_mode,
-                 std::string pool_type = "max") {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
-
-  // The ITensor's Dims should not contain the batch size.
-  // So, the ITensor's Dims of input and output should be C * H * W.
-  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 6, 7));
-  if (global_pooling)
-    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1));
-  else if (ceil_mode)
-    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 4));
-  else
-    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 3));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("pool2d");
-  desc.SetInput("X", {"pool2d-X"});
-  desc.SetOutput("Out", {"pool2d-Out"});
-
-  std::vector<int> ksize({2, 2});
-  std::vector<int> strides({2, 2});
-  std::vector<int> paddings({0, 0});
-  std::string pooling_t = pool_type;
-
-  desc.SetAttr("pooling_type", pooling_t);
-  desc.SetAttr("ksize", ksize);
-  desc.SetAttr("strides", strides);
-  desc.SetAttr("paddings", paddings);
-  desc.SetAttr("global_pooling", global_pooling);
-  desc.SetAttr("ceil_mode", ceil_mode);
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(3);
-}
-
-TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
-TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
-
-TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
-TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(pool2d);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
deleted file mode 100644
index b086c910d38a243d98315f2d6eb82ecc0ec5c06d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(prelu_op, test_channel_wise) {
-  std::unordered_set<std::string> parameters({"prelu_alpha"});
-  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
-  validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(3, 1, 1));
-  validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("prelu");
-  desc.SetInput("X", {"prelu_input"});
-  desc.SetInput("Alpha", {"prelu_alpha"});
-  desc.SetOutput("Out", {"prelu_out"});
-
-  desc.SetAttr("mode", std::string("channel"));
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(1);
-}
-
-TEST(prelu_op, test_element_wise) {
-  std::unordered_set<std::string> parameters({"prelu_alpha"});
-  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
-  validator.DeclParamVar("prelu_alpha", nvinfer1::Dims4(10, 3, 2, 2));
-  validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("prelu");
-  desc.SetInput("X", {"prelu_input"});
-  desc.SetInput("Alpha", {"prelu_alpha"});
-  desc.SetOutput("Out", {"prelu_out"});
-
-  desc.SetAttr("mode", std::string("element"));
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(1);
-}
-
-TEST(prelu_op, test_scalar) {
-  std::unordered_set<std::string> parameters({"prelu_alpha"});
-  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
-  validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(1, 1, 1));
-  validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("prelu");
-  desc.SetInput("X", {"prelu_input"});
-  desc.SetInput("Alpha", {"prelu_alpha"});
-  desc.SetOutput("Out", {"prelu_out"});
-
-  desc.SetAttr("mode", std::string("all"));
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(1);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(prelu);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc
deleted file mode 100644
index e3cc5273734e02ecc4ed6453e6cd47052463c8b2..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(leaky_relu_op, test_leaky_relu) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("sc_input", nvinfer1::DimsCHW(4, 2, 2));
-  validator.DeclOutputVar("sc_out", nvinfer1::DimsCHW(4, 2, 2));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("shuffle_channel");
-  desc.SetInput("X", {"sc_input"});
-  desc.SetOutput("Out", {"sc_out"});
-  int group = 2;
-  desc.SetAttr("group", group);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(1);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-// USE_OP(leaky_relu);
-USE_OP(shuffle_channel);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
deleted file mode 100644
index 503ce71f7fb4377bb4304569b7484fb25abdb284..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(SoftMaxOpConverter, main) {
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  TRTConvertValidation validator(8, parameters, scope, 1000);
-
-  std::vector<int> tensor_shape{8, 10};
-  validator.DeclInputVar("softmax-X", tensor_shape,
-                         nvinfer1::DimsCHW(10, 1, 1));
-  validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("softmax");
-  desc.SetInput("X", {"softmax-X"});
-  desc.SetOutput("Out", {"softmax-Out"});
-
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(3);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(softmax);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
deleted file mode 100644
index 5aacc5c600dd1371e3865adc888bb8e24640e7d9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-template <int BatchSize, int Axis>
-void TensorRTSplitTest(const std::vector<int> &in_shape,
-                       const std::vector<int> &sections) {
-  std::unordered_set<std::string> parameters({""});
-  framework::Scope scope;
-  TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000);
-
-  auto make_dim = [](const std::vector<int> &shape) {
-    nvinfer1::DimsCHW dim;
-    dim.c() = shape[0];
-    dim.h() = shape[1];
-    dim.w() = shape[2];
-    return dim;
-  };
-  validator.DeclInputVar("split_input", make_dim(in_shape));
-  std::vector<std::string> output_vars;
-  for (size_t i = 0; i < sections.size(); ++i) {
-    auto out_shape = in_shape;
-    out_shape[Axis - 1] = sections[i];
-    std::string output_name = "split_out" + std::to_string(i);
-    validator.DeclOutputVar(output_name, make_dim(out_shape));
-    output_vars.push_back(output_name);
-  }
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("split");
-  desc.SetInput("X", {"split_input"});
-  desc.SetOutput("Out", output_vars);
-
-  desc.SetAttr("axis", Axis);
-  desc.SetAttr("num", 0);
-  desc.SetAttr("sections", sections);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(BatchSize);
-}
-
-// batch = 0, axis = 1, same shape
-TEST(split_op, test_same_shape_axis1_batch1) {
-  TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2});
-}
-// batch = 0, axis = 1, different shape
-TEST(split_op, test_different_shape_axis1_batch1) {
-  TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1});
-}
-// batch = 10, axis = 1, same shape
-TEST(split_op, test_same_shape_axis1_batch10) {
-  TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2});
-}
-// batch = 10, axis = 1, different shape
-TEST(split_op, test_different_shape_axis1_batch10) {
-  TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1});
-}
-// batch = 0, axis = 2, same shape
-TEST(split_op, test_same_shape_axis2_batch1) {
-  TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2});
-}
-// batch = 0, axis = 2, different shape
-TEST(split_op, test_different_shape_axis2_batch1) {
-  TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1});
-}
-// batch = 10, axis = 2, same shape
-TEST(split_op, test_same_shape_axis2_batch10) {
-  TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2});
-}
-// batch = 10, axis = 2, different shape
-TEST(split_op, test_different_shape_axis2_batch10) {
-  TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1});
-}
-// batch = 0, axis = 3, same shape
-TEST(split_op, test_same_shape_axis3_batch1) {
-  TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2});
-}
-// batch = 0, axis = 3, different shape
-TEST(split_op, test_different_shape_axis3_batch1) {
-  TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1});
-}
-// batch = 10, axis = 3, same shape
-TEST(split_op, test_same_shape_axis3_batch10) {
-  TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2});
-}
-// batch = 10, axis = 3, different shape
-TEST(split_op, test_different_shape_axis3_batch10) {
-  TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1});
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(split);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc
deleted file mode 100644
index c15c79bb13fad4233775482dc1b8b4841e61a23a..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-TEST(swish_op, test_swish) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("sw_input", nvinfer1::DimsCHW(3, 2, 2));
-  validator.DeclOutputVar("sw_out", nvinfer1::DimsCHW(3, 2, 2));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("swish");
-  desc.SetInput("X", {"sw_input"});
-  desc.SetOutput("Out", {"sw_out"});
-
-  desc.SetAttr("beta", 2.0f);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(1);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-USE_OP(swish);
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
deleted file mode 100644
index 97affafb4bffd20a52199bdd80affc235319f5f4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file implements a UT framework to make the validation of transforming
- * Fluid Op to TRT Layer.
- */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-/*
- * Get a random float value between [low, high]
- */
-float random(float low, float high) {
-  static std::mt19937 mt(100);
-  std::uniform_real_distribution<double> dist(low, high);
-  return dist(mt);
-}
-
-void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
-                     const platform::DeviceContext& ctx) {
-  auto dims = tensor->dims();
-  size_t num_elements = analysis::AccuDims(dims, dims.size());
-  PADDLE_ENFORCE_GT(num_elements, 0);
-
-  platform::CPUPlace cpu_place;
-  framework::LoDTensor temp_tensor;
-  temp_tensor.Resize(dims);
-  auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
-
-  for (size_t i = 0; i < num_elements; i++) {
-    *(temp_data + i) = random(0., 1.);
-  }
-
-  TensorCopySync(temp_tensor, place, tensor);
-}
-
-/*
- * Help to validate the correctness between Fluid Op and the corresponding TRT
- * layer.
- */
-class TRTConvertValidation {
- public:
-  TRTConvertValidation() = delete;
-
-  TRTConvertValidation(int max_batch_size,
-                       const std::unordered_set<std::string>& parameters,
-                       framework::Scope& scope,  // NOLINT
-                       int workspace_size = 1 << 10, bool if_add_batch = true)
-      : parameters_(parameters),
-        scope_(scope),
-        if_add_batch_(if_add_batch),
-        max_batch_size_(max_batch_size) {
-    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size));
-    engine_->InitNetwork();
-  }
-
-  // Declare a Variable as input with random initialization.
-  void DeclInputVar(const std::string& name, const std::vector<int> tensor_dims,
-                    const nvinfer1::Dims& trt_dims) {
-    DeclVar(name, tensor_dims);
-    engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims);
-  }
-
-  void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) {
-    DeclVar(name, dims);
-    // Declare TRT inputs.
-    engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
-  }
-
-  void DeclParamVar(const std::string& name, const std::vector<int> dim_vec) {
-    DeclVar(name, dim_vec);
-  }
-
-  // Declare a parameter varaible in the scope.
-  void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
-    DeclVar(name, dims, true);
-  }
-
-  void DeclOutputVar(const std::string& name, const std::vector<int> dim_vec) {
-    DeclVar(name, dim_vec);
-  }
-
-  void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
-    DeclVar(name, dims);
-  }
-
-  void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
-    platform::CUDADeviceContext ctx(place_);
-
-    auto* x = scope_.Var(name);
-    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
-    x_tensor->Resize(framework::make_ddim(dim_vec));
-    RandomizeTensor(x_tensor, place_, ctx);
-  }
-  // Declare a variable in a fluid Scope.
-  void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
-               bool is_param = false) {
-    // Init Fluid tensor.
-    std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
-    // There is no batchsize in ITensor's shape, but We should add it to
-    // tensor's shape of fluid. If the variable is not parameter and the
-    // if_add_batch_ flag is true, add the max batchsize to dim_vec.
-    if (is_param != true && if_add_batch_ == true)
-      dim_vec.insert(dim_vec.begin(), max_batch_size_);
-
-    DeclVar(name, dim_vec);
-  }
-
-  void SetOp(const framework::proto::OpDesc& desc) {
-    op_ = framework::OpRegistry::CreateOp(desc);
-
-    Singleton<OpConverter>::Global().ConvertOp(
-        desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
-
-    engine_->FreezeNetwork();
-
-    // Declare outputs.
-    op_desc_.reset(new framework::OpDesc(desc, nullptr));
-  }
-
-  // We use the set 'neglected_output' here, because some Ops like batch norm,
-  // the outputs specified in the op des are only used during training,
-  // so we should neglect those output during inference.
-  void Execute(int batch_size,
-               std::unordered_set<std::string> neglected_output = {}) {
-    // Execute Fluid Op
-    PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
-    platform::CUDADeviceContext ctx(place_);
-    op_->Run(scope_, place_);
-    cudaStreamSynchronize(stream_);
-    std::vector<std::string> input_output_names;
-
-    // Note: we need filter the parameter
-    for (const auto& input : op_desc_->InputArgumentNames()) {
-      if (parameters_.count(input)) continue;
-      input_output_names.push_back(input);
-    }
-
-    // Collect the fluid outputs.
-    std::vector<std::vector<float>> fluid_outs;
-    for (const auto& output : op_desc_->OutputArgumentNames()) {
-      if (neglected_output.count(output)) continue;
-      input_output_names.push_back(output);
-      std::vector<float> fluid_out;
-      auto* var = scope_.FindVar(output);
-      auto* tensor = var->GetMutable<framework::LoDTensor>();
-      framework::TensorToVector(*tensor, ctx, &fluid_out);
-      fluid_outs.push_back(fluid_out);
-    }
-
-    // Bind input and output for TRT.
-    const int num_bindings = input_output_names.size();
-    std::vector<void*> buffers(num_bindings);
-
-    for (const std::string& name : input_output_names) {
-      auto* var = scope_.FindVar(name);
-      auto* tensor = var->GetMutable<framework::LoDTensor>();
-      const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
-      buffers[bind_index] =
-          static_cast<void*>(tensor->mutable_data<float>(place_));
-    }
-
-    // Execute TRT.
-    engine_->Execute(batch_size, &buffers, stream_);
-    cudaStreamSynchronize(stream_);
-
-    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    int index = 0;
-    for (const auto& output : op_desc_->OutputArgumentNames()) {
-      if (neglected_output.count(output)) continue;
-      std::vector<float> trt_out;
-      auto* var = scope_.FindVar(output);
-      auto* tensor = var->GetMutable<framework::LoDTensor>();
-      framework::TensorToVector(*tensor, ctx, &trt_out);
-
-      size_t fluid_out_size = fluid_outs[index].size();
-      if (if_add_batch_ == true) {
-        fluid_out_size =
-            batch_size * (framework::product(tensor->dims()) / max_batch_size_);
-      }
-
-      for (size_t i = 0; i < fluid_out_size; i++) {
-        // Loose the threshold for CI in different machine model.
-        EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
-      }
-      index += 1;
-    }
-  }
-
-  framework::Scope& scope() { return scope_; }
-
- private:
-  platform::CUDAPlace place_;
-  std::unique_ptr<TensorRTEngine> engine_;
-  cudaStream_t stream_;
-  std::unique_ptr<framework::OperatorBase> op_;
-  std::unique_ptr<framework::OpDesc> op_desc_;
-  const std::unordered_set<std::string>& parameters_;
-  framework::Scope& scope_;
-  // The ITensor of trt does not cotain the batch size,
-  // bug, in most cases, we need to set batch size for
-  // fluid's tensor shape. This variable indicates
-  // whether to add batch size to tensor shape of fluid.
-  bool if_add_batch_;
-  int max_batch_size_;
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
deleted file mode 100644
index f806069b47689de52a45d6f6917b853eff170164..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use
-this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/engine.h"
-
-#include <NvInfer.h>
-#include <cuda.h>
-#include <glog/logging.h>
-#include <string>
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/tensorrt/helper.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-int TensorRTEngine::runtime_batch_ = 1;
-
-void TensorRTEngine::Build(const DescType &paddle_model) {
-  PADDLE_ENFORCE(false, "not implemented");
-}
-
-void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
-                             cudaStream_t stream) {
-  freshDeviceId();
-  const std::thread::id tid = std::this_thread::get_id();
-  batch_size_ = batch_size;
-  if (infer_context_.find(tid) == infer_context_.end()) {
-    PADDLE_ENFORCE_NOT_NULL(
-        infer_engine_,
-        "You should build engine first and then set the context.");
-    infer_context_[tid].reset(infer_engine_->createExecutionContext());
-  }
-  infer_context_[tid]->enqueue(batch_size, buffers->data(), stream, nullptr);
-  cudaStreamSynchronize(stream);
-  SetRuntimeBatch(batch_size);
-}
-
-void TensorRTEngine::FreezeNetwork() {
-  freshDeviceId();
-  VLOG(3) << "TRT to freeze network";
-  PADDLE_ENFORCE(infer_builder_ != nullptr,
-                 "Call InitNetwork first to initialize network.");
-  PADDLE_ENFORCE(infer_network_ != nullptr,
-                 "Call InitNetwork first to initialize network.");
-  // build engine.
-  infer_builder_->setMaxBatchSize(max_batch_);
-  infer_builder_->setMaxWorkspaceSize(max_workspace_);
-  bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
-#if IS_TRT_VERSION_GE(5000)
-  if (enable_fp16) {
-    bool support_fp16 = infer_builder_->platformHasFastFp16();
-    infer_builder_->setFp16Mode(support_fp16);
-    if (!support_fp16) {
-      LOG(INFO) << "You specify FP16 mode, but the hardware do not support "
-                   "FP16 speed up, use FP32 instead.";
-    }
-  }
-#else
-  if (enable_fp16)
-    LOG(INFO) << "Using FP16 in Paddle-TRT must ensure that the version of TRT "
-                 "is at least 5."
-                 "So, use FP32 to run.";
-#endif
-  bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8);
-
-  if (enable_int8) {
-    infer_builder_->setInt8Mode(true);
-    if (calibrator_) {
-      infer_builder_->setInt8Calibrator(calibrator_);
-    } else {
-      infer_builder_->setInt8Calibrator(nullptr);
-
-#if IS_TRT_VERSION_GE(5000)
-      infer_builder_->setStrictTypeConstraints(true);
-      for (auto &quant_range : quant_dynamic_range_) {
-        auto tensor = quant_range.first;
-        float range = quant_range.second;
-        tensor->setDynamicRange(-range, range);
-      }
-
-      std::unordered_set<nvinfer1::ITensor *> all_t;
-      for (int i = 0; i < infer_network_->getNbLayers(); i++) {
-        auto layer = infer_network_->getLayer(i);
-        for (int j = 0; j < layer->getNbOutputs(); j++) {
-          all_t.insert(layer->getOutput(j));
-        }
-      }
-      for (int i = 0; i < infer_network_->getNbInputs(); i++) {
-        all_t.insert(infer_network_->getInput(i));
-      }
-
-      for (auto &t : all_t) {
-        if (!quant_dynamic_range_.count(t)) {
-          LOG(WARNING)
-              << "We are in trt int8 mode(not calibration), scale not setted"
-              << " for tensor " << t->getName()
-              << ", this might be ok when trt does not need this range";
-        }
-      }
-#endif
-    }
-  }
-
-  infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
-  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
-}
-
-nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
-                                                nvinfer1::DataType dtype,
-                                                const nvinfer1::Dims &dims) {
-  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
-                    name);
-
-  PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
-  auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
-  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
-  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
-                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
-  PADDLE_ENFORCE(input->isNetworkInput());
-  TensorRTEngine::SetITensor(name, input);
-  return input;
-}
-
-void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
-                                   const std::string &name) {
-  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
-                    name);
-
-  auto *output = layer->getOutput(offset);
-  SetITensor(name, output);
-  PADDLE_ENFORCE(output != nullptr);
-  output->setName(name.c_str());
-  PADDLE_ENFORCE(!output->isNetworkInput());
-  infer_network_->markOutput(*output);
-  PADDLE_ENFORCE(output->isNetworkOutput());
-  // output buffers' size can only be decided later, set zero here to mark this
-  // and will reset later.
-  buffer_sizes_[name] = 0;
-}
-
-bool TensorRTEngine::HasDeclared(const std::string &name) {
-  return buffer_sizes_.count(name) > 0;
-}
-
-void TensorRTEngine::DeclareOutput(const std::string &name) {
-  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
-                    name);
-
-  auto *output = TensorRTEngine::GetITensor(name);
-  PADDLE_ENFORCE(output != nullptr);
-  output->setName(name.c_str());
-  PADDLE_ENFORCE(!output->isNetworkInput());
-  infer_network_->markOutput(*output);
-  // output buffers' size can only be decided later, set zero here to mark this
-  // and will reset later.
-  buffer_sizes_[name] = 0;
-}
-
-void TensorRTEngine::SetITensor(const std::string &name,
-                                nvinfer1::ITensor *tensor) {
-  PADDLE_ENFORCE(tensor != nullptr);
-  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
-                    name);
-  itensor_map_[name] = tensor;
-}
-
-nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
-  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
-  return itensor_map_[name];
-}
-
-void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
-  runtime_batch_ = batch_size;
-}
-
-float *TensorRTEngine::GetWeightCPUData(const std::string &name,
-                                        framework::Tensor *weight_tensor,
-                                        bool enable_int8,
-                                        const std::vector<float> &scale) {
-  static int name_suffix_counter = 0;
-  std::string name_suffix = std::to_string(name_suffix_counter);
-  std::string name_with_suffix = name + name_suffix;
-  auto w_dims = weight_tensor->dims();
-  platform::CPUPlace cpu_place;
-  PADDLE_ENFORCE_EQ(
-      weight_map.count(name_with_suffix), 0,
-      "During TRT Op converter: We set weight %s with the same name "
-      "twice into the weight_map",
-      name_with_suffix);
-  weight_map[name_with_suffix].reset(new framework::Tensor());
-  weight_map[name_with_suffix]->Resize(weight_tensor->dims());
-  TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
-  float *weight_data =
-      weight_map[name_with_suffix]->mutable_data<float>(cpu_place);
-  name_suffix_counter += 1;
-
-  if (enable_int8) {
-    // when the op is fc, scale's size should be 1
-    // when the op is conv, scale's size should be w_dims[0]
-    bool valid_scale_size =
-        (scale.size() == 1 || scale.size() == static_cast<size_t>(w_dims[0]));
-    PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size");
-    for (int i = 0; i < weight_tensor->numel(); i++) {
-      bool is_valid_int8 =
-          ((weight_data[i] >= -128) && (weight_data[i] <= 127));
-      PADDLE_ENFORCE(is_valid_int8,
-                     "We are in anakin subgraph int8 mode, the weight of conv "
-                     "should be in range [-128, 127]");
-      if (scale.size() == 1) {
-        weight_data[i] *= (scale[0] / 127);
-      } else {
-        PADDLE_ENFORCE(w_dims.size() == 4,
-                       "TRT int8 quant : We only use the channel quant for "
-                       "conv op, so the weight dims should be 4.");
-        int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
-        weight_data[i] *= (scale[i / inner_size] / 127);
-      }
-    }
-  }
-  return weight_data;
-}
-
-int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
-
-nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
-    nvinfer1::ITensor *const *inputs, int num_inputs,
-    plugin::PluginTensorRT *plugin) {
-  owned_plugin_.emplace_back(plugin);
-  return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
-}
-
-void TensorRTEngine::freshDeviceId() {
-  int count;
-  cudaGetDeviceCount(&count);
-  PADDLE_ENFORCE_LT(device_id_, count);
-  cudaSetDevice(device_id_);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
deleted file mode 100644
index c1d950035cacff199db53829967e37343f5e3d70..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ /dev/null
@@ -1,272 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <NvInfer.h>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/inference/engine.h"
-#include "paddle/fluid/inference/tensorrt/helper.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
-#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class TRTInt8Calibrator;
-/*
- * TensorRT Engine.
- *
- * There are two alternative ways to use it, one is  to build from a paddle
- * protobuf model, another way is to manually construct the network.
- */
-class TensorRTEngine {
-  using DescType = ::paddle::framework::proto::BlockDesc;
-
- public:
-  // Weight is model parameter.
-  class Weight {
-   public:
-    Weight() = default;
-    Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
-      w_.type = dtype;
-      w_.values = value;
-      w_.count = num_elem;
-    }
-    const nvinfer1::Weights& get() { return w_; }
-
-    std::vector<int64_t> dims;
-
-   private:
-    nvinfer1::Weights w_;
-  };
-
-  TensorRTEngine(
-      int max_batch, int max_workspace,
-      AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
-      TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
-      nvinfer1::ILogger& logger = NaiveLogger::Global())
-      : max_batch_(max_batch),
-        max_workspace_(max_workspace),
-        precision_(precision),
-        calibrator_(calibrator),
-        device_id_(device_id),
-        logger_(logger) {}
-
-  ~TensorRTEngine() {}
-
-  // TODO(Superjomn) implement it later when graph segmentation is supported.
-  void Build(const DescType& paddle_model);
-
-  void Execute(int batch_size, std::vector<void*>* buffers,
-               cudaStream_t stream);
-
-  // Initialize the inference network, so that TensorRT layers can add to this
-  // network.
-  void InitNetwork() {
-    freshDeviceId();
-    infer_builder_.reset(createInferBuilder(&logger_));
-    infer_network_.reset(infer_builder_->createNetwork());
-  }
-  // After finishing adding ops, freeze this network and creates the execution
-  // environment.
-  void FreezeNetwork();
-
-  // Add an input and set its name, data type and dimension.
-  nvinfer1::ITensor* DeclareInput(const std::string& name,
-                                  nvinfer1::DataType dtype,
-                                  const nvinfer1::Dims& dim);
-  // Set the offset-th output from a layer as the network's output, and set its
-  // name.
-  void DeclareOutput(const nvinfer1::ILayer* layer, int offset,
-                     const std::string& name);
-  // Set the itensor_map_[name] as the network's output, and set its name.
-  void DeclareOutput(const std::string& name);
-  // Check if the ITensor has been declared
-  bool HasDeclared(const std::string& name);
-
-  void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
-  // Get an ITensor called name.
-  nvinfer1::ITensor* GetITensor(const std::string& name);
-
-  nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
-  nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
-
-  nvinfer1::IHostMemory* Serialize() {
-    PADDLE_ENFORCE(infer_engine_ != nullptr,
-                   "You should build engine first and then serialize");
-    ihost_memory_.reset(infer_engine_->serialize());
-    return ihost_memory_.get();
-  }
-
-  void Deserialize(const std::string& engine_serialized_data) {
-    freshDeviceId();
-    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
-    infer_engine_.reset(runtime->deserializeCudaEngine(
-        engine_serialized_data.c_str(), engine_serialized_data.size(),
-        &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
-    PADDLE_ENFORCE(infer_engine_ != nullptr,
-                   "build cuda engine failed when deserialize engine info.!");
-  }
-
-  void SetRuntimeBatch(size_t batch_size);
-  int GetRuntimeBatch();
-  int GetDeviceId() { return device_id_; }
-  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
-                                    int num_inputs, plugin::PluginTensorRT*);
-  void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) {
-    quant_dynamic_range_[tensor] = range;
-  }
-
-  float* GetWeightCPUData(const std::string& name,
-                          framework::Tensor* weight_tensor, bool enable_int8,
-                          const std::vector<float>& scale = {});
-
-  // A pointer to CPU memory is needed of the TRT weight.
-  // Before TRT runs, fluid loads weight into GPU storage.
-  // so we need to copy the weights from GPU to CPU in our op converter.
-  // We use a map to store these weights for the weight memory is not released
-  // in advance, which affecting the construction of TRT Op.
-  std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
-      weight_map;
-
-  // When setting weight_map, a self-increasing suffix is needed for the names
-  // so as to avoid repeatedly setting weights with the same name.
-  void SetWeights(std::string w_name,
-                  std::unique_ptr<framework::Tensor> w_tensor) {
-    static int suffix_counter = 0;
-    std::string suffix = std::to_string(suffix_counter);
-    weight_map[w_name + suffix] = std::move(w_tensor);
-    suffix_counter += 1;
-  }
-
-  void ClearWeights() {
-    for (auto& weight_pair : weight_map) {
-      weight_pair.second.reset(nullptr);
-    }
-  }
-
- private:
-  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
-  // ensure that the thread is associated with the correct device by calling
-  // freshDeviceId().
-  void freshDeviceId();
-
-  // the max batch size
-  int max_batch_;
-  // the runtime batch size
-  static int runtime_batch_;
-  // the max memory size the engine uses
-  int max_workspace_;
-
-  AnalysisConfig::Precision precision_;
-  TRTInt8Calibrator* calibrator_;
-  // batch size of the current data, will be updated each Executation.
-  int batch_size_{-1};
-
-  int device_id_;
-  nvinfer1::ILogger& logger_;
-
-  // max data size for the buffers.
-  std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
-  std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
-      itensor_map_;
-
-  std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;
-
-  // TensorRT related internal members
-  template <typename T>
-  struct Destroyer {
-    void operator()(T* x) {
-      if (x) {
-        x->destroy();
-      }
-    }
-  };
-  template <typename T>
-  using infer_ptr = std::unique_ptr<T, Destroyer<T>>;
-  infer_ptr<nvinfer1::IBuilder> infer_builder_;
-  infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
-  infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
-  std::unordered_map<std::thread::id, infer_ptr<nvinfer1::IExecutionContext>>
-      infer_context_;
-  infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
-  std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;
-};  // class TensorRTEngine
-
-#define IS_TRT_VERSION_GE(version)                       \
-  ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
-    NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)
-
-// Add a layer__ into engine__ with args ARGS.
-// For example:
-//
-// Reference
-// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
-//
-// will add a fully connected layer into the engine.
-// TensorRT has too many layers, so that is not wise to add member functions for
-// them, and an macro like this is more extensible when underlying TensorRT
-// library add new layer supports.
-#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ...) \
-  engine__->network()->add##layer__(__VA_ARGS__);
-
-class TRTEngineManager {
- public:
-  bool Empty() const { return engines_.size() == 0; }
-  bool Has(const std::string& name) const {
-    if (engines_.count(name) == 0) return false;
-    return engines_.at(name).get() != nullptr;
-  }
-
-  TensorRTEngine* Get(const std::string& name) const {
-    return engines_.at(name).get();
-  }
-
-  TensorRTEngine* Create(
-      std::string name, int max_batch, int max_workspace,
-      AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
-      TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
-      nvinfer1::ILogger& logger = NaiveLogger::Global()) {
-    auto* p = new TensorRTEngine(max_batch, max_workspace, precision,
-                                 calibrator, device_id, logger);
-    engines_[name].reset(p);
-    return p;
-  }
-
-  void DeleteAll() {
-    for (auto& item : engines_) {
-      item.second.reset(nullptr);
-    }
-  }
-
- private:
-  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
deleted file mode 100644
index 010942a0678fe9a592d1a95ba9cdc6adc42cc2ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include <NvInfer.h>
-#include <cuda.h>
-#include <glog/logging.h>
-#include <string>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/platform/dynload/tensorrt.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-namespace dy = paddle::platform::dynload;
-
-// TensorRT data type to size
-const int kDataTypeSize[] = {
-    4,  // kFLOAT
-    2,  // kHALF
-    1,  // kINT8
-    4   // kINT32
-};
-
-// The following two API are implemented in TensorRT's header file, cannot load
-// from the dynamic library. So create our own implementation and directly
-// trigger the method from the dynamic library.
-static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
-  return static_cast<nvinfer1::IBuilder*>(
-      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
-}
-static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
-  return static_cast<nvinfer1::IRuntime*>(
-      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
-}
-
-// A logger for create TensorRT infer builder.
-class NaiveLogger : public nvinfer1::ILogger {
- public:
-  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
-    switch (severity) {
-      case Severity::kINFO:
-        VLOG(3) << msg;
-        break;
-      case Severity::kWARNING:
-        LOG(WARNING) << msg;
-        break;
-      case Severity::kINTERNAL_ERROR:
-      case Severity::kERROR:
-        LOG(ERROR) << msg;
-        break;
-      default:
-        break;
-    }
-  }
-
-  static nvinfer1::ILogger& Global() {
-    static nvinfer1::ILogger* x = new NaiveLogger;
-    return *x;
-  }
-
-  ~NaiveLogger() override {}
-};
-
-class NaiveProfiler : public nvinfer1::IProfiler {
- public:
-  typedef std::pair<std::string, float> Record;
-  std::vector<Record> mProfile;
-
-  virtual void reportLayerTime(const char* layerName, float ms) {
-    auto record =
-        std::find_if(mProfile.begin(), mProfile.end(),
-                     [&](const Record& r) { return r.first == layerName; });
-    if (record == mProfile.end())
-      mProfile.push_back(std::make_pair(layerName, ms));
-    else
-      record->second += ms;
-  }
-
-  void printLayerTimes() {
-    float totalTime = 0;
-    for (size_t i = 0; i < mProfile.size(); i++) {
-      printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
-             mProfile[i].second);
-      totalTime += mProfile[i].second;
-    }
-    printf("Time over all layers: %4.3f\n", totalTime);
-  }
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
deleted file mode 100644
index 292f5e1d4b928e81bb1a3020ae212791ac60d45b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tensorrt/op_teller.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-// Just tell by the op_types.
-struct SimpleOpTypeSetTeller : public Teller {
-  SimpleOpTypeSetTeller() {
-#if IS_TRT_VERSION_GE(5130)
-    teller_set.insert("relu6");
-#endif
-  }
-
-  bool operator()(const std::string& op_type,
-                  const framework::OpDesc& desc) override {
-    return teller_set.count(op_type);
-  }
-
- private:
-  std::unordered_set<std::string> teller_set{{"mul",
-                                              "conv2d",
-                                              "pool2d",
-                                              "relu",
-                                              "softmax",
-                                              "sigmoid",
-                                              "depthwise_conv2d",
-                                              "batch_norm",
-                                              "concat",
-                                              "tanh",
-                                              "pad",
-                                              "elementwise_add",
-                                              "elementwise_mul",
-                                              "dropout",
-                                              "prelu",
-                                              "conv2d_transpose",
-                                              "leaky_relu",
-                                              "fc",
-                                              "shuffle_channel",
-                                              "swish",
-                                              "split"}};
-};
-
-bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
-  for (auto& teller : tellers_) {
-    if ((*teller)(op_type, desc)) return true;
-  }
-  return false;
-}
-
-OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h
deleted file mode 100644
index 7ff1d4746a1817493774d653982b345cf6948f74..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/op_teller.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-/*
- * Single Op teller definition.
- * One can override this and define a more complex tell logic, considerring more
- * issues such as op_desc.
- */
-struct Teller {
-  virtual bool operator()(const std::string& op_type,
-                          const framework::OpDesc& desc) = 0;
-
-  virtual ~Teller() = default;
-};
-/*
- * A real example:
- *
- * struct SomeTeller : public Teller {
- * bool operator()(const std::string& op_type,
- *                const framework::OpDesc& desc) override {
- *  return op_type == "fc" && desc.Inputs().size() == 2;
- * }
- *};
- */
-
-/*
- * class OpTeller helps to tell whether a fluid
- * operator can be transformed to a TensorRT layer.
- */
-class OpTeller {
- public:
-  static OpTeller& Global() {
-    static std::unique_ptr<OpTeller> x(new OpTeller);
-    return *x;
-  }
-
-  bool Tell(const std::string& op_type, const framework::OpDesc& desc);
-
- private:
-  OpTeller();
-
- private:
-  std::vector<std::unique_ptr<Teller>> tellers_;
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
deleted file mode 100644
index d01c5c823b51d204f1e507b55edb127737a18be4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-nv_library(tensorrt_plugin
-           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
-           prelu_op_plugin.cu  trt_plugin_factory.cc
-           avg_pool_op_plugin.cu swish_op_plugin.cu
-           DEPS enforce tensorrt_engine prelu)
diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
deleted file mode 100644
index f27a838162c89b6377a7ffd995608b3a5a49eeae..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
-#include "paddle/fluid/operators/math/pooling.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer,
-                                              size_t length) {
-  return new AvgPoolPlugin(buffer, length);
-}
-REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize);
-
-nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
-    int index, const nvinfer1::Dims* inputDims, int nbInputs) {
-  assert(nbInputs == 1);
-  assert(index == 0);
-  assert(inputDims[0].nbDims == 3);
-  nvinfer1::Dims const& input_dims = inputDims[0];
-
-  nvinfer1::Dims output_dims = input_dims;
-
-  output_dims.d[1] = output_shape_[1];
-  output_dims.d[2] = output_shape_[2];
-  return output_dims;
-}
-
-int AvgPoolPlugin::enqueue(int batchSize, const void* const* inputs,
-                           void** outputs, void* workspace,
-                           cudaStream_t stream) {
-  auto const& input_dims = this->getInputDims(0);
-  int input_size = 0;
-  float const* idata = reinterpret_cast<float const*>(inputs[0]);
-  float** odatas = reinterpret_cast<float**>(outputs);
-
-  paddle::operators::math::AvgPool<float> pool_process;
-  paddle::operators::math::Pool2dDirectCUDAFunctor<
-      paddle::operators::math::AvgPool<float>, float>
-      pool2d_forward;
-
-  std::vector<int> input_shape = input_shape_;
-  std::vector<int> output_shape = output_shape_;
-  input_shape.insert(input_shape.begin(), batchSize);
-  output_shape.insert(output_shape.begin(), batchSize);
-
-  pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_,
-                 pool_process, true, odatas[0], stream);
-
-  return cudaGetLastError() != cudaSuccess;
-}
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
deleted file mode 100644
index a7c0aa5794e6bb131d012cb12d6d9fc12a73bd0d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <cassert>
-#include <vector>
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-class AvgPoolPlugin : public PluginTensorRT {
- private:
-  bool ceil_mode_;
-  std::vector<int> ksize_;
-  std::vector<int> strides_;
-  std::vector<int> paddings_;
-  std::vector<int> input_shape_;
-  std::vector<int> output_shape_;
-
- protected:
-  size_t getSerializationSize() override {
-    return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) +
-           SerializedSize(ksize_) + SerializedSize(strides_) +
-           SerializedSize(paddings_) + SerializedSize(input_shape_) +
-           SerializedSize(output_shape_) + getBaseSerializationSize();
-  }
-
-  // TRT will call this func when we need to serialize the configuration of
-  // tensorrt.
-  void serialize(void *buffer) override {
-    SerializeValue(&buffer, getPluginType());
-    serializeBase(buffer);
-    SerializeValue(&buffer, ceil_mode_);
-    SerializeValue(&buffer, ksize_);
-    SerializeValue(&buffer, strides_);
-    SerializeValue(&buffer, paddings_);
-    SerializeValue(&buffer, input_shape_);
-    SerializeValue(&buffer, output_shape_);
-  }
-
- public:
-  AvgPoolPlugin() {}
-  AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
-                std::vector<int> strides, std::vector<int> paddings,
-                std::vector<int> input_shape)
-      : ceil_mode_(ceil_mode),
-        ksize_(ksize),
-        strides_(strides),
-        paddings_(paddings),
-        input_shape_(input_shape) {
-    int output_h, output_w;
-    output_shape_ = input_shape_;
-    if (!ceil_mode_) {
-      output_h =
-          (input_shape[1] - ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1;
-      output_w =
-          (input_shape[2] - ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1;
-    } else {
-      output_h =
-          (input_shape[1] - ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) /
-              strides_[0] +
-          1;
-      output_w =
-          (input_shape[2] - ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) /
-              strides_[1] +
-          1;
-    }
-    output_shape_[1] = output_h;
-    output_shape_[2] = output_w;
-  }
-
-  // It was used for tensorrt deserialization.
-  // It should not be called by users.
-  AvgPoolPlugin(void const *serialData, size_t serialLength) {
-    deserializeBase(serialData, serialLength);
-    DeserializeValue(&serialData, &serialLength, &ceil_mode_);
-    DeserializeValue(&serialData, &serialLength, &ksize_);
-    DeserializeValue(&serialData, &serialLength, &strides_);
-    DeserializeValue(&serialData, &serialLength, &paddings_);
-    DeserializeValue(&serialData, &serialLength, &input_shape_);
-    DeserializeValue(&serialData, &serialLength, &output_shape_);
-  }
-
-  AvgPoolPlugin *clone() const override {
-    return new AvgPoolPlugin(ceil_mode_, ksize_, strides_, paddings_,
-                             input_shape_);
-  }
-
-  const char *getPluginType() const override { return "avg_pool_plugin"; }
-  int getNbOutputs() const override { return 1; }
-  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
-                                     int nbInputDims) override;
-  int initialize() override { return 0; }
-  int enqueue(int batchSize, const void *const *inputs, void **outputs,
-              void *workspace, cudaStream_t stream) override;
-};
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
deleted file mode 100644
index 9aed3ddab1448fde7cb6b0e13bcf0b05e23622e9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer,
-                                                      size_t length) {
-  return new ElementWisePlugin(buffer, length);
-}
-REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);
-
-namespace details {
-
-template <typename T>
-struct Add {
-  __device__ T operator()(const T& a, const T& b) const { return a + b; }
-};
-
-template <typename T>
-struct Mul {
-  __device__ T operator()(const T& a, const T& b) const { return a * b; }
-};
-
-template <typename T, typename Operator>
-__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out,
-                                 int batch_size, int num_rows, int num_cols) {
-  for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
-    int row = blockIdx.x;
-    for (; row < num_rows; row += gridDim.x) {
-      T value_y = y[batch_id * num_rows + row];
-      int col = threadIdx.x;
-      int offset = (batch_id * num_rows + row) * num_cols;
-      for (; col < num_cols; col += blockDim.x) {
-        T value_x = x[offset + col];
-        out[offset + col] = op(value_x, value_y);
-      }
-    }
-  }
-}
-
-template <typename T, typename Operator>
-static void ElementWise(Operator op, const T* x, const T* y, T* out,
-                        int batch_size, int prev, int midd, int post,
-                        cudaStream_t stream) {
-  const int kThreadsPerBlock = 1024;
-  const int kMaximumBlocks = 65535;
-  if (prev == 1) {
-    int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock
-                                                : (((post + 31) >> 5) << 5);
-    int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks;
-    ColumnWiseKernel<<<num_blocks, num_threads, 0, stream>>>(
-        op, x, y, out, batch_size, midd, post);
-  } else if (post == 1) {
-    PADDLE_THROW("Not implemented.");
-  } else {
-    PADDLE_THROW("Not implemented.");
-  }
-}
-
-}  // namespace details
-
-nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
-    int index, const nvinfer1::Dims* input_dims, int num_inputs) {
-  PADDLE_ENFORCE_EQ(index, 0);
-  PADDLE_ENFORCE_EQ(num_inputs, 2);
-  PADDLE_ENFORCE_NOT_NULL(input_dims);
-  return input_dims[0];
-}
-
-int ElementWisePlugin::initialize() {
-  PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);
-
-  axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
-  int trimed_nb_dims = dims_y_.nbDims;
-  for (; trimed_nb_dims > 0; --trimed_nb_dims) {
-    if (dims_y_.d[trimed_nb_dims - 1] != 1) {
-      break;
-    }
-  }
-  dims_y_.nbDims = trimed_nb_dims;
-
-  PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
-  PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);
-
-  prev_size_ = 1;
-  midd_size_ = 1;
-  post_size_ = 1;
-  for (int i = 0; i < axis_; ++i) {
-    prev_size_ *= dims_x_.d[i];
-  }
-
-  for (int i = 0; i < dims_y_.nbDims; ++i) {
-    PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
-                      "Broadcast dimension mismatch.");
-    midd_size_ *= dims_y_.d[i];
-  }
-
-  for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) {
-    post_size_ *= dims_x_.d[i];
-  }
-  return 0;
-}
-
-int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
-                               void** outputs, void* workspace,
-                               cudaStream_t stream) {
-  const float* x = reinterpret_cast<const float*>(inputs[0]);
-  const float* y = reinterpret_cast<const float*>(inputs[1]);
-  float* out = reinterpret_cast<float*>(outputs[0]);
-
-  if (type_ == "add") {
-    details::ElementWise(details::Add<float>(), x, y, out, batch_size,
-                         prev_size_, midd_size_, post_size_, stream);
-  } else if (type_ == "mul") {
-    details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
-                         prev_size_, midd_size_, post_size_, stream);
-  } else {
-    PADDLE_THROW("Not implemented.");
-  }
-
-  return cudaGetLastError() != cudaSuccess;
-}
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
deleted file mode 100644
index 3b040f14c531c540b8a855da85ecc3008224526c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-class ElementWisePlugin : public PluginTensorRT {
- public:
-  ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x,
-                    nvinfer1::Dims const &dims_y, int axis)
-      : type_(type),
-        dims_x_(dims_x),
-        dims_y_(dims_y),
-        axis_(axis),
-        prev_size_(1),
-        midd_size_(1),
-        post_size_(1) {}
-
-  ElementWisePlugin(void const *serial_data, size_t serial_length) {
-    deserializeBase(serial_data, serial_length);
-    const char *elementwise_type;
-    DeserializeValue(&serial_data, &serial_length, &elementwise_type);
-    type_ = std::string(elementwise_type);
-    DeserializeValue(&serial_data, &serial_length, &axis_);
-    DeserializeValue(&serial_data, &serial_length, &dims_x_);
-    DeserializeValue(&serial_data, &serial_length, &dims_y_);
-  }
-
-  ElementWisePlugin *clone() const override {
-    // return new ElementWisePlugin(dims_x_, dims_y_, axis_);
-    return nullptr;
-  }
-
-  const char *getPluginType() const override { return "elementwise_plugin"; }
-
-  nvinfer1::Dims getOutputDimensions(int index,
-                                     const nvinfer1::Dims *input_dims,
-                                     int num_inputs) override;
-
-  int initialize() override;
-
-  // execute the layer
-  int enqueue(int batch_size, const void *const *inputs, void **outputs,
-              void *workspace, cudaStream_t stream);
-
- protected:
-  size_t getSerializationSize() override {
-    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
-           SerializedSize(dims_x_) + SerializedSize(dims_y_) +
-           getBaseSerializationSize();
-  }
-
-  void serialize(void *buffer) override {
-    SerializeValue(&buffer, getPluginType());
-    serializeBase(buffer);
-    SerializeValue(&buffer, type_.c_str());
-    SerializeValue(&buffer, axis_);
-    SerializeValue(&buffer, dims_x_);
-    SerializeValue(&buffer, dims_y_);
-  }
-
-  std::string type_;
-  nvinfer1::Dims dims_x_;
-  nvinfer1::Dims dims_y_;
-  int axis_;
-  int prev_size_;
-  int midd_size_;
-  int post_size_;
-};
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
deleted file mode 100644
index 84f938eeb5fa50421a819978cd84c968919c96b3..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stdio.h>
-#include <cassert>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
-#include "paddle/fluid/operators/math/prelu.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) {
-  return new PReluPlugin(buffer, length);
-}
-REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize);
-
-int PReluPlugin::initialize() {
-  cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
-  cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
-             cudaMemcpyHostToDevice);
-  return 0;
-}
-
-nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
-                                                const nvinfer1::Dims *inputDims,
-                                                int nbInputs) {
-  assert(nbInputs == 1);
-  assert(index < this->getNbOutputs());
-  nvinfer1::Dims const &input_dims = inputDims[0];
-  nvinfer1::Dims output_dims = input_dims;
-  return output_dims;
-}
-
-int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
-                         void **outputs, void *workspace, cudaStream_t stream) {
-  // input dims is CHW.
-  const auto &input_dims = this->getInputDims(0);
-  const float *input = reinterpret_cast<const float *>(inputs[0]);
-  // const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
-  const float *alpha = p_gpu_weight_;
-  float *output = reinterpret_cast<float **>(outputs)[0];
-
-  std::vector<int> input_shape;
-  input_shape.push_back(batch_size);
-  for (int i = 0; i < input_dims.nbDims; i++) {
-    input_shape.push_back(input_dims.d[i]);
-  }
-
-  if (mode_ == "channel") {
-    operators::math::PreluChannelWiseDirectCUDAFunctor<float>
-        prelu_channel_wise;
-    prelu_channel_wise(stream, input, alpha, output, input_shape);
-  } else if (mode_ == "element") {
-    operators::math::PreluElementWiseDirectCUDAFunctor<float>
-        prelu_element_wise;
-    prelu_element_wise(stream, input, alpha, output, input_shape);
-  } else {
-    operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
-    prelu_scalar(stream, input, alpha, output, input_shape);
-  }
-  return cudaGetLastError() != cudaSuccess;
-}
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
deleted file mode 100644
index a96649503f1c764e07370cb2b47b10f3dae72be4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-
-#include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-class PReluPlugin : public PluginTensorRT {
-  std::vector<float> weight_;
-  float *p_gpu_weight_;
-  std::string mode_;
-
- protected:
-  size_t getSerializationSize() override {
-    return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
-           SerializedSize(weight_) + SerializedSize(getPluginType());
-  }
-
-  // TRT will call this func when we need to serialize the configuration of
-  // tensorrt.
-  // It should not be called by users.
-  void serialize(void *buffer) override {
-    SerializeValue(&buffer, getPluginType());
-    serializeBase(buffer);
-    SerializeValue(&buffer, weight_);
-    SerializeValue(&buffer, mode_.c_str());
-  }
-
- public:
-  PReluPlugin(const float *weight, const int weight_num,
-              std::string const &mode)
-      : mode_(mode) {
-    weight_.resize(weight_num);
-    std::copy(weight, weight + weight_num, weight_.data());
-  }
-
-  // It was used for tensorrt deserialization.
-  // It should not be called by users.
-  PReluPlugin(void const *serialData, size_t serialLength) {
-    deserializeBase(serialData, serialLength);
-    DeserializeValue(&serialData, &serialLength, &weight_);
-    const char *prelu_mode;
-    DeserializeValue(&serialData, &serialLength, &prelu_mode);
-    mode_ = std::string(prelu_mode);
-  }
-  ~PReluPlugin() { cudaFree(p_gpu_weight_); }
-  int initialize() override;
-
-  PReluPlugin *clone() const override {
-    return new PReluPlugin(weight_.data(), weight_.size(), mode_);
-  }
-
-  const char *getPluginType() const override { return "prelu_plugin"; }
-  int getNbOutputs() const override { return 1; }
-  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
-                                     int nbInputDims) override;
-  int enqueue(int batchSize, const void *const *inputs, void **outputs,
-              void *workspace, cudaStream_t stream) override;
-};
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
deleted file mode 100644
index 9e927ed6800d0522dd3b5f6e74990348408b39b6..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cuda_fp16.h>
-#include <algorithm>
-#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
-  return new SplitPlugin(buffer, length);
-}
-REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);
-
-template <typename T>
-__device__ int upper_bound(T const* vals, int n, T const& key) {
-  int i = 0;
-  while (n > 0) {
-    int m = n / 2;
-    int j = i + m;
-    if (!(key < vals[j])) {
-      i = j + 1;
-      n -= m + 1;
-    } else {
-      n = m;
-    }
-  }
-  return i;
-}
-
-nvinfer1::Dims SplitPlugin::getOutputDimensions(
-    int index, const nvinfer1::Dims* input_dims, int num_inputs) {
-  PADDLE_ENFORCE_EQ(num_inputs, 1);
-  PADDLE_ENFORCE_LT(index, this->getNbOutputs());
-
-  nvinfer1::Dims output_dims = input_dims[0];
-  output_dims.d[axis_] = output_length_.at(index);
-  return output_dims;
-}
-
-int SplitPlugin::initialize() {
-  PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
-  // notice input dims is [C, H, W]
-  nvinfer1::Dims dims = this->getInputDims(0);
-  outer_rows_ = 1;
-  inner_cols_ = 1;
-  for (int i = 0; i < axis_; ++i) {
-    outer_rows_ *= dims.d[i];
-  }
-  for (int i = axis_ + 1; i < dims.nbDims; ++i) {
-    inner_cols_ *= dims.d[i];
-  }
-  same_shape_ = true;
-  std::vector<int> segment_offsets(1, 0);
-  for (int i = 0; i < this->getNbOutputs(); ++i) {
-    if (output_length_[i] != output_length_[0]) {
-      same_shape_ = false;
-    }
-    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
-  }
-  axis_shape_ = dims.d[axis_];
-  d_segment_offsets_ = segment_offsets;
-  segment_offsets_ = std::move(segment_offsets);
-  d_output_ptrs_.resize(this->getNbOutputs(), nullptr);
-  return 0;
-}
-
-// The following part of the code refers to onnx-tensorrt
-// https://github.com/onnx/onnx-tensorrt/blob/master/Split.cu
-template <typename T>
-__global__ void split_kernel(int nsegment,
-                             int const* __restrict__ segment_offsets,
-                             T const* __restrict__ idata, T* const* odatas,
-                             int inner_cols, int axis_shape, int outer_rows) {
-  int x0 = threadIdx.x + blockIdx.x * blockDim.x;
-  int src_y0 = threadIdx.y + blockIdx.y * blockDim.y;
-  int z0 = threadIdx.z + blockIdx.z * blockDim.z;
-  for (int z = z0; z < outer_rows; z += blockDim.z * gridDim.z) {
-    for (int src_y = src_y0; src_y < axis_shape;
-         src_y += blockDim.y * gridDim.y) {
-      for (int x = x0; x < inner_cols; x += blockDim.x * gridDim.x) {
-        int segment = upper_bound(segment_offsets, nsegment, src_y) - 1;
-        int dst_y = src_y - segment_offsets[segment];
-        int dst_ny = segment_offsets[segment + 1] - segment_offsets[segment];
-        odatas[segment][x + inner_cols * (dst_y + dst_ny * z)] =
-            idata[x + inner_cols * (src_y + axis_shape * z)];
-      }
-    }
-  }
-}
-
-int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
-                         void** outputs, void* workspace, cudaStream_t stream) {
-  const int* d_segment_offsets_ptr =
-      thrust::raw_pointer_cast(&d_segment_offsets_[0]);
-  float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
-  float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
-  float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
-      output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
-      cudaMemcpyHostToDevice, stream));
-
-  int outer_rows = outer_rows_ * batchSize;
-
-  dim3 block(32, 16);
-  dim3 grid(std::min((inner_cols_ - 1) / block.x + 1, 65535u),
-            std::min((axis_shape_ - 1) / block.y + 1, 65535u),
-            std::min((outer_rows_ - 1) / block.z + 1, 65535u));
-
-  split_kernel<<<grid, block, 0, stream>>>(
-      d_segment_offsets_.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
-      inner_cols_, axis_shape_, outer_rows);
-  return cudaGetLastError() != cudaSuccess;
-}
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
deleted file mode 100644
index b2a7bc3bdaa2543e83ab024548c3c10ffd7212be..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <thrust/device_vector.h>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-class SplitPlugin : public PluginTensorRT {
- public:
-  SplitPlugin() {}
-  SplitPlugin(int axis, std::vector<int> const &output_lengths)
-      : axis_(axis), same_shape_(true), output_length_(output_lengths) {}
-
-  SplitPlugin(void const *serial_data, size_t serial_length) {
-    deserializeBase(serial_data, serial_length);
-    DeserializeValue(&serial_data, &serial_length, &axis_);
-    DeserializeValue(&serial_data, &serial_length, &output_length_);
-  }
-
-  SplitPlugin *clone() const override {
-    return new SplitPlugin(axis_, output_length_);
-  }
-
-  const char *getPluginType() const override { return "split_plugin"; }
-  int getNbOutputs() const override { return output_length_.size(); }
-  nvinfer1::Dims getOutputDimensions(int index,
-                                     const nvinfer1::Dims *input_dims,
-                                     int num_inputs) override;
-
-  int initialize() override;
-  int enqueue(int batchSize, const void *const *inputs, void **outputs,
-              void *workspace, cudaStream_t stream) override;
-
- protected:
-  size_t getSerializationSize() override {
-    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
-           SerializedSize(output_length_) + getBaseSerializationSize();
-  }
-
-  void serialize(void *buffer) override {
-    SerializeValue(&buffer, getPluginType());
-    serializeBase(buffer);
-    SerializeValue(&buffer, axis_);
-    SerializeValue(&buffer, output_length_);
-  }
-
-  int axis_;
-  int outer_rows_;
-  int inner_cols_;
-  int axis_shape_;
-  bool same_shape_;
-  std::vector<int> output_length_;
-  std::vector<int> segment_offsets_;
-  thrust::device_vector<int> d_segment_offsets_;
-  thrust::device_vector<float *> d_output_ptrs_;
-};
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu
deleted file mode 100644
index 864ca5f080f95d56191b0e9895654068edb8d0ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stdio.h>
-#include <cassert>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-SwishPlugin *CreateSwishPluginDeserialize(const void *buffer, size_t length) {
-  return new SwishPlugin(buffer, length);
-}
-REGISTER_TRT_PLUGIN("swish_plugin", CreateSwishPluginDeserialize);
-
-int SwishPlugin::initialize() { return 0; }
-
-nvinfer1::Dims SwishPlugin::getOutputDimensions(int index,
-                                                const nvinfer1::Dims *inputDims,
-                                                int nbInputs) {
-  assert(nbInputs == 1);
-  assert(index < this->getNbOutputs());
-  nvinfer1::Dims const &input_dims = inputDims[0];
-  nvinfer1::Dims output_dims = input_dims;
-  return output_dims;
-}
-__global__ void swish_kernel(int num, const float *input, float *output,
-                             float beta) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < num) {
-#if __CUDA_ARCH__ >= 350
-    output[index] =
-        __ldg(input + index) / (1.0f + expf(-beta * __ldg(input + index)));
-#else
-    output[index] = input[index] / (1.0f + expf(-beta * input[index]));
-#endif
-  }
-}
-
-int SwishPlugin::enqueue(int batch_size, const void *const *inputs,
-                         void **outputs, void *workspace, cudaStream_t stream) {
-  // input dims is CHW.
-  const auto &input_dims = this->getInputDims(0);
-  const float *input = reinterpret_cast<const float *>(inputs[0]);
-  float *output = reinterpret_cast<float **>(outputs)[0];
-  int num = batch_size;
-  for (int i = 0; i < input_dims.nbDims; i++) {
-    num *= input_dims.d[i];
-  }
-  int threads = 1024;
-  int blocks = (num + threads - 1) / threads;
-  swish_kernel<<<blocks, threads, 0, stream>>>(num, input, output, beta_);
-
-  return cudaGetLastError() != cudaSuccess;
-}
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h
deleted file mode 100644
index 6c3cd038884bf6482edd49fe27901888b2e93bdd..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-class SwishPlugin : public PluginTensorRT {
- private:
-  float beta_;
-
- protected:
-  size_t getSerializationSize() override {
-    return getBaseSerializationSize() + SerializedSize(beta_);
-  }
-
-  // TRT will call this func when we need to serialize the configuration of
-  // tensorrt.
-  // It should not be called by users.
-  void serialize(void *buffer) override {
-    SerializeValue(&buffer, getPluginType());
-    serializeBase(buffer);
-    SerializeValue(&buffer, beta_);
-  }
-
- public:
-  explicit SwishPlugin(const float beta) : beta_(beta) {}
-
-  // It was used for tensorrt deserialization.
-  // It should not be called by users.
-  SwishPlugin(void const *serialData, size_t serialLength) {
-    deserializeBase(serialData, serialLength);
-    DeserializeValue(&serialData, &serialLength, &beta_);
-  }
-  ~SwishPlugin() {}
-  int initialize() override;
-
-  SwishPlugin *clone() const override { return new SwishPlugin(beta_); }
-
-  const char *getPluginType() const override { return "swish_plugin"; }
-  int getNbOutputs() const override { return 1; }
-  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
-                                     int nbInputDims) override;
-  int enqueue(int batchSize, const void *const *inputs, void **outputs,
-              void *workspace, cudaStream_t stream) override;
-};
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
deleted file mode 100644
index b0f4cff3ac184beeed2ebd3a4b7531d570c87075..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-void PluginTensorRT::serializeBase(void*& buffer) {
-  SerializeValue(&buffer, input_dims_);
-  SerializeValue(&buffer, max_batch_size_);
-  SerializeValue(&buffer, data_type_);
-  SerializeValue(&buffer, data_format_);
-}
-
-void PluginTensorRT::deserializeBase(void const*& serial_data,
-                                     size_t& serial_length) {
-  DeserializeValue(&serial_data, &serial_length, &input_dims_);
-  DeserializeValue(&serial_data, &serial_length, &max_batch_size_);
-  DeserializeValue(&serial_data, &serial_length, &data_type_);
-  DeserializeValue(&serial_data, &serial_length, &data_format_);
-}
-
-size_t PluginTensorRT::getBaseSerializationSize() {
-  return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) +
-          SerializedSize(data_type_) + SerializedSize(data_format_));
-}
-
-bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
-                                    nvinfer1::PluginFormat format) const {
-  return ((type == nvinfer1::DataType::kFLOAT) &&
-          (format == nvinfer1::PluginFormat::kNCHW));
-}
-
-void PluginTensorRT::configureWithFormat(
-    const nvinfer1::Dims* input_dims, int num_inputs,
-    const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type,
-    nvinfer1::PluginFormat format, int max_batch_size) {
-  data_type_ = type;
-  data_format_ = format;
-  input_dims_.assign(input_dims, input_dims + num_inputs);
-  max_batch_size_ = max_batch_size;
-}
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
deleted file mode 100644
index 3b737bd726ad09637f8530a114362d98d1dac1b0..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ /dev/null
@@ -1,118 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <NvInfer.h>
-#include <cstring>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DECLARE_bool(profile);
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-class PluginTensorRT;
-
-typedef std::function<PluginTensorRT*(const void*, size_t)>
-    PluginDeserializeFunc;
-
-typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
-
-class PluginTensorRT : public nvinfer1::IPluginExt {
- public:
-  PluginTensorRT() {}
-  // It was used for TensorRT deserialization.
-  // It should not be called by users.
-  PluginTensorRT(const void* serialized_data, size_t length) {}
-  virtual ~PluginTensorRT() {}
-
-  nvinfer1::Dims const& getInputDims(int index) const {
-    return input_dims_.at(index);
-  }
-  size_t getMaxBatchSize() const { return max_batch_size_; }
-  nvinfer1::DataType getDataType() const { return data_type_; }
-  nvinfer1::PluginFormat getDataFormat() const { return data_format_; }
-  virtual const char* getPluginVersion() const { return "1"; }
-
-  void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); }
-  std::vector<nvinfer1::ITensor*>& GetInputs() { return inputs_; }
-
-  virtual nvinfer1::IPluginExt* clone() const = 0;
-  virtual const char* getPluginType() const = 0;
-
-  // Following functions are inherit from nvinfer1::IPluginExt
-  // Get the number of outputs from the layer
-  int getNbOutputs() const { return 1; }
-  // Get the dimension of an output tensor
-  virtual nvinfer1::Dims getOutputDimensions(int index,
-                                             const nvinfer1::Dims* input_dims,
-                                             int num_inputs) = 0;
-  // Find the workspace size required by the layer
-  size_t getWorkspaceSize(int) const override { return 0; }
-
-  // Initialize the layer for execution.
-  // This is called when the engine is created.
-  int initialize() override { return 0; }
-  // Shutdown the layer. This is called when the engine is destroyed
-  void terminate() override {}
-  // Execute the layer
-  virtual int enqueue(int batch_size, const void* const* inputs, void** outputs,
-                      void* workspace, cudaStream_t stream) = 0;
-
-  // Find the size of the serialization buffer required
-  virtual size_t getSerializationSize() = 0;
-  // Serialize the layer config to buffer.
-  // TensorRT will call this func to serialize the configuration of TensorRT
-  // engine. It should not be called by users.
-  virtual void serialize(void* buffer) = 0;
-
-  // Check format support. The default is FLOAT32 and NCHW.
-  bool supportsFormat(nvinfer1::DataType type,
-                      nvinfer1::PluginFormat format) const override;
-  // Configure the layer
-  void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs,
-                           const nvinfer1::Dims* output_dims, int num_outputs,
-                           nvinfer1::DataType type,
-                           nvinfer1::PluginFormat format,
-                           int max_batch_size) override;
-
- protected:
-  // Deserialize input_dims, max_batch_size, data_type, data_format
-  void deserializeBase(void const*& serial_data,  // NOLINT
-                       size_t& serial_length);    // NOLINT
-  size_t getBaseSerializationSize();
-  // Serialize input_dims, max_batch_size, data_type, data_format
-  void serializeBase(void*& buffer);  // NOLINT
-
-  std::vector<nvinfer1::Dims> input_dims_;
-  size_t max_batch_size_;
-  nvinfer1::DataType data_type_;
-  nvinfer1::PluginFormat data_format_;
-
-  std::vector<nvinfer1::ITensor*> inputs_;
-};
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
deleted file mode 100644
index 3c20b6d1e725273dbfdc20c01fb01deea4e8d88e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
-                                                    const void* serial_data,
-                                                    size_t serial_length) {
-  const char* plugin_type;
-  DeserializeValue(&serial_data, &serial_length, &plugin_type);
-
-  PADDLE_ENFORCE(Has(plugin_type),
-                 "trt plugin type %s does not exists, check it.", plugin_type);
-  auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
-  owned_plugins_.emplace_back(plugin);
-
-  return plugin;
-}
-
-bool PluginFactoryTensorRT::RegisterPlugin(
-    const std::string& op_name, PluginDeserializeFunc deserialize_func) {
-  if (Has(op_name)) return false;
-  auto ret = plugin_registry_.emplace(op_name, deserialize_func);
-  return ret.second;
-}
-
-void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
deleted file mode 100644
index ed825801fc4b64e3c220a0d357dc7e5c5bde9c90..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <NvInfer.h>
-#include <cstring>
-#include <list>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-class PluginFactoryTensorRT : public nvinfer1::IPluginFactory,
-                              public DeleteHelper {
- public:
-  // Deserialization method
-  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
-                               size_t serial_length) override;
-
-  bool RegisterPlugin(const std::string& op_name,
-                      PluginDeserializeFunc deserialize_func);
-
-  bool Has(const std::string& op_name) {
-    return plugin_registry_.find(op_name) != plugin_registry_.end();
-  }
-
-  void DestroyPlugins();
-
- protected:
-  std::unordered_map<std::string, PluginDeserializeFunc> plugin_registry_;
-
-  std::list<std::unique_ptr<PluginTensorRT>> owned_plugins_;
-};
-
-class TrtPluginRegistrar {
- public:
-  TrtPluginRegistrar(const std::string& name,
-                     PluginDeserializeFunc deserialize_func) {
-    inference::Singleton<PluginFactoryTensorRT>::Global().RegisterPlugin(
-        name, deserialize_func);
-  }
-};
-
-#define REGISTER_TRT_PLUGIN(name, deserialize_func) \
-  REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func)
-
-#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func)      \
-  static paddle::inference::tensorrt::plugin::TrtPluginRegistrar   \
-      trt_plugin_registrar##ctr UNUSED =                           \
-          paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
-              name, deserialize_func)
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
deleted file mode 100644
index 1cae4ccae4cc593785d9b3b0e87523e740eef4ff..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
+++ /dev/null
@@ -1,134 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <cstring>
-#include <string>
-#include <type_traits>
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-// Some trt base classes lack of the destructor.
-// We use a assisted class to fix this.
-struct DeleteHelper {
- protected:
-  virtual ~DeleteHelper() {}
-};
-
-template <typename T>
-inline void SerializeValue(void** buffer, T const& value);
-
-template <typename T>
-inline void DeserializeValue(void const** buffer, size_t* buffer_size,
-                             T* value);
-
-namespace details {
-
-template <typename T, class Enable = void>
-struct Serializer {};
-
-template <typename T>
-struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
-                                             std::is_enum<T>::value ||
-                                             std::is_pod<T>::value>::type> {
-  static size_t SerializedSize(T const& value) { return sizeof(T); }
-
-  static void Serialize(void** buffer, T const& value) {
-    std::memcpy(*buffer, &value, sizeof(T));
-    reinterpret_cast<char*&>(*buffer) += sizeof(T);
-  }
-
-  static void Deserialize(void const** buffer, size_t* buffer_size, T* value) {
-    assert(*buffer_size >= sizeof(T));
-    std::memcpy(value, *buffer, sizeof(T));
-    reinterpret_cast<char const*&>(*buffer) += sizeof(T);
-    *buffer_size -= sizeof(T);
-  }
-};
-
-template <>
-struct Serializer<const char*> {
-  static size_t SerializedSize(const char* value) { return strlen(value) + 1; }
-
-  static void Serialize(void** buffer, const char* value) {
-    std::strcpy(static_cast<char*>(*buffer), value);  // NOLINT
-    reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
-  }
-
-  static void Deserialize(void const** buffer, size_t* buffer_size,
-                          const char** value) {
-    *value = static_cast<char const*>(*buffer);
-    size_t data_size = strnlen(*value, *buffer_size) + 1;
-    assert(*buffer_size >= data_size);
-    reinterpret_cast<char const*&>(*buffer) += data_size;
-    *buffer_size -= data_size;
-  }
-};
-
-template <typename T>
-struct Serializer<std::vector<T>,
-                  typename std::enable_if<std::is_arithmetic<T>::value ||
-                                          std::is_enum<T>::value ||
-                                          std::is_pod<T>::value>::type> {
-  static size_t SerializedSize(std::vector<T> const& value) {
-    return sizeof(value.size()) + value.size() * sizeof(T);
-  }
-
-  static void Serialize(void** buffer, std::vector<T> const& value) {
-    SerializeValue(buffer, value.size());
-    size_t nbyte = value.size() * sizeof(T);
-    std::memcpy(*buffer, value.data(), nbyte);
-    reinterpret_cast<char*&>(*buffer) += nbyte;
-  }
-
-  static void Deserialize(void const** buffer, size_t* buffer_size,
-                          std::vector<T>* value) {
-    size_t size;
-    DeserializeValue(buffer, buffer_size, &size);
-    value->resize(size);
-    size_t nbyte = value->size() * sizeof(T);
-    PADDLE_ENFORCE_GE(*buffer_size, nbyte);
-    std::memcpy(value->data(), *buffer, nbyte);
-    reinterpret_cast<char const*&>(*buffer) += nbyte;
-    *buffer_size -= nbyte;
-  }
-};
-
-}  // namespace details
-
-template <typename T>
-inline size_t SerializedSize(T const& value) {
-  return details::Serializer<T>::SerializedSize(value);
-}
-
-template <typename T>
-inline void SerializeValue(void** buffer, T const& value) {
-  return details::Serializer<T>::Serialize(buffer, value);
-}
-
-template <typename T>
-inline void DeserializeValue(void const** buffer, size_t* buffer_size,
-                             T* value) {
-  return details::Serializer<T>::Deserialize(buffer, buffer_size, value);
-}
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
deleted file mode 100644
index a03dd45db0f80487cb4c2e6b68f94944e8558ae4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class TensorRTEngineTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
-
-    engine_ = new TensorRTEngine(10, 1 << 10);
-    engine_->InitNetwork();
-  }
-
-  void TearDown() override {
-    if (engine_) {
-      delete engine_;
-      engine_ = nullptr;
-    }
-  }
-
-  void PrepareInputOutput(const std::vector<float> &input,
-                          std::vector<int> output_shape) {
-    TensorFromVector(input, *ctx_, &input_);
-    output_.Resize(framework::make_ddim(output_shape));
-  }
-
-  void GetOutput(std::vector<float> *output) {
-    TensorToVector(output_, *ctx_, output);
-  }
-
- protected:
-  framework::Tensor input_;
-  framework::Tensor output_;
-  TensorRTEngine *engine_;
-  platform::CUDADeviceContext *ctx_;
-};
-
-TEST_F(TensorRTEngineTest, add_layer) {
-  const int size = 1;
-
-  float raw_weight[size] = {2.};  // Weight in CPU memory.
-  float raw_bias[size] = {3.};
-
-  std::vector<void *> buffers(2);  // TRT binded inputs
-
-  LOG(INFO) << "create weights";
-  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
-  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
-  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
-                                  nvinfer1::DimsCHW{1, 1, 1});
-  auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
-                                        weight.get(), bias.get());
-  PADDLE_ENFORCE(fc_layer != nullptr);
-
-  engine_->DeclareOutput(fc_layer, 0, "y");
-  LOG(INFO) << "freeze network";
-  engine_->FreezeNetwork();
-  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
-
-  // fill in real data
-  std::vector<float> x_v = {1234};
-  std::vector<float> y_cpu;
-  PrepareInputOutput(x_v, {1});
-
-  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
-  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
-
-  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
-  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
-
-  LOG(INFO) << "to execute";
-  engine_->Execute(1, &buffers, ctx_->stream());
-
-  LOG(INFO) << "to get output";
-  GetOutput(&y_cpu);
-
-  LOG(INFO) << "to checkout output";
-  ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
-}
-
-TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
-  // Weight in CPU memory.
-  // It seems tensorrt FC use col-major: [[1.0, 3.3], [1.1, 4.4]]
-  // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
-  float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
-  float raw_bias[2] = {1.3, 2.4};
-  std::vector<void *> buffers(2);  // TRT binded inputs
-
-  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
-  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
-  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
-                                  nvinfer1::DimsCHW{1, 2, 1});
-  auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
-                                        weight.get(), bias.get());
-  PADDLE_ENFORCE(fc_layer != nullptr);
-
-  engine_->DeclareOutput(fc_layer, 0, "y");
-  engine_->FreezeNetwork();
-  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
-
-  // fill in real data
-  std::vector<float> x_v = {1.0, 2.0};
-  std::vector<float> y_cpu;
-  PrepareInputOutput(x_v, {2});
-
-  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
-  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
-
-  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
-  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
-
-  engine_->Execute(1, &buffers, ctx_->stream());
-
-  LOG(INFO) << "to get output";
-  GetOutput(&y_cpu);
-
-  auto dims = engine_->GetITensor("y")->getDimensions();
-  ASSERT_EQ(dims.nbDims, 3);
-  ASSERT_EQ(dims.d[0], 2);
-  ASSERT_EQ(dims.d[1], 1);
-
-  ASSERT_EQ(y_cpu[0], 4.5);
-  ASSERT_EQ(y_cpu[1], 14.5);
-}
-
-TEST_F(TensorRTEngineTest, test_conv2d) {
-  // Weight in CPU memory.
-  float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-  float raw_bias[1] = {0};
-  std::vector<void *> buffers(2);  // TRT binded inputs
-
-  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
-  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
-  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
-                                  nvinfer1::Dims3{1, 3, 3});
-  auto *conv_layer =
-      TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
-                           weight.get(), bias.get());
-  PADDLE_ENFORCE(conv_layer != nullptr);
-  conv_layer->setStride(nvinfer1::DimsHW{1, 1});
-  conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
-
-  engine_->DeclareOutput(conv_layer, 0, "y");
-  engine_->FreezeNetwork();
-  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
-
-  // fill in real data
-  std::vector<float> x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-                            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-  std::vector<float> y_cpu;
-  PrepareInputOutput(x_v, {18});
-
-  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
-  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
-
-  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
-  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
-
-  engine_->Execute(2, &buffers, ctx_->stream());
-
-  LOG(INFO) << "to get output";
-  GetOutput(&y_cpu);
-
-  ASSERT_EQ(y_cpu[0], 4.0);
-  ASSERT_EQ(y_cpu[1], 6.0);
-}
-
-TEST_F(TensorRTEngineTest, test_pool2d) {
-  // Weight in CPU memory.
-  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
-                                  nvinfer1::Dims3{1, 2, 2});
-
-  std::vector<void *> buffers(2);  // TRT binded inputs
-  nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
-  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
-                                          nvinfer1::DimsHW{2, 2});
-
-  PADDLE_ENFORCE(pool_layer != nullptr);
-  pool_layer->setStride(nvinfer1::DimsHW{1, 1});
-  pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
-
-  engine_->DeclareOutput(pool_layer, 0, "y");
-  engine_->FreezeNetwork();
-  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
-
-  // fill in real data
-  std::vector<float> x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
-  std::vector<float> y_cpu;
-  PrepareInputOutput(x_v, {2});
-
-  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
-  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
-
-  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
-  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
-
-  engine_->Execute(2, &buffers, ctx_->stream());
-
-  LOG(INFO) << "to get output";
-  GetOutput(&y_cpu);
-
-  ASSERT_EQ(y_cpu[0], 2.0);
-  ASSERT_EQ(y_cpu[1], 5.0);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
deleted file mode 100644
index a07537985738ab0ad4092b794f3b62ba53dfa866..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include "NvInfer.h"
-#include "paddle/fluid/platform/dynload/tensorrt.h"
-
-namespace dy = paddle::platform::dynload;
-
-class Logger : public nvinfer1::ILogger {
- public:
-  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
-    switch (severity) {
-      case Severity::kINFO:
-        LOG(INFO) << msg;
-        break;
-      case Severity::kWARNING:
-        LOG(WARNING) << msg;
-        break;
-      case Severity::kINTERNAL_ERROR:
-      case Severity::kERROR:
-        LOG(ERROR) << msg;
-        break;
-      default:
-        break;
-    }
-  }
-};
-
-class ScopedWeights {
- public:
-  explicit ScopedWeights(float value) : value_(value) {
-    w.type = nvinfer1::DataType::kFLOAT;
-    w.values = &value_;
-    w.count = 1;
-  }
-  const nvinfer1::Weights& get() { return w; }
-
- private:
-  float value_;
-  nvinfer1::Weights w;
-};
-
-// The following two API are implemented in TensorRT's header file, cannot load
-// from the dynamic library. So create our own implementation and directly
-// trigger the method from the dynamic library.
-nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
-  return static_cast<nvinfer1::IBuilder*>(
-      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
-}
-nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
-  return static_cast<nvinfer1::IRuntime*>(
-      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
-}
-
-const char* kInputTensor = "input";
-const char* kOutputTensor = "output";
-
-// Creates a network to compute y = 2x + 3
-nvinfer1::IHostMemory* CreateNetwork() {
-  Logger logger;
-  // Create the engine.
-  nvinfer1::IBuilder* builder = createInferBuilder(&logger);
-  ScopedWeights weights(2.);
-  ScopedWeights bias(3.);
-
-  nvinfer1::INetworkDefinition* network = builder->createNetwork();
-  // Add the input
-  auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
-                                 nvinfer1::DimsCHW{1, 1, 1});
-  EXPECT_NE(input, nullptr);
-  // Add the hidden layer.
-  auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
-  EXPECT_NE(layer, nullptr);
-  // Mark the output.
-  auto output = layer->getOutput(0);
-  output->setName(kOutputTensor);
-  network->markOutput(*output);
-  // Build the engine.
-  builder->setMaxBatchSize(1);
-  builder->setMaxWorkspaceSize(1 << 10);
-  auto engine = builder->buildCudaEngine(*network);
-  EXPECT_NE(engine, nullptr);
-  // Serialize the engine to create a model, then close.
-  nvinfer1::IHostMemory* model = engine->serialize();
-  network->destroy();
-  engine->destroy();
-  builder->destroy();
-  return model;
-}
-
-void Execute(nvinfer1::IExecutionContext* context, const float* input,
-             float* output) {
-  const nvinfer1::ICudaEngine& engine = context->getEngine();
-  // Two binds, input and output
-  ASSERT_EQ(engine.getNbBindings(), 2);
-  const int input_index = engine.getBindingIndex(kInputTensor);
-  const int output_index = engine.getBindingIndex(kOutputTensor);
-  // Create GPU buffers and a stream
-  void* buffers[2];
-  ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
-  ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
-  cudaStream_t stream;
-  ASSERT_EQ(0, cudaStreamCreate(&stream));
-  // Copy the input to the GPU, execute the network, and copy the output back.
-  ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
-                               cudaMemcpyHostToDevice, stream));
-  context->enqueue(1, buffers, stream, nullptr);
-  ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
-                               cudaMemcpyDeviceToHost, stream));
-  cudaStreamSynchronize(stream);
-
-  // Release the stream and the buffers
-  cudaStreamDestroy(stream);
-  ASSERT_EQ(0, cudaFree(buffers[input_index]));
-  ASSERT_EQ(0, cudaFree(buffers[output_index]));
-}
-
-TEST(TensorrtTest, BasicFunction) {
-  // Create the network serialized model.
-  nvinfer1::IHostMemory* model = CreateNetwork();
-
-  // Use the model to create an engine and an execution context.
-  Logger logger;
-  nvinfer1::IRuntime* runtime = createInferRuntime(&logger);
-  nvinfer1::ICudaEngine* engine =
-      runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
-  model->destroy();
-  nvinfer1::IExecutionContext* context = engine->createExecutionContext();
-
-  // Execute the network.
-  float input = 1234;
-  float output;
-  Execute(context, &input, &output);
-  EXPECT_EQ(output, input * 2 + 3);
-
-  // Destroy the engine.
-  context->destroy();
-  engine->destroy();
-  runtime->destroy();
-}
diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc
deleted file mode 100644
index 4a85c8b8fe6d70052edd3be59f98582c9b2e86b9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
-#include "glog/logging.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-// set the batch size before constructing the thread to execute engine
-int TRTInt8Calibrator::getBatchSize() const { return batch_size_; }
-
-TRTInt8Calibrator::TRTInt8Calibrator(
-    const std::unordered_map<std::string, size_t>& buffers, int batch_size,
-    std::string engine_name, const platform::Place place)
-    : batch_size_(batch_size), engine_name_(engine_name) {
-  int i = 0;
-  VLOG(4) << "Init a new calibrator: " << engine_name_;
-  for (const auto it : buffers) {
-    framework::Tensor temp_tensor;
-    std::string input_name = it.first;
-    int data_size = it.second;
-    int num_ele = data_size / sizeof(int16_t);
-    framework::DDim data_shape = framework::make_ddim({num_ele});
-    temp_tensor.Resize(data_shape);
-    data_tensors_.push_back(temp_tensor);
-    data_buffers_[input_name] = std::pair<void*, size_t>(
-        static_cast<void*>(temp_tensor.mutable_data<int16_t>(place)), num_ele);
-    i += 1;
-  }
-}
-
-TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data)
-    : batch_size_(0),
-      calib_running_(false),
-      data_is_set_(false),
-      done_(true),
-      calibration_table_(calib_data) {}
-
-void TRTInt8Calibrator::waitAndSetDone() {
-  std::unique_lock<std::mutex> lk(mut_);
-  while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk);
-  if (!done_) {
-    done_ = true;
-    cond_.notify_all();
-  }
-}
-
-// There might be more than one input for trt subgraph,
-// So, we use a map to store input information.
-bool TRTInt8Calibrator::setBatch(
-    const std::unordered_map<std::string, void*>& data) {
-  VLOG(3) << "set batch: " << engine_name_;
-  std::unique_lock<std::mutex> lk(mut_);
-  //  There is a producer and a consumer. The producer set the batch data and
-  //  the consumer get the batch data. The size of the data pool is one.
-  //  So, the producer has to wait for the consumer to finish processing before
-  //  they can set the data.
-  while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk);
-  // The done_ is set to true using waitAndSetDone, When all calibration data
-  // are processed.
-  if (done_) return false;
-
-  // Sets the batch.
-  for (const auto& it : data) {
-    auto dataptr = data_buffers_.find(it.first);
-    if (dataptr == data_buffers_.end()) {
-      LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first
-                 << "' does not match with the buffer names";
-    }
-    const auto& d = dataptr->second;
-    PADDLE_ENFORCE(
-        cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice),
-        "Fail to cudaMemcpy %s for %s", engine_name_, it.first);
-  }
-
-  data_is_set_ = true;
-  cond_.notify_all();
-  return true;
-}
-
-bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
-                                 int num_bindings) {
-  VLOG(4) << "get batch: " << engine_name_;
-  std::unique_lock<std::mutex> lk(mut_);
-  // The consumer has just finished processing a data.
-  // The producer can set the data again.
-  calib_running_ = false;
-  cond_.notify_all();
-
-  // As long as there is data in the pool, the consumer can get it.
-  while (!data_is_set_ && !done_) cond_.wait(lk);
-  if (done_) return false;
-
-  // Gets the batch
-  for (int i = 0; i < num_bindings; i++) {
-    auto it = data_buffers_.find(names[i]);
-    if (it == data_buffers_.end()) {
-      LOG(FATAL) << "Calibration engine asked for unknown tensor name '"
-                 << names[i] << "' at position " << i;
-    }
-    bindings[i] = it->second.first;
-  }
-
-  data_is_set_ = false;
-  calib_running_ = true;
-  VLOG(4) << "get batch done: " << engine_name_;
-  return true;
-}
-
-void TRTInt8Calibrator::setDone() {
-  std::unique_lock<std::mutex> lk(mut_);
-  done_ = true;
-  cond_.notify_all();
-}
-
-const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) {
-  if (calibration_table_.empty()) return nullptr;
-  length = calibration_table_.size();
-  return calibration_table_.data();
-}
-
-void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
-                                              std::size_t length) {
-  calibration_table_ = std::string((const char*)ptr, length);
-  VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr
-          << " length=" << length;
-}
-TRTInt8Calibrator::~TRTInt8Calibrator() {
-  VLOG(4) << "Destroying calibrator for " << engine_name_;
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
deleted file mode 100644
index 5815bc9a1464293e0a56f05e34183580eac96cea..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <NvInfer.h>
-#include <cuda_runtime_api.h>
-#include <atomic>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class TensorRTEngine;
-
-struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
- public:
-  TRTInt8Calibrator(const std::unordered_map<std::string, size_t>& buffers,
-                    int batch_size, std::string engine_name,
-                    const platform::Place place);
-
-  explicit TRTInt8Calibrator(const std::string& calibration_data);
-  ~TRTInt8Calibrator();
-
-  int getBatchSize() const override;
-
-  bool getBatch(void* bindings[], const char* names[],
-                int num_bindings) override;
-
-  bool setBatch(const std::unordered_map<std::string, void*>& data);
-  void setDone();
-  void waitAndSetDone();
-
-  const void* readCalibrationCache(std::size_t& length) override;
-  void writeCalibrationCache(const void* ptr, std::size_t length) override;
-  const std::string& getCalibrationTableAsString() {
-    return calibration_table_;
-  }
-
- private:
-  const int batch_size_;
-
-  bool calib_running_{true};
-  bool data_is_set_{false};
-  bool done_{false};
-
-  std::mutex mut_;
-  std::condition_variable cond_;
-
-  std::unordered_map<std::string, std::pair<void*, size_t>> data_buffers_;
-  std::vector<framework::Tensor> data_tensors_;
-
-  std::string engine_name_;
-  std::string calibration_table_;
-};
-
-class TRTCalibratorEngine {
- public:
-  TRTCalibratorEngine() {}
-  std::unique_ptr<TRTInt8Calibrator> calib_;
-  std::unique_ptr<std::thread> thr_;
-  std::unique_ptr<TensorRTEngine> engine_;
-};
-/*
- * Manager to control the TensorRT Int8 calibration creation and deltetion.
- */
-class TRTCalibratorEngineManager {
- public:
-  bool Has() const { return res_.size() > 0; }
-  bool Has(const std::string& name) const {
-    if (res_.count(name) == 0) return false;
-    return res_.at(name).get() != nullptr;
-  }
-
-  // Get Int8Calibrator via name
-  TRTCalibratorEngine* Get(const std::string& name) const {
-    return res_.at(name).get();
-  }
-
-  // Look up or create a calibrator.
-  TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) {
-    if (res_.count(engine_name) == 0) {
-      auto* p = new TRTCalibratorEngine;
-      res_[engine_name].reset(p);
-    }
-    return res_.at(engine_name).get();
-  }
-
-  // Create an Int8Calibrator
-  TRTCalibratorEngine* Create(const std::string& engine_name) {
-    auto* p = new TRTCalibratorEngine;
-    res_[engine_name].reset(p);
-    return p;
-  }
-
-  void DeleteALL() {
-    for (auto& item : res_) {
-      item.second.reset(nullptr);
-    }
-  }
-
- private:
-  std::unordered_map<std::string, std::unique_ptr<TRTCalibratorEngine>> res_;
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
deleted file mode 100644
index e064d01bc76facf1d9e4c357b86bcfed9f33a31a..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ /dev/null
@@ -1,286 +0,0 @@
-set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor benchmark)
-
-if(WITH_GPU AND TENSORRT_FOUND)
-    set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor)
-endif()
-
-function(download_data install_dir data_file)
-    if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file})
-    endif()
-endfunction()
-
-function(download_int8_data install_dir data_file)
-    if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file})
-    endif()
-endfunction()
-
-function(download_model_and_data install_dir model_name data_name)
-    download_data(${install_dir} ${model_name}) 
-    download_data(${install_dir} ${data_name})
-endfunction()
-
-function(inference_analysis_api_test target install_dir filename)
-    inference_analysis_test(${target} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
-        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
-endfunction()
-
-function(inference_analysis_api_int8_test_build TARGET_NAME filename)
-	inference_analysis_test_build(${TARGET_NAME} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark)
-endfunction()
-
-function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir data_path)
-	inference_analysis_test_run(${TARGET_NAME}
-	COMMAND ${test_binary}
-        ARGS --infer_model=${model_dir}/model
-             --infer_data=${data_path}
-             --warmup_batch_size=100
-             --batch_size=50
-             --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
-	     --iterations=2)
-endfunction()
-
-function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME test_binary model_dir data_path)
-	inference_analysis_test_run(${TARGET_NAME}
-	COMMAND ${test_binary}
-        ARGS --infer_model=${model_dir}/model
-             --infer_data=${data_path}
-             --warmup_batch_size=10
-             --batch_size=300
-             --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
-	     --iterations=1)
-endfunction()
-
-function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename)
-	inference_analysis_test_build(${TARGET_NAME} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS})
-endfunction()
-
-function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary model_dir disable_fc)
-    inference_analysis_test_run(${TARGET_NAME}
-	COMMAND ${test_binary}
-        ARGS --infer_model=${model_dir}/model
-             --disable_mkldnn_fc=${disable_fc}) 
-endfunction()
-
-function(inference_analysis_api_test_with_refer_result target install_dir filename)
-    inference_analysis_test(${target} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt
-             --refer_result=${install_dir}/result.txt)
-endfunction()
-
-if(NOT APPLE AND WITH_MKLML)
-    # RNN1
-    set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
-    download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
-    
-    # seq_pool1
-    set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
-    download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
-else()
-    # TODO: fix this test on MACOS and OPENBLAS, the reason is that
-    # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
-    message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_rnn1")
-    message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1")
-endif()
-
-
-# RNN2
-set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
-download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
-
-# TODO(luotao, Superjom) Disable DAM test, temporarily fix
-# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914.
-# After inference framework refactor, will reopen it.
-# normal DAM
-set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
-download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator)
-
-# small DAM
-set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
-download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1)
-
-#save model 
-inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} analyzer_save_model_tester.cc)
-
-# chinese_ner
-set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
-download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc)
-
-# lac
-set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
-download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)
-
-# Pyramid DNN
-set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn")
-download_model_and_data(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc)
-
-# text_classification
-set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
-download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)
-
-# seq_conv1
-set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1")
-download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc)
-
-# transformer, the dataset only works on batch_size=8 now
-set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
-download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc 
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 
-       --paddle_num_threads=${CPU_NUM_THREADS_ON_CI})
-
-# ocr
-set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
-if (NOT EXISTS ${OCR_INSTALL_DIR})
-    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
-endif()
-inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
-
-# mobilenet with transpose op
-set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
-if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
-    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
-endif()
-inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
-
-### Image classification tests with fake data
-set(IMG_CLASS_TEST_APP "test_analyzer_image_classification")
-set(IMG_CLASS_TEST_APP_SRC "analyzer_image_classification_tester.cc")
-
-# build test binary to be used in subsequent tests
-inference_analysis_api_test_with_fake_data_build(${IMG_CLASS_TEST_APP} ${IMG_CLASS_TEST_APP_SRC})
-
-# googlenet
-set(GOOGLENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/googlenet")
-download_data(${GOOGLENET_MODEL_DIR} "googlenet.tar.gz")
-inference_analysis_api_test_with_fake_data_run(test_analyzer_googlenet ${IMG_CLASS_TEST_APP}
-	${GOOGLENET_MODEL_DIR} false)
-
-# resnet50
-set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
-download_data(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz")
-inference_analysis_api_test_with_fake_data_run(test_analyzer_resnet50 ${IMG_CLASS_TEST_APP}
-	${RESNET50_MODEL_DIR} true)
-
-# mobilenet with depthwise_conv op
-set(MOBILENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv")
-download_data(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz")
-inference_analysis_api_test_with_fake_data_run(test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP}
-	${MOBILENET_MODEL_DIR} false)
-
-### INT8 tests
-if(WITH_MKLDNN)
-
-  set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
-
-  ### Image classification tests
-  set(IMAGENET_DATA_PATH "${INT8_DATA_DIR}/data.bin")
-  set(INT8_IMG_CLASS_TEST_APP "test_analyzer_int8_image_classification")
-  set(INT8_IMG_CLASS_TEST_APP_SRC "analyzer_int8_image_classification_tester.cc")
-
-  # download dataset if necessary
-  download_int8_data(${INT8_DATA_DIR} "imagenet_val_100_tail.tar.gz")
-
-  # build test binary to be used in subsequent tests
-  inference_analysis_api_int8_test_build(${INT8_IMG_CLASS_TEST_APP} ${INT8_IMG_CLASS_TEST_APP_SRC})
-
-  # resnet50 int8
-  set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
-  download_int8_data(${INT8_RESNET50_MODEL_DIR} "resnet50_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_resnet50 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH})
-
-  # mobilenetv1 int8
-  set(INT8_MOBILENETV1_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv1")
-  download_int8_data(${INT8_MOBILENETV1_MODEL_DIR} "mobilenetv1_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv1 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH})
- 
-  # mobilenetv2 int8
-  set(INT8_MOBILENETV2_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv2")
-  download_int8_data(${INT8_MOBILENETV2_MODEL_DIR} "mobilenet_v2_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH})
- 
-  # resnet101 int8
-  set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101")
-  download_int8_data(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH})
- 
-  # vgg16 int8
-  set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16")
-  download_int8_data(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH})
- 
-  # vgg19 int8
-  set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19")
-  download_int8_data(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH})
-
-  # googlenet int8
-  set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet")
-  download_int8_data(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH})
-
-  ### Object detection models
-  set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin")
-  set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection")
-  set(INT8_OBJ_DETECT_TEST_APP_SRC "analyzer_int8_object_detection_tester.cc")
-
-  # download dataset if necessary
-  download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz")
-
-  # download small demo set of pascalvoc for testing local userdata preprocessing
-  download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz")
-
-  # build test binary to be used in subsequent tests
-  inference_analysis_api_int8_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC})
-
-  # mobilenet-ssd int8
-  set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd")
-  download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" )
-  inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
-
-endif()
-
-# bert, max_len=20, embedding_dim=128
-set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128")
-download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc)
-
-if(WITH_GPU AND TENSORRT_FOUND)
-    set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models")
-    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
-        inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz")
-    endif()
-    inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(trt_resnet50_test SRCS trt_resnet50_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(trt_resnext_test SRCS trt_resnext_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(trt_fc_prelu_test SRCS trt_fc_prelu_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(trt_cascade_rcnn_test SRCS trt_cascade_rcnn_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-endif()
diff --git a/paddle/fluid/inference/tests/api/anakin_mlu_tester.cc b/paddle/fluid/inference/tests/api/anakin_mlu_tester.cc
deleted file mode 100644
index 8094c744fef50361c50eabe53799658985113f09..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/anakin_mlu_tester.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
-DEFINE_string(model, "", "Directory of the inference model.");
-
-namespace paddle {
-
-contrib::AnakinConfig Config() {
-  // Determine the use of memory here.
-  std::map<std::string, std::vector<int>> init_inputs_shape;
-  init_inputs_shape["input_0"] = std::vector<int>({1, 3, 112, 112});
-
-  contrib::AnakinConfig config;
-  config.target_type = contrib::AnakinConfig::MLU;
-  config.model_file = FLAGS_model;
-  config.init_inputs_shape = init_inputs_shape;
-
-  // Determine the device execution context.
-  config.device_id = 0;
-  config.data_stream_id = 0;
-  config.compute_stream_id = 0;
-
-  // Set re_allocable and op_fuse TRUE.
-  config.re_allocable = true;
-  config.op_fuse = true;
-
-  return config;
-}
-
-void single_test() {
-  // 1. Defining basic data structures.
-  auto config = paddle::Config();
-  auto predictor =
-      paddle::CreatePaddlePredictor<paddle::contrib::AnakinConfig,
-                                    paddle::PaddleEngineKind::kAnakin>(config);
-
-  // 2. Define the data structure of the predictor inputs and outputs.
-  std::vector<paddle::PaddleTensor> input_tensors;
-  std::vector<paddle::PaddleTensor> output_tensors;
-
-  // 3. Define and fill the inputs tensor.
-  int num = 1;
-  int channel = 3;
-  int height = 112;
-  int width = 112;
-  std::vector<float> input(num * channel * height * width, 1);
-  std::vector<std::vector<float>> inputs({input});
-  const std::vector<std::string> input_names{"input_0"};
-  for (auto& name : input_names) {
-    paddle::PaddleTensor tensor;
-    tensor.name = name;
-    tensor.dtype = PaddleDType::FLOAT32;
-    input_tensors.push_back(tensor);
-  }
-  for (size_t j = 0; j < input_tensors.size(); j++) {
-    input_tensors[j].data =
-        paddle::PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float));
-    // The shape of each execution can be changed.
-    input_tensors[j].shape = std::vector<int>({num, channel, height, width});
-  }
-
-  // 4. Set the output placeholder of predictor.
-  PaddleTensor predict_out, score_out;
-  predict_out.name = "landmark_predict_out";
-  score_out.name = "landmark_score_out";
-  output_tensors.push_back(predict_out);
-  output_tensors.push_back(score_out);
-
-  // 5. Execution predict.
-  predictor->Run(input_tensors, &output_tensors);
-
-  // 6. Take out the output data.
-  for (auto out : output_tensors) {
-    float* data_o = static_cast<float*>(out.data.data());
-    LOG(INFO) << out.name << " size = " << out.data.length() / sizeof(float);
-  }
-}
-}  // namespace paddle
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  paddle::single_test();
-  return 0;
-}
diff --git a/paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc
deleted file mode 100644
index 27abaa530b36f4a95f05ea16e8068be052d9711c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc
+++ /dev/null
@@ -1,261 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include <cmath>
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
-#define BUFFER_SIZE (10000)
-#define COMPARE_OUTPUTS (1)
-#define PRINT_INPUTS (0)
-
-DEFINE_string(model, "", "Directory of the inference model.");
-DEFINE_string(datapath, "", "Path of the dataset.");
-DEFINE_string(truthpath, "", "Path of the dataset.");
-DEFINE_int32(batch_size, 1, "Batch size per execution.");
-DEFINE_int32(repeats, 1, "Number of iterations.");
-DEFINE_int32(
-    start_line, 0,
-    "The starting line of the text file read (this line will be read).");
-DEFINE_int32(end_line, 1000000,
-             "The ending line of the text file read (this line will be read).");
-DEFINE_int32(init_batch_size, 40,
-             "Max batch size for Anakin memory allocation.");
-DEFINE_int32(threads_num, 2, "Threads num for Anakin.");
-
-class Data {
- public:
-  Data(std::string file_name, size_t batch_size, size_t start = 0,
-       size_t end = 1000000)
-      : _batch_size(batch_size), _total_length(0), _inputs_size(6) {
-    _file.open(file_name);
-    _file.seekg(_file.end);
-    _total_length = _file.tellg();
-    _file.seekg(_file.beg);
-    read_file_to_vec(start, end);
-    reset_current_line();
-  }
-  void reset_current_line();
-  const std::vector<std::string>& get_lines();
-  void read_file_to_vec(const size_t start, const size_t end);
-  int get_next_batches(std::vector<std::vector<float>>* inputs,
-                       std::vector<std::vector<size_t>>* seq_offsets);
-
- private:
-  std::fstream _file;
-  int _batch_size;
-  size_t _total_length;
-  size_t _inputs_size;
-  std::vector<std::string> _lines;
-  size_t _current_line;
-};
-
-void Data::read_file_to_vec(const size_t start, const size_t end) {
-  std::string line;
-  size_t count = 0;
-  _lines.clear();
-  while (std::getline(_file, line)) {
-    if (count >= start && count <= end) {
-      _lines.push_back(line);
-    }
-    count++;
-  }
-}
-
-const std::vector<std::string>& Data::get_lines() { return _lines; }
-
-void Data::reset_current_line() { _current_line = 0; }
-
-int Data::get_next_batches(std::vector<std::vector<float>>* data,
-                           std::vector<std::vector<size_t>>* offsets) {
-  data->clear();
-  offsets->clear();
-  data->resize(_inputs_size);
-  offsets->resize(_inputs_size);
-  for (auto& offset : *offsets) {
-    offset.push_back(0);
-  }
-
-  int seq_num = -1;
-  int pre_query_index = -1;
-  while (_current_line < _lines.size()) {
-    int cur_query_index = -1;
-    std::vector<std::string> line;
-    paddle::inference::split(_lines[_current_line], ';', &line);
-    for (size_t i = 0; i < line.size(); i++) {
-      std::vector<float> float_v;
-      paddle::inference::split_to_float(line[i], ' ', &float_v);
-      if (i == 0) {
-        cur_query_index = float_v[0];
-        if (pre_query_index != -1 && cur_query_index != pre_query_index) {
-          return seq_num;
-        }
-        seq_num++;
-        _current_line++;
-      } else {
-        if (float_v.size() == 0) {
-          float_v.push_back(-1);
-        }
-        (*data)[i - 1].insert((*data)[i - 1].end(), float_v.begin(),
-                              float_v.end());
-        (*offsets)[i - 1].push_back((*offsets)[i - 1][seq_num] +
-                                    float_v.size());
-      }
-    }
-    if (seq_num + 1 >= _batch_size) {
-      return seq_num;
-    } else {
-      pre_query_index = cur_query_index;
-    }
-  }
-  return seq_num;
-}
-
-namespace paddle {
-
-contrib::AnakinConfig GetConfig() {
-  contrib::AnakinConfig config;
-
-  std::map<std::string, std::vector<int>> init_inputs_shape;
-  init_inputs_shape["q_basic"] = std::vector<int>({1000, 1, 1, 1});
-  init_inputs_shape["q_bigram0"] = std::vector<int>({1000, 1, 1, 1});
-  init_inputs_shape["pt_basic"] = std::vector<int>({2000, 1, 1, 1});
-  init_inputs_shape["pa_basic"] = std::vector<int>({4000, 1, 1, 1});
-  init_inputs_shape["pa_bigram0"] = std::vector<int>({4000, 1, 1, 1});
-  init_inputs_shape["pt_bigram0"] = std::vector<int>({2000, 1, 1, 1});
-
-  // using AnakinConfig::X86 if you need to use cpu to do inference
-  config.target_type = contrib::AnakinConfig::NVGPU;
-  config.model_file = FLAGS_model;
-  config.device_id = 0;
-  config.init_batch_size = FLAGS_init_batch_size;
-  config.init_inputs_shape = init_inputs_shape;
-  config.re_allocable = false;
-  return config;
-}
-
-void single_test(PaddlePredictor* predictor_master) {
-  auto predictor = predictor_master->Clone();
-
-  Data data(FLAGS_datapath, FLAGS_batch_size, FLAGS_start_line, FLAGS_end_line);
-
-  std::vector<std::vector<float>> inputs;
-  std::vector<std::vector<size_t>> seq_offsets;
-  std::vector<float> compare_outputs;
-
-  const std::vector<std::string> input_names{"q_basic",  "q_bigram0",
-                                             "pt_basic", "pt_bigram0",
-                                             "pa_basic", "pa_bigram0"};
-  std::vector<PaddleTensor> input_tensors;
-  std::vector<PaddleTensor> output_tensors;
-  for (auto& name : input_names) {
-    PaddleTensor tensor;
-    tensor.name = name;
-    tensor.dtype = PaddleDType::FLOAT32;
-    input_tensors.push_back(tensor);
-  }
-
-  PaddleTensor tensor_out;
-  tensor_out.name = "save_infer_model/scale_0";
-  tensor_out.shape = std::vector<int>({});
-  tensor_out.data = PaddleBuf();
-  tensor_out.dtype = PaddleDType::FLOAT32;
-  output_tensors.push_back(tensor_out);
-
-  inference::Timer timer;
-  for (int i = 0; i < FLAGS_repeats; i++) {
-    data.reset_current_line();
-    size_t count = 0;
-    float time_sum = 0;
-    while (data.get_next_batches(&inputs, &seq_offsets) >= 0) {
-#if PRINT_INPUTS
-      for (size_t i = 0; i < inputs.size(); i++) {
-        LOG(INFO) << "data " << i;
-        for (size_t j = 0; j < inputs[i].size(); j++) {
-          LOG(INFO) << j << ": " << inputs[i][j];
-        }
-        for (auto j : seq_offsets[i]) {
-          LOG(INFO) << "offsets: " << i << ": " << j;
-        }
-      }
-#endif
-      for (size_t j = 0; j < input_tensors.size(); j++) {
-        input_tensors[j].data =
-            PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float));
-        input_tensors[j].lod =
-            std::vector<std::vector<size_t>>({seq_offsets[j]});
-        input_tensors[j].shape =
-            std::vector<int>({static_cast<int>(inputs[j].size()), 1, 1, 1});
-      }
-      timer.tic();
-      predictor->Run(input_tensors, &output_tensors);
-      float time = timer.toc();
-#if COMPARE_OUTPUTS
-      float* data_o = static_cast<float*>(output_tensors[0].data.data());
-      LOG(INFO) << "outputs[0].data.size() = "
-                << output_tensors[0].data.length() / sizeof(float);
-      size_t sum = 1;
-      for_each(output_tensors[0].shape.begin(), output_tensors[0].shape.end(),
-               [&](int n) { sum *= n; });
-      for (size_t j = 0; j < sum; ++j) {
-        LOG(INFO) << "output[" << j << "]: " << data_o[j];
-        compare_outputs.push_back(data_o[j]);
-      }
-#endif
-      LOG(INFO) << "Single Time: " << time;
-      count++;
-      if (count > 10) {
-        time_sum += timer.toc();
-      }
-    }
-    inference::PrintTime(FLAGS_batch_size, FLAGS_repeats, 1, 0,
-                         time_sum / (count - 10));
-#if COMPARE_OUTPUTS
-    Data data(FLAGS_truthpath, 1);
-    const std::vector<std::string> truth_vals = data.get_lines();
-    for (size_t j = 0; j < truth_vals.size(); j++) {
-      float truth = std::atof(truth_vals[j].c_str());
-      float compa = compare_outputs[j];
-      float diff = std::abs(truth - compa);
-      LOG(INFO) << "[DIFF " << j << " ] " << diff;
-      if (diff > 0.0001) {
-        LOG(FATAL) << "The result is wrong!";
-      }
-    }
-    LOG(INFO) << "The result is correct!";
-#endif
-  }
-}
-}  // namespace paddle
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  std::vector<std::thread> threads;
-
-  auto config = paddle::GetConfig();
-  config.data_stream_id = 0;
-  config.compute_stream_id = 0;
-  std::unique_ptr<paddle::PaddlePredictor> predictor_master =
-      paddle::CreatePaddlePredictor<paddle::contrib::AnakinConfig,
-                                    paddle::PaddleEngineKind::kAnakin>(config);
-
-  for (int i = 0; i < FLAGS_threads_num; i++) {
-    threads.push_back(std::thread(paddle::single_test, predictor_master.get()));
-  }
-  for (auto& t : threads) {
-    t.join();
-  }
-  return 0;
-}
diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
deleted file mode 100644
index f679e1221821a3ef32989127e01e6af67240fab8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
+++ /dev/null
@@ -1,272 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/transfer_scope_cache.h"
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-
-using paddle::PaddleTensor;
-
-template <typename T>
-void GetValueFromStream(std::stringstream *ss, T *t) {
-  (*ss) >> (*t);
-}
-
-template <>
-void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
-  *t = ss->str();
-}
-
-// Split string to vector
-template <typename T>
-void Split(const std::string &line, char sep, std::vector<T> *v) {
-  std::stringstream ss;
-  T t;
-  for (auto c : line) {
-    if (c != sep) {
-      ss << c;
-    } else {
-      GetValueFromStream<T>(&ss, &t);
-      v->push_back(std::move(t));
-      ss.str({});
-      ss.clear();
-    }
-  }
-
-  if (!ss.str().empty()) {
-    GetValueFromStream<T>(&ss, &t);
-    v->push_back(std::move(t));
-    ss.str({});
-    ss.clear();
-  }
-}
-
-// Parse tensor from string
-template <typename T>
-bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
-  std::vector<std::string> data;
-  Split(field, ':', &data);
-  if (data.size() < 2) return false;
-
-  std::string shape_str = data[0];
-
-  std::vector<int> shape;
-  Split(shape_str, ' ', &shape);
-
-  std::string mat_str = data[1];
-
-  std::vector<T> mat;
-  Split(mat_str, ' ', &mat);
-
-  tensor->shape = shape;
-  auto size =
-      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
-      sizeof(T);
-  tensor->data.Resize(size);
-  std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
-  tensor->dtype = GetPaddleDType<T>();
-
-  return true;
-}
-
-// Parse input tensors from string
-bool ParseLine(const std::string &line,
-               std::vector<paddle::PaddleTensor> *tensors) {
-  std::vector<std::string> fields;
-  Split(line, ';', &fields);
-
-  if (fields.size() < 5) return false;
-
-  tensors->clear();
-  tensors->reserve(5);
-
-  int i = 0;
-  // src_id
-  paddle::PaddleTensor src_id;
-  ParseTensor<int64_t>(fields[i++], &src_id);
-  tensors->push_back(src_id);
-
-  // pos_id
-  paddle::PaddleTensor pos_id;
-  ParseTensor<int64_t>(fields[i++], &pos_id);
-  tensors->push_back(pos_id);
-
-  // segment_id
-  paddle::PaddleTensor segment_id;
-  ParseTensor<int64_t>(fields[i++], &segment_id);
-  tensors->push_back(segment_id);
-
-  // self_attention_bias
-  paddle::PaddleTensor self_attention_bias;
-  ParseTensor<float>(fields[i++], &self_attention_bias);
-  tensors->push_back(self_attention_bias);
-
-  // next_segment_index
-  paddle::PaddleTensor next_segment_index;
-  ParseTensor<int64_t>(fields[i++], &next_segment_index);
-  tensors->push_back(next_segment_index);
-
-  return true;
-}
-
-bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
-  if (FLAGS_infer_data.empty()) {
-    LOG(ERROR) << "please set input data path";
-    return false;
-  }
-
-  std::ifstream fin(FLAGS_infer_data);
-  std::string line;
-  int sample = 0;
-
-  // The unit-test dataset only have 10 samples, each sample have 5 feeds.
-  while (std::getline(fin, line)) {
-    std::vector<paddle::PaddleTensor> feed_data;
-    ParseLine(line, &feed_data);
-    inputs->push_back(std::move(feed_data));
-    sample++;
-    if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break;
-  }
-  LOG(INFO) << "number of samples: " << sample;
-
-  return true;
-}
-
-void SetConfig(AnalysisConfig *config) { config->SetModel(FLAGS_infer_model); }
-
-void profile(bool use_mkldnn = false, bool use_ngraph = false) {
-  AnalysisConfig config;
-  SetConfig(&config);
-
-  if (use_mkldnn) {
-    config.EnableMKLDNN();
-    config.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  if (use_ngraph) {
-    config.EnableNgraph();
-  }
-
-  std::vector<std::vector<PaddleTensor>> outputs;
-  std::vector<std::vector<PaddleTensor>> inputs;
-  LoadInputData(&inputs);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
-                 inputs, &outputs, FLAGS_num_threads);
-}
-
-TEST(Analyzer_bert, profile) { profile(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_bert, profile_mkldnn) { profile(true, false); }
-#endif
-
-#ifdef PADDLE_WITH_NGRAPH
-TEST(Analyzer_bert, profile_ngraph) { profile(false, true); }
-#endif
-
-// Check the fuse status
-TEST(Analyzer_bert, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-  LOG(INFO) << "num_ops: " << num_ops;
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-void compare(bool use_mkldnn = false, bool use_ngraph = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  if (use_ngraph) {
-    cfg.EnableNgraph();
-  }
-
-  std::vector<std::vector<PaddleTensor>> inputs;
-  LoadInputData(&inputs);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), inputs);
-}
-
-TEST(Analyzer_bert, compare) { compare(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_bert, compare_mkldnn) {
-  compare(true, false /* use_mkldnn, no use_ngraph */);
-}
-#endif
-
-#ifdef PADDLE_WITH_NGRAPH
-TEST(Analyzer_bert, compare_ngraph) {
-  compare(false, true /* no use_mkldnn, use_ngraph */);
-}
-#endif
-
-// Compare Deterministic result
-TEST(Analyzer_bert, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> inputs;
-  LoadInputData(&inputs);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       inputs);
-}
-
-TEST(Analyzer_bert, transfer_scope_cache) {
-  AnalysisConfig config;
-  SetConfig(&config);
-
-  std::vector<PaddleTensor> input, output;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-
-  int threads_num = 10;
-  std::vector<std::thread> threads;
-  std::unordered_set<std::unordered_set<paddle::framework::Scope *> *>
-      global_transfer_scope_cache;
-  std::unordered_set<std::unordered_map<size_t, paddle::framework::Scope *> *>
-      global_transfer_data_cache;
-
-  std::ifstream fin(FLAGS_infer_data);
-  std::string line;
-
-  for (int i = 0; i < threads_num; i++) {
-    threads.emplace_back([&, i]() {
-      std::getline(fin, line);
-      ParseLine(line, &input);
-      predictor->Run(input, &output, FLAGS_batch_size);
-      global_transfer_scope_cache.insert(
-          &paddle::framework::global_transfer_scope_cache());
-      global_transfer_data_cache.insert(
-          &paddle::framework::global_transfer_data_cache());
-    });
-    threads[0].join();
-    threads.clear();
-    std::vector<PaddleTensor>().swap(input);
-  }
-  // Since paddle::framework::global_transfer_scope_cache() and
-  // paddle::framework::global_transfer_data_cache() are thread_local,
-  // their pointer should be different among different thread id.
-  PADDLE_ENFORCE(global_transfer_scope_cache.size(), threads_num);
-  PADDLE_ENFORCE(global_transfer_data_cache.size(), threads_num);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
deleted file mode 100644
index 78c87b6db508c4eb49f74d3f87bdb83afc470208..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ /dev/null
@@ -1,333 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-DEFINE_int32(max_turn_num, 9,
-             "The max turn number: 1 for the small and 9 for the normal.");
-
-namespace paddle {
-namespace inference {
-
-constexpr int32_t kMaxTurnLen = 50;
-
-static std::vector<float> result_data;
-
-struct DataRecord {
-  std::vector<std::vector<int64_t>> *turns;
-  std::vector<std::vector<float>> *turns_mask;
-  std::vector<std::vector<int64_t>> response;     // response data : 1
-  std::vector<std::vector<float>> response_mask;  // response mask data : 1
-  size_t batch_iter{0};
-  size_t batch_size{1};
-  size_t num_samples;  // total number of samples
-
-  DataRecord() {
-    turns = new std::vector<std::vector<
-        int64_t>>[FLAGS_max_turn_num];  // turns data : FLAGS_max_turn_num
-    turns_mask = new std::vector<std::vector<
-        float>>[FLAGS_max_turn_num];  // turns mask data : FLAGS_max_turn_num
-  }
-
-  explicit DataRecord(const std::string &path, int batch_size = 1)
-      : DataRecord() {
-    this->batch_size = batch_size;
-    Load(path);
-  }
-
-  ~DataRecord() {
-    delete[] turns;
-    delete[] turns_mask;
-  }
-
-  DataRecord NextBatch() {
-    DataRecord data;
-    size_t batch_end = batch_iter + batch_size;
-    // NOTE skip the final batch, if no enough data is provided.
-    if (batch_end <= response.size()) {
-      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
-        data.turns[i].assign(turns[i].begin() + batch_iter,
-                             turns[i].begin() + batch_end);
-      }
-      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
-        data.turns_mask[i].assign(turns_mask[i].begin() + batch_iter,
-                                  turns_mask[i].begin() + batch_end);
-      }
-      data.response.assign(response.begin() + batch_iter,
-                           response.begin() + batch_end);
-      data.response_mask.assign(response_mask.begin() + batch_iter,
-                                response_mask.begin() + batch_end);
-      CHECK(!data.response.empty());
-      CHECK(!data.response_mask.empty());
-      CHECK_EQ(data.response.size(), data.response_mask.size());
-    }
-    batch_iter += batch_size;
-    return data;
-  }
-
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    size_t num_lines = 0;
-    result_data.clear();
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, ',', &data);
-      CHECK_EQ(data.size(), (size_t)(2 * FLAGS_max_turn_num + 3));
-      // load turn data
-      std::vector<int64_t> turns_tmp[FLAGS_max_turn_num];
-      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
-        split_to_int64(data[i], ' ', &turns_tmp[i]);
-        turns[i].push_back(std::move(turns_tmp[i]));
-      }
-      // load turn_mask data
-      std::vector<float> turns_mask_tmp[FLAGS_max_turn_num];
-      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
-        split_to_float(data[FLAGS_max_turn_num + i], ' ', &turns_mask_tmp[i]);
-        turns_mask[i].push_back(std::move(turns_mask_tmp[i]));
-      }
-      // load response data
-      std::vector<int64_t> response_tmp;
-      split_to_int64(data[2 * FLAGS_max_turn_num], ' ', &response_tmp);
-      response.push_back(std::move(response_tmp));
-      // load response_mask data
-      std::vector<float> response_mask_tmp;
-      split_to_float(data[2 * FLAGS_max_turn_num + 1], ' ', &response_mask_tmp);
-      response_mask.push_back(std::move(response_mask_tmp));
-      // load result data
-      float result_tmp;
-      result_tmp = std::stof(data[2 * FLAGS_max_turn_num + 2]);
-      result_data.push_back(result_tmp);
-    }
-    num_samples = num_lines;
-  }
-};
-
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                   int batch_size) {
-  PaddleTensor turns_tensor[FLAGS_max_turn_num];
-  PaddleTensor turns_mask_tensor[FLAGS_max_turn_num];
-  PaddleTensor response_tensor;
-  PaddleTensor response_mask_tensor;
-  std::string turn_pre = "turn_";
-  std::string turn_mask_pre = "turn_mask_";
-
-  auto one_batch = data->NextBatch();
-  PADDLE_ENFORCE(!one_batch.response.empty());
-  int size = one_batch.response[0].size();
-  CHECK_EQ(size, kMaxTurnLen);
-  // turn tensor assignment
-  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
-    turns_tensor[i].name = turn_pre + std::to_string(i);
-    turns_tensor[i].shape.assign({batch_size, size, 1});
-    turns_tensor[i].dtype = PaddleDType::INT64;
-    TensorAssignData<int64_t>(&turns_tensor[i], one_batch.turns[i]);
-  }
-  // turn mask tensor assignment
-  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
-    turns_mask_tensor[i].name = turn_mask_pre + std::to_string(i);
-    turns_mask_tensor[i].shape.assign({batch_size, size, 1});
-    turns_mask_tensor[i].dtype = PaddleDType::FLOAT32;
-    TensorAssignData<float>(&turns_mask_tensor[i], one_batch.turns_mask[i]);
-  }
-  // response tensor assignment
-  response_tensor.name = "response";
-  response_tensor.shape.assign({batch_size, size, 1});
-  response_tensor.dtype = PaddleDType::INT64;
-  TensorAssignData<int64_t>(&response_tensor, one_batch.response);
-  // response mask tensor assignment
-  response_mask_tensor.name = "response_mask";
-  response_mask_tensor.shape.assign({batch_size, size, 1});
-  response_mask_tensor.dtype = PaddleDType::FLOAT32;
-  TensorAssignData<float>(&response_mask_tensor, one_batch.response_mask);
-
-  // Set inputs.
-  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
-    input_slots->push_back(std::move(turns_tensor[i]));
-  }
-  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
-    input_slots->push_back(std::move(turns_mask_tensor[i]));
-  }
-  input_slots->push_back(std::move(response_tensor));
-  input_slots->push_back(std::move(response_mask_tensor));
-}
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim(true);
-}
-
-void SetOptimConfig(AnalysisConfig *cfg) {
-  std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model";
-  cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params");
-  cfg->SwitchIrOptim(true);
-  cfg->SwitchSpecifyInputNames();
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  std::vector<PaddleTensor> input_slots;
-  int test_batch_num =
-      FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
-  LOG(INFO) << "The number of samples to be test: "
-            << test_batch_num * FLAGS_batch_size;
-  for (int bid = 0; bid < test_batch_num; ++bid) {
-    input_slots.clear();
-    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-    (*inputs).emplace_back(input_slots);
-  }
-}
-
-// Easy for profiling independently.
-void profile(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    // Enable all the mkldnn supported ops except conv3d in dam
-    std::unordered_set<std::string> op_list = {"softmax", "elementwise_add",
-                                               "relu", "fc"};
-    cfg.SetMKLDNNOp(op_list);
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  std::vector<std::vector<PaddleTensor>> outputs;
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
-    auto output = outputs.back();
-    PADDLE_ENFORCE_GT(output.size(), 0);
-    size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(output[0].data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(result[i], result_data[i], 1e-3);
-    }
-  }
-}
-
-TEST(Analyzer_dam, profile) { profile(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); }
-#endif
-
-// Check the fuse status
-TEST(Analyzer_dam, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-void compare(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    // Enable all the mkldnn supported ops except conv3d in dam
-    std::unordered_set<std::string> op_list = {"softmax", "elementwise_add",
-                                               "relu"};
-    cfg.SetMKLDNNOp(op_list);
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-TEST(Analyzer_dam, compare_with_dynamic_memory_optim) {
-  // The small dam will core in CI, but works in local.
-  if (FLAGS_max_turn_num == 9) {
-    AnalysisConfig cfg, cfg1;
-    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-
-    std::vector<std::vector<PaddleTensor>> input_slots_all;
-    SetInput(&input_slots_all);
-    // Run the first time to force to update memory cache
-    SetConfig(&cfg);
-    cfg.EnableMemoryOptim();
-
-    CompareNativeAndAnalysis(
-        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-        input_slots_all);
-  }
-}
-
-TEST(Analyzer_dam, compare) { compare(); }
-
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
-#endif
-
-// Compare Deterministic result
-TEST(Analyzer_dam, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-// Save optim model
-TEST(Analyzer_dam, save_optim_model) {
-  AnalysisConfig cfg;
-  std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model";
-  mkdir(optimModelPath.c_str(), 0777);
-  SetConfig(&cfg);
-  SaveOptimModel(&cfg, optimModelPath);
-}
-
-void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config,
-                         const PaddlePredictor::Config *optim_config,
-                         const std::vector<std::vector<PaddleTensor>> &inputs) {
-  PrintConfig(orig_config, true);
-  PrintConfig(optim_config, true);
-  std::vector<std::vector<PaddleTensor>> orig_outputs, optim_outputs;
-  TestOneThreadPrediction(orig_config, inputs, &orig_outputs, false);
-  TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false);
-  CompareResult(orig_outputs.back(), optim_outputs.back());
-}
-
-TEST(Analyzer_dam, compare_optim_orig) {
-  AnalysisConfig orig_cfg;
-  AnalysisConfig optim_cfg;
-  SetConfig(&orig_cfg);
-  SetOptimConfig(&optim_cfg);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareOptimAndOrig(
-      reinterpret_cast<const PaddlePredictor::Config *>(&orig_cfg),
-      reinterpret_cast<const PaddlePredictor::Config *>(&optim_cfg),
-      input_slots_all);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
deleted file mode 100644
index 07934f96dc55ef1e80b54472c74975ff62b6add3..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include <iostream>
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-DEFINE_bool(disable_mkldnn_fc, false, "Disable usage of MKL-DNN's FC op");
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
-  cfg->DisableGpu();
-  cfg->SwitchIrOptim();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  SetFakeImageInput(inputs, FLAGS_infer_model);
-}
-
-void SetOptimConfig(AnalysisConfig *cfg) {
-  std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model";
-  cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params");
-  cfg->DisableGpu();
-  cfg->SwitchIrOptim();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-}
-
-// Easy for profiling independently.
-void profile(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    if (!FLAGS_disable_mkldnn_fc)
-      cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-}
-
-TEST(Analyzer_resnet50, profile) { profile(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); }
-#endif
-
-// Check the fuse status
-TEST(Analyzer_resnet50, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-  LOG(INFO) << "num_ops: " << num_ops;
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-void compare(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    if (!FLAGS_disable_mkldnn_fc)
-      cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-TEST(Analyzer_resnet50, compare) { compare(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); }
-#endif
-
-// Compare Deterministic result
-TEST(Analyzer_resnet50, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-// Save optim model
-TEST(Analyzer_resnet50, save_optim_model) {
-  AnalysisConfig cfg;
-  std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model";
-  mkdir(optimModelPath.c_str(), 0777);
-  SetConfig(&cfg);
-  SaveOptimModel(&cfg, optimModelPath);
-}
-
-void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config,
-                         const PaddlePredictor::Config *optim_config,
-                         const std::vector<std::vector<PaddleTensor>> &inputs) {
-  PrintConfig(orig_config, true);
-  PrintConfig(optim_config, true);
-  std::vector<std::vector<PaddleTensor>> orig_outputs, optim_outputs;
-  TestOneThreadPrediction(orig_config, inputs, &orig_outputs, false);
-  TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false);
-  CompareResult(orig_outputs.back(), optim_outputs.back());
-}
-
-TEST(Analyzer_resnet50, compare_optim_orig) {
-  AnalysisConfig orig_cfg;
-  AnalysisConfig optim_cfg;
-  SetConfig(&orig_cfg);
-  SetOptimConfig(&optim_cfg);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareOptimAndOrig(
-      reinterpret_cast<const PaddlePredictor::Config *>(&orig_cfg),
-      reinterpret_cast<const PaddlePredictor::Config *>(&optim_cfg),
-      input_slots_all);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
deleted file mode 100644
index 3e4a8f3ff38c65fc6ed17df1c0bbffdba56eeeba..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include <iostream>
-#include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model);
-  cfg->DisableGpu();
-  cfg->SwitchIrOptim();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-  cfg->EnableMKLDNN();
-}
-
-template <typename T>
-class TensorReader {
- public:
-  TensorReader(std::ifstream &file, size_t beginning_offset,
-               std::vector<int> shape, std::string name)
-      : file_(file), position(beginning_offset), shape_(shape), name_(name) {
-    numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
-                            std::multiplies<size_t>());
-  }
-
-  PaddleTensor NextBatch() {
-    PaddleTensor tensor;
-    tensor.name = name_;
-    tensor.shape = shape_;
-    tensor.dtype = GetPaddleDType<T>();
-    tensor.data.Resize(numel * sizeof(T));
-
-    file_.seekg(position);
-    file_.read(static_cast<char *>(tensor.data.data()), numel * sizeof(T));
-    position = file_.tellg();
-
-    if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
-    if (file_.fail())
-      throw std::runtime_error(name_ + ": failed reading file.");
-
-    return tensor;
-  }
-
- protected:
-  std::ifstream &file_;
-  size_t position;
-  std::vector<int> shape_;
-  std::string name_;
-  size_t numel;
-};
-
-std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
-    const std::vector<std::vector<PaddleTensor>> &test_data,
-    int num_images = FLAGS_warmup_batch_size) {
-  int test_data_batch_size = test_data[0][0].shape[0];
-  auto iterations = test_data.size();
-  PADDLE_ENFORCE(
-      static_cast<size_t>(num_images) <= iterations * test_data_batch_size,
-      "The requested quantization warmup data size " +
-          std::to_string(num_images) + " is bigger than all test data size.");
-
-  PaddleTensor images;
-  images.name = "image";
-  images.shape = {num_images, 3, 224, 224};
-  images.dtype = PaddleDType::FLOAT32;
-  images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
-
-  PaddleTensor labels;
-  labels.name = "label";
-  labels.shape = {num_images, 1};
-  labels.dtype = PaddleDType::INT64;
-  labels.data.Resize(sizeof(int64_t) * num_images);
-
-  for (int i = 0; i < num_images; i++) {
-    auto batch = i / test_data_batch_size;
-    auto element_in_batch = i % test_data_batch_size;
-    std::copy_n(static_cast<float *>(test_data[batch][0].data.data()) +
-                    element_in_batch * 3 * 224 * 224,
-                3 * 224 * 224,
-                static_cast<float *>(images.data.data()) + i * 3 * 224 * 224);
-
-    std::copy_n(static_cast<int64_t *>(test_data[batch][1].data.data()) +
-                    element_in_batch,
-                1, static_cast<int64_t *>(labels.data.data()) + i);
-  }
-
-  auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(2);
-  (*warmup_data)[0] = std::move(images);
-  (*warmup_data)[1] = std::move(labels);
-  return warmup_data;
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
-              int32_t batch_size = FLAGS_batch_size) {
-  std::ifstream file(FLAGS_infer_data, std::ios::binary);
-  if (!file) {
-    FAIL() << "Couldn't open file: " << FLAGS_infer_data;
-  }
-
-  int64_t total_images{0};
-  file.read(reinterpret_cast<char *>(&total_images), sizeof(total_images));
-  LOG(INFO) << "Total images in file: " << total_images;
-
-  std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
-  std::vector<int> label_batch_shape{batch_size, 1};
-  auto images_offset_in_file = static_cast<size_t>(file.tellg());
-  auto labels_offset_in_file =
-      images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
-
-  TensorReader<float> image_reader(file, images_offset_in_file,
-                                   image_batch_shape, "image");
-  TensorReader<int64_t> label_reader(file, labels_offset_in_file,
-                                     label_batch_shape, "label");
-
-  auto iterations_max = total_images / batch_size;
-  auto iterations = iterations_max;
-  if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) {
-    iterations = FLAGS_iterations;
-  }
-  for (auto i = 0; i < iterations; i++) {
-    auto images = image_reader.NextBatch();
-    auto labels = label_reader.NextBatch();
-    inputs->emplace_back(
-        std::vector<PaddleTensor>{std::move(images), std::move(labels)});
-  }
-}
-
-TEST(Analyzer_int8_image_classification, quantization) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  AnalysisConfig q_cfg;
-  SetConfig(&q_cfg);
-
-  // read data from file and prepare batches with test data
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-
-  // prepare warmup batch from input data read earlier
-  // warmup batch size can be different than batch size
-  std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
-      GetWarmupData(input_slots_all);
-
-  // configure quantizer
-  q_cfg.EnableMkldnnQuantizer();
-  q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
-  q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size);
-
-  CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
deleted file mode 100644
index 72da7c48b2575d2bc49abeaf79d8c989aee4898a..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
+++ /dev/null
@@ -1,283 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include <iostream>
-#include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-// setting iterations to 0 means processing the whole dataset
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model);
-  cfg->DisableGpu();
-  cfg->SwitchIrOptim(true);
-  cfg->SwitchSpecifyInputNames(false);
-  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-  cfg->EnableMKLDNN();
-}
-
-std::vector<size_t> ReadObjectsNum(std::ifstream &file, size_t offset,
-                                   int64_t total_images) {
-  std::vector<size_t> num_objects;
-  num_objects.resize(total_images);
-
-  file.clear();
-  file.seekg(offset);
-  file.read(reinterpret_cast<char *>(num_objects.data()),
-            total_images * sizeof(size_t));
-
-  if (file.eof()) LOG(ERROR) << "Reached end of stream";
-  if (file.fail()) throw std::runtime_error("Failed reading file.");
-  return num_objects;
-}
-
-template <typename T>
-class TensorReader {
- public:
-  TensorReader(std::ifstream &file, size_t beginning_offset, std::string name)
-      : file_(file), position(beginning_offset), name_(name) {}
-
-  PaddleTensor NextBatch(std::vector<int> shape, std::vector<size_t> lod) {
-    int numel =
-        std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
-    PaddleTensor tensor;
-    tensor.name = name_;
-    tensor.shape = shape;
-    tensor.dtype = GetPaddleDType<T>();
-    tensor.data.Resize(numel * sizeof(T));
-    if (lod.empty() == false) {
-      tensor.lod.clear();
-      tensor.lod.push_back(lod);
-    }
-    file_.seekg(position);
-    file_.read(reinterpret_cast<char *>(tensor.data.data()), numel * sizeof(T));
-    position = file_.tellg();
-    if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
-    if (file_.fail())
-      throw std::runtime_error(name_ + ": failed reading file.");
-    return tensor;
-  }
-
- protected:
-  std::ifstream &file_;
-  size_t position;
-  std::string name_;
-};
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
-              int32_t batch_size = FLAGS_batch_size) {
-  std::ifstream file(FLAGS_infer_data, std::ios::binary);
-  if (!file) {
-    FAIL() << "Couldn't open file: " << FLAGS_infer_data;
-  }
-
-  int64_t total_images{0};
-  file.read(reinterpret_cast<char *>(&total_images), sizeof(int64_t));
-  LOG(INFO) << "Total images in file: " << total_images;
-
-  size_t image_beginning_offset = static_cast<size_t>(file.tellg());
-  auto lod_offset_in_file =
-      image_beginning_offset + sizeof(float) * total_images * 3 * 300 * 300;
-  auto labels_beginning_offset =
-      lod_offset_in_file + sizeof(size_t) * total_images;
-
-  std::vector<size_t> lod_full =
-      ReadObjectsNum(file, lod_offset_in_file, total_images);
-  size_t sum_objects_num =
-      std::accumulate(lod_full.begin(), lod_full.end(), 0UL);
-
-  auto bbox_beginning_offset =
-      labels_beginning_offset + sizeof(int64_t) * sum_objects_num;
-  auto difficult_beginning_offset =
-      bbox_beginning_offset + sizeof(float) * sum_objects_num * 4;
-
-  TensorReader<float> image_reader(file, image_beginning_offset, "image");
-  TensorReader<int64_t> label_reader(file, labels_beginning_offset, "gt_label");
-  TensorReader<float> bbox_reader(file, bbox_beginning_offset, "gt_bbox");
-  TensorReader<int64_t> difficult_reader(file, difficult_beginning_offset,
-                                         "gt_difficult");
-  auto iterations_max = total_images / batch_size;
-  auto iterations = iterations_max;
-  if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) {
-    iterations = FLAGS_iterations;
-  }
-  for (auto i = 0; i < iterations; i++) {
-    auto images_tensor = image_reader.NextBatch({batch_size, 3, 300, 300}, {});
-    std::vector<size_t> batch_lod(lod_full.begin() + i * batch_size,
-                                  lod_full.begin() + batch_size * (i + 1));
-    size_t batch_num_objects =
-        std::accumulate(batch_lod.begin(), batch_lod.end(), 0UL);
-    batch_lod.insert(batch_lod.begin(), 0UL);
-    for (auto it = batch_lod.begin() + 1; it != batch_lod.end(); it++) {
-      *it = *it + *(it - 1);
-    }
-    auto labels_tensor = label_reader.NextBatch(
-        {static_cast<int>(batch_num_objects), 1}, batch_lod);
-    auto bbox_tensor = bbox_reader.NextBatch(
-        {static_cast<int>(batch_num_objects), 4}, batch_lod);
-    auto difficult_tensor = difficult_reader.NextBatch(
-        {static_cast<int>(batch_num_objects), 1}, batch_lod);
-
-    inputs->emplace_back(std::vector<PaddleTensor>{
-        std::move(images_tensor), std::move(bbox_tensor),
-        std::move(labels_tensor), std::move(difficult_tensor)});
-  }
-}
-
-std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
-    const std::vector<std::vector<PaddleTensor>> &test_data,
-    int32_t num_images = FLAGS_warmup_batch_size) {
-  int test_data_batch_size = test_data[0][0].shape[0];
-  auto iterations = test_data.size();
-  PADDLE_ENFORCE_LE(
-      static_cast<size_t>(num_images), iterations * test_data_batch_size,
-      "The requested quantization warmup data size " +
-          std::to_string(num_images) + " is bigger than all test data size.");
-
-  PaddleTensor images;
-  images.name = "image";
-  images.shape = {num_images, 3, 300, 300};
-  images.dtype = PaddleDType::FLOAT32;
-  images.data.Resize(sizeof(float) * num_images * 3 * 300 * 300);
-
-  int batches = num_images / test_data_batch_size;
-  int batch_remain = num_images % test_data_batch_size;
-  size_t num_objects = 0UL;
-  std::vector<size_t> accum_lod;
-  accum_lod.push_back(0UL);
-  for (int i = 0; i < batches; i++) {
-    std::transform(test_data[i][1].lod[0].begin() + 1,
-                   test_data[i][1].lod[0].end(), std::back_inserter(accum_lod),
-                   [&num_objects](size_t lodtemp) -> size_t {
-                     return lodtemp + num_objects;
-                   });
-    num_objects += test_data[i][1].lod[0][test_data_batch_size];
-  }
-  if (batch_remain > 0) {
-    std::transform(test_data[batches][1].lod[0].begin() + 1,
-                   test_data[batches][1].lod[0].begin() + batch_remain + 1,
-                   std::back_inserter(accum_lod),
-                   [&num_objects](size_t lodtemp) -> size_t {
-                     return lodtemp + num_objects;
-                   });
-    num_objects = num_objects + test_data[batches][1].lod[0][batch_remain];
-  }
-
-  PaddleTensor labels;
-  labels.name = "gt_label";
-  labels.shape = {static_cast<int>(num_objects), 1};
-  labels.dtype = PaddleDType::INT64;
-  labels.data.Resize(sizeof(int64_t) * num_objects);
-  labels.lod.push_back(accum_lod);
-
-  PaddleTensor bbox;
-  bbox.name = "gt_bbox";
-  bbox.shape = {static_cast<int>(num_objects), 4};
-  bbox.dtype = PaddleDType::FLOAT32;
-  bbox.data.Resize(sizeof(float) * num_objects * 4);
-  bbox.lod.push_back(accum_lod);
-
-  PaddleTensor difficult;
-  difficult.name = "gt_difficult";
-  difficult.shape = {static_cast<int>(num_objects), 1};
-  difficult.dtype = PaddleDType::INT64;
-  difficult.data.Resize(sizeof(int64_t) * num_objects);
-  difficult.lod.push_back(accum_lod);
-
-  size_t objects_accum = 0;
-  size_t objects_in_batch = 0;
-  for (int i = 0; i < batches; i++) {
-    objects_in_batch = test_data[i][1].lod[0][test_data_batch_size];
-    std::copy_n(static_cast<float *>(test_data[i][0].data.data()),
-                test_data_batch_size * 3 * 300 * 300,
-                static_cast<float *>(images.data.data()) +
-                    i * test_data_batch_size * 3 * 300 * 300);
-    std::copy_n(static_cast<int64_t *>(test_data[i][1].data.data()),
-                objects_in_batch,
-                static_cast<int64_t *>(labels.data.data()) + objects_accum);
-    std::copy_n(static_cast<float *>(test_data[i][2].data.data()),
-                objects_in_batch * 4,
-                static_cast<float *>(bbox.data.data()) + objects_accum * 4);
-    std::copy_n(static_cast<int64_t *>(test_data[i][3].data.data()),
-                objects_in_batch,
-                static_cast<int64_t *>(difficult.data.data()) + objects_accum);
-    objects_accum = objects_accum + objects_in_batch;
-  }
-  if (batch_remain > 0) {
-    size_t objects_remain = test_data[batches][1].lod[0][batch_remain];
-    std::copy_n(static_cast<float *>(test_data[batches][0].data.data()),
-                batch_remain * 3 * 300 * 300,
-                static_cast<float *>(images.data.data()) +
-                    objects_accum * 3 * 300 * 300);
-    std::copy_n(static_cast<int64_t *>(test_data[batches][1].data.data()),
-                objects_remain,
-                static_cast<int64_t *>(labels.data.data()) + objects_accum);
-    std::copy_n(static_cast<float *>(test_data[batches][2].data.data()),
-                objects_remain * 4,
-                static_cast<float *>(bbox.data.data()) + objects_accum * 4);
-    std::copy_n(static_cast<int64_t *>(test_data[batches][3].data.data()),
-                objects_remain,
-                static_cast<int64_t *>(difficult.data.data()) + objects_accum);
-    objects_accum = objects_accum + objects_remain;
-  }
-  PADDLE_ENFORCE_EQ(
-      static_cast<size_t>(num_objects), static_cast<size_t>(objects_accum),
-      "The requested num of objects " + std::to_string(num_objects) +
-          " is the same as objects_accum.");
-
-  auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(4);
-  (*warmup_data)[0] = std::move(images);
-  (*warmup_data)[1] = std::move(bbox);
-  (*warmup_data)[2] = std::move(labels);
-  (*warmup_data)[3] = std::move(difficult);
-
-  return warmup_data;
-}
-
-TEST(Analyzer_int8_mobilenet_ssd, quantization) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  AnalysisConfig q_cfg;
-  SetConfig(&q_cfg);
-
-  // read data from file and prepare batches with test data
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-
-  // prepare warmup batch from input data read earlier
-  // warmup batch size can be different than batch size
-  std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
-      GetWarmupData(input_slots_all);
-
-  // configure quantizer
-  q_cfg.EnableMkldnnQuantizer();
-  q_cfg.mkldnn_quantizer_config();
-  std::unordered_set<std::string> quantize_operators(
-      {"conv2d", "depthwise_conv2d", "prior_box"});
-  q_cfg.mkldnn_quantizer_config()->SetEnabledOpTypes(quantize_operators);
-  q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
-  q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size);
-
-  // 0 is avg_cost, 1 is top1_acc, 2 is top5_acc or mAP
-  CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all, 2);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
deleted file mode 100644
index 142905dcd8d9964d93d0c5f7444823eef2b84900..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ /dev/null
@@ -1,193 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-struct DataRecord {
-  std::vector<int64_t> data;
-  std::vector<size_t> lod;
-  // for dataset and nextbatch
-  size_t batch_iter{0};
-  std::vector<std::vector<size_t>> batched_lods;
-  std::vector<std::vector<int64_t>> batched_datas;
-  std::vector<std::vector<int64_t>> datasets;
-  DataRecord() = default;
-  explicit DataRecord(const std::string &path, int batch_size = 1) {
-    Load(path);
-    Prepare(batch_size);
-    batch_iter = 0;
-  }
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    int num_lines = 0;
-    datasets.resize(0);
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, ';', &data);
-      std::vector<int64_t> words_ids;
-      split_to_int64(data[1], ' ', &words_ids);
-      datasets.emplace_back(words_ids);
-    }
-  }
-  void Prepare(int bs) {
-    if (bs == 1) {
-      batched_datas = datasets;
-      for (auto one_sentence : datasets) {
-        batched_lods.push_back({0, one_sentence.size()});
-      }
-    } else {
-      std::vector<int64_t> one_batch;
-      std::vector<size_t> lod{0};
-      int bs_id = 0;
-      for (auto one_sentence : datasets) {
-        bs_id++;
-        one_batch.insert(one_batch.end(), one_sentence.begin(),
-                         one_sentence.end());
-        lod.push_back(lod.back() + one_sentence.size());
-        if (bs_id == bs) {
-          bs_id = 0;
-          batched_datas.push_back(one_batch);
-          batched_lods.push_back(lod);
-          one_batch.clear();
-          one_batch.resize(0);
-          lod.clear();
-          lod.resize(0);
-          lod.push_back(0);
-        }
-      }
-      if (one_batch.size() != 0) {
-        batched_datas.push_back(one_batch);
-        batched_lods.push_back(lod);
-      }
-    }
-  }
-
-  DataRecord NextBatch() {
-    DataRecord data;
-    data.data = batched_datas[batch_iter];
-    data.lod = batched_lods[batch_iter];
-    batch_iter++;
-    if (batch_iter >= batched_datas.size()) {
-      batch_iter = 0;
-    }
-    return data;
-  }
-};
-
-void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                 int batch_size) {
-  auto one_batch = data->NextBatch();
-  PaddleTensor input_tensor;
-  input_tensor.name = "word";
-  input_tensor.dtype = PaddleDType::INT64;
-  TensorAssignData<int64_t>(&input_tensor, {one_batch.data}, one_batch.lod);
-  PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
-  input_slots->assign({input_tensor});
-}
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model);
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim();
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  std::vector<PaddleTensor> input_slots;
-  int epoch = FLAGS_test_all_data ? data.batched_datas.size() : 1;
-  LOG(INFO) << "number of samples: " << epoch;
-  for (int bid = 0; bid < epoch; ++bid) {
-    GetOneBatch(&input_slots, &data, FLAGS_batch_size);
-    (*inputs).emplace_back(input_slots);
-  }
-}
-
-// Easy for profiling independently.
-TEST(Analyzer_LAC, profile) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    // the first inference result
-    const int64_t lac_ref_data[] = {
-        24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
-        44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
-        15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
-    auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
-    size_t size = GetSize(output[0]);
-    size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
-    PADDLE_ENFORCE_GE(size, batch1_size);
-    int64_t *pdata = static_cast<int64_t *>(output[0].data.data());
-    for (size_t i = 0; i < batch1_size; ++i) {
-      EXPECT_EQ(pdata[i], lac_ref_data[i]);
-    }
-  }
-}
-
-// Check the fuse status
-TEST(Analyzer_LAC, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
-  EXPECT_EQ(num_ops, 11);
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_LAC, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-// Compare Deterministic result
-TEST(Analyzer_LAC, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
deleted file mode 100644
index 36e07d5f55600dc7aa96227289f707fb19f92d56..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-
-struct DataRecord {
-  std::vector<std::vector<int64_t>> word, mention;
-  std::vector<size_t> lod;  // two inputs have the same lod info.
-  size_t batch_iter{0}, batch_size{1}, num_samples;  // total number of samples
-  DataRecord() = default;
-  explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
-    Load(path);
-  }
-  DataRecord NextBatch() {
-    DataRecord data;
-    size_t batch_end = batch_iter + batch_size;
-    // NOTE skip the final batch, if no enough data is provided.
-    if (batch_end <= word.size()) {
-      GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end);
-      GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter,
-                       batch_end);
-    }
-    batch_iter += batch_size;
-    return data;
-  }
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    int num_lines = 0;
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, ';', &data);
-      // load word data
-      std::vector<int64_t> word_data;
-      split_to_int64(data[1], ' ', &word_data);
-      // load mention data
-      std::vector<int64_t> mention_data;
-      split_to_int64(data[3], ' ', &mention_data);
-      word.push_back(std::move(word_data));
-      mention.push_back(std::move(mention_data));
-    }
-    num_samples = num_lines;
-  }
-};
-
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
-  PaddleTensor lod_word_tensor, lod_mention_tensor;
-  lod_word_tensor.name = "word";
-  lod_mention_tensor.name = "mention";
-  auto one_batch = data->NextBatch();
-  // assign data
-  TensorAssignData<int64_t>(&lod_word_tensor, one_batch.word, one_batch.lod);
-  TensorAssignData<int64_t>(&lod_mention_tensor, one_batch.mention,
-                            one_batch.lod);
-  // Set inputs.
-  input_slots->assign({lod_word_tensor, lod_mention_tensor});
-  for (auto &tensor : *input_slots) {
-    tensor.dtype = PaddleDType::INT64;
-  }
-}
-
-void SetConfig(AnalysisConfig *cfg, bool memory_load = false) {
-  if (memory_load) {
-    std::string buffer_prog, buffer_param;
-    ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog);
-    ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param);
-    cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
-                        buffer_param.size());
-  } else {
-    cfg->SetModel(FLAGS_infer_model + "/__model__",
-                  FLAGS_infer_model + "/param");
-  }
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim();
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  std::vector<PaddleTensor> input_slots;
-  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
-  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
-  for (int bid = 0; bid < epoch; ++bid) {
-    PrepareInputs(&input_slots, &data);
-    (*inputs).emplace_back(input_slots);
-  }
-}
-
-// Easy for profiling independently.
-void profile(bool memory_load = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg, memory_load);
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    // the first inference result
-    const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
-                                           48, 39, 38, 16, 25};
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
-    auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
-    size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
-    int64_t *result = static_cast<int64_t *>(output[0].data.data());
-    for (size_t i = 0; i < std::min(11UL, size); i++) {
-      EXPECT_EQ(result[i], chinese_ner_result_data[i]);
-    }
-  }
-}
-
-TEST(Analyzer_Chinese_ner, profile) { profile(); }
-
-TEST(Analyzer_Chinese_ner, profile_memory_load) {
-  profile(true /* memory_load */);
-}
-
-// Check the fuse status
-TEST(Analyzer_Chinese_ner, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
-  EXPECT_EQ(num_ops, 14);
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_Chinese_ner, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-// Compare Deterministic result
-TEST(Analyzer_Chinese_ner, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
deleted file mode 100644
index 11a49ed2914ae22c2ddb4cfe384900adfce4f21d..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
+++ /dev/null
@@ -1,204 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-
-struct DataRecord {
-  std::vector<std::vector<int64_t>> query_basic, query_phrase, title_basic,
-      title_phrase;
-  std::vector<size_t> lod1, lod2, lod3, lod4;
-  size_t batch_iter{0}, batch_size{1}, num_samples;  // total number of samples
-  DataRecord() = default;
-  explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
-    Load(path);
-  }
-  DataRecord NextBatch() {
-    DataRecord data;
-    size_t batch_end = batch_iter + batch_size;
-    // NOTE skip the final batch, if no enough data is provided.
-    if (batch_end <= query_basic.size()) {
-      GetInputPerBatch(query_basic, &data.query_basic, &data.lod1, batch_iter,
-                       batch_end);
-      GetInputPerBatch(query_phrase, &data.query_phrase, &data.lod2, batch_iter,
-                       batch_end);
-      GetInputPerBatch(title_basic, &data.title_basic, &data.lod3, batch_iter,
-                       batch_end);
-      GetInputPerBatch(title_phrase, &data.title_phrase, &data.lod4, batch_iter,
-                       batch_end);
-    }
-    batch_iter += batch_size;
-    return data;
-  }
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    int num_lines = 0;
-    while (std::getline(file, line)) {
-      std::vector<std::string> data;
-      split(line, ';', &data);
-      // load query data
-      std::vector<int64_t> query_basic_data;
-      split_to_int64(data[1], ' ', &query_basic_data);
-      std::vector<int64_t> query_phrase_data;
-      split_to_int64(data[2], ' ', &query_phrase_data);
-      // load title data
-      std::vector<int64_t> title_basic_data;
-      split_to_int64(data[3], ' ', &title_basic_data);
-      std::vector<int64_t> title_phrase_data;
-      split_to_int64(data[4], ' ', &title_phrase_data);
-      // filter the empty data
-      bool flag =
-          data[1].size() && data[2].size() && data[3].size() && data[4].size();
-      if (flag) {
-        query_basic.push_back(std::move(query_basic_data));
-        query_phrase.push_back(std::move(query_phrase_data));
-        title_basic.push_back(std::move(title_basic_data));
-        title_phrase.push_back(std::move(title_phrase_data));
-        num_lines++;
-      }
-    }
-    num_samples = num_lines;
-  }
-};
-
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                   int batch_size) {
-  PaddleTensor query_basic_tensor, query_phrase_tensor, title_basic_tensor,
-      title_phrase_tensor;
-  query_basic_tensor.name = "query_basic";
-  query_phrase_tensor.name = "query_phrase";
-  title_basic_tensor.name = "pos_title_basic";
-  title_phrase_tensor.name = "pos_title_phrase";
-  auto one_batch = data->NextBatch();
-  // assign data
-  TensorAssignData<int64_t>(&query_basic_tensor, one_batch.query_basic,
-                            one_batch.lod1);
-  TensorAssignData<int64_t>(&query_phrase_tensor, one_batch.query_phrase,
-                            one_batch.lod2);
-  TensorAssignData<int64_t>(&title_basic_tensor, one_batch.title_basic,
-                            one_batch.lod3);
-  TensorAssignData<int64_t>(&title_phrase_tensor, one_batch.title_phrase,
-                            one_batch.lod4);
-  // Set inputs.
-  input_slots->assign({query_basic_tensor, query_phrase_tensor,
-                       title_basic_tensor, title_phrase_tensor});
-  for (auto &tensor : *input_slots) {
-    tensor.dtype = PaddleDType::INT64;
-  }
-}
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model);
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim();
-  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-  if (FLAGS_zero_copy) {
-    cfg->SwitchUseFeedFetchOps(false);
-  }
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  std::vector<PaddleTensor> input_slots;
-  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
-  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
-  for (int bid = 0; bid < epoch; ++bid) {
-    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-    (*inputs).emplace_back(input_slots);
-  }
-}
-
-// Easy for profiling independently.
-TEST(Analyzer_Pyramid_DNN, profile) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
-    auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
-    size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(output[0].data.data());
-    // output is probability, which is in (0, 1).
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_GT(result[i], 0);
-      EXPECT_LT(result[i], 1);
-    }
-  }
-}
-
-// Check the fuse status
-TEST(Analyzer_Pyramid_DNN, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_Pyramid_DNN, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
-TEST(Analyzer_Pyramid_DNN, compare_zero_copy) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  AnalysisConfig cfg1;
-  SetConfig(&cfg1);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  std::vector<std::string> outputs_name;
-  outputs_name.emplace_back("cos_sim_2.tmp_0");
-  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
-                             reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
-                             input_slots_all, outputs_name);
-}
-
-// Compare Deterministic result
-TEST(Analyzer_Pyramid_DNN, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
deleted file mode 100644
index 620a1d1f7a390f38fe2662169f35994dca9976f9..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-DEFINE_bool(with_precision_check, true, "turn on test");
-
-namespace paddle {
-namespace inference {
-
-using namespace framework;  // NOLINT
-
-struct DataRecord {
-  std::vector<std::vector<std::vector<float>>> link_step_data_all;
-  std::vector<std::vector<float>> week_data_all, minute_data_all;
-  std::vector<size_t> lod1, lod2, lod3;
-  std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
-      rnn_minute_datas;
-  size_t num_samples;  // total number of samples
-  size_t batch_iter{0};
-  size_t batch_size{1};
-  DataRecord() = default;
-
-  explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
-    Load(path);
-  }
-
-  DataRecord NextBatch() {
-    DataRecord data;
-    size_t batch_end = batch_iter + batch_size;
-    // NOTE skip the final batch, if no enough data is provided.
-    if (batch_end <= link_step_data_all.size()) {
-      data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
-                                     link_step_data_all.begin() + batch_end);
-      data.week_data_all.assign(week_data_all.begin() + batch_iter,
-                                week_data_all.begin() + batch_end);
-      data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
-                                  minute_data_all.begin() + batch_end);
-      // Prepare LoDs
-      data.lod1.push_back(0);
-      data.lod2.push_back(0);
-      data.lod3.push_back(0);
-      CHECK(!data.link_step_data_all.empty()) << "empty";
-      CHECK(!data.week_data_all.empty());
-      CHECK(!data.minute_data_all.empty());
-      CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
-      CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
-      for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
-        for (const auto &d : data.link_step_data_all[j]) {
-          data.rnn_link_data.push_back(d);
-        }
-        data.rnn_week_datas.push_back(data.week_data_all[j]);
-        data.rnn_minute_datas.push_back(data.minute_data_all[j]);
-        // calculate lod
-        data.lod1.push_back(data.lod1.back() +
-                            data.link_step_data_all[j].size());
-        data.lod3.push_back(data.lod3.back() + 1);
-        for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
-          data.lod2.push_back(data.lod2.back() +
-                              data.link_step_data_all[j].size());
-        }
-      }
-    }
-    batch_iter += batch_size;
-    return data;
-  }
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    int num_lines = 0;
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, ':', &data);
-      std::vector<std::vector<float>> link_step_data;
-      std::vector<std::string> link_datas;
-      split(data[0], '|', &link_datas);
-      for (auto &step_data : link_datas) {
-        std::vector<float> tmp;
-        split_to_float(step_data, ',', &tmp);
-        link_step_data.push_back(tmp);
-      }
-      // load week data
-      std::vector<float> week_data;
-      split_to_float(data[2], ',', &week_data);
-      // load minute data
-      std::vector<float> minute_data;
-      split_to_float(data[1], ',', &minute_data);
-      link_step_data_all.push_back(std::move(link_step_data));
-      week_data_all.push_back(std::move(week_data));
-      minute_data_all.push_back(std::move(minute_data));
-    }
-    num_samples = num_lines;
-  }
-};
-
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                   int batch_size) {
-  PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
-      week_tensor, minute_tensor;
-  lod_attention_tensor.name = "data_lod_attention";
-  init_zero_tensor.name = "cell_init";
-  lod_tensor_tensor.name = "data";
-  week_tensor.name = "week";
-  minute_tensor.name = "minute";
-  auto one_batch = data->NextBatch();
-  std::vector<int> rnn_link_data_shape(
-      {static_cast<int>(one_batch.rnn_link_data.size()),
-       static_cast<int>(one_batch.rnn_link_data.front().size())});
-  lod_attention_tensor.shape.assign({1, 2});
-  lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
-  init_zero_tensor.shape.assign({batch_size, 15});
-  init_zero_tensor.lod.assign({one_batch.lod3});
-  lod_tensor_tensor.shape = rnn_link_data_shape;
-  lod_tensor_tensor.lod.assign({one_batch.lod1});
-  // clang-format off
-  week_tensor.shape.assign(
-      {static_cast<int>(one_batch.rnn_week_datas.size()),
-       static_cast<int>(one_batch.rnn_week_datas.front().size())});
-  week_tensor.lod.assign({one_batch.lod3});
-  minute_tensor.shape.assign(
-      {static_cast<int>(one_batch.rnn_minute_datas.size()),
-       static_cast<int>(one_batch.rnn_minute_datas.front().size())});
-  minute_tensor.lod.assign({one_batch.lod3});
-  // clang-format on
-  // assign data
-  TensorAssignData<float>(&lod_attention_tensor,
-                          std::vector<std::vector<float>>({{0, 0}}));
-  std::vector<float> tmp_zeros(batch_size * 15, 0.);
-  TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
-  TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
-  TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
-  TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
-  // Set inputs.
-  auto init_zero_tensor1 = init_zero_tensor;
-  init_zero_tensor1.name = "hidden_init";
-  input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
-                       init_zero_tensor1, lod_attention_tensor,
-                       lod_tensor_tensor});
-  for (auto &tensor : *input_slots) {
-    tensor.dtype = PaddleDType::FLOAT32;
-  }
-}
-
-void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
-                           ZeroCopyTensor *cell_init_tensor,
-                           ZeroCopyTensor *data_tensor,
-                           ZeroCopyTensor *hidden_init_tensor,
-                           ZeroCopyTensor *week_tensor,
-                           ZeroCopyTensor *minute_tensor,
-                           DataRecord *data_record, int batch_size) {
-  auto one_batch = data_record->NextBatch();
-  std::vector<int> rnn_link_data_shape(
-      {static_cast<int>(one_batch.rnn_link_data.size()),
-       static_cast<int>(one_batch.rnn_link_data.front().size())});
-  lod_attention_tensor->Reshape({1, 2});
-  lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2});
-
-  cell_init_tensor->Reshape({batch_size, 15});
-  cell_init_tensor->SetLoD({one_batch.lod3});
-
-  hidden_init_tensor->Reshape({batch_size, 15});
-  hidden_init_tensor->SetLoD({one_batch.lod3});
-
-  data_tensor->Reshape(rnn_link_data_shape);
-  data_tensor->SetLoD({one_batch.lod1});
-
-  week_tensor->Reshape(
-      {static_cast<int>(one_batch.rnn_week_datas.size()),
-       static_cast<int>(one_batch.rnn_week_datas.front().size())});
-  week_tensor->SetLoD({one_batch.lod3});
-
-  minute_tensor->Reshape(
-      {static_cast<int>(one_batch.rnn_minute_datas.size()),
-       static_cast<int>(one_batch.rnn_minute_datas.front().size())});
-  minute_tensor->SetLoD({one_batch.lod3});
-
-  // assign data
-  float arr0[] = {0, 0};
-  std::vector<float> zeros(batch_size * 15, 0);
-  std::copy_n(arr0, 2,
-              lod_attention_tensor->mutable_data<float>(PaddlePlace::kCPU));
-  std::copy_n(arr0, 2, data_tensor->mutable_data<float>(PaddlePlace::kCPU));
-  std::copy_n(zeros.begin(), zeros.size(),
-              cell_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
-  std::copy_n(zeros.begin(), zeros.size(),
-              hidden_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
-  ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data);
-  ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas);
-  ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas);
-}
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim();
-  if (FLAGS_zero_copy) {
-    cfg->SwitchUseFeedFetchOps(false);
-  }
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  std::vector<PaddleTensor> input_slots;
-  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
-  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
-  for (int bid = 0; bid < epoch; ++bid) {
-    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-    (*inputs).emplace_back(input_slots);
-  }
-}
-
-// Easy for profiling independently.
-TEST(Analyzer_rnn1, profile) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  cfg.DisableGpu();
-  cfg.SwitchIrDebug();
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-}
-
-// Check the fuse status
-TEST(Analyzer_rnn1, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-  EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
-  EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
-  EXPECT_EQ(num_ops,
-            13);  // After graph optimization, only 13 operators exists.
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_rnn1, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-// Compare Deterministic result
-TEST(Analyzer_rnn1, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-// Test Multi-Thread.
-TEST(Analyzer_rnn1, multi_thread) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, 2 /* multi_thread */);
-}
-
-// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
-TEST(Analyzer_rnn1, compare_zero_copy) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  AnalysisConfig cfg1;
-  SetConfig(&cfg1);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  std::vector<std::string> outputs_name;
-  outputs_name.emplace_back("final_output.tmp_1");
-  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
-                             reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
-                             input_slots_all, outputs_name);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
deleted file mode 100644
index 9ccbf58cbd2bbaab9b1a132c27e50356e1a5df37..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-
-using namespace framework;  // NOLINT
-static std::vector<float> result_data;
-
-struct DataRecord {
-  std::vector<std::vector<std::vector<float>>> link_step_data_all;
-  std::vector<size_t> lod;
-  std::vector<std::vector<float>> rnn_link_data;
-  size_t num_samples;  // total number of samples
-  size_t batch_iter{0};
-  size_t batch_size{1};
-  DataRecord() = default;
-  explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
-    Load(path);
-  }
-  DataRecord NextBatch() {
-    DataRecord data;
-    size_t batch_end = batch_iter + batch_size;
-    // NOTE skip the final batch, if no enough data is provided.
-    if (batch_end <= link_step_data_all.size()) {
-      data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
-                                     link_step_data_all.begin() + batch_end);
-      // Prepare LoDs
-      data.lod.push_back(0);
-      CHECK(!data.link_step_data_all.empty()) << "empty";
-      for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
-        for (const auto &d : data.link_step_data_all[j]) {
-          data.rnn_link_data.push_back(d);
-          // calculate lod
-          data.lod.push_back(data.lod.back() + 11);
-        }
-      }
-    }
-    batch_iter += batch_size;
-    return data;
-  }
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    int num_lines = 0;
-    result_data.clear();
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, ':', &data);
-      if (num_lines % 2) {  // feature
-        std::vector<std::string> feature_data;
-        split(data[1], ' ', &feature_data);
-        std::vector<std::vector<float>> link_step_data;
-        int feature_count = 1;
-        std::vector<float> feature;
-        for (auto &step_data : feature_data) {
-          std::vector<float> tmp;
-          split_to_float(step_data, ',', &tmp);
-          feature.insert(feature.end(), tmp.begin(), tmp.end());
-          if (feature_count % 11 == 0) {  // each sample has 11 features
-            link_step_data.push_back(feature);
-            feature.clear();
-          }
-          feature_count++;
-        }
-        link_step_data_all.push_back(std::move(link_step_data));
-      } else {  // result
-        std::vector<float> tmp;
-        split_to_float(data[1], ',', &tmp);
-        result_data.insert(result_data.end(), tmp.begin(), tmp.end());
-      }
-    }
-    num_samples = num_lines / 2;
-  }
-};
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                   int batch_size) {
-  PaddleTensor feed_tensor;
-  feed_tensor.name = "feed";
-  auto one_batch = data->NextBatch();
-  int token_size = one_batch.rnn_link_data.size();
-  // each token has 11 features, each feature's dim is 54.
-  std::vector<int> rnn_link_data_shape({token_size * 11, 54});
-  feed_tensor.shape = rnn_link_data_shape;
-  feed_tensor.lod.assign({one_batch.lod});
-  feed_tensor.dtype = PaddleDType::FLOAT32;
-  TensorAssignData<float>(&feed_tensor, one_batch.rnn_link_data);
-  // Set inputs.
-  input_slots->assign({feed_tensor});
-}
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim();
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  std::vector<PaddleTensor> input_slots;
-  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
-  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
-  for (int bid = 0; bid < epoch; ++bid) {
-    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-    (*inputs).emplace_back(input_slots);
-  }
-}
-
-// Easy for profiling independently.
-TEST(Analyzer_rnn2, profile) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    // the first inference result
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
-    auto output = outputs.back();
-    PADDLE_ENFORCE_GT(output.size(), 0);
-    size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(output[0].data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(result[i], result_data[i], 1e-3);
-    }
-  }
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_rnn2, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-// Compare Deterministic result
-TEST(Analyzer_rnn2, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc
deleted file mode 100644
index 977b2ec885dcba8677a0705f698cd0200b789916..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim(true);
-  cfg->SwitchIrDebug();
-}
-
-int GetNumOps(const AnalysisConfig &cfg) {
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  GetFuseStatis(static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-  return num_ops;
-}
-
-TEST(Analyzer, save_model) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  cfg.SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
-  //  ensure the path being unique
-  std::string optimModelPath = FLAGS_infer_model + "/only_for_save_model_test";
-  mkdir(optimModelPath.c_str(), 0777);
-  SaveOptimModel(&cfg, optimModelPath);
-
-  // Each config can only be applied to one predictor.
-  AnalysisConfig cfg2;
-  SetConfig(&cfg2);
-  cfg2.pass_builder()->ClearPasses();
-  cfg2.SetModel(optimModelPath + "/model", optimModelPath + "/params");
-  int origin_num_ops = GetNumOps(cfg2);
-
-  AnalysisConfig cfg3;
-  SetConfig(&cfg3);
-  cfg3.SetModel(optimModelPath + "/model", optimModelPath + "/params");
-  int fused_num_ops = GetNumOps(cfg3);
-  CHECK_LE(fused_num_ops, origin_num_ops);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
deleted file mode 100644
index e3f8b835f78371170aaf107e1b2d1ca41b300e56..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ /dev/null
@@ -1,175 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-
-struct DataRecord {
-  std::vector<std::vector<int64_t>> title1, title2, title3, l1;
-  std::vector<size_t> lod1, lod2, lod3, l1_lod;
-  size_t batch_iter{0}, batch_size{1}, num_samples;  // total number of samples
-  DataRecord() = default;
-  explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
-    Load(path);
-  }
-  DataRecord NextBatch() {
-    DataRecord data;
-    size_t batch_end = batch_iter + batch_size;
-    // NOTE skip the final batch, if no enough data is provided.
-    if (batch_end <= title1.size()) {
-      GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end);
-      GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end);
-      GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end);
-      GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end);
-    }
-    batch_iter += batch_size;
-    return data;
-  }
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    int num_lines = 0;
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, '\t', &data);
-      PADDLE_ENFORCE(data.size() >= 4);
-      // load title1 data
-      std::vector<int64_t> title1_data;
-      split_to_int64(data[0], ' ', &title1_data);
-      // load title2 data
-      std::vector<int64_t> title2_data;
-      split_to_int64(data[1], ' ', &title2_data);
-      // load title3 data
-      std::vector<int64_t> title3_data;
-      split_to_int64(data[2], ' ', &title3_data);
-      // load l1 data
-      std::vector<int64_t> l1_data;
-      split_to_int64(data[3], ' ', &l1_data);
-      title1.push_back(std::move(title1_data));
-      title2.push_back(std::move(title2_data));
-      title3.push_back(std::move(title3_data));
-      l1.push_back(std::move(l1_data));
-    }
-    num_samples = num_lines;
-  }
-};
-
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                   int batch_size) {
-  PaddleTensor title1_tensor, title2_tensor, title3_tensor, l1_tensor;
-  title1_tensor.name = "title1";
-  title2_tensor.name = "title2";
-  title3_tensor.name = "title3";
-  l1_tensor.name = "l1";
-  auto one_batch = data->NextBatch();
-  // assign data
-  TensorAssignData<int64_t>(&title1_tensor, one_batch.title1, one_batch.lod1);
-  TensorAssignData<int64_t>(&title2_tensor, one_batch.title2, one_batch.lod2);
-  TensorAssignData<int64_t>(&title3_tensor, one_batch.title3, one_batch.lod3);
-  TensorAssignData<int64_t>(&l1_tensor, one_batch.l1, one_batch.l1_lod);
-  // Set inputs.
-  input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor});
-  for (auto &tensor : *input_slots) {
-    tensor.dtype = PaddleDType::INT64;
-  }
-}
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model);
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim();
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  std::vector<PaddleTensor> input_slots;
-  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
-  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
-  for (int bid = 0; bid < epoch; ++bid) {
-    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-    (*inputs).emplace_back(input_slots);
-  }
-}
-
-// Easy for profiling independently.
-TEST(Analyzer_seq_conv1, profile) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    // the first inference result
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
-    auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
-    size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(output[0].data.data());
-    // output is probability, which is in (0, 1).
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_GT(result[i], 0);
-      EXPECT_LT(result[i], 1);
-    }
-  }
-}
-
-// Check the fuse status
-TEST(Analyzer_seq_conv1, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-
-  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("seqconv_eltadd_relu_fuse"));
-  EXPECT_EQ(fuse_statis.at("fc_fuse"), 2);
-  EXPECT_EQ(fuse_statis.at("seqconv_eltadd_relu_fuse"), 6);
-  EXPECT_EQ(num_ops, 31);
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_seq_conv1, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-// Compare Deterministic result
-TEST(Analyzer_seq_conv1, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
deleted file mode 100644
index e6f2bfad68c9883b50f7fdd306a65946c178e50a..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-// diff: similarity_norm.tmp_0, for speed: fc_4.tmp_1
-static const char out_var_name[] = "reduce_sum_0.tmp_0";
-
-// for diff: 154, for speed 111
-constexpr int num_slots = 154;
-
-struct OneSlotInBatch {
-  std::string name;
-  std::vector<std::vector<float>> data;
-  std::vector<int> shape;
-  std::vector<size_t> lod;
-};
-
-struct DataRecord {
-  std::vector<std::vector<OneSlotInBatch>> batched_data;
-  std::map<std::string, std::vector<std::vector<float>>> datasets;
-  size_t batch_iter{0}, num_samples;  // total number of samples
-
-  DataRecord() = default;
-  explicit DataRecord(const std::string &path, int batch_size = 1) {
-    Load(path);
-    Prepare(batch_size);
-  }
-
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    int num_lines = 0;
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, '\t', &data);
-      std::vector<float> slot_data;
-      split_to_float(data[1], ' ', &slot_data);
-      std::string name = data[0];
-      PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL,
-                        "line %d, %s should be divisible", num_lines, name);
-      datasets[name].emplace_back(std::move(slot_data));
-    }
-    num_samples = num_lines / num_slots;
-    PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast<size_t>(num_lines),
-                      "num samples should be divisible");
-    PADDLE_ENFORCE_GT(num_samples, 0UL);
-  }
-
-  void Prepare(int bs) {
-    for (auto it = datasets.begin(); it != datasets.end(); ++it) {
-      PADDLE_ENFORCE_EQ(it->second.size(), num_samples,
-                        "size of each slot should be equal");
-    }
-    size_t num_batches = num_samples / bs;
-    EXPECT_GT(num_batches, 0);
-    batched_data.resize(num_batches);
-    for (auto &one_batch : batched_data) {
-      one_batch.resize(datasets.size());
-      size_t i = 0;
-      for (auto it = datasets.begin(); it != datasets.end(); ++it) {
-        auto &slot = one_batch[i];
-        slot.name = it->first;
-        slot.data.resize(bs);
-        slot.lod.resize(bs + 1);
-        slot.lod[0] = 0;
-        auto &lod = slot.lod;
-        auto &datas = it->second;
-        for (int k = 0; k < bs; ++k) {
-          size_t id = k + batch_iter * bs;
-          std::copy(datas[id].begin(), datas[id].end(),
-                    std::back_inserter(slot.data[k]));
-          size_t len = datas[id].size() / 11;
-          PADDLE_ENFORCE_EQ(len * 11, datas[id].size(),
-                            "%s %d size should be divisible", slot.name, id);
-          lod[k + 1] = lod[k] + len;
-        }
-        slot.shape.assign({static_cast<int>(lod[bs]), 11});
-        i++;
-      }
-    }
-  }
-
-  const std::vector<OneSlotInBatch> &NextBatch() {
-    if (batch_iter >= batched_data.size() - 1) {
-      batch_iter = -1;
-    }
-    return batched_data[++batch_iter];
-  }
-};
-
-static void TensorAssignSlot(PaddleTensor *tensor, const OneSlotInBatch &slot) {
-  tensor->name = slot.name + "_embed";
-  tensor->shape = slot.shape;
-  tensor->dtype = PaddleDType::FLOAT32;
-  tensor->lod.clear();
-  tensor->lod.emplace_back(slot.lod);
-  TensorAssignData(tensor, slot.data);
-}
-
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
-  const auto &one_batch = data->NextBatch();
-  input_slots->resize(one_batch.size());
-  for (size_t i = 0; i < one_batch.size(); ++i) {
-    auto &slot = one_batch[i];
-    TensorAssignSlot(&((*input_slots)[i]), slot);
-  }
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  std::vector<PaddleTensor> input_slots;
-  int epoch = FLAGS_test_all_data ? data.batched_data.size() : 1;
-  LOG(INFO) << "number of samples: "
-            << data.batched_data.size() * FLAGS_batch_size;
-  for (int bid = 0; bid < epoch; ++bid) {
-    PrepareInputs(&input_slots, &data);
-    (*inputs).emplace_back(input_slots);
-  }
-}
-
-void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
-  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrDebug();
-  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-  if (FLAGS_zero_copy) {
-    cfg->SwitchUseFeedFetchOps(false);
-  }
-  if (use_mkldnn) {
-    cfg->EnableMKLDNN();
-    cfg->pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-  // Enable seqpool_concat_fuse_pass, disabled by default since it takes much
-  // time
-  cfg->pass_builder()->InsertPass(2, "seqpool_concat_fuse_pass");
-}
-
-void profile(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg, use_mkldnn);
-
-  std::vector<std::vector<PaddleTensor>> outputs;
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-}
-
-TEST(Analyzer_seq_pool1, profile) { profile(); }
-
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_seq_pool1, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-// Compare Deterministic result
-TEST(Analyzer_seq_pool1, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-// Check the fuse status
-TEST(Analyzer_seq_pool1, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
-  ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse"));
-  ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse"));
-  ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
-  EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
-  EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2);
-  EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2);
-  LOG(INFO) << "num_ops: " << num_ops;
-  EXPECT_EQ(num_ops, 171);
-}
-
-// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
-TEST(Analyzer_seq_pool1, compare_zero_copy) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  AnalysisConfig cfg1;
-  SetConfig(&cfg1);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  std::vector<std::string> outputs_name;
-  outputs_name.emplace_back(out_var_name);
-  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
-                             reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
-                             input_slots_all, outputs_name);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
deleted file mode 100644
index 78e500b2ed530d5a1dce8a7927538fdd0bbb6907..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-
-struct DataReader {
-  explicit DataReader(const std::string &path)
-      : file(new std::ifstream(path)) {}
-
-  bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
-    PADDLE_ENFORCE_EQ(batch_size, 1);
-    std::string line;
-    PaddleTensor tensor;
-    tensor.dtype = PaddleDType::INT64;
-    tensor.lod.emplace_back(std::vector<size_t>({0}));
-    std::vector<int64_t> data;
-
-    for (int i = 0; i < batch_size; i++) {
-      if (!std::getline(*file, line)) return false;
-      inference::split_to_int64(line, ' ', &data);
-    }
-    tensor.lod.front().push_back(data.size());
-
-    tensor.data.Resize(data.size() * sizeof(int64_t));
-    CHECK(tensor.data.data() != nullptr);
-    CHECK(data.data() != nullptr);
-    memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t));
-    tensor.shape.push_back(data.size());
-    tensor.shape.push_back(1);
-    input->assign({tensor});
-    return true;
-  }
-
-  std::unique_ptr<std::ifstream> file;
-};
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model);
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim();
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  std::vector<PaddleTensor> input_slots;
-  DataReader reader(FLAGS_infer_data);
-  int num_batches = 0;
-  while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
-    (*inputs).emplace_back(input_slots);
-    ++num_batches;
-    if (!FLAGS_test_all_data) return;
-  }
-  LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
-}
-
-// Easy for profiling independently.
-TEST(Analyzer_Text_Classification, profile) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  cfg.SwitchIrDebug();
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-
-  if (FLAGS_num_threads == 1) {
-    // Get output
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
-    LOG(INFO) << "get outputs " << outputs.back().size();
-    for (auto &output : outputs.back()) {
-      LOG(INFO) << "output.shape: " << to_string(output.shape);
-      // no lod ?
-      CHECK_EQ(output.lod.size(), 0UL);
-      LOG(INFO) << "output.dtype: " << output.dtype;
-      std::stringstream ss;
-      int num_data = 1;
-      for (auto i : output.shape) {
-        num_data *= i;
-      }
-
-      for (int i = 0; i < num_data; i++) {
-        ss << static_cast<float *>(output.data.data())[i] << " ";
-      }
-      LOG(INFO) << "output.data summary: " << ss.str();
-      // one batch ends
-    }
-  }
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_Text_Classification, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  cfg.EnableMemoryOptim();
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-// Compare Deterministic result
-TEST(Analyzer_Text_Classification, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  // Enable embedding_fc_lstm_fuse_pass (disabled by default)
-  cfg.pass_builder()->InsertPass(2, "embedding_fc_lstm_fuse_pass");
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
deleted file mode 100644
index f2195966add8c4c159d26682c9578c95301a345f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ /dev/null
@@ -1,238 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-
-struct DataRecord {
-  std::vector<std::vector<int64_t>> src_word, src_pos, trg_word, init_idx;
-  std::vector<std::vector<float>> src_slf_attn_bias, init_score,
-      trg_src_attn_bias;
-  std::vector<std::vector<int32_t>> batch_data_shape;
-  std::vector<std::vector<size_t>> lod;
-  size_t batch_iter{0}, batch_size{1}, num_samples;  // total number of samples
-  DataRecord() = default;
-  explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
-    Load(path);
-  }
-  DataRecord NextBatch() {
-    DataRecord data;
-    size_t batch_end = batch_iter + batch_size;
-    // NOTE skip the final batch, if no enough data is provided.
-    if (batch_end <= src_word.size()) {
-      data.src_word.assign(src_word.begin() + batch_iter,
-                           src_word.begin() + batch_end);
-      data.src_pos.assign(src_pos.begin() + batch_iter,
-                          src_pos.begin() + batch_end);
-      data.src_slf_attn_bias.assign(src_slf_attn_bias.begin() + batch_iter,
-                                    src_slf_attn_bias.begin() + batch_end);
-      data.trg_word.assign(trg_word.begin() + batch_iter,
-                           trg_word.begin() + batch_end);
-      data.init_score.assign(init_score.begin() + batch_iter,
-                             init_score.begin() + batch_end);
-      data.init_idx.assign(init_idx.begin() + batch_iter,
-                           init_idx.begin() + batch_end);
-      data.trg_src_attn_bias.assign(trg_src_attn_bias.begin() + batch_iter,
-                                    trg_src_attn_bias.begin() + batch_end);
-      std::vector<int32_t> batch_shape =
-          *(batch_data_shape.begin() + batch_iter);
-      data.batch_data_shape.push_back(batch_shape);
-      data.lod.resize(2);
-      for (int i = 0; i < batch_shape[0] + 1; i++) {
-        data.lod[0].push_back(i);
-        data.lod[1].push_back(i);
-      }
-    }
-    batch_iter += batch_size;
-    return data;
-  }
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    size_t num_lines = 0;
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, ',', &data);
-      CHECK_EQ(data.size(), static_cast<size_t>(8));
-      // load src_word
-      std::vector<int64_t> src_word_data;
-      split_to_int64(data[0], ' ', &src_word_data);
-      src_word.push_back(std::move(src_word_data));
-      // load src_pos
-      std::vector<int64_t> src_pos_data;
-      split_to_int64(data[1], ' ', &src_pos_data);
-      src_pos.push_back(std::move(src_pos_data));
-      // load src_slf_attn_bias
-      std::vector<float> src_slf_attn_bias_data;
-      split_to_float(data[2], ' ', &src_slf_attn_bias_data);
-      src_slf_attn_bias.push_back(std::move(src_slf_attn_bias_data));
-      // load trg_word
-      std::vector<int64_t> trg_word_data;
-      split_to_int64(data[3], ' ', &trg_word_data);
-      trg_word.push_back(std::move(trg_word_data));
-      // load init_score
-      std::vector<float> init_score_data;
-      split_to_float(data[4], ' ', &init_score_data);
-      init_score.push_back(std::move(init_score_data));
-      // load init_idx
-      std::vector<int64_t> init_idx_data;
-      split_to_int64(data[5], ' ', &init_idx_data);
-      init_idx.push_back(std::move(init_idx_data));
-      // load trg_src_attn_bias
-      std::vector<float> trg_src_attn_bias_data;
-      split_to_float(data[6], ' ', &trg_src_attn_bias_data);
-      trg_src_attn_bias.push_back(std::move(trg_src_attn_bias_data));
-      // load shape for variant data shape
-      std::vector<int> batch_data_shape_data;
-      split_to_int(data[7], ' ', &batch_data_shape_data);
-      batch_data_shape.push_back(std::move(batch_data_shape_data));
-    }
-    num_samples = num_lines;
-  }
-};
-
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                   int batch_size) {
-  auto one_batch = data->NextBatch();
-  batch_size = one_batch.batch_data_shape[0][0];
-  auto n_head = one_batch.batch_data_shape[0][1];
-  auto trg_seq_len = one_batch.batch_data_shape[0][2];  // 1 for inference
-  auto src_seq_len = one_batch.batch_data_shape[0][3];
-
-  PaddleTensor src_word, src_pos, src_slf_attn_bias, trg_word, init_score,
-      init_idx, trg_src_attn_bias;
-
-  src_word.name = "src_word";
-  src_word.shape.assign({batch_size, src_seq_len, 1});
-  src_word.dtype = PaddleDType::INT64;
-  TensorAssignData<int64_t>(&src_word, one_batch.src_word);
-
-  src_pos.name = "src_pos";
-  src_pos.shape.assign({batch_size, src_seq_len, 1});
-  src_pos.dtype = PaddleDType::INT64;
-  TensorAssignData<int64_t>(&src_pos, one_batch.src_pos);
-
-  src_slf_attn_bias.name = "src_slf_attn_bias";
-  src_slf_attn_bias.shape.assign(
-      {batch_size, n_head, src_seq_len, src_seq_len});
-  src_slf_attn_bias.dtype = PaddleDType::FLOAT32;
-  TensorAssignData<float>(&src_slf_attn_bias, one_batch.src_slf_attn_bias);
-
-  trg_word.name = "trg_word";
-  trg_word.shape.assign({batch_size, 1});
-  trg_word.dtype = PaddleDType::INT64;
-  trg_word.lod.assign(one_batch.lod.begin(), one_batch.lod.end());
-  TensorAssignData<int64_t>(&trg_word, one_batch.trg_word);
-
-  init_score.name = "init_score";
-  init_score.shape.assign({batch_size, 1});
-  init_score.dtype = PaddleDType::FLOAT32;
-  init_score.lod.assign(one_batch.lod.begin(), one_batch.lod.end());
-  TensorAssignData<float>(&init_score, one_batch.init_score);
-
-  init_idx.name = "init_idx";
-  init_idx.shape.assign({batch_size});
-  init_idx.dtype = PaddleDType::INT32;
-  TensorAssignData<int64_t>(&init_idx, one_batch.init_idx);
-
-  trg_src_attn_bias.name = "trg_src_attn_bias";
-  trg_src_attn_bias.shape.assign(
-      {batch_size, n_head, trg_seq_len, src_seq_len});
-  trg_src_attn_bias.dtype = PaddleDType::FLOAT32;
-  TensorAssignData<float>(&trg_src_attn_bias, one_batch.trg_src_attn_bias);
-
-  input_slots->assign({src_word, src_pos, src_slf_attn_bias, trg_word,
-                       init_score, init_idx, trg_src_attn_bias});
-}
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->SwitchIrOptim();
-  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  std::vector<PaddleTensor> input_slots;
-  int test_batch_num =
-      FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
-  LOG(INFO) << "The number of samples to be test: "
-            << test_batch_num * FLAGS_batch_size;
-  for (int bid = 0; bid < test_batch_num; ++bid) {
-    input_slots.clear();
-    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-    (*inputs).emplace_back(input_slots);
-  }
-}
-
-// Easy for profiling independently.
-void profile(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> outputs;
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-}
-
-TEST(Analyzer_Transformer, profile) { profile(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
-#endif
-
-// Check the fuse status
-TEST(Analyzer_Transformer, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-void compare(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-TEST(Analyzer_Transformer, compare) { compare(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
-#endif
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
deleted file mode 100644
index 5f65229ecd52abb904654647eb2f00a8248d8632..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <fstream>
-#include <iostream>
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-struct Record {
-  std::vector<float> data;
-  std::vector<int32_t> shape;
-};
-
-Record ProcessALine(const std::string &line) {
-  VLOG(3) << "process a line";
-  std::vector<std::string> columns;
-  split(line, '\t', &columns);
-  CHECK_EQ(columns.size(), 2UL)
-      << "data format error, should be <data>\t<shape>";
-
-  Record record;
-  std::vector<std::string> data_strs;
-  split(columns[0], ' ', &data_strs);
-  for (auto &d : data_strs) {
-    record.data.push_back(std::stof(d));
-  }
-
-  std::vector<std::string> shape_strs;
-  split(columns[1], ' ', &shape_strs);
-  for (auto &s : shape_strs) {
-    record.shape.push_back(std::stoi(s));
-  }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
-  return record;
-}
-
-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model + "/__model__",
-                FLAGS_infer_model + "/__params__");
-  cfg->DisableGpu();
-  cfg->SwitchIrDebug();
-  cfg->SwitchSpecifyInputNames(false);
-  // TODO(TJ): fix fusion gru
-  cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
-}
-
-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
-  std::string line;
-  std::ifstream file(FLAGS_infer_data);
-  std::getline(file, line);
-  auto record = ProcessALine(line);
-
-  PaddleTensor input;
-  input.shape = record.shape;
-  input.dtype = PaddleDType::FLOAT32;
-  size_t input_size = record.data.size() * sizeof(float);
-  input.data.Resize(input_size);
-  memcpy(input.data.data(), record.data.data(), input_size);
-  std::vector<PaddleTensor> input_slots;
-  input_slots.assign({input});
-  (*inputs).emplace_back(input_slots);
-}
-
-// Easy for profiling independently.
-//  ocr, mobilenet and se_resnext50
-void profile(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-  // cfg.pass_builder()->TurnOnDebug();
-  std::vector<std::vector<PaddleTensor>> outputs;
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    std::string line;
-    std::ifstream file(FLAGS_refer_result);
-    std::getline(file, line);
-    auto refer = ProcessALine(line);
-    file.close();
-
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
-    auto &output = outputs.back().front();
-    size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-    CHECK_EQ(numel, refer.data.size());
-    for (size_t i = 0; i < numel; ++i) {
-      EXPECT_NEAR(static_cast<float *>(output.data.data())[i], refer.data[i],
-                  1e-5);
-    }
-  }
-}
-
-TEST(Analyzer_vis, profile) { profile(); }
-
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); }
-#endif
-
-// Check the fuse status
-TEST(Analyzer_vis, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  GetFuseStatis(predictor.get(), &num_ops);
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-void compare(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-TEST(Analyzer_vis, compare) { compare(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); }
-#endif
-
-// Compare Deterministic result
-TEST(Analyzer_vis, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
deleted file mode 100644
index de938669c0b0866f9c2f55cd04b866f9a9611294..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <ostream>
-#include <string>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
-namespace paddle {
-namespace inference {
-
-thread_local int num_spaces = 0;
-
-static std::string GenSpaces(int num_spaces) {
-  std::ostringstream os;
-  for (int i = 0; i < num_spaces; ++i) {
-    os << "  ";
-  }
-  return os.str();
-}
-
-std::ostream &operator<<(std::ostream &os,
-                         const PaddlePredictor::Config &config) {
-  os << GenSpaces(num_spaces) << "PaddlePredictor::Config {\n";
-  num_spaces++;
-  os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n";
-  num_spaces--;
-  os << GenSpaces(num_spaces) << "}\n";
-  return os;
-}
-
-std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
-  os << GenSpaces(num_spaces) << "NativeConfig {\n";
-  num_spaces++;
-  os << *reinterpret_cast<const PaddlePredictor::Config *>(&config);
-  os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n";
-  os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
-  os << GenSpaces(num_spaces)
-     << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
-  os << GenSpaces(num_spaces)
-     << "specify_input_name: " << config.specify_input_name << "\n";
-  num_spaces--;
-  os << GenSpaces(num_spaces) << "}\n";
-  return os;
-}
-
-std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) {
-  os << GenSpaces(num_spaces) << "AnalysisConfig {\n";
-  num_spaces++;
-  os << config.ToNativeConfig();
-  if (!config.model_from_memory()) {
-    os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n";
-    os << GenSpaces(num_spaces) << "param_file: " << config.params_file()
-       << "\n";
-  } else {
-    os << GenSpaces(num_spaces)
-       << "prog_file and param_file: load from memory \n";
-  }
-  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
-     << "\n";
-  os << GenSpaces(num_spaces)
-     << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n";
-  os << GenSpaces(num_spaces)
-     << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";
-  os << GenSpaces(num_spaces)
-     << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n";
-  os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled()
-     << "\n";
-  os << GenSpaces(num_spaces) << "use_ngraph: " << config.ngraph_enabled()
-     << "\n";
-  num_spaces--;
-  os << GenSpaces(num_spaces) << "}\n";
-  return os;
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
deleted file mode 100644
index 826c45311f478fb30fff173578427b875a1260bb..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
+++ /dev/null
@@ -1,213 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-import hashlib
-import unittest
-import os
-import numpy as np
-import time
-import sys
-import random
-import functools
-import contextlib
-from PIL import Image
-import math
-from paddle.dataset.common import download
-import tarfile
-import StringIO
-
-random.seed(0)
-np.random.seed(0)
-
-DATA_DIM = 224
-SIZE_FLOAT32 = 4
-SIZE_INT64 = 8
-FULL_SIZE_BYTES = 30106000008
-FULL_IMAGES = 50000
-TARGET_HASH = '22d2e0008dca693916d9595a5ea3ded8'
-FOLDER_NAME = "ILSVRC2012/"
-VALLIST_TAR_NAME = "ILSVRC2012/val_list.txt"
-CHUNK_SIZE = 8192
-
-img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
-img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
-
-
-def resize_short(img, target_size):
-    percent = float(target_size) / min(img.size[0], img.size[1])
-    resized_width = int(round(img.size[0] * percent))
-    resized_height = int(round(img.size[1] * percent))
-    img = img.resize((resized_width, resized_height), Image.LANCZOS)
-    return img
-
-
-def crop_image(img, target_size, center):
-    width, height = img.size
-    size = target_size
-    if center == True:
-        w_start = (width - size) / 2
-        h_start = (height - size) / 2
-    else:
-        w_start = np.random.randint(0, width - size + 1)
-        h_start = np.random.randint(0, height - size + 1)
-    w_end = w_start + size
-    h_end = h_start + size
-    img = img.crop((w_start, h_start, w_end, h_end))
-    return img
-
-
-def process_image(img):
-    img = resize_short(img, target_size=256)
-    img = crop_image(img, target_size=DATA_DIM, center=True)
-    if img.mode != 'RGB':
-        img = img.convert('RGB')
-    img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
-    img -= img_mean
-    img /= img_std
-    return img
-
-
-def download_concat(cache_folder, zip_path):
-    data_urls = []
-    data_md5s = []
-    data_urls.append(
-        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa'
-    )
-    data_md5s.append('60f6525b0e1d127f345641d75d41f0a8')
-    data_urls.append(
-        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab'
-    )
-    data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5')
-    file_names = []
-    print("Downloading full ImageNet Validation dataset ...")
-    for i in range(0, len(data_urls)):
-        download(data_urls[i], cache_folder, data_md5s[i])
-        file_name = os.path.join(cache_folder, data_urls[i].split('/')[-1])
-        file_names.append(file_name)
-        print("Downloaded part {0}\n".format(file_name))
-    if not os.path.exists(zip_path):
-        with open(zip_path, "w+") as outfile:
-            for fname in file_names:
-                with open(fname) as infile:
-                    outfile.write(infile.read())
-
-
-def print_processbar(done_percentage):
-    done_filled = done_percentage * '='
-    empty_filled = (100 - done_percentage) * ' '
-    sys.stdout.write("\r[%s%s]%d%%" %
-                     (done_filled, empty_filled, done_percentage))
-    sys.stdout.flush()
-
-
-def check_integrity(filename, target_hash):
-    print('\nThe binary file exists. Checking file integrity...\n')
-    md = hashlib.md5()
-    count = 0
-    onepart = FULL_SIZE_BYTES / CHUNK_SIZE / 100
-    with open(filename) as ifs:
-        while True:
-            buf = ifs.read(CHUNK_SIZE)
-            if count % onepart == 0:
-                done = count / onepart
-                print_processbar(done)
-            count = count + 1
-            if not buf:
-                break
-            md.update(buf)
-    hash1 = md.hexdigest()
-    if hash1 == target_hash:
-        return True
-    else:
-        return False
-
-
-def convert(tar_file, output_file):
-    print('Converting 50000 images to binary file ...\n')
-    tar = tarfile.open(name=tar_file, mode='r:gz')
-
-    print_processbar(0)
-
-    dataset = {}
-    for tarInfo in tar:
-        if tarInfo.isfile() and tarInfo.name != VALLIST_TAR_NAME:
-            dataset[tarInfo.name] = tar.extractfile(tarInfo).read()
-
-    with open(output_file, "w+b") as ofs:
-        ofs.seek(0)
-        num = np.array(int(FULL_IMAGES)).astype('int64')
-        ofs.write(num.tobytes())
-
-        per_percentage = FULL_IMAGES / 100
-
-        idx = 0
-        for imagedata in dataset.values():
-            img = Image.open(StringIO.StringIO(imagedata))
-            img = process_image(img)
-            np_img = np.array(img)
-            ofs.write(np_img.astype('float32').tobytes())
-            if idx % per_percentage == 0:
-                print_processbar(idx / per_percentage)
-            idx = idx + 1
-
-        val_info = tar.getmember(VALLIST_TAR_NAME)
-        val_list = tar.extractfile(val_info).read()
-
-        lines = val_list.split('\n')
-        val_dict = {}
-        for line_idx, line in enumerate(lines):
-            if line_idx == FULL_IMAGES:
-                break
-            name, label = line.split()
-            val_dict[name] = label
-
-        for img_name in dataset.keys():
-            remove_len = (len(FOLDER_NAME))
-            img_name_prim = img_name[remove_len:]
-            label = val_dict[img_name_prim]
-            label_int = (int)(label)
-            np_label = np.array(label_int)
-            ofs.write(np_label.astype('int64').tobytes())
-        print_processbar(100)
-    tar.close()
-    print("Conversion finished.")
-
-
-def run_convert():
-    print('Start to download and convert 50000 images to binary file...')
-    cache_folder = os.path.expanduser('~/.cache/paddle/dataset/int8/download')
-    zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz.partaa')
-    output_file = os.path.join(cache_folder, 'int8_full_val.bin')
-    retry = 0
-    try_limit = 3
-
-    while not (os.path.exists(output_file) and
-               os.path.getsize(output_file) == FULL_SIZE_BYTES and
-               check_integrity(output_file, TARGET_HASH)):
-        if os.path.exists(output_file):
-            sys.stderr.write(
-                "\n\nThe existing binary file is broken. Start to generate new one...\n\n".
-                format(output_file))
-            os.remove(output_file)
-        if retry < try_limit:
-            retry = retry + 1
-        else:
-            raise RuntimeError(
-                "Can not convert the dataset to binary file with try limit {0}".
-                format(try_limit))
-        download_concat(cache_folder, zip_path)
-        convert(zip_path, output_file)
-    print("\nSuccess! The binary file can be found at {0}".format(output_file))
-
-
-if __name__ == '__main__':
-    run_convert()
diff --git a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
deleted file mode 100644
index d703a129706e7565ac7931af61542b3fb487de47..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
+++ /dev/null
@@ -1,306 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import xml.etree.ElementTree
-from PIL import Image
-import numpy as np
-import os
-import sys
-from paddle.dataset.common import download
-import tarfile
-import StringIO
-import hashlib
-import tarfile
-import argparse
-
-DATA_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar"
-DATA_DIR = os.path.expanduser("~/.cache/paddle/dataset/pascalvoc/")
-TAR_FILE = "VOCtest_06-Nov-2007.tar"
-TAR_PATH = os.path.join(DATA_DIR, TAR_FILE)
-RESIZE_H = 300
-RESIZE_W = 300
-MEAN_VALUE = [127.5, 127.5, 127.5]
-AP_VERSION = '11point'
-DATA_OUT = 'pascalvoc_full.bin'
-DATA_OUT_PATH = os.path.join(DATA_DIR, DATA_OUT)
-BIN_TARGETHASH = "f6546cadc42f5ff13178b84ed29b740b"
-TAR_TARGETHASH = "b6e924de25625d8de591ea690078ad9f"
-TEST_LIST_KEY = "VOCdevkit/VOC2007/ImageSets/Main/test.txt"
-BIN_FULLSIZE = 5348678856
-
-
-def preprocess(img):
-    img_width, img_height = img.size
-    img = img.resize((RESIZE_W, RESIZE_H), Image.ANTIALIAS)
-    img = np.array(img)
-    # HWC to CHW
-    if len(img.shape) == 3:
-        img = np.swapaxes(img, 1, 2)
-        img = np.swapaxes(img, 1, 0)
-    # RBG to BGR
-    img = img[[2, 1, 0], :, :]
-    img = img.astype('float32')
-    img_mean = np.array(MEAN_VALUE)[:, np.newaxis, np.newaxis].astype('float32')
-    img -= img_mean
-    img = img * 0.007843
-    return img
-
-
-def convert_pascalvoc_local2bin(args):
-    data_dir = os.path.expanduser(args.data_dir)
-    label_fpath = os.path.join(data_dir, args.label_file)
-    flabel = open(label_fpath)
-    label_list = [line.strip() for line in flabel]
-
-    img_annotation_list_path = os.path.join(data_dir, args.img_annotation_list)
-    flist = open(img_annotation_list_path)
-    lines = [line.strip() for line in flist]
-
-    output_file_path = os.path.join(data_dir, args.output_file)
-    f1 = open(output_file_path, "w+b")
-    f1.seek(0)
-    image_nums = len(lines)
-    f1.write(np.array(image_nums).astype('int64').tobytes())
-
-    boxes = []
-    lbls = []
-    difficults = []
-    object_nums = []
-
-    for line in lines:
-        image_path, label_path = line.split()
-        image_path = os.path.join(data_dir, image_path)
-        label_path = os.path.join(data_dir, label_path)
-
-        im = Image.open(image_path)
-        if im.mode == 'L':
-            im = im.convert('RGB')
-        im_width, im_height = im.size
-
-        im = preprocess(im)
-        np_im = np.array(im)
-        f1.write(np_im.astype('float32').tobytes())
-
-        # layout: label | xmin | ymin | xmax | ymax | difficult
-        bbox_labels = []
-        root = xml.etree.ElementTree.parse(label_path).getroot()
-
-        objects = root.findall('object')
-        objects_size = len(objects)
-        object_nums.append(objects_size)
-
-        for object in objects:
-            bbox_sample = []
-            # start from 1
-            bbox_sample.append(
-                float(label_list.index(object.find('name').text)))
-            bbox = object.find('bndbox')
-            difficult = float(object.find('difficult').text)
-            bbox_sample.append(float(bbox.find('xmin').text) / im_width)
-            bbox_sample.append(float(bbox.find('ymin').text) / im_height)
-            bbox_sample.append(float(bbox.find('xmax').text) / im_width)
-            bbox_sample.append(float(bbox.find('ymax').text) / im_height)
-            bbox_sample.append(difficult)
-            bbox_labels.append(bbox_sample)
-
-        bbox_labels = np.array(bbox_labels)
-        if len(bbox_labels) == 0: continue
-
-        lbls.extend(bbox_labels[:, 0])
-        boxes.extend(bbox_labels[:, 1:5])
-        difficults.extend(bbox_labels[:, -1])
-
-    f1.write(np.array(object_nums).astype('uint64').tobytes())
-    f1.write(np.array(lbls).astype('int64').tobytes())
-    f1.write(np.array(boxes).astype('float32').tobytes())
-    f1.write(np.array(difficults).astype('int64').tobytes())
-    f1.close()
-
-    object_nums_sum = sum(object_nums)
-    target_size = 8 + image_nums * 3 * args.resize_h * args.resize_h * 4 + image_nums * 8 + object_nums_sum * (
-        8 + 4 * 4 + 8)
-    if (os.path.getsize(output_file_path) == target_size):
-        print("Success! \nThe output binary file can be found at: ",
-              output_file_path)
-    else:
-        print("Conversion failed!")
-
-
-def print_processbar(done_percentage):
-    done_filled = done_percentage * '='
-    empty_filled = (100 - done_percentage) * ' '
-    sys.stdout.write("\r[%s%s]%d%%" %
-                     (done_filled, empty_filled, done_percentage))
-    sys.stdout.flush()
-
-
-def convert_pascalvoc_tar2bin(tar_path, data_out_path):
-    print("Start converting ...\n")
-    images = {}
-    gt_labels = {}
-    boxes = []
-    lbls = []
-    difficults = []
-    object_nums = []
-
-    # map label to number (index)
-    label_list = [
-        "background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus",
-        "car", "cat", "chair", "cow", "diningtable", "dog", "horse",
-        "motorbike", "person", "pottedplant", "sheep", "sofa", "train",
-        "tvmonitor"
-    ]
-    print_processbar(0)
-    #read from tar file and write to bin
-    tar = tarfile.open(tar_path, "r")
-    f_test = tar.extractfile(TEST_LIST_KEY).read()
-    lines = f_test.split('\n')
-    del lines[-1]
-    image_nums = len(lines)
-    per_percentage = image_nums / 100
-
-    f1 = open(data_out_path, "w+b")
-    f1.seek(0)
-    f1.write(np.array(image_nums).astype('int64').tobytes())
-    for tarInfo in tar:
-        if tarInfo.isfile():
-            tmp_filename = tarInfo.name
-            name_arr = tmp_filename.split('/')
-            name_prefix = name_arr[-1].split('.')[0]
-            if name_arr[-2] == 'JPEGImages' and name_prefix in lines:
-                images[name_prefix] = tar.extractfile(tarInfo).read()
-            if name_arr[-2] == 'Annotations' and name_prefix in lines:
-                gt_labels[name_prefix] = tar.extractfile(tarInfo).read()
-
-    for line_idx, name_prefix in enumerate(lines):
-        im = Image.open(StringIO.StringIO(images[name_prefix]))
-        if im.mode == 'L':
-            im = im.convert('RGB')
-        im_width, im_height = im.size
-
-        im = preprocess(im)
-        np_im = np.array(im)
-        f1.write(np_im.astype('float32').tobytes())
-
-        # layout: label | xmin | ymin | xmax | ymax | difficult
-        bbox_labels = []
-        root = xml.etree.ElementTree.fromstring(gt_labels[name_prefix])
-
-        objects = root.findall('object')
-        objects_size = len(objects)
-        object_nums.append(objects_size)
-
-        for object in objects:
-            bbox_sample = []
-            bbox_sample.append(
-                float(label_list.index(object.find('name').text)))
-            bbox = object.find('bndbox')
-            difficult = float(object.find('difficult').text)
-            bbox_sample.append(float(bbox.find('xmin').text) / im_width)
-            bbox_sample.append(float(bbox.find('ymin').text) / im_height)
-            bbox_sample.append(float(bbox.find('xmax').text) / im_width)
-            bbox_sample.append(float(bbox.find('ymax').text) / im_height)
-            bbox_sample.append(difficult)
-            bbox_labels.append(bbox_sample)
-
-        bbox_labels = np.array(bbox_labels)
-        if len(bbox_labels) == 0: continue
-        lbls.extend(bbox_labels[:, 0])
-        boxes.extend(bbox_labels[:, 1:5])
-        difficults.extend(bbox_labels[:, -1])
-
-        if line_idx % per_percentage:
-            print_processbar(line_idx / per_percentage)
-
-    f1.write(np.array(object_nums).astype('uint64').tobytes())
-    f1.write(np.array(lbls).astype('int64').tobytes())
-    f1.write(np.array(boxes).astype('float32').tobytes())
-    f1.write(np.array(difficults).astype('int64').tobytes())
-    f1.close()
-    print_processbar(100)
-    print("Conversion finished!\n")
-
-
-def download_pascalvoc(data_url, data_dir, tar_targethash, tar_path):
-    print("Downloading pascalvcoc test set...")
-    download(data_url, data_dir, tar_targethash)
-    if not os.path.exists(tar_path):
-        print("Failed in downloading pascalvoc test set. URL %s\n" % data_url)
-    else:
-        tmp_hash = hashlib.md5(open(tar_path, 'rb').read()).hexdigest()
-        if tmp_hash != tar_targethash:
-            print("Downloaded test set is broken, removing ...\n")
-        else:
-            print("Downloaded successfully. Path: %s\n" % tar_path)
-
-
-def run_convert():
-    try_limit = 2
-    retry = 0
-    while not (os.path.exists(DATA_OUT_PATH) and
-               os.path.getsize(DATA_OUT_PATH) == BIN_FULLSIZE and BIN_TARGETHASH
-               == hashlib.md5(open(DATA_OUT_PATH, 'rb').read()).hexdigest()):
-        if os.path.exists(DATA_OUT_PATH):
-            sys.stderr.write(
-                "The existing binary file is broken. It is being removed...\n")
-            os.remove(DATA_OUT_PATH)
-        if retry < try_limit:
-            retry = retry + 1
-        else:
-            download_pascalvoc(DATA_URL, DATA_DIR, TAR_TARGETHASH, TAR_PATH)
-            convert_pascalvoc_tar2bin(TAR_PATH, DATA_OUT_PATH)
-    print("Success!\nThe binary file can be found at %s\n" % DATA_OUT_PATH)
-
-
-def main_pascalvoc_preprocess(args):
-    parser = argparse.ArgumentParser(
-        description="Convert the full pascalvoc val set or local data to binary file."
-    )
-    parser.add_argument(
-        '--choice', choices=['local', 'VOC_test_2007'], required=True)
-    parser.add_argument(
-        "--data_dir",
-        default="/home/li/AIPG-Paddle/paddle/build/third_party/inference_demo/int8v2/pascalvoc_small",
-        type=str,
-        help="Dataset root directory")
-    parser.add_argument(
-        "--img_annotation_list",
-        type=str,
-        default="test_100.txt",
-        help="A file containing the image file path and relevant annotation file path"
-    )
-    parser.add_argument(
-        "--label_file",
-        type=str,
-        default="label_list",
-        help="List the labels in the same sequence as denoted in the annotation file"
-    )
-    parser.add_argument(
-        "--output_file",
-        type=str,
-        default="pascalvoc_small.bin",
-        help="File path of the output binary file")
-    parser.add_argument("--resize_h", type=int, default=RESIZE_H)
-    parser.add_argument("--resize_w", type=int, default=RESIZE_W)
-    parser.add_argument("--mean_value", type=str, default=MEAN_VALUE)
-    parser.add_argument("--ap_version", type=str, default=AP_VERSION)
-    args = parser.parse_args()
-    if args.choice == 'local':
-        convert_pascalvoc_local2bin(args)
-    elif args.choice == 'VOC_test_2007':
-        run_convert()
-
-
-if __name__ == "__main__":
-    main_pascalvoc_preprocess(sys.argv)
diff --git a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
deleted file mode 100644
index 4add8bb2ab8c3513011491277a25f0a7e677bd12..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
+++ /dev/null
@@ -1,105 +0,0 @@
-# INT8 MKL-DNN quantization
-
-This document describes how to use Paddle inference Engine to convert the FP32 model to INT8 model on ResNet-50 and MobileNet-V1. We provide the instructions on enabling INT8 MKL-DNN quantization in Paddle inference and show the ResNet-50 and MobileNet-V1 results in accuracy and performance.
-
-## 0. Install PaddlePaddle
-
-Follow PaddlePaddle [installation instruction](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, please use the following cmake arguments.
-
-```bash
-cmake ..  -DWITH_TESTING=ON -WITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON -DWITH_MKLDNN=ON -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON
-
-```
-
-Note: MKL-DNN and MKL are required.
-
-## 1. Enable INT8 MKL-DNN quantization
-
-For reference, please examine the code of unit test enclosed in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc).
-
-* ### Create Analysis config
-
-INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease)
-
-* ### Create quantize config by analysis config
-
-We enable the MKL-DNN quantization procedure by calling an appropriate method from analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies etc.) can be set through quantizer config which is present in the analysis config. It is also necessary to specify a pre-processed warmup dataset and desired batch size.
-
-```cpp
-//Enable MKL-DNN quantization
-cfg.EnableMkldnnQuantizer();
-
-//use analysis config to call the MKL-DNN quantization config
-cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
-cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
-```
-
-## 2. Accuracy and Performance benchmark
-
-We provide the results of accuracy and performance measured on Intel(R) Xeon(R) Gold 6271 on single core.
-
->**Dataset: ILSVRC2012 Validation dataset**
-
->**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
-
-| Model        | FP32 Accuracy   | INT8 Accuracy   | Accuracy Diff(FP32-INT8)   |
-| :----------: | :-------------: | :------------:  | :--------------:           |
-| GoogleNet    |  70.50%         |  69.81%         |   0.69%                    |
-| MobileNet-V1 |  70.78%         |  70.42%         |   0.36%                    |
-| MobileNet-V2 |  71.90%         |  71.35%         |   0.55%                    |
-| ResNet-101   |  77.50%         |  77.42%         |   0.08%                    |
-| ResNet-50    |  76.63%         |  76.52%         |   0.11%                    |
-| VGG16        |  72.08%         |  72.03%         |   0.05%                    |
-| VGG19        |  72.57%         |  72.55%         |   0.02%                    |
-
->**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
-
-| Model        | FP32 Throughput(images/s)  | INT8 Throughput(images/s) | Ratio(INT8/FP32)|
-| :-----------:| :------------:             | :------------:            | :------------:  |
-| GoogleNet    |    34.06                   |    72.79                  |   2.14          |
-| MobileNet-V1 |    80.02                   |   230.65                  |   2.88          |
-| MobileNet-V2 |    99.38                   |   206.92                  |   2.08          |
-| ResNet-101   |     7.38                   |    27.31                  |   3.70          |
-| ResNet-50    |    13.71                   |    50.55                  |   3.69          |
-| VGG16        |     3.64                   |    10.56                  |   2.90          |
-| VGG19        |     2.95                   |     9.02                  |   3.05          |
-
-Notes:
-
-* Measurement of accuracy requires a model which accepts two inputs: data and labels.
-
-* Different sampling batch size data may cause slight difference on INT8 top accuracy.
-* CAPI performance data is better than python API performance data because of the python overhead. Especially for the small computational model, python overhead will be more obvious.
-
-## 3. Commands to reproduce the above accuracy and performance benchmark
-
-Two steps to reproduce the above-mentioned accuracy results, and we take GoogleNet benchmark as an example:
-
-* ### Prepare dataset
-
-Running the following commands to download and preprocess the ILSVRC2012 Validation dataset.
-
-```bash
-cd /PATH/TO/PADDLE/build
-python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
-```
-
-Then the ILSVRC2012 Validation dataset will be preprocessed and saved by default in `~/.cache/paddle/dataset/int8/download/int8_full_val.bin`
-
-* ### Commands to reproduce benchmark
-
-You can run `test_analyzer_int8_imagenet_classification` with the following arguments to reproduce the accuracy result on GoogleNet.
-
-```bash
-./paddle/fluid/inference/tests/api/test_analyzer_int8_image_classification --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/~/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
-```
-
-To verify all the 7 models, you need to set the parameter of `--infer_model` to one of the following values in command line:
-
-```bash
---infer_model /PATH/TO/PADDLE/build/third_party/inference_demo/int8v2/MODEL_NAME/model
-```
-
-```text
-MODEL_NAME=googlenet, mobilenetv1, mobilenetv2, resnet101, resnet50, vgg16, vgg19
-```
diff --git a/paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py b/paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py
deleted file mode 100644
index 4576d60a3d2a0bf8eb1715d3f15e74cc284c9afc..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from full_pascalvoc_test_preprocess import main_pascalvoc_preprocess
-import numpy as np
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-import unittest
-import os
-
-
-class Test_Preprocess(unittest.TestCase):
-    def test_local_convert(self):
-        os.system("python full_pascalvoc_test_preprocess.py --choice=local")
-
-    def test_online_convert(self):
-        os.system(
-            "python full_pascalvoc_test_preprocess.py --choice=VOC_test_2007")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
deleted file mode 100644
index 463fc4b12fcabad2444ed6706d661fb0ceab2e04..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ /dev/null
@@ -1,802 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <gtest/gtest.h>
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <thread>  // NOLINT
-#include <unordered_map>
-#include <vector>
-#ifdef WITH_GPERFTOOLS
-#include <gperftools/profiler.h>
-#endif
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/inference/tests/api/config_printer.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-#include "paddle/fluid/inference/utils/benchmark.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_string(model_name, "", "model name");
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data file");
-DEFINE_string(refer_result, "", "reference result for comparison");
-DEFINE_int32(batch_size, 1, "batch size");
-DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction");
-DEFINE_bool(enable_int8, true, "Enable INT8 type prediction");
-DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
-// setting iterations to 0 means processing the whole dataset
-DEFINE_int32(iterations, 0, "number of batches to process");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
-DEFINE_bool(use_analysis, true,
-            "Running the inference program in analysis mode.");
-DEFINE_bool(record_benchmark, false,
-            "Record benchmark after profiling the model");
-DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
-DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy.");
-DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
-DEFINE_bool(warmup, false,
-            "Use warmup to calculate elapsed_time more accurately. "
-            "To reduce CI time, it sets false in default.");
-
-DECLARE_bool(profile);
-DECLARE_int32(paddle_num_threads);
-
-namespace paddle {
-namespace inference {
-
-using paddle::framework::proto::VarType;
-
-template <typename T>
-constexpr paddle::PaddleDType GetPaddleDType();
-
-template <>
-constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
-  return paddle::PaddleDType::INT64;
-}
-
-template <>
-constexpr paddle::PaddleDType GetPaddleDType<float>() {
-  return paddle::PaddleDType::FLOAT32;
-}
-
-void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
-  const auto *analysis_config =
-      reinterpret_cast<const AnalysisConfig *>(config);
-  if (use_analysis) {
-    LOG(INFO) << *analysis_config;
-    return;
-  }
-  LOG(INFO) << analysis_config->ToNativeConfig();
-}
-
-// Compare result between two PaddleTensor
-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<PaddleTensor> &ref_outputs) {
-  EXPECT_GT(outputs.size(), 0UL);
-  EXPECT_EQ(outputs.size(), ref_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &ref_out = ref_outputs[i];
-    size_t size = VecReduceToInt(out.shape);
-    size_t ref_size = VecReduceToInt(ref_out.shape);
-    EXPECT_GT(size, 0UL);
-    EXPECT_EQ(size, ref_size);
-    EXPECT_EQ(out.dtype, ref_out.dtype);
-    switch (out.dtype) {
-      case PaddleDType::INT64: {
-        int64_t *pdata = static_cast<int64_t *>(out.data.data());
-        int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-        for (size_t j = 0; j < size; ++j) {
-          EXPECT_EQ(pdata_ref[j], pdata[j]);
-        }
-        break;
-      }
-      case PaddleDType::FLOAT32: {
-        float *pdata = static_cast<float *>(out.data.data());
-        float *pdata_ref = static_cast<float *>(ref_out.data.data());
-        for (size_t j = 0; j < size; ++j) {
-          CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
-        }
-        break;
-      }
-      case PaddleDType::INT32: {
-        int32_t *pdata = static_cast<int32_t *>(out.data.data());
-        int32_t *pdata_ref = static_cast<int32_t *>(ref_out.data.data());
-        for (size_t j = 0; j < size; ++j) {
-          EXPECT_EQ(pdata_ref[j], pdata[j]);
-        }
-        break;
-      }
-      case PaddleDType::UINT8: {
-        uint8_t *pdata = static_cast<uint8_t *>(out.data.data());
-        uint8_t *pdata_ref = static_cast<uint8_t *>(ref_out.data.data());
-        for (size_t j = 0; j < size; ++j) {
-          EXPECT_EQ(pdata_ref[j], pdata[j]);
-        }
-        break;
-      }
-    }
-  }
-}
-
-// Compare result between a PaddleTensor and a ZeroCopyTensor
-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<ZeroCopyTensor> &ref_outputs) {
-  EXPECT_GT(outputs.size(), 0UL);
-  EXPECT_EQ(outputs.size(), ref_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &ref_out = ref_outputs[i];
-    size_t size = VecReduceToInt(out.shape);
-    EXPECT_GT(size, 0UL);
-    int ref_size = 0;  // this is the number of elements not memory size
-    PaddlePlace place;
-    switch (out.dtype) {
-      case PaddleDType::INT64: {
-        int64_t *pdata = static_cast<int64_t *>(out.data.data());
-        int64_t *pdata_ref = ref_out.data<int64_t>(&place, &ref_size);
-        EXPECT_EQ(size, static_cast<size_t>(ref_size));
-        for (size_t j = 0; j < size; ++j) {
-          EXPECT_EQ(pdata_ref[j], pdata[j]);
-        }
-        break;
-      }
-      case PaddleDType::FLOAT32: {
-        float *pdata = static_cast<float *>(out.data.data());
-        float *pdata_ref = ref_out.data<float>(&place, &ref_size);
-        EXPECT_EQ(size, ref_size);
-        for (size_t j = 0; j < size; ++j) {
-          CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
-        }
-        break;
-      }
-      case PaddleDType::INT32: {
-        int32_t *pdata = static_cast<int32_t *>(out.data.data());
-        int32_t *pdata_ref = ref_out.data<int32_t>(&place, &ref_size);
-        EXPECT_EQ(size, ref_size);
-        for (size_t j = 0; j < size; ++j) {
-          EXPECT_EQ(pdata_ref[j], pdata[j]);
-        }
-        break;
-      }
-      case PaddleDType::UINT8: {
-        uint8_t *pdata = static_cast<uint8_t *>(out.data.data());
-        uint8_t *pdata_ref = ref_out.data<uint8_t>(&place, &ref_size);
-        EXPECT_EQ(size, ref_size);
-        for (size_t j = 0; j < size; ++j) {
-          EXPECT_EQ(pdata_ref[j], pdata[j]);
-        }
-        break;
-      }
-    }
-  }
-}
-
-std::unique_ptr<PaddlePredictor> CreateTestPredictor(
-    const PaddlePredictor::Config *config, bool use_analysis = true) {
-  const auto *analysis_config =
-      reinterpret_cast<const AnalysisConfig *>(config);
-  if (use_analysis) {
-    return CreatePaddlePredictor<AnalysisConfig>(*analysis_config);
-  }
-  auto native_config = analysis_config->ToNativeConfig();
-  return CreatePaddlePredictor<NativeConfig>(native_config);
-}
-
-size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
-
-std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
-                                                   int *num_ops) {
-  std::unordered_map<std::string, int> res;
-  auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
-  auto *fusion_status =
-      analysis_predictor->analysis_argument().fusion_statis_ptr();
-  if (!fusion_status) {
-    return res;
-  }
-  for (auto &item : *fusion_status) {
-    LOG(INFO) << "fused " << item.first << " " << item.second;
-  }
-  int num = 0;
-  for (auto &node :
-       analysis_predictor->analysis_argument().main_graph().Nodes()) {
-    if (node->IsOp()) {
-      ++num;
-    }
-  }
-  *num_ops = num;
-  return *fusion_status;
-}
-
-void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
-                       const std::string &dirname, bool is_combined = true,
-                       std::string model_filename = "model",
-                       std::string params_filename = "params",
-                       const std::vector<std::string> *feed_names = nullptr,
-                       const int continuous_inuput_index = 0) {
-  // Set fake_image_data
-  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
-  std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
-      dirname, is_combined, model_filename, params_filename);
-  std::ostringstream os;
-  for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
-    os << "feed target " << i << ": {" << feed_target_shapes[i][0];
-    for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) {
-      os << ", " << feed_target_shapes[i][j];
-    }
-    os << "}\n";
-  }
-  LOG(INFO) << os.str();
-  if (feed_names) {
-    PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size());
-  }
-  std::vector<PaddleTensor> input_slots(feed_target_shapes.size());
-  for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
-    const auto &feed_shape = feed_target_shapes[i];
-    auto &input = input_slots[i];
-    std::vector<int> shape({FLAGS_batch_size});
-    for (size_t s = 1; s < feed_shape.size(); ++s) {
-      shape.push_back(static_cast<int>(feed_shape[s]));
-    }
-    if (feed_names) {
-      input.name = (*feed_names)[i];
-    }
-    input.shape = shape;
-    input.dtype = PaddleDType::FLOAT32;
-    size_t len = std::accumulate(shape.begin(), shape.end(), size_t{1},
-                                 [](int a, int b) { return a * b; });
-    input.data.Resize(len * sizeof(float));
-    input.lod.assign({{0, static_cast<size_t>(FLAGS_batch_size)}});
-    float *input_data = static_cast<float *>(input.data.data());
-    // fill input data, for profile easily, do not use random data here.
-    for (size_t j = 0; j < len; ++j) {
-      *(input_data + j) =
-          static_cast<float>((j + continuous_inuput_index) % len) / len;
-    }
-  }
-  (*inputs).emplace_back(input_slots);
-}
-
-void GetInputPerBatch(const std::vector<std::vector<int64_t>> &in,
-                      std::vector<std::vector<int64_t>> *out,
-                      std::vector<size_t> *lod, size_t batch_iter,
-                      size_t batch_end) {
-  lod->clear();
-  lod->push_back(0);
-  for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) {
-    out->push_back(*it);
-    lod->push_back(lod->back() + (*it).size());  // calculate lod
-  }
-}
-
-void ConvertPaddleTensorToZeroCopyTensor(
-    PaddlePredictor *predictor, const std::vector<PaddleTensor> &inputs) {
-  for (size_t i = 0; i < inputs.size(); i++) {
-    auto input = inputs[i];
-    auto tensor = predictor->GetInputTensor(input.name);
-    tensor->Reshape(input.shape);
-    tensor->SetLoD({input.lod});
-    if (input.dtype == PaddleDType::INT64) {
-      ZeroCopyTensorAssignData<int64_t>(tensor.get(), input.data);
-    } else if (input.dtype == PaddleDType::FLOAT32) {
-      ZeroCopyTensorAssignData<float>(tensor.get(), input.data);
-    } else if (input.dtype == PaddleDType::INT32) {
-      ZeroCopyTensorAssignData<int32_t>(tensor.get(), input.data);
-    } else if (input.dtype == PaddleDType::UINT8) {
-      ZeroCopyTensorAssignData<uint8_t>(tensor.get(), input.data);
-    } else {
-      LOG(ERROR) << "unsupported feed type " << input.dtype;
-    }
-  }
-}
-
-void PredictionWarmUp(PaddlePredictor *predictor,
-                      const std::vector<std::vector<PaddleTensor>> &inputs,
-                      std::vector<std::vector<PaddleTensor>> *outputs,
-                      int num_threads, int tid,
-                      const VarType::Type data_type = VarType::FP32) {
-  int batch_size = FLAGS_batch_size;
-  LOG(INFO) << "Running thread " << tid << ", warm up run...";
-  if (FLAGS_zero_copy) {
-    ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
-  }
-  outputs->resize(1);
-  Timer warmup_timer;
-  warmup_timer.tic();
-  if (!FLAGS_zero_copy) {
-    predictor->Run(inputs[0], &(*outputs)[0], batch_size);
-  } else {
-    predictor->ZeroCopyRun();
-  }
-  PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1, data_type);
-  if (FLAGS_profile) {
-    paddle::platform::ResetProfiler();
-  }
-}
-
-void PredictionRun(PaddlePredictor *predictor,
-                   const std::vector<std::vector<PaddleTensor>> &inputs,
-                   std::vector<std::vector<PaddleTensor>> *outputs,
-                   int num_threads, int tid,
-                   const VarType::Type data_type = VarType::FP32,
-                   float *sample_latency = nullptr) {
-  int num_times = FLAGS_repeat;
-  int iterations = inputs.size();  // process the whole dataset ...
-  if (FLAGS_iterations > 0 &&
-      FLAGS_iterations < static_cast<int64_t>(inputs.size()))
-    iterations =
-        FLAGS_iterations;  // ... unless the number of iterations is set
-  outputs->resize(iterations);
-  LOG(INFO) << "Thread " << tid << ", number of threads " << num_threads
-            << ", run " << num_times << " times...";
-  Timer run_timer;
-  double elapsed_time = 0;
-#ifdef WITH_GPERFTOOLS
-  ProfilerStart("paddle_inference.prof");
-#endif
-  int predicted_num = 0;
-  if (!FLAGS_zero_copy) {
-    for (int i = 0; i < iterations; i++) {
-      run_timer.tic();
-      for (int j = 0; j < num_times; j++) {
-        predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
-      }
-      elapsed_time += run_timer.toc();
-
-      predicted_num += FLAGS_batch_size;
-      if (predicted_num % 100 == 0) {
-        LOG(INFO) << predicted_num << " samples";
-      }
-    }
-  } else {
-    for (int i = 0; i < iterations; i++) {
-      ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
-      run_timer.tic();
-      for (int j = 0; j < num_times; j++) {
-        predictor->ZeroCopyRun();
-      }
-      elapsed_time += run_timer.toc();
-
-      predicted_num += FLAGS_batch_size;
-      if (predicted_num % 100 == 0) {
-        LOG(INFO) << predicted_num << " samples";
-      }
-    }
-  }
-
-#ifdef WITH_GPERFTOOLS
-  ProfilerStop();
-#endif
-
-  auto batch_latency = elapsed_time / (iterations * num_times);
-  PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency,
-            iterations, data_type);
-
-  if (sample_latency != nullptr)
-    *sample_latency = batch_latency / FLAGS_batch_size;
-
-  if (FLAGS_record_benchmark) {
-    Benchmark benchmark;
-    benchmark.SetName(FLAGS_model_name);
-    benchmark.SetBatchSize(FLAGS_batch_size);
-    benchmark.SetLatency(batch_latency);
-    benchmark.PersistToFile("benchmark_record.txt");
-  }
-}
-
-void TestOneThreadPrediction(
-    const PaddlePredictor::Config *config,
-    const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true,
-    const VarType::Type data_type = VarType::FP32,
-    float *sample_latency = nullptr) {
-  auto predictor = CreateTestPredictor(config, use_analysis);
-  if (FLAGS_warmup) {
-    PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0, data_type);
-  }
-  PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type,
-                sample_latency);
-}
-
-void TestMultiThreadPrediction(
-    const PaddlePredictor::Config *config,
-    const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<std::vector<PaddleTensor>> *outputs, int num_threads,
-    bool use_analysis = true) {
-  std::vector<std::thread> threads;
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
-  for (int tid = 1; tid < num_threads; tid++) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
-
-  for (int tid = 0; tid < num_threads; ++tid) {
-    threads.emplace_back([&, tid]() {
-      // Each thread should have local inputs and outputs.
-      // The inputs of each thread are all the same.
-      std::vector<std::vector<PaddleTensor>> outputs_tid;
-      auto &predictor = predictors[tid];
-      if (FLAGS_warmup) {
-        PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads,
-                         tid);
-      }
-      PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid);
-    });
-  }
-  for (int i = 0; i < num_threads; ++i) {
-    threads[i].join();
-  }
-}
-
-void TestPrediction(const PaddlePredictor::Config *config,
-                    const std::vector<std::vector<PaddleTensor>> &inputs,
-                    std::vector<std::vector<PaddleTensor>> *outputs,
-                    int num_threads, bool use_analysis = FLAGS_use_analysis) {
-  PrintConfig(config, use_analysis);
-  if (num_threads == 1) {
-    TestOneThreadPrediction(config, inputs, outputs, use_analysis);
-  } else {
-    TestMultiThreadPrediction(config, inputs, outputs, num_threads,
-                              use_analysis);
-  }
-}
-
-void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8,
-                       int compared_idx) {
-  PADDLE_ENFORCE_LE(compared_idx, 2,
-                    "Compare either top1 accuracy or mAP (top5), the "
-                    "compared_idx is out of range");
-  PADDLE_ENFORCE_GE(compared_idx, 1,
-                    "Compare either top1 accuracy or mAP (top5), the "
-                    "compared_idx is out of range");
-  std::string prefix = (compared_idx == 1) ? "top1_accuracy " : "mAP ";
-  LOG(INFO) << "--- Accuracy summary --- ";
-  LOG(INFO) << "Accepted " << prefix
-            << "drop threshold: " << FLAGS_quantized_accuracy
-            << ". (condition: (FP32_" << prefix << " - INT8_" << prefix
-            << ") <= threshold)";
-  LOG(INFO) << "FP32: avg " << prefix << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc_fp32;
-  LOG(INFO) << "INT8: avg " << prefix << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc_int8;
-}
-
-void SummarizePerformance(const char *title, float sample) {
-  CHECK_GT(sample, 0.0);
-  auto throughput = 1000.0 / sample;
-  LOG(INFO) << title << ": avg fps: " << std::fixed << std::setw(6)
-            << std::setprecision(4) << throughput << ", avg latency: " << sample
-            << " ms";
-}
-
-void SummarizePerformance(float sample_latency_fp32,
-                          float sample_latency_int8) {
-  if (FLAGS_enable_fp32) SummarizePerformance("FP32", sample_latency_fp32);
-  if (FLAGS_enable_int8) SummarizePerformance("INT8", sample_latency_int8);
-}
-
-float CompareAccuracyOne(
-    const std::vector<std::vector<PaddleTensor>> &output_slots,
-    int compared_idx) {
-  if (output_slots.size() == 0)
-    throw std::invalid_argument(
-        "CompareAccuracy: output_slots vector is empty.");
-
-  float total_accs{0};
-
-  for (size_t i = 0; i < output_slots.size(); ++i) {
-    switch (compared_idx) {
-      case 1:
-        PADDLE_ENFORCE_GE(
-            output_slots[i].size(), 2UL,
-            "To achieve top 1 accuracy, output_slots_quant[i].size()>=2");
-        break;
-      case 2:
-        PADDLE_ENFORCE_GE(
-            output_slots[i].size(), 2UL,
-            "To achieve top 1 accuracy, output_slots_ref[i].size()>=2");
-        break;
-      default:
-        throw std::invalid_argument(
-            "CompareAccuracy: compared_idx is out of range.");
-    }
-
-    if (output_slots[i][compared_idx].lod.size() > 0)
-      throw std::invalid_argument("CompareAccuracy: output has nonempty LoD.");
-
-    if (output_slots[i][compared_idx].dtype != paddle::PaddleDType::FLOAT32)
-      throw std::invalid_argument(
-          "CompareAccuracy: output is of a wrong type.");
-
-    total_accs +=
-        *static_cast<float *>(output_slots[i][compared_idx].data.data());
-  }
-
-  CHECK_GT(output_slots.size(), 0);
-
-  return total_accs / output_slots.size();
-}
-
-void CompareAccuracy(
-    const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
-    const std::vector<std::vector<PaddleTensor>> &output_slots_ref,
-    int compared_idx) {
-  if ((FLAGS_enable_fp32 && FLAGS_enable_int8) &&
-      (output_slots_quant.size() == 0 || output_slots_ref.size()) == 0)
-    throw std::invalid_argument(
-        "CompareAccuracy: output_slots vector is empty.");
-
-  float avg_acc_quant = 0.0;
-  float avg_acc_ref = 0.0;
-
-  if (FLAGS_enable_int8)
-    avg_acc_quant = CompareAccuracyOne(output_slots_quant, compared_idx);
-
-  if (FLAGS_enable_fp32)
-    avg_acc_ref = CompareAccuracyOne(output_slots_ref, compared_idx);
-
-  SummarizeAccuracy(avg_acc_ref, avg_acc_quant, compared_idx);
-
-  if (FLAGS_enable_fp32) CHECK_GT(avg_acc_ref, 0.0);
-
-  if (FLAGS_enable_int8) CHECK_GT(avg_acc_quant, 0.0);
-
-  if (FLAGS_enable_fp32 && FLAGS_enable_int8)
-    CHECK_LE(avg_acc_ref - avg_acc_quant, FLAGS_quantized_accuracy);
-}
-
-void CompareDeterministic(
-    const PaddlePredictor::Config *config,
-    const std::vector<std::vector<PaddleTensor>> &inputs) {
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
-  auto predictor = CreateTestPredictor(config, FLAGS_use_analysis);
-
-  std::vector<PaddleTensor> warmup_outputs, outputs;
-  // run num_times to Compare Deterministic Result.
-  for (size_t j = 0; j < inputs.size(); j++) {
-    // warmup run
-    predictor->Run(inputs[j], &warmup_outputs, batch_size);
-    for (int i = 0; i < num_times; i++) {
-      predictor->Run(inputs[j], &outputs, batch_size);
-      CompareResult(outputs, warmup_outputs);
-    }
-  }
-}
-
-void CompareNativeAndAnalysis(
-    const PaddlePredictor::Config *config,
-    const std::vector<std::vector<PaddleTensor>> &inputs) {
-  PrintConfig(config, true);
-  std::vector<std::vector<PaddleTensor>> native_outputs, analysis_outputs;
-  TestOneThreadPrediction(config, inputs, &native_outputs, false);
-  TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
-  PADDLE_ENFORCE_GT(native_outputs.size(), 0, "Native output is empty.");
-  PADDLE_ENFORCE_GT(analysis_outputs.size(), 0, "Analysis output is empty.");
-  CompareResult(analysis_outputs.back(), native_outputs.back());
-}
-
-void CompareQuantizedAndAnalysis(
-    const AnalysisConfig *config, const AnalysisConfig *qconfig,
-    const std::vector<std::vector<PaddleTensor>> &inputs,
-    const int compared_idx = 1) {
-  PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size,
-                    "Input data has to be packed batch by batch.");
-  LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size
-            << ", warmup batch size " << FLAGS_warmup_batch_size << ".";
-
-  LOG(INFO) << "--- FP32 prediction start ---";
-  auto *cfg = reinterpret_cast<const PaddlePredictor::Config *>(config);
-  PrintConfig(cfg, true);
-  std::vector<std::vector<PaddleTensor>> analysis_outputs;
-  float sample_latency_fp32{-1};
-
-  if (FLAGS_enable_fp32) {
-    TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32,
-                            &sample_latency_fp32);
-  }
-
-  LOG(INFO) << "--- INT8 prediction start ---";
-  auto *qcfg = reinterpret_cast<const PaddlePredictor::Config *>(qconfig);
-  PrintConfig(qcfg, true);
-  std::vector<std::vector<PaddleTensor>> quantized_outputs;
-  float sample_latency_int8{-1};
-
-  if (FLAGS_enable_int8) {
-    TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true,
-                            VarType::INT8, &sample_latency_int8);
-  }
-  SummarizePerformance(sample_latency_fp32, sample_latency_int8);
-
-  CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx);
-}
-
-void CompareNativeAndAnalysis(
-    PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
-    const std::vector<std::vector<PaddleTensor>> &inputs) {
-  int batch_size = FLAGS_batch_size;
-  std::vector<PaddleTensor> native_outputs, analysis_outputs;
-  native_pred->Run(inputs[0], &native_outputs, batch_size);
-  analysis_pred->Run(inputs[0], &analysis_outputs, batch_size);
-  CompareResult(analysis_outputs, native_outputs);
-}
-
-void CompareAnalysisAndZeroCopy(
-    PaddlePredictor::Config *config, PaddlePredictor::Config *config1,
-    const std::vector<std::vector<PaddleTensor>> &inputs,
-    const std::vector<std::string> &outputs_name) {
-  int batch_size = FLAGS_batch_size;
-  // analysis
-  std::vector<PaddleTensor> analysis_outputs;
-  auto predictor = CreateTestPredictor(config, true);
-  predictor->Run(inputs[0], &analysis_outputs, batch_size);
-  // analysis + zero_copy
-  std::vector<ZeroCopyTensor> zerocopy_outputs;
-  reinterpret_cast<AnalysisConfig *>(config1)->SwitchUseFeedFetchOps(false);
-  predictor = CreateTestPredictor(config1, true);
-  ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]);
-  predictor->ZeroCopyRun();
-  for (size_t i = 0; i < outputs_name.size(); i++) {
-    ZeroCopyTensor zerocopy_output =
-        *predictor->GetOutputTensor(outputs_name[i]).get();
-    zerocopy_outputs.emplace_back(zerocopy_output);
-    LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output);
-  }
-  // compare
-  CompareResult(analysis_outputs, zerocopy_outputs);
-}
-
-void SaveOptimModel(AnalysisConfig *cfg, const std::string &dstPath) {
-  auto predictor = CreateTestPredictor(
-      reinterpret_cast<const PaddlePredictor::Config *>(cfg),
-      FLAGS_use_analysis);
-  (static_cast<AnalysisPredictor *>(predictor.get()))->SaveOptimModel(dstPath);
-}
-
-template <typename T>
-std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
-  std::stringstream ss;
-  ss << "\n---- tensor ---" << '\n';
-  ss << "lod: [";
-  for (const auto &level : tensor.lod()) {
-    ss << "[ ";
-    for (auto i : level) {
-      ss << i << ", ";
-    }
-    ss << "]";
-  }
-  ss << "]\n";
-
-  ss << "shape: [";
-  int size = 1;
-  for (int i = 0; i < tensor.dims().size(); i++) {
-    int dim = tensor.dims()[i];
-    ss << dim << ", ";
-    size *= dim;
-  }
-  ss << "]\n";
-
-  ss << "data: ";
-  for (int i = 0; i < std::min(20, size); i++) {
-    ss << tensor.data<T>()[i] << " ";
-  }
-  ss << "\n";
-
-  return ss.str();
-}
-
-static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) {
-  if (a.size() != b.size()) {
-    LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(),
-                                  b.size());
-    return false;
-  }
-  for (size_t i = 0; i < a.size(); i++) {
-    auto &al = a[i];
-    auto &bl = b[i];
-    if (al.size() != bl.size()) {
-      LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(),
-                                    bl.size());
-      return false;
-    }
-  }
-  return true;
-}
-
-static bool CompareShape(const std::vector<int64_t> &a,
-                         const std::vector<int64_t> &b) {
-  if (a.size() != b.size()) {
-    LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(),
-                                  b.size());
-    return false;
-  }
-  for (size_t i = 0; i < a.size(); i++) {
-    if (a[i] != b[i]) {
-      LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i,
-                                    a[i], b[i]);
-      return false;
-    }
-  }
-  return true;
-}
-
-static bool CompareTensorData(const framework::LoDTensor &a,
-                              const framework::LoDTensor &b) {
-  auto a_shape = framework::vectorize(a.dims());
-  auto b_shape = framework::vectorize(b.dims());
-  size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), size_t{1},
-                                  [](int a, int b) { return a * b; });
-  size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), size_t{1},
-                                  [](int a, int b) { return a * b; });
-  if (a_size != b_size) {
-    LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
-                                  a_size, b_size);
-  }
-
-  for (size_t i = 0; i < a_size; i++) {
-    if (a.type() == VarType::FP32) {
-      const auto *a_data = a.data<float>();
-      const auto *b_data = b.data<float>();
-      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
-        LOG(ERROR) << string::Sprintf(
-            "tensor data %d-th element not match, %f != %f", i, a_data[i],
-            b_data[i]);
-        return false;
-      }
-    } else if (a.type() == VarType::INT64) {
-      const auto *a_data = a.data<int64_t>();
-      const auto *b_data = b.data<int64_t>();
-      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
-        LOG(ERROR) << string::Sprintf(
-            "tensor data %d-th element not match, %f != %f", i, a_data[i],
-            b_data[i]);
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-static bool CompareTensor(const framework::LoDTensor &a,
-                          const framework::LoDTensor &b) {
-  if (!CompareLoD(a.lod(), b.lod())) {
-    return false;
-  }
-  if (!CompareShape(framework::vectorize(a.dims()),
-                    framework::vectorize(b.dims()))) {
-    return false;
-  }
-
-  if (!CompareTensorData(a, b)) {
-    return false;
-  }
-
-  return true;
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc
deleted file mode 100644
index 35be7db560a214150a4b9d99ca650938fcf72cff..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
-
-namespace paddle {
-namespace inference {
-
-TEST(TensorRT, cascade_rcnn) {
-  std::string model_dir = FLAGS_infer_model + "/cascade_rcnn";
-  AnalysisConfig config;
-  int batch_size = 1;
-  config.EnableUseGpu(100, 0);
-  config.SetModel(model_dir + "/model", model_dir + "/params");
-  config.SwitchUseFeedFetchOps(false);
-  config.EnableTensorRtEngine(1 << 30, batch_size, 40,
-                              AnalysisConfig::Precision::kFloat32, false);
-
-  auto predictor = CreatePaddlePredictor(config);
-
-  int channels = 3;
-  int height = 640;
-  int width = 640;
-  int input_num = batch_size * channels * height * width;
-  float *input = new float[input_num];
-  memset(input, 1.0, input_num * sizeof(float));
-
-  float *im_shape = new float[3];
-  im_shape[0] = 3.0;
-  im_shape[1] = 640.0;
-  im_shape[2] = 640.0;
-
-  auto input_names = predictor->GetInputNames();
-
-  auto input_t = predictor->GetInputTensor(input_names[0]);
-  input_t->Reshape({batch_size, channels, height, width});
-  input_t->copy_from_cpu(input);
-
-  auto input_t1 = predictor->GetInputTensor(input_names[1]);
-  input_t1->Reshape({batch_size, 3});
-  input_t1->copy_from_cpu(im_shape);
-
-  ASSERT_TRUE(predictor->ZeroCopyRun());
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc
deleted file mode 100644
index 2ee75f90b441f7d13cd50908078eaf925332dde6..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
-
-namespace paddle {
-namespace inference {
-
-TEST(TensorRT_fc, compare) {
-  std::string model_dir = FLAGS_infer_model + "/fc_uint8";
-  compare(model_dir, /* use_tensorrt */ true);
-  // Open it when need.
-  // profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
-}
-
-TEST(ZeroCopyTensor, uint8) {
-  std::string model_dir = FLAGS_infer_model + "/" + "fc_uint8";
-  AnalysisConfig config;
-  config.EnableUseGpu(100, 0);
-  config.SetModel(model_dir);
-  config.SwitchUseFeedFetchOps(false);
-  config.EnableProfile();
-
-  std::vector<std::vector<PaddleTensor>> inputs_all;
-  auto predictor = CreatePaddlePredictor(config);
-  auto input_names = predictor->GetInputNames();
-  auto name2shape = predictor->GetInputTensorShape();
-
-  int batch_size = 1;
-  int length = 4;
-  int input_num = batch_size * length;
-  uint8_t *input = new uint8_t[input_num];
-  memset(input, 1, input_num * sizeof(uint8_t));
-  auto input_t = predictor->GetInputTensor(input_names[0]);
-  input_t->Reshape({batch_size, length});
-  input_t->copy_from_cpu(input);
-  input_t->type();
-
-  ASSERT_TRUE(predictor->ZeroCopyRun());
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
deleted file mode 100644
index 1dbdcccf41ba3a42dd21982cd9fac86f5e767382..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
-
-namespace paddle {
-namespace inference {
-
-TEST(TensorRT_mobilenet, compare) {
-  std::string model_dir = FLAGS_infer_model + "/mobilenet";
-  compare(model_dir, /* use_tensorrt */ true);
-  // Open it when need.
-  // profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
-}
-
-TEST(AnalysisPredictor, use_gpu) {
-  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
-  AnalysisConfig config;
-  config.EnableUseGpu(100, 0);
-  config.EnableCUDNN();
-  config.SetModel(model_dir);
-  config.pass_builder()->TurnOnDebug();
-
-  std::vector<std::vector<PaddleTensor>> inputs_all;
-  auto predictor = CreatePaddlePredictor(config);
-  SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
-
-  std::vector<PaddleTensor> outputs;
-  for (auto& input : inputs_all) {
-    ASSERT_TRUE(predictor->Run(input, &outputs));
-  }
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc
deleted file mode 100644
index 9f70a58a0c04451bdc0d4f11a5daa8a865881757..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
-
-namespace paddle {
-namespace inference {
-
-TEST(resnet50, compare_continuous_input) {
-  std::string model_dir = FLAGS_infer_model + "/resnet50";
-  compare_continuous_input(model_dir, /* use_tensorrt */ true);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_resnext_test.cc b/paddle/fluid/inference/tests/api/trt_resnext_test.cc
deleted file mode 100644
index 588b5bffd74e540aa9f3bbdd4f5496c520290aa4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/trt_resnext_test.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
-
-namespace paddle {
-namespace inference {
-
-TEST(TensorRT_resnext50, compare) {
-  std::string model_dir = FLAGS_infer_model + "/resnext50";
-  compare(model_dir, /* use_tensorrt */ true);
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_test_helper.h b/paddle/fluid/inference/tests/api/trt_test_helper.h
deleted file mode 100644
index ee3ba63bb2ca6854564dc60ba96c235035a26216..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/trt_test_helper.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <string>
-#include <vector>
-
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/inference/tests/api/tester_helper.h"
-
-namespace paddle {
-namespace inference {
-
-DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine.");
-DEFINE_string(prog_filename, "", "Name of model file.");
-DEFINE_string(param_filename, "", "Name of parameters file.");
-
-template <typename ConfigType>
-void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu,
-               bool use_tensorrt = false, int batch_size = -1) {
-  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
-    config->prog_file = model_dir + "/" + FLAGS_prog_filename;
-    config->param_file = model_dir + "/" + FLAGS_param_filename;
-  } else {
-    config->model_dir = model_dir;
-  }
-  if (use_gpu) {
-    config->use_gpu = true;
-    config->device = 0;
-    config->fraction_of_gpu_memory = 0.15;
-  }
-}
-
-template <>
-void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
-                               bool use_gpu, bool use_tensorrt,
-                               int batch_size) {
-  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
-    config->SetModel(model_dir + "/" + FLAGS_prog_filename,
-                     model_dir + "/" + FLAGS_param_filename);
-  } else {
-    config->SetModel(model_dir);
-  }
-  if (use_gpu) {
-    config->EnableUseGpu(100, 0);
-    if (use_tensorrt) {
-      config->EnableTensorRtEngine(1 << 10, batch_size, 3,
-                                   AnalysisConfig::Precision::kFloat32, false);
-      config->pass_builder()->DeletePass("conv_bn_fuse_pass");
-      config->pass_builder()->DeletePass("fc_fuse_pass");
-      config->pass_builder()->TurnOnDebug();
-    } else {
-      config->EnableCUDNN();
-      config->SwitchIrOptim();
-    }
-  }
-}
-
-void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
-  std::vector<std::vector<PaddleTensor>> inputs_all;
-  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
-    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
-                      FLAGS_param_filename);
-  } else {
-    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
-  }
-
-  std::vector<std::vector<PaddleTensor>> outputs;
-  if (use_analysis || use_tensorrt) {
-    AnalysisConfig config;
-    config.EnableUseGpu(100, 0);
-    config.pass_builder()->TurnOnDebug();
-    SetConfig<AnalysisConfig>(&config, model_dir, true, use_tensorrt,
-                              FLAGS_batch_size);
-    TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
-                   inputs_all, &outputs, FLAGS_num_threads, true);
-  } else {
-    NativeConfig config;
-    SetConfig<NativeConfig>(&config, model_dir, true, false);
-    TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
-                   inputs_all, &outputs, FLAGS_num_threads, false);
-  }
-}
-
-void compare(std::string model_dir, bool use_tensorrt) {
-  std::vector<std::vector<PaddleTensor>> inputs_all;
-  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
-    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
-                      FLAGS_param_filename);
-  } else {
-    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
-  }
-
-  AnalysisConfig analysis_config;
-  SetConfig<AnalysisConfig>(&analysis_config, model_dir, true, use_tensorrt,
-                            FLAGS_batch_size);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config),
-      inputs_all);
-}
-
-void compare_continuous_input(std::string model_dir, bool use_tensorrt) {
-  AnalysisConfig analysis_config;
-  SetConfig<AnalysisConfig>(&analysis_config, model_dir, true, use_tensorrt,
-                            FLAGS_batch_size);
-  auto config =
-      reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config);
-  auto native_pred = CreateTestPredictor(config, false);
-  auto analysis_pred = CreateTestPredictor(config, true);
-  for (int i = 0; i < 20; i++) {
-    std::vector<std::vector<PaddleTensor>> inputs_all;
-    if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
-      SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
-                        FLAGS_param_filename, nullptr, i);
-    } else {
-      SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "", nullptr,
-                        i);
-    }
-    CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(),
-                             inputs_all);
-  }
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
deleted file mode 100644
index 874727943c2b2cd0824ce8c5386a96b7215ca501..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-function(inference_test TARGET_NAME)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs ARGS)
-  cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-  set(arg_list "")
-  if(inference_test_ARGS)
-    foreach(arg ${inference_test_ARGS})
-      list(APPEND arg_list "_${arg}")
-    endforeach()
-  else()
-    list(APPEND arg_list "_")
-  endif()
-  foreach(arg ${arg_list})
-    string(REGEX REPLACE "^_$" "" arg "${arg}")
-    cc_test(test_inference_${TARGET_NAME}${arg}
-        SRCS test_inference_${TARGET_NAME}.cc
-        DEPS paddle_fluid_origin
-        ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
-    set_tests_properties(test_inference_${TARGET_NAME}${arg}
-            PROPERTIES DEPENDS test_${TARGET_NAME})
-    set_tests_properties(test_inference_${TARGET_NAME}${arg}
-            PROPERTIES LABELS "RUN_TYPE=DIST")
-  endforeach()
-endfunction(inference_test)
-
-####################
-# Inference tests here depend on fluid/tests/book. If users want to run
-# individual test with ctest, they need to run tests in fluid/tests/book
-# first to generate saved model.
-####################
-# This unittest is buggy!
-#inference_test(fit_a_line)
-inference_test(image_classification ARGS vgg resnet)
-inference_test(label_semantic_roles)
-inference_test(recognize_digits ARGS mlp conv)
-inference_test(recommender_system)
-#inference_test(rnn_encoder_decoder)
-#inference_test(understand_sentiment ARGS conv)
-inference_test(word2vec)
-
-# This is an unly work around to make this test run
-# TODO(TJ): clean me up
-cc_test(test_inference_nlp
-  SRCS test_inference_nlp.cc
-  DEPS paddle_fluid_origin
-  ARGS
-  --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
-set_tests_properties(test_inference_nlp PROPERTIES LABELS "RUN_TYPE=DIST")
diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
deleted file mode 100644
index 2c5b66a32903f4ffdedb074b31aec53ae6cacaf3..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-#include "paddle/fluid/inference/tests/test_multi_thread_helper.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-TEST(inference, fit_a_line) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  for (int num_threads : {1, 2}) {
-    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
-    cpu_feeds.resize(num_threads);
-    for (int i = 0; i < num_threads; ++i) {
-      auto* input = new paddle::framework::LoDTensor();
-      // The second dim of the input tensor should be 13
-      // The input data should be >= 0
-      int64_t batch_size = 10;
-      SetupTensor<float>(input, {batch_size, 13}, static_cast<float>(0),
-                         static_cast<float>(10));
-      cpu_feeds[i].push_back(input);
-    }
-
-    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
-    cpu_fetchs1.resize(num_threads);
-    for (int i = 0; i < num_threads; ++i) {
-      auto* output = new paddle::framework::LoDTensor();
-      cpu_fetchs1[i].push_back(output);
-    }
-
-    // Run inference on CPU
-    LOG(INFO) << "--- CPU Runs (num_threads: " << num_threads << "): ---";
-    if (num_threads == 1) {
-      TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds[0],
-                                                cpu_fetchs1[0]);
-    } else {
-      TestMultiThreadInference<paddle::platform::CPUPlace>(
-          dirname, cpu_feeds, cpu_fetchs1, num_threads);
-    }
-
-#ifdef PADDLE_WITH_CUDA
-    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
-    cpu_fetchs2.resize(num_threads);
-    for (int i = 0; i < num_threads; ++i) {
-      auto* output = new paddle::framework::LoDTensor();
-      cpu_fetchs2[i].push_back(output);
-    }
-
-    // Run inference on CUDA GPU
-    LOG(INFO) << "--- GPU Runs (num_threads: " << num_threads << "): ---";
-    if (num_threads == 1) {
-      TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds[0],
-                                                 cpu_fetchs2[0]);
-    } else {
-      TestMultiThreadInference<paddle::platform::CUDAPlace>(
-          dirname, cpu_feeds, cpu_fetchs2, num_threads);
-    }
-
-    for (int i = 0; i < num_threads; ++i) {
-      CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
-      delete cpu_fetchs2[i][0];
-    }
-#endif
-
-    for (int i = 0; i < num_threads; ++i) {
-      delete cpu_feeds[i][0];
-      delete cpu_fetchs1[i][0];
-    }
-  }  // num_threads-loop
-}
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
deleted file mode 100644
index 60c761c5281e2f535aab0200c93fb738addcdb87..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model.");
-DEFINE_int32(batch_size, 1, "Batch size of input data");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times");
-DEFINE_bool(skip_cpu, false, "Skip the cpu test");
-
-TEST(inference, image_classification) {
-  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
-                  "--batch_size=1 --repeat=1";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  const bool is_combined = false;
-  std::vector<std::vector<int64_t>> feed_target_shapes =
-      GetFeedTargetShapes(dirname, is_combined);
-
-  paddle::framework::LoDTensor input;
-  // Use normilized image pixels as input data,
-  // which should be in the range [0.0, 1.0].
-  feed_target_shapes[0][0] = FLAGS_batch_size;
-  paddle::framework::DDim input_dims =
-      paddle::framework::make_ddim(feed_target_shapes[0]);
-  LOG(INFO) << input_dims;
-  SetupTensor<float>(&input, input_dims, static_cast<float>(0),
-                     static_cast<float>(1));
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
-
-  paddle::framework::LoDTensor output1;
-  if (!FLAGS_skip_cpu) {
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-    cpu_fetchs1.push_back(&output1);
-
-    // Run inference on CPU
-    LOG(INFO) << "--- CPU Runs: ---";
-    LOG(INFO) << "Batch size is " << FLAGS_batch_size;
-    TestInference<paddle::platform::CPUPlace, false, true>(
-        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
-    LOG(INFO) << output1.dims();
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  LOG(INFO) << "--- GPU Runs: ---";
-  LOG(INFO) << "Batch size is " << FLAGS_batch_size;
-  TestInference<paddle::platform::CUDAPlace, false, true>(
-      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
-  LOG(INFO) << output2.dims();
-
-  if (!FLAGS_skip_cpu) {
-    CheckError<float>(output1, output2);
-  }
-
-  // float16 inference requires cuda GPUs with >= 5.3 compute capability
-  if (!FLAGS_fp16_dirname.empty() &&
-      paddle::platform::GetCUDAComputeCapability(0) >= 53) {
-    paddle::framework::LoDTensor output3;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs3;
-    cpu_fetchs3.push_back(&output3);
-
-    LOG(INFO) << "--- GPU Runs in float16 mode: ---";
-    LOG(INFO) << "Batch size is " << FLAGS_batch_size;
-
-    TestInference<paddle::platform::CUDAPlace, false, true>(
-        FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat);
-
-    CheckError<float>(output2, output3);
-  }
-#endif
-}
diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
deleted file mode 100644
index 84bb855fea5fa397ff71e2c922fea3302951b7ca..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-TEST(inference, label_semantic_roles) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1,
-      ctx_p2, mark;
-  paddle::framework::LoD lod{{0, 4, 10}};
-  int64_t word_dict_len = 44068;
-  int64_t predicate_dict_len = 3162;
-  int64_t mark_dict_len = 2;
-
-  SetupLoDTensor(&word, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(&predicate, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(predicate_dict_len - 1));
-  SetupLoDTensor(&ctx_n2, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(&ctx_n1, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(&ctx_0, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(&ctx_p1, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(&ctx_p2, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(&mark, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(mark_dict_len - 1));
-
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&word);
-  cpu_feeds.push_back(&predicate);
-  cpu_feeds.push_back(&ctx_n2);
-  cpu_feeds.push_back(&ctx_n1);
-  cpu_feeds.push_back(&ctx_0);
-  cpu_feeds.push_back(&ctx_p1);
-  cpu_feeds.push_back(&ctx_p2);
-  cpu_feeds.push_back(&mark);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
-#endif
-}
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
deleted file mode 100644
index 5c1204b9e6b78e42d999b12a2b7be6f822ecf818..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <time.h>
-#include <fstream>
-#include <thread>  // NOLINT
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-
-#include "paddle/fluid/framework/feed_fetch_method.h"
-
-DEFINE_string(model_path, "", "Directory of the inference model.");
-DEFINE_string(data_file, "", "File of input index data.");
-DEFINE_int32(repeat, 100, "Running the inference program repeat times");
-DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
-DEFINE_int32(num_threads, 1, "Number of threads should be used");
-DECLARE_bool(use_mkldnn);
-DECLARE_int32(paddle_num_threads);
-
-inline double GetCurrentMs() {
-  struct timeval time;
-  gettimeofday(&time, NULL);
-  return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
-}
-
-// This function just give dummy data for recognize_digits model.
-size_t DummyData(std::vector<paddle::framework::LoDTensor>* out) {
-  paddle::framework::LoDTensor input;
-  SetupTensor<float>(&input, {1, 1, 28, 28}, -1.f, 1.f);
-  out->emplace_back(input);
-  return 1;
-}
-
-// Load the input word index data from file and save into LodTensor.
-// Return the size of words.
-size_t LoadData(std::vector<paddle::framework::LoDTensor>* out,
-                const std::string& filename) {
-  if (filename.empty()) {
-    return DummyData(out);
-  }
-
-  size_t sz = 0;
-  std::fstream fin(filename);
-  std::string line;
-  out->clear();
-  while (getline(fin, line)) {
-    std::istringstream iss(line);
-    std::vector<int64_t> ids;
-    std::string field;
-    while (getline(iss, field, ' ')) {
-      ids.push_back(stoi(field));
-    }
-    if (ids.size() >= 1024) {
-      // Synced with NLP guys, they will ignore input larger then 1024
-      continue;
-    }
-
-    paddle::framework::LoDTensor words;
-    paddle::framework::LoD lod{{0, ids.size()}};
-    words.set_lod(lod);
-    int64_t* pdata = words.mutable_data<int64_t>(
-        {static_cast<int64_t>(ids.size()), 1}, paddle::platform::CPUPlace());
-    memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t));
-    out->emplace_back(words);
-    sz += ids.size();
-  }
-  return sz;
-}
-
-// Split input data samples into small pieces jobs as balanced as possible,
-// according to the number of threads.
-void SplitData(
-    const std::vector<paddle::framework::LoDTensor>& datasets,
-    std::vector<std::vector<const paddle::framework::LoDTensor*>>* jobs,
-    const int num_threads) {
-  size_t s = 0;
-  jobs->resize(num_threads);
-  while (s < datasets.size()) {
-    for (auto it = jobs->begin(); it != jobs->end(); it++) {
-      it->emplace_back(&datasets[s]);
-      s++;
-      if (s >= datasets.size()) {
-        break;
-      }
-    }
-  }
-}
-
-void ThreadRunInfer(
-    const int tid, paddle::framework::Scope* scope,
-    const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
-  // maybe framework:ProgramDesc is not thread-safe
-  paddle::platform::CPUPlace place;
-  paddle::framework::Executor executor(place);
-  auto& sub_scope = scope->NewScope();
-  auto inference_program =
-      paddle::inference::Load(&executor, scope, FLAGS_model_path);
-
-  auto ctx = executor.Prepare(*inference_program, /*block_id*/ 0);
-  executor.CreateVariables(*inference_program, &sub_scope, /*block_id*/ 0);
-
-  const std::vector<std::string>& feed_target_names =
-      inference_program->GetFeedTargetNames();
-  const std::vector<std::string>& fetch_target_names =
-      inference_program->GetFetchTargetNames();
-
-  PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  paddle::framework::LoDTensor outtensor;
-  fetch_targets[fetch_target_names[0]] = &outtensor;
-
-  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
-  PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
-
-  // map the data of feed_targets to feed_holder
-  for (auto* op : inference_program->Block(0).AllOps()) {
-    if (op->Type() == "feed") {
-      std::string feed_target_name = op->Output("Out")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      paddle::framework::SetFeedVariable(scope, *feed_targets[feed_target_name],
-                                         "feed", idx);
-    }
-  }
-
-  auto& inputs = jobs[tid];
-  auto start_ms = GetCurrentMs();
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    feed_targets[feed_target_names[0]] = inputs[i];
-    executor.RunPreparedContext(ctx.get(), &sub_scope,
-                                false /*create_local_scope*/);
-  }
-  auto stop_ms = GetCurrentMs();
-
-  // obtain the data of fetch_targets from fetch_holder
-  for (auto* op : inference_program->Block(0).AllOps()) {
-    if (op->Type() == "fetch") {
-      std::string fetch_target_name = op->Input("X")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      *fetch_targets[fetch_target_name] =
-          paddle::framework::GetFetchVariable(*scope, "fetch", idx);
-    }
-  }
-
-  scope->DeleteScope(&sub_scope);
-  LOG(INFO) << "Tid: " << tid << ", process " << inputs.size()
-            << " samples, avg time per sample: "
-            << (stop_ms - start_ms) / inputs.size() << " ms";
-}
-
-TEST(inference, nlp) {
-  if (FLAGS_model_path.empty()) {
-    LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model";
-  }
-  if (FLAGS_data_file.empty()) {
-    LOG(WARNING) << "No data file provided, will use dummy data!"
-                 << "Note: if you use nlp model, please provide data file.";
-  }
-  LOG(INFO) << "Model Path: " << FLAGS_model_path;
-  LOG(INFO) << "Data File: " << FLAGS_data_file;
-
-  std::vector<paddle::framework::LoDTensor> datasets;
-  size_t num_total_words = LoadData(&datasets, FLAGS_data_file);
-  LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size();
-  LOG(INFO) << "Total number of words: " << num_total_words;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  std::unique_ptr<paddle::framework::Scope> scope(
-      new paddle::framework::Scope());
-
-  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
-
-  double start_ms = 0, stop_ms = 0;
-  if (FLAGS_num_threads > 1) {
-    std::vector<std::vector<const paddle::framework::LoDTensor*>> jobs;
-    SplitData(datasets, &jobs, FLAGS_num_threads);
-    std::vector<std::unique_ptr<std::thread>> threads;
-    start_ms = GetCurrentMs();
-    for (int i = 0; i < FLAGS_num_threads; ++i) {
-      threads.emplace_back(
-          new std::thread(ThreadRunInfer, i, scope.get(), std::ref(jobs)));
-    }
-    for (int i = 0; i < FLAGS_num_threads; ++i) {
-      threads[i]->join();
-    }
-    stop_ms = GetCurrentMs();
-  } else {
-    // 1. Define place, executor, scope
-    paddle::platform::CPUPlace place;
-    paddle::framework::Executor executor(place);
-
-    // 2. Initialize the inference_program and load parameters
-    std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
-    inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path,
-                                    /*model combined*/ false);
-    // always prepare context
-    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
-    ctx = executor.Prepare(*inference_program, 0);
-    if (FLAGS_prepare_vars) {
-      executor.CreateVariables(*inference_program, scope.get(), 0);
-    }
-    // preapre fetch
-    const std::vector<std::string>& fetch_target_names =
-        inference_program->GetFetchTargetNames();
-    PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
-    std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-    paddle::framework::LoDTensor outtensor;
-    fetch_targets[fetch_target_names[0]] = &outtensor;
-
-    // prepare feed
-    const std::vector<std::string>& feed_target_names =
-        inference_program->GetFeedTargetNames();
-    PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
-    std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
-
-    // feed data and run
-    start_ms = GetCurrentMs();
-    for (size_t i = 0; i < datasets.size(); ++i) {
-      feed_targets[feed_target_names[0]] = &(datasets[i]);
-      executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets,
-                                  &fetch_targets, !FLAGS_prepare_vars);
-    }
-    stop_ms = GetCurrentMs();
-    LOG(INFO) << "Tid: 0, process " << datasets.size()
-              << " samples, avg time per sample: "
-              << (stop_ms - start_ms) / datasets.size() << " ms";
-  }
-  LOG(INFO) << "Total inference time with " << FLAGS_num_threads
-            << " threads : " << (stop_ms - start_ms) / 1000.0
-            << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000);
-}
diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
deleted file mode 100644
index f12828a2685305c20d26492dbf04fa9ddacf9317..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-DEFINE_int32(batch_size, 1, "Batch size of input data");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times");
-
-TEST(inference, recognize_digits) {
-  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
-                  "--batch_size=1 --repeat=1";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  paddle::framework::LoDTensor input;
-  // Use normilized image pixels as input data,
-  // which should be in the range [-1.0, 1.0].
-  SetupTensor<float>(&input, {FLAGS_batch_size, 1, 28, 28},
-                     static_cast<float>(-1), static_cast<float>(1));
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
-
-  for (auto is_combined : {false, true}) {
-    paddle::framework::LoDTensor output1;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-    cpu_fetchs1.push_back(&output1);
-
-    // Run inference on CPU
-    LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
-    TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
-                                              FLAGS_repeat, is_combined);
-    LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-    paddle::framework::LoDTensor output2;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-    cpu_fetchs2.push_back(&output2);
-
-    // Run inference on CUDA GPU
-    LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
-    TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
-                                               FLAGS_repeat, is_combined);
-    LOG(INFO) << output2.dims();
-
-    CheckError<float>(output1, output2);
-#endif
-  }
-}
diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
deleted file mode 100644
index 70aa6b194d4417fc85384cc3f615089f024f928e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-TEST(inference, recommender_system) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  int64_t batch_size = 1;
-
-  paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id,
-      category_id, movie_title;
-
-  // Use the first data from paddle.dataset.movielens.test() as input
-  std::vector<int64_t> user_id_data = {1};
-  SetupTensor<int64_t>(&user_id, {batch_size, 1}, user_id_data);
-
-  std::vector<int64_t> gender_id_data = {1};
-  SetupTensor<int64_t>(&gender_id, {batch_size, 1}, gender_id_data);
-
-  std::vector<int64_t> age_id_data = {0};
-  SetupTensor<int64_t>(&age_id, {batch_size, 1}, age_id_data);
-
-  std::vector<int64_t> job_id_data = {10};
-  SetupTensor<int64_t>(&job_id, {batch_size, 1}, job_id_data);
-
-  std::vector<int64_t> movie_id_data = {783};
-  SetupTensor<int64_t>(&movie_id, {batch_size, 1}, movie_id_data);
-
-  std::vector<int64_t> category_id_data = {10, 8, 9};
-  SetupLoDTensor<int64_t>(&category_id, {3, 1}, {{0, 3}}, category_id_data);
-
-  std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
-  SetupLoDTensor<int64_t>(&movie_title, {5, 1}, {{0, 5}}, movie_title_data);
-
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&user_id);
-  cpu_feeds.push_back(&gender_id);
-  cpu_feeds.push_back(&age_id);
-  cpu_feeds.push_back(&job_id);
-  cpu_feeds.push_back(&movie_id);
-  cpu_feeds.push_back(&category_id);
-  cpu_feeds.push_back(&movie_title);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
-#endif
-}
diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
deleted file mode 100644
index e15c3f59acb1eac535120554a3799c37e9d4e951..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-TEST(inference, rnn_encoder_decoder) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  paddle::framework::LoDTensor word_data, trg_word;
-  paddle::framework::LoD lod{{0, 4, 10}};
-
-  SetupLoDTensor(&word_data, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(1));
-  SetupLoDTensor(&trg_word, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(1));
-
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&word_data);
-  cpu_feeds.push_back(&trg_word);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
-#endif
-}
diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
deleted file mode 100644
index 0dbb6a30405eb64133613052ad57b1f705a9e7b4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-TEST(inference, understand_sentiment) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  paddle::framework::LoDTensor words;
-  paddle::framework::LoD lod{{0, 4, 10}};
-  int64_t word_dict_len = 5147;
-
-  SetupLoDTensor(&words, lod, static_cast<int64_t>(0),
-                 static_cast<int64_t>(word_dict_len - 1));
-
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&words);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
-#endif
-}
diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
deleted file mode 100644
index c9328eb21b4fdb06c5f65ba0f7337b1e79fa1927..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/inference/tests/test_helper.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-TEST(inference, word2vec) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word;
-  paddle::framework::LoD lod{{0, 1}};
-  int64_t dict_size = 2073;  // The size of dictionary
-
-  SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
-
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&first_word);
-  cpu_feeds.push_back(&second_word);
-  cpu_feeds.push_back(&third_word);
-  cpu_feeds.push_back(&fourth_word);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
-#endif
-}
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
deleted file mode 100644
index 444bab1b33df063221828fe6c1457e2af672e652..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/test.cmake
+++ /dev/null
@@ -1,82 +0,0 @@
-include(ExternalProject)
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
-set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
-    "A path setting inference demo download directories.")
-set(CPU_NUM_THREADS_ON_CI 4 CACHE STRING "Run multi-threads on CI to reduce CI time.")
-
-function(inference_download INSTALL_DIR URL FILENAME)
-  message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
-  string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
-  ExternalProject_Add(
-      extern_inference_download_${FILENAME_EX}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      PREFIX                ${INSTALL_DIR}
-      URL                   ${URL}/${FILENAME}
-      DOWNLOAD_COMMAND      wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
-      DOWNLOAD_DIR          ${INSTALL_DIR}
-      DOWNLOAD_NO_PROGRESS  1
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ""
-      UPDATE_COMMAND        ""
-      INSTALL_COMMAND       ""
-  )
-endfunction()
-
-function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
-  message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
-  string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
-  set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}")
-  set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
-  ExternalProject_Add(
-      ${EXTERNAL_PROJECT_NAME}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      PREFIX                ${INSTALL_DIR}
-      DOWNLOAD_COMMAND      wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
-                            ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
-      DOWNLOAD_DIR          ${INSTALL_DIR}
-      DOWNLOAD_NO_PROGRESS  1
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ""
-      UPDATE_COMMAND        ""
-      INSTALL_COMMAND       ""
-  )
-endfunction()
-
-set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
-if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32)
-  inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
-endif()
-set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
-
-function (inference_base_test_build TARGET)
-   set(options "")
-   set(oneValueArgs "")
-   set(multiValueArgs SRCS DEPS)
-   cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-   cc_test_build(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS})
-endfunction()
-
-function (inference_base_test_run TARGET)
-   set(options "")
-   set(oneValueArgs "")
-   set(multiValueArgs COMMAND ARGS)
-   cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-   if(WITH_GPU)
-       set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
-   endif()
-   cc_test_run(${TARGET} COMMAND ${base_test_COMMAND} ARGS ${mem_opt} ${base_test_ARGS})
-endfunction()
-
-function (inference_base_test TARGET)
-   set(options "")
-   set(oneValueArgs "")
-   set(multiValueArgs SRCS ARGS DEPS)
-   cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-   inference_base_test_build(${TARGET}
-	   SRCS ${base_test_SRCS}
-	   DEPS ${base_test_DEPS})
-   inference_base_test_run(${TARGET}
-	   COMMAND ${TARGET}
-	   ARGS ${base_test_ARGS})
-endfunction()
-
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
deleted file mode 100644
index 861f69f4d2143b16bdec546d92ce7bd13ca53ed3..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/test_helper.h
+++ /dev/null
@@ -1,252 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <map>
-#include <random>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DECLARE_bool(use_mkldnn);
-
-template <typename T>
-void SetupTensor(paddle::framework::LoDTensor* input,
-                 paddle::framework::DDim dims, T lower, T upper) {
-  static unsigned int seed = 100;
-  std::mt19937 rng(seed++);
-  std::uniform_real_distribution<double> uniform_dist(0, 1);
-
-  T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
-  for (int i = 0; i < input->numel(); ++i) {
-    input_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-  }
-}
-
-template <typename T>
-void SetupTensor(paddle::framework::LoDTensor* input,
-                 paddle::framework::DDim dims, const std::vector<T>& data) {
-  CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size()));
-  T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
-  memcpy(input_ptr, data.data(), input->numel() * sizeof(T));
-}
-
-template <typename T>
-void SetupLoDTensor(paddle::framework::LoDTensor* input,
-                    const paddle::framework::LoD& lod, T lower, T upper) {
-  input->set_lod(lod);
-  int dim = lod[0][lod[0].size() - 1];
-  SetupTensor<T>(input, {dim, 1}, lower, upper);
-}
-
-template <typename T>
-void SetupLoDTensor(paddle::framework::LoDTensor* input,
-                    paddle::framework::DDim dims,
-                    const paddle::framework::LoD lod,
-                    const std::vector<T>& data) {
-  const size_t level = lod.size() - 1;
-  CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back()));
-  input->set_lod(lod);
-  SetupTensor<T>(input, dims, data);
-}
-
-template <typename T>
-void CheckError(const paddle::framework::LoDTensor& output1,
-                const paddle::framework::LoDTensor& output2) {
-  // Check lod information
-  EXPECT_EQ(output1.lod(), output2.lod());
-
-  EXPECT_EQ(output1.dims(), output2.dims());
-  EXPECT_EQ(output1.numel(), output2.numel());
-
-  T err = static_cast<T>(0);
-  if (typeid(T) == typeid(float)) {
-    err = 1E-3;
-  } else if (typeid(T) == typeid(double)) {
-    err = 1E-6;
-  } else {
-    err = 0;
-  }
-
-  size_t count = 0;
-  for (int64_t i = 0; i < output1.numel(); ++i) {
-    if (fabs(output1.data<T>()[i] - output2.data<T>()[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
-}
-
-std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
-    paddle::framework::Executor* executor, paddle::framework::Scope* scope,
-    const std::string& dirname, const bool is_combined = false,
-    const std::string& prog_filename = "__model_combined__",
-    const std::string& param_filename = "__params_combined__") {
-  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
-  if (is_combined) {
-    // All parameters are saved in a single file.
-    // Hard-coding the file names of program and parameters in unittest.
-    // The file names should be consistent with that used in Python API
-    //  `fluid.io.save_inference_model`.
-    inference_program =
-        paddle::inference::Load(executor, scope, dirname + "/" + prog_filename,
-                                dirname + "/" + param_filename);
-  } else {
-    // Parameters are saved in separate files sited in the specified
-    // `dirname`.
-    inference_program = paddle::inference::Load(executor, scope, dirname);
-  }
-  return inference_program;
-}
-
-std::vector<std::vector<int64_t>> GetFeedTargetShapes(
-    const std::string& dirname, const bool is_combined = false,
-    const std::string& prog_filename = "__model_combined__",
-    const std::string& param_filename = "__params_combined__") {
-  auto place = paddle::platform::CPUPlace();
-  auto executor = paddle::framework::Executor(place);
-  auto* scope = new paddle::framework::Scope();
-
-  auto inference_program = InitProgram(&executor, scope, dirname, is_combined,
-                                       prog_filename, param_filename);
-  auto& global_block = inference_program->Block(0);
-
-  const std::vector<std::string>& feed_target_names =
-      inference_program->GetFeedTargetNames();
-  std::vector<std::vector<int64_t>> feed_target_shapes;
-  for (size_t i = 0; i < feed_target_names.size(); ++i) {
-    auto* var = global_block.FindVar(feed_target_names[i]);
-    std::vector<int64_t> var_shape = var->GetShape();
-    feed_target_shapes.push_back(var_shape);
-  }
-
-  delete scope;
-  return feed_target_shapes;
-}
-
-template <typename Place, bool CreateVars = true, bool PrepareContext = false>
-void TestInference(const std::string& dirname,
-                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
-                   const int repeat = 1, const bool is_combined = false) {
-  // 1. Define place, executor, scope
-  auto place = Place();
-  auto executor = paddle::framework::Executor(place);
-  auto* scope = new paddle::framework::Scope();
-
-  // Profile the performance
-  paddle::platform::ProfilerState state;
-  if (paddle::platform::is_cpu_place(place)) {
-    state = paddle::platform::ProfilerState::kCPU;
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    state = paddle::platform::ProfilerState::kAll;
-    // The default device_id of paddle::platform::CUDAPlace is 0.
-    // Users can get the device_id using:
-    //   int device_id = place.GetDeviceId();
-    paddle::platform::SetDeviceId(0);
-#else
-    PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
-#endif
-  }
-
-  // 2. Initialize the inference_program and load parameters
-  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
-
-  // Enable the profiler
-  paddle::platform::EnableProfiler(state);
-  {
-    paddle::platform::RecordEvent record_event("init_program");
-    inference_program = InitProgram(&executor, scope, dirname, is_combined);
-  }
-
-  // Disable the profiler and print the timing information
-  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
-                                    "load_program_profiler");
-  paddle::platform::ResetProfiler();
-
-  // 3. Get the feed_target_names and fetch_target_names
-  const std::vector<std::string>& feed_target_names =
-      inference_program->GetFeedTargetNames();
-  const std::vector<std::string>& fetch_target_names =
-      inference_program->GetFetchTargetNames();
-
-  // 4. Prepare inputs: set up maps for feed targets
-  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
-  for (size_t i = 0; i < feed_target_names.size(); ++i) {
-    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
-    feed_targets[feed_target_names[i]] = cpu_feeds[i];
-  }
-
-  // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
-    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
-  }
-
-  // 6. If export Flags_use_mkldnn=True, use mkldnn related ops.
-  if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program);
-
-  // 7. Run the inference program
-  {
-    if (!CreateVars) {
-      // If users don't want to create and destroy variables every time they
-      // run, they need to set `create_vars` to false and manually call
-      // `CreateVariables` before running.
-      executor.CreateVariables(*inference_program, scope, 0);
-    }
-
-    // Ignore the profiling results of the first run
-    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
-    bool CreateLocalScope = CreateVars;
-    if (PrepareContext) {
-      ctx = executor.Prepare(*inference_program, 0);
-      executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
-                                  &fetch_targets, CreateLocalScope, CreateVars);
-    } else {
-      executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
-                   CreateLocalScope, CreateVars);
-    }
-
-    // Enable the profiler
-    paddle::platform::EnableProfiler(state);
-
-    // Run repeat times to profile the performance
-    for (int i = 0; i < repeat; ++i) {
-      paddle::platform::RecordEvent record_event("run_inference");
-
-      if (PrepareContext) {
-        // Note: if you change the inference_program, you need to call
-        // executor.Prepare() again to get a new ExecutorPrepareContext.
-        executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
-                                    &fetch_targets, CreateLocalScope,
-                                    CreateVars);
-      } else {
-        executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
-                     CreateLocalScope, CreateVars);
-      }
-    }
-
-    // Disable the profiler and print the timing information
-    paddle::platform::DisableProfiler(
-        paddle::platform::EventSortingKey::kDefault, "run_inference_profiler");
-    paddle::platform::ResetProfiler();
-  }
-
-  delete scope;
-}
diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h
deleted file mode 100644
index 56745f115db231d4350da72b7de7967175ac9fe8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/test_multi_thread_helper.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/io.h"
-
-void ThreadedRunInference(
-    const std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
-    paddle::framework::Executor* executor, paddle::framework::Scope* scope,
-    const int thread_id,
-    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-    const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
-  auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
-      new paddle::framework::ProgramDesc(*inference_program));
-
-  std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
-  std::string fetch_holder_name =
-      "fetch_" + paddle::string::to_string(thread_id);
-  copy_program->SetFeedHolderName(feed_holder_name);
-  copy_program->SetFetchHolderName(fetch_holder_name);
-
-  // 3. Get the feed_target_names and fetch_target_names
-  const std::vector<std::string>& feed_target_names =
-      copy_program->GetFeedTargetNames();
-  const std::vector<std::string>& fetch_target_names =
-      copy_program->GetFetchTargetNames();
-
-  // 4. Prepare inputs: set up maps for feed targets
-  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
-  for (size_t i = 0; i < feed_target_names.size(); ++i) {
-    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
-    feed_targets[feed_target_names[i]] = cpu_feeds[i];
-  }
-
-  // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
-    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
-  }
-
-  // 6. Run the inference program
-  executor->Run(*copy_program, scope, feed_targets, fetch_targets, true,
-                feed_holder_name, fetch_holder_name);
-}
-
-template <typename Place>
-void TestMultiThreadInference(
-    const std::string& dirname,
-    const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_feeds,
-    const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_fetchs,
-    const int num_threads) {
-  // 1. Define place, executor, scope
-  auto place = Place();
-  auto executor = paddle::framework::Executor(place);
-  auto* scope = new paddle::framework::Scope();
-
-  // 2. Initialize the inference_program and load parameters
-  std::unique_ptr<paddle::framework::ProgramDesc> inference_program =
-      paddle::inference::Load(executor, *scope, dirname);
-
-  std::vector<std::thread*> threads;
-  for (int i = 0; i < num_threads; ++i) {
-    threads.push_back(new std::thread(
-        ThreadedRunInference, std::ref(inference_program), &executor, scope, i,
-        std::ref(cpu_feeds[i]), std::ref(cpu_fetchs[i])));
-  }
-  for (int i = 0; i < num_threads; ++i) {
-    threads[i]->join();
-    delete threads[i];
-  }
-
-  delete scope;
-}
diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt
deleted file mode 100644
index 2104e4ac7222258ee025bd5acd60b1db251df654..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-cc_library(benchmark SRCS benchmark.cc DEPS enforce)
-cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc
deleted file mode 100644
index 0bd526bcac2d9ceda95730dc3c5210aed8ccfb5c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/utils/benchmark.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/utils/benchmark.h"
-#include <sstream>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-
-std::string Benchmark::SerializeToString() const {
-  std::stringstream ss;
-  ss << "-----------------------------------------------------\n";
-  ss << "name\t";
-  ss << "batch_size\t";
-  ss << "num_threads\t";
-  ss << "latency\t";
-  ss << "qps";
-  ss << '\n';
-
-  ss << name_ << "\t";
-  ss << batch_size_ << "\t\t";
-  ss << num_threads_ << "\t";
-  ss << latency_ << "\t";
-  ss << 1000.0 / latency_;
-  ss << '\n';
-  return ss.str();
-}
-void Benchmark::PersistToFile(const std::string &path) const {
-  std::ofstream file(path, std::ios::app);
-  PADDLE_ENFORCE(file.is_open(), "Can not open %s to add benchmark", path);
-  file << SerializeToString();
-  file.flush();
-  file.close();
-}
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h
deleted file mode 100644
index a1304cf4e7778f74e15e4fe5e2f405fd3c185eb4..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/utils/benchmark.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <fstream>
-#include <iostream>
-#include <string>
-
-namespace paddle {
-namespace inference {
-
-/*
- * Helper class to calculate the performance.
- */
-struct Benchmark {
-  int batch_size() const { return batch_size_; }
-  void SetBatchSize(int x) { batch_size_ = x; }
-
-  int num_threads() const { return num_threads_; }
-  void SetNumThreads(int x) { num_threads_ = x; }
-
-  bool use_gpu() const { return use_gpu_; }
-  void SetUseGpu() { use_gpu_ = true; }
-
-  float latency() const { return latency_; }
-  void SetLatency(float x) { latency_ = x; }
-
-  const std::string& name() const { return name_; }
-  void SetName(const std::string& name) { name_ = name; }
-
-  std::string SerializeToString() const;
-  void PersistToFile(const std::string& path) const;
-
- private:
-  bool use_gpu_{false};
-  int batch_size_{0};
-  float latency_;
-  int num_threads_{1};
-  std::string name_;
-};
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc
deleted file mode 100644
index 0c48c2db9b691ae8cf587f2729c2789d4ce2dbe1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/utils/benchmark_tester.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/utils/benchmark.h"
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-using namespace paddle::inference;  // NOLINT
-TEST(Benchmark, basic) {
-  Benchmark benchmark;
-  benchmark.SetName("key0");
-  benchmark.SetBatchSize(10);
-  benchmark.SetUseGpu();
-  benchmark.SetLatency(220);
-  LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString();
-}
-
-TEST(Benchmark, PersistToFile) {
-  Benchmark benchmark;
-  benchmark.SetName("key0");
-  benchmark.SetBatchSize(10);
-  benchmark.SetUseGpu();
-  benchmark.SetLatency(220);
-
-  benchmark.PersistToFile("1.log");
-  benchmark.PersistToFile("2.log");
-  benchmark.PersistToFile("3.log");
-}
diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h
deleted file mode 100644
index 990bef359499834c3a7cb025c3fb1d94ceea958e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/utils/singleton.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-
-// NOTE not thread-safe.
-template <typename T>
-struct Singleton {
-  static T& Global() {
-    static T* x = new T;
-    return *x;
-  }
-
-  Singleton() = delete;
-  Singleton& operator=(const Singleton&) = delete;
-};
-
-/*
- * An registor for any type.
- * NOTE not thread-safe.
- */
-template <typename ItemParent>
-struct Registry {
-  static Registry& Global() {
-    static auto* x = new Registry<ItemParent>;
-    return *x;
-  }
-
-  template <typename ItemChild>
-  void Register(const std::string& name) {
-    PADDLE_ENFORCE_EQ(items_.count(name), 0);
-    items_[name] = new ItemChild;
-  }
-
-  ItemParent* Lookup(const std::string& name,
-                     const std::string& default_name = "") {
-    auto it = items_.find(name);
-    if (it == items_.end()) {
-      if (default_name == "")
-        return nullptr;
-      else
-        return items_.find(default_name)->second;
-    }
-    return it->second;
-  }
-
-  ~Registry() {
-    for (auto& item : items_) {
-      delete item.second;
-    }
-  }
-
- private:
-  Registry() = default;
-  std::unordered_map<std::string, ItemParent*> items_;
-};
-
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt
deleted file mode 100644
index ce24f5a4d9c1868d2c35b5d5c56500ad1175ba79..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-add_subdirectory(detail)
-add_subdirectory(allocation)
-
-if (WITH_MKLDNN)
-    set(MKLDNN_CTX_DEPS mkldnn)
-else ()
-    set(MKLDNN_CTX_DEPS)
-endif()
-
-cc_library(malloc SRCS malloc.cc DEPS
-    place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS})
-cc_library(memcpy SRCS memcpy.cc DEPS place)
-
-cc_library(memory
-        DEPS
-        malloc
-        memcpy)
-
-if (WITH_GPU)
-    nv_test(malloc_test
-            SRCS malloc_test.cu
-            DEPS device_context malloc)
-endif()
-
-#if (WITH_GPU)
-#   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place memory)
-#endif()
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
deleted file mode 100644
index ffae6e648080ba32fafd38440e8ff8590437669a..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ /dev/null
@@ -1,67 +0,0 @@
-cc_library(allocator SRCS allocator.cc DEPS place)
-cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
-cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
-cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
-cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
-cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler)
-cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator)
-
-if (WITH_MKLDNN)
-  set(MKLDNN_CTX_DEPS mkldnn)
-else ()
-  set(MKLDNN_CTX_DEPS)
-endif()
-
-if (WITH_GPU)
-  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
-endif()
-
-cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
-
-nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
-if (WITH_GPU)
-    set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
-else ()
-    set(AllocatorFacadeDeps)
-endif()
-
-if (WITH_GPU)
-    nv_test(best_fit_allocator_test
-            SRCS best_fit_allocator_test.cc
-                 best_fit_allocator_test.cu
-            DEPS best_fit_allocator
-                 locked_allocator
-                 cpu_allocator
-                 cuda_allocator
-                 device_context
-                 memcpy)
-else()
-    cc_test(best_fit_allocator_test
-            SRCS best_fit_allocator_test.cc
-            DEPS best_fit_allocator
-                 locked_allocator
-                 cpu_allocator)
-endif()
-
-list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator)
-
-cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
-cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
-cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
-
-cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
-if (WITH_TESTING)
-  if (WITH_GPU)
-    target_link_libraries(retry_allocator_test cuda_allocator)
-  endif()
-
-  set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-endif()
-
-cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
-
-cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
-
-cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
-cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc
deleted file mode 100644
index c9a031dffc8f71b5eb270c9c81f6e4344b9462d7..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/aligned_allocator.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class AlignedAllocation : public Allocation {
- public:
-  AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)
-      : Allocation(
-            reinterpret_cast<uint8_t*>(underlying_allocation->ptr()) + offset,
-            underlying_allocation->size() - offset,
-            underlying_allocation->place()),
-        underlying_allocation_(std::move(underlying_allocation)) {}
-
- private:
-  AllocationPtr underlying_allocation_;
-};
-
-AlignedAllocator::AlignedAllocator(
-    const std::shared_ptr<Allocator>& underlyning_allocator, size_t alignment)
-    : underlying_allocator_(underlyning_allocator), alignment_(alignment) {
-  PADDLE_ENFORCE(alignment_ > 0, "alignment must be positive integer");
-  if (alignment_ & (alignment_ - 1)) {
-    PADDLE_THROW("alignment must be 2^N, but got %d", alignment_);
-  }
-}
-
-bool AlignedAllocator::IsAllocThreadSafe() const {
-  return underlying_allocator_->IsAllocThreadSafe();
-}
-
-Allocation* AlignedAllocator::AllocateImpl(size_t size) {
-  auto raw_allocation = underlying_allocator_->Allocate(size + alignment_);
-  size_t offset = AlignedPtrOffset(raw_allocation->ptr(), alignment_);
-  return new AlignedAllocation(std::move(raw_allocation), offset);
-}
-
-void AlignedAllocator::FreeImpl(Allocation* allocation) { delete allocation; }
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
deleted file mode 100644
index 71250766034753e17687c9529d9fe1c314b6b662..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <utility>
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class AlignedAllocator : public Allocator {
- public:
-  AlignedAllocator(const std::shared_ptr<Allocator>& underlying_allocator,
-                   size_t alignment);
-
-  bool IsAllocThreadSafe() const override;
-
- protected:
-  Allocation* AllocateImpl(size_t size) override;
-
-  void FreeImpl(Allocation* allocation) override;
-
- private:
-  std::shared_ptr<Allocator> underlying_allocator_;
-  size_t alignment_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc
deleted file mode 100644
index 4998f3dbb9613abbf5ca67a3d43863d01483b79f..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocator.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-bool Allocator::IsAllocThreadSafe() const { return false; }
-
-void Allocator::FreeImpl(Allocation* allocation) {
-  Allocator* allocator = allocation->TopDecoratedAllocator();
-  allocator->Free(allocation);
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
deleted file mode 100644
index 379c8d00960947880e048c6990e7ba856ddc68dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocator.h
+++ /dev/null
@@ -1,204 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/inlined_vector.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-// Exception when `Alloc`/`AllocShared` failed
-struct BadAlloc : public std::exception {
-  inline explicit BadAlloc(std::string err_msg, const char* file, int line)
-      : err_str_(platform::GetTraceBackString(std::move(err_msg), file, line)) {
-  }
-
-  const char* what() const noexcept override { return err_str_.c_str(); }
-
-  std::string err_str_;
-};
-
-class Allocator;
-
-// Allocation is the object holding the actually pointer. Use
-// `Allocation::ptr()` will returns the pointer that allocated.
-//
-// NOTE: this is the base class of Allocation. Each allocator can use its own
-//       allocation object.
-// NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0
-
-/**
- * Allocation is returned by Allocator::Allocate() method.
- *
- * An allocator may be decorated by another allocator. For example, we can
- * decorate a RetryAllocator to any allocator to perform allocation retry when
- * first allocation request fails.
- *
- * Explanations of Allocator design are as follows:
- *
- * Suppose we have an allocator which is decorated by several allocators:
- *
- *   A(1) <- A(2) <- A(3) <- ... <- A(n)
- *
- * , and the public allocator is A(1).
- *
- * The allocation process would be:
- *
- *   A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate()
- *
- * , and the free process would be:
- *
- *   A(1).Free() -> A(2).Free() -> ... -> A(n).Free()
- *
- * Therefore, we should record the allocator chain when allocating, so
- * that we can free the allocation in the reverse order of allocator chain.
- * The field `decorated_allocators_` is used to record this chain.
- *
- * Another example is that we want to add additional fields in Allocation,
- * e.g., something what is done in AlignedAllocator, etc.
- * In this case, we should declare a derived class of Allocation, which
- * contains an underlying Allocation allocated by the underlying allocator.
- * Therefore, `decorated_allocators_` of the new Allocation object would
- * be a new chain, differing from the underlying Allocation object.
- */
-class Allocation {
- public:
-  inline Allocation(void* ptr, size_t size, platform::Place place)
-      : ptr_(ptr), size_(size), place_(place) {}
-
-  Allocation(const Allocation& o) = delete;
-  Allocation& operator=(const Allocation& o) = delete;
-  Allocation(Allocation&& o) = delete;
-  Allocation& operator=(Allocation&& o) = delete;
-
-  // Returns the holding pointer.
-  // NOTE: For performance consideration, it is better not to make this method
-  // as a virtual method. If we want to implement a `defragmentation` later,
-  // we might need to make `ptr_` field as a protected field, and add a virtual
-  // method like `defragmentation` to change `ptr_`.
-  inline void* ptr() const { return ptr_; }
-
-  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
-  // last valid element.
-  //
-  // NOTE: Some allocator might alloc more memory than request. The size
-  // could larger than its request. For example,
-  //    the AlignedAllocator will always allocate memory as size + kAlignment.
-  //    The raw pointer might not aligned, so an offset might be added to raw
-  //    the pointer. The size of this allocation will be
-  //    `size + kAlignemnt - offset`.
-  inline size_t size() const { return size_; }
-
-  inline const platform::Place& place() const { return place_; }
-
-  virtual ~Allocation() {}
-
- private:
-  inline void RegisterDecoratedAllocator(Allocator* allocator) {
-    decorated_allocators_.emplace_back(allocator);
-  }
-
-  inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
-
-  inline Allocator* TopDecoratedAllocator() {
-    return decorated_allocators_.back();
-  }
-
- private:
-  void* ptr_;
-  size_t size_;
-  platform::Place place_;
-
-  /**
-   * NOTE(zjl): Since decorated_allocators_ is usually a small vector.
-   * We reserve a small buffer to it to prevent frequent heap allocation
-   *
-   * Instead, we can use a std::vector<Allocator *> here, and reserve
-   * kReserveAllocatorNum in constructor of Allocation.
-   * But using std::vector<Allocator *> would make ocr recognition model
-   * fail in CE. The train duration is 8% slower than KPI.
-   */
-  static constexpr size_t kReserveAllocatorNum = 8;
-  using DecoratedAllocatorStack =
-      framework::InlinedVector<Allocator*, kReserveAllocatorNum>;
-
-  DecoratedAllocatorStack decorated_allocators_;
-
-  friend class Allocator;
-};
-
-// Base interface class of memory Allocator.
-class Allocator {
- public:
-  virtual ~Allocator() {}
-
-  class AllocationDeleter {
-   public:
-    inline void operator()(Allocation* allocation) const {
-      Allocator* allocator = allocation->TopDecoratedAllocator();
-      allocator->Free(allocation);
-    }
-  };
-
-  using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
-
-  // Allocate an allocation.
-  // size may be 0, but it would be too complex if we handle size == 0
-  // in each Allocator. So we handle size == 0 inside AllocatorFacade
-  // in our design.
-  inline AllocationPtr Allocate(size_t size) {
-    auto ptr = AllocateImpl(size);
-    ptr->RegisterDecoratedAllocator(this);
-    return AllocationPtr(ptr);
-  }
-
-  // This function should not be called outside Allocator class
-  inline void Free(Allocation* allocation) {
-    allocation->PopDecoratedAllocator();
-    FreeImpl(allocation);
-  }
-
-  // True if the `Allocate` is thread safe.
-  virtual bool IsAllocThreadSafe() const;
-
- protected:
-  virtual Allocation* AllocateImpl(size_t size) = 0;
-  virtual void FreeImpl(Allocation* allocation);
-};
-
-using AllocationDeleter = Allocator::AllocationDeleter;
-using AllocationPtr = Allocator::AllocationPtr;
-
-inline size_t AlignedSize(size_t size, size_t alignment) {
-  auto remaining = size % alignment;
-  return remaining == 0 ? size : size + alignment - remaining;
-}
-
-inline size_t AlignedPtrOffset(const void* ptr, size_t alignment) {
-  auto ptr_addr = reinterpret_cast<uintptr_t>(ptr);
-  auto diff = ptr_addr % alignment;
-  return diff == 0 ? 0 : alignment - diff;
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
deleted file mode 100644
index 220b50b1cc392ba5734542277a466ad1aa279129..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ /dev/null
@@ -1,203 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include <gflags/gflags.h>
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
-#include "paddle/fluid/memory/allocation/cpu_allocator.h"
-#include "paddle/fluid/memory/allocation/locked_allocator.h"
-#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
-#include "paddle/fluid/memory/allocation/retry_allocator.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/memory/allocation/cuda_allocator.h"
-#include "paddle/fluid/memory/allocation/pinned_allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#endif
-
-DEFINE_int64(
-    gpu_allocator_retry_time, 10000,
-    "The retry time (milliseconds) when allocator fails "
-    "to allocate memory. No retry if this value is not greater than 0");
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class AllocatorFacadePrivate {
- public:
-  AllocatorFacadePrivate() {
-    auto strategy = GetAllocatorStrategy();
-    switch (strategy) {
-      case AllocatorStrategy::kNaiveBestFit: {
-        InitNaiveBestFitCPUAllocator();
-#ifdef PADDLE_WITH_CUDA
-        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
-             ++dev_id) {
-          InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
-        }
-        InitNaiveBestFitCUDAPinnedAllocator();
-#endif
-        break;
-      }
-
-      case AllocatorStrategy::kAutoGrowth: {
-        InitNaiveBestFitCPUAllocator();
-#ifdef PADDLE_WITH_CUDA
-        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
-             ++dev_id) {
-          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id));
-        }
-        InitNaiveBestFitCUDAPinnedAllocator();
-#endif
-        break;
-      }
-
-      default: {
-        PADDLE_THROW("Unsupported allocator strategy: %d",
-                     static_cast<int>(strategy));
-      }
-    }
-    InitZeroSizeAllocators();
-
-    if (FLAGS_gpu_allocator_retry_time > 0) {
-      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
-    }
-
-    CheckAllocThreadSafe();
-  }
-
-  inline const std::shared_ptr<Allocator>& GetAllocator(
-      const platform::Place& place, size_t size) {
-    const auto& allocators = (size > 0 ? allocators_ : zero_size_allocators_);
-    auto iter = allocators.find(place);
-    PADDLE_ENFORCE(iter != allocators.end(),
-                   "No such allocator for the place, %s", place);
-    return iter->second;
-  }
-
- private:
-  void InitNaiveBestFitCPUAllocator() {
-    allocators_[platform::CPUPlace()] =
-        std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  void InitNaiveBestFitCUDAPinnedAllocator() {
-    allocators_[platform::CUDAPinnedPlace()] =
-        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
-  }
-
-  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
-    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
-  }
-
-  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) {
-    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
-    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize());
-  }
-#endif
-
-  class ZeroSizeAllocator : public Allocator {
-   public:
-    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
-
-    bool IsAllocThreadSafe() const override { return true; }
-
-   protected:
-    Allocation* AllocateImpl(size_t size) override {
-      return new Allocation(nullptr, 0, place_);
-    }
-
-    void FreeImpl(Allocation* allocation) override { delete allocation; }
-
-   private:
-    platform::Place place_;
-  };
-
-  void InitZeroSizeAllocators() {
-    std::vector<platform::Place> places;
-    places.emplace_back(platform::CPUPlace());
-#ifdef PADDLE_WITH_CUDA
-    int device_count = platform::GetCUDADeviceCount();
-    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
-      places.emplace_back(platform::CUDAPlace(dev_id));
-    }
-    places.emplace_back(platform::CUDAPinnedPlace());
-#endif
-
-    for (auto& p : places) {
-      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
-    }
-  }
-
-  void CheckAllocThreadSafe() const {
-    for (auto& pair : allocators_) {
-      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
-    }
-
-    for (auto& pair : zero_size_allocators_) {
-      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
-    }
-  }
-
-  void WrapCUDARetryAllocator(size_t retry_time) {
-    PADDLE_ENFORCE_GT(retry_time, 0, "Retry time must be larger than 0");
-    for (auto& pair : allocators_) {
-      if (platform::is_gpu_place(pair.first)) {
-        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
-      }
-    }
-  }
-
- private:
-  std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
-  std::map<platform::Place, std::shared_ptr<Allocator>> zero_size_allocators_;
-};
-
-// Pimpl. Make interface clean.
-AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
-// delete m_ may cause core dump when the destructor of python in conflict with
-// cpp.
-AllocatorFacade::~AllocatorFacade() {}
-
-AllocatorFacade& AllocatorFacade::Instance() {
-  static AllocatorFacade instance;
-  return instance;
-}
-
-std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
-    const platform::Place& place, size_t size) {
-  return std::shared_ptr<Allocation>(Alloc(place, size));
-}
-
-AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
-                                     size_t size) {
-  return m_->GetAllocator(place, size)->Allocate(size);
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
deleted file mode 100644
index 64b6fe25c352e82d6320e26d95efb61f3cb4a5b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-// Allocator Facade is the interface exposed to other modules.
-// All the configuration or dirty code under development should
-// be hidden behind this facade.
-//
-// NOTE(yy): This class is a singleton class.
-// NOTE(yy): To create a stable ABI and make compilation faster. Here we use
-// a Pimpl trick;
-class AllocatorFacadePrivate;
-class AllocatorFacade {
- public:
-  ~AllocatorFacade();
-  AllocatorFacade(const AllocatorFacade& o) = delete;
-  const AllocatorFacade& operator=(const AllocatorFacade& o) = delete;
-
-  static AllocatorFacade& Instance();
-
-  // Allocate a shared allocation.
-  std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
-                                          size_t size);
-
-  // Allocate a unique allocation.
-  AllocationPtr Alloc(const platform::Place& place, size_t size);
-
-  // TODO(yy): Allocate a Copy-On-Write allocation?
- private:
-  AllocatorFacade();
-  AllocatorFacadePrivate* m_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
deleted file mode 100644
index 3e10be037bd317f3bca8b2d8519a40df97303bc7..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-#ifdef PADDLE_WITH_CUDA
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-DECLARE_int64(gpu_allocator_retry_time);
-#endif
-DECLARE_string(allocator_strategy);
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-//! Run allocate test cases for different places
-void AllocateTestCases() {
-  auto &instance = AllocatorFacade::Instance();
-  platform::Place place;
-  size_t size = 1024;
-
-  {
-    place = platform::CPUPlace();
-    size = 1024;
-    auto cpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(cpu_allocation, nullptr);
-    ASSERT_NE(cpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(cpu_allocation->place(), place);
-    ASSERT_EQ(cpu_allocation->size(), size);
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    place = platform::CUDAPlace(0);
-    size = 1024;
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(), size);
-  }
-
-  {
-    // Allocate 2GB gpu memory
-    place = platform::CUDAPlace(0);
-    size = 2 * static_cast<size_t>(1 << 30);
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(), size);
-  }
-
-  {
-    place = platform::CUDAPinnedPlace();
-    size = (1 << 20);
-    auto cuda_pinned_allocation =
-        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
-    ASSERT_NE(cuda_pinned_allocation, nullptr);
-    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
-    ASSERT_EQ(cuda_pinned_allocation->place(), place);
-    ASSERT_GE(cuda_pinned_allocation->size(), size);
-  }
-#endif
-}
-
-TEST(Allocator, SpecifyGpuMemory) {
-#ifdef PADDLE_WITH_CUDA
-  // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and
-  // FLAGS_reallocate_gpu_memory_in_mb
-  FLAGS_fraction_of_gpu_memory_to_use = 0.0;
-  // 512 MB
-  FLAGS_initial_gpu_memory_in_mb = 512;
-  // 4 MB
-  FLAGS_reallocate_gpu_memory_in_mb = 4;
-  FLAGS_gpu_allocator_retry_time = 500;
-  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
-#endif
-
-  FLAGS_allocator_strategy = "naive_best_fit";
-
-  AllocateTestCases();
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
deleted file mode 100644
index 3748805b1ceb9e392a767e8c08577c1f47bfe436..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-#ifdef PADDLE_WITH_CUDA
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-DECLARE_int64(gpu_allocator_retry_time);
-#endif
-DECLARE_string(allocator_strategy);
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-//! Run allocate test cases for different places
-void AllocateTestCases() {
-  auto &instance = AllocatorFacade::Instance();
-  platform::Place place;
-  size_t size = 1024;
-
-  {
-    place = platform::CPUPlace();
-    size = 1024;
-    auto cpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(cpu_allocation, nullptr);
-    ASSERT_NE(cpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(cpu_allocation->place(), place);
-    ASSERT_EQ(cpu_allocation->size(), size);
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    place = platform::CUDAPlace(0);
-    size = 1024;
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(), size);
-  }
-
-  {
-    // Allocate 2GB gpu memory
-    place = platform::CUDAPlace(0);
-    size = 2 * static_cast<size_t>(1 << 30);
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(), size);
-  }
-
-  {
-    place = platform::CUDAPinnedPlace();
-    size = (1 << 20);
-    auto cuda_pinned_allocation =
-        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
-    ASSERT_NE(cuda_pinned_allocation, nullptr);
-    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
-    ASSERT_EQ(cuda_pinned_allocation->place(), place);
-    ASSERT_GE(cuda_pinned_allocation->size(), size);
-  }
-#endif
-}
-
-TEST(Allocator, Allocator) {
-#ifdef PADDLE_WITH_CUDA
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_gpu_allocator_retry_time = 500;
-  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
-#endif
-  FLAGS_allocator_strategy = "naive_best_fit";
-
-  AllocateTestCases();
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
deleted file mode 100644
index 19b1380612b6de2387771e633ee0604bdc30046f..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "paddle/fluid/platform/enforce.h"
-
-DECLARE_string(allocator_strategy);
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-static AllocatorStrategy GetStrategyFromFlag() {
-  if (FLAGS_allocator_strategy == "naive_best_fit") {
-    return AllocatorStrategy::kNaiveBestFit;
-  }
-
-  if (FLAGS_allocator_strategy == "auto_growth") {
-    return AllocatorStrategy::kAutoGrowth;
-  }
-
-  PADDLE_THROW("Unsupported allocator strategy: %s", FLAGS_allocator_strategy);
-}
-
-AllocatorStrategy GetAllocatorStrategy() {
-  static AllocatorStrategy strategy = GetStrategyFromFlag();
-  return strategy;
-}
-
-void UseAllocatorStrategyGFlag() {}
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.h b/paddle/fluid/memory/allocation/allocator_strategy.h
deleted file mode 100644
index ff6e7839ff7d655a88a2e84e2b7af6a427a5a462..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocator_strategy.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth };
-
-extern AllocatorStrategy GetAllocatorStrategy();
-
-// Do nothing, just make sure linker do not prune this file.
-extern void UseAllocatorStrategyGFlag();
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
deleted file mode 100644
index 9ce4fd07829ba5fc7b73812f40dfb7dfdf2b1bc9..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
-#include <algorithm>
-#include <list>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <unordered_map>
-#include "paddle/fluid/memory/allocation/aligned_allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
-    const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
-    size_t chunk_size)
-    : underlying_allocator_(
-          std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
-      alignment_(alignment),
-      chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)) {}
-
-Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
-  size = AlignedSize(size, alignment_);
-
-  std::lock_guard<std::mutex> guard(mtx_);
-  auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
-  BlockIt block_it;
-  if (iter != free_blocks_.end()) {
-    block_it = iter->second;
-    free_blocks_.erase(iter);
-    auto *chunk = block_it->chunk_;
-    size_t remaining_size = block_it->size_ - size;
-    if (remaining_size == 0) {
-      block_it->is_free_ = false;
-    } else {
-      auto remaining_free_block = chunk->blocks_.insert(
-          block_it, Block(block_it->ptr_, remaining_size, true, chunk));
-      free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
-                           remaining_free_block);
-      block_it->ptr_ =
-          reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
-      block_it->size_ = size;
-      block_it->is_free_ = false;
-    }
-  } else {
-    FreeIdleChunks();
-    size_t realloc_size = std::max(size, chunk_size_);
-
-    try {
-      chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size));
-    } catch (BadAlloc &ex) {
-      if (size == realloc_size) throw ex;
-      realloc_size = size;
-      chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size));
-    }
-
-    auto *chunk = &(*chunks_.rbegin());
-    realloc_size = chunk->allocation_->size();
-    uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
-    auto &blocks = chunk->blocks_;
-
-    size_t remaining_size = realloc_size - size;
-    if (remaining_size > 0) {
-      blocks.emplace_back(p, remaining_size, true, chunk);
-      free_blocks_.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
-    }
-    blocks.emplace_back(p + remaining_size, size, false, chunk);
-    block_it = --(blocks.end());
-    VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining "
-            << remaining_size;
-  }
-  return new BlockAllocation(block_it);
-}
-
-void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
-  std::lock_guard<std::mutex> guard(mtx_);
-  auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
-  auto &blocks = block_it->chunk_->blocks_;
-
-  block_it->is_free_ = true;
-
-  if (block_it != blocks.begin()) {
-    auto prev_it = block_it;
-    --prev_it;
-
-    if (prev_it->is_free_) {
-      free_blocks_.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
-      prev_it->size_ += block_it->size_;
-      blocks.erase(block_it);
-      block_it = prev_it;
-    }
-  }
-
-  auto next_it = block_it;
-  ++next_it;
-
-  if (next_it != blocks.end() && next_it->is_free_) {
-    free_blocks_.erase(std::make_pair(next_it->size_, next_it->ptr_));
-    block_it->size_ += next_it->size_;
-    blocks.erase(next_it);
-  }
-
-  free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_),
-                       block_it);
-
-  delete allocation;
-}
-
-void AutoGrowthBestFitAllocator::FreeIdleChunks() {
-  for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) {
-    auto &blocks = chunk_it->blocks_;
-    if (blocks.size() == 1 && blocks.begin()->is_free_) {
-      auto &block = *blocks.begin();
-      VLOG(2) << "Free chunk with size " << block.size_;
-      free_blocks_.erase(std::make_pair(block.size_, block.ptr_));
-      chunk_it = chunks_.erase(chunk_it);
-    } else {
-      ++chunk_it;
-    }
-  }
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
deleted file mode 100644
index 27257883d558e7b3a09f8aeb5264d093e7d5c480..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <list>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <utility>
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class AutoGrowthBestFitAllocator : public Allocator {
- public:
-  AutoGrowthBestFitAllocator(
-      const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
-      size_t chunk_size = 0);
-
-  bool IsAllocThreadSafe() const override { return true; }
-
- protected:
-  Allocation *AllocateImpl(size_t size) override;
-
-  void FreeImpl(Allocation *allocation) override;
-
- private:
-  void FreeIdleChunks();
-
-  template <typename T>
-  using List = std::list<T>;
-
-  struct Chunk;
-
-  struct Block {
-    Block(void *ptr, size_t size, bool is_free, Chunk *chunk)
-        : ptr_(ptr), size_(size), is_free_(is_free), chunk_(chunk) {}
-
-    void *ptr_;
-    size_t size_;
-    bool is_free_;
-    Chunk *chunk_;  // which chunk it is from
-  };
-
-  struct Chunk {
-    explicit Chunk(AllocationPtr allocation)
-        : allocation_(std::move(allocation)) {}
-
-    AllocationPtr allocation_;
-    List<Block> blocks_;
-  };
-
-  struct BlockAllocation : public Allocation {
-    explicit BlockAllocation(const List<Block>::iterator &it)
-        : Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()),
-          block_it_(it) {}
-
-    List<Block>::iterator block_it_;
-  };
-
-  using BlockIt = List<Block>::iterator;
-
-  std::shared_ptr<Allocator> underlying_allocator_;
-  std::map<std::pair<size_t, void *>, BlockIt> free_blocks_;
-  std::list<Chunk> chunks_;
-  size_t alignment_;
-  size_t chunk_size_;
-
-  mutable std::mutex mtx_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
deleted file mode 100644
index 69de02734024f7449191b8fb7f340ce3db9d7542..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-#include <chrono>              // NOLINT
-#include <condition_variable>  // NOLINT
-#include <mutex>               // NOLINT
-#include <random>
-#include <thread>  // NOLINT
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-#ifdef PADDLE_WITH_CUDA
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-DECLARE_int64(gpu_allocator_retry_time);
-#endif
-
-DECLARE_string(allocator_strategy);
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-static inline size_t AlignTo(size_t size, size_t alignment) {
-  auto remaining = size % alignment;
-  return remaining == 0 ? size : size + alignment - remaining;
-}
-
-TEST(allocator, allocator) {
-#ifdef PADDLE_WITH_CUDA
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_gpu_allocator_retry_time = 500;
-  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
-#endif
-
-  FLAGS_allocator_strategy = "auto_growth";
-
-  auto &instance = AllocatorFacade::Instance();
-  size_t size = 1024;
-  platform::Place place;
-
-  {
-    place = platform::CPUPlace();
-    size = 1024;
-    auto cpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(cpu_allocation, nullptr);
-    ASSERT_NE(cpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(cpu_allocation->place(), place);
-    ASSERT_EQ(cpu_allocation->size(), AlignedSize(size, 1024));
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    place = platform::CUDAPlace(0);
-    size = 1024;
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(),
-              AlignedSize(size, platform::GpuMinChunkSize()));
-  }
-
-  {
-    // Allocate 2GB gpu memory
-    place = platform::CUDAPlace(0);
-    size = 2 * static_cast<size_t>(1 << 30);
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(),
-              AlignedSize(size, platform::GpuMinChunkSize()));
-  }
-
-  {
-    place = platform::CUDAPinnedPlace();
-    size = (1 << 20);
-    auto cuda_pinned_allocation =
-        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
-    ASSERT_NE(cuda_pinned_allocation, nullptr);
-    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
-    ASSERT_EQ(cuda_pinned_allocation->place(), place);
-    ASSERT_GE(cuda_pinned_allocation->size(), AlignedSize(size, 1 << 20));
-  }
-#endif
-}
-
-TEST(multithread_allocate, test_segfault) {
-  FLAGS_allocator_strategy = "auto_growth";
-#ifdef PADDLE_WITH_CUDA
-  std::mutex mtx;
-  std::condition_variable cv;
-  bool flag = false;
-
-  auto alloc_func = [&](int dev_id, unsigned int seed) {
-    auto &instance = AllocatorFacade::Instance();
-
-    std::mt19937 gen(seed);
-    std::uniform_int_distribution<size_t> dist(1 << 20, 1 << 25);
-
-    {
-      std::unique_lock<std::mutex> lock(mtx);
-      cv.wait(lock, [&] { return flag; });
-    }
-
-    for (int i = 0; i < 50; i++) {
-      size_t size = dist(gen);
-      for (int j = 0; j < 10; j++) {
-        instance.Alloc(platform::CUDAPlace(dev_id), size);
-      }
-    }
-  };
-
-  std::vector<std::thread> ths;
-  for (size_t i = 0; i < 50; ++i) {
-    std::random_device rd;
-    ths.emplace_back(alloc_func, 0, rd());
-  }
-
-  {
-    std::lock_guard<std::mutex> guard(mtx);
-    flag = true;
-  }
-  cv.notify_all();
-
-  for (auto &th : ths) {
-    th.join();
-  }
-#endif
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
deleted file mode 100644
index 126464f049e00d41b6642a49678f3e111faaffc8..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ /dev/null
@@ -1,170 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
-#include <cmath>
-#include <list>
-#include <map>
-#include <string>
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-static int HighestBitPos(size_t N) {
-  if (UNLIKELY(N == 0)) {
-    return 0;
-  } else {
-#ifdef __GNUCC__
-    return sizeof(unsigned int) * 8 - __builtin_clz(N);
-#else
-    return static_cast<int>(std::log2(N) + 1);
-#endif
-  }
-}
-
-BestFitAllocator::BestFitAllocator(Allocation* allocation)
-    : allocation_(allocation) {
-  details::Chunk chunk;
-  chunk.size_ = allocation_->size();
-  chunk.offset_ = 0;
-  chunk.is_free = true;
-  chunks_.emplace_back(chunk);
-  free_chunks_[HighestBitPos(chunk.size_)].insert(
-      {chunk.size_, chunks_.begin()});
-}
-
-size_t BestFitAllocator::FreeSize() const {
-  size_t acc = 0;
-  for (auto& array_item : free_chunks_) {
-    for (auto& pair : array_item) {
-      acc += pair.second->size_;
-    }
-  }
-  return acc;
-}
-
-BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size,
-                                                      size_t free_chunk_offset,
-                                                      MapIt bin_iterator) {
-  auto to_split_it = bin_iterator->second;
-  free_chunks_[free_chunk_offset].erase(bin_iterator);
-
-  PADDLE_ENFORCE(to_split_it->is_free);
-  PADDLE_ENFORCE_GE(to_split_it->size_, request_size);
-
-  auto remaining_size = to_split_it->size_ - request_size;
-  details::Chunk to_use;
-  details::Chunk remaining;
-  to_use.size_ = request_size;
-  to_use.is_free = false;
-  remaining.size_ = remaining_size;
-  remaining.is_free = true;
-
-  // calc offsets
-  to_use.offset_ = to_split_it->offset_;
-  remaining.offset_ = to_use.offset_ + to_use.size_;
-
-  // insert to chunk list
-  auto to_use_it = chunks_.insert(to_split_it, to_use);
-  if (remaining.size_ != 0) {
-    auto bit_size = static_cast<size_t>(HighestBitPos(remaining.size_));
-    free_chunks_[bit_size].insert(
-        {remaining.size_, chunks_.insert(to_split_it, remaining)});
-  }
-  chunks_.erase(to_split_it);
-  return to_use_it;
-}
-
-void BestFitAllocator::InsertFreeNode(const ListIt& it) {
-  auto pos = static_cast<size_t>(HighestBitPos(it->size_));
-  auto& free_map = free_chunks_[pos];
-  free_map.insert({it->size_, it});
-}
-void BestFitAllocator::EraseFreeNode(const ListIt& it) {
-  size_t pos = static_cast<size_t>(HighestBitPos(it->size_));
-  auto& free_map = free_chunks_[pos];
-  auto map_it = free_map.find(it->size_);
-  while (map_it->second != it && map_it != free_map.end()) {
-    ++map_it;
-  }
-  PADDLE_ENFORCE(map_it != free_map.end());
-  free_map.erase(map_it);
-}
-size_t BestFitAllocator::NumFreeChunks() const {
-  size_t num = 0;
-  for (auto& array_item : free_chunks_) {
-    num += array_item.size();
-  }
-  return num;
-}
-void BestFitAllocator::FreeImpl(Allocation* allocation) {
-  auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
-  PADDLE_ENFORCE_NOT_NULL(bf_allocation,
-                          "The input allocation is not BestFitAllocation.");
-  auto chunk_it = bf_allocation->ChunkIterator();
-  PADDLE_ENFORCE(!chunk_it->is_free);
-  chunk_it->is_free = true;
-  if (chunk_it != chunks_.begin()) {
-    auto prev_it = chunk_it;
-    --prev_it;
-
-    if (prev_it->is_free) {
-      // Merge Left.
-      EraseFreeNode(prev_it);
-      prev_it->size_ += chunk_it->size_;
-      chunks_.erase(chunk_it);
-      chunk_it = prev_it;
-    }
-  }
-
-  auto next_it = chunk_it;
-  ++next_it;
-  if (next_it != chunks_.end() && next_it->is_free) {
-    EraseFreeNode(next_it);
-    chunk_it->size_ += next_it->size_;
-    chunks_.erase(next_it);
-  }
-
-  InsertFreeNode(chunk_it);
-  delete allocation;
-}
-Allocation* BestFitAllocator::AllocateImpl(size_t size) {
-  auto highest_set_bit = static_cast<size_t>(HighestBitPos(size));
-  MapIt map_it;
-  for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) {
-    map_it = free_chunks_[highest_set_bit].lower_bound(size);
-    if (map_it != free_chunks_[highest_set_bit].end()) {
-      break;
-    }
-  }
-  if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
-    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d, All fragments size is %d", size,
-                           FreeSize());
-  }
-  auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
-  return new BestFitAllocation(this, chunk_it);
-}
-
-BestFitAllocation::BestFitAllocation(
-    paddle::memory::allocation::BestFitAllocator* allocator,
-    typename details::ChunkList::iterator chunk_it)
-    : Allocation(reinterpret_cast<void*>(
-                     reinterpret_cast<uintptr_t>(allocator->BasePtr()) +
-                     chunk_it->offset_),
-                 chunk_it->size_, allocator->Place()),
-      chunk_it_(chunk_it) {}
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
deleted file mode 100644
index 64a552e4fd2af1f661e3174e5041ffc71f74fa2c..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <array>
-#include <list>
-#include <map>
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-namespace details {
-struct Chunk {
-  bool is_free{true};
-  // Offset to the base allocation.
-  uintptr_t offset_;
-  size_t size_;
-};
-
-// Here we use std::list to maintain chunk list.
-// NOTE(yy): The traditional implementation of ChunkList is add `prev`/`next`
-// pointers in `Chunk`, and split the allocation as `ChunkHeader` and
-// `Payload`. Such as
-//   *-------*---------------*---------------*--------------*
-//   | Chunk | prev_ pointer | next_ pointer | payload .... |
-//   *-------*---------------*---------------*--------------*
-// This implementation can just return a raw pointer, and we can get the list
-// structure by the raw pointer. However, we cannot use the same code on GPU
-// since CPU cannot access GPU memory directly.
-//
-// So we choose to use `std::list` and return an allocation instance, which
-// contains the list node iterator, then we can unify CPU/GPU code.
-//
-// To return an allocation is not a bad idea, since Tensor/Vector should holds
-// an allocation instead of raw pointer directly.
-using ChunkList = std::list<Chunk>;
-
-// Here we use a multi-level map of free chunks.
-// the map is
-//      MSB offset --> size --> [ChunkList::iterator]
-//
-// The time complexities:
-//     find a free chunk:
-//          O(logN),
-//               where N is the number of free nodes with the same MSB offset.
-//     find the position of a chunk iterator:
-//          O(logN + K),
-//               where N is the number of free nodes with the same MSB offset.
-//               where K is the number of free nodes with the same size.
-//     insert a free chunk:
-//          O(logN),
-//               where N is the number of free nodes with the same MSB offset.
-//     erase a free chunk:
-//          O(1)
-using FreeChunkBin =
-    std::array<std::multimap<size_t, ChunkList::iterator>, sizeof(size_t) * 8>;
-}  // namespace details
-
-class BestFitAllocator;
-
-// The BestFitAllocation maintain the List Node iterator.
-class BestFitAllocation : public Allocation {
- private:
-  using ListIt = typename details::ChunkList::iterator;
-
- public:
-  BestFitAllocation(BestFitAllocator* allocator, ListIt chunk_it);
-
-  const ListIt& ChunkIterator() const { return chunk_it_; }
-
- private:
-  typename details::ChunkList::iterator chunk_it_;
-};
-
-// TODO(yy): Current BestFitAllocator is not thread-safe. To make it thread
-// safe, we must wrap a locked_allocator. However, we can implement a thread
-// safe allocator by locking each bin and chunks list independently. It will
-// make BestFitAllocator faster in multi-thread situation.
-//
-// This allocator implements a best-fit allocator with merging the free nodes.
-//
-// To allocate a buffer, it will find the best-fit chunk. If the best-fit chunk
-// is larger than request size, the original block will be split into two
-// chunks. The first block will be used and the second block will be put into
-// free chunks.
-//
-// To free an allocation, it will set the chunk of allocation to free and merge
-// the prev-chunk and the next-chunk when possible.
-class BestFitAllocator : public Allocator {
- public:
-  explicit BestFitAllocator(Allocation* allocation);
-
-  void* BasePtr() const { return allocation_->ptr(); }
-
-  const platform::Place& Place() const { return allocation_->place(); }
-
-  size_t NumFreeChunks() const;
-
- private:
-  size_t FreeSize() const;
-  using MapIt = typename details::FreeChunkBin::value_type::iterator;
-  using ListIt = typename details::ChunkList::iterator;
-
-  ListIt SplitChunk(size_t request_size, size_t free_chunk_offset,
-                    MapIt bin_iterator);
-  void EraseFreeNode(const ListIt& it);
-  void InsertFreeNode(const ListIt& it);
-
- protected:
-  void FreeImpl(Allocation* allocation) override;
-  Allocation* AllocateImpl(size_t size) override;
-
- private:
-  Allocation* allocation_;  // not owned
-  details::ChunkList chunks_;
-  details::FreeChunkBin free_chunks_;
-};
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
deleted file mode 100644
index 7e5207e6345bbd8ec02fdc897466c269779e2830..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
-#include <memory>
-#include <random>
-#include <thread>  // NOLINT
-#include <utility>
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/cpu_allocator.h"
-#include "paddle/fluid/memory/allocation/locked_allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class StubAllocation : public Allocation {
- public:
-  explicit StubAllocation(size_t size)
-      : Allocation(0, size, platform::CPUPlace()) {}
-};
-
-TEST(BestFitAllocator, test_allocation) {
-  StubAllocation stub(4UL * 1024 * 1024 * 1024);
-  BestFitAllocator allocator(&stub);
-  { auto allocation = allocator.Allocate(64); }
-
-  {
-    auto allocation = allocator.Allocate(80);
-
-    {
-      auto best_fit_allocation =
-          dynamic_cast<BestFitAllocation*>(allocation.get());
-      ASSERT_NE(best_fit_allocation, nullptr);
-      ASSERT_FALSE(best_fit_allocation->ChunkIterator()->is_free);
-      ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0);
-      ASSERT_EQ(allocation->size(), 80);
-      ASSERT_EQ(allocation->ptr(), nullptr);
-    }
-
-    auto allocation2 = allocator.Allocate(60);
-    auto allocation3 = allocator.Allocate(90);
-    allocation2.reset();
-    allocation2 = allocator.Allocate(30);
-
-    {
-      auto best_fit_allocation =
-          dynamic_cast<BestFitAllocation*>(allocation2.get());
-      ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80);
-    }
-    allocation2.reset();
-    allocation2 = allocator.Allocate(60);
-
-    {
-      auto best_fit_allocation =
-          dynamic_cast<BestFitAllocation*>(allocation2.get());
-      ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80);
-    }
-
-    allocation.reset();
-    allocation2.reset();
-
-    allocation = allocator.Allocate(80 + 60);
-    {
-      auto best_fit_allocation =
-          dynamic_cast<BestFitAllocation*>(allocation.get());
-      ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0);
-    }
-
-    allocation.reset();
-
-    allocation = allocator.Allocate(80);
-    allocation2 = allocator.Allocate(60);
-    allocation = nullptr;
-    allocation2 = nullptr;
-    allocation3 = nullptr;
-
-    ASSERT_EQ(allocator.NumFreeChunks(), 1U);
-  }
-}
-
-TEST(BestFitAllocator, test_concurrent_cpu_allocation) {
-  CPUAllocator allocator;
-  auto global_allocation = allocator.Allocate(256UL * 1024 * 1024);
-
-  std::unique_ptr<Allocator> best_fit_allocator(
-      new BestFitAllocator(global_allocation.get()));
-
-  LockedAllocator locked_allocator(std::move(best_fit_allocator));
-
-  auto th_main = [&](std::random_device::result_type seed) {
-    std::default_random_engine engine(seed);
-    std::uniform_int_distribution<size_t> dist(1U, 1024U);
-
-    for (size_t i = 0; i < 128; ++i) {
-      size_t allocate_size = dist(engine);
-
-      auto allocation =
-          locked_allocator.Allocate(sizeof(size_t) * allocate_size);
-
-      size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
-
-      for (size_t j = 0; j < allocate_size; ++j) {
-        data[j] = j;
-      }
-      std::this_thread::yield();
-
-      for (size_t j = 0; j < allocate_size; ++j) {
-        ASSERT_EQ(data[j], j);
-      }
-    }
-  };
-  {
-    std::vector<std::thread> threads;
-    for (size_t i = 0; i < 1024; ++i) {
-      std::random_device dev;
-      threads.emplace_back(th_main, dev());
-    }
-    for (auto& th : threads) {
-      th.join();
-    }
-  }
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
deleted file mode 100644
index eb24ba84c886e3393cf36b6f764d7b33e76defeb..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <random>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
-#include "paddle/fluid/memory/allocation/cuda_allocator.h"
-#include "paddle/fluid/memory/allocation/locked_allocator.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/for_range.h"
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-struct ForEachFill {
-  size_t* ptr_;
-
-  explicit ForEachFill(size_t* ptr) : ptr_(ptr) {}
-
-  __device__ void operator()(size_t i) { ptr_[i] = i; }
-};
-
-TEST(BestFitAllocator, concurrent_cuda) {
-  CUDAAllocator allocator(platform::CUDAPlace(0));
-  // 256 MB
-  auto cuda_allocation = allocator.Allocate(256U * 1024 * 1024);
-  LockedAllocator concurrent_allocator(
-      std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));
-
-  auto th_main = [&](std::random_device::result_type seed) {
-    std::default_random_engine engine(seed);
-    std::uniform_int_distribution<size_t> dist(1U, 1024U);
-    platform::CUDAPlace gpu(0);
-    platform::CUDADeviceContext dev_ctx(gpu);
-    std::array<size_t, 1024> buf;
-    for (size_t i = 0; i < 128; ++i) {
-      size_t allocate_size = dist(engine);
-
-      auto allocation =
-          concurrent_allocator.Allocate(sizeof(size_t) * allocate_size);
-
-      size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
-
-      ForEachFill fill(data);
-      platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
-                                                                allocate_size);
-      for_range(fill);
-
-      memory::Copy(platform::CPUPlace(), buf.data(), gpu, data,
-                   sizeof(size_t) * allocate_size, dev_ctx.stream());
-
-      dev_ctx.Wait();
-      for (size_t j = 0; j < allocate_size; ++j) {
-        ASSERT_EQ(buf[j], j);
-      }
-      allocation = nullptr;
-    }
-  };
-
-  {
-    std::vector<std::thread> threads;
-    for (size_t i = 0; i < 1024; ++i) {
-      std::random_device dev;
-      threads.emplace_back(th_main, dev());
-    }
-    for (auto& th : threads) {
-      th.join();
-    }
-  }
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc
deleted file mode 100644
index d80616b7a8d39a5e1074ce240d9c4ddb069b212a..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/buffered_allocator.h"
-#include <algorithm>
-#include <limits>
-#include <utility>
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-BufferedAllocator::BufferedAllocator(std::shared_ptr<Allocator> allocator)
-    : underlying_allocator_(std::move(allocator)) {
-  PADDLE_ENFORCE_NOT_NULL(
-      underlying_allocator_,
-      "Underlying allocator of BufferedAllocator must not be null");
-  if (underlying_allocator_->IsAllocThreadSafe()) {
-    mtx_.reset(new std::mutex());
-  }
-}
-
-BufferedAllocator::~BufferedAllocator() { FreeCache(-1UL); }
-
-void BufferedAllocator::FreeCache(size_t size) {
-  platform::LockGuardPtr<std::mutex> guard(mtx_);
-  if (UNLIKELY(size == 0)) return;
-  size_t cur = 0;
-  while (!allocations_.empty()) {  // free the largest
-    auto it = --allocations_.end();
-    cur += it->second->size();
-    underlying_allocator_->Free(it->second.release());
-    allocations_.erase(it);
-    if (cur >= size) return;
-  }
-}
-
-bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; }
-
-void BufferedAllocator::FreeImpl(Allocation *allocation) {
-  platform::LockGuardPtr<std::mutex> guard(mtx_);
-  allocations_.emplace(allocation->size(), AllocationPtr(allocation));
-}
-
-Allocation *BufferedAllocator::AllocateImpl(size_t size) {
-  {
-    platform::LockGuardPtr<std::mutex> guard(mtx_);
-    auto it = allocations_.lower_bound(size);
-    if (it != allocations_.end() && it->first < size * 2) {
-      AllocationPtr result(std::move(it->second));
-      allocations_.erase(it);
-      return result.release();
-    }
-  }
-
-  try {
-    return underlying_allocator_->Allocate(size).release();
-  } catch (BadAlloc &) {
-    FreeCache(size);
-    return underlying_allocator_->Allocate(size).release();
-  }
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h
deleted file mode 100644
index fd0996f7748ef407262dba7bca705af9b5fb9674..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <map>
-#include <memory>
-#include <vector>
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/lock_guard_ptr.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-// NOTE(zjl): BufferedAllocator maintains a memory pool to accelerate
-// memory allocation and reuse memory.
-// BufferedAllocator provides the same thread-safety level as
-// underlying_allocator_
-class BufferedAllocator : public Allocator {
- public:
-  explicit BufferedAllocator(std::shared_ptr<Allocator> allocator);
-
-  ~BufferedAllocator();
-
-  bool IsAllocThreadSafe() const override;
-
-  // only used in unittest
-  inline void ClearCache() { FreeCache(-1UL); }
-
- private:
-  void FreeCache(size_t size);
-
- protected:
-  void FreeImpl(Allocation *allocation) override;
-  Allocation *AllocateImpl(size_t size) override;
-
- private:
-  std::shared_ptr<Allocator> underlying_allocator_;
-  std::multimap<size_t, AllocationPtr> allocations_;
-  std::unique_ptr<std::mutex> mtx_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
deleted file mode 100644
index e4825233d58c7386bc1b7456cdc5c11f03f6b90e..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/buffered_allocator.h"
-#include <gtest/gtest.h>
-#include <utility>
-#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
-#include "paddle/fluid/memory/allocation/cpu_allocator.h"
-#include "paddle/fluid/memory/allocation/locked_allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-inline std::unique_ptr<BufferedAllocator> GetBufferedAllocator(
-    Allocation *allocation, bool thread_safe) {
-  std::unique_ptr<Allocator> allocator(new BestFitAllocator(allocation));
-  if (thread_safe) {
-    allocator.reset(new LockedAllocator(std::move(allocator)));
-  }
-
-  return std::unique_ptr<BufferedAllocator>(
-      new BufferedAllocator(std::move(allocator)));
-}
-
-TEST(buffered_allocator, thread_safety) {
-  std::unique_ptr<CPUAllocator> allocator(new CPUAllocator());
-  auto chunk = allocator->Allocate(1 << 20);
-  {
-    auto buf_allocator = GetBufferedAllocator(chunk.get(), true);
-    ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), true);
-  }
-
-  {
-    auto buf_allocator = GetBufferedAllocator(chunk.get(), false);
-    ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), false);
-  }
-}
-
-class StubAllocation : public Allocation {
- public:
-  using Allocation::Allocation;
-};
-
-class StubAllocator : public Allocator {
- public:
-  void ResetCounter() {
-    construct_count_ = 0;
-    destruct_count_ = 0;
-  }
-
-  size_t GetAllocCount() const { return construct_count_; }
-
-  size_t GetFreeCount() const { return destruct_count_; }
-
- protected:
-  void FreeImpl(Allocation *allocation) override {
-    auto *alloc = dynamic_cast<StubAllocation *>(allocation);
-    PADDLE_ENFORCE_NOT_NULL(alloc);
-    if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
-    ++destruct_count_;
-    delete allocation;
-  }
-  Allocation *AllocateImpl(size_t size) override {
-    ++construct_count_;
-    if (size == 0) {
-      return new StubAllocation(nullptr, 0, platform::CPUPlace());
-    } else {
-      return new StubAllocation(new uint8_t[size], size, platform::CPUPlace());
-    }
-  }
-
- private:
-  size_t construct_count_ = 0;
-  size_t destruct_count_ = 0;
-};
-
-constexpr size_t kZero = 0;
-constexpr size_t kOne = 1;
-constexpr size_t kTwo = 2;
-
-TEST(buffered_allocator, lazy_free) {
-  std::unique_ptr<StubAllocator> stub_allocator(new StubAllocator());
-  auto *underlying_allocator = stub_allocator.get();
-  std::unique_ptr<BufferedAllocator> allocator(
-      new BufferedAllocator(std::move(stub_allocator)));
-
-  {
-    underlying_allocator->ResetCounter();
-    auto x = allocator->Allocate(1025);
-    ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
-    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
-    x = nullptr;
-    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
-  }
-
-  {
-    underlying_allocator->ResetCounter();
-    auto x = allocator->Allocate(900);
-    ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
-    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
-    auto y = allocator->Allocate(2048);
-    ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
-    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
-    x = nullptr;
-    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
-    y = nullptr;
-    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
-  }
-
-  {
-    underlying_allocator->ResetCounter();
-    allocator->ClearCache();
-    ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
-    ASSERT_EQ(underlying_allocator->GetFreeCount(), kTwo);
-  }
-}
-
-TEST(buffered_allocator, garbage_collection) {
-  std::unique_ptr<CPUAllocator> cpu_allocator(new CPUAllocator());
-  auto chunk = cpu_allocator->Allocate(2048);
-  auto allocator = GetBufferedAllocator(chunk.get(), false);
-  auto x1 = allocator->Allocate(1600);
-  auto x2 = allocator->Allocate(400);
-  x1 = nullptr;
-  x2 = nullptr;
-  auto x3 = allocator->Allocate(1600);
-  ASSERT_NE(x3, nullptr);
-  ASSERT_NE(x3->ptr(), nullptr);
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc
deleted file mode 100644
index 580cf1af56ab0ad2f096f9b6fefaff0ba0e501a0..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/cpu_allocator.h"
-#include <stdlib.h>
-#include <string>
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-bool CPUAllocator::IsAllocThreadSafe() const { return true; }
-
-void CPUAllocator::FreeImpl(Allocation *allocation) {
-  void *p = allocation->ptr();
-#ifdef _WIN32
-  _aligned_free(p);
-#else
-  free(p);
-#endif
-  delete allocation;
-}
-
-Allocation *CPUAllocator::AllocateImpl(size_t size) {
-  void *p;
-#ifdef _WIN32
-  p = _aligned_malloc(size, kAlignment);
-#else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!",
-                    size);
-#endif
-  return new Allocation(p, size, platform::CPUPlace());
-}
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
deleted file mode 100644
index 058ff63381658da698841c839425dec000a748da..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-#ifdef _WIN32
-#define posix_memalign_free _aligned_free
-#define posix_memalign(p, a, s) \
-  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
-#endif
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-// CPU system allocator and allocation.
-//
-// NOTE(yy): Should we just use `malloc` here since there is an
-// aligned_allocator.
-//
-// NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import
-// an open-sourced allocator into Paddle.
-class CPUAllocator : public Allocator {
- public:
-  constexpr static size_t kAlignment = 4096UL;
-  bool IsAllocThreadSafe() const override;
-
- protected:
-  void FreeImpl(Allocation* allocation) override;
-  Allocation* AllocateImpl(size_t size) override;
-};
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
deleted file mode 100644
index 2ba3b6d0b5b2a89fcbea472c2eb90d7874ce0104..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/cuda_allocator.h"
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <string>
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
-void CUDAAllocator::FreeImpl(Allocation* allocation) {
-  platform::CUDADeviceGuard guard(place_.device);
-  PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(allocation->place()),
-                    place_);
-  PADDLE_ENFORCE(cudaFree(allocation->ptr()));
-  delete allocation;
-}
-
-Allocation* CUDAAllocator::AllocateImpl(size_t size) {
-  platform::CUDADeviceGuard guard(place_.device);
-  void* ptr;
-  auto status = cudaMalloc(&ptr, size);
-  if (UNLIKELY(status != cudaSuccess)) {
-    PADDLE_ENFORCE_NE(cudaGetLastError(), cudaSuccess);
-    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s",
-                           size, place_.device, status,
-                           cudaGetErrorString(status));
-  }
-  return new Allocation(ptr, size, platform::Place(place_));
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h
deleted file mode 100644
index 886f6e7a327f70068c6fabb6328f927bf71b2881..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/cuda_allocator.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class CUDAAllocator : public Allocator {
- public:
-  explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {}
-  explicit CUDAAllocator(const platform::Place& place)
-      : place_(boost::get<platform::CUDAPlace>(place)) {}
-  bool IsAllocThreadSafe() const override;
-
- protected:
-  void FreeImpl(Allocation* allocation) override;
-  Allocation* AllocateImpl(size_t size) override;
-
- private:
-  platform::CUDAPlace place_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
deleted file mode 100644
index 1f8ad370bf2f7ed780e45f5775e8e599bdfbed71..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cuda_runtime.h>
-#include <map>
-#include <memory>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-
-namespace memory {
-namespace allocation {
-
-/**
- * CUDADeviceContextAllocation is a wrapper of the underbeneath allocation.
- * CUDADeviceContextAllocation adds a CUDA stream callback for the underbeneath
- * allocation so that CUDADeviceContextAllocation can be used in a CUDA stream
- * which deletes allocation in the callback.
- */
-class CUDADeviceContextAllocation : public Allocation {
- public:
-  explicit CUDADeviceContextAllocation(AllocationPtr allocation)
-      : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
-        underlying_allocation_(std::move(allocation)) {}
-
-  ~CUDADeviceContextAllocation() {
-    PADDLE_ENFORCE_NOT_NULL(
-        dev_ctx_, "Didn't set device context for CUDADeviceContextAllocation");
-    auto *p_allocation = underlying_allocation_.release();
-    VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at "
-            << p_allocation;
-    dev_ctx_->AddStreamCallback([p_allocation] {
-      VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation;
-      AllocationDeleter()(p_allocation);
-    });
-  }
-
-  void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) {
-    dev_ctx_ = dev_ctx;
-  }
-
- private:
-  AllocationPtr underlying_allocation_;
-  const platform::CUDADeviceContext *dev_ctx_{nullptr};
-};
-
-/**
- * CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation
- * after waiting for a self-created event on the default stream. It does so to
- * let the non-default stream be able to allocate GPU memory which will be
- * released by stream callback
- */
-class CUDADeviceContextAllocator : public Allocator {
- public:
-  explicit CUDADeviceContextAllocator(platform::CUDAPlace place,
-                                      cudaStream_t default_stream)
-      : place_(place), default_stream_(default_stream) {
-    platform::CUDADeviceGuard guard(place_.device);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventCreate(&event_, cudaEventDisableTiming),
-        "Create event failed in CUDADeviceContextAllocator");
-  }
-
-  ~CUDADeviceContextAllocator() {
-    if (event_) {
-      platform::CUDADeviceGuard guard(place_.device);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventDestroy(event_),
-          "Destory event failed in CUDADeviceContextAllocator destroctor");
-    }
-  }
-
- protected:
-  Allocation *AllocateImpl(size_t size) override {
-    PADDLE_ENFORCE_NOT_NULL(
-        default_stream_,
-        "Didn't set default stream for CUDADeviceContextAllocator");
-    platform::CUDADeviceGuard guard(place_.device);
-    auto allocation =
-        new CUDADeviceContextAllocation(memory::Alloc(place_, size));
-    // Wait for the event on stream
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventRecord(event_, default_stream_),
-        "Failed to record event in CUDADeviceContextAllocator");
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaStreamWaitEvent(default_stream_, event_, 0),
-        "Failed to wait event in CUDADeviceContextAllocator");
-    return allocation;
-  }
-
-  void FreeImpl(Allocation *allocation) override { delete allocation; }
-
- private:
-  platform::CUDAPlace place_;
-  cudaEvent_t event_{nullptr};
-  cudaStream_t default_stream_{nullptr};
-};
-
-/**
- * CUDADeviceContextAllocatorPool is a singletion stores mapping from
- * CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a
- * CUDADeviceContext's compute stream isn't default stream, it can call this
- * class to allocate GPU memory which will be released by a callback after
- * stream execution.
- */
-class CUDADeviceContextAllocatorPool {
- public:
-  static CUDADeviceContextAllocatorPool &Instance() {
-    static CUDADeviceContextAllocatorPool pool;
-    return pool;
-  }
-
-  AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) {
-    auto iter =
-        allocators_.find(boost::get<platform::CUDAPlace>(dev_ctx.GetPlace()));
-    PADDLE_ENFORCE_EQ(iter != allocators_.end(), true,
-                      "CUDADeviceContextAllocatorPool initialization error");
-    auto &allocator = iter->second;
-    AllocationPtr allocation = allocator->Allocate(size);
-    static_cast<CUDADeviceContextAllocation *>(allocation.get())
-        ->SetCUDADeviceContext(&dev_ctx);
-    return allocation;
-  }
-
- private:
-  CUDADeviceContextAllocatorPool() {
-    std::vector<int> devices = platform::GetSelectedDevices();
-    for (int i : devices) {
-      auto place = platform::CUDAPlace(i);
-      auto compute_stream =
-          platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
-      auto allocator = std::shared_ptr<CUDADeviceContextAllocator>(
-          new CUDADeviceContextAllocator(place, compute_stream));
-      allocators_.insert(make_pair(place, allocator));
-    }
-  }
-
-  std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>>
-      allocators_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
deleted file mode 100644
index a912807645bafee3c1cb63f03ff456418033b416..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/locked_allocator.h"
-#include <mutex>  // NOLINT
-#include <utility>
-#include "paddle/fluid/platform/lock_guard_ptr.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-bool LockedAllocator::IsAllocThreadSafe() const { return true; }
-
-LockedAllocator::LockedAllocator(
-    std::shared_ptr<Allocator> underlying_allocator)
-    : underlying_allocator_(std::move(underlying_allocator)) {
-  PADDLE_ENFORCE_NOT_NULL(underlying_allocator_);
-  if (!underlying_allocator_->IsAllocThreadSafe()) {
-    mtx_.reset(new std::mutex());
-  }
-}
-
-void LockedAllocator::FreeImpl(Allocation *allocation) {
-  platform::LockGuardPtr<std::mutex> guard(mtx_);
-  underlying_allocator_->Free(allocation);
-}
-
-Allocation *LockedAllocator::AllocateImpl(size_t size) {
-  platform::LockGuardPtr<std::mutex> guard(mtx_);
-  return underlying_allocator_->Allocate(size).release();
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
deleted file mode 100644
index 4af77e6e057f54d15dcb0248ba6cf36f6f00c2f1..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <memory>
-#include <mutex>   // NOLINT
-#include <thread>  // NOLINT
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-// A allocator to make underlying allocator thread safe.
-class LockedAllocator : public Allocator {
- public:
-  explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator);
-  bool IsAllocThreadSafe() const override;
-
- protected:
-  void FreeImpl(Allocation *allocation) override;
-  Allocation *AllocateImpl(size_t size) override;
-
- private:
-  std::shared_ptr<Allocator> underlying_allocator_;
-  std::unique_ptr<std::mutex> mtx_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
deleted file mode 100644
index 24df3ce3661ca9f05e8b78e78c46289535779b07..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ /dev/null
@@ -1,317 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "glog/logging.h"
-#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
-#include "paddle/fluid/memory/detail/buddy_allocator.h"
-#include "paddle/fluid/memory/detail/system_allocator.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/fluid/string/split.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#endif
-
-DEFINE_bool(init_allocated_mem, false,
-            "It is a mistake that the values of the memory allocated by "
-            "BuddyAllocator are always zeroed in some op's implementation. "
-            "To find this error in time, we use init_allocated_mem to indicate "
-            "that initializing the allocated memory with a small value "
-            "during unit testing.");
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-DECLARE_bool(benchmark);
-
-namespace paddle {
-namespace memory {
-namespace legacy {
-template <typename Place>
-void *Alloc(const Place &place, size_t size);
-
-template <typename Place>
-void Free(const Place &place, void *p, size_t size);
-
-template <typename Place>
-size_t Used(const Place &place);
-
-struct Usage : public boost::static_visitor<size_t> {
-  size_t operator()(const platform::CPUPlace &cpu) const;
-  size_t operator()(const platform::CUDAPlace &gpu) const;
-  size_t operator()(const platform::CUDAPinnedPlace &cuda_pinned) const;
-};
-
-size_t memory_usage(const platform::Place &p);
-
-using BuddyAllocator = detail::BuddyAllocator;
-
-BuddyAllocator *GetCPUBuddyAllocator() {
-  // We tried thread_local for inference::RNN1 model, but that not works much
-  // for multi-thread test.
-  static std::once_flag init_flag;
-  static detail::BuddyAllocator *a = nullptr;
-
-  std::call_once(init_flag, []() {
-    a = new detail::BuddyAllocator(
-        std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
-  });
-
-  return a;
-}
-
-template <>
-void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
-  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
-  void *p = GetCPUBuddyAllocator()->Alloc(size);
-  if (FLAGS_init_allocated_mem) {
-    memset(p, 0xEF, size);
-  }
-  VLOG(10) << "  pointer=" << p;
-  return p;
-}
-
-template <>
-void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
-                              size_t size) {
-  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
-  GetCPUBuddyAllocator()->Free(p);
-}
-
-template <>
-size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
-  return GetCPUBuddyAllocator()->Used();
-}
-
-#ifdef PADDLE_WITH_CUDA
-BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
-  static std::once_flag init_flag;
-  static detail::BuddyAllocator **a_arr = nullptr;
-  static std::vector<int> devices;
-
-  std::call_once(init_flag, [gpu_id]() {
-    devices = platform::GetSelectedDevices();
-    int gpu_num = devices.size();
-    a_arr = new BuddyAllocator *[gpu_num];
-
-    for (size_t i = 0; i < devices.size(); ++i) {
-      int dev_id = devices[i];
-      a_arr[i] = nullptr;
-      platform::SetDeviceId(dev_id);
-      a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
-                                        new detail::GPUAllocator(dev_id)),
-                                    platform::GpuMinChunkSize(),
-                                    platform::GpuMaxChunkSize());
-
-      VLOG(10) << "\n\nNOTE:\n"
-               << "You can set GFlags environment variable "
-               << "'FLAGS_fraction_of_gpu_memory_to_use' "
-               << "or 'FLAGS_initial_gpu_memory_in_mb' "
-               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
-               << "to change the memory size for GPU usage.\n"
-               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
-               << FLAGS_fraction_of_gpu_memory_to_use
-               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
-               << FLAGS_initial_gpu_memory_in_mb
-               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
-               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
-    }
-    platform::SetDeviceId(gpu_id);
-  });
-
-  auto pos = std::distance(devices.begin(),
-                           std::find(devices.begin(), devices.end(), gpu_id));
-  return a_arr[pos];
-}
-#endif
-
-template <>
-size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) {
-#ifdef PADDLE_WITH_CUDA
-  return GetGPUBuddyAllocator(place.device)->Used();
-#else
-  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
-#endif
-}
-
-template <>
-void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
-                                 size_t size) {
-#ifdef PADDLE_WITH_CUDA
-  auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
-  auto *ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr) {
-    platform::CUDADeviceGuard(place.device);
-    size_t avail, total;
-    platform::GpuMemoryUsage(&avail, &total);
-    LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
-               << " in GPU " << place.device << ", available "
-               << string::HumanReadableSize(avail) << ", total "
-               << string::HumanReadableSize(total) << ", GpuMinChunkSize "
-               << string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
-               << ", GpuMaxChunkSize "
-               << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
-               << ", GPU memory used: "
-               << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
-  } else {
-    if (FLAGS_init_allocated_mem) {
-      cudaMemset(ptr, 0xEF, size);
-    }
-  }
-  return ptr;
-#else
-  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
-#endif
-}
-
-template <>
-void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
-                               size_t size) {
-#ifdef PADDLE_WITH_CUDA
-  GetGPUBuddyAllocator(place.device)->Free(p);
-#else
-  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
-#endif
-}
-
-#ifdef PADDLE_WITH_CUDA
-BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
-  static std::once_flag init_flag;
-  static BuddyAllocator *ba = nullptr;
-
-  std::call_once(init_flag, []() {
-    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
-                                new detail::CUDAPinnedAllocator),
-                            platform::CUDAPinnedMinChunkSize(),
-                            platform::CUDAPinnedMaxChunkSize());
-  });
-
-  return ba;
-}
-#endif
-
-template <>
-size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) {
-#ifdef PADDLE_WITH_CUDA
-  return GetCUDAPinnedBuddyAllocator()->Used();
-#else
-  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
-#endif
-}
-
-template <>
-void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
-                                       size_t size) {
-#ifdef PADDLE_WITH_CUDA
-  auto *buddy_allocator = GetCUDAPinnedBuddyAllocator();
-  void *ptr = buddy_allocator->Alloc(size);
-
-  if (ptr == nullptr) {
-    LOG(WARNING) << "cudaHostAlloc Cannot allocate " << size
-                 << " bytes in CUDAPinnedPlace";
-  }
-  if (FLAGS_init_allocated_mem) {
-    memset(ptr, 0xEF, size);
-  }
-  return ptr;
-#else
-  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
-#endif
-}
-
-template <>
-void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
-                                     void *p, size_t size) {
-#ifdef PADDLE_WITH_CUDA
-  GetCUDAPinnedBuddyAllocator()->Free(p);
-#else
-  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
-#endif
-}
-
-struct AllocVisitor : public boost::static_visitor<void *> {
-  inline explicit AllocVisitor(size_t size) : size_(size) {}
-
-  template <typename Place>
-  inline void *operator()(const Place &place) const {
-    return Alloc<Place>(place, size_);
-  }
-
- private:
-  size_t size_;
-};
-
-struct FreeVisitor : public boost::static_visitor<void> {
-  inline explicit FreeVisitor(void *ptr, size_t size)
-      : ptr_(ptr), size_(size) {}
-
-  template <typename Place>
-  inline void operator()(const Place &place) const {
-    Free<Place>(place, ptr_, size_);
-  }
-
- private:
-  void *ptr_;
-  size_t size_;
-};
-
-size_t Usage::operator()(const platform::CPUPlace &cpu) const {
-  return Used(cpu);
-}
-
-size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
-#ifdef PADDLE_WITH_CUDA
-  return Used(gpu);
-#else
-  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
-#endif
-}
-
-size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
-#ifdef PADDLE_WITH_CUDA
-  return Used(cuda_pinned);
-#else
-  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
-#endif
-}
-}  // namespace legacy
-
-namespace allocation {
-
-Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
-  void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
-  auto *tmp_alloc = new Allocation(ptr, size, place_);
-  platform::MemEvenRecorder::Instance().PushMemRecord(
-      static_cast<void *>(tmp_alloc), place_, size);
-  return tmp_alloc;
-}
-
-void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) {
-  boost::apply_visitor(
-      legacy::FreeVisitor(allocation->ptr(), allocation->size()),
-      allocation->place());
-  platform::MemEvenRecorder::Instance().PopMemRecord(
-      static_cast<void *>(allocation), place_);
-  delete allocation;
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
deleted file mode 100644
index 913d583099c3f403a8262ff716fcd4c9ab930d22..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include <mutex>  // NOLINT
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/place.h"
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class NaiveBestFitAllocator : public Allocator {
- public:
-  explicit NaiveBestFitAllocator(const platform::Place &p) : place_(p) {}
-
-  bool IsAllocThreadSafe() const override { return true; }
-
- protected:
-  Allocation *AllocateImpl(size_t size) override;
-  void FreeImpl(Allocation *allocation) override;
-
- private:
-  platform::Place place_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
deleted file mode 100644
index 35391167fe66b9b941e3a5359db452ced7995762..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/pinned_allocator.h"
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
-void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
-  PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
-  delete allocation;
-}
-Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
-  void *ptr;
-  PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
-  return new Allocation(ptr, size, platform::CUDAPinnedPlace());
-}
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h
deleted file mode 100644
index 4f535ef33734a3c6f7048ae6538e4332e0c9e8e4..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-// Allocator uses `cudaHostAlloc`
-class CPUPinnedAllocator : public Allocator {
- public:
-  bool IsAllocThreadSafe() const override;
-
- protected:
-  void FreeImpl(Allocation *allocation) override;
-  Allocation *AllocateImpl(size_t size) override;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
deleted file mode 100644
index ae6af53241dfee50ff69bf039d69b3e119a21bfb..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/retry_allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class WaitedAllocateSizeGuard {
- public:
-  WaitedAllocateSizeGuard(std::atomic<size_t>* waited_size,
-                          size_t requested_size)
-      : waited_size_(waited_size), requested_size_(requested_size) {
-    waited_size_->fetch_add(requested_size_,
-                            std::memory_order::memory_order_relaxed);
-  }
-
-  ~WaitedAllocateSizeGuard() {
-    waited_size_->fetch_sub(requested_size_,
-                            std::memory_order::memory_order_relaxed);
-  }
-
- private:
-  std::atomic<size_t>* waited_size_;
-  size_t requested_size_;
-};
-
-void RetryAllocator::FreeImpl(Allocation* allocation) {
-  // Delete underlying allocation first.
-  size_t size = allocation->size();
-  underlying_allocator_->Free(allocation);
-  if (UNLIKELY(waited_allocate_size_)) {
-    VLOG(10) << "Free " << size << " bytes and notify all waited threads, "
-                                   "where waited_allocate_size_ = "
-             << waited_allocate_size_;
-    cv_.notify_all();
-  }
-}
-
-Allocation* RetryAllocator::AllocateImpl(size_t size) {
-  auto alloc_func = [&, this]() {
-    return underlying_allocator_->Allocate(size).release();
-  };
-  // In fact, we can unify the code of allocation success and failure
-  // But it would add lock even when allocation success at the first time
-  try {
-    return alloc_func();
-  } catch (BadAlloc&) {
-    {
-      WaitedAllocateSizeGuard guard(&waited_allocate_size_, size);
-      VLOG(10) << "Allocation failed when allocating " << size
-               << " bytes, waited_allocate_size_ = " << waited_allocate_size_;
-      // We can just write allocation retry inside the predicate function of
-      // wait_until. But it needs to acquire the lock when executing predicate
-      // function. For better performance, we use loop here
-      auto end_time = std::chrono::high_resolution_clock::now() + retry_time_;
-      auto wait_until = [&, this] {
-        std::unique_lock<std::mutex> lock(mutex_);
-        return cv_.wait_until(lock, end_time);
-      };
-
-      size_t retry_time = 0;
-      while (wait_until() != std::cv_status::timeout) {
-        try {
-          return alloc_func();
-        } catch (BadAlloc&) {
-          // do nothing when it is not timeout
-          ++retry_time;
-          VLOG(10) << "Allocation failed when retrying " << retry_time
-                   << " times when allocating " << size
-                   << " bytes. Wait still.";
-        } catch (...) {
-          throw;
-        }
-      }
-    }
-    VLOG(10) << "Allocation failed because of timeout when allocating " << size
-             << " bytes.";
-    return alloc_func();  // If timeout, try last allocation request.
-  } catch (...) {
-    throw;
-  }
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
deleted file mode 100644
index 7c218e25c45286aeca194f6bf213814f0e5ec98b..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <atomic>              // NOLINT
-#include <chrono>              // NOLINT
-#include <condition_variable>  // NOLINT
-#include <memory>
-#include <mutex>  // NOLINT
-#include <utility>
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class RetryAllocator : public Allocator {
- public:
-  RetryAllocator(std::shared_ptr<Allocator> allocator, size_t retry_ms)
-      : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) {
-    PADDLE_ENFORCE_NOT_NULL(
-        underlying_allocator_,
-        "UnderlyingAllocator of RetryAllocator must not be null");
-    PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(),
-                   "UnderlyingAllocator of RetryAllocator must be thread-safe");
-  }
-
-  bool IsAllocThreadSafe() const override { return true; }
-
- protected:
-  void FreeImpl(Allocation* allocation) override;
-  Allocation* AllocateImpl(size_t size) override;
-
- private:
-  std::shared_ptr<Allocator> underlying_allocator_;
-  std::chrono::milliseconds retry_time_;
-  std::mutex mutex_;
-  std::condition_variable cv_;
-
-  std::atomic<size_t> waited_allocate_size_{0};
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
deleted file mode 100644
index 11a8dfdc472d6b6cdec2b3f618aeb7065a10447b..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/retry_allocator.h"
-#include <algorithm>
-#include <chrono>              // NOLINT
-#include <condition_variable>  // NOLINT
-#include <mutex>               // NOLINT
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
-#include "paddle/fluid/memory/allocation/cpu_allocator.h"
-#include "paddle/fluid/memory/allocation/locked_allocator.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/memory/allocation/cuda_allocator.h"
-#endif
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-TEST(RetryAllocator, RetryAllocator) {
-  CPUAllocator cpu_allocator;
-
-  size_t size = (1 << 20);
-  auto cpu_allocation = cpu_allocator.Allocate(size);
-
-  std::unique_ptr<BestFitAllocator> best_fit_allocator(
-      new BestFitAllocator(cpu_allocation.get()));
-  std::unique_ptr<LockedAllocator> locked_allocator(
-      new LockedAllocator(std::move(best_fit_allocator)));
-
-  size_t thread_num = 4;
-  size_t sleep_time = 40;
-  size_t extra_time = 10;
-
-  // Reserve to perform more tests in the future
-  std::vector<std::shared_ptr<Allocator>> allocators;
-  {
-    std::unique_ptr<BestFitAllocator> best_fit_allocator(
-        new BestFitAllocator(cpu_allocation.get()));
-    std::unique_ptr<LockedAllocator> locked_allocator(
-        new LockedAllocator(std::move(best_fit_allocator)));
-    allocators.push_back(std::make_shared<RetryAllocator>(
-        std::move(locked_allocator),
-        (thread_num - 1) * (sleep_time + extra_time)));
-  }
-
-  for (auto &allocator : allocators) {
-    std::vector<std::thread> threads(thread_num);
-    std::vector<void *> addresses(threads.size(), nullptr);
-
-    std::mutex mutex;
-    std::condition_variable cv;
-    bool flag = false;
-
-    for (size_t i = 0; i < threads.size(); ++i) {
-      threads[i] = std::thread([&, i]() {
-        {
-          std::unique_lock<std::mutex> lock(mutex);
-          cv.wait(lock, [&] { return flag; });
-        }
-
-        auto ret = allocator->Allocate(size - 1);
-        addresses[i] = ret->ptr();
-        std::this_thread::sleep_for(std::chrono::milliseconds(sleep_time));
-      });
-    }
-
-    {
-      std::lock_guard<std::mutex> lock(mutex);
-      flag = true;
-      cv.notify_all();
-    }
-
-    for (auto &th : threads) {
-      th.join();
-    }
-
-    void *val = cpu_allocation->ptr();
-    bool is_all_equal = std::all_of(addresses.begin(), addresses.end(),
-                                    [val](void *p) { return p == val; });
-    ASSERT_TRUE(is_all_equal);
-  }
-}
-
-class DummyAllocator : public Allocator {
- public:
-  bool IsAllocThreadSafe() const override { return true; }
-
- protected:
-  Allocation *AllocateImpl(size_t size) override {
-    PADDLE_THROW_BAD_ALLOC("Always BadAlloc");
-  }
-
-  void FreeImpl(Allocation *) override {}
-};
-
-TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
-  size_t retry_ms = 10;
-  {
-    RetryAllocator allocator(std::make_shared<DummyAllocator>(), retry_ms);
-    try {
-      auto allocation = allocator.Allocate(100);
-      ASSERT_TRUE(false);
-      allocation.reset();
-    } catch (BadAlloc &ex) {
-      ASSERT_TRUE(std::string(ex.what()).find("Always BadAlloc") !=
-                  std::string::npos);
-    }
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    platform::CUDAPlace p(0);
-    RetryAllocator allocator(std::make_shared<CUDAAllocator>(p), retry_ms);
-    size_t allocate_size = (static_cast<size_t>(1) << 40);  // Very large number
-    try {
-      auto allocation = allocator.Allocate(allocate_size);
-      ASSERT_TRUE(false);
-      allocation.reset();
-    } catch (BadAlloc &ex) {
-      ASSERT_TRUE(std::string(ex.what()).find(
-                      "Cannot allocate " + std::to_string(allocate_size) +
-                      " on GPU " + std::to_string(p.device)) !=
-                  std::string::npos);
-    }
-  }
-#endif
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/test_aligned_allocator.cc b/paddle/fluid/memory/allocation/test_aligned_allocator.cc
deleted file mode 100644
index 41936ab347d9905693b390ac2281adaa710aafb3..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/test_aligned_allocator.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/aligned_allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-TEST(aligned, aligned_size) {
-  ASSERT_EQ(AlignedSize(1024, 1024), 1024);
-  ASSERT_EQ(AlignedSize(1023, 1024), 1024);
-  ASSERT_EQ(AlignedSize(1025, 1024), 2048);
-}
-
-struct StubAllocator : public Allocator {
- public:
-  StubAllocator() = default;
-
-  size_t AllocNum() const { return alloc_num_; }
-
- protected:
-  Allocation *AllocateImpl(size_t size) override {
-    ++alloc_num_;
-    return new Allocation(new uint8_t[size], size, platform::CPUPlace());
-  }
-
-  void FreeImpl(Allocation *allocation) override {
-    delete[] static_cast<uint8_t *>(allocation->ptr());
-    delete allocation;
-    --alloc_num_;
-  }
-
- private:
-  size_t alloc_num_{0};
-};
-
-bool IsAligned(const AllocationPtr &alloc, size_t alignment) {
-  return reinterpret_cast<uintptr_t>(alloc->ptr()) % alignment == 0;
-}
-
-TEST(aligned_allocator, aligned_allocator) {
-  size_t alignment = 1024;
-  auto allocator = std::make_shared<StubAllocator>();
-  auto aligned_allocator =
-      std::make_shared<AlignedAllocator>(allocator, alignment);
-
-  auto alloc1 = aligned_allocator->Allocate(1345);
-  ASSERT_EQ(allocator->AllocNum(), 1);
-  ASSERT_TRUE(IsAligned(alloc1, alignment));
-  alloc1.reset();
-  ASSERT_EQ(allocator->AllocNum(), 0);
-
-  {
-    auto alloc2 = aligned_allocator->Allocate(200);
-    ASSERT_TRUE(IsAligned(alloc2, alignment));
-    ASSERT_EQ(allocator->AllocNum(), 1);
-
-    auto alloc3 = aligned_allocator->Allocate(3021);
-    ASSERT_TRUE(IsAligned(alloc3, alignment));
-    ASSERT_EQ(allocator->AllocNum(), 2);
-  }
-
-  ASSERT_EQ(allocator->AllocNum(), 0);
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
deleted file mode 100644
index e1c9a4f021e9ad104ba0e25972fe3d47e3dffee3..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
-
-if(${WITH_GPU})
-  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
-else(${WITH_GPU})
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
-endif(${WITH_GPU})
-
-cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
-
-cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
-
-cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator)
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
deleted file mode 100644
index 3e4af0a47cec53e8743e81e1cf3a354e9e243dbc..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ /dev/null
@@ -1,274 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/detail/buddy_allocator.h"
-
-#include <algorithm>
-#include <utility>
-
-#include "glog/logging.h"
-
-#ifdef PADDLE_WITH_CUDA
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-#endif
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-BuddyAllocator::BuddyAllocator(
-    std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
-    size_t max_chunk_size)
-    : min_chunk_size_(min_chunk_size),
-      max_chunk_size_(max_chunk_size),
-      cache_(system_allocator->UseGpu()),
-      system_allocator_(std::move(system_allocator)) {}
-
-BuddyAllocator::~BuddyAllocator() {
-  VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these "
-              "have actually been freed";
-  while (!pool_.empty()) {
-    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << block->size(cache_)
-             << ")";
-
-    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
-    cache_.invalidate(block);
-    pool_.erase(pool_.begin());
-  }
-}
-
-inline size_t align(size_t size, size_t alignment) {
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-
-void* BuddyAllocator::Alloc(size_t unaligned_size) {
-  // adjust allocation alignment
-  size_t size =
-      align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_);
-
-  // acquire the allocator lock
-  std::lock_guard<std::mutex> lock(mutex_);
-
-  VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size "
-           << size;
-
-  // if the allocation is huge, send directly to the system allocator
-  if (size > max_chunk_size_) {
-    VLOG(10) << "Allocate from system allocator.";
-    return SystemAlloc(size);
-  }
-
-  // query and allocate from the existing chunk
-  auto it = FindExistChunk(size);
-
-  // refill the pool if failure
-  if (it == pool_.end()) {
-    it = RefillPool(size);
-    // if still failure, fail fatally
-    if (it == pool_.end()) {
-      return nullptr;
-    }
-  } else {
-    VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
-             << " at address "
-             << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
-  }
-
-  total_used_ += size;
-  total_free_ -= size;
-
-  // split the allocation and return data for use
-  return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
-}
-
-void BuddyAllocator::Free(void* p) {
-  // Point back to metadata
-  auto block = static_cast<MemoryBlock*>(p)->metadata();
-
-  // Acquire the allocator lock
-  std::lock_guard<std::mutex> lock(mutex_);
-
-  VLOG(10) << "Free from address " << block;
-
-  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
-    VLOG(10) << "Free directly from system allocator";
-    system_allocator_->Free(block, block->total_size(cache_),
-                            block->index(cache_));
-
-    // Invalidate GPU allocation from cache
-    cache_.invalidate(block);
-
-    return;
-  }
-
-  block->mark_as_free(&cache_);
-
-  total_used_ -= block->total_size(cache_);
-  total_free_ += block->total_size(cache_);
-
-  // Trying to merge the right buddy
-  if (block->has_right_buddy(cache_)) {
-    VLOG(10) << "Merging this block " << block << " with its right buddy "
-             << block->right_buddy(cache_);
-
-    auto right_buddy = block->right_buddy(cache_);
-
-    if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
-      // Take away right buddy from pool
-      pool_.erase(IndexSizeAddress(right_buddy->index(cache_),
-                                   right_buddy->total_size(cache_),
-                                   right_buddy));
-
-      // merge its right buddy to the block
-      block->merge(&cache_, right_buddy);
-    }
-  }
-
-  // Trying to merge the left buddy
-  if (block->has_left_buddy(cache_)) {
-    VLOG(10) << "Merging this block " << block << " with its left buddy "
-             << block->left_buddy(cache_);
-
-    auto left_buddy = block->left_buddy(cache_);
-
-    if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
-      // Take away right buddy from pool
-      pool_.erase(IndexSizeAddress(left_buddy->index(cache_),
-                                   left_buddy->total_size(cache_), left_buddy));
-
-      // merge the block to its left buddy
-      left_buddy->merge(&cache_, block);
-      block = left_buddy;
-    }
-  }
-
-  // Dumping this block into pool
-  VLOG(10) << "Inserting free block (" << block << ", "
-           << block->total_size(cache_) << ")";
-  pool_.insert(
-      IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
-}
-
-size_t BuddyAllocator::Used() { return total_used_; }
-size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
-size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
-
-void* BuddyAllocator::SystemAlloc(size_t size) {
-  size_t index = 0;
-  void* p = system_allocator_->Alloc(&index, size);
-
-  VLOG(10) << "Allocated " << p << " from system allocator.";
-
-  if (p == nullptr) return nullptr;
-
-  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
-                                     size, nullptr, nullptr);
-
-  return static_cast<MemoryBlock*>(p)->data();
-}
-
-BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
-    size_t request_bytes) {
-  size_t allocate_bytes = max_chunk_size_;
-  size_t index = 0;
-
-#ifdef PADDLE_WITH_CUDA
-  if (system_allocator_->UseGpu()) {
-    if ((total_used_ + total_free_) == 0) {
-      // Compute the allocation size for gpu for the first allocation.
-      allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
-    } else {
-      // Compute the re-allocation size, we store the re-allocation size when
-      // user set FLAGS_reallocate_gpu_memory_in_mb to fix value.
-      if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
-        realloc_size_ = platform::GpuReallocSize();
-      }
-      allocate_bytes = std::max(realloc_size_, request_bytes);
-    }
-  }
-#endif
-
-  // Allocate a new block
-  void* p = system_allocator_->Alloc(&index, allocate_bytes);
-
-  if (p == nullptr) return pool_.end();
-
-  VLOG(10) << "Creating and inserting new block " << p
-           << " from system allocator";
-
-  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     allocate_bytes, nullptr, nullptr);
-
-  total_free_ += allocate_bytes;
-
-  // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
-}
-
-BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
-  size_t index = 0;
-
-  while (1) {
-    auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr));
-
-    // no match chunk memory
-    if (it == pool_.end()) return it;
-
-    if (std::get<0>(*it) > index) {
-      // find suitable one
-      if (std::get<1>(*it) >= size) {
-        return it;
-      }
-      // update and continue
-      index = std::get<0>(*it);
-      continue;
-    }
-    return it;
-  }
-}
-
-void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
-                                   size_t size) {
-  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
-  pool_.erase(it);
-
-  VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
-           << ") into";
-  block->split(&cache_, size);
-
-  VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
-           << ")";
-  block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);
-
-  // the rest of memory if exist
-  if (block->has_right_buddy(cache_)) {
-    if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
-      VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", "
-               << block->right_buddy(cache_)->total_size(cache_) << ")";
-
-      pool_.insert(
-          IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
-                           block->right_buddy(cache_)->total_size(cache_),
-                           block->right_buddy(cache_)));
-    }
-  }
-
-  return block;
-}
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
deleted file mode 100644
index 791f8b56277723c59ea47e60c0d8d9eec9745fc4..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <mutex>  // NOLINT
-#include <set>
-#include <tuple>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/memory/detail/system_allocator.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-class BuddyAllocator {
- public:
-  BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
-                 size_t min_chunk_size, size_t max_chunk_size);
-
-  ~BuddyAllocator();
-
- public:
-  void* Alloc(size_t unaligned_size);
-  void Free(void* ptr);
-  size_t Used();
-  size_t GetMinChunkSize();
-  size_t GetMaxChunkSize();
-
- public:
-  // Disable copy and assignment
-  BuddyAllocator(const BuddyAllocator&) = delete;
-  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
-
- private:
-  // Tuple (allocator index, memory size, memory address)
-  using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
-  // Each element in PoolSet is a free allocation
-  using PoolSet = std::set<IndexSizeAddress>;
-
-  /*! \brief Allocate fixed-size memory from system */
-  void* SystemAlloc(size_t size);
-
-  /*! \brief If existing chunks are not suitable, refill pool */
-  PoolSet::iterator RefillPool(size_t request_bytes);
-
-  /**
-   *  \brief   Find the suitable chunk from existing pool and split
-   *           it to left and right buddies
-   *
-   *  \param   it     the iterator of pool list
-   *  \param   size   the size of allocation
-   *
-   *  \return  the left buddy address
-   */
-  void* SplitToAlloc(PoolSet::iterator it, size_t size);
-
-  /*! \brief Find the existing chunk which used to allocation */
-  PoolSet::iterator FindExistChunk(size_t size);
-
- private:
-  size_t total_used_ = 0;  // the total size of used memory
-  size_t total_free_ = 0;  // the total size of free memory
-
-  size_t min_chunk_size_;  // the minimum size of each chunk
-  size_t max_chunk_size_;  // the maximum size of each chunk
-
-  size_t realloc_size_ = 0;  // the size of re-allocated chunk
-
- private:
-  /**
-   * \brief A list of free allocation
-   *
-   * \note  Only store free chunk memory in pool
-   */
-  PoolSet pool_;
-
- private:
-  /*! Unify the metadata format between GPU and CPU allocations */
-  MetadataCache cache_;
-
- private:
-  /*! Allocate CPU/GPU memory from system */
-  std::unique_ptr<SystemAllocator> system_allocator_;
-  std::mutex mutex_;
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc
deleted file mode 100644
index dce48ac9ee81a0323957cb7118697feaab165c6a..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/buddy_allocator_test.cc
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/detail/buddy_allocator.h"
-
-#include <memory>
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/detail/system_allocator.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-#ifdef PADDLE_WITH_CUDA
-#include <cuda_runtime.h>
-
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-#endif
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-constexpr static int TEST_GPU_ID = 0;
-
-int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes,
-                        bool use_system_allocator = false,
-                        bool free_ptr = true) {
-  bool freed = false;
-  size_t used_bytes = allocator->Used();
-
-  if (size_bytes > 0) {
-    void* p = allocator->Alloc(size_bytes);
-
-    EXPECT_NE(p, nullptr);
-
-#ifdef PADDLE_WITH_CUDA
-    if (size_bytes < allocator->GetMaxChunkSize()) {
-#else
-    if (size_bytes < allocator->GetMaxChunkSize()) {
-#endif
-      // Not allocate from SystemAllocator
-      EXPECT_FALSE(use_system_allocator);
-      EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
-    } else {
-      // Allocate from SystemAllocator doesn't count in Used()
-      EXPECT_TRUE(use_system_allocator);
-      EXPECT_EQ(allocator->Used(), used_bytes);
-    }
-
-    int* intp = static_cast<int*>(p);
-    if (!free_ptr) {
-      return intp;
-    }
-    std::shared_ptr<int> ptr(intp, [&](void* p) {
-      allocator->Free(intp);
-      freed = true;
-    });
-  } else {
-    freed = true;
-  }
-
-  EXPECT_EQ(used_bytes, allocator->Used());
-  EXPECT_TRUE(freed);
-  return nullptr;
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(BuddyAllocator, GpuFraction) {
-  // In a 16 GB machine, the pool size will be about 160 MB
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
-      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
-
-  // Less than pool size
-  TestBuddyAllocator(&buddy_allocator, 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 20);
-
-  // Greater than max chunk size
-  TestBuddyAllocator(&buddy_allocator, 499 << 20,
-                     /* use_system_allocator = */ true);
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
-                     /* use_system_allocator = */ true);
-}
-
-TEST(BuddyAllocator, InitRealloc) {
-  FLAGS_initial_gpu_memory_in_mb = 100;
-  FLAGS_reallocate_gpu_memory_in_mb = 50;
-
-  EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
-
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
-      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
-
-  // Less then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  // Between initial size and reallocate size and not exceed pool
-  TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  TestBuddyAllocator(&buddy_allocator, 99 << 20);
-  // Greater than max chunk size
-  TestBuddyAllocator(&buddy_allocator, 101 << 20,
-                     /* use_system_allocator = */ true);
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
-                     /* use_system_allocator = */ true);
-}
-
-TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
-  FLAGS_initial_gpu_memory_in_mb = 5;
-  FLAGS_reallocate_gpu_memory_in_mb = 10;
-
-  EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
-
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
-      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
-
-  // Less than initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 1 << 20);
-  // Between initial size and reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 6 << 20);
-  TestBuddyAllocator(&buddy_allocator, 8 << 20);
-  TestBuddyAllocator(&buddy_allocator, 9 << 20);
-  // Greater than max trunk size
-  TestBuddyAllocator(&buddy_allocator, 11 << 20,
-                     /* use_system_allocator = */ true);
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
-                     /* use_system_allocator = */ true);
-}
-
-TEST(BuddyAllocator, FractionRefillPool) {
-  FLAGS_fraction_of_gpu_memory_to_use = 0.6;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  size_t max_chunk_size = platform::GpuMaxChunkSize();
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
-      platform::GpuMinChunkSize(), max_chunk_size);
-
-  // Less than pool size
-  int* p0 = TestBuddyAllocator(&buddy_allocator, max_chunk_size - 1000,
-                               /* use_system_allocator = */ false,
-                               /* free_ptr = */ false);
-  // Max chunk size should be same during allocation
-  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
-
-  size_t alloc =
-      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
-  // Exceed pool trigger refilling size of fraction of avaiable gpu, and should
-  // be able to alloc 60% of the remaining GPU
-  int* p1 = TestBuddyAllocator(&buddy_allocator, alloc,
-                               /* use_system_allocator = */ false,
-                               /* free_ptr = */ false);
-  // Max chunk size should be same during allocation
-  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
-
-  alloc =
-      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
-  // Exceed pool trigger refilling size of fraction of avaiable gpu, and should
-  // be able to alloc 60% of the remaining GPU
-  TestBuddyAllocator(&buddy_allocator, alloc,
-                     /* use_system_allocator = */ false);
-  // Max chunk size should be same during allocation
-  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
-
-  buddy_allocator.Free(p0);
-  buddy_allocator.Free(p1);
-}
-
-TEST(BuddyAllocator, AllocFromAvailable) {
-  FLAGS_fraction_of_gpu_memory_to_use = 0.7;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  size_t total = 0, available = 0;
-  platform::SetDeviceId(TEST_GPU_ID);
-  platform::GpuMemoryUsage(&available, &total);
-
-  // Take half of available GPU
-  void* p;
-  cudaError_t result = cudaMalloc(&p, available >> 1);
-  EXPECT_TRUE(result == cudaSuccess);
-
-  // BuddyAllocator should be able to alloc the remaining GPU
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
-      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
-
-  TestBuddyAllocator(&buddy_allocator, 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
-
-  if (p) {
-    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
-  }
-}
-
-TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
-  FLAGS_fraction_of_gpu_memory_to_use = 1.0;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  void* p = nullptr;
-  EXPECT_TRUE(cudaMalloc(&p, static_cast<size_t>(3) << 30) == cudaSuccess);
-
-  // BuddyAllocator should be able to alloc the remaining GPU
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
-      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
-
-  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
-  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(2) << 30);
-
-  if (p) {
-    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
-  }
-}
-
-#endif
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc
deleted file mode 100644
index 15e2e856385a14acbbb4717681be5b5181e9e522..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/memory_block.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-void MemoryBlock::init(MetadataCache* cache, Type t, size_t index, size_t size,
-                       void* left_buddy, void* right_buddy) {
-  cache->save(
-      this, MemoryBlock::Desc(t, index, size - sizeof(MemoryBlock::Desc), size,
-                              static_cast<MemoryBlock*>(left_buddy),
-                              static_cast<MemoryBlock*>(right_buddy)));
-}
-
-MemoryBlock::Type MemoryBlock::type(const MetadataCache& cache) const {
-  return cache.load(this).type;
-}
-
-size_t MemoryBlock::size(const MetadataCache& cache) const {
-  return cache.load(this).size;
-}
-
-size_t MemoryBlock::index(const MetadataCache& cache) const {
-  return cache.load(this).index;
-}
-
-size_t MemoryBlock::total_size(const MetadataCache& cache) const {
-  return cache.load(this).total_size;
-}
-
-bool MemoryBlock::has_left_buddy(const MetadataCache& cache) const {
-  return left_buddy(cache) != nullptr;
-}
-
-bool MemoryBlock::has_right_buddy(const MetadataCache& cache) const {
-  return right_buddy(cache) != nullptr;
-}
-
-MemoryBlock* MemoryBlock::left_buddy(const MetadataCache& cache) const {
-  return cache.load(this).left_buddy;
-}
-
-MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const {
-  return cache.load(this).right_buddy;
-}
-
-void MemoryBlock::split(MetadataCache* cache, size_t size) {
-  // make sure the split fits
-  PADDLE_ENFORCE_GE(total_size(*cache), size);
-
-  // bail out if there is no room for another partition
-  if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) {
-    return;
-  }
-
-  // find the position of the split
-  void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
-
-  size_t remaining_size = total_size(*cache) - size;
-
-  // Add the new block as a buddy
-  auto metadata = cache->load(this);
-
-  // Write the metadata for the new block
-  auto new_block_right_buddy = metadata.right_buddy;
-
-  cache->save(static_cast<MemoryBlock*>(right_partition),
-              MemoryBlock::Desc(FREE_CHUNK, index(*cache),
-                                remaining_size - sizeof(MemoryBlock::Desc),
-                                remaining_size, this, new_block_right_buddy));
-
-  metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
-  metadata.size = size - sizeof(MemoryBlock::Desc);
-  metadata.total_size = size;
-
-  cache->save(this, metadata);
-
-  // Write metadata for the new block's right buddy
-  if (new_block_right_buddy != nullptr) {
-    auto buddy_metadata = cache->load(new_block_right_buddy);
-
-    buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
-
-    cache->save(new_block_right_buddy, buddy_metadata);
-  }
-}
-
-void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
-  // only free blocks can be merged
-  PADDLE_ENFORCE_EQ(type(*cache), FREE_CHUNK);
-  PADDLE_ENFORCE_EQ(right_buddy->type(*cache), FREE_CHUNK);
-
-  auto metadata = cache->load(this);
-
-  // link this->buddy's buddy
-  metadata.right_buddy = right_buddy->right_buddy(*cache);
-
-  // link buddy's buddy -> this
-  if (metadata.right_buddy != nullptr) {
-    auto buddy_metadata = cache->load(metadata.right_buddy);
-
-    buddy_metadata.left_buddy = this;
-
-    cache->save(metadata.right_buddy, buddy_metadata);
-  }
-
-  metadata.size += right_buddy->total_size(*cache);
-  metadata.total_size += right_buddy->total_size(*cache);
-
-  cache->save(this, metadata);
-  cache->save(right_buddy,
-              MemoryBlock::Desc(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
-}
-
-void MemoryBlock::mark_as_free(MetadataCache* cache) {
-  // check for double free or corruption
-  PADDLE_ENFORCE_NE(type(*cache), FREE_CHUNK);
-  PADDLE_ENFORCE_NE(type(*cache), INVALID_CHUNK);
-  set_type(cache, FREE_CHUNK);
-}
-
-void MemoryBlock::set_type(MetadataCache* cache, Type t) {
-  auto metadata = cache->load(this);
-  metadata.type = t;
-  cache->save(this, metadata);
-}
-
-void* MemoryBlock::data() const {
-  return const_cast<MemoryBlock::Desc*>(
-             reinterpret_cast<const MemoryBlock::Desc*>(this)) +
-         1;
-}
-
-MemoryBlock* MemoryBlock::metadata() const {
-  return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
-      reinterpret_cast<const MemoryBlock::Desc*>(this) - 1));
-}
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h
deleted file mode 100644
index 5cceba659beeec1b3c986dc43229f6725e3e11de..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/memory_block.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <cstdint>
-#include <unordered_map>
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-// Forward declaration.
-class MetadataCache;
-
-// MemoryBlock represents Each allocated memory block, which contains
-// MemoryBlock::Desc and the payload.
-struct MemoryBlock {
-  enum Type {
-    FREE_CHUNK,    // memory is free and idle
-    ARENA_CHUNK,   // memory is being occupied
-    HUGE_CHUNK,    // memory is out of management
-    INVALID_CHUNK  // memory is invalid
-  };
-
-  // init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
-  // If it is a CPU memory block, the MetadataCache writes the
-  // MemoryBlock::Desc to the beginning of the block; or, if it is a GPU memory
-  // block, the MetadataCache writes the Meatadata to a std::map in
-  // the CPU.
-  void init(MetadataCache* cache, Type t, size_t index, size_t size,
-            void* left_buddy, void* right_buddy);
-
-  // All these accessors returns fields in the MemoryBlock::Desc of the memory
-  // block.  They all need a MetadataCache instance as their first
-  // parameter because they read the MemoryBlock::Desc from the cache.
-  Type type(const MetadataCache& cache) const;
-  size_t size(const MetadataCache& cache) const;
-  size_t index(const MetadataCache& cache) const;
-  size_t total_size(const MetadataCache& cache) const;
-  bool has_left_buddy(const MetadataCache& cache) const;
-  bool has_right_buddy(const MetadataCache& cache) const;
-  MemoryBlock* left_buddy(const MetadataCache& cache) const;
-  MemoryBlock* right_buddy(const MetadataCache& cache) const;
-
-  // Split the allocation into left/right blocks.
-  void split(MetadataCache* cache, size_t size);
-
-  // Merge left and right blocks together.
-  void merge(MetadataCache* cache, MemoryBlock* right_buddy);
-
-  // Mark the allocation as free.
-  void mark_as_free(MetadataCache* cache);
-
-  // Change the type of the allocation.
-  void set_type(MetadataCache* cache, Type t);
-
-  void* data() const;
-  MemoryBlock* metadata() const;
-
-  // MemoryBlock::Desc describes a MemoryBlock.
-  struct Desc {
-    Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
-         MemoryBlock* r);
-    Desc();
-
-    // Updates guard_begin and guard_end by hashes of the Metadata object.
-    void update_guards();
-
-    // Checks that guard_begin and guard_end are hashes of the Metadata object.
-    bool check_guards() const;
-
-    // TODO(gangliao): compress this
-    size_t guard_begin = 0;
-    MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
-    size_t index = 0;
-    size_t size = 0;
-    size_t total_size = 0;
-    MemoryBlock* left_buddy = nullptr;
-    MemoryBlock* right_buddy = nullptr;
-    size_t guard_end = 0;
-  };
-};
-
-// A cache for accessing memory block meta-data that may be expensive
-// to access directly.  This class exists to unify the
-// MemoryBlock::Desc format between GPU and CPU allocations. It should
-// be removed when the CPU can access all GPU allocations directly via
-// UVM.
-class MetadataCache {
- public:
-  explicit MetadataCache(bool uses_gpu);
-
-  // Disable copying and assignment.
-  MetadataCache(const MetadataCache&) = delete;
-  MetadataCache& operator=(const MetadataCache&) = delete;
-
-  // Returns the MemoryBlock::Desc for a memory block.  When MetadataCache is
-  // used to manage CPU memory, the MemoryBlock::Desc resides at the beginning
-  // of the memory block; when used to manage GPU memory, the
-  // Meatadata resides in CPU memory indexed by cache_.
-  MemoryBlock::Desc load(const MemoryBlock* memory_block) const;
-
-  // Saves the MemoryBlock::Desc of a memory block into the cache.  For CPU
-  // memory block, writes the MemoryBlock::Desc to the beginning of the memory
-  // block; whereas for GPU memory, writes it to cache_.
-  void save(MemoryBlock* memory_block, const MemoryBlock::Desc& meta_data);
-
-  // For GPU memory block, erases its MemoryBlock::Desc from cache_.
-  void invalidate(MemoryBlock* memory_block);
-
- private:
-  typedef std::unordered_map<const MemoryBlock*, MemoryBlock::Desc> MetadataMap;
-  MetadataMap cache_;
-  bool uses_gpu_;
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/memory_block_desc.cc b/paddle/fluid/memory/detail/memory_block_desc.cc
deleted file mode 100644
index 393dd9209c0aa443cd17c29b2f9de6eafb48bac9..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/memory_block_desc.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <functional>
-
-#include "paddle/fluid/memory/detail/memory_block.h"
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
-                        MemoryBlock* l, MemoryBlock* r)
-    : type(t),
-      index(i),
-      size(s),
-      total_size(ts),
-      left_buddy(l),
-      right_buddy(r) {}
-
-MemoryBlock::Desc::Desc()
-    : type(MemoryBlock::INVALID_CHUNK),
-      index(0),
-      size(0),
-      total_size(0),
-      left_buddy(nullptr),
-      right_buddy(nullptr) {}
-
-namespace {
-
-template <class T>
-inline void hash_combine(std::size_t* seed, const T& v) {
-  std::hash<T> hasher;
-  (*seed) ^= hasher(v) + 0x9e3779b9 + ((*seed) << 6) + ((*seed) >> 2);
-}
-
-inline size_t hash(const MemoryBlock::Desc& metadata, size_t initial_seed) {
-  size_t seed = initial_seed;
-
-  hash_combine(&seed, static_cast<size_t>(metadata.type));
-  hash_combine(&seed, metadata.index);
-  hash_combine(&seed, metadata.size);
-  hash_combine(&seed, metadata.total_size);
-  hash_combine(&seed, metadata.left_buddy);
-  hash_combine(&seed, metadata.right_buddy);
-
-  return seed;
-}
-
-}  // namespace
-
-void MemoryBlock::Desc::update_guards() {
-  guard_begin = hash(*this, 1);
-  guard_end = hash(*this, 2);
-}
-
-bool MemoryBlock::Desc::check_guards() const {
-  return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2);
-}
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc
deleted file mode 100644
index f04b0c800e3d81419b408843e79ddfe74149a36d..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "glog/logging.h"
-#include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
-
-MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
-  if (uses_gpu_) {
-    auto existing_desc = cache_.find(block);
-    PADDLE_ENFORCE_EQ(existing_desc->second.check_guards(), true);
-    return existing_desc->second;
-  } else {
-    auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
-    VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
-    PADDLE_ENFORCE_EQ(desc->check_guards(), true);
-    return *reinterpret_cast<const MemoryBlock::Desc*>(block);
-  }
-}
-
-void MetadataCache::save(MemoryBlock* block,
-                         const MemoryBlock::Desc& original_desc) {
-  auto desc = original_desc;
-  desc.update_guards();
-
-  if (uses_gpu_) {
-    cache_[block] = desc;
-  } else {
-    *reinterpret_cast<MemoryBlock::Desc*>(block) = desc;
-  }
-}
-
-void MetadataCache::invalidate(MemoryBlock* block) {
-  if (uses_gpu_) {
-    cache_.erase(block);
-  }
-}
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
deleted file mode 100644
index 55011179ee8ab824f73aaf79df038660a64a54da..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ /dev/null
@@ -1,223 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-
-#include "paddle/fluid/memory/detail/system_allocator.h"
-
-#ifdef _WIN32
-#include <malloc.h>
-#include <windows.h>  // VirtualLock/VirtualUnlock
-#else
-#include <sys/mman.h>  // for mlock and munlock
-#endif
-#include <stdlib.h>   // for malloc and free
-#include <algorithm>  // for std::max
-#include <string>
-#include <utility>
-
-#include "gflags/gflags.h"
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#endif
-
-DECLARE_bool(use_pinned_memory);
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-void* AlignedMalloc(size_t size) {
-  void* p = nullptr;
-  size_t alignment = 32ul;
-#ifdef PADDLE_WITH_MKLDNN
-  // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
-  // memory alignment
-  alignment = 4096ul;
-#endif
-#ifdef _WIN32
-  p = _aligned_malloc(size, alignment);
-#else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!",
-                    size);
-#endif
-  PADDLE_ENFORCE_NOT_NULL(p, "Fail to allocate CPU memory: size = %d .", size);
-  return p;
-}
-
-void* CPUAllocator::Alloc(size_t* index, size_t size) {
-  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
-  // malloc might not return nullptr if size is zero, but the returned
-  // pointer shall not be dereferenced -- so we make it nullptr.
-  if (size <= 0) return nullptr;
-
-  *index = 0;  // unlock memory
-
-  void* p = AlignedMalloc(size);
-
-  if (p != nullptr) {
-    if (FLAGS_use_pinned_memory) {
-      *index = 1;
-#ifdef _WIN32
-      VirtualLock(p, size);
-#else
-      mlock(p, size);  // lock memory
-#endif
-    }
-  }
-
-  return p;
-}
-
-void CPUAllocator::Free(void* p, size_t size, size_t index) {
-  if (p != nullptr && index == 1) {
-#ifdef _WIN32
-    VirtualUnlock(p, size);
-#else
-    munlock(p, size);
-#endif
-  }
-#ifdef _WIN32
-  _aligned_free(p);
-#else
-  free(p);
-#endif
-}
-
-bool CPUAllocator::UseGpu() const { return false; }
-
-#ifdef PADDLE_WITH_CUDA
-
-void* GPUAllocator::Alloc(size_t* index, size_t size) {
-  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
-  // if size is 0.  We just make sure it does.
-  if (size <= 0) return nullptr;
-
-  paddle::platform::CUDADeviceGuard guard(gpu_id_);
-
-  void* p;
-  cudaError_t result = cudaMalloc(&p, size);
-
-  if (result == cudaSuccess) {
-    *index = 0;
-    gpu_alloc_size_ += size;
-    return p;
-  } else {
-    PADDLE_ENFORCE_NE(cudaGetLastError(), cudaSuccess);
-
-    size_t avail, total;
-    platform::GpuMemoryUsage(&avail, &total);
-
-    PADDLE_THROW_BAD_ALLOC(
-        "\n\nOut of memory error on GPU %d. "
-        "Cannot allocate %s memory on GPU %d, "
-        "available memory is only %s.\n\n"
-        "Please check whether there is any other process using GPU %d.\n"
-        "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
-        "2. If no, please try one of the following suggestions:\n"
-        "   1) Decrease the batch size of your model.\n"
-        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
-        "please set it to a higher value but less than 1.0.\n"
-        "      The command is "
-        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.\n\n",
-        gpu_id_, string::HumanReadableSize(size), gpu_id_,
-        string::HumanReadableSize(avail), gpu_id_,
-        FLAGS_fraction_of_gpu_memory_to_use);
-  }
-}
-
-void GPUAllocator::Free(void* p, size_t size, size_t index) {
-  cudaError_t err;
-  PADDLE_ENFORCE_EQ(index, 0);
-  PADDLE_ENFORCE_GE(gpu_alloc_size_, size);
-  gpu_alloc_size_ -= size;
-  err = cudaFree(p);
-
-  // Purposefully allow cudaErrorCudartUnloading, because
-  // that is returned if you ever call cudaFree after the
-  // driver has already shutdown. This happens only if the
-  // process is terminating, in which case we don't care if
-  // cudaFree succeeds.
-  if (err != cudaErrorCudartUnloading) {
-    PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free.");
-  }
-}
-
-bool GPUAllocator::UseGpu() const { return true; }
-
-// PINNED memory allows direct DMA transfers by the GPU to and from system
-// memory. It’s locked to a physical address.
-void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
-  if (size <= 0) return nullptr;
-
-  // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
-  // of host pinned allocation. Allocates too much would reduce
-  // the amount of memory available to the underlying system for paging.
-  size_t usable =
-      paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
-
-  if (size > usable) {
-    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
-                 << " MB pinned memory."
-                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
-    return nullptr;
-  }
-
-  void* p;
-  // PINNED memory is visible to all CUDA contexts.
-  cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable);
-
-  if (result == cudaSuccess) {
-    *index = 1;  // PINNED memory
-    cuda_pinnd_alloc_size_ += size;
-    return p;
-  } else {
-    LOG(WARNING) << "cudaHostAlloc failed.";
-    return nullptr;
-  }
-
-  return nullptr;
-}
-
-void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
-  cudaError_t err;
-  PADDLE_ENFORCE_EQ(index, 1);
-
-  PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size);
-  cuda_pinnd_alloc_size_ -= size;
-  err = cudaFreeHost(p);
-
-  // Purposefully allow cudaErrorCudartUnloading, because
-  // that is returned if you ever call cudaFreeHost after the
-  // driver has already shutdown. This happens only if the
-  // process is terminating, in which case we don't care if
-  // cudaFreeHost succeeds.
-  if (err != cudaErrorCudartUnloading) {
-    PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free.");
-  }
-}
-
-bool CUDAPinnedAllocator::UseGpu() const { return false; }
-
-#endif
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
deleted file mode 100644
index 42f0f23ec1d5d48276285dcef547a4d51054538b..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>  // for size_t
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-/**
- * \brief SystemAllocator is the parent class of CPUAllocator,
- *        CUDAPinnedAllocator and GPUAllocator. A BuddyAllocator
- *        object uses a SystemAllocator* pointing to the
- *        underlying system allocator.
- */
-class SystemAllocator {
- public:
-  virtual ~SystemAllocator() {}
-  virtual void* Alloc(size_t* index, size_t size) = 0;
-  virtual void Free(void* p, size_t size, size_t index) = 0;
-  virtual bool UseGpu() const = 0;
-};
-
-class CPUAllocator : public SystemAllocator {
- public:
-  virtual void* Alloc(size_t* index, size_t size);
-  virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-};
-
-#ifdef PADDLE_WITH_CUDA
-class GPUAllocator : public SystemAllocator {
- public:
-  explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
-
-  virtual void* Alloc(size_t* index, size_t size);
-  virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-
- private:
-  size_t gpu_alloc_size_ = 0;
-  int gpu_id_;
-};
-
-class CUDAPinnedAllocator : public SystemAllocator {
- public:
-  virtual void* Alloc(size_t* index, size_t size);
-  virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-
- private:
-  size_t cuda_pinnd_alloc_size_ = 0;
-};
-#endif
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
deleted file mode 100644
index 34bb40d549dff1d711cd8924630cdf528f41ed76..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/detail/system_allocator.h"
-
-#include <memory>
-#include <vector>
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-DECLARE_bool(use_pinned_memory);
-
-void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
-  bool freed = false;
-  {
-    size_t index;
-    void* p = a->Alloc(&index, size);
-    if (size > 0) {
-      EXPECT_NE(p, nullptr);
-    } else {
-      EXPECT_EQ(p, nullptr);
-    }
-
-    int* i = static_cast<int*>(p);
-    std::shared_ptr<int> ptr(i, [&](void* p) {
-      freed = true;
-      a->Free(p, size, index);
-    });
-  }
-  EXPECT_TRUE(freed);
-}
-
-TEST(CPUAllocator, NoLockMem) {
-  FLAGS_use_pinned_memory = false;
-  paddle::memory::detail::CPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
-}
-
-TEST(CPUAllocator, LockMem) {
-  FLAGS_use_pinned_memory = true;
-  paddle::memory::detail::CPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(GPUAllocator, Alloc) {
-  paddle::memory::detail::GPUAllocator a(0);
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
-}
-
-TEST(CUDAPinnedAllocator, Alloc) {
-  paddle::memory::detail::CUDAPinnedAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
-}
-
-TEST(GPUAllocator, AllocFailure) {
-  paddle::memory::detail::GPUAllocator allocator(0);
-  size_t index;
-  size_t alloc_size = (static_cast<size_t>(1) << 40);  // Very large number
-  try {
-    allocator.Alloc(&index, alloc_size);
-    ASSERT_TRUE(false);
-  } catch (paddle::memory::allocation::BadAlloc&) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
-  }
-}
-#endif
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
deleted file mode 100644
index e01f030585a8330a2e9bcc2bc2a662f00f5cde1c..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/malloc.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/malloc.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace memory {
-
-std::shared_ptr<Allocation> AllocShared(const platform::Place &place,
-                                        size_t size) {
-  return allocation::AllocatorFacade::Instance().AllocShared(place, size);
-}
-
-AllocationPtr Alloc(const platform::Place &place, size_t size) {
-  return allocation::AllocatorFacade::Instance().Alloc(place, size);
-}
-
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
deleted file mode 100644
index 9ba572acaca9eba2b913847c52e5a54e19d79bdf..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/malloc.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/place.h"
-namespace paddle {
-
-namespace platform {
-class DeviceContext;
-}  // platform
-
-namespace memory {
-
-using allocation::Allocation;
-using allocation::Allocator;
-using allocation::AllocationPtr;
-
-extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
-                                               size_t size);
-
-extern AllocationPtr Alloc(const platform::Place& place, size_t size);
-
-extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size);
-
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu
deleted file mode 100644
index 89853e159bde378ff1084ff656718c5f4316f051..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/malloc_test.cu
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace memory {
-
-const int NUM_STREAMS = 8;
-const int N = 2;
-const float DELTA = 1e-1;
-
-using CudaDevCtxVec = std::vector<std::unique_ptr<platform::CUDADeviceContext>>;
-
-__global__ void kernel(float *x, int n) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
-    x[i] = 3.14159 * i;
-  }
-}
-
-void CheckKernelOutput(float *x, int n) {
-  auto host_x = std::unique_ptr<float[]>(new float[n]);
-  for (int i = 0; i < n; ++i) {
-    EXPECT_TRUE(cudaSuccess == cudaMemcpy(host_x.get(), x, n * sizeof(float),
-                                          cudaMemcpyDeviceToHost));
-    EXPECT_GE(host_x[i] + DELTA, 3.14159f * i);
-    EXPECT_LE(host_x[i] - DELTA, 3.14159f * i);
-  }
-}
-
-void MultiStreamCompute(float **data, float **second_data,
-                        const platform::CUDADeviceContext &ctx) {
-  // multi-streams
-  AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float));
-  EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
-  *data = reinterpret_cast<float *>(allocation_ptr->ptr());
-  kernel<<<1, 64, 0, ctx.stream()>>>(*data, N);
-
-  // allocate and compute on same stream again
-  allocation_ptr = Alloc(ctx, N * sizeof(float));
-  EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
-  *second_data = reinterpret_cast<float *>(allocation_ptr->ptr());
-  kernel<<<1, 64, 0, ctx.stream()>>>(*second_data, N);
-}
-
-TEST(Malloc, CUDADeviceContextMultiStream) {
-  auto place = platform::CUDAPlace(0);
-  EXPECT_TRUE(cudaSuccess == cudaSetDevice(0));
-
-  AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
-  EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
-  float *main_stream_data =
-      reinterpret_cast<float *>(main_stream_alloc_ptr->ptr());
-
-  float *data[NUM_STREAMS];
-  float *second_data[NUM_STREAMS];
-  CudaDevCtxVec dev_ctx;
-
-  // default stream
-  kernel<<<1, 64>>>(main_stream_data, N);
-  main_stream_alloc_ptr.reset();
-
-  for (int i = 0; i < NUM_STREAMS; ++i) {
-    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
-        new platform::CUDADeviceContext(place)));
-    MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
-  }
-
-  EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize());
-  for (int i = 0; i < NUM_STREAMS; ++i) {
-    CheckKernelOutput(data[i], N);
-    CheckKernelOutput(second_data[i], N);
-  }
-}
-
-TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
-  auto place = platform::CUDAPlace(0);
-  EXPECT_TRUE(cudaSuccess == cudaSetDevice(0));
-
-  AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
-  EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
-  float *main_stream_data =
-      reinterpret_cast<float *>(main_stream_alloc_ptr->ptr());
-
-  float *data[NUM_STREAMS];
-  float *second_data[NUM_STREAMS];
-  CudaDevCtxVec dev_ctx;
-  std::vector<std::thread> threads;
-
-  // default stream
-  kernel<<<1, 64>>>(main_stream_data, N);
-  main_stream_alloc_ptr.reset();
-
-  for (int i = 0; i < NUM_STREAMS; ++i) {
-    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
-        new platform::CUDADeviceContext(place)));
-    threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
-                                  std::cref(*dev_ctx[i])));
-  }
-
-  for (int i = 0; i < NUM_STREAMS; ++i) {
-    threads[i].join();
-  }
-
-  EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize());
-  for (int i = 0; i < NUM_STREAMS; ++i) {
-    CheckKernelOutput(data[i], N);
-    CheckKernelOutput(second_data[i], N);
-  }
-}
-
-TEST(Malloc, AllocZero) {
-  auto place = platform::CUDAPlace(0);
-  AllocationPtr allocation_ptr = Alloc(place, 0);
-  EXPECT_GE(allocation_ptr->size(), 0);
-}
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
deleted file mode 100644
index c08d86eb213310b4e8dbac541c254867bb44b903..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/memcpy.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/memcpy.h"
-
-#include <cstring>  // for memcpy
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace memory {
-
-template <>
-void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
-                                                  platform::CPUPlace,
-                                                  const void* src, size_t num) {
-  if (UNLIKELY(num == 0)) return;
-  std::memcpy(dst, src, num);
-}
-
-#ifdef PADDLE_WITH_CUDA
-static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
-
-// NOTE(zcd): Do not use GpuMemcpySync as much as possible.
-// because GpuMemcpySync issues the copying command to the default stream,
-// which will make two commands from different streams cannot run concurrently.
-// Reference:
-// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
-
-template <>
-void Copy<platform::CPUPlace, platform::CUDAPlace>(
-    platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
-    const void* src, size_t num, cudaStream_t stream) {
-  if (UNLIKELY(num == 0)) return;
-  platform::SetDeviceId(src_place.device);
-
-  if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
-    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
-  } else {
-    platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
-    platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
-    // FIXME(zjl): do we really need it?
-    if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
-    }
-  }
-}
-
-template <>
-void Copy<platform::CUDAPlace, platform::CPUPlace>(
-    platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
-    const void* src, size_t num, cudaStream_t stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetDeviceId(dst_place.device);
-  if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
-    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
-  } else {
-    platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
-    platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
-    // FIXME(zjl): do we really need it?
-    if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
-    }
-  }
-}
-
-template <>
-void Copy<platform::CUDAPlace, platform::CUDAPlace>(
-    platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
-    const void* src, size_t num, cudaStream_t stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  if (dst_place == src_place) {
-    platform::SetDeviceId(src_place.device);
-    if (stream) {
-      platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
-      platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
-    } else {
-      platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
-      platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
-    }
-  } else {
-    if (stream) {
-      platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
-      platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
-                                   num, stream);
-    } else {
-      platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
-      platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
-                                  num);
-    }
-  }
-}
-
-template <>
-void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
-    platform::CPUPlace dst_place, void* dst,
-    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
-  if (UNLIKELY(num == 0)) return;
-  std::memcpy(dst, src, num);
-}
-
-template <>
-void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
-    platform::CUDAPinnedPlace dst_place, void* dst,
-    platform::CPUPlace src_place, const void* src, size_t num) {
-  if (UNLIKELY(num == 0)) return;
-  std::memcpy(dst, src, num);
-}
-
-template <>
-void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
-    platform::CUDAPinnedPlace dst_place, void* dst,
-    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
-  if (UNLIKELY(num == 0)) return;
-  std::memcpy(dst, src, num);
-}
-
-template <>
-void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
-    platform::CUDAPinnedPlace dst_place, void* dst,
-    platform::CUDAPlace src_place, const void* src, size_t num,
-    cudaStream_t stream) {
-  if (UNLIKELY(num == 0)) return;
-  platform::SetDeviceId(src_place.device);
-  if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
-    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
-  } else {
-    platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
-    platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
-  }
-}
-
-template <>
-void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
-    platform::CUDAPlace dst_place, void* dst,
-    platform::CUDAPinnedPlace src_place, const void* src, size_t num,
-    cudaStream_t stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetDeviceId(dst_place.device);
-  if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
-    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
-  } else {
-    platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
-    platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
-  }
-}
-
-#endif
-
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h
deleted file mode 100644
index 7b2b8eb0662fb1a6f690b7c10f84b35c717ba6ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/memcpy.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace memory {
-
-/**
- * \brief   Copy memory from one place to another place.
- *
- * \param[in]  DstPlace Destination allocation place (CPU).
- * \param[in]  dst      Destination memory address.
- * \param[in]  SrcPlace Source allocation place (CPU).
- * \param[in]  src      Source memory address.
- * \param[in]  num      memory size in bytes to copy.
- *
- */
-template <typename DstPlace, typename SrcPlace>
-void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
-
-#ifdef PADDLE_WITH_CUDA
-
-/**
- * \brief   Copy memory from one place to another place.
- *
- * \param[in]  DstPlace Destination allocation place (CPU or GPU).
- * \param[in]  dst      Destination memory address.
- * \param[in]  SrcPlace Source allocation place (CPU or GPU).
- * \param[in]  src      Source memory address.
- * \param[in]  num      memory size in bytes to copy.
- * \param[in]  stream   CUDA stream.
- *
- * \note    For GPU memory copy, CUDA stream need to be specified
- *          for asynchronously memory copy.
- *
- */
-template <typename DstPlace, typename SrcPlace>
-void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
-          cudaStream_t stream);
-
-#endif
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h
deleted file mode 100644
index 8d904e3be56abf0974ba7379f7ca1b676fcb0409..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/memory.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/memory/memcpy.h"
diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu
deleted file mode 100644
index 0d898f59ee1b8c783c5357aa7e27581a993a6d30..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/pinned_memory_test.cu
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <gtest/gtest.h>
-#include <unordered_map>
-
-#include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/memory/memory.h"
-
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-
-// This unit test is an example comparing the performance between using pinned
-// memory and not. In general, using pinned memory will be faster.
-template <typename T>
-__global__ void Kernel(T* output, int dim) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < dim) {
-    output[tid] = output[tid] * output[tid] / 100;
-  }
-}
-
-template <typename Place>
-float test_pinned_memory() {
-  Place cpu_place;
-  paddle::platform::CUDAPlace cuda_place;
-
-  const int data_size = 4096;
-  const int iteration = 10;
-
-  // create event start and end
-  cudaEvent_t start_e, stop_e, copying_e;
-  float elapsedTime = 0;
-  cudaEventCreate(&start_e);
-  cudaEventCreate(&stop_e);
-  cudaEventCreate(&copying_e);
-
-  // create computation stream, data copying stream
-  cudaStream_t computation_stream, copying_stream;
-  cudaStreamCreate(&computation_stream);
-  cudaStreamCreate(&copying_stream);
-
-  // create record event, pinned memory, gpu memory
-  std::vector<cudaEvent_t> record_event(iteration);
-  std::vector<float*> input_pinned_mem(iteration);
-  std::vector<float*> gpu_mem(iteration);
-  std::vector<float*> output_pinned_mem(iteration);
-
-  // initial data
-  for (int j = 0; j < iteration; ++j) {
-    cudaEventCreateWithFlags(&record_event[j], cudaEventDisableTiming);
-    cudaEventCreate(&(record_event[j]));
-    input_pinned_mem[j] = static_cast<float*>(
-        paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
-    output_pinned_mem[j] = static_cast<float*>(
-        paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
-    gpu_mem[j] = static_cast<float*>(
-        paddle::memory::Alloc(cuda_place, data_size * sizeof(float)));
-
-    for (int k = 0; k < data_size; ++k) {
-      input_pinned_mem[j][k] = k;
-    }
-  }
-
-  cudaEventRecord(start_e, computation_stream);
-
-  // computation
-  for (int m = 0; m < 30; ++m) {
-    for (int i = 0; i < iteration; ++i) {
-      // cpu -> GPU on computation stream.
-      // note: this operation is async for pinned memory.
-      paddle::memory::Copy(cuda_place, gpu_mem[i], cpu_place,
-                           input_pinned_mem[i], data_size * sizeof(float),
-                           computation_stream);
-
-      // call kernel on computation stream.
-      Kernel<<<4, 1024, 0, computation_stream>>>(gpu_mem[i], data_size);
-
-      // record event_computation on computation stream
-      cudaEventRecord(record_event[i], computation_stream);
-
-      // wait event_computation on copy stream.
-      // note: this operation is async.
-      cudaStreamWaitEvent(copying_stream, record_event[i], 0);
-
-      // copy data GPU->CPU, on copy stream.
-      // note: this operation is async for pinned memory.
-      paddle::memory::Copy(cpu_place, output_pinned_mem[i], cuda_place,
-                           gpu_mem[i], data_size * sizeof(float),
-                           copying_stream);
-    }
-  }
-
-  cudaEventRecord(copying_e, copying_stream);
-  cudaStreamWaitEvent(computation_stream, copying_e, 0);
-
-  cudaEventRecord(stop_e, computation_stream);
-
-  cudaEventSynchronize(start_e);
-  cudaEventSynchronize(stop_e);
-  cudaEventElapsedTime(&elapsedTime, start_e, stop_e);
-
-  // std::cout << cpu_place << " "
-  //          << "time consume:" << elapsedTime / 30 << std::endl;
-
-  for (int l = 0; l < iteration; ++l) {
-    for (int k = 0; k < data_size; ++k) {
-      float temp = input_pinned_mem[l][k];
-      temp = temp * temp / 100;
-      EXPECT_FLOAT_EQ(temp, output_pinned_mem[l][k]);
-    }
-  }
-
-  // destroy resource
-  cudaEventDestroy(copying_e);
-  cudaEventDestroy(start_e);
-  cudaEventDestroy(stop_e);
-  for (int j = 0; j < 10; ++j) {
-    cudaEventDestroy((record_event[j]));
-    paddle::memory::Free(cpu_place, input_pinned_mem[j]);
-    paddle::memory::Free(cpu_place, output_pinned_mem[j]);
-    paddle::memory::Free(cuda_place, gpu_mem[j]);
-  }
-  return elapsedTime / 30;
-}
-
-TEST(CPUANDCUDAPinned, CPUAllocatorAndCUDAPinnedAllocator) {
-  // Generally speaking, operation on pinned_memory is faster than that on
-  // unpinned-memory, but if this unit test fails frequently, please close this
-  // test for the time being.
-  float time1 = test_pinned_memory<paddle::platform::CPUPlace>();
-  float time2 = test_pinned_memory<paddle::platform::CUDAPinnedPlace>();
-  EXPECT_GT(time1, time2);
-}
diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec
deleted file mode 100644
index 389a174cc4a831a51814004a5984254deade380d..0000000000000000000000000000000000000000
--- a/paddle/fluid/op_use_default_grad_op_maker.spec
+++ /dev/null
@@ -1,32 +0,0 @@
-conv_shift
-cos_sim
-fc
-flatten
-fsp
-fused_embedding_seq_pool
-gru
-lrn
-lstm_unit
-match_matrix_tensor
-max_pool2d_with_index
-max_pool3d_with_index
-maxout
-modified_huber_loss
-nce
-pool2d
-pool3d
-prelu
-rank_loss
-reduce_max
-reduce_min
-reduce_prod
-reshape
-rnn_memory_helper
-sequence_softmax
-spp
-squeeze
-tensor_array_to_tensor
-transpose
-unpool
-unsqueeze
-var_conv_2d
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
deleted file mode 100644
index f99cbc8762aab5ae420c48624b204cbff438a15a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/CMakeLists.txt
+++ /dev/null
@@ -1,128 +0,0 @@
-include(operators)
-
-# clean cache and pybind_file content first when rebuild
-unset(GLOB_OP_LIB CACHE)
-unset(OP_LIBRARY CACHE)
-set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h CACHE INTERNAL "pybind.h file")
-file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
-
-add_subdirectory(math)
-add_subdirectory(controlflow)
-add_subdirectory(detection)
-add_subdirectory(elementwise)
-add_subdirectory(fused)
-add_subdirectory(metrics)
-add_subdirectory(ngraph)
-add_subdirectory(optimizers)
-add_subdirectory(reduce_ops)
-add_subdirectory(sequence_ops)
-add_subdirectory(jit)
-
-if(WITH_DISTRIBUTE)
-    add_subdirectory(distributed)
-    add_subdirectory(distributed_ops)
-    add_subdirectory(collective)
-endif()
-
-add_subdirectory(reader)
-
-if (NOT WIN32)
-    add_subdirectory(nccl)
-endif()
-
-if (WITH_GPU AND TENSORRT_FOUND)
-    add_subdirectory(tensorrt)
-endif()
-
-if (ANAKIN_SUBGRAPH) 
-    add_subdirectory(anakin)
-endif()
-
-SET(OP_HEADER_DEPS xxhash)
-if (WITH_GPU)
-    SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
-endif()
-
-SET(OP_PREFETCH_DEPS "")
-if (WITH_DISTRIBUTE)
-    SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
-endif()
-
-SET(OP_ONLY_MKL "")
-if (NOT WITH_MKL)
-    SET(OP_ONLY_MKL ${OP_ONLY_MKL} match_matrix_tensor_op)
-    SET(OP_ONLY_MKL ${OP_ONLY_MKL} var_conv_2d_op)
-endif()
-
-register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op
-	sync_batch_norm_op ${OP_ONLY_MKL} DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
-
-if (WITH_GPU)
-    # warpctc_op needs cudnn 7 above
-    if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-        op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
-    else()
-        op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
-    endif()
-    # conv_fusion_op needs cudnn 7 above
-    if (NOT ${CUDNN_VERSION} VERSION_LESS 7100)
-        op_library(conv_fusion_op)
-        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
-    endif()
-    if (NOT WIN32)
-        op_library(sync_batch_norm_op)
-        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
-    endif()
-else()
-    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
-endif()
-
-set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
-
-if (WITH_DGC)
-    op_library(dgc_op DEPS dgc)
-    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(dgc);\n")
-    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc)
-endif()
-
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper)
-if (WITH_GPU)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
-endif()
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
-
-# FIXME(typhoonzero): operator deps may not needed.
-# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
-# op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
-# op_library(unsqueeze_op DEPS reshape_op)
-# op_library(squeeze_op DEPS reshape_op)
-# op_library(flatten_op DEPS reshape_op)
-# op_library(unstack_op DEPS stack_op)
-# op_library(tensor_array_to_tensor_op DEPS concat_op)
-
-set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS})
-set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
-
-cc_test(gather_test SRCS gather_test.cc DEPS tensor)
-cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
-cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
-cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
-cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
-cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
-nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
-if (WITH_GPU)
-    nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3)
-else()
-    cc_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc DEPS tensor device_context eigen3)
-endif()
-
-if (WITH_PYTHON)
-  cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind)
-endif()
-
-set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
-add_subdirectory(benchmark)
diff --git a/paddle/fluid/operators/activation_cudnn.cu.cc b/paddle/fluid/operators/activation_cudnn.cu.cc
deleted file mode 100644
index 494c02374a9faa22486644c9b9c7d586c86d41b0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/activation_cudnn.cu.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/platform/cudnn_desc.h"
-
-namespace paddle {
-namespace operators {
-using framework::Tensor;
-using platform::ActivationDescriptor;
-using platform::TensorDescriptor;
-
-template <typename Functor>
-class CudnnActivationKernel
-    : public framework::OpKernel<Functor::ElEWISE_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    framework::Tensor *X, *Out;
-    ExtractActivationTensor(context, X, Out);
-    ActivationDescriptor act_desc;
-    TensorDescriptor x_desc, out_desc;
-    x_desc.set(detail::Ref(X));
-    out_desc.set(detail::Ref(Out));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc
deleted file mode 100644
index f03355eb441f99b54d78fe90bcb3bea116db58f1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/activation_cudnn_op.cu.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/platform/cudnn_desc.h"
-
-namespace paddle {
-namespace operators {
-using framework::Tensor;
-using platform::ActivationDescriptor;
-using platform::TensorDescriptor;
-using platform::CUDADeviceContext;
-
-template <typename T>
-struct CudnnActivationFunctor {
-  using ELEMENT_TYPE = T;
-  CudnnActivationFunctor(const CUDADeviceContext& ctx, const T& c,
-                         const cudnnActivationMode_t& m)
-      : ctx_(ctx), coef_(c), mode_(m) {}
-  void operator()(const Tensor& x, Tensor* out) {
-    ActivationDescriptor act_desc;
-    act_desc.set(mode_, coef_);
-    TensorDescriptor x_desc, out_desc;
-    x_desc.set(x);
-    out_desc.set(detail::Ref(out));
-    PADDLE_ENFORCE(platform::dynload::cudnnActivationForward(
-        ctx_.cudnn_handle(), act_desc.desc(),
-        platform::CudnnDataType<T>::kOne(), x_desc.desc(), x.data<T>(),
-        platform::CudnnDataType<T>::kZero(), out_desc.desc(),
-        out->mutable_data<T>(ctx_.GetPlace())));
-  }
-  const CUDADeviceContext& ctx_;
-  const T coef_;
-  const cudnnActivationMode_t mode_;
-};
-
-template <typename T>
-struct CudnnActivationGradFunctor {
-  using ELEMENT_TYPE = T;
-  CudnnActivationGradFunctor(const CUDADeviceContext& ctx, const T& c,
-                             const cudnnActivationMode_t& m)
-      : ctx_(ctx), coef_(c), mode_(m) {}
-  void operator()(const Tensor& x, const Tensor& out, const Tensor dout,
-                  Tensor* dx) {
-    ActivationDescriptor act_desc;
-    act_desc.set(mode_, coef_);
-    TensorDescriptor x_desc, out_desc, dout_desc, dx_desc;
-    x_desc.set(x);
-    out_desc.set(out);
-    dout_desc.set(dout);
-    dx_desc.set(detail::Ref(dx));
-    PADDLE_ENFORCE(platform::dynload::cudnnActivationBackward(
-        ctx_.cudnn_handle(), act_desc.desc(),
-        platform::CudnnDataType<T>::kOne(), out_desc.desc(), out.data<T>(),
-        dout_desc.desc(), dout.data<T>(), x_desc.desc(), x.data<T>(),
-        platform::CudnnDataType<T>::kZero(), dx_desc.desc(),
-        dx->mutable_data<T>(ctx_.GetPlace())));
-  }
-  const CUDADeviceContext& ctx_;
-  const T coef_;
-  const cudnnActivationMode_t mode_;
-};
-
-template <typename T>
-struct CudnnReluFunctor : public CudnnActivationFunctor<T> {
-  explicit CudnnReluFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_RELU) {}
-};
-template <typename T>
-struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
-  explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_RELU) {}
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct CudnnRelu6Functor : public CudnnActivationFunctor<T> {
-  explicit CudnnRelu6Functor(const CUDADeviceContext& ctx)
-      : CudnnActivationFunctor<T>(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {}
-};
-template <typename T>
-struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
-  explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationGradFunctor<T>(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct CudnnSigmoidFunctor : public CudnnActivationFunctor<T> {
-  explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {}
-};
-template <typename T>
-struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
-  explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {}
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct CudnnTanhFunctor : public CudnnActivationFunctor<T> {
-  explicit CudnnTanhFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_TANH) {}
-};
-template <typename T>
-struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
-  explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_TANH) {}
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename Functor>
-class CudnnActivationKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* X = nullptr;
-    framework::Tensor* Out = nullptr;
-    ExtractActivationTensor(context, &X, &Out);
-    Out->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<CUDADeviceContext>();
-    Functor functor(dev_ctx);
-    functor(detail::Ref(X), Out);
-  }
-};
-
-template <typename Functor>
-class CudnnActivationGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& context) const override {
-    static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out.");
-
-    const framework::Tensor *X, *Out, *dOut;
-    X = Out = dOut = nullptr;
-    framework::Tensor* dX = nullptr;
-    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &X, &Out, &dOut,
-                                                    &dX);
-    dX->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<CUDADeviceContext>();
-    Functor functor(dev_ctx);
-    functor(detail::Ref(X), detail::Ref(Out), detail::Ref(dOut), dX);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace plat = paddle::platform;
-namespace ops = paddle::operators;
-
-#define FOR_EACH_CUDNN_OP_FUNCTOR(__macro)                  \
-  __macro(relu, CudnnReluFunctor, CudnnReluGradFunctor);    \
-  __macro(relu6, CudnnRelu6Functor, CudnnRelu6GradFunctor); \
-  __macro(sigmoid, CudnnTanhFunctor, CudnnTanhGradFunctor); \
-  __macro(tanh, CudnnTanhFunctor, CudnnTanhGradFunctor)
-
-#define REGISTER_ACTIVATION_CUDNN_KERNEL(act_type, functor, grad_functor) \
-  REGISTER_OP_KERNEL(act_type, CUDNN, plat::CUDAPlace,                    \
-                     ops::CudnnActivationKernel<ops::functor<float>>,     \
-                     ops::CudnnActivationKernel<ops::functor<double>>);   \
-  REGISTER_OP_KERNEL(                                                     \
-      act_type##_grad, CUDNN, plat::CUDAPlace,                            \
-      ops::CudnnActivationGradKernel<ops::grad_functor<float>>,           \
-      ops::CudnnActivationGradKernel<ops::grad_functor<double>>);
-
-FOR_EACH_CUDNN_OP_FUNCTOR(REGISTER_ACTIVATION_CUDNN_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
deleted file mode 100644
index f6848a800fbb34eeb0fab3ffb72b36549260255b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/activation_op.cc
+++ /dev/null
@@ -1,1009 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/activation_op.h"
-#include <memory>
-#include <string>
-#include <type_traits>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
-#include "paddle/fluid/platform/port.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
-
-DECLARE_bool(use_mkldnn);
-
-namespace paddle {
-namespace operators {
-
-using paddle::framework::Tensor;
-
-template <typename GradFunctor>
-static constexpr bool CanInplaceAct() {
-  return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps;
-}
-
-#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                    \
-  class OP_NAME##OpMaker                                                     \
-      : public ::paddle::framework::OpProtoAndCheckerMaker {                 \
-   public:                                                                   \
-    void Make() override {                                                   \
-      AddInput("X", "Input of " #OP_NAME " operator");                       \
-      AddOutput("Out", "Output of " #OP_NAME " operator");                   \
-      AddAttr<bool>("use_mkldnn",                                            \
-                    "(bool, default false) Only used in mkldnn kernel")      \
-          .SetDefault(false);                                                \
-      AddAttr<bool>("use_cudnn",                                             \
-                    "(bool, default false) Only used in cudnn kernel, need " \
-                    "install cudnn")                                         \
-          .SetDefault(false);                                                \
-      AddAttr<bool>(                                                         \
-          "is_test",                                                         \
-          "(bool, default false) Set to true for inference only, false "     \
-          "for training. Some layers may run faster when this is true.")     \
-          .SetDefault(false);                                                \
-      AddComment(OP_COMMENT);                                                \
-    }                                                                        \
-  }
-
-template <ActBwdOpFwdDeps kDepValue>
-class ActivationGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType(ForwardOpType() + "_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-
-    if ((static_cast<int>(kDepValue) &
-         static_cast<int>(ActBwdOpFwdDeps::kDepX)) ||
-        FLAGS_use_mkldnn || (op->HasAttr("use_mkldnn") &&
-                             boost::get<bool>(op->GetAttr("use_mkldnn")))) {
-      op->SetInput("X", Input("X"));
-    }
-
-    if (static_cast<int>(kDepValue) &
-        static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
-      op->SetInput("Out", Output("Out"));
-    }
-
-    return op;
-  }
-};
-
-framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
-                                      const framework::OperatorWithKernel& oper,
-                                      const std::string& name) {
-  framework::LibraryType library{framework::LibraryType::kPlain};
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-// FIXME(liuwei1031) temporarily disable the code to unblock users
-// TODO(liuwei1031) figure out the reason behind
-// https://github.com/PaddlePaddle/Paddle/issues/16096
-// and re-enable this in the future
-// #ifdef PADDLE_WITH_CUDA
-//   auto it1 = oper.Attrs().find("use_cudnn");
-//   if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) {
-//     library = framework::LibraryType::kCUDNN;
-//   }
-// #endif
-#ifdef PADDLE_WITH_MKLDNN
-  auto it = oper.Attrs().find("use_mkldnn");
-  if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kMKLDNN;
-    layout = framework::DataLayout::kMKLDNN;
-  }
-#endif
-  return framework::OpKernelType(
-      framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout,
-      library);
-}
-
-class ActivationOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this, "X");
-  }
-};
-
-class ActivationOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
-  }
-};
-
-class ActivationOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto out_grad_name = framework::GradVarName("Out");
-    ctx->ShareDim(out_grad_name, framework::GradVarName("X"));
-    ctx->ShareLoD(out_grad_name, framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this, framework::GradVarName("Out"));
-  }
-};
-
-UNUSED constexpr char SigmoidDoc[] = R"DOC(
-Sigmoid Activation Operator
-
-$$out = \\frac{1}{1 + e^{-x}}$$
-
-)DOC";
-
-UNUSED constexpr char LogSigmoidDoc[] = R"DOC(
-Logsigmoid Activation Operator
-
-$$out = \\log \\frac{1}{1 + e^{-x}}$$
-
-)DOC";
-
-UNUSED constexpr char ExpDoc[] = R"DOC(
-Exp Activation Operator.
-
-$out = e^x$
-
-)DOC";
-
-UNUSED constexpr char ReluDoc[] = R"DOC(
-Relu Activation Operator.
-
-$out = \max(x, 0)$
-
-)DOC";
-
-UNUSED constexpr char GeluDoc[] = R"DOC(
-Gelu Activation Operator.
-
-$out = \\frac{1 + erf(\\frac{x}{\\sqrt{2}})}{2} x$
-
-)DOC";
-
-UNUSED constexpr char TanhDoc[] = R"DOC(
-Tanh Activation Operator.
-
-$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
-
-)DOC";
-
-UNUSED constexpr char TanhShrinkDoc[] = R"DOC(
-TanhShrink Activation Operator.
-
-$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
-
-)DOC";
-
-UNUSED constexpr char SqrtDoc[] = R"DOC(
-Sqrt Activation Operator.
-
-Please make sure legal input, when input a negative value closed to zero,
-you should add a small epsilon(1e-12) to avoid negative number caused by numerical errors.
-
-$out = \sqrt{x}$
-
-)DOC";
-
-UNUSED constexpr char RsqrtDoc[] = R"DOC(
-Rsqrt Activation Operator.
-
-Please make sure input is legal in case of numeric errors.
-
-$out = \frac{1}{\sqrt{x}}$
-
-)DOC";
-
-UNUSED constexpr char AbsDoc[] = R"DOC(
-Abs Activation Operator.
-
-$out = |x|$
-
-)DOC";
-
-UNUSED constexpr char CeilDoc[] = R"DOC(
-Ceil Activation Operator.
-
-$out = \left \lceil x \right \rceil$
-
-)DOC";
-
-UNUSED constexpr char FloorDoc[] = R"DOC(
-Floor Activation Operator.
-
-$out = \left \lfloor x \right \rfloor$
-
-)DOC";
-
-UNUSED constexpr char CosDoc[] = R"DOC(
-Cosine Activation Operator.
-
-$out = cos(x)$
-
-)DOC";
-
-UNUSED constexpr char SinDoc[] = R"DOC(
-Sine Activation Operator.
-
-$out = sin(x)$
-
-)DOC";
-
-UNUSED constexpr char RoundDoc[] = R"DOC(
-Round Activation Operator.
-
-$out = [x]$
-
-)DOC";
-
-UNUSED constexpr char ReciprocalDoc[] = R"DOC(
-Reciprocal Activation Operator.
-
-$$out = \\frac{1}{x}$$
-
-)DOC";
-
-UNUSED constexpr char LogDoc[] = R"DOC(
-Log Activation Operator.
-
-$out = \ln(x)$
-
-Natural logarithm of x.
-
-)DOC";
-
-UNUSED constexpr char SquareDoc[] = R"DOC(
-Square Activation Operator.
-
-$out = x^2$
-
-)DOC";
-
-UNUSED constexpr char SoftplusDoc[] = R"DOC(
-Softplus Activation Operator.
-
-$out = \ln(1 + e^{x})$
-
-)DOC";
-
-UNUSED constexpr char SoftsignDoc[] = R"DOC(
-Softsign Activation Operator.
-
-$$out = \\frac{x}{1 + \|x\|}$$
-
-)DOC";
-
-class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of acos operator");
-    AddOutput("Out", "Output of acos operator");
-    AddComment(R"DOC(
-Arccosine Activation Operator.
-
-$$out = \cos^{-1}(x)$$
-
-)DOC");
-  }
-};
-
-class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of asin operator");
-    AddOutput("Out", "Output of asin operator");
-    AddComment(R"DOC(
-Arcsine Activation Operator.
-
-$$out = \sin^{-1}(x)$$
-
-)DOC");
-  }
-};
-
-class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of atan operator");
-    AddOutput("Out", "Output of atan operator");
-    AddComment(R"DOC(
-Arctanh Activation Operator.
-
-$$out = \tanh^{-1}(x)$$
-
-)DOC");
-  }
-};
-
-class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of LeakyRelu operator");
-    AddOutput("Out", "Output of LeakyRelu operator");
-    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-LeakyRelu Activation Operator.
-
-$out = \max(x, \alpha * x)$
-
-)DOC");
-  }
-};
-
-class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of Softshrink operator");
-    AddOutput("Out", "Output of Softshrink operator");
-    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
-    AddComment(R"DOC(
-:strong:`Softshrink Activation Operator`
-
-..  math::
-    out = \begin{cases}
-         x - \lambda, \text{if } x > \lambda \\
-         x + \lambda, \text{if } x < -\lambda \\
-         0,  \text{otherwise}
-         \end{cases}
-
-)DOC");
-  }
-};
-
-class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of HardShrink operator");
-    AddOutput("Out", "Output of HardShrink operator");
-    AddAttr<float>("threshold",
-                   "The value of threshold for HardShrink. [default: 0.5]")
-        .SetDefault(0.5f);
-    AddComment(R"DOC(
-:strong:`HardShrink activation operator`
-
-..  math::
-    out = \begin{cases}
-            x, \text{if } x > \lambda \\
-            x, \text{if } x < -\lambda \\
-            0,  \text{otherwise}
-          \end{cases}
-
-)DOC");
-  }
-};
-
-class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of BRelu operator");
-    AddOutput("Out", "Output of BRelu operator");
-    AddAttr<float>("t_min", "The min marginal value of BRelu")
-        .SetDefault(static_cast<float>(0));
-    AddAttr<float>("t_max", "The max marginal value of BRelu")
-        .SetDefault(static_cast<float>(24));
-    AddComment(R"DOC(
-BRelu Activation Operator.
-
-$out = \max(\min(x, t_{min}), t_{max})$
-
-)DOC");
-  }
-};
-
-class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of SoftRelu operator");
-    AddOutput("Out", "Output of SoftRelu operator");
-    AddAttr<float>("threshold", "The threshold value of SoftRelu")
-        .SetDefault(40.0f);
-    AddComment(R"DOC(
-SoftRelu Activation Operator.
-
-$out = \ln(1 + \exp(\max(\min(x, threshold), -threshold)))$
-
-)DOC");
-  }
-};
-
-class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of ELU operator");
-    AddOutput("Out", "Output of ELU operator");
-    AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
-    AddComment(R"DOC(
-ELU Activation Operator.
-
-Applies the following element-wise computation on the input according to
-https://arxiv.org/abs/1511.07289.
-
-$out = \max(0, x) + \min(0, \alpha * (e^x - 1))$
-
-)DOC");
-  }
-};
-
-class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of Relu6 operator");
-    AddOutput("Out", "Output of Relu6 operator");
-    AddAttr<float>("threshold", "The threshold value of Relu6")
-        .SetDefault(6.0f);
-    AddComment(R"DOC(
-Relu6 Activation Operator.
-
-$out = \min(\max(0, x), 6)$
-
-)DOC");
-  }
-};
-
-class PowOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of Pow operator");
-    AddInput("FactorTensor",
-             "(Tensor<float>, optional). If provided, pow will use this"
-             "The shape of FactorTensor MUST BE [1]."
-             "it has higher priority than attr(factor).")
-        .AsDispensable();
-    AddOutput("Out", "Output of Pow operator");
-    AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
-    AddComment(R"DOC(
-Pow Activation Operator.
-
-$out = x^{factor}$
-
-)DOC");
-  }
-};
-
-class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of STanh operator");
-    AddOutput("Out", "Output of STanh operator");
-    AddAttr<float>("scale_a", "The scale parameter of a for the input")
-        .SetDefault(2.0f / 3.0f);
-    AddAttr<float>("scale_b", "The scale parameter of b for the input")
-        .SetDefault(1.7159f);
-    AddComment(R"DOC(
-STanh Activation Operator.
-
-$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
-
-)DOC");
-  }
-};
-
-class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of ThresholdedRelu operator");
-    AddOutput("Out", "Output of ThresholdedRelu operator");
-    AddAttr<float>("threshold",
-                   "The threshold location of activation. [default 1.0].")
-        .SetDefault(1.0f);
-    AddComment(R"DOC(
-:strong:`ThresholdedRelu activation operator`
-
-..  math::
-
-    out = \begin{cases}
-             x,  \text{if } x > threshold \\
-             0,  \text{otherwise}
-          \end{cases}
-)DOC");
-  }
-};
-
-class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of HardSigmoid operator");
-    AddOutput("Out", "Output of HardSigmoid operator");
-    AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
-        .SetDefault(0.2f);
-    AddAttr<float>("offset", "Offset for linear approximation of sigmoid")
-        .SetDefault(0.5f);
-    AddComment(R"DOC(
-HardSigmoid Activation Operator.
-
-Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
-which is much faster than sigmoid.
-
-$out = \max(0, \min(1, slope * x + shift))$
-
-The slope should be positive. The offset can be either positive or negative.
-The default slope and shift are set according to the above reference.
-It is recommended to use the defaults for this activation.
-
-)DOC");
-  }
-};
-
-class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of Swish operator");
-    AddOutput("Out", "Output of Swish operator");
-    AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);
-    AddComment(R"DOC(
-Swish Activation Operator.
-
-$$out = \\frac{x}{1 + e^{- \beta \ x}}$$
-
-)DOC");
-  }
-};
-
-class HardSwishOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of HardSwish operator");
-    AddOutput("Out", "Output of HardSwish operator");
-    AddAttr<float>("threshold", "The threshold parameter of HardSwish operator")
-        .SetDefault(6.0f);
-    AddAttr<float>("scale", "The scale parameter of HardSwish operator")
-        .SetDefault(6.0f);
-    AddAttr<float>("offset", "The offset parameter of HardSwish operator")
-        .SetDefault(3.0f);
-    AddComment(R"DOC(
-HardSwish Activation Operator.
-
-The hard version of swish(https://arxiv.org/pdf/1905.02244.pdf).
-
-$out = \frac{x * (min(max(0, x+offset), threshold))}{scale}$
-
-The threshold and scale should be positive. The offset can be either positive or negative.
-The default parameters are set according to the above reference.
-It is recommended to use the defaults for this activation.
-
-)DOC");
-  }
-};
-
-REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc);
-REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc);
-REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc);
-REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc);
-REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc);
-REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc);
-REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc);
-REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc);
-REGISTER_ACTIVATION_OP_MAKER(Rsqrt, RsqrtDoc);
-REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc);
-REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc);
-REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc);
-REGISTER_ACTIVATION_OP_MAKER(Cos, CosDoc);
-REGISTER_ACTIVATION_OP_MAKER(Sin, SinDoc);
-REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc);
-REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc);
-REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc);
-REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
-REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
-REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
-
-template <ActBwdOpFwdDeps kDepValue>
-class ActivationOpDoubleGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
-      if (ctx->HasOutput("DX")) {
-        ctx->ShareDim("X", "DX");
-        ctx->ShareLoD("X", "DX");
-      }
-      if (ctx->HasOutput("DDOut")) {
-        ctx->ShareDim("X", "DDOut");
-        ctx->ShareLoD("X", "DDOut");
-      }
-    }
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
-      if (ctx->HasOutput("DOut")) {
-        ctx->ShareDim("Out", "DOut");
-        ctx->ShareLoD("Out", "DOut");
-      }
-      if (ctx->HasOutput("DDOut")) {
-        ctx->ShareDim("Out", "DDOut");
-        ctx->ShareLoD("Out", "DDOut");
-      }
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this, "DDX");
-  }
-};
-
-template <ActBwdOpFwdDeps kDepValue>
-class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
-      if (ctx->HasOutput("DDOut")) {
-        ctx->ShareDim("X", "DDOut");
-        ctx->ShareLoD("X", "DDOut");
-      }
-    }
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
-      if (ctx->HasOutput("DDOut")) {
-        ctx->ShareDim("Out", "DDOut");
-        ctx->ShareLoD("Out", "DDOut");
-      }
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this, "DDX");
-  }
-};
-
-//
-// ReluGrad: dx = dy if y >= 0 else 0
-// ReluGradGrad: ddy = ddx if y >= 0 else 0
-//
-class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker {
- public:
-  using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {
-    auto* op = new ::paddle::framework::OpDesc();
-    op->SetType("relu_grad_grad");
-    // input1: Out
-    op->SetInput("Out", Input("Out"));
-    // input2: ddx
-    op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-    op->SetAttrMap(Attrs());
-    // output: ddy
-    op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
-    return std::unique_ptr<::paddle::framework::OpDesc>(op);
-  }
-};
-
-// leaky_relu Grad: dx=dy if y>=0 else alpha * dy
-// leaky_relu GradGrad: ddy=ddx if y>=0 else alpha * ddx
-class LeakyReluDoubleGradMaker
-    : public ::paddle::framework::SingleGradOpDescMaker {
- public:
-  using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {
-    auto* op = new ::paddle::framework::OpDesc();
-    op->SetType("leaky_relu_grad_grad");
-    // input1: Out
-    op->SetInput("Out", Input("Out"));
-    // X@GRAD@GRAD: ddx
-    op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-    op->SetAttrMap(Attrs());
-    // Out@GRAD@GRAD: ddy
-    op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
-    return std::unique_ptr<::paddle::framework::OpDesc>(op);
-  }
-};
-
-// sqrt Grad: dx = 0.5 * dy / y
-// sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx
-class SqrtDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker {
- public:
-  using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {
-    auto* op = new ::paddle::framework::OpDesc();
-    op->SetType("sqrt_grad_grad");
-    op->SetInput("Out", Input("Out"));
-    op->SetInput("DX", Output(framework::GradVarName("X")));
-    op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-    op->SetAttrMap(Attrs());
-    op->SetOutput("DOut", InputGrad("Out"));
-    op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
-    return std::unique_ptr<::paddle::framework::OpDesc>(op);
-  }
-};
-
-// square Grad: dx=2x*dy
-// square GradGrad: ddy=2x*ddx, dx=2dy*ddx
-class SquareDoubleGradMaker
-    : public ::paddle::framework::SingleGradOpDescMaker {
- public:
-  using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {
-    auto* op = new ::paddle::framework::OpDesc();
-    op->SetType("square_grad_grad");
-    op->SetInput("X", Input("X"));
-    // Out@GRAD: dy
-    op->SetInput("DOut", Input(framework::GradVarName("Out")));
-    // X@GRAD@GRAD: ddx
-    op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-
-    op->SetAttrMap(Attrs());
-
-    // X@GRAD: dx
-    op->SetOutput("DX", InputGrad("X"));
-    // Out@GRAD@GRAD: ddy
-    op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
-    return std::unique_ptr<::paddle::framework::OpDesc>(op);
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInference,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
-DECLARE_INPLACE_OP_INFERER(ActivationDoubleGradOpInplaceInference,
-                           {"DDX", "DDOut"});
-
-class PowGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("pow_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetInput("FactorTensor", Input("FactorTensor"));
-    op->SetAttrMap(Attrs());
-
-    return op;
-  }
-};
-class PowOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this, "X");
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "FactorTensor") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class PowOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto out_grad_name = framework::GradVarName("Out");
-    ctx->ShareDim(out_grad_name, framework::GradVarName("X"));
-    ctx->ShareLoD(out_grad_name, framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this, framework::GradVarName("Out"));
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "FactorTensor") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-#define REGISTER_ACTIVATION_OP(KERNEL_TYPE, OP_NAME, functor, grad_functor) \
-  REGISTER_OPERATOR(                                                        \
-      KERNEL_TYPE, ops::ActivationOp, ops::OP_NAME##OpMaker,                \
-      ops::ActivationOpInferVarType,                                        \
-      ops::ActivationGradOpDescMaker<ops::grad_functor<float>::FwdDeps()>,  \
-      std::conditional<ops::CanInplaceAct<ops::grad_functor<float>>(),      \
-                       ::paddle::framework::SingleOpInplaceInToOut,         \
-                       void>::type);                                        \
-  REGISTER_OPERATOR(KERNEL_TYPE##_grad, ops::ActivationOpGrad,              \
-                    ops::ActivationGradOpInplaceInference);
-
-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name, functor,        \
-                                       grad_functor)                      \
-  REGISTER_OP_CPU_KERNEL(                                                 \
-      act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
-                                      ops::functor<float>>,               \
-      ops::ActivationKernel<paddle::platform::CPUDeviceContext,           \
-                            ops::functor<double>>);                       \
-  REGISTER_OP_CPU_KERNEL(                                                 \
-      act_type##_grad,                                                    \
-      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
-                                ops::grad_functor<float>>,                \
-      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
-                                ops::grad_functor<double>>);
-
-FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
-FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
-
-/* ==========================    relu register  ============================= */
-REGISTER_OPERATOR(
-    relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType,
-    ops::ActivationGradOpDescMaker<ops::ReluGradFunctor<float>::FwdDeps()>,
-    paddle::framework::SingleOpInplaceInToOut);
-REGISTER_OPERATOR(relu_grad, ops::ActivationOpGrad,
-                  ops::ActivationGradOpInplaceInference,
-                  ops::ReluDoubleGradMaker);
-REGISTER_OPERATOR(
-    relu_grad_grad,
-    ops::ActivationOpDoubleGrad2<ops::ReluGradFunctor<float>::FwdDeps()>,
-    ops::ActivationDoubleGradOpInplaceInference);
-
-REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor);
-
-REGISTER_OP_CPU_KERNEL(
-    relu_grad_grad,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::ReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::ReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::ReluGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ======================== leaky relu register  ============================ */
-REGISTER_OPERATOR(
-    leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker,
-    ops::ActivationOpInferVarType,
-    ops::ActivationGradOpDescMaker<ops::LeakyReluGradFunctor<float>::FwdDeps()>,
-    paddle::framework::SingleOpInplaceInToOut);
-REGISTER_OPERATOR(leaky_relu_grad, ops::ActivationOpGrad,
-                  ops::ActivationGradOpInplaceInference,
-                  ops::LeakyReluDoubleGradMaker);
-REGISTER_OPERATOR(
-    leaky_relu_grad_grad,
-    ops::ActivationOpDoubleGrad2<ops::LeakyReluGradFunctor<float>::FwdDeps()>,
-    ops::ActivationDoubleGradOpInplaceInference);
-
-REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor,
-                               LeakyReluGradFunctor);
-REGISTER_OP_CPU_KERNEL(
-    leaky_relu_grad_grad,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::LeakyReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::LeakyReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<
-        plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ===========================   sqrt register  ============================= */
-REGISTER_OPERATOR(
-    sqrt, ops::ActivationOp, ops::SqrtOpMaker, ops::ActivationOpInferVarType,
-    ops::ActivationGradOpDescMaker<ops::SqrtGradFunctor<float>::FwdDeps()>,
-    paddle::framework::SingleOpInplaceInToOut);
-REGISTER_OPERATOR(sqrt_grad, ops::ActivationOpGrad,
-                  ops::ActivationGradOpInplaceInference,
-                  ops::SqrtDoubleGradMaker);
-REGISTER_OPERATOR(
-    sqrt_grad_grad,
-    ops::ActivationOpDoubleGrad<ops::SqrtGradGradFunctor<float>::FwdDeps()>,
-    ops::ActivationDoubleGradOpInplaceInference);
-
-REGISTER_ACTIVATION_CPU_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor);
-REGISTER_OP_CPU_KERNEL(
-    sqrt_grad_grad, ops::SqrtDoubleGradKernel<plat::CPUDeviceContext,
-                                              ops::SqrtGradGradFunctor<float>>,
-    ops::SqrtDoubleGradKernel<plat::CPUDeviceContext,
-                              ops::SqrtGradGradFunctor<double>>,
-    ops::SqrtDoubleGradKernel<plat::CPUDeviceContext,
-                              ops::SqrtGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ==========================   square register  ============================ */
-REGISTER_OPERATOR(
-    square, ops::ActivationOp, ops::SquareOpMaker,
-    ops::ActivationOpInferVarType,
-    ops::ActivationGradOpDescMaker<ops::SquareGradFunctor<float>::FwdDeps()>,
-    paddle::framework::SingleOpInplaceInToOut);
-REGISTER_OPERATOR(square_grad, ops::ActivationOpGrad,
-                  ops::ActivationGradOpInplaceInference,
-                  ops::SquareDoubleGradMaker);
-REGISTER_OPERATOR(
-    square_grad_grad,
-    ops::ActivationOpDoubleGrad<ops::SquareGradGradFunctor<float>::FwdDeps()>,
-    ops::ActivationDoubleGradOpInplaceInference);
-
-REGISTER_ACTIVATION_CPU_KERNEL(square, Square, SquareFunctor,
-                               SquareGradFunctor);
-
-REGISTER_OP_CPU_KERNEL(
-    square_grad_grad,
-    ops::SquareDoubleGradKernel<plat::CPUDeviceContext,
-                                ops::SquareGradGradFunctor<float>>,
-    ops::SquareDoubleGradKernel<plat::CPUDeviceContext,
-                                ops::SquareGradGradFunctor<double>>,
-    ops::SquareDoubleGradKernel<plat::CPUDeviceContext,
-                                ops::SquareGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ==========================   pow register  ============================ */
-
-REGISTER_OPERATOR(
-    pow, ops::PowOp, ops::PowOpMaker, ops::ActivationOpInferVarType,
-    ops::PowGradOpDescMaker,
-    std::conditional<ops::CanInplaceAct<ops::PowGradFunctor<float>>(),
-                     ::paddle::framework::SingleOpInplaceInToOut, void>::type);
-REGISTER_OPERATOR(pow_grad, ops::PowOpGrad,
-                  ops::ActivationGradOpInplaceInference);
-
-REGISTER_OP_CPU_KERNEL(
-    pow, ops::PowKernel<plat::CPUDeviceContext, ops::PowFunctor<float>>,
-    ops::PowKernel<plat::CPUDeviceContext, ops::PowFunctor<double>>);
-REGISTER_OP_CPU_KERNEL(
-    pow_grad,
-    ops::PowGradKernel<plat::CPUDeviceContext, ops::PowGradFunctor<float>>,
-    ops::PowGradKernel<plat::CPUDeviceContext, ops::PowGradFunctor<double>>);
-/* ========================================================================== */
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
deleted file mode 100644
index ac03b8983b4e549beadf679590af3d0937af9edb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/activation_op.cu
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor,         \
-                                        grad_functor)                       \
-  REGISTER_OP_CUDA_KERNEL(                                                  \
-      act_type,                                                             \
-      ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<float>>,  \
-      ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<double>>, \
-      ops::ActivationKernel<plat::CUDADeviceContext,                        \
-                            ops::functor<plat::float16>>);                  \
-  REGISTER_OP_CUDA_KERNEL(                                                  \
-      act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
-                                                 ops::grad_functor<float>>, \
-      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>,                 \
-      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<plat::float16>>);
-
-FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL);
-
-/* ======================== leaky relu register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor,
-                                LeakyReluGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    leaky_relu_grad_grad,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::LeakyReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::LeakyReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<
-        plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ===========================    relu register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    relu_grad_grad,
-    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ===========================   sqrt register  ============================= */
-REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    sqrt_grad_grad,
-    ops::SqrtDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                              ops::SqrtGradGradFunctor<float>>,
-    ops::SqrtDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                              ops::SqrtGradGradFunctor<double>>,
-    ops::SqrtDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                              ops::SqrtGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ===========================  square register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(square, Square, SquareFunctor,
-                                SquareGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    square_grad_grad,
-    ops::SquareDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                ops::SquareGradGradFunctor<float>>,
-    ops::SquareDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                ops::SquareGradGradFunctor<double>>,
-    ops::SquareDoubleGradKernel<plat::CUDADeviceContext,
-                                ops::SquareGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ==========================   pow register  ============================ */
-
-REGISTER_OP_CUDA_KERNEL(
-    pow, ops::PowKernel<plat::CUDADeviceContext, ops::PowFunctor<float>>,
-    ops::PowKernel<plat::CUDADeviceContext, ops::PowFunctor<double>>,
-    ops::PowKernel<plat::CUDADeviceContext, ops::PowFunctor<plat::float16>>);
-REGISTER_OP_CUDA_KERNEL(
-    pow_grad,
-    ops::PowGradKernel<plat::CUDADeviceContext, ops::PowGradFunctor<float>>,
-    ops::PowGradKernel<plat::CUDADeviceContext, ops::PowGradFunctor<double>>,
-    ops::PowGradKernel<plat::CUDADeviceContext,
-                       ops::PowGradFunctor<plat::float16>>);
-/* ========================================================================== */
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
deleted file mode 100644
index ea19dcd3ab08d64e5ee6806df4d70f8be64a9905..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/activation_op.h
+++ /dev/null
@@ -1,1723 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <glog/logging.h>
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include <cmath>
-#ifndef _USE_MATH_DEFINES
-#define _USE_MATH_DEFINES
-#endif
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/platform/float16.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-enum ActBwdOpFwdDeps {
-  kNoDeps = 0x00,  // Do not need any forward input/output
-  kDepX = 0x01,    // Only need forward input X
-  kDepOut = 0x02,  // Only need forward output Out
-
-  // Never add kDepXOut, because Out can be always calculated
-  // by forward input X in backward part.
-  // FIXME(zjl): but in MKLDNN abs, X and Out are all needed...
-  // Developers should not rely on this enum value!
-  kDepXOut = 0x03
-};
-
-/* The following operator can be used to process SelectedRows, because the
- * output of those operator for zero is zero too.
- */
-static std::unordered_set<std::string> CanBeUsedBySelectedRows = {
-    "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"};
-
-inline void ExtractActivationTensor(const framework::ExecutionContext& context,
-                                    const framework::Tensor** X,
-                                    framework::Tensor** Out) {
-  auto x_var = context.InputVar("X");
-  auto out_var = context.OutputVar("Out");
-  PADDLE_ENFORCE(x_var != nullptr,
-                 "Cannot get input Variable X, variable name = %s",
-                 context.op().Input("X"));
-  PADDLE_ENFORCE(out_var != nullptr,
-                 "Cannot get output Variable Out, variable name = %s",
-                 context.op().Output("Out"));
-  if (CanBeUsedBySelectedRows.count(context.op().Type())) {
-    *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var);
-    *Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-        out_var);
-  } else {
-    *X = context.Input<framework::Tensor>("X");
-    *Out = context.Output<framework::Tensor>("Out");
-  }
-
-  PADDLE_ENFORCE(*Out != nullptr,
-                 "Cannot get output tensor Out, variable name = %s",
-                 context.op().Output("Out"));
-}
-
-template <ActBwdOpFwdDeps kDepValue>
-inline void ExtractActivationGradTensor(
-    const framework::ExecutionContext& context, const framework::Tensor** X,
-    const framework::Tensor** Out, const framework::Tensor** dOut,
-    framework::Tensor** dX) {
-  auto out_grad_var = context.InputVar(framework::GradVarName("Out"));
-  auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
-  const framework::Variable* out_var = nullptr;
-
-  if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
-    out_var = context.InputVar("Out");
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Cannot get input Variable Out, variable name = %s",
-                   context.op().Input("Out"));
-  }
-  PADDLE_ENFORCE(out_grad_var != nullptr,
-                 "Cannot get input Variable %s, variable name = %s",
-                 framework::GradVarName("Out"),
-                 context.op().Input(framework::GradVarName("Out")));
-  PADDLE_ENFORCE(x_grad_var != nullptr,
-                 "Cannot get output Variable %s, variable name = %s",
-                 framework::GradVarName("X"),
-                 context.op().Output(framework::GradVarName("X")));
-
-  if (CanBeUsedBySelectedRows.count(context.op().Type())) {
-    *dOut = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(
-        *out_grad_var);
-    *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-        x_grad_var);
-
-    if (out_var) {
-      *Out =
-          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var);
-    } else {
-      *Out = *dOut;  // fake out
-    }
-
-  } else {
-    *Out = context.Input<framework::Tensor>("Out");
-    *dOut = context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    *dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    if (out_var) {
-      *Out = &(out_var->Get<framework::LoDTensor>());
-    } else {
-      *Out = *dOut;  // fake out
-    }
-  }
-
-  PADDLE_ENFORCE(*dX != nullptr,
-                 "Cannot get output tensor %s, variable name = %s",
-                 framework::GradVarName("X"),
-                 context.op().Output(framework::GradVarName("X")));
-
-  if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
-    auto x_var = context.InputVar("X");
-    PADDLE_ENFORCE(x_var != nullptr,
-                   "Cannot get input tensor X, variable name = %s",
-                   context.op().Input("X"));
-    if (CanBeUsedBySelectedRows.count(context.op().Type())) {
-      *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var);
-    } else {
-      *X = context.Input<framework::Tensor>("X");
-    }
-  } else {
-    VLOG(10) << " Inplace activation of Op : " << context.op().Type();
-    *X = *dX;
-  }
-}
-
-template <typename DeviceContext, typename Functor>
-class ActivationKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* X = nullptr;
-    framework::Tensor* Out = nullptr;
-    ExtractActivationTensor(context, &X, &Out);
-    Out->mutable_data<T>(context.GetPlace());
-
-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = context.Attr<float>(attr.first);
-    }
-    functor(*place, x, out);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class ActivationGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor *X, *Out, *dOut;
-    framework::Tensor* dX = nullptr;
-    X = Out = dOut = nullptr;
-    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &X, &Out, &dOut,
-                                                    &dX);
-    dX->mutable_data<T>(context.GetPlace());
-    auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-    auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = context.Attr<float>(attr.first);
-    }
-    functor(*place, x, out, dout, dx);
-  }
-};
-
-template <typename T>
-struct BaseActivationFunctor {
-  using ELEMENT_TYPE = T;
-
-  using AttrPair = std::vector<std::pair<const char*, float*>>;
-
-  AttrPair GetAttrs() { return AttrPair(); }
-
-  /* NOTE(*): Output reuse X memory if X is not dependented by its Gradient.
-     For example, sigmoid op's gradient didn't involve x, so its output can
-     reuse
-     input memory. But abs op's gradient use x, it can not be inplaced.
-     gradient did use x.
-   */
-  bool Inplace() const { return false; }
-};
-
-// sigmoid(x) = 1 / (1 + exp(-x))
-template <typename T>
-struct SigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
-  }
-};
-
-template <typename T>
-struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * out * (static_cast<T>(1) - out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-// Originally: logsigmoid(x) = -log (1 + exp(-x))
-// For numerical stability, we can use the log-sum-exp trick:
-// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
-// We can rewrite the above equation as:
-// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
-//   = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
-//   = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x -
-//           max(-x, 0)))
-//   = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
-//   = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
-//
-// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
-// + exp(-x - max(-x, 0))))
-template <typename T>
-struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
-    out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
-  }
-};
-
-// Originally: f' = exp(-x) / (1 + exp(-x))
-// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
-// exp(-x - max(-x, 0)))
-template <typename T>
-struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
-    dx.device(d) =
-        dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// exp(x) = e^x
-template <typename T>
-struct ExpFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.exp();
-  }
-};
-
-template <typename T>
-struct ExpGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-// relu(x) = max(x, 0)
-template <typename T>
-struct ReluFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.cwiseMax(static_cast<T>(0));
-  }
-};
-
-template <typename T>
-struct ReluGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (out > static_cast<T>(0)).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-// gelu(x) = 0.5 * x *  (1 + erf(x / sqrt(2)))
-template <typename T>
-struct GeluFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-// Because the execute or device context can not be deliver here, it keep the
-// marco for NVCC.
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
-    auto x_data = x.data();
-    auto out_data = out.data();
-    int n = std::min(x.size(), out.size());
-
-    std::memset(out_data, 0, n * sizeof(T));
-    math::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, out_data, 1);
-    math::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
-    for (int i = 0; i < n; i++) {
-      out_data[i] += static_cast<T>(1);
-    }
-    math::CBlas<T>::VMUL(n, x_data, out_data, out_data);
-    for (int i = 0; i < n; i++) {
-      out_data[i] *= static_cast<T>(0.5);
-    }
-#else
-    auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
-    out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
-#endif
-  }
-};
-
-// gelu_grad(x) = dout * (0.5 * (1 + erf(x / sqrt(2))) + 0.5 * 2 / sqrt(pie) /
-// sqrt(2) * x * exp (-0.5 * sqrt(x)))
-// gelu_grad(x) = dout * (0.5 + 0.5 * erf(x * M_SQRT1_2) + (0.5 * M_2_SQRTPI *
-// M_SQRT1_2) * x * exp (-0.5 * sqrt(x)))
-template <typename T>
-struct GeluGradFunctor : BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
-    auto x_data = x.data();
-    auto dx_data = dx.data();
-    int n = std::min(x.size(), dx.size());
-
-    std::memset(dx_data, 0, n * sizeof(T));
-
-    // First(dx_data) = erf(x * M_SQRT1_2)
-    math::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, dx_data, 1);
-    math::CBlas<T>::VMERF(n, dx_data, dx_data, VML_LA);
-
-    // Second = 0.5 * M_2_SQRTPI * M_SQRT1_2 * x * exp (-0.5 * sqrt(x))
-    auto second = static_cast<T*>(std::malloc(n * sizeof(T)));
-    std::memset(second, 0, n * sizeof(T));
-
-    math::CBlas<T>::VSQUARE(n, x_data, second);
-    for (int i = 0; i < n; i++) {
-      second[i] *= static_cast<T>(-0.5);
-    }
-    math::CBlas<T>::VEXP(n, second, second);
-    math::CBlas<T>::VMUL(n, x_data, second, second);
-    T tmp = static_cast<T>(0.5) * static_cast<T>(M_SQRT1_2) *
-            static_cast<T>(M_2_SQRTPI);
-    for (int i = 0; i < n; i++) {
-      second[i] *= tmp;
-    }
-
-    // Sum = 0.5 * First + Second
-    math::CBlas<T>::AXPY(n, static_cast<T>(0.5), dx_data, 1, second, 1);
-
-    // 0.5 + Sum
-    for (int i = 0; i < n; i++) {
-      second[i] += static_cast<T>(0.5);
-    }
-
-    // * dout
-    auto dout_data = dout.data();
-    math::CBlas<T>::VMUL(n, dout_data, second, dx_data);
-
-    std::free(second);
-#else
-    auto first = static_cast<T>(0.5) *
-                 (static_cast<T>(1) + ((x * static_cast<T>(M_SQRT1_2)).erf()));
-
-    auto second = static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
-                  (-static_cast<T>(0.5) * x.square()).exp();
-    dx.device(d) = dout * (first + second);
-#endif
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-template <typename T>
-struct TanhFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.tanh();
-  }
-};
-
-template <typename T>
-struct TanhGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (static_cast<T>(1) - out * out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-// tanhshrink(x) = x - tanh(x)
-// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-template <typename T>
-struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x - x.tanh();
-  }
-};
-
-template <typename T>
-struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (x.tanh() * x.tanh());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// tanhshrink(x) = x - tanh(x)
-// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-template <typename T>
-struct HardShrinkFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>();
-    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>();
-    out.device(d) = x * (temp1 + temp2);
-  }
-};
-
-template <typename T>
-struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>();
-    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>();
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
-// otherwise
-template <typename T>
-struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
-  float lambda;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto lambdaT = static_cast<T>(lambda);
-    auto temp1 = (x > lambdaT).template cast<T>();
-    auto temp2 = (x < -lambdaT).template cast<T>();
-    out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
-  }
-};
-
-template <typename T>
-struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
-  float lambda;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto lambdaT = static_cast<T>(lambda);
-    auto temp1 = (x > lambdaT).template cast<T>();
-    auto temp2 = (x < -lambdaT).template cast<T>();
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// sqrt(x) = x^(1/2)
-template <typename T>
-struct SqrtFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.sqrt();
-  }
-};
-
-template <typename T>
-struct SqrtGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0.5) * dout / out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-// rsqrt(x) = x^(-1/2)
-template <typename T>
-struct RsqrtFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.rsqrt();
-  }
-};
-
-template <typename T>
-struct RsqrtGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(-0.5) * dout * out * out * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-// ceil(x) = ceiling(x)
-template <typename T>
-struct CeilFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.ceil();
-  }
-};
-
-template <typename T>
-struct ZeroGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0) * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; }
-};
-
-// floor(x) = flooring(x)
-template <typename T>
-struct FloorFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.floor();
-  }
-};
-
-template <typename T>
-struct Sine {
-  HOSTDEVICE T operator()(const T& val) const { return sin(val); }
-};
-
-template <>
-struct Sine<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(sin(static_cast<float>(val)));
-  }
-};
-
-template <typename T>
-struct Cosine {
-  HOSTDEVICE T operator()(const T& val) const { return cos(val); }
-};
-
-template <>
-struct Cosine<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(cos(static_cast<float>(val)));
-  }
-};
-
-// cosine'(x) = -sin(x)
-template <typename T>
-struct CosGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = -dout * x.unaryExpr(Sine<T>());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// cosine(x) = cos(x)
-template <typename T>
-struct CosFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Cosine<T>());
-  }
-};
-
-// sine'(x) = cos(x)
-template <typename T>
-struct SinGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * x.unaryExpr(Cosine<T>());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// sine(x) = sin(x)
-template <typename T>
-struct SinFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Sine<T>());
-  }
-};
-
-template <typename T>
-struct Acos {
-  HOSTDEVICE T operator()(const T& val) const { return acos(val); }
-};
-
-template <>
-struct Acos<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(acos(static_cast<float>(val)));
-  }
-};
-
-// Acos(x) = acos(x)
-template <typename T>
-struct AcosFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Acos<T>());
-  }
-};
-
-// acos'(x) = -1/sqrt(1-x^2)
-template <typename T>
-struct AcosGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) =
-        -dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct Asin {
-  HOSTDEVICE T operator()(const T& val) const { return asin(val); }
-};
-
-template <>
-struct Asin<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(asin(static_cast<float>(val)));
-  }
-};
-
-// Asin(x) = asin(x)
-template <typename T>
-struct AsinFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Asin<T>());
-  }
-};
-
-// asin'(x) = 1/sqrt(1-x^2)
-template <typename T>
-struct AsinGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) =
-        dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct Atan {
-  HOSTDEVICE T operator()(const T& val) const { return atan(val); }
-};
-
-template <>
-struct Atan<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(atan(static_cast<float>(val)));
-  }
-};
-
-// Atan(x) = atan(x)
-template <typename T>
-struct AtanFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Atan<T>());
-  }
-};
-
-// atan'(x) =  1 / (1 + x^2)
-template <typename T>
-struct AtanGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// round(x) = [x]
-template <typename T>
-struct RoundFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.round();
-  }
-};
-
-// abs(x) = |x|
-template <typename T>
-struct AbsFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.abs();
-  }
-};
-
-template <typename T>
-struct AbsGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * x.sign();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepXOut; }
-};
-
-// reciprocal(x) = 1 / x
-template <typename T>
-struct ReciprocalFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = static_cast<T>(1) / x;
-  }
-};
-
-template <typename T>
-struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(-1) * out * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-// log(x) = natural logarithm of x
-template <typename T>
-struct LogFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.log();
-  }
-};
-
-template <typename T>
-struct LogGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (static_cast<T>(1) / x);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// square(x) = x^2
-template <typename T>
-struct SquareFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.square();
-  }
-};
-
-template <typename T>
-struct SquareGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(2) * x;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct BReluFunctor : public BaseActivationFunctor<T> {
-  float t_min;
-  float t_max;
-
-  // NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
-  // not polymorphism for speed.
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) =
-        x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
-  }
-};
-
-template <typename T>
-struct BReluGradFunctor : public BaseActivationFunctor<T> {
-  float t_min;
-  float t_max;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
-                       .template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// relu6(x) = min(max(0, x), 6)
-template <typename T>
-struct Relu6Functor : public BaseActivationFunctor<T> {
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) =
-        x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
-  }
-};
-
-template <typename T>
-struct Relu6GradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) =
-        dout *
-        ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
-            .template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-// HardSwish = min(max(0, x+3), 6) * x / 6
-template <typename T>
-struct HardSwishFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  float scale;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = (x + static_cast<T>(offset))
-                        .cwiseMax(static_cast<T>(0))
-                        .cwiseMin(static_cast<T>(threshold)) *
-                    x / static_cast<T>(scale);
-  }
-};
-
-template <typename T>
-struct HardSwishGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  float scale;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto tmp = ((x + static_cast<T>(offset)) < static_cast<T>(threshold))
-                   .template cast<T>();
-    dx.device(d) =
-        dout *
-        (((x + static_cast<T>(offset)) > static_cast<T>(0)).template cast<T>() *
-             (static_cast<T>(2) * x + static_cast<T>(offset)) /
-             static_cast<T>(scale) * tmp +
-         static_cast<T>(1) * (static_cast<T>(1) - tmp));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// softplus(x) = log(1 + exp(x))
-// When x is a very large positive number, exp(x) may explode to inf,
-// Using trick below for numerical stability
-// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
-// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0)))
-template <typename T>
-struct SoftplusFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) {
-    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
-    out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log());
-  }
-};
-
-// d(softplus(x))/dx = exp(x) / (1 + exp(x))
-// For numerical stability:
-// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) +
-// exp(x - max(x, 0)))
-template <typename T>
-struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) {
-    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
-    dx.device(d) =
-        dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp()));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// softsign(x) = x / (1 + |x|)
-template <typename T>
-struct SoftsignFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) {
-    out.device(d) = x / (static_cast<T>(1) + x.abs());
-  }
-};
-
-// d(softsign(x))/dx = 1 / (1 + |x|)^2
-// Taken from https://en.wikipedia.org/wiki/Activation_function
-template <typename T>
-struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) {
-    dx.device(d) =
-        dout * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct SoftReluFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto tmp = static_cast<T>(threshold);
-    auto temp = x.cwiseMax(-tmp).cwiseMin(tmp);
-    out.device(d) = (static_cast<T>(1) + temp.exp()).log();
-  }
-};
-
-template <typename T>
-struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto tmp = static_cast<T>(threshold);
-    auto temp = ((out > -tmp) * (out < tmp)).template cast<T>();
-    dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct LeakyReluFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
-  }
-};
-
-template <typename T>
-struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 =
-        static_cast<T>(alpha) * (out <= static_cast<T>(0)).template cast<T>();
-    auto temp2 = (out > static_cast<T>(0)).template cast<T>();
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct ELUFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.cwiseMax(static_cast<T>(0)) +
-                    (static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)))
-                        .cwiseMin(static_cast<T>(0));
-  }
-};
-
-template <typename T>
-struct ELUGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>() +
-                   dout * static_cast<T>(alpha) * x.exp() *
-                       (x < static_cast<T>(0)).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198
-template <typename T>
-struct PowFunctor : public BaseActivationFunctor<T> {
-  float factor;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"factor", &factor}};
-  }
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.pow(static_cast<T>(factor));
-  }
-};
-
-template <typename T>
-struct PowGradFunctor : public BaseActivationFunctor<T> {
-  float factor;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"factor", &factor}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(factor) *
-                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct STanhFunctor : public BaseActivationFunctor<T> {
-  float scale_a;
-  float scale_b;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) =
-        static_cast<T>(scale_b) * (static_cast<T>(scale_a) * x).tanh();
-  }
-};
-
-template <typename T>
-struct STanhGradFunctor : public BaseActivationFunctor<T> {
-  float scale_a;
-  float scale_b;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto a = static_cast<T>(scale_a);
-    auto b = static_cast<T>(scale_b);
-    auto temp = (a * x).tanh() * (a * x).tanh();
-    dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto th = static_cast<T>(threshold);
-    out.device(d) = (x > th).template cast<T>() * x;
-  }
-};
-
-template <typename T>
-struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto th = static_cast<T>(threshold);
-    dx.device(d) = dout * (x > th).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
-  float slope;
-  float offset;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
-    out.device(d) =
-        temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
-  }
-};
-
-template <typename T>
-struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  float slope;
-  float offset;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
-                       .template cast<T>() *
-                   static_cast<T>(slope);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct SwishFunctor : public BaseActivationFunctor<T> {
-  float beta;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x / (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
-  }
-};
-
-template <typename T>
-struct SwishGradFunctor : public BaseActivationFunctor<T> {
-  float beta;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
-
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out fake_out, dOut dout, dX dx) const {
-    auto temp1 = static_cast<T>(1) /
-                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
-    auto out = x * temp1;
-    auto temp2 = temp1 * (static_cast<T>(1) - (static_cast<T>(beta) * out));
-    dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-/*
- * in arguments: x, out, ddx
- * out arguments: ddout, dout, dx
- */
-template <ActBwdOpFwdDeps kDepValue>
-inline void ExtractActivationDoubleGradTensor(
-    const framework::ExecutionContext& ctx, const framework::Tensor** X,
-    const framework::Tensor** Out, const framework::Tensor** ddX,
-    framework::Tensor** dX, framework::Tensor** dOut,
-    framework::Tensor** ddOut) {
-  auto ddx_var = ctx.InputVar("DDX");
-  auto ddo_var = ctx.OutputVar("DDOut");
-  PADDLE_ENFORCE(ddx_var != nullptr,
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.op().Input("DDX"));
-  if (CanBeUsedBySelectedRows.count(ctx.op().Type())) {
-    *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var);
-    if (ddo_var) {
-      *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-          ddo_var);
-    }
-  } else {
-    *ddX = ctx.Input<framework::Tensor>("DDX");
-    if (ddo_var) {
-      *ddOut = ctx.Output<framework::Tensor>("DDOut");
-    }
-  }
-  PADDLE_ENFORCE(*ddX != nullptr,
-                 "Cannot get output tensor DDX, variable name = %s",
-                 ctx.op().Output("DDX"));
-
-  if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
-    auto x_var = ctx.InputVar("X");
-    PADDLE_ENFORCE(x_var != nullptr,
-                   "Cannot get input Variable Out, variable name = %s",
-                   ctx.op().Input("X"));
-    auto dx_var = ctx.OutputVar("DX");
-    if (CanBeUsedBySelectedRows.count(ctx.op().Type())) {
-      *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var);
-      if (dx_var) {
-        *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-            dx_var);
-      }
-    } else {
-      *X = ctx.Input<framework::Tensor>("X");
-      if (dx_var) {
-        *dX = ctx.Output<framework::Tensor>("DX");
-      }
-    }
-  } else {
-    VLOG(10) << "Inplace activation of Op: " << ctx.op().Type();
-    *X = *ddX;
-  }
-  if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
-    auto out_var = ctx.InputVar("Out");
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Cannot get input tensor Out, variable name = %s",
-                   ctx.op().Input("Out"));
-    auto dout_var = ctx.OutputVar("DOut");
-    if (CanBeUsedBySelectedRows.count(ctx.op().Type())) {
-      *Out =
-          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var);
-      if (dout_var) {
-        *dOut =
-            paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-                dout_var);
-      }
-    } else {
-      *Out = ctx.Input<framework::Tensor>("Out");
-      if (dout_var) {
-        *dOut = ctx.Output<framework::Tensor>("DOut");
-      }
-    }
-  } else {
-    VLOG(10) << "Inplace activation of Op: " << ctx.op().Type();
-    *Out = *ddX;
-  }
-}
-
-template <typename DeviceContext, typename Functor>
-class ActivationDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *X, *Out, *ddX;
-    X = Out = ddX = nullptr;
-    framework::Tensor *ddOut, *dOut, *dX;
-    ddOut = dOut = dX = nullptr;
-
-    ExtractActivationDoubleGradTensor<Functor::FwdDeps()>(ctx, &X, &Out, &ddX,
-                                                          &dX, &dOut, &ddOut);
-
-    if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
-    if (dOut) dOut->mutable_data<T>(ctx.GetPlace());
-    if (dX) dX->mutable_data<T>(Out->dims(), ctx.GetPlace());
-
-    auto& place = ctx.template device_context<DeviceContext>();
-
-    Functor functor;
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
-    functor(place, X, Out, ddX, ddOut, dOut, dX);
-  }
-};
-
-template <typename T>
-struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* Out, const framework::Tensor* ddX,
-                  framework::Tensor* ddOut, framework::Tensor* dOut,
-                  framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
-      ddout.device(*d) = ddx * (out > static_cast<T>(0)).template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* Out, const framework::Tensor* ddX,
-                  framework::Tensor* ddOut, framework::Tensor* dOut,
-                  framework::Tensor* dX) const {
-    if (ddOut) {
-      auto* d = dev.eigen_device();
-      auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
-      auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
-      ddout.device(*d) = ddx *
-                         ((out > static_cast<T>(0)).template cast<T>() +
-                          static_cast<T>(alpha) *
-                              (out <= static_cast<T>(0)).template cast<T>())
-                             .template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct SqrtGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  framework::Tensor* dOut, const framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-    // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx
-    // calculate dy first, so ddy can inplace ddx
-    if (dOut) {
-      auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-      auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
-      dout.device(*d) = dx * ddx * static_cast<T>(-1) / out;
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
-      ddout.device(*d) = ddx * static_cast<T>(0.5) / out;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct SquareGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  const framework::Tensor* dOut, framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
-    // square GradGrad: ddy=2x*ddx, dx=2dy*ddx
-    // calculate dx first, so ddy can inplace ddx
-    if (dX) {
-      auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-      auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
-      dx.device(*d) = ddx * static_cast<T>(2) * dout;
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
-      ddout.device(*d) = ddx * static_cast<T>(2) * x;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// TODO(dengkaipeng): double gradient calculation for Square/Sqrt need
-// DOut(dy) as input(not output), tensor extraction is different from
-// others. Impliment extraction kernel seperately here.
-inline void ExtractDoubleGradTensorWithInputDOut(
-    const framework::ExecutionContext& ctx, const framework::Tensor** X,
-    const framework::Tensor** ddX, framework::Tensor** dX,
-    const framework::Tensor** dOut, framework::Tensor** ddOut) {
-  // extract ddX(output), ddOut(input)
-  auto ddx_var = ctx.InputVar("DDX");
-  auto ddo_var = ctx.OutputVar("DDOut");
-  PADDLE_ENFORCE(ddx_var != nullptr,
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.op().Input("DDX"));
-  *ddX = ctx.Input<framework::Tensor>("DDX");
-  if (ddo_var) {
-    *ddOut = ctx.Output<framework::Tensor>("DDOut");
-  }
-  PADDLE_ENFORCE(*ddX != nullptr,
-                 "Cannot get output tensor DDX, variable name = %s",
-                 ctx.op().Output("DDX"));
-
-  // extract x(input), dx(output)
-  auto x_var = ctx.InputVar("X");
-  PADDLE_ENFORCE(x_var != nullptr,
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.op().Input("X"));
-  auto dx_var = ctx.OutputVar("DX");
-  *X = ctx.Input<framework::Tensor>("X");
-  if (dx_var) {
-    *dX = ctx.Output<framework::Tensor>("DX");
-  }
-
-  // extract dOut(input)
-  auto dout_var = ctx.InputVar("DOut");
-  if (dout_var) {
-    *dOut = ctx.Input<framework::Tensor>("DOut");
-  }
-}
-
-template <typename DeviceContext, typename Functor>
-class SquareDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *X, *ddX, *dOut;
-    X = ddX = dOut = nullptr;
-    framework::Tensor *dX, *ddOut;
-    dX = ddOut = nullptr;
-
-    ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut);
-
-    if (dX) dX->mutable_data<T>(X->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
-
-    auto& place = ctx.template device_context<DeviceContext>();
-
-    Functor functor;
-    functor(place, X, ddX, ddOut, dOut, dX);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class SqrtDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *dX, *ddX;
-    Out = dX = ddX = nullptr;
-    framework::Tensor *ddOut, *dOut;
-    ddOut = dOut = nullptr;
-
-    // extract ddx(input), ddout(output)
-    auto ddx_var = ctx.InputVar("DDX");
-    auto ddo_var = ctx.OutputVar("DDOut");
-    PADDLE_ENFORCE(ddx_var != nullptr,
-                   "Cannot get input Variable DDX, variable name = %s",
-                   ctx.op().Input("DDX"));
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    if (ddo_var) {
-      ddOut = ctx.Output<framework::Tensor>("DDOut");
-    }
-    PADDLE_ENFORCE(ddX != nullptr,
-                   "Cannot get input Variable DDX, variable name = %s",
-                   ctx.op().Input("DDX"));
-
-    // extract out(input), dout(output)
-    auto out_var = ctx.InputVar("Out");
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Cannot get input Variable Out, variable name = %s",
-                   ctx.op().Input("Out"));
-    auto dout_var = ctx.OutputVar("DOut");
-    Out = ctx.Input<framework::Tensor>("Out");
-    if (dout_var) {
-      dOut = ctx.Output<framework::Tensor>("DOut");
-    }
-
-    // extract dx(input)
-    auto dx_var = ctx.InputVar("DX");
-    PADDLE_ENFORCE(dx_var != nullptr,
-                   "Cannot get input Variable DX, variable name = %s",
-                   ctx.op().Input("DX"));
-    if (dx_var) {
-      dX = ctx.Input<framework::Tensor>("DX");
-    }
-
-    if (dOut) dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-
-    auto& place = ctx.template device_context<DeviceContext>();
-
-    Functor functor;
-    functor(place, Out, ddX, ddOut, dOut, dX);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class PowKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* X = nullptr;
-    framework::Tensor* Out = nullptr;
-    ExtractActivationTensor(context, &X, &Out);
-    Out->mutable_data<T>(context.GetPlace());
-
-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = context.Attr<float>(attr.first);
-    }
-    // get FactorTensor
-    auto* factor_tensor = context.HasInput("FactorTensor")
-                              ? context.Input<framework::Tensor>("FactorTensor")
-                              : nullptr;
-    if (factor_tensor) {
-      auto* factor_data = factor_tensor->data<float>();
-      framework::Tensor cpu_factor_tensor;
-      if (platform::is_gpu_place(factor_tensor->place())) {
-        TensorCopySync(*factor_tensor, platform::CPUPlace(),
-                       &cpu_factor_tensor);
-        factor_data = cpu_factor_tensor.data<float>();
-      }
-      auto factor =
-          std::vector<float>(factor_data, factor_data + factor_tensor->numel());
-      PADDLE_ENFORCE_EQ(factor.size(), 1,
-                        "The shape of factor(tensor) MUST BE [1].");
-      for (auto& attr : attrs) {
-        *attr.second = factor[0];
-      }
-    }
-    functor(*place, x, out);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class PowGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor *X, *Out, *dOut;
-    framework::Tensor* dX = nullptr;
-    X = Out = dOut = nullptr;
-    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &X, &Out, &dOut,
-                                                    &dX);
-    dX->mutable_data<T>(context.GetPlace());
-    auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-    auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = context.Attr<float>(attr.first);
-    }
-    // get FactorTensor
-    auto* factor_tensor =
-        context.HasInput("FactorTensor")
-            ? context.Input<framework::LoDTensor>("FactorTensor")
-            : nullptr;
-    if (factor_tensor) {
-      auto* factor_data = factor_tensor->data<float>();
-      framework::Tensor cpu_factor_tensor;
-      if (platform::is_gpu_place(factor_tensor->place())) {
-        TensorCopySync(*factor_tensor, platform::CPUPlace(),
-                       &cpu_factor_tensor);
-        factor_data = cpu_factor_tensor.data<float>();
-      }
-      auto factor =
-          std::vector<float>(factor_data, factor_data + factor_tensor->numel());
-      PADDLE_ENFORCE_EQ(factor.size(), 1,
-                        "The shape of factor(tensor) MUST BE [1].");
-      for (auto& attr : attrs) {
-        *attr.second = factor[0];
-      }
-    }
-    functor(*place, x, out, dout, dx);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-#define FOR_EACH_ACTIVATION_OP(__macro)                                       \
-  __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
-  __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
-  __macro(exp, Exp, ExpFunctor, ExpGradFunctor);                              \
-  __macro(gelu, Gelu, GeluFunctor, GeluGradFunctor);                          \
-  __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor);                          \
-  __macro(atan, Atan, AtanFunctor, AtanGradFunctor);                          \
-  __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor);  \
-  __macro(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor);                      \
-  __macro(abs, Abs, AbsFunctor, AbsGradFunctor);                              \
-  __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                          \
-  __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                       \
-  __macro(cos, Cos, CosFunctor, CosGradFunctor);                              \
-  __macro(acos, Acos, AcosFunctor, AcosGradFunctor);                          \
-  __macro(sin, Sin, SinFunctor, SinGradFunctor);                              \
-  __macro(asin, Asin, AsinFunctor, AsinGradFunctor);                          \
-  __macro(round, Round, RoundFunctor, ZeroGradFunctor);                       \
-  __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
-  __macro(log, Log, LogFunctor, LogGradFunctor);                              \
-  __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor);                      \
-  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);         \
-  __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                      \
-  __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);          \
-  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);          \
-  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                      \
-  __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
-  __macro(elu, ELU, ELUFunctor, ELUGradFunctor);                              \
-  __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \
-  __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                      \
-          HardSigmoidGradFunctor);                                            \
-  __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                      \
-  __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor,          \
-          ThresholdedReluGradFunctor);                                        \
-  __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc
deleted file mode 100644
index 2580c5a523e13fb489bf9810c205257102d8a72e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/add_position_encoding_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class AddPositionEncodingOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "X(Input) of add_position_encoding_op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Out(Output) of add_position_encoding_op should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-      ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        platform::CPUPlace());
-  }
-};
-
-class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of AddPositionEncoding operator");
-    AddOutput("Out", "Output of AddPositionEncoding operator");
-    AddAttr<float>("alpha", "The scale of Original Embedding.")
-        .SetDefault(1.0f)
-        .AddCustomChecker([](const float& alpha) {
-          PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
-        });
-    AddAttr<float>("beta", "The scale of Position Embedding.")
-        .SetDefault(1.0f)
-        .AddCustomChecker([](const float& beta) {
-          PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
-        });
-    AddComment(R"DOC(
-    Add Position Encoding Operator.
-    
-    The add position encoding calculates the output based on the input, alpha, beta.
-    The size of each dimension of the parameters checked in the infer-shape.
-  )DOC");
-  }
-};
-
-class AddPositionEncodingGradOpDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("add_position_encoding_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plt = paddle::platform;
-
-REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
-                  ops::AddPositionEncodingOpMaker,
-                  ops::AddPositionEncodingGradOpDescMaker);
-REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    add_position_encoding,
-    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, float>,
-    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    add_position_encoding_grad,
-    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, float>,
-    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/add_position_encoding_op.h b/paddle/fluid/operators/add_position_encoding_op.h
deleted file mode 100644
index 0b40d3de890a02a9dbec2328f9f6388ffa35561b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/add_position_encoding_op.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class AddPositionEncodingKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::LoDTensor>("X");
-    auto& x_lod = X->lod();
-    auto* src_ptr = X->data<T>();
-
-    auto* Out = context.Output<framework::LoDTensor>("Out");
-    auto* dst_ptr = Out->mutable_data<T>(context.GetPlace());
-
-    float alpha = context.Attr<float>("alpha");
-    float beta = context.Attr<float>("beta");
-
-    auto x_dim = X->dims();
-    int batch_size = 0;
-    int max_seq_len = 0;
-    int enc_size = 0;
-
-    if (x_lod.empty()) {
-      PADDLE_ENFORCE(
-          x_dim.size() == 3UL,
-          "The input X of Add Position Encoding should be 3-D Tensor!");
-      batch_size = x_dim[0];
-      max_seq_len = x_dim[1];
-      enc_size = x_dim[2];
-    } else {
-      PADDLE_ENFORCE(
-          x_dim.size() == 2UL,
-          "The input X of Add Position Encoding should be 2-D LoDTensor!");
-      PADDLE_ENFORCE(
-          x_lod.size() == 1UL,
-          "The Add Position Encoding Op only supports lod_level == 1!");
-      batch_size = x_lod[0].size() - 1;
-      max_seq_len = -1;
-      enc_size = x_dim[1];
-    }
-
-    PADDLE_ENFORCE(enc_size % 2 == 0, "Only support even encode size!");
-
-    const int half_size = enc_size / 2;
-    for (int i = 0; i < batch_size; ++i) {
-      const int max_length =
-          x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
-      for (int j = 0; j < max_length; ++j) {
-        for (int k = 0; k < half_size; ++k) {
-          const double val =
-              (half_size > 1)
-                  ? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
-                  : j / 10000.0;
-          dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
-          dst_ptr[half_size + k] =
-              src_ptr[half_size + k] * alpha + cos(val) * beta;
-        }
-        src_ptr += enc_size;
-        dst_ptr += enc_size;
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class AddPositionEncodingGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dOut =
-        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto dout = framework::EigenVector<T>::Flatten(*dOut);
-
-    auto* dX =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    dX->mutable_data<T>(context.GetPlace());
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-
-    float alpha = context.Attr<float>("alpha");
-
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    dx.device(*place) = dout * static_cast<T>(alpha);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc
deleted file mode 100644
index 1476cfc2c89130677de22bc6f43cb258cd5e0be2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/affine_channel_op.cc
+++ /dev/null
@@ -1,344 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class AffineChannelOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) Feature map input can be a 4D tensor with order NCHW "
-             "or NHWC. It also can be a 2D tensor and C is the second "
-             "dimension.");
-    AddInput("Scale",
-             "(Tensor) 1D input of shape (C), the c-th element "
-             "is the scale factor of the affine transformation "
-             "for the c-th channel of the input.");
-    AddInput("Bias",
-             "(Tensor) 1D input of shape (C), the c-th element "
-             "is the bias of the affine transformation for the "
-             "c-th channel of the input.");
-    AddAttr<std::string>(
-        "data_layout",
-        "(string, default NCHW) Only used in "
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\". Specify the data format of the output data, "
-        "the input will be transformed automatically. ")
-        .SetDefault("AnyLayout");
-    AddOutput("Out", "(Tensor) A tensor of the same shape and order with X.");
-    AddComment(R"DOC(
-
-Applies a separate affine transformation to each channel of the input. Useful
-for replacing spatial batch norm with its equivalent fixed transformation.
-The input also can be 2D tensor and applies a affine transformation in second
-dimension.
-
-$$Out = Scale*X + Bias$$
-
-)DOC");
-  }
-};
-
-class AffineChannelOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of AffineChannelOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Scale"),
-                   "Input(Scale) of AffineChannelOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                   "Input(Bias) of AffineChannelOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of AffineChannelOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto scale_dims = ctx->GetInputDim("Scale");
-    auto b_dims = ctx->GetInputDim("Bias");
-    const framework::DataLayout data_layout = framework::StringToDataLayout(
-        ctx->Attrs().Get<std::string>("data_layout"));
-
-    const int64_t C = (data_layout == framework::DataLayout::kNCHW
-                           ? x_dims[1]
-                           : x_dims[x_dims.size() - 1]);
-
-    PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL);
-    PADDLE_ENFORCE_EQ(b_dims.size(), 1UL);
-    if (ctx->IsRuntime() || scale_dims[0] > 0) {
-      PADDLE_ENFORCE_EQ(scale_dims[0], C);
-    }
-    if (ctx->IsRuntime() || b_dims[0] > 0) {
-      PADDLE_ENFORCE_EQ(b_dims[0], C);
-    }
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", "Out");
-  }
-};
-
-class AffineChannelOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      PADDLE_ENFORCE(ctx->HasInput("Scale"),
-                     "Input(Scale) should not be null.");
-      ctx->SetOutputDim(framework::GradVarName("X"),
-                        ctx->GetInputDim(framework::GradVarName("Out")));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-      // Scale@GRAD and Bias@GRAD must exist at the same time.
-      PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
-                     "Output(Scale@GRAD) should not be null.");
-      PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-      ctx->SetOutputDim(framework::GradVarName("Scale"),
-                        ctx->GetInputDim("Scale"));
-      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Scale"));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-
-class AffineChannelGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType("affine_channel_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("Scale", Input("Scale"));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
-    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
-template <typename DeviceContext, typename T>
-class AffineChannelKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* scale = ctx.Input<framework::Tensor>("Scale");
-    auto* bias = ctx.Input<framework::Tensor>("Bias");
-
-    auto* y = ctx.Output<framework::Tensor>("Out");
-    y->mutable_data<T>(ctx.GetPlace());
-
-    const framework::DataLayout layout =
-        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
-
-    auto dims = x->dims();
-    int N = dims[0];
-    int C = layout == framework::DataLayout::kNCHW ? dims[1]
-                                                   : dims[dims.size() - 1];
-    int HxW = x->numel() / N / C;
-
-    auto* scale_d = scale->data<T>();
-    auto* bias_d = bias->data<T>();
-    ConstEigenVectorArrayMap<T> a_e(scale_d, C);
-    ConstEigenVectorArrayMap<T> b_e(bias_d, C);
-
-    auto* x_d = x->data<T>();
-    auto* y_d = y->data<T>();
-    if (layout == framework::DataLayout::kNCHW) {
-      int stride = C * HxW;
-      for (int i = 0; i < N; i++) {
-        ConstEigenArrayMap<T> x_e(x_d, HxW, C);
-        EigenArrayMap<T> y_e(y_d, HxW, C);
-        y_e = (x_e.rowwise() * a_e.transpose()).rowwise() + b_e.transpose();
-        x_d += stride;
-        y_d += stride;
-      }
-    } else {
-      int num = N * HxW;
-      ConstEigenArrayMap<T> x_e(x_d, C, num);
-      EigenArrayMap<T> y_e(y_d, C, num);
-      y_e = (x_e.colwise() * a_e).colwise() + b_e;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class AffineChannelGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* scale = ctx.Input<framework::Tensor>("Scale");
-    auto* dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-
-    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dscale =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Scale"));
-    auto* dbias = ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
-
-    const framework::DataLayout layout =
-        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
-
-    auto dims = x->dims();
-    int N = dims[0];
-    int C = layout == framework::DataLayout::kNCHW ? dims[1]
-                                                   : dims[dims.size() - 1];
-    int HxW = x->numel() / N / C;
-
-    auto* dy_d = dy->data<T>();
-    auto* scale_d = scale->data<T>();
-    ConstEigenVectorArrayMap<T> scale_e(scale_d, C);
-
-    T* dx_d = dx ? dx->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* dscale_d = dscale ? dscale->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* dbias_d = dbias ? dbias->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    EigenVectorArrayMap<T> dscale_e(dscale_d, C);
-    EigenVectorArrayMap<T> dbias_e(dbias_d, C);
-
-    if (layout == framework::DataLayout::kNCHW) {
-      // compute dscale and dbias
-      int stride = C * HxW;
-      auto* original_dy_d = dy_d;
-      if (dscale && dbias) {
-        auto* x_d = x->data<T>();
-        for (int i = 0; i < N; i++) {
-          ConstEigenArrayMap<T> x_e(x_d, HxW, C);
-          ConstEigenArrayMap<T> dy_e(dy_d, HxW, C);
-          if (i == 0) {
-            dscale_e = (x_e * dy_e).colwise().sum();
-          } else {
-            dscale_e += (x_e * dy_e).colwise().sum();
-          }
-          if (i == 0) {
-            dbias_e = dy_e.colwise().sum();
-          } else {
-            dbias_e += dy_e.colwise().sum();
-          }
-          x_d += stride;
-          dy_d += stride;
-        }
-      }
-
-      // compute dx
-      if (dx) {
-        dy_d = original_dy_d;
-        for (int i = 0; i < N; i++) {
-          ConstEigenArrayMap<T> dy_e(dy_d, HxW, C);
-          EigenArrayMap<T> dx_e(dx_d, HxW, C);
-          dx_e = dy_e.rowwise() * scale_e.transpose();
-          dy_d += stride;
-          dx_d += stride;
-        }
-      }
-    } else {
-      int num = N * HxW;
-      ConstEigenArrayMap<T> dy_e(dy_d, C, num);
-      // compute dscale and dbias
-      if (dscale && dbias) {
-        auto* x_d = x->data<T>();
-        ConstEigenArrayMap<T> x_e(x_d, C, num);
-        dscale_e = (x_e * dy_e).rowwise().sum();
-        dbias_e = dy_e.rowwise().sum();
-      }
-
-      // compute dx
-      if (dx) {
-        EigenArrayMap<T> dx_e(dx_d, C, num);
-        dx_e = dy_e.colwise() * scale_e;
-      }
-    }
-  }
-};
-
-class AffineChannelNoNeedBufferVarsInference
-    : public framework::NoNeedBufferVarsInference {
- public:
-  using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference;
-
- private:
-  inline bool HasOutput(const std::string& name) const {
-    auto& outputs = Outputs();
-    auto iter = outputs.find(name);
-    if (iter == outputs.end() || iter->second.empty()) {
-      return false;
-    } else {
-      return iter->second[0] != framework::kEmptyVarName;
-    }
-  }
-
- public:
-  std::unordered_set<std::string> operator()() const override {
-    if (!HasOutput(framework::GradVarName("Scale")) &&
-        !HasOutput(framework::GradVarName("Bias"))) {
-      return {"X"};
-    } else {
-      return {};
-    }
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(AffineChannelInplaceInferer, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(AffineChannelGradInplaceInferer,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-
-REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
-                  ops::AffineChannelOpMaker, ops::AffineChannelGradMaker,
-                  ops::AffineChannelInplaceInferer);
-REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad,
-                  ops::AffineChannelNoNeedBufferVarsInference,
-                  ops::AffineChannelGradInplaceInferer);
-
-REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,
-                       ops::AffineChannelKernel<CPU, double>);
-REGISTER_OP_CPU_KERNEL(affine_channel_grad,
-                       ops::AffineChannelGradKernel<CPU, float>,
-                       ops::AffineChannelGradKernel<CPU, double>);
diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu
deleted file mode 100644
index 5e598071216ae33c25b5b9efe35d3f8c26ee3003..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "cub/cub.cuh"
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, framework::DataLayout layout, bool HasBias>
-__global__ void KeAffineChannelCUDA(const T* x, const T* scale, const T* bias,
-                                    const int C, const int HxW, const int num,
-                                    T* y) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
-    if (HasBias) {
-      y[i] = scale[c] * x[i] + bias[c];
-    } else {
-      y[i] = scale[c] * x[i];
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class AffineChannelCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* scale = ctx.Input<framework::Tensor>("Scale");
-    auto* bias = ctx.Input<framework::Tensor>("Bias");
-
-    auto* y = ctx.Output<framework::Tensor>("Out");
-    y->mutable_data<T>(ctx.GetPlace());
-
-    const framework::DataLayout layout =
-        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    auto dims = x->dims();
-    const int num = x->numel();
-    int N = dims[0];
-    int C = layout == framework::DataLayout::kNCHW ? dims[1]
-                                                   : dims[dims.size() - 1];
-    int HxW = num / N / C;
-
-    const T* x_d = x->data<T>();
-    const T* scale_d = scale->data<T>();
-    const T* bias_d = bias->data<T>();
-    T* y_d = y->data<T>();
-
-    int block = 1024;
-    int grid = (num + block - 1) / block;
-
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    grid = std::min(std::max(max_threads / block, 1), grid);
-    if (layout == framework::DataLayout::kNCHW) {
-      KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
-                          true><<<grid, block, 0, dev_ctx.stream()>>>(
-          x_d, scale_d, bias_d, C, HxW, num, y_d);
-    } else {
-      KeAffineChannelCUDA<T, framework::DataLayout::kNHWC,
-                          true><<<grid, block, 0, dev_ctx.stream()>>>(
-          x_d, scale_d, bias_d, C, HxW, num, y_d);
-    }
-  }
-};
-
-template <typename T, int BlockDim, framework::DataLayout layout>
-__global__ void AffineChannelScaleBiasGradientCUDAKernel(
-    const T* dy, const T* x, const int N, const int C, const int HxW, T* dscale,
-    T* dbias) {
-  const int outer_size = C;
-  const int inner_size = N * HxW;
-  typedef cub::BlockReduce<double, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage ds_storage;
-  __shared__ typename BlockReduce::TempStorage db_storage;
-
-  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
-    T ds_sum = 0;
-    T db_sum = 0;
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      ds_sum += dy[index] * x[index];
-      db_sum += dy[index];
-    }
-    __syncthreads();
-    auto ds_out =
-        BlockReduce(ds_storage).Reduce(static_cast<double>(ds_sum), cub::Sum());
-    auto db_out =
-        BlockReduce(db_storage).Reduce(static_cast<double>(db_sum), cub::Sum());
-    __syncthreads();
-    if (threadIdx.x == 0) {
-      dscale[i] = ds_out;
-      dbias[i] = db_out;
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* scale = ctx.Input<framework::Tensor>("Scale");
-    auto* bias = ctx.Input<framework::Tensor>("Bias");
-    auto* dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-
-    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dscale =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Scale"));
-    auto* dbias = ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
-
-    const framework::DataLayout layout =
-        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    auto dims = dy->dims();
-    const int num = dy->numel();
-    int N = dims[0];
-    int C = layout == framework::DataLayout::kNCHW ? dims[1]
-                                                   : dims[dims.size() - 1];
-    int HxW = num / N / C;
-
-    const T* dy_d = dy->data<T>();
-    const T* s_d = scale->data<T>();
-
-    T* dx_d = dx ? dx->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* ds_d = dscale ? dscale->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* db_d = dbias ? dbias->mutable_data<T>(ctx.GetPlace()) : nullptr;
-
-    const int block = 1024;
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    int grid1 = (num + block - 1) / block;
-    int grid2 = std::min(C, max_blocks);
-    if (layout == framework::DataLayout::kNCHW) {
-      if (dscale && dbias) {
-        const T* x_d = x->data<T>();
-        AffineChannelScaleBiasGradientCUDAKernel<
-            T, block, framework::DataLayout::kNCHW><<<grid2, block, 0,
-                                                      dev_ctx.stream()>>>(
-            dy_d, x_d, N, C, HxW, ds_d, db_d);
-      }
-      if (dx) {
-        KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
-                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
-            dy_d, s_d, nullptr, C, HxW, num, dx_d);
-      }
-    } else {
-      if (dscale && dbias) {
-        const T* x_d = x->data<T>();
-        AffineChannelScaleBiasGradientCUDAKernel<
-            T, block, framework::DataLayout::kNHWC><<<grid2, block, 0,
-                                                      dev_ctx.stream()>>>(
-            dy_d, x_d, N, C, HxW, ds_d, db_d);
-      }
-
-      if (dx) {
-        KeAffineChannelCUDA<T, framework::DataLayout::kNHWC,
-                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
-            dy_d, s_d, nullptr, C, HxW, num, dx_d);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-
-REGISTER_OP_CUDA_KERNEL(affine_channel,
-                        ops::AffineChannelCUDAKernel<CUDA, float>,
-                        ops::AffineChannelCUDAKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(affine_channel_grad,
-                        ops::AffineChannelGradCUDAKernel<CUDA, float>,
-                        ops::AffineChannelGradCUDAKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
deleted file mode 100644
index ed71594ba5781590f3291d56c4ba1a4443003bd5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using ScopedSpatialTransformerDescriptor =
-    platform::ScopedSpatialTransformerDescriptor;
-
-template <typename T>
-class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    auto* theta = ctx.Input<Tensor>("Theta");
-    auto* output = ctx.Output<Tensor>("Output");
-    const T* theta_data = theta->data<T>();
-
-    int n = theta->dims()[0];
-    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
-    Tensor h_sizes;
-    int* h_size_data;
-    if (size_attr.size() == 0) {
-      auto* output_shape = ctx.Input<Tensor>("OutputShape");
-      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
-      h_size_data = h_sizes.data<int>();
-    } else {
-      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
-      h_size_data[0] = n;
-      h_size_data[1] = size_attr[1];
-      h_size_data[2] = size_attr[2];
-      h_size_data[3] = size_attr[3];
-    }
-
-    T* output_data = output->mutable_data<T>(
-        {n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace());
-    ScopedSpatialTransformerDescriptor st_desc;
-    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
-        st_desc.descriptor<T>(4, h_size_data);
-
-    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward(
-        handle, cudnn_st_desc, theta_data, output_data));
-  }
-};
-
-template <typename T>
-class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
-
-    int n = output_grad->dims()[0];
-    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
-    Tensor h_sizes;
-    int* h_size_data;
-    if (size_attr.size() == 0) {
-      auto* output_shape = ctx.Input<Tensor>("OutputShape");
-      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
-      h_size_data = h_sizes.data<int>();
-    } else {
-      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
-      h_size_data[0] = n;
-      h_size_data[1] = size_attr[1];
-      h_size_data[2] = size_attr[2];
-      h_size_data[3] = size_attr[3];
-    }
-
-    ScopedSpatialTransformerDescriptor st_desc;
-    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
-        st_desc.descriptor<T>(4, h_size_data);
-
-    const T* output_grad_data = output_grad->data<T>();
-    T* theta_grad_data = theta_grad->mutable_data<T>(ctx.GetPlace());
-
-    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward(
-        handle, cudnn_st_desc, output_grad_data, theta_grad_data));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace plat = paddle::platform;
-REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNAffineGridOpKernel<float>,
-                   paddle::operators::CUDNNAffineGridOpKernel<double>);
-REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNAffineGridGradOpKernel<float>,
-                   paddle::operators::CUDNNAffineGridGradOpKernel<double>);
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
deleted file mode 100644
index 9d7100cc3db91f5bf7dbd993c9f9ba5d4fc98ea6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ /dev/null
@@ -1,234 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/affine_grid_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-struct Linspace<paddle::platform::CPUDeviceContext, T> {
-  void operator()(T start, T end, int count, framework::Tensor* numbers,
-                  const framework::ExecutionContext& ctx) {
-    T* number_data = numbers->mutable_data<T>({count}, platform::CPUPlace());
-    T slice = (end - start) / (T)(count - 1);
-    for (int i = 0; i < count; ++i) {
-      number_data[i] = start + (T)i * slice;
-    }
-  }
-};
-
-class AffineGridOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Theta"),
-                   "Input(Theta) of AffineGridOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                   "Output(Output) of AffineGridOp should not be null.");
-    auto theta_dims = ctx->GetInputDim("Theta");
-    PADDLE_ENFORCE(theta_dims.size() == 3,
-                   "AffineGrid's Input(Theta) should be 3-D tensor.");
-
-    auto output_shape = ctx->Attrs().Get<std::vector<int>>("output_shape");
-    if (output_shape.size() == 0) {
-      PADDLE_ENFORCE(ctx->HasInput("OutputShape"),
-                     "Input(OutputShape) of AffineGridOp should not be null if "
-                     "attr(output_shape) is not configured.");
-      auto output_shape_dims = ctx->GetInputDim("OutputShape");
-      PADDLE_ENFORCE(output_shape_dims.size() == 1,
-                     "AffineGrid's Input(OutputShape) should be 1-D tensor.");
-    } else {
-      PADDLE_ENFORCE(output_shape.size() == 4,
-                     "The size of attr(output_shape) should be 4.");
-    }
-
-    PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2.");
-    PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3.");
-    // N * H * W * 2
-    ctx->SetOutputDim("Output",
-                      framework::make_ddim({theta_dims[0], -1, -1, 2}));
-    ctx->ShareLoD("Theta", "Output");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_CUDA
-    if (platform::CanCUDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kCUDNN;
-    }
-#endif
-    auto data_type = ctx.Input<Tensor>("Theta")->type();
-    return framework::OpKernelType(data_type, ctx.GetPlace(),
-                                   framework::DataLayout::kAnyLayout, library);
-  }
-};
-
-class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "Theta",
-        "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. "
-        "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, "
-        "y_1).");
-    AddInput("OutputShape",
-             "(Tensor) The shape of target image with format [N, C, H, W].")
-        .AsDispensable();
-    AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2].");
-    AddAttr<bool>(
-        "use_cudnn",
-        "(bool, default false) Only used in cudnn kernel, need install cudnn")
-        .SetDefault(true);
-    AddAttr<std::vector<int>>(
-        "output_shape",
-        "The target output image shape with format [N, C, H, W].")
-        .SetDefault(std::vector<int>());
-
-    AddComment(R"DOC(
-    It generates a grid of (x,y) coordinates using the parameters of the
-    affine transformation that correspond to a set of points where the input
-    feature map should be sampled to produce the transformed output feature map.
-
-    Given:
-        Theta = [[[x_11, x_12, x_13]
-                  [x_14, x_15, x_16]]
-                 [[x_21, x_22, x_23]
-                  [x_24, x_25, x_26]]]
-    
-        OutputShape = [2, 3, 5, 5]
-
-    Step 1:
-
-        Generate relative coordinates according to OutputShape.
-        The values of relative coordinates are in the interval between -1 and 1.
-        The shape of the relative coordinates is [2, H, W] as below:
-    
-        C = [[[-1.  -1.  -1.  -1.  -1. ]
-              [-0.5 -0.5 -0.5 -0.5 -0.5]
-              [ 0.   0.   0.   0.   0. ]
-              [ 0.5  0.5  0.5  0.5  0.5]
-              [ 1.   1.   1.   1.   1. ]] 
-             [[-1.  -0.5  0.   0.5  1. ]
-              [-1.  -0.5  0.   0.5  1. ]
-              [-1.  -0.5  0.   0.5  1. ]
-              [-1.  -0.5  0.   0.5  1. ]
-              [-1.  -0.5  0.   0.5  1. ]]]
-        C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
-    
-    Step2:
-        Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
-        C_ = [[-1.  -1.   1. ]
-              [-0.5 -1.   1. ]
-              [ 0.  -1.   1. ]
-              [ 0.5 -1.   1. ]
-              [ 1.  -1.   1. ]
-              [-1.  -0.5  1. ]
-              [-0.5 -0.5  1. ]
-              [ 0.  -0.5  1. ]
-              [ 0.5 -0.5  1. ]
-              [ 1.  -0.5  1. ]
-              [-1.   0.   1. ]
-              [-0.5  0.   1. ]
-              [ 0.   0.   1. ]
-              [ 0.5  0.   1. ]
-              [ 1.   0.   1. ]
-              [-1.   0.5  1. ]
-              [-0.5  0.5  1. ]
-              [ 0.   0.5  1. ]
-              [ 0.5  0.5  1. ]
-              [ 1.   0.5  1. ]
-              [-1.   1.   1. ]
-              [-0.5  1.   1. ]
-              [ 0.   1.   1. ]
-              [ 0.5  1.   1. ]
-              [ 1.   1.   1. ]]
-    Step3:
-        Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
-    )DOC");
-  }
-};
-
-class AffineGridOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->HasOutput(framework::GradVarName("Theta"))) {
-      auto output_dims = ctx->GetInputDim(framework::GradVarName("Output"));
-      ctx->SetOutputDim(framework::GradVarName("Theta"),
-                        {output_dims[0], 2, 3});
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_CUDA
-    if (platform::CanCUDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kCUDNN;
-    }
-#endif
-    return framework::OpKernelType(ctx.Input<Tensor>("Theta")->type(),
-                                   ctx.GetPlace(),
-                                   framework::DataLayout::kAnyLayout, library_);
-  }
-};
-
-class AffineGridGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType("affine_grid_grad");
-    op->SetInput("Theta", Input("Theta"));
-    op->SetInput("OutputShape", Input("OutputShape"));
-    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta"));
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker,
-                  ops::AffineGridGradMaker);
-REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    affine_grid,
-    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    affine_grid_grad,
-    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h
deleted file mode 100644
index 73df8a38b96c30196a7e39d2cf1e348f2a7722ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/affine_grid_op.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-
-using Array1 = Eigen::DSizes<int64_t, 1>;
-using Array2 = Eigen::DSizes<int64_t, 2>;
-using Array3 = Eigen::DSizes<int64_t, 3>;
-using Array4 = Eigen::DSizes<int64_t, 4>;
-
-/**
- *Return a tensor with evenly spaced numbers over a specified interval.
- */
-template <typename DeviceContext, typename T>
-struct Linspace {
-  void operator()(T start, T end, int count, framework::Tensor* numbers,
-                  const framework::ExecutionContext& ctx);
-};
-
-template <typename DeviceContext, typename T>
-inline void GetIdxMap(int n, int h, int w, Tensor* grid,
-                      const framework::ExecutionContext& ctx) {
-  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-  grid->mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
-  auto grid_t = EigenTensor<T, 4>::From(*grid);
-  // Get indexes of height with shape [height, width, 1]
-  Tensor h_idx;
-  Linspace<DeviceContext, T> linspace;
-  linspace((T)-1, (T)1, h, &h_idx, ctx);
-  auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
-  // Get indexes of width with shape [height, width, 1]
-  Tensor w_idx;
-  linspace((T)-1, (T)1, w, &w_idx, ctx);
-  auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
-  // Get constant ones tensor with shape [height, width, 1]
-  Tensor ones;
-  ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
-  auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
-  // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
-  // ones
-  Tensor w_idx_map;
-  w_idx_map.mutable_data<T>({h, w, 1}, ctx.GetPlace());
-  auto w_idx_map_t = EigenTensor<T, 3>::From(w_idx_map);
-  Tensor h_idx_map;
-  h_idx_map.mutable_data<T>({h, w, 1}, ctx.GetPlace());
-  auto h_idx_map_t = EigenTensor<T, 3>::From(h_idx_map);
-  Tensor w_h_idx_map;
-  w_h_idx_map.mutable_data<T>({h, w, 2}, ctx.GetPlace());
-  auto w_h_idx_map_t = EigenTensor<T, 3>::From(w_h_idx_map);
-  Tensor w_h_one_idx_map;
-  w_h_one_idx_map.mutable_data<T>({h, w, 3}, ctx.GetPlace());
-  auto w_h_one_idx_map_t = EigenTensor<T, 3>::From(w_h_one_idx_map);
-
-  w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w))
-                                  .broadcast(Array2(h, 1))
-                                  .reshape(Array3(h, w, 1));
-
-  h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h))
-                                  .broadcast(Array2(w, 1))
-                                  .shuffle(Array2(1, 0))
-                                  .reshape(Array3(h, w, 1));
-
-  w_h_idx_map_t.device(place) = w_idx_map_t.concatenate(h_idx_map_t, 2);
-  w_h_one_idx_map_t.device(place) = w_h_idx_map_t.concatenate(ones_t, 2);
-  grid_t.device(place) = w_h_one_idx_map_t.reshape(Array4(1, h, w, 3))
-                             .broadcast(Array4(n, 1, 1, 1));
-}
-
-template <typename DeviceContext, typename T>
-class AffineGridOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* theta = ctx.Input<Tensor>("Theta");
-    int n = theta->dims()[0];
-    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
-    int h = 0;
-    int w = 0;
-    if (size_attr.size() == 0) {
-      auto* output_shape = ctx.Input<Tensor>("OutputShape");
-      Tensor h_sizes;
-      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
-      const int* h_size_data = h_sizes.data<int>();
-      h = h_size_data[2];
-      w = h_size_data[3];
-    } else {
-      h = size_attr[2];
-      w = size_attr[3];
-    }
-    auto* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
-    math::SetConstant<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), output,
-        static_cast<T>(0));
-    Tensor grid;
-    GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx);
-    // output = grid * theta.T
-    // TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-    for (int i = 0; i < n; ++i) {
-      Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
-          {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
-      Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
-      Tensor sliced_out = output->Slice(i, i + 1).Resize(
-          {static_cast<int64_t>(h) * static_cast<int64_t>(w), 2});
-      blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
-                  T(0));
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class AffineGridGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
-    int n = output_grad->dims()[0];
-    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
-    int h = 0;
-    int w = 0;
-    if (size_attr.size() == 0) {
-      auto* output_shape = ctx.Input<Tensor>("OutputShape");
-      Tensor h_sizes;
-      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
-      const int* h_size_data = h_sizes.data<int>();
-      h = h_size_data[2];
-      w = h_size_data[3];
-    } else {
-      h = size_attr[2];
-      w = size_attr[3];
-    }
-    theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
-    math::SetConstant<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), theta_grad,
-        static_cast<T>(0));
-    Tensor grid;
-    GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx);
-    // output = grid * theta.T
-    // TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-    for (int i = 0; i < n; ++i) {
-      Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
-          {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
-      Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize(
-          {static_cast<int64_t>(h) * static_cast<int64_t>(w), 2});
-      Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
-      blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
-                  &sliced_theta_grad, T(0));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/anakin/CMakeLists.txt b/paddle/fluid/operators/anakin/CMakeLists.txt
deleted file mode 100644
index 5eacefc645bab288da7c289a5d7701abbcbef03d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/anakin/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-op_library(anakin_engine_op DEPS anakin_engine anakin_op_converter)
-# file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(anakin_engine);\n")
diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.cc b/paddle/fluid/operators/anakin/anakin_engine_op.cc
deleted file mode 100644
index 58db16ea0c1347a366a4d5927e414d76864cb6ab..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/anakin/anakin_engine_op.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/anakin/anakin_engine_op.h"
-
-namespace paddle {
-
-namespace operators {
-
-class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Xs", "A list of inputs.").AsDuplicable();
-    AddOutput("Ys", "A list of outputs").AsDuplicable();
-    AddAttr<std::string>("subgraph", "the subgraph.");
-    AddAttr<std::string>(
-        "engine_key",
-        "The engine_key here is used to distinguish different TRT Engines");
-    AddAttr<framework::BlockDesc *>("sub_block", "the trt block");
-    AddComment("Anakin engine operator.");
-  }
-};
-
-class AnakinEngineInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(anakin_engine, ops::AnakinEngineOp, ops::AnakinEngineOpMaker,
-                  ops::AnakinEngineOpMaker);
-
-#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h
deleted file mode 100644
index b4aaa228693c8f438a2df3dd316f68b2acaafcc2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/anakin/anakin_engine_op.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <fstream>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/inference/anakin/convert/op_converter.h"
-#include "paddle/fluid/inference/anakin/engine.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-
-namespace paddle {
-namespace operators {
-
-using inference::Singleton;
-using inference::anakin::AnakinEngine;
-
-class AnakinEngineOp : public framework::OperatorBase {
- private:
-  std::vector<std::string> input_names_;
-  std::unordered_set<std::string> param_names_;
-  std::string engine_key_;
-  std::string engine_serialized_data_;
-  bool use_gpu_;
-  bool enable_int8_;
-
- public:
-  AnakinEngineOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {
-    input_names_ = Inputs("Xs");
-    engine_key_ = Attr<std::string>("engine_key");
-    auto params = Attr<std::vector<std::string>>("parameters");
-    use_gpu_ = Attr<bool>("use_gpu");
-    enable_int8_ = Attr<bool>("enable_int8");
-    for (const auto &param : params) {
-      param_names_.insert(param);
-    }
-  }
-
- protected:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    RunAnakin(scope, dev_place);
-  }
-
-  void RunAnakin(const framework::Scope &scope,
-                 const platform::Place &dev_place) const {
-    PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
-
-    std::vector<std::string> output_maps =
-        Attr<std::vector<std::string>>("output_name_mapping");
-
-    std::map<std::string, framework::LoDTensor *> inputs;
-    for (const auto &x : Inputs("Xs")) {
-      if (param_names_.count(x)) continue;
-      auto &t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
-
-      inputs.insert({x, &t});
-    }
-
-    std::map<std::string, framework::LoDTensor *> outputs;
-    int output_index = 0;
-    for (const auto &y : Outputs("Ys")) {
-      auto *fluid_v = scope.FindVar(y);
-      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
-      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      outputs.insert({output_maps[output_index], fluid_t});
-      output_index += 1;
-    }
-    if (enable_int8_) {
-      Execute<::anakin::Precision::INT8>(inputs, outputs, dev_place);
-    } else {
-      Execute<::anakin::Precision::FP32>(inputs, outputs, dev_place);
-    }
-  }
-
-  template <::anakin::Precision PrecisionT>
-  void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
-               const std::map<std::string, framework::LoDTensor *> &outputs,
-               const platform::Place &dev_place) const {
-    if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
-      platform::DeviceContextPool &pool =
-          platform::DeviceContextPool::Instance();
-      auto &dev_ctx = *pool.Get(dev_place);
-      auto stream =
-          reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx)
-              .stream();
-      auto *engine =
-          inference::Singleton<inference::anakin::AnakinEngineManager<
-              ::anakin::saber::NV, PrecisionT>>::Global()
-              .Get(engine_key_);
-      engine->Execute(inputs, outputs, stream);
-#endif
-    } else {
-#ifdef ANAKIN_X86_PLACE
-      auto *engine =
-          inference::Singleton<inference::anakin::AnakinEngineManager<
-              ::anakin::saber::X86, PrecisionT>>::Global()
-              .Get(engine_key_);
-      engine->Execute(inputs, outputs);
-#else
-      LOG(FATAL) << "Unknown Platform for AnakinEngine!";
-#endif
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc
deleted file mode 100644
index 7fe9a0df7467970286fb0efc7c5ce7aaf01ac28b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/arg_max_op.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp,
-                  paddle::operators::ArgMaxOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    arg_max,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, float>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, double>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
-                                    uint8_t>);
diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu
deleted file mode 100644
index 85e4f98173511435a52b32e506afc8d5b772f74f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/arg_max_op.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    arg_max,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    double>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    uint8_t>);
diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
deleted file mode 100644
index bf7b83bb7a7d4f4861276a228389e87a42a39ef7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <type_traits>
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace paddle {
-namespace operators {
-
-enum ArgMinMaxType { kArgMin, kArgMax };
-
-template <typename DeviceContext, typename T, typename Tout, int64_t Rank,
-          ArgMinMaxType argMinMaxValue>
-struct ArgMinMaxFunctor {};
-
-#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value)      \
-  template <typename DeviceContext, typename T, typename Tout, int64_t Rank>  \
-  struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank,                       \
-                          enum_argminmax_value> {                             \
-    void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
-                    framework::LoDTensor* out, int64_t axis) {                \
-      auto in_eigen = framework::EigenTensor<T, Rank>::From(in);              \
-      auto out_eigen = framework::EigenTensor<Tout, Rank - 1>::From(*out);    \
-      out_eigen.device(*(ctx.eigen_device())) =                               \
-          in_eigen.eigen_op_type(axis).template cast<Tout>();                 \
-    }                                                                         \
-  }
-
-DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin);
-DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax);
-
-template <typename DeviceContext, typename T, typename Tout,
-          ArgMinMaxType EnumArgMinMaxValue>
-class ArgMinMaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& x = *(ctx.Input<framework::LoDTensor>("X"));
-    auto& out = *(ctx.Output<framework::LoDTensor>("Out"));
-    out.mutable_data<Tout>(ctx.GetPlace());
-    auto axis = ctx.Attr<int64_t>("axis");
-    auto x_rank = x.dims().size();
-    if (axis < 0) axis += x_rank;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-#define CALL_ARG_MINMAX_FUNCTOR(rank)                                \
-  ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
-      functor##rank;                                                 \
-  functor##rank(dev_ctx, x, &out, axis)
-
-    switch (x.dims().size()) {
-      case 1:
-        CALL_ARG_MINMAX_FUNCTOR(1);
-        break;
-      case 2:
-        CALL_ARG_MINMAX_FUNCTOR(2);
-        break;
-      case 3:
-        CALL_ARG_MINMAX_FUNCTOR(3);
-        break;
-      case 4:
-        CALL_ARG_MINMAX_FUNCTOR(4);
-        break;
-      case 5:
-        CALL_ARG_MINMAX_FUNCTOR(5);
-        break;
-      case 6:
-        CALL_ARG_MINMAX_FUNCTOR(6);
-        break;
-      default:
-        PADDLE_THROW(
-            "%s operator doesn't supports tensors whose ranks are greater "
-            "than 6.",
-            (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"));
-        break;
-#undef CALL_ARG_MINMAX_FUNCTOR
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-using ArgMinKernel =
-    ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMin>;
-
-template <typename DeviceContext, typename T>
-using ArgMaxKernel =
-    ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMax>;
-
-class ArgMinMaxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
-    const auto& x_dims = ctx->GetInputDim("X");
-    int64_t axis = ctx->Attrs().Get<int64_t>("axis");
-    PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(),
-                   "'axis' must be inside [-Rank(X), Rank(X))");
-
-    auto x_rank = x_dims.size();
-    if (axis < 0) axis += x_rank;
-
-    std::vector<int64_t> vec;
-    for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]);
-    for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
-    ctx->SetOutputDim("Out", framework::make_ddim(vec));
-  }
-};
-
-class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
- protected:
-  virtual const char* OpName() const = 0;
-  virtual const char* Name() const = 0;
-
- public:
-  void Make() override {
-    AddInput("X", "Input tensor.");
-    AddOutput("Out", "Output tensor.");
-    AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
-    AddComment(string::Sprintf(R"DOC(
-      %s Operator.
-
-      Computes the indices of the %s elements of the input tensor's element
-      along the provided axis.
-)DOC",
-                               OpName(), Name()));
-  }
-};
-
-class ArgMinOpMaker : public BaseArgMinMaxOpMaker {
- protected:
-  const char* OpName() const override { return "ArgMin"; }
-  const char* Name() const override { return "min"; }
-};
-
-class ArgMaxOpMaker : public BaseArgMinMaxOpMaker {
- protected:
-  const char* OpName() const override { return "ArgMax"; }
-  const char* Name() const override { return "max"; }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc
deleted file mode 100644
index 23b24735cd0ba17afd30b95c329cb0530a1f0104..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/arg_min_op.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp,
-                  paddle::operators::ArgMinOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    arg_min,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, float>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, double>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
-                                    uint8_t>);
diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu
deleted file mode 100644
index 47d7c8b12243c6c5c501188af7f48f125c266009..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/arg_min_op.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    arg_min,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    double>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    uint8_t>);
diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc
deleted file mode 100644
index d25160f4232b5a621d16b9f469f56bd5aa7c88e3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/argsort_op.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/argsort_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ArgsortOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ArgsortOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ArgsortOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Indices"),
-                   "Output(Indices) of ArgsortOp should not be null.");
-
-    auto in_dims = ctx->GetInputDim("X");
-    int axis = ctx->Attrs().Get<int>("axis");
-
-    auto num_dims = in_dims.size();
-    PADDLE_ENFORCE(axis < num_dims,
-                   "Attr(axis) %d of ArgsortOp is out of bounds for Input(X)'s "
-                   "rank %d.",
-                   axis, num_dims);
-    PADDLE_ENFORCE(axis >= -num_dims,
-                   "Attr(axis) %d of ArgsortOp must be not less than "
-                   "-rank(Input(X)) (%d).",
-                   axis, num_dims);
-
-    ctx->ShareDim("X", "Out");
-    ctx->ShareDim("X", "Indices");
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
-};
-
-class ArgsortOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) The input of Argsort op.");
-    AddOutput("Out",
-              "(Tensor) The sorted tensor of Argsort op, with the same "
-              "shape as Input(X).");
-    AddOutput("Indices",
-              "(Tensor) The indices of a tensor giving the sorted order, with "
-              "the same shape as Input(X).");
-    AddComment(R"DOC(
-Argsort operator
-
-Performs sorting on the input tensor along the given axis and outputs two 
-tensors, Output(Out) and Output(Indices). They reserve the same shape 
-with Input(X), and Output(Out) represents the sorted tensor while 
-Output(Indices) gives the sorted order along the given axis Attr(axis).
-
- )DOC");
-    AddAttr<int>("axis",
-                 "(int, default -1) The axis along which to sort the tensor. "
-                 "When axis < 0, the actual axis will be the |axis|'th "
-                 "counting backwards. Default -1, the last dimension.")
-        .SetDefault(-1);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(argsort,
-                       ops::ArgsortKernel<paddle::platform::CPUPlace, float>,
-                       ops::ArgsortKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu
deleted file mode 100644
index 1a0b303817a48ba50f7ce917f94251886c12d229..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/argsort_op.cu
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/execution_policy.h>
-#include <thrust/sort.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/argsort_op.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-const int kMaxRank = 9;  // The max rank of a tensor allowed in Fluid
-
-__global__ void ComputeTargetIdx(const int64_t* in_dims, int dims_size,
-                                 int axis, int64_t n, int64_t* trg_idx,
-                                 int64_t* med_ids) {
-  int64_t index = threadIdx.x + blockDim.x * blockIdx.x;
-  if (index < n) {
-    int64_t shape_out_axis[kMaxRank - 1] = {0};
-    int64_t dims_out_axis[kMaxRank - 1] = {0};
-    int64_t tmp = index;
-    int64_t pos_in_axis = 0;
-    int64_t i = dims_size - 2;
-    int64_t dim_axis = 0;
-    for (int64_t j = dims_size - 1; j >= 0; --j) {
-      int64_t dim = in_dims[j];
-      if (j != axis) {
-        shape_out_axis[i] = tmp % dim;
-        dims_out_axis[i] = dim;
-        i--;
-      } else {
-        dim_axis = dim;
-        pos_in_axis = tmp % dim_axis;
-      }
-      tmp /= dim;
-    }
-    int64_t group = (dims_size > 1) ? shape_out_axis[0] : 0;
-    for (int64_t j = 0; j < dims_size - 2; ++j) {
-      group = group * dims_out_axis[j + 1] + shape_out_axis[j + 1];
-    }
-
-    int64_t traget_idx = group * dim_axis + pos_in_axis;
-    trg_idx[index] = traget_idx;
-    med_ids[traget_idx] = pos_in_axis;
-  }
-}
-
-template <typename T>
-__global__ void PermuteInData(const T* in, const int64_t* trg_idx, int64_t n,
-                              T* med_out) {
-  int index = threadIdx.x + blockDim.x * blockIdx.x;
-  if (index < n) {
-    med_out[trg_idx[index]] = in[index];
-  }
-}
-
-template <typename T>
-__global__ void Sort(int64_t axis_dim, int64_t groups, T* med_out,
-                     int64_t* med_ids) {
-  int index = threadIdx.x + blockDim.x * blockIdx.x;
-  if (index < groups) {
-    thrust::sort_by_key(thrust::device, med_out + index * axis_dim,
-                        med_out + axis_dim * (1 + index),
-                        med_ids + index * axis_dim);
-  }
-}
-
-template <typename T>
-__global__ void PermuteMediateData(const T* med_out, const int64_t* med_ids,
-                                   const int64_t* trg_idx, int64_t n, T* out,
-                                   int64_t* indices) {
-  int index = threadIdx.x + blockDim.x * blockIdx.x;
-  if (index < n) {
-    out[index] = med_out[trg_idx[index]];
-    indices[index] = med_ids[trg_idx[index]];
-  }
-}
-
-template <typename T>
-class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
-    int axis = ctx.Attr<int>("axis");
-
-    auto in_dims = input->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-
-    const T* in_data = input->data<T>();
-    T* out_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    int64_t numel = input->numel();
-    int64_t groups = numel / in_dims[axis];
-
-    std::vector<int64_t> in_dims_vec = vectorize(in_dims);
-    thrust::device_vector<int64_t> in_dims_dev(in_dims_vec.begin(),
-                                               in_dims_vec.end());
-    int64_t* in_dims_data = thrust::raw_pointer_cast(in_dims_dev.data());
-    // Mediate tensor for sorting data and indices
-    Tensor mediate_output, mediate_indices;
-    T* med_out_data =
-        mediate_output.mutable_data<T>(input->dims(), ctx.GetPlace());
-    int64_t* med_ids_data =
-        mediate_indices.mutable_data<int64_t>(in_dims, ctx.GetPlace());
-    // Target index of each element along the given axis in the mediate tensors
-    Tensor trg_idx_t;
-    int64_t* trg_idx = trg_idx_t.mutable_data<int64_t>(in_dims, ctx.GetPlace());
-
-    auto stream = ctx.cuda_device_context().stream();
-    const int num_threads = PADDLE_CUDA_NUM_THREADS;
-
-    ComputeTargetIdx<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>(
-        in_dims_data, in_dims.size(), axis, numel, trg_idx, med_ids_data);
-
-    PermuteInData<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>(
-        in_data, trg_idx, numel, med_out_data);
-
-    Sort<<<(groups - 1) / num_threads + 1, num_threads, 0, stream>>>(
-        in_dims[axis], groups, med_out_data, med_ids_data);
-
-    PermuteMediateData<<<(numel - 1) / num_threads + 1, num_threads, 0,
-                         stream>>>(med_out_data, med_ids_data, trg_idx, numel,
-                                   out_data, ids_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(argsort, paddle::operators::ArgsortOpCUDAKernel<float>,
-                        paddle::operators::ArgsortOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h
deleted file mode 100644
index 7e9112cfb7cbe5f783b04729fb4dff3676c922bc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/argsort_op.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ArgsortKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<framework::Tensor>("X");
-    auto* output = ctx.Output<framework::Tensor>("Out");
-    auto* indices = ctx.Output<framework::Tensor>("Indices");
-    int axis = ctx.Attr<int>("axis");
-
-    auto in_dims = input->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-
-    const T* in_data = input->data<T>();
-    T* out_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    int64_t groups = input->numel() / in_dims[axis];
-    int64_t stride = (axis == in_dims.size() - 1)
-                         ? 1
-                         : framework::product(framework::slice_ddim(
-                               in_dims, axis + 1, in_dims.size()));
-
-    for (int64_t i = 0; i < groups; ++i) {
-      int64_t idx = i;
-      std::vector<int64_t> shape_vec(in_dims.size(), 0);
-      for (int64_t dim = in_dims.size() - 1; dim >= 0; --dim) {
-        if (dim != axis) {
-          shape_vec[dim] = idx % in_dims[dim];
-          idx /= in_dims[dim];
-        }
-      }
-
-      int64_t start_index = shape_vec[0];
-      for (int64_t dim = 0; dim < in_dims.size() - 1; ++dim) {
-        start_index = start_index * in_dims[dim + 1] + shape_vec[dim + 1];
-      }
-
-      std::vector<int64_t> org_index_vec(in_dims[axis], start_index);
-      for (int64_t j = 1; j < in_dims[axis]; ++j) {
-        org_index_vec[j] += j * stride;
-      }
-
-      std::sort(org_index_vec.begin(), org_index_vec.end(),
-                [in_data](const int64_t v1, const int64_t v2) {
-                  return in_data[v1] < in_data[v2];
-                });
-
-      for (size_t j = 0; j < org_index_vec.size(); ++j) {
-        int64_t index = start_index + j * stride;
-        out_data[index] = in_data[org_index_vec[j]];
-        ids_data[index] = (org_index_vec[j] - start_index) / stride;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h
deleted file mode 100644
index 4309f0a5497456065e5c43bc8f7b265fa711f699..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/array_operator.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-class ArrayOp : public framework::OperatorBase {
- public:
-  ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
-          const framework::VariableNameMap &outputs,
-          const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- protected:
-  size_t GetOffset(const framework::Scope &scope,
-                   const platform::Place &place) const {
-    auto *i = scope.FindVar(Input("I"));
-    PADDLE_ENFORCE(i != nullptr, "I must be set");
-    auto &i_tensor = i->Get<framework::LoDTensor>();
-    PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    size_t offset;
-    if (platform::is_gpu_place(i_tensor.place())) {
-      // FIXME: Avoid copy from GPU to CPU
-      framework::Tensor t;
-      framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
-      dev_ctx.Wait();
-      offset = static_cast<size_t>(*t.data<int64_t>());
-    } else {
-      offset = static_cast<size_t>(*i_tensor.data<int64_t>());
-    }
-    VLOG(10) << " Offset = " << offset;
-    return offset;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
deleted file mode 100644
index d942391b8644959f63ac58f6a7122bbd3c0ddf84..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <paddle/fluid/operators/math/concat_and_split.h>
-#include <numeric>
-
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-using LoD = framework::LoD;
-
-struct ArrayToLoDFunctor;
-template <typename DeviceContext>
-struct ArrayToLoDFunctorImpl {
-  const ArrayToLoDFunctor *prev_functor_;
-  DeviceContext *dev_ctx_;
-
-  template <typename T>
-  void apply();
-};
-
-struct ArrayToLoDFunctor : public boost::static_visitor<void> {
-  std::vector<framework::Tensor> in;
-  mutable framework::Tensor *out;
-
-  template <typename Place>
-  void operator()(Place place) const {
-    auto &pool = platform::DeviceContextPool::Instance();
-    if (std::is_same<Place, platform::CPUPlace>::value) {
-      Apply(static_cast<platform::CPUDeviceContext *>(pool.Get(place)));
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      Apply(static_cast<platform::CUDADeviceContext *>(pool.Get(place)));
-#else
-      PADDLE_THROW("Fluid is not compiled with CUDA");
-#endif
-    }
-  }
-
-  template <typename DeviceContext>
-  void Apply(DeviceContext *dev_ctx) const {
-    ArrayToLoDFunctorImpl<DeviceContext> functor;
-    functor.dev_ctx_ = dev_ctx;
-    functor.prev_functor_ = this;
-    framework::VisitDataType(out->type(), functor);
-  }
-};
-
-template <typename DeviceContext>
-template <typename T>
-void ArrayToLoDFunctorImpl<DeviceContext>::apply() {
-  math::ConcatFunctor<DeviceContext, T> func;
-  func(*dev_ctx_, prev_functor_->in, 0, prev_functor_->out);
-}
-
-class ArrayToLoDTensorOp : public framework::OperatorBase {
- public:
-  ArrayToLoDTensorOp(const std::string &type,
-                     const framework::VariableNameMap &inputs,
-                     const framework::VariableNameMap &outputs,
-                     const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
-    auto &rank_table =
-        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
-    auto *out =
-        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-
-    // Check dims, place and data type of input's elements and infer output's
-    // dim
-    PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
-    int rank = x[0].dims().size();
-    platform::Place place = x[0].place();
-    auto data_type = x[0].type();
-    int64_t batch_size = x[0].dims()[0];
-    framework::DDim ins_dims = rank > 1
-                                   ? framework::slice_ddim(x[0].dims(), 1, rank)
-                                   : framework::make_ddim({0});
-    for (size_t i = 1; i < x.size(); ++i) {
-      auto ins_i_dims = rank > 1 ? framework::slice_ddim(x[i].dims(), 1, rank)
-                                 : framework::make_ddim({0});
-      PADDLE_ENFORCE_EQ(ins_i_dims, ins_dims,
-                        "The dimension of the %zu'th element in LoDTensorArray "
-                        "differs from previous ones.",
-                        i);
-      PADDLE_ENFORCE(x[i].place() == place,
-                     "The place class of the %zu'th element in LoDTensorArray "
-                     "differs from previous ones.",
-                     i);
-      PADDLE_ENFORCE(x[i].type() == data_type,
-                     "The date type of the %zu'th element in LoDTensorArray "
-                     "differs from previous ones.",
-                     i);
-      batch_size += x[i].dims()[0];
-    }
-    auto ins_dim_vec = framework::vectorize(ins_dims);
-    ins_dim_vec.insert(ins_dim_vec.begin(), batch_size);
-    framework::DDim out_dims = framework::make_ddim(ins_dim_vec);
-    out->Resize(out_dims);
-    out->mutable_data(place, data_type);
-
-    auto &table_items = rank_table.items();
-    std::vector<size_t> table_item_idx(table_items.size());
-    // table_item_idx = range(table_items_idx.size())
-    std::iota(table_item_idx.begin(), table_item_idx.end(), 0);
-    std::sort(table_item_idx.begin(), table_item_idx.end(),
-              [&](size_t a, size_t b) {
-                return table_items[a].index < table_items[b].index;
-              });
-
-    // Build LoDTensor `out`
-    framework::LoD *out_lod = out->mutable_lod();
-    out_lod->clear();
-    auto prefix_lod = rank_table.coarse_lod();
-    prefix_lod.emplace_back();
-    auto &cur_level_lod = prefix_lod.back();
-    cur_level_lod.push_back(0);
-    ArrayToLoDFunctor functor;
-    for (size_t idx : table_item_idx) {
-      cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
-      PADDLE_ENFORCE_LE(table_items[idx].length, x.size());
-      for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
-        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
-            x[x_idx].lod(), idx, idx + 1, 0);
-
-        auto &lod_length = lod_and_offset.first;
-        framework::AppendLoD(out_lod, lod_length);
-
-        size_t start_offset = lod_and_offset.second.first;
-        size_t end_offset = lod_and_offset.second.second;
-        VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
-                 << ", " << end_offset << "]";
-        // Copy data
-        PADDLE_ENFORCE_GE(end_offset, start_offset);
-        size_t len = end_offset - start_offset;
-        if (len == 0) {
-          continue;
-        }
-        functor.in.emplace_back(x[x_idx].Slice(start_offset, end_offset));
-      }
-    }
-    functor.out = out;
-    platform::VisitPlace(place, functor);
-    out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
-  }
-};
-
-class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(std::vector<LodTensor>) A vector of tensors that is going to "
-             "be casted to a big LoDTensor.");
-    AddInput("RankTable",
-             "(LoDRankTable) RankTable provides the coarse lod infomation to "
-             "build the output LoDTensor. See "
-             "'paddle/framework/lod_rank_table.h' for more details.");
-    AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array.");
-    AddComment(
-        R"DOC(This Op build a big LoDTensor from a std::vector<LoDTensor> 
-          and a LoDRankTable. It is supposed to be used in getting dynamic RNN's
-          outputs back to a normal LoDTensor. The std::vector<LoDTensor> 
-          would be the output of RNN Op and the LoDRankTable would be build 
-          with RNN's input.)DOC");
-  }
-};
-
-class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"),
-                   "ArrayToLoDTensorOp must has input X.");
-    PADDLE_ENFORCE(context->HasInput("RankTable"),
-                   "ArrayToLoDTensorOp must has input RankTable.");
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-  }
-};
-
-class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("lod_tensor_to_array");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetInput("RankTable", Input("RankTable"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
-                  ops::ArrayToLoDTensorOpProtoMaker,
-                  ops::ArrayToLoDTensorInferShape,
-                  ops::ArrayToLoDTensorGradMaker);
diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
deleted file mode 100644
index ff423778c5982c31a5ff5b0a0e264750ad81ed04..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/assign_op.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-class AssignFunctor {
- public:
-  AssignFunctor(framework::Variable *out,
-                const platform::DeviceContext &dev_ctx)
-      : out_(out), dev_ctx_(dev_ctx) {}
-
-  void operator()(const framework::LoDTensor &lod_tensor) const {
-    auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
-    copy_tensor(lod_tensor, &out_tensor);
-  }
-
-  void operator()(const framework::LoDTensorArray &array) const {
-    auto &out_array = *out_->GetMutable<framework::LoDTensorArray>();
-    out_array.resize(array.size());
-    for (size_t i = 0; i < array.size(); ++i) {
-      copy_tensor(array[i], &out_array[i]);
-    }
-  }
-
-  void operator()(const framework::SelectedRows &rows) const {
-    framework::SelectedRows &out_rows =
-        *out_->GetMutable<framework::SelectedRows>();
-    out_rows.set_rows(rows.rows());
-    out_rows.set_height(rows.height());
-    auto &t = rows.value();
-    auto *m = out_rows.mutable_value();
-    framework::TensorCopy(t, t.place(), dev_ctx_, m);
-  }
-
-  template <typename T>
-  void operator()(const T &v) const {
-    PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
-  }
-
- private:
-  void copy_tensor(const framework::LoDTensor &lod_tensor,
-                   framework::LoDTensor *out) const {
-    if (lod_tensor.numel() == 0) return;
-    auto &out_tensor = *out;
-    TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
-    out_tensor.set_lod(lod_tensor.lod());
-  }
-
-  framework::Variable *out_;
-  const platform::DeviceContext &dev_ctx_;
-};
-
-class AssignOp : public framework::OperatorWithKernel {
- public:
-  AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
-           const framework::VariableNameMap &outputs,
-           const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    if (ctx->HasInput("X")) {
-      auto type = ctx->GetInputsVarType("X")[0];
-      if (type == framework::proto::VarType::SELECTED_ROWS ||
-          type == framework::proto::VarType::LOD_TENSOR) {
-        ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-        if (type == framework::proto::VarType::LOD_TENSOR) {
-          ctx->ShareLoD("X", /*->*/ "Out");
-        }
-      }
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class AssignKernel {
- public:
-  void operator()(const framework::ExecutionContext &ctx) const {
-    auto *x = ctx.InputVar("X");
-    if (x == nullptr) {
-      return;
-    }
-    auto *out = ctx.OutputVar("Out");
-    PADDLE_ENFORCE(
-        out != nullptr,
-        "The Output(Out) should not be null if the Input(X) is set.");
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(ctx.GetPlace());
-
-    framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
-  }
-};
-
-class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
-             "could be LoDTensor, SelectedRows or LoDTensorArray.")
-        .AsDispensable();
-    AddOutput("Out",
-              "(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
-              "is the same as input X.");
-    AddComment(R"DOC(Assign Operator
-
-Out = X,  when type in [LoDTensor/SelectedRows/LoDTensorArray]
-raise error if the type is not listed above.
-)DOC");
-  }
-};
-
-class AssignGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *op = new framework::OpDesc();
-    op->SetType("assign");
-    op->SetInput("X", OutputGrad("Out"));
-    op->SetOutput("Out", InputGrad("X"));
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"});
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker,
-                  ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
-                               ops::AssignKernel, int, ops::AssignKernel,
-                               int64_t, ops::AssignKernel);
-
-#ifdef PADDLE_WITH_CUDA
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
-                                ops::AssignKernel, int, ops::AssignKernel,
-                                int64_t, ops::AssignKernel);
-#endif
diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc
deleted file mode 100644
index a757916be7f6ece9b783d51d1051aac6a276795b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/assign_value_op.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/assign_value_op.h"
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class AssignValueOp : public framework::OperatorWithKernel {
- public:
-  AssignValueOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of AssignValueOp should not be null.");
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    ctx->SetOutputDim("Out", framework::make_ddim(shape));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
-  }
-};
-
-class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput("Out", "(Tensor) Output tensor of assign_value operator.");
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) "
-                              "Shape of values.");
-    AddAttr<int>("dtype", "data type of values")
-        .InEnum({framework::proto::VarType::INT32,
-                 framework::proto::VarType::FP32});
-    AddAttr<std::vector<float>>("fp32_values", "store the float values")
-        .SetDefault({});
-    AddAttr<std::vector<int>>("int32_values", "store the int values")
-        .SetDefault({});
-    AddComment(R"DOC(
-AssignValue operator
-
-$$Out = values$$
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>,
-                       ops::AssignValueKernel<float>);
diff --git a/paddle/fluid/operators/assign_value_op.cu.cc b/paddle/fluid/operators/assign_value_op.cu.cc
deleted file mode 100644
index 08bfde5dc92de9c675e5b9b85f8e65a3bab8631c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/assign_value_op.cu.cc
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/assign_value_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
-                        ops::AssignValueKernel<float>);
diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h
deleted file mode 100644
index e749d6f6d3685f207f0ad4f2ebc7c3c7ae32992c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/assign_value_op.h
+++ /dev/null
@@ -1,51 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class AssignValueKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto shape = ctx.Attr<std::vector<int>>("shape");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    int dtype = ctx.Attr<int>("dtype");
-    const char* value_name = nullptr;
-    switch (dtype) {
-      case framework::proto::VarType::INT32:
-        value_name = "int32_values";
-        break;
-      case framework::proto::VarType::FP32:
-        value_name = "fp32_values";
-        break;
-      default:
-        PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype);
-        break;
-    }
-    auto values = ctx.Attr<std::vector<T>>(value_name);
-    framework::TensorFromVector(values, ctx.device_context(), out);
-    out->Resize(framework::make_ddim(shape));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
deleted file mode 100644
index c6d98f1f9a534aa98923afc1ead0ffc1f83a8b99..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ /dev/null
@@ -1,427 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/attention_lstm_op.h"
-#include <string>
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/operators/math/fc.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "Assert only one Input(X) of AttentionLSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("C0"),
-                 "Assert only one Input(C0) of AttentionLSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"),
-                 "Assert only one Input(LSTMWeight) of AttentionLSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("LSTMBias"),
-                 "Assert only one Input(LSTMBias) of AttentionLSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"),
-                 "Assert only one Input(AttentionWeight) of AttentionLSTM.");
-
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Assert only one Output(Hidden) of AttentionLSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                 "Assert only one Output(Cell) of AttentionLSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"),
-                 "Assert only one Output(AttentionedX) of AttentionLSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"),
-                 "Assert only one Output(AttentionFCOut) of AttentionLSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("LSTMX"),
-                 "Assert only one Output(LSTMX) of AttentionLSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"),
-                 "Assert only one Output(LSTMOUT) of AttentionLSTM.");
-
-  auto x_dims = ctx->GetInputDim("X");
-  const int M = x_dims[1];
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
-
-  auto w_dims = ctx->GetInputDim("LSTMWeight");
-  const int D = w_dims[1] / 4;
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2.");
-  PADDLE_ENFORCE_EQ(w_dims[0], D + M,
-                    "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D);
-
-  auto b_dims = ctx->GetInputDim("LSTMBias");
-  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2.");
-  PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x %d.", 4 * D);
-  PADDLE_ENFORCE_EQ(b_dims[1], 4 * D, "LSTMBias dims should be 1 x %d.", 4 * D);
-
-  auto c_dims = ctx->GetInputDim("C0");
-  PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2.");
-  if (ctx->IsRuntime()) {
-    PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
-  }
-
-  if (ctx->HasInput("H0")) {
-    auto h_dims = ctx->GetInputDim("H0");
-    PADDLE_ENFORCE_EQ(h_dims.size(), 2UL, "Input(H0)'s rank must be 2.");
-    if (ctx->IsRuntime() ||
-        (framework::product(c_dims) > 0 && framework::product(h_dims) > 0)) {
-      PADDLE_ENFORCE(h_dims == c_dims,
-                     "The dimension of Input(H0) and Input(C0) "
-                     "should be the same.");
-    }
-  }
-
-  auto atten_w_dims = ctx->GetInputDim("AttentionWeight");
-  PADDLE_ENFORCE_EQ(atten_w_dims.size(), 2,
-                    "Input(AttentionWeight)'s rank must be 2.");
-  PADDLE_ENFORCE_EQ(atten_w_dims[0], M + D,
-                    "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
-  PADDLE_ENFORCE_EQ(atten_w_dims[1], 1,
-                    "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
-
-  if (ctx->HasInput("AttentionBias")) {
-    auto atten_b_dims = ctx->GetInputDim("AttentionBias");
-    PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2,
-                      "Input(AttentionBias)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(atten_b_dims[0], 1,
-                      "AttentionBias shapes must be 1 * 1.");
-    PADDLE_ENFORCE_EQ(atten_b_dims[1], 1,
-                      "AttentionBias shapes must be 1 * 1.");
-  }
-
-  if (ctx->HasInput("AttentionScalar")) {
-    auto dims = ctx->GetInputDim("AttentionScalar");
-    PADDLE_ENFORCE_EQ(dims.size(), 2,
-                      "Input(AttentionScalar)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalar shapes must be 1 * 1.");
-    PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1.");
-  }
-
-  if (ctx->HasInput("AttentionScalarBias")) {
-    auto dims = ctx->GetInputDim("AttentionScalarBias");
-    PADDLE_ENFORCE(
-        ctx->HasInput("AttentionScalar"),
-        "AttentionScalar should not be null when have AttentionScalarBias.");
-    PADDLE_ENFORCE_EQ(dims.size(), 2,
-                      "Input(AttentionScalarBias)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalarBias shapes must be 1 * 1.");
-    PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalarBias shapes must be 1 * 1.");
-  }
-
-  framework::DDim out_dims({x_dims[0], D});
-  ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("Cell", out_dims);
-  ctx->SetOutputDim("AttentionedX", {x_dims[0], 1});
-  ctx->SetOutputDim("LSTMX", {1, M});
-  ctx->SetOutputDim("LSTMOUT", {1, 4 * D});
-  // AttentionFCOut should be reshape as (maxseqlen,1) in runtime
-  ctx->ShareLoD("X", "Hidden");
-  ctx->ShareLoD("X", "Cell");
-}
-
-framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                 ctx.device_context());
-}
-
-void AttentionLSTMOpMaker::Make() {
-  AddInput("X",
-           "(LoDTensor) the input is a LodTensor, which support "
-           "variable-time length input sequence. The underlying tensor in "
-           "this LoDTensor is a matrix with shape (T X M), where T is the "
-           "total time steps in this mini-batch, M is the dim size of x.");
-  AddInput("C0",
-           "(Tensor) LSTM C0"
-           "This is a tensor with shape (N x D), where N is the batch size, D "
-           "is the gate size."
-           "C0 is necessary because of attention.");
-  AddInput("H0",
-           "(Tensor, optional) LSTM H0"
-           "This is a tensor with shape (N x D), where N is the "
-           "batch size and D is the gate size.")
-      .AsDispensable();
-  AddInput("AttentionWeight",
-           "(Tensor) the weights of attention fc. Always relu the fc result."
-           "The shape is ((M+D) x 1), where M is the dim size of x, D is the "
-           "gate size of LSTM.");
-  AddInput("AttentionBias",
-           "(Tensor, optional) the bias of attention fc."
-           "The shape is (1 x 1)")
-      .AsDispensable();
-  AddInput("AttentionScalar",
-           "(Tensor, optional) the scalar on the result of attentioned fc. "
-           "Always relu the Scalar."
-           "The shape is (1 x 1)")
-      .AsDispensable();
-  AddInput("AttentionScalarBias",
-           "(Tensor, optional) the scalar bias of attention fc."
-           "The shape is (1 x 1)")
-      .AsDispensable();
-  AddInput("LSTMWeight",
-           "(Tensor) the combined weight of LSTM"
-           " - The shape is ((D+M) x 4D), where D is the hidden gate size, M "
-           "is the dim size of x"
-           " - Weight = {W_forget, W_input, W_output, W_cell}");
-  AddInput("LSTMBias",
-           "(Tensor) the combined bias of LSTM, shape (1x4D)."
-           "Note: we should add the bias of hidden and context accorindg to "
-           "the same gate: "
-           "{B_forget, B_input, B_output, B_cell}");
-  AddOutput("Hidden",
-            "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. "
-            "The shape is (T x D), and lod is the same with the `Input`.");
-  AddOutput("Cell",
-            "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. "
-            "The shape is (T x D), and lod is the same with the `Input`.");
-  AddOutput("AttentionedX",
-            "(Tensor) shape is (T x 1), the result after X * AttentionWeight,"
-            " where T is the total time steps in this mini-batch,"
-            " D is the hidden size.")
-      .AsIntermediate();
-  AddOutput("AttentionFCOut",
-            "(Tensor) (max_seq_len, 1), compute at each step.")
-      .AsIntermediate();
-  AddOutput("LSTMX",
-            "(Tensor) the input X of LSTM for each step."
-            "Shape is (1 x M), where M is the x frame size")
-      .AsIntermediate();
-  AddOutput(
-      "LSTMOUT",
-      "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step."
-      "Shape is (1 x 4D), where M is the x frame size")
-      .AsIntermediate();
-  AddAttr<std::string>("gate_activation",
-                       "(string, default: sigmoid)"
-                       "The activation for input gate, forget gate and output "
-                       "gate, `sigmoid` by default.")
-      .SetDefault("sigmoid")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<std::string>("cell_activation",
-                       "(string, default: tanh)"
-                       "The activation for cell output, `tanh` by default.")
-      .SetDefault("tanh")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<std::string>("candidate_activation",
-                       "(string, default: tanh)"
-                       "The activation for candidate hidden state, "
-                       "`tanh` by default.")
-      .SetDefault("tanh")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddComment(R"DOC(
-Attention Long-Short Term Memory (LSTM) Operator.
-
-Attention part:
-concat( x(seqlen * M), expand( cell_t-1(1,D) ) ) => tmp(seqlen*(M+D))
-
-tmp(seqlen*(M+D)) * fc((M+D)*1) => fcout(seqlen*1) with bias, relu
-
-fcout(seqlen*1) * scalar => fcout(seqlen*1) with bias, relu
-
-dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) 
-
-LSTM part:
-use lstm_x_t as input and compute as standard LSTM.
-
-)DOC");
-}
-
-// y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0;
-template <typename T>
-inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
-  if (bias) {
-    math::vec_add_bias<T, platform::avx>(n, *bias, x, y);
-    math::vec_relu<T, platform::avx>(n, y, y);
-  } else {
-    math::vec_relu<T, platform::avx>(n, x, y);
-  }
-}
-
-template <typename T>
-inline void vec_softmax(const int n, const T* x, T* y) {
-  T scalar = x[0];
-  // max
-  for (int i = 1; i < n; ++i) {
-    scalar = scalar < x[i] ? x[i] : scalar;
-  }
-  math::vec_add_bias<T, platform::avx>(n, -scalar, x, y);  // sub
-  math::vec_exp<T>(n, y, y);                               // exp
-  // sum
-  scalar = T(0);
-  for (int i = 0; i < n; ++i) {
-    scalar += y[i];
-  }
-  math::vec_scal<T>(n, static_cast<T>(1) / scalar, y);  // scale
-}
-
-template <typename T>
-class AttentionLSTMKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* h0 = ctx.Input<Tensor>("H0");
-    auto* c0 = ctx.Input<Tensor>("C0");
-    auto* atten_w = ctx.Input<Tensor>("AttentionWeight");
-    auto* atten_b = ctx.Input<Tensor>("AttentionBias");
-    auto* atten_scalar = ctx.Input<Tensor>("AttentionScalar");
-    auto* atten_scalar_bias = ctx.Input<Tensor>("AttentionScalarBias");
-    auto* lstm_w = ctx.Input<Tensor>("LSTMWeight");
-    auto* lstm_b = ctx.Input<Tensor>("LSTMBias");
-
-    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
-    auto* cell_out = ctx.Output<LoDTensor>("Cell");
-    auto* atted_x = ctx.Output<Tensor>("AttentionedX");
-    auto* fc_out = ctx.Output<Tensor>("AttentionFCOut");
-    auto* lstm_x = ctx.Output<Tensor>("LSTMX");
-    auto* lstm_out = ctx.Output<Tensor>("LSTMOUT");
-
-    // some shape should be reshape here since infershape can not get lod info
-    auto x_lod = x->lod();
-    const int N = x_lod[0].size() - 1;  // batch size
-    auto x_dims = x->dims();            // T x M
-    auto w_dims = lstm_w->dims();       // (D+M) x 4D
-    const int total_T = x_dims[0];
-    const int M = x_dims[1];      // x frame size
-    const int D = w_dims[1] / 4;  // gate frame size
-    const int D2 = D * 2;
-    const int D3 = D * 3;
-    const int D4 = w_dims[1];
-    int max_seq_len = x_lod[0][1];
-    for (int i = 1; i < N; ++i) {
-      int len = x_lod[0][i + 1] - x_lod[0][i];
-      max_seq_len = max_seq_len < len ? len : max_seq_len;
-    }
-    PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, "Input(X)'s lod size must be 1.");
-    PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
-    fc_out->Resize({max_seq_len, 1});
-
-    std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
-    auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
-    auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
-    auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
-    if (platform::MayIUse(platform::avx)) {
-      math::VecActivations<T, platform::avx> act_functor;
-      act_gate = act_functor(act_gate_str);
-      act_cell = act_functor(act_cell_str);
-      act_cand = act_functor(act_cand_str);
-    } else {
-      math::VecActivations<T, platform::isa_any> act_functor;
-      act_gate = act_functor(act_gate_str);
-      act_cell = act_functor(act_cell_str);
-      act_cand = act_functor(act_cand_str);
-    }
-
-    const T* x_data = x->data<T>();
-    const T* h0_data = h0 ? h0->data<T>() : NULL;
-    const T* c0_data = c0->data<T>();
-    const T* lstm_w_data = lstm_w->data<T>();
-    const T* lstm_b_data = lstm_b->data<T>();
-    const T* atten_w_data = atten_w->data<T>();
-    const T* atten_b_data = atten_b ? atten_b->data<T>() : NULL;
-    const T* atten_scalar_data = atten_scalar ? atten_scalar->data<T>() : NULL;
-    const T* atten_scalar_bias_data =
-        atten_scalar_bias ? atten_scalar_bias->data<T>() : NULL;
-
-    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
-    T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
-    T* atted_x_data = atted_x->mutable_data<T>(ctx.GetPlace());
-    T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
-    T* lstm_x_data = lstm_x->mutable_data<T>(ctx.GetPlace());
-    T* lstm_out_data = lstm_out->mutable_data<T>(ctx.GetPlace());
-
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-
-    // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    math::FCFunctor<DeviceContext, T> fc;
-    fc(dev_ctx, total_T, 1, M, x_data, atten_w_data, atted_x_data,
-       atten_b_data);
-
-    const T* cur_atten_x_data = atted_x_data;
-    const T* cur_x_data = x_data;
-    const T* prev_cell_data = NULL;
-    const T* prev_hidden_data = NULL;
-    T* cur_cell_out_data = cell_out_data;
-    T* cur_hidden_out_data = hidden_out_data;
-    for (int i = 0; i < N; ++i) {
-      int seq_len = x_lod[0][i + 1] - x_lod[0][i];
-      prev_cell_data = c0_data + i * D;
-      prev_hidden_data = h0_data ? h0_data + i * D : NULL;
-      for (int step = 0; step < seq_len; ++step) {
-        /// 1. compute attention vector
-        // 1a. prev_cell(1xD) * fc(D) rest part of atten_wgt
-        T prev_cell_bias = blas.DOT(D, prev_cell_data, atten_w_data + M);
-        // 1b. add cell bias and relu
-        bias_relu<T>(seq_len, cur_atten_x_data, &prev_cell_bias, fc_out_data);
-        // 1c. fc scalar
-        if (atten_scalar_data) {
-          blas.SCAL(seq_len, *atten_scalar_data, fc_out_data);
-          bias_relu<T>(seq_len, fc_out_data, atten_scalar_bias_data,
-                       fc_out_data);
-        }
-        // 1d. softmax
-        vec_softmax<T>(seq_len, fc_out_data, fc_out_data);
-        // mul x(seq_len*M) and sum pool
-        fc(dev_ctx, 1, M, seq_len, fc_out_data, cur_x_data, lstm_x_data);
-
-        /// 2. compute LSTM step
-        // lstm weight : concat[forget , input , output , tilde]
-        // shape : (D + M) x (4 * D)
-        // fc inputX(1xM) * weightX(M*(4D))  => 1 x 4D
-        blas.MatMul(1, D4, M, lstm_x_data, lstm_w_data + D * D4, lstm_out_data);
-        if (prev_hidden_data) {
-          blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
-                    prev_hidden_data, D, lstm_w_data, D4, static_cast<T>(1),
-                    lstm_out_data, D4);
-        }
-        // since input is 1xM, so can use add bias
-        blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data);
-
-        // gate act: sigmoid
-        act_gate(D3, lstm_out_data, lstm_out_data);
-        // candicate act: tanh
-        act_cand(D, lstm_out_data + D3, lstm_out_data + D3);
-
-        // a = forget * prev_cell
-        blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data);
-
-        // b = input * tilde
-        blas.VMUL(D, lstm_out_data + D, lstm_out_data + D3, lstm_out_data + D);
-
-        // cell_out = a + b
-        blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data);
-
-        // state act tanh(cell_out) * output_gate
-        act_cell(D, cur_cell_out_data, lstm_out_data);
-        blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data);
-
-        prev_hidden_data = cur_hidden_out_data;
-        prev_cell_data = cur_cell_out_data;
-        cur_cell_out_data = cur_cell_out_data + D;
-        cur_hidden_out_data = cur_hidden_out_data + D;
-      }
-      cur_x_data = cur_x_data + seq_len * M;
-      cur_atten_x_data = cur_atten_x_data + seq_len;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp,
-                  ops::AttentionLSTMOpMaker);
-
-REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel<float>,
-                       ops::AttentionLSTMKernel<double>);
diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h
deleted file mode 100644
index 6ede3a7f3c96dd2d13d7c5c19816647e16a3c8d0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/attention_lstm_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class AttentionLSTMOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
deleted file mode 100644
index 0922b03b5f5fbd2a7a62b0a325ebed9600767497..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/average_accumulates_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <>
-void GetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t* num_updates,
-    int64_t* num_accumulates, int64_t* old_num_accumulates) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-
-  *old_num_accumulates = in_old_num_accumulates->data<int64_t>()[0];
-  *num_accumulates = in_num_accumulates->data<int64_t>()[0];
-  *num_updates = in_num_updates->data<int64_t>()[0];
-}
-
-template <>
-void SetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t num_updates,
-    int64_t num_accumulates, int64_t old_num_accumulates) {
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-
-  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
-  out_num_accumulates->data<int64_t>()[0] = num_accumulates;
-  out_num_updates->data<int64_t>()[0] = num_updates;
-}
-
-class AverageAccumulatesOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("param"),
-        "Input (param) of average_accumulates op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("in_sum_1"),
-        "Input (sum_1) of average_accumulates op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("in_sum_2"),
-        "Input (sum_2) of average_accumulates op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("in_sum_3"),
-        "Input (sum_3) of average_accumulates op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("in_num_accumulates"),
-        "Input (in_num_accumulates) of average_accumulates op should "
-        "not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
-                   "Input (old_num_accumulates) of average_accumulates op "
-                   "should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("in_num_updates"),
-        "Input (num_updates) of average_accumulates op should not be null.");
-
-    PADDLE_ENFORCE(
-        ctx->HasOutput("out_sum_1"),
-        "Output (sum_1) of average_accumulates op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("out_sum_2"),
-        "Output (sum_2) of average_accumulates op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("out_sum_3"),
-        "Output (sum_3) of average_accumulates op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
-                   "Output (num_accumulates) of average_accumulates op should "
-                   "not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
-                   "Output (old_num_accumulates) of average_accumulates op "
-                   "should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("out_num_updates"),
-        "Output (num_updates) of average_accumulates op should not be null.");
-
-    auto in_dim = ctx->GetInputDim("param");
-
-    ctx->SetOutputDim("out_sum_1", in_dim);
-    ctx->SetOutputDim("out_sum_2", in_dim);
-    ctx->SetOutputDim("out_sum_3", in_dim);
-    ctx->SetOutputDim("out_num_accumulates", {1});
-    ctx->SetOutputDim("out_old_num_accumulates", {1});
-    ctx->SetOutputDim("out_num_updates", {1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("param")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("param", "(Tensor), The parameter to be accumulated.");
-    AddInput("in_sum_1",
-             "(Tensor), A tensor used to store the parameter "
-             "sums with the same shape as input(param).");
-    AddInput("in_sum_2",
-             "(Tensor), A auxiliary tensor to help "
-             "accumulating sums of parameter values with the same shape as "
-             "input(param). It is used to avoid loss of precision due to too "
-             "many sums.");
-    AddInput("in_sum_3",
-             "(Tensor), A auxiliary tensor to help "
-             "accumulating sums of parameter values with the same shape as "
-             "input(param).");
-    AddInput("in_num_accumulates",
-             "(Tensor<int64_t>), The accumulating times of current window with "
-             "shape [1].");
-    AddInput(
-        "in_old_num_accumulates",
-        "(Tensor<int64_t>), The accumulating times of previous window with "
-        "shape [1].");
-    AddInput("in_num_updates",
-             "(Tensor<int64_t>), The total number of batches used by trainning "
-             "before this batch with shape [1].");
-
-    AddOutput("out_sum_1",
-              "(Tensor), A tensor used to store the "
-              "parameter sums with the same shape as input(param).");
-    AddOutput("out_sum_2",
-              "(Tensor), A auxiliary tensor to help "
-              "accumulating sums of parameter values with the same shape as "
-              "input(param). It is used to avoid loss of precision due to too "
-              "many sums.");
-    AddOutput("out_sum_3",
-              "(Tensor), A auxiliary tensor to help "
-              "accumulating sums of parameter values with the same shape as "
-              "input(param).");
-    AddOutput(
-        "out_num_accumulates",
-        "(Tensor<int64_t>), The accumulating times of current window with "
-        "shape [1].");
-    AddOutput(
-        "out_old_num_accumulates",
-        "(Tensor<int64_t>) The accumulating times of previous window with "
-        "shape [1].");
-    AddOutput(
-        "out_num_updates",
-        "(Tensor<int64_t>), The total number of batches used by trainning "
-        "before this batch with shape [1].");
-
-    AddAttr<float>("average_window",
-                   "(float, default 0) "
-                   "The rate of average window size relative to num_updates.")
-        .SetDefault(0);
-    AddAttr<int64_t>("max_average_window",
-                     "(int64_t) "
-                     "Maximum size of average window. It suggests that the "
-                     "number of mini-batches "
-                     "in one pass is appropriate value to set.");
-    AddAttr<int64_t>("min_average_window",
-                     "(int64_t, default 10000L) "
-                     "Minimu size of average window.")
-        .SetDefault(10000L);
-
-    AddComment(R"DOC(
-AverageAccumulates Operator.
-Accumulate the sum of parameter within sliding window. The size of sliding window is
-determined by 'average_window', 'max_average_window' and 'min_average_window'.
-Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
-'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
-
-All the accumulators were inited to zero before training.
-
-And for a mini-batch in training, accumulators were computed as below steps:
-    num_updates += 1
-    num_accumulates += 1
-    sum_1 += param
-    if num_updates % kMaxNumAccumulates == 0:
-        sum_2 += sum_1
-        sum_1 = 0
-    if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
-        sum_3 = sum_1 + sum_2
-        sum_1 = 0
-        sum_2 = 0
-        old_num_accumulates = num_accumulates
-        num_accumulates = 0
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
-                  ops::AverageAccumulatesOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    average_accumulates,
-    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
deleted file mode 100644
index 104e24f6ee2e2503d98f3a3991a903d8dbc4bdfe..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/average_accumulates_op.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-template <>
-void GetAccumulators<paddle::platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t* num_updates_,
-    int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-  auto stream = ctx.cuda_device_context().stream();
-  auto cuda_place =
-      boost::get<platform::CUDAPlace>(in_old_num_accumulates->place());
-  memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place,
-               in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
-               stream);
-  memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place,
-               in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
-  memory::Copy(platform::CPUPlace(), num_updates_, cuda_place,
-               in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
-}
-
-template <>
-void SetAccumulators<paddle::platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t num_updates_,
-    int64_t num_accumulates_, int64_t old_num_accumulates_) {
-  auto stream = ctx.cuda_device_context().stream();
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-  auto cuda_place =
-      boost::get<platform::CUDAPlace>(out_old_num_accumulates->place());
-
-  memory::Copy(cuda_place, out_old_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place, out_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place, out_num_updates->data<int64_t>(),
-               platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    average_accumulates,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
deleted file mode 100644
index 3958d3f685470f2505abf0e8bfd269d3834970ae..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext>
-void GetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t* num_updates, int64_t* num_accumulates,
-                     int64_t* old_num_accumulates);
-
-template <typename DeviceContext>
-void SetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t num_updates, int64_t num_accumulates,
-                     int64_t old_num_accumulates);
-
-template <typename DeviceContext, typename T>
-class AverageAccumulatesKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // It is used to avoid loss of precision
-    static const int64_t kMaxNumAccumulates = 16384;
-    // Get accumulators from input
-    int64_t num_updates = 0;
-    int64_t num_accumulates = 0;
-    int64_t old_num_accumulates = 0;
-    GetAccumulators<DeviceContext>(ctx, &num_updates, &num_accumulates,
-                                   &old_num_accumulates);
-
-    // Get attrs
-    float average_window = ctx.Attr<float>("average_window");
-    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
-    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-    PADDLE_ENFORCE_LE(min_average_window, max_average_window,
-                      "min_average_window shouldn't be larger than "
-                      "max_average_window");
-
-    // Get inputs
-    auto* param = ctx.Input<Tensor>("param");
-    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
-    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
-    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
-    auto param_tensor = EigenVector<T>::Flatten(*param);
-    auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
-    auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
-    auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
-
-    // Get outputs
-    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
-    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
-    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
-    auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
-    auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
-    auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
-
-    // Compute
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    math::SetConstant<DeviceContext, T> constant_functor;
-    ++num_updates;
-    ++num_accumulates;
-    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
-    out_sum_2_tensor.device(place) = in_sum_2_tensor;
-    out_sum_3_tensor.device(place) = in_sum_3_tensor;
-    if (num_updates % kMaxNumAccumulates == 0) {
-      // Move the sum to a different buffer to avoid loss of precision due to
-      // too many sums.
-      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
-      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
-                       0.0);
-    }
-    if (num_accumulates >= min_average_window &&
-        num_accumulates >= std::min<int64_t>(max_average_window,
-                                             num_updates * average_window)) {
-      //  Now the average window is too long, discard the old sum.
-      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
-      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
-                       0.0);
-      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
-                       0.0);
-      old_num_accumulates = num_accumulates;
-      num_accumulates = 0;
-    }
-
-    // Set accumulators to output
-    SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
-                                   old_num_accumulates);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
deleted file mode 100644
index 9a1d724c73962e37f71102afd65c49bbc14088cb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ /dev/null
@@ -1,631 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/batch_norm_op.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/data_layout.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of ConvOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Scale"),
-                 "Input(Scale) of ConvOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                 "Input(Bias) of ConvOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Mean"),
-                 "Input(Mean) of ConvOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Variance"),
-                 "Input(Variance) of ConvOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Y"),
-                 "Output(Y) of ConvOp should not be null.");
-  bool is_test = ctx->Attrs().Get<bool>("is_test");
-  if (!is_test) {
-    PADDLE_ENFORCE(ctx->HasOutput("MeanOut"),
-                   "Output(MeanOut) of ConvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"),
-                   "Output(VarianceOut) of ConvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("SavedMean"),
-                   "Output(SavedMean) of ConvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"),
-                   "Output(SavedVariance) of ConvOp should not be null.");
-  }
-
-  // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
-  PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
-                    "Mean and MeanOut should share the same memory");
-  PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0],
-                    "Variance and VarianceOut should share the same memory");
-
-  const auto x_dims = ctx->GetInputDim("X");
-  const DataLayout data_layout = framework::StringToDataLayout(
-      ctx->Attrs().Get<std::string>("data_layout"));
-
-  PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                 "Input X must have 2 to 5 dimensions.");
-
-  const int64_t C =
-      (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                        : x_dims[x_dims.size() - 1]);
-
-  auto scale_dim = ctx->GetInputDim("Scale");
-  auto bias_dim = ctx->GetInputDim("Bias");
-
-  PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL);
-  PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL);
-
-  bool check = true;
-  if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
-                              framework::product(bias_dim) <= 0)) {
-    check = false;
-  }
-
-  if (check) {
-    PADDLE_ENFORCE_EQ(scale_dim[0], C);
-    PADDLE_ENFORCE_EQ(scale_dim[0], C);
-  }
-  ctx->SetOutputDim("Y", x_dims);
-  ctx->SetOutputDim("MeanOut", {C});
-  ctx->SetOutputDim("VarianceOut", {C});
-  ctx->SetOutputDim("SavedMean", {C});
-  ctx->SetOutputDim("SavedVariance", {C});
-  ctx->ShareLoD("X", "Y");
-}
-
-framework::OpKernelType BatchNormOp::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
-  auto input_data_type = ctx.Input<Tensor>("X")->type();
-  // By default, the type of the scale, bias, mean,
-  // and var tensors should both be float. (For float or float16 input tensor)
-  // or double (For double input tensor).
-  auto bn_param_type = framework::proto::VarType::FP32;
-  if (input_data_type == framework::proto::VarType::FP64) {
-    bn_param_type = framework::proto::VarType::FP64;
-  }
-  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
-                    "Scale input should be of float type");
-  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
-                    "Bias input should be of float type");
-  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
-                    "Mean input should be of float type");
-  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
-                    "Variance input should be of float type");
-
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::LibraryType library = framework::LibraryType::kPlain;
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-#ifdef PADDLE_WITH_MKLDNN
-  if (library == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kMKLDNN;
-    layout = framework::DataLayout::kMKLDNN;
-  }
-#endif
-
-  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                 library);
-}
-
-void BatchNormOpMaker::Make() {
-  AddAttr<bool>("is_test",
-                "(bool, default false) Set to true for inference only, false "
-                "for training. Some layers may run faster when this is true.")
-      .SetDefault(false);
-  AddAttr<float>("momentum", "").SetDefault(0.9);
-  AddAttr<float>("epsilon", "")
-      .SetDefault(1e-5)
-      .AddCustomChecker([](const float &epsilon) {
-        PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
-                       "'epsilon' should be between 0.0 and 0.001.");
-      });
-  AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
-  AddInput("X", "The input tensor");
-  AddInput("Scale",
-           "Scale is a 1-dimensional tensor of size C "
-           "that is applied to the output");
-  AddInput("Bias",
-           "Bias is a 1-dimensional tensor of size C "
-           "that is applied to the output");
-  AddInput("Mean",
-           "The global mean (for training) or "
-           "estimated mean (for testing)");
-  AddInput("Variance",
-           "The global variance (for training) "
-           "or estimated Variance (for testing)");
-  AddOutput("Y", "result after normalization");
-  AddOutput("MeanOut",
-            "Share memory with Mean. "
-            "Store the global mean when training");
-  AddOutput("VarianceOut",
-            "Share memory with Variance. "
-            "Store the global Variance when training");
-  AddOutput("SavedMean",
-            "Mean of the current mini batch, "
-            "will apply to output when training")
-      .AsIntermediate();
-  AddOutput("SavedVariance",
-            "Variance of the current mini batch, "
-            "will apply to output when training")
-      .AsIntermediate();
-  AddAttr<bool>("use_mkldnn",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<bool>("fuse_with_relu",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<bool>("use_global_stats",
-                "(bool, default false) Whether to use global mean and "
-                "variance. In inference or test mode, set use_global_stats "
-                "to true or is_test true. the behavior is equivalent. "
-                "In train mode, when setting use_global_stats True, the "
-                "global mean and variance are also used during train time, "
-                "the BN acts as scaling and shiffting.")
-      .SetDefault(false);
-  AddComment(R"DOC(
-Batch Normalization.
-
-Batch Norm has been implemented as discussed in the paper:
-https://arxiv.org/pdf/1502.03167.pdf
-Can be used as a normalizer function for conv2d and fully_connected operations.
-The required data format for this layer is one of the following:
-1. NHWC `[batch, in_height, in_width, in_channels]`
-2. NCHW `[batch, in_channels, in_height, in_width]`
-
-)DOC");
-}
-
-template <typename T>
-class BatchNormKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    const float momentum = ctx.Attr<float>("momentum");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-
-    bool global_stats = is_test || use_global_stats;
-
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 2 and 5");
-    const int N = x_dims[0];
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    const int sample_size = x->numel() / N / C;
-
-    auto *y = ctx.Output<Tensor>("Y");
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
-
-    // alloc memory
-    y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<T>(ctx.GetPlace());
-    variance_out->mutable_data<T>(ctx.GetPlace());
-    saved_mean->mutable_data<T>(ctx.GetPlace());
-    saved_variance->mutable_data<T>(ctx.GetPlace());
-
-    if (!global_stats) {
-      // saved_xx is use just in this batch of data
-      EigenVectorArrayMap<T> saved_mean_e(
-          saved_mean->mutable_data<T>(ctx.GetPlace()), C);
-      EigenVectorArrayMap<T> saved_variance_e(
-          saved_variance->mutable_data<T>(ctx.GetPlace()), C);
-      saved_mean_e.setZero();
-      saved_variance_e.setZero();
-
-      EigenVectorArrayMap<T> running_mean_arr(
-          mean_out->mutable_data<T>(ctx.GetPlace()), C);
-      EigenVectorArrayMap<T> running_var_arr(
-          variance_out->mutable_data<T>(ctx.GetPlace()), C);
-
-      if ((N * sample_size) == 1) {
-        // Only 1 element in normalization dimension,
-        // we skip the batch norm calculation, let y = x.
-        framework::TensorCopy(*x, ctx.GetPlace(), y);
-        return;
-      }
-
-      switch (data_layout) {
-        case DataLayout::kNCHW: {
-          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
-          for (int nc = 0; nc < N * C; ++nc) {
-            saved_mean_e(nc % C) += x_arr.col(nc).sum();
-          }
-          saved_mean_e /= N * sample_size;
-          for (int nc = 0; nc < N * C; ++nc) {
-            saved_variance_e(nc % C) +=
-                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
-          }
-          saved_variance_e /= N * sample_size;
-          break;
-        }
-        case DataLayout::kNHWC: {
-          ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
-          for (int i = 0; i < N * sample_size; ++i) {
-            saved_mean_e += x_arr.col(i);
-          }
-          saved_mean_e /= N * sample_size;
-          for (int i = 0; i < N * sample_size; ++i) {
-            saved_variance_e +=
-                (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e);
-          }
-          saved_variance_e /= N * sample_size;
-          break;
-        }
-        default:
-          PADDLE_THROW("Unknown storage order: %s", data_layout_str);
-      }
-
-      running_mean_arr =
-          running_mean_arr * momentum + saved_mean_e * (1. - momentum);
-      running_var_arr =
-          running_var_arr * momentum + saved_variance_e * (1. - momentum);
-    }
-
-    // use SavedMean and SavedVariance to do normalize
-    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
-    if (global_stats) {
-      ConstEigenVectorArrayMap<T> var_arr(
-          ctx.Input<Tensor>("Variance")->data<T>(), C);
-      inv_std = (var_arr + epsilon).sqrt().inverse();
-    } else {
-      EigenVectorArrayMap<T> saved_inv_std(
-          ctx.Output<Tensor>("SavedVariance")->data<T>(), C);
-      // inverse SavedVariance first, gradient will use it too.
-      saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt();
-      inv_std = saved_inv_std;
-    }
-    ConstEigenVectorArrayMap<T> mean_arr(
-        global_stats ? ctx.Input<Tensor>("Mean")->data<T>()
-                     : ctx.Output<Tensor>("SavedMean")->data<T>(),
-        C);
-
-    //   ((x - est_mean) * (inv_var) * scale + bias
-    //   formula transform ====>
-    //   (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
-    ConstEigenVectorArrayMap<T> bias_arr(bias->data<T>(), C);
-    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
-    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
-        bias_arr - mean_arr * inv_std * scale_arr;
-
-    switch (data_layout) {
-      case DataLayout::kNCHW: {
-        EigenArrayMap<T> y_arr(y->mutable_data<T>(ctx.GetPlace()), sample_size,
-                               N * C);
-        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
-        for (int nc = 0; nc < N * C; ++nc) {
-          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
-        }
-        break;
-      }
-      case DataLayout::kNHWC: {
-        EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C,
-                         N * sample_size) =
-            (ConstEigenArrayMap<T>(x->data<T>(), C, N * sample_size).colwise() *
-             new_scale)
-                .colwise() +
-            new_bias;
-        break;
-      }
-      default:
-        PADDLE_THROW("Unknown storage order: %d", data_layout);
-    }
-  }
-};
-
-// Validates the inputs/outputs of batch_norm_grad and sets the gradient
-// shapes: X@GRAD gets the dims of X, and, when Scale@GRAD is requested,
-// Scale@GRAD and Bias@GRAD both get the 1-D shape {C}. C is read from
-// x_dims[1] for the NCHW layout and from the last dimension otherwise.
-void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
-  // check input
-  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(Scale) should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                 "Input(Y@GRAD) should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
-                 "Input(SavedMean) should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
-                 "Input(SavedVariance) should not be null");
-
-  // check output
-  PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                 "Output(X@GRAD) should not be null.");
-  // Scale@GRAD is optional, but when it is requested Bias@GRAD must be
-  // requested too (the kernels only fill them as a pair).
-  if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
-                   "Output(Bias@GRAD) should not be null when "
-                   "Output(Scale@GRAD) is provided.");
-  }
-  const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
-  if (use_global_stats) {
-    // The MKL-DNN gradient kernel cannot run with global statistics.
-    PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
-                   "Using global stats during training is not supported "
-                   "in gradient op kernel of batch_norm_mkldnn_op now.");
-  }
-
-  const auto x_dims = ctx->GetInputDim("X");
-  const DataLayout data_layout = framework::StringToDataLayout(
-      ctx->Attrs().Get<std::string>("data_layout"));
-  // Channel count: dimension 1 for NCHW, last dimension for any other layout.
-  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                                  : x_dims[x_dims.size() - 1]);
-
-  ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-  if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-    ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
-    ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
-  }
-}
-
-// Selects the kernel used by batch_norm_grad. Y@GRAD must exist and hold a
-// Tensor or a LoDTensor; the kernel data type is taken from Input(X). The
-// plain library with any layout is the default, switched to MKL-DNN when the
-// build enables it and platform::CanMKLDNNBeUsed(ctx) agrees.
-framework::OpKernelType BatchNormGradOp::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
-  const auto *dy_var = ctx.InputVar(framework::GradVarName("Y"));
-  if (dy_var == nullptr) {
-    PADDLE_THROW("can't find Y@GRAD");
-  }
-
-  // Resolve the variable to a tensor; stays null for any other held type.
-  const Tensor *dy_tensor = nullptr;
-  if (dy_var->IsType<Tensor>()) {
-    dy_tensor = &dy_var->Get<Tensor>();
-  } else if (dy_var->IsType<LoDTensor>()) {
-    dy_tensor = &dy_var->Get<LoDTensor>();
-  }
-  if (dy_tensor == nullptr) {
-    PADDLE_THROW("can't find Y@GRAD");
-  }
-
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  auto kernel_library = framework::LibraryType::kPlain;
-  auto kernel_layout = framework::DataLayout::kAnyLayout;
-
-#ifdef PADDLE_WITH_MKLDNN
-  if (kernel_library == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    kernel_library = framework::LibraryType::kMKLDNN;
-    kernel_layout = framework::DataLayout::kMKLDNN;
-  }
-#endif
-
-  const auto data_type = ctx.Input<Tensor>("X")->type();
-  return framework::OpKernelType(data_type, ctx.GetPlace(), kernel_layout,
-                                 kernel_library);
-}
-
-// CPU gradient kernel of batch_norm.
-//
-// Reads X, Y@GRAD, Scale, SavedMean and SavedVariance (plus Mean/Variance
-// when "use_global_stats" is set) and writes X@GRAD and, when both outputs
-// are present, Scale@GRAD and Bias@GRAD. Both the NCHW and NHWC layouts are
-// handled; any other layout raises via PADDLE_THROW.
-template <typename T>
-class BatchNormGradKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-    // SavedVariance has already been inverted by the forward operator: it
-    // holds 1 / sqrt(variance + epsilon), not the variance itself.
-    const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const float epsilon = ctx.Attr<float>("epsilon");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-
-    // Get the size for each dimension.
-    // NCHW [batch_size, in_channels, in_height, in_width]
-    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 2 and 5");
-    const int N = x_dims[0];
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    // Number of elements per (sample, channel) pair.
-    const int sample_size = x->numel() / N / C;
-
-    // init output
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    d_x->mutable_data<T>(ctx.GetPlace());
-
-    // By default use the batch statistics saved by the forward pass.
-    const T *mean_data = saved_mean->data<T>();
-    const T *inv_var_data = saved_inv_variance->data<T>();
-    Tensor inv_var_tensor;
-    if (use_global_stats) {
-      // With global statistics, switch to the running mean and derive the
-      // inverse standard deviation from the running variance into a
-      // temporary tensor.
-      const auto *running_mean = ctx.Input<Tensor>("Mean");
-      const auto *running_variance = ctx.Input<Tensor>("Variance");
-      mean_data = running_mean->data<T>();
-      inv_var_tensor.Resize({C});
-      T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
-      EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
-      ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
-
-      inv_var_tmp = (var_arr + epsilon).sqrt().inverse().eval();
-      inv_var_data = running_inv_var_data;
-    }
-
-    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
-    ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
-    ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
-
-    // The parameter gradients are optional; the pointers stay null unless
-    // both outputs are present.
-    T *d_bias_data = nullptr;
-    T *d_scale_data = nullptr;
-    if (d_scale && d_bias) {
-      // NOTE(review): the first two calls discard their result and repeat
-      // the two calls below.
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      d_bias_data = d_bias->mutable_data<T>(ctx.GetPlace());
-      d_scale_data = d_scale->mutable_data<T>(ctx.GetPlace());
-    }
-
-    // d_bias = np.sum(d_y, axis=0)
-    // d_scale = np.sum((X - mean) * inv_std * dy, axis=0)
-    // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
-    //   - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
-    // These maps wrap null pointers when the parameter gradients are not
-    // requested; they are only touched under `d_scale && d_bias`.
-    EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
-    EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
-
-    if (d_scale && d_bias) {
-      d_bias_arr.setZero();
-      d_scale_arr.setZero();
-    }
-
-    // Only one element per channel and batch statistics in use: pass Y@GRAD
-    // straight through as X@GRAD (the parameter gradients, if any, were
-    // zeroed above).
-    if ((N * sample_size) == 1 && !use_global_stats) {
-      framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
-      return;
-    }
-
-    // scale * inv_std, additionally divided by the per-channel element count
-    // when batch statistics are used.
-    int scale_coefff = use_global_stats ? 1 : N * sample_size;
-    const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff;
-
-    // Per-channel accumulator for sum(dy).
-    Tensor dy_sum;
-    dy_sum.Resize({C});
-    dy_sum.mutable_data<T>(ctx.GetPlace());
-    EigenVectorArrayMap<T> dy_sum_arr(dy_sum.mutable_data<T>(ctx.GetPlace()),
-                                      C);
-
-    // Per-channel accumulator for sum(dy * (x - mean) * inv_std).
-    Tensor dy_mul_x_sub_mean_mul_invstd_sum;
-    dy_mul_x_sub_mean_mul_invstd_sum.Resize({C});
-    dy_mul_x_sub_mean_mul_invstd_sum.mutable_data<T>(ctx.GetPlace());
-    EigenVectorArrayMap<T> dy_mul_x_sub_mean_mul_invstd_sum_arr(
-        dy_mul_x_sub_mean_mul_invstd_sum.mutable_data<T>(ctx.GetPlace()), C);
-
-    dy_sum_arr.setZero();
-    dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero();
-
-    switch (data_layout) {
-      case DataLayout::kNCHW: {
-        // View the data as a (sample_size x N*C) array: column nc holds the
-        // elements of channel nc % C of one sample.
-        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
-        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
-        EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()),
-                                 sample_size, N * C);
-        d_x_arr.setZero();
-
-        // First pass: accumulate the two per-channel sums.
-        for (int nc = 0; nc < N * C; ++nc) {
-          int c = nc % C;
-          dy_sum_arr(c) += d_y_arr.col(nc).sum();
-          dy_mul_x_sub_mean_mul_invstd_sum_arr(c) +=
-              ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
-                  .sum();
-        }
-
-        if (d_scale && d_bias) {
-          d_bias_arr = dy_sum_arr;
-          d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
-        }
-
-        // Second pass: compute X@GRAD column by column.
-        if (!use_global_stats) {
-          for (int nc = 0; nc < N * C; ++nc) {
-            int c = nc % C;
-            d_x_arr.col(nc) +=
-                scale_inv_var_nhw(c) *
-                (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) -
-                 (x_arr.col(nc) - mean_arr[c]) *
-                     dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c));
-          }
-        } else {
-          // Statistics are constants here, so d_x = dy * scale * inv_std.
-          for (int nc = 0; nc < N * C; ++nc) {
-            int c = nc % C;
-            d_x_arr.col(nc) += scale_inv_var_nhw(c) * d_y_arr.col(nc);
-          }
-        }
-        break;
-      }
-      case DataLayout::kNHWC: {
-        // View the data as a (C x N*sample_size) array: each column holds
-        // all channels of one position, so whole columns combine with the
-        // length-C per-channel vectors.
-        ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
-        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
-        EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C,
-                                 N * sample_size);
-        d_x_arr.setZero();
-
-        // First pass: accumulate the two per-channel sums.
-        for (int nhw = 0; nhw < N * sample_size; ++nhw) {
-          dy_sum_arr += d_y_arr.col(nhw);
-          dy_mul_x_sub_mean_mul_invstd_sum_arr +=
-              (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
-        }
-
-        if (d_scale && d_bias) {
-          d_bias_arr = dy_sum_arr;
-          d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
-        }
-
-        // Second pass: compute X@GRAD column by column.
-        if (!use_global_stats) {
-          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
-            d_x_arr.col(nhw) +=
-                scale_inv_var_nhw *
-                (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr -
-                 (x_arr.col(nhw) - mean_arr) *
-                     dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr);
-          }
-        } else {
-          // Statistics are constants here, so d_x = dy * scale * inv_std.
-          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
-            d_x_arr.col(nhw) += scale_inv_var_nhw * d_y_arr.col(nhw);
-          }
-        }
-        break;
-      }
-      default:
-        PADDLE_THROW("Unknown storage order: %s", data_layout_str);
-    }
-  }
-};
-
-// Builds the OpDesc of the gradient operator for one batch_norm op.
-//
-// Inputs wired in: X, Y@GRAD, Scale, Bias and the forward outputs SavedMean
-// and SavedVariance; MeanOut/VarianceOut are additionally wired in as
-// Mean/Variance when "use_global_stats" is true. All forward attributes are
-// copied, and the outputs are X@GRAD, Scale@GRAD and Bias@GRAD.
-//
-// The descriptor is owned by a unique_ptr from the moment it is created so
-// that it is released if any of the calls below throws (for example
-// boost::get on the attribute).
-std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const {
-  std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-  op->SetType(GradOpType());
-  op->SetInput("X", Input("X"));
-  op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-
-  op->SetInput("Scale", Input("Scale"));
-  op->SetInput("Bias", Input("Bias"));
-  op->SetInput("SavedMean", Output("SavedMean"));
-  op->SetInput("SavedVariance", Output("SavedVariance"));
-
-  // used when setting use_global_stats True during training
-  if (boost::get<bool>(GetAttr("use_global_stats"))) {
-    op->SetInput("Mean", Output("MeanOut"));
-    op->SetInput("Variance", Output("VarianceOut"));
-  }
-
-  op->SetAttrMap(Attrs());
-
-  op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-  op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
-  op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-
-  return op;
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-// Register the forward operator together with its proto maker, variable-type
-// inference and gradient-op maker, and the gradient operator on its own.
-REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
-REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
-
-// CPU kernels are registered for float and double only.
-REGISTER_OP_CPU_KERNEL(
-    batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
deleted file mode 100644
index 95d7f23b2c0ac6e46cf85bef4340eb4180dc3dba..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ /dev/null
@@ -1,489 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <cfloat>
-#include <string>
-#include <vector>
-#include "cub/cub.cuh"
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-#include "paddle/fluid/platform/float16.h"
-
-DECLARE_bool(cudnn_batchnorm_spatial_persistent);
-
-namespace paddle {
-namespace operators {
-
-// Short aliases used throughout this file.
-using Tensor = framework::Tensor;
-using DataLayout = framework::DataLayout;
-// cuDNN type traits for the element type T.
-template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
-// Type used for scale, bias, mean and variance; it is whatever
-// CudnnDataType<T> declares and may differ from T.
-template <typename T>
-using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
-
-// CUDA forward kernel of batch_norm, implemented on top of cuDNN.
-//
-// With "is_test" or "use_global_stats" set it normalizes X with the Mean and
-// Variance inputs through cudnnBatchNormalizationForwardInference. Otherwise
-// it runs cudnnBatchNormalizationForwardTraining, which also writes MeanOut,
-// VarianceOut, SavedMean and SavedVariance.
-template <typename T>
-class BatchNormKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const float momentum = ctx.Attr<float>("momentum");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-
-    // Get the size for each dimension.
-    // NCHW [batch_size, in_channels, in_height, in_width]
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 2 and 5");
-    // Filled in by ExtractNCWHD according to the layout.
-    // NOTE(review): the helper is defined elsewhere; presumably dimensions
-    // missing from x_dims come back as 1 - confirm.
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
-
-    auto *y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
-
-    // ------------------- cudnn descriptors ---------------------
-    cudnnTensorDescriptor_t data_desc_;
-    cudnnTensorDescriptor_t bn_param_desc_;
-    cudnnBatchNormMode_t mode_;
-
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
-
-    // Epsilon is clamped to CUDNN_BN_MIN_EPSILON; an error is logged when
-    // the requested value is below it by more than FLT_EPSILON.
-    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
-      LOG(ERROR) << "Provided epsilon is smaller than "
-                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
-                 << "CUDNN_BN_MIN_EPSILON instead.";
-    }
-    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-    // The persistent spatial mode needs cuDNN >= 7 and is opt-in via
-    // FLAGS_cudnn_batchnorm_spatial_persistent.
-#if CUDNN_VERSION_MIN(7, 0, 0)
-    if (FLAGS_cudnn_batchnorm_spatial_persistent) {
-      mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
-    } else {
-      mode_ = CUDNN_BATCHNORM_SPATIAL;
-    }
-#else
-    mode_ = CUDNN_BATCHNORM_SPATIAL;
-#endif
-
-    VLOG(3) << "Setting descriptors.";
-    // dims is always listed in N, C, H, W, D order; only the strides encode
-    // the actual memory layout (the channel stride is 1 for non-NCHW).
-    std::vector<int> dims;
-    std::vector<int> strides;
-    if (data_layout == DataLayout::kNCHW) {
-      dims = {N, C, H, W, D};
-      strides = {C * H * W * D, H * W * D, W * D, D, 1};
-    } else {
-      dims = {N, C, H, W, D};
-      strides = {H * W * D * C, 1, W * D * C, D * C, C};
-    }
-    // The descriptor is created with at least 4 dimensions.
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        data_desc_, CudnnDataType<T>::type,
-        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    // Note: PERSISTENT not implemented for inference
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
-
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-
-    auto handle = dev_ctx.cudnn_handle();
-
-    // Now, depending on whether we are running test or not, we have two paths.
-    if (is_test || use_global_stats) {
-      // only when test we use input to do computation.
-      const auto *est_mean = ctx.Input<Tensor>("Mean");
-      const auto *est_var = ctx.Input<Tensor>("Variance");
-      // Run inference mode.
-      // Mean and Variance must be 1-D tensors of length C.
-      PADDLE_ENFORCE_EQ(est_mean->dims().size(), 1UL);
-      PADDLE_ENFORCE_EQ(est_var->dims().size(), 1UL);
-      PADDLE_ENFORCE_EQ(est_mean->dims()[0], C);
-      PADDLE_ENFORCE_EQ(est_var->dims()[0], C);
-
-      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference(
-          handle,
-          // Note: PERSISTENT not implemented for inference
-          CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
-          CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
-          data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
-          bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-          bias->template data<BatchNormParamType<T>>(),
-          est_mean->template data<BatchNormParamType<T>>(),
-          est_var->template data<BatchNormParamType<T>>(), epsilon));
-    } else {
-      // Run training mode.
-      // obtain running mean and running inv var, and see if we need to
-      // initialize them.
-
-      auto *mean_out = ctx.Output<Tensor>("MeanOut");
-      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-      mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-
-      // The saved batch statistics are zero-filled before cuDNN (possibly)
-      // overwrites them.
-      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-      auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
-      saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-          functor;
-      functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
-      functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
-
-      if ((N * H * W * D) == 1) {
-        // Only 1 element in normalization dimension,
-        // skip the batch norm calculation, let y = x.
-        framework::TensorCopy(*x, ctx.GetPlace(), y);
-      } else {
-        // Passed to cuDNN as the running-average update factor,
-        // i.e. 1 - momentum.
-        double this_factor = 1. - momentum;
-
-        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
-            handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
-            data_desc_, x->template data<T>(), data_desc_,
-            y->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-            scale->template data<BatchNormParamType<T>>(),
-            bias->template data<BatchNormParamType<T>>(), this_factor,
-            mean_out->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            variance_out->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
-                         ctx.GetPlace()),
-            saved_variance->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace())));
-      }
-    }
-
-    // clean when exit.
-    // NOTE(review): not reached if one of the CUDNN_ENFORCE calls above
-    // throws, in which case the descriptors are not destroyed.
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
-  }
-};
-
-// Computes the per-channel parameter gradients from a mean and a (raw)
-// variance:
-//   dscale[c] = sum(dy * (x - mean[c])) / sqrt(variance[c] + epsilon)
-//   dbias[c]  = sum(dy)
-// where the sums run over the N * HxW elements of channel c.
-//
-// Each block walks channels blockIdx.x, blockIdx.x + gridDim.x, ...; the
-// threads of the block stride over the elements of that channel and their
-// partial sums are combined with cub::BlockReduce. BlockDim is the
-// BlockReduce template size, so it is expected to equal the launch block
-// size. `layout` selects how (channel, element) maps to a flat index.
-template <typename T, int BlockDim, framework::DataLayout layout>
-static __global__ void KeBNBackwardScaleBias(
-    const T *dy, const T *x, const BatchNormParamType<T> *mean,
-    const BatchNormParamType<T> *variance, const double epsilon, const int N,
-    const int C, const int HxW, BatchNormParamType<T> *dscale,
-    BatchNormParamType<T> *dbias) {
-  const int outer_size = C;
-  const int inner_size = N * HxW;
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  // One scratch area per reduction so the two reductions do not share
-  // storage.
-  __shared__ typename BlockReduce::TempStorage ds_storage;
-  __shared__ typename BlockReduce::TempStorage db_storage;
-
-  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
-    BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
-    BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
-
-    BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
-    BatchNormParamType<T> mean_i = mean[i];
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      // j enumerates the (n, spatial) positions of channel i.
-      // NCHW: n = j / HxW, s = j % HxW -> (n * C + i) * HxW + s.
-      // Otherwise channels are innermost -> j * C + i.
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
-                (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
-      db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
-    }
-    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
-    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
-    // Only thread 0 writes the reduced totals.
-    if (threadIdx.x == 0) {
-      dscale[i] = ds_sum * inv_var_i;
-      dbias[i] = db_sum;
-    }
-    // Presumably keeps the block in step before the scratch storage is
-    // reused for the next channel.
-    __syncthreads();
-  }
-}
-
-// Element-wise input gradient computed from a fixed (raw) variance:
-//   dx[i] = dy[i] * scale[c] / sqrt(variance[c] + epsilon)
-// where c is the channel of flat element i: (i / HxW) % C for the NCHW
-// layout and i % C otherwise. Every thread starts at its global index and
-// advances by the total number of launched threads until `num` is reached.
-template <typename T, framework::DataLayout layout>
-static __global__ void KeBNBackwardData(const T *dy,
-                                        const BatchNormParamType<T> *scale,
-                                        const BatchNormParamType<T> *variance,
-                                        const double epsilon, const int C,
-                                        const int HxW, const int num, T *dx) {
-  const int first = blockIdx.x * blockDim.x + threadIdx.x;
-  const int step = blockDim.x * gridDim.x;
-  for (int idx = first; idx < num; idx += step) {
-    // Channel that flat element idx belongs to.
-    int channel;
-    if (layout == framework::DataLayout::kNCHW) {
-      channel = idx / HxW % C;
-    } else {
-      channel = idx % C;
-    }
-    const BatchNormParamType<T> inv_std =
-        1.0 / sqrt(variance[channel] + epsilon);
-    const BatchNormParamType<T> grad_out =
-        static_cast<BatchNormParamType<T>>(dy[idx]);
-    dx[idx] = static_cast<T>(grad_out * scale[channel] * inv_std);
-  }
-}
-
-// Computes the input gradient for batch-statistics mode without producing
-// the parameter gradients. With m = N * HxW elements per channel:
-//   dx = (dy - sum(dy) / m
-//         - (x - mean) * sum(dy * (x - mean)) * inv_std^2 / m)
-//        * scale * inv_std
-//
-// NOTE: `variance` is used directly as the inverse standard deviation; no
-// epsilon or square root is applied in this kernel, so callers must pass an
-// already inverted value.
-//
-// Each block walks channels blockIdx.x, blockIdx.x + gridDim.x, ...; the
-// per-channel sums are built with cub::BlockReduce and shared with the whole
-// block through shared memory. BlockDim is the BlockReduce template size,
-// so it is expected to equal the launch block size.
-template <typename T, int BlockDim, framework::DataLayout layout>
-static __global__ void BNBackwardData(const T *dy,
-                                      const BatchNormParamType<T> *scale,
-                                      const BatchNormParamType<T> *mean,
-                                      const T *x,
-                                      const BatchNormParamType<T> *variance,
-                                      const int C, const int N, const int HxW,
-                                      T *dx) {
-  const int outer_size = C;
-  const int inner_size = N * HxW;
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage dy_storage;
-  __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
-  // Reduced totals, written by thread 0 and read by every thread below.
-  __shared__ BatchNormParamType<T> dy_sum_val;
-  __shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
-
-  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
-    BatchNormParamType<T> inv_var_i = variance[i];
-    BatchNormParamType<T> mean_i = mean[i];
-    BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
-    BatchNormParamType<T> dy_x_sub_mean_sum =
-        static_cast<BatchNormParamType<T>>(0);
-    // First pass: per-thread partial sums over the elements of channel i.
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      // NCHW: (n * C + i) * HxW + s with n = j / HxW, s = j % HxW;
-      // otherwise channels are innermost: j * C + i.
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      BatchNormParamType<T> dy_i =
-          static_cast<BatchNormParamType<T>>(dy[index]);
-      dy_sum += dy_i;
-      dy_x_sub_mean_sum +=
-          dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
-    }
-
-    dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
-    dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
-                            .Reduce(dy_x_sub_mean_sum, cub::Sum());
-
-    // Publish the totals from thread 0 to the whole block.
-    if (threadIdx.x == 0) {
-      dy_sum_val = dy_sum;
-      dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
-    }
-    __syncthreads();
-
-    // Second pass: every thread writes dx for its elements of channel i.
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      dx[index] =
-          (static_cast<BatchNormParamType<T>>(dy[index]) -
-           dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
-           (static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
-               dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
-          scale[i] * inv_var_i;
-    }
-  }
-}
-
-// CUDA gradient kernel of batch_norm.
-//
-// Batch-statistics mode (use_global_stats == false):
-//   * one element per channel: X@GRAD is a copy of Y@GRAD and the parameter
-//     gradients, when requested, are zero;
-//   * Scale@GRAD and Bias@GRAD both requested: everything is computed by
-//     cudnnBatchNormalizationBackward from SavedMean / SavedVariance;
-//   * otherwise only X@GRAD is computed, by the BNBackwardData kernel.
-// Global-statistics mode (use_global_stats == true): X@GRAD and the parameter
-// gradients are computed by KeBNBackwardData / KeBNBackwardScaleBias from the
-// Mean and Variance inputs.
-template <typename T>
-class BatchNormGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto *scale = ctx.Input<Tensor>("Scale");
-
-    const auto &x_dims = x->dims();
-
-    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 2 and 5");
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
-
-    // init output
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    d_x->mutable_data<T>(ctx.GetPlace());
-    // The parameter gradients are optional and only handled as a pair.
-    if (d_scale && d_bias) {
-      d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    }
-    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
-    PADDLE_ENFORCE_EQ(scale->dims()[0], C);
-
-    // dims is always listed in N, C, H, W, D order; only the strides encode
-    // the actual memory layout.
-    std::vector<int> dims;
-    std::vector<int> strides;
-    if (data_layout == DataLayout::kNCHW) {
-      dims = {N, C, H, W, D};
-      strides = {C * H * W * D, H * W * D, W * D, D, 1};
-    } else {
-      dims = {N, C, H, W, D};
-      strides = {H * W * C * D, 1, W * D * C, D * C, C};
-    }
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    const int num = x->numel();
-    const int block = 512;
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    // grid1: one thread per element (element-wise kernel).
-    // grid2: one block per channel, capped by the device (reduction kernels).
-    int grid1 = (num + block - 1) / block;
-    int grid2 = std::min(C, max_blocks);
-
-    if (!use_global_stats) {
-      if ((N * H * W * D) == 1) {
-        // Only one element per channel: the forward pass was y = x.
-        framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
-        // The parameter gradients are optional; zero them only when they
-        // were requested (and therefore allocated above) instead of
-        // dereferencing null outputs.
-        if (d_scale && d_bias) {
-          math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-              functor;
-          functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
-          functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
-        }
-        return;
-      }
-
-      // ------------------- cudnn descriptors ---------------------
-      cudnnTensorDescriptor_t data_desc_;
-      cudnnTensorDescriptor_t bn_param_desc_;
-      cudnnBatchNormMode_t mode_;
-
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
-      if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
-        LOG(ERROR) << "Provided epsilon is smaller than "
-                   << "CUDNN_BN_MIN_EPSILON. Setting it to "
-                   << "CUDNN_BN_MIN_EPSILON instead.";
-      }
-      epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-#if CUDNN_VERSION_MIN(7, 0, 0)
-      if (FLAGS_cudnn_batchnorm_spatial_persistent) {
-        mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
-      } else {
-        mode_ = CUDNN_BATCHNORM_SPATIAL;
-      }
-#else
-      mode_ = CUDNN_BATCHNORM_SPATIAL;
-#endif
-
-      // The descriptor is created with at least 4 dimensions.
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          data_desc_, CudnnDataType<T>::type,
-          x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-          bn_param_desc_, data_desc_, mode_));
-
-      const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-      const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
-      const auto *saved_mean_data =
-          saved_mean->template data<BatchNormParamType<T>>();
-      const auto *saved_var_data =
-          saved_var->template data<BatchNormParamType<T>>();
-
-      if (d_scale && d_bias) {
-        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
-            dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-            CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-            CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
-            data_desc_, d_y->template data<T>(), data_desc_,
-            d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-            scale->template data<BatchNormParamType<T>>(),
-            d_scale->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            d_bias->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            epsilon, saved_mean_data, saved_var_data));
-      } else {
-        // No parameter gradients requested: compute X@GRAD only. The kernel
-        // must be instantiated with the layout of the data because the
-        // layout decides how a (channel, element) pair maps to a flat index.
-        if (data_layout == framework::DataLayout::kNCHW) {
-          if (d_x) {
-            BNBackwardData<T, block, framework::DataLayout::kNCHW><<<
-                grid2, block, 0, dev_ctx.stream()>>>(
-                d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
-                saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
-                d_x->data<T>());
-          }
-        } else {
-          if (d_x) {
-            BNBackwardData<T, block, framework::DataLayout::kNHWC><<<
-                grid2, block, 0, dev_ctx.stream()>>>(
-                d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
-                saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
-                d_x->data<T>());
-          }
-        }
-      }
-
-      // clean when exit.
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
-    } else {
-      // Global statistics are constants with respect to X, so the gradients
-      // are computed directly from the Mean and Variance inputs.
-      const auto *running_mean = ctx.Input<Tensor>("Mean");
-      const auto *running_var = ctx.Input<Tensor>("Variance");
-
-      const auto *running_mean_data =
-          running_mean->template data<BatchNormParamType<T>>();
-      const auto *running_var_data =
-          running_var->template data<BatchNormParamType<T>>();
-
-      // Both kernels take the number of elements per (sample, channel) pair,
-      // which is H * W * D; using H * W alone would give a wrong channel
-      // index for 5-D NCHW-style input.
-      if (data_layout == framework::DataLayout::kNCHW) {
-        if (d_x) {
-          KeBNBackwardData<T, framework::DataLayout::kNCHW><<<
-              grid1, block, 0, dev_ctx.stream()>>>(
-              d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
-              running_var_data, epsilon, C, H * W * D, num, d_x->data<T>());
-        }
-        if (d_scale && d_bias) {
-          KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
-              grid2, block, 0, dev_ctx.stream()>>>(
-              d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
-              epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
-              d_bias->data<BatchNormParamType<T>>());
-        }
-      } else {
-        if (d_x) {
-          KeBNBackwardData<T, framework::DataLayout::kNHWC><<<
-              grid1, block, 0, dev_ctx.stream()>>>(
-              d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
-              running_var_data, epsilon, C, H * W * D, num, d_x->data<T>());
-        }
-        if (d_scale && d_bias) {
-          KeBNBackwardScaleBias<T, block, framework::DataLayout::kNHWC><<<
-              grid2, block, 0, dev_ctx.stream()>>>(
-              d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
-              epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
-              d_bias->data<BatchNormParamType<T>>());
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-// CUDA kernels are registered for float, double and float16, for both the
-// forward and the gradient operator.
-REGISTER_OP_CUDA_KERNEL(
-    batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormKernel<plat::CUDADeviceContext, double>,
-    ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
-    ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h
deleted file mode 100644
index 6c7dbe0db4e3545200ff77c9331b6b656d9de2ea..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/batch_norm_op.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/norm_utils.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
-class BatchNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override;
-};
-
-class BatchNormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override;
-};
-
-class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override;
-
-  virtual std::string GradOpType() const {
-    return this->ForwardOpType() + "_grad";
-  }
-};
-
-class BatchNormOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
-  }
-};
-
-template <typename DeviceContext, typename T>
-class BatchNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override;
-};
-
-template <typename DeviceContext, typename T>
-class BatchNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h
deleted file mode 100644
index 7e2740f148f1d273310f44ed4a35d413e7201394..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/batch_size_like.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-class BatchSizeLikeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of %s should not be null.", Type());
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of %s should not be null.", Type());
-
-    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE_GT(shape.size(), 0);
-    std::vector<int64_t> shape_int64(shape.size(), 0);
-    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
-                   [](int a) { return static_cast<int64_t>(a); });
-    auto output_dim = framework::make_ddim(shape_int64);
-
-    int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
-    PADDLE_ENFORCE_GE(input_dim_idx, 0);
-    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
-
-    int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
-    PADDLE_ENFORCE_GE(output_dim_idx, 0);
-    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
-
-    output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
-    ctx->SetOutputDim("Out", output_dim);
-  }
-};
-
-class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final {
-    AddInput(
-        "Input",
-        "Tensor whose input_dim_idx'th dimension specifies the batch_size");
-    AddOutput("Out",
-              "Tensor of specified shape will be filled "
-              "with the specified value");
-    AddAttr<std::vector<int>>("shape", "The shape of the output");
-    AddAttr<int>("input_dim_idx",
-                 "default 0. The index of input's batch size dimension")
-        .SetDefault(0);
-    AddAttr<int>("output_dim_idx",
-                 "default 0. The index of output's batch size dimension")
-        .SetDefault(0);
-    Apply();
-  }
-
- protected:
-  virtual void Apply() = 0;
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(BatchSizeLikeNoNeedBufferVarsInference,
-                                      "Input");
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
deleted file mode 100644
index 4cef49280dfb5207a9d94df42d94657f03ec838f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ /dev/null
@@ -1,223 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <string>
-
-#include "paddle/fluid/operators/beam_search_decode_op.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-struct BeamSearchDecodeFunctor {
-  BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
-                          const LoDTensorArray& step_scores,
-                          LoDTensor* id_tensor, LoDTensor* score_tensor,
-                          size_t beam_size, int end_id)
-      : beam_size_(beam_size),
-        end_id_(end_id),
-        step_ids_origin_(step_ids),
-        step_scores_origin_(step_scores),
-        id_tensor_(id_tensor),
-        score_tensor_(score_tensor) {
-    tensor_on_gpu_ = false;
-    // First make a copy of GPU data on CPU
-    if (platform::is_gpu_place(step_ids_origin_[0].place())) {
-      tensor_on_gpu_ = true;
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      auto* dev_ctx = pool.Get(step_ids_origin_[0].place());
-      // Copy all tensors in the input tensor array
-      for (auto& step_id : step_ids_origin_) {
-        framework::LoDTensor out;
-        if (step_id.numel() > 0) {
-          dev_ctx->Wait();
-          framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out);
-          dev_ctx->Wait();
-        }
-
-        out.set_lod(step_id.lod());
-        step_ids_.push_back(out);
-      }
-    }
-    if (platform::is_gpu_place(step_scores_origin_[0].place())) {
-      tensor_on_gpu_ = true;
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      auto* dev_ctx = pool.Get(step_scores_origin_[0].place());
-      // Copy all tensors in the input tensor array
-      for (auto& step_score : step_scores_origin_) {
-        framework::LoDTensor out;
-        if (step_score.numel() > 0) {
-          dev_ctx->Wait();
-          framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx,
-                                &out);
-          dev_ctx->Wait();
-        }
-
-        out.set_lod(step_score.lod());
-        step_scores_.push_back(out);
-      }
-    }
-  }
-
-  template <typename T>
-  void apply() const;
-
-  bool tensor_on_gpu_;
-  size_t beam_size_;
-  int end_id_;
-  // TODO(Superjomn) Here might result serious performance issue in the
-  // concurrency
-  // scenarios.
-  const LoDTensorArray& step_ids_origin_;
-  const LoDTensorArray& step_scores_origin_;
-  LoDTensorArray step_ids_ = LoDTensorArray();
-  LoDTensorArray step_scores_ = LoDTensorArray();
-  LoDTensor* id_tensor_;
-  LoDTensor* score_tensor_;
-};
-
-template <typename T>
-void BeamSearchDecodeFunctor::apply() const {
-  BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
-  // Check if the tensor is on GPU. If so, use the CPU copy instead
-  if (tensor_on_gpu_) {
-    beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_,
-                                  score_tensor_);
-  } else {
-    beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_,
-                                  id_tensor_, score_tensor_);
-  }
-}
-
-template <>
-void BeamSearchDecodeFunctor::apply<bool>() const {
-  PADDLE_THROW("beam search decode op does not support bool!");
-}
-
-class BeamSearchDecodeOp : public framework::OperatorBase {
- public:
-  BeamSearchDecodeOp(const std::string& type,
-                     const framework::VariableNameMap& inputs,
-                     const framework::VariableNameMap& outputs,
-                     const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& dev_ctx = *pool.Get(dev_place);
-
-    framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
-    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx, nullptr);
-
-    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
-    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
-    const size_t step_num = ids->size();
-    PADDLE_ENFORCE_GT(step_num, 0UL,
-                      "beam search steps should be larger than 0");
-    const size_t source_num = ids->at(0).lod().at(0).size() - 1;
-    PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0");
-
-    for (size_t i = 0; i < step_num; ++i) {
-      PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL,
-                        "Level of LodTensor should be 2");
-    }
-
-    size_t beam_size = ctx.Attr<int>("beam_size");
-    int end_id = ctx.Attr<int>("end_id");
-
-    // prepare output
-    LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
-    LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
-
-    framework::VisitDataType(
-        scores->at(0).type(),
-        BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores,
-                                beam_size, end_id));
-  }
-};
-
-class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ids",
-             "(LodTensorArray)"
-             "The LodTensorArray containing the selected ids of all steps");
-    AddInput("Scores",
-             "(LodTensorArray)"
-             "The LodTensorArray containing the selected scores of all steps");
-    AddOutput(
-        "SentenceIds",
-        "(LodTensor)"
-        "An LodTensor containing all generated id sequences for all source "
-        "sentences");
-    AddOutput(
-        "SentenceScores",
-        "(LodTensor)"
-        "An LodTensor containing scores corresponding to Output(SentenceIds)");
-    AddAttr<int>("beam_size", "beam size for beam search");
-    AddAttr<int>("end_id",
-                 "the token id which indicates the end of a sequence");
-    AddComment(R"DOC(
-Beam Search Decode Operator. This Operator constructs the full hypotheses for
-each source sentence by walking back along the LoDTensorArray Input(ids)
-whose lods can be used to restore the path in the beam search tree.
-
-The Output(SentenceIds) and Output(SentenceScores) separately contain the
-generated id sequences and the corresponding scores. The shapes and lods of the
-two LodTensor are same. The lod level is 2 and the two levels separately
-indicate how many hypotheses each source sentence has and how many ids each
-hypothesis has.
-)DOC");
-  }
-};
-
-class BeamSearchDecodeInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* context) const override {
-    PADDLE_ENFORCE(context->HasInput("Ids"),
-                   "BeamSearchDecodeOp must has input Ids");
-    PADDLE_ENFORCE(context->HasInput("Scores"),
-                   "BeamSearchDecodeOp must has input Scores");
-    PADDLE_ENFORCE(context->HasOutput("SentenceIds"),
-                   "BeamSearchDecodeOp must has output SentenceIds");
-    PADDLE_ENFORCE(context->HasOutput("SentenceScores"),
-                   "BeamSearchDecodeOp must has output SentenceScores");
-  }
-};
-
-class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    for (auto& o : ctx->Output("SentenceIds")) {
-      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
-    }
-    for (auto& o : ctx->Output("SentenceScores")) {
-      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp,
-                  paddle::operators::BeamSearchDecodeOpProtoMaker,
-                  paddle::operators::BeamSearchDecodeInferShape,
-                  paddle::operators::BeamSearchDecodeInferVarType,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
deleted file mode 100644
index 0b883c3158fb922caae2e731875bbb8d43a1e9ca..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using LoDTensorArray = framework::LoDTensorArray;
-
-// all the lod have 2 levels.
-// The first is source level, the second is sentence level.
-// source level describe how many prefixes (branchs) for each source sentece
-// (beam). sentence level describe how these candidates belong to the prefixes.
-const size_t kSourceLevel = 0;
-const size_t kSentenceLevel = 1;
-
-template <typename T>
-struct Sentence {
-  std::vector<int64_t> word_ids;
-  std::vector<T> scores;
-};
-
-template <typename T>
-using SentenceVector = std::vector<Sentence<T>>;
-
-template <typename T>
-struct BeamSearchDecoder {
-  BeamSearchDecoder(size_t beam_size, int end_id)
-      : beam_size_(beam_size), end_id_(end_id) {}
-
-  /**
-   * convert the result sentence_vector for each source sentence into two
-   * LodTensor.
-   * One is all candidate sentences with word id, one is all candidate sentences
-   * with word score.
-   * Param:
-   *  sentence_vector_list: sentence_vector for each source sentence.
-   *  id_tensor: result LoDTensor for sentences of id.
-   *  score_tensor: result LoDTensor for sentences of score.
-   *  reverse: whether ids of sentence in sentence_vector_list is reversed
-   *  sort_by_score: whether to sort hypotheses of each sentence by scores.
-   */
-  void ConvertSentenceVectorToLodTensor(
-      std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
-      LoDTensor* score_tensor, bool reverse = true,
-      bool sort_by_score = true) const;
-
-  /**
-   * Gather the hypotheses for each source sentence by backtrace though the
-   * LoDTensorArray step_ids whose lods reserve the path in the tree.
-   */
-  void Backtrace(const LoDTensorArray& step_ids,
-                 const LoDTensorArray& step_scores, LoDTensor* id_tensor,
-                 LoDTensor* score_tensor) const;
-
-  size_t beam_size_;
-  int end_id_;
-};
-
-template <typename T>
-void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
-    std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
-    LoDTensor* score_tensor, bool reverse, bool sort_by_score) const {
-  size_t src_num = sentence_vector_list.size();
-
-  PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0");
-
-  std::vector<size_t> source_level_lod = {0};
-  std::vector<size_t> sentence_level_lod = {0};
-  std::vector<int64_t> id_data;
-  std::vector<T> score_data;
-
-  for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
-    if (sort_by_score) {
-      sort(sentence_vector_list[src_idx].begin(),
-           sentence_vector_list[src_idx].end(),
-           [reverse](const Sentence<T>& a, const Sentence<T>& b) {
-             if (reverse)
-               return a.scores.front() > b.scores.front();
-             else
-               return a.scores.back() > b.scores.back();
-           });
-    }
-    for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
-      if (reverse) {
-        id_data.insert(id_data.end(), sentence.word_ids.rbegin(),
-                       sentence.word_ids.rend());
-        score_data.insert(score_data.end(), sentence.scores.rbegin(),
-                          sentence.scores.rend());
-      } else {
-        id_data.insert(id_data.end(), sentence.word_ids.begin(),
-                       sentence.word_ids.end());
-        score_data.insert(score_data.end(), sentence.scores.begin(),
-                          sentence.scores.end());
-      }
-
-      sentence_level_lod.push_back(sentence_level_lod.back() +
-                                   sentence.word_ids.size());
-    }
-    source_level_lod.push_back(source_level_lod.back() +
-                               sentence_vector_list[src_idx].size());
-  }
-
-  auto cpu_place = std::unique_ptr<paddle::platform::CPUPlace>(
-      new paddle::platform::CPUPlace());
-  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
-
-  framework::LoD lod;
-  lod.push_back(source_level_lod);
-  lod.push_back(sentence_level_lod);
-
-  id_tensor->set_lod(lod);
-  id_tensor->Resize({static_cast<int64_t>(id_data.size())});
-  id_tensor->mutable_data<int64_t>(paddle::platform::CPUPlace());
-  framework::TensorFromVector<int64_t>(id_data, cpu_ctx, id_tensor);
-
-  score_tensor->set_lod(lod);
-  score_tensor->Resize({static_cast<int64_t>(score_data.size())});
-  score_tensor->mutable_data<T>(paddle::platform::CPUPlace());
-  framework::TensorFromVector<T>(score_data, cpu_ctx, score_tensor);
-}
-
-template <typename T>
-void BeamSearchDecoder<T>::Backtrace(const LoDTensorArray& step_ids,
-                                     const LoDTensorArray& step_scores,
-                                     LoDTensor* id_tensor,
-                                     LoDTensor* score_tensor) const {
-  PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
-  PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
-                    "step_ids and step_scores should be the same");
-  const size_t step_num = step_ids.size();
-  const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
-  std::vector<SentenceVector<T>> sentence_vector_list(
-      src_num, SentenceVector<T>(beam_size_));
-  std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);
-  for (int step_id = step_num - 1; step_id >= 0; --step_id) {
-    auto& cur_ids = step_ids.at(step_id);
-    auto& cur_scores = step_scores.at(step_id);
-    for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
-      // for each source sentence
-      auto& sentence_vector = sentence_vector_list.at(src_idx);
-      auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
-      size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
-      size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
-      if (prefix_idx_vector.empty()) {  // be finished and pruned at this step
-                                        // or the last time step
-        for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end;
-             ++prefix_idx) {
-          size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx];
-          size_t candidate_end =
-              cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
-          for (size_t candidate_idx = candidate_start;
-               candidate_idx < candidate_end; ++candidate_idx) {
-            prefix_idx_vector.push_back(prefix_idx);
-            size_t idx = prefix_idx_vector.size() - 1;
-            auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
-            auto cur_score = cur_scores.data<T>()[candidate_idx];
-            sentence_vector.at(idx).word_ids.push_back(cur_id);
-            sentence_vector.at(idx).scores.push_back(cur_score);
-          }
-        }
-      } else {  // use prefix_idx_vector to backtrace
-        size_t src_candidate_start =
-            cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
-        size_t prefix_idx = src_prefix_start;
-        size_t candidate_num =
-            cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
-            cur_ids.lod().at(kSentenceLevel)[prefix_idx];
-        for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
-          auto candidate_idx = prefix_idx_vector.at(idx);
-          auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
-          auto cur_score = cur_scores.data<T>()[candidate_idx];
-          if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
-            // to skip redundant end tokens
-            sentence_vector.at(idx).word_ids.push_back(cur_id);
-            sentence_vector.at(idx).scores.push_back(cur_score);
-          }
-
-          while (src_candidate_start + candidate_num <=
-                 candidate_idx) {  // search the corresponding prefix
-            prefix_idx++;
-            candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
-                             cur_ids.lod().at(kSentenceLevel)[prefix_idx];
-          }
-          prefix_idx_vector.at(idx) = prefix_idx;
-        }
-      }
-    }
-  }
-
-  ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
-                                   score_tensor, true, true);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc
deleted file mode 100644
index 88339e38d89db3f79abf232d6b0d035b759739a6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/beam_search_decode_op_test.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/beam_search_decode_op.h"
-#include "gtest/gtest.h"
-
-using CPUPlace = paddle::platform::CPUPlace;
-using LoD = paddle::framework::LoD;
-using LoDTensor = paddle::framework::LoDTensor;
-using LoDTensorArray = paddle::framework::LoDTensorArray;
-
-template <typename T>
-using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
-template <typename T>
-using Sentence = paddle::operators::Sentence<T>;
-template <typename T>
-using SentenceVector = paddle::operators::SentenceVector<T>;
-
-namespace paddle {
-namespace test {
-
-void GenerateExample(const std::vector<size_t>& level_0,
-                     const std::vector<size_t>& level_1,
-                     const std::vector<int>& data, LoDTensorArray* ids,
-                     LoDTensorArray* scores) {
-  PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1,
-                    "source level is used to describe candidate set");
-  PADDLE_ENFORCE_EQ(level_1.back(), data.size(),
-                    "the lowest level is used to describe data"
-                    ", so it's last element should be data length");
-
-  CPUPlace place;
-
-  LoD lod;
-  lod.push_back(level_0);
-  lod.push_back(level_1);
-
-  // Ids
-  LoDTensor tensor_id;
-  tensor_id.set_lod(lod);
-  tensor_id.Resize({static_cast<int64_t>(data.size())});
-  // malloc memory
-  int64_t* id_ptr = tensor_id.mutable_data<int64_t>(place);
-  for (size_t i = 0; i < data.size(); ++i) {
-    id_ptr[i] = static_cast<int64_t>(data.at(i));
-  }
-
-  // Scores
-  LoDTensor tensor_score;
-  tensor_score.set_lod(lod);
-  tensor_score.Resize({static_cast<int64_t>(data.size())});
-  // malloc memory
-  float* score_ptr = tensor_score.mutable_data<float>(place);
-  for (size_t i = 0; i < data.size(); ++i) {
-    score_ptr[i] = static_cast<float>(data.at(i));
-  }
-
-  ids->push_back(tensor_id);
-  scores->push_back(tensor_score);
-}
-
-}  // namespace test
-}  // namespace paddle
-
-TEST(BeamSearchDecodeOp, Backtrace) {
-  CPUPlace place;
-
-  // Construct sample data with 5 steps and 2 source sentences
-  // beam_size = 2, start_id = 0, end_id = 1
-  LoDTensorArray ids;
-  LoDTensorArray scores;
-
-  paddle::test::GenerateExample(
-      std::vector<size_t>{0, 1, 2}, std::vector<size_t>{0, 1, 2},
-      std::vector<int>{0, 0}, &ids, &scores);  // start with start_id
-  paddle::test::GenerateExample(std::vector<size_t>{0, 1, 2},
-                                std::vector<size_t>{0, 2, 4},
-                                std::vector<int>{2, 3, 4, 5}, &ids, &scores);
-  paddle::test::GenerateExample(std::vector<size_t>{0, 2, 4},
-                                std::vector<size_t>{0, 2, 2, 4, 4},
-                                std::vector<int>{3, 1, 5, 4}, &ids, &scores);
-  paddle::test::GenerateExample(std::vector<size_t>{0, 2, 4},
-                                std::vector<size_t>{0, 1, 2, 3, 4},
-                                std::vector<int>{1, 1, 3, 5}, &ids, &scores);
-  paddle::test::GenerateExample(
-      std::vector<size_t>{0, 2, 4},
-      std::vector<size_t>{0, 0, 0, 2,
-                          2},  // the branchs of the first source sentence
-                               // are pruned since finished
-      std::vector<int>{5, 1},
-      &ids, &scores);
-
-  ASSERT_EQ(ids.size(), 5UL);
-  ASSERT_EQ(scores.size(), 5UL);
-
-  BeamSearchDecoder<float> helper(2, 1);  // beam_size = 2, end_id = 1
-
-  LoDTensor id_tensor;
-  LoDTensor score_tensor;
-  helper.Backtrace(ids, scores, &id_tensor, &score_tensor);
-
-  LoD lod = id_tensor.lod();
-  std::vector<size_t> expect_source_lod = {0, 2, 4};
-  EXPECT_EQ(lod[0], expect_source_lod);
-  std::vector<size_t> expect_sentence_lod = {0, 4, 7, 12, 17};
-  EXPECT_EQ(lod[1], expect_sentence_lod);
-  std::vector<int> expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4,
-                                  5, 3, 5, 0, 4, 5, 3, 1};
-  ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size()));
-  for (size_t i = 0; i < expect_data.size(); ++i) {
-    ASSERT_EQ(id_tensor.data<int64_t>()[i],
-              static_cast<int64_t>(expect_data[i]));
-  }
-  for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) {
-    ASSERT_EQ(score_tensor.data<float>()[i],
-              static_cast<float>(id_tensor.data<int64_t>()[i]));
-  }
-}
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
deleted file mode 100644
index a6aa35e0569364d79c15aea6e6dbc6ca670d49f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/beam_search_op.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/beam_search_op.h"
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    // inputs and outputs stored in proto
-    AddInput("pre_ids",
-             "(LoDTensor) The LoDTensor containing the selected ids at the "
-             "previous step. It should be a tensor with shape (batch_size, 1) "
-             "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at "
-             "the first step.");
-    AddInput("pre_scores",
-             "(LoDTensor) The LoDTensor containing the accumulated "
-             "scores corresponding to the selected ids at the previous step.");
-    AddInput("ids",
-             "(LoDTensor) The LoDTensor containing the candidates ids. Its "
-             "shape should be (batch_size * beam_size, W). If not set, it will "
-             "be calculated out according to Input(scores) in this operator.")
-        .AsDispensable();
-    AddInput("scores",
-             "(LoDTensor) The LoDTensor containing the current scores "
-             "corresponding to Input(ids). If Input(ids) is not nullptr, its "
-             "shape is the same as that of Input(ids)."
-             "If is_accumulated is true, Input(scores) is accumulated scores "
-             "and will be used derectedly. Else, each score will be "
-             "transformed to the log field and accumulate Input(pre_sores) "
-             "first.");
-    AddOutput("selected_ids",
-              "A LodTensor that stores the IDs selected by beam search.");
-    AddOutput("selected_scores",
-              "A LoDTensor containing the accumulated scores corresponding to "
-              "Output(selected_ids).");
-    AddOutput("parent_idx",
-              "A Tensor preserving the selected_ids' parent indice in pre_ids.")
-        .AsDispensable();
-
-    // Attributes stored in AttributeMap
-    AddAttr<int>("level", "the level of LoDTensor");
-    AddAttr<int>("beam_size", "beam size for beam search");
-    AddAttr<int>("end_id",
-                 "the token id which indicates the end of a sequence");
-    AddAttr<bool>("is_accumulated",
-                  "Whether the Input(scores) is accumulated scores.")
-        .SetDefault(true);
-
-    AddComment(R"DOC(
-This operator does the search in beams for one time step.
-Specifically, it selects the top-K candidate word ids of current step from
-Input(ids) according to their Input(scores) for all source sentences,
-where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results
-from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores)
-are the output of beam_search at previous step, they are needed for special use
-to handle ended candidate translations. The paths linking prefixes and selected
-candidates are organized and reserved in lod.
-
-Note that the Input(scores) passed in should be accumulated scores, and
-length penalty should be done with extra operators before calculating the
-accumulated scores if needed, also suggest finding top-K before it and
-using the top-K candidates following.
-)DOC");
-  }
-};
-
-class BeamSearchOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    for (const std::string &arg :
-         std::vector<std::string>({"pre_ids", "scores"})) {
-      PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
-                     arg);
-    }
-    for (const std::string &arg :
-         std::vector<std::string>({"selected_ids", "selected_scores"})) {
-      PADDLE_ENFORCE(ctx->HasOutput(arg),
-                     "BeamSearch need output argument '%s'", arg);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto *scores = ctx.Input<framework::LoDTensor>("scores");
-    size_t level = ctx.Attr<int>("level");
-    size_t batch_size = scores->lod()[level].size() - 1;
-    // The current CUDA kernel only support cases with batch_size < 4.
-    // Compute on CPU for cases with batch_size > 4.
-    if (batch_size <= 4) {
-      return framework::OpKernelType(
-          ctx.Input<framework::LoDTensor>("pre_ids")->type(), ctx.GetPlace());
-    } else {
-      return framework::OpKernelType(
-          ctx.Input<framework::LoDTensor>("pre_ids")->type(),
-          platform::CPUPlace());
-    }
-  }
-};
-
-class BeamSearchInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    for (auto &o : ctx->Output("selected_ids")) {
-      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
-    }
-    for (auto &o : ctx->Output("selected_scores")) {
-      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(beam_search, ops::BeamSearchOp, ops::BeamSearchOpMaker,
-                  ops::BeamSearchInferVarType);
-REGISTER_OP_CPU_KERNEL(
-    beam_search,
-    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc
deleted file mode 100644
index 4ef9476eee5d3fac4decd7273da824b2f2349199..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/beam_search_op.cu.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/beam_search_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    beam_search,
-    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
deleted file mode 100644
index 3d32ea0cc9686a709b185087d76d12f266663d03..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/beam_search_op.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/beam_search.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class BeamSearchOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* ids = context.Input<framework::LoDTensor>("ids");
-    auto* scores = context.Input<framework::LoDTensor>("scores");
-    auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids");
-    auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores");
-
-    PADDLE_ENFORCE_NOT_NULL(scores);
-    PADDLE_ENFORCE_NOT_NULL(pre_ids);
-    PADDLE_ENFORCE_NOT_NULL(pre_scores);
-
-    size_t level = context.Attr<int>("level");
-    size_t beam_size = context.Attr<int>("beam_size");
-    int end_id = context.Attr<int>("end_id");
-    bool is_accumulated = context.Attr<bool>("is_accumulated");
-
-    auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
-    auto selected_scores =
-        context.Output<framework::LoDTensor>("selected_scores");
-    auto* parent_idx = context.Output<framework::Tensor>("parent_idx");
-    PADDLE_ENFORCE_NOT_NULL(selected_ids);
-    PADDLE_ENFORCE_NOT_NULL(selected_scores);
-
-    math::BeamSearchFunctor<DeviceContext, T> alg;
-    alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
-        ids, scores, selected_ids, selected_scores, parent_idx, level,
-        beam_size, end_id, is_accumulated);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt
deleted file mode 100644
index 54008336a9f67f0123ba1cfa6fcea35b79b7ac4c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/benchmark/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
-        DEPS memory timer framework_proto proto_desc lod_tensor op_registry
-        device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc
deleted file mode 100644
index ac487223d09b1b5be2cb889fb7fb7f60c0093397..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/benchmark/op_tester.cc
+++ /dev/null
@@ -1,522 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/benchmark/op_tester.h"
-#include <fstream>
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/platform/timer.h"
-#include "paddle/fluid/pybind/pybind.h"
-
-namespace paddle {
-namespace operators {
-namespace benchmark {
-
-DEFINE_string(op_config_list, "", "Path of op config file.");
-DEFINE_int32(specified_config_id, -1, "Test the specified op config.");
-
-void OpTester::Init(const std::string &filename) {
-  Init(OpTesterConfig(filename));
-}
-
-void OpTester::Init(const OpTesterConfig &config) {
-  config_ = config;
-
-  auto &op_desc_info = framework::OpInfoMap::Instance();
-  // Initialize the OpDesc
-  if (op_desc_info.Has(config_.op_type)) {
-    type_ = config_.op_type;
-
-    CreateOpDesc();
-    CreateInputVarDesc();
-    CreateOutputVarDesc();
-  } else {
-    LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered.";
-  }
-
-  if (config_.device_id >= 0) {
-    place_ = paddle::platform::CUDAPlace(config_.device_id);
-  } else {
-    place_ = paddle::platform::CPUPlace();
-  }
-
-  framework::InitDevices(false);
-  scope_.reset(new paddle::framework::Scope());
-
-  op_ = framework::OpRegistry::CreateOp(op_desc_);
-  CreateVariables(scope_.get());
-}
-
-void OpTester::Run() {
-  if (config_.print_debug_string) {
-    LOG(INFO) << DebugString();
-  }
-
-  // Warm up
-  RunImpl();
-
-  platform::Timer timer;
-  if (config_.profile) {
-    if (platform::is_cpu_place(place_)) {
-      platform::EnableProfiler(platform::ProfilerState::kCPU);
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      platform::EnableProfiler(platform::ProfilerState::kAll);
-      platform::SetDeviceId(config_.device_id);
-#else
-      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
-#endif
-    }
-
-    timer.Start();
-    for (int i = config_.repeat; i > 0; --i) {
-      RunImpl();
-    }
-    timer.Pause();
-    platform::DisableProfiler(platform::EventSortingKey::kDefault,
-                              "op_tester_profiler");
-  } else {
-    timer.Start();
-    for (int i = config_.repeat; i > 0; --i) {
-      RunImpl();
-    }
-    timer.Pause();
-  }
-  config_.runtime = timer.ElapsedMS() / config_.repeat;
-  LOG(INFO) << "=== Run " << config_.repeat
-            << " times, latency: " << config_.runtime << " ms ===";
-}
-
-void OpTester::RunImpl() {
-  op_->Run(*scope_, place_);
-  platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  scope_->DropKids();
-}
-
-std::vector<std::string> OpTester::GetOpProtoInputNames() {
-  std::vector<std::string> input_names;
-  const framework::proto::OpProto &proto =
-      framework::OpInfoMap::Instance().Get(type_).Proto();
-  for (int i = 0; i != proto.inputs_size(); ++i) {
-    const auto &input = proto.inputs(i);
-    input_names.push_back(input.name());
-  }
-  return input_names;
-}
-
-std::vector<std::string> OpTester::GetOpProtoOutputNames() {
-  std::vector<std::string> output_names;
-  const framework::proto::OpProto &proto =
-      framework::OpInfoMap::Instance().Get(type_).Proto();
-  for (int i = 0; i != proto.outputs_size(); ++i) {
-    const auto &output = proto.outputs(i);
-    output_names.push_back(output.name());
-  }
-  return output_names;
-}
-
-std::unordered_map<std::string, framework::proto::AttrType>
-OpTester::GetOpProtoAttrNames() {
-  std::unordered_map<std::string, framework::proto::AttrType> attr_types;
-  const framework::proto::OpProto &proto =
-      framework::OpInfoMap::Instance().Get(type_).Proto();
-  const std::vector<std::string> skipped_attrs = {
-      framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
-      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
-      framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(),
-      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()};
-  for (int i = 0; i != proto.attrs_size(); ++i) {
-    const auto &attr = proto.attrs(i);
-    if (!Has(skipped_attrs, attr.name())) {
-      VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type();
-      attr_types[attr.name()] = attr.type();
-    }
-  }
-  return attr_types;
-}
-
-framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
-  if (str == "int32") {
-    return framework::proto::VarType::INT32;
-  } else if (str == "int64") {
-    return framework::proto::VarType::INT64;
-  } else if (str == "fp32") {
-    return framework::proto::VarType::FP32;
-  } else if (str == "fp64") {
-    return framework::proto::VarType::FP64;
-  } else {
-    PADDLE_THROW("Unsupported dtype %s.", str.c_str());
-  }
-}
-
-void OpTester::CreateInputVarDesc() {
-  std::vector<std::string> input_names = GetOpProtoInputNames();
-  for (auto &name : input_names) {
-    const OpInputConfig *input = config_.GetInput(name);
-    if (input == nullptr) {
-      LOG(FATAL) << "The input " << name << " of op " << config_.op_type
-                 << " is not correctlly provided.";
-    }
-
-    std::string var_name = config_.op_type + "." + name;
-    framework::VarDesc *var = Var(var_name);
-    // Need to support more type
-    var->SetType(framework::proto::VarType::LOD_TENSOR);
-    var->SetPersistable(false);
-    var->SetDataType(TransToVarType(input->dtype));
-    var->SetShape(input->dims);
-
-    op_desc_.SetInput(name, {var_name});
-    inputs_[var_name] = *input;
-  }
-}
-
-void OpTester::CreateOutputVarDesc() {
-  std::vector<std::string> output_names = GetOpProtoOutputNames();
-  for (auto &name : output_names) {
-    std::string var_name = config_.op_type + "." + name;
-    framework::VarDesc *var = Var(var_name);
-    // Need to support more type
-    var->SetType(framework::proto::VarType::LOD_TENSOR);
-    var->SetPersistable(false);
-    var->SetDataType(framework::proto::VarType::FP32);
-
-    op_desc_.SetOutput(name, {var_name});
-  }
-}
-
-void OpTester::CreateOpDesc() {
-  op_desc_.SetType(config_.op_type);
-  std::unordered_map<std::string, framework::proto::AttrType> attr_types =
-      GetOpProtoAttrNames();
-  for (auto item : config_.attrs) {
-    const std::string &name = item.first;
-    if (attr_types.find(name) == attr_types.end()) {
-      LOG(FATAL) << "Operator " << type_ << " do not have attr " << name;
-    }
-
-    const std::string &value_str = item.second;
-    const framework::proto::AttrType &type = attr_types[name];
-    switch (type) {
-      case framework::proto::AttrType::BOOLEAN:
-        break;
-      case framework::proto::AttrType::INT: {
-        int value = StringTo<int>(value_str);
-        op_desc_.SetAttr(name, {value});
-      } break;
-      case framework::proto::AttrType::FLOAT: {
-        float value = StringTo<float>(value_str);
-        op_desc_.SetAttr(name, {value});
-      } break;
-      case framework::proto::AttrType::STRING: {
-        op_desc_.SetAttr(name, {value_str});
-      } break;
-      case framework::proto::AttrType::BOOLEANS:
-      case framework::proto::AttrType::INTS:
-      case framework::proto::AttrType::FLOATS:
-      case framework::proto::AttrType::STRINGS:
-        LOG(FATAL) << "Not supported yet.";
-        break;
-      case framework::proto::AttrType::LONG: {
-        int64_t value = StringTo<int64_t>(value_str);
-        op_desc_.SetAttr(name, value);
-      } break;
-      case framework::proto::AttrType::LONGS:
-      default:
-        PADDLE_THROW("Unsupport attr type %d", type);
-    }
-  }
-}
-
-framework::VarDesc *OpTester::Var(const std::string &name) {
-  auto it = vars_.find(name);
-  if (it != vars_.end()) {
-    return it->second.get();
-  }
-  auto *var = new framework::VarDesc(name);
-  vars_[name].reset(var);
-  return var;
-}
-
-template <typename T>
-void OpTester::SetupTensor(framework::LoDTensor *tensor,
-                           const std::vector<int64_t> &shape, T lower, T upper,
-                           const std::string &initializer,
-                           const std::string &filename) {
-  static unsigned int seed = 100;
-  std::mt19937 rng(seed++);
-  std::uniform_real_distribution<double> uniform_dist(0, 1);
-
-  T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
-
-  framework::LoDTensor cpu_tensor;
-  T *cpu_ptr = nullptr;
-
-  if (!platform::is_cpu_place(place_)) {
-    cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
-                                         platform::CPUPlace());
-  } else {
-    cpu_ptr = ptr;
-  }
-
-  if (initializer == "random") {
-    for (int i = 0; i < cpu_tensor.numel(); ++i) {
-      cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-    }
-  } else if (initializer == "natural") {
-    for (int i = 0; i < cpu_tensor.numel(); ++i) {
-      cpu_ptr[i] = static_cast<T>(lower + i);
-    }
-  } else if (initializer == "zeros") {
-    for (int i = 0; i < cpu_tensor.numel(); ++i) {
-      cpu_ptr[i] = static_cast<T>(0);
-    }
-  } else if (initializer == "file") {
-    std::ifstream is(filename);
-    for (size_t i = 0; i < cpu_tensor.numel(); ++i) {
-      T value;
-      is >> value;
-      cpu_ptr[i] = static_cast<T>(value);
-    }
-    is.close();
-  } else {
-    PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
-  }
-
-  if (!platform::is_cpu_place(place_)) {
-    TensorCopySync(cpu_tensor, place_, tensor);
-  }
-}
-
-void OpTester::CreateVariables(framework::Scope *scope) {
-  for (auto &item : vars_) {
-    auto &var = item.second;
-    if (var->Name() == framework::kEmptyVarName) {
-      continue;
-    }
-
-    auto *ptr = scope->Var(var->Name());
-    framework::InitializeVariable(ptr, var->GetType());
-    if (var->Persistable()) {
-      VLOG(3) << "Create Variable " << var->Name()
-              << " global, which pointer is " << ptr;
-    } else {
-      VLOG(3) << "Create Variable " << var->Name()
-              << " locally, which pointer is " << ptr;
-    }
-  }
-
-  for (auto &item : inputs_) {
-    // Allocate memory for input tensor
-    auto &var_name = item.first;
-    VLOG(3) << "Allocate memory for tensor " << var_name;
-
-    auto &var_desc = vars_[var_name];
-    std::vector<int64_t> shape = var_desc->GetShape();
-
-    auto *var = scope->Var(var_name);
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-    const auto &data_type = var_desc->GetDataType();
-    if (data_type == framework::proto::VarType::INT32) {
-      SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer,
-                       item.second.filename);
-    } else if (data_type == framework::proto::VarType::INT64) {
-      SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer,
-                           item.second.filename);
-    } else if (data_type == framework::proto::VarType::FP32) {
-      SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
-                         static_cast<float>(1.0), item.second.initializer,
-                         item.second.filename);
-    } else if (data_type == framework::proto::VarType::FP64) {
-      SetupTensor<double>(tensor, shape, static_cast<double>(0.0),
-                          static_cast<double>(1.0), item.second.initializer,
-                          item.second.filename);
-    } else {
-      PADDLE_THROW("Unsupported dtype %d.", data_type);
-    }
-
-    VLOG(3) << "Set lod for tensor " << var_name;
-    std::vector<std::vector<size_t>> &lod_vec = item.second.lod;
-    framework::LoD lod;
-    for (size_t i = 0; i < lod_vec.size(); ++i) {
-      lod.push_back(lod_vec[i]);
-    }
-    tensor->set_lod(lod);
-  }
-}
-
-static std::string GenSpaces(int count) {
-  std::stringstream ss;
-  for (int i = 0; i < count; ++i) {
-    ss << "  ";
-  }
-  return ss.str();
-}
-
-std::string OpTester::DebugString() {
-  std::stringstream ss;
-  int count = 0;
-  for (auto &item : vars_) {
-    auto &var = item.second;
-    ss << GenSpaces(count++) << "vars {\n";
-    ss << GenSpaces(count) << "name: \"" << var->Name() << "\"\n";
-    ss << GenSpaces(count++) << "type: {\n";
-    ss << GenSpaces(count) << "type: LOD_TENSOR\n";
-    ss << GenSpaces(count++) << "lod_tensor {\n";
-    ss << GenSpaces(count++) << "tensor {\n";
-    const auto &data_type = var->GetDataType();
-    if (data_type == framework::proto::VarType::INT32) {
-      ss << GenSpaces(count) << "data_type: INT32\n";
-    } else if (data_type == framework::proto::VarType::INT64) {
-      ss << GenSpaces(count) << "data_type: INT64\n";
-    } else if (data_type == framework::proto::VarType::FP32) {
-      ss << GenSpaces(count) << "data_type: FP32\n";
-    } else if (data_type == framework::proto::VarType::FP64) {
-      ss << GenSpaces(count) << "data_type: FP64\n";
-    }
-    std::vector<int64_t> shape = var->GetShape();
-    for (auto d : shape) {
-      ss << GenSpaces(count) << "dims: " << d << "\n";
-    }
-    ss << GenSpaces(--count) << "}\n";
-    ss << GenSpaces(--count) << "}\n";
-    ss << GenSpaces(--count) << "}\n";
-    ss << GenSpaces(count) << "persistable: " << var->Persistable() << "\n";
-    ss << GenSpaces(--count) << "}\n";
-  }
-  ss << GenSpaces(count++) << "ops {\n";
-  for (auto &name : op_desc_.InputNames()) {
-    ss << GenSpaces(count++) << "inputs {\n";
-    ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
-    ss << GenSpaces(count) << "arguments: \"" << op_desc_.Input(name)[0]
-       << "\"\n";
-    ss << GenSpaces(--count) << "}\n";
-  }
-  for (auto &name : op_desc_.OutputNames()) {
-    ss << GenSpaces(count++) << "outputs {\n";
-    ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
-    ss << GenSpaces(count) << "arguments: \"" << op_desc_.Output(name)[0]
-       << "\"\n";
-    ss << GenSpaces(--count) << "}\n";
-  }
-  ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
-  for (auto &name : op_desc_.AttrNames()) {
-    ss << GenSpaces(count++) << "attrs {\n";
-    const auto &attr_type = op_desc_.GetAttrType(name);
-    const auto &attr = op_desc_.GetAttr(name);
-    ss << GenSpaces(count) << "name: \"" << name << "\"\n";
-    switch (attr_type) {
-      case framework::proto::AttrType::BOOLEAN: {
-        ss << GenSpaces(count) << "type: BOOLEAN\n";
-        ss << GenSpaces(count) << "b: " << boost::get<bool>(attr) << "\n";
-      } break;
-      case framework::proto::AttrType::INT: {
-        ss << GenSpaces(count) << "type: INT\n";
-        ss << GenSpaces(count) << "i: " << boost::get<int>(attr) << "\n";
-      } break;
-      case framework::proto::AttrType::FLOAT: {
-        ss << GenSpaces(count) << "type: FLOAT\n";
-        ss << GenSpaces(count) << "f: " << boost::get<float>(attr) << "\n";
-      } break;
-      case framework::proto::AttrType::STRING: {
-        ss << GenSpaces(count) << "type: STRING\n";
-        ss << GenSpaces(count) << "s: \"" << boost::get<std::string>(attr)
-           << "\"\n";
-      } break;
-      case framework::proto::AttrType::BOOLEANS: {
-        ss << GenSpaces(count) << "type: BOOLEANS\n";
-        ss << GenSpaces(count) << "bools: "
-           << "\n";
-      } break;
-      case framework::proto::AttrType::INTS: {
-        ss << GenSpaces(count) << "type: INTS\n";
-        ss << GenSpaces(count) << "ints: "
-           << "\n";
-      } break;
-      case framework::proto::AttrType::FLOATS: {
-        ss << GenSpaces(count) << "type: FLOATS\n";
-        ss << GenSpaces(count) << "floats: "
-           << "\n";
-      } break;
-      case framework::proto::AttrType::STRINGS: {
-        ss << GenSpaces(count) << "type: STRINGS\n";
-        ss << GenSpaces(count) << "strings: "
-           << "\n";
-      } break;
-      case framework::proto::AttrType::LONG: {
-        ss << GenSpaces(count) << "type: LONG\n";
-        ss << GenSpaces(count) << "l: " << boost::get<int64_t>(attr) << "\n";
-      } break;
-      case framework::proto::AttrType::LONGS: {
-        ss << GenSpaces(count) << "type: LONGS\n";
-        ss << GenSpaces(count) << "longs: "
-           << "\n";
-      } break;
-      default:
-        PADDLE_THROW("Unsupport attr type %d", attr_type);
-    }
-    ss << GenSpaces(--count) << "}\n";
-  }
-  ss << GenSpaces(--count) << "}\n";
-  return ss.str();
-}
-
-TEST(op_tester, base) {
-  if (!FLAGS_op_config_list.empty()) {
-    std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
-                   FLAGS_op_config_list.c_str());
-    std::vector<OpTesterConfig> op_configs;
-    while (!fin.eof()) {
-      VLOG(4) << "Reading config " << op_configs.size() << "...";
-      OpTesterConfig config;
-      bool result = config.Init(fin);
-      if (result) {
-        op_configs.push_back(config);
-      }
-    }
-    if (FLAGS_specified_config_id >= 0 &&
-        FLAGS_specified_config_id < static_cast<int>(op_configs.size())) {
-      OpTester tester;
-      tester.Init(op_configs[FLAGS_specified_config_id]);
-      tester.Run();
-    } else {
-      for (size_t i = 0; i < op_configs.size(); ++i) {
-        OpTester tester;
-        tester.Init(op_configs[i]);
-        tester.Run();
-      }
-    }
-  } else {
-    OpTester tester;
-    OpTesterConfig config;
-    config.op_type = "elementwise_add";
-    config.inputs.resize(2);
-    config.inputs[0].name = "X";
-    config.inputs[0].dims = {64, 64};
-    config.inputs[1].name = "Y";
-    config.inputs[1].dims = {64, 1};
-    tester.Init(config);
-    tester.Run();
-  }
-}
-
-}  // namespace benchmark
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h
deleted file mode 100644
index a6d21573a05166a5cb98e78d4993f9304882d2e1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/benchmark/op_tester.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/benchmark/op_tester_config.h"
-
-namespace paddle {
-namespace operators {
-namespace benchmark {
-
-class OpTester {
- public:
-  OpTester() {}
-
-  void Init(const std::string &filename);
-  void Init(const OpTesterConfig &config);
-
-  void Run();
-
-  std::string DebugString();
-
- private:
-  std::vector<std::string> GetOpProtoInputNames();
-  std::vector<std::string> GetOpProtoOutputNames();
-  std::unordered_map<std::string, framework::proto::AttrType>
-  GetOpProtoAttrNames();
-
-  framework::proto::VarType::Type TransToVarType(std::string str);
-  void CreateInputVarDesc();
-  void CreateOutputVarDesc();
-  void CreateOpDesc();
-
-  framework::VarDesc *Var(const std::string &name);
-  void CreateVariables(framework::Scope *scope);
-
-  template <typename T>
-  void SetupTensor(framework::LoDTensor *input,
-                   const std::vector<int64_t> &shape, T lower, T upper,
-                   const std::string &initializer, const std::string &filename);
-
-  void RunImpl();
-
- private:
-  OpTesterConfig config_;
-  std::string type_;
-  framework::OpDesc op_desc_;
-  std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
-  std::unordered_map<std::string, OpInputConfig> inputs_;
-  std::unique_ptr<framework::OperatorBase> op_;
-  platform::Place place_;
-  std::unique_ptr<framework::Scope> scope_;
-};
-
-}  // namespace benchmark
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc
deleted file mode 100644
index 818e5f64edc2c1d213659c48d282df75625676ca..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/benchmark/op_tester_config.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/benchmark/op_tester_config.h"
-#include <fstream>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace benchmark {
-
-static const char kStartSeparator[] = "{";
-static const char kEndSeparator[] = "}";
-static const char kSepBetweenItems[] = ";";
-
-static bool StartWith(const std::string& str, const std::string& substr) {
-  return str.find(substr) == 0;
-}
-
-static bool EndWith(const std::string& str, const std::string& substr) {
-  return str.rfind(substr) == (str.length() - substr.length());
-}
-
-static void EraseEndSep(std::string* str,
-                        std::string substr = kSepBetweenItems) {
-  if (EndWith(*str, substr)) {
-    str->erase(str->length() - substr.length(), str->length());
-  }
-}
-
-OpInputConfig::OpInputConfig(std::istream& is) {
-  std::string sep;
-  is >> sep;
-  if (sep == kStartSeparator) {
-    while (sep != kEndSeparator) {
-      is >> sep;
-      if (sep == "name" || sep == "name:") {
-        is >> name;
-        EraseEndSep(&name);
-      } else if (sep == "dtype" || sep == "dtype:") {
-        ParseDType(is);
-      } else if (sep == "initializer" || sep == "initializer:") {
-        ParseInitializer(is);
-      } else if (sep == "dims" || sep == "dims:") {
-        ParseDims(is);
-      } else if (sep == "lod" || sep == "lod:") {
-        ParseLoD(is);
-      } else if (sep == "filename") {
-        is >> filename;
-        EraseEndSep(&filename);
-      }
-    }
-  }
-}
-
-void OpInputConfig::ParseDType(std::istream& is) {
-  std::string dtype_str;
-  is >> dtype_str;
-  EraseEndSep(&dtype_str);
-
-  if (dtype_str == "int32" || dtype_str == "int") {
-    dtype = "int32";
-  } else if (dtype_str == "int64" || dtype_str == "long") {
-    dtype = "int64";
-  } else if (dtype_str == "fp32" || dtype_str == "float") {
-    dtype = "fp32";
-  } else if (dtype_str == "fp64" || dtype_str == "double") {
-    dtype = "fp64";
-  } else {
-    PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str());
-  }
-  VLOG(4) << "dtype of input " << name << " is: " << dtype;
-}
-
-void OpInputConfig::ParseInitializer(std::istream& is) {
-  std::string initializer_str;
-  is >> initializer_str;
-  EraseEndSep(&initializer_str);
-
-  const std::vector<std::string> supported_initializers = {"random", "natural",
-                                                           "zeros", "file"};
-  if (!Has(supported_initializers, initializer_str)) {
-    PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
-  }
-
-  initializer = initializer_str;
-  VLOG(4) << "initializer of input " << name << " is: " << initializer;
-}
-
-void OpInputConfig::ParseDims(std::istream& is) {
-  std::string dims_str;
-  is >> dims_str;
-
-  dims.clear();
-  std::string token;
-  std::istringstream token_stream(dims_str);
-  while (std::getline(token_stream, token, 'x')) {
-    dims.push_back(std::stoi(token));
-  }
-}
-
-void OpInputConfig::ParseLoD(std::istream& is) {
-  std::string lod_str;
-  std::string start_sep =
-      std::string(kStartSeparator) + std::string(kStartSeparator);
-  std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator);
-
-  std::string sep;
-  is >> sep;
-  if (StartWith(sep, start_sep)) {
-    lod_str += sep;
-    while (!EndWith(sep, end_sep)) {
-      is >> sep;
-      lod_str += sep;
-    }
-  }
-  EraseEndSep(&lod_str);
-  PADDLE_ENFORCE_GE(lod_str.length(), 4U);
-  VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
-
-  // Parse the lod_str
-  lod.clear();
-  for (size_t i = 1; i < lod_str.length() - 1;) {
-    if (lod_str[i] == '{') {
-      std::vector<size_t> level;
-      while (lod_str[i] != '}') {
-        ++i;
-
-        std::string number;
-        while (lod_str[i] >= '0' && lod_str[i] <= '9') {
-          number += lod_str[i];
-          ++i;
-        }
-        level.push_back(StringTo<size_t>(number));
-      }
-      lod.push_back(level);
-    } else if (lod_str[i] == '}') {
-      ++i;
-    }
-  }
-}
-
-OpTesterConfig::OpTesterConfig(const std::string& filename) {
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
-                 filename.c_str());
-
-  Init(fin);
-}
-
-bool OpTesterConfig::Init(std::istream& is) {
-  std::string sep;
-  is >> sep;
-  if (sep == kStartSeparator) {
-    while (sep != kEndSeparator) {
-      is >> sep;
-      if (sep == "op_type" || sep == "op_type:") {
-        is >> op_type;
-      } else if (sep == "device_id" || sep == "device_id:") {
-        is >> device_id;
-      } else if (sep == "repeat" || sep == "repeat:") {
-        is >> repeat;
-      } else if (sep == "profile" || sep == "profile:") {
-        is >> profile;
-      } else if (sep == "print_debug_string" || sep == "print_debug_string:") {
-        is >> print_debug_string;
-      } else if (sep == "input" || sep == "input:") {
-        OpInputConfig input_config(is);
-        inputs.push_back(input_config);
-      } else if (sep == "attrs" || sep == "attrs:") {
-        ParseAttrs(is);
-      } else {
-        if (sep != kEndSeparator) {
-          return false;
-        }
-      }
-    }
-  } else {
-    return false;
-  }
-  return true;
-}
-
-bool OpTesterConfig::ParseAttrs(std::istream& is) {
-  std::string sep;
-  is >> sep;
-  if (sep == kStartSeparator) {
-    while (true) {
-      std::string key;
-      is >> key;
-      if (key == kEndSeparator) {
-        break;
-      }
-
-      std::string value;
-      is >> value;
-      EraseEndSep(&key, ":");
-      EraseEndSep(&value);
-      VLOG(4) << "attrs: " << key << ", " << value;
-
-      attrs[key] = value;
-    }
-  }
-  return true;
-}
-
-const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    if (inputs[i].name == name) {
-      return &inputs[i];
-    }
-  }
-  return nullptr;
-}
-
-}  // namespace benchmark
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h
deleted file mode 100644
index 3956bc0a8b1080e14cb773c9664f821dc7e40abd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/benchmark/op_tester_config.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <istream>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace benchmark {
-
-struct OpInputConfig {
-  OpInputConfig() {}
-  explicit OpInputConfig(std::istream& is);
-
-  void ParseDType(std::istream& is);
-  void ParseInitializer(std::istream& is);
-  void ParseDims(std::istream& is);
-  void ParseLoD(std::istream& is);
-
-  std::string name;
-  std::string dtype{"fp32"};  // int32/int, int64/long, fp32/float, fp64/double
-  std::string initializer{"random"};  // random, natural, zeros, file
-  std::string filename{""};
-  std::vector<int64_t> dims;
-  std::vector<std::vector<size_t>> lod;
-};
-
-struct OpTesterConfig {
-  OpTesterConfig() {}
-  explicit OpTesterConfig(const std::string& filename);
-
-  bool Init(std::istream& is);
-
-  bool ParseAttrs(std::istream& is);
-
-  const OpInputConfig* GetInput(const std::string& name);
-
-  std::string op_type;
-  std::vector<OpInputConfig> inputs;
-  std::unordered_map<std::string, std::string> attrs;
-  int device_id{-1};  // CPU: -1
-  int repeat{1};
-  int profile{0};
-  int print_debug_string{0};
-  double runtime{0.0};
-};
-
-static bool Has(const std::vector<std::string>& vec, const std::string& item) {
-  for (size_t i = 0; i < vec.size(); ++i) {
-    if (vec[i] == item) {
-      return true;
-    }
-  }
-  return false;
-}
-
-template <typename T>
-T StringTo(const std::string& str) {
-  std::istringstream is(str);
-  T value;
-  is >> value;
-  return value;
-}
-
-}  // namespace benchmark
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc
deleted file mode 100644
index f2c30cd7e8c6674866b8dfa482f1bc5195f689c2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/bilinear_tensor_product_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class BilinearTensorProductOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(Weight) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    auto weight_dims = ctx->GetInputDim("Weight");
-
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor.");
-    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor.");
-    PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL,
-                      "The input(Weight) must be a 3D tensor.");
-    if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) {
-      PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
-                        "The first dimension(batch_size) of input(X) must be "
-                        "equal to the first dimension of the input(Y).");
-    }
-    PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1],
-                      "The second dimension of input(X) must be equal to "
-                      "the second dimension of the input(Weight).");
-    PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2],
-                      "The second dimension of input(Y) must be equal to "
-                      "the third dimension of the input(Weight).");
-
-    if (ctx->HasInput("Bias")) {
-      auto bias_dims = ctx->GetInputDim("Bias");
-      PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL,
-                     "The Input(Bias) must be a 2-D tensor with "
-                     "the 2nd dimension fixed to 1 (a row vector).");
-      PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0],
-                        "The second dimension of input(Bias) must be equal "
-                        "to the first dimension of the input(Weight).");
-    }
-
-    ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The first input of bilinear_tensor_product operator.");
-    AddInput("Y", "The second input of bilinear_tensor_product operator.");
-    AddInput("Weight",
-             "The learnable parameters of bilinear_tensor_product operator.");
-    AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.")
-        .AsDispensable();
-    AddOutput("Out", "The output of bilinear_tensor_product operator.");
-    AddComment(R"DOC(
-Bilinear Tensor Product operator.
-Given input X and Y, a 3D tensor Weight and a Bias. Each column of the
-Output is computed by one slice $i = 1, . . . , k$ of the tensor:
-
-$$
-M =  (X W_i) * Y \\
-Out_i = \sum_j {M_j} + Bias_i
-$$
-
-Where $W_i$ is the $i$-th slice of Input(Weight);
-      $M_j$ is the $j$-th column of $M$;
-      $Out_i$ is the $i$-th column of Output(Out);
-      $Bias_i$ is a column vector, each element of it is equal to
-        the $i$-th element of $Bias$;
-
-)DOC");
-  }
-};
-
-class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(Weight) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    auto weight_dims = ctx->GetInputDim("Weight");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    PADDLE_ENFORCE_EQ(out_dims.size(), 2UL,
-                      "The input(Out@GRAD) must be a 2D Tensor.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[0], out_dims[0],
-        "The first dimension(batch_size) of input(Out@GRAD) must be "
-        "equal to the first dimension of the Input(X).");
-    PADDLE_ENFORCE_EQ(
-        weight_dims[0], out_dims[1],
-        "The second dimension of input(Out@GRAD) must be equal to "
-        "the third dimension of the Input(Weight).");
-
-    auto bias_grad_name = framework::GradVarName("Bias");
-    if (ctx->HasOutput(bias_grad_name)) {
-      ctx->SetOutputDim(bias_grad_name, {1, out_dims[1]});
-    }
-
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    auto weight_grad_name = framework::GradVarName("Weight");
-
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, y_dims);
-    }
-    if (ctx->HasOutput(weight_grad_name)) {
-      ctx->SetOutputDim(weight_grad_name, weight_dims);
-    }
-  }
-};
-
-class BilinearTensorProductGradOpDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("bilinear_tensor_product_grad");
-    op->SetAttrMap(Attrs());
-    op->SetInput("X", Input("X"));
-    op->SetInput("Y", Input("Y"));
-    op->SetInput("Weight", Input("Weight"));
-    if (ForwardOp().Inputs().count("Bias") > 0) {
-      op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-    }
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp,
-                  ops::BilinearTensorProductOpMaker,
-                  ops::BilinearTensorProductGradOpDescMaker);
-REGISTER_OPERATOR(bilinear_tensor_product_grad,
-                  ops::BilinearTensorProductOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    bilinear_tensor_product,
-    ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext,
-                                     double>);
-REGISTER_OP_CPU_KERNEL(
-    bilinear_tensor_product_grad,
-    ops::BilinearTensorProductGradKernel<paddle::platform::CPUDeviceContext,
-                                         float>,
-    ops::BilinearTensorProductGradKernel<paddle::platform::CPUDeviceContext,
-                                         double>);
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu
deleted file mode 100644
index c2b4f69e6854522b91dfd9fb5f738c0e5ffc77b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/bilinear_tensor_product_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    bilinear_tensor_product,
-    ops::BilinearTensorProductKernel<paddle::platform::CUDADeviceContext,
-                                     float>,
-    ops::BilinearTensorProductKernel<paddle::platform::CUDADeviceContext,
-                                     double>);
-REGISTER_OP_CUDA_KERNEL(
-    bilinear_tensor_product_grad,
-    ops::BilinearTensorProductGradKernel<paddle::platform::CUDADeviceContext,
-                                         float>,
-    ops::BilinearTensorProductGradKernel<paddle::platform::CUDADeviceContext,
-                                         double>);
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h
deleted file mode 100644
index 5017c3a457abc8865b9c20bec1c7c1429a4dfef4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bilinear_tensor_product_op.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class BilinearTensorProductKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    auto y_mat = EigenMatrix<T>::From(*y);
-    auto output_mat = EigenMatrix<T>::From(*out);
-
-    auto batch_size = x->dims()[0];
-    auto weight_dims = weight->dims();
-    int out_dim = weight_dims[0];
-    auto x_dim = weight_dims[1];
-    auto y_dim = weight_dims[2];
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    // Create the intermediate variable to caculate the result of
-    // Input(X) multiplied by Input(Weight_i), the formula is:
-    // left_mul = X Weight_i.
-    Tensor left_mul;
-    left_mul.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
-                             ctx.GetPlace());
-    auto left_mul_mat = EigenMatrix<T>::From(left_mul);
-
-    for (int i = 0; i < out_dim; ++i) {
-      auto output_col_vec = output_mat.chip(i, 1);
-      Tensor weight_mat =
-          weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-      math::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
-          CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data<T>(),
-          weight_mat.data<T>(), 0, left_mul.data<T>());
-      output_col_vec.device(place) =
-          (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
-    }
-    if (bias) {
-      auto bias_vec = EigenMatrix<T>::From(*bias);
-      Eigen::DSizes<int, 2> bcast(batch_size, 1);
-      output_mat.device(place) = bias_vec.broadcast(bcast).eval() + output_mat;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* y = ctx.Input<Tensor>("Y");
-    const Tensor* weight = ctx.Input<Tensor>("Weight");
-    Tensor* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    Tensor* d_y = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    Tensor* d_weight = ctx.Output<Tensor>(framework::GradVarName("Weight"));
-    Tensor* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-    const Tensor* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    auto batch_size = x->dims()[0];
-    auto weight_dims = weight->dims();
-    int out_dim = weight_dims[0];
-    auto x_dim = weight_dims[1];
-    auto y_dim = weight_dims[2];
-
-    auto x_mat = EigenMatrix<T>::From(*x);
-    auto y_mat = EigenMatrix<T>::From(*y);
-    auto d_out_mat = EigenMatrix<T>::From(*d_out);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    // Create the intermediate variable to calculate the Output(Y@Grad).
-    Tensor x_scale;
-    x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
-                            ctx.GetPlace());
-    auto x_scale_mat = EigenMatrix<T>::From(x_scale);
-
-    // Create the intermediate variable to calculate the Output(X@Grad).
-    Tensor y_scale;
-    y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
-                            ctx.GetPlace());
-    auto y_scale_mat = EigenMatrix<T>::From(y_scale);
-
-    math::SetConstant<DeviceContext, T> set_zero;
-
-    if (d_x) {
-      d_x->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, d_x, static_cast<T>(0));
-    }
-
-    if (d_y) {
-      d_y->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, d_y, static_cast<T>(0));
-    }
-
-    if (d_weight) {
-      d_weight->mutable_data<T>(ctx.GetPlace());
-    }
-
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-
-    // Caculate the Output(X@Grad) and Output(Y@Grad).
-    if (d_x || d_y || d_weight) {
-      Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
-      Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
-      Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
-
-      for (int i = 0; i < out_dim; ++i) {
-        Tensor weight_i = weight->Slice(i, i + 1).Resize(
-            framework::make_ddim({x_dim, y_dim}));
-        auto output_vec = d_out_mat.chip(i, 1);
-
-        if (d_x) {
-          y_scale_mat.device(place) =
-              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
-                  .broadcast(bcast_for_x)
-                  .eval() *
-              y_mat;
-          blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
-                    y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
-        }
-
-        if (d_y || d_weight) {
-          auto output_vec_y =
-              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
-                  .broadcast(bcast_for_y)
-                  .eval();
-          x_scale_mat.device(place) = output_vec_y * x_mat;
-          if (d_y) {
-            blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
-                      x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
-          }
-          if (d_weight) {
-            Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
-                framework::make_ddim({x_dim, y_dim}));
-            blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1,
-                      x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>());
-          }
-        }
-      }
-    }
-
-    // calculate the gradient of Input(Bias).
-    if (d_bias) {
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias);
-      d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
deleted file mode 100644
index 51c4d878142dcd93a170c9ea4211b9c6ec8e4422..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/bpr_loss_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class BprLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto label_dims = ctx->GetInputDim("Label");
-    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(rank, label_dims.size(),
-                      "Input(X) and Input(Label) shall have the same rank.");
-
-    if (ctx->IsRuntime() || (framework::product(x_dims) > 0 &&
-                             framework::product(label_dims) > 0)) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                        framework::slice_ddim(label_dims, 0, rank - 1),
-                        "Input(X) and Input(Label) shall have the same shape "
-                        "except the last dimension.");
-    }
-
-    auto y_dims = x_dims;
-    y_dims[rank - 1] = 1;
-    ctx->SetOutputDim("Y", y_dims);
-    ctx->ShareLoD("X", /*->*/ "Y");
-  }
-
- protected:
-  // Explicitly set that the data type of computation kernel of Seq-bpr
-  // is determined by its input "X".
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class BprLossGradientOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@GRAD) shoudl be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@GRAD) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto label_dims = ctx->GetInputDim("Label");
-    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
-    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
-                      "Input(Y@Grad) and Input(X) should have the same rank.");
-    PADDLE_ENFORCE_EQ(label_dims.size(), rank,
-                      "Input(Label) and Input(X) should have the same rank.");
-    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      framework::slice_ddim(label_dims, 0, rank - 1),
-                      "The Input(X) and Input(Label) should have the same "
-                      "shape except the last dimension.");
-    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      framework::slice_ddim(dy_dims, 0, rank - 1),
-                      "The Input(X) and Input(Y@Grad) should have the same "
-                      "shape except the last dimension.");
-    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
-                      "The last dimension of Input(Y@Grad) should be 1.");
-    PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
-                      " the last dimension of Input(Label) should be 1.");
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->ShareLoD("X", framework::GradVarName("X"));
-  }
-
- protected:
-  // Explicitly set that the data type of computation kernel of cross_entropy
-  // is determined by its input "X".
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class BprLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), a tensor whose last dimension "
-             "size is equal to the number of classes. This input is a "
-             "real number.");
-    AddInput(
-        "Label",
-        "(Tensor), the tensor which represents the ground truth. It has the "
-        "same shape with 'X' except the last dimension. the last dimension "
-        "size is 1.");
-    AddOutput("Y",
-              "(Tensor, default Tensor<float>), a tensor whose shape is same "
-              "with 'X' except that the last dimension size is 1. It "
-              "represents the sequence bpr loss.");
-    AddComment(R"DOC(
-Bayesian Personalized Ranking Loss Operator.
-
-This operator belongs to pairwise ranking loss. Label is the desired item.
-The loss at a given point in one session is defined as:
-$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$
-
-Learn more details by reading paper <session-based recommendations with recurrent
-neural networks>(https://arxiv.org/abs/1511.06939)
-
-)DOC");
-  }
-};
-
-class BprLossGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("bpr_loss_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Label", Input("Label"));
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPUCtx = paddle::platform::CPUDeviceContext;
-
-REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker,
-                  ops::BprLossGradDescMaker);
-REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
-REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>,
-                       ops::BprLossOpKernel<CPUCtx, double>);
-REGISTER_OP_CPU_KERNEL(bpr_loss_grad,
-                       ops::BprLossGradientOpKernel<CPUCtx, float>,
-                       ops::BprLossGradientOpKernel<CPUCtx, double>);
diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h
deleted file mode 100644
index a01666596b62cd0f8433e6bc290ed92ba77966ad..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bpr_loss_op.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-/*Todo:
- *Find a way to adapt TolerableValue, using blas or eigen.
- */
-template <typename T>
-struct TolerableValue {
-  HOSTDEVICE T operator()(const T& x) const {
-    PADDLE_ENFORCE_EQ(std::is_floating_point<T>::value, true);
-    const T kApproInf = 1e20;
-    if (x == INFINITY) return kApproInf;
-    if (x == -INFINITY) return -kApproInf;
-    return x;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class BprLossOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* label = ctx.Input<Tensor>("Label");
-    auto* y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
-    int rank = x->dims().size();
-
-    Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
-    Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1);
-    Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
-
-    const framework::Tensor* logits = &x_2d;
-    const framework::Tensor* labels = &labels_2d;
-    framework::Tensor* out = &y_2d;
-
-    const int step_size = logits->dims()[0];
-    const int class_num = logits->dims()[1];
-    const T* logits_data = logits->data<T>();
-    T* loss_data = out->data<T>();
-
-    const int64_t* label_data = labels->data<int64_t>();
-    for (int i = 0; i < step_size; ++i) {
-      int lbl_pos = label_data[i];
-      PADDLE_ENFORCE_GE(lbl_pos, 0);
-      PADDLE_ENFORCE_LT(lbl_pos, class_num);
-      int index_pos = i * class_num + lbl_pos;
-      T sum = static_cast<T>(0);
-      for (int j = 0; j < class_num; j++) {
-        if (j == lbl_pos) continue;
-        int index_neg = i * class_num + j;
-        sum += TolerableValue<T>()(-std::log(
-            1.0f + TolerableValue<T>()(std::exp(logits_data[index_neg] -
-                                                logits_data[index_pos]))));
-      }
-      loss_data[i] = -sum / (class_num - 1);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class BprLossGradientOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto* label = ctx.Input<Tensor>("Label");
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    const size_t step_size = static_cast<size_t>(x->dims()[0]);
-    const size_t num_classes = static_cast<size_t>(x->dims()[1]);
-    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    const T* dy_data = dy->data<T>();
-    const T* x_data = x->data<T>();
-    const int64_t* label_data = label->data<int64_t>();
-
-    for (size_t sample_id = 0; sample_id < step_size; sample_id++) {
-      for (size_t x_offset = sample_id * num_classes;
-           x_offset < (sample_id + 1) * num_classes; x_offset++) {
-        dx_data[x_offset] = static_cast<T>(0);
-      }
-      auto p_index = sample_id * num_classes + label_data[sample_id];
-      for (size_t ni = 0; ni < num_classes; ni++) {
-        if (label_data[sample_id] == ni) continue;
-        auto n_index = sample_id * num_classes + ni;
-        auto grad_ = -dy_data[sample_id] /
-                     ((num_classes - 1) *
-                      (1.0f + TolerableValue<T>()(std::exp(x_data[p_index] -
-                                                           x_data[n_index]))));
-        dx_data[p_index] += grad_;
-        dx_data[n_index] -= grad_;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
deleted file mode 100644
index 0c517cc757ca3f6f1ff7f4191ab2d529890b7154..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cast_op.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cast_op.h"
-#include <memory>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-
-class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input tensor of cast op");
-    AddOutput("Out", "The output tensor of cast op");
-    AddAttr<int>("out_dtype", "output data type");
-    AddAttr<int>("in_dtype", "input data type");
-    AddComment(R"DOC(
-Cast Operator.
-
-This Operator casts the input tensor to another data type and
-returns the Output Tensor. It's meaningless if the output dtype equals
-the input dtype, but it's fine if you do so.
-
-)DOC");
-  }
-};
-
-class CastOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"), "The input of cast op must be set");
-    PADDLE_ENFORCE(context->HasOutput("Out"),
-                   "The output of cast op must be set");
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-    context->ShareLoD("X", "Out");
-  }
-};
-
-class CastOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto grad = new framework::OpDesc();
-    grad->SetType("cast");
-    grad->SetInput("X", OutputGrad("Out"));
-    grad->SetOutput("Out", InputGrad("X"));
-    grad->SetAttr("out_dtype", GetAttr("in_dtype"));
-    grad->SetAttr("in_dtype", GetAttr("out_dtype"));
-    return std::unique_ptr<framework::OpDesc>(grad);
-  }
-};
-
-class CastOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
-    // CastOp kernel's device type is decided by input tensor place
-    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
-    return kt;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker,
-                  ops::CastOpInferShape, ops::CastOpProtoMaker);
-REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
-                       ops::CastOpKernel<CPU, double>,
-                       ops::CastOpKernel<CPU, int>,
-                       ops::CastOpKernel<CPU, int64_t>,
-                       ops::CastOpKernel<CPU, bool>,
-                       ops::CastOpKernel<CPU, uint8_t>,
-                       ops::CastOpKernel<CPU, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu
deleted file mode 100644
index 657d162878c108760585ca9bd58e2fd34bf1fef3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cast_op.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cast_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-template <typename T>
-using CastOpKernel =
-    paddle::operators::CastOpKernel<paddle::platform::CUDADeviceContext, T>;
-
-REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
-                        CastOpKernel<int>, CastOpKernel<int64_t>,
-                        CastOpKernel<bool>, CastOpKernel<uint8_t>,
-                        CastOpKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h
deleted file mode 100644
index 8fa0416049f8fa128d7ab61f8350b41960f07263..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cast_op.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename InT, typename OutT>
-struct CastOpTransformFunctor {
-  HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
-};
-
-template <typename DeviceContext, typename InT>
-struct CastOpFunctor {
-  const framework::Tensor* in_;
-  framework::Tensor* out_;
-  const DeviceContext& ctx_;
-  CastOpFunctor(const framework::Tensor* in, framework::Tensor* out,
-                const DeviceContext& ctx)
-      : in_(in), out_(out), ctx_(ctx) {}
-
-  template <typename OutT>
-  void apply() const {
-    auto* in_begin = in_->data<InT>();
-    auto numel = in_->numel();
-    auto* in_end = in_begin + numel;
-    auto* out_begin = out_->mutable_data<OutT>(ctx_.GetPlace());
-    platform::Transform<DeviceContext> trans;
-    trans(ctx_, in_begin, in_end, out_begin,
-          CastOpTransformFunctor<InT, OutT>());
-  }
-};
-
-template <typename DeviceContext, typename InT>
-class CastOpKernel : public framework::OpKernel<InT> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("out_dtype")),
-        CastOpFunctor<DeviceContext, InT>(
-            in, out, context.template device_context<DeviceContext>()));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc
deleted file mode 100644
index bf766a056a767f4b5e152800e9305d1f51f6d901..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/center_loss_op.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/center_loss_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-class CenterLossOp : public framework::OperatorWithKernel {
- public:
-  CenterLossOp(const std::string &type,
-               const framework::VariableNameMap &inputs,
-               const framework::VariableNameMap &outputs,
-               const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of CenterLoss should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-
-    PADDLE_ENFORCE(ctx->HasInput("CenterUpdateRate"),
-                   "Input(CenterUpdateRate) of CenterLoss should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input(Label) of CenterLoss should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("Centers"),
-                   "Input(Centers) of CenterLoss should not be null.");
-
-    PADDLE_ENFORCE(
-        ctx->HasOutput("SampleCenterDiff"),
-        "Output(SampleCenterDiff) of CenterLoss should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
-                   "Output(Loss) of CenterLoss should not be null.");
-
-    PADDLE_ENFORCE(
-        ctx->HasOutput("CentersOut"),
-        "Output(CentersOut) of CenterLoss shared data with Centers.");
-
-    ctx->SetOutputDim("SampleCenterDiff",
-                      {x_dims[0], product(x_dims) / x_dims[0]});
-    ctx->SetOutputDim("CentersOut", ctx->GetInputDim("Centers"));
-    ctx->SetOutputDim("Loss", {x_dims[0], 1});
-    ctx->ShareLoD("X", /*->*/ "Loss");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class CenterLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input tensor of center_loss operator.");
-    AddInput("Label", "(Tensor) Input tensor of center_loss operator.");
-    AddInput("Centers", "(Tensor) Input tensor of center_loss operator.");
-    AddInput("CenterUpdateRate",
-             "(Tensor) Input tensor of center_loss operator.");
-
-    AddOutput("CentersOut", "(Tensor) Input tensor of center_loss operator.");
-    AddOutput("SampleCenterDiff",
-              "(Tensor) output tensor of center_loss operator.");
-    AddOutput("Loss", "(Tensor) Output tensor of center_loss operator.");
-
-    AddAttr<int>("cluster_num",
-                 "The output cluster num of the center_loss operator.");
-    AddAttr<bool>("need_update", "whether need to update center info.");
-    AddComment(R"DOC(
-**CenterLoss operator**
-implemention of the center loss function in the papper<<A Discriminative 
-Feature Learning Approach for Deep Face Recognition>>, equations in this  implement
-is:loss = 1/2 * (x-y)^2 ,where x(X) means the deep feature(output of last hidden layer )
-and y(Label) the target label 
-)DOC");
-  }
-};
-
-class CenterLossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("SampleCenterDiff"),
-                   "Input(SampleCenterDiff) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
-                   "Input(Loss) should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X) should not be null");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_grad_name = framework::GradVarName("X");
-
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>("SampleCenterDiff")->type(), ctx.device_context());
-  }
-};
-
-class CenterLossOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
-    retv->SetType("center_loss_grad");
-    retv->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
-    retv->SetInput("SampleCenterDiff", Output("SampleCenterDiff"));
-    retv->SetInput("X", Input("X"));
-    retv->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-
-    retv->SetAttrMap(Attrs());
-    return retv;
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPUCtx = paddle::platform::CPUDeviceContext;
-
-REGISTER_OPERATOR(center_loss, ops::CenterLossOp, ops::CenterLossOpMaker,
-                  ops::CenterLossOpGradMaker);
-
-REGISTER_OPERATOR(center_loss_grad, ops::CenterLossGradOp);
-
-REGISTER_OP_CPU_KERNEL(center_loss, ops::CenterLossKernel<CPUCtx, float>,
-                       ops::CenterLossKernel<CPUCtx, double>);
-
-REGISTER_OP_CPU_KERNEL(center_loss_grad,
-                       ops::CenterLossGradKernel<CPUCtx, float>,
-                       ops::CenterLossGradKernel<CPUCtx, double>);
diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu
deleted file mode 100644
index 10b65fa215adc51ecc5c4ff482803a4c8379a757..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/center_loss_op.cu
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <iostream>
-#include "paddle/fluid/operators/center_loss_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
-__global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers,
-                                 const int64_t *ids, const int64_t N,
-                                 const int64_t K, const int64_t D) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
-
-  while (idy < K) {
-    int64_t id = ids[idy];
-    PADDLE_ENFORCE(id >= 0, "received id:", id);
-    PADDLE_ENFORCE(id < N, "received id:", id);
-    T *out = centers_diff + idy * D;
-    const T *x = X + idy * D;
-    const T *cent = centers + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
-      out[i] = x[i] - cent[i];
-    }
-    idy += BlockDimY * GridDimX;
-  }
-}
-
-template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
-__global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids,
-                              const int64_t N, const int64_t K, const int64_t D,
-                              const T *alpha) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
-  int count;
-  while (idy < K) {
-    int count = 1;
-    int64_t id = ids[idy];
-    PADDLE_ENFORCE(id >= 0, "received id:", id);
-    PADDLE_ENFORCE(id < N, "received id:", id);
-
-    for (int i = 0; i < K; i++) {
-      if (ids[i] == id) {
-        count++;
-      }
-    }
-    const T *diff = centers_diff + idy * D;
-    T *cent = centers + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
-      paddle::platform::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
-    }
-    idy += BlockDimY * GridDimX;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CenterLossCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto &device_context = ctx.template device_context<DeviceContext>();
-    auto stream = device_context.stream();
-    auto *X = ctx.Input<Tensor>("X");  // deep feature
-    auto *labels = ctx.Input<Tensor>("Label");
-    auto *centers = ctx.Input<Tensor>("Centers");
-    auto *update_rate = ctx.Input<Tensor>("CenterUpdateRate");
-    int cluster_num = ctx.Attr<int>("cluster_num");
-    auto *lr_center = update_rate->data<T>();
-    bool need_update = static_cast<T>(ctx.Attr<bool>("need_update"));
-
-    auto x_data = X->data<T>();
-    auto label_data = labels->data<int64_t>();
-
-    auto x_dims = X->dims();
-    int batch_size = x_dims[0];
-    const int deep_feat_dim = x_dims[1];
-
-    auto *centers_diff = ctx.Output<Tensor>("SampleCenterDiff");
-    auto centers_diff_data = centers_diff->mutable_data<T>(ctx.GetPlace());
-
-    auto centers_data = centers->data<T>();
-    auto centers_dim = centers->dims();
-    auto *out_loss = ctx.Output<Tensor>("Loss");
-    auto loss_data = out_loss->mutable_data<T>(ctx.GetPlace());
-
-    auto *centers_out = ctx.Output<Tensor>("CentersOut");
-    auto *centers_out_data = centers_out->mutable_data<T>(ctx.GetPlace());
-
-    auto ctx_place = ctx.GetPlace();
-    if (centers != centers_out) {
-      framework::TensorCopy(
-          *static_cast<const framework::Tensor *>(centers), ctx_place,
-          *platform::DeviceContextPool::Instance().Get(ctx_place),
-          static_cast<framework::Tensor *>(centers_out));
-    }
-
-    int64_t numel = X->numel();
-
-    size_t N = centers->dims()[0];
-    size_t D = centers->dims()[1];
-    size_t K = labels->numel();
-
-    dim3 threads(128, 8);
-    dim3 grids(8, 1);
-
-    ComputeDifferent<T, 128, 8, 8><<<grids, threads, 0, stream>>>(
-        centers_diff_data, x_data, centers_data, label_data, N, K, D);
-
-    auto &place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto sub_result = EigenMatrix<T>::From(*centers_diff);
-
-    auto sub_res_pow2 = (sub_result * sub_result) / T(2.0);
-    auto z = EigenVector<T>::Flatten(*out_loss);
-    z.device(place) = sub_res_pow2.sum(Eigen::array<int, 1>({{1}}));
-    if (need_update) {
-      UpdateCenters<T, 128, 8, 8><<<grids, threads, 0, stream>>>(
-          centers_out_data, centers_diff_data, label_data, N, K, D, lr_center);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using GPUCtx = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(center_loss, ops::CenterLossCUDAKernel<GPUCtx, float>,
-                        ops::CenterLossCUDAKernel<GPUCtx, double>);
-
-REGISTER_OP_CUDA_KERNEL(center_loss_grad,
-                        ops::CenterLossGradKernel<GPUCtx, float>,
-                        ops::CenterLossGradKernel<GPUCtx, double>);
diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h
deleted file mode 100644
index f134bd0cd3c7a565019c92bf08ee4c565ba67ac5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/center_loss_op.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <cstring>
-#include <limits>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/functors.h"
-#include "paddle/fluid/platform/transform.h"
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-struct SubFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
-};
-
-template <typename DeviceContext, typename T>
-class CenterLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *X = ctx.Input<Tensor>("X");  // deep feature
-    auto *labels = ctx.Input<Tensor>("Label");
-    auto *centers = ctx.Input<Tensor>("Centers");
-    auto *update_rate = ctx.Input<Tensor>("CenterUpdateRate");
-    int cluster_num = ctx.Attr<int>("cluster_num");
-    auto *lr_center = update_rate->data<T>();
-    T alpha = lr_center[0];
-    bool need_update = static_cast<T>(ctx.Attr<bool>("need_update"));
-
-    auto x_data = X->data<T>();
-    auto label_data = labels->data<int64_t>();
-
-    auto centers_dim = centers->dims();
-    auto centers_data = centers->data<T>();
-
-    auto x_dims = X->dims();
-    int batch_size = x_dims[0];
-    int deep_feat_dim = x_dims[1];
-
-    auto centers_diff = ctx.Output<Tensor>("SampleCenterDiff");
-    auto centers_diff_data = centers_diff->mutable_data<T>(ctx.GetPlace());
-    auto *out_loss = ctx.Output<Tensor>("Loss");
-
-    auto *centers_out = ctx.Output<Tensor>("CentersOut");
-    auto *centers_out_data = centers_out->mutable_data<T>(ctx.GetPlace());
-
-    if (centers_out_data != centers_data) {
-      int size = centers_out->numel() * sizeof(T);
-      memcpy(centers_out_data, centers_data, size);
-    }
-
-    std::vector<int> center_update_count(cluster_num, 1);
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-
-    auto loss_data = out_loss->mutable_data<T>(ctx.GetPlace());
-
-    Tensor centers_diffacc;  // used to accumulate all diff
-    auto centers_diffacc_data =
-        centers_diffacc.mutable_data<T>(centers_dim, ctx.GetPlace());
-    int numel = centers_diffacc.numel();
-    std::memset(centers_diffacc_data, 0, sizeof(T) * numel);
-
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-    int tLabel;
-
-    const T *x_index;
-    const T *center_index;
-    T *center_out_index;
-    T *center_loss_diff_index;
-    T *acc_index;
-    platform::Transform<DeviceContext> trans;
-
-    for (int i = 0; i < batch_size; ++i) {
-      tLabel = label_data[i];
-      center_update_count[tLabel]++;
-      x_index = x_data + i * deep_feat_dim;                  // xi index
-      center_index = centers_data + tLabel * deep_feat_dim;  // center index
-      center_loss_diff_index = centers_diff_data + i * deep_feat_dim;
-      trans(dev_ctx, x_index, x_index + deep_feat_dim, center_index,
-            center_loss_diff_index, SubFunctor<T>());
-
-      acc_index = centers_diffacc_data + tLabel * deep_feat_dim;
-      blas.VADD(deep_feat_dim, center_loss_diff_index, acc_index,
-                acc_index);  // accumulate
-      loss_data[i] = blas.DOT(deep_feat_dim, center_loss_diff_index,
-                              center_loss_diff_index) /
-                     T(2.0);
-    }
-
-    // update centers data
-    if (need_update == true) {
-      for (int i = 0; i < cluster_num; i++) {
-        acc_index = centers_diffacc_data + i * deep_feat_dim;
-        center_out_index = centers_out_data + i * deep_feat_dim;
-        T scale = alpha / center_update_count[i];
-        blas.SCAL(deep_feat_dim, scale, acc_index);
-        blas.VADD(deep_feat_dim, acc_index, center_out_index, center_out_index);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CenterLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *in0 = context.Input<Tensor>("SampleCenterDiff");
-    auto *in1 = context.Input<Tensor>(framework::GradVarName("Loss"));
-    auto *x_g = context.Output<Tensor>(framework::GradVarName("X"));
-    auto sub_result = EigenMatrix<T>::From(*in0);
-    auto out_grad = EigenMatrix<T>::From(*in1);
-
-    auto x_dims = x_g->dims();
-    int cols = x_g->numel() / x_dims[0];
-    // calculate gradient
-    auto grad_mat =
-        (out_grad.broadcast(Eigen::array<int, 2>({{1, cols}}))) * sub_result;
-
-    // propagate back to input
-    auto &eigen_place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    x_g->mutable_data<T>(context.GetPlace());
-    // eigen matrix
-    auto x_grad =
-        EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
-    x_grad.device(eigen_place) = grad_mat;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc
deleted file mode 100644
index 21dfaf912a140fce0f444c99c123da55edc18935..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/chunk_eval_op.h"
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class ChunkEvalOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Inference"),
-                   "Input(Inference) of ChunkEvalOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input(Label) of ChunkEvalOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Precision"),
-                   "Output(Precision) of ChunkEvalOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Recall"),
-                   "Output(Recall) of ChunkEvalOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
-                   "Output(F1-Score) of ChunkEvalOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"),
-                   "Output(NumInferChunks) of ChunkEvalOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"),
-                   "Output(NumLabelChunks) of ChunkEvalOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("NumCorrectChunks"),
-        "Output(NumCorrectChunks) of ChunkEvalOp should not be null.");
-
-    auto inference_dim = ctx->GetInputDim("Inference");
-    auto label_dim = ctx->GetInputDim("Label");
-
-    PADDLE_ENFORCE(inference_dim == label_dim,
-                   "Inference's shape must be the same as Label's shape.");
-
-    bool use_padding = ctx->HasInput("SeqLength");
-    if (use_padding) {
-      PADDLE_ENFORCE(inference_dim.size() == 3,
-                     "when SeqLength is provided, Inference should be of dim 3 "
-                     "(batch, bucket, 1)");
-      auto seq_length_dim = ctx->GetInputDim("SeqLength");
-      PADDLE_ENFORCE(seq_length_dim.size() == 1, "seq_length should be rank 1");
-    }
-
-    ctx->SetOutputDim("Precision", {1});
-    ctx->SetOutputDim("Recall", {1});
-    ctx->SetOutputDim("F1-Score", {1});
-    ctx->SetOutputDim("NumInferChunks", {1});
-    ctx->SetOutputDim("NumLabelChunks", {1});
-    ctx->SetOutputDim("NumCorrectChunks", {1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(framework::proto::VarType::FP32,
-                                   platform::CPUPlace());
-  }
-};
-
-class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Inference",
-             "(Tensor, default: Tensor<int64_t>). "
-             "Predictions from the network.");
-    AddInput("Label",
-             "(Tensor, default: Tensor<int64_t>). The true tag sequences.");
-    AddInput("SeqLength",
-             "(Tensor, default: Tensor<int64_t>). The length of each sequence, "
-             "used when Inference and Label are Tensor type .")
-        .AsDispensable();
-    AddOutput("Precision",
-              "(float). The evaluated precision (called positive predictive "
-              "value) of chunks on the given mini-batch.");
-    AddOutput("Recall",
-              "(float). The evaluated recall (true positive rate or "
-              "sensitivity) of chunks on the given mini-batch.");
-    AddOutput("F1-Score",
-              "(float). The evaluated F1-Score on the given mini-batch.");
-    AddOutput("NumInferChunks",
-              "(int64_t). The number of chunks in Inference on the given "
-              "mini-batch.");
-    AddOutput(
-        "NumLabelChunks",
-        "(int64_t). The number of chunks in Label on the given mini-batch.");
-    AddOutput(
-        "NumCorrectChunks",
-        "(int64_t). The number of chunks both in Inference and Label on the "
-        "given mini-batch.");
-    AddAttr<int>("num_chunk_types",
-                 "The number of chunk type. See the description for details.");
-    AddAttr<std::string>("chunk_scheme",
-                         "The labeling scheme indicating "
-                         "how to encode the chunks. Must be IOB, IOE, IOBES or "
-                         "plain. See the description"
-                         "for details.")
-        .SetDefault("IOB");
-    AddAttr<std::vector<int>>("excluded_chunk_types",
-                              "A list including chunk type ids "
-                              "indicating chunk types that are not counted. "
-                              "See the description for details.")
-        .SetDefault(std::vector<int>{});
-    AddComment(R"DOC(
-For some basics of chunking, please refer to
-'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
-
-ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
-and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
-Here is a NER example of labeling for these tagging schemes:
-   
-          Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
-   IO     I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
-   IOB    B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
-   IOE    I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
-   IOBES  B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
-
-There are three chunk types(named entity types) including PER(person), ORG(organization)
-and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
-
-Since the calculations actually use label ids rather than labels, extra attention
-should be paid when mapping labels to ids to make CheckEvalOp work. The key point
-is that the listed equations are satisfied by ids.
-   
-   tag_type = label % num_tag_type
-   chunk_type = label / num_tag_type
-
-where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
-is the num of chunk types, and `tag_type` get its value from the following table.
-   
-   Scheme Begin Inside End   Single
-    plain   0     -      -     -
-    IOB     0     1      -     -
-    IOE     -     0      1     -
-    IOBES   0     1      2     3
-
-Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
-PER and LOC. To satisfy the above equations, the label map can be like this:
-
-   B-ORG  0
-   I-ORG  1
-   B-PER  2
-   I-PER  3
-   B-LOC  4
-   I-LOC  5
-   O      6
-
-It's not hard to verify the equations noting that the num of chunk types
-is 3 and the num of tag types in IOB scheme is 2. For example, the label
-id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
-I-LOC is 2, which consistent with the results from the equations.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp,
-                             ops::ChunkEvalOpMaker);
-REGISTER_OP_CPU_KERNEL(chunk_eval,
-                       ops::ChunkEvalKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h
deleted file mode 100644
index 63c77e52fb0a70cee8c200ad6fe54b2d2fbe6772..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/chunk_eval_op.h
+++ /dev/null
@@ -1,266 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-class ChunkEvalKernel : public framework::OpKernel<T> {
- public:
-  struct Segment {
-    int begin;
-    int end;
-    int type;
-    bool operator==(const Segment& y) const {
-      return begin == y.begin && end == y.end && type == y.type;
-    }
-  };
-
-  void GetSegments(const int64_t* label, int length,
-                   std::vector<Segment>* segments, int num_chunk_types,
-                   int num_tag_types, int other_chunk_type, int tag_begin,
-                   int tag_inside, int tag_end, int tag_single) const {
-    segments->clear();
-    segments->reserve(length);
-    int chunk_start = 0;
-    bool in_chunk = false;
-    int tag = -1;
-    int type = other_chunk_type;
-    for (int i = 0; i < length; ++i) {
-      int prev_tag = tag;
-      int prev_type = type;
-      PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types);
-      tag = label[i] % num_tag_types;
-      type = label[i] / num_tag_types;
-      if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type,
-                               tag_begin, tag_inside, tag_end, tag_single)) {
-        Segment segment{
-            chunk_start,  // begin
-            i - 1,        // end
-            prev_type,
-        };
-        segments->push_back(segment);
-        in_chunk = false;
-      }
-      if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
-                     tag_begin, tag_inside, tag_end, tag_single)) {
-        chunk_start = i;
-        in_chunk = true;
-      }
-    }
-    if (in_chunk) {
-      Segment segment{
-          chunk_start,  // begin
-          length - 1,   // end
-          type,
-      };
-      segments->push_back(segment);
-    }
-  }
-
-  bool ChunkEnd(int prev_tag, int prev_type, int tag, int type,
-                int other_chunk_type, int tag_begin, int tag_inside,
-                int tag_end, int tag_single) const {
-    if (prev_type == other_chunk_type) return false;
-    if (type == other_chunk_type) return true;
-    if (type != prev_type) return true;
-    if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single;
-    if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single;
-    if (prev_tag == tag_end) return true;
-    if (prev_tag == tag_single) return true;
-    return false;
-  }
-
-  bool ChunkBegin(int prev_tag, int prev_type, int tag, int type,
-                  int other_chunk_type, int tag_begin, int tag_inside,
-                  int tag_end, int tag_single) const {
-    if (prev_type == other_chunk_type) return type != other_chunk_type;
-    if (type == other_chunk_type) return false;
-    if (type != prev_type) return true;
-    if (tag == tag_begin) return true;
-    if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single;
-    if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single;
-    if (tag == tag_single) return true;
-    return false;
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    // initialize to parse configurations
-    int num_chunk_types, num_tag_types;
-    int other_chunk_type;
-    int tag_begin, tag_inside, tag_end, tag_single;
-    std::vector<Segment> label_segments;
-    std::vector<Segment> output_segments;
-    std::set<int> excluded_chunk_types;
-
-    if (context.Attr<std::string>("chunk_scheme") == "IOB") {
-      num_tag_types = 2;
-      tag_begin = 0;
-      tag_inside = 1;
-      tag_end = -1;
-      tag_single = -1;
-    } else if (context.Attr<std::string>("chunk_scheme") == "IOE") {
-      num_tag_types = 2;
-      tag_begin = -1;
-      tag_inside = 0;
-      tag_end = 1;
-      tag_single = -1;
-    } else if (context.Attr<std::string>("chunk_scheme") == "IOBES") {
-      num_tag_types = 4;
-      tag_begin = 0;
-      tag_inside = 1;
-      tag_end = 2;
-      tag_single = 3;
-    } else if (context.Attr<std::string>("chunk_scheme") == "plain") {
-      num_tag_types = 1;
-      tag_begin = -1;
-      tag_inside = -1;
-      tag_end = -1;
-      tag_single = -1;
-    } else {
-      PADDLE_THROW("Unknown chunk scheme.");
-    }
-    other_chunk_type = num_chunk_types = context.Attr<int>("num_chunk_types");
-    excluded_chunk_types.insert(
-        context.Attr<std::vector<int>>("excluded_chunk_types").begin(),
-        context.Attr<std::vector<int>>("excluded_chunk_types").end());
-
-    auto* inference = context.Input<LoDTensor>("Inference");
-    auto place = inference->place();
-    auto* label = context.Input<LoDTensor>("Label");
-    auto* precision = context.Output<Tensor>("Precision");
-    auto* recall = context.Output<Tensor>("Recall");
-    auto* f1 = context.Output<Tensor>("F1-Score");
-    auto* num_infer_chunks = context.Output<Tensor>("NumInferChunks");
-    auto* num_label_chunks = context.Output<Tensor>("NumLabelChunks");
-    auto* num_correct_chunks = context.Output<Tensor>("NumCorrectChunks");
-
-    const int64_t* inference_data = inference->data<int64_t>();
-    const int64_t* label_data = label->data<int64_t>();
-    T* precision_data = precision->mutable_data<T>(place);
-    T* racall_data = recall->mutable_data<T>(place);
-    T* f1_data = f1->mutable_data<T>(place);
-    int64_t* num_infer_chunks_data =
-        num_infer_chunks->mutable_data<int64_t>(place);
-    int64_t* num_label_chunks_data =
-        num_label_chunks->mutable_data<int64_t>(place);
-    int64_t* num_correct_chunks_data =
-        num_correct_chunks->mutable_data<int64_t>(place);
-    *num_infer_chunks_data = 0;
-    *num_label_chunks_data = 0;
-    *num_correct_chunks_data = 0;
-
-    auto lod = label->lod();
-    bool use_padding = lod.empty();
-    int num_sequences = 0;
-
-    if (use_padding) {
-      auto dim1 = inference->dims()[1];
-      auto* seq_length_t = context.Input<Tensor>("SeqLength");
-      auto* seq_length_data = seq_length_t->data<int64_t>();
-      num_sequences = seq_length_t->dims()[0];
-
-      for (int i = 0; i < num_sequences; ++i) {
-        int seq_length = seq_length_data[i];
-        EvalOneSeq(inference_data + i * dim1, label_data + i * dim1, seq_length,
-                   &output_segments, &label_segments, num_infer_chunks_data,
-                   num_label_chunks_data, num_correct_chunks_data,
-                   num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
-                   tag_inside, tag_end, tag_single, excluded_chunk_types);
-      }
-    } else {
-      PADDLE_ENFORCE_EQ(lod.size(), 1UL,
-                        "Only support one level sequence now.");
-      PADDLE_ENFORCE(lod == inference->lod(),
-                     "LoD must be same between Inference and Label.");
-      num_sequences = lod[0].size() - 1;
-
-      for (int i = 0; i < num_sequences; ++i) {
-        int seq_length = lod[0][i + 1] - lod[0][i];
-        EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i],
-                   seq_length, &output_segments, &label_segments,
-                   num_infer_chunks_data, num_label_chunks_data,
-                   num_correct_chunks_data, num_chunk_types, num_tag_types,
-                   other_chunk_type, tag_begin, tag_inside, tag_end, tag_single,
-                   excluded_chunk_types);
-      }
-    }
-
-    *precision_data = !(*num_infer_chunks_data)
-                          ? 0
-                          : static_cast<T>(*num_correct_chunks_data) /
-                                (*num_infer_chunks_data);
-    *racall_data = !(*num_label_chunks_data)
-                       ? 0
-                       : static_cast<T>(*num_correct_chunks_data) /
-                             (*num_label_chunks_data);
-    *f1_data = !(*num_correct_chunks_data)
-                   ? 0
-                   : 2 * (*precision_data) * (*racall_data) /
-                         ((*precision_data) + (*racall_data));
-  }
-
-  void EvalOneSeq(const int64_t* output, const int64_t* label, int length,
-                  std::vector<Segment>* output_segments,
-                  std::vector<Segment>* label_segments,
-                  int64_t* num_output_segments, int64_t* num_label_segments,
-                  int64_t* num_correct, int num_chunk_types, int num_tag_types,
-                  int other_chunk_type, int tag_begin, int tag_inside,
-                  int tag_end, int tag_single,
-                  const std::set<int>& excluded_chunk_types) const {
-    GetSegments(output, length, output_segments, num_chunk_types, num_tag_types,
-                other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
-    GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
-                other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
-    size_t i = 0, j = 0;
-    while (i < output_segments->size() && j < label_segments->size()) {
-      if (output_segments->at(i) == label_segments->at(j) &&
-          excluded_chunk_types.count(output_segments->at(i).type) != 1) {
-        ++(*num_correct);
-      }
-      if (output_segments->at(i).end < label_segments->at(j).end) {
-        ++i;
-      } else if (output_segments->at(i).end > label_segments->at(j).end) {
-        ++j;
-      } else {
-        ++i;
-        ++j;
-      }
-    }
-    for (auto& segment : (*label_segments)) {
-      if (excluded_chunk_types.count(segment.type) != 1) {
-        ++(*num_label_segments);
-      }
-    }
-    for (auto& segment : (*output_segments)) {
-      if (excluded_chunk_types.count(segment.type) != 1) {
-        ++(*num_output_segments);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc
deleted file mode 100644
index 5720b295ecf8171540803aaadff43dfdcb20553b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/clip_by_norm_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
-                             ops::ClipByNormOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    clip_by_norm,
-    ops::ClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu
deleted file mode 100644
index 788eab7cb2bc57971b85159d0482367551862bf5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/clip_by_norm_op.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/clip_by_norm_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    clip_by_norm,
-    ops::ClipByNormKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h
deleted file mode 100644
index b35e9c72c334dea34e6024cc80eb82e12f6a0747..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using SelectedRows = framework::SelectedRows;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class ClipByNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto max_norm = context.Attr<T>("max_norm");
-    auto in_var = context.InputVar("X");
-
-    Tensor* output = nullptr;
-    const Tensor* input = nullptr;
-    if (in_var->IsType<framework::LoDTensor>()) {
-      input = context.Input<Tensor>("X");
-
-      output = context.Output<Tensor>("Out");
-      output->mutable_data<T>(context.GetPlace());
-    } else if (in_var->IsType<SelectedRows>()) {
-      auto* x = context.Input<SelectedRows>("X");
-
-      // merge ids in selected rows first
-      math::scatter::MergeAdd<DeviceContext, T> merge_func;
-      SelectedRows* merged_input =
-          const_cast<framework::Scope&>(context.scope())
-              .Var()
-              ->GetMutable<SelectedRows>();
-      merge_func(context.template device_context<DeviceContext>(), *x,
-                 merged_input);
-      input = &(merged_input->value());
-
-      SelectedRows* output_selected_rows = context.Output<SelectedRows>("Out");
-      output_selected_rows->set_rows(merged_input->rows());
-      output_selected_rows->set_height(merged_input->height());
-      output = output_selected_rows->mutable_value();
-      output->Resize(merged_input->value().dims());
-      output->mutable_data<T>(context.GetPlace());
-    } else {
-      PADDLE_THROW("Unexpected branch, input variable type is %s",
-                   framework::ToTypeName(in_var->Type()));
-    }
-
-    PADDLE_ENFORCE_NOT_NULL(input);
-
-    auto x = EigenVector<T>::Flatten(*input);
-    auto out = EigenVector<T>::Flatten(*output);
-    auto x_norm = x.square().sum().sqrt();
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    auto temp = (x_norm <= max_norm).template cast<T>().eval();
-    auto scaling = temp + (static_cast<T>(1) - temp) * max_norm / x_norm;
-    Eigen::array<int, 1> one_dim{{1}};
-    Eigen::DSizes<int, 1> m_dsize(input->numel());
-    out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
-  }
-};
-
-class ClipByNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ClipByNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ClipByNormOp should not be null.");
-    auto max_norm = ctx->Attrs().Get<float>("max_norm");
-    PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
-    auto x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) The input of clip_by_norm op."
-             "The number of dimensions must be between [1, 9].");
-    AddOutput("Out",
-              "(Tensor) The output of clip_by_norm op with shape as input(X)");
-    AddAttr<float>("max_norm", "(float) The maximum norm value.");
-    AddComment(R"DOC(
-ClipByNorm Operator.
-
-This operator limits the L2 norm of the input $X$ within $max\_norm$.
-If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be
-the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will
-be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
-shown in the following formula:
-
-$$
-Out = \\frac{max\\_norm * X}{norm(X)},
-$$
-
-where $norm(X)$ represents the L2 norm of $X$.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc
deleted file mode 100644
index d51f676c1db9919e8a08415c6db227cb9638880f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/clip_op.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/clip_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class ClipOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ClipOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ClipOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto max = ctx->Attrs().Get<float>("max");
-    auto min = ctx->Attrs().Get<float>("min");
-    PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-template <typename AttrType>
-class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor)The input of clip op."
-             "The number of dimensions must be between [1, 9].");
-    AddOutput("Out", "(Tensor)The output of clip op with shape as input(X)");
-    AddAttr<AttrType>(
-        "min", "(float)Minimum value, under which element is replaced by min.");
-    AddAttr<AttrType>(
-        "max", "(float)Maximum value, above which element is replaced by max");
-    AddComment(R"DOC(
-Clip Operator.
-
-The clip operator limits the value of given input within an interval. The
-interval is specified with arguments 'min' and 'max':
-
-$$
-Out = \min(\max(X, min), max)
-$$
-
-)DOC");
-  }
-};
-
-class ClipOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    }
-  }
-};
-
-class ClipGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("clip_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(ClipInplaceInferer, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(ClipGradInplaceInferer,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
-                  ops::ClipGradOpDescMaker, ops::ClipInplaceInferer);
-REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer);
-REGISTER_OP_CPU_KERNEL(
-    clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu
deleted file mode 100644
index 10bee444f6b9683686df77f0815e3c30348236cd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/clip_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/clip_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    clip, ops::ClipKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    clip_grad, ops::ClipGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h
deleted file mode 100644
index daf06f370ffb591e25ad846b94c8284aad19a8dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/clip_op.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using platform::Transform;
-
-template <typename T>
-class ClipFunctor {
- public:
-  explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
-  HOSTDEVICE T operator()(const T& x) const {
-    if (x < min_)
-      return min_;
-    else if (x > max_)
-      return max_;
-    else
-      return x;
-  }
-
- private:
-  T min_;
-  T max_;
-};
-
-template <typename T>
-class ClipGradFunctor {
- public:
-  explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
-  HOSTDEVICE T operator()(const T& x, const T& y) const {
-    return (y > min_ && y < max_) ? x : 0;
-  }
-
- private:
-  T min_;
-  T max_;
-};
-
-template <typename DeviceContext, typename T>
-class ClipKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto max = context.Attr<T>("max");
-    auto min = context.Attr<T>("min");
-    auto* x_var = context.InputVar("X");
-    if (x_var->IsType<framework::LoDTensor>()) {
-      auto* x = context.Input<framework::LoDTensor>("X");
-      auto* out = context.Output<framework::LoDTensor>("Out");
-      T* out_data = out->mutable_data<T>(context.GetPlace());
-      const T* x_data = x->data<T>();
-      int64_t numel = x->numel();
-      Transform<DeviceContext> trans;
-      trans(context.template device_context<DeviceContext>(), x_data,
-            x_data + numel, out_data, ClipFunctor<T>(min, max));
-    } else if (x_var->IsType<framework::SelectedRows>()) {
-      auto* x = context.Input<framework::SelectedRows>("X");
-      auto* out = context.Output<framework::SelectedRows>("Out");
-      PADDLE_ENFORCE_NE(x, out,
-                        "Inplace clip is not allowed when x is SelectedRows");
-      math::scatter::MergeAdd<DeviceContext, T> merge_func;
-      merge_func(context.template device_context<DeviceContext>(), *x, out);
-      auto* out_tensor = out->mutable_value();
-      auto* out_data = out_tensor->data<T>();
-      int64_t numel = out_tensor->numel();
-      Transform<DeviceContext> trans;
-      trans(context.template device_context<DeviceContext>(), out_data,
-            out_data + numel, out_data, ClipFunctor<T>(min, max));
-    } else {
-      PADDLE_THROW("ClipOp only supports LoDTensor and SelectedRows");
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ClipGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto max = context.Attr<T>("max");
-    auto min = context.Attr<T>("min");
-    auto* d_out =
-        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto* d_x =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    if (d_x != nullptr) {
-      auto* x = context.Input<framework::LoDTensor>("X");
-      int64_t numel = d_out->numel();
-      auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
-      const T* d_out_data = d_out->data<T>();
-      const T* x_data = x->data<T>();
-      Transform<DeviceContext> trans;
-      trans(context.template device_context<DeviceContext>(), d_out_data,
-            d_out_data + numel, x_data, d_x_data, ClipGradFunctor<T>(min, max));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
deleted file mode 100644
index ce425e7e698b3a8282e6b5b52ace9eaab70d87f5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ /dev/null
@@ -1,234 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <sstream>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
-
-namespace paddle {
-namespace operators {
-
-static framework::proto::VarType::Type kDefaultDtype =
-    framework::proto::VarType::Type::VarType_Type_BOOL;
-
-template <typename DeviceContext, typename T>
-class CoalesceTensorOp : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto &in_var_names = context.Inputs("Input");
-    auto &out_var_names = context.Outputs("Output");
-    auto &in_vars = context.MultiInputVar("Input");
-    auto out_vars = context.MultiOutputVar("Output");
-
-    PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0));
-    PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size());
-
-    for (size_t i = 0; i < in_var_names.size(); ++i) {
-      // Only support LoDTensor
-      PADDLE_ENFORCE_NOT_NULL(in_vars[i], "%s should not be nullptr,",
-                              in_var_names[i]);
-      PADDLE_ENFORCE_NOT_NULL(out_vars[i], "%s should not be nullptr,",
-                              out_var_names[i]);
-      PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensor>());
-      PADDLE_ENFORCE(out_vars[i]->IsType<framework::LoDTensor>());
-    }
-
-    auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
-
-    if (context.Attr<bool>("check_name")) {
-      for (size_t i = 0; i < in_var_names.size(); ++i) {
-        PADDLE_ENFORCE_EQ(in_var_names[i], out_var_names[i]);
-      }
-    } else {
-      // Init the output as input
-      for (size_t i = 0; i < in_tensors.size(); ++i) {
-        out_vars[i]->GetMutable<framework::LoDTensor>()->Resize(
-            in_tensors[i]->dims());
-      }
-    }
-
-    auto &dev_ctx = context.template device_context<DeviceContext>();
-
-    // Get numel and dtype
-    size_t numel = 0;
-    auto dtype = kDefaultDtype;
-    GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype,
-                       context.GetPlace());
-
-    // Alloc the continuous space
-    auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
-    fused_tensor->Resize(framework::make_ddim({static_cast<int64_t>(numel)}))
-        .mutable_data(context.GetPlace(), dtype);
-
-    // Init the continuous space
-    auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output");
-    size_t offset = 0;
-    size_t size_of_dtype = framework::SizeOfType(dtype);
-    if (context.Attr<bool>("copy_data")) {
-      for (size_t i = 0; i < in_var_names.size(); ++i) {
-        size_t len = static_cast<size_t>(in_tensors[i]->numel());
-        auto sub_tensor = fused_tensor->Slice(
-            static_cast<int64_t>(offset), static_cast<int64_t>(offset + len));
-        framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
-                              &sub_tensor);
-
-        offset += platform::Alignment(len * size_of_dtype, context.GetPlace()) /
-                  size_of_dtype;
-      }
-    } else if (context.Attr<bool>("set_constant")) {
-      math::SetConstant<DeviceContext, T> set_constant;
-      set_constant(dev_ctx, fused_tensor,
-                   static_cast<T>(context.Attr<float>("constant")));
-    }
-
-    // Make the outputs point to the continuous space.
-    offset = 0;
-    std::stringstream ss;
-    ss << "alloc_space_for_vars: ";
-    for (size_t i = 0; i < out_tensors.size(); ++i) {
-      size_t len = static_cast<size_t>(out_tensors[i]->numel());
-      auto dim = out_tensors[i]->dims();
-      out_tensors[i]
-          ->ShareDataWith(fused_tensor->Slice(
-              static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
-          .Resize(dim);
-      len = platform::Alignment(len * size_of_dtype, context.GetPlace()) /
-            size_of_dtype;
-      offset += len;
-      ss << "output(" << out_var_names[i] << ")  dim:(" << dim << ")"
-         << " address: " << out_tensors[i]->data<void>() << ", ";
-    }
-    VLOG(10) << ss.str();
-  }
-
- private:
-  void GetMemSizeAndDtype(
-      const std::vector<const framework::LoDTensor *> &lod_tensors,
-      const std::vector<std::string> var_names, size_t *numel,
-      framework::proto::VarType::Type *dtype,
-      const platform::Place &place) const {
-    PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
-    *numel = 0;
-    size_t size_of_dtype = 0;
-
-    std::stringstream ss;
-    ss << "alloc_space_for_vars: ";
-    for (size_t i = 0; i < var_names.size(); ++i) {
-      PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
-                     var_names[i]);
-
-      auto p_dtype = lod_tensors[i]->type();
-      if (*dtype == kDefaultDtype) {
-        PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.",
-                          var_names[i], kDefaultDtype);
-        *dtype = p_dtype;
-        size_of_dtype = framework::SizeOfType(p_dtype);
-      }
-      PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal.");
-
-      auto size = lod_tensors[i]->numel();
-      PADDLE_ENFORCE_GT(size, 0);
-      ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
-         << "), ";
-      *numel += platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
-                                    place) /
-                size_of_dtype;
-    }
-
-    VLOG(10) << ss.str();
-  }
-};
-
-class AllocContinuousSpaceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {}
-};
-
-class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(vector<LoDTensor>) The input tensors of"
-             " coalesce_tensor operator.")
-        .AsDuplicable();
-    AddOutput("Output",
-              "(vector<LoDTensor>) The output "
-              "tensors of coalesce_tensor operator. And the address "
-              "of output tensors are continuous, they are sliced from the "
-              "tensor of FusedOutput.")
-        .AsDuplicable();
-    AddOutput("FusedOutput",
-              "(LoDTensor) The output tensor "
-              "of coalesce_tensor operator. And the tensors of"
-              " Output is sliced from the tensor of FusedOutput.");
-    AddAttr<bool>("copy_data", "Whether to copy the Input value to Output.")
-        .SetDefault(false);
-    AddAttr<bool>("set_constant",
-                  "Whether to set the Output with a constant value.")
-        .SetDefault(false);
-    AddAttr<float>("constant",
-                   "If set_constant is true, the constant value will be used "
-                   "to set the Output.")
-        .SetDefault(0.0);
-    AddAttr<bool>("check_name",
-                  "Whether to check the name of Input and Output to ensure "
-                  "they are the same separately.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-AllocContinuousSpace Operator.
-
-coalesce_tensor is used to make the address of Output
-continuous according to the Input. This Op will alloc a big tensor
-according to the tensors of Input, the dtype is the same with those input tensors,
-the size is the sum of those input tensors' numel, and the dim of the big
-tensor is {sum(numel)}. And the big tensor is stored in FusedOutput.
-The tensors of Output are sliced from the tensor of FusedOutput.
-Note that, the dtype of Input should be the same, and the dim of Input
-and Output should equal.
-The tensors of Input and Output could be the same or different. And
-coalesce_tensor allows copying the value of Input to Output, or
-setting the Output with a constant value.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(coalesce_tensor, paddle::operators::AllocContinuousSpaceOp,
-                  paddle::operators::AllocContinuousSpaceOpMaker);
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CPU_KERNEL(
-    coalesce_tensor,
-    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, plat::float16>,
-    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, int>,
-    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, float>,
-    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, double>);
-
-#ifdef PADDLE_WITH_CUDA
-REGISTER_OP_CUDA_KERNEL(
-    coalesce_tensor,
-    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, int>,
-    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, float>,
-    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, double>);
-#endif
diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
deleted file mode 100644
index 89103f63d013d8c61ce848dac00c6497ffe1d858..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-include(operators)
-
-set(COLLECTIVE_DEPS "")
-if(WITH_GRPC)
-    set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
-else()
-    set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
-    if(WITH_BRPC_RDMA)
-        find_library(IBVERBS_LIBRARY NAMES ibverbs)
-        ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
-        SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
-
-
-        find_library(RDMACM_LIBRARY NAMES rdmacm)
-        ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
-        SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
-
-        set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} ibverbs rdmacm)
-    endif()
-endif()
-
-set(COLLECTIVE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
-list(REMOVE_DUPLICATES OPS)
-
-foreach(src ${OPS})
-    set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
-endforeach()
-
-register_operators(EXCLUDES c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
-
-if(WITH_GPU AND NOT WIN32)
-    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
-    op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common)
-endif()
-
-set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
-set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc
deleted file mode 100644
index 18c8f5d642332d96f6e76cf7f2e70b554cacbb89..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allgather_op.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class CAllGatherOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
-    int nranks = ctx->Attrs().Get<int>("nranks");
-    PADDLE_ENFORCE_GE(nranks, 2, "nranks should be >=2");
-    framework::DDim dim = ctx->GetInputDim("X");
-    dim[0] = dim[0] * nranks;
-    ctx->SetOutputDim("Out", dim);
-  }
-};
-
-class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor) tensor to be allgather");
-    AddOutput("Out", "(Tensor) the allgather result");
-    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
-        .SetDefault(0);
-    AddAttr<bool>(
-        "use_calc_stream",
-        "(bool default false) eject CUDA operations to calculation stream.")
-        .SetDefault(false);
-    AddAttr<int>("nranks",
-                 "Total trainer count of the distributed training job");
-    AddComment(R"DOC(
-CAllGather Operator
-each rank receives the aggregation of data from all ranks in the order of the ranks
-
-reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allgather
-)DOC");
-  }
-};
-
-class CAllGatherOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
-    retv->SetType("c_reducescatter");
-    retv->SetInput("X", OutputGrad("Out"));
-    retv->SetOutput("Out", InputGrad("X"));
-    retv->SetAttrMap(Attrs());
-    return retv;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OPERATOR(c_allgather, ops::CAllGatherOp, ops::CAllGatherOpGradMaker,
-                  ops::CAllGatherOpMaker);
-
-REGISTER_OP_CPU_KERNEL(c_allgather, ops::CAllGatherOpCPUKernel<float>,
-                       ops::CAllGatherOpCPUKernel<double>,
-                       ops::CAllGatherOpCPUKernel<int>,
-                       ops::CAllGatherOpCPUKernel<int64_t>,
-                       ops::CAllGatherOpCPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
deleted file mode 100644
index 14e2741e52e9cc11fd3de830d9224d8201898c77..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-
-#include <memory>
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto in = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
-
-    int nranks = ctx.Attr<int>("nranks");
-    int rid = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
-    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
-    PADDLE_ENFORCE_EQ(nranks, comm->nranks());
-
-    framework::DDim out_dims = in->dims();
-    out_dims[0] *= nranks;
-    out->mutable_data<T>(out_dims, place);
-
-    int64_t send_numel = in->numel();
-    const T* send_buff = in->data<T>();
-    T* recv_buff = out->data<T>();
-
-    cudaStream_t stream = nullptr;
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather(
-        send_buff, recv_buff, send_numel, static_cast<ncclDataType_t>(dtype),
-        comm->comm(), stream));
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(c_allgather, ops::CAllGatherOpCUDAKernel<float>,
-                        ops::CAllGatherOpCUDAKernel<double>,
-                        ops::CAllGatherOpCUDAKernel<int>,
-                        ops::CAllGatherOpCUDAKernel<int64_t>,
-                        ops::CAllGatherOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h
deleted file mode 100644
index fe99a9e128d1892a093c090f33f065ae2a158056..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allgather_op.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CAllGatherOpCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("unimplemented cpu kernel for CAllGatherOp.");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc
deleted file mode 100644
index bcb529f1570828d2bcca6d4275a213be4d5633eb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-class CAllReduceMaxOpMaker : public CAllReduceOpMaker {
- protected:
-  std::string GetName() const override { return "Max"; }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, ops::CAllReduceOp,
-                             ops::CAllReduceMaxOpMaker);
-
-REGISTER_OP_CPU_KERNEL(c_allreduce_max,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
deleted file mode 100644
index 34054103aa0cf195ed394df860a8219ff3aa0157..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    c_allreduce_max, ops::CAllReduceOpCUDAKernel<ops::kRedMax, float>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMax, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMax, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMax, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMax, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc
deleted file mode 100644
index 9d27a9ceb30704463270b3922c3584febcc05c9e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-class CAllReduceMinOpMaker : public CAllReduceOpMaker {
- protected:
-  std::string GetName() const override { return "Min"; }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min, ops::CAllReduceOp,
-                             ops::CAllReduceMinOpMaker);
-
-REGISTER_OP_CPU_KERNEL(c_allreduce_min,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
deleted file mode 100644
index 4e8b6f9d0a937ec4a5e7205bbd9ae834a2d305db..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    c_allreduce_min, ops::CAllReduceOpCUDAKernel<ops::kRedMin, float>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMin, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMin, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMin, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMin, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
deleted file mode 100644
index 02f6210ca4c5fcf2dd53aed23db586aed597df43..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd };
-
-class CAllReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-template <ReduceType red_type, typename T>
-class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("CAllReduce op do not support CPUKernel for now.");
-  }
-};
-
-template <ReduceType red_type, typename T>
-class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto in = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-
-    auto place = ctx.GetPlace();
-    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
-    int64_t numel = in->numel();
-    const void* sendbuff = in->data<void>();
-    out->Resize(in->dims());
-    void* recvbuff = out->mutable_data<T>(place);
-
-    int rid = ctx.Attr<int>("ring_id");
-    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
-
-    cudaStream_t stream = nullptr;
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    ncclRedOp_t nccl_red_type = ncclSum;
-    switch (red_type) {
-      case kRedSum:
-        nccl_red_type = ncclSum;
-        break;
-
-      case kRedMax:
-        nccl_red_type = ncclMax;
-        break;
-
-      case kRedMin:
-        nccl_red_type = ncclMin;
-        break;
-
-      case kRedProd:
-        nccl_red_type = ncclProd;
-        break;
-
-      default:
-        PADDLE_THROW("Invalid reduce type: %d", red_type);
-    }
-
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
-        sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor), tensor to be allreduced.");
-    AddOutput("Out", "(Tensor) the allreduced result.");
-    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
-        .SetDefault(0);
-    AddAttr<bool>(
-        "use_calc_stream",
-        "(bool default false) eject CUDA operations to calculation stream.")
-        .SetDefault(false);
-    AddComment(string::Sprintf(R"DOC(
-CAllReduce %s Operator
-
-Call collective AllReduce with reduce type %s. If input and output are
-the same variable, in-place allreduce will be used.
-Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allreduce
-)DOC",
-                               GetName(), GetName()));
-  }
-
- protected:
-  virtual std::string GetName() const = 0;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
deleted file mode 100644
index 3cfb1723f18d326b33ed6c332f595010c5241f9a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-class CAllReduceProdOpMaker : public CAllReduceOpMaker {
- protected:
-  std::string GetName() const override { return "Prod"; }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, ops::CAllReduceOp,
-                             ops::CAllReduceProdOpMaker);
-
-REGISTER_OP_CPU_KERNEL(c_allreduce_prod,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
deleted file mode 100644
index 61f76c178d03afad6eddf6848badb5894b51dcd8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    c_allreduce_prod, ops::CAllReduceOpCUDAKernel<ops::kRedProd, float>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedProd, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedProd, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedProd, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedProd, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
deleted file mode 100644
index c80c585a83261f4b039289dd3419f5c7605d8767..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-class CAllReduceSumOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
-    retv->SetType("c_allreduce_sum");
-    retv->SetInput("X", OutputGrad("Out"));
-    retv->SetOutput("Out", InputGrad("X"));
-    retv->SetAttrMap(Attrs());
-    return retv;
-  }
-};
-
-class CAllReduceSumOpMaker : public CAllReduceOpMaker {
- protected:
-  std::string GetName() const override { return "Sum"; }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OPERATOR(c_allreduce_sum, ops::CAllReduceOp,
-                  ops::CAllReduceSumOpGradMaker, ops::CAllReduceSumOpMaker);
-
-REGISTER_OP_CPU_KERNEL(c_allreduce_sum,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
deleted file mode 100644
index 8fe7fce21e465af8af4d045c29dbc12ab9bc3c84..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    c_allreduce_sum, ops::CAllReduceOpCUDAKernel<ops::kRedSum, float>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc
deleted file mode 100644
index 72d330306cc9df2836f27309d4f5617dacced34f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_broadcast_op.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-
-namespace paddle {
-namespace operators {
-
-class CBroadcastOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor) tensor to be broadcasted.");
-    AddOutput("Out", "(Tensor) the result of broadcast.");
-    AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
-        .SetDefault(0);
-    AddAttr<int>("root", "(int default 0) root id for broadcasting.")
-        .SetDefault(0);
-    AddAttr<bool>(
-        "use_calc_stream",
-        "(bool default false) eject CUDA operations to calculation stream.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-CBroadcast Operator
-
-Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#broadcast
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_WITHOUT_GRADIENT(c_broadcast, ops::CBroadcastOp,
-                             ops::CBroadcastOpMaker);
-
-REGISTER_OP_CPU_KERNEL(c_broadcast, ops::CBroadcastOpCPUKernel<float>,
-                       ops::CBroadcastOpCPUKernel<double>,
-                       ops::CBroadcastOpCPUKernel<int>,
-                       ops::CBroadcastOpCPUKernel<int64_t>,
-                       ops::CBroadcastOpCPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
deleted file mode 100644
index a4433d0b3d1214808e42d6bb697ab6ff4b6ca149..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto x = ctx.Input<framework::LoDTensor>("X");
-    auto out = ctx.Output<framework::LoDTensor>("Out");
-    int numel = x->numel();
-    ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
-
-    int rid = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
-    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
-
-    cudaStream_t stream = nullptr;
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    int root = ctx.Attr<int>("root");
-    if (root == comm->rank()) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
-          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype,
-          root, comm->comm(), stream));
-      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
-              << x->numel();
-
-      if (out != x) {
-        framework::TensorCopy(
-            *static_cast<const framework::Tensor*>(x), place,
-            *platform::DeviceContextPool::Instance().Get(place),
-            static_cast<framework::Tensor*>(out));
-      }
-    } else {
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::ncclBcast(out->mutable_data<T>(place), numel,
-                                       dtype, root, comm->comm(), stream));
-      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
-              << framework::product(out->dims());
-    }
-
-    out->Resize(x->dims());
-    out->set_lod(x->lod());
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(c_broadcast, ops::CBroadcastOpCUDAKernel<float>,
-                        ops::CBroadcastOpCUDAKernel<double>,
-                        ops::CBroadcastOpCUDAKernel<int>,
-                        ops::CBroadcastOpCUDAKernel<int64_t>,
-                        ops::CBroadcastOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h
deleted file mode 100644
index 4ceb0aa835fe116cdc14444dfb7ea6046f33c482..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_broadcast_op.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CBroadcastOpCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("Unimplemented cpu kernel for CBroadcastOp.");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
deleted file mode 100644
index 758affbd438af0261727162685def40fa277bad4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include <nccl.h>
-#endif
-#include <stdint.h>
-#include <ostream>
-#include <string>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class CCommInitAllInferShape : public framework::InferShapeBase {
- public:
-  ~CCommInitAllInferShape() {}
-  void operator()(framework::InferShapeContext* ctx) const override{};
-};
-
-class CCommInitAllOp : public framework::OperatorBase {
- public:
-  CCommInitAllOp(const std::string& type,
-                 const framework::VariableNameMap& inputs,
-                 const framework::VariableNameMap& outputs,
-                 const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
-                      "CCommInitAllOp can run on gpu place only.");
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    std::vector<int> devices = Attr<std::vector<int>>("devices");
-    if (devices.empty()) {
-      devices = platform::GetSelectedDevices();
-    }
-
-    int rid = Attr<int>("ring_id");
-
-    platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices, rid);
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-class CCommInitAllOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddComment(R"DOC(
-CCommInitAll operator
-
-Initialize all collective communicatoin context
-)DOC");
-    AddAttr<std::vector<int>>(
-        "devices",
-        "(std::vector<int>) which devices does the nccl comm initialized on")
-        .SetDefault({});
-    AddAttr<int>("ring_id", "(int default 0) user specified ring id")
-        .SetDefault(0);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(c_comm_init_all, ops::CCommInitAllOp,
-                  ops::CCommInitAllInferShape, ops::CCommInitAllOpMaker);
diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc
deleted file mode 100644
index 16ca6e5238e43c34cb45b6be4f8e310537dd4a88..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include <nccl.h>
-#endif
-#include <stdint.h>
-#include <ostream>
-#include <string>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class CCommInitOp : public framework::OperatorBase {
- public:
-  CCommInitOp(const std::string& type, const framework::VariableNameMap& inputs,
-              const framework::VariableNameMap& outputs,
-              const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    PADDLE_ENFORCE(is_gpu_place(place),
-                   "CCommInitOp can run on gpu place only.");
-
-    auto var = scope.FindVar(Input("X"));
-    PADDLE_ENFORCE_NOT_NULL(var);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    ncclUniqueId* nccl_id = var->GetMutable<ncclUniqueId>();
-
-    int nranks = Attr<int>("nranks");
-    int rank_id = Attr<int>("rank");
-    int rid = Attr<int>("ring_id");
-
-    platform::NCCLCommContext::Instance().CreateNCCLComm(
-        nccl_id, nranks, rank_id, boost::get<platform::CUDAPlace>(place).device,
-        rid);
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-class CCommInitOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Raw variable contains a NCCL UniqueId instaces.");
-    AddComment(R"DOC(
-CCommInit operator
-
-Initialize collective communicatoin context within this trainer
-)DOC");
-    AddAttr<int>("nranks", "(int) The number of ranks of distributed trainers");
-    AddAttr<int>("rank",
-                 "(int) The rank of the trainer in distributed training.");
-    AddAttr<int>("ring_id", "(int default 0) user specified ring id")
-        .SetDefault(0);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(c_comm_init, ops::CCommInitOp, ops::CCommInitOpMaker);
diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
deleted file mode 100644
index d576ca7d6a3b1f1f1cd74003b17e39a131ae5643..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include <nccl.h>
-#endif
-
-#include <stdint.h>
-#include <ostream>
-#include <string>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class CGenNCCLIdOp : public framework::OperatorBase {
- public:
-  CGenNCCLIdOp(const std::string& type,
-               const framework::VariableNameMap& inputs,
-               const framework::VariableNameMap& outputs,
-               const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    // put nccl id in CPUPlace
-    auto& dev_ctx = *pool.Get(platform::CPUPlace());
-    int rank = Attr<int>("rank");
-    framework::Scope& local_scope = scope.NewScope();
-
-    if (rank == 0) {
-      GenerateAndSend(&local_scope, dev_ctx);
-    } else {
-      GetIdByServer(&local_scope, dev_ctx);
-    }
-    scope.DeleteScope(&local_scope);
-  }
-
- private:
-  void GenerateAndSend(framework::Scope* scope,
-                       const platform::DeviceContext& dev_ctx) const {
-    std::string var_name = Output("Out");
-    auto var = scope->FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var);
-    auto id = var->GetMutable<ncclUniqueId>();
-    PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(id));
-
-    std::vector<std::string> endpoint_list =
-        Attr<std::vector<std::string>>("other_endpoints");
-    distributed::RPCClient* client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
-
-    for (auto& ep : endpoint_list) {
-      VLOG(3) << "sending nccl id to " << ep;
-      client->AsyncSendVar(ep, dev_ctx, *scope, var_name);
-    }
-    client->Wait();
-    for (auto& ep : endpoint_list) {
-      client->AsyncSendBatchBarrier(ep);
-    }
-    client->Wait();
-    VLOG(3) << "sending completed...";
-  }
-
-  void GetIdByServer(framework::Scope* scope,
-                     const platform::DeviceContext& dev_ctx) const {
-    std::string endpoint = Attr<std::string>("endpoint");
-    // NOTE: Can not use unique_ptr here because the default
-    // deleter will call GRPC Server's base class's dtor and
-    // that will cause a wired crash.
-    distributed::RequestSendHandler rpc_h(true);
-    std::unique_ptr<distributed::RPCServer> rpc_service(
-        new RPCSERVER_T(endpoint, 1));
-
-    rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
-    rpc_h.SetRPCServer(rpc_service.get());
-
-    framework::ProgramDesc empty_program;
-    framework::Executor executor(dev_ctx.GetPlace());
-    rpc_h.SetScope(scope);
-    rpc_h.SetDevCtx(&dev_ctx);
-    rpc_h.SetProgram(&empty_program);
-    rpc_h.SetExecutor(&executor);
-
-    std::thread server_thread(
-        std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
-
-    rpc_service->SetCond(distributed::kRequestSend);
-    VLOG(3) << "start getting nccl id from trainer 0...";
-    rpc_service->WaitBarrier(distributed::kRequestSend);
-    VLOG(3) << "got nccl id and stop server...";
-    rpc_service->ShutDown();
-    VLOG(3) << "rpc server stopped";
-    server_thread.join();
-  }
-};
-
-class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput("Out", "Raw variable contains a NCCL UniqueId instaces.");
-    AddComment(R"DOC(
-CGenNCCLId operator
-
-For trainer 0: generate a new UniqueId and send it to all the other trainers.
-For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
-)DOC");
-    AddAttr<std::string>("endpoint",
-                         "(string), e.g. 127.0.0.1:6175 "
-                         "current listen endpoint");
-    AddAttr<std::vector<std::string>>(
-        "other_endpoints",
-        "['trainer1_ip:port', 'trainer2_ip:port', ...] "
-        "list of other trainer endpoints")
-        .SetDefault({});
-    AddAttr<int>("rank",
-                 "(int default 0) "
-                 "The rank of the trainer in distributed training.")
-        .SetDefault(0);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(c_gen_nccl_id, ops::CGenNCCLIdOp, ops::CGenNCCLIdOpMaker);
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc
deleted file mode 100644
index 1194ac71b32b4e61adf299cfbda0949d4823da90..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_reducescatter_op.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class CReduceScatterOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
-    int nranks = ctx->Attrs().Get<int>("nranks");
-    framework::DDim dim = ctx->GetInputDim("X");
-    if (dim[0] > 0 || dim[0] < -1) {
-      PADDLE_ENFORCE(dim[0] % nranks == 0,
-                     "dim[0] (%d) is not divisible by nranks(%d)", dim[0],
-                     nranks);
-      dim[0] /= nranks;
-    }
-    ctx->SetOutputDim("Out", dim);
-  }
-};
-
-class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor) tensor to be allgather");
-    AddOutput("Out", "(Tensor) the allgather result");
-    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
-        .SetDefault(0);
-    AddAttr<int>("nranks",
-                 "Total trainer count of the distributed training job")
-        .SetDefault(1);
-    AddAttr<bool>(
-        "use_calc_stream",
-        "(bool default false) eject CUDA operations to calculation stream.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-CReduceScatter Operator
-
-Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#reducescatter
-)DOC");
-  }
-};
-
-class CReduceScatterOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
-    retv->SetType("c_allgather");
-    retv->SetInput("X", OutputGrad("Out"));
-    retv->SetOutput("Out", InputGrad("X"));
-    retv->SetAttrMap(Attrs());
-    return retv;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OPERATOR(c_reducescatter, ops::CReduceScatterOp,
-                  ops::CReduceScatterOpMaker);
-
-REGISTER_OP_CPU_KERNEL(c_reducescatter, ops::CReduceScatterOpCPUKernel<float>,
-                       ops::CReduceScatterOpCPUKernel<double>,
-                       ops::CReduceScatterOpCPUKernel<int>,
-                       ops::CReduceScatterOpCPUKernel<int64_t>,
-                       ops::CReduceScatterOpCPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
deleted file mode 100644
index da92b65aa9ed2c90cefaf61a785566c4609935da..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto in = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-
-    int rid = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
-    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
-    int nranks = comm->nranks();
-
-    auto out_dims = in->dims();
-    out_dims[0] = out_dims[0] / nranks;
-    out->mutable_data<T>(out_dims, place);
-
-    int64_t recv_numel = in->numel() / nranks;
-    const T* send_buff = in->data<T>();
-    T* recv_buff = out->data<T>();
-    int dtype = platform::ToNCCLDataType(in->type());
-
-    cudaStream_t stream = nullptr;
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduceScatter(
-        send_buff, recv_buff, recv_numel, static_cast<ncclDataType_t>(dtype),
-        ncclSum, comm->comm(), stream));
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel<float>,
-                        ops::CReduceScatterOpCUDAKernel<double>,
-                        ops::CReduceScatterOpCUDAKernel<int>,
-                        ops::CReduceScatterOpCUDAKernel<int64_t>,
-                        ops::CReduceScatterOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h
deleted file mode 100644
index ee30808067704ed74eff089a1bf94e55902a26db..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_reducescatter_op.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CReduceScatterOpCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("Unimplemented cpu kernel for CReduceScatterOp.");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
deleted file mode 100644
index fe74fc597732d7fe1034ad95cc7f8f8e8109f302..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include <nccl.h>
-#endif
-
-#include <string>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class CSyncCalcStreamOp : public framework::OperatorBase {
- public:
-  CSyncCalcStreamOp(const std::string& type,
-                    const framework::VariableNameMap& inputs,
-                    const framework::VariableNameMap& outputs,
-                    const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    PADDLE_ENFORCE(is_gpu_place(place),
-                   "Sync stream op can run on gpu place only for now.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto dev_ctx = static_cast<platform::CUDADeviceContext*>(
-        platform::DeviceContextPool::Instance().Get(place));
-    cudaError_t e_sync = cudaStreamSynchronize(dev_ctx->stream());
-    if (e_sync != 0) {
-      LOG(FATAL) << "Fail to sync cuda stream: " << cudaGetErrorString(e_sync);
-    }
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-class CSyncCalcStreamOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor) Dependency of the variable need to sync");
-    AddOutput("Out", "(Tensor) Dependency of the variable need to sync");
-    AddComment(R"DOC(
-CSyncCalcStream Operator
-
-Call calculation stream synchronization.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(c_sync_calc_stream, ops::CSyncCalcStreamOp,
-                  ops::CSyncCalcStreamOpMaker);
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
deleted file mode 100644
index 320c85070385de24461e2121af3d7cfa2c8a6f36..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include <nccl.h>
-#endif
-
-#include <string>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class CSyncCommStreamOp : public framework::OperatorBase {
- public:
-  CSyncCommStreamOp(const std::string& type,
-                    const framework::VariableNameMap& inputs,
-                    const framework::VariableNameMap& outputs,
-                    const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
-                      "Sync stream op can run on gpu place only for now.");
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    int ring_id = Attr<int>("ring_id");
-    auto stream =
-        platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
-    cudaError_t e_sync = cudaStreamSynchronize(stream);
-    if (e_sync != 0) {
-      LOG(FATAL) << "Fail to sync nccl stream: " << cudaGetErrorString(e_sync);
-    }
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor) Dependency of the variable need to sync");
-    AddOutput("Out", "(Tensor) Dependency of the variable need to sync");
-    AddAttr<int>("ring_id", "(int default 0) ring id.").SetDefault(0);
-    AddComment(R"DOC(
-CSyncCommStream Operator
-
-Call communication stream synchronization.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(c_sync_comm_stream, ops::CSyncCommStreamOp,
-                  ops::CSyncCommStreamOpMaker);
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
deleted file mode 100644
index e52d280836e0236bea8ce871116da5d14f4d6bd6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/concat_op.cc
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/concat_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-#ifdef PADDLE_WITH_MKLDNN
-#include <paddle/fluid/platform/mkldnn_helper.h>
-#endif
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-
-class ConcatOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                      "Inputs(X) of ConcatOp should not be empty.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ConcatOp should not be null.");
-
-    auto ins = ctx->GetInputsDim("X");
-    size_t axis =
-        ComputeAxis(static_cast<int64_t>(ctx->Attrs().Get<int>("axis")),
-                    static_cast<int64_t>(ins[0].size()));
-
-    const size_t n = ins.size();
-
-    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
-    if (n == 1) {
-      VLOG(3) << "Warning: concat op have only one input, may waste memory";
-    }
-
-    auto out_dims = ins[0];
-    size_t in_zero_dims_size = out_dims.size();
-    for (size_t i = 1; i < n; i++) {
-      for (size_t j = 0; j < in_zero_dims_size; j++) {
-        if (j == axis) {
-          if (ctx->IsRuntime()) {
-            out_dims[axis] += ins[i][j];
-          } else {
-            if (ins[i][j] == -1) {
-              out_dims[axis] = -1;
-            } else {
-              out_dims[axis] += ins[i][j];
-            }
-          }
-        } else {
-          bool check_shape =
-              ctx->IsRuntime() || (out_dims[j] > 0 && ins[i][j] > 0);
-          if (check_shape) {
-            // check all shape in run time
-            PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
-                              "Input tensors should have the same "
-                              "elements except the specify axis.");
-          }
-        }
-      }
-    }
-    if (out_dims[axis] < 0) {
-      out_dims[axis] = -1;
-    }
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto inputs = ctx.MultiInput<Tensor>("X");
-    auto input_data_type = framework::proto::VarType::Type(0);
-    bool flag = 0;
-    for (auto *input : inputs) {
-      if (input->IsInitialized() && input->numel() > 0) {
-        input_data_type = input->type();
-        flag = 1;
-        break;
-      }
-    }
-    if (flag == 0) {
-      PADDLE_THROW("All Inputs of Concat OP are Empty!");
-    }
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (platform::CanMKLDNNBeUsed(ctx)) {
-      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                     framework::DataLayout::kMKLDNN,
-                                     framework::LibraryType::kMKLDNN);
-    }
-#endif
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
-class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input tensors of concat operator.").AsDuplicable();
-    AddOutput("Out", "Output tensor of concat operator.");
-    AddAttr<bool>(
-        "use_mkldnn",
-        "(bool, default false) Indicates if MKL-DNN kernel will be used")
-        .SetDefault(false);
-    AddAttr<int>("axis",
-                 "The axis along which the input tensors will be concatenated."
-                 "The axis could also be negative numbers. Negative axis is "
-                 "interpreted as counting from the end of the rank."
-                 "i.e., axis + rank(X) th dimension.")
-        .SetDefault(0);
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Concat Operator.
-
-Concatenate the input tensors along dimension axis.
-Examples:
-  Input[0] = [[1,2],[3,4]]
-  Input[1] = [[5,6]]
-  axis = 0
-  Output = [[1,2],
-            [3,4],
-            [5,6]]
-
-)DOC");
-  }
-};
-
-class ConcatOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    auto in_x = "X";
-    auto out_x_g_n = framework::GradVarName(in_x);
-    ctx->SetOutputsDim(out_x_g_n, ctx->GetInputsDim(in_x));
-    auto &in_names = ctx->Inputs(in_x);
-    auto &out_names = ctx->Outputs(out_x_g_n);
-    PADDLE_ENFORCE_EQ(
-        in_names.size(), out_names.size(),
-        "The number of arguments in %s[%d] and %s[%d] is not equal.", in_x,
-        in_names.size(), out_x_g_n, out_names.size());
-    for (size_t i = 0; i < in_names.size(); ++i) {
-      if (out_names[i] != framework::kEmptyVarName) {
-        ctx->ShareLoD(in_x, out_x_g_n, i, i);
-      }
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ConcatOpGradNoNeedBufferVarInference,
-                                      "X");
-
-class ConcatGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("concat_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
-                  ops::ConcatGradOpDescMaker);
-REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad,
-                  ops::ConcatOpGradNoNeedBufferVarInference);
-REGISTER_OP_CPU_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int>);
-REGISTER_OP_CPU_KERNEL(
-    concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc
deleted file mode 100644
index 334126c4e0b782c98db2fd3c8278b1daf87da6b6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/concat_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ConcatKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int>);
-REGISTER_OP_CUDA_KERNEL(
-    concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
deleted file mode 100644
index 4a371de32354d196492a54dce47bf73bf644bad1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/concat_op.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-
-namespace paddle {
-namespace operators {
-
-static inline int64_t ComputeAxis(int64_t axis, int64_t rank) {
-  if (axis < 0) {
-    axis = axis + rank;
-  }
-  return axis > 0 ? axis : 0;
-}
-
-template <typename DeviceContext, typename T>
-class ConcatKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
-    PADDLE_ENFORCE(ins[0], "The input should not be null.");
-    auto axis = ComputeAxis(static_cast<int64_t>(ctx.Attr<int>("axis")),
-                            static_cast<int64_t>(ins[0]->dims().size()));
-    auto place = ctx.GetPlace();
-    out->mutable_data<T>(place);
-
-    // Sometimes direct copies will be faster, this maybe need deeply analysis.
-    if (axis == 0 && ins.size() < 10) {
-      size_t output_offset = 0;
-      for (auto* in : ins) {
-        if (!in || in->numel() == 0UL) {
-          continue;
-        }
-        auto in_stride = framework::stride_numel(in->dims());
-        auto out_stride = framework::stride_numel(out->dims());
-        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
-                                    out->data<T>() + output_offset, out_stride,
-                                    in->data<T>(), in_stride, in_stride[axis]);
-        output_offset += in_stride[axis];
-      }
-    } else {
-      std::vector<framework::Tensor> inputs;
-      for (size_t j = 0; j < ins.size(); ++j) {
-        if (ins[j] && ins[j]->numel() > 0) {
-          inputs.push_back(*ins[j]);
-        } else {
-          continue;
-        }
-      }
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      paddle::operators::math::ConcatFunctor<DeviceContext, T> concat_functor;
-      concat_functor(dev_ctx, inputs, static_cast<int>(axis), out);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ConcatGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
-    auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
-    auto outs =
-        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
-
-    {
-      auto dx = outs;
-      auto x = ins;
-      for (size_t i = 0; i < dx.size(); ++i) {
-        if (dx[i] != nullptr) {
-          dx[i]->set_lod(x[i]->lod());
-        }
-      }
-    }
-    PADDLE_ENFORCE(ins[0], "The input should not be null.");
-    auto axis = ComputeAxis(static_cast<int64_t>(ctx.Attr<int>("axis")),
-                            static_cast<int64_t>(ins[0]->dims().size()));
-
-    // get output tensor that the name is not kEmptyVarName
-    std::vector<framework::Tensor*> outputs;
-    for (size_t j = 0; j < outs.size(); ++j) {
-      if (out_var_names[j] != framework::kEmptyVarName &&
-          outs[j]->numel() != 0UL) {
-        outs[j]->mutable_data<T>(ctx.GetPlace());
-        outputs.push_back(outs[j]);
-      } else {
-        outputs.push_back(nullptr);
-      }
-    }
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    // Sometimes direct copies will be faster, this maybe need deeply analysis.
-    if (axis == 0 && outs.size() < 10) {
-      std::vector<const framework::Tensor*> ref_shape;
-      ref_shape.insert(ref_shape.begin(), ins.begin(), ins.end());
-      StridedMemcpyWithAxis0<T>(dev_ctx, *out_grad, ref_shape, &outputs);
-    } else {
-      math::SplitFunctor<DeviceContext, T> split_functor;
-      split_functor(dev_ctx, *out_grad, ctx.MultiInput<framework::Tensor>("X"),
-                    static_cast<int>(axis), &outputs);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
deleted file mode 100644
index 758f0a65d13c1d8ec88212ca82199293678f99cb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-include(operators)
-register_operators(DEPS naive_executor)
-cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc)
-cc_library(conditional_block_op_helper SRCS conditional_block_op_helper.cc DEPS operator op_variant conditional_block_op)
-cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op)
-cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator op_variant) 
-
-target_link_libraries(conditional_block_infer_op conditional_block_op) 
-
-file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
deleted file mode 100644
index 5d3f9b43f8c08d356319fa0b9ccaf808811d3d39..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/controlflow/compare_op.h"
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename Functor>
-class CompareOpKernel<platform::CPUDeviceContext, Functor>
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEM_TYPE;
-    using Tensor = framework::Tensor;
-
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Y");
-    auto* z = context.Output<Tensor>("Out");
-    int axis = context.Attr<int>("axis");
-
-    if (x->numel() == 1 && y->numel() == 1) {
-      bool* z_data = z->mutable_data<bool>(context.GetPlace());
-      z_data[0] = Functor()(x->data<T>()[0], y->data<T>()[0]);
-    } else {
-      ElementwiseComputeEx<Functor, platform::CPUDeviceContext, T, bool>(
-          context, x, y, axis, Functor(), z);
-    }
-  }
-};
-
-template <typename OpComment>
-class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    OpComment comment;
-    AddInput("X", string::Sprintf("the left hand operand of %s operator",
-                                  comment.type));
-    AddInput("Y", string::Sprintf("the right hand operand of %s operator",
-                                  comment.type));
-    AddAttr<int>(
-        "axis",
-        "The start dimension index for broadcasting Y onto X. [default -1]")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
-    AddAttr<bool>("force_cpu",
-                  "Force fill output variable to cpu "
-                  "memory. Otherwise, fill output variable to the running "
-                  "device [default true].")
-        .SetDefault(true);
-    AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s",
-                                     comment.equation));
-    AddComment(string::Sprintf(R"DOC(
-It operates element-wise on X and Y, and returns the Out. Each of them is a
-N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
-calculated by $%s$
-)DOC",
-                               comment.equation));
-  }
-};
-
-template <typename OpComment>
-class CompareOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* context) const override {
-    OpComment comment;
-    PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X",
-                   comment.type);
-    PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y",
-                   comment.type);
-    auto dim_x = context->GetInputDim("X");
-    auto dim_y = context->GetInputDim("Y");
-    PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(),
-                      "The size of dim_y should not be greater than dim_x's.");
-
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-    context->ShareLoD("X", "Out");
-  }
-};
-
-class CompareOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
-    // CompareOp kernel's device type is decided by input tensor place
-    bool force_cpu = ctx.Attr<bool>("force_cpu");
-    kt.place_ = force_cpu ? platform::CPUPlace()
-                          : ctx.Input<framework::LoDTensor>("X")->place();
-    return kt;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_COMPARE_OP(op_type, _equation)                      \
-  struct _##op_type##Comment {                                       \
-    static char type[];                                              \
-    static char equation[];                                          \
-  };                                                                 \
-  char _##op_type##Comment::type[]{#op_type};                        \
-  char _##op_type##Comment::equation[]{_equation};                   \
-  REGISTER_OPERATOR(                                                 \
-      op_type, ::paddle::operators::CompareOp,                       \
-      ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
-      ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
-      ::paddle::framework::EmptyGradOpMaker);
-
-REGISTER_COMPARE_OP(less_than, "Out = X < Y");
-REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
-REGISTER_COMPARE_OP(less_equal, "Out = X <= Y");
-REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
-REGISTER_COMPARE_OP(greater_than, "Out = X > Y");
-REGISTER_COMPARE_KERNEL(greater_than, CPU,
-                        paddle::operators::GreaterThanFunctor);
-REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y");
-REGISTER_COMPARE_KERNEL(greater_equal, CPU,
-                        paddle::operators::GreaterEqualFunctor);
-REGISTER_COMPARE_OP(equal, "Out = X == Y");
-REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
-REGISTER_COMPARE_OP(not_equal, "Out = X != Y");
-REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor);
diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu
deleted file mode 100644
index b1f306358359764b919f9e570cf44f9733a7d178..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/compare_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/controlflow/compare_op.h"
-
-REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
-REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
-REGISTER_COMPARE_KERNEL(greater_than, CUDA,
-                        paddle::operators::GreaterThanFunctor);
-REGISTER_COMPARE_KERNEL(greater_equal, CUDA,
-                        paddle::operators::GreaterEqualFunctor);
-REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor);
-REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor);
diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h
deleted file mode 100644
index b7529e4ae632d31524846d9d5aa4b1883f4509a1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/compare_op.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include <type_traits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct LessThanFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
-};
-
-template <typename T>
-struct LessEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; }
-};
-
-template <typename T>
-struct GreaterThanFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; }
-};
-
-template <typename T>
-struct GreaterEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; }
-};
-
-template <typename T>
-struct EqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const {
-    if (std::is_floating_point<T>::value) {
-      // This branch will be optimized while compiling if T is integer. It is
-      // safe to cast a and b to double.
-      return fabs(static_cast<double>(a - b)) < 1e-8;
-    } else {
-      return (a == b);
-    }
-  }
-};
-
-template <typename T>
-struct NotEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const {
-    return !EqualFunctor<T>()(a, b);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class CompareOpKernel
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEM_TYPE;
-    using Tensor = framework::Tensor;
-
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Y");
-    auto* z = context.Output<Tensor>("Out");
-    int axis = context.Attr<int>("axis");
-    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
-                                                          Functor(), z);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_COMPARE_KERNEL(op_type, dev, functor)                    \
-  REGISTER_OP_##dev##_KERNEL(                                             \
-      op_type, ::paddle::operators::CompareOpKernel<                      \
-                   ::paddle::platform::dev##DeviceContext, functor<int>>, \
-      ::paddle::operators::CompareOpKernel<                               \
-          ::paddle::platform::dev##DeviceContext, functor<int64_t>>,      \
-      ::paddle::operators::CompareOpKernel<                               \
-          ::paddle::platform::dev##DeviceContext, functor<float>>,        \
-      ::paddle::operators::CompareOpKernel<                               \
-          ::paddle::platform::dev##DeviceContext, functor<double>>);
diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc
deleted file mode 100644
index 8ad2f79389d9148119b3399789d6671624897cd9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/controlflow/conditional_block_op.h"
-
-namespace paddle {
-namespace operators {
-
-/* We will implement the op with block separately in the future.
- * The main reason is that some of the training requirements
- * in these OPS can lead to problems(such as memory leaks) during inference.
- */
-class ConditionalBlockInferOp : public ConditionalOp {
- public:
-  ConditionalBlockInferOp(const std::string &type,
-                          const framework::VariableNameMap &inputs,
-                          const framework::VariableNameMap &outputs,
-                          const framework::AttributeMap &attrs)
-      : ConditionalOp(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    bool need_run;
-    if (Attr<bool>("is_scalar_condition")) {
-      // When is_scalar_condition is True, the conditional variable is a scalar,
-      // whether need to execute the operators in sub-block depends on the
-      // conditional variable (Cond).
-      auto xs = InputTensors(scope, "Cond");
-      need_run = ScalarCondition(xs);
-    } else {
-      // When is_scalar_condition is False, the conditional variable maybe a
-      // vector or tensor, whether need to execute the operators in sub-block
-      // depends on the input variables (Input).
-      auto xs = InputTensors(scope, "Input");
-      need_run = std::all_of(
-          xs.begin(), xs.end(),
-          [](const framework::LoDTensor *t) { return t->numel() != 0; });
-    }
-
-    if (need_run) {
-      auto *scope_var = scope.FindVar(Output("Scope"));
-      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
-      auto *scopes = scope_var->GetMutable<std::vector<framework::Scope *>>();
-      scopes->resize(1);
-      scopes->front() = &scope.NewScope();
-      auto &cur_scope = *scopes->front();
-
-      framework::Executor exec(dev_place);
-      auto *block = Attr<framework::BlockDesc *>("sub_block");
-      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
-      scope.DeleteScope(scopes->front());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(conditional_block_infer, ops::ConditionalBlockInferOp,
-                  ops::ConditionalBlockOpProtoMaker,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc
deleted file mode 100644
index 260b5672b4f06ab37b9ac0d7fe40e5fb69beb96f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/conditional_block_op.cc
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/controlflow/conditional_block_op.h"
-
-namespace paddle {
-namespace operators {
-
-const char ConditionalOp::kInputs[] = "Input";
-const char ConditionalOp::kOutputs[] = "Out";
-const char ConditionalOp::kCondition[] = "Cond";
-const char ConditionalOp::kScope[] = "Scope";
-const char ConditionalOp::kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
-
-class ConditionalBlockOp : public ConditionalOp {
- public:
-  ConditionalBlockOp(const std::string &type,
-                     const framework::VariableNameMap &inputs,
-                     const framework::VariableNameMap &outputs,
-                     const framework::AttributeMap &attrs)
-      : ConditionalOp(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    bool need_run;
-    if (Attr<bool>("is_scalar_condition")) {
-      // When is_scalar_condition is True, the conditional variable is a scalar,
-      // whether need to execute the operators in sub-block depends on the
-      // conditional variable (Cond).
-      auto xs = InputTensors(scope, ConditionalOp::kCondition);
-      need_run = ScalarCondition(xs);
-    } else {
-      // When is_scalar_condition is False, the conditional variable maybe a
-      // vector or tensor, whether need to execute the operators in sub-block
-      // depends on the input variables (Input).
-      auto xs = InputTensors(scope, ConditionalOp::kInputs);
-      need_run = std::all_of(
-          xs.begin(), xs.end(),
-          [](const framework::LoDTensor *t) { return t->numel() != 0; });
-    }
-
-    if (need_run) {
-      auto *scope_var = scope.FindVar(Output(ConditionalOp::kScope));
-      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
-      auto *scopes = scope_var->GetMutable<std::vector<framework::Scope *>>();
-      scopes->resize(1);
-      scopes->front() = &scope.NewScope();
-      auto &cur_scope = *scopes->front();
-
-      framework::Executor exec(dev_place);
-      auto *block = Attr<framework::BlockDesc *>("sub_block");
-      auto &skip_vars =
-          Attr<std::vector<std::string>>(ConditionalOp::kSkipEagerDeletionVars);
-      exec.Run(*block->Program(), &cur_scope, block->ID(), false, true,
-               skip_vars);
-    }
-  }
-};
-
-class ConditionalBlockGradOp : public ConditionalOp {
- public:
-  ConditionalBlockGradOp(const std::string &type,
-                         const framework::VariableNameMap &inputs,
-                         const framework::VariableNameMap &outputs,
-                         const framework::AttributeMap &attrs)
-      : ConditionalOp(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    bool need_run;
-    if (Attr<bool>("is_scalar_condition")) {
-      auto xs = this->InputTensors(scope, ConditionalOp::kCondition);
-      need_run = ScalarCondition(xs);
-    } else {
-      auto xs = this->InputTensors(scope, ConditionalOp::kInputs);
-      need_run = std::all_of(
-          xs.begin(), xs.end(),
-          [](const framework::LoDTensor *t) { return t->numel() != 0; });
-    }
-
-    if (need_run) {
-      auto *scope_var = scope.FindVar(Input(ConditionalOp::kScope));
-      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
-      auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
-      framework::Scope &cur_scope = *scopes[0];
-
-      framework::Executor exec(dev_place);
-      auto *block = Attr<framework::BlockDesc *>("sub_block");
-
-      const auto &ins = Inputs(ConditionalOp::kInputs);
-      const auto &d_ins =
-          Outputs(framework::GradVarName(ConditionalOp::kInputs));
-      const auto &conds = Inputs(ConditionalOp::kCondition);
-      const auto &d_conds =
-          Outputs(framework::GradVarName(ConditionalOp::kCondition));
-
-      std::vector<std::string> ins_conds_grads;
-      ins_conds_grads.reserve(ins.size() + conds.size());
-      for (auto &in : ins) {
-        ins_conds_grads.emplace_back(framework::GradVarName(in));
-      }
-      for (auto &cond : conds) {
-        ins_conds_grads.emplace_back(framework::GradVarName(cond));
-      }
-
-      exec.Run(*block->Program(), &cur_scope, block->ID(), false, true,
-               ins_conds_grads);
-
-      AssignLocalGradientToGlobal(dev_place, cur_scope, ins_conds_grads.data(),
-                                  ins.size(), d_ins);
-
-      AssignLocalGradientToGlobal(dev_place, cur_scope,
-                                  ins_conds_grads.data() + ins.size(),
-                                  conds.size(), d_conds);
-    }
-  }
-
- private:
-  void AssignLocalGradientToGlobal(
-      const platform::Place &place, const framework::Scope &cur_scope,
-      const std::string *p_grad_names, size_t p_grad_names_num,
-      const std::vector<std::string> &pg_names) const {
-    for (size_t i = 0; i < p_grad_names_num; ++i) {
-      auto out_grad_name = pg_names[i];
-      const auto &in_grad_name = p_grad_names[i];
-      auto *in_var = cur_scope.FindVar(in_grad_name);
-      if (in_var == nullptr) {
-        continue;
-      }
-      auto new_in_grad_name = cur_scope.Rename(in_grad_name);
-      auto assign = framework::OpRegistry::CreateOp(
-          "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}},
-          framework::AttributeMap{});
-      assign->Run(cur_scope, place);
-      cur_scope.Rename(new_in_grad_name, in_grad_name);
-    }
-  }
-};
-
-class ConditionalBlockGradInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInputs(ConditionalOp::kCondition));
-    if (context->HasInputs(ConditionalOp::kInputs)) {
-      PADDLE_ENFORCE(
-          context->HasOutputs(framework::GradVarName(ConditionalOp::kInputs)));
-      context->SetOutputsDim(framework::GradVarName(ConditionalOp::kInputs),
-                             context->GetInputsDim(ConditionalOp::kInputs));
-    }
-    if (context->HasOutputs(
-            framework::GradVarName(ConditionalOp::kCondition))) {
-      context->SetOutputsDim(framework::GradVarName(ConditionalOp::kCondition),
-                             context->GetInputsDim(ConditionalOp::kCondition));
-    }
-  }
-};
-
-class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto grad_op = new framework::OpDesc();
-    grad_op->SetType("conditional_block_grad");
-    grad_op->SetInput(ConditionalOp::kCondition,
-                      Input(ConditionalOp::kCondition));
-    grad_op->SetInput(ConditionalOp::kInputs, Input(ConditionalOp::kInputs));
-    grad_op->SetInput(ConditionalOp::kOutputs, Output(ConditionalOp::kOutputs));
-    grad_op->SetInput(framework::GradVarName(ConditionalOp::kOutputs),
-                      OutputGrad(ConditionalOp::kOutputs));
-    grad_op->SetInput(ConditionalOp::kScope, Output(ConditionalOp::kScope));
-    grad_op->SetOutput(framework::GradVarName(ConditionalOp::kCondition),
-                       InputGrad(ConditionalOp::kCondition, false));
-    grad_op->SetOutput(framework::GradVarName(ConditionalOp::kInputs),
-                       InputGrad(ConditionalOp::kInputs, false));
-    grad_op->SetBlockAttr("sub_block", this->grad_block_[0]);
-    grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition"));
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp,
-                  ops::ConditionalBlockOpProtoMaker,
-                  ops::ConditionalBlockGradMaker);
-REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp,
-                  ops::ConditionalBlockGradInferShape);
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h
deleted file mode 100644
index 9d65c33c51c1226b2518225c3e8efdc5b349238b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/conditional_block_op.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-
-namespace paddle {
-namespace operators {
-
-class ConditionalOp : public framework::OperatorBase {
- public:
-  ConditionalOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  static const char kInputs[];
-  static const char kOutputs[];
-  static const char kCondition[];
-  static const char kScope[];
-  static const char kSkipEagerDeletionVars[];
-
- protected:
-  std::vector<const framework::LoDTensor *> InputTensors(
-      const framework::Scope &scope, const std::string &in_name) const {
-    std::vector<const framework::LoDTensor *> retv;
-    auto xs = Inputs(in_name);
-    retv.resize(xs.size(), nullptr);
-    std::transform(
-        xs.begin(), xs.end(), retv.begin(),
-        [&scope](const std::string &var_name) -> const framework::LoDTensor * {
-          auto *var = scope.FindVar(var_name);
-          PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name);
-          return &var->Get<framework::LoDTensor>();
-        });
-    return retv;
-  }
-
-  bool ScalarCondition(
-      const std::vector<const framework::LoDTensor *> &ips) const {
-    if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
-      PADDLE_THROW("should have one initialized input as condition");
-    }
-
-    PADDLE_ENFORCE(ips[0]->type() == framework::proto::VarType::BOOL &&
-                       ips[0]->numel() == 1,
-                   "condition input's data type should be bool, "
-                   "numel should be 1, actual numel is %d",
-                   ips[0]->numel());
-    bool res = false;
-    if (platform::is_gpu_place(ips[0]->place())) {
-#ifdef PADDLE_WITH_CUDA
-      framework::LoDTensor cpu_tensor;
-      framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
-      platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
-      res = cpu_tensor.data<bool>()[0];
-#endif
-    } else {
-      res = ips[0]->data<bool>()[0];
-    }
-    return res;
-  }
-};
-
-class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(ConditionalOp::kCondition,
-             "The conditional variable of this operator. If Cond is empty, the "
-             "whole sub-block will not be executed.")
-        .AsDuplicable();
-    AddInput(ConditionalOp::kInputs, "The input variables of the sub-block.")
-        .AsDuplicable();
-    AddOutput(ConditionalOp::kOutputs, "The output variables of the sub-block.")
-        .AsDuplicable();
-    AddOutput(ConditionalOp::kScope,
-              "(std::vector<Scope*>) The step scope of conditional block. To "
-              "unify the conditional block, rnn and while op, the type of "
-              "scope is std::vector<Scope*>");
-    AddAttr<framework::BlockDesc *>(
-        "sub_block", "The step block of conditional block operator");
-    AddAttr<bool>("is_scalar_condition",
-                  "The conditional variable (Cond) is used as scalar "
-                  "condition.")
-        .SetDefault(false);
-    AddAttr<std::vector<std::string>>(ConditionalOp::kSkipEagerDeletionVars,
-                                      "Vars that would not be deleted when "
-                                      "garbage collection strategy enables")
-        .SetDefault(std::vector<std::string>());
-    AddComment(R"DOC(Conditional block operator
-
-If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar,
-run the operators in sub-block if Cond is True.
-
-If `is_scalar_condition` is False, the conditional variable (Cond) is a vector or
-tensor, run the operators in sub-block if all of input variables are not empty.
-
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc
deleted file mode 100644
index 13a00c852a27da2b75056ffbcdc0873ee553e2a8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/operators/controlflow/op_variant.h"
-
-namespace paddle {
-namespace operators {
-
-static bool IsMatchedConditionalBlockOpAndConditionalBlockGradOp(
-    const OpVariant &fwd_op, const OpVariant &bwd_op) {
-  return fwd_op.Outputs().at(ConditionalOp::kScope) ==
-         bwd_op.Inputs().at(ConditionalOp::kScope);
-}
-
-static void FindAllConditionalBlockAndConditionalBlockGradOp(
-    const framework::ProgramDesc &program, std::vector<OpVariant> *fwd_ops,
-    std::vector<OpVariant> *bwd_ops) {
-  PADDLE_ENFORCE_GE(fwd_ops->size(), bwd_ops->size());
-
-  for (size_t i = 1; i < program.Size(); ++i) {
-    auto &block = program.Block(i);
-    for (size_t j = 0; j < block.OpSize(); ++j) {
-      auto *op = block.Op(j);
-      if (op->Type() == "conditional_block") {
-        fwd_ops->emplace_back(op);
-      } else if (op->Type() == "conditional_block_grad") {
-        bwd_ops->emplace_back(op);
-      }
-    }
-  }
-
-  PADDLE_ENFORCE_GE(
-      fwd_ops->size(), bwd_ops->size(),
-      "There are extra conditional_block_grad ops in the graph or program");
-}
-
-static void SetSkipVarsForConditionalBlockOp(OpVariant *fwd_op,
-                                             OpVariant *bwd_op) {
-  auto *grad_block = bwd_op->Attr<framework::BlockDesc *>("sub_block");
-  auto is_skippable_in_fwd = [grad_block](const std::string &var_name) {
-    return var_name != framework::kEmptyVarName &&
-           !grad_block->HasVar(var_name);
-  };
-
-  std::unordered_set<std::string> forward_skip_vars;
-  for (auto *op_desc : grad_block->AllOps()) {
-    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
-      if (is_skippable_in_fwd(in_arg_name)) {
-        forward_skip_vars.insert(in_arg_name);
-      }
-    }
-
-    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
-      if (is_skippable_in_fwd(out_arg_name)) {
-        forward_skip_vars.insert(out_arg_name);
-      }
-    }
-  }
-
-  auto &fwd_attrs = const_cast<framework::AttributeMap &>(fwd_op->Attrs());
-  std::vector<std::string> skip_vars_vec(forward_skip_vars.begin(),
-                                         forward_skip_vars.end());
-  VLOG(2) << "Prepare to skip " << skip_vars_vec.size()
-          << " var(s): " << string::join_strings(skip_vars_vec, ' ');
-  fwd_attrs[ConditionalOp::kSkipEagerDeletionVars] = std::move(skip_vars_vec);
-}
-
-static void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl(
-    const framework::ProgramDesc &program, std::vector<OpVariant> *ifelse_ops,
-    std::vector<OpVariant> *ifelse_grad_ops) {
-  FindAllConditionalBlockAndConditionalBlockGradOp(program, ifelse_ops,
-                                                   ifelse_grad_ops);
-
-  VLOG(2) << "Found conditional_block op num: " << ifelse_ops->size()
-          << ", conditional_block_grad op num: " << ifelse_grad_ops->size();
-
-  if (ifelse_grad_ops->empty()) {
-    return;
-  }
-
-  std::unordered_set<OpVariant, OpVariant::Hasher> ifelse_op_set(
-      ifelse_ops->begin(), ifelse_ops->end());
-
-  for (auto &bwd_op : *ifelse_grad_ops) {
-    const OpVariant *matched_fwd_op = nullptr;
-    for (auto &fwd_op : ifelse_op_set) {
-      if (IsMatchedConditionalBlockOpAndConditionalBlockGradOp(fwd_op,
-                                                               bwd_op)) {
-        PADDLE_ENFORCE(matched_fwd_op == nullptr,
-                       "Found multiple matched conditional_block ops");
-        matched_fwd_op = &fwd_op;
-      }
-    }
-
-    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op,
-                            "Cannot find matched forward conditional_block op");
-
-    SetSkipVarsForConditionalBlockOp(const_cast<OpVariant *>(matched_fwd_op),
-                                     &bwd_op);
-    ifelse_op_set.erase(*matched_fwd_op);
-  }
-}
-
-void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-    const framework::ProgramDesc &program, int block_id,
-    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) {
-  // If block_id is not 0, returns
-  // This is because all conditional_block_ops and conditional_block_grad_ops
-  // in the whole program would be processed when block_id is 0 (i.e.
-  // when Executor::Run() or ParallelExecutor constructs).
-
-  // What's more, all conditional_block_ops and conditional_block_grad_ops
-  // must be processed when block_id is zero. If not, conditional_block_op
-  // may run first and erase variables used in conditional_block_grad_op,
-  // and in this moment, conditional_block_grad_ops may be not constructed yet.
-  if (block_id != 0) return;
-
-  std::vector<OpVariant> fwd_ops, bwd_ops;
-  for (auto &op : all_ops) {
-    if (op->Type() == "conditional_block") {
-      fwd_ops.emplace_back(op.get());
-    } else if (op->Type() == "conditional_block_grad") {
-      bwd_ops.emplace_back(op.get());
-    }
-  }
-
-  PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl(
-      program, &fwd_ops, &bwd_ops);
-}
-
-void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-    const framework::ProgramDesc &program,
-    const std::vector<framework::OperatorBase *> &ifelse_ops,
-    const std::vector<framework::OperatorBase *> &ifelse_grad_ops) {
-  std::vector<OpVariant> fwd_ops, bwd_ops;
-  fwd_ops.reserve(ifelse_ops.size());
-  for (auto *op : ifelse_ops) {
-    fwd_ops.emplace_back(op);
-  }
-
-  bwd_ops.reserve(ifelse_grad_ops.size());
-  for (auto *op : ifelse_grad_ops) {
-    bwd_ops.emplace_back(op);
-  }
-
-  PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl(
-      program, &fwd_ops, &bwd_ops);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h
deleted file mode 100644
index f7dfba6f364e197a97cc5e061e42cd5cc84309db..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/controlflow/conditional_block_op.h"
-
-namespace paddle {
-namespace operators {
-
-void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-    const framework::ProgramDesc &program, int block_id,
-    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops);
-
-void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-    const framework::ProgramDesc &program,
-    const std::vector<framework::OperatorBase *> &ifelse_ops,
-    const std::vector<framework::OperatorBase *> &ifelse_grad_ops);
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc
deleted file mode 100644
index 0dfed7f5cc1e929c1fb566df1a7dfb4b2450323b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-class FeedOp : public framework::OperatorBase {
- public:
-  FeedOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // get device context from pool
-    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-
-    auto feed_var_name = Input("X");
-    auto *feed_var = scope.FindVar(feed_var_name);
-
-    PADDLE_ENFORCE(feed_var != nullptr,
-                   "Cannot find feed_var in scope, feed_var_name is %s",
-                   feed_var_name);
-
-    auto out_name = this->Output("Out");
-    auto *out_var = scope.FindVar(out_name);
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Cannot find out_var in scope, out_var_name is %s",
-                   out_name);
-
-    auto col = Attr<int>("col");
-
-    VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var "
-            << out_name;
-
-    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
-    PADDLE_ENFORCE_LT(static_cast<size_t>(col), feed_list.size());
-    auto &feed_item = feed_list.at(static_cast<size_t>(col));
-    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
-
-    if (platform::is_same_place(feed_item.place(), place)) {
-      out_item->ShareDataWith(feed_item);
-    } else {
-      framework::TensorCopy(feed_item, place, *dev_ctx, out_item);
-    }
-    out_item->set_lod(feed_item.lod());
-  }
-};
-
-class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of feed op");
-    AddOutput("Out", "The output of feed op");
-    AddAttr<int>("col", "(int) The column of feed");
-    AddComment(R"DOC(
-Feed Operator.
-
-It should not be configured by users directly.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(feed, paddle::operators::FeedOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::FeedOpInfoMaker);
diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc
deleted file mode 100644
index 39fdf07f051da85413f5f1470fb136ff7b063a8c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/fetch_op.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-
-class FetchOp : public framework::OperatorBase {
- public:
-  FetchOp(const std::string &type, const framework::VariableNameMap &inputs,
-          const framework::VariableNameMap &outputs,
-          const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto fetch_var_name = Input("X");
-    auto *fetch_var = scope.FindVar(fetch_var_name);
-    PADDLE_ENFORCE(fetch_var != nullptr,
-                   "Cannot find fetch variable in scope, fetch_var_name is %s",
-                   fetch_var_name);
-
-    auto out_name = this->Output("Out");
-    auto *out_var = scope.FindVar(out_name);
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Cannot find out_var in scope, out_var_name is %s",
-                   out_name);
-
-    auto col = static_cast<size_t>(Attr<int>("col"));
-
-    auto *fetch_list = out_var->GetMutable<framework::FeedFetchList>();
-    auto &src_item = fetch_var->Get<framework::FeedFetchType>();
-
-    if (col >= fetch_list->size()) {
-      fetch_list->resize(col + 1);
-    }
-    auto &dst_item = fetch_list->at(col);
-
-    // FIXME(yuyang18): Should we assume the fetch operator always generate
-    // CPU outputs?
-    if (src_item.IsInitialized() && src_item.numel() > 0) {
-      // Conversion from MKL-DNN to Paddle
-      if (src_item.layout() == framework::DataLayout::kMKLDNN) {
-        framework::Tensor out;
-        framework::innerTransDataLayoutFromMKLDNN(
-            src_item.layout(), framework::DataLayout::kNCHW, src_item, &out,
-            platform::CPUPlace());
-        TensorCopySync(out, platform::CPUPlace(), &dst_item);
-      } else {
-        TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
-      }
-    } else {
-      // Not copy, if the src tensor is empty.
-      dst_item.clear();
-      dst_item.Resize({0});
-    }
-    dst_item.set_lod(src_item.lod());
-
-    VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
-  }
-};
-
-class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of fetch op");
-    AddOutput("Out", "The output of fetch op");
-    AddAttr<int>("col", "(int) The column of fetch");
-    AddComment(R"DOC(
-Fetch Operator.
-
-It should not be configured by users directly.
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(fetch, paddle::operators::FetchOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::FetchOpInfoMaker);
diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc
deleted file mode 100644
index fa77f97419b6d605e478709e13413606ff124572..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/gpu_info.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-static size_t CUDADevCount() {
-#ifdef PADDLE_WITH_CUDA
-  return platform::GetCUDADeviceCount();
-#else
-  return 0UL;
-#endif
-}
-
-class GetPlacesOp : public framework::OperatorBase {
- public:
-  GetPlacesOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    bool is_gpu;
-    if (Attr<std::string>("device_type") == "AUTO") {
-      is_gpu = platform::is_gpu_place(place);
-    } else {
-      is_gpu = Attr<std::string>("device_type") == "CUDA";
-    }
-    auto device_count = static_cast<size_t>(Attr<int>("device_count"));
-    if (device_count == 0) {
-      device_count =
-          is_gpu ? CUDADevCount() : std::thread::hardware_concurrency();
-    }
-    PADDLE_ENFORCE_NE(device_count, 0UL, "Cannot indicate %s device count",
-                      is_gpu ? "GPU" : "CPU");
-
-    auto out_var_name = Output("Out");
-    auto &places =
-        *(detail::Ref(scope.FindVar(out_var_name),
-                      "Output variable %s cannot be found", out_var_name)
-              .GetMutable<platform::PlaceList>());
-    places.reserve(device_count);
-    if (is_gpu) {
-      PADDLE_ENFORCE_LE(device_count, CUDADevCount(),
-                        "Only %d CUDA devices found, cannot set to %d",
-                        CUDADevCount(), device_count);
-      for (size_t i = 0; i < device_count; ++i) {
-        places.emplace_back(platform::CUDAPlace(static_cast<int>(i)));
-      }
-    } else {
-      for (size_t i = 0; i < device_count; ++i) {
-        places.emplace_back(platform::CPUPlace());
-      }
-    }
-  }
-};
-
-class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput("Out", "vector of Place");
-    AddAttr<int>("device_count", "device count").SetDefault(0);
-    AddAttr<std::string>("device_type", "device type")
-        .InEnum({"CUDA", "CPU", "AUTO"})
-        .SetDefault("AUTO");
-    AddComment(R"DOC(
-Returns a list of places based on arguments. The list will be used for parallel
-execution.
-)DOC");
-  }
-};
-
-class GetPlacesInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    for (auto &o_name : ctx->Output("Out")) {
-      ctx->SetType(o_name, framework::proto::VarType::PLACE_LIST);
-    }
-  }
-};
-
-class GetPlacesInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    // Do nothing
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker,
-                  ops::GetPlacesInferVarType, ops::GetPlacesInferShape,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc
deleted file mode 100644
index 37a82a8067f84722fc37e2469c739faf25f7540b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/controlflow/logical_op.h"
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-template <typename OpComment>
-class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    OpComment comment;
-    AddInput("X",
-             string::Sprintf("(LoDTensor) Left hand operand of %s operator",
-                             comment.type));
-    AddInput("Y",
-             string::Sprintf("(LoDTensor) Right hand operand of %s operator",
-                             comment.type));
-    AddOutput("Out", string::Sprintf(
-                         "(LoDTensor) n-dim bool tensor. Each element is %s",
-                         comment.equation));
-    AddComment(string::Sprintf(R"DOC(%s Operator
-
-It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean tensors.
-Each element of Out is calculated by %s
-)DOC",
-                               comment.type, comment.equation));
-  }
-};
-
-template <typename OpComment>
-class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    OpComment comment;
-    AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator",
-                                  comment.type));
-    AddOutput("Out", string::Sprintf(
-                         "(LoDTensor) n-dim bool tensor. Each element is %s",
-                         comment.equation));
-    AddComment(string::Sprintf(R"DOC(%s Operator
-
-It operates element-wise on X, and returns the Out. X and Out are N-dim boolean tensors.
-Each element of Out is calculated by %s
-)DOC",
-                               comment.type, comment.equation));
-  }
-};
-
-template <typename OpComment>
-class BinaryLogicalOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    OpComment comment;
-    PADDLE_ENFORCE(context->HasInput("X"),
-                   "Input(X) of %s operator must not be null", comment.type);
-    PADDLE_ENFORCE(context->HasInput("Y"),
-                   "Input(Y) of %s operator must not be null", comment.type);
-    auto dim_x = context->GetInputDim("X");
-    auto dim_y = context->GetInputDim("Y");
-
-    int product_x = framework::product(dim_x);
-    int product_y = framework::product(dim_y);
-    bool check = context->IsRuntime() || (product_x >= 0 && product_y >= 0);
-    if (check) {
-      PADDLE_ENFORCE_EQ(
-          product_x, product_y,
-          "The number of elements in X and Y should be same, %d != %d",
-          product_x, product_y);
-    }
-
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-    context->ShareLoD("X", "Out");
-  }
-};
-
-template <typename OpComment>
-class UnaryLogicalOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    OpComment comment;
-    PADDLE_ENFORCE(context->HasInput("X"),
-                   "Input(X) of %s operator must not be null", comment.type);
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-    context->ShareLoD("X", "Out");
-  }
-};
-
-class LogicalOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
-    // LogicalOp kernel's device type is decided by input tensor place
-    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
-    return kt;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_BINARY_LOGICAL_OP(op_type, _equation)                     \
-  struct _##op_type##Comment {                                             \
-    static char type[];                                                    \
-    static char equation[];                                                \
-  };                                                                       \
-  char _##op_type##Comment::type[]{#op_type};                              \
-  char _##op_type##Comment::equation[]{_equation};                         \
-  REGISTER_OPERATOR(                                                       \
-      op_type, ::paddle::operators::LogicalOp,                             \
-      ::paddle::operators::BinaryLogicalOpProtoMaker<_##op_type##Comment>, \
-      ::paddle::operators::BinaryLogicalOpInferShape<_##op_type##Comment>, \
-      ::paddle::framework::EmptyGradOpMaker);
-
-#define REGISTER_UNARY_LOGICAL_OP(op_type, _equation)                     \
-  struct _##op_type##Comment {                                            \
-    static char type[];                                                   \
-    static char equation[];                                               \
-  };                                                                      \
-  char _##op_type##Comment::type[]{#op_type};                             \
-  char _##op_type##Comment::equation[]{_equation};                        \
-  REGISTER_OPERATOR(                                                      \
-      op_type, ::paddle::operators::LogicalOp,                            \
-      ::paddle::operators::UnaryLogicalOpProtoMaker<_##op_type##Comment>, \
-      ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \
-      ::paddle::framework::EmptyGradOpMaker);
-
-REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$");
-REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU,
-                               paddle::operators::LogicalAndFunctor);
-REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$");
-REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU,
-                               paddle::operators::LogicalOrFunctor);
-REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
-REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
-                              paddle::operators::LogicalNotFunctor);
-REGISTER_BINARY_LOGICAL_OP(logical_xor,
-                           "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
-REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
-                               paddle::operators::LogicalXorFunctor);
diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu
deleted file mode 100644
index 7ca54b488bfbb260c422941b82145f092a150be7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/logical_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/controlflow/logical_op.h"
-
-REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA,
-                               paddle::operators::LogicalAndFunctor);
-REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA,
-                               paddle::operators::LogicalOrFunctor);
-REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA,
-                              paddle::operators::LogicalNotFunctor);
-REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA,
-                               paddle::operators::LogicalXorFunctor);
diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h
deleted file mode 100644
index 4a83e0fda6e4ecdb1112f096eb37159337c37147..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/logical_op.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include <type_traits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct LogicalAndFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; }
-};
-
-template <typename T>
-struct LogicalOrFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; }
-};
-
-template <typename T>
-struct LogicalNotFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a) const { return !a; }
-};
-
-template <typename T>
-struct LogicalXorFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const {
-    return (a || b) && !(a && b);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class BinaryLogicalOpKernel
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEM_TYPE;
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* y = context.Input<framework::Tensor>("Y");
-    auto* out = context.Output<framework::Tensor>("Out");
-    Functor binary_func;
-    platform::Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x->data<T>(),
-          x->data<T>() + x->numel(), y->data<T>(),
-          out->mutable_data<bool>(context.GetPlace()), binary_func);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class UnaryLogicalOpKernel
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEM_TYPE;
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    Functor unary_func;
-    platform::Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x->data<T>(),
-          x->data<T>() + x->numel(),
-          out->mutable_data<bool>(context.GetPlace()), unary_func);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \
-  REGISTER_OP_##dev##_KERNEL(                                 \
-      op_type, ::paddle::operators::BinaryLogicalOpKernel<    \
-                   ::paddle::platform::dev##DeviceContext, functor<bool>>);
-
-#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \
-  REGISTER_OP_##dev##_KERNEL(                                \
-      op_type, ::paddle::operators::UnaryLogicalOpKernel<    \
-                   ::paddle::platform::dev##DeviceContext, functor<bool>>);
diff --git a/paddle/fluid/operators/controlflow/op_variant.cc b/paddle/fluid/operators/controlflow/op_variant.cc
deleted file mode 100644
index d6eea8c4c8d4b0d006185f3c0515f21f57c0e80d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/op_variant.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/controlflow/op_variant.h"
-
-namespace paddle {
-namespace operators {
-
-struct InputsVisitor
-    : public boost::static_visitor<const framework::VariableNameMap *> {
-  template <typename OpType>
-  const framework::VariableNameMap *operator()(const OpType *op) const {
-    return &(op->Inputs());
-  }
-};
-
-struct OutputsVisitor
-    : public boost::static_visitor<const framework::VariableNameMap *> {
-  template <typename OpType>
-  const framework::VariableNameMap *operator()(const OpType *op) const {
-    return &(op->Outputs());
-  }
-};
-
-struct AttributeMapVisitor
-    : public boost::static_visitor<const framework::AttributeMap *> {
-  const framework::AttributeMap *operator()(const framework::OpDesc *op) const {
-    return &(op->GetAttrMap());
-  }
-
-  const framework::AttributeMap *operator()(
-      const framework::OperatorBase *op) const {
-    return &(op->Attrs());
-  }
-};
-
-struct RawPointerVisitor : public boost::static_visitor<const void *> {
-  template <typename OpType>
-  const void *operator()(const OpType *op) const {
-    return op;
-  }
-};
-
-const framework::VariableNameMap &OpVariant::Inputs() const {
-  return *boost::apply_visitor(InputsVisitor(), op_);
-}
-
-const framework::VariableNameMap &OpVariant::Outputs() const {
-  return *boost::apply_visitor(OutputsVisitor(), op_);
-}
-
-const framework::AttributeMap &OpVariant::Attrs() const {
-  return *boost::apply_visitor(AttributeMapVisitor(), op_);
-}
-
-const void *OpVariant::RawPointer() const {
-  return boost::apply_visitor(RawPointerVisitor(), op_);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h
deleted file mode 100644
index 26c70589f2677eaade9f886aed28f8b8f69541ad..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/op_variant.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace operators {
-
-// OpVariant is a wrapper class of OpDesc and OperatorBase pointer
-// So that API would be the same.
-class OpVariant {
- public:
-  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
-
-  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
-
-  const framework::VariableNameMap &Inputs() const;
-
-  const framework::VariableNameMap &Outputs() const;
-
-  const framework::AttributeMap &Attrs() const;
-
-  const void *RawPointer() const;
-
-  template <typename AttrType>
-  const AttrType &Attr(const std::string &name) const {
-    auto &attrs = Attrs();
-    auto it = attrs.find(name);
-    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
-    return boost::get<AttrType>(it->second);
-  }
-
-  bool operator==(const OpVariant &other) const {
-    return RawPointer() == other.RawPointer();
-  }
-
-  int which() const { return static_cast<int>(op_.which()); }
-
-  struct Hasher {
-    size_t operator()(const OpVariant &op) const {
-      return reinterpret_cast<size_t>(op.RawPointer());
-    }
-  };
-
- private:
-  const boost::variant<const framework::OperatorBase *,
-                       const framework::OpDesc *>
-      op_;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
deleted file mode 100644
index d2bb68272dff46e36349baf23fff88433950b3fd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+++ /dev/null
@@ -1,260 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
-
-#include <algorithm>
-#include <string>
-#include <unordered_set>
-#include <utility>
-
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/recurrent_op.h"
-
-namespace paddle {
-namespace operators {
-
-static bool IsMatchedRecurrentOpAndRecurrentGradOp(const OpVariant &fwd_op,
-                                                   const OpVariant &grad_op) {
-  return fwd_op.Inputs().at(RecurrentBase::kInputs) ==
-             grad_op.Inputs().at(RecurrentBase::kInputs) &&
-         fwd_op.Outputs().at(RecurrentBase::kOutputs) ==
-             grad_op.Inputs().at(RecurrentBase::kOutputs);
-}
-
-// Returns whether the variable is skippable in forward recurrent op
-// The variable is skippable in recurrent_op when the variable used in
-// recurrent_grad is not from grad_block.
-static bool IsSkippableVar(const std::string &name,
-                           framework::BlockDesc *grad_block) {
-  return name != framework::kEmptyVarName && !grad_block->HasVar(name);
-}
-
-static void ClearSkipVars(const OpVariant &op) {
-  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
-  std::vector<std::string> &attr_skip_vars =
-      boost::get<std::vector<std::string>>(
-          attrs[RecurrentBase::kSkipEagerDeletionVars]);
-  attr_skip_vars.clear();
-}
-
-// Add skip vars into op's attribute
-template <class Container>
-static void AddSkipVars(const OpVariant &op, const Container &skip_vars) {
-  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
-  VLOG(2) << "Prepare to add " << skip_vars.size()
-          << " skip var(s): " << paddle::string::join_strings(skip_vars, ' ');
-  std::vector<std::string> &attr_skip_vars =
-      boost::get<std::vector<std::string>>(
-          attrs[RecurrentBase::kSkipEagerDeletionVars]);
-  attr_skip_vars.insert(attr_skip_vars.end(), skip_vars.cbegin(),
-                        skip_vars.cend());
-}
-
-// Find all ops and grad ops with given type name. The ops and grad ops
-// may locate in different blocks so we should traverse all blocks in the
-// program and find them out
-static void FindAllOpAndGradOp(const framework::ProgramDesc &program,
-                               OpAndGradOpPair *op_and_grad_op,
-                               const std::string &type_name,
-                               const std::string &backward_type_name) {
-  OpVariantSet &ops = op_and_grad_op->first;
-  OpVariantSet &grad_ops = op_and_grad_op->second;
-
-  PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(),
-                    "There are extra grad ops in the graph or program");
-
-  for (size_t i = 1; i < program.Size(); ++i) {
-    auto &block = program.Block(i);
-    for (size_t j = 0; j < block.OpSize(); ++j) {
-      auto *op = block.Op(j);
-      if (op->Type() == type_name) {
-        ops.emplace(op);
-      } else if (op->Type() == backward_type_name) {
-        grad_ops.emplace(op);
-      }
-    }
-  }
-
-  PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(),
-                    "There are extra grad ops in the graph or program");
-}
-
-// Returns GradVarName of input var names
-static std::vector<std::string> GradVarLists(
-    const std::vector<std::string> &var_names) {
-  std::vector<std::string> retv;
-  retv.reserve(var_names.size());
-  std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv),
-                 framework::GradVarName);
-  return retv;
-}
-
-// Add memory vars in recurrent op as skip vars.
-static void AddOpMemVarsAsSkip(const OpVariant &op, bool set_grad_mem_vars) {
-  bool has_state = op.Attr<bool>(RecurrentBase::kHasStates);
-  if (has_state) {
-    std::unordered_set<std::string> skip_vars;
-
-    auto &mem_vars = op.Attr<std::vector<std::string>>(RecurrentBase::kStates);
-    skip_vars.insert(mem_vars.begin(), mem_vars.end());
-
-    auto &pre_mem_vars =
-        op.Attr<std::vector<std::string>>(RecurrentBase::kExStates);
-    skip_vars.insert(pre_mem_vars.begin(), pre_mem_vars.end());
-
-    if (set_grad_mem_vars) {
-      auto mem_grad_vars = GradVarLists(mem_vars);
-      skip_vars.insert(mem_grad_vars.begin(), mem_grad_vars.end());
-      auto pre_mem_grad_vars = GradVarLists(pre_mem_vars);
-      skip_vars.insert(pre_mem_grad_vars.begin(), pre_mem_grad_vars.end());
-    }
-    AddSkipVars(op, skip_vars);
-  }
-}
-
-// Set outputs and memory vars of the input forward op as skip vars
-static void SetRecurrentForwardOpOnlySkipVarAttr(const OpVariant &fwd_op) {
-  ClearSkipVars(fwd_op);
-
-  AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false);
-  auto &output_vars = fwd_op.Outputs().at(RecurrentBase::kOutputs);
-  AddSkipVars(fwd_op, output_vars);
-}
-
-// Set skip vars of matched recurrent op and recurrent_grad op
-static void SetRecurrentOpAndRecurrentGradOpSkipVarAttr(
-    const OpVariant &fwd_op, const OpVariant &bwd_op) {
-  // Find all skippable variables in forward recurrent_op
-  ClearSkipVars(fwd_op);
-  AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false);
-
-  auto *grad_block =
-      bwd_op.Attr<framework::BlockDesc *>(RecurrentBase::kStepBlock);
-  std::unordered_set<std::string> fwd_skip_vars;
-  for (auto *op_desc : grad_block->AllOps()) {
-    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
-      if (IsSkippableVar(in_arg_name, grad_block)) {
-        fwd_skip_vars.insert(in_arg_name);
-      }
-    }
-    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
-      if (IsSkippableVar(out_arg_name, grad_block)) {
-        fwd_skip_vars.insert(out_arg_name);
-      }
-    }
-  }
-  AddSkipVars(fwd_op, fwd_skip_vars);
-
-  // Find all skippable variables in recurrent_grad_op
-  // The skippable variables are those which would be used across time steps
-  ClearSkipVars(bwd_op);
-  AddOpMemVarsAsSkip(bwd_op, /* set_grad_mem_vars = */ true);
-  std::unordered_set<std::string> bwd_skip_vars;
-
-  auto &fwd_input = fwd_op.Inputs().at(RecurrentBase::kInputs);
-  auto &in_grads =
-      bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kInputs));
-
-  PADDLE_ENFORCE_EQ(
-      fwd_input.size(), in_grads.size(),
-      "Backward input gradient number does not match forward input number.");
-  for (size_t i = 0; i < in_grads.size(); ++i) {
-    if (in_grads[i] == framework::kEmptyVarName) {
-      continue;
-    }
-    bwd_skip_vars.insert(in_grads[i]);
-    bwd_skip_vars.insert(framework::GradVarName(fwd_input[i]));
-  }
-
-  auto &fwd_param = fwd_op.Inputs().at(RecurrentBase::kParameters);
-  auto &param_grads =
-      bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kParameters));
-  PADDLE_ENFORCE_EQ(fwd_param.size(), param_grads.size(),
-                    "Backward parameter gradient number does not match forward "
-                    "parameter number.");
-  for (size_t i = 0; i < fwd_param.size(); ++i) {
-    if (param_grads[i] == framework::kEmptyVarName) {
-      continue;
-    }
-    bwd_skip_vars.insert(param_grads[i]);
-    bwd_skip_vars.insert(framework::GradVarName(fwd_param[i]));
-  }
-
-  AddSkipVars(bwd_op, bwd_skip_vars);
-}
-
-void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
-    const framework::ProgramDesc &program, int block_id,
-    const std::vector<std::unique_ptr<paddle::framework::OperatorBase>>
-        &all_ops) {
-  // If block_id is not 0, returns
-  // This is because all recurrent_ops and recurrent_grad_ops in the whole
-  // program would be processed when block_id is 0 (i.e. when Executor::Run()
-  // or ParallelExecutor constructs).
-
-  // What's more, all recurrent_ops and recurrent_grad_ops must be processed
-  // when block_id is zero. If not, recurrent_op may run first and erase
-  // variables
-  // used in recurrent_grad_op, and in this moment, recurrent_grad_ops may be
-  // not constructed yet.
-  if (block_id != 0) return;
-
-  OpAndGradOpPair op_pair;
-  for (auto &op : all_ops) {
-    if (op->Type() == "recurrent") {
-      op_pair.first.emplace(op.get());
-    } else if (op->Type() == "recurrent_grad") {
-      op_pair.second.emplace(op.get());
-    }
-  }
-  PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(program, &op_pair);
-}
-
-void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
-    const framework::ProgramDesc &program, OpAndGradOpPair *op_pair) {
-  // Find all ops and grad ops at all blocks
-  FindAllOpAndGradOp(program, op_pair, "recurrent", "recurrent_grad");
-
-  OpVariantSet &recurrent_ops = op_pair->first;
-  OpVariantSet &recurrent_grad_ops = op_pair->second;
-
-  VLOG(2) << "Found recurrent op num: " << recurrent_ops.size()
-          << ", recurrent grad op num: " << recurrent_grad_ops.size();
-
-  if (recurrent_ops.empty()) {
-    return;
-  }
-
-  for (auto &bwd_op : recurrent_grad_ops) {
-    const OpVariant *matched_fwd_op = nullptr;
-    for (auto &fwd_op : recurrent_ops) {
-      if (IsMatchedRecurrentOpAndRecurrentGradOp(fwd_op, bwd_op)) {
-        PADDLE_ENFORCE(matched_fwd_op == nullptr,
-                       "Found multiple matched recurrent op");
-        matched_fwd_op = &fwd_op;
-      }
-    }
-    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, "Cannot find matched forward op");
-    SetRecurrentOpAndRecurrentGradOpSkipVarAttr(*matched_fwd_op, bwd_op);
-    recurrent_ops.erase(*matched_fwd_op);
-  }
-
-  for (auto &fwd_op : recurrent_ops) {
-    SetRecurrentForwardOpOnlySkipVarAttr(fwd_op);
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.h b/paddle/fluid/operators/controlflow/recurrent_op_helper.h
deleted file mode 100644
index aacca0762ca1d45634d36da572448dae7e9fe195..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/controlflow/op_variant.h"
-#include "paddle/fluid/operators/recurrent_op.h"
-#include "paddle/fluid/platform/variant.h"
-#include "paddle/fluid/string/string_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using OpVariantSet = std::unordered_set<OpVariant, OpVariant::Hasher>;
-using OpAndGradOpPair = std::pair<OpVariantSet, OpVariantSet>;
-
-// Set vars to skip eager deletion on input recurrent and recurrent_grad for
-// preparing safe eager deletion. Input contains all recurrent and
-// recurrent_grad ops at block 0 and the function will find all recurrent and
-// recurrent_grad ops across blocks.
-void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
-    const framework::ProgramDesc &program, OpAndGradOpPair *op_pair);
-
-// Set vars to skip eager deletion on input recurrent and recurrent_grad for
-// preparing safe eager deletion. The input block_id must be 0 and caller can
-// input all ops in the block. The function will find all recurrent and
-// recurrent_grad ops across blocks.
-void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
-    const framework::ProgramDesc &program, int block_id,
-    const std::vector<std::unique_ptr<paddle::framework::OperatorBase>>
-        &all_ops);
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
deleted file mode 100644
index 2ca5242c5c935e2156bf95689c53b0c29809c235..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/array_operator.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-namespace paddle {
-namespace operators {
-
-class WriteToArrayOp : public ArrayOp {
- public:
-  WriteToArrayOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
-      : ArrayOp(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto *x = scope.FindVar(Input("X"));
-    if (x == nullptr) return;
-    auto &x_tensor = x->Get<framework::LoDTensor>();
-    size_t offset = GetOffset(scope, place);
-    auto *out =
-        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
-    if (offset >= out->size()) {
-      VLOG(10) << "Resize " << Output("Out") << " from " << out->size()
-               << " to " << offset + 1;
-      out->resize(offset + 1);
-    }
-    auto *out_tensor = &out->at(offset);
-    out_tensor->set_lod(x_tensor.lod());
-    if (x_tensor.memory_size() > 0) {
-      platform::DeviceContextPool &pool =
-          platform::DeviceContextPool::Instance();
-      auto &dev_ctx = *pool.Get(place);
-
-      TensorCopy(x_tensor, place, dev_ctx, out_tensor);
-    } else {
-      VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
-                  "nothing has been written to output array["
-               << offset << "].";
-    }
-  }
-};
-
-class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
-    AddInput(
-        "I",
-        "(Tensor) the subscript index in tensor array. The number of element "
-        "should be 1");
-    AddOutput("Out", "(TensorArray) the tensor array will be written");
-    AddComment(R"DOC(
-WriteToArray Operator.
-
-This operator writes a LoDTensor to a LoDTensor array.
-
-Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The
-equation is
-
-$$A[i] = T$$
-
-)DOC");
-  }
-};
-
-class WriteToArrayInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
-    if (context->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
-                        "The number of element of subscript index must be 1");
-    }
-    if (!context->HasInput("X")) {
-      return;
-    }
-    PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError());
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-  }
-
- protected:
-  virtual const char *NotHasXError() const { return "Must set the lod tensor"; }
-
-  virtual const char *NotHasOutError() const {
-    return "Must set the lod tensor array";
-  }
-};
-
-class WriteToArrayInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto x_name = ctx->Input("X")[0];
-    auto out_name = ctx->Output("Out")[0];
-    VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
-    ctx->SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY);
-    if (ctx->HasVar(x_name)) {
-      ctx->SetDataType(out_name, ctx->GetDataType(x_name));
-    }
-  }
-};
-
-class ReadFromArrayOp : public ArrayOp {
- public:
-  ReadFromArrayOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : ArrayOp(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto *x = scope.FindVar(Input("X"));
-    PADDLE_ENFORCE(x != nullptr, "X must be set");
-    auto &x_array = x->Get<framework::LoDTensorArray>();
-    auto *out = scope.FindVar(Output("Out"));
-    PADDLE_ENFORCE(out != nullptr, "Out must be set");
-    size_t offset = GetOffset(scope, place);
-    if (offset < x_array.size()) {
-      auto *out_tensor = out->GetMutable<framework::LoDTensor>();
-      platform::DeviceContextPool &pool =
-          platform::DeviceContextPool::Instance();
-      auto &dev_ctx = *pool.Get(place);
-      framework::TensorCopy(x_array[offset], place, dev_ctx, out_tensor);
-      out_tensor->set_lod(x_array[offset].lod());
-    } else {
-      VLOG(10) << "offset " << offset << " >= " << x_array.size();
-    }
-  }
-};
-
-class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(TensorArray) the array will be read from.");
-    AddInput("I",
-             "(Tensor) the subscript index in tensor array. The number of "
-             "element should be 1");
-    AddOutput("Out", "(LoDTensor) the tensor will be read from.");
-    AddComment(R"DOC(
-ReadFromArray Operator.
-
-Read a LoDTensor from a LoDTensor Array.
-
-Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The
-equation is
-
-$$T = A[i]$$
-
-)DOC");
-  }
-};
-
-class ReadFromArrayInferShape : public WriteToArrayInferShape {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    WriteToArrayInferShape::operator()(context);
-    if (!context->HasInput("X")) {
-      return;
-    }
-
-    // FIXME: just for compile time.
-    if (!context->IsRuntime()) {
-      context->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-
- protected:
-  const char *NotHasXError() const override {
-    return "The input array X must be set";
-  }
-  const char *NotHasOutError() const override {
-    return "The output tensor out must be set";
-  }
-};
-
-class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("read_from_array");
-    grad_op->SetInput("I", Input("I"));
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("write_to_array");
-    grad_op->SetInput("I", Input("I"));
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(write_to_array, ops::WriteToArrayOp,
-                  ops::WriteToArrayInferShape, ops::WriteToArrayOpProtoMaker,
-                  ops::WriteToArrayGradMaker, ops::WriteToArrayInferVarType);
-REGISTER_OPERATOR(read_from_array, ops::ReadFromArrayOp,
-                  ops::ReadFromArrayInferShape, ops::ReadFromArrayProtoMaker,
-                  ops::ReadFromArrayGradMaker);
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
deleted file mode 100644
index cd11e87c9327df25dad572758c1d2e04dbf7cc93..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ /dev/null
@@ -1,465 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/controlflow/while_op_helper.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-
-namespace paddle {
-namespace operators {
-
-using StepScopeVar = std::vector<framework::Scope *>;
-using LoDTensor = framework::LoDTensor;
-
-namespace {  // NOLINT
-static std::string GetSkipEagerDeletionVarsDebugString(
-    const std::vector<std::string> &vars) {
-  std::string str = "Skip " + std::to_string(vars.size()) +
-                    " var(s) in eager deletion mode: ";
-  for (auto &var : vars) {
-    str.append(var);
-    str.push_back(' ');
-  }
-  return str;
-}
-}  // NOLINT
-
-class WhileOp : public framework::OperatorBase {
- public:
-  WhileOp(const std::string &type, const framework::VariableNameMap &inputs,
-          const framework::VariableNameMap &outputs,
-          const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
-
-    auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
-
-    framework::Executor executor(dev_place);
-    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
-
-    auto *program = block->Program();
-
-    auto step_scopes =
-        scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
-    PADDLE_ENFORCE_EQ(step_scopes->size(), 0, "The StepScope should be empty.");
-    PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
-                   "Condition of while op must in CPU memory.");
-
-    bool is_test = Attr<bool>("is_test");
-    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
-    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
-
-    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
-    if (!is_test) {
-      while (cond.data<bool>()[0]) {
-        auto &current_scope = scope.NewScope();
-        step_scopes->push_back(&current_scope);
-        executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
-                                    true);
-      }
-    } else {
-      auto &current_scope = scope.NewScope();
-      executor.CreateVariables(*program, &current_scope, block->ID());
-      while (cond.data<bool>()[0]) {
-        for (auto &name : current_scope.LocalVarNames()) {
-          auto *var = current_scope.Var(name);
-          if (var->IsType<framework::LoDTensor>()) {
-            // Clear all lod information for all lod_tensors.
-            auto *t = var->GetMutable<framework::LoDTensor>();
-            framework::LoD empty_lod;
-            t->set_lod(empty_lod);
-          } else if (var->IsType<framework::LoDTensorArray>()) {
-            // Clear elements of all tensor arrays.
-            auto *t = var->GetMutable<framework::LoDTensorArray>();
-            t->clear();
-          }
-        }
-        executor.RunPreparedContext(ctx.get(), &current_scope, false, false,
-                                    false);
-      }
-      scope.DeleteScope(&current_scope);
-    }
-  }
-};
-
-class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kX,
-             "A set of variables, which are required by operators inside the "
-             "block of While Op.")
-        .AsDuplicable();
-    AddInput(
-        kCondition,
-        "(Bool) An scalar. When it's False, the While Op will be terminated.")
-        .AsDuplicable();
-    AddOutput(kOutputs,
-              "A set of variables, which will be assigned with values "
-              "generated by the operators inside the block of While Op.")
-        .AsDuplicable();
-    AddOutput(kStepScopes,
-              "(StepScopeVar) A vector of local scope, which size equals the "
-              "step number of While Op. The i'th scope storages temporary "
-              "variables generated in the i'th step.");
-    AddAttr<framework::BlockDesc *>(kStepBlock,
-                                    "The step block inside WhileOp");
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars,
-                                      "Vars that would skip eager deletion."
-                                      "Users should not set this manually.")
-        .SetDefault(std::vector<std::string>());
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-
-class WhileGradOp : public framework::OperatorBase {
- public:
-  WhileGradOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    PADDLE_ENFORCE(!Attr<bool>("is_test"),
-                   "GradOp is only callable when is_test is false");
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-    framework::Executor executor(dev_place);
-    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
-    auto *program = block->Program();
-
-    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
-    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
-    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
-
-    auto *step_scopes =
-        scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
-
-    auto outside_og_names = Inputs(framework::GradVarName(kOutputs));
-    auto inside_og_names =
-        Attr<std::vector<std::string>>("original_output_grad");
-
-    PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size());
-
-    for (auto cur_scope_iter = step_scopes->rbegin();
-         cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
-      VLOG(3) << "Start backward at time_step "
-              << cur_scope_iter - step_scopes->rbegin();
-      framework::Scope &cur_scope = **cur_scope_iter;
-      // Link OG from outside to inside
-      for (size_t i = 0; i < outside_og_names.size(); ++i) {
-        auto outside_og_name = outside_og_names[i];
-        auto inside_og_name = inside_og_names[i];
-        VLOG(8) << "Linking outside " << outside_og_name << " --> inside "
-                << inside_og_name;
-        if (scope.FindVar(outside_og_name) == nullptr) {
-          continue;
-        }
-
-        auto &og_outside =
-            detail::Ref(scope.FindVar(outside_og_name),
-                        "Cannot find Outside Gradient %s", outside_og_name);
-        auto &og_inside =
-            detail::Ref(cur_scope.Var(inside_og_name),
-                        "Cannot find inside gradient %s", inside_og_name);
-        if (og_outside.IsType<framework::LoDTensor>()) {
-          auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
-          auto &inside_tensor =
-              detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
-          inside_tensor.set_lod(outside_tensor.lod());
-          inside_tensor.ShareDataWith(outside_tensor);
-        } else if (og_outside.IsType<framework::LoDTensorArray>()) {
-          auto outside_array =
-              og_outside.GetMutable<framework::LoDTensorArray>();
-          auto &inside_array =
-              detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
-          inside_array.clear();
-          inside_array.resize(outside_array->size());
-          VLOG(8) << outside_og_name << " size = " << outside_array->size();
-
-          for (size_t j = 0; j < inside_array.size(); ++j) {
-            if (!outside_array->at(j).IsInitialized()) {
-              outside_array->at(j).Resize({0});
-            }
-            VLOG(8) << j << " " << outside_array->at(j).numel();
-            if (outside_array->at(j).numel() != 0) {
-              inside_array[j].set_lod(outside_array->at(j).lod());
-              inside_array[j].ShareDataWith(outside_array->at(j));
-            } else {
-              PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0);
-            }
-          }
-        } else {
-          PADDLE_THROW("Currently only support LoDTensor and LoDTensorArray.");
-        }
-      }
-      executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false, true,
-                                  true);
-
-      // The Outputs(kXGRAD) contains the names of the gradient of parameters
-      // and inputs.
-      auto &pg_ig_names = Outputs(kXGRAD);
-      auto &p_names = Inputs(kX);
-      PADDLE_ENFORCE_EQ(pg_ig_names.size(), p_names.size());
-      for (size_t param_id = 0; param_id < pg_ig_names.size(); ++param_id) {
-        if (pg_ig_names[param_id] == framework::kEmptyVarName) {
-          continue;  // parameter doesn't have gradient
-        }
-        auto inside_grad_name = framework::GradVarName(p_names[param_id]);
-
-        // for some grad_op, their input doesn't have gradient,
-        // for example lookup_table_grad_op, the input(Idx) doesn't have
-        // gradient.
-        auto pg_ig_var = cur_scope.FindVar(inside_grad_name);
-        PADDLE_ENFORCE(pg_ig_var != nullptr);
-        if (pg_ig_var->IsType<framework::LoDTensorArray>()) {
-          auto pg_ig_lod_t_arr =
-              pg_ig_var->GetMutable<framework::LoDTensorArray>();
-          bool empty = true;
-          for (auto &each : *pg_ig_lod_t_arr) {
-            if (each.numel() != 0) {
-              empty = false;
-              break;
-            }
-          }
-          if (empty) {
-            LOG(WARNING) << pg_ig_names[param_id]
-                         << " is not found in cur_scope.";
-            continue;
-          }
-        }
-
-        //  // TODO(tonyyang-svail): Not sure we need the following
-        //  // If does not compute gradient of that variable inside rnn,
-        //  just
-        //  // continue
-        //  if (local_var_names.find(inside_grad_name) ==
-        //  local_var_names.end()) {
-        //    continue;
-        //  }
-
-        // zero gradient variable in step 0
-        if (cur_scope_iter == step_scopes->rbegin()) {
-          auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
-          PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name);
-          PADDLE_ENFORCE(
-              var->IsType<framework::LoDTensorArray>() ||
-                  var->IsType<LoDTensor>(),
-              "Currently the type of var only can be LoDTensorArray, "
-              "or LoDTensor, but the received var[%s] is %s.",
-              inside_grad_name, framework::ToTypeName(var->Type()));
-
-          if (var->IsType<LoDTensor>()) {
-            auto &inside_tensor = var->Get<framework::LoDTensor>();
-            framework::AttributeMap attrs;
-            attrs["dtype"] = inside_tensor.type();
-            attrs["shape"] = framework::vectorize<int>(inside_tensor.dims());
-            attrs["value"] = 0.0f;
-
-            auto var_name = pg_ig_names[param_id];
-            auto zero_op = framework::OpRegistry::CreateOp(
-                "fill_constant", framework::VariableNameMap{},
-                {{"Out", {var_name}}}, attrs);
-            zero_op->Run(scope, dev_place);
-            scope.FindVar(var_name)
-                ->GetMutable<framework::LoDTensor>()
-                ->set_lod(inside_tensor.lod());
-          }
-        }
-        auto new_inside_name = cur_scope.Rename(inside_grad_name);
-        auto sum_op = framework::OpRegistry::CreateOp(
-            "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}},
-            {{"Out", {pg_ig_names[param_id]}}},
-            framework::AttributeMap{{"use_mkldnn", {false}}});
-        sum_op->Run(cur_scope, dev_place);
-        cur_scope.Rename(new_inside_name, inside_grad_name);
-      }
-      dev_ctx.Wait();
-      const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
-    }
-    step_scopes->clear();
-  }
-};
-
-class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *while_grad = new framework::OpDesc();
-    while_grad->SetType("while_grad");
-    while_grad->SetInput(kX, Input(kX));
-    while_grad->SetInput(kOutputs, Output(kOutputs));
-    while_grad->SetInput(kStepScopes, Output(kStepScopes));
-
-    auto *grad_block = this->grad_block_[0];
-    auto *fwd_block = grad_block->ForwardBlock();
-    auto *parent_block = grad_block->ParentBlock();
-
-    // Not all of IGs will be generated by inner gradient operators of while op.
-    // Ignore IGs that is not generated by the inside block.
-    std::unordered_set<std::string> inner_op_outputs;
-    for (const auto *op : grad_block->AllOps()) {
-      for (auto &oname : op->OutputArgumentNames()) {
-        inner_op_outputs.insert(oname);
-      }
-    }
-    auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
-    for (auto &each_ig : igs) {
-      if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) {
-        VLOG(8) << "Ignore " << each_ig;
-        each_ig = framework::kEmptyVarName;
-      }
-    }
-    while_grad->SetOutput(framework::GradVarName(kX), igs);
-
-    // OG should be re-calculated by step blocks, since many outputs of while op
-    // do not need to calculate gradients.
-    std::unordered_set<std::string> block_ins;
-    block_ins.reserve(Input(kX).size() + Output(kOutputs).size());
-    for (auto &p : Input(kX)) {
-      block_ins.insert(p);
-    }
-    for (auto &o : Output(kOutputs)) {
-      block_ins.insert(o);
-    }
-    std::unordered_set<std::string> output_grads;
-    for (const auto *op : grad_block->AllOps()) {
-      for (auto &input_name : op->InputArgumentNames()) {
-        // If the input of Op has been recorded or is generated by the forward
-        // block, do not make it as input again.
-
-        // The input is located in I/O or other op's outputs or the variable is
-        // located in grad_block's parents
-        if (block_ins.find(input_name) != block_ins.end() ||
-            (fwd_block->FindVarRecursive(input_name) != nullptr ||
-             parent_block->FindVarRecursive(input_name) != nullptr)) {
-          continue;
-        }
-
-        output_grads.insert(input_name);
-      }
-      for (auto &output_name : op->OutputArgumentNames()) {
-        block_ins.insert(output_name);
-      }
-    }
-
-    std::vector<std::string> output_grads_list;
-    output_grads_list.resize(output_grads.size());
-    std::copy(output_grads.begin(), output_grads.end(),
-              output_grads_list.begin());
-    while_grad->SetInput(framework::GradVarName(kOutputs), output_grads_list);
-
-    while_grad->SetAttrMap(this->Attrs());
-    while_grad->SetBlockAttr(kStepBlock, grad_block);
-    // record the original output gradient names, since the gradient name of
-    // while operator could be renamed.
-    while_grad->SetAttr("original_output_grad", output_grads_list);
-
-    while_grad->SetAttr(kSkipEagerDeletionVars, std::vector<std::string>());
-
-    return std::unique_ptr<framework::OpDesc>(while_grad);
-  }
-};
-
-class WhileGradOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto p_names = ctx->Input(kX);
-    auto pg_ig_names = ctx->Output(framework::GradVarName(kX));
-
-    for (size_t i = 0; i < p_names.size(); ++i) {
-      if (ctx->HasVar(pg_ig_names[i])) {
-        VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
-                << " type: " << ctx->GetType(p_names[i]);
-        ctx->SetType(pg_ig_names[i], ctx->GetType(p_names[i]));
-        ctx->SetDataType(pg_ig_names[i], ctx->GetDataType(p_names[i]));
-      }
-    }
-  }
-};
-
-class WhileGradOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    ctx->HasInputs(kX);
-    ctx->HasOutputs(framework::GradVarName(kX));
-    ctx->HasInputs(kOutputs);
-    ctx->HasInputs(framework::GradVarName(kOutputs));
-
-    auto pg_ig_names = ctx->Outputs(kXGRAD);
-    std::vector<framework::InferShapeVarPtr> in_var_ptrs =
-        ctx->GetInputVarPtrs(kX);
-    std::vector<framework::InferShapeVarPtr> out_var_ptrs =
-        ctx->GetOutputVarPtrs(kXGRAD);
-    PADDLE_ENFORCE(in_var_ptrs.size() == out_var_ptrs.size());
-
-    for (size_t i = 0; i < in_var_ptrs.size(); ++i) {
-      if (pg_ig_names[i] == framework::kEmptyVarName) {
-        continue;
-      }
-      if (ctx->IsRuntime()) {
-        framework::Variable *in_var =
-            boost::get<framework::Variable *>(in_var_ptrs[i]);
-        framework::Variable *out_var =
-            boost::get<framework::Variable *>(out_var_ptrs[i]);
-
-        auto type = framework::ToVarType(in_var->Type());
-        if (type == framework::proto::VarType::LOD_TENSOR) {
-          out_var->GetMutable<LoDTensor>()->Resize(
-              in_var->Get<framework::LoDTensor>().dims());
-        } else if (type == framework::proto::VarType::SELECTED_ROWS) {
-          out_var->GetMutable<framework::SelectedRows>()->set_height(
-              in_var->Get<framework::SelectedRows>().GetCompleteDims()[0]);
-        } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
-          PADDLE_THROW("WhileGradOp doesn't support type %d",
-                       static_cast<int>(type));
-        }
-      } else {
-        framework::VarDesc *in_var =
-            boost::get<framework::VarDesc *>(in_var_ptrs[i]);
-        boost::get<framework::VarDesc *>(out_var_ptrs[i])
-            ->SetShape(in_var->GetShape());
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(while, paddle::operators::WhileOp,
-                  paddle::operators::WhileOpMaker,
-                  paddle::operators::WhileGradOpDescMaker);
-REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp,
-                  paddle::operators::WhileGradOpShapeInference,
-                  paddle::operators::WhileGradOpVarTypeInference);
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
deleted file mode 100644
index 8f1e3f60927abc23c18c208efbd77715e40136bc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/controlflow/while_op_helper.h"
-
-#include <string>
-#include <unordered_set>
-#include <utility>
-
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/controlflow/op_variant.h"
-#include "paddle/fluid/string/string_helper.h"
-
-namespace paddle {
-namespace operators {
-
-// Set skip variables of while_op and while_grad_op
-// These variables should be skipped when eager deletion enables.
-// It is because:
-//  1. while_grad_op needs some variables defined in while_op.
-//  2. while_grad_op needs variables from the previous time step.
-static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) {
-  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
-  VLOG(2) << "Prepare to skip " << attr.size()
-          << " var(s): " << string::join_strings(attr, ' ');
-  attrs[kSkipEagerDeletionVars] = std::move(attr);
-}
-
-// Check whether the forward while_op and while_grad_op match
-// The program may have many while_ops.
-static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op,
-                                           const OpVariant &grad_op) {
-  return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) &&
-         fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs);
-}
-
-// Test whether the variable is skippable in forward while_op
-// The variable is skippable in while_op when the variable used in while_grad
-// is not from grad_block.
-static bool IsSkippableVar(const std::string &name,
-                           framework::BlockDesc *grad_block) {
-  return name != framework::kEmptyVarName && !grad_block->HasVar(name);
-}
-
-static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op,
-                                            const OpVariant &bwd_op) {
-  auto *grad_block = bwd_op.Attr<framework::BlockDesc *>(kStepBlock);
-
-  // Find all skippable variables in forward while_op
-  std::unordered_set<std::string> forward_skip_vars;
-  for (auto *op_desc : grad_block->AllOps()) {
-    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
-      if (IsSkippableVar(in_arg_name, grad_block)) {
-        forward_skip_vars.insert(in_arg_name);
-      }
-    }
-
-    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
-      if (IsSkippableVar(out_arg_name, grad_block)) {
-        forward_skip_vars.insert(out_arg_name);
-      }
-    }
-  }
-
-  SetSkipVars(fwd_op, std::vector<std::string>(forward_skip_vars.begin(),
-                                               forward_skip_vars.end()));
-
-  // Find all skippable variables in while_grad_op
-  // The skipped variables are those which would be used across time steps.
-  auto &fwd_input = fwd_op.Inputs().at(kX);
-  auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX));
-  PADDLE_ENFORCE_EQ(
-      fwd_input.size(), in_grads.size(),
-      "Backward input gradient number does not match forward input number.");
-
-  std::unordered_set<std::string> backward_skip_vars;
-  for (size_t i = 0; i < in_grads.size(); ++i) {
-    if (in_grads[i] == framework::kEmptyVarName) {
-      continue;
-    }
-    backward_skip_vars.insert(in_grads[i]);
-    backward_skip_vars.insert(framework::GradVarName(fwd_input[i]));
-  }
-
-  SetSkipVars(bwd_op, std::vector<std::string>(backward_skip_vars.begin(),
-                                               backward_skip_vars.end()));
-}
-
-// Find all while_ops and while_grad_ops in the graph or program
-// The while_grad_op and while_op may located in different blocks
-// So we should traverse all blocks in the program and find them out.
-static void FindAllWhileAndWhileGradOp(const framework::ProgramDesc &program,
-                                       std::vector<OpVariant> *while_ops,
-                                       std::vector<OpVariant> *while_grad_ops) {
-  PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size());
-  for (size_t i = 1; i < program.Size(); ++i) {
-    auto &block = program.Block(i);
-    for (size_t j = 0; j < block.OpSize(); ++j) {
-      auto *op = block.Op(j);
-      if (op->Type() == "while") {
-        while_ops->emplace_back(op);
-      } else if (op->Type() == "while_grad") {
-        while_grad_ops->emplace_back(op);
-      }
-    }
-  }
-
-  PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(),
-                    "There are extra while_grad ops in the graph or program");
-}
-
-static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(
-    const framework::ProgramDesc &program, std::vector<OpVariant> *while_ops,
-    std::vector<OpVariant> *while_grad_ops) {
-  FindAllWhileAndWhileGradOp(program, while_ops, while_grad_ops);
-
-  VLOG(2) << "Found while op num: " << while_ops->size()
-          << ", while grad op num: " << while_grad_ops->size();
-
-  if (while_grad_ops->empty()) {
-    return;
-  }
-
-  std::unordered_set<OpVariant, OpVariant::Hasher> while_op_set(
-      while_ops->begin(), while_ops->end());
-
-  for (auto &bwd_op : *while_grad_ops) {
-    const OpVariant *matched_fwd_op = nullptr;
-    for (auto &fwd_op : while_op_set) {
-      if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) {
-        PADDLE_ENFORCE(matched_fwd_op == nullptr,
-                       "Found multiple matched while ops");
-        matched_fwd_op = &fwd_op;
-      }
-    }
-    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op,
-                            "Cannot find matched forward while op.");
-    ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op);
-    while_op_set.erase(*matched_fwd_op);
-  }
-}
-
-void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
-    const framework::ProgramDesc &program, int block_id,
-    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) {
-  // If block_id is not 0, returns
-  // This is because all while_ops and while_grad_ops in the whole program
-  // would be processed when block_id is 0 (i.e. when Executor::Run() or
-  // ParallelExecutor constructs).
-
-  // What's more, all while_ops and while_grad_ops must be processed when
-  // block_id is zero. If not, while_op may run first and erase variables
-  // used in while_grad_op, and in this moment, while_grad_ops may be not
-  // constructed yet.
-  if (block_id != 0) return;
-
-  std::vector<OpVariant> fwd_ops, bwd_ops;
-  for (auto &op : all_ops) {
-    if (op->Type() == "while") {
-      fwd_ops.emplace_back(op.get());
-    } else if (op->Type() == "while_grad") {
-      bwd_ops.emplace_back(op.get());
-    }
-  }
-  PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(program, &fwd_ops,
-                                                      &bwd_ops);
-}
-
-void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
-    const framework::ProgramDesc &program,
-    const std::vector<framework::OperatorBase *> &while_ops,
-    const std::vector<framework::OperatorBase *> &while_grad_ops) {
-  std::vector<OpVariant> fwd_ops, bwd_ops;
-  fwd_ops.reserve(while_ops.size());
-  for (auto *op : while_ops) {
-    fwd_ops.emplace_back(op);
-  }
-
-  bwd_ops.reserve(while_grad_ops.size());
-  for (auto *op : while_grad_ops) {
-    bwd_ops.emplace_back(op);
-  }
-
-  PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(program, &fwd_ops,
-                                                      &bwd_ops);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h
deleted file mode 100644
index e2cfece658088b8e8b74ae52da4b43b21c01127c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/while_op_helper.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace operators {
-
-static constexpr char kStepBlock[] = "sub_block";
-static constexpr char kCondition[] = "Condition";
-static constexpr char kStepScopes[] = "StepScopes";
-static constexpr char kX[] = "X";
-static constexpr char kXGRAD[] = "X@GRAD";
-static constexpr char kOutputs[] = "Out";
-static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
-
-void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
-    const framework::ProgramDesc &program, int block_id,
-    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops);
-
-void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
-    const framework::ProgramDesc &program,
-    const std::vector<framework::OperatorBase *> &while_ops,
-    const std::vector<framework::OperatorBase *> &while_grad_ops);
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
deleted file mode 100644
index 5f52042419d43f1b3c15762b33bc9f90c2fb1f45..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ /dev/null
@@ -1,436 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/operator_kernel_configs.h"
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
-#include "paddle/fluid/platform/cudnn_desc.h"
-namespace paddle {
-namespace operators {
-
-template <typename T>
-std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
-  out << "[";
-  for (auto const& tmp : v) out << tmp << ",";
-  out << "]";
-  return out;
-}
-
-using framework::AlgorithmsCache;
-
-struct ConvArgs {
-  cudnnHandle_t handle;
-  platform::TensorDescriptor idesc, odesc;
-  platform::FilterDescriptor wdesc;
-  platform::ConvolutionDescriptor cdesc;
-  const framework::Tensor *x, *w, *o;
-
-  // strides
-  std::vector<int> s;
-  // paddings
-  std::vector<int> p;
-  // dilations
-  std::vector<int> d;
-
-  ConvArgs(const framework::Tensor* x, const framework::Tensor* w,
-           const framework::Tensor* o, const std::vector<int> s,
-           const std::vector<int> p, const std::vector<int> d)
-      : x(x), w(w), o(o), s(s), p(p), d(d) {}
-};
-
-template <typename perf_t>
-struct SearchAlgorithm {};
-
-template <>
-struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
-  using perf_t = cudnnConvolutionFwdAlgoPerf_t;
-  using algo_t = cudnnConvolutionFwdAlgo_t;
-
-  template <typename T>
-  static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic, int algo_cache_id,
-                     const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    bool has_got_workspace_size = true;
-    bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF);
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    algo_t algo;
-
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
-      VLOG(5) << "use cudnn_tensor_op_math";
-    } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
-    }
-#endif
-
-    if (!exhaustive) {
-#if CUDNN_VERSION >= 7001
-      int perf_count;
-      int best_algo_idx = 0;
-      std::unique_ptr<perf_t[]> perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]);
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
-          args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-          args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, &perf_count,
-          perf_results.get()));
-      algo = (perf_results.get())[best_algo_idx].algo;
-      workspace_size = GetWorkspaceSize(args, algo);
-
-      if (workspace_size > workspace_size_limit) {
-        has_got_workspace_size = false;
-        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                   "the workspace size request("
-                << workspace_size << ") exceeds the limit("
-                << workspace_size_limit << ")";
-      }
-      if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-            args.handle, args.idesc.desc(), args.wdesc.desc(),
-            args.cdesc.desc(), args.odesc.desc(),
-            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
-            &algo));
-      }
-#else
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-          args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
-#endif
-      VLOG(3) << "choose algo " << algo;
-    } else {
-      AlgorithmsCache<algo_t>& algo_cache =
-          ctx.GetKernelConfig<AlgorithmsCache<algo_t>>(algo_cache_id);
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-
-      auto x_dims = framework::vectorize(args.x->dims());
-      auto w_dims = framework::vectorize(args.w->dims());
-
-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
-               << algo_cache_id << ", x_dims:" << x_dims
-               << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
-               << args.p << ", args.d" << args.d;
-
-      algo = algo_cache.GetAlgorithm(
-          x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
-            int returned_algo_count;
-            std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-
-            auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(
-                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                      args.handle, args.idesc.desc(), args.x->data<T>(),
-                      args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                      args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
-                      kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                      perf_stat.data(), cudnn_workspace_ptr,
-                      workspace_size_limit));
-            };
-            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-
-            VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)";
-            for (int i = 0; i < returned_algo_count; ++i) {
-              const auto& stat = perf_stat[i];
-              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                      << " " << stat.memory;
-            }
-            return perf_stat[0].algo;
-          });
-    }
-    VLOG(3) << "choose algo " << algo;
-    return algo;
-  }
-
-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
-    size_t workspace_size = 0;
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-        args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-        args.odesc.desc(), algo, &workspace_size));
-    return workspace_size;
-  }
-};
-
-template <>
-struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
-  using perf_t = cudnnConvolutionBwdDataAlgoPerf_t;
-  using algo_t = cudnnConvolutionBwdDataAlgo_t;
-
-  template <typename T>
-  static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic, int algo_cache_id,
-                     const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF);
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
-    algo_t algo;
-
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
-      VLOG(5) << "use cudnn_tensor_op_math";
-    } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
-    }
-#endif
-
-    if (!exhaustive && !deterministic) {
-#if CUDNN_VERSION >= 7001
-      int perf_count;
-      int best_algo_idx = 0;
-      std::unique_ptr<perf_t[]> perf_results(
-          new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]);
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
-              args.handle, args.wdesc.desc(), args.odesc.desc(),
-              args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS,
-              &perf_count, perf_results.get()));
-      algo = (perf_results.get())[best_algo_idx].algo;
-
-#if CUDNN_VERSION < 7500
-      int stride_dim = args.x->dims().size() - 2;
-      bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim,
-                                   [=](int n) { return n != 1; });
-      if (blacklist && (static_cast<cudnnConvolutionBwdDataAlgo_t>(
-                            perf_results[best_algo_idx].algo) ==
-                            CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
-                        static_cast<cudnnConvolutionBwdDataAlgo_t>(
-                            perf_results[best_algo_idx].algo) ==
-                            CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
-        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-      }
-#endif
-      workspace_size = GetWorkspaceSize(args, algo);
-      if (workspace_size > workspace_size_limit) {
-        has_got_workspace_size = false;
-        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                   "the workspace size request("
-                << workspace_size << ") exceeds the limit("
-                << workspace_size_limit << ")";
-      }
-      if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(
-            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-                args.handle, args.wdesc.desc(), args.odesc.desc(),
-                args.cdesc.desc(), args.idesc.desc(),
-                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &algo));
-      }
-#else
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-          args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(),
-          args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
-#endif
-    } else if (deterministic) {
-      return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-    } else {
-      AlgorithmsCache<algo_t>& algo_cache =
-          ctx.GetKernelConfig<AlgorithmsCache<algo_t>>(algo_cache_id);
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-
-      auto x_dims = framework::vectorize(args.x->dims());
-      auto w_dims = framework::vectorize(args.w->dims());
-
-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
-               << algo_cache_id << ", x_dims:" << x_dims
-               << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
-               << args.p << ", args.d" << args.d;
-
-      algo = algo_cache.GetAlgorithm(
-          x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
-            int returned_algo_count;
-            std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-
-            auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(
-                  platform::dynload::
-                      cudnnFindConvolutionBackwardDataAlgorithmEx(
-                          args.handle, args.wdesc.desc(), args.w->data<T>(),
-                          args.odesc.desc(), args.o->data<T>(),
-                          args.cdesc.desc(), args.idesc.desc(),
-                          const_cast<T*>(args.x->data<T>()),
-                          kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
-                          perf_stat.data(), cudnn_workspace_ptr,
-                          workspace_size_limit));
-            };
-            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-
-            VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)";
-            for (int i = 0; i < returned_algo_count; ++i) {
-              const auto& stat = perf_stat[i];
-              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                      << " " << stat.memory;
-            }
-
-            return perf_stat[0].algo;
-          });
-    }
-    VLOG(3) << "choose algo " << algo;
-    return algo;
-  }
-
-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
-    size_t workspace_size = 0;
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-            args.handle, args.wdesc.desc(), args.odesc.desc(),
-            args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size));
-    return workspace_size;
-  }
-};
-
-template <>
-struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
-  using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
-  using algo_t = cudnnConvolutionBwdFilterAlgo_t;
-
-  template <typename T>
-  static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic, int algo_cache_id,
-                     const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF);
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
-
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
-      VLOG(5) << "use cudnn_tensor_op_math";
-    } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
-    }
-#endif
-
-    algo_t algo;
-    if (!exhaustive && !deterministic) {
-#if CUDNN_VERSION >= 7001
-      using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
-      int perf_count;
-      int best_algo_idx = 0;
-      std::unique_ptr<perf_t[]> perf_results(
-          new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]);
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
-              args.handle, args.idesc.desc(), args.odesc.desc(),
-              args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS,
-              &perf_count, perf_results.get()));
-      algo = (perf_results.get())[best_algo_idx].algo;
-      workspace_size = GetWorkspaceSize(args, algo);
-      if (workspace_size > workspace_size_limit) {
-        has_got_workspace_size = false;
-        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                   "the workspace size request("
-                << workspace_size << ") exceeds the limit("
-                << workspace_size_limit << ")";
-      }
-      if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(
-            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-                args.handle, args.idesc.desc(), args.odesc.desc(),
-                args.cdesc.desc(), args.wdesc.desc(),
-                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &algo));
-      }
-#else
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-              args.handle, args.idesc.desc(), args.odesc.desc(),
-              args.cdesc.desc(), args.wdesc.desc(),
-              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &algo));
-#endif
-    } else if (deterministic) {
-      return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
-    } else {
-      AlgorithmsCache<algo_t>& algo_cache =
-          ctx.GetKernelConfig<AlgorithmsCache<algo_t>>(algo_cache_id);
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-
-      auto x_dims = framework::vectorize(args.x->dims());
-      auto w_dims = framework::vectorize(args.w->dims());
-
-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
-               << algo_cache_id << ", x_dims:" << x_dims
-               << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
-               << args.p << ", args.d" << args.d;
-
-      algo = algo_cache.GetAlgorithm(
-          x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
-            int returned_algo_count;
-            std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-            auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(
-                  platform::dynload::
-                      cudnnFindConvolutionBackwardFilterAlgorithmEx(
-                          args.handle, args.idesc.desc(), args.x->data<T>(),
-                          args.odesc.desc(), args.o->data<T>(),
-                          args.cdesc.desc(), args.wdesc.desc(),
-                          const_cast<T*>(args.w->data<T>()),
-                          kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
-                          perf_stat.data(), cudnn_workspace_ptr,
-                          workspace_size_limit));
-            };
-            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-
-            VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
-            for (int i = 0; i < returned_algo_count; ++i) {
-              const auto& stat = perf_stat[i];
-              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                      << " " << stat.memory;
-            }
-            return perf_stat[0].algo;
-          });
-    }
-    VLOG(3) << "choose algo " << algo;
-    return algo;
-  }
-
-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
-    size_t workspace_size = 0;
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-            args.handle, args.idesc.desc(), args.odesc.desc(),
-            args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size));
-    return workspace_size;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
deleted file mode 100644
index 6629a203f80ede3883860a630861f27e7edbe977..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ /dev/null
@@ -1,518 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/conv_cudnn_helper.h"
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
-#include "paddle/fluid/operators/conv_op.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-#include "paddle/fluid/platform/cudnn_workspace_helper.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DECLARE_bool(cudnn_deterministic);
-DECLARE_uint64(conv_workspace_size_limit);
-DECLARE_bool(cudnn_exhaustive_search);
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
-using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
-using DataLayout = platform::DataLayout;
-template <typename T>
-using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
-using framework::AlgorithmsCache;
-
-static inline void GetNCDHW(const framework::DDim& dims,
-                            const DataLayout& layout, int* N, int* C, int* D,
-                            int* H, int* W) {
-  *N = dims[0];
-  *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
-  int i = layout == DataLayout::kNCHW ? 0 : 1;
-  if (dims.size() == 5) {
-    *D = dims[2 - i];
-    *H = dims[3 - i];
-    *W = dims[4 - i];
-  } else {
-    *D = 1;
-    *H = dims[2 - i];
-    *W = dims[3 - i];
-  }
-}
-
-template <typename T>
-class CUDNNConvOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* filter = ctx.Input<Tensor>("Filter");
-    auto* output = ctx.Output<Tensor>("Output");
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
-
-    if (exhaustive_search && FLAGS_cudnn_deterministic) {
-      PADDLE_THROW(
-          "Cann't set exhaustive_search True and "
-          "FLAGS_cudnn_deterministic True at same time.");
-    }
-
-    const T* input_data = input->data<T>();
-    const T* filter_data = filter->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    // ------------------- cudnn descriptors ---------------------
-    ConvArgs args{input, filter, output, strides, paddings, dilations};
-    auto handle = dev_ctx.cudnn_handle();
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    auto dtype = platform::CudnnDataType<T>::type;
-    DataLayout layout = DataLayout::kNCHW;
-    if (input->dims().size() == 5) {
-      layout = DataLayout::kNCDHW;
-    }
-    auto layout_format = GetCudnnTensorFormat(layout);
-
-    args.handle = handle;
-    args.cdesc.set(dtype, paddings, strides, dilations);
-#if CUDNN_VERSION_MIN(7, 0, 1)
-    // cudnn 7 can support groups, no need to do it manually
-    // FIXME(typhoonzero): find a better way to disable groups
-    // rather than setting it to 1.
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
-        args.cdesc.desc(), groups));
-    groups = 1;
-#endif
-    args.idesc.set(*input, groups);
-    args.wdesc.set(*filter, layout_format, groups);
-    args.odesc.set(*output, groups);
-    int i_n, i_c, i_d, i_h, i_w;
-    GetNCDHW(input->dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w);
-    int o_n, o_c, o_d, o_h, o_w;
-    GetNCDHW(output->dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, &o_h, &o_w);
-
-    int group_offset_in = i_c / groups * i_h * i_w * i_d;
-    int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = filter->numel() / groups;
-    // ------------------- cudnn conv workspace ---------------------
-    size_t workspace_size = 0;  // final workspace to allocate.
-    // ------------------- cudnn conv algorithm ---------------------
-    cudnnConvolutionFwdAlgo_t algo{};
-
-    using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-    algo = search::Find<T>(args, exhaustive_search, false, 0, ctx);
-    workspace_size = search::GetWorkspaceSize(args, algo);
-
-    // ------------------- cudnn conv forward ---------------------
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    for (int i = 0; i < groups; i++) {
-      workspace_handle.RunFunc(
-          [&](void* workspace_ptr) {
-            CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                handle, &alpha, args.idesc.desc(),
-                input_data + i * group_offset_in, args.wdesc.desc(),
-                filter_data + i * group_offset_filter, args.cdesc.desc(), algo,
-                workspace_ptr, workspace_size, &beta, args.odesc.desc(),
-                output_data + i * group_offset_out));
-          },
-          workspace_size);
-    }
-  }
-};
-
-template <typename T>
-class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto input = ctx.Input<Tensor>("Input");
-    auto filter = ctx.Input<Tensor>("Filter");
-    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-
-    const T* input_data = input->data<T>();
-    const T* output_grad_data = output_grad->data<T>();
-    const T* filter_data = filter->data<T>();
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
-    bool deterministic = FLAGS_cudnn_deterministic;
-    if (exhaustive_search && deterministic) {
-      PADDLE_THROW(
-          "Can't set exhaustive_search True and "
-          "FLAGS_cudnn_deterministic True at same time.");
-    }
-
-    T* filter_grad_data = nullptr;
-    T* input_grad_data = nullptr;
-    ConvArgs args1{input_grad, filter,   output_grad,
-                   strides,    paddings, dilations};
-    ConvArgs args2{input,   filter_grad, output_grad,
-                   strides, paddings,    dilations};
-    // conv_cudnn_helper.h
-    auto handle = dev_ctx.cudnn_handle();
-    auto dtype = platform::CudnnDataType<T>::type;
-    DataLayout layout = DataLayout::kNCHW;
-    if (input->dims().size() == 5) {
-      layout = DataLayout::kNCDHW;
-    }
-    auto layout_tensor = GetCudnnTensorFormat(layout);
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-
-    int i_n, i_c, i_d, i_h, i_w;
-    GetNCDHW(input->dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w);
-    int o_n, o_c, o_d, o_h, o_w;
-    GetNCDHW(output_grad->dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, &o_h,
-             &o_w);
-
-    int group_offset_in = i_c / groups * i_h * i_w * i_d;
-    int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = filter->numel() / groups;
-    // ------------------- cudnn backward algorithm ---------------------
-    cudnnConvolutionBwdDataAlgo_t data_algo =
-        static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
-    cudnnConvolutionBwdFilterAlgo_t filter_algo =
-        static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
-    size_t workspace_size = 0;
-    int iwo_groups, c_groups;
-
-#if CUDNN_VERSION_MIN(7, 0, 1)
-    iwo_groups = 1;
-    c_groups = groups;
-    groups = 1;
-#endif
-
-    if (input_grad) {
-      // ------------------- cudnn descriptors ---------------------
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      args1.handle = handle;
-      args1.idesc.set(*input_grad, iwo_groups);
-      args1.wdesc.set(*filter, layout_tensor, iwo_groups);
-      args1.odesc.set(*output_grad, iwo_groups);
-      args1.cdesc.set(dtype, paddings, strides, dilations, c_groups);
-
-      using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-      data_algo =
-          search1::Find<T>(args1, exhaustive_search, deterministic, 0, ctx);
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
-    }
-
-    if (filter_grad) {
-      // ------------------- cudnn descriptors ---------------------
-      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      args2.handle = handle;
-      args2.idesc.set(*input, iwo_groups);
-      args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups);
-      args2.odesc.set(*output_grad, iwo_groups);
-      args2.cdesc.set(dtype, paddings, strides, dilations, c_groups);
-
-      using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-      filter_algo =
-          search2::Find<T>(args2, exhaustive_search, deterministic, 1, ctx);
-      workspace_size = std::max(workspace_size,
-                                search2::GetWorkspaceSize(args2, filter_algo));
-    }
-
-    // ------------------- cudnn conv backward data ---------------------
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    if (input_grad) {
-      // Because beta is zero, it is unnecessary to reset input_grad.
-      for (int i = 0; i < groups; i++) {
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-                  handle, &alpha, args1.wdesc.desc(),
-                  filter_data + i * group_offset_filter, args1.odesc.desc(),
-                  output_grad_data + i * group_offset_out, args1.cdesc.desc(),
-                  data_algo, cudnn_workspace_ptr, workspace_size, &beta,
-                  args1.idesc.desc(), input_grad_data + i * group_offset_in));
-            },
-            workspace_size);
-      }
-    }
-    // ------------------- cudnn conv backward filter ---------------------
-    if (filter_grad) {
-      // Because beta is zero, it is unnecessary to reset filter_grad.
-      for (int i = 0; i < groups; i++) {
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-                  handle, &alpha, args2.idesc.desc(),
-                  input_data + i * group_offset_in, args2.odesc.desc(),
-                  output_grad_data + i * group_offset_out, args2.cdesc.desc(),
-                  filter_algo, cudnn_workspace_ptr, workspace_size, &beta,
-                  args2.wdesc.desc(),
-                  filter_grad_data + i * group_offset_filter));
-            },
-            workspace_size);
-      }
-    }
-  }
-};
-
-/*
- * Inputs:  I, W, dO, ddI, ddW
- * Outputs: ddO, dW, dI
- * ddo = conv(ddI, W) + conv(I, ddW)
- * dW = conv_bp_filter(ddI, dO)
- * dI = conv_bp_data(ddW, dO)
- */
-template <typename T>
-class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto X = ctx.Input<Tensor>("Input");
-    auto W = ctx.Input<Tensor>("Filter");
-    auto dO = ctx.Input<Tensor>("DOutput");
-    auto ddX = ctx.Input<Tensor>("DDInput");
-    auto ddW = ctx.Input<Tensor>("DDFilter");
-
-    auto ddO = ctx.Output<Tensor>("DDOutput");
-    auto dW = ctx.Output<Tensor>("DFilter");
-    auto dX = ctx.Output<Tensor>("DInput");
-
-    const T* x = X->data<T>();
-    const T* dy = dO->data<T>();
-    const T* w = W->data<T>();
-
-    const T* ddx = nullptr;
-    const T* ddw = nullptr;
-    T *dw, *dx, *ddy;
-    dw = dx = ddy = nullptr;
-
-    const std::vector<int>& strides = ctx.Attr<std::vector<int>>("strides");
-    const std::vector<int>& paddings = ctx.Attr<std::vector<int>>("paddings");
-    const std::vector<int>& dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
-    bool deterministic = FLAGS_cudnn_deterministic;
-    if (exhaustive_search && deterministic) {
-      PADDLE_THROW(
-          "Can't set exhaustive_search True and "
-          "FLAGS_cudnn_deterministic True at same time.");
-    }
-
-    int iwo_group = groups;
-    int c_group = 1;
-#if CUDNN_VERSION_MIN(7, 0, 1)
-    iwo_group = 1;
-    c_group = groups;
-#endif
-    auto dtype = platform::CudnnDataType<T>::type;
-
-    auto handle = dev_ctx.cudnn_handle();
-
-    ConvArgs args1{ddX, W, ddO, strides, paddings, dilations};
-    ConvArgs args2{X, ddW, ddO, strides, paddings, dilations};
-    ConvArgs args3{ddX, dW, dO, strides, paddings, dilations};
-    ConvArgs args4{dX, ddW, dO, strides, paddings, dilations};
-
-    cudnnConvolutionFwdAlgo_t fwd_algo1 =
-        static_cast<cudnnConvolutionFwdAlgo_t>(0);
-    cudnnConvolutionFwdAlgo_t fwd_algo2 =
-        static_cast<cudnnConvolutionFwdAlgo_t>(0);
-    cudnnConvolutionBwdDataAlgo_t data_algo =
-        static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
-    cudnnConvolutionBwdFilterAlgo_t filter_algo =
-        static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
-
-    auto layout = GetCudnnTensorFormat(DataLayout::kNCHW);
-
-    // ddo = conv(ddI, W) + conv(I, ddW)
-    size_t workspace_size = 0;
-    if (ddO) {
-      ddy = ddO->mutable_data<T>(ctx.GetPlace());
-      args1.handle = handle;
-      args1.idesc.set(*ddX, iwo_group);
-      args1.wdesc.set(*W, layout, iwo_group);
-      args1.odesc.set(*ddO, iwo_group);
-      args1.cdesc.set(dtype, paddings, strides, dilations, c_group);
-
-      using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-      fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, 0, ctx);
-      workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
-
-      if (ddW) {
-        ddw = ddW->data<T>();
-        args2.handle = handle;
-        args2.idesc.set(*X, iwo_group);
-        args2.wdesc.set(*ddW, layout, iwo_group);
-        args2.odesc.set(*ddO, iwo_group);
-        args2.cdesc.set(dtype, paddings, strides, dilations, c_group);
-
-        using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-        fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, 0, ctx);
-        workspace_size = std::max(workspace_size,
-                                  search2::GetWorkspaceSize(args2, fwd_algo2));
-      }
-    }
-
-    if (dW) {
-      dw = dW->mutable_data<T>(ctx.GetPlace());
-      args3.handle = handle;
-      args3.idesc.set(*ddX, iwo_group);
-      args3.wdesc.set(*dW, layout, iwo_group);
-      args3.odesc.set(*dO, iwo_group);
-      args3.cdesc.set(dtype, paddings, strides, dilations, c_group);
-
-      using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-      filter_algo =
-          search3::Find<T>(args3, exhaustive_search, deterministic, 1, ctx);
-      workspace_size = std::max(workspace_size,
-                                search3::GetWorkspaceSize(args3, filter_algo));
-    }
-
-    if (ddW && dX) {
-      dx = dX->mutable_data<T>(ctx.GetPlace());
-      args4.handle = handle;
-      args4.idesc.set(*dX, iwo_group);
-      args4.wdesc.set(*ddW, layout, iwo_group);
-      args4.odesc.set(*dO, iwo_group);
-      args4.cdesc.set(dtype, paddings, strides, dilations, c_group);
-
-      using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-      data_algo =
-          search4::Find<T>(args4, exhaustive_search, deterministic, 2, ctx);
-      workspace_size =
-          std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
-    }
-
-    int i_n, i_c, i_d, i_h, i_w;
-    GetNCDHW(X->dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w);
-    int o_n, o_c, o_d, o_h, o_w;
-    GetNCDHW(dO->dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, &o_h, &o_w);
-
-    int group_offset_in = i_c / groups * i_h * i_w * i_d;
-    int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = W->numel() / groups;
-
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
-
-    if (ddO) {
-      ddx = ddX->data<T>();
-      for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                  handle, &alpha, args1.idesc.desc(), ddx + i * group_offset_in,
-                  args1.wdesc.desc(), w + i * group_offset_filter,
-                  args1.cdesc.desc(), fwd_algo1, workspace_ptr, workspace_size,
-                  &beta, args1.odesc.desc(), ddy + i * group_offset_out));
-            },
-            workspace_size);
-      }
-      if (ddW) {
-        for (int i = 0; i < groups; i++) {
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                    handle, &alpha, args2.idesc.desc(), x + i * group_offset_in,
-                    args2.wdesc.desc(), ddw + i * group_offset_filter,
-                    args2.cdesc.desc(), fwd_algo2, workspace_ptr,
-                    workspace_size, &alpha, args2.odesc.desc(),
-                    ddy + i * group_offset_out));
-              },
-              workspace_size);
-        }
-      }
-    }
-
-    if (dW) {
-      ddx = ddX->data<T>();
-      for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-                  handle, &alpha, args3.idesc.desc(), ddx + i * group_offset_in,
-                  args3.odesc.desc(), dy + i * group_offset_out,
-                  args3.cdesc.desc(), filter_algo, workspace_ptr,
-                  workspace_size, &beta, args3.wdesc.desc(),
-                  dw + i * group_offset_filter));
-            },
-            workspace_size);
-      }
-    }
-
-    if (dX && ddW) {
-      ddw = ddW->data<T>();
-      for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-                  handle, &alpha, args4.wdesc.desc(),
-                  ddw + i * group_offset_filter, args4.odesc.desc(),
-                  dy + i * group_offset_out, args4.cdesc.desc(), data_algo,
-                  workspace_ptr, workspace_size, &beta, args4.idesc.desc(),
-                  dx + i * group_offset_in));
-            },
-            workspace_size);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace plat = paddle::platform;
-REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<double>,
-                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(
-    conv2d_grad_grad, CUDNN, plat::CUDAPlace,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
-
-REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<double>,
-                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(
-    conv3d_grad_grad, CUDNN, plat::CUDAPlace,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h
deleted file mode 100644
index de883580dc02619a224863207ea5aca5674e1be4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_cudnn_op_cache.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-DECLARE_uint64(conv_workspace_size_limit);
-DECLARE_bool(cudnn_exhaustive_search);
-DECLARE_int64(cudnn_exhaustive_search_times);
-
-namespace paddle {
-namespace operators {
-
-#if CUDNN_VERSION_MIN(6, 0, 5)
-static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
-static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
-    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
-static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
-    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
-#else
-// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc.
-static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
-static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
-static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
-#endif
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/conv_fusion_op.cc b/paddle/fluid/operators/conv_fusion_op.cc
deleted file mode 100644
index 23b8087e781da30ed7b66ba651f8071ecb7aaf50..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_fusion_op.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/conv_op.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-// This fused conv follows the equation:
-//   y = act ( alpha1 * conv(x) + alpha2 * z + bias ).
-//   here, y is Output,
-//         x is Input,
-//         z is ResidualData,
-//         bias is Bias
-// When `split_channels` is set, y will be splitted into multiple outputs,
-// each output has split_channels[i] number of channels.
-class Conv2DFusionOpMaker : public Conv2DOpMaker {
- protected:
-  void Apply() override {
-    AddAttr<std::string>(
-        "activation",
-        "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
-        "'relux' , 'tanh', 'band_pass'")
-        .SetDefault("relu");
-    AddAttr<std::vector<int>>(
-        "split_channels",
-        "When `split_channels` are set, there will be multiple outputs, the "
-        "output size is equal to the number of `split_channels`.")
-        .SetDefault({});
-    AddOutput("Outputs",
-              "This Outputs is used when setting `split_channels`."
-              "Usually used to fuse conv with same input and same filter size, "
-              "padding, stride, dilation size.")
-        .AsDuplicable()
-        .AsDispensable();
-    AddInput("AlgoCache",
-             "The cache of convolution algorithm, a RAW type variable.")
-        .AsDispensable();
-    AddAttr<int>(
-        "search_times",
-        "The number of exhaustive search times for convolution algorithm.")
-        .SetDefault(-1);
-  }
-};
-
-class Conv2DFusionOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of ConvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                   "Input(Filter) of ConvOp should not be null.");
-    auto in_dims = ctx->GetInputDim("Input");
-    auto filter_dims = ctx->GetInputDim("Filter");
-
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    std::vector<int> dilations =
-        ctx->Attrs().Get<std::vector<int>>("dilations");
-
-    std::vector<int64_t> oshape({in_dims[0], filter_dims[0]});
-    for (size_t i = 0; i < strides.size(); ++i) {
-      oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                      dilations[i], paddings[i], strides[i]));
-    }
-    PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                   "Output(Output) of ConvOp should not be null.");
-    ctx->SetOutputDim("Output", framework::make_ddim(oshape));
-    std::vector<int> channels =
-        ctx->Attrs().Get<std::vector<int>>("split_channels");
-    if (channels.size()) {
-      PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
-                     "Output(Outputs) of ConvOp should not be null.");
-      std::vector<framework::DDim> oshapes;
-      oshapes.reserve(channels.size());
-      for (size_t i = 0; i < channels.size(); ++i) {
-        oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]});
-      }
-      ctx->SetOutputsDim("Outputs", oshapes);
-    }
-  }
-};
-
-// TODO(qingqing): add gradient operator for conv2d_fusion
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker,
-                  ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
deleted file mode 100644
index 566daa6608282d89a92418e2c168bdf2c09c65c1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-DECLARE_int64(cudnn_exhaustive_search_times);
-
-namespace paddle {
-namespace operators {
-
-#if CUDNN_VERSION >= 7100
-using Tensor = framework::Tensor;
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
-using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
-using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
-using DataLayout = platform::DataLayout;
-using framework::AlgorithmsCache;
-
-template <typename T>
-using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
-
-template <typename T>
-class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* filter = ctx.Input<Tensor>("Filter");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    PADDLE_ENFORCE(bias, "The bias should not be null.");
-    auto* residual = ctx.Input<Tensor>("ResidualData");
-    auto* output = ctx.Output<Tensor>("Output");
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    const std::string activation = ctx.Attr<std::string>("activation");
-    int groups = ctx.Attr<int>("groups");
-    int64_t user_workspace_size =
-        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
-
-    const T* input_data = input->data<T>();
-    const T* filter_data = filter->data<T>();
-    const T* bias_data = bias->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    const T* residual_data = residual ? residual->data<T>() : output_data;
-
-    // ------------------- cudnn descriptors ---------------------
-    ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor output_desc;
-    ScopedFilterDescriptor filter_desc;
-    ScopedTensorDescriptor bias_desc;
-    ScopedConvolutionDescriptor conv_desc;
-    ScopedActivationDescriptor act_desc;
-    DataLayout layout = DataLayout::kNCHW;
-    if (input->dims().size() == 5) {
-      layout = DataLayout::kNCDHW;
-    }
-
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
-        cudnn_conv_desc, groups));
-
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize<int>(input->dims()));
-    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize<int>(output->dims()));
-    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize<int>(filter->dims()));
-    // Now only support NCHW
-    std::vector<int> bias_dim = {1, static_cast<int>(output->dims()[1]), 1, 1};
-    cudnnTensorDescriptor_t cudnn_bias_desc =
-        bias_desc.descriptor<T>(layout, bias_dim);
-    cudnnActivationDescriptor_t cudnn_act_desc =
-        act_desc.descriptor<T>(activation);
-
-    // ------------------- cudnn conv workspace ---------------------
-    size_t workspace_size_in_bytes;  // final workspace to allocate.
-    size_t workspace_size_limit = 0;
-    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
-      int64_t max_user_size =
-          std::min(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
-                   user_workspace_size);
-      workspace_size_limit = max_user_size * 1024 * 1024;
-    }
-
-    // ------------------- cudnn conv algorithm ---------------------
-    cudnnConvolutionFwdAlgo_t algo;
-    auto handle = dev_ctx.cudnn_handle();
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
-
-    auto x_dims = framework::vectorize(input->dims());
-    auto f_dims = framework::vectorize(filter->dims());
-    if (!exhaustive_search) {
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
-      VLOG(3) << "cuDNN forward algo " << algo;
-    } else {
-      auto search_func = [&]() {
-        int returned_algo_count;
-        std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
-            fwd_perf_stat;
-        auto cudnn_find_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(
-              platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                  handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-                  filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
-                  kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                  fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
-        };
-        workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-        VLOG(3) << "Perf result: (algo: stat, time, memory)";
-        for (int i = 0; i < returned_algo_count; ++i) {
-          const auto& stat = fwd_perf_stat[i];
-          VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " "
-                  << stat.memory;
-        }
-        return fwd_perf_stat[0].algo;
-      };
-      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
-          ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
-      int search_times = ctx.Attr<int>("search_times");
-      search_times = std::max(
-          static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
-      // TODO(dangqingqing): Unify this if-else.
-      if (search_times > 0) {
-        // The searched algo will be cached by `search_times` times for
-        // different input dimension. For other dimensions, select the algo
-        // of closest area.
-        algo = algo_cache.GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
-                                       search_func);
-      } else {
-        algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings,
-                                       dilations, 0, search_func);
-      }
-      VLOG(3) << "choose algo " << algo;
-    }
-
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-        cudnn_output_desc, algo, &workspace_size_in_bytes));
-    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
-                      "workspace_size to be allocated exceeds the limit");
-
-    if ((activation == "identity") && (!residual)) {
-      // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
-      // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
-      // But test in some case, the speed is slower, change to use
-      // cudnnConvolutionForward and cudnnAddTensor
-      // ------------- cudnn conv forward and bias add ---------------------
-      ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
-            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
-      };
-      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-      CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
-          handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
-          output_data));
-    } else {
-      if (activation == "identity") {
-        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-      }
-      // ------------------- cudnn conv+bias+act forward --------------------
-      ScalingParamType<T> alpha1 = 1.0f;
-      ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-            handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-            workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-            cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
-            output_data));
-      };
-      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-    }
-    std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
-    if (channels.size()) {
-      auto outs = ctx.MultiOutput<framework::Tensor>("Outputs");
-      if (x_dims[0] == 1) {
-        // share data with Output
-        framework::Tensor t;
-        t.ShareDataWith(*output);
-        auto y_dims = output->dims();
-        t.Resize({y_dims[1], y_dims[2], y_dims[3]});
-        int s = 0;
-        for (size_t i = 0; i < channels.size(); ++i) {
-          int e = s + channels[i];
-          outs[i]->ShareDataWith(t.Slice(s, e));
-          outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]});
-          s = e;
-        }
-      } else {
-        // TODO(qingiqng): do copy when batch size large than 1
-        PADDLE_THROW("Batch size greater than 1 is Unsupported");
-      }
-    }
-  }
-};
-#endif
-
-}  // namespace operators
-}  // namespace paddle
-
-#if CUDNN_VERSION >= 7100
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel<float>,
-                        ops::CUDNNConvFusionOpKernel<double>);
-#endif
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
deleted file mode 100644
index 1cfdf7da86a5f4747e51d2a00b8237ad71dd9a03..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_op.cc
+++ /dev/null
@@ -1,717 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/conv_op.h"
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-#include "paddle/fluid/platform/cudnn_workspace_helper.h"
-
-namespace paddle {
-namespace operators {
-
-void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of ConvOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of ConvOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of ConvOp should not be null.");
-
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-
-  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-  int groups = ctx->Attrs().Get<int>("groups");
-  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
-
-  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
-                 "Conv intput should be 4-D or 5-D tensor, get %u",
-                 in_dims.size());
-
-  PADDLE_ENFORCE_EQ(
-      in_dims.size(), filter_dims.size(),
-      "Conv input dimension and filter dimension should be the same.");
-  PADDLE_ENFORCE(
-      in_dims.size() - strides.size() == 2U,
-      "Conv input dimension and strides dimension should be consistent.");
-  PADDLE_ENFORCE_EQ(
-      paddings.size(), strides.size(),
-      "Conv paddings dimension and Conv strides dimension should be the same.");
-
-  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups,
-                    "The number of input channels should be equal to filter "
-                    "channels * groups.");
-  PADDLE_ENFORCE_EQ(
-      filter_dims[0] % groups, 0,
-      "The number of output channels should be divided by groups.");
-
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    if ((!ctx->IsRuntime()) &&
-        (in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) {
-      output_shape.push_back(-1);
-    } else {
-      output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                            dilations[i], paddings[i],
-                                            strides[i]));
-    }
-  }
-  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
-  ctx->ShareLoD("Input", "Output");
-}
-
-framework::OpKernelType ConvOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  int customized_type_value =
-      framework::OpKernelType::kDefaultCustomizedTypeValue;
-  framework::LibraryType library{framework::LibraryType::kPlain};
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  auto input_data_type = ctx.Input<Tensor>("Input")->type();
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout = framework::StringToDataLayout(data_format);
-
-#ifdef PADDLE_WITH_CUDA
-  if (platform::CanCUDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kCUDNN;
-  }
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-  if (library == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kMKLDNN;
-    layout = framework::DataLayout::kMKLDNN;
-    customized_type_value =
-        (input_data_type == framework::DataTypeTrait<int8_t>::DataType() ||
-         input_data_type == framework::DataTypeTrait<uint8_t>::DataType())
-            ? kConvMKLDNNINT8
-            : kConvMKLDNNFP32;
-  }
-#endif
-
-  if (input_data_type != framework::proto::VarType::INT8 &&
-      input_data_type != framework::proto::VarType::UINT8) {
-    auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
-    PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
-                      "input and filter data type should be consistent");
-  }
-  if (input_data_type == framework::proto::VarType::FP16) {
-    PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
-                      "float16 can only be used when CUDNN is used");
-  }
-
-  auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                      library, customized_type_value);
-#ifdef PADDLE_WITH_CUDA
-  std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
-  // TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn
-  // to false. It should be fixed and then here should only create if library
-  // is kCUDNN.
-  if (configs.empty()) {
-    std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>> p(
-        new framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>());
-    configs.push_back(p);
-  }
-#endif
-  return type;
-}
-
-void Conv2DOpMaker::Make() {
-  AddAttr<bool>("is_test",
-                "(bool, default false) Set to true for inference only, false "
-                "for training. Some layers may run faster when this is true.")
-      .SetDefault(false);
-  AddInput(
-      "Input",
-      "(Tensor) The input tensor of convolution operator. "
-      "The format of input tensor is NCHW, where N is batch size, C is the "
-      "number of channels, H is the height of the feature, "
-      "and W is the width of the feature.");
-  AddInput("Filter",
-           "(Tensor) The filter tensor of convolution operator. "
-           "The format of the filter tensor is MCHW, where M is the number of "
-           "output image channels, C is the number of input image channels, "
-           "H is the height of the filter, and W is the width of the filter. "
-           "If the groups attribute is greater than 1, C equals the number of "
-           "input image channels divided by the groups.");
-  AddInput("Bias",
-           "(Tensor) Bias to be added to each output of filter application."
-           "The format of output tensor is X (one-dimensional) of size equal"
-           "to the number of output channels. Only used with MKL-DNN.")
-      .AsDispensable();
-  AddInput("ResidualData",
-           "(Tensor) Tensor with residual data "
-           "to which convolution output will be added."
-           "Used with fuse_residual_connection fusion.")
-      .AsDispensable();
-  AddOutput("Output",
-            "(Tensor) The output tensor of convolution operator. "
-            "The format of output tensor is also NCHW.");
-  AddAttr<std::vector<int>>("strides",
-                            "(vector<int> default:{1, 1}), the "
-                            "strides(h_stride, w_stride) of "
-                            "convolution operator.")
-      .SetDefault({1, 1});
-  AddAttr<std::vector<int>>("paddings",
-                            "(vector<int> default:{0, 0}), the "
-                            "paddings(h_pad, w_pad) of "
-                            "convolution operator.")
-      .SetDefault({0, 0});
-  AddAttr<int>(
-      "groups",
-      "(int default:1), the groups number of the convolution operator. "
-      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
-      "when group=2, the first half of the filters is only connected to the "
-      "first half of the input channels, while the second half of the filters "
-      "is only connected to the second half of the input channels.")
-      .SetDefault(1);
-  AddAttr<std::vector<int>>("dilations",
-                            "(vector<int> default:{1, 1}), the "
-                            "dilations(h_dilation, w_dilation) of "
-                            "convolution operator.")
-      .SetDefault({1, 1});
-  AddAttr<bool>(
-      "use_cudnn",
-      "(bool, default false) Only used in cudnn kernel, need install cudnn")
-      .SetDefault(false);
-  AddAttr<bool>("fuse_relu_before_depthwise_conv",
-                "(bool, default false) Only used in cuda depthwise kernel")
-      .SetDefault(false);
-  AddAttr<bool>("use_mkldnn",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<bool>("use_quantizer",
-                "(bool, default false) "
-                "Set to true for operators that should be quantized and use "
-                "int8 kernel. "
-                "Only used on CPU.")
-      .SetDefault(false);
-  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<bool>("fuse_brelu",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<float>("fuse_brelu_threshold",
-                 "(float, default false 6.0) Only used in mkldnn kernel")
-      .SetDefault(6.0f);
-  AddAttr<std::string>("fuse_activation",
-                       "(string, default \"\") Only used in mkldnn kernel")
-      .SetDefault("");
-  AddAttr<float>("fuse_alpha",
-                 "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
-  AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
-  AddAttr<bool>("fuse_residual_connection",
-                "(bool, default false) Only used in mkldnn kernel. Used "
-                "whenever convolution output is as an input to residual "
-                "connection.")
-      .SetDefault(false);
-  AddAttr<float>("Scale_in",
-                 "Scale_in to be used for int8 input data."
-                 "Only used with MKL-DNN INT8.")
-      .SetDefault(1.0f);
-  AddAttr<float>("Scale_out",
-                 "Scale_out to be used for int8 output data."
-                 "Only used with MKL-DNN INT8.")
-      .SetDefault(1.0f);
-  AddAttr<float>("Scale_in_eltwise",
-                 "Scale_in_eltwise to be used for int8 eltwise input data."
-                 "Only used with MKL-DNN INT8.")
-      .SetDefault(1.0f);
-  AddAttr<std::vector<float>>("Scale_weights",
-                              "Scale_weights to be used for int8 weights data."
-                              "Only used with MKL-DNN INT8.")
-      .SetDefault({1.0f});
-  AddAttr<bool>("force_fp32_output",
-                "(bool, default false) Force INT8 kernel output FP32, only "
-                "used in MKL-DNN INT8")
-      .SetDefault(false);
-  AddAttr<std::string>(
-      "data_format",
-      "(string, default NCHW) Only used in "
-      "An optional string from: \"NHWC\", \"NCHW\". "
-      "Defaults to \"NHWC\". Specify the data format of the output data, "
-      "the input will be transformed automatically. ")
-      .SetDefault("AnyLayout");
-  // TODO(dzhwinter): need to registered layout transform function
-  AddAttr<int>("workspace_size_MB",
-               "Only used in cudnn kernel. Need set use_cudnn to true."
-               "workspace size for cudnn, in MB, "
-               "workspace is a section of GPU memory which will be "
-               "allocated/freed each time the operator runs, larger "
-               "workspace size can increase performance but also requires "
-               "better hardware. This size should be chosen carefully.")
-      .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB);
-  AddAttr<bool>("exhaustive_search",
-                "(bool, default false) cuDNN has many algorithm to calculation "
-                "convolution, whether enable exhaustive search "
-                "for cuDNN convolution or not, default is False.")
-      .SetDefault(false);
-  AddComment(R"DOC(
-Convolution Operator.
-
-The convolution operation calculates the output based on the input, filter
-and strides, paddings, dilations, groups parameters. The size of each dimension of the
-parameters is checked in the infer-shape.
-Input(Input) and Output(Output) are in NCHW format. Where N is batch
-size, C is the number of channels, H is the height of the feature, and W is
-the width of the feature.
-Filters(Input) is MCHW format. Where M is the number of output image channels, C is
-the number of input image channels, H is the height of the filter, and W
-is the width of the filter.
-Parameters(strides, paddings, dilations) are two elements. These two elements represent
-height and width, respectively.
-The input(X) size and output(Out) size may be different.
-
-Example:
-  Input:
-       Input shape: $(N, C_{in}, H_{in}, W_{in})$
-       Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
-  Output:
-       Output shape: $(N, C_{out}, H_{out}, W_{out})$
-  Where
-$$
-       H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
-       W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
-$$
-)DOC");
-  Apply();
-}
-
-void Conv3DOpMaker::Make() {
-  AddAttr<bool>("is_test",
-                "(bool, default false) Set to true for inference only, false "
-                "for training. Some layers may run faster when this is true.")
-      .SetDefault(false);
-  AddInput(
-      "Input",
-      "(Tensor) The input tensor of convolution operator. "
-      "The format of input tensor is NCDHW. Where N is batch size, C is the "
-      "number of channels, D is the depth of the feature, H is the height of "
-      "the feature, "
-      "and W is the width of the feature.");
-  AddInput("Filter",
-           "(Tensor) The filter tensor of convolution operator. "
-           "The format of the filter tensor is MCDHW, where M is the number of "
-           "output image channels, C is the number of input image channels, "
-           "D is the depth of the filter, H is the height of the filter, and W "
-           "is the width of the filter."
-           "If the groups attribute is greater than 1, C equals the number of "
-           "input image channels divided by the groups.");
-  AddInput("ResidualData",
-           "(Tensor) Tensor with residual data "
-           "to which convolution output will be added."
-           "Used with fuse_residual_connection fusion.")
-      .AsDispensable();
-  AddOutput("Output",
-            "(Tensor) The output tensor of convolution operator."
-            "The format of output tensor is also NCDHW.");
-  AddAttr<std::vector<int>>("strides",
-                            "(vector<int>, default:{1, 1, 1}), the "
-                            "strides(d_stride, h_stride, w_stride) of "
-                            "convolution operator.")
-      .SetDefault({1, 1, 1});
-  AddAttr<std::vector<int>>("paddings",
-                            "(vector<int>, default:{0, 0, 0}), the "
-                            "paddings(d_pad, h_pad, w_pad) of convolution "
-                            "operator.")
-      .SetDefault({0, 0, 0});
-  AddAttr<int>(
-      "groups",
-      "(int default:1), the groups number of the convolution operator. "
-      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
-      "when group=2, the first half of the filters is only connected to the "
-      "first half of the input channels, while the second half of the filters "
-      "is only connected to the second half of the input channels.")
-      .SetDefault(1);
-  AddAttr<std::vector<int>>("dilations",
-                            "(vector<int> default:{1, 1, 1}), the "
-                            "dilations(d_dilation, h_dilation, w_dilation) of "
-                            "convolution operator.")
-      .SetDefault({1, 1, 1});
-  AddAttr<bool>(
-      "use_cudnn",
-      "(bool, default false) Only used in cudnn kernel, need install cudnn")
-      .SetDefault(false);
-  AddAttr<bool>("use_mkldnn",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<std::string>("fuse_activation",
-                       "(string, default \"\") Only used in mkldnn kernel")
-      .SetDefault("");
-  AddAttr<float>("fuse_alpha",
-                 "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
-  AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
-  AddAttr<bool>("fuse_residual_connection",
-                "(bool, default false) Only used in mkldnn kernel. Used "
-                "whenever convolution output is as an input to residual "
-                "connection.")
-      .SetDefault(false);
-  AddAttr<std::string>(
-      "data_format",
-      "(string, default NCHW) Only used in "
-      "An optional string from: \"NHWC\", \"NCHW\". "
-      "Defaults to \"NHWC\". Specify the data format of the output data, "
-      "the input will be transformed automatically. ")
-      .SetDefault("AnyLayout");
-  AddAttr<bool>("force_fp32_output",
-                "(bool, default false) Only used in mkldnn INT8 kernel")
-      .SetDefault(false);
-  // TODO(dzhwinter): need to registered layout transform function
-  AddAttr<int>("workspace_size_MB",
-               "Only used in cudnn kernel. workspace size for cudnn, in MB, "
-               "workspace is a section of GPU memory which will be "
-               "allocated/freed each time the operator runs, larger "
-               "workspace size can increase performance but also requires "
-               "better hardware. This size should be chosen carefully.")
-      .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB);
-  AddAttr<bool>("exhaustive_search",
-                "(bool, default false) cuDNN has many algorithm to calculation "
-                "convolution, whether enable exhaustive search "
-                "for cuDNN convolution or not, default is False.")
-      .SetDefault(false);
-  AddComment(R"DOC(
-Convolution3D Operator.
-
-The convolution operation calculates the output based on the input, filter
-and strides, paddings, dilations, groups parameters. The size of each dimension of the
-parameters is checked in the infer-shape.
-Input(Input) and output(Output) are in NCDHW format, where N is batch
-size, C is the number of channels,D is the depth of the feature, H is the height of
-the feature, and W is the width of the feature.
-Filters(Input) is MCDHW format, where M is the number of output image channels,
-C is the number of input image channels, D is the depth of the filter,
-H is the height of the filter, and W is the width of the filter.
-Parameters(strides, paddings, dilations) are three elements. These three elements
-represent depth, height and width, respectively.
-The input(X) size and output(Out) size may be different.
-
-Example:
-  Input:
-       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
-       Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$
-  Output:
-       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
-  Where
-  $$
-       D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\
-       H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\
-       W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
-  $$
-)DOC");
-  Apply();
-}
-
-void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  if (ctx->HasOutput(framework::GradVarName("Input"))) {
-    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-  }
-  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-  }
-}
-
-framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  int customized_type_value =
-      framework::OpKernelType::kDefaultCustomizedTypeValue;
-  framework::LibraryType library_{framework::LibraryType::kPlain};
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-
-#ifdef PADDLE_WITH_CUDA
-  if (platform::CanCUDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kCUDNN;
-  }
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-  if (library_ == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kMKLDNN;
-    layout_ = framework::DataLayout::kMKLDNN;
-    customized_type_value = kConvMKLDNNFP32;
-  }
-#endif
-
-  auto type = framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                      ctx.GetPlace(), layout_, library_,
-                                      customized_type_value);
-#ifdef PADDLE_WITH_CUDA
-  if (library_ == framework::LibraryType::kCUDNN) {
-    std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
-    if (configs.empty()) {
-      std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>
-          p(new framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>());
-      configs.push_back(p);
-
-      std::shared_ptr<
-          framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>
-          p2(new framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>());
-      configs.push_back(p2);
-    }
-  }
-#endif
-  return type;
-}
-
-class Conv2DGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType(this->ForwardOpType() + "_grad");
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetInput("Bias", Input("Bias"));
-    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
-
-    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
-    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-    op->SetAttrMap(Attrs());
-
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-class Conv3DGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType(this->ForwardOpType() + "_grad");
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
-
-    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
-
-    if (ForwardOp().Inputs().count("ResidualData") != 0) {
-      op->SetInput("ResidualData", Input("ResidualData"));
-    }
-
-    op->SetAttrMap(Attrs());
-
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-/*
- * Inputs:  I, W, dO, ddI, ddW
- * Outputs: ddO, dW, dI
- */
-class Conv2DDoubleGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType(this->ForwardOpType() + "_grad");
-    // I, W, dO, ddI, ddW
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetInput("DOutput", Input(framework::GradVarName("Output")));
-    op->SetInput("DDInput", OutputGrad(framework::GradVarName("Input")));
-    op->SetInput("DDFilter", OutputGrad(framework::GradVarName("Filter")));
-
-    // ddO, dI, dW
-    // Unlike grad op, double grad op does not use name@GRAD@GRAD
-    // as key of ops' inputs and outputs.
-    auto ddx = OutputGrad(framework::GradVarName("Input"));
-    auto ddw = OutputGrad(framework::GradVarName("Filter"));
-    std::vector<std::string> empty_str = {};
-
-    op->SetOutput(
-        "DDOutput",
-        ddx.empty() ? empty_str : InputGrad(framework::GradVarName("Output")));
-    op->SetOutput("DFilter", ddx.empty() ? empty_str : InputGrad("Filter"));
-    op->SetOutput("DInput", ddw.empty() ? empty_str : InputGrad("Input"));
-
-    op->SetAttrMap(Attrs());
-
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-/*
- * Inputs:  I, W, dO, ddI, ddW
- * Outputs: ddO, dW, dI
- */
-class Conv3DDoubleGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType(this->ForwardOpType() + "_grad");
-    // I, W, dO, ddI, ddW
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetInput("DOutput", Input(framework::GradVarName("Output")));
-    op->SetInput("DDInput", OutputGrad(framework::GradVarName("Input")));
-    op->SetInput("DDFilter", OutputGrad(framework::GradVarName("Filter")));
-
-    auto ddx = OutputGrad(framework::GradVarName("Input"));
-    auto ddw = OutputGrad(framework::GradVarName("Filter"));
-    std::vector<std::string> empty_str = {};
-
-    op->SetOutput(
-        "DDOutput",
-        ddx.empty() ? empty_str : InputGrad(framework::GradVarName("Output")));
-    op->SetOutput("DFilter", ddx.empty() ? empty_str : InputGrad("Filter"));
-    op->SetOutput("DInput", ddw.empty() ? empty_str : InputGrad("Input"));
-
-    op->SetAttrMap(Attrs());
-
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-void ConvOpDoubleGrad::InferShape(framework::InferShapeContext* ctx) const {
-  auto x_dims = ctx->GetInputDim("Input");
-  auto w_dims = ctx->GetInputDim("Filter");
-  auto do_dims = ctx->GetInputDim("DOutput");
-
-  if (ctx->HasOutput("DDOutput") && ctx->HasInput("DDInput")) {
-    ctx->SetOutputDim("DDOutput", do_dims);
-  }
-  if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) {
-    ctx->SetOutputDim("DFilter", w_dims);
-  }
-  if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) {
-    ctx->SetOutputDim("DInput", x_dims);
-  }
-}
-
-framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  int customized_type_value =
-      framework::OpKernelType::kDefaultCustomizedTypeValue;
-  framework::LibraryType library_{framework::LibraryType::kPlain};
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-
-#ifdef PADDLE_WITH_CUDA
-  if (platform::CanCUDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kCUDNN;
-  }
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-  if (library_ == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kMKLDNN;
-    layout_ = framework::DataLayout::kMKLDNN;
-    customized_type_value = kConvMKLDNNFP32;
-  }
-#endif
-  auto type = framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                      ctx.GetPlace(), layout_, library_,
-                                      customized_type_value);
-#ifdef PADDLE_WITH_CUDA
-  if (library_ == framework::LibraryType::kCUDNN) {
-    std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
-    if (configs.empty()) {
-      std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>> p0(
-          new framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>());
-      configs.push_back(p0);
-
-      std::shared_ptr<
-          framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>
-          p1(new framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>());
-      configs.push_back(p1);
-
-      std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>
-          p2(new framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>());
-      configs.push_back(p2);
-    }
-  }
-#endif
-  return type;
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-                  ops::ConvOpInferVarType, ops::Conv2DGradMaker);
-REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad, ops::Conv2DDoubleGradMaker);
-REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad);
-
-// depthwise convolution op
-REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-                  ops::ConvOpInferVarType, ops::Conv2DGradMaker);
-REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
-
-REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
-                  ops::ConvOpInferVarType, ops::Conv3DGradMaker);
-REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker);
-REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad);
-
-// depthwise conv kernel
-// TODO(xingzhaolong): neon kernel for mobile
-REGISTER_OP_CPU_KERNEL(
-    depthwise_conv2d,
-    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    depthwise_conv2d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_grad_grad,
-    ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    conv3d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv3d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv3d_grad_grad,
-    ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc
deleted file mode 100644
index d07593f5c02e9129c1f333667baccb0531bc31f9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_op.cu.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/conv_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv2d,
-    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv2d_grad,
-    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    conv2d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    conv3d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    conv3d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
deleted file mode 100644
index aa621529b525838e00b36234bf2d9c2bea38a38c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_op.h
+++ /dev/null
@@ -1,698 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/depthwise_conv.h"
-#include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/operators/math/vol2col.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-constexpr int kConvMKLDNNFP32 = 1;
-constexpr int kConvMKLDNNINT8 = 2;
-constexpr int MaxKeyLength = 256;
-
-// Base convolution operator definations for other conv
-// like operators to reuse the implementation.
-inline int ConvOutputSize(int input_size, int filter_size, int dilation,
-                          int padding, int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
-  PADDLE_ENFORCE(
-      output_size > 0,
-      "Due to the settings of padding(%d), filter_size(%d), dilation(%d) and "
-      "stride(%d), the output size is less than 0, please check "
-      "again. Input_size:%d",
-      padding, filter_size, dilation, stride, input_size);
-
-  return output_size;
-}
-inline bool IsExpand(const std::vector<int64_t>& filter_dim,
-                     const std::vector<int>& strides,
-                     const std::vector<int>& paddings,
-                     const std::vector<int>& dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
-
-// Define Op classes in .h file so that other conv
-// operator implementations can reuse the code.
-class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final;
-
- protected:
-  virtual void Apply() {}
-};
-
-class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final;
-
- protected:
-  virtual void Apply() {}
-};
-
-class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{
-        {"Input", /*->*/ "Output"}};
-  }
-};
-
-class ConvOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class ConvOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class ConvOpDoubleGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-template <typename DeviceContext, typename T>
-class GemmConvKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    // The filter will be reshaped in the calculations,
-    // so here use an assignment operation,
-    // that avoids modifying the variable in the Scope.
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-    output->mutable_data<T>(context.GetPlace());
-
-    int groups = context.Attr<int>("groups");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
-    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-
-    // use col_shape in the im2col calculation
-    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
-    // o_h, o_w}
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = input->dims()[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-    // use col_matrix_shape in the gemm calculation
-    // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
-    // o_h * o_w)
-    framework::DDim col_matrix_shape =
-        framework::flatten_to_2d(col_shape, data_dim + 1);
-
-    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-    Tensor col;
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix;
-    if (is_expand) {
-      col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-    }
-
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
-
-    framework::DDim output_matrix_shape = {
-        output->dims()[1],
-        output->numel() / (output->dims()[0] * output->dims()[1])};
-
-    // convolution operator: im2col(or vol2col) + gemm
-    int in_step = static_cast<int>(input->dims()[1]) / groups;
-    int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-    math::Vol2ColFunctor<DeviceContext, T> vol2col;
-    math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-      for (int g = 0; g < groups; g++) {
-        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-        if (!is_expand) {
-          col.ShareDataWith(in_slice);
-          col_matrix.ShareDataWith(col);
-          col_matrix.Resize(col_matrix_shape);
-        } else if (data_dim == 2U) {
-          // im2col
-          im2col(dev_ctx, in_slice, dilations, strides,
-                 std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                  paddings[1]},
-                 &col);
-        } else if (data_dim == 3U) {
-          // vol2col
-          vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
-        }
-
-        // gemm
-        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice,
-                    T(0.0));
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GemmConvGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-    // The filter and filter_grad will be reshaped in the calculations,
-    // so here use an assignment operation,
-    // that avoids modifying the variable in the Scope.
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    if (!input_grad && !filter_grad) return;
-
-    int groups = context.Attr<int>("groups");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
-    std::vector<int64_t> output_shape_vec(
-        framework::vectorize(output_grad->dims()));
-
-    // use col_shape in the im2col calculation
-    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
-    // o_h, o_w}
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = input->dims()[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-    // use col_matrix_shape in the gemm calculation
-    // size: (i_c/g * k_h * k_w, o_h * o_w)
-    // or
-    // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
-    framework::DDim col_matrix_shape =
-        framework::flatten_to_2d(col_shape, data_dim + 1);
-
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
-
-    framework::DDim output_matrix_shape = {
-        output_grad->dims()[1],
-        output_grad->numel() /
-            (output_grad->dims()[0] * output_grad->dims()[1])};
-
-    // convolution backward input operator:  gemm + col2im(or col2vol)
-    // convolution backward weight operator: im2col(or vol2col) + gemm
-    int in_step = static_cast<int>(input->dims()[1]) / groups;
-    int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
-
-    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-    Tensor col;
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix;
-    if (is_expand) {
-      col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-    }
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(context.GetPlace());
-
-      // if is_expand is false, the operation of set_zero is unnecessary,
-      // because math::matmul will reset input_grad.
-      if (is_expand) {
-        set_zero(dev_ctx, input_grad, static_cast<T>(0));
-      }
-      math::Col2VolFunctor<DeviceContext, T> col2vol;
-      math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
-
-      for (int i = 0; i < batch_size; i++) {
-        Tensor out_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // gemm
-          Tensor out_grad_slice =
-              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-
-          Tensor in_grad_slice =
-              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-
-          if (!is_expand) {
-            col_matrix.ShareDataWith(in_grad_slice);
-            col_matrix.Resize(col_matrix_shape);
-          }
-          blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0),
-                      &col_matrix, T(0.0));
-
-          if (is_expand && data_dim == 2U) {
-            col2im(dev_ctx, col, dilations, strides,
-                   std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                    paddings[1]},
-                   &in_grad_slice);
-          } else if (is_expand && data_dim == 3U) {
-            col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
-          }
-        }
-      }
-    }
-
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(context.GetPlace());
-      Tensor filter_grad_ = *filter_grad;
-      filter_grad_.Resize(filter_matrix_shape);
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-      math::Vol2ColFunctor<DeviceContext, T> vol2col;
-      for (int i = 0; i < batch_size; i++) {
-        Tensor out_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // im2col
-          Tensor out_grad_slice =
-              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-          if (!is_expand) {
-            col.ShareDataWith(in_slice);
-            col_matrix.ShareDataWith(col);
-            col_matrix.Resize(col_matrix_shape);
-          } else if (data_dim == 2U) {
-            im2col(dev_ctx, in_slice, dilations, strides,
-                   std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                    paddings[1]},
-                   &col);
-          } else if (data_dim == 3U) {
-            vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
-          }
-
-          // gemm
-          Tensor filter_grad_slice =
-              filter_grad_.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0),
-                      &filter_grad_slice, T(1.0));
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GemmConvDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      "It must use CPUPlace.");
-    const Tensor* X = ctx.Input<Tensor>("Input");
-    const Tensor* dY = ctx.Input<Tensor>("DOutput");
-    const Tensor* ddX = ctx.Input<Tensor>("DDInput");
-    const Tensor* ddW_in = ctx.Input<Tensor>("DDFilter");
-
-    Tensor* ddY = ctx.Output<Tensor>("DDOutput");
-    Tensor* dW = ctx.Output<Tensor>("DFilter");
-    Tensor* dX = ctx.Output<Tensor>("DInput");
-    Tensor W = detail::Ref(ctx.Input<Tensor>("Filter"),
-                           "Cannot find input Filter(%s) in scope)",
-                           ctx.Inputs("Filter")[0]);
-
-    if (!ddY && !dW && !dX) return;
-    int groups = ctx.Attr<int>("groups");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    const int batch_size = static_cast<int>(X->dims()[0]);
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(W.dims()));
-    std::vector<int64_t> output_shape_vec(framework::vectorize(dY->dims()));
-
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    // col_shape [in_channel/group, kh, kw, oh, ow]
-    col_shape_vec[0] = X->dims()[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-    // col_matrix_shape [in_channel/group * kh * kw, oh * ow]
-    framework::DDim col_matrix_shape =
-        framework::flatten_to_2d(col_shape, data_dim + 1);
-    // input_shape [Cin, H, W]
-    framework::DDim input_shape =
-        framework::slice_ddim(X->dims(), 1, X->dims().size());
-    // filter_matrix_shape [Cout, Cin * kh * kw]
-    framework::DDim filter_matrix_shape = {W.dims()[0],
-                                           W.numel() / W.dims()[0]};
-
-    W.Resize(filter_matrix_shape);
-    framework::DDim output_matrix_shape = {
-        dY->dims()[1], dY->numel() / (dY->dims()[0] * dY->dims()[1])};
-    int in_step = static_cast<int>(X->dims()[1]) / groups;
-    int out_step = static_cast<int>(dY->dims()[1]) / groups;
-
-    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-    Tensor col;
-    Tensor col_matrix;
-    if (is_expand) {
-      col = ctx.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-    }
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-
-    // dx convolution double grad:  gemm + col2im(col2vol)
-    // dx = ddw * dy  ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
-    // oH, oW)
-    if (dX && ddW_in) {
-      Tensor ddW;
-      ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
-
-      dX->mutable_data<T>(ctx.GetPlace());
-      // if is_expand is false, the operation of set_zero is unnecessary
-      // because math::matmul will reset dx
-      if (is_expand) {
-        set_zero(dev_ctx, dX, static_cast<T>(0));
-      }
-      math::Col2VolFunctor<DeviceContext, T> col2vol;
-      math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
-
-      for (int i = 0; i < batch_size; i++) {
-        Tensor dy_batch = dY->Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor dx_batch = dX->Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // gemm
-          Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
-          Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
-          if (!is_expand) {
-            col_matrix.ShareDataWith(dx_slice);
-            col_matrix.Resize(col_matrix_shape);
-          }
-          blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix,
-                      T(0.0));
-
-          if (is_expand && data_dim == 2U) {
-            col2im(dev_ctx, col, dilations, strides,
-                   std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                    paddings[1]},
-                   &dx_slice);
-          } else if (is_expand && data_dim == 3U) {
-            col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
-          }
-        }
-      }
-    }
-
-    // dw = ddx * dy  ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
-    // oH, oW)
-    // dw convolution double grad:  im2col(vol2col) + gemm
-    if (dW) {
-      dW->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, dW, static_cast<T>(0));
-      Tensor dW_arr = *dW;
-      dW_arr.Resize(filter_matrix_shape);
-      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-      math::Vol2ColFunctor<DeviceContext, T> vol2col;
-      for (int i = 0; i < batch_size; ++i) {
-        Tensor dy_batch = dY->Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor ddx_batch = ddX->Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; ++g) {
-          // im2col
-          Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
-          if (!is_expand) {
-            col.ShareDataWith(ddx_slice);
-            col_matrix.ShareDataWith(col);
-            col_matrix.Resize(col_matrix_shape);
-          } else if (data_dim == 2U) {
-            im2col(dev_ctx, ddx_slice, dilations, strides,
-                   std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                    paddings[1]},
-                   &col);
-          } else if (data_dim == 3U) {
-            vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
-          }
-
-          Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice,
-                      T(1.0));
-        }
-      }
-    }
-
-    // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
-    // w/ddw(Cout, Cin, kh, kw)
-    // ddy convolution double grad: im2col(vol2col) + gemm
-    if (ddY) {
-      ddY->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, ddY, static_cast<T>(0));
-      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-      math::Vol2ColFunctor<DeviceContext, T> vol2col;
-      for (int i = 0; i < batch_size; ++i) {
-        Tensor ddx_batch = ddX->Slice(i, i + 1).Resize(input_shape);
-        Tensor x_batch = X->Slice(i, i + 1).Resize(input_shape);
-        Tensor ddy_batch = ddY->Slice(i, i + 1).Resize(output_matrix_shape);
-        for (int g = 0; g < groups; ++g) {
-          Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
-          Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
-          if (!is_expand) {
-            col.ShareDataWith(ddx_slice);
-            col_matrix.ShareDataWith(col);
-            col_matrix.Resize(col_matrix_shape);
-          } else if (data_dim == 2U) {
-            // im2col
-            im2col(dev_ctx, ddx_slice, dilations, strides,
-                   std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                    paddings[1]},
-                   &col);
-          } else if (data_dim == 3U) {
-            // vol2col
-            vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
-          }
-
-          // gemm
-          Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice,
-                      T(0.0));
-
-          if (ddW_in) {
-            Tensor ddW;
-            ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
-
-            if (!is_expand) {
-              col.ShareDataWith(x_slice);
-              col_matrix.ShareDataWith(col);
-              col_matrix.Resize(col_matrix_shape);
-            } else if (data_dim == 2U) {
-              // im2col
-              im2col(dev_ctx, x_slice, dilations, strides,
-                     std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                      paddings[1]},
-                     &col);
-            } else if (data_dim == 3U) {
-              // vol2col
-              vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
-            }
-
-            // gemm
-            Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
-            blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice,
-                        T(1.0));
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DepthwiseConvKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-    output->mutable_data<T>(context.GetPlace());
-
-    PADDLE_ENFORCE_EQ(
-        output->dims()[1] % input->dims()[1], 0,
-        "The output channels must be a multiple of the input channels");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    if (fuse_relu) {
-      math::DepthwiseConvFunctor<DeviceContext, T, true> depthwiseConv;
-      depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
-                    output);
-    } else {
-      math::DepthwiseConvFunctor<DeviceContext, T, false> depthwiseConv;
-      depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
-                    output);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DepthwiseConvGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    if (!input_grad && !filter_grad) return;
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, input_grad, static_cast<T>(0));
-
-      if (fuse_relu) {
-        math::DepthwiseConvInputGradFunctor<DeviceContext, T, true>
-            depthwiseConvInputGrad;
-        depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                               paddings, dilations, input_grad);
-      } else {
-        math::DepthwiseConvInputGradFunctor<DeviceContext, T, false>
-            depthwiseConvInputGrad;
-        depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                               paddings, dilations, input_grad);
-      }
-    }
-
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-      if (fuse_relu) {
-        math::DepthwiseConvFilterGradFunctor<DeviceContext, T, true>
-            depthwiseConvFilterGrad;
-        depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
-                                paddings, dilations, filter_grad);
-      } else {
-        math::DepthwiseConvFilterGradFunctor<DeviceContext, T, false>
-            depthwiseConvFilterGrad;
-        depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
-                                paddings, dilations, filter_grad);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc
deleted file mode 100644
index fa4edb70b48e529102f11a1b0b9cac2110a33966..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/conv_shift_op.h"
-#include "paddle/fluid/framework/eigen.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-class ConvShiftOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2.");
-    if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0))
-      PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
-                        "The 1st dimension of Input(X) and Input(Y) should "
-                        "be equal.");
-    if (ctx->IsRuntime() || y_dims[1] > 0)
-      PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1,
-                        "The 2nd dimension of Input(Y) should be odd.");
-    if (ctx->IsRuntime() || (x_dims[1] > 0 && y_dims[1] > 0))
-      PADDLE_ENFORCE_LE(y_dims[1], x_dims[1],
-                        "The 2nd dimension of Input(Y) should be less than or "
-                        "equal to the 2nd dimension of Input(X).");
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class ConvShiftGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should be not null.");
-
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      auto x_dims = ctx->GetInputDim("X");
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(y_grad_name)) {
-      auto y_dims = ctx->GetInputDim("Y");
-      ctx->SetOutputDim(y_grad_name, y_dims);
-    }
-  }
-};
-
-class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
-             "where B is the batch size and M is the data dimension.");
-    AddInput("Y",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x N, "
-             "where B is the batch size and N is the data dimension. N must "
-             "be odd.");
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
-              "i.e., the same shape as X.");
-    AddComment(R"DOC(
-ConvShift Operator.
-
-A layer for circular convolution of two vectors,
-as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401
-
-The equation is:
-
-$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$
-
-where X's index is computed modulo M, and Y's index is computed modulo N.
-
-Both inputs X and Y can carry LoD (Level of Details) information.
-However, the output only shares the LoD information with input X.
-
-)DOC");
-  }
-};
-
-template <typename T>
-class ConvShiftKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *X = context.Input<Tensor>("X");
-    auto *Y = context.Input<Tensor>("Y");
-    auto *Out = context.Output<Tensor>("Out");
-    Out->mutable_data<T>(context.GetPlace());
-
-    auto x = EigenMatrix<T>::From(*X);
-    auto y = EigenMatrix<T>::From(*Y);
-    auto out = EigenMatrix<T>::From(*Out);
-    out.setZero();
-
-    size_t batch_size = X->dims()[0];
-    size_t x_width = X->dims()[1];
-    size_t y_width = Y->dims()[1];
-    size_t y_half_width = (y_width - 1) / 2;
-
-    for (size_t k = 0; k < batch_size; ++k) {
-      for (size_t i = 0; i < x_width; ++i) {
-        for (size_t j = 0; j < y_width; ++j) {
-          int index = (i + j - y_half_width + x_width) % x_width;
-          out(k, i) += x(k, index) * y(k, j);
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class ConvShiftGradKernel<platform::CPUPlace, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *X = context.Input<Tensor>("X");
-    auto *Y = context.Input<Tensor>("Y");
-    auto *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    auto *dY = context.Output<Tensor>(framework::GradVarName("Y"));
-
-    auto x = EigenMatrix<T>::From(*X);
-    auto y = EigenMatrix<T>::From(*Y);
-    auto dout = EigenMatrix<T>::From(*dOut);
-
-    auto x_dims = X->dims();
-    auto y_dims = Y->dims();
-    size_t batch_size = x_dims[0];
-    size_t x_width = x_dims[1];
-    size_t y_width = y_dims[1];
-    size_t y_half_width = (y_width - 1) / 2;
-
-    // The below trades code duplication for efficiency (keeping the if
-    // statement outside of the loop).
-    if (dX) {
-      dX->mutable_data<T>(context.GetPlace());
-      auto dx = EigenMatrix<T>::From(*dX);
-      dx.setZero();
-      for (size_t k = 0; k < batch_size; ++k) {
-        for (size_t i = 0; i < x_width; ++i) {
-          for (size_t j = 0; j < y_width; ++j) {
-            int index = (i + j - y_half_width + x_width) % x_width;
-            dx(k, index) += dout(k, i) * y(k, j);
-          }
-        }
-      }
-    }
-
-    if (dY) {
-      dY->mutable_data<T>(context.GetPlace());
-      auto dy = EigenMatrix<T>::From(*dY);
-      dy.setZero();
-      for (size_t k = 0; k < batch_size; ++k) {
-        for (size_t i = 0; i < x_width; ++i) {
-          for (size_t j = 0; j < y_width; ++j) {
-            int index = (i + j - y_half_width + x_width) % x_width;
-            dy(k, j) += x(k, index) * dout(k, i);
-          }
-        }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp);
-REGISTER_OP_CPU_KERNEL(conv_shift,
-                       ops::ConvShiftKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    conv_shift_grad,
-    ops::ConvShiftGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu
deleted file mode 100644
index 314d33310588ed960eecaf1a0319ebf56d925c55..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_shift_op.cu
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/conv_shift_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-namespace {
-
-inline int DivUp(int x, int y) { return (x + y - 1) / y; }
-
-// Some notes on the design:
-//
-// Each thread is responsible for computing a single output out[k, i].
-// Thread blocks are based on tiles of x with height 1 in the batch dimension.
-//
-// This design is based on the typical use case where the filter
-// y is fairly small. For large y, it would probably be more efficient
-// to also tile across y.
-template <typename T>
-__global__ void ConvShiftForward(const T *x, const T *y, int x_width,
-                                 int y_width, int y_half_width, int batch_size,
-                                 T *out) {
-  extern __shared__ T mem[];
-
-  int tx = threadIdx.x;
-  int i = blockIdx.x * blockDim.x + tx;  // global x index
-  int k = blockIdx.y;                    // batch index
-
-  // Check if we are in a boundary block with fewer x's to process than
-  // blockDim.x.
-  int num_x =
-      (blockIdx.x == gridDim.x - 1) ? (x_width % blockDim.x) : blockDim.x;
-
-  T *sx = mem;
-  T *sx_pad = &mem[num_x];
-  T *sy = &mem[blockDim.x + y_width];
-
-  // Collaboratively load y[k, :] and length-y padding of x into shared memory.
-  int pad_start = blockIdx.x * blockDim.x + num_x + x_width - y_half_width;
-  for (int j = tx; j < y_width; j += blockDim.x) {
-    sy[j] = y[k * y_width + j];
-    sx_pad[j] = x[k * x_width + (pad_start + j) % x_width];
-  }
-
-  // Load a cyclically shifted slice of x into shared memory.
-  if (tx < num_x) {
-    int load_i = (i - y_half_width + x_width) % x_width;
-    sx[tx] = x[k * x_width + load_i];
-  }
-  __syncthreads();
-
-  if (tx < num_x) {
-    // Compute dot product of sx[tx:tx + y_width] and sy.
-    T sum = 0;
-    for (int j = 0; j < y_width; ++j) {
-      sum += sx[tx + j] * sy[j];
-    }
-
-    // Save to out[k, i].
-    out[k * x_width + i] = sum;
-  }
-}
-
-// Compute x gradient - initial naive implementation with atomic add.
-template <typename T>
-__global__ void ConvShiftGradX(const T *dout, const T *y, int x_width,
-                               int y_width, int y_half_width, int batch_size,
-                               T *dx) {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
-  int j = blockIdx.y;                             // y index
-  int k = blockIdx.z;                             // batch index
-
-  if (i < x_width) {
-    int index = (i + j - y_half_width + x_width) % x_width;
-    atomicAdd(&dx[k * x_width + index],
-              dout[k * x_width + i] * y[k * y_width + j]);
-  }
-}
-
-// Compute y gradient - initial naive implementation with atomic add.
-template <typename T>
-__global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width,
-                            int y_half_width, int batch_size, T *dy) {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
-  int j = blockIdx.y;                             // y index
-  int k = blockIdx.z;                             // batch index
-
-  if (i < x_width) {
-    int index = (i + j - y_half_width + x_width) % x_width;
-    atomicAdd(&dy[k * y_width + j],
-              x[k * x_width + index] * dout[k * x_width + i]);
-  }
-}
-}  // namespace
-
-template <typename T>
-class ConvShiftKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Y = context.Input<Tensor>("Y");
-    Tensor *Out = context.Output<Tensor>("Out");
-    const T *x_data = X->data<T>();
-    const T *y_data = Y->data<T>();
-    T *out_data = Out->mutable_data<T>(context.GetPlace());
-
-    int batch_size = X->dims()[0];
-    int x_width = X->dims()[1];
-    int y_width = Y->dims()[1];
-    int y_half_width = (y_width - 1) / 2;
-
-    const int x_per_block = 256;
-    int num_x_blocks = DivUp(x_width, x_per_block);
-    int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T);
-
-    dim3 grid_dim(num_x_blocks, batch_size);
-
-    auto stream =
-        context.template device_context<platform::CUDADeviceContext>().stream();
-
-    ConvShiftForward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
-        x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data);
-  }
-};
-
-template <typename T>
-class ConvShiftGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Y = context.Input<Tensor>("Y");
-    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    const T *x_data = X->data<T>();
-    const T *y_data = Y->data<T>();
-    const T *dout_data = dOut->data<T>();
-
-    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    Tensor *dY = context.Output<Tensor>(framework::GradVarName("Y"));
-
-    int batch_size = X->dims()[0];
-    int x_width = X->dims()[1];
-    int y_width = Y->dims()[1];
-    int y_half_width = (y_width - 1) / 2;
-
-    auto &device_ctx =
-        context.template device_context<platform::CUDADeviceContext>();
-    math::SetConstant<platform::CUDADeviceContext, T> zero;
-
-    const int x_per_block = 256;
-    int num_x_blocks = DivUp(x_width, x_per_block);
-    dim3 grid_dim(num_x_blocks, y_width, batch_size);
-
-    if (dX) {
-      T *dx_data = dX->mutable_data<T>(context.GetPlace());
-      zero(device_ctx, dX, static_cast<T>(0.0));
-      ConvShiftGradX<T><<<grid_dim, x_per_block, 0, device_ctx.stream()>>>(
-          dout_data, y_data, x_width, y_width, y_half_width, batch_size,
-          dx_data);
-    }
-    if (dY) {
-      T *dy_data = dY->mutable_data<T>(context.GetPlace());
-      zero(device_ctx, dY, static_cast<T>(0.0));
-      ConvShiftDy<T><<<grid_dim, x_per_block, 0, device_ctx.stream()>>>(
-          x_data, dout_data, x_width, y_width, y_half_width, batch_size,
-          dy_data);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    conv_shift,
-    ops::ConvShiftKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    conv_shift_grad,
-    ops::ConvShiftGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/conv_shift_op.h b/paddle/fluid/operators/conv_shift_op.h
deleted file mode 100644
index 6d8ddd793738ac2c352235ebf9eb8f2fd0ee3ca3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_shift_op.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ConvShiftKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override;
-};
-
-template <typename DeviceContext, typename T>
-class ConvShiftGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override;
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
deleted file mode 100644
index bab6fe24e42f15e2703a977d1500bc63f343e79c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/conv_transpose_op.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
-using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
-using DataLayout = platform::DataLayout;
-
-static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024;
-
-template <typename T>
-class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* filter = ctx.Input<Tensor>("Filter");
-    auto* output = ctx.Output<Tensor>("Output");
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    // cudnn v5 does not support dilations
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
-
-    const T* input_data = input->data<T>();
-    const T* filter_data = filter->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    // ------------------- cudnn descriptors ---------------------
-    ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor output_desc;
-    ScopedFilterDescriptor filter_desc;
-    ScopedConvolutionDescriptor conv_desc;
-    DataLayout layout;
-
-    if (strides.size() == 2U) {
-      layout = DataLayout::kNCHW;
-    } else {
-      layout = DataLayout::kNCDHW;
-    }
-
-    // (N, M, H, W) or (N, M, D, H, W)
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize<int>(input->dims()), groups);
-    // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
-    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize<int>(output->dims()), groups);
-    // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
-    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize<int>(filter->dims()), groups);
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
-
-    // ------------------- cudnn conv workspace ---------------------
-    size_t workspace_size_in_bytes;  // final workspace to allocate.
-    size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
-    if (user_workspace_size > 0) {
-      workspace_size_limit = user_workspace_size * 1024 * 1024;
-    }
-    // ------------------- cudnn conv algorithm ---------------------
-    cudnnConvolutionBwdDataAlgo_t algo;
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    // Get the algorithm
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-        handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
-        // dxDesc: Handle to the previously initialized output tensor
-        // descriptor.
-        cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-        workspace_size_limit, &algo));
-
-    // get workspace size able to allocate
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-            handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
-            cudnn_output_desc, algo, &workspace_size_in_bytes));
-
-    // ------------------- cudnn conv transpose forward ---------------------
-    int input_offset = input->numel() / input->dims()[0] / groups;
-    int output_offset = output->numel() / output->dims()[0] / groups;
-    int filter_offset = filter->numel() / groups;
-    T alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    for (int g = 0; g < groups; g++) {
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
-            cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
-            algo, cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_output_desc, output_data + output_offset * g));
-      };
-      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-    }
-  }
-};
-
-template <typename T>
-class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto input = ctx.Input<Tensor>("Input");
-    auto filter = ctx.Input<Tensor>("Filter");
-    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-    const T* input_data = input->data<T>();
-    const T* output_grad_data = output_grad->data<T>();
-    const T* filter_data = filter->data<T>();
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    // cudnn v5 does not support dilations
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
-
-    // ------------------- cudnn descriptors ---------------------
-    ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor output_desc;
-    ScopedFilterDescriptor filter_desc;
-    ScopedConvolutionDescriptor conv_desc;
-    DataLayout layout = DataLayout::kNCHW;
-
-    // Input: (N, M, H, W) or (N, M, D, H, W)
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize<int>(input->dims()), groups);
-    // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
-    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize<int>(output_grad->dims()), groups);
-    // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w)
-    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize<int>(filter->dims()), groups);
-
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
-
-    // ------------------- cudnn backward algorithm ---------------------
-    cudnnConvolutionFwdAlgo_t data_algo;
-    cudnnConvolutionBwdFilterAlgo_t filter_algo;
-    size_t bwd_filter_ws_size, fwd_ws_size;
-    size_t workspace_size_in_bytes = 0;
-    size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
-    if (user_workspace_size > 0) {
-      workspace_size_limit = user_workspace_size * 1024 * 1024;
-    }
-
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    if (input_grad) {
-      // choose backward algorithm for data
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &data_algo));
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_input_desc, data_algo, &fwd_ws_size));
-      workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
-    }
-
-    if (filter_grad) {
-      // choose backward algorithm for filter
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
-              cudnn_filter_desc,
-              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &filter_algo));
-
-      // get workspace for backwards filter algorithm
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
-              cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
-      workspace_size_in_bytes =
-          std::max(workspace_size_in_bytes, bwd_filter_ws_size);
-    }
-
-    // ------------------- cudnn conv backward data ---------------------
-    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
-    int input_offset = input->numel() / input->dims()[0] / groups;
-    int output_grad_offset =
-        output_grad->numel() / output_grad->dims()[0] / groups;
-    int filter_offset = filter->numel() / groups;
-    T alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    if (input_grad) {
-      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      // Because beta is zero, it is unnecessary to reset input_grad.
-      for (int g = 0; g < groups; g++) {
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-              handle, &alpha, cudnn_output_desc,
-              output_grad_data + output_grad_offset * g, cudnn_filter_desc,
-              filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
-              cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-              input_grad_data + input_offset * g));
-        };
-        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-      }
-    }
-
-    // ------------------- cudnn conv backward filter ---------------------
-    if (filter_grad) {
-      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      // Because beta is zero, it is unnecessary to reset filter_grad.
-      // Gradient with respect to the filter
-      for (int g = 0; g < groups; g++) {
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-              handle, &alpha, cudnn_output_desc,
-              output_grad_data + output_grad_offset * g, cudnn_input_desc,
-              input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-              cudnn_workspace, workspace_size_in_bytes, &beta,
-              cudnn_filter_desc, filter_grad_data + filter_offset * g));
-        };
-        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeOpKernel<float>,
-                   ops::CUDNNConvTransposeOpKernel<double>);
-REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeGradOpKernel<float>,
-                   ops::CUDNNConvTransposeGradOpKernel<double>);
-
-REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeOpKernel<float>,
-                   ops::CUDNNConvTransposeOpKernel<double>);
-REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeGradOpKernel<float>,
-                   ops::CUDNNConvTransposeGradOpKernel<double>);
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
deleted file mode 100644
index e76c57abc6300d845908a9c6db939747d17ca289..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ /dev/null
@@ -1,430 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/conv_transpose_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/platform/cudnn_workspace_helper.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of ConvTransposeOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of ConvTransposeOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of ConvTransposeOp should not be null.");
-
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  std::vector<int> output_size =
-      ctx->Attrs().Get<std::vector<int>>("output_size");
-  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
-  int groups = ctx->Attrs().Get<int>("groups");
-
-  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
-                 "ConvTransposeOp intput should be 4-D or 5-D tensor.");
-  PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
-                    "ConvTransposeOp input dimension and filter dimension "
-                    "should be the same.");
-  PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
-                 "ConvTransposeOp input dimension and strides dimension should "
-                 "be consistent.");
-  if (output_size.size())
-    PADDLE_ENFORCE_EQ(output_size.size(), strides.size(),
-                      "ConvTransposeOp output_size dimension and strides "
-                      "dimension should be the same.");
-  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
-                    "ConvTransposeOp paddings dimension and strides "
-                    "dimension should be the same.");
-  PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(),
-                    "ConvTransposeOp paddings dimension and dilations "
-                    "dimension should be the same.");
-  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "In ConvTransposeOp, The number of input channels should "
-                    "be equal to the number of filter's channels.");
-
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
-    auto infer_shape =
-        (in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + filter_extent;
-    if (output_size.size()) {
-      PADDLE_ENFORCE((output_size[i] >= infer_shape &&
-                      output_size[i] < infer_shape + strides[i]),
-                     "ConvTransposeOp output_size should be "
-                     "in appropriate range.");
-      output_shape.push_back(output_size[i]);
-    } else {
-      output_shape.push_back(infer_shape);
-    }
-  }
-  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
-}
-
-framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library_{framework::LibraryType::kPlain};
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
-#ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
-    if (use_cudnn) {
-      library_ = framework::LibraryType::kCUDNN;
-    }
-  }
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-  if (library_ == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kMKLDNN;
-    layout_ = framework::DataLayout::kMKLDNN;
-  }
-#endif
-
-  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                 ctx.GetPlace(), layout_, library_);
-}
-
-void Conv2DTransposeOpMaker::Make() {
-  AddAttr<bool>("is_test",
-                "(bool, default false) Set to true for inference only, false "
-                "for training. Some layers may run faster when this is true.")
-      .SetDefault(false);
-  AddInput(
-      "Input",
-      "(Tensor) The input tensor of convolution transpose operator. "
-      "The format of input tensor is NCHW. Where N is batch size, C is the "
-      "number of input channels, H is the height of the feature, and "
-      "W is the width of the feature.");
-  AddInput(
-      "Filter",
-      "(Tensor) The filter tensor of convolution transpose operator. "
-      "The format of the filter tensor is MCHW, where M is the number of "
-      "input feature channels, C is the number of "
-      "output feature channels,"
-      "H is the height of the filter, and W is the width of the filter. "
-      "We enforce groups number == 1 in the convolution transpose scenario.");
-  AddInput("Bias",
-           "(Tensor) Bias to be added to each output of filter application."
-           "The format of output tensor is X (one-dimensional) of size equal"
-           "to the number of output channels. Only used with MKL-DNN.")
-      .AsDispensable();
-
-  AddOutput("Output",
-            "(Tensor) The output tensor of convolution transpose operator. "
-            "The format of output tensor is also NCHW.");
-  AddAttr<std::vector<int>>("output_size",
-                            "(vector<int> default: []), the "
-                            "size of the output tensor")
-      .SetDefault({});
-  AddAttr<int>("groups",
-               "(int default:1), the groups number of the convolution "
-               "transpose operator. ")
-      .SetDefault(1);
-  AddAttr<std::vector<int>>("dilations",
-                            "(vector<int> default:{1, 1}), the "
-                            "dilations(h_dilation, w_dilation) of convolution "
-                            "transpose operator.")
-      .SetDefault({1, 1});
-  AddAttr<std::vector<int>>(
-      "strides",
-      "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
-      "convolution transpose operator.")
-      .SetDefault({1, 1});
-  AddAttr<std::vector<int>>(
-      "paddings",
-      "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
-      "transpose operator.")
-      .SetDefault({0, 0});
-  AddAttr<bool>(
-      "use_cudnn",
-      "(bool, default false) Only used in cudnn kernel, need install cudnn")
-      .SetDefault(false);
-  AddAttr<bool>("use_mkldnn",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<std::string>("fuse_activation",
-                       "(string, default \"\") Only used in mkldnn kernel")
-      .SetDefault("");
-  AddAttr<float>("fuse_alpha",
-                 "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
-  AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
-  AddAttr<std::string>(
-      "data_format",
-      "(string, default NCHW) Only used in "
-      "An optional string from: \"NHWC\", \"NCHW\". "
-      "Defaults to \"NHWC\". Specify the data format of the output data, "
-      "the input will be transformed automatically. ")
-      .SetDefault("AnyLayout");
-  // TODO(dzhwinter): need to registered layout transform function
-  AddAttr<int>("workspace_size_MB",
-               "Used in cudnn kernel only. workspace size for cudnn, in MB, "
-               "workspace is a section of GPU memory which will be "
-               "allocated/freed each time the operator runs, larger "
-               "workspace size can increase performance but also requires "
-               "better hardward. This size should be carefully setted.")
-      .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB);
-  AddComment(R"DOC(
-Convolution2D Transpose Operator.
-
-The convolution transpose operation calculates the output based on the input, filter
-and dilations, strides, paddings, groups parameters. The size of each dimension of the
-parameters is checked in the infer-shape.
-Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the
-number of channels, H is the height of the feature, and W is the width of the feature.
-Filter(Input) is in MCHW format. Where M is the number of input feature channels,
-C is the number of output feature channels, H is the height of the filter,
-and W is the width of the filter.
-Parameters(strides, paddings) are two elements. These two elements represent height
-and width, respectively.
-The input(X) size and output(Out) size may be different.
-
-For an example:
-  Input:
-       Input shape: $(N, C_{in}, H_{in}, W_{in})$
-       Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
-  Output:
-       Output shape: $(N, C_{out}, H_{out}, W_{out})$
-  Where
-  $$
-       H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\
-       W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
-  $$
-)DOC");
-}
-
-void Conv3DTransposeOpMaker::Make() {
-  AddInput("Input",
-           "(Tensor) The input tensor of convolution transpose operator."
-           "The format of input tensor is NCDHW. Where N is batch size, C is "
-           "the number of channels, D is the depth of the feature, H is the "
-           "height of the feature, and "
-           "W is the width of the feature.");
-  AddInput("Filter",
-           "(Tensor) The filter tensor of convolution transpose operator."
-           "The format of the filter tensor is MCDHW, where M is the number of "
-           "input feature channels, C is the number of "
-           "output feature channels, D "
-           "is the depth of the filter, H is the height of the filter, and "
-           "W is the width of the filter."
-           "We enforce groups number == 1 and padding == 0 in "
-           "the convolution3d transpose scenario.");
-  AddOutput("Output",
-            "(Tensor) The output tensor of convolution transpose operator."
-            "The format of output tensor is also NCDHW."
-            "Where N is batch size, C is "
-            "the number of channels, D is the depth of the feature, H is the "
-            "height of the feature, and W is the width of the feature.");
-  AddAttr<std::vector<int>>("output_size",
-                            "(vector<int> default: []), the "
-                            "size of the output tensor")
-      .SetDefault({});
-  AddAttr<std::vector<int>>(
-      "dilations",
-      "(vector<int> default:{1, 1, 1}), the "
-      "dilations(d_dilation,h_dilation, w_dilation) of convolution "
-      "transpose operator.")
-      .SetDefault({1, 1, 1});
-  AddAttr<std::vector<int>>("strides",
-                            "(vector<int> default:{1, 1, 1}), the "
-                            "strides{d_stride, h_stride, w_stride} of "
-                            "convolution transpose operator.")
-      .SetDefault({1, 1, 1});
-  AddAttr<std::vector<int>>("paddings",
-                            "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
-                            "h_pad, w_pad) of convolution transpose operator.")
-      .SetDefault({0, 0, 0});
-  AddAttr<int>("groups",
-               "(int default:1), the groups number of the convolution3d "
-               "transpose operator. ")
-      .SetDefault(1);
-  AddAttr<bool>(
-      "use_cudnn",
-      "(bool, default false) Only used in cudnn kernel, need install cudnn")
-      .SetDefault(false);
-  AddAttr<bool>("use_mkldnn",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<std::string>(
-      "data_format",
-      "(string, default NCHW) Only used in "
-      "An optional string from: \"NHWC\", \"NCHW\". "
-      "Defaults to \"NHWC\". Specify the data format of the output data, "
-      "the input will be transformed automatically. ")
-      .SetDefault("AnyLayout");
-  // TODO(dzhwinter): need to registered layout transform function
-  AddAttr<int>("workspace_size_MB",
-               "Used in cudnn kernel only. workspace size for cudnn, in MB, "
-               "workspace is a section of GPU memory which will be "
-               "allocated/freed each time the operator runs, larger "
-               "workspace size can increase performance but also requires "
-               "better hardward. This size should be carefully setted.")
-      .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB);
-  AddComment(R"DOC(
-Convolution3D Transpose Operator.
-
-The convolution transpose operation calculates the output based on the input, filter
-and dilations, strides, paddings, groups parameters. The size of each dimension of the
-parameters is checked in the infer-shape.
-Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
-number of channels, D is the depth of the feature, H is the height of the feature,
-and W is the width of the feature.
-Filter(Input) is in MCDHW format. Where M is the number of input feature channels,
-C is the number of output feature channels, D is the depth of the filter,H is the
-height of the filter, and W is the width of the filter.
-Parameters(strides, paddings) are three elements. These three elements represent
-depth, height and width, respectively.
-The input(X) size and output(Out) size may be different.
-
-Example:
-  Input:
-       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
-       Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
-  Output:
-       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
-  Where
-  $$
-       D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\
-       H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\
-       W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
-  $$
-)DOC");
-}
-
-void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  if (ctx->HasOutput(framework::GradVarName("Input"))) {
-    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-  }
-  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-  }
-}
-
-framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
-#ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
-  }
-#endif
-  framework::LibraryType library_;
-  if (use_cudnn) {
-    library_ = framework::LibraryType::kCUDNN;
-  } else {
-    library_ = framework::LibraryType::kPlain;
-  }
-
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                 ctx.GetPlace(), layout_, library_);
-}
-
-class ConvTransposeGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType(ForwardOp().Type() + "_grad");
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
-    if (ForwardOp().Inputs().count("Bias") > 0) {
-      op->SetInput("Bias", Input("Bias"));
-      op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-    }
-    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-// conv2d_transpose
-REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
-                  ops::Conv2DTransposeOpMaker,
-                  ops::ConvTransposeGradOpDescMaker);
-REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
-                                     double>);
-
-// conv3d_transpose
-REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
-                  ops::Conv3DTransposeOpMaker,
-                  ops::ConvTransposeGradOpDescMaker);
-REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
-                                     double>);
-
-// depthwise conv2d_transpose
-REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
-                  ops::Conv2DTransposeOpMaker,
-                  ops::ConvTransposeGradOpDescMaker);
-REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    depthwise_conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    depthwise_conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
-                                     double>);
diff --git a/paddle/fluid/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc
deleted file mode 100644
index a6d5665df83ae5c89d42840e91a6abd853fedd12..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_transpose_op.cu.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/conv_transpose_op.h"
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-
-// conv2d
-REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
-                        ops::GemmConvTransposeKernel<CUDA, float>,
-                        ops::GemmConvTransposeKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad,
-                        ops::GemmConvTransposeGradKernel<CUDA, float>,
-                        ops::GemmConvTransposeGradKernel<CUDA, double>);
-
-// conv3d
-REGISTER_OP_CUDA_KERNEL(conv3d_transpose,
-                        ops::GemmConvTransposeKernel<CUDA, float>,
-                        ops::GemmConvTransposeKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad,
-                        ops::GemmConvTransposeGradKernel<CUDA, float>,
-                        ops::GemmConvTransposeGradKernel<CUDA, double>);
-
-// depthwise conv2d
-REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose,
-                        ops::DepthwiseConvTransposeKernel<CUDA, float>,
-                        ops::DepthwiseConvTransposeKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad,
-                        ops::DepthwiseConvTransposeGradKernel<CUDA, float>,
-                        ops::DepthwiseConvTransposeGradKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h
deleted file mode 100644
index 88c578b1410558b9adcd55f1cd6b53fb9cb124e2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ /dev/null
@@ -1,391 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/depthwise_conv.h"
-#include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/operators/math/vol2col.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-
-// Define Op classes in .h file so that other conv transpose
-// operator implementations can reuse the code.
-class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-class ConvTransposeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class ConvTransposeOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-template <typename DeviceContext, typename T>
-class GemmConvTransposeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    // The filter will be reshaped, so it should not be constant pointer
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    int groups = context.Attr<int>("groups");
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w}
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
-    // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w}
-    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
-
-    // use col_shape in the im2col and col2im (or vol2col and col2vol)
-    // calculation
-    // col_shape_vec: {c/g, k_h, k_w, h, w} or {c/g, k_d, k_h, k_w, d, h, w}
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = output->dims()[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
-    }
-    DDim col_shape(framework::make_ddim(col_shape_vec));
-
-    // use col_matrix_shape in the gemm calculation
-    // size: (c/g * k_h * k_w, h * w) or (c/g * k_d * k_h * k_w, d * h * w)
-    DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
-
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix;
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-
-    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
-    DDim output_shape =
-        framework::slice_ddim(output->dims(), 1, output->dims().size());
-
-    // input matrix size: (m, h * w) or (m, d * h * w)
-    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
-
-    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
-    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
-    filter.Resize(filter_matrix_shape);
-
-    output->mutable_data<T>(context.GetPlace());
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    set_zero(dev_ctx, output, static_cast<T>(0));
-
-    int in_step = static_cast<int>(input->dims()[1]) / groups;
-    int out_step = static_cast<int>(output->dims()[1]) / groups;
-    math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
-    math::Col2VolFunctor<DeviceContext, T> col2vol;
-
-    // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
-    // on input)
-    for (int i = 0; i < batch_size; i++) {
-      // batch with size (m, h * w) or (m, d * h * w)
-      Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-
-      // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
-      Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
-
-      for (int g = 0; g < groups; g++) {
-        Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step);
-        Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
-        Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
-
-        // col_matrix = filter_slice * input_slice
-        // of shape (c/g * k_h * k_w, h * w)
-        // or (c/g * k_d * k_h * k_w, d * h * w)
-        blas.MatMul(filter_slice, true, in_slice, false, static_cast<T>(1.0),
-                    &col_matrix, static_cast<T>(0.0));
-
-        if (data_dim == 2U) {
-          // col2im: col_matrix -> dy
-          // from (c/g * k_h * k_w, h * w) to (c/g, o_h, o_w)
-          col2im(dev_ctx, col, dilations, strides,
-                 std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                  paddings[1]},
-                 &out_slice);
-        } else if (data_dim == 3U) {
-          // col2vol: col_matrix -> dy
-          // from (c/g * k_d * k_h * k_w, d * h * w) to (c/g, o_d, o_h, o_w)
-          col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice);
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    // For filter, we do not use const pointer b/c we will do reshape,
-    // but we should avoid modifying its value.
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-
-    if ((!input_grad) && (!filter_grad)) return;
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    int groups = context.Attr<int>("groups");
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w}
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
-    // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w}
-    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
-
-    // use col_shape in the im2col and col2im (or vol2col and col2vol)
-    // calculation
-    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = output_grad->dims()[1];
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
-    }
-    DDim col_shape(framework::make_ddim(col_shape_vec));
-
-    // use col_matrix_shape in the gemm calculation
-    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
-    DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
-
-    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
-    DDim output_shape = framework::slice_ddim(output_grad->dims(), 1,
-                                              output_grad->dims().size());
-
-    // input matrix size: (m, h * w) or (m, d * h * w)
-    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
-
-    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
-    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0] / groups};
-    filter.Resize(filter_matrix_shape);
-    int in_step = static_cast<int>(input->dims()[1]) / groups;
-    int col_step = static_cast<int>(col_matrix_shape[0]) / groups;
-
-    // convolution transpose grad on input:
-    // im2col + gemm (similar to conv-forward)
-    // input need to compute gradient
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    if (input_grad || filter_grad) {
-      Tensor col;
-      col.mutable_data<T>(col_shape, context.GetPlace());
-      // col_matrix shares the same piece of data with col,
-      // but will be reshaped into a two-dimensional matrix shape
-      // to call the matrix multiplication interface.
-      Tensor col_matrix;
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-
-      Tensor filter_grad_;
-      math::SetConstant<DeviceContext, T> set_zero;
-
-      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-      math::Vol2ColFunctor<DeviceContext, T> vol2col;
-
-      if (input_grad) {
-        input_grad->mutable_data<T>(context.GetPlace());
-      }
-      if (filter_grad) {  // filter size (m, c/g, k_h, k_w)
-        filter_grad->mutable_data<T>(context.GetPlace());
-        set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-        filter_grad_ = *filter_grad;
-        filter_grad_.Resize(filter_matrix_shape);
-      }
-
-      for (int i = 0; i < batch_size; i++) {
-        // batch with size (c, o_h * o_w)
-        Tensor output_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_shape);
-
-        if (data_dim == 2U) {
-          // im2col: dy -> col matrix
-          // from (c, o_h, o_w) to (c * k_h * k_w, h * w)
-          im2col(dev_ctx, output_grad_batch, dilations, strides,
-                 std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                  paddings[1]},
-                 &col);
-        } else if (data_dim == 3U) {
-          // vol2col: dy -> col_matrix
-          // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
-          vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings,
-                  &col);
-        }
-
-        if (input_grad) {
-          // batch with size (m, h, w)
-          Tensor input_grad_batch =
-              input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
-          // gemm: dx = filter * dy
-          // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w)
-          // or
-          // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
-          // d, h, w)
-          for (int g = 0; g < groups; g++) {
-            Tensor input_grad_slice =
-                input_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-            Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
-            Tensor col_matrix_slice =
-                col_matrix.Slice(g * col_step, (g + 1) * col_step);
-
-            blas.MatMul(filter_slice, false, col_matrix_slice, false,
-                        static_cast<T>(1.0), &input_grad_slice,
-                        static_cast<T>(0.0));
-          }
-        }
-        if (filter_grad) {
-          // input batch
-          Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-          // gemm: d_filter = x * dy^T
-          // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, k_h * k_w)
-          // or
-          // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
-          // k_h * k_w)
-          for (int g = 0; g < groups; g++) {
-            Tensor in_batch_slice =
-                in_batch.Slice(g * in_step, (g + 1) * in_step);
-            Tensor filter_grad_slice =
-                filter_grad_.Slice(g * in_step, (g + 1) * in_step);
-            Tensor col_matrix_slice =
-                col_matrix.Slice(g * col_step, (g + 1) * col_step);
-            blas.MatMul(in_batch_slice, false, col_matrix_slice, true,
-                        static_cast<T>(1.0), &filter_grad_slice,
-                        static_cast<T>(1.0));
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-    output->mutable_data<T>(context.GetPlace());
-
-    int groups = context.Attr<int>("groups");
-    PADDLE_ENFORCE_EQ(groups, filter.dims()[0]);
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    for (auto v : dilations) {
-      PADDLE_ENFORCE_EQ(v, 1);
-    }
-
-    output->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, output, static_cast<T>(0));
-
-    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
-        depthwiseConvInputGrad;
-    depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
-                           dilations, output);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    if (!input_grad && !filter_grad) return;
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-
-    if (input_grad) {
-      math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
-                    input_grad);
-    }
-
-    if (filter_grad) {
-      math::SetConstant<DeviceContext, T> set_zero;
-      filter_grad->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-
-      math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
-          depthwiseConvFilterGrad;
-      depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
-                              dilations, filter_grad);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
deleted file mode 100644
index 93304ec6700b795c923f24a5d0663884b818b9b3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cos_sim_op.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class CosSimOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    // notnull check
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of CosSimOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of CosSimOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of CosSimOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("XNorm"),
-                   "Output(XNorm) of CosSimOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("YNorm"),
-                   "Output(YNorm) of CosSimOp should not be null.");
-
-    // shape check
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    bool check = true;
-    if ((!ctx->IsRuntime()) &&
-        (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) {
-      check = false;
-    }
-
-    if (check) {
-      PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
-                        "Ranks of Input(X) and Input(Y) must be equal.");
-      PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                        "Rank of Input(X) must not be less than 2.");
-      PADDLE_ENFORCE_EQ(
-          framework::slice_ddim(x_dims, 1, x_dims.size()),
-          framework::slice_ddim(y_dims, 1, y_dims.size()),
-          "All dimensions except the 1st of Input(X) and Input(Y) "
-          "must be equal.");
-      PADDLE_ENFORCE(
-          x_dims[0] == y_dims[0] || y_dims[0] == 1,
-          "The 1st dimension of Input(Y) must be equal to Input(X) or"
-          " just 1 (which will be broadcasted to match Input(X)).");
-    }
-
-    // resize tensor
-    ctx->SetOutputDim("Out", {x_dims[0], 1});
-    ctx->SetOutputDim("XNorm", {x_dims[0], 1});
-    ctx->SetOutputDim("YNorm", {y_dims[0], 1});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The 1st input of cos_sim op.");
-    AddInput("Y", "The 2nd input of cos_sim op.");
-    AddOutput("Out", "The output of cos_sim op.");
-    AddOutput("XNorm",
-              "Norm of the first input, reduced along the 1st "
-              "dimension.")
-        .AsIntermediate();
-    AddOutput("YNorm",
-              "Norm of the second input, reduced along the 1st "
-              "dimension.")
-        .AsIntermediate();
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
-                  "Skip calling InferShape() function in the runtime.")
-        .SetDefault(true);
-
-    AddComment(R"DOC(
-**Cosine Similarity Operator**
-
-$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$
-
-The input X and Y must have the same shape, except that the 1st dimension
-of input Y could be just 1 (different from input X), which will be
-broadcasted to match the shape of input X before computing their cosine
-similarity.
-
-Both the input X and Y can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input X.
-
-)DOC");
-  }
-};
-
-class CosSimOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    // notnull check
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("XNorm"), "Input(XNorm) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("YNorm"), "Input(YNorm) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) must not be null.");
-
-    // shape check
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    auto xnorm_dims = ctx->GetInputDim("XNorm");
-    auto ynorm_dims = ctx->GetInputDim("YNorm");
-    auto out_dims = ctx->GetInputDim("Out");
-    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Ranks of Input(X) and Input(Y) must be equal.");
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "Rank of Input(X) must not be less than 2.");
-    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()),
-                      framework::slice_ddim(y_dims, 1, y_dims.size()),
-                      "All dimensions except the 1st of Input(X) and Input(Y) "
-                      "must be equal.");
-    PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1,
-                   "The 1st dimension of Input(Y) must be equal to Input(X) or"
-                   " just 1 (which will be broadcasted to match Input(X)).");
-    auto target_xnorm_dims = framework::make_ddim({x_dims[0], 1});
-    auto target_ynorm_dims = framework::make_ddim({y_dims[0], 1});
-    PADDLE_ENFORCE_EQ(xnorm_dims, target_xnorm_dims,
-                      "Shape of Input(XNorm) must be [X.Dim(0), 1].");
-    PADDLE_ENFORCE_EQ(ynorm_dims, target_ynorm_dims,
-                      "Shape of Input(YNorm) must be [Y.Dim(0), 1].");
-    PADDLE_ENFORCE_EQ(out_dims, target_xnorm_dims,
-                      "Shape of Input(Out) must be [X.Dim(0), 1].");
-    PADDLE_ENFORCE_EQ(out_grad_dims, target_xnorm_dims,
-                      "Shape of Input(Out@Grad) must be [X.Dim(0), 1].");
-
-    // resize tensor
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, y_dims);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(cos_sim, ops::CosSimOp, ops::CosSimOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    cos_sim, ops::CosSimKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    cos_sim_grad,
-    ops::CosSimGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu
deleted file mode 100644
index 3d144ca29d9989ad2cbb438a950860eaac873d07..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cos_sim_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/cos_sim_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    cos_sim, ops::CosSimKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    cos_sim_grad,
-    ops::CosSimGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h
deleted file mode 100644
index 0b4e3f774674112ddc268ba911e1df317d5edcca..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cos_sim_op.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/cos_sim_functor.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class CosSimKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // get Tensor
-    auto* in_x = context.Input<framework::LoDTensor>("X");
-    auto* in_y = context.Input<Tensor>("Y");
-    auto* out_z = context.Output<framework::LoDTensor>("Out");
-    auto* out_x_norm = context.Output<Tensor>("XNorm");
-    auto* out_y_norm = context.Output<Tensor>("YNorm");
-
-    int rows_x = in_x->dims()[0];
-    int rows_y = in_y->dims()[0];
-    out_z->Resize({rows_x, 1});
-    out_x_norm->Resize({rows_x, 1});
-    out_y_norm->Resize({rows_y, 1});
-    out_z->mutable_data<T>(context.GetPlace());
-    out_x_norm->mutable_data<T>(context.GetPlace());
-    out_y_norm->mutable_data<T>(context.GetPlace());
-    out_z->set_lod(in_x->lod());
-
-    int cols = framework::product(in_x->dims()) / rows_x;
-
-    if (rows_x == rows_y) {
-      math::CosSimFunctor<T, true> functor(
-          in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
-          out_y_norm->data<T>(), out_z->data<T>(), cols);
-      platform::ForRange<DeviceContext> for_range(
-          static_cast<const DeviceContext&>(context.device_context()), rows_x);
-      for_range(functor);
-    } else {
-      math::CosSimFunctor<T, false> functor(
-          in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
-          out_y_norm->data<T>(), out_z->data<T>(), cols);
-      platform::ForRange<DeviceContext> for_range(
-          static_cast<const DeviceContext&>(context.device_context()), rows_x);
-      for_range(functor);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CosSimGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // get Tensor
-    auto* in_x = context.Input<Tensor>("X");
-    auto* in_y = context.Input<Tensor>("Y");
-    auto* in_z = context.Input<Tensor>("Out");
-    auto* in_x_norm = context.Input<Tensor>("XNorm");
-    auto* in_y_norm = context.Input<Tensor>("YNorm");
-    auto* out_grad_x = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* out_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
-    auto* in_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
-
-    // compute gradident
-    int rows_x = in_x->dims()[0];
-    int rows_y = in_y->dims()[0];
-    int cols = framework::product(in_x->dims()) / rows_x;
-
-    if (rows_x == rows_y) {
-      if (out_grad_x) {
-        out_grad_x->Resize(in_x->dims());
-        math::CosSimGradFunctor<T> functor(
-            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
-            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
-            out_grad_x->mutable_data<T>(context.GetPlace()), cols);
-        platform::ForRange<DeviceContext> for_range(
-            static_cast<const DeviceContext&>(context.device_context()),
-            rows_x);
-        for_range(functor);
-      }
-      if (out_grad_y) {
-        out_grad_y->Resize(in_y->dims());
-        math::CosSimGradFunctor<T> functor(
-            in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(),
-            in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
-            out_grad_y->mutable_data<T>(context.GetPlace()), cols);
-        platform::ForRange<DeviceContext> for_range(
-            static_cast<const DeviceContext&>(context.device_context()),
-            rows_x);
-        for_range(functor);
-      }
-    } else {
-      if (out_grad_x) {
-        out_grad_x->Resize(in_x->dims());
-        math::CosSimDxFunctor<T> functor(
-            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
-            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
-            out_grad_x->mutable_data<T>(context.GetPlace()), cols);
-        platform::ForRange<DeviceContext> for_range(
-            static_cast<const DeviceContext&>(context.device_context()),
-            rows_x);
-        for_range(functor);
-      }
-      if (out_grad_y) {
-        out_grad_y->Resize(in_y->dims());
-        out_grad_y->mutable_data<T>(context.GetPlace());
-        math::SetConstant<DeviceContext, T> set_zero;
-        auto& dev_ctx = context.template device_context<DeviceContext>();
-        set_zero(dev_ctx, out_grad_y, static_cast<T>(0));
-
-        math::CosSimDyFunctor<DeviceContext, T> functor;
-        functor(dev_ctx, in_x_norm->data<T>(), in_y_norm->data<T>(),
-                in_x->data<T>(), in_y->data<T>(), in_z->data<T>(),
-                in_grad_z->data<T>(), static_cast<size_t>(rows_x),
-                static_cast<size_t>(cols), out_grad_y->data<T>());
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
deleted file mode 100644
index 2de714e0d4615c9c65c29dd76524f4760433e1ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/crf_decoding_op.h"
-
-namespace paddle {
-namespace operators {
-class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "Emission",
-        "(Tensor<float>/LoDTensor<float>). For a LoDTensor input, its "
-        "shape is [N x D] where N is the total sequence length of the "
-        "mini-batch and D is the total tag number. While for a tensor "
-        "input, its shape is [B X S X D] with B the batch size and S the "
-        "sequence length of each sample after padding. This input is the "
-        "unscaled emission weight matrix of the linear_chain_crf operator.");
-    AddInput(
-        "Transition",
-        "(Tensor<float>). A Tensor with shape [(D + 2) x D]. "
-        "This input is the transition weights learned by the linear_chain_crf "
-        "operator, denoted as w. The 1st row of w are transition weights for "
-        "the start mask. The 2nd row of w are transition weights for the end "
-        "mask. Transition weights between other tags begin from the 3rd row of "
-        "w. See more details in comments of the linear_chain_crf operator.");
-    AddInput(
-        "Label",
-        "(Tensor<int64_t>/LoDTensor<int64_t>). The ground truth with shape "
-        "[N x 1] (for LoDTensor) or [B x S] (for Tensor). This input is "
-        "optional. "
-        "See more details in the operator's comments.")
-        .AsDispensable();
-    AddOutput(
-        "ViterbiPath",
-        "(Tensor<int64_t>/LoDTensor<int64_t>). The decoding results. What to "
-        "return changes depending on whether the Input(Label) (the ground "
-        "truth) is given. See more details in the operator's comment.");
-    AddInput("Length",
-             "(Tensor<int64_t>). The actual length of each sample before "
-             "padding with shape [B x 1]. It means the Input(Emission), "
-             "Input(Label) "
-             "and Output(ViterbiPath) are common tensors with padding when "
-             "this input "
-             "is given.")
-        .AsDispensable();
-    AddComment(R"DOC(
-The crf_decoding operator reads the emission feature weights and the transition
-feature weights learned by the linear_chain_crf operator. It implements the
-Viterbi algorithm which is a dynamic programming algorithm for finding the most
-likely sequence of hidden states, called the Viterbi path, that results in a
-sequence of observed tags.
-
-The output of this operator changes according to whether Input(Label) is given:
-
-1. Input(Label) is given:
-   This happens in training. This operator is used to co-work with the chunk_eval
-   operator.
-   When Input(Label) is given, the crf_decoding operator returns tensor with the 
-   sampe shape as Input(Label) whose values are fixed to be 0, indicating an 
-   incorrect prediction, or 1 indicating a tag is correctly predicted. Such an 
-   output is the input to chunk_eval operator.
-
-2. Input(Label) is not given:
-   This is the standard decoding process.
-
-The crf_decoding operator returns a row vector with shape [N x 1]/[B x S], here 
-the shape depends on the inputs are LoDTensors or common tensors, whose values
-range from 0 to maximum tag number - 1, Each element indicates an index of a
-predicted tag.
-)DOC");
-  }
-};
-
-class CRFDecodingOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Emission"), true,
-                      "Input(Emission) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Transition"), true,
-                      "Input(Transition) should be not null.");
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("ViterbiPath"), true,
-                      "Output(ViterbiPath) should be not null.");
-
-    auto emission_dims = ctx->GetInputDim("Emission");
-    bool has_length = ctx->HasInput("Length");
-
-    if (has_length) {
-      PADDLE_ENFORCE_EQ(emission_dims.size(), 3,
-                        "The Input(Emission) should be a 3-D tensor.");
-    } else {
-      PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
-                        "The Input(Emission) should be a 2-D tensor.");
-    }
-    PADDLE_ENFORCE_NE(emission_dims[0], 0,
-                      "An empty mini-batch is not allowed.");
-
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
-                      "The Input(Transition) should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(
-        transition_dims[0] - 2, transition_dims[1],
-        "An invalid dimension for the Input(Transition), which should "
-        "be a 2-D tensor with shape [(D + 2) x D].");
-    if (ctx->IsRuntime() || (emission_dims[emission_dims.size() - 1] > 0 &&
-                             transition_dims[transition_dims.size() - 1] > 0)) {
-      PADDLE_ENFORCE_EQ(
-          emission_dims[emission_dims.size() - 1],
-          transition_dims[transition_dims.size() - 1],
-          "The last dimension of the Input(Emission) and the Input(Transition) "
-          "should be equal to the tag number.");
-    }
-    if (ctx->HasInput("Label")) {
-      auto label_dims = ctx->GetInputDim("Label");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
-                        "The Input(Label) should be a 2-D tensor");
-      if (ctx->IsRuntime() || (emission_dims[0] > 0 && label_dims[0] > 0)) {
-        PADDLE_ENFORCE_EQ(
-            emission_dims[0], label_dims[0],
-            "The height of Input(Emission) and the height of Input(Label) "
-            "should be the same.");
-      }
-    }
-
-    ctx->ShareLoD("Emission", /*->*/ "ViterbiPath");
-    if (has_length) {
-      ctx->SetOutputDim("ViterbiPath", {emission_dims[0], emission_dims[1]});
-    } else {
-      ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1});
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<LoDTensor>("Emission")->type(),
-                                   platform::CPUPlace());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp,
-                             ops::CRFDecodingOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    crf_decoding,
-    ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
deleted file mode 100644
index 74b9cb20a9d8606db081d3005e9b6aacdf03708f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <limits>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::LoDTensor;
-using framework::LoD;
-using framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class CRFDecodingOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* emission_weights = ctx.Input<LoDTensor>("Emission");
-    auto* transition_weights = ctx.Input<Tensor>("Transition");
-    auto* label = ctx.Input<LoDTensor>("Label");
-    auto* decoded_path = ctx.Output<Tensor>("ViterbiPath");
-
-    int64_t* path = decoded_path->mutable_data<int64_t>(platform::CPUPlace());
-    math::SetConstant<DeviceContext, int64_t>()(
-        ctx.template device_context<DeviceContext>(), decoded_path, 0);
-
-    bool has_length = ctx.HasInput("Length");
-    if (has_length) {
-      auto* length = ctx.Input<Tensor>("Length");
-      const size_t seq_num = length->numel();
-      const int64_t* length_data = length->data<int64_t>();
-      auto in_dims = emission_weights->dims();
-
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      framework::Tensor emission_weights_tmp =
-          ctx.AllocateTmpTensor<T, DeviceContext>(emission_weights->dims(),
-                                                  dev_ctx);
-      emission_weights_tmp.ShareDataWith(*emission_weights);
-      emission_weights_tmp.Resize({in_dims[0] * in_dims[1], in_dims[2]});
-
-      decoded_path->Resize({in_dims[0] * in_dims[1], 1});
-      for (size_t i = 0; i < seq_num; ++i) {
-        if (length_data[i] == 0) continue;
-        int start_pos = i * in_dims[1];
-        int end_pos = start_pos + static_cast<int>(length_data[i]);
-        Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos);
-        Decode(emission_weights_tmp.Slice(start_pos, end_pos),
-               *transition_weights, &decoded_path_one_seq);
-      }
-      decoded_path->Resize({in_dims[0], in_dims[1]});
-    } else {
-      PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL,
-                        "The Input(Emission) should be a sequence.");
-      auto lod = emission_weights->lod();
-      PADDLE_ENFORCE_GT(lod.size(), 0, "Input(Emission) must be a sequence.");
-      const size_t level = 0;
-      const size_t seq_num = lod[level].size() - 1;
-
-      for (size_t i = 0; i < seq_num; ++i) {
-        if (lod[level][i] == lod[level][i + 1]) continue;
-        int start_pos = static_cast<int>(lod[level][i]);
-        int end_pos = static_cast<int>(lod[level][i + 1]);
-        Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos);
-        Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights,
-               &decoded_path_one_seq);
-      }
-    }
-    if (label) {
-      if (!has_length) {
-        PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL,
-                          "The Input(Label) should be a sequence.");
-      }
-      const int64_t* label_value = label->data<int64_t>();
-      size_t numel = label->numel();
-      for (size_t i = 0; i < numel; ++i) {
-        path[i] = label_value[i] == path[i] ? 1 : 0;
-      }
-    }
-  }
-
- private:
-  void Decode(const Tensor& emission_weights, const Tensor& transition_weights,
-              Tensor* decoded_path) const {
-    auto emission_dims = emission_weights.dims();
-    const size_t seq_len = emission_dims[0];
-    const size_t tag_num = emission_dims[1];
-    const T* x = emission_weights.data<T>();
-    const T* w = transition_weights.data<T>();
-    int64_t* path = decoded_path->data<int64_t>();
-
-    // alpha is a memo table. An element alpha(k, v) records the score of the
-    // best sequence of tags from position 1 to position k with v being the end
-    // tag.
-    Tensor alpha;
-    T* alpha_value = alpha.mutable_data<T>(emission_dims, platform::CPUPlace());
-    Tensor track;
-    int* track_value =
-        track.mutable_data<int>(emission_dims, platform::CPUPlace());
-    auto ker =
-        jit::KernelFuncs<jit::CRFDecodingTuple<T>, platform::CPUPlace>::Cache()
-            .At(tag_num);
-    ker(static_cast<int>(seq_len), x, w, alpha_value, track_value, tag_num);
-    T max_score = -std::numeric_limits<T>::max();
-    int max_i = 0;
-    for (size_t i = 0; i < tag_num; ++i) {
-      T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i];
-      if (score > max_score) {
-        max_score = score;
-        max_i = i;
-      }
-    }
-    path[seq_len - 1] = max_i;
-    for (int k = seq_len - 1; k >= 1; --k) {
-      path[k - 1] = max_i = track_value[k * tag_num + max_i];
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
deleted file mode 100644
index 78fcd07e1df8d590ad2a4508bbc82477d928c6e9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/crop_op.cc
+++ /dev/null
@@ -1,212 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/crop_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class CropOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of CropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of CropOp should not be null.");
-    auto x_dim = ctx->GetInputDim("X");
-    if (!ctx->HasInput("Y")) {
-      auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-      PADDLE_ENFORCE_EQ(
-          int64_t(shape.size()), x_dim.size(),
-          "Shape size should be equal to dimention size of input tensor.");
-      std::vector<int64_t> tensor_shape(shape.size());
-      for (size_t i = 0; i < shape.size(); ++i) {
-        tensor_shape[i] = static_cast<int64_t>(shape[i]);
-      }
-      ctx->SetOutputDim("Out", framework::make_ddim(tensor_shape));
-    } else {
-      auto y_dim = ctx->GetInputDim("Y");
-      PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(y_dim),
-                        "Tensor rank of both CropOp's "
-                        "inputs must be same.");
-      ctx->SetOutputDim("Out", y_dim);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class CropOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input of pad op. "
-             "The input should be a k-D tensor(k > 0 and k < 7).");
-    AddInput("Y",
-             "The input used as reference for cropping, "
-             "which is of the same dimensions as X.")
-        .AsDispensable();
-    AddInput("Offsets",
-             "The input used to describe offsets in runtime, which is a "
-             "1-D vector whose size equals to the rank of input 'X'. The "
-             "elements data type must be int.")
-        .AsDispensable();
-    AddOutput("Out",
-              "The output of crop op, "
-              "which is of the same dimensions as X.");
-    AddAttr<std::vector<int>>("offsets",
-                              "A list<int> describing offsets to be cropped. "
-                              "The size of offsets list should be the same as "
-                              "the dimension size of input X.")
-        .SetDefault(std::vector<int>());
-    AddAttr<std::vector<int>>("shape",
-                              "A list<int> describing the shape of output. "
-                              "The size of shape list should be the same as "
-                              "the dimension size of input X.")
-        .SetDefault(std::vector<int>());
-    AddComment(R"DOC(
-Crop Operator.
-
-Crop input into output, as specified by offsets and shape.
-
-There are two ways to set the offsets:
-1. In runtime: Using the input 'Offsets', which is a Vairbale and can be 
-               output of other operators. This way is suitable for 
-               dynamic offsets.
-2. In network configuration: Using the attribute 'offsets', which will be 
-                             set in Python configure script. This way is 
-                             suitable for fixed offsets.
-You CANNOT use these two ways at the same time. An exception will be raised 
-if input 'Offset' is configured and meanwhile the attribute 'offsets' is 
-not empty.
-
-There are two ways to set shape:
-1. reference input: crop input X into the same shape as reference input.
-                    The dimension of reference input should
-                    be the same as the dimension of input X.
-2. shape list: crop input X into the shape described by a list<int>.
-               The size of shape list should be the same as
-               the dimension size of input X.
-
-The input should be a k-D tensor(k > 0 and k < 7). As an example:
-
-Case 1:
-Given
-
-    X = [[0, 1, 2, 0, 0]
-         [0, 3, 4, 0, 0]
-         [0, 0, 0, 0, 0]],
-
-and
-
-    offsets = [0, 1],
-
-and
-
-    shape = [2, 2],
-
-we get:
-
-    Out = [[1, 2],
-           [3, 4]].
-
-
-Case 2:
-Given
-
-    X = [[0, 1, 2, 5, 0]
-         [0, 3, 4, 6, 0]
-         [0, 0, 0, 0, 0]],
-
-and
-
-    offsets = [0, 1],
-
-and
-
-    Y = [[0, 0, 0]
-         [0, 0, 0]],
-
-we get:
-
-    Out = [[1, 2, 5],
-           [3, 4, 6]].
-)DOC");
-  }
-};
-
-class CropOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class CropGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("crop_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("X", Input("X"));
-    if (ForwardOp().Inputs().count("Offsets") > 0) {
-      op->SetInput("Offsets", Input("Offsets"));
-    }
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
-                  ops::CropGradOpDescMaker);
-REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    crop, ops::CropKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu
deleted file mode 100644
index 66cb5c452de4b2107693127ce414daf9fb7cd7d8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/crop_op.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/crop_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    crop, ops::CropKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
deleted file mode 100644
index cfc2cac7beb8a13526cefc94c127ffc2aea533df..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/crop_op.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-
-namespace paddle {
-namespace operators {  // Internal
-
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using framework::Tensor;
-
-static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
-  std::vector<int> res;
-  int rank = ctx.Input<Tensor>("X")->dims().size();
-  if (ctx.HasInput("Offsets")) {
-    PADDLE_ENFORCE(ctx.Attr<std::vector<int>>("offsets").empty(),
-                   "Input 'Offsets' and attribute 'offsets' should not be used "
-                   "at the same time.");
-    const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
-    PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
-    PADDLE_ENFORCE_EQ(
-        rank, offsets_tensor->dims()[0],
-        "Offsets size should be equal to dimension size of input tensor.");
-    const int* offsets_data;
-    framework::Tensor cpu_tmp_tensor;
-    if (platform::is_cpu_place(offsets_tensor->place())) {
-      offsets_data = offsets_tensor->data<int>();
-    } else {
-      framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
-                                &cpu_tmp_tensor);
-      offsets_data = cpu_tmp_tensor.data<int>();
-    }
-    res = std::vector<int>(offsets_data, offsets_data + rank);
-  } else {
-    res = ctx.Attr<std::vector<int>>("offsets");
-    PADDLE_ENFORCE_EQ(
-        rank, static_cast<int>(res.size()),
-        "Offsets size should be equal to dimension size of input tensor.");
-  }
-  return res;
-}
-
-template <typename DeviceContext, typename T, size_t D>
-void CropFunction(const framework::ExecutionContext& context) {
-  auto* x = context.Input<Tensor>("X");
-  auto* out = context.Output<Tensor>("Out");
-  auto out_dims = out->dims();
-  if (out_dims[0] == -1) {
-    out_dims[0] = x->dims()[0];
-  }
-  out->mutable_data<T>(out_dims, context.GetPlace());
-  auto x_stride = framework::stride(x->dims());
-  auto offsets = GetOffsets(context);
-  int64_t offset = 0;
-  for (size_t i = 0; i < offsets.size(); ++i) {
-    offset += (x_stride[i] * offsets[i]);
-  }
-
-  auto x_tensor = EigenTensor<T, D>::From(*x);
-  auto out_tensor = EigenTensor<T, D>::From(*out);
-  Eigen::array<int, D> e_offsets;
-  Eigen::array<int, D> e_shape;
-  for (size_t i = 0; i < D; ++i) {
-    e_offsets[i] = offsets[i];
-    e_shape[i] = out->dims()[i];
-  }
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
-  out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape);
-}
-
-template <typename DeviceContext, typename T>
-class CropKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    int rank = context.Input<Tensor>("X")->dims().size();
-    switch (rank) {
-      case 1:
-        CropFunction<DeviceContext, T, 1>(context);
-        break;
-      case 2:
-        CropFunction<DeviceContext, T, 2>(context);
-        break;
-      case 3:
-        CropFunction<DeviceContext, T, 3>(context);
-        break;
-      case 4:
-        CropFunction<DeviceContext, T, 4>(context);
-        break;
-      case 5:
-        CropFunction<DeviceContext, T, 5>(context);
-        break;
-      case 6:
-        CropFunction<DeviceContext, T, 6>(context);
-        break;
-      default:
-        PADDLE_THROW(
-            "CropOp only support tensors with no more than 6 dimensions.");
-    }
-  }
-};
-
-template <typename DeviceContext, typename T, size_t D>
-void CropGradFunction(const framework::ExecutionContext& context) {
-  auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
-  auto* x = context.Input<Tensor>("X");
-  if (d_x != nullptr) {
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    d_x->mutable_data<T>(x->dims(), context.GetPlace());
-    auto offsets = GetOffsets(context);
-    Eigen::array<std::pair<int, int>, D> paddings;
-    for (size_t i = 0; i < D; ++i) {
-      paddings[i].first = offsets[i];
-      paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i];
-    }
-    auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
-    auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
-    d_x_tensor.device(
-        *context.template device_context<DeviceContext>().eigen_device()) =
-        d_out_tensor.pad(paddings, 0);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CropGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    size_t rank =
-        context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
-    switch (rank) {
-      case 1:
-        CropGradFunction<DeviceContext, T, 1>(context);
-        break;
-      case 2:
-        CropGradFunction<DeviceContext, T, 2>(context);
-        break;
-      case 3:
-        CropGradFunction<DeviceContext, T, 3>(context);
-        break;
-      case 4:
-        CropGradFunction<DeviceContext, T, 4>(context);
-        break;
-      case 5:
-        CropGradFunction<DeviceContext, T, 5>(context);
-        break;
-      case 6:
-        CropGradFunction<DeviceContext, T, 6>(context);
-        break;
-      default:
-        PADDLE_THROW(
-            "CropOp only support tensors with no more than 6 dimensions.");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc
deleted file mode 100644
index 9b536e98e41f7360867f349769875567c75ad2a7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/crop_tensor_op.cc
+++ /dev/null
@@ -1,300 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/crop_tensor_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class CropTensorOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of Op(crop_tensor) should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of Op(crop_tensor) should not be null.");
-
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    if (ctx->HasInputs("ShapeTensor")) {
-      // top prority shape
-      auto inputs_name = ctx->Inputs("ShapeTensor");
-      PADDLE_ENFORCE_GT(
-          inputs_name.size(), 0,
-          "Input(ShapeTensor)'size of Op(crop_tensor) can't be zero. "
-          "Please check the Attr(shape)'s size of "
-          "Op(fluid.layers.crop_tensor).");
-      auto out_dims = std::vector<int>(inputs_name.size(), -1);
-      for (size_t i = 0; i < shape.size(); ++i) {
-        if (shape[i] != -1) {
-          out_dims[i] = static_cast<int64_t>(shape[i]);
-        }
-      }
-      ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
-
-      return;
-    }
-    auto x_dim = ctx->GetInputDim("X");
-    if (ctx->HasInput("Shape")) {
-      auto shape_dim = ctx->GetInputDim("Shape");
-      PADDLE_ENFORCE_EQ(
-          shape_dim.size(), 1,
-          "Input(Shape)'s dimension size of Op(crop_tensor) must be 1. "
-          "Please check the Attr(shape)'s dimension size of "
-          "Op(fluid.layers.crop_tensor).");
-      PADDLE_ENFORCE_EQ(shape_dim[0], x_dim.size(),
-                        "Input(Shape)'s size of Op(crop_tensor) must be equal "
-                        "to dimension size of input tensor. "
-                        "Please check the Attr(shape)'s size of "
-                        "Op(fluid.layers.crop_tensor).");
-      if (ctx->IsRuntime()) {
-        // If true, set the shape of Output(Out) according to Input(Shape) in
-        // CropTensorKernel with ExecutionContext. Also check LoD in
-        // CropTensorKernel.
-        ctx->ShareLoD("X", /*->*/ "Out");
-      } else {
-        auto out_dims = std::vector<int>(shape_dim[0], -1);
-        ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
-      }
-      return;
-    }
-    PADDLE_ENFORCE_EQ(int64_t(shape.size()), x_dim.size(),
-                      "Attr(shape)'size of Op(crop_tensor) should be equal to "
-                      "dimention size of input tensor.");
-    std::vector<int64_t> tensor_shape(shape.size());
-    for (size_t i = 0; i < shape.size(); ++i) {
-      tensor_shape[i] = static_cast<int64_t>(shape[i]);
-    }
-    ctx->SetOutputDim("Out", framework::make_ddim(tensor_shape));
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string &var_name, const Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const override {
-    if (var_name == "ShapeTensor" || var_name == "OffsetsTensor" ||
-        var_name == "Shape" || var_name == "Offsets") {
-      return expected_kernel_type;
-    }
-
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class CropTensorOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input of pad op. "
-             "The input should be a k-D tensor(k > 0 and k < 7).");
-    AddInput("Shape",
-             "The input used to describe shape of output, which is a "
-             "1-D vector whose size equals to the rank of input 'X'. The "
-             "elements data type must be int. It has a higher priority than "
-             "the shape attribute")
-        .AsDispensable();
-    AddInput("Offsets",
-             "The input used to describe offsets in runtime, which is a "
-             "1-D vector whose size equals to the rank of input 'X'. The "
-             "elements data type must be int. It has a higher priority than "
-             "the offsets attribute")
-        .AsDispensable();
-    AddInput("ShapeTensor",
-             "(vector<Tensor<int32>>, optional). If provided, crop_tensor will "
-             "use this. The shape of the tensor in vector MUST BE [1]. "
-             "It has the highest priority compare with Input(Shape) and "
-             "attr(shape).")
-        .AsDuplicable()
-        .AsDispensable();
-    AddInput("OffsetsTensor",
-             "(vector<Tensor<int32>>, optional). If provided, crop_tensor will "
-             "use this. The shape of the tensor in vector MUST BE [1]. "
-             "It has the highest priority compare with Input(Offsets) and "
-             "attr(offsets).")
-        .AsDuplicable()
-        .AsDispensable();
-    AddOutput("Out",
-              "The output of crop_tensor op, "
-              "which is of the same dimensions as X.");
-    AddAttr<std::vector<int>>("offsets",
-                              "A list<int> describing offsets to be cropped. "
-                              "The size of offsets list should be the same as "
-                              "the dimension size of input X.")
-        .SetDefault(std::vector<int>());
-    AddAttr<std::vector<int>>("shape",
-                              "A list<int> describing the shape of output. "
-                              "The size of shape list should be the same as "
-                              "the dimension size of input X.")
-        .SetDefault(std::vector<int>());
-    AddComment(R"DOC(
-CropTensor Operator.
-
-Crop input into output, as specified by offsets and shape.
-
-There are three ways to set the offsets:
-1. Input 'OffsetsTensor: It is a tensor list. It should be set as a list that 
-                         contains tensor variable in python configure script. 
-                         This way is suitable for dynamic offsets.
-2. Input 'Offsets': It is a variable and can be output of other operators. 
-                    This way is suitable for dynamic offsets.
-3. Attribute 'offsets': It will be set in python configure script. This way 
-                        is suitable for fixed offsets.
-
-You CANNOT use these three ways at the same time. An exception will be raised 
-if input 'OffsetsTensor' or 'Offset' is configured and meanwhile the attribute 'offsets' is 
-not empty.
-
-There are three ways to set shape:
-1. Input 'ShapeTensor': It is a tensor list. It should be set as a list that contains
-                        tensor variable in python configure script. This way is suitable 
-                        for dynamic shape.
-2. Input 'Shape': It is a Variable and can be output of other operators. This way is suitable 
-                  for dynamic shape.
-2. Attribute 'shape': crop input X into the shape described by a list<int>. The size of shape 
-                      list should be the same as the dimension size of input X. This way is 
-                      suitable for fixed shape.
-
-The input should be a k-D tensor(k > 0 and k < 7). As an example:
-
-Case 1:
-Given
-
-    X = [[0, 1, 2, 0, 0]
-         [0, 3, 4, 0, 0]
-         [0, 0, 0, 0, 0]],
-
-and
-
-    offsets = [0, 1],
-
-and
-
-    shape = [2, 2],
-
-we get:
-
-    Out = [[1, 2],
-           [3, 4]].
-
-
-Case 2:
-Given
-
-    X = [[0, 1, 2, 5, 0]
-         [0, 3, 4, 6, 0]
-         [0, 0, 0, 0, 0]],
-
-and offsets is a list that contains tensor variable,
-in runtime offses_var' s value is 1.
-
-    offsets = [0, offsets_var],
-
-and shape is a list that contains tensor variable,
-in runtime dim's value is 2.
-
-    shape = [dim, 3]
-
-we get:
-
-    Out = [[1, 2, 5],
-           [3, 4, 6]].
-)DOC");
-  }
-};
-
-class CropTensorOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of Op(crop_tensor) should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) of Op(crop_tensor) should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string &var_name, const Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const override {
-    if (var_name == "ShapeTensor" || var_name == "OffsetsTensor" ||
-        var_name == "Shape" || var_name == "Offsets") {
-      return expected_kernel_type;
-    }
-
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class CropTensorGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("crop_tensor_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("X", Input("X"));
-    if (ForwardOp().Inputs().count("OffsetsTensor") > 0) {
-      op->SetInput("OffsetsTensor", Input("OffsetsTensor"));
-    }
-    if (ForwardOp().Inputs().count("Offsets") > 0) {
-      op->SetInput("Offsets", Input("Offsets"));
-    }
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(crop_tensor, ops::CropTensorOp, ops::CropTensorOpMaker,
-                  ops::CropTensorGradOpDescMaker);
-REGISTER_OPERATOR(crop_tensor_grad, ops::CropTensorOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    crop_tensor,
-    ops::CropTensorKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CropTensorKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    crop_tensor_grad,
-    ops::CropTensorGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CropTensorGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/crop_tensor_op.cu b/paddle/fluid/operators/crop_tensor_op.cu
deleted file mode 100644
index 9d28d98490830db762bc720d3f1762b757c8af24..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/crop_tensor_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/crop_tensor_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    crop_tensor,
-    ops::CropTensorKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CropTensorKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    crop_tensor_grad,
-    ops::CropTensorGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CropTensorGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h
deleted file mode 100644
index 42f118d02208dc1ce53d8e313805aa433e5a93a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/crop_tensor_op.h
+++ /dev/null
@@ -1,284 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-
-namespace paddle {
-namespace operators {  // Internal
-
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using framework::Tensor;
-
-inline std::vector<int> get_new_data(
-    const std::vector<const Tensor*>& list_new_tensor) {
-  // get tensor from
-  std::vector<int> vec_new_data;
-  for (size_t i = 0; i < list_new_tensor.size(); ++i) {
-    auto tensor = list_new_tensor[i];
-    PADDLE_ENFORCE_EQ(
-        tensor->dims(), framework::make_ddim({1}),
-        "The tensor's shape in list of Op(crop_tensor) should be [1].");
-    if (platform::is_gpu_place(tensor->place())) {
-      framework::Tensor temp;
-      TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-
-      vec_new_data.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
-    } else {
-      vec_new_data.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
-    }
-  }
-
-  return vec_new_data;
-}
-
-static framework::DDim ValidateShape(const std::vector<int> shape,
-                                     const framework::DDim& in_dims) {
-  auto in_dim_size = in_dims.size();
-  auto shape_size = shape.size();
-  PADDLE_ENFORCE_EQ(
-      in_dim_size, shape_size,
-      "Input(ShapeTensor)'s dimension size of Op(crop_tensor) should be equal "
-      "to that of input tensor. "
-      "Please check the Attr(shape)'s size of Op(fluid.layers.crop_tensor).");
-  const int64_t unk_dim_val = -1;
-  int unk_dim_idx = -1;
-  std::vector<int64_t> output_shape(shape.size(), 0);
-  for (size_t i = 0; i < shape.size(); ++i) {
-    if (shape[i] == unk_dim_val) {
-      PADDLE_ENFORCE_EQ(unk_dim_idx, -1,
-                        "Only one element of shape can be unknown.");
-      PADDLE_ENFORCE_EQ(i, 0, "Only the first element of shape can be -1.");
-      unk_dim_idx = i;
-    } else {
-      PADDLE_ENFORCE_GT(shape[i], 0,
-                        "Each element of shape must be greater than 0 "
-                        "except the first element.");
-    }
-    output_shape[i] = static_cast<int64_t>(shape[i]);
-  }
-
-  return framework::make_ddim(output_shape);
-}
-
-static std::vector<int> GetShape(const framework::ExecutionContext& ctx) {
-  std::vector<int> res;
-  int rank = ctx.Input<Tensor>("X")->dims().size();
-  auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("ShapeTensor");
-  if (list_new_shape_tensor.size() > 0) {
-    // have offsets tensor list
-    PADDLE_ENFORCE_EQ(list_new_shape_tensor.size(), rank,
-                      "Input(ShapeTensor)'s length of Op(crop_tensor) should "
-                      "be equal to dimension size of input tensor.");
-    res = get_new_data(list_new_shape_tensor);
-
-    return res;
-  }
-
-  auto* shape_tensor = ctx.HasInput("Shape")
-                           ? ctx.Input<framework::LoDTensor>("Shape")
-                           : nullptr;
-  if (shape_tensor) {
-    auto* shape_data = shape_tensor->data<int>();
-    framework::Tensor cpu_shape_tensor;
-    if (platform::is_gpu_place(shape_tensor->place())) {
-      TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
-      shape_data = cpu_shape_tensor.data<int>();
-    }
-    res = std::vector<int>(shape_data, shape_data + shape_tensor->numel());
-  }
-
-  return res;
-}
-
-static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
-  std::vector<int> res;
-  int rank = ctx.Input<Tensor>("X")->dims().size();
-  auto list_new_offsets_tensor =
-      ctx.MultiInput<framework::Tensor>("OffsetsTensor");
-  if (list_new_offsets_tensor.size() > 0) {
-    // have offsets tensor list
-    res = get_new_data(list_new_offsets_tensor);
-
-    return res;
-  }
-
-  if (ctx.HasInput("Offsets")) {
-    PADDLE_ENFORCE_EQ(
-        ctx.Attr<std::vector<int>>("offsets").empty(), true,
-        "Input 'Offsets' and attribute 'offsets' should not be used "
-        "at the same time.");
-    const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
-    PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
-    PADDLE_ENFORCE_EQ(
-        rank, offsets_tensor->dims()[0],
-        "Offsets size should be equal to dimension size of input tensor.");
-    const int* offsets_data;
-    framework::Tensor cpu_tmp_tensor;
-    if (platform::is_cpu_place(offsets_tensor->place())) {
-      offsets_data = offsets_tensor->data<int>();
-    } else {
-      framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
-                                &cpu_tmp_tensor);
-      offsets_data = cpu_tmp_tensor.data<int>();
-    }
-    res = std::vector<int>(offsets_data, offsets_data + rank);
-  } else {
-    res = ctx.Attr<std::vector<int>>("offsets");
-    PADDLE_ENFORCE_EQ(
-        rank, static_cast<int>(res.size()),
-        "Offsets size should be equal to dimension size of input tensor.");
-  }
-  return res;
-}
-
-template <typename DeviceContext, typename T, size_t D>
-void CropTensorFunction(const framework::ExecutionContext& context) {
-  auto* x = context.Input<Tensor>("X");
-  auto* out = context.Output<Tensor>("Out");
-  auto x_dims = x->dims();
-  auto out_dims = out->dims();
-
-  // get shape from Input(ShapeTensor) of Input(Shape)
-  std::vector<int> shape = GetShape(context);
-  // out_dims setted by arrt(shape)
-  if (shape.size() == 0) {
-    for (size_t i = 0; i < out_dims.size(); ++i) {
-      shape.push_back(out_dims[i]);
-    }
-  }
-  out_dims = ValidateShape(shape, x->dims());
-  if (out_dims[0] == -1) {
-    out_dims[0] = x->dims()[0];
-  }
-
-  out->mutable_data<T>(out_dims, context.GetPlace());
-  auto x_stride = framework::stride(x->dims());
-  auto offsets = GetOffsets(context);
-  int64_t offset = 0;
-  for (size_t i = 0; i < offsets.size(); ++i) {
-    PADDLE_ENFORCE_LE(
-        offsets[i] + shape[i], x_dims[i],
-        "The sum of the Attr(offsets) and Attr(shape) of Op(crop_tensor) "
-        "should be less than or equal to corresponding input dimension size.");
-    offset += (x_stride[i] * offsets[i]);
-  }
-
-  auto x_tensor = EigenTensor<T, D>::From(*x);
-  auto out_tensor = EigenTensor<T, D>::From(*out);
-  Eigen::array<int, D> e_offsets;
-  Eigen::array<int, D> e_shape;
-  for (size_t i = 0; i < D; ++i) {
-    e_offsets[i] = offsets[i];
-    e_shape[i] = out->dims()[i];
-  }
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
-  out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape);
-}
-
-template <typename DeviceContext, typename T>
-class CropTensorKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    int rank = context.Input<Tensor>("X")->dims().size();
-    switch (rank) {
-      case 1:
-        CropTensorFunction<DeviceContext, T, 1>(context);
-        break;
-      case 2:
-        CropTensorFunction<DeviceContext, T, 2>(context);
-        break;
-      case 3:
-        CropTensorFunction<DeviceContext, T, 3>(context);
-        break;
-      case 4:
-        CropTensorFunction<DeviceContext, T, 4>(context);
-        break;
-      case 5:
-        CropTensorFunction<DeviceContext, T, 5>(context);
-        break;
-      case 6:
-        CropTensorFunction<DeviceContext, T, 6>(context);
-        break;
-      default:
-        PADDLE_THROW(
-            "CropTensorOp only support tensors with no more than 6 "
-            "dimensions.");
-    }
-  }
-};
-
-template <typename DeviceContext, typename T, size_t D>
-void CropTensorGradFunction(const framework::ExecutionContext& context) {
-  auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
-  auto* x = context.Input<Tensor>("X");
-  if (d_x != nullptr) {
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    d_x->mutable_data<T>(x->dims(), context.GetPlace());
-    auto offsets = GetOffsets(context);
-    Eigen::array<std::pair<int, int>, D> paddings;
-    for (size_t i = 0; i < D; ++i) {
-      paddings[i].first = offsets[i];
-      paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i];
-    }
-    auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
-    auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
-    d_x_tensor.device(
-        *context.template device_context<DeviceContext>().eigen_device()) =
-        d_out_tensor.pad(paddings, 0);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CropTensorGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    size_t rank =
-        context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
-    switch (rank) {
-      case 1:
-        CropTensorGradFunction<DeviceContext, T, 1>(context);
-        break;
-      case 2:
-        CropTensorGradFunction<DeviceContext, T, 2>(context);
-        break;
-      case 3:
-        CropTensorGradFunction<DeviceContext, T, 3>(context);
-        break;
-      case 4:
-        CropTensorGradFunction<DeviceContext, T, 4>(context);
-        break;
-      case 5:
-        CropTensorGradFunction<DeviceContext, T, 5>(context);
-        break;
-      case 6:
-        CropTensorGradFunction<DeviceContext, T, 6>(context);
-        break;
-      default:
-        PADDLE_THROW(
-            "CropTensorOp only support tensors with no more than 6 "
-            "dimensions.");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
deleted file mode 100644
index 624b2b9c00de1e6812496a9164a4189c27e87146..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ /dev/null
@@ -1,384 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cross_entropy_op.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-namespace paddle {
-namespace operators {
-
-class CrossEntropyOpBase : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true,
-                      "Input(Label) should be not null.");
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true,
-                      "Output(Y) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto label_dims = ctx->GetInputDim("Label");
-    int rank = x_dims.size();
-
-    bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
-                               framework::contain_unknown_dim(label_dims);
-    bool check = ctx->IsRuntime() || !contain_unknown_dim;
-
-    if (check) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                        framework::slice_ddim(label_dims, 0, rank - 1),
-                        "Input(X) and Input(Label) shall have the same shape "
-                        "except the last dimension.");
-    }
-
-    if (IsSoftLabel(ctx)) {
-      PADDLE_ENFORCE_EQ(
-          rank, label_dims.size(),
-          "If Attr(soft_label) == true, Input(X) and Input(Label) "
-          "shall have the same rank.");
-      if (check) {
-        PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
-                          "If Attr(soft_label) == true, the last dimension of "
-                          "Input(X) and Input(Label) should be equal.");
-      }
-    } else {
-      if (rank == label_dims.size()) {
-        PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL,
-                          "the last dimension of Input(Label) should be 1.");
-      } else {
-        PADDLE_ENFORCE_EQ(
-            rank, label_dims.size() + 1,
-            "The rank of Input(X) should be equal to Input(Label) plus 1.");
-      }
-    }
-
-    auto y_dims = label_dims;
-    if (rank == label_dims.size()) {
-      y_dims[rank - 1] = 1;
-    }
-    ctx->SetOutputDim("Y", y_dims);
-    ctx->ShareLoD("X", /*->*/ "Y");
-  }
-
- protected:
-  // Explicitly set that the data type of computation kernel of cross_entropy
-  // is determined by its input "X".
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-
-  virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
-    return ctx->Attrs().Get<bool>("soft_label");
-  }
-};
-
-class CrossEntropyGradientOpBase : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true,
-                      "Input(Label) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Y")), true,
-                      "Input(Y@GRAD) shoudl be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
-                      "Output(X@GRAD) should be not null.");
-
-    auto x_dims = GetXDim(ctx);
-    auto label_dims = ctx->GetInputDim("Label");
-    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
-    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(dy_dims.size(), label_dims.size(),
-                      "Input(Y@Grad) and Input(Y) should have the same rank.");
-
-    bool check = true;
-    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
-                                framework::product(label_dims) <= 0)) {
-      check = false;
-    }
-
-    if (check) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                        framework::slice_ddim(dy_dims, 0, rank - 1),
-                        "The Input(X) and Input(Y@Grad) should have the same "
-                        "shape except the last dimension.");
-    }
-
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->ShareLoD(VarNameWithXLoD(), framework::GradVarName("X"));
-  }
-
- protected:
-  // Explicitly set that the data type of computation kernel of cross_entropy
-  // is determined by its input "X".
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Y"))->type(),
-        ctx.device_context());
-  }
-
-  virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
-    return ctx->GetInputDim("X");
-  }
-
-  virtual const char* VarNameWithXLoD() const { return "X"; }
-
-  virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
-    return ctx->Attrs().Get<bool>("soft_label");
-  }
-};
-
-class CrossEntropyOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
-  }
-};
-
-class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), a tensor whose last dimension "
-             "size is equal to the number of classes. This input is a "
-             "probability computed by the previous operator, which is almost "
-             "always the result of a softmax operator.");
-    AddInput(
-        "Label",
-        "(Tensor), the tensor which represents the ground truth. It has the "
-        "same shape with 'X' except the last dimension. When soft_label is set "
-        "to false, the last dimension size is 1; when soft_label is set to "
-        "true, the last dimension size is equal to the number of classes.");
-    AddOutput("Y",
-              "(Tensor, default Tensor<float>), a tensor whose shape is same "
-              "with 'X' except that the last dimension size is 1. It "
-              "represents the cross entropy loss.");
-    AddAttr<bool>("soft_label",
-                  "(bool, default false), a flag indicating whether to "
-                  "interpretate the given labels as soft labels.")
-        .SetDefault(false);
-    AddAttr<int>("ignore_index",
-                 "(int, default -100), Specifies a target value that is"
-                 "ignored and does not contribute to the input gradient."
-                 "Only valid if soft_label is set to False")
-        .SetDefault(-100);
-    AddComment(R"DOC(
-CrossEntropy Operator.
-
-The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. 
-The matrix's second dimension(row length) is as same as the original last 
-dimension, and the first dimension(column length) is the product of all other 
-original dimensions. Then the softmax computation will take palce on each raw 
-of flattened matrixs.
-
-It supports both standard cross-entropy and soft-label cross-entropy loss
-computation.
-1) One-hot cross-entropy:
-    soft_label = false, Label[i, 0] indicates the class index for sample i:
-
-                $Y[i] = -\log(X[i, Label[i]])$
-
-2) Soft-label cross-entropy:
-    soft_label = true, Label[i, j] indicates the soft label of class j
-    for sample i:
-
-                $Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}$
-
-   Please make sure that in this case the summuation of each row of Label
-   equals one.
-
-3) One-hot cross-entropy with vecterized Input(Label):
-     As a special case of 2), when each row of Input(Label) has only one
-     non-zero element (equals 1), soft-label cross-entropy degenerates to a
-     one-hot cross-entropy with one-hot label representation.
-
-Both the input X and Label can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input X.
-
-)DOC");
-  }
-};
-
-class CrossEntropyGradientOp : public CrossEntropyGradientOpBase {
- public:
-  using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should be not null.");
-    CrossEntropyGradientOpBase::InferShape(ctx);
-  }
-};
-
-class CrossEntropyGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("cross_entropy_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Label", Input("Label"));
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class CrossEntropyOp2 : public CrossEntropyOpBase {
- public:
-  using CrossEntropyOpBase::CrossEntropyOpBase;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    CrossEntropyOpBase::InferShape(ctx);
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true,
-                      "Output(XShape) should be not null.");
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("MatchX"), true,
-                      "Output(MatchX) should be not null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_dims_vec = framework::vectorize(x_dims);
-    x_dims_vec.push_back(0);
-    ctx->SetOutputDim("XShape", framework::make_ddim(x_dims_vec));
-    x_dims[x_dims.size() - 1] = 1;
-    ctx->SetOutputDim("MatchX", x_dims);
-    ctx->ShareLoD("X", /*->*/ "XShape");
-  }
-
- protected:
-  bool IsSoftLabel(framework::InferShapeContext* ctx) const override {
-    return false;
-  }
-};
-
-class CrossEntropyGradientOp2 : public CrossEntropyGradientOpBase {
- public:
-  using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("MatchX"), true,
-                      "Input(MatchX) must exist");
-    CrossEntropyGradientOpBase::InferShape(ctx);
-  }
-
- protected:
-  virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
-    auto x_shape = ctx->GetInputDim("XShape");
-    return framework::DDim(x_shape.Get(), x_shape.size() - 1);
-  }
-
-  virtual const char* VarNameWithXLoD() const { return "XShape"; }
-
-  virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
-    return false;
-  }
-};
-
-class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), a tensor whose last dimension "
-             "size is equal to the number of classes. This input is a "
-             "probability computed by the previous operator, which is almost "
-             "always the result of a softmax operator.");
-    AddInput(
-        "Label",
-        "(Tensor), the tensor which represents the ground truth. It has the "
-        "same shape with 'X' except the last dimension. One hot Tensor.");
-    AddOutput("Y",
-              "(Tensor, default Tensor<float>), a tensor whose shape is same "
-              "with 'X' except that the last dimension size is 1. It "
-              "represents the cross entropy loss.");
-    AddOutput("XShape", "Temporaily variable to save shape and LoD of X.");
-    AddOutput("MatchX",
-              "X value that matches label, used for gradient computation.");
-    AddAttr<int>("ignore_index",
-                 "(int, default -100), Specifies a target value that is"
-                 "ignored and does not contribute to the input gradient."
-                 "Only valid if soft_label is set to False")
-        .SetDefault(-100);
-    AddComment(R"DOC(
-Hard-label CrossEntropy Operator.
-
-The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. 
-The matrix's second dimension(row length) is as same as the original last 
-dimension, and the first dimension(column length) is the product of all other 
-original dimensions. Then the softmax computation will take palce on each raw 
-of flattened matrixs.
-
-Only support hard label.
-
-Both the input X and Label can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input X.
-
-)DOC");
-  }
-};
-
-class CrossEntropyGradOpDescMaker2 : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("cross_entropy_grad2");
-    op->SetInput("Label", Input("Label"));
-    op->SetInput("MatchX", Output("MatchX"));
-    op->SetInput("XShape", Output("XShape"));
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPUCtx = paddle::platform::CPUDeviceContext;
-
-REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase,
-                  ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType,
-                  ops::CrossEntropyGradOpDescMaker);
-REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
-REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
-                       ops::CrossEntropyOpKernel<CPUCtx, double>);
-REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
-                       ops::CrossEntropyGradientOpKernel<CPUCtx, float>,
-                       ops::CrossEntropyGradientOpKernel<CPUCtx, double>);
-
-REGISTER_OPERATOR(cross_entropy2, ops::CrossEntropyOp2,
-                  ops::CrossEntropyOpMaker2, ops::CrossEntropyOpInferVarType,
-                  ops::CrossEntropyGradOpDescMaker2);
-REGISTER_OPERATOR(cross_entropy_grad2, ops::CrossEntropyGradientOp2);
-REGISTER_OP_CPU_KERNEL(cross_entropy2,
-                       ops::CrossEntropyOpKernel2<CPUCtx, float>,
-                       ops::CrossEntropyOpKernel2<CPUCtx, double>);
-REGISTER_OP_CPU_KERNEL(cross_entropy_grad2,
-                       ops::CrossEntropyGradientOpKernel2<CPUCtx, float>,
-                       ops::CrossEntropyGradientOpKernel2<CPUCtx, double>);
diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu
deleted file mode 100644
index 243e7f52c1e3c4c210e91f708ae5d6de97e4afbc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cross_entropy_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace plat = paddle::platform;
-namespace ops = paddle::operators;
-using CUDACtx = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(cross_entropy,
-                        ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>,
-                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
-    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
-    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(cross_entropy2,
-                        ops::CrossEntropyOpKernel2<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel2<CUDACtx, double>,
-                        ops::CrossEntropyOpKernel2<CUDACtx, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    cross_entropy_grad2, ops::CrossEntropyGradientOpKernel2<CUDACtx, float>,
-    ops::CrossEntropyGradientOpKernel2<CUDACtx, double>,
-    ops::CrossEntropyGradientOpKernel2<CUDACtx, plat::float16>);
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
deleted file mode 100644
index 667135c4f8d145cdba4255dab0f8075489b68d6d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ /dev/null
@@ -1,278 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math.h"
-#include "paddle/fluid/operators/math/cross_entropy.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class CrossEntropyOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* labels = ctx.Input<Tensor>("Label");
-    auto* y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
-
-    int rank = x->dims().size();
-    auto label_dims = labels->dims();
-    Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
-    Tensor labels_2d, y_2d;
-    if (label_dims.size() < rank) {
-      labels_2d.ShareDataWith(*labels);
-      labels_2d.Resize({framework::product(label_dims), 1});
-
-      y_2d.ShareDataWith(*y);
-      y_2d.Resize({framework::product(y->dims()), 1});
-
-    } else {
-      labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
-      y_2d = framework::ReshapeToMatrix(*y, rank - 1);
-    }
-
-    int axis_dim = x->dims()[rank - 1];
-    math::CrossEntropyFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
-        ctx.Attr<bool>("soft_label"), ctx.Attr<int>("ignore_index"), axis_dim);
-  }
-};
-
-template <typename T>
-class XeSoftlabelGradFunctor {
- public:
-  XeSoftlabelGradFunctor(T* dx,
-                         const T* dy,     // NOLINT
-                         const T* x,      // NOLINT
-                         const T* label,  // NOLINT
-                         size_t num_classes)
-      : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
-
-  HOSTDEVICE void operator()(size_t i) {
-    auto row_ids = i / num_classes_;
-    dx_[i] = -label_[i] * dy_[row_ids] / x_[i];
-  }
-
- private:
-  T* dx_;
-  const T* dy_;
-  const T* x_;
-  const T* label_;
-  size_t num_classes_;
-};
-
-template <typename T>
-class XeGradFunctor {
- public:
-  XeGradFunctor(T* dx,
-                const T* dy,           // NOLINT
-                const T* x,            // NOLINT
-                const int64_t* label,  // NOLINT
-                size_t num_classes, size_t ignore_index)
-      : dx_(dx),
-        dy_(dy),
-        x_(x),
-        label_(label),
-        num_classes_(num_classes),
-        ignore_index_(ignore_index) {}
-
-  HOSTDEVICE void operator()(size_t sample_id) {
-    auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
-    for (size_t x_offset = sample_id * num_classes_;
-         x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
-      dx_[x_offset] = (x_offset != x_is_true_offset ||
-                       label_[sample_id] == static_cast<int64_t>(ignore_index_))
-                          ? static_cast<T>(0)
-                          : -dy_[sample_id] / x_[x_offset];
-    }
-  }
-
- private:
-  T* dx_;
-  const T* dy_;
-  const T* x_;
-  const int64_t* label_;
-  size_t num_classes_;
-  size_t ignore_index_;
-};
-
-template <typename DeviceContext, typename T>
-class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto* label = ctx.Input<Tensor>("Label");
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-
-    // Following computation only depends on the last dimension size. So it's
-    // unnecessary to convert tensors to 2-D views.
-    int rank = x->dims().size();
-    int64_t class_num = x->dims()[rank - 1];
-    int64_t ignore_index = ctx.Attr<int>("ignore_index");
-    if (ctx.Attr<bool>("soft_label")) {
-      XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
-                                        label->data<T>(),
-                                        static_cast<size_t>(class_num));
-      platform::ForRange<DeviceContext> for_range(
-          ctx.template device_context<DeviceContext>(),
-          static_cast<size_t>(dx->numel()));
-      for_range(functor);
-    } else {
-      XeGradFunctor<T> functor(
-          dx_data, dy->data<T>(), x->data<T>(), label->data<int64_t>(),
-          static_cast<size_t>(class_num), static_cast<size_t>(ignore_index));
-      platform::ForRange<DeviceContext> for_range(
-          ctx.template device_context<DeviceContext>(),
-          static_cast<size_t>(dy->numel()));
-      for_range(functor);
-    }
-  }
-};
-
-template <typename T>
-struct HardLabelCrossEntropyForwardFunctor {
-  HardLabelCrossEntropyForwardFunctor(const T* x, T* y, T* match_x,
-                                      const int64_t* label,
-                                      int64_t ignore_index,
-                                      int64_t feature_size)
-      : x_(x),
-        y_(y),
-        match_x_(match_x),
-        label_(label),
-        ignore_index_(ignore_index),
-        feature_size_(feature_size) {}
-
-  HOSTDEVICE void operator()(int64_t idx) const {
-    auto label = label_[idx];
-    if (label != ignore_index_) {
-      PADDLE_ENFORCE(label >= 0 && label < feature_size_,
-                     "Variable value (label) of "
-                     "OP(fluid.layers.cross_entropy) expected >= 0 "
-                     "and < %ld, but got %ld. Please check label value.",
-                     feature_size_, label);
-      auto match_x = x_[idx * feature_size_ + label];
-      y_[idx] = -math::TolerableValue<T>()(real_log(match_x));
-      match_x_[idx] = match_x;
-    } else {
-      y_[idx] = 0;
-      match_x_[idx] = 0;  // any value is ok
-    }
-  }
-
-  const T* x_;
-  T* y_;
-  T* match_x_;
-  const int64_t* label_;
-  int64_t ignore_index_;
-  int64_t feature_size_;
-};
-
-template <typename T>
-struct HardLabelCrossEntropyBackwardFunctor {
-  HardLabelCrossEntropyBackwardFunctor(T* dx, const T* dy, const T* match_x,
-                                       const int64_t* label,
-                                       int64_t ignore_index,
-                                       int64_t feature_size)
-      : dx_(dx),
-        dy_(dy),
-        match_x_(match_x),
-        label_(label),
-        ignore_index_(ignore_index),
-        feature_size_(feature_size) {}
-
-  HOSTDEVICE void operator()(int64_t idx) const {
-    auto row_idx = idx / feature_size_;
-    auto col_idx = idx % feature_size_;
-    auto label = label_[row_idx];
-    if (label == col_idx && label != ignore_index_) {
-      dx_[idx] = -dy_[row_idx] / match_x_[row_idx];
-    } else {
-      dx_[idx] = 0;
-    }
-  }
-
-  T* dx_;
-  const T* dy_;
-  const T* match_x_;
-  const int64_t* label_;
-  int64_t ignore_index_;
-  int64_t feature_size_;
-};
-
-template <typename DeviceContext, typename T>
-class CrossEntropyOpKernel2 : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* label = ctx.Input<Tensor>("Label");
-    auto* y = ctx.Output<Tensor>("Y");
-    auto* match_x = ctx.Output<Tensor>("MatchX");
-
-    auto& x_dims = x->dims();
-    auto feature_size = x_dims[x_dims.size() - 1];
-    auto batch_size = framework::product(x->dims()) / feature_size;
-
-    auto* p_x = x->data<T>();
-    auto* p_label = label->data<int64_t>();
-    auto* p_y = y->mutable_data<T>(ctx.GetPlace());
-    auto* p_match_x = match_x->mutable_data<T>(ctx.GetPlace());
-
-    auto ignore_index = ctx.Attr<int>("ignore_index");
-
-    platform::ForRange<DeviceContext> for_range(
-        ctx.template device_context<DeviceContext>(), batch_size);
-    for_range(HardLabelCrossEntropyForwardFunctor<T>(
-        p_x, p_y, p_match_x, p_label, ignore_index, feature_size));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CrossEntropyGradientOpKernel2 : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto* match_x = ctx.Input<Tensor>("MatchX");
-    auto* label = ctx.Input<Tensor>("Label");
-
-    auto* p_dx = dx->mutable_data<T>(ctx.GetPlace());
-    auto* p_dy = dy->data<T>();
-    auto* p_match_x = match_x->data<T>();
-    auto* p_label = label->data<int64_t>();
-
-    int64_t ignore_index = ctx.Attr<int>("ignore_index");
-    int rank = dx->dims().size();
-    int64_t feature_size = dx->dims()[rank - 1];
-    int64_t batch_size = framework::product(dx->dims()) / feature_size;
-
-    platform::ForRange<DeviceContext> for_range(
-        ctx.template device_context<DeviceContext>(),
-        batch_size * feature_size);
-    for_range(HardLabelCrossEntropyBackwardFunctor<T>(
-        p_dx, p_dy, p_match_x, p_label, ignore_index, feature_size));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
deleted file mode 100644
index 4abe9509e6d4a5143698fcdf343bc54f6ad207fc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/ctc_align_op.h"
-
-namespace paddle {
-namespace operators {
-
-class CTCAlignOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
-                      "Input of CTCAlignOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Output"), true,
-                      "Output of CTCAlignOp should not be null.");
-
-    auto input_dims = ctx->GetInputDim("Input");
-
-    // TODO(wanghaoshuang): it is tricky to set the wrong dimension here.
-    ctx->SetOutputDim("Output", input_dims);
-    if (ctx->HasInput("InputLength")) {
-      ctx->SetOutputDim("OutputLength", {input_dims[0], 1});
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "2-D Tensor or LodTensor with  shape "
-             "[Lp, 1], where Lp is the sum of all input sequences' length.");
-    AddInput("InputLength",
-             "2-D Tensor with shape [batch_size, 1], "
-             " When Input is padding mode, InputLength is length of every "
-             "sequence in Input.")
-        .AsDispensable();
-    AddOutput("Output", "(Tensor, default: Tensor<int>), The align result.");
-    AddOutput("OutputLength",
-              "2-D Tensor with shape [batch_size, 1], "
-              "When Input is padding mode, OutputLength is length of every "
-              "sequence in Output.")
-        .AsDispensable();
-    AddAttr<int>("blank",
-                 "(int, default: 0), the blank label setted in Connectionist "
-                 "Temporal Classification (CTC) op.")
-        .SetDefault(0);
-    AddAttr<bool>("merge_repeated",
-                  "(bool, default: true), whether to "
-                  "merge repeated elements between two blanks. ")
-        .SetDefault(true);
-    // add attr padding number for tensor input
-    AddAttr<int>("padding_value",
-                 "(int, default: 0), padding number "
-                 "use to padding tensor. ")
-        .SetDefault(0);
-    AddComment(R"DOC(
-CTCAlign op is used to merge repeated elements between two blanks
-and then delete all blanks in sequence.
-
-Given:
-    Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6,
-                  6, 0, 0, 7, 7, 7, 0]
-    Input.dims = {18, 1}
-    Input.LoD = [[0, 11, 18]]
-
-And:
-    blank = 0
-    merge_repeated = True
-
-Then:
-    Output.data = [1, 2, 4, 4, 5, 6,
-                   6, 7]
-    Output.dims = {8, 1}
-    Output.LoD = [[0, 6, 8]]
-or Given:
-    Input.data = [[0, 1, 2, 2, 0, 4], 
-                  [0, 4, 5, 0, 6, 0], 
-                  [0, 7, 7, 7, 0, 0]]
-    InputLength.data  = [[6],
-                         [5],
-                         [4]],   
-    Input.dims = {3, 6},
-    Input.Lod = []
-And:
-    blank = 0
-    merge_repeated = True
-    padding_value = 0
-
-Then:
-    Output.data = [[1, 2, 4, 0, 0, 0],
-                   [4, 5, 6, 0, 0, 0],
-                   [7, 0, 0, 0, 0, 0]],
-    OutputLength.data = [[3],
-                         [3],
-                         [1]],
-    Output.dims = {3, 6},
-    Output.Lod = []
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(ctc_align, ops::CTCAlignOp, ops::CTCAlignOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    ctc_align, ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
deleted file mode 100644
index 44a7c16f96a5e6298e4b5602252698198c726c8f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdio.h>
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
-#include <vector>
-#include "paddle/fluid/operators/ctc_align_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
-                                      const size_t num_seq, size_t* lod0,
-                                      const int blank, const int merge_repeated,
-                                      size_t* out_lod0, T* output) {
-  int ouput_idx = 0;
-  out_lod0[0] = 0;
-
-  for (int i = 0; i < num_seq; ++i) {
-    T pre_token = -1;
-    for (int j = lod0[i]; j < lod0[i + 1]; ++j) {
-      if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) {
-        output[ouput_idx] = tokens[j];
-        ++ouput_idx;
-      }
-      pre_token = tokens[j];
-    }
-    out_lod0[i + 1] = ouput_idx;
-  }
-}
-
-template <typename T>
-__global__ void PaddingMergeAndDelCudaKernel(
-    const int64_t num_token, const T* tokens, const T* tokens_length,
-    const int blank, const int merge_repeated, const int padding_value,
-    const int64_t batch_size, T* output, T* output_length) {
-  int ind = blockIdx.x * blockDim.x + threadIdx.x;
-  if (ind >= batch_size) return;
-  int output_idx = ind * num_token;
-  T prev_token = -1;
-  for (int i = ind * num_token; i < ind * num_token + tokens_length[ind]; i++) {
-    if ((unsigned)tokens[i] != blank &&
-        !(merge_repeated && tokens[i] == prev_token)) {
-      output[output_idx] = tokens[i];
-      ++output_idx;
-    }
-    prev_token = tokens[i];
-  }
-  output_length[ind] = output_idx - ind * num_token;
-  for (int i = output_idx; i < ind * num_token + num_token; i++) {
-    output[i] = padding_value;
-  }
-}
-
-template <typename T>
-class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto* input = ctx.Input<LoDTensor>("Input");
-    auto* output = ctx.Output<LoDTensor>("Output");
-    const int blank = ctx.Attr<int>("blank");
-    const int merge_repeated =
-        static_cast<int>(ctx.Attr<bool>("merge_repeated"));
-    const T* tokens = input->data<T>();
-    auto stream = ctx.cuda_device_context().stream();
-
-    // tensor input which has no lod
-    if (input->lod().empty()) {
-      const int padding_value = ctx.Attr<int>("padding_value");
-      auto input_dims = input->dims();
-      T* output_data = output->mutable_data<T>({input_dims[0], input_dims[1]},
-                                               ctx.GetPlace());
-      auto* input_length = ctx.Input<LoDTensor>("InputLength");
-      const T* input_length_data = input_length->data<T>();
-      auto* output_length = ctx.Output<LoDTensor>("OutputLength");
-      T* output_length_data =
-          output_length->mutable_data<T>({input_dims[0], 1}, ctx.GetPlace());
-      PaddingMergeAndDelCudaKernel<
-          T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>(
-          input_dims[1], tokens, input_length_data, blank, merge_repeated,
-          padding_value, input_dims[0], output_data, output_length_data);
-    } else {
-      const size_t level = 0;
-      auto input_lod = framework::ToAbsOffset(input->lod());
-
-      const int64_t num_tokens = input->dims()[0];
-      const size_t num_seq = input_lod[level].size() - 1;
-
-      // prepare a lod to record lod information while merging elements
-      thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
-      size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
-
-      // merge elements and delete blank
-      T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
-
-      MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
-          num_tokens, tokens, num_seq,
-          input_lod[level].CUDAMutableData(ctx.GetPlace()), blank,
-          merge_repeated, dev_out_lod0_ptr, output_data);
-
-      // set output lod
-      std::vector<size_t> host_out_lod0(dev_out_lod0.begin(),
-                                        dev_out_lod0.end());
-      framework::LoD out_lod;
-      out_lod.push_back(host_out_lod0);
-      output->set_lod(out_lod);
-
-      // resize output dims
-      output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
-
-      if (host_out_lod0.back() == 0) {
-        output->Resize({1, 1});
-        output->mutable_data<T>(ctx.GetPlace());
-        math::SetConstant<platform::CUDADeviceContext, T> set_constant;
-        set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
-                     output, -1);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(ctc_align, paddle::operators::CTCAlignOpCUDAKernel<int>,
-                        paddle::operators::CTCAlignOpCUDAKernel<int64_t>);
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
deleted file mode 100644
index ccf91471ab99ced7d2731f877f14ecb8deb437ea..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ctc_align_op.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string.h>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-class CTCAlignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<LoDTensor>("Input");
-    auto* output = ctx.Output<LoDTensor>("Output");
-    size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
-    bool merge_repeated = ctx.Attr<bool>("merge_repeated");
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    auto input_dims = input->dims();
-    const T* input_data = input->data<T>();
-
-    // support tensor input, no lod information
-    if (input->lod().empty()) {
-      size_t padding_value =
-          static_cast<size_t>(ctx.Attr<int>("padding_value"));
-      auto* input_length = ctx.Input<LoDTensor>("InputLength");
-      const T* input_length_data = input_length->data<T>();
-
-      auto* output_length = ctx.Output<LoDTensor>("OutputLength");
-      T* output_length_data = output_length->mutable_data<T>(ctx.GetPlace());
-
-      for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0];
-           batch_id++) {
-        T prev_token = -1;
-        size_t output_idx = 0;
-        for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) {
-          size_t input_ind = batch_id * input_dims[1] + i;
-          if ((unsigned)input_data[input_ind] != blank &&
-              !(merge_repeated && input_data[input_ind] == prev_token)) {
-            output_data[batch_id * input_dims[1] + output_idx] =
-                input_data[input_ind];
-            ++output_idx;
-          }
-          prev_token = input_data[input_ind];
-        }
-        output_length_data[batch_id] = output_idx;
-        for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++)
-          output_data[batch_id * input_dims[1] + j] = padding_value;
-      }
-    } else {
-      const size_t level = 0;
-      auto input_lod = framework::ToAbsOffset(input->lod());
-
-      // check input dims and lod
-      PADDLE_ENFORCE_EQ(
-          input_dims[0], static_cast<int64_t>(input_lod[level].back()),
-          "The first dimension of Input(Input) should be equal to "
-          "the sum of all sequences' lengths.");
-
-      const size_t num_sequences = input_lod[level].size() - 1;
-
-      // merge repeated tokens and delete blank
-      size_t output_idx = 0;
-      std::vector<size_t> output_lod0(1, 0);
-      for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
-        T prev_token = -1;
-        for (size_t i = input_lod[level][seq_idx];
-             i < input_lod[level][seq_idx + 1]; ++i) {
-          if ((unsigned)input_data[i] != blank &&
-              !(merge_repeated && input_data[i] == prev_token)) {
-            output_data[output_idx] = input_data[i];
-            ++output_idx;
-          }
-          prev_token = input_data[i];
-        }
-        output_lod0.push_back(output_idx);
-      }
-
-      // set output lod
-      framework::LoD output_lod;
-      output_lod.push_back(output_lod0);
-      output->set_lod(output_lod);
-      // resize output dims
-      output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
-      // for empty sequence
-      if (output_lod0.back() == 0) {
-        output->Resize({1, 1});
-        output_data = output->mutable_data<T>(ctx.GetPlace());
-        output_data[0] = -1;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
deleted file mode 100644
index 73e04da3b0db275ed4d49878e8c0a8879b3106dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class CudnnLSTMOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input(Weight) of LSTM should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("InitH"),
-                   "Input(init_h) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("InitC"),
-                   "Input(init_c) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Cache"),
-                   "Input(Cache) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("last_h"),
-                   "Output(last_h) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("last_c"),
-                   "Output(last_c) of LSTM should not be null.");
-
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3.");
-
-    auto out_dims = in_dims;
-    auto hidden_size = ctx->Attrs().Get<int>("hidden_size");
-    out_dims[2] = hidden_size;
-
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH"));
-    ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC"));
-  }
-};
-
-class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "Input",
-        "(Tensor) RNN input tensor, which support variable-time length input "
-        "sequence."
-        "The shape of the Tensor MUST be ( seq_len * batch_size * input_size)"
-        "seq_len is the total time step in this mini-batch (CAN be change in "
-        "different batch)"
-        "batch_size is the instance number of this batch"
-        "input_size is the hidden size of the input."
-        "input_hidden_size and the hidden_size in the next may not be same");
-    AddInput("InitH",
-             "(Tensor) the initial hidden state of the LSTM"
-             "input. This is a tensor with shape (num_layers x batch_size x "
-             "hidden_size)"
-             "and When is_bidirec is True, the shape will be (num_layers*2 x "
-             "batch_size x hidden_size)");
-    AddInput("InitC",
-             "(Tensor) the initial cell state of the LSTm "
-             "input. This is a tensor with shape (num_layers x batch_size x "
-             "hidden_size)"
-             "and When is_bidirec is True, the shape will be (num_layers*2 x "
-             "batch_size x hidden_size)");
-    AddInput("W",
-             "(Tensor) the learnable hidden-hidden weights."
-             " The shape is (N), where N is total weight size of the LSTM. "
-             " cudnn concatenate all the weight to one Tensor");
-    AddInput("Cache",
-             "The cache of dropout op, a RAW type variable including random "
-             "number generator states and some descriptors, which is used in "
-             "cudnn kernel.")
-        .AsDispensable();
-    AddOutput("Out",
-              "(Tensor) the hidden state of LSTM operator. "
-              "The shape is ( seq_len x batch_size x hidden_size) if "
-              "is_bidirec is False"
-              "and When is_bidirec is True, the shape will be ( seq_len x "
-              "batch_size x hidden_size * 2) ");
-    AddOutput("last_h",
-              "(Tensor) the hidden state of the last step. "
-              "The shape is ( num_layers x batch_size x hidden_size) if "
-              "is_bidirec is False"
-              "and When is_bidirec is True, the shape will be (num_layers*2 x "
-              "batch_size x hidden_size)");
-    AddOutput("last_c",
-              "(Tensor) the cell state of the last step"
-              "The shape is ( num_layers x batch_size x hidden_size) if "
-              "is_bidirec is False"
-              "and When is_bidirect is True, the shape will be (num_layers*2 x "
-              "batch_size x hidden_size*2)");
-    AddAttr<int>("max_len",
-                 "max length of the LSTM op"
-                 "the first dim of the Input can NOT be greater than max_len")
-        .SetDefault(20);
-    AddAttr<float>(
-        "dropout_prob",
-        "dropout prob of the dropout op"
-        "the dropout ONLY work between lstm layers, not between time steps"
-        "There is no dropout work on the Out tensor")
-        .SetDefault(0.0);
-    AddAttr<bool>("is_bidirec",
-                  "is_bidirec"
-                  "if it is bidirection rnn"
-                  "The will affect the shape of the Out, last_h, and last_c")
-        .SetDefault(false);
-    AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
-    AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
-    AddAttr<int>("num_layers", "the total layer number of the LSTM")
-        .SetDefault(1);
-    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
-    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1);
-    AddComment(R"DOC(
-CUDNN LSTM implementation
-
-A four-gate Long Short-Term Memory network with no peephole connections.
-In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
-the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
-
-$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
-
-$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
-
-$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
-
-$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
-
-$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
-
-$$ h_t = o_t \\odot tanh(c_t) $$
-
-- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
-  of weights from the input gate to the input)
-- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
-- sigmoid is the logistic sigmoid function.
-- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-  and cell activation vectors, respectively, all of which have the same size as
-  the cell output activation vector $h$.
-- The $\odot$ is the element-wise product of the vectors.
-- `tanh` is the activation functions.
-- $\tilde{c_t}$ is also called candidate hidden state,
-  which is computed based on the current input and the previous hidden state.
-
-Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, 
-X represensts a matrix multiplication
-
-
-)DOC");
-  }
-};
-
-class CudnnLSTMGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Cache"),
-                   "Input(last_c) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("InitH"),
-                   "Input(init_h) of LSTM should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("InitC"),
-                   "Input(init_c) of LSTM should not be null.");
-
-    auto SetOutGradDim = [&ctx](const std::string& name) {
-      auto g_name = framework::GradVarName(name);
-      if (ctx->HasOutput(g_name)) {
-        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
-      }
-    };
-
-    SetOutGradDim("Input");
-    SetOutGradDim("W");
-    SetOutGradDim("InitH");
-    SetOutGradDim("InitC");
-  }
-};
-
-class CudnnLSTMGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("cudnn_lstm_grad");
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("InitH", Input("InitH"));
-    op->SetInput("InitC", Input("InitC"));
-    op->SetInput("W", Input("W"));
-    if (ForwardOp().Inputs().count("Cache") > 0) {
-      op->SetInput("Cache", Input("Cache"));
-    }
-    op->SetInput("Out", Output("Out"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput(framework::GradVarName("last_c"), OutputGrad("last_c"));
-    op->SetInput(framework::GradVarName("last_h"), OutputGrad("last_h"));
-
-    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    op->SetOutput(framework::GradVarName("W"), InputGrad("W"));
-    op->SetOutput(framework::GradVarName("InitH"), InputGrad("InitH"));
-    op->SetOutput(framework::GradVarName("InitC"), InputGrad("InitC"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-template <typename T>
-class NotImpleKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(
-        "CPU is not support for this kernel now. Will be add in the future");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
-                  ops::CudnnLSTMGradOpDescMaker);
-REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);
-
-REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>);
-REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel<float>);
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
deleted file mode 100644
index 1bf41ed948b5bd4fbd49587f072f5debfa81d77c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ /dev/null
@@ -1,261 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/cudnn_rnn_cache.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-template <typename T>
-class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const Tensor *x = ctx.Input<Tensor>("Input");
-    const Tensor *init_h = ctx.Input<Tensor>("InitH");
-    const Tensor *init_c = ctx.Input<Tensor>("InitC");
-
-    auto w = ctx.Input<Tensor>("W");
-
-    Tensor *out = ctx.Output<Tensor>("Out");
-    Tensor *last_h = ctx.Output<Tensor>("last_h");
-    Tensor *last_c = ctx.Output<Tensor>("last_c");
-
-    const T *x_data = x->data<T>();
-    const T *init_h_data = init_h->data<T>();
-    const T *init_c_data = init_c->data<T>();
-
-    const T *w_data = w->data<T>();
-
-    T *out_data = out->mutable_data<T>(ctx.GetPlace());
-    T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace());
-    T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace());
-
-    size_t max_len = ctx.Attr<int>("max_len");
-    float dropout_prob = ctx.Attr<float>("dropout_prob");
-    bool is_bidirec = ctx.Attr<bool>("is_bidirec");
-    int input_size = ctx.Attr<int>("input_size");
-    int hidden_size = ctx.Attr<int>("hidden_size");
-    int num_layers = ctx.Attr<int>("num_layers");
-    bool is_test = ctx.Attr<bool>("is_test");
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    auto *cache_var = ctx.InputVar("Cache");
-    if (!cache_var) {
-      // The RAW type cache variable wouldn't be created and broadcasted on
-      // multi-devices before the first running.
-      // use parent scope to make cache persistable
-      auto *scope = const_cast<framework::Scope *>(ctx.scope().parent());
-      auto cache_var_name = ctx.Inputs("Cache")[0];
-      cache_var = scope->Var(cache_var_name);
-    }
-    CudnnRNNCache *cudnn_rnn_cache = nullptr;
-    if (cache_var->IsInitialized()) {
-      // const_cast is usually bad.
-      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
-                            ->GetMutable<CudnnRNNCache>();
-    } else {
-      // const_cast is usually bad.
-      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
-                            ->GetMutable<CudnnRNNCache>();
-      std::random_device rnd;
-      int seed = ctx.Attr<int>("seed");
-      if (seed == -1) {
-        seed = rnd();
-      }
-
-      auto input_w_numel = w->numel();
-      auto batch_size = x->dims()[1];
-      cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size,
-                            input_size, hidden_size, num_layers, dropout_prob,
-                            is_bidirec, seed, input_w_numel);
-    }
-
-    auto run_seq_len = x->dims()[0];
-
-    if (is_test) {
-      // for inference
-      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference(
-          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
-          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
-          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
-          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
-          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-          cudnn_rnn_cache->workspace_size_));
-    } else {
-      // for train
-      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining(
-          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
-          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
-          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
-          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
-          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-          cudnn_rnn_cache->workspace_size_,
-          cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
-          cudnn_rnn_cache->reserve_size_));
-    }
-  }
-};
-
-template <typename T>
-class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *input = ctx.Input<Tensor>("Input");
-    auto *weight = ctx.Input<Tensor>("W");
-    auto *init_h = ctx.Input<Tensor>("InitH");
-    auto *init_c = ctx.Input<Tensor>("InitC");
-    // auto * last_h = ctx.Input<Tensor>("last_h");
-    // auto * last_c = ctx.Input<Tensor>("last_c");
-    auto *out = ctx.Input<Tensor>("Out");
-    auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("last_h"));
-    auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("last_c"));
-
-    // auto* init_h = ctx.Input<Tensor>("init_h");
-    // auto* init_c = ctx.Input<Tensor>("init_c");
-
-    auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
-    auto *init_h_grad = ctx.Output<Tensor>(framework::GradVarName("InitH"));
-    auto *init_c_grad = ctx.Output<Tensor>(framework::GradVarName("InitC"));
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    auto *cache_var = ctx.InputVar("Cache");
-    PADDLE_ENFORCE(cache_var->IsInitialized());
-    CudnnRNNCache *cudnn_rnn_cache =
-        const_cast<framework::Variable *>(cache_var)
-            ->GetMutable<CudnnRNNCache>();
-
-    auto input_dims = input->dims();
-    auto init_h_dims = init_h->dims();
-    auto init_c_dims = init_c->dims();
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    weight_grad->mutable_data<T>(ctx.GetPlace());
-    math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
-    zero(dev_ctx, in_grad, static_cast<T>(0.0));
-    zero(dev_ctx, weight_grad, static_cast<T>(0.0));
-
-    T *init_h_grad_data = NULL;
-    if (init_h_grad == nullptr) {
-      Tensor init_h_grad_temp;
-      init_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
-      zero(dev_ctx, &init_h_grad_temp, static_cast<T>(0.0));
-
-      init_h_grad_data = init_h_grad_temp.data<T>();
-    } else {
-      init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
-      zero(dev_ctx, init_h_grad, static_cast<T>(0.0));
-      init_h_grad_data = init_h_grad->data<T>();
-    }
-
-    T *init_c_grad_data = NULL;
-    if (init_c_grad == nullptr) {
-      Tensor init_c_grad_temp;
-      init_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
-      zero(dev_ctx, &init_c_grad_temp, static_cast<T>(0.0));
-
-      init_c_grad_data = init_c_grad_temp.data<T>();
-    } else {
-      init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
-      zero(dev_ctx, init_c_grad, static_cast<T>(0.0));
-      init_c_grad_data = init_c_grad->data<T>();
-    }
-
-    const T *last_h_grad_data = NULL;
-    if (last_h_grad == nullptr) {
-      Tensor last_h_grad_temp;
-      last_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
-      zero(dev_ctx, &last_h_grad_temp, static_cast<T>(0.0));
-
-      last_h_grad_data = (const T *)last_h_grad_temp.data<T>();
-    } else {
-      last_h_grad_data = last_h_grad->data<T>();
-    }
-
-    const T *last_c_grad_data = NULL;
-    if (last_c_grad == nullptr) {
-      Tensor last_c_grad_temp;
-      last_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
-      zero(dev_ctx, &last_c_grad_temp, static_cast<T>(0.0));
-
-      last_c_grad_data = (const T *)last_c_grad_temp.data<T>();
-    } else {
-      last_c_grad_data = last_c_grad->data<T>();
-    }
-
-    const T *out_grad_data = NULL;
-    if (out_grad == nullptr) {
-      Tensor out_grad_temp;
-      out_grad_temp.mutable_data<T>(out->dims(), ctx.GetPlace());
-      zero(dev_ctx, &out_grad_temp, static_cast<T>(0.0));
-
-      out_grad_data = (const T *)out_grad_temp.data<T>();
-    } else {
-      out_grad_data = out_grad->data<T>();
-    }
-
-    // zero( dev_ctx, last_h_grad, static_cast<T>(0.0));
-    // zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
-
-    auto out_data = out->data<T>();
-    // auto out_grad_data = out_grad->data<T>();
-    auto weight_data = weight->data<T>();
-    auto init_h_data = init_h->data<T>();
-    auto init_c_data = init_c->data<T>();
-    auto in_grad_data = in_grad->data<T>();
-
-    auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>();
-    auto reserve_data = cudnn_rnn_cache->reserve_data_.data<uint8_t>();
-
-    auto run_seq_len = input_dims[0];
-    PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
-                      "cudnn running seq_len CAN not greater max_lengh");
-    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData(
-        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-        cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_,
-        out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data,
-        cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_,
-        weight_data, cudnn_rnn_cache->hx_desc_, init_h_data,
-        cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_,
-        in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data,
-        cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data,
-        cudnn_rnn_cache->workspace_size_, reserve_data,
-        cudnn_rnn_cache->reserve_size_));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights(
-        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-        cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_,
-        init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
-        cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-        cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_,
-        weight_grad->data<T>(), cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
-        cudnn_rnn_cache->reserve_size_));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>);
-REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>);
diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h
deleted file mode 100644
index 7f18b839271a29523cc06c999c28cc0394717397..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cudnn_rnn_cache.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-struct CudnnRNNCache {
-  CudnnRNNCache() {
-    x_desc_ = NULL;
-    y_desc_ = NULL;
-    dx_desc_ = NULL;
-    dy_desc_ = NULL;
-  }
-  ~CudnnRNNCache() { release(); }
-
-  cudnnRNNDescriptor_t rnn_desc_;
-  cudnnTensorDescriptor_t *x_desc_;
-  cudnnTensorDescriptor_t *y_desc_;
-  cudnnTensorDescriptor_t *dx_desc_;
-  cudnnTensorDescriptor_t *dy_desc_;
-
-  cudnnTensorDescriptor_t hx_desc_;
-  cudnnTensorDescriptor_t cx_desc_;
-  cudnnTensorDescriptor_t hy_desc_;
-  cudnnTensorDescriptor_t cy_desc_;
-
-  cudnnTensorDescriptor_t dhx_desc_;
-  cudnnTensorDescriptor_t dcx_desc_;
-  cudnnTensorDescriptor_t dhy_desc_;
-  cudnnTensorDescriptor_t dcy_desc_;
-
-  cudnnTensorDescriptor_t output_x_desc_;
-  cudnnTensorDescriptor_t output_y_desc_;
-
-  cudnnDropoutDescriptor_t dropout_desc_;
-
-  size_t weights_size_;
-  cudnnFilterDescriptor_t w_desc_;
-  cudnnFilterDescriptor_t dw_desc_;
-
-  size_t workspace_size_;
-  size_t reserve_size_;
-  framework::Tensor reserve_data_;
-  framework::Tensor workspace_data_;
-
-  framework::Tensor dropout_state_;
-
-  size_t max_length_;
-
-  float dropout_prob_;
-  bool is_bidirec_;
-
-  int batch_size_;
-  int input_size_;
-  int hidden_size_;
-  int num_layers_;
-  int seed_;
-
-  void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len,
-            int batch_size, int input_size, int hidden_size, int num_layers,
-            float dropout_prob, bool is_bidirec, int seed, int weight_numel) {
-    max_length_ = max_len;
-    batch_size_ = batch_size;
-    input_size_ = input_size;
-    hidden_size_ = hidden_size;
-    num_layers_ = num_layers;
-    dropout_prob_ = dropout_prob;
-    is_bidirec_ = is_bidirec;
-    seed_ = seed;
-
-    x_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    y_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    dx_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    dy_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    int dim_a[3];
-    int stride_a[3];
-
-    for (size_t i = 0; i < max_length_; ++i) {
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
-      dim_a[0] = batch_size_;
-      dim_a[1] = input_size_;
-      dim_a[2] = 1;
-
-      stride_a[0] = dim_a[2] * dim_a[1];
-      stride_a[1] = dim_a[2];
-      stride_a[2] = 1;
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-
-      dim_a[0] = batch_size_;
-      dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_;
-      dim_a[2] = 1;
-
-      stride_a[0] = dim_a[2] * dim_a[1];
-      stride_a[1] = dim_a[2];
-      stride_a[2] = 1;
-
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    }
-
-    dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1);
-    dim_a[1] = batch_size_;
-    dim_a[2] = hidden_size_;
-
-    stride_a[0] = dim_a[2] * dim_a[1];
-    stride_a[1] = dim_a[2];
-    stride_a[2] = 1;
-
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
-
-    size_t state_size;
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size);
-        dropout_state_.Resize({static_cast<int64_t>(state_size)}));
-    auto *dropout_state_data = dropout_state_.mutable_data<uint8_t>(place);
-    CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor(
-        dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
-        seed_));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
-
-#if CUDNN_VERSION >= 6000
-    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
-        handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
-        CUDNN_LINEAR_INPUT,
-        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
-#else
-    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
-        rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
-        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        CUDNN_DATA_FLOAT));
-#endif
-
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize(
-        handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
-
-    PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
-                      "cudnn lstm weight size should be SAME");
-    int dim_w[3];
-    dim_w[0] = weights_size_ / sizeof(float);
-    dim_w[1] = 1;
-    dim_w[2] = 1;
-    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
-        w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
-        dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize(
-        handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize(
-        handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
-
-    reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
-    reserve_data_.mutable_data<uint8_t>(place);
-
-    workspace_data_.Resize({static_cast<int64_t>(workspace_size_)});
-    workspace_data_.mutable_data<uint8_t>(place);
-  }
-
-  void release() {
-    for (size_t i = 0; i < max_length_; ++i) {
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
-    }
-
-    delete[] x_desc_;
-    delete[] y_desc_;
-    delete[] dx_desc_;
-    delete[] dy_desc_;
-
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
-
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h
deleted file mode 100644
index 7c0fda4169b5e1cf663d04b78b6425d73965c292..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cum_op.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <array>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename Functor>
-class CumKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
-                          "Cannot get input tensor X, variable name = %s",
-                          context.op().Input("X"));
-
-    auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
-                            "Cannot get output tensor Out, variable name = %s",
-                            context.op().Output("Out"));
-    int axis = context.Attr<int>("axis");
-    bool exclusive = context.Attr<bool>("exclusive");
-    bool reverse = context.Attr<bool>("reverse");
-    auto x_dims = X.dims();
-    if (axis == -1) {
-      axis = x_dims.size() - 1;
-    }
-    PADDLE_ENFORCE_LT(
-        axis, x_dims.size(),
-        "axis should be less than the dimensiotn of the input tensor");
-    Out.mutable_data<T>(context.GetPlace());
-
-    int pre = 1;
-    int post = 1;
-    int mid = x_dims[axis];
-    for (int i = 0; i < axis; ++i) {
-      pre *= x_dims[i];
-    }
-    for (int i = axis + 1; i < x_dims.size(); ++i) {
-      post *= x_dims[i];
-    }
-
-    auto x = framework::EigenVector<T>::Flatten(X);
-    auto out = framework::EigenVector<T>::Flatten(Out);
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-
-    using IndexT = Eigen::DenseIndex;
-    if (pre == 1) {
-      if (post == 1) {
-        ComputeImp(*place, Eigen::DSizes<IndexT, 1>(mid), x, out,
-                   /* axis= */ 0, reverse, exclusive);
-      } else {
-        ComputeImp(*place, Eigen::DSizes<IndexT, 2>(mid, post), x, out,
-                   /* axis= */ 0, reverse, exclusive);
-      }
-    } else {
-      if (post == 1) {
-        ComputeImp(*place, Eigen::DSizes<IndexT, 2>(pre, mid), x, out,
-                   /* axis= */ 1, reverse, exclusive);
-      } else {
-        ComputeImp(*place, Eigen::DSizes<IndexT, 3>(pre, mid, post), x, out,
-                   /* axis= */ 1, reverse, exclusive);
-      }
-    }
-  }
-
- private:
-  template <typename Device, typename Dim, typename X, typename Out>
-  void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis,
-                  bool reverse, bool exclusive) const {
-    if (!reverse) {
-      out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive);
-    } else {
-      std::array<bool, Dim::count> rev;
-      rev.fill(false);
-      rev[axis] = reverse;
-      out.reshape(dims).device(d) =
-          Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev);
-    }
-  }
-};
-
-template <typename T>
-struct CumsumFunctor {
-  using ELEMENT_TYPE = T;
-  template <typename X>
-  const typename X::TensorScanSumOp operator()(X x, int axis,
-                                               bool exclusive) const {
-    return x.cumsum(axis, exclusive);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
deleted file mode 100644
index 5302b822d6b9f232e9ccd0d03cc549d7d5044ebf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cumsum_op.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cum_op.h"
-
-namespace paddle {
-namespace operators {
-
-class CumOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of cumsum operator");
-    AddOutput("Out", "Output of cumsum operator");
-    AddAttr<int>("axis",
-                 "The dimenstion to accumulate along. -1 means the last "
-                 "dimenstion [default -1].")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
-    AddAttr<bool>("exclusive",
-                  "Whether to perform exclusive cumsum. [default false].")
-        .SetDefault(false);
-    AddAttr<bool>("reverse",
-                  "If true, the cumsum is performed in the reversed direction. "
-                  "[default false].")
-        .SetDefault(false);
-    AddComment(R"DOC(
-The cumulative sum of the elements along a given axis.
-By default, the first element of the result is the same of the first element of
-the input. If exlusive is true, the first element of the result is 0.
-)DOC");
-  }
-};
-
-class CumsumGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("cumsum");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttr("axis", Attr<int>("axis"));
-    grad_op->SetAttr("reverse", !Attr<bool>("reverse"));
-    grad_op->SetAttr("exclusive", Attr<bool>("exclusive"));
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-
-REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker);
-REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
-                       ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
-                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>);
diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu
deleted file mode 100644
index eb5fd99ccb844b1f1717b818e7807a384d6515eb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cumsum_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cum_op.h"
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-
-REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel<CUDA, ops::CumsumFunctor<float>>,
-                        ops::CumKernel<CUDA, ops::CumsumFunctor<double>>,
-                        ops::CumKernel<CUDA, ops::CumsumFunctor<int>>);
diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc
deleted file mode 100644
index 53ed86ade48ce52d49285495388f93f1bc4f5d9e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cvm_op.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cvm_op.h"
-#include <memory>
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class CVMOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto cvm_dims = ctx->GetInputDim("CVM");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(cvm_dims.size(), 2UL, "Input(CVM)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL,
-                      "The 2nd dimension of "
-                      "Input(CVM) should be 2.");
-
-    if (ctx->Attrs().Get<bool>("use_cvm")) {
-      ctx->SetOutputDim("Y", {x_dims[0], x_dims[1]});
-    } else {
-      ctx->SetOutputDim("Y", {x_dims[0], x_dims[1] - 2});
-    }
-    ctx->ShareLoD("X", /*->*/ "Y");
-  }
-
- protected:
-  // Explicitly set that the data type of computation kernel of
-  // cvm
-  // is determined by its input "X".
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class CVMGradientOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@GRAD) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@GRAD) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto cvm_dims = ctx->GetInputDim("CVM");
-    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(cvm_dims.size(), 2, "Input(CVM)'s rank should be 2.");
-
-    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
-                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
-                      "be equal.");
-
-    PADDLE_ENFORCE_EQ(cvm_dims[1], 2,
-                      "When Attr(soft_label) == false, the 2nd dimension of "
-                      "Input(CVM) should be 2.");
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->ShareLoD("X", framework::GradVarName("X"));
-  }
-
- protected:
-  // Explicitly set that the data type of computation kernel of
-  // cvm
-  // is determined by its input "X".
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class CVMOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
-             "[N x D],"
-             " where N is the batch size and D is the emebdding dim. ");
-    AddInput("CVM",
-             "(Tensor),  a 2-D Tensor with shape [N x 2], where N is the batch "
-             "size, 2 is show and click.");
-    AddOutput("Y",
-              "(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
-              "[N x K].");
-    AddAttr<bool>("use_cvm", "bool, use cvm or not").SetDefault(true);
-    AddComment(R"DOC(
-CVM Operator.
-
-      We assume that input X is a embedding vector with cvm_feature(show and click), which shape is [N * D] (D is 2(cvm_feature) + embedding dim, N is batch_size)
-      if use_cvm is True, we will log(cvm_feature), and output shape is [N * D].
-      if use_cvm is False, we will remove cvm_feature from input, and output shape is [N * (D - 2)].
-
-)DOC");
-  }
-};
-
-class CVMGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("cvm_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("CVM", Input("CVM"));
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(cvm, ops::CVMOp, ops::CVMOpMaker, ops::CVMGradOpDescMaker);
-
-REGISTER_OPERATOR(cvm_grad, ops::CVMGradientOp);
-
-REGISTER_OP_CPU_KERNEL(cvm, ops::CVMOpKernel<float>, ops::CVMOpKernel<double>);
-
-REGISTER_OP_CPU_KERNEL(cvm_grad, ops::CVMGradOpKernel<float>,
-                       ops::CVMGradOpKernel<double>);
diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h
deleted file mode 100644
index c6140483ff5cb8108895546b6a01f058708231fd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cvm_op.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-void CvmComputeKernel(const bool use_cvm, const int64_t item_width, const T** X,
-                      T** Y) {
-  const auto cvm_offset = use_cvm ? 0 : 2;
-
-  std::memcpy(*Y, *X + cvm_offset, (item_width - cvm_offset) * sizeof(T));
-
-  if (use_cvm) {
-    (*Y)[0] = log((*Y)[0] + 1);
-    (*Y)[1] = log((*Y)[1] + 1) - (*Y)[0];
-  }
-
-  (*X) += item_width;
-  (*Y) += item_width - cvm_offset;
-}
-
-template <typename T>
-void CvmGradComputeKernel(const bool use_cvm, const int64_t item_width,
-                          const T& CVM, const T** DY, T** DX) {
-  const auto cvm_offset = use_cvm ? 0 : 2;
-
-  std::memcpy(*DX + cvm_offset, *DY, (item_width - cvm_offset) * sizeof(T));
-
-  (*DX)[0] = (&CVM)[0];
-  (*DX)[1] = (&CVM)[1];
-
-  (*DX) += item_width;
-  (*DY) += item_width - cvm_offset;
-}
-
-template <typename T>
-class CVMOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto* x = context.Input<LoDTensor>("X");
-    const T* x_data = x->data<T>();
-
-    auto batch_size = x->dims()[0];
-    auto item_size = x->numel() / batch_size;
-    auto use_cvm = context.Attr<bool>("use_cvm");
-
-    auto* y = context.Output<LoDTensor>("Y");
-    T* y_data = y->mutable_data<T>(context.GetPlace());
-
-    // for Input X do not have Lod Information.
-    if (x->NumLevels() == 0) {
-      for (int i = 0; i < batch_size; i++) {
-        CvmComputeKernel(use_cvm, item_size, &x_data, &y_data);
-      }
-    } else {
-      auto lod = x->lod()[0];
-      for (size_t i = 0; i < lod.size() - 1; ++i) {
-        for (size_t j = 0; j < lod[i + 1] - lod[i]; ++j) {
-          CvmComputeKernel(use_cvm, item_size, &x_data, &y_data);
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class CVMGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dx = context.Output<LoDTensor>(framework::GradVarName("X"));
-    T* dx_data = dx->mutable_data<T>(context.GetPlace());
-
-    const Tensor* cvm = context.Input<Tensor>("CVM");
-    const T* cvm_data = cvm->data<T>();
-
-    const auto* dOut =
-        context.Input<framework::LoDTensor>(framework::GradVarName("Y"));
-    const T* dout_data = dOut->data<T>();
-
-    auto use_cvm = context.Attr<bool>("use_cvm");
-
-    auto offset = 2;
-    auto batch_size = dx->dims()[0];
-    auto item_size = dx->numel() / batch_size;
-
-    // for Input X do not have Lod Information.
-    if (dx->NumLevels() == 0) {
-      for (int x = 0; x < batch_size; ++x) {
-        CvmGradComputeKernel(use_cvm, item_size, *cvm_data, &dout_data,
-                             &dx_data);
-        cvm_data += offset;
-      }
-    } else {
-      auto lod = dx->lod()[0];
-      int seq_num = static_cast<int>(lod.size()) - 1;
-      for (int i = 0; i < seq_num; ++i) {
-        for (size_t j = 0; j < lod[i + 1] - lod[i]; ++j) {
-          CvmGradComputeKernel(use_cvm, item_size, *cvm_data, &dout_data,
-                               &dx_data);
-        }
-        cvm_data += offset;
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc
deleted file mode 100644
index 5dc83ac7b3078960b2aa36b3c6c8a77d502f9a05..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/data_norm_op.cc
+++ /dev/null
@@ -1,418 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/data_norm_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/data_layout.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
-class DataNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "");
-    PADDLE_ENFORCE(ctx->HasInput("BatchSize"), "");
-    PADDLE_ENFORCE(ctx->HasInput("BatchSum"), "");
-    PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), "");
-    PADDLE_ENFORCE(ctx->HasOutput("Means"), "");
-    PADDLE_ENFORCE(ctx->HasOutput("Scales"), "");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"), "");
-
-    const auto x_dims = ctx->GetInputDim("X");
-    const DataLayout data_layout = framework::StringToDataLayout(
-        ctx->Attrs().Get<std::string>("data_layout"));
-
-    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "Input X must have 2 to 5 dimensions.");
-
-    const int64_t C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL);
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C);
-    }
-
-    ctx->SetOutputDim("Y", x_dims);
-    ctx->SetOutputDim("Means", {C});
-    ctx->SetOutputDim("Scales", {C});
-    ctx->ShareLoD("X", "Y");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type = ctx.Input<Tensor>("X")->type();
-    // By default, the type of the scale, bias, mean,
-    // and var tensors should both be float. (For float or float16 input tensor)
-    // or double (For double input tensor).
-    auto dn_param_type = framework::proto::VarType::FP32;
-    if (input_data_type == framework::proto::VarType::FP64) {
-      dn_param_type = framework::proto::VarType::FP64;
-    }
-    PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input<Tensor>("BatchSize")->type(),
-                      "BatchSize input should be of float type");
-    PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input<Tensor>("BatchSum")->type(),
-                      "BatchSum input should be of float type");
-    PADDLE_ENFORCE_EQ(dn_param_type,
-                      ctx.Input<Tensor>("BatchSquareSum")->type(),
-                      "BatchSquareSum input should be of float type");
-
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::LibraryType library = framework::LibraryType::kPlain;
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-#ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
-    }
-#endif
-
-    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                   library);
-  }
-};
-
-class DataNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    // AddAttr<bool>("is_test", "").SetDefault(false);
-    AddAttr<float>("epsilon", "")
-        .SetDefault(1e-4)
-        .AddCustomChecker([](const float &epsilon) {
-          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
-                         "'epsilon' should be between 0.0 and 0.001.");
-        });
-    AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddInput("X", "The input tensor");
-    AddInput("BatchSize",
-             "BatchSize is a 1-dimensional tensor of size C "
-             "that is applied to the output");
-    AddInput("BatchSum",
-             "BatchSum is a 1-dimensional tensor of size C "
-             "that is applied to the output");
-    AddInput("BatchSquareSum",
-             "The global BatchSquareSum (for training) or "
-             "estimated BatchSquareSum (for testing)");
-    AddOutput("Y", "result after normalization");
-    AddOutput("Means",
-              "Mean of the history data batch, "
-              "will apply to output when training")
-        .AsIntermediate();
-    AddOutput("Scales",
-              "Scales of the history data batch, "
-              "will apply to output when training")
-        .AsIntermediate();
-    AddComment(R"DOC(
-Data Normalization.
-
-Can be used as a normalizer function for data
-The required data format for this layer is one of the following:
-1. NHWC `[batch, in_height, in_width, in_channels]`
-2. NCHW `[batch, in_channels, in_height, in_width]`
-
-)DOC");
-  }
-};
-
-template <typename T>
-class DataNormKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    // const bool is_test = ctx.Attr<bool>("is_test");
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2");
-    const int N = x_dims[0];
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    auto *y = ctx.Output<Tensor>("Y");
-    auto *mean_out = ctx.Output<Tensor>("Means");
-    auto *scales = ctx.Output<Tensor>("Scales");
-
-    // alloc memory
-    y->mutable_data<T>(ctx.GetPlace());
-
-    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
-    ConstEigenVectorArrayMap<T> b_size_arr(
-        ctx.Input<Tensor>("BatchSize")->data<T>(), C);
-    ConstEigenVectorArrayMap<T> b_sum_arr(
-        ctx.Input<Tensor>("BatchSum")->data<T>(), C);
-    ConstEigenVectorArrayMap<T> b_square_sum_arr(
-        ctx.Input<Tensor>("BatchSquareSum")->data<T>(), C);
-    EigenVectorArrayMap<T> means_arr(mean_out->mutable_data<T>(ctx.GetPlace()),
-                                     C);
-    EigenVectorArrayMap<T> scales_arr(scales->mutable_data<T>(ctx.GetPlace()),
-                                      C);
-    means_arr = b_sum_arr / b_size_arr;
-    scales_arr = (b_size_arr / b_square_sum_arr).sqrt();
-
-    switch (data_layout) {
-      case DataLayout::kNCHW:  // because it's two dimensions, so make no
-                               // difference
-      case DataLayout::kNHWC: {
-        EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C, N) =
-            (ConstEigenArrayMap<T>(x->data<T>(), C, N).colwise() - means_arr)
-                .colwise() *
-            scales_arr;
-        break;
-      }
-      default:
-        PADDLE_THROW("Unknown storage order: %d", data_layout);
-    }
-  }
-};
-
-class DataNormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    // check input
-    PADDLE_ENFORCE(ctx->HasInput("X"));
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
-    PADDLE_ENFORCE(ctx->HasInput("BatchSize"), "");
-    PADDLE_ENFORCE(ctx->HasInput("BatchSum"), "");
-    PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), "");
-    PADDLE_ENFORCE(ctx->HasInput("Means"), "");
-    PADDLE_ENFORCE(ctx->HasInput("Scales"), "");
-
-    // check output
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSize")), "");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSum")), "");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSquareSum")),
-                   "");
-
-    const auto x_dims = ctx->GetInputDim("X");
-    const DataLayout data_layout = framework::StringToDataLayout(
-        ctx->Attrs().Get<std::string>("data_layout"));
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    }
-    ctx->SetOutputDim(framework::GradVarName("BatchSize"), {C});
-    ctx->SetOutputDim(framework::GradVarName("BatchSum"), {C});
-    ctx->SetOutputDim(framework::GradVarName("BatchSquareSum"), {C});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
-    if (var == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
-    const Tensor *t = nullptr;
-    if (var->IsType<Tensor>()) {
-      t = &var->Get<Tensor>();
-    } else if (var->IsType<LoDTensor>()) {
-      t = &var->Get<LoDTensor>();
-    }
-    if (t == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
-
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::LibraryType library = framework::LibraryType::kPlain;
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
-    }
-#endif
-
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace(), layout, library);
-  }
-};
-
-template <typename T>
-class DataNormGradKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto *batch_size = ctx.Input<Tensor>("BatchSize");
-    const auto *batch_sum = ctx.Input<Tensor>("BatchSum");
-    const auto *batch_square_sum = ctx.Input<Tensor>("BatchSquareSum");
-    const auto *scales = ctx.Input<Tensor>("Scales");
-    const auto *means = ctx.Input<Tensor>("Means");
-
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-
-    // Get the size for each dimension.
-    // NCHW [batch_size, in_channels, in_height, in_width]
-    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2");
-    const int N = x_dims[0];
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-
-    // init output
-    Tensor *d_x = nullptr;
-    if (ctx.HasOutput(framework::GradVarName("X"))) {
-      d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    }
-    auto *d_batch_size =
-        ctx.Output<Tensor>(framework::GradVarName("BatchSize"));
-    auto *d_batch_sum = ctx.Output<Tensor>(framework::GradVarName("BatchSum"));
-    auto *d_batch_square_sum =
-        ctx.Output<Tensor>(framework::GradVarName("BatchSquareSum"));
-
-    EigenVectorArrayMap<T> d_batch_size_arr(
-        d_batch_size->mutable_data<T>(ctx.GetPlace()), C);
-    EigenVectorArrayMap<T> d_batch_sum_arr(
-        d_batch_sum->mutable_data<T>(ctx.GetPlace()), C);
-    EigenVectorArrayMap<T> d_batch_square_sum_arr(
-        d_batch_square_sum->mutable_data<T>(ctx.GetPlace()), C);
-
-    d_batch_size_arr.setZero();
-    d_batch_sum_arr.setZero();
-    d_batch_square_sum_arr.setZero();
-
-    const float epsilon = ctx.Attr<float>("epsilon");
-    switch (
-        data_layout) {  // because it's two dimensions, so make no difference
-      case DataLayout::kNCHW:
-      case DataLayout::kNHWC: {
-        ConstEigenVectorArrayMap<T> scales_arr(scales->data<T>(), C);
-        ConstEigenVectorArrayMap<T> means_arr(means->data<T>(), C);
-        ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N);
-        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N);
-        if (d_x != nullptr) {
-          EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C, N);
-          d_x_arr.setZero();
-          for (int nc = 0; nc < N; ++nc) {
-            d_x_arr.col(nc) = d_y_arr.col(nc) * scales_arr;
-          }
-        }
-
-        // calculate data sum and squre sum
-        ConstEigenVectorArrayMap<T> batch_size_arr(batch_size->data<T>(), C);
-        ConstEigenVectorArrayMap<T> batch_sum_arr(batch_sum->data<T>(), C);
-        ConstEigenVectorArrayMap<T> batch_square_sum_arr(
-            batch_square_sum->data<T>(), C);
-        Eigen::Array<T, Eigen::Dynamic, 1> sample_sum(C);
-        Eigen::Array<T, Eigen::Dynamic, 1> sample_square_sum(C);
-        // calculate data sample sum and square sum
-        sample_sum.setZero();
-        sample_square_sum.setZero();
-        for (int nc = 0; nc < N; ++nc) {
-          sample_sum += x_arr.col(nc);
-          sample_square_sum += (x_arr.col(nc) - means_arr).square();
-        }
-        // calculate gradient
-        d_batch_size_arr.setConstant(N);
-        d_batch_sum_arr = sample_sum;
-        d_batch_square_sum_arr = sample_square_sum + d_batch_size_arr * epsilon;
-        break;
-      }
-      default:
-        PADDLE_THROW("Unknown storage order: %s", data_layout_str);
-    }
-  }
-};
-
-class DataNormGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *op = new framework::OpDesc();
-    op->SetType("data_norm_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-
-    op->SetInput("BatchSize", Input("BatchSize"));
-    op->SetInput("BatchSum", Input("BatchSum"));
-    op->SetInput("BatchSquareSum", Input("BatchSquareSum"));
-    op->SetInput("Scales", Output("Scales"));
-    op->SetInput("Means", Output("Means"));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("BatchSize"), InputGrad("BatchSize"));
-    op->SetOutput(framework::GradVarName("BatchSum"), InputGrad("BatchSum"));
-    op->SetOutput(framework::GradVarName("BatchSquareSum"),
-                  InputGrad("BatchSquareSum"));
-
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(data_norm, ops::DataNormOp, ops::DataNormOpMaker,
-                  ops::DataNormGradMaker);
-REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    data_norm, ops::DataNormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DataNormKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    data_norm_grad,
-    ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/data_norm_op.h b/paddle/fluid/operators/data_norm_op.h
deleted file mode 100644
index 63451214bcf649d0a7a949f391db9b651d237d22..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/data_norm_op.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class DataNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
-};
-
-template <typename DeviceContext, typename T>
-class DataNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/deformable_conv_filter.cu.h b/paddle/fluid/operators/deformable_conv_filter.cu.h
deleted file mode 100644
index f466d1803f819ca8ee5c96e693d1ade7801d8f99..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_conv_filter.cu.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Part of the following code in this file refs to
-// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
-//
-// Copyright (c) 2017 Microsoft
-// Licensed under The Apache-2.0 License [see LICENSE for details]
-// \file deformable_psroi_pooling.cu
-// \brief
-// \author Yi Li, Guodong Zhang, Jifeng Dai
-
-#pragma once
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-template <typename T>
-__global__ void FilterGradAddupCUDAKernel(const int nthreads, const int n,
-                                          const int height, const int width,
-                                          const T* dweight_3d, T* filter_grad) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    filter_grad[i] = filter_grad[i] + dweight_3d[i];
-  }
-}
diff --git a/paddle/fluid/operators/deformable_conv_func.h b/paddle/fluid/operators/deformable_conv_func.h
deleted file mode 100644
index ba1c5044302232c45f4d53236290712c33c3a352..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_conv_func.h
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Part of the following code in this file refs to
-// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
-//
-// Copyright (c) 2017 Microsoft
-// Licensed under The Apache-2.0 License [see LICENSE for details]
-// \file deformable_psroi_pooling.cu
-// \brief
-// \author Yi Li, Guodong Zhang, Jifeng Dai
-
-#pragma once
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-template <typename T>
-HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h,
-                                   const int w, const int height,
-                                   const int width) {
-  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
-      argmax_w >= width) {
-    return 0;
-  }
-
-  int argmax_h_low = floor(argmax_h);
-  int argmax_w_low = floor(argmax_w);
-  int argmax_h_high = argmax_h_low + 1;
-  int argmax_w_high = argmax_w_low + 1;
-
-  T weight = 0;
-
-  weight = (h == argmax_h_low && w == argmax_w_low)
-               ? (h + 1 - argmax_h) * (w + 1 - argmax_w)
-               : weight;
-  weight = (h == argmax_h_low && w == argmax_w_high)
-               ? (h + 1 - argmax_h) * (argmax_w + 1 - w)
-               : weight;
-  weight = (h == argmax_h_high && w == argmax_w_low)
-               ? (argmax_h + 1 - h) * (w + 1 - argmax_w)
-               : weight;
-  weight = (h == argmax_h_high && w == argmax_w_high)
-               ? (argmax_h + 1 - h) * (argmax_w + 1 - w)
-               : weight;
-
-  return weight;
-}
-
-template <typename T>
-HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height,
-                                     const int width, const T* im_data,
-                                     const int data_width, const int bp_dir) {
-  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
-      argmax_w >= width) {
-    return 0;
-  }
-
-  int argmax_h_low = floor(argmax_h);
-  int argmax_w_low = floor(argmax_w);
-  int argmax_h_high = argmax_h_low + 1;
-  int argmax_w_high = argmax_w_low + 1;
-
-  T weight = 0;
-
-  if (bp_dir == 0) {
-    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
-                  ? -1 * (argmax_w_low + 1 - argmax_w) *
-                        im_data[argmax_h_low * data_width + argmax_w_low]
-                  : 0;
-
-    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
-                  ? -1 * (argmax_w - argmax_w_low) *
-                        im_data[argmax_h_low * data_width + argmax_w_high]
-                  : 0;
-
-    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
-                  ? (argmax_w_low + 1 - argmax_w) *
-                        im_data[argmax_h_high * data_width + argmax_w_low]
-                  : 0;
-    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
-                  ? (argmax_w - argmax_w_low) *
-                        im_data[argmax_h_high * data_width + argmax_w_high]
-                  : 0;
-  } else if (bp_dir == 1) {
-    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
-                  ? -1 * (argmax_h_low + 1 - argmax_h) *
-                        im_data[argmax_h_low * data_width + argmax_w_low]
-                  : 0;
-    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
-                  ? (argmax_h_low + 1 - argmax_h) *
-                        im_data[argmax_h_low * data_width + argmax_w_high]
-                  : 0;
-    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
-                  ? -1 * (argmax_h - argmax_h_low) *
-                        im_data[argmax_h_high * data_width + argmax_w_low]
-                  : 0;
-    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
-                  ? (argmax_h - argmax_h_low) *
-                        im_data[argmax_h_high * data_width + argmax_w_high]
-                  : 0;
-  }
-
-  return weight;
-}
-
-template <typename T>
-HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, const int data_width,
-                                const int height, const int width, T h, T w) {
-  int h_low = floor(h);
-  int w_low = floor(w);
-  int h_high = h_low + 1;
-  int w_high = w_low + 1;
-
-  T lh = h - h_low;
-  T lw = w - w_low;
-  T hh = 1 - lh;
-  T hw = 1 - lw;
-
-  T v1 =
-      (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0;
-  T v2 = (h_low >= 0 && w_high <= width - 1)
-             ? bottom_data[h_low * data_width + w_high]
-             : 0;
-  T v3 = (h_high <= height - 1 && w_low >= 0)
-             ? bottom_data[h_high * data_width + w_low]
-             : 0;
-  T v4 = (h_high <= height - 1 && w_high <= width - 1)
-             ? bottom_data[h_high * data_width + w_high]
-             : 0;
-
-  T w1 = hh * hw;
-  T w2 = hh * lw;
-  T w3 = lh * hw;
-  T w4 = lh * lw;
-
-  return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
-}
diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc
deleted file mode 100644
index 01cbec5633ab1610bec5633466ba54a223afd75c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_conv_op.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/deformable_conv_op.h"
-#include <memory>
-#include "paddle/fluid/operators/conv_op.h"
-
-namespace paddle {
-namespace operators {
-class DeformableConvOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(Tensor) The input of deformable conv op. "
-             "The shape of input is "
-             "[N, channel_in, H, W]");
-    AddInput("Offset",
-             "(Tensor) The input offset. "
-             "The shape of the offset is "
-             "[N, deformable_groups * kernel_w * kernel_h * 2, H, W");
-    AddInput("Mask",
-             "(Tensor) The input mask. "
-             "The shape of the mask is "
-             "[N, deformable_groups * kernel_w * kernel_h, H, W].");
-    AddInput("Filter",
-             "(Tensor) The Input Filter "
-             "The shape of the wight is "
-             "[num_filters, channel_in, kernel_h, kernel_w.");
-    AddOutput("Output",
-              "(Tensor) The output. "
-              "The shape of the output tensor is "
-              "[N, num_filters, out_height, out_width]].");
-    AddAttr<std::vector<int>>("strides",
-                              "(vector<int> default:{1, 1}), the "
-                              "strides(h_stride, w_stride) of "
-                              "convolution operator.")
-        .SetDefault({1, 1});
-    AddAttr<std::vector<int>>("paddings",
-                              "(vector<int> default:{0,0}), the "
-                              "paddings(h_pad, w_pad) of "
-                              "convolution operator. ")
-        .SetDefault({0, 0});
-    AddAttr<std::vector<int>>("dilations",
-                              "(vector<int> default:{1, 1}), the "
-                              "dilations(h_dilation, w_dilation) of "
-                              "convolution operator.")
-        .SetDefault({1, 1});
-    AddAttr<int>(
-        "groups",
-        "(int default:1), the groups number of the convolution operator. "
-        "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
-        "when group=2, the first half of the filters is only connected to the "
-        "first half of the input channels, while the second half of the "
-        "filters "
-        "is only connected to the second half of the input channels.")
-        .SetDefault(1);
-    AddAttr<int>("deformable_groups",
-                 "(int default:1), the number of the deformable groups.")
-        .SetDefault(1);
-    AddAttr<int>("im2col_step",
-                 "im2col maximum number of image per computation")
-        .SetDefault(64);
-    AddComment(R"DOC(
-**Deformable Convolution Operator**
-
-Compute 2-D deformable convolution on 4-D input.
-
-Given input image x, output feature map y, the deformable convolution operation can be expressed as follow:
-
-$$
-y(p) = \\sum_{k=1}^{K}{w_k * x(p + p_k + \\Delta p_k) * \\Delta m_k}
-$$
-
-Where $$\\Delta p_k$$ and $$\Delta m_k$$ are the learnable offset and modulation scalar for the k-th location, respectively.
-
-Refer to 'Deformable ConvNets v2: More Deformable, Better Results
-'<https://arxiv.org/abs/1811.11168v2>
-
-Example:
-  Input:
-       Input shape: $(N, C_{in}, H_{in}, W_{in})$
-       Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
-       Offset shape: $(N, 2 * deformable_groups, * H_f * W_f, H_{out}, W_{out})$
-       Mask shape: $(N, deformable_groups * H_f * W_f, H_{out}, W_{out})$
-  Output:
-       Output shape: $(N, C_{out}, H_{out}, W_{out})$
-                     where $H_{out}, W_{out}$ must be equal to $H_{in}, W_{in}$ respectively.
-  Where
-$$
-       H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
-       W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
-$$
-)DOC");
-  }
-};
-
-class DeformableConvOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of DeformableConvOp "
-                   "should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Offset"),
-                   "Input(Offset) of DeformableConvOp "
-                   "should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Mask"),
-                   "Input(Mask) of DeformableConvOp "
-                   "should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                   "Input(Filter) of DeformableConvOp "
-                   "should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                   "Output(Output) of DeformableConvOp "
-                   "should not be null.");
-
-    auto in_dims = ctx->GetInputDim("Input");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    auto offset_dims = ctx->GetInputDim("Offset");
-    auto mask_dims = ctx->GetInputDim("Mask");
-
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    std::vector<int> dilations =
-        ctx->Attrs().Get<std::vector<int>>("dilations");
-    int groups = ctx->Attrs().Get<int>("groups");
-    int deformable_groups = ctx->Attrs().Get<int>("deformable_groups");
-    int im2col_step = ctx->Attrs().Get<int>("im2col_step");
-
-    PADDLE_ENFORCE(in_dims.size() == 4,
-                   "Conv input should be 4-D tensor, get %u", in_dims.size());
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(), filter_dims.size(),
-        "Conv input dimension and filter dimension should be the same.");
-    PADDLE_ENFORCE_EQ(
-        in_dims.size() - strides.size(), 2U,
-        "Conv input dimension and strides dimension should be consistent.");
-    PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
-                      "Conv paddings dimension and Conv strides dimension "
-                      "should be the same.");
-
-    PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups,
-                      "The number of input channels should be equal to filter "
-                      "channels * groups.");
-    PADDLE_ENFORCE_EQ(
-        filter_dims[0] % groups, 0,
-        "The number of output channels should be divided by groups.");
-    PADDLE_ENFORCE_EQ(filter_dims[0] % deformable_groups, 0,
-                      "The number of output channels should be "
-                      "divided by deformable groups.");
-
-    if (in_dims[0] > im2col_step) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0] % im2col_step, 0U,
-          "Input batchsize must be smaller than or divide im2col_step");
-    }
-
-    for (size_t i = 0; i < strides.size(); ++i) {
-      PADDLE_ENFORCE_GT(strides[i], 0U, "stride %d size incorrect", i);
-    }
-    for (size_t i = 0; i < dilations.size(); ++i) {
-      PADDLE_ENFORCE_GT(dilations[i], 0U, "dilation %d size incorrect", i);
-    }
-
-    std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-    for (size_t i = 0; i < strides.size(); ++i) {
-      output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                            dilations[i], paddings[i],
-                                            strides[i]));
-    }
-    PADDLE_ENFORCE_EQ(output_shape[1] % deformable_groups, 0U,
-                      "output num_filter must divide deformable group size.");
-    PADDLE_ENFORCE_EQ(output_shape[2], offset_dims[2],
-                      "output height must equal to offset map height.");
-    PADDLE_ENFORCE_EQ(output_shape[3], offset_dims[3],
-                      "output width must equal to offset map width.");
-    PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
-                      "offset filter must divide deformable group size.");
-    PADDLE_ENFORCE_EQ(offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
-                      deformable_groups,
-                      "offset filter must divide deformable group size.");
-    PADDLE_ENFORCE_EQ(output_shape[2], mask_dims[2],
-                      "output height must equal to mask map height.");
-    PADDLE_ENFORCE_EQ(output_shape[3], mask_dims[3],
-                      "output width must equal to mask map width.");
-    PADDLE_ENFORCE_EQ(mask_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
-                      "mask filter must divide deformable group size.");
-    PADDLE_ENFORCE_EQ(mask_dims[1] / (filter_dims[2] * filter_dims[3]),
-                      deformable_groups,
-                      "mask filter must divide deformable group size.");
-    ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class DeformableConvGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("deformable_conv_grad");
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetInput("Offset", Input("Offset"));
-    op->SetInput("Mask", Input("Mask"));
-    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
-
-    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
-    op->SetOutput(framework::GradVarName("Offset"), InputGrad("Offset"));
-    op->SetOutput(framework::GradVarName("Mask"), InputGrad("Mask"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class DeformableConvGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    auto in_dims = ctx->GetInputDim("Input");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    auto offset_dims = ctx->GetInputDim("Offset");
-    auto mask_dims = ctx->GetInputDim("Mask");
-
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Output")),
-                   "the gradient of output(Out) must not be null");
-    if (ctx->HasOutput(framework::GradVarName("Input"))) {
-      ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-      ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Offset"))) {
-      ctx->SetOutputDim(framework::GradVarName("Offset"), offset_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Mask"))) {
-      ctx->SetOutputDim(framework::GradVarName("Mask"), mask_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   ctx.device_context());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp,
-                  ops::DeformableConvOpMaker,
-                  ops::DeformableConvGradOpDescMaker);
-REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp);
-
-REGISTER_OP_CPU_KERNEL(deformable_conv, ops::DeformableConvCPUKernel<float>,
-                       ops::DeformableConvCPUKernel<double>);
-REGISTER_OP_CPU_KERNEL(deformable_conv_grad,
-                       ops::DeformableConvGradCPUKernel<float>,
-                       ops::DeformableConvGradCPUKernel<double>);
diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu
deleted file mode 100644
index 0a771627e060f44cc19fb897bdc4c82bf74a74ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_conv_op.cu
+++ /dev/null
@@ -1,753 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Part of the following code in this file refs to
-// https://github.com/msracver/Deformable-ConvNets/blob/master/DCNv2_op/nn/modulated_deformable_im2col.cuh
-//
-// Copyright (c) 2018 Microsoft
-// Licensed under The MIT License [see LICENSE for details]
-// \file modulated_deformable_im2col.cuh
-// \brief
-// \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu
-
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/deformable_conv_op.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaximumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaximumNumBlocks);
-}
-
-template <typename T>
-__device__ T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h,
-                                   const int w, const int height,
-                                   const int width) {
-  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
-      argmax_w >= width) {
-    return 0;
-  }
-
-  int argmax_h_low = floor(argmax_h);
-  int argmax_w_low = floor(argmax_w);
-  int argmax_h_high = argmax_h_low + 1;
-  int argmax_w_high = argmax_w_low + 1;
-
-  T weight = 0;
-  if (h == argmax_h_low && w == argmax_w_low)
-    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
-  if (h == argmax_h_low && w == argmax_w_high)
-    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
-  if (h == argmax_h_high && w == argmax_w_low)
-    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
-  if (h == argmax_h_high && w == argmax_w_high)
-    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
-  return weight;
-}
-
-template <typename T>
-__global__ void ModulatedDeformableCol2imGpuKernel(
-    const int nthreads, const T* data_col, const T* data_offset,
-    const T* data_mask, const int channels, const int height, const int width,
-    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w, const int dilation_h,
-    const int dilation_w, const int channel_per_deformable_group,
-    const int batch_size, const int deformable_group, const int height_col,
-    const int width_col, T* grad_im) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t thread = index; thread < nthreads; thread += offset) {
-    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
-    const int i =
-        (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
-    const int c =
-        thread / width_col / height_col / batch_size / kernel_w / kernel_h;
-
-    const int deformable_group_index = c / channel_per_deformable_group;
-
-    int w_out = thread % width_col;
-    int h_out = (thread / width_col) % height_col;
-    int b = (thread / width_col / height_col) % batch_size;
-    int w_in = w_out * stride_w - pad_w;
-    int h_in = h_out * stride_h - pad_h;
-
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
-    const T* data_mask_ptr = data_mask +
-                             (b * deformable_group + deformable_group_index) *
-                                 kernel_h * kernel_w * height_col * width_col;
-    const int data_offset_h_ptr =
-        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
-    const int data_offset_w_ptr =
-        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
-    const int data_mask_hw_ptr =
-        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
-    const T offset_h = data_offset_ptr[data_offset_h_ptr];
-    const T offset_w = data_offset_ptr[data_offset_w_ptr];
-    const T mask = data_mask_ptr[data_mask_hw_ptr];
-    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
-    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
-
-    const T cur_top_grad = data_col[thread] * mask;
-    const int cur_h = static_cast<int>(cur_inv_h_data);
-    const int cur_w = static_cast<int>(cur_inv_w_data);
-    for (int dy = -2; dy <= 2; dy++) {
-      for (int dx = -2; dx <= 2; dx++) {
-        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
-            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
-            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
-          int cur_bottom_grad_pos =
-              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
-          T weight =
-              DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
-                                    cur_w + dx, height, width);
-
-          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-inline void ModulatedDeformableCol2im(
-    const platform::DeviceContext& ctx, const T* data_col, const T* data_offset,
-    const T* data_mask, const std::vector<int64_t> im_shape,
-    const std::vector<int64_t> col_shape,
-    const std::vector<int64_t> kernel_shape, const std::vector<int> pad,
-    const std::vector<int> stride, const std::vector<int> dilation,
-    const int deformable_group, T* grad_im) {
-  int channel_per_deformable_group = im_shape[0] / deformable_group;
-  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-  int blocks = NumBlocks(num_kernels);
-  int threads = kNumCUDAThreads;
-
-  ModulatedDeformableCol2imGpuKernel<T><<<
-      blocks, threads, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1],
-      im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0],
-      stride[1], dilation[0], dilation[1], channel_per_deformable_group,
-      col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im);
-}
-
-template <typename T>
-__device__ T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height,
-                                     const int width, const T* im_data,
-                                     const int data_width, const int bp_dir) {
-  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
-      argmax_w >= width) {
-    return 0;
-  }
-
-  int argmax_h_low = floor(argmax_h);
-  int argmax_w_low = floor(argmax_w);
-  int argmax_h_high = argmax_h_low + 1;
-  int argmax_w_high = argmax_w_low + 1;
-
-  T weight = 0;
-
-  if (bp_dir == 0) {
-    if (argmax_h_low >= 0 && argmax_w_low >= 0)
-      weight += -1 * (argmax_w_low + 1 - argmax_w) *
-                im_data[argmax_h_low * data_width + argmax_w_low];
-    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
-      weight += -1 * (argmax_w - argmax_w_low) *
-                im_data[argmax_h_low * data_width + argmax_w_high];
-    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
-      weight += (argmax_w_low + 1 - argmax_w) *
-                im_data[argmax_h_high * data_width + argmax_w_low];
-    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
-      weight += (argmax_w - argmax_w_low) *
-                im_data[argmax_h_high * data_width + argmax_w_high];
-  } else if (bp_dir == 1) {
-    if (argmax_h_low >= 0 && argmax_w_low >= 0)
-      weight += -1 * (argmax_h_low + 1 - argmax_h) *
-                im_data[argmax_h_low * data_width + argmax_w_low];
-    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
-      weight += (argmax_h_low + 1 - argmax_h) *
-                im_data[argmax_h_low * data_width + argmax_w_high];
-    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
-      weight += -1 * (argmax_h - argmax_h_low) *
-                im_data[argmax_h_high * data_width + argmax_w_low];
-    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
-      weight += (argmax_h - argmax_h_low) *
-                im_data[argmax_h_high * data_width + argmax_w_high];
-  }
-  return weight;
-}
-
-template <typename T>
-__device__ T DmcnIm2colBilinear(const T* bottom_data, const int data_width,
-                                const int height, const int width, T h, T w) {
-  int h_low = floor(h);
-  int w_low = floor(w);
-  int h_high = h_low + 1;
-  int w_high = w_low + 1;
-
-  T lh = h - h_low;
-  T lw = w - w_low;
-  T hh = 1 - lh, hw = 1 - lw;
-
-  T v1 = 0;
-  if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low];
-  T v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1)
-    v2 = bottom_data[h_low * data_width + w_high];
-  T v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0)
-    v3 = bottom_data[h_high * data_width + w_low];
-  T v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1)
-    v4 = bottom_data[h_high * data_width + w_high];
-
-  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-
-  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  return val;
-}
-
-template <typename T>
-__global__ void ModulatedDeformableCol2imCoordGpuKernel(
-    const int nthreads, const T* data_col, const T* data_im,
-    const T* data_offset, const T* data_mask, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int offset_channels, const int deformable_group, const int height_col,
-    const int width_col, T* grad_offset, T* grad_mask) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    T val = 0, mval = 0;
-    const int w = i % width_col;
-    const int h = (i / width_col) % height_col;
-    const int c = (i / width_col / height_col) % offset_channels;
-    const int b = (i / width_col / height_col) / offset_channels;
-
-    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
-    const int col_step = kernel_h * kernel_w;
-    int cnt = 0;
-    const T* data_col_ptr = data_col +
-                            deformable_group_index *
-                                channel_per_deformable_group * batch_size *
-                                width_col * height_col;
-    const T* data_im_ptr = data_im +
-                           (b * deformable_group + deformable_group_index) *
-                               channel_per_deformable_group / kernel_h /
-                               kernel_w * height * width;
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
-    const T* data_mask_ptr = data_mask +
-                             (b * deformable_group + deformable_group_index) *
-                                 kernel_h * kernel_w * height_col * width_col;
-
-    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
-
-    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
-         col_c += col_step) {
-      const int col_pos =
-          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
-      const int bp_dir = offset_c % 2;
-
-      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
-      int i =
-          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
-      int w_out = col_pos % width_col;
-      int h_out = (col_pos / width_col) % height_col;
-      int w_in = w_out * stride_w - pad_w;
-      int h_in = h_out * stride_h - pad_h;
-      const int data_offset_h_ptr =
-          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
-      const int data_offset_w_ptr =
-          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
-           w_out);
-      const int data_mask_hw_ptr =
-          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
-      const T offset_h = data_offset_ptr[data_offset_h_ptr];
-      const T offset_w = data_offset_ptr[data_offset_w_ptr];
-      const T mask = data_mask_ptr[data_mask_hw_ptr];
-      T inv_h = h_in + i * dilation_h + offset_h;
-      T inv_w = w_in + j * dilation_w + offset_w;
-      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
-        inv_h = inv_w = -2;
-      } else {
-        mval += data_col_ptr[col_pos] *
-                DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
-                                   height, width, inv_h, inv_w);
-      }
-      const T weight = DmcnGetCoordinateWeight(
-          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
-          width, bp_dir);
-      val += weight * data_col_ptr[col_pos] * mask;
-      cnt += 1;
-    }
-    grad_offset[i] = val;
-    if (offset_c % 2 == 0)
-      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
-                      kernel_w +
-                  offset_c / 2) *
-                     height_col +
-                 h) *
-                    width_col +
-                w] = mval;
-  }
-}
-
-template <typename T>
-inline void ModulatedDeformableCol2imCoord(
-    const platform::DeviceContext& ctx, const T* data_col, const T* data_im,
-    const T* data_offset, const T* data_mask,
-    const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
-    const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
-    const std::vector<int> strides, const std::vector<int> dilations,
-    const int deformable_groups, T* grad_offset, T* grad_mask) {
-  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
-                    col_shape[2] * col_shape[3] * deformable_groups;
-  int channel_per_deformable_group = col_shape[0] / deformable_groups;
-  int blocks = NumBlocks(num_kernels);
-  int threads = kNumCUDAThreads;
-
-  ModulatedDeformableCol2imCoordGpuKernel<T><<<
-      blocks, threads, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0],
-      im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0],
-      paddings[1], strides[0], strides[1], dilations[0], dilations[1],
-      channel_per_deformable_group, col_shape[1],
-      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
-      deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask);
-}
-
-template <typename T>
-__global__ void ModulatedDeformableIm2colGpuKernel(
-    const int nthreads, const T* data_im, const T* data_offset,
-    const T* data_mask, const int height, const int width, const int kernel_h,
-    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int num_channels, const int deformable_group, const int height_col,
-    const int width_col, T* data_col) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
-
-    const int deformable_group_index = c_im / channel_per_deformable_group;
-
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
-
-    T* data_col_ptr =
-        data_col +
-        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
-    const T* data_im_ptr =
-        data_im + (b_col * num_channels + c_im) * height * width;
-    const T* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
-    const T* data_mask_ptr =
-        data_mask +
-        (b_col * deformable_group + deformable_group_index) * kernel_h *
-            kernel_w * height_col * width_col;
-
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
-            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
-            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
-            w_col;
-        const int data_mask_hw_ptr =
-            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
-
-        const T offset_h = data_offset_ptr[data_offset_h_ptr];
-        const T offset_w = data_offset_ptr[data_offset_w_ptr];
-        const T mask = data_mask_ptr[data_mask_hw_ptr];
-        T val = static_cast<T>(0);
-        const T h_im = h_in + i * dilation_h + offset_h;
-        const T w_im = w_in + j * dilation_w + offset_w;
-        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
-          val =
-              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
-        }
-        *data_col_ptr = val * mask;
-        data_col_ptr += batch_size * height_col * width_col;
-      }
-    }
-  }
-}
-
-template <typename T>
-inline void ModulatedDeformableIm2col(
-    const platform::DeviceContext& ctx, const T* data_im, const T* data_offset,
-    const T* data_mask, const std::vector<int64_t> im_shape,
-    const std::vector<int64_t> col_shape,
-    const std::vector<int64_t> filter_shape, const std::vector<int> paddings,
-    const std::vector<int> strides, const std::vector<int> dilations,
-    const int deformable_groups, T* data_col) {
-  int channel_per_deformable_group = im_shape[0] / deformable_groups;
-  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-
-  int blocks = NumBlocks(num_kernels);
-  int threads = kNumCUDAThreads;
-
-  ModulatedDeformableIm2colGpuKernel<T><<<
-      blocks, threads, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2],
-      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
-      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
-      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
-      data_col);
-}
-
-template <typename T>
-__global__ void FilterGradAddupGpuKernel(const int nthreads, const int n,
-                                         const int height, const int width,
-                                         const T* dweight_3d, T* filter_grad) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    filter_grad[i] = filter_grad[i] + dweight_3d[i];
-  }
-}
-
-template <typename DeviceContext, typename T>
-class DeformableConvCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const Tensor offset = *ctx.Input<Tensor>("Offset");
-    const Tensor mask = *ctx.Input<Tensor>("Mask");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    Tensor* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.cuda_device_context();
-
-    const int groups = ctx.Attr<int>("groups");
-    const int deformable_groups = ctx.Attr<int>("deformable_groups");
-    const int im2col_step = ctx.Attr<int>("im2col_step");
-    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-
-    // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer = ctx.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, DeviceContext>(output_shape, dev_ctx);
-
-    int64_t M = output_shape_vec[1] / groups;
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K =
-        input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(
-        framework::make_ddim({groups, M, K}));
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer)
-        .Resize(framework::make_ddim({groups, K, N}));
-    Tensor output_4d;
-    output_4d.ShareDataWith(output_buffer)
-        .Resize(framework::make_ddim({batch_size / im2col_step, groups, M, N}));
-    output_4d.mutable_data<T>(ctx.GetPlace());
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
-
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset.numel() / offset.dims()[0];
-    int input_mask_dim = mask.numel() / mask.dims()[0];
-
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-
-    const T* input_ptr = input->data<T>();
-    const T* offset_ptr = offset.data<T>();
-    const T* mask_ptr = mask.data<T>();
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    T* col_buffer_ptr = col_buffer.data<T>();
-
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      ModulatedDeformableIm2col(
-          ctx.device_context(), input_ptr + i * im2col_step * input_dim,
-          offset_ptr + i * im2col_step * input_offset_dim,
-          mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-          col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
-          deformable_groups, col_buffer_ptr);
-
-      Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
-          framework::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice =
-            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-        Tensor output_3d_slice =
-            output_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                output_3d.dims(), 1, output_3d.dims().size()));
-
-        blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
-                    &output_3d_slice, T(0.0));
-      }
-    }
-    output->ShareDataWith(output_buffer)
-        .Resize(framework::make_ddim(output_shape_vec));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DeformableConvGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* output_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-    Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
-    Tensor* mask_grad = ctx.Output<Tensor>(framework::GradVarName("Mask"));
-
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    Tensor offset = *ctx.Input<Tensor>("Offset");
-    Tensor mask = *ctx.Input<Tensor>("Mask");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    if (!input_grad && !filter_grad && !offset_grad && !mask_grad) return;
-
-    int groups = ctx.Attr<int>("groups");
-    int deformable_groups = ctx.Attr<int>("deformable_groups");
-    int im2col_step = ctx.Attr<int>("im2col_step");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    auto& dev_ctx = ctx.cuda_device_context();
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(
-        framework::vectorize(output_grad->dims()));
-
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer = ctx.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, DeviceContext>(output_shape, dev_ctx);
-
-    output_buffer.ShareDataWith(*output_grad);
-
-    int64_t M =
-        input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K = output_shape_vec[1] / groups;
-
-    framework::DDim weight_3d_shape = {groups, K, M};
-    framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
-                                         N};
-    framework::DDim col_buffer_3d_shape = {groups, M, N};
-    framework::DDim filter_grad_shape = {groups, K, M};
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
-    Tensor out_grad_4d;
-    out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    col_buffer_3d.mutable_data<T>(ctx.GetPlace());
-    out_grad_4d.mutable_data<T>(ctx.GetPlace());
-
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset.numel() / offset.dims()[0];
-    int input_mask_dim = mask.numel() / mask.dims()[0];
-
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(ctx.GetPlace());
-      filter_grad->Resize(filter_grad_shape);
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-    }
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, input_grad, static_cast<T>(0));
-    }
-
-    if (offset_grad && mask_grad) {
-      offset_grad->mutable_data<T>(ctx.GetPlace());
-      mask_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, offset_grad, static_cast<T>(0));
-      set_zero(dev_ctx, mask_grad, static_cast<T>(0));
-    }
-
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      Tensor out_grad_3d =
-          out_grad_4d.Slice(i, i + 1).Resize(framework::slice_ddim(
-              out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice =
-            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor out_grad_3d_slice =
-            out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-
-        blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
-                    &col_buffer_3d_slice, T(0.0));
-      }
-      col_buffer.Resize(col_shape);
-
-      T* col_buffer_ptr = col_buffer.data<T>();
-      const T* input_ptr = input->data<T>();
-      const T* offset_ptr = offset.data<T>();
-      const T* mask_ptr = mask.data<T>();
-
-      if (mask_grad && offset_grad) {
-        T* offset_grad_ptr = offset_grad->data<T>();
-        T* mask_grad_ptr = mask_grad->data<T>();
-        ModulatedDeformableCol2imCoord(
-            ctx.device_context(), col_buffer_ptr,
-            input_ptr + i * im2col_step * input_dim,
-            offset_ptr + i * im2col_step * input_offset_dim,
-            mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
-            dilations, deformable_groups,
-            offset_grad_ptr + i * im2col_step * input_offset_dim,
-            mask_grad_ptr + i * im2col_step * input_mask_dim);
-      }
-      if (input_grad) {
-        T* input_grad_ptr = input_grad->data<T>();
-        ModulatedDeformableCol2im(
-            ctx.device_context(), col_buffer_ptr,
-            offset_ptr + i * im2col_step * input_offset_dim,
-            mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
-            dilations, deformable_groups,
-            input_grad_ptr + i * im2col_step * input_dim);
-        input_grad->Resize(input->dims());
-      }
-
-      ModulatedDeformableIm2col(
-          ctx.device_context(), input_ptr + i * im2col_step * input_dim,
-          offset_ptr + i * im2col_step * input_offset_dim,
-          mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-          col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
-          deformable_groups, col_buffer_ptr);
-
-      col_buffer_3d.Resize(col_buffer_3d_shape);
-
-      if (filter_grad) {
-        Tensor dweight_3d;
-        dweight_3d =
-            ctx.AllocateTmpTensor<T, DeviceContext>(filter_grad_shape, dev_ctx);
-        for (int g = 0; g < groups; ++g) {
-          Tensor out_grad_3d_slice =
-              out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-          Tensor col_buffer_3d_slice =
-              col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-          Tensor dweight_3d_slice =
-              dweight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  dweight_3d.dims(), 1, dweight_3d.dims().size()));
-
-          blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
-                      T(1.0), &dweight_3d_slice, T(0.0));
-        }
-        FilterGradAddupGpuKernel<
-            T><<<NumBlocks(dweight_3d.numel()), kNumCUDAThreads, 0,
-                 ctx.cuda_device_context().stream()>>>(
-            dweight_3d.numel(), groups, K, M, dweight_3d.data<T>(),
-            filter_grad->data<T>());
-      }
-    }
-    if (filter_grad) {
-      filter_grad->Resize(filter.dims());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-
-REGISTER_OP_CUDA_KERNEL(deformable_conv,
-                        ops::DeformableConvCUDAKernel<CUDA, float>);
-REGISTER_OP_CUDA_KERNEL(deformable_conv_grad,
-                        ops::DeformableConvGradCUDAKernel<CUDA, float>);
diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h
deleted file mode 100644
index 33a97bf48b2806788b23154c48e2ee174fdd3d92..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_conv_op.h
+++ /dev/null
@@ -1,613 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Part of the following code in this file refs to
-// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
-//
-// Copyright (c) 2017 Microsoft
-// Licensed under The Apache-2.0 License [see LICENSE for details]
-// \file deformable_psroi_pooling.cu
-// \brief
-// \author Yi Li, Guodong Zhang, Jifeng Dai
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/deformable_conv_func.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using CPUDeviceContext = platform::CPUDeviceContext;
-
-template <typename T>
-void ModulatedDeformableCol2imCPUKernel(
-    const int num_kernels, const T* data_col, const T* data_offset,
-    const T* data_mask, const int channels, const int height, const int width,
-    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w, const int dilation_h,
-    const int dilation_w, const int channel_per_deformable_group,
-    const int batch_size, const int deformable_group, const int height_col,
-    const int width_col, T* grad_im) {
-  for (size_t thread = 0; thread < num_kernels; thread++) {
-    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
-    const int i =
-        (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
-    const int c =
-        thread / width_col / height_col / batch_size / kernel_w / kernel_h;
-
-    const int deformable_group_index = c / channel_per_deformable_group;
-
-    int w_out = thread % width_col;
-    int h_out = (thread / width_col) % height_col;
-    int b = (thread / width_col / height_col) % batch_size;
-    int w_in = w_out * stride_w - pad_w;
-    int h_in = h_out * stride_h - pad_h;
-
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
-    const T* data_mask_ptr = data_mask +
-                             (b * deformable_group + deformable_group_index) *
-                                 kernel_h * kernel_w * height_col * width_col;
-    const int data_offset_h_ptr =
-        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
-    const int data_offset_w_ptr =
-        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
-    const int data_mask_hw_ptr =
-        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
-    const T offset_h = data_offset_ptr[data_offset_h_ptr];
-    const T offset_w = data_offset_ptr[data_offset_w_ptr];
-    const T mask = data_mask_ptr[data_mask_hw_ptr];
-    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
-    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
-
-    const T cur_top_grad = data_col[thread] * mask;
-    const int cur_h = static_cast<int>(cur_inv_h_data);
-    const int cur_w = static_cast<int>(cur_inv_w_data);
-    for (int dy = -2; dy <= 2; dy++) {
-      for (int dx = -2; dx <= 2; dx++) {
-        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
-            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
-            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
-          int cur_bottom_grad_pos =
-              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
-          T weight =
-              DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
-                                    cur_w + dx, height, width);
-
-          *(grad_im + cur_bottom_grad_pos) =
-              *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static inline void ModulatedDeformableCol2imCPU(
-    const platform::CPUDeviceContext& ctx, const T* data_col,
-    const T* data_offset, const T* data_mask,
-    const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
-    const std::vector<int64_t> kernel_shape, const std::vector<int> pad,
-    const std::vector<int> stride, const std::vector<int> dilation,
-    const int deformable_group, T* grad_im) {
-  int channel_per_deformable_group = im_shape[0] / deformable_group;
-  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-
-  ModulatedDeformableCol2imCPUKernel(
-      num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1],
-      im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0],
-      stride[1], dilation[0], dilation[1], channel_per_deformable_group,
-      col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im);
-}
-
-template <typename T>
-void ModulatedDeformableCol2imCoordCPUKernel(
-    const int num_kernels, const T* data_col, const T* data_im,
-    const T* data_offset, const T* data_mask, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int offset_channels, const int deformable_group, const int height_col,
-    const int width_col, T* grad_offset, T* grad_mask) {
-  for (size_t i = 0; i < num_kernels; i++) {
-    T val = 0, mval = 0;
-    const int w = i % width_col;
-    const int h = (i / width_col) % height_col;
-    const int c = (i / width_col / height_col) % offset_channels;
-    const int b = (i / width_col / height_col) / offset_channels;
-
-    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
-    const int col_step = kernel_h * kernel_w;
-    int cnt = 0;
-    const T* data_col_ptr = data_col +
-                            deformable_group_index *
-                                channel_per_deformable_group * batch_size *
-                                width_col * height_col;
-    const T* data_im_ptr = data_im +
-                           (b * deformable_group + deformable_group_index) *
-                               channel_per_deformable_group / kernel_h /
-                               kernel_w * height * width;
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
-    const T* data_mask_ptr = data_mask +
-                             (b * deformable_group + deformable_group_index) *
-                                 kernel_h * kernel_w * height_col * width_col;
-
-    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
-
-    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
-         col_c += col_step) {
-      const int col_pos =
-          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
-      const int bp_dir = offset_c % 2;
-
-      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
-      int i =
-          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
-      int w_out = col_pos % width_col;
-      int h_out = (col_pos / width_col) % height_col;
-      int w_in = w_out * stride_w - pad_w;
-      int h_in = h_out * stride_h - pad_h;
-      const int data_offset_h_ptr =
-          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
-      const int data_offset_w_ptr =
-          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
-           w_out);
-      const int data_mask_hw_ptr =
-          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
-      const T offset_h = data_offset_ptr[data_offset_h_ptr];
-      const T offset_w = data_offset_ptr[data_offset_w_ptr];
-      const T mask = data_mask_ptr[data_mask_hw_ptr];
-      T inv_h = h_in + i * dilation_h + offset_h;
-      T inv_w = w_in + j * dilation_w + offset_w;
-      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
-        inv_h = inv_w = -2;
-      } else {
-        mval += data_col_ptr[col_pos] *
-                DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
-                                   height, width, inv_h, inv_w);
-      }
-      const T weight = DmcnGetCoordinateWeight(
-          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
-          width, bp_dir);
-      val += weight * data_col_ptr[col_pos] * mask;
-      cnt += 1;
-    }
-    grad_offset[i] = val;
-    if (offset_c % 2 == 0)
-      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
-                      kernel_w +
-                  offset_c / 2) *
-                     height_col +
-                 h) *
-                    width_col +
-                w] = mval;
-  }
-}
-
-template <typename T>
-static inline void ModulatedDeformableCol2imCoordCPU(
-    const platform::CPUDeviceContext& ctx, const T* data_col, const T* data_im,
-    const T* data_offset, const T* data_mask,
-    const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
-    const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
-    const std::vector<int> strides, const std::vector<int> dilations,
-    const int deformable_groups, T* grad_offset, T* grad_mask) {
-  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
-                    col_shape[2] * col_shape[3] * deformable_groups;
-  int channel_per_deformable_group = col_shape[0] / deformable_groups;
-
-  ModulatedDeformableCol2imCoordCPUKernel(
-      num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0],
-      im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0],
-      paddings[1], strides[0], strides[1], dilations[0], dilations[1],
-      channel_per_deformable_group, col_shape[1],
-      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
-      deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask);
-}
-
-template <typename T>
-void ModulatedDeformableIm2colCPUKernel(
-    const int num_kernels, const T* data_im, const T* data_offset,
-    const T* data_mask, const int height, const int width, const int kernel_h,
-    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int num_channels, const int deformable_group, const int height_col,
-    const int width_col, T* data_col) {
-  for (size_t i = 0; i < num_kernels; i++) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
-
-    const int deformable_group_index = c_im / channel_per_deformable_group;
-
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
-
-    T* data_col_ptr =
-        data_col +
-        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
-    const T* data_im_ptr =
-        data_im + (b_col * num_channels + c_im) * height * width;
-    const T* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
-    const T* data_mask_ptr =
-        data_mask +
-        (b_col * deformable_group + deformable_group_index) * kernel_h *
-            kernel_w * height_col * width_col;
-
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
-            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
-            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
-            w_col;
-        const int data_mask_hw_ptr =
-            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
-
-        const T offset_h = data_offset_ptr[data_offset_h_ptr];
-        const T offset_w = data_offset_ptr[data_offset_w_ptr];
-        const T mask = data_mask_ptr[data_mask_hw_ptr];
-        T val = static_cast<T>(0);
-        const T h_im = h_in + i * dilation_h + offset_h;
-        const T w_im = w_in + j * dilation_w + offset_w;
-        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
-          val =
-              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
-        }
-        *data_col_ptr = val * mask;
-        data_col_ptr += batch_size * height_col * width_col;
-      }
-    }
-  }
-}
-
-template <typename T>
-static inline void ModulatedDeformableIm2colCPU(
-    const platform::CPUDeviceContext& ctx, const T* data_im,
-    const T* data_offset, const T* data_mask,
-    const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
-    const std::vector<int64_t> filter_shape, const std::vector<int> paddings,
-    const std::vector<int> strides, const std::vector<int> dilations,
-    const int deformable_groups, T* data_col) {
-  int channel_per_deformable_group = im_shape[0] / deformable_groups;
-  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-
-  // get outputs of im2col with offset by bilinear interpolation
-  ModulatedDeformableIm2colCPUKernel(
-      num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2],
-      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
-      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
-      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
-      data_col);
-}
-
-template <typename T>
-void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height,
-                              const int width, const T* dweight_3d,
-                              T* filter_grad) {
-  for (size_t i = 0; i < nthreads; i++) {
-    filter_grad[i] = filter_grad[i] + dweight_3d[i];
-  }
-}
-
-template <typename T>
-class DeformableConvCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* offset = ctx.Input<Tensor>("Offset");
-    auto* mask = ctx.Input<Tensor>("Mask");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    Tensor* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
-
-    const int groups = ctx.Attr<int>("groups");
-    const int deformable_groups = ctx.Attr<int>("deformable_groups");
-    const int im2col_step = ctx.Attr<int>("im2col_step");
-    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-
-    // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);
-    int64_t M = output_shape_vec[1] / groups;
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K =
-        input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(
-        framework::make_ddim({groups, M, K}));
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer)
-        .Resize(framework::make_ddim({groups, K, N}));
-    Tensor output_4d;
-    output_4d.ShareDataWith(output_buffer)
-        .Resize(framework::make_ddim({batch_size / im2col_step, groups, M, N}));
-    output_4d.mutable_data<T>(ctx.GetPlace());
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset->numel() / offset->dims()[0];
-    int input_mask_dim = mask->numel() / mask->dims()[0];
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
-    const T* input_ptr = input->data<T>();
-    const T* offset_ptr = offset->data<T>();
-    const T* mask_ptr = mask->data<T>();
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    T* col_buffer_ptr = col_buffer.data<T>();
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      ModulatedDeformableIm2colCPU(
-          dev_ctx, input_ptr + i * im2col_step * input_dim,
-          offset_ptr + i * im2col_step * input_offset_dim,
-          mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-          col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
-          deformable_groups, col_buffer_ptr);
-      Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
-          framework::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
-      // get the product of pixel and weight
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice =
-            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-        Tensor output_3d_slice =
-            output_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                output_3d.dims(), 1, output_3d.dims().size()));
-        blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
-                    &output_3d_slice, T(0.0));
-      }
-    }
-    output->ShareDataWith(output_buffer)
-        .Resize(framework::make_ddim(output_shape_vec));
-  }
-};
-
-template <typename T>
-class DeformableConvGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* output_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-    Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
-    Tensor* mask_grad = ctx.Output<Tensor>(framework::GradVarName("Mask"));
-
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    Tensor offset = *ctx.Input<Tensor>("Offset");
-    Tensor mask = *ctx.Input<Tensor>("Mask");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    if (!input_grad && !filter_grad && !offset_grad && !mask_grad) return;
-
-    int groups = ctx.Attr<int>("groups");
-    int deformable_groups = ctx.Attr<int>("deformable_groups");
-    int im2col_step = ctx.Attr<int>("im2col_step");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(
-        framework::vectorize(output_grad->dims()));
-
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);
-
-    output_buffer.ShareDataWith(*output_grad);
-
-    int64_t M =
-        input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K = output_shape_vec[1] / groups;
-
-    framework::DDim weight_3d_shape = {groups, K, M};
-    framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
-                                         N};
-    framework::DDim col_buffer_3d_shape = {groups, M, N};
-    framework::DDim filter_grad_shape = {groups, K, M};
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
-    Tensor out_grad_4d;
-    out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
-
-    math::SetConstant<CPUDeviceContext, T> set_zero;
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
-
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    col_buffer_3d.mutable_data<T>(ctx.GetPlace());
-    out_grad_4d.mutable_data<T>(ctx.GetPlace());
-
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset.numel() / offset.dims()[0];
-    int input_mask_dim = mask.numel() / mask.dims()[0];
-
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(ctx.GetPlace());
-      filter_grad->Resize(filter_grad_shape);
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-    }
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, input_grad, static_cast<T>(0));
-    }
-
-    if (offset_grad && mask_grad) {
-      offset_grad->mutable_data<T>(ctx.GetPlace());
-      mask_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, offset_grad, static_cast<T>(0));
-      set_zero(dev_ctx, mask_grad, static_cast<T>(0));
-    }
-
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      Tensor out_grad_3d =
-          out_grad_4d.Slice(i, i + 1).Resize(framework::slice_ddim(
-              out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice =
-            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor out_grad_3d_slice =
-            out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-
-        blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
-                    &col_buffer_3d_slice, T(0.0));
-      }
-      col_buffer.Resize(col_shape);
-
-      T* col_buffer_ptr = col_buffer.data<T>();
-      const T* input_ptr = input->data<T>();
-      const T* offset_ptr = offset.data<T>();
-      const T* mask_ptr = mask.data<T>();
-
-      if (mask_grad && offset_grad) {
-        T* offset_grad_ptr = offset_grad->data<T>();
-        T* mask_grad_ptr = mask_grad->data<T>();
-        // get grad of offset and mask
-        ModulatedDeformableCol2imCoordCPU(
-            ctx.template device_context<CPUDeviceContext>(), col_buffer_ptr,
-            input_ptr + i * im2col_step * input_dim,
-            offset_ptr + i * im2col_step * input_offset_dim,
-            mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
-            dilations, deformable_groups,
-            offset_grad_ptr + i * im2col_step * input_offset_dim,
-            mask_grad_ptr + i * im2col_step * input_mask_dim);
-      }
-      if (input_grad) {
-        T* input_grad_ptr = input_grad->data<T>();
-        // get grad of input
-        ModulatedDeformableCol2imCPU(
-            ctx.template device_context<CPUDeviceContext>(), col_buffer_ptr,
-            offset_ptr + i * im2col_step * input_offset_dim,
-            mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
-            dilations, deformable_groups,
-            input_grad_ptr + i * im2col_step * input_dim);
-        input_grad->Resize(input->dims());
-      }
-
-      ModulatedDeformableIm2colCPU(
-          ctx.template device_context<CPUDeviceContext>(),
-          input_ptr + i * im2col_step * input_dim,
-          offset_ptr + i * im2col_step * input_offset_dim,
-          mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-          col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
-          deformable_groups, col_buffer_ptr);
-
-      col_buffer_3d.Resize(col_buffer_3d_shape);
-
-      if (filter_grad) {
-        Tensor dweight_3d;
-        dweight_3d = ctx.AllocateTmpTensor<T, CPUDeviceContext>(
-            filter_grad_shape, dev_ctx);
-        for (int g = 0; g < groups; ++g) {
-          Tensor out_grad_3d_slice =
-              out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-          Tensor col_buffer_3d_slice =
-              col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-          Tensor dweight_3d_slice =
-              dweight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  dweight_3d.dims(), 1, dweight_3d.dims().size()));
-
-          blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
-                      T(1.0), &dweight_3d_slice, T(0.0));
-        }
-        // update grad of weights
-        FilterGradAddupCPUKernel(dweight_3d.numel(), groups, K, M,
-                                 dweight_3d.data<T>(), filter_grad->data<T>());
-      }
-    }
-    if (filter_grad) {
-      filter_grad->Resize(filter.dims());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cc b/paddle/fluid/operators/deformable_conv_v1_op.cc
deleted file mode 100644
index 6129e29655048ea7001bf1e48846f6801c16459d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_conv_v1_op.cc
+++ /dev/null
@@ -1,272 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/deformable_conv_v1_op.h"
-#include <memory>
-#include "paddle/fluid/operators/conv_op.h"
-
-namespace paddle {
-namespace operators {
-class DeformableConvV1OpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(Tensor) The input of deformable conv op. "
-             "The shape of input is "
-             "[N, channel_in, H, W]");
-    AddInput("Offset",
-             "(Tensor) The input offset. "
-             "The shape of the offset is "
-             "[N, deformable_groups * kernel_w * kernel_h * 2, H, W");
-    AddInput("Filter",
-             "(Tensor) The Input Filter "
-             "The shape of the wight is "
-             "[num_filters, channel_in, kernel_h, kernel_w.");
-    AddOutput("Output",
-              "(Tensor) The output. "
-              "The shape of the output tensor is "
-              "[N, num_filters, out_height, out_width]].");
-    AddAttr<std::vector<int>>("strides",
-                              "(vector<int> default:{1, 1}), the "
-                              "strides(h_stride, w_stride) of "
-                              "convolution operator.")
-        .SetDefault({1, 1});
-    AddAttr<std::vector<int>>("paddings",
-                              "(vector<int> default:{0,0}), the "
-                              "paddings(h_pad, w_pad) of "
-                              "convolution operator. ")
-        .SetDefault({0, 0});
-    AddAttr<std::vector<int>>("dilations",
-                              "(vector<int> default:{1, 1}), the "
-                              "dilations(h_dilation, w_dilation) of "
-                              "convolution operator.")
-        .SetDefault({1, 1});
-    AddAttr<int>(
-        "groups",
-        "(int default:1), the groups number of the convolution operator. "
-        "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
-        "when group=2, the first half of the filters is only connected to the "
-        "first half of the input channels, while the second half of the "
-        "filters "
-        "is only connected to the second half of the input channels.")
-        .SetDefault(1);
-    AddAttr<int>("deformable_groups",
-                 "(int default:1), the number of the deformable groups.")
-        .SetDefault(1);
-    AddAttr<int>("im2col_step",
-                 "im2col maximum number of image per computation")
-        .SetDefault(64);
-    AddComment(R"DOC(
-**Deformable Convolution v1 Operator**
-
-Deformable Convolution is a new method based Convolution which feature has offset 
-in spatial location.
-
-1. Get offset of each pixel in feature map with convolution layers which number 
-   of channels should be double of weight size.
-
-2. Add offset to pixel to get new location and the new value which are computed 
-   directly through bilinear interpolation with four nearest pixel.
-
-3. Get the product of pixel and weight as result
-
-Compute 2-D deformable convolution on 4-D input.
-
-Given input image x, output feature map y, the deformable convolution operation can be expressed as follow:
-
-$$
-y(p) = \\sum_{k=1}^{K}{w_k * x(p + p_k + \\Delta p_k)}
-$$
-
-Where $$\\Delta p_k$$ is the learnable offset for the k-th location, respectively.
-
-Refer to 'https://arxiv.org/abs/1703.06211 '<https://arxiv.org/abs/1703.06211>
-
-Example:
-  Input:
-       Input shape: $(N, C_{in}, H_{in}, W_{in})$
-       Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
-       Offset shape: $(N, 2 * deformable_groups, * H_f * W_f, H_{out}, W_{out})$
-  Output:
-       Output shape: $(N, C_{out}, H_{out}, W_{out})$
-                     where $H_{out}, W_{out}$ must be equal to $H_{in}, W_{in}$ respectively.
-  Where
-$$
-       H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
-       W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
-$$
-)DOC");
-  }
-};
-
-class DeformableConvV1Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
-                      "Input(Input) of DeformableConvOp "
-                      "should not be null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Offset"), true,
-                      "Input(Offset) of DeformableConvOp "
-                      "should not be null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Filter"), true,
-                      "Input(Filter) of DeformableConvOp "
-                      "should not be null");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Output"), true,
-                      "Output(Output) of DeformableConvOp "
-                      "should not be null.");
-
-    auto in_dims = ctx->GetInputDim("Input");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    auto offset_dims = ctx->GetInputDim("Offset");
-
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    std::vector<int> dilations =
-        ctx->Attrs().Get<std::vector<int>>("dilations");
-    int groups = ctx->Attrs().Get<int>("groups");
-    int deformable_groups = ctx->Attrs().Get<int>("deformable_groups");
-    int im2col_step = ctx->Attrs().Get<int>("im2col_step");
-
-    PADDLE_ENFORCE_EQ(in_dims.size(), 4,
-                      "Conv input should be 4-D tensor, get %u",
-                      in_dims.size());
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(), filter_dims.size(),
-        "Conv input dimension and filter dimension should be the same.");
-    PADDLE_ENFORCE_EQ(
-        in_dims.size() - strides.size(), 2U,
-        "Conv input dimension and strides dimension should be consistent.");
-    PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
-                      "Conv paddings dimension and Conv strides dimension "
-                      "should be the same.");
-
-    PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups,
-                      "The number of input channels should be equal to filter "
-                      "channels * groups.");
-    PADDLE_ENFORCE_EQ(
-        filter_dims[0] % groups, 0,
-        "The number of output channels should be divided by groups.");
-    PADDLE_ENFORCE_EQ(filter_dims[0] % deformable_groups, 0,
-                      "The number of output channels should be "
-                      "divided by deformable groups.");
-
-    if (in_dims[0] > im2col_step) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0] % im2col_step, 0U,
-          "Input batchsize must be smaller than or divide im2col_step");
-    }
-
-    for (size_t i = 0; i < strides.size(); ++i) {
-      PADDLE_ENFORCE_GT(strides[i], 0U, "stride %d size incorrect", i);
-    }
-    for (size_t i = 0; i < dilations.size(); ++i) {
-      PADDLE_ENFORCE_GT(dilations[i], 0U, "dilation %d size incorrect", i);
-    }
-
-    std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-    for (size_t i = 0; i < strides.size(); ++i) {
-      output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                            dilations[i], paddings[i],
-                                            strides[i]));
-    }
-    PADDLE_ENFORCE_EQ(output_shape[1] % deformable_groups, 0U,
-                      "output num_filter must divide deformable group size.");
-    PADDLE_ENFORCE_EQ(output_shape[2], offset_dims[2],
-                      "output height must equal to offset map height.");
-    PADDLE_ENFORCE_EQ(output_shape[3], offset_dims[3],
-                      "output width must equal to offset map width.");
-    PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
-                      "offset filter must divide deformable group size.");
-    PADDLE_ENFORCE_EQ(offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
-                      deformable_groups,
-                      "offset filter must divide deformable group size.");
-
-    ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class DeformableConvV1GradOpDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("deformable_conv_v1_grad");
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetInput("Offset", Input("Offset"));
-    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
-
-    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
-    op->SetOutput(framework::GradVarName("Offset"), InputGrad("Offset"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class DeformableConvV1GradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    auto in_dims = ctx->GetInputDim("Input");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    auto offset_dims = ctx->GetInputDim("Offset");
-
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Output")), true,
-                      "the gradient of output(Out) must not be null");
-    if (ctx->HasOutput(framework::GradVarName("Input"))) {
-      ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-      ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Offset"))) {
-      ctx->SetOutputDim(framework::GradVarName("Offset"), offset_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   ctx.device_context());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(deformable_conv_v1, ops::DeformableConvV1Op,
-                  ops::DeformableConvV1OpMaker,
-                  ops::DeformableConvV1GradOpDescMaker);
-REGISTER_OPERATOR(deformable_conv_v1_grad, ops::DeformableConvV1GradOp);
-
-REGISTER_OP_CPU_KERNEL(deformable_conv_v1,
-                       ops::DeformableConvV1CPUKernel<float>);
-REGISTER_OP_CPU_KERNEL(deformable_conv_v1_grad,
-                       ops::DeformableConvV1GradCPUKernel<float>);
diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cu b/paddle/fluid/operators/deformable_conv_v1_op.cu
deleted file mode 100644
index a865766f9adbbe2e4a3c994d774438dff731a732..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_conv_v1_op.cu
+++ /dev/null
@@ -1,609 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Part of the following code in this file refs to
-// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
-//
-// Copyright (c) 2017 Microsoft
-// Licensed under The Apache-2.0 License [see LICENSE for details]
-// \file deformable_psroi_pooling.cu
-// \brief
-// \author Yi Li, Guodong Zhang, Jifeng Dai
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/deformable_conv_filter.cu.h"
-#include "paddle/fluid/operators/deformable_conv_func.h"
-#include "paddle/fluid/operators/deformable_conv_v1_op.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using CUDADeviceContext = paddle::platform::CUDADeviceContext;
-
-static constexpr int kNumCUDAThread = 512;
-static constexpr int kNumMaximumNumBlock = 4096;
-
-static inline int NumBlock(const int N) {
-  return std::min((N + kNumCUDAThread - 1) / kNumCUDAThread,
-                  kNumMaximumNumBlock);
-}
-
-template <typename T>
-__global__ void DeformableCol2imCUDAKernel(
-    const int nthreads, const T* data_col, const T* data_offset,
-    const int channels, const int height, const int width, const int kernel_h,
-    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int deformable_group, const int height_col, const int width_col,
-    T* grad_im) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t thread = index; thread < nthreads; thread += offset) {
-    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
-    const int i =
-        (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
-    const int c =
-        thread / width_col / height_col / batch_size / kernel_w / kernel_h;
-
-    const int deformable_group_index = c / channel_per_deformable_group;
-
-    int w_out = thread % width_col;
-    int h_out = (thread / width_col) % height_col;
-    int b = (thread / width_col / height_col) % batch_size;
-    int w_in = w_out * stride_w - pad_w;
-    int h_in = h_out * stride_h - pad_h;
-
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
-    const int data_offset_h_ptr =
-        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
-    const int data_offset_w_ptr =
-        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
-    const T offset_h = data_offset_ptr[data_offset_h_ptr];
-    const T offset_w = data_offset_ptr[data_offset_w_ptr];
-    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
-    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
-
-    const T cur_top_grad = data_col[thread];
-    const int cur_h = static_cast<int>(cur_inv_h_data);
-    const int cur_w = static_cast<int>(cur_inv_w_data);
-    for (int dy = -2; dy <= 2; dy++) {
-      for (int dx = -2; dx <= 2; dx++) {
-        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
-            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
-            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
-          int cur_bottom_grad_pos =
-              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
-          T weight =
-              DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
-                                    cur_w + dx, height, width);
-
-          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-inline void DeformableCol2im(const platform::CUDADeviceContext& ctx,
-                             const T* data_col, const T* data_offset,
-                             const std::vector<int64_t> im_shape,
-                             const std::vector<int64_t> col_shape,
-                             const std::vector<int64_t> kernel_shape,
-                             const std::vector<int> pad,
-                             const std::vector<int> stride,
-                             const std::vector<int> dilation,
-                             const int deformable_group, T* grad_im) {
-  int channel_per_deformable_group = im_shape[0] / deformable_group;
-  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-  int blocks = NumBlock(num_kernels);
-  int threads = kNumCUDAThread;
-
-  DeformableCol2imCUDAKernel<T><<<
-      blocks, threads, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      num_kernels, data_col, data_offset, im_shape[0], im_shape[1], im_shape[2],
-      kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0], stride[1],
-      dilation[0], dilation[1], channel_per_deformable_group, col_shape[1],
-      deformable_group, col_shape[2], col_shape[3], grad_im);
-}
-
-template <typename T>
-__global__ void DeformableCol2imCoordCUDAKernel(
-    const int nthreads, const T* data_col, const T* data_im,
-    const T* data_offset, const int channels, const int height, const int width,
-    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w, const int dilation_h,
-    const int dilation_w, const int channel_per_deformable_group,
-    const int batch_size, const int offset_channels, const int deformable_group,
-    const int height_col, const int width_col, T* grad_offset) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    T val = 0, mval = 0;
-    const int w = i % width_col;
-    const int h = (i / width_col) % height_col;
-    const int c = (i / width_col / height_col) % offset_channels;
-    const int b = (i / width_col / height_col) / offset_channels;
-
-    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
-    const int col_step = kernel_h * kernel_w;
-    int cnt = 0;
-    const T* data_col_ptr = data_col +
-                            deformable_group_index *
-                                channel_per_deformable_group * batch_size *
-                                width_col * height_col;
-    const T* data_im_ptr = data_im +
-                           (b * deformable_group + deformable_group_index) *
-                               channel_per_deformable_group / kernel_h /
-                               kernel_w * height * width;
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
-
-    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
-
-    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
-         col_c += col_step) {
-      const int col_pos =
-          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
-      const int bp_dir = offset_c % 2;
-
-      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
-      int i =
-          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
-      int w_out = col_pos % width_col;
-      int h_out = (col_pos / width_col) % height_col;
-      int w_in = w_out * stride_w - pad_w;
-      int h_in = h_out * stride_h - pad_h;
-      const int data_offset_h_ptr =
-          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
-      const int data_offset_w_ptr =
-          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
-           w_out);
-      const T offset_h = data_offset_ptr[data_offset_h_ptr];
-      const T offset_w = data_offset_ptr[data_offset_w_ptr];
-      T inv_h = h_in + i * dilation_h + offset_h;
-      T inv_w = w_in + j * dilation_w + offset_w;
-      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
-        inv_h = inv_w = -2;
-      } else {
-        mval += data_col_ptr[col_pos] *
-                DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
-                                   height, width, inv_h, inv_w);
-      }
-      const T weight = DmcnGetCoordinateWeight(
-          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
-          width, bp_dir);
-      val += weight * data_col_ptr[col_pos];
-      cnt += 1;
-    }
-    grad_offset[i] = val;
-  }
-}
-
-template <typename T>
-inline void DeformableCol2imCoord(
-    const platform::CUDADeviceContext& ctx, const T* data_col, const T* data_im,
-    const T* data_offset, const std::vector<int64_t> im_shape,
-    const std::vector<int64_t> col_shape,
-    const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
-    const std::vector<int> strides, const std::vector<int> dilations,
-    const int deformable_groups, T* grad_offset) {
-  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
-                    col_shape[2] * col_shape[3] * deformable_groups;
-  int channel_per_deformable_group = col_shape[0] / deformable_groups;
-  int blocks = NumBlock(num_kernels);
-  int threads = kNumCUDAThread;
-
-  DeformableCol2imCoordCUDAKernel<T><<<
-      blocks, threads, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      num_kernels, data_col, data_im, data_offset, im_shape[0], im_shape[1],
-      im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0], paddings[1],
-      strides[0], strides[1], dilations[0], dilations[1],
-      channel_per_deformable_group, col_shape[1],
-      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
-      deformable_groups, col_shape[2], col_shape[3], grad_offset);
-}
-
-template <typename T>
-__global__ void DeformableIm2colCUDAKernel(
-    const int nthreads, const T* data_im, const T* data_offset,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int num_channels, const int deformable_group, const int height_col,
-    const int width_col, T* data_col) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
-
-    const int deformable_group_index = c_im / channel_per_deformable_group;
-
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
-
-    T* data_col_ptr =
-        data_col +
-        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
-    const T* data_im_ptr =
-        data_im + (b_col * num_channels + c_im) * height * width;
-    const T* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
-
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
-            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
-            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
-            w_col;
-
-        const T offset_h = data_offset_ptr[data_offset_h_ptr];
-        const T offset_w = data_offset_ptr[data_offset_w_ptr];
-        T val = static_cast<T>(0);
-        const T h_im = h_in + i * dilation_h + offset_h;
-        const T w_im = w_in + j * dilation_w + offset_w;
-        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
-          val =
-              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
-        }
-        *data_col_ptr = val;
-        data_col_ptr += batch_size * height_col * width_col;
-      }
-    }
-  }
-}
-
-template <typename T>
-inline void DeformableIm2col(const platform::CUDADeviceContext& ctx,
-                             const T* data_im, const T* data_offset,
-                             const std::vector<int64_t> im_shape,
-                             const std::vector<int64_t> col_shape,
-                             const std::vector<int64_t> filter_shape,
-                             const std::vector<int> paddings,
-                             const std::vector<int> strides,
-                             const std::vector<int> dilations,
-                             const int deformable_groups, T* data_col) {
-  int channel_per_deformable_group = im_shape[0] / deformable_groups;
-  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-
-  int blocks = NumBlock(num_kernels);
-  int threads = kNumCUDAThread;
-
-  // get outputs of im2col with offset by bilinear interpolation
-  DeformableIm2colCUDAKernel<T><<<
-      blocks, threads, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      num_kernels, data_im, data_offset, im_shape[1], im_shape[2],
-      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
-      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
-      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
-      data_col);
-}
-
-template <typename T>
-class DeformableConvV1CUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const Tensor offset = *ctx.Input<Tensor>("Offset");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    Tensor* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<CUDADeviceContext>();
-
-    const int groups = ctx.Attr<int>("groups");
-    const int deformable_groups = ctx.Attr<int>("deformable_groups");
-    const int im2col_step = ctx.Attr<int>("im2col_step");
-    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-
-    // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer =
-        ctx.AllocateTmpTensor<T, CUDADeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, CUDADeviceContext>(output_shape, dev_ctx);
-
-    int64_t M = output_shape_vec[1] / groups;
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K =
-        input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(
-        framework::make_ddim({groups, M, K}));
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer)
-        .Resize(framework::make_ddim({groups, K, N}));
-    Tensor output_4d;
-    output_4d.ShareDataWith(output_buffer)
-        .Resize(framework::make_ddim({batch_size / im2col_step, groups, M, N}));
-    output_4d.mutable_data<T>(ctx.GetPlace());
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
-
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset.numel() / offset.dims()[0];
-
-    auto blas = math::GetBlas<CUDADeviceContext, T>(dev_ctx);
-
-    const T* input_ptr = input->data<T>();
-    const T* offset_ptr = offset.data<T>();
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    T* col_buffer_ptr = col_buffer.data<T>();
-
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      DeformableIm2col(dev_ctx, input_ptr + i * im2col_step * input_dim,
-                       offset_ptr + i * im2col_step * input_offset_dim,
-                       input_shape_vec, col_buffer_shape_vec, filter_shape_vec,
-                       paddings, strides, dilations, deformable_groups,
-                       col_buffer_ptr);
-
-      Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
-          framework::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
-      // get the product of pixel and weight
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice =
-            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-        Tensor output_3d_slice =
-            output_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                output_3d.dims(), 1, output_3d.dims().size()));
-
-        blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
-                    &output_3d_slice, T(0.0));
-      }
-    }
-    output->ShareDataWith(output_buffer)
-        .Resize(framework::make_ddim(output_shape_vec));
-  }
-};
-
-template <typename T>
-class DeformableConvV1GradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* output_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-    Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
-
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    Tensor offset = *ctx.Input<Tensor>("Offset");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    if (!input_grad && !filter_grad && !offset_grad) return;
-
-    int groups = ctx.Attr<int>("groups");
-    int deformable_groups = ctx.Attr<int>("deformable_groups");
-    int im2col_step = ctx.Attr<int>("im2col_step");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    auto& dev_ctx = ctx.template device_context<CUDADeviceContext>();
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(
-        framework::vectorize(output_grad->dims()));
-
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer =
-        ctx.AllocateTmpTensor<T, CUDADeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, CUDADeviceContext>(output_shape, dev_ctx);
-
-    output_buffer.ShareDataWith(*output_grad);
-
-    int64_t M =
-        input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K = output_shape_vec[1] / groups;
-
-    framework::DDim weight_3d_shape = {groups, K, M};
-    framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
-                                         N};
-    framework::DDim col_buffer_3d_shape = {groups, M, N};
-    framework::DDim filter_grad_shape = {groups, K, M};
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
-    Tensor out_grad_4d;
-    out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
-
-    math::SetConstant<CUDADeviceContext, T> set_zero;
-    auto blas = math::GetBlas<CUDADeviceContext, T>(dev_ctx);
-
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    col_buffer_3d.mutable_data<T>(ctx.GetPlace());
-    out_grad_4d.mutable_data<T>(ctx.GetPlace());
-
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset.numel() / offset.dims()[0];
-
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(ctx.GetPlace());
-      filter_grad->Resize(filter_grad_shape);
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-    }
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, input_grad, static_cast<T>(0));
-    }
-
-    if (offset_grad) {
-      offset_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, offset_grad, static_cast<T>(0));
-    }
-
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      Tensor out_grad_3d =
-          out_grad_4d.Slice(i, i + 1).Resize(framework::slice_ddim(
-              out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice =
-            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor out_grad_3d_slice =
-            out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-
-        blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
-                    &col_buffer_3d_slice, T(0.0));
-      }
-      col_buffer.Resize(col_shape);
-
-      T* col_buffer_ptr = col_buffer.data<T>();
-      const T* input_ptr = input->data<T>();
-      const T* offset_ptr = offset.data<T>();
-
-      if (offset_grad) {
-        T* offset_grad_ptr = offset_grad->data<T>();
-        // get grad of offset
-        DeformableCol2imCoord(
-            dev_ctx, col_buffer_ptr, input_ptr + i * im2col_step * input_dim,
-            offset_ptr + i * im2col_step * input_offset_dim, input_shape_vec,
-            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
-            dilations, deformable_groups,
-            offset_grad_ptr + i * im2col_step * input_offset_dim);
-      }
-      if (input_grad) {
-        T* input_grad_ptr = input_grad->data<T>();
-        // get grad of input
-        DeformableCol2im(dev_ctx, col_buffer_ptr,
-                         offset_ptr + i * im2col_step * input_offset_dim,
-                         input_shape_vec, col_buffer_shape_vec,
-                         filter_shape_vec, paddings, strides, dilations,
-                         deformable_groups,
-                         input_grad_ptr + i * im2col_step * input_dim);
-        input_grad->Resize(input->dims());
-      }
-
-      DeformableIm2col(dev_ctx, input_ptr + i * im2col_step * input_dim,
-                       offset_ptr + i * im2col_step * input_offset_dim,
-                       input_shape_vec, col_buffer_shape_vec, filter_shape_vec,
-                       paddings, strides, dilations, deformable_groups,
-                       col_buffer_ptr);
-
-      col_buffer_3d.Resize(col_buffer_3d_shape);
-
-      if (filter_grad) {
-        Tensor dweight_3d;
-        dweight_3d = ctx.AllocateTmpTensor<T, CUDADeviceContext>(
-            filter_grad_shape, dev_ctx);
-        for (int g = 0; g < groups; ++g) {
-          Tensor out_grad_3d_slice =
-              out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-          Tensor col_buffer_3d_slice =
-              col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-          Tensor dweight_3d_slice =
-              dweight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  dweight_3d.dims(), 1, dweight_3d.dims().size()));
-
-          blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
-                      T(1.0), &dweight_3d_slice, T(0.0));
-        }
-        FilterGradAddupCUDAKernel<T><<<NumBlock(dweight_3d.numel()),
-                                       kNumCUDAThread, 0, dev_ctx.stream()>>>(
-            dweight_3d.numel(), groups, K, M, dweight_3d.data<T>(),
-            filter_grad->data<T>());
-      }
-    }
-    if (filter_grad) {
-      filter_grad->Resize(filter.dims());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(deformable_conv_v1,
-                        ops::DeformableConvV1CUDAKernel<float>);
-REGISTER_OP_CUDA_KERNEL(deformable_conv_v1_grad,
-                        ops::DeformableConvV1GradCUDAKernel<float>);
diff --git a/paddle/fluid/operators/deformable_conv_v1_op.h b/paddle/fluid/operators/deformable_conv_v1_op.h
deleted file mode 100644
index 89dc10cfa3375033eca4d4c64f51b2f624697f0d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_conv_v1_op.h
+++ /dev/null
@@ -1,564 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Part of the following code in this file refs to
-// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
-//
-// Copyright (c) 2017 Microsoft
-// Licensed under The Apache-2.0 License [see LICENSE for details]
-// \file deformable_psroi_pooling.cu
-// \brief
-// \author Yi Li, Guodong Zhang, Jifeng Dai
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/deformable_conv_func.h"
-#include "paddle/fluid/operators/deformable_conv_op.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using CPUDeviceContext = platform::CPUDeviceContext;
-
-template <typename T>
-void DeformableCol2imCPUKernel(
-    const int num_kernels, const T* data_col, const T* data_offset,
-    const int channels, const int height, const int width, const int kernel_h,
-    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int deformable_group, const int height_col, const int width_col,
-    T* grad_im) {
-  for (size_t thread = 0; thread < num_kernels; thread++) {
-    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
-    const int i =
-        (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
-    const int c =
-        thread / width_col / height_col / batch_size / kernel_w / kernel_h;
-
-    const int deformable_group_index = c / channel_per_deformable_group;
-
-    int w_out = thread % width_col;
-    int h_out = (thread / width_col) % height_col;
-    int b = (thread / width_col / height_col) % batch_size;
-    int w_in = w_out * stride_w - pad_w;
-    int h_in = h_out * stride_h - pad_h;
-
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
-    const int data_offset_h_ptr =
-        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
-    const int data_offset_w_ptr =
-        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
-    const T offset_h = data_offset_ptr[data_offset_h_ptr];
-    const T offset_w = data_offset_ptr[data_offset_w_ptr];
-    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
-    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
-
-    const T cur_top_grad = data_col[thread];
-    const int cur_h = static_cast<int>(cur_inv_h_data);
-    const int cur_w = static_cast<int>(cur_inv_w_data);
-    for (int dy = -2; dy <= 2; dy++) {
-      for (int dx = -2; dx <= 2; dx++) {
-        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
-            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
-            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
-          int cur_bottom_grad_pos =
-              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
-          T weight =
-              DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
-                                    cur_w + dx, height, width);
-
-          *(grad_im + cur_bottom_grad_pos) =
-              *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-inline void DeformableCol2imCPU(const platform::CPUDeviceContext& ctx,
-                                const T* data_col, const T* data_offset,
-                                const std::vector<int64_t> im_shape,
-                                const std::vector<int64_t> col_shape,
-                                const std::vector<int64_t> kernel_shape,
-                                const std::vector<int> pad,
-                                const std::vector<int> stride,
-                                const std::vector<int> dilation,
-                                const int deformable_group, T* grad_im) {
-  int channel_per_deformable_group = im_shape[0] / deformable_group;
-  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-
-  DeformableCol2imCPUKernel(
-      num_kernels, data_col, data_offset, im_shape[0], im_shape[1], im_shape[2],
-      kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0], stride[1],
-      dilation[0], dilation[1], channel_per_deformable_group, col_shape[1],
-      deformable_group, col_shape[2], col_shape[3], grad_im);
-}
-
-template <typename T>
-void DeformableCol2imCoordCPUKernel(
-    const int num_kernels, const T* data_col, const T* data_im,
-    const T* data_offset, const int channels, const int height, const int width,
-    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w, const int dilation_h,
-    const int dilation_w, const int channel_per_deformable_group,
-    const int batch_size, const int offset_channels, const int deformable_group,
-    const int height_col, const int width_col, T* grad_offset) {
-  for (size_t i = 0; i < num_kernels; i++) {
-    T val = 0, mval = 0;
-    const int w = i % width_col;
-    const int h = (i / width_col) % height_col;
-    const int c = (i / width_col / height_col) % offset_channels;
-    const int b = (i / width_col / height_col) / offset_channels;
-
-    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
-    const int col_step = kernel_h * kernel_w;
-    int cnt = 0;
-    const T* data_col_ptr = data_col +
-                            deformable_group_index *
-                                channel_per_deformable_group * batch_size *
-                                width_col * height_col;
-    const T* data_im_ptr = data_im +
-                           (b * deformable_group + deformable_group_index) *
-                               channel_per_deformable_group / kernel_h /
-                               kernel_w * height * width;
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
-
-    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
-
-    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
-         col_c += col_step) {
-      const int col_pos =
-          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
-      const int bp_dir = offset_c % 2;
-
-      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
-      int i =
-          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
-      int w_out = col_pos % width_col;
-      int h_out = (col_pos / width_col) % height_col;
-      int w_in = w_out * stride_w - pad_w;
-      int h_in = h_out * stride_h - pad_h;
-      const int data_offset_h_ptr =
-          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
-      const int data_offset_w_ptr =
-          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
-           w_out);
-      const T offset_h = data_offset_ptr[data_offset_h_ptr];
-      const T offset_w = data_offset_ptr[data_offset_w_ptr];
-      T inv_h = h_in + i * dilation_h + offset_h;
-      T inv_w = w_in + j * dilation_w + offset_w;
-      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
-        inv_h = inv_w = -2;
-      } else {
-        mval += data_col_ptr[col_pos] *
-                DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
-                                   height, width, inv_h, inv_w);
-      }
-      const T weight = DmcnGetCoordinateWeight(
-          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
-          width, bp_dir);
-      val += weight * data_col_ptr[col_pos];
-      cnt += 1;
-    }
-    grad_offset[i] = val;
-  }
-}
-
-template <typename T>
-inline void DeformableCol2imCoordCPU(
-    const platform::CPUDeviceContext& ctx, const T* data_col, const T* data_im,
-    const T* data_offset, const std::vector<int64_t> im_shape,
-    const std::vector<int64_t> col_shape,
-    const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
-    const std::vector<int> strides, const std::vector<int> dilations,
-    const int deformable_groups, T* grad_offset) {
-  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
-                    col_shape[2] * col_shape[3] * deformable_groups;
-  int channel_per_deformable_group = col_shape[0] / deformable_groups;
-
-  DeformableCol2imCoordCPUKernel(
-      num_kernels, data_col, data_im, data_offset, im_shape[0], im_shape[1],
-      im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0], paddings[1],
-      strides[0], strides[1], dilations[0], dilations[1],
-      channel_per_deformable_group, col_shape[1],
-      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
-      deformable_groups, col_shape[2], col_shape[3], grad_offset);
-}
-
-template <typename T>
-void DeformableIm2colCPUKernel(
-    const int num_kernels, const T* data_im, const T* data_offset,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int num_channels, const int deformable_group, const int height_col,
-    const int width_col, T* data_col) {
-  for (size_t i = 0; i < num_kernels; i++) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
-
-    const int deformable_group_index = c_im / channel_per_deformable_group;
-
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
-
-    T* data_col_ptr =
-        data_col +
-        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
-    const T* data_im_ptr =
-        data_im + (b_col * num_channels + c_im) * height * width;
-    const T* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
-
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
-            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
-            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
-            w_col;
-
-        const T offset_h = data_offset_ptr[data_offset_h_ptr];
-        const T offset_w = data_offset_ptr[data_offset_w_ptr];
-        T val = static_cast<T>(0);
-        const T h_im = h_in + i * dilation_h + offset_h;
-        const T w_im = w_in + j * dilation_w + offset_w;
-        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
-          val =
-              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
-        }
-        *data_col_ptr = val;
-        data_col_ptr += batch_size * height_col * width_col;
-      }
-    }
-  }
-}
-
-template <typename T>
-inline void DeformableIm2colCPU(const platform::CPUDeviceContext& ctx,
-                                const T* data_im, const T* data_offset,
-                                const std::vector<int64_t> im_shape,
-                                const std::vector<int64_t> col_shape,
-                                const std::vector<int64_t> filter_shape,
-                                const std::vector<int> paddings,
-                                const std::vector<int> strides,
-                                const std::vector<int> dilations,
-                                const int deformable_groups, T* data_col) {
-  int channel_per_deformable_group = im_shape[0] / deformable_groups;
-  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-
-  // get outputs of im2col with offset by bilinear interpolation
-  DeformableIm2colCPUKernel(
-      num_kernels, data_im, data_offset, im_shape[1], im_shape[2],
-      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
-      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
-      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
-      data_col);
-}
-
-template <typename T>
-class DeformableConvV1CPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* offset = ctx.Input<Tensor>("Offset");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    Tensor* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
-
-    const int groups = ctx.Attr<int>("groups");
-    const int deformable_groups = ctx.Attr<int>("deformable_groups");
-    const int im2col_step = ctx.Attr<int>("im2col_step");
-    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-
-    // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);
-    int64_t M = output_shape_vec[1] / groups;
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K =
-        input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(
-        framework::make_ddim({groups, M, K}));
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer)
-        .Resize(framework::make_ddim({groups, K, N}));
-    Tensor output_4d;
-    output_4d.ShareDataWith(output_buffer)
-        .Resize(framework::make_ddim({batch_size / im2col_step, groups, M, N}));
-    output_4d.mutable_data<T>(ctx.GetPlace());
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset->numel() / offset->dims()[0];
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
-    const T* input_ptr = input->data<T>();
-    const T* offset_ptr = offset->data<T>();
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    T* col_buffer_ptr = col_buffer.data<T>();
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      DeformableIm2colCPU(dev_ctx, input_ptr + i * im2col_step * input_dim,
-                          offset_ptr + i * im2col_step * input_offset_dim,
-                          input_shape_vec, col_buffer_shape_vec,
-                          filter_shape_vec, paddings, strides, dilations,
-                          deformable_groups, col_buffer_ptr);
-      Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
-          framework::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
-      // get the product of pixel and weight
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice =
-            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-        Tensor output_3d_slice =
-            output_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                output_3d.dims(), 1, output_3d.dims().size()));
-        blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
-                    &output_3d_slice, T(0.0));
-      }
-    }
-    output->ShareDataWith(output_buffer)
-        .Resize(framework::make_ddim(output_shape_vec));
-  }
-};
-
-template <typename T>
-class DeformableConvV1GradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* output_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-    Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
-
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    Tensor offset = *ctx.Input<Tensor>("Offset");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    if (!input_grad && !filter_grad && !offset_grad) return;
-
-    int groups = ctx.Attr<int>("groups");
-    int deformable_groups = ctx.Attr<int>("deformable_groups");
-    int im2col_step = ctx.Attr<int>("im2col_step");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    framework::DDim input_shape =
-        framework::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
-    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(
-        framework::vectorize(output_grad->dims()));
-
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);
-
-    output_buffer.ShareDataWith(*output_grad);
-
-    int64_t M =
-        input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K = output_shape_vec[1] / groups;
-
-    framework::DDim weight_3d_shape = {groups, K, M};
-    framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
-                                         N};
-    framework::DDim col_buffer_3d_shape = {groups, M, N};
-    framework::DDim filter_grad_shape = {groups, K, M};
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
-    Tensor out_grad_4d;
-    out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
-
-    math::SetConstant<CPUDeviceContext, T> set_zero;
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
-
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    col_buffer_3d.mutable_data<T>(ctx.GetPlace());
-    out_grad_4d.mutable_data<T>(ctx.GetPlace());
-
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset.numel() / offset.dims()[0];
-
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(ctx.GetPlace());
-      filter_grad->Resize(filter_grad_shape);
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-    }
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, input_grad, static_cast<T>(0));
-    }
-
-    if (offset_grad) {
-      offset_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, offset_grad, static_cast<T>(0));
-    }
-
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      Tensor out_grad_3d =
-          out_grad_4d.Slice(i, i + 1).Resize(framework::slice_ddim(
-              out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice =
-            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor out_grad_3d_slice =
-            out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-
-        blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
-                    &col_buffer_3d_slice, T(0.0));
-      }
-      col_buffer.Resize(col_shape);
-
-      T* col_buffer_ptr = col_buffer.data<T>();
-      const T* input_ptr = input->data<T>();
-      const T* offset_ptr = offset.data<T>();
-
-      if (offset_grad) {
-        T* offset_grad_ptr = offset_grad->data<T>();
-        // get grad of offset
-        DeformableCol2imCoordCPU(
-            dev_ctx, col_buffer_ptr, input_ptr + i * im2col_step * input_dim,
-            offset_ptr + i * im2col_step * input_offset_dim, input_shape_vec,
-            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
-            dilations, deformable_groups,
-            offset_grad_ptr + i * im2col_step * input_offset_dim);
-      }
-      if (input_grad) {
-        T* input_grad_ptr = input_grad->data<T>();
-        // get grad of input
-        DeformableCol2imCPU(dev_ctx, col_buffer_ptr,
-                            offset_ptr + i * im2col_step * input_offset_dim,
-                            input_shape_vec, col_buffer_shape_vec,
-                            filter_shape_vec, paddings, strides, dilations,
-                            deformable_groups,
-                            input_grad_ptr + i * im2col_step * input_dim);
-        input_grad->Resize(input->dims());
-      }
-
-      DeformableIm2colCPU(dev_ctx, input_ptr + i * im2col_step * input_dim,
-                          offset_ptr + i * im2col_step * input_offset_dim,
-                          input_shape_vec, col_buffer_shape_vec,
-                          filter_shape_vec, paddings, strides, dilations,
-                          deformable_groups, col_buffer_ptr);
-
-      col_buffer_3d.Resize(col_buffer_3d_shape);
-
-      if (filter_grad) {
-        Tensor dweight_3d;
-        dweight_3d = ctx.AllocateTmpTensor<T, CPUDeviceContext>(
-            filter_grad_shape, dev_ctx);
-        for (int g = 0; g < groups; ++g) {
-          Tensor out_grad_3d_slice =
-              out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-          Tensor col_buffer_3d_slice =
-              col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-          Tensor dweight_3d_slice =
-              dweight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
-                  dweight_3d.dims(), 1, dweight_3d.dims().size()));
-
-          blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
-                      T(1.0), &dweight_3d_slice, T(0.0));
-        }
-        // update grad of weights
-        FilterGradAddupCPUKernel(dweight_3d.numel(), groups, K, M,
-                                 dweight_3d.data<T>(), filter_grad->data<T>());
-      }
-    }
-    if (filter_grad) {
-      filter_grad->Resize(filter.dims());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc
deleted file mode 100644
index d17f22b9b4f7641f7d69e0056e19762945f2d05c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc
+++ /dev/null
@@ -1,270 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
-#include <iostream>
-#include <memory>
-#include <vector>
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(Tensor), "
-             "the input of Deformable PSROIPooling. "
-             "The shape of input tensor is [N,C,H,W]. Where N is batch size, "
-             "C is number of input channels, "
-             "H is height of the feature, and "
-             "W is the width of the feature.");
-    AddInput("ROIs",
-             "(LoDTensor), "
-             "ROIs (Regions of Interest) to pool over. "
-             "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
-             "given as [[x1, y1, x2, y2], ...]. "
-             "(x1, y1) is the top left coordinates, and "
-             "(x2, y2) is the bottom right coordinates.");
-    AddInput("Trans",
-             "(Tensor),"
-             "offset of features on ROIs while pooling. "
-             "The format is NCHW, where N is number of ROIs, "
-             "C is number of channels, which indicate the offset distance "
-             "in the x and y directions, "
-             "H is pooled height, and "
-             "W is pooled width.");
-    AddAttr<bool>("no_trans",
-                  "(bool), "
-                  "whether add offset to get new value or not while roi "
-                  "pooling, which value is True or False");
-    AddAttr<float>("spatial_scale",
-                   "(float), "
-                   "ratio of input feature map height (or width) to "
-                   "raw image height (or width). Equals the reciprocal "
-                   "of total stride in convolutional layers.");
-    AddAttr<int>("output_dim",
-                 "(int), "
-                 "the number of output channels, which should be less than "
-                 "input channels. Deformable roi_pooling requires "
-                 "output_channels = input_channels, while deformable "
-                 "psroi_pooling requires output_channels = input_channels "
-                 "* pooled_height * pooled_width");
-    AddAttr<std::vector<int>>(
-        "group_size",
-        "(vector<int>), "
-        "the number of groups which input channels are divided."
-        "(eg.number of input channels is k1*k2*(C+1), which k1 and k2 "
-        "are group width and height and C+1 is number of output "
-        "chanels. eg.(4, 6), which 4 is height of group and 6 is "
-        "width of group");
-    AddAttr<int>("pooled_height",
-                 "(int), "
-                 "the pooled output height.");
-    AddAttr<int>("pooled_width",
-                 "(int), "
-                 "the pooled output width.");
-    AddAttr<std::vector<int>>(
-        "part_size",
-        "(vector<int>), "
-        "the height and width of offset, eg.(4, 6), which height is 4 "
-        " and width is 6");
-    AddAttr<int>("sample_per_part",
-                 "(int), "
-                 "the number of samples in each bin");
-    AddAttr<float>("trans_std",
-                   "(float), "
-                   "Coefficient of offset");
-    AddOutput("TopCount",
-              "(Tensor), "
-              "record the number of pixel in average pooling to in each bin. "
-              "The format is NCHW, where N is the number of ROIs, "
-              "C is the number of output channels, "
-              "H is the height of output, and "
-              "W is the width of output.");
-    AddOutput("Output",
-              "(Tensor), "
-              "the output of Deformable PSROIPooling. "
-              "The format is NCHW, where N is the number of ROIs, "
-              "C is the number of output channels, "
-              "H is the height of output, and "
-              "W is thewidth of output. ");
-    AddComment(R"DOC(
-**DeformablePSROIPooling Operator**
-DeformablePSROIPooling is a new method based Region of interest pooling 
-(also known as RoI pooling).
-The operator has four steps:
-
-1. Dividing each region proposal into equal-sized sections with
-   the pooled_width and pooled_height.
-
-2. Add offset to pixel in ROI to get new location and the new value which are
-   computed directly through bilinear interpolation with four nearest pixel.
-
-3. Sample several points to get average values in each bin.
-
-4. Copying these average values to the output buffer.
-
-DeformablePSROIPooling is part of Deformable Convolutional Networks,
-please refer to https://arxiv.org/abs/1703.06211 for more details.
-    )DOC");
-  }
-};
-
-class DeformablePSROIPoolOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of DeformablePSROIPoolOp"
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
-                   "Input(ROIs) of DeformablePSROIPoolOp "
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Trans"),
-                   "Input(Trans) of DeformablePSROIPoolOp "
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                   "Output(Output) of DeformablePSROIPoolOp "
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("TopCount"),
-                   "Output(TopCount) of DeformablePSROIPoolOp "
-                   "should not be null.");
-    auto input_dims = ctx->GetInputDim("Input");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-    auto trans_dims = ctx->GetInputDim("Trans");
-    PADDLE_ENFORCE(rois_dims.size() == 2,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[ x1, y1, x2, y2], ...].");
-    PADDLE_ENFORCE(trans_dims.size() == 4,
-                   "The format of Input Trans is (N, 2, H, W).");
-    auto pooled_height = ctx->Attrs().Get<int>("pooled_height");
-    auto pooled_width = ctx->Attrs().Get<int>("pooled_width");
-    auto spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-    auto output_channels = ctx->Attrs().Get<int>("output_dim");
-    auto group_size = ctx->Attrs().Get<std::vector<int>>("group_size");
-    auto group_height = group_size[0];
-    auto group_width = group_size[1];
-    auto part_size = ctx->Attrs().Get<std::vector<int>>("part_size");
-    auto part_height = part_size[0];
-    auto part_width = part_size[1];
-    auto sample_per_part = ctx->Attrs().Get<int>("sample_per_part");
-    auto trans_std = ctx->Attrs().Get<float>("trans_std");
-    PADDLE_ENFORCE(trans_std >= 0.0f, "trans_std must greater than 0.0");
-    PADDLE_ENFORCE(input_dims[1] >= output_channels,
-                   "input channels must greater than out_channels");
-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled height must greater than 0");
-    PADDLE_ENFORCE_GT(pooled_width, 0, "The pooled width must greater than 0");
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      "The spatial scale must greater than 0");
-    PADDLE_ENFORCE_EQ(group_size.size(), 2,
-                      "The size of group_size should be 2.");
-    PADDLE_ENFORCE_GT(group_height, 0,
-                      "The group_height in group_size must greater than 0");
-    PADDLE_ENFORCE_GT(group_width, 0,
-                      "The group_width in group_size must greater than 0");
-    PADDLE_ENFORCE_EQ(part_size.size(), 2,
-                      "The size of part_size should be 2.");
-    PADDLE_ENFORCE_GT(part_height, 0,
-                      "The part_height in part_size must greater than 0");
-    PADDLE_ENFORCE_GT(part_width, 0,
-                      "The part_width in part_size must greater than 0");
-    PADDLE_ENFORCE(part_height <= trans_dims[2],
-                   "The height of trans must greater than part_height");
-    PADDLE_ENFORCE(part_width <= trans_dims[3],
-                   "The width of trans must greater than part_width");
-    PADDLE_ENFORCE_GT(sample_per_part, 0,
-                      "The sample_per_part must greater than 0");
-    auto out_dims = input_dims;
-    out_dims[0] = rois_dims[0];
-    out_dims[1] = output_channels;
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-    ctx->SetOutputDim("Output", out_dims);
-    ctx->SetOutputDim("TopCount", out_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class DeformablePSROIPoolGradOpDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("deformable_psroi_pooling_grad");
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("Trans", Input("Trans"));
-    op->SetInput("ROIs", Input("ROIs"));
-    op->SetInput("TopCount", Output("TopCount"));
-    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
-
-    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    op->SetOutput(framework::GradVarName("Trans"), InputGrad("Trans"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class DeformablePSROIPoolGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Output")),
-                   "The gradient of Output should not be null.");
-    if (ctx->HasOutput(framework::GradVarName("Input"))) {
-      ctx->SetOutputDim(framework::GradVarName("Input"),
-                        ctx->GetInputDim("Input"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Trans"))) {
-      ctx->SetOutputDim(framework::GradVarName("Trans"),
-                        ctx->GetInputDim("Trans"));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Trans")->type(),
-                                   ctx.device_context());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-REGISTER_OPERATOR(deformable_psroi_pooling, ops::DeformablePSROIPoolOp,
-                  ops::DeformablePSROIPoolOpMaker,
-                  ops::DeformablePSROIPoolGradOpDescMaker);
-REGISTER_OPERATOR(deformable_psroi_pooling_grad,
-                  ops::DeformablePSROIPoolGradOp);
-REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling,
-                       ops::DeformablePSROIPoolCPUKernel<CPU, float>,
-                       ops::DeformablePSROIPoolCPUKernel<CPU, double>);
-REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling_grad,
-                       ops::DeformablePSROIPoolGradCPUKernel<CPU, float>,
-                       ops::DeformablePSROIPoolGradCPUKernel<CPU, double>);
diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu
deleted file mode 100644
index 4bf0416725b7f210345e7e09fb1951697d8575f7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu
+++ /dev/null
@@ -1,529 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Part of the following code in this file refs to
-// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_psroi_pooling.cu
-//
-// Copyright (c) 2017 Microsoft
-// Licensed under The Apache-2.0 License [see LICENSE for details]
-// \file deformable_psroi_pooling.cu
-// \brief
-// \author Yi Li, Guodong Zhang, Jifeng Dai
-
-#pragma once
-#include <stdio.h>
-#include <algorithm>
-#include <iostream>
-#include <limits>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-const int CUDA_NUM_THREADS = 1024;
-static inline int GET_BLOCKS(const int N) {
-  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
-}
-
-template <typename T>
-__device__ T bilinear_interpolation(const T* data, const T x, const T y,
-                                    const int width, const int height) {
-  int x1 = floor(x);
-  int x2 = ceil(x);
-  int y1 = floor(y);
-  int y2 = ceil(y);
-  T dist_x = static_cast<T>(x - x1);
-  T dist_y = static_cast<T>(y - y1);
-  T value11 = data[y1 * width + x1];
-  T value12 = data[y2 * width + x1];
-  T value21 = data[y1 * width + x2];
-  T value22 = data[y2 * width + x2];
-  T value = (1 - dist_x) * (1 - dist_y) * value11 +
-            (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 +
-            dist_x * dist_y * value22;
-  return value;
-}
-
-template <typename T>
-__global__ void DeformablePSROIPoolForwardKernel(
-    const int count, const T* bottom_data, const T spatial_scale,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const T* bottom_rois,
-    const T* bottom_trans, const bool no_trans, const T trans_std,
-    const int sample_per_part, const int output_dim, const int group_height,
-    const int group_width, const int part_height, const int part_width,
-    const int num_classes, const int channels_each_class, T* top_data,
-    T* top_count, int* roi_batch_id_data) {
-  CUDA_KERNEL_LOOP(index, count) {
-    // The output is in order (n, ctop, ph, pw)
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int ctop = (index / pooled_width / pooled_height) % output_dim;
-    int n = index / pooled_width / pooled_height / output_dim;
-    const T* offset_bottom_rois = bottom_rois + n * 4;
-    int roi_batch_ind = roi_batch_id_data[n];
-
-    // location of roi on feature map
-    T roi_start_w =
-        static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
-    T roi_start_h =
-        static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
-    T roi_end_w =
-        static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
-    T roi_end_h =
-        static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
-
-    // width and height of roi
-    T roi_width = max(roi_end_w - roi_start_w, 0.1);  // avoid 0
-    T roi_height = max(roi_end_h - roi_start_h, 0.1);
-
-    // width and height of each bin
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    // sampling interval ineach bin
-    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
-    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
-
-    // obtain offset of roi
-    int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
-    int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
-    int class_id = ctop / channels_each_class;
-
-    T trans_x =
-        no_trans
-            ? static_cast<T>(0)
-            : bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
-                            part_h) *
-                               part_width +
-                           part_w] *
-                  static_cast<T>(trans_std);
-    T trans_y = no_trans
-                    ? static_cast<T>(0)
-                    : bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
-                                        part_height +
-                                    part_h) *
-                                       part_width +
-                                   part_w] *
-                          static_cast<T>(trans_std);
-
-    // location of start after adding offset
-    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
-    wstart += trans_x * roi_width;
-    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
-    hstart += trans_y * roi_height;
-    T sum = 0;
-    int count = 0;
-    int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
-    int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
-    gw = min(max(gw, 0), group_width - 1);
-    gh = min(max(gh, 0), group_height - 1);
-    const T* offset_bottom_data =
-        bottom_data + (roi_batch_ind * channels) * height * width;
-
-    // sampling in each bin
-    for (int ih = 0; ih < sample_per_part; ih++) {
-      for (int iw = 0; iw < sample_per_part; iw++) {
-        T w = wstart + iw * sub_bin_size_w;
-        T h = hstart + ih * sub_bin_size_h;
-        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
-          continue;
-        }
-        w = min(max(w, 0.), width - 1.);
-        h = min(max(h, 0.), height - 1.);
-        int c = (ctop * group_height + gh) * group_width + gw;
-        // bilinear interpolation
-        T val = bilinear_interpolation(offset_bottom_data + c * height * width,
-                                       w, h, width, height);
-        sum += val;
-        count++;
-      }
-    }
-    top_data[index] = count == 0 ? static_cast<T>(0) : sum / count;
-    top_count[index] = count;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class DeformablePSROIPoolCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const LoDTensor* rois = ctx.Input<LoDTensor>("ROIs");
-    const Tensor* trans = ctx.Input<Tensor>("Trans");
-    Tensor* out = ctx.Output<Tensor>("Output");
-    out->mutable_data<T>(ctx.GetPlace());
-    Tensor* top_count = ctx.Output<Tensor>("TopCount");
-    top_count->mutable_data<T>(ctx.GetPlace());
-
-    auto no_trans = ctx.Attr<bool>("no_trans");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto output_dim = ctx.Attr<int>("output_dim");
-    auto group_size = ctx.Attr<std::vector<int>>("group_size");
-    auto group_height = group_size[0];
-    auto group_width = group_size[1];
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto part_size = ctx.Attr<std::vector<int>>("part_size");
-    auto part_height = part_size[0];
-    auto part_width = part_size[1];
-    auto sample_per_part = ctx.Attr<int>("sample_per_part");
-    auto trans_std = ctx.Attr<float>("trans_std");
-
-    const int batch = static_cast<int>(input->dims()[0]);
-    const int channels = static_cast<int>(input->dims()[1]);
-    const int height = static_cast<int>(input->dims()[2]);
-    const int width = static_cast<int>(input->dims()[3]);
-    const int channels_trans = no_trans ? 2 : trans->dims()[1];
-    const int num_rois = rois->dims()[0];
-    PADDLE_ENFORCE_EQ(num_rois, out->dims()[0],
-                      "number of rois should be same with number of output");
-    const int count = num_rois * output_dim * pooled_height * pooled_width;
-    const int num_classes = no_trans ? 1 : channels_trans / 2;
-    const int channels_each_class =
-        no_trans ? output_dim : output_dim / num_classes;
-    PADDLE_ENFORCE(channels_each_class >= 1,
-                   "channels_each must greater than 1");
-
-    const T* bottom_data = input->data<T>();
-    const T* bottom_rois = rois->data<T>();
-    const T* bottom_trans = no_trans ? NULL : trans->data<T>();
-
-    framework::Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({num_rois});
-    auto cplace = platform::CPUPlace();
-    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch,
-        "The rois_batch_size and imgs batch_size must be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-
-    auto& dev_ctx = ctx.cuda_device_context();
-    int bytes = roi_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
-    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                 dev_ctx.stream());
-
-    T* top_data = out->mutable_data<T>(ctx.GetPlace());
-    T* top_count_data = top_count->mutable_data<T>(ctx.GetPlace());
-
-    DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0,
-                                       dev_ctx.stream()>>>(
-        count, bottom_data, (T)spatial_scale, channels, height, width,
-        pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans,
-        (T)trans_std, sample_per_part, output_dim, group_height, group_width,
-        part_height, part_width, num_classes, channels_each_class, top_data,
-        top_count_data, roi_id_data);
-  }
-};
-
-template <typename T>
-__global__ void DeformablePSROIPoolBackwardAccKernel(
-    const int count, const T* top_diff, const T* top_count, const int num_rois,
-    const T spatial_scale, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int output_dim, T* bottom_data_diff, T* bottom_trans_diff,
-    const T* bottom_data, const T* bottom_rois, const T* bottom_trans,
-    const bool no_trans, const T trans_std, const int sample_per_part,
-    const int group_height, const int group_width, const int part_height,
-    const int part_width, const int num_classes, const int channels_each_class,
-    int* roi_batch_id_data) {
-  CUDA_KERNEL_LOOP(index, count) {
-    // The output is in order (n, ctop, ph, pw)
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int ctop = (index / pooled_width / pooled_height) % output_dim;
-    int n = index / pooled_width / pooled_height / output_dim;
-    int num_box = count / pooled_height / pooled_width / output_dim;
-    const T* offset_bottom_rois = bottom_rois + n * 4;
-    int roi_batch_ind = roi_batch_id_data[n];
-
-    // location of roi on feature map
-    T roi_start_w =
-        static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
-    T roi_start_h =
-        static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
-    T roi_end_w =
-        static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
-    T roi_end_h =
-        static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
-
-    // width and height of roi
-    T roi_width = max(roi_end_w - roi_start_w, 0.1);
-    T roi_height = max(roi_end_h - roi_start_h, 0.1);
-
-    // width and height of each bin
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    // sampling interval in each bin
-    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
-    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
-
-    // obtain offset of roi
-    int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
-    int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
-    int class_id = ctop / channels_each_class;
-
-    T trans_x =
-        no_trans
-            ? static_cast<T>(0)
-            : bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
-                            part_h) *
-                               part_width +
-                           part_w] *
-                  static_cast<T>(trans_std);
-    T trans_y = no_trans
-                    ? static_cast<T>(0)
-                    : bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
-                                        part_height +
-                                    part_h) *
-                                       part_width +
-                                   part_w] *
-                          static_cast<T>(trans_std);
-    // location of start after adding offset
-    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
-    wstart += trans_x * roi_width;
-    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
-    hstart += trans_y * roi_height;
-
-    if (top_count[index] <= 0) {
-      continue;
-    }
-
-    T diff_val = top_diff[index] / top_count[index];
-    const T* offset_bottom_data =
-        bottom_data + roi_batch_ind * channels * height * width;
-    int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
-    int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
-    gw = min(max(gw, 0), group_width - 1);
-    gh = min(max(gh, 0), group_height - 1);
-
-    // sampling in each bin
-    for (int ih = 0; ih < sample_per_part; ih++) {
-      for (int iw = 0; iw < sample_per_part; iw++) {
-        T w = wstart + iw * sub_bin_size_w;
-        T h = hstart + ih * sub_bin_size_h;
-        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
-          continue;
-        }
-        w = min(max(w, 0.), width - 1.);
-        h = min(max(h, 0.), height - 1.);
-        int c = (ctop * group_height + gh) * group_width + gw;
-        int x0 = floor(w);
-        int x1 = ceil(w);
-        int y0 = floor(h);
-        int y1 = ceil(h);
-
-        // compute coefficient of gradient
-        T dist_x = w - x0, dist_y = h - y0;
-        T q00 = (1 - dist_x) * (1 - dist_y);
-        T q01 = (1 - dist_x) * dist_y;
-        T q10 = dist_x * (1 - dist_y);
-        T q11 = dist_x * dist_y;
-        int bottom_index_base = c * height * width;
-
-        // compute gradient of input
-        if (bottom_data_diff) {
-          platform::CudaAtomicAdd(
-              bottom_data_diff + roi_batch_ind * channels * height * width +
-                  bottom_index_base + y0 * width + x0,
-              q00 * diff_val);
-          platform::CudaAtomicAdd(
-              bottom_data_diff + roi_batch_ind * channels * height * width +
-                  bottom_index_base + y1 * width + x0,
-              q01 * diff_val);
-          platform::CudaAtomicAdd(
-              bottom_data_diff + roi_batch_ind * channels * height * width +
-                  bottom_index_base + y0 * width + x1,
-              q10 * diff_val);
-          platform::CudaAtomicAdd(
-              bottom_data_diff + roi_batch_ind * channels * height * width +
-                  bottom_index_base + y1 * width + x1,
-              q11 * diff_val);
-        }
-
-        // compute gradient of trans
-        if (no_trans || bottom_trans_diff == NULL) {
-          continue;
-        }
-
-        T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
-        T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
-        T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
-        T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
-        T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y -
-                    u00 * (1 - dist_y)) *
-                   trans_std * diff_val;
-        diff_x *= roi_width;
-        T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x -
-                    u00 * (1 - dist_x)) *
-                   trans_std * diff_val;
-        diff_y *= roi_height;
-        platform::CudaAtomicAdd(
-            bottom_trans_diff +
-                (((n * num_classes + class_id) * 2) * part_height + part_h) *
-                    part_width +
-                part_w,
-            diff_x);
-        platform::CudaAtomicAdd(
-            bottom_trans_diff +
-                (((n * num_classes + class_id) * 2 + 1) * part_height +
-                 part_h) *
-                    part_width +
-                part_w,
-            diff_y);
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const LoDTensor* rois = ctx.Input<LoDTensor>("ROIs");
-    const Tensor* trans = ctx.Input<Tensor>("Trans");
-    const Tensor* top_count = ctx.Input<Tensor>("TopCount");
-    const Tensor* output_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* trans_grad = ctx.Output<Tensor>(framework::GradVarName("Trans"));
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = ctx.cuda_device_context();
-    if (input_grad) {
-      input_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, input_grad, static_cast<T>(0));
-    }
-    if (trans_grad) {
-      trans_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, trans_grad, static_cast<T>(0));
-    }
-
-    auto no_trans = ctx.Attr<bool>("no_trans");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto output_dim = ctx.Attr<int>("output_dim");
-    auto group_size = ctx.Attr<std::vector<int>>("group_size");
-    auto group_height = group_size[0];
-    auto group_width = group_size[1];
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto part_size = ctx.Attr<std::vector<int>>("part_size");
-    auto part_height = part_size[0];
-    auto part_width = part_size[1];
-    auto sample_per_part = ctx.Attr<int>("sample_per_part");
-    auto trans_std = ctx.Attr<float>("trans_std");
-
-    const int batch = static_cast<int>(input->dims()[0]);
-    const int channels = static_cast<int>(input->dims()[1]);
-    const int height = static_cast<int>(input->dims()[2]);
-    const int width = static_cast<int>(input->dims()[3]);
-    const int channels_trans = no_trans ? 2 : trans->dims()[1];
-    const int num_rois = rois->dims()[0];
-    const int count = num_rois * output_dim * pooled_height * pooled_width;
-    const int num_classes = no_trans ? 1 : channels_trans / 2;
-    const int channels_each_class =
-        no_trans ? output_dim : output_dim / num_classes;
-
-    const T* top_diff = output_grad->data<T>();
-    const T* bottom_data = input->data<T>();
-    const T* bottom_rois = rois->data<T>();
-    const T* bottom_trans = no_trans ? NULL : trans->data<T>();
-
-    T* bottom_data_diff = NULL;
-    T* bottom_trans_diff = NULL;
-    if (input_grad) {
-      bottom_data_diff = input_grad->mutable_data<T>(ctx.GetPlace());
-    }
-    if (trans_grad) {
-      bottom_trans_diff =
-          no_trans ? NULL : trans_grad->mutable_data<T>(ctx.GetPlace());
-    }
-
-    const T* top_count_data = top_count->data<T>();
-    framework::Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({num_rois});
-    auto cplace = platform::CPUPlace();
-    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch,
-        "The rois_batch_size and imgs batch_size must be the same.");
-
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-
-    int bytes = roi_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
-    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                 dev_ctx.stream());
-
-    DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS,
-                                           0, dev_ctx.stream()>>>(
-        count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels,
-        height, width, pooled_height, pooled_width, output_dim,
-        bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois,
-        bottom_trans, no_trans, (T)trans_std, sample_per_part, group_height,
-        group_width, part_height, part_width, num_classes, channels_each_class,
-        roi_id_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling,
-                        ops::DeformablePSROIPoolCUDAKernel<CUDA, float>,
-                        ops::DeformablePSROIPoolCUDAKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling_grad,
-                        ops::DeformablePSROIPoolGradCUDAKernel<CUDA, float>,
-                        ops::DeformablePSROIPoolGradCUDAKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h
deleted file mode 100644
index 22df51c6deabab7b121475a83d05d5720b566f0e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.h
+++ /dev/null
@@ -1,488 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Part of the following code in this file refs to
-// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_psroi_pooling.cu
-//
-// Copyright (c) 2017 Microsoft
-// Licensed under The Apache-2.0 License [see LICENSE for details]
-// \file deformable_psroi_pooling.cu
-// \brief
-// \author Yi Li, Guodong Zhang, Jifeng Dai
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-T bilinear_interp(const T* data, const T x, const T y, const int width,
-                  const int height) {
-  int x1 = floor(x);
-  int x2 = ceil(x);
-  int y1 = floor(y);
-  int y2 = ceil(y);
-  T dist_x = static_cast<T>(x - x1);
-  T dist_y = static_cast<T>(y - y1);
-  T value11 = data[y1 * width + x1];
-  T value12 = data[y2 * width + x1];
-  T value21 = data[y1 * width + x2];
-  T value22 = data[y2 * width + x2];
-  T value = (1 - dist_x) * (1 - dist_y) * value11 +
-            (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 +
-            dist_x * dist_y * value22;
-  return value;
-}
-
-template <typename T>
-void DeformablePSROIPoolForwardCPUKernel(
-    const int count, const T* bottom_data, const T spatial_scale,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const T* bottom_rois,
-    const T* bottom_trans, const bool no_trans, const float trans_std,
-    const int sample_per_part, const int output_dim, const int group_height,
-    const int group_width, const int part_height, const int part_width,
-    const int num_classes, const int channels_each_class, T* top_data,
-    T* top_count, const int batch_size, int* roi_batch_id_data,
-    const LoDTensor* rois) {
-  for (int ix = 0; ix < count; ix++) {
-    int pw = ix % pooled_width;
-    int ph = (ix / pooled_width) % pooled_height;
-    int ctop = (ix / pooled_width / pooled_height) % output_dim;
-    int n = ix / pooled_width / pooled_height / output_dim;
-    const T* offset_bottom_rois = bottom_rois + n * 4;
-
-    int roi_batch_ind = roi_batch_id_data[n];
-    T roi_start_w =
-        static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
-    T roi_start_h =
-        static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
-    T roi_end_w =
-        static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
-    T roi_end_h =
-        static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
-
-    //  width and height of roi
-    T roi_width = std::max(roi_end_w - roi_start_w, T(0.1));
-    T roi_height = std::max(roi_end_h - roi_start_h, T(0.1));
-
-    //  width and height of each bin
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    //  sampling interval in each bin
-    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
-    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
-
-    //  obtain offset of roi
-    int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
-    int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
-    int class_id = ctop / channels_each_class;
-
-    T trans_x =
-        no_trans
-            ? static_cast<T>(0)
-            : bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
-                            part_h) *
-                               part_width +
-                           part_w] *
-                  static_cast<T>(trans_std);
-    T trans_y = no_trans
-                    ? static_cast<T>(0)
-                    : bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
-                                        part_height +
-                                    part_h) *
-                                       part_width +
-                                   part_w] *
-                          static_cast<T>(trans_std);
-
-    //  location of start after adding offset
-    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
-    wstart += trans_x * roi_width;
-    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
-    hstart += trans_y * roi_height;
-    T sum = 0;
-    int num_sample = 0;
-    int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
-    int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
-    gw = std::min(std::max(gw, 0), group_width - 1);
-    gh = std::min(std::max(gh, 0), group_height - 1);
-    const T* offset_bottom_data =
-        bottom_data + (roi_batch_ind * channels) * height * width;
-
-    //  sampling in each bin
-    for (int ih = 0; ih < sample_per_part; ih++) {
-      for (int iw = 0; iw < sample_per_part; iw++) {
-        T w = wstart + iw * sub_bin_size_w;
-        T h = hstart + ih * sub_bin_size_h;
-        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
-          continue;
-        }
-        w = std::min(std::max(w, T(0.)), T(width - 1.));
-        h = std::min(std::max(h, T(0.)), height - T(1.));
-        int c = (ctop * group_height + gh) * group_width + gw;
-        // bilinear interpolation to get value
-        T val = bilinear_interp(offset_bottom_data + c * height * width, w, h,
-                                width, height);
-        sum += val;
-        num_sample++;
-      }
-    }
-    top_data[ix] = num_sample == 0 ? static_cast<T>(0) : sum / num_sample;
-    top_count[ix] = num_sample;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class DeformablePSROIPoolCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* trans = ctx.Input<Tensor>("Trans");
-    auto* out = ctx.Output<Tensor>("Output");
-    out->mutable_data<T>(ctx.GetPlace());
-    auto* top_count = ctx.Output<Tensor>("TopCount");
-    top_count->mutable_data<T>(ctx.GetPlace());
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    set_zero(dev_ctx, out, static_cast<T>(0));
-    set_zero(dev_ctx, top_count, static_cast<T>(0));
-
-    const int num_rois = rois->dims()[0];
-    PADDLE_ENFORCE_EQ(num_rois, out->dims()[0],
-                      "number of rois should be same with number of output");
-
-    framework::Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({num_rois});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-    auto no_trans = ctx.Attr<bool>("no_trans");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto output_dim = ctx.Attr<int>("output_dim");
-    auto group_size = ctx.Attr<std::vector<int>>("group_size");
-    auto group_height = group_size[0];
-    auto group_width = group_size[1];
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto part_size = ctx.Attr<std::vector<int>>("part_size");
-    auto part_height = part_size[0];
-    auto part_width = part_size[1];
-    auto sample_per_part = ctx.Attr<int>("sample_per_part");
-    auto trans_std = ctx.Attr<float>("trans_std");
-
-    int batch = static_cast<int>(input->dims()[0]);
-    int channels = static_cast<int>(input->dims()[1]);
-    int height = static_cast<int>(input->dims()[2]);
-    int width = static_cast<int>(input->dims()[3]);
-    int channels_trans = no_trans ? 2 : trans->dims()[1];
-    auto count = num_rois * output_dim * pooled_height * pooled_width;
-    auto num_classes = no_trans ? 1 : channels_trans / 2;
-    auto channels_each_class = no_trans ? output_dim : output_dim / num_classes;
-    PADDLE_ENFORCE(channels_each_class >= 1,
-                   "channels_each must greater than 1");
-
-    const T* bottom_data = input->data<T>();
-    const T* bottom_rois = rois->data<T>();
-    const T* bottom_trans = no_trans ? NULL : trans->data<T>();
-
-    T* top_data = out->mutable_data<T>(ctx.GetPlace());
-    T* top_count_data = top_count->mutable_data<T>(ctx.GetPlace());
-
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(rois_batch_size, batch,
-                      "The rois_batch_size must equal to batch_size of img.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-
-    DeformablePSROIPoolForwardCPUKernel(
-        count, bottom_data, (T)spatial_scale, channels, height, width,
-        pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans,
-        trans_std, sample_per_part, output_dim, group_height, group_width,
-        part_height, part_width, num_classes, channels_each_class, top_data,
-        top_count_data, batch, roi_batch_id_data, rois);
-  }
-};
-
-template <typename T>
-void DeformablePSROIPoolBackwardAccCPUKernel(
-    const int count, const T* top_diff, const T* top_count, const int num_rois,
-    const T spatial_scale, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int output_dim, T* bottom_data_diff, T* bottom_trans_diff,
-    const T* bottom_data, const T* bottom_rois, const T* bottom_trans,
-    const bool no_trans, const float trans_std, const int sample_per_part,
-    const int group_height, const int group_width, const int part_height,
-    const int part_width, const int num_classes, const int channels_each_class,
-    const int batch_size, int* roi_batch_id_data, const LoDTensor* rois) {
-  for (int index = 0; index < count; index++) {
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int ctop = (index / pooled_width / pooled_height) % output_dim;
-    int n = index / pooled_width / pooled_height / output_dim;
-
-    //  location of roi on feature map
-    const T* offset_bottom_rois = bottom_rois + n * 4;
-    int roi_batch_ind = roi_batch_id_data[n];
-    T roi_start_w =
-        static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
-    T roi_start_h =
-        static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
-    T roi_end_w =
-        static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
-    T roi_end_h =
-        static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
-
-    //  width and height of roi
-    T roi_width = std::max(roi_end_w - roi_start_w, T(0.1));
-    T roi_height = std::max(roi_end_h - roi_start_h, T(0.1));
-
-    //  width and height of each bin
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    //  sampling interval in each bin
-    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
-    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
-
-    //  obtain offset of roi
-    int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
-    int part_w = floor(static_cast<T>(pw) / pooled_width * part_height);
-    int class_id = ctop / channels_each_class;
-
-    T trans_x =
-        no_trans
-            ? static_cast<T>(0)
-            : bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
-                            part_h) *
-                               part_width +
-                           part_w] *
-                  static_cast<T>(trans_std);
-    T trans_y = no_trans
-                    ? static_cast<T>(0)
-                    : bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
-                                        part_height +
-                                    part_h) *
-                                       part_width +
-                                   part_w] *
-                          static_cast<T>(trans_std);
-
-    //  location of start after adding offset
-    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
-    wstart += trans_x * roi_width;
-    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
-    hstart += trans_y * roi_height;
-
-    if (top_count[index] <= 0) {
-      continue;
-    }
-
-    T diff_val = top_diff[index] / top_count[index];
-    const T* offset_bottom_data =
-        bottom_data + roi_batch_ind * channels * height * width;
-    int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
-    int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
-    gw = std::min(std::max(gw, 0), group_width - 1);
-    gh = std::min(std::max(gh, 0), group_height - 1);
-
-    //  sampling in each bin
-    for (int ih = 0; ih < sample_per_part; ih++) {
-      for (int iw = 0; iw < sample_per_part; iw++) {
-        T w = wstart + iw * sub_bin_size_w;
-        T h = hstart + ih * sub_bin_size_h;
-        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
-          continue;
-        }
-        w = std::min(std::max(w, T(0.)), T(width - 1.));
-        h = std::min(std::max(h, T(0.)), T(height - 1.));
-        int c = (ctop * group_height + gh) * group_width + gw;
-        int x0 = floor(w);
-        int x1 = ceil(w);
-        int y0 = floor(h);
-        int y1 = ceil(h);
-
-        //  compute coefficient of gradient
-        T dist_x = w - x0, dist_y = h - y0;
-        T q00 = (1 - dist_x) * (1 - dist_y);
-        T q01 = (1 - dist_x) * dist_y;
-        T q10 = dist_x * (1 - dist_y);
-        T q11 = dist_x * dist_y;
-        int bottom_index_base = c * height * width;
-
-        //  compute gradient of input
-        if (bottom_data_diff != NULL) {
-          T* offset_bottom_data_diff_addr00 =
-              bottom_data_diff + roi_batch_ind * channels * height * width +
-              bottom_index_base + y0 * width + x0;
-          T* offset_bottom_data_diff_addr01 =
-              bottom_data_diff + roi_batch_ind * channels * height * width +
-              bottom_index_base + y1 * width + x0;
-          T* offset_bottom_data_diff_addr10 =
-              bottom_data_diff + roi_batch_ind * channels * height * width +
-              bottom_index_base + y0 * width + x1;
-          T* offset_bottom_data_diff_addr11 =
-              bottom_data_diff + roi_batch_ind * channels * height * width +
-              bottom_index_base + y1 * width + x1;
-          *offset_bottom_data_diff_addr00 =
-              *offset_bottom_data_diff_addr00 + q00 * diff_val;
-          *offset_bottom_data_diff_addr01 =
-              *offset_bottom_data_diff_addr01 + q01 * diff_val;
-          *offset_bottom_data_diff_addr10 =
-              *offset_bottom_data_diff_addr10 + q10 * diff_val;
-          *offset_bottom_data_diff_addr11 =
-              *offset_bottom_data_diff_addr11 + q11 * diff_val;
-        }
-
-        //  compute gradient of trans
-        if (no_trans || bottom_trans_diff == NULL) {
-          continue;
-        }
-
-        T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
-        T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
-        T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
-        T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
-
-        T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y -
-                    u00 * (1 - dist_y)) *
-                   trans_std * diff_val;
-        diff_x *= roi_width;
-        T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x -
-                    u00 * (1 - dist_x)) *
-                   trans_std * diff_val;
-        diff_y *= roi_height;
-        T* offset_bottom_trans_diff_x =
-            bottom_trans_diff +
-            (((n * num_classes + class_id) * 2) * part_height + part_h) *
-                part_width +
-            part_w;
-        T* offset_bottom_trans_diff_y =
-            bottom_trans_diff +
-            (((n * num_classes + class_id) * 2 + 1) * part_height + part_h) *
-                part_width +
-            part_w;
-
-        *offset_bottom_trans_diff_x = *offset_bottom_trans_diff_x + diff_x;
-        *offset_bottom_trans_diff_y = *offset_bottom_trans_diff_y + diff_y;
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* trans = ctx.Input<Tensor>("Trans");
-    auto* top_count = ctx.Input<Tensor>("TopCount");
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    if (input_grad) {
-      input_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, input_grad, static_cast<T>(.0));
-    }
-    auto* trans_grad = ctx.Output<Tensor>(framework::GradVarName("Trans"));
-    if (trans_grad) {
-      trans_grad->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, trans_grad, static_cast<T>(.0));
-    }
-    auto no_trans = ctx.Attr<bool>("no_trans");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto output_dim = ctx.Attr<int>("output_dim");
-    auto group_size = ctx.Attr<std::vector<int>>("group_size");
-    auto group_height = group_size[0];
-    auto group_width = group_size[1];
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto part_size = ctx.Attr<std::vector<int>>("part_size");
-    auto part_height = part_size[0];
-    auto part_width = part_size[1];
-    auto sample_per_part = ctx.Attr<int>("sample_per_part");
-    auto trans_std = ctx.Attr<float>("trans_std");
-
-    const int batch = static_cast<int>(input->dims()[0]);
-    const int channels = static_cast<int>(input->dims()[1]);
-    const int height = static_cast<int>(input->dims()[2]);
-    const int width = static_cast<int>(input->dims()[3]);
-    const int channels_trans = no_trans ? 2 : trans->dims()[1];
-    const int num_rois = rois->dims()[0];
-    const int count = num_rois * output_dim * pooled_height * pooled_width;
-    const int num_classes = no_trans ? 1 : channels_trans / 2;
-    const int channels_each_class =
-        no_trans ? output_dim : output_dim / num_classes;
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({num_rois});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-    const T* top_diff = output_grad->data<T>();
-    const T* bottom_data = input->data<T>();
-    const T* bottom_rois = rois->data<T>();
-    const T* bottom_trans = no_trans ? NULL : trans->data<T>();
-
-    T* bottom_data_diff = NULL;
-    T* bottom_trans_diff = NULL;
-    if (input_grad) {
-      bottom_data_diff = input_grad->mutable_data<T>(ctx.GetPlace());
-    }
-    if (trans_grad) {
-      bottom_trans_diff =
-          no_trans ? NULL : trans_grad->mutable_data<T>(ctx.GetPlace());
-    }
-
-    const T* top_count_data = top_count->data<T>();
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-
-    DeformablePSROIPoolBackwardAccCPUKernel(
-        count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels,
-        height, width, pooled_height, pooled_width, output_dim,
-        bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois,
-        bottom_trans, no_trans, (T)trans_std, sample_per_part, group_height,
-        group_width, part_height, part_width, num_classes, channels_each_class,
-        batch, roi_batch_id_data, rois);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc
deleted file mode 100644
index 89416f7ab5d07ddac5b540b9bb361f831c1ef360..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/delete_var_op.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-class DeleteVarOp : public framework::OperatorBase {
- public:
-  DeleteVarOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    dev_ctx.Wait();
-
-    auto delete_var_names = Inputs("X");
-    const_cast<framework::Scope &>(scope).EraseVars(delete_var_names);
-  }
-};
-
-class DeleteVarOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {}
-};
-
-class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of delete op").AsDuplicable();
-    AddComment(R"DOC(
-Delete Operator.
-It should not be configured by users directly.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::DeleteVarOpInfoMaker,
-                  paddle::operators::DeleteVarOpShapeInference);
diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc
deleted file mode 100644
index 97f49dbcb08e4428b4857f4a70ab21399fb35612..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dequantize_op.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/dequantize_op.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-framework::OpKernelType DeQuantOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
-  framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;
-
-  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                 ctx.GetPlace(), layout_, library_);
-}
-
-void DeQuantOpMaker::Make() {
-  AddInput("Input", "input data");
-  AddOutput("Output", "output data");
-  AddAttr<float>("Scale", "scale data").SetDefault({1.0f});
-  AddComment(R"DOC(This op will dequantize data from INT8 to FP32)DOC");
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker);
diff --git a/paddle/fluid/operators/dequantize_op.h b/paddle/fluid/operators/dequantize_op.h
deleted file mode 100644
index 75c27a06c210f2d0e4d7cf52aa16f4c123f8ad8e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dequantize_op.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::OpKernelType;
-using framework::Tensor;
-
-class DeQuantOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
-    ctx->ShareLoD("Input", /*->*/ "Output");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class DeQuantOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-class DeQuantGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h
deleted file mode 100644
index c56329d9ee5ab73c6a683c9ea0955e27bdc65564..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/safe_ref.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-/**
- * Get Reference From Pointer with check. The error message is printf format,
- * and passed by `args`
- */
-template <typename T, typename... ARGS>
-inline T& Ref(T* ptr, ARGS&&... args) {
-  PADDLE_ENFORCE_NOT_NULL(ptr, ::paddle::string::Sprintf(args...));
-  return *ptr;
-}
-
-template <typename T, typename... ARGS>
-inline std::vector<std::reference_wrapper<T>> VectorRef(
-    const std::vector<T*>& vec, ARGS&&... args) {
-  std::vector<std::reference_wrapper<T>> result;
-  result.reserve(vec.size());
-  for (auto* ptr : vec) {
-    result.emplace_back(Ref(ptr, args...));
-  }
-  return result;
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h
deleted file mode 100644
index 94419d1f9a4ba654952e0aedb46ab94ea8d5c0a8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-template <typename T, int Rank>
-struct StridedMemcpyFunctor;
-
-template <typename T>
-struct StridedMemcpyFunctor<T, 0> {
-  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  const int64_t* src_stride, const int64_t* dst_dim,
-                  const int64_t* dst_stride, T* dst) const {
-    auto place = dev_ctx.GetPlace();
-    if (platform::is_cpu_place(place)) {
-      auto& cpu_place = boost::get<platform::CPUPlace>(place);
-      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
-      auto& cuda_ctx =
-          reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
-      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T),
-                   cuda_ctx.stream());
-#else
-      PADDLE_THROW("Paddle is not compiled with GPU");
-#endif
-    }
-  }
-};
-
-template <typename T>
-struct StridedMemcpyFunctor<T, 1> {
-  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  const int64_t* src_stride, const int64_t* dst_dim,
-                  const int64_t* dst_stride, T* dst) const {
-    auto place = dev_ctx.GetPlace();
-    if (platform::is_cpu_place(place)) {
-      auto& cpu_place = boost::get<platform::CPUPlace>(place);
-      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
-      auto& cuda_ctx =
-          reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
-      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0],
-                   cuda_ctx.stream());
-#else
-      PADDLE_THROW("Paddle is not compiled with GPU");
-#endif
-    }
-  }
-};
-
-template <typename T, int Rank>
-struct StridedMemcpyFunctor {
-  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  const int64_t* src_stride, const int64_t* dst_dim,
-                  const int64_t* dst_stride, T* dst) const {
-    for (int64_t i = 0; i < dst_dim[0]; ++i) {
-      StridedMemcpyFunctor<T, Rank - 1> func;
-      func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst);
-      src += src_stride[0];
-      dst += dst_stride[0];
-    }
-  }
-};
-
-template <typename T>
-struct StridedCopyDimVisitor {
-  StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
-                        const framework::DDim& src_stride,
-                        const framework::DDim& dst_stride, T* dst)
-      : dev_ctx_(dev_ctx),
-        src_(src),
-        src_stride_(src_stride),
-        dst_stride_(dst_stride),
-        dst_(dst) {}
-
-  template <int D>
-  void operator()(const framework::Dim<D>& dst_dim) const {
-    StridedMemcpyFunctor<T, D> functor;
-    functor(dev_ctx_, src_, src_stride_.Get(), dst_dim.Get(), dst_stride_.Get(),
-            dst_);
-  }
-
-  const platform::DeviceContext& dev_ctx_;
-  const T* src_;
-  const framework::DDim& src_stride_;
-  const framework::DDim& dst_stride_;
-  T* dst_;
-};
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
deleted file mode 100644
index f1c504d6e4bd065e4221b1207a117ff0f6732459..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-set(LOCAL_DETECTION_LIBS)
-
-function(detection_library TARGET_NAME)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    set(options "")
-    set(common_deps op_registry)
-    set(pybind_flag 0)
-    cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
-    op_library(${TARGET_NAME} SRCS ${detection_library_SRCS} DEPS ${common_deps} ${detection_library_DEPS})
-    set(LOCAL_DETECTION_LIBS
-            ${TARGET_NAME}
-            ${LOCAL_DETECTION_LIBS}
-        PARENT_SCOPE)
-endfunction()
-
-detection_library(bipartite_match_op SRCS bipartite_match_op.cc)
-detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
-detection_library(iou_similarity_op SRCS iou_similarity_op.cc
-iou_similarity_op.cu)
-detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
-detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc)
-detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
-detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
-detection_library(anchor_generator_op SRCS anchor_generator_op.cc
-anchor_generator_op.cu)
-detection_library(target_assign_op SRCS target_assign_op.cc
-target_assign_op.cu)
-detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
-polygon_box_transform_op.cu)
-detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
-detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
-detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
-detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
-detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu)
-detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
-detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu)
-detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
-
-if(WITH_GPU)
-  detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
-  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
-  detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub)
-else()
-  detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
-  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
-  detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc)
-endif()
-
-detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
-#Export local libraries to parent
-# set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
-
-foreach(src ${LOCAL_DETECTION_LIBS})
-    set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
-endforeach()
-
-cc_library(mask_util SRCS mask_util.cc DEPS memory)
-cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util)
-detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util)
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc
deleted file mode 100644
index 4a333b559f82e6d39d2d4345c8ad58bc8d430c69..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/anchor_generator_op.cc
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/anchor_generator_op.h"
-
-namespace paddle {
-namespace operators {
-
-class AnchorGeneratorOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of AnchorGeneratorOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Anchors"),
-                   "Output(Anchors) of AnchorGeneratorOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Variances"),
-        "Output(Variances) of AnchorGeneratorOp should not be null.");
-
-    auto input_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
-
-    auto anchor_sizes = ctx->Attrs().Get<std::vector<float>>("anchor_sizes");
-    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
-    auto stride = ctx->Attrs().Get<std::vector<float>>("stride");
-    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
-
-    size_t num_anchors = aspect_ratios.size() * anchor_sizes.size();
-
-    std::vector<int64_t> dim_vec(4);
-    dim_vec[0] = input_dims[2];
-    dim_vec[1] = input_dims[3];
-    dim_vec[2] = num_anchors;
-    dim_vec[3] = 4;
-    ctx->SetOutputDim("Anchors", framework::make_ddim(dim_vec));
-    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>("Input")->type(), ctx.device_context());
-  }
-};
-
-class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(Tensor, default Tensor<float>), "
-             "the input feature is a tensor with a rank of 4. "
-             "The layout is NCHW.");
-    AddOutput("Anchors",
-              "(Tensor, default Tensor<float>), the output is a "
-              "tensor with a rank of 4. The layout is [H, W, num_anchors, 4]. "
-              "H is the height of input, W is the width of input, num_anchors "
-              "is the box count of each position. "
-              "Each anchor is in (xmin, ymin, xmax, ymax) format");
-    AddOutput("Variances",
-              "(Tensor, default Tensor<float>), the expanded variances for "
-              "normalizing bbox regression targets. The layout is [H, W, "
-              "num_anchors, 4]. "
-              "H is the height of input, W is the width of input, num_anchors "
-              "is the box count of each position. "
-              "Each variance is in (xcenter, ycenter, w, h) format");
-
-    AddAttr<std::vector<float>>(
-        "anchor_sizes",
-        "(vector<float>) List of Region Proposal Network(RPN) anchor sizes "
-        " given in absolute pixels e.g. (64, 128, 256, 512)."
-        " For instance, the anchor size of 64 means the area of this anchor "
-        "equals to 64**2.")
-        .AddCustomChecker([](const std::vector<float>& anchor_sizes) {
-          PADDLE_ENFORCE_GT(anchor_sizes.size(), 0UL,
-                            "Size of anchor_sizes must be at least 1.");
-          for (size_t i = 0; i < anchor_sizes.size(); ++i) {
-            PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0,
-                              "anchor_sizes[%d] must be positive.", i);
-          }
-        });
-    AddAttr<std::vector<float>>(
-        "aspect_ratios",
-        "(vector<float>) List of Region Proposal Network(RPN) anchor aspect "
-        "ratios, e.g. (0.5, 1, 2)."
-        "For instacne, the aspect ratio of 0.5 means the height / width of "
-        "this anchor equals 0.5.");
-
-    AddAttr<std::vector<float>>("variances",
-                                "(vector<float>) List of variances to be used "
-                                "in box regression deltas")
-        .AddCustomChecker([](const std::vector<float>& variances) {
-          PADDLE_ENFORCE_EQ(variances.size(), 4UL,
-                            "Must and only provide 4 variance.");
-          for (size_t i = 0; i < variances.size(); ++i) {
-            PADDLE_ENFORCE_GT(variances[i], 0.0,
-                              "variance[%d] must be greater than 0.", i);
-          }
-        });
-
-    AddAttr<std::vector<float>>("stride",
-                                "Anchors stride across width and height, "
-                                "with a default of (16, 16)")
-        .SetDefault(std::vector<float>(2, 16.0))
-        .AddCustomChecker([](const std::vector<float>& stride) {
-          PADDLE_ENFORCE_EQ(
-              stride.size(), 2UL,
-              "Must and only provide 2 stride for width and height.");
-          for (size_t i = 0; i < stride.size(); ++i) {
-            PADDLE_ENFORCE_GT(stride[i], 0.0,
-                              "stride[%d] should be larger than 0.", i);
-          }
-        });
-
-    AddAttr<float>("offset",
-                   "(float) "
-                   "Anchor center offset, with a default of 0.5")
-        .SetDefault(0.5);
-    AddComment(R"DOC(
-AnchorGenerator operator
-Generates anchors for Faster RCNN, FPN etc. algorithm.
-Each position of the input produce N anchors, N =
- size(anchor_sizes) * size(aspect_ratios).
-
-Please get more information from the following papers:
-https://arxiv.org/abs/1506.01497.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(anchor_generator, ops::AnchorGeneratorOp,
-                  ops::AnchorGeneratorOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(anchor_generator, ops::AnchorGeneratorOpKernel<float>,
-                       ops::AnchorGeneratorOpKernel<double>);
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu
deleted file mode 100644
index 3cc9bbeee1eeed17142a6b1bd23b45aff9cf745f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/anchor_generator_op.cu
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/anchor_generator_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
-                           const T* anchor_sizes, const int as_num,
-                           const T* stride, const int sd_num, const int height,
-                           const int width, const T offset) {
-  int num_anchors = as_num * ar_num;
-  int box_num = height * width * num_anchors;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
-       i += blockDim.x * gridDim.x) {
-    int h_idx = i / (num_anchors * width);
-    int w_idx = (i / num_anchors) % width;
-    T stride_width = stride[0];
-    T stride_height = stride[1];
-    T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1);
-    T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1);
-    T area, area_ratios;
-    T base_w, base_h;
-    T scale_w, scale_h;
-    T anchor_width, anchor_height;
-    int anch_idx = i % num_anchors;
-    int ar_idx = anch_idx / as_num;
-    int as_idx = anch_idx % as_num;
-    T aspect_ratio = aspect_ratios[ar_idx];
-    T anchor_size = anchor_sizes[as_idx];
-    area = stride_width * stride_height;
-    area_ratios = area / aspect_ratio;
-    base_w = round(sqrt(area_ratios));
-    base_h = round(base_w * aspect_ratio);
-    scale_w = anchor_size / stride_width;
-    scale_h = anchor_size / stride_height;
-    anchor_width = scale_w * base_w;
-    anchor_height = scale_h * base_h;
-
-    T xmin = (x_ctr - 0.5 * (anchor_width - 1));
-    T ymin = (y_ctr - 0.5 * (anchor_height - 1));
-    T xmax = (x_ctr + 0.5 * (anchor_width - 1));
-    T ymax = (y_ctr + 0.5 * (anchor_height - 1));
-    out[i * 4] = xmin;
-    out[i * 4 + 1] = ymin;
-    out[i * 4 + 2] = xmax;
-    out[i * 4 + 3] = ymax;
-  }
-}
-
-template <typename T>
-__global__ void SetVariance(T* out, const T* var, const int vnum,
-                            const int num) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    out[i] = var[i % vnum];
-  }
-}
-
-template <typename T>
-class AnchorGeneratorOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
-    auto* anchors = ctx.Output<paddle::framework::Tensor>("Anchors");
-    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
-
-    auto anchor_sizes = ctx.Attr<std::vector<float>>("anchor_sizes");
-    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
-    auto stride = ctx.Attr<std::vector<float>>("stride");
-    auto variances = ctx.Attr<std::vector<float>>("variances");
-
-    T offset = static_cast<T>(ctx.Attr<float>("offset"));
-
-    auto width = input->dims()[3];
-    auto height = input->dims()[2];
-
-    int num_anchors = aspect_ratios.size() * anchor_sizes.size();
-
-    int box_num = width * height * num_anchors;
-
-    int block = 512;
-    int grid = (box_num + block - 1) / block;
-
-    auto stream =
-        ctx.template device_context<platform::CUDADeviceContext>().stream();
-
-    anchors->mutable_data<T>(ctx.GetPlace());
-    vars->mutable_data<T>(ctx.GetPlace());
-
-    framework::Tensor ar;
-    framework::TensorFromVector(aspect_ratios, ctx.device_context(), &ar);
-
-    framework::Tensor as;
-    framework::TensorFromVector(anchor_sizes, ctx.device_context(), &as);
-
-    framework::Tensor sd;
-    framework::TensorFromVector(stride, ctx.device_context(), &sd);
-
-    GenAnchors<T><<<grid, block, 0, stream>>>(
-        anchors->data<T>(), ar.data<T>(), aspect_ratios.size(), as.data<T>(),
-        anchor_sizes.size(), sd.data<T>(), stride.size(), height, width,
-        offset);
-
-    framework::Tensor v;
-    framework::TensorFromVector(variances, ctx.device_context(), &v);
-    grid = (box_num * 4 + block - 1) / block;
-    SetVariance<T><<<grid, block, 0, stream>>>(vars->data<T>(), v.data<T>(),
-                                               variances.size(), box_num * 4);
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(anchor_generator,
-                        ops::AnchorGeneratorOpCUDAKernel<float>,
-                        ops::AnchorGeneratorOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h
deleted file mode 100644
index e0e499d76a19ba5f6b91ba4c8797684fb53c7caa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/anchor_generator_op.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class AnchorGeneratorOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
-    auto* anchors = ctx.Output<paddle::framework::Tensor>("Anchors");
-    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
-
-    auto anchor_sizes = ctx.Attr<std::vector<float>>("anchor_sizes");
-    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
-    auto stride = ctx.Attr<std::vector<float>>("stride");
-    auto variances = ctx.Attr<std::vector<float>>("variances");
-
-    T offset = static_cast<T>(ctx.Attr<float>("offset"));
-
-    auto feature_width = input->dims()[3];
-    auto feature_height = input->dims()[2];
-
-    T stride_width, stride_height;
-    stride_width = stride[0];
-    stride_height = stride[1];
-
-    int num_anchors = aspect_ratios.size() * anchor_sizes.size();
-
-    anchors->mutable_data<T>(ctx.GetPlace());
-    vars->mutable_data<T>(ctx.GetPlace());
-
-    auto e_anchors = framework::EigenTensor<T, 4>::From(*anchors);
-    for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
-      for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
-        T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1);
-        T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1);
-        T area, area_ratios;
-        T base_w, base_h;
-        T scale_w, scale_h;
-        T anchor_width, anchor_height;
-        int idx = 0;
-        for (size_t r = 0; r < aspect_ratios.size(); ++r) {
-          auto ar = aspect_ratios[r];
-          for (size_t s = 0; s < anchor_sizes.size(); ++s) {
-            auto anchor_size = anchor_sizes[s];
-            area = stride_width * stride_height;
-            area_ratios = area / ar;
-            base_w = round(sqrt(area_ratios));
-            base_h = round(base_w * ar);
-            scale_w = anchor_size / stride_width;
-            scale_h = anchor_size / stride_height;
-            anchor_width = scale_w * base_w;
-            anchor_height = scale_h * base_h;
-            e_anchors(h_idx, w_idx, idx, 0) =
-                (x_ctr - 0.5 * (anchor_width - 1));
-            e_anchors(h_idx, w_idx, idx, 1) =
-                (y_ctr - 0.5 * (anchor_height - 1));
-            e_anchors(h_idx, w_idx, idx, 2) =
-                (x_ctr + 0.5 * (anchor_width - 1));
-            e_anchors(h_idx, w_idx, idx, 3) =
-                (y_ctr + 0.5 * (anchor_height - 1));
-            idx++;
-          }
-        }
-      }
-    }
-
-    framework::Tensor var_t;
-    var_t.mutable_data<T>(
-        framework::make_ddim({1, static_cast<int>(variances.size())}),
-        ctx.GetPlace());
-    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
-    for (size_t i = 0; i < variances.size(); ++i) {
-      var_et(0, i) = variances[i];
-    }
-
-    int anchor_num = feature_height * feature_width * num_anchors;
-    auto var_dim = vars->dims();
-    vars->Resize({anchor_num, static_cast<int>(variances.size())});
-
-    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
-    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(anchor_num, 1));
-
-    vars->Resize(var_dim);
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h
deleted file mode 100644
index afc39c1db9fba8bf01a78ade83af1037a83d8d9d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/bbox_util.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace operators {
-
-struct RangeInitFunctor {
-  int start;
-  int delta;
-  int* out;
-  HOSTDEVICE void operator()(size_t i) { out[i] = start + i * delta; }
-};
-
-template <typename T>
-inline HOSTDEVICE T RoIArea(const T* box, bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
-    }
-  }
-}
-
-/*
- * transform that computes target bounding-box regression deltas
- * given proposal boxes and ground-truth boxes.
- */
-template <typename T>
-inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
-                       const framework::Tensor& gt_boxes, const float* weights,
-                       const bool normalized, framework::Tensor* box_delta) {
-  auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
-  auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
-  auto trg = framework::EigenTensor<T, 2>::From(*box_delta);
-  T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y;
-  for (int64_t i = 0; i < box_num; ++i) {
-    ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + (normalized == false);
-    ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + (normalized == false);
-    ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w;
-    ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h;
-
-    gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + (normalized == false);
-    gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + (normalized == false);
-    gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w;
-    gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h;
-
-    trg(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w;
-    trg(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h;
-    trg(i, 2) = std::log(gt_w / ex_w);
-    trg(i, 3) = std::log(gt_h / ex_h);
-
-    if (weights) {
-      trg(i, 0) = trg(i, 0) / weights[0];
-      trg(i, 1) = trg(i, 1) / weights[1];
-      trg(i, 2) = trg(i, 2) / weights[2];
-      trg(i, 3) = trg(i, 3) / weights[3];
-    }
-  }
-}
-
-template <typename T>
-void Gather(const T* in, const int in_stride, const int* index, const int num,
-            T* out) {
-  const int stride_bytes = in_stride * sizeof(T);
-  for (int i = 0; i < num; ++i) {
-    int id = index[i];
-    memcpy(out + i * in_stride, in + id * in_stride, stride_bytes);
-  }
-}
-
-template <typename T>
-void BboxOverlaps(const framework::Tensor& r_boxes,
-                  const framework::Tensor& c_boxes,
-                  framework::Tensor* overlaps) {
-  auto r_boxes_et = framework::EigenTensor<T, 2>::From(r_boxes);
-  auto c_boxes_et = framework::EigenTensor<T, 2>::From(c_boxes);
-  auto overlaps_et = framework::EigenTensor<T, 2>::From(*overlaps);
-  int r_num = r_boxes.dims()[0];
-  int c_num = c_boxes.dims()[0];
-  auto zero = static_cast<T>(0.0);
-  T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h,
-      inter_area;
-  for (int i = 0; i < r_num; ++i) {
-    r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) *
-                 (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1);
-    for (int j = 0; j < c_num; ++j) {
-      c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) *
-                   (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1);
-      x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0));
-      y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1));
-      x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2));
-      y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3));
-      inter_w = std::max(x_max - x_min + 1, zero);
-      inter_h = std::max(y_max - y_min + 1, zero);
-      inter_area = inter_w * inter_h;
-      overlaps_et(i, j) =
-          (inter_area == 0.) ? 0 : inter_area /
-                                       (r_box_area + c_box_area - inter_area);
-    }
-  }
-}
-
-template <class T>
-void ClipTiledBoxes(const platform::DeviceContext& ctx,
-                    const framework::Tensor& im_info,
-                    const framework::Tensor& input_boxes,
-                    framework::Tensor* out) {
-  T* out_data = out->mutable_data<T>(ctx.GetPlace());
-  const T* im_info_data = im_info.data<T>();
-  const T* input_boxes_data = input_boxes.data<T>();
-  T zero(0);
-  T im_w = round(im_info_data[1] / im_info_data[2]);
-  T im_h = round(im_info_data[0] / im_info_data[2]);
-  for (int64_t i = 0; i < input_boxes.numel(); ++i) {
-    if (i % 4 == 0) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
-    } else if (i % 4 == 1) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
-    } else if (i % 4 == 2) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
-    } else {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
-    }
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
deleted file mode 100644
index af7797a6d7cde6e81c66a3d29ed36154b6e11529..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/bipartite_match_op.cc
+++ /dev/null
@@ -1,290 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-class BipartiteMatchOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
-                   "Input(DistMat) of BipartiteMatch should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("ColToRowMatchIndices"),
-        "Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("ColToRowMatchDist"),
-        "Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
-
-    auto dims = ctx->GetInputDim("DistMat");
-    PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
-
-    ctx->SetOutputDim("ColToRowMatchIndices", dims);
-    ctx->SetOutputDim("ColToRowMatchDist", dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<LoDTensor>("DistMat")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-template <class T>
-bool DistPairDescend(std::tuple<int, int, T> pair1,
-                     std::tuple<int, int, T> pair2) {
-  return std::get<2>(pair1) > std::get<2>(pair2);
-}
-
-template <typename T>
-class BipartiteMatchKernel : public framework::OpKernel<T> {
- public:
-  // The match_indices must be initialized to -1 at first.
-  // The match_dist must be initialized to 0 at first.
-  void BipartiteMatch(const Tensor& dist, int* match_indices,
-                      T* match_dist) const {
-    PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
-    int64_t row = dist.dims()[0];
-    int64_t col = dist.dims()[1];
-    auto* dist_data = dist.data<T>();
-    // Test result: When row==130 the speed of these two methods almost the same
-    if (row >= 130) {
-      std::vector<std::tuple<int, int, T>> match_pair;
-
-      for (int64_t i = 0; i < row; ++i) {
-        for (int64_t j = 0; j < col; ++j) {
-          match_pair.push_back(std::make_tuple(i, j, dist_data[i * col + j]));
-        }
-      }
-      std::sort(match_pair.begin(), match_pair.end(), DistPairDescend<T>);
-      std::vector<int> row_indices(row, -1);
-
-      int64_t idx = 0;
-      for (int64_t k = 0; k < row * col; ++k) {
-        int64_t i = std::get<0>(match_pair[k]);
-        int64_t j = std::get<1>(match_pair[k]);
-        T dist = std::get<2>(match_pair[k]);
-
-        if (idx >= row) {
-          break;
-        }
-        if (match_indices[j] == -1 && row_indices[i] == -1 && dist > 0) {
-          match_indices[j] = i;
-          row_indices[i] = j;
-          match_dist[j] = dist;
-          idx += 1;
-        }
-      }
-    } else {
-      constexpr T kEPS = static_cast<T>(1e-6);
-      std::vector<int> row_pool;
-      for (int i = 0; i < row; ++i) {
-        row_pool.push_back(i);
-      }
-      while (row_pool.size() > 0) {
-        int max_idx = -1;
-        int max_row_idx = -1;
-        T max_dist = -1;
-        for (int64_t j = 0; j < col; ++j) {
-          if (match_indices[j] != -1) {
-            continue;
-          }
-          for (size_t k = 0; k < row_pool.size(); ++k) {
-            int m = row_pool[k];
-            // distance is 0 between m-th row and j-th column
-            if (dist_data[m * col + j] < kEPS) {
-              continue;
-            }
-            if (dist_data[m * col + j] > max_dist) {
-              max_idx = j;
-              max_row_idx = m;
-              max_dist = dist_data[m * col + j];
-            }
-          }
-        }
-        if (max_idx == -1) {
-          // Cannot find good match.
-          break;
-        } else {
-          PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
-          match_indices[max_idx] = max_row_idx;
-          match_dist[max_idx] = max_dist;
-          // Erase the row index.
-          row_pool.erase(
-              std::find(row_pool.begin(), row_pool.end(), max_row_idx));
-        }
-      }
-    }
-  }
-
-  void ArgMaxMatch(const Tensor& dist, int* match_indices, T* match_dist,
-                   T overlap_threshold) const {
-    constexpr T kEPS = static_cast<T>(1e-6);  // smaller values count as zero
-    int64_t row = dist.dims()[0];
-    int64_t col = dist.dims()[1];
-    auto* dist_data = dist.data<T>();  // element (i, j) is at i * col + j
-    for (int64_t j = 0; j < col; ++j) {
-      if (match_indices[j] != -1) {
-        // The j-th column is already matched to a row; keep that match.
-        continue;
-      }
-      int max_row_idx = -1;
-      T max_dist = -1;
-      for (int i = 0; i < row; ++i) {
-        T dist = dist_data[i * col + j];  // NOTE: shadows the Tensor `dist`
-        if (dist < kEPS) {
-          // distance is (near) 0 between i-th row and j-th column
-          continue;
-        }
-        if (dist >= overlap_threshold && dist > max_dist) {
-          max_row_idx = i;
-          max_dist = dist;
-        }
-      }
-      if (max_row_idx != -1) {
-        PADDLE_ENFORCE_EQ(match_indices[j], -1);
-        match_indices[j] = max_row_idx;
-        match_dist[j] = max_dist;
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dist_mat = context.Input<LoDTensor>("DistMat");
-    auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
-    auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
-
-    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
-
-    auto col = dist_mat->dims()[1];
-    // n = number of instances: 1 without LoD, else the last level's sequences.
-    int64_t n = dist_mat->lod().size() == 0UL
-                    ? 1
-                    : static_cast<int64_t>(dist_mat->lod().back().size() - 1);
-    if (dist_mat->lod().size()) {
-      PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL,
-                        "Only support 1 level of LoD.");
-    }
-    match_indices->mutable_data<int>({n, col}, context.GetPlace());
-    match_dist->mutable_data<T>({n, col}, context.GetPlace());
-    // Columns that end up unmatched keep these fills: index -1, distance 0.
-    math::SetConstant<platform::CPUDeviceContext, int> iset;
-    iset(dev_ctx, match_indices, static_cast<int>(-1));
-    math::SetConstant<platform::CPUDeviceContext, T> tset;
-    tset(dev_ctx, match_dist, static_cast<T>(0));
-    // Per instance: BipartiteMatch, then ArgMaxMatch if "per_prediction".
-    int* indices = match_indices->data<int>();
-    T* dist = match_dist->data<T>();
-    auto type = context.Attr<std::string>("match_type");
-    auto threshold = context.Attr<float>("dist_threshold");
-    if (n == 1) {  // single instance: all rows of DistMat are matched at once
-      BipartiteMatch(*dist_mat, indices, dist);
-      if (type == "per_prediction") {
-        ArgMaxMatch(*dist_mat, indices, dist, threshold);
-      }
-    } else {
-      auto lod = dist_mat->lod().back();
-      for (size_t i = 0; i < lod.size() - 1; ++i) {  // rows [lod[i], lod[i+1])
-        Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
-        BipartiteMatch(one_ins, indices + i * col, dist + i * col);
-        if (type == "per_prediction") {
-          ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
-        }
-      }
-    }
-  }
-};
-
-class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {  // declares the op's input, attrs, outputs and doc
-    AddInput(
-        "DistMat",
-        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
-        "[K, M]. It is pair-wise distance matrix between the entities "
-        "represented by each row and each column. For example, assumed one "
-        "entity is A with shape [K], another entity is B with shape [M]. The "
-        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
-        "the distance is, the better matching the pairs are. Please note, "
-        "This tensor can contain LoD information to represent a batch of "
-        "inputs. One instance of this batch can contain different numbers of "
-        "entities.");
-    AddAttr<std::string>(
-        "match_type",
-        "(string, default: bipartite) "
-        "The type of matching method, should be 'bipartite' or "
-        "'per_prediction', 'bipartite' by default.")
-        .SetDefault("bipartite")
-        .InEnum({"bipartite", "per_prediction"});
-    AddAttr<float>(
-        "dist_threshold",
-        "(float, default: 0.5) "
-        "If `match_type` is 'per_prediction', this threshold is to determine "
-        "the extra matching bboxes based on the maximum distance.")
-        .SetDefault(0.5);
-    AddOutput("ColToRowMatchIndices",
-              "(Tensor) A 2-D Tensor with shape [N, M] in int type. "
-              "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it "
-              "means B[j] does not match any entity in i-th instance. "
-              "Otherwise, it means B[j] is matched to row "
-              "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
-              "i-th instance is saved in ColToRowMatchIndices[i][j].");
-    AddOutput("ColToRowMatchDist",
-              "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
-              "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
-              "ColToRowMatchDist[i][j] is 0. Otherwise, assumed "
-              "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
-              "instance are called LoD. Then "
-              "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
-    AddComment(R"DOC(
-This operator is a greedy bipartite matching algorithm, which is used to
-obtain the matching with the maximum distance based on the input
-distance matrix. For input 2D matrix, the bipartite matching algorithm can
-find the matched column for each row, also can find the matched row for
-each column. And this operator only calculate matched indices from column
-to row. For each instance, the number of matched indices is the number of
-columns of the input distance matrix.
-
-There are two outputs to save matched indices and distance.
-A simple description, this algorithm matched the best (maximum distance)
-row entity to the column entity and the matched indices are not duplicated
-in each row of ColToRowMatchIndices. If the column entity is not matched
-any row entity, set -1 in ColToRowMatchIndices.
-
-Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
-If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
-If Tensor, the height of ColToRowMatchIndices is 1.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp,
-                  ops::BipartiteMatchOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel<float>,
-                       ops::BipartiteMatchKernel<double>);
diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc
deleted file mode 100644
index 3aa766559a530bc31fbb277f2bcd474da776e63b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/box_clip_op.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/box_clip_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class BoxClipOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of BoxClipOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
-                   "Input(ImInfo) of BoxClipOp should not be null.");
-    // Validate shapes at runtime, then share Input's dims/LoD with Output.
-    auto input_box_dims = ctx->GetInputDim("Input");
-    auto im_info_dims = ctx->GetInputDim("ImInfo");
-
-    if (ctx->IsRuntime()) {
-      auto input_box_size = input_box_dims.size();
-      PADDLE_ENFORCE_EQ(input_box_dims[input_box_size - 1], 4,
-                        "The last dimension of Input must be 4");
-      PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
-                        "The rank of Input(ImInfo) in BoxClipOp must be 2");
-      PADDLE_ENFORCE_EQ(im_info_dims[1], 3,
-                        "The last dimension of ImInfo must be 3");
-    }
-    ctx->ShareDim("Input", /*->*/ "Output");
-    ctx->ShareLoD("Input", /*->*/ "Output");
-  }
-};
-
-class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {  // declares the op's inputs, output and doc text
-    AddInput("Input",
-             "(LoDTensor) "
-             "Input is a LoDTensor with shape [..., 4] holds 4 points "
-             "in last dimension in format [xmin, ymin, xmax, ymax]");
-    AddInput("ImInfo",
-             "(Tensor) Information for image reshape is in shape (N, 3), "
-             "in format (height, width, im_scale)");
-    AddOutput("Output",
-              "(LoDTensor) "
-              "Output is a LoDTensor with the same shape as Input "
-              "and it is the result after clip");
-    AddComment(R"DOC(
-This operator clips input boxes to original input images.
-
-For each input box, The formula is given as follows:
-
-       $$xmin = \max(\min(xmin, im_w - 1), 0)$$
-       $$ymin = \max(\min(ymin, im_h - 1), 0)$$     
-       $$xmax = \max(\min(xmax, im_w - 1), 0)$$
-       $$ymax = \max(\min(ymax, im_h - 1), 0)$$
-
-where im_w and im_h are computed from ImInfo, the formula is given as follows:
-
-       $$im_w = \round(width / im_scale)$$
-       $$im_h = \round(height / im_scale)$$ 
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(box_clip, ops::BoxClipOp, ops::BoxClipOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    box_clip, ops::BoxClipKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BoxClipKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu
deleted file mode 100644
index b727da5f7b736b6f22407d1dfbca708ed0cf04d9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/box_clip_op.cu
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detection/box_clip_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;  // same alias as box_clip_op.h
-
-static constexpr int ImInfoSize = 3;
-
-template <typename T, int BlockSize>
-static __global__ void GPUBoxClip(const T *input, const size_t *lod,
-                                  const size_t width, const T *im_info,
-                                  T *output) {  // blockIdx.x picks the lod span
-  T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] /
-                 im_info[blockIdx.x * ImInfoSize + 2]);  // slot 1 / slot 2
-  T im_h = round(im_info[blockIdx.x * ImInfoSize] /
-                 im_info[blockIdx.x * ImInfoSize + 2]);  // slot 0 / slot 2
-  for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width;
-       i += BlockSize) {  // assumes blockDim.x == BlockSize -- TODO confirm
-    int idx = lod[blockIdx.x] * width + i;  // flat offset into input/output
-    T im_size = (idx % 2 == 0) ? im_w : im_h;  // even offset: im_w, odd: im_h
-    output[idx] = max(min(input[idx], im_size - 1), T(0.));  // clamp
-  }
-}
-
-template <typename DeviceContext, typename T>
-class GPUBoxClipKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto *input = context.Input<LoDTensor>("Input");
-    auto *im_info = context.Input<Tensor>("ImInfo");
-    auto *output = context.Output<LoDTensor>("Output");
-    const int64_t num = input->dims()[0];
-    const int64_t bbox_width = input->numel() / num;  // elements per dim-0 row
-    auto lod = input->lod();  // NOTE(review): lod.back() below needs a LoD
-    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
-    auto &dev_ctx = context.template device_context<DeviceContext>();
-    auto stream = dev_ctx.stream();
-    const size_t batch_size = lod.back().size() - 1;  // entries in last level
-    T *output_data = output->mutable_data<T>(dev_ctx.GetPlace());
-    GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>(  // 512 == BlockSize
-        input->data<T>(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()),
-        bbox_width, im_info->data<T>(), output_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    box_clip, ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h
deleted file mode 100644
index 74e1f88f8d8b28e490d170934760bd9bffc807bc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/box_clip_op.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detection/bbox_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-class BoxClipKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input_box = context.Input<LoDTensor>("Input");
-    auto* im_info = context.Input<LoDTensor>("ImInfo");
-    auto* output_box = context.Output<LoDTensor>("Output");
-    auto& dev_ctx =
-        context.template device_context<platform::CPUDeviceContext>();
-    output_box->mutable_data<T>(context.GetPlace());
-    // Exactly one LoD level is required: lod().back() below is undefined on
-    // an empty LoD, so the no-LoD case is rejected here as well.
-    PADDLE_ENFORCE_EQ(input_box->lod().size(), 1UL,
-                      "Only support 1 level of LoD.");
-    auto box_lod = input_box->lod().back();
-    int64_t n = static_cast<int64_t>(box_lod.size() - 1);
-    for (int64_t i = 0; i < n; ++i) {  // one ImInfo row per LoD sequence
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]);
-      Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]);
-      ClipTiledBoxes<T>(dev_ctx, im_info_slice, box_slice, &output_slice);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
deleted file mode 100644
index de3612677440596387f313e1ff59184cb3fdb7ae..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/box_coder_op.h"
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class BoxCoderOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
-                   "Input(PriorBox) of BoxCoderOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
-                   "Input(TargetBox) of BoxCoderOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
-                   "Output(OutputBox) of BoxCoderOp should not be null.");
-
-    auto prior_box_dims = ctx->GetInputDim("PriorBox");
-    auto target_box_dims = ctx->GetInputDim("TargetBox");
-    // PriorBox (and PriorBoxVar, if given) are only checked at runtime.
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
-                        "The rank of Input PriorBox must be 2");
-      PADDLE_ENFORCE_EQ(prior_box_dims[1], 4,
-                        "The shape of PriorBox is [N, 4]");
-      if (ctx->HasInput("PriorBoxVar")) {
-        auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
-        PADDLE_ENFORCE(prior_box_var_dims.size() == 2,
-                       "The rank of Input(PriorBoxVar) should be 2.");
-        PADDLE_ENFORCE_EQ(
-            prior_box_dims, prior_box_var_dims,
-            "The dimension of Input(PriorBoxVar) should be equal to "
-            "the dimension of Input(PriorBox) when the rank is 2.");
-      }
-    }
-    // Encode yields [N, M, 4]; decode keeps TargetBox's own shape.
-    auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
-    int axis = ctx->Attrs().Get<int>("axis");
-    if (code_type == BoxCodeType::kEncodeCenterSize) {
-      PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
-                        "The rank of Input TargetBox must be 2");
-      PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
-                        "The shape of TargetBox is [M, 4]");
-      ctx->SetOutputDim(
-          "OutputBox",
-          framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
-    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
-                        "The rank of Input TargetBox must be 3");
-      PADDLE_ENFORCE(axis == 0 || axis == 1, "axis must be 0 or 1");
-      if (ctx->IsRuntime()) {
-        if (axis == 0) {
-          PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
-        } else if (axis == 1) {
-          PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]);
-        }
-        PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
-      }
-      ctx->ShareDim("TargetBox", /*->*/ "OutputBox");
-    }
-
-    if (code_type == BoxCodeType::kDecodeCenterSize && axis == 1) {
-      ctx->ShareLoD("PriorBox", /*->*/ "OutputBox");
-    } else {
-      ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
-    }
-  }
-};
-
-class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {  // declares the op's inputs, attrs, output and doc
-    AddInput(
-        "PriorBox",
-        "(Tensor, default Tensor<float>) "
-        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
-        "each box is represented as [xmin, ymin, xmax, ymax], "
-        "[xmin, ymin] is the left top coordinate of the anchor box, "
-        "if the input is image feature map, they are close to the origin "
-        "of the coordinate system. [xmax, ymax] is the right bottom "
-        "coordinate of the anchor box.");
-    AddInput("PriorBoxVar",
-             "(Tensor, default Tensor<float>, optional) "
-             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
-             "of variance. PriorBoxVar will set all elements to 1 by "
-             "default.")
-        .AsDispensable();
-    AddInput(
-        "TargetBox",
-        "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
-        "[N, 4] when code_type is 'encode_center_size'. This input also can "
-        "be a 3-D Tensor with shape [N, M, 4] when code_type is "
-        "'decode_center_size'. [N, 4], each box is represented as "
-        "[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate "
-        "of the box if the input is image feature map, they are close to "
-        "the origin of the coordinate system. [xmax, ymax] is the right "
-        "bottom coordinate of the box. This tensor can contain LoD "
-        "information to represent a batch of inputs. One instance of this "
-        "batch can contain different numbers of entities.");
-    AddAttr<std::string>("code_type",
-                         "(string, default encode_center_size) "
-                         "the code type used with the target box")
-        .SetDefault("encode_center_size")
-        .InEnum({"encode_center_size", "decode_center_size"});
-    AddAttr<bool>("box_normalized",
-                  "(bool, default true) "
-                  "whether treat the priorbox as a normalized box")
-        .SetDefault(true);
-    AddAttr<int>("axis",
-                 "(int, default 0) "
-                 "which axis in PriorBox to broadcast for box decode, "
-                 "for example, if axis is 0 and TargetBox has shape "
-                 "[N, M, 4] and PriorBox has shape [M, 4], then PriorBox "
-                 "will broadcast to [N, M, 4] for decoding. It is only valid "
-                 "when code type is decode_center_size")
-        .SetDefault(0)
-        .InEnum({0, 1});
-    AddAttr<std::vector<float>>(
-        "variance",
-        "(vector<float>, default {}), "
-        "variance of prior box with shape [4]. PriorBoxVar and variance can"
-        "not be provided at the same time.")
-        .SetDefault(std::vector<float>{});
-    AddOutput("OutputBox",
-              "(LoDTensor or Tensor) "
-              "When code_type is 'encode_center_size', the output tensor of "
-              "box_coder_op with shape [N, M, 4] representing the result of N "
-              "target boxes encoded with M Prior boxes and variances. When "
-              "code_type is 'decode_center_size', N represents the batch size "
-              "and M represents the number of decoded boxes.");
-
-    AddComment(R"DOC(
-
-Bounding Box Coder.
-
-Encode/Decode the target bounding box with the priorbox information.
-
-The Encoding schema described below:
-
-    ox = (tx - px) / pw / pxv
-
-    oy = (ty - py) / ph / pyv
-
-    ow = log(abs(tw / pw)) / pwv 
-
-    oh = log(abs(th / ph)) / phv 
-
-The Decoding schema described below:
-
-    ox = (pw * pxv * tx + px) - tw / 2
-
-    oy = (ph * pyv * ty + py) - th / 2
-
-    ow = exp(pwv * tw) * pw + tw / 2
-
-    oh = exp(phv * th) * ph + th / 2
-
-where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
-and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
-priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
-`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
-encoded/decoded coordinates, width and height. 
-
-During Box Decoding, two modes for broadcast are supported. Say target box has 
-shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior
-box will broadcast to target box along the assigned axis. 
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    box_coder, ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu
deleted file mode 100644
index b3dd142de77e2f8087ee4493378978f30b00fc58..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/box_coder_op.cu
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/detection/box_coder_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void EncodeCenterSizeKernel(
-    const T* prior_box_data, const T* prior_box_var_data,
-    const T* target_box_data, const int row, const int col, const int len,
-    const bool normalized, const T prior_box_var_size, const float* variance,
-    const int var_size, T* output) {  // prior_box_var_size is unused here
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < row * col) {  // idx addresses one (target row, prior col) pair
-    const int row_idx = idx / col;
-    const int col_idx = idx % col;
-    T prior_box_width = prior_box_data[col_idx * len + 2] -
-                        prior_box_data[col_idx * len] + (normalized == false);
-    T prior_box_height = prior_box_data[col_idx * len + 3] -
-                         prior_box_data[col_idx * len + 1] +
-                         (normalized == false);
-    T prior_box_center_x = prior_box_data[col_idx * len] + prior_box_width / 2;
-    T prior_box_center_y =
-        prior_box_data[col_idx * len + 1] + prior_box_height / 2;
-    // (normalized == false) adds 1 to each extent for unnormalized boxes.
-    T target_box_center_x =
-        (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
-        2;
-    T target_box_center_y = (target_box_data[row_idx * len + 3] +
-                             target_box_data[row_idx * len + 1]) /
-                            2;
-    T target_box_width = target_box_data[row_idx * len + 2] -
-                         target_box_data[row_idx * len] + (normalized == false);
-    T target_box_height = target_box_data[row_idx * len + 3] -
-                          target_box_data[row_idx * len + 1] +
-                          (normalized == false);
-    // Center offsets and log size ratios, both relative to the prior box.
-    output[idx * len] =
-        (target_box_center_x - prior_box_center_x) / prior_box_width;
-    output[idx * len + 1] =
-        (target_box_center_y - prior_box_center_y) / prior_box_height;
-    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
-    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
-    if (prior_box_var_data) {  // per-prior variances take precedence
-      int prior_var_offset = col_idx * len;
-      output[idx * len] /= prior_box_var_data[prior_var_offset];
-      output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
-      output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
-      output[idx * len + 3] /= prior_box_var_data[prior_var_offset + 3];
-    } else if (var_size == 4) {  // otherwise use the 4-element `variance`
-      for (int k = 0; k < 4; ++k) {
-        output[idx * len + k] /= static_cast<T>(variance[k]);
-      }
-    }
-  }
-}
-
-template <typename T>
-__global__ void DecodeCenterSizeKernel(
-    const T* prior_box_data, const T* prior_box_var_data,
-    const T* target_box_data, const int row, const int col, const int len,
-    const bool normalized, const T prior_box_var_size, const float* variance,
-    const int var_size, const int axis, T* output) {  // ..._var_size unused
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  int prior_box_offset = 0;
-  if (idx < row * col) {  // idx addresses one box of the row x col grid
-    const int col_idx = idx % col;
-    const int row_idx = idx / col;
-    prior_box_offset = axis == 0 ? col_idx * len : row_idx * len;
-    T prior_box_width = prior_box_data[prior_box_offset + 2] -
-                        prior_box_data[prior_box_offset] +
-                        (normalized == false);
-    T prior_box_height = prior_box_data[prior_box_offset + 3] -
-                         prior_box_data[prior_box_offset + 1] +
-                         (normalized == false);
-    T prior_box_center_x =
-        prior_box_data[prior_box_offset] + prior_box_width / 2;
-    T prior_box_center_y =
-        prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
-    T target_box_width, target_box_height;
-    T target_box_center_x, target_box_center_y;
-    T box_var_x = T(1), box_var_y = T(1);  // variance is 1 unless supplied
-    T box_var_w = T(1), box_var_h = T(1);
-    if (prior_box_var_data) {  // per-prior variances take precedence
-      int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
-      box_var_x = prior_box_var_data[prior_var_offset];
-      box_var_y = prior_box_var_data[prior_var_offset + 1];
-      box_var_w = prior_box_var_data[prior_var_offset + 2];
-      box_var_h = prior_box_var_data[prior_var_offset + 3];
-    } else if (var_size == 4) {  // otherwise use the 4-element `variance`
-      box_var_x = static_cast<T>(variance[0]);
-      box_var_y = static_cast<T>(variance[1]);
-      box_var_w = static_cast<T>(variance[2]);
-      box_var_h = static_cast<T>(variance[3]);
-    }
-    target_box_width =
-        exp(box_var_w * target_box_data[idx * len + 2]) * prior_box_width;
-    target_box_height =
-        exp(box_var_h * target_box_data[idx * len + 3]) * prior_box_height;
-    target_box_center_x =
-        box_var_x * target_box_data[idx * len] * prior_box_width +
-        prior_box_center_x;
-    target_box_center_y =
-        box_var_y * target_box_data[idx * len + 1] * prior_box_height +
-        prior_box_center_y;
-    // Write center -/+ half size; the upper pair drops 1 if not normalized.
-    output[idx * len] = target_box_center_x - target_box_width / 2;
-    output[idx * len + 1] = target_box_center_y - target_box_height / 2;
-    output[idx * len + 2] =
-        target_box_center_x + target_box_width / 2 - (normalized == false);
-    output[idx * len + 3] =
-        target_box_center_y + target_box_height / 2 - (normalized == false);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class BoxCoderCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
-    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
-    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto* output_box = context.Output<framework::Tensor>("OutputBox");
-    std::vector<float> variance = context.Attr<std::vector<float>>("variance");
-    const T* prior_box_data = prior_box->data<T>();
-    const T* target_box_data = target_box->data<T>();
-    const T* prior_box_var_data = nullptr;
-    auto prior_box_var_size = 0;
-    if (prior_box_var) {
-      PADDLE_ENFORCE(variance.empty(),
-                     "Input 'PriorBoxVar' and attribute 'variance' should not "
-                     "be used at the same time.");
-      prior_box_var_data = prior_box_var->data<T>();
-      prior_box_var_size = prior_box_var->dims().size();
-    }
-    if (!(variance.empty())) {
-      PADDLE_ENFORCE(static_cast<int>(variance.size()) == 4,
-                     "Size of attribute 'variance' should be 4");
-    }
-
-    if (target_box->lod().size()) {
-      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
-                        "Only support 1 level of LoD.");
-    }
-    const int var_size = static_cast<int>(variance.size());
-
-    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
-    bool normalized = context.Attr<bool>("box_normalized");
-    int axis = context.Attr<int>("axis");
-
-    auto row = target_box->dims()[0];
-    auto col = prior_box->dims()[0];
-    if (code_type == BoxCodeType::kDecodeCenterSize) {
-      col = target_box->dims()[1];
-    }
-    auto len = prior_box->dims()[1];
-    int block = 512;
-    int grid = (row * col + block - 1) / block;  // ceil(row * col / block)
-    auto& device_ctx = context.cuda_device_context();
-    // Copy `variance` to the device; data() stays valid when it is empty.
-    int bytes = var_size * sizeof(float);
-    auto dev_var = memory::Alloc(device_ctx, bytes);
-    float* dev_var_data = reinterpret_cast<float*>(dev_var->ptr());
-    auto cplace = platform::CPUPlace();
-    const auto gplace = boost::get<platform::CUDAPlace>(context.GetPlace());
-    memory::Copy(gplace, dev_var_data, cplace, variance.data(), bytes,
-                 device_ctx.stream());
-
-    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
-    T* output = output_box->data<T>();
-
-    if (code_type == BoxCodeType::kEncodeCenterSize) {
-      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
-          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          normalized, prior_box_var_size, dev_var_data, var_size, output);
-    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
-          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          normalized, prior_box_var_size, dev_var_data, var_size, axis, output);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    box_coder,
-    ops::BoxCoderCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::BoxCoderCUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
deleted file mode 100644
index d4c7e8cf7723bf83d3cd8bf36b9ae6c5f1c35b10..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
-
-inline BoxCodeType GetBoxCodeType(const std::string &type) {
-  if (type == "encode_center_size") {
-    return BoxCodeType::kEncodeCenterSize;
-  } else if (type == "decode_center_size") {
-    return BoxCodeType::kDecodeCenterSize;
-  }
-  PADDLE_THROW("Not support type %s.", type);
-}
-
-template <typename DeviceContext, typename T>
-class BoxCoderKernel : public framework::OpKernel<T> {
- public:
-  void EncodeCenterSize(const framework::Tensor *target_box,
-                        const framework::Tensor *prior_box,
-                        const framework::Tensor *prior_box_var,
-                        const bool normalized,
-                        const std::vector<float> variance, T *output) const {
-    int64_t row = target_box->dims()[0];
-    int64_t col = prior_box->dims()[0];
-    int64_t len = prior_box->dims()[1];
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(2)
-#endif
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        auto *target_box_data = target_box->data<T>();
-        auto *prior_box_data = prior_box->data<T>();
-        size_t offset = i * col * len + j * len;
-        T prior_box_width = prior_box_data[j * len + 2] -
-                            prior_box_data[j * len] + (normalized == false);
-        T prior_box_height = prior_box_data[j * len + 3] -
-                             prior_box_data[j * len + 1] +
-                             (normalized == false);
-        T prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
-        T prior_box_center_y =
-            prior_box_data[j * len + 1] + prior_box_height / 2;
-
-        T target_box_center_x =
-            (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
-        T target_box_center_y =
-            (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
-        T target_box_width = target_box_data[i * len + 2] -
-                             target_box_data[i * len] + (normalized == false);
-        T target_box_height = target_box_data[i * len + 3] -
-                              target_box_data[i * len + 1] +
-                              (normalized == false);
-
-        output[offset] =
-            (target_box_center_x - prior_box_center_x) / prior_box_width;
-        output[offset + 1] =
-            (target_box_center_y - prior_box_center_y) / prior_box_height;
-        output[offset + 2] =
-            std::log(std::fabs(target_box_width / prior_box_width));
-        output[offset + 3] =
-            std::log(std::fabs(target_box_height / prior_box_height));
-      }
-    }
-
-    if (prior_box_var) {
-      const T *prior_box_var_data = prior_box_var->data<T>();
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(3)
-#endif
-      for (int64_t i = 0; i < row; ++i) {
-        for (int64_t j = 0; j < col; ++j) {
-          for (int k = 0; k < 4; ++k) {
-            size_t offset = i * col * len + j * len;
-            int prior_var_offset = j * len;
-            output[offset + k] /= prior_box_var_data[prior_var_offset + k];
-          }
-        }
-      }
-    } else if (!(variance.empty())) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(3)
-#endif
-      for (int64_t i = 0; i < row; ++i) {
-        for (int64_t j = 0; j < col; ++j) {
-          for (int k = 0; k < 4; ++k) {
-            size_t offset = i * col * len + j * len;
-            output[offset + k] /= static_cast<T>(variance[k]);
-          }
-        }
-      }
-    }
-  }
-
-  template <int axis, int var_size>
-  void DecodeCenterSize(const framework::Tensor *target_box,
-                        const framework::Tensor *prior_box,
-                        const framework::Tensor *prior_box_var,
-                        const bool normalized, std::vector<float> variance,
-                        T *output) const {
-    int64_t row = target_box->dims()[0];
-    int64_t col = target_box->dims()[1];
-    int64_t len = target_box->dims()[2];
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(2)
-#endif
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        auto *target_box_data = target_box->data<T>();
-        auto *prior_box_data = prior_box->data<T>();
-
-        T var_data[4] = {1., 1., 1., 1.};
-        T *var_ptr = var_data;
-        size_t offset = i * col * len + j * len;
-        int prior_box_offset = axis == 0 ? j * len : i * len;
-
-        T prior_box_width = prior_box_data[prior_box_offset + 2] -
-                            prior_box_data[prior_box_offset] +
-                            (normalized == false);
-        T prior_box_height = prior_box_data[prior_box_offset + 3] -
-                             prior_box_data[prior_box_offset + 1] +
-                             (normalized == false);
-        T prior_box_center_x =
-            prior_box_data[prior_box_offset] + prior_box_width / 2;
-        T prior_box_center_y =
-            prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
-
-        T target_box_center_x = 0, target_box_center_y = 0;
-        T target_box_width = 0, target_box_height = 0;
-        int prior_var_offset = axis == 0 ? j * len : i * len;
-        if (var_size == 2) {
-          std::memcpy(var_ptr, prior_box_var->data<T>() + prior_var_offset,
-                      4 * sizeof(T));
-        } else if (var_size == 1) {
-          var_ptr = reinterpret_cast<T *>(variance.data());
-        }
-        T box_var_x = *var_ptr;
-        T box_var_y = *(var_ptr + 1);
-        T box_var_w = *(var_ptr + 2);
-        T box_var_h = *(var_ptr + 3);
-
-        target_box_center_x =
-            box_var_x * target_box_data[offset] * prior_box_width +
-            prior_box_center_x;
-        target_box_center_y =
-            box_var_y * target_box_data[offset + 1] * prior_box_height +
-            prior_box_center_y;
-        target_box_width =
-            std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
-        target_box_height = std::exp(box_var_h * target_box_data[offset + 3]) *
-                            prior_box_height;
-
-        output[offset] = target_box_center_x - target_box_width / 2;
-        output[offset + 1] = target_box_center_y - target_box_height / 2;
-        output[offset + 2] =
-            target_box_center_x + target_box_width / 2 - (normalized == false);
-        output[offset + 3] =
-            target_box_center_y + target_box_height / 2 - (normalized == false);
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *prior_box = context.Input<framework::Tensor>("PriorBox");
-    auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
-    auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto *output_box = context.Output<framework::Tensor>("OutputBox");
-    std::vector<float> variance = context.Attr<std::vector<float>>("variance");
-    const int axis = context.Attr<int>("axis");
-    if (target_box->lod().size()) {
-      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
-                        "Only support 1 level of LoD.");
-    }
-    if (prior_box_var) {
-      PADDLE_ENFORCE(variance.empty(),
-                     "Input 'PriorBoxVar' and attribute 'variance' should not"
-                     "be used at the same time.");
-    }
-    if (!(variance.empty())) {
-      PADDLE_ENFORCE(static_cast<int>(variance.size()) == 4,
-                     "Size of attribute 'variance' should be 4");
-    }
-    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
-    bool normalized = context.Attr<bool>("box_normalized");
-
-    auto row = target_box->dims()[0];
-    auto col = prior_box->dims()[0];
-    if (code_type == BoxCodeType::kDecodeCenterSize) {
-      col = target_box->dims()[1];
-    }
-    auto len = prior_box->dims()[1];
-
-    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
-
-    T *output = output_box->data<T>();
-    if (code_type == BoxCodeType::kEncodeCenterSize) {
-      EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
-                       variance, output);
-    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      if (prior_box_var) {
-        if (axis == 0) {
-          DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var,
-                                 normalized, variance, output);
-        } else {
-          DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var,
-                                 normalized, variance, output);
-        }
-      } else if (!(variance.empty())) {
-        if (axis == 0) {
-          DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var,
-                                 normalized, variance, output);
-        } else {
-          DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var,
-                                 normalized, variance, output);
-        }
-      } else {
-        if (axis == 0) {
-          DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var,
-                                 normalized, variance, output);
-        } else {
-          DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var,
-                                 normalized, variance, output);
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
deleted file mode 100644
index 976aa317b8819b46fe3dd06c68d384fa6e34c6fd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-
-class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("PriorBox"),
-        "Input(PriorBox) of BoxDecoderAndAssignOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("PriorBoxVar"),
-        "Input(PriorBoxVar) of BoxDecoderAndAssignOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("TargetBox"),
-        "Input(TargetBox) of BoxDecoderAndAssignOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("BoxScore"),
-        "Input(BoxScore) of BoxDecoderAndAssignOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("DecodeBox"),
-        "Output(DecodeBox) of BoxDecoderAndAssignOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("OutputAssignBox"),
-        "Output(OutputAssignBox) of BoxDecoderAndAssignOp should not be null.");
-
-    auto prior_box_dims = ctx->GetInputDim("PriorBox");
-    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
-    auto target_box_dims = ctx->GetInputDim("TargetBox");
-    auto box_score_dims = ctx->GetInputDim("BoxScore");
-
-    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
-                      "The rank of Input of PriorBox must be 2");
-    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
-    PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 1,
-                      "The rank of Input of PriorBoxVar must be 1");
-    PADDLE_ENFORCE_EQ(prior_box_var_dims[0], 4,
-                      "The shape of PriorBoxVar is [4]");
-    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
-                      "The rank of Input of TargetBox must be 2");
-    PADDLE_ENFORCE_EQ(box_score_dims.size(), 2,
-                      "The rank of Input of BoxScore must be 2");
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0],
-                        "The first dim of prior_box and target_box is roi nums "
-                        "and should be same!");
-      PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0],
-                        "The first dim of prior_box and box_score is roi nums "
-                        "and should be same!");
-      PADDLE_ENFORCE_EQ(
-          target_box_dims[1], box_score_dims[1] * prior_box_dims[1],
-          "The shape of target_box is [N, classnum * 4], The shape "
-          "of box_score is [N, classnum], The shape of prior_box "
-          "is [N, 4]");
-    }
-    ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0],
-                                                         target_box_dims[1]}));
-    ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox");
-    ctx->SetOutputDim(
-        "OutputAssignBox",
-        framework::make_ddim({prior_box_dims[0], prior_box_dims[1]}));
-    ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox");
-  }
-};
-
-class BoxDecoderAndAssignOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "PriorBox",
-        "(Tensor, default Tensor<float>) "
-        "Box list PriorBox is a 2-D Tensor with shape [N, 4] which holds N "
-        "boxes and each box is represented as [xmin, ymin, xmax, ymax], "
-        "[xmin, ymin] is the left top coordinate of the anchor box, "
-        "if the input is image feature map, they are close to the origin "
-        "of the coordinate system. [xmax, ymax] is the right bottom "
-        "coordinate of the anchor box.");
-    AddInput("PriorBoxVar",
-             "(Tensor, default Tensor<float>, optional) "
-             "PriorBoxVar is a 2-D Tensor with shape [N, 4] which holds N "
-             "group of variance. PriorBoxVar will set all elements to 1 by "
-             "default.")
-        .AsDispensable();
-    AddInput("TargetBox",
-             "(LoDTensor or Tensor) "
-             "This input can be a 2-D LoDTensor with shape "
-             "[N, classnum*4]. It holds N targets for N boxes.");
-    AddInput("BoxScore",
-             "(LoDTensor or Tensor) "
-             "This input can be a 2-D LoDTensor with shape "
-             "[N, classnum], each box is represented as [classnum] which is "
-             "the classification probabilities.");
-    AddAttr<float>("box_clip",
-                   "(float, default 4.135, np.log(1000. / 16.)) "
-                   "clip box to prevent overflowing")
-        .SetDefault(4.135f);
-    AddOutput("DecodeBox",
-              "(LoDTensor or Tensor) "
-              "the output tensor of op with shape [N, classnum * 4] "
-              "representing the result of N target boxes decoded with "
-              "M Prior boxes and variances for each class.");
-    AddOutput("OutputAssignBox",
-              "(LoDTensor or Tensor) "
-              "the output tensor of op with shape [N, 4] "
-              "representing the result of N target boxes decoded with "
-              "M Prior boxes and variances with the best non-background class "
-              "by BoxScore.");
-    AddComment(R"DOC(
-
-Bounding Box Coder.
-
-Decode the target bounding box with the prior_box information.
-
-The Decoding schema is described below:
-
-    $$
-    ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2} 
-    $$
-    $$
-    oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2}
-    $$
-    $$
-    ow = \\exp (pwv \\times tw) \\times pw + \\frac{tw}{2}
-    $$
-    $$
-    oh = \\exp (phv \\times th) \\times ph + \\frac{th}{2}
-    $$
-
-where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
-and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
-prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
-`phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the
-decoded coordinates, width and height in decode_box. 
-
-decode_box is obtained after box decode, then assigning schema is described below:
-
-For each prior_box, use the best non-background class's decoded values to 
-update the prior_box locations and get output_assign_box. So, the shape of
-output_assign_box is the same as PriorBox.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(box_decoder_and_assign, ops::BoxDecoderAndAssignOp,
-                  ops::BoxDecoderAndAssignOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    box_decoder_and_assign,
-    ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu
deleted file mode 100644
index 25e6545eb59bde5e080dc907f9ecd4281062413f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void DecodeBoxKernel(const T* prior_box_data,
-                                const T* prior_box_var_data,
-                                const T* target_box_data, const int roi_num,
-                                const int class_num, const T box_clip,
-                                T* output_box_data) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < roi_num * class_num) {
-    int i = idx / class_num;
-    int j = idx % class_num;
-    T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
-    T prior_box_height =
-        prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
-    T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
-    T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;
-
-    int offset = i * class_num * 4 + j * 4;
-    T dw = prior_box_var_data[2] * target_box_data[offset + 2];
-    T dh = prior_box_var_data[3] * target_box_data[offset + 3];
-    if (dw > box_clip) {
-      dw = box_clip;
-    }
-    if (dh > box_clip) {
-      dh = box_clip;
-    }
-    T target_box_center_x = 0, target_box_center_y = 0;
-    T target_box_width = 0, target_box_height = 0;
-    target_box_center_x =
-        prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
-        prior_box_center_x;
-    target_box_center_y =
-        prior_box_var_data[1] * target_box_data[offset + 1] * prior_box_height +
-        prior_box_center_y;
-    target_box_width = expf(dw) * prior_box_width;
-    target_box_height = expf(dh) * prior_box_height;
-
-    output_box_data[offset] = target_box_center_x - target_box_width / 2;
-    output_box_data[offset + 1] = target_box_center_y - target_box_height / 2;
-    output_box_data[offset + 2] =
-        target_box_center_x + target_box_width / 2 - 1;
-    output_box_data[offset + 3] =
-        target_box_center_y + target_box_height / 2 - 1;
-  }
-}
-
-template <typename T>
-__global__ void AssignBoxKernel(const T* prior_box_data,
-                                const T* box_score_data, T* output_box_data,
-                                const int roi_num, const int class_num,
-                                T* output_assign_box_data) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < roi_num) {
-    int i = idx;
-    T max_score = -1;
-    int max_j = -1;
-    for (int j = 0; j < class_num; ++j) {
-      T score = box_score_data[i * class_num + j];
-      if (score > max_score && j > 0) {
-        max_score = score;
-        max_j = j;
-      }
-    }
-    if (max_j > 0) {
-      for (int pno = 0; pno < 4; pno++) {
-        output_assign_box_data[i * 4 + pno] =
-            output_box_data[i * class_num * 4 + max_j * 4 + pno];
-      }
-    } else {
-      for (int pno = 0; pno < 4; pno++) {
-        output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
-    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
-    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
-    auto* output_box = context.Output<framework::Tensor>("DecodeBox");
-    auto* output_assign_box =
-        context.Output<framework::Tensor>("OutputAssignBox");
-
-    auto roi_num = target_box->dims()[0];
-    auto class_num = box_score->dims()[1];
-    auto* target_box_data = target_box->data<T>();
-    auto* prior_box_data = prior_box->data<T>();
-    auto* prior_box_var_data = prior_box_var->data<T>();
-    auto* box_score_data = box_score->data<T>();
-    output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
-    output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
-    T* output_box_data = output_box->data<T>();
-    T* output_assign_box_data = output_assign_box->data<T>();
-
-    int block = 512;
-    int grid = (roi_num * class_num + block - 1) / block;
-    auto& device_ctx = context.cuda_device_context();
-
-    const T box_clip = context.Attr<T>("box_clip");
-
-    DecodeBoxKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
-        prior_box_data, prior_box_var_data, target_box_data, roi_num, class_num,
-        box_clip, output_box_data);
-
-    context.device_context().Wait();
-    int assign_grid = (roi_num + block - 1) / block;
-    AssignBoxKernel<T><<<assign_grid, block, 0, device_ctx.stream()>>>(
-        prior_box_data, box_score_data, output_box_data, roi_num, class_num,
-        output_assign_box_data);
-    context.device_context().Wait();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    box_decoder_and_assign,
-    ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
-                                       float>,
-    ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
-                                       double>);
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h
deleted file mode 100644
index e66a8351f4761fc805dbd2e44f237c751642d816..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class BoxDecoderAndAssignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
-    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
-    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
-    auto* output_box = context.Output<framework::Tensor>("DecodeBox");
-    auto* output_assign_box =
-        context.Output<framework::Tensor>("OutputAssignBox");
-    int roi_num = target_box->dims()[0];
-    int class_num = box_score->dims()[1];
-    auto* target_box_data = target_box->data<T>();
-    auto* prior_box_data = prior_box->data<T>();
-    auto* prior_box_var_data = prior_box_var->data<T>();
-    auto* box_score_data = box_score->data<T>();
-    output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
-    output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
-    T* output_box_data = output_box->data<T>();
-    T* output_assign_box_data = output_assign_box->data<T>();
-    const T bbox_clip = context.Attr<T>("box_clip");
-
-    for (int i = 0; i < roi_num; ++i) {
-      T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
-      T prior_box_height =
-          prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
-      T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
-      T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;
-      for (int j = 0; j < class_num; ++j) {
-        int64_t offset = i * class_num * 4 + j * 4;
-        T dw = std::min(prior_box_var_data[2] * target_box_data[offset + 2],
-                        bbox_clip);
-        T dh = std::min(prior_box_var_data[3] * target_box_data[offset + 3],
-                        bbox_clip);
-        T target_box_center_x = 0, target_box_center_y = 0;
-        T target_box_width = 0, target_box_height = 0;
-        target_box_center_x =
-            prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
-            prior_box_center_x;
-        target_box_center_y = prior_box_var_data[1] *
-                                  target_box_data[offset + 1] *
-                                  prior_box_height +
-                              prior_box_center_y;
-        target_box_width = std::exp(dw) * prior_box_width;
-        target_box_height = std::exp(dh) * prior_box_height;
-
-        output_box_data[offset] = target_box_center_x - target_box_width / 2;
-        output_box_data[offset + 1] =
-            target_box_center_y - target_box_height / 2;
-        output_box_data[offset + 2] =
-            target_box_center_x + target_box_width / 2 - 1;
-        output_box_data[offset + 3] =
-            target_box_center_y + target_box_height / 2 - 1;
-      }
-
-      T max_score = -1;
-      int max_j = -1;
-      for (int j = 0; j < class_num; ++j) {
-        T score = box_score_data[i * class_num + j];
-        if (score > max_score && j > 0) {
-          max_score = score;
-          max_j = j;
-        }
-      }
-
-      if (max_j > 0) {
-        for (int pno = 0; pno < 4; pno++) {
-          output_assign_box_data[i * 4 + pno] =
-              output_box_data[i * class_num * 4 + max_j * 4 + pno];
-        }
-      } else {
-        for (int pno = 0; pno < 4; pno++) {
-          output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
deleted file mode 100644
index 0603072835e8f146e5bb006d5759220900a29e56..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-     http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.*/
-
-#include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-class CollectFpnProposalsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInputs("MultiLevelRois"),
-                   "Inputs(MultiLevelRois) shouldn't be null");
-    PADDLE_ENFORCE(context->HasInputs("MultiLevelScores"),
-                   "Inputs(MultiLevelScores) shouldn't be null");
-    PADDLE_ENFORCE(context->HasOutput("FpnRois"),
-                   "Outputs(MultiFpnRois) of DistributeOp should not be null");
-    auto roi_dims = context->GetInputsDim("MultiLevelRois");
-    auto score_dims = context->GetInputsDim("MultiLevelScores");
-    auto post_nms_topN = context->Attrs().Get<int>("post_nms_topN");
-    std::vector<int64_t> out_dims;
-    for (auto &roi_dim : roi_dims) {
-      PADDLE_ENFORCE_EQ(roi_dim[1], 4,
-                        "Second dimension of Input(MultiLevelRois) must be 4");
-    }
-    for (auto &score_dim : score_dims) {
-      PADDLE_ENFORCE_EQ(
-          score_dim[1], 1,
-          "Second dimension of Input(MultiLevelScores) must be 1");
-    }
-    context->SetOutputDim("FpnRois", {post_nms_topN, 4});
-    if (!context->IsRuntime()) {  // Runtime LoD infershape will be computed
-      // in Kernel.
-      context->ShareLoD("MultiLevelRois", "FpnRois");
-    }
-    if (context->IsRuntime()) {
-      std::vector<framework::InferShapeVarPtr> roi_inputs =
-          context->GetInputVarPtrs("MultiLevelRois");
-      std::vector<framework::InferShapeVarPtr> score_inputs =
-          context->GetInputVarPtrs("MultiLevelScores");
-      for (size_t i = 0; i < roi_inputs.size(); ++i) {
-        framework::Variable *roi_var =
-            boost::get<framework::Variable *>(roi_inputs[i]);
-        framework::Variable *score_var =
-            boost::get<framework::Variable *>(score_inputs[i]);
-        auto &roi_lod = roi_var->Get<LoDTensor>().lod();
-        auto &score_lod = score_var->Get<LoDTensor>().lod();
-        PADDLE_ENFORCE_EQ(roi_lod, score_lod,
-                          "Inputs(MultiLevelRois) and Inputs(MultiLevelScores) "
-                          "should have same lod.");
-      }
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto data_type =
-        framework::GetDataTypeOfVar(ctx.MultiInputVar("MultiLevelRois")[0]);
-    return framework::OpKernelType(data_type, ctx.GetPlace());
-  }
-};
-
-class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("MultiLevelRois",
-             "(LoDTensor) Multiple roi LoDTensors from each level in shape "
-             "(N, 4), N is the number of RoIs")
-        .AsDuplicable();
-    AddInput("MultiLevelScores",
-             "(LoDTensor) Multiple score LoDTensors from each level in shape"
-             " (N, 1), N is the number of RoIs.")
-        .AsDuplicable();
-    AddOutput("FpnRois", "(LoDTensor) All selected RoIs with highest scores");
-    AddAttr<int>("post_nms_topN",
-                 "Select post_nms_topN RoIs from"
-                 " all images and all fpn layers");
-    AddComment(R"DOC(
-This operator concats all proposals from different images
- and different FPN levels. Then sort all of those proposals
-by objectness confidence. Select the post_nms_topN RoIs in
- total. Finally, re-sort the RoIs in the order of batch index. 
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(collect_fpn_proposals, ops::CollectFpnProposalsOp,
-                  ops::CollectFpnProposalsOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(collect_fpn_proposals,
-                       ops::CollectFpnProposalsOpKernel<float>,
-                       ops::CollectFpnProposalsOpKernel<double>);
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
deleted file mode 100644
index ba0b4ac010b75b916e2e9cfbc2d6d287a1790308..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-  Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-      http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/fluid/memory/allocation/allocator.h>
-#include "cub/cub.cuh"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/detection/bbox_util.h"
-#include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kNumCUDAThreads = 64;
-static constexpr int kNumMaxinumNumBlocks = 4096;
-
-const int kBBoxSize = 4;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-
-static __global__ void GetLengthLoD(const int nthreads, const int* batch_ids,
-                                    int* length_lod) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (nthreads);
-       i += blockDim.x * gridDim.x) {
-    platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto roi_ins = ctx.MultiInput<LoDTensor>("MultiLevelRois");
-    const auto score_ins = ctx.MultiInput<LoDTensor>("MultiLevelScores");
-    auto fpn_rois = ctx.Output<LoDTensor>("FpnRois");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    const int post_nms_topN = ctx.Attr<int>("post_nms_topN");
-
-    // concat inputs along axis = 0
-    int roi_offset = 0;
-    int score_offset = 0;
-    int total_roi_num = 0;
-    for (size_t i = 0; i < roi_ins.size(); ++i) {
-      total_roi_num += roi_ins[i]->dims()[0];
-    }
-
-    int real_post_num = min(post_nms_topN, total_roi_num);
-    fpn_rois->mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
-    Tensor concat_rois;
-    Tensor concat_scores;
-    T* concat_rois_data = concat_rois.mutable_data<T>(
-        {total_roi_num, kBBoxSize}, dev_ctx.GetPlace());
-    T* concat_scores_data =
-        concat_scores.mutable_data<T>({total_roi_num, 1}, dev_ctx.GetPlace());
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({total_roi_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
-    int index = 0;
-    int lod_size;
-    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
-
-    for (size_t i = 0; i < roi_ins.size(); ++i) {
-      auto roi_in = roi_ins[i];
-      auto score_in = score_ins[i];
-      auto roi_lod = roi_in->lod().back();
-      lod_size = roi_lod.size() - 1;
-      for (size_t n = 0; n < lod_size; ++n) {
-        for (size_t j = roi_lod[n]; j < roi_lod[n + 1]; ++j) {
-          roi_batch_id_data[index++] = n;
-        }
-      }
-
-      memory::Copy(place, concat_rois_data + roi_offset, place,
-                   roi_in->data<T>(), roi_in->numel() * sizeof(T),
-                   dev_ctx.stream());
-      memory::Copy(place, concat_scores_data + score_offset, place,
-                   score_in->data<T>(), score_in->numel() * sizeof(T),
-                   dev_ctx.stream());
-      roi_offset += roi_in->numel();
-      score_offset += score_in->numel();
-    }
-
-    // copy batch id list to GPU
-    Tensor roi_batch_id_list_gpu;
-    framework::TensorCopy(roi_batch_id_list, dev_ctx.GetPlace(),
-                          &roi_batch_id_list_gpu);
-
-    Tensor index_in_t;
-    int* idx_in =
-        index_in_t.mutable_data<int>({total_roi_num}, dev_ctx.GetPlace());
-    platform::ForRange<platform::CUDADeviceContext> for_range_total(
-        dev_ctx, total_roi_num);
-    for_range_total(RangeInitFunctor{0, 1, idx_in});
-
-    Tensor keys_out_t;
-    T* keys_out =
-        keys_out_t.mutable_data<T>({total_roi_num}, dev_ctx.GetPlace());
-    Tensor index_out_t;
-    int* idx_out =
-        index_out_t.mutable_data<int>({total_roi_num}, dev_ctx.GetPlace());
-
-    // Determine temporary device storage requirements
-    size_t temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairsDescending<T, int>(
-        nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
-        idx_out, total_roi_num);
-    // Allocate temporary storage
-    auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
-
-    // Run sorting operation
-    // sort score to get corresponding index
-    cub::DeviceRadixSort::SortPairsDescending<T, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
-        keys_out, idx_in, idx_out, total_roi_num);
-    index_out_t.Resize({real_post_num});
-    Tensor sorted_rois;
-    sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
-    Tensor sorted_batch_id;
-    sorted_batch_id.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
-    GPUGather<T>(dev_ctx, concat_rois, index_out_t, &sorted_rois);
-    GPUGather<int>(dev_ctx, roi_batch_id_list_gpu, index_out_t,
-                   &sorted_batch_id);
-
-    Tensor batch_index_t;
-    int* batch_idx_in =
-        batch_index_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
-    platform::ForRange<platform::CUDADeviceContext> for_range_post(
-        dev_ctx, real_post_num);
-    for_range_post(RangeInitFunctor{0, 1, batch_idx_in});
-
-    Tensor out_id_t;
-    int* out_id_data =
-        out_id_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
-    // Determine temporary device storage requirements
-    temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairs<int, int>(
-        nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
-        batch_idx_in, index_out_t.data<int>(), real_post_num);
-    // Allocate temporary storage
-    d_temp_storage = memory::Alloc(place, temp_storage_bytes);
-
-    // Run sorting operation
-    // sort batch_id to get corresponding index
-    cub::DeviceRadixSort::SortPairs<int, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
-        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
-
-    GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
-
-    Tensor length_lod;
-    int* length_lod_data =
-        length_lod.mutable_data<int>({lod_size}, dev_ctx.GetPlace());
-    math::SetConstant<platform::CUDADeviceContext, int> set_zero;
-    set_zero(dev_ctx, &length_lod, static_cast<int>(0));
-
-    int blocks = NumBlocks(real_post_num);
-    int threads = kNumCUDAThreads;
-
-    // get length-based lod by batch ids
-    GetLengthLoD<<<blocks, threads>>>(real_post_num, out_id_data,
-                                      length_lod_data);
-    std::vector<int> length_lod_cpu(lod_size);
-    memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place,
-                 length_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
-    dev_ctx.Wait();
-
-    std::vector<size_t> offset(1, 0);
-    for (int i = 0; i < lod_size; ++i) {
-      offset.emplace_back(offset.back() + length_lod_cpu[i]);
-    }
-
-    framework::LoD lod;
-    lod.emplace_back(offset);
-    fpn_rois->set_lod(lod);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    collect_fpn_proposals,
-    ops::GPUCollectFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
-                                        float>,
-    ops::GPUCollectFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
-                                        double>);
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
deleted file mode 100644
index 268f7e2160f59c4f1780b1c0968b1e886d27ed1d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.*/
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-const int kBoxDim = 4;
-
-template <typename T>
-struct ScoreWithID {
-  T score;
-  int batch_id;
-  int index;
-  int level;
-  ScoreWithID() {
-    batch_id = -1;
-    index = -1;
-    level = -1;
-  }
-  ScoreWithID(T score_, int batch_id_, int index_, int level_) {
-    score = score_;
-    batch_id = batch_id_;
-    index = index_;
-    level = level_;
-  }
-};
-template <typename T>
-static inline bool CompareByScore(ScoreWithID<T> a, ScoreWithID<T> b) {
-  return a.score >= b.score;
-}
-
-template <typename T>
-static inline bool CompareByBatchid(ScoreWithID<T> a, ScoreWithID<T> b) {
-  return a.batch_id < b.batch_id;
-}
-
-template <typename T>
-class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto multi_layer_rois =
-        context.MultiInput<paddle::framework::LoDTensor>("MultiLevelRois");
-
-    auto multi_layer_scores =
-        context.MultiInput<paddle::framework::LoDTensor>("MultiLevelScores");
-
-    auto* fpn_rois = context.Output<paddle::framework::LoDTensor>("FpnRois");
-
-    int post_nms_topN = context.Attr<int>("post_nms_topN");
-
-    PADDLE_ENFORCE_GE(post_nms_topN, 0UL,
-                      "The parameter post_nms_topN must be a positive integer");
-
-    // assert that the length of Rois and scores are same
-    PADDLE_ENFORCE(multi_layer_rois.size() == multi_layer_scores.size(),
-                   "DistributeFpnProposalsOp need 1 level of LoD");
-    // Check if the lod information of two LoDTensor is same
-    const int num_fpn_level = multi_layer_rois.size();
-    std::vector<int> integral_of_all_rois(num_fpn_level + 1, 0);
-    for (int i = 0; i < num_fpn_level; ++i) {
-      auto cur_rois_lod = multi_layer_rois[i]->lod().back();
-      integral_of_all_rois[i + 1] =
-          integral_of_all_rois[i] + cur_rois_lod[cur_rois_lod.size() - 1];
-    }
-
-    // concatenate all fpn rois scores into a list
-    // create a vector to store all scores
-    std::vector<ScoreWithID<T>> scores_of_all_rois(
-        integral_of_all_rois[num_fpn_level], ScoreWithID<T>());
-    for (int i = 0; i < num_fpn_level; ++i) {
-      const T* cur_level_scores = multi_layer_scores[i]->data<T>();
-      int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i];
-      auto cur_scores_lod = multi_layer_scores[i]->lod().back();
-      int cur_batch_id = 0;
-      for (int j = 0; j < cur_level_num; ++j) {
-        if (j >= cur_scores_lod[cur_batch_id + 1]) {
-          cur_batch_id++;
-        }
-        int cur_index = j + integral_of_all_rois[i];
-        scores_of_all_rois[cur_index].score = cur_level_scores[j];
-        scores_of_all_rois[cur_index].index = j;
-        scores_of_all_rois[cur_index].level = i;
-        scores_of_all_rois[cur_index].batch_id = cur_batch_id;
-      }
-    }
-    // keep top post_nms_topN rois
-    // sort the rois by the score
-    if (post_nms_topN > integral_of_all_rois[num_fpn_level]) {
-      post_nms_topN = integral_of_all_rois[num_fpn_level];
-    }
-    std::stable_sort(scores_of_all_rois.begin(), scores_of_all_rois.end(),
-                     CompareByScore<T>);
-    scores_of_all_rois.resize(post_nms_topN);
-    // sort by batch id
-    std::stable_sort(scores_of_all_rois.begin(), scores_of_all_rois.end(),
-                     CompareByBatchid<T>);
-    // create a pointer array
-    std::vector<const T*> multi_fpn_rois_data(num_fpn_level);
-    for (int i = 0; i < num_fpn_level; ++i) {
-      multi_fpn_rois_data[i] = multi_layer_rois[i]->data<T>();
-    }
-    // initialize the outputs
-    fpn_rois->mutable_data<T>({post_nms_topN, kBoxDim}, context.GetPlace());
-    T* fpn_rois_data = fpn_rois->data<T>();
-    std::vector<size_t> lod0(1, 0);
-    int cur_batch_id = 0;
-    for (int i = 0; i < post_nms_topN; ++i) {
-      int cur_fpn_level = scores_of_all_rois[i].level;
-      int cur_level_index = scores_of_all_rois[i].index;
-      memcpy(fpn_rois_data,
-             multi_fpn_rois_data[cur_fpn_level] + cur_level_index * kBoxDim,
-             kBoxDim * sizeof(T));
-      fpn_rois_data += kBoxDim;
-      if (scores_of_all_rois[i].batch_id != cur_batch_id) {
-        cur_batch_id = scores_of_all_rois[i].batch_id;
-        lod0.emplace_back(i);
-      }
-    }
-    lod0.emplace_back(post_nms_topN);
-    framework::LoD lod;
-    lod.emplace_back(lod0);
-    fpn_rois->set_lod(lod);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc
deleted file mode 100644
index cacd47ed4a80489c59cdd80747d69c70bd5ea286..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/density_prior_box_op.cc
+++ /dev/null
@@ -1,180 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/density_prior_box_op.h"
-
-namespace paddle {
-namespace operators {
-
-class DensityPriorBoxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of DensityPriorBoxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Image"),
-                   "Input(Image) of DensityPriorBoxOp should not be null.");
-
-    auto image_dims = ctx->GetInputDim("Image");
-    auto input_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW.");
-    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
-
-    PADDLE_ENFORCE_LT(input_dims[2], image_dims[2],
-                      "The height of input must smaller than image.");
-
-    PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
-                      "The width of input must smaller than image.");
-    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
-
-    auto fixed_sizes = ctx->Attrs().Get<std::vector<float>>("fixed_sizes");
-    auto fixed_ratios = ctx->Attrs().Get<std::vector<float>>("fixed_ratios");
-    auto densities = ctx->Attrs().Get<std::vector<int>>("densities");
-    bool flatten = ctx->Attrs().Get<bool>("flatten_to_2d");
-
-    PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(),
-                      "The number of fixed_sizes and densities must be equal.");
-    size_t num_priors = 0;
-    for (size_t i = 0; i < densities.size(); ++i) {
-      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-    }
-    if (!flatten) {
-      std::vector<int64_t> dim_vec(4);
-      dim_vec[0] = input_dims[2];
-      dim_vec[1] = input_dims[3];
-      dim_vec[2] = num_priors;
-      dim_vec[3] = 4;
-      ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
-      ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
-    } else {
-      int64_t dim0 = input_dims[2] * input_dims[3] * num_priors;
-      ctx->SetOutputDim("Boxes", {dim0, 4});
-      ctx->SetOutputDim("Variances", {dim0, 4});
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>("Input")->type(), ctx.GetPlace());
-  }
-};
-
-class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "Input",
-        "(Tensor, default Tensor<float>), "
-        "the input feature data of DensityPriorBoxOp, the layout is NCHW.");
-    AddInput("Image",
-             "(Tensor, default Tensor<float>), "
-             "the input image data of DensityPriorBoxOp, the layout is NCHW.");
-    AddOutput("Boxes",
-              "(Tensor, default Tensor<float>), the output prior boxes of "
-              "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. "
-              "H is the height of input, W is the width of input, num_priors "
-              "is the box count of each position.");
-    AddOutput("Variances",
-              "(Tensor, default Tensor<float>), the expanded variances of "
-              "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. "
-              "H is the height of input, W is the width of input, num_priors "
-              "is the box count of each position.");
-    AddAttr<std::vector<float>>("variances",
-                                "(vector<float>) List of variances to be "
-                                "encoded in density prior boxes.")
-        .AddCustomChecker([](const std::vector<float>& variances) {
-          PADDLE_ENFORCE_EQ(variances.size(), 4,
-                            "Must and only provide 4 variance.");
-          for (size_t i = 0; i < variances.size(); ++i) {
-            PADDLE_ENFORCE_GT(variances[i], 0.0,
-                              "variance[%d] must be greater than 0.", i);
-          }
-        });
-    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
-        .SetDefault(true);
-    AddAttr<bool>("flatten_to_2d",
-                  "(bool) Whether to flatten to 2D and "
-                  "the second dim is 4.")
-        .SetDefault(false);
-    AddAttr<float>(
-        "step_w",
-        "Density prior boxes step across width, 0.0 for auto calculation.")
-        .SetDefault(0.0)
-        .AddCustomChecker([](const float& step_w) {
-          PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0.");
-        });
-    AddAttr<float>(
-        "step_h",
-        "Density prior boxes step across height, 0.0 for auto calculation.")
-        .SetDefault(0.0)
-        .AddCustomChecker([](const float& step_h) {
-          PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0.");
-        });
-
-    AddAttr<float>("offset",
-                   "(float) "
-                   "Density prior boxes center offset.")
-        .SetDefault(0.5);
-    AddAttr<std::vector<float>>("fixed_sizes",
-                                "(vector<float>) List of fixed sizes "
-                                "of generated density prior boxes.")
-        .SetDefault(std::vector<float>{})
-        .AddCustomChecker([](const std::vector<float>& fixed_sizes) {
-          for (size_t i = 0; i < fixed_sizes.size(); ++i) {
-            PADDLE_ENFORCE_GT(fixed_sizes[i], 0.0,
-                              "fixed_sizes[%d] should be larger than 0.", i);
-          }
-        });
-
-    AddAttr<std::vector<float>>("fixed_ratios",
-                                "(vector<float>) List of fixed ratios "
-                                "of generated density prior boxes.")
-        .SetDefault(std::vector<float>{})
-        .AddCustomChecker([](const std::vector<float>& fixed_ratios) {
-          for (size_t i = 0; i < fixed_ratios.size(); ++i) {
-            PADDLE_ENFORCE_GT(fixed_ratios[i], 0.0,
-                              "fixed_ratios[%d] should be larger than 0.", i);
-          }
-        });
-
-    AddAttr<std::vector<int>>("densities",
-                              "(vector<float>) List of densities "
-                              "of generated density prior boxes.")
-        .SetDefault(std::vector<int>{})
-        .AddCustomChecker([](const std::vector<int>& densities) {
-          for (size_t i = 0; i < densities.size(); ++i) {
-            PADDLE_ENFORCE_GT(densities[i], 0,
-                              "densities[%d] should be larger than 0.", i);
-          }
-        });
-    AddComment(R"DOC(
-        Density Prior box operator
-        Each position of the input produce N density prior boxes, N is determined by
-        the count of fixed_ratios, densities, the calculation of N is as follows:
-        for density in densities:
-        N += size(fixed_ratios)*density^2
-        )DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(density_prior_box, ops::DensityPriorBoxOp,
-                  ops::DensityPriorBoxOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(density_prior_box, ops::DensityPriorBoxOpKernel<float>,
-                       ops::DensityPriorBoxOpKernel<double>);
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu
deleted file mode 100644
index 6337a4837a64cef2ce0e7bae70d8ba5b8994958e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/density_prior_box_op.cu
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/density_prior_box_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static __device__ inline T Clip(T in) {
-  return min(max(in, 0.), 1.);
-}
-
-template <typename T>
-static __global__ void GenDensityPriorBox(
-    const int height, const int width, const int im_height, const int im_width,
-    const T offset, const T step_width, const T step_height,
-    const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin,
-    const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) {
-  int gidx = blockIdx.x * blockDim.x + threadIdx.x;
-  int gidy = blockIdx.y * blockDim.y + threadIdx.y;
-  int step_x = blockDim.x * gridDim.x;
-  int step_y = blockDim.y * gridDim.y;
-
-  const T* width_ratio = ratios_shift;
-  const T* height_ratio = ratios_shift + num_priors;
-  const T* width_shift = ratios_shift + 2 * num_priors;
-  const T* height_shift = ratios_shift + 3 * num_priors;
-
-  for (int j = gidy; j < height; j += step_y) {
-    for (int i = gidx; i < width * num_priors; i += step_x) {
-      int h = j;
-      int w = i / num_priors;
-      int k = i % num_priors;
-
-      T center_x = (w + offset) * step_width;
-      T center_y = (h + offset) * step_height;
-
-      T center_x_temp = center_x + width_shift[k];
-      T center_y_temp = center_y + height_shift[k];
-
-      T box_width_ratio = width_ratio[k] / 2.;
-      T box_height_ratio = height_ratio[k] / 2.;
-
-      T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.);
-      T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.);
-      T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.);
-      T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.);
-
-      int out_offset = (j * width * num_priors + i) * 4;
-      out[out_offset] = is_clip ? Clip<T>(xmin) : xmin;
-      out[out_offset + 1] = is_clip ? Clip<T>(ymin) : ymin;
-      out[out_offset + 2] = is_clip ? Clip<T>(xmax) : xmax;
-      out[out_offset + 3] = is_clip ? Clip<T>(ymax) : ymax;
-
-      var[out_offset] = var_xmin;
-      var[out_offset + 1] = var_ymin;
-      var[out_offset + 2] = var_xmax;
-      var[out_offset + 3] = var_ymax;
-    }
-  }
-}
-
-template <typename T>
-class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
-    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
-    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
-    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
-
-    auto variances = ctx.Attr<std::vector<float>>("variances");
-    auto is_clip = ctx.Attr<bool>("clip");
-
-    auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
-    auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
-    auto densities = ctx.Attr<std::vector<int>>("densities");
-
-    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
-    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
-    T offset = static_cast<T>(ctx.Attr<float>("offset"));
-
-    auto img_width = image->dims()[3];
-    auto img_height = image->dims()[2];
-
-    auto feature_width = input->dims()[3];
-    auto feature_height = input->dims()[2];
-
-    T step_width, step_height;
-    if (step_w == 0 || step_h == 0) {
-      step_width = static_cast<T>(img_width) / feature_width;
-      step_height = static_cast<T>(img_height) / feature_height;
-    } else {
-      step_width = step_w;
-      step_height = step_h;
-    }
-
-    int num_priors = 0;
-    for (size_t i = 0; i < densities.size(); ++i) {
-      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-    }
-    int step_average = static_cast<int>((step_width + step_height) * 0.5);
-
-    framework::Tensor h_temp;
-    T* tdata = h_temp.mutable_data<T>({num_priors * 4}, platform::CPUPlace());
-    int idx = 0;
-    for (size_t s = 0; s < fixed_sizes.size(); ++s) {
-      auto fixed_size = fixed_sizes[s];
-      int density = densities[s];
-      for (size_t r = 0; r < fixed_ratios.size(); ++r) {
-        float ar = fixed_ratios[r];
-        int shift = step_average / density;
-        float box_width_ratio = fixed_size * sqrt(ar);
-        float box_height_ratio = fixed_size / sqrt(ar);
-        for (int di = 0; di < density; ++di) {
-          for (int dj = 0; dj < density; ++dj) {
-            float center_x_temp = shift / 2. + dj * shift - step_average / 2.;
-            float center_y_temp = shift / 2. + di * shift - step_average / 2.;
-            tdata[idx] = box_width_ratio;
-            tdata[num_priors + idx] = box_height_ratio;
-            tdata[2 * num_priors + idx] = center_x_temp;
-            tdata[3 * num_priors + idx] = center_y_temp;
-            idx++;
-          }
-        }
-      }
-    }
-
-    boxes->mutable_data<T>(ctx.GetPlace());
-    vars->mutable_data<T>(ctx.GetPlace());
-
-    framework::Tensor d_temp;
-    framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp);
-
-    // At least use 32 threads, at most 512 threads.
-    // blockx is multiple of 32.
-    int blockx = std::min(
-        static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5),
-        static_cast<int64_t>(512L));
-    int gridx = (feature_width * num_priors + blockx - 1) / blockx;
-    dim3 threads(blockx, 1);
-    dim3 grids(gridx, feature_height);
-
-    auto stream =
-        ctx.template device_context<platform::CUDADeviceContext>().stream();
-    GenDensityPriorBox<T><<<grids, threads, 0, stream>>>(
-        feature_height, feature_width, img_height, img_width, offset,
-        step_width, step_height, num_priors, d_temp.data<T>(), is_clip,
-        variances[0], variances[1], variances[2], variances[3],
-        boxes->data<T>(), vars->data<T>());
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(density_prior_box,
-                        ops::DensityPriorBoxOpCUDAKernel<float>,
-                        ops::DensityPriorBoxOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
deleted file mode 100644
index 42137215e21af1a529563ecc995a54d610120beb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/operators/detection/prior_box_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
-    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
-    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
-    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
-
-    auto variances = ctx.Attr<std::vector<float>>("variances");
-    auto clip = ctx.Attr<bool>("clip");
-
-    auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
-    auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
-    auto densities = ctx.Attr<std::vector<int>>("densities");
-
-    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
-    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
-    T offset = static_cast<T>(ctx.Attr<float>("offset"));
-
-    auto img_width = image->dims()[3];
-    auto img_height = image->dims()[2];
-
-    auto feature_width = input->dims()[3];
-    auto feature_height = input->dims()[2];
-
-    T step_width, step_height;
-    if (step_w == 0 || step_h == 0) {
-      step_width = static_cast<T>(img_width) / feature_width;
-      step_height = static_cast<T>(img_height) / feature_height;
-    } else {
-      step_width = step_w;
-      step_height = step_h;
-    }
-    int num_priors = 0;
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for reduction(+ : num_priors)
-#endif
-    for (size_t i = 0; i < densities.size(); ++i) {
-      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-    }
-
-    boxes->mutable_data<T>(ctx.GetPlace());
-    vars->mutable_data<T>(ctx.GetPlace());
-
-    auto box_dim = vars->dims();
-    boxes->Resize({feature_height, feature_width, num_priors, 4});
-    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
-    int step_average = static_cast<int>((step_width + step_height) * 0.5);
-
-    std::vector<float> sqrt_fixed_ratios;
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-    for (size_t i = 0; i < fixed_ratios.size(); i++) {
-      sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
-    }
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(2)
-#endif
-    for (int h = 0; h < feature_height; ++h) {
-      for (int w = 0; w < feature_width; ++w) {
-        T center_x = (w + offset) * step_width;
-        T center_y = (h + offset) * step_height;
-        int idx = 0;
-        // Generate density prior boxes with fixed sizes.
-        for (size_t s = 0; s < fixed_sizes.size(); ++s) {
-          auto fixed_size = fixed_sizes[s];
-          int density = densities[s];
-          int shift = step_average / density;
-          // Generate density prior boxes with fixed ratios.
-          for (size_t r = 0; r < fixed_ratios.size(); ++r) {
-            float box_width_ratio = fixed_size * sqrt_fixed_ratios[r];
-            float box_height_ratio = fixed_size / sqrt_fixed_ratios[r];
-            float density_center_x = center_x - step_average / 2. + shift / 2.;
-            float density_center_y = center_y - step_average / 2. + shift / 2.;
-            for (int di = 0; di < density; ++di) {
-              for (int dj = 0; dj < density; ++dj) {
-                float center_x_temp = density_center_x + dj * shift;
-                float center_y_temp = density_center_y + di * shift;
-                e_boxes(h, w, idx, 0) = std::max(
-                    (center_x_temp - box_width_ratio / 2.) / img_width, 0.);
-                e_boxes(h, w, idx, 1) = std::max(
-                    (center_y_temp - box_height_ratio / 2.) / img_height, 0.);
-                e_boxes(h, w, idx, 2) = std::min(
-                    (center_x_temp + box_width_ratio / 2.) / img_width, 1.);
-                e_boxes(h, w, idx, 3) = std::min(
-                    (center_y_temp + box_height_ratio / 2.) / img_height, 1.);
-                idx++;
-              }
-            }
-          }
-        }
-      }
-    }
-    if (clip) {
-      T* dt = boxes->data<T>();
-      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
-        return std::min<T>(std::max<T>(v, 0.), 1.);
-      });
-    }
-    framework::Tensor var_t;
-    var_t.mutable_data<T>(
-        framework::make_ddim({1, static_cast<int>(variances.size())}),
-        ctx.GetPlace());
-
-    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
-
-    for (size_t i = 0; i < variances.size(); ++i) {
-      var_et(0, i) = variances[i];
-    }
-
-    int box_num = feature_height * feature_width * num_priors;
-    auto var_dim = vars->dims();
-    vars->Resize({box_num, static_cast<int>(variances.size())});
-
-    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(2)
-#endif
-    for (int i = 0; i < box_num; ++i) {
-      for (size_t j = 0; j < variances.size(); ++j) {
-        e_vars(i, j) = variances[j];
-      }
-    }
-
-    vars->Resize(var_dim);
-    boxes->Resize(box_dim);
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
deleted file mode 100644
index 4cc989b6325f4da0cb38dd25a1529178a9af2268..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
-
-namespace paddle {
-namespace operators {
-
-class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("FpnRois"),
-                   "Input(FpnRois) shouldn't be null");
-    PADDLE_ENFORCE_GE(
-        ctx->Outputs("MultiFpnRois").size(), 1UL,
-        "Outputs(MultiFpnRois) of DistributeOp should not be empty");
-    size_t min_level = static_cast<size_t>(ctx->Attrs().Get<int>("min_level"));
-    size_t max_level = static_cast<size_t>(ctx->Attrs().Get<int>("max_level"));
-    PADDLE_ENFORCE_GE(max_level, min_level,
-                      "max_level must not lower than min_level");
-    // Set the output shape
-    size_t num_out_rois = max_level - min_level + 1;
-    std::vector<framework::DDim> outs_dims;
-    outs_dims.reserve(num_out_rois);
-    for (size_t i = 0; i < num_out_rois; ++i) {
-      framework::DDim out_dim = {-1, 4};
-      outs_dims.push_back(out_dim);
-    }
-    ctx->SetOutputsDim("MultiFpnRois", outs_dims);
-    ctx->SetOutputDim("RestoreIndex", {-1, 1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)");
-    AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
-        .AsDuplicable();
-    AddOutput("RestoreIndex",
-              "(Tensor) An array of positive number which is "
-              "used to restore the order of FpnRois");
-    AddAttr<int>("min_level",
-                 "The lowest level of FPN layer where the"
-                 " proposals come from");
-    AddAttr<int>("max_level",
-                 "The highest level of FPN layer where the"
-                 " proposals come from");
-    AddAttr<int>("refer_level",
-                 "The referring level of FPN layer with"
-                 " specified scale");
-    AddAttr<int>("refer_scale",
-                 "The referring scale of FPN layer with"
-                 " specified level");
-    AddComment(R"DOC(
-This operator distribute all proposals into different fpn level,
- with respect to scale of the proposals, the referring scale and
- the referring level. Besides, to restore the order of proposals,
-we return an array which indicate the original index of rois in
- current proposals.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp,
-                  ops::DistributeFpnProposalsOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
-                       ops::DistributeFpnProposalsOpKernel<float>,
-                       ops::DistributeFpnProposalsOpKernel<double>);
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
deleted file mode 100644
index f34866360f91b8e75d8e0e89425ba2b2e83af8af..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/fluid/memory/allocation/allocator.h>
-#include "cub/cub.cuh"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/detection/bbox_util.h"
-#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kNumCUDAThreads = 64;
-static constexpr int kNumMaxinumNumBlocks = 4096;
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-int const BBoxSize = 4;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-
-template <class T>
-__global__ void GPUDistFpnProposalsHelper(
-    const int nthreads, const T* rois, const int lod_size,
-    const int refer_level, const int refer_scale, const int max_level,
-    const int min_level, int* roi_batch_id_data, int* sub_lod_list,
-    int* target_lvls) {
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    const T* offset_roi = rois + i * BBoxSize;
-    int roi_batch_ind = roi_batch_id_data[i];
-    // get the target level of current rois
-    T roi_area = RoIArea(offset_roi, false);
-    T roi_scale = sqrt(roi_area);
-    int tgt_lvl = floor(
-        log2(roi_scale / static_cast<T>(refer_scale) + (T)1e-6) + refer_level);
-    tgt_lvl = min(max_level, max(tgt_lvl, min_level));
-    target_lvls[i] = tgt_lvl;
-    // compute number of rois in the same batch and same target level
-    platform::CudaAtomicAdd(
-        sub_lod_list + (tgt_lvl - min_level) * lod_size + roi_batch_ind, 1);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* fpn_rois = ctx.Input<paddle::framework::LoDTensor>("FpnRois");
-
-    auto multi_fpn_rois = ctx.MultiOutput<LoDTensor>("MultiFpnRois");
-    auto* restore_index = ctx.Output<Tensor>("RestoreIndex");
-
-    const int min_level = ctx.Attr<int>("min_level");
-    const int max_level = ctx.Attr<int>("max_level");
-    const int refer_level = ctx.Attr<int>("refer_level");
-    const int refer_scale = ctx.Attr<int>("refer_scale");
-    int num_level = max_level - min_level + 1;
-
-    // check that the fpn_rois is not empty
-    PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
-                      "DistributeFpnProposalsOp need 1 level of LoD");
-
-    auto fpn_rois_lod = fpn_rois->lod().back();
-    int lod_size = fpn_rois_lod.size() - 1;
-    int roi_num = fpn_rois_lod[lod_size];
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    // get batch id by lod in CPU
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({roi_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
-    for (int n = 0; n < lod_size; ++n) {
-      for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-    // copy batch id list to GPU
-    Tensor roi_batch_id_list_gpu;
-    framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(),
-                              &roi_batch_id_list_gpu);
-
-    Tensor sub_lod_list;
-    sub_lod_list.Resize({num_level, lod_size});
-    int* sub_lod_list_data = sub_lod_list.mutable_data<int>(dev_ctx.GetPlace());
-    math::SetConstant<platform::CUDADeviceContext, int> set_zero;
-    set_zero(dev_ctx, &sub_lod_list, static_cast<int>(0));
-
-    Tensor target_lvls;
-    target_lvls.Resize({roi_num});
-    int* target_lvls_data = target_lvls.mutable_data<int>(dev_ctx.GetPlace());
-
-    int dist_blocks = NumBlocks(roi_num);
-    int threads = kNumCUDAThreads;
-    // get target levels and sub_lod list
-    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads>>>(
-        roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
-        max_level, min_level, roi_batch_id_list_gpu.data<int>(),
-        sub_lod_list_data, target_lvls_data);
-    dev_ctx.Wait();
-    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
-
-    Tensor index_in_t;
-    int* idx_in = index_in_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
-    platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, roi_num);
-    for_range(RangeInitFunctor{0, 1, idx_in});
-
-    Tensor keys_out_t;
-    int* keys_out = keys_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
-    Tensor index_out_t;
-    int* idx_out = index_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
-
-    // Determine temporary device storage requirements
-    size_t temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
-                                              target_lvls_data, keys_out,
-                                              idx_in, idx_out, roi_num);
-    // Allocate temporary storage
-    auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
-
-    // Run sorting operation
-    // sort target level to get corresponding index
-    cub::DeviceRadixSort::SortPairs<int, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
-        idx_in, idx_out, roi_num);
-
-    int* restore_idx_data =
-        restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
-    // sort current index to get restore index
-    cub::DeviceRadixSort::SortPairs<int, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
-        restore_idx_data, roi_num);
-
-    int start = 0;
-    for (int i = 0; i < num_level; ++i) {
-      Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
-      int* sub_lod_data = sub_lod.data<int>();
-      // transfer length-based lod to offset-based lod
-      std::vector<size_t> offset(1, 0);
-      std::vector<int> sub_lod_cpu(lod_size);
-      memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place,
-                   sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
-      dev_ctx.Wait();
-      for (int j = 0; j < lod_size; ++j) {
-        offset.emplace_back(offset.back() + sub_lod_cpu[j]);
-      }
-
-      int sub_rois_num = offset.back();
-
-      int end = start + sub_rois_num;
-      if (end > start) {
-        Tensor sub_idx = index_out_t.Slice(start, end);
-        start = end;
-        multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
-                                           dev_ctx.GetPlace());
-        GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
-      } else {
-        multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
-                                           dev_ctx.GetPlace());
-      }
-      framework::LoD lod;
-      lod.emplace_back(offset);
-      multi_fpn_rois[i]->set_lod(lod);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    distribute_fpn_proposals,
-    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
-                                           float>,
-    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
-                                           double>);
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
deleted file mode 100644
index a3196ea5f6b357a552c40ba0b3ae2a975d12f46d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-const int kBoxDim = 4;
-
-template <typename T>
-static inline T BBoxArea(const T* box, bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
-    }
-  }
-}
-
-template <typename T>
-class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* fpn_rois = context.Input<paddle::framework::LoDTensor>("FpnRois");
-
-    auto multi_fpn_rois =
-        context.MultiOutput<paddle::framework::LoDTensor>("MultiFpnRois");
-
-    auto* restore_index =
-        context.Output<paddle::framework::Tensor>("RestoreIndex");
-
-    const int min_level = context.Attr<int>("min_level");
-    const int max_level = context.Attr<int>("max_level");
-    const int refer_level = context.Attr<int>("refer_level");
-    const int refer_scale = context.Attr<int>("refer_scale");
-    const int num_level = max_level - min_level + 1;
-
-    // check that the fpn_rois is not empty
-    PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
-                      "DistributeFpnProposalsOp need 1 level of LoD");
-
-    auto fpn_rois_lod = fpn_rois->lod().back();
-    int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
-    std::vector<int> target_level;
-    // std::vector<int> target_level(fpn_rois_num, -1);
-    // record the number of rois in each level
-    std::vector<int> num_rois_level(num_level, 0);
-    std::vector<int> num_rois_level_integral(num_level + 1, 0);
-    for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-      Tensor fpn_rois_slice =
-          fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
-      const T* rois_data = fpn_rois_slice.data<T>();
-      for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
-        // get the target level of current rois
-        T roi_scale = std::sqrt(BBoxArea(rois_data, false));
-        int tgt_lvl = std::floor(std::log2(roi_scale / refer_scale + (T)1e-6) +
-                                 refer_level);
-        tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
-        target_level.push_back(tgt_lvl);
-        num_rois_level[tgt_lvl - min_level]++;
-        rois_data += kBoxDim;
-      }
-    }
-    // define the output rois
-    // pointer which point to each level fpn rois
-    std::vector<T*> multi_fpn_rois_data(num_level);
-    // lod0 which will record the offset information of each level rois
-    std::vector<std::vector<size_t>> multi_fpn_rois_lod0;
-    for (int i = 0; i < num_level; ++i) {
-      // allocate memory for each level rois
-      multi_fpn_rois[i]->mutable_data<T>({num_rois_level[i], kBoxDim},
-                                         context.GetPlace());
-      multi_fpn_rois_data[i] = multi_fpn_rois[i]->data<T>();
-      std::vector<size_t> lod0(1, 0);
-      multi_fpn_rois_lod0.push_back(lod0);
-      // statistic start point for each level rois
-      num_rois_level_integral[i + 1] =
-          num_rois_level_integral[i] + num_rois_level[i];
-    }
-    restore_index->mutable_data<int>({fpn_rois_num, 1}, context.GetPlace());
-    int* restore_index_data = restore_index->data<int>();
-    std::vector<int> restore_index_inter(fpn_rois_num, -1);
-    // distribute the rois into different fpn level by target level
-    for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-      Tensor fpn_rois_slice =
-          fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
-      const T* rois_data = fpn_rois_slice.data<T>();
-      size_t cur_offset = fpn_rois_lod[i];
-      // std::vector<size_t > lod_offset[num_level];
-      for (int j = 0; j < num_level; j++) {
-        multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
-      }
-      for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
-        int lvl = target_level[cur_offset + j];
-        memcpy(multi_fpn_rois_data[lvl - min_level], rois_data,
-               kBoxDim * sizeof(T));
-        multi_fpn_rois_data[lvl - min_level] += kBoxDim;
-        int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
-                               multi_fpn_rois_lod0[lvl - min_level][i + 1];
-        restore_index_inter[index_in_shuffle] = cur_offset + j;
-        multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
-        rois_data += kBoxDim;
-      }
-    }
-    for (int i = 0; i < fpn_rois_num; ++i) {
-      restore_index_data[restore_index_inter[i]] = i;
-    }
-    // merge lod information into LoDTensor
-    for (int i = 0; i < num_level; ++i) {
-      framework::LoD lod;
-      lod.emplace_back(multi_fpn_rois_lod0[i]);
-      multi_fpn_rois[i]->set_lod(lod);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
deleted file mode 100644
index 0d77c7f3a79fc491dfdc54d74c7cfebd85a5992e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc
+++ /dev/null
@@ -1,441 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <math.h>
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detection/bbox_util.h"
-#include "paddle/fluid/operators/detection/mask_util.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-const int kBoxDim = 4;
-
-template <typename T>
-void AppendMask(LoDTensor* out, int64_t offset, Tensor* to_add) {
-  auto* out_data = out->data<T>();
-  auto* to_add_data = to_add->data<T>();
-  memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T));
-}
-
-class GenerateMaskLabelsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("GtClasses"),
-                   "Input(GtClasses) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
-                   "Input(IsCrowd) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("GtSegms"),
-                   "Input(GtSegms) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Rois"), "Input(Rois) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LabelsInt32"),
-                   "Input(LabelsInt32) shouldn't be null.");
-
-    PADDLE_ENFORCE(
-        ctx->HasOutput("MaskRois"),
-        "Output(MaskRois) of GenerateMaskLabelsOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("RoiHasMaskInt32"),
-        "Output(RoiHasMaskInt32) of GenerateMaskLabelsOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("MaskInt32"),
-        "Output(MaskInt32) of GenerateMaskLabelsOp should not be null");
-
-    auto im_info_dims = ctx->GetInputDim("ImInfo");
-    auto gt_segms_dims = ctx->GetInputDim("GtSegms");
-    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
-                      "The rank of Input(ImInfo) must be 2.");
-    PADDLE_ENFORCE_EQ(gt_segms_dims.size(), 2,
-                      "The rank of Input(GtSegms) must be 2.");
-    PADDLE_ENFORCE_EQ(gt_segms_dims[1], 2,
-                      "The second dim of Input(GtSegms) must be 2.");
-    int num_classes = ctx->Attrs().Get<int>("num_classes");
-    int resolution = ctx->Attrs().Get<int>("resolution");
-
-    ctx->SetOutputDim("MaskRois", {-1, 4});
-    ctx->SetOutputDim("RoiHasMaskInt32", {-1, 1});
-    ctx->SetOutputDim("MaskInt32", {-1, num_classes * resolution * resolution});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Rois"));
-    return framework::OpKernelType(data_type, platform::CPUPlace());
-  }
-};
-
-/*
- * Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2)
- * to encode class specific mask targets.
- */
-template <typename T>
-static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx,
-                                    const Tensor& masks,
-                                    const Tensor& mask_class_labels,
-                                    const int resolution, const int num_classes,
-                                    Tensor* mask_targets) {
-  const uint8_t* masks_data = masks.data<uint8_t>();
-  int64_t num_mask = masks.dims()[0];
-  const int* mask_class_labels_data = mask_class_labels.data<int>();
-  const int M = resolution * resolution;
-  const int mask_dim = M * num_classes;
-
-  int* mask_targets_data =
-      mask_targets->mutable_data<int>({num_mask, mask_dim}, ctx.GetPlace());
-  math::set_constant(ctx, mask_targets, -1);
-  for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) {
-    int cls = mask_class_labels_data[mask_id];
-    int start = M * cls;
-    if (cls > 0) {
-      for (int i = 0; i < M; ++i) {
-        mask_targets_data[mask_id * mask_dim + start + i] =
-            static_cast<int>(masks_data[mask_id * M + i]);
-      }
-    }
-  }
-}
-
-template <typename T>
-std::vector<Tensor> SampleMaskForOneImage(
-    const platform::CPUDeviceContext& ctx, const Tensor& im_info,
-    const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_segms,
-    const Tensor& rois, const Tensor& label_int32, const int num_classes,
-    const int resolution, const framework::LoD& segm_length) {
-  // Prepare the mask targets by associating one gt mask to each training roi
-  // that has a fg (non-bg) class label.
-  const int64_t gt_size = static_cast<int64_t>(gt_classes.dims()[0]);
-  const int64_t roi_size = static_cast<int64_t>(rois.dims()[0]);
-  const int* gt_classes_data = gt_classes.data<int>();
-  const int* is_crowd_data = is_crowd.data<int>();
-  const int* label_int32_data = label_int32.data<int>();
-  PADDLE_ENFORCE_EQ(roi_size, label_int32.dims()[0]);
-
-  std::vector<int> mask_gt_inds, fg_inds;
-  std::vector<std::vector<std::vector<T>>> gt_polys;
-
-  auto polys_num = segm_length[1];
-  auto segm_lod_offset = framework::ConvertToOffsetBasedLoD(segm_length);
-  auto lod1 = segm_lod_offset[1];
-  auto lod2 = segm_lod_offset[2];
-  const T* polys_data = gt_segms.data<T>();
-  for (int64_t i = 0; i < gt_size; ++i) {
-    if ((gt_classes_data[i] > 0) && (is_crowd_data[i] == 0)) {
-      mask_gt_inds.emplace_back(i);
-
-      // slice fg segmentation polys
-      int poly_num = polys_num[i];
-      std::vector<std::vector<T>> polys;
-      int s_idx = lod1[i];
-      for (int j = 0; j < poly_num; ++j) {
-        int s = lod2[s_idx + j];
-        int e = lod2[s_idx + j + 1];
-        PADDLE_ENFORCE_NE(s, e);
-        std::vector<T> plts(polys_data + s * 2, polys_data + e * 2);
-        polys.push_back(plts);
-      }
-      gt_polys.push_back(polys);
-    }
-  }
-  for (int64_t i = 0; i < roi_size; ++i) {
-    if (label_int32_data[i] > 0) {
-      fg_inds.emplace_back(i);
-    }
-  }
-  int gt_num = mask_gt_inds.size();
-  int fg_num = fg_inds.size();
-
-  Tensor boxes_from_polys;
-  boxes_from_polys.mutable_data<T>({gt_num, 4}, platform::CPUPlace());
-  Poly2Boxes(gt_polys, boxes_from_polys.data<T>());
-
-  std::vector<int> roi_has_mask =
-      std::vector<int>(fg_inds.begin(), fg_inds.end());
-  Tensor mask_class_labels;
-  Tensor masks;
-  Tensor rois_fg;
-
-  auto im_scale = im_info.data<T>()[2];
-  if (fg_num > 0) {
-    // Class labels for the foreground rois
-    mask_class_labels.mutable_data<int>({fg_num, 1}, ctx.GetPlace());
-    Gather<int>(label_int32_data, 1, fg_inds.data(), fg_inds.size(),
-                mask_class_labels.data<int>());
-
-    uint8_t* masks_data = masks.mutable_data<uint8_t>(
-        {fg_num, resolution * resolution}, ctx.GetPlace());
-
-    // Find overlap between all foreground rois and the bounding boxes
-    // enclosing each segmentation
-    T* rois_fg_data = rois_fg.mutable_data<T>({fg_num, 4}, ctx.GetPlace());
-    Gather<T>(rois.data<T>(), 4, fg_inds.data(), fg_inds.size(),
-              rois_fg.data<T>());
-
-    for (int k = 0; k < rois_fg.numel(); ++k) {
-      rois_fg_data[k] = rois_fg_data[k] / im_scale;
-    }
-
-    Tensor overlaps_bbfg_bbpolys;
-    overlaps_bbfg_bbpolys.mutable_data<T>({fg_num, gt_num}, ctx.GetPlace());
-    BboxOverlaps<T>(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys);
-
-    // Map from each fg rois to the index of the mask with highest overlap
-    // (measured by bbox overlap)
-    T* overlaps_bbfg_bbpolys_data = overlaps_bbfg_bbpolys.data<T>();
-    std::vector<int> fg_masks_inds;
-    for (int64_t i = 0; i < fg_num; ++i) {
-      const T* v = overlaps_bbfg_bbpolys_data + i * gt_num;
-      T max_overlap = std::numeric_limits<T>::min();
-      int id = 0;
-      for (int64_t j = 0; j < gt_num; ++j) {
-        if (v[j] > max_overlap) {
-          max_overlap = v[j];
-          id = j;
-        }
-      }
-      fg_masks_inds.push_back(id);
-    }
-
-    // add fg targets
-    for (int64_t i = 0; i < fg_num; ++i) {
-      int fg_polys_ind = fg_masks_inds[i];
-      T* roi_fg = rois_fg_data + i * 4;
-      uint8_t* mask = masks_data + i * resolution * resolution;
-      Polys2MaskWrtBox(gt_polys[fg_polys_ind], roi_fg, resolution, mask);
-    }
-  } else {
-    // The network cannot handle empty blobs, so we must provide a mask
-    // We simply take the first bg roi, given it an all -1's mask (ignore
-    // label), and label it with class zero (bg).
-    int bg_num = 1;
-    T* rois_fg_data = rois_fg.mutable_data<T>({bg_num, 4}, ctx.GetPlace());
-    const T* rois_data = rois.data<T>();
-    std::vector<int> bg_inds;
-    for (int64_t i = 0; i < roi_size; ++i) {
-      if (label_int32_data[i] == 0) {
-        bg_inds.emplace_back(i);
-        rois_fg_data[0] = rois_data[0] / im_scale;
-        rois_fg_data[1] = rois_data[1] / im_scale;
-        rois_fg_data[2] = rois_data[2] / im_scale;
-        rois_fg_data[3] = rois_data[3] / im_scale;
-        break;
-      }
-    }
-    masks.mutable_data<uint8_t>({bg_num, resolution * resolution},
-                                ctx.GetPlace());
-    math::set_constant(ctx, &masks, -1);
-    int* mask_class_labels_data =
-        mask_class_labels.mutable_data<int>({bg_num, 1}, ctx.GetPlace());
-    mask_class_labels_data[0] = 0;
-    roi_has_mask = std::vector<int>(bg_inds.begin(), bg_inds.end());
-  }
-
-  Tensor masks_expand;
-  ExpandMaskTarget<T>(ctx, masks, mask_class_labels, resolution, num_classes,
-                      &masks_expand);
-
-  T* rois_fg_data = rois_fg.data<T>();
-  for (int k = 0; k < rois_fg.numel(); ++k) {
-    rois_fg_data[k] = rois_fg_data[k] * im_scale;
-  }
-
-  Tensor roi_has_mask_t;
-  int roi_has_mask_size = roi_has_mask.size();
-  int* roi_has_mask_data =
-      roi_has_mask_t.mutable_data<int>({roi_has_mask_size, 1}, ctx.GetPlace());
-  std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data);
-
-  std::vector<Tensor> res;
-  res.emplace_back(rois_fg);
-  res.emplace_back(roi_has_mask_t);
-  res.emplace_back(masks_expand);
-  return res;
-}
-
-template <typename T>
-class GenerateMaskLabelsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* im_info = ctx.Input<LoDTensor>("ImInfo");
-    auto* gt_classes = ctx.Input<LoDTensor>("GtClasses");
-    auto* is_crowd = ctx.Input<LoDTensor>("IsCrowd");
-    auto* gt_segms = ctx.Input<LoDTensor>("GtSegms");
-    auto* rois = ctx.Input<LoDTensor>("Rois");
-    auto* label_int32 = ctx.Input<LoDTensor>("LabelsInt32");
-
-    auto* mask_rois = ctx.Output<LoDTensor>("MaskRois");
-    auto* roi_has_mask_int32 = ctx.Output<LoDTensor>("RoiHasMaskInt32");
-    auto* mask_int32 = ctx.Output<LoDTensor>("MaskInt32");
-
-    int num_classes = ctx.Attr<int>("num_classes");
-    int resolution = ctx.Attr<int>("resolution");
-
-    PADDLE_ENFORCE_EQ(gt_classes->lod().size(), 1UL,
-                      "GenerateMaskLabelsOp gt_classes needs 1 level of LoD");
-    PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
-                      "GenerateMaskLabelsOp is_crowd needs 1 level of LoD");
-    PADDLE_ENFORCE_EQ(rois->lod().size(), 1UL,
-                      "GenerateMaskLabelsOp rois needs 1 level of LoD");
-    PADDLE_ENFORCE_EQ(label_int32->lod().size(), 1UL,
-                      "GenerateMaskLabelsOp label_int32 needs 1 level of LoD");
-
-    PADDLE_ENFORCE_EQ(gt_segms->lod().size(), 3UL);
-
-    int64_t n = static_cast<int64_t>(gt_classes->lod().back().size() - 1);
-    PADDLE_ENFORCE_EQ(gt_segms->lod()[0].size() - 1, n);
-
-    int mask_dim = num_classes * resolution * resolution;
-    int roi_num = rois->lod().back()[n];
-    mask_rois->mutable_data<T>({roi_num, kBoxDim}, ctx.GetPlace());
-    roi_has_mask_int32->mutable_data<int>({roi_num, 1}, ctx.GetPlace());
-    mask_int32->mutable_data<int>({roi_num, mask_dim}, ctx.GetPlace());
-
-    framework::LoD lod;
-    std::vector<size_t> lod0(1, 0);
-
-    int64_t num_mask = 0;
-    auto& dev_ctx = ctx.device_context<platform::CPUDeviceContext>();
-
-    auto gt_classes_lod = gt_classes->lod().back();
-    auto is_crowd_lod = is_crowd->lod().back();
-    auto rois_lod = rois->lod().back();
-    auto label_int32_lod = label_int32->lod().back();
-    auto gt_segms_lod = gt_segms->lod();
-
-    for (int i = 0; i < n; ++i) {
-      if (rois_lod[i] == rois_lod[i + 1]) {
-        lod0.emplace_back(num_mask);
-        continue;
-      }
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor gt_classes_slice =
-          gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
-      Tensor is_crowd_slice =
-          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
-      Tensor label_int32_slice =
-          label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]);
-      Tensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]);
-
-      auto sub_lod_and_offset =
-          framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0);
-      auto lod_length = sub_lod_and_offset.first;
-      size_t s = sub_lod_and_offset.second.first;
-      size_t e = sub_lod_and_offset.second.second;
-      Tensor gt_segms_slice = gt_segms->Slice(s, e);
-
-      std::vector<Tensor> tensor_output = SampleMaskForOneImage<T>(
-          dev_ctx, im_info_slice, gt_classes_slice, is_crowd_slice,
-          gt_segms_slice, rois_slice, label_int32_slice, num_classes,
-          resolution, lod_length);
-
-      Tensor sampled_mask_rois = tensor_output[0];
-      Tensor sampled_roi_has_mask_int32 = tensor_output[1];
-      Tensor sampled_mask_int32 = tensor_output[2];
-
-      AppendMask<T>(mask_rois, kBoxDim * num_mask, &sampled_mask_rois);
-      AppendMask<int>(roi_has_mask_int32, num_mask,
-                      &sampled_roi_has_mask_int32);
-      AppendMask<int>(mask_int32, mask_dim * num_mask, &sampled_mask_int32);
-
-      num_mask += sampled_mask_rois.dims()[0];
-      lod0.emplace_back(num_mask);
-    }
-
-    lod.emplace_back(lod0);
-    mask_rois->set_lod(lod);
-    roi_has_mask_int32->set_lod(lod);
-    mask_int32->set_lod(lod);
-    mask_rois->Resize({num_mask, kBoxDim});
-    roi_has_mask_int32->Resize({num_mask, 1});
-    mask_int32->Resize({num_mask, mask_dim});
-  }
-};
-
-class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("ImInfo",
-             "(Tensor), This input is a 2D Tensor with shape [B, 3]. "
-             "B is the number of input images, "
-             "each element consists of im_height, im_width, im_scale.");
-    AddInput("GtClasses",
-             "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
-             "M is the number of groundtruth, "
-             "each element is a class label of groundtruth.");
-    AddInput(
-        "IsCrowd",
-        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
-        "M is the number of groundtruth, "
-        "each element is a flag indicates whether a groundtruth is crowd.");
-    AddInput(
-        "GtSegms",
-        "(LoDTensor), This input is a 2D LoDTensor with shape [S, 2], it's LoD "
-        "level is 3. The LoD[0] represents the gt objects number of each "
-        "instance. LoD[1] represents the segmentation counts of each objects. "
-        "LoD[2] represents the polygons number of each segmentation. S the "
-        "total number of polygons coordinate points. Each element is (x, y) "
-        "coordinate points.");
-    AddInput(
-        "Rois",
-        "(LoDTensor), This input is a 2D LoDTensor with shape [R, 4]. "
-        "R is the number of rois which is the output of "
-        "generate_proposal_labels, "
-        "each element is a bounding box with (xmin, ymin, xmax, ymax) format.");
-    AddInput("LabelsInt32",
-             "(LoDTensor), This intput is a 2D LoDTensor with shape [R, 1], "
-             "each element repersents a class label of a roi");
-    AddOutput(
-        "MaskRois",
-        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. "
-        "P is the number of mask, "
-        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
-    AddOutput("RoiHasMaskInt32",
-              "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], "
-              "each element repersents the output mask rois index with regard "
-              "to input rois");
-    AddOutput("MaskInt32",
-              "(LoDTensor), This output is a 4D LoDTensor with shape [P, Q], "
-              "Q equal to num_classes * resolution * resolution");
-
-    AddAttr<int>("num_classes", "Class number.");
-    AddAttr<int>("resolution", "Resolution of mask.");
-
-    AddComment(R"DOC(
-This operator can be, for given the RoIs and corresponding labels,
-to sample foreground RoIs. This mask branch also has
-a :math: `K \\times M^{2}` dimensional output targets for each foreground
-RoI, which encodes K binary masks of resolution M x M, one for each of the
-K classes. This mask targets are used to compute loss of mask branch.
-    )DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(generate_mask_labels, ops::GenerateMaskLabelsOp,
-                  ops::GenerateMaskLabelsOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(generate_mask_labels,
-                       ops::GenerateMaskLabelsKernel<float>);
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
deleted file mode 100644
index 451e0ca85501bccd2588dd58d0c8efe7142559d9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ /dev/null
@@ -1,591 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <math.h>
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detection/bbox_util.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-const int kBoxDim = 4;
-
-template <typename T>
-void AppendRois(LoDTensor* out, int64_t offset, Tensor* to_add) {
-  auto* out_data = out->data<T>();
-  auto* to_add_data = to_add->data<T>();
-  memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T));
-}
-
-class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("RpnRois"),
-                   "Input(RpnRois) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("GtClasses"),
-                   "Input(GtClasses) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
-                   "Input(IsCrowd) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
-                   "Input(GtBoxes) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
-
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Rois"),
-        "Output(Rois) of GenerateProposalLabelsOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("LabelsInt32"),
-        "Output(LabelsInt32) of GenerateProposalLabelsOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("BboxTargets"),
-        "Output(BboxTargets) of GenerateProposalLabelsOp should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("BboxInsideWeights"),
-                   "Output(BboxInsideWeights) of GenerateProposalLabelsOp "
-                   "should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("BboxOutsideWeights"),
-                   "Output(BboxOutsideWeights) of GenerateProposalLabelsOp "
-                   "should not be null");
-
-    auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
-    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
-    auto im_info_dims = ctx->GetInputDim("ImInfo");
-
-    PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2,
-                      "The rank of Input(RpnRois) must be 2.");
-    PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
-                      "The rank of Input(GtBoxes) must be 2.");
-    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
-                      "The rank of Input(ImInfo) must be 2.");
-
-    int class_nums = ctx->Attrs().Get<int>("class_nums");
-
-    ctx->SetOutputDim("Rois", {-1, 4});
-    ctx->SetOutputDim("LabelsInt32", {-1, 1});
-    ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums});
-    ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums});
-    ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("RpnRois"));
-    return framework::OpKernelType(data_type, platform::CPUPlace());
-  }
-};
-
-template <typename T>
-void Concat(const platform::CPUDeviceContext& context,
-            const Tensor& in_tensor_a, const Tensor& in_tensor_b,
-            Tensor* out_tensor) {
-  int axis = 0;
-  std::vector<Tensor> inputs;
-  inputs.emplace_back(in_tensor_a);
-  inputs.emplace_back(in_tensor_b);
-  math::ConcatFunctor<platform::CPUDeviceContext, T> concat_functor;
-  concat_functor(context, inputs, axis, out_tensor);
-}
-
-template <typename T>
-std::vector<std::vector<int>> SampleFgBgGt(
-    const platform::CPUDeviceContext& context, Tensor* iou,
-    const Tensor& is_crowd, const int batch_size_per_im,
-    const float fg_fraction, const float fg_thresh, const float bg_thresh_hi,
-    const float bg_thresh_lo, std::minstd_rand engine, const bool use_random,
-    const bool is_cascade_rcnn, const Tensor& rpn_rois) {
-  std::vector<int> fg_inds;
-  std::vector<int> bg_inds;
-  std::vector<int> mapped_gt_inds;
-  int64_t gt_num = is_crowd.numel();
-  const int* crowd_data = is_crowd.data<int>();
-  T* proposal_to_gt_overlaps = iou->data<T>();
-  int64_t row = iou->dims()[0];
-  int64_t col = iou->dims()[1];
-  float epsilon = 0.00001;
-  const T* rpn_rois_dt = rpn_rois.data<T>();
-  // Follow the Faster RCNN's implementation
-  for (int64_t i = 0; i < row; ++i) {
-    const T* v = proposal_to_gt_overlaps + i * col;
-    T max_overlap = *std::max_element(v, v + col);
-    if ((i < gt_num) && (crowd_data[i])) {
-      max_overlap = -1.0;
-    }
-    if (is_cascade_rcnn &&
-        ((rpn_rois_dt[i * 4 + 2] - rpn_rois_dt[i * 4 + 0] + 1) <= 0 ||
-         (rpn_rois_dt[i * 4 + 3] - rpn_rois_dt[i * 4 + 1] + 1) <= 0)) {
-      continue;
-    }
-    if (max_overlap >= fg_thresh) {
-      // fg mapped gt label index
-      for (int64_t j = 0; j < col; ++j) {
-        T val = proposal_to_gt_overlaps[i * col + j];
-        auto diff = std::abs(max_overlap - val);
-        if (diff < epsilon) {
-          fg_inds.emplace_back(i);
-          mapped_gt_inds.emplace_back(j);
-          break;
-        }
-      }
-    } else if ((max_overlap >= bg_thresh_lo) && (max_overlap < bg_thresh_hi)) {
-      bg_inds.emplace_back(i);
-    } else {
-      continue;
-    }
-  }
-
-  std::vector<std::vector<int>> res;
-  if (is_cascade_rcnn) {
-    res.emplace_back(fg_inds);
-    res.emplace_back(bg_inds);
-    res.emplace_back(mapped_gt_inds);
-  } else {
-    // Reservoir Sampling
-    // sampling fg
-    std::uniform_real_distribution<float> uniform(0, 1);
-    int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction);
-    int fg_rois_this_image = fg_inds.size();
-    int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image);
-    if (use_random) {
-      const int64_t fg_size = static_cast<int64_t>(fg_inds.size());
-      if (fg_size > fg_rois_per_this_image) {
-        for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) {
-          int rng_ind = std::floor(uniform(engine) * i);
-          if (rng_ind < fg_rois_per_this_image) {
-            std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i);
-            std::iter_swap(mapped_gt_inds.begin() + rng_ind,
-                           mapped_gt_inds.begin() + i);
-          }
-        }
-      }
-    }
-    std::vector<int> new_fg_inds(fg_inds.begin(),
-                                 fg_inds.begin() + fg_rois_per_this_image);
-    std::vector<int> new_gt_inds(
-        mapped_gt_inds.begin(),
-        mapped_gt_inds.begin() + fg_rois_per_this_image);
-    // sampling bg
-    int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image;
-    int bg_rois_this_image = bg_inds.size();
-    int bg_rois_per_this_image =
-        std::min(bg_rois_per_image, bg_rois_this_image);
-    if (use_random) {
-      const int64_t bg_size = static_cast<int64_t>(bg_inds.size());
-      if (bg_size > bg_rois_per_this_image) {
-        for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) {
-          int rng_ind = std::floor(uniform(engine) * i);
-          if (rng_ind < fg_rois_per_this_image)
-            std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i);
-        }
-      }
-    }
-    std::vector<int> new_bg_inds(bg_inds.begin(),
-                                 bg_inds.begin() + bg_rois_per_this_image);
-    //
-    res.emplace_back(new_fg_inds);
-    res.emplace_back(new_bg_inds);
-    res.emplace_back(new_gt_inds);
-  }
-
-  return res;
-}
-
-template <typename T>
-void GatherBoxesLabels(const platform::CPUDeviceContext& context,
-                       const Tensor& boxes, const Tensor& gt_boxes,
-                       const Tensor& gt_classes,
-                       const std::vector<int>& fg_inds,
-                       const std::vector<int>& bg_inds,
-                       const std::vector<int>& gt_inds, Tensor* sampled_boxes,
-                       Tensor* sampled_labels, Tensor* sampled_gts) {
-  int fg_num = fg_inds.size();
-  int bg_num = bg_inds.size();
-  Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t;
-  int* fg_inds_data = fg_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
-  int* bg_inds_data = bg_inds_t.mutable_data<int>({bg_num}, context.GetPlace());
-  int* gt_box_inds_data =
-      gt_box_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
-  int* gt_label_inds_data =
-      gt_label_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
-  std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data);
-  std::copy(bg_inds.begin(), bg_inds.end(), bg_inds_data);
-  std::copy(gt_inds.begin(), gt_inds.end(), gt_box_inds_data);
-  std::copy(gt_inds.begin(), gt_inds.end(), gt_label_inds_data);
-
-  Tensor fg_boxes, bg_boxes, fg_labels, bg_labels;
-  fg_boxes.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
-  CPUGather<T>(context, boxes, fg_inds_t, &fg_boxes);
-  bg_boxes.mutable_data<T>({bg_num, kBoxDim}, context.GetPlace());
-  CPUGather<T>(context, boxes, bg_inds_t, &bg_boxes);
-  Concat<T>(context, fg_boxes, bg_boxes, sampled_boxes);
-  CPUGather<T>(context, gt_boxes, gt_box_inds_t, sampled_gts);
-  fg_labels.mutable_data<int>({fg_num}, context.GetPlace());
-  CPUGather<int>(context, gt_classes, gt_label_inds_t, &fg_labels);
-  bg_labels.mutable_data<int>({bg_num}, context.GetPlace());
-  math::set_constant(context, &bg_labels, 0);
-  Concat<int>(context, fg_labels, bg_labels, sampled_labels);
-}
-
-template <typename T>
-std::vector<Tensor> SampleRoisForOneImage(
-    const platform::CPUDeviceContext& context, const Tensor& rpn_rois_in,
-    const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_boxes,
-    const Tensor& im_info, const int batch_size_per_im, const float fg_fraction,
-    const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo,
-    const std::vector<float>& bbox_reg_weights, const int class_nums,
-    std::minstd_rand engine, bool use_random, bool is_cascade_rcnn,
-    bool is_cls_agnostic) {
-  // 1.1 map to original image
-  auto im_scale = im_info.data<T>()[2];
-  Tensor rpn_rois_slice;
-  Tensor rpn_rois;
-
-  if (is_cascade_rcnn) {
-    // slice rpn_rois from gt_box_num refer to detectron
-    rpn_rois_slice =
-        rpn_rois_in.Slice(gt_boxes.dims()[0], rpn_rois_in.dims()[0]);
-    rpn_rois.mutable_data<T>(rpn_rois_slice.dims(), context.GetPlace());
-    const T* rpn_rois_in_dt = rpn_rois_slice.data<T>();
-    T* rpn_rois_dt = rpn_rois.data<T>();
-    for (int i = 0; i < rpn_rois.numel(); ++i) {
-      rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale;
-    }
-  } else {
-    rpn_rois.mutable_data<T>(rpn_rois_in.dims(), context.GetPlace());
-    const T* rpn_rois_in_dt = rpn_rois_in.data<T>();
-    T* rpn_rois_dt = rpn_rois.data<T>();
-    for (int i = 0; i < rpn_rois.numel(); ++i) {
-      rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale;
-    }
-  }
-
-  // 1.2 compute overlaps
-  int proposals_num = gt_boxes.dims()[0] + rpn_rois.dims()[0];
-  Tensor boxes;
-  boxes.mutable_data<T>({proposals_num, kBoxDim}, context.GetPlace());
-  Concat<T>(context, gt_boxes, rpn_rois, &boxes);
-  Tensor proposal_to_gt_overlaps;
-  proposal_to_gt_overlaps.mutable_data<T>({proposals_num, gt_boxes.dims()[0]},
-                                          context.GetPlace());
-  BboxOverlaps<T>(boxes, gt_boxes, &proposal_to_gt_overlaps);
-
-  // Generate proposal index
-  std::vector<std::vector<int>> fg_bg_gt =
-      SampleFgBgGt<T>(context, &proposal_to_gt_overlaps, is_crowd,
-                      batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
-                      bg_thresh_lo, engine, use_random, is_cascade_rcnn, boxes);
-  std::vector<int> fg_inds = fg_bg_gt[0];
-  std::vector<int> bg_inds = fg_bg_gt[1];
-  std::vector<int> mapped_gt_inds = fg_bg_gt[2];  // mapped_gt_labels
-
-  // Gather boxes and labels
-  Tensor sampled_boxes, sampled_labels, sampled_gts;
-  int fg_num = fg_inds.size();
-  int bg_num = bg_inds.size();
-  int boxes_num = fg_num + bg_num;
-  framework::DDim bbox_dim({boxes_num, kBoxDim});
-  sampled_boxes.mutable_data<T>(bbox_dim, context.GetPlace());
-  sampled_labels.mutable_data<int>({boxes_num}, context.GetPlace());
-  sampled_gts.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
-  GatherBoxesLabels<T>(context, boxes, gt_boxes, gt_classes, fg_inds, bg_inds,
-                       mapped_gt_inds, &sampled_boxes, &sampled_labels,
-                       &sampled_gts);
-
-  // Compute targets
-  Tensor bbox_targets_single;
-  bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
-  BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(),
-                false, &bbox_targets_single);
-
-  // Scale rois
-  Tensor sampled_rois;
-  sampled_rois.mutable_data<T>(sampled_boxes.dims(), context.GetPlace());
-  auto sampled_rois_et = framework::EigenTensor<T, 2>::From(sampled_rois);
-  auto sampled_boxes_et = framework::EigenTensor<T, 2>::From(sampled_boxes);
-  sampled_rois_et = sampled_boxes_et * im_scale;
-
-  // Expand box targets
-  Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights;
-  framework::DDim bbox_expand_dim({boxes_num, kBoxDim * class_nums});
-  bbox_targets.mutable_data<T>(bbox_expand_dim, context.GetPlace());
-  bbox_inside_weights.mutable_data<T>(bbox_expand_dim, context.GetPlace());
-  bbox_outside_weights.mutable_data<T>(bbox_expand_dim, context.GetPlace());
-  math::set_constant(context, &bbox_targets, 0.0);
-  math::set_constant(context, &bbox_inside_weights, 0.0);
-  math::set_constant(context, &bbox_outside_weights, 0.0);
-
-  auto* bbox_targets_single_data = bbox_targets_single.data<T>();
-  auto* sampled_labels_data = sampled_labels.data<int>();
-  auto* bbox_targets_data = bbox_targets.data<T>();
-  auto* bbox_inside_weights_data = bbox_inside_weights.data<T>();
-  auto* bbox_outside_weights_data = bbox_outside_weights.data<T>();
-  int width = kBoxDim * class_nums;
-  for (int64_t i = 0; i < boxes_num; ++i) {
-    int label = sampled_labels_data[i];
-    if (label > 0) {
-      if (is_cls_agnostic) {
-        label = 1;
-      }
-      int dst_idx = i * width + kBoxDim * label;
-      int src_idx = kBoxDim * i;
-      bbox_targets_data[dst_idx] = bbox_targets_single_data[src_idx];
-      bbox_targets_data[dst_idx + 1] = bbox_targets_single_data[src_idx + 1];
-      bbox_targets_data[dst_idx + 2] = bbox_targets_single_data[src_idx + 2];
-      bbox_targets_data[dst_idx + 3] = bbox_targets_single_data[src_idx + 3];
-      bbox_inside_weights_data[dst_idx] = 1;
-      bbox_inside_weights_data[dst_idx + 1] = 1;
-      bbox_inside_weights_data[dst_idx + 2] = 1;
-      bbox_inside_weights_data[dst_idx + 3] = 1;
-      bbox_outside_weights_data[dst_idx] = 1;
-      bbox_outside_weights_data[dst_idx + 1] = 1;
-      bbox_outside_weights_data[dst_idx + 2] = 1;
-      bbox_outside_weights_data[dst_idx + 3] = 1;
-    }
-  }
-  std::vector<Tensor> res;
-  res.emplace_back(sampled_rois);
-  res.emplace_back(sampled_labels);
-  res.emplace_back(bbox_targets);
-  res.emplace_back(bbox_inside_weights);
-  res.emplace_back(bbox_outside_weights);
-  return res;
-}
-
-template <typename T>
-class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* rpn_rois = context.Input<LoDTensor>("RpnRois");
-    auto* gt_classes = context.Input<LoDTensor>("GtClasses");
-    auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
-    auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
-    auto* im_info = context.Input<LoDTensor>("ImInfo");
-
-    auto* rois = context.Output<LoDTensor>("Rois");
-    auto* labels_int32 = context.Output<LoDTensor>("LabelsInt32");
-    auto* bbox_targets = context.Output<LoDTensor>("BboxTargets");
-    auto* bbox_inside_weights = context.Output<LoDTensor>("BboxInsideWeights");
-    auto* bbox_outside_weights =
-        context.Output<LoDTensor>("BboxOutsideWeights");
-
-    int batch_size_per_im = context.Attr<int>("batch_size_per_im");
-    float fg_fraction = context.Attr<float>("fg_fraction");
-    float fg_thresh = context.Attr<float>("fg_thresh");
-    float bg_thresh_hi = context.Attr<float>("bg_thresh_hi");
-    float bg_thresh_lo = context.Attr<float>("bg_thresh_lo");
-    std::vector<float> bbox_reg_weights =
-        context.Attr<std::vector<float>>("bbox_reg_weights");
-    int class_nums = context.Attr<int>("class_nums");
-    bool use_random = context.Attr<bool>("use_random");
-    bool is_cascade_rcnn = context.Attr<bool>("is_cascade_rcnn");
-    bool is_cls_agnostic = context.Attr<bool>("is_cls_agnostic");
-    PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL,
-                      "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD");
-    PADDLE_ENFORCE_EQ(
-        gt_classes->lod().size(), 1UL,
-        "GenerateProposalLabelsOp gt_classes needs 1 level of LoD");
-    PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
-                      "GenerateProposalLabelsOp is_crowd needs 1 level of LoD");
-    PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
-                      "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD");
-    int64_t n = static_cast<int64_t>(rpn_rois->lod().back().size() - 1);
-
-    rois->mutable_data<T>({n * batch_size_per_im, kBoxDim}, context.GetPlace());
-    labels_int32->mutable_data<int>({n * batch_size_per_im, 1},
-                                    context.GetPlace());
-    bbox_targets->mutable_data<T>({n * batch_size_per_im, kBoxDim * class_nums},
-                                  context.GetPlace());
-    bbox_inside_weights->mutable_data<T>(
-        {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace());
-    bbox_outside_weights->mutable_data<T>(
-        {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace());
-
-    std::random_device rnd;
-    std::minstd_rand engine;
-    int seed = rnd();
-    engine.seed(seed);
-
-    framework::LoD lod;
-    std::vector<size_t> lod0(1, 0);
-
-    int64_t num_rois = 0;
-    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
-
-    auto rpn_rois_lod = rpn_rois->lod().back();
-    auto gt_classes_lod = gt_classes->lod().back();
-    auto is_crowd_lod = is_crowd->lod().back();
-    auto gt_boxes_lod = gt_boxes->lod().back();
-    for (int i = 0; i < n; ++i) {
-      if (rpn_rois_lod[i] == rpn_rois_lod[i + 1]) {
-        lod0.emplace_back(num_rois);
-        continue;
-      }
-      Tensor rpn_rois_slice =
-          rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
-      Tensor gt_classes_slice =
-          gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
-      Tensor is_crowd_slice =
-          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
-      Tensor gt_boxes_slice =
-          gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      std::vector<Tensor> tensor_output = SampleRoisForOneImage<T>(
-          dev_ctx, rpn_rois_slice, gt_classes_slice, is_crowd_slice,
-          gt_boxes_slice, im_info_slice, batch_size_per_im, fg_fraction,
-          fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums,
-          engine, use_random, is_cascade_rcnn, is_cls_agnostic);
-      Tensor sampled_rois = tensor_output[0];
-      Tensor sampled_labels_int32 = tensor_output[1];
-      Tensor sampled_bbox_targets = tensor_output[2];
-      Tensor sampled_bbox_inside_weights = tensor_output[3];
-      Tensor sampled_bbox_outside_weights = tensor_output[4];
-
-      AppendRois<T>(rois, kBoxDim * num_rois, &sampled_rois);
-      AppendRois<int>(labels_int32, num_rois, &sampled_labels_int32);
-      AppendRois<T>(bbox_targets, kBoxDim * num_rois * class_nums,
-                    &sampled_bbox_targets);
-      AppendRois<T>(bbox_inside_weights, kBoxDim * num_rois * class_nums,
-                    &sampled_bbox_inside_weights);
-      AppendRois<T>(bbox_outside_weights, kBoxDim * num_rois * class_nums,
-                    &sampled_bbox_outside_weights);
-
-      num_rois += sampled_rois.dims()[0];
-      lod0.emplace_back(num_rois);
-    }
-
-    lod.emplace_back(lod0);
-    rois->set_lod(lod);
-    labels_int32->set_lod(lod);
-    bbox_targets->set_lod(lod);
-    bbox_inside_weights->set_lod(lod);
-    bbox_outside_weights->set_lod(lod);
-    rois->Resize({num_rois, kBoxDim});
-    labels_int32->Resize({num_rois, 1});
-    bbox_targets->Resize({num_rois, kBoxDim * class_nums});
-    bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums});
-    bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums});
-  }
-};
-
-class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "RpnRois",
-        "(LoDTensor), This input is a 2D LoDTensor with shape [N, 4]. "
-        "N is the number of the GenerateProposalOp's output, "
-        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
-    AddInput("GtClasses",
-             "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
-             "M is the number of groundtruth, "
-             "each element is a class label of groundtruth.");
-    AddInput(
-        "IsCrowd",
-        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
-        "M is the number of groundtruth, "
-        "each element is a flag indicates whether a groundtruth is crowd.");
-    AddInput(
-        "GtBoxes",
-        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 4]. "
-        "M is the number of groundtruth, "
-        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
-    AddInput("ImInfo",
-             "(Tensor), This input is a 2D Tensor with shape [B, 3]. "
-             "B is the number of input images, "
-             "each element consists of im_height, im_width, im_scale.");
-
-    AddOutput(
-        "Rois",
-        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. "
-        "P usuall equal to  batch_size_per_im * batch_size, "
-        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
-    AddOutput("LabelsInt32",
-              "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], "
-              "each element repersents a class label of a roi");
-    AddOutput("BboxTargets",
-              "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
-              "class_nums], "
-              "each element repersents a box label of a roi");
-    AddOutput(
-        "BboxInsideWeights",
-        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
-        "class_nums], "
-        "each element indicates whether a box should contribute to loss.");
-    AddOutput(
-        "BboxOutsideWeights",
-        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
-        "class_nums], "
-        "each element indicates whether a box should contribute to loss.");
-
-    AddAttr<int>("batch_size_per_im", "Batch size of rois per images.");
-    AddAttr<float>("fg_fraction",
-                   "Foreground fraction in total batch_size_per_im.");
-    AddAttr<float>(
-        "fg_thresh",
-        "Overlap threshold which is used to chose foreground sample.");
-    AddAttr<float>("bg_thresh_hi",
-                   "Overlap threshold upper bound which is used to chose "
-                   "background sample.");
-    AddAttr<float>("bg_thresh_lo",
-                   "Overlap threshold lower bound which is used to chose "
-                   "background sample.");
-    AddAttr<std::vector<float>>("bbox_reg_weights", "Box regression weights.");
-    AddAttr<int>("class_nums", "Class number.");
-    AddAttr<bool>(
-        "use_random",
-        "Use random sampling to choose foreground and background boxes.")
-        .SetDefault(true);
-    AddAttr<bool>("is_cascade_rcnn",
-                  "cascade rcnn sampling policy changed from stage 2.")
-        .SetDefault(false);
-    AddAttr<bool>(
-        "is_cls_agnostic",
-        "the box regress will only include fg and bg locations if set true ")
-        .SetDefault(false);
-
-    AddComment(R"DOC(
-This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth,
-to sample foreground boxes and background boxes, and compute loss target.
-
-RpnRois is the output boxes of RPN and was processed by generate_proposal_op, these boxes
-were combined with groundtruth boxes and sampled according to batch_size_per_im and fg_fraction,
-If an instance with a groundtruth overlap greater than fg_thresh, then it was considered as a foreground sample.
-If an instance with a groundtruth overlap greater than bg_thresh_lo and lower than bg_thresh_hi,
-then it was considered as a background sample.
-After all foreground and background boxes are chosen (so called Rois),
-then we apply random sampling to make sure
-the number of foreground boxes is no more than batch_size_per_im * fg_fraction.
-
-For each box in Rois, we assign the classification (class label) and regression targets (box label) to it.
-Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss.
-    )DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(generate_proposal_labels, ops::GenerateProposalLabelsOp,
-                  ops::GenerateProposalLabelsOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(generate_proposal_labels,
-                       ops::GenerateProposalLabelsKernel<float>,
-                       ops::GenerateProposalLabelsKernel<double>);
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
deleted file mode 100644
index 06e48f1262a74dfdfd6d38e71cd02116f3e6eca5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ /dev/null
@@ -1,500 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cmath>
-#include <cstring>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
-
-static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
-  auto *out_data = dst->data<void>();
-  auto *to_add_data = src.data<void>();
-  size_t size_of_t = framework::SizeOfType(src.type());
-  offset *= size_of_t;
-  std::memcpy(
-      reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
-      to_add_data, src.numel() * size_of_t);
-}
-
-class GenerateProposalsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Scores"), "Input(Scores) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("BboxDeltas"),
-                   "Input(BboxDeltas) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Anchors"),
-                   "Input(Anchors) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Variances"),
-                   "Input(Variances) shouldn't be null.");
-
-    ctx->SetOutputDim("RpnRois", {-1, 4});
-    ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Anchors")->type(),
-                                   ctx.device_context());
-  }
-};
-
-template <class T>
-static inline void BoxCoder(const platform::DeviceContext &ctx,
-                            Tensor *all_anchors, Tensor *bbox_deltas,
-                            Tensor *variances, Tensor *proposals) {
-  T *proposals_data = proposals->mutable_data<T>(ctx.GetPlace());
-
-  int64_t row = all_anchors->dims()[0];
-  int64_t len = all_anchors->dims()[1];
-
-  auto *bbox_deltas_data = bbox_deltas->data<T>();
-  auto *anchor_data = all_anchors->data<T>();
-  const T *variances_data = nullptr;
-  if (variances) {
-    variances_data = variances->data<T>();
-  }
-
-  for (int64_t i = 0; i < row; ++i) {
-    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
-    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
-
-    T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
-    T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
-
-    T bbox_center_x = 0, bbox_center_y = 0;
-    T bbox_width = 0, bbox_height = 0;
-
-    if (variances) {
-      bbox_center_x =
-          variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
-          anchor_center_x;
-      bbox_center_y = variances_data[i * len + 1] *
-                          bbox_deltas_data[i * len + 1] * anchor_height +
-                      anchor_center_y;
-      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
-                                            bbox_deltas_data[i * len + 2],
-                                        kBBoxClipDefault)) *
-                   anchor_width;
-      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
-                                             bbox_deltas_data[i * len + 3],
-                                         kBBoxClipDefault)) *
-                    anchor_height;
-    } else {
-      bbox_center_x =
-          bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
-      bbox_center_y =
-          bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
-      bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
-                                        kBBoxClipDefault)) *
-                   anchor_width;
-      bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
-                                         kBBoxClipDefault)) *
-                    anchor_height;
-    }
-
-    proposals_data[i * len] = bbox_center_x - bbox_width / 2;
-    proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
-    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
-    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
-  }
-  // return proposals;
-}
-
-template <class T>
-static inline void ClipTiledBoxes(const platform::DeviceContext &ctx,
-                                  const Tensor &im_info, Tensor *boxes) {
-  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
-  const T *im_info_data = im_info.data<T>();
-  T zero(0);
-  for (int64_t i = 0; i < boxes->numel(); ++i) {
-    if (i % 4 == 0) {
-      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
-    } else if (i % 4 == 1) {
-      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
-    } else if (i % 4 == 2) {
-      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
-    } else {
-      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
-    }
-  }
-}
-
-template <class T>
-static inline void FilterBoxes(const platform::DeviceContext &ctx,
-                               Tensor *boxes, float min_size,
-                               const Tensor &im_info, Tensor *keep) {
-  const T *im_info_data = im_info.data<T>();
-  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
-  T im_scale = im_info_data[2];
-  keep->Resize({boxes->dims()[0]});
-  min_size = std::max(min_size, 1.0f);
-  int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
-
-  int keep_len = 0;
-  for (int i = 0; i < boxes->dims()[0]; ++i) {
-    T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
-    T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
-    T ws_origin_scale =
-        (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
-    T hs_origin_scale =
-        (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
-    T x_ctr = boxes_data[4 * i] + ws / 2;
-    T y_ctr = boxes_data[4 * i + 1] + hs / 2;
-    if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
-        x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
-      keep_data[keep_len++] = i;
-    }
-  }
-  keep->Resize({keep_len});
-}
-
-template <class T>
-static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
-    const std::vector<T> &scores) {
-  std::vector<std::pair<T, int>> sorted_indices;
-  sorted_indices.reserve(scores.size());
-  for (size_t i = 0; i < scores.size(); ++i) {
-    sorted_indices.emplace_back(scores[i], i);
-  }
-  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
-                   [](const std::pair<T, int> &a, const std::pair<T, int> &b) {
-                     return a.first < b.first;
-                   });
-  return sorted_indices;
-}
-
-template <class T>
-static inline T BBoxArea(const T *box, bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
-    }
-  }
-}
-
-template <class T>
-static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
-  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
-      box2[3] < box1[1]) {
-    return static_cast<T>(0.);
-  } else {
-    const T inter_xmin = std::max(box1[0], box2[0]);
-    const T inter_ymin = std::max(box1[1], box2[1]);
-    const T inter_xmax = std::min(box1[2], box2[2]);
-    const T inter_ymax = std::min(box1[3], box2[3]);
-    const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
-    const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
-    const T inter_area = inter_w * inter_h;
-    const T bbox1_area = BBoxArea<T>(box1, normalized);
-    const T bbox2_area = BBoxArea<T>(box2, normalized);
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
-}
-
-template <typename T>
-static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
-                                    int selected_num) {
-  Tensor keep_nms;
-  keep_nms.Resize({selected_num});
-  auto *keep_data = keep_nms.mutable_data<T>(platform::CPUPlace());
-  for (int i = 0; i < selected_num; ++i) {
-    keep_data[i] = selected_indices[i];
-  }
-  return keep_nms;
-}
-
-template <class T>
-static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox,
-                         Tensor *scores, T nms_threshold, float eta) {
-  PADDLE_ENFORCE_NOT_NULL(bbox);
-  int64_t num_boxes = bbox->dims()[0];
-  // 4: [xmin ymin xmax ymax]
-  int64_t box_size = bbox->dims()[1];
-
-  std::vector<T> scores_data(num_boxes);
-  std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
-  std::vector<std::pair<T, int>> sorted_indices =
-      GetSortedScoreIndex<T>(scores_data);
-
-  std::vector<int> selected_indices;
-  int selected_num = 0;
-  T adaptive_threshold = nms_threshold;
-  const T *bbox_data = bbox->data<T>();
-  while (sorted_indices.size() != 0) {
-    int idx = sorted_indices.back().second;
-    bool flag = true;
-    for (int kept_idx : selected_indices) {
-      if (flag) {
-        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
-                                      bbox_data + kept_idx * box_size, false);
-        flag = (overlap <= adaptive_threshold);
-      } else {
-        break;
-      }
-    }
-    if (flag) {
-      selected_indices.push_back(idx);
-      ++selected_num;
-    }
-    sorted_indices.erase(sorted_indices.end() - 1);
-    if (flag && eta < 1 && adaptive_threshold > 0.5) {
-      adaptive_threshold *= eta;
-    }
-  }
-  return VectorToTensor(selected_indices, selected_num);
-}
-
-template <typename T>
-class GenerateProposalsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *scores = context.Input<Tensor>("Scores");
-    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
-    auto *im_info = context.Input<Tensor>("ImInfo");
-    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
-                               "Cannot find input Anchors(%s) in scope",
-                               context.Inputs("Anchors")[0]);
-    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
-                                 "Cannot find input Variances(%s) in scope",
-                                 context.Inputs("Variances")[0]);
-
-    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
-    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
-
-    int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
-    int post_nms_top_n = context.Attr<int>("post_nms_topN");
-    float nms_thresh = context.Attr<float>("nms_thresh");
-    float min_size = context.Attr<float>("min_size");
-    float eta = context.Attr<float>("eta");
-
-    auto &dev_ctx =
-        context.template device_context<platform::CPUDeviceContext>();
-
-    auto &scores_dim = scores->dims();
-    int64_t num = scores_dim[0];
-    int64_t c_score = scores_dim[1];
-    int64_t h_score = scores_dim[2];
-    int64_t w_score = scores_dim[3];
-
-    auto &bbox_dim = bbox_deltas->dims();
-    int64_t c_bbox = bbox_dim[1];
-    int64_t h_bbox = bbox_dim[2];
-    int64_t w_bbox = bbox_dim[3];
-
-    rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
-                              context.GetPlace());
-    rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
-
-    Tensor bbox_deltas_swap, scores_swap;
-    bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
-                                     dev_ctx.GetPlace());
-    scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
-                                dev_ctx.GetPlace());
-
-    math::Transpose<platform::CPUDeviceContext, T, 4> trans;
-    std::vector<int> axis = {0, 2, 3, 1};
-    trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
-    trans(dev_ctx, *scores, &scores_swap, axis);
-
-    framework::LoD lod;
-    lod.resize(1);
-    auto &lod0 = lod[0];
-    lod0.push_back(0);
-    anchors.Resize({anchors.numel() / 4, 4});
-    variances.Resize({variances.numel() / 4, 4});
-
-    int64_t num_proposals = 0;
-    for (int64_t i = 0; i < num; ++i) {
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
-      Tensor scores_slice = scores_swap.Slice(i, i + 1);
-
-      bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
-      scores_slice.Resize({h_score * w_score * c_score, 1});
-
-      std::pair<Tensor, Tensor> tensor_pair =
-          ProposalForOneImage(dev_ctx, im_info_slice, anchors, variances,
-                              bbox_deltas_slice, scores_slice, pre_nms_top_n,
-                              post_nms_top_n, nms_thresh, min_size, eta);
-      Tensor &proposals = tensor_pair.first;
-      Tensor &scores = tensor_pair.second;
-
-      AppendProposals(rpn_rois, 4 * num_proposals, proposals);
-      AppendProposals(rpn_roi_probs, num_proposals, scores);
-      num_proposals += proposals.dims()[0];
-      lod0.push_back(num_proposals);
-    }
-    rpn_rois->set_lod(lod);
-    rpn_roi_probs->set_lod(lod);
-    rpn_rois->Resize({num_proposals, 4});
-    rpn_roi_probs->Resize({num_proposals, 1});
-  }
-
-  std::pair<Tensor, Tensor> ProposalForOneImage(
-      const platform::CPUDeviceContext &ctx, const Tensor &im_info_slice,
-      const Tensor &anchors, const Tensor &variances,
-      const Tensor &bbox_deltas_slice,  // [M, 4]
-      const Tensor &scores_slice,       // [N, 1]
-      int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
-      float eta) const {
-    auto *scores_data = scores_slice.data<T>();
-
-    // Sort index
-    Tensor index_t;
-    index_t.Resize({scores_slice.numel()});
-    int *index = index_t.mutable_data<int>(ctx.GetPlace());
-    for (int i = 0; i < scores_slice.numel(); ++i) {
-      index[i] = i;
-    }
-    auto compare = [scores_data](const int64_t &i, const int64_t &j) {
-      return scores_data[i] > scores_data[j];
-    };
-
-    if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
-      std::sort(index, index + scores_slice.numel(), compare);
-    } else {
-      std::nth_element(index, index + pre_nms_top_n,
-                       index + scores_slice.numel(), compare);
-      index_t.Resize({pre_nms_top_n});
-    }
-
-    Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
-    scores_sel.mutable_data<T>({index_t.numel(), 1}, ctx.GetPlace());
-    bbox_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
-    anchor_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
-    var_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
-
-    CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
-    CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
-    CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
-    CPUGather<T>(ctx, variances, index_t, &var_sel);
-
-    Tensor proposals;
-    proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
-    BoxCoder<T>(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals);
-
-    ClipTiledBoxes<T>(ctx, im_info_slice, &proposals);
-
-    Tensor keep;
-    FilterBoxes<T>(ctx, &proposals, min_size, im_info_slice, &keep);
-
-    Tensor scores_filter;
-    bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
-    scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
-    CPUGather<T>(ctx, proposals, keep, &bbox_sel);
-    CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
-    if (nms_thresh <= 0) {
-      return std::make_pair(bbox_sel, scores_filter);
-    }
-
-    Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
-
-    if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
-      keep_nms.Resize({post_nms_top_n});
-    }
-
-    proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
-    scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-    CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
-    CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
-
-    return std::make_pair(proposals, scores_sel);
-  }
-};
-
-class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Scores",
-             "(Tensor) The scores from conv is in shape (N, A, H, W), "
-             "N is batch size, A is number of anchors, "
-             "H and W are height and width of the feature map");
-    AddInput("BboxDeltas",
-             "(Tensor) Bounding box deltas from conv is in "
-             "shape (N, 4*A, H, W).");
-    AddInput("ImInfo",
-             "(Tensor) Information for image reshape is in shape (N, 3), "
-             "in format (height, width, scale)");
-    AddInput("Anchors",
-             "(Tensor) Bounding box anchors from anchor_generator_op "
-             "is in shape (A, H, W, 4).");
-    AddInput("Variances",
-             "(Tensor) Bounding box variances with same shape as `Anchors`.");
-
-    AddOutput("RpnRois",
-              "(LoDTensor), Output proposals with shape (rois_num, 4).");
-    AddOutput("RpnRoiProbs",
-              "(LoDTensor) Scores of proposals with shape (rois_num, 1).");
-    AddAttr<int>("pre_nms_topN",
-                 "Number of top scoring RPN proposals to keep before "
-                 "applying NMS.");
-    AddAttr<int>("post_nms_topN",
-                 "Number of top scoring RPN proposals to keep after "
-                 "applying NMS");
-    AddAttr<float>("nms_thresh", "NMS threshold used on RPN proposals.");
-    AddAttr<float>("min_size",
-                   "Proposal height and width both need to be greater "
-                   "than this min_size.");
-    AddAttr<float>("eta", "The parameter for adaptive NMS.");
-    AddComment(R"DOC(
-This operator Generate bounding box proposals for Faster RCNN.
-The propoasls are generated for a list of images based on image
-score 'Scores', bounding box regression result 'BboxDeltas' as
-well as predefined bounding box shapes 'anchors'. Greedy
-non-maximum suppression is applied to generate the final bounding
-boxes.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp,
-                  ops::GenerateProposalsOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>,
-                       ops::GenerateProposalsKernel<double>);
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
deleted file mode 100644
index 43deb5f9f3871b69ca46b7908c56c1236c1c5595..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ /dev/null
@@ -1,466 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/fluid/memory/allocation/allocator.h>
-#include <stdio.h>
-#include <string>
-#include <vector>
-#include "cub/cub.cuh"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-namespace {
-
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-int const kThreadsPerBlock = sizeof(uint64_t) * 8;
-
-static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
-
-struct RangeInitFunctor {
-  int start_;
-  int delta_;
-  int *out_;
-  __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
-};
-
-template <typename T>
-static void SortDescending(const platform::CUDADeviceContext &ctx,
-                           const Tensor &value, Tensor *value_out,
-                           Tensor *index_out) {
-  int num = static_cast<int>(value.numel());
-  Tensor index_in_t;
-  int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
-  platform::ForRange<platform::CUDADeviceContext> for_range(ctx, num);
-  for_range(RangeInitFunctor{0, 1, idx_in});
-
-  int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
-
-  const T *keys_in = value.data<T>();
-  T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
-
-  // Determine temporary device storage requirements
-  size_t temp_storage_bytes = 0;
-  cub::DeviceRadixSort::SortPairsDescending<T, int>(
-      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
-  // Allocate temporary storage
-  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
-
-  // Run sorting operation
-  cub::DeviceRadixSort::SortPairsDescending<T, int>(
-      d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
-      idx_out, num);
-}
-
-template <typename T>
-struct BoxDecodeAndClipFunctor {
-  const T *anchor;
-  const T *deltas;
-  const T *var;
-  const int *index;
-  const T *im_info;
-
-  T *proposals;
-
-  BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var,
-                          const int *index, const T *im_info, T *proposals)
-      : anchor(anchor),
-        deltas(deltas),
-        var(var),
-        index(index),
-        im_info(im_info),
-        proposals(proposals) {}
-
-  T bbox_clip_default{static_cast<T>(kBBoxClipDefault)};
-
-  __device__ void operator()(size_t i) {
-    int k = index[i] * 4;
-    T axmin = anchor[k];
-    T aymin = anchor[k + 1];
-    T axmax = anchor[k + 2];
-    T aymax = anchor[k + 3];
-
-    T w = axmax - axmin + 1.0;
-    T h = aymax - aymin + 1.0;
-    T cx = axmin + 0.5 * w;
-    T cy = aymin + 0.5 * h;
-
-    T dxmin = deltas[k];
-    T dymin = deltas[k + 1];
-    T dxmax = deltas[k + 2];
-    T dymax = deltas[k + 3];
-
-    T d_cx, d_cy, d_w, d_h;
-    if (var) {
-      d_cx = cx + dxmin * w * var[k];
-      d_cy = cy + dymin * h * var[k + 1];
-      d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w;
-      d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h;
-    } else {
-      d_cx = cx + dxmin * w;
-      d_cy = cy + dymin * h;
-      d_w = exp(Min(dxmax, bbox_clip_default)) * w;
-      d_h = exp(Min(dymax, bbox_clip_default)) * h;
-    }
-
-    T oxmin = d_cx - d_w * 0.5;
-    T oymin = d_cy - d_h * 0.5;
-    T oxmax = d_cx + d_w * 0.5 - 1.;
-    T oymax = d_cy + d_h * 0.5 - 1.;
-
-    proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.);
-    proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.);
-  }
-
-  __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; }
-
-  __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; }
-};
-
-template <typename T, int BlockSize>
-static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
-                                    const T min_size, const int num,
-                                    int *keep_num, int *keep) {
-  T im_h = im_info[0];
-  T im_w = im_info[1];
-  T im_scale = im_info[2];
-
-  int cnt = 0;
-  __shared__ int keep_index[BlockSize];
-
-  CUDA_1D_KERNEL_LOOP(i, num) {
-    keep_index[threadIdx.x] = -1;
-    __syncthreads();
-
-    int k = i * 4;
-    T xmin = bboxes[k];
-    T ymin = bboxes[k + 1];
-    T xmax = bboxes[k + 2];
-    T ymax = bboxes[k + 3];
-
-    T w = xmax - xmin + 1.0;
-    T h = ymax - ymin + 1.0;
-    T cx = xmin + w / 2.;
-    T cy = ymin + h / 2.;
-
-    T w_s = (xmax - xmin) / im_scale + 1.;
-    T h_s = (ymax - ymin) / im_scale + 1.;
-
-    if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) {
-      keep_index[threadIdx.x] = i;
-    }
-    __syncthreads();
-    if (threadIdx.x == 0) {
-      int size = (num - i) < BlockSize ? num - i : BlockSize;
-      for (int j = 0; j < size; ++j) {
-        if (keep_index[j] > -1) {
-          keep[cnt++] = keep_index[j];
-        }
-      }
-    }
-    __syncthreads();
-  }
-  if (threadIdx.x == 0) {
-    keep_num[0] = cnt;
-  }
-}
-
-static __device__ inline float IoU(const float *a, const float *b) {
-  float left = max(a[0], b[0]), right = min(a[2], b[2]);
-  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
-  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
-  float inter_s = width * height;
-  float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return inter_s / (s_a + s_b - inter_s);
-}
-
-static __global__ void NMSKernel(const int n_boxes,
-                                 const float nms_overlap_thresh,
-                                 const float *dev_boxes, uint64_t *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-
-  const int row_size =
-      min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock);
-  const int col_size =
-      min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock);
-
-  __shared__ float block_boxes[kThreadsPerBlock * 4];
-  if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 4 + 0] =
-        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0];
-    block_boxes[threadIdx.x * 4 + 1] =
-        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1];
-    block_boxes[threadIdx.x * 4 + 2] =
-        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2];
-    block_boxes[threadIdx.x * 4 + 3] =
-        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3];
-  }
-  __syncthreads();
-
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x;
-    const float *cur_box = dev_boxes + cur_box_idx * 4;
-    int i = 0;
-    uint64_t t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = threadIdx.x + 1;
-    }
-    for (i = start; i < col_size; i++) {
-      if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
-        t |= 1ULL << i;
-      }
-    }
-    const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock);
-    dev_mask[cur_box_idx * col_blocks + col_start] = t;
-  }
-}
-
-template <typename T>
-static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
-                const Tensor &sorted_indices, const T nms_threshold,
-                Tensor *keep_out) {
-  int boxes_num = proposals.dims()[0];
-  PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
-
-  const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
-  dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
-              DIVUP(boxes_num, kThreadsPerBlock));
-  dim3 threads(kThreadsPerBlock);
-
-  const T *boxes = proposals.data<T>();
-  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  framework::Vector<uint64_t> mask(boxes_num * col_blocks);
-  NMSKernel<<<blocks, threads>>>(
-      boxes_num, nms_threshold, boxes,
-      mask.CUDAMutableData(boost::get<platform::CUDAPlace>(ctx.GetPlace())));
-
-  std::vector<uint64_t> remv(col_blocks);
-  memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
-
-  std::vector<int> keep_vec;
-  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / kThreadsPerBlock;
-    int inblock = i % kThreadsPerBlock;
-
-    if (!(remv[nblock] & (1ULL << inblock))) {
-      ++num_to_keep;
-      keep_vec.push_back(i);
-      uint64_t *p = &mask[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv[j] |= p[j];
-      }
-    }
-  }
-  int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
-  memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
-               sizeof(int) * num_to_keep, ctx.stream());
-  ctx.Wait();
-}
-
-template <typename T>
-static std::pair<Tensor, Tensor> ProposalForOneImage(
-    const platform::CUDADeviceContext &ctx, const Tensor &im_info,
-    const Tensor &anchors, const Tensor &variances,
-    const Tensor &bbox_deltas,  // [M, 4]
-    const Tensor &scores,       // [N, 1]
-    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
-    float eta) {
-  // 1. pre nms
-  Tensor scores_sort, index_sort;
-  SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
-  int num = scores.numel();
-  int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
-                                                                : pre_nms_top_n;
-  scores_sort.Resize({pre_nms_num, 1});
-  index_sort.Resize({pre_nms_num, 1});
-
-  // 2. box decode and clipping
-  Tensor proposals;
-  proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
-
-  {
-    platform::ForRange<platform::CUDADeviceContext> for_range(ctx, pre_nms_num);
-    for_range(BoxDecodeAndClipFunctor<T>{
-        anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
-        index_sort.data<int>(), im_info.data<T>(), proposals.data<T>()});
-  }
-
-  // 3. filter
-  Tensor keep_index, keep_num_t;
-  keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
-  keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
-  min_size = std::max(min_size, 1.0f);
-  auto stream = ctx.stream();
-  FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
-      proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
-      keep_num_t.data<int>(), keep_index.data<int>());
-  int keep_num;
-  const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
-               keep_num_t.data<int>(), sizeof(int), ctx.stream());
-  ctx.Wait();
-  keep_index.Resize({keep_num});
-
-  Tensor scores_filter, proposals_filter;
-  proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
-  scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
-  GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
-  GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
-
-  if (nms_thresh <= 0) {
-    return std::make_pair(proposals_filter, scores_filter);
-  }
-
-  // 4. nms
-  Tensor keep_nms;
-  NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
-  if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
-    keep_nms.Resize({post_nms_top_n});
-  }
-
-  Tensor scores_nms, proposals_nms;
-  proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
-  scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-  GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
-  GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
-
-  return std::make_pair(proposals_nms, scores_nms);
-}
-}  // namespace
-
-template <typename DeviceContext, typename T>
-class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *scores = context.Input<Tensor>("Scores");
-    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
-    auto *im_info = context.Input<Tensor>("ImInfo");
-    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
-                               "Cannot find input Anchors(%s) in scope",
-                               context.Inputs("Anchors")[0]);
-    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
-                                 "Cannot find input Variances(%s) in scope",
-                                 context.Inputs("Variances")[0]);
-
-    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
-    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
-
-    int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
-    int post_nms_top_n = context.Attr<int>("post_nms_topN");
-    float nms_thresh = context.Attr<float>("nms_thresh");
-    float min_size = context.Attr<float>("min_size");
-    float eta = context.Attr<float>("eta");
-    PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS.");
-
-    auto &dev_ctx = context.template device_context<DeviceContext>();
-
-    auto scores_dim = scores->dims();
-    int64_t num = scores_dim[0];
-    int64_t c_score = scores_dim[1];
-    int64_t h_score = scores_dim[2];
-    int64_t w_score = scores_dim[3];
-
-    auto bbox_dim = bbox_deltas->dims();
-    int64_t c_bbox = bbox_dim[1];
-    int64_t h_bbox = bbox_dim[2];
-    int64_t w_bbox = bbox_dim[3];
-
-    Tensor bbox_deltas_swap, scores_swap;
-    bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
-                                     dev_ctx.GetPlace());
-    scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
-                                dev_ctx.GetPlace());
-
-    math::Transpose<DeviceContext, T, 4> trans;
-    std::vector<int> axis = {0, 2, 3, 1};
-    trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
-    trans(dev_ctx, *scores, &scores_swap, axis);
-
-    anchors.Resize({anchors.numel() / 4, 4});
-    variances.Resize({variances.numel() / 4, 4});
-
-    rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
-                              context.GetPlace());
-    rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
-
-    T *rpn_rois_data = rpn_rois->data<T>();
-    T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
-
-    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
-
-    int64_t num_proposals = 0;
-    std::vector<size_t> offset(1, 0);
-    for (int64_t i = 0; i < num; ++i) {
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
-      Tensor scores_slice = scores_swap.Slice(i, i + 1);
-
-      bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
-      scores_slice.Resize({h_score * w_score * c_score, 1});
-
-      std::pair<Tensor, Tensor> box_score_pair =
-          ProposalForOneImage<T>(dev_ctx, im_info_slice, anchors, variances,
-                                 bbox_deltas_slice, scores_slice, pre_nms_top_n,
-                                 post_nms_top_n, nms_thresh, min_size, eta);
-
-      Tensor &proposals = box_score_pair.first;
-      Tensor &scores = box_score_pair.second;
-
-      memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
-                   proposals.data<T>(), sizeof(T) * proposals.numel(),
-                   dev_ctx.stream());
-      memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
-                   scores.data<T>(), sizeof(T) * scores.numel(),
-                   dev_ctx.stream());
-      dev_ctx.Wait();
-      num_proposals += proposals.dims()[0];
-      offset.emplace_back(num_proposals);
-    }
-    framework::LoD lod;
-    lod.emplace_back(offset);
-    rpn_rois->set_lod(lod);
-    rpn_roi_probs->set_lod(lod);
-    rpn_rois->Resize({num_proposals, 4});
-    rpn_roi_probs->Resize({num_proposals, 1});
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(generate_proposals,
-                        ops::CUDAGenerateProposalsKernel<
-                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc
deleted file mode 100644
index b46d231d0ff7774c64745b3b77953cf2ed8d82f7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/gpc.cc
+++ /dev/null
@@ -1,2211 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/**
- * @file src/gpc.cpp
- * @author huhan02(com@baidu.com)
- * @date 2015/12/18 14:17:30
- * @brief
- *
- * @modified by sunyipeng
- * @email sunyipeng@baidu.com
- * @date 2018/6/12
- **/
-
-#include "paddle/fluid/operators/detection/gpc.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace gpc {
-
-typedef struct lmt_shape { /* Local minima table                */
-  double y;                /* Y coordinate at local minimum     */
-  edge_node *first_bound;  /* Pointer to bound list             */
-  struct lmt_shape *next;  /* Pointer to next local minimum     */
-} lmt_node;
-
-typedef struct sbt_t_shape { /* Scanbeam tree                     */
-  double y;                  /* Scanbeam node y value             */
-  struct sbt_t_shape *less;  /* Pointer to nodes with lower y     */
-  struct sbt_t_shape *more;  /* Pointer to nodes with higher y    */
-} sb_tree;
-
-typedef struct it_shape { /* Intersection table                */
-  edge_node *ie[2];       /* Intersecting edge (bundle) pair   */
-  gpc_vertex point;       /* Point of intersection             */
-  struct it_shape *next;  /* The next intersection table node  */
-} it_node;
-
-typedef struct st_shape { /* Sorted edge table                 */
-  edge_node *edge;        /* Pointer to AET edge               */
-  double xb;              /* Scanbeam bottom x coordinate      */
-  double xt;              /* Scanbeam top x coordinate         */
-  double dx;              /* Change in x for a unit y increase */
-  struct st_shape *prev;  /* Previous edge in sorted list      */
-} st_node;
-
-typedef struct bbox_shape { /* Contour axis-aligned bounding box */
-  double xmin;              /* Minimum x coordinate              */
-  double ymin;              /* Minimum y coordinate              */
-  double xmax;              /* Maximum x coordinate              */
-  double ymax;              /* Maximum y coordinate              */
-} bbox;
-
-/*
-===========================================================================
-                               Global Data
-===========================================================================
-*/
-
-/* Horizontal edge state transitions within scanbeam boundary */
-const h_state next_h_state[3][6] = {
-    /*        ABOVE     BELOW     CROSS */
-    /*        L   R     L   R     L   R */
-    /* NH */
-    {BH, TH, TH, BH, NH, NH},
-    /* BH */
-    {NH, NH, NH, NH, TH, TH},
-    /* TH */
-    {NH, NH, NH, NH, BH, BH}};
-
-/*
-===========================================================================
-                             Private Functions
-===========================================================================
-*/
-
-static void reset_it(it_node **it) {
-  it_node *itn;
-
-  while (*it) {
-    itn = (*it)->next;
-    gpc_free<it_node>(*it);
-    *it = itn;
-  }
-}
-
-static void reset_lmt(lmt_node **lmt) {
-  lmt_node *lmtn;
-
-  while (*lmt) {
-    lmtn = (*lmt)->next;
-    gpc_free<lmt_node>(*lmt);
-    *lmt = lmtn;
-  }
-}
-
-static void insert_bound(edge_node **b, edge_node *e) {
-  edge_node *existing_bound = NULL;
-
-  if (!*b) {
-    /* Link node e to the tail of the list */
-    *b = e;
-  } else {
-    /* Do primary sort on the x field */
-    if (e[0].bot.x < (*b)[0].bot.x) {
-      /* Insert a new node mid-list */
-      existing_bound = *b;
-      *b = e;
-      (*b)->next_bound = existing_bound;
-    } else {
-      if (e[0].bot.x == (*b)[0].bot.x) {
-        /* Do secondary sort on the dx field */
-        if (e[0].dx < (*b)[0].dx) {
-          /* Insert a new node mid-list */
-          existing_bound = *b;
-          *b = e;
-          (*b)->next_bound = existing_bound;
-        } else {
-          /* Head further down the list */
-          insert_bound(&((*b)->next_bound), e);
-        }
-      } else {
-        /* Head further down the list */
-        insert_bound(&((*b)->next_bound), e);
-      }
-    }
-  }
-}
-
-static edge_node **bound_list(lmt_node **lmt, double y) {
-  lmt_node *existing_node;
-
-  if (!*lmt) {
-    /* Add node onto the tail end of the LMT */
-    gpc_malloc<lmt_node>(*lmt, sizeof(lmt_node),
-                         const_cast<char *>("LMT insertion"));
-    (*lmt)->y = y;
-    (*lmt)->first_bound = NULL;
-    (*lmt)->next = NULL;
-    return &((*lmt)->first_bound);
-  } else if (y < (*lmt)->y) {
-    /* Insert a new LMT node before the current node */
-    existing_node = *lmt;
-    gpc_malloc<lmt_node>(*lmt, sizeof(lmt_node),
-                         const_cast<char *>("LMT insertion"));
-    (*lmt)->y = y;
-    (*lmt)->first_bound = NULL;
-    (*lmt)->next = existing_node;
-    return &((*lmt)->first_bound);
-  } else {
-    if (y > (*lmt)->y) {
-      /* Head further up the LMT */
-      return bound_list(&((*lmt)->next), y);
-    } else {
-      /* Use this existing LMT node */
-      return &((*lmt)->first_bound);
-    }
-  }
-}
-
-static void add_to_sbtree(int *entries, sb_tree **sbtree, double y) {
-  if (!*sbtree) {
-    /* Add a new tree node here */
-    gpc_malloc<sb_tree>(*sbtree, sizeof(sb_tree),
-                        const_cast<char *>("scanbeam tree insertion"));
-    (*sbtree)->y = y;
-    (*sbtree)->less = NULL;
-    (*sbtree)->more = NULL;
-    (*entries)++;
-  } else {
-    if ((*sbtree)->y > y) {
-      /* Head into the 'less' sub-tree */
-      add_to_sbtree(entries, &((*sbtree)->less), y);
-    } else {
-      if ((*sbtree)->y < y) {
-        /* Head into the 'more' sub-tree */
-        add_to_sbtree(entries, &((*sbtree)->more), y);
-      }
-    }
-  }
-}
-
-static void build_sbt(int *entries, double *sbt, sb_tree *sbtree) {
-  if (sbtree->less) {
-    build_sbt(entries, sbt, sbtree->less);
-  }
-  sbt[*entries] = sbtree->y;
-  (*entries)++;
-  if (sbtree->more) {
-    build_sbt(entries, sbt, sbtree->more);
-  }
-}
-
-static void free_sbtree(sb_tree **sbtree) {
-  if (*sbtree) {
-    free_sbtree(&((*sbtree)->less));
-    free_sbtree(&((*sbtree)->more));
-    gpc_free<sb_tree>(*sbtree);
-  }
-}
-
-static int count_optimal_vertices(gpc_vertex_list c) {
-  int result = 0;
-  int i = 0;
-
-  /* Ignore non-contributing contours */
-  if (c.num_vertices > 0) {
-    for (i = 0; i < c.num_vertices; i++) {
-      /* Ignore superfluous vertices embedded in horizontal edges */
-      if (gpc_optimal(c.vertex, i, c.num_vertices)) {
-        result++;
-      }
-    }
-  }
-  return result;
-}
-
-static edge_node *build_lmt(lmt_node **lmt, sb_tree **sbtree, int *sbt_entries,
-                            gpc_polygon *p, int type, gpc_op op) {
-  int c = 0;
-  int i = 0;
-  int min = 0;
-  int max = 0;
-  int num_edges = 0;
-  int v = 0;
-  int num_vertices = 0;
-  int total_vertices = 0;
-  int e_index = 0;
-  edge_node *e = NULL;
-  edge_node *edge_table = NULL;
-
-  for (c = 0; c < p->num_contours; c++) {
-    total_vertices += count_optimal_vertices(p->contour[c]);
-  }
-
-  /* Create the entire input polygon edge table in one go */
-  gpc_malloc<edge_node>(edge_table, total_vertices * sizeof(edge_node),
-                        const_cast<char *>("edge table creation"));
-
-  for (c = 0; c < p->num_contours; c++) {
-    if (p->contour[c].num_vertices < 0) {
-      /* Ignore the non-contributing contour and repair the vertex count */
-      p->contour[c].num_vertices = -p->contour[c].num_vertices;
-    } else {
-      /* Perform contour optimisation */
-      num_vertices = 0;
-      for (i = 0; i < p->contour[c].num_vertices; i++) {
-        if (gpc_optimal(p->contour[c].vertex, i, p->contour[c].num_vertices)) {
-          edge_table[num_vertices].vertex.x = p->contour[c].vertex[i].x;
-          edge_table[num_vertices].vertex.y = p->contour[c].vertex[i].y;
-
-          /* Record vertex in the scanbeam table */
-          add_to_sbtree(sbt_entries, sbtree, edge_table[num_vertices].vertex.y);
-
-          num_vertices++;
-        }
-      }
-
-      /* Do the contour forward pass */
-      for (min = 0; min < num_vertices; min++) {
-        /* If a forward local minimum... */
-        if (gpc_fwd_min(edge_table, min, num_vertices)) {
-          /* Search for the next local maximum... */
-          num_edges = 1;
-          max = gpc_next_index(min, num_vertices);
-          while (gpc_not_fmax(edge_table, max, num_vertices)) {
-            num_edges++;
-            max = gpc_next_index(max, num_vertices);
-          }
-
-          /* Build the next edge list */
-          e = &edge_table[e_index];
-          e_index += num_edges;
-          v = min;
-          e[0].bstate[BELOW] = UNBUNDLED;
-          e[0].bundle[BELOW][CLIP] = 0;
-          e[0].bundle[BELOW][SUBJ] = 0;
-          for (i = 0; i < num_edges; i++) {
-            e[i].xb = edge_table[v].vertex.x;
-            e[i].bot.x = edge_table[v].vertex.x;
-            e[i].bot.y = edge_table[v].vertex.y;
-
-            v = gpc_next_index(v, num_vertices);
-
-            e[i].top.x = edge_table[v].vertex.x;
-            e[i].top.y = edge_table[v].vertex.y;
-            e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) /
-                      (e[i].top.y - e[i].bot.y);
-            e[i].type = type;
-            e[i].outp[ABOVE] = NULL;
-            e[i].outp[BELOW] = NULL;
-            e[i].next = NULL;
-            e[i].prev = NULL;
-            e[i].succ =
-                ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL;
-            e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL;
-            e[i].next_bound = NULL;
-            e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT;
-            e[i].bside[SUBJ] = LEFT;
-          }
-          insert_bound(bound_list(lmt, edge_table[min].vertex.y), e);
-        }
-      }
-
-      /* Do the contour reverse pass */
-      for (min = 0; min < num_vertices; min++) {
-        /* If a reverse local minimum... */
-        if (gpc_rev_min(edge_table, min, num_vertices)) {
-          /* Search for the previous local maximum... */
-          num_edges = 1;
-          max = gpc_prev_index(min, num_vertices);
-          while (gpc_not_rmax(edge_table, max, num_vertices)) {
-            num_edges++;
-            max = gpc_prev_index(max, num_vertices);
-          }
-
-          /* Build the previous edge list */
-          e = &edge_table[e_index];
-          e_index += num_edges;
-          v = min;
-          e[0].bstate[BELOW] = UNBUNDLED;
-          e[0].bundle[BELOW][CLIP] = 0;
-          e[0].bundle[BELOW][SUBJ] = 0;
-          for (i = 0; i < num_edges; i++) {
-            e[i].xb = edge_table[v].vertex.x;
-            e[i].bot.x = edge_table[v].vertex.x;
-            e[i].bot.y = edge_table[v].vertex.y;
-
-            v = gpc_prev_index(v, num_vertices);
-
-            e[i].top.x = edge_table[v].vertex.x;
-            e[i].top.y = edge_table[v].vertex.y;
-            e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) /
-                      (e[i].top.y - e[i].bot.y);
-            e[i].type = type;
-            e[i].outp[ABOVE] = NULL;
-            e[i].outp[BELOW] = NULL;
-            e[i].next = NULL;
-            e[i].prev = NULL;
-            e[i].succ =
-                ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL;
-            e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL;
-            e[i].next_bound = NULL;
-            e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT;
-            e[i].bside[SUBJ] = LEFT;
-          }
-          insert_bound(bound_list(lmt, edge_table[min].vertex.y), e);
-        }
-      }
-    }
-  }
-  return edge_table;
-}  // NOLINT
-
-static void add_edge_to_aet(edge_node **aet, edge_node *edge, edge_node *prev) {
-  if (!*aet) {
-    /* Append edge onto the tail end of the AET */
-    *aet = edge;
-    edge->prev = prev;
-    edge->next = NULL;
-  } else {
-    /* Do primary sort on the xb field */
-    if (edge->xb < (*aet)->xb) {
-      /* Insert edge here (before the AET edge) */
-      edge->prev = prev;
-      edge->next = *aet;
-      (*aet)->prev = edge;
-      *aet = edge;
-    } else {
-      if (edge->xb == (*aet)->xb) {
-        /* Do secondary sort on the dx field */
-        if (edge->dx < (*aet)->dx) {
-          /* Insert edge here (before the AET edge) */
-          edge->prev = prev;
-          edge->next = *aet;
-          (*aet)->prev = edge;
-          *aet = edge;
-        } else {
-          /* Head further into the AET */
-          add_edge_to_aet(&((*aet)->next), edge, *aet);
-        }
-      } else {
-        /* Head further into the AET */
-        add_edge_to_aet(&((*aet)->next), edge, *aet);
-      }
-    }
-  }
-}
-
-static void add_intersection(it_node **it, edge_node *edge0, edge_node *edge1,
-                             double x, double y) {
-  it_node *existing_node;
-
-  if (!*it) {
-    /* Append a new node to the tail of the list */
-    gpc_malloc<it_node>(*it, sizeof(it_node),
-                        const_cast<char *>("IT insertion"));
-    (*it)->ie[0] = edge0;
-    (*it)->ie[1] = edge1;
-    (*it)->point.x = x;
-    (*it)->point.y = y;
-    (*it)->next = NULL;
-  } else {
-    if ((*it)->point.y > y) {
-      /* Insert a new node mid-list */
-      existing_node = *it;
-      gpc_malloc<it_node>(*it, sizeof(it_node),
-                          const_cast<char *>("IT insertion"));
-      (*it)->ie[0] = edge0;
-      (*it)->ie[1] = edge1;
-      (*it)->point.x = x;
-      (*it)->point.y = y;
-      (*it)->next = existing_node;
-    } else {
-      /* Head further down the list */
-      add_intersection(&((*it)->next), edge0, edge1, x, y);
-    }
-  }
-}
-
-static void add_st_edge(st_node **st, it_node **it, edge_node *edge,
-                        double dy) {
-  st_node *existing_node;
-  double den = 0.0;
-  double r = 0.0;
-  double x = 0.0;
-  double y = 0.0;
-
-  if (!*st) {
-    /* Append edge onto the tail end of the ST */
-    gpc_malloc<st_node>(*st, sizeof(st_node),
-                        const_cast<char *>("ST insertion"));
-    (*st)->edge = edge;
-    (*st)->xb = edge->xb;
-    (*st)->xt = edge->xt;
-    (*st)->dx = edge->dx;
-    (*st)->prev = NULL;
-  } else {
-    den = ((*st)->xt - (*st)->xb) - (edge->xt - edge->xb);
-
-    /* If new edge and ST edge don't cross */
-    if ((edge->xt >= (*st)->xt) || (edge->dx == (*st)->dx) ||
-        (fabs(den) <= DBL_EPSILON)) {
-      /* No intersection - insert edge here (before the ST edge) */
-      existing_node = *st;
-      gpc_malloc<st_node>(*st, sizeof(st_node),
-                          const_cast<char *>("ST insertion"));
-      (*st)->edge = edge;
-      (*st)->xb = edge->xb;
-      (*st)->xt = edge->xt;
-      (*st)->dx = edge->dx;
-      (*st)->prev = existing_node;
-    } else {
-      /* Compute intersection between new edge and ST edge */
-      r = (edge->xb - (*st)->xb) / den;
-      x = (*st)->xb + r * ((*st)->xt - (*st)->xb);
-      y = r * dy;
-
-      /* Insert the edge pointers and the intersection point in the IT */
-      add_intersection(it, (*st)->edge, edge, x, y);
-
-      /* Head further into the ST */
-      add_st_edge(&((*st)->prev), it, edge, dy);
-    }
-  }
-}
-
-static void build_intersection_table(it_node **it, edge_node *aet, double dy) {
-  st_node *st;
-  st_node *stp;
-  edge_node *edge = NULL;
-
-  /* Build intersection table for the current scanbeam */
-  reset_it(it);
-  st = NULL;
-
-  /* Process each AET edge */
-  for (edge = aet; edge; edge = edge->next) {
-    if ((edge->bstate[ABOVE] == BUNDLE_HEAD) || edge->bundle[ABOVE][CLIP] ||
-        edge->bundle[ABOVE][SUBJ]) {
-      add_st_edge(&st, it, edge, dy);
-    }
-  }
-
-  /* Free the sorted edge table */
-  while (st) {
-    stp = st->prev;
-    gpc_free<st_node>(st);
-    st = stp;
-  }
-}
-
-static int count_contours(polygon_node *polygon) {
-  int nc = 0;
-  int nv = 0;
-  vertex_node *v = NULL;
-  vertex_node *nextv = NULL;
-
-  for (nc = 0; polygon; polygon = polygon->next) {
-    if (polygon->active) {
-      /* Count the vertices in the current contour */
-      nv = 0;
-      for (v = polygon->proxy->v[LEFT]; v; v = v->next) {
-        nv++;
-      }
-
-      /* Record valid vertex counts in the active field */
-      if (nv > 2) {
-        polygon->active = nv;
-        nc++;
-      } else {
-        /* Invalid contour: just free the heap */
-        for (v = polygon->proxy->v[LEFT]; v; v = nextv) {
-          nextv = v->next;
-          gpc_free<vertex_node>(v);
-        }
-        polygon->active = 0;
-      }
-    }
-  }
-  return nc;
-}
-
-static void add_left(polygon_node *p, double x, double y) {
-  PADDLE_ENFORCE_NOT_NULL(p);
-  vertex_node *nv = NULL;
-
-  /* Create a new vertex node and set its fields */
-  gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
-                          const_cast<char *>("vertex node creation"));
-  nv->x = x;
-  nv->y = y;
-
-  /* Add vertex nv to the left end of the polygon's vertex list */
-  nv->next = p->proxy->v[LEFT];
-
-  /* Update proxy->[LEFT] to point to nv */
-  p->proxy->v[LEFT] = nv;
-}
-
-static void merge_left(polygon_node *p, polygon_node *q, polygon_node *list) {
-  polygon_node *target = NULL;
-
-  /* Label contour as a hole */
-  q->proxy->hole = 1;
-
-  if (p->proxy != q->proxy) {
-    /* Assign p's vertex list to the left end of q's list */
-    p->proxy->v[RIGHT]->next = q->proxy->v[LEFT];
-    q->proxy->v[LEFT] = p->proxy->v[LEFT];
-
-    /* Redirect any p->proxy references to q->proxy */
-
-    for (target = p->proxy; list; list = list->next) {
-      if (list->proxy == target) {
-        list->active = 0;
-        list->proxy = q->proxy;
-      }
-    }
-  }
-}
-
-static void add_right(polygon_node *p, double x, double y) {
-  vertex_node *nv = NULL;
-
-  /* Create a new vertex node and set its fields */
-  gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
-                          const_cast<char *>("vertex node creation"));
-  nv->x = x;
-  nv->y = y;
-  nv->next = NULL;
-
-  /* Add vertex nv to the right end of the polygon's vertex list */
-  p->proxy->v[RIGHT]->next = nv;
-
-  /* Update proxy->v[RIGHT] to point to nv */
-  p->proxy->v[RIGHT] = nv;
-}
-
-static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
-  PADDLE_ENFORCE_NOT_NULL(p);
-  polygon_node *target = NULL;
-
-  /* Label contour as external */
-  q->proxy->hole = 0;
-
-  if (p->proxy != q->proxy) {
-    /* Assign p's vertex list to the right end of q's list */
-    q->proxy->v[RIGHT]->next = p->proxy->v[LEFT];
-    q->proxy->v[RIGHT] = p->proxy->v[RIGHT];
-
-    /* Redirect any p->proxy references to q->proxy */
-    for (target = p->proxy; list; list = list->next) {
-      if (list->proxy == target) {
-        list->active = 0;
-        list->proxy = q->proxy;
-      }
-    }
-  }
-}
-
-static void add_local_min(polygon_node **p, edge_node *edge, double x,
-                          double y) {
-  polygon_node *existing_min = NULL;
-  vertex_node *nv = NULL;
-
-  existing_min = *p;
-
-  gpc_malloc<polygon_node>(*p, sizeof(polygon_node),
-                           const_cast<char *>("polygon node creation"));
-
-  /* Create a new vertex node and set its fields */
-  gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
-                          const_cast<char *>("vertex node creation"));
-  nv->x = x;
-  nv->y = y;
-  nv->next = NULL;
-
-  /* Initialise proxy to point to p itself */
-  (*p)->proxy = (*p);
-  (*p)->active = 1;
-  (*p)->next = existing_min;
-
-  /* Make v[LEFT] and v[RIGHT] point to new vertex nv */
-  (*p)->v[LEFT] = nv;
-  (*p)->v[RIGHT] = nv;
-
-  /* Assign polygon p to the edge */
-  edge->outp[ABOVE] = *p;
-}
-
-static int count_tristrips(polygon_node *tn) {
-  int total = 0;
-
-  for (total = 0; tn; tn = tn->next) {
-    if (tn->active > 2) {
-      total++;
-    }
-  }
-  return total;
-}
-
-void add_vertex(vertex_node **t, double x, double y) {
-  if (!(*t)) {
-    gpc_malloc<vertex_node>(*t, sizeof(vertex_node),
-                            const_cast<char *>("tristrip vertex creation"));
-    (*t)->x = x;
-    (*t)->y = y;
-    (*t)->next = NULL;
-  } else {
-    /* Head further down the list */
-    add_vertex(&((*t)->next), x, y);
-  }
-}
-
-void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
-  PADDLE_ENFORCE_NOT_NULL(e);
-  add_vertex(&(e->outp[p]->v[s]), x, y);
-  e->outp[p]->active++;
-}
-
-static void new_tristrip(polygon_node **tn, edge_node *edge, double x,
-                         double y) {
-  if (!(*tn)) {
-    gpc_malloc<polygon_node>(*tn, sizeof(polygon_node),
-                             const_cast<char *>("tristrip node creation"));
-    (*tn)->next = NULL;
-    (*tn)->v[LEFT] = NULL;
-    (*tn)->v[RIGHT] = NULL;
-    (*tn)->active = 1;
-    add_vertex(&((*tn)->v[LEFT]), x, y);
-    edge->outp[ABOVE] = *tn;
-  } else {
-    /* Head further down the list */
-    new_tristrip(&((*tn)->next), edge, x, y);
-  }
-}
-
-static bbox *create_contour_bboxes(gpc_polygon *p) {
-  bbox *box;
-  int c = 0;
-  int v = 0;
-
-  gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
-                   const_cast<char *>("Bounding box creation"));
-  PADDLE_ENFORCE_NOT_NULL(box);
-
-  /* Construct contour bounding boxes */
-  for (c = 0; c < p->num_contours; c++) {
-    /* Initialise bounding box extent */
-    box[c].xmin = DBL_MAX;
-    box[c].ymin = DBL_MAX;
-    box[c].xmax = -DBL_MAX;
-    box[c].ymax = -DBL_MAX;
-
-    for (v = 0; v < p->contour[c].num_vertices; v++) {
-      /* Adjust bounding box */
-      if (p->contour[c].vertex[v].x < box[c].xmin) {
-        box[c].xmin = p->contour[c].vertex[v].x;
-      }
-      if (p->contour[c].vertex[v].y < box[c].ymin) {
-        box[c].ymin = p->contour[c].vertex[v].y;
-      }
-      if (p->contour[c].vertex[v].x > box[c].xmax) {
-        box[c].xmax = p->contour[c].vertex[v].x;
-      }
-      if (p->contour[c].vertex[v].y > box[c].ymax) {
-        box[c].ymax = p->contour[c].vertex[v].y;
-      }
-    }
-  }
-  return box;
-}
-
-static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) {
-  bbox *s_bbox;
-  bbox *c_bbox;
-  int s = 0;
-  int c = 0;
-  int *o_table = NULL;
-  int overlap = 0;
-
-  s_bbox = create_contour_bboxes(subj);
-  c_bbox = create_contour_bboxes(clip);
-
-  gpc_malloc<int>(o_table,
-                  subj->num_contours * clip->num_contours * sizeof(int),
-                  const_cast<char *>("overlap table creation"));
-
-  /* Check all subject contour bounding boxes against clip boxes */
-  for (s = 0; s < subj->num_contours; s++) {
-    for (c = 0; c < clip->num_contours; c++) {
-      o_table[c * subj->num_contours + s] =
-          (!((s_bbox[s].xmax < c_bbox[c].xmin) ||
-             (s_bbox[s].xmin > c_bbox[c].xmax))) &&
-          (!((s_bbox[s].ymax < c_bbox[c].ymin) ||
-             (s_bbox[s].ymin > c_bbox[c].ymax)));
-    }
-  }
-
-  /* For each clip contour, search for any subject contour overlaps */
-  for (c = 0; c < clip->num_contours; c++) {
-    overlap = 0;
-    for (s = 0; (!overlap) && (s < subj->num_contours); s++) {
-      overlap = o_table[c * subj->num_contours + s];
-    }
-
-    if (!overlap) {
-      /* Flag non contributing status by negating vertex count */
-      clip->contour[c].num_vertices = -clip->contour[c].num_vertices;
-    }
-  }
-
-  if (op == GPC_INT) {
-    /* For each subject contour, search for any clip contour overlaps */
-    for (s = 0; s < subj->num_contours; s++) {
-      overlap = 0;
-      for (c = 0; (!overlap) && (c < clip->num_contours); c++) {
-        overlap = o_table[c * subj->num_contours + s];
-      }
-
-      if (!overlap) {
-        /* Flag non contributing status by negating vertex count */
-        subj->contour[s].num_vertices = -subj->contour[s].num_vertices;
-      }
-    }
-  }
-
-  gpc_free<bbox>(s_bbox);
-  gpc_free<bbox>(c_bbox);
-  gpc_free<int>(o_table);
-}
-
-/*
-===========================================================================
-                             Public Functions
-===========================================================================
-*/
-
-void gpc_free_polygon(gpc_polygon *p) {
-  int c = 0;
-
-  for (c = 0; c < p->num_contours; c++) {
-    gpc_free<gpc_vertex>(p->contour[c].vertex);
-  }
-  gpc_free<int>(p->hole);
-  gpc_free<gpc_vertex_list>(p->contour);
-  p->num_contours = 0;
-}
-
-/*
-void gpc_read_polygon(FILE *fp, int read_hole_flags, gpc_polygon *p) {
-  int c = 0;
-  int v = 0;
-
-  fscanf(fp, "%d", &(p->num_contours));
-  gpc_malloc<int>(p->hole, p->num_contours * sizeof(int),
-                  (char *)"hole flag array creation");
-  gpc_malloc<gpc_vertex_list>(p->contour,
-                              p->num_contours * sizeof(gpc_vertex_list),
-                              (char *)"contour creation");
-  for (c = 0; c < p->num_contours; c++) {
-    fscanf(fp, "%d", &(p->contour[c].num_vertices));
-
-    if (read_hole_flags) {
-      fscanf(fp, "%d", &(p->hole[c]));
-    } else {
-      p->hole[c] = 0; // Assume all contours to be external
-    }
-
-    gpc_malloc<gpc_vertex>(p->contour[c].vertex,
-                           p->contour[c].num_vertices * sizeof(gpc_vertex),
-                           (char *)"vertex creation");
-    for (v = 0; v < p->contour[c].num_vertices; v++) {
-      fscanf(fp, "%lf %lf", &(p->contour[c].vertex[v].x),
-             &(p->contour[c].vertex[v].y));
-    }
-  }
-}
-
-void gpc_write_polygon(FILE *fp, int write_hole_flags, gpc_polygon *p) {
-  int c = 0;
-  int v = 0;
-
-  fprintf(fp, "%d\n", p->num_contours);
-  for (c = 0; c < p->num_contours; c++) {
-    fprintf(fp, "%d\n", p->contour[c].num_vertices);
-
-    if (write_hole_flags) {
-      fprintf(fp, "%d\n", p->hole[c]);
-    }
-
-    for (v = 0; v < p->contour[c].num_vertices; v++) {
-      fprintf(fp, "% .*lf % .*lf\n", DBL_DIG, p->contour[c].vertex[v].x,
-              DBL_DIG, p->contour[c].vertex[v].y);
-    }
-  }
-}
-*/
-
-void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
-  int *extended_hole = NULL;
-  int c = 0;
-  int v = 0;
-  gpc_vertex_list *extended_contour = NULL;
-
-  /* Create an extended hole array */
-  gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
-                  const_cast<char *>("contour hole addition"));
-  PADDLE_ENFORCE_NOT_NULL(extended_hole);
-
-  /* Create an extended contour array */
-  gpc_malloc<gpc_vertex_list>(extended_contour,
-                              (p->num_contours + 1) * sizeof(gpc_vertex_list),
-                              const_cast<char *>("contour addition"));
-
-  /* Copy the old contour and hole data into the extended arrays */
-  for (c = 0; c < p->num_contours; c++) {
-    extended_hole[c] = p->hole[c];
-    extended_contour[c] = p->contour[c];
-  }
-
-  /* Copy the new contour and hole onto the end of the extended arrays */
-  c = p->num_contours;
-  extended_hole[c] = hole;
-  extended_contour[c].num_vertices = new_contour->num_vertices;
-  gpc_malloc<gpc_vertex>(extended_contour[c].vertex,
-                         new_contour->num_vertices * sizeof(gpc_vertex),
-                         const_cast<char *>("contour addition"));
-  for (v = 0; v < new_contour->num_vertices; v++) {
-    extended_contour[c].vertex[v] = new_contour->vertex[v];
-  }
-
-  /* Dispose of the old contour */
-  gpc_free<gpc_vertex_list>(p->contour);
-  gpc_free<int>(p->hole);
-
-  /* Update the polygon information */
-  p->num_contours++;
-  p->hole = extended_hole;
-  p->contour = extended_contour;
-}
-
-// gpc_polygon_clip
-void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
-                      gpc_polygon *result) {
-  sb_tree *sbtree = NULL;
-  it_node *it = NULL;
-  it_node *intersect = NULL;
-  edge_node *edge = NULL;
-  edge_node *prev_edge = NULL;
-  edge_node *next_edge = NULL;
-  edge_node *succ_edge = NULL;
-  edge_node *e0 = NULL;
-  edge_node *e1 = NULL;
-  edge_node *aet = NULL;
-  edge_node *c_heap = NULL;
-  edge_node *s_heap = NULL;
-  lmt_node *lmt = NULL;
-  lmt_node *local_min = NULL;
-  polygon_node *out_poly = NULL;
-  polygon_node *p = NULL;
-  polygon_node *q = NULL;
-  polygon_node *poly = NULL;
-  polygon_node *npoly = NULL;
-  polygon_node *cf = NULL;
-  vertex_node *vtx = NULL;
-  vertex_node *nv = NULL;
-  h_state horiz[2];
-  int in[2];
-  int exists[2];
-  int parity[2] = {LEFT, LEFT};
-  int c = 0;
-  int v = 0;
-  int contributing = 0;
-  int search = 0;
-  int scanbeam = 0;
-  int sbt_entries = 0;
-  int vclass = 0;
-  int bl = 0;
-  int br = 0;
-  int tl = 0;
-  int tr = 0;
-  double *sbt = NULL;
-  double xb = 0.0;
-  double px = 0.0;
-  double yb = 0.0;
-  double yt = 0.0;
-  double dy = 0.0;
-  double ix = 0.0;
-  double iy = 0.0;
-
-  /* Test for trivial NULL result cases */
-  if (((subj->num_contours == 0) && (clip->num_contours == 0)) ||
-      ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) ||
-      ((clip->num_contours == 0) && (op == GPC_INT))) {
-    result->num_contours = 0;
-    result->hole = NULL;
-    result->contour = NULL;
-    return;
-  }
-  /* Identify potentialy contributing contours */
-  if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) &&
-      (clip->num_contours > 0)) {
-    minimax_test(subj, clip, op);
-  }
-  /* Build LMT */
-  if (subj->num_contours > 0) {
-    s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op);
-  }
-  if (clip->num_contours > 0) {
-    c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op);
-  }
-  /* Return a NULL result if no contours contribute */
-  if (lmt == NULL) {
-    result->num_contours = 0;
-    result->hole = NULL;
-    result->contour = NULL;
-    reset_lmt(&lmt);
-    gpc_free<edge_node>(s_heap);
-    gpc_free<edge_node>(c_heap);
-    return;
-  }
-
-  /* Build scanbeam table from scanbeam tree */
-  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
-                     const_cast<char *>("sbt creation"));
-  PADDLE_ENFORCE_NOT_NULL(sbt);
-  build_sbt(&scanbeam, sbt, sbtree);
-  scanbeam = 0;
-  free_sbtree(&sbtree);
-  /* Allow pointer re-use without causing memory leak */
-  if (subj == result) {
-    gpc_free_polygon(subj);
-  }
-  if (clip == result) {
-    gpc_free_polygon(clip);
-  }
-  /* Invert clip polygon for difference operation */
-  if (op == GPC_DIFF) {
-    parity[CLIP] = RIGHT;
-  }
-  local_min = lmt;
-
-  // Process each scanbeam
-  while (scanbeam < sbt_entries) {
-    /* Set yb and yt to the bottom and top of the scanbeam */
-    yb = sbt[scanbeam++];
-    if (scanbeam < sbt_entries) {
-      yt = sbt[scanbeam];
-      dy = yt - yb;
-    }
-    /* === SCANBEAM BOUNDARY PROCESSING ================================ */
-    /* If LMT node corresponding to yb exists */
-    if (local_min) {
-      if (local_min->y == yb) {
-        /* Add edges starting at this local minimum to the AET */
-        for (edge = local_min->first_bound; edge; edge = edge->next_bound) {
-          add_edge_to_aet(&aet, edge, NULL);
-        }
-        local_min = local_min->next;
-      }
-    }
-    /* Set dummy previous x value */
-    px = -DBL_MAX;
-    /* Create bundles within AET */
-    e0 = aet;
-    e1 = aet;
-    /* Set up bundle fields of first edge */
-    PADDLE_ENFORCE_NOT_NULL(aet);
-    aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
-    aet->bundle[ABOVE][!aet->type] = 0;
-    aet->bstate[ABOVE] = UNBUNDLED;
-
-    for (next_edge = aet->next; next_edge; next_edge = next_edge->next) {
-      /* Set up bundle fields of next edge */
-      next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb);
-      next_edge->bundle[ABOVE][!next_edge->type] = 0;
-      next_edge->bstate[ABOVE] = UNBUNDLED;
-      /* Bundle edges above the scanbeam boundary if they coincide */
-      if (next_edge->bundle[ABOVE][next_edge->type]) {
-        if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) &&
-            (e0->top.y != yb)) {
-          next_edge->bundle[ABOVE][next_edge->type] ^=
-              e0->bundle[ABOVE][next_edge->type];
-          next_edge->bundle[ABOVE][!next_edge->type] =
-              e0->bundle[ABOVE][!next_edge->type];
-          next_edge->bstate[ABOVE] = BUNDLE_HEAD;
-          e0->bundle[ABOVE][CLIP] = 0;
-          e0->bundle[ABOVE][SUBJ] = 0;
-          e0->bstate[ABOVE] = BUNDLE_TAIL;
-        }
-        e0 = next_edge;
-      }
-    }
-    horiz[CLIP] = NH;
-    horiz[SUBJ] = NH;
-
-    // Process each edge at this scanbeam boundary
-    for (edge = aet; edge; edge = edge->next) {
-      exists[CLIP] =
-          edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1);
-      exists[SUBJ] =
-          edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1);
-      if (exists[CLIP] || exists[SUBJ]) {
-        /* Set bundle side */
-        edge->bside[CLIP] = parity[CLIP];
-        edge->bside[SUBJ] = parity[SUBJ];
-        /* Determine contributing status and quadrant occupancies */
-        switch (op) {
-          case GPC_DIFF:
-          case GPC_INT:
-            contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) ||
-                           (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) ||
-                           (exists[CLIP] && exists[SUBJ] &&
-                            (parity[CLIP] == parity[SUBJ]));
-            br = (parity[CLIP]) && (parity[SUBJ]);
-            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) &&
-                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
-            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) &&
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
-            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
-                  edge->bundle[BELOW][CLIP]) &&
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
-                  edge->bundle[BELOW][SUBJ]);
-            break;
-          case GPC_XOR:
-            contributing = exists[CLIP] || exists[SUBJ];
-            br = (parity[CLIP]) ^ (parity[SUBJ]);
-            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^
-                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
-            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
-            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
-                  edge->bundle[BELOW][CLIP]) ^
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
-                  edge->bundle[BELOW][SUBJ]);
-            break;
-          case GPC_UNION:
-            contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) ||
-                           (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) ||
-                           (exists[CLIP] && exists[SUBJ] &&
-                            (parity[CLIP] == parity[SUBJ]));
-            br = (parity[CLIP]) || (parity[SUBJ]);
-            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ||
-                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
-            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ||
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
-            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
-                  edge->bundle[BELOW][CLIP]) ||
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
-                  edge->bundle[BELOW][SUBJ]);
-            break;
-        }
-        // Update parity
-        parity[CLIP] ^= edge->bundle[ABOVE][CLIP];
-        parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ];
-        /* Update horizontal state */
-        if (exists[CLIP]) {
-          horiz[CLIP] = next_h_state[horiz[CLIP]]
-                                    [((exists[CLIP] - 1) << 1) + parity[CLIP]];
-        }
-        if (exists[SUBJ]) {
-          horiz[SUBJ] = next_h_state[horiz[SUBJ]]
-                                    [((exists[SUBJ] - 1) << 1) + parity[SUBJ]];
-        }
-        vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
-        if (contributing) {
-          xb = edge->xb;
-          switch (vclass) {
-            case EMN:
-            case IMN:
-              add_local_min(&out_poly, edge, xb, yb);
-              px = xb;
-              cf = edge->outp[ABOVE];
-              break;
-            case ERI:
-              if (xb != px) {
-                add_right(cf, xb, yb);
-                px = xb;
-              }
-              edge->outp[ABOVE] = cf;
-              cf = NULL;
-              break;
-            case ELI:
-              add_left(edge->outp[BELOW], xb, yb);
-              px = xb;
-              cf = edge->outp[BELOW];
-              break;
-            case EMX:
-              if (xb != px) {
-                add_left(cf, xb, yb);
-                px = xb;
-              }
-              merge_right(cf, edge->outp[BELOW], out_poly);
-              cf = NULL;
-              break;
-            case ILI:
-              if (xb != px) {
-                add_left(cf, xb, yb);
-                px = xb;
-              }
-              edge->outp[ABOVE] = cf;
-              cf = NULL;
-              break;
-            case IRI:
-              add_right(edge->outp[BELOW], xb, yb);
-              px = xb;
-              cf = edge->outp[BELOW];
-              edge->outp[BELOW] = NULL;
-              break;
-            case IMX:
-              if (xb != px) {
-                add_right(cf, xb, yb);
-                px = xb;
-              }
-              merge_left(cf, edge->outp[BELOW], out_poly);
-              cf = NULL;
-              edge->outp[BELOW] = NULL;
-              break;
-            case IMM:
-              if (xb != px) {
-                add_right(cf, xb, yb);
-                px = xb;
-              }
-              merge_left(cf, edge->outp[BELOW], out_poly);
-              edge->outp[BELOW] = NULL;
-              add_local_min(&out_poly, edge, xb, yb);
-              cf = edge->outp[ABOVE];
-              break;
-            case EMM:
-              if (xb != px) {
-                add_left(cf, xb, yb);
-                px = xb;
-              }
-              merge_right(cf, edge->outp[BELOW], out_poly);
-              edge->outp[BELOW] = NULL;
-              add_local_min(&out_poly, edge, xb, yb);
-              cf = edge->outp[ABOVE];
-              break;
-            case LED:
-              if (edge->bot.y == yb) {
-                add_left(edge->outp[BELOW], xb, yb);
-              }
-              edge->outp[ABOVE] = edge->outp[BELOW];
-              px = xb;
-              break;
-            case RED:
-              if (edge->bot.y == yb) {
-                add_right(edge->outp[BELOW], xb, yb);
-              }
-              edge->outp[ABOVE] = edge->outp[BELOW];
-              px = xb;
-              break;
-            default:
-              break;
-          } /* End of switch */
-        }   /* End of contributing conditional */
-      }     /* End of edge exists conditional */
-    }       // End of AET loop
-
-    /* Delete terminating edges from the AET, otherwise compute xt */
-    for (edge = aet; edge; edge = edge->next) {
-      if (edge->top.y == yb) {
-        prev_edge = edge->prev;
-        next_edge = edge->next;
-        if (prev_edge) {
-          prev_edge->next = next_edge;
-        } else {
-          aet = next_edge;
-        }
-        if (next_edge) {
-          next_edge->prev = prev_edge;
-        }
-        /* Copy bundle head state to the adjacent tail edge if required */
-        if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) {
-          if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) {
-            prev_edge->outp[BELOW] = edge->outp[BELOW];
-            prev_edge->bstate[BELOW] = UNBUNDLED;
-            if (prev_edge->prev) {
-              if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) {
-                prev_edge->bstate[BELOW] = BUNDLE_HEAD;
-              }
-            }
-          }
-        }
-      } else {
-        if (edge->top.y == yt) {
-          edge->xt = edge->top.x;
-        } else {
-          edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y);
-        }
-      }
-    }
-
-    if (scanbeam < sbt_entries) {
-      /* === SCANBEAM INTERIOR PROCESSING ============================== */
-      build_intersection_table(&it, aet, dy);
-      /* Process each node in the intersection table */
-      for (intersect = it; intersect; intersect = intersect->next) {
-        e0 = intersect->ie[0];
-        e1 = intersect->ie[1];
-        /* Only generate output for contributing intersections */
-        if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) &&
-            (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) {
-          p = e0->outp[ABOVE];
-          q = e1->outp[ABOVE];
-          ix = intersect->point.x;
-          iy = intersect->point.y + yb;
-
-          in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) ||
-                     (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) ||
-                     (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] &&
-                      e0->bside[CLIP] && e1->bside[CLIP]);
-          in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) ||
-                     (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) ||
-                     (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] &&
-                      e0->bside[SUBJ] && e1->bside[SUBJ]);
-
-          // Determine quadrant occupancies
-          switch (op) {
-            case GPC_DIFF:
-            case GPC_INT:
-              tr = (in[CLIP]) && (in[SUBJ]);
-              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) &&
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
-              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) &&
-                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
-              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
-                    e0->bundle[ABOVE][CLIP]) &&
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
-                    e0->bundle[ABOVE][SUBJ]);
-              break;
-            case GPC_XOR:
-              tr = (in[CLIP]) ^ (in[SUBJ]);
-              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
-              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^
-                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
-              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
-                    e0->bundle[ABOVE][CLIP]) ^
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
-                    e0->bundle[ABOVE][SUBJ]);
-              break;
-            case GPC_UNION:
-              tr = (in[CLIP]) || (in[SUBJ]);
-              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ||
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
-              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ||
-                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
-              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
-                    e0->bundle[ABOVE][CLIP]) ||
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
-                    e0->bundle[ABOVE][SUBJ]);
-              break;
-          }
-          vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
-          switch (vclass) {
-            case EMN:
-              add_local_min(&out_poly, e0, ix, iy);
-              e1->outp[ABOVE] = e0->outp[ABOVE];
-              break;
-            case ERI:
-              if (p) {
-                add_right(p, ix, iy);
-                e1->outp[ABOVE] = p;
-                e0->outp[ABOVE] = NULL;
-              }
-              break;
-            case ELI:
-              if (q) {
-                add_left(q, ix, iy);
-                e0->outp[ABOVE] = q;
-                e1->outp[ABOVE] = NULL;
-              }
-              break;
-            case EMX:
-              if (p && q) {
-                add_left(p, ix, iy);
-                merge_right(p, q, out_poly);
-                e0->outp[ABOVE] = NULL;
-                e1->outp[ABOVE] = NULL;
-              }
-              break;
-            case IMN:
-              add_local_min(&out_poly, e0, ix, iy);
-              e1->outp[ABOVE] = e0->outp[ABOVE];
-              break;
-            case ILI:
-              if (p) {
-                add_left(p, ix, iy);
-                e1->outp[ABOVE] = p;
-                e0->outp[ABOVE] = NULL;
-              }
-              break;
-            case IRI:
-              if (q) {
-                add_right(q, ix, iy);
-                e0->outp[ABOVE] = q;
-                e1->outp[ABOVE] = NULL;
-              }
-              break;
-            case IMX:
-              if (p && q) {
-                add_right(p, ix, iy);
-                merge_left(p, q, out_poly);
-                e0->outp[ABOVE] = NULL;
-                e1->outp[ABOVE] = NULL;
-              }
-              break;
-            case IMM:
-              if (p && q) {
-                add_right(p, ix, iy);
-                merge_left(p, q, out_poly);
-                add_local_min(&out_poly, e0, ix, iy);
-                e1->outp[ABOVE] = e0->outp[ABOVE];
-              }
-              break;
-            case EMM:
-              if (p && q) {
-                add_left(p, ix, iy);
-                merge_right(p, q, out_poly);
-                add_local_min(&out_poly, e0, ix, iy);
-                e1->outp[ABOVE] = e0->outp[ABOVE];
-              }
-              break;
-            default:
-              break;
-          }  // End of switch
-        }    /* End of contributing intersection conditional */
-
-        /* Swap bundle sides in response to edge crossing */
-        if (e0->bundle[ABOVE][CLIP]) {
-          e1->bside[CLIP] = !e1->bside[CLIP];
-        }
-        if (e1->bundle[ABOVE][CLIP]) {
-          e0->bside[CLIP] = !e0->bside[CLIP];
-        }
-        if (e0->bundle[ABOVE][SUBJ]) {
-          e1->bside[SUBJ] = !e1->bside[SUBJ];
-        }
-        if (e1->bundle[ABOVE][SUBJ]) {
-          e0->bside[SUBJ] = !e0->bside[SUBJ];
-        }
-
-        /* Swap e0 and e1 bundles in the AET */
-        prev_edge = e0->prev;
-        next_edge = e1->next;
-        if (next_edge) {
-          next_edge->prev = e0;
-        }
-        if (e0->bstate[ABOVE] == BUNDLE_HEAD) {
-          search = 1;
-          while (search) {
-            prev_edge = prev_edge->prev;
-            if (prev_edge) {
-              if (prev_edge->bstate[ABOVE] != BUNDLE_TAIL) {
-                search = 0;
-              }
-            } else {
-              search = 0;
-            }
-          }
-        }
-        if (!prev_edge) {
-          aet->prev = e1;
-          e1->next = aet;
-          aet = e0->next;
-        } else {
-          prev_edge->next->prev = e1;
-          e1->next = prev_edge->next;
-          prev_edge->next = e0->next;
-        }
-        e0->next->prev = prev_edge;
-        e1->next->prev = e1;
-        e0->next = next_edge;
-      } /* End of IT loop*/
-
-      // Prepare for next scanbeam
-      for (edge = aet; edge; edge = next_edge) {
-        next_edge = edge->next;
-        succ_edge = edge->succ;
-        if ((edge->top.y == yt) && succ_edge) {
-          /* Replace AET edge by its successor */
-          succ_edge->outp[BELOW] = edge->outp[ABOVE];
-          succ_edge->bstate[BELOW] = edge->bstate[ABOVE];
-          succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
-          succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
-          prev_edge = edge->prev;
-          if (prev_edge) {
-            prev_edge->next = succ_edge;
-          } else {
-            aet = succ_edge;
-          }
-          if (next_edge) {
-            next_edge->prev = succ_edge;
-          }
-          succ_edge->prev = prev_edge;
-          succ_edge->next = next_edge;
-        } else {
-          /* Update this edge */
-          edge->outp[BELOW] = edge->outp[ABOVE];
-          edge->bstate[BELOW] = edge->bstate[ABOVE];
-          edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
-          edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
-          edge->xb = edge->xt;
-        }
-        edge->outp[ABOVE] = NULL;
-      }
-    }
-  } /* === END OF SCANBEAM PROCESSING ================================== */
-  // Generate result polygon from out_poly
-  result->contour = NULL;
-  result->hole = NULL;
-  result->num_contours = count_contours(out_poly);
-  if (result->num_contours > 0) {
-    gpc_malloc<int>(result->hole, result->num_contours * sizeof(int),
-                    const_cast<char *>("hole flag table creation"));
-    gpc_malloc<gpc_vertex_list>(result->contour,
-                                result->num_contours * sizeof(gpc_vertex_list),
-                                const_cast<char *>("contour creation"));
-
-    c = 0;
-    for (poly = out_poly; poly; poly = npoly) {
-      npoly = poly->next;
-      if (poly->active) {
-        result->hole[c] = poly->proxy->hole;
-        result->contour[c].num_vertices = poly->active;
-        gpc_malloc<gpc_vertex>(
-            result->contour[c].vertex,
-            result->contour[c].num_vertices * sizeof(gpc_vertex),
-            const_cast<char *>("vertex creation"));
-
-        v = result->contour[c].num_vertices - 1;
-        for (vtx = poly->proxy->v[LEFT]; vtx; vtx = nv) {
-          nv = vtx->next;
-          result->contour[c].vertex[v].x = vtx->x;
-          result->contour[c].vertex[v].y = vtx->y;
-          gpc_free<vertex_node>(vtx);
-          v--;
-        }
-        c++;
-      }
-      gpc_free<polygon_node>(poly);
-    }
-  } else {
-    for (poly = out_poly; poly; poly = npoly) {
-      npoly = poly->next;
-      gpc_free<polygon_node>(poly);
-    }
-  }
-
-  // Tidy up
-  reset_it(&it);
-  reset_lmt(&lmt);
-  gpc_free<edge_node>(c_heap);
-  gpc_free<edge_node>(s_heap);
-  gpc_free<double>(sbt);
-}  // NOLINT
-
-void gpc_free_tristrip(gpc_tristrip *t) {
-  int s = 0;
-  for (s = 0; s < t->num_strips; s++) {
-    gpc_free<gpc_vertex>(t->strip[s].vertex);
-  }
-  gpc_free<gpc_vertex_list>(t->strip);
-  t->num_strips = 0;
-}
-
-void gpc_polygon_to_tristrip(gpc_polygon *s, gpc_tristrip *t) {
-  gpc_polygon c;
-  c.num_contours = 0;
-  c.hole = NULL;
-  c.contour = NULL;
-  gpc_tristrip_clip(GPC_DIFF, s, &c, t);
-}
-
-// gpc_tristrip_clip
-void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
-                       gpc_tristrip *result) {
-  sb_tree *sbtree = NULL;
-  it_node *it = NULL;
-  it_node *intersect = NULL;
-  edge_node *edge = NULL;
-  edge_node *prev_edge = NULL;
-  edge_node *next_edge = NULL;
-  edge_node *succ_edge = NULL;
-  edge_node *e0 = NULL;
-  edge_node *e1 = NULL;
-  edge_node *aet = NULL;
-  edge_node *c_heap = NULL;
-  edge_node *s_heap = NULL;
-  edge_node *cf = NULL;
-  lmt_node *lmt = NULL;
-  lmt_node *local_min = NULL;
-  polygon_node *tlist = NULL;
-  polygon_node *tn = NULL;
-  polygon_node *tnn = NULL;
-  polygon_node *p = NULL;
-  polygon_node *q = NULL;
-  vertex_node *lt = NULL;
-  vertex_node *ltn = NULL;
-  vertex_node *rt = NULL;
-  vertex_node *rtn = NULL;
-  h_state horiz[2];
-  vertex_type cft = NUL;
-  int in[2];
-  int exists[2];
-  int parity[2] = {LEFT, LEFT};
-  int s = 0;
-  int v = 0;
-  int contributing = 0;
-  int search = 0;
-  int scanbeam = 0;
-  int sbt_entries = 0;
-  int vclass = 0;
-  int bl = 0;
-  int br = 0;
-  int tl = 0;
-  int tr = 0;
-  double *sbt = NULL;
-  double xb = 0.0;
-  double px = 0.0;
-  double nx = 0.0;
-  double yb = 0.0;
-  double yt = 0.0;
-  double dy = 0.0;
-  double ix = 0.0;
-  double iy = 0.0;
-
-  /* Test for trivial NULL result cases */
-  if (((subj->num_contours == 0) && (clip->num_contours == 0)) ||
-      ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) ||
-      ((clip->num_contours == 0) && (op == GPC_INT))) {
-    result->num_strips = 0;
-    result->strip = NULL;
-    return;
-  }
-
-  /* Identify potentialy contributing contours */
-  if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) &&
-      (clip->num_contours > 0)) {
-    minimax_test(subj, clip, op);
-  }
-  /* Build LMT */
-  if (subj->num_contours > 0) {
-    s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op);
-  }
-  if (clip->num_contours > 0) {
-    c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op);
-  }
-  /* Return a NULL result if no contours contribute */
-  if (lmt == NULL) {
-    result->num_strips = 0;
-    result->strip = NULL;
-    reset_lmt(&lmt);
-    gpc_free<edge_node>(s_heap);
-    gpc_free<edge_node>(c_heap);
-    return;
-  }
-
-  /* Build scanbeam table from scanbeam tree */
-  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
-                     const_cast<char *>("sbt creation"));
-  PADDLE_ENFORCE_NOT_NULL(sbt);
-  build_sbt(&scanbeam, sbt, sbtree);
-  scanbeam = 0;
-  free_sbtree(&sbtree);
-
-  /* Invert clip polygon for difference operation */
-  if (op == GPC_DIFF) {
-    parity[CLIP] = RIGHT;
-  }
-  local_min = lmt;
-
-  // Process each scanbeam
-  while (scanbeam < sbt_entries) {
-    /* Set yb and yt to the bottom and top of the scanbeam */
-    yb = sbt[scanbeam++];
-    if (scanbeam < sbt_entries) {
-      yt = sbt[scanbeam];
-      dy = yt - yb;
-    }
-
-    /* === SCANBEAM BOUNDARY PROCESSING ================================ */
-    /* If LMT node corresponding to yb exists */
-    if (local_min) {
-      if (local_min->y == yb) {
-        /* Add edges starting at this local minimum to the AET */
-        for (edge = local_min->first_bound; edge; edge = edge->next_bound) {
-          add_edge_to_aet(&aet, edge, NULL);
-        }
-        local_min = local_min->next;
-      }
-    }
-    /* Set dummy previous x value */
-    /* Create bundles within AET */
-    px = -DBL_MAX;
-    e0 = aet;
-    e1 = aet;
-
-    /* Set up bundle fields of first edge */
-    PADDLE_ENFORCE_NOT_NULL(aet);
-    aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
-    aet->bundle[ABOVE][!aet->type] = 0;
-    aet->bstate[ABOVE] = UNBUNDLED;
-
-    for (next_edge = aet->next; next_edge; next_edge = next_edge->next) {
-      /* Set up bundle fields of next edge */
-      next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb);
-      next_edge->bundle[ABOVE][!next_edge->type] = 0;
-      next_edge->bstate[ABOVE] = UNBUNDLED;
-
-      /* Bundle edges above the scanbeam boundary if they coincide */
-      if (next_edge->bundle[ABOVE][next_edge->type]) {
-        if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) &&
-            (e0->top.y != yb)) {
-          next_edge->bundle[ABOVE][next_edge->type] ^=
-              e0->bundle[ABOVE][next_edge->type];
-          next_edge->bundle[ABOVE][!next_edge->type] =
-              e0->bundle[ABOVE][!next_edge->type];
-          next_edge->bstate[ABOVE] = BUNDLE_HEAD;
-          e0->bundle[ABOVE][CLIP] = 0;
-          e0->bundle[ABOVE][SUBJ] = 0;
-          e0->bstate[ABOVE] = BUNDLE_TAIL;
-        }
-        e0 = next_edge;
-      }
-    }
-    horiz[CLIP] = NH;
-    horiz[SUBJ] = NH;
-
-    /* Process each edge at this scanbeam boundary */
-    for (edge = aet; edge; edge = edge->next) {
-      exists[CLIP] =
-          edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1);
-      exists[SUBJ] =
-          edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1);
-
-      if (exists[CLIP] || exists[SUBJ]) {
-        /* Set bundle side */
-        edge->bside[CLIP] = parity[CLIP];
-        edge->bside[SUBJ] = parity[SUBJ];
-
-        /* Determine contributing status and quadrant occupancies */
-        switch (op) {
-          case GPC_DIFF:
-          case GPC_INT:
-            contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) ||
-                           (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) ||
-                           (exists[CLIP] && exists[SUBJ] &&
-                            (parity[CLIP] == parity[SUBJ]));
-            br = (parity[CLIP]) && (parity[SUBJ]);
-            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) &&
-                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
-            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) &&
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
-            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
-                  edge->bundle[BELOW][CLIP]) &&
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
-                  edge->bundle[BELOW][SUBJ]);
-            break;
-          case GPC_XOR:
-            contributing = exists[CLIP] || exists[SUBJ];
-            br = (parity[CLIP]) ^ (parity[SUBJ]);
-            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^
-                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
-            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
-            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
-                  edge->bundle[BELOW][CLIP]) ^
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
-                  edge->bundle[BELOW][SUBJ]);
-            break;
-          case GPC_UNION:
-            contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) ||
-                           (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) ||
-                           (exists[CLIP] && exists[SUBJ] &&
-                            (parity[CLIP] == parity[SUBJ]));
-            br = (parity[CLIP]) || (parity[SUBJ]);
-            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ||
-                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
-            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ||
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
-            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
-                  edge->bundle[BELOW][CLIP]) ||
-                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
-                  edge->bundle[BELOW][SUBJ]);
-            break;
-        }
-
-        // Update parity
-        parity[CLIP] ^= edge->bundle[ABOVE][CLIP];
-        parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ];
-
-        /* Update horizontal state */
-        if (exists[CLIP]) {
-          horiz[CLIP] = next_h_state[horiz[CLIP]]
-                                    [((exists[CLIP] - 1) << 1) + parity[CLIP]];
-        }
-        if (exists[SUBJ]) {
-          horiz[SUBJ] = next_h_state[horiz[SUBJ]]
-                                    [((exists[SUBJ] - 1) << 1) + parity[SUBJ]];
-        }
-        vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
-
-        if (contributing) {
-          xb = edge->xb;
-          switch (vclass) {
-            case EMN:
-              new_tristrip(&tlist, edge, xb, yb);
-              cf = edge;
-              break;
-            case ERI:
-              edge->outp[ABOVE] = cf->outp[ABOVE];
-              if (xb != cf->xb) {
-                gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
-              }
-              cf = NULL;
-              break;
-            case ELI:
-              gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
-              edge->outp[ABOVE] = NULL;
-              cf = edge;
-              break;
-            case EMX:
-              if (xb != cf->xb) {
-                gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
-              }
-              edge->outp[ABOVE] = NULL;
-              cf = NULL;
-              break;
-            case IMN:
-              if (cft == LED) {
-                if (cf->bot.y != yb) {
-                  gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
-                }
-                new_tristrip(&tlist, cf, cf->xb, yb);
-              }
-              if (cf) edge->outp[ABOVE] = cf->outp[ABOVE];
-              gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
-              break;
-            case ILI:
-              new_tristrip(&tlist, edge, xb, yb);
-              cf = edge;
-              cft = ILI;
-              break;
-            case IRI:
-              if (cft == LED) {
-                if (cf->bot.y != yb) {
-                  gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
-                }
-                new_tristrip(&tlist, cf, cf->xb, yb);
-              }
-              gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
-              edge->outp[ABOVE] = NULL;
-              break;
-            case IMX:
-              gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
-              edge->outp[ABOVE] = NULL;
-              cft = IMX;
-              break;
-            case IMM:
-              gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
-              edge->outp[ABOVE] = cf->outp[ABOVE];
-              if (xb != cf->xb) {
-                gpc_vertex_create(cf, ABOVE, RIGHT, xb, yb);
-              }
-              cf = edge;
-              break;
-            case EMM:
-              gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
-              edge->outp[ABOVE] = NULL;
-              new_tristrip(&tlist, edge, xb, yb);
-              cf = edge;
-              break;
-            case LED:
-              if (edge->bot.y == yb) {
-                gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
-              }
-              edge->outp[ABOVE] = edge->outp[BELOW];
-              cf = edge;
-              cft = LED;
-              break;
-            case RED:
-              edge->outp[ABOVE] = cf->outp[ABOVE];
-              if (cft == LED) {
-                if (cf->bot.y == yb) {
-                  gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
-                } else {
-                  if (edge->bot.y == yb) {
-                    gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
-                    gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
-                  }
-                }
-              } else {
-                gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
-                gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
-              }
-              cf = NULL;
-              break;
-            default:
-              break;
-          } /* End of switch */
-        }   /* End of contributing conditional */
-      }     /* End of edge exists conditional */
-    }       // End of AET loop
-
-    /* Delete terminating edges from the AET, otherwise compute xt */
-    for (edge = aet; edge; edge = edge->next) {
-      if (edge->top.y == yb) {
-        prev_edge = edge->prev;
-        next_edge = edge->next;
-        if (prev_edge) {
-          prev_edge->next = next_edge;
-        } else {
-          aet = next_edge;
-        }
-        if (next_edge) {
-          next_edge->prev = prev_edge;
-        }
-
-        /* Copy bundle head state to the adjacent tail edge if required */
-        if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) {
-          if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) {
-            prev_edge->outp[BELOW] = edge->outp[BELOW];
-            prev_edge->bstate[BELOW] = UNBUNDLED;
-            if (prev_edge->prev) {
-              if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) {
-                prev_edge->bstate[BELOW] = BUNDLE_HEAD;
-              }
-            }
-          }
-        }
-      } else {
-        if (edge->top.y == yt) {
-          edge->xt = edge->top.x;
-        } else {
-          edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y);
-        }
-      }
-    }
-
-    if (scanbeam < sbt_entries) {
-      /* === SCANBEAM INTERIOR PROCESSING ============================== */
-      build_intersection_table(&it, aet, dy);
-      /* Process each node in the intersection table */
-      for (intersect = it; intersect; intersect = intersect->next) {
-        e0 = intersect->ie[0];
-        e1 = intersect->ie[1];
-
-        /* Only generate output for contributing intersections */
-        if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) &&
-            (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) {
-          p = e0->outp[ABOVE];
-          q = e1->outp[ABOVE];
-          ix = intersect->point.x;
-          iy = intersect->point.y + yb;
-
-          in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) ||
-                     (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) ||
-                     (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] &&
-                      e0->bside[CLIP] && e1->bside[CLIP]);
-          in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) ||
-                     (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) ||
-                     (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] &&
-                      e0->bside[SUBJ] && e1->bside[SUBJ]);
-
-          switch (op) {  // Determine quadrant occupancies
-            case GPC_DIFF:
-            case GPC_INT:
-              tr = (in[CLIP]) && (in[SUBJ]);
-              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) &&
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
-              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) &&
-                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
-              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
-                    e0->bundle[ABOVE][CLIP]) &&
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
-                    e0->bundle[ABOVE][SUBJ]);
-              break;
-            case GPC_XOR:
-              tr = (in[CLIP]) ^ (in[SUBJ]);
-              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
-              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^
-                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
-              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
-                    e0->bundle[ABOVE][CLIP]) ^
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
-                    e0->bundle[ABOVE][SUBJ]);
-              break;
-            case GPC_UNION:
-              tr = (in[CLIP]) || (in[SUBJ]);
-              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ||
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
-              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ||
-                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
-              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
-                    e0->bundle[ABOVE][CLIP]) ||
-                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
-                    e0->bundle[ABOVE][SUBJ]);
-              break;
-          }
-
-          vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
-          switch (vclass) {
-            case EMN:
-              new_tristrip(&tlist, e1, ix, iy);
-              e0->outp[ABOVE] = e1->outp[ABOVE];
-              break;
-            case ERI:
-              if (p) {
-                gpc_p_edge(prev_edge, e0, ABOVE);
-                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
-                gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
-                e1->outp[ABOVE] = e0->outp[ABOVE];
-                e0->outp[ABOVE] = NULL;
-              }
-              break;
-            case ELI:
-              if (q) {
-                gpc_n_edge(next_edge, e1, ABOVE);
-                gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
-                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
-                e0->outp[ABOVE] = e1->outp[ABOVE];
-                e1->outp[ABOVE] = NULL;
-              }
-              break;
-            case EMX:
-              if (p && q) {
-                gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
-                e0->outp[ABOVE] = NULL;
-                e1->outp[ABOVE] = NULL;
-              }
-              break;
-            case IMN:
-              gpc_p_edge(prev_edge, e0, ABOVE);
-              gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
-              gpc_n_edge(next_edge, e1, ABOVE);
-              gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
-              new_tristrip(&tlist, prev_edge, px, iy);
-              e1->outp[ABOVE] = prev_edge->outp[ABOVE];
-              gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
-              new_tristrip(&tlist, e0, ix, iy);
-              next_edge->outp[ABOVE] = e0->outp[ABOVE];
-              gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
-              break;
-            case ILI:
-              if (p) {
-                gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
-                gpc_n_edge(next_edge, e1, ABOVE);
-                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
-                e1->outp[ABOVE] = e0->outp[ABOVE];
-                e0->outp[ABOVE] = NULL;
-              }
-              break;
-            case IRI:
-              if (q) {
-                gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
-                gpc_p_edge(prev_edge, e0, ABOVE);
-                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
-                e0->outp[ABOVE] = e1->outp[ABOVE];
-                e1->outp[ABOVE] = NULL;
-              }
-              break;
-            case IMX:
-              if (p && q) {
-                gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
-                gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
-                e0->outp[ABOVE] = NULL;
-                e1->outp[ABOVE] = NULL;
-                gpc_p_edge(prev_edge, e0, ABOVE);
-                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
-                new_tristrip(&tlist, prev_edge, px, iy);
-                gpc_n_edge(next_edge, e1, ABOVE);
-                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
-                next_edge->outp[ABOVE] = prev_edge->outp[ABOVE];
-                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
-              }
-              break;
-            case IMM:
-              if (p && q) {
-                gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
-                gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
-                gpc_p_edge(prev_edge, e0, ABOVE);
-                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
-                new_tristrip(&tlist, prev_edge, px, iy);
-                gpc_n_edge(next_edge, e1, ABOVE);
-                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
-                e1->outp[ABOVE] = prev_edge->outp[ABOVE];
-                gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
-                new_tristrip(&tlist, e0, ix, iy);
-                next_edge->outp[ABOVE] = e0->outp[ABOVE];
-                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
-              }
-              break;
-            case EMM:
-              if (p && q) {
-                gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
-                new_tristrip(&tlist, e1, ix, iy);
-                e0->outp[ABOVE] = e1->outp[ABOVE];
-              }
-              break;
-            default:
-              break;
-          } /* End of switch */
-        }   /* End of contributing intersection conditional */
-
-        // Swap bundle sides in response to edge crossing
-        if (e0->bundle[ABOVE][CLIP]) {
-          e1->bside[CLIP] = !e1->bside[CLIP];
-        }
-        if (e1->bundle[ABOVE][CLIP]) {
-          e0->bside[CLIP] = !e0->bside[CLIP];
-        }
-        if (e0->bundle[ABOVE][SUBJ]) {
-          e1->bside[SUBJ] = !e1->bside[SUBJ];
-        }
-        if (e1->bundle[ABOVE][SUBJ]) {
-          e0->bside[SUBJ] = !e0->bside[SUBJ];
-        }
-
-        /* Swap e0 and e1 bundles in the AET */
-        prev_edge = e0->prev;
-        next_edge = e1->next;
-        if (e1->next) {
-          e1->next->prev = e0;
-        }
-
-        if (e0->bstate[ABOVE] == BUNDLE_HEAD) {
-          search = 1;
-          while (search) {
-            prev_edge = prev_edge->prev;
-            if (prev_edge) {
-              if (prev_edge->bundle[ABOVE][CLIP] ||
-                  prev_edge->bundle[ABOVE][SUBJ] ||
-                  (prev_edge->bstate[ABOVE] == BUNDLE_HEAD)) {
-                search = 0;
-              }
-            } else {
-              search = 0;
-            }
-          }
-        }
-        if (!prev_edge) {
-          e1->next = aet;
-          aet = e0->next;
-        } else {
-          e1->next = prev_edge->next;
-          prev_edge->next = e0->next;
-        }
-        e0->next->prev = prev_edge;
-        e1->next->prev = e1;
-        e0->next = next_edge;
-      } /* End of IT loop*/
-
-      /* Prepare for next scanbeam */
-      for (edge = aet; edge; edge = next_edge) {
-        next_edge = edge->next;
-        succ_edge = edge->succ;
-
-        if ((edge->top.y == yt) && succ_edge) {
-          /* Replace AET edge by its successor */
-          succ_edge->outp[BELOW] = edge->outp[ABOVE];
-          succ_edge->bstate[BELOW] = edge->bstate[ABOVE];
-          succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
-          succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
-          prev_edge = edge->prev;
-          if (prev_edge) {
-            prev_edge->next = succ_edge;
-          } else {
-            aet = succ_edge;
-          }
-          if (next_edge) {
-            next_edge->prev = succ_edge;
-          }
-          succ_edge->prev = prev_edge;
-          succ_edge->next = next_edge;
-        } else {
-          /* Update this edge */
-          edge->outp[BELOW] = edge->outp[ABOVE];
-          edge->bstate[BELOW] = edge->bstate[ABOVE];
-          edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
-          edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
-          edge->xb = edge->xt;
-        }
-        edge->outp[ABOVE] = NULL;
-      }
-    }
-  } /* === END OF SCANBEAM PROCESSING ================================== */
-
-  // Generate result tristrip from tlist
-  result->strip = NULL;
-  result->num_strips = count_tristrips(tlist);
-  if (result->num_strips > 0) {
-    gpc_malloc<gpc_vertex_list>(result->strip,
-                                result->num_strips * sizeof(gpc_vertex_list),
-                                const_cast<char *>("tristrip list creation"));
-
-    s = 0;
-    for (tn = tlist; tn; tn = tnn) {
-      tnn = tn->next;
-      if (tn->active > 2) {
-        /* Valid tristrip: copy the vertices and free the heap */
-        result->strip[s].num_vertices = tn->active;
-        gpc_malloc<gpc_vertex>(result->strip[s].vertex,
-                               tn->active * sizeof(gpc_vertex),
-                               const_cast<char *>("tristrip creation"));
-        v = 0;
-        if (0) {
-          lt = tn->v[RIGHT];
-          rt = tn->v[LEFT];
-        } else {
-          lt = tn->v[LEFT];
-          rt = tn->v[RIGHT];
-        }
-        while (lt || rt) {
-          if (lt) {
-            ltn = lt->next;
-            result->strip[s].vertex[v].x = lt->x;
-            result->strip[s].vertex[v].y = lt->y;
-            v++;
-            gpc_free<vertex_node>(lt);
-            lt = ltn;
-          }
-          if (rt) {
-            rtn = rt->next;
-            result->strip[s].vertex[v].x = rt->x;
-            result->strip[s].vertex[v].y = rt->y;
-            v++;
-            gpc_free<vertex_node>(rt);
-            rt = rtn;
-          }
-        }
-        s++;
-      } else {
-        /* Invalid tristrip: just free the heap */
-        for (lt = tn->v[LEFT]; lt; lt = ltn) {
-          ltn = lt->next;
-          gpc_free<vertex_node>(lt);
-        }
-        for (rt = tn->v[RIGHT]; rt; rt = rtn) {
-          rtn = rt->next;
-          gpc_free<vertex_node>(rt);
-        }
-      }
-      gpc_free<polygon_node>(tn);
-    }
-  }
-  // Tidy up
-  reset_it(&it);
-  reset_lmt(&lmt);
-  gpc_free<edge_node>(c_heap);
-  gpc_free<edge_node>(s_heap);
-  gpc_free<double>(sbt);
-}  // NOLINT
-
-}  // namespace gpc
-
-/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/paddle/fluid/operators/detection/gpc.h b/paddle/fluid/operators/detection/gpc.h
deleted file mode 100644
index ee86262ef2c486e4eaeeeaf56c2392d2a1c5851b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/gpc.h
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/***************************************************************************
- *
- * Copyright (c) 2015 Baidu.com, Inc. All Rights Reserved
- *
- **************************************************************************/
-
-/**
- * @file include/gpc.h
- * @author huhan02(com@baidu.com)
- * @date 2015/12/18 13:52:10
- * @brief
- *
- * @modified by sunyipeng
- * @email sunyipeng@baidu.com
- * @date 2018/6/12
- **/
-
-#ifndef PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_  // GPC_H_
-#define PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_  // GPC_H_
-
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-namespace gpc {
-
-typedef enum {  // Set operation type
-  GPC_DIFF,     // Difference
-  GPC_INT,      // Intersection
-  GPC_XOR,      // Exclusive or
-  GPC_UNION     // Union
-} gpc_op;
-
-typedef struct {  // Polygon vertex structure
-  double x;       // Vertex x component
-  double y;       // vertex y component
-} gpc_vertex;
-
-typedef struct {       // Vertex list structure
-  int num_vertices;    // Number of vertices in list
-  gpc_vertex *vertex;  // Vertex array pointer
-} gpc_vertex_list;
-
-typedef struct {             // Polygon set structure
-  int num_contours;          // Number of contours in polygon
-  int *hole;                 // Hole  external contour flags
-  gpc_vertex_list *contour;  // Contour array pointer
-} gpc_polygon;
-
-typedef struct {           // Tristrip set structure
-  int num_strips;          // Number of tristrips
-  gpc_vertex_list *strip;  // Tristrip array pointer
-} gpc_tristrip;
-
-typedef enum { LEFT, RIGHT } gpc_left_right;
-
-typedef enum { ABOVE, BELOW } gpc_above_below;
-
-typedef enum { CLIP, SUBJ } gpc_clip_subj;
-
-typedef enum {      /* Edge intersection classes         */
-               NUL, /* Empty non-intersection            */
-               EMX, /* External maximum                  */
-               ELI, /* External left intermediate        */
-               TED, /* Top edge                          */
-               ERI, /* External right intermediate       */
-               RED, /* Right edge                        */
-               IMM, /* Internal maximum and minimum      */
-               IMN, /* Internal minimum                  */
-               EMN, /* External minimum                  */
-               EMM, /* External maximum and minimum      */
-               LED, /* Left edge                         */
-               ILI, /* Internal left intermediate        */
-               BED, /* Bottom edge                       */
-               IRI, /* Internal right intermediate       */
-               IMX, /* Internal maximum                  */
-               FUL  /* Full non-intersection             */
-} vertex_type;
-
-typedef enum {     /* Horizontal edge states            */
-               NH, /* No horizontal edge                */
-               BH, /* Bottom horizontal edge            */
-               TH  /* Top horizontal edge               */
-} h_state;
-
-typedef enum {              /* Edge bundle state                 */
-               UNBUNDLED,   /* Isolated edge not within a bundle */
-               BUNDLE_HEAD, /* Bundle head node                  */
-               BUNDLE_TAIL  /* Passive bundle tail node          */
-} bundle_state;
-
-typedef struct v_shape { /* Internal vertex list datatype     */
-  double x;              /* X coordinate component            */
-  double y;              /* Y coordinate component            */
-  struct v_shape *next;  /* Pointer to next vertex in list    */
-} vertex_node;
-
-typedef struct p_shape { /* Internal contour / tristrip type  */
-  int active;            /* Active flag / vertex count        */
-  int hole;              /* Hole / external contour flag      */
-  vertex_node *v[2];     /* Left and right vertex list ptrs   */
-  struct p_shape *next;  /* Pointer to next polygon contour   */
-  struct p_shape *proxy; /* Pointer to actual structure used  */
-} polygon_node;
-
-typedef struct edge_shape {
-  gpc_vertex vertex;             /* Piggy-backed contour vertex data  */
-  gpc_vertex bot;                /* Edge lower (x, y) coordinate      */
-  gpc_vertex top;                /* Edge upper (x, y) coordinate      */
-  double xb;                     /* Scanbeam bottom x coordinate      */
-  double xt;                     /* Scanbeam top x coordinate         */
-  double dx;                     /* Change in x for a unit y increase */
-  int type;                      /* Clip / subject edge flag          */
-  int bundle[2][2];              /* Bundle edge flags                 */
-  int bside[2];                  /* Bundle left / right indicators    */
-  bundle_state bstate[2];        /* Edge bundle state                 */
-  polygon_node *outp[2];         /* Output polygon / tristrip pointer */
-  struct edge_shape *prev;       /* Previous edge in the AET          */
-  struct edge_shape *next;       /* Next edge in the AET              */
-  struct edge_shape *pred;       /* Edge connected at the lower end   */
-  struct edge_shape *succ;       /* Edge connected at the upper end   */
-  struct edge_shape *next_bound; /* Pointer to next bound in LMT      */
-} edge_node;
-
-inline bool gpc_eq(float a, float b) { return (fabs(a - b) <= 1e-6); }
-
-inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); }
-
-inline int gpc_prev_index(int i, int n) { return ((i - 1 + n) % n); }
-
-inline int gpc_next_index(int i, int n) { return ((i + 1) % n); }
-
-inline int gpc_optimal(gpc_vertex *v, int i, int n) {
-  return (v[(i + 1) % n].y != v[i].y || v[(i - 1 + n) % n].y != v[i].y);
-}
-
-inline int gpc_fwd_min(edge_node *v, int i, int n) {
-  return (v[(i + 1) % n].vertex.y > v[i].vertex.y &&
-          v[(i - 1 + n) % n].vertex.y >= v[i].vertex.y);
-}
-
-inline int gpc_not_fmax(edge_node *v, int i, int n) {
-  return (v[(i + 1) % n].vertex.y > v[i].vertex.y);
-}
-
-inline int gpc_rev_min(edge_node *v, int i, int n) {
-  return (v[(i + 1) % n].vertex.y >= v[i].vertex.y &&
-          v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
-}
-
-inline int gpc_not_rmax(edge_node *v, int i, int n) {
-  return (v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
-}
-
-// inline void gpc_p_edge(edge_node *d, edge_node *e, int p, double i, double j)
-// {
-inline void gpc_p_edge(edge_node *d, edge_node *e, int p) {
-  d = e;
-  do {
-    d = d->prev;
-  } while (!d->outp[p]);
-  // i = d->bot.x + d->dx * (j - d->bot.y);
-}
-
-// inline void gpc_n_edge(edge_node *d, edge_node *e, int p, double i, double j)
-// {
-inline void gpc_n_edge(edge_node *d, edge_node *e, int p) {
-  d = e;
-  do {
-    d = d->next;
-  } while (!d->outp[p]);
-  // i = d->bot.x + d->dx * (j - d->bot.y);
-}
-
-template <typename T>
-void gpc_malloc(T *&p, int b, char *s) {
-  if (b > 0) {
-    p = (T *)malloc(b);
-
-    if (!p) {
-      fprintf(stderr, "gpc malloc failure: %s\n", s);
-      exit(0);
-    }
-  } else {
-    p = NULL;
-  }
-}
-template <typename T>
-void gpc_free(T *&p) {
-  if (p) {
-    free(p);
-    p = NULL;
-  }
-}
-
-/*
-===========================================================================
-                       Public Function Prototypes
-===========================================================================
-*/
-
-void add_vertex(vertex_node **t, double x, double y);
-
-void gpc_vertex_create(edge_node *e, int p, int s, double x, double y);
-
-/*
-void gpc_read_polygon(FILE *infile_ptr, int read_hole_flags,
-                      gpc_polygon *polygon);
-
-void gpc_write_polygon(FILE *outfile_ptr, int write_hole_flags,
-                       gpc_polygon *polygon);
-*/
-void gpc_add_contour(gpc_polygon *polygon, gpc_vertex_list *contour, int hole);
-
-void gpc_polygon_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
-                      gpc_polygon *clip_polygon, gpc_polygon *result_polygon);
-
-void gpc_tristrip_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
-                       gpc_polygon *clip_polygon,
-                       gpc_tristrip *result_tristrip);
-
-void gpc_polygon_to_tristrip(gpc_polygon *polygon, gpc_tristrip *tristrip);
-
-void gpc_free_polygon(gpc_polygon *polygon);
-
-void gpc_free_tristrip(gpc_tristrip *tristrip);
-
-}  // namespace gpc
-
-#endif  // PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_
-/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc
deleted file mode 100644
index 9c89b7ca9af1b235659554afc805600d31ef8ea6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/iou_similarity_op.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/iou_similarity_op.h"
-
-namespace paddle {
-namespace operators {
-
-class IOUSimilarityOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of IOUSimilarityOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of IOUSimilarityOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]");
-    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2.");
-    PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]");
-
-    ctx->ShareLoD("X", /*->*/ "Out");
-    ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]}));
-  }
-};
-
-class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, default LoDTensor<float>) "
-             "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, "
-             "each box is represented as [xmin, ymin, xmax, ymax], "
-             "the shape of X is [N, 4]. [xmin, ymin] is the left top "
-             "coordinate of the box if the input is image feature map, they "
-             "are close to the origin of the coordinate system. "
-             "[xmax, ymax] is the right bottom coordinate of the box. "
-             "This tensor can contain LoD information to represent a batch "
-             "of inputs. One instance of this batch can contain different "
-             "numbers of entities.");
-    AddInput("Y",
-             "(Tensor, default Tensor<float>) "
-             "Box list Y holds M boxes, each box is represented as "
-             "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. "
-             "[xmin, ymin] is the left top coordinate of the box if the "
-             "input is image feature map, and [xmax, ymax] is the right "
-             "bottom coordinate of the box.");
-
-    AddOutput("Out",
-              "(LoDTensor, the lod is same as input X) The output of "
-              "iou_similarity op, a tensor with shape [N, M] "
-              "representing pairwise iou scores.");
-
-    AddComment(R"DOC(
-**IOU Similarity Operator**
-
-Computes intersection-over-union (IOU) between two box lists.
-Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
-boxes in 'Y' are shared by all instance of the batched inputs of X.
-Given two boxes A and B, the calculation of IOU is as follows:
-
-$$
-IOU(A, B) = 
-\\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)}
-$$
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(iou_similarity, ops::IOUSimilarityOp,
-                  ops::IOUSimilarityOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    iou_similarity,
-    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu
deleted file mode 100644
index 8342b4138c87e6ea1803146bac6d6954a569ef5f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/iou_similarity_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/iou_similarity_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    iou_similarity,
-    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.h b/paddle/fluid/operators/detection/iou_similarity_op.h
deleted file mode 100644
index 9f193ebc59b7be44b987db7d068c209ef7f5a8da..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/iou_similarity_op.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-
-template <typename T>
-inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2,
-                                  T ymin2, T xmax2, T ymax2) {
-  constexpr T zero = static_cast<T>(0);
-  T area1 = (ymax1 - ymin1) * (xmax1 - xmin1);
-  T area2 = (ymax2 - ymin2) * (xmax2 - xmin2);
-  T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1;
-  T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1;
-  T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2;
-  T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2;
-  T inter_height = inter_ymax - inter_ymin;
-  T inter_width = inter_xmax - inter_xmin;
-  inter_height = inter_height > zero ? inter_height : zero;
-  inter_width = inter_width > zero ? inter_width : zero;
-  T inter_area = inter_width * inter_height;
-  T union_area = area1 + area2 - inter_area;
-  T sim_score = inter_area / union_area;
-  return sim_score;
-}
-
-template <typename T>
-struct IOUSimilarityFunctor {
-  IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols)
-      : x_(x), y_(y), z_(z), cols_(static_cast<size_t>(cols)) {}
-
-  inline HOSTDEVICE void operator()(size_t tid) const {
-    size_t row_id = tid / cols_;
-    size_t col_id = tid % cols_;
-
-    T x_min1 = x_[row_id * 4];
-    T y_min1 = x_[row_id * 4 + 1];
-    T x_max1 = x_[row_id * 4 + 2];
-    T y_max1 = x_[row_id * 4 + 3];
-
-    T x_min2 = y_[col_id * 4];
-    T y_min2 = y_[col_id * 4 + 1];
-    T x_max2 = y_[col_id * 4 + 2];
-    T y_max2 = y_[col_id * 4 + 3];
-
-    T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2,
-                          x_max2, y_max2);
-
-    z_[row_id * cols_ + col_id] = sim;
-  }
-  const T* x_;
-  const T* y_;
-  T* z_;
-  const size_t cols_;
-};
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class IOUSimilarityKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::LoDTensor* in_x = ctx.Input<framework::LoDTensor>("X");
-    const framework::Tensor* in_y = ctx.Input<framework::Tensor>("Y");
-    framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
-
-    int x_n = in_x->dims()[0];
-    int y_n = in_y->dims()[0];
-    IOUSimilarityFunctor<T> functor(in_x->data<T>(), in_y->data<T>(),
-                                    out->mutable_data<T>(ctx.GetPlace()), y_n);
-
-    platform::ForRange<DeviceContext> for_range(
-        static_cast<const DeviceContext&>(ctx.device_context()), x_n * y_n);
-    for_range(functor);
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc
deleted file mode 100644
index bd6fee713815345152fce73e85a45aa5cd68b1da..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/mask_util.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/mask_util.h"
-#include <math.h>
-#include <stdlib.h>
-#include <algorithm>
-#include <limits>
-#include <utility>
-#include "paddle/fluid/memory/memory.h"
-
-namespace paddle {
-namespace operators {
-
-uint32_t UMax(uint32_t a, uint32_t b) { return (a > b) ? a : b; }
-
-static inline int Compare(const void* a, const void* b) {
-  uint32_t c = *(reinterpret_cast<const uint32_t*>(a));
-  uint32_t d = *(reinterpret_cast<const uint32_t*>(b));
-  return c > d ? 1 : c < d ? -1 : 0;
-}
-
-void Decode(const uint32_t* cnts, int m, uint8_t* mask) {
-  uint8_t v = 0;
-  for (int j = 0; j < m; j++) {
-    for (uint32_t k = 0; k < cnts[j]; k++) {
-      *(mask++) = v;
-    }
-    v = !v;
-  }
-}
-
-typedef uint32_t uint;
-void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) {
-  int j, m = 0;
-  double scale = 5;
-  int *x, *y, *u, *v;
-  uint *a, *b;
-  platform::CPUPlace cpu;
-  auto xptr = memory::Alloc(cpu, sizeof(int) * (k + 1) * 2);
-  x = reinterpret_cast<int*>(xptr->ptr());
-  y = x + (k + 1);
-
-  for (j = 0; j < k; j++) x[j] = static_cast<int>(scale * xy[j * 2 + 0] + .5);
-  x[k] = x[0];
-  for (j = 0; j < k; j++) y[j] = static_cast<int>(scale * xy[j * 2 + 1] + .5);
-  y[k] = y[0];
-  for (j = 0; j < k; j++) {
-    m += UMax(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1])) + 1;
-  }
-  auto vptr = memory::Alloc(cpu, sizeof(int) * m * 2);
-  u = reinterpret_cast<int*>(vptr->ptr());
-  v = u + m;
-  m = 0;
-  for (j = 0; j < k; j++) {
-    int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx, dy, t, d;
-    int flip;
-    double s;
-    dx = abs(xe - xs);
-    dy = abs(ys - ye);
-    flip = (dx >= dy && xs > xe) || (dx < dy && ys > ye);
-    if (flip) {
-      t = xs;
-      xs = xe;
-      xe = t;
-      t = ys;
-      ys = ye;
-      ye = t;
-    }
-    if (dx >= dy) {
-      s = dx == 0 ? 0 : static_cast<double>(ye - ys) / dx;
-      for (d = 0; d <= dx; d++) {
-        t = flip ? dx - d : d;
-        u[m] = t + xs;
-        v[m] = static_cast<int>(ys + s * t + .5);
-        m++;
-      }
-    } else {
-      s = dy == 0 ? 0 : static_cast<double>(xe - xs) / dy;
-      for (d = 0; d <= dy; d++) {
-        t = flip ? dy - d : d;
-        v[m] = t + ys;
-        u[m] = static_cast<int>(xs + s * t + .5);
-        m++;
-      }
-    }
-  }
-  /* get points along y-boundary and downsample */
-  k = m;
-  m = 0;
-  double xd, yd;
-  auto xyptr = memory::Alloc(cpu, sizeof(int) * k * 2);
-  x = reinterpret_cast<int*>(xyptr->ptr());
-  y = x + k;
-  for (j = 1; j < k; j++) {
-    if (u[j] != u[j - 1]) {
-      xd = static_cast<double>(u[j] < u[j - 1] ? u[j] : u[j] - 1);
-      xd = (xd + .5) / scale - .5;
-      if (floor(xd) != xd || xd < 0 || xd > w - 1) continue;
-      yd = static_cast<double>(v[j] < v[j - 1] ? v[j] : v[j - 1]);
-      yd = (yd + .5) / scale - .5;
-      if (yd < 0)
-        yd = 0;
-      else if (yd > h)
-        yd = h;
-      yd = ceil(yd);
-      x[m] = static_cast<int>(xd);
-      y[m] = static_cast<int>(yd);
-      m++;
-    }
-  }
-  /* compute rle encoding given y-boundary points */
-  k = m;
-  auto aptr = memory::Alloc(cpu, sizeof(uint) * (k + 1));
-  a = reinterpret_cast<uint*>(aptr->ptr());
-  for (j = 0; j < k; j++) a[j] = static_cast<uint>(x[j] * h + y[j]);
-  a[k++] = static_cast<uint>(h * w);
-
-  qsort(a, k, sizeof(uint), Compare);
-  uint p = 0;
-  for (j = 0; j < k; j++) {
-    uint t = a[j];
-    a[j] -= p;
-    p = t;
-  }
-  auto bptr = memory::Alloc(cpu, sizeof(uint32_t) * k);
-  b = reinterpret_cast<uint32_t*>(bptr->ptr());
-  j = m = 0;
-  b[m++] = a[j++];
-  while (j < k) {
-    if (a[j] > 0) {
-      b[m++] = a[j++];
-    } else {
-      j++;
-      if (j < k) b[m - 1] += a[j++];
-    }
-  }
-
-  // convert to mask
-  auto mskptr = memory::Alloc(cpu, sizeof(uint8_t) * h * w);
-  uint8_t* msk = reinterpret_cast<uint8_t*>(mskptr->ptr());
-  Decode(b, m, msk);
-
-  for (int ii = 0; ii < h; ++ii) {
-    for (int jj = 0; jj < w; ++jj) {
-      mask[ii * w + jj] = msk[jj * h + ii];
-    }
-  }
-}
-
-void Poly2Boxes(const std::vector<std::vector<std::vector<float>>>& polys,
-                float* boxes) {
-  // lists
-  for (size_t i = 0; i < polys.size(); ++i) {
-    float x0 = std::numeric_limits<float>::max();
-    float x1 = std::numeric_limits<float>::min();
-    float y0 = std::numeric_limits<float>::max();
-    float y1 = std::numeric_limits<float>::min();
-    // each list may have more than one polys
-    for (size_t j = 0; j < polys[i].size(); ++j) {
-      for (size_t k = 0; k < polys[i][j].size() / 2; ++k) {
-        x0 = std::min(x0, polys[i][j][2 * k]);
-        x1 = std::max(x1, polys[i][j][2 * k]);
-        y0 = std::min(y0, polys[i][j][2 * k + 1]);
-        y1 = std::max(y1, polys[i][j][2 * k + 1]);
-      }
-    }
-    boxes[i * 4] = x0;
-    boxes[i * 4 + 1] = y0;
-    boxes[i * 4 + 2] = x1;
-    boxes[i * 4 + 3] = y1;
-  }
-}
-
-void Polys2MaskWrtBox(const std::vector<std::vector<float>>& polygons,
-                      const float* box, int M, uint8_t* mask) {
-  float w = box[2] - box[0];
-  float h = box[3] - box[1];
-  w = std::max(w, static_cast<float>(1.));
-  h = std::max(h, static_cast<float>(1.));
-
-  uint8_t* msk = nullptr;
-  if (polygons.size() == 1UL) {
-    msk = mask;
-  } else {
-    msk = reinterpret_cast<uint8_t*>(
-        malloc(M * M * polygons.size() * sizeof(uint8_t)));
-  }
-  for (size_t i = 0; i < polygons.size(); ++i) {
-    int k = polygons[i].size() / 2;
-    std::vector<float> p;
-    for (int j = 0; j < k; ++j) {
-      float pw = (polygons[i][2 * j] - box[0]) * M / w;
-      float ph = (polygons[i][2 * j + 1] - box[1]) * M / h;
-      p.push_back(pw);
-      p.push_back(ph);
-    }
-    uint8_t* msk_i = msk + i * M * M;
-    Poly2Mask(p.data(), k, M, M, msk_i);
-  }
-
-  if (polygons.size() > 1UL) {
-    for (size_t i = 0; i < polygons.size(); ++i) {
-      uint8_t* msk_i = msk + i * M * M;
-      for (int j = 0; j < M * M; ++j) {
-        if (i == 0) {
-          mask[j] = msk_i[j];
-        } else {
-          mask[j] = (mask[j] + msk_i[j]) > 0 ? 1 : 0;
-        }
-      }
-    }
-    free(msk);
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h
deleted file mode 100644
index 4e0ea54f6d89ff273382afc1e9a151cfd9773cc6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/mask_util.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stdint.h>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-void Poly2Mask(const float* ploy, int k, int h, int w, uint8_t* mask);
-
-void Poly2Boxes(const std::vector<std::vector<std::vector<float>>>& polys,
-                float* boxes);
-
-void Polys2MaskWrtBox(const std::vector<std::vector<float>>& polygons,
-                      const float* box, int M, uint8_t* mask);
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc
deleted file mode 100644
index de904e947463977229545897b723b98b4d0708d6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/mask_util_test.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/mask_util.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/memory/memory.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-void Compare(const T* a, const T* b, const int n) {
-  for (int i = 0; i < n; i++) {
-    EXPECT_EQ(a[i], b[i]);
-  }
-}
-
-TEST(MaskUtil, Poly2MaskTest) {
-  float polys[] = {1.97f, 1.88f, 5.81f, 1.88f, 1.69f,
-                   6.53f, 5.94f, 6.38f, 1.97f, 1.88f};
-  int h = 8, w = 8;
-  int k = 5;  // length(polys) / 2
-  // clang-format off
-  uint8_t expect_mask[] = {
-      0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 1, 1, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 1, 0, 0, 0, 0,
-      0, 0, 1, 1, 1, 0, 0, 0,
-      0, 0, 1, 1, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0
-  };
-  // clang-format on
-
-  // the groud-truth mask is computed by coco API:
-  //
-  // import pycocotools.mask as mask_util
-  // import numpy as np
-  // segm = [1.97, 1.88, 5.81, 1.88, 1.69, 6.53, 5.94, 6.38, 1.97, 1.88]
-  // rles = mask_util.frPyObjects([segm], im_h, im_w)
-  // mask = mask_util.decode(rles)
-  // print mask
-  platform::CPUPlace cpu;
-  auto allocation = memory::Alloc(cpu, sizeof(expect_mask));
-  uint8_t* mask = reinterpret_cast<uint8_t*>(allocation->ptr());
-  Poly2Mask(polys, k, h, w, mask);
-  Compare<uint8_t>(expect_mask, mask, h * w);
-}
-
-TEST(MaskUtil, Poly2BoxesTest) {
-  // clang-format off
-  std::vector<std::vector<std::vector<float>>> polys = {
-      {{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}},
-      {{2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}
-  };
-  float expect_boxes[] = {
-      1.69f, 1.88f, 5.94f, 6.53f,
-      1.69f, 0.88f, 6.94f, 6.63f
-  };
-  // clang-format on
-
-  platform::CPUPlace cpu;
-  auto allocation = memory::Alloc(cpu, sizeof(expect_boxes));
-  float* boxes = reinterpret_cast<float*>(allocation->ptr());
-  Poly2Boxes(polys, boxes);
-  Compare<float>(expect_boxes, boxes, 8);
-}
-
-TEST(MaskUtil, Polys2MaskWrtBoxTest) {
-  // clang-format off
-  std::vector<std::vector<std::vector<float>>> polys = {{
-      {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f},
-      {2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}};
-  float expect_boxes[] = {
-      1.69f, 0.88f, 6.94f, 6.63f
-  };
-  uint8_t expect_mask[] = {
-      0, 0, 0, 0, 0, 0, 0, 0,
-      0, 1, 1, 1, 1, 1, 0, 0,
-      0, 0, 1, 1, 1, 0, 0, 0,
-      0, 0, 1, 1, 1, 0, 0, 0,
-      0, 0, 1, 1, 1, 0, 0, 0,
-      0, 1, 1, 1, 1, 1, 0, 0,
-      0, 1, 1, 1, 1, 1, 1, 0,
-      1, 1, 1, 1, 1, 1, 1, 1
-  };
-  // clang-format on
-
-  platform::CPUPlace cpu;
-  auto allocation = memory::Alloc(cpu, sizeof(expect_boxes));
-  float* boxes = reinterpret_cast<float*>(allocation->ptr());
-  Poly2Boxes(polys, boxes);
-  Compare<float>(expect_boxes, boxes, 4);
-
-  auto allocat_mask = memory::Alloc(cpu, sizeof(expect_mask));
-  uint8_t* mask = reinterpret_cast<uint8_t*>(allocat_mask->ptr());
-  int M = 8;
-  Polys2MaskWrtBox(polys[0], expect_boxes, M, mask);
-  Compare<uint8_t>(expect_mask, mask, M * M);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
deleted file mode 100644
index c68fe2439cad9bc5a49a742c1a38e704a7618156..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc
+++ /dev/null
@@ -1,341 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-enum MiningType { kNone = 0, kMaxNegative, kHardExample };
-
-template <typename T>
-bool SortScoreDescend(const std::pair<float, T>& pair1,
-                      const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
-                             const float match_dist,
-                             const float neg_dist_threshold) {
-  if (mining_type == MiningType::kMaxNegative) {
-    return match_idx == -1 && match_dist < neg_dist_threshold;
-  } else if (mining_type == MiningType::kHardExample) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-inline MiningType GetMiningType(std::string str) {
-  if (str == "max_negative") {
-    return MiningType::kMaxNegative;
-  } else if (str == "hard_example") {
-    return MiningType::kHardExample;
-  } else {
-    return MiningType::kNone;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class MineHardExamplesKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
-    auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
-    auto* in_matched_indices = ctx.Input<framework::Tensor>("MatchIndices");
-    auto* in_match_dist = ctx.Input<framework::Tensor>("MatchDist");
-    float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
-    T neg_dist_threshold =
-        static_cast<T>(ctx.Attr<float>("neg_dist_threshold"));
-    int sample_size = ctx.Attr<int>("sample_size");
-    MiningType mining_type =
-        GetMiningType(ctx.Attr<std::string>("mining_type"));
-
-    auto out_neg_indices = ctx.Output<framework::LoDTensor>("NegIndices");
-    auto out_match_indices =
-        ctx.Output<framework::Tensor>("UpdatedMatchIndices");
-
-    framework::TensorCopy(*in_matched_indices, ctx.GetPlace(),
-                          out_match_indices);
-
-    int batch_size = in_matched_indices->dims()[0];
-    int prior_num = in_matched_indices->dims()[1];
-
-    auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indices);
-
-    auto match_indices_et =
-        framework::EigenMatrix<int>::From(*out_match_indices);
-
-    auto match_dist = framework::EigenMatrix<T>::From(*in_match_dist);
-
-    const T* cls_loss = in_cls_loss->data<T>();
-    const T* loc_loss = nullptr;
-    if (in_loc_loss) {
-      loc_loss = in_loc_loss->data<T>();
-    }
-
-    std::vector<std::vector<int>> all_neg_indices;
-    std::vector<size_t> batch_starts = {0};
-    for (int n = 0; n < batch_size; ++n) {
-      std::vector<std::pair<T, size_t>> loss_idx;
-      int neg_sel = 0;
-      for (int m = 0; m < prior_num; ++m) {
-        if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
-                             neg_dist_threshold)) {
-          T loss = cls_loss[n * prior_num + m];
-          if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
-            loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
-          }
-          loss_idx.push_back(std::make_pair(loss, m));
-          ++neg_sel;
-        }
-      }
-
-      if (mining_type == MiningType::kMaxNegative) {
-        int num_pos = 0;
-        for (int m = 0; m < prior_num; ++m) {
-          if (match_indices(n, m) != -1) ++num_pos;
-        }
-        neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
-      } else if (mining_type == MiningType::kHardExample) {
-        neg_sel = std::min(sample_size, neg_sel);
-      }
-
-      std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<size_t>);
-      std::set<int> sel_indices;
-      std::vector<int> neg_indices;
-      std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
-                     std::inserter(sel_indices, sel_indices.begin()),
-                     [](std::pair<T, size_t>& l) -> int {
-                       return static_cast<int>(l.second);
-                     });
-
-      if (mining_type == MiningType::kHardExample) {
-        for (int m = 0; m < prior_num; ++m) {
-          if (match_indices(n, m) > -1) {
-            if (sel_indices.find(m) == sel_indices.end()) {
-              match_indices_et(n, m) = -1;
-            }
-          } else {
-            if (sel_indices.find(m) != sel_indices.end()) {
-              neg_indices.push_back(m);
-            }
-          }
-        }
-      } else {
-        neg_indices.resize(sel_indices.size());
-        std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
-      }
-
-      all_neg_indices.push_back(neg_indices);
-      batch_starts.push_back(batch_starts.back() + neg_indices.size());
-    }
-
-    framework::LoD out_neg_indices_lod;
-    out_neg_indices_lod.emplace_back(batch_starts);
-    int neg_offset = 0;
-    auto neg_data = out_neg_indices->mutable_data<int>(
-        framework::make_ddim({static_cast<int>(batch_starts.back()), 1}),
-        ctx.GetPlace());
-
-    for (auto neg_indices : all_neg_indices) {
-      std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
-      neg_offset += neg_indices.size();
-    }
-    out_neg_indices->set_lod(out_neg_indices_lod);
-    return;
-  }
-};
-
-class MineHardExamplesOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
-                   "Input(ClsLoss) of MineHardExamplesOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("MatchIndices"),
-        "Input(MatchIndices) of MineHardExamplesOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("MatchDist"),
-        "Input(MatchDist) of MineHardExamplesOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("NegIndices"),
-        "Output(NegIndices) of MineHardExamplesOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
-                   "Output(UpdatedMatchIndices) of MineHardExamplesOp should "
-                   "not be null.");
-
-    auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
-    auto idx_dims = ctx->GetInputDim("MatchIndices");
-    auto dis_dims = ctx->GetInputDim("MatchDist");
-
-    PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
-                      "The shape of ClsLoss is [N, Np].");
-    PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
-                      "The shape of MatchIndices is [N, Np].");
-    PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
-                      "The shape of MatchDist is [N, Np].");
-
-    if (ctx->HasInput("LocLoss")) {
-      auto loc_loss_dims = ctx->GetInputDim("LocLoss");
-      PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
-                        "The shape of LocLoss is [N, Np].");
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(
-            cls_loss_dims[0], loc_loss_dims[0],
-            "Batch size of ClsLoss and LocLoss must be the same.");
-        PADDLE_ENFORCE_EQ(
-            cls_loss_dims[1], loc_loss_dims[1],
-            "Prior box number of ClsLoss and LocLoss must be the same.");
-      }
-    }
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          cls_loss_dims[0], idx_dims[0],
-          "Batch size of ClsLoss and MatchIndices must be the same.");
-      PADDLE_ENFORCE_EQ(
-          cls_loss_dims[1], idx_dims[1],
-          "Prior box number of ClsLoss and MatchIndices must be the same.");
-
-      PADDLE_ENFORCE_EQ(
-          cls_loss_dims[0], dis_dims[0],
-          "Batch size of ClsLoss and MatchDist must be the same.");
-      PADDLE_ENFORCE_EQ(
-          cls_loss_dims[1], idx_dims[1],
-          "Prior box number of ClsLoss and MatchDist must be the same.");
-    }
-
-    auto mining_type =
-        GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));
-
-    PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
-                      "mining_type must be hard_example or max_negative");
-
-    if (mining_type == MiningType::kMaxNegative) {
-      auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
-      auto neg_dist_threshold = ctx->Attrs().Get<float>("neg_dist_threshold");
-      PADDLE_ENFORCE_GT(
-          neg_pos_ratio, 0.0f,
-          "neg_pos_ratio must greater than zero in max_negative mode");
-      PADDLE_ENFORCE_LT(
-          neg_dist_threshold, 1.0f,
-          "neg_dist_threshold must less than one in max_negative mode");
-      PADDLE_ENFORCE_GT(
-          neg_dist_threshold, 0.0f,
-          "neg_dist_threshold must greater than zero in max_negative mode");
-    } else if (mining_type == MiningType::kHardExample) {
-      auto sample_size = ctx->Attrs().Get<int>("sample_size");
-      PADDLE_ENFORCE_GT(
-          sample_size, 0,
-          "sample_size must greater than zero in hard_example mode");
-    }
-
-    ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
-    // The first dimension of NegIndices will be set correcttly in Compute.
-    ctx->SetOutputDim("NegIndices", {-1, 1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>("ClsLoss")->type(), platform::CPUPlace());
-  }
-};
-
-class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "ClsLoss",
-        "(Tensor, default Tensor<float>), The classification loss with shape "
-        "[N, Np], N is the batch size and Np is the number of prior box.");
-    AddInput("LocLoss",
-             "(Tensor, optional, default Tensor<float>), The localization loss "
-             "with shape [N, Np], N is the batch size and Np is the number of "
-             "prior box.")
-        .AsDispensable();
-    AddInput("MatchIndices",
-             "(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
-             "the batch size and Np is the number of prior box. "
-             "MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
-             "instance does not match any entity, otherwise means it is "
-             "matched to row.");
-    AddInput("MatchDist",
-             "(Tensor, default Tensor<float>) Matched indices with shape [N, "
-             "Np], N is the batch size and Np is the number of prior box.");
-    AddAttr<float>("neg_pos_ratio",
-                   "(float) The ratio of the negative box to the positive "
-                   "box. Use only when mining_type is max_negative.")
-        .SetDefault(1.0);
-    AddAttr<float>("neg_dist_threshold",
-                   "(float) The negative overlap upper bound for the unmatched "
-                   "predictions. Use only when mining_type is max_negative.")
-        .SetDefault(0.5);
-    AddAttr<int>("sample_size",
-                 "(float) The max sample size of negative box. Use only when "
-                 "mining_type is hard_example.")
-        .SetDefault(0);
-    AddAttr<std::string>("mining_type",
-                         "(float) The mining algorithm name, the value is "
-                         "hard_example or max_negative.")
-        .SetDefault("max_negative")
-        .InEnum({"hard_example", "max_negative"});
-
-    AddOutput(
-        "NegIndices",
-        "(LoDTensor<int>) The output of negative example indices. a LoDTensor "
-        "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
-        "and each element is the prior box index. "
-        "For example, the batch size is 2, the lod is [[0, 1, 2]], "
-        "the sample 0's box 1(MatchIndices[0][1]) is selected, "
-        "and sample 1's box 0 is selected. The output NegIndices is "
-        "[[1], [0]].");
-
-    AddOutput("UpdatedMatchIndices",
-              "(Tensor<int>) The output of updated MatchIndices, a tensor with "
-              "shape [N, Np]. Only update when mining_type is "
-              "hard_example. The input MatchIndices elements will be update to "
-              "-1 when it is not in the candidate high loss list of negative "
-              "examples.");
-
-    AddComment(R"DOC(
-Mine hard examples Operator.
-This operator implements hard example mining to select a subset of negative box indices.
-For each image, selects the box with highest losses. subject to the condition that the 
-box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. 
-The selected number is min(sample_size, max_negative_box_number) when mining_type is 
-hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) 
-when mining_type is max_negative, where the max_negative_box_number is the count of 
-MatchIndices elements with value -1.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(mine_hard_examples, ops::MineHardExamplesOp,
-                  ops::MineHardExamplesOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    mine_hard_examples,
-    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
deleted file mode 100644
index f5b9be14ad6819f040b915f42d6e7ffb7dcdc908..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ /dev/null
@@ -1,602 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-limitations under the License. */
-
-#include <glog/logging.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detection/poly_util.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-class MultiClassNMSOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("BBoxes"),
-                   "Input(BBoxes) of MultiClassNMS should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Scores"),
-                   "Input(Scores) of MultiClassNMS should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of MultiClassNMS should not be null.");
-
-    auto box_dims = ctx->GetInputDim("BBoxes");
-    auto score_dims = ctx->GetInputDim("Scores");
-    auto score_size = score_dims.size();
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE(score_size == 2 || score_size == 3,
-                     "The rank of Input(Scores) must be 2 or 3");
-      PADDLE_ENFORCE_EQ(box_dims.size(), 3,
-                        "The rank of Input(BBoxes) must be 3");
-      if (score_size == 3) {
-        PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 ||
-                           box_dims[2] == 16 || box_dims[2] == 24 ||
-                           box_dims[2] == 32,
-                       "The last dimension of Input(BBoxes) must be 4 or 8, "
-                       "represents the layout of coordinate "
-                       "[xmin, ymin, xmax, ymax] or "
-                       "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
-                       "8 points: [xi, yi] i= 1,2,...,8 or "
-                       "12 points: [xi, yi] i= 1,2,...,12 or "
-                       "16 points: [xi, yi] i= 1,2,...,16");
-        PADDLE_ENFORCE_EQ(
-            box_dims[1], score_dims[2],
-            "The 2nd dimension of Input(BBoxes) must be equal to "
-            "last dimension of Input(Scores), which represents the "
-            "predicted bboxes.");
-      } else {
-        PADDLE_ENFORCE(box_dims[2] == 4,
-                       "The last dimension of Input(BBoxes) must be 4");
-        PADDLE_ENFORCE_EQ(box_dims[1], score_dims[1],
-                          "The 2nd dimension of Input(BBoxes)"
-                          "must be equal to the 2nd dimension"
-                          " of Input(Scores)");
-      }
-    }
-    // Here the box_dims[0] is not the real dimension of output.
-    // It will be rewritten in the computing kernel.
-    if (score_size == 3) {
-      ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2});
-    } else {
-      ctx->SetOutputDim("Out", {-1, box_dims[2] + 2});
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("Scores")->type(),
-        platform::CPUPlace());
-  }
-};
-
-template <class T>
-bool SortScorePairDescend(const std::pair<float, T>& pair1,
-                          const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-template <class T>
-static inline void GetMaxScoreIndex(
-    const std::vector<T>& scores, const T threshold, int top_k,
-    std::vector<std::pair<T, int>>* sorted_indices) {
-  for (size_t i = 0; i < scores.size(); ++i) {
-    if (scores[i] > threshold) {
-      sorted_indices->push_back(std::make_pair(scores[i], i));
-    }
-  }
-  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend<int>);
-  // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
-    sorted_indices->resize(top_k);
-  }
-}
-
-template <class T>
-static inline T BBoxArea(const T* box, const bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
-    }
-  }
-}
-
-template <class T>
-static inline T JaccardOverlap(const T* box1, const T* box2,
-                               const bool normalized) {
-  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
-      box2[3] < box1[1]) {
-    return static_cast<T>(0.);
-  } else {
-    const T inter_xmin = std::max(box1[0], box2[0]);
-    const T inter_ymin = std::max(box1[1], box2[1]);
-    const T inter_xmax = std::min(box1[2], box2[2]);
-    const T inter_ymax = std::min(box1[3], box2[3]);
-    T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
-    T inter_w = inter_xmax - inter_xmin + norm;
-    T inter_h = inter_ymax - inter_ymin + norm;
-    const T inter_area = inter_w * inter_h;
-    const T bbox1_area = BBoxArea<T>(box1, normalized);
-    const T bbox2_area = BBoxArea<T>(box2, normalized);
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
-}
-
-template <class T>
-T PolyIoU(const T* box1, const T* box2, const size_t box_size,
-          const bool normalized) {
-  T bbox1_area = PolyArea<T>(box1, box_size, normalized);
-  T bbox2_area = PolyArea<T>(box2, box_size, normalized);
-  T inter_area = PolyOverlapArea<T>(box1, box2, box_size, normalized);
-  if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) {
-    // If coordinate values are invalid
-    // if area size <= 0,  return 0.
-    return T(0.);
-  } else {
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
-}
-
-template <class T>
-void SliceOneClass(const platform::DeviceContext& ctx,
-                   const framework::Tensor& items, const int class_id,
-                   framework::Tensor* one_class_item) {
-  T* item_data = one_class_item->mutable_data<T>(ctx.GetPlace());
-  const T* items_data = items.data<T>();
-  const int64_t num_item = items.dims()[0];
-  const int class_num = items.dims()[1];
-  if (items.dims().size() == 3) {
-    int item_size = items.dims()[2];
-    for (int i = 0; i < num_item; ++i) {
-      std::memcpy(item_data + i * item_size,
-                  items_data + i * class_num * item_size + class_id * item_size,
-                  sizeof(T) * item_size);
-    }
-  } else {
-    for (int i = 0; i < num_item; ++i) {
-      item_data[i] = items_data[i * class_num + class_id];
-    }
-  }
-}
-
-template <typename T>
-class MultiClassNMSKernel : public framework::OpKernel<T> {
- public:
-  void NMSFast(const Tensor& bbox, const Tensor& scores,
-               const T score_threshold, const T nms_threshold, const T eta,
-               const int64_t top_k, std::vector<int>* selected_indices,
-               const bool normalized) const {
-    // The total boxes for each instance.
-    int64_t num_boxes = bbox.dims()[0];
-    // 4: [xmin ymin xmax ymax]
-    // 8: [x1 y1 x2 y2 x3 y3 x4 y4]
-    // 16, 24, or 32: [x1 y1 x2 y2 ...  xn yn], n = 8, 12 or 16
-    int64_t box_size = bbox.dims()[1];
-
-    std::vector<T> scores_data(num_boxes);
-    std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
-    std::vector<std::pair<T, int>> sorted_indices;
-    GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
-
-    selected_indices->clear();
-    T adaptive_threshold = nms_threshold;
-    const T* bbox_data = bbox.data<T>();
-
-    while (sorted_indices.size() != 0) {
-      const int idx = sorted_indices.front().second;
-      bool keep = true;
-      for (size_t k = 0; k < selected_indices->size(); ++k) {
-        if (keep) {
-          const int kept_idx = (*selected_indices)[k];
-          T overlap = T(0.);
-          // 4: [xmin ymin xmax ymax]
-          if (box_size == 4) {
-            overlap =
-                JaccardOverlap<T>(bbox_data + idx * box_size,
-                                  bbox_data + kept_idx * box_size, normalized);
-          }
-          // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32
-          if (box_size == 8 || box_size == 16 || box_size == 24 ||
-              box_size == 32) {
-            overlap = PolyIoU<T>(bbox_data + idx * box_size,
-                                 bbox_data + kept_idx * box_size, box_size,
-                                 normalized);
-          }
-          keep = overlap <= adaptive_threshold;
-        } else {
-          break;
-        }
-      }
-      if (keep) {
-        selected_indices->push_back(idx);
-      }
-      sorted_indices.erase(sorted_indices.begin());
-      if (keep && eta < 1 && adaptive_threshold > 0.5) {
-        adaptive_threshold *= eta;
-      }
-    }
-  }
-
-  void MultiClassNMS(const framework::ExecutionContext& ctx,
-                     const Tensor& scores, const Tensor& bboxes,
-                     const int scores_size,
-                     std::map<int, std::vector<int>>* indices,
-                     int* num_nmsed_out) const {
-    int64_t background_label = ctx.Attr<int>("background_label");
-    int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
-    int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
-    bool normalized = ctx.Attr<bool>("normalized");
-    T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
-    T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
-    T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-
-    int num_det = 0;
-
-    int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
-    Tensor bbox_slice, score_slice;
-    for (int64_t c = 0; c < class_num; ++c) {
-      if (c == background_label) continue;
-      if (scores_size == 3) {
-        score_slice = scores.Slice(c, c + 1);
-        bbox_slice = bboxes;
-      } else {
-        score_slice.Resize({scores.dims()[0], 1});
-        bbox_slice.Resize({scores.dims()[0], 4});
-        SliceOneClass<T>(dev_ctx, scores, c, &score_slice);
-        SliceOneClass<T>(dev_ctx, bboxes, c, &bbox_slice);
-      }
-      NMSFast(bbox_slice, score_slice, score_threshold, nms_threshold, nms_eta,
-              nms_top_k, &((*indices)[c]), normalized);
-      if (scores_size == 2) {
-        std::stable_sort((*indices)[c].begin(), (*indices)[c].end());
-      }
-      num_det += (*indices)[c].size();
-    }
-
-    *num_nmsed_out = num_det;
-    const T* scores_data = scores.data<T>();
-    if (keep_top_k > -1 && num_det > keep_top_k) {
-      const T* sdata;
-      std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-      for (const auto& it : *indices) {
-        int label = it.first;
-        if (scores_size == 3) {
-          sdata = scores_data + label * scores.dims()[1];
-        } else {
-          score_slice.Resize({scores.dims()[0], 1});
-          SliceOneClass<T>(dev_ctx, scores, label, &score_slice);
-          sdata = score_slice.data<T>();
-        }
-        const std::vector<int>& label_indices = it.second;
-        for (size_t j = 0; j < label_indices.size(); ++j) {
-          int idx = label_indices[j];
-          score_index_pairs.push_back(
-              std::make_pair(sdata[idx], std::make_pair(label, idx)));
-        }
-      }
-      // Keep top k results per image.
-      std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
-                       SortScorePairDescend<std::pair<int, int>>);
-      score_index_pairs.resize(keep_top_k);
-
-      // Store the new indices.
-      std::map<int, std::vector<int>> new_indices;
-      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
-        int label = score_index_pairs[j].second.first;
-        int idx = score_index_pairs[j].second.second;
-        new_indices[label].push_back(idx);
-      }
-      if (scores_size == 2) {
-        for (const auto& it : new_indices) {
-          int label = it.first;
-          std::stable_sort(new_indices[label].begin(),
-                           new_indices[label].end());
-        }
-      }
-      new_indices.swap(*indices);
-      *num_nmsed_out = keep_top_k;
-    }
-  }
-
-  void MultiClassOutput(const platform::DeviceContext& ctx,
-                        const Tensor& scores, const Tensor& bboxes,
-                        const std::map<int, std::vector<int>>& selected_indices,
-                        const int scores_size, Tensor* outs,
-                        int* oindices = nullptr, const int offset = 0) const {
-    int64_t class_num = scores.dims()[1];
-    int64_t predict_dim = scores.dims()[1];
-    int64_t box_size = bboxes.dims()[1];
-    if (scores_size == 2) {
-      box_size = bboxes.dims()[2];
-    }
-    int64_t out_dim = box_size + 2;
-    auto* scores_data = scores.data<T>();
-    auto* bboxes_data = bboxes.data<T>();
-    auto* odata = outs->data<T>();
-    const T* sdata;
-    Tensor bbox;
-    bbox.Resize({scores.dims()[0], box_size});
-    int count = 0;
-    for (const auto& it : selected_indices) {
-      int label = it.first;
-      const std::vector<int>& indices = it.second;
-      if (scores_size == 2) {
-        SliceOneClass<T>(ctx, bboxes, label, &bbox);
-      } else {
-        sdata = scores_data + label * predict_dim;
-      }
-      for (size_t j = 0; j < indices.size(); ++j) {
-        int idx = indices[j];
-        odata[count * out_dim] = label;  // label
-        const T* bdata;
-        if (scores_size == 3) {
-          bdata = bboxes_data + idx * box_size;
-          odata[count * out_dim + 1] = sdata[idx];  // score
-          if (oindices != nullptr) {
-            oindices[count] = offset + idx;
-          }
-        } else {
-          bdata = bbox.data<T>() + idx * box_size;
-          odata[count * out_dim + 1] = *(scores_data + idx * class_num + label);
-          if (oindices != nullptr) {
-            oindices[count] = offset + idx * class_num + label;
-          }
-        }
-        // xmin, ymin, xmax, ymax or multi-points coordinates
-        std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T));
-        count++;
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* boxes = ctx.Input<LoDTensor>("BBoxes");
-    auto* scores = ctx.Input<LoDTensor>("Scores");
-    auto* outs = ctx.Output<LoDTensor>("Out");
-    bool return_index = ctx.HasOutput("Index") ? true : false;
-    auto index = ctx.Output<LoDTensor>("Index");
-    auto score_dims = scores->dims();
-    auto score_size = score_dims.size();
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-
-    std::vector<std::map<int, std::vector<int>>> all_indices;
-    std::vector<size_t> batch_starts = {0};
-    int64_t batch_size = score_dims[0];
-    int64_t box_dim = boxes->dims()[2];
-    int64_t out_dim = box_dim + 2;
-    int num_nmsed_out = 0;
-    Tensor boxes_slice, scores_slice;
-    int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1;
-    for (int i = 0; i < n; ++i) {
-      if (score_size == 3) {
-        scores_slice = scores->Slice(i, i + 1);
-        scores_slice.Resize({score_dims[1], score_dims[2]});
-        boxes_slice = boxes->Slice(i, i + 1);
-        boxes_slice.Resize({score_dims[2], box_dim});
-      } else {
-        auto boxes_lod = boxes->lod().back();
-        scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]);
-        boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]);
-      }
-      std::map<int, std::vector<int>> indices;
-      MultiClassNMS(ctx, scores_slice, boxes_slice, score_size, &indices,
-                    &num_nmsed_out);
-      all_indices.push_back(indices);
-      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
-    }
-
-    int num_kept = batch_starts.back();
-    if (num_kept == 0) {
-      if (return_index) {
-        outs->mutable_data<T>({0, out_dim}, ctx.GetPlace());
-        index->mutable_data<int>({0, 1}, ctx.GetPlace());
-      } else {
-        T* od = outs->mutable_data<T>({1, 1}, ctx.GetPlace());
-        od[0] = -1;
-        batch_starts = {0, 1};
-      }
-    } else {
-      outs->mutable_data<T>({num_kept, out_dim}, ctx.GetPlace());
-      int offset = 0;
-      int* oindices = nullptr;
-      for (int i = 0; i < n; ++i) {
-        if (score_size == 3) {
-          scores_slice = scores->Slice(i, i + 1);
-          boxes_slice = boxes->Slice(i, i + 1);
-          scores_slice.Resize({score_dims[1], score_dims[2]});
-          boxes_slice.Resize({score_dims[2], box_dim});
-          if (return_index) {
-            offset = i * score_dims[2];
-          }
-        } else {
-          auto boxes_lod = boxes->lod().back();
-          scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]);
-          boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]);
-          if (return_index) {
-            offset = boxes_lod[i] * score_dims[1];
-          }
-        }
-        int64_t s = batch_starts[i];
-        int64_t e = batch_starts[i + 1];
-        if (e > s) {
-          Tensor out = outs->Slice(s, e);
-          if (return_index) {
-            int* output_idx =
-                index->mutable_data<int>({num_kept, 1}, ctx.GetPlace());
-            oindices = output_idx + s;
-          }
-          MultiClassOutput(dev_ctx, scores_slice, boxes_slice, all_indices[i],
-                           score_dims.size(), &out, oindices, offset);
-        }
-      }
-    }
-
-    framework::LoD lod;
-    lod.emplace_back(batch_starts);
-    if (return_index) {
-      index->set_lod(lod);
-    }
-    outs->set_lod(lod);
-  }
-};
-
-class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("BBoxes",
-             "Two types of bboxes are supported:"
-             "1. (Tensor) A 3-D Tensor with shape "
-             "[N, M, 4 or 8 16 24 32] represents the "
-             "predicted locations of M bounding bboxes, N is the batch size. "
-             "Each bounding box has four coordinate values and the layout is "
-             "[xmin, ymin, xmax, ymax], when box size equals to 4."
-             "2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]"
-             "M is the number of bounding boxes, C is the class number");
-    AddInput("Scores",
-             "Two types of scores are supported:"
-             "1. (Tensor) A 3-D Tensor with shape [N, C, M] represents the "
-             "predicted confidence predictions. N is the batch size, C is the "
-             "class number, M is number of bounding boxes. For each category "
-             "there are total M scores which corresponding M bounding boxes. "
-             " Please note, M is equal to the 2nd dimension of BBoxes. "
-             "2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. "
-             "M is the number of bbox, C is the class number. In this case, "
-             "Input BBoxes should be the second case with shape [M, C, 4].");
-    AddAttr<int>(
-        "background_label",
-        "(int, default: 0) "
-        "The index of background label, the background label will be ignored. "
-        "If set to -1, then all categories will be considered.")
-        .SetDefault(0);
-    AddAttr<float>("score_threshold",
-                   "(float) "
-                   "Threshold to filter out bounding boxes with low "
-                   "confidence score. If not provided, consider all boxes.");
-    AddAttr<int>("nms_top_k",
-                 "(int64_t) "
-                 "Maximum number of detections to be kept according to the "
-                 "confidences aftern the filtering detections based on "
-                 "score_threshold");
-    AddAttr<float>("nms_threshold",
-                   "(float, default: 0.3) "
-                   "The threshold to be used in NMS.")
-        .SetDefault(0.3);
-    AddAttr<float>("nms_eta",
-                   "(float) "
-                   "The parameter for adaptive NMS.")
-        .SetDefault(1.0);
-    AddAttr<int>("keep_top_k",
-                 "(int64_t) "
-                 "Number of total bboxes to be kept per image after NMS "
-                 "step. -1 means keeping all bboxes after NMS step.");
-    AddAttr<bool>("normalized",
-                  "(bool, default true) "
-                  "Whether detections are normalized.")
-        .SetDefault(true);
-    AddOutput("Out",
-              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
-              "detections. Each row has 6 values: "
-              "[label, confidence, xmin, ymin, xmax, ymax] or "
-              "(LoDTensor) A 2-D LoDTensor with shape [No, 10] represents the "
-              "detections. Each row has 10 values: "
-              "[label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the "
-              "total number of detections in this mini-batch."
-              "For each instance, "
-              "the offsets in first dimension are called LoD, the number of "
-              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
-              "no detected bbox.");
-    AddComment(R"DOC(
-This operator is to do multi-class non maximum suppression (NMS) on a batched
-of boxes and scores.
-In the NMS step, this operator greedily selects a subset of detection bounding
-boxes that have high scores larger than score_threshold, if providing this
-threshold, then selects the largest nms_top_k confidences scores if nms_top_k
-is larger than -1. Then this operator pruns away boxes that have high IOU
-(intersection over union) overlap with already selected boxes by adaptive
-threshold NMS based on parameters of nms_threshold and nms_eta.
-Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
-per image if keep_top_k is larger than -1.
-This operator support multi-class and batched inputs. It applying NMS
-independently for each class. The outputs is a 2-D LoDTenosr, for each
-image, the offsets in first dimension of LoDTensor are called LoD, the number
-of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
-means there is no detected bbox for this image.
-)DOC");
-  }
-};
-
-class MultiClassNMS2Op : public MultiClassNMSOp {
- public:
-  MultiClassNMS2Op(const std::string& type,
-                   const framework::VariableNameMap& inputs,
-                   const framework::VariableNameMap& outputs,
-                   const framework::AttributeMap& attrs)
-      : MultiClassNMSOp(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    MultiClassNMSOp::InferShape(ctx);
-
-    auto box_dims = ctx->GetInputDim("BBoxes");
-    auto score_dims = ctx->GetInputDim("Scores");
-    auto score_size = score_dims.size();
-    if (score_size == 3) {
-      ctx->SetOutputDim("Index", {box_dims[1], 1});
-    } else {
-      ctx->SetOutputDim("Index", {-1, 1});
-    }
-  }
-};
-
-class MultiClassNMS2OpMaker : public MultiClassNMSOpMaker {
- public:
-  void Make() override {
-    MultiClassNMSOpMaker::Make();
-    AddOutput("Index",
-              "(LoDTensor) A 2-D LoDTensor with shape [No, 1] represents the "
-              "index of selected bbox. The index is the absolute index cross "
-              "batches.")
-        .AsIntermediate();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp,
-                  ops::MultiClassNMSOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
-REGISTER_OPERATOR(multiclass_nms2, ops::MultiClassNMS2Op,
-                  ops::MultiClassNMS2OpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(multiclass_nms2, ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
diff --git a/paddle/fluid/operators/detection/poly_util.cc b/paddle/fluid/operators/detection/poly_util.cc
deleted file mode 100644
index 1af2c95c6cf526d651b196b54614a21a9cddde8c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/poly_util.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef POLY_UTIL_CC_
-#define POLY_UTIL_CC_
-
-#include "paddle/fluid/operators/detection/poly_util.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using gpc::gpc_polygon_clip;
-using gpc::gpc_free_polygon;
-
-template <class T>
-void Array2PointVec(const T*& box, const size_t box_size,
-                    std::vector<Point_<T>>& vec) {
-  size_t pts_num = box_size / 2;
-  vec.resize(pts_num);
-  for (size_t i = 0; i < pts_num; i++) {
-    vec.at(i).x = box[2 * i];
-    vec.at(i).y = box[2 * i + 1];
-  }
-}
-
-template <class T>
-void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly) {
-  size_t pts_num = box_size / 2;
-  poly.num_contours = 1;
-  poly.hole = (int*)malloc(sizeof(int));
-  poly.hole[0] = 0;
-  poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
-  poly.contour->num_vertices = pts_num;
-  poly.contour->vertex =
-      (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
-  for (size_t i = 0; i < pts_num; ++i) {
-    poly.contour->vertex[i].x = box[2 * i];
-    poly.contour->vertex[i].y = box[2 * i + 1];
-  }
-}
-
-template <class T>
-void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly) {
-  int pts_num = vec.size();
-  poly.num_contours = 1;
-  poly.hole = (int*)malloc(sizeof(int));
-  poly.hole[0] = 0;
-  poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
-  poly.contour->num_vertices = pts_num;
-  poly.contour->vertex =
-      (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
-  for (size_t i = 0; i < pts_num; ++i) {
-    poly.contour->vertex[i].x = vec[i].x;
-    poly.contour->vertex[i].y = vec[i].y;
-  }
-}
-
-template <class T>
-void Poly2PointVec(const gpc::gpc_vertex_list& contour,
-                   std::vector<Point_<T>>& vec) {
-  int pts_num = contour.num_vertices;
-  vec.resize(pts_num);
-  for (int i = 0; i < pts_num; i++) {
-    vec.at(i).x = contour.vertex[i].x;
-    vec.at(i).y = contour.vertex[i].y;
-  }
-}
-
-template <class T>
-T GetContourArea(std::vector<Point_<T>>& vec) {
-  size_t pts_num = vec.size();
-  if (pts_num < 3) return T(0.);
-  T area = T(0.);
-  for (size_t i = 0; i < pts_num; ++i) {
-    area += vec[i].x * vec[(i + 1) % pts_num].y -
-            vec[i].y * vec[(i + 1) % pts_num].x;
-  }
-  return std::fabs(area / 2.0);
-}
-
-template <class T>
-T PolyArea(const T* box, const size_t box_size, const bool normalized) {
-  // If coordinate values are is invalid
-  // if area size <= 0,  return 0.
-  std::vector<Point_<T>> vec;
-  Array2PointVec<T>(box, box_size, vec);
-  return GetContourArea<T>(vec);
-}
-
-template <class T>
-T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
-                  const bool normalized) {
-  gpc::gpc_polygon poly1;
-  gpc::gpc_polygon poly2;
-  Array2Poly<T>(box1, box_size, poly1);
-  Array2Poly<T>(box2, box_size, poly2);
-  gpc::gpc_polygon respoly;
-  gpc::gpc_op op = gpc::GPC_INT;
-  gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly);
-
-  T inter_area = T(0.);
-  int contour_num = respoly.num_contours;
-  for (int i = 0; i < contour_num; ++i) {
-    std::vector<Point_<T>> resvec;
-    Poly2PointVec<T>(respoly.contour[i], resvec);
-    // inter_area += std::fabs(cv::contourArea(resvec)) + 0.5f *
-    // (cv::arcLength(resvec, true));
-    inter_area += GetContourArea<T>(resvec);
-  }
-
-  gpc::gpc_free_polygon(&poly1);
-  gpc::gpc_free_polygon(&poly2);
-  gpc::gpc_free_polygon(&respoly);
-  return inter_area;
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/operators/detection/poly_util.h b/paddle/fluid/operators/detection/poly_util.h
deleted file mode 100644
index f07baf72d9ff07b8fcb45dcfb2a35741fb1aeed0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/poly_util.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef POLY_UTIL_H_
-#define POLY_UTIL_H_
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detection/gpc.h"
-
-namespace paddle {
-namespace operators {
-
-template <class T>
-class Point_ {
- public:
-  // default constructor
-  Point_() {}
-  Point_(T _x, T _y) {}
-  Point_(const Point_& pt) {}
-
-  Point_& operator=(const Point_& pt);
-  // conversion to another data type
-  // template<typename _T> operator Point_<_T>() const;
-  // conversion to the old-style C structures
-  // operator Vec<T, 2>() const;
-
-  // checks whether the point is inside the specified rectangle
-  // bool inside(const Rect_<T>& r) const;
-  T x;  //!< x coordinate of the point
-  T y;  //!< y coordinate of the point
-};
-
-template <class T>
-void Array2PointVec(const T*& box, const size_t box_size,
-                    std::vector<Point_<T>>& vec);
-
-template <class T>
-void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly);
-
-template <class T>
-void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly);
-
-template <class T>
-void Poly2PointVec(const gpc::gpc_vertex_list& contour,
-                   std::vector<Point_<T>>& vec);
-
-template <class T>
-T GetContourArea(std::vector<Point_<T>>& vec);
-
-template <class T>
-T PolyArea(const T* box, const size_t box_size, const bool normalized);
-
-template <class T>
-T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
-                  const bool normalized);
-}  // namespace operators
-}  // namespace paddle
-
-#include "paddle/fluid/operators/detection/poly_util.cc"
-
-#endif  // POLY_UTIL_H_
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
deleted file mode 100644
index 4b3bc2edb58fe23393d906094c41b6ad62c71155..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto* in = ctx.Input<Tensor>("Input");
-    auto in_dims = in->dims();
-    const T* in_data = in->data<T>();
-    auto* out = ctx.Output<Tensor>("Output");
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    int batch_size = in_dims[0];
-    int geo_channel = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int id = 0;
-    for (int id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
-      for (int id_h = 0; id_h < height; ++id_h) {
-        for (int id_w = 0; id_w < width; ++id_w) {
-          id = id_n * height * width + width * id_h + id_w;
-          if (id_n % 2 == 0) {
-            out_data[id] = id_w * 4 - in_data[id];
-          } else {
-            out_data[id] = id_h * 4 - in_data[id];
-          }
-        }
-      }
-    }
-  }
-};
-
-class PolygonBoxTransformOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("Input"),
-        "Input (Input) of polygon_box transform op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Output"),
-        "Output (Output) of polygon_box transform op should not be null.");
-
-    auto in_dim = ctx->GetInputDim("Input");
-
-    PADDLE_ENFORCE_EQ(in_dim.size(), 4, "input's rank must be 4.");
-    PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0,
-                      "input's second dimension must be even.");
-
-    ctx->SetOutputDim("Output", in_dim);
-  }
-};
-
-class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "Input",
-        "The input with shape [batch_size, geometry_channels, height, width]");
-    AddOutput("Output", "The output with the same shape as input");
-
-    AddComment(R"DOC(
-PolygonBoxTransform Operator.
-
-PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
-
-The input is the final geometry output in detection network.
-We use 2*n numbers to denote the coordinate shift from n corner vertices of
-the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
-the geometry output contains 2*n channels.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(polygon_box_transform, ops::PolygonBoxTransformOp,
-                  ops::PolygonBoxTransformOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    polygon_box_transform,
-    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, float>,
-    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
deleted file mode 100644
index e1eaf084a3413dd1d13514e2d7b22572d21dd119..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using platform::PADDLE_CUDA_NUM_THREADS;
-#define CUDA_BLOCK_SIZE 16
-
-template <typename T>
-__global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
-                                          const T* input, T* output) {
-  int id_n = threadIdx.x + blockDim.x * blockIdx.x;
-  int id_h = threadIdx.y + blockDim.y * blockIdx.y;
-  int id_w = threadIdx.z + blockDim.z * blockIdx.z;
-  if (id_n < n && id_h < h && id_w < w) {
-    int id = id_n * h * w + w * id_h + id_w;
-    if (id_n % 2 == 0) {
-      output[id] = id_w * 4 - input[id];
-    } else {
-      output[id] = id_h * 4 - input[id];
-    }
-  }
-}
-
-template <typename T>
-class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto* in = ctx.Input<Tensor>("Input");
-    auto in_dims = in->dims();
-    const T* in_data = in->data<T>();
-    auto* out = ctx.Output<Tensor>("Output");
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    int batch_size = in_dims[0];
-    int geo_channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    dim3 threadsPerBlock(
-        PADDLE_CUDA_NUM_THREADS / (CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE),
-        CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE);
-    dim3 numBlocks((batch_size * geo_channels) / threadsPerBlock.x,
-                   (height + threadsPerBlock.y - 1) / threadsPerBlock.y,
-                   (width + threadsPerBlock.z - 1) / threadsPerBlock.z);
-    auto stream = ctx.cuda_device_context().stream();
-    PolygonBoxTransformKernel<T><<<numBlocks, threadsPerBlock, 0, stream>>>(
-        batch_size * geo_channels, height, width, in_data, out_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    polygon_box_transform,
-    paddle::operators::PolygonBoxTransformOpCUDAKernel<float>,
-    paddle::operators::PolygonBoxTransformOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc
deleted file mode 100644
index 9709cbc058900cfc64839b450484957b18604583..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/prior_box_op.cc
+++ /dev/null
@@ -1,245 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/prior_box_op.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class PriorBoxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of PriorBoxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Image"),
-                   "Input(Image) of PriorBoxOp should not be null.");
-
-    auto image_dims = ctx->GetInputDim("Image");
-    auto input_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW.");
-    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
-
-    PADDLE_ENFORCE_LT(input_dims[2], image_dims[2],
-                      "The height of input must smaller than image.");
-
-    PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
-                      "The width of input must smaller than image.");
-
-    auto min_sizes = ctx->Attrs().Get<std::vector<float>>("min_sizes");
-    auto max_sizes = ctx->Attrs().Get<std::vector<float>>("max_sizes");
-    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
-    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
-    bool flip = ctx->Attrs().Get<bool>("flip");
-
-    std::vector<float> aspect_ratios_vec;
-    ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
-
-    size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
-    if (max_sizes.size() > 0) {
-      PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(),
-                        "The number of min_size and max_size must be equal.");
-      num_priors += max_sizes.size();
-      for (size_t i = 0; i < max_sizes.size(); ++i) {
-        PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i],
-                          "max_size[%d] must be greater than min_size[%d].", i,
-                          i);
-      }
-    }
-
-    std::vector<int64_t> dim_vec(4);
-    dim_vec[0] = input_dims[2];
-    dim_vec[1] = input_dims[3];
-    dim_vec[2] = num_priors;
-    dim_vec[3] = 4;
-    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
-    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input_input_type = ctx.Input<framework::Tensor>("Input")->type();
-
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
-#ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
-      auto input_image_type = ctx.Input<framework::Tensor>("Image")->type();
-      int customized_type_value =
-          framework::OpKernelType::kDefaultCustomizedTypeValue;
-      if (input_image_type == framework::DataTypeTrait<float>::DataType()) {
-        customized_type_value = kPriorBoxFLOAT;
-      } else if (input_image_type ==
-                 framework::DataTypeTrait<double>::DataType()) {
-        customized_type_value = kPriorBoxDOUBLE;
-      }
-      return framework::OpKernelType(input_input_type, ctx.GetPlace(), layout_,
-                                     library_, customized_type_value);
-    }
-#endif
-    return framework::OpKernelType(input_input_type, ctx.GetPlace(), layout_,
-                                   library_);
-  }
-};
-
-class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(Tensor, default Tensor<float>), "
-             "the input feature data of PriorBoxOp, The layout is NCHW.");
-    AddInput("Image",
-             "(Tensor, default Tensor<float>), "
-             "the input image data of PriorBoxOp, The layout is NCHW.");
-    AddOutput("Boxes",
-              "(Tensor, default Tensor<float>), the output prior boxes of "
-              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
-              "H is the height of input, W is the width of input, num_priors "
-              "is the box count of each position.");
-    AddOutput("Variances",
-              "(Tensor, default Tensor<float>), the expanded variances of "
-              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
-              "H is the height of input, W is the width of input, num_priors "
-              "is the box count of each position.");
-
-    AddAttr<std::vector<float>>("min_sizes",
-                                "(vector<float>) List of min sizes "
-                                "of generated prior boxes.")
-        .AddCustomChecker([](const std::vector<float>& min_sizes) {
-          PADDLE_ENFORCE_GT(min_sizes.size(), 0,
-                            "Size of min_sizes must be at least 1.");
-          for (size_t i = 0; i < min_sizes.size(); ++i) {
-            PADDLE_ENFORCE_GT(min_sizes[i], 0.0,
-                              "min_sizes[%d] must be positive.", i);
-          }
-        });
-    AddAttr<std::vector<float>>(
-        "max_sizes",
-        "(vector<float>) List of max sizes of generated prior boxes.")
-        .SetDefault(std::vector<float>{});
-    AddAttr<std::vector<float>>(
-        "aspect_ratios",
-        "(vector<float>) List of aspect ratios of generated prior boxes.");
-
-    AddAttr<std::vector<float>>(
-        "variances",
-        "(vector<float>) List of variances to be encoded in prior boxes.")
-        .AddCustomChecker([](const std::vector<float>& variances) {
-          PADDLE_ENFORCE_EQ(variances.size(), 4,
-                            "Must and only provide 4 variance.");
-          for (size_t i = 0; i < variances.size(); ++i) {
-            PADDLE_ENFORCE_GT(variances[i], 0.0,
-                              "variance[%d] must be greater than 0.", i);
-          }
-        });
-    AddAttr<bool>("flip", "(bool) Whether to flip aspect ratios.")
-        .SetDefault(true);
-    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
-        .SetDefault(true);
-
-    AddAttr<float>("step_w",
-                   "Prior boxes step across width, 0.0 for auto calculation.")
-        .SetDefault(0.0)
-        .AddCustomChecker([](const float& step_w) {
-          PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0.");
-        });
-    AddAttr<float>("step_h",
-                   "Prior boxes step across height, 0.0 for auto calculation.")
-        .SetDefault(0.0)
-        .AddCustomChecker([](const float& step_h) {
-          PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0.");
-        });
-
-    AddAttr<float>("offset",
-                   "(float) "
-                   "Prior boxes center offset.")
-        .SetDefault(0.5);
-    AddAttr<bool>(
-        "min_max_aspect_ratios_order",
-        "(bool) If set True, the output prior box is in order of"
-        "[min, max, aspect_ratios], which is consistent with Caffe."
-        "Please note, this order affects the weights order of convolution layer"
-        "followed by and does not affect the final detection results.")
-        .SetDefault(false);
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Prior box operator
-Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
-Each position of the input produce N prior boxes, N is determined by
- the count of min_sizes, max_sizes and aspect_ratios, The size of the
- box is in range(min_size, max_size) interval, which is generated in
- sequence according to the aspect_ratios.
-
-Please get more information from the following papers:
-https://arxiv.org/abs/1512.02325.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(prior_box, ops::PriorBoxOpKernel<float, float>,
-                       ops::PriorBoxOpKernel<double, double>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN,
-                                    ::paddle::platform::CPUPlace, FF,
-                                    ops::kPriorBoxFLOAT,
-                                    ops::PriorBoxOpKernel<float, float>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN,
-                                    ::paddle::platform::CPUPlace, DD,
-                                    ops::kPriorBoxDOUBLE,
-                                    ops::PriorBoxOpKernel<double, double>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN,
-                                    ::paddle::platform::CPUPlace, U8F,
-                                    ops::kPriorBoxFLOAT,
-                                    ops::PriorBoxOpKernel<uint8_t, float>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN,
-                                    ::paddle::platform::CPUPlace, S8F,
-                                    ops::kPriorBoxFLOAT,
-                                    ops::PriorBoxOpKernel<int8_t, float>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN,
-                                    ::paddle::platform::CPUPlace, U8D,
-                                    ops::kPriorBoxDOUBLE,
-                                    ops::PriorBoxOpKernel<uint8_t, double>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN,
-                                    ::paddle::platform::CPUPlace, S8D,
-                                    ops::kPriorBoxDOUBLE,
-                                    ops::PriorBoxOpKernel<int8_t, double>);
diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu
deleted file mode 100644
index 1ea8cfc1d2af8cc6c332768a467cdcd4c0166319..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/prior_box_op.cu
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/prior_box_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__device__ inline T clip(T in) {
-  return min(max(in, 0.), 1.);
-}
-
-template <typename T>
-__global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
-                            const int width, const int im_height,
-                            const int im_width, const int as_num,
-                            const T offset, const T step_width,
-                            const T step_height, const T* min_sizes,
-                            const T* max_sizes, const int min_num, bool is_clip,
-                            bool min_max_aspect_ratios_order) {
-  int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
-  int box_num = height * width * num_priors;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
-       i += blockDim.x * gridDim.x) {
-    int h = i / (num_priors * width);
-    int w = (i / num_priors) % width;
-    int p = i % num_priors;
-    int m = max_sizes ? p / (as_num + 1) : p / as_num;
-    T cx = (w + offset) * step_width;
-    T cy = (h + offset) * step_height;
-    T bw, bh;
-    T min_size = min_sizes[m];
-    if (max_sizes) {
-      int s = p % (as_num + 1);
-      if (!min_max_aspect_ratios_order) {
-        if (s < as_num) {
-          T ar = aspect_ratios[s];
-          bw = min_size * sqrt(ar) / 2.;
-          bh = min_size / sqrt(ar) / 2.;
-        } else {
-          T max_size = max_sizes[m];
-          bw = sqrt(min_size * max_size) / 2.;
-          bh = bw;
-        }
-      } else {
-        if (s == 0) {
-          bw = bh = min_size / 2.;
-        } else if (s == 1) {
-          T max_size = max_sizes[m];
-          bw = sqrt(min_size * max_size) / 2.;
-          bh = bw;
-        } else {
-          T ar = aspect_ratios[s - 1];
-          bw = min_size * sqrt(ar) / 2.;
-          bh = min_size / sqrt(ar) / 2.;
-        }
-      }
-    } else {
-      int s = p % as_num;
-      T ar = aspect_ratios[s];
-      bw = min_size * sqrt(ar) / 2.;
-      bh = min_size / sqrt(ar) / 2.;
-    }
-    T xmin = (cx - bw) / im_width;
-    T ymin = (cy - bh) / im_height;
-    T xmax = (cx + bw) / im_width;
-    T ymax = (cy + bh) / im_height;
-    out[i * 4] = is_clip ? clip<T>(xmin) : xmin;
-    out[i * 4 + 1] = is_clip ? clip<T>(ymin) : ymin;
-    out[i * 4 + 2] = is_clip ? clip<T>(xmax) : xmax;
-    out[i * 4 + 3] = is_clip ? clip<T>(ymax) : ymax;
-  }
-}
-
-template <typename T>
-__global__ void SetVariance(T* out, const T* var, const int vnum,
-                            const int num) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    out[i] = var[i % vnum];
-  }
-}
-
-template <typename T>
-class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
-    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
-    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
-    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
-
-    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
-    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
-    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
-    auto variances = ctx.Attr<std::vector<float>>("variances");
-    auto flip = ctx.Attr<bool>("flip");
-    auto clip = ctx.Attr<bool>("clip");
-    auto min_max_aspect_ratios_order =
-        ctx.Attr<bool>("min_max_aspect_ratios_order");
-
-    std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
-
-    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
-    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
-    T offset = static_cast<T>(ctx.Attr<float>("offset"));
-
-    auto im_width = image->dims()[3];
-    auto im_height = image->dims()[2];
-
-    auto width = input->dims()[3];
-    auto height = input->dims()[2];
-
-    T step_width, step_height;
-    if (step_w == 0 || step_h == 0) {
-      step_width = static_cast<T>(im_width) / width;
-      step_height = static_cast<T>(im_height) / height;
-    } else {
-      step_width = step_w;
-      step_height = step_h;
-    }
-
-    int num_priors = aspect_ratios.size() * min_sizes.size();
-    if (max_sizes.size() > 0) {
-      num_priors += max_sizes.size();
-    }
-    int min_num = static_cast<int>(min_sizes.size());
-    int box_num = width * height * num_priors;
-
-    int block = 512;
-    int grid = (box_num + block - 1) / block;
-
-    auto stream =
-        ctx.template device_context<platform::CUDADeviceContext>().stream();
-
-    boxes->mutable_data<T>(ctx.GetPlace());
-    vars->mutable_data<T>(ctx.GetPlace());
-
-    framework::Tensor r;
-    framework::TensorFromVector(aspect_ratios, ctx.device_context(), &r);
-
-    framework::Tensor min;
-    framework::TensorFromVector(min_sizes, ctx.device_context(), &min);
-
-    T* max_data = nullptr;
-    framework::Tensor max;
-    if (max_sizes.size() > 0) {
-      framework::TensorFromVector(max_sizes, ctx.device_context(), &max);
-      max_data = max.data<T>();
-    }
-
-    GenPriorBox<T><<<grid, block, 0, stream>>>(
-        boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
-        aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
-        max_data, min_num, clip, min_max_aspect_ratios_order);
-
-    framework::Tensor v;
-    framework::TensorFromVector(variances, ctx.device_context(), &v);
-    grid = (box_num * 4 + block - 1) / block;
-    SetVariance<T><<<grid, block, 0, stream>>>(vars->data<T>(), v.data<T>(),
-                                               variances.size(), box_num * 4);
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(prior_box, ops::PriorBoxOpCUDAKernel<float>,
-                        ops::PriorBoxOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h
deleted file mode 100644
index 71c67b44eafd109eca6f85cfc4b1b2af603d67c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/prior_box_op.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-constexpr int kPriorBoxFLOAT = 1;
-constexpr int kPriorBoxDOUBLE = 2;
-
-inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
-                               bool flip,
-                               std::vector<float>* output_aspect_ratior) {
-  constexpr float epsilon = 1e-6;
-  output_aspect_ratior->clear();
-  output_aspect_ratior->push_back(1.0f);
-  for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
-    float ar = input_aspect_ratior[i];
-    bool already_exist = false;
-    for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
-      if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
-        already_exist = true;
-        break;
-      }
-    }
-    if (!already_exist) {
-      output_aspect_ratior->push_back(ar);
-      if (flip) {
-        output_aspect_ratior->push_back(1.0f / ar);
-      }
-    }
-  }
-}
-
-template <typename T, typename K>
-class PriorBoxOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
-    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
-    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
-    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
-
-    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
-    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
-    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
-    auto variances = ctx.Attr<std::vector<float>>("variances");
-    auto flip = ctx.Attr<bool>("flip");
-    auto clip = ctx.Attr<bool>("clip");
-    auto min_max_aspect_ratios_order =
-        ctx.Attr<bool>("min_max_aspect_ratios_order");
-
-    std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
-
-    K step_w = static_cast<K>(ctx.Attr<float>("step_w"));
-    K step_h = static_cast<K>(ctx.Attr<float>("step_h"));
-    K offset = static_cast<K>(ctx.Attr<float>("offset"));
-
-    auto img_width = image->dims()[3];
-    auto img_height = image->dims()[2];
-
-    auto feature_width = input->dims()[3];
-    auto feature_height = input->dims()[2];
-
-    K step_width, step_height;
-    if (step_w == 0 || step_h == 0) {
-      step_width = static_cast<K>(img_width) / feature_width;
-      step_height = static_cast<K>(img_height) / feature_height;
-    } else {
-      step_width = step_w;
-      step_height = step_h;
-    }
-
-    int num_priors = aspect_ratios.size() * min_sizes.size();
-    if (max_sizes.size() > 0) {
-      num_priors += max_sizes.size();
-    }
-
-    boxes->mutable_data<K>(ctx.GetPlace());
-    vars->mutable_data<K>(ctx.GetPlace());
-
-    K* b_t = boxes->data<K>();
-    for (int h = 0; h < feature_height; ++h) {
-      for (int w = 0; w < feature_width; ++w) {
-        K center_x = (w + offset) * step_width;
-        K center_y = (h + offset) * step_height;
-        K box_width, box_height;
-        for (size_t s = 0; s < min_sizes.size(); ++s) {
-          auto min_size = min_sizes[s];
-          if (min_max_aspect_ratios_order) {
-            box_width = box_height = min_size / 2.;
-            b_t[0] = (center_x - box_width) / img_width;
-            b_t[1] = (center_y - box_height) / img_height;
-            b_t[2] = (center_x + box_width) / img_width;
-            b_t[3] = (center_y + box_height) / img_height;
-            b_t += 4;
-            if (max_sizes.size() > 0) {
-              auto max_size = max_sizes[s];
-              // square prior with size sqrt(minSize * maxSize)
-              box_width = box_height = sqrt(min_size * max_size) / 2.;
-              b_t[0] = (center_x - box_width) / img_width;
-              b_t[1] = (center_y - box_height) / img_height;
-              b_t[2] = (center_x + box_width) / img_width;
-              b_t[3] = (center_y + box_height) / img_height;
-              b_t += 4;
-            }
-            // priors with different aspect ratios
-            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
-              float ar = aspect_ratios[r];
-              if (fabs(ar - 1.) < 1e-6) {
-                continue;
-              }
-              box_width = min_size * sqrt(ar) / 2.;
-              box_height = min_size / sqrt(ar) / 2.;
-              b_t[0] = (center_x - box_width) / img_width;
-              b_t[1] = (center_y - box_height) / img_height;
-              b_t[2] = (center_x + box_width) / img_width;
-              b_t[3] = (center_y + box_height) / img_height;
-              b_t += 4;
-            }
-          } else {
-            // priors with different aspect ratios
-            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
-              float ar = aspect_ratios[r];
-              box_width = min_size * sqrt(ar) / 2.;
-              box_height = min_size / sqrt(ar) / 2.;
-              b_t[0] = (center_x - box_width) / img_width;
-              b_t[1] = (center_y - box_height) / img_height;
-              b_t[2] = (center_x + box_width) / img_width;
-              b_t[3] = (center_y + box_height) / img_height;
-              b_t += 4;
-            }
-            if (max_sizes.size() > 0) {
-              auto max_size = max_sizes[s];
-              // square prior with size sqrt(minSize * maxSize)
-              box_width = box_height = sqrt(min_size * max_size) / 2.;
-              b_t[0] = (center_x - box_width) / img_width;
-              b_t[1] = (center_y - box_height) / img_height;
-              b_t[2] = (center_x + box_width) / img_width;
-              b_t[3] = (center_y + box_height) / img_height;
-              b_t += 4;
-            }
-          }
-        }
-      }
-    }
-
-    if (clip) {
-      K* dt = boxes->data<K>();
-      std::transform(dt, dt + boxes->numel(), dt, [](K v) -> K {
-        return std::min<K>(std::max<K>(v, 0.), 1.);
-      });
-    }
-
-    framework::Tensor var_t;
-    var_t.mutable_data<K>(
-        framework::make_ddim({1, static_cast<int>(variances.size())}),
-        ctx.GetPlace());
-    auto var_et = framework::EigenTensor<K, 2>::From(var_t);
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-    for (size_t i = 0; i < variances.size(); ++i) {
-      var_et(0, i) = variances[i];
-    }
-
-    int box_num = feature_height * feature_width * num_priors;
-    auto var_dim = vars->dims();
-    vars->Resize({box_num, static_cast<int>(variances.size())});
-
-    auto e_vars = framework::EigenMatrix<K, Eigen::RowMajor>::From(*vars);
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(2)
-#endif
-    for (int i = 0; i < box_num; ++i) {
-      for (int j = 0; j < variances.size(); ++j) {
-        e_vars(i, j) = variances[j];
-      }
-    }
-    vars->Resize(var_dim);
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc
deleted file mode 100644
index 4a6dfec12e660431844682694632a3b18d91bf3e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc
+++ /dev/null
@@ -1,566 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-limitations under the License. */
-
-#include <glog/logging.h>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-class RetinanetDetectionOutputOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_GE(
-        ctx->Inputs("BBoxes").size(), 1UL,
-        "Input(BBoxes) of RetinanetDetectionOutput should not be null.");
-    PADDLE_ENFORCE_GE(
-        ctx->Inputs("Scores").size(), 1UL,
-        "Input(Scores) of RetinanetDetectionOutput should not be null.");
-    PADDLE_ENFORCE_GE(
-        ctx->Inputs("Anchors").size(), 1UL,
-        "Input(Anchors) of RetinanetDetectionOutput should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->Inputs("BBoxes").size(), ctx->Inputs("Scores").size(),
-        "Input tensors(BBoxes and Scores) should have the same size.");
-    PADDLE_ENFORCE_EQ(
-        ctx->Inputs("BBoxes").size(), ctx->Inputs("Anchors").size(),
-        "Input tensors(BBoxes and Anchors) should have the same size.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("ImInfo"),
-        "Input(ImInfo) of RetinanetDetectionOutput should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(Out) of RetinanetDetectionOutput should not be null.");
-
-    auto bboxes_dims = ctx->GetInputsDim("BBoxes");
-    auto scores_dims = ctx->GetInputsDim("Scores");
-    auto anchors_dims = ctx->GetInputsDim("Anchors");
-    auto im_info_dims = ctx->GetInputDim("ImInfo");
-
-    const size_t b_n = bboxes_dims.size();
-    PADDLE_ENFORCE_GT(b_n, 0, "Input bbox tensors count should > 0.");
-    const size_t s_n = scores_dims.size();
-    PADDLE_ENFORCE_GT(s_n, 0, "Input score tensors count should > 0.");
-    const size_t a_n = anchors_dims.size();
-    PADDLE_ENFORCE_GT(a_n, 0, "Input anchor tensors count should > 0.");
-
-    auto bbox_dims = bboxes_dims[0];
-    auto score_dims = scores_dims[0];
-    auto anchor_dims = anchors_dims[0];
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(score_dims.size(), 3,
-                        "The rank of Input(Scores) must be 3");
-      PADDLE_ENFORCE_EQ(bbox_dims.size(), 3,
-                        "The rank of Input(BBoxes) must be 3");
-      PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
-                        "The rank of Input(Anchors) must be 2");
-      PADDLE_ENFORCE(bbox_dims[2] == 4,
-                     "The last dimension of Input(BBoxes) must be 4, "
-                     "represents the layout of coordinate "
-                     "[xmin, ymin, xmax, ymax]");
-      PADDLE_ENFORCE_EQ(bbox_dims[1], score_dims[1],
-                        "The 2nd dimension of Input(BBoxes) must be equal to "
-                        "2nd dimension of Input(Scores), which represents the "
-                        "number of the predicted boxes.");
-
-      PADDLE_ENFORCE_EQ(anchor_dims[0], bbox_dims[1],
-                        "The 1st dimension of Input(Anchors) must be equal to "
-                        "2nd dimension of Input(BBoxes), which represents the "
-                        "number of the predicted boxes.");
-      PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
-                        "The rank of Input(ImInfo) must be 2.");
-    }
-    // Here the box_dims[0] is not the real dimension of output.
-    // It will be rewritten in the computing kernel.
-    ctx->SetOutputDim("Out", {bbox_dims[1], bbox_dims[2] + 2});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::GetDataTypeOfVar(ctx.MultiInputVar("Scores")[0]);
-
-    return framework::OpKernelType(input_data_type,
-                                   platform::CPUPlace());  // ctx.GetPlace());
-  }
-};
-
-template <class T>
-bool SortScorePairDescend(const std::pair<float, T>& pair1,
-                          const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-template <class T>
-bool SortScoreTwoPairDescend(const std::pair<float, std::pair<T, T>>& pair1,
-                             const std::pair<float, std::pair<T, T>>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-template <class T>
-static inline void GetMaxScoreIndex(
-    const std::vector<T>& scores, const T threshold, int top_k,
-    std::vector<std::pair<T, int>>* sorted_indices) {
-  for (size_t i = 0; i < scores.size(); ++i) {
-    if (scores[i] > threshold) {
-      sorted_indices->push_back(std::make_pair(scores[i], i));
-    }
-  }
-  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend<int>);
-  // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
-    sorted_indices->resize(top_k);
-  }
-}
-
-template <class T>
-static inline T BBoxArea(const std::vector<T>& box, const bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
-    }
-  }
-}
-
-template <class T>
-static inline T JaccardOverlap(const std::vector<T>& box1,
-                               const std::vector<T>& box2,
-                               const bool normalized) {
-  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
-      box2[3] < box1[1]) {
-    return static_cast<T>(0.);
-  } else {
-    const T inter_xmin = std::max(box1[0], box2[0]);
-    const T inter_ymin = std::max(box1[1], box2[1]);
-    const T inter_xmax = std::min(box1[2], box2[2]);
-    const T inter_ymax = std::min(box1[3], box2[3]);
-    T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
-    T inter_w = inter_xmax - inter_xmin + norm;
-    T inter_h = inter_ymax - inter_ymin + norm;
-    const T inter_area = inter_w * inter_h;
-    const T bbox1_area = BBoxArea<T>(box1, normalized);
-    const T bbox2_area = BBoxArea<T>(box2, normalized);
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
-}
-
-template <typename T>
-class RetinanetDetectionOutputKernel : public framework::OpKernel<T> {
- public:
-  void NMSFast(const std::vector<std::vector<T>>& cls_dets,
-               const T nms_threshold, const T eta,
-               std::vector<int>* selected_indices) const {
-    int64_t num_boxes = cls_dets.size();
-    std::vector<std::pair<T, int>> sorted_indices;
-    for (int64_t i = 0; i < num_boxes; ++i) {
-      sorted_indices.push_back(std::make_pair(cls_dets[i][4], i));
-    }
-    // Sort the score pair according to the scores in descending order
-    std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
-                     SortScorePairDescend<int>);
-    selected_indices->clear();
-    T adaptive_threshold = nms_threshold;
-
-    while (sorted_indices.size() != 0) {
-      const int idx = sorted_indices.front().second;
-      bool keep = true;
-      for (size_t k = 0; k < selected_indices->size(); ++k) {
-        if (keep) {
-          const int kept_idx = (*selected_indices)[k];
-          T overlap = T(0.);
-
-          overlap = JaccardOverlap<T>(cls_dets[idx], cls_dets[kept_idx], false);
-          keep = overlap <= adaptive_threshold;
-        } else {
-          break;
-        }
-      }
-      if (keep) {
-        selected_indices->push_back(idx);
-      }
-      sorted_indices.erase(sorted_indices.begin());
-      if (keep && eta < 1 && adaptive_threshold > 0.5) {
-        adaptive_threshold *= eta;
-      }
-    }
-  }
-
-  void DeltaScoreToPrediction(
-      const std::vector<T>& bboxes_data, const std::vector<T>& anchors_data,
-      T im_height, T im_width, T im_scale, int class_num,
-      const std::vector<std::pair<T, int>>& sorted_indices,
-      std::map<int, std::vector<std::vector<T>>>* preds) const {
-    im_height = static_cast<T>(round(im_height / im_scale));
-    im_width = static_cast<T>(round(im_width / im_scale));
-    T zero(0);
-    int i = 0;
-    for (const auto& it : sorted_indices) {
-      T score = it.first;
-      int idx = it.second;
-      int a = idx / class_num;
-      int c = idx % class_num;
-
-      int box_offset = a * 4;
-      T anchor_box_width =
-          anchors_data[box_offset + 2] - anchors_data[box_offset] + 1;
-      T anchor_box_height =
-          anchors_data[box_offset + 3] - anchors_data[box_offset + 1] + 1;
-      T anchor_box_center_x = anchors_data[box_offset] + anchor_box_width / 2;
-      T anchor_box_center_y =
-          anchors_data[box_offset + 1] + anchor_box_height / 2;
-      T target_box_center_x = 0, target_box_center_y = 0;
-      T target_box_width = 0, target_box_height = 0;
-      target_box_center_x =
-          bboxes_data[box_offset] * anchor_box_width + anchor_box_center_x;
-      target_box_center_y =
-          bboxes_data[box_offset + 1] * anchor_box_height + anchor_box_center_y;
-      target_box_width =
-          std::exp(bboxes_data[box_offset + 2]) * anchor_box_width;
-      target_box_height =
-          std::exp(bboxes_data[box_offset + 3]) * anchor_box_height;
-      T pred_box_xmin = target_box_center_x - target_box_width / 2;
-      T pred_box_ymin = target_box_center_y - target_box_height / 2;
-      T pred_box_xmax = target_box_center_x + target_box_width / 2 - 1;
-      T pred_box_ymax = target_box_center_y + target_box_height / 2 - 1;
-      pred_box_xmin = pred_box_xmin / im_scale;
-      pred_box_ymin = pred_box_ymin / im_scale;
-      pred_box_xmax = pred_box_xmax / im_scale;
-      pred_box_ymax = pred_box_ymax / im_scale;
-
-      pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero);
-      pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero);
-      pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero);
-      pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero);
-
-      std::vector<T> one_pred;
-      one_pred.push_back(pred_box_xmin);
-      one_pred.push_back(pred_box_ymin);
-      one_pred.push_back(pred_box_xmax);
-      one_pred.push_back(pred_box_ymax);
-      one_pred.push_back(score);
-      (*preds)[c].push_back(one_pred);
-      i++;
-    }
-  }
-
-  void MultiClassNMS(const std::map<int, std::vector<std::vector<T>>>& preds,
-                     int class_num, const int keep_top_k, const T nms_threshold,
-                     const T nms_eta, std::vector<std::vector<T>>* nmsed_out,
-                     int* num_nmsed_out) const {
-    std::map<int, std::vector<int>> indices;
-    int num_det = 0;
-    for (int c = 0; c < class_num; ++c) {
-      if (static_cast<bool>(preds.count(c))) {
-        const std::vector<std::vector<T>> cls_dets = preds.at(c);
-        NMSFast(cls_dets, nms_threshold, nms_eta, &(indices[c]));
-        num_det += indices[c].size();
-      }
-    }
-
-    std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-    for (const auto& it : indices) {
-      int label = it.first;
-      const std::vector<int>& label_indices = it.second;
-      for (size_t j = 0; j < label_indices.size(); ++j) {
-        int idx = label_indices[j];
-        score_index_pairs.push_back(std::make_pair(preds.at(label)[idx][4],
-                                                   std::make_pair(label, idx)));
-      }
-    }
-    // Keep top k results per image.
-    std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
-                     SortScoreTwoPairDescend<int>);
-    if (num_det > keep_top_k) {
-      score_index_pairs.resize(keep_top_k);
-    }
-
-    // Store the new indices.
-    std::map<int, std::vector<int>> new_indices;
-    for (const auto& it : score_index_pairs) {
-      int label = it.second.first;
-      int idx = it.second.second;
-      std::vector<T> one_pred;
-      one_pred.push_back(label);
-      one_pred.push_back(preds.at(label)[idx][4]);
-      one_pred.push_back(preds.at(label)[idx][0]);
-      one_pred.push_back(preds.at(label)[idx][1]);
-      one_pred.push_back(preds.at(label)[idx][2]);
-      one_pred.push_back(preds.at(label)[idx][3]);
-      nmsed_out->push_back(one_pred);
-    }
-
-    *num_nmsed_out = (num_det > keep_top_k ? keep_top_k : num_det);
-  }
-
-  void RetinanetDetectionOutput(const framework::ExecutionContext& ctx,
-                                const std::vector<Tensor>& scores,
-                                const std::vector<Tensor>& bboxes,
-                                const std::vector<Tensor>& anchors,
-                                const Tensor& im_info,
-                                std::vector<std::vector<T>>* nmsed_out,
-                                int* num_nmsed_out) const {
-    int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
-    int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
-    T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
-    T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
-    T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
-
-    int64_t class_num = scores[0].dims()[1];
-    std::map<int, std::vector<std::vector<T>>> preds;
-    for (size_t l = 0; l < scores.size(); ++l) {
-      // Fetch per level score
-      Tensor scores_per_level = scores[l];
-      // Fetch per level bbox
-      Tensor bboxes_per_level = bboxes[l];
-      // Fetch per level anchor
-      Tensor anchors_per_level = anchors[l];
-
-      int64_t scores_num = scores_per_level.numel();
-      int64_t bboxes_num = bboxes_per_level.numel();
-      std::vector<T> scores_data(scores_num);
-      std::vector<T> bboxes_data(bboxes_num);
-      std::vector<T> anchors_data(bboxes_num);
-      std::copy_n(scores_per_level.data<T>(), scores_num, scores_data.begin());
-      std::copy_n(bboxes_per_level.data<T>(), bboxes_num, bboxes_data.begin());
-      std::copy_n(anchors_per_level.data<T>(), bboxes_num,
-                  anchors_data.begin());
-      std::vector<std::pair<T, int>> sorted_indices;
-
-      // For the highest level, we take the threshold 0.0
-      T threshold = (l < (scores.size() - 1) ? score_threshold : 0.0);
-      GetMaxScoreIndex(scores_data, threshold, nms_top_k, &sorted_indices);
-      auto* im_info_data = im_info.data<T>();
-      auto im_height = im_info_data[0];
-      auto im_width = im_info_data[1];
-      auto im_scale = im_info_data[2];
-      DeltaScoreToPrediction(bboxes_data, anchors_data, im_height, im_width,
-                             im_scale, class_num, sorted_indices, &preds);
-    }
-
-    MultiClassNMS(preds, class_num, keep_top_k, nms_threshold, nms_eta,
-                  nmsed_out, num_nmsed_out);
-  }
-
-  void MultiClassOutput(const platform::DeviceContext& ctx,
-                        const std::vector<std::vector<T>>& nmsed_out,
-                        Tensor* outs) const {
-    auto* odata = outs->data<T>();
-    int count = 0;
-    int64_t out_dim = 6;
-    for (size_t i = 0; i < nmsed_out.size(); ++i) {
-      odata[count * out_dim] = nmsed_out[i][0] + 1;  // label
-      odata[count * out_dim + 1] = nmsed_out[i][1];  // score
-      odata[count * out_dim + 2] = nmsed_out[i][2];  // xmin
-      odata[count * out_dim + 3] = nmsed_out[i][3];  // xmin
-      odata[count * out_dim + 4] = nmsed_out[i][4];  // xmin
-      odata[count * out_dim + 5] = nmsed_out[i][5];  // xmin
-      count++;
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto boxes = ctx.MultiInput<Tensor>("BBoxes");
-    auto scores = ctx.MultiInput<Tensor>("Scores");
-    auto anchors = ctx.MultiInput<Tensor>("Anchors");
-    auto* im_info = ctx.Input<LoDTensor>("ImInfo");
-    auto* outs = ctx.Output<LoDTensor>("Out");
-
-    std::vector<Tensor> boxes_list(boxes.size());
-    std::vector<Tensor> scores_list(scores.size());
-    std::vector<Tensor> anchors_list(anchors.size());
-    for (size_t j = 0; j < boxes_list.size(); ++j) {
-      boxes_list[j] = *boxes[j];
-      scores_list[j] = *scores[j];
-      anchors_list[j] = *anchors[j];
-    }
-    auto score_dims = scores_list[0].dims();
-    int64_t batch_size = score_dims[0];
-    auto box_dims = boxes_list[0].dims();
-    int64_t box_dim = box_dims[2];
-    int64_t out_dim = box_dim + 2;
-
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-
-    std::vector<std::vector<std::vector<T>>> all_nmsed_out;
-    std::vector<size_t> batch_starts = {0};
-    for (int i = 0; i < batch_size; ++i) {
-      int num_nmsed_out = 0;
-      std::vector<Tensor> box_per_batch_list(boxes_list.size());
-      std::vector<Tensor> score_per_batch_list(scores_list.size());
-      for (size_t j = 0; j < boxes_list.size(); ++j) {
-        auto score_dims = scores_list[j].dims();
-        score_per_batch_list[j] = scores_list[j].Slice(i, i + 1);
-        score_per_batch_list[j].Resize({score_dims[1], score_dims[2]});
-        box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1);
-        box_per_batch_list[j].Resize({score_dims[1], box_dim});
-      }
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-
-      std::vector<std::vector<T>> nmsed_out;
-      RetinanetDetectionOutput(ctx, score_per_batch_list, box_per_batch_list,
-                               anchors_list, im_info_slice, &nmsed_out,
-                               &num_nmsed_out);
-      all_nmsed_out.push_back(nmsed_out);
-      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
-    }
-
-    int num_kept = batch_starts.back();
-    if (num_kept == 0) {
-      outs->Resize({0, out_dim});
-    } else {
-      outs->mutable_data<T>({num_kept, out_dim}, ctx.GetPlace());
-      for (int i = 0; i < batch_size; ++i) {
-        int64_t s = batch_starts[i];
-        int64_t e = batch_starts[i + 1];
-        if (e > s) {
-          Tensor out = outs->Slice(s, e);
-          MultiClassOutput(dev_ctx, all_nmsed_out[i], &out);
-        }
-      }
-    }
-
-    framework::LoD lod;
-    lod.emplace_back(batch_starts);
-
-    outs->set_lod(lod);
-  }
-};
-
-class RetinanetDetectionOutputOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("BBoxes",
-             "(List) A list of tensors from multiple FPN levels. Each "
-             "element is a 3-D Tensor with shape [N, Mi, 4] represents the "
-             "predicted locations of Mi bounding boxes, N is the batch size. "
-             "Mi is the number of bounding boxes from i-th FPN level. Each "
-             "bounding box has four coordinate values and the layout is "
-             "[xmin, ymin, xmax, ymax].")
-        .AsDuplicable();
-    AddInput("Scores",
-             "(List) A list of tensors from multiple FPN levels. Each "
-             "element is a 3-D Tensor with shape [N, Mi, C] represents the "
-             "predicted confidence from its FPN level. N is the batch size, "
-             "C is the class number (excluding background), Mi is the number "
-             "of bounding boxes from i-th FPN level. For each bounding box, "
-             "there are total C scores.")
-        .AsDuplicable();
-    AddInput("Anchors",
-             "(List) A list of tensors from multiple FPN levels. Each"
-             "element is a 2-D Tensor with shape [Mi, 4] represents the "
-             "locations of Mi anchor boxes from i-th FPN level. Each "
-             "bounding box has four coordinate values and the layout is "
-             "[xmin, ymin, xmax, ymax].")
-        .AsDuplicable();
-    AddInput("ImInfo",
-             "(LoDTensor) A 2-D LoDTensor with shape [N, 3] represents the "
-             "image information. N is the batch size, each image information "
-             "includes height, width and scale.");
-    AddAttr<float>("score_threshold",
-                   "(float) "
-                   "Threshold to filter out bounding boxes with a confidence "
-                   "score.");
-    AddAttr<int>("nms_top_k",
-                 "(int64_t) "
-                 "Maximum number of detections per FPN layer to be kept "
-                 "according to the confidence before NMS.");
-    AddAttr<float>("nms_threshold",
-                   "(float) "
-                   "The threshold to be used in NMS.");
-    AddAttr<float>("nms_eta",
-                   "(float) "
-                   "The parameter for adaptive NMS.");
-    AddAttr<int>(
-        "keep_top_k",
-        "(int64_t) "
-        "Number of total bounding boxes to be kept per image after NMS "
-        "step.");
-    AddOutput("Out",
-              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
-              "detections. Each row has 6 values: "
-              "[label, confidence, xmin, ymin, xmax, ymax]"
-              "No is the total number of detections in this mini-batch."
-              "For each instance, "
-              "the offsets in first dimension are called LoD, the number of "
-              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
-              "no detected bbox.");
-    AddComment(R"DOC(
-This operator is to decode boxes and scores from each FPN layer and do
-multi-class non maximum suppression (NMS) on merged predictions.
-
-Top-scoring predictions per FPN layer are decoded with the anchor
-information. This operator greedily selects a subset of detection bounding
-boxes from each FPN layer that have high scores larger than score_threshold,
-if providing this threshold, then selects the largest nms_top_k confidences
-scores per FPN layer, if nms_top_k is larger than -1.
-The decoding schema is described below:
-
-ox = (pw * pxv * tx * + px) - tw / 2
-
-oy = (ph * pyv * ty * + py) - th / 2
-
-ow = exp(pwv * tw) * pw + tw / 2
-
-oh = exp(phv * th) * ph + th / 2
-
-where `tx`, `ty`, `tw`, `th` denote the predicted box's center coordinates, width
-and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
-anchor's center coordinates, width and height. `pxv`, `pyv`, `pwv`,
-`phv` denote the variance of the anchor box and `ox`, `oy`, `ow`, `oh` denote the
-decoded coordinates, width and height. 
-
-Then the top decoded prediction from all levels are merged followed by NMS.
-In the NMS step, this operator prunes away boxes that have high IOU
-(intersection over union) overlap with already selected boxes by adaptive
-threshold NMS based on parameters of nms_threshold and nms_eta.
-After NMS step, at most keep_top_k number of total bounding boxes are to be kept
-per image if keep_top_k is larger than -1.
-This operator support multi-class and batched inputs. It applying NMS
-independently for each class. The outputs is a 2-D LoDTenosr, for each
-image, the offsets in first dimension of LoDTensor are called LoD, the number
-of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
-means there is no detected bounding box for this image. If there is no detected boxes
-for all images, all the elements in LoD are set to 0, and the output tensor is
-empty (None).
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(retinanet_detection_output, ops::RetinanetDetectionOutputOp,
-                  ops::RetinanetDetectionOutputOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(retinanet_detection_output,
-                       ops::RetinanetDetectionOutputKernel<float>,
-                       ops::RetinanetDetectionOutputKernel<double>);
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
deleted file mode 100644
index ce10de40a9682204f9643296be0b02c74300cebe..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ /dev/null
@@ -1,653 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-bool GT_E(T a, T b) {
-  return (a > b) || fabs(a - b) < 1e-4;
-}
-
-template <typename T>
-bool LT_E(T a, T b) {
-  return (a < b) || fabs(a - b) < 1e-4;
-}
-
-template <typename T>
-bool GT(T a, T b) {
-  return (a - b) > 1e-4;
-}
-
-/*
-*check if (x, y) is in the boundary of roi
-*/
-template <typename T>
-bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
-  for (int i = 0; i < 4; i++) {
-    T xs = roi_x[i];
-    T ys = roi_y[i];
-    T xe = roi_x[(i + 1) % 4];
-    T ye = roi_y[(i + 1) % 4];
-    if (fabs(ys - ye) < 1e-4) {
-      if (fabs(y - ys) < 1e-4 && fabs(y - ye) < 1e-4 &&
-          GT_E<T>(x, std::min(xs, xe)) && LT_E<T>(x, std::max(xs, xe))) {
-        return true;
-      }
-    } else {
-      T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
-      if (fabs(intersec_x - x) < 1e-4 && GT_E<T>(y, std::min(ys, ye)) &&
-          LT_E<T>(y, std::max(ys, ye))) {
-        return true;
-      }
-    }
-  }
-
-  int n_cross = 0;
-  for (int i = 0; i < 4; i++) {
-    T xs = roi_x[i];
-    T ys = roi_y[i];
-    T xe = roi_x[(i + 1) % 4];
-    T ye = roi_y[(i + 1) % 4];
-    if (fabs(ys - ye) < 1e-4) {
-      continue;
-    }
-    if (LT_E<T>(y, std::min(ys, ye)) || GT<T>(y, std::max(ys, ye))) {
-      continue;
-    }
-    T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
-    if (fabs(intersec_x - x) < 1e-4) {
-      return true;
-    }
-    if (GT<T>(intersec_x, x)) {
-      n_cross++;
-    }
-  }
-  return (n_cross % 2 == 1);
-}
-
-/**
- * Get the matrix of perspective transform.
- *
- * dx1 = x1 - x2
- * dx2 = x3 - x2
- * dx3 = x0 - x1 + x2 - x3
- * dy1 = y1 - y2
- * dy2 = y3 - y2
- * dy3 = y0 - y1 + y2 - y3
- *
- * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1)
- * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1)
- * a13 = x0
- * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1)
- * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1)
- * a23 = y0
- * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
- * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
- * a33 = 1
- */
-template <typename T>
-void get_transform_matrix(const int transformed_width,
-                          const int transformed_height, T roi_x[], T roi_y[],
-                          T matrix[]) {
-  T x0 = roi_x[0];
-  T x1 = roi_x[1];
-  T x2 = roi_x[2];
-  T x3 = roi_x[3];
-  T y0 = roi_y[0];
-  T y1 = roi_y[1];
-  T y2 = roi_y[2];
-  T y3 = roi_y[3];
-
-  // Estimate the height and width of RoI
-  T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
-  T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2));
-  T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3));
-  T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0));
-  T estimated_height = (len2 + len4) / 2.0;
-  T estimated_width = (len1 + len3) / 2.0;
-
-  // Get the normalized height and normalized width
-  int normalized_height = std::max(2, transformed_height);
-  int normalized_width =
-      std::round(estimated_width * (normalized_height - 1) / estimated_height) +
-      1;
-  normalized_width = std::max(2, std::min(normalized_width, transformed_width));
-
-  T dx1 = x1 - x2;
-  T dx2 = x3 - x2;
-  T dx3 = x0 - x1 + x2 - x3;
-  T dy1 = y1 - y2;
-  T dy2 = y3 - y2;
-  T dy3 = y0 - y1 + y2 - y3;
-
-  matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) /
-              (normalized_width - 1);
-  matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) /
-              (normalized_height - 1);
-  matrix[8] = 1;
-
-  matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) /
-              (normalized_width - 1);
-  matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) /
-              (normalized_height - 1);
-  matrix[5] = y0;
-
-  matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) /
-              (normalized_width - 1);
-  matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) /
-              (normalized_height - 1);
-  matrix[2] = x0;
-}
-
-/**
- * Get the source coordinates in the input feature map.
- *
- * (u, v, w)^matrix = matrix * (out_w, out_h, 1)^matrix
- *
- * in_w = u / w
- * in_h = v / w
- *
- */
-template <typename T>
-void get_source_coords(T matrix[], int out_w, int out_h, T* in_w, T* in_h) {
-  T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2];
-  T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5];
-  T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8];
-
-  in_w[0] = u / w;
-  in_h[0] = v / w;
-}
-
-/**
- * Perform bilinear interpolation in the input feature map.
- */
-template <typename T>
-void bilinear_interpolate(const T* in_data, const int channels, const int width,
-                          const int height, int in_n, int in_c, T in_w, T in_h,
-                          T* val) {
-  // Deal with cases that source coords are out of feature map boundary
-  if (GT<T>(-0.5, in_w) || GT<T>(in_w, width - 0.5) || GT<T>(-0.5, in_h) ||
-      GT<T>(in_h, height - 0.5)) {
-    // empty
-    val[0] = 0.0;
-    return;
-  }
-
-  if (GT<T>(0, in_w)) {
-    in_w = 0;
-  }
-  if (GT<T>(0, in_h)) {
-    in_h = 0;
-  }
-
-  int in_w_floor = floor(in_w);
-  int in_h_floor = floor(in_h);
-  int in_w_ceil;
-  int in_h_ceil;
-
-  if (GT_E<T>(in_w_floor, width - 1)) {
-    in_w_ceil = in_w_floor = width - 1;
-    in_w = static_cast<T>(in_w_floor);
-  } else {
-    in_w_ceil = in_w_floor + 1;
-  }
-
-  if (GT_E<T>(in_h_floor, height - 1)) {
-    in_h_ceil = in_h_floor = height - 1;
-    in_h = static_cast<T>(in_h_floor);
-  } else {
-    in_h_ceil = in_h_floor + 1;
-  }
-  T w_floor = in_w - in_w_floor;
-  T h_floor = in_h - in_h_floor;
-  T w_ceil = 1 - w_floor;
-  T h_ceil = 1 - h_floor;
-  const T* data = in_data + (in_n * channels + in_c) * height * width;
-  // Do bilinear interpolation
-  T v1 = data[in_h_floor * width + in_w_floor];
-  T v2 = data[in_h_ceil * width + in_w_floor];
-  T v3 = data[in_h_ceil * width + in_w_ceil];
-  T v4 = data[in_h_floor * width + in_w_ceil];
-  T w1 = w_ceil * h_ceil;
-  T w2 = w_ceil * h_floor;
-  T w3 = w_floor * h_floor;
-  T w4 = w_floor * h_ceil;
-  val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
-}
-
-template <typename T>
-class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto* mask = ctx.Output<framework::Tensor>("Mask");
-    auto* out_transform_matrix =
-        ctx.Output<framework::Tensor>("TransformMatrix");
-    auto transformed_height = ctx.Attr<int>("transformed_height");
-    auto transformed_width = ctx.Attr<int>("transformed_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int channels = in_dims[1];
-    int in_height = in_dims[2];
-    int in_width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    const T* input_data = in->data<T>();
-    int* mask_data = mask->mutable_data<int>(ctx.GetPlace());
-
-    framework::Tensor roi2image;
-    roi2image.Resize({rois_num});
-    int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
-    auto lod = rois->lod().back();
-    for (size_t i = 0; i < lod.size() - 1; ++i) {
-      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
-        roi2image_data[j] = i;
-      }
-    }
-
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    const T* rois_data = rois->data<T>();
-
-    T* transform_matrix =
-        out_transform_matrix->mutable_data<T>({rois_num, 9}, ctx.GetPlace());
-
-    for (int n = 0; n < rois_num; ++n) {
-      const T* n_rois = rois_data + n * 8;
-      T roi_x[4];
-      T roi_y[4];
-      for (int k = 0; k < 4; ++k) {
-        roi_x[k] = n_rois[2 * k] * spatial_scale;
-        roi_y[k] = n_rois[2 * k + 1] * spatial_scale;
-      }
-      int image_id = roi2image_data[n];
-      // Get transform matrix
-      T matrix[9];
-      get_transform_matrix<T>(transformed_width, transformed_height, roi_x,
-                              roi_y, matrix);
-      for (int i = 0; i < 9; i++) {
-        transform_matrix[n * 9 + i] = matrix[i];
-      }
-      for (int c = 0; c < channels; ++c) {
-        for (int out_h = 0; out_h < transformed_height; ++out_h) {
-          for (int out_w = 0; out_w < transformed_width; ++out_w) {
-            int out_index =
-                n * channels * transformed_height * transformed_width +
-                c * transformed_height * transformed_width +
-                out_h * transformed_width + out_w;
-            T in_w, in_h;
-            get_source_coords<T>(matrix, out_w, out_h, &in_w, &in_h);
-            if (in_quad<T>(in_w, in_h, roi_x, roi_y)) {
-              if (GT<T>(-0.5, in_w) ||
-                  GT<T>(in_w, static_cast<T>(in_width - 0.5)) ||
-                  GT<T>(-0.5, in_h) ||
-                  GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
-                output_data[out_index] = 0.0;
-                mask_data[(n * transformed_height + out_h) * transformed_width +
-                          out_w] = 0;
-              } else {
-                bilinear_interpolate(input_data, channels, in_width, in_height,
-                                     image_id, c, in_w, in_h,
-                                     output_data + out_index);
-                mask_data[(n * transformed_height + out_h) * transformed_width +
-                          out_w] = 1;
-              }
-            } else {
-              output_data[out_index] = 0.0;
-              mask_data[(n * transformed_height + out_h) * transformed_width +
-                        out_w] = 0;
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-T get_feature_gradient(T xs, T ys, int w, int h, const int width,
-                       const int height) {
-  if (GT<T>(-0.5, xs) || GT<T>(xs, width - 0.5) || GT<T>(-0.5, ys) ||
-      GT<T>(ys, height - 0.5)) {
-    return 0;
-  }
-
-  if (GT<T>(0, xs)) {
-    xs = 0;
-  }
-  if (GT<T>(0, ys)) {
-    ys = 0;
-  }
-
-  int xs_floor = floor(xs);
-  int ys_floor = floor(ys);
-  int xs_ceil;
-  int ys_ceil;
-
-  if (GT_E(xs_floor, width - 1)) {
-    xs_ceil = xs_floor = width - 1;
-    xs = static_cast<T>(xs_floor);
-  } else {
-    xs_ceil = xs_floor + 1;
-  }
-
-  if (GT_E(ys_floor, height - 1)) {
-    ys_ceil = ys_floor = height - 1;
-    ys = static_cast<T>(ys_floor);
-  } else {
-    ys_ceil = ys_floor + 1;
-  }
-
-  T weight = 0;
-  if (w == xs_floor) {
-    if (h == ys_floor) {
-      weight = (w + 1 - xs) * (h + 1 - ys);
-    } else if (h == ys_ceil) {
-      weight = (w + 1 - xs) * (ys + 1 - h);
-    }
-  } else if (w == xs_ceil) {
-    if (h == ys_floor) {
-      weight = (xs + 1 - w) * (h + 1 - ys);
-    } else if (h == ys_ceil) {
-      weight = (xs + 1 - w) * (ys + 1 - h);
-    }
-  }
-  return weight;
-}
-
-template <typename T>
-class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto transformed_height = ctx.Attr<int>("transformed_height");
-    auto transformed_width = ctx.Attr<int>("transformed_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int channels = in_dims[1];
-    int in_height = in_dims[2];
-    int in_width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
-    const T* out_grad_data = out_grad->data<T>();
-    const T* rois_data = rois->data<T>();
-
-    framework::Tensor roi2image;
-    roi2image.Resize({rois_num});
-    int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
-    auto lod = rois->lod().back();
-    for (size_t i = 0; i < lod.size() - 1; ++i) {
-      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
-        roi2image_data[j] = i;
-      }
-    }
-
-    for (int n = 0; n < batch_size; ++n) {
-      for (int c = 0; c < channels; ++c) {
-        for (int in_h = 0; in_h < in_height; ++in_h) {
-          for (int in_w = 0; in_w < in_width; ++in_w) {
-            T gradient = 0.0;
-            for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) {
-              const T* rois = rois_data + roi_idx * 8;
-              T roi_x[4];
-              T roi_y[4];
-              for (int k = 0; k < 4; ++k) {
-                roi_x[k] = rois[2 * k] * spatial_scale;
-                roi_y[k] = rois[2 * k + 1] * spatial_scale;
-              }
-
-              // Get transform matrix
-              T matrix[9];
-              get_transform_matrix<T>(transformed_width, transformed_height,
-                                      roi_x, roi_y, matrix);
-              const T* out_grad_ptr = out_grad_data +
-                                      (roi_idx * channels + c) *
-                                          transformed_height *
-                                          transformed_width;
-              for (int out_h = 0; out_h < transformed_height; ++out_h) {
-                for (int out_w = 0; out_w < transformed_width; ++out_w) {
-                  T src_w;
-                  T src_h;
-                  get_source_coords<T>(matrix, out_w, out_h, &src_w, &src_h);
-                  if (in_quad<T>(src_w, src_h, roi_x, roi_y)) {
-                    if (GT<T>(-0.5, src_w) ||
-                        GT<T>(src_w, static_cast<T>(in_width - 0.5)) ||
-                        GT<T>(-0.5, src_h) ||
-                        GT<T>(src_h, static_cast<T>(in_height - 0.5))) {
-                      continue;
-                    }
-                    T weight = get_feature_gradient<T>(src_w, src_h, in_w, in_h,
-                                                       in_width, in_height);
-                    gradient +=
-                        out_grad_ptr[out_h * transformed_width + out_w] *
-                        weight;
-                  }
-                }
-              }
-            }
-            int out_idx = (n * channels + c) * in_height * in_width +
-                          in_h * in_width + in_w;
-            in_grad_data[out_idx] = gradient;
-          }
-        }
-      }
-    }
-  }
-};
-
-class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ROIPerspectiveTransformOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("ROIs"),
-        "Input(ROIs) of ROIPerspectiveTransformOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(Out) of ROIPerspectiveTransformOp should not be null.");
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-    PADDLE_ENFORCE(input_dims.size() == 4,
-                   "The format of input tensor is NCHW.");
-    PADDLE_ENFORCE(rois_dims.size() == 2,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 8)"
-                   "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...]");
-    PADDLE_ENFORCE(rois_dims[1] == 8,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 8)"
-                   "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...].");
-    int transformed_height = ctx->Attrs().Get<int>("transformed_height");
-    int transformed_width = ctx->Attrs().Get<int>("transformed_width");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-
-    PADDLE_ENFORCE_GT(transformed_height, 0,
-                      "The transformed output height must greater than 0");
-    PADDLE_ENFORCE_GT(transformed_width, 0,
-                      "The transformed output width must greater than 0");
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      "The spatial scale must greater than 0");
-    std::vector<int64_t> out_dims_v({rois_dims[0],   // num_rois
-                                     input_dims[1],  // channels
-                                     static_cast<int64_t>(transformed_height),
-                                     static_cast<int64_t>(transformed_width)});
-    auto out_dims = framework::make_ddim(out_dims_v);
-
-    std::vector<int64_t> mask_dims_v({rois_dims[0],  // num_rois
-                                      1,             // channels
-                                      static_cast<int64_t>(transformed_height),
-                                      static_cast<int64_t>(transformed_width)});
-    auto mask_dims = framework::make_ddim(mask_dims_v);
-
-    std::vector<int64_t> matrix_dims_v({rois_dims[0], 9});
-    auto matrix_dims = framework::make_ddim(matrix_dims_v);
-
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->SetOutputDim("Mask", mask_dims);
-    ctx->SetOutputDim("TransformMatrix", matrix_dims);
-    ctx->SetOutputDim("Out2InIdx", out_dims);
-    ctx->SetOutputDim("Out2InWeights", out_dims);
-    ctx->ShareLoD("ROIs", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "The gradient of Out should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
-                   "The gradient of X should not be null.");
-    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ROIPerspectiveTransformOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor), "
-             "the input of ROIPerspectiveTransformOp. "
-             "The format of input tensor is NCHW. Where N is batch size, "
-             "C is the number of input channels, "
-             "H is the height of the feature, and "
-             "W is the width of the feature.");
-    AddInput("ROIs",
-             "(LoDTensor), "
-             "ROIs (Regions of Interest) to be transformed. "
-             "should be a 2-D LoDTensor of shape (num_rois, 8)"
-             "given as [[x1, y1, x2, y2, x3, y3, x4, y4], ...]."
-             "(x1, y1) is the top left coordinates, and "
-             "(x2, y2) is the top right coordinates, and"
-             "(x3, y3) is the bottom right coordinates, and"
-             "(x4, y4) is the bottom left coordinates.");
-    AddOutput(
-        "Out",
-        "(Tensor), "
-        "The output of ROIPerspectiveTransformOp is a 4-D tensor with shape "
-        "(num_rois, channels, transformed_h, transformed_w).");
-    AddOutput("Mask",
-              "(Tensor), "
-              "The output mask of ROIPerspectiveTransformOp is a 4-D tensor "
-              "with shape "
-              "(num_rois, 1, transformed_h, transformed_w).");
-    AddOutput("TransformMatrix",
-              "(Tensor), "
-              "The output transform matrix of ROIPerspectiveTransformOp is a "
-              "1-D tensor with shape "
-              "(num_rois, 9).");
-    AddOutput("Out2InIdx",
-              "(Tensor), "
-              "An intermediate tensor used to map indexes of input feature map "
-              "and indexes of output feature map."
-              "The shape of the tensor is [out_size, 4] and out_size is the "
-              "number of elements in output feature map.")
-        .AsIntermediate();
-    AddOutput("Out2InWeights",
-              "(Tensor), "
-              "An intermediate tensor used to record the weights of bilinear "
-              "interpolatein for each element in output. The shape of the "
-              "tensor is [out_size, 4] and out_size is the number of elements "
-              "in output feature map.")
-        .AsIntermediate();
-    AddAttr<float>("spatial_scale",
-                   "(float, default 1.0), "
-                   "Spatial scale factor to scale ROI coords.")
-        .SetDefault(1.0);
-    AddAttr<int>("transformed_height",
-                 "(int, default 1), "
-                 "The height of transformed output.")
-        .SetDefault(1);
-    AddAttr<int>("transformed_width",
-                 "(int, default 1), "
-                 "The width of transformed output.")
-        .SetDefault(1);
-    AddComment(R"DOC(
-**ROIPerspectiveTransform Operator**
-
-    )DOC");
-  }
-};
-
-class ROIPerspectiveTransformGradDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("roi_perspective_transform_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("ROIs", Input("ROIs"));
-    op->SetInput("Out2InIdx", Output("Out2InIdx"));
-    op->SetInput("Out2InWeights", Output("Out2InWeights"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp,
-                  ops::ROIPerspectiveTransformOpMaker,
-                  ops::ROIPerspectiveTransformGradDescMaker);
-REGISTER_OPERATOR(roi_perspective_transform_grad,
-                  ops::ROIPerspectiveTransformGradOp);
-REGISTER_OP_CPU_KERNEL(roi_perspective_transform,
-                       ops::CPUROIPerspectiveTransformOpKernel<float>);
-REGISTER_OP_CPU_KERNEL(roi_perspective_transform_grad,
-                       ops::CPUROIPerspectiveTransformGradOpKernel<float>);
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
deleted file mode 100644
index 8c9ca9462c383fc79ee1c16b7f2bbcdc52d60dc4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ /dev/null
@@ -1,511 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
-
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
-using paddle::platform::float16;
-
-namespace paddle {
-namespace operators {
-
-// CUDA: index helpers
-#define idx4_4(index, d1, d2, d3, d4) (index % d4)
-#define idx4_3(index, d1, d2, d3, d4) ((index / d4) % d3)
-#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2)
-#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1)
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-template <typename T>
-__device__ bool GT_E(T a, T b) {
-  return (a > b) || Eigen::numext::abs(a - b) < 1e-4;
-}
-
-template <typename T>
-__device__ bool LT_E(T a, T b) {
-  return (a < b) || Eigen::numext::abs(a - b) < 1e-4;
-}
-
-template <typename T>
-__device__ bool GT(T a, T b) {
-  return (a - b) > 1e-4;
-}
-
-template <typename T>
-__device__ T max(T a, T b) {
-  return a > b ? a : b;
-}
-
-template <typename T>
-__device__ T min(T a, T b) {
-  return a < b ? a : b;
-}
-
-/*
-* check if (x, y) is in the boundary of roi
-*/
-template <typename T>
-__device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
-  for (int i = 0; i < 4; i++) {
-    T start_w = roi_x[i];
-    T start_h = roi_y[i];
-    T end_w = roi_x[(i + 1) % 4];
-    T end_h = roi_y[(i + 1) % 4];
-    if (fabs(start_h - end_h) < 1e-4) {
-      if (fabs(y - start_h) < 1e-4 && fabs(y - end_h) < 1e-4 &&
-          GT_E<T>(x, min<T>(start_w, end_w)) &&
-          LT_E<T>(x, max<T>(start_w, end_w))) {
-        return true;
-      }
-    } else {
-      T intersec_x =
-          (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w;
-      if (fabs(intersec_x - x) < 1e-4 && GT_E(y, min<T>(start_h, end_h)) &&
-          LT_E<T>(y, max<T>(start_h, end_h))) {
-        return true;
-      }
-    }
-  }
-
-  int n_cross = 0;
-  for (int i = 0; i < 4; i++) {
-    T start_w = roi_x[i];
-    T start_h = roi_y[i];
-    T end_w = roi_x[(i + 1) % 4];
-    T end_h = roi_y[(i + 1) % 4];
-    if (fabs(start_h - end_h) < 1e-4) {
-      continue;
-    }
-    if (LT_E<T>(y, min<T>(start_h, end_h)) ||
-        GT<T>(y, max<T>(start_h, end_h))) {
-      continue;
-    }
-    T intersec_x =
-        (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w;
-    if (fabs(intersec_x - x) < 1e-4) {
-      return true;
-    }
-    if (GT<T>(intersec_x, x)) {
-      n_cross++;
-    }
-  }
-  return (n_cross % 2 == 1);
-}
-
-/**
- * Perform bilinear interpolation in the input feature map.
- */
-template <typename T>
-__device__ void bilinear_interpolate(const T* in_data, const int channels,
-                                     const int width, const int height,
-                                     int in_n, int in_c, T in_w, T in_h, T* val,
-                                     int out_idx, int* out2in_idx,
-                                     T* out2in_w) {
-  // Deal with cases that source coords are out of feature map boundary
-  if (GT<T>(-0.5, in_w) || GT<T>(in_w, width - 0.5) || GT<T>(-0.5, in_h) ||
-      GT<T>(in_h, height - 0.5)) {
-    val[0] = 0.0;
-    return;
-  }
-
-  if (GT<T>(0, in_w)) {
-    in_w = 0;
-  }
-  if (GT<T>(0, in_h)) {
-    in_h = 0;
-  }
-
-  int in_w_floor = floor(in_w);
-  int in_h_floor = floor(in_h);
-  int in_w_ceil;
-  int in_h_ceil;
-
-  if (GT_E<T>(in_w_floor, width - 1)) {
-    in_w_ceil = in_w_floor = width - 1;
-    in_w = static_cast<T>(in_w_floor);
-  } else {
-    in_w_ceil = in_w_floor + 1;
-  }
-
-  if (GT_E<T>(in_h_floor, height - 1)) {
-    in_h_ceil = in_h_floor = height - 1;
-    in_h = static_cast<T>(in_h_floor);
-  } else {
-    in_h_ceil = in_h_floor + 1;
-  }
-
-  T w_floor = in_w - in_w_floor;
-  T h_floor = in_h - in_h_floor;
-  T w_ceil = 1 - w_floor;
-  T h_ceil = 1 - h_floor;
-  const T* data = in_data + (in_n * channels + in_c) * height * width;
-  // Do bilinear interpolation
-  T v1 = data[in_h_floor * width + in_w_floor];
-  T v2 = data[in_h_ceil * width + in_w_floor];
-  T v3 = data[in_h_ceil * width + in_w_ceil];
-  T v4 = data[in_h_floor * width + in_w_ceil];
-  T w1 = w_ceil * h_ceil;
-  T w2 = w_ceil * h_floor;
-  T w3 = w_floor * h_floor;
-  T w4 = w_floor * h_ceil;
-  val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
-
-  int base_idx = (in_n * channels + in_c) * height * width;
-  out2in_idx[out_idx * 4] = base_idx + in_h_floor * width + in_w_floor;
-  out2in_idx[out_idx * 4 + 1] = base_idx + in_h_ceil * width + in_w_floor;
-  out2in_idx[out_idx * 4 + 2] = base_idx + in_h_ceil * width + in_w_ceil;
-  out2in_idx[out_idx * 4 + 3] = base_idx + in_h_floor * width + in_w_ceil;
-  out2in_w[out_idx * 4] = w1;
-  out2in_w[out_idx * 4 + 1] = w2;
-  out2in_w[out_idx * 4 + 2] = w3;
-  out2in_w[out_idx * 4 + 3] = w4;
-}
-
-/**
- * Get the source coordinates in the input feature map.
- *
- * (u, v, w)^matrix = T * (out_w, out_h, 1)^matrix
- *
- * in_w = u / w
- * in_h = v / w
- *
- */
-template <typename T>
-__device__ void get_source_coords(T matrix[], int out_w, int out_h, T* in_w,
-                                  T* in_h) {
-  T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2];
-  T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5];
-  T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8];
-
-  in_w[0] = u / w;
-  in_h[0] = v / w;
-}
-
-/**
- * Get the matrix of perspective transform.
- *
- * dx1 = x1 - x2
- * dx2 = x3 - x2
- * dx3 = x0 - x1 + x2 - x3
- * dy1 = y1 - y2
- * dy2 = y3 - y2
- * dy3 = y0 - y1 + y2 - y3
- *
- * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1)
- * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1)
- * a13 = x0
- * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1)
- * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1)
- * a23 = y0
- * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
- * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
- * a33 = 1
- *
- */
-template <typename T>
-__device__ void get_transform_matrix(const int transformed_width,
-                                     const int transformed_height, T roi_x[],
-                                     T roi_y[], T matrix[]) {
-  T x0 = roi_x[0];
-  T x1 = roi_x[1];
-  T x2 = roi_x[2];
-  T x3 = roi_x[3];
-  T y0 = roi_y[0];
-  T y1 = roi_y[1];
-  T y2 = roi_y[2];
-  T y3 = roi_y[3];
-
-  // Estimate the height and width of RoI
-  T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
-  T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2));
-  T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3));
-  T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0));
-  T estimated_height = (len2 + len4) / 2.0;
-  T estimated_width = (len1 + len3) / 2.0;
-
-  // Get the normalized height and normalized width
-  int normalized_height = max(2, transformed_height);
-  int normalized_width =
-      round(estimated_width * (normalized_height - 1) / estimated_height) + 1;
-  normalized_width = max(2, min(normalized_width, transformed_width));
-
-  T dx1 = x1 - x2;
-  T dx2 = x3 - x2;
-  T dx3 = x0 - x1 + x2 - x3;
-  T dy1 = y1 - y2;
-  T dy2 = y3 - y2;
-  T dy3 = y0 - y1 + y2 - y3;
-
-  matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) /
-              (normalized_width - 1);
-  matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) /
-              (normalized_height - 1);
-  matrix[8] = 1;
-
-  matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) /
-              (normalized_width - 1);
-  matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) /
-              (normalized_height - 1);
-  matrix[5] = y0;
-
-  matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) /
-              (normalized_width - 1);
-  matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) /
-              (normalized_height - 1);
-  matrix[2] = x0;
-}
-
-template <typename T>
-__global__ void RoiTransformKernel(const float* input_data,
-                                   const float* rois_data,
-                                   const int* roi2image_data, int num_rois,
-                                   int in_height, int in_width, int channels,
-                                   int transformed_height,
-                                   int transformed_width, float spatial_scale,
-                                   T* output_data, int* out2in_idx, T* out2in_w,
-                                   int* mask, T* transform_matrix) {
-  int output_size =
-      num_rois * transformed_height * transformed_width * channels;
-
-  CUDA_1D_KERNEL_LOOP(index, output_size) {
-    // (n, c, out_h, out_w) is an element in the transformed output
-    int out_w = idx4_4(index, num_rois, channels, transformed_height,
-                       transformed_width);
-    int out_h = idx4_3(index, num_rois, channels, transformed_height,
-                       transformed_width);
-    int c = idx4_2(index, num_rois, channels, transformed_height,
-                   transformed_width);
-    int n = idx4_1(index, num_rois, channels, transformed_height,
-                   transformed_width);
-
-    auto bottom_rois = rois_data + n * 8;
-    int roi_batch_ind = bottom_rois[0];
-    T roi_x[4];
-    T roi_y[4];
-    for (int k = 0; k < 4; ++k) {
-      roi_x[k] = bottom_rois[2 * k] * spatial_scale;
-      roi_y[k] = bottom_rois[2 * k + 1] * spatial_scale;
-    }
-
-    // Get transform matrix
-    T matrix[9];
-    get_transform_matrix<T>(transformed_width, transformed_height, roi_x, roi_y,
-                            matrix);
-    for (int i = 0; i < 9; i++) {
-      transform_matrix[n * 9 + i] = matrix[i];
-    }
-    // Get source coords
-    T in_w;
-    T in_h;
-    get_source_coords<T>(matrix, out_w, out_h, &in_w, &in_h);
-
-    if (in_quad<T>(in_w, in_h, roi_x, roi_y)) {
-      if (GT<T>(-0.5, in_w) || GT<T>(in_w, static_cast<T>(in_width - 0.5)) ||
-          GT<T>(-0.5, in_h) || GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
-        // Skip if source coords is not in input image
-        output_data[index] = 0.0;
-        mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0;
-      } else {
-        // Perform bilinear interpolation
-        int in_n = roi2image_data[n];
-        bilinear_interpolate<T>(input_data, channels, in_width, in_height, in_n,
-                                c, in_w, in_h, output_data + index, index,
-                                out2in_idx, out2in_w);
-        mask[(n * transformed_height + out_h) * transformed_width + out_w] = 1;
-      }
-
-    } else {
-      // Skip if source coords is not in quad
-      output_data[index] = 0.0;
-      mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0;
-    }
-  }
-}
-
-template <typename T>
-class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto* out2in_idx = ctx.Output<framework::Tensor>("Out2InIdx");
-    auto* out2in_w = ctx.Output<framework::Tensor>("Out2InWeights");
-    auto* mask = ctx.Output<framework::Tensor>("Mask");
-    auto* out_transform_matrix =
-        ctx.Output<framework::Tensor>("TransformMatrix");
-
-    int* mask_data = mask->mutable_data<int>(ctx.GetPlace());
-    int* out2in_idx_data =
-        out2in_idx->mutable_data<int>({out->numel(), 4}, ctx.GetPlace());
-    T* out2in_w_data =
-        out2in_w->mutable_data<T>({out->numel(), 4}, ctx.GetPlace());
-
-    math::SetConstant<platform::CUDADeviceContext, int> init;
-    init(ctx.cuda_device_context(), out2in_idx, static_cast<int>(-1));
-
-    auto transformed_height = ctx.Attr<int>("transformed_height");
-    auto transformed_width = ctx.Attr<int>("transformed_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int channels = in_dims[1];
-    int in_height = in_dims[2];
-    int in_width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    const T* input_data = in->data<T>();
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    const T* rois_data = rois->data<T>();
-
-    framework::Tensor roi2image;
-    framework::Tensor roi2image_dev;
-    roi2image.Resize({rois_num});
-    int* roi2image_data = roi2image.mutable_data<int>(platform::CPUPlace());
-    auto lod = rois->lod().back();
-    for (size_t i = 0; i < lod.size() - 1; ++i) {
-      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
-        roi2image_data[j] = i;
-      }
-    }
-    TensorCopySync(roi2image, ctx.GetPlace(), &roi2image_dev);
-
-    int out_size = rois_num * transformed_height * transformed_width * channels;
-    auto stream = ctx.cuda_device_context().stream();
-    int block = 512;
-    int grid = (out_size + block - 1) / block;
-
-    // Get transform matrix
-    T* matrix =
-        out_transform_matrix->mutable_data<T>({rois_num, 9}, ctx.GetPlace());
-
-    RoiTransformKernel<T><<<grid, block, 0, stream>>>(
-        input_data, rois_data, roi2image_dev.data<int>(), rois_num, in_height,
-        in_width, channels, transformed_height, transformed_width,
-        spatial_scale, output_data, out2in_idx_data, out2in_w_data, mask_data,
-        matrix);
-  }
-};
-
-template <typename T>
-__device__ T get_feature_gradient(T xs, T ys, int w, int h, const int width,
-                                  const int height) {
-  if (GT<T>(-0.5, xs) || GT<T>(xs, width - 0.5) || GT<T>(-0.5, ys) ||
-      GT<T>(ys, height - 0.5)) {
-    return 0;
-  }
-
-  if (GT<T>(0, xs)) {
-    xs = 0;
-  }
-  if (GT<T>(0, ys)) {
-    ys = 0;
-  }
-
-  int xs_floor = floor(xs);
-  int ys_floor = floor(ys);
-  int xs_ceil;
-  int ys_ceil;
-
-  if (GT_E<T>(xs_floor, width - 1)) {
-    xs_ceil = xs_floor = width - 1;
-    xs = static_cast<T>(xs_floor);
-  } else {
-    xs_ceil = xs_floor + 1;
-  }
-
-  if (GT_E(ys_floor, height - 1)) {
-    ys_ceil = ys_floor = height - 1;
-    ys = static_cast<T>(ys_floor);
-  } else {
-    ys_ceil = ys_floor + 1;
-  }
-
-  T weight = 0;
-  if (w == xs_floor) {
-    if (h == ys_floor) {
-      weight = (w + 1 - xs) * (h + 1 - ys);
-    } else if (h == ys_ceil) {
-      weight = (w + 1 - xs) * (ys + 1 - h);
-    }
-  } else if (w == xs_ceil) {
-    if (h == ys_floor) {
-      weight = (xs + 1 - w) * (h + 1 - ys);
-    } else if (h == ys_ceil) {
-      weight = (xs + 1 - w) * (ys + 1 - h);
-    }
-  }
-  return weight;
-}
-
-template <typename T>
-__global__ void RoiTransformGradKernel(int out_size, const int* out2in_idx_data,
-                                       const T* out2in_w_data,
-                                       const T* out_grad_data,
-                                       T* in_grad_data) {
-  CUDA_1D_KERNEL_LOOP(index, out_size * 4) {
-    int in_idx = out2in_idx_data[index];
-    if (in_idx >= 0) {
-      int out_idx = index / 4;
-      atomicAdd(in_grad_data + in_idx,
-                out_grad_data[out_idx] * out2in_w_data[index]);
-    }
-  }
-}
-
-template <typename T>
-class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* out2in_idx = ctx.Input<framework::LoDTensor>("Out2InIdx");
-    auto* out2in_w = ctx.Input<framework::LoDTensor>("Out2InWeights");
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
-
-    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    set_zero(ctx.cuda_device_context(), in_grad, static_cast<T>(0));
-
-    const T* out_grad_data = out_grad->data<T>();
-    const int* out2in_idx_data = out2in_idx->data<int>();
-    const T* out2in_w_data = out2in_w->data<T>();
-
-    int out_size = out_grad->numel();
-    auto stream = ctx.cuda_device_context().stream();
-    int block = 512;
-    int grid = (out_size * 4 + block - 1) / block;
-
-    RoiTransformGradKernel<T><<<grid, block, 0, stream>>>(
-        out_size, out2in_idx_data, out2in_w_data, out_grad_data, in_grad_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(roi_perspective_transform,
-                        ops::CUDAROIPerspectiveTransformOpKernel<float>);
-REGISTER_OP_CUDA_KERNEL(roi_perspective_transform_grad,
-                        ops::CUDAROIPerspectiveTransformGradOpKernel<float>);
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
deleted file mode 100644
index 338954346c5af2c04ff6bf09b11873caec4a04dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ /dev/null
@@ -1,1035 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detection/bbox_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-class RpnTargetAssignOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Anchor"),
-                   "Input(Anchor) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
-                   "Input(GtBoxes) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
-                   "Input(Anchor) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
-                   "Input(ImInfo) of RpnTargetAssignOp should not be null");
-
-    PADDLE_ENFORCE(
-        ctx->HasOutput("LocationIndex"),
-        "Output(LocationIndex) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("ScoreIndex"),
-        "Output(ScoreIndex) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("TargetLabel"),
-        "Output(TargetLabel) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("TargetBBox"),
-        "Output(TargetBBox) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("BBoxInsideWeight"),
-        "Output(BBoxInsideWeight) of RpnTargetAssignOp should not be null");
-
-    auto anchor_dims = ctx->GetInputDim("Anchor");
-    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
-    auto im_info_dims = ctx->GetInputDim("ImInfo");
-    PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
-                      "The rank of Input(Anchor) must be 2.");
-    PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
-                      "The rank of Input(GtBoxes) must be 2.");
-    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
-                      "The rank of Input(ImInfo) must be 2.");
-
-    ctx->SetOutputDim("LocationIndex", {-1});
-    ctx->SetOutputDim("ScoreIndex", {-1});
-    ctx->SetOutputDim("TargetLabel", {-1, 1});
-    ctx->SetOutputDim("TargetBBox", {-1, 4});
-    ctx->SetOutputDim("BBoxInsideWeight", {-1, 4});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("Anchor")->type(),
-        platform::CPUPlace());
-  }
-};
-
-template <typename T>
-void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) {
-  auto* out_data = out->data<T>();
-  auto* to_add_data = to_add->data<T>();
-  memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T));
-}
-
-template <typename T>
-std::vector<Tensor> FilterStraddleAnchor(
-    const platform::CPUDeviceContext& context, const Tensor* anchor,
-    const float rpn_straddle_thresh, T im_height, T im_width) {
-  std::vector<int> inds_inside;
-  int anchor_num = anchor->dims()[0];
-  auto* anchor_data = anchor->data<T>();
-  if (rpn_straddle_thresh >= 0) {
-    int index;
-    for (int i = 0; i < anchor_num; ++i) {
-      index = i * 4;
-      if ((anchor_data[index + 0] >= -rpn_straddle_thresh) &&
-          (anchor_data[index + 1] >= -rpn_straddle_thresh) &&
-          (anchor_data[index + 2] < im_width + rpn_straddle_thresh) &&
-          (anchor_data[index + 3] < im_height + rpn_straddle_thresh)) {
-        inds_inside.emplace_back(i);
-      }
-    }
-  } else {
-    for (int i = 0; i < anchor_num; ++i) {
-      inds_inside.emplace_back(i);
-    }
-  }
-  int inside_num = inds_inside.size();
-  Tensor inds_inside_t;
-  int* inds_inside_data =
-      inds_inside_t.mutable_data<int>({inside_num}, context.GetPlace());
-  std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data);
-  Tensor inside_anchor_t;
-  T* inside_anchor_data =
-      inside_anchor_t.mutable_data<T>({inside_num, 4}, context.GetPlace());
-  Gather<T>(anchor->data<T>(), 4, inds_inside_data, inside_num,
-            inside_anchor_data);
-  std::vector<Tensor> res;
-  res.emplace_back(inds_inside_t);
-  res.emplace_back(inside_anchor_t);
-  return res;
-}
-
-template <typename T>
-Tensor FilterCrowdGt(const platform::CPUDeviceContext& context,
-                     Tensor* gt_boxes, Tensor* is_crowd) {
-  int gt_num = gt_boxes->dims()[0];
-  std::vector<int> not_crowd_inds;
-  auto* is_crowd_data = is_crowd->data<int>();
-  for (int i = 0; i < gt_num; ++i) {
-    if (is_crowd_data[i] == 0) {
-      not_crowd_inds.emplace_back(i);
-    }
-  }
-  int ncrowd_num = not_crowd_inds.size();
-  Tensor ncrowd_gt_boxes;
-  T* ncrowd_gt_boxes_data =
-      ncrowd_gt_boxes.mutable_data<T>({ncrowd_num, 4}, context.GetPlace());
-  Gather<T>(gt_boxes->data<T>(), 4, not_crowd_inds.data(), ncrowd_num,
-            ncrowd_gt_boxes_data);
-  return ncrowd_gt_boxes;
-}
-
-void ReservoirSampling(const int num, std::vector<int>* inds,
-                       std::minstd_rand engine, bool use_random) {
-  std::uniform_real_distribution<float> uniform(0, 1);
-  size_t len = inds->size();
-  if (len > static_cast<size_t>(num)) {
-    if (use_random) {
-      for (size_t i = num; i < len; ++i) {
-        int rng_ind = std::floor(uniform(engine) * i);
-        if (rng_ind < num)
-          std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
-      }
-    }
-    inds->resize(num);
-  }
-}
-
-template <typename T>
-void ScoreAssign(const T* anchor_by_gt_overlap_data,
-                 const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max,
-                 const int rpn_batch_size_per_im, const float rpn_fg_fraction,
-                 const float rpn_positive_overlap,
-                 const float rpn_negative_overlap, std::vector<int>* fg_inds,
-                 std::vector<int>* bg_inds, std::vector<int>* tgt_lbl,
-                 std::vector<int>* fg_fake, std::vector<T>* bbox_inside_weight,
-                 std::minstd_rand engine, bool use_random) {
-  float epsilon = 0.00001;
-  int anchor_num = anchor_to_gt_max.dims()[0];
-  int gt_num = gt_to_anchor_max.dims()[0];
-  std::vector<int> target_label(anchor_num, -1);
-  std::vector<int> fg_inds_fake;
-  std::vector<int> bg_inds_fake;
-  const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
-  const T* gt_to_anchor_max_data = gt_to_anchor_max.data<T>();
-  // TODO(buxingyuan): Match with Detectron now
-  // but it seems here is a bug in two directions assignment
-  // in which the later one may overwrites the former one.
-  for (int64_t i = 0; i < anchor_num; ++i) {
-    bool is_anchors_with_max_overlap = false;
-    for (int64_t j = 0; j < gt_num; ++j) {
-      T value = anchor_by_gt_overlap_data[i * gt_num + j];
-      T diff = std::abs(value - gt_to_anchor_max_data[j]);
-      if (diff < epsilon) {
-        is_anchors_with_max_overlap = true;
-        break;
-      }
-    }
-    bool is_anchor_great_than_thresh =
-        (anchor_to_gt_max_data[i] >= rpn_positive_overlap);
-    if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) {
-      fg_inds_fake.push_back(i);
-    }
-  }
-
-  // Reservoir Sampling
-  int fg_num = 0;
-  if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) {
-    fg_num = static_cast<int>(rpn_fg_fraction * rpn_batch_size_per_im);
-    ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random);
-  } else {
-    fg_num = static_cast<int>(fg_inds_fake.size());
-  }
-  int fg_fake_num = static_cast<int>(fg_inds_fake.size());
-  for (int64_t i = 0; i < fg_fake_num; ++i) {
-    target_label[fg_inds_fake[i]] = 1;
-  }
-
-  for (int64_t i = 0; i < anchor_num; ++i) {
-    if (anchor_to_gt_max_data[i] < rpn_negative_overlap) {
-      bg_inds_fake.push_back(i);
-    }
-  }
-  int bg_num = 0;
-  if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) {
-    bg_num = rpn_batch_size_per_im - fg_fake_num;
-    ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random);
-    bg_num = static_cast<int>(bg_inds_fake.size());
-  } else {
-    bg_num = static_cast<int>(bg_inds_fake.size());
-  }
-
-  int fake_num = 0;
-  for (int64_t i = 0; i < bg_num; ++i) {
-    // fg fake found
-    if (target_label[bg_inds_fake[i]] == 1) {
-      fake_num++;
-      fg_fake->emplace_back(fg_inds_fake[0]);
-      for (int j = 0; j < 4; ++j) {
-        bbox_inside_weight->emplace_back(T(0.));
-      }
-    }
-    target_label[bg_inds_fake[i]] = 0;
-  }
-
-  for (int64_t i = 0; i < (fg_fake_num - fake_num) * 4; ++i) {
-    bbox_inside_weight->emplace_back(T(1.));
-  }
-
-  for (int64_t i = 0; i < anchor_num; ++i) {
-    if (target_label[i] == 1) {
-      fg_inds->emplace_back(i);
-      fg_fake->emplace_back(i);
-    }
-    if (target_label[i] == 0) bg_inds->emplace_back(i);
-  }
-  fg_num = fg_inds->size();
-  bg_num = bg_inds->size();
-
-  tgt_lbl->resize(fg_num + bg_num, 0);
-  std::vector<int> fg_lbl(fg_num, 1);
-  std::vector<int> bg_lbl(bg_num, 0);
-  std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data());
-  std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num);
-}
-
-template <typename T>
-std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
-                                    const Tensor& anchor_by_gt_overlap,
-                                    const int rpn_batch_size_per_im,
-                                    const float rpn_positive_overlap,
-                                    const float rpn_negative_overlap,
-                                    const float rpn_fg_fraction,
-                                    std::minstd_rand engine, bool use_random) {
-  auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data<T>();
-  int anchor_num = anchor_by_gt_overlap.dims()[0];
-  int gt_num = anchor_by_gt_overlap.dims()[1];
-
-  std::vector<int> fg_inds;
-  std::vector<int> bg_inds;
-  std::vector<int> gt_inds;
-  std::vector<int> tgt_lbl;
-  std::vector<int> fg_fake;
-  std::vector<T> bbox_inside_weight;
-  // Calculate the max IoU between anchors and gt boxes
-  // Map from anchor to gt box that has highest overlap
-  auto place = ctx.GetPlace();
-  Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
-  anchor_to_gt_max.mutable_data<T>({anchor_num}, place);
-  int* argmax = anchor_to_gt_argmax.mutable_data<int>({anchor_num}, place);
-  gt_to_anchor_max.mutable_data<T>({gt_num}, place);
-
-  auto anchor_by_gt_overlap_et =
-      framework::EigenMatrix<T>::From(anchor_by_gt_overlap);
-  auto anchor_to_gt_max_et =
-      framework::EigenVector<T>::Flatten(anchor_to_gt_max);
-  auto gt_to_anchor_max_et =
-      framework::EigenVector<T>::Flatten(gt_to_anchor_max);
-  auto anchor_to_gt_argmax_et =
-      framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
-  anchor_to_gt_max_et =
-      anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(1));
-  anchor_to_gt_argmax_et =
-      anchor_by_gt_overlap_et.argmax(1).template cast<int>();
-  gt_to_anchor_max_et =
-      anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(0));
-
-  // Follow the Faster RCNN's implementation
-  ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max,
-              rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap,
-              rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, &fg_fake,
-              &bbox_inside_weight, engine, use_random);
-
-  int fg_num = fg_inds.size();
-  int bg_num = bg_inds.size();
-  int fg_fake_num = fg_fake.size();
-  gt_inds.reserve(fg_fake_num);
-  for (int i = 0; i < fg_fake_num; ++i) {
-    gt_inds.emplace_back(argmax[fg_fake[i]]);
-  }
-  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t;
-  int* loc_index_data = loc_index_t.mutable_data<int>({fg_fake_num}, place);
-  int* score_index_data =
-      score_index_t.mutable_data<int>({fg_num + bg_num}, place);
-  int* tgt_lbl_data = tgt_lbl_t.mutable_data<int>({fg_num + bg_num}, place);
-  int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_fake_num}, place);
-  T* bbox_inside_weight_data =
-      bbox_inside_weight_t.mutable_data<T>({fg_fake_num, 4}, place);
-  std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data);
-  std::copy(fg_inds.begin(), fg_inds.end(), score_index_data);
-  std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num);
-  std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data);
-  std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data);
-  std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(),
-            bbox_inside_weight_data);
-  std::vector<Tensor> loc_score_tgtlbl_gt;
-  loc_score_tgtlbl_gt.emplace_back(loc_index_t);
-  loc_score_tgtlbl_gt.emplace_back(score_index_t);
-  loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
-  loc_score_tgtlbl_gt.emplace_back(gt_inds_t);
-  loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t);
-
-  return loc_score_tgtlbl_gt;
-}
-
-template <typename T>
-class RpnTargetAssignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* anchor = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
-    auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
-    auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
-    auto* im_info = context.Input<LoDTensor>("ImInfo");
-
-    auto* loc_index = context.Output<LoDTensor>("LocationIndex");
-    auto* score_index = context.Output<LoDTensor>("ScoreIndex");
-    auto* tgt_bbox = context.Output<LoDTensor>("TargetBBox");
-    auto* tgt_lbl = context.Output<LoDTensor>("TargetLabel");
-    auto* bbox_inside_weight = context.Output<LoDTensor>("BBoxInsideWeight");
-
-    PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
-                      "RpnTargetAssignOp gt_boxes needs 1 level of LoD");
-    PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
-                      "RpnTargetAssignOp is_crowd needs 1 level of LoD");
-    int64_t anchor_num = static_cast<int64_t>(anchor->dims()[0]);
-    int64_t batch_num = static_cast<int64_t>(gt_boxes->lod().back().size() - 1);
-
-    int rpn_batch_size_per_im = context.Attr<int>("rpn_batch_size_per_im");
-    float rpn_straddle_thresh = context.Attr<float>("rpn_straddle_thresh");
-    float rpn_positive_overlap = context.Attr<float>("rpn_positive_overlap");
-    float rpn_negative_overlap = context.Attr<float>("rpn_negative_overlap");
-    float rpn_fg_fraction = context.Attr<float>("rpn_fg_fraction");
-    bool use_random = context.Attr<bool>("use_random");
-
-    int64_t max_num = batch_num * rpn_batch_size_per_im;
-    auto place = context.GetPlace();
-
-    loc_index->mutable_data<int>({max_num}, place);
-    score_index->mutable_data<int>({max_num}, place);
-    tgt_bbox->mutable_data<T>({max_num, 4}, place);
-    tgt_lbl->mutable_data<int>({max_num, 1}, place);
-    bbox_inside_weight->mutable_data<T>({max_num, 4}, place);
-    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
-
-    std::random_device rnd;
-    std::minstd_rand engine;
-    int seed = rnd();
-    engine.seed(seed);
-
-    framework::LoD lod_loc, loc_score;
-    std::vector<size_t> lod0_loc(1, 0);
-    std::vector<size_t> lod0_score(1, 0);
-
-    int total_loc_num = 0;
-    int total_score_num = 0;
-    auto gt_boxes_lod = gt_boxes->lod().back();
-    auto is_crowd_lod = is_crowd->lod().back();
-    for (int i = 0; i < batch_num; ++i) {
-      Tensor gt_boxes_slice =
-          gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
-      Tensor is_crowd_slice =
-          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      auto* im_info_data = im_info_slice.data<T>();
-      auto im_height = im_info_data[0];
-      auto im_width = im_info_data[1];
-      auto im_scale = im_info_data[2];
-
-      // Filter straddle anchor
-      std::vector<Tensor> filter_output = FilterStraddleAnchor<T>(
-          dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width);
-      Tensor inds_inside = filter_output[0];
-      Tensor inside_anchor = filter_output[1];
-
-      // Filter crowd gt
-      Tensor ncrowd_gt_boxes =
-          FilterCrowdGt<T>(dev_ctx, &gt_boxes_slice, &is_crowd_slice);
-      auto ncrowd_gt_boxes_et =
-          framework::EigenTensor<T, 2>::From(ncrowd_gt_boxes);
-      ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale;
-
-      Tensor anchor_by_gt_overlap;
-      anchor_by_gt_overlap.mutable_data<T>(
-          {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place);
-      BboxOverlaps<T>(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap);
-
-      auto loc_score_tgtlbl_gt = SampleRpnFgBgGt<T>(
-          dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im,
-          rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine,
-          use_random);
-
-      Tensor sampled_loc_index = loc_score_tgtlbl_gt[0];
-      Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
-      Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
-      Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
-      Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4];
-
-      int loc_num = sampled_loc_index.dims()[0];
-      int score_num = sampled_score_index.dims()[0];
-      // unmap to all anchor
-      Tensor sampled_loc_index_unmap, sampled_score_index_unmap;
-      sampled_loc_index_unmap.mutable_data<int>({loc_num}, place);
-      sampled_score_index_unmap.mutable_data<int>({score_num}, place);
-      Gather<int>(inds_inside.data<int>(), 1, sampled_loc_index.data<int>(),
-                  loc_num, sampled_loc_index_unmap.data<int>());
-      Gather<int>(inds_inside.data<int>(), 1, sampled_score_index.data<int>(),
-                  score_num, sampled_score_index_unmap.data<int>());
-
-      // get target bbox deltas
-      Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
-      auto* sampled_anchor_data =
-          sampled_anchor.mutable_data<T>({loc_num, 4}, place);
-      auto* sampled_gt_data = sampled_gt.mutable_data<T>({loc_num, 4}, place);
-      Gather<T>(anchor->data<T>(), 4, sampled_loc_index_unmap.data<int>(),
-                loc_num, sampled_anchor_data);
-      Gather<T>(ncrowd_gt_boxes.data<T>(), 4, sampled_gt_index.data<int>(),
-                loc_num, sampled_gt_data);
-      sampled_tgt_bbox.mutable_data<T>({loc_num, 4}, place);
-      BoxToDelta<T>(loc_num, sampled_anchor, sampled_gt, nullptr, false,
-                    &sampled_tgt_bbox);
-
-      // Add anchor offset
-      int anchor_offset = i * anchor_num;
-      auto sampled_loc_index_unmap_et =
-          framework::EigenTensor<int, 1>::From(sampled_loc_index_unmap);
-      sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset;
-      auto sampled_score_index_unmap_et =
-          framework::EigenTensor<int, 1>::From(sampled_score_index_unmap);
-      sampled_score_index_unmap_et =
-          sampled_score_index_unmap_et + anchor_offset;
-      AppendRpns<int>(loc_index, total_loc_num, &sampled_loc_index_unmap);
-      AppendRpns<int>(score_index, total_score_num, &sampled_score_index_unmap);
-      AppendRpns<T>(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox);
-      AppendRpns<int>(tgt_lbl, total_score_num, &sampled_tgtlbl);
-      AppendRpns<T>(bbox_inside_weight, total_loc_num * 4,
-                    &sampled_bbox_inside_weight);
-      total_loc_num += loc_num;
-
-      total_score_num += score_num;
-      lod0_loc.emplace_back(total_loc_num);
-      lod0_score.emplace_back(total_score_num);
-    }
-
-    PADDLE_ENFORCE_LE(total_loc_num, max_num);
-    PADDLE_ENFORCE_LE(total_score_num, max_num);
-
-    lod_loc.emplace_back(lod0_loc);
-    loc_score.emplace_back(lod0_score);
-    loc_index->set_lod(lod_loc);
-    score_index->set_lod(loc_score);
-    tgt_bbox->set_lod(lod_loc);
-    tgt_lbl->set_lod(loc_score);
-    bbox_inside_weight->set_lod(lod_loc);
-    loc_index->Resize({total_loc_num});
-    score_index->Resize({total_score_num});
-    tgt_bbox->Resize({total_loc_num, 4});
-    tgt_lbl->Resize({total_score_num, 1});
-    bbox_inside_weight->Resize({total_loc_num, 4});
-  }
-};
-
-class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Anchor",
-             "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
-    AddInput("GtBoxes",
-             "(LoDTensor) input ground-truth bbox with shape [K, 4].");
-    AddInput("IsCrowd",
-             "(LoDTensor) input which indicates ground-truth is crowd.");
-    AddInput("ImInfo",
-             "(LoDTensor) input image information with shape [N, 3]. "
-             "N is the batch size, each image information includes height, "
-             "width and scale.");
-    AddAttr<int>("rpn_batch_size_per_im",
-                 "Total number of RPN examples per image.")
-        .SetDefault(256);
-    AddAttr<float>(
-        "rpn_straddle_thresh",
-        "Remove RPN anchors that go outside the image by straddle_thresh "
-        "pixels, "
-        "Set to -1 or a large value, e.g. 100000, to disable pruning anchors.");
-    AddAttr<float>(
-        "rpn_positive_overlap",
-        "Minimum overlap required between an anchor and ground-truth "
-        "box for the (anchor, gt box) pair to be a positive example.")
-        .SetDefault(0.7);
-    AddAttr<float>(
-        "rpn_negative_overlap",
-        "Maximum overlap allowed between an anchor and ground-truth "
-        "box for the (anchor, gt box) pair to be a negative examples.")
-        .SetDefault(0.3);
-    AddAttr<float>(
-        "rpn_fg_fraction",
-        "Target fraction of RoI minibatch that "
-        "is labeled foreground (i.e. class > 0), 0-th class is background.")
-        .SetDefault(0.25);
-    AddAttr<bool>("use_random",
-                  "A flag indicating whether to use a ReservoirSampling. "
-                  "NOTE: DO NOT set this flag to false in training. "
-                  "Setting this flag to false is only useful in unittest.")
-        .SetDefault(true);
-    AddOutput(
-        "LocationIndex",
-        "(Tensor), The indexes of foreground anchors in all RPN anchors, the "
-        "shape of the LocationIndex is [F], F depends on the value of input "
-        "tensor and attributes.");
-    AddOutput(
-        "ScoreIndex",
-        "(Tensor), The indexes of foreground and background anchors in all "
-        "RPN anchors(The rest anchors are ignored). The shape of the "
-        "ScoreIndex is [F + B], F and B are sampled foreground and background "
-        " number.");
-    AddOutput("TargetBBox",
-              "(Tensor), The target bbox deltas with shape "
-              "[F, 4], F is the sampled foreground number.");
-    AddOutput(
-        "TargetLabel",
-        "(Tensor<int>), The target labels of each anchor with shape "
-        "[F + B, 1], F and B are sampled foreground and background number.");
-    AddOutput("BBoxInsideWeight",
-              "(Tensor), The bbox inside weight with shape "
-              "[F, 4], F is the sampled foreground number.");
-    AddComment(R"DOC(
-This operator can be, for a given set of ground truth bboxes and the
-anchors, to assign classification and regression targets to each prediction.
-The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU.
-The rest anchors would not contibute to the RPN training loss
-
-ScoreIndex is composed of foreground anchor indexes(positive labels) and
-background anchor indexes(negative labels). LocationIndex is exactly same
-as the foreground anchor indexes since we can not assign regression target to 
-the background anchors.
-
-The classification targets(TargetLabel) is a binary class label (of being
-an object or not). Following the paper of Faster-RCNN, the positive labels
-are two kinds of anchors: (i) the anchor/anchors with the highest IoU
-overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap
-higher than rpn_positive_overlap(0.7) with any ground-truth box. Note that
-a single ground-truth box may assign positive labels to multiple anchors.
-A non-positive anchor is when its IoU ratio is lower than rpn_negative_overlap
-(0.3) for all ground-truth boxes. Anchors that are neither positive nor
-negative do not contribute to the training objective.
-
-)DOC");
-  }
-};
-
-class RetinanetTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Anchor",
-             "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
-    AddInput("GtBoxes",
-             "(LoDTensor) input ground-truth bbox with shape [K, 4].");
-    AddInput("GtLabels",
-             "(LoDTensor) input ground-truth label with shape [K, 1].");
-    AddInput("IsCrowd",
-             "(LoDTensor) input which indicates ground-truth is crowd.");
-    AddInput("ImInfo",
-             "(LoDTensor) input image information with shape [N, 3]. "
-             "N is the batch size, each image information includes height, "
-             "width and scale.");
-    AddAttr<float>(
-        "positive_overlap",
-        "Minimum overlap required between an anchor and ground-truth "
-        "box for the (anchor, gt box) pair to be a positive example.")
-        .SetDefault(0.5);
-    AddAttr<float>(
-        "negative_overlap",
-        "Maximum overlap allowed between an anchor and ground-truth "
-        "box for the (anchor, gt box) pair to be a negative examples.")
-        .SetDefault(0.4);
-    AddOutput(
-        "LocationIndex",
-        "(Tensor), The indexes of foreground anchors in all anchors, the "
-        "shape of the LocationIndex is [F], F depends on the value of input "
-        "tensor and attributes.");
-    AddOutput(
-        "ScoreIndex",
-        "(Tensor), The indexes of foreground and background anchors in all "
-        "RPN anchors(The rest anchors are ignored). The shape of the "
-        "ScoreIndex is [F + B], F and B are foreground and background "
-        " number.");
-    AddOutput("TargetBBox",
-              "(Tensor), The target bbox deltas with shape "
-              "[F, 4], F is the foreground number.");
-    AddOutput("TargetLabel",
-              "(Tensor<int>), The target labels of each anchor with shape "
-              "[F + B, 1], F and B are foreground and background number.");
-    AddOutput("BBoxInsideWeight",
-              "(Tensor), The bbox inside weight with shape "
-              "[F, 4], F is the foreground number.");
-    AddOutput("ForegroundNumber",
-              "(Tensor), The foreground number. "
-              "[1, 1].");
-    AddComment(R"DOC(
-    This layer can be, for given the Intersection-over-Union (IoU) overlap
-    between anchors and ground truth boxes, to assign classification and
-    regression targets to each anchor, these target labels are used for
-    train retinanet. 
-    
-    Every anchor is assigned with a length C one-hot vector of
-    classification targets, and a 4-vector of box regression targets,
-    where C is the class number. The assignment rules are as followed:
-    
-    1. Anchors are assigned to ground-truth boxes when: (i) it has the highest
-    IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher
-    than positive_overlap(0.5) with any ground-truth box.
-    
-    2. Anchors are assigned to background when its IoU ratio is lower than
-    negative_overlap (0.4) for all ground-truth boxes.
-
-    When an anchor is assigned with a ground-truth box which is the i-th category,
-    the i-th entry in its C vector of targets is set to 1 and all other entries
-    are set to 0. When an anchor is assigned with background, all entries are set
-    to 0. Anchors that are not assigned do not contribute to the training
-    objective. The regression targets are the encoded ground-truth boxes
-    associated with the assigned anchors.
-
-)DOC");
-  }
-};
-
-class RetinanetTargetAssignOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("Anchor"),
-        "Input(Anchor) of RetinanetTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasInput("GtBoxes"),
-        "Input(GtBoxes) of RetinanetTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasInput("GtLabels"),
-        "Input(GtLabels) of RetinanetTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasInput("IsCrowd"),
-        "Input(Anchor) of RetinanetTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasInput("ImInfo"),
-        "Input(ImInfo) of RetinanetTargetAssignOp should not be null");
-
-    PADDLE_ENFORCE(
-        ctx->HasOutput("LocationIndex"),
-        "Output(LocationIndex) of RetinanetTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("ScoreIndex"),
-        "Output(ScoreIndex) of RetinanetTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("TargetLabel"),
-        "Output(TargetLabel) of RetinanetTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("TargetBBox"),
-        "Output(TargetBBox) of RetinanetTargetAssignOp should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("BBoxInsideWeight"),
-                   "Output(BBoxInsideWeight) of RetinanetTargetAssignOp should "
-                   "not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("ForegroundNumber"),
-                   "Output(ForegroundNumber) of RetinanetTargetAssignOp should "
-                   "not be null");
-
-    auto anchor_dims = ctx->GetInputDim("Anchor");
-    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
-    auto gt_labels_dims = ctx->GetInputDim("GtLabels");
-    auto im_info_dims = ctx->GetInputDim("ImInfo");
-
-    PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
-                      "The rank of Input(Anchor) must be 2.");
-    PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
-                      "The rank of Input(GtBoxes) must be 2.");
-    PADDLE_ENFORCE_EQ(gt_labels_dims.size(), 2,
-                      "The rank of Input(GtLabels) must be 2.");
-    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
-                      "The rank of Input(ImInfo) must be 2.");
-
-    ctx->SetOutputDim("LocationIndex", {gt_labels_dims[0]});
-    ctx->SetOutputDim("ScoreIndex", {gt_labels_dims[0]});
-    ctx->SetOutputDim("TargetBBox", {gt_labels_dims[0], 4});
-    ctx->SetOutputDim("TargetLabel", {gt_labels_dims[0], 1});
-    ctx->SetOutputDim("BBoxInsideWeight", {gt_labels_dims[0], 4});
-    ctx->SetOutputDim("ForegroundNumber", {gt_labels_dims[0], 1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("Anchor")->type(),
-        platform::CPUPlace());
-  }
-};
-
-template <typename T>
-std::vector<Tensor> FilterCrowdGtBoxLabel(
-    const platform::CPUDeviceContext& context, Tensor* gt_boxes,
-    Tensor* gt_labels, Tensor* is_crowd) {
-  int gt_num = gt_boxes->dims()[0];
-  std::vector<int> not_crowd_inds;
-  auto* is_crowd_data = is_crowd->data<int>();
-  for (int i = 0; i < gt_num; ++i) {
-    if (is_crowd_data[i] == 0) {
-      not_crowd_inds.emplace_back(i);
-    }
-  }
-  int ncrowd_num = not_crowd_inds.size();
-  Tensor ncrowd_gt_boxes, ncrowd_gt_labels;
-  T* ncrowd_gt_boxes_data =
-      ncrowd_gt_boxes.mutable_data<T>({ncrowd_num, 4}, context.GetPlace());
-  int* ncrowd_gt_labels_data =
-      ncrowd_gt_labels.mutable_data<int>({ncrowd_num, 1}, context.GetPlace());
-  Gather<T>(gt_boxes->data<T>(), 4, not_crowd_inds.data(), ncrowd_num,
-            ncrowd_gt_boxes_data);
-  Gather<int>(gt_labels->data<int>(), 1, not_crowd_inds.data(), ncrowd_num,
-              ncrowd_gt_labels_data);
-  std::vector<Tensor> res;
-  res.emplace_back(ncrowd_gt_boxes);
-  res.emplace_back(ncrowd_gt_labels);
-  return res;
-}
-
-template <typename T>
-std::vector<Tensor> GetAllFgBgGt(const platform::CPUDeviceContext& ctx,
-                                 const Tensor& anchor_by_gt_overlap,
-                                 const Tensor& ncrowd_gt_labels,
-                                 const float positive_overlap,
-                                 const float negative_overlap,
-                                 std::minstd_rand engine) {
-  auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data<T>();
-  int anchor_num = anchor_by_gt_overlap.dims()[0];
-  int gt_num = anchor_by_gt_overlap.dims()[1];
-
-  std::vector<int> fg_inds;
-  std::vector<int> bg_inds;
-  std::vector<int> gt_inds;
-  std::vector<int> tgt_lbl;
-  std::vector<int> fg_fake;
-  std::vector<T> bbox_inside_weight;
-  // Calculate the max IoU between anchors and gt boxes
-  // Map from anchor to gt box that has highest overlap
-  auto place = ctx.GetPlace();
-  Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
-  anchor_to_gt_max.mutable_data<T>({anchor_num}, place);
-  int* argmax = anchor_to_gt_argmax.mutable_data<int>({anchor_num}, place);
-  gt_to_anchor_max.mutable_data<T>({gt_num}, place);
-
-  auto anchor_by_gt_overlap_et =
-      framework::EigenMatrix<T>::From(anchor_by_gt_overlap);
-  auto anchor_to_gt_max_et =
-      framework::EigenVector<T>::Flatten(anchor_to_gt_max);
-  auto gt_to_anchor_max_et =
-      framework::EigenVector<T>::Flatten(gt_to_anchor_max);
-  auto anchor_to_gt_argmax_et =
-      framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
-  anchor_to_gt_max_et =
-      anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(1));
-  anchor_to_gt_argmax_et =
-      anchor_by_gt_overlap_et.argmax(1).template cast<int>();
-  gt_to_anchor_max_et =
-      anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(0));
-
-  ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, -1,
-              -1, positive_overlap, negative_overlap, &fg_inds, &bg_inds,
-              &tgt_lbl, &fg_fake, &bbox_inside_weight, engine, false);
-  const int* gt_labels_data = ncrowd_gt_labels.data<int>();
-  int64_t fg_num = fg_inds.size();
-  for (int64_t i = 0; i < fg_num; ++i) {
-    int gt_idx = argmax[fg_inds[i]];
-    tgt_lbl[i] = gt_labels_data[gt_idx];
-  }
-
-  int bg_num = bg_inds.size();
-  int fg_fake_num = fg_fake.size();
-  gt_inds.reserve(fg_fake_num);
-  for (int i = 0; i < fg_fake_num; ++i) {
-    gt_inds.emplace_back(argmax[fg_fake[i]]);
-  }
-
-  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t;
-  Tensor fg_num_t;
-  int* loc_index_data = loc_index_t.mutable_data<int>({fg_fake_num}, place);
-  int* score_index_data =
-      score_index_t.mutable_data<int>({fg_num + bg_num}, place);
-  int* tgt_lbl_data = tgt_lbl_t.mutable_data<int>({fg_num + bg_num}, place);
-  int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_fake_num}, place);
-  int* fg_num_data = fg_num_t.mutable_data<int>({1}, place);
-  T* bbox_inside_weight_data =
-      bbox_inside_weight_t.mutable_data<T>({fg_fake_num, 4}, place);
-  std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data);
-  std::copy(fg_inds.begin(), fg_inds.end(), score_index_data);
-  std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num);
-  std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data);
-  std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data);
-  std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(),
-            bbox_inside_weight_data);
-  fg_num_data[0] = fg_fake.size() + 1;
-  std::vector<Tensor> loc_score_tgtlbl_gt;
-  loc_score_tgtlbl_gt.emplace_back(loc_index_t);
-  loc_score_tgtlbl_gt.emplace_back(score_index_t);
-  loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
-  loc_score_tgtlbl_gt.emplace_back(gt_inds_t);
-  loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t);
-  loc_score_tgtlbl_gt.emplace_back(fg_num_t);
-
-  return loc_score_tgtlbl_gt;
-}
-
-template <typename T>
-class RetinanetTargetAssignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* anchor = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
-    auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
-    auto* gt_labels = context.Input<LoDTensor>("GtLabels");
-    auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
-    auto* im_info = context.Input<LoDTensor>("ImInfo");
-
-    auto* loc_index = context.Output<LoDTensor>("LocationIndex");
-    auto* score_index = context.Output<LoDTensor>("ScoreIndex");
-    auto* tgt_bbox = context.Output<LoDTensor>("TargetBBox");
-    auto* tgt_lbl = context.Output<LoDTensor>("TargetLabel");
-    auto* bbox_inside_weight = context.Output<LoDTensor>("BBoxInsideWeight");
-    auto* fg_num = context.Output<LoDTensor>("ForegroundNumber");
-
-    PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
-                      "RetinanetTargetAssignOp gt_boxes needs 1 level of LoD");
-    PADDLE_ENFORCE_EQ(gt_labels->lod().size(), 1UL,
-                      "RetinanetTargetAssignOp gt_boxes needs 1 level of LoD");
-    PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
-                      "RetinanetTargetAssignOp is_crowd needs 1 level of LoD");
-
-    int64_t anchor_num = static_cast<int64_t>(anchor->dims()[0]);
-    int64_t batch_num = static_cast<int64_t>(gt_boxes->lod().back().size() - 1);
-
-    float positive_overlap = context.Attr<float>("positive_overlap");
-    float negative_overlap = context.Attr<float>("negative_overlap");
-
-    int64_t max_num = batch_num * anchor_num;
-    auto place = context.GetPlace();
-
-    loc_index->mutable_data<int>({max_num}, place);
-    score_index->mutable_data<int>({max_num}, place);
-    tgt_bbox->mutable_data<T>({max_num, 4}, place);
-    tgt_lbl->mutable_data<int>({max_num, 1}, place);
-    bbox_inside_weight->mutable_data<T>({max_num, 4}, place);
-    fg_num->mutable_data<int>({batch_num, 1}, place);
-    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
-
-    std::random_device rnd;
-    std::minstd_rand engine;
-    int seed = rnd();
-    engine.seed(seed);
-
-    framework::LoD lod_loc, loc_score, lod_fg;
-    std::vector<size_t> lod0_loc(1, 0);
-    std::vector<size_t> lod0_score(1, 0);
-    std::vector<size_t> lod0_fg(1, 0);
-
-    int total_loc_num = 0;
-    int total_score_num = 0;
-    int total_fg_num = 0;
-    auto gt_boxes_lod = gt_boxes->lod().back();
-    auto gt_labels_lod = gt_labels->lod().back();
-    auto is_crowd_lod = is_crowd->lod().back();
-    for (int i = 0; i < batch_num; ++i) {
-      Tensor gt_boxes_slice =
-          gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
-      Tensor gt_labels_slice =
-          gt_labels->Slice(gt_labels_lod[i], gt_labels_lod[i + 1]);
-      Tensor is_crowd_slice =
-          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      auto* im_info_data = im_info_slice.data<T>();
-      auto im_height = im_info_data[0];
-      auto im_width = im_info_data[1];
-      auto im_scale = im_info_data[2];
-
-      // Filter straddle anchor
-      std::vector<Tensor> filter_output =
-          FilterStraddleAnchor<T>(dev_ctx, anchor, -1, im_height, im_width);
-      Tensor inds_inside = filter_output[0];
-      Tensor inside_anchor = filter_output[1];
-
-      // Filter crowd gt
-      std::vector<Tensor> ncrowd_output = FilterCrowdGtBoxLabel<T>(
-          dev_ctx, &gt_boxes_slice, &gt_labels_slice, &is_crowd_slice);
-      Tensor ncrowd_gt_boxes = ncrowd_output[0];
-      Tensor ncrowd_gt_labels = ncrowd_output[1];
-
-      auto ncrowd_gt_boxes_et =
-          framework::EigenTensor<T, 2>::From(ncrowd_gt_boxes);
-      ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale;
-
-      Tensor anchor_by_gt_overlap;
-      anchor_by_gt_overlap.mutable_data<T>(
-          {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place);
-      BboxOverlaps<T>(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap);
-
-      auto loc_score_tgtlbl_gt =
-          GetAllFgBgGt<T>(dev_ctx, anchor_by_gt_overlap, ncrowd_gt_labels,
-                          positive_overlap, negative_overlap, engine);
-
-      Tensor sampled_loc_index = loc_score_tgtlbl_gt[0];
-      Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
-      Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
-      Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
-      Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4];
-      Tensor sampled_fg_num = loc_score_tgtlbl_gt[5];
-
-      int loc_num = sampled_loc_index.dims()[0];
-      int score_num = sampled_score_index.dims()[0];
-      // unmap to all anchor
-      Tensor sampled_loc_index_unmap, sampled_score_index_unmap;
-      sampled_loc_index_unmap.mutable_data<int>({loc_num}, place);
-      sampled_score_index_unmap.mutable_data<int>({score_num}, place);
-      Gather<int>(inds_inside.data<int>(), 1, sampled_loc_index.data<int>(),
-                  loc_num, sampled_loc_index_unmap.data<int>());
-      Gather<int>(inds_inside.data<int>(), 1, sampled_score_index.data<int>(),
-                  score_num, sampled_score_index_unmap.data<int>());
-
-      // get target bbox deltas
-      Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
-      auto* sampled_anchor_data =
-          sampled_anchor.mutable_data<T>({loc_num, 4}, place);
-      auto* sampled_gt_data = sampled_gt.mutable_data<T>({loc_num, 4}, place);
-      Gather<T>(anchor->data<T>(), 4, sampled_loc_index_unmap.data<int>(),
-                loc_num, sampled_anchor_data);
-      Gather<T>(ncrowd_gt_boxes.data<T>(), 4, sampled_gt_index.data<int>(),
-                loc_num, sampled_gt_data);
-      sampled_tgt_bbox.mutable_data<T>({loc_num, 4}, place);
-      BoxToDelta<T>(loc_num, sampled_anchor, sampled_gt, nullptr, false,
-                    &sampled_tgt_bbox);
-
-      // Add anchor offset
-      int anchor_offset = i * anchor_num;
-      auto sampled_loc_index_unmap_et =
-          framework::EigenTensor<int, 1>::From(sampled_loc_index_unmap);
-      sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset;
-      auto sampled_score_index_unmap_et =
-          framework::EigenTensor<int, 1>::From(sampled_score_index_unmap);
-      sampled_score_index_unmap_et =
-          sampled_score_index_unmap_et + anchor_offset;
-      AppendRpns<int>(loc_index, total_loc_num, &sampled_loc_index_unmap);
-      AppendRpns<int>(score_index, total_score_num, &sampled_score_index_unmap);
-      AppendRpns<T>(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox);
-      AppendRpns<int>(tgt_lbl, total_score_num, &sampled_tgtlbl);
-      AppendRpns<T>(bbox_inside_weight, total_loc_num * 4,
-                    &sampled_bbox_inside_weight);
-      AppendRpns<int>(fg_num, total_fg_num, &sampled_fg_num);
-
-      total_loc_num += loc_num;
-      total_score_num += score_num;
-      total_fg_num += 1;
-      lod0_loc.emplace_back(total_loc_num);
-      lod0_score.emplace_back(total_score_num);
-      lod0_fg.emplace_back(total_fg_num);
-    }
-
-    PADDLE_ENFORCE_LE(total_loc_num, max_num);
-    PADDLE_ENFORCE_LE(total_score_num, max_num);
-    PADDLE_ENFORCE_LE(total_fg_num, batch_num);
-
-    lod_loc.emplace_back(lod0_loc);
-    loc_score.emplace_back(lod0_score);
-    lod_fg.emplace_back(lod0_fg);
-    loc_index->set_lod(lod_loc);
-    score_index->set_lod(loc_score);
-    tgt_bbox->set_lod(lod_loc);
-    tgt_lbl->set_lod(loc_score);
-    bbox_inside_weight->set_lod(lod_loc);
-    fg_num->set_lod(lod_fg);
-    loc_index->Resize({total_loc_num});
-    score_index->Resize({total_score_num});
-    tgt_bbox->Resize({total_loc_num, 4});
-    tgt_lbl->Resize({total_score_num, 1});
-    bbox_inside_weight->Resize({total_loc_num, 4});
-    fg_num->Resize({total_fg_num, 1});
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(rpn_target_assign, ops::RpnTargetAssignOp,
-                  ops::RpnTargetAssignOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(rpn_target_assign, ops::RpnTargetAssignKernel<float>,
-                       ops::RpnTargetAssignKernel<double>);
-REGISTER_OPERATOR(retinanet_target_assign, ops::RetinanetTargetAssignOp,
-                  ops::RetinanetTargetAssignOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(retinanet_target_assign,
-                       ops::RetinanetTargetAssignKernel<float>,
-                       ops::RetinanetTargetAssignKernel<double>);
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc
deleted file mode 100644
index 50ff3cb120e8199f51af1f3aaa71368da0561d3b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class SigmoidFocalLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("FgNum"), "Input(FgNum) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Label");
-    auto fg_dims = ctx->GetInputDim("FgNum");
-
-    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
-                      "Input(X) and Input(Label) shall have the same rank.");
-    PADDLE_ENFORCE_EQ(fg_dims.size(), 1, "The rank of Input(FgNum) must be 1.");
-    bool check = true;
-    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
-                                framework::product(labels_dims) <= 0)) {
-      check = false;
-    }
-
-    if (check) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                        framework::slice_ddim(labels_dims, 0, rank - 1),
-                        "Input(X) and Input(Label) shall have the same shape "
-                        "except the last dimension.");
-    }
-
-    PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL,
-                      "The last dimension of input(Label) should be 1.");
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class SigmoidFocalLossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("FgNum"), "Input(FgNum) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@GRAD) should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Label");
-    auto fg_dims = ctx->GetInputDim("FgNum");
-    auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
-                      "Input(X) and Input(Label) shall have the same rank.");
-    PADDLE_ENFORCE_EQ(fg_dims.size(), 1, "The rank of Input(FgNum) must be 1.");
-    bool check = true;
-    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
-                                framework::product(labels_dims) <= 0)) {
-      check = false;
-    }
-
-    if (check) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                        framework::slice_ddim(labels_dims, 0, rank - 1),
-                        "Input(X) and Input(Label) shall have the same shape.");
-
-      PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL,
-                        "The last dimension of input(Label) should be 1.");
-
-      PADDLE_ENFORCE_EQ(
-          framework::slice_ddim(x_dims, 0, rank),
-          framework::slice_ddim(dout_dims, 0, rank),
-          "Input(X) and Input(Out@Grad) shall have the same shape.");
-    }
-
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class SigmoidFocalLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N, D], "
-             "where N is the batch size and D is the number of classes "
-             "(excluding background). This input is a tensor of logits "
-             "computed by the previous operator.");
-    AddInput("Label",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape [N, 1]. "
-             "This input is a tensor of probabilistic labels.");
-    AddInput("FgNum",
-             "(Tensor, default Tensor<int>), a 1-D tensor with shape [1]. "
-             "This input is the number of foreground.");
-    AddOutput(
-        "Out",
-        "(Tensor, default Tensor<float>), a 2-D tensor with shape [N, D]. "
-        "This output is the focal loss.");
-    AddAttr<float>(
-        "gamma",
-        "Hyper-parameter of sigmoid focal loss op, which is to balance the "
-        "easy and hard examples. "
-        "A float scalar with default value 2.0.")
-        .SetDefault(2.0);
-    AddAttr<float>(
-        "alpha",
-        "Hyper-parameter of sigmoid focal loss op, which is to balance the "
-        "positive and negative examples. "
-        "A float scalar with default value 0.5.")
-        .SetDefault(0.25);
-    AddComment(R"DOC(
-Sigmoid Focal Loss Operator.
-
-Focal loss is used to address the foreground-background class imbalance existed
-on the training phase of one-stage detectors. This operator computes the sigmoid
-value for each element in the input tensor, after which focal loss is measured.
-
-The focal loss is given as follows:
-
-$$Loss_j = (-Label_j * alpha * \pow(1 - \sigma(X_j), gamma) * \log(\sigma(X_j)) -
-(1 - Labels_j) * (1 - alpha) * \pow(\sigma(X_j), gamma) * \log(1 - \sigma(X_j)))
-/ FgNum, j = 1,...,K$$
-
-We know that $$\sigma(X_j) = \\frac{1}{1 + \exp(-X_j)}$$.
-
-)DOC");
-  }
-};
-
-class SigmoidFocalLossGradOpDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sigmoid_focal_loss_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Label", Input("Label"));
-    op->SetInput("FgNum", Input("FgNum"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sigmoid_focal_loss, ops::SigmoidFocalLossOp,
-                  ops::SigmoidFocalLossOpMaker,
-                  ops::SigmoidFocalLossGradOpDescMaker);
-REGISTER_OPERATOR(sigmoid_focal_loss_grad, ops::SigmoidFocalLossGradOp);
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_focal_loss,
-    ops::SigmoidFocalLossKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SigmoidFocalLossKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_focal_loss_grad,
-    ops::SigmoidFocalLossGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SigmoidFocalLossGradKernel<paddle::platform::CPUDeviceContext,
-                                    double>);
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
deleted file mode 100644
index 4031554aa72b51a82feaaacc894af7c1dbf6e382..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
+++ /dev/null
@@ -1,181 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "cub/cub.cuh"
-#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
-#include "paddle/fluid/operators/math.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaxinumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-template <typename T>
-__global__ void GPUSigmoidFocalLossForward(const T *x_data,
-                                           const int *label_data,
-                                           const int *fg_num_data,
-                                           const T gamma, const T alpha,
-                                           const int num_classes,
-                                           const int limit, T *out_data) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
-    T x = x_data[i];
-    int a = i / num_classes;  // current sample
-    int d = i % num_classes;  // current class
-    int g = label_data[a];    // target
-
-    // check whether the input data is positive or negative
-    // the target classes are in range 1-81
-    // and the d is in range 0-80
-    T c_pos = static_cast<T>(g == (d + 1));
-    T c_neg = static_cast<T>((g != -1) & (g != (d + 1)));
-
-    T fg_num = static_cast<T>((fg_num_data[0] > 1) ? fg_num_data[0] : 1);
-    T s_neg = (1.0 - alpha) / fg_num;
-    T s_pos = alpha / fg_num;
-
-    // p = 1. / 1. + expf(-x)
-    T p = 1. / (1. + real_exp(-x));
-
-    // (1 - p)**gamma * log(p)
-    T term_pos = std::pow(static_cast<T>(1. - p), gamma) *
-                 real_log(p > FLT_MIN ? p : FLT_MIN);
-    // p**gamma * log(1 - p)
-    T term_neg =
-        std::pow(p, gamma) *
-        (-1. * x * (x >= 0) - real_log(1. + real_exp(x - 2. * x * (x >= 0))));
-
-    out_data[i] = 0.0;
-    out_data[i] += -c_pos * term_pos * s_pos;
-    out_data[i] += -c_neg * term_neg * s_neg;
-  }
-}
-
-template <typename T>
-__global__ void GPUSigmoidFocalLossBackward(
-    const T *x_data, const int *label_data, const int *fg_num_data,
-    const T gamma, const T alpha, const int num_classes, const T *dout_data,
-    const int limit, T *dx_data) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
-    T x = x_data[i];
-    T dout = dout_data[i];
-
-    int a = i / num_classes;  // current sample
-    int d = i % num_classes;  // current class
-
-    T fg_num = static_cast<T>((fg_num_data[0] > 1) ? fg_num_data[0] : 1);
-    T s_neg = (1.0 - alpha) / fg_num;
-    T s_pos = alpha / fg_num;
-
-    int g = label_data[a];
-    T c_pos = static_cast<T>(g == (d + 1));
-    T c_neg = static_cast<T>((g != -1) & (g != (d + 1)));
-
-    T p = 1. / (1. + real_exp(-x));
-
-    // (1-p)**g * (1 - p - g*p*log(p))
-    T term_pos = std::pow(static_cast<T>(1. - p), gamma) *
-                 (1. - p - (p * gamma * real_log(p > FLT_MIN ? p : FLT_MIN)));
-    // (p**g) * (g*(1-p)*log(1-p) - p)
-    T term_neg =
-        std::pow(p, gamma) *
-        ((-1. * x * (x >= 0) - real_log(1. + real_exp(x - 2. * x * (x >= 0)))) *
-             (1. - p) * gamma -
-         p);
-
-    dx_data[i] = 0.0;
-    dx_data[i] += -c_pos * s_pos * term_pos;
-    dx_data[i] += -c_neg * s_neg * term_neg;
-    dx_data[i] = dx_data[i] * dout;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class GPUSigmoidFocalLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    const Tensor *FgNum = context.Input<Tensor>("FgNum");
-    Tensor *Out = context.Output<Tensor>("Out");
-    T gamma = static_cast<T>(context.Attr<float>("gamma"));
-    T alpha = static_cast<T>(context.Attr<float>("alpha"));
-    auto x_dims = X->dims();
-    int num_classes = static_cast<int>(x_dims[1]);
-    auto out_data = Out->mutable_data<T>(context.GetPlace());
-
-    auto &dev_ctx = context.cuda_device_context();
-
-    int limit = Out->numel();
-    int blocks = NumBlocks(limit);
-    int threads = kNumCUDAThreads;
-    GPUSigmoidFocalLossForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        X->data<T>(), Labels->data<int>(), FgNum->data<int>(), gamma, alpha,
-        num_classes, limit, out_data);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GPUSigmoidFocalLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    const Tensor *FgNum = context.Input<Tensor>("FgNum");
-    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    auto dx_data = dX->mutable_data<T>(context.GetPlace());
-    T gamma = static_cast<T>(context.Attr<float>("gamma"));
-    T alpha = static_cast<T>(context.Attr<float>("alpha"));
-    auto x_dims = X->dims();
-    int num_classes = static_cast<int>(x_dims[1]);
-
-    auto &dev_ctx = context.cuda_device_context();
-
-    int limit = dX->numel();
-    int blocks = NumBlocks(limit);
-    int threads = kNumCUDAThreads;
-    GPUSigmoidFocalLossBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        X->data<T>(), Labels->data<int>(), FgNum->data<int>(), gamma, alpha,
-        num_classes, dOut->data<T>(), limit, dx_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sigmoid_focal_loss,
-    ops::GPUSigmoidFocalLossKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUSigmoidFocalLossKernel<paddle::platform::CUDADeviceContext,
-                                   double>);
-REGISTER_OP_CUDA_KERNEL(
-    sigmoid_focal_loss_grad,
-    ops::GPUSigmoidFocalLossGradKernel<paddle::platform::CUDADeviceContext,
-                                       float>,
-    ops::GPUSigmoidFocalLossGradKernel<paddle::platform::CUDADeviceContext,
-                                       double>);
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
deleted file mode 100644
index 518295958630c00bc44015ffd80d13a1b1d1f68c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <limits>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class SigmoidFocalLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    const Tensor *FgNum = context.Input<Tensor>("FgNum");
-    Tensor *Out = context.Output<Tensor>("Out");
-    T gamma = static_cast<T>(context.Attr<float>("gamma"));
-    T alpha = static_cast<T>(context.Attr<float>("alpha"));
-    auto out_data = Out->mutable_data<T>(context.GetPlace());
-    int limit = Out->numel();
-    auto x_data = X->data<T>();
-    auto label_data = Labels->data<int>();
-    auto fg_num_data = FgNum->data<int>();
-    auto x_dims = X->dims();
-    int num_classes = static_cast<int>(x_dims[1]);
-
-    for (int idx = 0; idx < limit; ++idx) {
-      T x = x_data[idx];
-      int a = idx / num_classes;  // current sample
-      int d = idx % num_classes;  // current class
-      int g = label_data[a];      // target
-
-      // Check whether the input data is positive or negative
-      // The target classes are in range 1-81
-      // and the d is in range 0-80
-      T c_pos = static_cast<T>(g == (d + 1));
-      T c_neg = static_cast<T>((g != -1) & (g != (d + 1)));
-      T fg_num = static_cast<T>((fg_num_data[0] > 1) ? fg_num_data[0] : 1);
-      T s_neg = (1.0 - alpha) / fg_num;
-      T s_pos = alpha / fg_num;
-
-      // p = 1. / 1. + expf(-x)
-      T p = 1. / (1. + std::exp(-x));
-
-      // (1 - p)**gamma * log(p) where
-      T term_pos = std::pow(static_cast<T>(1. - p), gamma) *
-                   std::log(p > FLT_MIN ? p : FLT_MIN);
-      // p**gamma * log(1 - p)
-      T term_neg =
-          std::pow(p, gamma) *
-          (-1. * x * (x >= 0) - std::log(1. + std::exp(x - 2. * x * (x >= 0))));
-
-      out_data[idx] = 0.0;
-      out_data[idx] += -c_pos * term_pos * s_pos;
-      out_data[idx] += -c_neg * term_neg * s_neg;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SigmoidFocalLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    const Tensor *FgNum = context.Input<Tensor>("FgNum");
-    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    auto dx_data = dX->mutable_data<T>(context.GetPlace());
-    T gamma = static_cast<T>(context.Attr<float>("gamma"));
-    T alpha = static_cast<T>(context.Attr<float>("alpha"));
-    auto x_dims = X->dims();
-    int num_classes = static_cast<int>(x_dims[1]);
-
-    int limit = dX->numel();
-    auto x_data = X->data<T>();
-    auto label_data = Labels->data<int>();
-    auto fg_num_data = FgNum->data<int>();
-    auto dout_data = dOut->data<T>();
-    for (int idx = 0; idx < limit; ++idx) {
-      T x = x_data[idx];
-      int a = idx / num_classes;  // current sample
-      int d = idx % num_classes;  // current class
-
-      T fg_num = static_cast<T>((fg_num_data[0] > 1) ? fg_num_data[0] : 1);
-      T s_neg = static_cast<T>((1.0 - alpha) / fg_num);
-      T s_pos = alpha / fg_num;
-      int g = label_data[a];
-
-      T c_pos = static_cast<T>(g == (d + 1));
-      T c_neg = static_cast<T>((g != -1) & (g != (d + 1)));
-      T p = 1. / (1. + std::exp(-x));
-
-      // (1-p)**g * (1 - p - g*p*log(p))
-      T term_pos = std::pow(static_cast<T>(1. - p), gamma) *
-                   (1. - p - (p * gamma * std::log(p > FLT_MIN ? p : FLT_MIN)));
-      // (p**g) * (g*(1-p)*log(1-p) - p)
-      T term_neg = std::pow(p, gamma) *
-                   ((-1. * x * (x >= 0) -
-                     std::log(1. + std::exp(x - 2. * x * (x >= 0)))) *
-                        (1. - p) * gamma -
-                    p);
-      dx_data[idx] = 0.0;
-      dx_data[idx] += -c_pos * s_pos * term_pos;
-      dx_data[idx] += -c_neg * s_neg * term_neg;
-      dx_data[idx] = dx_data[idx] * dout_data[idx];
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc
deleted file mode 100644
index c057c82ce0f5eef67c09d0ed719ddd24382f451d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/target_assign_op.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/target_assign_op.h"
-
-namespace paddle {
-namespace operators {
-
-class TargetAssignOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of TargetAssignOp should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("MatchIndices"),
-                   "Input(MatchIndices) of TargetAssignOp should not be null");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of TargetAssignOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutWeight"),
-                   "Output(OutWeight) of TargetAssignOp should not be null.");
-
-    auto in_dims = ctx->GetInputDim("X");
-    auto mi_dims = ctx->GetInputDim("MatchIndices");
-
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3, "The rank of Input(X) must be 3.");
-    PADDLE_ENFORCE_EQ(mi_dims.size(), 2,
-                      "The rank of Input(MatchIndices) must be 2.");
-
-    if (ctx->HasInput("NegIndices")) {
-      auto neg_dims = ctx->GetInputDim("NegIndices");
-      PADDLE_ENFORCE_EQ(neg_dims.size(), 2,
-                        "The rank of Input(NegIndices) must be 2.");
-      PADDLE_ENFORCE_EQ(neg_dims[1], 1,
-                        "The last dimenstion of Out(NegIndices) must be 1.");
-    }
-
-    auto n = mi_dims[0];
-    auto m = mi_dims[1];
-    auto k = in_dims[in_dims.size() - 1];
-    ctx->SetOutputDim("Out", {n, m, k});
-    ctx->SetOutputDim("OutWeight", {n, m, 1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor), This input is a 3D LoDTensor with shape [M, P, K]. "
-             "Some elements in X will be assigned to Out based on the "
-             "MatchIndices and NegIndices.");
-    AddInput("MatchIndices",
-             "(Tensor, default Tensor<int>), The input matched indices "
-             "with shape [N, P], If MatchIndices[i][j] is -1, the j-th entity "
-             "of column is not matched to any entity of row in i-th instance.");
-    AddInput("NegIndices",
-             "(LoDTensor, default LoDTensor<int>), The input negative example "
-             "indices are an optional input with shape [Neg, 1], where Neg is "
-             "the total number of negative example indices.")
-        .AsDispensable();
-    AddAttr<int>("mismatch_value",
-                 "(int, default 0), Fill this value to the "
-                 "mismatched location.")
-        .SetDefault(0);
-    AddOutput("Out",
-              "(Tensor), The output is a 3D Tensor with shape [N, P, K], "
-              "N and P is the same as they are in NegIndices, K is the "
-              "same as it in input of X. If MatchIndices[i][j] "
-              "is -1, the Out[i][j][0 : K] is the mismatch_value.");
-    AddOutput("OutWeight",
-              "(Tensor), The weight for output with the shape of [N, P, 1]");
-    AddComment(R"DOC(
-This operator can be, for given the target bounding boxes or labels,
-to assign classification and regression targets to each prediction as well as
-weights to prediction. The weights is used to specify which prediction would
-not contribute to training loss.
-
-For each instance, the output `Out` and`OutWeight` are assigned based on
-`MatchIndices` and `NegIndices`.
-Assumed that the row offset for each instance in `X` is called lod,
-this operator assigns classification/regression targets by performing the
-following steps:
-
-1. Assigning all outpts based on `MatchIndices`:
-
-If id = MatchIndices[i][j] > 0,
-
-    Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
-    OutWeight[i][j] = 1.
-
-Otherwise, 
-
-    Out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
-    OutWeight[i][j] = 0.
-
-2. Assigning OutWeight based on `NegIndices` if `NegIndices` is provided:
-
-Assumed that the row offset for each instance in `NegIndices` is called neg_lod,
-for i-th instance and each `id` of NegIndices in this instance:
-
-    Out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
-    OutWeight[i][id] = 1.0
-
-    )DOC");
-  }
-};
-
-template <typename T, typename WT>
-struct NegTargetAssignFunctor<platform::CPUDeviceContext, T, WT> {
-  void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices,
-                  const size_t* lod, const int N, const int M, const int K,
-                  const int mismatch_value, T* out, WT* out_wt) {
-    for (int i = 0; i < N; ++i) {
-      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
-        int id = neg_indices[j];
-        int off = (i * M + id) * K;
-        for (int k = 0; k < K; ++k) {
-          out[off + k] = mismatch_value;
-          out_wt[off + k] = static_cast<WT>(1.0);
-        }
-      }
-    }
-  }
-};
-
-template struct NegTargetAssignFunctor<platform::CPUDeviceContext, int, float>;
-template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float,
-                                       float>;
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(target_assign, ops::TargetAssignOp, ops::TargetAssignOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    target_assign,
-    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>,
-    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float, float>);
diff --git a/paddle/fluid/operators/detection/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu
deleted file mode 100644
index ddf6889942355457fb281b6c33430ab8337db3ed..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/target_assign_op.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection/target_assign_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename WT>
-__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
-                                      const int N, const int M, const int K,
-                                      const int mismatch_value, T* out,
-                                      WT* out_wt) {
-  int bidx = blockIdx.x;
-  int st = lod[bidx];
-  int ed = lod[bidx + 1];
-
-  int row_start = bidx * M;
-  for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
-    int id = row_start + neg_indices[i];
-    for (int k = 0; k < K; ++k) {
-      out[id * K + k] = T(mismatch_value);
-      out_wt[id * K + k] = WT(1.);
-    }
-  }
-}
-
-template <typename T, typename WT>
-struct NegTargetAssignFunctor<platform::CUDADeviceContext, T, WT> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const int* neg_indices, const size_t* lod, const int N,
-                  const int M, const int K, const int mismatch_value, T* out,
-                  WT* out_wt) {
-    const int block_size = 256;
-    const int grid_size = N;
-    NegTargetAssignKernel<T, WT><<<grid_size, block_size, 0, ctx.stream()>>>(
-        neg_indices, lod, N, M, K, mismatch_value, out, out_wt);
-  }
-};
-
-template struct NegTargetAssignFunctor<platform::CUDADeviceContext, int, float>;
-template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float,
-                                       float>;
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    target_assign,
-    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, int, float>,
-    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float, float>);
diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h
deleted file mode 100644
index 691e3276f9bbaadd1c438c1fb01264a29b05fdee..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/target_assign_op.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-template <typename T, typename WT>
-struct TargetAssignFunctor {
-  const T* in_;
-  const int* match_indices_;
-  const size_t* lod_;
-  const int mismatch_value_;
-  const int64_t N_;
-  const int64_t M_;
-  const int64_t P_;
-  const int64_t K_;
-
-  T* out_;
-  WT* out_wt_;
-
-  TargetAssignFunctor(const T* input, const int* match_indices,
-                      const size_t* lod, const int mismatch_value,
-                      const int64_t N, const int64_t M, const int64_t P,
-                      const int64_t K, T* out, WT* out_wt)
-      : in_(input),
-        match_indices_(match_indices),
-        lod_(lod),
-        mismatch_value_(mismatch_value),
-        N_(N),
-        M_(M),
-        P_(P),
-        K_(K),
-        out_(out),
-        out_wt_(out_wt) {}
-
-  HOSTDEVICE void operator()(size_t i) const {
-    int h = i / M_;
-    int w = i - h * M_;
-
-    size_t off = lod_[h];
-    int id = match_indices_[i];
-
-    T* out = out_ + i * K_;
-    WT* out_wt = out_wt_ + i;
-
-    if (id > -1) {
-      int w_off = w % P_;
-      const T* in = in_ + ((off + id) * P_ + w_off) * K_;
-      for (int64_t k = 0; k < K_; ++k) {
-        out[k] = in[k];
-      }
-      out_wt[0] = static_cast<WT>(1.);
-    } else {
-      for (int64_t k = 0; k < K_; ++k) {
-        out[k] = static_cast<T>(mismatch_value_);
-      }
-      out_wt[0] = static_cast<WT>(0.);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T, typename WT>
-struct NegTargetAssignFunctor {
-  void operator()(const platform::DeviceContext& ctx, const int* neg_indices,
-                  const size_t* lod, const int N, const int M, const int K,
-                  const int mismatch_value, T* out, WT* out_wt) const;
-};
-
-template <typename DeviceContext, typename T, typename WT>
-class TargetAssignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* match_indices = ctx.Input<framework::Tensor>("MatchIndices");
-
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto* out_wt = ctx.Output<framework::Tensor>("OutWeight");
-
-    PADDLE_ENFORCE_EQ(x->lod().size(), 1UL);
-    int mismatch_value = ctx.Attr<int>("mismatch_value");
-
-    const T* x_data = x->data<T>();
-    const int* match_idx_data = match_indices->data<int>();
-
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-    WT* out_wt_data = out_wt->mutable_data<WT>(ctx.GetPlace());
-
-    int64_t n = match_indices->dims()[0];
-    int64_t m = match_indices->dims()[1];
-    int64_t p = x->dims()[1];
-    int64_t k = x->dims()[2];
-
-    auto x_lod = x->lod().back();
-#if defined(PADDLE_WITH_CUDA)
-    size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
-#else
-    size_t* x_lod_data = x_lod.data();
-#endif
-
-    TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
-                                       mismatch_value, n, m, p, k, out_data,
-                                       out_wt_data);
-
-    auto& device_ctx = ctx.template device_context<DeviceContext>();
-    platform::ForRange<DeviceContext> for_range(device_ctx, n * m);
-    for_range(functor);
-
-    auto* neg_indices = ctx.Input<framework::LoDTensor>("NegIndices");
-    if (neg_indices) {
-      PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
-      const int* neg_idx_data = neg_indices->data<int>();
-      auto neg_lod = neg_indices->lod().back();
-#if defined(PADDLE_WITH_CUDA)
-      size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
-#else
-      size_t* neg_lod_data = neg_lod.data();
-#endif
-      NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
-      neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
-                      mismatch_value, out_data, out_wt_data);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc
deleted file mode 100644
index e0d7e25d944cf2321799da4c73de9f74d9fd287d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/yolo_box_op.cc
+++ /dev/null
@@ -1,167 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/detection/yolo_box_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class YoloBoxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of YoloBoxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ImgSize"),
-                   "Input(ImgSize) of YoloBoxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Boxes"),
-                   "Output(Boxes) of YoloBoxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Scores"),
-                   "Output(Scores) of YoloBoxOp should not be null.");
-
-    auto dim_x = ctx->GetInputDim("X");
-    auto dim_imgsize = ctx->GetInputDim("ImgSize");
-    auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
-    int anchor_num = anchors.size() / 2;
-    auto class_num = ctx->Attrs().Get<int>("class_num");
-
-    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
-    PADDLE_ENFORCE_EQ(
-        dim_x[1], anchor_num * (5 + class_num),
-        "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
-        "+ class_num)).");
-    PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2,
-                      "Input(ImgSize) should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(
-        dim_imgsize[0], dim_x[0],
-        "Input(ImgSize) dim[0] and Input(X) dim[0] should be same.");
-    PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2.");
-    PADDLE_ENFORCE_GT(anchors.size(), 0,
-                      "Attr(anchors) length should be greater than 0.");
-    PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
-                      "Attr(anchors) length should be even integer.");
-    PADDLE_ENFORCE_GT(class_num, 0,
-                      "Attr(class_num) should be an integer greater than 0.");
-
-    int box_num = dim_x[2] * dim_x[3] * anchor_num;
-    std::vector<int64_t> dim_boxes({dim_x[0], box_num, 4});
-    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_boxes));
-
-    std::vector<int64_t> dim_scores({dim_x[0], box_num, class_num});
-    ctx->SetOutputDim("Scores", framework::make_ddim(dim_scores));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of YoloBox operator is a 4-D tensor with "
-             "shape of [N, C, H, W]. The second dimension(C) stores "
-             "box locations, confidence score and classification one-hot "
-             "keys of each anchor box. Generally, X should be the output "
-             "of YOLOv3 network.");
-    AddInput("ImgSize",
-             "The image size tensor of YoloBox operator, "
-             "This is a 2-D tensor with shape of [N, 2]. This tensor holds "
-             "height and width of each input image used for resizing output "
-             "box in input image scale.");
-    AddOutput("Boxes",
-              "The output tensor of detection boxes of YoloBox operator, "
-              "This is a 3-D tensor with shape of [N, M, 4], N is the "
-              "batch num, M is output box number, and the 3rd dimension "
-              "stores [xmin, ymin, xmax, ymax] coordinates of boxes.");
-    AddOutput("Scores",
-              "The output tensor of detection boxes scores of YoloBox "
-              "operator, This is a 3-D tensor with shape of "
-              "[N, M, :attr:`class_num`], N is the batch num, M is "
-              "output box number.");
-
-    AddAttr<int>("class_num", "The number of classes to predict.");
-    AddAttr<std::vector<int>>("anchors",
-                              "The anchor width and height, "
-                              "it will be parsed pair by pair.")
-        .SetDefault(std::vector<int>{});
-    AddAttr<int>("downsample_ratio",
-                 "The downsample ratio from network input to YoloBox operator "
-                 "input, so 32, 16, 8 should be set for the first, second, "
-                 "and thrid YoloBox operators.")
-        .SetDefault(32);
-    AddAttr<float>("conf_thresh",
-                   "The confidence scores threshold of detection boxes. "
-                   "Boxes with confidence scores under threshold should "
-                   "be ignored.")
-        .SetDefault(0.01);
-    AddComment(R"DOC(
-         This operator generates YOLO detection boxes from output of YOLOv3 network.
-         
-         The output of previous network is in shape [N, C, H, W], while H and W
-         should be the same, H and W specify the grid size, each grid point predict 
-         given number boxes, this given number, which following will be represented as S,
-         is specified by the number of anchors. In the second dimension(the channel
-         dimension), C should be equal to S * (5 + class_num), class_num is the object 
-         category number of source dataset(such as 80 in coco dataset), so the 
-         second(channel) dimension, apart from 4 box location coordinates x, y, w, h, 
-         also includes confidence score of the box and class one-hot key of each anchor 
-         box.
-
-         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box 
-         predictions should be as follows:
-
-         $$
-         b_x = \\sigma(t_x) + c_x
-         $$
-         $$
-         b_y = \\sigma(t_y) + c_y
-         $$
-         $$
-         b_w = p_w e^{t_w}
-         $$
-         $$
-         b_h = p_h e^{t_h}
-         $$
-
-         in the equation above, :math:`c_x, c_y` is the left top corner of current grid
-         and :math:`p_w, p_h` is specified by anchors.
-
-         The logistic regression value of the 5th channel of each anchor prediction boxes
-         represents the confidence score of each prediction box, and the logistic
-         regression value of the last :attr:`class_num` channels of each anchor prediction 
-         boxes represents the classifcation scores. Boxes with confidence scores less than
-         :attr:`conf_thresh` should be ignored, and box final scores is the product of 
-         confidence scores and classification scores.
-
-         $$
-         score_{pred} = score_{conf} * score_{class}
-         $$
-
-         )DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>,
-                       ops::YoloBoxKernel<double>);
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu
deleted file mode 100644
index 08ea62bc14e47f0ecad9a51215ae8a42590d0109..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/yolo_box_op.cu
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/detection/yolo_box_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
-                            T* scores, const float conf_thresh,
-                            const int* anchors, const int n, const int h,
-                            const int w, const int an_num, const int class_num,
-                            const int box_num, int input_size) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  T box[4];
-  for (; tid < n * box_num; tid += stride) {
-    int grid_num = h * w;
-    int i = tid / box_num;
-    int j = (tid % box_num) / grid_num;
-    int k = (tid % grid_num) / w;
-    int l = tid % w;
-
-    int an_stride = (5 + class_num) * grid_num;
-    int img_height = imgsize[2 * i];
-    int img_width = imgsize[2 * i + 1];
-
-    int obj_idx =
-        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4);
-    T conf = sigmoid<T>(input[obj_idx]);
-    if (conf < conf_thresh) {
-      continue;
-    }
-
-    int box_idx =
-        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0);
-    GetYoloBox<T>(box, input, anchors, l, k, j, h, input_size, box_idx,
-                  grid_num, img_height, img_width);
-    box_idx = (i * box_num + j * grid_num + k * w + l) * 4;
-    CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width);
-
-    int label_idx =
-        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5);
-    int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num;
-    CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf,
-                      grid_num);
-  }
-}
-
-template <typename T>
-class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* img_size = ctx.Input<Tensor>("ImgSize");
-    auto* boxes = ctx.Output<Tensor>("Boxes");
-    auto* scores = ctx.Output<Tensor>("Scores");
-
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    int class_num = ctx.Attr<int>("class_num");
-    float conf_thresh = ctx.Attr<float>("conf_thresh");
-    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
-
-    const int n = input->dims()[0];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-    const int box_num = boxes->dims()[1];
-    const int an_num = anchors.size() / 2;
-    int input_size = downsample_ratio * h;
-
-    auto& dev_ctx = ctx.cuda_device_context();
-    int bytes = sizeof(int) * anchors.size();
-    auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size());
-    int* anchors_data = reinterpret_cast<int*>(anchors_ptr->ptr());
-    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    const auto cplace = platform::CPUPlace();
-    memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes,
-                 dev_ctx.stream());
-
-    const T* input_data = input->data<T>();
-    const int* imgsize_data = img_size->data<int>();
-    T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
-    T* scores_data =
-        scores->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());
-    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    set_zero(dev_ctx, boxes, static_cast<T>(0));
-    set_zero(dev_ctx, scores, static_cast<T>(0));
-
-    int grid_dim = (n * box_num + 512 - 1) / 512;
-    grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-    KeYoloBoxFw<T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_data, imgsize_data, boxes_data, scores_data, conf_thresh,
-        anchors_data, n, h, w, an_num, class_num, box_num, input_size);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel<float>,
-                        ops::YoloBoxOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h
deleted file mode 100644
index 8b7c7df0f3cf754f59c994dbe5b1cc2ac5fb773b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/yolo_box_op.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-HOSTDEVICE inline T sigmoid(T x) {
-  return 1.0 / (1.0 + std::exp(-x));
-}
-
-template <typename T>
-HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i,
-                                  int j, int an_idx, int grid_size,
-                                  int input_size, int index, int stride,
-                                  int img_height, int img_width) {
-  box[0] = (i + sigmoid<T>(x[index])) * img_width / grid_size;
-  box[1] = (j + sigmoid<T>(x[index + stride])) * img_height / grid_size;
-  box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width /
-           input_size;
-  box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] *
-           img_height / input_size;
-}
-
-HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
-                                    int an_num, int an_stride, int stride,
-                                    int entry) {
-  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
-}
-
-template <typename T>
-HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx,
-                                        const int img_height,
-                                        const int img_width) {
-  boxes[box_idx] = box[0] - box[2] / 2;
-  boxes[box_idx + 1] = box[1] - box[3] / 2;
-  boxes[box_idx + 2] = box[0] + box[2] / 2;
-  boxes[box_idx + 3] = box[1] + box[3] / 2;
-
-  boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast<T>(0);
-  boxes[box_idx + 1] =
-      boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast<T>(0);
-  boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
-                           ? boxes[box_idx + 2]
-                           : static_cast<T>(img_width - 1);
-  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
-                           ? boxes[box_idx + 3]
-                           : static_cast<T>(img_height - 1);
-}
-
-template <typename T>
-HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input,
-                                      const int label_idx, const int score_idx,
-                                      const int class_num, const T conf,
-                                      const int stride) {
-  for (int i = 0; i < class_num; i++) {
-    scores[score_idx + i] = conf * sigmoid<T>(input[label_idx + i * stride]);
-  }
-}
-
-template <typename T>
-class YoloBoxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* imgsize = ctx.Input<Tensor>("ImgSize");
-    auto* boxes = ctx.Output<Tensor>("Boxes");
-    auto* scores = ctx.Output<Tensor>("Scores");
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    int class_num = ctx.Attr<int>("class_num");
-    float conf_thresh = ctx.Attr<float>("conf_thresh");
-    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
-
-    const int n = input->dims()[0];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-    const int box_num = boxes->dims()[1];
-    const int an_num = anchors.size() / 2;
-    int input_size = downsample_ratio * h;
-
-    const int stride = h * w;
-    const int an_stride = (class_num + 5) * stride;
-
-    Tensor anchors_;
-    auto anchors_data =
-        anchors_.mutable_data<int>({an_num * 2}, ctx.GetPlace());
-    std::copy(anchors.begin(), anchors.end(), anchors_data);
-
-    const T* input_data = input->data<T>();
-    const int* imgsize_data = imgsize->data<int>();
-    T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
-    memset(boxes_data, 0, boxes->numel() * sizeof(T));
-    T* scores_data =
-        scores->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());
-    memset(scores_data, 0, scores->numel() * sizeof(T));
-
-    T box[4];
-    for (int i = 0; i < n; i++) {
-      int img_height = imgsize_data[2 * i];
-      int img_width = imgsize_data[2 * i + 1];
-
-      for (int j = 0; j < an_num; j++) {
-        for (int k = 0; k < h; k++) {
-          for (int l = 0; l < w; l++) {
-            int obj_idx =
-                GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4);
-            T conf = sigmoid<T>(input_data[obj_idx]);
-            if (conf < conf_thresh) {
-              continue;
-            }
-
-            int box_idx =
-                GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0);
-            GetYoloBox<T>(box, input_data, anchors_data, l, k, j, h, input_size,
-                          box_idx, stride, img_height, img_width);
-            box_idx = (i * box_num + j * stride + k * w + l) * 4;
-            CalcDetectionBox<T>(boxes_data, box, box_idx, img_height,
-                                img_width);
-
-            int label_idx =
-                GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5);
-            int score_idx = (i * box_num + j * stride + k * w + l) * class_num;
-            CalcLabelScore<T>(scores_data, input_data, label_idx, score_idx,
-                              class_num, conf, stride);
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc
deleted file mode 100644
index 5732b180526c502efea0ca72af87b38e45bfbec2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/yolov3_loss_op.cc
+++ /dev/null
@@ -1,299 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/detection/yolov3_loss_op.h"
-#include <memory>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class Yolov3LossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of Yolov3LossOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("GTBox"),
-                   "Input(GTBox) of Yolov3LossOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("GTLabel"),
-                   "Input(GTLabel) of Yolov3LossOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
-                   "Output(Loss) of Yolov3LossOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("ObjectnessMask"),
-        "Output(ObjectnessMask) of Yolov3LossOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("GTMatchMask"),
-                   "Output(GTMatchMask) of Yolov3LossOp should not be null.");
-
-    auto dim_x = ctx->GetInputDim("X");
-    auto dim_gtbox = ctx->GetInputDim("GTBox");
-    auto dim_gtlabel = ctx->GetInputDim("GTLabel");
-    auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
-    int anchor_num = anchors.size() / 2;
-    auto anchor_mask = ctx->Attrs().Get<std::vector<int>>("anchor_mask");
-    int mask_num = anchor_mask.size();
-    auto class_num = ctx->Attrs().Get<int>("class_num");
-
-    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
-    PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3],
-                      "Input(X) dim[3] and dim[4] should be euqal.");
-    PADDLE_ENFORCE_EQ(
-        dim_x[1], mask_num * (5 + class_num),
-        "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
-        "+ class_num)).");
-    PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3,
-                      "Input(GTBox) should be a 3-D tensor");
-    PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5");
-    PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2,
-                      "Input(GTLabel) should be a 2-D tensor");
-    PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0],
-                      "Input(GTBox) and Input(GTLabel) dim[0] should be same");
-    PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1],
-                      "Input(GTBox) and Input(GTLabel) dim[1] should be same");
-    PADDLE_ENFORCE_GT(anchors.size(), 0,
-                      "Attr(anchors) length should be greater then 0.");
-    PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
-                      "Attr(anchors) length should be even integer.");
-    for (size_t i = 0; i < anchor_mask.size(); i++) {
-      PADDLE_ENFORCE_LT(
-          anchor_mask[i], anchor_num,
-          "Attr(anchor_mask) should not crossover Attr(anchors).");
-    }
-    PADDLE_ENFORCE_GT(class_num, 0,
-                      "Attr(class_num) should be an integer greater then 0.");
-
-    if (ctx->HasInput("GTScore")) {
-      auto dim_gtscore = ctx->GetInputDim("GTScore");
-      PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2,
-                        "Input(GTScore) should be a 2-D tensor");
-      PADDLE_ENFORCE_EQ(
-          dim_gtscore[0], dim_gtbox[0],
-          "Input(GTBox) and Input(GTScore) dim[0] should be same");
-      PADDLE_ENFORCE_EQ(
-          dim_gtscore[1], dim_gtbox[1],
-          "Input(GTBox) and Input(GTScore) dim[1] should be same");
-    }
-
-    std::vector<int64_t> dim_out({dim_x[0]});
-    ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
-
-    std::vector<int64_t> dim_obj_mask({dim_x[0], mask_num, dim_x[2], dim_x[3]});
-    ctx->SetOutputDim("ObjectnessMask", framework::make_ddim(dim_obj_mask));
-
-    std::vector<int64_t> dim_gt_match_mask({dim_gtbox[0], dim_gtbox[1]});
-    ctx->SetOutputDim("GTMatchMask", framework::make_ddim(dim_gt_match_mask));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of YOLOv3 loss operator, "
-             "This is a 4-D tensor with shape of [N, C, H, W]."
-             "H and W should be same, and the second dimention(C) stores"
-             "box locations, confidence score and classification one-hot"
-             "keys of each anchor box");
-    AddInput("GTBox",
-             "The input tensor of ground truth boxes, "
-             "This is a 3-D tensor with shape of [N, max_box_num, 5], "
-             "max_box_num is the max number of boxes in each image, "
-             "In the third dimention, stores x, y, w, h coordinates, "
-             "x, y is the center cordinate of boxes and w, h is the "
-             "width and height and x, y, w, h should be divided by "
-             "input image height to scale to [0, 1].");
-    AddInput("GTLabel",
-             "The input tensor of ground truth label, "
-             "This is a 2-D tensor with shape of [N, max_box_num], "
-             "and each element should be an integer to indicate the "
-             "box class id.");
-    AddInput("GTScore",
-             "The score of GTLabel, This is a 2-D tensor in same shape "
-             "GTLabel, and score values should in range (0, 1). This "
-             "input is for GTLabel score can be not 1.0 in image mixup "
-             "augmentation.")
-        .AsDispensable();
-    AddOutput("Loss",
-              "The output yolov3 loss tensor, "
-              "This is a 1-D tensor with shape of [N]");
-    AddOutput("ObjectnessMask",
-              "This is an intermediate tensor with shape of [N, M, H, W], "
-              "M is the number of anchor masks. This parameter caches the "
-              "mask for calculate objectness loss in gradient kernel.")
-        .AsIntermediate();
-    AddOutput("GTMatchMask",
-              "This is an intermediate tensor with shape of [N, B], "
-              "B is the max box number of GT boxes. This parameter caches "
-              "matched mask index of each GT boxes for gradient calculate.")
-        .AsIntermediate();
-
-    AddAttr<int>("class_num", "The number of classes to predict.");
-    AddAttr<std::vector<int>>("anchors",
-                              "The anchor width and height, "
-                              "it will be parsed pair by pair.")
-        .SetDefault(std::vector<int>{});
-    AddAttr<std::vector<int>>("anchor_mask",
-                              "The mask index of anchors used in "
-                              "current YOLOv3 loss calculation.")
-        .SetDefault(std::vector<int>{});
-    AddAttr<int>("downsample_ratio",
-                 "The downsample ratio from network input to YOLOv3 loss "
-                 "input, so 32, 16, 8 should be set for the first, second, "
-                 "and thrid YOLOv3 loss operators.")
-        .SetDefault(32);
-    AddAttr<float>("ignore_thresh",
-                   "The ignore threshold to ignore confidence loss.")
-        .SetDefault(0.7);
-    AddAttr<bool>("use_label_smooth",
-                  "Whether to use label smooth. Default True.")
-        .SetDefault(true);
-    AddComment(R"DOC(
-         This operator generates yolov3 loss based on given predict result and ground
-         truth boxes.
-         
-         The output of previous network is in shape [N, C, H, W], while H and W
-         should be the same, H and W specify the grid size, each grid point predict 
-         given number bounding boxes, this given number, which following will be represented as S,
-         is specified by the number of anchor clusters in each scale. In the second dimension(the channel
-         dimension), C should be equal to S * (class_num + 5), class_num is the object 
-         category number of source dataset(such as 80 in coco dataset), so in the 
-         second(channel) dimension, apart from 4 box location coordinates x, y, w, h, 
-         also includes confidence score of the box and class one-hot key of each anchor box.
-
-         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions
-         should be as follows:
-
-         $$
-         b_x = \\sigma(t_x) + c_x
-         $$
-         $$
-         b_y = \\sigma(t_y) + c_y
-         $$
-         $$
-         b_w = p_w e^{t_w}
-         $$
-         $$
-         b_h = p_h e^{t_h}
-         $$
-
-         In the equation above, :math:`c_x, c_y` is the left top corner of current grid
-         and :math:`p_w, p_h` is specified by anchors.
-
-         As for confidence score, it is the logistic regression value of IoU between
-         anchor boxes and ground truth boxes, the score of the anchor box which has 
-         the max IoU should be 1, and if the anchor box has IoU bigger than ignore 
-         thresh, the confidence score loss of this anchor box will be ignored.
-
-         Therefore, the yolov3 loss consists of three major parts: box location loss,
-         objectness loss and classification loss. The L1 loss is used for 
-         box coordinates (w, h), sigmoid cross entropy loss is used for box 
-         coordinates (x, y), objectness loss and classification loss.
-
-         Each groud truth box finds a best matching anchor box in all anchors. 
-         Prediction of this anchor box will incur all three parts of losses, and
-         prediction of anchor boxes with no GT box matched will only incur objectness
-         loss.
-
-         In order to trade off box coordinate losses between big boxes and small 
-         boxes, box coordinate losses will be mutiplied by scale weight, which is
-         calculated as follows.
-
-         $$
-         weight_{box} = 2.0 - t_w * t_h
-         $$
-
-         Final loss will be represented as follows.
-
-         $$
-         loss = (loss_{xy} + loss_{wh}) * weight_{box}
-              + loss_{conf} + loss_{class}
-         $$
-
-         While :attr:`use_label_smooth` is set to be :attr:`True`, the classification
-         target will be smoothed when calculating classification loss, target of 
-         positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of
-         negetive samples will be smoothed to :math:`1.0 / class\_num`.
-
-         While :attr:`GTScore` is given, which means the mixup score of ground truth 
-         boxes, all losses incured by a ground truth box will be multiplied by its 
-         mixup score.
-         )DOC");
-  }
-};
-
-class Yolov3LossOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
-                   "Input(Loss@GRAD) should not be null");
-    auto dim_x = ctx->GetInputDim("X");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType("yolov3_loss_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("GTBox", Input("GTBox"));
-    op->SetInput("GTLabel", Input("GTLabel"));
-    op->SetInput("GTScore", Input("GTScore"));
-    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
-    op->SetInput("ObjectnessMask", Output("ObjectnessMask"));
-    op->SetInput("GTMatchMask", Output("GTMatchMask"));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("GTBox"), {});
-    op->SetOutput(framework::GradVarName("GTLabel"), {});
-    op->SetOutput(framework::GradVarName("GTScore"), {});
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker,
-                  ops::Yolov3LossGradMaker);
-REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad);
-REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel<float>,
-                       ops::Yolov3LossKernel<double>);
-REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel<float>,
-                       ops::Yolov3LossGradKernel<double>);
diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h
deleted file mode 100644
index f8d49960c7c5e718d68e7af2bea3dec825fc35fd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/yolov3_loss_op.h
+++ /dev/null
@@ -1,502 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename T>
-static inline bool LessEqualZero(T x) {
-  return x < 1e-6;
-}
-
-template <typename T>
-static T SigmoidCrossEntropy(T x, T label) {
-  return (x > 0 ? x : 0.0) - x * label + std::log(1.0 + std::exp(-std::abs(x)));
-}
-
-template <typename T>
-static T L1Loss(T x, T y) {
-  return std::abs(y - x);
-}
-
-template <typename T>
-static T SigmoidCrossEntropyGrad(T x, T label) {
-  return 1.0 / (1.0 + std::exp(-x)) - label;
-}
-
-template <typename T>
-static T L1LossGrad(T x, T y) {
-  return x > y ? 1.0 : -1.0;
-}
-
-static int GetMaskIndex(std::vector<int> mask, int val) {
-  for (size_t i = 0; i < mask.size(); i++) {
-    if (mask[i] == val) {
-      return i;
-    }
-  }
-  return -1;
-}
-
-template <typename T>
-struct Box {
-  T x, y, w, h;
-};
-
-template <typename T>
-static inline T sigmoid(T x) {
-  return 1.0 / (1.0 + std::exp(-x));
-}
-
-template <typename T>
-static inline Box<T> GetYoloBox(const T* x, std::vector<int> anchors, int i,
-                                int j, int an_idx, int grid_size,
-                                int input_size, int index, int stride) {
-  Box<T> b;
-  b.x = (i + sigmoid<T>(x[index])) / grid_size;
-  b.y = (j + sigmoid<T>(x[index + stride])) / grid_size;
-  b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] / input_size;
-  b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] / input_size;
-  return b;
-}
-
-template <typename T>
-static inline Box<T> GetGtBox(const T* gt, int batch, int max_boxes, int idx) {
-  Box<T> b;
-  b.x = gt[(batch * max_boxes + idx) * 4];
-  b.y = gt[(batch * max_boxes + idx) * 4 + 1];
-  b.w = gt[(batch * max_boxes + idx) * 4 + 2];
-  b.h = gt[(batch * max_boxes + idx) * 4 + 3];
-  return b;
-}
-
-template <typename T>
-static inline T BoxOverlap(T c1, T w1, T c2, T w2) {
-  T l1 = c1 - w1 / 2.0;
-  T l2 = c2 - w2 / 2.0;
-  T left = l1 > l2 ? l1 : l2;
-  T r1 = c1 + w1 / 2.0;
-  T r2 = c2 + w2 / 2.0;
-  T right = r1 < r2 ? r1 : r2;
-  return right - left;
-}
-
-template <typename T>
-static inline T CalcBoxIoU(Box<T> b1, Box<T> b2) {
-  T w = BoxOverlap(b1.x, b1.w, b2.x, b2.w);
-  T h = BoxOverlap(b1.y, b1.h, b2.y, b2.h);
-  T inter_area = (w < 0 || h < 0) ? 0.0 : w * h;
-  T union_area = b1.w * b1.h + b2.w * b2.h - inter_area;
-  return inter_area / union_area;
-}
-
-static inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num,
-                                int an_stride, int stride, int entry) {
-  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
-}
-
-template <typename T>
-static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt,
-                                std::vector<int> anchors, int an_idx,
-                                int box_idx, int gi, int gj, int grid_size,
-                                int input_size, int stride, T score) {
-  T tx = gt.x * grid_size - gi;
-  T ty = gt.y * grid_size - gj;
-  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
-  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
-
-  T scale = (2.0 - gt.w * gt.h) * score;
-  loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale;
-  loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale;
-  loss[0] += L1Loss<T>(input[box_idx + 2 * stride], tw) * scale;
-  loss[0] += L1Loss<T>(input[box_idx + 3 * stride], th) * scale;
-}
-
-template <typename T>
-static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input,
-                                    Box<T> gt, std::vector<int> anchors,
-                                    int an_idx, int box_idx, int gi, int gj,
-                                    int grid_size, int input_size, int stride,
-                                    T score) {
-  T tx = gt.x * grid_size - gi;
-  T ty = gt.y * grid_size - gj;
-  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
-  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
-
-  T scale = (2.0 - gt.w * gt.h) * score;
-  input_grad[box_idx] =
-      SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss;
-  input_grad[box_idx + stride] =
-      SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss;
-  input_grad[box_idx + 2 * stride] =
-      L1LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
-  input_grad[box_idx + 3 * stride] =
-      L1LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
-}
-
-template <typename T>
-static inline void CalcLabelLoss(T* loss, const T* input, const int index,
-                                 const int label, const int class_num,
-                                 const int stride, const T pos, const T neg,
-                                 T score) {
-  for (int i = 0; i < class_num; i++) {
-    T pred = input[index + i * stride];
-    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? pos : neg) * score;
-  }
-}
-
-template <typename T>
-static inline void CalcLabelLossGrad(T* input_grad, const T loss,
-                                     const T* input, const int index,
-                                     const int label, const int class_num,
-                                     const int stride, const T pos, const T neg,
-                                     T score) {
-  for (int i = 0; i < class_num; i++) {
-    T pred = input[index + i * stride];
-    input_grad[index + i * stride] =
-        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? pos : neg) * score *
-        loss;
-  }
-}
-
-template <typename T>
-static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness,
-                                   const int n, const int an_num, const int h,
-                                   const int w, const int stride,
-                                   const int an_stride) {
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < an_num; j++) {
-      for (int k = 0; k < h; k++) {
-        for (int l = 0; l < w; l++) {
-          T obj = objness[k * w + l];
-          if (obj > 1e-5) {
-            // positive sample: obj = mixup score
-            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0) * obj;
-          } else if (obj > -0.5) {
-            // negetive sample: obj = 0
-            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0);
-          }
-        }
-      }
-      objness += stride;
-      input += an_stride;
-    }
-  }
-}
-
-template <typename T>
-static inline void CalcObjnessLossGrad(T* input_grad, const T* loss,
-                                       const T* input, const T* objness,
-                                       const int n, const int an_num,
-                                       const int h, const int w,
-                                       const int stride, const int an_stride) {
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < an_num; j++) {
-      for (int k = 0; k < h; k++) {
-        for (int l = 0; l < w; l++) {
-          T obj = objness[k * w + l];
-          if (obj > 1e-5) {
-            input_grad[k * w + l] =
-                SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * obj *
-                loss[i];
-          } else if (obj > -0.5) {
-            input_grad[k * w + l] =
-                SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i];
-          }
-        }
-      }
-      objness += stride;
-      input += an_stride;
-      input_grad += an_stride;
-    }
-  }
-}
-
-template <typename T>
-static void inline GtValid(bool* valid, const T* gtbox, const int n,
-                           const int b) {
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < b; j++) {
-      if (LessEqualZero(gtbox[j * 4 + 2]) || LessEqualZero(gtbox[j * 4 + 3])) {
-        valid[j] = false;
-      } else {
-        valid[j] = true;
-      }
-    }
-    valid += b;
-    gtbox += b * 4;
-  }
-}
-
-template <typename T>
-class Yolov3LossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* gt_box = ctx.Input<Tensor>("GTBox");
-    auto* gt_label = ctx.Input<Tensor>("GTLabel");
-    auto* gt_score = ctx.Input<Tensor>("GTScore");
-    auto* loss = ctx.Output<Tensor>("Loss");
-    auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask");
-    auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask");
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
-    int class_num = ctx.Attr<int>("class_num");
-    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
-    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
-    bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
-
-    const int n = input->dims()[0];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-    const int an_num = anchors.size() / 2;
-    const int mask_num = anchor_mask.size();
-    const int b = gt_box->dims()[1];
-    int input_size = downsample_ratio * h;
-
-    const int stride = h * w;
-    const int an_stride = (class_num + 5) * stride;
-
-    T label_pos = 1.0;
-    T label_neg = 0.0;
-    if (use_label_smooth) {
-      T smooth_weight = std::min(1.0 / static_cast<T>(class_num), 1.0 / 40);
-      label_pos = 1.0 - smooth_weight;
-      label_neg = smooth_weight;
-    }
-
-    const T* input_data = input->data<T>();
-    const T* gt_box_data = gt_box->data<T>();
-    const int* gt_label_data = gt_label->data<int>();
-    T* loss_data = loss->mutable_data<T>({n}, ctx.GetPlace());
-    memset(loss_data, 0, loss->numel() * sizeof(T));
-    T* obj_mask_data =
-        objness_mask->mutable_data<T>({n, mask_num, h, w}, ctx.GetPlace());
-    memset(obj_mask_data, 0, objness_mask->numel() * sizeof(T));
-    int* gt_match_mask_data =
-        gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace());
-
-    const T* gt_score_data;
-    if (!gt_score) {
-      Tensor gtscore;
-      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
-      math::SetConstant<platform::CPUDeviceContext, T>()(
-          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
-          static_cast<T>(1.0));
-      gt_score = &gtscore;
-      gt_score_data = gtscore.data<T>();
-    } else {
-      gt_score_data = gt_score->data<T>();
-    }
-
-    // calc valid gt box mask, avoid calc duplicately in following code
-    Tensor gt_valid_mask;
-    bool* gt_valid_mask_data =
-        gt_valid_mask.mutable_data<bool>({n, b}, ctx.GetPlace());
-    GtValid<T>(gt_valid_mask_data, gt_box_data, n, b);
-
-    for (int i = 0; i < n; i++) {
-      for (int j = 0; j < mask_num; j++) {
-        for (int k = 0; k < h; k++) {
-          for (int l = 0; l < w; l++) {
-            // each predict box find a best match gt box, if overlap is bigger
-            // then ignore_thresh, ignore the objectness loss.
-            int box_idx =
-                GetEntryIndex(i, j, k * w + l, mask_num, an_stride, stride, 0);
-            Box<T> pred = GetYoloBox(input_data, anchors, l, k, anchor_mask[j],
-                                     h, input_size, box_idx, stride);
-            T best_iou = 0;
-            for (int t = 0; t < b; t++) {
-              if (!gt_valid_mask_data[i * b + t]) {
-                continue;
-              }
-              Box<T> gt = GetGtBox(gt_box_data, i, b, t);
-              T iou = CalcBoxIoU(pred, gt);
-              if (iou > best_iou) {
-                best_iou = iou;
-              }
-            }
-
-            // If best IoU is bigger then ignore_thresh,
-            // ignore the objectness loss.
-            if (best_iou > ignore_thresh) {
-              int obj_idx = (i * mask_num + j) * stride + k * w + l;
-              obj_mask_data[obj_idx] = static_cast<T>(-1);
-            }
-            // all losses should be calculated if best IoU
-            // is bigger then truth thresh, but currently,
-            // truth thresh is an unreachable value as 1.0.
-          }
-        }
-      }
-      for (int t = 0; t < b; t++) {
-        if (!gt_valid_mask_data[i * b + t]) {
-          gt_match_mask_data[i * b + t] = -1;
-          continue;
-        }
-        Box<T> gt = GetGtBox(gt_box_data, i, b, t);
-        int gi = static_cast<int>(gt.x * w);
-        int gj = static_cast<int>(gt.y * h);
-        Box<T> gt_shift = gt;
-        gt_shift.x = 0.0;
-        gt_shift.y = 0.0;
-        T best_iou = 0.0;
-        int best_n = 0;
-        // each gt box find a best match anchor box as positive sample,
-        // for positive sample, all losses should be calculated, and for
-        // other samples, only objectness loss is required.
-        for (int an_idx = 0; an_idx < an_num; an_idx++) {
-          Box<T> an_box;
-          an_box.x = 0.0;
-          an_box.y = 0.0;
-          an_box.w = anchors[2 * an_idx] / static_cast<T>(input_size);
-          an_box.h = anchors[2 * an_idx + 1] / static_cast<T>(input_size);
-          float iou = CalcBoxIoU<T>(an_box, gt_shift);
-          if (iou > best_iou) {
-            best_iou = iou;
-            best_n = an_idx;
-          }
-        }
-
-        int mask_idx = GetMaskIndex(anchor_mask, best_n);
-        gt_match_mask_data[i * b + t] = mask_idx;
-        if (mask_idx >= 0) {
-          T score = gt_score_data[i * b + t];
-          int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
-                                      an_stride, stride, 0);
-          CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n,
-                                 box_idx, gi, gj, h, input_size, stride, score);
-
-          int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
-          obj_mask_data[obj_idx] = score;
-
-          int label = gt_label_data[i * b + t];
-          int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
-                                        an_stride, stride, 5);
-          CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label,
-                           class_num, stride, label_pos, label_neg, score);
-        }
-      }
-    }
-
-    CalcObjnessLoss<T>(loss_data, input_data + 4 * stride, obj_mask_data, n,
-                       mask_num, h, w, stride, an_stride);
-  }
-};
-
-template <typename T>
-class Yolov3LossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* gt_box = ctx.Input<Tensor>("GTBox");
-    auto* gt_label = ctx.Input<Tensor>("GTLabel");
-    auto* gt_score = ctx.Input<Tensor>("GTScore");
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-    auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask");
-    auto* gt_match_mask = ctx.Input<Tensor>("GTMatchMask");
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
-    int class_num = ctx.Attr<int>("class_num");
-    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
-    bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
-
-    const int n = input_grad->dims()[0];
-    const int c = input_grad->dims()[1];
-    const int h = input_grad->dims()[2];
-    const int w = input_grad->dims()[3];
-    const int mask_num = anchor_mask.size();
-    const int b = gt_match_mask->dims()[1];
-    int input_size = downsample_ratio * h;
-
-    const int stride = h * w;
-    const int an_stride = (class_num + 5) * stride;
-
-    T label_pos = 1.0;
-    T label_neg = 0.0;
-    if (use_label_smooth) {
-      T smooth_weight = std::min(1.0 / static_cast<T>(class_num), 1.0 / 40);
-      label_pos = 1.0 - smooth_weight;
-      label_neg = smooth_weight;
-    }
-
-    const T* input_data = input->data<T>();
-    const T* gt_box_data = gt_box->data<T>();
-    const int* gt_label_data = gt_label->data<int>();
-    const T* loss_grad_data = loss_grad->data<T>();
-    const T* obj_mask_data = objness_mask->data<T>();
-    const int* gt_match_mask_data = gt_match_mask->data<int>();
-    T* input_grad_data =
-        input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
-
-    const T* gt_score_data;
-    if (!gt_score) {
-      Tensor gtscore;
-      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
-      math::SetConstant<platform::CPUDeviceContext, T>()(
-          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
-          static_cast<T>(1.0));
-      gt_score = &gtscore;
-      gt_score_data = gtscore.data<T>();
-    } else {
-      gt_score_data = gt_score->data<T>();
-    }
-
-    for (int i = 0; i < n; i++) {
-      for (int t = 0; t < b; t++) {
-        int mask_idx = gt_match_mask_data[i * b + t];
-        if (mask_idx >= 0) {
-          T score = gt_score_data[i * b + t];
-          Box<T> gt = GetGtBox(gt_box_data, i, b, t);
-          int gi = static_cast<int>(gt.x * w);
-          int gj = static_cast<int>(gt.y * h);
-
-          int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
-                                      an_stride, stride, 0);
-          CalcBoxLocationLossGrad<T>(input_grad_data, loss_grad_data[i],
-                                     input_data, gt, anchors,
-                                     anchor_mask[mask_idx], box_idx, gi, gj, h,
-                                     input_size, stride, score);
-
-          int label = gt_label_data[i * b + t];
-          int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
-                                        an_stride, stride, 5);
-          CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data,
-                               label_idx, label, class_num, stride, label_pos,
-                               label_neg, score);
-        }
-      }
-    }
-
-    CalcObjnessLossGrad<T>(input_grad_data + 4 * stride, loss_grad_data,
-                           input_data + 4 * stride, obj_mask_data, n, mask_num,
-                           h, w, stride, an_stride);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
deleted file mode 100644
index dff97f7c77fc26af4cd4e7794d9092aec14cfa6e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection_map_op.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection_map_op.h"
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class DetectionMAPOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("DetectRes"),
-                   "Input(DetectRes) of DetectionMAPOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input(Label) of DetectionMAPOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AccumPosCount"),
-        "Output(AccumPosCount) of DetectionMAPOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AccumTruePos"),
-        "Output(AccumTruePos) of DetectionMAPOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AccumFalsePos"),
-        "Output(AccumFalsePos) of DetectionMAPOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MAP"),
-                   "Output(MAP) of DetectionMAPOp should not be null.");
-
-    auto det_dims = ctx->GetInputDim("DetectRes");
-    PADDLE_ENFORCE_EQ(det_dims.size(), 2UL,
-                      "The rank of Input(DetectRes) must be 2, "
-                      "the shape is [N, 6].");
-    PADDLE_ENFORCE_EQ(det_dims[1], 6UL,
-                      "The shape is of Input(DetectRes) [N, 6].");
-    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2,
-                      "The rank of Input(Label) must be 2, "
-                      "the shape is [N, 6].");
-    if (ctx->IsRuntime() || label_dims[1] > 0) {
-      PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5,
-                     "The shape of Input(Label) is [N, 6] or [N, 5].");
-    }
-
-    if (ctx->HasInput("PosCount")) {
-      PADDLE_ENFORCE(ctx->HasInput("TruePos"),
-                     "Input(TruePos) of DetectionMAPOp should not be null when "
-                     "Input(TruePos) is not null.");
-      PADDLE_ENFORCE(
-          ctx->HasInput("FalsePos"),
-          "Input(FalsePos) of DetectionMAPOp should not be null when "
-          "Input(FalsePos) is not null.");
-    }
-
-    ctx->SetOutputDim("MAP", framework::make_ddim({1}));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>("DetectRes")->type(),
-        platform::CPUPlace());
-  }
-};
-
-class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("DetectRes",
-             "(LoDTensor) A 2-D LoDTensor with shape [M, 6] represents the "
-             "detections. Each row has 6 values: "
-             "[label, confidence, xmin, ymin, xmax, ymax], M is the total "
-             "number of detect results in this mini-batch. For each instance, "
-             "the offsets in first dimension are called LoD, the number of "
-             "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
-             "no detected data.");
-    AddInput("Label",
-             "(LoDTensor) A 2-D LoDTensor represents the"
-             "Labeled ground-truth data. Each row has 6 values: "
-             "[label, xmin, ymin, xmax, ymax, is_difficult] or 5 values: "
-             "[label, xmin, ymin, xmax, ymax], where N is the total "
-             "number of ground-truth data in this mini-batch. For each "
-             "instance, the offsets in first dimension are called LoD, "
-             "the number of offset is N + 1, if LoD[i + 1] - LoD[i] == 0, "
-             "means there is no ground-truth data.");
-    AddInput("HasState",
-             "(Tensor<int>) A tensor with shape [1], 0 means ignoring input "
-             "states, which including PosCount, TruePos, FalsePos.")
-        .AsDispensable();
-    AddInput("PosCount",
-             "(Tensor) A tensor with shape [Ncls, 1], store the "
-             "input positive example count of each class, Ncls is the count of "
-             "input classification. "
-             "This input is used to pass the AccumPosCount generated by the "
-             "previous mini-batch when the multi mini-batches cumulative "
-             "calculation carried out. "
-             "When the input(PosCount) is empty, the cumulative "
-             "calculation is not carried out, and only the results of the "
-             "current mini-batch are calculated.")
-        .AsDispensable();
-    AddInput("TruePos",
-             "(LoDTensor) A 2-D LoDTensor with shape [Ntp, 2], store the "
-             "input true positive example of each class."
-             "This input is used to pass the AccumTruePos generated by the "
-             "previous mini-batch when the multi mini-batches cumulative "
-             "calculation carried out. ")
-        .AsDispensable();
-    AddInput("FalsePos",
-             "(LoDTensor) A 2-D LoDTensor with shape [Nfp, 2], store the "
-             "input false positive example of each class."
-             "This input is used to pass the AccumFalsePos generated by the "
-             "previous mini-batch when the multi mini-batches cumulative "
-             "calculation carried out. ")
-        .AsDispensable();
-    AddOutput("AccumPosCount",
-              "(Tensor) A tensor with shape [Ncls, 1], store the "
-              "positive example count of each class. It combines the input "
-              "input(PosCount) and the positive example count computed from "
-              "input(Detection) and input(Label).");
-    AddOutput("AccumTruePos",
-              "(LoDTensor) A LoDTensor with shape [Ntp', 2], store the "
-              "true positive example of each class. It combines the "
-              "input(TruePos) and the true positive examples computed from "
-              "input(Detection) and input(Label).");
-    AddOutput("AccumFalsePos",
-              "(LoDTensor) A LoDTensor with shape [Nfp', 2], store the "
-              "false positive example of each class. It combines the "
-              "input(FalsePos) and the false positive examples computed from "
-              "input(Detection) and input(Label).");
-    AddOutput("MAP",
-              "(Tensor) A tensor with shape [1], store the mAP evaluate "
-              "result of the detection.");
-    AddAttr<int>("class_num",
-                 "(int) "
-                 "The class number.");
-    AddAttr<int>(
-        "background_label",
-        "(int, default: 0) "
-        "The index of background label, the background label will be ignored. "
-        "If set to -1, then all categories will be considered.")
-        .SetDefault(0);
-    AddAttr<float>(
-        "overlap_threshold",
-        "(float) "
-        "The lower bound jaccard overlap threshold of detection output and "
-        "ground-truth data.")
-        .SetDefault(.5f);
-    AddAttr<bool>("evaluate_difficult",
-                  "(bool, default true) "
-                  "Switch to control whether the difficult data is evaluated.")
-        .SetDefault(true);
-    AddAttr<std::string>("ap_type",
-                         "(string, default 'integral') "
-                         "The AP algorithm type, 'integral' or '11point'.")
-        .SetDefault("integral")
-        .InEnum({"integral", "11point"})
-        .AddCustomChecker([](const std::string& ap_type) {
-          PADDLE_ENFORCE_NE(GetAPType(ap_type), APType::kNone,
-                            "The ap_type should be 'integral' or '11point.");
-        });
-    AddComment(R"DOC(
-Detection mAP evaluate operator.
-The general steps are as follows. First, calculate the true positive and
-false positive according to the input of detection and labels, then
-calculate the mAP evaluate value.
-Supporting '11 point' and 'integral' mAP algorithm. Please get more information
-from the following articles:
-https://sanchom.wordpress.com/tag/average-precision/
-https://arxiv.org/abs/1512.02325
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(detection_map, ops::DetectionMAPOp, ops::DetectionMAPOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    detection_map, ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, float>,
-    ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
deleted file mode 100644
index dd5d138a1e979826d59c4731920379b030e3b492..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection_map_op.h
+++ /dev/null
@@ -1,480 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-enum APType { kNone = 0, kIntegral, k11point };
-
-APType GetAPType(std::string str) {
-  if (str == "integral") {
-    return APType::kIntegral;
-  } else if (str == "11point") {
-    return APType::k11point;
-  } else {
-    return APType::kNone;
-  }
-}
-
-template <typename T>
-inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
-                                 const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-template <typename T>
-inline void GetAccumulation(std::vector<std::pair<T, int>> in_pairs,
-                            std::vector<int>* accu_vec) {
-  std::stable_sort(in_pairs.begin(), in_pairs.end(), SortScorePairDescend<int>);
-  accu_vec->clear();
-  size_t sum = 0;
-  for (size_t i = 0; i < in_pairs.size(); ++i) {
-    auto count = in_pairs[i].second;
-    sum += count;
-    accu_vec->push_back(sum);
-  }
-}
-
-template <typename Place, typename T>
-class DetectionMAPOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_detect = ctx.Input<framework::LoDTensor>("DetectRes");
-    auto* in_label = ctx.Input<framework::LoDTensor>("Label");
-    auto* out_map = ctx.Output<framework::Tensor>("MAP");
-
-    auto* in_pos_count = ctx.Input<framework::Tensor>("PosCount");
-    auto* in_true_pos = ctx.Input<framework::LoDTensor>("TruePos");
-    auto* in_false_pos = ctx.Input<framework::LoDTensor>("FalsePos");
-
-    auto* out_pos_count = ctx.Output<framework::Tensor>("AccumPosCount");
-    auto* out_true_pos = ctx.Output<framework::LoDTensor>("AccumTruePos");
-    auto* out_false_pos = ctx.Output<framework::LoDTensor>("AccumFalsePos");
-
-    float overlap_threshold = ctx.Attr<float>("overlap_threshold");
-    bool evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
-    auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
-    int class_num = ctx.Attr<int>("class_num");
-
-    auto& label_lod = in_label->lod();
-    auto& detect_lod = in_detect->lod();
-    PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
-                      "Only support one level sequence now.");
-    PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
-                      "The batch_size of input(Label) and input(Detection) "
-                      "must be the same.");
-
-    std::vector<std::map<int, std::vector<Box>>> gt_boxes;
-    std::vector<std::map<int, std::vector<std::pair<T, Box>>>> detect_boxes;
-
-    GetBoxes(*in_label, *in_detect, &gt_boxes, detect_boxes);
-
-    std::map<int, int> label_pos_count;
-    std::map<int, std::vector<std::pair<T, int>>> true_pos;
-    std::map<int, std::vector<std::pair<T, int>>> false_pos;
-
-    auto* has_state = ctx.Input<framework::LoDTensor>("HasState");
-    int state = 0;
-    if (has_state) {
-      state = has_state->data<int>()[0];
-    }
-
-    if (in_pos_count != nullptr && state) {
-      GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, &label_pos_count,
-                  &true_pos, &false_pos, class_num);
-    }
-
-    CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult,
-                             overlap_threshold, &label_pos_count, &true_pos,
-                             &false_pos);
-
-    int background_label = ctx.Attr<int>("background_label");
-    T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos,
-                    background_label);
-
-    GetOutputPos(ctx, label_pos_count, true_pos, false_pos, out_pos_count,
-                 out_true_pos, out_false_pos, class_num);
-
-    T* map_data = out_map->mutable_data<T>(ctx.GetPlace());
-    map_data[0] = map;
-  }
-
- protected:
-  struct Box {
-    Box(T xmin, T ymin, T xmax, T ymax)
-        : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax), is_difficult(false) {}
-
-    T xmin, ymin, xmax, ymax;
-    bool is_difficult;
-  };
-
-  inline T JaccardOverlap(const Box& box1, const Box& box2) const {
-    if (box2.xmin > box1.xmax || box2.xmax < box1.xmin ||
-        box2.ymin > box1.ymax || box2.ymax < box1.ymin) {
-      return 0.0;
-    } else {
-      T inter_xmin = std::max(box1.xmin, box2.xmin);
-      T inter_ymin = std::max(box1.ymin, box2.ymin);
-      T inter_xmax = std::min(box1.xmax, box2.xmax);
-      T inter_ymax = std::min(box1.ymax, box2.ymax);
-
-      T inter_width = inter_xmax - inter_xmin;
-      T inter_height = inter_ymax - inter_ymin;
-      T inter_area = inter_width * inter_height;
-
-      T bbox_area1 = (box1.xmax - box1.xmin) * (box1.ymax - box1.ymin);
-      T bbox_area2 = (box2.xmax - box2.xmin) * (box2.ymax - box2.ymin);
-
-      return inter_area / (bbox_area1 + bbox_area2 - inter_area);
-    }
-  }
-
-  inline void ClipBBox(const Box& bbox, Box* clipped_bbox) const {
-    T one = static_cast<T>(1.0);
-    T zero = static_cast<T>(0.0);
-    clipped_bbox->xmin = std::max(std::min(bbox.xmin, one), zero);
-    clipped_bbox->ymin = std::max(std::min(bbox.ymin, one), zero);
-    clipped_bbox->xmax = std::max(std::min(bbox.xmax, one), zero);
-    clipped_bbox->ymax = std::max(std::min(bbox.ymax, one), zero);
-  }
-
-  void GetBoxes(const framework::LoDTensor& input_label,
-                const framework::LoDTensor& input_detect,
-                std::vector<std::map<int, std::vector<Box>>>* gt_boxes,
-                std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
-                    detect_boxes) const {
-    auto labels = framework::EigenTensor<T, 2>::From(input_label);
-    auto detect = framework::EigenTensor<T, 2>::From(input_detect);
-
-    auto& label_lod = input_label.lod();
-    auto& detect_lod = input_detect.lod();
-
-    int batch_size = label_lod[0].size() - 1;
-    auto& label_index = label_lod[0];
-
-    for (int n = 0; n < batch_size; ++n) {
-      std::map<int, std::vector<Box>> boxes;
-      for (size_t i = label_index[n]; i < label_index[n + 1]; ++i) {
-        int label = labels(i, 0);
-        if (input_label.dims()[1] == 6) {
-          Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
-          auto is_difficult = labels(i, 1);
-          if (std::abs(is_difficult - 0.0) < 1e-6)
-            box.is_difficult = false;
-          else
-            box.is_difficult = true;
-          boxes[label].push_back(box);
-        } else {
-          PADDLE_ENFORCE_EQ(input_label.dims()[1], 5);
-          Box box(labels(i, 1), labels(i, 2), labels(i, 3), labels(i, 4));
-          boxes[label].push_back(box);
-        }
-      }
-      gt_boxes->push_back(boxes);
-    }
-
-    auto detect_index = detect_lod[0];
-    for (int n = 0; n < batch_size; ++n) {
-      std::map<int, std::vector<std::pair<T, Box>>> boxes;
-      for (size_t i = detect_index[n]; i < detect_index[n + 1]; ++i) {
-        Box box(detect(i, 2), detect(i, 3), detect(i, 4), detect(i, 5));
-        int label = detect(i, 0);
-        auto score = detect(i, 1);
-        boxes[label].push_back(std::make_pair(score, box));
-      }
-      detect_boxes.push_back(boxes);
-    }
-  }
-
-  void GetOutputPos(
-      const framework::ExecutionContext& ctx,
-      const std::map<int, int>& label_pos_count,
-      const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
-      const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
-      framework::Tensor* output_pos_count,
-      framework::LoDTensor* output_true_pos,
-      framework::LoDTensor* output_false_pos, const int class_num) const {
-    int true_pos_count = 0;
-    int false_pos_count = 0;
-    for (auto it = true_pos.begin(); it != true_pos.end(); ++it) {
-      auto tp = it->second;
-      true_pos_count += tp.size();
-    }
-    for (auto it = false_pos.begin(); it != false_pos.end(); ++it) {
-      auto fp = it->second;
-      false_pos_count += fp.size();
-    }
-
-    int* pos_count_data = output_pos_count->mutable_data<int>(
-        framework::make_ddim({class_num, 1}), ctx.GetPlace());
-
-    T* true_pos_data = output_true_pos->mutable_data<T>(
-        framework::make_ddim({true_pos_count, 2}), ctx.GetPlace());
-    T* false_pos_data = output_false_pos->mutable_data<T>(
-        framework::make_ddim({false_pos_count, 2}), ctx.GetPlace());
-    true_pos_count = 0;
-    false_pos_count = 0;
-    std::vector<size_t> true_pos_starts = {0};
-    std::vector<size_t> false_pos_starts = {0};
-    for (int i = 0; i < class_num; ++i) {
-      auto it_count = label_pos_count.find(i);
-      pos_count_data[i] = 0;
-      if (it_count != label_pos_count.end()) {
-        pos_count_data[i] = it_count->second;
-      }
-      auto it_true_pos = true_pos.find(i);
-      if (it_true_pos != true_pos.end()) {
-        const std::vector<std::pair<T, int>>& true_pos_vec =
-            it_true_pos->second;
-        for (const std::pair<T, int>& tp : true_pos_vec) {
-          true_pos_data[true_pos_count * 2] = tp.first;
-          true_pos_data[true_pos_count * 2 + 1] = static_cast<T>(tp.second);
-          true_pos_count++;
-        }
-      }
-      true_pos_starts.push_back(true_pos_count);
-
-      auto it_false_pos = false_pos.find(i);
-      if (it_false_pos != false_pos.end()) {
-        const std::vector<std::pair<T, int>>& false_pos_vec =
-            it_false_pos->second;
-        for (const std::pair<T, int>& fp : false_pos_vec) {
-          false_pos_data[false_pos_count * 2] = fp.first;
-          false_pos_data[false_pos_count * 2 + 1] = static_cast<T>(fp.second);
-          false_pos_count++;
-        }
-      }
-      false_pos_starts.push_back(false_pos_count);
-    }
-
-    framework::LoD true_pos_lod;
-    true_pos_lod.emplace_back(true_pos_starts);
-    framework::LoD false_pos_lod;
-    false_pos_lod.emplace_back(false_pos_starts);
-
-    output_true_pos->set_lod(true_pos_lod);
-    output_false_pos->set_lod(false_pos_lod);
-  }
-
-  void GetInputPos(const framework::Tensor& input_pos_count,
-                   const framework::LoDTensor& input_true_pos,
-                   const framework::LoDTensor& input_false_pos,
-                   std::map<int, int>* label_pos_count,
-                   std::map<int, std::vector<std::pair<T, int>>>* true_pos,
-                   std::map<int, std::vector<std::pair<T, int>>>* false_pos,
-                   const int class_num) const {
-    const int* pos_count_data = input_pos_count.data<int>();
-    for (int i = 0; i < class_num; ++i) {
-      (*label_pos_count)[i] = pos_count_data[i];
-    }
-
-    auto SetData = [](const framework::LoDTensor& pos_tensor,
-                      std::map<int, std::vector<std::pair<T, int>>>& pos) {
-      const T* pos_data = pos_tensor.data<T>();
-      auto& pos_data_lod = pos_tensor.lod()[0];
-      for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
-        for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
-          T score = pos_data[j * 2];
-          int flag = pos_data[j * 2 + 1];
-          pos[i].push_back(std::make_pair(score, flag));
-        }
-      }
-    };
-
-    SetData(input_true_pos, *true_pos);
-    SetData(input_false_pos, *false_pos);
-    return;
-  }
-
-  void CalcTrueAndFalsePositive(
-      const std::vector<std::map<int, std::vector<Box>>>& gt_boxes,
-      const std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
-          detect_boxes,
-      bool evaluate_difficult, float overlap_threshold,
-      std::map<int, int>* label_pos_count,
-      std::map<int, std::vector<std::pair<T, int>>>* true_pos,
-      std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
-    int batch_size = gt_boxes.size();
-    for (int n = 0; n < batch_size; ++n) {
-      auto& image_gt_boxes = gt_boxes[n];
-      for (auto& image_gt_box : image_gt_boxes) {
-        size_t count = 0;
-        auto& labeled_bboxes = image_gt_box.second;
-        if (evaluate_difficult) {
-          count = labeled_bboxes.size();
-        } else {
-          for (auto& box : labeled_bboxes) {
-            if (!box.is_difficult) {
-              ++count;
-            }
-          }
-        }
-        if (count == 0) {
-          continue;
-        }
-        int label = image_gt_box.first;
-        if (label_pos_count->find(label) == label_pos_count->end()) {
-          (*label_pos_count)[label] = count;
-        } else {
-          (*label_pos_count)[label] += count;
-        }
-      }
-    }
-
-    for (size_t n = 0; n < detect_boxes.size(); ++n) {
-      auto image_gt_boxes = gt_boxes[n];
-      auto detections = detect_boxes[n];
-
-      if (image_gt_boxes.size() == 0) {
-        for (auto it = detections.begin(); it != detections.end(); ++it) {
-          auto pred_boxes = it->second;
-          int label = it->first;
-          for (size_t i = 0; i < pred_boxes.size(); ++i) {
-            auto score = pred_boxes[i].first;
-            (*true_pos)[label].push_back(std::make_pair(score, 0));
-            (*false_pos)[label].push_back(std::make_pair(score, 1));
-          }
-        }
-        continue;
-      }
-
-      for (auto it = detections.begin(); it != detections.end(); ++it) {
-        int label = it->first;
-        auto pred_boxes = it->second;
-        if (image_gt_boxes.find(label) == image_gt_boxes.end()) {
-          for (size_t i = 0; i < pred_boxes.size(); ++i) {
-            auto score = pred_boxes[i].first;
-            (*true_pos)[label].push_back(std::make_pair(score, 0));
-            (*false_pos)[label].push_back(std::make_pair(score, 1));
-          }
-          continue;
-        }
-
-        auto matched_bboxes = image_gt_boxes.find(label)->second;
-        std::vector<bool> visited(matched_bboxes.size(), false);
-        // Sort detections in descend order based on scores
-        std::sort(pred_boxes.begin(), pred_boxes.end(),
-                  SortScorePairDescend<Box>);
-        for (size_t i = 0; i < pred_boxes.size(); ++i) {
-          T max_overlap = -1.0;
-          size_t max_idx = 0;
-          auto score = pred_boxes[i].first;
-          for (size_t j = 0; j < matched_bboxes.size(); ++j) {
-            Box& pred_box = pred_boxes[i].second;
-            ClipBBox(pred_box, &pred_box);
-            T overlap = JaccardOverlap(pred_box, matched_bboxes[j]);
-            if (overlap > max_overlap) {
-              max_overlap = overlap;
-              max_idx = j;
-            }
-          }
-          if (max_overlap > overlap_threshold) {
-            bool match_evaluate_difficult =
-                evaluate_difficult ||
-                (!evaluate_difficult && !matched_bboxes[max_idx].is_difficult);
-            if (match_evaluate_difficult) {
-              if (!visited[max_idx]) {
-                (*true_pos)[label].push_back(std::make_pair(score, 1));
-                (*false_pos)[label].push_back(std::make_pair(score, 0));
-                visited[max_idx] = true;
-              } else {
-                (*true_pos)[label].push_back(std::make_pair(score, 0));
-                (*false_pos)[label].push_back(std::make_pair(score, 1));
-              }
-            }
-          } else {
-            (*true_pos)[label].push_back(std::make_pair(score, 0));
-            (*false_pos)[label].push_back(std::make_pair(score, 1));
-          }
-        }
-      }
-    }
-  }
-
-  T CalcMAP(APType ap_type, const std::map<int, int>& label_pos_count,
-            const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
-            const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
-            const int background_label) const {
-    T mAP = 0.0;
-    int count = 0;
-    for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) {
-      int label = it->first;
-      int label_num_pos = it->second;
-      if (label_num_pos == background_label ||
-          true_pos.find(label) == true_pos.end()) {
-        continue;
-      }
-      auto label_true_pos = true_pos.find(label)->second;
-      auto label_false_pos = false_pos.find(label)->second;
-      // Compute average precision.
-      std::vector<int> tp_sum;
-      GetAccumulation<T>(label_true_pos, &tp_sum);
-      std::vector<int> fp_sum;
-      GetAccumulation<T>(label_false_pos, &fp_sum);
-      std::vector<T> precision, recall;
-      size_t num = tp_sum.size();
-      // Compute Precision.
-      for (size_t i = 0; i < num; ++i) {
-        precision.push_back(static_cast<T>(tp_sum[i]) /
-                            static_cast<T>(tp_sum[i] + fp_sum[i]));
-        recall.push_back(static_cast<T>(tp_sum[i]) / label_num_pos);
-      }
-      // VOC2007 style
-      if (ap_type == APType::k11point) {
-        std::vector<T> max_precisions(11, 0.0);
-        int start_idx = num - 1;
-        for (int j = 10; j >= 0; --j)
-          for (int i = start_idx; i >= 0; --i) {
-            if (recall[i] < j / 10.) {
-              start_idx = i;
-              if (j > 0) max_precisions[j - 1] = max_precisions[j];
-              break;
-            } else {
-              if (max_precisions[j] < precision[i])
-                max_precisions[j] = precision[i];
-            }
-          }
-        for (int j = 10; j >= 0; --j) mAP += max_precisions[j] / 11;
-        ++count;
-      } else if (ap_type == APType::kIntegral) {
-        // Nature integral
-        float average_precisions = 0.;
-        float prev_recall = 0.;
-        for (size_t i = 0; i < num; ++i) {
-          if (fabs(recall[i] - prev_recall) > 1e-6)
-            average_precisions += precision[i] * fabs(recall[i] - prev_recall);
-          prev_recall = recall[i];
-        }
-        mAP += average_precisions;
-        ++count;
-      } else {
-        LOG(FATAL) << "Unkown ap version: " << ap_type;
-      }
-    }
-    if (count != 0) mAP /= count;
-    return mAP;
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc
deleted file mode 100644
index 6ebad4de3c8ebc57823709c04498a1f4311942a5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dgc_clip_by_norm_op.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-
-#include "paddle/fluid/operators/dgc_clip_by_norm_op.h"
-
-namespace paddle {
-namespace operators {
-
-class DGCClipByNormOp : public ClipByNormOp {
- public:
-  using ClipByNormOp::ClipByNormOp;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "current_step should be set.");
-
-    return ClipByNormOp::InferShape(ctx);
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const framework::Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "current_step") {
-      VLOG(10) << "var_name:" << var_name << " need not to transform";
-      return expected_kernel_type;
-    }
-
-    return framework::OperatorWithKernel::GetKernelTypeForVar(
-        var_name, tensor, expected_kernel_type);
-  }
-};
-
-class DGCClipByNormOpMaker : public ClipByNormOpMaker {
- public:
-  void Make() override {
-    AddInput("current_step", "(Tensor) Current step.");
-    AddAttr<float>("rampup_begin_step",
-                   "(float, -1.0)"
-                   "The period when begin k_select.")
-        .SetDefault(-1.0);
-
-    return ClipByNormOpMaker::Make();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(dgc_clip_by_norm, ops::DGCClipByNormOp,
-                             ops::DGCClipByNormOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    dgc_clip_by_norm,
-    ops::DGCClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cu b/paddle/fluid/operators/dgc_clip_by_norm_op.cu
deleted file mode 100644
index e7f564b7ab4d1c11810dc096faec7f5a375b8563..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dgc_clip_by_norm_op.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/dgc_clip_by_norm_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    dgc_clip_by_norm,
-    ops::DGCClipByNormKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h
deleted file mode 100644
index 197bf59b2a470e1f6e4e31c6706d1e3f8e73fbbc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dgc_clip_by_norm_op.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/operators/clip_by_norm_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
-    if (static_cast<int>(rampup_begin_step) < 0) {
-      return;
-    }
-
-    auto current_step_tensor = context.Input<framework::Tensor>("current_step");
-    auto* current_step = current_step_tensor->data<T>();
-
-    VLOG(10) << "current_step:" << *current_step
-             << ", rampup_begin_step:" << rampup_begin_step;
-
-    if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
-      VLOG(10) << "current_step:" << *current_step
-               << " < rampup_begin_step:" << rampup_begin_step
-               << " so does't use dgc_clip_by_norm";
-      return;
-    }
-
-    return ClipByNormKernel<DeviceContext, T>::Compute(context);
-  };
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc
deleted file mode 100644
index ccdeea2d0a96342a57ca56ae2b686f81b32fd866..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dgc_op.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/dgc_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class DGCOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "Input(current_step) of DGCop should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("U_out"),
-                   "Output(U_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("V_out"),
-                   "Output(V_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("k"),
-                   "Output(k) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"),
-                   "Output(EncodeGrad) of DGCop should not be null.");
-  }
-
- protected:
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const framework::Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "current_step" || var_name == "rampup_step" ||
-        var_name == "k") {
-      VLOG(10) << "var_name:" << var_name << " need not to transform";
-      return expected_kernel_type;
-    }
-
-    return framework::OperatorWithKernel::GetKernelTypeForVar(
-        var_name, tensor, expected_kernel_type);
-  }
-};
-
-class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("U", "(Tensor) Middle tensor of DGC");
-    AddInput("V", "(Tensor) Middle tensor of DGC");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("current_step", "(Tensor) Current step.");
-
-    AddOutput("U_out",
-              "(Tensor) "
-              "Output encoded gradient");
-    AddOutput("V_out",
-              "(Tensor) "
-              "Output encoded gradient");
-    AddOutput("EncodeGrad",
-              "(Tensor) "
-              "Output encoded gradient");
-    AddOutput("Grad_out",
-              "(Tensor) "
-              "Output grad gradient");
-    AddOutput("k",
-              "(Tensor) "
-              "Output top-k value");
-
-    AddAttr<float>("m",
-                   "(float, 0.9) "
-                   "The momentum of learning rate.")
-        .SetDefault(0.9);
-
-    AddAttr<bool>("use_nesterov",
-                  "(bool, true)"
-                  "The momentum of learning rate.")
-        .SetDefault(true);
-
-    AddAttr<std::vector<float>>("sparsity",
-                                "(vecotr, float)"
-                                "The period sparsity of k_select.");
-
-    AddAttr<float>("rampup_begin_step",
-                   "(float, 0.0)"
-                   "The period when begin k_select.")
-        .SetDefault(0.0);
-
-    AddAttr<float>("rampup_step",
-                   "(float, 0.0)"
-                   "The period when begin k_select.");
-
-    AddComment(R"DOC(
-    Original paper is https://arxiv.org/abs/1712.01887
-
-    DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\
-        only gradients larger than a threshold are transmitted.
-
-    To avoid losing information, DGC accumulate the rest of the gradients locally.
-
-    Eventually, these gradients become large enough to be transmitted.
-
-    Thus, DGC send the large gradients immediately but eventually send all of the gradients over time.
-
-    To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance.
-
-    DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication.
-
-    This optimizer will do two things:
-        
-        1. Compress the gradient by get TopK import value from tensor \
-            and use it for allreduce to reduce network bandwidth.
-    
-        2. Call momentum to optimize on the cost.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(dgc, ops::DGCOp, ops::DGCOpMaker);
diff --git a/paddle/fluid/operators/dgc_op.cu b/paddle/fluid/operators/dgc_op.cu
deleted file mode 100644
index 0f0bf441a70bef9cb69362a9cf333aeb51e835b6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dgc_op.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/dgc_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    dgc, ops::DGCOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h
deleted file mode 100644
index 1285daae094ab28cd4ec059094d4baf603870d7d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dgc_op.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "dgc/dgc.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-
-namespace paddle {
-namespace operators {
-
-inline float get_period_sparcity(const std::vector<float>& sparsity,
-                                 float cur_step, float rampup_steps) {
-  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0);
-
-  size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
-  if (idx >= sparsity.size()) {
-    return 0.999;
-  }
-
-  PADDLE_ENFORCE_LT(idx, sparsity.size());
-  return sparsity[idx];
-}
-
-template <typename DeviceContext, typename T>
-class DGCOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto u = ctx.Input<framework::Tensor>("U");
-    auto v = ctx.Input<framework::Tensor>("V");
-    auto g = ctx.Input<framework::Tensor>("Grad");
-
-    // attrs
-    float m = ctx.Attr<float>("m");
-    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
-    auto sparsity = ctx.Attr<std::vector<float>>("sparsity");
-    auto rampup_begin_step = ctx.Attr<float>("rampup_begin_step");
-    auto rampup_step = ctx.Attr<float>("rampup_step");
-
-    // current step
-    auto current_step_tensor = ctx.Input<framework::Tensor>("current_step");
-    const float* current_step = current_step_tensor->data<float>();
-
-    if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
-      VLOG(10) << "current_step:" << *current_step
-               << " < rampup_begin_step:" << rampup_begin_step
-               << " so does't use dgc";
-      return;
-    }
-
-    float ratio =
-        1 - get_period_sparcity(sparsity, static_cast<float>(*current_step),
-                                rampup_step);
-    PADDLE_ENFORCE_GE(ratio, 0.0);
-    PADDLE_ENFORCE_LT(ratio, 1.0);
-    int k = static_cast<int>(g->numel() * ratio);
-
-    VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
-             << ", rampup_begin_step:" << rampup_begin_step
-             << ", rampup_step:" << rampup_step
-             << ",  current_step:" << *current_step << ", ratio:" << ratio
-             << ", k:" << k;
-
-    auto k_out = ctx.Output<framework::Tensor>("k");
-    T* k_out_data = k_out->data<T>();
-    *k_out_data = k;
-
-    auto u_out = ctx.Output<framework::Tensor>("U_out");
-    auto v_out = ctx.Output<framework::Tensor>("V_out");
-    auto encode_grad_out = ctx.Output<framework::Tensor>("EncodeGrad");
-
-    // FIXME(gongwb): use cublas.
-    auto u_out_e = framework::EigenVector<T>::Flatten(*u_out);
-    auto u_e = framework::EigenVector<T>::Flatten(*u);
-    auto g_e = framework::EigenVector<T>::Flatten(*g);
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto& eigen_ctx = *dev_ctx.eigen_device();
-    if (use_nesterov) {
-      // u = m * (u + g)
-      u_out_e.device(eigen_ctx) = m * (u_e + g_e);
-
-      // v = u + v + g
-      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
-          ctx, u, v, 0, AddFunctor<T>(), v_out);
-
-      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
-          ctx, g, v, 0, AddFunctor<T>(), v_out);
-    } else {
-      // u = m * u + g
-      u_out_e.device(eigen_ctx) = m * u_e + g_e;
-
-      // v = u + v
-      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
-          ctx, u, v, 0, AddFunctor<T>(), v_out);
-    }
-
-    T* v_out_data = v_out->mutable_data<T>(ctx.GetPlace());
-    T* u_out_data = u_out->mutable_data<T>(ctx.GetPlace());
-    T* encode_grad_out_data = encode_grad_out->mutable_data<T>(
-        framework::DDim{2 * k}, ctx.GetPlace());
-
-    int buf_size = paddle::communication::dgc::get_buffer_size(k);
-    auto tmp_ious_data = memory::Alloc(dev_ctx, buf_size);
-    void* buf = reinterpret_cast<void*>(tmp_ious_data->ptr());
-
-    if (!paddle::communication::dgc::k_select(
-            static_cast<void*>(encode_grad_out_data), k, v_out_data,
-            static_cast<int>(v_out->numel()), buf, dev_ctx.stream(),
-            u_out_data)) {
-      LOG(FATAL) << "v_out numel:" << v_out->numel();
-    }
-
-    auto grad_out = ctx.Output<framework::Tensor>("Grad_out");
-    math::SetConstant<DeviceContext, T> tset;
-    tset(dev_ctx, grad_out, static_cast<T>(0));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/diag_op.cc b/paddle/fluid/operators/diag_op.cc
deleted file mode 100644
index 5fb18a1d695381d14203b19653eb3cbc2508ee4d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/diag_op.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/diag_op.h"
-
-namespace paddle {
-namespace operators {
-
-class DiagOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Diagonal"),
-                   "Input(Diagonal) of DiagOp should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of DiagOp should not be null.");
-
-    auto s_dims = ctx->GetInputDim("Diagonal");
-    PADDLE_ENFORCE(s_dims.size() == 1,
-                   "The rank of Input(Diagonal) should only be 1.");
-
-    ctx->SetOutputDim("Out", {s_dims[0], s_dims[0]});
-  }
-};
-
-class DiagOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Diagonal",
-             "Diagonal values of square matrix. It is a tensor with rank 1.");
-    AddOutput("Out", "A square matrix.");
-    AddComment(R"DOC(
-    Return a square matrix with specified diagonal values. 
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(diag, ops::DiagOp, ops::DiagOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    diag, ops::DiagKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::DiagKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DiagKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::DiagKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/diag_op.cu b/paddle/fluid/operators/diag_op.cu
deleted file mode 100644
index 9fe1b83b66d54a03e8d812589c1e9a3bf995f69c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/diag_op.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/diag_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    diag, ops::DiagKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::DiagKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::DiagKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::DiagKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/diag_op.h b/paddle/fluid/operators/diag_op.h
deleted file mode 100644
index f89415ae08974293fa27bbd398d01df165eb901c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/diag_op.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct DiagFunctor {
-  DiagFunctor(const T* diagonal, int64_t numel, T* output)
-      : diagonal_(diagonal), numel_(numel), output_(output) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    output_[idx * numel_ + idx] = diagonal_[idx];
-  }
-
-  const T* diagonal_;
-  int64_t numel_;
-  T* output_;
-};
-
-template <typename DeviceContext, typename T>
-class DiagKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* diagonal = context.Input<framework::Tensor>("Diagonal");
-    auto* diag_data = diagonal->data<T>();
-    auto numel = diagonal->numel();
-    auto* out = context.Output<framework::Tensor>("Out");
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    set_zero(dev_ctx, out, static_cast<T>(0));
-
-    platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-    DiagFunctor<T> functor(diag_data, numel, out_data);
-    for_range(functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
deleted file mode 100644
index 8909135d234a67a6a8d1fbc21eb0b04e67f8d17b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-if(NOT WITH_DISTRIBUTE)
-    return()
-endif()
-
-if(WITH_GRPC)
-    set(cc_generic_services "false")
-else()
-    set(cc_generic_services "true")
-endif()
-configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
-
-cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool)
-cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder)
-
-# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
-set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-if(WITH_GRPC)
-  set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
-  set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc)
-  grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
-        request_handler_impl.cc rpc_client.cc rpc_server.cc
-        variable_response.cc
-        collective_client.cc collective_server.cc
-        ${GRPC_SRCS}
-      PROTO send_recv.proto 
-      DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder)
-
-  set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
-
-  cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc 
-    DEPS ${RPC_DEPS} scope profiler math_function)
-
-else()
-  set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc)
-  set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-
-  set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib)
-
-  brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
-      request_handler_impl.cc rpc_client.cc rpc_server.cc
-      variable_response.cc
-      collective_client.cc collective_server.cc
-      ${BRPC_SRCS}
-    PROTO send_recv.proto
-    DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS})
-
-  set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS})
-  cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc
-      DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op)
-endif()
-
-
-cc_test(rpc_server_test SRCS rpc_server_test.cc
-    DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op)
-cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
-cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
-cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
-cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory)
-cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv)
-cc_test(communicator_test SRCS communicator_test.cc DEPS communicator)
-if(WITH_GPU)
-    cc_test(collective_server_test SRCS collective_server_test.cc 
-        DEPS sendrecvop_rpc executor ${RPC_DEPS}
-        selected_rows_functor  scope math_function)
-endif()
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc
deleted file mode 100644
index 3f3b6b959e30194c10b1a58d6fc3e7a61ad01313..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-std::once_flag AsyncSparseParamUpdateRecorder::init_flag_;
-std::unique_ptr<AsyncSparseParamUpdateRecorder>
-    AsyncSparseParamUpdateRecorder::recorder_(nullptr);
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
deleted file mode 100644
index eadd842c7f6ead56006fd0c34814b1b7bd9b62f4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
+++ /dev/null
@@ -1,183 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <functional>
-#include <future>  // NOLINT
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include <ThreadPool.h>
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-class ConcurrentSet {
- public:
-  ConcurrentSet() : pool_(new ::ThreadPool(1)) {}
-  ~ConcurrentSet() {}
-
-  std::future<void> Update(const std::vector<int64_t>& rows) {
-    auto task = [this, rows] {
-      if (VLOG_IS_ON(3)) {
-        std::ostringstream sstream;
-        sstream << "[";
-        for (auto& id : rows) {
-          sstream << id << ", ";
-        }
-        sstream << "]";
-        VLOG(3) << "update ids -> " << sstream.str();
-      }
-      for (auto row : rows) {
-        set_.insert(row);
-      }
-    };
-    return pool_->enqueue(std::move(task));
-  }
-
-  std::future<void> GetAndClear(std::vector<int64_t>* result) {
-    auto task = [this, &result] {
-      result->clear();
-      for (auto& id : set_) {
-        result->push_back(id);
-      }
-      if (VLOG_IS_ON(3)) {
-        std::ostringstream sstream;
-        sstream << "[";
-        for (auto& id : *result) {
-          sstream << id << ", ";
-        }
-        sstream << "]";
-        VLOG(3) << "result ids size: " << result->size() << " "
-                << sstream.str();
-      }
-      set_.clear();
-    };
-    return pool_->enqueue(std::move(task));
-  }
-
- private:
-  std::unordered_set<int64_t> set_;
-  std::unique_ptr<::ThreadPool> pool_{nullptr};
-};
-
-class AsyncSparseParamUpdateRecorder {
-  using TrainerToRows = std::vector<std::unique_ptr<ConcurrentSet>>;
-
- public:
-  AsyncSparseParamUpdateRecorder(
-      int trainer_num,
-      const std::unordered_map<std::string, std::string>& grad_to_param)
-      : trainer_num_(trainer_num), grad_to_param_(grad_to_param) {
-    if (VLOG_IS_ON(3)) {
-      std::ostringstream sstream;
-      sstream << "[";
-      for (auto& item : grad_to_param) {
-        sstream << item.first << ":" << item.second << ", ";
-      }
-      sstream << "]";
-      VLOG(3) << "trainer_num: " << trainer_num
-              << " grad_to_param_: " << sstream.str();
-    }
-    for (auto& iter : grad_to_param) {
-      param_to_grad_[iter.second] = iter.first;
-      auto& param_name = iter.second;
-      param_to_updated_rows_[param_name] = TrainerToRows();
-      auto& trainer_to_rows = param_to_updated_rows_[param_name];
-      for (auto i = 0; i < trainer_num; ++i) {
-        trainer_to_rows.emplace_back(new ConcurrentSet());
-      }
-    }
-  }
-
-  ~AsyncSparseParamUpdateRecorder() = default;
-
-  void Update(const std::string& grad_name,
-              const std::vector<int64_t>& update_rows) {
-    VLOG(3) << "update grad: " << grad_name
-            << " row size: " << update_rows.size();
-    auto& param_name = grad_to_param_.at(grad_name);
-    auto& trainer_to_rows = param_to_updated_rows_.at(param_name);
-
-    std::vector<std::future<void>> fs;
-    for (auto& set : trainer_to_rows) {
-      fs.push_back(set->Update(update_rows));
-    }
-    for (auto& f : fs) {
-      f.wait();
-    }
-  }
-
-  void GetAndClear(const std::string& param_name, int trainer_id,
-                   std::vector<int64_t>* result) {
-    VLOG(3) << "GetAndClear param: " << param_name
-            << " for trainer: " << trainer_id;
-    PADDLE_ENFORCE_LT(trainer_id, trainer_num_);
-    param_to_updated_rows_.at(param_name)[trainer_id]
-        ->GetAndClear(result)
-        .wait();
-  }
-
-  bool HasParam(const std::string& param_name) {
-    return param_to_grad_.find(param_name) != param_to_grad_.end();
-  }
-
-  bool HasGrad(const std::string& grad_name) {
-    return grad_to_param_.find(grad_name) != grad_to_param_.end();
-  }
-
- private:
-  const int trainer_num_;
-  std::unordered_map<std::string, std::string> grad_to_param_;
-  std::unordered_map<std::string, std::string> param_to_grad_;
-  std::unordered_map<std::string, TrainerToRows> param_to_updated_rows_;
-
-  // init recorder
- public:
-  static void Init(
-      int trainer_num,
-      const std::unordered_map<std::string, std::string>& grad_to_param) {
-    InitImpl(trainer_num, grad_to_param);
-  }
-
-  static AsyncSparseParamUpdateRecorder* GetInstance() {
-    return recorder_.get();
-  }
-
- private:
-  // Init is called by GetInstance.
-  static void InitImpl(
-      int trainer_num,
-      const std::unordered_map<std::string, std::string>& grad_to_param) {
-    if (recorder_ == nullptr) {
-      recorder_.reset(
-          new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param));
-    }
-  }
-
-  static std::once_flag init_flag_;
-  static std::unique_ptr<AsyncSparseParamUpdateRecorder> recorder_;
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
deleted file mode 100644
index 67e8fd8a0edc4510d0abe885c821e75b528254f8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
-
-#include <algorithm>
-
-#include "gtest/gtest.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-TEST(ConcurrentSet, All) {
-  ConcurrentSet concurrent_set;
-  std::vector<int64_t> in1 = {1, 2, 3, 4};
-  std::vector<int64_t> in2 = {2, 3, 5, 6};
-
-  std::vector<std::future<void>> futures;
-  futures.push_back(concurrent_set.Update(in1));
-  futures.push_back(concurrent_set.Update(in2));
-
-  for (auto &f : futures) {
-    f.wait();
-  }
-
-  std::unordered_set<int64_t> in;
-  std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin()));
-  std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin()));
-
-  std::vector<int64_t> ret;
-  concurrent_set.GetAndClear(&ret).wait();
-
-  std::unordered_set<int64_t> out;
-  std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin()));
-
-  EXPECT_EQ(in, out);
-
-  concurrent_set.GetAndClear(&ret).wait();
-  EXPECT_EQ(ret.size(), 0);
-}
-
-TEST(AsyncSparseParamUpdateRecorder, All) {
-  std::unordered_map<std::string, std::string> grad_to_param;
-  grad_to_param["grad1"] = "param1";
-  grad_to_param["grad2"] = "param2";
-
-  int trainer_num = 10;
-
-  AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param);
-  std::vector<int64_t> in1 = {1, 2, 3, 4};
-  std::vector<int64_t> in2 = {2, 3, 5, 6};
-
-  std::unordered_set<int64_t> in;
-  std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin()));
-  std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin()));
-
-  recorder.Update("grad1", in1);
-  recorder.Update("grad1", in2);
-
-  EXPECT_TRUE(recorder.HasParam("param1"));
-  EXPECT_TRUE(recorder.HasParam("param2"));
-  EXPECT_FALSE(recorder.HasParam("param3"));
-
-  EXPECT_TRUE(recorder.HasGrad("grad1"));
-  EXPECT_TRUE(recorder.HasGrad("grad2"));
-  EXPECT_FALSE(recorder.HasGrad("grad3"));
-
-  std::vector<int64_t> ret;
-  EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret));
-
-  for (int i = 0; i < trainer_num; ++i) {
-    std::vector<int64_t> ret;
-    std::unordered_set<int64_t> out;
-
-    recorder.GetAndClear("param1", i, &ret);
-    std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin()));
-
-    EXPECT_EQ(in, out);
-
-    recorder.GetAndClear("param1", i, &ret);
-    EXPECT_EQ(ret.size(), 0);
-  }
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc
deleted file mode 100644
index 4c22ad8eb4d4b2e23d8a6720e726eb9e2998314e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc
+++ /dev/null
@@ -1,456 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds");
-DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");
-
-BRPCClient::~BRPCClient() { Wait(); }
-
-void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response,
-                        VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
-                        ChannelContextPtr ch_ctx, BRPCClient* cls) {
-  // std::unique_ptr makes sure cntl/response will be deleted before returning.
-  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
-  std::unique_ptr<sendrecv::VoidMessage> response_guard(response);
-
-  // this channel can be used by other now.
-  ch_ptr->Push(ch_ctx);
-
-  if (cntl->Failed()) {
-    LOG(FATAL) << "Fail to send SendVar: " << var_h->name()
-               << ", error text: " << cntl->ErrorText();
-    var_h->Finish(false);
-    cls->DecreaseReqCount();
-    return;
-  }
-  var_h->Finish(true);
-  cls->DecreaseReqCount();
-
-  VLOG(4) << "HandleSendResponse from: " << cntl->remote_side()
-          << ", varname: " << var_h->name()
-          << ", latency: " << cntl->latency_us() << "us";
-  VLOG(4) << "Finish HandleSendResponse";
-}
-
-VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
-                                      const platform::DeviceContext& ctx,
-                                      const framework::Scope& scope,
-                                      const std::string& var_name,
-                                      int64_t time_out) {
-  const platform::DeviceContext* p_ctx = &ctx;
-  const std::string ep_val = ep;
-  const std::string var_name_val = var_name;
-  const framework::Scope* p_scope = &scope;
-  const auto ch_ptr = GetChannel(ep_val);
-  const std::string method = kSendRPC;
-  VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
-
-  framework::AsyncIO([=] {
-    auto ch_ctx = ch_ptr->Pop();
-    brpc::Controller* cntl = new brpc::Controller();
-    sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
-    cntl->set_timeout_ms(time_out);
-
-    auto* var = p_scope->FindVar(var_name_val);
-    sendrecv::VariableMessage request;
-    distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request,
-                                  &cntl->request_attachment(), "", false,
-                                  trainer_id_);
-
-    google::protobuf::Closure* done = brpc::NewCallback(
-        &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
-
-    platform::RecordRPCEvent record_event(method);
-
-    ch_ctx->stub->SendVariable(cntl, &request, response, done);
-
-    if (UNLIKELY(platform::IsProfileEnabled())) {
-      var_h->Wait();
-    }
-  });
-  req_count_++;
-
-  return var_h;
-}
-void HandleFetchBarrierResponse(brpc::Controller* cntl,
-                                sendrecv::VariableMessage* response,
-                                VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
-                                ChannelContextPtr ch_ctx, BRPCClient* cls) {
-  // std::unique_ptr makes sure cntl/response will be deleted before returning.
-  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
-  std::unique_ptr<sendrecv::VariableMessage> response_guard(response);
-
-  // this channel can be used other now.
-  ch_ptr->Push(ch_ctx);
-
-  if (cntl->Failed()) {
-    LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name()
-               << ", error text: " << cntl->ErrorText();
-    var_h->Finish(false);
-    cls->DecreaseReqCount();
-    return;
-  }
-
-  var_h->Finish(true);
-  cls->DecreaseReqCount();
-
-  VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side()
-          << ", varname: " << var_h->name()
-          << ", latency: " << cntl->latency_us() << "us";
-  VLOG(4) << "Finish HandleFetchBarrierResponse";
-}
-void HandleGetResponse(brpc::Controller* cntl,
-                       sendrecv::VariableMessage* response, VarHandlePtr var_h,
-                       ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx,
-                       BRPCClient* cls) {
-  // std::unique_ptr makes sure cntl/response will be deleted before returning.
-  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
-  std::unique_ptr<sendrecv::VariableMessage> response_guard(response);
-
-  // this channel can be used other now.
-  ch_ptr->Push(ch_ctx);
-
-  if (cntl->Failed()) {
-    LOG(FATAL) << "Fail to GetVar: " << var_h->name()
-               << ", error text: " << cntl->ErrorText();
-    cls->DecreaseReqCount();
-    var_h->Finish(false);
-    return;
-  }
-
-  VLOG(4) << "HandleGetResponse from: " << cntl->remote_side()
-          << ", varname: " << var_h->name()
-          << ", latency: " << cntl->latency_us() << "us";
-
-  framework::Variable* outvar = nullptr;
-  int trainer_id;
-  distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(),
-                                    *var_h->ctx(), var_h->scope(), &outvar,
-                                    &trainer_id);
-  VLOG(4) << "Finish HandleGetResponse";
-  cls->DecreaseReqCount();
-  var_h->Finish(true);
-}
-
-VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
-                                      const platform::DeviceContext& ctx,
-                                      const framework::Scope& scope,
-                                      const std::string& var_name,
-                                      const std::string& out_var_name,
-                                      const std::string& method_name,
-                                      int64_t time_out) {
-  const platform::DeviceContext* p_ctx = &ctx;
-  const std::string ep_val = ep;
-  const std::string var_name_val = var_name;
-  const std::string out_varname_val = out_var_name;
-  const framework::Scope* p_scope = &scope;
-  const auto ch_ptr = GetChannel(ep_val);
-  const std::string method = kGetRPC;
-  VarHandlePtr var_h(
-      new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));
-
-  framework::AsyncIO([=] {
-    auto ch_ctx = ch_ptr->Pop();
-
-    brpc::Controller* cntl = new brpc::Controller();
-    sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
-    cntl->set_timeout_ms(time_out);
-
-    sendrecv::VariableMessage req;
-    req.set_varname(var_name_val);
-    req.set_out_varname(out_varname_val);
-    req.set_trainer_id(trainer_id_);
-
-    google::protobuf::Closure* done = brpc::NewCallback(
-        &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
-
-    platform::RecordRPCEvent record_event(method);
-
-    if (method_name == kGetMonomerRPC) {
-      ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
-    } else if (method_name == kGetNoBarrierRPC) {
-      ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done);
-    } else {
-      ch_ctx->stub->GetVariable(cntl, &req, response, done);
-    }
-
-    if (UNLIKELY(platform::IsProfileEnabled())) {
-      var_h->Wait();
-    }
-  });
-
-  req_count_++;
-
-  return var_h;
-}
-
-VarHandlePtr BRPCClient::AsyncGetVarNoBarrier(
-    const std::string& ep, const platform::DeviceContext& ctx,
-    const framework::Scope& scope, const std::string& var_name,
-    const std::string& out_var_name, int64_t time_out) {
-  std::string var_name_no_barrier =
-      string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE);
-
-  return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name,
-                      kGetNoBarrierRPC, time_out);
-}
-
-VarHandlePtr BRPCClient::AsyncGetMonomerVariable(
-    const std::string& ep, const platform::DeviceContext& ctx,
-    const framework::Scope& scope, const std::string& var_name,
-    int64_t time_out) {
-  return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC,
-                      time_out);
-}
-
-VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
-                                                const std::string& var_name,
-                                                int64_t time_out) {
-  return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out);
-}
-
-VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
-                                     const platform::DeviceContext& ctx,
-                                     const framework::Scope& scope,
-                                     const std::string& var_name,
-                                     const std::string& out_var_name,
-                                     const std::string& table_name,
-                                     int64_t time_out) {
-  return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC,
-                      time_out);
-}
-
-VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
-                                          const platform::DeviceContext& ctx,
-                                          const framework::Scope& scope,
-                                          const std::string& in_var_name,
-                                          const std::string& out_var_name,
-                                          const std::string& table_name,
-                                          int64_t time_out) {
-  const platform::DeviceContext* p_ctx = &ctx;
-  const std::string ep_val = ep;
-  const std::string in_var_name_val = in_var_name;
-  const std::string out_var_name_val = out_var_name;
-  const std::string table_name_val = table_name;
-  const framework::Scope* p_scope = &scope;
-  const auto ch_ptr = GetChannel(ep_val);
-
-  const std::string method = kPrefetchRPC;
-
-  VarHandlePtr var_h(
-      new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
-
-  framework::AsyncIO([=] {
-    auto ch_ctx = ch_ptr->Pop();
-
-    brpc::Controller* cntl = new brpc::Controller();
-    sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
-    cntl->set_timeout_ms(time_out);
-
-    auto* var = p_scope->FindVar(in_var_name_val);
-    sendrecv::VariableMessage req;
-    distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req,
-                                  &cntl->request_attachment(), out_var_name_val,
-                                  false, 0, table_name_val);
-
-    platform::RecordRPCEvent record_event(method);
-
-    google::protobuf::Closure* done = brpc::NewCallback(
-        &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
-
-    ch_ctx->stub->PrefetchVariable(cntl, &req, response, done);
-
-    if (UNLIKELY(platform::IsProfileEnabled())) {
-      var_h->Wait();
-    }
-  });
-
-  req_count_++;
-  return var_h;
-}
-
-VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
-                                               int64_t time_out) {
-  return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE,
-                          time_out);
-}
-
-VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
-                                               int64_t time_out) {
-  auto ch_ptr = GetChannel(ep);
-  auto ch_ctx = ch_ptr->Pop();
-
-  brpc::Controller* cntl = new brpc::Controller();
-  sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
-  cntl->set_timeout_ms(time_out);
-
-  sendrecv::VariableMessage req;
-  req.set_varname(FETCH_BARRIER_MESSAGE);
-
-  const std::string method = kFetchBarrierRPC;
-  // var handle
-  VarHandlePtr var_h(
-      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
-
-  platform::RecordRPCEvent record_event(method);
-
-  google::protobuf::Closure* done = brpc::NewCallback(
-      &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
-
-  ch_ctx->stub->GetVariable(cntl, &req, response, done);
-
-  req_count_++;
-
-  if (UNLIKELY(platform::IsProfileEnabled())) {
-    var_h->Wait();
-  }
-
-  return var_h;
-}
-
-bool BRPCClient::Wait() {
-  VLOG(9) << "begin to brpcclient wait";
-  {
-    std::unique_lock<std::mutex> lk(sync_mutex_);
-    sync_cond_.wait(lk, [this] { return req_count_ == 0; });
-  }
-  VLOG(9) << "end to brpcclient wait";
-  return true;
-}
-
-ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
-  VLOG(4) << "begin to GetChannel:" << ep;
-  {
-    std::lock_guard<std::mutex> guard(chan_mutex_);
-    auto it = channels_.find(ep);
-    if (it != channels_.end()) {
-      VLOG(4) << "end to GetChannel:" << ep;
-      return it->second;
-    }
-  }
-
-  ChannelQueuePtr q(new framework::BlockingQueue<ChannelContextPtr>());
-
-  brpc::ChannelOptions options;
-#ifdef PADDLE_WITH_BRPC_RDMA
-  options.use_rdma = true;
-#endif
-  options.protocol = "baidu_std";
-  // don't use pooled type. the server can't afford that.
-  options.connection_type = "single";
-  options.connect_timeout_ms = 1000;
-  options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/;
-  options.max_retry = FLAGS_max_retry;
-
-  VLOG(1) << "create " << brpc_channel_num_per_server_
-          << " brpc channels to pserver:" << ep;
-
-  for (int i = 0; i < brpc_channel_num_per_server_; ++i) {
-    std::shared_ptr<ChannelContext> c(new ChannelContext());
-    if (c->channel.Init(ep.c_str(), &options) != 0) {
-      LOG(FATAL) << "Fail to initialize channel";
-      return nullptr;
-    }
-
-    c->stub.reset(new sendrecv::SendRecvService_Stub(
-        static_cast<google::protobuf::RpcChannel*>(&c->channel)));
-    q->Push(c);
-  }
-
-  {
-    std::lock_guard<std::mutex> guard(chan_mutex_);
-    channels_[ep] = q;
-  }
-
-  VLOG(4) << "end to GetChannel:" << ep;
-  return q;
-}
-
-VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep,
-                                           int64_t time_out) {
-  return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out);
-}
-
-void BRPCClient::SendComplete() {
-  for (auto& kv : channels_) {
-    AsyncSendComplete(kv.first);
-  }
-}
-
-VarHandlePtr BRPCClient::AsyncSendVarMessage(
-    const std::string& ep, const std::string& method_name,
-    const sendrecv::VariableMessage& req, int64_t time_out) {
-  auto ch_ptr = GetChannel(ep);
-  auto ch_ctx = ch_ptr->Pop();
-
-  brpc::Controller* cntl = new brpc::Controller();
-  sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
-  cntl->set_timeout_ms(time_out);
-
-  platform::RecordRPCEvent record_event(method_name);
-
-  VarHandlePtr var_h(
-      new VarHandle(ep, method_name, req.varname(), nullptr, nullptr));
-
-  google::protobuf::Closure* done = brpc::NewCallback(
-      &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
-
-  if (method_name == kCheckPointNotifyRPC) {
-    ch_ctx->stub->CheckpointNotify(cntl, &req, response, done);
-  } else if (method_name == kSendMonomerFetchBarrierRPC) {
-    ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done);
-  } else {
-    ch_ctx->stub->SendVariable(cntl, &req, response, done);
-  }
-  req_count_++;
-
-  if (UNLIKELY(platform::IsProfileEnabled())) {
-    var_h->Wait();
-  }
-
-  return var_h;
-}
-
-VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep,
-                                          const std::string& method_name,
-                                          const std::string& message,
-                                          int64_t time_out) {
-  sendrecv::VariableMessage req;
-  req.set_varname(message);
-
-  return AsyncSendVarMessage(ep, method_name, req, time_out);
-}
-
-VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep,
-                                               const std::string& dir,
-                                               int64_t time_out) {
-  sendrecv::VariableMessage req;
-  req.set_varname(CHECKPOINT_SAVE_MESSAGE);
-  req.set_out_varname(dir);
-
-  return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out);
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h
deleted file mode 100644
index 51864dfdca53eb4b1d9045188a6347781130e785..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_client.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <time.h>
-
-#include <chrono>  // NOLINT
-#include <ctime>
-#include <functional>
-#include <iostream>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "brpc/channel.h"
-#include "paddle/fluid/framework/blocking_queue.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-struct ChannelContext {
-  brpc::Channel channel;
-  std::shared_ptr<sendrecv::SendRecvService_Stub> stub;
-};
-
-typedef std::shared_ptr<ChannelContext> ChannelContextPtr;
-typedef std::shared_ptr<framework::BlockingQueue<ChannelContextPtr>>
-    ChannelQueuePtr;
-
-class BRPCClient : public RPCClient {
- public:
-  BRPCClient() {}
-  virtual ~BRPCClient();
-
-  VarHandlePtr AsyncSendVar(const std::string& ep,
-                            const platform::DeviceContext& ctx,
-                            const framework::Scope& scope,
-                            const std::string& var_name,
-                            int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncGetVar(const std::string& ep,
-                           const platform::DeviceContext& ctx,
-                           const framework::Scope& scope,
-                           const std::string& var_name,
-                           const std::string& out_var_name,
-                           const std::string& table_name = "",
-                           int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncGetMonomerBarrier(
-      const std::string& ep, const std::string& var_name,
-      int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncGetMonomerVariable(
-      const std::string& ep, const platform::DeviceContext& ctx,
-      const framework::Scope& scope, const std::string& var_name,
-      int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep,
-                                    const platform::DeviceContext& ctx,
-                                    const framework::Scope& scope,
-                                    const std::string& var_name,
-                                    const std::string& out_varname,
-                                    int64_t time_out = FLAGS_rpc_deadline);
-
-  VarHandlePtr AsyncPrefetchVar(const std::string& ep,
-                                const platform::DeviceContext& ctx,
-                                const framework::Scope& scope,
-                                const std::string& in_var_name,
-                                const std::string& out_var_name,
-                                const std::string& table_name = "",
-                                int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncSendBatchBarrier(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncSendFetchBarrier(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncCheckpointNotify(
-      const std::string& ep, const std::string& dir,
-      int64_t time_out = FLAGS_rpc_deadline) override;
-
-  bool Wait() override;
-
-  void SendComplete() override;
-
- private:
-  VarHandlePtr _AsyncGetVar(
-      const std::string& ep, const platform::DeviceContext& ctx,
-      const framework::Scope& scope, const std::string& var_name,
-      const std::string& out_var_name, const std::string& method_name,
-      const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline);
-
-  void Proceed();
-  ChannelQueuePtr GetChannel(const std::string& ep);
-
-  VarHandlePtr AsyncSendComplete(const std::string& ep,
-                                 int64_t time_out = FLAGS_rpc_deadline);
-
-  VarHandlePtr AsyncSendMessage(const std::string& ep,
-                                const std::string& method_name,
-                                const std::string& message, int64_t time_out);
-
-  VarHandlePtr AsyncSendVarMessage(const std::string& ep,
-                                   const std::string& method_name,
-                                   const sendrecv::VariableMessage& req,
-                                   int64_t time_out);
-
-  friend void HandleSendResponse(brpc::Controller* cntl,
-                                 sendrecv::VoidMessage* response,
-                                 VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
-                                 ChannelContextPtr ch_ctx, BRPCClient* cls);
-
-  friend void HandleGetResponse(brpc::Controller* cntl,
-                                sendrecv::VariableMessage* response,
-                                VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
-                                ChannelContextPtr ch_ctx, BRPCClient* cls);
-
-  friend void HandleFetchBarrierResponse(brpc::Controller* cntl,
-                                         sendrecv::VariableMessage* response,
-                                         VarHandlePtr var_h,
-                                         ChannelQueuePtr ch_ptr,
-                                         ChannelContextPtr ch_ctx,
-                                         BRPCClient* cls);
-  void DecreaseReqCount() {
-    if (--req_count_ <= 0) {
-      sync_cond_.notify_all();
-    }
-  }
-
- private:
-  std::unordered_map<std::string, ChannelQueuePtr> channels_;
-
-  // mutex for Wait client sync
-  std::mutex sync_mutex_;
-  std::condition_variable sync_cond_;
-  std::atomic<int64_t> req_count_{0};
-
-  static constexpr int brpc_channel_num_per_server_ = 4;
-
-  // mutex for GetChannel thread safety
-  std::mutex chan_mutex_;
-  DISABLE_COPY_AND_ASSIGN(BRPCClient);
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc
deleted file mode 100644
index d5c614001e0b2ff24812d5326318883de938fbb8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifdef PADDLE_WITH_BRPC_RDMA
-
-#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h"
-#include "brpc/channel.h"
-#include "brpc/rdma/rdma_helper.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-RdmaMemPool& RdmaMemPool::Instance() {
-  static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool();
-  return *g_rdma_mem_pool;
-}
-
-void* RdmaMemPool::Find(const std::string& varname, int64_t size) {
-  pthread_rwlock_rdlock(&access_);
-  auto it = pool_.find(varname);
-  if (it == pool_.end()) {
-    pthread_rwlock_unlock(&access_);
-    return nullptr;
-  }
-
-  auto info = it->second;
-  if (info.data_size != size) {
-    pthread_rwlock_unlock(&access_);
-    PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size,
-                   info.data_size);
-    return nullptr;
-  }
-
-  pthread_rwlock_unlock(&access_);
-  return info.data;
-}
-
-void RdmaMemPool::Register(const std::string& varname, void* data,
-                           int64_t data_size) {
-  void* old = Find(varname, data_size);
-  if (old != nullptr) {
-    if (data != old) {
-      PADDLE_ENFORCE(false, "var:%s data:%ld != %ld", varname, data, old);
-    }
-    VLOG(7) << "Find on rdma:" << varname << " data:" << data
-            << " data_size:" << data_size;
-    return;
-  }
-
-  VarInfo info;
-  info.data = data;
-  info.data_size = data_size;
-
-  pthread_rwlock_wrlock(&access_);
-  pool_[varname] = info;
-  pthread_rwlock_unlock(&access_);
-
-  if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) {
-    LOG(FATAL) << "register " << varname << " data:" << data
-               << " data_size:" << data_size << " error";
-  }
-
-  VLOG(4) << "register on rdma:" << varname << " data:" << data
-          << " data_size:" << data_size;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h
deleted file mode 100644
index 156a93ec5784715c0a68c1af2e31d640dfc60277..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#ifdef PADDLE_WITH_BRPC_RDMA
-
-#include <pthread.h>  // NOLINT
-#include <string>
-#include <unordered_map>
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-/*
- * This class is used to avoid duplicated registion of brpc::rdma.
- */
-class RdmaMemPool {
- public:
-  static RdmaMemPool& Instance();
-  RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {}
-
-  virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); }
-
-  void Register(const std::string& varname, void* data, int64_t size);
-  void* Find(const std::string& varname, int64_t size);
-
- private:
-  struct VarInfo {
-    void* data;
-    int64_t data_size;
-
-    VarInfo() : data(nullptr), data_size(0) {}
-  };
-
- private:
-  std::unordered_map<std::string, VarInfo> pool_;
-  pthread_rwlock_t access_;
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc
deleted file mode 100644
index 49e048f07a2396824a51db5c6012206bd8848e82..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-#include <nccl.h>
-#endif
-#include <sys/time.h>
-#include <limits>
-#include <thread>  // NOLINT
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-class IOBufWriter {
- public:
-  static void Append(const std::string& varname, butil::IOBuf* iobuf, int k,
-                     const char* v, int64_t vlen) {
-    if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
-      LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
-    }
-
-    iobuf->append(reinterpret_cast<char*>(&k), 4);
-    iobuf->append(reinterpret_cast<char*>(&vlen), 8);
-    iobuf->append(v, vlen);
-  }
-
-  static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v,
-                                int64_t vlen, bool in_cuda_pinned,
-                                void (*destroy)(void*), void* user_data) {
-    VLOG(7) << "AppendTCPZeroCopy "
-            << " k:" << k
-            << " data:" << static_cast<void*>(const_cast<char*>(v))
-            << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned;
-
-    iobuf->append(reinterpret_cast<char*>(&k), 4);
-    iobuf->append(reinterpret_cast<char*>(&vlen), 8);
-
-    // FIXME(gongwb): use append_zerocopy
-    /*
-    if (in_cuda_pinned) {
-      iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory);
-    } else {
-      iobuf->append_zerocopy(v, vlen, nullptr);
-    }
-    */
-    iobuf->append(v, vlen);
-    destroy(user_data);
-  }
-
-#ifdef PADDLE_WITH_BRPC_RDMA
-  static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf,
-                                 int k, const char* v, int64_t vlen,
-                                 bool in_cuda_pinned, void (*destroy)(void*),
-                                 void* user_data) {
-    VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k
-            << " data:" << static_cast<void*>(const_cast<char*>(v))
-            << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned;
-
-    iobuf->append(reinterpret_cast<char*>(&k), 4);
-    iobuf->append(reinterpret_cast<char*>(&vlen), 8);
-
-    RdmaMemPool::Instance().Register(
-        varname, static_cast<void*>(const_cast<char*>(v)), vlen);
-
-    // FIXME(gongwb): use append_zerocopy
-    // iobuf->append_zerocopy(v, vlen, nullptr);
-    iobuf->append(v, vlen);
-    destroy(user_data);
-    return;
-  }
-#endif
-
-  static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf,
-                             int k, const char* v, int64_t vlen,
-                             bool in_cuda_pinned, void (*destroy)(void*),
-                             void* user_data) {
-    if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
-      LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
-    }
-
-#ifdef PADDLE_WITH_BRPC_RDMA
-    IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned,
-                                    destroy, user_data);
-#else
-    IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy,
-                                   user_data);
-#endif
-  }
-};
-
-void SerializeToIOBuf(const std::string& name, framework::Variable* var,
-                      const platform::DeviceContext& ctx, VarMsg* request,
-                      butil::IOBuf* iobuf, const std::string& out_varname,
-                      bool var_is_not_stable, int trainer_id,
-                      const std::string& table_name) {
-  std::unique_ptr<TensorPayload> payload;
-
-  request->set_varname(name);
-  request->set_trainer_id(trainer_id);
-  // Note: normally the profiler is enabled in 1 trainer, hence only
-  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
-  // servers the trainer's profiling state so that PS can follow the
-  // trainer.
-  if (platform::ShouldSendProfileState()) {
-    if (platform::IsProfileEnabled()) {
-      request->set_profile(platform::kEnableProfiler);
-    } else {
-      request->set_profile(platform::kDisableProfiler);
-    }
-  }
-  if (!out_varname.empty()) {
-    request->set_out_varname(out_varname);
-  }
-  if (!table_name.empty()) {
-    request->set_table_name(table_name);
-  }
-  if (var->IsType<framework::LoDTensor>()) {
-    request->set_type(::sendrecv::LOD_TENSOR);
-    payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request)));
-  } else if (var->IsType<framework::SelectedRows>()) {
-    request->set_type(::sendrecv::SELECTED_ROWS);
-    payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request)));
-#ifdef PADDLE_WITH_CUDA
-  } else if (var->IsType<ncclUniqueId>()) {
-    request->set_type(::sendrecv::NCCL_ID);
-    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
-    // TODO(gongwb): use append_zero to avoid data copy.
-    IOBufWriter::Append(name, iobuf,
-                        sendrecv::VariableMessage::kSerializedFieldNumber,
-                        uid.internal, NCCL_UNIQUE_ID_BYTES);
-    return;
-#endif
-  } else {
-    PADDLE_THROW("Serialize does not support type: %s",
-                 typeid(var->Type()).name());
-  }
-
-  PADDLE_ENFORCE_NOT_NULL(payload);
-
-  // FIXME(gongwb): it seems that can use zero copy.
-  if (var_is_not_stable) {
-    IOBufWriter::Append(
-        name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
-        static_cast<const char*>(payload->ptr()), payload->memory_size());
-  } else {
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      IOBufWriter::AppendZeroCopy(
-          name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
-          static_cast<const char*>(payload->ptr()), payload->memory_size(),
-          true, SerializeDestroyCallback, static_cast<void*>(payload.get()));
-      payload.release();
-#endif
-    } else {
-      IOBufWriter::AppendZeroCopy(
-          name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
-          static_cast<const char*>(payload->ptr()), payload->memory_size(),
-          false, SerializeDestroyCallback, static_cast<void*>(payload.get()));
-      payload.release();
-    }
-  }
-
-  if (var->IsType<framework::SelectedRows>()) {
-    auto* slr = var->GetMutable<framework::SelectedRows>();
-    PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
-    size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
-
-    IOBufWriter::Append(name, iobuf,
-                        ::sendrecv::VariableMessage::kRowsFieldNumber,
-                        reinterpret_cast<const char*>(slr->rows().data()),
-                        static_cast<int64_t>(rows_memory_size));
-  }
-}
-
-void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta,
-                          const butil::IOBuf& iobuf,
-                          const platform::DeviceContext& ctx,
-                          const framework::Scope* scope,
-                          framework::Variable** var, int* trainer_id) {
-  operators::distributed::BRPCVariableResponse resp(scope, &ctx);
-  PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!");
-  *var = resp.GetVar();
-  *trainer_id = resp.GetTrainerId();
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h
deleted file mode 100644
index a5bdc331eb29c7c0fe00d7f346025426b51e1cb3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <sys/time.h>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "brpc/channel.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-void SerializeToIOBuf(const std::string& name, framework::Variable* var,
-                      const platform::DeviceContext& ctx, VarMsg* request,
-                      butil::IOBuf* iobuf, const std::string& out_varname,
-                      bool var_is_not_stable, const int trainer_id = 0,
-                      const std::string& table_name = std::string());
-
-void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf,
-                          const platform::DeviceContext& ctx,
-                          const framework::Scope* scope,
-                          framework::Variable** var, int* trainer_id);
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc
deleted file mode 100644
index b902d3db487789a417ed0e5ffc032e7e06ba43fb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-#include <string>
-#include <thread>  // NOLINT
-
-#include "brpc/channel.h"
-#include "google/protobuf/text_format.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace framework = paddle::framework;
-namespace platform = paddle::platform;
-namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
-namespace memory = paddle::memory;
-
-void RunSerdeTestSelectedRows(platform::Place place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-
-  butil::IOBuf iobuf;
-  sendrecv::VariableMessage msg;
-  int tensor_numel = 564 * 128;
-
-  // serialize var to IOBuf
-  {
-    framework::Variable var;
-    auto* slr = var.GetMutable<framework::SelectedRows>();
-    slr->set_height(1000);
-    auto* tensor = slr->mutable_value();
-    auto* rows = slr->mutable_rows();
-    tensor->Resize(framework::make_ddim({564, 128}));
-    tensor->mutable_data<float>(place);
-    math::set_constant(ctx, tensor, 32.7);
-    for (int i = 0; i < 564; ++i) rows->push_back(i);
-
-    operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf,
-                                             "", false);
-  }
-
-  // desrialize
-  {
-    framework::Scope scope;
-    scope.Var("myvar");
-    operators::distributed::BRPCVariableResponse resp(&scope, &ctx);
-    EXPECT_EQ(resp.Parse(iobuf, msg), 0);
-
-    framework::Variable* var2 = resp.GetVar();
-
-    auto* slr2 = var2->GetMutable<framework::SelectedRows>();
-    auto* tensor2 = slr2->mutable_value();
-    auto* rows2 = slr2->mutable_rows();
-    float* tensor_data2 = nullptr;
-    framework::Tensor tmp_tensor;
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      platform::CPUPlace cpu;
-      framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
-      tensor_data2 = tmp_tensor.data<float>();
-    } else {
-      tensor_data2 = const_cast<float*>(tensor2->data<float>());
-    }
-    const int64_t* rows_data2 = rows2->data();
-
-    for (int i = 0; i < tensor_numel; ++i) {
-      EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
-    }
-    for (size_t i = 0; i < rows2->size(); ++i) {
-      EXPECT_EQ(rows_data2[i], static_cast<int64_t>(i));
-    }
-    EXPECT_EQ(slr2->height(), 1000);
-  }
-}
-
-void RunTestLodTensor(platform::Place place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-
-  // serialize var to ByteBuffer
-  butil::IOBuf iobuf;
-  sendrecv::VariableMessage msg;
-  int tensor_numel = 512 * 8 * 4 * 2;
-  {
-    framework::Variable var;
-    auto* tensor = var.GetMutable<framework::LoDTensor>();
-    tensor->Resize(framework::make_ddim({512, 8, 4, 2}));
-    framework::LoD lod;
-    lod.push_back(framework::Vector<size_t>({1, 3, 8}));
-    tensor->set_lod(lod);
-    tensor->mutable_data<float>(place);
-    math::set_constant(ctx, tensor, 31.9);
-
-    operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf,
-                                             "", false);
-  }
-
-  // check sendrecv::VariableMessage meta data
-  {
-    EXPECT_EQ(msg.varname(), "myvar");
-    EXPECT_EQ(msg.type(), 0);
-    EXPECT_EQ(msg.dims()[0], 512);
-    EXPECT_EQ(msg.dims()[1], 8);
-    EXPECT_EQ(msg.dims()[2], 4);
-    EXPECT_EQ(msg.dims()[3], 2);
-    EXPECT_EQ(msg.lod_level(), 1);
-    EXPECT_EQ(msg.lod(0).lod_data(0), 1);
-    EXPECT_EQ(msg.lod(0).lod_data(1), 3);
-    EXPECT_EQ(msg.lod(0).lod_data(2), 8);
-  }
-
-  // deserialize
-  {
-    framework::Scope scope;
-    scope.Var("myvar");
-    operators::distributed::BRPCVariableResponse resp(&scope, &ctx);
-    EXPECT_EQ(resp.Parse(iobuf, msg), 0);
-
-    framework::Variable* var2 = resp.GetVar();
-
-    auto tensor2 = var2->Get<framework::LoDTensor>();
-    float* tensor_data2 = nullptr;
-    framework::Tensor tmp_tensor;
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      platform::CPUPlace cpu;
-      framework::TensorCopy(tensor2, cpu, &tmp_tensor);
-      tensor_data2 = tmp_tensor.data<float>();
-    } else {
-      tensor_data2 = const_cast<float*>(tensor2.data<float>());
-    }
-
-    for (int i = 0; i < tensor_numel; ++i)
-      EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
-  }
-}
-
-TEST(LodTensor, Run) {
-  platform::CPUPlace place;
-  RunTestLodTensor(place);
-#ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace gpu(0);
-  RunTestLodTensor(gpu);
-#endif
-}
-
-TEST(SelectedRows, Run) {
-  platform::CPUPlace place;
-  RunSerdeTestSelectedRows(place);
-#ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace gpu;
-  RunSerdeTestSelectedRows(gpu);
-#endif
-}
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc
deleted file mode 100644
index fea9b09414638b607ca7f7d558ce14a2d5bfa03d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc
+++ /dev/null
@@ -1,403 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-
-namespace sendrecv {
-
-namespace distributed = paddle::operators::distributed;
-
-typedef std::unordered_map<std::string, distributed::RequestHandler*>
-    HandlerMap;
-
-class BRPCServiceImpl : public SendRecvService {
- public:
-  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map,
-                           distributed::RPCServer* rpc_server)
-      : rpc_server_(rpc_server) {
-    VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size();
-    auto it = rpc_call_map.find(distributed::kRequestSend);
-    if (it != rpc_call_map.end()) {
-      request_send_h_ = it->second;
-      send_threads_.reset(new paddle::framework::ThreadPool(
-          rpc_server_->GetThreadNum(distributed::kRequestSend)));
-    }
-
-    it = rpc_call_map.find(distributed::kRequestGet);
-    if (it != rpc_call_map.end()) {
-      request_get_h_ = it->second;
-      get_threads_.reset(new paddle::framework::ThreadPool(
-          rpc_server_->GetThreadNum(distributed::kRequestGet)));
-    }
-
-    it = rpc_call_map.find(distributed::kRequestGetNoBarrier);
-    if (it != rpc_call_map.end()) {
-      request_getnobarrier_h_ = it->second;
-      getnobarrier_threads_.reset(new paddle::framework::ThreadPool(
-          rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier)));
-    }
-
-    it = rpc_call_map.find(distributed::kRequestPrefetch);
-    if (it != rpc_call_map.end()) {
-      request_prefetch_h_ = it->second;
-      prefetch_threads_.reset(new paddle::framework::ThreadPool(
-          rpc_server_->GetThreadNum(distributed::kRequestPrefetch)));
-    }
-
-    it = rpc_call_map.find(distributed::kRequestCheckpoint);
-    if (it != rpc_call_map.end()) {
-      request_checkpoint_h_ = it->second;
-      checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool(
-          rpc_server_->GetThreadNum(distributed::kRequestPrefetch)));
-    }
-
-    it = rpc_call_map.find(distributed::kRequestGetMonomerVariable);
-    if (it != rpc_call_map.end()) {
-      request_get_monomer_handler_h_ = it->second;
-    }
-
-    it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier);
-    if (it != rpc_call_map.end()) {
-      request_get_monomer_barrier_handler_h_ = it->second;
-    }
-  }
-
-  virtual ~BRPCServiceImpl() {}
-  void SendVariable(google::protobuf::RpcController* cntl_butil,
-                    const VariableMessage* request, VoidMessage* response,
-                    google::protobuf::Closure* done) override {
-    send_threads_->Run(
-        [=] { _SendVariable(cntl_butil, request, response, done); });
-  }
-
-  void _SendVariable(google::protobuf::RpcController* cntl_butil,
-                     const VariableMessage* request, VoidMessage* response,
-                     google::protobuf::Closure* done) {
-    PADDLE_ENFORCE(request_send_h_ != nullptr,
-                   "RequestSend handler should be registed first!");
-    brpc::ClosureGuard done_guard(done);
-    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
-
-    std::string varname = request->varname();
-    VLOG(3) << "RequestSend var_name:" << varname
-            << ", trainer_id:" << request->trainer_id()
-            << ", from:" << cntl->remote_side();
-
-    distributed::BRPCVariableResponse resp(request_send_h_->scope(),
-                                           request_send_h_->dev_ctx(),
-                                           !request_send_h_->sync_mode());
-    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
-                   "parse iobuf to tensor error!");
-
-    auto scope = resp.GetMutableLocalScope();
-    auto invar = resp.GetVar();
-    int trainer_id = request->trainer_id();
-    paddle::framework::Variable* outvar = nullptr;
-
-    request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id);
-  }
-
-  void GetVariable(google::protobuf::RpcController* cntl_butil,
-                   const VariableMessage* request, VariableMessage* response,
-                   google::protobuf::Closure* done) override {
-    get_threads_->Run(
-        [=] { _GetVariable(cntl_butil, request, response, done); });
-  }
-
-  void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
-                            const VariableMessage* request,
-                            VariableMessage* response,
-                            google::protobuf::Closure* done) override {
-    getnobarrier_threads_->Run(
-        [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); });
-  }
-
-  void _GetVariable(google::protobuf::RpcController* cntl_butil,
-                    const VariableMessage* request, VariableMessage* response,
-                    google::protobuf::Closure* done) {
-    PADDLE_ENFORCE(request_get_h_ != nullptr,
-                   "RequestGet handler should be registed first!");
-
-    brpc::ClosureGuard done_guard(done);
-    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
-
-    std::string varname = request->varname();
-    std::string out_varname = request->out_varname();
-    VLOG(3) << "RequestGet varname:" << varname
-            << ", out_varname:" << out_varname
-            << ", trainer_id:" << request->trainer_id()
-            << ", from:" << cntl->remote_side();
-
-    auto scope = request_get_h_->scope();
-    paddle::framework::Variable* invar = nullptr;
-    int trainer_id = request->trainer_id();
-    paddle::framework::Variable* outvar = nullptr;
-
-    request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id,
-                           out_varname);
-
-    if (outvar) {
-      distributed::SerializeToIOBuf(out_varname, outvar,
-                                    *request_get_h_->dev_ctx(), response,
-                                    &cntl->response_attachment(), "", false);
-    }
-  }
-
-  void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
-                             const VariableMessage* request,
-                             VariableMessage* response,
-                             google::protobuf::Closure* done) {
-    PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr,
-                   "RequestGetNoBarrier handler should be registed first!");
-
-    brpc::ClosureGuard done_guard(done);
-    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
-
-    std::string varname = request->varname();
-    std::string out_varname = request->out_varname();
-    int trainer_id = request->trainer_id();
-
-    VLOG(3) << "RequestGetNoBarrier varname:" << varname
-            << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id
-            << ", from:" << cntl->remote_side();
-
-    auto scope = request_getnobarrier_h_->scope();
-    paddle::framework::Variable* invar = nullptr;
-    paddle::framework::Variable* outvar = nullptr;
-
-    request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id,
-                                    out_varname);
-
-    if (outvar) {
-      distributed::SerializeToIOBuf(
-          out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response,
-          &cntl->response_attachment(), "", false);
-    }
-  }
-
-  void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
-                        const VariableMessage* request,
-                        VariableMessage* response,
-                        google::protobuf::Closure* done) override {
-    prefetch_threads_->Run(
-        [=] { _PrefetchVariable(cntl_butil, request, response, done); });
-  }
-
-  void _PrefetchVariable(google::protobuf::RpcController* cntl_butil,
-                         const VariableMessage* request,
-                         VariableMessage* response,
-                         google::protobuf::Closure* done) {
-    PADDLE_ENFORCE(request_prefetch_h_ != nullptr,
-                   "kRequestPrefetch handler should be registed first!");
-
-    brpc::ClosureGuard done_guard(done);
-    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
-
-    // prefetch process...
-    std::string in_var_name = request->varname();
-    std::string out_var_name = request->out_varname();
-    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
-            << ", out_var_name: " << out_var_name
-            << ", trainer_id:" << request->trainer_id()
-            << ", from:" << cntl->remote_side();
-
-    distributed::BRPCVariableResponse resp(
-        request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true);
-
-    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
-                   "parse iobuf to tensor error!");
-
-    auto scope = resp.GetMutableLocalScope();
-    auto invar = scope->FindVar(in_var_name);
-    std::string table_name = request->table_name();
-    int trainer_id = request->trainer_id();
-    paddle::framework::Variable* outvar = scope->Var(out_var_name);
-
-    request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
-                                out_var_name, table_name);
-
-    distributed::SerializeToIOBuf(out_var_name, outvar,
-                                  *request_prefetch_h_->dev_ctx(), response,
-                                  &cntl->response_attachment(), "", true);
-  }
-
-  void CheckpointNotify(google::protobuf::RpcController* cntl_butil,
-                        const VariableMessage* request, VoidMessage* response,
-                        google::protobuf::Closure* done) override {
-    checkpoint_notify_threads_->Run(
-        [=] { _CheckpointNotify(cntl_butil, request, response, done); });
-  }
-
-  void _CheckpointNotify(google::protobuf::RpcController* cntl_butil,
-                         const VariableMessage* request, VoidMessage* response,
-                         google::protobuf::Closure* done) {
-    PADDLE_ENFORCE(
-        request_checkpoint_h_ != nullptr,
-        "kRequestCheckpointNotify handler should be registed first!");
-
-    brpc::ClosureGuard done_guard(done);
-    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
-
-    distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(),
-                                           request_checkpoint_h_->dev_ctx());
-
-    auto scope = resp.GetMutableLocalScope();
-
-    std::string checkpoint_notify = request->varname();
-    std::string checkpoint_dir = request->out_varname();
-    int trainer_id = request->trainer_id();
-
-    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
-            << ", dir: " << checkpoint_dir
-            << ", trainer_id:" << request->trainer_id()
-            << ", from:" << cntl->remote_side();
-
-    request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr,
-                                  trainer_id, checkpoint_dir);
-  }
-
-  void GetMonomerVariable(google::protobuf::RpcController* cntl_butil,
-                          const VariableMessage* request,
-                          VariableMessage* response,
-                          google::protobuf::Closure* done) override {
-    PADDLE_ENFORCE(
-        request_get_monomer_handler_h_ != nullptr,
-        "kRequestGetMonomerVariable handler should be registed first!");
-
-    brpc::ClosureGuard done_guard(done);
-    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
-
-    // proc request.
-    std::string varname = request->varname();
-    VLOG(3) << "GetMonomerVariable " << varname
-            << ", trainer_id:" << request->trainer_id()
-            << ", from:" << cntl->remote_side();
-
-    rpc_server_->WaitVarCond(varname);
-    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);
-
-    auto scope = h.scope_;
-    auto invar = scope->FindVar(varname);
-    paddle::framework::Variable* outvar = nullptr;
-
-    request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar,
-                                           request->trainer_id());
-
-    if (outvar) {
-      distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response,
-                                    &cntl->response_attachment(), "", false);
-    }
-  }
-
-  void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil,
-                         const VariableMessage* request, VoidMessage* response,
-                         google::protobuf::Closure* done) override {
-    PADDLE_ENFORCE(
-        request_get_monomer_barrier_handler_h_ != nullptr,
-        "RequestGetMonomerBarrier handler should be registed first!");
-
-    brpc::ClosureGuard done_guard(done);
-    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
-
-    std::string varname = request->varname();
-    VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname
-            << ", trainer_id:" << request->trainer_id()
-            << ", from:" << cntl->remote_side();
-
-    rpc_server_->WaitVarCond(varname);
-    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);
-
-    paddle::framework::Scope* scope = nullptr;
-    paddle::framework::Variable* invar = nullptr;
-    paddle::framework::Variable* outvar = nullptr;
-
-    request_get_monomer_barrier_handler_h_->Handle(
-        varname, scope, invar, &outvar, request->trainer_id());
-  }
-
- private:
-  distributed::RequestHandler* request_send_h_{nullptr};
-  distributed::RequestHandler* request_get_h_{nullptr};
-  distributed::RequestHandler* request_getnobarrier_h_{nullptr};
-  distributed::RequestHandler* request_prefetch_h_{nullptr};
-  distributed::RequestHandler* request_checkpoint_h_{nullptr};
-  distributed::RequestHandler* request_get_monomer_handler_h_{nullptr};
-  distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr};
-
-  distributed::RPCServer* rpc_server_{nullptr};
-
-  // FIXME(gongwb): brpc should support process one rpc use one threadpool.
-  std::unique_ptr<paddle::framework::ThreadPool> send_threads_;
-  std::unique_ptr<paddle::framework::ThreadPool> get_threads_;
-  std::unique_ptr<paddle::framework::ThreadPool> getnobarrier_threads_;
-  std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_;
-  std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_;
-};
-}  // namespace sendrecv
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-void AsyncBRPCServer::StartServer() {
-  // Instance of your service.
-  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this);
-
-  // Add the service into server. Notice the second parameter, because the
-  // service is put on stack, we don't want server to delete it, otherwise
-  // use brpc::SERVER_OWNS_SERVICE.
-  if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
-    LOG(FATAL) << "Fail to add service";
-    return;
-  }
-
-  brpc::ServerOptions options;
-#ifdef PADDLE_WITH_BRPC_RDMA
-  options.use_rdma = true;
-#endif
-  options.idle_timeout_sec = idle_timeout_s_;
-  options.max_concurrency = max_concurrency_;
-  if (server_.Start(bind_address_.c_str(), &options) != 0) {
-    LOG(FATAL) << "Fail to start EchoServer" << bind_address_;
-    return;
-  }
-
-  butil::EndPoint ep = server_.listen_address();
-  selected_port_ = ep.port;
-
-  {
-    std::lock_guard<std::mutex> lock(this->mutex_ready_);
-    ready_ = 1;
-  }
-  condition_ready_.notify_all();
-
-  server_.Join();
-}
-
-void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); }
-
-void AsyncBRPCServer::WaitServerReady() {
-  VLOG(3) << "AsyncGRPCServer is wait server ready";
-  std::unique_lock<std::mutex> lock(this->mutex_ready_);
-  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
-}
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h
deleted file mode 100644
index 78bbe5adc0813d7cf29963c78947d52bcaea9643..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_server.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <condition_variable>  // NOLINT
-#include <mutex>               // NOLINT
-#include <string>
-
-#include "brpc/server.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-class AsyncBRPCServer final : public RPCServer {
- public:
-  explicit AsyncBRPCServer(const std::string& address, int client_num)
-      : RPCServer(address, client_num), ready_(0) {}
-
-  virtual ~AsyncBRPCServer() {}
-  void StartServer() override;
-  void WaitServerReady() override;
-
- private:
-  void ShutDownImpl() override;
-
-  brpc::Server server_;
-
-  static constexpr int idle_timeout_s_ = -1;
-  static constexpr int max_concurrency_ = 0;
-
-  std::mutex mutex_ready_;
-  std::condition_variable condition_ready_;
-  int ready_;
-};
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc
deleted file mode 100644
index eb78917ad2d8b49f1b2d1f8dfb2cbca8a9a9610d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-namespace pb = ::google::protobuf;
-using vr = ::sendrecv::VariableMessage;
-
-int BRPCVariableResponse::Parse(Source* source) {
-  pb::io::ZeroCopyInputStream* input_stream = source->contents();
-  pb::io::CodedInputStream input(input_stream);
-  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
-
-  while (1) {
-    unsigned int tag = 0;
-    if (!input.ReadLittleEndian32(&tag)) {
-      break;
-    }
-
-    uint64_t num_bytes = 0;
-    if (!input.ReadLittleEndian64(&num_bytes)) {
-      break;
-    }
-
-    int field = static_cast<int>(tag);
-    int ret = field == 0 ? -1 : field;
-    switch (field) {
-      case vr::kSerializedFieldNumber: {
-        if (!ProcSerializedField(field, &input, num_bytes)) {
-          return ret;
-        }
-        break;
-      }
-      case vr::kRowsFieldNumber: {
-        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                        meta_.type() == sendrecv::LOD_TENSOR) &&
-                           meta_.varname() != "",
-                       "meta info should be got first!");
-
-        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
-          return ret;
-        }
-        break;
-      }
-      default: {
-        PADDLE_ENFORCE(false, "not surpported %u fieldnumber", field);
-        return ret;
-      }
-    }
-  }
-
-  return 0;
-}
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h
deleted file mode 100644
index 6282f08a725367f74dbcf1fa6a2ad49469d64725..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h
+++ /dev/null
@@ -1,67 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-#include "brpc/channel.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-class BRPCSourceWrapper : public Source {
- public:
-  explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {}
-  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
-    return &source_;
-  }
-
- private:
-  butil::IOBufAsZeroCopyInputStream source_;
-};
-
-class BRPCVariableResponse : public VariableResponse {
- public:
-  BRPCVariableResponse(const framework::Scope* scope,
-                       const platform::DeviceContext* dev_ctx,
-                       bool create_scope = false)
-      : VariableResponse(scope, dev_ctx, create_scope) {}
-
-  virtual ~BRPCVariableResponse() {}
-
-  // parse attachment from iobuf
-  int Parse(Source* source) override;
-  int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) {
-    BRPCSourceWrapper wrapper(iobuf);
-    return VariableResponse::Parse(&wrapper, meta);
-  }
-};
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc
deleted file mode 100644
index 6d3f53431113621fc859eda8e7448383772d20a3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/collective_client.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <condition_variable>  // NOLINT
-#include <string>
-#include "gflags/gflags.h"
-
-#include "paddle/fluid/operators/distributed/collective_client.h"
-
-DECLARE_int32(rpc_deadline);
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-std::once_flag CollectiveClient::init_flag_;
-std::unique_ptr<CollectiveClient> CollectiveClient::client_(nullptr);
-
-bool CollectiveClient::Gather(const std::vector<RemoteVar>& remote_vars,
-                              std::vector<const framework::SelectedRows*>* dst,
-                              const platform::DeviceContext& ctx,
-                              framework::Scope* scope, int64_t time_out) {
-  for (auto r : remote_vars) {
-    VLOG(50) << "begin gather from ep:" << r.String();
-    scope->Var(r.var_name_)->GetMutable<framework::SelectedRows>();
-    VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable(
-        r.ep_, ctx, *scope, r.var_name_, time_out);
-  }
-
-  rpc_client_->Wait();
-
-  for (auto r : remote_vars) {
-    auto select_rows =
-        scope->FindVar(r.var_name_)->GetMutable<framework::SelectedRows>();
-    dst->push_back(select_rows);
-
-    VLOG(4) << "gather from ep:" << r.String()
-            << ", select_rows:" << GetSelectedRowsInfo(*select_rows);
-
-    rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_);
-  }
-
-  rpc_client_->Wait();
-  return true;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h
deleted file mode 100644
index 6a3a450a1fd2e52c341f824f4816ca13784bda85..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/collective_client.h
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <condition_variable>  // NOLINT
-#include <string>
-#include <vector>
-#include "gflags/gflags.h"
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-
-DECLARE_int32(rpc_deadline);
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) {
-  std::stringstream ss;
-  ss << ", height:" << slr.height() << ", rows:[";
-  for (unsigned int i = 0; i < slr.rows().size(); i++) {
-    if (i != slr.rows().size() - 1) {
-      ss << slr.rows()[i] << ",";
-    } else {
-      ss << slr.rows()[i];
-    }
-  }
-  ss << "], dims:" << slr.value().dims();
-  return ss.str();
-}
-
-struct RemoteVar {
-  std::string ep_;
-  std::string var_name_;
-  int trainer_id_{0};
-
-  std::string String() {
-    std::stringstream ss;
-    ss << "ep:" << ep_ << ", var_name:" << var_name_
-       << ", trainer_id:" << trainer_id_;
-
-    return ss.str();
-  }
-};
-
-class CollectiveClient {
- public:
-  CollectiveClient() {
-    rpc_client_.reset(new RPCCLIENT_T());
-    rpc_client_->InitImpl();
-  }
-  virtual ~CollectiveClient() {}
-
-  // note this function will retain the rank order.
-  bool Gather(const std::vector<RemoteVar>& remote_vars,
-              std::vector<const framework::SelectedRows*>* dst,
-              const platform::DeviceContext& ctx, framework::Scope* scope,
-              int64_t time_out = FLAGS_rpc_deadline);
-
-  static CollectiveClient* GetInstance() {
-    std::call_once(init_flag_, [&]() {
-      if (client_.get() == nullptr) {
-        client_.reset(new CollectiveClient());
-      }
-    });
-    return client_.get();
-  }
-
- private:
-  std::unique_ptr<RPCClient> rpc_client_;
-
-  static std::once_flag init_flag_;
-  static std::unique_ptr<CollectiveClient> client_;
-};
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc
deleted file mode 100644
index c95652400c27acd406ca3f70a0dfa8d329e94358..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/collective_server.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdio.h>  // for removing the port file
-#include <csignal>
-#include <cstdlib>
-#include <fstream>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "paddle/fluid/operators/distributed/collective_server.h"
-
-DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get");
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-std::once_flag CollectiveServer::init_flag_;
-std::shared_ptr<CollectiveServer> CollectiveServer::collective_server_(nullptr);
-
-CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) {
-  VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in;
-  rpc_server_.reset(new RPCSERVER_T(end_point, fan_in));
-}
-
-void CollectiveServer::Stop() {
-  rpc_server_->ShutDown();
-  server_thread_->join();
-  loop_thread_->join();
-}
-
-void CollectiveServer::StartServer() {
-  get_monomer_handler_.reset(new GetMonomerHandler());
-  get_monomer_handler_->SetRPCServer(rpc_server_.get());
-
-  get_barrier_handler_.reset(new GetMonomerBarrierHandler());
-  get_barrier_handler_->SetRPCServer(rpc_server_.get());
-
-  rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable,
-                           get_monomer_handler_.get(),
-                           FLAGS_collective_get_thread_num);
-  rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier,
-                           get_barrier_handler_.get(), 1);
-
-  server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); }));
-  rpc_server_->WaitServerReady();
-
-  loop_thread_.reset(new std::thread([&]() {
-    while (true) {
-      if (rpc_server_->IsExit()) {
-        LOG(WARNING) << "get exit!rpc_processor break!";
-        break;
-      }
-      sleep(1);
-    }
-    VLOG(1) << "CollectiveServer loop_thread end";
-  }));
-}
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h
deleted file mode 100644
index 03c688a78e1cbaba4afe4585e619956188a767a1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/collective_server.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <set>
-#include <string>
-#include <thread>  // NOLINT
-#include <utility>
-#include <vector>
-
-#include "gflags/gflags.h"
-
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-class CollectiveServer;
-
-class GetMonomerHandler final : public RequestHandler {
- public:
-  GetMonomerHandler() : RequestHandler(true) {}
-  virtual ~GetMonomerHandler() {}
-  bool Handle(const std::string& var_name, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id, const std::string& out_var_name = "",
-              const std::string& table_name = "") override {
-    VLOG(50) << "GetMonomerHandler recv " << var_name;
-
-    *outvar = scope->FindVar(var_name);
-    PADDLE_ENFORCE(outvar != nullptr, "%s not found", var_name);
-
-    return true;
-  }
-};
-
-class GetMonomerBarrierHandler final : public RequestHandler {
- public:
-  GetMonomerBarrierHandler() : RequestHandler(true) {}
-  virtual ~GetMonomerBarrierHandler() {}
-  bool Handle(const std::string& var_name, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id, const std::string& out_var_name = "",
-              const std::string& table_name = "") override {
-    VLOG(50) << "GetMonomerHandler recv " << var_name;
-
-    rpc_server_->IncreaseVarBarrier(var_name);
-
-    return true;
-  }
-};
-
-class CollectiveServer final {
- public:
-  explicit CollectiveServer(const std::string& end_point, int fan_in);
-
-  virtual ~CollectiveServer() {}
-
-  void StartServer();
-
-  static CollectiveServer* GetInstance(const std::string& end_point,
-                                       int fan_in) {
-    std::call_once(init_flag_, [&]() {
-      if (collective_server_.get() == nullptr) {
-        collective_server_.reset(new CollectiveServer(end_point, fan_in));
-        collective_server_->StartServer();
-      }
-    });
-
-    return collective_server_.get();
-  }
-
-  std::shared_ptr<RPCServer> GetRPCServer() { return rpc_server_; }
-
-  void Stop();
-
- private:
-  std::unique_ptr<GetMonomerHandler> get_monomer_handler_;
-  std::unique_ptr<GetMonomerBarrierHandler> get_barrier_handler_;
-
-  std::shared_ptr<distributed::RPCServer> rpc_server_;
-  std::shared_ptr<std::thread> server_thread_;
-  std::shared_ptr<std::thread> loop_thread_;
-
-  bool ready_{false};
-
-  static std::once_flag init_flag_;
-  static std::shared_ptr<CollectiveServer> collective_server_;
-};
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc
deleted file mode 100644
index be8c7a7dd40697d4abb8e53215ce09ae6619f18e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/collective_server_test.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <memory>
-#include <string>
-#include <thread>  // NOLINT
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor_util.h"
-
-#include "paddle/fluid/operators/distributed/collective_client.h"
-#include "paddle/fluid/operators/distributed/collective_server.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace framework = paddle::framework;
-namespace platform = paddle::platform;
-namespace distributed = paddle::operators::distributed;
-
-std::unique_ptr<distributed::CollectiveServer> StartServer(
-    const std::string& ep, int fan_in, framework::Scope* scope,
-    platform::DeviceContext* dev_ctx) {
-  distributed::CollectiveServer* server =
-      distributed::CollectiveServer::GetInstance(ep, fan_in);
-
-  auto rpc_server = server->GetRPCServer();
-  rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable,
-                          scope, dev_ctx);
-
-  std::cout << "StartServer return" << std::endl;
-  return std::unique_ptr<distributed::CollectiveServer>(server);
-}
-
-std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-
-  framework::Scope* scope = new framework::Scope();
-  framework::Variable* var = scope->Var("var1");
-  auto* slr = var->GetMutable<framework::SelectedRows>();
-  slr->set_height(20000);
-
-  auto* tensor = slr->mutable_value();
-  auto* rows = slr->mutable_rows();
-
-  tensor->Resize(framework::make_ddim({3, 1024}));
-  tensor->mutable_data<float>(place);
-
-  paddle::operators::math::set_constant(ctx, tensor, 32.7);
-  for (int i = 0; i < 3; ++i) rows->push_back(i);
-
-  std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr);
-
-  return std::unique_ptr<framework::Scope>(scope);
-}
-
-void Gather(const std::vector<distributed::RemoteVar>& vars,
-            platform::DeviceContext* dev_ctx) {
-  distributed::CollectiveClient* client =
-      distributed::CollectiveClient::GetInstance();
-
-  framework::Scope* scope = new framework::Scope();
-  framework::Variable* var = scope->Var("var1");
-  var->GetMutable<framework::SelectedRows>();
-
-  std::vector<const framework::SelectedRows*> dst;
-  client->Gather(vars, &dst, *dev_ctx, scope);
-  std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]);
-  dev_ctx->Wait();
-
-  ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024}));
-  ASSERT_EQ(dst[0]->height(), 20000);
-  ASSERT_EQ(dst[0]->rows().size(), static_cast<size_t>(3));
-  for (int i = 0; i < 3; i++) {
-    ASSERT_EQ(dst[0]->rows()[i], i);
-  }
-
-  std::vector<float> vec;
-  TensorToVector(dst[0]->value(), *dev_ctx, &vec);
-  for (size_t i = 0; i < 3 * 1024; i++) {
-    ASSERT_FLOAT_EQ(vec[i], 32.7);
-  }
-}
-
-TEST(CollectiveServer, GPU) {
-  setenv("http_proxy", "", 1);
-  setenv("https_proxy", "", 1);
-
-  platform::CUDAPlace place;
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-
-  std::string ep = "127.0.0.1:7164";
-  auto scope = GenerateVars(place);
-
-  auto* v1 = scope->FindVar("var1");
-  std::cout << "var1:" << v1 << std::endl;
-
-  auto server = StartServer(ep, 2, scope.get(), &ctx);
-  auto rpc_server = server->GetRPCServer();
-
-  distributed::RemoteVar var;
-  var.ep_ = ep;
-  var.var_name_ = "var1";
-  var.trainer_id_ = 0;
-
-  std::vector<distributed::RemoteVar> vars{var};
-  Gather(vars, &ctx);
-  Gather(vars, &ctx);
-
-  std::cout << "begin WaitVarBarrier" << std::endl;
-  rpc_server->WaitVarBarrier("var1");
-  rpc_server->ClearRegisteredVars();
-  server->Stop();
-
-  scope.release();
-  server.release();
-}
diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc
deleted file mode 100644
index 683d4ca98aad9e241fa3654b8b76c555207fb543..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ /dev/null
@@ -1,346 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/distributed/communicator.h"
-
-#include <gflags/gflags.h>
-#include <paddle/fluid/framework/program_desc.h>
-#include <chrono>  // NOLINT
-#include <thread>  // NOLINT
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/distributed/parameter_recv.h"
-#include "paddle/fluid/operators/distributed/parameter_send.h"
-
-DECLARE_int32(communicator_max_merge_var_num);
-DECLARE_int32(communicator_send_queue_size);
-
-DEFINE_bool(communicator_independent_recv_thread, true,
-            "use an independent to recv vars from parameter server");
-DEFINE_int32(communicator_min_send_grad_num_before_recv, 20,
-             "max grad num to send before recv parameters");
-DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv");
-DEFINE_int32(communicator_send_wait_times, 5,
-             "times that send thread will wait if merge num does not reach "
-             "max_merge_var_num");
-DEFINE_bool(communicator_fake_rpc, false,
-            "fake mode does not really send any thing");
-DEFINE_bool(communicator_merge_sparse_grad, true,
-            "merge sparse gradient before sending");
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-inline double GetCurrentUS() {
-  struct timeval time;
-  gettimeofday(&time, NULL);
-  return 1e+6 * time.tv_sec + time.tv_usec;
-}
-
-std::shared_ptr<Communicator> Communicator::communicator_(nullptr);
-
-Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx,
-                           const RpcCtxMap &recv_varname_to_ctx,
-                           Scope *recv_scope)
-    : send_varname_to_ctx_(send_varname_to_ctx),
-      recv_varname_to_ctx_(recv_varname_to_ctx),
-      recv_scope_(recv_scope) {
-  // get all send information from graph, build vars_to_send
-  VLOG(0) << "communicator_independent_recv_thread: "
-          << FLAGS_communicator_independent_recv_thread;
-  VLOG(0) << "communicator_send_queue_size: "
-          << FLAGS_communicator_send_queue_size;
-  VLOG(0) << "communicator_min_send_grad_num_before_recv: "
-          << FLAGS_communicator_min_send_grad_num_before_recv;
-  VLOG(0) << "communicator_thread_pool_size: "
-          << FLAGS_communicator_thread_pool_size;
-  VLOG(0) << "communicator_send_wait_times: "
-          << FLAGS_communicator_send_wait_times;
-  VLOG(0) << "communicator_max_merge_var_num: "
-          << FLAGS_communicator_max_merge_var_num;
-  VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc;
-  VLOG(0) << "communicator_merge_sparse_grad: "
-          << FLAGS_communicator_merge_sparse_grad;
-
-  if (send_varname_to_ctx.size() == 0) {
-    VLOG(0) << "nothing need to be send, will not start send_thread";
-  } else {
-    send_scope_.reset(new Scope());
-    for (auto &iter : send_varname_to_ctx_) {
-      send_varname_to_queue_[iter.first] =
-          std::make_shared<BlockingQueue<std::shared_ptr<Variable>>>(
-              FLAGS_communicator_send_queue_size);
-    }
-    send_threadpool_.reset(
-        new ::ThreadPool(FLAGS_communicator_thread_pool_size));
-  }
-
-  if (recv_varname_to_ctx.size() == 0) {
-    VLOG(0) << "nothing need to be received, will not start recv_thread";
-  } else {
-    recv_threadpool_.reset(
-        new ::ThreadPool(FLAGS_communicator_thread_pool_size));
-  }
-}
-
-Communicator::~Communicator() {
-  if (FLAGS_v >= 3) {
-    std::string msg("~Communicator");
-    fwrite(msg.c_str(), msg.length(), 1, stdout);
-  }
-  running_ = false;
-  if (send_thread_) send_thread_->join();
-  if (recv_thread_) recv_thread_->join();
-  if (FLAGS_v >= 3) {
-    std::string msg("~Communicator done");
-    fwrite(msg.c_str(), msg.length(), 1, stdout);
-  }
-}
-
-void Communicator::SendThread() {
-  VLOG(3) << "SendThread start!";
-  while (running_) {
-    std::vector<std::future<void>> task_futures;
-    task_futures.reserve(send_varname_to_ctx_.size());
-    VLOG(3) << "run send graph";
-    auto before_run_send_graph = GetCurrentUS();
-    for (auto &iter : send_varname_to_queue_) {
-      auto &var_name = iter.first;
-      auto &var_queue = iter.second;
-      if (var_queue->Size() > 0) {
-        auto send_task = [this, &var_name, &var_queue] {
-          VLOG(3) << var_name << " merge and send";
-          std::vector<std::shared_ptr<Variable>> vars;
-          size_t merged_var_num = 0;
-          size_t wait_times = 0;
-          while (merged_var_num < FLAGS_communicator_max_merge_var_num) {
-            if (var_queue->Size() == 0) {
-              VLOG(3) << "wait_times -> " << wait_times;
-              if (wait_times >= FLAGS_communicator_send_wait_times) {
-                break;
-              }
-              std::this_thread::sleep_for(std::chrono::milliseconds(10));
-              wait_times++;
-              continue;
-            } else {
-              wait_times = 0;
-
-              vars.push_back(var_queue->Pop());
-              // only count the send number of the first var
-              if (var_name == send_varname_to_queue_.begin()->first) {
-                grad_num_.fetch_add(1, std::memory_order_relaxed);
-              }
-              merged_var_num++;
-            }
-          }
-          auto before_merge = GetCurrentUS();
-          MergeVars(var_name, vars, send_scope_.get());
-          auto after_merge = GetCurrentUS();
-          VLOG(3) << "merge " << merged_var_num << " " << var_name
-                  << " use time " << after_merge - before_merge;
-          auto send_functor = distributed::ParameterSend<float>();
-          auto &ctx = send_varname_to_ctx_.at(var_name);
-          if (!FLAGS_communicator_fake_rpc) {
-            send_functor(ctx, *send_scope_, true);
-          }
-          auto after_send = GetCurrentUS();
-          VLOG(3) << "send " << var_name << " use time "
-                  << after_send - after_merge;
-        };
-        task_futures.emplace_back(
-            send_threadpool_->enqueue(std::move(send_task)));
-      } else {
-        VLOG(4) << var_name << " queue empty";
-      }
-    }
-    for (auto &task_f : task_futures) {
-      task_f.wait();
-    }
-    auto after_run_send_graph = GetCurrentUS();
-
-    VLOG(3) << "run send graph use time "
-            << after_run_send_graph - before_run_send_graph;
-    RecvNonIndependent();
-  }
-  VLOG(0) << "communicator stopped, send thread exit";
-}
-
-void Communicator::RecvNonIndependent() {
-  if (FLAGS_communicator_independent_recv_thread) {
-    return;
-  }
-
-  auto grad_num = grad_num_.load();
-  if (grad_num > 0) {
-    RecvAll();
-    grad_num_.store(0);
-  } else {
-    std::this_thread::sleep_for(std::chrono::milliseconds(10));
-  }
-}
-
-void Communicator::RecvAll() {
-  VLOG(3) << "parallel run recv graph";
-  if (!running_) return;
-  auto before_send = GetCurrentUS();
-  std::vector<std::future<void>> task_futures;
-  task_futures.reserve(recv_varname_to_ctx_.size());
-  for (auto &iter : recv_varname_to_ctx_) {
-    auto recv_task = [this, &iter] {
-      auto &var_name = iter.first;
-      VLOG(4) << "recv var " << var_name;
-      auto recv_functor = distributed::ParameterRecv<float>();
-      if (!FLAGS_communicator_fake_rpc) {
-        recv_functor(iter.second, *recv_scope_);
-      }
-    };
-    task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task)));
-  }
-  for (auto &task : task_futures) {
-    task.wait();
-  }
-  auto after_recv = GetCurrentUS();
-  VLOG(1) << "run recv graph use time " << after_recv - before_send;
-}
-
-void Communicator::RecvThread() {
-  VLOG(3) << "RecvThread start!";
-  while (running_) {
-    auto grad_num = grad_num_.load();
-    if (grad_num > FLAGS_communicator_min_send_grad_num_before_recv) {
-      VLOG(1) << "current grad num " << grad_num;
-      RecvAll();
-      grad_num_.store(0);
-    } else {
-      std::this_thread::sleep_for(std::chrono::milliseconds(10));
-    }
-  }
-  VLOG(0) << "communicator stopped, recv thread exit";
-}
-
-void Communicator::Send(const std::string &var_name,
-                        const framework::Scope &scope) {
-  VLOG(3) << "communicator send " << var_name;
-  // push var into send queue by var_name
-  auto *grad_var = scope.FindVar(var_name);
-  PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited");
-  if (grad_var->IsType<framework::SelectedRows>() &&
-      !FLAGS_communicator_merge_sparse_grad) {
-    auto send_functor = distributed::ParameterSend<float>();
-    auto &ctx = send_varname_to_ctx_.at(var_name);
-    if (!FLAGS_communicator_fake_rpc) {
-      send_functor(ctx, scope, true);
-    }
-  } else {
-    auto tmp_grad_var = std::make_shared<Variable>();
-    framework::CopyVariable(*grad_var, tmp_grad_var.get());
-    auto &queue = send_varname_to_queue_.at(var_name);
-    VLOG(3) << "send " << var_name << " queue size " << queue->Size();
-    queue->Push(tmp_grad_var);
-  }
-}
-
-void Communicator::Init(const paddle::framework::ProgramDesc &program,
-                        Scope *param_scope) {
-  using RpcCtxMap = operators::distributed::RpcCtxMap;
-  VLOG(3) << "ProcessGraph";
-  RpcCtxMap send_varname_to_ctx;
-  RpcCtxMap recv_varname_to_ctx;
-  for (auto *op : program.Block(0).AllOps()) {
-    VLOG(3) << "node name " << op->Type();
-    if (op->Type() == "send") {
-      auto send_var_name = op->Input("X")[0];
-      auto send_varnames = boost::get<std::vector<std::string>>(
-          op->GetNullableAttr("send_varnames"));
-      auto epmap =
-          boost::get<std::vector<std::string>>(op->GetNullableAttr("epmap"));
-      auto height_section =
-          boost::get<std::vector<int64_t>>(op->GetNullableAttr("sections"));
-      auto trainer_id = boost::get<int>(op->GetNullableAttr("trainer_id"));
-      send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(
-          send_var_name, send_varnames, epmap, height_section, trainer_id);
-      VLOG(3) << "find and init an send op: "
-              << send_varname_to_ctx[send_var_name];
-    } else if (op->Type() == "recv") {
-      auto do_not_run = boost::get<int>(op->GetNullableAttr("do_not_run"));
-      PADDLE_ENFORCE_GT(do_not_run, 0, "recv should not run!");
-      auto recv_var_name = op->Output("Out")[0];
-      auto recv_varnames = boost::get<std::vector<std::string>>(
-          op->GetNullableAttr("recv_varnames"));
-      auto epmap =
-          boost::get<std::vector<std::string>>(op->GetNullableAttr("epmap"));
-      auto trainer_id = boost::get<int>(op->GetNullableAttr("trainer_id"));
-      recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(
-          recv_var_name, recv_varnames, epmap, {}, trainer_id);
-    }
-  }
-
-  // init communicator here
-  if (send_varname_to_ctx.size() == 0 && recv_varname_to_ctx.size() == 0) {
-    LOG(WARNING) << "no var need to send and recv!!";
-  }
-  operators::distributed::Communicator::Init(send_varname_to_ctx,
-                                             recv_varname_to_ctx, param_scope);
-}
-
-Communicator *Communicator::GetInstance() { return communicator_.get(); }
-
-std::shared_ptr<Communicator> Communicator::GetInstantcePtr() {
-  return communicator_;
-}
-
-void Communicator::Start() {
-  VLOG(0) << "Communicator start";
-  if (!communicator_) {
-    VLOG(0) << "Communicator is not inited, do nothing";
-  } else {
-    VLOG(1) << "start send thread and recv thread";
-    running_ = true;
-    // start send and recv thread
-    send_thread_.reset(
-        new std::thread(std::bind(&Communicator::SendThread, this)));
-    if (FLAGS_communicator_independent_recv_thread) {
-      recv_thread_.reset(
-          new std::thread(std::bind(&Communicator::RecvThread, this)));
-    }
-  }
-}
-
-void Communicator::Stop() {
-  VLOG(0) << "Communicator stop";
-  running_ = false;
-  if (!communicator_) {
-    VLOG(0) << "Communicator is not inited, do nothing";
-  } else {
-    if (send_thread_) {
-      VLOG(1) << "stop send thread";
-      send_thread_->join();
-      send_thread_.reset(nullptr);
-    }
-    if (recv_thread_) {
-      VLOG(1) << "stop recv thread";
-      recv_thread_->join();
-      recv_thread_.reset(nullptr);
-    }
-  }
-  VLOG(0) << "Communicator stop done";
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h
deleted file mode 100644
index b3079f51c4d6c900b443d023f3cec5b9125427e4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/communicator.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-#include <deque>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include <ThreadPool.h>
-
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/distributed/rpc_common.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using Scope = framework::Scope;
-using Variable = framework::Variable;
-
-template <typename T>
-class BlockingQueue {
- public:
-  explicit BlockingQueue(size_t capacity) : capacity_(capacity) {
-    PADDLE_ENFORCE_GT(capacity_, 0, "The capacity must be greater than 0.");
-  }
-
-  bool Push(const T& elem) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      cv_.wait(lock, [&] { return queue_.size() < capacity_; });
-      PADDLE_ENFORCE_LT(queue_.size(), capacity_);
-      queue_.push_back(elem);
-    }
-    cv_.notify_one();
-    return true;
-  }
-
-  bool Push(T&& elem) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      cv_.wait(lock, [&] { return queue_.size() < capacity_; });
-      PADDLE_ENFORCE_LT(queue_.size(), capacity_);
-      queue_.emplace_back(std::move(elem));
-    }
-    cv_.notify_one();
-    return true;
-  }
-
-  T Pop() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    cv_.wait(lock, [=] { return !queue_.empty(); });
-    T rc(std::move(queue_.front()));
-    queue_.pop_front();
-    cv_.notify_one();
-    return rc;
-  }
-
-  size_t Cap() const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return capacity_;
-  }
-
-  size_t Size() const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return queue_.size();
-  }
-
- private:
-  const size_t capacity_;
-  std::deque<T> queue_;
-
-  mutable std::mutex mutex_;
-  std::condition_variable cv_;
-};
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-inline void MergeVars(const std::string& var_name,
-                      const std::vector<std::shared_ptr<Variable>>& vars,
-                      Scope* scope) {
-  PADDLE_ENFORCE(!vars.empty(), "should have value to merge!");
-  auto cpu_place = platform::CPUPlace();
-  auto& var0 = vars[0];
-  auto* out_var = scope->Var(var_name);
-  if (var0->IsType<framework::LoDTensor>()) {
-    auto dims = var0->Get<framework::LoDTensor>().dims();
-    VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims;
-
-    // init output tensor
-    auto* out_t = out_var->GetMutable<framework::LoDTensor>();
-    out_t->mutable_data<float>(dims, cpu_place);
-
-    // check the input dims
-    for (auto& var : vars) {
-      auto& var_t = var->Get<framework::LoDTensor>();
-      PADDLE_ENFORCE_EQ(var_t.dims(), dims, "should have the same dims");
-    }
-
-    // set output tensor to 0.
-    auto cpu_ctx = paddle::platform::CPUDeviceContext();
-    math::SetConstant<paddle::platform::CPUDeviceContext, float>
-        constant_functor;
-    constant_functor(cpu_ctx, out_t, static_cast<float>(0));
-
-    // sum all vars to out
-    auto result = EigenVector<float>::Flatten(*out_t);
-    for (auto& var : vars) {
-      auto& in_t = var->Get<framework::LoDTensor>();
-      auto in = EigenVector<float>::Flatten(in_t);
-      result.device(*cpu_ctx.eigen_device()) = result + in;
-    }
-    result.device(*cpu_ctx.eigen_device()) =
-        result / static_cast<float>(vars.size());
-  } else if (var0->IsType<framework::SelectedRows>()) {
-    auto& slr0 = var0->Get<framework::SelectedRows>();
-    auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
-    out_slr->mutable_rows()->clear();
-    out_slr->mutable_value()->mutable_data<float>({{}}, cpu_place);
-    std::vector<const paddle::framework::SelectedRows*> inputs;
-    inputs.reserve(vars.size());
-    for (auto& var : vars) {
-      inputs.push_back(&var->Get<framework::SelectedRows>());
-    }
-    auto dev_ctx = paddle::platform::CPUDeviceContext();
-    math::scatter::MergeAverage<paddle::platform::CPUDeviceContext, float>
-        merge_average;
-    merge_average(dev_ctx, inputs, out_slr);
-    VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height()
-            << " dims: " << slr0.value().dims();
-  } else {
-    PADDLE_THROW("unsupported var type!");
-  }
-}
-
-using RpcCtxMap = std::unordered_map<std::string, RpcContext>;
-
-class Communicator {
- public:
-  Communicator(const RpcCtxMap& send_varname_to_ctx,
-               const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope);
-
-  ~Communicator();
-
-  void Start();
-  void Stop();
-
-  bool IsRunning() { return running_; }
-
-  // send grad
-  void Send(const std::string& var_name, const framework::Scope& scope);
-
- private:
-  // recv all parameter
-  void RecvAll();
-  void RecvNonIndependent();
-  void SendThread();
-  void RecvThread();
-
-  bool running_ = false;
-  std::unordered_map<std::string,
-                     std::shared_ptr<BlockingQueue<std::shared_ptr<Variable>>>>
-      send_varname_to_queue_;
-  RpcCtxMap send_varname_to_ctx_;
-  RpcCtxMap recv_varname_to_ctx_;
-  std::unique_ptr<std::thread> send_thread_{nullptr};
-  std::unique_ptr<std::thread> recv_thread_{nullptr};
-  Scope* recv_scope_;                  // should be global scope
-  std::unique_ptr<Scope> send_scope_;  // an independent scope
-  std::unique_ptr<::ThreadPool> send_threadpool_{nullptr};
-  std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr};
-  std::atomic_uint grad_num_{0};  // the num of gradient sent since last recv
-
-  // the following code is for initialize the commnunicator
- public:
-  static void Init(const RpcCtxMap& send_varname_to_ctx,
-                   const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) {
-    if (communicator_ == nullptr) {
-      communicator_.reset(new Communicator(send_varname_to_ctx,
-                                           recv_varname_to_ctx, recv_scope));
-    }
-  }
-
-  static void Init(const paddle::framework::ProgramDesc& program,
-                   Scope* param_scope);
-
-  static Communicator* GetInstance();
-
-  static std::shared_ptr<Communicator> GetInstantcePtr();
-
- private:
-  static std::shared_ptr<Communicator> communicator_;
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc
deleted file mode 100644
index 66e36d012b10a0e1d627ee44dcde9e68f66cc719..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/communicator_test.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <memory>
-#include <vector>
-
-#include "paddle/fluid/operators/distributed/communicator.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-
-TEST(communicator, merge_lod_tensors) {
-  auto cpu_place = platform::CPUPlace();
-  auto dims = framework::make_ddim({2, 3});
-  std::vector<std::shared_ptr<framework::Variable>> in_vars;
-  float out_value = 0;
-  for (auto i = 0; i < 10; ++i) {
-    auto var = std::make_shared<Variable>();
-    in_vars.emplace_back(var);
-    auto *tensor = var->GetMutable<LoDTensor>();
-    auto *data = tensor->mutable_data<float>(dims, cpu_place);
-    for (auto j = 0; j < tensor->numel(); ++j) {
-      data[j] = static_cast<float>(i);
-    }
-    out_value += static_cast<float>(i);
-  }
-  out_value = out_value / 10.0;
-  const std::string out_name = "Out";
-  std::unique_ptr<framework::Scope> scope;
-  scope.reset(new framework::Scope());
-  scope->Var(out_name);
-  for (auto i = 0; i < 10; ++i) {
-    MergeVars(out_name, in_vars, scope.get());
-  }
-  auto &out_tensor = scope->FindVar(out_name)->Get<LoDTensor>();
-  auto *out_data = out_tensor.data<float>();
-  ASSERT_EQ(out_tensor.dims(), dims);
-  for (auto i = 0; i < out_tensor.numel(); ++i) {
-    ASSERT_EQ(out_data[i], out_value);
-  }
-}
-
-TEST(communicator, merge_selected_rows) {
-  auto cpu_place = platform::CPUPlace();
-  int64_t width = 10;
-  std::vector<std::shared_ptr<framework::Variable>> in_vars;
-  const int64_t height = 100;
-  for (auto i = 0; i < 10; ++i) {
-    std::vector<int64_t> rows;
-    for (auto k = 0; k <= i; ++k) {
-      rows.push_back(k);
-    }
-    auto var = std::make_shared<Variable>();
-    in_vars.emplace_back(var);
-    auto *slr = var->GetMutable<SelectedRows>();
-    slr->set_height(height);
-    slr->set_rows(rows);
-    auto dims =
-        framework::make_ddim({static_cast<int64_t>(rows.size()), width});
-    auto *data = slr->mutable_value()->mutable_data<float>(dims, cpu_place);
-    for (auto i = 0; i < rows.size(); ++i) {
-      for (auto j = 0; j < width; ++j) {
-        data[i * width + j] = static_cast<float>(rows[i]);
-      }
-    }
-  }
-  const std::string out_name = "Out";
-  std::unique_ptr<framework::Scope> scope;
-  scope.reset(new framework::Scope());
-  scope->Var(out_name);
-  for (auto i = 0; i < 10; ++i) {
-    MergeVars(out_name, in_vars, scope.get());
-  }
-  auto &out_slr = scope->FindVar(out_name)->Get<SelectedRows>();
-  auto &out_t = out_slr.value();
-  auto *out_data = out_t.data<float>();
-  ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width}));
-  std::vector<float> out_values;
-  out_values.reserve(10);
-  for (auto i = 0; i < 10; ++i) {
-    out_values.push_back(static_cast<float>((i * (10 - i)) / 10.0));
-  }
-  for (auto i = 0; i < out_slr.rows().size(); ++i) {
-    ASSERT_EQ(out_slr.rows()[i], i);
-    for (auto j = 0; j < width; ++j) {
-      ASSERT_EQ(out_data[i * width + j], out_values[i]);
-    }
-  }
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/distributed.h b/paddle/fluid/operators/distributed/distributed.h
deleted file mode 100644
index 3a9f92259875749ab2ddf26c18cd230c58a61c44..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/distributed.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-
-#ifdef PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::GRPCClient
-
-#else  // PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::BRPCClient
-
-#endif  // PADDLE_WITH_GRPC
-
-#endif  // PADDLE_WITH_DISTRIBUTE
diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/operators/distributed/distributed_pb.h
deleted file mode 100644
index f1c662be9af67b418e17987e4eb1ff0a2809c3e3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/distributed_pb.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-
-#ifdef PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
-#else  // PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
-#endif  // PADDLE_WITH_GRPC
-
-#endif  // PADDLE_WITH_DISTRIBUTE
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
deleted file mode 100644
index c2cb0d7f04eb67275030e841740f0cdb291d9f87..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-//       (https://github.com/tensorflow/tensorflow/) we borrow this
-//       file and did some modifications so that we can send gRPC
-//       requests without too much copying of the tensor data.
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-GrpcByteBufferSource::GrpcByteBufferSource() {}
-
-bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
-  cur_ = -1;
-  left_ = 0;
-  ptr_ = nullptr;
-  byte_count_ = 0;
-  bool ok = src.Dump(&slices_).ok();
-  if (!ok) {
-    slices_.clear();
-  }
-  return ok;
-}
-
-bool GrpcByteBufferSource::Next(const void** data, int* size) {
-  // Use loop instead of if in case buffer contained empty slices.
-  while (left_ == 0) {
-    // Advance to next slice.
-    cur_++;
-    if (cur_ >= slices_.size()) {
-      return false;
-    }
-    const ::grpc::Slice& s = slices_[cur_];
-    left_ = s.size();
-    ptr_ = reinterpret_cast<const char*>(s.begin());
-  }
-
-  *data = ptr_;
-  *size = left_;
-  byte_count_ += left_;
-  ptr_ += left_;
-  left_ = 0;
-  return true;
-}
-
-void GrpcByteBufferSource::BackUp(int count) {
-  ptr_ -= count;
-  left_ += count;
-  byte_count_ -= count;
-}
-
-bool GrpcByteBufferSource::Skip(int count) {
-  const void* data;
-  int size;
-  while (Next(&data, &size)) {
-    if (size >= count) {
-      BackUp(size - count);
-      return true;
-    }
-    // size < count;
-    count -= size;
-  }
-  // error or we have too large count;
-  return false;
-}
-
-google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
-  return byte_count_;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
deleted file mode 100644
index e9074574cdd163bbf7e62939df9283352706f840..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-//       (https://github.com/tensorflow/tensorflow/) we borrow this
-//       file and did some modifications so that we can send gRPC
-//       requests without too much copying of the tensor data.
-
-#pragma once
-
-#include <vector>
-
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "grpc++/grpc++.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-
-namespace grpc {
-// A ZeroCopyInputStream that reads from grpc_byte_buffer
-class GrpcBufferReader final
-    : public ::google::protobuf::io::ZeroCopyInputStream {
-  typedef void (CoreCodegenInterface::*OldReaderInitAPI)(
-      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
-  typedef int (CoreCodegenInterface::*NewReaderInitAPI)(
-      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
-  void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
-                  grpc_byte_buffer* buffer) {
-    (g_core_codegen_interface->*ptr)(reader, buffer);
-  }
-  void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
-                  grpc_byte_buffer* buffer) {
-    int result = (g_core_codegen_interface->*ptr)(reader, buffer);
-    (void)result;
-  }
-
- public:
-  explicit GrpcBufferReader(grpc_byte_buffer* buffer)
-      : byte_count_(0), backup_count_(0) {
-    ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_,
-               buffer);
-  }
-  ~GrpcBufferReader() override {
-    g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_);
-  }
-
-  bool Next(const void** data, int* size) override {
-    if (backup_count_ > 0) {
-      *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
-              backup_count_;
-      GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
-      *size = static_cast<int>(backup_count_);
-      backup_count_ = 0;
-      return true;
-    }
-    if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_,
-                                                                &slice_)) {
-      return false;
-    }
-    g_core_codegen_interface->grpc_slice_unref(slice_);
-    *data = GRPC_SLICE_START_PTR(slice_);
-    // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = static_cast<int>(GRPC_SLICE_LENGTH(slice_));
-    return true;
-  }
-
-  void BackUp(int count) override { backup_count_ = count; }
-
-  bool Skip(int count) override {
-    const void* data;
-    int size;
-    while (Next(&data, &size)) {
-      if (size >= count) {
-        BackUp(size - count);
-        return true;
-      }
-      // size < count;
-      count -= size;
-    }
-    // error or we have too large count;
-    return false;
-  }
-
-  ::google::protobuf::int64 ByteCount() const override {
-    return byte_count_ - backup_count_;
-  }
-
- private:
-  int64_t byte_count_;
-  int64_t backup_count_;
-  grpc_byte_buffer_reader reader_;
-  grpc_slice slice_;
-};
-
-};  // namespace grpc
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
-class GrpcByteBufferSource
-    : public ::google::protobuf::io::ZeroCopyInputStream {
- public:
-  GrpcByteBufferSource();
-  bool Init(const ::grpc::ByteBuffer& src);  // Can be called multiple times.
-  bool Next(const void** data, int* size) override;
-  void BackUp(int count) override;
-  bool Skip(int count) override;
-  ::google::protobuf::int64 ByteCount() const override;
-
- private:
-  std::vector<::grpc::Slice> slices_;
-  size_t cur_;       // Current slice index.
-  int left_;         // Number of bytes in slices_[cur_] left to yield.
-  const char* ptr_;  // Address of next byte in slices_[cur_] to yield.
-  ::google::protobuf::int64 byte_count_;
-};
-
-class GrpcByteBufferSourceWrapper : public Source {
- public:
-  explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source)
-      : source_(source) {}
-  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
-    return source_;
-  }
-
- private:
-  GrpcByteBufferSource* source_;
-};
-
-class GrpcByteSource : public Source {
- public:
-  explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {}
-  ~GrpcByteSource() override { DeleteStream(); }
-
-  typedef ::grpc::GrpcBufferReader Reader;
-
-  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
-    DeleteStream();
-    stream_ = new (&space_) Reader(buffer_);
-    return stream_;
-  }
-
- private:
-  void DeleteStream() {
-    if (stream_) {
-      stream_->~Reader();
-    }
-  }
-
-  grpc_byte_buffer* buffer_;  // Not owned
-  Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
-  char space_[sizeof(Reader)];
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
deleted file mode 100644
index 053fe202fe9c57441eda66ec10b146c21700b2fc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ /dev/null
@@ -1,535 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdlib.h>
-#include <limits>
-
-#include "glog/logging.h"  // For VLOG
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DECLARE_bool(rpc_disable_reuse_port);
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-void GRPCClient::InitImpl() {
-  // start the client process thread
-  // TODO(wuyi): can make this in a threadpool
-  PADDLE_ENFORCE(client_thread_ == nullptr,
-                 "please not re init proceed thread");
-  client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
-}
-
-void GRPCClient::SendComplete() {
-  std::unique_lock<std::mutex> lk(completed_mutex_);
-  if (!completed_) {
-    for (auto& it : channels_) {
-      VLOG(3) << "send complete message to " << it.first;
-      this->AsyncSendComplete(it.first);
-    }
-    PADDLE_ENFORCE(this->Wait(), "internal grpc error");
-    completed_ = true;
-  }
-}
-
-GRPCClient::~GRPCClient() {
-  stopped_ = true;
-  Wait();
-  cq_.Shutdown();
-  {
-    std::lock_guard<std::mutex> guard(chan_mutex_);
-    for (auto& it : channels_) {
-      it.second.reset();
-    }
-    channels_.clear();
-  }
-  client_thread_->join();
-}
-
-VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
-                                      const platform::DeviceContext& ctx,
-                                      const framework::Scope& scope,
-                                      const std::string& var_name,
-                                      int64_t time_out) {
-  const platform::DeviceContext* p_ctx = &ctx;
-  const std::string ep_val = ep;
-  const std::string var_name_val = var_name;
-  const framework::Scope* p_scope = &scope;
-  const auto ch = GetChannel(ep_val);
-  const std::string method = kSendRPC;
-
-  int retry_times_ = 0;
-
-  while (true) {
-    SendProcessor* s = new SendProcessor(ch);
-    VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
-    s->Prepare(h, time_out);
-
-    framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] {
-      auto* var = p_scope->FindVar(var_name_val);
-
-      ::grpc::ByteBuffer req;
-      SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);
-
-      VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
-
-      // stub context
-      s->response_call_back_ = nullptr;
-
-      platform::RecordRPCEvent record_event(method);
-
-      auto call = s->stub_g_.PrepareUnaryCall(
-          s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req,
-          &cq_);
-      call->StartCall();
-      call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
-
-      if (UNLIKELY(platform::IsProfileEnabled())) {
-        h->Wait();
-      }
-    });
-    req_count_++;
-
-    if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
-      h->Wait();
-      if (h->should_retry) {
-        VLOG(3) << "rpc call failed, retry times " << retry_times_;
-        retry_times_++;
-        std::random_device rd;
-        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
-        continue;
-      }
-    }
-
-    return h;
-  }
-}
-
-void ProcGetResponse(const VarHandle& var_h,
-                     const ::grpc::ByteBuffer& ret_msg) {
-  VLOG(4) << "ProcGetResponse";
-  framework::Variable* outvar = nullptr;
-  // get response's trainer_id is not used
-  int trainer_id;
-  DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar,
-                            &trainer_id);
-}
-
-template <typename T>
-void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
-  ::grpc::Slice slice(proto.ByteSizeLong());
-  proto.SerializeWithCachedSizesToArray(const_cast<uint8_t*>(slice.begin()));
-  ::grpc::ByteBuffer tmp(&slice, 1);
-  result->Swap(&tmp);
-}
-
-VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
-                                     const platform::DeviceContext& ctx,
-                                     const framework::Scope& scope,
-                                     const std::string& var_name,
-                                     const std::string& out_varname,
-                                     const std::string& table_name,
-                                     int64_t time_out) {
-  return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname,
-                      "/sendrecv.SendRecvService/GetVariable", table_name,
-                      time_out);
-}
-
-VarHandlePtr GRPCClient::AsyncGetVarNoBarrier(
-    const std::string& ep, const platform::DeviceContext& ctx,
-    const framework::Scope& scope, const std::string& var_name,
-    const std::string& out_varname, int64_t time_out) {
-  std::string var_name_no_barrier =
-      string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE);
-
-  return _AsyncGetVar(
-      ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname,
-      "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out);
-}
-
-VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
-    const std::string& ep, const platform::DeviceContext& ctx,
-    const framework::Scope& scope, const std::string& var_name,
-    int64_t time_out) {
-  return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name,
-                      "/sendrecv.SendRecvService/GetMonomerVariable", "",
-                      time_out);
-}
-
-VarHandlePtr GRPCClient::_AsyncGetVar(
-    const std::string& ep, const platform::DeviceContext& ctx,
-    const framework::Scope& scope, const std::string& method,
-    const std::string& var_name, const std::string& out_varname,
-    const std::string& rpc_path, const std::string& table_name,
-    int64_t time_out) {
-  const platform::DeviceContext* p_ctx = &ctx;
-  const std::string ep_val = ep;
-  const std::string var_name_val = var_name;
-  const std::string out_varname_val = out_varname;
-  const std::string table_name_val = table_name;
-  const framework::Scope* p_scope = &scope;
-  const auto ch = GetChannel(ep_val);
-
-  int retry_times_ = 0;
-
-  while (true) {
-    GetProcessor* s = new GetProcessor(ch);
-
-    VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));
-    s->Prepare(h, time_out);
-
-    framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s,
-                        method, p_ctx, h, rpc_path, this] {
-      // prepare input
-      sendrecv::VariableMessage req;
-      req.set_varname(var_name_val);
-      req.set_out_varname(out_varname_val);
-      req.set_trainer_id(trainer_id_);
-      req.set_table_name(table_name_val);
-      ::grpc::ByteBuffer buf;
-      RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
-
-      VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
-
-      // stub context
-      s->response_call_back_ = ProcGetResponse;
-
-      platform::RecordRPCEvent record_event(method);
-
-      auto call =
-          s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
-      call->StartCall();
-      call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
-
-      if (UNLIKELY(platform::IsProfileEnabled())) {
-        h->Wait();
-      }
-    });
-    req_count_++;
-
-    if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
-      h->Wait();
-      if (h->should_retry) {
-        VLOG(3) << "rpc call failed, retry times " << retry_times_;
-        retry_times_++;
-        std::random_device rd;
-        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
-        continue;
-      }
-    }
-
-    return h;
-  }
-}
-
-VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
-                                          const platform::DeviceContext& ctx,
-                                          const framework::Scope& scope,
-                                          const std::string& in_var_name,
-                                          const std::string& out_var_name,
-                                          const std::string& table_name,
-                                          int64_t time_out) {
-  const platform::DeviceContext* p_ctx = &ctx;
-  const std::string ep_val = ep;
-  const std::string in_var_name_val = in_var_name;
-  const std::string out_var_name_val = out_var_name;
-  const std::string table_name_val = table_name;
-  const framework::Scope* p_scope = &scope;
-  const auto ch = GetChannel(ep_val);
-
-  const std::string method = kPrefetchRPC;
-  int retry_times_ = 0;
-
-  while (true) {
-    GetProcessor* s = new GetProcessor(ch);
-    VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
-    s->Prepare(h, time_out);
-
-    framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope,
-                        p_ctx, s, method, h, table_name_val, this] {
-      auto* var = p_scope->FindVar(in_var_name_val);
-
-      ::grpc::ByteBuffer req;
-      SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req,
-                            out_var_name_val, 0, table_name_val);
-
-      VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
-
-      // stub context
-      s->response_call_back_ = ProcGetResponse;
-
-      platform::RecordRPCEvent record_event(method);
-
-      auto call = s->stub_g_.PrepareUnaryCall(
-          s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
-          &cq_);
-      call->StartCall();
-      call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
-
-      if (UNLIKELY(platform::IsProfileEnabled())) {
-        h->Wait();
-      }
-    });
-    req_count_++;
-
-    if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
-      h->Wait();
-      if (h->should_retry) {
-        VLOG(3) << "rpc call failed, retry times " << retry_times_;
-        retry_times_++;
-        std::random_device rd;
-        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
-        continue;
-      }
-    }
-
-    return h;
-  }
-}
-
-VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
-                                               int64_t time_out) {
-  const auto ch = GetChannel(ep);
-
-  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  const std::string method = kBatchBarrierRPC;
-  VarHandlePtr h(
-      new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr));
-  s->Prepare(h, time_out);
-
-  sendrecv::VariableMessage req;
-  req.set_varname(BATCH_BARRIER_MESSAGE);
-
-  platform::RecordRPCEvent record_event(method);
-
-  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
-  req_count_++;
-
-  if (UNLIKELY(platform::IsProfileEnabled())) {
-    h->Wait();
-  }
-
-  return h;
-}
-
-VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
-                                               int64_t time_out) {
-  const auto ch = GetChannel(ep);
-  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
-  const std::string method = kFetchBarrierRPC;
-  VarHandlePtr h(
-      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
-  s->Prepare(h, time_out);
-
-  sendrecv::VariableMessage req;
-  req.set_varname(FETCH_BARRIER_MESSAGE);
-
-  platform::RecordRPCEvent record_event(method);
-
-  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
-  req_count_++;
-
-  if (UNLIKELY(platform::IsProfileEnabled())) {
-    h->Wait();
-  }
-
-  return h;
-}
-
-VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
-                                                const std::string& var_name,
-                                                int64_t time_out) {
-  const auto ch = GetChannel(ep);
-  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  const std::string method = kSendMonomerFetchBarrierRPC;
-  VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr));
-  s->Prepare(h, time_out);
-
-  VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
-
-  sendrecv::VariableMessage req;
-  req.set_varname(var_name);
-
-  platform::RecordRPCEvent record_event(method);
-
-  auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
-  req_count_++;
-
-  if (UNLIKELY(platform::IsProfileEnabled())) {
-    h->Wait();
-  }
-
-  return h;
-}
-
-VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
-                                           int64_t time_out) {
-  const auto ch = GetChannel(ep);
-
-  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  const std::string method = kSendCompleteRPC;
-  VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr));
-  s->Prepare(h, time_out);
-
-  sendrecv::VariableMessage req;
-  req.set_varname(COMPLETE_MESSAGE);
-
-  platform::RecordRPCEvent record_event(method);
-
-  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
-  req_count_++;
-
-  if (UNLIKELY(platform::IsProfileEnabled())) {
-    h->Wait();
-  }
-
-  return h;
-}
-
-VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
-                                               const std::string& dir,
-                                               int64_t time_out) {
-  const auto ch = GetChannel(ep);
-
-  CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
-
-  const std::string method = kCheckPointNotifyRPC;
-
-  VarHandlePtr h(
-      new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr));
-  s->Prepare(h, time_out);
-
-  sendrecv::VariableMessage req;
-  req.set_varname(CHECKPOINT_SAVE_MESSAGE);
-  req.set_out_varname(dir);
-
-  platform::RecordRPCEvent record_event(method);
-
-  auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
-  req_count_++;
-
-  if (UNLIKELY(platform::IsProfileEnabled())) {
-    h->Wait();
-  }
-
-  return h;
-}
-
-bool GRPCClient::Wait() {
-  std::unique_lock<std::mutex> lk(sync_mutex_);
-  sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
-  return ok_;
-}
-
-void GRPCClient::Proceed() {
-  void* tag = nullptr;
-  bool ok = false;
-
-  VLOG(3) << "GRPCClient Proceed begin";
-  while (!stopped_ && cq_.Next(&tag, &ok)) {
-    BaseProcessor* c = static_cast<BaseProcessor*>(tag);
-    GPR_ASSERT(ok);
-    PADDLE_ENFORCE(c);
-
-    if (c->status_.ok()) {
-      VLOG(3) << c->GetVarHandlePtr()->String() << " process";
-      c->Process();
-    } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
-      LOG(FATAL) << c->GetVarHandlePtr()->String()
-                 << " meets grpc error, error_code:" << c->status_.error_code()
-                 << " error_message:" << c->status_.error_message()
-                 << " error_details:" << c->status_.error_details();
-      {
-        std::lock_guard<std::mutex> lk(sync_mutex_);
-        ok_ = false;
-      }
-      c->Finish(false);
-    } else if (c->status_.error_code() == grpc::StatusCode::UNAVAILABLE) {
-      VLOG(3) << c->GetVarHandlePtr()->String()
-              << " meets grpc error, error_code:" << c->status_.error_code()
-              << " error_message:" << c->status_.error_message()
-              << " error_details:" << c->status_.error_details()
-              << " should retry!";
-      c->GetVarHandlePtr()->should_retry = true;
-      c->Finish(false);
-    } else {
-      LOG(FATAL) << c->GetVarHandlePtr()->String()
-                 << " meets grpc error, error_code:" << c->status_.error_code()
-                 << " error_message:" << c->status_.error_message()
-                 << " error_details:" << c->status_.error_details();
-
-      c->Finish(false);
-    }
-
-    bool notify = false;
-    {
-      std::lock_guard<std::mutex> lk(sync_mutex_);
-      req_count_--;
-      notify = (req_count_ <= 0 || !c->status_.ok());
-    }
-
-    delete c;
-
-    if (notify) {
-      sync_cond_.notify_all();
-    }
-  }
-
-  // Last log message
-  // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a
-  // static Mutex log_mutex is used for synchronization, which might have been
-  // destructed at this moment.
-  if (FLAGS_v >= 3) {
-    std::string msg("GRPCClient Proceed end");
-    fwrite(msg.c_str(), msg.length(), 1, stderr);
-  }
-}
-
-std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
-  std::lock_guard<std::mutex> guard(chan_mutex_);
-  auto it = channels_.find(ep);
-  if (it != channels_.end()) {
-    return it->second;
-  }
-
-  // Channel configurations:
-  grpc::ChannelArguments args;
-  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
-  if (FLAGS_rpc_disable_reuse_port) {
-    args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
-  }
-  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
-  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
-  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
-
-  auto ch =
-      grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args);
-  channels_[ep] = ch;
-  return ch;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h
deleted file mode 100644
index ad2f04a6d1dda34e35b67b21dce8ac612ff697a0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.h
+++ /dev/null
@@ -1,272 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <time.h>
-#include <atomic>
-
-#include <chrono>              // NOLINT
-#include <condition_variable>  // NOLINT
-#include <ctime>
-#include <functional>
-#include <iostream>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <thread>  // NOLINT
-#include <unordered_map>
-#include <vector>
-
-#include "grpc++/channel.h"
-#include "grpc++/generic/generic_stub.h"
-#include "grpc++/grpc++.h"
-#include "grpc++/support/byte_buffer.h"
-#include "grpc++/support/slice.h"
-#include "grpc/support/log.h"
-#include "paddle/fluid/framework/blocking_queue.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
-
-class BaseProcessor {
- public:
-  BaseProcessor() { context_ = nullptr; }
-
-  virtual ~BaseProcessor() {}
-
-  virtual void Prepare(VarHandlePtr h, int64_t time_out) {
-    var_h_ = h;
-
-    context_.reset(new grpc::ClientContext());
-    context_->set_wait_for_ready(true);
-    if (time_out) {
-      std::chrono::system_clock::time_point deadline =
-          std::chrono::system_clock::now() +
-          std::chrono::milliseconds(time_out);
-      context_->set_deadline(deadline);
-    }
-  }
-
-  void Process() {
-    ProcessImpl();
-    var_h_->Finish(true);
-  }
-
-  VarHandlePtr GetVarHandlePtr() { return var_h_; }
-  bool Wait() { return var_h_->Wait(); }
-  void Finish(bool ok) { return var_h_->Finish(ok); }
-  virtual void ProcessImpl() = 0;
-
-  std::unique_ptr<grpc::ClientContext> context_;
-  grpc::Status status_;
-
- protected:
-  VarHandlePtr var_h_;
-};
-
-typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
-    RequestSendCallBack;
-
-class SendProcessor : public BaseProcessor {
- public:
-  explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(), stub_g_(ch) {}
-
-  virtual ~SendProcessor() {}
-
-  void ProcessImpl() override {
-    if (response_call_back_) {
-      response_call_back_(*var_h_.get(), reply_);
-    }
-  }
-
-  ::grpc::GenericStub stub_g_;
-  ::grpc::ByteBuffer reply_;
-  RequestSendCallBack response_call_back_ = nullptr;
-};
-
-typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
-    RequestGetCallBack;
-
-class GetProcessor : public BaseProcessor {
- public:
-  explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(), stub_g_(ch) {}
-
-  virtual ~GetProcessor() {}
-
-  void ProcessImpl() override {
-    if (response_call_back_) {
-      response_call_back_(*var_h_.get(), reply_);
-    }
-  }
-
-  ::grpc::ByteBuffer reply_;
-  ::grpc::GenericStub stub_g_;
-  RequestGetCallBack response_call_back_ = ProcGetResponse;
-};
-
-class BatchBarrierProcessor : public BaseProcessor {
- public:
-  explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor() {
-    stub_ = sendrecv::SendRecvService::NewStub(ch);
-  }
-
-  virtual ~BatchBarrierProcessor() {}
-
-  void ProcessImpl() override {}
-  sendrecv::VoidMessage reply_;
-  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
-};
-
-class FetchBarrierProcessor : public BaseProcessor {
- public:
-  explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor() {
-    stub_ = sendrecv::SendRecvService::NewStub(ch);
-  }
-
-  virtual ~FetchBarrierProcessor() {}
-
-  void ProcessImpl() override {}
-  sendrecv::VariableMessage reply_;
-  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
-};
-
-class CheckpointNotifyProcessor : public BaseProcessor {
- public:
-  explicit CheckpointNotifyProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor() {
-    stub_ = sendrecv::SendRecvService::NewStub(ch);
-  }
-
-  virtual ~CheckpointNotifyProcessor() {}
-
-  void ProcessImpl() override {}
-  sendrecv::VoidMessage reply_;
-  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
-};
-
-class GRPCClient : public RPCClient {
- public:
-  GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
-  virtual ~GRPCClient();
-
-  VarHandlePtr AsyncSendVar(const std::string& ep,
-                            const platform::DeviceContext& ctx,
-                            const framework::Scope& scope,
-                            const std::string& var_name,
-                            int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncGetVar(const std::string& ep,
-                           const platform::DeviceContext& ctx,
-                           const framework::Scope& scope,
-                           const std::string& var_name,
-                           const std::string& out_varname,
-                           const std::string& table_name = "",
-                           int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncGetVarNoBarrier(
-      const std::string& ep, const platform::DeviceContext& ctx,
-      const framework::Scope& scope, const std::string& var_name,
-      const std::string& out_varname,
-      int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncGetMonomerVariable(
-      const std::string& ep, const platform::DeviceContext& ctx,
-      const framework::Scope& scope, const std::string& var_name,
-      int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncPrefetchVar(const std::string& ep,
-                                const platform::DeviceContext& ctx,
-                                const framework::Scope& scope,
-                                const std::string& in_var_name,
-                                const std::string& out_var_name,
-                                const std::string& table_name = "",
-                                int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncSendBatchBarrier(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncSendFetchBarrier(const std::string& ep,
-                                     int64_t time_out) override;
-
-  VarHandlePtr AsyncGetMonomerBarrier(
-      const std::string& ep, const std::string& var_name,
-      int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncCheckpointNotify(
-      const std::string& ep, const std::string& dir,
-      int64_t time_out = FLAGS_rpc_deadline) override;
-
-  VarHandlePtr AsyncSendComplete(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
-
-  bool Wait() override;
-
-  void SendComplete() override;
-
-  void InitImpl() override;
-
- private:
-  void Proceed();
-
-  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
-  VarHandlePtr _AsyncGetVar(
-      const std::string& ep, const platform::DeviceContext& ctx,
-      const framework::Scope& scope, const std::string& method,
-      const std::string& var_name, const std::string& out_varname,
-      const std::string& rpc_path, const std::string& table_name = "",
-      int64_t time_out = FLAGS_rpc_deadline);
-
- private:
-  grpc::CompletionQueue cq_;
-  std::unordered_map<std::string, std::shared_ptr<grpc::Channel>> channels_;
-  std::unique_ptr<std::thread> client_thread_{nullptr};
-
-  // mutex for Wait client sync
-  std::mutex sync_mutex_;
-  std::condition_variable sync_cond_;
-  std::atomic<int64_t> req_count_{0};
-  bool ok_;
-
-  // mutex for GetChannel thread safety
-  std::mutex chan_mutex_;
-  DISABLE_COPY_AND_ASSIGN(GRPCClient);
-
-  // mutex for sending complete message only once
-  std::mutex completed_mutex_;
-  bool completed_;
-
-  volatile bool stopped_;
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
deleted file mode 100644
index 91c398d0c84db1fc67740cd2368d178610ef0841..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-#include <nccl.h>
-#endif
-#include <limits>
-#include <memory>
-#include <thread>  // NOLINT
-
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
-#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg, const std::string& out_name,
-                           const int trainer_id,
-                           const std::string& table_name) {
-  platform::RecordRPCEvent record_event("serial");
-  VarMsg request;
-  TensorPayload* payload = nullptr;
-
-  request.set_varname(name);
-  request.set_trainer_id(trainer_id);
-  // Note: normally the profiler is enabled in 1 trainer, hence only
-  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
-  // servers the trainer's profiling state so that PS can follow the
-  // trainer.
-  if (platform::ShouldSendProfileState()) {
-    if (platform::IsProfileEnabled()) {
-      request.set_profile(platform::kEnableProfiler);
-    } else {
-      request.set_profile(platform::kDisableProfiler);
-    }
-  }
-  if (!out_name.empty()) {
-    request.set_out_varname(out_name);
-  }
-  if (!table_name.empty()) {
-    request.set_table_name(table_name);
-  }
-  if (var->IsType<framework::LoDTensor>()) {
-    request.set_type(::sendrecv::LOD_TENSOR);
-    payload = new TensorPayload(GetTensorPayload(var, ctx, &request));
-  } else if (var->IsType<framework::SelectedRows>()) {
-    request.set_type(::sendrecv::SELECTED_ROWS);
-    payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request));
-#ifdef PADDLE_WITH_CUDA
-  } else if (var->IsType<ncclUniqueId>()) {
-    request.set_type(::sendrecv::NCCL_ID);
-#endif
-  } else {
-    PADDLE_THROW("Serialize does not support type: %s",
-                 typeid(var->Type()).name());
-  }
-
-  std::string header;
-  request.AppendToString(&header);
-  auto buffer = std::unique_ptr<char[]>(new char[1024]);
-  void* buf = buffer.get();
-  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
-  e.WriteRawBytes(std::string(header.data(), header.size()));
-// NCCLID is copied directly to the message, return bytebuffer
-// with only one slice if serializing NCCLID.
-#ifdef PADDLE_WITH_CUDA
-  if (var->IsType<ncclUniqueId>()) {
-    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
-                              NCCL_UNIQUE_ID_BYTES);
-    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
-    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
-
-    // for serialize NCCL_ID
-    ::grpc::Slice slices(e.size());
-    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
-    ::grpc::ByteBuffer tmp(&slices, 1);
-    msg->Swap(&tmp);
-    return;
-  }
-#endif
-  PADDLE_ENFORCE_NOT_NULL(payload);
-
-  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
-                            payload->memory_size());
-  if (payload->memory_size() >= std::numeric_limits<int>::max()) {
-    LOG(FATAL) << "FATAL error: varname:" << name
-               << ", vlen:" << payload->memory_size()
-               << " >= std::numeric_limits<int>::max():"
-               << std::numeric_limits<int>::max() << ", so exit!";
-  }
-  // steal reference of tensor data
-  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
-  int num_slices = 2;       // only SelectedRows have rows buffer
-  slices[0] = ::grpc::Slice(e.size());
-  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
-  slices[1] = ::grpc::Slice(
-      grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(),
-                                    SerializeDestroyCallback, payload),
-      ::grpc::Slice::STEAL_REF);
-
-  if (var->IsType<framework::SelectedRows>()) {
-    auto* slr = var->GetMutable<framework::SelectedRows>();
-    ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
-
-    PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
-    size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
-
-    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
-    slices[2] = ::grpc::Slice(e2.size());
-    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
-
-    slices[3] = ::grpc::Slice(
-        grpc_slice_new_with_user_data(
-            const_cast<void*>(
-                reinterpret_cast<const void*>(slr->rows().data())),
-            rows_memory_size, [](void* backing) {},
-            const_cast<char*>(
-                reinterpret_cast<const char*>(slr->rows().data()))),
-        ::grpc::Slice::STEAL_REF);
-    num_slices = 4;
-  }
-
-  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
-  msg->Swap(&tmp);
-}
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var, int* trainer_id) {
-  platform::RecordRPCEvent record_event("deserial");
-  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
-  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
-  *var = resp.GetVar();
-  *trainer_id = resp.GetTrainerId();
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
deleted file mode 100644
index c9a57beb3a6a7a7cc9973ff0e5325a3daa6d98a9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/platform/port.h"
-
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-typedef void (*DestroyCallback)(void*);
-
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_varname = std::string(),
-                           const int trainer_id = 0,
-                           const std::string& table_name = std::string());
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var, int* trainer_id);
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
deleted file mode 100644
index 749c1bf39a48608876c77a74aa98be51947cf3b3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-#include <string>
-#include <thread>  // NOLINT
-
-#include "google/protobuf/text_format.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace framework = paddle::framework;
-namespace platform = paddle::platform;
-namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
-namespace memory = paddle::memory;
-
-void RunSerdeTestSelectedRows(platform::Place place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-
-  // serialize var to ByteBuffer
-  framework::Variable var;
-  auto* slr = var.GetMutable<framework::SelectedRows>();
-  slr->set_height(1000);
-  auto* tensor = slr->mutable_value();
-  auto* rows = slr->mutable_rows();
-  tensor->Resize(framework::make_ddim({564, 128}));
-  tensor->mutable_data<float>(place);
-  int tensor_numel = 564 * 128;
-  math::set_constant(ctx, tensor, 32.7);
-  for (int i = 0; i < 564; ++i) rows->push_back(i);
-
-  ::grpc::ByteBuffer msg;
-  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
-  EXPECT_GT(msg.Length(), static_cast<size_t>(0));
-
-  // deserialize
-  std::vector<::grpc::Slice> slices;
-  (void)msg.Dump(&slices);
-  std::string tmp;
-  for (const auto& s : slices) {
-    tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
-  }
-
-  sendrecv::VariableMessage varmsg;
-  EXPECT_TRUE(varmsg.ParseFromString(tmp));
-
-  // deserialize bytebuffer
-  EXPECT_EQ(varmsg.varname(), "myvar");
-  EXPECT_EQ(varmsg.type(), 1);
-
-  const float* tensor_data =
-      reinterpret_cast<const float*>(varmsg.serialized().data());
-  const int64_t* rows_data =
-      reinterpret_cast<const int64_t*>(varmsg.rows().data());
-  for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
-  }
-  for (int i = 0; i < 564; ++i) {
-    EXPECT_EQ(rows_data[i], i);
-  }
-
-  // deserialize zero-copy
-  // framework::Variable var2;
-  // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
-  framework::Scope scope;
-  scope.Var("myvar");
-  operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
-  EXPECT_EQ(resp.Parse(msg), 0);
-
-  framework::Variable* var2 = resp.GetVar();
-
-  auto* slr2 = var2->GetMutable<framework::SelectedRows>();
-  auto* tensor2 = slr2->mutable_value();
-  auto* rows2 = slr2->mutable_rows();
-  float* tensor_data2 = nullptr;
-  framework::Tensor tmp_tensor;
-
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    platform::CPUPlace cpu;
-    framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
-    tensor_data2 = tmp_tensor.data<float>();
-  } else {
-    tensor_data2 = const_cast<float*>(tensor2->data<float>());
-  }
-  const int64_t* rows_data2 = rows2->data();
-
-  for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
-  }
-  for (size_t i = 0; i < rows2->size(); ++i) {
-    EXPECT_EQ(rows_data2[i], static_cast<int64_t>(i));
-  }
-  EXPECT_EQ(slr2->height(), 1000);
-}
-
-void RunTestLodTensor(platform::Place place, int from_type = 0) {
-  // serialize var to ByteBuffer
-  framework::Variable var;
-  auto* tensor = var.GetMutable<framework::LoDTensor>();
-  tensor->Resize(framework::make_ddim({512, 8, 4, 2}));
-  framework::LoD lod;
-  lod.push_back(framework::Vector<size_t>({1, 3, 8}));
-  tensor->set_lod(lod);
-  int tensor_numel = 512 * 8 * 4 * 2;
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-  tensor->mutable_data<float>(place);
-  math::set_constant(ctx, tensor, 31.9);
-
-  ::grpc::ByteBuffer msg;
-  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg,
-                                                "outvar", 0, "table_name");
-  EXPECT_GT(msg.Length(), static_cast<size_t>(0));
-
-  // deserialize
-  std::vector<::grpc::Slice> slices;
-  (void)msg.Dump(&slices);
-  std::string tmp;
-  for (const auto& s : slices) {
-    tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
-  }
-  sendrecv::VariableMessage varmsg;
-  EXPECT_TRUE(varmsg.ParseFromString(tmp));
-  EXPECT_EQ(varmsg.varname(), "myvar");
-  EXPECT_EQ(varmsg.type(), 0);
-  EXPECT_EQ(varmsg.dims()[0], 512);
-  EXPECT_EQ(varmsg.dims()[1], 8);
-  EXPECT_EQ(varmsg.dims()[2], 4);
-  EXPECT_EQ(varmsg.dims()[3], 2);
-  EXPECT_EQ(varmsg.lod_level(), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
-  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
-
-  const float* tensor_data =
-      reinterpret_cast<const float*>(varmsg.serialized().data());
-  for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
-  }
-
-  // message binary
-  std::string str;
-  varmsg.SerializeToString(&str);
-
-  // message bytebuffer
-  ::grpc::Slice slices_2[1];
-  int num_slices = 1;
-  slices_2[0] = ::grpc::Slice(str.length());
-  memcpy(const_cast<uint8_t*>(slices_2[0].begin()), str.c_str(), str.length());
-  ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices);
-
-  // deserialize zero-copy
-  framework::Scope scope;
-  scope.Var("myvar");
-  operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
-  if (from_type == 0) {
-    EXPECT_EQ(resp.Parse(msg), 0);
-  } else {
-    EXPECT_EQ(resp.Parse(bytebuffer2), 0);
-  }
-
-  framework::Variable* var2 = resp.GetVar();
-
-  auto tensor2 = var2->Get<framework::LoDTensor>();
-  float* tensor_data2 = nullptr;
-  framework::Tensor tmp_tensor;
-
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    platform::CPUPlace cpu;
-    framework::TensorCopy(tensor2, cpu, &tmp_tensor);
-    tensor_data2 = tmp_tensor.data<float>();
-  } else {
-    tensor_data2 = const_cast<float*>(tensor2.data<float>());
-  }
-
-  EXPECT_EQ(varmsg.lod_level(), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
-  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
-  for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
-}
-
-TEST(LodTensor, Run) {
-  platform::CPUPlace place;
-  RunTestLodTensor(place);
-  RunTestLodTensor(place, 1);
-#ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace gpu(0);
-  RunTestLodTensor(gpu);
-  RunTestLodTensor(gpu, 1);
-#endif
-}
-
-TEST(SelectedRows, Run) {
-  platform::CPUPlace place;
-  RunSerdeTestSelectedRows(place);
-
-#ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace gpu;
-  RunSerdeTestSelectedRows(gpu);
-#endif
-}
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
deleted file mode 100644
index 75526bed0f0eadada65279ec05757da7a469f984..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc
+++ /dev/null
@@ -1,594 +0,0 @@
-/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <limits>
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
-
-using ::grpc::ServerAsyncResponseWriter;
-
-DECLARE_bool(rpc_disable_reuse_port);
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-enum CallStatus { PROCESS = 0, FINISH };
-
-// reference:
-// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
-class RequestBase {
- public:
-  explicit RequestBase(GrpcService::AsyncService* service,
-                       ::grpc::ServerCompletionQueue* cq,
-                       RequestHandler* request_handler, int req_id)
-      : service_(service),
-        cq_(cq),
-        status_(PROCESS),
-        request_handler_(request_handler),
-        req_id_(req_id) {
-    PADDLE_ENFORCE(cq_);
-  }
-  virtual ~RequestBase() {}
-  virtual void Process() = 0;
-
-  std::string Status2String(const std::string& method) {
-    std::string status = "Process";
-    if (status_ == FINISH) {
-      status = "Finish";
-    }
-
-    std::ostringstream s;
-    s << method << " name:[" << GetReqName() << "]"
-      << ", ep:[" << ctx_.peer() << "]"
-      << " " << status << " using req_id:" << req_id_;
-    return s.str();
-  }
-
-  CallStatus Status() const {
-    std::lock_guard<std::mutex> l(status_mu_);
-    return status_;
-  }
-
-  template <typename T>
-  void Finish(const T& reply, ServerAsyncResponseWriter<T>* responder) {
-    std::lock_guard<std::mutex> l(status_mu_);
-    status_ = FINISH;
-    responder->Finish(reply, ::grpc::Status::OK,
-                      reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
-  }
-  virtual std::string GetReqName() = 0;
-
- protected:
-  mutable std::mutex status_mu_;
-  ::grpc::ServerContext ctx_;
-  GrpcService::AsyncService* service_;
-  ::grpc::ServerCompletionQueue* cq_;
-  CallStatus status_;
-  RequestHandler* request_handler_;
-  int req_id_;
-};
-
-class RequestSend final : public RequestBase {
- public:
-  explicit RequestSend(GrpcService::AsyncService* service,
-                       ::grpc::ServerCompletionQueue* cq,
-                       RequestHandler* request_handler, int req_id)
-      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    request_.reset(new GRPCVariableResponse(request_handler->scope(),
-                                            request_handler->dev_ctx(),
-                                            !request_handler->sync_mode()));
-    int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
-    service_->RequestAsyncUnary(
-        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
-        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
-  }
-  virtual ~RequestSend() {}
-  std::string GetReqName() override { return request_->Varname(); }
-
-  void Process() override {
-    std::string varname = GetReqName();
-    VLOG(4) << "RequestSend var_name:" << varname;
-
-    auto scope = request_->GetMutableLocalScope();
-    auto invar = request_->GetVar();
-    int trainer_id = request_->GetTrainerId();
-    framework::Variable* outvar = nullptr;
-    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id);
-    Finish(reply_, &responder_);
-  }
-
- protected:
-  sendrecv::VoidMessage reply_;
-  std::shared_ptr<GRPCVariableResponse> request_;
-  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
-};
-
-class RequestGet final : public RequestBase {
- public:
-  explicit RequestGet(GrpcService::AsyncService* service,
-                      ::grpc::ServerCompletionQueue* cq,
-                      RequestHandler* request_handler, int req_id)
-      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    auto method_id = static_cast<int>(distributed::GrpcMethod::kGetVariable);
-    service_->RequestAsyncUnary(
-        method_id, &ctx_, &request_, &responder_, cq_, cq_,
-        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
-  }
-
-  virtual ~RequestGet() {}
-
-  std::string GetReqName() override { return request_.varname(); }
-
-  void Process() override {
-    // proc request.
-    std::string varname = request_.varname();
-    std::string out_varname = request_.out_varname();
-    std::string table_name = request_.table_name();
-    int trainer_id = request_.trainer_id();
-
-    VLOG(4) << "RequestGet " << out_varname << " from " << varname;
-
-    auto scope = request_handler_->scope();
-    framework::Variable* invar = nullptr;
-    framework::Variable* outvar = nullptr;
-
-    tmp_scope_ = std::move(scope->NewTmpScope());
-    request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar,
-                             trainer_id, out_varname, table_name);
-
-    VLOG(1) << "before SerializeToByteBuffer";
-    if (outvar) {
-      SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(),
-                            &reply_);
-    }
-    VLOG(1) << "after SerializeToByteBuffer";
-    Finish(reply_, &responder_);
-  }
-
- protected:
-  sendrecv::VariableMessage request_;
-  ::grpc::ByteBuffer reply_;
-  std::unique_ptr<framework::Scope> tmp_scope_;
-  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
-};
-
-class RequestGetNoBarrier final : public RequestBase {
- public:
-  explicit RequestGetNoBarrier(GrpcService::AsyncService* service,
-                               ::grpc::ServerCompletionQueue* cq,
-                               RequestHandler* request_handler, int req_id)
-      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    auto method_id =
-        static_cast<int>(distributed::GrpcMethod::kGetVariableNoBarrier);
-    service_->RequestAsyncUnary(
-        method_id, &ctx_, &request_, &responder_, cq_, cq_,
-        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
-  }
-
-  virtual ~RequestGetNoBarrier() {}
-
-  std::string GetReqName() override { return request_.varname(); }
-
-  void Process() override {
-    // proc request.
-    std::string varname = request_.varname();
-    std::string out_varname = request_.out_varname();
-    int trainer_id = request_.trainer_id();
-
-    VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname;
-
-    auto scope = request_handler_->scope();
-    framework::Variable* invar = nullptr;
-    framework::Variable* outvar = nullptr;
-
-    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id,
-                             out_varname);
-
-    if (outvar) {
-      SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(),
-                            &reply_);
-    }
-    Finish(reply_, &responder_);
-  }
-
- protected:
-  sendrecv::VariableMessage request_;
-  ::grpc::ByteBuffer reply_;
-  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
-};
-
-class RequestGetMonomerVariable final : public RequestBase {
- public:
-  explicit RequestGetMonomerVariable(GrpcService::AsyncService* service,
-                                     ::grpc::ServerCompletionQueue* cq,
-                                     RequestHandler* request_handler,
-                                     int req_id, RPCServer* rpc_server)
-      : RequestBase(service, cq, request_handler, req_id),
-        responder_(&ctx_),
-        rpc_server_(rpc_server) {
-    auto method_id =
-        static_cast<int>(distributed::GrpcMethod::kGetMonomerVariable);
-    service_->RequestAsyncUnary(
-        method_id, &ctx_, &request_, &responder_, cq_, cq_,
-        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
-  }
-
-  virtual ~RequestGetMonomerVariable() {}
-
-  std::string GetReqName() override { return request_.varname(); }
-
-  void Process() override {
-    // proc request.
-    std::string varname = request_.varname();
-
-    rpc_server_->WaitVarCond(varname);
-    MonomerHandle h = rpc_server_->GetMonomer(varname);
-
-    auto scope = h.scope_;
-    auto invar = scope->FindVar(varname);
-    framework::Variable* outvar = nullptr;
-
-    request_handler_->Handle(varname, scope, invar, &outvar,
-                             request_.trainer_id());
-
-    if (outvar) {
-      SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_);
-    }
-    Finish(reply_, &responder_);
-  }
-
- protected:
-  sendrecv::VariableMessage request_;
-  ::grpc::ByteBuffer reply_;
-  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
-  RPCServer* rpc_server_{nullptr};
-};
-
-class RequestGetMonomerBarrier final : public RequestBase {
- public:
-  explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service,
-                                    ::grpc::ServerCompletionQueue* cq,
-                                    RequestHandler* request_handler, int req_id,
-                                    RPCServer* rpc_server)
-      : RequestBase(service, cq, request_handler, req_id),
-        responder_(&ctx_),
-        rpc_server_(rpc_server) {
-    auto method_id =
-        static_cast<int>(distributed::GrpcMethod::kGetMonomerBarrier);
-    service_->RequestAsyncUnary(
-        method_id, &ctx_, &request_, &responder_, cq_, cq_,
-        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
-  }
-
-  virtual ~RequestGetMonomerBarrier() {}
-
-  std::string GetReqName() override { return request_.varname(); }
-
-  void Process() override {
-    // proc request.
-    std::string varname = request_.varname();
-    VLOG(4) << "RequestGetMonomerBarrier " << varname;
-
-    rpc_server_->WaitVarCond(varname);
-    MonomerHandle h = rpc_server_->GetMonomer(varname);
-
-    framework::Scope* scope = nullptr;
-    framework::Variable* invar = nullptr;
-    framework::Variable* outvar = nullptr;
-
-    request_handler_->Handle(varname, scope, invar, &outvar,
-                             request_.trainer_id());
-
-    Finish(reply_, &responder_);
-  }
-
- protected:
-  sendrecv::VariableMessage request_;
-  sendrecv::VoidMessage reply_;
-  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
-  RPCServer* rpc_server_{nullptr};
-};
-
-class RequestPrefetch final : public RequestBase {
- public:
-  explicit RequestPrefetch(GrpcService::AsyncService* service,
-                           ::grpc::ServerCompletionQueue* cq,
-                           RequestHandler* request_handler, int req_id)
-      : RequestBase(service, cq, request_handler, req_id),
-        responder_(&ctx_),
-        local_scope_(nullptr) {
-    request_.reset(new GRPCVariableResponse(request_handler->scope(),
-                                            request_handler->dev_ctx(), true));
-    int method_id =
-        static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
-    service_->RequestAsyncUnary(
-        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
-        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
-  }
-
-  virtual ~RequestPrefetch() {}
-
-  std::string GetReqName() override { return request_->Varname(); }
-
-  void Process() override {
-    // prefetch process...
-    std::string in_var_name = request_->Varname();
-    std::string out_var_name = request_->OutVarname();
-    std::string table_name = request_->TableName();
-    int trainer_id = request_->GetTrainerId();
-    VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
-            << " out_var_name: " << out_var_name;
-
-    auto scope = request_->GetMutableLocalScope();
-    auto invar = scope->FindVar(in_var_name);
-    // out var must be created in local scope!
-    framework::Variable* outvar = scope->Var(out_var_name);
-
-    request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
-                             out_var_name, table_name);
-
-    SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
-                          &reply_);
-    Finish(reply_, &responder_);
-  }
-
- protected:
-  std::shared_ptr<GRPCVariableResponse> request_;
-  ::grpc::ByteBuffer reply_;
-  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
-  framework::Scope* local_scope_;
-};
-
-class RequestCheckpointNotify final : public RequestBase {
- public:
-  explicit RequestCheckpointNotify(GrpcService::AsyncService* service,
-                                   ::grpc::ServerCompletionQueue* cq,
-                                   RequestHandler* request_handler, int req_id)
-      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    request_.reset(new GRPCVariableResponse(request_handler->scope(),
-                                            request_handler->dev_ctx()));
-    int method_id =
-        static_cast<int>(distributed::GrpcMethod::kCheckpointNotify);
-    service_->RequestAsyncUnary(
-        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
-        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
-  }
-
-  virtual ~RequestCheckpointNotify() {}
-
-  std::string GetReqName() override { return request_->Varname(); }
-
-  void Process() override {
-    auto scope = request_->GetMutableLocalScope();
-
-    std::string checkpoint_notify = request_->Varname();
-    std::string checkpoint_dir = request_->OutVarname();
-    int trainer_id = request_->GetTrainerId();
-
-    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
-            << ", dir: " << checkpoint_dir;
-
-    request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr,
-                             trainer_id, checkpoint_dir);
-    Finish(reply_, &responder_);
-  }
-
- protected:
-  std::shared_ptr<GRPCVariableResponse> request_;
-  sendrecv::VoidMessage reply_;
-  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
-};
-
-void AsyncGRPCServer::WaitServerReady() {
-  VLOG(4) << "AsyncGRPCServer is waiting server ready";
-  std::unique_lock<std::mutex> lock(this->mutex_ready_);
-  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(4) << "AsyncGRPCServer WaitSeverReady";
-}
-
-// Define an option subclass in order to disable SO_REUSEPORT for the
-// server socket.
-// Come from:
-// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
-class NoReusePortOption : public ::grpc::ServerBuilderOption {
- public:
-  void UpdateArguments(::grpc::ChannelArguments* args) override {
-    args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
-  }
-
-  void UpdatePlugins(std::vector<std::unique_ptr<::grpc::ServerBuilderPlugin>>*
-                         plugins) override {}
-};
-
-void AsyncGRPCServer::StartServer() {
-  ::grpc::ServerBuilder builder;
-  builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(),
-                           &selected_port_);
-
-  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
-  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
-  if (FLAGS_rpc_disable_reuse_port) {
-    builder.SetOption(
-        std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
-  }
-  builder.RegisterService(&service_);
-
-  for (auto t : rpc_call_map_) {
-    rpc_cq_[t.first].reset(builder.AddCompletionQueue().release());
-  }
-
-  server_ = builder.BuildAndStart();
-  LOG(INFO) << "Server listening on " << bind_address_
-            << " selected port: " << selected_port_;
-
-  std::function<void(const std::string&, int)> f =
-      std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this,
-                std::placeholders::_1, std::placeholders::_2);
-
-  for (auto& t : rpc_call_map_) {
-    auto& rpc_name = t.first;
-    auto& cq = rpc_cq_[rpc_name];
-    auto threadnum = rpc_thread_num_[rpc_name];
-    auto& reqs = rpc_reqs_[rpc_name];
-
-    reqs.reserve(kRequestBufSize);
-
-    for (int i = 0; i < kRequestBufSize; i++) {
-      VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i;
-      TryToRegisterNewOne(rpc_name, i);
-    }
-
-    for (int i = 0; i < threadnum; i++) {
-      rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
-          &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
-      VLOG(4) << t.first << " creates threads!";
-    }
-  }
-
-  {
-    std::lock_guard<std::mutex> lock(this->mutex_ready_);
-    ready_ = 1;
-  }
-  condition_ready_.notify_all();
-
-  // wait server
-  server_->Wait();
-
-  for (auto& t : rpc_threads_) {
-    auto& threads = t.second;
-    for (size_t i = 0; i < threads.size(); ++i) {
-      threads[i]->join();
-      VLOG(4) << t.first << " threads ends!";
-    }
-  }
-}
-
-void AsyncGRPCServer::ShutdownQueue() {
-  for (auto& t : rpc_cq_) {
-    t.second->Shutdown();
-    VLOG(4) << t.first << " queue shutdown!";
-  }
-}
-
-void AsyncGRPCServer::ShutDownImpl() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  is_shut_down_ = true;
-  ShutdownQueue();
-
-  VLOG(4) << "server_ shutdown!";
-  server_->Shutdown();
-}
-
-void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
-                                          int req_id) {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  if (is_shut_down_) {
-    VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
-    return;
-  }
-
-  VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
-          << " REQ ID: " << req_id;
-
-  auto& reqs = rpc_reqs_[rpc_name];
-  auto& handler = rpc_call_map_[rpc_name];
-  auto& cq = rpc_cq_[rpc_name];
-
-  RequestBase* b = nullptr;
-  if (rpc_name == kRequestSend) {
-    b = new RequestSend(&service_, cq.get(), handler, req_id);
-  } else if (rpc_name == kRequestGet) {
-    b = new RequestGet(&service_, cq.get(), handler, req_id);
-
-  } else if (rpc_name == kRequestGetNoBarrier) {
-    b = new RequestGetNoBarrier(&service_, cq.get(), handler, req_id);
-  } else if (rpc_name == kRequestGetMonomerVariable) {
-    b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id,
-                                      this);
-  } else if (rpc_name == kRequestGetMonomerBarrier) {
-    b = new RequestGetMonomerBarrier(&service_, cq.get(), handler, req_id,
-                                     this);
-  } else if (rpc_name == kRequestPrefetch) {
-    b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
-  } else if (rpc_name == kRequestCheckpoint) {
-    b = new RequestCheckpointNotify(&service_, cq.get(), handler, req_id);
-  } else {
-    PADDLE_ENFORCE(false, "not supported rpc");
-  }
-
-  reqs[req_id] = b;
-
-  VLOG(4) << "TryToRegisterNewOne status:" << b->Status();
-}
-
-void AsyncGRPCServer::HandleRequest(
-    ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
-    std::function<void(const std::string&, int)> TryToRegisterNewOne) {
-  void* tag = NULL;
-  bool ok = false;
-
-  while (true) {
-    VLOG(4) << "HandleRequest " << rpc_name << " wait next";
-    if (!cq->Next(&tag, &ok)) {
-      LOG(WARNING) << "CompletionQueue " << rpc_name << " shutdown!";
-      break;
-    }
-
-    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
-    VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
-            << " get next";
-
-    auto& reqs = rpc_reqs_[rpc_name];
-    RequestBase* base = nullptr;
-    {
-      PADDLE_ENFORCE(req_id >= 0 && req_id < kRequestBufSize);
-      std::unique_lock<std::mutex> lock(cq_mutex_);
-      base = reqs[req_id];
-    }
-
-    VLOG(3) << base->Status2String(rpc_name);
-
-    // reference:
-    // https://github.com/tensorflow/tensorflow/issues/5596
-    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
-    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
-    if (!ok) {
-      VLOG(4) << "completion queue:" << rpc_name << " recv no regular event"
-              << " context:" << base->Status2String(rpc_name);
-      TryToRegisterNewOne(rpc_name, req_id);
-      delete base;
-      continue;
-    }
-
-    switch (base->Status()) {
-      case PROCESS: {
-        base->Process();
-        break;
-      }
-      case FINISH: {
-        TryToRegisterNewOne(rpc_name, req_id);
-        delete base;
-        break;
-      }
-      default: { assert(false); }
-    }
-  }
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h
deleted file mode 100644
index 2fd3a7a74073b52770158cf47b1c86cedae78291..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <set>
-#include <string>
-#include <thread>  // NOLINT
-#include <utility>
-#include <vector>
-
-#include "grpc++/grpc++.h"
-#include "paddle/fluid/framework/blocking_queue.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_service.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-class RequestBase;
-
-class AsyncGRPCServer final : public RPCServer {
- public:
-  explicit AsyncGRPCServer(const std::string& address, int client_num)
-      : RPCServer(address, client_num), ready_(0) {}
-
-  virtual ~AsyncGRPCServer() {}
-  void WaitServerReady() override;
-  void StartServer() override;
-
- private:
-  // HandleRequest needs to be thread-safe.
-  void HandleRequest(
-      ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
-      std::function<void(const std::string&, int)> TryToRegisterNewOne);
-
-  void TryToRegisterNewOne(const std::string& rpc_name, int req_id);
-  void ShutdownQueue();
-  void ShutDownImpl() override;
-
- private:
-  static const int kRequestBufSize = 100;
-
-  std::mutex cq_mutex_;
-  volatile bool is_shut_down_ = false;
-
-  GrpcService::AsyncService service_;
-  std::unique_ptr<::grpc::Server> server_;
-
-  // condition of the sub program
-  std::condition_variable barrier_condition_;
-
-  std::mutex mutex_ready_;
-  std::condition_variable condition_ready_;
-
-  int ready_;
-
-  std::map<std::string, std::unique_ptr<::grpc::ServerCompletionQueue>> rpc_cq_;
-  std::map<std::string, std::vector<std::unique_ptr<std::thread>>> rpc_threads_;
-  std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
-};
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h
deleted file mode 100644
index 2965fe4490bedd0253682f0aef44e096232fc2fc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_service.h
+++ /dev/null
@@ -1,136 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <grpc++/impl/codegen/async_stream.h>
-#include <grpc++/impl/codegen/async_unary_call.h>
-#include <grpc++/impl/codegen/proto_utils.h>
-#include <grpc++/impl/codegen/rpc_method.h>
-#include <grpc++/impl/codegen/service_type.h>
-#include <grpc++/impl/codegen/status.h>
-#include <grpc++/impl/codegen/stub_options.h>
-#include <grpc++/impl/codegen/sync_stream.h>
-#include <grpc++/support/byte_buffer.h>
-#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
-#include "paddle/fluid/platform/profiler.h"
-
-// NOTE: This method was originally created by tensorflow
-//       (https://github.com/tensorflow/tensorflow/) we borrow this
-//       method and did some modifications so that we can parse gRPC
-//       requests without too much copying of the tensor data.
-
-namespace grpc {
-class CompletionQueue;
-class Channel;
-class RpcService;
-class ServerCompletionQueue;
-class ServerContext;
-
-// Support parsing/unparsing of tensorflow::VariableResponse.
-// Wire-format is identical to RecvVariableResponse.
-template <>
-class SerializationTraits<
-    paddle::operators::distributed::GRPCVariableResponse> {
- public:
-  static Status Serialize(
-      const paddle::operators::distributed::GRPCVariableResponse& msg,
-      grpc_byte_buffer** bp, bool* own_buffer) {
-    PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
-    return Status();
-  }
-  static Status Deserialize(
-      grpc_byte_buffer* buffer,
-      paddle::operators::distributed::GRPCVariableResponse* msg,
-      int max_message_size = INT_MAX) {
-    if (buffer == nullptr) {
-      return Status(StatusCode::INTERNAL, "No payload");
-    }
-
-    Status result = g_core_codegen_interface->ok();
-    if (result.ok()) {
-      paddle::operators::distributed::GrpcByteSource source(buffer);
-      int ret = msg->Parse(&source);
-      if (ret != 0) {
-        result = Status(StatusCode::INTERNAL, "VariableResponse parse error");
-      }
-    }
-    g_core_codegen_interface->grpc_byte_buffer_destroy(buffer);
-    return result;
-  }
-};
-}  // namespace grpc
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-enum class GrpcMethod {
-  kSendVariable,
-  kGetVariable,
-  kPrefetchVariable,
-  kCheckpointNotify,
-  kGetVariableNoBarrier,
-  kGetMonomerVariable,
-  kGetMonomerBarrier,
-};
-
-static const int kGrpcNumMethods =
-    static_cast<int>(GrpcMethod::kGetMonomerBarrier) + 1;
-
-inline const char* GrpcMethodName(GrpcMethod id) {
-  switch (id) {
-    case GrpcMethod::kSendVariable:
-      return "/sendrecv.SendRecvService/SendVariable";
-    case GrpcMethod::kGetVariable:
-      return "/sendrecv.SendRecvService/GetVariable";
-    case GrpcMethod::kGetVariableNoBarrier:
-      return "/sendrecv.SendRecvService/GetVariableNoBarrier";
-    case GrpcMethod::kGetMonomerVariable:
-      return "/sendrecv.SendRecvService/GetMonomerVariable";
-    case GrpcMethod::kGetMonomerBarrier:
-      return "/sendrecv.SendRecvService/GetMonomerBarrier";
-    case GrpcMethod::kPrefetchVariable:
-      return "/sendrecv.SendRecvService/PrefetchVariable";
-    case GrpcMethod::kCheckpointNotify:
-      return "/sendrecv.SendRecvService/CheckpointNotify";
-  }
-
-  // Shouldn't be reached.
-  PADDLE_ENFORCE(false, "Invalid id: not found valid method name");
-  return nullptr;
-}
-
-class GrpcService final {
- public:
-  class AsyncService : public ::grpc::Service {
-   public:
-    AsyncService() {
-      for (int i = 0; i < kGrpcNumMethods; ++i) {
-        AddMethod(new ::grpc::internal::RpcServiceMethod(
-            GrpcMethodName(static_cast<GrpcMethod>(i)),
-            ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
-        ::grpc::Service::MarkMethodAsync(i);
-      }
-    }
-    virtual ~AsyncService() {}
-
-    // Make RequestAsyncUnary public for grpc_call.h
-    using ::grpc::Service::RequestAsyncUnary;
-  };
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
deleted file mode 100644
index 87e83ca53bf13ac4a015d56572ba073e51722c3e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
+++ /dev/null
@@ -1,330 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-#include <utility>
-#include <vector>
-#ifdef PADDLE_WITH_CUDA
-#include <nccl.h>
-#endif
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-enum WireType {
-  WIRETYPE_VARINT = 0,
-  WIRETYPE_LENGTH_DELIMITED = 2,
-};
-
-inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
-
-inline WireType GetTagWireType(uint32_t tag) {
-  return static_cast<WireType>(tag & 0x7);
-}
-
-bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
-                         int* result) {
-  uint64_t v;
-  if (input->ReadVarint64(&v) && v <= static_cast<uint64_t>(INT_MAX)) {
-    *result = static_cast<int>(v);
-    return true;
-  } else {
-    return false;
-  }
-}
-
-int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
-  GrpcByteBufferSource source;
-  source.Init(byte_buffer);
-  GrpcByteBufferSourceWrapper r(&source);
-
-  return Parse(&r);
-}
-
-bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
-                  std::vector<int64_t>* lod) {
-  while (true) {
-    auto p = input->ReadTagWithCutoff(127);
-    int tag = GetTagFieldNumber(p.first);
-    WireType wt = GetTagWireType(p.first);
-
-    if (!p.second) {
-      return (tag == 0);
-    }
-
-    switch (tag) {
-      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
-        uint64_t v;
-        if (wt == WIRETYPE_VARINT) {
-          if (!input->ReadVarint64(&v)) {
-            return false;
-          }
-          lod->push_back(v);
-          break;
-        }
-
-        if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int num_bytes = 0;
-          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
-            return tag;
-          }
-          int start_pos = input->CurrentPosition();
-          while (input->CurrentPosition() - start_pos < num_bytes) {
-            uint64_t v;
-            if (!input->ReadVarint64(&v)) {
-              return tag;
-            }
-            lod->push_back(v);
-          }
-          break;
-        }
-
-        return false;
-      }
-      default: { return false; }
-    }
-  }
-
-  return true;
-}
-
-int GRPCVariableResponse::Parse(Source* source) {
-  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
-      source->contents();
-  ::google::protobuf::io::CodedInputStream input(input_stream);
-  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
-
-  while (true) {
-    auto p = input.ReadTagWithCutoff(127);
-    int tag = GetTagFieldNumber(p.first);
-    WireType wt = GetTagWireType(p.first);
-    if (!p.second) {
-      if (tag != 0) {
-        return -1;
-      }
-      return 0;
-    }
-
-    switch (tag) {
-      case sendrecv::VariableMessage::kVarnameFieldNumber: {
-        uint32_t length;
-        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
-          return tag;
-        }
-
-        std::string temp;
-        if (!input.ReadString(&temp, length)) {
-          return tag;
-        }
-
-        meta_.set_varname(temp);
-        break;
-      }
-      case sendrecv::VariableMessage::kTypeFieldNumber: {
-        uint32_t v;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
-          return tag;
-        }
-
-        meta_.set_type(static_cast<::sendrecv::VarType>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
-        uint32_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
-          return tag;
-        }
-
-        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kDimsFieldNumber: {
-        // not packed
-        if (wt == WIRETYPE_VARINT) {
-          uint64_t v;
-          if (!input.ReadVarint64(&v)) {
-            return tag;
-          }
-          meta_.add_dims(v);
-          break;
-        }
-
-        // packed
-        if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int num_bytes = 0;
-          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
-            return tag;
-          }
-          int start_pos = input.CurrentPosition();
-          while (input.CurrentPosition() - start_pos < num_bytes) {
-            uint64_t v;
-            if (!input.ReadVarint64(&v)) {
-              return tag;
-            }
-            meta_.add_dims(v);
-          }
-          break;
-        }
-        return tag;
-      }
-      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
-        uint64_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
-          return tag;
-        }
-        meta_.set_lod_level(static_cast<int64_t>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kLodFieldNumber: {
-        int length = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &length)) {
-          return tag;
-        }
-
-        std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
-            input.IncrementRecursionDepthAndPushLimit(length);
-
-        std::vector<int64_t> lod_data;
-        if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
-          return tag;
-        }
-
-        if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
-          return tag;
-        }
-
-        if (lod_data.size() == 0) {
-          break;
-        }
-
-        auto lod = meta_.add_lod();
-        for (uint32_t i = 0; i < lod_data.size(); i++) {
-          lod->add_lod_data(lod_data[i]);
-        }
-        break;
-      }
-      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
-        uint64_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
-          return tag;
-        }
-        meta_.set_slr_height(static_cast<int64_t>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kSerializedFieldNumber: {
-        int num_bytes = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &num_bytes)) {
-          return tag;
-        }
-
-        if (!ProcSerializedField(tag, &input, num_bytes)) {
-          return tag;
-        }
-
-        break;
-      }
-      case sendrecv::VariableMessage::kRowsFieldNumber: {
-        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                        meta_.type() == sendrecv::LOD_TENSOR) &&
-                           meta_.varname() != "",
-                       "meta info should be got first!");
-
-        int num_bytes = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &num_bytes)) {
-          return tag;
-        }
-
-        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
-          return tag;
-        }
-        break;
-      }
-      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
-        uint32_t length;
-        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
-          return tag;
-        }
-
-        std::string temp;
-        if (!input.ReadString(&temp, length)) {
-          return tag;
-        }
-
-        meta_.set_out_varname(temp);
-        break;
-      }
-      case sendrecv::VariableMessage::kProfileFieldNumber: {
-        uint64_t profiling = 0;
-        if (!input.ReadVarint64(&profiling)) {
-          return tag;
-        }
-        meta_.set_profile(profiling);
-        int64_t listener_id = platform::ListenerId();
-        if (listener_id <= 0) {
-          break;
-        }
-        if (profiling == platform::kEnableProfiler &&
-            !platform::IsProfileEnabled()) {
-          platform::EnableProfiler(platform::ProfilerState::kCPU);
-        } else if (profiling == platform::kDisableProfiler &&
-                   platform::IsProfileEnabled()) {
-          platform::DisableProfiler(
-              platform::EventSortingKey::kDefault,
-              string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path,
-                              listener_id));
-        }
-        break;
-      }
-      case sendrecv::VariableMessage::kTrainerIdFieldNumber: {
-        uint64_t trainer_id = 0;
-        if (!input.ReadVarint64(&trainer_id)) {
-          return tag;
-        }
-        meta_.set_trainer_id(trainer_id);
-        break;
-      }
-      case sendrecv::VariableMessage::kTableNameFieldNumber: {
-        uint32_t length;
-        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
-          return tag;
-        }
-
-        std::string temp;
-        if (!input.ReadString(&temp, length)) {
-          return tag;
-        }
-
-        meta_.set_table_name(temp);
-        break;
-      }
-      default: {
-        // Unknown tag, return unknown error.
-        return -1;
-      }
-    }
-  }
-
-  return 0;
-}
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
deleted file mode 100644
index 3ca1d89f750313791c833a9f1f58760406e690c2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
+++ /dev/null
@@ -1,56 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-class GRPCVariableResponse : public VariableResponse {
- public:
-  GRPCVariableResponse(const framework::Scope* scope,
-                       const platform::DeviceContext* dev_ctx,
-                       bool create_scope = false)
-      : VariableResponse(scope, dev_ctx, create_scope) {}
-
-  virtual ~GRPCVariableResponse() {}
-
-  int Parse(Source* source) override;
-
-  // return:
-  // 0:ok.
-  // -1: unkown error.
-  // other: number of error field.
-  int Parse(const ::grpc::ByteBuffer& byte_buffer);
-};
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
deleted file mode 100644
index c8b8561d673efca21e72aa31a64214bbe4afd96c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ /dev/null
@@ -1,268 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using LoDTensor = framework::LoDTensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-
-static std::vector<std::vector<int64_t>> SplitIds(
-    const std::vector<int64_t>& ids_vector,
-    const std::vector<int64_t>& height_section) {
-  std::set<int64_t> all_ids;
-  for (auto id : ids_vector) {
-    all_ids.insert(id);
-  }
-
-  auto abs_sections = ToAbsoluteSection(height_section);
-  std::vector<std::vector<int64_t>> splited_ids;
-  splited_ids.resize(height_section.size() + 1);
-  for (auto& id : all_ids) {
-    auto section_index = GetSectionIndex(id, abs_sections);
-    splited_ids[section_index].push_back(id - abs_sections[section_index]);
-  }
-  return splited_ids;
-}
-
-static void SplitIdsIntoMultipleVarsBySection(
-    const std::vector<std::string>& in_var_names,
-    const std::vector<int64_t>& height_section,
-    const std::vector<std::vector<int64_t>>& splited_ids,
-    framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), "");
-
-  auto place = platform::CPUPlace();
-
-  for (size_t i = 0; i < in_var_names.size(); ++i) {
-    auto* id_tensor =
-        scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
-    auto& ids = splited_ids[i];
-    if (!ids.empty()) {
-      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
-          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
-      memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
-    }
-  }
-}
-
-typedef std::vector<std::pair<std::string, std::string>> TableAndEndpoints;
-
-void prefetch_core(
-    const std::vector<int64_t>& ids, const TableAndEndpoints& tables,
-    const std::vector<int64_t>& height_sections,
-    const framework::ExecutionContext& context, const framework::Scope& scope,
-    std::unordered_map<int64_t, std::vector<float>>* recved_vec_map) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& actual_ctx = *pool.Get(context.GetPlace());
-
-  std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
-
-  std::vector<std::string> in_var_names;
-  std::vector<std::string> out_var_names;
-  for (size_t i = 0; i < tables.size(); ++i) {
-    in_var_names.push_back("prefetch_send@" + tables[i].second);
-    out_var_names.push_back("prefetch_recv@" + tables[i].second);
-  }
-
-  auto splited_ids = SplitIds(ids, height_sections);
-  SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids,
-                                    local_scope.get());
-
-  // create output var in local scope
-  for (auto& name : out_var_names) {
-    local_scope->Var(name)->GetMutable<framework::LoDTensor>();
-  }
-
-  distributed::RPCClient* rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-          context.Attr<int>("trainer_id"));
-
-  std::vector<distributed::VarHandlePtr> rets;
-  for (size_t i = 0; i < in_var_names.size(); i++) {
-    if (NeedSend(*local_scope.get(), in_var_names[i])) {
-      VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second
-              << " to get " << out_var_names[i] << " back";
-      rets.push_back(rpc_client->AsyncPrefetchVar(
-          tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i],
-          out_var_names[i], tables[i].first));
-    } else {
-      VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
-    }
-  }
-
-  for (size_t i = 0; i < rets.size(); i++) {
-    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-  }
-
-  PADDLE_ENFORCE_EQ(out_var_names.size(), height_sections.size(), "");
-
-  auto abs_sections = ToAbsoluteSection(height_sections);
-  for (size_t section_idx = 0; section_idx < out_var_names.size();
-       ++section_idx) {
-    auto& ids_in_this_section = splited_ids[section_idx];
-    if (!ids_in_this_section.empty()) {
-      auto& prefetch_out_var = local_scope->Var(out_var_names[section_idx])
-                                   ->Get<framework::LoDTensor>();
-      const auto* out_var_data = prefetch_out_var.data<float>();
-      auto& dims = prefetch_out_var.dims();
-
-      PADDLE_ENFORCE_EQ(dims.size(), 2, "");
-      PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
-
-      auto row_numel = dims[1];
-
-      for (int64_t i = 0; i < dims[0]; ++i) {
-        auto id = ids_in_this_section[i];
-        auto origin_id = id + abs_sections[section_idx];
-        std::vector<float> vecs(row_numel);
-        std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin());
-        (*recved_vec_map)[origin_id] = vecs;
-      }
-    } else {
-      VLOG(3) << "ids in this section is empty";
-    }
-  }
-}
-
-void prefetch(const std::string& id_name, const std::string& out_name,
-              const std::string& persistable_var_name, const bool backfill,
-              const std::vector<std::string>& table_names,
-              const std::vector<std::string>& endpoints,
-              const std::vector<int64_t>& height_sections,
-              const framework::ExecutionContext& context,
-              const framework::Scope& scope) {
-  prefetchs({id_name}, {out_name}, persistable_var_name, backfill, table_names,
-            endpoints, height_sections, context, scope);
-}
-
-void prefetchs(const std::vector<std::string>& id_var_names,
-               const std::vector<std::string>& out_var_names,
-               const std::string& persistable_var_name, const bool backfill,
-               const std::vector<std::string>& table_names,
-               const std::vector<std::string>& endpoints,
-               const std::vector<int64_t>& height_sections,
-               const framework::ExecutionContext& context,
-               const framework::Scope& scope) {
-  PADDLE_ENFORCE_GT(id_var_names.size(), 0, "");
-  PADDLE_ENFORCE_EQ(id_var_names.size(), out_var_names.size(), "");
-  PADDLE_ENFORCE_EQ(table_names.size(), endpoints.size(), "");
-  PADDLE_ENFORCE_EQ(table_names.size(), height_sections.size(), "");
-
-  auto* reconstruct_var =
-      scope.FindVar(persistable_var_name)->GetMutable<framework::LoDTensor>();
-  const auto vec_dim_1 = reconstruct_var->dims()[1];
-
-  const auto place =
-      scope.FindVar(id_var_names[0])->Get<framework::LoDTensor>().place();
-
-  if (!platform::is_cpu_place(place)) {
-    PADDLE_THROW("multi prefetch only support CPU currently");
-  }
-
-  std::vector<std::vector<int64_t>> ids_group;
-  std::vector<int64_t> ids_union;
-  std::vector<framework::LoD> ids_lods;
-  TableAndEndpoints tables;
-
-  for (auto& id_name : id_var_names) {
-    auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
-    auto* id_data = id_tensor.data<int64_t>();
-    std::vector<int64_t> ids;
-
-    for (int64_t i = 0; i < id_tensor.numel(); ++i) {
-      ids.push_back(id_data[i]);
-      ids_union.push_back(id_data[i]);
-    }
-    ids_group.push_back(ids);
-    ids_lods.push_back(id_tensor.lod());
-  }
-
-  std::unordered_set<int64_t> s(ids_union.begin(), ids_union.end());
-  ids_union.assign(s.begin(), s.end());
-
-  for (int i = 0; i < table_names.size(); i++) {
-    tables.push_back(std::make_pair(table_names[i], endpoints[i]));
-  }
-
-  std::unordered_map<int64_t, std::vector<float>> recved_vec_map;
-  prefetch_core(ids_union, tables, height_sections, context, scope,
-                &recved_vec_map);
-
-  auto padding_idx = distributed::kNoPadding;
-
-  if (context.HasAttr("padding_idx")) {
-    padding_idx = context.Attr<int64_t>("padding_idx");
-  }
-
-  // copy vectors to out vars
-  for (int i = 0; i < out_var_names.size(); i++) {
-    auto& ids = ids_group[i];
-    auto* out_t =
-        scope.FindVar(out_var_names[i])->GetMutable<framework::LoDTensor>();
-    out_t->Resize(
-        framework::make_ddim({static_cast<int64_t>(ids.size()), vec_dim_1}));
-    out_t->set_lod(ids_lods[i]);
-
-    auto* out_d = out_t->mutable_data<float>(place);
-
-    for (int idx = 0; idx < ids.size(); idx++) {
-      const auto& id = ids[idx];
-
-      if (padding_idx != distributed::kNoPadding && id == padding_idx) {
-        memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1);
-      } else {
-        std::copy_n(recved_vec_map[id].begin(), vec_dim_1,
-                    out_d + idx * vec_dim_1);
-      }
-    }
-  }
-
-  if (backfill) {
-    VLOG(3) << "backfill persistable var's id with vecs";
-
-    auto* reconstruct_d = reconstruct_var->data<float>();
-    for (auto& id : ids_union) {
-      std::copy(recved_vec_map[id].begin(), recved_vec_map[id].end(),
-                reconstruct_d + id * vec_dim_1);
-    }
-  }
-}
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h
deleted file mode 100644
index a531c87f57ca19fe0fd55ea41e833c0d6ff161ae..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-constexpr int64_t kNoPadding = -1;
-
-void prefetchs(const std::vector<std::string>& id_var_names,
-               const std::vector<std::string>& out_var_names,
-               const std::string& persistable_var_name, const bool backfill,
-               const std::vector<std::string>& table_names,
-               const std::vector<std::string>& endpoints,
-               const std::vector<int64_t>& height_sections,
-               const framework::ExecutionContext& context,
-               const framework::Scope& scope);
-
-void prefetch(const std::string& id_name, const std::string& out_name,
-              const std::string& persistable_var_name, const bool backfill,
-              const std::vector<std::string>& table_names,
-              const std::vector<std::string>& endpoints,
-              const std::vector<int64_t>& height_sections,
-              const framework::ExecutionContext& context,
-              const framework::Scope& scope);
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc
deleted file mode 100644
index da73167ae603fb8c8ba9deabe118269891d1f52a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/parameter_recv.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/distributed/parameter_recv.h"
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using LoDTensor = framework::LoDTensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-
-template <typename T>
-void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
-                                  const framework::Scope &scope) {
-  VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name;
-  std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &cpu_ctx = *pool.Get(platform::CPUPlace());
-
-  distributed::RPCClient *rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
-
-  auto *recv_var = scope.FindVar(rpc_ctx.var_name);
-
-  // recv all vars to local scope
-  if (recv_var->IsType<framework::LoDTensor>()) {
-    std::vector<distributed::VarHandlePtr> rets;
-    for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
-      auto &recv_var_name = rpc_ctx.splited_var_names[i];
-      local_scope->Var(recv_var_name);
-      VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i];
-      rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx,
-                                             *local_scope.get(), recv_var_name,
-                                             recv_var_name));
-    }
-    for (size_t i = 0; i < rets.size(); i++) {
-      PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-    }
-  } else {
-    PADDLE_THROW("unsupported var type to recv!");
-  }
-
-  // concat recved tensor into one var
-  {
-    size_t output_offset = 0;
-    size_t row_offset = 0;
-    framework::Tensor *recv_tensor =
-        recv_var->GetMutable<framework::LoDTensor>();
-    auto dev_ctx = paddle::platform::CPUDeviceContext();
-    int64_t recv_numel = 0;
-    for (auto &recv_var_name : rpc_ctx.splited_var_names) {
-      auto *recv_var = local_scope->FindVar(recv_var_name);
-      if (recv_var->IsType<framework::LoDTensor>()) {
-        auto &in = recv_var->Get<framework::LoDTensor>();
-        recv_numel += in.numel();
-        auto in_stride = framework::stride_numel(in.dims());
-        auto out_stride = framework::stride_numel(recv_tensor->dims());
-        StridedNumelCopyWithAxis<T>(
-            dev_ctx, 0, recv_tensor->data<T>() + output_offset, out_stride,
-            in.data<T>(), in_stride, in_stride[0]);
-        output_offset += in_stride[0];
-      } else if (recv_var->IsType<framework::SelectedRows>()) {
-        auto &recv_slr = recv_var->Get<framework::SelectedRows>();
-        auto &recv_dims = recv_tensor->dims();
-        int64_t width = recv_dims[1];
-        recv_numel += recv_slr.height() * width;
-        PADDLE_ENFORCE_EQ(recv_slr.value().dims()[1], width);
-        PADDLE_ENFORCE_EQ(recv_slr.value().dims()[0], recv_slr.rows().size());
-        VLOG(3) << "recv slr " << recv_var_name << " dims "
-                << recv_slr.value().dims();
-        if (VLOG_IS_ON(3)) {
-          std::ostringstream sstream;
-          sstream << "[";
-          for (auto &row_id : recv_slr.rows()) {
-            sstream << row_id << ", ";
-          }
-          sstream << "]";
-          VLOG(3) << "recv_slr size: " << recv_slr.rows().size() << " "
-                  << sstream.str();
-        }
-
-        for (auto i = 0; i < recv_slr.rows().size(); ++i) {
-          auto row_id = recv_slr.rows()[i] + row_offset;
-          PADDLE_ENFORCE_LT(row_id, recv_dims[0]);
-          memcpy(recv_tensor->data<T>() + row_id * width,
-                 recv_slr.value().data<T>() + i * width, sizeof(T) * width);
-        }
-        row_offset += recv_slr.height();
-      } else {
-        PADDLE_THROW("unsupported recieved var type");
-      }
-    }
-    auto numel = recv_tensor->numel();
-    if (recv_numel != numel) {
-      LOG(FATAL) << "recv_numel: " << recv_numel << " acture numel: " << numel;
-    }
-    PADDLE_ENFORCE_EQ(recv_numel, numel);
-  }
-
-  VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name;
-}
-
-template struct ParameterRecv<float>;
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h
deleted file mode 100644
index e955fca7250ecc88f3b1a08611f380da50df788d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/parameter_recv.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/distributed/rpc_common.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-template <typename T>
-struct ParameterRecv {
-  void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope);
-};
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc
deleted file mode 100644
index dfabad567af590b65b9e777824d476fce2b17238..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/parameter_send.cc
+++ /dev/null
@@ -1,175 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/distributed/parameter_send.h"
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using LoDTensor = framework::LoDTensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-
-template <typename T>
-void ParameterSend<T>::operator()(const RpcContext &rpc_ctx,
-                                  const framework::Scope &scope, bool sync) {
-  std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &cpu_ctx = *pool.Get(platform::CPUPlace());
-
-  distributed::RPCClient *rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
-
-  auto *send_var = scope.FindVar(rpc_ctx.var_name);
-  size_t out_num = rpc_ctx.splited_var_names.size();
-  if (send_var->IsType<framework::LoDTensor>()) {
-    if (out_num > 1) {
-      auto &send_tensor = send_var->Get<framework::LoDTensor>();
-      auto &send_tensor_dims = send_tensor.dims();
-      std::vector<framework::DDim> outs_dims;
-      outs_dims.reserve(out_num);
-
-      // infer output shape
-      PADDLE_ENFORCE_EQ(rpc_ctx.height_sections.size(), out_num,
-                        "tensor split sections size"
-                        "should be equal to output size.");
-      for (size_t i = 0; i < out_num; ++i) {
-        auto dim = send_tensor_dims;
-        dim[0] = rpc_ctx.height_sections[i];
-        outs_dims.push_back(dim);
-      }
-
-      // create output var in local scope
-      size_t row_offset = 0;
-      for (auto i = 0; i < out_num; ++i) {
-        framework::Tensor *out = local_scope->Var(rpc_ctx.splited_var_names[i])
-                                     ->GetMutable<framework::LoDTensor>();
-        *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]);
-        row_offset += outs_dims[i][0];
-      }
-    }
-  } else if (send_var->IsType<framework::SelectedRows>()) {
-    auto &send_slr = send_var->Get<framework::SelectedRows>();
-    auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections);
-
-    auto &send_rows = send_slr.rows();
-    std::vector<std::vector<size_t>> outs_rows_idx;
-    std::vector<std::vector<size_t>> outs_dense_idx;
-
-    outs_rows_idx.resize(out_num);
-    outs_dense_idx.resize(out_num);
-
-    auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0];
-    auto *src = send_slr.value().data<T>();
-
-    // create output var in local scope
-    std::vector<framework::SelectedRows *> outs;
-    for (auto &name : rpc_ctx.splited_var_names) {
-      auto *out = local_scope->Var(name)->GetMutable<framework::SelectedRows>();
-      outs.push_back(out);
-    }
-
-    // split rows index into output sparse vars
-    for (size_t i = 0; i < send_rows.size(); ++i) {
-      size_t out_idx = GetSectionIndex(send_rows[i], abs_sections);
-      outs_rows_idx[out_idx].push_back(send_rows[i]);
-      outs_dense_idx[out_idx].push_back(i);
-    }
-    auto place = platform::CPUPlace();
-
-    for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
-      auto rows_idx = outs_rows_idx[i];
-      outs[i]->set_height(rpc_ctx.height_sections[i]);
-      auto dims = send_slr.GetCompleteDims();
-      dims[0] = rows_idx.size();
-      outs[i]->mutable_rows()->clear();
-      outs[i]->mutable_value()->mutable_data<T>(dims, send_slr.place());
-      if (rows_idx.size() > 0) {
-        for (auto idx : rows_idx) {
-          outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
-        }
-        auto dst = outs[i]->mutable_value()->mutable_data<T>(place);
-        for (size_t j = 0; j < rows_idx.size(); j++) {
-          if (platform::is_cpu_place(place)) {
-            memory::Copy(
-                platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(),
-                src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel);
-          } else {
-            PADDLE_THROW("do not support GPU now");
-            /*
-            #ifdef PADDLE_WITH_CUDA
-                        auto stream = ctx.cuda_device_context().stream();
-                        memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
-                                     platform::CUDAPlace(),
-                                     src + outs_dense_idx[i][j] * row_numel,
-                                     sizeof(T) * row_numel, stream);
-            #else
-                        PADDLE_THROW("Paddle is not compiled with GPU");
-            #endif
-            */
-          }
-        }
-      }
-      PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
-                        "rows should has the same size with tensor dim 0");
-    }
-
-  } else {
-    PADDLE_THROW("unsupported var type to send!");
-  }
-
-  std::vector<distributed::VarHandlePtr> rets;
-  for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
-    auto &send_var_name = rpc_ctx.splited_var_names[i];
-    auto &endpoint = rpc_ctx.epmap[i];
-    if (NeedSend(*local_scope.get(), send_var_name)) {
-      VLOG(3) << "sending " << send_var_name << " to " << endpoint;
-      rets.push_back(rpc_client->AsyncSendVar(
-          endpoint, cpu_ctx, *local_scope.get(), send_var_name));
-    } else {
-      VLOG(3) << "don't send non-initialized variable: "
-              << rpc_ctx.splited_var_names[i];
-    }
-  }
-
-  if (sync) {
-    for (auto &handle : rets) {
-      PADDLE_ENFORCE(handle->Wait(), "internal error in RPCClient");
-    }
-  }
-}
-
-template struct ParameterSend<float>;
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h
deleted file mode 100644
index 9077f4a4fb9fd9d7152e8be72519f16b1999e93d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/parameter_send.h
+++ /dev/null
@@ -1,35 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/distributed/rpc_common.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-template <typename T>
-struct ParameterSend {
-  void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope,
-                  bool sync);
-};
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h
deleted file mode 100644
index e9f06f54327875c0568c571627e9effb998e15be..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/proto_encoder_helper.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-//       (https://github.com/tensorflow/tensorflow/) we borrow this
-//       file and did some modifications so that we can send gRPC
-//       requests without too much copying of the tensor data.
-
-#pragma once
-
-#include <string>
-
-#include "grpc++/grpc++.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-char* EncodeVarint32(char* dst, uint32_t v) {
-  // Operate on characters as unsigneds
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
-  static const int B = 128;
-  if (v < (1 << 7)) {
-    *(ptr++) = v;
-  } else if (v < (1 << 14)) {
-    *(ptr++) = v | B;
-    *(ptr++) = v >> 7;
-  } else if (v < (1 << 21)) {
-    *(ptr++) = v | B;
-    *(ptr++) = (v >> 7) | B;
-    *(ptr++) = v >> 14;
-  } else if (v < (1 << 28)) {
-    *(ptr++) = v | B;
-    *(ptr++) = (v >> 7) | B;
-    *(ptr++) = (v >> 14) | B;
-    *(ptr++) = v >> 21;
-  } else {
-    *(ptr++) = v | B;
-    *(ptr++) = (v >> 7) | B;
-    *(ptr++) = (v >> 14) | B;
-    *(ptr++) = (v >> 21) | B;
-    *(ptr++) = v >> 28;
-  }
-  return reinterpret_cast<char*>(ptr);
-}
-
-char* EncodeVarint64(char* dst, uint64_t v) {
-  static const int B = 128;
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
-  while (v >= B) {
-    *(ptr++) = (v & (B - 1)) | B;
-    v >>= 7;
-  }
-  *(ptr++) = static_cast<unsigned char>(v);
-  return reinterpret_cast<char*>(ptr);
-}
-
-int VarintLength(uint64_t v) {
-  int len = 1;
-  while (v >= 128) {
-    v >>= 7;
-    len++;
-  }
-  return len;
-}
-
-class ProtoEncodeHelper {
- public:
-  ProtoEncodeHelper(char* buf, int max_size)
-      : base_(buf), p_(buf), limit_(base_ + max_size) {}
-
-  ~ProtoEncodeHelper() {
-#define REPLACE_ENFORCE_GLOG 1
-    // Make sure callers didn't do operations that went over max_size promised
-    if (paddle::platform::is_error(p_ <= limit_)) {
-      paddle::platform::throw_on_error(p_ <= limit_, "");
-    }
-#undef REPLACE_ENFORCE_GLOG
-  }
-
-  const char* data() const { return base_; }
-  size_t size() const { return p_ - base_; }
-
-  void WriteUint64(int tag, uint64_t v) {
-    Encode32(combine(tag, WIRETYPE_VARINT));
-    Encode64(v);
-  }
-  void WriteBool(int tag, bool v) {
-    Encode32(combine(tag, WIRETYPE_VARINT));
-    EncodeBool(v);
-  }
-  void WriteString(int tag, const std::string& v) {
-    Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
-    Encode32(v.size());
-    EncodeBytes(v.data(), v.size());
-  }
-  void WriteVarlengthBeginning(int tag, uint32_t len) {
-    Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
-    Encode32(len);
-  }
-  void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); }
-
- private:
-  // Note: this module's behavior must match the protocol buffer wire encoding
-  // format.
-  enum {
-    WIRETYPE_VARINT = 0,
-    WIRETYPE_LENGTH_DELIMITED = 2,
-  };
-  static uint32_t combine(uint32_t tag, uint32_t type) {
-    return ((tag << 3) | type);
-  }
-  inline void Encode32(uint32_t v) {
-    if (v < 128) {
-      // Fast path for single-byte values.  Many of the calls will use a
-      // constant value for v, so the comparison will get optimized away
-      // when Encode32 is inlined into the caller.
-      *p_ = v;
-      p_++;
-    } else {
-      p_ = EncodeVarint32(p_, v);
-    }
-  }
-  void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); }
-  void EncodeBool(bool v) {
-    *p_ = (v ? 1 : 0);  // Equal to varint32 encoding of 0 or 1
-    p_++;
-  }
-  void EncodeBytes(const char* bytes, int N) {
-    memcpy(p_, bytes, N);
-    p_ += N;
-  }
-
-  char* base_;
-  char* p_;
-  char* limit_;  // Just for CHECKs
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
deleted file mode 100644
index 22083d92ed42f0e4f13768b0fa4d3254171c0d4d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <time.h>
-#include <condition_variable>  // NOLINT
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-constexpr char kRequestSend[] = "RequestSend";
-constexpr char kRequestGet[] = "RequestGet";
-constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable";
-constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier";
-constexpr char kRequestPrefetch[] = "RequestPrefetch";
-constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
-constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
-constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier";
-
-constexpr char kSendRPC[] = "SendRPC";
-constexpr char kGetRPC[] = "GetRPC";
-constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC";
-constexpr char kGetMonomerRPC[] = "GetMonomerRPC";
-constexpr char kPrefetchRPC[] = "PrefetchRPC";
-constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC";
-constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC";
-constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC";
-constexpr char kSendCompleteRPC[] = "SendCompleteRPC";
-constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
-
-#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
-#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
-#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
-#define COMPLETE_MESSAGE "COMPLETE@RECV"
-#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV"
-
-#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
-#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
-
-class RPCServer;
-
-class VarHandle {
- public:
-  VarHandle(const std::string ep, const std::string& method,
-            const std::string& name,
-            const platform::DeviceContext* p_ctx = nullptr,
-            const framework::Scope* p_scope = nullptr)
-      : status_(kDefaultState) {
-    ep_ = ep;
-    ctx_ = p_ctx;
-    scope_ = p_scope;
-    name_ = name;
-    method_ = method;
-  }
-
-  virtual ~VarHandle() {}
-
- public:
-  bool should_retry = false;
-
-  bool Wait() {
-    int ret = kDefaultState;
-    {
-      std::unique_lock<std::mutex> lk(sync_mutex_);
-      wait_cond_.wait(lk, [this] { return status_ != kDefaultState; });
-      ret = status_;
-    }
-    VLOG(7) << "VarHandle wait:" << ret;
-    return ret != kErrorState;
-  }
-
-  void Finish(bool ok) {
-    {
-      std::unique_lock<std::mutex> lk(sync_mutex_);
-      status_ = ok ? kFinishState : kErrorState;
-    }
-    VLOG(7) << "VarHandle finish:" << ok;
-    wait_cond_.notify_all();
-  }
-
-  std::string String() const {
-    std::ostringstream s;
-    s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:["
-      << status_ << "]";
-    return s.str();
-  }
-
-  std::string ep() const { return ep_; }
-  const platform::DeviceContext* ctx() const { return ctx_; }
-  const framework::Scope* scope() const { return scope_; }
-  std::string name() const { return name_; }
-  std::string method() const { return method_; }
-
- protected:
-  // RPC endpoint.
-  std::string ep_;
-  const platform::DeviceContext* ctx_;
-  const framework::Scope* scope_;
-  // Variable name.
-  std::string name_;
-  // RPC method name.
-  std::string method_;
-
- protected:
-  std::mutex sync_mutex_;
-  std::condition_variable wait_cond_;
-
-  enum VarHandleStatus {
-    kDefaultState = -1,
-    kErrorState = 0,
-    kFinishState = 1,
-  };
-  VarHandleStatus status_;
-
- private:
-  DISABLE_COPY_AND_ASSIGN(VarHandle);
-};
-
-typedef std::shared_ptr<VarHandle> VarHandlePtr;
-
-class RequestHandler {
- public:
-  explicit RequestHandler(bool sync_mode)
-      : sync_mode_(sync_mode),
-        dev_ctx_(nullptr),
-        executor_(nullptr),
-        scope_(nullptr),
-        program_(nullptr),
-        rpc_server_(nullptr) {}
-
-  virtual ~RequestHandler() {}
-
-  // Set attributes.
-  void SetScope(framework::Scope* scope) { scope_ = scope; }
-  void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
-  void SetProgram(framework::ProgramDesc* program) { program_ = program; }
-  void SetExecutor(framework::Executor* executor) { executor_ = executor; }
-
-  // Used for dist lookup table prefetch
-  void SetPrefetchPreparedCtx(
-      std::unordered_map<
-          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
-    prefetch_var_name_to_prepared_ctx_ = g;
-  }
-
-  void SetCheckpointNotifyPreparedCtx(
-      std::shared_ptr<framework::ExecutorPrepareContext> g) {
-    checkpoint_prepared_ctx_ = g;
-  }
-
-  // Used for async.
-  void SetGradToPreparedCtx(
-      std::unordered_map<
-          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
-    grad_to_prepared_ctx_ = g;
-  }
-
-  void SetSparseGradToParam(std::unordered_map<std::string, std::string>* g) {
-    sparse_grad_to_param_ = g;
-  }
-
-  void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; }
-
-  // Get attributes.
-  bool sync_mode() { return sync_mode_; }
-  framework::Scope* scope() { return scope_; }
-  const platform::DeviceContext* dev_ctx() { return dev_ctx_; }
-  framework::ProgramDesc* program() { return program_; }
-  framework::Executor* executor() { return executor_; }
-
-  // This function processes user's rpc request.
-  // The implemention is in request_handler_impl.
-  // example:
-  //    std::string varname = request_.varname();
-  //
-  //    auto scope = request_handler_->scope();
-  //    auto invar = scope->FindVar(varname);
-  //    framework::Variable* outvar = nullptr;
-  //
-  //    request_handler_->Handle(varname, scope, invar, &outvar);
-  //    if (outvar) {
-  //        SerializeToByteBuffer(varname, outvar,
-  //           *request_handler_->dev_ctx(), &reply_);
-  //    }
-  virtual bool Handle(const std::string& varname, framework::Scope* scope,
-                      framework::Variable* var, framework::Variable** outvar,
-                      const int trainer_id,
-                      const std::string& out_var_name = "",
-                      const std::string& table_name = "") = 0;
-
- protected:
-  const bool sync_mode_;
-
-  const platform::DeviceContext* dev_ctx_;
-  framework::Executor* executor_;
-  framework::Scope* scope_;
-  framework::ProgramDesc* program_;
-
-  // used for distribute lookup table prefetch
-  std::unordered_map<std::string,
-                     std::shared_ptr<framework::ExecutorPrepareContext>>*
-      prefetch_var_name_to_prepared_ctx_;
-  // used for checkpoint notify
-  std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_prepared_ctx_;
-
-  // Used for async.
-  std::unordered_map<std::string,
-                     std::shared_ptr<framework::ExecutorPrepareContext>>*
-      grad_to_prepared_ctx_;
-  std::unordered_map<std::string, std::string>* sparse_grad_to_param_;
-
-  RPCServer* rpc_server_;
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
deleted file mode 100644
index c2368ab10ebcc6c7972e2bf6abf017b140356772..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/string/piece.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
-// to directory specified.
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
-
-bool RequestSendHandler::Handle(const std::string& varname,
-                                framework::Scope* scope,
-                                framework::Variable* invar,
-                                framework::Variable** outvar,
-                                const int trainer_id,
-                                const std::string& out_var_name,
-                                const std::string& table_name) {
-  VLOG(4) << "RequestSendHandler:" << varname;
-
-  // Sync
-  if (varname == BATCH_BARRIER_MESSAGE) {
-    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
-    rpc_server_->IncreaseBatchBarrier(kRequestSend);
-  } else if (varname == COMPLETE_MESSAGE) {
-    VLOG(3) << "sync: recv complete message";
-    rpc_server_->Complete();
-  } else {
-    // Async
-    if (!sync_mode_) {
-      VLOG(3) << "async process var: " << varname;
-      if (varname == BATCH_BARRIER_MESSAGE) {
-        PADDLE_THROW(
-            "async mode should not recv BATCH_BARRIER_MESSAGE or "
-            "COMPLETE_MESSAGE");
-      }
-      if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(varname)) {
-        auto& grad_slr =
-            scope->FindVar(varname)->Get<framework::SelectedRows>();
-        AsyncSparseParamUpdateRecorder::GetInstance()->Update(varname,
-                                                              grad_slr.rows());
-      }
-      executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
-                                    scope);
-      return true;
-    } else {  // sync
-      rpc_server_->WaitCond(kRequestSend);
-      VLOG(3) << "sync: processing received var: " << varname;
-
-      if (invar == nullptr) {
-        LOG(FATAL) << "sync: Can not find server side var: " << varname;
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-bool RequestGetHandler::Handle(const std::string& varname,
-                               framework::Scope* scope,
-                               framework::Variable* invar,
-                               framework::Variable** outvar,
-                               const int trainer_id,
-                               const std::string& out_var_name,
-                               const std::string& table_name) {
-  VLOG(3) << "RequestGetHandler:" << varname
-          << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id
-          << " table_name: " << table_name;
-
-  if (sync_mode_) {
-    if (varname == FETCH_BARRIER_MESSAGE) {
-      VLOG(3) << "sync: recv fetch barrier message";
-      rpc_server_->IncreaseBatchBarrier(kRequestGet);
-    } else {
-      rpc_server_->WaitCond(kRequestGet);
-      *outvar = scope_->FindVar(varname);
-    }
-  } else {
-    if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) {
-      if (enable_dc_asgd_) {
-        // NOTE: the format is determined by distribute_transpiler.py
-        std::string param_bak_name =
-            string::Sprintf("%s.trainer_%d_bak", varname, trainer_id);
-        VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id;
-        auto var = scope_->FindVar(varname);
-        auto t_orig = var->Get<framework::LoDTensor>();
-        auto param_bak = scope_->Var(param_bak_name);
-        auto t = param_bak->GetMutable<framework::LoDTensor>();
-        t->mutable_data(dev_ctx_->GetPlace(), t_orig.type());
-        VLOG(3) << "copying " << varname << " to " << param_bak_name;
-        framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
-      }
-      *outvar = scope_->FindVar(varname);
-    }
-  }
-  return true;
-}
-
-bool RequestGetNoBarrierHandler::Handle(const std::string& varname,
-                                        framework::Scope* scope,
-                                        framework::Variable* invar,
-                                        framework::Variable** outvar,
-                                        const int trainer_id,
-                                        const std::string& out_var_name,
-                                        const std::string& table_name) {
-  VLOG(4) << "RequestGetNoBarrierHandler:" << varname
-          << " out_var_name: " << out_var_name;
-
-  // get var from pserver immediately without barriers
-  string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE);
-  string::Piece var_name_piece = string::Piece(varname);
-
-  if (string::Contains(var_name_piece, without_barrier_piece)) {
-    var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece);
-    VLOG(4) << "Get var " << var_name_piece << " with "
-            << WITHOUT_BARRIER_MESSAGE;
-    *outvar = scope_->FindVar(var_name_piece.ToString());
-    return true;
-  } else {
-    PADDLE_THROW("GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE);
-  }
-  return true;
-}
-
-bool RequestPrefetchHandler::Handle(const std::string& varname,
-                                    framework::Scope* scope,
-                                    framework::Variable* invar,
-                                    framework::Variable** outvar,
-                                    const int trainer_id,
-                                    const std::string& out_var_name,
-                                    const std::string& table_name) {
-  VLOG(4) << "RequestPrefetchHandler " << varname;
-
-  if (table_name.empty()) {
-    auto var_desc = program_->Block(0).FindVar(out_var_name);
-    InitializeVariable(*outvar, var_desc->GetType());
-    executor_->RunPreparedContext(
-        (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
-  } else {
-    (*outvar)->GetMutable<framework::LoDTensor>();
-    auto lookup_table_op =
-        BuildLookupTableOp(table_name, varname, out_var_name);
-    paddle::platform::CPUPlace cpu_place;
-    lookup_table_op->Run(*scope, cpu_place);
-  }
-  return true;
-}
-
-bool RequestCheckpointHandler::Handle(const std::string& varname,
-                                      framework::Scope* scope,
-                                      framework::Variable* invar,
-                                      framework::Variable** outvar,
-                                      const int trainer_id,
-                                      const std::string& out_var_name,
-                                      const std::string& table_name) {
-  PADDLE_ENFORCE(
-      checkpoint_notify_id != -1,
-      "when checkpoint_notify_id = -1, there should be no RPC invoke.");
-
-  // TODO(tangwei12): find out why scope will be error.
-  auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
-  lt_var->clear();
-  lt_var->append(out_var_name);
-  VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
-          << out_var_name;
-  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_);
-  return true;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
deleted file mode 100644
index f3c1b24526b8b28033c0c979f74d44a3d7a94201..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <time.h>
-
-#include <functional>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-class RequestSendHandler final : public RequestHandler {
- public:
-  explicit RequestSendHandler(bool sync_mode, bool enable_dc_asgd = false)
-      : RequestHandler(sync_mode) {
-    enable_dc_asgd_ = enable_dc_asgd;
-  }
-  virtual ~RequestSendHandler() {}
-  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id, const std::string& out_var_name = "",
-              const std::string& table_name = "") override;
-
- private:
-  bool enable_dc_asgd_;
-};
-
-class RequestGetHandler final : public RequestHandler {
- public:
-  explicit RequestGetHandler(bool sync_mode, bool enable_dc_asgd = false)
-      : RequestHandler(sync_mode) {
-    enable_dc_asgd_ = enable_dc_asgd;
-  }
-  virtual ~RequestGetHandler() {}
-  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id, const std::string& out_var_name = "",
-              const std::string& table_name = "") override;
-
- private:
-  bool enable_dc_asgd_;
-};
-
-class RequestGetNoBarrierHandler final : public RequestHandler {
- public:
-  RequestGetNoBarrierHandler() : RequestHandler(false) {}
-  virtual ~RequestGetNoBarrierHandler() {}
-  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id, const std::string& out_var_name = "",
-              const std::string& table_name = "") override;
-};
-
-static inline void BuildVar(const std::string& param_name,
-                            std::initializer_list<const char*> arguments,
-                            paddle::framework::proto::OpDesc::Var* var) {
-  var->set_parameter(param_name);
-  for (auto& arg_name : arguments) {
-    *var->mutable_arguments()->Add() = arg_name;
-  }
-}
-
-class RequestPrefetchHandler final : public RequestHandler {
- public:
-  explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
-  virtual ~RequestPrefetchHandler() {}
-  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id, const std::string& out_var_name = "",
-              const std::string& table_name = "") override;
-
- private:
-  std::unique_ptr<paddle::framework::OperatorBase> BuildLookupTableOp(
-      const std::string& table_name, const std::string& id_name,
-      const std::string& out_name) {
-    paddle::framework::proto::OpDesc op_desc;
-    op_desc.set_type("lookup_table");
-    BuildVar("W", {table_name.data()}, op_desc.add_inputs());
-    BuildVar("Ids", {id_name.data()}, op_desc.add_inputs());
-    BuildVar("Out", {out_name.data()}, op_desc.add_outputs());
-
-    auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-    return op;
-  }
-};
-
-class RequestCheckpointHandler final : public RequestHandler {
- public:
-  explicit RequestCheckpointHandler(bool sync_mode, int checkpoint_notify_id)
-      : RequestHandler(sync_mode) {
-    this->checkpoint_notify_id = checkpoint_notify_id;
-  }
-  virtual ~RequestCheckpointHandler() {}
-  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id, const std::string& out_var_name = "",
-              const std::string& table_name = "") override;
-
- private:
-  int checkpoint_notify_id;
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc
deleted file mode 100644
index 57ce54870decf2d56c321efbaddbc108fb113ea7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/rpc_client.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "gflags/gflags.h"
-
-// default to 3min to avoid temprary network failures.
-DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc");
-DEFINE_int32(rpc_retry_times, 3, "retry times for rpc");
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-std::once_flag RPCClient::init_flag_;
-std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
-int RPCClient::trainer_id_ = 0;
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
deleted file mode 100644
index d0b971e0cb1bde477fed9264b5ecee7b249a2c09..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <condition_variable>  // NOLINT
-#include <memory>
-#include <string>
-#include "gflags/gflags.h"
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-
-DECLARE_int32(rpc_deadline);
-DECLARE_int32(rpc_retry_times);
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-class RPCClient {
- public:
-  RPCClient() {}
-  virtual ~RPCClient() {}
-  virtual VarHandlePtr AsyncSendVar(const std::string& ep,
-                                    const platform::DeviceContext& ctx,
-                                    const framework::Scope& scope,
-                                    const std::string& var_name,
-                                    int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual VarHandlePtr AsyncGetVar(const std::string& ep,
-                                   const platform::DeviceContext& ctx,
-                                   const framework::Scope& scope,
-                                   const std::string& var_name,
-                                   const std::string& out_varname,
-                                   const std::string& table_name = "",
-                                   int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual VarHandlePtr AsyncGetVarNoBarrier(
-      const std::string& ep, const platform::DeviceContext& ctx,
-      const framework::Scope& scope, const std::string& var_name,
-      const std::string& out_varname,
-      int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual VarHandlePtr AsyncGetMonomerVariable(
-      const std::string& ep, const platform::DeviceContext& ctx,
-      const framework::Scope& scope, const std::string& var_name,
-      int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual VarHandlePtr AsyncPrefetchVar(
-      const std::string& ep, const platform::DeviceContext& ctx,
-      const framework::Scope& scope, const std::string& in_var_name,
-      const std::string& out_var_name, const std::string& table_name = "",
-      int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual VarHandlePtr AsyncSendBatchBarrier(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual VarHandlePtr AsyncSendFetchBarrier(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual VarHandlePtr AsyncGetMonomerBarrier(
-      const std::string& ep, const std::string& var_name,
-      int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual VarHandlePtr AsyncCheckpointNotify(
-      const std::string& ep, const std::string& dir,
-      int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual VarHandlePtr AsyncSendComplete(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  // Complete tells all the pserver instances that finishe the training,
-  // the pserver can reduce it's barrier count, and continue to train
-  // with other trainers.
-  virtual void SendComplete() = 0;
-
-  virtual bool Wait() = 0;
-
-  template <typename T>
-  static RPCClient* GetInstance(int trainer_id) {
-    std::call_once(init_flag_, &RPCClient::Init<T>, trainer_id);
-    return rpc_client_.get();
-  }
-
-  // Init is called by GetInstance.
-  template <typename T>
-  static void Init(int trainer_id) {
-    VLOG(0) << "init rpc client with trainer_id " << trainer_id;
-    trainer_id_ = trainer_id;
-    if (rpc_client_.get() == nullptr) {
-      rpc_client_.reset(new T());
-      rpc_client_->InitImpl();
-    }
-  }
-
-  virtual void InitImpl() {}
-
- protected:
-  // each trainer have exact one trainer id, it should be static
-  static int trainer_id_;
-
- private:
-  static std::once_flag init_flag_;
-  static std::unique_ptr<RPCClient> rpc_client_;
-};
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h
deleted file mode 100644
index eb127bf4ad5a5c9a28210e2fbcdb69b07543f4b9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/rpc_common.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-struct RpcContext {
-  RpcContext() = default;
-
-  RpcContext(const std::string &name, const std::vector<std::string> &names,
-             const std::vector<std::string> &emap,
-             const std::vector<int64_t> &sections, int id)
-      : var_name(name),
-        splited_var_names(names),
-        epmap(emap),
-        height_sections(sections),
-        trainer_id(id) {}
-
-  RpcContext(const RpcContext &ctx) {
-    var_name = ctx.var_name;
-    splited_var_names = ctx.splited_var_names;
-    epmap = ctx.epmap;
-    height_sections = ctx.height_sections;
-    trainer_id = ctx.trainer_id;
-  }
-
-  std::string var_name;
-  std::vector<std::string> splited_var_names;
-  std::vector<std::string> epmap;
-  std::vector<int64_t> height_sections;
-  int trainer_id;
-};
-
-inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) {
-  os << "{";
-  os << "var_name: " << rpc_ctx.var_name << "\n";
-
-  os << "splited_var_names: [";
-  for (auto &name : rpc_ctx.splited_var_names) {
-    os << name << ", ";
-  }
-  os << "]\n";
-
-  os << "epmap: [";
-  for (auto &ep : rpc_ctx.epmap) {
-    os << ep << ", ";
-  }
-  os << "]\n";
-
-  os << "height_sections: [";
-  for (auto &section : rpc_ctx.height_sections) {
-    os << section << ", ";
-  }
-  os << "]\n";
-  os << "}";
-  return os;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
deleted file mode 100644
index c3a46e348c69a20953f013c7de772a37db5f4844..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ /dev/null
@@ -1,234 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-
-#include <fstream>
-#include <iostream>
-#include <limits>
-#include <string>
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-void RPCServer::ShutDown() {
-  LOG(INFO) << "RPCServer ShutDown ";
-  ShutDownImpl();
-
-  exit_flag_ = true;
-  barrier_cond_.notify_all();
-  rpc_cond_.notify_all();
-}
-
-void RPCServer::SavePort() const {
-  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
-  std::ofstream port_file;
-  port_file.open(file_path);
-  port_file << selected_port_;
-  port_file.close();
-  VLOG(3) << "selected port written to " << file_path;
-}
-
-void RPCServer::WaitBarrier(const std::string& rpc_name) {
-  VLOG(3) << "WaitBarrier in: " << rpc_name;
-  std::unique_lock<std::mutex> lock(this->mutex_);
-  barrier_cond_.wait(lock, [this, &rpc_name] {
-    return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) ||
-            exit_flag_.load());
-  });
-
-  VLOG(3) << "WaitBarrier out: " << rpc_name
-          << " counter: " << barrier_counter_[rpc_name];
-}
-
-void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
-  VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
-  // barrier msg should make sure that it's in the right cond(send|recv)
-  WaitCond(rpc_name);
-  int b = 0;
-  std::unique_lock<std::mutex> lock(mutex_);
-  b = ++barrier_counter_[rpc_name];
-  VLOG(3) << rpc_name << " barrier_counter: " << b;
-  if (b >= client_num_) {
-    lock.unlock();
-    VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for "
-            << rpc_name;
-    barrier_cond_.notify_all();
-    lock.lock();
-  }
-}
-
-void RPCServer::Complete() {
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    client_num_--;
-    need_reset_all_vars_ = true;
-
-    VLOG(3) << "decrease client_num to: " << client_num_;
-    if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
-      barrier_counter_[kRequestGet]--;
-    }
-  }
-  barrier_cond_.notify_all();
-}
-
-bool RPCServer::NeedResetAllVars() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  return need_reset_all_vars_;
-}
-
-int RPCServer::GetClientNum() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  return client_num_;
-}
-
-void RPCServer::ResetBarrierCounter() {
-  VLOG(3) << "RPCServer ResetBarrierCounter ";
-  std::unique_lock<std::mutex> lock(mutex_);
-  for (auto& t : barrier_counter_) {
-    t.second = 0;
-  }
-  need_reset_all_vars_ = false;
-}
-
-void RPCServer::RegisterRPC(const std::string& rpc_name,
-                            RequestHandler* handler, int thread_num) {
-  rpc_call_map_[rpc_name] = handler;
-  rpc_thread_num_[rpc_name] = thread_num;
-
-  static int cond = -1;
-  rpc_cond_map_[rpc_name] = ++cond;
-  VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler
-          << ", cond: " << rpc_cond_map_[rpc_name];
-}
-
-void RPCServer::SetCond(const std::string& rpc_name) {
-  VLOG(3) << "RPCServer SetCond " << rpc_name;
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    cur_cond_ = rpc_cond_map_[rpc_name];
-  }
-
-  rpc_cond_.notify_all();
-}
-
-void RPCServer::WaitCond(const std::string& rpc_name) {
-  VLOG(3) << "RPCServer WaitCond in " << rpc_name;
-  int cond = 0;
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    cond = rpc_cond_map_[rpc_name];
-  }
-
-  std::unique_lock<std::mutex> lock(mutex_);
-  rpc_cond_.wait(
-      lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
-  VLOG(3) << "RPCServer WaitCond out " << rpc_name;
-}
-
-void RPCServer::RegisterVar(const std::string& var_name,
-                            const std::string& rpc_name,
-                            framework::Scope* scope,
-                            platform::DeviceContext* dev_ctx) {
-  MonomerHandle h;
-  h.var_name_ = var_name;
-  h.rpc_name_ = rpc_name;
-  h.scope_ = scope;
-  h.dev_ctx_ = dev_ctx;
-
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    if (var_map_.find(var_name) != var_map_.end()) {
-      PADDLE_ENFORCE(false, "%s alreay in var_map", var_name);
-    }
-    var_map_[var_name] = h;
-  }
-
-  rpc_cond_.notify_all();
-  VLOG(3) << "RegisterVar context:" << h.String();
-}
-
-void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
-  int b = 0;
-  MonomerHandle h;
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    b = ++var_map_[var_name].barrier_;
-    h = var_map_[var_name];
-  }
-
-  if (b >= client_num_) {
-    barrier_cond_.notify_all();
-  }
-
-  VLOG(3) << "IncreaseVarBarrier context:" << h.String();
-}
-
-void RPCServer::WaitVarBarrier(const std::string& var_name) {
-  VLOG(3) << "WaitVarBarrier var_name:" << var_name;
-
-  std::unique_lock<std::mutex> lock(mutex_);
-  barrier_cond_.wait(lock, [&]() {
-    return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) ||
-            exit_flag_.load());
-  });
-
-  VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String();
-}
-
-void RPCServer::SetVarCond(const std::string& var_name) {
-  VLOG(3) << "SetVarCond var_name:" << var_name;
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    if (var_map_.find(var_name) != var_map_.end()) {
-      rpc_cond_.notify_all();
-    }
-  }
-}
-
-void RPCServer::WaitVarCond(const std::string& var_name) {
-  VLOG(3) << "WaitVarCond var_name:" << var_name;
-
-  std::unique_lock<std::mutex> lock(mutex_);
-  rpc_cond_.wait(lock, [=] {
-    return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
-  });
-
-  VLOG(3) << "WaitVarCond var_name:" << var_name << " end";
-}
-
-MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
-  MonomerHandle h;
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    h = var_map_[var_name];
-  }
-
-  return h;
-}
-
-void RPCServer::ClearRegisteredVars() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  var_map_.clear();
-}
-
-void RPCServer::ClearVar(const std::string& var_name) {
-  std::unique_lock<std::mutex> lock(mutex_);
-  var_map_.erase(var_name);
-}
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
deleted file mode 100644
index 8c7b7f1d7eeeced24d2ade2bcff4261b24587624..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <atomic>
-#include <set>
-#include <string>
-#include <thread>  // NOLINT
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-struct MonomerHandle {
-  std::string var_name_;
-  std::string rpc_name_;
-  framework::Scope* scope_{nullptr};
-  platform::DeviceContext* dev_ctx_{nullptr};
-  int64_t barrier_{0};
-
-  std::string String() {
-    std::stringstream ss;
-    ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_
-       << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_
-       << ", barrier_:" << barrier_;
-    return ss.str();
-  }
-};
-
-class RPCServer {
- public:
-  explicit RPCServer(const std::string& address, int client_num)
-      : cur_cond_(0),
-        bind_address_(address),
-        exit_flag_(false),
-        selected_port_(0),
-        client_num_(client_num),
-        need_reset_all_vars_(false) {}
-
-  virtual ~RPCServer() {}
-  virtual void StartServer() = 0;
-  virtual void WaitServerReady() = 0;
-
-  void ShutDown();
-
-  bool IsExit() { return exit_flag_.load(); }
-
-  int GetSelectedPort() const { return selected_port_; }
-
-  int GetClientNum();
-
-  void SavePort() const;
-
-  // RegisterRPC, register the rpc method name to a handler
-  // class, and auto generate a condition id for this call
-  // to be used for the barrier.
-  void RegisterRPC(const std::string& rpc_name, RequestHandler* handler,
-                   int thread_num = 5);
-
-  int GetThreadNum(const std::string& rpc_name) {
-    return rpc_thread_num_[rpc_name];
-  }
-
-  // Wait util all the clients have reached the barrier for one
-  // rpc method. This function should be called in the
-  // RequestHandler if you want to run the server/client in a
-  // synchronous mode.
-  void WaitBarrier(const std::string& rpc_name);
-
-  void SetCond(const std::string& rpc_name);
-  void WaitCond(const std::string& rpc_name);
-  void IncreaseBatchBarrier(const std::string rpc_name);
-
-  void RegisterVar(const std::string& var_name, const std::string& rpc_name,
-                   framework::Scope* scope, platform::DeviceContext* dev_ctx);
-  void IncreaseVarBarrier(const std::string& var_name);
-  void WaitVarBarrier(const std::string& var_name);
-  void SetVarCond(const std::string& var_name);
-  void WaitVarCond(const std::string& var_name);
-  void ClearRegisteredVars();
-  void ClearVar(const std::string& var_name);
-  MonomerHandle GetMonomer(const std::string& var_name);
-
-  void Complete();
-
-  void ResetBarrierCounter();
-
-  bool NeedResetAllVars();
-
- protected:
-  virtual void ShutDownImpl() = 0;
-
- private:
-  std::mutex mutex_;
-  std::unordered_map<std::string, int> barrier_counter_;
-  std::condition_variable barrier_cond_;
-
-  std::unordered_map<std::string, int> rpc_cond_map_;
-  std::atomic<int> cur_cond_;
-  std::condition_variable rpc_cond_;
-
- protected:
-  std::string bind_address_;
-  std::atomic<int> exit_flag_;
-  int selected_port_;
-  int client_num_;
-  bool need_reset_all_vars_;
-
-  std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
-  std::unordered_map<std::string, int> rpc_thread_num_;
-  friend class RequestHandler;
-
-  // TODO(gongwb): use more cond to notify or wait;
-  std::unordered_map<std::string, MonomerHandle> var_map_;
-};
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
deleted file mode 100644
index 45e97d966fc9d469d24e40f8c77784d618280461..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ /dev/null
@@ -1,190 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <memory>
-#include <string>
-#include <thread>  // NOLINT
-#include <unordered_map>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-
-namespace framework = paddle::framework;
-namespace platform = paddle::platform;
-namespace distributed = paddle::operators::distributed;
-
-USE_NO_KERNEL_OP(lookup_sparse_table);
-
-std::unique_ptr<distributed::RPCServer> g_rpc_service;
-std::unique_ptr<distributed::RequestHandler> g_req_handler;
-
-framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
-  auto root_block = program->MutableBlock(0);
-  auto* block = program->AppendBlock(*root_block);
-
-  framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
-  framework::VariableNameMap output({{"Output", {"out"}}});
-  auto op = block->AppendOp();
-  op->SetType("lookup_sparse_table");
-  op->SetInput("W", {"w"});
-  op->SetInput("Ids", {"ids"});
-  op->SetOutput("Out", {"out"});
-
-  auto& out = *root_block->Var("out");
-  out.SetType(framework::proto::VarType::LOD_TENSOR);
-  out.SetShape({10, 10});
-
-  return block;
-}
-
-void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
-  auto w_var = scope->Var("w");
-  w_var->GetMutable<framework::SelectedRows>();
-
-  auto out_var = scope->Var("out");
-  out_var->GetMutable<framework::LoDTensor>();
-
-  auto ids_var = scope->Var("ids");
-  ids_var->GetMutable<framework::LoDTensor>();
-}
-
-void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
-                         int64_t rows_numel) {
-  CreateVarsOnScope(scope, place);
-  auto ids_var = scope->Var("ids")->GetMutable<framework::LoDTensor>();
-  int64_t* ids_ptr =
-      ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
-  for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
-}
-
-void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
-                         int64_t rows_numel) {
-  CreateVarsOnScope(scope, place);
-  auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
-  auto w_value = w->mutable_value();
-  w_value->Resize({rows_numel, 10});
-  for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
-
-  auto ptr = w_value->mutable_data<float>(*place);
-
-  for (int64_t i = 0; i < w_value->numel(); ++i) {
-    ptr[i] = static_cast<float>(i / 10);
-  }
-}
-
-void StartServer(const std::string& rpc_name) {
-  framework::ProgramDesc program;
-  framework::Scope scope;
-  platform::CPUPlace place;
-  framework::Executor exe(place);
-  platform::CPUDeviceContext ctx(place);
-  auto* block = AppendPrefetchBlcok(&program);
-  std::string in_var_name("ids");
-  std::vector<int> prefetch_block_ids{block->ID()};
-  auto prepared = exe.Prepare(program, prefetch_block_ids);
-  InitTensorsOnServer(&scope, &place, 10);
-
-  std::unordered_map<std::string,
-                     std::shared_ptr<framework::ExecutorPrepareContext>>
-      prefetch_var_name_to_prepared;
-  prefetch_var_name_to_prepared[in_var_name] = prepared[0];
-
-  g_req_handler->SetProgram(&program);
-  g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
-  g_req_handler->SetDevCtx(&ctx);
-  g_req_handler->SetScope(&scope);
-  g_req_handler->SetExecutor(&exe);
-
-  g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
-  g_req_handler->SetRPCServer(g_rpc_service.get());
-
-  std::thread server_thread(
-      std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
-
-  server_thread.join();
-}
-
-TEST(PREFETCH, CPU) {
-  setenv("http_proxy", "", 1);
-  setenv("https_proxy", "", 1);
-  g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
-  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
-  distributed::RPCClient* client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
-
-  std::thread server_thread(StartServer, distributed::kRequestPrefetch);
-  g_rpc_service->WaitServerReady();
-
-  int port = g_rpc_service->GetSelectedPort();
-  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
-
-  framework::Scope scope;
-  platform::CPUPlace place;
-  platform::CPUDeviceContext ctx(place);
-  {
-    // create var on local scope
-    int64_t rows_numel = 5;
-    InitTensorsOnClient(&scope, &place, rows_numel);
-    std::string in_var_name("ids");
-    std::string out_var_name("out");
-
-    client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name);
-    client->Wait();
-    auto var = scope.Var(out_var_name);
-    auto value = var->GetMutable<framework::LoDTensor>();
-    auto ptr = value->mutable_data<float>(place);
-
-    for (int64_t i = 0; i < rows_numel; ++i) {
-      EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast<float>(i * 2));
-    }
-  }
-
-  g_rpc_service->ShutDown();
-  server_thread.join();
-  LOG(INFO) << "begin reset";
-  g_rpc_service.reset(nullptr);
-  g_req_handler.reset(nullptr);
-}
-
-TEST(COMPLETE, CPU) {
-  setenv("http_proxy", "", 1);
-  setenv("https_proxy", "", 1);
-  g_req_handler.reset(new distributed::RequestSendHandler(true));
-  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2));
-  distributed::RPCClient* client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
-  PADDLE_ENFORCE(client != nullptr);
-  std::thread server_thread(StartServer, distributed::kRequestSend);
-  g_rpc_service->WaitServerReady();
-  int port = g_rpc_service->GetSelectedPort();
-  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
-  client->AsyncSendComplete(ep);
-  client->Wait();
-
-  EXPECT_EQ(g_rpc_service->GetClientNum(), 1);
-
-  g_rpc_service->ShutDown();
-  server_thread.join();
-  g_rpc_service.reset(nullptr);
-  g_req_handler.reset(nullptr);
-}
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
deleted file mode 100644
index 6303667884361be050ac62c604274c87caa72444..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
-the Apache License, Version 2.0 (the "License"); you may not use this file
-except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto3";
-package sendrecv;
-
-option cc_generic_services = @cc_generic_services@;
-
-service SendRecvService {
-  // For parameter server round-robin like hashing, do not split tensors.
-  // Send and recv only one tensor
-  // TODO(typhoonzero): add streaming API
-  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
-  // Argument VariableMessage for GetVariable should only contain varname.
-  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
-  rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {}
-  // pre-fetch variable by given variable name and Ids
-  rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
-
-  rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
-
-  rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
-  rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
-}
-
-// It can be: LoDTensor、SelectedRows or NCCL_ID
-enum VarType {
-  LOD_TENSOR = 0;
-  SELECTED_ROWS = 1;
-  NCCL_ID = 2;
-}
-
-// VariableMessage is serialized paddle variable message.
-// NOTICE(gongwb):don't modify this proto if you are not
-//   not familar with how we serialize in sendrecvop_utils.h
-//   and deserilize it in  variable_response.h.
-message VariableMessage {
-  enum Type {
-    // Pod Types
-    BOOL = 0;
-    INT16 = 1;
-    INT32 = 2;
-    INT64 = 3;
-    FP16 = 4;
-    FP32 = 5;
-    FP64 = 6;
-  }
-
-  message LodData { repeated int64 lod_data = 1; }
-  string varname = 1;
-  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
-  VarType type = 2;
-  // bool persistable is not needed for sending.
-  // tensor info:
-  Type data_type = 3;
-  repeated int64 dims = 4;
-
-  // lod details:
-  int64 lod_level = 5;
-  repeated LodData lod = 6;
-  // selected_rows height, aka. original dim0
-  int64 slr_height = 7;
-  // tensor data
-  bytes serialized = 8;
-  // selected_rows data
-  bytes rows = 9;
-  // Look up table block execution output variable name.
-  string out_varname = 10;
-  // If 1, the ps server will start profiling, the ps
-  // server stops profiling and generates a profile to /tmp/profile_ps_*
-  // when profile switches from 1 to 2.
-  int64 profile = 11;
-  int64 trainer_id = 12;
-  string table_name = 13;
-}
-
-message VoidMessage {}
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
deleted file mode 100644
index 9bd2c9928ccdb6416976b76e776fb22b28ea1f5d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-#include <nccl.h>
-#endif
-#include <memory>
-#include <thread>  // NOLINT
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/platform/port.h"
-
-DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not.");
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using VarMsg = sendrecv::VariableMessage;
-
-static TensorPayload GetCommunicationAllocationFromTensor(
-    const platform::DeviceContext& ctx, const framework::Tensor& tensor) {
-  if (is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE(is_gpu_place(tensor.place()));
-    auto& gpu_dev_ctx =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
-    auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-    platform::CUDAPinnedPlace cuda_pinned;
-    auto result = memory::AllocShared(cuda_pinned, copy_size);
-
-    memory::Copy(cuda_pinned, result->ptr(),
-                 boost::get<platform::CUDAPlace>(tensor.place()),
-                 tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
-    ctx.Wait();
-    return TensorPayload(result);
-#else
-    PADDLE_THROW("This situation should not be happened");
-#endif
-  } else {
-    return TensorPayload(tensor);
-  }
-}
-TensorPayload GetTensorPayload(framework::Variable* var,
-                               const platform::DeviceContext& ctx,
-                               VarMsg* request) {
-  auto tensor = var->Get<framework::LoDTensor>();
-  // FIXME(wuyi): data types in send_recv.proto is copied from
-  // framework.proto
-  request->set_data_type(static_cast<VarMsg::Type>(tensor.type()));
-  for (auto& dim : framework::vectorize(tensor.dims())) {
-    request->add_dims(dim);
-  }
-  const framework::LoD lod = tensor.lod();
-  if (lod.size() > 0) {
-    request->set_lod_level(lod.size());
-    for (auto& each : lod) {
-      VarMsg::LodData* lod_inner = request->add_lod();
-      for (auto& d : each) {
-        lod_inner->add_lod_data(d);
-      }
-    }
-  }
-  return GetCommunicationAllocationFromTensor(ctx, tensor);
-}
-
-TensorPayload GetSelectedRowsPayload(framework::Variable* var,
-                                     const platform::DeviceContext& ctx,
-                                     VarMsg* request) {
-  auto* slr = var->GetMutable<framework::SelectedRows>();
-  request->set_data_type(static_cast<VarMsg::Type>(slr->value().type()));
-  request->set_lod_level(0);
-  request->set_slr_height(slr->height());
-
-  for (auto& dim : framework::vectorize(slr->value().dims())) {
-    request->add_dims(dim);
-  }
-
-  auto* tensor = slr->mutable_value();
-  return GetCommunicationAllocationFromTensor(ctx, *tensor);
-}
-
-TensorPayload::TensorPayload(std::shared_ptr<memory::Allocation> allocation)
-    : allocation_(allocation), offset_(0), memory_size_(allocation->size()) {}
-TensorPayload::TensorPayload(const framework::Tensor& tensor)
-    : allocation_(tensor.Holder()),
-      offset_(tensor.offset()),
-      memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {}
-void* TensorPayload::ptr() const {
-  return reinterpret_cast<void*>(
-      reinterpret_cast<uintptr_t>(allocation_->ptr()) + offset_);
-}
-size_t TensorPayload::memory_size() const { return memory_size_; }
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
deleted file mode 100644
index 5457101a5c9f3eb22f76877676f4a8a750a0f914..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <iostream>
-#include <string>
-#include <typeindex>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using VarMsg = sendrecv::VariableMessage;
-
-class TensorPayload final {
- public:
-  explicit TensorPayload(const framework::Tensor& tensor);
-  explicit TensorPayload(std::shared_ptr<memory::Allocation> allocation);
-
-  TensorPayload(const TensorPayload& o) = default;
-  TensorPayload& operator=(const TensorPayload& o) = default;
-
-  void* ptr() const;
-  size_t memory_size() const;
-
- private:
-  std::shared_ptr<memory::Allocation> allocation_;
-  size_t offset_;
-  size_t memory_size_;
-};
-
-inline void SerializeDestroyCallback(void* payload) {
-  if (payload != nullptr) {
-    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
-    delete shared_payload;
-  }
-}
-
-TensorPayload GetTensorPayload(framework::Variable* var,
-                               const platform::DeviceContext& ctx,
-                               VarMsg* request);
-
-TensorPayload GetSelectedRowsPayload(framework::Variable* var,
-                                     const platform::DeviceContext& ctx,
-                                     VarMsg* request);
-
-inline framework::proto::VarType::Type ToVarType(
-    sendrecv::VariableMessage::Type type) {
-  switch (type) {
-    case sendrecv::VariableMessage::FP32:
-      return framework::proto::VarType::FP32;  // NOLINT
-    case sendrecv::VariableMessage::FP64:
-      return framework::proto::VarType::FP64;  // NOLINT
-    case sendrecv::VariableMessage::INT32:
-      return framework::proto::VarType::INT32;  // NOLINT
-    case sendrecv::VariableMessage::INT64:
-      return framework::proto::VarType::INT64;  // NOLINT
-    case sendrecv::VariableMessage::BOOL:
-      return framework::proto::VarType::BOOL;  // NOLINT
-    default:
-      PADDLE_THROW("Not support type %d", type);
-  }
-}
-
-template <template <typename> class T, typename Elem>
-std::string VectorElemName(const T<Elem>& arg) {
-  return typeid(Elem).name();
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/varhandle_test.cc b/paddle/fluid/operators/distributed/varhandle_test.cc
deleted file mode 100644
index a0fcaf886475c5e03d959ffd6af22b2123526b9f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/varhandle_test.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-#include <string>
-#include <thread>  // NOLINT
-
-#include "google/protobuf/text_format.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-
-using paddle::operators::distributed::VarHandlePtr;
-using paddle::operators::distributed::VarHandle;
-
-void WaitTrue(VarHandlePtr s) { EXPECT_TRUE(s->Wait()); }
-
-void WaitFalse(VarHandlePtr s) { EXPECT_FALSE(s->Wait()); }
-
-TEST(VarHandle, Run) {
-  std::vector<VarHandlePtr> a;
-  for (int i = 0; i < 12; i++) {
-    VarHandlePtr s(new VarHandle("", "", "", nullptr, nullptr));
-    a.push_back(s);
-  }
-
-  std::vector<std::unique_ptr<std::thread>> t;
-  for (int i = 0; i < 6; i++) {
-    t.emplace_back(new std::thread(WaitFalse, a[i]));
-  }
-
-  for (int i = 0; i < 6; i++) {
-    a[i]->Finish(false);
-    t[i]->join();
-  }
-
-  for (int i = 6; i < 12; i++) {
-    t.emplace_back(new std::thread(WaitTrue, a[i]));
-  }
-
-  for (int i = 6; i < 12; i++) {
-    a[i]->Finish(true);
-    t[i]->join();
-  }
-}
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
deleted file mode 100644
index 7825b4fc82b1f7580fea8ab4961facaf7fd64397..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ /dev/null
@@ -1,227 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include <vector>
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-
-DEFINE_string(rpc_server_profile_path, "./profile_ps",
-              "the profile log file path");
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
-                               const platform::DeviceContext& dev_ctx,
-                               platform::Place place, void* dest,
-                               int64_t size) {
-  const void* data = NULL;
-  int size_to_write = 0;
-  int64_t length = size;
-  int total_written = 0;
-
-  if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
-    auto& gpu_dev_ctx =
-        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
-    platform::CPUPlace cpu;
-
-    char* p = reinterpret_cast<char*>(dest);
-    while (total_written < length) {
-      if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
-        return false;
-      }
-      // NOTE: if raw buffer is large and have two neighbor fields of raw
-      // buffers GetDirectBufferPointer can get all of them, use length to
-      // truncate it.
-      if (total_written + size_to_write > length) {
-        size_to_write = length - total_written;
-      }
-      // This log is useful to see how long a internal block size is of rpc.
-      VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
-      memory::Copy(boost::get<platform::CUDAPlace>(place),
-                   reinterpret_cast<void*>(p), cpu, data, size_to_write,
-                   gpu_dev_ctx.stream());
-      p += size_to_write;
-      total_written += size_to_write;
-
-      input->Skip(size_to_write);
-    }
-    gpu_dev_ctx.Wait();
-#else
-    PADDLE_THROW("Unexpected branch");
-#endif
-    return true;
-  }
-
-  char* p = reinterpret_cast<char*>(dest);
-  while (total_written < length) {
-    if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
-      return false;
-    }
-    // NOTE: if raw buffer is large and have two neighbor fields of raw buffers
-    // GetDirectBufferPointer can get all of them, use length to truncate it.
-    if (total_written + size_to_write > length) {
-      size_to_write = length - total_written;
-    }
-    // TODO(gongwb): can we avoid copy?
-    platform::CPUPlace cpu;
-    // This log is useful to see how long a internal block size is of rpc.
-    VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
-    memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
-
-    p += size_to_write;
-    total_written += size_to_write;
-
-    input->Skip(size_to_write);
-  }
-
-  return true;
-}
-
-bool VariableResponse::CopyLodTensorData(
-    ::google::protobuf::io::CodedInputStream* input,
-    const platform::DeviceContext& ctx, const framework::DDim& dims,
-    int length) {
-  auto server_var = GetVar();
-  if (!server_var) {
-    LOG(ERROR) << "recved var should not on current server: "
-               << meta_.varname();
-    return false;
-  }
-  auto* tensor = GetVar()->GetMutable<framework::LoDTensor>();
-  tensor->Resize(dims);
-  framework::LoD lod;
-  for (int i = 0; i < meta_.lod_level(); ++i) {
-    framework::Vector<size_t> v;
-    for (int j = 0; j < meta_.lod(i).lod_data_size(); ++j) {
-      v.push_back(meta_.lod(i).lod_data(j));
-    }
-    lod.push_back(v);
-  }
-  tensor->set_lod(lod);
-
-  void* tensor_data =
-      tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type()));
-
-  VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
-          << ", Buffer Size = " << length << ", dims:" << dims
-          << ", numel:" << tensor->numel();
-  PADDLE_ENFORCE_GE(tensor->memory_size(), static_cast<unsigned int>(length));
-  return ReadRaw(input, ctx, tensor->place(), tensor_data, length);
-}
-
-inline framework::DDim GetDims(
-    const ::google::protobuf::RepeatedField<::google::protobuf::int64>& dims) {
-  std::vector<int> vecdims;
-  for (auto& d : dims) {
-    vecdims.push_back(d);
-  }
-  return framework::make_ddim(vecdims);
-}
-
-bool VariableResponse::CopySelectRowsTensorData(
-    ::google::protobuf::io::CodedInputStream* input,
-    const platform::DeviceContext& ctx, const framework::DDim& dims,
-    int length) {
-  auto* slr = GetVar()->GetMutable<framework::SelectedRows>();
-  slr->set_height(meta_.slr_height());
-  auto* tensor = slr->mutable_value();
-  tensor->Resize(dims);
-  PADDLE_ENFORCE_EQ(
-      static_cast<size_t>(tensor->numel()),
-      length / framework::SizeOfType(paddle::operators::distributed::ToVarType(
-                   meta_.data_type())));
-  void* tensor_data = tensor->mutable_data(
-      ctx.GetPlace(),
-      paddle::operators::distributed::ToVarType(meta_.data_type()));
-
-  if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
-    return false;
-  }
-
-  return true;
-}
-
-bool VariableResponse::CopySelectRowsData(
-    ::google::protobuf::io::CodedInputStream* input,
-    const platform::DeviceContext& ctx, int length) {
-  auto* slr = GetVar()->GetMutable<framework::SelectedRows>();
-  slr->mutable_rows()->clear();
-  slr->mutable_rows()->resize(length / sizeof(int64_t));  // int64
-  int64_t* rows_data = slr->mutable_rows()->data();
-
-  // copy rows CPU data, GPU data will be copied lazily.
-  platform::CPUPlace cpu;
-  if (!ReadRaw(input, ctx, cpu, rows_data, length)) {
-    return false;
-  }
-
-  return true;
-}
-
-bool VariableResponse::ProcSerializedField(
-    int tag, ::google::protobuf::io::CodedInputStream* input,
-    int64_t num_bytes) {
-  PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                  meta_.type() == sendrecv::LOD_TENSOR ||
-                  meta_.type() == sendrecv::NCCL_ID) &&
-                     meta_.varname() != "",
-                 "meta info should be got first!");
-
-  if (meta_.type() == sendrecv::NCCL_ID) {
-#ifdef PADDLE_WITH_CUDA
-    auto* var = scope_->FindVar(meta_.varname());
-    if (var != nullptr) {
-      ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
-      if (!ReadRaw(input, *dev_ctx_, platform::CPUPlace(), id->internal,
-                   num_bytes)) {
-        return false;
-      }
-    }
-    return true;
-#else
-    PADDLE_THROW("Not compiled with CUDA!");
-    return false;
-#endif
-  }
-
-  VLOG(7) << "ProcSerializedField:" << meta_.varname()
-          << ", type:" << meta_.type() << std::endl;
-  framework::DDim dims = GetDims(meta_.dims());
-  if (meta_.type() == sendrecv::LOD_TENSOR) {
-    PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
-    if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
-      return false;
-    }
-
-    return true;
-  }
-
-  if (meta_.type() == sendrecv::SELECTED_ROWS) {
-    if (!CopySelectRowsTensorData(input, *dev_ctx_, dims, num_bytes)) {
-      return false;
-    }
-    return true;
-  }
-
-  PADDLE_ENFORCE("not supported var types:", meta_.varname(), meta_.type());
-
-  return false;
-}
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
deleted file mode 100644
index 3cabcd22cd52222aff2555a8449e558de2c287c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ /dev/null
@@ -1,132 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-
-DECLARE_string(rpc_server_profile_path);
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-// Source provides a way for a particular RPC implementation to provide
-// received data to ParseFrom.
-class Source {
- public:
-  virtual ~Source() {}
-
-  // Return the stream that contains the data to be parsed.
-  // Note that this method might be invoked more than once if
-  // ParseFrom needs to fall back to a more expensive parsing method.
-  // Every call must return a stream pointing at the beginning of
-  // the serialized RecvTensorResponse.
-  //
-  // Note that a subsequent call to contents() invalidates previous
-  // results of contents().
-  //
-  // Ownership of the returned stream is retained by the Source and
-  // should not be deleted by the caller.
-  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
-};
-
-class VariableResponse {
- public:
-  VariableResponse(const framework::Scope* scope,
-                   const platform::DeviceContext* dev_ctx,
-                   bool create_scope = false)
-      : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) {
-    if (create_scope) {
-      local_scope_ = scope->NewTmpScope().release();
-    }
-  }
-
-  virtual ~VariableResponse() {
-    if (local_scope_) {
-      delete local_scope_;
-      local_scope_ = nullptr;
-    }
-  }
-
-  int Parse(Source* source, const sendrecv::VariableMessage& meta) {
-    meta_ = meta;
-    return Parse(source);
-  }
-
-  // return:
-  // 0:ok.
-  // -1: unkown error.
-  // other: number of error field.
-  virtual int Parse(Source* source) = 0;
-
-  inline const framework::Scope& GetLocalScope() const { return *local_scope_; }
-  inline framework::Scope* GetMutableLocalScope() const { return local_scope_; }
-  inline std::string Varname() const { return meta_.varname(); }
-  inline std::string OutVarname() const { return meta_.out_varname(); }
-  inline std::string TableName() const { return meta_.table_name(); }
-
-  // should call parse first.
-  framework::Variable* GetVar() {
-    if (create_scope_) {
-      return local_scope_->Var(meta_.varname());
-    }
-    return scope_->FindVar(meta_.varname());
-  }
-
-  int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); }
-
- protected:
-  bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
-               const platform::DeviceContext& dev_ctx, platform::Place place,
-               void* dest, int64_t size);
-
-  bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
-                                const platform::DeviceContext& ctx,
-                                const framework::DDim& dims, int length);
-
-  bool CopySelectRowsData(::google::protobuf::io::CodedInputStream* input,
-                          const platform::DeviceContext& ctx, int length);
-
-  bool CopyLodTensorData(::google::protobuf::io::CodedInputStream* input,
-                         const platform::DeviceContext& ctx,
-                         const framework::DDim& dims, int length);
-
-  bool ProcSerializedField(int tag,
-                           ::google::protobuf::io::CodedInputStream* input,
-                           int64_t num_bytes);
-
- protected:
-  const framework::Scope* scope_;
-  const platform::DeviceContext* dev_ctx_;
-  bool create_scope_ = false;
-  framework::Scope* local_scope_ = nullptr;
-
-  sendrecv::VariableMessage meta_;
-};
-
-};  // namespace distributed
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
deleted file mode 100644
index 829e67a53b07048bfa36b2611f00773c8397df5e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-include(operators)
-
-set(DISTRIBUTE_DEPS "")
-if(WITH_GRPC)
-    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
-else()
-    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
-    if(WITH_BRPC_RDMA)
-        find_library(IBVERBS_LIBRARY NAMES ibverbs)
-        ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
-        SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
-
-
-        find_library(RDMACM_LIBRARY NAMES rdmacm)
-        ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
-        SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
-
-        set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm)
-    endif()
-endif()
-
-set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
-list(REMOVE_DUPLICATES OPS)
-
-foreach(src ${OPS})
-    set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-endforeach()
-
-register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS})
-
-if(WITH_GPU AND NOT WIN32)
-    set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common)
-    op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common)
-endif()
-
-set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE)
-set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency")
diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cc b/paddle/fluid/operators/distributed_ops/allreduce_op.cc
deleted file mode 100644
index 57d68eb931f089e46df07f45186246568bc297c8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/allreduce_op.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-class AllReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor), tensor to be allreduced.");
-    AddOutput("Out", "(Tensor) the result of allreduced.");
-    AddAttr<int>("reduce_type", "(int) determin the reduce type.")
-        .SetDefault(0);
-    AddAttr<bool>(
-        "sync_mode",
-        "(bool) whether to synchronize the CUDA stream after nccl call.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-***AllReduce Operator***
-
-Call NCCL AllReduce internally. Note that this op must be used when one
-thread is managing one GPU device.
-
-For speed reasons, reduce_type should be an integer:
-
-0: sum
-1: prod
-2: max
-3: min
-
-If input and output are the same variable, in-place allreduce will be used.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_WITHOUT_GRADIENT(allreduce, ops::AllReduceOp,
-                             ops::AllReduceOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    allreduce, ops::AllReduceOpKernel<plat::CPUDeviceContext, float>,
-    ops::AllReduceOpKernel<plat::CPUDeviceContext, double>,
-    ops::AllReduceOpKernel<plat::CPUDeviceContext, int>,
-    ops::AllReduceOpKernel<plat::CPUDeviceContext, int64_t>,
-    ops::AllReduceOpKernel<plat::CPUDeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc b/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc
deleted file mode 100644
index 9b70f78399026b9f853b8315f0acf6dbad64242a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    allreduce, ops::AllReduceOpKernel<plat::CUDADeviceContext, float>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, double>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, int>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.h b/paddle/fluid/operators/distributed_ops/allreduce_op.h
deleted file mode 100644
index 0275f6a9cf3aa8bab89b3d8c599b304702f590a8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/allreduce_op.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class AllReduceOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE(is_gpu_place(place),
-                   "AllReduce op can run on gpu place only for now.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto in = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-
-    int dtype = platform::ToNCCLDataType(in->type());
-    int64_t numel = in->numel();
-    auto* sendbuff = in->data<void>();
-    out->Resize(in->dims());
-    void* recvbuff = out->mutable_data<T>(place);
-
-    auto* comm = dev_ctx.nccl_comm();
-    // FIXME(typhoonzero): should use nccl stream here.
-    auto stream = dev_ctx.stream();
-    PADDLE_ENFORCE_NOT_NULL(stream, "Should initialize NCCL firstly.");
-
-    int reduce_type = ctx.Attr<int>("reduce_type");
-    ncclRedOp_t red_type = ncclSum;
-    switch (reduce_type) {
-      case 0:
-        red_type = ncclSum;
-        break;
-      case 1:
-        red_type = ncclProd;
-        break;
-      case 2:
-        red_type = ncclMax;
-        break;
-      case 3:
-        red_type = ncclMin;
-        break;
-    }
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-        sendbuff, recvbuff, numel, static_cast<ncclDataType_t>(dtype), red_type,
-        comm, stream));
-    if (ctx.Attr<bool>("sync_mode")) {
-      cudaError_t e_sync = cudaStreamSynchronize(stream);
-      if (e_sync != 0) {
-        LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync);
-      }
-    }
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/broadcast_op.cc b/paddle/fluid/operators/distributed_ops/broadcast_op.cc
deleted file mode 100644
index 6ae98af1e2ac1916b431e72b137e148d90df747f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/broadcast_op.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <ostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class BroadcastOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of BroadcastOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Output) of ConvOp should not be null.");
-  }
-};
-
-class BroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor), tensor to be broadcast.");
-    AddOutput("Out", "(Tensor) the result of broadcast.");
-    AddAttr<bool>(
-        "sync_mode",
-        "(bool) whether to synchronize the CUDA stream after nccl call.")
-        .SetDefault(false);
-    AddAttr<int>("root", "(int).").SetDefault(0).EqualGreaterThan(0);
-    AddComment(R"DOC(
-***Broadcast Operator***
-
-Call NCCL Broadcast internally. Note that this op must be used when one
-thread is managing one GPU device.
-)DOC");
-  }
-};
-
-template <typename T>
-class BroadcastOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("Broadcast op can run on gpu place only for now.");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_WITHOUT_GRADIENT(broadcast, ops::BroadcastOp,
-                             ops::BroadcastOpMaker);
-
-REGISTER_OP_CPU_KERNEL(broadcast, ops::BroadcastOpKernel<float>,
-                       ops::BroadcastOpKernel<double>,
-                       ops::BroadcastOpKernel<int>,
-                       ops::BroadcastOpKernel<int64_t>,
-                       ops::BroadcastOpKernel<plat::float16>);
diff --git a/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc b/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc
deleted file mode 100644
index c9b40e6863f444983999e9d74a9efe288465fe27..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "The place of ExecutionContext should be CUDAPlace.");
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    int dev_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).device;
-    int root_dev_id = ctx.Attr<int>("root");
-
-    auto in = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    PADDLE_ENFORCE(out->IsInitialized(),
-                   "Currently, the output of broadcast op must be initialized, "
-                   "because this op can only be an In-Place operation.");
-    void* send_recv_buffer = out->mutable_data<T>(ctx.GetPlace());
-    PADDLE_ENFORCE_EQ(
-        send_recv_buffer, in->data<void>(),
-        "Currently, the broadcast op can only be an In-Place operation.");
-
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto comm = dev_ctx.nccl_comm();
-    auto stream = dev_ctx.stream();
-
-    PADDLE_ENFORCE(platform::dynload::ncclBcast(
-        send_recv_buffer, static_cast<size_t>(in->numel()),
-        platform::ToNCCLDataType(in->type()), root_dev_id, comm, stream));
-
-    VLOG(3) << "Bcast " << ctx.Inputs("X")[0] << ", (" << in->numel() << ")"
-            << " From " << root_dev_id << " to " << dev_id;
-
-    if (ctx.Attr<bool>("sync_mode")) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-    }
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(broadcast, ops::NCCLBroadcastOpKernel<float>,
-                        ops::NCCLBroadcastOpKernel<double>,
-                        ops::NCCLBroadcastOpKernel<int>,
-                        ops::NCCLBroadcastOpKernel<int64_t>,
-                        ops::NCCLBroadcastOpKernel<plat::float16>);
diff --git a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
deleted file mode 100644
index a09bff351fc0c7ae3858358701bf309e9d2f592a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace paddle {
-namespace operators {
-
-class CheckpointNotifyOp : public framework::OperatorBase {
- public:
-  CheckpointNotifyOp(const std::string& type,
-                     const framework::VariableNameMap& inputs,
-                     const framework::VariableNameMap& outputs,
-                     const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    std::string dir = Attr<std::string>("dir");
-    std::string lookup_table_name = Attr<std::string>("lookup_table");
-    int trainer_id = Attr<int>("trainer_id");
-
-    distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
-    for (size_t i = 0; i < epmap.size(); i++) {
-      auto lookup_table_save_dir =
-          string::Sprintf("%s/%s_%d", dir, lookup_table_name, i);
-      rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir);
-      VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
-              << " and dir:" << dir << " to " << epmap[i];
-    }
-    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
-  }
-};
-
-class CheckpointNotifyOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddAttr<std::vector<std::string>>("epmap",
-                                      "(string vector, default  127.0.0.1:6164)"
-                                      "Parameter Server endpoints in the order")
-        .SetDefault({"127.0.0.1:6164"});
-    AddAttr<std::string>(
-        "dir", "(string, default '') indicate the folder checkpoint will use");
-    AddAttr<std::string>("lookup_table",
-                         "(string, default '') the lookup table name");
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddComment(R"DOC(
-CheckpointNotify operator
-
-This operator will send lookup table and it's checkpoint direcoty to listen_and_serve op at
-the parameter server.
-)DOC");
-  }
-};
-
-class CheckpointNotifyOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(checkpoint_notify, ops::CheckpointNotifyOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::CheckpointNotifyOpMaker,
-                  ops::CheckpointNotifyOpShapeInference);
diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
deleted file mode 100644
index 3e354791ea9af4fa833026e3170856d823a5fd78..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-class DistributedLookupTableOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs("Ids"),
-                   "Input(Ids) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input(W) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
-                   "Output(Outs) of LookupTableOp should not be null.");
-
-    auto ids_dims = ctx->GetInputsDim("Ids");
-    auto table_dims = ctx->GetInputDim("W");
-
-    PADDLE_ENFORCE_EQ(table_dims.size(), 2,
-                      "Only 2 dimensions of the 'Embedding' is supported.");
-
-    for (auto &ids_dim : ids_dims) {
-      PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
-                        "The dimension of the 'Ids' tensor must be 2.");
-      PADDLE_ENFORCE_EQ(ids_dim[1], 1,
-                        "The last dimension of the 'Ids' tensor must be 1.");
-    }
-
-    auto lookup_tables =
-        ctx->Attrs().Get<std::vector<std::string>>("table_names");
-    auto height_sections =
-        ctx->Attrs().Get<std::vector<int64_t>>("height_sections");
-    auto endpoints = ctx->Attrs().Get<std::vector<std::string>>("endpoints");
-
-    PADDLE_ENFORCE(lookup_tables.size() == height_sections.size() &&
-                       lookup_tables.size() == endpoints.size() &&
-                       lookup_tables.size() != 0,
-                   "Attrs lookup_tables/height_sections/endpoints must have "
-                   "save size and can not be 0.");
-
-    auto outputs_dims = std::vector<framework::DDim>();
-
-    for (auto &ids_dim : ids_dims) {
-      outputs_dims.push_back(framework::make_ddim({ids_dim[0], table_dims[1]}));
-    }
-
-    ctx->SetOutputsDim("Outputs", outputs_dims);
-    ctx->ShareLoD("Ids", /*->*/ "Outputs");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-template <typename T>
-class DistributedLookupTableKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto ids_vars = context.MultiInputVar("Ids");
-    auto emb_vars = context.MultiOutput<framework::Tensor>("Embeddings");
-
-    auto id_names = context.Inputs("Ids");
-    auto embedding_name = context.Inputs("W").front();
-    auto out_names = context.Outputs("Outputs");
-
-    auto lookup_tables = context.Attr<std::vector<std::string>>("table_names");
-    auto height_sections =
-        context.Attr<std::vector<int64_t>>("height_sections");
-    auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
-
-    operators::distributed::prefetchs(
-        id_names, out_names, embedding_name, false, lookup_tables, endpoints,
-        height_sections, context, context.scope());
-  }
-};
-
-class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ids",
-             "(LoDTensor) Ids's type should be LoDTensor"
-             "THe ids to be looked up in W.")
-        .AsDuplicable();
-
-    AddInput("W",
-             "(Tensor) The input represents embedding tensors, "
-             "which is a learnable parameter.");
-
-    AddOutput("Outputs",
-              "(LoDTensor) The lookup results, which have the same type as W.")
-        .AsDuplicable();
-
-    AddAttr<std::vector<std::string>>(
-        "table_names",
-        "(string vector, such as emb_block0, emb_block1)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({""});
-
-    AddAttr<std::vector<int64_t>>("height_sections",
-                                  "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int64_t>({}));
-
-    AddAttr<std::vector<std::string>>(
-        "endpoints",
-        "(string vector, default 127.0.0.1:6164)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({"127.0.0.1:6164"});
-
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-
-    AddAttr<int64_t>("padding_idx",
-                     "(int64, default -1) "
-                     "If the value is -1, it makes no effect to lookup. "
-                     "Otherwise the given value indicates padding the output "
-                     "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(distributed::kNoPadding);
-
-    AddComment(R"DOC(
-Lookup Tablel Prefetch Operator.
-
-This operator is used to perform lookup on parameter W,
-then concatenated into a sparse tensor.
-
-The type of Ids(Input) is SelectedRows, the rows of Ids contains
-the ids to be looked up in W;
-if the Id is not in the sparse table, this operator will return a
-random value and set the value into the table for the next looking up.
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp,
-                  ops::DistributedLookupTableOpMaker);
-
-REGISTER_OP_CPU_KERNEL(distributed_lookup_table,
-                       ops::DistributedLookupTableKernel<float>);
diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
deleted file mode 100644
index 5ee35e0458a64dacc1c469a435edd28de1b78e6b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-class FakeInitInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of FakeInitOp should not be null.");
-    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
-    ctx->SetOutputDim("Out", framework::make_ddim(shape));
-  }
-};
-
-class FakeInitOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    framework::Tensor *tensor = nullptr;
-
-    auto &out_var = *scope.FindVar(Output("Out"));
-
-    if (out_var.IsType<framework::LoDTensor>()) {
-      tensor = out_var.GetMutable<framework::LoDTensor>();
-      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
-    } else if (out_var.IsType<framework::SelectedRows>()) {
-      tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
-      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
-    } else {
-      PADDLE_THROW(
-          "fake init op's output only"
-          "supports SelectedRows and LoDTensor");
-    }
-  }
-};
-
-class FakeInitOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {}
-};
-
-class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddAttr<std::vector<int64_t>>("shape",
-                                  "(vector<int64_t>) The shape of the output");
-    AddOutput("Out",
-              "(Tensor) Tensor of specified shape will be filled "
-              "with the specified value");
-    AddComment(R"DOC(
-FakeInit Operator.
-
-Init an variable but not alloc memory for it, it is used for init the
-table parameter at trainer side in distributed lookup table.
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fake_init, ops::FakeInitOp, ops::FakeInitInferShape,
-                  ops::FakeInitOpMaker, paddle::framework::EmptyGradOpMaker,
-                  ops::FakeInitOpVarTypeInference);
diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
deleted file mode 100644
index ae4b687ffc4c85501d9ef0325960ff8767ee5704..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-
-class FetchBarrierOp : public framework::OperatorBase {
- public:
-  FetchBarrierOp(const std::string& type,
-                 const framework::VariableNameMap& inputs,
-                 const framework::VariableNameMap& outputs,
-                 const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
-    distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-            Attr<int>("trainer_id"));
-
-    std::vector<distributed::VarHandlePtr> rets;
-    for (auto& ep : eps) {
-      VLOG(3) << "fetch barrier, ep: " << ep;
-      rets.push_back(rpc_client->AsyncSendFetchBarrier(ep));
-    }
-
-    for (size_t i = 0; i < rets.size(); i++) {
-      PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
-    }
-  }
-};
-
-class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
-        .AsDuplicable();
-    AddComment(R"DOC(
-SendBarrier operator
-
-This operator will send a send barrier signal to list_and_serv op, so that
-the Parameter Server would knew all variables have been sent.
-)DOC");
-
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<std::string>>("endpoints",
-                                      "(string vector, default 127.0.0.1:6164)"
-                                      "Server endpoints to send variables to.")
-        .SetDefault({"127.0.0.1:6164"});
-  }
-};
-
-class FetchBarrierOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(fetch_barrier, ops::FetchBarrierOp,
-                  paddle::framework::EmptyGradOpMaker, ops::FetchBarrierOpMaker,
-                  ops::FetchBarrierOpShapeInference);
diff --git a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc
deleted file mode 100644
index 07c864eefe29f07607b95115ce2a427f43435f3e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc
+++ /dev/null
@@ -1,279 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdio.h>  // for removing the port file
-#include <csignal>
-#include <cstdlib>
-#include <fstream>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gflags/gflags.h"
-
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h"
-
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_int32(flrpc_send_thread_num, 12, "number of threads for rpc send");
-DEFINE_int32(flrpc_get_thread_num, 12, "number of threads for rpc get");
-
-namespace paddle {
-namespace operators {
-
-void FlRunServer(std::shared_ptr<distributed::RPCServer> service) {
-  service->StartServer();
-}
-static void flsplit(const std::string &str, char sep,
-                    std::vector<std::string> *pieces) {
-  pieces->clear();
-  if (str.empty()) {
-    return;
-  }
-  size_t pos = 0;
-  size_t next = str.find(sep, pos);
-  while (next != std::string::npos) {
-    pieces->push_back(str.substr(pos, next - pos));
-    pos = next + 1;
-    next = str.find(sep, pos);
-  }
-  if (!str.substr(pos).empty()) {
-    pieces->push_back(str.substr(pos));
-  }
-}
-
-static void FlParallelExecuteBlocks(
-    const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
-    const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
-        &prepared,
-    framework::ProgramDesc *program, framework::Scope *scope) {
-  std::vector<std::future<void>> fs;
-  for (size_t idx : parallel_blkids) {
-    fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
-      int run_block = idx;  // thread local
-      try {
-        VLOG(3) << "running server block: " << run_block
-                << "pointer: " << prepared[run_block].get();
-        executor->RunPreparedContext(prepared[run_block].get(), scope);
-      } catch (const std::exception &e) {
-        LOG(FATAL) << "run sub program:" << idx << " error " << e.what();
-      }
-    }));
-  }
-  for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
-}
-
-FlListenAndServOp::FlListenAndServOp(const std::string &type,
-                                     const framework::VariableNameMap &inputs,
-                                     const framework::VariableNameMap &outputs,
-                                     const framework::AttributeMap &attrs)
-    : OperatorBase(type, inputs, outputs, attrs) {}
-
-FlListenAndServOp::~FlListenAndServOp() {}
-
-void FlListenAndServOp::SavePort() const {
-  // NOTE: default write file to /tmp/paddle.selected_port
-  rpc_service_->SavePort();
-}
-
-static int64_t GetTimestamp() {
-  struct timeval tp;
-  gettimeofday(&tp, NULL);
-  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
-}
-
-void FlListenAndServOp::RunSyncLoop(framework::Executor *executor,
-                                    framework::ProgramDesc *program,
-                                    framework::Scope *recv_scope,
-                                    platform::DeviceContext *dev_ctx) const {
-  VLOG(2) << "RunSyncLoop";
-  size_t num_blocks = program->Size();
-  auto optimize_blocks =
-      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
-  PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
-
-  // Prepare all the server block
-  std::vector<int> optimize_blocks_list;
-  for (size_t i = 1; i < program->Size(); ++i) {
-    optimize_blocks_list.push_back(i);
-  }
-  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list);
-  // Insert placeholder for block0 which holds current op itself,
-  // NOTE the first block in `optimize_prepared` should never be ran.
-  optimize_prepared.insert(
-      optimize_prepared.begin(),
-      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
-
-  while (true) {
-    // Get from multiple trainers, we don't care about the order in which
-    // the gradients arrives, just add suffix 0~n and merge the gradient.
-    VLOG(3) << "wait all clients to get pserver parameters back";
-    rpc_service_->SetCond(distributed::kRequestGet);
-    VLOG(3) << "wait all clients to send fetch_barrier";
-    rpc_service_->WaitBarrier(distributed::kRequestGet);
-
-    if (rpc_service_->IsExit()) {
-      rpc_service_->SetCond(distributed::kRequestGet);
-      break;
-    }
-
-    VLOG(3) << "wait all clients to send after_optimizer parameters";
-    rpc_service_->SetCond(distributed::kRequestSend);
-    VLOG(3) << "wait all clients to send send_barrier";
-    rpc_service_->WaitBarrier(distributed::kRequestSend);
-    VLOG(3) << "ResetBarrierCounter";
-    rpc_service_->ResetBarrierCounter();
-    // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
-    // and this will still work.
-    // The optimize blocks which have the same parent ID would run parallel
-    // TODO(Yancey1989): need to use ParallelExecutor for future
-    int32_t last_parent_blkid = optimize_blocks[0]->Parent();
-    std::vector<size_t> parallel_blkids;
-    parallel_blkids.push_back(optimize_blocks[0]->ID());
-    double ts = GetTimestamp();
-    for (size_t i = 1; i < optimize_blocks.size(); ++i) {
-      // skip the first optimize block because it is already in the
-      // parallel_blkids.
-      int blkid = optimize_blocks[i]->ID();
-      if (program->Block(blkid).Parent() != last_parent_blkid) {
-        FlParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
-                                program, recv_scope);
-        parallel_blkids.clear();
-        last_parent_blkid = program->Block(blkid).Parent();
-      }
-      parallel_blkids.push_back(blkid);
-    }
-    FlParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
-                            program, recv_scope);
-    VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
-  }  // while(true)
-}
-
-static void FillRequestCtx(distributed::RequestHandler *h,
-                           framework::Scope *scope,
-                           platform::DeviceContext *dev_ctx,
-                           framework::Executor *executor,
-                           framework::ProgramDesc *program,
-                           distributed::RPCServer *rpc_server) {
-  h->SetScope(scope);
-  h->SetDevCtx(dev_ctx);
-  h->SetExecutor(executor);
-  h->SetProgram(program);
-  h->SetRPCServer(rpc_server);
-}
-
-void FlListenAndServOp::RunImpl(const framework::Scope &scope,
-                                const platform::Place &dev_place) const {
-  // Mark this as PS that it should decide profiling by listening from trainer.
-  platform::SetProfileListener();
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(dev_place);
-  framework::Scope &recv_scope = scope.NewScope();
-
-  bool sync_mode = Attr<bool>("sync_mode");
-  auto fan_in = Attr<int>("Fanin");
-  auto inputs = Inputs("X");
-
-  PADDLE_ENFORCE_EQ(!rpc_service_, true, "rpc_service_ must null");
-  std::string endpoint = Attr<std::string>("endpoint");
-
-  VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
-          << ", end_point:" << endpoint;
-
-  rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
-
-  request_send_handler_.reset(
-      new distributed::RequestSendHandler(sync_mode, false));
-  request_get_handler_.reset(
-      new distributed::RequestGetHandler(sync_mode, false));
-
-  rpc_service_->RegisterRPC(distributed::kRequestSend,
-                            request_send_handler_.get(),
-                            FLAGS_flrpc_send_thread_num);
-  rpc_service_->RegisterRPC(distributed::kRequestGet,
-                            request_get_handler_.get(),
-                            FLAGS_flrpc_get_thread_num);
-  auto optimize_blocks =
-      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
-  PADDLE_ENFORCE_GE(
-      optimize_blocks.size(), 1,
-      "optimize blocks should be 1 at least on the pserver side.");
-  auto *program = optimize_blocks[0]->Program();
-  framework::Executor executor(dev_place);
-
-  auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope,
-                     &dev_ctx, &executor, program, rpc_service_.get());
-
-  f(request_send_handler_.get());
-  f(request_get_handler_.get());
-
-  // start the server listening after all member initialized.
-  server_thread_.reset(new std::thread(FlRunServer, rpc_service_));
-  VLOG(3) << "wait server thread to become ready...";
-  rpc_service_->WaitServerReady();
-
-  // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
-  signal(SIGINT, FlSignalHandler::StopAndExit);
-  signal(SIGTERM, FlSignalHandler::StopAndExit);
-
-  // Cache the type of the received vars as `sparse_vars_` and `dense_vars_`
-  // so that we can reset them at the end of each iteration.
-  // NOTE: only used in sync update
-
-  // Write to a file of server selected port for python use.
-  SavePort();
-  RunSyncLoop(&executor, program, &recv_scope, &dev_ctx);
-}
-
-class FlListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
-    AddComment(R"DOC(" + "ListenAndServ operator" + "\n" + "This operator" +
-" will start a RPC server which can receive variables from send_op and send" +
-"back variables to recv_op.)DOC");
-    AddAttr<std::string>("endpoint",
-                         "(string, default 127.0.0.1:6164)"
-                         "IP address to listen on.")
-        .SetDefault("127.0.0.1:6164")
-        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
-    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
-    AddAttr<int>("Fanin", "How many clients send to this server.")
-        .SetDefault(1);
-    AddAttr<std::vector<framework::BlockDesc *>>(
-        kOptimizeBlocks, "Optimize blocks to run on server side.")
-        .SetDefault({});
-  }
-};
-
-void FlSignalHandler::StopAndExit(int signal_num) {
-  // Do not use VLOG here for the device for printing maybe already released.
-  // exit will release interal allocated resoureces.
-  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
-  remove(file_path.c_str());
-  exit(0);
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(fl_listen_and_serv, ops::FlListenAndServOp,
-                             ops::FlListenAndServOpMaker);
diff --git a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h
deleted file mode 100644
index 1199a63d16a4ecddf04eef468aea42d147608783..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <atomic>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-constexpr char kOptimizeBlocks[] = "optimize_blocks";
-
-void FlRunServer(std::shared_ptr<distributed::RPCServer> service);
-
-template <class TKey, class TValue>
-class DoubleFindMap : public std::unordered_map<TKey, TValue> {
- public:
-  typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
-    return std::find_if(this->begin(), this->end(),
-                        [&v](const std::pair<const std::string, int> p) {
-                          return p.second == v;
-                        });
-  }
-};
-
-class FlListenAndServOp : public framework::OperatorBase {
- public:
-  FlListenAndServOp(const std::string& type,
-                    const framework::VariableNameMap& inputs,
-                    const framework::VariableNameMap& outputs,
-                    const framework::AttributeMap& attrs);
-  virtual ~FlListenAndServOp();
-
-  void RunSyncLoop(framework::Executor* executor,
-                   framework::ProgramDesc* program,
-                   framework::Scope* recv_scope,
-                   platform::DeviceContext* dev_ctx) const;
-
-  void SavePort() const;
-
-  int GetSelectedPort() { return rpc_service_->GetSelectedPort(); }
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override;
-
- protected:
-  mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
-  mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
-  mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
-
-  mutable std::shared_ptr<std::thread> server_thread_;
-  mutable std::vector<std::string> sparse_vars_;
-  mutable std::vector<std::string> dense_vars_;
-};
-
-class FlSignalHandler {
- public:
-  static void StopAndExit(int signal_num);
-
- private:
-  DISABLE_COPY_AND_ASSIGN(FlSignalHandler);
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
deleted file mode 100644
index c33842c06e49267e014c2927f6a7070cbe9a27ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <nccl.h>
-#include <stdint.h>
-#include <ostream>
-#include <string>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-
-namespace paddle {
-namespace operators {
-
-class GenNCCLIdOp : public framework::OperatorBase {
- public:
-  GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
-              const framework::VariableNameMap& outputs,
-              const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    // put nccl id in CPUPlace
-    auto& dev_ctx = *pool.Get(platform::CPUPlace());
-    int trainer_id = Attr<int>("trainer_id");
-
-    std::vector<std::string> trainers =
-        Attr<std::vector<std::string>>("trainers");
-    PADDLE_ENFORCE(
-        trainer_id >= 0 && trainer_id < static_cast<int>(trainers.size()),
-        "trainer_id:%d must be in trainers.size range", trainer_id);
-    std::string endpoint = trainers[trainer_id];
-
-    framework::Scope& local_scope = scope.NewScope();
-
-    int nccl_comm_num = Attr<int>("nccl_comm_num");
-    int use_hierarchical_allreduce = Attr<bool>("use_hierarchical_allreduce");
-    int inter_nranks = Attr<int>("hierarchical_allreduce_inter_nranks");
-
-    int inter_trainer_id = -1;
-    int exter_trainer_id = -1;
-    if (use_hierarchical_allreduce) {
-      PADDLE_ENFORCE(trainers.size() > 1, "trainers.size():%llu < 1",
-                     trainers.size());
-      PADDLE_ENFORCE(inter_nranks > 1, "inter_nranks:%d < 1", inter_nranks);
-      PADDLE_ENFORCE((trainers.size() % inter_nranks == 0),
-                     "trainers.size():%llu mod inter_nranks:%d != 0",
-                     trainers.size(), inter_nranks);
-
-      inter_trainer_id = trainer_id % inter_nranks;
-
-      if (trainer_id % inter_nranks == 0) {
-        exter_trainer_id = trainer_id / inter_nranks;
-      }
-    }
-
-    if (trainer_id != 0) {
-      GetIdByServer(endpoint, &local_scope, dev_ctx, nccl_comm_num,
-                    use_hierarchical_allreduce, trainer_id, inter_trainer_id,
-                    exter_trainer_id);
-    }
-
-    std::ostringstream ss;
-    for (size_t i = 0; i < trainers.size(); i++) {
-      ss << trainers[i] << ",";
-    }
-
-    VLOG(1) << "trainer_id:" << trainer_id
-            << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce
-            << ", inter_nranks:" << inter_nranks
-            << ", inter_trainer_id:" << inter_trainer_id
-            << ", exter_trainer_id:" << exter_trainer_id
-            << ", trainers:" << ss.str();
-
-    // init flat
-    if (trainer_id == 0) {
-      std::vector<std::string> flat_endpoints;
-      flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1,
-                            trainers.end());
-      // flat nccl_id
-      for (int i = 0; i < nccl_comm_num; i++) {
-        std::string var_name = platform::GetFlatNCCLVarName(i);
-        GenerateAndSend(&local_scope, dev_ctx, var_name, flat_endpoints);
-      }
-    }
-
-    if (!use_hierarchical_allreduce) {
-      return;
-    }
-
-    PADDLE_ENFORCE(trainers.size() % inter_nranks == 0,
-                   "enpoints.size:%llu mod inter_nranks:%d should ==0",
-                   trainers.size(), inter_nranks);
-    PADDLE_ENFORCE(inter_nranks > 1, "inter_nranks:%d must > 1", inter_nranks);
-
-    // hierarchical inter ncclid
-    if (inter_trainer_id == 0) {
-      std::ostringstream ss;
-      ss << endpoint;
-      std::vector<std::string> inter_endpoints;
-      for (int i = trainer_id + 1; i < trainer_id + inter_nranks &&
-                                   i < static_cast<int>(trainers.size());
-           i++) {
-        ss << ",";
-        inter_endpoints.push_back(trainers[i]);
-        ss << trainers[i];
-      }
-      VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
-      for (int i = 0; i < nccl_comm_num; i++) {
-        std::string nccl_var_name =
-            platform::GetHierarchicalInterNCCLVarName(i);
-        GenerateAndSend(&local_scope, dev_ctx, nccl_var_name, inter_endpoints);
-      }
-    }
-
-    // hierarchical exter ncclid
-    if (exter_trainer_id == 0) {
-      std::ostringstream ss;
-      std::vector<std::string> exter_endpoints;
-      ss << endpoint;
-      for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) {
-        ss << ",";
-        exter_endpoints.push_back(trainers[i]);
-        ss << trainers[i];
-      }
-      VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
-      for (int i = 0; i < nccl_comm_num; i++) {
-        std::string nccl_var_name =
-            platform::GetHierarchicalExterNCCLVarName(i);
-        GenerateAndSend(&local_scope, dev_ctx, nccl_var_name, exter_endpoints);
-      }
-    }
-  }
-
- private:
-  void GenerateAndSend(framework::Scope* scope,
-                       const platform::DeviceContext& dev_ctx,
-                       const std::string& nccl_id_name,
-                       const std::vector<std::string>& endpoint_list) const {
-    auto var = scope->FindVar(nccl_id_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "can't find nccl_id_var_name:%s",
-                            nccl_id_name);
-    auto id = var->GetMutable<ncclUniqueId>();
-    PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(id));
-
-    distributed::RPCClient* client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
-
-    for (auto& ep : endpoint_list) {
-      VLOG(3) << "sending nccl_id_var:" << nccl_id_name << " to " << ep;
-      client->AsyncSendVar(ep, dev_ctx, *scope, nccl_id_name);
-    }
-    client->Wait();
-    for (auto& ep : endpoint_list) {
-      client->AsyncSendBatchBarrier(ep);
-    }
-    client->Wait();
-    VLOG(3) << "sending completed...";
-  }
-
-  void GetIdByServer(const std::string& endpoint, framework::Scope* scope,
-                     const platform::DeviceContext& dev_ctx, int nccl_comm_num,
-                     bool use_hierarchical_allreduce, int trainer_id,
-                     int inter_trainer_id, int exter_trainer_id) const {
-    // std::string endpoint = Attr<std::string>("endpoint");
-    // NOTE: Can not use unique_ptr here because the default
-    // deleter will call GRPC Server's base class's dtor and
-    // that will cause a wired crash.
-    distributed::RequestSendHandler rpc_h(true);
-    std::unique_ptr<distributed::RPCServer> rpc_service(
-        new RPCSERVER_T(endpoint, 1));
-
-    rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
-    rpc_h.SetRPCServer(rpc_service.get());
-
-    framework::ProgramDesc empty_program;
-    framework::Executor executor(dev_ctx.GetPlace());
-    rpc_h.SetScope(scope);
-    rpc_h.SetDevCtx(&dev_ctx);
-    rpc_h.SetProgram(&empty_program);
-    rpc_h.SetExecutor(&executor);
-
-    std::thread server_thread(
-        std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
-
-    for (int i = 0; i < nccl_comm_num; i++) {
-      rpc_service->SetCond(distributed::kRequestSend);
-      VLOG(3) << "trainer_id:" << trainer_id
-              << " start getting nccl id from trainer 0, nccl_comm_no:" << i;
-      rpc_service->WaitBarrier(distributed::kRequestSend);
-      rpc_service->ResetBarrierCounter();
-    }
-
-    if (use_hierarchical_allreduce) {
-      if (inter_trainer_id > 0) {
-        for (int i = 0; i < nccl_comm_num; i++) {
-          rpc_service->SetCond(distributed::kRequestSend);
-          VLOG(3) << "trainer_id:" << trainer_id
-                  << ", inter_trainer_id:" << inter_trainer_id
-                  << " start getting nccl id from inter_trainer:" << i;
-          rpc_service->WaitBarrier(distributed::kRequestSend);
-          rpc_service->ResetBarrierCounter();
-        }
-      }
-
-      if (exter_trainer_id > 0) {
-        for (int i = 0; i < nccl_comm_num; i++) {
-          rpc_service->SetCond(distributed::kRequestSend);
-          VLOG(3)
-              << "trainer_id:" << trainer_id
-              << ", exter_trainer_id:" << exter_trainer_id
-              << " start getting nccl id from exter_trainer 0, nccl_comm_no:"
-              << i;
-          rpc_service->WaitBarrier(distributed::kRequestSend);
-          rpc_service->ResetBarrierCounter();
-        }
-      }
-    }
-
-    VLOG(3) << "traier_id:" << trainer_id
-            << ", inter_trainer_id:" << inter_trainer_id
-            << ", exter_trainer_id:" << exter_trainer_id
-            << " got nccl id and stop server...";
-    rpc_service->ShutDown();
-    VLOG(3) << "rpc server stopped";
-    server_thread.join();
-  }
-};
-
-class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput("NCCLID", "Raw variable contains a NCCL UniqueId instaces.");
-    AddComment(R"DOC(
-GenNCCLId operator
-
-For trainer 0: generate a new UniqueId and send it to all the other trainers.
-For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
-)DOC");
-    AddAttr<std::vector<std::string>>(
-        "trainers",
-        "['trainer0_ip:port', 'trainer1_ip:port', ...] "
-        "list of all trainer endpoints")
-        .SetDefault({});
-    AddAttr<int>("trainer_id",
-                 "(int) "
-                 "The index of the trainer in distributed training.");
-    AddAttr<int>("nccl_comm_num",
-                 "(int default 1) "
-                 "The number of nccl communicator num.")
-        .SetDefault(1);
-    AddAttr<bool>("use_hierarchical_allreduce",
-                  "(bool default false) "
-                  "Wheter to use hierarchical allreduce.")
-        .SetDefault(false);
-    AddAttr<int>("hierarchical_allreduce_inter_nranks",
-                 "(int default 1) "
-                 "Wheter to use hierarchical allreduce.")
-        .SetDefault(-1);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(gen_nccl_id, ops::GenNCCLIdOp, ops::GenNCCLIdOpMaker);
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
deleted file mode 100644
index 14b53086d1c848bd313f5dee85cf1db851d63bd1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ /dev/null
@@ -1,525 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdio.h>  // for removing the port file
-#include <csignal>
-#include <cstdlib>
-#include <fstream>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gflags/gflags.h"
-
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
-
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
-DEFINE_int32(rpc_get_thread_num, 12, "number of threads for rpc get");
-DEFINE_int32(rpc_prefetch_thread_num, 12, "number of threads for rpc prefetch");
-
-namespace paddle {
-namespace operators {
-
-void RunServer(std::shared_ptr<distributed::RPCServer> service) {
-  service->StartServer();
-  VLOG(4) << "RunServer thread end";
-}
-static void split(const std::string &str, char sep,
-                  std::vector<std::string> *pieces) {
-  pieces->clear();
-  if (str.empty()) {
-    return;
-  }
-  size_t pos = 0;
-  size_t next = str.find(sep, pos);
-  while (next != std::string::npos) {
-    pieces->push_back(str.substr(pos, next - pos));
-    pos = next + 1;
-    next = str.find(sep, pos);
-  }
-  if (!str.substr(pos).empty()) {
-    pieces->push_back(str.substr(pos));
-  }
-}
-
-static void ParallelExecuteBlocks(
-    const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
-    const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
-        &prepared,
-    framework::ProgramDesc *program, framework::Scope *scope) {
-  std::vector<std::future<void>> fs;
-  for (size_t idx : parallel_blkids) {
-    fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
-      int run_block = idx;  // thread local
-      try {
-        VLOG(3) << "running server block: " << run_block
-                << "pointer: " << prepared[run_block].get();
-        executor->RunPreparedContext(prepared[run_block].get(), scope);
-      } catch (const std::exception &e) {
-        LOG(FATAL) << "run sub program:" << idx << " error " << e.what();
-      }
-    }));
-  }
-  for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
-}
-
-ListenAndServOp::ListenAndServOp(const std::string &type,
-                                 const framework::VariableNameMap &inputs,
-                                 const framework::VariableNameMap &outputs,
-                                 const framework::AttributeMap &attrs)
-    : OperatorBase(type, inputs, outputs, attrs) {}
-
-ListenAndServOp::~ListenAndServOp() { Stop(); }
-
-void ListenAndServOp::Stop() {
-  rpc_service_->ShutDown();
-  server_thread_->join();
-  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
-  remove(file_path.c_str());
-}
-
-void ListenAndServOp::SavePort() const {
-  // NOTE: default write file to /tmp/paddle.selected_port
-  rpc_service_->SavePort();
-}
-
-static int64_t GetTimestamp() {
-  struct timeval tp;
-  gettimeofday(&tp, NULL);
-  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
-}
-
-void ListenAndServOp::RunSyncLoop(
-    framework::Executor *executor, framework::ProgramDesc *program,
-    framework::Scope *recv_scope, platform::DeviceContext *dev_ctx,
-    const std::vector<int> &prefetch_block_id_list,
-    const int checkpoint_point_block_id) const {
-  VLOG(2) << "RunSyncLoop";
-  size_t num_blocks = program->Size();
-  auto optimize_blocks =
-      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
-  PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
-
-  // Prepare all the server block
-  std::vector<int> optimize_blocks_list;
-  for (size_t i = 1; i < program->Size(); ++i) {
-    optimize_blocks_list.push_back(i);
-  }
-  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list);
-  // Insert placeholder for block0 which holds current op itself,
-  // NOTE the first block in `optimize_prepared` should never be ran.
-  optimize_prepared.insert(
-      optimize_prepared.begin(),
-      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
-
-  // Trainers will get all parameters from pserver in the
-  // startup program, so we will wait RequestGet first
-  rpc_service_->SetCond(distributed::kRequestGet);
-  rpc_service_->WaitBarrier(distributed::kRequestGet);
-  rpc_service_->ResetBarrierCounter();
-
-  while (true) {
-    // Get from multiple trainers, we don't care about the order in which
-    // the gradients arrives, just add suffix 0~n and merge the gradient.
-    VLOG(3) << "wait all clients to send gradient";
-    rpc_service_->SetCond(distributed::kRequestSend);
-    VLOG(3) << "wait all clients to send send_barrier";
-    rpc_service_->WaitBarrier(distributed::kRequestSend);
-
-    if (rpc_service_->IsExit()) {
-      LOG(WARNING) << "get exit!rpc_processor break!";
-      rpc_service_->SetCond(distributed::kRequestGet);
-      break;
-    }
-
-    // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
-    // and this will still work.
-    // The optimize blocks which have the same parent ID would run parallel
-    // TODO(Yancey1989): need to use ParallelExecutor for future
-    int32_t last_parent_blkid = optimize_blocks[0]->Parent();
-    std::vector<size_t> parallel_blkids;
-    parallel_blkids.push_back(optimize_blocks[0]->ID());
-    double ts = GetTimestamp();
-    for (size_t i = 1; i < optimize_blocks.size(); ++i) {
-      // skip the first optimize block because it is already in the
-      // parallel_blkids.
-      int blkid = optimize_blocks[i]->ID();
-      if (program->Block(blkid).Parent() != last_parent_blkid) {
-        ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
-                              program, recv_scope);
-        parallel_blkids.clear();
-        last_parent_blkid = program->Block(blkid).Parent();
-      }
-      parallel_blkids.push_back(blkid);
-    }
-    ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
-                          recv_scope);
-    VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
-
-    VLOG(3) << "ResetReceivedVars";
-    ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());
-
-    VLOG(3) << "wait all clients to get parameters back";
-    rpc_service_->SetCond(distributed::kRequestGet);
-    VLOG(3) << "wait all clients to send fetch_barrier";
-    rpc_service_->WaitBarrier(distributed::kRequestGet);
-    VLOG(3) << "ResetBarrierCounter";
-    rpc_service_->ResetBarrierCounter();
-  }  // while(true)
-}
-
-void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
-                                        platform::DeviceContext *dev_ctx,
-                                        bool reset_all) const {
-  for (auto &varname : sparse_vars_) {
-    auto var = recv_scope->FindVar(varname);
-    if (var == nullptr) {
-      VLOG(2) << "can not find var " << varname << " in received scope";
-      continue;
-    }
-    if (var->IsType<framework::SelectedRows>()) {
-      VLOG(3) << "reset sparse var: " << varname;
-      var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
-    } else {
-      PADDLE_THROW("The type of sparse var should be SelectedRows");
-    }
-  }
-  if (UNLIKELY(reset_all)) {
-    for (auto &varname : dense_vars_) {
-      auto var = recv_scope->FindVar(varname);
-      if (var == nullptr) {
-        VLOG(2) << "can not find var " << varname << " in received scope";
-        continue;
-      }
-      if (var->IsType<framework::LoDTensor>()) {
-        math::set_constant(*dev_ctx, var->GetMutable<framework::LoDTensor>(),
-                           static_cast<float>(0));
-      } else if (var->IsType<framework::Tensor>()) {
-        math::set_constant(*dev_ctx, var->GetMutable<framework::Tensor>(),
-                           static_cast<float>(0));
-      } else {
-        PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]");
-      }
-    }
-  }
-}
-
-void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
-                                   framework::ProgramDesc *program,
-                                   framework::Scope *recv_scope) const {
-  VLOG(2) << "RunAsyncLoop";
-  auto grad_to_block_id_str =
-      Attr<std::vector<std::string>>("grad_to_block_id");
-  DoubleFindMap<std::string, int32_t> grad_to_block_id;
-
-  auto append_block_maps = [](DoubleFindMap<std::string, int32_t> *out_map,
-                              const std::string &grad_and_id) {
-    std::vector<std::string> pieces;
-    split(grad_and_id, ':', &pieces);
-    VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
-    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0);
-
-    int block_id = std::stoi(pieces[1]);
-    (*out_map)[pieces[0]] = block_id;
-  };
-
-  for (const auto &grad_and_id : grad_to_block_id_str) {
-    append_block_maps(&grad_to_block_id, grad_and_id);
-  }
-
-  size_t num_blocks = program->Size();
-  PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
-
-  std::vector<int> block_list;
-  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
-  }
-  auto optimize_prepared = executor->Prepare(*program, block_list);
-  // execute global block if needed, block id 1 in the program is global
-  // block if it's not bind to a grad var for it's update.
-  if (block_list[0] == 1 &&
-      grad_to_block_id.find_value(static_cast<int32_t>(1)) ==
-          grad_to_block_id.end()) {
-    executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
-  }
-  std::unordered_map<std::string,
-                     std::shared_ptr<framework::ExecutorPrepareContext>>
-      grad_to_prepared_ctx, param_to_prepared_ctx;
-  for (size_t i = 0; i < block_list.size(); ++i) {
-    auto blkid = block_list[i];
-    auto it = grad_to_block_id.find_value(blkid);
-    if (it != grad_to_block_id.end()) {
-      grad_to_prepared_ctx[it->first] = optimize_prepared[i];
-    }
-  }
-
-  request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
-  request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
-  request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
-
-  while (true) {
-    if (rpc_service_->IsExit()) {
-      VLOG(4) << "get exit!rpc_processor break!";
-      break;
-    }
-
-    sleep(1);
-  }  // while(true)
-}
-
-static void FillRequestCtx(
-    distributed::RequestHandler *h, framework::Scope *scope,
-    platform::DeviceContext *dev_ctx, framework::Executor *executor,
-    framework::ProgramDesc *program,
-    std::unordered_map<std::string,
-                       std::shared_ptr<framework::ExecutorPrepareContext>>
-        *prefetch_ctx,
-    std::unordered_map<std::string, std::string>
-        *sparse_grad_name_to_param_name,
-    std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx,
-    distributed::RPCServer *rpc_server) {
-  h->SetScope(scope);
-  h->SetDevCtx(dev_ctx);
-  h->SetExecutor(executor);
-  h->SetProgram(program);
-  h->SetPrefetchPreparedCtx(prefetch_ctx);
-  h->SetSparseGradToParam(sparse_grad_name_to_param_name);
-  h->SetRPCServer(rpc_server);
-  h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx);
-}
-
-void ListenAndServOp::CacheVarsType(const std::vector<std::string> &varnames,
-                                    const framework::Scope &scope) const {
-  for (const auto &varname : varnames) {
-    auto var = scope.FindVar(varname);
-    PADDLE_ENFORCE(var != nullptr,
-                   "Received var should be initialized in the received scope.");
-    if (var->IsType<framework::SelectedRows>()) {
-      sparse_vars_.push_back(varname);
-    } else if (var->IsType<framework::LoDTensor>() ||
-               var->IsType<framework::Tensor>()) {
-      dense_vars_.push_back(varname);
-    } else {
-      PADDLE_THROW(
-          "The type of received var should be in [SelectedRows, LoDTensor, "
-          "Tensor].");
-    }
-  }
-}
-
-void ListenAndServOp::RunImpl(const framework::Scope &scope,
-                              const platform::Place &dev_place) const {
-  // Mark this as PS that it should decide profiling by listening from trainer.
-  platform::SetProfileListener();
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(dev_place);
-  framework::Scope &recv_scope = scope.NewScope();
-
-  bool sync_mode = Attr<bool>("sync_mode");
-  bool dc_sgd = Attr<bool>("dc_asgd");
-  auto fan_in = Attr<int>("Fanin");
-  auto inputs = Inputs("X");
-
-  PADDLE_ENFORCE(!rpc_service_);
-  std::string endpoint = Attr<std::string>("endpoint");
-  int checkpoint_block_id = Attr<int>(kCheckpointBlockId);
-
-  VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
-          << ", end_point:" << endpoint
-          << ", checkpoint_block_id: " << checkpoint_block_id;
-
-  rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
-
-  request_send_handler_.reset(
-      new distributed::RequestSendHandler(sync_mode, dc_sgd));
-  request_get_handler_.reset(
-      new distributed::RequestGetHandler(sync_mode, dc_sgd));
-  request_prefetch_handler_.reset(
-      new distributed::RequestPrefetchHandler(sync_mode));
-  request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler(
-      sync_mode, checkpoint_block_id));
-  request_get_no_barrier_handler_.reset(
-      new distributed::RequestGetNoBarrierHandler());
-
-  rpc_service_->RegisterRPC(distributed::kRequestSend,
-                            request_send_handler_.get(),
-                            FLAGS_rpc_send_thread_num);
-  rpc_service_->RegisterRPC(distributed::kRequestGet,
-                            request_get_handler_.get(),
-                            FLAGS_rpc_get_thread_num);
-  rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
-                            request_prefetch_handler_.get(),
-                            FLAGS_rpc_prefetch_thread_num);
-  rpc_service_->RegisterRPC(distributed::kRequestCheckpoint,
-                            request_checkpoint_handler_.get());
-  rpc_service_->RegisterRPC(distributed::kRequestGetNoBarrier,
-                            request_get_no_barrier_handler_.get());
-
-  auto optimize_blocks =
-      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
-  PADDLE_ENFORCE(optimize_blocks.size() >= 1,
-                 "optimize blocks should be 1 at least on the pserver side.");
-  auto *program = optimize_blocks[0]->Program();
-  framework::Executor executor(dev_place);
-
-  std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr;
-  if (checkpoint_block_id != -1) {
-    auto ctx = executor.Prepare(*program, checkpoint_block_id);
-    // see: https://stackoverflow.com/a/14856553
-    ckpt_pre_context = std::move(ctx);
-  }
-
-  // prepare for prefetch
-  std::vector<int> prefetch_block_id_list;
-  std::unordered_map<int, std::string> block_id_to_prefetch_var_name;
-
-  auto prefetch_var_name_to_block_id_str =
-      Attr<std::vector<std::string>>(kPrefetchVarNameToBlockId);
-  for (const auto &prefetch_var_name_and_id :
-       prefetch_var_name_to_block_id_str) {
-    std::vector<std::string> pieces;
-    split(prefetch_var_name_and_id, ':', &pieces);
-    VLOG(3) << "after split, prefetch_var = " << pieces[0]
-            << ", id=" << pieces[1];
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
-
-    int block_id = std::stoi(pieces[1]);
-    prefetch_block_id_list.push_back(block_id);
-    block_id_to_prefetch_var_name[block_id] = pieces[0];
-  }
-
-  auto prefetch_prepared = executor.Prepare(*program, prefetch_block_id_list);
-
-  std::unordered_map<std::string,
-                     std::shared_ptr<framework::ExecutorPrepareContext>>
-      prefetch_var_name_to_prepared_ctx;
-  for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) {
-    auto block_id = prefetch_block_id_list[i];
-    auto prefetch_var_name = block_id_to_prefetch_var_name[block_id];
-    prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
-  }
-
-  // parse attr of kSparseGradToParam  sparse_grad_name -> param_name
-  std::unordered_map<std::string, std::string> sparse_grad_name_to_param_name;
-  auto sparse_grad_name_to_param_name_str =
-      Attr<std::vector<std::string>>(kSparseGradToParam);
-  for (const auto &sparse_grad_name_and_param_name :
-       sparse_grad_name_to_param_name_str) {
-    std::vector<std::string> pieces;
-    split(sparse_grad_name_and_param_name, ':', &pieces);
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
-    VLOG(3) << "after split, sparse_grad_name = " << pieces[0]
-            << ", param_name = " << pieces[1];
-    sparse_grad_name_to_param_name[pieces[0]] = pieces[1];
-  }
-
-  auto f = std::bind(
-      FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, &executor,
-      program, &prefetch_var_name_to_prepared_ctx,
-      &sparse_grad_name_to_param_name, ckpt_pre_context, rpc_service_.get());
-
-  f(request_send_handler_.get());
-  f(request_get_handler_.get());
-  f(request_prefetch_handler_.get());
-  f(request_checkpoint_handler_.get());
-  f(request_get_no_barrier_handler_.get());
-
-  // start the server listening after all member initialized.
-  server_thread_.reset(new std::thread(RunServer, rpc_service_));
-  VLOG(3) << "wait server thread to become ready...";
-  rpc_service_->WaitServerReady();
-
-  // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
-  signal(SIGINT, SignalHandler::StopAndExit);
-  signal(SIGTERM, SignalHandler::StopAndExit);
-
-  // Cache the type of the received vars as `sparse_vars_` and `dense_vars_`
-  // so that we can reset them at the end of each iteration.
-  // NOTE: only used in sync update
-  CacheVarsType(inputs, recv_scope);
-
-  // Write to a file of server selected port for python use.
-  SavePort();
-  if (sync_mode) {
-    RunSyncLoop(&executor, program, &recv_scope, &dev_ctx,
-                prefetch_block_id_list, checkpoint_block_id);
-  } else {
-    distributed::AsyncSparseParamUpdateRecorder::Init(
-        fan_in, sparse_grad_name_to_param_name);
-    RunAsyncLoop(&executor, program, &recv_scope);
-  }
-}
-
-class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
-    AddComment(R"DOC(" + "ListenAndServ operator" + "\n" + "This operator" +
-" will start a RPC server which can receive variables from send_op and send" +
-"back variables to recv_op.)DOC");
-    AddAttr<std::string>("endpoint",
-                         "(string, default 127.0.0.1:6164)"
-                         "IP address to listen on.")
-        .SetDefault("127.0.0.1:6164")
-        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
-    AddAttr<std::vector<std::string>>(
-        "grad_to_block_id",
-        "['param1@GRAD.block0:1', 'param2@GRAD.blockn:2'] "
-        "a map from grad name to it's optimize block id")
-        .SetDefault({});
-    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
-    AddAttr<bool>("dc_asgd", "set to true will enable DC-ASGD training.")
-        .SetDefault(false);
-    AddAttr<std::vector<framework::BlockDesc *>>(
-        kOptimizeBlocks, "Optimize blocks to run on server side.")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
-                                      "prefetch blocks to run on server side.")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        kSparseGradToParam,
-        "sparse grad name to param name. like: 'emb@Grad:emb'")
-        .SetDefault({});
-    AddAttr<int>("Fanin", "How many clients send to this server.")
-        .SetDefault(1);
-    AddAttr<int>(kCheckpointBlockId,
-                 "BolckID to run save checkpoint on pserer.")
-        .SetDefault(-1);
-  }
-};
-
-void SignalHandler::StopAndExit(int signal_num) {
-  // Do not use VLOG here for the device for printing maybe already released.
-  // exit will release interal allocated resoureces.
-  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
-  remove(file_path.c_str());
-  exit(0);
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp,
-                  ops::ListenAndServOpMaker);
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
deleted file mode 100644
index 1cf2130d7a593077d1145b4f3be379c32557dd53..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <atomic>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-constexpr char kOptimizeBlocks[] = "optimize_blocks";
-constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
-constexpr char kCheckpointBlockId[] = "checkpint_block_id";
-constexpr char kSparseGradToParam[] = "sparse_grad_to_param";
-
-void RunServer(std::shared_ptr<distributed::RPCServer> service);
-
-template <class TKey, class TValue>
-class DoubleFindMap : public std::unordered_map<TKey, TValue> {
- public:
-  typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
-    return std::find_if(this->begin(), this->end(),
-                        [&v](const std::pair<const std::string, int> p) {
-                          return p.second == v;
-                        });
-  }
-};
-
-class ListenAndServOp : public framework::OperatorBase {
- public:
-  ListenAndServOp(const std::string& type,
-                  const framework::VariableNameMap& inputs,
-                  const framework::VariableNameMap& outputs,
-                  const framework::AttributeMap& attrs);
-  virtual ~ListenAndServOp();
-
-  void RunSyncLoop(framework::Executor* executor,
-                   framework::ProgramDesc* program,
-                   framework::Scope* recv_scope,
-                   platform::DeviceContext* dev_ctx,
-                   const std::vector<int>& prefetch_block_id_list,
-                   const int checkpoint_point_block_id) const;
-
-  void RunAsyncLoop(framework::Executor* executor,
-                    framework::ProgramDesc* program,
-                    framework::Scope* recv_scope) const;
-
-  void SavePort() const;
-
-  int GetSelectedPort() { return rpc_service_->GetSelectedPort(); }
-
-  void Stop() override;
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override;
-
-  void ResetReceivedVars(framework::Scope* recv_scope,
-                         platform::DeviceContext* dev_ctx,
-                         bool reset_all = false) const;
-
-  void CacheVarsType(const std::vector<std::string>& varnames,
-                     const framework::Scope& scope) const;
-
- protected:
-  mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
-  mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
-  mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
-  mutable std::shared_ptr<distributed::RequestHandler>
-      request_get_no_barrier_handler_;
-  mutable std::shared_ptr<distributed::RequestHandler>
-      request_prefetch_handler_;
-  mutable std::shared_ptr<distributed::RequestHandler>
-      request_checkpoint_handler_;
-
-  mutable std::shared_ptr<std::thread> server_thread_;
-  mutable std::vector<std::string> sparse_vars_;
-  mutable std::vector<std::string> dense_vars_;
-};
-
-class SignalHandler {
- public:
-  static void StopAndExit(int signal_num);
-
- private:
-  DISABLE_COPY_AND_ASSIGN(SignalHandler);
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
deleted file mode 100644
index 1b0b4dd31693340bc39c0da8995a2a2d40b13e00..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/distributed_ops/merge_ids_op.h"
-
-namespace paddle {
-namespace operators {
-
-class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}")
-        .AsDuplicable();
-    AddInput("Rows", "(LoDTensor) the input ids with shape{row_size, 1}, ")
-        .AsDuplicable();
-    AddInput("X",
-             "(LoDTensors) multi input tensor with shape{Rows, N}, N is the "
-             "size of embedding table")
-        .AsDuplicable();
-    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.")
-        .AsDuplicable();
-
-    AddComment(R"DOC(
-Merge multi LoDTensor's into one according to Ids's shard num.
-
-
-split_ids_op -> prefetch_op -> merge_ids_op
-
-
-merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op
- will split input Ids into multiple tensors according to Id's shard number.
-prefetch_op will send them to parameter server to prefetch embedding value
-back. During split, the order of ids is disordered. In merge_ids_op we use
-the original Ids to restore the order of the fetched embedding value and
- also pass the lod information to the merged output.
-
-
-Example:
-
-    Ids = [1,2,3,4,5,6] # 3 shared
-
-split_ids_op ->
-
-    Id0 = [3, 6] # id % 3 == 0
-    Id1 = [1, 4] # id % 3 == 1
-    Id2 = [2, 5] # id % 3 == 2
-
-prefetch_op ->
-
-    X0 = [[0.3 0.3]   # 3
-          [0.6 0.6]]  # 6
-    X1 = [[0.1 0.1]   # 1
-          [0.4 0.4]]  # 4
-    X2 = [[0.2 0.2]   # 2
-          [0.5 0.5]]  # 5
-
-merge_ids_op ->
-
-    Out = [[0.1 0.1]  # 1
-           [0.2 0.2]  # 2
-           [0.3 0.3]  # 3
-           [0.4 0.4]  # 4
-           [0.5 0.5]  # 5
-           [0.6 0.6]] # 6
-)DOC");
-  }
-};
-
-class MergeIdsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs("Ids"),
-                   "MergeIdsOp must has multi input Ids.");
-    PADDLE_ENFORCE(ctx->HasInputs("Rows"),
-                   "MergeIdsOp must has multi input Rows.");
-    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has multi input X.");
-    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
-                   "MergeIdsOp must has multi output Out.");
-
-    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    auto ids_dims = ctx->GetInputsDim("Ids");
-    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2);
-      PADDLE_ENFORCE_EQ(ids_dims[0][1], 1);
-    }
-    auto x_var_type = ctx->GetInputsVarType("X");
-    for (auto &var_type : x_var_type) {
-      PADDLE_ENFORCE_EQ(var_type, framework::proto::VarType::LOD_TENSOR,
-                        "input X only support lod tensors");
-    }
-    ctx->ShareLoD("Ids", "Out");
-  }
-
- private:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.MultiInput<framework::Tensor>("X").front()->type(), ctx.GetPlace());
-  }
-};
-
-class MergeIdsOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
-    for (auto &out_var : ctx->Output("Out")) {
-      ctx->SetType(out_var, input_type);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker,
-                  ops::MergeIdsOpInferVarType);
-REGISTER_OP_CPU_KERNEL(
-    merge_ids, ops::MergeIdsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.h b/paddle/fluid/operators/distributed_ops/merge_ids_op.h
deleted file mode 100644
index 05c00251b97bb5071102a43208c1fbbfa4ef8d2d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/merge_ids_op.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <tuple>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class MergeIdsOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto place = ctx.GetPlace();
-    if (!platform::is_cpu_place(place)) {
-      PADDLE_THROW("MergeIds do not support GPU kernel");
-    }
-
-    const auto ids = ctx.MultiInput<framework::LoDTensor>("Ids");
-    const auto row_ids = ctx.MultiInput<framework::LoDTensor>("Rows");
-    const auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
-    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
-
-    PADDLE_ENFORCE_EQ(row_ids.size(), x_tensors.size(),
-                      "the number of Rows and X should be the same");
-    PADDLE_ENFORCE_EQ(ids.size(), outs.size(),
-                      "the number of Ids and Out should be the same");
-
-    int64_t row_ids_size = 0;
-    int64_t row_size = 0;
-    int64_t embedding_size = 0;
-
-    for (size_t i = 0; i < x_tensors.size(); ++i) {
-      const auto *x_tensor = x_tensors[i];
-      const auto *row_id = row_ids[i];
-
-      if (embedding_size == 0) {
-        embedding_size = x_tensor->dims()[1];
-      }
-      PADDLE_ENFORCE_EQ(embedding_size, x_tensor->dims()[1],
-                        "embedding size of all input should be the same");
-      row_size += x_tensor->dims()[0];
-      row_ids_size += row_id->dims()[0];
-    }
-
-    PADDLE_ENFORCE_EQ(
-        row_size, row_ids_size,
-        "the merged X dim[0] and merged Rows dim[0] should be the same");
-
-    std::unordered_map<int64_t, std::tuple<int64_t, int64_t>>
-        selected_rows_idx_map;
-    for (size_t i = 0; i < x_tensors.size(); ++i) {
-      const auto *row_id = row_ids[i];
-
-      for (auto j = 0; j < row_id->numel(); ++j) {
-        int64_t key = row_id->data<int64_t>()[j];
-        std::tuple<int64_t, int64_t> val = std::make_tuple(i, j);
-        selected_rows_idx_map.insert(std::make_pair(key, val));
-      }
-    }
-    PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(),
-                      "the rows and tensor map size should be the same");
-
-    for (size_t i = 0; i < outs.size(); ++i) {
-      auto *out_ids = ids[i];
-      auto *out = outs[i];
-
-      out->set_lod(out_ids->lod());
-
-      auto nums = out_ids->dims()[0];
-      auto *out_data = out->mutable_data<T>(
-          framework::make_ddim({nums, embedding_size}), place);
-      for (auto j = 0; j < nums; ++j) {
-        auto id = out_ids->data<int64_t>()[j];
-        auto row_tuple = selected_rows_idx_map.at(id);
-        auto row_idx = std::get<1>(row_tuple);
-        const auto *x_tensor = x_tensors[std::get<0>(row_tuple)];
-
-        memcpy(out_data + embedding_size * j,
-               x_tensor->data<T>() + row_idx * embedding_size,
-               sizeof(T) * embedding_size);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/prefetch_op.cc b/paddle/fluid/operators/distributed_ops/prefetch_op.cc
deleted file mode 100644
index 52b96d5f8ef7851aa0a99c1d64771f1dc84c66ad..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/prefetch_op.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-
-namespace paddle {
-namespace operators {
-
-class PrefetchOp : public framework::OperatorBase {
- public:
-  PrefetchOp(const std::string& type, const framework::VariableNameMap& inputs,
-             const framework::VariableNameMap& outputs,
-             const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    auto ins = Inputs("X");
-    auto outs = Outputs("Out");
-
-    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
-
-    distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-            Attr<int>("trainer_id"));
-
-    std::vector<distributed::VarHandlePtr> rets;
-    for (size_t i = 0; i < ins.size(); i++) {
-      if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
-                << outs[i] << " back";
-        rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope,
-                                                    ins[i], outs[i]));
-      } else {
-        VLOG(3) << "don't send no-initialied variable: " << ins[i];
-      }
-    }
-    for (size_t i = 0; i < rets.size(); i++) {
-      PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-    }
-  }
-};
-
-class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(LoDTensor) Input Id variables to be sent").AsDuplicable();
-    AddOutput("Out",
-              "(LoDTensor) result "
-              "to be fetched from parameter server")
-        .AsDuplicable();
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<std::string>>(
-        "epmap",
-        "(string vector, default 127.0.0.1:6164)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({"127.0.0.1:6164"});
-    AddComment(R"DOC(
-Prefetch operator
-
-This operator will send Ids variables to listen_and_serve op at
-the parameter server and fetch result back.
-)DOC");
-  }
-};
-
-class PrefetchOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(prefetch, ops::PrefetchOp,
-                  paddle::framework::EmptyGradOpMaker, ops::PrefetchOpMaker,
-                  ops::PrefetchOpShapeInference);
diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
deleted file mode 100644
index 30a161fe2565e2f8fc6c86537b17d82a5905deac..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/parameter_recv.h"
-#include "paddle/fluid/operators/distributed/rpc_common.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-
-class RecvOp : public framework::OperatorBase {
- public:
-  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    int do_not_run = Attr<int>("do_not_run");
-    if (do_not_run) {
-      VLOG(3) << "recv do not run!";
-      return;
-    }
-    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    std::vector<std::string> varnames =
-        Attr<std::vector<std::string>>("varnames");
-
-    auto outs = Outputs("Out");
-    bool with_barrier = Attr<bool>("with_barrier");
-
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &ctx = *pool.Get(place);
-    auto trainer_id = Attr<int>("trainer_id");
-
-    distributed::RPCClient *rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
-
-    std::vector<std::string> recv_varnames =
-        Attr<std::vector<std::string>>("recv_varnames");
-
-    if (recv_varnames.size() > 0) {
-      auto recv_functor = distributed::ParameterRecv<float>();
-      auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {},
-                                             trainer_id);
-      recv_functor(rpc_ctx, scope);
-    } else {
-      std::vector<distributed::VarHandlePtr> rets;
-      if (with_barrier) {
-        for (size_t i = 0; i < outs.size(); i++) {
-          std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
-          VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
-                  << varname << " and with AsyncGetVar";
-          rets.push_back(
-              rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i]));
-        }
-      } else {
-        for (size_t i = 0; i < outs.size(); i++) {
-          std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
-          VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
-                  << varname << " and with AsyncGetVarNoBarrier";
-          rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope,
-                                                          varname, outs[i]));
-        }
-      }
-      for (size_t i = 0; i < rets.size(); i++) {
-        VLOG(7) << "before sync_recv " << outs[i] << "from " << epmap[i];
-        PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
-        VLOG(7) << "after sync_recv " << outs[i] << "from " << epmap[i];
-      }
-    }
-  }
-};
-
-class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Any) Dummy inputs, used for control dependency")
-        .AsDuplicable();
-    AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
-    AddComment(R"DOC(
-Recv operator
-
-This operator can get variables from server side.
-)DOC");
-    AddAttr<std::vector<std::string>>("epmap",
-                                      "(string vector, default 127.0.0.1:6164)"
-                                      "Server endpoints in the order of input "
-                                      "variables for mapping")
-        .SetDefault({});
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<bool>("with_barrier",
-                  "(bool, default True) if with_barrier=False, will use "
-                  "AsyncGetVarNoBarrier get variable from pserver immediately")
-        .SetDefault(true);
-    AddAttr<std::vector<std::string>>(
-        "varnames",
-        "(string vector, default {}) "
-        "sometimes we need to put received var in another name "
-        "for example: we need var named 'moment_1@127.0.0.1:1001', "
-        "and it real name on parameter server is 'moment_1'. ")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "recv_varnames",
-        "(vector<string>) "
-        "the splited parameter varnames to be recved from pserver")
-        .SetDefault(std::vector<std::string>{});
-    AddAttr<int>("do_not_run", "if recv need to really run").SetDefault(0);
-  }
-};
-
-class RecvOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker,
-                  ops::RecvOpMaker, ops::RecvOpShapeInference);
diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
deleted file mode 100644
index 7e16e6ff66b603634aa7cd26f71a4f2d3159c4e4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h"
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class RefByTrainerIdOp : public framework::OperatorWithKernel {
- public:
-  RefByTrainerIdOp(const std::string &type,
-                   const framework::VariableNameMap &inputs,
-                   const framework::VariableNameMap &outputs,
-                   const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs("X"),
-                   "Input(X) of RefByTrainerIdOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("TrainerId"),
-                   "Input(TrainerId) of RefByTrainerIdOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of RefByTrainerIdOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("TrainerId").size(), 1,
-                      "TrainerId should be a scalar.");
-    // Out's shape is determined at runtime.
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.MultiInput<framework::Tensor>("X")[0]->type(), ctx.GetPlace());
-  }
-};
-
-class RefByTrainerIdOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input tensor list.").AsDuplicable();
-    AddInput("TrainerId", "(Tensor) Scalar int, the trainer id runtime value.");
-    AddOutput("Out", "(Tensor) Return one tensor reference of X[trainer_id]");
-    AddComment(R"DOC(
-**RefByTrainerId operator**
-
-Return a reference of a tensor, using trainer_id as the index to find from the input.
-
-$$Out = X[TrainerId]$$
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(ref_by_trainer_id, ops::RefByTrainerIdOp,
-                             ops::RefByTrainerIdOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    ref_by_trainer_id,
-    ops::RefByTrainerIdKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::RefByTrainerIdKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::RefByTrainerIdKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::RefByTrainerIdKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cu.cc b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cu.cc
deleted file mode 100644
index 168cd51355de56c2e2a83ba73d7eb14f6ba6e533..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cu.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    ref_by_trainer_id,
-    paddle::operators::RefByTrainerIdKernel<paddle::platform::CUDADeviceContext,
-                                            float>,
-    paddle::operators::RefByTrainerIdKernel<paddle::platform::CUDADeviceContext,
-                                            double>,
-    paddle::operators::RefByTrainerIdKernel<paddle::platform::CUDADeviceContext,
-                                            int>,
-    paddle::operators::RefByTrainerIdKernel<paddle::platform::CUDADeviceContext,
-                                            int64_t>);
diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h
deleted file mode 100644
index 34192278d84758d720e021215c14a54349ba0c62..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdio.h>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class RefByTrainerIdKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto in_list = context.MultiInput<framework::Tensor>("X");
-    auto* trainer_id_t = context.Input<framework::Tensor>("TrainerId");
-    int64_t trainer_id = 0;
-    auto* trainer_id_data = trainer_id_t->data<int64_t>();
-    if (platform::is_gpu_place(context.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      auto stream = context.cuda_device_context().stream();
-      memory::Copy<>(platform::CPUPlace(), &trainer_id,
-                     boost::get<platform::CUDAPlace>(context.GetPlace()),
-                     trainer_id_data, sizeof(int64_t), stream);
-#endif
-    } else {
-      trainer_id = *trainer_id_data;
-    }
-    PADDLE_ENFORCE_LT((size_t)trainer_id, in_list.size());
-    out->mutable_data<T>(context.GetPlace());
-    out->ShareDataWith(*(in_list[trainer_id]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
deleted file mode 100644
index 558d0090d734b8f4dc1c2b5ac4e894573cecfc7e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-
-class SendBarrierOp : public framework::OperatorBase {
- public:
-  SendBarrierOp(const std::string& type,
-                const framework::VariableNameMap& inputs,
-                const framework::VariableNameMap& outputs,
-                const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
-
-    distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-            Attr<int>("trainer_id"));
-
-    VLOG(3) << "SendBarrierOp sync";
-
-    std::vector<distributed::VarHandlePtr> rets;
-
-    for (auto& ep : eps) {
-      VLOG(3) << "send barrier, ep: " << ep;
-      rets.push_back(rpc_client->AsyncSendBatchBarrier(ep));
-    }
-
-    for (size_t i = 0; i < rets.size(); i++) {
-      PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
-    }
-  }
-};
-
-class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Any) Dummy inputs, used for control dependency")
-        .AsDuplicable();
-    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
-        .AsDuplicable();
-    AddComment(R"DOC(
-SendBarrier operator
-
-This operator will send a send barrier signal to list_and_serv op, so that
-the Parameter Server would knew all variables have been sent.
-)DOC");
-
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<std::string>>("endpoints",
-                                      "(string vector, default 127.0.0.1:6164)"
-                                      "Server endpoints to send variables to.")
-        .SetDefault({"127.0.0.1:6164"});
-  }
-};
-
-class SendBarrierOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp,
-                  paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker,
-                  ops::SendBarrierOpShapeInference);
diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc
deleted file mode 100644
index acb25b17d563c18500d1ea7edbea809283bccd06..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/send_op.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/blocking_queue.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed/communicator.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/parameter_send.h"
-#include "paddle/fluid/operators/distributed/rpc_common.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-
-class SendOp : public framework::OperatorBase {
- public:
-  SendOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    auto ins = Inputs("X");
-
-    auto epmap = Attr<std::vector<std::string>>("epmap");
-    auto trainer_id = Attr<int>("trainer_id");
-
-    auto send_varnames = Attr<std::vector<std::string>>("send_varnames");
-    auto height_sections = Attr<std::vector<int64_t>>("sections");
-
-    if (send_varnames.size() > 0) {
-      PADDLE_ENFORCE_EQ(ins.size(), 1, "");
-      if (distributed::Communicator::GetInstance() == nullptr) {
-        auto send_functor = distributed::ParameterSend<float>();
-        auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap,
-                                               height_sections, trainer_id);
-        send_functor(rpc_ctx, scope, true);
-      } else {
-        distributed::Communicator::GetInstance()->Send(ins[0], scope);
-      }
-    } else {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      auto& ctx = *pool.Get(place);
-
-      distributed::RPCClient* rpc_client =
-          distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
-
-      std::vector<distributed::VarHandlePtr> rets;
-      for (size_t i = 0; i < ins.size(); i++) {
-        if (NeedSend(scope, ins[i])) {
-          VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-          rets.push_back(
-              rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]));
-        } else {
-          VLOG(3) << "don't send no-initialied variable: " << ins[i];
-        }
-      }
-      for (size_t i = 0; i < rets.size(); i++) {
-        VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i];
-        PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
-        VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i];
-      }
-    }
-  }
-};
-
-class SendOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
-        .AsDuplicable();
-    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
-        .AsDuplicable();
-    AddComment(R"DOC(
-Send operator
-
-This operator will send variables to listen_and_serve op at the parameter server.
-)DOC");
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<std::string>>("epmap",
-                                      "(string vector, default 127.0.0.1:6164)"
-                                      "Server endpoints in the order of input "
-                                      "variables for mapping")
-        .SetDefault({"127.0.0.1:6164"});
-    AddAttr<std::vector<int64_t>>("sections",
-                                  "(vector<int>) "
-                                  "the length of each output along the "
-                                  "specified axis.")
-        .SetDefault(std::vector<int64_t>{});
-    AddAttr<std::vector<std::string>>(
-        "send_varnames",
-        "(vector<string>) "
-        "the splited output varnames to send to pserver")
-        .SetDefault(std::vector<std::string>{});
-    AddAttr<int>("num",
-                 "(int, default 0)"
-                 "Number of sub-tensors. This must evenly divide "
-                 "Input.dims()[axis]")
-        .SetDefault(0);
-  }
-};
-
-class SendOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(send, ops::SendOp, paddle::framework::EmptyGradOpMaker,
-                  ops::SendOpMaker, ops::SendOpShapeInference);
diff --git a/paddle/fluid/operators/distributed_ops/send_recv_op_test.cc b/paddle/fluid/operators/distributed_ops/send_recv_op_test.cc
deleted file mode 100644
index a6e1805cddbf3ff2cb3eb21f31187c2947f09bf1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/send_recv_op_test.cc
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-#include <string>
-#include <thread>  // NOLINT
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/string/printf.h"
-
-USE_NO_KERNEL_OP(send);
-USE_NO_KERNEL_OP(listen_and_serv);
-USE_OP(sum);
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-namespace m = paddle::operators::math;
-
-// global for simplicity.
-std::unique_ptr<f::OperatorBase> listen_and_serv_op;
-int selected_port;
-
-void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) {
-  p::CPUDeviceContext ctx(place);
-  for (int i = 0; i < 2; ++i) {
-    auto var_name = paddle::string::Sprintf("x%d", i);
-    auto var = scope->Var(var_name);
-    auto tensor = var->GetMutable<f::LoDTensor>();
-    tensor->Resize({10, 10});
-    float *expect = tensor->mutable_data<float>(place);
-    for (int64_t i = 0; i < tensor->numel(); ++i) {
-      expect[i] = static_cast<float>(i);
-    }
-  }
-
-  auto out_var = scope->Var("Out");
-  auto out_tensor = out_var->GetMutable<f::LoDTensor>();
-  out_tensor->Resize({10, 10});
-  out_tensor->mutable_data<float>(place);  // allocate
-}
-
-void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) {
-  p::CPUDeviceContext ctx(place);
-  int64_t height = 10;
-  int64_t row_numel = 10;
-  m::SetConstant<p::CPUDeviceContext, float> set_one;
-  // init x0
-  std::vector<int64_t> rows0{0, 4, 7};
-  auto x0_var = scope->Var("x0");
-  auto x0 = x0_var->GetMutable<f::SelectedRows>();
-  x0->set_rows(rows0);
-  x0->set_height(height);
-  auto x0_value = x0->mutable_value();
-  x0_value->mutable_data<float>(
-      f::make_ddim({static_cast<int64_t>(rows0.size()), row_numel}), place);
-  set_one(ctx, x0_value, 1.0);
-
-  // init x1
-  std::vector<int64_t> rows1{2, 9};
-  auto x1_var = scope->Var("x1");
-  auto x1 = x1_var->GetMutable<f::SelectedRows>();
-  x1->set_rows(rows1);
-  x1->set_height(height);
-  auto x1_value = x1->mutable_value();
-  x1_value->mutable_data<float>(
-      f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
-  set_one(ctx, x1_value, 1.0);
-
-  auto out_var = scope->Var("Out");
-  auto out = out_var->GetMutable<f::SelectedRows>();
-  auto out_value = out->mutable_value();
-  out->set_height(height);
-  out_value->mutable_data<float>(f::make_ddim({5, 10}), place);
-}
-
-void AddOp(const std::string &type, const f::VariableNameMap &inputs,
-           const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           f::BlockDesc *block, bool is_sparse) {
-  // insert output
-  for (auto kv : outputs) {
-    for (auto v : kv.second) {
-      auto var = block->Var(v);
-      var->SetDataType(f::proto::VarType::FP32);
-      var->SetPersistable(true);
-      if (is_sparse) {
-        var->SetType(f::proto::VarType::SELECTED_ROWS);
-      }
-    }
-  }
-
-  // insert op
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-
-void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
-  f::Scope scope;
-  p::CPUPlace place;
-  VLOG(4) << "before init tensor";
-  if (is_sparse) {
-    InitSelectedRowsInScope(place, &scope);
-  } else {
-    InitTensorsInScope(place, &scope);
-  }
-  // sub program run in listen_and_serv_op, for simple test we use sum
-  f::ProgramDesc program;
-  const auto &root_block = program.Block(0);
-  std::vector<framework::BlockDesc *> optimize_blocks;
-  auto *optimize_block = program.AppendBlock(root_block);
-  optimize_blocks.push_back(optimize_block);
-
-  auto *prefetch_block = program.AppendBlock(root_block);
-  // X for server side tensors, RX for received tensors, must be of same shape.
-  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block,
-        is_sparse);
-  f::AttributeMap attrs;
-  attrs.insert({"endpoint", std::string("127.0.0.1:0")});
-  attrs.insert({"Fanin", 1});
-  attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
-  attrs.insert({"GradList", std::vector<std::string>({"x1"})});
-  attrs.insert({"optimize_blocks", optimize_blocks});
-  attrs.insert({"PrefetchBlock", prefetch_block});
-  attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
-  attrs.insert({"sync_mode", true});
-  VLOG(4) << "before init op";
-  listen_and_serv_op =
-      f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
-  *initialized = true;
-  listen_and_serv_op->Run(scope, place);
-  LOG(INFO) << "server exit";
-}
-
-TEST(SendRecvOp, CPUDense) {
-  std::atomic<bool> initialized{false};
-  std::thread server_thread(StartServerNet, false, &initialized);
-  while (!initialized) {
-  }
-
-  static_cast<paddle::operators::ListenAndServOp *>(listen_and_serv_op.get())
-      ->WaitServerReady();
-
-  // local net
-  f::Scope scope;
-  p::CPUPlace place;
-  InitTensorsInScope(place, &scope);
-  // create rpc client var
-  scope.Var("RPC_CLIENT_VAR");
-
-  f::AttributeMap attrs;
-  auto *listen_and_serv_op_ptr =
-      static_cast<paddle::operators::ListenAndServOp *>(
-          listen_and_serv_op.get());
-  ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
-  selected_port = listen_and_serv_op_ptr->GetSelectedPort();
-  std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
-  attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
-  attrs.insert({"epmap", std::vector<std::string>({endpoint})});
-  const f::VariableNameMap &inputs = {{"X", {"x1"}}};
-  const f::VariableNameMap &outputs = {{"Out", {"Out"}}};
-
-  auto send_op = f::OpRegistry::CreateOp("send", inputs, outputs, attrs);
-  send_op->Run(scope, place);
-
-  auto in_var = scope.Var("x1");
-  auto tensor = in_var->GetMutable<f::LoDTensor>();
-  float *expected = tensor->data<float>();
-  auto out_var = scope.Var("Out");
-  auto target = out_var->GetMutable<f::LoDTensor>();
-  // x1 * 2 == x0
-  EXPECT_NE(target->memory_size(), size_t(0));
-  float *actual = target->data<float>();
-  for (int64_t i = 0; i < target->numel(); ++i) {
-    EXPECT_EQ(expected[i] * 2, actual[i]);
-  }
-  listen_and_serv_op->Stop();
-  server_thread.join();
-  listen_and_serv_op.reset(nullptr);
-  paddle::operators::ListenAndServOp::ResetPort();
-}
-
-TEST(SendRecvOp, CPUSparse) {
-  std::atomic<bool> initialized;
-  initialized = false;
-  std::thread server_thread(StartServerNet, true, &initialized);
-  while (!initialized) {
-  }
-  auto *listen_and_serv_op_ptr =
-      static_cast<paddle::operators::ListenAndServOp *>(
-          listen_and_serv_op.get());
-  ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
-  listen_and_serv_op_ptr->WaitServerReady();
-
-  // local net
-  f::Scope scope;
-  p::CPUPlace place;
-  p::CPUDeviceContext ctx(place);
-  InitSelectedRowsInScope(place, &scope);
-  scope.Var("RPC_CLIENT_VAR");
-  f::AttributeMap attrs;
-  selected_port = listen_and_serv_op_ptr->GetSelectedPort();
-  std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
-  attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
-  attrs.insert({"epmap", std::vector<std::string>({endpoint})});
-  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
-                                         {{"Out", {"Out"}}}, attrs);
-  send_op->Run(scope, place);
-
-  auto x0 = scope.Var("x0")->GetMutable<f::SelectedRows>();
-  auto x1 = scope.Var("x1")->GetMutable<f::SelectedRows>();
-  auto out = scope.Var("Out")->GetMutable<f::SelectedRows>();
-  auto actual = out->mutable_value();
-
-  std::unique_ptr<f::SelectedRows> expect{new f::SelectedRows()};
-  auto expect_value = expect->mutable_value();
-  expect_value->mutable_data<float>(f::make_ddim({5, 10}), place);
-
-  m::SelectedRowsAdd<p::CPUDeviceContext, float> add_functor;
-  add_functor(ctx, *x0, *x1, expect.get());
-
-  EXPECT_EQ(actual->numel(), expect_value->numel());
-  EXPECT_EQ(out->rows().size(), x0->rows().size() + x1->rows().size());
-
-  for (int64_t i = 0; i < expect_value->numel(); ++i) {
-    EXPECT_EQ(expect_value->mutable_data<float>(place)[i],
-              actual->mutable_data<float>(place)[i]);
-  }
-  listen_and_serv_op->Stop();
-  server_thread.join();
-  listen_and_serv_op.reset();
-  paddle::operators::ListenAndServOp::ResetPort();
-}
diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h
deleted file mode 100644
index c05a1ff1da8803c1ef3161d0e9d8604f9f1e5f3b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/send_recv_util.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-
-namespace paddle {
-namespace operators {
-
-inline bool NeedSend(const framework::Scope& scope,
-                     const std::string& varname) {
-  // dummy variable is only used in parallel executor to represent
-  // some dependency relationship, we don't need to send/recv it.
-  // TODO(paddle-dev): Why would parallel executor logic leaked into here?
-  if (varname.find(framework::ir::Node::kControlDepVarName) !=
-      std::string::npos)
-    return false;
-  auto* var = scope.FindVar(varname);
-  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
-                          varname);
-  if (var->IsType<framework::LoDTensor>()) {
-    return var->Get<framework::LoDTensor>().IsInitialized();
-  } else if (var->IsType<framework::SelectedRows>()) {
-    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
-  } else {
-    PADDLE_THROW(
-        "Variable type in send side should be in "
-        "[LodTensor, SelectedRows]");
-  }
-  return false;
-}
-
-inline std::vector<int64_t> ToAbsoluteSection(
-    const std::vector<int64_t>& height_sections) {
-  std::vector<int64_t> abs_sections;
-  abs_sections.resize(height_sections.size());
-  abs_sections[0] = 0;
-  for (size_t i = 1; i < height_sections.size(); ++i) {
-    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
-  }
-  return abs_sections;
-}
-
-inline size_t GetSectionIndex(int64_t id,
-                              const std::vector<int64_t>& abs_sections) {
-  for (size_t i = 1; i < abs_sections.size(); ++i) {
-    if (id < abs_sections[i]) {
-      return i - 1;
-    }
-  }
-  return abs_sections.size() - 1;
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.cc b/paddle/fluid/operators/distributed_ops/split_byref_op.cc
deleted file mode 100644
index 43980107c14176f1751a3db2858c80cb65c764de..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/split_byref_op.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/distributed_ops/split_byref_op.h"
-#include "paddle/fluid/operators/split_op.h"
-
-namespace paddle {
-namespace operators {
-using framework::Tensor;
-
-class SplitByrefOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SplitOp should not be null.");
-    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
-                      "Outputs(Out) of SplitOp should not be empty.");
-    auto in_dims = ctx->GetInputDim("X");
-    auto outs_names = ctx->Outputs("Out");
-    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
-    auto sections = ctx->Attrs().Get<std::vector<int>>("sections");
-    const size_t outs_number = outs_names.size();
-    std::vector<framework::DDim> outs_dims;
-    outs_dims.reserve(outs_number);
-
-    if (num > 0) {
-      int64_t in_axis_dim = 0;
-      if (ctx->IsRuntime()) {
-        in_axis_dim = in_dims[0];
-      }
-      PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
-                        "tensor split does not result"
-                        " in an equal division");
-      size_t out_axis_dim = in_axis_dim / num;
-      for (size_t i = 0; i < outs_number; ++i) {
-        auto dim = in_dims;
-        dim[0] = out_axis_dim;
-        outs_dims.push_back(dim);
-      }
-    } else if (sections.size() > 0) {
-      PADDLE_ENFORCE_EQ(sections.size(), outs_number,
-                        "tensor split sections size"
-                        "should be equal to output size.");
-      for (size_t i = 0; i < outs_number; ++i) {
-        auto dim = in_dims;
-        dim[0] = sections[i];
-        outs_dims.push_back(dim);
-      }
-    }
-    ctx->SetOutputsDim("Out", outs_dims);
-  }
-};
-
-class SplitByrefOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input tensor of the split operator.");
-    AddOutput("Out", "(Tensor) Output tensors of the split operator.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-SplitByref operator
-
-Split source tensor to sevaral tensors by axis 0. No copy in this operator
-is performed, output tensor shares the same blocks of memory.
-)DOC");
-    AddAttr<std::vector<int>>("sections",
-                              "(vector<int>) "
-                              "the length of each output along the "
-                              "specified axis.")
-        .SetDefault(std::vector<int>{});
-    AddAttr<int>("num",
-                 "(int, default 0)"
-                 "Number of sub-tensors. This must evenly divide "
-                 "Input.dims()[axis]")
-        .SetDefault(0);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-// NOTE: concat op default axis must be 0!
-USE_CPU_ONLY_OP(concat);
-
-REGISTER_OPERATOR(split_byref, ops::SplitByrefOp, ops::SplitByrefOpMaker,
-                  ops::SplitGradMaker);
-REGISTER_OP_CPU_KERNEL(
-    split_byref, ops::SplitByrefOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc b/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc
deleted file mode 100644
index 056659c3ea61f6233a6dda56ca1e272e72770d4a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/distributed_ops/split_byref_op.h"
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    split_byref,
-    ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.h b/paddle/fluid/operators/distributed_ops/split_byref_op.h
deleted file mode 100644
index fedd7218dd6cc9481e94a92a3820cafbe4157bd0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/split_byref_op.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class SplitByrefOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
-    auto place = ctx.GetPlace();
-
-    size_t row_offset = 0;
-    for (size_t i = 0; i < outs.size(); ++i) {
-      // NOTE: no need to call mutable_data here to allocate memory.
-      auto* out = outs[i];
-      VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0];
-      *out = in->Slice(row_offset, row_offset + out->dims()[0]);
-      row_offset += out->dims()[0];
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.cc b/paddle/fluid/operators/distributed_ops/split_ids_op.cc
deleted file mode 100644
index d46b57e7e15807756efd85fde765454260ea9d7b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/distributed_ops/split_ids_op.h"
-
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}")
-        .AsDuplicable();
-
-    AddOutput("Out", "(LoDTensors) The outputs of the input Ids.")
-        .AsDuplicable();
-
-    AddComment(R"DOC(
-Split a LoDTensor of Ids into multi LoDTensors, the number is pserver's number
-Example:
-  Input:
-    X = [[1,2,3,4,5,6],[2,3]]
-
-  Out(3 output):
-    if compress is True:
-        out0 = [3, 3, 6]
-        out1 = [1, 4]
-        out2 = [2, 2, 5]
-    else:
-        out0 = [3, 6]
-        out1 = [1, 4]
-        out2 = [2, 5]
-)DOC");
-  }
-};
-
-class SplitIdsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs("Ids"), "SplitIdsOp must has input Ids.");
-    PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
-
-    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    auto ids_dims = ctx->GetInputsDim("Ids");
-    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::GetDataTypeOfVar(ctx.MultiInputVar("Ids").front()),
-        ctx.GetPlace());
-  }
-};
-
-class SplitIdsOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
-    for (auto &out_var : ctx->Output("Out")) {
-      ctx->SetType(out_var, input_type);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
-                  ops::SplitIdsOpInferVarType);
-
-REGISTER_OP_CPU_KERNEL(
-    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
-    ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.h b/paddle/fluid/operators/distributed_ops/split_ids_op.h
deleted file mode 100644
index 6676ecd1c85d70cd5961af2fb1537e77b10e41bc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/split_ids_op.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iterator>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class SplitIdsOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto place = ctx.GetPlace();
-    if (!platform::is_cpu_place(place)) {
-      PADDLE_THROW("SplitIds do not support GPU kernel");
-    }
-
-    const auto ids_vars = ctx.MultiInputVar("Ids");
-
-    PADDLE_ENFORCE_GT(ids_vars.size(), 0, "The number of Ids should > 0");
-    auto *ids_var = ids_vars[0];
-
-    if (ids_var->IsType<framework::LoDTensor>()) {
-      int batch_size = 0;
-      const auto ids_tensors = ctx.MultiInput<framework::LoDTensor>("Ids");
-      for (size_t i = 0; i < ids_tensors.size(); ++i) {
-        batch_size += ids_tensors[i]->dims()[0];
-      }
-      VLOG(4) << "Get Total BatchSize is: " << batch_size;
-
-      std::vector<T> all_ids(batch_size);
-      int offset = 0;
-      for (size_t i = 0; i < ids_tensors.size(); ++i) {
-        const auto *ids = ids_tensors[i];
-        std::memcpy(all_ids.data() + offset, ids->data<T>(),
-                    ids->numel() * sizeof(T));
-        offset += ids->numel();
-      }
-
-      std::set<T> st(all_ids.begin(), all_ids.end());
-      all_ids.assign(st.begin(), st.end());
-
-      auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
-      const size_t shard_num = outs.size();
-      std::vector<std::vector<T>> out_ids;
-      out_ids.resize(outs.size());
-
-      // split id by their shard_num.
-      for (size_t i = 0; i < all_ids.size(); ++i) {
-        T id = all_ids[i];
-        size_t shard_id = static_cast<size_t>(id) % shard_num;
-        out_ids[shard_id].push_back(id);
-      }
-
-      // create tensor for each shard and send to parameter server
-      for (size_t i = 0; i < out_ids.size(); ++i) {
-        auto *shard_t = outs[i];
-        std::vector<T> ids = out_ids[i];
-        auto *shard_data = shard_t->mutable_data<T>(
-            framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
-        for (size_t i = 0; i < ids.size(); ++i) {
-          shard_data[i] = ids[i];
-        }
-      }
-    } else if (ids_var->IsType<framework::SelectedRows>()) {
-      const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
-      auto &ids_dims = ids_selected_rows->value().dims();
-      PADDLE_ENFORCE_EQ(ids_dims[0],
-                        static_cast<int64_t>(ids_selected_rows->rows().size()),
-                        "");
-      const T *ids_data = ids_selected_rows->value().data<T>();
-      const auto &ids_rows = ids_selected_rows->rows();
-      auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
-      const size_t shard_num = outs.size();
-      for (auto &out : outs) {
-        out->mutable_rows()->clear();
-      }
-      // get rows for outputs
-      std::unordered_map<int64_t, size_t> id_to_index;
-      for (size_t i = 0; i < ids_rows.size(); ++i) {
-        id_to_index[ids_rows[i]] = i;
-        size_t shard_id = static_cast<size_t>(ids_rows[i]) % shard_num;
-        outs[shard_id]->mutable_rows()->push_back(ids_rows[i]);
-      }
-
-      int64_t row_width = ids_dims[1];
-      for (auto &out : outs) {
-        out->set_height(ids_selected_rows->height());
-        framework::DDim ddim = framework::make_ddim(
-            {static_cast<int64_t>(out->rows().size()), row_width});
-        T *output = out->mutable_value()->mutable_data<T>(ddim, place);
-        for (int64_t i = 0; i < ddim[0]; ++i) {
-          memcpy(output + i * row_width,
-                 ids_data + id_to_index[out->rows()[i]] * row_width,
-                 row_width * sizeof(T));
-        }
-      }
-    } else {
-      PADDLE_THROW(
-          "% should be LoDTensor or SelectedRows, but the received type is %s",
-          ctx.Inputs("Ids")[0], framework::ToTypeName(ids_var->Type()));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc
deleted file mode 100644
index 1598e1d0a47efe317e1dcf7d8595fa5b18829553..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-#include <string>
-#include <thread>  // NOLINT
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#include "paddle/fluid/string/printf.h"
-
-#ifdef PADDLE_WITH_GRPC
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#endif
-
-USE_NO_KERNEL_OP(listen_and_serv);
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-namespace m = paddle::operators::math;
-namespace distributed = paddle::operators::distributed;
-namespace string = paddle::string;
-
-std::unique_ptr<distributed::RPCServer> g_rpc_service;
-std::unique_ptr<distributed::RequestHandler> g_req_handler;
-
-void StartServer() {
-  f::Scope scope;
-  p::CPUPlace place;
-  scope.Var(NCCL_ID_VARNAME);
-  p::DeviceContextPool& pool = p::DeviceContextPool::Instance();
-  auto& dev_ctx = *pool.Get(p::CPUPlace());
-
-  f::ProgramDesc empty_program;
-  f::Executor executor(dev_ctx.GetPlace());
-  g_req_handler->SetScope(&scope);
-  g_req_handler->SetDevCtx(&dev_ctx);
-  g_req_handler->SetProgram(&empty_program);
-  g_req_handler->SetExecutor(&executor);
-
-  g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get());
-  g_req_handler->SetRPCServer(g_rpc_service.get());
-
-  std::thread server_thread(
-      std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
-
-  g_rpc_service->SetCond(distributed::kRequestSend);
-  g_rpc_service->WaitBarrier(distributed::kRequestSend);
-
-  LOG(INFO) << "got nccl id and stop server...";
-  g_rpc_service->ShutDown();
-  server_thread.join();
-}
-
-TEST(SendNcclId, RPCServer) {
-  g_req_handler.reset(new distributed::RequestSendHandler(true));
-  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
-
-  std::thread server_thread(StartServer);
-  g_rpc_service->WaitServerReady();
-
-  f::Scope scope;
-  p::CPUPlace place;
-  p::DeviceContextPool& pool = p::DeviceContextPool::Instance();
-  auto& dev_ctx = *pool.Get(p::CPUPlace());
-
-  auto var = scope.Var(NCCL_ID_VARNAME);
-  auto id = var->GetMutable<ncclUniqueId>();
-  p::dynload::ncclGetUniqueId(id);
-
-  int port = g_rpc_service->GetSelectedPort();
-
-  std::string ep = string::Sprintf("127.0.0.1:%d", port);
-
-  distributed::RPCClient* client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
-
-  LOG(INFO) << "connect to server" << ep;
-  client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
-  client->Wait();
-  client->AsyncSendBatchBarrier(ep);
-  client->Wait();
-
-  server_thread.join();
-  g_rpc_service.reset(nullptr);
-  g_req_handler.reset(nullptr);
-}
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
deleted file mode 100644
index 273015f9763c2c7375aa0609436a2e8ab190b696..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dropout_op.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/dropout_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class DropoutOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", x_dims);
-    if (ctx->Attrs().Get<bool>("is_test") == false) {
-      ctx->SetOutputDim("Mask", x_dims);
-    }
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of dropout op.");
-    AddOutput("Out", "The output of dropout op.");
-    AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
-
-    AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
-        .SetDefault(.5f)
-        .AddCustomChecker([](const float& drop_p) {
-          PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
-                         "'dropout_prob' must be between 0.0 and 1.0.");
-        });
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    AddAttr<bool>("fix_seed",
-                  "A flag indicating whether to use a fixed seed to generate "
-                  "random mask. NOTE: DO NOT set this flag to true in "
-                  "training. Setting this flag to true is only useful in "
-                  "unittest or for debug that always the same output units "
-                  "will be dropped.")
-        .SetDefault(false);
-    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
-    AddAttr<std::string>(
-        "dropout_implementation",
-        "[\"downgrade_in_infer\"|\"upscale_in_train\"]"
-        "There are two kinds of ways to implement dropout"
-        "(the mask below is a tensor have the same shape with input"
-        "the value of mask is 0 or 1, the ratio of 0 is dropout_prob)"
-        "1. downgrade_in_infer(default), downgrade the outcome at inference "
-        "time"
-        "   train: out = input * mask"
-        "   inference: out = input * (1.0 - dropout_prob)"
-        "2. upscale_in_train, upscale the outcome at training time, do nothing "
-        "in inference"
-        "   train: out = input * mask / ( 1.0 - dropout_prob )"
-        "   inference: out = input"
-        "   dropout op can be removed from the program. the program will be "
-        "efficient")
-        .SetDefault("downgrade_in_infer")
-        .AddCustomChecker([](const std::string& type) {
-          PADDLE_ENFORCE(
-              type == "downgrade_in_infer" || type == "upscale_in_train",
-              "dropout_implementation can only be downgrade_in_infer or "
-              "upscale_in_train");
-        });
-
-    AddComment(R"DOC(
-Dropout Operator.
-
-Dropout refers to randomly dropping out units in a nerual network. It is a
-regularization technique for reducing overfitting by preventing neuron
-co-adaption during training. The dropout operator randomly set (according to
-the given dropout probability) the outputs of some units to zero, while others
-are set equal to their corresponding inputs.
-
-)DOC");
-  }
-};
-
-class DropoutOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
-                      "GradOp is only callable when is_test is false");
-
-    PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) must not be null.");
-
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
-    ctx->ShareLoD(framework::GradVarName("Out"),
-                  /*->*/ framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-
-class DropoutGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("dropout_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("Mask", Output("Mask"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
-                  ops::DropoutGradOpDescMaker);
-REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
deleted file mode 100644
index 3e0cb76d0435c0b2bc22ec68eb08581183da0672..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dropout_op.cu
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <cuda.h>
-#include <curand_kernel.h>
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/random.h>
-#include <thrust/transform.h>
-#include <string>
-#include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/platform/dynload/curand.h"
-#include "paddle/fluid/platform/float16.h"
-namespace paddle {
-namespace operators {
-
-template <typename T, typename MaskType>
-__global__ void RandomGenerator(const size_t n, const int seed,
-                                const float dropout_prob, const T* src,
-                                MaskType* mask_data, T* dst,
-                                bool is_upscale_in_train) {
-  curandStatePhilox4_32_10_t state;
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = 0;
-
-  MaskType mask;
-  T dest;
-  for (; idx < n; idx += blockDim.x * gridDim.x) {
-    T s = src[idx];
-    if (step_size == 0) {
-      curand_init(seed, idx, idx, &state);
-      step_size = blockDim.x * gridDim.x;
-    } else {
-      curand_init(seed, idx, step_size, &state);
-    }
-    if (curand_uniform(&state) < dropout_prob) {
-      mask = 0;
-      dest = 0;
-    } else {
-      mask = 1;
-      if (is_upscale_in_train) {
-        dest = s / static_cast<T>(1.0f - dropout_prob);
-      } else {
-        dest = s;
-      }
-    }
-    mask_data[idx] = mask;
-    dst[idx] = dest;
-  }
-}
-
-// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
-// Use std::random and thrust::random(thrust is a std library in CUDA) to
-// implement uniform random.
-template <typename Place, typename T>
-class GPUDropoutKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Output<Tensor>("Out");
-    y->mutable_data<T>(context.GetPlace());
-    float dropout_prob = context.Attr<float>("dropout_prob");
-
-    auto& dropout_implementation =
-        context.Attr<std::string>("dropout_implementation");
-    bool upscale_in_train = (dropout_implementation == "upscale_in_train");
-
-    auto& place = *context.template device_context<Place>().eigen_device();
-    if (!context.Attr<bool>("is_test")) {
-      int64_t x_numel = x->numel();
-      auto stream = context.cuda_device_context().stream();
-
-      auto* mask = context.Output<Tensor>("Mask");
-      auto* mask_data = mask->mutable_data<uint8_t>(context.GetPlace());
-      size_t size = framework::product(mask->dims());
-      auto* x_data = x->data<T>();
-      auto* y_data = y->mutable_data<T>(context.GetPlace());
-      if (dropout_prob == 1.0f) {
-        PADDLE_ENFORCE_CUDA_SUCCESS(
-            cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
-        PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(
-            mask_data, 0, x_numel * sizeof(*mask_data), stream));
-        return;
-      }
-
-      std::random_device rnd;
-      int seed =
-          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
-
-      int threads = 512;
-      int grid = (x_numel + threads - 1) / threads;
-      RandomGenerator<T, uint8_t><<<grid, threads, 0, stream>>>(
-          size, seed, dropout_prob, x_data, mask_data, y_data,
-          upscale_in_train);
-    } else {
-      auto X = EigenMatrix<T>::Reshape(*x, 1);
-      auto Y = EigenMatrix<T>::Reshape(*y, 1);
-      if (upscale_in_train) {
-        Y.device(place) = X;
-      } else {
-        Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
-    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    dropout_grad, ops::DropoutGradKernel<plat::CUDADeviceContext, float>,
-    ops::DropoutGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::DropoutGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
deleted file mode 100644
index 20742f9a453c5ad3c3702fd939e28312263323f5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dropout_op.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <cstring>
-#include <random>
-#include <string>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class CPUDropoutKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Output<Tensor>("Out");
-    const auto* x_data = x->data<T>();
-    auto* y_data = y->mutable_data<T>(context.GetPlace());
-    float dropout_prob = context.Attr<float>("dropout_prob");
-
-    auto& dropout_implementation =
-        context.Attr<std::string>("dropout_implementation");
-    bool upscale_in_train = (dropout_implementation == "upscale_in_train");
-    if (!context.Attr<bool>("is_test")) {
-      auto* mask = context.Output<Tensor>("Mask");
-      auto* mask_data = mask->mutable_data<uint8_t>(context.GetPlace());
-      size_t size = framework::product(mask->dims());
-
-      // Special case when dropout_prob is 1.0
-      if (dropout_prob == 1.0f) {
-        std::memset(y_data, 0, size * sizeof(*y_data));        // NOLINT
-        std::memset(mask_data, 0, size * sizeof(*mask_data));  // NOLINT
-        return;
-      }
-
-      // NOTE: fixed seed should only be used in unittest or for debug.
-      // Guarantee to use random seed in training.
-      std::random_device rnd;
-      std::minstd_rand engine;
-      int seed =
-          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
-      engine.seed(seed);
-
-      std::uniform_real_distribution<float> dist(0, 1);
-
-      for (size_t i = 0; i < size; ++i) {
-        if (dist(engine) < dropout_prob) {
-          mask_data[i] = 0;
-          y_data[i] = 0;
-        } else {
-          mask_data[i] = 1;
-          if (upscale_in_train) {
-            y_data[i] = x_data[i] / static_cast<T>(1.0f - dropout_prob);
-          } else {
-            y_data[i] = x_data[i];
-          }
-        }
-      }
-    } else {
-      if (upscale_in_train) {
-        const auto* X_data = x->data<T>();
-        auto* Y_data = y->mutable_data<T>(context.GetPlace());
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-        for (int i = 0; i < x->numel(); i++) {
-          Y_data[i] = X_data[i];
-        }
-      } else {
-        auto X = EigenMatrix<T>::Reshape(*x, 1);
-        auto Y = EigenMatrix<T>::Reshape(*y, 1);
-        auto& place =
-            *context.template device_context<DeviceContext>().eigen_device();
-        Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DropoutGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(!context.Attr<bool>("is_test"),
-                   "GradOp is only callable when is_test is false");
-
-    auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* mask = context.Input<Tensor>("Mask");
-    grad_x->mutable_data<T>(context.GetPlace());
-
-    auto M = EigenMatrix<uint8_t>::Reshape(*mask, 1);
-    auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
-    auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
-
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    auto& dropout_implementation =
-        context.Attr<std::string>("dropout_implementation");
-    if (dropout_implementation == "upscale_in_train") {
-      float dropout_prob = context.Attr<float>("dropout_prob");
-      if (dropout_prob == 1.0f) {
-        dX.device(place) = static_cast<T>(0) * dY;
-      } else {
-        dX.device(place) =
-            dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
-      }
-    } else {
-      dX.device(place) = dY * M.cast<T>();
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc
deleted file mode 100644
index 3e401d1c4f9f4fa89cbbe04df1ca69d05132eb51..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dropout_op_test.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-namespace m = paddle::operators::math;
-
-USE_OP(dropout);
-
-void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto var = scope->Var("X");
-  auto tensor = var->GetMutable<f::LoDTensor>();
-  tensor->Resize({10, 10});
-
-  std::vector<float> init;
-  for (int64_t i = 0; i < 10 * 10; ++i) {
-    init.push_back(1.0);
-  }
-
-  TensorFromVector(init, ctx, tensor);
-
-  auto place = ctx.GetPlace();
-  auto out_var = scope->Var("Out");
-  auto out_tensor = out_var->GetMutable<f::LoDTensor>();
-  out_tensor->Resize({10, 10});
-  out_tensor->mutable_data<float>(place);  // allocate
-
-  auto mask_var = scope->Var("Mask");
-  auto mask_tensor = mask_var->GetMutable<f::LoDTensor>();
-  mask_tensor->Resize({10, 10});
-  mask_tensor->mutable_data<float>(place);  // allocate
-
-  // run
-  f::AttributeMap attrs;
-  float dropout_prob = 0.5;
-  attrs.insert({"fix_seed", 1});
-  attrs.insert({"seed", 3});
-  attrs.insert({"dropout_prob", dropout_prob});
-  auto dropout_op = f::OpRegistry::CreateOp(
-      "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs);
-
-  dropout_op->Run(*scope, place);
-
-  std::vector<float> out_vec;
-  TensorToVector(*out_tensor, ctx, &out_vec);
-
-  std::vector<float> std_out = {
-      0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
-      1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
-      1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
-      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
-      1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1};
-
-  EXPECT_EQ(out_vec.size(), std_out.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], std_out[i]);
-  }
-}
-
-// TODO(wyi): Due to
-// https://github.com/PaddlePaddle/Paddle/issues/9507, I temporarily
-// disable this test to remove the prevention of the merge of
-// unrelated PRs.
-/*
-TEST(Dropout, CPUDense) {
-  f::Scope scope;
-  p::CPUPlace place;
-  p::CPUDeviceContext ctx(place);
-  Compare(scope, ctx);
-}
-
-TEST(Dropout, GPUDense) {
-  f::Scope scope;
-  p::CUDAPlace place;
-  p::CUDADeviceContext ctx(place);
-  Compare(scope, ctx);
-}
-*/
diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc
deleted file mode 100644
index a854d470dddab074813d99f8c64d2e68ec291892..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/edit_distance_op.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/edit_distance_op.h"
-
-namespace paddle {
-namespace operators {
-
-class EditDistanceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Hyps"), "Input(Hyps) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Refs"), "Input(Refs) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("SequenceNum"),
-                   "Output(SequenceNum) shouldn't be null.");
-    auto hyp_dims = ctx->GetInputDim("Hyps");
-    auto ref_dims = ctx->GetInputDim("Refs");
-
-    if (ctx->HasInput("HypsLength") && ctx->HasInput("RefsLength")) {
-      auto hyp_length_dims = ctx->GetInputDim("HypsLength");
-      auto ref_length_dims = ctx->GetInputDim("RefsLength");
-
-      PADDLE_ENFORCE(hyp_dims.size() == 2 && ref_dims.size() == 2 &&
-                         hyp_dims[0] == ref_dims[0],
-                     "Input(Hyps) and Input(Refs) must be 2-D Tensors with "
-                     "identical first dimension");
-      PADDLE_ENFORCE(hyp_length_dims[0] == ref_length_dims[0] &&
-                         hyp_length_dims[0] == hyp_dims[0],
-                     "Input(HypsLength), Input(RefsLength) and Input(Hyps) "
-                     "should have identical first dimension");
-    } else {
-      PADDLE_ENFORCE(
-          hyp_dims.size() == 2 && hyp_dims[1] == 1,
-          "Input(Hyps) must be a 2-D LoDTensor with the 2nd dimension "
-          "equal to 1.");
-      PADDLE_ENFORCE(
-          ref_dims.size() == 2 && ref_dims[1] == 1,
-          "Input(Refs) must be a 2-D LoDTensor with the 2nd dimension "
-          "equal to 1.");
-    }
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("Refs"));
-    ctx->SetOutputDim("SequenceNum", {1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(framework::proto::VarType::FP32,
-                                   ctx.device_context());
-  }
-};
-
-class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Hyps",
-             "2-D Tensor<int64_t>, or 2-D LoDTensor<int64_t> with last "
-             "dimension being 1. "
-             "The indices for hypothesis strings.");
-    AddInput("Refs",
-             "2-D Tensor<int64_t>, or 2-D LoDTensor<int64_t> with last "
-             "dimension being 1. "
-             "The indices for reference strings.");
-    AddInput("HypsLength",
-             "1-D Tensor<int64_t>. "
-             "Sequence length for hyps when hyps is a tensor")
-        .AsDispensable();
-    AddInput("RefsLength",
-             "1-D Tensor<int64_t>. "
-             "Sequence length for refs when refs is a tensor")
-        .AsDispensable();
-    AddOutput("SequenceNum", "The sequence count of current batch");
-    AddAttr<bool>("normalized",
-                  "(bool, default false) Indicated whether to normalize "
-                  "the edit distance by the length of reference string.")
-        .SetDefault(false);
-    AddOutput("Out",
-              "(2-D Tensor with shape [`batch_size` x 1]) "
-              "The output edit distances of EditDistance operator.");
-    AddComment(R"DOC(
-
-EditDistance operator computes the edit distances between a batch of hypothesis
-strings and their references.
-
-Edit distance, also called Levenshtein distance, measures how dissimilar two strings
-are by counting the minimum number of operations to transform one string into anthor.
-Here the operations include insertion, deletion, and substitution. For example,
-given hypothesis string A = "kitten" and reference B = "sitting", the edit distance
-is 3 for A will be transformed into B at least after two substitutions and one
-insertion:
-
-   "kitten" -> "sitten" -> "sittin" -> "sitting"
-
-Input(Hyps) is a 2-D Tensor or a 2-D LoDTensor consisting of all the hypothesis strings.
-And the `batch_size` reference strings are arranged in order in the same way in the
-Input(Refs).
-
-Output(Out) contains the `batch_size` results and each stands for the edit distance
-for a pair of strings respectively. If Attr(normalized) is true, the edit distance
-will be divided by the length of reference string.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(edit_distance, ops::EditDistanceOp, ops::EditDistanceOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    edit_distance, ops::EditDistanceKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu
deleted file mode 100644
index c7217b9f750b5a83f95b8df161de23a89241925d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/edit_distance_op.cu
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/edit_distance_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename T>
-__global__ void FillFirstRow(T* dist, const int N) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  if (idx < N + 1) {
-    dist[idx] = idx;
-  }
-}
-
-template <typename T>
-__global__ void FillFirstColumn(T* dist, const int M, const int N) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  if (idx < M + 1) {
-    dist[idx * (N + 1)] = idx;
-  }
-}
-
-template <typename T>
-__global__ void Levenshtein(T* dist, const int64_t* x1, const int64_t* x2,
-                            const int M, const int N, const int start) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int offset = N;
-  int index = start + idx * offset;
-  int row = index / (N + 1);
-  int col = index % (N + 1);
-  if (row > 0 && col > 0 && row < M + 1 && col < N + 1) {
-    int cost = x1[row - 1] == x2[col - 1] ? 0 : 1;
-    int dels = dist[(row - 1) * (N + 1) + col] + 1;
-    int ins = dist[row * (N + 1) + col - 1] + 1;
-    int subs = dist[(row - 1) * (N + 1) + (col - 1)] + cost;
-    dist[index] = min(dels, min(ins, subs));
-  }
-}
-
-template <typename T>
-__global__ void SetOutput(T* out, const T* dist, const int M, const int N,
-                          bool normalized) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  if (idx == 0) {
-    out[0] = normalized ? dist[M * (N + 1) + N] / N : dist[M * (N + 1) + N];
-  }
-}
-
-template <typename Place, typename T>
-class EditDistanceGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out_t = ctx.Output<framework::Tensor>("Out");
-
-    auto* x1_t = ctx.Input<framework::LoDTensor>("Hyps");
-    auto* x2_t = ctx.Input<framework::LoDTensor>("Refs");
-    auto* sequence_num = ctx.Output<framework::Tensor>("SequenceNum");
-    sequence_num->mutable_data<int64_t>(ctx.GetPlace());
-    auto batch_size = x1_t->dims()[0];
-
-    auto normalized = ctx.Attr<bool>("normalized");
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      ctx.device_context())
-                      .stream();
-
-    framework::Vector<size_t> hyp_lod(batch_size + 1);
-    framework::Vector<size_t> ref_lod(batch_size + 1);
-
-    bool use_length = ctx.HasInput("HypsLength");
-
-    if (use_length) {
-      // build lod when using padding
-      auto* hyp_length = ctx.Input<framework::Tensor>("HypsLength");
-      auto* ref_length = ctx.Input<framework::Tensor>("RefsLength");
-
-      framework::Tensor hyp_length_cpu;
-      framework::Tensor ref_length_cpu;
-      framework::TensorCopy(*hyp_length, platform::CPUPlace(), &hyp_length_cpu);
-      framework::TensorCopy(*ref_length, platform::CPUPlace(), &ref_length_cpu);
-
-      for (auto i = 0; i < batch_size; i++) {
-        hyp_lod[i + 1] = hyp_lod[i] + hyp_length_cpu.data<int64_t>()[i];
-        ref_lod[i + 1] = ref_lod[i] + ref_length_cpu.data<int64_t>()[i];
-      }
-
-    } else {
-      hyp_lod = x1_t->lod()[0];
-      ref_lod = x2_t->lod()[0];
-    }
-
-    if (normalized) {
-      for (size_t i = 1; i < ref_lod.size(); ++i) {
-        PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1],
-                       "Reference string %d is empty.", i);
-      }
-    }
-
-    const size_t num_strs = hyp_lod.size() - 1;
-    math::SetConstant<platform::CUDADeviceContext, int64_t> set_constant;
-    set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
-                 sequence_num, static_cast<int64_t>(num_strs));
-
-    out_t->Resize({static_cast<int64_t>(num_strs), 1});
-    out_t->mutable_data<T>(ctx.GetPlace());
-    auto out = out_t->data<T>();
-
-    T distance = 0.0;
-    for (size_t num = 0; num < num_strs; num++) {
-      auto m = static_cast<int64_t>(hyp_lod[num + 1] - hyp_lod[num]);
-      auto n = static_cast<int64_t>(ref_lod[num + 1] - ref_lod[num]);
-      if (m == 0 || n == 0) {
-        distance = std::max(m, n);
-        if (normalized) {
-          distance = distance / n;
-        }
-        memory::Copy(boost::get<Place>(ctx.GetPlace()), out + num,
-                     platform::CPUPlace(), &distance, sizeof(T), stream);
-      } else {
-        framework::Tensor dist_t;
-        dist_t.Resize({m + 1, n + 1});
-        dist_t.mutable_data<T>(ctx.GetPlace());
-        auto dist = dist_t.data<T>();
-        auto hyp_offset = use_length ? num * x1_t->dims()[1] : hyp_lod[num];
-        auto ref_offset = use_length ? num * x2_t->dims()[1] : ref_lod[num];
-        auto x1 = x1_t->data<int64_t>() + hyp_offset;
-        auto x2 = x2_t->data<int64_t>() + ref_offset;
-
-        FillFirstColumn<T><<<1 + m / PADDLE_CUDA_NUM_THREADS,
-                             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n);
-
-        FillFirstRow<T><<<1 + n / PADDLE_CUDA_NUM_THREADS,
-                          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, n);
-
-        // Compute the elements of distance matrix in the anti-diagonal diretion
-        for (int64_t slice = 2; slice < m + n + 1; ++slice) {
-          int z_m = slice < m + 1 ? 0 : slice - m;
-          int z_n = slice < n + 1 ? 0 : slice - n;
-          int size = slice - (z_m + z_n) + 1;  // number of elments in the same
-                                               // anti-diagonal line to update
-          // the start index at which computes from
-          int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1;
-          Levenshtein<T><<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS,
-                           PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, x1, x2,
-                                                                 m, n, start);
-        }
-        SetOutput<T><<<1, 1, 0, stream>>>(out + num, dist, m, n, normalized);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    edit_distance,
-    ops::EditDistanceGPUKernel<paddle::platform::CUDAPlace, float>);
diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h
deleted file mode 100644
index 3e1aec7ceeec781dbf00ac5a24a8a4e95c999850..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/edit_distance_op.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace operators {
-
-template <typename Place, typename T>
-class EditDistanceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out_t = ctx.Output<framework::Tensor>("Out");
-
-    auto* x1_t = ctx.Input<framework::LoDTensor>("Hyps");
-    auto* x2_t = ctx.Input<framework::LoDTensor>("Refs");
-    auto* sequence_num = ctx.Output<framework::Tensor>("SequenceNum");
-    int64_t* seq_num_data = sequence_num->mutable_data<int64_t>(ctx.GetPlace());
-    auto batch_size = x1_t->dims()[0];
-
-    auto normalized = ctx.Attr<bool>("normalized");
-
-    framework::Vector<size_t> hyp_lod(batch_size + 1);
-    framework::Vector<size_t> ref_lod(batch_size + 1);
-
-    bool use_length = ctx.HasInput("HypsLength");
-
-    if (use_length) {
-      // build lod when using padding
-      auto hyp_length_ptr =
-          ctx.Input<framework::Tensor>("HypsLength")->data<int64_t>();
-      auto ref_length_ptr =
-          ctx.Input<framework::Tensor>("RefsLength")->data<int64_t>();
-
-      for (auto i = 0; i < batch_size; i++) {
-        hyp_lod[i + 1] = hyp_lod[i] + hyp_length_ptr[i];
-        ref_lod[i + 1] = ref_lod[i] + ref_length_ptr[i];
-      }
-
-    } else {
-      hyp_lod = x1_t->lod()[0];
-      ref_lod = x2_t->lod()[0];
-    }
-
-    if (normalized) {
-      for (size_t i = 1; i < ref_lod.size(); ++i) {
-        PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1],
-                       "Reference string %d is empty.", i);
-      }
-    }
-    auto num_strs = hyp_lod.size() - 1;
-    *seq_num_data = static_cast<int64_t>(num_strs);
-
-    out_t->Resize({static_cast<int64_t>(num_strs), 1});
-    out_t->mutable_data<float>(ctx.GetPlace());
-    auto out = out_t->data<T>();
-
-    T distance = 0.0;
-    for (size_t num = 0; num < num_strs; ++num) {
-      auto m = static_cast<int64_t>(hyp_lod[num + 1] - hyp_lod[num]);
-      auto n = static_cast<int64_t>(ref_lod[num + 1] - ref_lod[num]);
-
-      if (m == 0) {
-        distance = n;
-      } else if (n == 0) {
-        distance = m;
-      } else {
-        framework::Tensor dist_t;
-        dist_t.Resize({m + 1, n + 1});
-        dist_t.mutable_data<T>(ctx.GetPlace());
-        auto dist = dist_t.data<T>();
-        auto hyp_offset = use_length ? num * x1_t->dims()[1] : hyp_lod[num];
-        auto ref_offset = use_length ? num * x2_t->dims()[1] : ref_lod[num];
-        auto x1 = x1_t->data<int64_t>() + hyp_offset;
-        auto x2 = x2_t->data<int64_t>() + ref_offset;
-        for (int64_t i = 0; i < m + 1; ++i) {
-          dist[i * (n + 1)] = i;
-        }
-        for (int64_t j = 0; j < n + 1; ++j) {
-          dist[j] = j;
-        }
-        for (int64_t i = 1; i < m + 1; ++i) {
-          for (int64_t j = 1; j < n + 1; ++j) {
-            int cost = x1[i - 1] == x2[j - 1] ? 0 : 1;
-            int dels = dist[(i - 1) * (n + 1) + j] + 1;
-            int ins = dist[i * (n + 1) + (j - 1)] + 1;
-            int subs = dist[(i - 1) * (n + 1) + (j - 1)] + cost;
-            dist[i * (n + 1) + j] = std::min(dels, std::min(ins, subs));
-          }
-        }
-        distance = dist[m * (n + 1) + n];
-      }
-
-      if (normalized) {
-        PADDLE_ENFORCE(n > 0,
-                       "The reference string (#%d) cannot be empty "
-                       "when Attr(normalized) is enabled.",
-                       n);
-        distance = distance / n;
-      }
-      out[num] = distance;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/CMakeLists.txt b/paddle/fluid/operators/elementwise/CMakeLists.txt
deleted file mode 100644
index 94886066ca59aad291c711a0fbb027854e908bf5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-include(operators)
-register_operators()
-
-cc_test(test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc DEPS op_registry elementwise_add_op scope device_context enforce executor)
-cc_test(test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc DEPS op_registry elementwise_div_op scope device_context enforce executor)
-cc_test(test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor)
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
deleted file mode 100644
index fd93aa441eda78613422fee4809d7b0d4467fc95..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ElementwiseAddDoubleGradDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("elementwise_add_grad_grad");
-    op->SetInput("Y", Input("Y"));
-    op->SetInput("DOut", Input(framework::GradVarName("Out")));
-    op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-    op->SetInput("DDY", OutputGrad(framework::GradVarName("Y")));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
-REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_add, "Add",
-                                           "Out = X + Y");
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpExplicitGrad,
-                  ops::ElementwiseGradOpInplace,
-                  ops::ElementwiseGradNoBufVarsInference,
-                  ops::ElementwiseAddDoubleGradDescMaker);
-REGISTER_OPERATOR(elementwise_add_grad_grad,
-                  ops::ElementwiseOpDoubleGradWithoutDXDY,
-                  ops::ElementwiseDoubleGradOpInplace,
-                  ops::ElementwiseDoubleGradNoBufVarsInference);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add_grad_grad,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
deleted file mode 100644
index 15b4bff0b783f8d9a942b49b67c8a13c8e9dbf3f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_grad_grad,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
-                                        plat::float16>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
deleted file mode 100644
index 7f8b0ffe92fd40d7944f05282c4edc8271547e00..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct AddFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
-};
-
-template <typename DeviceContext, typename T>
-void default_elementwise_add(const framework::ExecutionContext &ctx,
-                             const framework::Tensor *x,
-                             const framework::Tensor *y, framework::Tensor *z) {
-  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        AddFunctor<T>(), z);
-}
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_same_dims(const framework::ExecutionContext &ctx,
-                          const framework::Tensor *x,
-                          const framework::Tensor *y, framework::Tensor *z) {
-  auto blas = math::GetBlas<DeviceContext, T>(ctx);
-  blas.VADD(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
-}
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    !std::is_floating_point<T>::value ||
-    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_same_dims(const framework::ExecutionContext &ctx,
-                          const framework::Tensor *x,
-                          const framework::Tensor *y, framework::Tensor *z) {
-  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
-  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
-
-  auto &place = *ctx.template device_context<DeviceContext>().eigen_device();
-  eigen_z.device(place) = eigen_x + eigen_y;
-}
-
-template <typename DeviceContext, typename T>
-class ElementwiseAddKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<framework::LoDTensor>("X");
-    auto *y = ctx.Input<framework::LoDTensor>("Y");
-    auto *z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-
-    auto dims_equal = x->dims() == y->dims();
-    if (dims_equal) {
-      elementwise_add_same_dims<DeviceContext, T>(ctx, x, y, z);
-    } else {
-      default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
-    }
-  }
-};
-
-template <typename T>
-struct IdentityGrad {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
-};
-
-template <typename DeviceContext, typename T>
-void default_elementwise_add_grad(const framework::ExecutionContext &ctx,
-                                  const framework::Tensor *x,
-                                  const framework::Tensor *y,
-                                  const framework::Tensor *out,
-                                  const framework::Tensor *dout,
-                                  framework::Tensor *dx,
-                                  framework::Tensor *dy) {
-  int axis = ctx.Attr<int>("axis");
-
-  ElemwiseExplicitGradCompute<DeviceContext, T, IdentityGrad<T>,
-                              IdentityGrad<T>>(ctx, *x, *y, *out, *dout, axis,
-                                               dx, dy, IdentityGrad<T>(),
-                                               IdentityGrad<T>());
-}
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_grad(const framework::ExecutionContext &ctx,
-                     const framework::Tensor *x, const framework::Tensor *y,
-                     const framework::Tensor *out,
-                     const framework::Tensor *dout, framework::Tensor *dx,
-                     framework::Tensor *dy) {
-  auto blas = math::GetBlas<DeviceContext, T>(ctx);
-
-  if (dx) {
-    blas.VCOPY(dout->numel(), dout->data<T>(),
-               dx->mutable_data<T>(ctx.GetPlace()));
-  }
-
-  if (dy) {
-    blas.VCOPY(dout->numel(), dout->data<T>(),
-               dy->mutable_data<T>(ctx.GetPlace()));
-  }
-}
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    !std::is_floating_point<T>::value ||
-    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_grad(const framework::ExecutionContext &ctx,
-                     const framework::Tensor *x, const framework::Tensor *y,
-                     const framework::Tensor *out,
-                     const framework::Tensor *dout, framework::Tensor *dx,
-                     framework::Tensor *dy) {
-  default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
-}
-
-template <typename DeviceContext, typename T>
-class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-
-    using Tensor = framework::Tensor;
-
-    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    // skip out, x, y
-    auto *out = dout;
-    auto *x = dout, *y = dout;
-
-    if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr &&
-        dy != nullptr && (dx->dims() == dy->dims())) {
-      elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
-    } else {
-      default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
-                                                     dy);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseAddDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto *y = ctx.Input<Tensor>("Y");
-    auto *dout = ctx.Input<Tensor>("DOut");
-    auto *ddx = ctx.Input<Tensor>("DDX");
-    auto *ddy = ctx.Input<Tensor>("DDY");
-
-    auto *ddout = ctx.Output<Tensor>("DDOut");
-
-    // ddOut = ddx + ddy
-    if (ddout) {
-      Tensor ddx_safe, ddy_safe;
-      GetDoubleGradSafeTensor<DeviceContext, T>(ctx, dout, ddx, &ddx_safe);
-      GetDoubleGradSafeTensor<DeviceContext, T>(ctx, y, ddy, &ddy_safe);
-
-      ddout->mutable_data<T>(ctx.GetPlace());
-      default_elementwise_add<DeviceContext, T>(ctx, &ddx_safe, &ddy_safe,
-                                                ddout);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc
deleted file mode 100644
index f025a8452059f9b6f97b9f73ad667d12ccf37a7e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ElementwiseDivOpMaker : public ElementwiseOpMaker {
- protected:
-  std::string GetName() const override { return "Div"; }
-  std::string GetEquation() const override { return "Out = X / Y"; }
-};
-
-class ElementwiseDivGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("elementwise_div_grad");
-    op->SetInput("Y", Input("Y"));
-    op->SetInput("Out", Output("Out"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class ElementwiseDivDoubleGradDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("elementwise_div_grad_grad");
-    op->SetInput("Y", Input("Y"));
-    op->SetInput("Out", Input("Out"));
-    op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-    op->SetInput("DDY", OutputGrad(framework::GradVarName("Y")));
-    op->SetInput("DX", Output(framework::GradVarName("X")));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    op->SetOutput("DOut", InputGrad("Out"));
-    op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
-
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp,
-                  ops::ElementwiseDivOpMaker, ops::ElementwiseOpInferVarType,
-                  ops::ElementwiseDivGradOpDescMaker);
-
-REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad,
-                  ops::ElementwiseDivDoubleGradDescMaker);
-REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad,
-                  ops::ElementwiseDivDoubleGradOpInplace);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_div,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_div_grad,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_div_grad_grad,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
deleted file mode 100644
index 4cd17b94e5dcd10e390a769f4cf77b3b772a7a86..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_div,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::float16>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_div_grad,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::float16>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_div_grad_grad,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        float>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        paddle::platform::float16>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        double>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        int>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
deleted file mode 100644
index a1c5684ea800a93ed7a56fa5d99b947691cd4488..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct DivFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a / b; }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseDivKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          DivFunctor<T>(), z);
-  }
-};
-
-template <typename T>
-struct DivGradDX {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
-};
-
-template <typename T>
-struct DivGradDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return -dout * out / y;
-  }
-};
-
-template <typename T>
-struct DivDoubleDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return y * out * dout - x * dout;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-
-    auto* x = dout;  // Fake x, not used
-
-    ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
-  }
-};
-
-class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  using Tensor = framework::Tensor;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput("DOut")) {
-      ctx->ShareDim("DX", "DOut");
-      ctx->ShareLoD("DX", "DOut");
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->ShareDim("Y", y_grad_name);
-      ctx->ShareLoD("Y", y_grad_name);
-    }
-    if (ctx->HasOutput("DDOut")) {
-      ctx->ShareDim("DX", "DDOut");
-      ctx->ShareLoD("DX", "DDOut");
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input_data_type = ctx.Input<Tensor>("DDX")->type();
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (platform::CanMKLDNNBeUsed(ctx)) {
-      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                     framework::DataLayout::kMKLDNN,
-                                     framework::LibraryType::kMKLDNN);
-    }
-#endif
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseDivDoubleGradKernel : public framework::OpKernel<T> {
-  using Tensor = framework::Tensor;
-
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* Y = ctx.Input<Tensor>("Y");
-    auto* Out = ctx.Input<Tensor>("Out");
-    auto* ddX = ctx.Input<Tensor>("DDX");
-    auto* ddY = ctx.Input<Tensor>("DDY");
-    auto* dX = ctx.Input<Tensor>("DX");
-
-    auto* dY = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    auto* dOut = ctx.Output<Tensor>("DOut");
-    auto* ddOut = ctx.Output<Tensor>("DDOut");
-
-    int axis = ctx.Attr<int>("axis");
-
-    if (dY) dY->mutable_data<T>(Y->dims(), ctx.GetPlace());
-    if (dOut) dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-
-    // ddX_safe == null ? 0 : ddX
-    // ddY_safe == null ? 0 : ddY
-    Tensor ddX_safe, ddY_safe;
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, Out, ddX, &ddX_safe);
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, Y, ddY, &ddY_safe);
-
-    // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
-    // dY = Out * dX * ddY / Y - dX * ddX / Y
-    // dOut = - dX * ddY
-    // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can
-    // inplace ddx
-    Tensor tmp;
-    if (dOut) {
-      tmp = *dOut;
-    } else {
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      tmp = ctx.AllocateTmpTensor<T, DeviceContext>(Out->dims(), dev_ctx);
-    }
-    if (dY) {
-      // dX_div_Y = dX / Y;
-      Tensor dX_div_Y = tmp;
-      ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(
-          ctx, dX, Y, axis, DivFunctor<T>(), &dX_div_Y);
-
-      // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the
-      // first output tensor is nullptr, the branch to calculate first
-      // output tensor will not be activated, DivGradDx function will not
-      // be called and can be ignored, the first branch has little effect
-      // on running speed.
-
-      // dY = Out * dX * ddY / Y - dX * ddX / Y
-      ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivDoubleDY<T>>(
-          ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY,
-          DivGradDX<T>(), DivDoubleDY<T>());
-    }
-
-    if (ddOut) {
-      // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
-      default_elementwise_mul<DeviceContext, T>(ctx, Out, &ddY_safe, &tmp);
-      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, &ddX_safe, &tmp, 0, SubFunctor<T>(), &tmp);
-      ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(
-          ctx, &tmp, Y, axis, DivFunctor<T>(), ddOut);
-    }
-
-    if (dOut) {
-      // dOut = - dX * ddY
-      default_elementwise_mul<DeviceContext, T>(ctx, dX, &ddY_safe, dOut);
-      auto& place =
-          *ctx.template device_context<DeviceContext>().eigen_device();
-      auto dout = framework::EigenVector<T>::Flatten(*dOut);
-      dout.device(place) = static_cast<T>(-1) * dout;
-    }
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(ElementwiseDivDoubleGradOpInplace, {"DDX", "DDOut"});
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
deleted file mode 100644
index 66c56da417487e3b2ee94ad572d83a971958ab62..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker {
- protected:
-  std::string GetName() const override { return "FloorDiv"; }
-  std::string GetEquation() const override { return "Out = X // Y"; }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
-                             ops::ElementwiseFloorDivOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_floordiv,
-    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
-                                   int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
deleted file mode 100644
index 60846d1e8fee1c7f68ac101f18355750c2c15a4d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_floordiv,
-    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
deleted file mode 100644
index 2d24e394d5c823dbd22c837210e46cefeceba1be..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct FloorDivFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a / b; }
-};
-
-template <typename DeviceContext, typename T>
-void elementwise_floor_div(const framework::ExecutionContext &ctx,
-                           const framework::Tensor *x,
-                           const framework::Tensor *y, framework::Tensor *z) {
-  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
-      ctx, x, y, axis, FloorDivFunctor<T>(), z);
-}
-
-template <typename DeviceContext, typename T>
-class ElementwiseFloorDivKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<framework::LoDTensor>("X");
-    auto *y = ctx.Input<framework::LoDTensor>("Y");
-    auto *z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-
-    // dtype of x and y is int64 or int32
-    elementwise_floor_div<DeviceContext, T>(ctx, x, y, z);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
deleted file mode 100644
index b7df9c6f845dfc941e3c6acbc986a584e984a1de..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
- protected:
-  std::string GetName() const override { return "Max"; }
-  std::string GetEquation() const override { return "Out = max(X, Y)"; }
-};
-
-class ElementwiseMaxGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("elementwise_max_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Y", Input("Y"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp,
-                  ops::ElementwiseMaxOpMaker, ops::ElementwiseOpInferVarType,
-                  ops::ElementwiseMaxGradOpDescMaker);
-
-REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_max,
-    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_max_grad,
-    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu
deleted file mode 100644
index 5d086a1b29febd8e57507eced7683f414ca34e07..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_max,
-    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_max_grad,
-    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h
deleted file mode 100644
index abdb1b9671de80d02b9a6a788088f47929fcc6f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct MaxFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a > b ? a : b; }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseMaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          MaxFunctor<T>(), z);
-  }
-};
-
-template <typename T>
-struct MaxGradDx {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * (x > y);
-  }
-};
-
-template <typename T>
-struct MaxGradDy {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * (x <= y);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseMaxGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    auto* out = dout;  // Fake out, not used
-    int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, MaxGradDx<T>, MaxGradDy<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx<T>(), MaxGradDy<T>());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc
deleted file mode 100644
index f60c0ed8a0faad384f4eaa631c2758f83bc56414..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ElementwiseMinOpMaker : public ElementwiseOpMaker {
- protected:
-  std::string GetName() const override { return "Min"; }
-  std::string GetEquation() const override { return "Out = min(X, Y)"; }
-};
-
-class ElementwiseMinGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("elementwise_min_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Y", Input("Y"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp,
-                  ops::ElementwiseMinOpMaker, ops::ElementwiseOpInferVarType,
-                  ops::ElementwiseMinGradOpDescMaker);
-
-REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_min,
-    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_min_grad,
-    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu
deleted file mode 100644
index cf93e5a97a3f3110aae907c593f58dbab0f9d090..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_min,
-    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_min_grad,
-    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h
deleted file mode 100644
index 1a49a6013987ae1ec685ec91ca656e4756ba7c32..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct MinFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a < b ? a : b; }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseMinKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          MinFunctor<T>(), z);
-  }
-};
-
-template <typename T>
-struct MinGradDx {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * (x < y);
-  }
-};
-
-template <typename T>
-struct MinGradDy {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * (x >= y);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseMinGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    auto* out = dout;  // Fake out, not used
-    int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, MinGradDx<T>, MinGradDy<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx<T>(), MinGradDy<T>());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
deleted file mode 100644
index 451c7816b9af1832b8504a05aeb1e0f51c5001c8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseModOpMaker : public ElementwiseOpMaker {
- protected:
-  std::string GetName() const override { return "Mod"; }
-  std::string GetEquation() const override { return "Out = X \\\\% Y"; }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp,
-                             ops::ElementwiseModOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_mod,
-    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseModFPKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseModFPKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
deleted file mode 100644
index 92991ab3a0a24c0969a403c2e2e2d1b1cb950d2f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_mod, ops::ElementwiseModKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseModKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseModFPKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseModFPKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
deleted file mode 100644
index e568a5dc72c08a5673147af0fb9b38bdca3c9921..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct ModFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a % b; }
-};
-
-template <typename T>
-struct ModFunctorFP {
-  inline HOSTDEVICE T operator()(T a, T b) const { return std::fmod(a, b); }
-};
-
-template <typename DeviceContext, typename T>
-void elementwise_mod(const framework::ExecutionContext &ctx,
-                     const framework::Tensor *x, const framework::Tensor *y,
-                     framework::Tensor *z) {
-  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<ModFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        ModFunctor<T>(), z);
-}
-
-template <typename DeviceContext, typename T>
-void elementwise_mod_fp(const framework::ExecutionContext &ctx,
-                        const framework::Tensor *x, const framework::Tensor *y,
-                        framework::Tensor *z) {
-  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<ModFunctorFP<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          ModFunctorFP<T>(), z);
-}
-
-template <typename DeviceContext, typename T>
-class ElementwiseModKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<framework::LoDTensor>("X");
-    auto *y = ctx.Input<framework::LoDTensor>("Y");
-    auto *z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-
-    // dtype of x and y is int64 or int32
-    elementwise_mod<DeviceContext, T>(ctx, x, y, z);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseModFPKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<framework::LoDTensor>("X");
-    auto *y = ctx.Input<framework::LoDTensor>("Y");
-    auto *z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-
-    // dtype of x and y is float or double
-    elementwise_mod_fp<DeviceContext, T>(ctx, x, y, z);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
deleted file mode 100644
index 69900e0637ce587b326123d326d322fe73c75617..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ElementwiseMulOpGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("elementwise_mul_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Y", Input("Y"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetAttrMap(Attrs());
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    return op;
-  }
-};
-
-class ElementwiseMulOpMaker : public ElementwiseOpMaker {
- protected:
-  virtual std::string GetName() const { return "Mul"; }
-  virtual std::string GetEquation() const { return "Out = X \\\\odot Y"; }
-};
-
-class ElementwiseMulDoubleGradDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("elementwise_mul_grad_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Y", Input("Y"));
-    op->SetInput("DOut", Input(framework::GradVarName("Out")));
-    op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-    op->SetInput("DDY", OutputGrad(framework::GradVarName("Y")));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp,
-                  ops::ElementwiseMulOpMaker, ops::ElementwiseOpInferVarType,
-                  ops::ElementwiseMulOpGradDescMaker);
-REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad,
-                  ops::ElementwiseMulDoubleGradDescMaker);
-REGISTER_OPERATOR(elementwise_mul_grad_grad, ops::ElementwiseOpDoubleGrad,
-                  ops::ElementwiseMulDoubleGradOpInplace);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_mul_grad_grad,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
deleted file mode 100644
index d3c0dcb40958c21d96e266425b501dcd763b8f3a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-#define TILE_SIZE 512
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y,
-                                                       const T* out,
-                                                       const T* dout,
-                                                       int64_t size, T* dx,
-                                                       T* dy) {
-  int col = blockIdx.x * blockDim.x + threadIdx.x;
-
-  while (col < size) {
-    T o = dout[col];
-    dx[col] = y[col] * o;
-    dy[col] = x[col] * o;
-    col += blockDim.x * gridDim.x;
-  }
-}
-
-template <typename T>
-class ElementwiseMulGradKernel<plat::CUDADeviceContext, T>
-    : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out = dout;  // out is not necessary
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-
-    if (x->dims() == y->dims() && dx && dy) {
-      dim3 block_size = dim3(TILE_SIZE, 1);
-      auto size = x->numel();
-      dim3 gird_size = dim3((size + TILE_SIZE - 1) / TILE_SIZE, 1);
-      SimpleElemwiseMulGradCUDAKernel<T><<<
-          gird_size, block_size, 0,
-          ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
-          x->data<T>(), y->data<T>(), out->data<T>(), dout->data<T>(), size,
-          dx->mutable_data<T>(ctx.GetPlace()),
-          dy->mutable_data<T>(ctx.GetPlace()));
-      return;
-    } else {
-      ElemwiseGradCompute<plat::CUDADeviceContext, T, MulGradDX<T>,
-                          MulGradDY<T>>(ctx, *x, *y, *out, *dout, axis, dx, dy,
-                                        MulGradDX<T>(), MulGradDY<T>());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_mul, ops::ElementwiseMulKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseMulKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseMulKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseMulKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_mul_grad_grad,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext,
-                                        plat::float16>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
deleted file mode 100644
index 581caad62ed5d382af8957631ff8dbdbc401b1cb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct MulFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
-};
-
-template <typename DeviceContext, typename T>
-void default_elementwise_mul(const framework::ExecutionContext& ctx,
-                             const framework::Tensor* x,
-                             const framework::Tensor* y, framework::Tensor* z) {
-  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        MulFunctor<T>(), z);
-}
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_mul_same_dims(const framework::ExecutionContext& ctx,
-                          const framework::Tensor* x,
-                          const framework::Tensor* y, framework::Tensor* z) {
-  auto blas = math::GetBlas<DeviceContext, T>(ctx);
-  blas.VMUL(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
-}
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    !std::is_floating_point<T>::value ||
-    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_mul_same_dims(const framework::ExecutionContext& ctx,
-                          const framework::Tensor* x,
-                          const framework::Tensor* y, framework::Tensor* z) {
-  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
-  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
-
-  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-  eigen_z.device(place) = eigen_x * eigen_y;
-}
-
-template <typename DeviceContext, typename T>
-class ElementwiseMulKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto x_var = ctx.InputVar("X");
-    PADDLE_ENFORCE(x_var != nullptr,
-                   "Cannot get input Variable X, variable name = %s",
-                   ctx.op().Input("X"));
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-
-    framework::Tensor x, *z;
-    if (x_var->IsType<framework::SelectedRows>()) {
-      PADDLE_ENFORCE(y->dims().size() == 1 && y->dims()[0] == 1,
-                     "For elementwise_op, if X is Sparse, Y must be scalar.");
-      auto& x_sele = x_var->Get<framework::SelectedRows>();
-      auto out_sele = ctx.Output<framework::SelectedRows>("Out");
-      x = x_sele.value();
-      out_sele->set_rows(x_sele.rows());
-      out_sele->set_height(x_sele.height());
-      out_sele->mutable_value()->Resize(x_sele.value().dims());
-      out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type());
-      z = ctx.Output<framework::SelectedRows>("Out")->mutable_value();
-    } else if (x_var->IsType<framework::LoDTensor>()) {
-      x = x_var->Get<framework::LoDTensor>();
-      z = ctx.Output<framework::LoDTensor>("Out");
-    } else {
-      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
-                   framework::ToTypeName(x_var->Type()));
-    }
-
-    z->mutable_data<T>(ctx.GetPlace());
-    if (x.numel() == y->numel()) {
-      elementwise_mul_same_dims<DeviceContext, T>(ctx, &x, y, z);
-    } else {
-      default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
-    }
-  }
-};
-
-template <typename T>
-struct MulGradDX {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
-};
-
-template <typename T>
-struct MulGradDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out = dout;  // out is not necessary
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX<T>(), MulGradDY<T>());
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseMulDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>("DOut");
-    auto* ddx = ctx.Input<Tensor>("DDX");
-    auto* ddy = ctx.Input<Tensor>("DDY");
-
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    auto* ddout = ctx.Output<Tensor>("DDOut");
-
-    if (ddout) ddout->mutable_data<T>(ctx.GetPlace());
-
-    Tensor ddx_safe, ddy_safe;
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, x, ddx, &ddx_safe);
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, y, ddy, &ddy_safe);
-
-    // dx = dout * ddy
-    // dy = dout * ddx
-    // ddout = ddx * y + x * ddy
-    // change computation sequence to save memory, so ddout can inplace ddx and
-    // dx can be used as 'tmp' tensor
-    // (1) dx = x * ddy
-    // (2) dy = dout * ddx
-    // (3) ddout = ddx * y
-    // (4) ddout = ddout + dx
-    // (5) dx = dout *ddy
-    if (ddout) {
-      // use dx to save memory, other than alloc tmp tensor
-      Tensor* ddout_tmp = dx;
-
-      default_elementwise_mul<DeviceContext, T>(ctx, x, &ddy_safe, ddout_tmp);
-      int axis = ctx.Attr<int>("axis");
-      // NOTE: in the following ElemwiseGradCompute, for the
-      // first output tensor is nullptr, the branch to calculate first
-      // output tensor will not be activated, DivGradDx function will not
-      // be called and can be ignored, the first branch has little effect
-      // on running speed.
-      ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
-          ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy,
-          MulGradDX<T>(), MulGradDY<T>());
-      default_elementwise_mul<DeviceContext, T>(ctx, &ddx_safe, y, ddout);
-
-      auto& place =
-          *ctx.template device_context<DeviceContext>().eigen_device();
-      auto ddout_t = framework::EigenVector<T>::Flatten(*ddout);
-      auto ddout_tmp_t = framework::EigenVector<T>::Flatten(*ddout_tmp);
-      ddout_t.device(place) = ddout_t + ddout_tmp_t;
-      default_elementwise_mul<DeviceContext, T>(ctx, dout, &ddy_safe, dx);
-    }
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(ElementwiseMulDoubleGradOpInplace, {"DDX", "DDOut"});
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
deleted file mode 100644
index da678c5ee435af4e725d67d2c4c28e58a0f07598..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ /dev/null
@@ -1,405 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class ElementwiseOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  using Tensor = framework::Tensor;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of elementwise op should not be null.");
-
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Y").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s [%s]",
-        ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());
-
-    if (ctx->GetInputsVarType("X").front() ==
-        framework::proto::VarType::LOD_TENSOR) {
-      auto x_dim = ctx->GetInputDim("X");
-      auto y_dim = ctx->GetInputDim("Y");
-      PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                        "Rank of first input must >= rank of second input.");
-    } else if (ctx->GetInputsVarType("X").front() ==
-               framework::proto::VarType::SELECTED_ROWS) {
-      PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
-                         (ctx->GetInputDim("Y")[0] == 1),
-                     "For elementwise_op, if X is Sparse, "
-                     "Y must be scalar.");
-    } else {
-      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
-                   ctx->GetInputsVarType("X").front());
-    }
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (platform::CanMKLDNNBeUsed(ctx)) {
-      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                     framework::DataLayout::kMKLDNN,
-                                     framework::LibraryType::kMKLDNN);
-    }
-#endif
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
-class ElementwiseOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
-  }
-};
-
-class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final {
-    AddInput("X", "(Tensor), The first input tensor of elementwise op.");
-    AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
-    AddOutput("Out", "The output of elementwise op.");
-    AddAttr<int>("axis",
-                 "(int, default -1). The start dimension index "
-                 "for broadcasting Y onto X.")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
-    AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
-        .SetDefault(false);
-    AddAttr<std::string>(
-        "x_data_format",
-        "(string, default NCHW) Only used in mkldnn"
-        "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". "
-        "Defaults to \"\". Specify the data format of the output data, "
-        "the input will be transformed automatically. ")
-        .SetDefault("");
-    AddAttr<std::string>(
-        "y_data_format",
-        "(string, default \"\") Only used in mkldnn"
-        "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". "
-        "Defaults to \"\". Specify the data format of the output data, "
-        "the input will be transformed automatically. ")
-        .SetDefault("");
-    AddComment(string::Sprintf(R"DOC(
-Elementwise %s Operator
-
-The equation is:
-
-$$%s$$
-
-- $X$: a tensor of any dimension. 
-- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.
-
-There are two cases for this operator:
-
-1. The shape of $Y$ is the same with $X$.
-2. The shape of $Y$ is a continuous subsequence of $X$.
-
-For case 2:
-
-1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index 
-   for broadcasting $Y$ onto $X$. 
-2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
-3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of 
-   subsequence, such as shape(Y) = (2, 1) => (2).
-
-For example:
-
-  .. code-block:: text
-
-    shape(X) = (2, 3, 4, 5), shape(Y) = (,)
-    shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2
-    shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
-    shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
-    shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-
-The inputs $X$ and $Y$ can carry the different LoD information. 
-But the output only shares the LoD information with the input $X$.
-
-)DOC",
-                               GetName(), GetEquation()));
-  }
-
- protected:
-  virtual std::string GetName() const = 0;
-
-  virtual std::string GetEquation() const = 0;
-};
-
-class ElementwiseOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  using Tensor = framework::Tensor;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    auto out_grad_name = framework::GradVarName("Out");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(out_grad_name),
-                   "Input(Out@GRAD) should not be null");
-
-    auto x_dims = ctx->GetInputDim(out_grad_name);
-    auto y_dims = ctx->GetInputDim("Y");
-
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.");
-
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->ShareDim(out_grad_name, /*->*/ x_grad_name);
-      ctx->ShareLoD(out_grad_name, /*->*/ x_grad_name);
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->ShareDim("Y", /*->*/ y_grad_name);
-      ctx->ShareLoD("Y", /*->*/ y_grad_name);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type();
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (platform::CanMKLDNNBeUsed(ctx)) {
-      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                     framework::DataLayout::kMKLDNN,
-                                     framework::LibraryType::kMKLDNN);
-    }
-#endif
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
-class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  using Tensor = framework::Tensor;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->ShareDim("X", x_grad_name);
-      ctx->ShareLoD("X", x_grad_name);
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->ShareDim("Y", y_grad_name);
-      ctx->ShareLoD("Y", y_grad_name);
-    }
-    if (ctx->HasOutput("DDOut")) {
-      ctx->ShareDim("DOut", "DDOut");
-      ctx->ShareLoD("DOut", "DDOut");
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type = ctx.Input<Tensor>("DOut")->type();
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (platform::CanMKLDNNBeUsed(ctx)) {
-      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                     framework::DataLayout::kMKLDNN,
-                                     framework::LibraryType::kMKLDNN);
-    }
-#endif
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
-class ElementwiseOpDoubleGradWithoutDXDY
-    : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  using Tensor = framework::Tensor;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    if (ctx->HasOutput("DDOut")) {
-      ctx->ShareDim("DOut", "DDOut");
-      ctx->ShareLoD("DOut", "DDOut");
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::proto::VarType::Type input_data_type;
-    if (ctx.HasInput("DDX") == false) {
-      PADDLE_ENFORCE_EQ(ctx.HasInput("DDY"), true,
-                        "Input(DDY) should not be null");
-      input_data_type = ctx.Input<Tensor>("DDY")->type();
-    } else if (ctx.HasInput("DDY") == false) {
-      PADDLE_ENFORCE_EQ(ctx.HasInput("DDX"), true,
-                        "Input(DDX) should not be null");
-      input_data_type = ctx.Input<Tensor>("DDX")->type();
-    } else {
-      input_data_type = ctx.Input<Tensor>("DDX")->type();
-    }
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (platform::CanMKLDNNBeUsed(ctx)) {
-      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                     framework::DataLayout::kMKLDNN,
-                                     framework::LibraryType::kMKLDNN);
-    }
-#endif
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
-// For Add, Sub op, the X, Out is not needed.
-class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
- public:
-  using operators::ElementwiseOpGrad::ElementwiseOpGrad;
-  using operators::ElementwiseOpGrad::GetExpectedKernelType;
-  using Tensor = framework::Tensor;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->ShareDim(framework::GradVarName("Out"), /*->*/ x_grad_name);
-      ctx->ShareLoD(framework::GradVarName("Out"), /*->*/ x_grad_name);
-    }
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(y_grad_name)) {
-      PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-
-      ctx->ShareDim("Y", /*->*/ y_grad_name);
-      ctx->ShareLoD("Y", /*->*/ y_grad_name);
-    }
-  }
-};
-
-template <typename T>
-class ElemwiseGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *dx =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    if (dx != nullptr) {
-      auto &dout =
-          *context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-      dx->set_lod(dout.lod());
-    }
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(ElementwiseOpInplace, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(ElementwiseGradOpInplace,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
-DECLARE_INPLACE_OP_INFERER(ElementwiseDoubleGradOpInplace, {"DDX", "DDOut"});
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y");
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseDoubleGradNoBufVarsInference,
-                                      "Y", "DOut");
-
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name)                   \
-  class kernel_type##GradMaker                                               \
-      : public paddle::framework::SingleGradOpDescMaker {                    \
-   public:                                                                   \
-    using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
-                                                                             \
-   protected:                                                                \
-    std::unique_ptr<paddle::framework::OpDesc> Apply() const override {      \
-      auto *op = new paddle::framework::OpDesc();                            \
-      op->SetType(#kernel_type "_grad");                                     \
-      op->SetInput("Y", Input("Y"));                                         \
-      op->SetInput(::paddle::framework::GradVarName("Out"),                  \
-                   OutputGrad("Out"));                                       \
-      op->SetAttrMap(Attrs());                                               \
-      op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X"));  \
-      op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y"));  \
-      return std::unique_ptr<::paddle::framework::OpDesc>(op);               \
-    }                                                                        \
-  }
-
-#define REGISTER_ELEMWISE_OP(op_type, op_name, equation)                \
-  class __ElemwiseOp##op_type##Maker__                                  \
-      : public ::paddle::operators::ElementwiseOpMaker {                \
-   protected:                                                           \
-    virtual std::string GetName() const { return op_name; }             \
-    virtual std::string GetEquation() const { return equation; }        \
-  };                                                                    \
-  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,        \
-                    __ElemwiseOp##op_type##Maker__,                     \
-                    ::paddle::operators::ElementwiseOpInferVarType,     \
-                    ::paddle::framework::DefaultGradOpDescMaker<true>); \
-  REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
-
-#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation)   \
-  class __ElemwiseOp##op_type##Maker__                              \
-      : public ::paddle::operators::ElementwiseOpMaker {            \
-   protected:                                                       \
-    virtual std::string GetName() const { return op_name; }         \
-    virtual std::string GetEquation() const { return equation; }    \
-  };                                                                \
-  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,    \
-                    __ElemwiseOp##op_type##Maker__,                 \
-                    ::paddle::operators::ElementwiseOpInferVarType, \
-                    op_type##GradMaker,                             \
-                    ::paddle::operators::ElementwiseOpInplace);     \
-  REGISTER_OPERATOR(op_type##_grad,                                 \
-                    ::paddle::operators::ElementwiseOpExplicitGrad, \
-                    ::paddle::operators::ElementwiseGradOpInplace,  \
-                    ::paddle::operators::ElementwiseGradNoBufVarsInference)
-
-#define REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(op_type, op_name, equation) \
-  class __ElemwiseOp##op_type##Maker__                                         \
-      : public ::paddle::operators::ElementwiseOpMaker {                       \
-   protected:                                                                  \
-    virtual std::string GetName() const { return op_name; }                    \
-    virtual std::string GetEquation() const { return equation; }               \
-  };                                                                           \
-  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,               \
-                    __ElemwiseOp##op_type##Maker__,                            \
-                    ::paddle::operators::ElementwiseOpInferVarType,            \
-                    op_type##GradMaker,                                        \
-                    ::paddle::operators::ElementwiseOpInplace);
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
deleted file mode 100644
index 59a9c3086df7937e987632a02a1a9203faf3bff9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ /dev/null
@@ -1,1814 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <algorithm>
-#include <iterator>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/transform.h"
-
-#ifdef __NVCC__
-#include <cuda.h>
-#include <thrust/iterator/iterator_adaptor.h>
-#include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
-#endif
-
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-/*
- * Out = X ⊙ Y
- * If Y's shape does not match X' shape, they will be reshaped.
- * For example:
- * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
- *    pre=2, n=3*4, post=5
- *    x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5)
- * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
- *    pre=2*3, n=4*5, post=1
- *    x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1)
- *
- * New parameter: *mid_flag* is added to solve m*n*k & m*1*k
- * broadcast cases.
- * 3. shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1, 4, 5)
- *    mid_flag should not be NULL.
- *    x.shape(2, 3, 20) * y.shape(2, 1, 20).broadcast(2, 3, 20)
- */
-inline void get_mid_dims(const framework::DDim &x_dims,
-                         const framework::DDim &y_dims, const int axis,
-                         int *pre, int *n, int *post, int *mid_flag = NULL) {
-  *pre = 1;
-  *n = 1;
-  *post = 1;
-  if (mid_flag != NULL) {
-    *mid_flag = 0;
-    int mid = 0;
-    for (int i = 0; i < axis; ++i) {
-      (*pre) *= x_dims[i];
-    }
-    for (int i = 0; i < y_dims.size(); ++i) {
-      if (x_dims[i + axis] != y_dims[i]) {
-        // only support single y_dims[i] = 1 now.
-        PADDLE_ENFORCE_EQ(*mid_flag, 0,
-                          "Broadcast support y_dims with single 1.");
-        PADDLE_ENFORCE_EQ(y_dims[i], 1, "Broadcast dimension mismatch.");
-        // m*n*k m*1*k
-        for (int j = 0; j < i; ++j) {
-          (*pre) *= y_dims[j];
-        }
-        *n = std::max(x_dims[i + axis], y_dims[i]);
-        *mid_flag = 1;
-        mid = i;
-        break;
-      }
-      (*n) *= y_dims[i];
-    }
-    if (*mid_flag) {
-      for (int i = mid + 1; i < x_dims.size(); ++i) {
-        (*post) *= x_dims[i];
-      }
-    } else {
-      for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-        (*post) *= x_dims[i];
-      }
-    }
-  } else {  // for fused_elementwise_activation_op. keep the old version.
-    for (int i = 0; i < axis; ++i) {
-      (*pre) *= x_dims[i];
-    }
-
-    for (int i = 0; i < y_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
-                        "Broadcast dimension mismatch.");
-      (*n) *= y_dims[i];
-    }
-
-    for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-      (*post) *= x_dims[i];
-    }
-  }
-}
-
-inline framework::DDim trim_trailing_singular_dims(
-    const framework::DDim &dims) {
-  // Remove trailing dimensions of size 1 for y
-  auto actual_dims_size = dims.size();
-  for (; actual_dims_size != 0; --actual_dims_size) {
-    if (dims[actual_dims_size - 1] != 1) break;
-  }
-
-  std::vector<int> trim_dims;
-  trim_dims.resize(actual_dims_size);
-  for (int i = 0; i < actual_dims_size; ++i) {
-    trim_dims[i] = dims[i];
-  }
-  if (trim_dims.size() == 0) {
-    return framework::DDim(framework::make_dim());
-  }
-  framework::DDim actual_dims = framework::make_ddim(trim_dims);
-  return actual_dims;
-}
-
-template <typename T, typename DeviceContext>
-class RowwiseTransformIterator;
-
-template <typename T, typename DeviceContext>
-class MidWiseTransformIterator;
-
-// NOTE(dzhwinter): ptrdiff_t in iterator is deperecated in c++17
-template <typename T>
-class RowwiseTransformIterator<T, platform::CPUDeviceContext>
-    : public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
-                           T *, T &> {
- public:
-  RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
-
-  RowwiseTransformIterator<T, platform::CPUDeviceContext> &operator++() {
-    ++i_;
-    if (UNLIKELY(i_ == n_)) {
-      i_ = 0;
-    }
-    return *this;
-  }
-
-  RowwiseTransformIterator<T, platform::CPUDeviceContext> &operator+(int n) {
-    while (n-- > 0) {
-      ++i_;
-      if (UNLIKELY(i_ == n_)) {
-        i_ = 0;
-      }
-    }
-
-    return *this;
-  }
-
-  bool operator==(const RowwiseTransformIterator<T, platform::CPUDeviceContext>
-                      &rhs) const {
-    return (ptr_ + i_) == &(*rhs);
-  }
-
-  bool operator!=(const RowwiseTransformIterator<T, platform::CPUDeviceContext>
-                      &rhs) const {
-    return (ptr_ + i_) != &(*rhs);
-  }
-
-  const T &operator*() { return ptr_[i_]; }
-
- private:
-  const T *ptr_;
-  int i_;
-  int64_t n_;
-};
-
-template <typename T>
-class MidWiseTransformIterator<T, platform::CPUDeviceContext>
-    : public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
-                           T *, T &> {
- public:
-  MidWiseTransformIterator(const T *ptr, int n, int post)
-      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
-
-  MidWiseTransformIterator<T, platform::CPUDeviceContext> &operator++() {
-    ++j_;
-    if (UNLIKELY(j_ == post_)) {
-      ++i_;
-      j_ = 0;
-      if (UNLIKELY(i_ == n_)) {
-        i_ = 0;
-      }
-    }
-    return *this;
-  }
-
-  MidWiseTransformIterator<T, platform::CPUDeviceContext> &operator+(int n) {
-    while (n-- > 0) {
-      ++j_;
-      if (UNLIKELY(j_ == post_)) {
-        ++i_;
-        j_ = 0;
-        if (UNLIKELY(i_ == n_)) {
-          i_ = 0;
-        }
-      }
-    }
-    return *this;
-  }
-
-  bool operator==(const MidWiseTransformIterator<T, platform::CPUDeviceContext>
-                      &rhs) const {
-    return (ptr_ + i_) == &(*rhs);
-  }
-
-  bool operator!=(const MidWiseTransformIterator<T, platform::CPUDeviceContext>
-                      &rhs) const {
-    return (ptr_ + i_) != &(*rhs);
-  }
-
-  const T &operator*() { return ptr_[i_]; }
-
- private:
-  const T *ptr_;
-  int64_t i_;
-  int64_t j_;
-  int64_t n_;
-  int64_t post_;
-};
-
-#ifdef __NVCC__
-template <typename T>
-class RowwiseTransformIterator<T, platform::CUDADeviceContext>
-    : public thrust::iterator_adaptor<
-          RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T *> {
- public:
-  typedef thrust::iterator_adaptor<
-      RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T *>
-      super_t;
-  HOSTDEVICE RowwiseTransformIterator(const T *x, int n)
-      : super_t(x), begin_(x), n_(n) {}
-  friend class thrust::iterator_core_access;
-
- private:
-  unsigned int n_;
-  const T *begin_;
-  HOSTDEVICE typename super_t::reference dereference() const {
-    return *(begin_ + (this->base() - begin_) % n_);
-  }
-};
-
-template <typename T>
-class MidWiseTransformIterator<T, platform::CUDADeviceContext>
-    : public thrust::iterator_adaptor<
-          MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T *> {
- public:
-  typedef thrust::iterator_adaptor<
-      MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T *>
-      super_t;
-  HOSTDEVICE MidWiseTransformIterator(const T *x, int n, int post)
-      : super_t(x), begin_(x), n_(n), post_(post) {}
-  friend class thrust::iterator_core_access;
-
- private:
-  unsigned int post_;
-  unsigned int n_;
-  const T *begin_;
-  HOSTDEVICE typename super_t::reference dereference() const {
-    return *(begin_ + (((this->base() - begin_) / post_) % n_));
-  }
-};
-#endif
-
-template <typename Functor, typename T, typename DeviceContext,
-          typename OutType = T>
-class TransformFunctor {
- public:
-  TransformFunctor(const framework::Tensor *x, const framework::Tensor *y,
-                   framework::Tensor *z, const DeviceContext &ctx, Functor func)
-      : x_(x->data<T>()),
-        y_(y->data<T>()),
-        z_(z->mutable_data<OutType>(ctx.GetPlace())),
-        nx_(x->numel()),
-        ctx_(ctx),
-        func_(func) {}
-
-  inline void Run() const {
-    platform::Transform<DeviceContext> trans;
-    trans(ctx_, x_, x_ + nx_, y_, z_, func_);
-  }
-
-  inline void RunRowWise(int n, int pre) const {
-    platform::Transform<DeviceContext> trans;
-    trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator<T, DeviceContext>(y_, n),
-          z_, func_);
-  }
-
-  inline void RunMidWise(int n, int pre, int post) const {
-    platform::Transform<DeviceContext> trans;
-    trans(ctx_, x_, x_ + nx_,
-          MidWiseTransformIterator<T, DeviceContext>(y_, n, post), z_, func_);
-  }
-
-  inline void RunMidRowWise(int n, int pre, int post) const {
-    platform::Transform<DeviceContext> trans;
-    for (int i = 0; i < pre; i++) {
-      trans(ctx_, x_ + i * n * post, x_ + (i + 1) * n * post,
-            RowwiseTransformIterator<T, DeviceContext>(y_ + i * post, post),
-            z_ + i * n * post, func_);
-    }
-  }
-
- private:
-  const T *x_;
-  const T *y_;
-  OutType *z_;
-  int64_t nx_;
-  const DeviceContext &ctx_;
-  Functor func_;
-};
-
-template <typename T, typename DX_OP, typename DY_OP>
-struct ElemwiseGradNoBroadcast {
-  const T *x_;
-  const T *y_;
-  const T *out_;
-  const T *dout_;
-
-  HOSTDEVICE void operator()(size_t i) {
-    if (dx_ != nullptr) {
-      dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
-    }
-    if (dy_ != nullptr) {
-      dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]);
-    }
-  }
-
-  DX_OP dx_op_;
-  DY_OP dy_op_;
-  T *dx_;
-  T *dy_;
-};
-
-template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const T *out,
-                                      const T *dout, int h, int w, DX_OP dx_op,
-                                      DY_OP dy_op, T *dx, T *dy) {
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; ++j) {
-      int x_offset = i * w + j;
-      if (dx != nullptr) {
-        dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-      }
-      if (dy != nullptr) {
-        T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-        if (i == 0) {
-          dy[j] = tmp;
-        } else {
-          dy[j] += tmp;
-        }
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename T, typename DX_OP, typename DY_OP>
-static __global__ void ElemwiseGradBroadcast1CUDAKernel(
-    const T *x, const T *y, const T *out, const T *dout, int h, int w,
-    DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) {
-  int j = blockIdx.x;
-  int i = threadIdx.x;
-  int tid = threadIdx.x;
-  T val(0);
-
-  do {
-    int x_offset = i * w + j;
-    if (dx) {
-      dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-    }
-    if (dy) {
-      val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-    }
-    i += ELEMWISE_MAX_BLOCK_DIM;
-  } while (i < h);
-
-  if (dy) {
-    h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-    val = paddle::platform::reduceSum(val, tid, h);
-    if (threadIdx.x == 0) {
-      dy[j] = val;
-    }
-  }
-}
-
-#define BLOCK_X 32
-#define BLOCK_Y 32
-
-// suppose use 2D block is fast because more parallel
-// and memory coalesced
-template <typename T, typename DX_OP, typename DY_OP>
-static __global__ void FastElemwiseGradBroadcast1CUDAKernel(
-    const T *x, const T *y, const T *out, const T *dout, int h, int w,
-    DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) {
-  __shared__ T sdata[BLOCK_Y][BLOCK_X + 1];
-
-  T val(0);
-  size_t width_stride = gridDim.x * blockDim.x;
-  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
-  size_t full_width =
-      (w & (~((uint64_t)(BLOCK_X - 1)))) + ((w & (BLOCK_X - 1)) ? BLOCK_X : 0);
-  size_t full_height =
-      (h & (~((uint64_t)(BLOCK_Y - 1)))) + ((h & (BLOCK_Y - 1)) ? BLOCK_Y : 0);
-
-  for (int m = idx; m < full_width; m += width_stride) {
-    sdata[threadIdx.y][threadIdx.x] = 0;
-    for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) {
-      int x_offset = n * w + m;
-      if (dx && m < w && n < h) {
-        dx[x_offset] = dx_op(x[x_offset], y[m], out[x_offset], dout[x_offset]);
-      }
-      if (dy) {
-        if (m < w && n < h) {
-          T val = dy_op(x[x_offset], y[m], out[x_offset], dout[x_offset]);
-          sdata[threadIdx.y][threadIdx.x] += val;
-        }
-        __syncthreads();
-      }
-    }
-    if (dy) {
-      T my_val = sdata[threadIdx.x][threadIdx.y];
-      for (int i = warpSize >> 1; i > 0; i >>= 1)
-        my_val += platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i);
-      __syncthreads();
-      if ((threadIdx.x == 0)) {
-        sdata[0][threadIdx.y] = my_val;
-      }
-      __syncthreads();
-      if (threadIdx.y == 0 && m < w) {
-        dy[m] = sdata[0][threadIdx.x];
-      }
-    }
-  }
-}
-
-template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x,
-                                       const T *y, const T *out, const T *dout,
-                                       int h, int w, DX_OP dx_op, DY_OP dy_op,
-                                       T *dx, T *dy) {
-  // For small case use 1D block
-  constexpr int half_walf = 16;
-  if (w < half_walf || h < half_walf) {
-    int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
-    int gird_size = w;
-    ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>(
-        x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
-  } else {
-    // suppose perfoemance improves with h increased.
-    dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
-    int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
-    FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
-        x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
-  }
-}
-
-#endif
-
-template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const T *out,
-                                      const T *dout, int pre, int n, int post,
-                                      DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) {
-  for (int i = 0; i < pre; ++i) {
-    for (int j = 0; j < n; ++j) {
-      for (int k = 0; k < post; ++k) {
-        int x_offset = i * n * post + j * post + k;
-        if (dx != nullptr) {
-          dx[x_offset] =
-              dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-        }
-        if (dy != nullptr) {
-          T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-          if (i == 0 && k == 0) {
-            dy[j] = tmp;
-          } else {
-            dy[j] += tmp;
-          }
-        }
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename T, typename DX_OP, typename DY_OP>
-static __global__ void ElemwiseGradBroadcast2CUDAKernel(
-    const T *x, const T *y, const T *out, const T *dout, int pre, int n,
-    int post, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) {
-  int tid = threadIdx.x;
-  int j = blockIdx.x;
-
-  T val(0);
-  int ttid = tid;
-
-  while (true) {
-    int i = ttid / post;
-    int k = ttid % post;
-    if (i >= pre) break;
-
-    int x_offset = i * n * post + j * post + k;
-
-    if (dx != nullptr) {
-      dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-    }
-
-    if (dy != nullptr) {
-      val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-    }
-
-    ttid += ELEMWISE_MAX_BLOCK_DIM;
-  }
-
-  if (dy) {
-    int h = pre * post;
-    h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-    val = paddle::platform::reduceSum(val, tid, h);
-    if (threadIdx.x == 0) {
-      dy[j] = val;
-    }
-  }
-}
-
-template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T *x,
-                                       const T *y, const T *out, const T *dout,
-                                       int pre, int n, int post, DX_OP dx_op,
-                                       DY_OP dy_op, T *dx, T *dy) {
-  int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
-  int gird_size = n;
-  ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, 0, stream>>>(
-      x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy);
-}
-
-#endif
-
-template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcastMid2CPU(const T *x, const T *y, const T *out,
-                                         const T *dout, int pre, int n,
-                                         int post, DX_OP dx_op, DY_OP dy_op,
-                                         T *dx, T *dy) {
-  for (int i = 0; i < pre; ++i) {
-    for (int j = 0; j < n; ++j) {
-      for (int k = 0; k < post; ++k) {
-        int x_offset = i * n * post + j * post + k;
-        int y_offset = i * post + k;
-        if (dx != nullptr) {
-          dx[x_offset] =
-              dx_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]);
-        }
-        if (dy != nullptr) {
-          T tmp =
-              dy_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]);
-          if (j == 0) {
-            dy[y_offset] = tmp;
-          } else {
-            dy[y_offset] += tmp;
-          }
-        }
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename T, typename DX_OP, typename DY_OP>
-static __global__ void ElemwiseGradBroadcastMid2CUDAKernel(
-    const T *x, const T *y, const T *out, const T *dout, int pre, int n,
-    int post, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) {
-  int j = threadIdx.x;
-  int tid = blockIdx.x;
-
-  T val(0);
-  int ttid = tid;
-
-  while (true) {
-    int i = ttid / post;
-    int k = ttid % post;
-    if (i >= pre) break;
-
-    int x_offset = i * n * post + j * post + k;
-    int y_offset = i * post + k;
-    if (dx != nullptr) {
-      dx[x_offset] =
-          dx_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]);
-    }
-
-    if (dy != nullptr) {
-      val += dy_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]);
-    }
-
-    ttid += ELEMWISE_MAX_BLOCK_DIM;
-  }
-
-  if (dy) {
-    int h = n;
-    h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-    val = paddle::platform::reduceSum(val, j, h);
-    if (threadIdx.x == 0) {
-      dy[tid] = val;
-    }
-  }
-}
-
-template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcastMid2CUDA(cudaStream_t stream, const T *x,
-                                          const T *y, const T *out,
-                                          const T *dout, int pre, int n,
-                                          int post, DX_OP dx_op, DY_OP dy_op,
-                                          T *dx, T *dy) {
-  int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, n);
-  int gird_size = pre * post;
-  ElemwiseGradBroadcastMid2CUDAKernel<<<gird_size, block_size, 0, stream>>>(
-      x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy);
-}
-
-#endif
-
-template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
-void ElemwiseGradComputeNoBroadcast(
-    const framework::ExecutionContext &ctx, const framework::DDim &x_dim,
-    const framework::DDim &y_dim, const framework::Tensor &x,
-    const framework::Tensor &y, const framework::Tensor &out,
-    const framework::Tensor &dout, int axis, framework::Tensor *dx,
-    framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) {
-  size_t N = static_cast<size_t>(framework::product(x_dim));
-#if !defined(_WIN32)
-  platform::ForRange<DeviceContext> for_range(
-      ctx.template device_context<DeviceContext>(), N);
-#else
-  platform::ForRange<DeviceContext> for_range(
-      ctx.device_context<DeviceContext>(), N);
-#endif  // !_WIN32
-  for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
-      x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
-      dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-      dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
-}
-
-template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
-void ElemwiseGradComputeWithBroadcast(
-    const framework::ExecutionContext &ctx, const framework::DDim &x_dim,
-    const framework::DDim &y_dim_untrimed, const framework::Tensor &x,
-    const framework::Tensor &y, const framework::Tensor &out,
-    const framework::Tensor &dout, int axis, framework::Tensor *dx,
-    framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) {
-  axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
-  auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
-  axis = (y_dim.size() == 0) ? x_dim.size() : axis;
-
-  int pre, n, post, mid_flag = 0;
-  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, &mid_flag);
-  if (mid_flag) {
-    PADDLE_ENFORCE_EQ(mid_flag, 1, "mid_flag should be no more than 1.");
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-      ElemwiseGradBroadcastMid2CUDA(
-          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-          y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
-          dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-    } else {
-      ElemwiseGradBroadcastMid2CPU(
-          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post,
-          dx_op, dy_op,
-          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-    }
-  } else if (post == 1) {
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-      ElemwiseGradBroadcast1CUDA(
-          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-          y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, dx_op, dy_op,
-          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-    } else {
-      ElemwiseGradBroadcast1CPU(
-          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n,
-          dx_op, dy_op,
-          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-    }
-  } else {
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-      ElemwiseGradBroadcast2CUDA(
-          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-          y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
-          dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-    } else {
-      ElemwiseGradBroadcast2CPU(
-          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post,
-          dx_op, dy_op,
-          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-    }
-  }
-}
-
-template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
-void ElemwiseGradCompute(const framework::ExecutionContext &ctx,
-                         const framework::Tensor &x, const framework::Tensor &y,
-                         const framework::Tensor &out,
-                         const framework::Tensor &dout, int axis,
-                         framework::Tensor *dx, framework::Tensor *dy,
-                         DX_OP dx_op, DY_OP dy_op) {
-  const framework::DDim &x_dim = x.dims();
-  const framework::DDim &y_dim = y.dims();
-  if (x.dims() == y.dims()) {
-    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
-        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
-  } else {  // Y is a scalar
-    ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
-        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
-  }
-}
-
-// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
-// explicit gradient can cut off X, Y, Out from gradient op
-// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse
-// elementwise code.
-template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
-void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx,
-                                 const framework::Tensor &x,
-                                 const framework::Tensor &y,
-                                 const framework::Tensor &out,
-                                 const framework::Tensor &dout, int axis,
-                                 framework::Tensor *dx, framework::Tensor *dy,
-                                 DX_OP dx_op, DY_OP dy_op) {
-  if (dy == nullptr) {
-    const framework::DDim &dx_dims = dout.dims();
-    auto dy_dims = dx_dims;
-    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
-        ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
-  } else {
-    if (dout.dims() == dy->dims()) {
-      const framework::DDim &dx_dims = dout.dims();
-      const framework::DDim &dy_dims = dy->dims();
-      ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
-          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
-    } else {  // Y is a scalar
-      auto dx_dims = dout.dims();
-      const framework::DDim &dy_dims = dy->dims();
-      ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
-          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
-    }
-  }
-}
-
-// Deprecated
-template <typename DeviceContext, typename T, typename functor,
-          typename broadcastfunctor, typename broadcast2functor>
-void ElementwiseGradCompute(const framework::ExecutionContext &ctx,
-                            const framework::Tensor *x,
-                            const framework::Tensor *y,
-                            const framework::Tensor *out,
-                            const framework::Tensor *dout, int axis,
-                            framework::Tensor *dx, framework::Tensor *dy) {
-  auto &place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-  auto x_dims = x->dims();
-  auto y_dims = y->dims();
-
-  if (dx) {
-    dx->mutable_data<T>(ctx.GetPlace());
-  }
-  if (dy) {
-    dy->mutable_data<T>(ctx.GetPlace());
-  }
-
-  if (x_dims == y_dims) {
-    functor f;
-    f(place, x, y, out, dx, dy, dout);
-    return;
-  }
-
-  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  trim_trailing_singular_dims(y_dims);
-  axis = (y_dims.size() == 0) ? x_dims.size() : axis;
-
-  int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
-
-  if (post == 1) {
-    broadcastfunctor f;
-    f(place, x, y, out, dx, dy, dout, pre, n);
-    return;
-  } else {
-    broadcast2functor f;
-    f(place, x, y, out, dx, dy, dout, pre, n, post);
-    return;
-  }
-}
-
-template <typename Functor, typename DeviceContext, typename T,
-          typename OutType = T>
-
-void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
-                          const framework::Tensor *x,
-                          const framework::Tensor *y, int axis, Functor func,
-                          framework::Tensor *z) {
-  TransformFunctor<Functor, T, DeviceContext, OutType> functor(
-      x, y, z, ctx.template device_context<DeviceContext>(), func);
-  auto x_dims = x->dims();
-  auto y_dims_untrimed = y->dims();
-  PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
-                    "Rank of first input must >= rank of second input.");
-  if (x_dims == y_dims_untrimed) {
-    functor.Run();
-    return;
-  }
-
-  axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
-  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-                 "Axis should be in range [0, x_dims)");
-  auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
-  axis = (y_dims.size() == 0) ? x_dims.size() : axis;
-  int pre, n, post, mid_flag = 0;
-  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &mid_flag);
-  if (mid_flag) {
-    functor.RunMidRowWise(n, pre, post);
-    return;
-  }
-  if (post == 1) {
-    functor.RunRowWise(n, pre);
-    return;
-  } else {
-    functor.RunMidWise(n, pre, post);
-    return;
-  }
-}
-
-// FusedElemwiseAndAct
-// --- forward
-template <typename T, typename CompoundFunctor, bool KeepIntermediateOut>
-struct FusedElemwiseAndActNoBroadcast {
-  HOSTDEVICE void operator()(size_t i) {
-    T y_val = y_[i];
-    T x_val = x_[i];
-    if (KeepIntermediateOut) {
-      T intermeidiate_out = compound_functor_.GetIntermediateOut(x_val, y_val);
-      intermediate_out_[i] = intermeidiate_out;
-      out_[i] =
-          compound_functor_.GetOutUseIntermediateOut(x_val, intermeidiate_out);
-    } else {
-      out_[i] = compound_functor_.GetOut(x_val, y_val);
-    }
-  }
-
-  const T *x_;
-  const T *y_;
-  CompoundFunctor compound_functor_;
-  T *out_;
-  T *intermediate_out_;
-};
-
-// FusedElemwiseAndActBroadcast1:
-// In this case, X and Y can be reshaped to a matrix.
-// For example shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) and axis = -1 or 2,
-// X can be reshaped to (6, 20) and Y can be reshaped to (1, 20)
-template <typename T, typename CompoundFunctor, bool BcastY,
-          bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast1CPU(const T *x, const T *y,
-                                             CompoundFunctor compound_functor,
-                                             int h, int w, T *out,
-                                             T *intermediate_out) {
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; ++j) {
-      int offset = i * w + j;
-
-      T y_val = BcastY ? y[j] : y[offset];
-      T x_val = BcastY ? x[offset] : x[j];
-      int64_t intermediate_out_offset;
-      if (KeepIntermediateOut) {
-        T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val);
-
-        if (SameShapeOfIntermediateOutAndOut) {
-          // for the case of f1(f2(x, y))
-          intermediate_out_offset = offset;
-        } else if (BcastY) {
-          intermediate_out_offset = j;
-        } else {
-          intermediate_out_offset = offset;
-        }
-
-        intermediate_out[intermediate_out_offset] = intermeidiate_out;
-        out[offset] =
-            compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out);
-      } else {
-        out[offset] = compound_functor.GetOut(x_val, y_val);
-      }
-    }
-  }
-}
-
-// FusedElemwiseAndActBroadcast2
-// In this case, X and Y can be reshaped to a matrix.
-// For example shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4) and axis = 1,
-// X can be reshaped to (2, 12, 5) and Y can be reshaped to (1, 12, 1)
-// pre = 2, n = 12, post = 5
-template <typename T, typename CompoundFunctor, bool BcastY,
-          bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast2CPU(const T *x, const T *y, int pre,
-                                             int n, int post,
-                                             CompoundFunctor compound_functor,
-                                             T *out, T *intermediate_out) {
-  for (int i = 0; i < pre; ++i) {
-    for (int j = 0; j < n; ++j) {
-      for (int k = 0; k < post; ++k) {
-        int offset = i * n * post + j * post + k;
-
-        T y_val = BcastY ? y[j] : y[offset];
-        T x_val = BcastY ? x[offset] : x[j];
-        int64_t intermediate_out_offset;
-
-        if (KeepIntermediateOut) {
-          T intermeidiate_out =
-              compound_functor.GetIntermediateOut(x_val, y_val);
-
-          if (SameShapeOfIntermediateOutAndOut) {
-            // for the case of f1(f2(x, y))
-            intermediate_out_offset = offset;
-          } else if (BcastY) {
-            intermediate_out_offset = j;
-          } else {
-            intermediate_out_offset = offset;
-          }
-
-          intermediate_out[intermediate_out_offset] = intermeidiate_out;
-          out[offset] = compound_functor.GetOutUseIntermediateOut(
-              x_val, intermeidiate_out);
-        } else {
-          out[offset] = compound_functor.GetOut(x_val, y_val);
-        }
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename T, typename CompoundFunctor, bool BcastY,
-          bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel(
-    const T *x, const T *y, int h, int w, CompoundFunctor compound_functor,
-    T *out, T *intermediate_out) {
-  int j = blockIdx.x;
-  int i = threadIdx.x;
-
-  while (i < h) {
-    int offset = i * w + j;
-
-    T y_val = BcastY ? y[j] : y[offset];
-    T x_val = BcastY ? x[offset] : x[j];
-    int64_t intermediate_out_offset;
-
-    if (KeepIntermediateOut) {
-      T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val);
-
-      if (SameShapeOfIntermediateOutAndOut) {
-        // for the case of f1(f2(x, y))
-        intermediate_out_offset = offset;
-      } else if (BcastY) {
-        intermediate_out_offset = j;
-      } else {
-        intermediate_out_offset = offset;
-      }
-
-      intermediate_out[intermediate_out_offset] = intermeidiate_out;
-      out[offset] =
-          compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out);
-    } else {
-      out[offset] = compound_functor.GetOut(x_val, y_val);
-    }
-
-    i += ELEMWISE_MAX_BLOCK_DIM;
-  }
-}
-
-template <typename T, typename CompoundFunctor, bool BcastY,
-          bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast1CUDA(cudaStream_t stream, const T *x,
-                                              const T *y,
-                                              CompoundFunctor compound_functor,
-                                              int h, int w, T *out,
-                                              T *intermediate_out) {
-  int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
-  int gird_size = w;
-  FusedElemwiseAndActBroadcast1CUDAKernel<
-      T, CompoundFunctor, BcastY, KeepIntermediateOut,
-      SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>(
-      x, y, h, w, compound_functor, out, intermediate_out);
-}
-
-template <typename T, typename CompoundFunctor, bool BcastY,
-          bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static __global__ void FusedElemwiseAndActBroadcast2CUDAKernel(
-    const T *x, const T *y, CompoundFunctor compound_functor, int pre, int n,
-    int post, T *out, T *intermediate_out) {
-  int tid = threadIdx.x;
-  int j = blockIdx.x;
-
-  while (true) {
-    int i = tid / post;
-    int k = tid % post;
-    if (i >= pre) break;
-
-    int offset = i * n * post + j * post + k;
-
-    T y_val = BcastY ? y[j] : y[offset];
-    T x_val = BcastY ? x[offset] : x[j];
-    int64_t intermediate_out_offset;
-
-    if (KeepIntermediateOut) {
-      T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val);
-
-      if (SameShapeOfIntermediateOutAndOut) {
-        // for the case of f1(f2(x, y))
-        intermediate_out_offset = offset;
-      } else if (BcastY) {
-        intermediate_out_offset = j;
-      } else {
-        intermediate_out_offset = offset;
-      }
-
-      intermediate_out[intermediate_out_offset] = intermeidiate_out;
-      out[offset] =
-          compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out);
-    } else {
-      out[offset] = compound_functor.GetOut(x_val, y_val);
-    }
-
-    tid += ELEMWISE_MAX_BLOCK_DIM;
-  }
-}
-
-template <typename T, typename CompoundFunctor, bool BcastY,
-          bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast2CUDA(cudaStream_t stream, const T *x,
-                                              const T *y, int pre, int n,
-                                              int post,
-                                              CompoundFunctor compound_functor,
-                                              T *out, T *intermediate_out) {
-  int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
-  int gird_size = n;
-
-  FusedElemwiseAndActBroadcast2CUDAKernel<
-      T, CompoundFunctor, BcastY, KeepIntermediateOut,
-      SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>(
-      x, y, compound_functor, pre, n, post, out, intermediate_out);
-}
-
-#endif
-
-template <typename DeviceContext, typename T, typename CompoundFunctor,
-          bool KeepIntermediateOut>
-void FusedElemwiseAndActComputeNoBroadcast(
-    const framework::ExecutionContext &ctx, const framework::DDim &x_dim,
-    const framework::Tensor &x, const framework::Tensor &y,
-    CompoundFunctor compound_functor, framework::Tensor *out,
-    framework::Tensor *intermediate_out) {
-  size_t N = static_cast<size_t>(framework::product(x_dim));
-
-  platform::ForRange<DeviceContext> for_range(
-      ctx.template device_context<DeviceContext>(), N);
-
-  for_range(
-      FusedElemwiseAndActNoBroadcast<T, CompoundFunctor, KeepIntermediateOut>{
-          x.data<T>(), y.data<T>(), compound_functor,
-          out->mutable_data<T>(ctx.GetPlace()),
-          intermediate_out == nullptr
-              ? nullptr
-              : intermediate_out->mutable_data<T>(ctx.GetPlace())});
-}
-
-template <typename DeviceContext, typename T, typename CompoundFunctor,
-          bool BcastY, bool KeepIntermediateOut,
-          bool SameShapeOfIntermediateOutAndOut>
-void FusedElemwiseAndActComputeWithBroadcast(
-    const framework::ExecutionContext &ctx, const framework::DDim &x_dim,
-    const framework::DDim &y_dim_untrimed, const framework::Tensor &x,
-    const framework::Tensor &y, CompoundFunctor compound_functor, int axis,
-    framework::Tensor *out, framework::Tensor *intermediate_out) {
-  axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
-  auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
-  axis = (y_dim.size() == 0) ? x_dim.size() : axis;
-
-  int pre, n, post;
-  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
-
-  if (post == 1) {
-    int h = pre;
-    int w = n;
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-      FusedElemwiseAndActBroadcast1CUDA<T, CompoundFunctor, BcastY,
-                                        KeepIntermediateOut,
-                                        SameShapeOfIntermediateOutAndOut>(
-          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-          y.data<T>(), compound_functor, h, w,
-          out->mutable_data<T>(ctx.GetPlace()),
-          intermediate_out == nullptr
-              ? nullptr
-              : intermediate_out->mutable_data<T>(ctx.GetPlace()));
-#endif
-    } else {
-      FusedElemwiseAndActBroadcast1CPU<T, CompoundFunctor, BcastY,
-                                       KeepIntermediateOut,
-                                       SameShapeOfIntermediateOutAndOut>(
-          x.data<T>(), y.data<T>(), compound_functor, h, w,
-          out->mutable_data<T>(ctx.GetPlace()),
-          intermediate_out == nullptr
-              ? nullptr
-              : intermediate_out->mutable_data<T>(ctx.GetPlace()));
-    }
-  } else {
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-      FusedElemwiseAndActBroadcast2CUDA<T, CompoundFunctor, BcastY,
-                                        KeepIntermediateOut,
-                                        SameShapeOfIntermediateOutAndOut>(
-          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-          y.data<T>(), pre, n, post, compound_functor,
-          out->mutable_data<T>(ctx.GetPlace()),
-          intermediate_out == nullptr
-              ? nullptr
-              : intermediate_out->mutable_data<T>(ctx.GetPlace()));
-#endif
-    } else {
-      FusedElemwiseAndActBroadcast2CPU<T, CompoundFunctor, BcastY,
-                                       KeepIntermediateOut,
-                                       SameShapeOfIntermediateOutAndOut>(
-          x.data<T>(), y.data<T>(), pre, n, post, compound_functor,
-          out->mutable_data<T>(ctx.GetPlace()),
-          intermediate_out == nullptr
-              ? nullptr
-              : intermediate_out->mutable_data<T>(ctx.GetPlace()));
-    }
-  }
-}
-
-// --- backward
-template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
-          bool UseIntermediateOut>
-struct FusedElemwiseAndActGradNoBroadcast {
-  HOSTDEVICE void operator()(size_t i) {
-    T x_val = x_[i];
-    T y_val = y_[i];
-    T out_val = out_[i];
-    T dout_val = dout_[i];
-    T intermediate_out_val = UseIntermediateOut
-                                 ? intermediate_out_[i]
-                                 : dx_op_.GetIntermediateOut(x_val, y_val);
-    if (dx_ != nullptr) {
-      dx_[i] = dx_op_.UseIntermediateOut(x_val, y_val, intermediate_out_val,
-                                         out_val, dout_val);
-    }
-    if (dy_ != nullptr) {
-      dy_[i] = dy_op_.UseIntermediateOut(x_val, y_val, intermediate_out_val,
-                                         out_val, dout_val);
-    }
-    if (dintermediate_ != nullptr) {
-      dintermediate_[i] = dintermediate_op_.UseIntermediateOut(
-          x_val, intermediate_out_val, out_val, dout_val);
-    }
-  }
-
-  const T *x_;
-  const T *y_;
-  const T *intermediate_out_;
-  const T *out_;
-  const T *dout_;
-  DX_OP dx_op_;
-  DY_OP dy_op_;
-  DIntermediate_OP dintermediate_op_;
-  T *dx_;
-  T *dy_;
-  T *dintermediate_;
-};
-
-template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
-          typename DIntermediate_OP, bool UseIntermediateOut>
-void FusedElemwiseAndActGradComputeNoBroadcast(
-    const framework::ExecutionContext &ctx, const framework::DDim &x_dim,
-    const framework::DDim &y_dim, const framework::Tensor *x,
-    const framework::Tensor *y, const framework::Tensor *intermediate_out,
-    const framework::Tensor *out, const framework::Tensor *dout, int axis,
-    framework::Tensor *dx, framework::Tensor *dy,
-    framework::Tensor *dintermediate, DX_OP dx_op, DY_OP dy_op,
-    DIntermediate_OP dintermediate_op) {
-  size_t N = static_cast<size_t>(framework::product(x_dim));
-  platform::ForRange<DeviceContext> for_range(
-      ctx.template device_context<DeviceContext>(), N);
-  for_range(
-      FusedElemwiseAndActGradNoBroadcast<T, DX_OP, DY_OP, DIntermediate_OP,
-                                         UseIntermediateOut>{
-          x->data<T>(), y->data<T>(),
-          intermediate_out ? intermediate_out->data<T>() : nullptr,
-          out->data<T>(), dout->data<T>(), dx_op, dy_op, dintermediate_op,
-          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-          dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                                   ctx.GetPlace())});
-}
-
-template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
-          bool UseIntermediateOut, bool BcastY,
-          bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActGradBroadcast1CPU(
-    const T *x, const T *y, const T *intermediate_out, const T *out,
-    const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op,
-    DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
-  int64_t tmp_out_idx, x_idx, y_idx;
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; ++j) {
-      int offset = i * w + j;
-
-      tmp_out_idx = BcastY ? j : offset;
-      y_idx = BcastY ? j : offset;
-      x_idx = BcastY ? offset : j;
-
-      if (SameShapeOfIntermediateOutAndOut) {
-        tmp_out_idx = offset;
-      }
-
-      if (dx != nullptr) {
-        T tmp = UseIntermediateOut
-                    ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx],
-                                               intermediate_out[tmp_out_idx],
-                                               out[offset], dout[offset])
-                    : dx_op.Recompute(x[x_idx], y[y_idx], out[offset],
-                                      dout[offset]);
-
-        if (BcastY) {
-          dx[x_idx] = tmp;
-        } else {
-          if (i == 0) {
-            dx[x_idx] = tmp;
-          } else {
-            dx[x_idx] += tmp;
-          }
-        }
-      }
-      if (dy != nullptr) {
-        T tmp = UseIntermediateOut
-                    ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx],
-                                               intermediate_out[tmp_out_idx],
-                                               out[offset], dout[offset])
-                    : dy_op.Recompute(x[x_idx], y[y_idx], out[offset],
-                                      dout[offset]);
-        if (BcastY) {
-          if (i == 0) {
-            dy[y_idx] = tmp;
-          } else {
-            dy[y_idx] += tmp;
-          }
-        } else {
-          dy[y_idx] = tmp;
-        }
-      }
-      if (d_intermediate != nullptr) {
-        T tmp = UseIntermediateOut
-                    ? dintermediate_op.UseIntermediateOut(
-                          x[x_idx], intermediate_out[tmp_out_idx], out[offset],
-                          dout[offset])
-                    : dintermediate_op.Recompute(x[x_idx], y[y_idx],
-                                                 out[offset], dout[i]);
-        if (SameShapeOfIntermediateOutAndOut) {
-          d_intermediate[tmp_out_idx] = tmp;
-        } else {
-          if (i == 0) {
-            d_intermediate[tmp_out_idx] = tmp;
-          } else {
-            d_intermediate[tmp_out_idx] += tmp;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
-          bool UseIntermediateOut, bool BcastY,
-          bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActGradBroadcast2CPU(
-    const T *x, const T *y, const T *intermediate_out, const T *out,
-    const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op,
-    DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
-  int64_t tmp_out_idx, x_idx, y_idx;
-  for (int i = 0; i < pre; ++i) {
-    for (int j = 0; j < n; ++j) {
-      for (int k = 0; k < post; ++k) {
-        int offset = i * n * post + j * post + k;
-
-        tmp_out_idx = BcastY ? j : offset;
-        y_idx = BcastY ? j : offset;
-        x_idx = BcastY ? offset : j;
-
-        if (SameShapeOfIntermediateOutAndOut) {
-          tmp_out_idx = offset;
-        }
-
-        if (dx != nullptr) {
-          T tmp = UseIntermediateOut
-                      ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx],
-                                                 intermediate_out[tmp_out_idx],
-                                                 out[offset], dout[offset])
-                      : dx_op.Recompute(x[x_idx], y[y_idx], out[offset],
-                                        dout[offset]);
-
-          if (BcastY) {
-            dx[x_idx] = tmp;
-          } else {
-            if (i == 0 && k == 0) {
-              dx[x_idx] = tmp;
-            } else {
-              dx[x_idx] += tmp;
-            }
-          }
-        }
-        if (dy != nullptr) {
-          T tmp = UseIntermediateOut
-                      ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx],
-                                                 intermediate_out[tmp_out_idx],
-                                                 out[offset], dout[offset])
-                      : dy_op.Recompute(x[x_idx], y[y_idx], out[offset],
-                                        dout[offset]);
-          if (BcastY) {
-            if (i == 0 && k == 0) {
-              dy[y_idx] = tmp;
-            } else {
-              dy[y_idx] += tmp;
-            }
-          } else {
-            dy[y_idx] = tmp;
-          }
-        }
-        if (d_intermediate != nullptr) {
-          T tmp = UseIntermediateOut
-                      ? dintermediate_op.UseIntermediateOut(
-                            x[x_idx], intermediate_out[tmp_out_idx],
-                            out[offset], dout[offset])
-                      : dintermediate_op.Recompute(x[x_idx], y[y_idx],
-                                                   out[offset], dout[i]);
-          if (SameShapeOfIntermediateOutAndOut) {
-            d_intermediate[tmp_out_idx] = tmp;
-          } else {
-            if (i == 0) {
-              d_intermediate[tmp_out_idx] = tmp;
-            } else {
-              d_intermediate[tmp_out_idx] += tmp;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
-          bool UseIntermediateOut, bool BcastY,
-          bool SameShapeOfIntermediateOutAndOut>
-static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel(
-    const T *x, const T *y, const T *intermediate_out, const T *out,
-    const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op,
-    DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
-  int j = blockIdx.x;
-  int i = threadIdx.x;
-  int tid = threadIdx.x;
-  T val(0), inter_val(0);
-  int64_t tmp_out_idx, x_idx, y_idx;
-
-  do {
-    int offset = i * w + j;
-
-    tmp_out_idx = BcastY ? j : offset;
-    y_idx = BcastY ? j : offset;
-    x_idx = BcastY ? offset : j;
-
-    if (SameShapeOfIntermediateOutAndOut) {
-      tmp_out_idx = offset;
-    }
-
-    if (dx != nullptr) {
-      T tmp =
-          UseIntermediateOut
-              ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx],
-                                         intermediate_out[tmp_out_idx],
-                                         out[offset], dout[offset])
-              : dx_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]);
-
-      if (BcastY) {
-        dx[x_idx] = tmp;
-      } else {
-        val += tmp;
-      }
-    }
-    if (dy != nullptr) {
-      T tmp =
-          UseIntermediateOut
-              ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx],
-                                         intermediate_out[tmp_out_idx],
-                                         out[offset], dout[offset])
-              : dy_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]);
-      if (BcastY) {
-        val += tmp;
-      } else {
-        dy[y_idx] = tmp;
-      }
-    }
-    if (d_intermediate != nullptr) {
-      T tmp = UseIntermediateOut
-                  ? dintermediate_op.UseIntermediateOut(
-                        y[y_idx], intermediate_out[tmp_out_idx], out[offset],
-                        dout[offset])
-                  : dintermediate_op.Recompute(x[x_idx], y[y_idx], out[offset],
-                                               dout[offset]);
-      if (SameShapeOfIntermediateOutAndOut) {
-        d_intermediate[tmp_out_idx] = tmp;
-      } else {
-        inter_val += tmp;
-      }
-    }
-
-    i += ELEMWISE_MAX_BLOCK_DIM;
-  } while (i < h);
-
-  h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-  if (BcastY) {
-    if (dy) {
-      val = paddle::platform::reduceSum(val, tid, h);
-      if (threadIdx.x == 0) {
-        dy[j] = val;
-      }
-    }
-  } else {
-    if (dx) {
-      val = paddle::platform::reduceSum(val, tid, h);
-      if (threadIdx.x == 0) {
-        dx[j] = val;
-      }
-    }
-  }
-  if (!SameShapeOfIntermediateOutAndOut) {
-    if (d_intermediate) {
-      inter_val = paddle::platform::reduceSum(inter_val, tid, h);
-      if (threadIdx.x == 0) {
-        d_intermediate[j] = inter_val;
-      }
-    }
-  }
-}
-
-template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
-          bool UseIntermediateOut, bool BcastY,
-          bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActGradBroadcast1CUDA(
-    cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
-    const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op,
-    DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
-  int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
-  int gird_size = w;
-  FusedElemwiseAndActGradBroadcast1CUDAKernel<
-      T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, BcastY,
-      SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>(
-      x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dintermediate_op,
-      dx, dy, d_intermediate);
-}
-
-template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
-          bool UseIntermediateOut, bool BcastY,
-          bool SameShapeOfIntermediateOutAndOut>
-static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel(
-    const T *x, const T *y, const T *intermediate_out, const T *out,
-    const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op,
-    DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
-  int tid = threadIdx.x;
-  int j = blockIdx.x;
-
-  T val(0), inter_val(0);
-  int ttid = tid;
-  int64_t tmp_out_idx, x_idx, y_idx;
-  while (true) {
-    int i = ttid / post;
-    int k = ttid % post;
-    if (i >= pre) break;
-
-    int offset = i * n * post + j * post + k;
-
-    tmp_out_idx = BcastY ? j : offset;
-    y_idx = BcastY ? j : offset;
-    x_idx = BcastY ? offset : j;
-
-    if (SameShapeOfIntermediateOutAndOut) {
-      tmp_out_idx = offset;
-    }
-
-    if (dx != nullptr) {
-      T tmp =
-          UseIntermediateOut
-              ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx],
-                                         intermediate_out[tmp_out_idx],
-                                         out[offset], dout[offset])
-              : dx_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]);
-
-      if (BcastY) {
-        dx[x_idx] = tmp;
-      } else {
-        val += tmp;
-      }
-    }
-    if (dy != nullptr) {
-      T tmp =
-          UseIntermediateOut
-              ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx],
-                                         intermediate_out[tmp_out_idx],
-                                         out[offset], dout[offset])
-              : dy_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]);
-      if (BcastY) {
-        val += tmp;
-      } else {
-        dy[y_idx] = tmp;
-      }
-    }
-    if (d_intermediate != nullptr) {
-      T tmp = UseIntermediateOut
-                  ? dintermediate_op.UseIntermediateOut(
-                        y[y_idx], intermediate_out[tmp_out_idx], out[offset],
-                        dout[offset])
-                  : dintermediate_op.Recompute(x[x_idx], y[y_idx], out[offset],
-                                               dout[offset]);
-      if (SameShapeOfIntermediateOutAndOut) {
-        d_intermediate[tmp_out_idx] = tmp;
-      } else {
-        inter_val += tmp;
-      }
-    }
-    ttid += ELEMWISE_MAX_BLOCK_DIM;
-  }
-
-  int h = pre * post;
-  h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-  if (BcastY) {
-    if (dy) {
-      val = paddle::platform::reduceSum(val, tid, h);
-      if (threadIdx.x == 0) {
-        dy[j] = val;
-      }
-    }
-  } else {
-    if (dx) {
-      val = paddle::platform::reduceSum(val, tid, h);
-      if (threadIdx.x == 0) {
-        dx[j] = val;
-      }
-    }
-  }
-  if (!SameShapeOfIntermediateOutAndOut) {
-    if (d_intermediate) {
-      inter_val = paddle::platform::reduceSum(inter_val, tid, h);
-      if (threadIdx.x == 0) {
-        d_intermediate[j] = inter_val;
-      }
-    }
-  }
-}
-
-template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
-          bool UseIntermediateOut, bool BcastY,
-          bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActGradBroadcast2CUDA(
-    cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
-    const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op,
-    DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy,
-    T *dintermediate) {
-  int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
-  int gird_size = n;
-  FusedElemwiseAndActGradBroadcast2CUDAKernel<
-      T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, BcastY,
-      SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>(
-      x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op,
-      dintermediate_op, dx, dy, dintermediate);
-}
-#endif
-
-template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
-          typename DIntermediate_OP, bool UseIntermediateOut, bool BcastY,
-          bool SameShapeOfIntermediateOutAndOut>
-void FusedElemwiseAndActGradComputeWithBroadcast(
-    const framework::ExecutionContext &ctx, const framework::DDim &x_dim,
-    const framework::DDim &y_dim_untrimed, const framework::Tensor *x,
-    const framework::Tensor *y, const framework::Tensor *intermediate_out,
-    const framework::Tensor *out, const framework::Tensor *dout, int axis,
-    framework::Tensor *dx, framework::Tensor *dy,
-    framework::Tensor *dintermediate, DX_OP dx_op, DY_OP dy_op,
-    DIntermediate_OP dintermediate_op) {
-  axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
-  auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
-  axis = (y_dim.size() == 0) ? x_dim.size() : axis;
-
-  int pre, n, post;
-  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
-  if (post == 1) {
-    int h = pre;
-    int w = n;
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-      FusedElemwiseAndActGradBroadcast1CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
-                                            UseIntermediateOut, BcastY,
-                                            SameShapeOfIntermediateOutAndOut>(
-          ctx.template device_context<DeviceContext>().stream(), x->data<T>(),
-          y->data<T>(),
-          intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
-          out->data<T>(), dout->data<T>(), h, w, dx_op, dy_op, dintermediate_op,
-          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-          dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                                   ctx.GetPlace()));
-#endif
-    } else {
-      FusedElemwiseAndActGradBroadcast1CPU<T, DX_OP, DY_OP, DIntermediate_OP,
-                                           UseIntermediateOut, BcastY,
-                                           SameShapeOfIntermediateOutAndOut>(
-          x->data<T>(), y->data<T>(),
-          intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
-          out->data<T>(), dout->data<T>(), h, w, dx_op, dy_op, dintermediate_op,
-          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-          dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                                   ctx.GetPlace()));
-    }
-  } else {
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-      FusedElemwiseAndActGradBroadcast2CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
-                                            UseIntermediateOut, BcastY,
-                                            SameShapeOfIntermediateOutAndOut>(
-          ctx.template device_context<DeviceContext>().stream(), x->data<T>(),
-          y->data<T>(),
-          intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
-          out->data<T>(), dout->data<T>(), pre, n, post, dx_op, dy_op,
-          dintermediate_op,
-          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-          dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                                   ctx.GetPlace()));
-#endif
-    } else {
-      FusedElemwiseAndActGradBroadcast2CPU<T, DX_OP, DY_OP, DIntermediate_OP,
-                                           UseIntermediateOut, BcastY,
-                                           SameShapeOfIntermediateOutAndOut>(
-          x->data<T>(), y->data<T>(),
-          intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
-          out->data<T>(), dout->data<T>(), pre, n, post, dx_op, dy_op,
-          dintermediate_op,
-          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-          dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                                   ctx.GetPlace()));
-    }
-  }
-}
-
-template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
-          typename DIntermediate_OP, bool UseIntermediateOut,
-          bool SameShapeOfIntermediateOutAndOut>
-void FusedElemwiseAndActGradComputeEx(
-    const framework::ExecutionContext &ctx, const framework::Tensor *x,
-    const framework::Tensor *y, const framework::Tensor *out,
-    const framework::Tensor *intermediate_out, const framework::Tensor *dout,
-    int axis, framework::Tensor *dx, framework::Tensor *dy,
-    framework::Tensor *dintermediate, DX_OP dx_op, DY_OP dy_op,
-    DIntermediate_OP dintermediate_op) {
-  const framework::DDim &x_dim = x->dims();
-  const framework::DDim &y_dim = y->dims();
-  if (UseIntermediateOut) {
-    PADDLE_ENFORCE(intermediate_out, "intermediate_out should not be nullptr");
-  }
-  if (x_dim == y_dim) {
-    FusedElemwiseAndActGradComputeNoBroadcast<
-        DeviceContext, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut>(
-        ctx, x_dim, y_dim, x, y, intermediate_out, out, dout, axis, dx, dy,
-        dintermediate, dx_op, dy_op, dintermediate_op);
-  } else {  // Y is a scalar
-    bool bcast_y = x_dim.size() >= y_dim.size();
-    if (x_dim.size() == y_dim.size()) {
-      for (int i = 0; i < x_dim.size(); ++i) {
-        if (x_dim[i] < y_dim[i]) {
-          bcast_y = false;
-          break;
-        }
-      }
-    }
-
-    // z = f1(x, f2(y))
-    // z = f1(f2(x, y))
-    if (bcast_y) {  // Y should be broadcast.
-      FusedElemwiseAndActGradComputeWithBroadcast<
-          DeviceContext, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut,
-          true /*BcastY*/, SameShapeOfIntermediateOutAndOut>(
-          ctx, x_dim, y_dim, x, y, intermediate_out, out, dout, axis, dx, dy,
-          dintermediate, dx_op, dy_op, dintermediate_op);
-    } else {
-      FusedElemwiseAndActGradComputeWithBroadcast<
-          DeviceContext, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut,
-          false /*BcastY*/, SameShapeOfIntermediateOutAndOut>(
-          ctx, y_dim, x_dim, x, y, intermediate_out, out, dout, axis, dx, dy,
-          dintermediate, dx_op, dy_op, dintermediate_op);
-    }
-  }
-}
-
-template <typename DeviceContext, typename T, typename CompoundFunctor,
-          bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx,
-                                  const framework::Tensor &x,
-                                  const framework::Tensor &y, int axis,
-                                  CompoundFunctor compound_functor,
-                                  framework::Tensor *out,
-                                  framework::Tensor *intermediate_out) {
-  if (KeepIntermediateOut) {
-    PADDLE_ENFORCE(intermediate_out,
-                   "The save_intermediate_out is opened, "
-                   "intermediate_out should not be nullptr.");
-  }
-
-  const framework::DDim &x_dim = x.dims();
-  const framework::DDim &y_dim = y.dims();
-  if (x.dims() == y.dims()) {
-    FusedElemwiseAndActComputeNoBroadcast<DeviceContext, T, CompoundFunctor,
-                                          KeepIntermediateOut>(
-        ctx, x_dim, x, y, compound_functor, out, intermediate_out);
-  } else {
-    // Whether the shape of Y is a continuous subsequence of X,
-    // For more information please refer to the op's introduction.
-    bool bcast_y = x.dims().size() >= y.dims().size();
-    if (x.dims().size() == y.dims().size()) {
-      for (int i = 0; i < x.dims().size(); ++i) {
-        if (x.dims()[i] < y.dims()[i]) {
-          bcast_y = false;
-          break;
-        }
-      }
-    }
-
-    // z = f1(x, f2(y))
-    // z = f1(f2(x, y))
-    if (bcast_y) {  // Y should be broadcast.
-      // In this case,
-      // for 'f2(y)', the shape of intermediate_out should be equal to the
-      // shape
-      // of Y.
-      // for 'f2(x, y)', the shape of intermediate_out should be equal to the
-      // shape of Out.
-      // the shape of Out should be equal to the shape of X.
-      FusedElemwiseAndActComputeWithBroadcast<
-          DeviceContext, T, CompoundFunctor, true /*BcastY*/,
-          KeepIntermediateOut, SameShapeOfIntermediateOutAndOut>(
-          ctx, x_dim /*OutShape*/, y_dim, x, y, compound_functor, axis, out,
-          intermediate_out);
-    } else {
-      // In this case,
-      // for 'f2(y)', the shape of intermediate_out should be equal to the
-      // shape
-      // of Out.
-      // for 'f2(x, y)', the shape of intermediate_out should be equal to the
-      // shape of Out.
-      // the shape of Out should be equal to the shape of Y.
-      FusedElemwiseAndActComputeWithBroadcast<
-          DeviceContext, T, CompoundFunctor, false /*BcastY*/,
-          KeepIntermediateOut, SameShapeOfIntermediateOutAndOut>(
-          ctx, y_dim /*OutShape*/, x_dim, x, y, compound_functor, axis, out,
-          intermediate_out);
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-static inline void GetDoubleGradSafeTensor(
-    const framework::ExecutionContext &ctx, const framework::Tensor *x,
-    const framework::Tensor *ddx, framework::Tensor *ddx_safe) {
-  if (ddx) {
-    *ddx_safe = *ddx;
-  } else {
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    *ddx_safe = ctx.AllocateTmpTensor<T, DeviceContext>(x->dims(), dev_ctx);
-    math::SetConstant<DeviceContext, T> set_zero;
-    set_zero(ctx.template device_context<DeviceContext>(), ddx_safe,
-             static_cast<T>(0));
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
deleted file mode 100644
index 59ec9a2d4a5dd6751f5ea5c6124f49b2e99d057e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ElementwisePowOpGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("elementwise_pow_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Y", Input("Y"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetAttrMap(Attrs());
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    return op;
-  }
-};
-class ElementwisePowOpMaker : public ElementwiseOpMaker {
- protected:
-  std::string GetName() const override { return "Pow"; }
-  std::string GetEquation() const override { return "Out = X ^ Y"; }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_pow, ops::ElementwiseOp,
-                  ops::ElementwisePowOpMaker, ops::ElementwiseOpInferVarType,
-                  ops::ElementwisePowOpGradDescMaker);
-REGISTER_OPERATOR(elementwise_pow_grad, ops::ElementwiseOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_pow,
-    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_pow_grad,
-    ops::ElementwisePowGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwisePowGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwisePowGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwisePowGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
deleted file mode 100644
index 320d1e7b38da8e4f77015ef2b7bcc73e5db7675f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_pow,
-    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_pow_grad,
-    ops::ElementwisePowGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwisePowGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwisePowGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwisePowGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
deleted file mode 100644
index 1363485ced4e12bd7c67c04037ea3a2cd27b0e54..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cmath>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct PowFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwisePowKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::LoDTensor;
-    auto* x = ctx.Input<Tensor>("X");
-    PADDLE_ENFORCE(x != nullptr,
-                   "Cannot get input Variable X, variable name = %s",
-                   ctx.op().Input("X"));
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
-    z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          PowFunctor<T>(), z);
-  }
-};
-
-template <typename T>
-struct PowGradDX {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * y * std::pow(x, y - 1);
-  }
-};
-
-template <typename T>
-struct PowGradDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * std::log(x) * std::pow(x, y);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwisePowGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out = dout;
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, PowGradDX<T>, PowGradDY<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, PowGradDX<T>(), PowGradDY<T>());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
deleted file mode 100644
index b3003092c76fe97fadedaaeab317fbd4364beafb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ElementwiseSubDoubleGradDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("elementwise_sub_grad_grad");
-    op->SetInput("Y", Input("Y"));
-    op->SetInput("DOut", Input(framework::GradVarName("Out")));
-    op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-    op->SetInput("DDY", OutputGrad(framework::GradVarName("Y")));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
-REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_sub, "Sub",
-                                           "Out = X - Y");
-
-REGISTER_OPERATOR(elementwise_sub_grad, ops::ElementwiseOpExplicitGrad,
-                  ops::ElementwiseGradOpInplace,
-                  ops::ElementwiseGradNoBufVarsInference,
-                  ops::ElementwiseSubDoubleGradDescMaker);
-REGISTER_OPERATOR(elementwise_sub_grad_grad,
-                  ops::ElementwiseOpDoubleGradWithoutDXDY,
-                  ops::ElementwiseDoubleGradOpInplace,
-                  ops::ElementwiseDoubleGradNoBufVarsInference);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_sub,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_sub_grad,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_sub_grad_grad,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
deleted file mode 100644
index 52fad7fd04b0083c81089899d4dab80853441ca7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_sub,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::float16>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_sub_grad,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::float16>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_sub_grad_grad,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        float>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        double>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        int>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
deleted file mode 100644
index 5049d587b582a71981f45a72dc5bfc133dadb52d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct SubFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseSubKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          SubFunctor<T>(), z);
-  }
-};
-
-template <typename T>
-struct SubGradDX {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
-};
-
-template <typename T>
-struct SubGradDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseSubGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-    // skip out, x, y
-    auto* out = dout;
-    auto *x = dout, *y = dout;
-
-    ElemwiseExplicitGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseSubDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>("DOut");
-    auto* ddx = ctx.Input<Tensor>("DDX");
-    auto* ddy = ctx.Input<Tensor>("DDY");
-
-    auto* ddout = ctx.Output<Tensor>("DDOut");
-
-    // DDOut = ddx - ddy
-    if (ddout) {
-      Tensor ddx_safe, ddy_safe;
-      GetDoubleGradSafeTensor<DeviceContext, T>(ctx, dout, ddx, &ddx_safe);
-      GetDoubleGradSafeTensor<DeviceContext, T>(ctx, y, ddy, &ddy_safe);
-
-      ddout->mutable_data<T>(ctx.GetPlace());
-      int axis = ctx.Attr<int>("axis");
-      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, &ddx_safe, &ddy_safe, axis, SubFunctor<T>(), ddout);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
deleted file mode 100644
index 97b1f3831c0b96f57bb81eabc493bb143268403c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-using framework::Tensor;
-using mkldnn::memory;
-using mkldnn::reorder;
-using mkldnn::primitive;
-using mkldnn::stream;
-using mkldnn::sum;
-
-template <typename T>
-class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
-    const T* x_data = x->data<T>();
-    const T* y_data = y->data<T>();
-    T* z_data = z->mutable_data<T>(ctx.GetPlace());
-
-    int axis = ctx.Attr<int>("axis");
-
-    auto x_dims = x->dims();
-    auto y_dims_untrimed = y->dims();
-    auto z_dims = z->dims();
-
-    // Execute default elementwise_add operator when
-    // broadcast operations need to performed.
-    if (x_dims != y_dims_untrimed) {
-      Tensor _x;
-      MKLDNNMemoryFormat format;
-      std::vector<int> src_x_tz = framework::vectorize<int>(x_dims);
-
-      if ((src_x_tz.size() == 3 &&
-           x->format() != (format = MKLDNNMemoryFormat::ncw)) ||
-          (src_x_tz.size() == 4 &&
-           x->format() != (format = MKLDNNMemoryFormat::nchw)) ||
-          (src_x_tz.size() == 5 &&
-           x->format() != (format = MKLDNNMemoryFormat::ncdhw))) {
-        _x.Resize(x_dims);
-
-        mkldnn::memory::data_type in_type = platform::MKLDNNGetDataType<T>();
-        auto out_format = platform::MKLDNNFormatForSize(
-            x_dims.size(), MKLDNNMemoryFormat::nchw);
-
-        const std::string key = platform::CreateKey(
-            src_x_tz, x->format(), out_format, std::to_string(in_type));
-
-        platform::ReorderMKLDNNHandler handler(src_x_tz, x->type(), in_type,
-                                               dev_ctx, mkldnn_engine, key);
-
-        auto user_x_memory_p = handler.AcquireSrcMemory(
-            x->format(), paddle::platform::to_void_cast(x_data));
-
-        auto x_memory_p =
-            handler.AcquireDstMemory(&_x, out_format, ctx.GetPlace());
-
-        auto x_reorder = handler.AcquireReorder(x_memory_p, user_x_memory_p);
-
-        std::vector<primitive> pipeline;
-        pipeline.push_back(*x_reorder);
-        stream(stream::kind::eager).submit(pipeline).wait();
-      } else {
-        format = x->format();
-        _x.ShareDataWith(*x);
-      }
-
-      auto sum_func = [](T a, T b) -> T { return a + b; };
-
-      TransformFunctor<decltype(sum_func), T,
-                       paddle::platform::CPUDeviceContext, T>
-          functor(
-              &_x, y, z,
-              ctx.template device_context<paddle::platform::CPUDeviceContext>(),
-              sum_func);
-
-      axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
-      PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-                     "Axis should be in range [0, x_dims)");
-
-      auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
-      axis = (y_dims.size() == 0) ? x_dims.size() : axis;
-
-      int pre, n, post;
-      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
-
-      if (post == 1) {
-        functor.RunRowWise(n, pre);
-      } else {
-        functor.RunMidWise(n, pre, post);
-      }
-      z->set_layout(DataLayout::kMKLDNN);
-      z->set_format(format);
-    } else {
-      PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for X tensor");
-      PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for X tensor");
-
-      PADDLE_ENFORCE_EQ(y->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for Y tensor");
-      PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for Y tensor");
-
-      std::vector<int> src_x_tz = framework::vectorize<int>(x_dims);
-      std::vector<int> src_y_tz = framework::vectorize<int>(y_dims_untrimed);
-      std::vector<int> dst_tz = framework::vectorize<int>(z_dims);
-
-      std::vector<memory::primitive_desc> srcs_pd;
-      std::vector<float> scales = {1.0f, 1.0f};
-
-      const std::string key = platform::CreateKey(
-          src_x_tz, ctx.op().Output("Out") + std::to_string(x->format()) +
-                        std::to_string(y->format()));
-
-      platform::SumMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
-
-      auto src_x_memory = handler.AcquireSrcMemory(
-          {{src_x_tz}, platform::MKLDNNGetDataType<T>(), x->format()},
-          paddle::platform::to_void_cast(x_data));
-
-      auto src_y_memory = handler.AcquireSecondSrcMemory(
-          {{src_y_tz}, platform::MKLDNNGetDataType<T>(), y->format()},
-          paddle::platform::to_void_cast(y_data));
-
-      auto dst_md = memory::desc({dst_tz}, platform::MKLDNNGetDataType<T>(),
-                                 MKLDNNMemoryFormat::any);
-
-      auto sum_pd = handler.AcquireSumPrimitiveDescriptor(
-          {src_x_memory, src_y_memory}, scales, dst_md);
-
-      auto dst_memory = handler.AcquireDstMemoryFromPrimitive(z_data);
-
-      std::vector<primitive::at> inputs({*src_x_memory, *src_y_memory});
-
-      auto sum_prim = handler.AcquireSum(dst_memory, &inputs);
-
-      std::vector<primitive> pipeline;
-      pipeline.push_back(*sum_prim);
-      stream(stream::kind::eager).submit(pipeline).wait();
-
-      z->set_layout(DataLayout::kMKLDNN);
-      z->set_format((MKLDNNMemoryFormat)dst_memory->get_primitive_desc()
-                        .desc()
-                        .data.format);
-    }
-  }
-};
-
-template <typename T>
-class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-    // skip out, x, y,
-    // dout length is larger or equal than dx, dy.
-    auto* out = dout;
-    auto *x = dout, *y = dout;
-
-    auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
-      in->set_layout(DataLayout::kMKLDNN);
-      in->set_format(out->format());
-    };
-
-    if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
-      if (dx->dims() == dy->dims()) {
-        auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
-        if (dx) {
-          blas.VCOPY(dout->numel(), dout->data<T>(),
-                     dx->mutable_data<T>(ctx.GetPlace()));
-          set_mkldnn_format(dx, dout);
-        }
-
-        if (dy) {
-          blas.VCOPY(dout->numel(), dout->data<T>(),
-                     dy->mutable_data<T>(ctx.GetPlace()));
-          set_mkldnn_format(dy, dout);
-        }
-      }
-    } else {
-      // Execute default kernel when broadcast is needed
-      ElemwiseExplicitGradCompute<paddle::platform::CPUDeviceContext, T,
-                                  IdentityGrad<T>, IdentityGrad<T>>(
-          ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
-          IdentityGrad<T>());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::EltwiseAddMKLDNNKernel<float>)
-
-REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::EltwiseAddMKLDNNGradKernel<float>)
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
deleted file mode 100644
index cc723844278687c43faa3bd312acec193f550e02..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ /dev/null
@@ -1,190 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <mkldnn/include/mkldnn.hpp>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-
-#ifdef PADDLE_WITH_XBYAK
-#include "xbyak/xbyak.h"
-#include "xbyak/xbyak_util.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-using mkldnn::memory;
-using platform::StringToMKLDNNFormat;
-
-static void UpdateDataFormat(const framework::ExecutionContext& ctx,
-                             framework::Tensor* tensor, const char* attribute) {
-  if (ctx.op().HasAttr(attribute)) {
-    auto format_as_string = ctx.Attr<std::string>(attribute);
-    auto format = StringToMKLDNNFormat(&format_as_string);
-    if (format != MKLDNNMemoryFormat::any) {
-      tensor->set_format(format);
-    }
-  }
-}
-
-template <typename T>
-static void ReorderInput(framework::Tensor* tensor,
-                         const platform::Place& place,
-                         const mkldnn::engine& engine, bool isFourDim) {
-  using platform::to_void_cast;
-  auto dims = paddle::framework::vectorize<int>(tensor->dims());
-  framework::Tensor out_tensor;
-  out_tensor.Resize(tensor->dims());
-  out_tensor.set_format(isFourDim ? MKLDNNMemoryFormat::nchw
-                                  : MKLDNNMemoryFormat::nc);
-  out_tensor.set_layout(tensor->layout());
-  mkldnn::memory input_memory = {
-      {{dims, platform::MKLDNNGetDataType<T>(), tensor->format()}, engine},
-      to_void_cast<T>(tensor->data<T>())};
-  mkldnn::memory output_memory = {
-      {{dims, platform::MKLDNNGetDataType<T>(), out_tensor.format()}, engine},
-      to_void_cast<T>(out_tensor.mutable_data<T>(place))};
-  platform::Reorder(input_memory, output_memory);
-  tensor->ShareDataWith(out_tensor);
-}
-
-template <typename T>
-class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    int axis = ctx.Attr<int>("axis");
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
-    const T* x_data = x->data<T>();
-    const T* y_data = y->data<T>();
-    T* z_data = z->mutable_data<T>(ctx.GetPlace());
-
-    auto x_dims = x->dims();
-    auto y_dims_untrimmed = y->dims();
-    auto x_int_dims = paddle::framework::vectorize<int>(x_dims);
-
-    UpdateDataFormat(ctx, const_cast<Tensor*>(x), "x_data_format");
-    UpdateDataFormat(ctx, const_cast<Tensor*>(y), "y_data_format");
-
-    const bool is_avx512_enabled = platform::MayIUse(platform::avx512f);
-    const bool are_dims_divisable = !(x_int_dims[1] % 16);
-    const bool is_x_format_correct = x->format() == MKLDNNMemoryFormat::nChw16c;
-    const bool is_y_format_correct = y->format() == MKLDNNMemoryFormat::nc;
-    if (is_x_format_correct && is_y_format_correct && are_dims_divisable &&
-        is_avx512_enabled) {
-      int pre, n, post;
-      get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post);
-
-      if (post == 1) {
-        PADDLE_THROW("Not implemented when post is 1");
-      } else {
-        // Just check whether it works for RE-Resnext.
-        PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions");
-
-        int n = x_dims[0];
-        int c = x_dims[1];
-        int h = x_dims[2];
-        int w = x_dims[3];
-
-        PADDLE_ENFORCE(y_dims_untrimmed[0] == n && y_dims_untrimmed[1] == c,
-                       "Y should be in nc format");
-
-        constexpr int simd_width = 16;
-        int C = c / simd_width;
-
-        auto multiply = jit::KernelFuncs<jit::NCHW16CMulNCTuple<T>,
-                                         platform::CPUPlace>::Cache()
-                            .At(0);
-#pragma omp parallel for collapse(2)
-        for (int ni = 0; ni < n; ni++) {
-          for (int ci = 0; ci < C; ci++) {
-            auto ptr_x =
-                x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-
-            auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
-            auto ptr_z =
-                z_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-
-            multiply(ptr_x, ptr_y, ptr_z, h, w);
-          }
-        }
-      }
-
-      z->set_layout(DataLayout::kMKLDNN);
-      z->set_format(x->format());
-    } else {
-      // Fallback to naive version:
-      const bool are_inputs_in_same_format = x->format() == y->format();
-      const bool is_x_nchw = x->format() == MKLDNNMemoryFormat::nchw;
-      const bool is_x_nc = x->format() == MKLDNNMemoryFormat::nc;
-      const bool is_x_x = x->format() == MKLDNNMemoryFormat::x;
-      const bool is_y_nchw = y->format() == MKLDNNMemoryFormat::nchw;
-      const bool is_y_nc = y->format() == MKLDNNMemoryFormat::nc;
-      const bool is_y_x = y->format() == MKLDNNMemoryFormat::x;
-      if (!are_inputs_in_same_format) {
-        using platform::MKLDNNDeviceContext;
-        auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-        const auto& mkldnn_engine = dev_ctx.GetEngine();
-        if (!(is_x_nchw || is_x_nc || is_x_x))
-          ReorderInput<T>(const_cast<Tensor*>(x), ctx.GetPlace(), mkldnn_engine,
-                          x->dims().size() == 4);
-        if (!(is_y_nchw || is_y_nc || is_y_x))
-          ReorderInput<T>(const_cast<Tensor*>(y), ctx.GetPlace(), mkldnn_engine,
-                          y->dims().size() == 4);
-      }
-
-      auto mul_func = [](T a, T b) -> T { return a * b; };
-
-      TransformFunctor<decltype(mul_func), T,
-                       paddle::platform::CPUDeviceContext, T>
-          functor(
-              x, y, z,
-              ctx.template device_context<paddle::platform::CPUDeviceContext>(),
-              mul_func);
-
-      axis = (axis == -1 ? x_dims.size() - y_dims_untrimmed.size() : axis);
-      PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-                     "Axis should be in range [0, x_dims)");
-
-      auto y_dims = trim_trailing_singular_dims(y_dims_untrimmed);
-      axis = (y_dims.size() == 0) ? x_dims.size() : axis;
-
-      int pre, n, post;
-      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
-
-      if (post == 1) {
-        functor.RunRowWise(n, pre);
-      } else {
-        functor.RunMidWise(n, pre, post);
-      }
-      z->set_layout(DataLayout::kMKLDNN);
-      z->set_format(x->format());
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(elementwise_mul, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ElementwiseMulMKLDNNKernel<float>)
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
deleted file mode 100644
index 532084f4929877dcc2cd02038c05bfe3b9bca281..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <cstdlib>
-#include <memory>
-#include <random>
-#include <string>
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
-USE_OP(elementwise_add);
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class TestElementwiseAddGradGradWithoutDout
-    : public TestElementwiseOpGradGrad<T> {
- public:
-  TestElementwiseAddGradGradWithoutDout(const platform::Place &place,
-                                        const framework::DDim &dims)
-      : TestElementwiseOpGradGrad<T>("elementwise_add_grad_grad", place, dims,
-                                     {"Y", "DOut", "DDY"}, {"DDOut"}) {}
-
-  using TestElementwiseOpGradGrad<T>::feed_datas_;
-  using TestElementwiseOpGradGrad<T>::expected_outs_;
-  using TestElementwiseOpGradGrad<T>::dims_;
-  void ComputeExpectedOuts() override {
-    size_t numel = static_cast<size_t>(framework::product(dims_));
-    std::vector<T> dy(numel);
-    std::vector<T> ddout(numel);
-    for (size_t i = 0; i < numel; ++i) {
-      // ddOut = ddX + ddY = ddY if ddX empty
-      ddout[i] = feed_datas_["DDY"][i];
-    }
-    expected_outs_["DDOut"] = ddout;
-  }
-
-  std::unique_ptr<framework::OperatorBase> CreateTestOp() override {
-    auto op = framework::OpRegistry::CreateOp(
-        this->op_type_, {{"Y", {"Y"}}, {"DOut", {"DOut"}}, {"DDY", {"DDY"}}},
-        {{"DDOut", {"DDOut"}}}, {{"use_mkldnn", false}, {"axis", 0}});
-    return op;
-  }
-};
-
-TEST(test_elementwise_add_grad_grad_without_ddx, cpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CPUPlace p;
-  TestElementwiseAddGradGradWithoutDout<float> test(p, dims);
-  ASSERT_TRUE(test.Check());
-}
-#ifdef PADDLE_WITH_CUDA
-TEST(test_elementwise_add_grad_grad_without_ddx, gpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CUDAPlace p(0);
-  TestElementwiseAddGradGradWithoutDout<float> test(p, dims);
-  ASSERT_TRUE(test.Check());
-}
-#endif
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
deleted file mode 100644
index b8163169734bd2c64412bab7286aca9cc5e1b830..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <cstdlib>
-#include <memory>
-#include <random>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
-USE_OP(elementwise_add);
-
-namespace paddle {
-namespace operators {
-
-static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) {
-  if (copy_to_gpu) {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice));
-#else
-    PADDLE_THROW("Not compiled with cuda");
-#endif
-  } else {
-    std::memcpy(dst, src, n);
-  }
-}
-
-template <typename T>
-bool TestMain(const platform::Place &place, const framework::DDim &dims,
-              bool inplace) {
-  framework::Scope scope;
-  auto *x = scope.Var("x")->GetMutable<framework::LoDTensor>();
-  auto *y = scope.Var("y")->GetMutable<framework::LoDTensor>();
-  auto *z = scope.Var("z")->GetMutable<framework::LoDTensor>();
-
-  x->Resize(dims);
-  y->Resize(dims);
-  z->Resize(dims);
-
-  size_t numel = static_cast<size_t>(framework::product(dims));
-
-  auto x_ptr = x->mutable_data<T>(place);
-  auto y_ptr = y->mutable_data<T>(place);
-  auto z_ptr = z->mutable_data<T>(place);
-
-  std::uniform_real_distribution<T> dist(static_cast<T>(10.0),
-                                         static_cast<T>(20.0));
-  std::mt19937 engine;
-  std::vector<T> x_data(numel), y_data(numel), z_data(numel);
-  std::vector<T> sum_result(numel);
-
-  for (size_t i = 0; i < numel; ++i) {
-    x_data[i] = dist(engine);
-    y_data[i] = dist(engine);
-    sum_result[i] = x_data[i] + y_data[i];
-    z_data[i] = -1.0;  // set some data that is not existed
-  }
-
-  auto bytes = sizeof(T) * numel;
-  bool is_gpu_place = platform::is_gpu_place(place);
-  Memcpy(x_ptr, x_data.data(), bytes, is_gpu_place);
-  Memcpy(y_ptr, y_data.data(), bytes, is_gpu_place);
-  Memcpy(z_ptr, z_data.data(), bytes, is_gpu_place);
-
-  const char *out_name = inplace ? "x" : "z";
-  auto op = framework::OpRegistry::CreateOp("elementwise_add",
-                                            {{"X", {"x"}}, {"Y", {"y"}}},
-                                            {{"Out", {out_name}}}, {});
-  op->Run(scope, place);
-  platform::DeviceContextPool::Instance().Get(place)->Wait();
-
-  framework::LoDTensor cpu_out;
-  auto &out_tensor = scope.FindVar(out_name)->Get<framework::LoDTensor>();
-  PADDLE_ENFORCE(scope.kids().empty());
-  if (inplace) {
-    PADDLE_ENFORCE_EQ(&out_tensor, x);
-  } else {
-    PADDLE_ENFORCE_EQ(&out_tensor, z);
-  }
-
-  if (is_gpu_place) {
-    framework::TensorCopySync(out_tensor, platform::CPUPlace(), &cpu_out);
-  } else {
-    cpu_out = out_tensor;
-  }
-
-  auto *out_ptr = cpu_out.data<T>();
-  bool is_equal = std::equal(out_ptr, out_ptr + numel, sum_result.data());
-  return is_equal;
-}
-
-TEST(test_elementwise_add_inplace, cpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CPUPlace p;
-  ASSERT_TRUE(TestMain<float>(p, dims, true));
-}
-
-TEST(test_elementwise_add_not_inplace, cpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CPUPlace p;
-  ASSERT_TRUE(TestMain<float>(p, dims, false));
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(test_elementwise_add_inplace, gpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CUDAPlace p(0);
-  ASSERT_TRUE(TestMain<float>(p, dims, true));
-}
-
-TEST(test_elementwise_add_not_inplace, gpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CUDAPlace p(0);
-  ASSERT_TRUE(TestMain<float>(p, dims, false));
-}
-#endif
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc
deleted file mode 100644
index e1f893dd2b8ae3a2abe687a0f3448c58dcfb486e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <cstdlib>
-#include <memory>
-#include <random>
-#include <string>
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
-USE_OP(elementwise_div);
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class TestElementwiseDivGradGradWithoutDout
-    : public TestElementwiseOpGradGrad<T> {
- public:
-  TestElementwiseDivGradGradWithoutDout(const platform::Place &place,
-                                        const framework::DDim &dims)
-      : TestElementwiseOpGradGrad<T>("elementwise_div_grad_grad", place, dims,
-                                     {"Y", "Out", "DDX", "DDY", "DX"},
-                                     {"Y@GRAD", "DDOut"}) {}
-
-  using TestElementwiseOpGradGrad<T>::feed_datas_;
-  using TestElementwiseOpGradGrad<T>::expected_outs_;
-  using TestElementwiseOpGradGrad<T>::dims_;
-  void ComputeExpectedOuts() override {
-    size_t numel = static_cast<size_t>(framework::product(dims_));
-    std::vector<T> dy(numel);
-    std::vector<T> ddout(numel);
-    for (size_t i = 0; i < numel; ++i) {
-      // dY(Y@GRAD) = Out * dX * ddY / Y - dX * ddX / Y
-      dy[i] = (feed_datas_["DX"][i] / feed_datas_["Y"][i]) *
-              (feed_datas_["Out"][i] * feed_datas_["DDY"][i] -
-               feed_datas_["DDX"][i]);
-      // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
-      ddout[i] = (feed_datas_["DDX"][i] -
-                  feed_datas_["Out"][i] * feed_datas_["DDY"][i]) /
-                 (feed_datas_["Y"][i]);
-    }
-    expected_outs_["Y@GRAD"] = dy;
-    expected_outs_["DDOut"] = ddout;
-  }
-
-  std::unique_ptr<framework::OperatorBase> CreateTestOp() override {
-    auto op = framework::OpRegistry::CreateOp(
-        this->op_type_, {{"Y", {"Y"}},
-                         {"Out", {"Out"}},
-                         {"DDX", {"DDX"}},
-                         {"DDY", {"DDY"}},
-                         {"DX", {"DX"}}},
-        {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}},
-        {{"use_mkldnn", false}, {"axis", 0}});
-    return op;
-  }
-};
-
-TEST(test_elementwise_div_grad_grad_without_dout, cpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CPUPlace p;
-  TestElementwiseDivGradGradWithoutDout<float> test(p, dims);
-  ASSERT_TRUE(test.Check());
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(test_elementwise_div_grad_grad_without_dout, gpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CUDAPlace p(0);
-  TestElementwiseDivGradGradWithoutDout<float> test(p, dims);
-  ASSERT_TRUE(test.Check());
-}
-#endif
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
deleted file mode 100644
index c7ce5142c09f093f35ecf48ca409f1226ae268ed..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstdlib>
-#include <map>
-#include <memory>
-#include <random>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-
-// currently, this test class only support same dims
-template <typename T>
-class TestElementwiseOpGradGrad {
- public:
-  TestElementwiseOpGradGrad(const std::string &op_type,
-                            const platform::Place &place,
-                            const framework::DDim &dims,
-                            const std::vector<std::string> &inputs,
-                            const std::vector<std::string> &outputs)
-      : op_type_(op_type),
-        place_(place),
-        dims_(dims),
-        inputs_(inputs),
-        outputs_(outputs) {}
-
-  void InitVarInScope(std::string var_name) {
-    in_out_tensors_[var_name] =
-        scope_.Var(var_name)->template GetMutable<framework::LoDTensor>();
-    in_out_tensors_[var_name]->Resize(dims_);
-    in_out_tensors_[var_name]->template mutable_data<T>(place_);
-  }
-
-  void InitFeedData(std::string var_name, size_t size) {
-    // generate random data
-    std::uniform_real_distribution<T> dist(static_cast<T>(10.0),
-                                           static_cast<T>(20.0));
-    std::mt19937 engine;
-    std::vector<T> data(size);
-    for (size_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
-    }
-    feed_datas_[var_name] = data;
-  }
-
-  void Setup() {
-    size_t numel = static_cast<size_t>(framework::product(dims_));
-    // init vars in scope and feed inputs
-    for (auto in_name : inputs_) {
-      InitVarInScope(in_name);
-      InitFeedData(in_name, numel);
-    }
-    for (auto out_name : outputs_) {
-      InitVarInScope(out_name);
-    }
-
-    // feeding: copy data to tensor, out tensor don't need init
-    auto bytes = sizeof(T) * numel;
-    for (auto &in_name : inputs_) {
-      auto dst = in_out_tensors_[in_name]->template data<T>();
-      auto src = feed_datas_[in_name].data();
-      auto src_place = platform::CPUPlace();
-      if (platform::is_cpu_place(place_)) {
-        auto dst_place = boost::get<platform::CPUPlace>(place_);
-        memory::Copy(dst_place, dst, src_place, src, bytes);
-      } else if (platform::is_gpu_place(place_)) {
-#ifdef PADDLE_WITH_CUDA
-        auto dst_place = boost::get<platform::CUDAPlace>(place_);
-        memory::Copy(dst_place, dst, src_place, src, bytes, nullptr);
-#else
-        PADDLE_THROW("Not compiled with cuda");
-#endif
-      }
-    }
-
-    // calculate expected outputs
-    ComputeExpectedOuts();
-  }
-
-  bool Check() {
-    Setup();
-    auto op = CreateTestOp();
-    op->Run(scope_, place_);
-    platform::DeviceContextPool::Instance().Get(place_)->Wait();
-    framework::LoDTensor cpu_out;
-    PADDLE_ENFORCE_EQ(scope_.kids().empty(), true, "scope has child scopes");
-
-    // get outputs from scope and compare them with expected_outs
-    bool all_equal = true;
-    for (auto &out_name : outputs_) {
-      auto &out_tensor =
-          scope_.FindVar(out_name)->template Get<framework::LoDTensor>();
-      if (platform::is_gpu_place(place_)) {
-        framework::TensorCopySync(out_tensor, platform::CPUPlace(), &cpu_out);
-      } else {
-        cpu_out = out_tensor;
-      }
-      auto *out_ptr = cpu_out.data<T>();
-      size_t numel = static_cast<size_t>(framework::product(dims_));
-      auto is_equal =
-          std::equal(out_ptr, out_ptr + numel, expected_outs_[out_name].data());
-      if (!is_equal) {
-        all_equal = false;
-        break;
-      }
-    }
-    return all_equal;
-  }
-
-  virtual std::unique_ptr<framework::OperatorBase> CreateTestOp() = 0;
-  virtual void ComputeExpectedOuts() = 0;
-  virtual ~TestElementwiseOpGradGrad() {}
-
- protected:
-  std::string op_type_;
-  platform::Place place_;
-  framework::DDim dims_;
-  std::vector<std::string> inputs_;
-  std::vector<std::string> outputs_;
-  std::map<std::string, paddle::framework::LoDTensor *> in_out_tensors_;
-  std::map<std::string, std::vector<T>> feed_datas_;
-  std::map<std::string, std::vector<T>> expected_outs_;
-  framework::Scope scope_;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
deleted file mode 100644
index b95373178d458dc3b3619bcbc056b24c8a05d6be..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/expand_op.cc
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/expand_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class ExpandOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto expand_times = ctx->Attrs().Get<std::vector<int>>("expand_times");
-
-    if (expand_times.size() == 0) {
-      expand_times = std::vector<int>(x_dims.size(), -1);
-    }
-
-    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims.size()), expand_times.size(),
-                      "The number of Attr(expand_times)'s value must be equal "
-                      "to the rank of Input(X).");
-    PADDLE_ENFORCE_LE(x_dims.size(), 6,
-                      "The rank of Input(X) must not be greater than 6.");
-
-    std::vector<int64_t> out_shape(x_dims.size());
-    for (size_t i = 0; i < expand_times.size(); ++i) {
-      if (x_dims[i] == -1 || expand_times[i] == -1) {
-        out_shape[i] = -1;
-      } else {
-        PADDLE_ENFORCE_GT(
-            expand_times[i], 0,
-            "The element of Attr(expand_times) must greater than 0.");
-        out_shape[i] = x_dims[i] * expand_times[i];
-      }
-    }
-
-    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
-    if (out_shape[0] == x_dims[0]) {
-      ctx->ShareLoD("X", "Out");
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "expand_times_tensor" || var_name == "ExpandTimes") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
-             "X is the input to be expanded.");
-    AddInput("ExpandTimes",
-             "(Tensor<int>), optional). If provided, expand according to "
-             "this given expand times. It has a higher priority than "
-             "expand_times_tensor and expand_times.")
-        .AsDispensable();
-    AddInput("expand_times_tensor",
-             "(Tensor Tensor<int>), epxand times for X."
-             "It has a higher priority than expand_times, but a lower priority "
-             "than ExpandTimes")
-        .AsDuplicable()
-        .AsDispensable();
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
-              "The rank of Output(Out) have the same with Input(X). "
-              "After expanding, size of each dimension of Output(Out) is equal "
-              "to size of the corresponding dimension of Input(X) multiplying "
-              "the corresponding value given by Attr(expand_times).");
-    AddAttr<std::vector<int>>("expand_times",
-                              "Expand times number for each dimension.")
-        .SetDefault({});
-    AddComment(R"DOC(
-Expand operator tiles the input by given times number. You should set times
-number for each dimension by providing attribute 'expand_times'. The rank of X
-should be in [1, 6]. Please note that size of 'expand_times' must be the same
-with X's rank. Following is a using case:
-
-Input(X) is a 3-D tensor with shape [2, 3, 1]:
-
-        [
-           [[1], [2], [3]],
-           [[4], [5], [6]]
-        ]
-
-Attr(expand_times):  [1, 2, 2]
-
-Output(Out) is a 3-D tensor with shape [2, 6, 2]:
-
-        [
-            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
-            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
-        ]
-
-)DOC");
-  }
-};
-
-class ExpandGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    std::vector<int> expand_times =
-        ctx->Attrs().Get<std::vector<int>>("expand_times");
-
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    size_t start_pos = 0u;
-    if (!ctx->IsRuntime() && x_dims[0] < 0) {
-      PADDLE_ENFORCE_EQ(
-          x_dims[0], out_dims[0],
-          "The first dimension size of Input(Out@GRAD) should be "
-          "equal to the crroresponding dimension size of Input(X)");
-      start_pos = 1u;
-    }
-
-    for (size_t i = start_pos; i < expand_times.size(); ++i) {
-      if (expand_times[i] == -1) {
-        continue;
-      } else {
-        PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
-                          "Each dimension size of Input(Out@GRAD) should be "
-                          "equal to multiplication of crroresponding dimension "
-                          "size of Input(X) and Attr(expand_times) value.");
-      }
-    }
-    auto x_grad_name = framework::GradVarName("X");
-
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "expand_times_tensor") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("expand_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetInput("expand_times_tensor", Input("expand_times_tensor"));
-    op->SetInput("ExpandTimes", Input("ExpandTimes"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ExpandGradNoNeedBufVarsInferer, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
-                  ops::ExpandGradOpDescMaker);
-REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp,
-                  ops::ExpandGradNoNeedBufVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ExpandKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ExpandKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ExpandKernel<paddle::platform::CPUDeviceContext, bool>);
-REGISTER_OP_CPU_KERNEL(
-    expand_grad,
-    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu
deleted file mode 100644
index 50a506b294db14f0d170c60a0ed760dcf280ad60..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/expand_op.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/expand_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    expand, ops::ExpandKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ExpandKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ExpandKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ExpandKernel<paddle::platform::CUDADeviceContext, bool>);
-REGISTER_OP_CUDA_KERNEL(
-    expand_grad,
-    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h
deleted file mode 100644
index eb3b46f91348d1af097327bf2cbcd8285a61068d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/expand_op.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-
-#include <boost/preprocessor/arithmetic/div.hpp>
-#include <boost/preprocessor/arithmetic/mod.hpp>
-#include <boost/preprocessor/comparison/greater.hpp>
-#include <boost/preprocessor/comparison/greater_equal.hpp>
-#include <boost/preprocessor/control/if.hpp>
-#include <boost/preprocessor/repetition/repeat.hpp>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-#define MAX_RANK_SUPPORTED 6
-
-#define EXPAND_TEMPLATE(z, n, data) \
-  case n + 1: {                     \
-    Expand<n + 1>(context);         \
-    break;                          \
-  }
-#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
-#define COND(n)                                               \
-  BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \
-                         BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
-#define EXPAND_GRAD_CASE(n)                                        \
-  case n: {                                                        \
-    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
-    break;                                                         \
-  }
-#define EXPAND_GRAD_TEMPLATE(z, n, data) \
-  BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
-#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
-
-namespace paddle {
-namespace operators {
-inline std::vector<int> get_expand_times(
-    const framework::ExecutionContext& ctx) {
-  if (ctx.HasInput("ExpandTimes")) {
-    auto* expand_tensor = ctx.Input<framework::LoDTensor>("ExpandTimes");
-    auto* expand_data = expand_tensor->data<int>();
-    framework::Tensor cpu_expand_tensor;
-    if (platform::is_gpu_place(expand_tensor->place())) {
-      TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
-      expand_data = cpu_expand_tensor.data<int>();
-    }
-    auto vec_epxand_times =
-        std::vector<int>(expand_data, expand_data + expand_tensor->numel());
-    return vec_epxand_times;
-  }
-
-  auto list_expand_times_tensor =
-      ctx.MultiInput<framework::Tensor>("expand_times_tensor");
-  if (list_expand_times_tensor.size() > 0) {
-    // get tensor from
-    std::vector<int> vec_epxand_times;
-    for (size_t i = 0; i < list_expand_times_tensor.size(); ++i) {
-      auto tensor = list_expand_times_tensor[i];
-      if (platform::is_gpu_place(tensor->place())) {
-        framework::Tensor temp;
-        TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-        vec_epxand_times.push_back(*temp.data<int32_t>());
-      } else {
-        vec_epxand_times.push_back(*tensor->data<int32_t>());
-      }
-    }
-
-    return vec_epxand_times;
-  } else {
-    return ctx.Attr<std::vector<int>>("expand_times");
-  }
-}
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class ExpandKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto rank = context.Input<Tensor>("X")->dims().size();
-    switch (rank) {
-      REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED)
-      default:
-        PADDLE_ENFORCE(false,
-                       "Only support tensor with rank being between 1 and 6.");
-    }
-  }
-
- protected:
-  template <int Rank>
-  void Expand(const framework::ExecutionContext& context) const {
-    auto* in0 = context.Input<Tensor>("X");
-
-    auto in_dims = in0->dims();
-    auto expand_times = get_expand_times(context);
-    PADDLE_ENFORCE_EQ(static_cast<size_t>(in_dims.size()), expand_times.size(),
-                      "The number of Attr(expand_times)'s value must be equal "
-                      "to the rank of Input(X).");
-    auto* out0 = context.Output<Tensor>("Out");
-    Eigen::DSizes<int, Rank> bcast_dims;
-    for (size_t i = 0; i < expand_times.size(); ++i) {
-      bcast_dims[i] = expand_times[i];
-    }
-
-    framework::DDim out_dims(in_dims);
-    for (size_t i = 0; i < expand_times.size(); ++i) {
-      out_dims[i] *= expand_times[i];
-    }
-
-    out0->Resize(out_dims);
-    auto x = EigenTensor<T, Rank>::From(*in0);
-    out0->mutable_data<T>(context.GetPlace());
-    auto y = EigenTensor<T, Rank>::From(*out0);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    y.device(place) = x.broadcast(bcast_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ExpandGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("X");
-    // auto& expand_times = context.Attr<std::vector<int>>("expand_times");
-    auto expand_times = get_expand_times(context);
-    auto x_dims = in0->dims();
-    // 1. reshape_dims_vec is the broadcast parameter. For each dimension i,
-    //    if expand_times[i] > 1 and x_dims[i] > 1, i will be splitted to two
-    //    dimensions [expand_times[i], x_dims[i]].
-    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
-    //    each dimension expanded, the gradients should be summed to original
-    //    size.
-    std::vector<int> reshape_dims_vec;
-    std::vector<int> reduce_dims_vec;
-    for (size_t i = 0; i < expand_times.size(); ++i) {
-      if (expand_times[i] == 1) {
-        reshape_dims_vec.push_back(x_dims[i]);
-      } else {
-        if (x_dims[i] == 1) {
-          reduce_dims_vec.push_back(reshape_dims_vec.size());
-          reshape_dims_vec.push_back(expand_times[i]);
-        } else {
-          reduce_dims_vec.push_back(reshape_dims_vec.size());
-          reshape_dims_vec.push_back(expand_times[i]);
-          reshape_dims_vec.push_back(x_dims[i]);
-        }
-      }
-    }
-
-    int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED +
-               reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1;
-    // no need reduce, just copy
-    if (reduce_dims_vec.size() == 0) {
-      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-      out0->mutable_data<T>(context.GetPlace());
-      framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
-                            out0);
-    } else {
-      switch (dims) {
-        REP_EXPAND_GRAD_TEMPLATE(72)
-        default:
-          PADDLE_ENFORCE(
-              false, "Only support tensor with rank being between 1 and 6.");
-      }
-    }
-  }
-
- protected:
-  template <int Dims>
-  void ExpandBackward(const framework::ExecutionContext& context,
-                      const std::vector<int>& reshape_dims_vec,
-                      const std::vector<int>& reduce_dims_vec) const {
-    size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1;
-    size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1;
-    PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(),
-                      "Inconsistent size between template Dims and "
-                      "reshape dimensions.");
-    PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(),
-                      "Inconsistent size between template Dims and "
-                      "reduce dimensions.");
-    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-    out0->mutable_data<T>(context.GetPlace());
-    auto x_grad = EigenVector<T>::Flatten(*out0);
-    Eigen::DSizes<int, Dims / MAX_RANK_SUPPORTED + 1> reshape_dims;
-    for (size_t i = 0; i < reshape_size; ++i) {
-      reshape_dims[i] = reshape_dims_vec[i];
-    }
-    Eigen::DSizes<int, Dims % MAX_RANK_SUPPORTED + 1> reduce_dims;
-    for (size_t i = 0; i < reduce_size; ++i) {
-      reduce_dims[i] = reduce_dims_vec[i];
-    }
-    auto out_grad = EigenVector<T>::Flatten(*in0);
-    x_grad.device(
-        *context.template device_context<DeviceContext>().eigen_device()) =
-        out_grad.reshape(reshape_dims)
-            .sum(reduce_dims)
-            .reshape(x_grad.dimensions());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc
deleted file mode 100644
index 40848b963350202b684dbfb7625eb8d4427bdb4a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/eye_op.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/eye_op.h"
-
-namespace paddle {
-namespace operators {
-
-class EyeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of EyeOP should not be null.");
-    auto num_rows = ctx->Attrs().Get<int64_t>("num_rows");
-    PADDLE_ENFORCE(num_rows >= 0,
-                   "The value of Input(num_rows) should be non-negative int.");
-    auto num_columns = ctx->Attrs().Get<int64_t>("num_columns");
-    if (num_columns == -1) num_columns = num_rows;
-    PADDLE_ENFORCE(
-        num_columns >= 0,
-        "The value of Input(num_columns) should be non-negative int.");
-    ctx->SetOutputDim("Out", {num_rows, num_columns});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
-  }
-};
-
-class EyeOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(ctx->GetAttr("dtype")));
-    auto& out_var_name = ctx->Output("Out").front();
-    ctx->SetDataType(out_var_name, data_type);
-  }
-};
-
-class EyeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
-        .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<int64_t>("num_rows",
-                     "(int64_t) the number of rows in output tensor");
-    AddAttr<int64_t>("num_columns",
-                     "(int64_t) the number of columns in output tensor."
-                     "Default -1 means that num_columns=num_rows")
-        .SetDefault(-1);
-    AddOutput("Out",
-              "(Tensor) Construct an identity tensor with "
-              "specified shape [num_rows, num_columns]");
-    AddComment(R"DOC(
-Return an identity tensor whose shape is [num_rows, num_columns].
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-using float16 = paddle::platform::float16;
-
-REGISTER_OPERATOR(eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel<CPU, float>,
-                       ops::EyeKernel<CPU, double>,
-                       ops::EyeKernel<CPU, int64_t>, ops::EyeKernel<CPU, int>,
-                       ops::EyeKernel<CPU, float16>);
diff --git a/paddle/fluid/operators/eye_op.cu b/paddle/fluid/operators/eye_op.cu
deleted file mode 100644
index 8d55235a54c70b1a4db4bd7f355332c923207591..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/eye_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/eye_op.h"
-
-namespace ops = paddle::operators;
-namespace plf = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    eye, ops::EyeKernel<plf::CUDADeviceContext, float>,
-    ops::EyeKernel<plf::CUDADeviceContext, double>,
-    ops::EyeKernel<plf::CUDADeviceContext, int64_t>,
-    ops::EyeKernel<plf::CUDADeviceContext, int>,
-    ops::EyeKernel<plf::CUDADeviceContext, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h
deleted file mode 100644
index 0eefe7d2163bb967596480f2427b995a6a87ff6e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/eye_op.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct EyeFunctor {
-  EyeFunctor(int64_t num_columns, T* output)
-      : num_columns_(num_columns), output_(output) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    output_[idx * num_columns_ + idx] = static_cast<T>(1);
-  }
-
-  int64_t num_columns_;
-  T* output_;
-};
-
-template <typename DeviceContext, typename T>
-class EyeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    auto num_rows = ctx.Attr<int64_t>("num_rows");
-    auto num_columns = ctx.Attr<int64_t>("num_columns");
-    if (num_columns == -1) num_columns = num_rows;
-
-    auto* out_tensor = ctx.Output<framework::Tensor>("Out");
-    T* out_data = out_tensor->mutable_data<T>(ctx.GetPlace());
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    set_zero(dev_ctx, out_tensor, static_cast<T>(0));
-
-    int64_t num_eyes = std::min(num_rows, num_columns);
-    platform::ForRange<DeviceContext> for_range(dev_ctx, num_eyes);
-    EyeFunctor<T> functor(num_columns, out_data);
-    for_range(functor);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
deleted file mode 100644
index 4a8937ba1c7ef9827ecc9bf575d9893c95a3b22b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fake_dequantize_op.h"
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct DequantizeFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& dev_ctx,
-                  const framework::Tensor* in, const framework::Tensor* scale,
-                  T max_range, framework::Tensor* out) {
-    auto in_e = framework::EigenVector<T>::Flatten(*in);
-    const T* scale_factor = scale->data<T>();
-    auto out_e = framework::EigenVector<T>::Flatten(*out);
-
-    auto& dev = *dev_ctx.eigen_device();
-    out_e.device(dev) = (scale_factor[0] / max_range) * in_e;
-  }
-};
-
-template <typename T>
-struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& dev_ctx,
-                  const framework::Tensor* in, const framework::Tensor** scales,
-                  const int scale_num, T max_range, framework::Tensor* out) {
-    if (scale_num == 1) {
-      const int channel = in->dims()[0];
-      const T* scale_factor = scales[0]->data<T>();
-      for (int i = 0; i < channel; i++) {
-        T s = scale_factor[i];
-        framework::Tensor one_channel_in = in->Slice(i, i + 1);
-        framework::Tensor one_channel_out = out->Slice(i, i + 1);
-        auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
-        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
-        auto& dev = *dev_ctx.eigen_device();
-        out_e.device(dev) = (s / max_range) * in_e;
-      }
-    } else if (scale_num == 2) {
-      int batch_size = in->dims()[0];
-      int channel = in->dims()[1];
-      const T* scale_one = scales[0]->data<T>();
-      const T* scale_two = scales[1]->data<T>();
-      for (int i = 0; i < batch_size; i++) {
-        framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
-            framework::slice_ddim(in->dims(), 1, in->dims().size()));
-        framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
-            framework::slice_ddim(out->dims(), 1, out->dims().size()));
-        for (int j = 0; j < channel; j++) {
-          T s = scale_one[j];
-          framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
-          framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
-          auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
-          auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
-          auto& dev = *dev_ctx.eigen_device();
-          out_e.device(dev) = (s * scale_two[0] / max_range) * in_e;
-        }
-      }
-    }
-  }
-};
-
-template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
-template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
-template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, float>;
-template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, double>;
-
-class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
- public:
-  FakeDequantizeMaxAbsOp(const std::string& type,
-                         const framework::VariableNameMap& inputs,
-                         const framework::VariableNameMap& outputs,
-                         const framework::AttributeMap& attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of FakeDequantizeMaxAbsOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of FakeDequantizeMaxAbsOp should not be null.");
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) The input with float-32/64 type is the "
-             "low precision tensor.");
-    AddInput("Scale", "(float) The scale in quantization stage.");
-    AddOutput("Out",
-              "(Tensor) The output is the dequantized high "
-              "precision tensor.");
-    AddAttr<float>("max_range", "(float) The max range in quantization stage.");
-    AddComment(R"DOC(
-FakeDequantizeMaxAbsOp operator.
-
-This calculation is an opposite operation of FakeQuantizeMaxAbsOp:
-
-$$Out = \frac{scale*X}{ max_range }$$
-
-)DOC");
-  }
-};
-
-class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("X"),
-        "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInputs("Scales"),
-                   "Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp "
-                   "should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class FakeChannelWiseDequantizeMaxAbsOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) The input with float-32/64 type is the "
-             "low precision tensor.");
-    AddInput("Scales",
-             "(Tensors) The scales in quantization stage. "
-             "Now, `Scales` is a vector with at most two tensors. "
-             "If Scales has two elements, the second tensor should only have "
-             "one value.")
-        .AsDuplicable();
-    AddOutput("Out",
-              "(Tensor) The output is the dequantized high "
-              "precision tensor.");
-    AddAttr<std::vector<int>>(
-        "quant_bits",
-        "Quantization bit numbers in quantization stage. "
-        "The size of `quant_bits` should be equal to the size of `Scales`.")
-        .SetDefault({8});
-
-    AddComment(R"DOC(
-FakeChannelWiseDequantizeMaxAbsOp operator.
-
-This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp:
-
-$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$
-
-In the above formula, the range value of $c$ can be represented as $0 \leq c \lt \ the\ channel\ number\ of\ X$.
-Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, and it is called $n$  in the formula.
-
-Notes: In general, the per-channel quantization is only applied to weights and the activations use per-layer quantization.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-
-REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
-                  ops::FakeDequantizeMaxAbsOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
-                       ops::FakeDequantizeMaxAbsKernel<CPU, float>,
-                       ops::FakeDequantizeMaxAbsKernel<CPU, double>);
-
-REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs,
-                  ops::FakeChannelWiseDequantizeMaxAbsOp,
-                  ops::FakeChannelWiseDequantizeMaxAbsOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
-                       ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, float>,
-                       ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, double>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
deleted file mode 100644
index 02f9dc827d68cbb58447ed1557ff4bf310b2c017..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fake_dequantize_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num,
-                             T* out) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < num) {
-    out[idx] = in[idx] * scale[0] / max_range;
-  }
-}
-
-template <typename T>
-struct DequantizeFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& dev_ctx,
-                  const framework::Tensor* in, const framework::Tensor* scale,
-                  T max_range, framework::Tensor* out) {
-    const T* in_data = in->data<T>();
-    const T* scale_factor = scale->data<T>();
-    T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
-
-    int num = in->numel();
-    int block = 512;
-    int grid = (num + block - 1) / block;
-
-    KeDequantize<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        in_data, scale_factor, max_range, num, out_data);
-  }
-};
-
-template <typename T>
-__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
-                                   int num, int channel, T* out) {
-  int tid = threadIdx.x;
-  int channel_size = num / channel;
-  const T* in_c = in + blockIdx.x * channel_size;
-  T* out_c = out + blockIdx.x * channel_size;
-  for (int i = tid; i < channel_size; i += blockDim.x) {
-    out_c[i] = in_c[i] * scale[blockIdx.x] / max_range;
-  }
-}
-
-template <typename T>
-__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
-                                   const T* scale_two, T max_range, int num,
-                                   int batch_size, int channel, T* out) {
-  int tid = threadIdx.x;
-  int channel_size = num / (batch_size * channel);
-  int scale_index = blockIdx.x % channel;
-  const T* in_c = in + blockIdx.x * channel_size;
-  T* out_c = out + blockIdx.x * channel_size;
-  for (int i = tid; i < channel_size; i += blockDim.x) {
-    out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range;
-  }
-}
-
-template <typename T>
-struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& dev_ctx,
-                  const framework::Tensor* in, const framework::Tensor** scales,
-                  const int scale_num, T max_range, framework::Tensor* out) {
-    const T* in_data = in->data<T>();
-    T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
-    if (scale_num == 1) {
-      int num = in->numel();
-      int channel = in->dims()[0];
-      const T* scale_factor = scales[0]->data<T>();
-      int block = 1024;
-      int grid = channel;
-      DequantizeOneScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
-          in_data, scale_factor, max_range, num, channel, out_data);
-    } else if (scale_num == 2) {
-      int num = in->numel();
-      int batch_size = in->dims()[0];
-      int channel = in->dims()[1];
-      const T* scale_one = scales[0]->data<T>();
-      const T* scale_two = scales[1]->data<T>();
-      int block = 1024;
-      int grid = batch_size * channel;
-      DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
-          in_data, scale_one, scale_two, max_range, num, batch_size, channel,
-          out_data);
-    }
-  }
-};
-
-template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
-template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
-template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
-template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
-                        ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
-                        ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(
-    fake_channel_wise_dequantize_max_abs,
-    ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, float>,
-    ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
deleted file mode 100644
index ed9a0a4d65fab5ce1ef48835c332fade978d2bae..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-struct DequantizeFunctor {
-  void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
-                  const framework::Tensor* scale, T max_range,
-                  framework::Tensor* out);
-};
-
-template <typename DeviceContext, typename T>
-struct ChannelDequantizeFunctor {
-  void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
-                  const framework::Tensor** scales, const int scale_num,
-                  T max_range, framework::Tensor* out);
-};
-
-template <typename DeviceContext, typename T>
-class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* scale = ctx.Input<framework::Tensor>("Scale");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    float max_range = ctx.Attr<float>("max_range");
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    out->mutable_data<T>(dev_ctx.GetPlace());
-
-    DequantizeFunctor<DeviceContext, T>()(dev_ctx, in, scale,
-                                          static_cast<T>(max_range), out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto scales = ctx.MultiInput<framework::Tensor>("Scales");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
-    int max_range = 1;
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    out->mutable_data<T>(dev_ctx.GetPlace());
-    int scale_num = scales.size();
-    if (scale_num == 1) {
-      PADDLE_ENFORCE_EQ(
-          scales[0]->numel(), in->dims()[0],
-          "The number of first scale values must be the same with "
-          "first dimension value of Input(X) when the `Scales` has only one "
-          "element.");
-      max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
-    } else if (scale_num == 2) {
-      PADDLE_ENFORCE_EQ(
-          scales[0]->numel(), in->dims()[1],
-          "The number of first scale values must be the same with "
-          "second dimension value of Input(X) when the `Scales` has two "
-          "elements.");
-      PADDLE_ENFORCE_EQ(
-          scales[1]->numel(), 1,
-          "The second scale tensor should only have one value at now.");
-      max_range *= (std::pow(2, quant_bits[0] - 1) - 1) *
-                   (std::pow(2, quant_bits[1] - 1) - 1);
-    }
-    ChannelDequantizeFunctor<DeviceContext, T>()(
-        dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range), out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
deleted file mode 100644
index 034f3c7dcebf906e600b9a6a651a1c857ddc4189..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ /dev/null
@@ -1,532 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fake_quantize_op.h"
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/clip_op.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct Compare {
- public:
-  bool operator()(const T a, const T b) { return (std::abs(a) < std::abs(b)); }
-};
-
-template <typename T>
-struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
-                  const int num, T* out) {
-    *out = std::abs(*(std::max_element(in + 0, in + num, Compare<T>())));
-  }
-};
-
-template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
-
-template <typename T>
-struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
-                  const int num, const int channel, T* out) {
-    const int channel_size = num / channel;
-    for (int i = 0; i < channel; i++) {
-      auto* start = in + i * channel_size;
-      auto* end = in + (i + 1) * channel_size;
-      out[i] = std::abs(*(std::max_element(start, end, Compare<T>())));
-    }
-  }
-};
-
-template struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, float>;
-
-template <typename T>
-struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, framework::Tensor* out) {
-    T s = scale.data<T>()[0];
-    platform::Transform<platform::CPUDeviceContext> trans;
-    trans(ctx, in.data<T>(), in.data<T>() + in.numel(),
-          out->mutable_data<T>(ctx.GetPlace()), ClipFunctor<T>(-s, s));
-    auto out_e = framework::EigenVector<T>::Flatten(*out);
-    out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
-  }
-};
-
-template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
-
-template <typename T>
-struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, framework::Tensor* out) {
-    T s = scale.data<T>()[0];
-    platform::Transform<platform::CPUDeviceContext> trans;
-    trans(ctx, in.data<T>(), in.data<T>() + in.numel(),
-          out->mutable_data<T>(ctx.GetPlace()), ClipFunctor<T>(-s, s));
-    auto out_e = framework::EigenVector<T>::Flatten(*out);
-    out_e.device(*ctx.eigen_device()) =
-        (s / bin_cnt) * (bin_cnt / s * out_e).round();
-  }
-};
-template struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext,
-                                               float>;
-
-template <typename T>
-struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, const int channel,
-                  framework::Tensor* out) {
-    auto* scale_data = scale.data<T>();
-    auto* in_data = in.data<T>();
-    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
-    const int channel_size = in.numel() / channel;
-    platform::Transform<platform::CPUDeviceContext> trans;
-    for (int i = 0; i < channel; i++) {
-      T s = scale_data[i];
-      auto* start = in_data + i * channel_size;
-      auto* end = in_data + (i + 1) * channel_size;
-      trans(ctx, start, end, out_data + i * channel_size,
-            ClipFunctor<T>(-s, s));
-    }
-    for (int i = 0; i < channel; i++) {
-      T s = scale_data[i];
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
-      out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
-    }
-  }
-};
-
-template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
-                                               float>;
-
-template <typename T>
-struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx,
-                  const framework::Tensor& cur_scale,
-                  const framework::Tensor& last_scale,
-                  const framework::Tensor& iter, const int window_size,
-                  framework::Tensor* scales_arr, framework::Tensor* out_scale) {
-    T* scale_arr = scales_arr->mutable_data<T>(ctx.GetPlace());
-    int64_t it = iter.data<int64_t>()[0];
-    int idx = it % window_size;
-    T removed = scale_arr[idx];
-    T cur = cur_scale.data<T>()[0];
-    scale_arr[idx] = cur;
-
-    T max = last_scale.data<T>()[0];
-    if (max < cur) {
-      max = cur;
-    } else if (fabs(removed - max) < 1e-6) {
-      int size = (it > window_size) ? window_size : it;
-      FindAbsMaxFunctor<platform::CPUDeviceContext, T>()(ctx, scale_arr, size,
-                                                         &max);
-    }
-    out_scale->mutable_data<T>(ctx.GetPlace())[0] = max;
-  }
-};
-
-template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
-
-template <typename T>
-struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx,
-                  const framework::Tensor& in_accum,
-                  const framework::Tensor& in_state, const T* cur_scale,
-                  const float rate, framework::Tensor* out_state,
-                  framework::Tensor* out_accum, framework::Tensor* out_scale) {
-    T accum = in_accum.data<T>()[0];
-    T state = in_state.data<T>()[0];
-    T scale = cur_scale[0];
-
-    state = rate * state + 1;
-    accum = rate * accum + scale;
-    scale = accum / state;
-
-    out_state->mutable_data<T>(ctx.GetPlace())[0] = state;
-    out_accum->mutable_data<T>(ctx.GetPlace())[0] = accum;
-    out_scale->mutable_data<T>(ctx.GetPlace())[0] = scale;
-  }
-};
-
-template struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext,
-                                               float>;
-
-class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
- public:
-  FakeQuantizeAbsMaxOp(const std::string& type,
-                       const framework::VariableNameMap& inputs,
-                       const framework::VariableNameMap& outputs,
-                       const framework::AttributeMap& attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of FakeQuantizeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of FakeQuantizeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutScale"),
-                   "Output(Scale) of FakeQuantizeOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScale", {1});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input is float data type.");
-    AddOutput("Out",
-              "(Tensor) Output of quantized low level tensor, "
-              "but also saved as float data type.");
-    AddOutput("OutScale", "(Tensor) Current scale");
-    AddAttr<int>("bit_length", "(int, default 8)")
-        .SetDefault(8)
-        .AddCustomChecker([](const int& bit_length) {
-          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
-                         "'bit_length' should be between 1 and 16.");
-        });
-    AddComment(R"DOC(
-FakeQuantize operator
-
-$$scale = max(abs(X))$$
-$$range = 2^{bit_length - 1} - 1$$
-$$Out = round(X/scale * range)$$
-
-)DOC");
-  }
-};
-
-class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of FakeChannelWiseQuantizeOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("OutScale"),
-        "Output(Scale) of FakeChannelWiseQuantizeOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class FakeChannelWiseQuantizeAbsMaxOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input is float data type.");
-    AddOutput("Out",
-              "(Tensor) Output of quantized low level tensor, "
-              "but also saved as float data type.");
-    AddOutput("OutScale", "(Tensor) Current channel wise scale");
-    AddAttr<int>("bit_length", "(int, default 8)")
-        .SetDefault(8)
-        .AddCustomChecker([](const int& bit_length) {
-          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
-                         "'bit_length' should be between 1 and 16.");
-        });
-    AddComment(R"DOC(
-The scale of FakeChannelWiseQuantize operator is a vector.
-In detail, each channel of the input X has a scale value.
-
-$$scale_c = max(abs(X_c))$$
-$$range = 2^{bit\_length - 1} - 1$$
-$$Out_c = round(\frac{X_c * range} {scale_c})$$
-In above three formulas, the range value of c is as follow:
-$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
-)DOC");
-  }
-};
-
-class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
- public:
-  FakeQuantizeRangeAbsMaxOp(const std::string& type,
-                            const framework::VariableNameMap& inputs,
-                            const framework::VariableNameMap& outputs,
-                            const framework::AttributeMap& attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of FakeQuantizeRangeAbsMaxOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(Out) of FakeQuantizeRangeAbsMaxOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("OutScale"),
-        "Output(OutScale) of FakeQuantizeRangeAbsMaxOp should not be null");
-    if (ctx->HasOutput("OutScales")) {
-      int window_size = ctx->Attrs().Get<int>("window_size");
-      ctx->SetOutputDim("OutScales", {window_size});
-    }
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScale", {1});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class FakeQuantizeRangeAbsMaxOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input is float data type.");
-    AddInput("InScale", "Last scale.");
-    AddInput("Iter", "Global step iteration.").AsDispensable();
-    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
-    AddOutput("OutScale", " Current scale");
-    AddOutput("OutScales", "(Tensor) scale buffer.").AsDispensable();
-    AddAttr<int>("window_size", "(int, default 10000) window range size.")
-        .SetDefault(10000);
-    AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
-        .SetDefault(8)
-        .AddCustomChecker([](const int& bit_length) {
-          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
-                         "'bit_length' should be between 1 and 16.");
-        });
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-FakeQuantize operator is used in static quantization.
-
-$$scale = max(max(abs(x)), history_abs_max)$$
-$$range = 2^{bit_length - 1} - 1$$
-$$Out = round(X/scale * range)$$
-
-)DOC");
-  }
-};
-
-class FakeQuantOrWithDequantMovingAverageAbsMaxOp
-    : public framework::OperatorWithKernel {
- public:
-  FakeQuantOrWithDequantMovingAverageAbsMaxOp(
-      const std::string& type, const framework::VariableNameMap& inputs,
-      const framework::VariableNameMap& outputs,
-      const framework::AttributeMap& attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of FakeQuantOrWithDequantMovingAverageAbsMaxOp "
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of FakeQuantOrWithDequantMovingAverageAbsMaxOp "
-                   "should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("OutScale"),
-        "Output(OutScale) of FakeQuantOrWithDequantMovingAverageAbsMaxOp "
-        "should not be null");
-    if (ctx->HasOutput("OutState")) {
-      ctx->SetOutputDim("OutState", {1});
-    }
-    if (ctx->HasOutput("OutAccum")) {
-      ctx->SetOutputDim("OutAccum", {1});
-    }
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScale", {1});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input is float data type.");
-    AddInput("InScale", "Last scale.");
-    AddInput("InAccum", "Last accum.").AsDispensable();
-    AddInput("InState", "Last state.").AsDispensable();
-    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
-    AddOutput("OutScale", " Current scale");
-    AddOutput("OutState", "(Tensor) state buffer.").AsDispensable();
-    AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable();
-    AddAttr<float>("moving_rate", "(float, default 0.9) moving rate.")
-        .SetDefault(0.9);
-    AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
-        .SetDefault(8)
-        .AddCustomChecker([](const int& bit_length) {
-          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
-                         "'bit_length' should be between 1 and 16.");
-        });
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-This is a Base Op which support FakeQuantMovingAverageAbsMaxOp and FakeQuantDequantMovingAverageAbsMaxOp
-FakeQuantMovingAverageAbsMaxOp operator is used in static quantization.
-
-$$scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)$$
-$$range = 2^{bit\_length - 1} - 1$$
-$$Out = round(X/scale * range)$$
-
-FakeQuantDequantMovingAverageAbsMaxOp operator do the moving_average_abs_max op quant and then dequant.
-
-$$scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)$$
-$$range = 2^{bit\_length - 1} - 1$$
-$$Out = round(X/scale * range) * scale / range$$
-
-)DOC");
-  }
-};
-
-class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("X"),
-        "Input(X) of MovingAverageAbsMaxScaleOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(Out) of MovingAverageAbsMaxScaleOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutScale"),
-                   "Output(OutScale) of MovingAverageAbsMaxScaleOp"
-                   "should not be null");
-    if (ctx->HasOutput("OutState")) {
-      ctx->SetOutputDim("OutState", {1});
-    }
-    if (ctx->HasOutput("OutAccum")) {
-      ctx->SetOutputDim("OutAccum", {1});
-    }
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScale", {1});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class MovingAverageAbsMaxScaleOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input is float data type.");
-    AddInput("InAccum", "Last accum.").AsDispensable();
-    AddInput("InState", "Last state.").AsDispensable();
-    AddOutput("Out",
-              "(Tensor) Output tensor is just equivalent to the input tensor.");
-    AddOutput("OutScale", " Current scale");
-    AddOutput("OutState", "(Tensor) state buffer.").AsDispensable();
-    AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable();
-    AddAttr<float>("moving_rate", "(float, default 0.9) moving rate.")
-        .SetDefault(0.9);
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set true for inference only and false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-MovingAverageAbsMaxScale operator is only used for calculating the quantization scale.
-And it will not quantize the input tensor.
-
-$$scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)$$
-$$Out = X$$
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-
-REGISTER_OPERATOR(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxOp,
-                  ops::FakeQuantizeAbsMaxOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max,
-                       ops::FakeQuantizeAbsMaxKernel<CPU, float>);
-
-REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
-                  ops::FakeQuantizeRangeAbsMaxOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
-                       ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
-
-REGISTER_OPERATOR(fake_quantize_moving_average_abs_max,
-                  ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp,
-                  ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max,
-                       ops::FakeQuantizeMovingAverageAbsMaxKernel<CPU, float>);
-
-REGISTER_OPERATOR(fake_quantize_dequantize_moving_average_abs_max,
-                  ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp,
-                  ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    fake_quantize_dequantize_moving_average_abs_max,
-    ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CPU, float>);
-
-REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max,
-                  ops::FakeChannelWiseQuantizeAbsMaxOp,
-                  ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
-                       ops::FakeChannelWiseQuantizeAbsMaxKernel<CPU, float>);
-
-REGISTER_OPERATOR(moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleOp,
-                  ops::MovingAverageAbsMaxScaleOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
-                       ops::MovingAverageAbsMaxScaleKernel<CPU, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
deleted file mode 100644
index e9a7201bc0826414ec4adbd3bf2804db013a4571..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ /dev/null
@@ -1,345 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/fake_quantize_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
-  int bid = threadIdx.x + blockIdx.x * blockDim.x;
-  int tid = threadIdx.x;
-
-  extern __shared__ T shared_max_data[];
-  if (gridDim.x > 1) {
-    shared_max_data[tid] = T(0);
-    for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
-      T tmp = fabs(in[i]);
-      if (tmp > shared_max_data[tid]) {
-        shared_max_data[tid] = tmp;
-      }
-    }
-  } else {
-    if (bid < n) {
-      shared_max_data[tid] = fabs(in[bid]);
-    } else {
-      shared_max_data[tid] = T(0);
-    }
-  }
-  __syncthreads();
-
-  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
-    if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
-      shared_max_data[tid] = shared_max_data[tid + i];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    out[blockIdx.x] = shared_max_data[0];
-  }
-}
-
-template <typename T>
-struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
-                  const int num, T* out) {
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
-    grid = (grid > block) ? block : grid;
-
-    framework::Tensor max;
-    T* max_data =
-        max.mutable_data<T>(framework::make_ddim({grid}), ctx.GetPlace());
-    FindAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
-        in, num, max_data);
-    FindAbsMaxKernel<T><<<1, block, 1024 * sizeof(T), ctx.stream()>>>(
-        max_data, grid, out);
-  }
-};
-
-template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
-
-template <typename T>
-__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
-                                        T* out) {
-  int tid = threadIdx.x;
-  int channel_size = n / c;
-  const T* in_c = in + blockIdx.x * channel_size;
-  extern __shared__ T shared_max_data[];
-  shared_max_data[tid] = T(0);
-  for (int i = tid; i < channel_size; i += blockDim.x) {
-    T tmp = fabs(in_c[i]);
-    if (tmp > shared_max_data[tid]) {
-      shared_max_data[tid] = tmp;
-    }
-  }
-  __syncthreads();
-  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
-    if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
-      shared_max_data[tid] = shared_max_data[tid + i];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    out[blockIdx.x] = shared_max_data[0];
-  }
-}
-
-template <typename T>
-struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
-                  const int num, const int channel, T* out) {
-    int block = 1024;
-    int grid = channel;
-    FindChannelAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
-        in, num, channel, out);
-  }
-};
-
-template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
-
-template <typename T>
-__global__ void ClipAndQuantKernel(const T* in, const T* scale,
-                                   const int bin_cnt, const int n, T* out) {
-  int bid = threadIdx.x + blockIdx.x * blockDim.x;
-  int tid = threadIdx.x;
-
-  T s = scale[0];
-  for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
-    T x = in[i];
-    T v = x > s ? s : x;
-    v = v < -s ? -s : v;
-    v = bin_cnt / s * v;
-    out[i] = round(v);
-  }
-}
-
-template <typename T>
-__global__ void ClipAndQuantDequantKernel(const T* in, const T* scale,
-                                          const int bin_cnt, const int n,
-                                          T* out) {
-  int bid = threadIdx.x + blockIdx.x * blockDim.x;
-  int tid = threadIdx.x;
-
-  T s = scale[0];
-  for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
-    T x = in[i];
-    T v = x > s ? s : x;
-    v = v < -s ? -s : v;
-    v = bin_cnt / s * v;
-    out[i] = round(v) * s / bin_cnt;
-  }
-}
-
-template <typename T>
-struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, framework::Tensor* out) {
-    int num = in.numel();
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
-
-    const T* in_data = in.data<T>();
-    const T* scale_data = scale.data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
-        in_data, scale_data, bin_cnt, num, out_data);
-  }
-};
-
-template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
-
-template <typename T>
-struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, framework::Tensor* out) {
-    int num = in.numel();
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
-
-    const T* in_data = in.data<T>();
-    const T* scale_data = scale.data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    ClipAndQuantDequantKernel<T><<<grid, block, 0, ctx.stream()>>>(
-        in_data, scale_data, bin_cnt, num, out_data);
-  }
-};
-
-template struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext,
-                                               float>;
-
-template <typename T>
-__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
-                                          const int bin_cnt, const int n,
-                                          const int c, T* out) {
-  int tid = threadIdx.x;
-
-  int channel_size = n / c;
-  const T* in_c = in + blockIdx.x * channel_size;
-  T* out_c = out + blockIdx.x * channel_size;
-
-  T s = scale[blockIdx.x];
-  for (int i = tid; i < channel_size; i += blockDim.x) {
-    T x = in_c[i];
-    T v = x > s ? s : x;
-    v = v < -s ? -s : v;
-    v = bin_cnt / s * v;
-    out_c[i] = round(v);
-  }
-}
-
-template <typename T>
-struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, const int channel,
-                  framework::Tensor* out) {
-    int num = in.numel();
-    int block = 1024;
-    int grid = channel;
-
-    const T* in_data = in.data<T>();
-    const T* scale_data = scale.data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    ChannelClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
-        in_data, scale_data, bin_cnt, num, channel, out_data);
-  }
-};
-
-template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
-                                               float>;
-
-template <typename T>
-__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
-                                            const T* last_scale,
-                                            const int64_t* iter,
-                                            const int window_size, T* scale_arr,
-                                            T* out_scale, int* need_find_max,
-                                            int* out_size) {
-  int it = iter[0];
-  int idx = it % window_size;
-  T removed = scale_arr[idx];
-  T cur = cur_scale[0];
-  scale_arr[idx] = cur;
-  T max = last_scale[0];
-  out_scale[0] = max < cur ? cur : max;
-  if (fabs(removed - max) < 1e-6) {
-    need_find_max[0] = 1;
-    out_size[0] = it > window_size ? window_size : it;
-  } else {
-    need_find_max[0] = 0;
-  }
-}
-
-template <typename T>
-struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& cur_scale,
-                  const framework::Tensor& last_scale,
-                  const framework::Tensor& iter, const int window_size,
-                  framework::Tensor* scales_arr, framework::Tensor* out_scale) {
-    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-
-    T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
-    T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
-
-    framework::Tensor need_find_max, out_size;
-    int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
-    int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
-
-    FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
-        cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
-        window_size, scale_arr, out_scale_data, find_max, out_size_data);
-
-    int g_find_max;
-    memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
-                 sizeof(int), ctx.stream());
-    ctx.Wait();
-    if (g_find_max) {
-      int len;
-      memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
-                   sizeof(int), ctx.stream());
-      ctx.Wait();
-      FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
-                                                          out_scale_data);
-    }
-  }
-};
-
-template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
-
-template <typename T>
-struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& in_accum,
-                  const framework::Tensor& in_state, const T* cur_scale,
-                  const float rate, framework::Tensor* out_state,
-                  framework::Tensor* out_accum, framework::Tensor* out_scale) {
-    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-
-    T accum;
-    T state;
-    T scale;
-    memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
-                 sizeof(T), ctx.stream());
-    memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
-                 sizeof(T), ctx.stream());
-    memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T),
-                 ctx.stream());
-    ctx.Wait();
-    state = rate * state + 1;
-    accum = rate * accum + scale;
-    scale = accum / state;
-
-    memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &accum, sizeof(T), ctx.stream());
-    memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &state, sizeof(T), ctx.stream());
-    memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &scale, sizeof(T), ctx.stream());
-    ctx.Wait();
-  }
-};
-
-template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
-                                               float>;
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
-                        ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
-REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
-                        ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
-REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
-                        ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
-REGISTER_OP_CUDA_KERNEL(
-    fake_quantize_moving_average_abs_max,
-    ops::FakeQuantizeMovingAverageAbsMaxKernel<CUDA, float>);
-REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale,
-                        ops::MovingAverageAbsMaxScaleKernel<CUDA, float>);
-REGISTER_OP_CUDA_KERNEL(
-    fake_quantize_dequantize_moving_average_abs_max,
-    ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
deleted file mode 100644
index 285947567e3603079737f1073ce5120f908238dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-struct FindAbsMaxFunctor {
-  void operator()(const DeviceContext& ctx, const T* in, const int num, T* out);
-};
-
-template <typename DeviceContext, typename T>
-struct ClipAndFakeQuantFunctor {
-  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
-                  const framework::Tensor& scale, const int bin_cnt,
-                  framework::Tensor* out);
-};
-
-template <typename DeviceContext, typename T>
-struct ClipAndFakeQuantDequantFunctor {
-  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
-                  const framework::Tensor& scale, const int bin_cnt,
-                  framework::Tensor* out);
-};
-
-template <typename DeviceContext, typename T>
-struct FindRangeAbsMaxFunctor {
-  void operator()(const DeviceContext& ctx, const framework::Tensor& cur_scale,
-                  const framework::Tensor& last_scale,
-                  const framework::Tensor& iter, const int window_size,
-                  framework::Tensor* scales_arr, framework::Tensor* out_scale);
-};
-
-template <typename DeviceContext, typename T>
-struct FindChannelAbsMaxFunctor {
-  void operator()(const DeviceContext& ctx, const T* in, const int num,
-                  const int channel, T* out);
-};
-
-template <typename DeviceContext, typename T>
-struct ChannelClipAndFakeQuantFunctor {
-  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
-                  const framework::Tensor& scale, const int bin_cnt,
-                  const int channel, framework::Tensor* out);
-};
-
-template <typename DeviceContext, typename T>
-struct FindMovingAverageAbsMaxFunctor {
-  void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
-                  const framework::Tensor& in_state,
-                  const framework::Tensor& cur_scale,
-                  framework::Tensor* out_state, framework::Tensor* out_accum,
-                  framework::Tensor* out_scale);
-};
-
-template <typename DeviceContext, typename T>
-class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* out_scale = context.Output<framework::Tensor>("OutScale");
-    T* out_s = out_scale->mutable_data<T>(context.GetPlace());
-
-    int bit_length = context.Attr<int>("bit_length");
-    int bin_cnt = std::pow(2, bit_length - 1) - 1;
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    const T* in_data = in->data<T>();
-    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in_data, in->numel(), out_s);
-    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
-                                                bin_cnt, out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<framework::Tensor>("X");
-
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* out_scale = context.Output<framework::Tensor>("OutScale");
-    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
-    out->mutable_data<T>(context.GetPlace());
-
-    int bit_length = context.Attr<int>("bit_length");
-    int bin_cnt = std::pow(2, bit_length - 1) - 1;
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    FindChannelAbsMaxFunctor<DeviceContext, T>()(
-        dev_ctx, in->data<T>(), in->numel(), in->dims()[0], out_scale_data);
-    ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
-        dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<framework::Tensor>("X");
-    auto* in_scale = context.Input<framework::Tensor>("InScale");
-
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    bool is_test = context.Attr<bool>("is_test");
-    int bit_length = context.Attr<int>("bit_length");
-    int bin_cnt = std::pow(2, bit_length - 1) - 1;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    // testing
-    if (is_test) {
-      ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
-                                                  bin_cnt, out);
-      return;
-    }
-
-    // training
-    auto* out_scale = context.Output<framework::Tensor>("OutScale");
-    auto* out_scales = context.Output<framework::Tensor>("OutScales");
-    auto* iter = context.Input<framework::Tensor>("Iter");
-
-    int window_size = context.Attr<int>("window_size");
-    out_scale->mutable_data<T>(context.GetPlace());
-
-    framework::Tensor cur_scale;
-    T* cur_scale_data = cur_scale.mutable_data<T>({1}, context.GetPlace());
-    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
-                                          cur_scale_data);
-    FindRangeAbsMaxFunctor<DeviceContext, T>()(dev_ctx, cur_scale, *in_scale,
-                                               *iter, window_size, out_scales,
-                                               out_scale);
-    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
-                                                bin_cnt, out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel<T> {
- public:
-  ~FakeMovingAverageAbsMaxKernelBase() {}
-  virtual void RunClipFunctor(const DeviceContext& dev_ctx,
-                              const framework::Tensor& in,
-                              const framework::Tensor& in_scale, int bin_cnt,
-                              framework::Tensor* out) const = 0;
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<framework::Tensor>("X");
-    auto* in_scale = context.Input<framework::Tensor>("InScale");
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    bool is_test = context.Attr<bool>("is_test");
-    int bit_length = context.Attr<int>("bit_length");
-    int bin_cnt = std::pow(2, bit_length - 1) - 1;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    // testing
-    if (is_test) {
-      RunClipFunctor(dev_ctx, *in, *in_scale, bin_cnt, out);
-      return;
-    }
-
-    // training
-    auto* in_accum = context.Input<framework::Tensor>("InAccum");
-    auto* in_state = context.Input<framework::Tensor>("InState");
-    auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
-    T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
-
-    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
-                                          cur_scale_data);
-
-    auto* out_state = context.Output<framework::Tensor>("OutState");
-    auto* out_accum = context.Output<framework::Tensor>("OutAccum");
-    auto* out_scale = context.Output<framework::Tensor>("OutScale");
-    out_state->mutable_data<T>(context.GetPlace());
-    out_accum->mutable_data<T>(context.GetPlace());
-    out_scale->mutable_data<T>(context.GetPlace());
-    float moving_rate = context.Attr<float>("moving_rate");
-
-    FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(
-        dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state,
-        out_accum, out_scale);
-
-    RunClipFunctor(dev_ctx, *in, *out_scale, bin_cnt, out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FakeQuantizeMovingAverageAbsMaxKernel
-    : public FakeMovingAverageAbsMaxKernelBase<DeviceContext, T> {
- public:
-  void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
-                      const framework::Tensor& in_scale, int bin_cnt,
-                      framework::Tensor* out) const override {
-    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, in, in_scale, bin_cnt,
-                                                out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FakeQuantizeDequantizeMovingAverageAbsMaxKernel
-    : public FakeMovingAverageAbsMaxKernelBase<DeviceContext, T> {
- public:
-  void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
-                      const framework::Tensor& in_scale, int bin_cnt,
-                      framework::Tensor* out) const override {
-    ClipAndFakeQuantDequantFunctor<DeviceContext, T>()(dev_ctx, in, in_scale,
-                                                       bin_cnt, out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out);
-
-    bool is_test = context.Attr<bool>("is_test");
-    // testing
-    if (is_test) {
-      return;
-    }
-
-    // training
-    auto* in_accum = context.Input<framework::Tensor>("InAccum");
-    auto* in_state = context.Input<framework::Tensor>("InState");
-    auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
-    T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
-
-    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
-                                          cur_scale_data);
-
-    auto* out_state = context.Output<framework::Tensor>("OutState");
-    auto* out_accum = context.Output<framework::Tensor>("OutAccum");
-    auto* out_scale = context.Output<framework::Tensor>("OutScale");
-    out_state->mutable_data<T>(context.GetPlace());
-    out_accum->mutable_data<T>(context.GetPlace());
-    out_scale->mutable_data<T>(context.GetPlace());
-    float moving_rate = context.Attr<float>("moving_rate");
-
-    FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(
-        dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state,
-        out_accum, out_scale);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
deleted file mode 100644
index da30fef555e1657d60e9493ab9e70beea838e801..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fc_op.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fc_op.h"
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class FCOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
-                      "X(Input) of Fully Connected should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Out(Output) of Fully Connected should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
-                      "W(Input) of Fully Connected should not be null.");
-
-    auto in_dims = ctx->GetInputDim("Input");
-    auto w_dims = ctx->GetInputDim("W");
-
-    if (ctx->HasInput("Bias")) {
-      auto bias_dims = ctx->GetInputDim("Bias");
-      if (bias_dims.size() == 2) {
-        PADDLE_ENFORCE_EQ(bias_dims[0], 1,
-                          "The shape of Bias must be [1, dim].");
-        PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1],
-                          "The shape of Bias must be [1, dim].");
-      } else if (bias_dims.size() == 1) {
-        PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1],
-                          "The shape of Bias must be [1, dim].");
-      }
-    }
-
-    auto& activation_type = ctx->Attrs().Get<std::string>("activation_type");
-    if (!activation_type.empty()) {
-      PADDLE_ENFORCE_EQ(activation_type, "relu",
-                        "Activation %s is not supportetd in fc now.",
-                        activation_type.c_str());
-    }
-    if (ctx->Attrs().Get<bool>("use_mkldnn")) {
-      PADDLE_ENFORCE_EQ(in_dims.size() == 2 || in_dims.size() == 4, true,
-                        "Fully Connected input should be 2-D or 4-D tensor.");
-    }
-    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
-                      "Fully Connected input should be 2-D tensor.");
-    int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
-    PADDLE_ENFORCE_GT(
-        in_dims.size(), in_num_col_dims,
-        "The input tensor Input's rank of FCOp should be larger than "
-        "in_num_col_dims.");
-
-    std::vector<int64_t> output_dims;
-    FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims);
-
-    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
-    ctx->ShareLoD("Input", "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library = framework::LibraryType::kPlain;
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-    if (ctx.Attr<bool>("use_mkldnn")) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
-    }
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   ctx.GetPlace(), layout, library);
-  }
-};
-
-void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  auto in_dims = ctx->GetInputDim("Input");
-  auto w_dims = ctx->GetInputDim("W");
-
-  if (ctx->HasOutput(framework::GradVarName("Input"))) {
-    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-  }
-  if (ctx->HasOutput(framework::GradVarName("W"))) {
-    ctx->SetOutputDim(framework::GradVarName("W"), w_dims);
-  }
-
-  if (ctx->HasInput("Bias")) {
-    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Bias")), true,
-                      "Should have bias grad");
-    auto bias_dims = ctx->GetInputDim("Bias");
-    ctx->SetOutputDim(framework::GradVarName("Bias"), bias_dims);
-  }
-}
-
-framework::OpKernelType FCOpGrad::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library = framework::LibraryType::kPlain;
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-  if (ctx.Attr<bool>("use_mkldnn")) {
-    library = framework::LibraryType::kMKLDNN;
-    layout = framework::DataLayout::kMKLDNN;
-  }
-  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                 ctx.GetPlace(), layout, library);
-}
-
-class FCOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(Tensor), The input tensor of fully connected operator.");
-    AddInput("W", "(Tensor), The weight fc op with shape (I, O).");
-    AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O")
-        .AsDispensable();
-    AddOutput("Out",
-              "(Tensor) The output tensor of fully connected operator. ");
-    AddAttr<int>("in_num_col_dims",
-                 "(int, default 1), The fc op can take tensors with more than "
-                 "two dimensions as its inputs.")
-        .SetDefault(1)
-        .EqualGreaterThan(1);
-    AddAttr<std::string>("activation_type",
-                         "Activation type used in fully connected operator.")
-        .SetDefault("");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
-                  "Skip calling InferShape() function in the runtime.")
-        .SetDefault(true);
-    AddComment(R"DOC(
-Fully Connected Operator.
-
-The fully connected operation calculates the output based on the input, weights and bias.
-The size of each dimension of the parameters checked in the infer-shape.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fc, ops::FCOp, ops::FCOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(fc_grad, ops::FCOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    fc, ops::FCOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FCOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/fc_op.cu.cc b/paddle/fluid/operators/fc_op.cu.cc
deleted file mode 100644
index 2fd33aeb1283ec7888e83dd0f3b94af24726e741..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fc_op.cu.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fc_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    fc, ops::FCOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FCOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
deleted file mode 100644
index bf08e6ba6866e3929fdbe58619507ddccb7162ad..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fc_op.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/fc.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class FCOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-inline void FCOutputSize(const framework::DDim& in_dims,
-                         const framework::DDim& w_dims,
-                         std::vector<int64_t>& out_dims,  // NOLINT
-                         int in_num_col_dims) {
-  auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
-  PADDLE_ENFORCE_EQ(
-      in_mat_dims[1], w_dims[0],
-      "Fully Connected input and weigth size do not match. %s, %s");
-
-  out_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
-  for (int i = 0; i < in_num_col_dims; ++i) {
-    out_dims.push_back(in_dims[i]);
-  }
-  out_dims.push_back(w_dims[1]);
-}
-
-template <typename DeviceContext, typename T>
-class FCOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<framework::LoDTensor>("Input");
-    auto* w = ctx.Input<Tensor>("W");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* output = ctx.Output<framework::LoDTensor>("Out");
-    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
-    bool with_relu =
-        (ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
-
-    auto w_dims = w->dims();
-
-    std::vector<int64_t> output_dims;
-    FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims);
-    output->Resize(framework::make_ddim(output_dims));
-    output->set_lod(input->lod());
-
-    auto out_dims = output->dims();
-    int M = framework::product(out_dims) / w_dims[1];
-
-    const T* input_data = input->data<T>();
-    const T* w_data = w->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    math::FCFunctor<DeviceContext, T> fc;
-    fc(dev_ctx, M, w_dims[1], w_dims[0], input_data, w_data, output_data,
-       bias ? bias->data<T>() : NULL, with_relu);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc
deleted file mode 100644
index 43a71c917267b72d97a915c6fe52daf8ab4b9510..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_any_like_op.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_any_like_op.h"
-
-namespace paddle {
-namespace operators {
-
-class FillAnyLikeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of FillAnyLikeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of FillAnyLikeOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class FillAnyLikeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of fill-zeros-like op.");
-    AddOutput("Out", "The variable will be filled up with specified value.");
-    AddAttr<float>("value", "The filled value").SetDefault(0.0);
-    AddComment(R"DOC(
-FillAnyLike Operator.
-
-Fill up a variable with Attr(value).
-The output will have the same shape and dtype as the input.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fill_any_like, ops::FillAnyLikeOp,
-                             ops::FillAnyLikeOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    fill_any_like,
-    ops::FillAnyLikeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::FillAnyLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::FillAnyLikeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FillAnyLikeKernel<paddle::platform::CPUDeviceContext,
-                           paddle::platform::float16>,
-    ops::FillAnyLikeKernel<paddle::platform::CPUDeviceContext, bool>);
diff --git a/paddle/fluid/operators/fill_any_like_op.cu b/paddle/fluid/operators/fill_any_like_op.cu
deleted file mode 100644
index 26b215d1e7f64d3071eee3d50a91b451d5479c9a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_any_like_op.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/fill_any_like_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    fill_any_like,
-    ops::FillAnyLikeKernel<paddle::platform::CUDADeviceContext, int32_t>,
-    ops::FillAnyLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::FillAnyLikeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FillAnyLikeKernel<paddle::platform::CUDADeviceContext,
-                           paddle::platform::float16>,
-    ops::FillAnyLikeKernel<paddle::platform::CUDADeviceContext, bool>);
diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h
deleted file mode 100644
index f0b6bcb16e20e26512e04c8cc2216d441057db52..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_any_like_op.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cmath>
-#include <limits>
-#include <type_traits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class FillAnyLikeKernel : public framework::OpKernel<T> {
- public:
-  using CommonType = typename std::common_type<
-      float,
-      typename std::conditional<std::is_same<T, platform::float16>::value,
-                                float, T>::type>::type;
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    // TODO(fangzeyang): Once context.Attribute supports double dtype, this
-    // kernel should be updated to support double dtype, too.
-    float value = context.Attr<float>("value");
-
-    auto common_type_value = static_cast<CommonType>(value);
-
-    PADDLE_ENFORCE(
-        (common_type_value >=
-         static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
-            (common_type_value <=
-             static_cast<CommonType>(std::numeric_limits<T>::max())),
-        "filled value is out of range for targeted type in fill_any_like "
-        "kernel");
-
-    PADDLE_ENFORCE(!std::isnan(value), "filled value is NaN");
-
-    math::SetConstant<DeviceContext, T> setter;
-    setter(context.template device_context<DeviceContext>(), out,
-           static_cast<T>(value));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
deleted file mode 100644
index b8921b171cf37be17fb62d270a5c22f9d1806c64..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
-#include "paddle/fluid/operators/batch_size_like.h"
-
-namespace paddle {
-namespace operators {
-
-class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
- protected:
-  using BatchSizeLikeOp::BatchSizeLikeOp;
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.device_context());
-  }
-};
-
-class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
- protected:
-  void Apply() override {
-    AddAttr<int>(
-        "dtype",
-        "It could be numpy.dtype. Output data type. Default is float32")
-        .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<float>("value", "default 0. The value to be filled")
-        .SetDefault(0.0f);
-    AddComment(R"DOC(
-This function creates a tensor of specified *shape*, *dtype* and batch size,
-and initializes this with a constant supplied in *value*. The batch size is
-obtained from the `input` tensor.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fill_constant_batch_size_like,
-                  ops::FillConstantBatchSizeLikeOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::FillConstantBatchSizeLikeOpMaker,
-                  ops::BatchSizeLikeNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(
-    fill_constant_batch_size_like,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           double>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           int>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           int64_t>);
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
deleted file mode 100644
index 2cbbd05bfbb5dfea161cc69bc32ecfc77e7d8f18..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    fill_constant_batch_size_like,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           double>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           int>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           int64_t>);
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
deleted file mode 100644
index 63ea60678f80708f5a8340edd22588553b9ec139..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto* in = ctx.Input<framework::LoDTensor>("Input");
-    if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
-      // set the correct batch size for the LoDTensor.
-      auto odims = out->dims();
-      int output_dim_idx = ctx.Attr<int>("output_dim_idx");
-      odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
-      out->mutable_data<T>(odims, ctx.GetPlace());
-    }
-    out->mutable_data<T>(ctx.GetPlace());
-    auto value = ctx.Attr<float>("value");
-
-    math::SetConstant<DeviceContext, T> setter;
-    setter(ctx.template device_context<DeviceContext>(), out,
-           static_cast<T>(value));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
deleted file mode 100644
index cf2f4776cf2ae9a707d3b841c2a41b7f82ca7833..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_constant_op.h"
-
-namespace paddle {
-namespace operators {
-
-class FillConstantOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of FillConstantOp should not be null.");
-    auto& shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
-    ctx->SetOutputDim("Out", framework::make_ddim(shape));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
-  }
-};
-
-class FillConstantOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(ctx->GetAttr("dtype")));
-    auto& out_var_name = ctx->Output("Out").front();
-    ctx->SetDataType(out_var_name, data_type);
-  }
-};
-
-class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
-        .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<std::vector<int64_t>>("shape",
-                                  "(vector<int64_t>) The shape of the output");
-    AddAttr<float>("value", "(float, default 0) The value to be filled")
-        .SetDefault(0.0f);
-    AddAttr<bool>("force_cpu",
-                  "(bool, default false) Force fill output variable to cpu "
-                  "memory. Otherwise, fill output variable to the running "
-                  "device")
-        .SetDefault(false);
-    AddOutput("Out",
-              "(Tensor) Tensor of specified shape will be filled "
-              "with the specified value");
-    AddComment(R"DOC(
-FillConstantBatchSizeLike Operator.
-
-Fill up a variable with specified constant value.
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
-                  ops::FillConstantOpVarTypeInference,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel<float>,
-                       ops::FillConstantKernel<double>,
-                       ops::FillConstantKernel<int64_t>,
-                       ops::FillConstantKernel<int>,
-                       ops::FillConstantKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc
deleted file mode 100644
index 77027b5a87d4ab4c16a1b850d1d38c929f68dcf3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_constant_op.cu.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_constant_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel<float>,
-                        ops::FillConstantKernel<double>,
-                        ops::FillConstantKernel<int64_t>,
-                        ops::FillConstantKernel<int>,
-                        ops::FillConstantKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h
deleted file mode 100644
index 417c5b4da611c7f0c2a4408d0eb3c69e56208f47..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_constant_op.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-template <typename T>
-class FillConstantKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto data_type =
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
-    auto value = ctx.Attr<float>("value");
-    auto force_cpu = ctx.Attr<bool>("force_cpu");
-
-    framework::Tensor *tensor = nullptr;
-
-    framework::Variable *out_var = ctx.OutputVar("Out");
-
-    if (out_var->IsType<framework::LoDTensor>()) {
-      tensor = out_var->GetMutable<framework::LoDTensor>();
-      tensor->Resize(
-          framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
-      tensor->Resize(
-          framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
-    } else {
-      PADDLE_THROW(
-          "fill constant op's output only"
-          "supports SelectedRows and LoDTensor");
-    }
-
-    if (force_cpu) {
-      tensor->mutable_data(platform::CPUPlace(), data_type);
-    } else {
-      tensor->mutable_data(ctx.GetPlace(), data_type);
-    }
-
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(ctx.GetPlace());
-    math::set_constant(dev_ctx, tensor, value);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
deleted file mode 100644
index 4f7cfcf112a0595641b16447b417cbe86db31120..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_op.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class FillOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddComment(R"DOC(Fill operator
-
-Fill an tensor with `value` and `shape`. The type of the tensor is specify by
-`dtype`.
-)DOC");
-    AddOutput("Out", "(LoDTensor) The output tensor.");
-    AddAttr<std::vector<float>>(
-        "value", "The float values of tensor, which are flatten in row major");
-    AddAttr<std::vector<int>>("shape", "The shape of output tensor");
-    AddAttr<int>("dtype", "The data type of output tensor, Default is float")
-        .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<bool>("force_cpu",
-                  "Whether the output tensor must be at CPU memory or not. "
-                  "Default is false.")
-        .SetDefault(false);
-  }
-};
-
-class FillOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* context) const override {
-    PADDLE_ENFORCE_EQ(context->HasOutput("Out"), true,
-                      "Output(Out) of FillOp should not be null.");
-    auto& shape = context->Attrs().Get<std::vector<int>>("shape");
-    context->SetOutputDim("Out", framework::make_ddim(shape));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
-  }
-};
-
-class FillOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(ctx->GetAttr("dtype")));
-    auto& out_var_name = ctx->Output("Out").front();
-    ctx->SetDataType(out_var_name, data_type);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpMaker,
-                  ops::FillOpVarTypeInference,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(fill, ops::FillKernel<float>, ops::FillKernel<double>,
-                       ops::FillKernel<int64_t>, ops::FillKernel<int>,
-                       ops::FillKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fill_op.cu.cc b/paddle/fluid/operators/fill_op.cu.cc
deleted file mode 100644
index fdef8ab2a17080bdd204e3ab5ae83d4107957fc5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_op.cu.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(fill, ops::FillKernel<float>, ops::FillKernel<double>,
-                        ops::FillKernel<int64_t>, ops::FillKernel<int>,
-                        ops::FillKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fill_op.h b/paddle/fluid/operators/fill_op.h
deleted file mode 100644
index fa2d5b858d95bcafbdcbf975dea1e183444bf118..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_op.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-
-#include <algorithm>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-
-namespace paddle {
-namespace operators {
-
-struct FillOpVisitor {
-  FillOpVisitor(framework::LoDTensor *tensor, const std::vector<float> &value)
-      : tensor_(tensor), value_(value) {}
-
-  template <typename T>
-  void apply() const {
-    platform::CPUPlace cpu;
-    auto *data = tensor_->mutable_data<T>(cpu);
-    std::transform(value_.data(), value_.data() + tensor_->numel(), data,
-                   [](float dat) { return static_cast<T>(dat); });
-  }
-
-  framework::LoDTensor *tensor_;
-  const std::vector<float> &value_;
-};
-
-template <typename T>
-class FillKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto &out =
-        detail::Ref(ctx.Output<framework::LoDTensor>("Out"),
-                    "Cannot get output lod tensor Out, variable name = %s",
-                    ctx.op().Output("Out"));
-    out.Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));
-    auto dtype =
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
-    platform::CPUPlace cpu;
-    auto force_cpu = ctx.Attr<bool>("force_cpu");
-    out.mutable_data(force_cpu ? cpu : ctx.GetPlace(), dtype);
-
-    framework::LoDTensor tensor;
-
-    if (force_cpu || platform::is_cpu_place(ctx.GetPlace())) {
-      tensor.ShareDataWith(out);
-    } else {
-      // Always make tensor in CPU memory.
-      tensor.Resize(out.dims());
-      tensor.mutable_data(cpu, dtype);
-    }
-
-    framework::VisitDataType(
-        dtype, FillOpVisitor(&tensor, ctx.Attr<std::vector<float>>("value")));
-
-    if (!force_cpu && platform::is_gpu_place(ctx.GetPlace())) {
-      // Copy tensor to out
-      framework::TensorCopy(
-          tensor, ctx.GetPlace(),
-          ctx.template device_context<platform::DeviceContext>(), &out);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc
deleted file mode 100644
index 107f83e3f885bcd5a71aaf1e51cbd0bd39b676f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_zeros_like_op.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_zeros_like_op.h"
-
-namespace paddle {
-namespace operators {
-
-class FillZerosLikeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of FillZerosLikeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of FillZerosLikeOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of fill-zeros-like op.");
-    AddOutput("Out", "The variable will be filled up with zeros.");
-    ExtraMake();
-    AddComment(R"DOC(
-FillZerosLike Operator.
-
-Fill up a variable with zeros.
-The output will have the same size as the input.
-
-)DOC");
-  }
-
- protected:
-  virtual void ExtraMake() {}
-};
-
-class FillZerosLikeOp2 : public FillZerosLikeOp {
- public:
-  using FillZerosLikeOp::FillZerosLikeOp;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
-  }
-};
-
-class FillZerosLikeOp2Maker : public FillZerosLikeOpMaker {
- protected:
-  void ExtraMake() override {
-    this->AddAttr<int>("dtype",
-                       "(int, default 5(FP32)) "
-                       "Output data type.")
-        .SetDefault(framework::proto::VarType::FP32);
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(FillZerosLikeOp2NoNeedBufferVarsInference,
-                                      "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp,
-                             ops::FillZerosLikeOpMaker);
-
-REGISTER_OPERATOR(fill_zeros_like2, ops::FillZerosLikeOp2,
-                  ops::FillZerosLikeOp2Maker,
-                  ops::FillZerosLikeOp2NoNeedBufferVarsInference,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    fill_zeros_like,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
-
-REGISTER_OP_CPU_KERNEL(
-    fill_zeros_like2,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
deleted file mode 100644
index 1831635def79b3ccb713dbc14cc70b8beeb609fc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_zeros_like_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    fill_zeros_like,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::float16>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
-
-REGISTER_OP_CUDA_KERNEL(
-    fill_zeros_like2,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::float16>,
-    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h
deleted file mode 100644
index 4bbe0df6b6890122381c87494e510cf125792377..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_zeros_like_op.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class FillZerosLikeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    math::SetConstant<DeviceContext, T> setter;
-    setter(context.template device_context<DeviceContext>(), out,
-           static_cast<T>(0));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc
deleted file mode 100644
index ebf44e5b9a5b3d0fe421a6d512f70f74a4146d56..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/filter_by_instag_op.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/filter_by_instag_op.h"
-
-#include <memory>
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-
-namespace paddle {
-namespace operators {
-class FilterByInstagOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Ins"), true,
-                      "Input(Ins) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Ins_tag"), true,
-                      "Input(Ins_tag) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Filter_tag"), true,
-                      "Input(Filter_tag) should be not null.");
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("LossWeight"), true,
-                      "Output(LossWeight) shoudl not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("IndexMap"), true,
-                      "Output(IndexMap) should be not null.");
-
-    auto x1_dims = ctx->GetInputDim("Ins");  // batch_size * vec
-
-    ctx->SetOutputDim("Out", framework::make_ddim({-1, x1_dims[1]}));
-    ctx->SetOutputDim("LossWeight", framework::make_ddim({-1, 1}));
-    ctx->SetOutputDim("IndexMap", framework::make_ddim({-1, 2}));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Ins"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class FilterByInstagOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ins", "(LoDTensor) embeded tensor");
-    AddInput("Ins_tag", "(LoDTensor) ins tag list");
-    AddInput("Filter_tag", "(1D Tensor) filter tag list");
-    AddAttr<bool>("is_lod", "is Ins with LoD info or not, default True");
-    AddOutput("Out", "(LoDTensor) embeded tensor filtered by instag");
-    AddOutput("LossWeight", "(Tensor) loss weight.");
-    AddOutput("IndexMap", "(LoDTensor) mapping from Out rows to X1 rows");
-    AddComment(R"DOC(
-Filter By Instag Op 
-
-This operator is used to filter embeded ins.
-
-There are 3 inputs. First is embeded ins, Second is tags for ins, 
-Third is tags to filter.
-
-There are 3 outputs. First is filtered embeded ins, Second is Loss Weight,
-Third is the IndexMap from Out line number to X1 line number. 
-)DOC");
-  }
-};
-
-class FilterByInstagOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("IndexMap"), true,
-                      "Input(IndexMap) should be not null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Grad Input(Out) should be not null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Ins"), true,
-                      "Input(Ins) should be not null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("LossWeight"), true,
-                      "Input(LossWeight) should be not null");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Ins")), true,
-                      "Grad Output(Ins) should be not null");
-
-    auto grad_out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto x1_dims = ctx->GetInputDim("Ins");
-    ctx->SetOutputDim(framework::GradVarName("Ins"),
-                      framework::make_ddim({x1_dims[0], grad_out_dims[1]}));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(
-        ctx.InputVar(framework::GradVarName("Out")));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class FilterByInstagGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("filter_by_instag_grad");
-    op->SetInput("IndexMap", Output("IndexMap"));
-    op->SetInput("Ins", Input("Ins"));
-    op->SetAttrMap(Attrs());
-    op->SetInput("LossWeight", Output("LossWeight"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("Ins"), InputGrad("Ins"));
-    return op;
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(filter_by_instag, ops::FilterByInstagOp,
-                  ops::FilterByInstagOpMaker,
-                  ops::FilterByInstagGradOpDescMaker);
-
-REGISTER_OPERATOR(filter_by_instag_grad, ops::FilterByInstagOpGrad);
-
-REGISTER_OP_CPU_KERNEL(filter_by_instag, ops::FilterByInstagKernel<float>,
-                       ops::FilterByInstagKernel<double>,
-                       ops::FilterByInstagKernel<int32_t>,
-                       ops::FilterByInstagKernel<int64_t>);
-
-REGISTER_OP_CPU_KERNEL(filter_by_instag_grad,
-                       ops::FilterByInstagGradKernel<float>,
-                       ops::FilterByInstagGradKernel<double>,
-                       ops::FilterByInstagGradKernel<int32_t>,
-                       ops::FilterByInstagGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h
deleted file mode 100644
index f082d0dfc1273cba4ef9b400022c2ba15f164cec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/filter_by_instag_op.h
+++ /dev/null
@@ -1,201 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstring>
-#include <random>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using SelectedRows = framework::SelectedRows;
-using LoDTensor = framework::LoDTensor;
-#if defined(PADDLE_WITH_CUDA)
-template <typename T>
-using Vector = framework::Vector<T>;
-#else
-template <typename T>
-using Vector = framework::CPUVector<T>;
-#endif
-
-template <typename T>
-class FilterByInstagKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // X1 is global FC output
-    // Dim [batch size, embedding size]
-    auto* x1 = context.Input<LoDTensor>("Ins");
-    bool is_x1_lod = context.Attr<bool>("is_lod");
-    // X2 is ins tag list
-    // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... ]]
-    auto* x2 = context.Input<LoDTensor>("Ins_tag");
-    // X3 is local fc tag list
-    // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]]
-    auto* x3 = context.Input<Tensor>("Filter_tag");
-
-    std::unordered_set<int64_t> filter_tag;
-    auto* x3_data = x3->data<int64_t>();
-    size_t len = x3->dims()[0];
-    for (size_t i = 0; i < len; i++) {
-      filter_tag.insert(x3_data[i]);
-    }
-
-    // expected auto = const int64_t
-    auto* x2_data = x2->data<int64_t>();
-    // e.g get [0, 1, 2, 3, ...]
-    auto x2_lods = x2->lod()[0];
-    Vector<size_t> x1_lods(1, 0);
-    if (!is_x1_lod) {
-      for (size_t i = 0; i < x1->dims()[0]; i++) {
-        x1_lods.push_back(i + 1);
-      }
-    } else {
-      x1_lods = context.Input<LoDTensor>("Ins")->lod()[0];
-    }
-    std::unordered_map<int64_t, int64_t> mmap_aux;
-    Vector<size_t> out_lods(1, 0);
-    for (size_t i = 0; i < x2_lods.size() - 1; i++) {
-      for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) {
-        if (filter_tag.find(x2_data[j]) != filter_tag.end()) {
-          size_t batch_len = x1_lods[i + 1] - x1_lods[i];
-          mmap_aux[out_lods.back()] = x1_lods[i];
-          out_lods.push_back(out_lods.back() + batch_len);
-          break;
-        }
-      }
-    }
-    // set output value
-    // for those whose ins been dropout, set 0 for whole lines.
-    // otherwise, copy whole line
-    // Dim [local fc count, batch size, embedding size]
-    LoDTensor* out = context.Output<LoDTensor>("Out");
-    LoDTensor* map = context.Output<LoDTensor>("IndexMap");
-    LoDTensor* loss_weight = context.Output<LoDTensor>("LossWeight");
-    // expected auto = const T
-    auto* x1_data = x1->data<T>();
-    // expected auto = T
-    size_t x1_embed_size = x1->dims()[1];
-    if (out_lods.size() - 1 > 0) {
-      out->Resize(framework::make_ddim(
-          {(int64_t)out_lods.back(), (int64_t)x1_embed_size}));
-      map->Resize(framework::make_ddim({(int64_t)out_lods.size() - 1, 3}));
-      loss_weight->Resize(
-          framework::make_ddim({(int64_t)out_lods.size() - 1, 1}));
-    } else {
-      out->Resize(framework::make_ddim({1, (int64_t)x1_embed_size}));
-      map->Resize(framework::make_ddim({1, 3}));
-      loss_weight->Resize(framework::make_ddim({1, 1}));
-    }
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    auto* map_data = map->mutable_data<int64_t>(context.GetPlace());
-    auto* loss_weight_data =
-        loss_weight->mutable_data<float>(context.GetPlace());
-    if (out_lods.size() - 1 > 0) {
-      Vector<size_t> map_lods;
-      for (size_t i = 0; i < out_lods.size() - 1; i++) {
-        map_data[i * 3] = (int64_t)out_lods[i];
-        map_data[i * 3 + 1] = mmap_aux[map_data[i * 3]];
-        map_data[i * 3 + 2] = out_lods[i + 1] - out_lods[i];
-        map_lods.push_back(i);
-      }
-      map_lods.push_back(out_lods.size() - 1);
-      std::vector<Vector<size_t>> map_lod_info;
-      map_lod_info.push_back(map_lods);
-
-      map->set_lod(map_lod_info);
-      loss_weight->set_lod(map_lod_info);
-      std::vector<Vector<size_t>> out_lod_info;
-      out_lod_info.push_back(out_lods);
-      out->set_lod(out_lod_info);
-      memset(out_data, 0, out->numel() * sizeof(T));
-      for (size_t i = 0; i < loss_weight->numel(); i++) {
-        loss_weight_data[i] = 1;
-      }
-
-      for (size_t i = 0; i < out_lods.size() - 1; i++) {
-        size_t pos = out_lods[i];
-        for (size_t k = map_data[i * 3 + 1];
-             k < map_data[i * 3 + 1] + map_data[i * 3 + 2]; k++) {
-          memcpy(out_data + pos * x1_embed_size, x1_data + k * x1_embed_size,
-                 x1_embed_size * sizeof(T));
-          ++pos;
-        }
-      }
-    } else {
-      Vector<size_t> map_lods;
-      map_data[0] = 0;
-      map_data[1] = 1;
-      map_data[2] = 1;
-      map_lods.push_back(0);
-      map_lods.push_back(1);
-      out_lods.push_back(1);
-      std::vector<Vector<size_t>> map_lod_info;
-      map_lod_info.push_back(map_lods);
-      map->set_lod(map_lod_info);
-      loss_weight->set_lod(map_lod_info);
-      std::vector<Vector<size_t>> out_lod_info;
-      out_lod_info.push_back(out_lods);
-      out->set_lod(out_lod_info);
-      memset(out_data, 0, out->numel() * sizeof(T));
-      loss_weight_data[0] = 0;
-    }
-  }
-};
-
-template <typename T>
-class FilterByInstagGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* output_grad = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* x1_grad = context.Output<LoDTensor>(framework::GradVarName("Ins"));
-    auto* loss_weight = context.Input<LoDTensor>("LossWeight");
-    auto* mmap = context.Input<LoDTensor>("IndexMap");
-    auto* x1 = context.Input<LoDTensor>("Ins");
-    x1_grad->set_lod(context.Input<LoDTensor>("Ins")->lod());
-    x1_grad->Resize(x1->dims());
-    auto mmap_data = mmap->data<int64_t>();
-    // expected auto = T
-    auto* output_grad_data = output_grad->data<T>();
-
-    auto* loss_weight_data = loss_weight->data<float>();
-    // expected auto = T
-    auto* x1_grad_data = x1_grad->mutable_data<T>(context.GetPlace());
-    memset(x1_grad_data, 0, x1->dims()[0] * x1->dims()[1] * sizeof(T));
-    if (loss_weight->numel() != 1 || loss_weight_data[0] != 0) {
-      auto output_dims = output_grad->dims();
-      for (size_t i = 0; i < mmap->dims()[0]; i++) {
-        int src_ln = mmap_data[i * 3], dst_ln = mmap_data[i * 3 + 1];
-        int line_cnt = mmap_data[i * 3 + 2];
-        for (size_t l = 0; l < line_cnt; l++) {
-          for (size_t j = 0; j < output_dims[1]; j++) {
-            x1_grad_data[(dst_ln + l) * output_dims[1] + j] =
-                output_grad_data[(src_ln + l) * output_dims[1] + j];
-          }
-        }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
deleted file mode 100644
index 9f2a122203bf9bed2d8737dc2056b16b4d7b7b8e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/flatten_op.cc
+++ /dev/null
@@ -1,275 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/flatten_op.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class FlattenOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input (X) of Flatten op should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output (Output) of Flatten op should not be null.");
-    const auto &axis = ctx->Attrs().Get<int>("axis");
-    const auto &in_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(axis, 0,
-                      "The axis should be greater than or equal to 0.");
-    PADDLE_ENFORCE_LE(
-        axis, in_dims.size(),
-        "The axis should be less than or equal to input tensor's rank.");
-
-    const auto &out_dims = GetOutputShape(axis, in_dims);
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
-    if (in_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", "Out");
-    }
-  }
-
-  static std::vector<int32_t> GetOutputShape(const int axis,
-                                             const framework::DDim &in_dims) {
-    int64_t outer = 1, inner = 1;
-    for (int i = 0; i < in_dims.size(); ++i) {
-      if (i < axis) {
-        outer *= in_dims[i];
-      } else {
-        inner *= in_dims[i];
-      }
-    }
-    std::vector<int32_t> out_shape(2);
-    out_shape[0] = outer;
-    out_shape[1] = inner;
-    return out_shape;
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class FlattenOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) A tensor of rank >= axis.");
-    AddOutput("Out",
-              "A 2D tensor is reshaped input tensor. The input dimensions"
-              "up to axis are flattened to the outer dimension of the output"
-              "and the remaining input dimensions are flattened into the inner"
-              "dimension of the output.");
-    AddAttr<int>("axis",
-                 "(int)"
-                 "Indicate up to which input dimensions (exclusive) should be"
-                 "flattened to the outer dimension of the output. The value"
-                 "for axis must be in the range [0, R], where R is the rank of"
-                 "the input tensor. When axis = 0, the shape of the output"
-                 "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the"
-                 "input tensor is (d_0, d_1, ... d_n).")
-        .SetDefault(1);
-    AddComment(R"DOC(
-Flatten Operator
-
-Flattens the input tensor into a 2D matrix.
-
-Examples:
-Case 1:
-  Given
-    X.shape = (3, 100, 100, 4)
-  and
-    axis = 2
-  We get:
-    Out.shape = (3 * 100, 4 * 100)
-
-Case 2:
-  Given
-    X.shape = (3, 100, 100, 4)
-  and
-    axis = 0
-  We get:
-    Out.shape = (1, 3 * 100 * 100 * 4)
-)DOC");
-  }
-};
-
-class FlattenGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *context) const override {
-    context->SetOutputDim(framework::GradVarName("X"),
-                          context->GetInputDim("X"));
-    context->ShareLoD("X", framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten,
-// the XShape is used to carry the shape and lod of X which will be used in
-// flatten_grad, in this way, the framework can reuse the memory of X
-// immediately the flatten2_op is finished.
-// Considering compatibility issues, we could not fix flatten2_op
-class Flatten2Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input (X) of Flatten op should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output (Output) of Flatten op should not be null.");
-    const auto &axis = ctx->Attrs().Get<int>("axis");
-    const auto &in_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(axis, 0,
-                      "The axis should be greater than or equal to 0.");
-    PADDLE_ENFORCE_LE(
-        axis, in_dims.size(),
-        "The axis should be less than or equal to input tensor's rank.");
-
-    const auto &out_dims = FlattenOp::GetOutputShape(axis, in_dims);
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
-    if (in_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", "Out");
-    }
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true,
-                      "Output (XShape) of Flatten op should not be null.");
-    std::vector<int64_t> xshape_dims(in_dims.size() + 1);
-    xshape_dims[0] = 0;
-    for (int i = 0; i < in_dims.size(); ++i) {
-      xshape_dims[i + 1] = in_dims[i];
-    }
-    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
-    ctx->ShareLoD("X", "XShape");
-  }
-};
-
-class Flatten2OpMaker : public FlattenOpMaker {
- public:
-  void Make() override {
-    FlattenOpMaker::Make();
-    AddOutput("XShape",
-              "XShape is just used to store the shape and lod of X, which will "
-              "be used in FlattenGradOp.")
-        .AsIntermediate();
-  }
-};
-
-class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("flatten2_grad");
-    grad_op->SetInput("XShape", Output("XShape"));
-    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-class Flatten2GradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE_EQ(context->HasInput("XShape"), true,
-                      "Input(XShape) shouldn't be null.");
-    PADDLE_ENFORCE_EQ(context->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) shouldn't be null.");
-    auto xshape_dims = context->GetInputDim("XShape");
-    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
-    context->SetOutputDim(framework::GradVarName("X"), x_dims);
-    context->ShareLoD("XShape", framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInToOut, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceinToOut,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>,
-                  ops::FlattenOpInplaceInToOut);
-REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp,
-                  ops::FlattenGradInplaceinToOut);
-
-REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
-                  ops::Flatten2GradOpMaker, ops::FlattenOpInplaceInToOut);
-REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
-                  ops::FlattenGradInplaceinToOut);
-
-REGISTER_OP_CPU_KERNEL(
-    flatten, ops::FlattenKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FlattenKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::FlattenKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::FlattenKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::FlattenKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    flatten_grad,
-    ops::FlattenGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FlattenGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::FlattenGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::FlattenGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::FlattenGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    flatten2, ops::Flatten2Kernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Flatten2Kernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Flatten2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Flatten2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Flatten2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    flatten2_grad,
-    ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/flatten_op.cu.cc b/paddle/fluid/operators/flatten_op.cu.cc
deleted file mode 100644
index ac4ad8e2dc1c09f5ee9f0adfb8b19e0e4ec374a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/flatten_op.cu.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/flatten_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    flatten, ops::FlattenKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FlattenKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::FlattenKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::FlattenKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::FlattenKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    flatten_grad,
-    ops::FlattenGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FlattenGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::FlattenGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::FlattenGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::FlattenGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    flatten2, ops::Flatten2Kernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Flatten2Kernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Flatten2Kernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Flatten2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Flatten2Kernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    flatten2_grad,
-    ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
deleted file mode 100644
index 165832c0e68bdef38f0382ea29f7655a18345805..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/flatten_op.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class FlattenKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *in = context.Input<framework::LoDTensor>("X");
-    auto *out = context.Output<framework::LoDTensor>("Out");
-
-    auto &axes = context.Attr<int>("axis");
-    auto x_dims = in->dims();
-    auto out_dims = framework::make_ddim(GetOutputShape(axes, x_dims));
-
-    out->mutable_data(context.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, context.GetPlace(),
-        context.template device_context<platform::DeviceContext>(), out);
-    out->Resize(out_dims);
-  }
-
-  static std::vector<int32_t> GetOutputShape(const int axis,
-                                             const framework::DDim &in_dims) {
-    int64_t outer = 1, inner = 1;
-    for (int i = 0; i < in_dims.size(); ++i) {
-      if (i < axis) {
-        outer *= in_dims[i];
-      } else {
-        inner *= in_dims[i];
-      }
-    }
-    std::vector<int32_t> out_shape(2);
-    out_shape[0] = outer;
-    out_shape[1] = inner;
-    return out_shape;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FlattenGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto *d_out =
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto in_dims = ctx.Input<framework::LoDTensor>("X")->dims();
-
-    d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
-    d_x->Resize(in_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Flatten2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto &axes = context.Attr<int>("axis");
-
-    auto *in = context.Input<framework::LoDTensor>("X");
-    auto x_dims = in->dims();
-
-    auto *out = context.Output<framework::LoDTensor>("Out");
-
-    auto out_dims = framework::make_ddim(
-        FlattenKernel<DeviceContext, T>::GetOutputShape(axes, x_dims));
-
-    out->mutable_data(context.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, context.GetPlace(),
-        context.template device_context<platform::DeviceContext>(), out);
-    out->Resize(out_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Flatten2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto *d_out =
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-
-    auto xshape_dims = ctx.Input<framework::LoDTensor>("XShape")->dims();
-    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
-
-    d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
-    d_x->Resize(x_dims);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc
deleted file mode 100644
index fbe8e56a6160219175bd573a2ff186eb35e56fdf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fsp_op.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fsp_op.h"
-
-namespace paddle {
-namespace operators {
-
-class FSPOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FSPOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of FSPOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of FSPOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    PADDLE_ENFORCE(
-        x_dims.size() == 4,
-        "The Input(X) must have shape [batch_size, channel, height, width].");
-    PADDLE_ENFORCE(
-        y_dims.size() == 4,
-        "The Input(Y) must have shape [batch_size, channel, height, width].");
-    PADDLE_ENFORCE(
-        (x_dims[2] == y_dims[2]) && (x_dims[3] == y_dims[3]),
-        "The Input(X) and Input(Y) should have the same height and width.");
-
-    ctx->SetOutputDim("Out", {x_dims[0], x_dims[1], y_dims[1]});
-    ctx->ShareLoD("X", "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context(), layout_, library_);
-  }
-};
-
-class FSPOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) The input of FSP op with shape [batch_size, x_channel, "
-             "height, width]");
-    AddInput("Y",
-             "(Tensor) The input of FSP op with shape"
-             "[batch_size, y_channel, height, width]."
-             "The y_channel can be different with the x_channel of Input(X)"
-             " while the other dimensions must be the same with Input(X)'s.");
-    AddOutput(
-        "Out",
-        "(Tensor) The output of FSP op with shape "
-        "[batch_size, x_channel, y_channel]. The x_channel is the channel "
-        "of Input(X) and the y_channel is the channel of Input(Y).");
-    AddComment(R"DOC(
-    This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
-    Given feature map x with shape [x_channel, h, w] and feature map y with shape
-    [y_channel, h, w], we can get the fsp matrix of x and y in two steps:
-
-        step 1: reshape x into matrix with shape [x_channel, h * w] and reshape and
-                transpose y into matrix with shape [h * w, y_channel]
-        step 2: multiply x and y to get fsp matrix with shape [x_channel, y_channel]
-
-    The output is a batch of fsp matrices.
-    )DOC");
-  }
-};
-
-class FSPOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, y_dims);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fsp, ops::FSPOp, ops::FSPOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    fsp, ops::FSPOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FSPOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    fsp_grad, ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/fsp_op.cu b/paddle/fluid/operators/fsp_op.cu
deleted file mode 100644
index 4fd7ba04ff9af1806963427ad58c68fc216e82ac..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fsp_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/fsp_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(fsp, ops::FSPOpKernel<plat::CUDADeviceContext, float>,
-                        ops::FSPOpKernel<plat::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(fsp_grad,
-                        ops::FSPGradOpKernel<plat::CUDADeviceContext, float>,
-                        ops::FSPGradOpKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h
deleted file mode 100644
index 544af2b7d9b9729fe5dce08793da6c983fbcc6fa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fsp_op.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class FSPOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Y");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-    auto x_dims = x->dims();
-    auto y_dims = y->dims();
-
-    auto batch_size = x_dims[0];
-    auto x_channel = x_dims[1];
-    auto y_channel = y_dims[1];
-    auto height = x_dims[2];
-    auto width = x_dims[3];
-
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-
-    math::MatDescriptor x_mat_desc;
-    x_mat_desc.height_ = x_channel;
-    x_mat_desc.width_ = height * width;
-    x_mat_desc.batch_size_ = batch_size;
-    x_mat_desc.stride_ = x_channel * height * width;
-
-    math::MatDescriptor y_mat_desc;
-    y_mat_desc.height_ = height * width;
-    y_mat_desc.width_ = y_channel;
-    y_mat_desc.batch_size_ = batch_size;
-    y_mat_desc.stride_ = y_channel * height * width;
-    y_mat_desc.trans_ = true;
-
-    blas.MatMul(*x, x_mat_desc, *y, y_mat_desc,
-                static_cast<T>(1.0 / (height * width)), output,
-                static_cast<T>(0.0));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FSPGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_y = context.Output<Tensor>(framework::GradVarName("Y"));
-    if (d_x == nullptr && d_y == nullptr) {
-      return;
-    }
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto d_out_dims = d_out->dims();
-    auto batch_size = d_out_dims[0];
-    auto x_channel = d_out_dims[1];
-    auto y_channel = d_out_dims[2];
-    int64_t h = 0;
-    int64_t w = 0;
-
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-    math::SetConstant<DeviceContext, T> set_zero;
-    if (d_x != nullptr) {
-      d_x->mutable_data<T>(context.GetPlace());
-      set_zero(context.template device_context<DeviceContext>(), d_x,
-               static_cast<T>(0));
-      auto* y = context.Input<Tensor>("Y");
-      auto y_dims = y->dims();
-      h = y_dims[2];
-      w = y_dims[3];
-
-      math::MatDescriptor d_out_mat_desc;
-      d_out_mat_desc.height_ = x_channel;
-      d_out_mat_desc.width_ = y_channel;
-      d_out_mat_desc.batch_size_ = batch_size;
-      d_out_mat_desc.stride_ = x_channel * y_channel;
-
-      math::MatDescriptor y_mat_desc;
-      y_mat_desc.height_ = y_channel;
-      y_mat_desc.width_ = h * w;
-      y_mat_desc.batch_size_ = batch_size;
-      y_mat_desc.stride_ = y_channel * h * w;
-
-      blas.MatMul(*d_out, d_out_mat_desc, *y, y_mat_desc,
-                  static_cast<T>(1.0 / (h * w)), d_x, static_cast<T>(0.0));
-    }
-
-    if (d_y != nullptr) {
-      d_y->mutable_data<T>(context.GetPlace());
-      set_zero(context.template device_context<DeviceContext>(), d_y,
-               static_cast<T>(0));
-      auto* x = context.Input<Tensor>("X");
-      auto x_dims = x->dims();
-      h = x_dims[2];
-      w = x_dims[3];
-
-      math::MatDescriptor d_out_mat_desc;
-      d_out_mat_desc.height_ = y_channel;
-      d_out_mat_desc.width_ = x_channel;
-      d_out_mat_desc.batch_size_ = batch_size;
-      d_out_mat_desc.stride_ = x_channel * y_channel;
-      d_out_mat_desc.trans_ = true;
-
-      math::MatDescriptor x_mat_desc;
-      x_mat_desc.height_ = x_channel;
-      x_mat_desc.width_ = h * w;
-      x_mat_desc.batch_size_ = batch_size;
-      x_mat_desc.stride_ = x_channel * h * w;
-
-      blas.MatMul(*d_out, d_out_mat_desc, *x, x_mat_desc,
-                  static_cast<T>(1.0 / (h * w)), d_y, static_cast<T>(0.0));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
deleted file mode 100644
index a31531c599a71e7da0697825a12ab86f5d809a51..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-include(operators)
-register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op fused_fc_elementwise_layernorm_op)
-if (WITH_GPU)
-  op_library(fusion_transpose_flatten_concat_op)
-  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
-  if (NOT ${CUDNN_VERSION} VERSION_LESS 7100)
-      op_library(fusion_conv_inception_op)
-      file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n")
-  endif()
-  op_library(fused_fc_elementwise_layernorm_op)
-  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_fc_elementwise_layernorm);\n")
-endif()
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
deleted file mode 100644
index 1cd6c40aa0540f5e5c9ea4b3e3e771dcc827eccf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
+++ /dev/null
@@ -1,357 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
-#include <memory>
-#include <unordered_set>
-
-namespace paddle {
-namespace operators {
-
-bool IsUnaryCompound(const std::vector<std::string> &functor_list) {
-  PADDLE_ENFORCE_EQ(functor_list.size(), 2);
-  static std::unordered_set<std::string> binary_fun = {
-      "elementwise_add", "elementwise_mul", "elementwise_add_grad",
-      "elementwise_mul_grad"};
-  return binary_fun.count(functor_list[1]) != 0;
-}
-
-bool HasInPlaceUnary(const std::vector<std::string> &functor_list) {
-  PADDLE_ENFORCE_EQ(functor_list.size(), 2);
-  static std::unordered_set<std::string> InplaceOpSet = {"relu", "relu_grad"};
-  bool is_in_place = false;
-  for (auto &func_name : functor_list) {
-    is_in_place |= (InplaceOpSet.count(func_name) == 1);
-  }
-  return is_in_place;
-}
-
-bool InputXCanBeAbsent(const std::vector<std::string> &functor_list) {
-  PADDLE_ENFORCE_EQ(functor_list.size(), 2);
-  static std::unordered_set<std::string> binary_fun = {"elementwise_add_grad"};
-  return binary_fun.count(functor_list[0]) != 0 ||
-         binary_fun.count(functor_list[1]) != 0;
-}
-
-/*
- * Whether the compound function is supported.
- * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final
- * out.
- */
-static bool IsSupportedCompound(const std::vector<std::string> &functors) {
-  PADDLE_ENFORCE_EQ(functors.size(), 2UL);
-
-  static std::unordered_set<std::string> unary_fun = {"scale", "relu", "tanh",
-                                                      "sigmoid"};
-  static std::unordered_set<std::string> binary_fun = {"elementwise_add",
-                                                       "elementwise_mul"};
-
-  std::string unary_fun_str;
-  if (binary_fun.count(functors[0])) {
-    unary_fun_str = functors[1];
-  } else if (binary_fun.count(functors[1])) {
-    unary_fun_str = functors[0];
-  } else {
-    PADDLE_THROW("%s and %s are not included in fused_list.", functors[0],
-                 functors[1]);
-  }
-  PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1,
-                    "%s is not included in fused_list.", unary_fun_str);
-  return true;
-}
-
-class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("X"),
-        "Input(X) of FusedElemwiseActivationOp op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Y"),
-        "Input(Y) of FusedElemwiseActivationOp op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(Out) of FusedElemwiseActivationOp op should not be null.");
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto y_dim = ctx->GetInputDim("Y");
-
-    // Whether the shape of Y is a continuous subsequence of X,
-    // For more information please refer to the op's introduction.
-    bool bcast_y = IsBcastY(x_dim, y_dim);
-
-    auto &out_dim = bcast_y ? x_dim : y_dim;
-    std::string out_lod = bcast_y ? "X" : "Y";
-
-    if (ctx->Attrs().Get<bool>("save_intermediate_out")) {
-      PADDLE_ENFORCE(ctx->HasOutput("IntermediateOut"),
-                     "Output(IntermediateOut) of FusedElemwiseActivationOp "
-                     "should not be null.");
-
-      if (IsUnaryCompound(
-              ctx->Attrs().Get<std::vector<std::string>>("functor_list"))) {
-        // for Unary(Binary(X, Y)), the shape and lod of out and
-        // intermediate_out are the same.
-        ctx->SetOutputDim("IntermediateOut", out_dim);
-        // set the lod of intermediate_out
-        ctx->ShareLoD(out_lod, /*->*/ "IntermediateOut");
-      } else {
-        // for Binary(X, Unary(Y)), the shape and lod of Y and
-        // intermediate_out are the same.
-        ctx->SetOutputDim("IntermediateOut", y_dim);
-        // set the lod of intermediate_out
-        ctx->ShareLoD("Y", /*->*/ "IntermediateOut");
-      }
-    }
-    ctx->SetOutputDim("Out", out_dim);
-    ctx->ShareLoD(out_lod, /*->*/ "Out");
-  }
-
-  static bool IsBcastY(const framework::DDim &x_dim,
-                       const framework::DDim &y_dim) {
-    bool bcast_y = x_dim.size() >= y_dim.size();
-    if (x_dim.size() == y_dim.size()) {
-      for (int i = 0; i < x_dim.size(); ++i) {
-        if (x_dim[i] < y_dim[i]) {
-          bcast_y = false;
-          break;
-        }
-      }
-    }
-    return bcast_y;
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx.Input<framework::Tensor>("X")->type(),
-                      ctx.Input<framework::Tensor>("Y")->type(),
-                      "The element's type of input should be the same.");
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor of fused_elemwise_activation operator.");
-    AddInput(
-        "Y",
-        "(Tensor) The input tensor of fused_elemwise_activation operator.");
-    AddOutput("Out",
-              "vector<Tensor> The output tensor of fused_elemwise_activation "
-              "operator.");
-    AddOutput("IntermediateOut",
-              "Tensor The IntermediateOut tensor of fused_elemwise_activation "
-              "operator.")
-        .AsIntermediate();
-    AddAttr<int>("axis",
-                 "axis is used by elementwise_op, the default value is -1.")
-        .SetDefault(-1);
-    AddAttr<float>("scale",
-                   "scale is used by scale_op, the default value is 0.0.")
-        .SetDefault(0.0);
-    AddAttr<bool>("save_intermediate_out",
-                  "Whether to save the intermediate_out.")
-        .SetDefault(false);
-    AddAttr<std::vector<std::string>>("functor_list",
-                                      "The functors that should be fused.")
-        .AddCustomChecker([&](const std::vector<std::string> &functor_list) {
-          PADDLE_ENFORCE(IsSupportedCompound(functor_list));
-        });
-
-    AddComment(R"DOC(
-FusedElemwiseActivation Operator.
-
-At present, FusedElemwiseActivation only supports Two kinds of compound
-operators (elementwise_op and activation_op):
-
-    Z = Binary(X, Unary(Y))
-    Z = Unary(Binary(X, Y))
-
-There are two cases for this operator:
-
-1. The shape of $Y$ and $X$ is the same.
-2. The shape of $Y$ is a continuous subsequence of $X$ or the shape of $X$ is a continuous subsequence of $Y$.
-
-For case 2 (assume that the shape of $Y$ is a continuous subsequence of $X$ ):
-
-1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index
-   for broadcasting $Y$ onto $X$.
-2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
-3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of
-   subsequence, such as shape(Y) = (2, 1) => (2).
-
-For example:
-
-  .. code-block:: python
-
-    shape(X) = (2, 3, 4, 5), shape(Y) = (,)
-    shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2
-    shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
-    shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
-    shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-
-
-The inputs $X$ and $Y$ can carry the different LoD information.
-But the output only shares the LoD information with the one whose shape is the same with Out.
-The attributions of activation_op can be get from fused_elemwise_activation_op's.
-The functor_list records the functions to be fused, for example
-["scale", "elementwise_add"].
-
-)DOC");
-  }
-};
-
-class FusedElemwiseActivationGradMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType(this->ForwardOpType() + "_grad");
-
-    for (auto &input_param : this->InputNames()) {
-      grad_op->SetInput(input_param, this->Input(input_param));
-      grad_op->SetOutput(framework::GradVarName(input_param),
-                         this->InputGrad(input_param, true));
-    }
-
-    grad_op->SetInput("Out", this->Output("Out"));
-    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-
-    grad_op->SetAttrMap(this->Attrs());
-
-    std::vector<std::string> functor_names =
-        boost::get<std::vector<std::string>>(grad_op->GetAttr("functor_list"));
-
-    functor_names[0] += "_grad";
-    functor_names[1] += "_grad";
-    grad_op->SetAttr("functor_list", functor_names);
-
-    if (boost::get<bool>(grad_op->GetAttr("save_intermediate_out"))) {
-      PADDLE_ENFORCE_NE(Output("IntermediateOut").size(), 0);
-      grad_op->SetInput("IntermediateOut", this->Output("IntermediateOut"));
-      grad_op->SetOutput(framework::GradVarName("IntermediateOut"),
-                         this->OutputGrad("IntermediateOut"));
-    } else {
-      grad_op->SetInput("IntermediateOut", {});
-      grad_op->SetOutput(framework::GradVarName("IntermediateOut"), {});
-    }
-
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@Grad) should not be null");
-
-    auto functor_list =
-        ctx->Attrs().Get<std::vector<std::string>>("functor_list");
-
-    if (ctx->Attrs().Get<bool>("save_intermediate_out")) {
-      PADDLE_ENFORCE(ctx->HasInput("IntermediateOut"),
-                     "Input(IntermediateOut) should not be null");
-    } else {
-      if (!InputXCanBeAbsent(functor_list)) {
-        PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-      }
-    }
-
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    auto inter_grad_name = framework::GradVarName("IntermediateOut");
-
-    if (ctx->HasOutput(x_grad_name)) {
-      if (ctx->HasInputs("X")) {
-        ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
-        ctx->ShareLoD("X", x_grad_name);
-      } else {
-        // Currently, only when Binary is elementwise_add or elementwise_sub,
-        // the "X" could be absent.
-        PADDLE_ENFORCE(InputXCanBeAbsent(functor_list),
-                       "Only when BinaryFunctor is elementwise_add, the 'X' "
-                       "could be absent.");
-
-        // Node: If "X" is absence, the shape of Y should be a continuous
-        // subsequence of X, otherwise, we could not infer the shape of dx.
-
-        ctx->SetOutputDim(x_grad_name,
-                          ctx->GetInputDim(framework::GradVarName("Out")));
-        ctx->ShareLoD(framework::GradVarName("Out"), x_grad_name);
-      }
-    }
-
-    if (ctx->HasOutput(y_grad_name)) {
-      PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-      ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("Y"));
-      ctx->ShareLoD("Y", y_grad_name);
-    }
-
-    if (ctx->HasOutput(inter_grad_name)) {
-      // For Unary(Binary(X, Y)), IntermediateOut should not be empty.
-      if (IsUnaryCompound(functor_list)) {
-        ctx->SetOutputDim(inter_grad_name,
-                          ctx->GetInputDim(framework::GradVarName("Out")));
-        ctx->ShareLoD(framework::GradVarName("Out"), inter_grad_name);
-      } else {
-        ctx->SetOutputDim(inter_grad_name, ctx->GetInputDim("Y"));
-        ctx->ShareLoD("Y", inter_grad_name);
-      }
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("Y")->type(),
-                                   ctx.GetPlace());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fused_elemwise_activation, ops::FusedElemwiseActivationOp,
-                  ops::FusedElemwiseActivationMaker,
-                  ops::FusedElemwiseActivationGradMaker);
-REGISTER_OPERATOR(fused_elemwise_activation_grad,
-                  ops::FusedElemwiseActivationOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    fused_elemwise_activation,
-    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
-                                       float>,
-    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
-                                       double>);
-
-REGISTER_OP_CPU_KERNEL(
-    fused_elemwise_activation_grad,
-    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
-                                           float>,
-    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
-                                           double>);
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu
deleted file mode 100644
index e10693bae1859307c9cf266965d4ce20e6de1bf9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    fused_elemwise_activation,
-    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
-                                       float>,
-    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
-                                       double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    fused_elemwise_activation_grad,
-    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
-                                           float>,
-    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
-                                           double>);
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
deleted file mode 100644
index 7cb753211eab328680ed78c9f3aa5409f487dc41..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
+++ /dev/null
@@ -1,477 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/compound_functors.h"
-#include "paddle/fluid/operators/math/functors.h"
-
-namespace paddle {
-namespace operators {
-
-/**
- * Whether the compound function is Unary(Binary(X, Y)).
- * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final
- * out.
- */
-bool IsUnaryCompound(const std::vector<std::string> &functor_list);
-
-/**
- *  For the in-place unary functor, the inputs of op_desc only have Out and
- *  Out@Grad.
- */
-bool HasInPlaceUnary(const std::vector<std::string> &functor_list);
-
-/**
- * Whether the Input(X) could be absent.
- */
-bool InputXCanBeAbsent(const std::vector<std::string> &functor_list);
-
-template <typename DeviceContext, typename T, typename BinaryFunctor,
-          typename UnaryFunctor>
-static void RunBinaryCompoundFunctor(
-    const framework::ExecutionContext &ctx, const BinaryFunctor &binary_functor,
-    const UnaryFunctor &unary_functor, const framework::Tensor &in_x,
-    const framework::Tensor &in_y, std::vector<framework::Tensor *> *outputs) {
-  // Z = Binary(X, Unary(Y))
-  // intermediate_out = Unary(Y)
-  // out = Binary(X, Unary(Y))
-  // In this case, the shape of intermediate_out and out are different.
-  paddle::operators::math::BinaryCompoundFunctor<T, BinaryFunctor, UnaryFunctor>
-      compound_func(binary_functor, unary_functor);
-  int axis = ctx.Attr<int>("axis");
-  if (ctx.Attr<bool>("save_intermediate_out")) {
-    FusedElemwiseAndActComputeEx<DeviceContext, T,
-                                 paddle::operators::math::BinaryCompoundFunctor<
-                                     T, BinaryFunctor, UnaryFunctor>,
-                                 true /*KeepIntermediateValue*/,
-                                 false /*SameShapeOfIntermediateOutAndOut*/>(
-        ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]);
-  } else {
-    FusedElemwiseAndActComputeEx<DeviceContext, T,
-                                 paddle::operators::math::BinaryCompoundFunctor<
-                                     T, BinaryFunctor, UnaryFunctor>,
-                                 false /*KeepIntermediateValue*/,
-                                 false /*SameShapeOfIntermediateOutAndOut*/>(
-        ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]);
-  }
-}
-
-template <typename DeviceContext, typename T, typename UnaryFunctor,
-          typename BinaryFunctor>
-static void RunUnaryCompoundFunctors(
-    const framework::ExecutionContext &ctx, const UnaryFunctor &unary_functor,
-    const BinaryFunctor &binary_functor, const framework::Tensor &in_x,
-    const framework::Tensor &in_y, std::vector<framework::Tensor *> *outputs) {
-  // Z = Unary(Binary(X, Y))
-  // intermediate_out = Binary(X, Y)
-  // out = Unary(Binary(X, Y))
-  // In this case, the shape of intermediate_out and out are the same.
-  int axis = ctx.Attr<int>("axis");
-
-  paddle::operators::math::UnaryCompoundFunctor<T, UnaryFunctor, BinaryFunctor>
-      compound_func(unary_functor, binary_functor);
-
-  if (ctx.Attr<bool>("save_intermediate_out")) {
-    FusedElemwiseAndActComputeEx<DeviceContext, T,
-                                 paddle::operators::math::UnaryCompoundFunctor<
-                                     T, UnaryFunctor, BinaryFunctor>,
-                                 true /*KeepIntermediateValue*/,
-                                 true /*SameShapeOfIntermediateOutAndOut*/>(
-        ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]);
-  } else {
-    FusedElemwiseAndActComputeEx<DeviceContext, T,
-                                 paddle::operators::math::UnaryCompoundFunctor<
-                                     T, UnaryFunctor, BinaryFunctor>,
-                                 false /*KeepIntermediateValue*/,
-                                 true /*SameShapeOfIntermediateOutAndOut*/>(
-        ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]);
-  }
-}
-
-template <typename DeviceContext, typename T, typename BinaryGradFunctor,
-          typename UnaryFunctor, typename UnaryGradFunctor, bool InPlace>
-static void RunBinaryCompoundGradFunctors(
-    const framework::ExecutionContext &ctx,
-    const BinaryGradFunctor &binary_grad_functor,
-    const UnaryFunctor &unary_functor,
-    const UnaryGradFunctor &unary_grad_functor, const framework::Tensor *in_x,
-    const framework::Tensor *in_y, const framework::Tensor *in_out,
-    const framework::Tensor *in_intermediate_out,
-    const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
-    framework::Tensor *y_grad, framework::Tensor *d_intermediate_out) {
-  // Z = Binary(X, Unary(Y))
-  int axis = ctx.Attr<int>("axis");
-
-  using BinaryCompoundDxFunctor =
-      paddle::operators::math::BinaryCompoundGradDxFunctor<T, BinaryGradFunctor,
-                                                           UnaryFunctor>;
-  using BinaryCompoundDyFunctor =
-      paddle::operators::math::BinaryCompoundGradDyFunctor<
-          T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor, InPlace>;
-  using BinaryCompoundDIntermedaiteOutFunctor =
-      paddle::operators::math::BinaryCompoundGradDIntermedaiteOutFunctor<
-          T, BinaryGradFunctor, UnaryFunctor>;
-
-  if (in_intermediate_out) {
-    FusedElemwiseAndActGradComputeEx<
-        DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor,
-        BinaryCompoundDIntermedaiteOutFunctor, true /*UseIntermediateOut*/,
-        false /*SameShapeOfIntermediateOutAndOut*/>(
-        ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad,
-        y_grad, d_intermediate_out,
-        BinaryCompoundDxFunctor(binary_grad_functor, unary_functor),
-        BinaryCompoundDyFunctor(binary_grad_functor, unary_functor,
-                                unary_grad_functor),
-        BinaryCompoundDIntermedaiteOutFunctor(binary_grad_functor,
-                                              unary_functor));
-  } else {
-    FusedElemwiseAndActGradComputeEx<
-        DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor,
-        BinaryCompoundDIntermedaiteOutFunctor, false /*UseIntermediateOut*/,
-        false /*SameShapeOfIntermediateOutAndOut*/>(
-        ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad,
-        y_grad, d_intermediate_out,
-        BinaryCompoundDxFunctor(binary_grad_functor, unary_functor),
-        BinaryCompoundDyFunctor(binary_grad_functor, unary_functor,
-                                unary_grad_functor),
-        BinaryCompoundDIntermedaiteOutFunctor(binary_grad_functor,
-                                              unary_functor));
-  }
-}
-
-template <typename DeviceContext, typename T, typename UnaryGradFunctor,
-          typename BinaryFunctor, typename BinaryGradFunctor, bool InPlace>
-static void RunUnaryCompoundGradFunctors(
-    const framework::ExecutionContext &ctx,
-    const UnaryGradFunctor &unary_grad_functor,
-    const BinaryFunctor &binary_functor,
-    const BinaryGradFunctor &binary_grad_functor, const framework::Tensor *in_x,
-    const framework::Tensor *in_y, const framework::Tensor *in_out,
-    const framework::Tensor *in_intermediate_out,
-    const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
-    framework::Tensor *y_grad, framework::Tensor *d_intermediate_out) {
-  // Z = Unary(Binary(X, Y))
-  int axis = ctx.Attr<int>("axis");
-
-  using UnaryCompoundDxFunctor =
-      paddle::operators::math::UnaryCompoundGradDxFunctor<
-          T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>;
-  using UnaryCompoundDyFunctor =
-      paddle::operators::math::UnaryCompoundGradDyFunctor<
-          T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>;
-  using UnaryCompoundDIntermediateFunctor =
-      paddle::operators::math::UnaryCompoundGradDIntermediateFunctor<
-          T, UnaryGradFunctor, BinaryFunctor, InPlace>;
-
-  if (in_intermediate_out) {
-    FusedElemwiseAndActGradComputeEx<
-        DeviceContext, T, UnaryCompoundDxFunctor, UnaryCompoundDyFunctor,
-        UnaryCompoundDIntermediateFunctor, true /*UseIntermediateOut*/,
-        true /*SameShapeOfIntermediateOutAndOut*/>(
-        ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad,
-        y_grad, d_intermediate_out,
-        UnaryCompoundDxFunctor(unary_grad_functor, binary_functor,
-                               binary_grad_functor),
-        UnaryCompoundDyFunctor(unary_grad_functor, binary_functor,
-                               binary_grad_functor),
-        UnaryCompoundDIntermediateFunctor(unary_grad_functor, binary_functor));
-  } else {
-    FusedElemwiseAndActGradComputeEx<
-        DeviceContext, T, UnaryCompoundDxFunctor, UnaryCompoundDyFunctor,
-        UnaryCompoundDIntermediateFunctor, false /*UseIntermediateOut*/,
-        true /*SameShapeOfIntermediateOutAndOut*/>(
-        ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad,
-        y_grad, d_intermediate_out,
-        UnaryCompoundDxFunctor(unary_grad_functor, binary_functor,
-                               binary_grad_functor),
-        UnaryCompoundDyFunctor(unary_grad_functor, binary_functor,
-                               binary_grad_functor),
-        UnaryCompoundDIntermediateFunctor(unary_grad_functor, binary_functor));
-  }
-}
-
-template <typename DeviceContext, typename T>
-static void RunFunctors(const framework::ExecutionContext &ctx,
-                        const framework::Tensor &in_x,
-                        const framework::Tensor &in_y,
-                        std::vector<framework::Tensor *> *outputs) {
-  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
-
-  // TODO(zcd): The following code can be refined.
-  auto funcs_str = functors[0] + "," + functors[1];
-  if (funcs_str == "elementwise_add,scale") {
-    // Z = Binary(X, Unary(Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
-    RunBinaryCompoundFunctor<DeviceContext, T,
-                             paddle::operators::math::AddFunctor<T>,
-                             paddle::operators::math::ScaleFunctor<T>>(
-        ctx, paddle::operators::math::AddFunctor<T>(),
-        paddle::operators::math::ScaleFunctor<T>(scale), in_x, in_y, outputs);
-  } else if (funcs_str == "scale,elementwise_add") {
-    // Z = Unary(Binary(X, Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
-    RunUnaryCompoundFunctors<DeviceContext, T,
-                             paddle::operators::math::ScaleFunctor<T>,
-                             paddle::operators::math::AddFunctor<T>>(
-        ctx, paddle::operators::math::ScaleFunctor<T>(scale),
-        paddle::operators::math::AddFunctor<T>(), in_x, in_y, outputs);
-  } else if (funcs_str == "elementwise_add,relu") {
-    // Z = Binary(X, Unary(Y))
-    RunBinaryCompoundFunctor<DeviceContext, T,
-                             paddle::operators::math::AddFunctor<T>,
-                             paddle::operators::math::ReluFunctor<T>>(
-        ctx, paddle::operators::math::AddFunctor<T>(),
-        paddle::operators::math::ReluFunctor<T>(), in_x, in_y, outputs);
-  } else if (funcs_str == "relu,elementwise_add") {
-    // Z = Unary(Binary(X, Y))
-    RunUnaryCompoundFunctors<DeviceContext, T,
-                             paddle::operators::math::ReluFunctor<T>,
-                             paddle::operators::math::AddFunctor<T>>(
-        ctx, paddle::operators::math::ReluFunctor<T>(),
-        paddle::operators::math::AddFunctor<T>(), in_x, in_y, outputs);
-  } else if (funcs_str == "elementwise_mul,scale") {
-    // Z = Binary(X, Unary(Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
-    RunBinaryCompoundFunctor<DeviceContext, T,
-                             paddle::operators::math::MulFunctor<T>,
-                             paddle::operators::math::ScaleFunctor<T>>(
-        ctx, paddle::operators::math::MulFunctor<T>(),
-        paddle::operators::math::ScaleFunctor<T>(scale), in_x, in_y, outputs);
-  } else if (funcs_str == "tanh,elementwise_add") {
-    // Z = Unary(Binary(X, Y))
-    RunUnaryCompoundFunctors<DeviceContext, T,
-                             paddle::operators::math::TanhFunctor<T>,
-                             paddle::operators::math::AddFunctor<T>>(
-        ctx, paddle::operators::math::TanhFunctor<T>(),
-        paddle::operators::math::AddFunctor<T>(), in_x, in_y, outputs);
-  } else if (funcs_str == "elementwise_mul,tanh") {
-    // Z = Binary(X, Unary(Y))
-    RunBinaryCompoundFunctor<DeviceContext, T,
-                             paddle::operators::math::MulFunctor<T>,
-                             paddle::operators::math::TanhFunctor<T>>(
-        ctx, paddle::operators::math::MulFunctor<T>(),
-        paddle::operators::math::TanhFunctor<T>(), in_x, in_y, outputs);
-  } else if (funcs_str == "elementwise_mul,sigmoid") {
-    // Z = Binary(X, Unary(Y))
-    RunBinaryCompoundFunctor<DeviceContext, T,
-                             paddle::operators::math::MulFunctor<T>,
-                             paddle::operators::math::SigmoidFunctor<T>>(
-        ctx, paddle::operators::math::MulFunctor<T>(),
-        paddle::operators::math::SigmoidFunctor<T>(), in_x, in_y, outputs);
-  } else {
-    PADDLE_THROW("%s has not been implemented.", funcs_str);
-  }
-}
-
-template <typename DeviceContext, typename T, bool InPlace>
-static void RunGradFunctors(
-    const framework::ExecutionContext &ctx, const framework::Tensor *in_x,
-    const framework::Tensor *in_y, const framework::Tensor *in_out,
-    const framework::Tensor *in_intermediate_out,
-    const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
-    framework::Tensor *y_grad, framework::Tensor *d_intermediate_out) {
-  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
-  auto funcs_str = functors[0] + "," + functors[1];
-
-  if (funcs_str == "elementwise_add_grad,scale_grad") {
-    // The backward of Z = Binary(X, Unary(Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
-    RunBinaryCompoundGradFunctors<
-        DeviceContext, T, paddle::operators::math::AddGradFunctor<T>,
-        paddle::operators::math::ScaleFunctor<T>,
-        paddle::operators::math::ScaleGradFunctor<T>, InPlace>(
-        ctx, paddle::operators::math::AddGradFunctor<T>(),
-        paddle::operators::math::ScaleFunctor<T>(scale),
-        paddle::operators::math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out,
-        in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
-  } else if (funcs_str == "scale_grad,elementwise_add_grad") {
-    // The backward of Z = Unary(Binary(X, Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
-    RunUnaryCompoundGradFunctors<
-        DeviceContext, T, paddle::operators::math::ScaleGradFunctor<T>,
-        paddle::operators::math::AddFunctor<T>,
-        paddle::operators::math::AddGradFunctor<T>, InPlace>(
-        ctx, paddle::operators::math::ScaleGradFunctor<T>(scale),
-        paddle::operators::math::AddFunctor<T>(),
-        paddle::operators::math::AddGradFunctor<T>(), in_x, in_y, in_out,
-        in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
-  } else if (funcs_str == "elementwise_add_grad,relu_grad") {
-    // The backward of Z = Binary(X, Unary(Y))
-    RunBinaryCompoundGradFunctors<
-        DeviceContext, T, paddle::operators::math::AddGradFunctor<T>,
-        paddle::operators::math::ReluFunctor<T>,
-        paddle::operators::math::ReluGradFunctor<T>, InPlace>(
-        ctx, paddle::operators::math::AddGradFunctor<T>(),
-        paddle::operators::math::ReluFunctor<T>(),
-        paddle::operators::math::ReluGradFunctor<T>(), in_x, in_y, in_out,
-        in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
-  } else if (funcs_str == "relu_grad,elementwise_add_grad") {
-    // The backward of Z = Unary(Binary(X, Y))
-    RunUnaryCompoundGradFunctors<
-        DeviceContext, T, paddle::operators::math::ReluGradFunctor<T>,
-        paddle::operators::math::AddFunctor<T>,
-        paddle::operators::math::AddGradFunctor<T>, InPlace>(
-        ctx, paddle::operators::math::ReluGradFunctor<T>(),
-        paddle::operators::math::AddFunctor<T>(),
-        paddle::operators::math::AddGradFunctor<T>(), in_x, in_y, in_out,
-        in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
-  } else if (funcs_str == "elementwise_mul_grad,scale_grad") {
-    // The backward of Z = Binary(X, Unary(Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
-    RunBinaryCompoundGradFunctors<
-        DeviceContext, T, paddle::operators::math::MulGradFunctor<T>,
-        paddle::operators::math::ScaleFunctor<T>,
-        paddle::operators::math::ScaleGradFunctor<T>, InPlace>(
-        ctx, paddle::operators::math::MulGradFunctor<T>(),
-        paddle::operators::math::ScaleFunctor<T>(scale),
-        paddle::operators::math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out,
-        in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
-  } else if (funcs_str == "tanh_grad,elementwise_add_grad") {
-    // The backward of Z = Unary(Binary(X, Y))
-    RunUnaryCompoundGradFunctors<
-        DeviceContext, T, paddle::operators::math::TanhGradFunctor<T>,
-        paddle::operators::math::AddFunctor<T>,
-        paddle::operators::math::AddGradFunctor<T>, InPlace>(
-        ctx, paddle::operators::math::TanhGradFunctor<T>(),
-        paddle::operators::math::AddFunctor<T>(),
-        paddle::operators::math::AddGradFunctor<T>(), in_x, in_y, in_out,
-        in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
-  } else if (funcs_str == "elementwise_mul_grad,tanh_grad") {
-    // The backward of Z = Binary(X, Unary(Y))
-    RunBinaryCompoundGradFunctors<
-        DeviceContext, T, paddle::operators::math::MulGradFunctor<T>,
-        paddle::operators::math::TanhFunctor<T>,
-        paddle::operators::math::TanhGradFunctor<T>, InPlace>(
-        ctx, paddle::operators::math::MulGradFunctor<T>(),
-        paddle::operators::math::TanhFunctor<T>(),
-        paddle::operators::math::TanhGradFunctor<T>(), in_x, in_y, in_out,
-        in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
-  } else if (funcs_str == "elementwise_mul_grad,sigmoid_grad") {
-    // The backward of Z = Binary(X, Unary(Y))
-    RunBinaryCompoundGradFunctors<
-        DeviceContext, T, paddle::operators::math::MulGradFunctor<T>,
-        paddle::operators::math::SigmoidFunctor<T>,
-        paddle::operators::math::SigmoidGradFunctor<T>, InPlace>(
-        ctx, paddle::operators::math::MulGradFunctor<T>(),
-        paddle::operators::math::SigmoidFunctor<T>(),
-        paddle::operators::math::SigmoidGradFunctor<T>(), in_x, in_y, in_out,
-        in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
-  } else {
-    PADDLE_THROW("%s has not been implemented.", funcs_str);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
-                             "Cannot get input tensor %s, variable name = %s",
-                             "X", ctx.op().Input("X"));
-    auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
-                             "Cannot get input tensor %s, variable name = %s",
-                             "Y", ctx.op().Input("Y"));
-    PADDLE_ENFORCE(ctx.HasOutput("Out"), "The output(Out) should not be empty");
-    auto output = ctx.Output<framework::Tensor>("Out");
-
-    std::vector<framework::Tensor *> outputs;
-    outputs.emplace_back(output);
-
-    if (ctx.Attr<bool>("save_intermediate_out")) {
-      PADDLE_ENFORCE(ctx.HasOutput("IntermediateOut"),
-                     "The save_intermediate_out is enable, so the "
-                     "IntermediateOut should not be empty.");
-      auto intermediate_out = ctx.Output<framework::Tensor>("IntermediateOut");
-      outputs.emplace_back(intermediate_out);
-    } else {
-      outputs.emplace_back(nullptr);
-    }
-
-    RunFunctors<DeviceContext, T>(ctx, in_x, in_y, &outputs);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto in_y = ctx.Input<framework::Tensor>("Y");
-    PADDLE_ENFORCE(in_y != nullptr, "Input(Y) should not be nullptr.");
-    auto in_out = ctx.Input<framework::Tensor>("Out");
-    PADDLE_ENFORCE(in_out != nullptr, "Input(Out) should not be nullptr.");
-    auto in_out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE(in_out_grad != nullptr,
-                   "Input(Out@Grad) should not be nullptr.");
-    framework::Tensor *in_x =
-        const_cast<framework::Tensor *>(ctx.Input<framework::Tensor>("X"));
-    framework::Tensor *x_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    framework::Tensor *y_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    framework::Tensor *d_intermediate_out = ctx.Output<framework::Tensor>(
-        framework::GradVarName("IntermediateOut"));
-
-    auto functor_list = ctx.Attr<std::vector<std::string>>("functor_list");
-
-    // Get intermediate_out
-    framework::Tensor *in_intermediate_out = nullptr;
-    if (ctx.Attr<bool>("save_intermediate_out")) {
-      // if save_intermediate_out is true, for Unary(Binary(x, y)) and
-      // Binary(x, Unary(y)), the Binary(x, y) and Unary(y) not need to
-      // recompute.
-      in_intermediate_out = const_cast<framework::Tensor *>(
-          ctx.Input<framework::Tensor>("IntermediateOut"));
-      PADDLE_ENFORCE(in_intermediate_out != nullptr,
-                     "The option of 'save_intermediate_out' is opened, "
-                     "so the number of 'Out' should be two.");
-    } else {
-      if (!InputXCanBeAbsent(functor_list)) {
-        PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be null.");
-      }
-    }
-
-    // Get in_x
-    if (ctx.HasInput("X")) {
-      PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be nullptr.");
-    } else {
-      // If functor_list contains elementwise_add, the backward doesn't use
-      // in_x, in_y and in_out.
-      PADDLE_ENFORCE(InputXCanBeAbsent(functor_list),
-                     "Only when the compoundfunctor contains "
-                     "elementwise_add_grad, the 'X' could be absent.");
-      in_x = const_cast<framework::Tensor *>(in_out_grad);
-    }
-
-    bool has_in_place = HasInPlaceUnary(functor_list);
-    if (has_in_place) {
-      RunGradFunctors<DeviceContext, T, true /*InPlace*/>(
-          ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad,
-          y_grad, d_intermediate_out);
-    } else {
-      RunGradFunctors<DeviceContext, T, false /*InPlace*/>(
-          ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad,
-          y_grad, d_intermediate_out);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
deleted file mode 100644
index 4c13d39406be3bb5ed6b6103032b7fe811078ca1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ /dev/null
@@ -1,595 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h"
-#include <string>
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-void FusedEmbeddingFCLSTMOp::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Embeddings"),
-                 "Assert only one Input(Embeddings) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
-                 "Assert only one Input(WeightH) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Assert only one Output(Hidden) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                 "Assert only one Output(Cell) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("Ids"),
-                 "Input(Ids) of LookupTableOp should not be null.");
-
-  auto table_dims = ctx->GetInputDim("Embeddings");
-  auto ids_dims = ctx->GetInputDim("Ids");
-  int ids_rank = ids_dims.size();
-
-  PADDLE_ENFORCE_EQ(table_dims.size(), 2);
-  PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
-                    "The last dimension of the 'Ids' tensor must be 1.");
-
-  auto x_dims = ctx->GetInputDim("Ids");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(Ids)'s rank must be 2.");
-
-  if (ctx->HasInput("H0")) {
-    PADDLE_ENFORCE(ctx->HasInput("C0"),
-                   "Input(Cell) and Input(Hidden) of LSTM should not "
-                   "be null at the same time.");
-    auto h_dims = ctx->GetInputDim("H0");
-    auto c_dims = ctx->GetInputDim("C0");
-    PADDLE_ENFORCE(h_dims == c_dims,
-                   "The dimension of Input(H0) and Input(C0) "
-                   "should be the same.");
-  }
-
-  auto embeddings_dims = ctx->GetInputDim("Embeddings");
-  PADDLE_ENFORCE_EQ(embeddings_dims.size(), 2,
-                    "The rank of Input(Embeddings) should be 2.");
-
-  auto wh_dims = ctx->GetInputDim("WeightH");
-  int frame_size = wh_dims[1] / 4;
-  PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
-                    "The rank of Input(WeightH) should be 2.");
-  PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
-                    "The first dimension of Input(WeightH) "
-                    "should be %d.",
-                    frame_size);
-  PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size,
-                    "The second dimension of Input(WeightH) "
-                    "should be 4 * %d.",
-                    frame_size);
-
-  auto b_dims = ctx->GetInputDim("Bias");
-  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
-  PADDLE_ENFORCE_EQ(b_dims[0], 1,
-                    "The first dimension of Input(Bias) should be 1.");
-  PADDLE_ENFORCE_EQ(
-      b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
-      "The second dimension of Input(Bias) should be "
-      "7 * %d if enable peepholes connection or"
-      "4 * %d if disable peepholes",
-      frame_size, frame_size);
-
-  framework::DDim out_dims({x_dims[0], frame_size});
-  ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("Cell", out_dims);
-  ctx->ShareLoD("Ids", "Hidden");
-  ctx->ShareLoD("Ids", "Cell");
-  if (!ctx->Attrs().Get<bool>("use_seq")) {
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                   "Assert only one Output(BatchedInput) of LSTM.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
-                   "Assert only one Output(BatchedHidden) of LSTM.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
-                   "Assert only one Output(BatchedCell) of LSTM.");
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                   "Assert only one Output(ReorderedH0) of LSTM");
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
-                   "Assert only one Output(ReorderedC0) of LSTM.");
-    ctx->SetOutputDim("BatchedInput", {x_dims[0], wh_dims[1]});
-    ctx->SetOutputDim("BatchedHidden", out_dims);
-    ctx->SetOutputDim("BatchedCell", out_dims);
-  }
-  ctx->SetOutputDim("XX", {x_dims[0], wh_dims[1]});
-  ctx->ShareLoD("Ids", "XX");
-}
-
-framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      ctx.Input<framework::LoDTensor>("Embeddings")->type(),
-      ctx.device_context());
-}
-
-void FusedEmbeddingFCLSTMOpMaker::Make() {
-  AddInput("Ids",
-           "An input with type int32 or int64 "
-           "contains the ids to be looked up in W. "
-           "The last dimension size must be 1.");
-  AddInput("Embeddings",
-           "(Tensor) the learnable weights of X."
-           " - The shape is (M x 4D), where M is the dim size of x, D is the "
-           "hidden size. "
-           " - Weight = {W_cx, W_ix, W_fx, W_ox}");
-  AddInput("WeightH",
-           "(Tensor) same as LSTMOp, the learnable hidden-hidden weights."
-           " - The shape is (D x 4D), where D is the hidden size. "
-           " - Weight = {W_ch, W_ih, W_fh, W_oh}");
-  AddInput("Bias",
-           "(Tensor) the learnable weights. Almost same as LSTMOp"
-           "Note: we should add the fc bias into this (1x4D) in bias."
-           "input-hidden bias weight and peephole connections weight if "
-           "setting `use_peepholes` True. "
-           "1. `use_peepholes = False` "
-           " - The shape is (1 x 4D). "
-           " - Bias = {b_c, b_i, b_f, b_o}."
-           "2. `use_peepholes = True` "
-           " - The shape is (1 x 7D). "
-           " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
-  AddInput("H0",
-           "(Tensor, optional) (same as LSTMOp) the initial hidden state is an "
-           "optional "
-           "input. This is a tensor with shape (N x D), where N is the "
-           "batch size and D is the hidden size.")
-      .AsDispensable();
-  AddInput("C0",
-           "(Tensor, optional) (same as LSTMOp) (the initial cell state is an "
-           "optional "
-           "input. This is a tensor with shape (N x D), where N is the "
-           "batch size. `H0` and `C0` can be NULL but only at the same time.")
-      .AsDispensable();
-  AddOutput("Hidden",
-            "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. "
-            "The shape is (T x D), and lod is the same with the `Input`.");
-  AddOutput("Cell",
-            "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. "
-            "The shape is (T x D), and lod is the same with the `Input`.");
-  AddOutput("XX",
-            "(LoDTensor) the result after X * WeightX (size is T x 4D)"
-            " or batched_X (size is T x M), this will be automatically chosen,"
-            " where T is the total time steps in this mini-batch,"
-            " D is the hidden size, M is the dim size of x input.")
-      .AsIntermediate();
-  AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate();
-  AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate();
-  AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
-  AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
-  AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
-  AddAttr<bool>("use_peepholes",
-                "(bool, default: True) "
-                "whether to enable diagonal/peephole connections.")
-      .SetDefault(true);
-  AddAttr<bool>("is_reverse",
-                "(bool, default: False) "
-                "whether to compute reversed LSTM.")
-      .SetDefault(false);
-  AddAttr<bool>("use_seq",
-                "(bool, default: True) "
-                "whether to use seq mode to compute.")
-      .SetDefault(true);
-  AddAttr<std::string>("gate_activation",
-                       "(string, default: sigmoid)"
-                       "The activation for input gate, forget gate and output "
-                       "gate, `sigmoid` by default.")
-      .SetDefault("sigmoid")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<std::string>("cell_activation",
-                       "(string, default: tanh)"
-                       "The activation for cell output, `tanh` by default.")
-      .SetDefault("tanh")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<std::string>("candidate_activation",
-                       "(string, default: tanh)"
-                       "The activation for candidate hidden state, "
-                       "`tanh` by default.")
-      .SetDefault("tanh")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddComment(R"DOC(
-Fusion Long-Short Term Memory (LSTM) Operator.
-This operator fuse the X into LSTM, more details can refer to LSTM op.
-)DOC");
-}
-
-template <typename T>
-class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
- public:
-#define INIT_VEC_FUNC                                                          \
-  std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
-  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");               \
-  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
-  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
-  if (platform::MayIUse(platform::avx)) {                                      \
-    math::VecActivations<T, platform::avx> act_functor;                        \
-    act_gate = act_functor(act_gate_str);                                      \
-    act_cell = act_functor(act_cell_str);                                      \
-    act_cand = act_functor(act_cand_str);                                      \
-  } else {                                                                     \
-    math::VecActivations<T, platform::isa_any> act_functor;                    \
-    act_gate = act_functor(act_gate_str);                                      \
-    act_cell = act_functor(act_cell_str);                                      \
-    act_cand = act_functor(act_cand_str);                                      \
-  }
-
-#define INIT_BASE_INPUT_OUTPUT                        \
-  auto* ids = ctx.Input<LoDTensor>("Ids");            \
-  auto* h0 = ctx.Input<Tensor>("H0");                 \
-  auto* c0 = ctx.Input<Tensor>("C0");                 \
-  auto* embeddings = ctx.Input<Tensor>("Embeddings"); \
-  auto* wh = ctx.Input<Tensor>("WeightH");            \
-  auto* bias = ctx.Input<Tensor>("Bias");             \
-  auto* xx = ctx.Output<LoDTensor>("XX");             \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
-  auto* cell_out = ctx.Output<LoDTensor>("Cell");     \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");     \
-  bool use_peepholes = ctx.Attr<bool>("use_peepholes");
-
-#define INIT_BASE_SIZES                                      \
-  auto ids_dims = ids->dims();                   /* T x M*/  \
-  auto ids_numel = framework::product(ids_dims); /* T x 1*/  \
-  auto wh_dims = wh->dims();                     /* D x 4D*/ \
-  const int D = wh_dims[0];                                  \
-  const int D2 = D * 2;                                      \
-  const int D3 = D * 3;                                      \
-  int64_t row_number = embeddings->dims()[0];                \
-  int64_t row_width = embeddings->dims()[1];                 \
-  const int D4 = wh_dims[1];
-
-#define INIT_BASE_INPUT_DATAS                                        \
-  const int64_t* ids_data = ids->data<int64_t>();                    \
-  const T* embeddings_data = embeddings->data<T>();                  \
-  const T* wh_data = wh->data<T>();                                  \
-  /* diagonal weight*/                                               \
-  const T* wc_data = bias->data<T>() + D4;                           \
-  /* for peephole only*/                                             \
-  Tensor checked_cell;                                               \
-  T* checked_cell_data = nullptr;                                    \
-  auto place = ctx.GetPlace();                                       \
-  if (use_peepholes) {                                               \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                 \
-    checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
-  }
-
-/// Compute LSTM
-#define GEMM_WH_ADDON(bs, prev, out)                                           \
-  blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
-            wh_data, D4, static_cast<T>(1), out, D4)
-
-// gates: W_ch, W_ih, W_fh, W_oh
-#define GET_Ct(ct_1, gates, ct)                   \
-  /* C_t = C_t-1 * fgated + cand_gated * igated*/ \
-  act_cand(D, gates, gates);                      \
-  blas.VMUL(D, gates, gates + D, gates + D);      \
-  blas.VMUL(D, ct_1, gates + D2, gates + D2);     \
-  blas.VADD(D, gates + D, gates + D2, ct)
-
-#define GET_Ht(ct, gates, ht)        \
-  /* H_t = act_cell(C_t) * ogated */ \
-  act_cell(D, ct, gates + D2);       \
-  blas.VMUL(D, gates + D2, gates + D3, ht)
-
-#define GET_Ct_NOH0C0(gates, ct)     \
-  /* C_t = igated * cgated*/         \
-  act_gate(D, gates + D, gates + D); \
-  act_cand(D, gates, gates);         \
-  blas.VMUL(D, gates, gates + D, ct)
-
-#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
-  GET_Ct_NOH0C0(gates, ct);                \
-  act_gate(D, gates + D3, gates + D3);     \
-  GET_Ht(ct, gates, ht)
-
-#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
-  GET_Ct_NOH0C0(gates, ct);                         \
-  /* get outgated, put W_oc * C_t on igated */      \
-  blas.VMUL(D, wc_data + D2, ct, gates + D);        \
-  blas.VADD(D, gates + D, gates + D3, gates + D3);  \
-  act_gate(D, gates + D3, gates + D3);              \
-  GET_Ht(ct, gates, ht)
-
-#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
-  act_gate(D3, gates + D, gates + D);     \
-  GET_Ct(ct_1, gates, ct);                \
-  GET_Ht(ct, gates, ht)
-
-#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht)        \
-  /* get fgated and igated*/                              \
-  blas.VMUL(D, wc_data, ct_1, checked_cell_data);         \
-  blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
-  blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
-  act_gate(D2, gates + D, gates + D);                     \
-  GET_Ct(ct_1, gates, ct);                                \
-  /* get ogated*/                                         \
-  blas.VMUL(D, wc_data + D2, ct, gates + D);              \
-  blas.VADD(D, gates + D, gates + D3, gates + D3);        \
-  act_gate(D, gates + D3, gates + D3);                    \
-  GET_Ht(ct, gates, ht)
-
-  void SeqCompute(const framework::ExecutionContext& ctx) const {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
-    INIT_VEC_FUNC
-    INIT_BASE_INPUT_DATAS
-
-    //  std::cout << "====> SeqCompute" << std::endl;
-    auto ids_lod = ids->lod();
-    const int total_T = ids_dims[0];
-    const int N = ids_lod[0].size() - 1;
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
-    const T* c0_data = c0 ? c0->data<T>() : nullptr;
-    T* xx_data = xx->mutable_data<T>(place);
-    T* h_out_data = hidden_out->mutable_data<T>(place);
-    T* c_out_data = cell_out->mutable_data<T>(place);
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-
-    for (int64_t i = 0; i < ids_numel; ++i) {
-      PADDLE_ENFORCE_LT(ids_data[i], row_number);
-      PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
-      memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
-             row_width * sizeof(T));
-    }
-
-    int xx_offset = D4;
-    int gate_offset = D;
-    if (is_reverse) {
-      const int offset = (total_T - 1) * D;
-      xx_data = xx_data + offset * 4;
-      h_out_data = h_out_data + offset;
-      c_out_data = c_out_data + offset;
-      xx_offset = -D4;
-      gate_offset = -D;
-    }
-
-#define MOVE_ONE_STEP                    \
-  prev_h_data = h_out_data;              \
-  prev_c_data = c_out_data;              \
-  xx_data = xx_data + xx_offset;         \
-  h_out_data = h_out_data + gate_offset; \
-  c_out_data = c_out_data + gate_offset
-
-#define PROCESS_H0C0_DEFINES                           \
-  int bid = is_reverse ? N - 1 - i : i;                \
-  int seq_len = ids_lod[0][bid + 1] - ids_lod[0][bid]; \
-  const T* prev_c_data = nullptr;                      \
-  const T* prev_h_data = nullptr;                      \
-  int tstart = 0
-
-#define PROCESS_H0C0_PEEPHOLE                                      \
-  PROCESS_H0C0_DEFINES;                                            \
-  if (h0_data) {                                                   \
-    prev_h_data = h0_data + bid * D;                               \
-    prev_c_data = c0_data + bid * D;                               \
-  } else {                                                         \
-    COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
-    MOVE_ONE_STEP;                                                 \
-    tstart = 1;                                                    \
-  }
-
-#define PROCESS_H0C0                                      \
-  PROCESS_H0C0_DEFINES;                                   \
-  if (h0_data) {                                          \
-    prev_h_data = h0_data + bid * D;                      \
-    prev_c_data = c0_data + bid * D;                      \
-  } else {                                                \
-    COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
-    MOVE_ONE_STEP;                                        \
-    tstart = 1;                                           \
-  }
-
-    if (use_peepholes) {
-      for (int i = 0; i < N; ++i) {
-        PROCESS_H0C0_PEEPHOLE
-        for (int step = tstart; step < seq_len; ++step) {
-          GEMM_WH_ADDON(1, prev_h_data, xx_data);
-          COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
-          MOVE_ONE_STEP;
-        }
-      }
-    } else {
-      for (int i = 0; i < N; ++i) {
-        PROCESS_H0C0
-        for (int step = tstart; step < seq_len; ++step) {
-          GEMM_WH_ADDON(1, prev_h_data, xx_data);
-          COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
-          MOVE_ONE_STEP;
-        }
-      }
-    }
-#undef PROCESS_H0C0_DEFINES
-#undef PROCESS_H0C0_PEEPHOLE
-#undef PROCESS_H0C0
-#undef MOVE_ONE_STEP
-  }
-
-  void BatchCompute(const framework::ExecutionContext& ctx) const {
-    using DeviceContext = platform::CPUDeviceContext;
-    INIT_BASE_INPUT_OUTPUT
-    if (ids->lod()[0].size() == 2) {
-      SeqCompute(ctx);
-      return;
-    }
-    INIT_BASE_SIZES
-    INIT_VEC_FUNC
-    INIT_BASE_INPUT_DATAS
-
-    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
-    auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
-    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
-    auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
-    auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
-    T* xx_data = xx->mutable_data<T>(place);
-    T* batched_input_data = batched_input->mutable_data<T>(place);
-    T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
-    T* batched_h_out_data = batched_h_out->mutable_data<T>(place);
-    hidden_out->mutable_data<T>(place);
-    cell_out->mutable_data<T>(place);
-
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-
-    for (int64_t i = 0; i < ids_numel; ++i) {
-      PADDLE_ENFORCE_LT(ids_data[i], row_number);
-      PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
-      memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
-             row_width * sizeof(T));
-    }
-
-    to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
-
-    auto batched_lod = batched_input->lod();
-    const auto& seq_order = batched_lod[2];
-    const int max_bs = seq_order.size();
-    reordered_h0->Resize({max_bs, D});
-    reordered_c0->Resize({max_bs, D});
-
-    int tstart = 0;
-    T* prev_h_data = nullptr;
-    T* prev_c_data = nullptr;
-    if (h0) {
-      // reorder h0, c0
-      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
-      T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
-      const T* h0_data = h0->data<T>();
-      const T* c0_data = c0->data<T>();
-      prev_h_data = reordered_h0_data;
-      prev_c_data = reordered_c0_data;
-      size_t sz = sizeof(T) * D;
-      for (int i = 0; i < max_bs; ++i) {
-        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
-        std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz);
-        reordered_h0_data += D;
-        reordered_c0_data += D;
-      }
-    } else {
-      // compute without h0, c0
-      T* cur_in_data = batched_input_data;
-      T* cur_h_out_data = batched_h_out_data;
-      T* cur_c_out_data = batched_c_out_data;
-      for (int i = 0; i < max_bs; ++i) {
-        GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
-        if (use_peepholes) {
-          blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
-          blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
-        }
-        act_gate(D, cur_in_data + D3, cur_in_data + D3);
-        GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
-        cur_in_data += D4;
-        cur_c_out_data += D;
-        cur_h_out_data += D;
-      }
-      tstart = 1;
-      prev_h_data = batched_h_out_data;
-      prev_c_data = batched_c_out_data;
-    }
-    const auto& batch_starts = batched_lod[0];
-    const int max_seq_len = batch_starts.size() - 1;
-    const int offset = tstart * max_bs * D;
-    batched_input_data = batched_input_data + offset * 4;
-    batched_h_out_data = batched_h_out_data + offset;
-    batched_c_out_data = batched_c_out_data + offset;
-
-#define DEFINE_CUR                        \
-  T* cur_in_data = batched_input_data;    \
-  T* cur_prev_c_data = prev_c_data;       \
-  T* cur_c_out_data = batched_c_out_data; \
-  T* cur_h_out_data = batched_h_out_data
-
-#define MOVE_ONE_BATCH  \
-  cur_in_data += D4;    \
-  cur_prev_c_data += D; \
-  cur_c_out_data += D;  \
-  cur_h_out_data += D
-
-#define MOVE_ONE_STEP                  \
-  prev_c_data = batched_c_out_data;    \
-  prev_h_data = batched_h_out_data;    \
-  batched_c_out_data = cur_c_out_data; \
-  batched_h_out_data = cur_h_out_data; \
-  batched_input_data = cur_in_data
-
-    if (use_peepholes) {
-      for (int step = tstart; step < max_seq_len; ++step) {
-        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-        DEFINE_CUR;
-        for (int i = 0; i < cur_bs; ++i) {
-          COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
-                                cur_h_out_data);
-          MOVE_ONE_BATCH;
-        }
-        MOVE_ONE_STEP;
-      }
-    } else {
-      for (int step = tstart; step < max_seq_len; ++step) {
-        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-        DEFINE_CUR;
-        for (int i = 0; i < cur_bs; ++i) {
-          COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
-                       cur_h_out_data);
-          MOVE_ONE_BATCH;
-        }
-        MOVE_ONE_STEP;
-      }
-    }
-#undef MOVE_ONE_STEP
-#undef MOVE_ONE_BATCH
-#undef DEFINE_CUR
-
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batched_h_out->set_lod(batched_lod);
-    to_seq(dev_ctx, *batched_h_out, hidden_out);
-    batched_c_out->set_lod(batched_lod);
-    to_seq(dev_ctx, *batched_c_out, cell_out);
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    if (ctx.Attr<bool>("use_seq")) {
-      SeqCompute(ctx);
-    } else {
-      BatchCompute(ctx);
-    }
-  }
-
-#undef COMPUTE_CtHt_PEEPHOLE
-#undef COMPUTE_CtHt
-#undef GET_Ct_NOH0C0
-#undef COMPUTE_CtHt_NOH0C0
-#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
-#undef GET_Ht
-#undef GET_Ct
-#undef GEMM_WH_ADDON
-#undef INIT_BASE_INPUT_DATAS
-#undef INIT_BASE_SIZES
-#undef INIT_BASE_INPUT_OUTPUT
-#undef INIT_VEC_FUNC
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fused_embedding_fc_lstm, ops::FusedEmbeddingFCLSTMOp,
-                  ops::FusedEmbeddingFCLSTMOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fused_embedding_fc_lstm,
-                       ops::FusedEmbeddingFCLSTMKernel<float>,
-                       ops::FusedEmbeddingFCLSTMKernel<double>);
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h
deleted file mode 100644
index 2775b2ac04d2890355fe6d75a1e2507a2668dc95..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class FusedEmbeddingFCLSTMOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusedEmbeddingFCLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
deleted file mode 100644
index 9110099013a20f2718038a31ec94c1e76583149b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-
-namespace paddle {
-namespace operators {
-
-class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input W of FusedEmbeddingSeqPoolOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Ids"),
-                   "Input Ids of FusedEmbeddingSeqPoolOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output of FusedEmbeddingSeqPoolOp should not be null.");
-
-    auto table_dims = ctx->GetInputDim("W");
-    auto ids_dims = ctx->GetInputDim("Ids");
-    const std::string& combiner = ctx->Attrs().Get<std::string>("combiner");
-
-    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
-    PADDLE_ENFORCE_GE(ids_dims.size(), 1,
-                      "The dim size of the 'Ids' tensor must greater than 1.");
-    PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1,
-                      "The last dimension of the 'Ids' tensor must be 1.");
-    // we only support sum now
-    PADDLE_ENFORCE_EQ(combiner, "sum");
-
-    int64_t last_dim = FusedEmbeddingSeqPoolLastDim(table_dims, ids_dims);
-    // in compile time, the lod level of ids must be 1
-    framework::VarDesc* ids_desc =
-        boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Ids")[0]);
-    PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1);
-
-    // in compile time, the shape from Ids -> output
-    // should be [-1, 1] -> [-1, embedding_size]
-    ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim}));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("W",
-             "(Tensor) The input represents embedding tensors, "
-             "which is a learnable parameter.");
-    AddInput("Ids",
-             "An input with type int32 or int64 "
-             "contains the ids to be looked up in W. "
-             "The last dimension size must be 1.");
-    AddOutput("Out", "The lookup results, which have the same type as W.");
-    AddAttr<std::string>("combiner",
-                         "(string, default sum) "
-                         "A string specifying the reduction op. Currently sum "
-                         "are supported, sum computes the weighted sum of the "
-                         "embedding results for each row.")
-        .SetDefault("sum");
-    AddAttr<int64_t>("padding_idx",
-                     "(int64, default -1) "
-                     "If the value is -1, it makes no effect to lookup. "
-                     "Otherwise the given value indicates padding the output "
-                     "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(kNoPadding);
-    // NOTE(minqiyang): grad_inplace is an temporal attribute,
-    // please do NOT set this attribute in python layer.
-    AddAttr<bool>("grad_inplace",
-                  "(boolean, default false) "
-                  "If the grad op reuse the input's variable.")
-        .SetDefault(false);
-    AddAttr<bool>("is_sparse",
-                  "(boolean, default false) "
-                  "Sparse update.")
-        .SetDefault(false);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
-                  "Skip calling InferShape() function in the runtime.")
-        .SetDefault(true);
-    AddComment(R"DOC(
-FusedEmbeddingSeqPool Operator.
-
-Computes embeddings for the given ids and weights.
-
-This operator is used to perform lookups on the parameter W,
-then computes the weighted sum of the lookups results for each row
-and concatenated into a dense tensor.
-
-The input Ids should carry the LoD (Level of Details) information.
-And the output will change the LoD information with input Ids.
-
-)DOC");
-  }
-};
-
-class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto table_dims = ctx->GetInputDim("W");
-    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class FusedEmbeddingSeqPoolOpGradVarTypeInference
-    : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
-    auto attr = ctx->GetAttr("is_sparse");
-    bool is_sparse = boost::get<bool>(attr);
-    if (is_sparse) {
-      VLOG(3) << "fused_embedding_seq_pool_grad op "
-              << framework::GradVarName("W") << " is set to SelectedRows";
-      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
-    } else {
-      VLOG(3) << "fused_embedding_seq_pool_grad op "
-              << framework::GradVarName("W") << " is set to LoDTensor";
-      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
-    }
-    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp,
-                  paddle::framework::DefaultGradOpDescMaker<true>,
-                  ops::FusedEmbeddingSeqPoolOpMaker);
-REGISTER_OPERATOR(fused_embedding_seq_pool_grad,
-                  ops::FusedEmbeddingSeqPoolOpGrad,
-                  ops::FusedEmbeddingSeqPoolOpGradVarTypeInference);
-
-REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool,
-                       ops::FusedEmbeddingSeqPoolKernel<float>,
-                       ops::FusedEmbeddingSeqPoolKernel<double>);
-REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad,
-                       ops::FusedEmbeddingSeqPoolGradKernel<float>,
-                       ops::FusedEmbeddingSeqPoolGradKernel<double>);
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
deleted file mode 100644
index 3fffdf7e020a7c0e931adc3250c28c784e667fdf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ /dev/null
@@ -1,284 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-
-constexpr int64_t kNoPadding = -1;
-
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__)
-template <typename T>
-void prepare_csr_data(const std::vector<uint64_t> &offset,
-                      const int64_t *ids_data, const size_t idx_width,
-                      T *csr_vals, int *csr_colmuns, int *csr_row_idx,
-                      int64_t padding_idx = kNoPadding) {
-  int val_idx = 0;
-  int row_idx = 0;
-  csr_row_idx[0] = 0;
-
-  std::map<int, int> ids_map;
-
-  // for each sequence in batch
-  for (size_t i = 0; i < offset.size() - 1; ++i) {
-    for (size_t idx = 0; idx < idx_width; ++idx) {
-      ids_map.clear();
-
-      // construct a map for creating csr
-      for (size_t j = offset[i]; j < offset[i + 1]; ++j) {
-        auto ids_value = ids_data[idx + j * idx_width];
-        if (ids_value != padding_idx) {
-          unsigned int word_idx = static_cast<unsigned int>(ids_value);
-          ++ids_map[word_idx];
-        }
-      }
-
-      VLOG(4) << "====sequence %d====" << i;
-      for (std::map<int, int>::const_iterator it = ids_map.begin();
-           it != ids_map.end(); ++it) {
-        VLOG(4) << it->first << " => " << it->second;
-        csr_vals[val_idx] = it->second;
-        csr_colmuns[val_idx] = it->first;
-        ++val_idx;
-      }
-      csr_row_idx[row_idx + 1] = csr_row_idx[row_idx] + ids_map.size();
-      ++row_idx;
-    }
-  }
-}
-#else
-template <typename T>
-struct EmbeddingVSumFunctor {
-  void operator()(const framework::ExecutionContext &context,
-                  const LoDTensor *table_t, const LoDTensor *ids_t,
-                  LoDTensor *output_t) {
-    auto *table = table_t->data<T>();
-    int64_t table_height = table_t->dims()[0];
-    int64_t table_width = table_t->dims()[1];
-    int64_t out_width = output_t->dims()[1];
-    const int64_t *ids = ids_t->data<int64_t>();
-    auto ids_lod = ids_t->lod()[0];
-    int64_t idx_width = ids_t->numel() / ids_lod.back();
-    auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-    PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
-    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, "The LoD[0] could NOT be empty");
-
-    jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
-                                  out_width, jit::SeqPoolType::kSum);
-    for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
-      attr.index_height = ids_lod[i + 1] - ids_lod[i];
-      auto emb_seqpool =
-          jit::KernelFuncs<jit::EmbSeqPoolTuple<T>, platform::CPUPlace>::Cache()
-              .At(attr);
-      emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width,
-                  &attr);
-    }
-  }
-};
-#endif
-
-inline int FusedEmbeddingSeqPoolLastDim(const framework::DDim &table_dims,
-                                        const framework::DDim &ids_dims) {
-  int64_t last_dim = table_dims[1];
-  for (int i = 1; i != ids_dims.size(); ++i) {
-    last_dim *= ids_dims[i];
-  }
-  return last_dim;
-}
-
-template <typename T>
-class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const LoDTensor *ids_t = context.Input<LoDTensor>("Ids");  // int tensor
-    LoDTensor *output_t = context.Output<LoDTensor>("Out");    // float tensor
-    const LoDTensor *table_var = context.Input<LoDTensor>("W");
-    const std::string &combiner_type = context.Attr<std::string>("combiner");
-
-    int64_t last_dim =
-        FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims());
-    const auto &ids_lod = ids_t->lod();
-    // in run time, the LoD of ids must be 1
-    PADDLE_ENFORCE_EQ(ids_lod.size(), 1UL,
-                      "The LoD level of Input(Ids) must be 1");
-    int64_t batch_size = ids_lod[0].size() - 1;
-    // in run time, the shape from Ids -> output
-    // should be [seq_length, 1] -> [batch_size, last_dim]
-    output_t->Resize({batch_size, last_dim});
-
-    if (combiner_type == "sum") {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__)
-      int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-      auto output = output_t->mutable_data<T>(context.GetPlace());
-      int64_t table_height = table_var->dims()[0];
-      int64_t table_width = table_var->dims()[1];
-      auto weights = table_var->data<T>();
-
-      const std::vector<uint64_t> offset = ids_lod[0];
-      auto len = ids_t->numel();
-      int idx_width = len / offset.back();
-
-      Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t;
-      csr_vals_t.Resize({len});
-      csr_colmuns_t.Resize({len});
-      csr_row_idx_t.Resize({(batch_size + 1) * idx_width});
-      auto csr_vals = csr_vals_t.mutable_data<T>(context.GetPlace());
-      auto csr_colmuns = csr_colmuns_t.mutable_data<int>(context.GetPlace());
-      auto csr_row_idx = csr_row_idx_t.mutable_data<int>(context.GetPlace());
-      prepare_csr_data<T>(offset, ids_t->data<int64_t>(), idx_width, csr_vals,
-                          csr_colmuns, csr_row_idx, padding_idx);
-
-      const char transa = 'N';
-      const T alpha = 1.0;
-      const T beta = 0.0;
-      const char matdescra[] = {'G', 'L', 'N', 'C'};
-
-      const int m = batch_size * idx_width;
-      const int n = table_width;
-      const int k = table_height;
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-      blas.CSRMM(&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals,
-                 (const int *)csr_colmuns, (const int *)csr_row_idx,
-                 (const int *)csr_row_idx + 1, weights, &n, &beta, output, &n);
-
-#else
-      EmbeddingVSumFunctor<T> functor;
-      functor(context, table_var, ids_t, output_t);
-#endif
-    }
-  }
-};
-
-template <typename T>
-class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *table_var = context.InputVar("W");
-    DDim table_dim;
-    if (table_var->IsType<LoDTensor>()) {
-      table_dim = context.Input<LoDTensor>("W")->dims();
-    } else if (table_var->IsType<SelectedRows>()) {
-      auto *table_t = context.Input<SelectedRows>("W");
-      table_dim = table_t->value().dims();
-    } else {
-      PADDLE_THROW(
-          "The parameter W of a LookupTable "
-          "must be either LoDTensor or SelectedRows");
-    }
-
-    bool is_sparse = context.Attr<bool>("is_sparse");
-    // Since paddings are not trainable and fixed in forward, the gradient of
-    // paddings makes no sense and we don't deal with it in backward.
-    if (is_sparse) {
-      auto *ids = context.Input<LoDTensor>("Ids");
-      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
-      // runtime shape
-      d_table->set_height(table_dim[0]);
-
-      auto *ids_data = ids->data<int64_t>();
-      int64_t ids_num = ids->numel();
-      auto lod = ids->lod()[0];
-      int64_t out_width = d_output->dims()[1];
-
-      framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
-      new_rows->resize(ids_num);
-      std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t));
-
-      auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_num, table_dim[1]});
-      T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
-      const T *d_output_data = d_output->data<T>();
-
-      auto vbroadcast =
-          jit::KernelFuncs<jit::VBroadcastTuple<T>, platform::CPUPlace>::Cache()
-              .At(out_width);
-      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-        int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
-        const T *src = d_output_data + i * out_width;
-        T *dst = d_table_data + lod[i] * out_width;
-        vbroadcast(src, dst, h, out_width);
-      }
-    } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__)
-      auto *ids = context.Input<LoDTensor>("Ids");
-      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
-      int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-
-      d_table->Resize(table_dim);
-      auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
-      memset(d_table_data, 0, d_table->numel() * sizeof(T));
-
-      const auto &ids_lod = ids->lod();
-      PADDLE_ENFORCE_EQ(ids_lod.size(), 1UL,
-                        "The LoD level of Input(Ids) must be 1");
-      const std::vector<uint64_t> offset = ids_lod[0];
-      auto len = ids->numel();
-      int idx_width = len / offset.back();
-
-      Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t;
-      csr_vals_t.Resize({len});
-      csr_colmuns_t.Resize({len});
-      int64_t batch_size = ids_lod[0].size() - 1;
-      csr_row_idx_t.Resize({(batch_size + 1) * idx_width});
-      auto csr_vals = csr_vals_t.mutable_data<T>(context.GetPlace());
-      auto csr_colmuns = csr_colmuns_t.mutable_data<int>(context.GetPlace());
-      auto csr_row_idx = csr_row_idx_t.mutable_data<int>(context.GetPlace());
-      prepare_csr_data<T>(offset, ids->data<int64_t>(), idx_width, csr_vals,
-                          csr_colmuns, csr_row_idx, padding_idx);
-
-      auto *d_output_data = d_output->data<T>();
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-      int width = static_cast<int>(table_dim[1]);
-      int num_seq = batch_size * idx_width;
-      LOG(INFO) << "num seq = " << num_seq << " width = " << width;
-      for (int i = 0; i < num_seq; ++i) {
-        for (int j = csr_row_idx[i]; j < csr_row_idx[i + 1]; ++j) {
-          unsigned int word_idx = csr_colmuns[j];
-          T val = csr_vals[j];
-          blas.AXPY(width, val, d_output_data + i * width,
-                    d_table_data + word_idx * width);
-        }
-      }
-#else
-      LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
-#endif
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc
deleted file mode 100644
index 7c5d0c71226871a3af10c8ddc16269526f0d88b9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"), true,
-        "Input(X) of fused_fc_elementwise_layernorm should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("W"), true,
-        "Input(W) of fused_fc_elementwise_layernorm should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Y"), true,
-        "Input(Y) of fused_fc_elementwise_layernorm should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        "Output(Out) of fused_fc_elementwise_layernorm should not be null.");
-
-    auto w_dims = ctx->GetInputDim("W");
-    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
-                      "Fully Connected input should be 2-D tensor.");
-
-    if (ctx->HasInput("Bias0")) {
-      auto bias0_dims = ctx->GetInputDim("Bias0");
-      if (bias0_dims.size() == 2) {
-        PADDLE_ENFORCE_EQ(bias0_dims[0], 1,
-                          "The shape of Bias must be [1, dim].");
-        PADDLE_ENFORCE_EQ(bias0_dims[1], w_dims[1],
-                          "The shape of Bias must be [1, dim].");
-      } else if (bias0_dims.size() == 1) {
-        PADDLE_ENFORCE_EQ(bias0_dims[0], w_dims[1],
-                          "The shape of Bias must be [1, dim].");
-      }
-    }
-
-    auto x_dims = ctx->GetInputDim("X");
-    int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
-    PADDLE_ENFORCE_GT(
-        x_dims.size(), x_num_col_dims,
-        "The input tensor Input's rank of FCOp should be larger than "
-        "in_num_col_dims.");
-
-    auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
-    PADDLE_ENFORCE_EQ(
-        x_mat_dims[1], w_dims[0],
-        "Fully Connected input and weigth size do not match. %s, %s");
-
-    std::vector<int64_t> fc_out_dims;
-    for (int i = 0; i < x_num_col_dims; ++i) {
-      fc_out_dims.push_back(x_dims[i]);
-    }
-    fc_out_dims.push_back(w_dims[1]);
-
-    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(framework::make_ddim(fc_out_dims), y_dims);
-
-    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
-    PADDLE_ENFORCE_LT(
-        begin_norm_axis, y_dims.size(),
-        "'begin_norm_axis' must be less than the rank of Input(Y).");
-
-    auto y_mat_dim = framework::flatten_to_2d(y_dims, begin_norm_axis);
-    int64_t dim_0 = y_mat_dim[0];
-    int64_t dim_1 = y_mat_dim[1];
-    if (ctx->HasInput("Scale")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
-
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], dim_1,
-                          "scale should with right");
-      }
-    }
-    if (ctx->HasInput("Bias1")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1").size(), 1);
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1")[0], dim_1,
-                          "bias should with right");
-      }
-    }
-
-    ctx->SetOutputDim("Out", y_dims);
-    if (ctx->HasOutput("Mean")) {
-      ctx->SetOutputDim("Mean", {dim_0});
-    }
-    if (ctx->HasOutput("Variance")) {
-      ctx->SetOutputDim("Variance", {dim_0});
-    }
-    ctx->ShareLoD("X", "Out");
-  }
-};
-
-class FusedFCElementwiseLayerNormOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor), The input tensor of fully connected operation");
-    AddInput("W",
-             "(Tensor), The weight tensor of fully connected operation. It is "
-             "a 2-D Tensor with shape (I, O)");
-    AddInput("Bias0",
-             "(Tensor, optional), The bias tensor of fully connecred "
-             "operation. It is a 1-D Tensor with shape (O), or a 2-D Tensor "
-             "with shape (1, O).")
-        .AsDispensable();
-    AddInput("Y",
-             "(Tensor), The second input tensor of elementwise_add operation. "
-             "Note that the shape should be the same as fully connect's result "
-             "tensor.");
-    AddInput(
-        "Scale",
-        "(Tensor, optional), It is a 1-D input Tensor of layer_norm operation.")
-        .AsDispensable();
-    AddInput(
-        "Bias1",
-        "(Tensor, optional), It is a 1-D input Tensor of layer_norm operation.")
-        .AsDispensable();
-    AddOutput("Out",
-              "(Tensor), Output after normalization. The shape is the shame as "
-              "layer_norm's input.");
-    AddOutput("Mean", "(Tensor, optional), Mean of the current minibatch")
-        .AsDispensable();
-    AddOutput("Variance",
-              "(Tensor, optional), Variance of the current minibatch")
-        .AsDispensable();
-    AddAttr<int>("x_num_col_dims",
-                 "(int, default 1), This op can take tensors with more than "
-                 "two dimensions as its inputs.")
-        .SetDefault(1)
-        .EqualGreaterThan(1);
-    AddAttr<std::string>("activation_type",
-                         "Activation type used in fully connected operator.")
-        .SetDefault("");
-    AddAttr<float>("epsilon",
-                   "Constant for numerical stability [default 1e-5].")
-        .SetDefault(1e-5)
-        .AddCustomChecker([](const float &epsilon) {
-          PADDLE_ENFORCE_GE(epsilon, 0.0f,
-                            "'epsilon' should be between 0.0 and 0.001.");
-          PADDLE_ENFORCE_LE(epsilon, 0.001f,
-                            "'epsilon' should be between 0.0 and 0.001.");
-        });
-    AddAttr<int>("begin_norm_axis",
-                 "the axis of `begin_norm_axis ... Rank(Y) - 1` will be "
-                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
-                 "matrix [N,H]. [default 1].")
-        .SetDefault(1)
-        .AddCustomChecker([](const int &begin_norm_axis) {
-          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
-                            "'begin_norm_axis' should be greater than zero.");
-        });
-    AddComment(R"DOC(
-fc_out <= fc(X, W, Bias0)
-add_out <= elementwise_add(fc_out, Y)
-(out, mean, variance) <= layer_norm(add_out, Scale, Bias1)
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fused_fc_elementwise_layernorm,
-                  ops::FusedFCElementwiseLayerNormOp,
-                  ops::FusedFCElementwiseLayerNormOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
deleted file mode 100644
index 74d345257a4f3aeb8eb9db9a7b4e0060e4ba1621..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cub/cub.cuh>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static __device__ __forceinline__ T Relu(T x) {
-  return (x > 0) ? x : 0;
-}
-
-static __device__ __forceinline__ float RealSqrt(float x) { return sqrtf(x); }
-static __device__ __forceinline__ double RealSqrt(double x) { return sqrt(x); }
-
-template <typename T>
-struct PairForLayerNorm {
-  __device__ __forceinline__ PairForLayerNorm() {}
-  __device__ __forceinline__ PairForLayerNorm(const T& first, const T& second)
-      : first_(first), second_(second) {}
-
-  T first_;
-  T second_;
-};
-
-template <typename T>
-struct PairForLayerNormAddFunctor {
-  __device__ __forceinline__ PairForLayerNorm<T> operator()(
-      const PairForLayerNorm<T>& p1, const PairForLayerNorm<T>& p2) {
-    return PairForLayerNorm<T>(p1.first_ + p2.first_, p1.second_ + p2.second_);
-  }
-};
-
-template <typename T, bool DoRelu, int BlockDim>
-__global__ void InplaceAddReluAddLayerNormKernel(const T* y, const T* bias_0,
-                                                 const T* bias_1,
-                                                 const T* scale, T* out,
-                                                 T* mean, T* variance, int M,
-                                                 int N, float epsilon) {
-  using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  __shared__ T shared_mem[BlockDim + 2];
-
-  for (int i = blockIdx.x; i < M; i += gridDim.x) {
-    int index = i * N + threadIdx.x;
-
-    // The fisrt BlockDim elements will be saved to shared memory.
-    int save_index = threadIdx.x;
-    T* save_ptr = shared_mem;
-
-    double sum_i = 0;
-    double square_sum_i = 0;
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      T tmp_0 = out[index];
-      // Add bias
-      T tmp_1 = bias_0 ? tmp_0 + bias_0[j] : tmp_0;
-      // Relu
-      T tmp_2 = DoRelu ? Relu(tmp_1) : tmp_1;
-      // elementwise_add
-      T tmp_3 = tmp_2 + y[index];
-
-      // Save
-      save_ptr[save_index] = tmp_3;
-      save_ptr = out;
-
-      index += blockDim.x;
-      save_index = index;
-
-      // For layer_norm, reduce to calculate mean and std
-      sum_i += tmp_3;
-      square_sum_i += (tmp_3 * tmp_3);
-    }
-
-    auto pair = BlockReduce(temp_storage)
-                    .Reduce(PairForLayerNorm<double>(sum_i, square_sum_i),
-                            PairForLayerNormAddFunctor<double>());
-
-    if (threadIdx.x == 0) {
-      T mean_i = static_cast<T>(pair.first_ / N);
-      T variance_i = static_cast<T>(pair.second_ / N - mean_i * mean_i);
-      shared_mem[BlockDim] = mean_i;
-      shared_mem[BlockDim + 1] = variance_i;
-      if (mean) {
-        mean[blockIdx.x] = mean_i;
-      }
-      if (variance) {
-        variance[blockIdx.x] = variance_i;
-      }
-    }
-    __syncthreads();
-    T mean_i = shared_mem[BlockDim];
-    T std_i = static_cast<T>(RealSqrt(shared_mem[BlockDim + 1] + epsilon));
-
-    index = i * N + threadIdx.x;
-    // First BlockDim elements loading from shared memory.
-    save_index = threadIdx.x;
-    save_ptr = shared_mem;
-
-    // For layer_norm, calculate out
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      T tmp_0 = (save_ptr[save_index] - mean_i) / std_i;
-      T tmp_1 = scale ? scale[j] * tmp_0 : tmp_0;
-      out[index] = bias_1 ? tmp_1 + bias_1[j] : tmp_1;
-
-      save_ptr = out;
-      index += blockDim.x;
-      save_index = index;
-    }
-  }
-}
-
-template <typename T>
-class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* w = ctx.Input<framework::Tensor>("W");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    auto w_dims = w->dims();
-    int N = w_dims[1];
-    int K = w_dims[0];
-    int M = framework::product(x->dims()) / K;
-
-    const T* x_data = x->data<T>();
-    const T* w_data = w->data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
-    blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), x_data, K, w_data, N,
-              static_cast<T>(0.0), out_data, N);
-
-    auto* y = ctx.Input<framework::Tensor>("Y");
-    auto* bias_0 = ctx.Input<framework::Tensor>("Bias0");
-    auto* bias_1 = ctx.Input<framework::Tensor>("Bias1");
-    auto* scale = ctx.Input<framework::Tensor>("Scale");
-
-    const T* y_data = y->data<T>();
-    const T* bias_0_data = bias_0 ? bias_0->data<T>() : nullptr;
-    const T* bias_1_data = bias_1 ? bias_1->data<T>() : nullptr;
-    const T* scale_data = scale ? scale->data<T>() : nullptr;
-
-    auto* mean = ctx.Output<framework::Tensor>("Mean");
-    auto* variance = ctx.Output<framework::Tensor>("Variance");
-
-    T* mean_data = mean ? mean->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* variance_data =
-        variance ? variance->mutable_data<T>(ctx.GetPlace()) : nullptr;
-
-    bool with_relu =
-        (ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
-    float epsilon = ctx.Attr<float>("epsilon");
-
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    if (with_relu) {
-      switch (platform::RoundToPowerOfTwo(N)) {
-        CUDA_LAUNCH_KERNEL_HELPER(
-            InplaceAddReluAddLayerNormKernel<
-                T, true,
-                kPowerOfTwoDim><<<std::max(max_threads / kPowerOfTwoDim, 1),
-                                  kPowerOfTwoDim, 0, dev_ctx.stream()>>>(
-                y_data, bias_0_data, bias_1_data, scale_data, out_data,
-                mean_data, variance_data, M, N, epsilon));
-      }
-    } else {
-      switch (platform::RoundToPowerOfTwo(N)) {
-        CUDA_LAUNCH_KERNEL_HELPER(
-            InplaceAddReluAddLayerNormKernel<
-                T, false,
-                kPowerOfTwoDim><<<std::max(max_threads / kPowerOfTwoDim, 1),
-                                  kPowerOfTwoDim, 0, dev_ctx.stream()>>>(
-                y_data, bias_0_data, bias_1_data, scale_data, out_data,
-                mean_data, variance_data, M, N, epsilon));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(fused_fc_elementwise_layernorm,
-                        ops::FusedFCElementwiseLayerNormOpKernel<float>,
-                        ops::FusedFCElementwiseLayerNormOpKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc
deleted file mode 100644
index 80cc0eb8cb6f6961e8e6a284ac50c9de35d6e36d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
-#include "paddle/fluid/platform/cudnn_workspace_helper.h"
-
-namespace paddle {
-namespace operators {
-
-class ConvInceptionFusionOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    // 1 x
-    auto in_dims = ctx->GetInputDim("Input");
-    // 4 filters
-    auto w_dims = ctx->GetInputsDim("Filter");
-
-    PADDLE_ENFORCE(in_dims.size(), 4, "Conv intput should be 4-D tensor.");
-    PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters");
-    PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]);
-    PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]);
-
-    int n = in_dims[0];
-    // compute output channel
-    // 1st channel
-    int c = w_dims[0][0];
-    // add 2nd channel
-    c += (w_dims[1][0] - w_dims[2][1] * 2);
-    // add 3rd channel
-    c += (w_dims[2][0] - w_dims[3][1]);
-    // add 4-th channel
-    c += w_dims[3][0];
-
-    int h = in_dims[2];
-    int w = in_dims[3];
-
-    ctx->SetOutputDim("Output", {n, c, h, w});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
-  }
-};
-
-class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker {
- protected:
-  void Make() override {
-    AddInput("Input", "(Tensor) NCHW layout.");
-    AddInput("Filter", "(vector<Tensor>) 4 aggregated filters").AsDuplicable();
-    AddInput("Bias", "(vector<Tensor>) it's length is equal to Filter")
-        .AsDuplicable();
-    AddOutput("Output",
-              "(Tensor) The output tensor of convolution operator. "
-              "The format of output tensor is also NCHW.");
-    AddOutput("TempOutput", "").AsDuplicable();
-    AddAttr<std::string>(
-        "pooling_type",
-        "(string), pooling type, can be \"max\" for max-pooling "
-        "and \"avg\" for average-pooling.")
-        .InEnum({"max", "avg"});
-    AddAttr<bool>(
-        "exclusive",
-        "(bool, default True) When true, will exclude the zero-padding in the "
-        "averaging calculating, otherwise, include the zero-padding. Note, it "
-        "is only used when pooling_type is avg. The default is True.")
-        .SetDefault(true);
-    AddAttr<std::string>(
-        "activation",
-        "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
-        "'relux' , 'tanh', 'band_pass'")
-        .SetDefault("relu");
-    AddAttr<int>("workspace_size_MB",
-                 "Only used in cudnn kernel. Need set use_cudnn to true."
-                 "workspace size for cudnn, in MB, "
-                 "workspace is a section of GPU memory which will be "
-                 "allocated/freed each time the operator runs, larger "
-                 "workspace size can increase performance but also requires "
-                 "better hardware. This size should be chosen carefully.")
-        .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB);
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(conv2d_inception_fusion, ops::ConvInceptionFusionOp,
-                  ops::ConvInceptionFusionOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
deleted file mode 100644
index 63e97ab5d98cdb906bab1aaf759de00d16455729..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+++ /dev/null
@@ -1,272 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-DECLARE_uint64(conv_workspace_size_limit);
-
-namespace paddle {
-namespace operators {
-
-#if CUDNN_VERSION >= 7100
-using Tensor = framework::Tensor;
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
-using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
-using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
-using DataLayout = platform::DataLayout;
-
-using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
-using PoolingMode = platform::PoolingMode;
-template <typename T>
-using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
-
-template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
-
-template <typename T>
-class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto* input = ctx.Input<Tensor>("Input");
-    auto filters = ctx.MultiInput<framework::Tensor>("Filter");
-    auto bias = ctx.MultiInput<framework::Tensor>("Bias");
-
-    auto* output = ctx.Output<Tensor>("Output");
-    auto temp_outs = ctx.MultiOutput<framework::Tensor>("TempOutput");
-
-    const std::string pool_type = ctx.Attr<std::string>("pooling_type");
-    const std::string activation = ctx.Attr<std::string>("activation");
-    const bool exclusive = ctx.Attr<bool>("exclusive");
-
-    int64_t user_workspace_size =
-        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    T* temp_data = temp_outs[0]->mutable_data<T>(input->dims(), ctx.GetPlace());
-
-    DataLayout layout = DataLayout::kNCHW;
-    std::vector<int> in_dim = framework::vectorize<int>(input->dims());
-
-    // ------------------- cudnn descriptors ---------------------
-    PoolingMode pooling_mode;
-    if (pool_type == "max") {
-      pooling_mode = PoolingMode::kMaximum;
-    } else {
-      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
-                               : (PoolingMode::kAverageInclusive);
-    }
-    std::vector<int> k0x0 = {0, 0};
-    std::vector<int> k1x1 = {1, 1};
-    std::vector<int> k1x1_2 = {1, 1};
-    std::vector<int> k3x3 = {3, 3};
-    ScopedPoolingDescriptor pool_desc;
-    ScopedActivationDescriptor act_desc;
-    ScopedTensorDescriptor out_pool_desc;
-    ScopedTensorDescriptor input_desc;
-    cudnnPoolingDescriptor_t cudnn_pool_desc =
-        pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1);
-
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize<int>(input->dims()));
-    cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor<T>(
-        layout, framework::vectorize<int>(input->dims()));
-
-    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
-    cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4];
-    cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4];
-    cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4];
-    cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4];
-    cudnnConvolutionDescriptor_t* conv_desc =
-        new cudnnConvolutionDescriptor_t[4];
-    for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
-    }
-
-    std::vector<std::vector<int>> filter_dims;
-    std::vector<std::vector<int>> bias_dims;
-    std::vector<std::vector<int>> in_dims;
-    std::vector<std::vector<int>> out_dims;
-    std::vector<std::vector<int>> in_strides;
-    std::vector<std::vector<int>> out_strides;
-    std::vector<std::vector<int>> bias_strides;
-
-    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
-    int n = in_dim[0];
-    int h = in_dim[2];
-    int w = in_dim[3];
-    int oc = output->dims()[1];
-
-    cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE)
-                                       ? CUDNN_DATA_DOUBLE
-                                       : CUDNN_DATA_FLOAT;
-
-    for (int i = 0; i < 4; ++i) {
-      filter_dims.push_back(framework::vectorize<int>(filters[i]->dims()));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
-          filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
-      bias_dims.push_back({1, filter_dims[i][0], 1, 1});
-      bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(),
-          bias_strides[i].data()));
-      in_dims.push_back({n, filter_dims[i][1], h, w});
-      out_dims.push_back({n, filter_dims[i][0], h, w});
-      in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1});
-      out_strides.push_back({oc * h * w, h * w, w, 1});
-
-      if (i < 2) {
-        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
-            conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
-            CUDNN_CROSS_CORRELATION, compute_type));
-      } else {
-        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
-            conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
-            CUDNN_CROSS_CORRELATION, compute_type));
-      }
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          conv_desc[i], CUDNN_DEFAULT_MATH));
-    }
-    in_dims[2][1] *= 2;
-    in_strides[2][0] = oc * h * w;
-    out_strides[2][0] = filter_dims[2][0] * h * w;  // this out is continuous.
-    in_strides[3][0] = filter_dims[2][0] * h * w;
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));
-
-    cudnnConvolutionFwdAlgo_t algo[4];
-    auto handle = dev_ctx.cudnn_handle();
-    size_t workspace_size_in_bytes = 0;  // final workspace to allocate.
-
-    size_t workspace_size_limit = 0;
-    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
-      int64_t max_user_size =
-          std::min(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
-                   user_workspace_size);
-      workspace_size_limit = max_user_size * 1024 * 1024;
-    }
-
-    for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
-          out_strides[i].data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
-          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
-          &algo[i]));
-      size_t tmp_size = 0;
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
-          algo[i], &tmp_size));
-      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
-    }
-    cudnnActivationDescriptor_t cudnn_act_desc =
-        act_desc.descriptor<T>(activation);
-
-    int oc0 = filter_dims[0][0];
-    int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2;
-    int oc3 = filter_dims[3][0];
-    int oc2 = oc - oc0 - oc1 - oc3;
-
-    // branch1: pool + 1x1 conv
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
-        handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
-        pool_out_desc, temp_data));
-
-    std::vector<const void*> in_datas;
-    in_datas.push_back(static_cast<const void*>(temp_data));
-    in_datas.push_back(static_cast<const void*>(input_data));
-    in_datas.push_back(
-        static_cast<const void*>(output_data + (oc0 + oc1) * h * w));
-    T* temp2_data = temp_outs[1]->mutable_data<T>(
-        framework::make_ddim(out_dims[2]), ctx.GetPlace());
-    in_datas.push_back(static_cast<const void*>(temp2_data + oc2 * h * w));
-
-    std::vector<void*> out_datas;
-    out_datas.push_back(static_cast<void*>(output_data));
-    out_datas.push_back(static_cast<void*>(output_data + oc0 * h * w));
-    out_datas.push_back(static_cast<void*>(temp2_data));
-    out_datas.push_back(
-        static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));
-
-    for (int i = 0; i < 4; ++i) {
-      auto func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-            handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
-            static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
-            algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
-            out_desc[i], out_datas[i], bias_desc[i],
-            static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
-            out_desc[i], out_datas[i]));
-      };
-      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-      workspace_handle.RunFunc(func, workspace_size_in_bytes);
-    }
-
-    cudnnTensorDescriptor_t x_desc;
-    cudnnTensorDescriptor_t y_desc;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
-        handle, CudnnDataType<T>::kOne(), x_desc,
-        static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(),
-        y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w)));
-
-    for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
-      CUDNN_ENFORCE(
-          platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
-    }
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
-  }
-};
-#endif
-
-}  // namespace operators
-}  // namespace paddle
-
-#if CUDNN_VERSION >= 7100
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion,
-                        ops::CUDNNConvInceptionFusionOpKernel<float>,
-                        ops::CUDNNConvInceptionFusionOpKernel<double>);
-#endif
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
deleted file mode 100644
index 5c89509907375b5f2089224c21dd1ef67872c2fd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ /dev/null
@@ -1,404 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_gru_op.h"
-#include <cstring>  // for memcpy
-#include <string>
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/fc.h"
-#include "paddle/fluid/operators/math/sequence2batch.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of GRU.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
-                 "Assert only one Input(WeightX) of GRU.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
-                 "Assert only one Input(WeightH) of GRU.");
-  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of GRU.");
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Assert only one Output(Hidden) of GRU.");
-
-  auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
-
-  auto wx_dims = ctx->GetInputDim("WeightX");
-  PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
-                    "The rank of Input(WeightX) should be 2.");
-  PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
-                    "The first dimension of Input(WeightX) "
-                    "should be %d.",
-                    x_dims[1]);
-
-  int frame_size = wx_dims[1] / 3;
-  auto wh_dims = ctx->GetInputDim("WeightH");
-  PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
-                    "The rank of Input(WeightH) should be 2.");
-  PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
-                    "The first dimension of Input(WeightH) "
-                    "should be %d.",
-                    frame_size);
-  PADDLE_ENFORCE_EQ(wh_dims[1], 3 * frame_size,
-                    "The second dimension of Input(WeightH) "
-                    "should be 3 * %d.",
-                    frame_size);
-
-  if (ctx->HasInput("H0")) {
-    auto h0_dims = ctx->GetInputDim("H0");
-    PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
-                      "The width of H0 must be equal to frame_size.");
-  }
-  if (ctx->HasInput("Bias")) {
-    auto b_dims = ctx->GetInputDim("Bias");
-    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
-    PADDLE_ENFORCE_EQ(b_dims[0], 1,
-                      "The first dimension of Input(Bias) should be 1.");
-    PADDLE_ENFORCE_EQ(b_dims[1], frame_size * 3,
-                      "The shape of Bias must be [1, frame_size * 3].");
-  }
-  framework::DDim out_dims({x_dims[0], frame_size});
-  ctx->SetOutputDim("Hidden", out_dims);
-  ctx->ShareLoD("X", "Hidden");
-  int xx_width;
-  if (ctx->Attrs().Get<bool>("use_seq")) {
-    xx_width = wx_dims[1];
-  } else {
-    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                   "Assert only one Output(ReorderedH0) of GRU.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                   "Assert only one Output(BatchedInput) of GRU.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
-                   "Assert only one Output(BatchedOut) of GRU.");
-    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
-    ctx->SetOutputDim("BatchedOut", out_dims);
-  }
-  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
-  ctx->ShareLoD("X", "XX");
-}
-
-framework::OpKernelType FusionGRUOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                 ctx.device_context());
-}
-
-void FusionGRUOpMaker::Make() {
-  AddInput("X",
-           "(LoDTensor) the input is a LodTensor, which support "
-           "variable-time length input sequence. The underlying tensor in "
-           "this LoDTensor is a matrix with shape (T X M), where T is the "
-           "total time steps in this mini-batch, M is the dim size of x.");
-  AddInput("H0",
-           "(Tensor, optional) The initial hidden state is an optional "
-           "input. This is a tensor with shape (N x D), where N is the "
-           "batch size, D is the hidden size.")
-      .AsDispensable();
-  AddInput("WeightX",
-           "(Tensor) The FC weight with shape (M x 3D),"
-           "where M is the dim size of x, D is the hidden size. ");
-  AddInput("WeightH",
-           "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
-           "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}"
-           "Acutally they are D x 2D and D x D two part weights."
-           "{W_update, W_reset; W_state}"
-           "{D x (D + D); D x D}");
-  AddInput("Bias",
-           "(Tensor, optional) (1 x 3D)."
-           "Almost same as GRUOp."
-           "Note: if have FC bias it should be added on this bias.")
-      .AsDispensable();
-  AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.")
-      .AsIntermediate();
-  AddOutput("XX",
-            "(LoDTensor) the result after X * WeightX (size is T x 3D)"
-            " or batched_X (size is T x M), this will be automatically chosen,"
-            " where T is the total time steps in this mini-batch,"
-            " D is the hidden size, M is the dim size of x input.")
-      .AsIntermediate();
-  AddOutput("BatchedInput",
-            "(LoDTensor) This is the batched result of input X"
-            "or the batched result after fc, shape (T x 3D)")
-      .AsIntermediate();
-  AddOutput("BatchedOut", "(LoDTensor) (T X D) save batched hidden.")
-      .AsIntermediate();
-  AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp");
-  AddAttr<std::string>("activation",
-                       "(string, default tanh) "
-                       "The activation type used for output candidate {h}_t.")
-      .SetDefault("tanh");
-  AddAttr<std::string>(
-      "gate_activation",
-      "(string, default sigmoid) "
-      "The activation type used in update gate and reset gate.")
-      .SetDefault("sigmoid");
-  AddAttr<bool>("is_reverse",
-                "(bool, default: False) "
-                "whether to compute reversed GRU.")
-      .SetDefault(false);
-  AddAttr<bool>("use_seq",
-                "(bool, default: True) "
-                "whether to use seq mode to compute GRU.")
-      .SetDefault(true);
-  AddComment(R"DOC(
-The Fusion complete GRU Operator.
-This operator fuse the fully-connected operator into GRU, 
-more details can refer to GRU op.
-)DOC");
-}
-
-template <typename T>
-class FusionGRUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    if (ctx.Attr<bool>("use_seq")) {
-      SeqCompute(ctx);
-    } else {
-      BatchCompute(ctx);
-    }
-  }
-
-#define INIT_BASE_DEFINES                  \
-  auto* x = ctx.Input<LoDTensor>("X");     \
-  auto* wh = ctx.Input<Tensor>("WeightH"); \
-  auto* xx = ctx.Output<LoDTensor>("XX");  \
-  auto x_lod = x->lod();                   \
-  auto x_dims = x->dims();   /* T x M*/    \
-  auto wh_dims = wh->dims(); /* D x 3D*/   \
-  const int total_T = x_dims[0];           \
-  const int D3 = wh_dims[1]
-
-#define INIT_OTHER_DEFINES                                                   \
-  auto* h0 = ctx.Input<Tensor>("H0");                                        \
-  auto* wx = ctx.Input<Tensor>("WeightX");                                   \
-  auto* bias = ctx.Input<Tensor>("Bias");                                    \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                        \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");                            \
-  const int M = x_dims[1];                                                   \
-  const int D = wh_dims[0];                                                  \
-  const int D2 = D * 2;                                                      \
-  const jit::gru_attr_t attr(                                                \
-      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),       \
-      jit::to_kerneltype(ctx.Attr<std::string>("activation")));              \
-  jit::gru_t one_step;                                                       \
-  auto ComputeH1 =                                                           \
-      jit::KernelFuncs<jit::GRUH1Tuple<T>, platform::CPUPlace>::Cache().At(  \
-          attr);                                                             \
-  auto ComputeHtPart1 =                                                      \
-      jit::KernelFuncs<jit::GRUHtPart1Tuple<T>, platform::CPUPlace>::Cache() \
-          .At(attr);                                                         \
-  auto ComputeHtPart2 =                                                      \
-      jit::KernelFuncs<jit::GRUHtPart2Tuple<T>, platform::CPUPlace>::Cache() \
-          .At(attr);                                                         \
-  const T* x_data = x->data<T>();                                            \
-  const T* wx_data = wx->data<T>();                                          \
-  const T* wh_data = wh->data<T>();                                          \
-  auto place = ctx.GetPlace();                                               \
-  T* xx_data = xx->mutable_data<T>(place)
-
-  void SeqCompute(const framework::ExecutionContext& ctx) const {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-    INIT_BASE_DEFINES;
-    INIT_OTHER_DEFINES;
-    const int N = x_lod[0].size() - 1;
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
-    const T* wh_state_data = wh_data + D * D2;
-    T* hidden_out_data = hidden_out->mutable_data<T>(place);
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    math::FCFunctor<DeviceContext, T> fc;
-    fc(dev_ctx, total_T, D3, M, x_data, wx_data, xx_data,
-       bias ? bias->data<T>() : nullptr);
-
-    int xx_offset = D3;
-    int gate_offset = D;
-    if (is_reverse) {
-      const int offset = (total_T - 1) * D;
-      xx_data = xx_data + offset * 3;
-      hidden_out_data = hidden_out_data + offset;
-      xx_offset = -D3;
-      gate_offset = -D;
-    }
-    auto move_step = [&]() {
-      xx_data = xx_data + xx_offset;
-      hidden_out_data = hidden_out_data + gate_offset;
-    };
-    for (int i = 0; i < N; ++i) {
-      int bid = is_reverse ? N - 1 - i : i;
-      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
-      const T* prev_hidden_data = nullptr;
-      int tstart = 0;
-      if (h0_data) {
-        prev_hidden_data = h0_data + bid * D;
-      } else {
-        one_step.gates = xx_data;
-        one_step.ht = hidden_out_data;
-        ComputeH1(&one_step, &attr);
-        prev_hidden_data = hidden_out_data;
-        tstart = 1;
-        move_step();
-      }
-      for (int step = tstart; step < seq_len; ++step) {
-        // gemm prev * (Wu + Wr)
-        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
-                  prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
-                  D3);
-        one_step.gates = xx_data;
-        one_step.ht_1 = prev_hidden_data;
-        one_step.ht = hidden_out_data;
-        ComputeHtPart1(&one_step, &attr);
-        // gemm rt * Ws
-        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
-                  hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
-                  xx_data + D2, D3);
-        ComputeHtPart2(&one_step, &attr);
-        // save prev
-        prev_hidden_data = hidden_out_data;
-        move_step();
-      }
-    }
-  }
-
-  void BatchCompute(const framework::ExecutionContext& ctx) const {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-    INIT_BASE_DEFINES;
-    if (x_lod[0].size() == 2) {
-      xx->Resize({total_T, D3});
-      SeqCompute(ctx);
-      return;
-    }
-    INIT_OTHER_DEFINES;
-    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
-    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
-    auto* batched_out = ctx.Output<LoDTensor>("BatchedOut");
-    T* batched_input_data = batched_input->mutable_data<T>(place);
-    T* batched_out_data = batched_out->mutable_data<T>(place);
-    hidden_out->mutable_data<T>(place);
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-
-    math::FCFunctor<DeviceContext, T> fc;
-    if (M > D3) {
-      fc(dev_ctx, total_T, D3, M, x_data, wx_data, xx_data,
-         bias ? bias->data<T>() : nullptr);
-      to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
-    } else {
-      to_batch(dev_ctx, *x, xx, true, is_reverse);
-      batched_input->set_lod(xx->lod());
-      fc(dev_ctx, total_T, D3, M, xx_data, wx_data, batched_input_data,
-         bias ? bias->data<T>() : nullptr);
-    }
-
-    auto batched_lod = batched_input->lod();
-    const auto& seq_order = batched_lod[2];
-    const int max_bs = seq_order.size();
-    reordered_h0->Resize({max_bs, D});
-
-    int tstart = 0;
-    T* prev_hidden_data = nullptr;
-    if (h0) {
-      // reorder h0
-      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
-      const T* h0_data = h0->data<T>();
-      prev_hidden_data = reordered_h0_data;
-      size_t sz = sizeof(T) * D;
-      for (int i = 0; i < max_bs; ++i) {
-        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
-        reordered_h0_data += D;
-      }
-    } else {
-      // compute without h0
-      T* cur_in_data = batched_input_data;
-      T* cur_out_data = batched_out_data;
-      // W: {W_update, W_reset; W_state}
-      for (int i = 0; i < max_bs; ++i) {
-        one_step.gates = cur_in_data;
-        one_step.ht = cur_out_data;
-        ComputeH1(&one_step, &attr);
-        // add offset
-        cur_in_data += D3;
-        cur_out_data += D;
-      }
-      tstart = 1;
-      prev_hidden_data = batched_out_data;
-    }
-    // Then start from next
-    const T* wh_state_data = wh_data + D * D2;
-    const auto& batch_starts = batched_lod[0];
-    const int max_seq_len = batch_starts.size() - 1;
-    batched_input_data = batched_input_data + tstart * max_bs * D3;
-    batched_out_data = batched_out_data + tstart * max_bs * D;
-    for (int step = tstart; step < max_seq_len; ++step) {
-      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-      // gemm prev * (Wu + Wr)
-      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D2, D, static_cast<T>(1),
-                prev_hidden_data, D, wh_data, D2, static_cast<T>(1),
-                batched_input_data, D3);
-
-      T* cur_batched_data = batched_input_data;
-      T* cur_out_data = batched_out_data;
-      T* cur_prev_hidden_data = prev_hidden_data;
-      for (int i = 0; i < cur_bs; ++i) {
-        one_step.gates = cur_batched_data;
-        one_step.ht_1 = cur_prev_hidden_data;
-        one_step.ht = cur_out_data;
-        ComputeHtPart1(&one_step, &attr);
-
-        cur_batched_data += D3;
-        cur_prev_hidden_data += D;
-        cur_out_data += D;
-      }
-
-      cur_batched_data = batched_input_data;
-      cur_out_data = batched_out_data;
-      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D, D, static_cast<T>(1),
-                cur_out_data, D, wh_state_data, D, static_cast<T>(1),
-                cur_batched_data + D2, D3);
-
-      cur_prev_hidden_data = prev_hidden_data;
-      for (int i = 0; i < cur_bs; ++i) {
-        one_step.gates = cur_batched_data;
-        one_step.ht_1 = cur_prev_hidden_data;
-        one_step.ht = cur_out_data;
-        ComputeHtPart2(&one_step, &attr);
-        cur_batched_data += D3;
-        cur_prev_hidden_data += D;
-        cur_out_data += D;
-      }
-      prev_hidden_data = batched_out_data;
-      batched_out_data = cur_out_data;
-      batched_input_data = cur_batched_data;
-    }
-
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batched_out->set_lod(batched_lod);
-    to_seq(dev_ctx, *batched_out, hidden_out);
-  }
-#undef INIT_OTHER_DEFINES
-#undef INIT_BASE_DEFINES
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel<float>,
-                       ops::FusionGRUKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.h b/paddle/fluid/operators/fused/fusion_gru_op.h
deleted file mode 100644
index eaa59cd412f8f2fd0089428f5e25202c70f032c7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_gru_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class FusionGRUOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionGRUOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
deleted file mode 100644
index 32f0e37a64b98d7e184bd6522504b6821a548af4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ /dev/null
@@ -1,481 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
-#include <string>
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/fc.h"
-#include "paddle/fluid/operators/math/sequence2batch.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
-                 "Assert only one Input(WeightX) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
-                 "Assert only one Input(WeightH) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Assert only one Output(Hidden) of LSTM.");
-  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                 "Assert only one Output(Cell) of LSTM.");
-
-  auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
-
-  if (ctx->HasInput("H0")) {
-    PADDLE_ENFORCE(ctx->HasInput("C0"),
-                   "Input(Cell) and Input(Hidden) of LSTM should not "
-                   "be null at the same time.");
-    auto h_dims = ctx->GetInputDim("H0");
-    auto c_dims = ctx->GetInputDim("C0");
-    PADDLE_ENFORCE(h_dims == c_dims,
-                   "The dimension of Input(H0) and Input(C0) "
-                   "should be the same.");
-  }
-
-  auto wx_dims = ctx->GetInputDim("WeightX");
-  PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
-                    "The rank of Input(WeightX) should be 2.");
-  PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
-                    "The first dimension of Input(WeightX) "
-                    "should be %d.",
-                    x_dims[1]);
-
-  int frame_size = wx_dims[1] / 4;
-  auto wh_dims = ctx->GetInputDim("WeightH");
-  PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
-                    "The rank of Input(WeightH) should be 2.");
-  PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
-                    "The first dimension of Input(WeightH) "
-                    "should be %d.",
-                    frame_size);
-  PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size,
-                    "The second dimension of Input(WeightH) "
-                    "should be 4 * %d.",
-                    frame_size);
-
-  auto b_dims = ctx->GetInputDim("Bias");
-  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
-  PADDLE_ENFORCE_EQ(b_dims[0], 1,
-                    "The first dimension of Input(Bias) should be 1.");
-  if (ctx->Attrs().Get<bool>("use_peepholes")) {
-    PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
-                      "The second dimension of Input(Bias) should be "
-                      "7 * %d if enable peepholes connection",
-                      frame_size);
-    ctx->SetOutputDim("CheckedCell", {2, frame_size});
-  } else {
-    PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
-                      "The second dimension of Input(Bias) should be "
-                      "4 * %d if disable peepholes",
-                      frame_size);
-  }
-
-  framework::DDim out_dims({x_dims[0], frame_size});
-  ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("Cell", out_dims);
-  ctx->ShareLoD("X", "Hidden");
-  ctx->ShareLoD("X", "Cell");
-  int xx_width;
-  if (ctx->Attrs().Get<bool>("use_seq")) {
-    xx_width = wx_dims[1];
-  } else {
-    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                   "Assert only one Output(BatchedInput) of LSTM.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
-                   "Assert only one Output(BatchedHidden) of LSTM.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
-                   "Assert only one Output(BatchedCell) of LSTM.");
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                   "Assert only one Output(ReorderedH0) of LSTM");
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
-                   "Assert only one Output(ReorderedC0) of LSTM.");
-    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
-    ctx->SetOutputDim("BatchedHidden", out_dims);
-    ctx->SetOutputDim("BatchedCell", out_dims);
-  }
-  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
-  ctx->ShareLoD("X", "XX");
-}
-
-framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                 ctx.device_context());
-}
-
-void FusionLSTMOpMaker::Make() {
-  AddInput("X",
-           "(LoDTensor) the input is a LodTensor, which support "
-           "variable-time length input sequence. The underlying tensor in "
-           "this LoDTensor is a matrix with shape (T X M), where T is the "
-           "total time steps in this mini-batch, M is the dim size of x.");
-  AddInput("WeightX",
-           "(Tensor) the learnable weights of X."
-           " - The shape is (M x 4D), where M is the dim size of x, D is the "
-           "hidden size. "
-           " - Weight = {W_cx, W_ix, W_fx, W_ox}");
-  AddInput("WeightH",
-           "(Tensor) same as LSTMOp, the learnable hidden-hidden weights."
-           " - The shape is (D x 4D), where D is the hidden size. "
-           " - Weight = {W_ch, W_ih, W_fh, W_oh}");
-  AddInput("Bias",
-           "(Tensor) the learnable weights. Almost same as LSTMOp"
-           "Note: we should add the fc bias into this (1x4D) in bias."
-           "input-hidden bias weight and peephole connections weight if "
-           "setting `use_peepholes` True. "
-           "1. `use_peepholes = False` "
-           " - The shape is (1 x 4D). "
-           " - Bias = {b_c, b_i, b_f, b_o}."
-           "2. `use_peepholes = True` "
-           " - The shape is (1 x 7D). "
-           " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
-  AddInput("H0",
-           "(Tensor, optional) (same as LSTMOp) the initial hidden state is an "
-           "optional "
-           "input. This is a tensor with shape (N x D), where N is the "
-           "batch size and D is the hidden size.")
-      .AsDispensable();
-  AddInput("C0",
-           "(Tensor, optional) (same as LSTMOp) (the initial cell state is an "
-           "optional "
-           "input. This is a tensor with shape (N x D), where N is the "
-           "batch size. `H0` and `C0` can be NULL but only at the same time.")
-      .AsDispensable();
-  AddOutput("Hidden",
-            "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. "
-            "The shape is (T x D), and lod is the same with the `Input`.");
-  AddOutput("Cell",
-            "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. "
-            "The shape is (T x D), and lod is the same with the `Input`.");
-  AddOutput("XX",
-            "(LoDTensor) the result after X * WeightX (size is T x 4D)"
-            " or batched_X (size is T x M), this will be automatically chosen,"
-            " where T is the total time steps in this mini-batch,"
-            " D is the hidden size, M is the dim size of x input.")
-      .AsIntermediate();
-  AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate();
-  AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate();
-  AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
-  AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
-  AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
-  AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.")
-      .AsIntermediate();
-  AddAttr<bool>("use_peepholes",
-                "(bool, default: True) "
-                "whether to enable diagonal/peephole connections.")
-      .SetDefault(true);
-  AddAttr<bool>("is_reverse",
-                "(bool, default: False) "
-                "whether to compute reversed LSTM.")
-      .SetDefault(false);
-  AddAttr<bool>("use_seq",
-                "(bool, default: True) "
-                "whether to use seq mode to compute.")
-      .SetDefault(true);
-  AddAttr<std::string>("gate_activation",
-                       "(string, default: sigmoid)"
-                       "The activation for input gate, forget gate and output "
-                       "gate, `sigmoid` by default.")
-      .SetDefault("sigmoid")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<std::string>("cell_activation",
-                       "(string, default: tanh)"
-                       "The activation for cell output, `tanh` by default.")
-      .SetDefault("tanh")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<std::string>("candidate_activation",
-                       "(string, default: tanh)"
-                       "The activation for candidate hidden state, "
-                       "`tanh` by default.")
-      .SetDefault("tanh")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddComment(R"DOC(
-Fusion Long-Short Term Memory (LSTM) Operator.
-This operator fuse the X into LSTM, more details can refer to LSTM op.
-)DOC");
-}
-
-template <typename T>
-class FuisonLSTMKernel : public framework::OpKernel<T> {
- public:
-#define INIT_BASE_DEFINES                                   \
-  using DeviceContext = paddle::platform::CPUDeviceContext; \
-  auto* x = ctx.Input<LoDTensor>("X");                      \
-  auto* h0 = ctx.Input<Tensor>("H0");                       \
-  auto* c0 = ctx.Input<Tensor>("C0");                       \
-  auto* wx = ctx.Input<Tensor>("WeightX");                  \
-  auto* wh = ctx.Input<Tensor>("WeightH");                  \
-  auto* bias = ctx.Input<Tensor>("Bias");                   \
-  auto* xx = ctx.Output<LoDTensor>("XX");                   \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");       \
-  auto* cell_out = ctx.Output<LoDTensor>("Cell");           \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");           \
-  bool use_peepholes = ctx.Attr<bool>("use_peepholes");     \
-  auto x_dims = x->dims();   /* T x M*/                     \
-  auto wh_dims = wh->dims(); /* D x 4D*/                    \
-  const int M = x_dims[1];                                  \
-  const int D = wh_dims[0];                                 \
-  const int D4 = wh_dims[1]
-
-#define INIT_OTHER_DEFINES                                                     \
-  const T* x_data = x->data<T>();                                              \
-  const T* wx_data = wx->data<T>();                                            \
-  const T* wh_data = wh->data<T>();                                            \
-  /* diagonal weight*/                                                         \
-  const T* wp_data = bias->data<T>() + D4;                                     \
-  /* for peephole only*/                                                       \
-  T* checked_cell_data = nullptr;                                              \
-  auto place = ctx.GetPlace();                                                 \
-  if (use_peepholes) {                                                         \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                           \
-    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                    \
-    checked_cell_data = checked_cell->mutable_data<T>(place);                  \
-  }                                                                            \
-  const jit::lstm_attr_t attr(                                                 \
-      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),         \
-      jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")),       \
-      jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")),            \
-      use_peepholes);                                                          \
-  jit::lstm_t one_step;                                                        \
-  one_step.wp = wp_data;                                                       \
-  one_step.checked = checked_cell_data;                                        \
-  auto ComputeC1H1 =                                                           \
-      jit::KernelFuncs<jit::LSTMC1H1Tuple<T>, platform::CPUPlace>::Cache().At( \
-          attr);                                                               \
-  auto ComputeCtHt =                                                           \
-      jit::KernelFuncs<jit::LSTMCtHtTuple<T>, platform::CPUPlace>::Cache().At( \
-          attr)
-
-// Wh GEMM
-#define GEMM_WH_ADDON(bs, prev, out)                                           \
-  blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
-            wh_data, D4, static_cast<T>(1), out, D4)
-
-  void SeqCompute(const framework::ExecutionContext& ctx) const {
-    INIT_BASE_DEFINES;
-    INIT_OTHER_DEFINES;
-    auto x_lod = x->lod();
-    const int total_T = x_dims[0];
-    const int N = x_lod[0].size() - 1;
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
-    const T* c0_data = c0 ? c0->data<T>() : nullptr;
-    T* xx_data = xx->mutable_data<T>(place);
-    T* h_out_data = hidden_out->mutable_data<T>(place);
-    T* c_out_data = cell_out->mutable_data<T>(place);
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    math::FCFunctor<DeviceContext, T> fc;
-    fc(dev_ctx, total_T, D4, M, x_data, wx_data, xx_data, bias->data<T>());
-
-    int xx_offset = D4;
-    int gate_offset = D;
-    if (is_reverse) {
-      const int offset = (total_T - 1) * D;
-      xx_data = xx_data + offset * 4;
-      h_out_data = h_out_data + offset;
-      c_out_data = c_out_data + offset;
-      xx_offset = -D4;
-      gate_offset = -D;
-    }
-
-    for (int i = 0; i < N; ++i) {
-      int bid = is_reverse ? N - 1 - i : i;
-      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
-      const T* prev_c_data = nullptr;
-      const T* prev_h_data = nullptr;
-      int tstart = 0;
-      if (h0_data) {
-        prev_h_data = h0_data + bid * D;
-        prev_c_data = c0_data + bid * D;
-      } else {
-        one_step.gates = xx_data;
-        one_step.ct = c_out_data;
-        one_step.ht = h_out_data;
-        ComputeC1H1(&one_step, &attr);
-        tstart = 1;
-        // move one step
-        prev_h_data = h_out_data;
-        prev_c_data = c_out_data;
-        xx_data = xx_data + xx_offset;
-        h_out_data = h_out_data + gate_offset;
-        c_out_data = c_out_data + gate_offset;
-      }
-      for (int step = tstart; step < seq_len; ++step) {
-        GEMM_WH_ADDON(1, prev_h_data, xx_data);
-
-        one_step.gates = xx_data;
-        one_step.ct_1 = prev_c_data;
-        one_step.ct = c_out_data;
-        one_step.ht = h_out_data;
-        ComputeCtHt(&one_step, &attr);
-        // move one step
-        prev_h_data = h_out_data;
-        prev_c_data = c_out_data;
-        xx_data = xx_data + xx_offset;
-        h_out_data = h_out_data + gate_offset;
-        c_out_data = c_out_data + gate_offset;
-      }
-    }
-  }
-
-  void BatchCompute(const framework::ExecutionContext& ctx) const {
-    INIT_BASE_DEFINES;
-    if (x->lod()[0].size() == 2) {
-      xx->Resize({x_dims[0], D4});
-      SeqCompute(ctx);
-      return;
-    }
-    INIT_OTHER_DEFINES;
-
-    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
-    auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
-    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
-    auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
-    auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
-    T* xx_data = xx->mutable_data<T>(place);
-    T* batched_input_data = batched_input->mutable_data<T>(place);
-    T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
-    T* batched_h_out_data = batched_h_out->mutable_data<T>(place);
-    hidden_out->mutable_data<T>(place);
-    cell_out->mutable_data<T>(place);
-
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    math::FCFunctor<DeviceContext, T> fc;
-    if (M > D4) {
-      fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data<T>());
-      to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
-    } else {
-      to_batch(dev_ctx, *x, xx, true, is_reverse);
-      batched_input->set_lod(xx->lod());
-      fc(dev_ctx, x_dims[0], D4, M, xx_data, wx_data, batched_input_data,
-         bias->data<T>());
-    }
-
-    auto batched_lod = batched_input->lod();
-    const auto& seq_order = batched_lod[2];
-    const int max_bs = seq_order.size();
-    reordered_h0->Resize({max_bs, D});
-    reordered_c0->Resize({max_bs, D});
-
-    int tstart = 0;
-    T* prev_h_data = nullptr;
-    T* prev_c_data = nullptr;
-    if (h0) {
-      // reorder h0, c0
-      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
-      T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
-      const T* h0_data = h0->data<T>();
-      const T* c0_data = c0->data<T>();
-      prev_h_data = reordered_h0_data;
-      prev_c_data = reordered_c0_data;
-      size_t sz = sizeof(T) * D;
-      for (int i = 0; i < max_bs; ++i) {
-        blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data);
-        blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data);
-        reordered_h0_data += D;
-        reordered_c0_data += D;
-      }
-    } else {
-      // compute without h0, c0
-      T* cur_in_data = batched_input_data;
-      T* cur_h_out_data = batched_h_out_data;
-      T* cur_c_out_data = batched_c_out_data;
-      for (int i = 0; i < max_bs; ++i) {
-        one_step.gates = cur_in_data;
-        one_step.ct = cur_c_out_data;
-        one_step.ht = cur_h_out_data;
-        ComputeC1H1(&one_step, &attr);
-
-        cur_in_data += D4;
-        cur_c_out_data += D;
-        cur_h_out_data += D;
-      }
-      tstart = 1;
-      prev_h_data = batched_h_out_data;
-      prev_c_data = batched_c_out_data;
-    }
-
-    // compute kernel part
-    const auto& batch_starts = batched_lod[0];
-    const int max_seq_len = batch_starts.size() - 1;
-    const int offset = tstart * max_bs * D;
-    batched_input_data = batched_input_data + offset * 4;
-    batched_h_out_data = batched_h_out_data + offset;
-    batched_c_out_data = batched_c_out_data + offset;
-    for (int step = tstart; step < max_seq_len; ++step) {
-      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-      GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-      T* cur_in_data = batched_input_data;
-      T* cur_prev_c_data = prev_c_data;
-      T* cur_c_out_data = batched_c_out_data;
-      T* cur_h_out_data = batched_h_out_data;
-      for (int i = 0; i < cur_bs; ++i) {
-        one_step.gates = cur_in_data;
-        one_step.ct_1 = cur_prev_c_data;
-        one_step.ct = cur_c_out_data;
-        one_step.ht = cur_h_out_data;
-        ComputeCtHt(&one_step, &attr);
-
-        // move one batch
-        cur_in_data += D4;
-        cur_prev_c_data += D;
-        cur_c_out_data += D;
-        cur_h_out_data += D;
-      }
-      // move one step
-      prev_c_data = batched_c_out_data;
-      prev_h_data = batched_h_out_data;
-      batched_c_out_data = cur_c_out_data;
-      batched_h_out_data = cur_h_out_data;
-      batched_input_data = cur_in_data;
-    }
-
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batched_h_out->set_lod(batched_lod);
-    to_seq(dev_ctx, *batched_h_out, hidden_out);
-    batched_c_out->set_lod(batched_lod);
-    to_seq(dev_ctx, *batched_c_out, cell_out);
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    if (ctx.Attr<bool>("use_seq")) {
-      SeqCompute(ctx);
-    } else {
-      BatchCompute(ctx);
-    }
-  }
-
-#undef GEMM_WH_ADDON
-#undef INIT_OTHER_DEFINES
-#undef INIT_BASE_DEFINES
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel<float>,
-                       ops::FuisonLSTMKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.h b/paddle/fluid/operators/fused/fusion_lstm_op.h
deleted file mode 100644
index 7f79601602348ac454fc6c0cefcba0643ad8e6e2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_lstm_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class FusionLSTMOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
deleted file mode 100644
index 4c11482f5077eeeb2d446dc0cbe9c08f890f390f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/jit/kernels.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionRepeatedFCReluOp::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "Input(X) of FusionRepeatedFCReluOp should not be null.");
-  auto sz = ctx->Inputs("W").size();
-  PADDLE_ENFORCE_GT(
-      sz, 1UL, "Inputs(W) of FusionRepeatedFCReluOp should larger than 1.");
-  PADDLE_ENFORCE_EQ(ctx->Inputs("Bias").size(), sz,
-                    "Size of inputs(Bias) of FusionRepeatedFCReluOp should be "
-                    "equal to inputs size.");
-  PADDLE_ENFORCE_EQ(ctx->Outputs("ReluOut").size(), sz - 1,
-                    "Size of output(ReluOut) of FusionRepeatedFCReluOp should "
-                    "be equal to inputs size -1.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "Output(Out) of FusionRepeatedFCReluOp should not be null.");
-
-  auto i_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(i_dims.size(), 2, "Input shape size should be 2");
-
-  auto w_dims = ctx->GetInputsDim("W");
-  auto b_dims = ctx->GetInputsDim("Bias");
-  PADDLE_ENFORCE_EQ(w_dims.size(), b_dims.size(),
-                    "Shape size of weight and bias should be equal");
-  PADDLE_ENFORCE_EQ(w_dims.size(), sz,
-                    "Shape size of weight and bias should be equal");
-  PADDLE_ENFORCE_EQ(i_dims[1], w_dims[0][0],
-                    "inpute width should be equal with weight height");
-
-  for (size_t i = 1; i < sz; ++i) {
-    PADDLE_ENFORCE_EQ(w_dims[i].size(), 2,
-                      "Every weight shape size should be 2.");
-    PADDLE_ENFORCE_EQ(framework::product(b_dims[i]), w_dims[i][1],
-                      "The length of Bias must be equal with w_dims[1].");
-  }
-  ctx->SetOutputDim("Out", {i_dims[0], w_dims[sz - 1][1]});
-  ctx->ShareLoD("X", /*->*/ "Out");
-}
-
-framework::OpKernelType FusionRepeatedFCReluOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(framework::GetDataTypeOfVar(ctx.InputVar("X")),
-                                 ctx.GetPlace());
-}
-
-void FusionRepeatedFCReluOpMaker::Make() {
-  AddInput("X", "(LoDTensor) Input tensors of this operator.");
-  AddInput("W", "(Tensor) The weight tensors of this operator.").AsDuplicable();
-  AddInput("Bias", "(Tensor) The bias tensors of this operator.")
-      .AsDuplicable();
-  AddOutput("ReluOut", "(Tensor) The output tensor of each relu operator.")
-      .AsDuplicable()
-      .AsIntermediate();
-  AddOutput("Out", "(LoDTensor) Output tensor of this operator.");
-  AddComment(R"DOC(
-  Fusion Repeated FC with Relu Operator.
-)DOC");
-}
-
-template <typename T>
-static void fc_relu(const T* x, const T* w, const T* b, T* y,
-                    const jit::matmul_attr_t& attr) {
-  auto matmul =
-      jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
-          attr);
-  auto addbias_relu =
-      jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache().At(
-          attr.n);
-  matmul(x, w, y, &attr);
-  T* dst = y;
-  for (int i = 0; i < attr.m; ++i) {
-    addbias_relu(b, dst, dst, attr.n);
-    dst += attr.n;
-  }
-}
-
-template <typename T>
-class FusionRepeatedFCReluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto in = ctx.Input<Tensor>("X");
-    auto weights = ctx.MultiInput<Tensor>("W");
-    auto biases = ctx.MultiInput<Tensor>("Bias");
-    auto relus = ctx.MultiOutput<Tensor>("ReluOut");
-    auto* out = ctx.Output<Tensor>("Out");
-    auto place = ctx.GetPlace();
-    int weight_sz = static_cast<int>(weights.size());
-
-    auto i_dims = in->dims();
-    auto w_dims = weights[0]->dims();
-    jit::matmul_attr_t attr;
-    attr.m = i_dims[0];
-    attr.n = w_dims[1];
-    attr.k = w_dims[0];
-    relus[0]->Resize({attr.m, attr.n});
-    fc_relu(in->data<T>(), weights[0]->data<T>(), biases[0]->data<T>(),
-            relus[0]->mutable_data<T>(place), attr);
-
-    for (int i = 1; i < weight_sz - 1; ++i) {
-      auto i_dims = relus[i - 1]->dims();
-      auto w_dims = weights[i]->dims();
-      attr.m = i_dims[0];
-      attr.n = w_dims[1];
-      attr.k = w_dims[0];
-      relus[i]->Resize({attr.m, attr.n});
-      fc_relu(relus[i - 1]->data<T>(), weights[i]->data<T>(),
-              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), attr);
-    }
-
-    auto i_dims_last = relus[weight_sz - 2]->dims();
-    auto w_dims_last = weights[weight_sz - 1]->dims();
-    attr.m = i_dims_last[0];
-    attr.n = w_dims_last[1];
-    attr.k = w_dims_last[0];
-    fc_relu(relus[weight_sz - 2]->data<T>(), weights[weight_sz - 1]->data<T>(),
-            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place),
-            attr);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_repeated_fc_relu, ops::FusionRepeatedFCReluOp,
-                  ops::FusionRepeatedFCReluOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fusion_repeated_fc_relu,
-                       ops::FusionRepeatedFCReluKernel<float>,
-                       ops::FusionRepeatedFCReluKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h
deleted file mode 100644
index cdcaf8b4833464100ed579a5962c60013edecdb0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class FusionRepeatedFCReluOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionRepeatedFCReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
deleted file mode 100644
index 519670cc6a7b73b679645e5ee6d98b74613cdacc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h"
-#include <algorithm>  // for min, max
-#include <string>
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/fc.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionSeqConvEltAddReluOp::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "Input(X) of FusionSeqConvEltAddReluOp should not be null.");
-  PADDLE_ENFORCE(
-      ctx->HasInput("Filter"),
-      "Input(Filter) of FusionSeqConvEltAddReluOp should not be null.");
-  PADDLE_ENFORCE(
-      ctx->HasInput("Bias"),
-      "Input(Bias) of FusionSeqConvEltAddReluOp should not be null.");
-  PADDLE_ENFORCE(
-      ctx->HasOutput("Out"),
-      "Output(Out) of FusionSeqConvEltAddReluOp should not be null.");
-  PADDLE_ENFORCE(
-      ctx->HasOutput("ColMat"),
-      "Output(ColMat) of FusionSeqConvEltAddReluOp should not be null.");
-
-  auto x_dims = ctx->GetInputDim("X");
-  auto w_dims = ctx->GetInputDim("Filter");
-  int context_length = ctx->Attrs().Get<int>("contextLength");
-  PADDLE_ENFORCE(
-      ctx->Attrs().Get<int>("contextStride") == 1,
-      "Currently, FusionSeqConvEltAddReluOp only supports contextStride=1.");
-  PADDLE_ENFORCE(x_dims.size() == 2 && w_dims.size() == 2,
-                 "Input(X, Filter) should be 2-D tensor.");
-  PADDLE_ENFORCE(x_dims.size() == 2 && w_dims.size() == 2,
-                 "Input(X, Filter) should be 2-D tensor.");
-  PADDLE_ENFORCE(w_dims[0] == context_length * x_dims[1],
-                 "Filter's height should be context_length * "
-                 "input_hidden_size .");
-  PADDLE_ENFORCE_GT(context_length + ctx->Attrs().Get<int>("contextStart"), 0,
-                    "contextStart size should be smaller than contextLength.");
-
-  ctx->SetOutputDim("Out", {x_dims[0], w_dims[1]});
-  ctx->SetOutputDim("ColMat", {x_dims[0], w_dims[0]});
-  ctx->ShareLoD("X", "Out");
-}
-
-framework::OpKernelType FusionSeqConvEltAddReluOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                 ctx.device_context());
-}
-
-void FusionSeqConvEltAddReluOpMaker::Make() {
-  AddInput("X",
-           "(LoDTensor) the input is a LodTensor, which support "
-           "variable-time length input sequence. The underlying tensor in "
-           "this LoDTensor is a matrix with shape (T X M), where T is the "
-           "total time steps in this mini-batch, M is the dim size of x.");
-  // PaddingData only support false yet, should be ensured at pass.
-  AddInput("Filter",
-           "(Tensor) same as the input(Filter) of sequence conv op is an "
-           "learnable parameter."
-           "This is a tensor with shape (K, N), where K is the "
-           "context_length * dim size of x, N is the output feature size.");
-  AddInput("Bias",
-           "(Tensor) the learnable weights. shape (1, N), where N is the "
-           "output feature size");
-  AddOutput(
-      "Out",
-      "(LoDTensor) the output(Out) is a LodTensor, which support "
-      "variable-time length output sequence. The underlying tensor in "
-      "this LoDTensor is a matrix with shape (T, N), where, T is the "
-      "total time steps in this mini-batch, N is the output feature size.");
-  AddOutput("ColMat",
-            "(Tensor) (T, K), where T is where T is the "
-            "total time steps in this mini-batch, K is height of Filter")
-      .AsIntermediate();
-  AddAttr<int>("contextLength",
-               "(int) the contextLength of FusionSeqConvEltAddReluOp is the "
-               "height of the convolution kernel.")
-      .GreaterThan(0);
-  AddAttr<int>("contextStart",
-               "(int, default:0) the contextStart of FusionSeqConvEltAddReluOp "
-               "represents the beginning of the convolution of the number of "
-               "rows of sequence, which can be negative. The negative number "
-               "means to pad contextStart time-steps of zeros or learnable "
-               "parameters at the beginning of each instance. The positive "
-               "number means to skip contextStart time-steps of each "
-               "instance.")
-      .SetDefault(0);
-  AddAttr<int>(
-      "contextStride",
-      "(int, default:1) the contextStride of FusionSeqConvEltAddReluOp "
-      "represents the stride length of convolution kernel. "
-      "Currently, FusionSeqConvEltAddReluOp only supports"
-      "contextStride=1.")
-      .SetDefault(1)
-      .GreaterThan(0);
-  AddComment(R"DOC(
-Fusion Sequence Conv and ElementwiseAdd Operator.
-)DOC");
-}
-
-template <typename T>
-class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* w = ctx.Input<Tensor>("Filter");
-    auto* b = ctx.Input<Tensor>("Bias");
-    auto* y = ctx.Output<LoDTensor>("Out");
-    auto* col = ctx.Output<Tensor>("ColMat");
-
-    auto x_lod = x->lod();
-    auto x_dims = x->dims();
-    auto w_dims = w->dims();
-    PADDLE_ENFORCE_EQ(b->numel(), w_dims[1],
-                      "bias size should be equal to output feature size.");
-    PADDLE_ENFORCE_EQ(x_lod.size(), 1UL,
-                      "Only support one level sequence now.");
-
-    const T* x_data = x->data<T>();
-    const T* w_data = w->data<T>();
-    const T* b_data = b->data<T>();
-    T* y_data = y->mutable_data<T>(ctx.GetPlace());
-    T* col_data = col->mutable_data<T>(ctx.GetPlace());
-
-    int context_start = ctx.Attr<int>("contextStart");
-    int context_length = ctx.Attr<int>("contextLength");
-    int up_pad = std::max(0, -context_start);
-    int down_pad = std::max(0, context_start + context_length - 1);
-    // im2col
-    int src_mat_w = static_cast<int>(x_dims[1]);
-    int src_mat_w_sz = src_mat_w * sizeof(T);
-    int col_mat_w = static_cast<int>(w_dims[0]);
-    int col_mat_w_sz = col_mat_w * sizeof(T);
-    for (int i = 0; i < static_cast<int>(x_lod[0].size()) - 1; ++i) {
-      int st = x_lod[0][i];
-      int ed = x_lod[0][i + 1];
-      const T* src_data = x_data + st * src_mat_w;
-      T* dst_data = col_data + st * col_mat_w;
-      int seq_len = ed - st;
-      if (seq_len > up_pad + down_pad) {
-        // zero all up_pad and fill data
-        std::memset(dst_data, 0, up_pad * col_mat_w_sz);
-        dst_data = dst_data + up_pad * src_mat_w;
-        int copy_size = col_mat_w_sz - up_pad * src_mat_w_sz;
-        for (int j = 0; j < up_pad; ++j) {
-          // blas.VCOPY?
-          std::memcpy(dst_data, src_data, copy_size);
-          dst_data += (col_mat_w - src_mat_w);
-          copy_size += src_mat_w_sz;
-        }
-        // fill data
-        for (int j = 0; j < seq_len - up_pad - down_pad; ++j) {
-          std::memcpy(dst_data, src_data, copy_size);
-          dst_data += col_mat_w;
-          src_data += src_mat_w;
-        }
-        // zero all down_pad and fill data
-        std::memset(dst_data, 0, down_pad * col_mat_w_sz);
-        copy_size -= src_mat_w_sz;
-        for (int j = 0; j < down_pad; ++j) {
-          std::memcpy(dst_data, src_data, copy_size);
-          dst_data += col_mat_w;
-          src_data += src_mat_w;
-          copy_size -= src_mat_w_sz;
-        }
-      } else {
-        PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1);
-        std::memset(dst_data, 0, seq_len * col_mat_w_sz);
-        dst_data = dst_data + up_pad * src_mat_w;
-        int zero_sz = up_pad * src_mat_w_sz;
-        int cur_src_sz = seq_len * src_mat_w_sz;
-        for (int j = 0; j < std::min(up_pad, seq_len); ++j) {
-          int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
-          std::memcpy(dst_data, src_data, copy_size);
-          dst_data += (col_mat_w - src_mat_w);
-          zero_sz -= src_mat_w_sz;
-        }
-        // from bottom
-        dst_data = col_data + ed * col_mat_w;
-        src_data = x_data + st * src_mat_w;
-        zero_sz = down_pad * src_mat_w_sz;
-        for (int j = 1; j <= std::min(down_pad, seq_len); ++j) {
-          int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
-          std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T),
-                      src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w,
-                      copy_size);
-          dst_data -= col_mat_w;
-          zero_sz -= src_mat_w_sz;
-        }
-      }
-    }
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    math::FCFunctor<DeviceContext, T> fc;
-    fc(dev_ctx, x_dims[0], w_dims[1], w_dims[0], col_data, w_data, y_data,
-       b_data, true);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_seqconv_eltadd_relu, ops::FusionSeqConvEltAddReluOp,
-                  ops::FusionSeqConvEltAddReluOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fusion_seqconv_eltadd_relu,
-                       ops::FusionSeqConvEltAddReluKernel<float>,
-                       ops::FusionSeqConvEltAddReluKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h
deleted file mode 100644
index 028d79dc2a1ee8d789fe4b8724b320442041a71b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class FusionSeqConvEltAddReluOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionSeqConvEltAddReluOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
deleted file mode 100644
index 95a08d3b0f030e7dae6668a788b52cfe66daa250..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h"
-#include <string>
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/operators/math/fc.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionSeqExpandConcatFCOp::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_GT(
-      ctx->Inputs("X").size(), 1UL,
-      "Inputs(X) of FusionSeqExpandConcatFCOp should larger than 1.");
-  PADDLE_ENFORCE(
-      ctx->HasInput("FCWeight"),
-      "Input(FCWeight) of FusionSeqExpandConcatFCOp should not be null.");
-  PADDLE_ENFORCE(
-      ctx->HasOutput("Out"),
-      "Output(Out) of FusionSeqExpandConcatFCOp should not be null.");
-  PADDLE_ENFORCE(
-      ctx->HasOutput("FCOut"),
-      "Output(FCOut) of FusionSeqExpandConcatFCOp should not be null.");
-
-  auto ins_dims = ctx->GetInputsDim("X");
-  auto w_dims = ctx->GetInputDim("FCWeight");  // (M0+M1+M2+..) x D
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(FCWeight)'s rank must be 2.");
-  const int D = w_dims[1];
-  int sum = ins_dims[0][1];
-  for (size_t i = 1; i < ins_dims.size(); ++i) {
-    sum += ins_dims[i][1];
-  }
-  PADDLE_ENFORCE_EQ(sum, w_dims[0],
-                    "FC height should be sum of all inputs width.");
-  if (ctx->HasInput("FCBias")) {
-    auto b_dims = ctx->GetInputDim("FCBias");
-    PADDLE_ENFORCE(b_dims.size() == 1 || b_dims.size() == 2,
-                   "b_dims should be 1 or 2, get %d", b_dims.size());
-    if (b_dims.size() == 1) {
-      PADDLE_ENFORCE_EQ(b_dims[0], D, "FCBias shapes must be %d.", D);
-    } else {
-      PADDLE_ENFORCE_EQ(b_dims[0], 1, "FCBias shapes must be 1x%d.", D);
-      PADDLE_ENFORCE_EQ(b_dims[1], D, "FCBias shapes must be 1x%d.", D);
-    }
-  }
-
-  ctx->SetOutputDim("Out", {ins_dims[0][0], D});
-  // fcout should be reshape when run since can not get lod in infershape
-  // explicit share the ref lod
-  ctx->ShareLoD("X", "Out", 0);
-}
-
-framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(ctx.MultiInput<LoDTensor>("X")[0]->type(),
-                                 ctx.device_context());
-}
-
-void FusionSeqExpandConcatFCOpMaker::Make() {
-  AddInput("X",
-           "(LoDTensor) input LodDTensors, the first one must be have ref lod "
-           "for sequence expand, and the rest input should have same lod.")
-      .AsDuplicable();
-  AddInput("FCWeight", "(Tensor) the weights of fc.");
-  AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable();
-  AddOutput("Out", "(LoDTensor) Output LodTensor.");
-  AddOutput(
-      "FCOut",
-      "(Tensor) the intermediate tensor to keep the result of fc."
-      "Shape is (N x D), where N is the batch size, D is the output dim of fc")
-      .AsIntermediate();
-  AddAttr<std::string>("fc_activation",
-                       "(string, default: identity)"
-                       "The activation for the result of fc."
-                       "`identity` by default.")
-      .SetDefault("identity")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddComment(R"DOC(
-Fusion Sequence expand + concat + fc Operator.
-
-All below conditions should be meet:
-
-The ref_level of seq_expand should be 0.
-
-The ref lod of seq_expand level is the first input of concat.
-
-The other inputs should have same lod and same batch size of ref lod.
-
-The seq len of other inputs should be 1.
-
-The concat axis should be 1.
-
-)DOC");
-}
-
-template <typename T>
-class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-    auto ins = ctx.MultiInput<LoDTensor>("X");
-    auto* w = ctx.Input<Tensor>("FCWeight");
-    auto* b = ctx.Input<Tensor>("FCBias");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    auto* fc_out = ctx.Output<Tensor>("FCOut");
-
-    auto* ref_in = ins[0];
-    auto ref_lod = ref_in->lod();
-    auto in1_lod = ins[1]->lod();
-    auto ref_dims = ref_in->dims();  // T x M0
-    auto in1_dims = ins[1]->dims();  // N x M1
-    auto w_dims = w->dims();
-    const int N = ref_lod[0].size() - 1;
-    const int total_T = ref_dims[0];
-    const int M0 = ref_dims[1];
-    const int M1 = in1_dims[1];
-    const int D = w_dims[1];
-
-    // some check and fcout should be reshape here
-    // since infershape can not get lod info
-    PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1.");
-    PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1.");
-    PADDLE_ENFORCE_EQ(static_cast<int>(in1_lod[0].size() - 1), N,
-                      "Batch size of all inputs should be equal.");
-    PADDLE_ENFORCE_EQ(static_cast<int>(in1_lod[0][N]), N,
-                      "Seq_length of other inputs should be 1.");
-    PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size.");
-    for (size_t i = 2; i < ins.size(); ++i) {
-      PADDLE_ENFORCE_EQ(ins[i]->dims()[0], N,
-                        "All other inputs height should be equal");
-      PADDLE_ENFORCE_EQ(ins[i]->lod(), in1_lod,
-                        "All other inputs should have same lod");
-    }
-    fc_out->Resize({N, D});
-
-    std::function<void(const int, const T*, T*)> fc_act;
-    auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
-    if (platform::MayIUse(platform::avx)) {
-      math::VecActivations<T, platform::avx> act_functor;
-      fc_act = act_functor(fc_act_str);
-    } else {
-      math::VecActivations<T, platform::isa_any> act_functor;
-      fc_act = act_functor(fc_act_str);
-    }
-
-    const T* ref_in_data = ref_in->data<T>();
-    const T* in1_data = ins[1]->data<T>();
-    const T* w_data = w->data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-    T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
-
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    math::FCFunctor<DeviceContext, T> fc;
-    fc(dev_ctx, total_T, D, M0, ref_in_data, w_data, out_data,
-       b ? b->data<T>() : NULL);
-    w_data = w_data + M0 * D;
-    // first write on
-    blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data);
-    w_data = w_data + M1 * D;
-    for (size_t i = 2; i < ins.size(); ++i) {
-      // add on
-      const T* in_data = ins[i]->data<T>();
-      const int K = ins[i]->dims()[1];
-      blas.GEMM(CblasNoTrans, CblasNoTrans, N, D, K, static_cast<T>(1), in_data,
-                K, w_data, D, static_cast<T>(1), fc_out_data, D);
-      w_data = w_data + K * D;
-    }
-    T* cur_out_data = out_data;
-    for (int i = 0; i < N; ++i) {
-      int seq_len = ref_lod[0][i + 1] - ref_lod[0][i];
-      T* src = fc_out_data + i * D;
-      for (int step = 0; step < seq_len; ++step) {
-        blas.VADD(D, cur_out_data, src, cur_out_data);
-        cur_out_data = cur_out_data + D;
-      }
-    }
-    fc_act(total_T * D, out_data, out_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp,
-                  ops::FusionSeqExpandConcatFCOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc,
-                       ops::FusionSeqExpandConcatFCOpKernel<float>,
-                       ops::FusionSeqExpandConcatFCOpKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h
deleted file mode 100644
index f78e820f603354944bd7fc23aff2d1d72e5ba750..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionSeqExpandConcatFCOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
deleted file mode 100644
index b14ee88aa53b64791fa09c848e23d4f01826e339..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_seqpool_concat_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/jit/kernels.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionSeqPoolConcatOp::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                    "Inputs(X) of FusionSeqPoolConcatOp should not be empty.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "Output(Out) of FusionSeqPoolConcatOp should not be null.");
-  int axis = ctx->Attrs().Get<int>("axis");
-  PADDLE_ENFORCE_EQ(axis, 1,
-                    "FusionSeqPoolConcatOp only supports concat axis=1 yet.");
-
-  auto ins_dims = ctx->GetInputsDim("X");
-  const size_t n = ins_dims.size();
-  PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0.");
-  if (n == 1) {
-    LOG(WARNING) << "Only have one input, may waste memory";
-  }
-
-  // The output height should be confirmed in Compute,
-  // since input lod is not accessible here.
-  PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
-                    "The dims size of first input should be 2.");
-  ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
-}
-
-framework::OpKernelType FusionSeqPoolConcatOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]), ctx.GetPlace());
-}
-
-void FusionSeqPoolConcatOpMaker::Make() {
-  AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable();
-  AddOutput("Out", "(LoDTensor) Output tensor of concat operator.");
-  AddAttr<std::string>("pooltype",
-                       "(string, default 'SUM') some of the pooling "
-                       "pooltype of SequencePoolOp.")
-      .SetDefault("SUM")
-      .InEnum({"AVERAGE", "SUM", "SQRT"});
-  AddAttr<int>("axis",
-               "The axis along which the input tensors will be concatenated. "
-               "Only supports concat axis=1 yet.")
-      .SetDefault(1);
-  AddComment(R"DOC(
-Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator.
-)DOC");
-}
-
-template <typename T>
-class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto ins = ctx.MultiInput<LoDTensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    std::string pooltype = ctx.Attr<std::string>("pooltype");
-    auto x0_lod = ins[0]->lod();
-    auto x0_dims = ins[0]->dims();
-    auto y_dims = out->dims();
-    size_t bs = x0_lod[0].size() - 1;
-    out->Resize({static_cast<int64_t>(bs), y_dims[1]});
-    framework::LoD y_lod(1);
-    y_lod[0].resize(bs + 1);
-    for (size_t i = 0; i <= bs; ++i) {
-      y_lod[0][i] = i;
-    }
-    out->set_lod(y_lod);
-    auto place = ctx.GetPlace();
-    T* y_data = out->mutable_data<T>(place);
-
-    int w = ins[0]->numel() / x0_dims[0];
-    PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
-                      "The output of dims[1] should be dividable of w");
-    jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum);
-    if (pooltype == "AVERAGE") {
-      attr.type = jit::SeqPoolType::kAvg;
-    } else if (pooltype == "SQRT") {
-      attr.type = jit::SeqPoolType::kSqrt;
-    }
-    auto seqpool =
-        jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(
-            attr);
-    size_t n = ins.size();
-    size_t dst_step_size = n * w;
-    for (size_t i = 0; i < n; ++i) {
-      auto x_dims = ins[i]->dims();
-      auto x_lod = ins[i]->lod()[0];
-      const T* src = ins[i]->data<T>();
-      T* dst = y_data + i * w;
-      PADDLE_ENFORCE_EQ(static_cast<int>(ins[i]->numel() / x_dims[0]), w,
-                        "Width of all inputs should be equal.");
-      PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1,
-                        "Batchsize of all inputs should be equal.");
-      for (size_t j = 0; j < bs; ++j) {
-        attr.h = static_cast<int>(x_lod[j + 1] - x_lod[j]);
-        seqpool(src, dst, &attr);
-        dst += dst_step_size;
-        src += attr.h * attr.w;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_seqpool_concat, ops::FusionSeqPoolConcatOp,
-                  ops::FusionSeqPoolConcatOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fusion_seqpool_concat,
-                       ops::FusionSeqPoolConcatKernel<float>,
-                       ops::FusionSeqPoolConcatKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h
deleted file mode 100644
index 9f882a59d351cdb360203f3212543bfca295fc65..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class FusionSeqPoolConcatOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionSeqPoolConcatOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
deleted file mode 100644
index 14e327bb37d1381affe0189ce220fe13c63eac99..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/jit/kernels.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionSeqPoolCVMConcatOp::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_GE(
-      ctx->Inputs("X").size(), 1UL,
-      "Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "Output(Out) of FusionSeqPoolCVMConcatOp should not be null.");
-  int axis = ctx->Attrs().Get<int>("axis");
-  PADDLE_ENFORCE_EQ(
-      axis, 1, "FusionSeqPoolCVMConcatOp only supports concat axis=1 yet.");
-  bool use_cvm = ctx->Attrs().Get<bool>("use_cvm");
-  PADDLE_ENFORCE_EQ(
-      use_cvm, true,
-      "FusionSeqPoolCVMConcatOp only supports use_cvm is true yet.");
-
-  auto ins_dims = ctx->GetInputsDim("X");
-  const size_t n = ins_dims.size();
-  PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0.");
-  if (n == 1) {
-    LOG(WARNING) << "Only have one input, may waste memory";
-  }
-
-  // The output height should be confirmed in Compute,
-  // since input lod is not accessible here.
-  PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
-                    "The dims size of first input should be 2.");
-  ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
-}
-
-framework::OpKernelType FusionSeqPoolCVMConcatOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]), ctx.GetPlace());
-}
-
-void FusionSeqPoolCVMConcatOpMaker::Make() {
-  AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable();
-  AddInput("CVM",
-           "(Tensor),  a 2-D Tensor with shape [N x 2], where N is the batch "
-           "size, 2 is show and click.");
-  AddOutput("Out", "(LoDTensor) Output tensor of concat operator.");
-  AddAttr<std::string>("pooltype",
-                       "(string, default 'SUM') some of the pooling "
-                       "pooltype of SequencePoolOp.")
-      .SetDefault("SUM")
-      .InEnum({"AVERAGE", "SUM", "SQRT"});
-  AddAttr<bool>("use_cvm", "bool, use cvm or not").SetDefault(true);
-  AddAttr<int>("axis",
-               "The axis along which the input tensors will be concatenated. "
-               "Only supports concat axis=1 yet.")
-      .SetDefault(1);
-  AddComment(R"DOC(
-Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator.
-)DOC");
-}
-
-template <typename T>
-class FusionSeqPoolCVMConcatKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto ins = ctx.MultiInput<LoDTensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    std::string pooltype = ctx.Attr<std::string>("pooltype");
-    auto x0_lod = ins[0]->lod();
-    auto x0_dims = ins[0]->dims();
-    auto y_dims = out->dims();
-    size_t bs = x0_lod[0].size() - 1;
-    out->Resize({static_cast<int64_t>(bs), y_dims[1]});
-    framework::LoD y_lod(1);
-    y_lod[0].resize(bs + 1);
-    for (size_t i = 0; i <= bs; ++i) {
-      y_lod[0][i] = i;
-    }
-    out->set_lod(y_lod);
-    auto place = ctx.GetPlace();
-    T* y_data = out->mutable_data<T>(place);
-
-    int w = ins[0]->numel() / x0_dims[0];
-    PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
-                      "The output of dims[1] should be dividable of w");
-    jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum);
-    if (pooltype == "AVERAGE") {
-      attr.type = jit::SeqPoolType::kAvg;
-    } else if (pooltype == "SQRT") {
-      attr.type = jit::SeqPoolType::kSqrt;
-    }
-    auto seqpool =
-        jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(
-            attr);
-    size_t n = ins.size();
-    size_t dst_step_size = n * w;
-    for (size_t i = 0; i < n; ++i) {
-      auto x_dims = ins[i]->dims();
-      auto x_lod = ins[i]->lod()[0];
-      const T* src = ins[i]->data<T>();
-      T* dst = y_data + i * w;
-      PADDLE_ENFORCE_EQ(static_cast<int>(ins[i]->numel() / x_dims[0]), w,
-                        "Width of all inputs should be equal.");
-      PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1,
-                        "Batchsize of all inputs should be equal.");
-      for (size_t j = 0; j < bs; ++j) {
-        attr.h = static_cast<int>(x_lod[j + 1] - x_lod[j]);
-        seqpool(src, dst, &attr);
-
-        // Currently only use_cvm is true.
-        dst[0] = log(dst[0] + 1);
-        dst[1] = log(dst[1] + 1) - dst[0];
-
-        dst += dst_step_size;
-        src += attr.h * attr.w;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_seqpool_cvm_concat, ops::FusionSeqPoolCVMConcatOp,
-                  ops::FusionSeqPoolCVMConcatOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fusion_seqpool_cvm_concat,
-                       ops::FusionSeqPoolCVMConcatKernel<float>,
-                       ops::FusionSeqPoolCVMConcatKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h
deleted file mode 100644
index 75e8556c31a819572b1e73464f6dba235642ddcd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class FusionSeqPoolCVMConcatOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionSeqPoolCVMConcatOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
deleted file mode 100644
index 2d10056044efa851898c8cf597fa14e495305fce..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/jit/kernels.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionSquaredMatSubOp::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "Input(X) of FusionSquaredMatSubOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Y"),
-                 "Input(Y) of FusionSquaredMatSubOp should not be null.");
-  PADDLE_ENFORCE(
-      ctx->HasOutput("SquaredX"),
-      "Output(SquaredX) of FusionSquaredMatSubOp should not be null.");
-  PADDLE_ENFORCE(
-      ctx->HasOutput("SquaredY"),
-      "Output(SquaredY) of FusionSquaredMatSubOp should not be null.");
-  PADDLE_ENFORCE(
-      ctx->HasOutput("SquaredXY"),
-      "Output(SquaredXY) of FusionSquaredMatSubOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "Output(Out) of FusionSquaredMatSubOp should not be null.");
-
-  auto x_dims = ctx->GetInputDim("X");
-  auto y_dims = ctx->GetInputDim("Y");
-  PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
-                    "Input tensors dims size should be equal.");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input tensors should be a Matrix.");
-  PADDLE_ENFORCE_EQ(x_dims[1], y_dims[0], "Inputs Matrix should be multiply.");
-
-  ctx->SetOutputDim("SquaredX", x_dims);
-  ctx->SetOutputDim("SquaredY", y_dims);
-  ctx->SetOutputDim("SquaredXY", {x_dims[0], y_dims[1]});
-  ctx->SetOutputDim("Out", {x_dims[0], y_dims[1]});
-}
-
-framework::OpKernelType FusionSquaredMatSubOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(framework::GetDataTypeOfVar(ctx.InputVar("X")),
-                                 ctx.GetPlace());
-}
-
-void FusionSquaredMatSubOpMaker::Make() {
-  AddInput("X", "(Tensor) Input Mat A of this operator.");
-  AddInput("Y", "(Tensor) Input Mat B of this operator.");
-  AddOutput("SquaredX", "(Tensor) Squared X.").AsIntermediate();
-  AddOutput("SquaredY", "(Tensor) Squared Y.").AsIntermediate();
-  AddOutput("SquaredXY", "(Tensor) Squared X*Y.").AsIntermediate();
-  AddOutput("Out", "(Tensor) Output tensor of concat operator.");
-  AddAttr<float>("scalar", "The scalar on output matrix.").SetDefault(1.f);
-  AddComment(R"DOC(
-    Fusion Squared Matrix and substrct operator.
-    
-    ( (X * Y).^2 - (X.^2 * Y.^2) ) .* scalar
-)DOC");
-}
-
-template <typename T>
-class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto x = ctx.Input<Tensor>("X");
-    auto y = ctx.Input<Tensor>("Y");
-    auto* squared_x = ctx.Output<Tensor>("SquaredX");
-    auto* squared_y = ctx.Output<Tensor>("SquaredY");
-    auto* squared_xy = ctx.Output<Tensor>("SquaredXY");
-    auto* out = ctx.Output<Tensor>("Out");
-    auto place = ctx.GetPlace();
-    T scalar = static_cast<T>(ctx.Attr<float>("scalar"));
-
-    auto x_dims = x->dims();
-    auto y_dims = y->dims();
-    jit::matmul_attr_t attr;
-    attr.m = x_dims[0];
-    attr.k = x_dims[1];
-    attr.n = y_dims[1];
-    int o_numel = attr.m * attr.n;
-
-    auto vsquare_x =
-        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
-            attr.m * attr.k);
-    auto vsquare_y =
-        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
-            attr.k * attr.n);
-    auto vsquare_xy =
-        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
-            o_numel);
-    auto vsub =
-        jit::KernelFuncs<jit::VSubTuple<T>, platform::CPUPlace>::Cache().At(
-            o_numel);
-    auto vscal =
-        jit::KernelFuncs<jit::VScalTuple<T>, platform::CPUPlace>::Cache().At(
-            o_numel);
-    auto matmul =
-        jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
-            attr);
-
-    const T* x_data = x->data<T>();
-    const T* y_data = y->data<T>();
-    T* squared_x_data = squared_x->mutable_data<T>(place);
-    T* squared_y_data = squared_y->mutable_data<T>(place);
-    T* squared_xy_data = squared_xy->mutable_data<T>(place);
-    T* o_data = out->mutable_data<T>(place);
-
-    matmul(x_data, y_data, squared_xy_data, &attr);
-    vsquare_xy(squared_xy_data, squared_xy_data, o_numel);
-
-    vsquare_x(x_data, squared_x_data, attr.m * attr.k);
-    vsquare_y(y_data, squared_y_data, attr.k * attr.n);
-    matmul(squared_x_data, squared_y_data, o_data, &attr);
-
-    vsub(squared_xy_data, o_data, o_data, o_numel);
-    vscal(&scalar, o_data, o_data, o_numel);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_squared_mat_sub, ops::FusionSquaredMatSubOp,
-                  ops::FusionSquaredMatSubOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fusion_squared_mat_sub,
-                       ops::FusionSquaredMatSubKernel<float>,
-                       ops::FusionSquaredMatSubKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h
deleted file mode 100644
index 0ab2c2bb10a15cc6d9a472142416bd363e65944f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-// ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar
-class FusionSquaredMatSubOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionSquaredMatSubOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
deleted file mode 100644
index 39356c9afccbf9af3eacf99a6bccb15e18f7e485..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                      "Inputs(X) of ConcatOp should be empty.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ConcatOp should not be null.");
-
-    auto ins = ctx->GetInputsDim("X");
-    const size_t n = ins.size();
-    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
-
-    std::vector<int> trans_axis =
-        ctx->Attrs().Get<std::vector<int>>("trans_axis");
-    int flatten_axis = ctx->Attrs().Get<int>("flatten_axis");
-    int concat_axis = ctx->Attrs().Get<int>("concat_axis");
-
-    size_t x_rank = ins[0].size();
-    size_t trans_axis_size = trans_axis.size();
-    PADDLE_ENFORCE_EQ(x_rank, trans_axis_size,
-                      "The input tensor's rank(%d) "
-                      "should be equal to the permutation axis's size(%d)",
-                      x_rank, trans_axis_size);
-
-    auto dims0 =
-        GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[0]));
-    std::vector<int> out_dims(dims0);
-    for (size_t i = 1; i < n; i++) {
-      auto dimsi =
-          GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[i]));
-      for (int j = 0; j < static_cast<int>(dims0.size()); j++) {
-        if (j == concat_axis) {
-          out_dims[concat_axis] += dimsi[j];
-        } else {
-          PADDLE_ENFORCE_EQ(out_dims[j], dimsi[j],
-                            "After flatting, the %d-th dim should be save "
-                            "except the specify axis.",
-                            j);
-        }
-      }
-    }
-    if (out_dims[concat_axis] < 0) {
-      out_dims[concat_axis] = -1;
-    }
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
-  }
-};
-
-class TransposeFlattenConcatFusionOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor, tensors with rank up to 6 are supported.")
-        .AsDuplicable();
-    AddOutput("Out", "(Tensor)The output tensor.");
-    AddAttr<std::vector<int>>(
-        "trans_axis",
-        "(vector<int>) A list of values, and the size of the list should be "
-        "the same with the input tensor rank. This operator permutes the input "
-        "tensor's axes according to the values given.");
-    AddAttr<int>("flatten_axis",
-                 "(int)"
-                 "Indicate up to which input dimensions (exclusive) should be"
-                 "flattened to the outer dimension of the output. The value"
-                 "for axis must be in the range [0, R], where R is the rank of"
-                 "the input tensor. When axis = 0, the shape of the output"
-                 "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the"
-                 "input tensor is (d_0, d_1, ... d_n).");
-    AddAttr<int>("concat_axis",
-                 "The axis along which the input tensors will be concatenated. "
-                 "It should be 0 or 1, since the tensor is 2D after flatting.");
-    AddComment(R"DOC(
-
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_transpose_flatten_concat,
-                  ops::TransposeFlattenConcatFusionOp,
-                  ops::TransposeFlattenConcatFusionOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
deleted file mode 100644
index 6ccb670d73c803bb1b9827f0f30b99d272bfce79..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
-
-template <typename T>
-class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    auto odims = out->dims();
-
-    std::vector<int> trans_axis = ctx.Attr<std::vector<int>>("trans_axis");
-    int flatten_axis = ctx.Attr<int>("flatten_axis");
-    int concat_axis = ctx.Attr<int>("concat_axis");
-
-    int rank = ins[0]->dims().size();
-    // use at least 4D in cudnnTransformTensor
-    int max_dim = rank < 4 ? 4 : rank;
-    std::vector<int> stride_x(max_dim, 0);
-    std::vector<int> stride_y(max_dim, 0);
-    std::vector<int> dims_y(max_dim, 0);
-
-    cudnnTensorDescriptor_t in_desc;
-    cudnnTensorDescriptor_t out_desc;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
-    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
-
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-
-    T* odata = out->data<T>();
-    for (size_t k = 0; k < ins.size(); ++k) {
-      auto perm_shape = GetPermuteShape(trans_axis, ins[k]->dims());
-      int osize = 1;
-      auto idims = ins[k]->dims();
-      for (int i = 0; i < rank; i++) {
-        stride_x[i] = 1;
-        for (int j = trans_axis[i] + 1; j < rank; j++) {
-          stride_x[i] *= idims[j];
-        }
-        dims_y[i] = perm_shape[i];
-        osize *= perm_shape[i];
-      }
-      stride_y[rank - 1] = 1;
-      for (int i = rank - 2; i >= 0; i--) {
-        if (((i + 1) == flatten_axis) && (concat_axis == 1)) {
-          stride_y[i] = odims[1];
-        } else {
-          stride_y[i] = stride_y[i + 1] * perm_shape[i + 1];
-        }
-      }
-
-      // Since concat is aftern flatten, the output is 2D tensor.
-      // If concat_axis is 0, each input's permutated tensor is continuous.
-      // If concat_axis is 1, the stride of 0-th dim of each input's
-      // permutated tensor is odims()[1].
-
-      for (int i = rank; i < max_dim; i++) {
-        stride_x[i] = 1;
-        stride_y[i] = 1;
-        dims_y[i] = 1;
-      }
-
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
-
-      CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
-          handle, CudnnDataType<T>::kOne(), in_desc,
-          static_cast<const void*>(ins[k]->data<T>()),
-          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
-      if (concat_axis == 0) {
-        odata += osize;
-      } else {
-        auto flat_shape = GetFlattenShape(flatten_axis, perm_shape);
-        odata += flat_shape[1];
-      }
-    }
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(fusion_transpose_flatten_concat,
-                        ops::TransposeFlattenConcatFusionKernel<float>,
-                        ops::TransposeFlattenConcatFusionKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
deleted file mode 100644
index 66d5bea679fc85ce6b1ba64921107aef987ccaa8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-inline std::vector<int32_t> GetPermuteShape(const std::vector<int>& axis,
-                                            const framework::DDim& in_dims) {
-  std::vector<int32_t> out_dims(in_dims.size());
-  for (size_t i = 0; i < axis.size(); i++) {
-    out_dims[i] = in_dims[axis[i]];
-  }
-  return out_dims;
-}
-
-inline std::vector<int32_t> GetFlattenShape(const int axis,
-                                            const std::vector<int>& in_dims) {
-  int64_t outer = 1, inner = 1;
-  for (int i = 0; i < static_cast<int>(in_dims.size()); ++i) {
-    if (i < axis) {
-      outer *= in_dims[i];
-    } else {
-      inner *= in_dims[i];
-    }
-  }
-  std::vector<int32_t> out_shape(2);
-  out_shape[0] = outer;
-  out_shape[1] = inner;
-  return out_shape;
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h
deleted file mode 100644
index b3264ec0ad3fa984726244d911dab6f7bd8e95b8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather.cu.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/dim.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using platform::DeviceContext;
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-template <typename T, typename IndexT = int>
-__global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
-                                 T* output, size_t index_size,
-                                 size_t slice_size) {
-  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
-    int indices_i = i / slice_size;
-    int slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT gather_i = indices[indices_i];
-    IndexT params_i = gather_i * slice_size + slice_i;
-    *(output + i) = *(params + params_i);
-  }
-}
-
-template <typename T, typename IndexT = int>
-__global__ void GatherNdCUDAKernel(const T* input, const int* input_dims,
-                                   const IndexT* indices, T* output,
-                                   size_t remain_size, size_t slice_size,
-                                   size_t end_size) {
-  CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
-    int indices_i = i / slice_size;
-    int slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT gather_i = 0;
-    int64_t temp = slice_size;
-    for (int64_t j = end_size - 1; j >= 0; --j) {
-      auto index_value = indices[indices_i * end_size + j];
-      assert(index_value >= 0 && index_value < input_dims[j]);
-      gather_i += (index_value * temp);
-      temp *= input_dims[j];
-    }
-    IndexT input_i = gather_i + slice_i;
-    *(output + i) = *(input + input_i);
-  }
-}
-
-/**
- * A thin wrapper on gpu tensor
- * Return a new tensor from source tensor, gathered according to index
- * input[src]: type-T source Tensor
- * input[index]: type-IndexT index Tensor (1-D)
- * return: output tensor
- */
-template <typename T, typename IndexT = int>
-void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
-               const Tensor& index, Tensor* output) {
-  // check index of shape 1-D
-  if (index.dims().size() == 1) {
-    PADDLE_ENFORCE_GT(index.dims()[0], 0,
-                      "The index of gather_op should not be empty when the "
-                      "index's rank is 1.");
-  } else if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(index.dims()[1], 1,
-                      " If the index's rank of gather_op is 2, the second "
-                      "dimension should be 1.");
-  }
-
-  int index_size = index.dims()[0];
-
-  auto src_dims = src.dims();
-  framework::DDim output_dims(src_dims);
-  output_dims[0] = index_size;
-
-  // slice size
-  int slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-
-  const T* p_src = src.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  int block = 512;
-  int n = slice_size * index_size;
-  int grid = (n + block - 1) / block;
-
-  GatherCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_src, p_index, p_output, index_size, slice_size);
-}
-
-template <typename DeviceContext, typename T, typename IndexT = int>
-void GPUGatherNd(const framework::ExecutionContext& context,
-                 const Tensor& input, const Tensor& index, Tensor* output) {
-  const auto& ctx = context.template device_context<DeviceContext>();
-  const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  auto cplace = platform::CPUPlace();
-
-  auto index_dims = index.dims();
-  auto index_dims_size = index_dims.size();
-  auto input_dims = input.dims();
-  auto input_dims_size = input_dims.size();
-
-  const T* p_input = input.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  // final dim
-  int64_t end_size = index_dims[index_dims_size - 1];
-  // remain dim
-  auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = framework::product(remain_ddim);
-  // slice size
-  int64_t slice_size = 1;
-  for (int64_t i = end_size; i < input_dims_size; ++i) {
-    slice_size *= input_dims[i];
-  }
-  // source dim
-  std::vector<int> v_input_dims(input_dims_size);
-  for (int i = 0; i < input_dims_size; ++i) {
-    v_input_dims[i] = static_cast<int>(input_dims[i]);
-  }
-
-  auto& dev_ctx = context.cuda_device_context();
-  int bytes = input_dims_size * sizeof(int);
-  auto p_input_dims = memory::Alloc(dev_ctx, bytes);
-  int* g_input_dims = reinterpret_cast<int*>(p_input_dims->ptr());
-  memory::Copy(gplace, g_input_dims, cplace, v_input_dims.data(), bytes,
-               ctx.stream());
-
-  int block = 512;
-  int n = slice_size * remain_numel;
-  int grid = (n + block - 1) / block;
-
-  GatherNdCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_input, g_input_dims, p_index, p_output, remain_numel, slice_size,
-      end_size);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h
deleted file mode 100644
index 26fb93c2ebb295fc73832d50c2f8472e96bcb25f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory.h>
-#include <cstring>
-
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-/**
- * A thin wrapper for gathering on cpu tensor
- * Return a new tensor from source tensor, gathered according to index
- * input[src]: type-T source Tensor
- * input[index]: type-IndexT index Tensor (1-D)
- * return: output tensor
- */
-template <typename T, typename IndexT = int>
-void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
-               const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true);
-  // check index of shape 1-D
-  if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(index.dims()[1], 1,
-                      "index.dims()[1] should be 1 when index.dims().size() == "
-                      "2 in gather_op.");
-  } else {
-    PADDLE_ENFORCE_EQ(index.dims().size(), 1,
-                      "index.dims().size() should be 1 or 2 in gather_op.");
-  }
-  int64_t index_size = index.dims()[0];
-
-  auto src_dims = src.dims();
-
-  const T* p_src = src.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  // slice size
-  int slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int64_t i = 0; i < index_size; ++i) {
-    IndexT index_ = p_index[i];
-    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
-  }
-}
-
-template <typename T, typename IndexT = int>
-void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input,
-                 const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                    "It should be running on the CPU");
-
-  auto index_dims = index.dims();
-  auto index_dims_size = index_dims.size();
-  auto input_dims = input.dims();
-  auto input_dims_size = input_dims.size();
-
-  const T* p_input = input.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  // final dim
-  int64_t end_size = index_dims[index_dims_size - 1];
-  // remain dim
-  auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = framework::product(remain_ddim);
-  // slice size
-  int64_t slice_size = 1;
-  for (int64_t i = end_size; i < input_dims_size; ++i) {
-    slice_size *= input_dims[i];
-  }
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int64_t i = 0; i < remain_numel; ++i) {
-    int64_t index_ = 0;
-    int64_t temp = 1;
-    for (int64_t j = end_size - 1; j >= 0; --j) {
-      IndexT index_value = p_index[i * end_size + j];
-      PADDLE_ENFORCE_LT(index_value, input_dims[j],
-                        "Input(index[-1)] has wrong value, it is %d",
-                        index_value);
-      PADDLE_ENFORCE_GE(index_value, 0UL,
-                        "The value of Input(index) must be no less than 0");
-
-      index_ += (index_value * temp);
-      temp *= input_dims[j];
-    }
-    memcpy(p_output + i * slice_size, p_input + index_ * slice_size,
-           slice_bytes);
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc
deleted file mode 100644
index aed0f824e6966b2d15e50bddbef4f782566420c4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_nd_op.cc
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gather_nd_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-class GatherNdOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of GatherNdOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
-                      "Input(Index) of GatherNdOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of GatherNdOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_dims_size = x_dims.size();
-    auto index_dims = ctx->GetInputDim("Index");
-    auto index_dims_size = index_dims.size();
-
-    PADDLE_ENFORCE_LE(
-        index_dims[index_dims_size - 1], x_dims_size,
-        "Input(Index).shape[-1] should be no greater than Input(X).rank");
-    PADDLE_ENFORCE_GE(index_dims_size, 2UL,
-                      "The rank of Input(Index) should be greater than 1");
-
-    std::vector<int64_t> result_dims;
-    // The result dims is
-    //   Index.shape[:-1] + X.shape[Index.shape[-1]:]
-    for (int i = 0; i < index_dims_size - 1; ++i) {
-      result_dims.emplace_back(index_dims[i]);
-    }
-    for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) {
-      result_dims.emplace_back(x_dims[i]);
-    }
-
-    ctx->SetOutputDim("Out", framework::make_ddim(result_dims));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class GatherNdGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The source input of gather_nd op");
-    AddInput("Index", "The index input of gather_nd op");
-    AddOutput("Out", "The output of gather_nd op");
-    AddComment(R"DOC(
-    Gather_Nd Operator.
-
-    This function is actually a high-dimensional extension of gather 
-    and supports for simultaneous indexing by multiple axes. Out is 
-    obtained by gathering slices from X into a tensor with shape 
-    Index.shape[:-1] + X.shape[Index.shape[-1]:].
-
-    Example:
-   
-    Given:
-         X = [[[ 0,  1,  2,  3],
-               [ 4,  5,  6,  7],
-               [ 8,  9, 10, 11]],
-              [[12, 13, 14, 15],
-               [16, 17, 18, 19],
-               [20, 21, 22, 23]]]
-       
-         X.shape = (2, 3, 4)
-
-   *Case 1:
-
-       Index = [[1]]
-
-    we get:
-       Out = 
-            [[12, 13, 14, 15],
-             [16, 17, 18, 19],
-             [20, 21, 22, 23]]
-
-   *Case 2:
-
-       Index = [[0,2]]
-
-    we get:
-        
-       Out =  [8, 9, 10, 11]
-
-   *Case 3:
-
-       Index = [[1, 2, 3]]
-
-    we get:
-
-       Out = [23]
-
-)DOC");
-  }
-};
-
-class GatherNdGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("gather_nd_grad");
-    op->SetInput("Index", Input("Index"));
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(GatherNdGradNoNeedBufferVarInference,
-                                      "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker,
-                  ops::GatherNdGradOpDescMaker);
-
-REGISTER_OPERATOR(gather_nd_grad, ops::GatherNdGradOp,
-                  ops::GatherNdGradNoNeedBufferVarInference);
-
-REGISTER_OP_CPU_KERNEL(gather_nd, ops::GatherNdOpKernel<float>,
-                       ops::GatherNdOpKernel<double>,
-                       ops::GatherNdOpKernel<int64_t>,
-                       ops::GatherNdOpKernel<int>,
-                       ops::GatherNdOpKernel<uint8_t>);
-
-REGISTER_OP_CPU_KERNEL(gather_nd_grad, ops::GatherNdGradOpKernel<float>,
-                       ops::GatherNdGradOpKernel<double>,
-                       ops::GatherNdGradOpKernel<int64_t>,
-                       ops::GatherNdGradOpKernel<int>,
-                       ops::GatherNdGradOpKernel<uint8_t>);
diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu
deleted file mode 100644
index 1ad335039a9cd6b95bb60a5329438e8759e97a5c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_nd_op.cu
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/gather_nd_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class GatherNdOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on GPU device.");
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUGatherNd<DeviceContext, T, int>(ctx, *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GPUGatherNd<DeviceContext, T, int64_t>(ctx, *x, *index, output);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GatherNdGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on GPU device.");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUScatterNdAdd<DeviceContext, T, int>(ctx, *dO, *index, dX);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GPUScatterNdAdd<DeviceContext, T, int64_t>(ctx, *dO, *index, dX);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel<CUDA, float>,
-                        ops::GatherNdOpCUDAKernel<CUDA, double>,
-                        ops::GatherNdOpCUDAKernel<CUDA, int64_t>,
-                        ops::GatherNdOpCUDAKernel<CUDA, int>,
-                        ops::GatherNdOpCUDAKernel<CUDA, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(gather_nd_grad,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, float>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, double>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, int64_t>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, int>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, plat::float16>);
diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h
deleted file mode 100644
index 059ca54c468663686abf0270dedfca727689b6db..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_nd_op.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class GatherNdOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on CPU.");
-
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      CPUGatherNd<T, int>(ctx.device_context(), *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      CPUGatherNd<T, int64_t>(ctx.device_context(), *x, *index, output);
-    }
-  }
-};
-
-template <typename T>
-class GatherNdGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on CPU.");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      ScatterNdAdd<T, int32_t>(ctx, *dO, *index, dX);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      ScatterNdAdd<T, int64_t>(ctx, *dO, *index, dX);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
deleted file mode 100644
index cbabd59cf634f09c0a55d3822995b4d0f5f170ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_op.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gather_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-class GatherOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of GatherOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Index"),
-                   "Input(Index) of GatherOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of GatherOp should not be null.");
-
-    auto index_dims = ctx->GetInputDim("Index");
-    PADDLE_ENFORCE(index_dims.size() == 1 ||
-                   (index_dims.size() == 2 && index_dims[1] == 1));
-    int batch_size = ctx->GetInputDim("Index")[0];
-    framework::DDim output_dims(ctx->GetInputDim("X"));
-    output_dims[0] = batch_size;
-    ctx->SetOutputDim("Out", output_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class GatherGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The source input of gather op");
-    AddInput("Index", "The index input of gather op");
-    AddOutput("Out", "The output of gather op");
-    AddAttr<bool>(
-        "overwrite",
-        "(bool, default: False) "
-        "In backward process, calc the grad when has same index,"
-        "If true, update the grad using the overwrite mode in same index,"
-        "If false, using the accumulate mode in same index.")
-        .SetDefault(true);
-    AddComment(R"DOC(
-Gather Operator.
-
-$Out = X[Index]$
-
-Out is obtained by gathering entries of the outer-most dimension
-of X indexed by Index and concatenate them together.
-
-Example:
-
-X = [[1, 2],
-     [3, 4],
-     [5, 6]]
-
-Index = [[1, 2]]
-
-Then:
-
-Out = [[3, 4],
-       [5, 6]]
-
-)DOC");
-  }
-};
-
-class GatherGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("gather_grad");
-    op->SetInput("Index", Input("Index"));
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(GatherGradNoNeedBufferVarInference, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
-                  ops::GatherGradOpDescMaker);
-REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
-                  ops::GatherGradNoNeedBufferVarInference);
-REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
-                       ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
-                       ops::GatherOpKernel<uint8_t>,
-                       ops::GatherOpKernel<int64_t>);
-REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
-                       ops::GatherGradientOpKernel<double>,
-                       ops::GatherGradientOpKernel<int>,
-                       ops::GatherGradientOpKernel<uint8_t>,
-                       ops::GatherGradientOpKernel<int64_t>);
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
deleted file mode 100644
index 061f92c76c32fbc599bd8f5d32bb110c276d748f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_op.cu
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class GatherOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE(
-        index_type_match,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUGather<T, int>(ctx.device_context(), *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GPUGather<T, int64_t>(ctx.device_context(), *x, *index, output);
-    }
-  }
-};
-
-template <typename T>
-class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE(
-        index_type_match,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUScatterAssign<T, int>(ctx, *dO, *index, dX,
-                               ctx.Attr<bool>("overwrite"));
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GPUScatterAssign<T, int64_t>(ctx, *dO, *index, dX,
-                                   ctx.Attr<bool>("overwrite"));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
-                        ops::GatherOpCUDAKernel<double>,
-                        ops::GatherOpCUDAKernel<int64_t>,
-                        ops::GatherOpCUDAKernel<int>,
-                        ops::GatherOpCUDAKernel<plat::float16>);
-REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
-                        ops::GatherGradOpCUDAKernel<double>,
-                        ops::GatherGradOpCUDAKernel<int64_t>,
-                        ops::GatherGradOpCUDAKernel<int>,
-                        ops::GatherGradOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
deleted file mode 100644
index 852790a4c63c85d89dd19a870fa84991798219eb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_op.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class GatherOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "This kernel only runs on CPU.");
-
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE(
-        index_type_match,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      CPUGather<T, int>(ctx.device_context(), *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      CPUGather<T, int64_t>(ctx.device_context(), *x, *index, output);
-    }
-  }
-};
-
-template <typename T>
-class GatherGradientOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "This kernel only runs on CPU.");
-
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-    bool overwrite = ctx.Attr<bool>("overwrite");
-
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE(
-        index_type_match,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      if (overwrite) {
-        ScatterAssign<T, int32_t>(ctx.device_context(), *dO, *index, dX);
-      } else {
-        ScatterAssignAdd<T, int32_t>(ctx, *dO, *index, dX);
-      }
-    } else if (index_type == framework::proto::VarType::INT64) {
-      if (overwrite) {
-        ScatterAssign<T, int64_t>(ctx.device_context(), *dO, *index, dX);
-      } else {
-        ScatterAssignAdd<T, int64_t>(ctx, *dO, *index, dX);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
deleted file mode 100644
index f6b156eb30dae154395b34dcfc26319cd89edbca..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_test.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <iostream>
-#include <string>
-
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/platform/place.h"
-
-TEST(Gather, GatherData) {
-  paddle::framework::Tensor* src = new paddle::framework::Tensor();
-  paddle::framework::Tensor* index = new paddle::framework::Tensor();
-  paddle::framework::Tensor* output = new paddle::framework::Tensor();
-
-  int* p_src = nullptr;
-  int* p_index = nullptr;
-  p_src = src->mutable_data<int>(paddle::framework::make_ddim({3, 4}),
-                                 paddle::platform::CPUPlace());
-  p_index = index->mutable_data<int>(paddle::framework::make_ddim({2}),
-                                     paddle::platform::CPUPlace());
-
-  for (int i = 0; i < 12; ++i) p_src[i] = i;
-  p_index[0] = 1;
-  p_index[1] = 0;
-
-  int* p_output = output->mutable_data<int>(
-      paddle::framework::make_ddim({2, 4}), paddle::platform::CPUPlace());
-
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  paddle::operators::CPUGather<int>(ctx, *src, *index, output);
-  delete cpu_place;
-  cpu_place = NULL;
-  for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
-  for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
-
-  delete src;
-  delete index;
-  delete output;
-}
diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
deleted file mode 100644
index 4d39358477016d4a2ae01aba635347bc26727474..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/batch_size_like.h"
-
-namespace paddle {
-namespace operators {
-
-class GaussianRandomBatchSizeLikeOp : public BatchSizeLikeOp {
- protected:
-  using BatchSizeLikeOp::BatchSizeLikeOp;
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
-  }
-};
-
-class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
- protected:
-  void Apply() override {
-    AddAttr<float>("mean",
-                   "(float, default 0.0) "
-                   "The mean (or center) of the gaussian distribution.")
-        .SetDefault(.0f);
-    AddAttr<float>("std",
-                   "(float, default 1.0) "
-                   "The standard deviation (std, or spread) of the "
-                   "gaussian distribution.")
-        .SetDefault(1.0f);
-    AddAttr<int>("seed",
-                 "(int, default 0) "
-                 "Random seed of generator."
-                 "0 means use system wide seed."
-                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
-        .SetDefault(0);
-    AddAttr<int>("dtype",
-                 "(int, default 5(FP32)) "
-                 "Output data type.")
-        .SetDefault(framework::proto::VarType::FP32);
-
-    AddComment(R"DOC(
-
-Used to initialize tensors with gaussian random generator.
-The default mean of the distribution is 0. and default standard
-deviation (std) of the distribution is 1.. Uers can set mean and std
-by input arguments.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(gaussian_random_batch_size_like,
-                  paddle::operators::GaussianRandomBatchSizeLikeOp,
-                  paddle::operators::GaussianRandomBatchSizeLikeOpMaker,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::BatchSizeLikeNoNeedBufferVarsInference);
-
-// Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
deleted file mode 100644
index c70d5b8bc7569c38cbc003aca7c62dc503df11cf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-#include "paddle/fluid/framework/op_registry.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CPUGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.Attr<float>("mean");
-    float std = context.Attr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
-    std::normal_distribution<T> dist(mean, std);
-    int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
-    }
-  }
-};
-
-class GaussianRandomOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of GaussianRandomOp should not be null.");
-    auto shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
-    std::vector<int64_t> temp;
-    temp.reserve(shape.size());
-    for (auto dim : shape) {
-      temp.push_back(static_cast<int64_t>(dim));
-    }
-    PADDLE_ENFORCE(shape.size() > 0UL,
-                   "shape can be one int or array. shape must be set.");
-    ctx->SetOutputDim("Out", framework::make_ddim(temp));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library{framework::LibraryType::kPlain};
-    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
-    }
-#endif
-
-    return framework::OpKernelType(
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.device_context(), layout, library);
-  }
-};
-
-class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput("Out", "Output matrix of gaussian random op");
-
-    AddAttr<std::vector<int64_t>>("shape",
-                                  "(vector<int64_t>) "
-                                  "The dimension of random tensor.");
-    AddAttr<float>("mean",
-                   "(float, default 0.0) "
-                   "mean of random tensor.")
-        .SetDefault(.0f);
-    AddAttr<float>("std",
-                   "(float, default 1.0) "
-                   "std of random tensor.")
-        .SetDefault(1.0f);
-    AddAttr<int>("seed",
-                 "(int, default 0) "
-                 "Random seed of generator."
-                 "0 means use system wide seed."
-                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
-        .SetDefault(0);
-    AddAttr<int>("dtype",
-                 "(int, default 5(FP32)) "
-                 "Output data type.")
-        .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddComment(R"DOC(
-GaussianRandom Operator.
-
-Used to initialize tensors with gaussian random generator.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
-                             ops::GaussianRandomOpMaker);
-REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>,
-                       ops::CPUGaussianRandomKernel<double>);
-REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like,
-                       ops::CPUGaussianRandomKernel<float>,
-                       ops::CPUGaussianRandomKernel<double>);
diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
deleted file mode 100644
index 7784856417e579fd43f79fa331d46df8af6c36b8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <thrust/random.h>
-#include <thrust/transform.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct GaussianGenerator {
-  T mean_, std_;
-  unsigned int seed_;
-
-  __host__ __device__ GaussianGenerator(T mean, T std, int seed)
-      : mean_(mean), std_(std), seed_(seed) {}
-
-  __host__ __device__ T operator()(const unsigned int n) const {
-    thrust::minstd_rand rng;
-    rng.seed(seed_);
-    thrust::normal_distribution<T> dist(mean_, std_);
-    rng.discard(n);
-    return dist(rng);
-  }
-};
-
-template <typename T>
-class GPUGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    if (seed == 0) {
-      std::random_device rd;
-      seed = rd();
-    }
-    T mean = static_cast<T>(context.Attr<float>("mean"));
-    T std = static_cast<T>(context.Attr<float>("std"));
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-    int64_t size = tensor->numel();
-    thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                      thrust::device_ptr<T>(data),
-                      GaussianGenerator<T>(mean, std, seed));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(gaussian_random,
-                        paddle::operators::GPUGaussianRandomKernel<float>,
-                        paddle::operators::GPUGaussianRandomKernel<double>);
-REGISTER_OP_CUDA_KERNEL(gaussian_random_batch_size_like,
-                        paddle::operators::GPUGaussianRandomKernel<float>,
-                        paddle::operators::GPUGaussianRandomKernel<double>);
diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
deleted file mode 100644
index c0893359af2f4de4ed8fd88ebff122447e8d84c7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-
-namespace paddle {
-namespace operators {
-
-class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "GetTensorFromSelectedRowsOp must has input X.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "GetTensorFromSelectedRowsOp must has output Out.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("X").front() ==
-            framework::proto::VarType::SELECTED_ROWS,
-        "The input X's type should be SelectedRows, but the received is %s",
-        ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
-    PADDLE_ENFORCE(
-        ctx->GetOutputsVarType("Out").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The output Out's type should be LoDTensor, but the received is %s",
-        ctx->Outputs("Out").front(), ctx->GetOutputsVarType("Out").front());
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.device_context());
-  }
-};
-
-class GetTensorFromSelectedRowsKernel {
- public:
-  void operator()(const framework::ExecutionContext &ctx) const {
-    auto *x = ctx.Input<framework::SelectedRows>("X");
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
-
-    out->Resize(x->value().dims());
-    out->mutable_data(ctx.GetPlace(), x->value().type());
-    framework::TensorCopy(x->value(), ctx.GetPlace(), ctx.device_context(),
-                          out);
-  }
-};
-
-class GetTensorFromSelectedRowsOpProtoMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input type is SelectedRows.");
-    AddOutput("Out", "The output type is LoDTensor.");
-    AddComment(
-        R"DOC(
-GetTensorFromSelectedRows Operator
-
-GetTensorFromSelectedRows is used to get the tensor from SelectedRows.
-
-)DOC");
-  }
-};
-
-class GetTensorFromSelectedRowsOpVarTypeInference
-    : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const {  // NOLINT
-    auto out_var_name = ctx->Output("Out").front();
-    auto in_var_name = ctx->Input("X").front();
-
-    ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
-    ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(get_tensor_from_selected_rows,
-                  ops::GetTensorFromSelectedRowsOp,
-                  ops::GetTensorFromSelectedRowsOpProtoMaker,
-                  ops::GetTensorFromSelectedRowsOpVarTypeInference);
-
-REGISTER_OP_CPU_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
-                               ops::GetTensorFromSelectedRowsKernel, double,
-                               ops::GetTensorFromSelectedRowsKernel, int,
-                               ops::GetTensorFromSelectedRowsKernel, int64_t,
-                               ops::GetTensorFromSelectedRowsKernel);
-
-#ifdef PADDLE_WITH_CUDA
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
-                                ops::GetTensorFromSelectedRowsKernel, double,
-                                ops::GetTensorFromSelectedRowsKernel, int,
-                                ops::GetTensorFromSelectedRowsKernel, int64_t,
-                                ops::GetTensorFromSelectedRowsKernel);
-#endif
diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
deleted file mode 100644
index c765d344d0be88b56a9d7b5b9ac1d572d2aa5c24..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using DataLayout = platform::DataLayout;
-using ScopedSpatialTransformerDescriptor =
-    platform::ScopedSpatialTransformerDescriptor;
-template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
-
-template <typename T>
-class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace");
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    auto* input = ctx.Input<Tensor>("X");
-    auto* grid = ctx.Input<Tensor>("Grid");
-    auto* output = ctx.Output<Tensor>("Output");
-
-    int n = input->dims()[0];
-    int c = input->dims()[1];
-    int h = input->dims()[2];
-    int w = input->dims()[3];
-    const int size[4] = {n, c, h, w};
-
-    const T* input_data = input->data<T>();
-    const T* grid_data = grid->data<T>();
-    T* output_data = output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-
-    ScopedSpatialTransformerDescriptor st_desc;
-    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
-        st_desc.descriptor<T>(4, size);
-
-    ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor output_desc;
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        DataLayout::kNCHW, framework::vectorize<int>(input->dims()));
-    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        DataLayout::kNCHW, framework::vectorize<int>(output->dims()));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerForward(
-        handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
-        input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
-        output_data));
-  }
-};
-
-template <typename T>
-class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace");
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    auto* input = ctx.Input<Tensor>("X");
-    auto* grid = ctx.Input<Tensor>("Grid");
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
-
-    auto output_grad_dims = output_grad->dims();
-    const int n = output_grad_dims[0];
-    const int c = output_grad_dims[1];
-    const int h = output_grad_dims[2];
-    const int w = output_grad_dims[3];
-    const int size[4] = {n, c, h, w};
-
-    ScopedSpatialTransformerDescriptor st_dest;
-    cudnnSpatialTransformerDescriptor_t cudnn_st_dest =
-        st_dest.descriptor<T>(4, size);
-
-    const T* input_data = input->data<T>();
-    const T* grid_data = grid->data<T>();
-    const T* output_grad_data = output_grad->data<T>();
-    T* input_grad_data =
-        input_grad->mutable_data<T>(output_grad_dims, ctx.GetPlace());
-    T* grid_grad_data =
-        grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
-
-    ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor input_grad_desc;
-    ScopedTensorDescriptor output_grad_desc;
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        DataLayout::kNCHW, framework::vectorize<int>(input->dims()));
-    cudnnTensorDescriptor_t cudnn_input_grad_desc =
-        input_grad_desc.descriptor<T>(
-            DataLayout::kNCHW, framework::vectorize<int>(input_grad->dims()));
-    cudnnTensorDescriptor_t cudnn_output_grad_desc =
-        output_grad_desc.descriptor<T>(
-            DataLayout::kNCHW, framework::vectorize<int>(output_grad->dims()));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerBackward(
-        handle, cudnn_st_dest, CudnnDataType<T>::kOne(), cudnn_input_desc,
-        input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
-        input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
-        output_grad_data, grid_data, CudnnDataType<T>::kZero(),
-        grid_grad_data));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace plat = paddle::platform;
-REGISTER_OP_KERNEL(grid_sampler, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNGridSampleOpKernel<float>,
-                   paddle::operators::CUDNNGridSampleOpKernel<double>);
-REGISTER_OP_KERNEL(grid_sampler_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNGridSampleGradOpKernel<float>,
-                   paddle::operators::CUDNNGridSampleGradOpKernel<double>);
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
deleted file mode 100644
index 57a1fcd42da04a766ebd8713e3863f259b3784ac..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/grid_sampler_op.h"
-#include <memory>
-#include "paddle/fluid/framework/op_registry.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class GridSampleOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of GridSampleOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grid"),
-                   "Input(Grid) of GridSampleOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                   "Output(Output) of GridSampleOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto grid_dims = ctx->GetInputDim("Grid");
-    PADDLE_ENFORCE(x_dims.size() == 4,
-                   "Input(X) of GridSampleOp should be 4-D Tensor.");
-    PADDLE_ENFORCE(grid_dims.size() == 4,
-                   "Input(Grid) of GridSampleOp should be 4-D Tensor.");
-    if (ctx->IsRuntime() || grid_dims[3] > 0) {
-      PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
-    }
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
-                        "Input(X) and Input(Grid) dims[0] should be equal.");
-      PADDLE_ENFORCE_EQ(
-          grid_dims[1], x_dims[2],
-          "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
-      PADDLE_ENFORCE_EQ(
-          grid_dims[2], x_dims[3],
-          "Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
-    }
-
-    ctx->SetOutputDim("Output", x_dims);
-    ctx->ShareLoD("X", "Output");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_CUDA
-    if (platform::CanCUDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kCUDNN;
-    }
-#endif
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace(),
-                                   framework::DataLayout::kAnyLayout, library_);
-  }
-};
-
-class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) The input data of GridSampleOp, "
-             "This is a 4-D tensor with shape of [N, C, H, W]");
-    AddInput(
-        "Grid",
-        "(Tensor) The input grid of GridSampleOp generated by AffineGridOp, "
-        "This is a 4-D tensor with shape of [N, H, W, 2] is the concatenation "
-        "of x and y coordinates with shape [N, H, W] in last dimention");
-    AddOutput("Output", "(Tensor) Output tensor with shape [N, C, H, W]");
-    AddAttr<bool>(
-        "use_cudnn",
-        "(bool, default true) Only used in cudnn kernel, need install cudnn")
-        .SetDefault(true);
-
-    AddComment(R"DOC(
-      This operation samples input X by using bilinear interpolation based on 
-      flow field grid, which is usually gennerated by affine_grid. The grid of
-      shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates 
-      with shape [N, H, W] each, where grid_x is indexing the 4th dimension 
-      (in width dimension) of input data x and grid_y is indexng the 3rd 
-      dimention (in height dimension), finally results is the bilinear 
-      interpolation value of 4 nearest corner points.
-
-      Step 1:
-        Get (x, y) grid coordinates and scale to [0, H-1/W-1].
-
-        grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
-        grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
-
-      Step 2:
-        Indices input data X with grid (x, y) in each [H, W] area, and bilinear 
-        interpolate point value by 4 nearest points.
-
-          wn ------- y_n ------- en
-          |           |           |
-          |          d_n          |
-          |           |           |
-         x_w --d_w-- grid--d_e-- x_e
-          |           |           |
-          |          d_s          |
-          |           |           |
-          ws ------- y_s ------- wn
-
-        x_w = floor(x)              // west side x coord
-        x_e = x_w + 1               // east side x coord
-        y_n = floor(y)              // north side y coord
-        y_s = y_s + 1               // south side y coord
-
-        d_w = grid_x - x_w          // distance to west side
-        d_e = x_e - grid_x          // distance to east side
-        d_n = grid_y - y_n          // distance to north side
-        d_s = y_s - grid_y          // distance to south side
-
-        wn = X[:, :, y_n, x_w]      // north-west point value
-        en = X[:, :, y_n, x_e]      // north-east point value
-        ws = X[:, :, y_s, x_w]      // south-east point value
-        es = X[:, :, y_s, x_w]      // north-east point value
-
-        output = wn * d_e * d_s + en * d_w * d_s
-               + ws * d_e * d_n + es * d_w * d_n
-        )DOC");
-  }
-};
-
-class GridSampleOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto input_dims = ctx->GetInputDim("X");
-    auto grid_dims = ctx->GetInputDim("Grid");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Grid"))) {
-      ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_CUDA
-    if (platform::CanCUDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kCUDNN;
-    }
-#endif
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace(),
-                                   framework::DataLayout::kAnyLayout, library_);
-  }
-};
-
-class GridSampleGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType("grid_sampler_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Grid", Input("Grid"));
-    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Grid"), InputGrad("Grid"));
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
-                  ops::GridSampleGradMaker);
-REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler_grad,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
deleted file mode 100644
index 08a6043eb07a6e44d46428ee195f6cb28c2ee77c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ /dev/null
@@ -1,330 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-
-using Array3 = Eigen::DSizes<int64_t, 3>;
-using Array4 = Eigen::DSizes<int64_t, 4>;
-
-template <typename T>
-static inline bool isInBound(T x, T y, T x_max, T y_max) {
-  if (x < 0 || x > x_max || y < 0 || y > y_max) {
-    return false;
-  }
-  return true;
-}
-
-template <typename T>
-static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
-                              const Tensor& grid, Tensor* x_w, Tensor* x_e,
-                              Tensor* y_n, Tensor* y_s, Tensor* d_w,
-                              Tensor* d_e, Tensor* d_n, Tensor* d_s) {
-  auto& place = *ctx.eigen_device();
-  const int n = grid.dims()[0];
-  const int h = grid.dims()[1];
-  const int w = grid.dims()[2];
-  const T x_max = static_cast<T>(w - 1);
-  const T y_max = static_cast<T>(h - 1);
-
-  // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
-  Tensor grid_x, grid_y;
-  T* grid_x_data = grid_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  T* grid_y_data = grid_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  const T* grid_data = grid.data<T>();
-  for (int i = 0; i < n * h * w; i++) {
-    grid_x_data[i] = grid_data[2 * i];
-    grid_y_data[i] = grid_data[(2 * i) + 1];
-  }
-
-  Tensor ones;
-  ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
-  Tensor half_xmax;
-  Tensor half_ymax;
-  half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto half_xmax_t =
-      EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max);
-  half_ymax.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto half_ymax_t =
-      EigenTensor<T, 3>::From(half_ymax).setConstant(0.5 * y_max);
-
-  // scale grid to [0, h-1/w-1]
-  auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
-  auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
-  grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t;
-  grid_y_t.device(place) = (grid_y_t + ones_t) * half_ymax_t;
-
-  // calculate coords of 4 corner points
-  x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  x_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  y_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  y_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto x_w_t = EigenTensor<T, 3>::From(*x_w);
-  auto x_e_t = EigenTensor<T, 3>::From(*x_e);
-  auto y_n_t = EigenTensor<T, 3>::From(*y_n);
-  auto y_s_t = EigenTensor<T, 3>::From(*y_s);
-  x_w_t.device(place) = grid_x_t.floor();
-  x_e_t.device(place) = x_w_t + ones_t;
-  y_n_t.device(place) = grid_y_t.floor();
-  y_s_t.device(place) = y_n_t + ones_t;
-
-  // calculate distances to 4 sides
-  d_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  d_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  d_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  d_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto d_w_t = EigenTensor<T, 3>::From(*d_w);
-  auto d_e_t = EigenTensor<T, 3>::From(*d_e);
-  auto d_n_t = EigenTensor<T, 3>::From(*d_n);
-  auto d_s_t = EigenTensor<T, 3>::From(*d_s);
-  d_w_t.device(place) = grid_x_t - x_w_t;
-  d_e_t.device(place) = x_e_t - grid_x_t;
-  d_n_t.device(place) = grid_y_t - y_n_t;
-  d_s_t.device(place) = y_s_t - grid_y_t;
-}
-
-template <typename T>
-static void GetGridPointValue(const Tensor& input, Tensor* output,
-                              const Tensor& x, const Tensor& y) {
-  const int n = input.dims()[0];
-  const int c = input.dims()[1];
-  const int h = input.dims()[2];
-  const int w = input.dims()[3];
-  auto x_t = EigenTensor<T, 3>::From(x);
-  auto y_t = EigenTensor<T, 3>::From(y);
-  auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
-  auto input_t = EigenTensor<T, 4>::From(input);
-
-  for (int i = 0; i < n; i++) {
-    for (int k = 0; k < h; k++) {
-      for (int l = 0; l < w; l++) {
-        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
-          for (int j = 0; j < c; j++) {
-            output_t(i, j, k, l) =
-                input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
-                        static_cast<int>(round(x_t(i, k, l))));
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void GatherOutputGradToInputGrad(const Tensor& output_grad,
-                                        Tensor* input_grad, const Tensor& x,
-                                        const Tensor& y, const Tensor& d1,
-                                        const Tensor& d2) {
-  const int n = output_grad.dims()[0];
-  const int c = output_grad.dims()[1];
-  const int h = output_grad.dims()[2];
-  const int w = output_grad.dims()[3];
-  auto x_t = EigenTensor<T, 3>::From(x);
-  auto y_t = EigenTensor<T, 3>::From(y);
-  auto d1_t = EigenTensor<T, 3>::From(d1);
-  auto d2_t = EigenTensor<T, 3>::From(d2);
-  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
-  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
-
-  for (int i = 0; i < n; i++) {
-    for (int k = 0; k < h; k++) {
-      for (int l = 0; l < w; l++) {
-        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
-          for (int j = 0; j < c; j++) {
-            input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
-                         static_cast<int>(round(x_t(i, k, l)))) +=
-                output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class GridSampleOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto* input = ctx.Input<Tensor>("X");
-    auto* grid = ctx.Input<Tensor>("Grid");
-
-    const int n = input->dims()[0];
-    const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    // calc locations and distances of 4 corner points
-    Tensor x_w, x_e, y_n, y_s;
-    Tensor d_w, d_e, d_n, d_s;
-    CalcGridLocations<T>(
-        ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
-        &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
-
-    auto* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    math::SetConstant<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), output,
-        static_cast<T>(0));
-
-    // calc 4 corner points value
-    Tensor v_wn, v_en, v_ws, v_es;
-    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
-    GetGridPointValue<T>(*input, &v_en, x_e, y_n);
-    GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
-    GetGridPointValue<T>(*input, &v_es, x_e, y_s);
-
-    auto d_w_t = EigenTensor<T, 3>::From(d_w);
-    auto d_e_t = EigenTensor<T, 3>::From(d_e);
-    auto d_n_t = EigenTensor<T, 3>::From(d_n);
-    auto d_s_t = EigenTensor<T, 3>::From(d_s);
-    auto d_w_scaled_t =
-        d_w_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto d_e_scaled_t =
-        d_e_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto d_n_scaled_t =
-        d_n_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto d_s_scaled_t =
-        d_s_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
-    auto v_en_t = EigenTensor<T, 4>::From(v_en);
-    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
-    auto v_es_t = EigenTensor<T, 4>::From(v_es);
-    auto output_t = EigenTensor<T, 4>::From(*output);
-    // bilinear interpolaetion by 4 corner points
-    output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
-                             v_en_t * d_w_scaled_t * d_s_scaled_t +
-                             v_ws_t * d_e_scaled_t * d_n_scaled_t +
-                             v_es_t * d_w_scaled_t * d_n_scaled_t;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GridSampleGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* grid = ctx.Input<Tensor>("Grid");
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-
-    const int n = input->dims()[0];
-    const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    math::SetConstant<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), input_grad,
-        static_cast<T>(0));
-    auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
-    grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
-    math::SetConstant<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), grid_grad,
-        static_cast<T>(0));
-
-    Tensor x_w, x_e, y_n, y_s;
-    Tensor d_w, d_e, d_n, d_s;
-    CalcGridLocations<T>(
-        ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
-        &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
-
-    // gather output grad value to input grad by corner point coords and weight
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_n, d_e,
-                                   d_s);
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_s, d_e,
-                                   d_n);
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_n, d_w,
-                                   d_s);
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_s, d_w,
-                                   d_n);
-
-    // calc 4 corner points value
-    Tensor v_wn, v_en, v_ws, v_es;
-    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
-    GetGridPointValue<T>(*input, &v_en, x_e, y_n);
-    GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
-    GetGridPointValue<T>(*input, &v_es, x_e, y_s);
-    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
-    auto v_en_t = EigenTensor<T, 4>::From(v_en);
-    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
-    auto v_es_t = EigenTensor<T, 4>::From(v_es);
-
-    auto d_w_t = EigenTensor<T, 3>::From(d_w);
-    auto d_e_t = EigenTensor<T, 3>::From(d_e);
-    auto d_n_t = EigenTensor<T, 3>::From(d_n);
-    auto d_s_t = EigenTensor<T, 3>::From(d_s);
-
-    auto output_grad_t = EigenTensor<T, 4>::From(*output_grad);
-
-    Tensor grid_grad_x, grid_grad_y;
-    grid_grad_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
-    grid_grad_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
-    auto grid_grad_x_t = EigenTensor<T, 3>::From(grid_grad_x).setConstant(0.0);
-    auto grid_grad_y_t = EigenTensor<T, 3>::From(grid_grad_y).setConstant(0.0);
-    for (int i = 0; i < n; i++) {
-      for (int j = 0; j < c; j++) {
-        for (int k = 0; k < h; k++) {
-          for (int l = 0; l < w; l++) {
-            grid_grad_x_t(i, k, l) +=
-                ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
-                 (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
-                output_grad_t(i, j, k, l);
-            grid_grad_y_t(i, k, l) +=
-                ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
-                 (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
-                output_grad_t(i, j, k, l);
-          }
-        }
-      }
-    }
-    const T x_max = static_cast<T>(w - 1);
-    const T y_max = static_cast<T>(h - 1);
-    grid_grad_x_t = grid_grad_x_t * (x_max / (T)2);
-    grid_grad_y_t = grid_grad_y_t * (y_max / (T)2);
-
-    // gather grid_grad [x, y] in 3rd Dim
-    T* grid_grad_data = grid_grad->data<T>();
-    T* grid_grad_x_data = grid_grad_x.data<T>();
-    T* grid_grad_y_data = grid_grad_y.data<T>();
-    for (int i = 0; i < n * h * w; i++) {
-      grid_grad_data[2 * i] = grid_grad_x_data[i];
-      grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc
deleted file mode 100644
index e184ff14a5534dc40e87af0be45ca3409f1bdb18..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/group_norm_op.cc
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/group_norm_op.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-class GroupNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of GroupNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"),
-                   "Output(Y) of GroupNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
-                   "Output(Mean) of GroupNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
-                   "Output(Variance) of GroupNormOp should not be null.");
-    auto x_dim = ctx->GetInputDim("X");
-    const DataLayout data_layout = framework::StringToDataLayout(
-        ctx->Attrs().Get<std::string>("data_layout"));
-    const int64_t channel_num =
-        (data_layout == DataLayout::kNCHW ? x_dim[1] : x_dim[x_dim.size() - 1]);
-    auto batch_size = x_dim[0];
-    auto groups = ctx->Attrs().Get<int>("groups");
-    PADDLE_ENFORCE_LE(
-        groups, channel_num,
-        "'groups' must be less equal than the number of channels.");
-    PADDLE_ENFORCE_GE(groups, 1, "'groups' must be greater equal than 1.");
-
-    if (ctx->HasInput("Scale")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], channel_num);
-    }
-    if (ctx->HasInput("Bias")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], channel_num);
-    }
-
-    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("Mean", {batch_size, groups});
-    ctx->SetOutputDim("Variance", {batch_size, groups});
-    ctx->ShareLoD("X", "Y");
-  }
-};
-
-class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input tensor.");
-    AddInput("Scale",
-             "Scale is a 1-dimensional tensor of size C"
-             "that is applied to the output.")
-        .AsDispensable();
-    AddInput("Bias",
-             "Bias is a 1-dimensional tensor of size C "
-             "that is applied to the output")
-        .AsDispensable();
-    AddOutput("Y", "Result after normalization.");
-    AddOutput("Mean", "Mean of each group.").AsIntermediate();
-    AddOutput("Variance", "Variance of each group.").AsIntermediate();
-
-    AddAttr<float>("epsilon",
-                   "Constant for numerical stability [default 1e-5].")
-        .SetDefault(1e-5)
-        .AddCustomChecker([](const float &epsilon) {
-          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 1.0f,
-                         "'epsilon' should be between 0.0 and 1.0.");
-        });
-    AddAttr<int>("groups", "The number of groups that divided from channels.")
-        .AddCustomChecker([](const int &groups) {
-          PADDLE_ENFORCE_GT(groups, 0, "'groups' should be greater than zero.");
-        });
-    AddAttr<std::string>("data_layout",
-                         "An optional string from: \"NHWC\", \"NCHW\". ")
-        .SetDefault("NCHW");
-    AddComment(R"DOC(
-Group Normalization
-
-Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
-)DOC");
-  }
-};
-
-class GroupNormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    // check input
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of GroupNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Variance"),
-                   "Input(Variance) of GroupNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@GRAD) of GroupNormOp should not be null.");
-
-    // check output
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Y"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-      ctx->SetOutputDim(framework::GradVarName("Scale"),
-                        ctx->GetInputDim("Scale"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
-    if (var == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
-    const Tensor *t = nullptr;
-    if (var->IsType<Tensor>()) {
-      t = &var->Get<Tensor>();
-    } else if (var->IsType<LoDTensor>()) {
-      t = &var->Get<LoDTensor>();
-    }
-    if (t == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
-    return framework::OpKernelType(t->type(), ctx.GetPlace());
-  }
-};
-
-class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *op = new framework::OpDesc();
-    op->SetType("group_norm_grad");
-    op->SetInput("Scale", Input("Scale"));
-    op->SetInput("Bias", Input("Bias"));
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-    op->SetInput("Y", Output("Y"));
-    op->SetInput("Variance", Output("Variance"));
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
-
-    op->SetAttrMap(Attrs());
-
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(GroupNormInplaceInToOut, {"X", "Y"});
-DECLARE_INPLACE_OP_INFERER(GroupNormGradInplaceInToOut,
-                           {framework::GradVarName("Y"),
-                            framework::GradVarName("X")});
-
-class GroupNormOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return {{"X", /*->*/ "Y"}};
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
-                  ops::GroupNormOpInferVarType, ops::GroupNormGradMaker,
-                  ops::GroupNormInplaceInToOut);
-REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp,
-                  ops::GroupNormGradInplaceInToOut);
-REGISTER_OP_CPU_KERNEL(
-    group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    group_norm_grad,
-    ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
deleted file mode 100644
index b7f79be45be84f2557c34300922506a5840c5dd5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/group_norm_op.cu
+++ /dev/null
@@ -1,372 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "cub/cub.cuh"
-#include "paddle/fluid/operators/group_norm_op.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-
-namespace paddle {
-namespace operators {
-
-using DataLayout = framework::DataLayout;
-enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
-
-#define CHECK_CASE(i, flags, kernel_name, ...)                              \
-  if (i == flags) {                                                         \
-    kernel_name<T, i><<<grid, threads, 0, dev_ctx.stream()>>>(__VA_ARGS__); \
-  }
-
-// 0 for no scale, no bias
-// 1 for has scale, no bias
-// 2 for no scale, has bias
-// 3 for has scale, has bias
-#define UNROLL_ALL_CASES(flags, kernel_name, ...) \
-  CHECK_CASE(0, flags, kernel_name, __VA_ARGS__)  \
-  CHECK_CASE(1, flags, kernel_name, __VA_ARGS__)  \
-  CHECK_CASE(2, flags, kernel_name, __VA_ARGS__)  \
-  CHECK_CASE(3, flags, kernel_name, __VA_ARGS__)
-
-template <typename T>
-__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
-  typedef cub::WarpReduce<T> WarpReduce;
-  typename WarpReduce::TempStorage temp_storage;
-  value = WarpReduce(temp_storage).Sum(value);
-  if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
-}
-
-template <typename T>
-__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
-                                              int imsize, int groups,
-                                              int group_size, T* mean, T* var,
-                                              const DataLayout data_layout) {
-  int gid = blockIdx.y;
-  int cid = blockIdx.x;
-  int bid = blockIdx.z;
-  int H = imsize / W;
-  int number = min(group_size, static_cast<int>(C - gid * group_size));
-  int ccid = gid * group_size + cid;
-  if (ccid >= C) return;
-  T x_mean = 0, x_var = 0;
-  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
-    T val;
-    if (data_layout == DataLayout::kNCHW) {
-      val = x[(bid * C + ccid) * imsize + imid];
-    } else {
-      int hid = imid / W;
-      int wid = imid % W;
-      val = x[(bid * H + hid) * W * C + wid * C + ccid];
-    }
-    x_mean += val;
-    x_var += val * val;
-  }
-  x_mean /= number * imsize;
-  x_var /= number * imsize;
-  CudaAtomicAddWithWarp(&mean[bid * groups + gid], x_mean);
-  CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
-}
-
-template <typename T, int flags>
-__global__ void GroupNormForward(const T* x, const T* mean, const T* var,
-                                 const T* scale, const T* bias, int N, int C,
-                                 int W, int imsize, int groups, int group_size,
-                                 T epsilon, T* y, T* real_var,
-                                 const DataLayout data_layout) {
-  int gid = blockIdx.y;
-  int cid = blockIdx.x;
-  int bid = blockIdx.z;
-  int H = imsize / W;
-  int ccid = gid * group_size + cid;
-  if (ccid >= C) return;
-  T x_mean = mean[bid * groups + gid];
-  T x_var = var[bid * groups + gid];
-  x_var = x_var - x_mean * x_mean;
-  T var_inv = 1.0 / sqrt(x_var + epsilon);
-  if (cid == 0 && threadIdx.x == 0) real_var[bid * groups + gid] = x_var;
-  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
-    T val;
-    int hid, wid;
-    if (data_layout == DataLayout::kNCHW) {
-      val = x[(bid * C + ccid) * imsize + imid];
-    } else {
-      hid = imid / W;
-      wid = imid % W;
-      val = x[(bid * H + hid) * W * C + wid * C + ccid];
-    }
-    val = (val - x_mean) * var_inv;
-    if (flags & kHasScale) val *= scale[gid * group_size + cid];
-    if (flags & kHasBias) val += bias[gid * group_size + cid];
-    if (data_layout == DataLayout::kNCHW) {
-      y[(bid * C + ccid) * imsize + imid] = val;
-    } else {
-      y[(bid * H + hid) * W * C + wid * C + ccid] = val;
-    }
-  }
-}
-
-template <typename T>
-class GroupNormKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto* scale = ctx.Input<Tensor>("Scale");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* x = ctx.Input<Tensor>("X");
-
-    auto* y = ctx.Output<Tensor>("Y");
-    auto* mean = ctx.Output<Tensor>("Mean");
-    auto* var = ctx.Output<Tensor>("Variance");
-    const auto groups = ctx.Attr<int>("groups");
-
-    const auto x_dims = x->dims();
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    const int group_size = (C - 1) / groups + 1;
-    const int W =
-        (data_layout == DataLayout::kNCHW ? x_dims[x_dims.size() - 1]
-                                          : x_dims[x_dims.size() - 2]);
-
-    y->mutable_data<T>(ctx.GetPlace());
-    mean->mutable_data<T>(ctx.GetPlace());
-    var->mutable_data<T>(ctx.GetPlace());
-    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    Tensor temp_var;
-    temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
-
-    set_zero(dev_ctx, mean, static_cast<T>(0));
-    set_zero(dev_ctx, &temp_var, static_cast<T>(0));
-
-    auto* x_data = x->data<T>();
-    auto* y_data = y->data<T>();
-    auto* mean_data = mean->data<T>();
-    auto* var_data = var->data<T>();
-    auto* temp_var_data = temp_var.data<T>();
-
-    const T* scale_data = nullptr;
-    if (scale) scale_data = scale->data<T>();
-    const T* bias_data = nullptr;
-    if (bias) bias_data = bias->data<T>();
-
-    int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3]
-                                                   : x_dims[1] * x_dims[2]);
-
-    int block_size = std::min(1024, imsize);
-    dim3 grid(group_size, groups, x_dims[0]);
-    dim3 threads(block_size, 1, 1);
-    GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
-        x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data,
-        temp_var_data, data_layout);
-    int flags =
-        (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
-    UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data,
-                     scale_data, bias_data, x_dims[0], C, W, imsize, groups,
-                     group_size, epsilon, y_data, var_data, data_layout);
-  }
-};
-
-template <typename T, int flags>
-__global__ void GroupNormBackwardGetMeanAndVar(
-    const T* x, const T* scale, const T* bias, const T* d_y, int N, int C,
-    int W, int imsize, int groups, int group_size, T epsilon, T* d_mean,
-    T* d_var, T* d_scale, T* d_bias, const DataLayout data_layout) {
-  int gid = blockIdx.y;
-  int cid = blockIdx.x;
-  int bid = blockIdx.z;
-  int H = imsize / W;
-  int number = min(group_size, static_cast<int>(C - gid * group_size));
-  int ccid = gid * group_size + cid;
-  if (ccid >= C) return;
-  T x_scale = (flags & kHasScale) ? scale[ccid] : 1;
-  T x_bias = (flags & kHasBias) ? bias[ccid] : 0;
-  T x_scale_inv = 0;
-  if (x_scale != 0) x_scale_inv = 1.0 / x_scale;
-  T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0;
-
-  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
-    T val, dval;
-    if (data_layout == DataLayout::kNCHW) {
-      val = x[(bid * C + ccid) * imsize + imid] - x_bias;
-      dval = d_y[(bid * C + ccid) * imsize + imid];
-    } else {
-      int hid = imid / W;
-      int wid = imid % W;
-      val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias;
-      dval = d_y[(bid * H + hid) * W * C + wid * C + ccid];
-    }
-
-    d_var_data += val * dval;
-    d_mean_data += dval * x_scale;
-
-    val = val * x_scale_inv;
-    d_bias_data += dval;
-    d_scale_data += val * dval;
-  }
-  CudaAtomicAddWithWarp(&d_mean[bid * groups + gid], d_mean_data);
-  CudaAtomicAddWithWarp(&d_var[bid * groups + gid], d_var_data);
-  if (flags & kHasScale) CudaAtomicAddWithWarp(&d_scale[ccid], d_scale_data);
-  if (flags & kHasBias) CudaAtomicAddWithWarp(&d_bias[ccid], d_bias_data);
-}
-
-template <typename T, int flags>
-__global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale,
-                                  const T* bias, const T* var, const T* d_mean,
-                                  const T* d_var, int N, int C, int W,
-                                  int imsize, int groups, int group_size,
-                                  T epsilon, T* d_x,
-                                  const DataLayout data_layout) {
-  int gid = blockIdx.y;
-  int cid = blockIdx.x;
-  int bid = blockIdx.z;
-  int H = imsize / W;
-  int number = min(group_size, static_cast<int>(C - gid * group_size));
-  int ccid = gid * group_size + cid;
-  if (ccid >= C) return;
-  T x_var = var[bid * groups + gid];
-  T d_x_mean = d_mean[bid * groups + gid];
-  T d_x_var = d_var[bid * groups + gid];
-
-  T x_var_inv = 1.0 / sqrt(x_var + epsilon);
-  T number_inv = 1.0 / (number * imsize);
-
-  T x_scale = (flags & kHasScale) ? scale[ccid] : 1;
-  T x_bias = (flags & kHasBias) ? bias[ccid] : 0;
-  T x_scale_inv = 0;
-  if (x_scale != 0) x_scale_inv = 1.0 / x_scale;
-
-  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
-    if (data_layout == DataLayout::kNCHW) {
-      T tmp = x[(bid * C + ccid) * imsize + imid];
-      T v_y = (tmp - x_bias) * x_scale_inv;
-      T dly = d_y[(bid * C + ccid) * imsize + imid];
-      d_x[(bid * C + ccid) * imsize + imid] =
-          x_var_inv *
-          (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean);
-    } else {
-      int hid = imid / W;
-      int wid = imid % W;
-      T tmp = x[(bid * H + hid) * W * C + wid * C + ccid];
-      T v_y = (tmp - x_bias) * x_scale_inv;
-      T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid];
-      d_x[(bid * H + hid) * W * C + wid * C + ccid] =
-          x_var_inv *
-          (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean);
-    }
-  }
-}
-
-template <typename T>
-class GroupNormGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto* x = ctx.Input<Tensor>("Y");
-    auto* var = ctx.Input<Tensor>("Variance");
-    auto* scale = ctx.Input<Tensor>("Scale");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto groups = ctx.Attr<int>("groups");
-
-    // init output
-    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    const auto& x_dims = x->dims();
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    const int group_size = (C - 1) / groups + 1;
-    const int W =
-        (data_layout == DataLayout::kNCHW ? x_dims[x_dims.size() - 1]
-                                          : x_dims[x_dims.size() - 2]);
-
-    d_x->mutable_data<T>(ctx.GetPlace());
-    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-
-    Tensor temp_var;
-    temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
-    set_zero(dev_ctx, &temp_var, static_cast<T>(0));
-    T* temp_var_data = temp_var.data<T>();
-
-    Tensor temp_mean;
-    temp_mean.mutable_data<T>(var->dims(), ctx.GetPlace());
-    set_zero(dev_ctx, &temp_mean, static_cast<T>(0));
-    T* temp_mean_data = temp_mean.data<T>();
-
-    auto* x_data = x->data<T>();
-    T* d_x_data = nullptr;
-    if (d_x) d_x_data = d_x->data<T>();
-    auto* y_data = d_y->data<T>();
-    auto* var_data = var->data<T>();
-    T* d_scale_data = nullptr;
-    if (d_scale) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, d_scale, static_cast<T>(0));
-      d_scale_data = d_scale->data<T>();
-    }
-    T* d_bias_data = nullptr;
-    if (d_bias) {
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, d_bias, static_cast<T>(0));
-      d_bias_data = d_bias->data<T>();
-    }
-
-    const T* scale_data = nullptr;
-    if (scale) scale_data = scale->data<T>();
-    const T* bias_data = nullptr;
-    if (bias) bias_data = bias->data<T>();
-
-    int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3]
-                                                   : x_dims[1] * x_dims[2]);
-
-    int block_size = std::min(1024, imsize);
-    dim3 grid(group_size, groups, x_dims[0]);
-    dim3 threads(block_size, 1, 1);
-    int flags =
-        (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
-    UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data,
-                     bias_data, y_data, x_dims[0], C, W, imsize, groups,
-                     group_size, epsilon, temp_mean_data, temp_var_data,
-                     d_scale_data, d_bias_data, data_layout);
-    if (d_x_data != nullptr) {
-      UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data,
-                       bias_data, var_data, temp_mean_data, temp_var_data,
-                       x_dims[0], C, W, imsize, groups, group_size, epsilon,
-                       d_x_data, data_layout);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    group_norm,
-    ops::GroupNormKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GroupNormKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    group_norm_grad,
-    ops::GroupNormGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GroupNormGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h
deleted file mode 100644
index d4a1b3f036bba7eb193e5854cff2c239be18425c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/group_norm_op.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <string>
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-template <typename DeviceContext, typename T>
-class GroupNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto* scale = ctx.Input<Tensor>("Scale");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* x = ctx.Input<Tensor>("X");
-
-    auto* y = ctx.Output<Tensor>("Y");
-    auto* mean = ctx.Output<Tensor>("Mean");
-    auto* var = ctx.Output<Tensor>("Variance");
-    const auto groups = ctx.Attr<int>("groups");
-
-    const auto x_dims = x->dims();
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    const int group_size = (C - 1) / groups + 1;
-
-    y->mutable_data<T>(ctx.GetPlace());
-    mean->mutable_data<T>(ctx.GetPlace());
-    var->mutable_data<T>(ctx.GetPlace());
-
-    auto* x_data = x->data<T>();
-    auto* y_data = y->data<T>();
-    auto* mean_data = mean->data<T>();
-    auto* var_data = var->data<T>();
-
-    const T* scale_data = nullptr;
-    if (scale) scale_data = scale->data<T>();
-    const T* bias_data = nullptr;
-    if (bias) bias_data = bias->data<T>();
-
-    int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3]
-                                                   : x_dims[1] * x_dims[2]);
-
-    auto* iter_x_data = x_data;
-    auto* iter_y_data = y_data;
-    for (int bid = 0; bid < x_dims[0]; bid++) {
-      for (int gid = 0; gid < groups; gid++) {
-        T x_mean = 0, x_var = 0;
-        int number =
-            std::min(group_size, static_cast<int>(C - gid * group_size));
-        auto* tmp_x = iter_x_data;
-        auto* x_src_data = iter_x_data;
-        auto* tmp_y = iter_y_data;
-        auto* y_src_data = iter_y_data;
-
-        if (data_layout == DataLayout::kNCHW) {
-          for (int cid = 0; cid < number; cid++) {
-            for (int imid = 0; imid < imsize; imid++, iter_x_data++) {
-              x_mean += iter_x_data[0];
-              x_var += iter_x_data[0] * iter_x_data[0];
-            }
-          }
-        } else {
-          for (int cid = 0; cid < number; cid++) {
-            iter_x_data = tmp_x + cid;
-            for (int imid = 0; imid < imsize; imid++, iter_x_data += C) {
-              x_mean += iter_x_data[0];
-              x_var += iter_x_data[0] * iter_x_data[0];
-            }
-          }
-          iter_x_data = tmp_x + group_size;
-        }
-
-        x_mean /= number * imsize;
-        x_var /= number * imsize;
-        x_var = x_var - x_mean * x_mean;
-        T var_inv = 1.0 / sqrt(x_var + epsilon);
-        mean_data[bid * groups + gid] = x_mean;
-        var_data[bid * groups + gid] = x_var;
-
-        if (data_layout == DataLayout::kNCHW) {
-          for (int cid = 0; cid < number; cid++) {
-            for (int imid = 0; imid < imsize; imid++, tmp_x++, iter_y_data++) {
-              T val = (tmp_x[0] - x_mean) * var_inv;
-              if (scale_data) val *= scale_data[gid * group_size + cid];
-              if (bias_data) val += bias_data[gid * group_size + cid];
-              iter_y_data[0] = val;
-            }
-          }
-        } else {
-          for (int cid = 0; cid < number; cid++) {
-            tmp_x = x_src_data + cid;
-            iter_y_data = y_src_data + cid;
-            for (int imid = 0; imid < imsize;
-                 imid++, tmp_x += C, iter_y_data += C) {
-              T val = (tmp_x[0] - x_mean) * var_inv;
-              if (scale_data) val *= scale_data[gid * group_size + cid];
-              if (bias_data) val += bias_data[gid * group_size + cid];
-              iter_y_data[0] = val;
-            }
-          }
-          iter_y_data = tmp_y + group_size;
-        }
-      }
-      if (data_layout == DataLayout::kNHWC) {
-        iter_x_data = x_data + (bid + 1) * C * imsize;
-        iter_y_data = y_data + (bid + 1) * C * imsize;
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GroupNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto* x = ctx.Input<Tensor>("Y");
-    auto* var = ctx.Input<Tensor>("Variance");
-    auto* scale = ctx.Input<Tensor>("Scale");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto groups = ctx.Attr<int>("groups");
-
-    // init output
-    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    const auto& x_dims = x->dims();
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    const int group_size = (C - 1) / groups + 1;
-
-    d_x->mutable_data<T>(ctx.GetPlace());
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    auto* x_data = x->data<T>();
-    auto* d_x_data = d_x->data<T>();
-    auto* y_data = d_y->data<T>();
-    auto* var_data = var->data<T>();
-    T* d_scale_data = nullptr;
-    if (d_scale) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, d_scale, static_cast<T>(0));
-      d_scale_data = d_scale->data<T>();
-    }
-    T* d_bias_data = nullptr;
-    if (d_bias) {
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, d_bias, static_cast<T>(0));
-      d_bias_data = d_bias->data<T>();
-    }
-
-    const T* scale_data = nullptr;
-    if (scale) scale_data = scale->data<T>();
-    const T* bias_data = nullptr;
-    if (bias) bias_data = bias->data<T>();
-
-    int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3]
-                                                   : x_dims[1] * x_dims[2]);
-    auto* iter_x_data = x_data;
-    auto* iter_d_x_data = d_x_data;
-    auto* iter_y_data = y_data;
-    for (int bid = 0; bid < x_dims[0]; bid++) {
-      for (int gid = 0; gid < groups; gid++) {
-        T x_var = var_data[bid * groups + gid];
-        T var_inv = 1.0 / sqrt(x_var + epsilon);
-        int number =
-            std::min(group_size, static_cast<int>(C - gid * group_size));
-        T number_inv = 1.0 / (number * imsize);
-        auto* tmp_x = iter_x_data;
-        auto* tmp_y = iter_y_data;
-        auto* tmp_d_x = iter_d_x_data;
-        auto* x_src_data = iter_x_data;
-        auto* y_src_data = iter_y_data;
-        auto* iter_x_data_backup = iter_x_data;
-        auto* iter_y_data_backup = iter_y_data;
-        auto* iter_d_x_data_backup = iter_d_x_data;
-        T dp_scale = 0, dp_bias = 0;
-
-        if (data_layout == DataLayout::kNCHW) {
-          for (int cid = 0; cid < number; cid++) {
-            for (int imid = 0; imid < imsize;
-                 imid++, iter_x_data++, iter_y_data++) {
-              T val = iter_x_data[0];
-              if (bias_data) val -= bias_data[gid * group_size + cid];
-              T dval = iter_y_data[0];
-              dp_scale += val * dval;
-              dp_bias += dval * scale_data[gid * group_size + cid];
-
-              if (scale_data && scale_data[gid * group_size + cid] != 0)
-                val /= scale_data[gid * group_size + cid];
-              if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
-              if (d_scale_data)
-                d_scale_data[gid * group_size + cid] += val * dval;
-            }
-          }
-
-          for (int cid = 0; cid < number; cid++) {
-            for (int imid = 0; imid < imsize;
-                 imid++, iter_d_x_data++, tmp_x++, tmp_y++) {
-              T v_y = tmp_x[0];
-              T dly = tmp_y[0];
-              T dss = dp_scale;
-              T dbs = dp_bias;
-              T v_scale = scale_data[gid * group_size + cid];
-              T v_bias = bias_data[gid * group_size + cid];
-              v_y -= v_bias;
-              if (v_scale != 0) v_y /= v_scale;
-              iter_d_x_data[0] =
-                  (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) *
-                  var_inv;
-            }
-          }
-        } else {
-          for (int cid = 0; cid < number; cid++) {
-            iter_x_data = x_src_data + cid;
-            iter_y_data = y_src_data + cid;
-            for (int imid = 0; imid < imsize;
-                 imid++, iter_x_data += C, iter_y_data += C) {
-              T val = iter_x_data[0];
-              if (bias_data) val -= bias_data[gid * group_size + cid];
-              T dval = iter_y_data[0];
-              dp_scale += val * dval;
-              dp_bias += dval * scale_data[gid * group_size + cid];
-
-              if (scale_data && scale_data[gid * group_size + cid] != 0)
-                val /= scale_data[gid * group_size + cid];
-              if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
-              if (d_scale_data)
-                d_scale_data[gid * group_size + cid] += val * dval;
-            }
-          }
-
-          for (int cid = 0; cid < number; cid++) {
-            tmp_x = x_src_data + cid;
-            tmp_y = y_src_data + cid;
-            iter_d_x_data = tmp_d_x + cid;
-            for (int imid = 0; imid < imsize;
-                 imid++, iter_d_x_data += C, tmp_x += C, tmp_y += C) {
-              T v_y = tmp_x[0];
-              T dly = tmp_y[0];
-              T dss = dp_scale;
-              T dbs = dp_bias;
-              T v_scale = scale_data[gid * group_size + cid];
-              T v_bias = bias_data[gid * group_size + cid];
-              v_y -= v_bias;
-              if (v_scale != 0) v_y /= v_scale;
-              iter_d_x_data[0] =
-                  (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) *
-                  var_inv;
-            }
-          }
-          iter_x_data = iter_x_data_backup + group_size;
-          iter_y_data = iter_y_data_backup + group_size;
-          iter_d_x_data = iter_d_x_data_backup + group_size;
-        }
-      }
-      if (data_layout == DataLayout::kNHWC) {
-        iter_x_data = x_data + (bid + 1) * C * imsize;
-        iter_d_x_data = d_x_data + (bid + 1) * C * imsize;
-        iter_y_data = y_data + (bid + 1) * C * imsize;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
deleted file mode 100644
index 9797160c9622671a820e2b9872305745df176979..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gru_op.cc
+++ /dev/null
@@ -1,390 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gru_op.h"
-#include <string>
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
-#include "paddle/fluid/operators/math/detail/gru_kernel.h"
-
-DECLARE_int32(paddle_num_threads);
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class GRUOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(%s) of GRUOp should not be null.", "Input");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(%s) of GRUOp should not be null.", "Weight");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
-                   "Output(%s) of GRUOp should not be null.", "BatchGate");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
-                   "Output(%s) of GRUOp should not be null.",
-                   "BatchResetHiddenPrev");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
-                   "Output(%s) of GRUOp should not be null.", "BatchHidden");
-    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                   "Output(%s) of GRUOp should not be null.", "Hidden");
-    auto input_dims = ctx->GetInputDim("Input");
-    auto weight_dims = ctx->GetInputDim("Weight");
-    int input_size = input_dims[1];
-    int frame_size = weight_dims[0];
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          input_size, frame_size * 3,
-          "The input_size must be 3 times of frame_size in GRUOp.");
-    }
-    PADDLE_ENFORCE_EQ(
-        weight_dims[1], frame_size * 3,
-        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    if (ctx->HasInput("H0")) {
-      auto h0_dims = ctx->GetInputDim("H0");
-      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
-                        "The width of H0 must be equal to frame_size.");
-    }
-    if (ctx->HasInput("Bias")) {
-      auto bias_dims = ctx->GetInputDim("Bias");
-      int bias_height = bias_dims[0];
-      int bias_width = bias_dims[1];
-      PADDLE_ENFORCE_EQ(bias_height, 1,
-                        "The shape of Bias must be [1, frame_size * 3].");
-      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
-                        "The shape of Bias must be [1, frame_size * 3].");
-    }
-    ctx->SetOutputDim("BatchGate", input_dims);
-    ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
-    ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
-    ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
-    ctx->ShareLoD("Input", "Hidden");
-  }
-};
-
-class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(LoDTensor) The first input is a LodTensor, which supports "
-             "variable-time length input sequence. The underlying tensor in "
-             "this LoDTenosr is a matrix with shape (T X 3D), where, T is the "
-             "total time steps in this mini-batch, D is the hidden size.");
-    AddInput("H0",
-             "(Tensor, optional) The initial hidden state is an optional "
-             "input. This is a tensor with shape (N x D), where N is the "
-             "batch size, D is the hidden size.")
-        .AsDispensable();
-    AddInput(
-        "Weight",
-        "(Tensor) The learnable hidden-hidden weight matrix with shape "
-        "(D x 3D), where D is the hidden size. The elements continuous in "
-        "memory can be divided into two parts. The first part are weights of "
-        "the update gate and reset gate with shape (D x 2D), and the second "
-        "part are weights of output candidate with shape (D x D).");
-    AddInput("Bias",
-             "(Tensor, optional) Bias vector with shape (1 x 3D) concating "
-             "bias of the update gate, reset gate and output candidate.")
-        .AsDispensable();
-    AddOutput("BatchGate",
-              "(LoDTensor) To compute with batches, sequence data will be "
-              "reorganized into several successive batches each containing "
-              "data from the same time step. The LoDTensor BatchGate contains "
-              "the update gate, reset gate and output candidate values "
-              "organized in batches. The LoD size is 2. The first LoD contains "
-              "the batch offsets and the second LoD contains the indexes in "
-              "the raw sequence data.")
-        .AsIntermediate();
-    AddOutput(
-        "BatchResetHiddenPrev",
-        "(LoDTensor) The reseted hidden state LoDTensor organized in batches. "
-        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
-        "with `BatchGate`.")
-        .AsIntermediate();
-    AddOutput(
-        "BatchHidden",
-        "(LoDTensor) The hidden state LoDTensor organized in batches.  "
-        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
-        "with `BatchGate`.")
-        .AsIntermediate();
-    AddOutput(
-        "Hidden",
-        "(LoDTensor) the hidden state LoDTensor organized in sequences. "
-        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
-        "with `BatchGate`.");
-    AddAttr<std::string>("activation",
-                         "(string, default tanh) "
-                         "The activation type used for output candidate {h}_t.")
-        .SetDefault("tanh");
-    AddAttr<std::string>(
-        "gate_activation",
-        "(string, default sigmoid) "
-        "The activation type used in update gate and reset gate.")
-        .SetDefault("sigmoid");
-    AddAttr<bool>("is_reverse",
-                  "(bool, default: False) "
-                  "whether to compute reversed GRU.")
-        .SetDefault(false);
-    AddAttr<bool>("origin_mode",
-                  "bool"
-                  "use origin mode in article https://arxiv.org/abs/1412.3555")
-        .SetDefault(false);
-    AddComment(R"DOC(
-GRU Operator implements part calculations of the complete GRU as following:
-
-$$
-update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
-reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
-output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
-output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
-$$
-
-@note To implement the complete GRU, fully-connected operator must be used
-before to feed xu, xr and xc as the Input of GRU operator.
-)DOC");
-  }
-};
-
-class GRUGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(%s) of GRUGradOp should not be null.", "Input");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(%s) of GRUGradOp should not be null.", "Weight");
-    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
-                   "Input(%s) of GRUGradOp should not be null.", "BatchGate");
-    PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
-                   "Input(%s) of GRUGradOp should not be null.",
-                   "BatchResetHiddenPrev");
-    PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
-                   "Input(%s) of GRUOp should not be null.", "BatchHidden");
-    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
-                   "Input(%s) of GRUGradOp should not be null.", "Hidden");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
-                   "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");
-    auto input_dims = ctx->GetInputDim("Input");
-    auto weight_dims = ctx->GetInputDim("Weight");
-    int input_size = input_dims[1];
-    int frame_size = weight_dims[0];
-    int weight_height = weight_dims[0];
-    int weight_width = weight_dims[1];
-    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
-                      "The input_size must be 3 times of frame_size in GRUOp.");
-    PADDLE_ENFORCE_EQ(
-        weight_height, frame_size,
-        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    PADDLE_ENFORCE_EQ(
-        weight_width, frame_size * 3,
-        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    if (ctx->HasInput("H0")) {
-      auto h0_dims = ctx->GetInputDim("H0");
-      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
-                        "The width of H0 must be equal to frame_size.");
-      auto h0_grad_name = framework::GradVarName("H0");
-      if (ctx->HasOutput(h0_grad_name))
-        ctx->SetOutputDim(h0_grad_name, h0_dims);
-    }
-    if (ctx->HasInput("Bias")) {
-      auto bias_dims = ctx->GetInputDim("Bias");
-      int bias_height = bias_dims[0];
-      int bias_width = bias_dims[1];
-      PADDLE_ENFORCE_EQ(bias_height, 1,
-                        "The shape of Bias must be [1, frame_size * 3].");
-      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
-                        "The shape of Bias must be [1, frame_size * 3].");
-      auto bias_grad_name = framework::GradVarName("Bias");
-      if (ctx->HasOutput(bias_grad_name))
-        ctx->SetOutputDim(bias_grad_name, bias_dims);
-    }
-    auto input_grad_name = framework::GradVarName("Input");
-    if (ctx->HasOutput(input_grad_name))
-      ctx->SetOutputDim(input_grad_name, input_dims);
-    auto weight_grad_name = framework::GradVarName("Weight");
-    if (ctx->HasOutput(weight_grad_name))
-      ctx->SetOutputDim(weight_grad_name, weight_dims);
-  }
-};
-
-template <typename T>
-class GRUCPUKernel : public framework::OpKernel<T> {
- public:
-  void BatchCompute(const framework::ExecutionContext& context) const {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-    bool origin_mode = context.Attr<bool>("origin_mode");
-    auto* input = context.Input<LoDTensor>("Input");
-    auto* h0 = context.Input<Tensor>("H0");
-    auto* weight = context.Input<Tensor>("Weight");
-    const T* weight_data = weight->data<T>();
-    auto* bias = context.Input<Tensor>("Bias");
-    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
-    batch_gate->mutable_data<T>(context.GetPlace());
-    auto* batch_reset_hidden_prev =
-        context.Output<LoDTensor>("BatchResetHiddenPrev");
-    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
-    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
-    batch_hidden->mutable_data<T>(context.GetPlace());
-    auto* hidden = context.Output<LoDTensor>("Hidden");
-    hidden->mutable_data<T>(context.GetPlace());
-
-    auto hidden_dims = hidden->dims();
-
-    bool is_reverse = context.Attr<bool>("is_reverse");
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
-
-    if (bias) {
-      math::RowwiseAdd<DeviceContext, T> add_bias;
-      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
-    }
-
-    int frame_size = hidden_dims[1];
-    math::GRUMetaValue<T> gru_value;
-    gru_value.gate_weight = const_cast<T*>(weight_data);
-    gru_value.state_weight =
-        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
-    Tensor ordered_h0;
-
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (h0) {
-      // Since the batch computing for GRU reorders the input sequences
-      // according to their length. The initialized cell state also needs
-      // to reorder.
-      ReorderInitState<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), *h0, order,
-          &ordered_h0, true);
-      gru_value.prev_out_value = ordered_h0.data<T>();
-    } else {
-      gru_value.prev_out_value = nullptr;
-    }
-    auto batch_starts = batch_gate->lod()[0];
-    size_t seq_len = batch_starts.size() - 1;
-    auto active_node = math::detail::GetActivationType(
-        context.Attr<std::string>("activation"));
-    auto active_gate = math::detail::GetActivationType(
-        context.Attr<std::string>("gate_activation"));
-
-#ifdef PADDLE_WITH_MKLML
-    // use MKL packed to speedup GEMM
-    if (FLAGS_paddle_num_threads >= 4) {
-      auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-      T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
-                                       frame_size * 2 /*width of weight*/,
-                                       frame_size /*height of height*/);
-      PADDLE_ENFORCE(packed_gate);
-      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
-                     frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
-                     packed_gate);
-      T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
-                                        frame_size /*width of weight*/,
-                                        frame_size /*height of height*/);
-      PADDLE_ENFORCE(packed_state);
-      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
-                     frame_size, T(1.0), gru_value.state_weight, frame_size,
-                     packed_state);
-      for (size_t n = 0; n < seq_len; n++) {
-        int bstart = static_cast<int>(batch_starts[n]);
-        int bend = static_cast<int>(batch_starts[n + 1]);
-        int cur_batch_size = bend - bstart;
-
-        Tensor gate_t = batch_gate->Slice(bstart, bend);
-        Tensor reset_hidden_prev_t =
-            batch_reset_hidden_prev->Slice(bstart, bend);
-        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        gru_value.output_value = hidden_t.data<T>();
-        gru_value.gate_value = gate_t.data<T>();
-        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-
-        if (gru_value.prev_out_value) {
-          blas.GEMM_COMPUTE(
-              CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
-              frame_size, gru_value.prev_out_value, frame_size, packed_gate,
-              frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
-        }
-
-        math::detail::forward_reset_output(
-            math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_gate);
-
-        if (gru_value.prev_out_value) {
-          blas.GEMM_COMPUTE(
-              CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
-              gru_value.reset_output_value, frame_size, packed_state,
-              frame_size, T(1), gru_value.gate_value + frame_size * 2,
-              frame_size * 3);
-        }
-
-        math::detail::forward_final_output(
-            math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_node, origin_mode);
-
-        gru_value.prev_out_value = gru_value.output_value;
-      }
-
-      blas.GEMM_FREE(packed_gate);
-      blas.GEMM_FREE(packed_state);
-    } else {
-#endif
-      for (size_t n = 0; n < seq_len; n++) {
-        int bstart = static_cast<int>(batch_starts[n]);
-        int bend = static_cast<int>(batch_starts[n + 1]);
-        int cur_batch_size = bend - bstart;
-
-        Tensor gate_t = batch_gate->Slice(bstart, bend);
-        Tensor reset_hidden_prev_t =
-            batch_reset_hidden_prev->Slice(bstart, bend);
-        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        gru_value.output_value = hidden_t.data<T>();
-        gru_value.gate_value = gate_t.data<T>();
-        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-
-        math::GRUUnitFunctor<DeviceContext, T>::compute(
-            dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-            active_gate, origin_mode);
-
-        gru_value.prev_out_value = gru_value.output_value;
-      }
-#ifdef PADDLE_WITH_MKLML
-    }
-#endif
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden->set_lod(batch_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, hidden);
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    BatchCompute(context);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(gru_grad, ops::GRUGradOp);
-REGISTER_OP_CPU_KERNEL(gru, ops::GRUCPUKernel<float>,
-                       ops::GRUCPUKernel<double>);
-REGISTER_OP_CPU_KERNEL(
-    gru_grad, ops::GRUGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GRUGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc
deleted file mode 100644
index ba918b3def22e3c60c4155f77ecbaad85d520928..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gru_op.cu.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gru_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class GRUKernel : public framework::OpKernel<T> {
- public:
-  void BatchCompute(const framework::ExecutionContext& context) const {
-    bool origin_mode = context.Attr<bool>("origin_mode");
-    auto* input = context.Input<LoDTensor>("Input");
-    auto* h0 = context.Input<Tensor>("H0");
-    auto* weight = context.Input<Tensor>("Weight");
-    const T* weight_data = weight->data<T>();
-    auto* bias = context.Input<Tensor>("Bias");
-    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
-    batch_gate->mutable_data<T>(context.GetPlace());
-    auto* batch_reset_hidden_prev =
-        context.Output<LoDTensor>("BatchResetHiddenPrev");
-    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
-    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
-    batch_hidden->mutable_data<T>(context.GetPlace());
-    auto* hidden = context.Output<LoDTensor>("Hidden");
-    hidden->mutable_data<T>(context.GetPlace());
-
-    auto hidden_dims = hidden->dims();
-
-    bool is_reverse = context.Attr<bool>("is_reverse");
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
-
-    if (bias) {
-      math::RowwiseAdd<DeviceContext, T> add_bias;
-      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
-    }
-
-    int frame_size = hidden_dims[1];
-    math::GRUMetaValue<T> gru_value;
-    gru_value.gate_weight = const_cast<T*>(weight_data);
-    gru_value.state_weight =
-        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
-    Tensor ordered_h0;
-
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (h0) {
-      // Since the batch computing for GRU reorders the input sequences
-      // according to their length. The initialized cell state also needs
-      // to reorder.
-      ReorderInitState<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), *h0, order,
-          &ordered_h0, true);
-      gru_value.prev_out_value = ordered_h0.data<T>();
-    } else {
-      gru_value.prev_out_value = nullptr;
-    }
-    auto batch_starts = batch_gate->lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto active_node = math::detail::GetActivationType(
-        context.Attr<std::string>("activation"));
-    auto active_gate = math::detail::GetActivationType(
-        context.Attr<std::string>("gate_activation"));
-    for (size_t n = 0; n < num_batch; n++) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-      int cur_batch_size = bend - bstart;
-
-      Tensor gate_t = batch_gate->Slice(bstart, bend);
-      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
-      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-      gru_value.output_value = hidden_t.data<T>();
-      gru_value.gate_value = gate_t.data<T>();
-      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-      math::GRUUnitFunctor<DeviceContext, T>::compute(
-          dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-          active_gate, origin_mode);
-      gru_value.prev_out_value = gru_value.output_value;
-    }
-
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden->set_lod(batch_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, hidden);
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    BatchCompute(context);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    gru, ops::GRUKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GRUKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    gru_grad, ops::GRUGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GRUGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
deleted file mode 100644
index bcca992e2b426677e32d2c82e853d79534d114a6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gru_op.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sequence2batch.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src,
-                             framework::Vector<size_t> index_lod,
-                             framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
-  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, dst, indexed_src);
-}
-
-template <typename DeviceContext, typename T>
-class GRUGradKernel : public framework::OpKernel<T> {
- public:
-  void BatchCompute(const framework::ExecutionContext& context) const {
-    bool origin_mode = context.Attr<bool>("origin_mode");
-    auto* h0 = context.Input<Tensor>("H0");
-    auto* weight = context.Input<Tensor>("Weight");
-    const T* weight_data = weight->data<T>();
-    auto* batch_gate = context.Input<LoDTensor>("BatchGate");
-    auto* batch_reset_hidden_prev =
-        context.Input<LoDTensor>("BatchResetHiddenPrev");
-    auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
-    auto* hidden = context.Input<LoDTensor>("Hidden");
-    auto* hidden_grad =
-        context.Input<LoDTensor>(framework::GradVarName("Hidden"));
-    auto* input_grad =
-        context.Output<LoDTensor>(framework::GradVarName("Input"));
-    auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
-    auto* weight_grad =
-        context.Output<Tensor>(framework::GradVarName("Weight"));
-    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
-
-    auto gate_dims = batch_gate->dims();
-    auto hidden_dims = hidden->dims();
-    int frame_size = hidden_dims[1];
-
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
-    batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
-    batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
-    batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
-                                                 context.GetPlace());
-    math::SetConstant<DeviceContext, T> zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    zero(dev_ctx, &batch_hidden_grad, static_cast<T>(0.0));
-    zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
-    zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
-
-    Tensor ordered_h0, ordered_h0_grad;
-
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (h0) {
-      ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0,
-                                         true);
-    }
-    if (h0_grad) {
-      ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
-      zero(context.template device_context<DeviceContext>(), &ordered_h0_grad,
-           static_cast<T>(0.0));
-    }
-
-    bool is_reverse = context.Attr<bool>("is_reverse");
-    batch_hidden_grad.set_lod(batch_hidden->lod());
-    to_batch(dev_ctx, *hidden_grad, &batch_hidden_grad, false, is_reverse);
-
-    math::GRUMetaValue<T> gru_value;
-    gru_value.gate_weight = const_cast<T*>(weight_data);
-    gru_value.state_weight =
-        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
-
-    math::GRUMetaGrad<T> gru_grad;
-    if (weight_grad) {
-      gru_grad.gate_weight_grad =
-          weight_grad->mutable_data<T>(context.GetPlace());
-      zero(dev_ctx, weight_grad, static_cast<T>(0.0));
-      gru_grad.state_weight_grad =
-          weight_grad->data<T>() + 2 * frame_size * frame_size;
-    } else {
-      gru_grad.gate_weight_grad = nullptr;
-      gru_grad.state_weight_grad = nullptr;
-    }
-
-    auto batch_starts = batch_hidden_grad.lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto active_node = math::detail::GetActivationType(
-        context.Attr<std::string>("activation"));
-    auto active_gate = math::detail::GetActivationType(
-        context.Attr<std::string>("gate_activation"));
-    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-      int cur_batch_size = bend - bstart;
-
-      Tensor gate_t = batch_gate->Slice(bstart, bend);
-      gru_value.gate_value = gate_t.data<T>();
-      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
-      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-
-      Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
-      gru_grad.output_grad = hidden_grad_t.data<T>();
-      Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
-      gru_grad.gate_grad = gate_grad_t.data<T>();
-      Tensor reset_hidden_prev_grad_t =
-          batch_reset_hidden_prev_grad.Slice(bstart, bend);
-      gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data<T>();
-      if (n == 0) {
-        gru_value.prev_out_value = h0 ? ordered_h0.data<T>() : nullptr;
-        gru_grad.prev_out_grad =
-            h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
-      } else {
-        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
-        Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
-        gru_value.prev_out_value = hidden_prev_t.data<T>();
-        Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
-        gru_grad.prev_out_grad = hidden_prev_grad_t.data<T>();
-      }
-      gru_value.output_value = nullptr;
-      math::GRUUnitGradFunctor<DeviceContext, T>::compute(
-          dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node,
-          active_gate, origin_mode);
-    }
-    if (input_grad) {
-      input_grad->mutable_data<T>(context.GetPlace());
-      math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-      batch_gate_grad.set_lod(batch_gate->lod());
-      to_seq(dev_ctx, batch_gate_grad, input_grad);
-    }
-    if (bias_grad) {
-      bias_grad->mutable_data<T>(context.GetPlace());
-      math::ColwiseSum<DeviceContext, T> col_sum;
-      col_sum(dev_ctx, batch_gate_grad, bias_grad);
-    }
-    if (h0 && h0_grad) {
-      ReorderInitState<DeviceContext, T>(dev_ctx, ordered_h0_grad, order,
-                                         h0_grad, false);
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    BatchCompute(context);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc
deleted file mode 100644
index e3beedcf10b6286c92371c48cae7912aef35e7a3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gru_unit_op.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class GRUUnitOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(%s) of GRUUnitOp should not be null.", "Input");
-    PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"),
-                   "Input(%s) of GRUUnitOp should not be null.", "HiddenPrev");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(%s) of GRUUnitOp should not be null.", "Weight");
-    PADDLE_ENFORCE(ctx->HasOutput("Gate"),
-                   "Output(%s) of GRUUnitOp should not be null.", "Gate");
-    PADDLE_ENFORCE(ctx->HasOutput("ResetHiddenPrev"),
-                   "Output(%s) of GRUUnitOp should not be null.",
-                   "ResetHiddenPrev");
-    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                   "Output(%s) of GRUUnitOp should not be null.", "Hidden");
-    auto input_dims = ctx->GetInputDim("Input");
-    auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
-    auto weight_dims = ctx->GetInputDim("Weight");
-    int batch_size = input_dims[0];
-    int input_size = input_dims[1];
-    int frame_size = hidden_prev_dims[1];
-    int weight_height = weight_dims[0];
-    int weight_width = weight_dims[1];
-    PADDLE_ENFORCE_EQ(
-        input_size, frame_size * 3,
-        "The input_size must be 3 times of frame_size in GRUUnitOp.");
-    PADDLE_ENFORCE_EQ(
-        weight_height, frame_size,
-        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    PADDLE_ENFORCE_EQ(
-        weight_width, frame_size * 3,
-        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    if (ctx->HasInput("Bias")) {
-      auto bias_dims = ctx->GetInputDim("Bias");
-      int bias_height = bias_dims[0];
-      int bias_width = bias_dims[1];
-      PADDLE_ENFORCE_EQ(bias_height, 1,
-                        "The shape of Bias must be [1, frame_size * 3].");
-      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
-                        "The shape of Bias must be [1, frame_size * 3].");
-    }
-    ctx->SetOutputDim("Gate", {batch_size, frame_size * 3});
-    ctx->SetOutputDim("ResetHiddenPrev", {batch_size, frame_size});
-    ctx->SetOutputDim("Hidden", {batch_size, frame_size});
-  }
-};
-
-class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
-             "input.");
-    AddInput("HiddenPrev",
-             "(Tensor) Matrix with shape [batch_size, frame_size] for the "
-             "states of previous time step.");
-    AddInput(
-        "Weight",
-        "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. "
-        "The elements continuous in memory can be divided into two parts. "
-        "The first part are weights of the update gate and reset gate "
-        "with shape [frame_size, frame_size * 2], and the second part are "
-        "weights of output candidate with shape [frame_size, frame_size].");
-    AddInput(
-        "Bias",
-        "(Tensor) Bias vector with shape [1, frame_size * 3] concatenating "
-        "bias of the update gate, reset gate and output candidate.")
-        .AsDispensable();
-    AddOutput("Gate",
-              "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
-              "output of update gate, reset gate and output candidate.")
-        .AsIntermediate();
-    AddOutput("ResetHiddenPrev",
-              "(Tensor) Matrix with shape [batch_size, frame_size] for the "
-              "reseted hidden state of previous time step.")
-        .AsIntermediate();
-    AddOutput("Hidden",
-              "(Tensor) The GRU hidden state of the current time step "
-              "with shape [batch_size, frame_size].");
-    AddAttr<int>("activation",
-                 "(enum int, default tanh) "
-                 "The activation type used for output candidate {h}_t.")
-        .SetDefault(tanh)
-        .InEnum({identity, sigmoid, tanh, relu});
-    AddAttr<int>("gate_activation",
-                 "(enum int, default sigmoid) "
-                 "The activation type used in update gate and reset gate.")
-        .SetDefault(sigmoid)
-        .InEnum({identity, sigmoid, tanh, relu});
-    AddAttr<bool>("origin_mode",
-                  "bool"
-                  "use origin mode in article <Learning Phrase Representations "
-                  "using RNN Encoder–Decoder\n"
-                  "for Statistical Machine "
-                  "Translation>(https://arxiv.org/pdf/1406.1078.pdf)")
-        .SetDefault(false);
-    AddComment(R"DOC(
-GRUUnit Operator implements partial calculations of the GRU unit as following:
-
-$$
-update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
-reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
-output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
-output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
-$$
-
-which is same as one time step of GRU Operator.
-
-@note To implement the complete GRU unit, fully-connected operator must be
-used before to feed xu, xr and xc as the Input of GRUUnit operator.
-
-)DOC");
-  }
-};
-
-class GRUUnitGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(%s) of GRUUnitGradOp should not be null.", "Input");
-    PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"),
-                   "Input(%s) of GRUUnitGradOp should not be null.",
-                   "HiddenPrev");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(%s) of GRUUnitGradOp should not be null.", "Weight");
-    PADDLE_ENFORCE(ctx->HasInput("Gate"),
-                   "Input(%s) of GRUUnitGradOp should not be null.", "Gate");
-    PADDLE_ENFORCE(ctx->HasInput("ResetHiddenPrev"),
-                   "Input(%s) of GRUUnitGradOp should not be null.",
-                   "ResetHiddenPrev");
-    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
-                   "Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
-                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
-                   "Hidden");
-    auto input_dims = ctx->GetInputDim("Input");
-    auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
-    auto weight_dims = ctx->GetInputDim("Weight");
-    // int batch_size = input_dims[0];
-    int input_size = input_dims[1];
-    int frame_size = hidden_prev_dims[1];
-    int weight_height = weight_dims[0];
-    int weight_width = weight_dims[1];
-    PADDLE_ENFORCE_EQ(
-        input_size, frame_size * 3,
-        "The input_size must be 3 times of frame_size in GRUUnitOp.");
-    PADDLE_ENFORCE_EQ(
-        weight_height, frame_size,
-        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    PADDLE_ENFORCE_EQ(
-        weight_width, frame_size * 3,
-        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    if (ctx->HasInput("Bias")) {
-      auto bias_dims = ctx->GetInputDim("Bias");
-      int bias_height = bias_dims[0];
-      int bias_width = bias_dims[1];
-      PADDLE_ENFORCE_EQ(bias_height, 1,
-                        "The shape of Bias must be [1, frame_size * 3].");
-      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
-                        "The shape of Bias must be [1, frame_size * 3].");
-      auto bias_grad_name = framework::GradVarName("Bias");
-      if (ctx->HasOutput(bias_grad_name))
-        ctx->SetOutputDim(bias_grad_name, bias_dims);
-    }
-    auto input_grad_name = framework::GradVarName("Input");
-    if (ctx->HasOutput(input_grad_name))
-      ctx->SetOutputDim(input_grad_name, input_dims);
-    auto hidden_prev_grad_name = framework::GradVarName("HiddenPrev");
-    if (ctx->HasOutput(hidden_prev_grad_name))
-      ctx->SetOutputDim(hidden_prev_grad_name, hidden_prev_dims);
-    auto weight_grad_name = framework::GradVarName("Weight");
-    if (ctx->HasOutput(weight_grad_name))
-      ctx->SetOutputDim(weight_grad_name, weight_dims);
-  }
-};
-
-class GRUUnitGradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType("gru_unit_grad");
-
-    op->SetInput("Input", Input("Input"));
-    op->SetInput("HiddenPrev", Input("HiddenPrev"));
-    op->SetInput("Weight", Input("Weight"));
-    op->SetInput("Bias", Input("Bias"));
-
-    op->SetInput("Hidden", Output("Hidden"));
-    op->SetInput("Gate", Output("Gate"));
-    op->SetInput("ResetHiddenPrev", Output("ResetHiddenPrev"));
-    op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden"));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    op->SetOutput(framework::GradVarName("HiddenPrev"),
-                  InputGrad("HiddenPrev"));
-    op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
-    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker,
-                  ops::GRUUnitGradOpMaker);
-REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    gru_unit, ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    gru_unit_grad,
-    ops::GRUUnitGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GRUUnitGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu
deleted file mode 100644
index 37689901ecbeeda44f52a2fc7a686f4edf6682bb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gru_unit_op.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/gru_unit_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    gru_unit, ops::GRUUnitKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GRUUnitKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    gru_unit_grad,
-    ops::GRUUnitGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GRUUnitGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
deleted file mode 100644
index 712ef05d8631ac74b92795321202cb5590286e82..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gru_unit_op.h
+++ /dev/null
@@ -1,253 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
-
-template <typename DeviceContext, typename T>
-class GRUUnitKernel : public framework::OpKernel<T> {
- public:
-  template <typename Device, typename X, typename Y>
-  void ActCompute(const int act_type, const Device& d, X x, Y y) const {
-    if (act_type == identity)
-      y.device(d) = x;
-    else if (act_type == sigmoid)
-      SigmoidFunctor<T>()(d, x, y);
-    else if (act_type == tanh)
-      TanhFunctor<T>()(d, x, y);
-    else if (act_type == relu)
-      ReluFunctor<T>()(d, x, y);
-    else
-      PADDLE_THROW("unsupported activation type");
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("Input");
-    auto* hidden_prev = context.Input<Tensor>("HiddenPrev");
-    auto* weight = context.Input<Tensor>("Weight");
-    auto* bias = context.Input<Tensor>("Bias");
-    auto* gate = context.Output<Tensor>("Gate");
-    gate->mutable_data<T>(context.GetPlace());
-    auto* reset_hidden_prev = context.Output<Tensor>("ResetHiddenPrev");
-    reset_hidden_prev->mutable_data<T>(context.GetPlace());
-    auto* hidden = context.Output<Tensor>("Hidden");
-    hidden->mutable_data<T>(context.GetPlace());
-
-    int batch_size = input->dims()[0];
-    int frame_size = hidden_prev->dims()[1];
-
-    auto x = EigenMatrix<T>::From(*input);
-    auto h_p = EigenMatrix<T>::From(*hidden_prev);
-    auto g = EigenMatrix<T>::From(*gate);
-    auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
-    auto h = EigenMatrix<T>::From(*hidden);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    // calculate unactivated gate outputs
-    if (bias) {
-      auto b = EigenMatrix<T>::From(*bias);
-      g.device(place) = x +
-                        b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
-                            .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
-    } else {
-      g.device(place) = x;
-    }
-    const T* hidden_prev_data = hidden_prev->data<T>();
-    const T* weight_data = weight->data<T>();
-    T* gate_data = gate->data<T>();
-    T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-    blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1,
-              hidden_prev_data, frame_size, weight_data, frame_size * 2, 1,
-              gate_data, frame_size * 3);
-
-    // calculate activited gate
-    Eigen::array<int, 2> extents{{batch_size, frame_size}};
-    Eigen::array<int, 2> u_offsets{{0, 0}};
-    ActCompute(context.Attr<int>("gate_activation"), place,
-               g.slice(u_offsets, extents), g.slice(u_offsets, extents));
-    auto u = g.slice(u_offsets, extents);  // update gate
-    Eigen::array<int, 2> r_offsets{{0, frame_size}};
-    ActCompute(context.Attr<int>("gate_activation"), place,
-               g.slice(r_offsets, extents), g.slice(r_offsets, extents));
-    auto r = g.slice(r_offsets, extents);  // reset gate
-    r_h_p.device(place) = r * h_p;         // reset previous hidden state
-    blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
-              reset_hidden_prev_data, frame_size,
-              weight_data + frame_size * frame_size * 2, frame_size, 1,
-              gate_data + frame_size * 2, frame_size * 3);
-
-    Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
-    ActCompute(context.Attr<int>("activation"), place,
-               g.slice(c_offsets, extents), g.slice(c_offsets, extents));
-    auto c = g.slice(c_offsets, extents);  // output candidate
-
-    // calculate final output
-    if (context.Attr<bool>("origin_mode")) {
-      h.device(place) = c + u * (h_p - c);  // (1 - u) * c + u * h_p
-    } else {
-      h.device(place) = u * (c - h_p) + h_p;  // u * c + (1 - u) * h_p
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GRUUnitGradKernel : public framework::OpKernel<T> {
- public:
-  template <typename Device, typename X, typename Y, typename DX, typename DY>
-  void ActGradCompute(const int act_type, const Device& d, X x, Y y, DX dx,
-                      DY dy) const {
-    // x is dummy and won't be used even in Relu(use y instead)
-    if (act_type == identity)
-      dx.device(d) = dy;
-    else if (act_type == sigmoid)
-      SigmoidGradFunctor<T>()(d, x, y, dy, dx);
-    else if (act_type == tanh)
-      TanhGradFunctor<T>()(d, x, y, dy, dx);
-    else if (act_type == relu)
-      ReluGradFunctor<T>()(d, x, y, dy, dx);
-    else
-      PADDLE_THROW("unsupported activation type");
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("Input");
-    auto* hidden_prev = context.Input<Tensor>("HiddenPrev");
-    auto* weight = context.Input<Tensor>("Weight");
-    auto* gate = context.Input<Tensor>("Gate");
-    auto* reset_hidden_prev = context.Input<Tensor>("ResetHiddenPrev");
-    auto* hidden_grad = context.Input<Tensor>(framework::GradVarName("Hidden"));
-    auto* input_grad = context.Output<Tensor>(framework::GradVarName("Input"));
-    auto* hidden_prev_grad =
-        context.Output<Tensor>(framework::GradVarName("HiddenPrev"));
-    auto* weight_grad =
-        context.Output<Tensor>(framework::GradVarName("Weight"));
-    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
-    Tensor gate_grad;
-    Tensor reset_hidden_prev_grad;
-
-    const T* hidden_prev_data = hidden_prev->data<T>();
-    const T* weight_data = weight->data<T>();
-    T* gate_grad_data =
-        gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
-    const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
-        reset_hidden_prev->dims(), context.GetPlace());
-
-    auto h_p = EigenMatrix<T>::From(*hidden_prev);
-    auto g = EigenMatrix<T>::From(*gate);
-    auto d_h = EigenMatrix<T>::From(*hidden_grad);
-    auto d_g = EigenMatrix<T>::From(gate_grad);
-    auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    int batch_size = input->dims()[0];
-    int frame_size = hidden_prev->dims()[1];
-
-    Eigen::array<int, 2> extents{{batch_size, frame_size}};
-    Eigen::array<int, 2> u_offsets{{0, 0}};
-    auto u = g.slice(u_offsets, extents);  // update gate
-    Eigen::array<int, 2> r_offsets{{0, frame_size}};
-    auto r = g.slice(r_offsets, extents);  // reset gate
-    Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
-    auto c = g.slice(c_offsets, extents);  // output candidate
-
-    // backward for unactivated update gate
-    if (context.Attr<bool>("origin_mode")) {
-      ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
-                     d_g.slice(u_offsets, extents), d_h * (h_p - c));
-      // backward for unactivated output candidate
-      ActGradCompute(context.Attr<int>("activation"), place, c, c,
-                     d_g.slice(c_offsets, extents), d_h * (1 - u));
-    } else {
-      ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
-                     d_g.slice(u_offsets, extents), d_h * (c - h_p));
-      // backward for unactivated output candidate
-      ActGradCompute(context.Attr<int>("activation"), place, c, c,
-                     d_g.slice(c_offsets, extents), d_h * u);
-    }
-    // backward for reset_hidden_prev
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-    blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
-              gate_grad_data + frame_size * 2, frame_size * 3,
-              weight_data + frame_size * frame_size * 2, frame_size, 0,
-              reset_hidden_prev_grad_data, frame_size);
-    // backward for unactivated reset gate
-    ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
-                   d_g.slice(r_offsets, extents), d_r_h_p * h_p);
-    // backward for weight
-    if (weight_grad) {
-      T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
-      // backward for state_weight
-      blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
-                reset_hidden_prev_data, frame_size,
-                gate_grad_data + frame_size * 2, frame_size * 3, 0,
-                weight_grad_data + frame_size * frame_size * 2, frame_size);
-
-      // backward for update_gate_weight and reset_gate_weight
-      blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
-                hidden_prev_data, frame_size, gate_grad_data, frame_size * 3, 0,
-                weight_grad_data, frame_size * 2);
-    }
-    // backward for hidden_prev
-    if (hidden_prev_grad) {
-      T* hidden_prev_grad_data =
-          hidden_prev_grad->mutable_data<T>(context.GetPlace());
-      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
-      if (context.Attr<bool>("origin_mode")) {
-        d_h_p.device(place) = d_r_h_p * r + d_h * u;
-      } else {
-        d_h_p.device(place) = d_r_h_p * r + d_h * (1 - u);
-      }
-      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
-                gate_grad_data, frame_size * 3, weight_data, frame_size * 2, 1,
-                hidden_prev_grad_data, frame_size);
-    }
-    // backward for input
-    if (input_grad) {
-      input_grad->mutable_data<T>(context.GetPlace());
-      auto d_x = EigenMatrix<T>::From(*input_grad);
-      d_x.device(place) = d_g;
-    }
-    // backward for bias
-    if (bias_grad) {
-      bias_grad->mutable_data<T>(context.GetPlace());
-      auto d_b = EigenVector<T>::Flatten(*bias_grad);
-      d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
deleted file mode 100644
index 5ef91dcb66638d5786e9769802bfc3790ffc6079..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/hash_op.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/hash_op.h"
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class HashOp : public framework::OperatorWithKernel {
- public:
-  HashOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of HashOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of HashOp should not be null.");
-
-    auto dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(dims.size(), 2UL,
-                      "The input of hash_op's dimensions must be 2");
-    std::vector<int64_t> out_dims;
-    int num_hash = ctx->Attrs().Get<int>("num_hash");
-    HashOutputSize(dims, out_dims, num_hash);
-
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class HashOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input tensor of hash operator.");
-    AddOutput("Out", "(Tensor) Output tensor of hash operator.");
-    AddComment(R"DOC(
-        Execute `num_hash` times xxHash algorithm on all elements on second dimension of input. 
-)DOC");
-    AddAttr<int>("num_hash", "").SetDefault(1);
-    AddAttr<int64_t>("mod_by", "").SetDefault(100000);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
-                  "Skip calling InferShape() function in the runtime.")
-        .SetDefault(true);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker);
-REGISTER_OP_CPU_KERNEL(hash, ops::HashKernel<int>, ops::HashKernel<int64_t>);
diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h
deleted file mode 100644
index c2d530004912287b0720ab5d00da90c4e1b5cbc7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/hash_op.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-extern "C" {
-#include <xxhash.h>
-}
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-inline void HashOutputSize(const framework::DDim& in_dims,
-                           std::vector<int64_t>& out_dims,  // NOLINT
-                           int num_hash) {
-  out_dims.reserve(in_dims.size() + 1);
-  // copy all dims except the last one
-  for (int i = 0u; i != in_dims.size() - 1; ++i) {
-    out_dims.emplace_back(in_dims[i]);
-  }
-  out_dims.emplace_back(num_hash);
-  // keep the last dim to 1
-  out_dims.emplace_back(1);
-}
-
-template <typename T>
-class HashKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* out_t = context.Output<framework::LoDTensor>("Out");
-    auto* in_t = context.Input<framework::LoDTensor>("X");
-    int64_t mod_by = context.Attr<int64_t>("mod_by");
-    int num_hash = context.Attr<int>("num_hash");
-
-    auto in_dims = in_t->dims();
-
-    std::vector<int64_t> out_dims;
-    HashOutputSize(in_dims, out_dims, num_hash);
-    out_t->Resize(framework::make_ddim(out_dims));
-    auto* output = out_t->mutable_data<T>(context.GetPlace());
-
-    auto seq_length = in_dims[0];
-    auto last_dim = in_dims[in_dims.size() - 1];
-    auto* input = in_t->data<T>();
-    for (int idx = 0; idx < seq_length; ++idx) {
-      for (int ihash = 0; ihash != num_hash; ++ihash) {
-        output[idx * num_hash + ihash] =
-            XXH64(input, sizeof(T) * last_dim, ihash) % mod_by;
-      }
-      input += last_dim;
-    }
-
-    out_t->set_lod(in_t->lod());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
deleted file mode 100644
index 2b3e2e5c484a1f04c03f0c2482072f0452382aa1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ /dev/null
@@ -1,283 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
-#include <string>
-#include <vector>
-namespace paddle {
-namespace operators {
-
-/**
- * Organize the classes into a binary tree. At each node, a sigmoid function
- * is used to calculate the probability of belonging to the right branch.
- * This idea is from "F. Morin, Y. Bengio (AISTATS 05):
- * Hierarchical Probabilistic Neural Network Language Model."
- *
- * Here we uses a simple way of making the binary tree.
- * Assuming the number of classes C = 6,
- * The classes are organized as a binary tree in the following way:
- *
- * @code{.py}
- * *-*-*- 2
- * | | |- 3
- * | |
- * | |-*- 4
- * |   |- 5
- * |
- * |-*- 0
- *   |- 1
- * @endcode
- *
- * where * indicates an internal node, and each leaf node represents a class.
- * - Node 0 ... C-2 are internal nodes.
- * - Node C-1 ... 2C-2 are leaf nodes.
- * - Class c is represented by leaf node \f$c+C-1\f$.
- *
- * We assign an id for each node:
- * - the id of root be 0.
- * - the left child of a node i is 2*i+1.
- * - the right child of a node i is 2*i+2.
- *
- * It's easy to see that:
- * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
- * - the j-th level ancestor of node i is
- * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
- * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
- *
- */
-
-class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("PreOut"),
-                   "Output(PreOut) should not be null.");
-    auto with_prefetch = ctx->Attrs().Get<bool>("remote_prefetch");
-    if (with_prefetch) {
-      PADDLE_ENFORCE(ctx->HasOutput("W_Out"),
-                     "Output(W_Out) should not be null.");
-    }
-    const int64_t batch_size = ctx->GetInputDim("X")[0];
-    std::vector<int64_t> output_shape({batch_size, 1});
-    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-/*
- * Inputs: X, W, Label, PathTable, PathCode, Bias
- * Outputs: Out, PreOut, W_out
- */
-template <typename AttrType>
-class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, required) The input tensor with shape [N, D], "
-             "where N is the size of mini-batch, and D is the feature size.");
-    AddInput("W",
-             "(LoDTensor, required), The parameters of hierarchical "
-             "sigmoid operator, each of them is a 2-D tensor, the shape is"
-             "[K, D]. Which K is the num of non-leaf node in Path Tree");
-    AddInput("Label",
-             "(LoDTensor, required), The labels of training data. It's a"
-             "tensor with shape [N, 1].");
-    AddInput("PathTable",
-             "(LoDTensor, optional), The Path Table from root to current word"
-             "it should have shape like [N, L], L is the length of the Path")
-        .AsDispensable();
-    AddInput(
-        "PathCode",
-        "(LoDTensor, optional), The Code on each Node of the Path from root "
-        "to current word"
-        "it should have shape like [N, L], L is the length of the Path")
-        .AsDispensable();
-    AddInput("Bias",
-             "(LoDTensor, optional), The bias is a tensor with shape or "
-             "[num_classes, 1]"
-             "[num_classes - 1, 1].")
-        .AsDispensable();
-    AddOutput(
-        "Out",
-        "(LoDTensor, required) The output of hierarchical sigmoid operator."
-        "The shape is [N, 1].");
-    AddOutput("PreOut",
-              "(LoDTensor, required) A intermedia 2-D tensor with shape "
-              "[batch_size, code_length], where code_length represents the "
-              "maximum path length from root to leaf nodes.")
-        .AsIntermediate();
-    AddOutput(
-        "W_Out",
-        "(LoDTensor, optinal) using input 'W' as Output to make it mutable"
-        "When we are using prefetch")
-        .AsIntermediate();
-    AddAttr<AttrType>("num_classes", "(int, optional), The number of classes")
-        .SetDefault(2);
-    // for parameter prefetch
-    AddAttr<bool>("remote_prefetch", "").SetDefault(false);
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<int64_t>>("height_sections",
-                                  "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int64_t>({}));
-    AddAttr<std::vector<std::string>>(
-        "epmap",
-        "(string vector, default 127.0.0.1:6164)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "table_names",
-        "(string vector, the splited table names that will be fetched from "
-        "parameter server)"
-        "in the order of input variables for mapping")
-        .SetDefault({});
-    AddComment(R"DOC(
-The hierarchical sigmoid operator organize the classes into a binary tree.
-At each node, a sigmoid function is used to calculate the probability of
-belonging to the right branch. This idea is from
-"F. Morin, Y. Bengio (AISTATS 05):
-Hierarchical Probabilistic Neural Network Language Model."
-      )DOC");
-    AddAttr<bool>("is_sparse",
-                  "(boolean, default false) "
-                  "Sparse update.")
-        .SetDefault(false);
-  }
-};
-
-/*
- * Inputs: X, W, Label, PathTable, PathCode, PreOut, Out@GRAD
- * Outputs: X@GRAD, W@GRAD, Bias@GRAD
- */
-class HierarchicalSigmoidGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType(this->ForwardOpType() + "_grad");
-    // Inputs: X, W, Label, PathTable, PathCode, PreOut, Out@GRAD
-    op->SetInput("X", Input("X"));
-    op->SetInput("W", Input("W"));
-    op->SetInput("Bias", Input("Bias"));
-    op->SetInput("Label", Input("Label"));
-    op->SetInput("PathTable", Input("PathTable"));
-    op->SetInput("PathCode", Input("PathCode"));
-    op->SetInput("PreOut", Output("PreOut"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-
-    // Outputs: X@GRAD, W@GRAD, Bias@GRAD
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("W"), InputGrad("W"));
-    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-    op->SetAttrMap(Attrs());
-
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@Grad) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("PreOut"),
-                   "Input(Preout) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")),
-                   "Output(W@Grad should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@Grad should not be null.");
-
-    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
-    }
-    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class HierarchicalSigmoidGradOpGradVarTypeInference
-    : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto w_grad_var_name = ctx->Output(framework::GradVarName("W")).front();
-    auto bias_grad_var_name_vec = ctx->Output(framework::GradVarName("Bias"));
-    std::string bias_grad_var_name;
-    bool hasBias = false;
-    if (bias_grad_var_name_vec.size()) {
-      hasBias = true;
-      bias_grad_var_name = ctx->Output(framework::GradVarName("Bias")).front();
-    }
-    auto attr = ctx->GetAttr("is_sparse");
-    bool is_sparse = boost::get<bool>(attr);
-    if (is_sparse) {
-      VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
-              << " is set to SelectedRows";
-      ctx->SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS);
-    } else {
-      VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
-              << " is set to LoDTensor";
-      ctx->SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR);
-    }
-    if (hasBias) {
-      VLOG(3) << "hierarchical_sigmoid_grad op "
-              << framework::GradVarName("Bias") << " is set to LoDTensor";
-      ctx->SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR);
-    }
-    ctx->SetDataType(w_grad_var_name, ctx->GetDataType(ctx->Input("W")[0]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
-                  ops::HierarchicalSigmoidOpMaker<int>,
-                  ops::HierarchicalSigmoidGradMaker);
-REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp,
-                  ops::HierarchicalSigmoidGradOpGradVarTypeInference);
-REGISTER_OP_CPU_KERNEL(
-    hierarchical_sigmoid,
-    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext,
-                                     double>);
-REGISTER_OP_CPU_KERNEL(
-    hierarchical_sigmoid_grad,
-    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
-                                         float>,
-    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
-                                         double>);
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
deleted file mode 100644
index d20a7e96b105079b75d2cd8ab7e41a6abbb77258..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ /dev/null
@@ -1,261 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <iterator>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/clip_op.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/matrix_bit_code.h"
-#include "paddle/fluid/platform/transform.h"
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-using platform::Transform;
-
-static std::vector<int64_t> PathToRows(const framework::LoDTensor& path) {
-  std::set<int64_t> rows;
-  const int64_t* paths = path.data<int64_t>();
-  for (int64_t i = 0; i < path.numel(); ++i) {
-    int64_t row = paths[i];
-    if (row < 0) {
-      continue;
-    }
-    rows.emplace(row);
-  }
-  return std::vector<int64_t>(rows.begin(), rows.end());
-}
-template <typename DeviceContext, typename T>
-class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
-    auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
-    auto* path = ctx.Input<framework::LoDTensor>("PathTable");
-    auto* code = ctx.Input<framework::LoDTensor>("PathCode");
-    auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
-    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-    auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
-    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
-    // for remote prefetch
-
-    auto remote_prefetch = ctx.Attr<bool>("remote_prefetch");
-    auto epmap = ctx.Attr<std::vector<std::string>>("epmap");
-    if (remote_prefetch && !epmap.empty()) {
-      // if epmap is not empty, then the parameter will be fetched from remote
-      // parameter
-      // server
-      auto height_sections = ctx.Attr<std::vector<int64_t>>("height_sections");
-      auto table_names = ctx.Attr<std::vector<std::string>>("table_names");
-      std::vector<int64_t> real_rows = PathToRows(*path);
-      framework::Scope& local_scope = ctx.scope().NewScope();
-      auto* ids = local_scope.Var("Ids@Prefetch");
-      auto* x_tensor = ids->GetMutable<framework::LoDTensor>();
-
-      x_tensor->mutable_data<int64_t>(
-          framework::make_ddim({static_cast<int64_t>(real_rows.size()), 1}),
-          ctx.GetPlace());
-      // copy.
-
-      std::memcpy(x_tensor->data<int64_t>(), real_rows.data(),
-                  real_rows.size() * sizeof(int64_t));
-
-      framework::DDim w_dims = ctx.Input<Tensor>("W")->dims();
-      w_dims[0] = x_tensor->dims()[0];
-      auto* w_tensor =
-          local_scope.Var("W@Prefetch")->GetMutable<framework::LoDTensor>();
-      w_tensor->Resize(w_dims);
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-      // w_Out is set to used by prefetch, never change it in other cases
-      auto weight = ctx.Outputs("W_Out").front();
-      operators::distributed::prefetch("Ids@Prefetch", "W@Prefetch", weight,
-                                       true, table_names, epmap,
-                                       height_sections, ctx, local_scope);
-#else
-      PADDLE_THROW(
-          "paddle is not compiled with distribute support, can not do "
-          "parameter prefetch!");
-#endif
-    }
-
-    bool is_custom = false;
-    if (path) {
-      is_custom = true;
-    }
-    int64_t code_length =
-        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
-    int64_t batch_size = in.dims()[0];
-    framework::LoDTensor sum;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto* pre_out_data = pre_out->mutable_data<T>(
-        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
-    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
-    // Not all class(leaf) nodes' path lengths equal code_length, thus init as
-    // 0s can avoid out of path's loss.
-    math::SetConstant<DeviceContext, T> zero;
-    zero(dev_ctx, pre_out, static_cast<T>(0.0));
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    math::RowwiseSum<DeviceContext, T> row_sum;
-
-    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
-    if (!is_custom) {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
-                                                       label.data<int64_t>()));
-    } else {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(*path, *code,
-                                                       label.data<int64_t>()));
-    }
-
-    std::vector<int64_t> sum_dims({batch_size, 1UL});
-    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
-    auto sum_mat = EigenMatrix<T>::From(sum);
-    out->mutable_data<T>(ctx.GetPlace());
-    auto out_mat = framework::EigenMatrix<T>::From(*out);
-    if (bias) {
-      bit_code->Add(*bias, pre_out);
-    }
-    bit_code->Mul(pre_out, w, in);
-    // clip to [-40, 40]
-    Transform<DeviceContext> trans;
-    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
-          pre_out_data + pre_out->numel(), pre_out_data,
-          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
-    bit_code->Sum(*pre_out, out, static_cast<T>(-1));
-    // use softrelu to calculate cross entropy
-    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
-    row_sum(dev_ctx, *pre_out, &sum);
-    // TODO(guosheng): Subtract the out of path's loss, since not all
-    // class(leaf) nodes' path lengths equal code_length. But it won't break the
-    // gradient check since both have the out of path's loss and will cancel out
-    // each other.
-    out_mat.device(place) = sum_mat + out_mat;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
-    auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
-    auto* path = ctx.Input<framework::LoDTensor>("PathTable");
-    auto* code = ctx.Input<framework::LoDTensor>("PathCode");
-    auto* in_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    bool is_sparse = ctx.Attr<bool>("is_sparse");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> zero;
-    auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
-    auto& pre_out = detail::Ref(ctx.Input<framework::LoDTensor>("PreOut"));
-    auto& out_grad = detail::Ref(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out")));
-    framework::LoDTensor pre_out_grad;
-
-    pre_out_grad.mutable_data<T>(pre_out.dims(), ctx.GetPlace());
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    zero(dev_ctx, in_grad, static_cast<T>(0.0));
-
-    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
-
-    bool is_custom = false;
-    if (path) {
-      is_custom = true;
-    }
-
-    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
-    if (!is_custom) {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
-                                                       label.data<int64_t>()));
-    } else {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(*path, *code,
-                                                       label.data<int64_t>()));
-    }
-
-    // softrelu derivative
-
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-
-    auto* pre_out_grad_data = pre_out_grad.data<T>();
-    auto* pre_out_data = pre_out.data<T>();
-    auto n = pre_out.numel();
-    blas.VEXP(n, pre_out_data, pre_out_grad_data);
-    blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
-    for (int64_t i = 0; i < n; ++i) {
-      pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
-    }
-    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
-    auto* out_grad_data = out_grad.data<T>();
-
-    int64_t dim0 = pre_out_grad.dims()[0];
-    int64_t dim1 = pre_out_grad.dims()[1];
-    for (int64_t i = 0; i < dim0; ++i) {
-      T tmp = out_grad_data[i];
-      blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1);
-    }
-    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
-    // be consistent with the clipping in forward.
-    auto* bias_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
-    if (bias_grad) {
-      bias_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
-      bit_code->AddGrad(pre_out_grad, bias_grad);
-    }
-    if (!is_sparse) {
-      auto* w_grad =
-          ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
-      w_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, w_grad, static_cast<T>(0.0));
-      bit_code->MulGradWeight(pre_out_grad, w_grad, in);
-    } else {
-      PADDLE_ENFORCE(path != nullptr,
-                     "Sparse mode should not be used without custom tree!");
-      framework::Vector<int64_t> real_rows = PathToRows(*path);
-      auto* w_grad =
-          ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
-      w_grad->set_rows(real_rows);
-      // Build a map of id -> row_index to speed up finding the index of one id
-      w_grad->set_height(w.dims()[0]);
-      auto* w_grad_value = w_grad->mutable_value();
-      framework::DDim temp_dim(w.dims());
-      temp_dim[0] = real_rows.size();
-      w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
-      zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
-      bit_code->MulGradWeight(pre_out_grad, w_grad, in);
-    }
-    bit_code->MulGradError(pre_out_grad, w, in_grad);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc
deleted file mode 100644
index b6cfa9cc43c312e60a1b7c5e13d1ecbe6bc5dc7d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/hinge_loss_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class HingeLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Logits"),
-                   "Input(Logits) must be initialized.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) must be initialized.");
-
-    auto pred_dims = ctx->GetInputDim("Logits");
-    auto label_dims = ctx->GetInputDim("Labels");
-
-    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
-    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
-                      "The rank of Input(Logits) must be 2 and the shape is "
-                      "[batch_size, 1].");
-    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
-                      "Each row of Input(Logits) contains a real value, "
-                      "so the 2nd dimension of Input(Logits) must be 1.");
-
-    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
-    ctx->ShareLoD("Logits", "Loss");
-  }
-};
-
-template <typename AttrType>
-class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Logits",
-             "The input value (Logits) of Hinge loss op."
-             "Logits is a 2-D tensor with shape [batch_size, 1].");
-    AddInput("Labels",
-             "The target value (Labels) of Hinge loss op."
-             "Labels is a 2-D tensor with shape [batch_size, 1].");
-    AddOutput("Loss",
-              "The output tensor with shape [batch_size, 1] "
-              "which represents the hinge loss.");
-    AddComment(R"DOC(
-HingeLoss Operator.
-
-Let x be a logit (prediction) and y be the actual label. The logit can
-take any values from (-inf, inf), but the labels should be either -1 or 1.
-Then, the hinge loss is computed as follows:
-
-$$
-L_(x, y) = max(1 - y.x, 0) 
-$$
-
-Note that the labels passed as input will have values as either 0 or 1.
-
-)DOC");
-  }
-};
-
-class HingeLossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Logits"),
-                   "Input(Logits) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
-                   "Input(Loss@GRAD) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
-                   "Input(Logits@GRAD) should not be null.");
-
-    auto pred_dims = ctx->GetInputDim("Logits");
-    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
-
-    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
-
-    auto pred_grad_name = framework::GradVarName("Logits");
-    ctx->SetOutputDim(pred_grad_name, pred_dims);
-  }
-};
-
-class HingeLossGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("hinge_loss_grad");
-    op->SetInput("Logits", Input("Logits"));
-    op->SetInput("Labels", Input("Labels"));
-    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
-    op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
-                  ops::HingeLossGradOpDescMaker);
-REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp);
-REGISTER_OP_CPU_KERNEL(
-    hinge_loss,
-    ops::HingeLossKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    hinge_loss_grad,
-    ops::HingeLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/hinge_loss_op.cu b/paddle/fluid/operators/hinge_loss_op.cu
deleted file mode 100644
index b5ea0a702e0e540c1831ca241a5def19f86c239c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/hinge_loss_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/hinge_loss_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    hinge_loss,
-    ops::HingeLossKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    hinge_loss_grad,
-    ops::HingeLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/hinge_loss_op.h b/paddle/fluid/operators/hinge_loss_op.h
deleted file mode 100644
index 10c17a0982fd7995056aeb1f70648fd78b3d9c05..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/hinge_loss_op.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class HingeLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* pred = context.Input<framework::Tensor>("Logits");
-    auto* label = context.Input<framework::Tensor>("Labels");
-    auto* loss = context.Output<framework::Tensor>("Loss");
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    auto x = framework::EigenVector<T>::Flatten(*pred);
-    auto y = framework::EigenVector<T>::Flatten(*label);
-    loss->mutable_data<T>(context.GetPlace());
-    auto l = framework::EigenVector<T>::Flatten(*loss);
-    l.device(place) =
-        (static_cast<T>(1) - x * (static_cast<T>(2) * y - static_cast<T>(1)))
-            .cwiseMax(static_cast<T>(0));
-  }
-};
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class HingeLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* pred = context.Input<framework::Tensor>("Logits");
-    auto* label = context.Input<framework::Tensor>("Labels");
-    auto* dloss =
-        context.Input<framework::Tensor>(framework::GradVarName("Loss"));
-    auto* dpred =
-        context.Output<framework::Tensor>(framework::GradVarName("Logits"));
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    auto x = framework::EigenVector<T>::Flatten(*pred);
-    auto y = framework::EigenVector<T>::Flatten(*label);
-    auto dl = framework::EigenVector<T>::Flatten(*dloss);
-
-    if (dpred) {
-      dpred->mutable_data<T>(context.GetPlace());
-      auto dx = framework::EigenVector<T>::Flatten(*dpred);
-      auto alt_labels = static_cast<T>(2) * y - static_cast<T>(1);
-      dx.device(place) =
-          dl * ((x * alt_labels) < static_cast<T>(1)).template cast<T>() *
-          (-alt_labels);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
deleted file mode 100644
index 51e2ec5f50a9e4d86aeecc75920bb2204cc7eb88..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/huber_loss_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class HuberLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) must be initialized.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true,
-                      "Input(Y) must be initialized.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    int rank = x_dims.size();
-
-    if (rank == y_dims.size()) {
-      PADDLE_ENFORCE_EQ(y_dims[rank - 1], 1U,
-                        "The last dimension of Input(Y) should be equal to 1.");
-    } else {
-      PADDLE_ENFORCE_EQ(rank, y_dims.size() + 1,
-                        "The rank of Input(X) should be equal to "
-                        "the rank of Input(Y) plus 1.");
-    }
-    bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
-                               framework::contain_unknown_dim(y_dims);
-    if (ctx->IsRuntime() || !contain_unknown_dim) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                        framework::slice_ddim(y_dims, 0, rank - 1),
-                        "The Input(X) and Input(Label) should have the same "
-                        "shape except the last dimension.");
-    }
-
-    auto out_dims = y_dims;
-    ctx->SetOutputDim("Residual", out_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("X", "Out");
-  }
-};
-
-template <typename AttrType>
-class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input value of huber loss op."
-             "X is a 2-D tensor with shape [batch_size, 1].");
-    AddInput("Y",
-             "The target value of huber loss op."
-             "Y is a 2-D tensor with shape [batch_size, 1].");
-    AddOutput("Residual",
-              "Intermediate tensor to cache residual value between Y and X."
-              "The shape is same as Input(X) and will be reused in backward.")
-        .AsIntermediate();
-    AddOutput("Out",
-              "The output tensor with shape [batch_size, 1] "
-              "which represents the huber loss.");
-    AddAttr<AttrType>("delta", "Hyper parameter in huber loss.");
-    AddComment(R"DOC(
-HuberLoss Operator.
-
-Huber loss is a loss function used in robust regression. We define X as the
-input value and Y as the target value. Huber loss can evaluate the fitness of
-X to Y. Different from MSE loss, Huber loss is more robust for outliers. The
-shape of X and Y are [batch_size, 1]. The equation is:
-
-$$
-Out_{\delta}(X, Y)_i =
-\begin{cases}
-0.5 * (Y_i - X_i)^2,
-\quad |Y_i - X_i| \leq \delta \\
-\delta * (|Y_i - X_i| - 0.5 * \delta),
-\quad otherwise
-\end{cases}
-$$
-
-In the above equation, $Out_\delta(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
-element of Out, X and Y.
-
-)DOC");
-  }
-};
-
-class HuberLossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) should not be null.");
-
-    auto residual_dims = ctx->GetInputDim("Residual");
-
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, residual_dims);
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, residual_dims);
-    }
-  }
-};
-
-class HuberLossGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("huber_loss_grad");
-    op->SetInput("Residual", Output("Residual"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
-                  ops::HuberLossGradOpDescMaker);
-REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp);
-REGISTER_OP_CPU_KERNEL(
-    huber_loss, ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::HuberLossKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    huber_loss_grad,
-    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu
deleted file mode 100644
index 09c743c4275169ba8c53ccbd428100b2fc4483d6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/huber_loss_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/huber_loss_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    huber_loss,
-    ops::HuberLossKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    huber_loss_grad,
-    ops::HuberLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h
deleted file mode 100644
index 7000b5d3acc07d722c0129f86f0cfa2ee65b4288..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/huber_loss_op.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename T>
-struct HuberLossForward {
-  HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {}
-
-  HOSTDEVICE T operator()(const T& val) const {
-    T abs_val = std::abs(val);
-    if (abs_val <= delta) {
-      return static_cast<T>(0.5) * val * val;
-    } else {
-      return delta * (abs_val - static_cast<T>(0.5) * delta);
-    }
-  }
-
-  T delta;
-};
-
-template <typename DeviceContext, typename T>
-class HuberLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("X");
-    auto* in1 = context.Input<Tensor>("Y");
-    auto* out0 = context.Output<Tensor>("Residual");
-    auto* out1 = context.Output<Tensor>("Out");
-    auto delta = static_cast<T>(context.Attr<float>("delta"));
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    auto x = EigenVector<T>::Flatten(*in0);
-    auto y = EigenVector<T>::Flatten(*in1);
-    out0->mutable_data<T>(context.GetPlace());
-    auto residual = EigenVector<T>::Flatten(*out0);
-    residual.device(place) = y - x;
-    out1->mutable_data<T>(context.GetPlace());
-    auto loss = EigenVector<T>::Flatten(*out1);
-    loss.device(place) = residual.unaryExpr(HuberLossForward<T>(delta));
-  }
-};
-
-template <typename T>
-struct HuberLossBackward {
-  HOSTDEVICE HuberLossBackward(const T& delta, T sign)
-      : sign(sign), delta(delta) {}
-
-  HOSTDEVICE T operator()(const T& val) const {
-    T abs_val = std::abs(val);
-    if (abs_val <= delta) {
-      return sign * val;
-    } else {
-      if (val > 0) {
-        return sign * delta;
-      } else {
-        return -1 * sign * delta;
-      }
-    }
-  }
-
-  T sign;
-  T delta;
-};
-
-template <typename DeviceContext, typename T>
-class HuberLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("Residual");
-    auto* in1 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* out1 = context.Output<Tensor>(framework::GradVarName("Y"));
-    auto delta = static_cast<T>(context.op().Attr<float>("delta"));
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    auto residual = EigenVector<T>::Flatten(*in0);
-    auto out_grad = EigenVector<T>::Flatten(*in1);
-
-    if (out0) {
-      out0->mutable_data<T>(context.GetPlace());
-      auto x_grad = EigenVector<T>::Flatten(*out0);
-      x_grad.device(place) =
-          residual.unaryExpr(HuberLossBackward<T>(delta, -1.0));
-      x_grad.device(place) = out_grad * x_grad;
-    }
-
-    if (out1) {
-      out1->mutable_data<T>(context.GetPlace());
-      auto y_grad = EigenVector<T>::Flatten(*out1);
-      y_grad.device(place) =
-          residual.unaryExpr(HuberLossBackward<T>(delta, 1.0));
-      y_grad.device(place) = out_grad * y_grad;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
deleted file mode 100644
index 0fa7322fbd65c85f574c18f822e2c189c8f87646..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/im2sequence_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class Im2SequenceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of Im2SequenceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of Im2SequenceOp op should not be null.");
-    auto in_dim = ctx->GetInputDim("X");
-
-    PADDLE_ENFORCE_EQ(in_dim.size(), 4,
-                      "Input(X) format must be 4D tensor, eg., NCHW.");
-    auto img_channels = in_dim[1];
-
-    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
-    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-
-    ctx->SetOutputDim("Out",
-                      {in_dim[0], img_channels * kernels[0] * kernels[1]});
-  }
-};
-
-class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) The input tensor has NCHW format."
-             "N: batch size"
-             "C: channels"
-             "H: height"
-             "W: width");
-    AddInput("Y",
-             "(Tensor) The input tensor of image real size(H, W)."
-             "2-D with shape [batchsize, 2]")
-        .AsDispensable();
-    AddOutput("Out", "(LodTensor) The output data of im2sequence op,");
-    AddAttr<std::vector<int>>("kernels",
-                              "(vector<int>), the "
-                              "kernels(kernel_height, kernel_width)");
-    AddAttr<std::vector<int>>("strides",
-                              "(vector<int> default:{1, 1}), the "
-                              "strides(h_stride, w_stride)")
-        .SetDefault({1, 1});
-    AddAttr<std::vector<int>>("paddings",
-                              "(vector<int> default:{0, 0, 0, 0}), the "
-                              "paddings(up_pad, left_pad, down_pad, right_pad)")
-        .SetDefault({0, 0, 0, 0});
-    AddAttr<std::vector<int>>("out_stride",
-                              "the attribute is valid only when input(Y)"
-                              "is not NULL.this attribute represents the"
-                              "scaling of the pic through the CNN"
-                              "(vector<int> dedault:{1,1}),the out_stride"
-                              " (out_stride_height, out_stride_width)")
-        .SetDefault({1, 1});
-    AddComment(R"DOC(
-This op uses kernels to scan images and converts these images to sequences.
-After expanding, The number of time steps are output_height * output_width
-and the dimension of each time step is kernel_height * kernel_width * channels,
-in which:
-
-output_height =
-    1 + (padding_height + padding_down + img_height - kernel_height + stride_height - 1) /
-            stride_height;
-output_width =
-    1 + (padding_left + padding+right + img_width - kernel_width + stride_width - 1) /
-            stride_width;
-
-This op can be used after convolution neural network, and before recurrent neural network.
-
-Given:
-
-x = [[[[ 6.  2.  1.]
-       [ 8.  3.  5.]
-       [ 0.  2.  6.]]
-
-      [[ 2.  4.  4.]
-       [ 6.  3.  0.]
-       [ 6.  4.  7.]]]
-
-     [[[ 6.  7.  1.]
-       [ 5.  7.  9.]
-       [ 2.  4.  8.]]
-
-      [[ 1.  2.  1.]
-       [ 1.  3.  5.]
-       [ 9.  0.  8.]]]]
-x.dims = {2, 2, 3, 3}
-
-And:
-
-kernels = [2, 2]
-strides = [1, 1]
-paddings = [0, 0, 0, 0]
-
-Then:
-
-output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
-               [ 2.  1.  3.  5.  4.  4.  3.  0.]
-               [ 8.  3.  0.  2.  6.  3.  6.  4.]
-               [ 3.  5.  2.  6.  3.  0.  4.  7.]
-               [ 6.  7.  5.  7.  1.  2.  1.  3.]
-               [ 7.  1.  7.  9.  2.  1.  3.  5.]
-               [ 5.  7.  2.  4.  1.  3.  9.  0.]
-               [ 7.  9.  4.  8.  3.  5.  0.  8.]]
-output.dims = {8, 8}
-output.lod = [[0, 4, 8]]
-
-)DOC");
-  }
-};
-
-class Im2SequenceGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) shouldn't be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-};
-
-class Im2SequenceGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("im2sequence_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
-                  ops::Im2SequenceGradDescMaker);
-REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp);
-REGISTER_OP_CPU_KERNEL(
-    im2sequence,
-    ops::Im2SequenceKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    im2sequence_grad,
-    ops::Im2SequenceGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu
deleted file mode 100644
index 1c34640618d58d3b5fe627fa6596260a7b687d05..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/im2sequence_op.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include "paddle/fluid/operators/im2sequence_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    im2sequence,
-    ops::Im2SequenceKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    im2sequence_grad,
-    ops::Im2SequenceGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
deleted file mode 100644
index 9c9069b722763d0ec0d39d2f6fb35477c7578f30..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/im2sequence_op.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   You may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0,
-                            int padding_1, int stride) {
-  const int output_size =
-      (input_size + padding_0 + padding_1 - filter_size) / stride + 1;
-  return output_size;
-}
-
-template <typename DeviceContext, typename T>
-class Im2SequenceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* in = ctx.Input<Tensor>("X");
-    LoDTensor* out = ctx.Output<LoDTensor>("Out");
-    auto in_dim = in->dims();
-    int batch_size = in_dim[0];
-    int img_channels = in_dim[1];
-    int img_height = in_dim[2];
-    int img_width = in_dim[3];
-    auto kernels = ctx.Attr<std::vector<int>>("kernels");
-    auto strides = ctx.Attr<std::vector<int>>("strides");
-    auto paddings = ctx.Attr<std::vector<int>>("paddings");
-    if (ctx.HasInput("Y") && batch_size > 1) {
-      const Tensor* imgrealsize = ctx.Input<Tensor>("Y");
-      auto out_stride = ctx.Attr<std::vector<int>>("out_stride");
-      Tensor cpu_shape_tensor;
-      TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor);
-      std::vector<int> imgreal_h;
-      std::vector<int> imgreal_w;
-      std::vector<int> output_height;
-      std::vector<int> output_width;
-      int result = 0;
-      for (int i = 0; i < batch_size; i++) {
-        int tmp_real_h = static_cast<int>((cpu_shape_tensor.data<T>())[2 * i]);
-        int tmp_real_w =
-            static_cast<int>((cpu_shape_tensor.data<T>())[2 * i + 1]);
-        if (tmp_real_h % out_stride[0] == 0) {
-          tmp_real_h = tmp_real_h / out_stride[0];
-        } else {
-          tmp_real_h = tmp_real_h / out_stride[0] + 1;
-        }
-        if (tmp_real_w % out_stride[1] == 0) {
-          tmp_real_w = tmp_real_w / out_stride[1];
-        } else {
-          tmp_real_w = tmp_real_w / out_stride[1] + 1;
-        }
-        imgreal_h.push_back(tmp_real_h);
-        imgreal_w.push_back(tmp_real_w);
-        output_height.push_back(Im2SeqOutputSize(
-            imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0]));
-        output_width.push_back(Im2SeqOutputSize(
-            imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1]));
-        result += output_height[i] * output_width[i];
-      }
-
-      out->mutable_data<T>({result, img_channels * kernels[0] * kernels[1]},
-                           ctx.GetPlace());
-
-      const std::vector<int> dilations({1, 1});
-      int offset_out = 0;
-      for (int i = 0; i < batch_size; i++) {
-        const Tensor src =
-            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
-        Tensor dst = out->Slice(offset_out,
-                                offset_out + output_height[i] * output_width[i])
-                         .Resize({output_height[i], output_width[i],
-                                  img_channels, kernels[0], kernels[1]});
-        offset_out += output_height[i] * output_width[i];
-
-        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
-        auto& dev_ctx = ctx.template device_context<DeviceContext>();
-        f(dev_ctx, src, dilations, strides, paddings, &dst);
-      }
-      framework::LoD lod(1);
-      lod[0].reserve(batch_size + 1);
-      int offset = 0;
-      lod[0].push_back(offset);
-      for (int i = 0; i < batch_size; ++i) {
-        offset += output_height[i] * output_width[i];
-        lod[0].push_back(offset);
-      }
-      out->set_lod(lod);
-    } else {
-      int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
-                                           paddings[2], strides[0]);
-      int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
-                                          paddings[3], strides[1]);
-      out->mutable_data<T>(
-          {static_cast<int64_t>(batch_size) * output_height * output_width,
-           static_cast<int64_t>(img_channels) * kernels[0] * kernels[1]},
-          ctx.GetPlace());
-      const std::vector<int> dilations({1, 1});
-      auto out_dims = out->dims();
-      out->Resize({batch_size, out->numel() / batch_size});
-      for (int i = 0; i < batch_size; i++) {
-        const Tensor src =
-            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
-        Tensor dst =
-            out->Slice(i, i + 1).Resize({output_height, output_width,
-                                         img_channels, kernels[0], kernels[1]});
-
-        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
-        auto& dev_ctx = ctx.template device_context<DeviceContext>();
-        f(dev_ctx, src, dilations, strides, paddings, &dst);
-      }
-      out->Resize(out_dims);
-      framework::LoD lod(1);
-      lod[0].reserve(batch_size + 1);
-      int offset = 0;
-      lod[0].push_back(offset);
-      for (int i = 0; i < batch_size; ++i) {
-        offset += output_height * output_width;
-        lod[0].push_back(offset);
-      }
-      out->set_lod(lod);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Im2SequenceGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    Tensor* d_out =
-        const_cast<Tensor*>(ctx.Input<Tensor>(framework::GradVarName("Out")));
-    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    d_x->mutable_data<T>(ctx.GetPlace());
-
-    auto x_v = framework::EigenVector<T>::Flatten(*d_x);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    x_v.device(place) = x_v.constant(0.0);
-
-    auto in_dim = in->dims();
-    int batch_size = in_dim[0];
-    int img_channels = in_dim[1];
-    int img_height = in_dim[2];
-    int img_width = in_dim[3];
-
-    auto kernels = ctx.Attr<std::vector<int>>("kernels");
-    auto strides = ctx.Attr<std::vector<int>>("strides");
-    auto paddings = ctx.Attr<std::vector<int>>("paddings");
-    int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
-                                         paddings[2], strides[0]);
-    int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
-                                        paddings[3], strides[1]);
-
-    const std::vector<int> dilations({1, 1});
-
-    auto d_out_dims = d_out->dims();
-    d_out->Resize({batch_size, d_out->numel() / batch_size});
-    for (int i = 0; i < batch_size; i++) {
-      Tensor dst =
-          d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
-      const Tensor src = d_out->Slice(i, i + 1).Resize(
-          {output_height, output_width, img_channels, kernels[0], kernels[1]});
-      math::Col2ImFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      f(dev_ctx, src, dilations, strides, paddings, &dst);
-    }
-    d_out->Resize(d_out_dims);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
deleted file mode 100644
index f0ffc9706689f5afe4546c3483114b38bc2b7872..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/increment_op.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/increment_op.h"
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class IncrementOp : public framework::OperatorWithKernel {
- public:
-  IncrementOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of IncrementOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of IncrementOp should not be null.");
-    PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X")));
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
-    // IncrementOp kernel's device type is decided by input tensor place
-    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
-    return kt;
-  }
-};
-
-class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) The input tensor of increment operator");
-    AddOutput("Out", "(Tensor) The output tensor of increment operator.");
-    AddAttr<float>("step",
-                   "(float, default 1.0) "
-                   "The step size by which the "
-                   "input tensor will be incremented.")
-        .SetDefault(1.0);
-    AddComment(R"DOC(
-Increment Operator.
-
-The equation is: 
-$$Out = X + step$$
-
-)DOC");
-  }
-};
-
-class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("increment");
-    grad_op->SetInput("X", Output("Out"));
-    grad_op->SetOutput("Out", Input("X"));
-    grad_op->SetAttr("step", -boost::get<float>(GetAttr("step")));
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
-                  ops::IncrementGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    increment, ops::IncrementKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::IncrementKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/operators/increment_op.cu
deleted file mode 100644
index 228063bf3d4b24bbd03649189f6ddba9a5f0ca30..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/increment_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/increment_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    increment, ops::IncrementKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::IncrementKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/increment_op.h b/paddle/fluid/operators/increment_op.h
deleted file mode 100644
index d0e8c66255ef68b975701fb6b3c145be2590e271..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/increment_op.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class IncrementKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x_tensor = context.Input<framework::Tensor>("X");
-    auto* out_tensor = context.Output<framework::Tensor>("Out");
-    float step = context.Attr<float>("step");
-
-    out_tensor->mutable_data<T>(context.GetPlace());
-    auto& dev =
-        *context.template device_context<DeviceContext>().eigen_device();
-    framework::EigenScalar<T>::From(*out_tensor).device(dev) =
-        framework::EigenScalar<T>::From(*x_tensor) + static_cast<T>(step);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc
deleted file mode 100644
index 6375c92de2d219e9e66ce8899fed991a1a75d00d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/instance_norm_op.cc
+++ /dev/null
@@ -1,646 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/instance_norm_op.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    "Input(X) of Instance Norm Op should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true,
-                    "Input(Scale) of Instance Norm Op should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Bias"), true,
-                    "Input(Bias) of Instance Norm Op should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true,
-                    "Output(Y) of Instance Norm Op should not be null.");
-
-  PADDLE_ENFORCE_EQ(
-      ctx->HasOutput("SavedMean"), true,
-      "Output(SavedMean) of Instance Norm Op should not be null.");
-  PADDLE_ENFORCE_EQ(
-      ctx->HasOutput("SavedVariance"), true,
-      "Output(SavedVariance) of Instance Norm Op should not be null.");
-
-  const auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                    "the dimension of input X must greater than or equal to 2");
-  PADDLE_ENFORCE_LE(x_dims.size(), 5,
-                    "the dimension of input X must smaller than or equal to 5");
-  auto N = x_dims[0];
-  auto C = x_dims[1];
-  auto NxC = N * C;
-
-  auto scale_dim = ctx->GetInputDim("Scale");
-  auto bias_dim = ctx->GetInputDim("Bias");
-
-  PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL);
-  PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL);
-
-  bool check = !((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
-                                         framework::product(bias_dim) <= 0));
-
-  if (check) {
-    PADDLE_ENFORCE_EQ(scale_dim[0], C);
-    PADDLE_ENFORCE_EQ(bias_dim[0], C);
-  }
-
-  ctx->SetOutputDim("Y", x_dims);
-  ctx->SetOutputDim("SavedMean", {NxC});
-  ctx->SetOutputDim("SavedVariance", {NxC});
-  ctx->ShareLoD("X", "Y");
-}
-
-framework::OpKernelType InstanceNormOp::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
-  auto input_data_type = ctx.Input<Tensor>("X")->type();
-  // By default, the type of the scale, bias, mean,
-  // and var tensors should both be float. (For float or float16 input tensor)
-  // or double (For double input tensor).
-  auto in_param_type = framework::proto::VarType::FP32;
-  if (input_data_type == framework::proto::VarType::FP64) {
-    in_param_type = framework::proto::VarType::FP64;
-  }
-  PADDLE_ENFORCE_EQ(in_param_type, ctx.Input<Tensor>("Scale")->type(),
-                    "Scale input should be of float type");
-  PADDLE_ENFORCE_EQ(in_param_type, ctx.Input<Tensor>("Bias")->type(),
-                    "Bias input should be of float type");
-
-  return framework::OpKernelType(input_data_type, ctx.GetPlace());
-}
-
-void InstanceNormOpMaker::Make() {
-  AddAttr<float>("epsilon", "")
-      .SetDefault(1e-5)
-      .AddCustomChecker([](const float &epsilon) {
-        PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true,
-                          "'epsilon' should be between 0.0 and 0.001.");
-      });
-  AddInput("X", "The input tensor");
-  AddInput("Scale",
-           "Scale is a 1-dimensional tensor of size C "
-           "that is applied to the output");
-  AddInput("Bias",
-           "Bias is a 1-dimensional tensor of size C "
-           "that is applied to the output");
-  AddOutput("Y", "result after normalization");
-  AddOutput("SavedMean",
-            "Mean of the current mini batch, "
-            "will apply to output when training")
-      .AsIntermediate();
-  AddOutput("SavedVariance",
-            "Variance of the current mini batch, "
-            "will apply to output when training")
-      .AsIntermediate();
-  AddComment(R"DOC(
-Instance Normalization.
-
-Instance Norm has been implemented as disscussed in the paper:
-https://arxiv.org/pdf/1607.08022.pdf
-Can be used as a normalizer function for conv2d and fully_connected operations.
-The required data format for this layer is as following:
-NCHW `[batch, in_channels, in_height, in_width]`
-
-)DOC");
-}
-
-template <typename T>
-class InstanceNormKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto &x_dims = x->dims();
-
-    const int N = x_dims[0];
-    const int C = x_dims[1];
-    const int NxC = N * C;
-
-    const int sample_size = x->numel() / N / C;
-
-    auto *y = ctx.Output<Tensor>("Y");
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
-
-    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    auto *place = dev_ctx.eigen_device();
-
-    Eigen::DSizes<int, 2> bcast(1, sample_size);
-    Eigen::DSizes<int, 2> C_shape(C, 1);
-    Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
-    Eigen::DSizes<int, 2> shape(NxC, sample_size);
-
-    math::SetConstant<platform::CPUDeviceContext, T> set_constant;
-
-    saved_mean->mutable_data<T>(ctx.GetPlace());
-    saved_variance->mutable_data<T>(ctx.GetPlace());
-    set_constant(dev_ctx, saved_mean, static_cast<T>(0));
-    set_constant(dev_ctx, saved_variance, static_cast<T>(0));
-
-    auto saved_mean_a = framework::EigenVector<T>::Flatten(*saved_mean);
-    auto saved_mean_e = saved_mean_a.reshape(NxC_shape);
-    auto saved_variance_a = framework::EigenVector<T>::Flatten(*saved_variance);
-    auto saved_variance_e = saved_variance_a.reshape(NxC_shape);
-
-    auto x_e = framework::EigenVector<T>::Flatten(*x);
-    auto x_arr = x_e.reshape(shape);
-
-    Eigen::DSizes<int, 1> rdims(1);
-
-    saved_mean_e.device(*place) = x_arr.mean(rdims);
-    auto saved_variance_arr =
-        (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;
-
-    saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse();
-
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-    auto scale_e = framework::EigenVector<T>::Flatten(*scale);
-    auto scale_arr = scale_e.reshape(C_shape);
-    auto bias_e = framework::EigenVector<T>::Flatten(*bias);
-    auto bias_arr = bias_e.reshape(C_shape);
-
-    y->mutable_data<T>(ctx.GetPlace());
-    auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto y_arr = y_e.reshape(shape);
-
-    // (x - mean) * inv_std * scale + bias
-    Eigen::DSizes<int, 2> bcast_param(N, sample_size);
-    y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) *
-                               saved_variance_e.broadcast(bcast) *
-                               scale_arr.broadcast(bcast_param) +
-                           bias_arr.broadcast(bcast_param);
-  }
-};
-
-void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true,
-                    "Input(scale) should not be null");
-
-  PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Y")), true,
-                    "Input(Y@GRAD) should not be null");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true,
-                    "Input(SavedMean) should not be null");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true,
-                    "Input(SavedVariance) should not be null");
-
-  // check output
-  PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
-                    "Output(x@GRAD) should not be null");
-  if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Bias")), true,
-                      "Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
-                      "null at the same time");
-  }
-  const auto x_dims = ctx->GetInputDim("X");
-  const int C = x_dims[1];
-  ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-  if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-    ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
-    ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
-  }
-}
-
-framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
-  const auto *var = ctx.InputVar(framework::GradVarName("Y"));
-  if (var == nullptr) {
-    PADDLE_THROW("cannot find Y@GRAD");
-  }
-  const Tensor *t = nullptr;
-  if (var->IsType<Tensor>()) {
-    t = &var->Get<Tensor>();
-  } else if (var->IsType<LoDTensor>()) {
-    t = &var->Get<LoDTensor>();
-  }
-  if (t == nullptr) {
-    PADDLE_THROW("cannot find Y@GRAD");
-  }
-  return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                 ctx.GetPlace());
-}
-
-template <typename T>
-class InstanceNormGradKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
-
-    const auto &x_dims = x->dims();
-
-    const int N = x_dims[0];
-    const int C = x_dims[1];
-    const int NxC = N * C;
-    const int sample_size = x->numel() / N / C;
-
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-    d_x->mutable_data<T>(ctx.GetPlace());
-
-    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    auto *place = dev_ctx.eigen_device();
-
-    Eigen::DSizes<int, 1> rdims(0);
-    Eigen::DSizes<int, 1> mean_rdims(1);
-    Eigen::DSizes<int, 2> rshape(NxC, sample_size);
-    Eigen::DSizes<int, 2> bcast(1, sample_size);
-    Eigen::DSizes<int, 2> C_shape(C, 1);
-    Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
-    Eigen::DSizes<int, 2> param_shape(N, C);
-    Eigen::DSizes<int, 2> shape(NxC, sample_size);
-
-    auto scale_e = framework::EigenVector<T>::Flatten(*scale);
-    auto mean_e = framework::EigenVector<T>::Flatten(*saved_mean);
-    auto inv_var_e = framework::EigenVector<T>::Flatten(*saved_inv_variance);
-    auto dy_e = framework::EigenVector<T>::Flatten(*d_y);
-    auto x_e = framework::EigenVector<T>::Flatten(*x);
-
-    auto scale_arr = scale_e.reshape(C_shape);
-    auto mean_arr = mean_e.reshape(NxC_shape);
-    auto inv_var_arr = inv_var_e.reshape(NxC_shape);
-    auto dy_arr = dy_e.reshape(shape);
-    auto x_arr = x_e.reshape(shape);
-
-    auto tmp =
-        (x_arr - mean_arr.broadcast(bcast)) * inv_var_arr.broadcast(bcast);
-
-    math::SetConstant<platform::CPUDeviceContext, T> set_constant;
-    // math: d_bias = np.sum(d_y, axis=(n,h,w))
-    // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w))
-    if (d_scale && d_bias) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      set_constant(dev_ctx, d_scale, static_cast<T>(0));
-      set_constant(dev_ctx, d_bias, static_cast<T>(0));
-
-      auto d_scale_e = framework::EigenVector<T>::Flatten(*d_scale);
-      auto d_bias_e = framework::EigenVector<T>::Flatten(*d_bias);
-      auto d_scale_data = d_scale_e.reshape(C_shape);
-      auto d_bias_data = d_bias_e.reshape(C_shape);
-      d_bias_data.device(*place) =
-          dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
-      d_scale_data.device(*place) =
-          (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims);
-    }
-
-    auto dy_mean = dy_arr.mean(mean_rdims).reshape(NxC_shape).broadcast(bcast);
-
-    Eigen::DSizes<int, 2> bcast_param(N, sample_size);
-    set_constant(dev_ctx, d_x, static_cast<T>(0));
-    // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y,
-    // axis=(h,w))
-    //             - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X -
-    //             mean),
-    //             axis=(h,w))
-    auto dx_e = framework::EigenVector<T>::Flatten(*d_x);
-    auto dx_arr = dx_e.reshape(shape);
-    dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
-                            inv_var_arr.broadcast(bcast) *
-                            (dy_arr - dy_mean -
-                             tmp *
-                                 (dy_arr * tmp)
-                                     .mean(mean_rdims)
-                                     .reshape(NxC_shape)
-                                     .broadcast(bcast));
-  }
-};
-
-std::unique_ptr<framework::OpDesc> InstanceNormGradMaker::Apply() const {
-  auto *op = new framework::OpDesc();
-  op->SetType("instance_norm_grad");
-  op->SetInput("X", Input("X"));
-  op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-
-  op->SetInput("Scale", Input("Scale"));
-  op->SetInput("Bias", Input("Bias"));
-  op->SetInput("SavedMean", Output("SavedMean"));
-  op->SetInput("SavedVariance", Output("SavedVariance"));
-
-  op->SetAttrMap(Attrs());
-  op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-  op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
-  op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-
-  return std::unique_ptr<framework::OpDesc>(op);
-}
-
-void InstanceNormDoubleGradOp::InferShape(
-    framework::InferShapeContext *ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true,
-                    "Input(Scale) should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true,
-                    "Input(SavedMean) should not be null");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true,
-                    "Input(SavedVariance) should not be null");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("DDX"), true,
-                    "Input(DDX) should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("DY"), true,
-                    "Input(Y@GRAD) should not be null");
-
-  // check output
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("DX"), true,
-                    "Output(DX) should not be null");
-
-  const auto x_dims = ctx->GetInputDim("X");
-  const int C = x_dims[1];
-  if (ctx->HasOutput("DX")) {
-    ctx->SetOutputDim("DX", x_dims);
-  }
-  if (ctx->HasOutput("DScale")) {
-    ctx->SetOutputDim("DScale", {C});
-  }
-  if (ctx->HasOutput("DDY")) {
-    ctx->ShareDim("X", "DDY");
-  }
-}
-
-framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
-  const auto *var = ctx.InputVar("DY");
-  if (var == nullptr) {
-    PADDLE_THROW("cannot find Y@GRAD");
-  }
-  const Tensor *t = nullptr;
-  if (var->IsType<Tensor>()) {
-    t = &var->Get<Tensor>();
-  } else if (var->IsType<LoDTensor>()) {
-    t = &var->Get<LoDTensor>();
-  }
-  if (t == nullptr) {
-    PADDLE_THROW("cannot find Y@GRAD");
-  }
-  return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                 ctx.GetPlace());
-}
-
-std::unique_ptr<framework::OpDesc> InstanceNormDoubleGradMaker::Apply() const {
-  auto *op = new framework::OpDesc();
-  op->SetType("instance_norm_grad_grad");
-  op->SetInput("X", Input("X"));
-  op->SetInput("Scale", Input("Scale"));
-  op->SetInput("SavedMean", Input("SavedMean"));
-  op->SetInput("SavedVariance", Input("SavedVariance"));
-  op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-  op->SetInput("DDScale", OutputGrad(framework::GradVarName("Scale")));
-  op->SetInput("DDBias", OutputGrad(framework::GradVarName("Bias")));
-  op->SetInput("DY", Input(framework::GradVarName("Y")));
-
-  op->SetAttrMap(Attrs());
-  op->SetOutput("DX", InputGrad("X"));
-  op->SetOutput("DScale", InputGrad("Scale"));
-  op->SetOutput("DDY", InputGrad(framework::GradVarName("Y")));
-  return std::unique_ptr<framework::OpDesc>(op);
-}
-
-template <typename T>
-class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *X = ctx.Input<Tensor>("X");
-    const auto *Scale = ctx.Input<Tensor>("Scale");
-    const auto *dY = ctx.Input<Tensor>("DY");
-    const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
-    const auto *ddX = ctx.Input<Tensor>("DDX");
-    const auto *ddScale = ctx.Input<Tensor>("DDScale");
-    const auto *ddBias = ctx.Input<Tensor>("DDBias");
-
-    auto *dX = ctx.Output<Tensor>("DX");
-    auto *dScale = ctx.Output<Tensor>("DScale");
-    auto *ddY = ctx.Output<Tensor>("DDY");
-
-    const auto &x_dims = X->dims();
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
-    const int sample_size = X->numel() / N / C;
-    const int NxC = N * C;
-
-    const T *mean_data = Saved_mean->data<T>();
-    const T *inv_var_data = Saved_variance->data<T>();
-    Tensor mean_tensor;
-    Tensor inv_var_tensor;
-    ConstEigenArrayMap<T> x_arr(X->data<T>(), sample_size, NxC);
-    ConstEigenVectorArrayMap<T> mean_arr(mean_data, NxC);
-    ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, NxC);
-
-    Tensor mean_tile;
-    mean_tile.Resize({sample_size, NxC});
-    mean_tile.mutable_data<T>(ctx.GetPlace());
-    EigenArrayMap<T> mean_tile_data(mean_tile.mutable_data<T>(ctx.GetPlace()),
-                                    sample_size, NxC);
-
-    Tensor inv_var_tile;
-    inv_var_tile.Resize({sample_size, NxC});
-    inv_var_tile.mutable_data<T>(ctx.GetPlace());
-    EigenArrayMap<T> inv_var_tile_data(
-        inv_var_tile.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
-
-    mean_tile_data = mean_arr.transpose().replicate(sample_size, 1);
-    inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1);
-
-    ConstEigenVectorArrayMap<T> scale_arr(Scale->data<T>(), C);
-
-    Tensor scale_tile;
-    scale_tile.Resize({sample_size, NxC});
-    scale_tile.mutable_data<T>(ctx.GetPlace());
-    EigenArrayMap<T> scale_tile_data(scale_tile.mutable_data<T>(ctx.GetPlace()),
-                                     sample_size, NxC);
-    scale_tile_data = scale_arr.transpose().replicate(sample_size, N);
-
-    ConstEigenArrayMap<T> dy_arr(dY->data<T>(), sample_size, NxC);
-    ConstEigenArrayMap<T> ddx_arr(ddX->data<T>(), sample_size, NxC);
-
-    // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx,
-    // axis=(h,w)) *
-    //          np.sum(dy, axis=(h,w)) -
-    //          np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean),
-    //          axis=(h,w)) * inv_var.pow(2) *
-    //          np.sum(ddx * (x - mean), axis=(h,w))) + inv_var.pow(3) / HxW *
-    //          np.sum(ddx * (x - mean)) *
-    //          (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW *
-    //          np.sum(dy,
-    //          axis=(h,w)) * (x - mean) *
-    //          (np.mean(ddx, axis=(h,w)) - ddx) + ddr * (dy * inv_var - inv_var
-    //          *
-    //          np.mean(dy, axis=(h,w)) -
-    //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
-    //          axis=(h,w))))
-
-    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    math::SetConstant<platform::CPUDeviceContext, T> set_constant;
-
-    Tensor x_sub_mean_mul_invstd;
-    x_sub_mean_mul_invstd.Resize({sample_size, NxC});
-    x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace());
-    EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
-        x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace()), sample_size,
-        NxC);
-    x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
-
-    if (dX) {
-      dX->mutable_data<T>(ctx.GetPlace());
-      set_constant(dev_ctx, dX, static_cast<T>(0));
-      EigenArrayMap<T> dx_arr(dX->mutable_data<T>(ctx.GetPlace()), sample_size,
-                              NxC);
-
-      if (ddX) {
-        dx_arr +=
-            x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data /
-            sample_size *
-            (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size -
-             (dy_arr * ddx_arr).colwise().sum() +
-             3. * (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() *
-                 (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
-                 sample_size);
-
-        dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
-                  sample_size * inv_var_tile_data * inv_var_tile_data *
-                  (dy_arr.colwise().sum() / sample_size - dy_arr);
-
-        dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
-                  sample_size * inv_var_tile_data * inv_var_tile_data *
-                  (ddx_arr.colwise().sum() / sample_size - ddx_arr);
-
-        dx_arr = scale_tile_data * dx_arr.eval();
-      }
-      if (ddScale) {
-        ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
-
-        Tensor ddscale_tile;
-        ddscale_tile.Resize({sample_size, NxC});
-        ddscale_tile.mutable_data<T>(ctx.GetPlace());
-        EigenArrayMap<T> ddscale_tile_data(
-            ddscale_tile.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
-        ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N);
-
-        dx_arr += (dy_arr * inv_var_tile_data -
-                   dy_arr.colwise().sum() / sample_size * inv_var_tile_data -
-                   x_sub_mean_mul_invstd_arr * inv_var_tile_data *
-                       (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
-                       sample_size) *
-                  ddscale_tile_data;
-      }
-    }
-    if (dScale) {
-      // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) *
-      //            inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx
-      dScale->mutable_data<T>(ctx.GetPlace());
-      set_constant(dev_ctx, dScale, static_cast<T>(0));
-      EigenVectorArrayMap<T> dscale_arr(dScale->mutable_data<T>(ctx.GetPlace()),
-                                        C);
-      if (ddX) {
-        Tensor first_grad;
-        first_grad.Resize({sample_size, NxC});
-        first_grad.mutable_data<T>(ctx.GetPlace());
-        set_constant(dev_ctx, &first_grad, static_cast<T>(0));
-        EigenArrayMap<T> first_grad_arr(
-            first_grad.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
-
-        first_grad_arr +=
-            inv_var_tile_data *
-            (dy_arr - dy_arr.colwise().sum() / sample_size -
-             x_sub_mean_mul_invstd_arr *
-                 (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
-                 sample_size);
-        first_grad_arr = first_grad_arr.eval() * ddx_arr;
-        for (int nc = 0; nc < NxC; ++nc) {
-          int c = nc % C;
-          dscale_arr(c) += first_grad_arr.colwise().sum()(nc);
-        }
-      }
-    }
-    if (ddY) {
-      // math: ddy = (x - mean) * inv_var * ddscale + ddbias +
-      //           scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
-      //           np.mean(ddx * (x - mean), axis=(h,w)))
-      ddY->mutable_data<T>(ctx.GetPlace());
-      set_constant(dev_ctx, ddY, static_cast<T>(0));
-      EigenArrayMap<T> ddy_arr(ddY->mutable_data<T>(ctx.GetPlace()),
-                               sample_size, NxC);
-      if (ddX) {
-        ddy_arr += scale_tile_data * inv_var_tile_data *
-                   (ddx_arr - ddx_arr.colwise().sum() / sample_size -
-                    x_sub_mean_mul_invstd_arr *
-                        (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
-                        sample_size);
-      }
-      if (ddScale && ddBias) {
-        ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
-        Tensor ddscale_tile;
-        ddscale_tile.Resize({sample_size, NxC});
-        ddscale_tile.mutable_data<T>(ctx.GetPlace());
-        EigenArrayMap<T> ddscale_tile_data(
-            ddscale_tile.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
-        ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N);
-
-        ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
-        Tensor ddbias_tile;
-        ddbias_tile.Resize({sample_size, NxC});
-        ddbias_tile.mutable_data<T>(ctx.GetPlace());
-        EigenArrayMap<T> ddbias_tile_data(
-            ddbias_tile.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
-        ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N);
-
-        ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
-        ddy_arr += ddbias_tile_data;
-      }
-    }
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInference,
-                           {"DY", "DDY"});
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(instance_norm, ops::InstanceNormOp, ops::InstanceNormOpMaker,
-                  ops::InstanceNormOpInferVarType, ops::InstanceNormGradMaker);
-REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp,
-                  ops::InstanceNormDoubleGradMaker);
-REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp,
-                  ops::InstanceNormDoubleGradOpInplaceInference);
-
-REGISTER_OP_CPU_KERNEL(
-    instance_norm,
-    ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    instance_norm_grad,
-    ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    instance_norm_grad_grad,
-    ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                      float>,
-    ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                      double>);
diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu
deleted file mode 100644
index 3f0799fbdbd40c29a6098a4ffcd93b4bd31fb70f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/instance_norm_op.cu
+++ /dev/null
@@ -1,655 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <cfloat>
-#include <string>
-#include <vector>
-#include "cub/cub.cuh"
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/operators/instance_norm_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using DataLayout = framework::DataLayout;
-template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
-template <typename T>
-using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
-
-template <typename T>
-static __global__ void repeat_param(const T *input, T *output,
-                                    const int repeat_num, const int C) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < repeat_num * C;
-       i += blockDim.x * gridDim.x) {
-    int index = i % C;
-    output[i] = input[index];
-  }
-}
-
-template <typename T, int BlockDim, bool AVG>
-static __global__ void add_param(const T *input, T *output,
-                                 const int repeat_num, const int C) {
-  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage ou_storage;
-  for (int i = blockIdx.x; i < C; i += gridDim.x) {
-    T ou = static_cast<T>(0);
-    for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) {
-      const int index = j * C + i;
-      ou += static_cast<T>(input[index]);
-    }
-    ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum());
-    if (threadIdx.x == 0) {
-      output[i] = ou;
-    }
-    __syncthreads();
-
-    if (AVG) {
-      output[i] /= repeat_num;
-    }
-  }
-}
-
-template <typename T>
-class InstanceNormKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must be CUDAPlace.");
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-
-    auto *x = ctx.Input<Tensor>("X");
-    auto &x_dims = x->dims();
-    PADDLE_ENFORCE_GE(
-        x_dims.size(), 2,
-        "the dimension of input X must greater than or equal to 2");
-    PADDLE_ENFORCE_LE(
-        x_dims.size(), 5,
-        "the dimension of input X must smaller than or equal to 5");
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
-    int NxC = N * C;
-    Tensor x_tmp;
-    x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});
-
-    auto *y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
-
-    cudnnTensorDescriptor_t data_desc_;
-    cudnnTensorDescriptor_t in_param_desc_;
-
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
-
-    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
-      LOG(ERROR) << "Provided epsilon is smaller than "
-                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
-                 << "CUDNN_BN_MIN_EPSILON instead.";
-    }
-    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-
-    VLOG(3) << "Setting descriptors.";
-    std::vector<int> dims;
-    std::vector<int> strides;
-    dims = {1, NxC, H, W, D};
-    strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        data_desc_, CudnnDataType<T>::type,
-        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
-
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-
-    Tensor scale_tmp =
-        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
-    scale_tmp.mutable_data<T>(ctx.GetPlace());
-    Tensor bias_tmp =
-        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
-    bias_tmp.mutable_data<T>(ctx.GetPlace());
-
-    const int n = x->numel();
-    const int block = 512;
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    const int grid = std::min((NxC + block - 1) / block, max_blocks);
-
-    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        scale->data<T>(), scale_tmp.data<T>(), N, C);
-    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        bias->data<T>(), bias_tmp.data<T>(), N, C);
-
-    auto handle = dev_ctx.cudnn_handle();
-
-    math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-        functor;
-
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
-    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
-    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
-        handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
-        data_desc_, y->template mutable_data<T>(ctx.GetPlace()), in_param_desc_,
-        scale_tmp.template data<BatchNormParamType<T>>(),
-        bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr, nullptr,
-        epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
-                     ctx.GetPlace()),
-        saved_variance->template mutable_data<BatchNormParamType<T>>(
-            ctx.GetPlace())));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
-  }
-};
-
-template <typename T, int BlockDim>
-static __global__ void GradComputeDX(const T *dy,
-                                     const BatchNormParamType<T> *scale,
-                                     const BatchNormParamType<T> *mean,
-                                     const T *x,
-                                     const BatchNormParamType<T> *variance,
-                                     const int C, const int sample_size,
-                                     T *dx) {
-  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
-  int end_idx = (blockIdx.x + 1) * sample_size;
-  int ncid = blockIdx.x;
-  int c = ncid % C;
-
-  BatchNormParamType<T> mean_val = mean[ncid];
-  BatchNormParamType<T> inv_var_val = variance[ncid];
-
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage dy_storage;
-  __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
-  __shared__ BatchNormParamType<T> dy_sum_val;
-  __shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
-
-  BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
-  BatchNormParamType<T> dy_x_sub_mean_sum =
-      static_cast<BatchNormParamType<T>>(0);
-
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    BatchNormParamType<T> dy_i = static_cast<BatchNormParamType<T>>(dy[i]);
-    dy_sum += dy_i;
-    dy_x_sub_mean_sum +=
-        dy_i * (static_cast<BatchNormParamType<T>>(x[i]) - mean_val);
-  }
-  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
-  dy_x_sub_mean_sum =
-      BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum());
-
-  if (threadIdx.x == 0) {
-    dy_sum_val = dy_sum;
-    dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
-  }
-  __syncthreads();
-
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    dx[i] =
-        (static_cast<BatchNormParamType<T>>(dy[i]) -
-         dy_sum_val / static_cast<BatchNormParamType<T>>(sample_size) -
-         (static_cast<BatchNormParamType<T>>(x[i]) - mean_val) *
-             dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) *
-        scale[c] * inv_var_val;
-  }
-}
-
-template <typename T>
-class InstanceNormGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must use CUDAPlace.");
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-
-    const auto &x_dims = x->dims();
-
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
-    int NxC = N * C;
-
-    Tensor x_tmp, d_y_tmp;
-    x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});
-    d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D});
-
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    d_x->mutable_data<T>(ctx.GetPlace());
-    if (d_scale && d_bias) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      d_bias->mutable_data<T>(ctx.GetPlace());
-    }
-    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
-    PADDLE_ENFORCE_EQ(scale->dims()[0], C);
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-
-    const int n = x->numel();
-    const int block = 512;
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    const int grid = std::min(NxC, max_blocks);
-    const int grid1 = (C + block - 1) / block;
-
-    Tensor scale_tmp =
-        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
-    scale_tmp.mutable_data<T>(ctx.GetPlace());
-    Tensor d_scale_tmp =
-        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
-    Tensor d_bias_tmp =
-        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
-    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        scale->data<T>(), scale_tmp.data<T>(), N, C);
-
-    std::vector<int> dims;
-    std::vector<int> strides;
-    dims = {1, NxC, H, W, D};
-    strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
-
-    if ((H * W * D) == 1) {
-      framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
-      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-          functor;
-      functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
-      functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
-      return;
-    }
-
-    cudnnTensorDescriptor_t data_desc_;
-    cudnnTensorDescriptor_t in_param_desc_;
-
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
-    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
-      LOG(ERROR) << "Provided epsilon is smaller than "
-                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
-                 << "CUDNN_BN_MIN_EPSILON instead.";
-    }
-    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        data_desc_, CudnnDataType<T>::type,
-        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
-
-    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
-    const auto *saved_mean_data =
-        saved_mean->template data<BatchNormParamType<T>>();
-    const auto *saved_var_data =
-        saved_var->template data<BatchNormParamType<T>>();
-    if (d_scale && d_bias) {
-      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
-          dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
-          CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
-          CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
-          x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
-          data_desc_, d_x->template mutable_data<T>(ctx.GetPlace()),
-          in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
-          d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
-              ctx.GetPlace()),
-          d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
-              ctx.GetPlace()),
-          epsilon, saved_mean_data, saved_var_data));
-    } else {
-      if (d_x) {
-        GradComputeDX<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-            d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
-            saved_mean_data, x->data<T>(), saved_var_data, C, H * W * D,
-            d_x->data<T>());
-      }
-    }
-
-    if (d_scale && d_bias) {
-      add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
-          d_scale_tmp.data<T>(), d_scale->data<T>(), N, C);
-      add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
-          d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
-    }
-
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
-  }
-};
-
-static __device__ __forceinline__ float real_sqrt(float x) {
-  return 1. / sqrtf(x);
-}
-static __device__ __forceinline__ double real_sqrt(double x) {
-  return 1. / sqrt(x);
-}
-
-template <typename T, int BlockDim>
-__global__ void DoubleGradComputeDX(const T *x, const T *mean,
-                                    const T *variance, const T *ddx,
-                                    const T *dy, const T *scale,
-                                    const T *ddscale, int C, int sample_size,
-                                    const double epsilon, T *dx) {
-  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
-  int end_idx = (blockIdx.x + 1) * sample_size;
-  int ncid = blockIdx.x;
-  int c = ncid % C;
-
-  T mean_val = mean[ncid];
-  T var_val = variance[ncid];
-
-  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage dy_storage;
-  __shared__ typename BlockReduce::TempStorage ddx_storage;
-  __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage;
-  __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
-  __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
-  __shared__ T dy_sum_val;
-  __shared__ T ddx_sum_val;
-  __shared__ T dy_mul_ddx_sum_val;
-  __shared__ T dy_mul_x_sub_mean_sum_val;
-  __shared__ T ddx_mul_x_sub_mean_sum_val;
-
-  T dy_sum = 0;
-  T ddx_sum = 0;
-  T dy_mul_ddx_sum = 0;
-  T dy_mul_x_sub_mean_sum = 0;
-  T ddx_mul_x_sub_mean_sum = 0;
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    T ddx_i = ddx[i];
-    T dy_i = dy[i];
-    T tmp = x[i] - mean_val;
-
-    dy_sum += dy_i;
-    ddx_sum += ddx_i;
-    dy_mul_ddx_sum += (ddx_i * dy_i);
-
-    dy_mul_x_sub_mean_sum += (dy_i * tmp);
-    ddx_mul_x_sub_mean_sum += (ddx_i * tmp);
-  }
-
-  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
-  ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum());
-  dy_mul_ddx_sum =
-      BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum());
-  dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
-                              .Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
-  ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage)
-                               .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum());
-
-  if (threadIdx.x == 0) {
-    dy_sum_val = dy_sum;
-    ddx_sum_val = ddx_sum;
-    dy_mul_ddx_sum_val = dy_mul_ddx_sum;
-    dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
-    ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum;
-  }
-  __syncthreads();
-
-  if (ddx != nullptr) {
-    for (int i = beg_idx; i < end_idx; i += BlockDim) {
-      dx[i] +=
-          ((x[i] - mean_val) * var_val * var_val * var_val / sample_size *
-               (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val +
-                3. * dy_mul_x_sub_mean_sum_val * var_val *
-                    ddx_mul_x_sub_mean_sum_val * var_val / sample_size) +
-           ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val *
-               var_val * (dy_sum_val / sample_size - dy[i]) +
-           dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val *
-               var_val * (ddx_sum_val / sample_size - ddx[i])) *
-          scale[c];
-    }
-  }
-  __syncthreads();
-  if (ddscale != nullptr) {
-    for (int i = beg_idx; i < end_idx; i += BlockDim) {
-      dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val -
-                (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val *
-                    var_val / sample_size) *
-               ddscale[c];
-    }
-  }
-}
-
-template <typename T, int BlockDim>
-__global__ void DoubleGradComputeDDY(const T *x, const T *mean,
-                                     const T *variance, const T *ddscale,
-                                     const T *ddbias, const T *ddx,
-                                     const T *scale, int C, int sample_size,
-                                     const double epsilon, T *ddy) {
-  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
-  int end_idx = (blockIdx.x + 1) * sample_size;
-  int ncid = blockIdx.x;
-  int c = ncid % C;
-
-  T mean_val = mean[ncid];
-  T var_val = variance[ncid];
-
-  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage ddx_storage;
-  __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
-  __shared__ T ddx_sum_val;
-  __shared__ T ddx_mul_x_sub_mean_sum_val;
-
-  T ddx_sum = 0;
-  T ddx_mul_x_sub_mean_sum = 0;
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    T ddx_i = ddx[i];
-    ddx_sum += ddx_i;
-    ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val));
-  }
-  ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum());
-  ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage)
-                               .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum());
-
-  if (threadIdx.x == 0) {
-    ddx_sum_val = ddx_sum;
-    ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum;
-  }
-  __syncthreads();
-
-  if (ddx != nullptr) {
-    for (int i = beg_idx; i < end_idx; i += BlockDim) {
-      ddy[i] += scale[c] * var_val *
-                (ddx[i] - ddx_sum_val / sample_size -
-                 (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val *
-                     var_val / sample_size);
-    }
-  }
-  __syncthreads();
-  if (ddscale != nullptr) {
-    for (int i = beg_idx; i < end_idx; i += BlockDim) {
-      ddy[i] += (x[i] - mean_val) * var_val * ddscale[c];
-    }
-  }
-  __syncthreads();
-  if (ddbias != nullptr) {
-    for (int i = beg_idx; i < end_idx; i += BlockDim) {
-      ddy[i] += ddbias[c];
-    }
-  }
-}
-
-template <typename T, int BlockDim>
-__global__ void DoubleGradComputeDScale(const T *x, const T *mean,
-                                        const T *variance, const T *ddx,
-                                        const T *dy, int C, int sample_size,
-                                        const double epsilon, T *dscale) {
-  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
-  int end_idx = (blockIdx.x + 1) * sample_size;
-  int ncid = blockIdx.x;
-  int c = ncid % C;
-
-  T mean_val = mean[ncid];
-  T var_val = variance[ncid];
-
-  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage dy_storage;
-  __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
-  __shared__ typename BlockReduce::TempStorage dscale_tmp_storage;
-  __shared__ T dy_sum_val;
-  __shared__ T dy_mul_x_sub_mean_sum_val;
-
-  T dy_sum = 0;
-  T dy_mul_x_sub_mean_sum = 0;
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    T dy_i = dy[i];
-    dy_sum += dy_i;
-    dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val));
-  }
-  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
-  dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
-                              .Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
-
-  if (threadIdx.x == 0) {
-    dy_sum_val = dy_sum;
-    dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
-  }
-  __syncthreads();
-
-  if (ddx != nullptr) {
-    T dscale_tmp = 0;
-    for (int i = beg_idx; i < end_idx; i += BlockDim) {
-      dscale_tmp +=
-          ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size -
-                              dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) *
-                                  var_val * var_val / sample_size);
-    }
-    dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum());
-
-    if (threadIdx.x == 0) {
-      dscale[ncid] += dscale_tmp;
-    }
-    __syncthreads();
-  }
-}
-
-template <typename T>
-class InstanceNormDoubleGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *X = ctx.Input<Tensor>("X");
-    const auto *Scale = ctx.Input<Tensor>("Scale");
-    const auto *dY = ctx.Input<Tensor>("DY");
-    const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
-    const auto *running_mean = ctx.Input<Tensor>("Mean");
-    const auto *running_var = ctx.Input<Tensor>("Variance");
-    const auto *ddX = ctx.Input<Tensor>("DDX");
-    const auto *ddScale = ctx.Input<Tensor>("DDScale");
-    const auto *ddBias = ctx.Input<Tensor>("DDBias");
-    const double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-
-    auto *dX = ctx.Output<Tensor>("DX");
-    auto *dScale = ctx.Output<Tensor>("DScale");
-    auto *ddY = ctx.Output<Tensor>("DDY");
-
-    const T *x_data = X->data<T>();
-    const T *scale_data = Scale->data<T>();
-    const T *dy_data = dY->data<T>();
-    const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data<T>());
-
-    const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
-    const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data<T>());
-
-    const T *mean_data = Saved_mean->data<T>();
-    const T *variance_data = Saved_variance->data<T>();
-
-    auto &x_dims = X->dims();
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
-    int NxC = N * C;
-    const int n = X->numel();
-    int sample_size = n / N / C;
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    const int block = 512;
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    const int grid = NxC;
-    const int grid1 = (C + block - 1) / block;
-
-    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
-
-    if (dX) {
-      T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, dX, static_cast<T>(0));
-      DoubleGradComputeDX<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-          x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
-          ddscale_data, C, sample_size, epsilon, dx_data);
-    }
-    if (dScale) {
-      Tensor dscale_tmp =
-          ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
-      set_zero(dev_ctx, &dscale_tmp, static_cast<T>(0));
-      T *dscale_tmp_data = dscale_tmp.mutable_data<T>(ctx.GetPlace());
-
-      T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, dScale, static_cast<T>(0));
-      DoubleGradComputeDScale<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-          x_data, mean_data, variance_data, ddx_data, dy_data, C, sample_size,
-          epsilon, dscale_tmp_data);
-      add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
-          dscale_tmp.data<T>(), dScale->data<T>(), N, C);
-    }
-    if (ddY) {
-      T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, ddY, static_cast<T>(0));
-      DoubleGradComputeDDY<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-          x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data,
-          scale_data, C, sample_size, epsilon, ddy_data);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    instance_norm, ops::InstanceNormKernel<plat::CUDADeviceContext, float>,
-    ops::InstanceNormKernel<plat::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    instance_norm_grad,
-    ops::InstanceNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::InstanceNormGradKernel<plat::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    instance_norm_grad_grad,
-    ops::InstanceNormDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                      float>,
-    ops::InstanceNormDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                      double>);
diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h
deleted file mode 100644
index 509c1ff038d1f2169bc54bbdef5f8dc210a78120..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/instance_norm_op.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/norm_utils.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
-class InstanceNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override;
-};
-
-class InstanceNormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override;
-};
-
-class InstanceNormDoubleGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override;
-};
-
-class InstanceNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-class InstanceNormGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override;
-};
-
-class InstanceNormDoubleGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override;
-};
-
-class InstanceNormOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", "Y"}};
-  }
-};
-
-template <typename DeviceContext, typename T>
-class InstanceNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override;
-};
-
-template <typename DeviceContext, typename T>
-class InstanceNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override;
-};
-
-template <typename DeviceContext, typename T>
-class InstanceNormDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
deleted file mode 100644
index 612f770bb7cee695724a39635bdd2d884813d7ca..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/interpolate_op.cc
+++ /dev/null
@@ -1,484 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/interpolate_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using DataLayout = framework::DataLayout;
-
-static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
-  auto dim_x = ctx->GetInputDim("X");
-  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
-
-  PADDLE_ENFORCE(
-      "bilinear" == interp_method || "nearest" == interp_method,
-      "Interpolation method can only be \"bilinear\" or \"nearest\" when "
-      "Input(X) dimension is 4");
-  const DataLayout data_layout = framework::StringToDataLayout(
-      ctx->Attrs().Get<std::string>("data_layout"));
-
-  if (ctx->HasInputs("SizeTensor")) {
-    // top prority size
-    auto inputs_name = ctx->Inputs("SizeTensor");
-    PADDLE_ENFORCE_EQ(
-        inputs_name.size(), 2,
-        "Input(SizeTensor)'size of Op(interpolate) must be 2. "
-        "Attr(out_shape)'s length must be 2 for 4-D input tensor.");
-    int out_h = ctx->Attrs().Get<int>("out_h");
-    int out_w = ctx->Attrs().Get<int>("out_w");
-    framework::DDim dim_out;
-    if (data_layout == DataLayout::kNCHW) {
-      dim_out = {dim_x[0], dim_x[1], out_h, out_w};
-    } else {
-      dim_out = {dim_x[0], out_h, out_w, dim_x[3]};
-    }
-    ctx->SetOutputDim("Out", dim_out);
-
-    return;
-  }
-
-  int out_h, out_w;
-  if (ctx->HasInput("Scale")) {
-    auto scale_tensor = ctx->GetInputDim("Scale");
-    PADDLE_ENFORCE_EQ(scale_tensor.size(), 1,
-                      "Scale's dimension size must be 1.");
-    out_h = -1;
-    out_w = -1;
-  } else {
-    float scale = ctx->Attrs().Get<float>("scale");
-    if (scale > 0) {
-      // round down
-      out_h = (data_layout == DataLayout::kNCHW
-                   ? static_cast<int>(dim_x[2] * scale)
-                   : static_cast<int>(dim_x[1] * scale));
-      out_w = (data_layout == DataLayout::kNCHW
-                   ? static_cast<int>(dim_x[3] * scale)
-                   : static_cast<int>(dim_x[2] * scale));
-      // protect when input shape is -1
-      out_h = out_h > 0 ? out_h : -1;
-      out_w = out_w > 0 ? out_w : -1;
-    } else {
-      out_h = ctx->Attrs().Get<int>("out_h");
-      out_w = ctx->Attrs().Get<int>("out_w");
-    }
-  }
-
-  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
-    auto out_size_dim = ctx->GetInputDim("OutSize");
-    PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
-                      "OutSize's dimension size must be 1");
-    PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
-    ctx->ShareLoD("X", "Out");
-    return;
-  }
-
-  framework::DDim dim_out;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_out = {dim_x[0], dim_x[1], out_h, out_w};
-  } else {
-    dim_out = {dim_x[0], out_h, out_w, dim_x[3]};
-  }
-  ctx->SetOutputDim("Out", dim_out);
-}
-
-static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
-  auto dim_x = ctx->GetInputDim("X");
-  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
-
-  PADDLE_ENFORCE("trilinear" == interp_method,
-                 "Interpolation method can only be \"trilinear\" when Input(X) "
-                 "dimension is 5");
-  const DataLayout data_layout = framework::StringToDataLayout(
-      ctx->Attrs().Get<std::string>("data_layout"));
-
-  if (ctx->HasInputs("SizeTensor")) {
-    // top prority size
-    auto inputs_name = ctx->Inputs("SizeTensor");
-    PADDLE_ENFORCE_EQ(
-        inputs_name.size(), 3,
-        "Input(SizeTensor)'s size of Op(interpolate) must be 3. "
-        "Attr(out_shape)'s length must be 3 for 5-D input tensor.");
-    int out_d = ctx->Attrs().Get<int>("out_d");
-    int out_h = ctx->Attrs().Get<int>("out_h");
-    int out_w = ctx->Attrs().Get<int>("out_w");
-    framework::DDim dim_out;
-    if (data_layout == DataLayout::kNCHW) {
-      dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w};
-    } else {
-      dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]};
-    }
-    ctx->SetOutputDim("Out", dim_out);
-
-    return;
-  }
-
-  int out_d, out_h, out_w;
-  if (ctx->HasInput("Scale")) {
-    auto scale_tensor = ctx->GetInputDim("Scale");
-    PADDLE_ENFORCE_EQ(scale_tensor.size(), 1,
-                      "Scale's dimension size must be 1");
-    out_d = -1;
-    out_h = -1;
-    out_w = -1;
-  } else {
-    float scale = ctx->Attrs().Get<float>("scale");
-    if (scale > 0) {
-      // round down
-      out_d = (data_layout == DataLayout::kNCHW
-                   ? static_cast<int>(dim_x[2] * scale)
-                   : static_cast<int>(dim_x[1] * scale));
-      out_h = (data_layout == DataLayout::kNCHW
-                   ? static_cast<int>(dim_x[3] * scale)
-                   : static_cast<int>(dim_x[2] * scale));
-      out_w = (data_layout == DataLayout::kNCHW
-                   ? static_cast<int>(dim_x[4] * scale)
-                   : static_cast<int>(dim_x[3] * scale));
-      // protect when input shape is -1
-      out_d = out_d > 0 ? out_d : -1;
-      out_h = out_h > 0 ? out_h : -1;
-      out_w = out_w > 0 ? out_w : -1;
-    } else {
-      out_d = ctx->Attrs().Get<int>("out_d");
-      out_h = ctx->Attrs().Get<int>("out_h");
-      out_w = ctx->Attrs().Get<int>("out_w");
-    }
-  }
-
-  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
-    auto out_size_dim = ctx->GetInputDim("OutSize");
-    PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
-                      "OutSize's dimension size must be 1");
-    PADDLE_ENFORCE_EQ(out_size_dim[0], 3, "OutSize's dim[0] must be 3");
-    ctx->ShareLoD("X", "Out");
-    return;
-  }
-
-  framework::DDim dim_out;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w};
-  } else {
-    dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]};
-  }
-  ctx->SetOutputDim("Out", dim_out);
-}
-
-class InterpolateOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of InterpolateOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of InterpolationOp should not be null.");
-
-    auto dim_x = ctx->GetInputDim("X");  // NCHW format
-    PADDLE_ENFORCE(dim_x.size() == 4 || dim_x.size() == 5,
-                   "Input(X) dimension must be 4 or 5");
-
-    if (dim_x.size() == 4) {
-      // shape check for 2D interpolate for input tensor shape NCHW
-      Interpolate2DInferShapeCheck(ctx);
-    } else {  // dim_x.size() == 5
-      // shape check for 3D interpolate for input tensor shape NCDHW
-      Interpolate3DInferShapeCheck(ctx);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "SizeTensor" || var_name == "Scale") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of interpolate operator, "
-             "This is a 4-D tensor with shape of [N, C, H, W] or a "
-             "5-D tensor with shape of [N, C, D, H, W].");
-    AddInput("OutSize",
-             "This is a 1-D tensor with two numbers to specify output size. "
-             "It should be [output_height, output_width] when input is a 4-D "
-             "tensor and should be [output_depth, output_height, output_width] "
-             "when input is a 5-D tensor. It has a higher priority than "
-             "the attr(out_d), attr(out_h), attr(out_w) and attr(scale).")
-        .AsDispensable();
-    AddInput("SizeTensor",
-             "(vector<Tensor<int32>>, optional). If provided, interpolate will "
-             "use this. The shape of the tensor in vector MUST BE [1]. "
-             "It has the highest priority compare with Input(OutSize) and "
-             "attr(out_d), attr(out_h), attr(out_w) and attr(scale).")
-        .AsDuplicable()
-        .AsDispensable();
-    AddInput("Scale",
-             "This is a 1-D tensor with one number to specify output scale. "
-             "It has the higher priority compare with attr(scale).")
-        .AsDispensable();
-    AddOutput("Out",
-              "The output tensor of interpolate operator, "
-              "This is a tensor in same rank with Input(X).");
-
-    AddAttr<std::string>(
-        "data_layout",
-        "(string, default NCHW) Only used in "
-        "an optional string from: \"NHWC\", \"NCHW\". "
-        "Specify that the data format of the input and output data is "
-        "channel_first or channel_last.")
-        .SetDefault("NCHW");
-    AddAttr<int>("out_d", "output depth of interpolate op.").SetDefault(0);
-    AddAttr<int>("out_h", "output height of interpolate op.").SetDefault(0);
-    AddAttr<int>("out_w", "output width of interpolate op.").SetDefault(0);
-    AddAttr<float>("scale", "scale factor of interpolate op.").SetDefault(0.);
-    AddAttr<std::string>("interp_method",
-                         "(string, default \"bilinear\"), interpolation "
-                         "method, can be \"bilinear\" for "
-                         "bilinear interpolation, \"trilinear\" for trilinear "
-                         "interpolation and \"nearest\" for nearest "
-                         "neighbor interpolation.")
-        .SetDefault("bilinear");
-    AddAttr<bool>(
-        "align_corners",
-        "an optional bool. Defaults to True. "
-        "If True, the centers of 4 corner pixels of the input and output "
-        "tensors are aligned, preserving the values at the corner pixels, "
-        "If False, are not aligned")
-        .SetDefault(true);
-    AddAttr<int>("align_mode",
-                 "(int, default \'1\'), optional for bilinear interpolation, "
-                 "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , "
-                 "can be \'1\' for src_idx = scale*dst_index .")
-        .SetDefault(1);
-    AddComment(R"DOC(
-          This operator samples input X to given output shape by using specified
-          interpolation method, the interpolation methods can be \"nearest\"
-          for nearest neighbor interpolation and \"bilinear\" for bilinear 
-          interpolation.
-
-          Nearest neighbor interpolation is to perform nearest neighbor interpolation
-          in both the 3rd dimention(in height direction) and the 4th dimention(in width 
-          direction) on input tensor.
-            
-          Bilinear interpolation is an extension of linear interpolation for 
-          interpolating functions of two variables (e.g. H-direction and 
-          W-direction in this op) on a rectilinear 2D grid. The key idea is 
-          to perform linear interpolation first in one direction, and then 
-          again in the other direction.
-
-          Trilinear interpolation is an extension of linear interpolation for 
-          interpolating functions of three variables (e.g. D-direction, 
-          H-direction and W-direction in this op) on a rectilinear 3D grid. 
-          The linear interpolation is performed on three directions.
-
-          Align_corners and align_mode are optinal parameters,the calculation method 
-          of interpolation can be selected by them.
-          
-          Example:
-
-          For scale:
-          
-            if align_corners = True and out_{size}>1 :
-
-              scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0)
-            
-            else:
-              
-              scale_{factor} = float(in_{size}/out_{size})
-            
-          
-          Nearest neighbor interpolation:
-          
-          if:
-              align_corners = False
-
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-
-              H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
-              W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
-
-          else:
-              align_corners = True
-
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-
-              H_out = round(H_{in} * scale_{factor})
-              W_out = round(W_{in} * scale_{factor})
-
-          Bilinear interpolation:
-
-          if:
-              align_corners = False , align_mode = 0
-              
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-              
-              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
-
-
-          else:
-           
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-
-              H_out = H_{in} * scale_{factor}
-              W_out = W_{in} * scale_{factor}
-
-          Trilinear interpolation:
-
-          if:
-              align_corners = False , align_mode = 0
-              
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-              
-              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
-              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
-
-
-          else:
-           
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-
-              D_out = D_{in} * scale_{factor}
-              H_out = H_{in} * scale_{factor}
-              W_out = W_{in} * scale_{factor}
-          
-
-          For details of nearest neighbor interpolation, please refer to Wikipedia: 
-          https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
-
-          For details of bilinear interpolation, please refer to Wikipedia: 
-          https://en.wikipedia.org/wiki/Bilinear_interpolation
-
-          For details of trilinear interpolation, please refer to Wikipedia: 
-          https://en.wikipedia.org/wiki/Trilinear_interpolation
-         )DOC");
-  }
-};
-
-class InterpolateOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto dim_x = ctx->GetInputDim("X");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "SizeTensor" || var_name == "Scale") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType(ForwardOp().Type() + "_grad");
-    op->SetInput("X", Input("X"));
-    if (ForwardOp().Inputs().count("SizeTensor") > 0) {
-      op->SetInput("SizeTensor", Input("SizeTensor"));
-    }
-    if (ForwardOp().Inputs().count("OutSize") > 0) {
-      op->SetInput("OutSize", Input("OutSize"));
-    }
-    if (ForwardOp().Inputs().count("Scale") > 0) {
-      op->SetInput("Scale", Input("Scale"));
-    }
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(InterpolateGradNoNeedBufferVarsInference,
-                                      "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
-                  ops::InterpolateGradDescMaker);
-REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad,
-                  ops::InterpolateGradNoNeedBufferVarsInference);
-REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
-                  ops::InterpolateGradDescMaker);
-REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad,
-                  ops::InterpolateGradNoNeedBufferVarsInference);
-REGISTER_OPERATOR(trilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
-                  ops::InterpolateGradDescMaker);
-REGISTER_OPERATOR(trilinear_interp_grad, ops::InterpolateOpGrad,
-                  ops::InterpolateGradNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel<float>,
-                       ops::InterpolateKernel<double>,
-                       ops::InterpolateKernel<uint8_t>);
-REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, ops::InterpolateGradKernel<float>,
-                       ops::InterpolateGradKernel<double>);
-REGISTER_OP_CPU_KERNEL(nearest_interp, ops::InterpolateKernel<float>,
-                       ops::InterpolateKernel<double>,
-                       ops::InterpolateKernel<uint8_t>);
-REGISTER_OP_CPU_KERNEL(nearest_interp_grad, ops::InterpolateGradKernel<float>,
-                       ops::InterpolateGradKernel<double>);
-REGISTER_OP_CPU_KERNEL(trilinear_interp, ops::InterpolateKernel<float>,
-                       ops::InterpolateKernel<double>,
-                       ops::InterpolateKernel<uint8_t>);
-REGISTER_OP_CPU_KERNEL(trilinear_interp_grad, ops::InterpolateGradKernel<float>,
-                       ops::InterpolateGradKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu
deleted file mode 100644
index 6121389c12864140b4822cb1e6a9bb3ec60b2239..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/interpolate_op.cu
+++ /dev/null
@@ -1,964 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <string>
-#include "paddle/fluid/operators/interpolate_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using DataLayout = framework::DataLayout;
-
-template <typename T>
-__global__ void KeNearestNeighborInterpFw(
-    const T* in, const size_t in_img_h, const size_t in_img_w,
-    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
-    const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w,
-    const bool align_corners, const DataLayout data_layout) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (; tid < nthreads; tid += stride) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-
-    int channel_id, out_img_idy, out_img_idx;
-    if (data_layout == DataLayout::kNCHW) {
-      channel_id = out_id_w / out_img_size;
-      out_img_idy = (out_id_w % out_img_size) / out_img_w;
-      out_img_idx = tid % out_img_w;
-    } else {
-      out_img_idy = out_id_w / (out_img_w * num_channels);
-      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
-      channel_id = tid % num_channels;
-    }
-
-    int in_img_idy = (align_corners)
-                         ? static_cast<int>(ratio_h * out_img_idy + 0.5)
-                         : static_cast<int>(ratio_h * out_img_idy);
-    int in_img_idx = (align_corners)
-                         ? static_cast<int>(ratio_w * out_img_idx + 0.5)
-                         : static_cast<int>(ratio_w * out_img_idx);
-
-    if (data_layout == DataLayout::kNCHW) {
-      out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
-                    in_img_idy * in_img_w + in_img_idx];
-    } else {
-      out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
-                    in_img_idx * num_channels + channel_id];
-    }
-  }
-}
-
-template <typename T>
-__global__ void KeNearestNeighborInterpBw(
-    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
-    const size_t input_w, const T* out, const size_t out_img_h,
-    const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w,
-    const bool align_corners, const DataLayout data_layout) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (; tid < nthreads; tid += stride) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-
-    int channel_id, out_img_idy, out_img_idx;
-    if (data_layout == DataLayout::kNCHW) {
-      channel_id = out_id_w / out_img_size;
-      out_img_idy = (out_id_w % out_img_size) / out_img_w;
-      out_img_idx = tid % out_img_w;
-    } else {
-      out_img_idy = out_id_w / (out_img_w * num_channels);
-      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
-      channel_id = tid % num_channels;
-    }
-
-    int in_img_idy = (align_corners)
-                         ? static_cast<int>(ratio_h * out_img_idy + 0.5)
-                         : static_cast<int>(ratio_h * out_img_idy);
-    int in_img_idx = (align_corners)
-                         ? static_cast<int>(ratio_w * out_img_idx + 0.5)
-                         : static_cast<int>(ratio_w * out_img_idx);
-
-    T* in_pos;
-    if (data_layout == DataLayout::kNCHW) {
-      in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
-                   in_img_idy * in_img_w + in_img_idx];
-    } else {
-      in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
-                   in_img_idx * num_channels + channel_id];
-    }
-    const T out_pos = out[out_id_h * output_w + out_id_w];
-    platform::CudaAtomicAdd(in_pos, out_pos);
-  }
-}
-
-template <typename T>
-__global__ void KeBilinearInterpFw(
-    const T* in, const size_t in_img_h, const size_t in_img_w,
-    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
-    const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w,
-    const bool align_corners, const int align_mode,
-    const DataLayout data_layout) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  bool align_flag = (align_mode == 0 && !align_corners);
-  for (; tid < nthreads; tid += stride) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-
-    int channel_id, out_img_idy, out_img_idx;
-    if (data_layout == DataLayout::kNCHW) {
-      channel_id = out_id_w / out_img_size;
-      out_img_idy = (out_id_w % out_img_size) / out_img_w;
-      out_img_idx = tid % out_img_w;
-    } else {
-      out_img_idy = out_id_w / (out_img_w * num_channels);
-      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
-      channel_id = tid % num_channels;
-    }
-
-    int in_img_idy = align_flag
-                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * out_img_idy);
-    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
-    src_h = (src_h > 0) ? src_h : 0;
-    T h1lambda =
-        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
-
-    int in_img_idx = align_flag
-                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
-                         : static_cast<int>(ratio_w * out_img_idx);
-    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
-    src_w = (src_w > 0) ? src_w : 0;
-    T w1lambda =
-        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
-
-    if (data_layout == DataLayout::kNCHW) {
-      const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
-                            in_img_idy * in_img_w + in_img_idx];
-
-      // bilinear interpolation
-      out[out_id_h * output_w + out_id_w] =
-          h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
-          h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
-                      w1lambda * in_pos[h_id * in_img_w + w_id]);
-    } else {
-      const T* in_pos =
-          &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
-              in_img_idx * num_channels + channel_id];
-
-      // bilinear interpolation
-      out[out_id_h * output_w + out_id_w] =
-          h2lambda *
-              (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) +
-          h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] +
-                      w1lambda * in_pos[h_id * in_img_w * num_channels +
-                                        w_id * num_channels]);
-    }
-  }
-}
-
-template <typename T>
-__global__ void KeBilinearInterpBw(
-    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
-    const size_t input_w, const T* out, const size_t out_img_h,
-    const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const T ratio_h, const T ratio_w,
-    const bool align_corners, const int align_mode,
-    const DataLayout data_layout) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  bool align_flag = (align_mode == 0 && !align_corners);
-  for (; tid < nthreads; tid += stride) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-
-    int channel_id, out_img_idy, out_img_idx;
-    if (data_layout == DataLayout::kNCHW) {
-      channel_id = out_id_w / out_img_size;
-      out_img_idy = (out_id_w % out_img_size) / out_img_w;
-      out_img_idx = tid % out_img_w;
-    } else {
-      out_img_idy = out_id_w / (out_img_w * num_channels);
-      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
-      channel_id = tid % num_channels;
-    }
-
-    int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5
-                                : ratio_h * out_img_idy;
-    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
-    src_h = (src_h > 0) ? src_h : 0;
-    T h1lambda =
-        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
-
-    int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5
-                                : ratio_w * out_img_idx;
-    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
-    src_w = (src_w > 0) ? src_w : 0;
-    T w1lambda =
-        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
-
-    T* in_pos;
-    if (data_layout == DataLayout::kNCHW) {
-      in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
-                   in_img_idy * in_img_w + in_img_idx];
-    } else {
-      in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
-                   in_img_idx * num_channels + channel_id];
-    }
-
-    const T* out_pos = &out[out_id_h * output_w + out_id_w];
-
-    if (data_layout == DataLayout::kNCHW) {
-      platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
-                              h1lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
-                              h1lambda * w1lambda * out_pos[0]);
-    } else {
-      platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
-                              h2lambda * w1lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
-                              h1lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(
-          &in_pos[h_id * in_img_w * num_channels + w_id * num_channels],
-          h1lambda * w1lambda * out_pos[0]);
-    }
-  }
-}
-
-template <typename T>
-__global__ void KeTrilinearInterpFw(
-    const T* in, const size_t in_img_d, const size_t in_img_h,
-    const size_t in_img_w, const size_t input_h, const size_t input_w, T* out,
-    const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
-    const size_t output_h, const size_t output_w, const size_t num_channels,
-    const float ratio_d, const float ratio_h, const float ratio_w,
-    const bool align_corners, const int align_mode,
-    const DataLayout data_layout) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  bool align_flag = (align_mode == 0 && !align_corners);
-  for (; tid < nthreads; tid += stride) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-
-    int channel_id, out_img_idt, out_img_idy, out_img_idx;
-    if (data_layout == DataLayout::kNCHW) {
-      channel_id = out_id_w / out_img_size;
-      out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
-      out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
-      out_img_idx = tid % out_img_w;
-    } else {
-      out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels);
-      out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) /
-                    (out_img_w * num_channels);
-      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
-      channel_id = tid % num_channels;
-    }
-
-    int in_img_idt = align_flag
-                         ? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * out_img_idt);
-    in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
-    int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
-    T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
-    src_d = (src_d > 0) ? src_d : 0;
-    T d1lambda =
-        align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
-    T d2lambda = 1.f - d1lambda;
-
-    int in_img_idy = align_flag
-                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * out_img_idy);
-    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
-    src_h = (src_h > 0) ? src_h : 0;
-    T h1lambda =
-        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
-
-    int in_img_idx = align_flag
-                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
-                         : static_cast<int>(ratio_w * out_img_idx);
-    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
-    src_w = (src_w > 0) ? src_w : 0;
-    T w1lambda =
-        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
-
-    if (data_layout == DataLayout::kNCHW) {
-      int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
-                        (in_img_idt * in_img_h + in_img_idy) * in_img_w +
-                        in_img_idx;
-      const T* in_pos1 = &in[in_pos1_idx];
-      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
-      const T* in_pos2 = &in[in_pos2_idx];
-
-      // trilinear interpolation
-      out[out_id_h * output_w + out_id_w] =
-          d2lambda *
-              (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) +
-               h1lambda * (w2lambda * in_pos1[h_id * in_img_w] +
-                           w1lambda * in_pos1[h_id * in_img_w + w_id])) +
-          d1lambda *
-              (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) +
-               h1lambda * (w2lambda * in_pos2[h_id * in_img_w] +
-                           w1lambda * in_pos2[h_id * in_img_w + w_id]));
-
-    } else {
-      int in_pos1_idx = out_id_h * input_w +
-                        in_img_idt * in_img_h * in_img_w * num_channels +
-                        in_img_idy * in_img_w * num_channels +
-                        in_img_idx * num_channels + channel_id;
-      const T* in_pos1 = &in[in_pos1_idx];
-      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels;
-      const T* in_pos2 = &in[in_pos2_idx];
-
-      // trilinear interpolation
-      out[out_id_h * output_w + out_id_w] =
-          d2lambda *
-              (h2lambda * (w2lambda * in_pos1[0] +
-                           w1lambda * in_pos1[w_id * num_channels]) +
-               h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] +
-                           w1lambda * in_pos1[h_id * in_img_w * num_channels +
-                                              w_id * num_channels])) +
-          d1lambda *
-              (h2lambda * (w2lambda * in_pos2[0] +
-                           w1lambda * in_pos2[w_id * num_channels]) +
-               h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] +
-                           w1lambda * in_pos2[h_id * in_img_w * num_channels +
-                                              w_id * num_channels]));
-    }
-  }
-}
-
-template <typename T>
-__global__ void KeTrilinearInterpBw(
-    T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w,
-    const size_t input_h, const size_t input_w, const T* out,
-    const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
-    const size_t output_h, const size_t output_w, const size_t num_channels,
-    const T ratio_d, const T ratio_h, const T ratio_w, const bool align_corners,
-    const int align_mode, const DataLayout data_layout) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  bool align_flag = (align_mode == 0 && !align_corners);
-  for (; tid < nthreads; tid += stride) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-
-    int channel_id, out_img_idt, out_img_idy, out_img_idx;
-    if (data_layout == DataLayout::kNCHW) {
-      channel_id = out_id_w / out_img_size;
-      out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
-      out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
-      out_img_idx = tid % out_img_w;
-    } else {
-      out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels);
-      out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) /
-                    (out_img_w * num_channels);
-      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
-      channel_id = tid % num_channels;
-    }
-
-    int in_img_idt = align_flag
-                         ? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * out_img_idt);
-    in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
-    int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
-    T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
-    src_d = (src_d > 0) ? src_d : 0;
-    T d1lambda =
-        align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
-    T d2lambda = 1.f - d1lambda;
-
-    int in_img_idy = align_flag
-                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * out_img_idy);
-    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
-    src_h = (src_h > 0) ? src_h : 0;
-    T h1lambda =
-        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
-
-    int in_img_idx = align_flag
-                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
-                         : static_cast<int>(ratio_w * out_img_idx);
-    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
-    src_w = (src_w > 0) ? src_w : 0;
-    T w1lambda =
-        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
-
-    if (data_layout == DataLayout::kNCHW) {
-      int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
-                        (in_img_idt * in_img_h + in_img_idy) * in_img_w +
-                        in_img_idx;
-      T* in_pos1 = &in[in_pos1_idx];
-      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
-      T* in_pos2 = &in[in_pos2_idx];
-
-      const T* out_pos = &out[out_id_h * output_w + out_id_w];
-
-      // trilinear interpolation grad
-      platform::CudaAtomicAdd(&in_pos1[0],
-                              d2lambda * h2lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos1[w_id],
-                              d2lambda * h2lambda * w1lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
-                              d2lambda * h1lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
-                              d2lambda * h1lambda * w1lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos2[0],
-                              d1lambda * h2lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos2[w_id],
-                              d1lambda * h2lambda * w1lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
-                              d1lambda * h1lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
-                              d1lambda * h1lambda * w1lambda * out_pos[0]);
-    } else {
-      int in_pos1_idx = out_id_h * input_w +
-                        in_img_idt * in_img_h * in_img_w * num_channels +
-                        in_img_idy * in_img_w * num_channels +
-                        in_img_idx * num_channels + channel_id;
-      T* in_pos1 = &in[in_pos1_idx];
-      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels;
-      T* in_pos2 = &in[in_pos2_idx];
-
-      const T* out_pos = &out[out_id_h * output_w + out_id_w];
-
-      // trilinear interpolation grad
-      platform::CudaAtomicAdd(&in_pos1[0],
-                              d2lambda * h2lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos1[w_id * num_channels],
-                              d2lambda * h2lambda * w1lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
-                              d2lambda * h1lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(
-          &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels],
-          d2lambda * h1lambda * w1lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos2[0],
-                              d1lambda * h2lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos2[w_id * num_channels],
-                              d1lambda * h2lambda * w1lambda * out_pos[0]);
-      platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
-                              d1lambda * h1lambda * w2lambda * out_pos[0]);
-      platform::CudaAtomicAdd(
-          &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels],
-          d1lambda * h1lambda * w1lambda * out_pos[0]);
-    }
-  }
-}
-
-template <typename T>
-static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
-                                 const Tensor& input, Tensor* output) {
-  auto* input_data = input.data<T>();
-
-  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-  int n, c, in_d, in_h, in_w;
-  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-
-  auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
-  if (list_new_shape_tensor.size() > 0) {
-    // have size tensor
-    auto new_size = get_new_shape(list_new_shape_tensor);
-    out_h = new_size[0];
-    out_w = new_size[1];
-  } else {
-    float scale;
-    auto scale_tensor = ctx.Input<Tensor>("Scale");
-    if (scale_tensor != nullptr) {
-      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
-      scale = scale_data[0];
-    } else {
-      scale = ctx.Attr<float>("scale");
-    }
-    if (scale > 0) {
-      out_h = static_cast<int>(in_h * scale);
-      out_w = static_cast<int>(in_w * scale);
-    }
-    auto out_size = ctx.Input<Tensor>("OutSize");
-    if (out_size != nullptr) {
-      Tensor sizes;
-      framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
-      auto size_data = sizes.data<int>();
-      out_h = size_data[0];
-      out_w = size_data[1];
-    }
-  }
-  PADDLE_ENFORCE_GT(
-      out_h, 0,
-      "out_h in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-  PADDLE_ENFORCE_GT(
-      out_w, 0,
-      "out_w in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-
-  framework::DDim dim_out;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_out = {n, c, out_h, out_w};
-  } else {
-    dim_out = {n, out_h, out_w, c};
-  }
-  auto output_data = output->mutable_data<T>(dim_out, ctx.GetPlace());
-
-  if (in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(input, ctx.GetPlace(), output);
-    return;
-  }
-
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  int in_hw = in_h * in_w;
-  int out_hw = out_h * out_w;
-  int in_chw = c * in_hw;
-  int out_chw = c * out_hw;
-
-  int pixelNum = n * out_chw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-  if ("nearest" == interp_method) {
-    KeNearestNeighborInterpFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-        out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
-  } else if ("bilinear" == interp_method) {
-    KeBilinearInterpFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-        out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
-  }
-}
-
-template <typename T>
-static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
-                                 const Tensor& input, Tensor* output) {
-  auto* input_data = input.data<T>();
-
-  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-  int n, c, in_d, in_h, in_w;
-  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_d = ctx.Attr<int>("out_d");
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-
-  auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
-  if (list_new_shape_tensor.size() > 0) {
-    // have size tensor
-    auto new_size = get_new_shape(list_new_shape_tensor);
-    out_d = new_size[0];
-    out_h = new_size[1];
-    out_w = new_size[2];
-  } else {
-    float scale;
-    auto scale_tensor = ctx.Input<Tensor>("Scale");
-    if (scale_tensor != nullptr) {
-      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
-      scale = scale_data[0];
-    } else {
-      scale = ctx.Attr<float>("scale");
-    }
-    if (scale > 0) {
-      out_d = static_cast<int>(in_d * scale);
-      out_h = static_cast<int>(in_h * scale);
-      out_w = static_cast<int>(in_w * scale);
-    }
-    auto out_size = ctx.Input<Tensor>("OutSize");
-    if (out_size != nullptr) {
-      Tensor sizes;
-      framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
-      auto size_data = sizes.data<int>();
-      out_d = size_data[0];
-      out_h = size_data[1];
-      out_w = size_data[2];
-    }
-  }
-  PADDLE_ENFORCE_GT(
-      out_d, 0,
-      "out_d in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-  PADDLE_ENFORCE_GT(
-      out_h, 0,
-      "out_h in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-  PADDLE_ENFORCE_GT(
-      out_w, 0,
-      "out_w in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-
-  framework::DDim dim_out;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_out = {n, c, out_d, out_h, out_w};
-  } else {
-    dim_out = {n, out_d, out_h, out_w, c};
-  }
-  auto output_data = output->mutable_data<T>(dim_out, ctx.GetPlace());
-
-  if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(input, ctx.GetPlace(), output);
-    return;
-  }
-
-  float ratio_d = 0.f;
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_d > 1) {
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(in_d) / out_d;
-  }
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  int in_dhw = in_d * in_h * in_w;
-  int out_dhw = out_d * out_h * out_w;
-  int in_cdhw = c * in_dhw;
-  int out_cdhw = c * out_dhw;
-
-  int pixelNum = n * out_cdhw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-  if ("trilinear" == interp_method) {
-    KeTrilinearInterpFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
-        out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
-        align_mode, data_layout);
-  }
-}
-
-template <typename T>
-static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
-                                 Tensor* input_grad, const Tensor output_grad) {
-  auto* input = ctx.Input<Tensor>("X");
-  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-  int n, c, in_d, in_h, in_w;
-  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale;
-  auto scale_tensor = ctx.Input<Tensor>("Scale");
-  if (scale_tensor != nullptr) {
-    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
-    scale = scale_data[0];
-  } else {
-    scale = ctx.Attr<float>("scale");
-  }
-  if (scale > 0) {
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
-
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    Tensor sizes;
-    framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
-    auto size_data = sizes.data<int>();
-    out_h = size_data[0];
-    out_w = size_data[1];
-  }
-  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
-  if (list_new_size_tensor.size() > 0) {
-    // have size tensor
-    auto new_size = get_new_shape(list_new_size_tensor);
-    out_h = new_size[0];
-    out_w = new_size[1];
-  }
-
-  auto* output_grad_data = output_grad.data<T>();
-  framework::DDim dim_grad;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_grad = {n, c, in_h, in_w};
-  } else {
-    dim_grad = {n, in_h, in_w, c};
-  }
-  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
-  auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
-  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-  math::SetConstant<platform::CUDADeviceContext, T> zero;
-  zero(device_ctx, input_grad, static_cast<T>(0.0));
-
-  if (in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
-    return;
-  }
-
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  int in_hw = in_h * in_w;
-  int out_hw = out_h * out_w;
-  int in_chw = c * in_hw;
-  int out_chw = c * out_hw;
-
-  int pixelNum = n * out_chw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-  if ("nearest" == interp_method) {
-    KeNearestNeighborInterpBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
-        n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
-  } else if ("bilinear" == interp_method) {
-    KeBilinearInterpBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
-        n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
-        data_layout);
-  }
-}
-
-template <typename T>
-static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
-                                 Tensor* input_grad,
-                                 const Tensor& output_grad) {
-  auto* input = ctx.Input<Tensor>("X");
-  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-  int n, c, in_d, in_h, in_w;
-  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_d = ctx.Attr<int>("out_d");
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale;
-  auto scale_tensor = ctx.Input<Tensor>("Scale");
-  if (scale_tensor != nullptr) {
-    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
-    scale = scale_data[0];
-  } else {
-    scale = ctx.Attr<float>("scale");
-  }
-  if (scale > 0) {
-    out_d = static_cast<int>(in_d * scale);
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
-
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    Tensor sizes;
-    framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
-    auto size_data = sizes.data<int>();
-    out_d = size_data[0];
-    out_h = size_data[1];
-    out_w = size_data[2];
-  }
-  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
-  if (list_new_size_tensor.size() > 0) {
-    // have size tensor
-    auto new_size = get_new_shape(list_new_size_tensor);
-    out_d = new_size[0];
-    out_h = new_size[1];
-    out_w = new_size[2];
-  }
-
-  auto* output_grad_data = output_grad.data<T>();
-  framework::DDim dim_grad;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_grad = {n, c, in_d, in_h, in_w};
-  } else {
-    dim_grad = {n, in_d, in_h, in_w, c};
-  }
-  auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
-  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-  math::SetConstant<platform::CUDADeviceContext, T> zero;
-  zero(device_ctx, input_grad, static_cast<T>(0.0));
-
-  if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
-    return;
-  }
-
-  float ratio_d = 0.f;
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_d > 1) {
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(in_d) / out_d;
-  }
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  int in_dhw = in_d * in_h * in_w;
-  int out_dhw = out_d * out_h * out_w;
-  int in_cdhw = c * in_dhw;
-  int out_cdhw = c * out_dhw;
-
-  int pixelNum = n * out_cdhw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-  if ("trilinear" == interp_method) {
-    KeTrilinearInterpBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
-        out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
-        align_mode, data_layout);
-  }
-}
-
-template <typename T>
-class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-
-    auto input_dims = input->dims();
-    if (input_dims.size() == 4) {  // 2D interpolation
-      Interpolate2DCUDAFwd<T>(ctx, *input, output);
-    } else if (input_dims.size() == 5) {  // 3D interpolation
-      Interpolate3DCUDAFwd<T>(ctx, *input, output);
-    }
-  }
-};
-
-template <typename T>
-class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    auto output_grad_dims = output_grad->dims();
-    if (output_grad_dims.size() == 4) {  // 2D interpolation
-      Interpolate2DCUDABwd<T>(ctx, input_grad, *output_grad);
-    } else if (output_grad_dims.size() == 5) {  // 3D interpolation
-      Interpolate3DCUDABwd<T>(ctx, input_grad, *output_grad);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(bilinear_interp, ops::InterpolateOpCUDAKernel<float>,
-                        ops::InterpolateOpCUDAKernel<double>,
-                        ops::InterpolateOpCUDAKernel<int>);
-REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad,
-                        ops::InterpolateGradOpCUDAKernel<float>,
-                        ops::InterpolateGradOpCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(nearest_interp, ops::InterpolateOpCUDAKernel<float>,
-                        ops::InterpolateOpCUDAKernel<double>,
-                        ops::InterpolateOpCUDAKernel<int>);
-REGISTER_OP_CUDA_KERNEL(nearest_interp_grad,
-                        ops::InterpolateGradOpCUDAKernel<float>,
-                        ops::InterpolateGradOpCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(trilinear_interp, ops::InterpolateOpCUDAKernel<float>,
-                        ops::InterpolateOpCUDAKernel<double>,
-                        ops::InterpolateOpCUDAKernel<int>);
-REGISTER_OP_CUDA_KERNEL(trilinear_interp_grad,
-                        ops::InterpolateGradOpCUDAKernel<float>,
-                        ops::InterpolateGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h
deleted file mode 100644
index b107d1e6656bec8d2b2214a94dc7a44246771bbb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/interpolate_op.h
+++ /dev/null
@@ -1,882 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using Tensor = framework::Tensor;
-using DataLayout = framework::DataLayout;
-
-inline std::vector<int> get_new_shape(
-    const std::vector<const Tensor*>& list_new_shape_tensor) {
-  // get tensor from
-  std::vector<int> vec_new_shape;
-  for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
-    auto tensor = list_new_shape_tensor[i];
-    PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}),
-                      "shape of dim tensor should be [1]");
-    if (platform::is_gpu_place(tensor->place())) {
-      framework::Tensor temp;
-      TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-
-      vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
-    } else {
-      vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
-    }
-  }
-
-  return vec_new_shape;
-}
-
-template <typename T>
-inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
-  std::vector<T> vec_new_data;
-  auto* new_data = new_data_tensor->data<T>();
-  framework::Tensor cpu_starts_tensor;
-  if (platform::is_gpu_place(new_data_tensor->place())) {
-    TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor);
-    new_data = cpu_starts_tensor.data<T>();
-  }
-  vec_new_data = std::vector<T>(new_data, new_data + new_data_tensor->numel());
-  return vec_new_data;
-}
-
-inline void ExtractNCDWH(const framework::DDim& dims,
-                         const DataLayout& data_layout, int* N, int* C, int* D,
-                         int* H, int* W) {
-  *N = dims[0];
-  if (dims.size() == 4) {
-    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[3];
-    *D = 1;
-    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
-    *W = data_layout == DataLayout::kNCHW ? dims[3] : dims[2];
-  } else {
-    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[4];
-    *D = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
-    *H = data_layout == DataLayout::kNCHW ? dims[3] : dims[2];
-    *W = data_layout == DataLayout::kNCHW ? dims[4] : dims[3];
-  }
-}
-
-template <typename T>
-static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
-                                       const float ratio_h, const float ratio_w,
-                                       const int n, const int c,
-                                       const int out_h, const int out_w,
-                                       const bool align_corners,
-                                       const DataLayout& data_layout) {
-  auto input_t = EigenTensor<T, 4>::From(input);
-  auto output_t = EigenTensor<T, 4>::From(*output);
-  for (int k = 0; k < out_h; k++) {  // loop for images
-    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
-                               : static_cast<int>(ratio_h * k);
-
-    for (int l = 0; l < out_w; l++) {
-      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
-                                 : static_cast<int>(ratio_w * l);
-
-      for (int i = 0; i < n; i++) {    // loop for batches
-        for (int j = 0; j < c; j++) {  // loop for channels
-          if (data_layout == DataLayout::kNCHW) {
-            output_t(i, j, k, l) = input_t(i, j, in_k, in_l);
-          } else {
-            output_t(i, k, l, j) = input_t(i, in_k, in_l, j);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void BilinearInterpolation(const Tensor& input, Tensor* output,
-                                  const float ratio_h, const float ratio_w,
-                                  const int in_h, const int in_w, const int n,
-                                  const int c, const int out_h, const int out_w,
-                                  const bool align_corners,
-                                  const bool align_mode,
-                                  const DataLayout data_layout) {
-  auto input_t = EigenTensor<T, 4>::From(input);
-  auto output_t = EigenTensor<T, 4>::From(*output);
-  bool align_flag = (align_mode == 0 && !align_corners);
-
-  std::vector<int> vy_n, vy_s;
-  std::vector<float> vd_n, vd_s;
-  vy_n.reserve(out_h);
-  vy_s.reserve(out_h);
-  vd_n.reserve(out_h);
-  vd_s.reserve(out_h);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int k = 0; k < out_h; k++) {
-    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * k);
-    y_n = (y_n > 0) ? y_n : 0;
-    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
-    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
-    float d_s = 1.f - d_n;
-    {
-      vy_n[k] = y_n;
-      vy_s[k] = y_s;
-      vd_n[k] = d_n;
-      vd_s[k] = d_s;
-    }
-  }
-
-  std::vector<int> vx_w, vx_e;
-  std::vector<float> vd_w, vd_e;
-  vx_w.reserve(out_w);
-  vx_e.reserve(out_w);
-  vd_w.reserve(out_w);
-  vd_e.reserve(out_w);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int l = 0; l < out_w; l++) {
-    int x_w = (align_mode == 0 && !align_corners)
-                  ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                  : static_cast<int>(ratio_w * l);
-    x_w = (x_w > 0) ? x_w : 0;
-    int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
-    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
-    float d_e = 1.f - d_w;
-    {
-      vx_w[l] = x_w;
-      vx_e[l] = x_e;
-      vd_w[l] = d_w;
-      vd_e[l] = d_e;
-    }
-  }
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(4)
-#endif
-  for (int i = 0; i < n; i++) {          // loop for batches
-    for (int j = 0; j < c; j++) {        // loop for channels
-      for (int k = 0; k < out_h; k++) {  // loop for images
-        for (int l = 0; l < out_w; l++) {
-          // bilinear interpolation
-          T out_t;
-          if (data_layout == DataLayout::kNCHW) {
-            out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] +
-                    input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] +
-                    input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] +
-                    input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l];
-            output_t(i, j, k, l) = out_t;
-
-          } else {
-            out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] +
-                    input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] +
-                    input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] +
-                    input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l];
-            output_t(i, k, l, j) = out_t;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void TrilinearInterpolation(
-    const Tensor& input, Tensor* output, const float ratio_d,
-    const float ratio_h, const float ratio_w, const int in_d, const int in_h,
-    const int in_w, const int n, const int c, const int out_d, const int out_h,
-    const int out_w, const bool align_corners, const bool align_mode,
-    const DataLayout& data_layout) {
-  auto input_t = EigenTensor<T, 5>::From(input);
-  auto output_t = EigenTensor<T, 5>::From(*output);
-  bool align_flag = (align_mode == 0 && !align_corners);
-
-  std::vector<int> vt_f, vt_b;
-  std::vector<float> vd_f, vd_b;
-  vt_f.reserve(out_d);
-  vt_b.reserve(out_d);
-  vd_f.reserve(out_d);
-  vd_b.reserve(out_d);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int j = 0; j < out_d; j++) {
-    int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * j);
-    t_f = (t_f > 0) ? t_f : 0;
-    int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1);
-    float idx_src_t = ratio_d * (j + 0.5) - 0.5;
-    idx_src_t = (idx_src_t > 0) ? idx_src_t : 0;
-    float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f;
-    float d_b = 1.f - d_f;
-    {
-      vt_f[j] = t_f;
-      vt_b[j] = t_b;
-      vd_f[j] = d_f;
-      vd_b[j] = d_b;
-    }
-  }
-
-  std::vector<int> vy_n, vy_s;
-  std::vector<float> vd_n, vd_s;
-  vy_n.reserve(out_h);
-  vy_s.reserve(out_h);
-  vd_n.reserve(out_h);
-  vd_s.reserve(out_h);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int k = 0; k < out_h; k++) {
-    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * k);
-    y_n = (y_n > 0) ? y_n : 0;
-    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
-    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
-    float d_s = 1.f - d_n;
-    {
-      vy_n[k] = y_n;
-      vy_s[k] = y_s;
-      vd_n[k] = d_n;
-      vd_s[k] = d_s;
-    }
-  }
-
-  std::vector<int> vx_w, vx_e;
-  std::vector<float> vd_w, vd_e;
-  vx_w.reserve(out_w);
-  vx_e.reserve(out_w);
-  vd_w.reserve(out_w);
-  vd_e.reserve(out_w);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int l = 0; l < out_w; l++) {
-    int x_w = (align_mode == 0 && !align_corners)
-                  ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                  : static_cast<int>(ratio_w * l);
-    x_w = (x_w > 0) ? x_w : 0;
-    int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
-    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
-    float d_e = 1.f - d_w;
-    {
-      vx_w[l] = x_w;
-      vx_e[l] = x_e;
-      vd_w[l] = d_w;
-      vd_e[l] = d_e;
-    }
-  }
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(5)
-#endif
-  for (int b = 0; b < n; b++) {          // loop for batches
-    for (int i = 0; i < c; i++) {        // loop for channels
-      for (int j = 0; j < out_d; j++) {  // loop for D, H, W
-        for (int k = 0; k < out_h; k++) {
-          for (int l = 0; l < out_w; l++) {
-            // trilinear interpolation
-            if (data_layout == DataLayout::kNCHW) {
-              T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] *
-                            vd_s[k] * vd_e[l] +
-                        input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] *
-                            vd_s[k] * vd_w[l] +
-                        input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] *
-                            vd_n[k] * vd_e[l] +
-                        input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] *
-                            vd_n[k] * vd_w[l] +
-                        input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] *
-                            vd_s[k] * vd_e[l] +
-                        input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] *
-                            vd_s[k] * vd_w[l] +
-                        input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] *
-                            vd_n[k] * vd_e[l] +
-                        input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] *
-                            vd_n[k] * vd_w[l];
-              output_t(b, i, j, k, l) = out_t;
-            } else {
-              T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] *
-                            vd_s[k] * vd_e[l] +
-                        input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] *
-                            vd_s[k] * vd_w[l] +
-                        input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] *
-                            vd_n[k] * vd_e[l] +
-                        input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] *
-                            vd_n[k] * vd_w[l] +
-                        input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] *
-                            vd_s[k] * vd_e[l] +
-                        input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] *
-                            vd_s[k] * vd_w[l] +
-                        input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] *
-                            vd_n[k] * vd_e[l] +
-                        input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] *
-                            vd_n[k] * vd_w[l];
-              output_t(b, j, k, l, i) = out_t;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void NearestNeighborInterpolateGrad(
-    const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
-    const float ratio_w, const int n, const int c, const int out_h,
-    const int out_w, const bool align_corners, const DataLayout data_layout) {
-  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
-  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
-
-  for (int k = 0; k < out_h; k++) {  // loop for images
-    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
-                               : static_cast<int>(ratio_h * k);
-
-    for (int l = 0; l < out_w; l++) {
-      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
-                                 : static_cast<int>(ratio_w * l);
-
-      for (int i = 0; i < n; i++) {    // loop for batches
-        for (int j = 0; j < c; j++) {  // loop for channels
-          if (data_layout == DataLayout::kNCHW) {
-            input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l);
-          } else {
-            input_grad_t(i, in_k, in_l, j) += output_grad_t(i, k, l, j);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void BilinearInterpolationGrad(
-    const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
-    const float ratio_w, const int in_h, const int in_w, const int n,
-    const int c, const int out_h, const int out_w, const bool align_corners,
-    const int align_mode, const DataLayout data_layout) {
-  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
-  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
-  bool align_flag = (align_mode == 0 && !align_corners);
-  for (int k = 0; k < out_h; k++) {  // loop for images
-    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * k);
-    y_n = (y_n > 0) ? y_n : 0;
-    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
-    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
-    float d_s = 1.f - d_n;
-
-    for (int l = 0; l < out_w; l++) {
-      int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                           : static_cast<int>(ratio_w * l);
-      x_w = (x_w > 0) ? x_w : 0;
-      int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-      float idx_src_x = ratio_w * (l + 0.5) - 0.5;
-      idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-      float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
-      float d_e = 1.f - d_w;
-
-      for (int i = 0; i < n; i++) {    // loop for batches
-        for (int j = 0; j < c; j++) {  // loop for channels
-          // bilinear interpolation grad
-          if (data_layout == DataLayout::kNCHW) {
-            const T grad = output_grad_t(i, j, k, l);
-            input_grad_t(i, j, y_n, x_w) += static_cast<T>(grad * d_s * d_e);
-            input_grad_t(i, j, y_s, x_w) += static_cast<T>(grad * d_n * d_e);
-            input_grad_t(i, j, y_n, x_e) += static_cast<T>(grad * d_s * d_w);
-            input_grad_t(i, j, y_s, x_e) += static_cast<T>(grad * d_n * d_w);
-          } else {
-            const T grad = output_grad_t(i, k, l, j);
-            input_grad_t(i, y_n, x_w, j) += static_cast<T>(grad * d_s * d_e);
-            input_grad_t(i, y_s, x_w, j) += static_cast<T>(grad * d_n * d_e);
-            input_grad_t(i, y_n, x_e, j) += static_cast<T>(grad * d_s * d_w);
-            input_grad_t(i, y_s, x_e, j) += static_cast<T>(grad * d_n * d_w);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void TrilinearInterpolationGrad(
-    const Tensor& output_grad, Tensor* input_grad, const float ratio_d,
-    const float ratio_h, const float ratio_w, const int in_d, const int in_h,
-    const int in_w, const int n, const int c, const int out_d, const int out_h,
-    const int out_w, const bool align_corners, const int align_mode,
-    const DataLayout data_layout) {
-  auto input_grad_t = EigenTensor<T, 5>::From(*input_grad);
-  auto output_grad_t = EigenTensor<T, 5>::From(output_grad);
-  bool align_flag = (align_mode == 0 && !align_corners);
-  for (int j = 0; j < out_d; j++) {  // loop for D
-    int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * j);
-    t_f = (t_f > 0) ? t_f : 0;
-    int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1);
-    float idx_src_t = ratio_d * (j + 0.5) - 0.5;
-    idx_src_t = (idx_src_t > 0) ? idx_src_t : 0;
-    float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f;
-    float d_b = 1.f - d_f;
-
-    for (int k = 0; k < out_h; k++) {  // loop for H
-      int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                           : static_cast<int>(ratio_h * k);
-      y_n = (y_n > 0) ? y_n : 0;
-      int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-      float idx_src_y = ratio_h * (k + 0.5) - 0.5;
-      idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-      float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
-      float d_s = 1.f - d_n;
-
-      for (int l = 0; l < out_w; l++) {  // loop for W
-        int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                             : static_cast<int>(ratio_w * l);
-        x_w = (x_w > 0) ? x_w : 0;
-        int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-        float idx_src_x = ratio_w * (l + 0.5) - 0.5;
-        idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-        float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
-        float d_e = 1.f - d_w;
-
-        for (int b = 0; b < n; b++) {    // loop for batches
-          for (int i = 0; i < c; i++) {  // loop for channels
-            // trilinear interpolation grad
-            if (data_layout == DataLayout::kNCHW) {
-              const T grad = output_grad_t(b, i, j, k, l);
-              input_grad_t(b, i, t_f, y_n, x_w) +=
-                  static_cast<T>(grad * d_b * d_s * d_e);
-              input_grad_t(b, i, t_f, y_n, x_e) +=
-                  static_cast<T>(grad * d_b * d_s * d_w);
-              input_grad_t(b, i, t_f, y_s, x_w) +=
-                  static_cast<T>(grad * d_b * d_n * d_e);
-              input_grad_t(b, i, t_f, y_s, x_e) +=
-                  static_cast<T>(grad * d_b * d_n * d_w);
-              input_grad_t(b, i, t_b, y_n, x_w) +=
-                  static_cast<T>(grad * d_f * d_s * d_e);
-              input_grad_t(b, i, t_b, y_n, x_e) +=
-                  static_cast<T>(grad * d_f * d_s * d_w);
-              input_grad_t(b, i, t_b, y_s, x_w) +=
-                  static_cast<T>(grad * d_f * d_n * d_e);
-              input_grad_t(b, i, t_b, y_s, x_e) +=
-                  static_cast<T>(grad * d_f * d_n * d_w);
-            } else {
-              const T grad = output_grad_t(b, j, k, l, i);
-              input_grad_t(b, t_f, y_n, x_w, i) +=
-                  static_cast<T>(grad * d_b * d_s * d_e);
-              input_grad_t(b, t_f, y_n, x_e, i) +=
-                  static_cast<T>(grad * d_b * d_s * d_w);
-              input_grad_t(b, t_f, y_s, x_w, i) +=
-                  static_cast<T>(grad * d_b * d_n * d_e);
-              input_grad_t(b, t_f, y_s, x_e, i) +=
-                  static_cast<T>(grad * d_b * d_n * d_w);
-              input_grad_t(b, t_b, y_n, x_w, i) +=
-                  static_cast<T>(grad * d_f * d_s * d_e);
-              input_grad_t(b, t_b, y_n, x_e, i) +=
-                  static_cast<T>(grad * d_f * d_s * d_w);
-              input_grad_t(b, t_b, y_s, x_w, i) +=
-                  static_cast<T>(grad * d_f * d_n * d_e);
-              input_grad_t(b, t_b, y_s, x_e, i) +=
-                  static_cast<T>(grad * d_f * d_n * d_w);
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx,
-                                const Tensor& input, Tensor* output) {
-  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-  int n, c, in_d, in_h, in_w;
-  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-
-  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
-  if (list_new_size_tensor.size() > 0) {
-    // have size tensor
-    auto new_size = get_new_shape(list_new_size_tensor);
-    out_h = new_size[0];
-    out_w = new_size[1];
-  } else {
-    float scale;
-    auto scale_tensor = ctx.Input<Tensor>("Scale");
-    if (scale_tensor != nullptr) {
-      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
-      scale = scale_data[0];
-    } else {
-      scale = ctx.Attr<float>("scale");
-    }
-    if (scale > 0) {
-      out_h = static_cast<int>(in_h * scale);
-      out_w = static_cast<int>(in_w * scale);
-    }
-    auto out_size = ctx.Input<Tensor>("OutSize");
-    if (out_size != nullptr) {
-      auto out_size_data = get_new_data_from_tensor<int>(out_size);
-      out_h = out_size_data[0];
-      out_w = out_size_data[1];
-    }
-  }
-  PADDLE_ENFORCE_GT(
-      out_h, 0,
-      "out_h in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-  PADDLE_ENFORCE_GT(
-      out_w, 0,
-      "out_w in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-  framework::DDim dim_out;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_out = {n, c, out_h, out_w};
-  } else {
-    dim_out = {n, out_h, out_w, c};
-  }
-  output->mutable_data<T>(dim_out, ctx.GetPlace());
-
-  if (in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(input, ctx.GetPlace(), output);
-    return;
-  }
-
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  if ("bilinear" == interp_method) {
-    BilinearInterpolation<T>(input, output, ratio_h, ratio_w, in_h, in_w, n, c,
-                             out_h, out_w, align_corners, align_mode,
-                             data_layout);
-  } else if ("nearest" == interp_method) {
-    NearestNeighborInterpolate<T>(input, output, ratio_h, ratio_w, n, c, out_h,
-                                  out_w, align_corners, data_layout);
-  }
-}
-
-template <typename T>
-static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx,
-                                const Tensor& input, Tensor* output) {
-  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-  int n, c, in_d, in_h, in_w;
-  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_d = ctx.Attr<int>("out_d");
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-
-  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
-  if (list_new_size_tensor.size() > 0) {
-    // have size tensor
-    auto new_size = get_new_shape(list_new_size_tensor);
-    out_d = new_size[0];
-    out_h = new_size[1];
-    out_w = new_size[2];
-  } else {
-    float scale;
-    auto scale_tensor = ctx.Input<Tensor>("Scale");
-    if (scale_tensor != nullptr) {
-      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
-      scale = scale_data[0];
-    } else {
-      scale = ctx.Attr<float>("scale");
-    }
-    if (scale > 0) {
-      out_d = static_cast<int>(in_d * scale);
-      out_h = static_cast<int>(in_h * scale);
-      out_w = static_cast<int>(in_w * scale);
-    }
-    auto out_size = ctx.Input<Tensor>("OutSize");
-    if (out_size != nullptr) {
-      auto out_size_data = get_new_data_from_tensor<int>(out_size);
-      out_d = out_size_data[0];
-      out_h = out_size_data[1];
-      out_w = out_size_data[2];
-    }
-  }
-  PADDLE_ENFORCE_GT(
-      out_d, 0,
-      "out_d in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-  PADDLE_ENFORCE_GT(
-      out_h, 0,
-      "out_h in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-  PADDLE_ENFORCE_GT(
-      out_w, 0,
-      "out_w in Attr(out_shape) of Op(interpolate) should be greater than 0.");
-
-  framework::DDim dim_out;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_out = {n, c, out_d, out_h, out_w};
-  } else {
-    dim_out = {n, out_d, out_h, out_w, c};
-  }
-
-  output->mutable_data<T>(dim_out, ctx.GetPlace());
-
-  if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(input, ctx.GetPlace(), output);
-    return;
-  }
-
-  float ratio_d = 0.f;
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_d > 1) {
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(in_d) / out_d;
-  }
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  if ("trilinear" == interp_method) {
-    TrilinearInterpolation<T>(input, output, ratio_d, ratio_h, ratio_w, in_d,
-                              in_h, in_w, n, c, out_d, out_h, out_w,
-                              align_corners, align_mode, data_layout);
-  }
-}
-
-template <typename T>
-static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx,
-                                Tensor* input_grad, const Tensor& output_grad) {
-  auto* input = ctx.Input<Tensor>("X");
-  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-  int n, c, in_d, in_h, in_w;
-  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale;
-  auto scale_tensor = ctx.Input<Tensor>("Scale");
-  if (scale_tensor != nullptr) {
-    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
-    scale = scale_data[0];
-  } else {
-    scale = ctx.Attr<float>("scale");
-  }
-  if (scale > 0) {
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    auto out_size_data = get_new_data_from_tensor<int>(out_size);
-    out_h = out_size_data[0];
-    out_w = out_size_data[1];
-  }
-  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
-  if (list_new_size_tensor.size() > 0) {
-    // have size tensor
-    auto new_size = get_new_shape(list_new_size_tensor);
-    out_h = new_size[0];
-    out_w = new_size[1];
-  }
-
-  framework::DDim dim_grad;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_grad = {n, c, in_h, in_w};
-  } else {
-    dim_grad = {n, in_h, in_w, c};
-  }
-  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
-
-  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-  math::SetConstant<platform::CPUDeviceContext, T> zero;
-  zero(device_ctx, input_grad, static_cast<T>(0.0));
-
-  if (in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
-    return;
-  }
-
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  if ("bilinear" == interp_method) {
-    BilinearInterpolationGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
-                                 in_h, in_w, n, c, out_h, out_w, align_corners,
-                                 align_mode, data_layout);
-  } else if ("nearest" == interp_method) {
-    NearestNeighborInterpolateGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
-                                      n, c, out_h, out_w, align_corners,
-                                      data_layout);
-  }
-}
-
-template <typename T>
-static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx,
-                                Tensor* input_grad, const Tensor output_grad) {
-  auto* input = ctx.Input<Tensor>("X");
-  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-  int n, c, in_d, in_h, in_w;
-  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_d = ctx.Attr<int>("out_d");
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale;
-  auto scale_tensor = ctx.Input<Tensor>("Scale");
-  if (scale_tensor != nullptr) {
-    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
-    scale = scale_data[0];
-  } else {
-    scale = ctx.Attr<float>("scale");
-  }
-  if (scale > 0) {
-    out_d = static_cast<int>(in_d * scale);
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    auto out_size_data = get_new_data_from_tensor<int>(out_size);
-    out_d = out_size_data[0];
-    out_h = out_size_data[1];
-    out_w = out_size_data[2];
-  }
-  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
-  if (list_new_size_tensor.size() > 0) {
-    // have size tensor
-    auto new_size = get_new_shape(list_new_size_tensor);
-    out_d = new_size[0];
-    out_h = new_size[1];
-    out_w = new_size[2];
-  }
-
-  framework::DDim dim_grad;
-  if (data_layout == DataLayout::kNCHW) {
-    dim_grad = {n, c, in_d, in_h, in_w};
-  } else {
-    dim_grad = {n, in_d, in_h, in_w, c};
-  }
-  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
-  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-  math::SetConstant<platform::CPUDeviceContext, T> zero;
-  zero(device_ctx, input_grad, static_cast<T>(0.0));
-
-  if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
-    return;
-  }
-
-  float ratio_d = 0.f;
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_d > 1) {
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(in_d) / out_d;
-  }
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  if ("trilinear" == interp_method) {
-    TrilinearInterpolationGrad<T>(
-        output_grad, input_grad, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n,
-        c, out_d, out_h, out_w, align_corners, align_mode, data_layout);
-  }
-}
-
-template <typename T>
-class InterpolateKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-
-    auto input_dims = input->dims();
-    if (input_dims.size() == 4) {  // 2D interpolation
-      Interpolate2DCPUFwd<T>(ctx, *input, output);
-    } else if (input_dims.size() == 5) {  // 3D interpolation
-      Interpolate3DCPUFwd<T>(ctx, *input, output);
-    }
-  }
-};
-
-template <typename T>
-class InterpolateGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    auto output_grad_dims = output_grad->dims();
-    if (output_grad_dims.size() == 4) {  // 2D interpolation grad
-      Interpolate2DCPUBwd<T>(ctx, input_grad, *output_grad);
-    } else if (output_grad_dims.size() == 5) {  // 3D interpolation grad
-      Interpolate3DCPUBwd<T>(ctx, input_grad, *output_grad);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc
deleted file mode 100644
index 092a6eae6f5b7edcc5656522377de10a08a01ea8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/is_empty_op.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/is_empty_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-class IsEmptyOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of IsEmptyOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of IsEmptyOp should not be null.");
-    ctx->SetOutputDim("Out", {1});
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<framework::LoDTensor>("X");
-    return framework::OpKernelType(x->type(), x->place());
-  }
-};
-
-class IsEmptyOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(LoDTensor) Tensor which is to be checked.");
-    AddOutput("Out",
-              "(LoDTensor) a boolean Tensor that indicate empty or not.");
-    AddComment(R"DOC(
-IsEmpty Operator which checks whether a tensor is empty.
-
-It will just return product(tensor.ddims()) > 0;
-              )DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    is_empty, ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/is_empty_op.cu.cc b/paddle/fluid/operators/is_empty_op.cu.cc
deleted file mode 100644
index 3c256503baf6ba3bc8f8dff866a2ce9c57ec5bf1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/is_empty_op.cu.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/is_empty_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    is_empty, ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/is_empty_op.h b/paddle/fluid/operators/is_empty_op.h
deleted file mode 100644
index 4f6419eb577709836275481cf617c07ea6c7f4c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/is_empty_op.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class IsEmptyOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // get input
-    auto* input_tensor = context.Input<framework::LoDTensor>("X");
-    // get output
-    auto* output_tensor = context.Output<framework::LoDTensor>("Out");
-
-    // Note: is_empty is always executed on CPU and the output data should
-    // always be allocated for CPUPlace. We reigister CUDA kernel for this op to
-    // avoid the unnecessary data transform.
-    output_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
-        framework::product(input_tensor->dims()) == 0;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc
deleted file mode 100644
index 1312eecfa4466755b0e6ff3cf153a807276bfd1f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/isfinite_op.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/isfinite_op.h"
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class OverflowOp : public framework::OperatorWithKernel {
- public:
-  OverflowOp(const std::string &type, const framework::VariableNameMap &inputs,
-             const framework::VariableNameMap &outputs,
-             const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of OverflowOp should not be null.");
-
-    ctx->SetOutputDim("Out", {1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    int dtype = -1;
-    auto *x_var = ctx.InputVar("X");
-    if (x_var->IsType<framework::LoDTensor>()) {
-      dtype = x_var->Get<framework::LoDTensor>().type();
-    } else if (x_var->IsType<framework::SelectedRows>()) {
-      dtype = x_var->Get<framework::SelectedRows>().value().type();
-    } else {
-      PADDLE_THROW("Cannot find the input data type by all input data");
-    }
-    return framework::OpKernelType(framework::proto::VarType::Type(dtype),
-                                   ctx.GetPlace());
-  }
-};
-
-class OverflowOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) The input tensors of overflow operator.");
-    AddOutput("Out",
-              "(Tensor) 1-dim tensor, contains a bool scalar. The output "
-              "tensor of overflow operator.");
-    AddComment(string::Sprintf(R"DOC(
-Overflow %s operator.
-
-$$Out = any(X)$$
-
-If any X contains Inf or Nan, the Out will generate a indicator.
-Out = Inf if any X contains Inf,
-Out = Nan if any X contains Nan,
-Out = 0 if no Inf/Nan detected.
-If X contains both Inf/Nan, it will return the first indicator it meeted.
-
-%s
-)DOC",
-                               GetName(), GetComments()));
-  }
-
- protected:
-  virtual std::string GetName() const = 0;
-  virtual std::string GetComments() const = 0;
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-#define REGISTER_OP_MAKER(op_type, comment)             \
-  namespace paddle {                                    \
-  namespace operators {                                 \
-  class _##op_type##OverflowOpMaker                     \
-      : public ::paddle::operators::OverflowOpMaker {   \
-   protected:                                           \
-    std::string GetName() const { return #op_type; }    \
-    std::string GetComments() const { return comment; } \
-  };                                                    \
-  }                                                     \
-  }                                                     \
-  REGISTER_OPERATOR(op_type, ops::OverflowOp,           \
-                    ops::_##op_type##OverflowOpMaker,   \
-                    paddle::framework::EmptyGradOpMaker)
-
-#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor)                      \
-  REGISTER_OP_CPU_KERNEL(                                                   \
-      op_type, ops::OverflowKernel<paddle::platform::CPUDeviceContext, int, \
-                                   ops::functor>,                           \
-      ops::OverflowKernel<paddle::platform::CPUDeviceContext, float,        \
-                          ops::functor>,                                    \
-      ops::OverflowKernel<paddle::platform::CPUDeviceContext, double,       \
-                          ops::functor>);
-
-REGISTER_OP_MAKER(isinf, "isinf(X)");
-REGISTER_OP_MAKER(isnan, "isnan(X)");
-REGISTER_OP_MAKER(isfinite, "isfinite(X)");
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_OVERFLOW_CPU_KERNEL);
diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu
deleted file mode 100644
index 995969cd42f08c7fa948262e42793106e745b3a7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/isfinite_op.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/isfinite_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-#define REGISTER_OVERFLOW_CUDA_KERNEL(op_type, functor)                       \
-  REGISTER_OP_CUDA_KERNEL(                                                    \
-      op_type, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,  \
-                                   ops::functor>,                             \
-      ops::OverflowKernel<paddle::platform::CUDADeviceContext, float,         \
-                          ops::functor>,                                      \
-      ops::OverflowKernel<paddle::platform::CUDADeviceContext, double,        \
-                          ops::functor>,                                      \
-      ops::OverflowKernel<paddle::platform::CUDADeviceContext, plat::float16, \
-                          ops::functor>);
-
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_OVERFLOW_CUDA_KERNEL);
diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h
deleted file mode 100644
index 83b080856366ac3332c5856a19b721893bb80eb3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/isfinite_op.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-struct InfinityFunctor {
-  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
-    framework::TensorContainsInf(tensor, out);
-  }
-};
-
-struct NANFunctor {
-  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
-    framework::TensorContainsNAN(tensor, out);
-  }
-};
-
-struct IsfiniteFunctor {
-  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
-    framework::TensorIsfinite(tensor, out);
-  }
-};
-
-template <typename DeviceContext, typename T, typename Functor>
-class OverflowKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto* x = ctx.InputVar("X");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    Functor functor;
-    if (x->IsType<framework::LoDTensor>()) {
-      auto* in = ctx.Input<framework::Tensor>("X");
-      functor(*in, out);
-    } else if (x->IsType<framework::SelectedRows>()) {
-      auto& in = ctx.Input<framework::SelectedRows>("X")->value();
-      functor(in, out);
-    } else {
-      PADDLE_THROW("Unsupported input type.");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#define FOR_EACH_KERNEL_FUNCTOR(__macro) \
-  __macro(isinf, InfinityFunctor);       \
-  __macro(isnan, NANFunctor);            \
-  __macro(isfinite, IsfiniteFunctor);
diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt
deleted file mode 100644
index 47d6c83f2adf8c4b7476410ce7c1d435633a8bfe..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-
-set(jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h)
-file(WRITE ${jit_file} "// Generated by the paddle/fluid/operators/jit/CMakeLists.txt.  DO NOT EDIT!\n\n")
-file(APPEND ${jit_file} "\#pragma once\n")
-file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
-file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
-
-set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
-
-file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
-cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
-
-# refer must go first
-add_subdirectory(refer)
-add_subdirectory(more)
-if(WITH_XBYAK)
-    add_subdirectory(gen)
-endif()
-
-cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
-cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper)
-if(NOT WIN32)
-    cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor)
-endif()
diff --git a/paddle/fluid/operators/jit/README.en.md b/paddle/fluid/operators/jit/README.en.md
deleted file mode 100644
index 7d4dc6d47a512ee7ed75d99800968a38de98f090..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/README.en.md
+++ /dev/null
@@ -1,103 +0,0 @@
-# JIT Kernel
-
-JIT(Just In Time) Kernel contains actually generated code and some other implemenations with the same logic.
-Each implementation has its own condition to use, defined in `CanBeUsed`.
-They are combined together to get the best performance of one single independent function.
-They could be some very simple functions like vector multiply, or some complicated functions like LSTM.
-And they can be composed with some other exited jit kernels to build up a complex function. 
-Currently it's only supported on CPU yet.
-
-## Contents
-
-```txt
-PaddlePaddle/Paddle/paddle/fluid/
-├── ...
-└── operators/
-    ├── .../
-    └── jit/
-        ├── ...
-        ├── gen/
-        │   └── ...
-        |── more/
-        │   ├── ...
-        │   ├── mkl/
-        │   │   └── ...
-        │   ├── mkldnn/
-        │   │   └── ...
-        │   ├── mix/
-        │   │   └── ...
-        │   ├── intrinsic/
-        │   │   └── ...
-        │   └── openblas/
-        │       └── ...
-        └── refer/
-            └── ...
-```
-
-All basical definations of jit kernels are addressed in `paddle/fluid/operators/jit` including these three key folders `refer`, `gen`, `more`. There is only one unique name for each kernel while may have seraval implementations with same functionality.
-
-- `refer`: Each kernel must have one reference implementation on CPU, and it should only focus on the correctness and should not depends on any third-party libraries.
-- `gen`: The code generated should be kept here. They should be designed focusing on the best performance, which depends on Xbyak.
-- `more`: All other implementations should be kept in this folder with one directory corresponding to one library kind or method kind, such as mkl, mkldnn, openblas or intrinsic code. Each implementation should have it advantage. 
-
-## How to use
-
-We present these methods to get the functions:
-- `GetAllCandidateFuncs`. It can return all the implementations supported. All of the implementations can get the same result. You can do some runtime benchmark to choose which should actually be used.
-- `GetDefaultBestFunc`. It only return one default function pointer, which is tuning offline with some genenal configures and attributes. This should cover most situations.
-- `KernelFuncs::Cache()`. It can get the default functions and save it for next time with the same attribute. 
-- `GetReferFunc`. It can only get the reference code in CPU, and all the others implementations have same logic with this reference code.
-
-And here are some examples:
-
-Get from cache:
-
-```cpp
-    using T = float;
-    jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
-    auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr);
-    seqpool_func(src_data, dst_data, &attr);
-```
-
-Get all implementations and run once:
-
-```cpp
-    using T = float;
-    jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
-    auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr);
-    for (auto f : funcs) {
-        LOG(INFO) << "Kernel implementation type: " << f.first;
-        f.second(src_data, dst_data, &attr);
-    }
-```
-
-All kernels are inlcuded in `paddle/fluid/operators/jit/kernels.h`, which is automatically generated in compile time, you can only include this one header to get all the registered kernels.
-
-## Solid Test
-
-- Unit Test
-    All functions should be compared with the corresponding reference functions, including data tyep `float` and `double`.
-- Benchmark
-    All functions should be tested, and make sure the `jit::GetDefaultBestFunc` function obtain the best performance with all attributes.
-
-# How to add new kernel
-
-## Required
-
-1. Add `your_key` at `KernelType`.
-2. Add your new `KernelTuple` which must include `your_key`. It should be a combination of the data type, attribute type and function type. You can refer `SeqPoolTuple`.
-3. Add reference function of `your_key`. 
-Note:
-    - this should be run on CPU and do not depend on any third-party.
-    - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used.
-4. Add unit test in `test.cc`, and verfiy at least `float` and `double`.
-Test more data type for some special functions if necessary, for example `int8`.
-5. Add functions in `benchmark.cc` to test all function of same `KernelType`. Make sure `GetDefaultBestFunc` always get the best one.
-
-## Optional
-
-Add more implementations of `your_kery` for performance enhancement.
-
-1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have correpsonding creator from `JitCodeCreator` which will be registered on the `your_key`.
-2. If new attribute type is added, you should specialize `JitCodeKey` of this type.
-3. Add more functions in `more`，you can use any third party you wish, like mkl, mkldnn or intrinsic code to reach the best performance.
diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md
deleted file mode 100644
index 770548c5260f73f038f52e0b06b77ba698851997..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/README.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# JIT Kernel
-
-结合函数模板和JIT生成需要的kernel函数。
-这里的kernel是比Operator中kernel更小级别的算子单元，更侧重的是在不同硬件上的性能。可以有多重第三方库的实现，每种实现有自己的`CanBeUsed`函数负责什么条件下可以被调用。
-这里实现的函数可以非常细粒度的函数方法，比如Vector MUL， 也可以是一个复杂的逻辑比如LSTM等。复杂的逻辑也可以由自己的底层函数拼接而成。
-目前仅支持CPU上的高性能计算。
-
-## 目录结构
-
-```txt
-PaddlePaddle/Paddle/paddle/fluid/
-├── ...
-└── operators/
-    ├── .../
-    └── jit/
-        ├── ...
-        ├── gen/
-        │   └── ...
-        |── more/
-        │   ├── ...
-        │   ├── mkl/
-        │   │   └── ...
-        │   ├── mkldnn/
-        │   │   └── ...
-        │   ├── mix/
-        │   │   └── ...
-        │   ├── intrinsic/
-        │   │   └── ...
-        │   └── openblas/
-        │       └── ...
-        └── refer/
-            └── ...
-```
-
-基本类的定义都放在根目录下，根目录下包括gen,more和refer三个目录。每个目录下都是一种或者多种实现，每种kernel算子都需要有reference的实现，用作单元测试的基准，其他的实现都是可选的。
-- gen: 代表使用jit生成的code，需要依赖xbyak库。该实现最关心的就是性能。
-- refer: 代表reference的实现，每种kernel算子都需要有在CPU上的reference的实现，他主要关心的算法逻辑的正确性。
-- more: 下面可以放入跟多实现，可以包括mkl，mkldnn，intrinsic，openblas等，也可以是自身已有的kernel组合。
-
-## 动态获取
-
-- 提供`GetAllCandidateFuncs`方法，根据输入的kernel类别，获取满足要求的所有函数实现。所有实现保证结果一致，但是速度不一致，可以根据具体输入属性大小，动态测试得到当前最优实现，手动选择最优函数。
-- 提供`GetDefaultBestFunc`方法，返回一个默认最优的函数实现。该函数是根据一些通用配置离线tuning之后的结果，能覆盖大多数情况下最优结果。
-- 提供`KernelFuncs::Cache()`方法，该方法会返回默认最优的函数，同时会缓存该函数指针，如果出现属性一致的情况，直接返回上次的函数指针，如果不存在则根据属性新建。
-- 提供`GetReferFunc` 方法，返回该kernel最原始的逻辑函数。该方法与kernel的输入大小和属性没有任何关系，有且并只有一个在CPU上的实现。该方法表征了kernel的原始逻辑，其他所有实现的逻辑与它保持一致。
-
-### 例子
-
-所有kernel的调用只需要在头文件中包含`"paddle/fluid/operators/jit/kernels.h"`， 该文件是编译时自动生成的。
-
-直接从缓存中获取默认最优的函数。
-
-```cpp
-    using T = float;
-    jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
-    auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr);
-    seqpool_func(src_data, dst_data, &attr);
-```
-
-跑一遍所有实现，并输出实现类别。
-
-```cpp
-    using T = float;
-    jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
-    auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr);
-    for (auto f : funcs) {
-        LOG(INFO) << "Kernel implementation type: " << f.first;
-        f.second(src_data, dst_data, &attr);
-    }
-```
-
-## 测试
-
-- 逻辑测试
-    所有实现都要与refer的code对比，需要满足精度要求， 包括float和double的数据类型
-- 性能测试
-    所有实现的性能对比，并且与最终的`jit::GetDefaultBestFunc`方法对比，该方法拿到的性能需要在各种条件下都是最好的。
-
-# 如何添加新的算子
-
-1. 在`KernelType` 中添加 `your_key` 。
-2. 实现Reference 的逻辑，这个是必须是在CPU上的实现，并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER(your_key)`来使用该kernel。
-3. (optional) 实现更多的算法在`more`目录下，可以依赖mkl，intrinsic或者mkldnn等第三方库。
-4. (optional) 实现基于Xbyak的生成code，在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`，并注册在与refer相同的`KernelType`上。
-5. 添加新的`KernelTuple`，需要与`KernelType`一一对应，是所有类型的一个打包，包括数据类型，属性的类型，以及返回的函数类型。可以参考`SeqPoolTuple`，新加的Attr类型需要特例化`JitCodeKey`方法。
-6. 在`test.cc`中添加unit test，至少需要测试`float`和`double`两种数据类型，如有必要需要支持额外的数据类型，比如`int8`的相关函数。
-7. 在`benchmark.cc`中添加相应的性能对比，同一种kernel需要对比所有实现，并且确保`GetDefaultBestFunc`得到的实现一直是速度最快的。
-
-# 优点
-- 接口方便，灵活调用。
-- 同一套逻辑可以有多套实现，可以依赖多套第三方库，互不影响。
-- 目录结构清晰，不会在某个文件中有多个宏定义，导致的可读性差问题。
-- 优化方便，可以直接针对某种属性针对性优化，并不影响其他属性下的性能。
-- 可以支持多种平台，包括Linux，Mac 和 Windows，至少可以保证每种平台都可以正常work。后期也可以针对不同平台有针对的优化。框架层面可以使用统一接口，不必关心底层实现。
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
deleted file mode 100644
index 9ff1fe478d7f292e9b956c49920b016318db1c38..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ /dev/null
@@ -1,568 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include <iostream>
-#include <random>
-#include <string>
-#include <vector>
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/platform/device_tracer.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/platform/variant.h"  // for UNUSED
-
-DEFINE_int32(burning, 10, "Burning times.");
-DEFINE_int32(repeat, 3000, "Repeat times.");
-DEFINE_int32(max_size, 1000, "The Max size would be tested.");
-DEFINE_string(filter, "", "The Benchmark name would be run.");
-
-class BenchJITKernel {
- public:
-  BenchJITKernel() = default;
-  virtual ~BenchJITKernel() = default;
-  virtual void Run() = 0;
-  virtual const char* Name() = 0;
-  virtual const char* Dtype() = 0;
-  virtual const char* Place() = 0;
-};
-
-static std::vector<BenchJITKernel*> g_all_benchmarks;
-
-BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
-  g_all_benchmarks.push_back(b);
-  return b;
-}
-
-#define BENCH_JITKERNEL(name, dtype, place)                                    \
-  class BenchJITKernel_##name##_##dtype##_##place##_ : public BenchJITKernel { \
-   public:                                                                     \
-    const char* Name() override { return #name; }                              \
-    const char* Dtype() override { return #dtype; }                            \
-    const char* Place() override { return #place; }                            \
-    void Run() override;                                                       \
-  };                                                                           \
-  static auto inserted_##name##_##dtype##_##place##_ UNUSED =                  \
-      InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_());     \
-  void BenchJITKernel_##name##_##dtype##_##place##_::Run()
-
-void RUN_ALL_BENCHMARK() {
-  for (auto p : g_all_benchmarks) {
-    if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) {
-      continue;
-    }
-    LOG(INFO) << "Benchmark " << p->Name() << "." << p->Dtype() << "."
-              << p->Place();
-    p->Run();
-  }
-}
-
-template <typename T>
-void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
-               const T upper = static_cast<T>(20.f), unsigned int seed = 100) {
-  std::mt19937 rng(seed);
-  std::uniform_real_distribution<double> uniform_dist(0, 1);
-  for (int i = 0; i < n; ++i) {
-    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-  }
-}
-
-std::vector<int> TestSizes() {
-  std::vector<int> s;
-  for (int i = 1; i <= FLAGS_max_size; ++i) {
-    s.push_back(i);
-  }
-  return s;
-}
-
-template <typename KernelTuple, typename... Args>
-struct BenchFunc {
-  // return this function avg time
-  // TODO(TJ): clear cache every time
-  double operator()(const typename KernelTuple::func_type tgt, Args... args) {
-    for (int i = 0; i < FLAGS_burning; ++i) {
-      tgt(args...);
-    }
-    auto start = paddle::platform::PosixInNsec() * 1e-3;
-    for (int i = 0; i < FLAGS_repeat; ++i) {
-      tgt(args...);
-    }
-    auto end = paddle::platform::PosixInNsec() * 1e-3;
-    return static_cast<double>(end - start) / FLAGS_repeat;
-  }
-};
-
-namespace jit = paddle::operators::jit;
-
-template <typename KernelTuple, typename PlaceType, typename... Args>
-void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
-  BenchFunc<KernelTuple, Args...> benchmark;
-  std::vector<std::pair<std::string, double>> infos;
-  auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
-  for (auto f : funcs) {
-    infos.push_back(std::make_pair(f.first, benchmark(f.second, args...)));
-  }
-
-  // Test result from Get function
-  auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(attr);
-  if (!tgt) {
-    LOG(FATAL) << "Target can not be empty!";
-  }
-  infos.push_back(std::make_pair("Target", benchmark(tgt, args...)));
-
-  // print
-  std::ostringstream loginfos;
-  loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": "
-           << attr << ": ";
-  for (auto pair : infos) {
-    loginfos << pair.first << " takes " << pair.second << " us; ";
-  }
-  LOG(INFO) << loginfos.str();
-}
-
-using Tensor = paddle::framework::Tensor;
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelXYZN() {
-  using T = typename KernelTuple::data_type;
-  for (int d : TestSizes()) {
-    Tensor x, y, z;
-    x.Resize({d});
-    y.Resize({d});
-    z.Resize({d});
-    T* x_data = x.mutable_data<T>(PlaceType());
-    T* y_data = y.mutable_data<T>(PlaceType());
-    T* z_data = z.mutable_data<T>(PlaceType());
-    RandomVec<T>(d, x_data);
-    RandomVec<T>(d, y_data);
-    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y.data<T>(), z_data,
-                                          d);
-    // test inplace
-    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), z_data, z_data, d);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelAXYN() {
-  using T = typename KernelTuple::data_type;
-  for (int d : TestSizes()) {
-    const T a = static_cast<T>(3);
-    Tensor x, y;
-    x.Resize({d});
-    y.Resize({d});
-    T* x_data = x.mutable_data<T>(PlaceType());
-    T* y_data = y.mutable_data<T>(PlaceType());
-    RandomVec<T>(d, x_data);
-    BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), y_data, d);
-    // test inplace
-    BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), x_data, d);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelXRN() {
-  using T = typename KernelTuple::data_type;
-  for (int d : TestSizes()) {
-    Tensor x;
-    RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType()));
-    T res;
-    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), &res, d);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelXYN() {
-  using T = typename KernelTuple::data_type;
-  for (int d : TestSizes()) {
-    Tensor x, y;
-    x.Resize({d});
-    y.Resize({d});
-    T* x_data = x.mutable_data<T>(PlaceType());
-    T* y_data = y.mutable_data<T>(PlaceType());
-    RandomVec<T>(d, x_data);
-    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y_data, d);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelLSTM() {
-  using T = typename KernelTuple::data_type;
-  for (bool use_peephole : {true, false}) {
-    for (int d : TestSizes()) {
-      const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
-                                  use_peephole);
-      Tensor x, ct_1, ct, ht, wp, checked;
-      x.Resize({4 * d});
-      ct_1.Resize({d});
-      ct.Resize({d});
-      ht.Resize({d});
-      wp.Resize({3 * d});
-      checked.Resize({2 * d});
-      auto place = PlaceType();
-      RandomVec<T>(x.numel(), x.mutable_data<T>(place), -2.f, 2.f);
-      RandomVec<T>(wp.numel(), wp.mutable_data<T>(place), -2.f, 2.f);
-      RandomVec<T>(ct_1.numel(), ct_1.mutable_data<T>(place), -2.f, 2.f);
-      const T* ct_1_data = ct_1.data<T>();
-      const T* wp_data = wp.data<T>();
-      T* x_data = x.mutable_data<T>(place);
-      T* checked_data = checked.mutable_data<T>(place);
-      T* ct_data = ct.mutable_data<T>(place);
-      T* ht_data = ht.mutable_data<T>(place);
-      jit::lstm_t step;
-      step.gates = x_data;
-      step.ct_1 = ct_1_data;
-      step.ct = ct_data;
-      step.ht = ht_data;
-      if (use_peephole) {
-        step.wp = wp_data;
-        step.checked = checked_data;
-      }
-      BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelGRU() {
-  using T = typename KernelTuple::data_type;
-  for (int d : TestSizes()) {
-    const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
-    auto place = PlaceType();
-    Tensor x, ht_1, ht;
-    x.Resize({3 * d});
-    ht_1.Resize({d});
-    ht.Resize({d});
-    RandomVec<T>(3 * d, x.mutable_data<T>(place), -2.f, 2.f);
-    RandomVec<T>(d, ht_1.mutable_data<T>(place), -2.f, 2.f);
-    const T* ht_1_data = ht_1.data<T>();
-    T* x_data = x.mutable_data<T>(place);
-    T* ht_data = ht.mutable_data<T>(place);
-    jit::gru_t step;
-    step.gates = x_data;
-    step.ht_1 = ht_1_data;
-    step.ht = ht_data;
-    BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelSeqPool() {
-  using T = typename KernelTuple::data_type;
-  std::vector<jit::SeqPoolType> pool_types = {
-      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
-  for (auto type : pool_types) {
-    for (int w : TestSizes()) {
-      jit::seq_pool_attr_t attr(w, type);
-      for (int h : TestSizes()) {
-        attr.h = h;
-        Tensor x, y;
-        x.Resize({h * w});
-        y.Resize({w});
-        RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
-        const T* x_data = x.data<T>();
-        T* y_data = y.mutable_data<T>(PlaceType());
-        BenchAllImpls<KernelTuple, PlaceType>(attr, x_data, y_data, &attr);
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelEmbSeqPool() {
-  using T = typename KernelTuple::data_type;
-  std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum};
-  int64_t tbl_h = 1e4;
-  for (int tbl_w : {10, 16, 256}) {
-    Tensor table;
-    table.Resize({tbl_h, tbl_w});
-    RandomVec<T>(tbl_h * tbl_w, table.mutable_data<T>(PlaceType()), -2.f, 2.f);
-    const T* table_data = table.data<T>();
-    for (auto type : pool_types) {
-      for (int idx_w : {1, 2, 10, 16}) {
-        for (int idx_h : {1, 2, 9, 13, 16}) {
-          int64_t out_w = tbl_w * idx_w;
-          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
-                                        type);
-          Tensor idx, out;
-          idx.Resize({idx_h, idx_w});
-          out.Resize({out_w});
-          RandomVec<int64_t>(idx_h * idx_w,
-                             idx.mutable_data<int64_t>(PlaceType()), 0,
-                             tbl_h - 1);
-          const int64_t* idx_data = idx.data<int64_t>();
-          T* o_data = out.mutable_data<T>(PlaceType());
-          BenchAllImpls<KernelTuple, PlaceType>(attr, table_data, idx_data,
-                                                o_data, &attr);
-        }
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelSgd() {
-  using T = typename KernelTuple::data_type;
-  const T lr = 0.1;
-  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
-                                  const int64_t upper) -> std::vector<int64_t> {
-    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
-    PADDLE_ENFORCE_GT(n, 0);
-    std::vector<int64_t> all, out;
-    for (int i = 0; i < n; ++i) {
-      all.push_back(i);
-    }
-    std::random_shuffle(all.begin(), all.end());
-    out.insert(out.begin(), all.begin(), all.begin() + n);
-    return out;
-  };
-  for (int param_h : {1, 1000}) {
-    for (int grad_w : {1, 2, 8, 16, 30, 256}) {
-      // only benchmark inplace
-      Tensor param;
-      param.Resize({param_h, grad_w});
-      T* param_data = param.mutable_data<T>(PlaceType());
-      RandomVec<T>(param_h * grad_w, param_data, -2.f, 2.f);
-      for (int rows_size = 1; rows_size <= std::min(param_h, 10); ++rows_size) {
-        Tensor grad;
-        grad.Resize({rows_size, grad_w});
-        std::vector<int64_t> rows =
-            UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
-        RandomVec<T>(rows_size * grad_w, grad.mutable_data<T>(PlaceType()),
-                     -2.f, 2.f);
-        const T* grad_data = grad.data<T>();
-        const int64_t* rows_data = rows.data();
-        jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
-        BenchAllImpls<KernelTuple, PlaceType>(attr, &lr, param_data, grad_data,
-                                              rows_data, param_data, &attr);
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelMatMul() {
-  using T = typename KernelTuple::data_type;
-  for (int m : {1, 2, 3, 4}) {
-    for (int n : TestSizes()) {
-      for (int k : TestSizes()) {
-        Tensor a, b, c;
-        a.Resize({m * k});
-        b.Resize({k * n});
-        c.Resize({m * n});
-        RandomVec<T>(m * k, a.mutable_data<T>(PlaceType()), -2.f, 2.f);
-        RandomVec<T>(k * n, b.mutable_data<T>(PlaceType()), -2.f, 2.f);
-        const T* a_data = a.data<T>();
-        const T* b_data = b.data<T>();
-        T* c_data = c.mutable_data<T>(PlaceType());
-        const jit::matmul_attr_t attr{m, n, k};
-        BenchAllImpls<KernelTuple, PlaceType>(attr, a_data, b_data, c_data,
-                                              &attr);
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelSoftmax() {
-  using T = typename KernelTuple::data_type;
-  for (int bs : {1, 2, 10}) {
-    for (int n : TestSizes()) {
-      Tensor x, y;
-      x.Resize({bs, n});
-      y.Resize({bs, n});
-      RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
-      const T* x_data = x.data<T>();
-      T* y_data = y.mutable_data<T>(PlaceType());
-      BenchAllImpls<KernelTuple, PlaceType>(n, x_data, y_data, n, bs, 1);
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelLayerNorm() {
-  using T = typename KernelTuple::data_type;
-  const T epsilon = 9.99999975e-06;
-  for (int n : {1, 2, 10}) {
-    for (int x_dim_0 : {1, 9, 17, 50}) {
-      int left = n * x_dim_0;
-      for (int x_dim_1 : TestSizes()) {
-        int right = x_dim_1;
-        int sz = left * right;
-        Tensor x, mean, var, scale, bias, out;
-        x.Resize({n, x_dim_0, x_dim_1});
-        out.Resize({n, x_dim_0, x_dim_1});
-        mean.Resize({n, x_dim_0});
-        var.Resize({n, x_dim_0});
-        scale.Resize({x_dim_1});
-        bias.Resize({x_dim_1});
-
-        RandomVec<T>(sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
-        RandomVec<T>(left, mean.mutable_data<T>(PlaceType()), -2.f, 2.f);
-        RandomVec<T>(left, var.mutable_data<T>(PlaceType()), -2.f, 2.f);
-        RandomVec<T>(right, scale.mutable_data<T>(PlaceType()), -2.f, 2.f);
-        RandomVec<T>(right, bias.mutable_data<T>(PlaceType()), -2.f, 2.f);
-
-        const T* scale_data = scale.data<T>();
-        const T* bias_data = bias.data<T>();
-        T* x_data = x.data<T>();
-        T* mean_data = mean.data<T>();
-        T* var_data = var.data<T>();
-        T* out_data = out.mutable_data<T>(PlaceType());
-
-        BenchAllImpls<KernelTuple, PlaceType>(right, x_data, out_data,
-                                              mean_data, var_data, scale_data,
-                                              bias_data, left, epsilon, right);
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelCRFDecoding() {
-  using T = typename KernelTuple::data_type;
-  constexpr int state_trans_base_idx = 2;
-  for (int seq_len : {1, 11, 17, 50}) {
-    for (int tag_num : TestSizes()) {
-      int x_sz = seq_len * tag_num;
-      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
-      Tensor x, w, alpha, track;
-      x.Resize({seq_len, tag_num});
-      w.Resize({tag_num + state_trans_base_idx, tag_num});
-      alpha.Resize({seq_len, tag_num});
-      track.Resize({seq_len, tag_num});
-
-      RandomVec<T>(x_sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
-      RandomVec<T>(w_sz, w.mutable_data<T>(PlaceType()), -2.f, 2.f);
-
-      const T* x_data = x.data<T>();
-      const T* w_data = w.data<T>();
-      T* alpha_data = alpha.mutable_data<T>(PlaceType());
-      int* track_data = track.mutable_data<int>(PlaceType());
-
-      BenchAllImpls<KernelTuple, PlaceType>(tag_num, seq_len, x_data, w_data,
-                                            alpha_data, track_data, tag_num);
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void BenchKernelVBroadcast() {
-  using T = typename KernelTuple::data_type;
-  for (int64_t w : {1, 16, 64, 100, 256}) {
-    Tensor x;
-    x.Resize({w});
-    RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
-    const T* x_data = x.data<T>();
-    for (int h : TestSizes()) {
-      Tensor y;
-      y.Resize({h * w});
-      T* y_data = y.mutable_data<T>(PlaceType());
-      BenchAllImpls<KernelTuple, PlaceType>(w, x_data, y_data,
-                                            static_cast<int64_t>(h), w);
-    }
-  }
-}
-
-#define BenchKernelVMul BenchKernelXYZN
-#define BenchKernelVAdd BenchKernelXYZN
-#define BenchKernelVAddRelu BenchKernelXYZN
-#define BenchKernelVSub BenchKernelXYZN
-
-#define BenchKernelVScal BenchKernelAXYN
-#define BenchKernelVAddBias BenchKernelAXYN
-
-#define BenchKernelVRelu BenchKernelXYN
-#define BenchKernelVIdentity BenchKernelXYN
-#define BenchKernelVSquare BenchKernelXYN
-#define BenchKernelVExp BenchKernelXYN
-#define BenchKernelVSigmoid BenchKernelXYN
-#define BenchKernelVTanh BenchKernelXYN
-#define BenchKernelVCopy BenchKernelXYN
-
-#define BenchKernelHMax BenchKernelXRN
-#define BenchKernelHSum BenchKernelXRN
-
-#define BenchKernelLSTMCtHt BenchKernelLSTM
-#define BenchKernelLSTMC1H1 BenchKernelLSTM
-
-#define BenchKernelGRUH1 BenchKernelGRU
-#define BenchKernelGRUHtPart1 BenchKernelGRU
-#define BenchKernelGRUHtPart2 BenchKernelGRU
-
-using CPUPlace = paddle::platform::CPUPlace;
-
-#define BENCH_FP32_CPU(name)                                \
-  BENCH_JITKERNEL(name, FP32, CPU) {                        \
-    BenchKernel##name<jit::name##Tuple<float>, CPUPlace>(); \
-  }
-
-// xyzn
-BENCH_FP32_CPU(VMul);
-BENCH_FP32_CPU(VAdd);
-BENCH_FP32_CPU(VAddRelu);
-BENCH_FP32_CPU(VSub);
-
-// axyn
-BENCH_FP32_CPU(VScal);
-BENCH_FP32_CPU(VAddBias);
-
-// xyn
-BENCH_FP32_CPU(VRelu);
-BENCH_FP32_CPU(VIdentity);
-BENCH_FP32_CPU(VSquare);
-BENCH_FP32_CPU(VExp);
-BENCH_FP32_CPU(VSigmoid);
-BENCH_FP32_CPU(VTanh);
-BENCH_FP32_CPU(VCopy);
-
-// xrn
-BENCH_FP32_CPU(HMax);
-BENCH_FP32_CPU(HSum);
-
-// LSTM
-BENCH_FP32_CPU(LSTMCtHt);
-BENCH_FP32_CPU(LSTMC1H1);
-
-// GRU
-BENCH_FP32_CPU(GRUH1);
-BENCH_FP32_CPU(GRUHtPart1);
-BENCH_FP32_CPU(GRUHtPart2);
-
-BENCH_FP32_CPU(LayerNorm);
-BENCH_FP32_CPU(CRFDecoding);
-
-BENCH_FP32_CPU(SeqPool);
-BENCH_FP32_CPU(EmbSeqPool);
-BENCH_FP32_CPU(MatMul);
-BENCH_FP32_CPU(Softmax);
-BENCH_FP32_CPU(Sgd);
-BENCH_FP32_CPU(VBroadcast);
-
-// Benchmark all jit kernels including jitcode, mkl and refer.
-// To use this tool, run command: ./benchmark [options...]
-// Options:
-//     --burning: the burning time before count
-//     --repeat: the repeat times
-//     --max_size: the max size would be tested
-//     --filter: the bench name would be run
-int main(int argc, char* argv[]) {
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  google::InitGoogleLogging(argv[0]);
-  LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat
-            << " times.";
-
-  RUN_ALL_BENCHMARK();
-}
diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
deleted file mode 100644
index 99244ea9bd919a018732b75d1ab811e8bf338516..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-
-file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-
-cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak)
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE)
-
-function(USE_JITKERNEL_GEN TARGET)
-    file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n")
-endfunction()
-
-# use gen jitcode kernel by name
-USE_JITKERNEL_GEN(kMatMul)
-USE_JITKERNEL_GEN(kVMul)
-USE_JITKERNEL_GEN(kVAdd)
-USE_JITKERNEL_GEN(kVSub)
-USE_JITKERNEL_GEN(kVAddRelu)
-USE_JITKERNEL_GEN(kVScal)
-USE_JITKERNEL_GEN(kVAddBias)
-USE_JITKERNEL_GEN(kVRelu)
-USE_JITKERNEL_GEN(kVSquare)
-USE_JITKERNEL_GEN(kVIdentity)
-USE_JITKERNEL_GEN(kVExp)
-USE_JITKERNEL_GEN(kVSigmoid)
-USE_JITKERNEL_GEN(kVTanh)
-USE_JITKERNEL_GEN(kLSTMCtHt)
-USE_JITKERNEL_GEN(kLSTMC1H1)
-USE_JITKERNEL_GEN(kGRUH1)
-USE_JITKERNEL_GEN(kGRUHtPart1)
-USE_JITKERNEL_GEN(kGRUHtPart2)
-USE_JITKERNEL_GEN(kNCHW16CMulNC)
-USE_JITKERNEL_GEN(kSeqPool)
-USE_JITKERNEL_GEN(kHMax)
-USE_JITKERNEL_GEN(kHSum)
-USE_JITKERNEL_GEN(kEmbSeqPool)
-USE_JITKERNEL_GEN(kSgd)
-USE_JITKERNEL_GEN(kVBroadcast)
diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc
deleted file mode 100644
index ad68e792c7a8ec4fb600a5b04153ad45895d761a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/act.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/act.h"
-#include <memory>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
-    REPEAT_8TIMES(1.f),
-    REPEAT_8TIMES(2.f),
-    REPEAT_8TIMES(0.5f),
-    REPEAT_8TIMES(EXP_HIG),
-    REPEAT_8TIMES(EXP_LOW),
-    REPEAT_8TIMES(CEPHES_LOG2EF),
-    REPEAT_8TIMES(CEPHES_EXP_C1),
-    REPEAT_8TIMES(CEPHES_EXP_C2),
-    REPEAT_8TIMES(CEPHES_EXP_P0),
-    REPEAT_8TIMES(CEPHES_EXP_P1),
-    REPEAT_8TIMES(CEPHES_EXP_P2),
-    REPEAT_8TIMES(CEPHES_EXP_P3),
-    REPEAT_8TIMES(CEPHES_EXP_P4),
-    REPEAT_8TIMES(CEPHES_EXP_P5),
-    REPEAT_8TIMES(EXP_MAX_INPUT),
-    REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
-    REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
-
-const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
-int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
-
-void VActJitCode::genCode() {
-  int offset = 0;
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    vmovups(ymm_src, ptr[param1 + offset]);
-    act<ymm_t>(ymm_dst, ymm_src, type_);
-    vmovups(ptr[param2 + offset], ymm_dst);
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-  int rest = num_ % YMM_FLOAT_BLOCK;
-  while (rest > 0) {
-    int block = XMM_FLOAT_BLOCK;
-    if (rest >= 4) {
-      block = 4;
-      vmovups(xmm_src, ptr[param1 + offset]);
-    } else if (rest >= 2) {
-      block = 2;
-      vmovq(xmm_src, ptr[param1 + offset]);
-    } else {
-      block = 1;
-      vmovss(xmm_src, ptr[param1 + offset]);
-    }
-    act<xmm_t>(xmm_dst, xmm_src, type_);
-    if (rest >= 4) {
-      vmovups(ptr[param2 + offset], xmm_dst);
-    } else if (rest >= 2) {
-      vmovq(ptr[param2 + offset], xmm_dst);
-    } else {
-      vmovss(ptr[param2 + offset], xmm_dst);
-    }
-    offset += sizeof(float) * block;
-    rest -= block;
-  }
-  ret();
-}
-
-#define DECLARE_ACT_CREATOR(name)                                            \
-  class name##Creator : public JitCodeCreator<int> {                         \
-   public:                                                                   \
-    bool CanBeUsed(const int& attr) const override;                          \
-    size_t CodeSize(const int& d) const override;                            \
-    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
-      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
-    }                                                                        \
-  }
-
-DECLARE_ACT_CREATOR(VRelu);
-DECLARE_ACT_CREATOR(VSquare);
-DECLARE_ACT_CREATOR(VIdentity);
-DECLARE_ACT_CREATOR(VExp);
-DECLARE_ACT_CREATOR(VSigmoid);
-DECLARE_ACT_CREATOR(VTanh);
-
-// TODO(TJ): tuning use me
-bool VReluCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
-}
-
-bool VSquareCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
-}
-
-bool VIdentityCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
-}
-
-bool VExpCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx) && d < 32;
-}
-
-bool VSigmoidCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
-}
-
-bool VTanhCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
-}
-
-size_t VReluCreator::CodeSize(const int& d) const {
-  return 96 /* init size */ +
-         (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ *
-             8 /* average bytes for each instruction */;
-}
-
-size_t VSquareCreator::CodeSize(const int& d) const {
-  return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8;
-}
-
-size_t VIdentityCreator::CodeSize(const int& d) const {
-  return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8;
-}
-
-size_t VExpCreator::CodeSize(const int& d) const {
-  return 96 + (d / YMM_FLOAT_BLOCK + 3) * 70 * 8;
-}
-
-size_t VSigmoidCreator::CodeSize(const int& d) const {
-  return 96 + (d / YMM_FLOAT_BLOCK + 3) * 82 * 8;
-}
-
-size_t VTanhCreator::CodeSize(const int& d) const {
-  return 96 + (d / YMM_FLOAT_BLOCK + 3) * 84 * 8;
-}
-
-#undef DECLARE_ACT_CREATOR
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator);
-REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator);
-REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator);
-REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator);
-REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator);
-REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator);
diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h
deleted file mode 100644
index 13d98577e21db9041686822f57cb4992e5ad71ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/act.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-extern const float exp_float_consts[];
-extern const int exp_int_0x7f[];
-extern int g_tmp_mem[];
-
-#define EXP_HIG 88.3762626647949f
-#define EXP_LOW -88.3762626647949f
-#define CEPHES_LOG2EF 1.44269504088896341
-#define CEPHES_EXP_C1 0.693359375
-#define CEPHES_EXP_C2 -2.12194440e-4
-#define CEPHES_EXP_P0 1.9875691500E-4
-#define CEPHES_EXP_P1 1.3981999507E-3
-#define CEPHES_EXP_P2 8.3334519073E-3
-#define CEPHES_EXP_P3 4.1665795894E-2
-#define CEPHES_EXP_P4 1.6666665459E-1
-#define CEPHES_EXP_P5 5.0000001201E-1
-
-#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
-
-#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
-
-class VActFunc : public JitCode {
- public:
-  explicit VActFunc(size_t code_size, void* code_ptr)
-      : JitCode(code_size, code_ptr) {}
-  virtual void genCode() = 0;
-
- protected:
-  // compute RELU with ymm, xmm
-  template <typename JMM>
-  void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) {  // NOLINT
-    JMM zero = JMM(zero_idx);
-    vxorps(zero, zero, zero);
-    vmaxps(dst, src, zero);
-  }
-
-  // compute SQUARE with ymm, xmm
-  template <typename JMM>
-  void square_jmm(JMM& dst, JMM& src) {  // NOLINT
-    vmulps(dst, src, src);
-  }
-
-  // compute EXP with ymm, xmm
-  template <typename JMM>
-  void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12,  // NOLINT
-               int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) {
-    using namespace platform;  // NOLINT
-    // check all idx can not equal
-    JMM jmm_src = JMM(src_idx);
-    JMM jmm_fx = JMM(fx_idx);
-    JMM jmm_fy = JMM(fy_idx);
-    JMM jmm_mask = JMM(mask_idx);
-    JMM jmm_tmp = JMM(tmp_idx);
-    reg64_t reg_ptr_global = rax;
-    push(reg_ptr_global);
-    vmovaps(jmm_src, src);
-    mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
-    vminps(jmm_src, jmm_src, jmm_tmp);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
-    vmaxps(jmm_src, jmm_src, jmm_tmp);
-    // express exp(x) as exp(g + n*log(2))
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
-    vmulps(jmm_fx, jmm_src, jmm_tmp);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
-    vaddps(jmm_fx, jmm_fx, jmm_tmp);
-    vroundps(jmm_fy, jmm_fx, 0x01);
-    // if greater, substract 1
-    vcmpgtps(jmm_mask, jmm_fy, jmm_fx);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global]);
-    vandps(jmm_mask, jmm_mask, jmm_tmp);
-    vsubps(jmm_fx, jmm_fy, jmm_mask);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]);
-    vmulps(jmm_fy, jmm_fx, jmm_tmp);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
-    JMM ymm_z = JMM(jmm_mask.getIdx());
-    vmulps(ymm_z, jmm_fx, jmm_tmp);
-    vsubps(jmm_src, jmm_src, jmm_fy);
-    vsubps(jmm_src, jmm_src, ymm_z);
-    vmulps(ymm_z, jmm_src, jmm_src);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
-    vmulps(dst, jmm_src, jmm_tmp);
-    for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
-         i += (YMM_FLOAT_BLOCK * sizeof(float))) {
-      vmovaps(jmm_tmp, ptr[reg_ptr_global + i]);  // P1~P4
-      vaddps(dst, dst, jmm_tmp);
-      vmulps(dst, dst, jmm_src);
-    }
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
-    vaddps(dst, dst, jmm_tmp);
-    vmulps(dst, dst, ymm_z);
-    vaddps(dst, dst, jmm_src);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global]);
-    vaddps(dst, dst, jmm_tmp);
-    // build 2^n
-    JMM ymm_int = jmm_fx;
-    vcvttps2dq(ymm_int, jmm_fx);
-    mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
-    vmovdqa(jmm_tmp, ptr[reg_ptr_global]);
-    if (MayIUse(avx2) || std::is_same<JMM, xmm_t>::value) {
-      vpaddd(ymm_int, ymm_int, jmm_tmp);
-      vpslld(ymm_int, ymm_int, 23);
-    } else if (MayIUse(avx)) {
-      xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
-      xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx());
-      reg64_t reg_ptr_tmp = reg_ptr_global;
-      mov(reg_ptr_tmp, reinterpret_cast<size_t>(g_tmp_mem));
-      vmovdqa(ptr[reg_ptr_tmp], ymm_int);
-      vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp);
-      vpaddd(xtmp1, xtmp1, xtmp2);
-      vpslld(xtmp1, xtmp1, 23);
-      vmovdqa(ptr[reg_ptr_tmp], xtmp1);
-      // next 128bits
-      vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]);
-      vmovdqa(xtmp2, ptr[reg_ptr_tmp +
-                         (YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]);
-      vpaddd(xtmp1, xtmp1, xtmp2);
-      vpslld(xtmp1, xtmp1, 23);
-      vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1);
-      // load out
-      vmovdqa(ymm_int, ptr[reg_ptr_tmp]);
-    }
-    vmulps(dst, dst, ymm_int);
-    pop(reg_ptr_global);
-  }
-
-  // compute SIGMOID with ymm, xmm
-  template <typename JMM>
-  void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11,  // NOLINT
-                   int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
-                   int tmp_idx = 15) {
-    // y = 1 / (1 + e^-x)
-    JMM jmm_tmp = JMM(tmp_idx);
-    JMM jmm_src = JMM(src_idx);
-    reg64_t reg_ptr_global = rax;
-    push(reg_ptr_global);
-    vmovaps(jmm_src, src);
-    mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
-    vminps(jmm_src, jmm_src, jmm_tmp);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
-    vmaxps(jmm_src, jmm_src, jmm_tmp);
-    vxorps(jmm_tmp, jmm_tmp, jmm_tmp);
-    vsubps(jmm_src, jmm_tmp, jmm_src);
-    exp_jmm<JMM>(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
-    vaddps(dst, dst, jmm_tmp);
-    vdivps(dst, jmm_tmp, dst);
-    pop(reg_ptr_global);
-  }
-
-  // compute TANH with ymm, xmm
-  template <typename JMM>
-  void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11,  // NOLINT
-                int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
-                int tmp_idx = 15) {
-    // y = 2 / (1 + e^(-2x)) - 1
-    JMM jmm_src = JMM(src_idx);
-    JMM jmm_tmp = JMM(tmp_idx);
-    JMM jmm_zero = JMM(mask_idx);
-    reg64_t reg_ptr_global = rax;
-    push(reg_ptr_global);
-    vmovaps(jmm_src, src);
-    mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
-    vxorps(jmm_zero, jmm_zero, jmm_zero);
-    vsubps(jmm_tmp, jmm_zero, jmm_tmp);
-    vmulps(jmm_src, jmm_src, jmm_tmp);
-    exp_jmm<JMM>(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
-    vaddps(dst, dst, jmm_tmp);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
-    vdivps(dst, jmm_tmp, dst);
-    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
-    vsubps(dst, dst, jmm_tmp);
-    pop(reg_ptr_global);
-  }
-
-  // compute IDENTITY with ymm, xmm
-  template <typename JMM>
-  void identity_jmm(JMM& dst, JMM& src, int zero_idx) {  // NOLINT
-    JMM zero = JMM(zero_idx);
-    vxorps(zero, zero, zero);
-    vaddps(dst, src, zero);
-    // TODO(TJ): use below
-    // dst.setIdx(src.getIdx());
-  }
-
-  template <typename JMM>
-  void act(JMM& dst, JMM& src, operand_type type) {  // NOLINT
-    // use 11~15
-    switch (type) {
-      case operand_type::RELU:
-        relu_jmm<JMM>(dst, src, 15);
-        break;
-      case operand_type::SQUARE:
-        square_jmm<JMM>(dst, src);
-        break;
-      case operand_type::EXP:
-        exp_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
-        break;
-      case operand_type::SIGMOID:
-        sigmoid_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
-        break;
-      case operand_type::TANH:
-        tanh_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
-        break;
-      case operand_type::IDENTITY:
-        identity_jmm<JMM>(dst, src, 15);
-        break;
-      default:
-        LOG(FATAL) << "Do not support this operand type: " << type;
-        break;
-    }
-  }
-};
-
-class VActJitCode : public VActFunc {
- public:
-  explicit VActJitCode(int d, operand_type type, size_t code_size,
-                       void* code_ptr = nullptr)
-      : VActFunc(code_size, code_ptr), num_(d), type_(type) {
-    if (!(type_ == operand_type::RELU || type_ == operand_type::EXP ||
-          type_ == operand_type::SIGMOID || type_ == operand_type::TANH ||
-          type_ == operand_type::IDENTITY || type_ == operand_type::SQUARE)) {
-      LOG(FATAL) << "Do not support this operand type: " << type_;
-    }
-    this->genCode();
-  }
-
-  std::string name() const override {
-    std::string base = "VActJitCode";
-    switch (type_) {
-      case operand_type::RELU:
-        base += "_Relu";
-        break;
-      case operand_type::SQUARE:
-        base += "_Square";
-        break;
-      case operand_type::EXP:
-        base += "_Exp";
-        break;
-      case operand_type::SIGMOID:
-        base += "_Sigmoid";
-        break;
-      case operand_type::TANH:
-        base += "_Tanh";
-        break;
-      case operand_type::IDENTITY:
-        base += "_Identity";
-        break;
-      default:
-        break;
-    }
-    return base;
-  }
-  void genCode() override;
-
- protected:
-  int num_;
-  operand_type type_;
-  reg64_t param1{abi_param1};
-  reg64_t param2{abi_param2};
-
-  xmm_t xmm_src = xmm_t(0);
-  ymm_t ymm_src = ymm_t(0);
-
-  xmm_t xmm_dst = xmm_t(1);
-  ymm_t ymm_dst = ymm_t(1);
-};
-
-#define DECLARE_ACT_JITCODE(name, op_type)                                    \
-  class name##JitCode : public VActJitCode {                                  \
-   public:                                                                    \
-    explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \
-        : VActJitCode(d, op_type, code_size, code_ptr) {}                     \
-  };
-
-DECLARE_ACT_JITCODE(VRelu, operand_type::RELU);
-DECLARE_ACT_JITCODE(VSquare, operand_type::SQUARE);
-DECLARE_ACT_JITCODE(VIdentity, operand_type::IDENTITY);
-DECLARE_ACT_JITCODE(VExp, operand_type::EXP);
-DECLARE_ACT_JITCODE(VSigmoid, operand_type::SIGMOID);
-DECLARE_ACT_JITCODE(VTanh, operand_type::TANH);
-
-#undef DECLARE_ACT_JITCODE
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc
deleted file mode 100644
index c126b9077ae50f528074210bae39227a9fcd3277..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/blas.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/blas.h"
-#include <memory>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-void VXXJitCode::genCode() {
-  // do not need push stack, and do not need save avx512reg if do not use avx512
-  int offset = 0;
-  if (with_relu_) {
-    vxorps(ymm_zero, ymm_zero, ymm_zero);
-  }
-  if (scalar_index_ == 1) {
-    vbroadcastss(ymm_src1, ptr[param1]);
-  } else if (scalar_index_ == 2) {
-    vbroadcastss(ymm_src2, ptr[param2]);
-  }
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    if (scalar_index_ != 1) {
-      vmovups(ymm_src1, ptr[param1 + offset]);
-    }
-    if (scalar_index_ != 2) {
-      vmovups(ymm_src2, ptr[param2 + offset]);
-    }
-    if (type_ == operand_type::MUL) {
-      vmulps(ymm_dst, ymm_src1, ymm_src2);
-    } else if (type_ == operand_type::ADD) {
-      vaddps(ymm_dst, ymm_src1, ymm_src2);
-    } else if (type_ == operand_type::SUB) {
-      vsubps(ymm_dst, ymm_src1, ymm_src2);
-    }
-    if (with_relu_) {
-      vmaxps(ymm_dst, ymm_zero, ymm_dst);
-    }
-    vmovups(ptr[param3 + offset], ymm_dst);
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-  int rest = num_ % YMM_FLOAT_BLOCK;
-  while (rest > 0) {
-    int block = XMM_FLOAT_BLOCK;
-    if (rest >= 4) {
-      block = 4;
-      if (scalar_index_ != 1) {
-        vmovups(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovups(xmm_src2, ptr[param2 + offset]);
-      }
-    } else if (rest >= 2) {
-      block = 2;
-      if (scalar_index_ != 1) {
-        vmovq(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovq(xmm_src2, ptr[param2 + offset]);
-      }
-    } else {
-      block = 1;
-      if (scalar_index_ != 1) {
-        vmovss(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovss(xmm_src2, ptr[param2 + offset]);
-      }
-    }
-    switch (type_) {
-      case operand_type::MUL:
-        vmulps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      case operand_type::ADD:
-        vaddps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      case operand_type::SUB:
-        vsubps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      default:
-        break;
-    }
-    if (with_relu_) {
-      vmaxps(xmm_dst, xmm_zero, xmm_dst);
-    }
-    if (rest >= 4) {
-      vmovups(ptr[param3 + offset], xmm_dst);
-    } else if (rest >= 2) {
-      vmovq(ptr[param3 + offset], xmm_dst);
-    } else {
-      vmovss(ptr[param3 + offset], xmm_dst);
-    }
-    offset += sizeof(float) * block;
-    rest -= block;
-  }
-  ret();
-}
-
-void NCHW16CMulNCJitCode::genCode() {
-  // RDI is ptr x_input
-  // RSI is ptr y_input
-  // RDX is ptr output
-  // RCX is height
-  // r8 is width
-
-  push(rbx);
-
-  xor_(rax, rax);
-  xor_(r10, r10);
-  vmovups(zmm3, ptr[rsi]);
-
-  L("h_loop");
-  xor_(rbx, rbx);
-  L("w_loop");
-  vmovups(zmm2, ptr[rdi + rax]);
-  vmulps(zmm1, zmm2, zmm3);
-  vmovups(ptr[rdx + rax], zmm1);
-  add(rax, 64);
-  inc(rbx);
-  cmp(r8, rbx);
-  jnz("w_loop");
-  inc(r10);
-  cmp(r10, rcx);
-  jnz("h_loop");
-
-  pop(rbx);
-  ret();
-}
-
-class NCHW16CMulNCCreator : public JitCodeCreator<int> {
- public:
-  bool CanBeUsed(const int& attr) const override {
-    return platform::MayIUse(platform::avx512f);
-  }
-  size_t CodeSize(const int& d) const override { return 256 * 1024; }
-  std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
-    return make_unique<NCHW16CMulNCJitCode>(attr, CodeSize(attr));
-  }
-};
-
-#define DECLARE_BLAS_CREATOR(name)                                           \
-  class name##Creator : public JitCodeCreator<int> {                         \
-   public:                                                                   \
-    bool CanBeUsed(const int& attr) const override {                         \
-      return platform::MayIUse(platform::avx) && attr <= 1024;               \
-    }                                                                        \
-    size_t CodeSize(const int& d) const override {                           \
-      return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                               \
-    }                                                                        \
-    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
-      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
-    }                                                                        \
-  }
-
-DECLARE_BLAS_CREATOR(VMul);
-DECLARE_BLAS_CREATOR(VAdd);
-DECLARE_BLAS_CREATOR(VSub);
-DECLARE_BLAS_CREATOR(VAddRelu);
-DECLARE_BLAS_CREATOR(VScal);
-DECLARE_BLAS_CREATOR(VAddBias);
-
-#undef DECLARE_BLAS_CREATOR
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator);
-REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator);
-REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator);
-REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator);
-REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator);
-REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator);
-REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator);
diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h
deleted file mode 100644
index 70312bbe5e97fcf465ce13ef71e5acc9bab4874e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/blas.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
-class VXXJitCode : public JitCode {
- public:
-  explicit VXXJitCode(int d, operand_type type, int scalar_index,
-                      bool with_relu, size_t code_size = 256 * 1024,
-                      void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr),
-        num_(d),
-        type_(type),
-        scalar_index_(scalar_index),
-        with_relu_(with_relu) {
-    if (!(type_ == operand_type::MUL || type_ == operand_type::ADD ||
-          type_ == operand_type::SUB)) {
-      LOG(FATAL) << "Do not support this operand type: " << type_;
-    }
-    this->genCode();
-  }
-
-  std::string name() const override {
-    std::string base = "VXXJitCode";
-    if (scalar_index_ == 1) {
-      base += "_Scalar";
-    } else {
-      base += "_Vec";
-    }
-    if (type_ == operand_type::MUL) {
-      base += "_Mul";
-    } else if (type_ == operand_type::ADD) {
-      base += "_Add";
-    } else if (type_ == operand_type::SUB) {
-      base += "_SUB";
-    }
-    if (scalar_index_ == 2) {
-      base += "_Scalar";
-    } else {
-      base += "_Vec";
-    }
-    base += (with_relu_ ? "_Relu" : "");
-    base += "_D" + std::to_string(num_);
-    return base;
-  }
-  void genCode() override;
-
- private:
-  int num_;
-  operand_type type_;
-  int scalar_index_;
-  bool with_relu_;
-  reg64_t param1{abi_param1};
-  reg64_t param2{abi_param2};
-  reg64_t param3{abi_param3};
-
-  xmm_t xmm_src1 = xmm_t(0);
-  xmm_t xmm_src2 = xmm_t(1);
-  xmm_t xmm_dst = xmm_t(2);
-  xmm_t xmm_zero = xmm_t(3);
-
-  ymm_t ymm_src1 = ymm_t(0);
-  ymm_t ymm_src2 = ymm_t(1);
-  ymm_t ymm_dst = ymm_t(2);
-  ymm_t ymm_zero = ymm_t(3);
-};
-
-#define DECLARE_BLAS_JITCODE(name, op_type, scalar_idx, with_relu)             \
-  class name##JitCode : public VXXJitCode {                                    \
-   public:                                                                     \
-    explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr)  \
-        : VXXJitCode(d, op_type, scalar_idx, with_relu, code_size, code_ptr) { \
-    }                                                                          \
-  };
-
-DECLARE_BLAS_JITCODE(VMul, operand_type::MUL, 0, false);
-DECLARE_BLAS_JITCODE(VAdd, operand_type::ADD, 0, false);
-DECLARE_BLAS_JITCODE(VSub, operand_type::SUB, 0, false);
-DECLARE_BLAS_JITCODE(VAddRelu, operand_type::ADD, 0, true);
-DECLARE_BLAS_JITCODE(VScal, operand_type::MUL, 1, false);
-DECLARE_BLAS_JITCODE(VAddBias, operand_type::ADD, 1, false);
-
-#undef DECLARE_BLAS_JITCODE
-
-// nChw16c = nChw16c .* NC
-class NCHW16CMulNCJitCode : public JitCode {
- public:
-  DECLARE_JIT_CODE(NCHW16CMulNCJitCode);
-  explicit NCHW16CMulNCJitCode(int d /*unused*/, size_t code_size,
-                               void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr) {
-    this->genCode();
-  }
-  void genCode() override;
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc
deleted file mode 100644
index 331a4b0d0753b37843c3d112256abfbabe9a4913..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/embseqpool.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/embseqpool.h"
-#include <stddef.h>  // offsetof
-#include <memory>
-#include <vector>
-#include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-void EmbSeqPoolJitCode::genCode() {
-  preCode();
-  constexpr int block = YMM_FLOAT_BLOCK;
-  constexpr int max_num_regs = 8;
-  const int num_block = tbl_w_ / block;
-  const int num_groups = num_block / max_num_regs;
-  const size_t block_size = sizeof(float) * block;
-  std::vector<int> groups(num_groups, max_num_regs);
-  int rest_num_regs = num_block % max_num_regs;
-  if (rest_num_regs > 0) {
-    groups.push_back(rest_num_regs);
-  }
-
-  // protect param_dst
-  mov(reg_ptr_param_dst, param_dst);
-  mov(reg_idx_width_in_byte,
-      qword[param_attr + offsetof(emb_seq_pool_attr_t, index_width)]);
-  mov(reg_idx_height,
-      qword[param_attr + offsetof(emb_seq_pool_attr_t, index_height)]);
-  mov(rax, sizeof(int64_t));
-  mul(reg_idx_width_in_byte);
-  mov(reg_idx_width_in_byte, rax);
-  const size_t tbl_width_in_byte = sizeof(float) * tbl_w_;
-  int acc_num_regs = 0;
-  for (int num_regs : groups) {
-    Label l_next_idx_w, l_next_idx_h, l_save_now;
-    xor_(reg_idx_w_i_in_byte, reg_idx_w_i_in_byte);
-    mov(reg_ptr_dst_i, reg_ptr_param_dst);
-    add(reg_ptr_dst_i, acc_num_regs * block_size);
-
-    L(l_next_idx_w);
-    {
-      // h == 0
-      mov(reg_ptr_idx_i, param_idx);
-      add(reg_ptr_idx_i, reg_idx_w_i_in_byte);
-      mov(reg_idx, qword[reg_ptr_idx_i]);
-      mov(rax, tbl_width_in_byte);
-      mul(reg_idx);
-      mov(reg_ptr_tbl_i, rax);        // reg is offset now
-      add(reg_ptr_tbl_i, param_tbl);  // reg is ptr_i now
-      size_t w_offset = 0;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_tbl_i + w_offset]);
-        w_offset += block_size;
-      }
-      add(reg_ptr_idx_i, reg_idx_width_in_byte);
-
-      // end condition of idx h
-      mov(reg_idx_h_end, reg_idx_height);
-      mov(rax, reg_idx_width_in_byte);
-      mul(reg_idx_h_end);
-      mov(reg_idx_h_end, rax);
-      add(reg_idx_h_end, reg_idx_w_i_in_byte);
-      add(reg_idx_h_end, param_idx);
-
-      cmp(reg_ptr_idx_i, reg_idx_h_end);
-      jge(l_save_now, T_NEAR);
-      L(l_next_idx_h);
-      {
-        mov(reg_idx, qword[reg_ptr_idx_i]);
-        mov(reg_ptr_tbl_i, reg_idx);
-        mov(rax, tbl_width_in_byte);
-        mul(reg_idx);
-        mov(reg_ptr_tbl_i, rax);
-        add(reg_ptr_tbl_i, param_tbl);
-        size_t w_offset = 0;
-        for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-          vmovups(ymm_t(reg_i), ptr[reg_ptr_tbl_i + w_offset]);
-          vaddps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs),
-                 ymm_t(reg_i));
-          w_offset += block_size;
-        }
-        add(reg_ptr_idx_i, reg_idx_width_in_byte);
-        cmp(reg_ptr_idx_i, reg_idx_h_end);
-        jl(l_next_idx_h, T_NEAR);
-      }  // end of idx h
-      L(l_save_now);
-      // avg or sqrt here, if needed
-      w_offset = 0;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i + num_regs));
-        w_offset += block_size;
-      }
-      add(reg_ptr_dst_i, tbl_width_in_byte);
-      add(reg_idx_w_i_in_byte, sizeof(int64_t));
-      cmp(reg_idx_w_i_in_byte, reg_idx_width_in_byte);
-      jl(l_next_idx_w, T_NEAR);
-    }  // end of idx w
-
-    acc_num_regs += num_regs;
-    add(param_tbl, num_regs * block_size);  // do not use acc_num_regs
-  }                                         // end of groups
-  postCode();
-}
-
-class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
- public:
-  bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx) &&
-           attr.table_width % YMM_FLOAT_BLOCK == 0;
-  }
-  size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
-    return 96 + (attr.table_width / YMM_FLOAT_BLOCK) * 96 * 8;
-  }
-  std::unique_ptr<GenBase> CreateJitCode(
-      const emb_seq_pool_attr_t& attr) const override {
-    PADDLE_ENFORCE_GT(attr.table_height, 0);
-    PADDLE_ENFORCE_GT(attr.table_width, 0);
-    PADDLE_ENFORCE_GT(attr.index_height, 0);
-    PADDLE_ENFORCE_GT(attr.index_width, 0);
-    PADDLE_ENFORCE_GT(attr.out_width, 0);
-    return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
-  }
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator);
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.h b/paddle/fluid/operators/jit/gen/embseqpool.h
deleted file mode 100644
index 5afcfbdc1786bef160864fcde06f8738207751be..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/embseqpool.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-class EmbSeqPoolJitCode : public JitCode {
- public:
-  explicit EmbSeqPoolJitCode(const emb_seq_pool_attr_t& attr,
-                             size_t code_size = 256 * 1024,
-                             void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr),
-        tbl_w_(attr.table_width),
-        type_(attr.pool_type) {
-    if (type_ != SeqPoolType::kSum) {
-      LOG(FATAL) << "Only support sum pool yet ";
-    }
-    this->genCode();
-  }
-
-  std::string name() const override {
-    std::string base = "EmbSeqPoolJitCode";
-    if (type_ == SeqPoolType::kSum) {
-      base += "_Sum";
-    } else if (type_ == SeqPoolType::kAvg) {
-      base += "_Avg";
-    } else if (type_ == SeqPoolType::kSqrt) {
-      base += "_Sqrt";
-    }
-    base += ("_W" + std::to_string(tbl_w_));
-    return base;
-  }
-  void genCode() override;
-
- private:
-  int tbl_w_;
-  SeqPoolType type_;
-  reg64_t param_tbl{abi_param1};
-  reg64_t param_idx{abi_param2};
-  reg64_t param_dst{abi_param3};
-  reg64_t param_attr{abi_param4};
-
-  reg64_t reg_tmp{rax};
-
-  reg64_t reg_idx_width_in_byte{r8};
-  reg64_t reg_idx_height{r9};
-
-  reg64_t reg_ptr_tbl_i{r10};
-  reg64_t reg_idx{r10};  // could use same of reg_ptr_tbl_i
-  reg64_t reg_ptr_idx_i{r11};
-  reg64_t reg_ptr_dst_i{r12};
-  reg64_t reg_ptr_param_dst{r13};  // rdx is used in mul so protect param_dst
-
-  reg64_t reg_idx_w_i_in_byte{r14};
-  reg64_t reg_idx_h_end{r15};
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc
deleted file mode 100644
index b5b0cffa80612c61829766027013f172962b5069..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/gru.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/gru.h"
-#include <stddef.h>  // offsetof
-#include <memory>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-void GRUJitCode::genCode() {
-  reg64_t reg_ptr_gates = rax;
-  reg64_t reg_ptr_ht_1 = r9;
-  reg64_t reg_ptr_ht = r10;
-  mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]);
-  mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]);
-  mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]);
-  ymm_t ymm_one = ymm_t(0);
-
-  if (id_ == 2) {
-    reg64_t reg_ptr_tmp = r11;
-    mov(reg_ptr_tmp, reinterpret_cast<size_t>(exp_float_consts));
-    vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]);
-  }
-  int offset = 0;
-  int d = num_ * sizeof(float);
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    ymm_t ymm_u = ymm_t(1);
-    ymm_t ymm_r = ymm_t(2);
-    ymm_t ymm_s = ymm_t(3);
-    ymm_t ymm_ht_1 = ymm_t(4);
-    // W: {W_update, W_reset; W_state}
-    if (id_ == 0 || id_ == 2) {
-      vmovups(ymm_u, ptr[reg_ptr_gates + offset]);
-      vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]);
-    }
-    if (id_ == 1) {
-      vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]);
-    }
-    if (id_ == 1 || id_ == 2) {
-      vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]);
-    }
-
-    if (id_ == 0) {
-      // ht = act_gate(u) * act_cand(s)
-      act<ymm_t>(ymm_u, ymm_u, act_gate_);
-      act<ymm_t>(ymm_s, ymm_s, act_cand_);
-      vmulps(ymm_s, ymm_s, ymm_u);
-      vmovups(ptr[reg_ptr_ht + offset], ymm_s);
-    } else if (id_ == 1) {
-      // ht = act_gate(r) * ht_1
-      act<ymm_t>(ymm_r, ymm_r, act_gate_);
-      vmulps(ymm_r, ymm_r, ymm_ht_1);
-      vmovups(ptr[reg_ptr_ht + offset], ymm_r);
-    } else if (id_ == 2) {
-      // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
-      ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx());
-      act<ymm_t>(ymm_u, ymm_u, act_gate_);
-      act<ymm_t>(ymm_s, ymm_s, act_cand_);
-      vmulps(ymm_s, ymm_s, ymm_u);
-      vsubps(ymm_u, ymm_one_inner, ymm_u);
-      vmulps(ymm_u, ymm_ht_1, ymm_u);
-      vaddps(ymm_u, ymm_s, ymm_u);
-      vmovups(ptr[reg_ptr_ht + offset], ymm_u);
-    }
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-  ret();
-}
-
-#define DECLARE_GRU_CREATOR(name)                                 \
-  class name##Creator : public JitCodeCreator<gru_attr_t> {       \
-   public:                                                        \
-    /* TODO(TJ): enable more */                                   \
-    bool CanBeUsed(const gru_attr_t& attr) const override {       \
-      return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
-    }                                                             \
-    size_t CodeSize(const gru_attr_t& attr) const override {      \
-      return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8;          \
-    }                                                             \
-    std::unique_ptr<GenBase> CreateJitCode(                       \
-        const gru_attr_t& attr) const override {                  \
-      return make_unique<name##JitCode>(attr, CodeSize(attr));    \
-    }                                                             \
-  }
-
-DECLARE_GRU_CREATOR(GRUH1);
-DECLARE_GRU_CREATOR(GRUHtPart1);
-DECLARE_GRU_CREATOR(GRUHtPart2);
-
-#undef DECLARE_GRU_CREATOR
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator);
-REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator);
-REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator);
diff --git a/paddle/fluid/operators/jit/gen/gru.h b/paddle/fluid/operators/jit/gen/gru.h
deleted file mode 100644
index d91f828e6aa7673265a460524dfcad119758aa77..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/gru.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/act.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-class GRUJitCode : public VActFunc {
- public:
-  explicit GRUJitCode(int id, const gru_attr_t& attr, size_t code_size,
-                      void* code_ptr = nullptr)
-      : VActFunc(code_size, code_ptr), id_(id), num_(attr.d) {
-    auto typeExchange = [](KernelType type) -> gen::operand_type {
-      if (type == KernelType::kVSigmoid) {
-        return operand_type::SIGMOID;
-      } else if (type == KernelType::kVRelu) {
-        return operand_type::RELU;
-      } else if (type == KernelType::kVTanh) {
-        return operand_type::TANH;
-      } else if (type == KernelType::kVIdentity) {
-        return operand_type::IDENTITY;
-      } else {
-        LOG(FATAL) << "Do not support this jit::KernelType: " << type;
-      }
-      return operand_type::IDENTITY;
-    };
-    act_gate_ = typeExchange(attr.act_gate);
-    act_cand_ = typeExchange(attr.act_cand);
-
-    this->genCode();
-  }
-
-  std::string name() const override {
-    std::string base = "GRUJitCode";
-    if (id_ == 0) {
-      base += "_H1";
-    } else if (id_ == 1) {
-      base += "_HtPart1";
-    } else if (id_ == 2) {
-      base += "_HtPart2";
-    }
-    auto AddTypeStr = [&](operand_type type) {
-      switch (type) {
-        case operand_type::RELU:
-          base += "_Relu";
-          break;
-        case operand_type::EXP:
-          base += "_Exp";
-          break;
-        case operand_type::SIGMOID:
-          base += "_Sigmoid";
-          break;
-        case operand_type::TANH:
-          base += "_Tanh";
-          break;
-        case operand_type::IDENTITY:
-          base += "_Identity";
-          break;
-        default:
-          break;
-      }
-    };
-    AddTypeStr(act_gate_);
-    AddTypeStr(act_cand_);
-    return base;
-  }
-  void genCode() override;
-
- protected:
-  int id_;
-  int num_;
-  operand_type act_gate_;
-  operand_type act_cand_;
-  reg64_t param1{abi_param1};
-};
-
-#define DECLARE_GRU_JITCODE(name, id)                                \
-  class name##JitCode : public GRUJitCode {                          \
-   public:                                                           \
-    explicit name##JitCode(const gru_attr_t& attr, size_t code_size, \
-                           void* code_ptr = nullptr)                 \
-        : GRUJitCode(id, attr, code_size, code_ptr) {}               \
-  };
-
-DECLARE_GRU_JITCODE(GRUH1, 0);
-DECLARE_GRU_JITCODE(GRUHtPart1, 1);
-DECLARE_GRU_JITCODE(GRUHtPart2, 2);
-
-#undef DECLARE_GRU_JITCODE
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc
deleted file mode 100644
index 462ac68a932e14b1200d503a937a35454c0e0618..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/hopv.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/hopv.h"
-#include <memory>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-void HOPVJitCode::genCode() {
-  const int num_blocks = num_ / YMM_FLOAT_BLOCK;
-  int offset = 0;
-
-  if (num_blocks > 0) {
-    // load one firstly
-    vmovups(ymm_tmp, ptr[param_src]);
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-    for (int i = 1; i < num_blocks; ++i) {
-      vmovups(ymm_src, ptr[param_src + offset]);
-      process(ymm_tmp, ymm_src, ymm_tmp);
-      offset += sizeof(float) * YMM_FLOAT_BLOCK;
-    }
-    vextractf128(xmm_dst, ymm_tmp, 1);
-    process(xmm_dst, xmm_dst, xmm_tmp);
-  } else {
-    if (type_ == operand_type::MAX) {
-      vbroadcastss(ymm_dst, ptr[param_src]);
-    } else if (type_ == operand_type::ADD) {
-      vxorps(ymm_dst, ymm_dst, ymm_dst);
-    }
-  }
-
-  int rest = num_ % YMM_FLOAT_BLOCK;
-  if (rest >= 4) {
-    vmovups(xmm_src, ptr[param_src + offset]);
-    offset += sizeof(float) * 4;
-    rest -= 4;
-    process(xmm_dst, xmm_dst, xmm_src);
-  }
-
-  vpermilps(xmm_tmp, xmm_dst, 16 + 8 + 3);
-  process(xmm_dst, xmm_dst, xmm_tmp);
-
-  if (rest >= 2) {
-    vmovq(xmm_src, ptr[param_src + offset]);
-    offset += sizeof(float) * 2;
-    rest -= 2;
-    process(xmm_dst, xmm_dst, xmm_src);
-  }
-
-  vpermilps(xmm_tmp, xmm_dst, 1);
-  process(xmm_dst, xmm_dst, xmm_tmp);
-
-  if (rest >= 1) {
-    vmovss(xmm_src, ptr[param_src + offset]);
-    process(xmm_dst, xmm_dst, xmm_src);
-  }
-  vmovss(ptr[param_dst], xmm_dst);
-  ret();
-}
-
-#define DECLARE_HOP_CREATOR(name)                                            \
-  class name##Creator : public JitCodeCreator<int> {                         \
-   public:                                                                   \
-    bool CanBeUsed(const int& attr) const override {                         \
-      return platform::MayIUse(platform::avx);                               \
-    }                                                                        \
-    size_t CodeSize(const int& d) const override {                           \
-      return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                               \
-    }                                                                        \
-    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
-      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
-    }                                                                        \
-  }
-
-DECLARE_HOP_CREATOR(HMax);
-DECLARE_HOP_CREATOR(HSum);
-
-#undef DECLARE_HOP_CREATOR
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator);
-REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator);
diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h
deleted file mode 100644
index 28d213e5e48749f84405454a2708d9289b9d290c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/hopv.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-// horizontal operand vector
-class HOPVJitCode : public JitCode {
- public:
-  explicit HOPVJitCode(int d, operand_type type, size_t code_size = 256 * 1024,
-                       void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr), num_(d), type_(type) {
-    if (!(type_ == operand_type::MAX || type_ == operand_type::ADD)) {
-      LOG(FATAL) << "Do not support this operand type: " << type_;
-    }
-    this->genCode();
-  }
-
-  std::string name() const override {
-    std::string base = "VXXJitCode";
-    if (type_ == operand_type::MAX) {
-      base += "_MAX";
-    } else {
-      base += "_SUM";
-    }
-    return base;
-  }
-  void genCode() override;
-
- protected:
-  template <typename JMM>
-  void process(JMM& dst, JMM& src1, JMM& src2) {  // NOLINT
-    if (type_ == operand_type::MAX) {
-      vmaxps(dst, src1, src2);
-    } else if (type_ == operand_type::ADD) {
-      vaddps(dst, src1, src2);
-    }
-  }
-
- private:
-  int num_;
-  operand_type type_;
-  reg64_t param_src{abi_param1};
-  reg64_t param_dst{abi_param2};
-  reg64_t param_attr{abi_param3};
-
-  ymm_t ymm_tmp = ymm_t(0);
-  ymm_t ymm_src = ymm_t(1);
-  ymm_t ymm_dst = ymm_t(2);
-
-  xmm_t xmm_tmp = xmm_t(0);
-  xmm_t xmm_src = xmm_t(1);
-  xmm_t xmm_dst = xmm_t(2);
-};
-
-#define DECLARE_HOP_JITCODE(name, op_type)                                    \
-  class name##JitCode : public HOPVJitCode {                                  \
-   public:                                                                    \
-    explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \
-        : HOPVJitCode(d, op_type, code_size, code_ptr) {}                     \
-  };
-
-DECLARE_HOP_JITCODE(HMax, operand_type::MAX);
-DECLARE_HOP_JITCODE(HSum, operand_type::ADD);
-
-#undef DECLARE_HOP_JITCODE
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
deleted file mode 100644
index 228db7cc721099750da30adeaa828ae31f521422..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <type_traits>
-#include "paddle/fluid/operators/jit/gen_base.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-#define XBYAK_USE_MMAP_ALLOCATOR
-#include "xbyak/xbyak.h"
-#include "xbyak/xbyak_util.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-// Application Binary Interface
-constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
-    abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
-    abi_param4(Xbyak::Operand::RCX), abi_param5(Xbyak::Operand::R8),
-    abi_param6(Xbyak::Operand::R9);
-
-constexpr Xbyak::Operand::Code g_abi_regs[] = {
-    Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
-    Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15};
-
-constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]);
-
-using reg64_t = const Xbyak::Reg64;
-using reg32_t = const Xbyak::Reg32;
-using xmm_t = const Xbyak::Xmm;
-using ymm_t = const Xbyak::Ymm;
-using zmm_t = const Xbyak::Zmm;
-using Label = Xbyak::Label;
-
-typedef enum {
-  MUL = 0,
-  MAX,
-  ADD,
-  SUB,
-  RELU,
-  EXP,
-  SQUARE,
-  SIGMOID,
-  TANH,
-  IDENTITY
-} operand_type;
-
-#define DECLARE_JIT_CODE(codename) \
-  std::string name() const override { return #codename; }
-
-class JitCode : public GenBase, public Xbyak::CodeGenerator {
- public:
-  explicit JitCode(size_t code_size, void* code_ptr = nullptr)
-      : Xbyak::CodeGenerator(
-            (code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size),
-            code_ptr) {}
-
-  virtual void genCode() = 0;
-
-  size_t getSize() const override { return CodeGenerator::getSize(); }
-  const unsigned char* getCodeInternal() const override {
-    const Xbyak::uint8* code = CodeGenerator::getCode();
-    return code;
-  }
-
- protected:
-  Xbyak::Reg64 param1{abi_param1};
-  const int EVEX_max_8b_offt = 0x200;
-  const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp;
-
-  virtual void preCode() {
-    for (int i = 0; i < num_g_abi_regs; ++i) {
-      push(Xbyak::Reg64(g_abi_regs[i]));
-    }
-    if (platform::MayIUse(platform::avx512f)) {
-      mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
-    }
-  }
-  virtual void postCode() {
-    for (int i = 0; i < num_g_abi_regs; ++i) {
-      pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i]));
-    }
-    ret();
-  }
-  void L(const char* label) { Xbyak::CodeGenerator::L(label); }
-  void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }
-  // Enhanced vector extension
-  Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
-                                    bool bcast = false) {
-    int scale = 0;
-    // Learn from https://github.com/intel/mkl-dnn
-    if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) {
-      offt = offt - 2 * EVEX_max_8b_offt;
-      scale = 1;
-    } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) {
-      offt = offt - 4 * EVEX_max_8b_offt;
-      scale = 2;
-    }
-    auto re = Xbyak::RegExp() + base + offt;
-    if (scale) {
-      re = re + reg_EVEX_max_8b_offt * scale;
-    }
-    if (bcast) {
-      return zword_b[re];
-    } else {
-      return zword[re];
-    }
-  }
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc
deleted file mode 100644
index 2c3bc985e9a8b224835d848d30e0a3ef641ed2f9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/lstm.cc
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/lstm.h"
-#include <stddef.h>  // offsetof
-#include <memory>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-void LSTMJitCode::genCode() {
-  if (use_peephole_) {
-    preCode();
-  }
-  reg64_t reg_ptr_gates = rax;
-  reg64_t reg_ptr_ct_1 = r9;
-  reg64_t reg_ptr_ct = r10;
-  reg64_t reg_ptr_ht = r11;
-  reg64_t reg_ptr_wp = r12;
-  mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]);
-  mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]);
-  mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]);
-  mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
-  if (use_peephole_) {
-    mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]);
-  }
-
-  int offset = 0;
-  int d = num_ * sizeof(float);
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    /* gates: W_ch, W_ih, W_fh, W_oh */
-    ymm_t ymm_c = ymm_t(0);
-    ymm_t ymm_i = ymm_t(1);
-    ymm_t ymm_f = ymm_t(2);
-    ymm_t ymm_o = ymm_t(3);
-    ymm_t ymm_ct_1 = ymm_t(4);
-    ymm_t ymm_wp0 = ymm_t(5);
-    ymm_t ymm_wp1 = ymm_t(6);
-    ymm_t ymm_wp2 = ymm_t(7);
-    vmovups(ymm_c, ptr[reg_ptr_gates + offset]);
-    vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]);
-    vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]);
-    vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]);
-    if (!compute_c1h1_) {
-      vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]);
-    }
-    if (use_peephole_) {
-      vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]);
-      vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]);
-      vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]);
-    }
-    /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */
-    // act_cand(c)
-    act<ymm_t>(ymm_c, ymm_c, act_cand_);
-    // act_gate(i) or act_gate(ct_1 * wp0 + i)
-    if (!compute_c1h1_ && use_peephole_) {
-      vmulps(ymm_wp0, ymm_ct_1, ymm_wp0);
-      vaddps(ymm_i, ymm_i, ymm_wp0);
-    }
-    act<ymm_t>(ymm_i, ymm_i, act_gate_);
-    vmulps(ymm_c, ymm_c, ymm_i);
-    if (!compute_c1h1_) {
-      // act_gate(f) or act_gate(ct_1 * wp1 + f)
-      if (use_peephole_) {
-        vmulps(ymm_wp1, ymm_ct_1, ymm_wp1);
-        vaddps(ymm_f, ymm_f, ymm_wp1);
-      }
-      act<ymm_t>(ymm_f, ymm_f, act_gate_);
-      // ct
-      vmulps(ymm_f, ymm_f, ymm_ct_1);
-      vaddps(ymm_f, ymm_f, ymm_c);
-    }
-    /* H_t = act_cell(C_t) * act_gate(o) */
-    // act_cell(C_t)
-    ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f;
-    ymm_t ymm_tmp = ymm_i;
-    act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
-    // act_gate(o) or act_gate(ct * wp2 + o)
-    if (use_peephole_) {
-      vmulps(ymm_wp2, ymm_ct, ymm_wp2);
-      vaddps(ymm_o, ymm_o, ymm_wp2);
-    }
-    act<ymm_t>(ymm_o, ymm_o, act_gate_);
-    // ht
-    vmulps(ymm_o, ymm_o, ymm_tmp);
-    // save ct and ht
-    vmovups(ptr[reg_ptr_ct + offset], ymm_ct);
-    vmovups(ptr[reg_ptr_ht + offset], ymm_o);
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-
-  if (use_peephole_) {
-    postCode();
-  } else {
-    ret();
-  }
-}
-
-#define DECLARE_LSTM_CREATOR(name)                                \
-  class name##Creator : public JitCodeCreator<lstm_attr_t> {      \
-   public:                                                        \
-    /* TODO(TJ): enable more */                                   \
-    bool CanBeUsed(const lstm_attr_t& attr) const override {      \
-      return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
-    }                                                             \
-    size_t CodeSize(const lstm_attr_t& attr) const override {     \
-      return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8;          \
-    }                                                             \
-    std::unique_ptr<GenBase> CreateJitCode(                       \
-        const lstm_attr_t& attr) const override {                 \
-      return make_unique<name##JitCode>(attr, CodeSize(attr));    \
-    }                                                             \
-  }
-
-DECLARE_LSTM_CREATOR(LSTMCtHt);
-DECLARE_LSTM_CREATOR(LSTMC1H1);
-
-#undef DECLARE_LSTM_CREATOR
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator);
-REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator);
diff --git a/paddle/fluid/operators/jit/gen/lstm.h b/paddle/fluid/operators/jit/gen/lstm.h
deleted file mode 100644
index fa560b6230d7164be907f0172fb1d91860c05db2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/lstm.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/act.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-class LSTMJitCode : public VActFunc {
- public:
-  explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr,
-                       size_t code_size, void* code_ptr = nullptr)
-      : VActFunc(code_size, code_ptr),
-        num_(attr.d),
-        compute_c1h1_(compute_c1h1),
-        use_peephole_(attr.use_peephole) {
-    auto typeExchange = [](KernelType type) -> gen::operand_type {
-      if (type == KernelType::kVSigmoid) {
-        return operand_type::SIGMOID;
-      } else if (type == KernelType::kVRelu) {
-        return operand_type::RELU;
-      } else if (type == KernelType::kVTanh) {
-        return operand_type::TANH;
-      } else if (type == KernelType::kVIdentity) {
-        return operand_type::IDENTITY;
-      } else {
-        LOG(FATAL) << "Do not support this jit::KernelType: " << type;
-      }
-      return operand_type::IDENTITY;
-    };
-    act_gate_ = typeExchange(attr.act_gate);
-    act_cand_ = typeExchange(attr.act_cand);
-    act_cell_ = typeExchange(attr.act_cell);
-
-    this->genCode();
-  }
-
-  std::string name() const override {
-    std::string base = "LSTMJitCode";
-    if (use_peephole_) {
-      base += "_Peephole";
-    }
-    if (compute_c1h1_) {
-      base += "_C1H1";
-    }
-    auto AddTypeStr = [&](operand_type type) {
-      switch (type) {
-        case operand_type::RELU:
-          base += "_Relu";
-          break;
-        case operand_type::EXP:
-          base += "_Exp";
-          break;
-        case operand_type::SIGMOID:
-          base += "_Sigmoid";
-          break;
-        case operand_type::TANH:
-          base += "_Tanh";
-          break;
-        case operand_type::IDENTITY:
-          base += "_Identity";
-          break;
-        default:
-          break;
-      }
-    };
-    AddTypeStr(act_gate_);
-    AddTypeStr(act_cand_);
-    AddTypeStr(act_cell_);
-    return base;
-  }
-  void genCode() override;
-
- protected:
-  int num_;
-  bool compute_c1h1_;
-  bool use_peephole_;
-  operand_type act_gate_;
-  operand_type act_cand_;
-  operand_type act_cell_;
-  reg64_t param1{abi_param1};
-};
-
-#define DECLARE_LSTM_JITCODE(name, compute_c1h1)                      \
-  class name##JitCode : public LSTMJitCode {                          \
-   public:                                                            \
-    explicit name##JitCode(const lstm_attr_t& attr, size_t code_size, \
-                           void* code_ptr = nullptr)                  \
-        : LSTMJitCode(compute_c1h1, attr, code_size, code_ptr) {}     \
-  };
-
-DECLARE_LSTM_JITCODE(LSTMCtHt, false);
-DECLARE_LSTM_JITCODE(LSTMC1H1, true);
-
-#undef DECLARE_LSTM_JITCODE
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc
deleted file mode 100644
index d9955c8cc655f86bbc6c8135bdfa6c83493727f2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/matmul.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/matmul.h"
-#include <stddef.h>  // offsetof
-#include <memory>
-#include <vector>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-void MatMulJitCode::genCode() {
-  preCode();
-  int block, rest;
-  const auto groups = packed_groups(n_, k_, &block, &rest);
-  PADDLE_ENFORCE_GT(groups.front(), 0);
-
-  const int block_len = sizeof(float) * block;
-  const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
-  const int w_reg_idx = x_reg_idx - 1;
-  // from packed mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t,
-  // packed_weight)]);
-  mov(reg_ptr_wgt, param_y);
-  size_t z_offset = 0;
-  size_t wgt_offset = 0;
-  for (size_t g = 0; g < groups.size(); ++g) {
-    size_t x_offset = 0;
-    for (int k = 0; k < k_; ++k) {
-      vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]);
-      // clean
-      if (k == 0) {
-        for (int i = 0; i < groups[g]; ++i) {
-          vxorps(zmm_t(i), zmm_t(i), zmm_t(i));
-        }
-      }
-      for (int i = 0; i < groups[g]; ++i) {
-        vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]);
-        vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx));
-        wgt_offset += block_len;
-      }
-      // last one, save
-      if (k == k_ - 1) {
-        for (int i = 0; i < groups[g]; ++i) {
-          // only rest save should be careful
-          if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) {
-            break;
-          }
-          vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i));
-        }
-      }
-      x_offset += sizeof(float);
-    }
-    z_offset += block_len * groups[g];
-  }
-
-  if (rest != 0) {
-    // below should refine with mask
-    int reg_idx = groups.back() - 1;
-    z_offset = (n_ - rest) * sizeof(float);
-    int inner_block = 8;
-    while (rest > 0) {
-      if (rest >= 8) {
-        inner_block = 8;
-        vmovups(ptr[param_z + z_offset], ymm_t(reg_idx));
-        // shift zmm of inner_block, change reg_idx if update
-      } else if (rest >= 4) {
-        inner_block = 4;
-        vmovups(ptr[param_z + z_offset], xmm_t(reg_idx));
-      } else if (rest >= 2) {
-        inner_block = 2;
-        vmovq(ptr[param_z + z_offset], xmm_t(reg_idx));
-      } else {
-        inner_block = 1;
-        vmovss(ptr[param_z + z_offset], xmm_t(reg_idx));
-      }
-      z_offset += inner_block * sizeof(float);
-      rest -= inner_block;
-    }
-  }
-
-  postCode();
-}
-
-class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
- public:
-  bool CanBeUsed(const matmul_attr_t& attr) const override {
-    return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
-           attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
-  }
-  size_t CodeSize(const matmul_attr_t& attr) const override {
-    int block = YMM_FLOAT_BLOCK;
-    if (platform::MayIUse(platform::avx512f)) {
-      block = ZMM_FLOAT_BLOCK;
-    }
-    return 96 + 4 * attr.k * (attr.n / block + 1) * 8;
-  }
-  std::unique_ptr<GenBase> CreateJitCode(
-      const matmul_attr_t& attr) const override {
-    PADDLE_ENFORCE_GT(attr.m, 0);
-    PADDLE_ENFORCE_GT(attr.n, 0);
-    PADDLE_ENFORCE_GT(attr.k, 0);
-    return make_unique<MatMulJitCode>(attr, CodeSize(attr));
-  }
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator);
diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h
deleted file mode 100644
index 881cea581acc27a7aa7d395c041d40a4d3281947..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/matmul.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <stdlib.h>  // for malloc and free
-#include <string>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-class MatMulJitCode : public JitCode {
- public:
-  explicit MatMulJitCode(const matmul_attr_t& attr,
-                         size_t code_size = 256 * 1024,
-                         void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
-    PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
-    this->genCode();
-  }
-
-  std::string name() const override {
-    std::string base = "MatMulJitCode";
-    base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
-           std::to_string(k_);
-    return base;
-  }
-  void genCode() override;
-
- private:
-  int m_, n_, k_;
-
-  reg64_t param_x{abi_param1};
-  reg64_t param_y{abi_param2};
-  reg64_t param_z{abi_param3};
-  reg64_t param_attr{abi_param4};
-  reg64_t reg_tmp{rax};
-
-  reg64_t reg_ptr_wgt{r10};
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc
deleted file mode 100644
index ec8e4e9827441bc0a817c6da455cb9e530c8c1bf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/seqpool.h"
-#include <memory>
-#include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-void SeqPoolJitCode::genCode() {
-  constexpr int block = YMM_FLOAT_BLOCK;
-  constexpr int max_num_regs = 8;
-  const int num_block = w_ / block;
-  const int num_groups = num_block / max_num_regs;
-  int rest_num_regs = num_block % max_num_regs;
-  mov(reg32_int_h, dword[param_attr]);
-  if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
-    mov(reg_tmp, reinterpret_cast<size_t>(exp_float_consts));
-    vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]);
-    mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
-    fild(dword[param_attr]);
-    fstp(dword[reg_tmp]);
-    vmovss(xmm_t(0), ptr[reg_tmp]);
-    if (type_ == SeqPoolType::kSqrt) {
-      vsqrtps(xmm_t(0), xmm_t(0));
-    }
-    vdivps(xmm_t(1), xmm_t(1), xmm_t(0));
-    vmovss(ptr[reg_tmp], xmm_t(1));
-  }
-  const int group_len = max_num_regs * block * sizeof(float);
-  for (int g = 0; g < num_groups; ++g) {
-    pool_height<ymm_t>(g * group_len, block, max_num_regs);
-  }
-  if (rest_num_regs > 0) {
-    pool_height<ymm_t>(num_groups * group_len, block, rest_num_regs);
-  }
-  // part of rest_w * height
-  const int rest = w_ % block;
-  pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs);
-  ret();
-}
-
-class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
- public:
-  bool CanBeUsed(const seq_pool_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx);
-  }
-  size_t CodeSize(const seq_pool_attr_t& attr) const override {
-    return 96 +
-           ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
-                4 /* load, mul and save */ +
-            256) *
-               16;
-  }
-  std::unique_ptr<GenBase> CreateJitCode(
-      const seq_pool_attr_t& attr) const override {
-    PADDLE_ENFORCE_GT(attr.w, 0);
-    PADDLE_ENFORCE_GT(attr.h, 0);
-    return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
-  }
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);
diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h
deleted file mode 100644
index e909bc7c7939ee5cb7a2d367c7a452b96e6a91c2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-class SeqPoolJitCode : public JitCode {
- public:
-  explicit SeqPoolJitCode(const seq_pool_attr_t& attr,
-                          size_t code_size = 256 * 1024,
-                          void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
-    if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
-          type_ == SeqPoolType::kSqrt)) {
-      LOG(FATAL) << "Only supported pool type: sum, avg and sqrt.";
-    }
-    fp_h_[0] = 1.f;
-    this->genCode();
-  }
-
-  std::string name() const override {
-    std::string base = "SeqPoolJitCode";
-    if (type_ == SeqPoolType::kSum) {
-      base += "_Sum";
-    } else if (type_ == SeqPoolType::kAvg) {
-      base += "_Avg";
-    } else if (type_ == SeqPoolType::kSqrt) {
-      base += "_Sqrt";
-    }
-    base += ("_W" + std::to_string(w_));
-    return base;
-  }
-  void genCode() override;
-
- protected:
-  template <typename JMM>
-  void pool_height(int w_offset, int block, int max_num_regs) {
-    int offset = w_offset;
-    for (int i = 0; i < max_num_regs; ++i) {
-      vmovups(JMM(i), ptr[param_src + offset]);
-      offset += sizeof(float) * block;
-    }
-    cmp(reg32_int_h, 1);
-    Label l_next_h, l_h_done;
-    jle(l_h_done, T_NEAR);
-    mov(reg_h_i, 1);
-    mov(reg_tmp, param_src);
-    add(reg_tmp, w_ * sizeof(float) + w_offset);
-    L(l_next_h);
-    {
-      mov(reg_ptr_src_i, reg_tmp);
-      for (int i = 0; i < max_num_regs; ++i) {
-        vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]);
-        // sum anyway
-        vaddps(JMM(i), JMM(i), JMM(i + max_num_regs));
-        add(reg_ptr_src_i, sizeof(float) * block);
-      }
-      inc(reg_h_i);
-      add(reg_tmp, w_ * sizeof(float));
-      cmp(reg_h_i, reg32_int_h);
-      jl(l_next_h, T_NEAR);
-    }
-    L(l_h_done);
-    // save right now
-    if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
-      mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
-      vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]);
-    }
-    offset = w_offset;
-    for (int i = 0; i < max_num_regs; ++i) {
-      if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
-        vmulps(JMM(i), JMM(i), JMM(max_num_regs));
-      }
-      vmovups(ptr[param_dst + offset], JMM(i));
-      offset += sizeof(float) * block;
-    }
-  }
-
-  void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) {
-    const int rest_used_num_regs = load_rest(rest, w_offset, 0);
-    const bool has_block4 = rest / 4 > 0;
-    const bool has_block2 = (rest % 4) / 2 > 0;
-    const bool has_block1 = (rest % 2) == 1;
-    cmp(reg32_int_h, 1);
-    Label l_next_h, l_h_done;
-    jle(l_h_done, T_NEAR);
-    mov(reg_h_i, 1);
-    mov(reg_tmp, param_src);
-    add(reg_tmp, w_ * sizeof(float) + w_offset);
-    L(l_next_h);
-    {
-      int reg_idx = 0;
-      mov(reg_ptr_src_i, reg_tmp);
-      if (has_block4) {
-        vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
-        add(reg_ptr_src_i, sizeof(float) * 4);
-        reg_idx++;
-      }
-      if (has_block2) {
-        vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
-        add(reg_ptr_src_i, sizeof(float) * 2);
-        reg_idx++;
-      }
-      if (has_block1) {
-        vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
-        reg_idx++;
-      }
-      PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs,
-                        "All heights should use same regs");
-      for (int i = 0; i < reg_idx; ++i) {
-        vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
-      }
-      inc(reg_h_i);
-      add(reg_tmp, w_ * sizeof(float));
-      cmp(reg_h_i, reg32_int_h);
-      jl(l_next_h, T_NEAR);
-    }
-    L(l_h_done);
-    // save right now
-    if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
-      mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
-      vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]);
-      for (int i = 0; i < rest_used_num_regs; ++i) {
-        vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs));
-      }
-    }
-    save_rest(rest, w_offset);
-  }
-
-  // return the number of used regs, use start from reg 0
-  int load_rest(int rest, int w_offset, const int num_shift_regs,
-                const int reg_start = 0) {
-    const bool has_block4 = rest / 4 > 0;
-    const bool has_block2 = (rest % 4) / 2 > 0;
-    const bool has_block1 = (rest % 2) == 1;
-    int reg_idx = reg_start;
-    if (has_block4) {
-      vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
-      w_offset += sizeof(float) * 4;
-      reg_idx++;
-    }
-    if (has_block2) {
-      vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
-      w_offset += sizeof(float) * 2;
-      reg_idx++;
-    }
-    if (has_block1) {
-      vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
-      reg_idx++;
-    }
-    return reg_idx;
-  }
-
-  // use reg start from 0
-  void save_rest(int rest, int w_offset, int reg_start = 0) {
-    const bool has_block4 = rest / 4 > 0;
-    const bool has_block2 = (rest % 4) / 2 > 0;
-    const bool has_block1 = (rest % 2) == 1;
-    int reg_idx = reg_start;
-    if (has_block4) {
-      vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx));
-      w_offset += sizeof(float) * 4;
-      reg_idx++;
-    }
-    if (has_block2) {
-      vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx));
-      w_offset += sizeof(float) * 2;
-      reg_idx++;
-    }
-    if (has_block1) {
-      vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx));
-    }
-  }
-
- private:
-  float ALIGN32_BEG fp_h_[1] ALIGN32_END;
-  int w_;
-  SeqPoolType type_;
-  reg64_t param_src{abi_param1};
-  reg64_t param_dst{abi_param2};
-  reg64_t param_attr{abi_param3};
-  reg64_t reg_tmp{rax};
-
-  reg32_t reg32_int_h{r8d};
-  reg32_t reg32_fp_h{r9d};
-
-  reg64_t reg_h_i{r10};
-  reg64_t reg_ptr_src_i{r11};
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc
deleted file mode 100644
index e65d3500b496c811b2da39752417ce5ef3ab3914..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/sgd.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/sgd.h"
-#include <stddef.h>  // offsetof
-#include <memory>
-#include <vector>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-void SgdJitCode::genCode() {
-  preCode();
-  constexpr int block = YMM_FLOAT_BLOCK;
-  constexpr int max_num_regs = 7;
-  const int num_block = w_ / block;
-  const int num_groups = num_block / max_num_regs;
-  const size_t block_size = sizeof(float) * block;
-  const size_t width_size = w_ * sizeof(float);
-  std::vector<int> groups(num_groups, max_num_regs);
-  int rest_num_regs = num_block % max_num_regs;
-  if (rest_num_regs > 0) {
-    groups.push_back(rest_num_regs);
-  }
-
-  vbroadcastss(ymm_lr, ptr[param_lr]);
-  // protect rdx
-  mov(reg_ptr_grad_i, param_grad);
-  mov(reg_ptr_rows_i, param_rows);
-
-  mov(reg_rows_size_in_byte,
-      qword[param_attr + offsetof(sgd_attr_t, selected_rows_size)]);
-  mov(rax, sizeof(int64_t));
-  mul(reg_rows_size_in_byte);
-  mov(reg_rows_size_in_byte, rax);
-  add(reg_rows_size_in_byte, reg_ptr_rows_i);
-
-  Label l_next_row;
-  L(l_next_row);
-  {
-    mov(reg_row, qword[reg_ptr_rows_i]);
-    mov(rax, width_size);
-    mul(reg_row);
-    mov(reg_row, rax);
-
-    mov(reg_ptr_param_i, param_param);
-    mov(reg_ptr_out_i, param_out);
-    add(reg_ptr_param_i, reg_row);
-    add(reg_ptr_out_i, reg_row);
-
-    size_t w_offset = 0;
-    for (int num_regs : groups) {
-      // load grad
-      size_t inner_offfset = w_offset;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i + inner_offfset]);
-        inner_offfset += block_size;
-      }
-
-      // load param
-      inner_offfset = w_offset;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i + inner_offfset]);
-        inner_offfset += block_size;
-      }
-
-      // compute out
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr);
-        vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i));
-      }
-
-      // save out
-      inner_offfset = w_offset;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ptr[reg_ptr_out_i + inner_offfset], ymm_t(reg_i + num_regs));
-        inner_offfset += block_size;
-      }
-      w_offset += (block_size * num_regs);
-    }
-
-    add(reg_ptr_grad_i, width_size);
-    add(reg_ptr_rows_i, sizeof(int64_t));
-    cmp(reg_ptr_rows_i, reg_rows_size_in_byte);
-    jl(l_next_row, T_NEAR);
-  }
-
-  postCode();
-}
-
-class SgdCreator : public JitCodeCreator<sgd_attr_t> {
- public:
-  bool CanBeUsed(const sgd_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx) &&
-           attr.grad_width % YMM_FLOAT_BLOCK == 0;
-  }
-  size_t CodeSize(const sgd_attr_t& attr) const override {
-    return 96 + (attr.grad_width / YMM_FLOAT_BLOCK) * 32 * 8;
-  }
-  std::unique_ptr<GenBase> CreateJitCode(
-      const sgd_attr_t& attr) const override {
-    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
-    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
-    PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
-    return make_unique<SgdJitCode>(attr, CodeSize(attr));
-  }
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator);
diff --git a/paddle/fluid/operators/jit/gen/sgd.h b/paddle/fluid/operators/jit/gen/sgd.h
deleted file mode 100644
index 317edcd2bcb5fea1f14f32260fd16c9c706eaf00..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/sgd.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-class SgdJitCode : public JitCode {
- public:
-  explicit SgdJitCode(const sgd_attr_t& attr, size_t code_size = 256 * 1024,
-                      void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr), w_(attr.grad_width) {
-    this->genCode();
-  }
-
-  DECLARE_JIT_CODE(SgdJitCode);
-  void genCode() override;
-
- private:
-  int w_;
-  reg64_t param_lr{abi_param1};
-  reg64_t param_param{abi_param2};
-  reg64_t param_grad{abi_param3};
-  reg64_t param_rows{abi_param4};
-  reg64_t param_out{abi_param5};
-  reg64_t param_attr{abi_param6};
-
-  ymm_t ymm_lr = ymm_t(15);
-
-  reg64_t reg_ptr_grad_i{r10};
-  reg64_t reg_ptr_rows_i{r11};
-  reg64_t reg_rows_size_in_byte{r12};
-  reg64_t reg_row{r13};
-  reg64_t reg_ptr_param_i{r14};
-  reg64_t reg_ptr_out_i{r15};
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc
deleted file mode 100644
index 66a8d75fd4de5bae3ba37cf7fe7b1645938aa855..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/vbroadcast.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
-#include <memory>
-#include <vector>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-void VBroadcastJitCode::genCode() {
-  preCode();
-  constexpr int block = YMM_FLOAT_BLOCK;
-  constexpr int max_num_regs = 16;
-  const int num_block = w_ / block;
-  const int num_groups = num_block / max_num_regs;
-  const size_t block_size = sizeof(float) * block;
-  std::vector<int> groups(num_groups, max_num_regs);
-  int rest_num_regs = num_block % max_num_regs;
-  if (rest_num_regs > 0) {
-    groups.push_back(rest_num_regs);
-  }
-
-  // protect param_h
-  mov(reg_height, param_h);
-  Label l_next_h;
-  xor_(reg_h_i, reg_h_i);
-  mov(reg_ptr_dst_i, param_dst);
-  L(l_next_h);
-  {
-    mov(reg_ptr_src_i, param_src);
-    for (int num_regs : groups) {
-      size_t w_offset = 0;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]);
-        w_offset += block_size;
-      }
-      add(reg_ptr_src_i, num_regs * block_size);
-
-      w_offset = 0;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i));
-        w_offset += block_size;
-      }
-      add(reg_ptr_dst_i, num_regs * block_size);
-    }  // end of groups
-    inc(reg_h_i);
-    cmp(reg_h_i, reg_height);
-    jl(l_next_h, T_NEAR);
-  }  // end of l_next_h
-
-  postCode();
-}
-
-class VBroadcastCreator : public JitCodeCreator<int64_t> {
- public:
-  bool CanBeUsed(const int64_t& w) const override {
-    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
-  }
-  size_t CodeSize(const int64_t& w) const override {
-    return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
-  }
-  std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
-    PADDLE_ENFORCE_GT(w, 0);
-    return make_unique<VBroadcastJitCode>(w, CodeSize(w));
-  }
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace gen = paddle::operators::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.h b/paddle/fluid/operators/jit/gen/vbroadcast.h
deleted file mode 100644
index 27c75f6f710e9514c7d91181e7f447d9dd997081..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen/vbroadcast.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "glog/logging.h"
-#include "paddle/fluid/operators/jit/gen/jitcode.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace gen {
-
-class VBroadcastJitCode : public JitCode {
- public:
-  explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024,
-                             void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr), w_(w) {
-    this->genCode();
-  }
-
-  DECLARE_JIT_CODE(VBroadcastJitCode);
-  void genCode() override;
-
- private:
-  int w_;
-  reg64_t param_src{abi_param1};
-  reg64_t param_dst{abi_param2};
-  reg64_t param_h{abi_param3};
-  reg64_t param_w{abi_param4};
-
-  reg64_t reg_height{r9};
-  reg64_t reg_h_i{r10};
-  reg64_t reg_ptr_src_i{r11};
-  reg64_t reg_ptr_dst_i{r12};
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc
deleted file mode 100644
index 4c49eff49e3efc0664a084f9fa2bb897db0c6f1d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/gen_base.h"
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include "paddle/fluid/memory/allocation/cpu_allocator.h"  // for posix_memalign
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#ifndef _WIN32
-#define posix_memalign_free free
-#endif
-
-DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-// refer do not need CanBeUsed, it would be the last one.
-void GenBase::dumpCode(const unsigned char* code) const {
-  if (code) {
-    static int counter = 0;
-    std::ostringstream filename;
-    filename << "paddle_jitcode_" << name() << "." << counter << ".bin";
-    counter++;
-    std::ofstream fout(filename.str(), std::ios::out);
-    if (fout.is_open()) {
-      fout.write(reinterpret_cast<const char*>(code), this->getSize());
-      fout.close();
-    }
-  }
-}
-
-void* GenBase::operator new(size_t size) {
-  void* ptr;
-  constexpr size_t alignment = 32ul;
-  PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0,
-                    "GenBase Alloc %ld error!", size);
-  PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
-  return ptr;
-}
-
-void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); }
-
-std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
-  int block;
-  int max_num_regs;
-  if (platform::MayIUse(platform::avx512f)) {
-    block = ZMM_FLOAT_BLOCK;
-    max_num_regs = 32;
-  } else {
-    block = YMM_FLOAT_BLOCK;
-    max_num_regs = 16;
-  }
-  // one for x, one for y, others for z
-  const int max_used_regs_for_n = max_num_regs - 2;
-  const int aligned_n = n % block == 0 ? n : (n / block + 1) * block;
-  const int num_block = aligned_n / block;
-  const int num_groups = num_block / max_used_regs_for_n;
-  std::vector<int> groups(num_groups, max_used_regs_for_n);
-  int rest_num_regs = num_block % max_used_regs_for_n;
-  if (rest_num_regs != 0) {
-    groups.push_back(rest_num_regs);
-  }
-  if (block_out) {
-    *block_out = block;
-  }
-  if (rest_out) {
-    *rest_out = n % block;
-  }
-  return groups;
-}
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
deleted file mode 100644
index 033c603c07c288ba621ceaa912ea0c476fe86cd6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/gen_base.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <gflags/gflags.h>
-#include <memory>  // for unique_ptr
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/jit/kernel_base.h"
-
-DECLARE_bool(dump_jitcode);
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-class GenBase : public Kernel {
- public:
-  virtual ~GenBase() = default;
-  virtual std::string name() const = 0;
-  virtual size_t getSize() const = 0;
-  virtual const unsigned char* getCodeInternal() const = 0;
-  const char* ImplType() const override { return "JitCode"; }
-  template <typename Func>
-  Func getCode() const {
-    const unsigned char* code = this->getCodeInternal();
-    if (FLAGS_dump_jitcode) {
-      this->dumpCode(code);
-    }
-    // Note: failed to cast with reinterpret_cast<const Func> on Mac clang,
-    // then workaround with const_cast. Any better idea is appreciated.
-    return reinterpret_cast<Func>(const_cast<unsigned char*>(code));
-  }
-
-  void* operator new(size_t size);
-  void operator delete(void* ptr);
-  void* operator new[](size_t size) { return operator new(size); }
-  void operator delete[](void* ptr) { operator delete(ptr); }
-
- protected:
-  void dumpCode(const unsigned char* code) const;
-};
-
-// Creator is used to creat the jitcode and save in pool.
-// Every JitCode should have one creator.
-class GenCreator {
- public:
-  virtual ~GenCreator() = default;
-};
-
-template <typename Attr>
-class JitCodeCreator : public GenCreator {
- public:
-  virtual ~JitCodeCreator() = default;
-
-  // condition when this jit code can be used.
-  virtual bool CanBeUsed(const Attr& attr) const = 0;
-
-  // estimate this code size
-  virtual size_t CodeSize(const Attr& attr) const = 0;
-
-  // create this code
-  virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0;
-};
-
-// unify the method of packed groups
-// output the packed groups which used in weights, the block size and rest size
-std::vector<int> packed_groups(int n, int k, int* block = nullptr,
-                               int* rest = nullptr);
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc
deleted file mode 100644
index f868c847bd80e874da2d2babde58129122e0bc70..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/helper.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/helper.h"
-#include <algorithm>  // tolower
-#include <numeric>
-#include <string>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-#define ONE_CASE(key) \
-  case key:           \
-    return #key
-
-const char* to_string(KernelType kt) {
-  switch (kt) {
-    ONE_CASE(kNone);
-    ONE_CASE(kVMul);
-    ONE_CASE(kVAdd);
-    ONE_CASE(kVAddRelu);
-    ONE_CASE(kVSub);
-    ONE_CASE(kVScal);
-    ONE_CASE(kStrideScal);
-    ONE_CASE(kVAddBias);
-    ONE_CASE(kVRelu);
-    ONE_CASE(kVBroadcast);
-    ONE_CASE(kVCopy);
-    ONE_CASE(kVIdentity);
-    ONE_CASE(kVExp);
-    ONE_CASE(kVSquare);
-    ONE_CASE(kVSigmoid);
-    ONE_CASE(kVTanh);
-    ONE_CASE(kLSTMCtHt);
-    ONE_CASE(kLSTMC1H1);
-    ONE_CASE(kGRUH1);
-    ONE_CASE(kGRUHtPart1);
-    ONE_CASE(kGRUHtPart2);
-    ONE_CASE(kCRFDecoding);
-    ONE_CASE(kLayerNorm);
-    ONE_CASE(kNCHW16CMulNC);
-    ONE_CASE(kSeqPool);
-    ONE_CASE(kMatMul);
-    ONE_CASE(kHMax);
-    ONE_CASE(kHSum);
-    ONE_CASE(kStrideASum);
-    ONE_CASE(kSoftmax);
-    ONE_CASE(kEmbSeqPool);
-    ONE_CASE(kSgd);
-    default:
-      PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
-      return "NOT JITKernel";
-  }
-  return nullptr;
-}
-
-const char* to_string(SeqPoolType tp) {
-  switch (tp) {
-    ONE_CASE(kNonePoolType);
-    ONE_CASE(kSum);
-    ONE_CASE(kAvg);
-    ONE_CASE(kSqrt);
-    default:
-      PADDLE_THROW("Not support type: %d, or forget to add it.", tp);
-      return "NOT PoolType";
-  }
-  return nullptr;
-}
-#undef ONE_CASE
-
-KernelType to_kerneltype(const std::string& act) {
-  std::string lower = act;
-  std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
-  if (lower == "relu" || lower == "vrelu") {
-    return kVRelu;
-  } else if (lower == "identity" || lower == "videntity" || lower == "") {
-    return kVIdentity;
-  } else if (lower == "exp" || lower == "vexp") {
-    return kVExp;
-  } else if (lower == "sigmoid" || lower == "vsigmoid") {
-    return kVSigmoid;
-  } else if (lower == "tanh" || lower == "vtanh") {
-    return kVTanh;
-  }
-  PADDLE_THROW("Not support type: %s, or forget to add this case", act);
-  return kNone;
-}
-
-template <>
-void pack_weights<float>(const float* src, float* dst, int n, int k) {
-  int block, rest;
-  const auto groups = packed_groups(n, k, &block, &rest);
-  std::for_each(groups.begin(), groups.end(), [&](int i) {
-    PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0.");
-  });
-  int sum = std::accumulate(groups.begin(), groups.end(), 0);
-  std::memset(dst, 0, k * sum * block * sizeof(float));
-  PADDLE_ENFORCE_GE(sum * block, n,
-                    "The packed n should be equal to or larger than n");
-
-  const int block_len = sizeof(float) * block;
-  int n_offset = 0;
-
-  for (size_t g = 0; g < groups.size(); ++g) {
-    const float* from = src + n_offset;
-    for (int j = 0; j < k; ++j) {
-      size_t copy_sz = groups[g] * block_len;
-      if (g == groups.size() - 1 && rest != 0) {
-        copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float);
-      }
-      std::memcpy(dst, from + j * n, copy_sz);
-      dst += groups[g] * block;
-    }
-    n_offset += groups[g] * block;
-  }
-}
-
-template <typename T>
-typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
-    const T* src, T* dst, int n, int k) {
-  PADDLE_THROW("Only support pack with float type.");
-}
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
deleted file mode 100644
index 1ac5318d461c2e8bc4f43569602a88f95a76befb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/helper.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <unordered_map>
-#include <utility>  // for std::move
-#include <vector>
-#include "paddle/fluid/operators/jit/gen_base.h"
-#include "paddle/fluid/operators/jit/kernel_base.h"
-#include "paddle/fluid/operators/jit/kernel_key.h"
-#include "paddle/fluid/operators/jit/kernel_pool.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-template <typename KernelTuple, typename PlaceType>
-inline typename std::enable_if<
-    std::is_same<typename KernelTuple::data_type, float>::value &&
-        std::is_same<PlaceType, platform::CPUPlace>::value,
-    const Kernel*>::type
-GetJitCode(const typename KernelTuple::attr_type& attr) {
-  using Attr = typename KernelTuple::attr_type;
-  int64_t key = JitCodeKey<Attr>(attr);
-  auto& codes = JitCodePool<KernelTuple::kernel_type>::Instance();
-  if (codes.Has(key)) {
-    return codes.AllKernels().at(key).get();
-  }
-
-  // creator is not related with attr, so can use KernelKey as key
-  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
-  // pool: (KernelKey(type, place), vector<GenCreatorPtr>)
-  auto& creator_map = JitCodeCreatorPool::Instance().AllCreators();
-  auto iter = creator_map.find(kkey);
-  if (iter != creator_map.end()) {
-    auto& creators = iter->second;
-    for (auto& cur : creators) {
-      auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
-      if (i && i->CanBeUsed(attr)) {
-        auto p = i->CreateJitCode(attr);
-        if (p) {
-          auto res = p.get();
-          codes.Insert(key, std::move(p));
-          return res;
-        }
-      }
-    }
-  }
-  return nullptr;
-}
-
-template <typename KernelTuple, typename PlaceType>
-inline typename std::enable_if<
-    !std::is_same<typename KernelTuple::data_type, float>::value ||
-        !std::is_same<PlaceType, platform::CPUPlace>::value,
-    const Kernel*>::type
-GetJitCode(const typename KernelTuple::attr_type& attr) {
-  return nullptr;
-}
-
-// Refer code do not related with attr, which is just for cast
-// Refer is always on CPUPlace
-template <typename KernelTuple>
-inline const Kernel* GetReferKernel() {
-  auto& ref_pool = ReferKernelPool::Instance().AllKernels();
-  KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
-  auto ref_iter = ref_pool.find(kkey);
-  PADDLE_ENFORCE(ref_iter != ref_pool.end(),
-                 "Every Kernel should have reference function.");
-  auto& ref_impls = ref_iter->second;
-  for (auto& impl : ref_impls) {
-    auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
-    if (i) {
-      return i;
-    }
-  }
-  return nullptr;
-}
-
-template <typename KernelTuple>
-inline typename KernelTuple::func_type GetReferFunc() {
-  auto ker = GetReferKernel<KernelTuple>();
-  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
-  PADDLE_ENFORCE(p, "The Refer kernel should exsit");
-  return p->GetFunc();
-}
-
-// Return all Kernels that can be used
-template <typename KernelTuple, typename PlaceType>
-std::vector<const Kernel*> GetAllCandidateKernels(
-    const typename KernelTuple::attr_type& attr) {
-  // the search order shoudl be jitcode > more > refer
-  std::vector<const Kernel*> res;
-  auto jitker = GetJitCode<KernelTuple, PlaceType>(attr);
-  if (jitker) {
-    res.emplace_back(jitker);
-  }
-
-  // more kernelpool: (KernelKey(type, place), vector<KernelPtr>)
-  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
-  auto& pool = KernelPool::Instance().AllKernels();
-  auto iter = pool.find(kkey);
-  if (iter != pool.end()) {
-    auto& impls = iter->second;
-    for (auto& impl : impls) {
-      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(impl.get());
-      if (i && i->CanBeUsed(attr)) {
-        res.emplace_back(i);
-      }
-    }
-  }
-
-  // The last implementation should be reference function on CPUPlace.
-  auto ref = GetReferKernel<KernelTuple>();
-  PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
-  res.emplace_back(ref);
-  return res;
-}
-
-template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
-std::vector<std::pair<std::string, typename KernelTuple::func_type>>
-GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
-  using Func = typename KernelTuple::func_type;
-  auto kers = GetAllCandidateKernels<KernelTuple, PlaceType>(attr);
-  std::vector<std::pair<std::string, Func>> res;
-  for (auto k : kers) {
-    std::string name = k->ImplType();
-    if (name == "JitCode") {
-      auto i = dynamic_cast<const GenBase*>(k);
-      PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
-      res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
-    } else {
-      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
-      PADDLE_ENFORCE(i, "kernel cast can not fail.");
-      res.emplace_back(std::make_pair(name, i->GetFunc()));
-    }
-  }
-  return res;
-}
-
-template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
-std::vector<typename KernelTuple::func_type> GetAllCandidateFuncs(
-    const typename KernelTuple::attr_type& attr) {
-  auto funcs = GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
-  std::vector<typename KernelTuple::func_type> res;
-  for (auto& i : funcs) {
-    res.emplace_back(i.second);
-  }
-  return res;
-}
-
-template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
-typename KernelTuple::func_type GetDefaultBestFunc(
-    const typename KernelTuple::attr_type& attr) {
-  auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
-  PADDLE_ENFORCE_GE(funcs.size(), 1UL);
-  // Here could do some runtime benchmark of this attr and return the best one.
-  // But yet just get the first one as the default best one,
-  // which is searched in order and tuned by offline.
-  return funcs[0];
-}
-
-template <typename KernelTuple, typename PlaceType>
-class KernelFuncs {
- public:
-  KernelFuncs() = default;
-  static KernelFuncs& Cache() {
-    static thread_local KernelFuncs<KernelTuple, PlaceType> g_func_cache;
-    return g_func_cache;
-  }
-
-  // the exposed interface to use
-  typename KernelTuple::func_type At(
-      const typename KernelTuple::attr_type& attr) {
-    // Maybe here is not good enough, not all kernels should have jitcode
-    int64_t key = JitCodeKey<typename KernelTuple::attr_type>(attr);
-    if (Has(key)) {
-      return funcs_.at(key);
-    }
-    // If do not have this attr in cache then get the default best
-    auto func = GetDefaultBestFunc<KernelTuple, PlaceType>(attr);
-    Insert(key, func);
-    return func;
-  }
-
-  typename KernelTuple::func_type operator[](
-      const typename KernelTuple::attr_type& attr) {
-    return At(attr);
-  }
-
- protected:
-  bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); }
-  void Insert(int64_t key, typename KernelTuple::func_type func) {
-    funcs_.emplace(key, func);
-  }
-
- private:
-  std::unordered_map<int64_t, typename KernelTuple::func_type> funcs_;
-  DISABLE_COPY_AND_ASSIGN(KernelFuncs);
-};
-
-const char* to_string(KernelType kt);
-const char* to_string(SeqPoolType kt);
-
-KernelType to_kerneltype(const std::string& act);
-
-inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) {
-  os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
-     << "],act_cand[" << to_string(attr.act_cand) << "],act_cell["
-     << to_string(attr.act_cell) << "],use_peephole["
-     << (attr.use_peephole ? "True" : "False") << "]";
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
-  os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
-     << "],act_cand[" << to_string(attr.act_cand) << "]";
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
-  os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type["
-     << to_string(attr.type) << "]";
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os,
-                                const emb_seq_pool_attr_t& attr) {
-  os << "table_height[" << attr.table_height << "],table_width["
-     << attr.table_width << "],index_height[" << attr.index_height
-     << "],index_width[" << attr.index_width << "],output_width["
-     << attr.out_width << "],pool_type[" << to_string(attr.pool_type) << "]";
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const sgd_attr_t& attr) {
-  os << "param_height[" << attr.param_height << "],param_width["
-     << attr.param_width << "],grad_height[" << attr.grad_height
-     << "],grad_width[" << attr.grad_width << "],selected_rows_size["
-     << attr.selected_rows_size << "]";
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) {
-  os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]";
-  return os;
-}
-
-// expose the method to pack matmul weight
-template <typename T>
-void pack_weights(const T* src, T* dst, int n, int k);
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h
deleted file mode 100644
index 6e0393b820f3780940d37659a067a630a6a0ae2b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-#include <cstdint>
-#include "paddle/fluid/operators/jit/macro.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-typedef enum {
-  kNone = 0,
-  // sort by alphabet
-  kCRFDecoding = 1,
-  kEmbSeqPool = 2,
-  kGRUH1,
-  kGRUHtPart1,
-  kGRUHtPart2,
-  kHSum,  // horizontal max
-  kHMax,  // horizontal sum
-  kLSTMCtHt,
-  kLSTMC1H1,
-  kLayerNorm,
-  kMatMul,
-  kNCHW16CMulNC,
-  kSeqPool,
-  kSoftmax,
-  kStrideASum,
-  kStrideScal,
-  kVAdd,
-  kVAddBias,
-  kVAddRelu,
-  kVBroadcast,
-  kVCopy,
-  kVExp,
-  kVIdentity,
-  kVMul,
-  kVRelu,
-  kVScal,
-  kSgd,
-  kVSigmoid,
-  kVSquare,
-  kVSub,
-  kVTanh,
-} KernelType;
-
-typedef enum {
-  kNonePoolType = 0,
-  kSum = 1,
-  kAvg,
-  kSqrt,
-} SeqPoolType;
-
-// x, y, z, n
-template <typename T>
-struct XYZNTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int);
-};
-
-// a, x, y, n
-template <typename T>
-struct AXYNTuple : public XYZNTuple<T> {};
-
-// a, x, y, n, stride
-template <typename T>
-struct AXYNSTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int, int);
-};
-
-// x, y, n
-template <typename T>
-struct XYNTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, T*, int);
-};
-
-// x, returned value, n
-template <typename T>
-struct XRNTuple : public XYNTuple<T> {};
-
-// x, returned value, n, stride
-template <typename T>
-struct XRNSTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, T*, int, int);
-};
-
-#define DECLARE_KERNELTUPLE(kernel_tuple, type)        \
-  template <typename T>                                \
-  struct type##Tuple : public kernel_tuple<T> {        \
-    static constexpr KernelType kernel_type = k##type; \
-  }
-
-// Tuple should be corresponding to the KernelType
-DECLARE_KERNELTUPLE(XYZNTuple, VMul);
-DECLARE_KERNELTUPLE(XYZNTuple, VAdd);
-DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu);
-DECLARE_KERNELTUPLE(XYZNTuple, VSub);
-
-DECLARE_KERNELTUPLE(AXYNTuple, VScal);
-DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
-
-DECLARE_KERNELTUPLE(AXYNSTuple, StrideScal);
-
-DECLARE_KERNELTUPLE(XYNTuple, VRelu);
-DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
-DECLARE_KERNELTUPLE(XYNTuple, VSquare);
-DECLARE_KERNELTUPLE(XYNTuple, VExp);
-DECLARE_KERNELTUPLE(XYNTuple, VSigmoid);
-DECLARE_KERNELTUPLE(XYNTuple, VTanh);
-DECLARE_KERNELTUPLE(XYNTuple, VCopy);
-
-DECLARE_KERNELTUPLE(XRNTuple, HMax);
-DECLARE_KERNELTUPLE(XRNTuple, HSum);
-
-DECLARE_KERNELTUPLE(XRNSTuple, StrideASum);
-
-typedef struct {
-  void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
-  const void* ct_1;
-  void* ct;
-  void* ht;
-  /* weight_peephole and checked data are only used in peephole*/
-  const void* wp{nullptr};  //  W_ic, W_fc, W_oc
-  void* checked{nullptr};   // size: 2 * d
-} lstm_t;
-
-typedef struct {
-  void* gates;  // gates: {x_update, x_reset; x_state}
-  const void* ht_1;
-  void* ht;
-} gru_t;
-
-struct rnn_attr_s {
-  int d;
-  KernelType act_gate, act_cand;
-  rnn_attr_s() = default;
-  explicit rnn_attr_s(int _d, KernelType _act_gate, KernelType _act_cand)
-      : d(_d), act_gate(_act_gate), act_cand(_act_cand) {}
-};
-
-struct lstm_attr_s : public rnn_attr_s {
-  bool use_peephole;
-  KernelType act_cell;
-  lstm_attr_s() = default;
-  explicit lstm_attr_s(int _d, KernelType _act_gate, KernelType _act_cand,
-                       KernelType _act_cell, bool _use_peephole = false)
-      : rnn_attr_s(_d, _act_gate, _act_cand),
-        use_peephole(_use_peephole),
-        act_cell(_act_cell) {}
-};
-
-typedef struct rnn_attr_s gru_attr_t;
-typedef struct lstm_attr_s lstm_attr_t;
-
-template <typename T>
-struct LSTMTuple {
-  typedef T data_type;
-  typedef lstm_attr_t attr_type;
-  typedef void (*func_type)(lstm_t*, const lstm_attr_t*);
-};
-
-template <typename T>
-struct GRUTuple {
-  typedef T data_type;
-  typedef gru_attr_t attr_type;
-  typedef void (*func_type)(gru_t*, const gru_attr_t*);
-};
-
-DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt);
-DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1);
-
-DECLARE_KERNELTUPLE(GRUTuple, GRUH1);
-DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1);
-DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2);
-
-#undef DECLARE_KERNELTUPLE
-
-template <typename T>
-struct VBroadcastTuple {
-  static constexpr KernelType kernel_type = kVBroadcast;
-  typedef T data_type;
-  typedef int64_t attr_type;
-  typedef void (*func_type)(const T*, T*, int64_t, int64_t);
-};
-
-typedef struct seq_pool_attr_s {
-  int h, w;  // h should always be the first one
-  SeqPoolType type;
-  seq_pool_attr_s() = default;
-  explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1)
-      : h(height), w(width), type(pool_type) {}
-} seq_pool_attr_t;
-
-template <typename T>
-struct SeqPoolTuple {
-  static constexpr KernelType kernel_type = kSeqPool;
-  typedef T data_type;
-  typedef seq_pool_attr_t attr_type;
-  typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
-};
-
-typedef struct emb_seq_pool_attr_s {
-  int64_t table_height, table_width;
-  int64_t index_height, index_width;
-  int64_t out_width;
-  SeqPoolType pool_type;
-  emb_seq_pool_attr_s() = default;
-  explicit emb_seq_pool_attr_s(int64_t tbl_height, int64_t tbl_width,
-                               int64_t idx_height, int64_t idx_width,
-                               int64_t output_width,
-                               SeqPoolType seqpool_type = SeqPoolType::kSum)
-      : table_height(tbl_height),
-        table_width(tbl_width),
-        index_height(idx_height),
-        index_width(idx_width),
-        out_width(output_width),
-        pool_type(seqpool_type) {}
-} emb_seq_pool_attr_t;
-
-template <typename T>
-struct EmbSeqPoolTuple {
-  static constexpr KernelType kernel_type = kEmbSeqPool;
-  typedef T data_type;
-  typedef emb_seq_pool_attr_t attr_type;
-  typedef void (*func_type)(const T*, const int64_t*, T*,
-                            const emb_seq_pool_attr_t*);
-};
-
-typedef struct sgd_attr_s {
-  int64_t param_height, param_width;
-  int64_t grad_height, grad_width;
-  int64_t selected_rows_size;
-  sgd_attr_s() = default;
-  explicit sgd_attr_s(int64_t param_h, int64_t param_w, int64_t grad_h,
-                      int64_t grad_w, int64_t selected_rows_sz)
-      : param_height(param_h),
-        param_width(param_w),
-        grad_height(grad_h),
-        grad_width(grad_w),
-        selected_rows_size(selected_rows_sz) {}
-} sgd_attr_t;
-
-template <typename T>
-struct SgdTuple {
-  static constexpr KernelType kernel_type = kSgd;
-  typedef T data_type;
-  typedef sgd_attr_t attr_type;
-  typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*,
-                            const sgd_attr_t*);
-};
-
-typedef struct matmul_attr_s {
-  int m, n, k;
-  void* packed_weight{nullptr};
-  matmul_attr_s() = default;
-  explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr)
-      : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {}
-} matmul_attr_t;
-
-template <typename T>
-struct MatMulTuple {
-  static constexpr KernelType kernel_type = kMatMul;
-  typedef T data_type;
-  typedef matmul_attr_t attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
-};
-
-template <typename T>
-struct CRFDecodingTuple {
-  static constexpr KernelType kernel_type = kCRFDecoding;
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const int, const T*, const T*, T*, int*, int);
-};
-
-template <typename T>
-struct LayerNormTuple {
-  static constexpr KernelType kernel_type = kLayerNorm;
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int,
-                            const float, int);
-};
-
-template <typename T>
-struct SoftmaxTuple {
-  static constexpr KernelType kernel_type = kSoftmax;
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, T*, int, int, int);
-};
-
-// nChw16c = nChw16c .* NC
-template <typename T>
-struct NCHW16CMulNCTuple {
-  static constexpr KernelType kernel_type = kNCHW16CMulNC;
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int, int);
-};
-
-// Just for adding to kernel pool without template
-class Kernel {
- public:
-  Kernel() = default;
-  virtual ~Kernel() = default;
-  virtual const char* ImplType() const = 0;
-  DISABLE_COPY_AND_ASSIGN(Kernel);
-};
-
-template <typename KernelTuple>
-class KernelMore : public Kernel {
- public:
-  using T = typename KernelTuple::data_type;
-  using Func = typename KernelTuple::func_type;
-  using Attr = typename KernelTuple::attr_type;
-  virtual Func GetFunc() const { return func; }
-  // specify this kernel can be used, means it should not fail if use it.
-  virtual bool CanBeUsed(const Attr& attr) const = 0;
-
- protected:
-  Func func{nullptr};
-};
-
-template <typename KernelTuple>
-class ReferKernel : public KernelMore<KernelTuple> {
- public:
-  // Refer code can always be used
-  bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override {
-    return true;
-  }
-  const char* ImplType() const override { return "Refer"; }
-};
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc
deleted file mode 100644
index 1ad220b3972a3d3920610ab8f7ea416892a80d22..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/kernel_key.h"
-#include <xxhash.h>  // XXH64: 13.8 GB/s
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-template <>
-int64_t JitCodeKey<int>(const int& d) {
-  return d;
-}
-
-template <>
-int64_t JitCodeKey<int64_t>(const int64_t& d) {
-  return d;
-}
-
-template <>
-int64_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
-  return XXH64(&attr, sizeof(gru_attr_t), 0);
-}
-
-template <>
-int64_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
-  int keys[5] = {
-      attr.d, static_cast<int>(attr.act_gate), static_cast<int>(attr.act_cand),
-      static_cast<int>(attr.act_cell), static_cast<int>(attr.use_peephole)};
-  return XXH64(keys, sizeof(int) * 5, 0);
-}
-
-template <>
-int64_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
-  int keys[2] = {attr.w, static_cast<int>(attr.type)};
-  return XXH64(keys, sizeof(int) * 2, 0);
-}
-
-template <>
-int64_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
-  return XXH64(&attr, sizeof(int) * 3, 0);  // m, n, k
-}
-
-template <>
-int64_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
-  return attr.table_width;
-}
-
-template <>
-int64_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
-  return attr.grad_width;
-}
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h
deleted file mode 100644
index b2cf92f23e8ccff5fff7c6e193f7118fbb4765f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/kernel_key.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/operators/jit/kernel_base.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-struct KernelKey {
-  struct Hash {
-    size_t operator()(const KernelKey& key) const {
-      int place = key.place_.which();               // less than 2^8
-      int type = static_cast<int>(key.type_) << 8;  // less than 2^(32-8)
-      std::hash<int> hasher;
-      return hasher(place + type);
-    }
-  };
-
-  KernelType type_;
-  platform::Place place_;
-
-  KernelKey(KernelType type, platform::Place place)
-      : type_(type), place_(place) {}
-  size_t hash_key() const { return Hash()(*this); }
-
-  bool operator==(const KernelKey& o) const {
-    return platform::places_are_same_class(place_, o.place_) &&
-           type_ == o.type_;
-  }
-  bool operator!=(const KernelKey& o) const { return !(*this == o); }
-};
-
-// Every JitCode should have a method to get the key from attribution
-template <typename Attr>
-int64_t JitCodeKey(const Attr& attr);
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_pool.cc b/paddle/fluid/operators/jit/kernel_pool.cc
deleted file mode 100644
index bc98c644fbee2cd54faf4dc9fe151b8be131bd7b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/kernel_pool.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/kernel_pool.h"
-#include <memory>  // for shared_ptr
-#include <string>
-#include <unordered_map>
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-JitCodeCreatorPool& JitCodeCreatorPool::Instance() {
-  static JitCodeCreatorPool g_creator_pool;
-  return g_creator_pool;
-}
-
-KernelPool& KernelPool::Instance() {
-  static KernelPool g_kernel_pool;
-  return g_kernel_pool;
-}
-
-ReferKernelPool& ReferKernelPool::Instance() {
-  static ReferKernelPool g_refer_kernel_pool;
-  return g_refer_kernel_pool;
-}
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h
deleted file mode 100644
index 04710a54ac9ddf2ecb8f6a1f2ca33ef158d2d73f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/kernel_pool.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <memory>  // for unique_ptr
-#include <string>
-#include <unordered_map>
-#include <utility>  // for move
-#include <vector>
-#include "paddle/fluid/operators/jit/gen_base.h"
-#include "paddle/fluid/operators/jit/kernel_base.h"
-#include "paddle/fluid/operators/jit/kernel_key.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-template <KernelType KT>
-class JitCodePool {
-  typedef std::unique_ptr<GenBase> GenBasePtr;
-  typedef std::unordered_map<int64_t, GenBasePtr> JitCodeMap;
-
- public:
-  JitCodePool() = default;
-  static JitCodePool& Instance() {
-    static thread_local JitCodePool<KT> g_jit_codes;
-    return g_jit_codes;
-  }
-
-  const JitCodeMap& AllKernels() { return codes_; }
-
-  bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); }
-
-  void Insert(int64_t key, GenBasePtr value) {
-    codes_.emplace(key, std::move(value));
-  }
-
- private:
-  JitCodeMap codes_;
-  DISABLE_COPY_AND_ASSIGN(JitCodePool);
-};
-
-class JitCodeCreatorPool {
-  typedef std::unique_ptr<const GenCreator> GenCreatorPtr;
-  typedef std::unordered_map<KernelKey, std::vector<GenCreatorPtr>,
-                             KernelKey::Hash>
-      GenCreatorPtrMap;
-
- public:
-  JitCodeCreatorPool() = default;
-  static JitCodeCreatorPool& Instance();
-  GenCreatorPtrMap& AllCreators() { return creators_; }
-  void Insert(const KernelKey& key, GenCreatorPtr value) {
-    if (creators_.find(key) == creators_.end()) {
-      creators_.emplace(key, std::vector<GenCreatorPtr>());
-    }
-    creators_.at(key).emplace_back(std::move(value));
-  }
-
- private:
-  GenCreatorPtrMap creators_;
-  DISABLE_COPY_AND_ASSIGN(JitCodeCreatorPool);
-};
-
-typedef std::unique_ptr<const Kernel> KernelPtr;
-typedef std::unordered_map<KernelKey, std::vector<KernelPtr>, KernelKey::Hash>
-    KernelMap;
-
-class KernelPool {
- public:
-  static KernelPool& Instance();
-  KernelPool() = default;
-  KernelMap& AllKernels() { return pool_; }
-  void Insert(const KernelKey& key, KernelPtr value) {
-    if (pool_.find(key) == pool_.end()) {
-      pool_.emplace(key, std::vector<KernelPtr>());
-    }
-    pool_.at(key).emplace_back(std::move(value));
-  }
-
- private:
-  KernelMap pool_;
-  DISABLE_COPY_AND_ASSIGN(KernelPool);
-};
-
-// Every kernel should have refer code and it should be used in unit tests,
-// so refer kernels should have it's independent kernel pool
-class ReferKernelPool {
- public:
-  static ReferKernelPool& Instance();
-  ReferKernelPool() = default;
-  KernelMap& AllKernels() { return pool_; }
-  void Insert(const KernelKey& key, KernelPtr value) {
-    if (pool_.find(key) == pool_.end()) {
-      pool_.emplace(key, std::vector<KernelPtr>());
-    }
-    pool_.at(key).emplace_back(std::move(value));
-  }
-
- private:
-  KernelMap pool_;
-  DISABLE_COPY_AND_ASSIGN(ReferKernelPool);
-};
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/macro.h b/paddle/fluid/operators/jit/macro.h
deleted file mode 100644
index b2622eba8b70cc553a2da44638d577c9d7751b25..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/macro.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-#include <type_traits>
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-
-#define XMM_FLOAT_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define ZMM_FLOAT_BLOCK 16
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt
deleted file mode 100644
index fa503356baa73cb76e50ff19901a56d0c987ad99..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-
-function(USE_JITKERNEL_MORE TARGET TYPE)
-    file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n")
-endfunction()
-
-if(WITH_MKLML)
-    add_subdirectory(mkl)
-endif()
-
-if(WITH_AVX)
-    add_subdirectory(intrinsic)
-endif()
-
-# mix should be last
-add_subdirectory(mix)
-
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
deleted file mode 100644
index 468937a4f6b27ae525bfd0d8e99cc891eedbc353..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-
-file(GLOB jit_kernel_cc_intrinsic RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_base)
-
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
-
-# use mkl kernels by name and type
-USE_JITKERNEL_MORE(kCRFDecoding, intrinsic)
-USE_JITKERNEL_MORE(kLayerNorm, intrinsic)
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
deleted file mode 100644
index 7e1f7ab8bf8b0950a15fd55a03b0d61a509591da..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h"
-#include <limits>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace more {
-namespace intrinsic {
-// Note: intrinsic code is not runtime build.
-// For example, if you build code on AVX, and run on AVX512 it can only use AVX
-
-void CRFDecoding(const int seq_len, const float* x, const float* w,
-                 float* alpha, int* track, int tag_num) {
-#ifdef __AVX512F__
-  const int step_size = ZMM_FLOAT_BLOCK;
-#else
-  const int step_size = YMM_FLOAT_BLOCK;
-#endif
-  const int end = tag_num / step_size;
-  const int rest = tag_num % step_size;
-  /* Setup the alpha initial value.*/
-  int i_offset = 0;
-  int last_offset = rest - step_size;
-  for (int i = 0; i <= end; ++i) {
-#ifdef __AVX512F__
-    // Declare the variable for the content of weights, input and alpha values.
-    __m512 w_content, x_content, alpha_content;
-    // Load the relevant data into the variables from un-aligned address.
-    w_content = _mm512_loadu_ps(w + i_offset);
-    x_content = _mm512_loadu_ps(x + i_offset);
-    alpha_content = _mm512_add_ps(w_content, x_content);
-    // Save the alpha value.
-    _mm512_storeu_ps(alpha + i_offset, alpha_content);
-#else
-    // AVX or AVX2
-    // weights, input and alpha values.
-    __m256 w_content, x_content, alpha_content;
-    // Load the relevant data into the variables from un-aligned address.
-    w_content = _mm256_loadu_ps(w + i_offset);
-    x_content = _mm256_loadu_ps(x + i_offset);
-    alpha_content = _mm256_add_ps(w_content, x_content);
-    _mm256_storeu_ps(alpha + i_offset, alpha_content);
-#endif
-    i_offset += step_size;
-    if (i == end - 1) {
-      if (rest > 0) {
-        i_offset += last_offset;
-      } else {
-        break;
-      }
-    }
-  }
-  // Use the column-major strategy to get the location of maximum score.
-  int seq_offset = 0;
-  constexpr int state_trans_base_idx = 2;
-  for (int k = 1; k < seq_len; ++k) {
-    int j_offset = 0;
-    for (int j = 0; j <= end; ++j) {
-/* Initialize the variables of maximum score and location.*/
-#ifdef __AVX512F__
-      __m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max());
-      __m512i max_j = _mm512_setzero_si512();
-#else
-      __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max());
-      __m256i max_j = _mm256_set1_epi32(0);
-#endif
-      /* Calculate the offset of transition_weights.*/
-      int trans_offset = state_trans_base_idx * tag_num + j_offset;
-      for (int i = 0; i < tag_num; ++i) {
-/* Initalize the content of alpha variable with related offset.*/
-#ifdef __AVX512F__
-        __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i));
-        /* Obtain the content of weights from un-aligned address.*/
-        __m512 w_content = _mm512_loadu_ps(w + trans_offset);
-        __m512 score_v = _mm512_add_ps(alpha_content, w_content);
-        __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
-        /* AVX512 instructions.*/
-        max_j = _mm512_mask_set1_epi32(max_j, mask, i);
-        /* Update the max_score value.*/
-        max_score = _mm512_max_ps(max_score, score_v);
-
-#else
-        __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);
-        /* Obtain the content of weights from un-aligned address.*/
-        __m256 w_content = _mm256_loadu_ps(w + trans_offset);
-        __m256 score_v = _mm256_add_ps(alpha_content, w_content);
-        __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
-/* According to the mask value, update the index of the max_score.*/
-#ifdef __AVX2__
-        max_j = _mm256_or_si256(
-            _mm256_andnot_si256((__m256i)mask, max_j),
-            _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
-#else
-        __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
-        __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
-        __m128i lo_mask =
-            _mm256_extractf128_si256(*(__m256i*)&mask, 0);  // NOLINT
-        __m128i hi_mask =
-            _mm256_extractf128_si256(*(__m256i*)&mask, 1);  // NOLINT
-        lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
-        hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
-        lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
-        hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
-        lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
-        hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
-        max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
-        max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
-#endif
-        /* Update the max_score value.*/
-        max_score = _mm256_max_ps(max_score, score_v);
-
-#endif
-
-        trans_offset += tag_num;
-      }
-/* Update the alpha and track values. */
-#ifdef __AVX512F__
-      __m512 x_content = _mm512_loadu_ps(x + seq_offset + tag_num + j_offset);
-      max_score = _mm512_add_ps(max_score, x_content);
-      _mm512_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
-      _mm512_storeu_si512(
-          reinterpret_cast<__m512i*>(track + seq_offset + tag_num + j_offset),
-          max_j);
-#else
-      __m256 x_content = _mm256_loadu_ps(x + seq_offset + tag_num + j_offset);
-      max_score = _mm256_add_ps(max_score, x_content);
-      _mm256_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
-      _mm256_storeu_si256(
-          reinterpret_cast<__m256i*>(track + seq_offset + tag_num + j_offset),
-          max_j);
-#endif
-
-      /* Calculate the offset of next step*/
-      j_offset += step_size;
-      if (j == end - 1) {
-        if (rest > 0) {
-          j_offset += last_offset;
-        } else {
-          break;
-        }
-      }
-    }
-    seq_offset += tag_num;
-  }
-}
-
-bool CRFDecodingKernel::CanBeUsed(const int& d) const {
-#ifdef __AVX512F__
-  constexpr int block = ZMM_FLOAT_BLOCK;
-#else
-  constexpr int block = YMM_FLOAT_BLOCK;
-#endif
-  return platform::MayIUse(platform::avx) && d >= block;
-}
-
-}  // namespace intrinsic
-}  // namespace more
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace intrinsic = paddle::operators::jit::more::intrinsic;
-
-REGISTER_JITKERNEL_MORE(kCRFDecoding, intrinsic, intrinsic::CRFDecodingKernel);
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
deleted file mode 100644
index 49b1a1fea4b16f435120bb37c7d9c8c07a4cc4f5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <type_traits>
-#include "paddle/fluid/operators/jit/kernel_base.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace more {
-namespace intrinsic {
-
-void CRFDecoding(const int seq_len, const float* x, const float* w,
-                 float* alpha, int* track, int tag_num);
-
-class CRFDecodingKernel : public KernelMore<CRFDecodingTuple<float>> {
- public:
-  CRFDecodingKernel() { this->func = CRFDecoding; }
-  bool CanBeUsed(
-      const typename CRFDecodingTuple<float>::attr_type&) const override;
-  const char* ImplType() const override { return "Intrinsic"; }
-};
-
-}  // namespace intrinsic
-}  // namespace more
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
deleted file mode 100644
index a4e3246f10495b67871c08fd8cb7ccd1cf085c9e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/more/intrinsic/layer_norm.h"
-#include <limits>
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace more {
-namespace intrinsic {
-
-void LayerNorm(float* x, float* out, float* mean, float* var,
-               const float* scale, const float* bias, int height,
-               const float epsilon, int right) {
-  __m256 sum;
-  __m256 mean_vec, var_vec;
-  __m128 hi, lo;
-  __m256 tmp;
-  size_t offset;
-  size_t j;
-  int block = YMM_FLOAT_BLOCK;
-  const int rest = right % block;
-  const int end = right - rest;
-
-  __m256 reverse_num_vec =
-      _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(right));
-  __m256 epsilon_vec = _mm256_set1_ps(epsilon);
-  int rest_mask =
-      ((-1) & (~((~0U) >> (sizeof(int) * 8 - (block - rest))))) & 0x0ff;
-  __m256i mask_vec = _mm256_set_epi32(
-      rest_mask & 0x80 ? 0xffffffff : 0, rest_mask & 0x40 ? 0xffffffff : 0,
-      rest_mask & 0x20 ? 0xffffffff : 0, rest_mask & 0x10 ? 0xffffffff : 0,
-      rest_mask & 0x8 ? 0xffffffff : 0, rest_mask & 0x4 ? 0xffffffff : 0,
-      rest_mask & 0x2 ? 0xffffffff : 0, rest_mask & 0x1 ? 0xffffffff : 0);
-
-  for (int i = 0; i < height; ++i) {
-    offset = i * right;
-
-    /* get mean */
-    sum = _mm256_setzero_ps();
-    for (j = offset; j < end + offset; j += block) {
-      sum = _mm256_add_ps(sum, _mm256_loadu_ps((const float*)x + j));
-    }
-    if (rest != 0) {
-      j = offset + right - block;
-      tmp = _mm256_loadu_ps((const float*)x + j);
-      tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp,
-                             *(__m256*)&mask_vec);  // NOLINT
-      sum = _mm256_add_ps(sum, tmp);
-    }
-    hi = _mm256_extractf128_ps(sum, 1);
-    lo = _mm256_extractf128_ps(sum, 0);
-    sum = _mm256_add_ps(
-        sum, _mm256_insertf128_ps(
-                 _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1));
-    sum = _mm256_hadd_ps(sum, sum);
-    sum = _mm256_hadd_ps(sum, sum);
-    mean_vec = _mm256_mul_ps(sum, reverse_num_vec);
-    mean[i] = *reinterpret_cast<float*>(&mean_vec);
-
-    /* get variance */
-    sum = _mm256_setzero_ps();
-    for (j = offset; j < end + offset; j += block) {
-      tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);
-      tmp = _mm256_mul_ps(tmp, tmp);
-      sum = _mm256_add_ps(sum, tmp);
-    }
-    if (rest != 0) {
-      j = offset + right - block;
-      tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);
-      tmp = _mm256_mul_ps(tmp, tmp);
-      tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp,
-                             *(__m256*)&mask_vec);  // NOLINT
-      sum = _mm256_add_ps(sum, tmp);
-    }
-    hi = _mm256_extractf128_ps(sum, 1);
-    lo = _mm256_extractf128_ps(sum, 0);
-    sum = _mm256_add_ps(
-        sum, _mm256_insertf128_ps(
-                 _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1));
-    sum = _mm256_hadd_ps(sum, sum);
-    sum = _mm256_hadd_ps(sum, sum);
-    var_vec = _mm256_mul_ps(sum, reverse_num_vec);
-    var[i] = *reinterpret_cast<float*>(&var_vec);
-
-    /* get x_norm and calculate output*/
-    for (j = offset; j < end + offset; j += block) {
-      tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);
-      tmp = _mm256_div_ps(tmp,
-                          _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec)));
-      _mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp);
-    }
-    if (rest != 0) {
-      j = offset + right - block;
-      tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);
-      tmp = _mm256_div_ps(tmp,
-                          _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec)));
-      _mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp);
-    }
-
-    if (scale) {
-      if (rest != 0) {
-        j = offset + right - block;
-        tmp = _mm256_loadu_ps((const float*)out + j);
-      }
-      for (j = offset; j < end + offset; j += block) {
-        _mm256_storeu_ps(
-            reinterpret_cast<float*>(out) + j,
-            _mm256_mul_ps(_mm256_loadu_ps((const float*)out + j),
-                          _mm256_loadu_ps((const float*)scale + j - offset)));
-      }
-      if (rest != 0) {
-        j = offset + right - block;
-        _mm256_storeu_ps(
-            reinterpret_cast<float*>(out) + j,
-            _mm256_mul_ps(tmp,
-                          _mm256_loadu_ps((const float*)scale + j - offset)));
-      }
-    }
-
-    if (bias) {
-      if (rest != 0) {
-        j = offset + right - block;
-        tmp = _mm256_loadu_ps((const float*)out + j);
-      }
-      for (j = offset; j < end + offset; j += block) {
-        _mm256_storeu_ps(
-            reinterpret_cast<float*>(out) + j,
-            _mm256_add_ps(_mm256_loadu_ps((const float*)out + j),
-                          _mm256_loadu_ps((const float*)bias + j - offset)));
-      }
-      if (rest != 0) {
-        j = offset + right - block;
-        _mm256_storeu_ps(reinterpret_cast<float*>(out) + j,
-                         _mm256_add_ps(tmp, _mm256_loadu_ps((const float*)bias +
-                                                            j - offset)));
-      }
-    }
-  }
-}
-
-bool LayerNormKernel::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
-}
-
-}  // namespace intrinsic
-}  // namespace more
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace intrinsic = paddle::operators::jit::more::intrinsic;
-
-REGISTER_JITKERNEL_MORE(kLayerNorm, intrinsic, intrinsic::LayerNormKernel);
diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
deleted file mode 100644
index 7b9f676050d806314edd1e46611416a8b7170add..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <type_traits>
-#include "paddle/fluid/operators/jit/kernel_base.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace more {
-namespace intrinsic {
-
-void LayerNorm(float* x, float* out, float* mean, float* var,
-               const float* scale, const float* bias, int height,
-               const float epsilon, int right);
-
-class LayerNormKernel : public KernelMore<LayerNormTuple<float>> {
- public:
-  LayerNormKernel() { this->func = LayerNorm; }
-  bool CanBeUsed(
-      const typename LayerNormTuple<float>::attr_type&) const override;
-  const char* ImplType() const override { return "Intrinsic"; }
-};
-
-}  // namespace intrinsic
-}  // namespace more
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
deleted file mode 100644
index dd039d29152961210958470a48f086a133ab640c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-
-
-file(GLOB jit_kernel_mix_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base)
-
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE)
-
-USE_JITKERNEL_MORE(kVSigmoid, mix)
-USE_JITKERNEL_MORE(kVTanh, mix)
-USE_JITKERNEL_MORE(kLSTMCtHt, mix)
-USE_JITKERNEL_MORE(kLSTMC1H1, mix)
-USE_JITKERNEL_MORE(kGRUH1, mix)
-USE_JITKERNEL_MORE(kGRUHtPart1, mix)
-USE_JITKERNEL_MORE(kGRUHtPart2, mix)
-USE_JITKERNEL_MORE(kSoftmax, mix)
diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc
deleted file mode 100644
index f5b7bfff89825bfcd6cbe4b1008628d3e1093f4c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/more/mix/mix.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace more {
-namespace mix {
-
-using CPUPlace = platform::CPUPlace;
-
-void VSigmoid(const T* x, T* y, int n) {
-  const float min = SIGMOID_THRESHOLD_MIN;
-  const float max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = static_cast<T>(0) - y[i];
-  }
-  auto compute = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
-  compute(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
-  }
-}
-
-void VTanh(const T* x, T* y, int n) {
-  const T a = 2, b = -1;
-  auto compute_scal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_addbias = KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_sigmoid = KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(n);
-  compute_scal(&a, x, y, n);
-  compute_sigmoid(y, y, n);
-  compute_scal(&a, y, y, n);
-  compute_addbias(&b, y, y, n);
-}
-
-// remain is the product of dimension shapes after the axis dimension
-void Softmax(const T* x, T* y, int n, int bs, int remain) {
-  auto compute_hmax = KernelFuncs<HMaxTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_hsum = KernelFuncs<HSumTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_vscal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_strideasum =
-      KernelFuncs<StrideASumTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_stridescal =
-      KernelFuncs<StrideScalTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_vaddbias =
-      KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_vexp = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
-
-  for (int i = 0; i < bs; ++i) {
-    T scalar;
-    compute_hmax(x, &scalar, n);
-    scalar = static_cast<T>(0) - scalar;
-    compute_vaddbias(&scalar, x, y, n);  // x - max
-    compute_vexp(y, y, n);
-    if (remain == 1) {
-      compute_hsum(y, &scalar, n);
-      scalar = static_cast<T>(1) / scalar;
-      compute_vscal(&scalar, y, y, n);
-    } else {
-      for (int j = 0; j < remain; ++j) {
-        compute_strideasum(&y[j], &scalar, n, remain);
-        scalar = static_cast<T>(1) / scalar;
-        compute_stridescal(&scalar, &y[j], &y[j], n, remain);
-      }
-    }
-    x += n;
-    y += n;
-  }
-}
-
-void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
-  if (type == kVSigmoid) {
-    return KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(d);
-  } else if (type == kVRelu) {
-    return KernelFuncs<VReluTuple<T>, CPUPlace>::Cache().At(d);
-  } else if (type == kVTanh) {
-    return KernelFuncs<VTanhTuple<T>, CPUPlace>::Cache().At(d);
-  } else if (type == kVIdentity) {
-    return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d);
-  }
-  PADDLE_THROW("Not support type: %s", type);
-  return nullptr;
-}
-
-void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
-  T* gates = reinterpret_cast<T*>(step->gates);
-  const T* ct_1 = reinterpret_cast<const T*>(step->ct_1);
-  T* ct = reinterpret_cast<T*>(step->ct);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  const T* wp = reinterpret_cast<const T*>(step->wp);
-  T* checked = reinterpret_cast<T*>(step->checked);
-  const int d = attr->d;
-  const int d2 = d * 2;
-  const int d3 = d * 3;
-  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
-  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
-  auto vadd_d2 = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d2);
-  auto act_gate_d = getActFunc(attr->act_gate, d);
-  auto act_gate_d2 = getActFunc(attr->act_gate, d2);
-  auto act_gate_d3 = getActFunc(attr->act_gate, d3);
-  auto act_cand_d = getActFunc(attr->act_cand, d);
-  auto act_cell_d = getActFunc(attr->act_cell, d);
-
-  if (attr->use_peephole) {
-    vmul_d(wp, ct_1, checked, d);
-    vmul_d(wp + d, ct_1, checked + d, d);
-    vadd_d2(checked, gates + d, gates + d, d2);
-    act_gate_d2(gates + d, gates + d, d2);
-  } else {
-    act_gate_d3(gates + d, gates + d, d3);
-  }
-
-  // C_t = C_t-1 * fgated + cand_gated * igated
-  act_cand_d(gates, gates, d);
-  vmul_d(gates, gates + d, gates + d, d);
-  vmul_d(ct_1, gates + d2, gates + d2, d);
-  vadd_d(gates + d, gates + d2, ct, d);
-
-  if (attr->use_peephole) {
-    // get ogated
-    vmul_d(wp + d2, ct, gates + d, d);
-    vadd_d(gates + d, gates + d3, gates + d3, d);
-    act_gate_d(gates + d3, gates + d3, d);
-  }
-  // H_t = act_cell(C_t) * ogated
-  act_cell_d(ct, gates + d2, d);
-  vmul_d(gates + d2, gates + d3, ht, d);
-}
-
-void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
-  T* gates = reinterpret_cast<T*>(step->gates);
-  T* ct = reinterpret_cast<T*>(step->ct);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  int d = attr->d;
-  int d2 = d * 2;
-  int d3 = d * 3;
-  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
-  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
-  auto act_gate_d = getActFunc(attr->act_gate, d);
-  auto act_cand_d = getActFunc(attr->act_cand, d);
-  auto act_cell_d = getActFunc(attr->act_cell, d);
-  /* C_t = igated * cgated*/
-  act_gate_d(gates + d, gates + d, d);
-  act_cand_d(gates, gates, d);
-  vmul_d(gates, gates + d, ct, d);
-  if (attr->use_peephole) {
-    // get outgated, put W_oc * C_t on igated
-    const T* wp = reinterpret_cast<const T*>(step->wp);
-    vmul_d(wp + d2, ct, gates + d, d);
-    vadd_d(gates + d, gates + d3, gates + d3, d);
-  }
-  /* H_t = act_cell(C_t) * ogated */
-  act_gate_d(gates + d3, gates + d3, d);
-  act_cell_d(ct, gates + d2, d);
-  vmul_d(gates + d2, gates + d3, ht, d);
-}
-
-// compute h1 without h0
-void GRUH1(gru_t* step, const gru_attr_t* attr) {
-  T* gates = reinterpret_cast<T*>(step->gates);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  int d = attr->d;
-  int d2 = d * 2;
-  auto act_gate = getActFunc(attr->act_gate, d);
-  auto act_cand = getActFunc(attr->act_cand, d);
-  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
-  act_gate(gates, gates, d);
-  act_cand(gates + d2, gates + d2, d);
-  vmul_d(gates, gates + d2, ht, d);
-}
-
-// compute the first part of GRU: ht = act_gate(r) * ht_1
-void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
-  // W: {W_update, W_reset; W_state}
-  T* gates = reinterpret_cast<T*>(step->gates);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
-  auto act_gate = getActFunc(attr->act_gate, attr->d);
-  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(attr->d);
-  act_gate(gates + attr->d, gates + attr->d, attr->d);
-  vmul_d(ht_1, gates + attr->d, ht, attr->d);
-}
-
-// compute the second part of GRU:
-// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
-void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
-  T* gates = reinterpret_cast<T*>(step->gates);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
-  int d = attr->d;
-  auto act_gate = getActFunc(attr->act_gate, d);
-  auto act_cand = getActFunc(attr->act_cand, d);
-  T* y = gates + d * 2;
-  act_gate(gates, gates, d);
-  act_cand(y, y, d);
-  // out = zt*ht~ + (1-zt)*ht_1
-  for (int i = 0; i < d; ++i) {
-    ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
-  }
-}
-
-// TODO(TJ): tuning me
-bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; }
-
-bool VTanhKernel::CanBeUsed(const int& d) const { return true; }
-
-bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; }
-
-bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }
-
-bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }
-
-bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
-
-bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
-
-bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
-
-}  // namespace mix
-}  // namespace more
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace mix = paddle::operators::jit::more::mix;
-
-#define REGISTER_MORE_KERNEL(func) \
-  REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel)
-
-REGISTER_MORE_KERNEL(VSigmoid);
-REGISTER_MORE_KERNEL(VTanh);
-REGISTER_MORE_KERNEL(Softmax);
-REGISTER_MORE_KERNEL(LSTMCtHt);
-REGISTER_MORE_KERNEL(LSTMC1H1);
-REGISTER_MORE_KERNEL(GRUH1);
-REGISTER_MORE_KERNEL(GRUHtPart1);
-REGISTER_MORE_KERNEL(GRUHtPart2);
-
-#undef REGISTER_MORE_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h
deleted file mode 100644
index 035425317edca95bc574807fa029ff373a7e10b8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/mix/mix.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <type_traits>
-#include "paddle/fluid/operators/jit/kernel_base.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace more {
-namespace mix {
-using T = float;
-
-void VSigmoid(const T* x, T* y, int n);
-void VTanh(const T* x, T* y, int n);
-void Softmax(const T* x, T* y, int n, int bs, int remain);
-
-void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr);
-void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr);
-void GRUH1(gru_t* step, const gru_attr_t* attr);
-void GRUHtPart1(gru_t* step, const gru_attr_t* attr);
-void GRUHtPart2(gru_t* step, const gru_attr_t* attr);
-
-#define DECLARE_MORE_KERNEL(name)                                             \
-  class name##Kernel : public KernelMore<name##Tuple<T>> {                    \
-   public:                                                                    \
-    name##Kernel() { this->func = name; }                                     \
-    bool CanBeUsed(const typename name##Tuple<T>::attr_type&) const override; \
-    const char* ImplType() const override { return "Mixed"; }                 \
-  }
-
-// XYN
-DECLARE_MORE_KERNEL(VSigmoid);
-DECLARE_MORE_KERNEL(VTanh);
-
-// XRN
-DECLARE_MORE_KERNEL(Softmax);
-
-DECLARE_MORE_KERNEL(LSTMCtHt);
-DECLARE_MORE_KERNEL(LSTMC1H1);
-
-DECLARE_MORE_KERNEL(GRUH1);
-DECLARE_MORE_KERNEL(GRUHtPart1);
-DECLARE_MORE_KERNEL(GRUHtPart2);
-
-#undef DECLARE_MORE_KERNEL
-
-}  // namespace mix
-}  // namespace more
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
deleted file mode 100644
index 56f1a62ad4e06807dace2a81156d92f6b02a14df..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-
-cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml)
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE)
-
-# use mkl kernels by name and type
-USE_JITKERNEL_MORE(kMatMul, mkl)
-USE_JITKERNEL_MORE(kVMul, mkl)
-USE_JITKERNEL_MORE(kVAdd, mkl)
-USE_JITKERNEL_MORE(kVScal, mkl)
-USE_JITKERNEL_MORE(kStrideScal, mkl)
-USE_JITKERNEL_MORE(kVExp, mkl)
-USE_JITKERNEL_MORE(kVSquare, mkl)
-USE_JITKERNEL_MORE(kVCopy, mkl)
-USE_JITKERNEL_MORE(kVSigmoid, mkl)
-USE_JITKERNEL_MORE(kVTanh, mkl)
-USE_JITKERNEL_MORE(kSeqPool, mkl)
-USE_JITKERNEL_MORE(kSoftmax, mkl)
-USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
-USE_JITKERNEL_MORE(kSgd, mkl)
-USE_JITKERNEL_MORE(kVBroadcast, mkl)
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
deleted file mode 100644
index 75ebddb125989b121b62d42b50e896eccd392a71..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ /dev/null
@@ -1,310 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/more/mkl/mkl.h"
-#include "paddle/fluid/operators/jit/refer/refer.h"
-#include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/dynload/mklml.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace more {
-namespace mkl {
-
-template <>
-void MatMul<float>(const float* a, const float* b, float* c,
-                   const matmul_attr_t* attr) {
-  platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-                                 attr->m, attr->n, attr->k, 1.f, a, attr->k, b,
-                                 attr->n, 0.f, c, attr->n);
-}
-
-template <>
-void MatMul<double>(const double* a, const double* b, double* c,
-                    const matmul_attr_t* attr) {
-  platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-                                 attr->m, attr->n, attr->k, 1.0, a, attr->k, b,
-                                 attr->n, 0.0, c, attr->n);
-}
-
-template <>
-void VMul<float>(const float* x, const float* y, float* z, int n) {
-  platform::dynload::vsMul(n, x, y, z);
-}
-
-template <>
-void VMul<double>(const double* x, const double* y, double* z, int n) {
-  platform::dynload::vdMul(n, x, y, z);
-}
-
-template <>
-void VAdd<float>(const float* x, const float* y, float* z, int n) {
-  platform::dynload::vsAdd(n, x, y, z);
-}
-
-template <>
-void VAdd<double>(const double* x, const double* y, double* z, int n) {
-  platform::dynload::vdAdd(n, x, y, z);
-}
-
-template <>
-void VScal<float>(const float* a, const float* x, float* y, int n) {
-  if (x == y) {
-    platform::dynload::cblas_sscal(n, *a, y, 1);
-  } else {
-    refer::VScal<float>(a, x, y, n);
-  }
-}
-
-template <>
-void VScal<double>(const double* a, const double* x, double* y, int n) {
-  if (x == y) {
-    platform::dynload::cblas_dscal(n, *a, y, 1);
-  } else {
-    refer::VScal<double>(a, x, y, n);
-  }
-}
-
-template <>
-void StrideScal<float>(const float* a, const float* x, float* y, int n,
-                       int stride) {
-  if (x == y) {
-    platform::dynload::cblas_sscal(n / stride, *a, y, stride);
-  } else {
-    refer::StrideScal<float>(a, x, y, n, stride);
-  }
-}
-
-template <>
-void StrideScal<double>(const double* a, const double* x, double* y, int n,
-                        int stride) {
-  if (x == y) {
-    platform::dynload::cblas_dscal(n / stride, *a, y, stride);
-  } else {
-    refer::StrideScal<double>(a, x, y, n, stride);
-  }
-}
-
-template <>
-void VExp<float>(const float* x, float* y, int n) {
-  platform::dynload::vsExp(n, x, y);
-}
-
-template <>
-void VExp<double>(const double* x, double* y, int n) {
-  platform::dynload::vdExp(n, x, y);
-}
-
-template <>
-void VSquare<float>(const float* x, float* y, int n) {
-  platform::dynload::vsSqr(n, x, y);
-}
-
-template <>
-void VSquare<double>(const double* x, double* y, int n) {
-  platform::dynload::vdSqr(n, x, y);
-}
-
-template <>
-void VCopy<float>(const float* x, float* y, int n) {
-  platform::dynload::cblas_scopy(n, x, 1, y, 1);
-}
-
-template <>
-void VCopy<double>(const double* x, double* y, int n) {
-  platform::dynload::cblas_dcopy(n, x, 1, y, 1);
-}
-
-template <>
-void VAXPY<float>(float a, const float* x, float* y, int n) {
-  platform::dynload::cblas_saxpy(n, a, x, 1, y, 1);
-}
-
-template <>
-void VAXPY<double>(double a, const double* x, double* y, int n) {
-  platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
-}
-
-template <>
-void ASum<float>(const float* x, float* res, int n) {
-  res[0] = platform::dynload::cblas_sasum(n, x, 1);
-}
-
-template <>
-void ASum<double>(const double* x, double* res, int n) {
-  res[0] = platform::dynload::cblas_dasum(n, x, 1);
-}
-
-template <>
-void StrideASum<float>(const float* x, float* res, int n, int stride) {
-  res[0] = platform::dynload::cblas_sasum(n / stride, x, stride);
-}
-
-template <>
-void StrideASum<double>(const double* x, double* res, int n, int stride) {
-  res[0] = platform::dynload::cblas_dasum(n / stride, x, stride);
-}
-
-// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
-template <>
-bool VMulKernel<float>::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
-}
-
-template <>
-bool VAddKernel<float>::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx) && d > 512;
-}
-
-template <>
-bool VScalKernel<float>::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
-}
-
-template <>
-bool StrideScalKernel<float>::CanBeUsed(const int& d) const {
-  return true;
-}
-
-template <>
-bool VExpKernel<float>::CanBeUsed(const int& d) const {
-  return d > 7;
-}
-
-template <>
-bool VSquareKernel<float>::CanBeUsed(const int& d) const {
-  return d > 7;
-}
-
-template <>
-bool VCopyKernel<float>::CanBeUsed(const int& d) const {
-  return d > 15;
-}
-
-template <>
-bool VBroadcastKernel<float>::CanBeUsed(const int64_t& d) const {
-  return d > 127;
-}
-
-template <>
-bool VBroadcastKernel<double>::CanBeUsed(const int64_t& attr) const {
-  return true;
-}
-
-template <>
-bool VSigmoidKernel<float>::CanBeUsed(const int& d) const {
-  return d > 7;
-}
-
-template <>
-bool VTanhKernel<float>::CanBeUsed(const int& d) const {
-  return d > 7;
-}
-
-template <>
-bool SeqPoolKernel<float>::CanBeUsed(const seq_pool_attr_t& attr) const {
-  return true;
-}
-
-template <>
-bool SeqPoolKernel<double>::CanBeUsed(const seq_pool_attr_t& attr) const {
-  return true;
-}
-
-template <>
-bool EmbSeqPoolKernel<float>::CanBeUsed(const emb_seq_pool_attr_t& attr) const {
-  return true;
-}
-
-template <>
-bool EmbSeqPoolKernel<double>::CanBeUsed(
-    const emb_seq_pool_attr_t& attr) const {
-  return true;
-}
-
-template <>
-bool SgdKernel<float>::CanBeUsed(const sgd_attr_t& attr) const {
-  return true;
-}
-
-template <>
-bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
-  return true;
-}
-
-template <>
-bool MatMulKernel<float>::CanBeUsed(const matmul_attr_t& attr) const {
-  return platform::MayIUse(platform::avx);
-}
-
-template <>
-bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
-  return true;
-}
-
-template <>
-bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
-  // tuned on avx2
-  return platform::MayIUse(platform::avx) && d < 60;
-}
-
-#define AWALYS_USE_ME_WITH_DOUBLE(func)                      \
-  template <>                                                \
-  bool func##Kernel<double>::CanBeUsed(const int& d) const { \
-    return true;                                             \
-  }
-
-AWALYS_USE_ME_WITH_DOUBLE(VMul);
-AWALYS_USE_ME_WITH_DOUBLE(VAdd);
-AWALYS_USE_ME_WITH_DOUBLE(VScal);
-AWALYS_USE_ME_WITH_DOUBLE(StrideScal);
-AWALYS_USE_ME_WITH_DOUBLE(VExp);
-AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
-AWALYS_USE_ME_WITH_DOUBLE(VTanh);
-AWALYS_USE_ME_WITH_DOUBLE(VSquare);
-AWALYS_USE_ME_WITH_DOUBLE(VCopy);
-AWALYS_USE_ME_WITH_DOUBLE(Softmax);
-
-#undef AWALYS_USE_ME_WITH_DOUBLE
-}  // namespace mkl
-}  // namespace more
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
-
-namespace mkl = paddle::operators::jit::more::mkl;
-
-#define REGISTER_MKL_KERNEL(func)                                 \
-  REGISTER_JITKERNEL_MORE(k##func, mkl, mkl::func##Kernel<float>, \
-                          mkl::func##Kernel<double>)
-
-REGISTER_MKL_KERNEL(MatMul);
-REGISTER_MKL_KERNEL(VMul);
-REGISTER_MKL_KERNEL(VAdd);
-REGISTER_MKL_KERNEL(VScal);
-REGISTER_MKL_KERNEL(StrideScal);
-REGISTER_MKL_KERNEL(VExp);
-REGISTER_MKL_KERNEL(VSquare);
-REGISTER_MKL_KERNEL(VCopy);
-REGISTER_MKL_KERNEL(VBroadcast);
-REGISTER_MKL_KERNEL(VSigmoid);
-REGISTER_MKL_KERNEL(VTanh);
-REGISTER_MKL_KERNEL(SeqPool);
-REGISTER_MKL_KERNEL(EmbSeqPool);
-REGISTER_MKL_KERNEL(Softmax);
-REGISTER_MKL_KERNEL(Sgd);
-
-#undef REGISTER_MKL_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
deleted file mode 100644
index b38cc107b8e3038e04db4ed809d647e9a20d45fc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <cmath>
-#include <type_traits>
-#include <vector>
-#include "paddle/fluid/operators/jit/kernel_base.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace more {
-namespace mkl {
-
-template <typename T>
-void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr);
-
-template <typename T>
-void VMul(const T* x, const T* y, T* z, int n);
-
-template <typename T>
-void VAdd(const T* x, const T* y, T* z, int n);
-
-template <typename T>
-void VScal(const T* a, const T* x, T* y, int n);
-
-template <typename T>
-void VExp(const T* x, T* y, int n);
-
-template <typename T>
-void VSquare(const T* x, T* y, int n);
-
-template <typename T>
-void VCopy(const T* x, T* y, int n);
-
-template <typename T>
-void VAXPY(T a, const T* x, T* y, int n);
-
-template <typename T>
-void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
-  for (int64_t h = 0; h < y_h; ++h) {
-    VCopy(x, y + h * x_len, x_len);
-  }
-}
-
-template <typename T>
-void VSigmoid(const T* x, T* y, int n) {
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = static_cast<T>(0) - y[i];
-  }
-  VExp(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
-  }
-}
-
-template <typename T>
-void VTanh(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * x[i];
-  }
-  VSigmoid(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
-  }
-}
-
-template <typename T>
-void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
-  VCopy<T>(x, y, attr->w);
-  for (int h = 1; h != attr->h; ++h) {
-    VAXPY<T>(static_cast<T>(1), x + h * attr->w, y, attr->w);
-  }
-  if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
-    T scalar = static_cast<T>(1);
-    if (attr->type == SeqPoolType::kAvg) {
-      scalar = scalar / static_cast<T>(attr->h);
-    } else {
-      scalar = scalar / std::sqrt(static_cast<T>(attr->h));
-    }
-    VScal<T>(&scalar, y, y, attr->w);
-  }
-}
-
-template <typename T>
-void EmbSeqPool(const T* table, const int64_t* idx, T* out,
-                const emb_seq_pool_attr_t* attr) {
-  PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
-  auto check_idx_value_valid = [&](int64_t i) {
-    PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
-                      idx[i], i);
-    PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
-  };
-
-  for (int64_t w = 0; w != attr->index_width; ++w) {
-    check_idx_value_valid(w);
-    VCopy<T>(table + idx[w] * attr->table_width, out + w * attr->table_width,
-             attr->table_width);
-  }
-
-  for (int64_t h = 1; h < attr->index_height; ++h) {
-    for (int64_t w = 0; w < attr->index_width; ++w) {
-      int64_t i = h * attr->index_width + w;
-      check_idx_value_valid(i);
-      VAXPY<T>(static_cast<T>(1), table + idx[i] * attr->table_width,
-               out + w * attr->table_width, attr->table_width);
-    }
-  }
-}
-
-template <typename T>
-void ASum(const T* x, T* res, int n);
-
-template <typename T>
-void StrideASum(const T* x, T* res, int n, int stride);
-
-template <typename T>
-void StrideScal(const T* a, const T* x, T* y, int n, int stride);
-
-// remain is the product of dimension shapes after the axis dimension
-template <typename T>
-void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
-  std::vector<T> entities(bs);
-  for (int i = 0; i < bs; ++i) {
-    entities[i] = x[i * n];
-    for (int c = 1; c < n; ++c) {
-      entities[i] = x[i * n + c] > entities[i] ? x[i * n + c] : entities[i];
-    }
-    for (int c = 0; c < n; ++c) {
-      y[i * n + c] = x[i * n + c] - entities[i];
-    }
-  }
-  VExp(y, y, n * bs);
-  for (int i = 0; i < bs; ++i) {
-    T sum;
-    if (remain == 1) {
-      ASum(&y[i * n], &sum, n);
-      sum = static_cast<T>(1) / sum;
-      VScal(&sum, &y[i * n], &y[i * n], n);
-    } else {
-      for (int j = 0; j < remain; ++j) {
-        StrideASum(&y[i * n + j], &sum, n, remain);
-        sum = static_cast<T>(1) / sum;
-        StrideScal(&sum, &y[i * n + j], &y[i * n + j], n, remain);
-      }
-    }
-  }
-}
-
-template <typename T>
-void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
-         T* out, const sgd_attr_t* attr) {
-  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
-  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
-  T scalar = -lr[0];
-  int width = attr->grad_width;
-  if (out == param) {
-    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
-      auto h_idx = rows[i];
-      PADDLE_ENFORCE_LT(h_idx, attr->param_height);
-      PADDLE_ENFORCE_GE(h_idx, 0);
-      VAXPY(scalar, grad + i * width, out + h_idx * width, width);
-    }
-  } else {
-    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
-      auto h_idx = rows[i];
-      PADDLE_ENFORCE_LT(h_idx, attr->param_height);
-      PADDLE_ENFORCE_GE(h_idx, 0);
-      VScal(&scalar, grad + i * width, out + h_idx * width, width);
-      VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width,
-           width);
-    }
-  }
-}
-
-#define DECLARE_MKL_KERNEL(name)                                              \
-  template <typename T>                                                       \
-  class name##Kernel : public KernelMore<name##Tuple<T>> {                    \
-   public:                                                                    \
-    name##Kernel() { this->func = name<T>; }                                  \
-    bool CanBeUsed(const typename name##Tuple<T>::attr_type&) const override; \
-    const char* ImplType() const override { return "MKL"; }                   \
-  }
-
-// ABCMNK
-DECLARE_MKL_KERNEL(MatMul);
-
-// XYZN
-DECLARE_MKL_KERNEL(VMul);
-DECLARE_MKL_KERNEL(VAdd);
-
-// AXYN
-DECLARE_MKL_KERNEL(VScal);
-DECLARE_MKL_KERNEL(StrideScal);
-
-// XYN
-DECLARE_MKL_KERNEL(VExp);
-DECLARE_MKL_KERNEL(VSigmoid);
-DECLARE_MKL_KERNEL(VTanh);
-DECLARE_MKL_KERNEL(VSquare);
-DECLARE_MKL_KERNEL(VCopy);
-
-// others
-DECLARE_MKL_KERNEL(SeqPool);
-DECLARE_MKL_KERNEL(EmbSeqPool);
-DECLARE_MKL_KERNEL(Softmax);
-DECLARE_MKL_KERNEL(Sgd);
-DECLARE_MKL_KERNEL(VBroadcast);
-
-#undef DECLARE_MKL_KERNEL
-
-}  // namespace mkl
-}  // namespace more
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt
deleted file mode 100644
index 7133f596620410d37ffe52a2ee92b7a9974bf1cc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/refer/CMakeLists.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-
-cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base)
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE)
-
-function(USE_JITKERNEL_REFER TARGET)
-    file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n")
-endfunction()
-
-# use refer kernel by name
-USE_JITKERNEL_REFER(kVMul)
-USE_JITKERNEL_REFER(kVAdd)
-USE_JITKERNEL_REFER(kVAddRelu)
-USE_JITKERNEL_REFER(kVSub)
-USE_JITKERNEL_REFER(kVScal)
-USE_JITKERNEL_REFER(kStrideScal)
-USE_JITKERNEL_REFER(kVAddBias)
-USE_JITKERNEL_REFER(kVCopy)
-USE_JITKERNEL_REFER(kVRelu)
-USE_JITKERNEL_REFER(kVIdentity)
-USE_JITKERNEL_REFER(kVExp)
-USE_JITKERNEL_REFER(kVSigmoid)
-USE_JITKERNEL_REFER(kVTanh)
-USE_JITKERNEL_REFER(kLSTMCtHt)
-USE_JITKERNEL_REFER(kLSTMC1H1)
-USE_JITKERNEL_REFER(kGRUH1)
-USE_JITKERNEL_REFER(kGRUHtPart1)
-USE_JITKERNEL_REFER(kGRUHtPart2)
-USE_JITKERNEL_REFER(kCRFDecoding)
-USE_JITKERNEL_REFER(kLayerNorm)
-USE_JITKERNEL_REFER(kNCHW16CMulNC)
-USE_JITKERNEL_REFER(kSeqPool)
-USE_JITKERNEL_REFER(kMatMul)
-USE_JITKERNEL_REFER(kVSquare)
-USE_JITKERNEL_REFER(kHSum)
-USE_JITKERNEL_REFER(kHMax)
-USE_JITKERNEL_REFER(kStrideASum)
-USE_JITKERNEL_REFER(kSoftmax)
-USE_JITKERNEL_REFER(kEmbSeqPool)
-USE_JITKERNEL_REFER(kSgd)
-USE_JITKERNEL_REFER(kVBroadcast)
diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc
deleted file mode 100644
index 460cb6c58076d7f6c49b60fed45584bd9b506c63..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/refer/refer.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/jit/refer/refer.h"
-#include "paddle/fluid/operators/jit/registry.h"
-
-namespace refer = paddle::operators::jit::refer;
-
-#define REGISTER_REFER_KERNEL(func)                             \
-  REGISTER_JITKERNEL_REFER(k##func, refer::func##Kernel<float>, \
-                           refer::func##Kernel<double>)
-
-REGISTER_REFER_KERNEL(VMul);
-REGISTER_REFER_KERNEL(VAdd);
-REGISTER_REFER_KERNEL(VAddRelu);
-REGISTER_REFER_KERNEL(VSub);
-
-REGISTER_REFER_KERNEL(VScal);
-REGISTER_REFER_KERNEL(StrideScal);
-REGISTER_REFER_KERNEL(VAddBias);
-
-REGISTER_REFER_KERNEL(VRelu);
-REGISTER_REFER_KERNEL(VCopy);
-REGISTER_REFER_KERNEL(VIdentity);
-REGISTER_REFER_KERNEL(VSquare);
-REGISTER_REFER_KERNEL(VExp);
-REGISTER_REFER_KERNEL(VSigmoid);
-REGISTER_REFER_KERNEL(VTanh);
-
-REGISTER_REFER_KERNEL(LSTMCtHt);
-REGISTER_REFER_KERNEL(LSTMC1H1);
-
-REGISTER_REFER_KERNEL(GRUH1);
-REGISTER_REFER_KERNEL(GRUHtPart1);
-REGISTER_REFER_KERNEL(GRUHtPart2);
-
-REGISTER_REFER_KERNEL(CRFDecoding);
-REGISTER_REFER_KERNEL(LayerNorm);
-REGISTER_REFER_KERNEL(NCHW16CMulNC);
-REGISTER_REFER_KERNEL(SeqPool);
-REGISTER_REFER_KERNEL(MatMul);
-REGISTER_REFER_KERNEL(HMax);
-REGISTER_REFER_KERNEL(HSum);
-REGISTER_REFER_KERNEL(StrideASum);
-REGISTER_REFER_KERNEL(Softmax);
-REGISTER_REFER_KERNEL(EmbSeqPool);
-REGISTER_REFER_KERNEL(Sgd);
-REGISTER_REFER_KERNEL(VBroadcast);
-
-#undef REGISTER_REFER_KERNEL
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
deleted file mode 100644
index 136b99e0aeffec8e93e11c2e5e4f7bd35dd1c8d4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ /dev/null
@@ -1,580 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <cmath>
-#include <limits>
-#include <string>
-#include "paddle/fluid/operators/jit/helper.h"
-#include "paddle/fluid/operators/jit/kernel_base.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace jit {
-namespace refer {
-
-// Refer code only focus on correctness
-template <typename T>
-void VMul(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] * y[i];
-  }
-}
-
-template <typename T>
-void VAdd(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-  }
-}
-
-template <typename T>
-void VAddRelu(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-    z[i] = z[i] > 0 ? z[i] : 0;
-  }
-}
-
-template <typename T>
-void VSub(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] - y[i];
-  }
-}
-
-template <typename T>
-void VScal(const T* a, const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a[0] * x[i];
-  }
-}
-
-template <typename T>
-void VAddBias(const T* a, const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a[0] + x[i];
-  }
-}
-
-template <typename T>
-void VCopy(const T* x, T* y, int n) {
-  std::memcpy(y, x, n * sizeof(T));
-}
-
-// x shape: (x_len)
-// y shape: (h, x_len)
-template <typename T>
-void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
-  for (int64_t h = 0; h < y_h; ++h) {
-    VCopy(x, y + h * x_len, x_len);
-  }
-}
-
-template <typename T>
-void VRelu(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
-}
-
-template <typename T>
-inline void VIdentity(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i];
-  }
-}
-
-template <typename T>
-inline void VSquare(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] * x[i];
-  }
-}
-
-template <typename T>
-void VExp(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::exp(x[i]);
-  }
-}
-
-template <typename T>
-void VSigmoid(const T* x, T* y, int n) {
-  // y = 1 / (1 + e^-x)
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
-  }
-}
-
-template <typename T>
-void VTanh(const T* x, T* y, int n) {
-  // y = 2 * sigmoid(2x) - 1
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * x[i];
-  }
-  VSigmoid(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
-  }
-}
-
-template <typename T>
-void (*getActFunc(KernelType type))(const T*, T*, int) {  // NOLINT
-  if (type == kVSigmoid) {
-    return VSigmoid<T>;
-  } else if (type == kVRelu) {
-    return VRelu<T>;
-  } else if (type == kVTanh) {
-    return VTanh<T>;
-  } else if (type == kVIdentity) {
-    return VIdentity<T>;
-  }
-  PADDLE_THROW("Not support type: %s", type);
-  return nullptr;
-}
-
-// TODO(TJ): add refer gemm and make LSTM kernels combine as same GRU kernels
-
-// compute ct and ht
-template <typename T>
-void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
-  T* gates = reinterpret_cast<T*>(step->gates);
-  const T* ct_1 = reinterpret_cast<const T*>(step->ct_1);
-  T* ct = reinterpret_cast<T*>(step->ct);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  const T* wp = reinterpret_cast<const T*>(step->wp);
-  T* checked = reinterpret_cast<T*>(step->checked);
-  auto act_gate = getActFunc<T>(attr->act_gate);
-  auto act_cand = getActFunc<T>(attr->act_cand);
-  auto act_cell = getActFunc<T>(attr->act_cell);
-  int d = attr->d;
-  int d2 = d * 2;
-  int d3 = d * 3;
-  // gates: W_ch, W_ih, W_fh, W_oh
-  if (attr->use_peephole) {
-    VMul(wp, ct_1, checked, d);
-    VMul(wp + d, ct_1, checked + d, d);
-    VAdd(checked, gates + d, gates + d, d2);
-    act_gate(gates + d, gates + d, d2);
-  } else {
-    act_gate(gates + d, gates + d, d3);
-  }
-
-  // C_t = C_t-1 * fgated + cand_gated * igated
-  act_cand(gates, gates, d);
-  VMul(gates, gates + d, gates + d, d);
-  VMul(ct_1, gates + d2, gates + d2, d);
-  VAdd(gates + d, gates + d2, ct, d);
-
-  if (attr->use_peephole) {
-    // get ogated
-    VMul(wp + d2, ct, gates + d, d);
-    VAdd(gates + d, gates + d3, gates + d3, d);
-    act_gate(gates + d3, gates + d3, d);
-  }
-  // H_t = act_cell(C_t) * ogated
-  act_cell(ct, gates + d2, d);
-  VMul(gates + d2, gates + d3, ht, d);
-}
-
-// compute c1 and h1 without c0 or h0
-template <typename T>
-void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
-  T* gates = reinterpret_cast<T*>(step->gates);
-  T* ct = reinterpret_cast<T*>(step->ct);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  auto act_gate = getActFunc<T>(attr->act_gate);
-  auto act_cand = getActFunc<T>(attr->act_cand);
-  auto act_cell = getActFunc<T>(attr->act_cell);
-  int d = attr->d;
-  int d2 = d * 2;
-  int d3 = d * 3;
-  /* C_t = igated * cgated*/
-  act_gate(gates + d, gates + d, d);
-  act_cand(gates, gates, d);
-  VMul(gates, gates + d, ct, d);
-  if (attr->use_peephole) {
-    // get outgated, put W_oc * C_t on igated
-    const T* wp = reinterpret_cast<const T*>(step->wp);
-    VMul(wp + d2, ct, gates + d, d);
-    VAdd(gates + d, gates + d3, gates + d3, d);
-  }
-  /* H_t = act_cell(C_t) * ogated */
-  act_gate(gates + d3, gates + d3, d);
-  act_cell(ct, gates + d2, d);
-  VMul(gates + d2, gates + d3, ht, d);
-}
-
-// compute h1 without h0
-template <typename T>
-void GRUH1(gru_t* step, const gru_attr_t* attr) {
-  T* gates = reinterpret_cast<T*>(step->gates);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  auto act_gate = getActFunc<T>(attr->act_gate);
-  auto act_cand = getActFunc<T>(attr->act_cand);
-  int d = attr->d;
-  int d2 = d * 2;
-  act_gate(gates, gates, d);
-  act_cand(gates + d2, gates + d2, d);
-  VMul(gates, gates + d2, ht, d);
-}
-
-// compute the first part of GRU: ht = act_gate(r) * ht_1
-template <typename T>
-void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
-  // W: {W_update, W_reset; W_state}
-  T* gates = reinterpret_cast<T*>(step->gates);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
-  auto act_gate = getActFunc<T>(attr->act_gate);
-  act_gate(gates + attr->d, gates + attr->d, attr->d);
-  VMul(ht_1, gates + attr->d, ht, attr->d);
-}
-
-// compute the second part of GRU:
-// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
-template <typename T>
-void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
-  T* gates = reinterpret_cast<T*>(step->gates);
-  T* ht = reinterpret_cast<T*>(step->ht);
-  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
-  auto act_gate = getActFunc<T>(attr->act_gate);
-  auto act_cand = getActFunc<T>(attr->act_cand);
-  int d = attr->d;
-  T* y = gates + d * 2;
-  act_gate(gates, gates, d);
-  act_cand(y, y, d);
-  // out = zt*ht~ + (1-zt)*ht_1
-  for (int i = 0; i < d; ++i) {
-    ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
-  }
-}
-
-template <typename T>
-void CRFDecoding(const int seq_len, const T* x, const T* w, T* alpha,
-                 int* track, int right) {
-  constexpr int state_trans_base_idx = 2;
-  for (int i = 0; i < right; ++i) {
-    alpha[i] = w[i] + x[i];
-  }
-  for (int k = 1; k < seq_len; ++k) {
-    for (int i = 0; i < right; ++i) {
-      T max_score = -std::numeric_limits<T>::max();
-      int max_j = 0;
-      for (int j = 0; j < right; ++j) {
-        T score = alpha[(k - 1) * right + j] +
-                  w[(j + state_trans_base_idx) * right + i];
-        if (score > max_score) {
-          max_score = score;
-          max_j = j;
-        }
-      }
-      alpha[k * right + i] = max_score + x[k * right + i];
-      track[k * right + i] = max_j;
-    }
-  }
-}
-
-template <typename T>
-void LayerNorm(T* x, T* out, T* mean, T* var, const T* scale, const T* bias,
-               int height, const float epsilon, int right) {
-  // get mean
-  for (int i = 0; i < height; i++) {
-    T sum = 0.0;
-    int offset = i * right;
-    for (int j = 0; j < right; j++) {
-      sum += x[offset + j];
-    }
-    mean[i] = sum / right;
-  }
-
-  // get variance
-  for (int i = 0; i < height; i++) {
-    T sum = 0.0;
-    int offset = i * right;
-    for (int j = 0; j < right; j++) {
-      sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]);
-    }
-    var[i] = sum / right;
-  }
-
-  for (int i = 0; i < height; i++) {
-    int offset = i * right;
-    T sqrt_var = std::sqrt(var[i] + (T)epsilon);
-    for (int j = 0; j < right; j++) {
-      out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var;
-    }
-  }
-  if (scale) {
-    for (int i = 0; i < height; i++) {
-      int offset = i * right;
-      for (int j = 0; j < right; j++) {
-        out[offset + j] *= scale[j];
-      }
-    }
-  }
-
-  if (bias) {
-    for (int i = 0; i < height; i++) {
-      int offset = i * right;
-      for (int j = 0; j < right; j++) {
-        out[offset + j] += bias[j];
-      }
-    }
-  }
-}
-
-template <typename T>
-void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) {
-  int offset = 0;
-  for (int h = 0; h < height; ++h) {
-    for (int w = 0; w < width; ++w) {
-      for (int i = 0; i < 16; ++i) {
-        z[i + offset] = y[i] * x[i + offset];
-      }
-      offset += ZMM_FLOAT_BLOCK;
-    }
-  }
-}
-
-template <typename T>
-void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
-  for (int w = 0; w < attr->w; ++w) {
-    const T* src = x + w;
-    T* dst = y + w;
-    *dst = static_cast<T>(0);
-    for (int h = 0; h < attr->h; ++h) {
-      *dst = *dst + *src;
-      src += attr->w;
-    }
-  }
-  if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
-    T scalar = static_cast<T>(1);
-    if (attr->type == SeqPoolType::kAvg) {
-      scalar = scalar / static_cast<T>(attr->h);
-    } else {
-      scalar = scalar / std::sqrt(static_cast<T>(attr->h));
-    }
-    VScal<T>(&scalar, y, y, attr->w);
-  }
-}
-
-// A(M,K) * B(K,N) = C(M,N)
-template <typename T>
-void MatMul(const T* A, const T* B, T* C, const matmul_attr_t* attr) {
-  int M = attr->m;
-  int N = attr->n;
-  int K = attr->k;
-  for (int m = 0; m < M; ++m) {
-    const T* pa = A + m * K;
-    T* pc = C + m * N;
-    for (int n = 0; n < N; ++n) {
-      const T* pb = B + n;
-      pc[n] = pa[0] * pb[0];
-      for (int k = 1; k < K; ++k) {
-        pc[n] += pa[k] * pb[k * N];
-      }
-    }
-  }
-}
-
-template <typename T>
-void HMax(const T* x, T* res, int n) {
-  res[0] = x[0];
-  for (int i = 1; i < n; ++i) {
-    res[0] = res[0] < x[i] ? x[i] : res[0];
-  }
-}
-
-template <typename T>
-void HSum(const T* x, T* res, int n) {
-  res[0] = x[0];
-  for (int i = 1; i < n; ++i) {
-    res[0] += x[i];
-  }
-}
-
-template <typename T>
-void StrideASum(const T* x, T* res, int n, int stride) {
-  res[0] = x[0];
-  for (int i = stride; i < n; i += stride) {
-    res[0] += std::abs(x[i]);
-  }
-}
-
-template <typename T>
-void StrideScal(const T* a, const T* x, T* y, int n, int stride) {
-  for (int i = 0; i < n; ++i) {
-    if (i % stride == 0) {
-      y[i] = x[i] * a[0];
-    } else {
-      y[i] = x[i];
-    }
-  }
-}
-
-// y = e^(x - max(x))
-// y = y / sum(y)
-// remain is the product of dimension shapes after the axis dimension
-template <typename T>
-void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
-  for (int i = 0; i < bs; ++i) {
-    T scalar;
-    HMax(x, &scalar, n);
-    scalar = static_cast<T>(0) - scalar;
-    VAddBias(&scalar, x, y, n);  // x - max
-    VExp(y, y, n);
-    if (remain == 1) {
-      HSum(y, &scalar, n);
-      scalar = static_cast<T>(1) / scalar;
-      VScal(&scalar, y, y, n);
-    } else {
-      for (int j = 0; j < remain; j++) {
-        StrideASum(&y[j], &scalar, n, remain);
-        scalar = static_cast<T>(1) / scalar;
-        StrideScal(&scalar, &y[j], &y[j], n, remain);
-      }
-    }
-    x += n;
-    y += n;
-  }
-}
-
-// embedding seq pool
-// table is a matrix with (tbl_h, tbl_w)
-// idx is a matrix with (idx_h, idx_w)
-// output is a vector with length tbl_w * idx_w
-template <typename T>
-void EmbSeqPool(const T* table, const int64_t* idx, T* out,
-                const emb_seq_pool_attr_t* attr) {
-  PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
-
-  auto check_idx_value_valid = [&](int64_t i) {
-    PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
-                      idx[i], i);
-    PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
-  };
-
-  for (int64_t w = 0; w != attr->index_width; ++w) {
-    check_idx_value_valid(w);
-    std::memcpy(out + w * attr->table_width, table + idx[w] * attr->table_width,
-                attr->table_width * sizeof(T));
-  }
-
-  for (int64_t h = 1; h < attr->index_height; ++h) {
-    for (int64_t w = 0; w < attr->index_width; ++w) {
-      int64_t i = h * attr->index_width + w;
-      check_idx_value_valid(i);
-      VAdd(table + idx[i] * attr->table_width, out + w * attr->table_width,
-           out + w * attr->table_width, attr->table_width);
-    }
-  }
-}
-
-// SGD algorithm:
-// lr is pointor of learning rate scalar
-// param is an input matrix with (param_h, param_w)
-// grad is an input matrix with (grad_h, grad_w), here grad_w == param_w
-// selected_rows is a vectot<int64_t> with size selected_rows_size( <= grad_h )
-// out is an output matrix with (param_h, param_w)
-//
-// support both regular and sparse grad
-// regular SGD: out[:] = param[:] - lr[0] * grad[:];
-// sparse SGD: out[rows[i]][:] = param[rows[i]][:] - lr[0] * grad[i][:]
-//
-// Note: when use sparse SGD, and if out != param,
-// the out rows which are not selected have not beed changed, which maybe empty
-template <typename T>
-void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
-         T* out, const sgd_attr_t* attr) {
-  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
-  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
-  for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
-    auto h_idx = rows[i];
-    PADDLE_ENFORCE_LT(h_idx, attr->param_height);
-    PADDLE_ENFORCE_GE(h_idx, 0);
-    for (int64_t j = 0; j < attr->grad_width; ++j) {
-      out[h_idx * attr->grad_width + j] =
-          param[h_idx * attr->grad_width + j] -
-          lr[0] * grad[i * attr->grad_width + j];
-    }
-  }
-}
-
-#define DECLARE_REFER_KERNEL(name)                          \
-  template <typename T>                                     \
-  class name##Kernel : public ReferKernel<name##Tuple<T>> { \
-   public:                                                  \
-    name##Kernel() { this->func = name<T>; }                \
-  }
-
-// const T* x, const T* y, T* z, int n
-DECLARE_REFER_KERNEL(VMul);
-DECLARE_REFER_KERNEL(VAdd);
-DECLARE_REFER_KERNEL(VAddRelu);
-DECLARE_REFER_KERNEL(VSub);
-
-// const T* a, const T* x, T* y, int n
-DECLARE_REFER_KERNEL(VScal);
-DECLARE_REFER_KERNEL(VAddBias);
-
-// const T* a, const T* x, T* y, int n, int stride
-DECLARE_REFER_KERNEL(StrideScal);
-
-// const T* x, T* y, int n
-DECLARE_REFER_KERNEL(VRelu);
-DECLARE_REFER_KERNEL(VIdentity);
-DECLARE_REFER_KERNEL(VExp);
-DECLARE_REFER_KERNEL(VSigmoid);
-DECLARE_REFER_KERNEL(VTanh);
-DECLARE_REFER_KERNEL(VSquare);
-DECLARE_REFER_KERNEL(VCopy);
-
-// lstm_t*, const lstm_attr_t*
-DECLARE_REFER_KERNEL(LSTMCtHt);
-DECLARE_REFER_KERNEL(LSTMC1H1);
-
-// gru_t*, const gru_attr_t*
-DECLARE_REFER_KERNEL(GRUH1);
-DECLARE_REFER_KERNEL(GRUHtPart1);
-DECLARE_REFER_KERNEL(GRUHtPart2);
-
-DECLARE_REFER_KERNEL(HMax);
-DECLARE_REFER_KERNEL(HSum);
-
-DECLARE_REFER_KERNEL(StrideASum);
-
-// others
-DECLARE_REFER_KERNEL(CRFDecoding);
-DECLARE_REFER_KERNEL(LayerNorm);
-DECLARE_REFER_KERNEL(NCHW16CMulNC);
-DECLARE_REFER_KERNEL(SeqPool);
-DECLARE_REFER_KERNEL(MatMul);
-DECLARE_REFER_KERNEL(Softmax);
-DECLARE_REFER_KERNEL(EmbSeqPool);
-DECLARE_REFER_KERNEL(Sgd);
-DECLARE_REFER_KERNEL(VBroadcast);
-
-#undef DECLARE_REFER_KERNEL
-
-}  // namespace refer
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h
deleted file mode 100644
index 567a903236979ff4ac6095033f53d2a473f4eb2c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/registry.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <tuple>
-#include <type_traits>
-#include <utility>  // for std::move
-#include "paddle/fluid/operators/jit/kernel_base.h"
-#include "paddle/fluid/operators/jit/kernel_pool.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/variant.h"  // for UNUSED
-
-namespace paddle {
-namespace operators {
-namespace jit {
-
-// make_unique is supported since c++14
-template <typename T, typename... Args>
-inline std::unique_ptr<T> make_unique(Args&&... args) {
-  static_assert(!std::is_array<T>::value, "T must not be array");
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-
-template <typename Pool, typename PlaceType, bool IsEnd, size_t I,
-          typename... KernelImpls>
-struct JitKernelRegistrarFunctor;
-
-template <typename Pool, typename PlaceType, size_t I, typename... KernelImpls>
-struct JitKernelRegistrarFunctor<Pool, PlaceType, true, I, KernelImpls...> {
-  void operator()(KernelType kt) const {}
-};
-
-template <typename Pool, typename PlaceType, size_t I, typename... KernelImpls>
-struct JitKernelRegistrarFunctor<Pool, PlaceType, false, I, KernelImpls...> {
-  using KERNEL_IMPL_TYPE =
-      typename std::tuple_element<I, std::tuple<KernelImpls...>>::type;
-
-  void operator()(KernelType kt) const {
-    KernelKey kkey(kt, PlaceType());
-    Pool::Instance().Insert(kkey,
-                            std::move(make_unique<const KERNEL_IMPL_TYPE>()));
-    constexpr auto size = std::tuple_size<std::tuple<KernelImpls...>>::value;
-    JitKernelRegistrarFunctor<Pool, PlaceType, I + 1 == size, I + 1,
-                              KernelImpls...>
-        func;
-    func(kt);
-  }
-};
-
-template <typename Pool, typename PlaceType, typename... KernelImpls>
-class JitKernelRegistrar {
- public:
-  explicit JitKernelRegistrar(KernelType kt) {
-    JitKernelRegistrarFunctor<Pool, PlaceType, false, 0, KernelImpls...> func;
-    func(kt);
-  }
-  void Touch() {}
-};
-
-#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg)              \
-  struct __test_global_namespace_##uniq_name##__ {};                          \
-  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
-                             __test_global_namespace_##uniq_name##__>::value, \
-                msg)
-
-// Refer always on CPUPlace
-#define REGISTER_JITKERNEL_REFER(kernel_type, ...)                             \
-  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                                    \
-      __reg_jitkernel_##kernel_type##_refer_CPUPlace,                          \
-      "REGISTER_KERNEL_REFER must be called in global namespace");             \
-  static ::paddle::operators::jit::JitKernelRegistrar<                         \
-      ::paddle::operators::jit::ReferKernelPool, ::paddle::platform::CPUPlace, \
-      __VA_ARGS__>                                                             \
-      __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_(                  \
-          ::paddle::operators::jit::KernelType::kernel_type);                  \
-  int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() {                    \
-    __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch();            \
-    return 0;                                                                  \
-  }
-
-// kernel_type: should be in paddle::operators::jit::KernelType
-// place_type: should be one of CPUPlace and GPUPlace in paddle::platform
-#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...)         \
-  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                                   \
-      __reg_jitkernel_##kernel_type##_##impl_type##_##place_type,             \
-      "REGISTER_KERNEL_MORE must be called in global namespace");             \
-  extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();             \
-  static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \
-      UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();           \
-  static ::paddle::operators::jit::JitKernelRegistrar<                        \
-      ::paddle::operators::jit::KernelPool, ::paddle::platform::place_type,   \
-      __VA_ARGS__>                                                            \
-      __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_(   \
-          ::paddle::operators::jit::KernelType::kernel_type);                 \
-  int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() {     \
-    __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_      \
-        .Touch();                                                             \
-    return 0;                                                                 \
-  }
-
-#define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \
-  REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__)
-
-#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \
-  REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__)
-
-#define REGISTER_JITKERNEL_GEN(kernel_type, ...)                    \
-  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                         \
-      __reg_jitkernel_gen_##kernel_type##_CPUPlace_,                \
-      "REGISTER_JITKERNEL_GEN must be called in global namespace"); \
-  extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();   \
-  static int __assert_gen_##kernel_type##_has_refer_ UNUSED =       \
-      TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();          \
-  static ::paddle::operators::jit::JitKernelRegistrar<              \
-      ::paddle::operators::jit::JitCodeCreatorPool,                 \
-      ::paddle::platform::CPUPlace, __VA_ARGS__>                    \
-      __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_(         \
-          ::paddle::operators::jit::KernelType::kernel_type);       \
-  int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() {           \
-    __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch();   \
-    return 0;                                                       \
-  }
-
-#define USE_JITKERNEL_GEN(kernel_type)                            \
-  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                       \
-      __reg_jitkernel_gen_##kernel_type##_CPUPlace_,              \
-      "USE_JITKERNEL_GEN must be called in global namespace");    \
-  extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_();   \
-  static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \
-      TouchJitKernelReg_gen_##kernel_type##_CPUPlace_()
-
-#define USE_JITKERNEL_REFER(kernel_type)                            \
-  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                         \
-      __reg_jitkernel_##kernel_type##_refer_CPUPlace_,              \
-      "USE_JITKERNEL_REFER must be called in global namespace");    \
-  extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();   \
-  static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \
-      TouchJitKernelReg_##kernel_type##_refer_CPUPlace_()
-
-#define USE_KERNEL_MORE(kernel_type, impl_type, place_type)              \
-  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                              \
-      __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_,     \
-      "USE_JITKERNEL_MORE must be called in global namespace");          \
-  extern int                                                             \
-      TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \
-  static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \
-      UNUSED =                                                           \
-          TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_()
-
-#define USE_JITKERNEL_MORE(kernel_type, impl_type) \
-  USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace)
-
-}  // namespace jit
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
deleted file mode 100644
index 875d4f864353c131ca4d72b5176adcae8aff724a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/jit/test.cc
+++ /dev/null
@@ -1,1382 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <iostream>
-#include <random>
-#include <string>
-#include <vector>
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/place.h"
-
-DEFINE_double(acc, 1e-5, "Test accuracy threshold.");
-
-template <typename T>
-void RandomVec(const int n, T* a, const T lower = static_cast<T>(-2.f),
-               const T upper = static_cast<T>(2.f)) {
-  static unsigned int seed = 100;
-  std::mt19937 rng(seed++);
-  std::uniform_real_distribution<double> uniform_dist(0, 1);
-  for (int i = 0; i < n; ++i) {
-    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-  }
-}
-
-template <typename T>
-void ExpectEQ(const T* target, const T* refer, size_t n) {
-  if (std::is_floating_point<T>::value) {
-    for (size_t i = 0; i < n; ++i) {
-      EXPECT_NEAR(target[i], refer[i], FLAGS_acc) << " at index : " << i;
-    }
-  } else {
-    for (size_t i = 0; i < n; ++i) {
-      EXPECT_EQ(target[i], refer[i]) << " at index : " << i;
-    }
-  }
-}
-
-std::vector<int> TestSizes() {
-  std::vector<int> s;
-  for (int i = 1; i < 32; ++i) {
-    s.push_back(i);
-  }
-  // test some large size
-  s.push_back(100);
-  s.push_back(1000);
-  s.push_back(2000);
-  return s;
-}
-
-namespace jit = paddle::operators::jit;
-using CPUPlace = paddle::platform::CPUPlace;
-
-template <typename KernelTuple, typename PlaceType, typename Tester,
-          typename... Args>
-void TestAllImpls(const typename KernelTuple::attr_type& attr,
-                  const Tester& verifier, const Args&... args) {
-  auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
-  for (auto f : funcs) {
-    VLOG(10) << "Test Kernel " << f.first;
-    verifier(f.second, args...);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelXYZN() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  for (int d : TestSizes()) {
-    auto ref = jit::GetReferFunc<KernelTuple>();
-    EXPECT_TRUE(ref != nullptr);
-
-    std::vector<T> x(d), y(d), zref(d);
-    RandomVec<T>(d, x.data());
-    RandomVec<T>(d, y.data());
-
-    std::vector<T> xinp(d), yinp(d);  // inplace test
-    std::copy(x.begin(), x.end(), xinp.begin());
-    std::copy(y.begin(), y.end(), yinp.begin());
-
-    const T* x_data = x.data();
-    const T* y_data = y.data();
-    T* zref_data = zref.data();
-    T* xinp_data = xinp.data();
-    T* yinp_data = yinp.data();
-
-    // test refer code inplace
-    ref(x_data, y_data, zref_data, d);
-    ref(x_data, yinp_data, yinp_data, d);
-    ref(xinp_data, y_data, xinp_data, d);
-    ExpectEQ<T>(xinp_data, zref_data, d);
-    ExpectEQ<T>(yinp_data, zref_data, d);
-
-    auto verifier = [](const typename KernelTuple::func_type tgt,
-                       const std::vector<T>& x, const std::vector<T>& y,
-                       const std::vector<T>& zref) {
-      EXPECT_TRUE(tgt != nullptr);
-      EXPECT_EQ(zref.size(), x.size());
-      EXPECT_EQ(zref.size(), y.size());
-      const T* x_data = x.data();
-      const T* y_data = y.data();
-      const T* zref_data = zref.data();
-      const int d = zref.size();
-
-      std::vector<T> ztgt(d);
-      T* ztgt_data = ztgt.data();
-      // test normal
-      tgt(x_data, y_data, ztgt_data, d);
-      ExpectEQ<T>(ztgt_data, zref_data, d);
-      // test inplace x
-      std::copy(x.begin(), x.end(), ztgt.begin());
-      tgt(ztgt_data, y_data, ztgt_data, d);
-      ExpectEQ<T>(ztgt_data, zref_data, d);
-      // test inplace y
-      std::copy(y.begin(), y.end(), ztgt.begin());
-      tgt(x_data, ztgt_data, ztgt_data, d);
-      ExpectEQ<T>(ztgt_data, zref_data, d);
-    };
-
-    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, y, zref);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelAXYN() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  for (int d : TestSizes()) {
-    auto ref = jit::GetReferFunc<KernelTuple>();
-    EXPECT_TRUE(ref != nullptr);
-
-    const T a = static_cast<T>(3);
-    std::vector<T> x(d), yref(d);
-    std::vector<T> xinp(d);  // inplace test
-    RandomVec<T>(d, x.data());
-    std::copy(x.begin(), x.end(), xinp.begin());
-
-    const T* x_data = x.data();
-    T* yref_data = yref.data();
-    T* xinp_data = xinp.data();
-    // test refer code inplace
-    ref(&a, x_data, yref_data, d);
-    ref(&a, xinp_data, xinp_data, d);
-    ExpectEQ<T>(xinp_data, yref_data, d);
-
-    auto verifier = [](const typename KernelTuple::func_type tgt, const T a,
-                       const std::vector<T>& x, const std::vector<T>& yref) {
-      EXPECT_TRUE(tgt != nullptr);
-      EXPECT_EQ(yref.size(), x.size());
-      const T* x_data = x.data();
-      const T* yref_data = yref.data();
-      const int d = yref.size();
-      std::vector<T> ytgt(d);
-      T* ytgt_data = ytgt.data();
-      // test normal
-      tgt(&a, x_data, ytgt_data, d);
-      ExpectEQ<T>(ytgt_data, yref_data, d);
-      // test inplace x
-      std::copy(x.begin(), x.end(), ytgt.begin());
-      tgt(&a, ytgt_data, ytgt_data, d);
-      ExpectEQ<T>(ytgt_data, yref_data, d);
-    };
-    TestAllImpls<KernelTuple, PlaceType>(d, verifier, a, x, yref);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelXYN() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  for (int d : TestSizes()) {
-    auto ref = jit::GetReferFunc<KernelTuple>();
-    EXPECT_TRUE(ref != nullptr);
-
-    std::vector<T> x(d), yref(d);
-    std::vector<T> xinp(d);  // inplace test
-    RandomVec<T>(d, x.data());
-    std::copy(x.begin(), x.end(), xinp.begin());
-
-    const T* x_data = x.data();
-    T* yref_data = yref.data();
-    T* xinp_data = xinp.data();
-    // test refer code inplace
-    ref(x_data, yref_data, d);
-    ref(xinp_data, xinp_data, d);
-    ExpectEQ<T>(xinp_data, yref_data, d);
-    auto verifier = [](const typename KernelTuple::func_type tgt,
-                       const std::vector<T>& x, const std::vector<T>& yref) {
-      EXPECT_TRUE(tgt != nullptr);
-      EXPECT_EQ(yref.size(), x.size());
-      const T* x_data = x.data();
-      const T* yref_data = yref.data();
-      const int d = yref.size();
-      std::vector<T> ytgt(d);
-      T* ytgt_data = ytgt.data();
-      // test normal
-      tgt(x_data, ytgt_data, d);
-      ExpectEQ<T>(ytgt_data, yref_data, d);
-      // test inplace x
-      std::copy(x.begin(), x.end(), ytgt.begin());
-      tgt(ytgt_data, ytgt_data, d);
-      ExpectEQ<T>(ytgt_data, yref_data, d);
-    };
-    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, yref);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelXRN() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  auto last_acc = FLAGS_acc;
-  FLAGS_acc = 1e-4;
-  for (int d : TestSizes()) {
-    auto ref = jit::GetReferFunc<KernelTuple>();
-    EXPECT_TRUE(ref != nullptr);
-    std::vector<T> x(d);
-    RandomVec<T>(d, x.data());
-    T ref_res;
-    ref(x.data(), &ref_res, d);
-
-    auto verifier = [](const typename KernelTuple::func_type tgt,
-                       const std::vector<T>& x, const T ref_res) {
-      EXPECT_TRUE(tgt != nullptr);
-      T tgt_res;
-      tgt(x.data(), &tgt_res, x.size());
-      ExpectEQ<T>(&tgt_res, &ref_res, 1);
-    };
-    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, ref_res);
-  }
-  FLAGS_acc = last_acc;
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelLSTM() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
-  for (int d : test_sizes) {
-    for (bool use_peephole : {true, false}) {
-      for (auto& act_gate : all_acts) {
-        for (auto& act_cand : all_acts) {
-          for (auto& act_cell : all_acts) {
-            const jit::lstm_attr_t attr(
-                d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand),
-                jit::to_kerneltype(act_cell), use_peephole);
-            auto ref = jit::GetReferFunc<KernelTuple>();
-            EXPECT_TRUE(ref != nullptr);
-            std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
-            std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
-            RandomVec<T>(4 * d, xsrc.data());
-            RandomVec<T>(3 * d, wp.data(), -1.f, 1.f);
-            RandomVec<T>(d, ct_1.data(), -1.f, 1.f);
-            // x could be changed after compute, so copy to save src
-            std::vector<T> x(xsrc.size());
-            std::copy(xsrc.begin(), xsrc.end(), x.begin());
-            const T* ct_1_data = ct_1.data();
-            const T* wp_data = wp.data();
-            T* x_data = x.data();
-            T* checked_data = checked.data();
-            T* ct_ref_data = ct_ref.data();
-            T* ht_ref_data = ht_ref.data();
-            jit::lstm_t step;
-            step.gates = x_data;
-            step.ct_1 = ct_1_data;
-            step.ct = ct_ref_data;
-            step.ht = ht_ref_data;
-            if (use_peephole) {
-              step.wp = wp_data;
-              step.checked = checked_data;
-            }
-            ref(&step, &attr);
-            VLOG(10) << attr;
-
-            auto verifier = [](
-                const typename KernelTuple::func_type tgt,
-                const std::vector<T>& xsrc, const std::vector<T>& wp,
-                const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
-                const std::vector<T>& ht_ref,
-                const typename KernelTuple::attr_type& attr) {
-              EXPECT_TRUE(tgt != nullptr);
-              EXPECT_EQ(ct_ref.size(), ht_ref.size());
-              EXPECT_EQ(ct_1.size(), ht_ref.size());
-              EXPECT_EQ(xsrc.size(), 4 * ht_ref.size());
-              EXPECT_EQ(wp.size(), 3 * ht_ref.size());
-
-              // x could be changed after compute, so copy to save src
-              int d = ht_ref.size();
-              std::vector<T> x(xsrc.size()), ct(ct_ref.size()),
-                  ht(ht_ref.size());
-              std::vector<T> checked(2 * d);
-              std::copy(xsrc.begin(), xsrc.end(), x.begin());
-
-              const T* ct_1_data = ct_1.data();
-              const T* wp_data = wp.data();
-              const T* ct_ref_data = ct_ref.data();
-              const T* ht_ref_data = ht_ref.data();
-              T* x_data = x.data();
-              T* ct_data = ct.data();
-              T* ht_data = ht.data();
-              T* checked_data = checked.data();
-
-              jit::lstm_t step;
-              step.gates = x_data;
-              step.ct_1 = ct_1_data;
-              step.ct = ct_data;
-              step.ht = ht_data;
-              if (attr.use_peephole) {
-                step.wp = wp_data;
-                step.checked = checked_data;
-              }
-
-              tgt(&step, &attr);
-              ExpectEQ<T>(ct_data, ct_ref_data, d);
-              ExpectEQ<T>(ht_data, ht_ref_data, d);
-            };
-            TestAllImpls<KernelTuple, PlaceType>(attr, verifier, xsrc, wp, ct_1,
-                                                 ct_ref, ht_ref, attr);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelGRU() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
-  for (int d : test_sizes) {
-    for (auto& act_gate : all_acts) {
-      for (auto& act_cand : all_acts) {
-        const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate),
-                                   jit::to_kerneltype(act_cand));
-        auto ref = jit::GetReferFunc<KernelTuple>();
-        EXPECT_TRUE(ref != nullptr);
-        std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
-        RandomVec<T>(3 * d, xsrc.data());
-        RandomVec<T>(d, ht_1.data());
-        // x could be changed after compute, so copy to save src
-        std::vector<T> x(xsrc.size());
-        std::copy(xsrc.begin(), xsrc.end(), x.begin());
-        const T* ht_1_data = ht_1.data();
-        T* x_data = x.data();
-        T* ht_ref_data = ht_ref.data();
-        jit::gru_t step;
-        step.gates = x_data;
-        step.ht_1 = ht_1_data;
-        step.ht = ht_ref_data;
-        ref(&step, &attr);
-        VLOG(10) << attr;
-        auto verifier = [](const typename KernelTuple::func_type tgt,
-                           const std::vector<T>& xsrc,
-                           const std::vector<T>& ht_1,
-                           const std::vector<T>& ht_ref,
-                           const typename KernelTuple::attr_type& attr) {
-          EXPECT_TRUE(tgt != nullptr);
-          EXPECT_EQ(ht_1.size(), ht_ref.size());
-          EXPECT_EQ(xsrc.size(), 3 * ht_ref.size());
-
-          // x could be changed after compute, so copy to save src
-          int d = ht_ref.size();
-          std::vector<T> x(xsrc.size()), ht(ht_ref.size());
-          std::copy(xsrc.begin(), xsrc.end(), x.begin());
-          const T* ht_1_data = ht_1.data();
-          const T* ht_ref_data = ht_ref.data();
-          T* x_data = x.data();
-          T* ht_data = ht.data();
-          jit::gru_t step;
-          step.gates = x_data;
-          step.ht_1 = ht_1_data;
-          step.ht = ht_data;
-          tgt(&step, &attr);
-          ExpectEQ<T>(ht_data, ht_ref_data, d);
-        };
-        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, xsrc, ht_1, ht_ref,
-                                             attr);
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelNCHW16CMulNC() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  const int n = 3, c = 16 * 4, h = 10, w = 10;
-  auto ref = jit::GetReferFunc<KernelTuple>();
-  EXPECT_TRUE(ref != nullptr);
-  int sz = n * c * h * w;
-  std::vector<T> x(sz), y(n * c), zref(sz);
-  std::vector<T> ztgt(sz), zjit(sz);
-  RandomVec<T>(sz, x.data());
-  RandomVec<T>(n * c, y.data());
-
-  const T* x_data = x.data();
-  const T* y_data = y.data();
-  T* zref_data = zref.data();
-  T* ztgt_data = ztgt.data();
-  T* zjit_data = zjit.data();
-  constexpr int simd_width = ZMM_FLOAT_BLOCK;
-  int C = c / simd_width;
-  auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(0);
-  auto funcs = jit::GetAllCandidateFuncs<KernelTuple, PlaceType>(0);
-  EXPECT_GT(funcs.size(), 0UL);
-  auto jitcode = funcs[0];
-  EXPECT_TRUE(tgt != nullptr);
-
-  if (std::is_same<T, float>::value &&
-      paddle::platform::MayIUse(paddle::platform::avx512f)) {
-    EXPECT_TRUE(jitcode != nullptr);
-  }
-  for (int ni = 0; ni < n; ni++) {
-    for (int ci = 0; ci < C; ci++) {
-      auto ptr_x =
-          x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-      auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
-      auto ptr_zref =
-          zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-      auto ptr_ztgt =
-          ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-
-      ref(ptr_x, ptr_y, ptr_zref, h, w);
-      tgt(ptr_x, ptr_y, ptr_ztgt, h, w);
-
-      if (jitcode) {
-        auto ptr_zjit =
-            zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-        jitcode(ptr_x, ptr_y, ptr_zjit, h, w);
-      }
-    }
-  }
-  ExpectEQ<T>(ztgt_data, zref_data, sz);
-  if (jitcode) {
-    ExpectEQ<T>(zjit_data, zref_data, sz);
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelLayerNorm() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  const T epsilon = 9.99999975e-06;
-  for (int n : {1, 2, 10}) {
-    for (int x_dim_0 : {1, 9, 17, 50}) {
-      int left = n * x_dim_0;
-      for (int x_dim_1 : TestSizes()) {
-        int right = x_dim_1;
-        auto ref = jit::GetReferFunc<KernelTuple>();
-        EXPECT_TRUE(ref != nullptr);
-        int sz = left * right;
-        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
-            outref(sz);
-        RandomVec<T>(sz, x.data());
-        RandomVec<T>(left, mean.data());
-        RandomVec<T>(left, var.data());
-        RandomVec<T>(right, scale.data());
-        RandomVec<T>(right, bias.data());
-
-        const T* scale_data = scale.data();
-        const T* bias_data = bias.data();
-        T* x_data = x.data();
-        T* mean_data = mean.data();
-        T* var_data = var.data();
-        T* outref_data = outref.data();
-
-        ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
-            left, epsilon, right);
-
-        auto verifier = [](
-            const typename KernelTuple::func_type tgt, const std::vector<T>& x_,
-            const std::vector<T>& outref_, const std::vector<T>& mean_,
-            const std::vector<T>& var_, const std::vector<T>& scale,
-            const std::vector<T>& bias, const int& left, const float& epsilon,
-            const typename KernelTuple::attr_type& right) {
-          EXPECT_TRUE(tgt != nullptr);
-          std::vector<T> outtgt(outref_.size());
-          std::vector<T> x(x_.size());
-          std::vector<T> mean(mean_.size());
-          std::vector<T> var(var_.size());
-          std::vector<T> outref(outref_.size());
-          std::copy(x_.begin(), x_.end(), x.begin());
-          std::copy(mean_.begin(), mean_.end(), mean.begin());
-          std::copy(var_.begin(), var_.end(), var.begin());
-          std::copy(outref_.begin(), outref_.end(), outref.begin());
-
-          EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
-          EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
-          EXPECT_EQ(mean.size(), static_cast<size_t>(left));
-          EXPECT_EQ(var.size(), static_cast<size_t>(left));
-          EXPECT_EQ(scale.size(), static_cast<size_t>(right));
-          EXPECT_EQ(bias.size(), static_cast<size_t>(right));
-
-          const T* scale_data = scale.data();
-          const T* bias_data = bias.data();
-          T* x_data = x.data();
-          T* mean_data = mean.data();
-          T* var_data = var.data();
-          T* outref_data = outref.data();
-          T* outtgt_data = outtgt.data();
-          tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data,
-              left, epsilon, right);
-          ExpectEQ<T>(outtgt_data, outref_data, left * right);
-        };
-        TestAllImpls<KernelTuple, PlaceType>(right, verifier, x, outref, mean,
-                                             var, scale, bias, left, epsilon,
-                                             right);
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelCRFDecoding() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  constexpr int state_trans_base_idx = 2;
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
-  for (int seq_len : {1, 11, 17, 50}) {
-    for (int tag_num : test_sizes) {
-      auto ref = jit::GetReferFunc<KernelTuple>();
-      EXPECT_TRUE(ref != nullptr);
-      int x_sz = seq_len * tag_num;
-      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
-      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
-      std::vector<int> trackref(x_sz);
-      RandomVec<T>(x_sz, x.data());
-      RandomVec<T>(w_sz, w.data());
-
-      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
-          trackref.data(), tag_num);
-
-      auto verifier = [](
-          const typename KernelTuple::func_type tgt, const int& seq_len,
-          const std::vector<T>& x, const std::vector<T>& w,
-          const std::vector<T>& alpharef, const std::vector<int>& trackref,
-          const typename KernelTuple::attr_type& tag_num) {
-        constexpr int state_trans_base_idx = 2;
-        EXPECT_TRUE(tgt != nullptr);
-        EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
-        EXPECT_EQ(w.size(), static_cast<size_t>(
-                                (tag_num + state_trans_base_idx) * tag_num));
-        EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
-        EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
-        std::vector<T> alphatgt(alpharef.size());
-        std::vector<int> tracktgt(trackref.size());
-        memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int));
-        tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
-            tracktgt.data(), tag_num);
-        ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
-        ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
-      };
-      TestAllImpls<KernelTuple, PlaceType>(tag_num, verifier, seq_len, x, w,
-                                           alpharef, trackref, tag_num);
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelSeqPool() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  std::vector<jit::SeqPoolType> pool_types = {
-      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
-  for (auto type : pool_types) {
-    for (int w : test_sizes) {
-      jit::seq_pool_attr_t attr(w, type);
-      for (int h : test_sizes) {
-        attr.h = h;
-        auto ref = jit::GetReferFunc<KernelTuple>();
-        EXPECT_TRUE(ref != nullptr);
-        std::vector<T> x(h * w), yref(w);
-        RandomVec<T>(h * w, x.data());
-        const T* x_data = x.data();
-        T* yref_data = yref.data();
-        ref(x_data, yref_data, &attr);
-        VLOG(10) << attr;
-        auto verifier = [](const typename KernelTuple::func_type tgt,
-                           const std::vector<T>& x, const std::vector<T>& yref,
-                           const typename KernelTuple::attr_type& attr) {
-          EXPECT_TRUE(tgt != nullptr);
-          EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
-          int w = yref.size();
-          std::vector<T> y(w);
-          const T* x_data = x.data();
-          const T* yref_data = yref.data();
-          T* y_data = y.data();
-          tgt(x_data, y_data, &attr);
-          ExpectEQ<T>(y_data, yref_data, w);
-        };
-        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, x, yref, attr);
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelEmbSeqPool() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  int64_t tbl_h = 1e4;
-  std::vector<jit::SeqPoolType> pool_types = {
-      jit::SeqPoolType::kSum};  // only support sum yet
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
-  for (int tbl_w : test_sizes) {
-    std::vector<T> table(tbl_h * tbl_w);
-    RandomVec<T>(tbl_h * tbl_w, table.data());
-    const T* table_data = table.data();
-    for (auto type : pool_types) {
-      for (int idx_w : {1, 2, 10, 16}) {
-        for (int idx_h : {1, 2, 9, 13, 16}) {
-          auto ref = jit::GetReferFunc<KernelTuple>();
-          EXPECT_TRUE(ref != nullptr);
-          std::vector<int64_t> idx(idx_h * idx_w);
-          RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
-          int64_t out_w = tbl_w * idx_w;
-          std::vector<T> oref(out_w);
-          const int64_t* idx_data = idx.data();
-          T* o_data = oref.data();
-          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
-                                        type);
-          ref(table_data, idx_data, o_data, &attr);
-
-          auto verifier = [](const typename KernelTuple::func_type tgt,
-                             const std::vector<T>& table,
-                             const std::vector<int64_t>& idx,
-                             const std::vector<T>& oref,
-                             const typename KernelTuple::attr_type& attr) {
-            EXPECT_TRUE(tgt != nullptr);
-            EXPECT_EQ(table.size(), static_cast<size_t>(attr.table_height *
-                                                        attr.table_width));
-            EXPECT_EQ(idx.size(), static_cast<size_t>(attr.index_height *
-                                                      attr.index_width));
-            EXPECT_EQ(oref.size(),
-                      static_cast<size_t>(attr.table_width * attr.index_width));
-            const T* table_data = table.data();
-            const int64_t* idx_data = idx.data();
-            const T* oref_data = oref.data();
-            int o_w = oref.size();
-            std::vector<T> out(o_w);
-            T* o_data = out.data();
-            tgt(table_data, idx_data, o_data, &attr);
-            ExpectEQ<T>(o_data, oref_data, o_w);
-          };
-          TestAllImpls<KernelTuple, PlaceType>(attr, verifier, table, idx, oref,
-                                               attr);
-        }
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelMatMul() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  auto last_acc = FLAGS_acc;
-  // export MKL_CBWR=AVX would make MKL force to use AVX
-  // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic
-  FLAGS_acc = 1e-3;
-  for (int m : {1, 2, 3, 4}) {
-    for (int n : {1, 2, 3, 4}) {
-      for (int k : TestSizes()) {
-        auto ref = jit::GetReferFunc<KernelTuple>();
-        EXPECT_TRUE(ref != nullptr);
-        std::vector<T> a(m * k), b(k * n), c(m * n);
-        RandomVec<T>(m * k, a.data());
-        RandomVec<T>(k * n, b.data());
-        const T* a_data = a.data();
-        const T* b_data = b.data();
-        T* c_data = c.data();
-        const jit::matmul_attr_t attr{m, n, k};
-        ref(a_data, b_data, c_data, &attr);
-        auto verifier = [](const typename KernelTuple::func_type tgt,
-                           const std::vector<T>& a, const std::vector<T>& b,
-                           const std::vector<T>& cref,
-                           const typename KernelTuple::attr_type& attr) {
-          EXPECT_TRUE(tgt != nullptr);
-          EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
-          EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
-          EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
-          std::vector<T> c(cref.size());
-          const T* a_data = a.data();
-          const T* b_data = b.data();
-          const T* cref_data = cref.data();
-          T* c_data = c.data();
-          tgt(a_data, b_data, c_data, &attr);
-          ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
-        };
-        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, a, b, c, attr);
-      }
-    }
-  }
-  FLAGS_acc = last_acc;
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelSoftmax() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  for (int bs : {1, 2, 10}) {
-    for (int n : TestSizes()) {
-      for (int m : {1, 2, 3}) {  // remain
-        if (m > n || n % m != 0) {
-          continue;
-        }
-        auto ref = jit::GetReferFunc<KernelTuple>();
-        EXPECT_TRUE(ref != nullptr);
-        std::vector<T> x(bs * n), y(bs * n);
-        RandomVec<T>(bs * n, x.data());
-        const T* x_data = x.data();
-        T* y_data = y.data();
-
-        std::vector<T> xinp(x.size());  // inplace test
-        std::copy(x.begin(), x.end(), xinp.begin());
-        ref(x_data, y_data, n, bs, m);
-        T* xinp_data = xinp.data();
-        ref(xinp_data, xinp_data, n, bs, m);
-        ExpectEQ<T>(xinp_data, y_data, n * bs);
-
-        auto verifier = [](const typename KernelTuple::func_type tgt,
-                           const std::vector<T>& x, const std::vector<T>& yref,
-                           int n, int bs, int m) {
-          EXPECT_TRUE(tgt != nullptr);
-          EXPECT_EQ(yref.size(), x.size());
-          EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
-          const T* x_data = x.data();
-          const T* yref_data = yref.data();
-          std::vector<T> ytgt(n * bs);
-          T* ytgt_data = ytgt.data();
-          // test normal
-          tgt(x_data, ytgt_data, n, bs, m);
-          ExpectEQ<T>(ytgt_data, yref_data, n * bs);
-          // test inplace x
-          std::copy(x.begin(), x.end(), ytgt.begin());
-          tgt(ytgt_data, ytgt_data, n, bs, m);
-          ExpectEQ<T>(ytgt_data, yref_data, n * bs);
-        };
-        TestAllImpls<KernelTuple, PlaceType>(n, verifier, x, y, n, bs, m);
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelStrideASum() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  for (int d : TestSizes()) {
-    for (int m : {1, 2, 3}) {  // stride
-      if (m > d || d % m != 0) {
-        continue;
-      }
-      auto ref = jit::GetReferFunc<KernelTuple>();
-      EXPECT_TRUE(ref != nullptr);
-      std::vector<T> x(d);
-      RandomVec<T>(d, x.data());
-      T ref_res;
-      ref(x.data(), &ref_res, d, m);
-
-      auto verifier = [](const typename KernelTuple::func_type tgt,
-                         const std::vector<T>& x, const T ref_res,
-                         const int m) {
-        EXPECT_TRUE(tgt != nullptr);
-        T tgt_res;
-        tgt(x.data(), &tgt_res, x.size(), m);
-        ExpectEQ<T>(&tgt_res, &ref_res, 1);
-      };
-      TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, ref_res, m);
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelStrideScal() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  for (int d : TestSizes()) {
-    for (int m : {1, 2, 3}) {  // stride
-      if (m > d || d % m != 0) {
-        continue;
-      }
-      auto ref = jit::GetReferFunc<KernelTuple>();
-      EXPECT_TRUE(ref != nullptr);
-
-      const T a = static_cast<T>(3);
-      std::vector<T> x(d), yref(d);
-      std::vector<T> xinp(d);  // inplace test
-      RandomVec<T>(d, x.data());
-      std::copy(x.begin(), x.end(), xinp.begin());
-
-      const T* x_data = x.data();
-      T* yref_data = yref.data();
-      T* xinp_data = xinp.data();
-      // test refer code inplace
-      ref(&a, x_data, yref_data, d, m);
-      ref(&a, xinp_data, xinp_data, d, m);
-      ExpectEQ<T>(xinp_data, yref_data, d);
-
-      auto verifier = [](const typename KernelTuple::func_type tgt, const T a,
-                         const std::vector<T>& x, const std::vector<T>& yref,
-                         const int m) {
-        EXPECT_TRUE(tgt != nullptr);
-        EXPECT_EQ(yref.size(), x.size());
-        const T* x_data = x.data();
-        const T* yref_data = yref.data();
-        const int d = yref.size();
-        std::vector<T> ytgt(d);
-        T* ytgt_data = ytgt.data();
-        // test normal
-        tgt(&a, x_data, ytgt_data, d, m);
-        ExpectEQ<T>(ytgt_data, yref_data, d);
-        // test inplace x
-        std::copy(x.begin(), x.end(), ytgt.begin());
-        tgt(&a, ytgt_data, ytgt_data, d, m);
-        ExpectEQ<T>(ytgt_data, yref_data, d);
-      };
-      TestAllImpls<KernelTuple, PlaceType>(d, verifier, a, x, yref, m);
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelSgd() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  const T lr = 0.1;
-  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
-                                  const int64_t upper) -> std::vector<int64_t> {
-    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
-    PADDLE_ENFORCE_GT(n, 0);
-    std::vector<int64_t> all, out;
-    for (int i = 0; i < n; ++i) {
-      all.push_back(i);
-    }
-    std::random_shuffle(all.begin(), all.end());
-    out.insert(out.begin(), all.begin(), all.begin() + n);
-    return out;
-  };
-  for (int param_h : {1, 10}) {
-    for (int grad_w : TestSizes()) {
-      std::vector<T> param(param_h * grad_w);
-      std::vector<T> param_out(param_h * grad_w);
-      RandomVec<T>(param_h * grad_w, param.data());
-      const T* param_data = param.data();
-      T* out_data = param_out.data();
-      for (int rows_size = 1; rows_size <= param_h; ++rows_size) {
-        std::vector<T> grad(rows_size * grad_w);
-        std::vector<int64_t> rows =
-            UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
-        RandomVec<T>(rows_size * grad_w, grad.data());
-        const int64_t* rows_data = rows.data();
-        const T* grad_data = grad.data();
-        auto ref = jit::GetReferFunc<KernelTuple>();
-        EXPECT_TRUE(ref != nullptr);
-        jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
-        ref(&lr, param_data, grad_data, rows_data, out_data, &attr);
-
-        // inplace test
-        std::vector<T> inp(param.size());
-        std::copy(param.begin(), param.end(), inp.begin());
-        T* inp_data = inp.data();
-        ref(&lr, inp_data, grad_data, rows_data, inp_data, &attr);
-        // only the selected rows should be equal
-        for (int i = 0; i < rows_size; ++i) {
-          ExpectEQ<T>(inp_data + rows[i] * grad_w, out_data + rows[i] * grad_w,
-                      grad_w);
-        }
-
-        auto verifier = [](
-            const typename KernelTuple::func_type tgt, const T lr,
-            const std::vector<T>& param, const std::vector<T>& grad,
-            const std::vector<int64_t>& rows, const std::vector<T>& oref,
-            const typename KernelTuple::attr_type& attr) {
-          EXPECT_TRUE(tgt != nullptr);
-          EXPECT_EQ(param.size(),
-                    static_cast<size_t>(attr.param_height * attr.param_width));
-          EXPECT_EQ(grad.size(),
-                    static_cast<size_t>(attr.grad_height * attr.grad_width));
-          EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
-          EXPECT_EQ(param.size(), oref.size());
-          const T* param_data = param.data();
-          const T* grad_data = grad.data();
-          const int64_t* rows_data = rows.data();
-          const T* oref_data = oref.data();
-
-          std::vector<T> out(oref.size());
-          T* o_data = out.data();
-          tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
-          // only the selected rows should be equal
-          for (size_t i = 0; i < rows.size(); ++i) {
-            ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
-                        oref_data + rows[i] * attr.grad_width, attr.grad_width);
-          }
-
-          // inplace
-          std::copy(param.begin(), param.end(), out.begin());
-          tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
-          for (size_t i = 0; i < rows.size(); ++i) {
-            ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
-                        oref_data + rows[i] * attr.grad_width, attr.grad_width);
-          }
-        };
-        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, lr, param, grad,
-                                             rows, param_out, attr);
-      }
-    }
-  }
-}
-
-template <typename KernelTuple, typename PlaceType>
-void TestKernelVBroadcast() {
-  using T = typename KernelTuple::data_type;
-  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  for (int w : TestSizes()) {
-    std::vector<T> x(w);
-    RandomVec<T>(w, x.data());
-    const T* x_data = x.data();
-    for (int64_t h : {1, 2, 6}) {
-      auto ref = jit::GetReferFunc<KernelTuple>();
-      EXPECT_TRUE(ref != nullptr);
-      std::vector<T> y(w * h);
-      T* y_data = y.data();
-      ref(x_data, y_data, h, w);
-
-      auto verifier = [](const typename KernelTuple::func_type tgt,
-                         const std::vector<T>& x, const std::vector<T>& yref,
-                         const int64_t& h,
-                         const typename KernelTuple::attr_type& attr) {
-        EXPECT_TRUE(tgt != nullptr);
-        EXPECT_EQ(x.size(), static_cast<size_t>(attr));
-        EXPECT_EQ(yref.size(), x.size() * h);
-        std::vector<T> y(yref.size());
-        const T* x_data = x.data();
-        const T* yref_data = yref.data();
-        T* y_data = y.data();
-        tgt(x_data, y_data, h, attr);
-        ExpectEQ<T>(y_data, yref_data, yref.size());
-      };
-      TestAllImpls<KernelTuple, PlaceType>(static_cast<int64_t>(w), verifier, x,
-                                           y, h, static_cast<int64_t>(w));
-    }
-  }
-}
-
-// test pool
-TEST(JITKernel_pool, jitcreator) {
-  const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators();
-#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
-  EXPECT_EQ(jitcreators.size(), 0UL);
-#else
-  EXPECT_EQ(jitcreators.size(), 25UL);
-#endif
-}
-
-TEST(JITKernel_pool, jitpool) {
-  // jitpool is related with attr
-  const auto& kers = jit::JitCodePool<jit::kVAdd>().Instance().AllKernels();
-  EXPECT_EQ(kers.size(), 0UL);
-  jit::GetAllCandidateKernels<jit::VAddTuple<float>, CPUPlace>(3);
-// after call GetAllCandidateKernels, it will create jitcode Automatically
-#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
-  EXPECT_EQ(kers.size(), 0UL);
-#else
-  EXPECT_EQ(kers.size(), 1UL);
-#endif
-}
-
-TEST(JITKernel_pool, more) {
-  const auto& kers = jit::KernelPool::Instance().AllKernels();
-  size_t target_num = 8;
-
-#ifdef __AVX__
-  target_num += 2;
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-  target_num += 12;
-#endif
-
-  EXPECT_EQ(kers.size(), target_num);
-}
-
-TEST(JITKernel_pool, refer) {
-  const auto& kers = jit::ReferKernelPool::Instance().AllKernels();
-  EXPECT_EQ(kers.size(), 31UL);
-}
-
-// test helper
-TEST(JITKernel_helper, GetAllCandidateKernels) {
-  auto fp_kers =
-      jit::GetAllCandidateKernels<jit::VExpTuple<float>, CPUPlace>(10);
-#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
-  EXPECT_GE(fp_kers.size(), 1UL);  // refer
-#else
-#ifdef PADDLE_WITH_MKLML
-  EXPECT_GE(fp_kers.size(), 3UL);  // jitcode, mkl, refer
-#else
-  EXPECT_GE(fp_kers.size(), 2UL);  // jitcode, refer
-#endif
-#endif
-
-  auto db_kers =
-      jit::GetAllCandidateKernels<jit::VExpTuple<double>, CPUPlace>(10);
-#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
-  EXPECT_GE(db_kers.size(), 1UL);  // refer
-#else
-#ifdef PADDLE_WITH_MKLML
-  EXPECT_GE(db_kers.size(), 2UL);  // mkl, refer
-#else
-  EXPECT_GE(db_kers.size(), 1UL);  // refer
-#endif
-#endif
-}
-
-TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) {
-  auto fp_kers =
-      jit::GetAllCandidateFuncsWithTypes<jit::VExpTuple<float>, CPUPlace>(10);
-#if defined(__APPLE__) || defined(__OSX__)
-  EXPECT_GE(fp_kers.size(), 1UL);  // refer
-#else
-#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32)
-  EXPECT_GE(fp_kers.size(), 2UL);  // jitcode/mkl, refer
-#else
-  EXPECT_GE(fp_kers.size(), 3UL);  // jitcode, mkl, refer
-#endif
-#endif
-
-  auto db_kers =
-      jit::GetAllCandidateFuncsWithTypes<jit::VExpTuple<double>, CPUPlace>(10);
-#if defined(__APPLE__) || defined(__OSX__) || !defined(PADDLE_WITH_MKLML)
-  EXPECT_GE(db_kers.size(), 1UL);  // refer
-#else
-  EXPECT_GE(db_kers.size(), 2UL);  // mkl, refer
-#endif
-}
-
-TEST(JITKernel_helper, KernelFuncs) {
-  auto f1 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache().At(3);
-  auto f2 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache()[3];
-  EXPECT_TRUE(f1 != nullptr);
-  EXPECT_TRUE(f1 == f2);
-
-  auto f3 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache()[5];
-#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
-  EXPECT_TRUE(f2 == f3);
-#else
-  EXPECT_TRUE(f2 != f3);
-#endif
-}
-
-TEST(JITKernel_helper, GetAllCandidateFuncs) {
-  auto funcs = jit::GetAllCandidateFuncs<jit::VExpTuple<float>, CPUPlace>(10);
-  auto kers = jit::GetAllCandidateKernels<jit::VExpTuple<float>, CPUPlace>(10);
-  EXPECT_EQ(funcs.size(), kers.size());
-
-  std::vector<float> x(10), tgt(10);
-  RandomVec<float>(10, x.data());
-  auto best = jit::GetDefaultBestFunc<jit::VExpTuple<float>, CPUPlace>(10);
-  best(x.data(), tgt.data(), 10);
-  for (auto f : funcs) {
-    std::vector<float> y(10);
-    f(x.data(), y.data(), 10);
-    ExpectEQ<float>(y.data(), tgt.data(), 10);
-  }
-}
-
-TEST(JITKernel_helper, pack_weights) {
-  const int N = 8 * 60, K = 2;
-  float src[K][N], yref[K][N], y[K * N];
-  float* x = &(src[0][0]);
-  float* ref = &(yref[0][0]);
-  for (int i = 0; i < N * K; ++i) {
-    *(x + i) = static_cast<float>(i);
-  }
-  int block = 0;
-  std::vector<int> groups;
-  if (paddle::platform::MayIUse(paddle::platform::avx512f)) {
-    block = ZMM_FLOAT_BLOCK;
-    groups.push_back(30);
-  } else {
-    block = YMM_FLOAT_BLOCK;
-    groups.insert(groups.end(), {14, 14, 14, 14, 4});
-  }
-
-  int offset = 0;
-  int acc = 0;
-  for (int g : groups) {
-    g = g * block;
-    for (int k = 0; k < K; ++k) {
-      for (int i = 0; i < g; ++i) {
-        *(ref + offset) = src[k][i + acc];
-        offset++;
-      }
-    }
-    acc += g;
-  }
-
-  jit::pack_weights<float>(x, y, N, K);
-  ExpectEQ<float>(y, ref, N * K);
-}
-
-TEST(JITKernel_helper, attr) {
-  std::ostringstream out;
-  // KernelTypes
-  out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding)
-      << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1)
-      << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2)
-      << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax)
-      << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1)
-      << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul)
-      << jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool)
-      << jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd)
-      << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu)
-      << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy)
-      << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity)
-      << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu)
-      << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd)
-      << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare)
-      << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh);
-  EXPECT_EQ(out.str().size(), 234);
-
-  // SeqPoolTypes
-  out.str("");
-  out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg)
-      << jit::to_string(jit::kSqrt);
-  EXPECT_EQ(out.str().size(), 13);
-
-  EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu);
-  EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity);
-  EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp);
-  EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid);
-  EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh);
-
-  out.str("");
-  out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
-  EXPECT_EQ(out.str().size(), 89);
-
-  out.str("");
-  out << jit::gru_attr_t(8, jit::kVIdentity, jit::kVSigmoid);
-  EXPECT_EQ(out.str().size(), 52);
-
-  out.str("");
-  out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum);
-  EXPECT_EQ(out.str().size(), 44);
-
-  out.str("");
-  out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg);
-  EXPECT_EQ(out.str().size(), 93);
-
-  out.str("");
-  out << jit::sgd_attr_t(1, 2, 3, 4, 5);
-  EXPECT_EQ(out.str().size(), 81);
-
-  out.str("");
-  out << jit::matmul_attr_t(1, 2, 3);
-  EXPECT_EQ(out.str().size(), 14);
-}
-
-// test keys
-TEST(JITKernel_key, int) {
-  EXPECT_TRUE(jit::JitCodeKey<int>(2) == jit::JitCodeKey<int>(2));
-  EXPECT_TRUE(jit::JitCodeKey<int>(2) == jit::JitCodeKey<int64_t>(2));
-  EXPECT_TRUE(jit::JitCodeKey<int>(2) != jit::JitCodeKey<int>(3));
-}
-
-TEST(JITKernel_key, gru) {
-  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
-  jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh);
-  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
-  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);
-  jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity);
-
-  auto key1 = jit::JitCodeKey<jit::gru_attr_t>(attr1);
-  auto key2 = jit::JitCodeKey<jit::gru_attr_t>(attr2);
-  auto key3 = jit::JitCodeKey<jit::gru_attr_t>(attr3);
-  auto key4 = jit::JitCodeKey<jit::gru_attr_t>(attr4);
-  auto key5 = jit::JitCodeKey<jit::gru_attr_t>(attr5);
-
-  EXPECT_TRUE(key1 == key2);
-  EXPECT_TRUE(key2 != key3);
-  EXPECT_TRUE(key2 != key4);
-  EXPECT_TRUE(key2 != key5);
-  EXPECT_TRUE(key3 != key4);
-  EXPECT_TRUE(key3 != key5);
-  EXPECT_TRUE(key4 != key5);
-}
-
-TEST(JITKernel_key, lstm) {
-  jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
-  jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
-  jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
-  jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh);
-  jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);
-  jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);
-
-  auto key1 = jit::JitCodeKey<jit::lstm_attr_t>(attr1);
-  auto key2 = jit::JitCodeKey<jit::lstm_attr_t>(attr2);
-  auto key3 = jit::JitCodeKey<jit::lstm_attr_t>(attr3);
-  auto key4 = jit::JitCodeKey<jit::lstm_attr_t>(attr4);
-  auto key5 = jit::JitCodeKey<jit::lstm_attr_t>(attr5);
-  auto key6 = jit::JitCodeKey<jit::lstm_attr_t>(attr6);
-
-  EXPECT_TRUE(key1 == key2);
-  EXPECT_TRUE(key2 != key3);
-  EXPECT_TRUE(key2 != key4);
-  EXPECT_TRUE(key2 != key5);
-  EXPECT_TRUE(key3 != key4);
-  EXPECT_TRUE(key3 != key5);
-  EXPECT_TRUE(key4 != key5);
-  EXPECT_TRUE(key5 == key6);
-}
-
-TEST(JITKernel_key, seq_pool) {
-  jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1);
-  jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3);
-  jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3);
-  jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3);
-
-  auto key1 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr1);
-  auto key2 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr2);
-  auto key3 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr3);
-  auto key4 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr4);
-
-  EXPECT_TRUE(key1 == key2);
-  EXPECT_TRUE(key2 != key3);
-  EXPECT_TRUE(key2 != key4);
-  EXPECT_TRUE(key3 != key4);
-}
-
-TEST(JITKernel_key, matmul) {
-  jit::matmul_attr_t attr1(1, 2, 3);
-  jit::matmul_attr_t attr2(1, 2, 3);
-  jit::matmul_attr_t attr3(1, 3, 3);
-  jit::matmul_attr_t attr4(2, 3, 4);
-
-  auto key1 = jit::JitCodeKey<jit::matmul_attr_t>(attr1);
-  auto key2 = jit::JitCodeKey<jit::matmul_attr_t>(attr2);
-  auto key3 = jit::JitCodeKey<jit::matmul_attr_t>(attr3);
-  auto key4 = jit::JitCodeKey<jit::matmul_attr_t>(attr4);
-
-  EXPECT_TRUE(key1 == key2);
-  EXPECT_TRUE(key2 != key3);
-  EXPECT_TRUE(key2 != key4);
-  EXPECT_TRUE(key3 != key4);
-}
-
-TEST(JITKernel_key, emb_seq_pool) {
-  jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
-  jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
-  jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg);
-  jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum);
-  jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum);
-
-  auto key1 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr1);
-  auto key2 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr2);
-  auto key3 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr3);
-  auto key4 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr4);
-  auto key5 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr5);
-
-  EXPECT_TRUE(key1 == key2);
-  EXPECT_TRUE(key2 == key3);
-  EXPECT_TRUE(key2 != key4);
-  EXPECT_TRUE(key2 != key5);
-  EXPECT_TRUE(key4 != key5);
-}
-
-TEST(JITKernel_key, sgd) {
-  jit::sgd_attr_t attr1(1, 2, 3, 4, 5);
-  jit::sgd_attr_t attr2(1, 2, 3, 4, 5);
-  jit::sgd_attr_t attr3(9, 8, 7, 4, 6);
-  jit::sgd_attr_t attr4(1, 2, 3, 6, 5);
-  jit::sgd_attr_t attr5(10, 9, 8, 7, 6);
-
-  auto key1 = jit::JitCodeKey<jit::sgd_attr_t>(attr1);
-  auto key2 = jit::JitCodeKey<jit::sgd_attr_t>(attr2);
-  auto key3 = jit::JitCodeKey<jit::sgd_attr_t>(attr3);
-  auto key4 = jit::JitCodeKey<jit::sgd_attr_t>(attr4);
-  auto key5 = jit::JitCodeKey<jit::sgd_attr_t>(attr5);
-
-  EXPECT_TRUE(key1 == key2);
-  EXPECT_TRUE(key2 == key3);
-  EXPECT_TRUE(key3 != key4);
-  EXPECT_TRUE(key3 != key5);
-  EXPECT_TRUE(key4 != key5);
-}
-
-// test kernerls
-#define TestKernelVMul TestKernelXYZN
-#define TestKernelVAdd TestKernelXYZN
-#define TestKernelVAddRelu TestKernelXYZN
-#define TestKernelVSub TestKernelXYZN
-
-#define TestKernelVScal TestKernelAXYN
-#define TestKernelVAddBias TestKernelAXYN
-
-#define TestKernelVRelu TestKernelXYN
-#define TestKernelVIdentity TestKernelXYN
-#define TestKernelVSquare TestKernelXYN
-#define TestKernelVExp TestKernelXYN
-#define TestKernelVSigmoid TestKernelXYN
-#define TestKernelVTanh TestKernelXYN
-#define TestKernelVCopy TestKernelXYN
-
-#define TestKernelHMax TestKernelXRN
-#define TestKernelHSum TestKernelXRN
-
-#define TestKernelLSTMCtHt TestKernelLSTM
-#define TestKernelLSTMC1H1 TestKernelLSTM
-
-#define TestKernelGRUH1 TestKernelGRU
-#define TestKernelGRUHtPart1 TestKernelGRU
-#define TestKernelGRUHtPart2 TestKernelGRU
-
-#define TEST_CPU_KERNEL(kernel_type)                                      \
-  TEST(JITKernel, kernel_type) {                                          \
-    TestKernel##kernel_type<jit::kernel_type##Tuple<float>, CPUPlace>();  \
-    TestKernel##kernel_type<jit::kernel_type##Tuple<double>, CPUPlace>(); \
-  }
-
-TEST_CPU_KERNEL(VMul);
-TEST_CPU_KERNEL(VAdd);
-TEST_CPU_KERNEL(VAddRelu);
-TEST_CPU_KERNEL(VSub);
-
-TEST_CPU_KERNEL(VScal);
-TEST_CPU_KERNEL(VAddBias);
-
-TEST_CPU_KERNEL(VRelu);
-TEST_CPU_KERNEL(VIdentity);
-TEST_CPU_KERNEL(VSquare);
-TEST_CPU_KERNEL(VExp);
-TEST_CPU_KERNEL(VSigmoid);
-TEST_CPU_KERNEL(VTanh);
-TEST_CPU_KERNEL(VCopy);
-
-TEST_CPU_KERNEL(HMax);
-TEST_CPU_KERNEL(HSum);
-
-TEST_CPU_KERNEL(LSTMCtHt);
-TEST_CPU_KERNEL(LSTMC1H1);
-
-TEST_CPU_KERNEL(GRUH1);
-TEST_CPU_KERNEL(GRUHtPart1);
-TEST_CPU_KERNEL(GRUHtPart2);
-
-TEST_CPU_KERNEL(NCHW16CMulNC);
-TEST_CPU_KERNEL(LayerNorm);
-TEST_CPU_KERNEL(CRFDecoding);
-
-TEST_CPU_KERNEL(SeqPool);
-TEST_CPU_KERNEL(EmbSeqPool);
-TEST_CPU_KERNEL(MatMul);
-TEST_CPU_KERNEL(Softmax);
-TEST_CPU_KERNEL(Sgd);
-TEST_CPU_KERNEL(VBroadcast);
-
-TEST_CPU_KERNEL(StrideASum);
-TEST_CPU_KERNEL(StrideScal);
diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc
deleted file mode 100644
index a7c5d6305b09afb93be0b3b8524a91bd53e719fe..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/kldiv_loss_op.cc
+++ /dev/null
@@ -1,173 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/kldiv_loss_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class KLDivLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of KLDivLossOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Target"),
-                   "Input(Target) of KLDivLossOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
-                   "Output(Loss) of KLDivLossOp should not be null.");
-
-    auto dim_x = ctx->GetInputDim("X");
-    auto dim_target = ctx->GetInputDim("Target");
-    PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
-                      "Input(X) rank and Input(Target) rank should be same.");
-    for (int i = 0; i < dim_x.size(); i++) {
-      if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) {
-        PADDLE_ENFORCE_EQ(dim_x[i], dim_target[i],
-                          "Input(X) and Input(Target) should in same shape.");
-      }
-    }
-
-    auto reduction = ctx->Attrs().Get<std::string>("reduction");
-
-    PADDLE_ENFORCE(
-        "mean" == reduction || "sum" == reduction || "batchmean" == reduction ||
-            "none" == reduction,
-        "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.");
-
-    if ("none" == reduction) {
-      ctx->SetOutputDim("Loss", dim_x);
-    } else {
-      ctx->SetOutputDim("Loss", {1});
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class KLDivLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of KL divergence loss operator. "
-             "This is a tensor with shape of [N, *], where N is the "
-             "batch size, * means any number of additional dimensions.");
-    AddInput("Target",
-             "The  tensor of KL divergence loss operator. "
-             "This is a tensor with shape of Input(X).");
-    AddOutput(
-        "Loss",
-        "The output KL divergence loss tensor. if Attr(reduction) is "
-        "'none', this tensor should be in same shape of of Input(X), else "
-        "this tensor should be in shape of [1].");
-
-    AddAttr<std::string>(
-        "reduction",
-        "The reduction type to apply to the output, available types "
-        "are 'none' | 'batchmean' | 'mean' | 'sum', 'none' for no "
-        "reduction, 'batchmean' for the sum of output divided by "
-        "batch size, 'mean' for the average value of all output, "
-        "'sum' for the sum of the output.")
-        .SetDefault("mean");
-
-    AddComment(R"DOC(
-         This operator calculates the Kullback-Leibler divergence loss
-         between Input(X) and Input(Target).
-
-         KL divergence loss is calculated as follows:
-
-         $$l(x, y) = y * (\log(y) - x)$$
-
-         While :math:`x` is Input(X) and :math:`y` is Input(Target).
-
-         While :attr:`reduction` is :attr:`none`, output loss is in
-         the same shape as Input(X), loss in each point is calculated 
-         seperately and no reduction is applied.
-         
-         While :attr:`reduction` is :attr:`mean`, output loss is in
-         shape of [1] and loss value is the mean value of all losses.
-         
-         While :attr:`reduction` is :attr:`sum`, output loss is in
-         shape of [1] and loss value is the sum value of all losses.
-         
-         While :attr:`reduction` is :attr:`batchmean`, output loss is 
-         in shape of [1] and loss value is the sum value of all losses
-         divided by batch size.
-         
-         )DOC");
-  }
-};
-
-class KLDivLossOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Target"), "Input(Target) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
-                   "Input(Loss@GRAD) should not be null");
-    auto dim_x = ctx->GetInputDim("X");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class KLDivLossOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType("kldiv_loss_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Target", Input("Target"));
-    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker,
-                  ops::KLDivLossOpGradMaker);
-REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    kldiv_loss, ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    kldiv_loss_grad,
-    ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/kldiv_loss_op.cu b/paddle/fluid/operators/kldiv_loss_op.cu
deleted file mode 100644
index 5226cb8c08e3db4a0bfbbe4440c27264903f06e3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/kldiv_loss_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/kldiv_loss_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    kldiv_loss,
-    ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    kldiv_loss_grad,
-    ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h
deleted file mode 100644
index 625e16e298d9f842fa621aca727c6df2cb045301..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/kldiv_loss_op.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-using Array1 = Eigen::DSizes<int64_t, 1>;
-
-template <typename T>
-struct KLDivLossForward {
-  HOSTDEVICE KLDivLossForward() {}
-
-  HOSTDEVICE T operator()(const T& target, const T& input) const {
-    if (target <= 0) {
-      return 0;
-    } else {
-      return target * (std::log(target) - input);
-    }
-  }
-};
-
-template <typename T>
-struct KLDivLossBackward {
-  HOSTDEVICE KLDivLossBackward() {}
-
-  HOSTDEVICE T operator()(const T& target, const T& grad) const {
-    if (target <= 0) {
-      return 0;
-    } else {
-      return static_cast<T>(-1.) * grad;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KLDivLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto* input = ctx.Input<Tensor>("X");
-    auto* target = ctx.Input<Tensor>("Target");
-    auto* loss = ctx.Output<Tensor>("Loss");
-    auto reduction = ctx.Attr<std::string>("reduction");
-
-    const int n = input->dims()[0];
-
-    loss->mutable_data<T>(ctx.GetPlace());
-    auto input_t = EigenVector<T>::Flatten(*input);
-    auto target_t = EigenVector<T>::Flatten(*target);
-    auto loss_t = EigenVector<T>::Flatten(*loss);
-    auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
-    if ("none" == reduction) {
-      loss_t.device(place) = output;
-    } else if ("batchmean" == reduction) {
-      auto output_sum = output.sum().eval();
-      loss_t.device(place) = output_sum / output_sum.constant(n);
-    } else if ("mean" == reduction) {
-      loss_t.device(place) = output.mean();
-    } else if ("sum" == reduction) {
-      loss_t.device(place) = output.sum();
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KLDivLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto* target = ctx.Input<Tensor>("Target");
-    auto reduction = ctx.Attr<std::string>("reduction");
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-
-    const int n = input_grad->dims()[0];
-    const int numel = input_grad->numel();
-    const int expand = numel / loss_grad->numel();
-
-    input_grad->mutable_data<T>(ctx.GetPlace());
-
-    auto target_t = EigenVector<T>::Flatten(*target);
-
-    auto input_grad_t = EigenVector<T>::Flatten(*input_grad);
-    auto loss_grad_t = EigenVector<T>::Flatten(*loss_grad);
-
-    auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
-    auto grad_t = target_t * loss_grad_expand;
-    input_grad_t.device(place) =
-        target_t.binaryExpr(grad_t, KLDivLossBackward<T>());
-
-    if ("mean" == reduction) {
-      input_grad_t.device(place) = input_grad_t / static_cast<T>(numel);
-    } else if ("batchmean" == reduction) {
-      input_grad_t.device(place) = input_grad_t / static_cast<T>(n);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc
deleted file mode 100644
index 2696d0bef9e322fce1251984c9e0f5b7429eeea8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/l1_norm_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class L1NormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
-
-    ctx->SetOutputDim("Out", {1});
-  }
-};
-
-class L1NormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@GRAD) should be not null.");
-
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-};
-
-class L1NormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) The input of l1_norm op.");
-    AddOutput("Out", "(Scalar) The output of l1_norm op.");
-    AddComment(R"DOC(
-L1 Norm Operator.
-
-Computes the L1 norm of a tensor.
-
-$$Out = \sum{|X|}$$
-
-)DOC");
-  }
-};
-
-class L1NormGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("l1_norm_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker,
-                  ops::L1NormGradDescMaker);
-REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp);
-REGISTER_OP_CPU_KERNEL(
-    l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    l1_norm_grad,
-    ops::L1NormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/l1_norm_op.cu b/paddle/fluid/operators/l1_norm_op.cu
deleted file mode 100644
index a5c29bbf5debdd11f6e5b28b3a8b48c2c484517a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/l1_norm_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/l1_norm_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    l1_norm, ops::L1NormKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    l1_norm_grad,
-    ops::L1NormGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h
deleted file mode 100644
index 7c6503bb2150a89dd2d38e3eb05d1da28446cb63..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/l1_norm_op.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-// Out = sum(abs(X))
-template <typename DeviceContext, typename T>
-class L1NormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
-    Out->mutable_data<T>(context.GetPlace());
-
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenScalar<T>::From(*Out);
-    auto &place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    out.device(place) = x.abs().sum();
-  }
-};
-
-// dX = dout * sign(X)
-template <typename DeviceContext, typename T>
-class L1NormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *x = context.Input<framework::Tensor>("X");
-    const framework::Tensor *d_out =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE(d_out->numel() == 1, "L1 Norm Gradient should be scalar");
-    framework::Tensor *dx =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    dx->mutable_data<T>(context.GetPlace());
-
-    auto x_eigen = framework::EigenVector<T>::Flatten(*x);
-    auto d_out_eigen = framework::EigenVector<T>::Flatten(*d_out);
-    auto dx_eigen = framework::EigenVector<T>::Flatten(*dx);
-    auto &place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 1> x_dsize(x->numel());
-    dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
deleted file mode 100644
index 6d0af573184b10a783f9c5802d1db3630eb55538..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/label_smooth_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class LabelSmoothOp : public framework::OperatorWithKernel {
- public:
-  LabelSmoothOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of LabelSmoothOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LabelSmoothOp should not be null.");
-    auto in_dims = ctx->GetInputDim("X");
-    if (ctx->HasInput("PriorDist")) {
-      auto noise_dims = ctx->GetInputDim("PriorDist");
-      auto noise_numel = paddle::framework::product(noise_dims);
-      PADDLE_ENFORCE(
-          in_dims[1] == noise_numel,
-          "The number of elements in Input(PriorDist) must be equal to the "
-          "dimension of each label.");
-    }
-    ctx->ShareLoD("X", /*->*/ "Out");
-    ctx->SetOutputDim("Out", in_dims);
-  }
-};
-
-class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor) The input labels of LabelSmooth operator. This "
-             "input can be batched labels in one-hot encoding or output from "
-             "softmax, with shape [N x K], where N is the batch size and K is "
-             "the number of classes");
-    AddInput("PriorDist",
-             "(Tensor, optional)"
-             "The prior distribution to be added to the smoothed label. It is "
-             "fixed during training and the number of elements should be equal "
-             "to the dimension K of each label. Default is uniform "
-             "distribution and each element will be set to 1/K if not provided "
-             "in input.")
-        .AsDispensable();
-    AddOutput("Out",
-              "(loDTensor) The smoothed label of LabelSmooth operator. It has"
-              "the same shape and LoD with the Input(LoDTensor).");
-    AddAttr<float>("epsilon",
-                   "(float, default 0.0f)"
-                   "The smoothing parameter of LabelSmooth operator.")
-        .SetDefault(0.0f);
-    AddComment(R"DOC(
-LabelSmooth Operator.
-
-Label smoothing is a mechanism to regularize the classifier layer. In machine 
-learning, optimizing the log-likelihood of the correct label directly may 
-cause two problems. First, it may result in overfitting: if the model learns 
-to assign full probability to the ground-truth label for each training example,
-it is not guaranteed to generalize. Second, it encourages the differences 
-between the largest logit and all others to become large, reducing the ability 
-of the model to adapt. Label smoothing is proposed to encourage the model to 
-be less confident, which replaces the ground-truth label $y$ with the weighted 
-sum of itself and some fixed distribution $\mu$, i.e.
-
-$$
-    \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu,
-$$
-
-where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and 
-$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for 
-$\mu$. This change in the ground-truth label is called label-smoothing 
-regularization or LSR.
-
-See more details about label smoothing in https://arxiv.org/abs/1512.00567.
-
-)DOC");
-  }
-};
-
-class LabelSmoothGradOp : public framework::OperatorWithKernel {
- public:
-  LabelSmoothGradOp(const std::string &type,
-                    const framework::VariableNameMap &inputs,
-                    const framework::VariableNameMap &outputs,
-                    const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"),
-                      ctx->GetInputDim(framework::GradVarName("Out")));
-  }
-};
-
-class LabelSmoothGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("label_smooth_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
-                  ops::LabelSmoothGradDescMaker);
-REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp);
-REGISTER_OP_CPU_KERNEL(
-    label_smooth,
-    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    label_smooth_grad,
-    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu
deleted file mode 100644
index 89f1d28e9988281c77e0cefa349bd7181b432c20..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/label_smooth_op.cu
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/label_smooth_op.h"
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void LabelSmoothRunOriginKernel(const int N, const float epsilon,
-                                           const int label_dim, const T* src,
-                                           T* dst) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < N; idx += blockDim.x * gridDim.x) {
-    dst[idx] = static_cast<T>(1 - epsilon) * src[idx] +
-               static_cast<T>(epsilon / label_dim);
-  }
-}
-
-template <typename T>
-__global__ void LabelSmoothRunDistKernel(const int N, const float epsilon,
-                                         const int dist_numel, const T* src,
-                                         const T* dist_data, T* dst) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < N; idx += blockDim.x * gridDim.x) {
-    int dist_idx = idx - (idx / dist_numel) * dist_numel;
-    dst[idx] = static_cast<T>(1 - epsilon) * src[idx] +
-               static_cast<T>(epsilon) * dist_data[dist_idx];
-  }
-}
-
-template <typename T>
-__global__ void LabelSmoothGradRunKernel(const int N, const float epsilon,
-                                         const T* src, T* dst) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < N; idx += blockDim.x * gridDim.x) {
-    dst[idx] = static_cast<T>(1 - epsilon) * src[idx];
-  }
-}
-
-template <typename DeviceContext, typename T>
-class LabelSmoothGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out_t = ctx.Output<framework::LoDTensor>("Out");
-    auto* in_t = ctx.Input<framework::LoDTensor>("X");
-    auto* dist_t = ctx.Input<framework::Tensor>("PriorDist");
-    auto label_dim = in_t->dims()[1];
-    auto epsilon = ctx.Attr<float>("epsilon");
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto size_prob = in_t->numel();
-    const T* in_data = in_t->data<T>();
-    T* out_data = out_t->mutable_data<T>(ctx.GetPlace());
-    int threads = 512;
-    int grid = (size_prob + threads - 1) / threads;
-    auto stream = ctx.cuda_device_context().stream();
-    if (dist_t) {
-      auto dist_numel = dist_t->numel();
-      const T* dist_data = dist_t->data<T>();
-      LabelSmoothRunDistKernel<T><<<grid, threads, 0, stream>>>(
-          size_prob, epsilon, dist_numel, in_data, dist_data, out_data);
-
-    } else {
-      LabelSmoothRunOriginKernel<T><<<grid, threads, 0, stream>>>(
-          size_prob, epsilon, label_dim, in_data, out_data);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LabelSmoothGradGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_in_t = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    d_in_t->mutable_data<T>(ctx.GetPlace());
-
-    auto epsilon = ctx.Attr<float>("epsilon");
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    const T* in_data = d_out_t->data<T>();
-    auto size_prob = d_out_t->numel();
-    T* out_data = d_in_t->mutable_data<T>(ctx.GetPlace());
-    int threads = 512;
-    int grid = (size_prob + threads - 1) / threads;
-    auto stream = ctx.cuda_device_context().stream();
-    LabelSmoothGradRunKernel<T><<<grid, threads, 0, stream>>>(
-        size_prob, epsilon, in_data, out_data);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    label_smooth,
-    ops::LabelSmoothGPUKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LabelSmoothGPUKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    label_smooth_grad,
-    ops::LabelSmoothGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LabelSmoothGradGPUKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h
deleted file mode 100644
index f3da17de011053fa118b5a4257bb5c3b00084741..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/label_smooth_op.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class LabelSmoothKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out_t = ctx.Output<framework::LoDTensor>("Out");
-    auto* in_t = ctx.Input<framework::LoDTensor>("X");
-    auto* dist_t = ctx.Input<framework::Tensor>("PriorDist");
-    auto label_dim = in_t->dims()[1];
-    out_t->mutable_data<T>(ctx.GetPlace());
-
-    auto epsilon = ctx.Attr<float>("epsilon");
-    auto out = framework::EigenVector<T>::Flatten(*out_t);
-    auto in = framework::EigenVector<T>::Flatten(*in_t);
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    if (dist_t) {
-      auto dist = framework::EigenVector<T>::Flatten(*dist_t);
-      out.device(dev) =
-          static_cast<T>(1 - epsilon) * in +
-          static_cast<T>(epsilon) *
-              dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
-    } else {
-      out.device(dev) = static_cast<T>(1 - epsilon) * in +
-                        static_cast<T>(epsilon / label_dim);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LabelSmoothGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_in_t = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    d_in_t->mutable_data<T>(ctx.GetPlace());
-
-    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
-    auto d_in = framework::EigenVector<T>::Flatten(*d_in_t);
-
-    auto epsilon = ctx.Attr<float>("epsilon");
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    d_in.device(dev) = static_cast<T>(1 - epsilon) * d_out;
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
deleted file mode 100644
index 1aac60ef36c62703f8f9a3b896c17a1483642f53..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/layer_norm_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-class LayerNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"),
-                   "Output(Y) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
-                   "Output(Mean) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
-                   "Output(Variance) of LayerNormOp should not be null.");
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
-    PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
-                      "'begin_norm_axis' must be less than the rank of X.");
-
-    auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-    if (ctx->HasInput("Scale")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
-
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right,
-                          "scale should with right");
-      }
-    }
-    if (ctx->HasInput("Bias")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1);
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right,
-                          "bias should with right");
-      }
-    }
-
-    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("Mean", {left});
-    ctx->SetOutputDim("Variance", {left});
-    ctx->ShareLoD("X", "Y");
-  }
-};
-
-class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input tensor.");
-    AddInput("Scale",
-             "(optional) Scale is a 1-dimensional tensor of size "
-             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
-             "It is applied to the output.")
-        .AsDispensable();
-    AddInput("Bias",
-             "(optional) Bias is a 1-dimensional tensor of size "
-             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
-             "It is applied to the output.")
-        .AsDispensable();
-    AddOutput("Y", "Result after normalization.");
-    AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate();
-    AddOutput("Variance", "Variance of the current mini batch.")
-        .AsIntermediate();
-
-    AddAttr<float>("epsilon",
-                   "Constant for numerical stability [default 1e-5].")
-        .SetDefault(1e-5)
-        .AddCustomChecker([](const float &epsilon) {
-          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
-                         "'epsilon' should be between 0.0 and 0.001.");
-        });
-    AddAttr<int>("begin_norm_axis",
-                 "the axis of `begin_norm_axis ... Rank(X) - 1` will be "
-                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
-                 "matrix [N,H]. [default 1].")
-        .SetDefault(1)
-        .AddCustomChecker([](const int &begin_norm_axis) {
-          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
-                            "'begin_norm_axis' should be greater than zero.");
-        });
-
-    AddComment(R"DOC(
-Assume feature vectors exist on dimensions
-:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
-along these dimensions for each feature vector :math:`a` with size
-:math:`H`, then normalize each feature vector using the corresponding
-statistics. After that, apply learnable gain and bias on the normalized
-tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
-
-Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
-)DOC");
-  }
-};
-
-class LayerNormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    // check input
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Mean"),
-                   "Input(Mean) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Variance"),
-                   "Input(Variance) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@GRAD) of LayerNormOp should not be null.");
-
-    // check output
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-      ctx->SetOutputDim(framework::GradVarName("Scale"),
-                        ctx->GetInputDim("Scale"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Scale"));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
-    if (var == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
-    const Tensor *t = nullptr;
-    if (var->IsType<Tensor>()) {
-      t = &var->Get<Tensor>();
-    } else if (var->IsType<LoDTensor>()) {
-      t = &var->Get<LoDTensor>();
-    }
-    if (t == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
-    return framework::OpKernelType(t->type(), ctx.GetPlace());
-  }
-};
-
-class LayerNormGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("layer_norm_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Mean", Output("Mean"));
-    op->SetInput("Variance", Output("Variance"));
-    if (ForwardOp().Inputs().count("Scale") > 0) {
-      op->SetInput("Scale", Input("Scale"));
-      op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
-    }
-
-    if (ForwardOp().Inputs().count("Bias") > 0) {
-      op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-    }
-
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
-                  ops::LayerNormGradOpDescMaker);
-REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp);
-REGISTER_OP_CPU_KERNEL(
-    layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    layer_norm_grad,
-    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu
deleted file mode 100644
index 22343d7724b2f0dc01bff8c2274e3dd914bf70ef..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ /dev/null
@@ -1,529 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cub/cub.cuh>
-#include "paddle/fluid/operators/layer_norm_op.h"
-
-namespace paddle {
-namespace operators {
-
-inline static int GetDesiredBlockDim(int block_dim) {
-  const int kMaxBlockDim = 512;
-  return block_dim >= kMaxBlockDim
-             ? kMaxBlockDim
-             : (1 << (static_cast<int>(std::log2f(block_dim))));
-}
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
-  case (1 << (log2_block_dim)): {                       \
-    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
-    __VA_ARGS__;                                        \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)              \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__)
-
-static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); }
-static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); }
-
-template <typename T>
-struct PairForLayerNorm {
-  __device__ __forceinline__ PairForLayerNorm() {}
-  __device__ __forceinline__ PairForLayerNorm(const T &first, const T &second)
-      : first_(first), second_(second) {}
-
-  T first_;
-  T second_;
-};
-
-template <typename T>
-struct PairForLayerNormAddFunctor {
-  __device__ __forceinline__ PairForLayerNorm<T> operator()(
-      const PairForLayerNorm<T> &p1, const PairForLayerNorm<T> &p2) {
-    return PairForLayerNorm<T>(p1.first_ + p2.first_, p1.second_ + p2.second_);
-  }
-};
-
-template <typename T, int BlockDim>
-__global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
-                                 T *y, T *mean, T *var, float epsilon,
-                                 int feature_size) {
-  using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
-  int end_idx = (blockIdx.x + 1) * feature_size;
-
-  // Step 1: Reduce to calculate mean and var
-  double mean_val = 0;
-  double var_val = 0;
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    T tmp = x[i];
-    mean_val += tmp;
-    var_val += (tmp * tmp);
-  }
-  auto pair = BlockReduce(temp_storage)
-                  .Reduce(PairForLayerNorm<double>(mean_val, var_val),
-                          PairForLayerNormAddFunctor<double>());
-  if (threadIdx.x == 0) {
-    auto tmp = pair.first_ / feature_size;
-    mean[blockIdx.x] = static_cast<T>(tmp);
-    var[blockIdx.x] = static_cast<T>(pair.second_ / feature_size - tmp * tmp);
-  }
-  __syncthreads();
-  mean_val = mean[blockIdx.x];
-  var_val = static_cast<T>(real_sqrt(var[blockIdx.x] + epsilon));
-
-  // Step 2: Calculate y
-  if (scale != nullptr) {
-    if (bias != nullptr) {
-      for (int i = beg_idx, j = threadIdx.x; i < end_idx;
-           i += BlockDim, j += BlockDim) {
-        y[i] = scale[j] * (x[i] - mean_val) / var_val + bias[j];
-      }
-    } else {
-      for (int i = beg_idx, j = threadIdx.x; i < end_idx;
-           i += BlockDim, j += BlockDim) {
-        y[i] = scale[j] * (x[i] - mean_val) / var_val;
-      }
-    }
-  } else {  // scale == nullptr
-    if (bias != nullptr) {
-      for (int i = beg_idx, j = threadIdx.x; i < end_idx;
-           i += BlockDim, j += BlockDim) {
-        y[i] = (x[i] - mean_val) / var_val + bias[j];
-      }
-    } else {
-      for (int i = beg_idx, j = threadIdx.x; i < end_idx;
-           i += BlockDim, j += BlockDim) {
-        y[i] = (x[i] - mean_val) / var_val;
-      }
-    }
-  }
-}
-
-// Make sure that d_scale != nullptr && d_bias != nullptr
-// Since d_scale != nullptr, scale would not be nullptr
-template <typename T, int BlockDim, bool HasDx>
-__global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y,
-                                             T *d_scale, T *d_bias, T *d_x,
-                                             const T *mean, const T *var,
-                                             const T *scale, float epsilon,
-                                             int batch_size, int feature_size) {
-  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  int beg_idx = threadIdx.x * feature_size + blockIdx.x;
-  int end_idx = batch_size * feature_size + blockIdx.x;
-  int stride = BlockDim * feature_size;
-
-  T d_scale_partial = 0, d_bias_partial = 0;
-
-  for (int i = beg_idx; i < end_idx; i += stride) {
-    int row_idx = i / feature_size;
-    auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
-    d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
-    d_bias_partial += d_y[i];
-    if (HasDx) {
-      d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
-    }
-  }
-
-  auto pair = BlockReduce(temp_storage)
-                  .Reduce(PairForLayerNorm<T>(d_scale_partial, d_bias_partial),
-                          PairForLayerNormAddFunctor<T>());
-
-  if (threadIdx.x == 0) {
-    d_scale[blockIdx.x] = pair.first_;
-    d_bias[blockIdx.x] = pair.second_;
-  }
-}
-
-// Make sure that there is only one true expression: d_scale != nullptr
-// or d_bias != nullptr
-// Notice: scale may be nullptr
-template <typename T, int BlockDim, bool HasDx, bool HasDScale>
-__global__ void LayerNormBackwardGradientScaleOrBias(
-    const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean,
-    const T *var, const T *scale, float epsilon, int batch_size,
-    int feature_size) {
-  using BlockReduce = cub::BlockReduce<T, BlockDim>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  int beg_idx = threadIdx.x * feature_size + blockIdx.x;
-  int end_idx = batch_size * feature_size + blockIdx.x;
-  int stride = BlockDim * feature_size;
-  T d_scale_or_d_bias_partial = 0;
-
-  for (int i = beg_idx; i < end_idx; i += stride) {
-    int row_idx = i / feature_size;
-    auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
-    if (HasDScale) {
-      d_scale_or_d_bias_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
-    } else {  // d_bias != nullptr
-      d_scale_or_d_bias_partial += d_y[i];
-    }
-
-    if (HasDx) {
-      if (scale != nullptr) {
-        d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
-      } else {
-        d_x[i] = d_y[i] / var_val;
-      }
-    }
-  }
-
-  d_scale_or_d_bias_partial =
-      BlockReduce(temp_storage).Reduce(d_scale_or_d_bias_partial, cub::Sum());
-
-  if (threadIdx.x == 0) {
-    if (HasDScale) {
-      d_scale[blockIdx.x] = d_scale_or_d_bias_partial;
-    } else {
-      d_bias[blockIdx.x] = d_scale_or_d_bias_partial;
-    }
-  }
-}
-
-template <typename T, int BlockDim>
-__global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x,
-                                                          const T *mean,
-                                                          const T *var,
-                                                          float epsilon,
-                                                          int feature_size) {
-  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  __shared__ T d_x_reduce_tmp[2];
-
-  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
-  int end_idx = (blockIdx.x + 1) * feature_size;
-
-  T block_mean = mean[blockIdx.x];
-  T block_var = var[blockIdx.x];
-  T d_x_mean_partial = 0, d_x_var_partial = 0;
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    d_x_mean_partial += d_x[i];
-    d_x_var_partial += d_x[i] * (x[i] - block_mean);
-  }
-
-  auto pair =
-      BlockReduce(temp_storage)
-          .Reduce(PairForLayerNorm<T>(d_x_mean_partial, d_x_var_partial),
-                  PairForLayerNormAddFunctor<T>());
-
-  if (threadIdx.x == 0) {
-    d_x_reduce_tmp[0] = pair.first_ / feature_size;
-    d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon));
-  }
-  __syncthreads();
-
-  d_x_mean_partial = d_x_reduce_tmp[0];
-  d_x_var_partial = d_x_reduce_tmp[1];
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    d_x[i] -= d_x_mean_partial;
-    d_x[i] -= (x[i] - block_mean) * d_x_var_partial;
-  }
-}
-
-// Here, we only calculate d_x
-template <typename T, int BlockDim>
-__global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y,
-                                                T *d_x, const T *mean,
-                                                const T *var, const T *scale,
-                                                float epsilon,
-                                                int feature_size) {
-  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  __shared__ T d_x_reduce_tmp[2];
-
-  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
-  int end_idx = (blockIdx.x + 1) * feature_size;
-
-  T block_mean = mean[blockIdx.x], block_var = var[blockIdx.x];
-  T d_x_mean_partial = 0, d_x_var_partial = 0;
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    auto var_val = static_cast<T>(real_sqrt(block_var + epsilon));
-    if (scale != nullptr) {
-      int col_idx = i % feature_size;
-      d_x[i] = d_y[i] * scale[col_idx] / var_val;
-    } else {
-      d_x[i] = d_y[i] / var_val;
-    }
-    d_x_mean_partial += d_x[i];
-    d_x_var_partial += d_x[i] * (x[i] - block_mean);
-  }
-
-  auto pair =
-      BlockReduce(temp_storage)
-          .Reduce(PairForLayerNorm<T>(d_x_mean_partial, d_x_var_partial),
-                  PairForLayerNormAddFunctor<T>());
-
-  if (threadIdx.x == 0) {
-    d_x_reduce_tmp[0] = pair.first_ / feature_size;
-    d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon));
-  }
-  __syncthreads();
-
-  d_x_mean_partial = d_x_reduce_tmp[0];
-  d_x_var_partial = d_x_reduce_tmp[1];
-  for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    d_x[i] -= d_x_mean_partial;
-    d_x[i] -= (x[i] - block_mean) * d_x_var_partial;
-  }
-}
-
-template <typename T>
-__global__ void LayerNormBackwardWhenBatchSizeIsOne(
-    const T *x, const T *d_y, T *d_x, T *d_scale, T *d_bias, const T *mean,
-    const T *var, const T *scale, float epsilon, int feature_size) {
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < feature_size) {
-    auto var_val = static_cast<T>(real_sqrt(var[idx] + epsilon));
-    if (d_x != nullptr) {
-      if (d_scale == nullptr) {
-        d_x[idx] = d_y[idx] / var_val;
-      } else {
-        d_x[idx] = d_y[idx] * scale[idx] / var_val;
-      }
-    }
-
-    if (d_scale != nullptr) {
-      d_scale[idx] = d_y[idx] * (x[idx] - mean[idx]) / var_val;
-    }
-
-    if (d_bias != nullptr) d_bias[idx] = d_y[idx];
-  }
-}
-
-template <typename T>
-static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
-                              const T *mean, const T *var, T *d_x, T *d_scale,
-                              T *d_bias, float epsilon, int batch_size,
-                              int feature_size, cudaStream_t stream) {
-  const int kMaxBlockDim = 512;
-  int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
-                      ((d_scale != nullptr ? 1 : 0) << 1) |
-                      ((d_bias != nullptr ? 1 : 0));
-  if (gradient_flag == 0) return;
-
-  if (batch_size == 1) {
-    LayerNormBackwardWhenBatchSizeIsOne<
-        T><<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0,
-             stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon,
-                       feature_size);
-
-    if (d_x != nullptr) {
-      switch (GetDesiredBlockDim(feature_size)) {
-        FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX<
-                             T, kBlockDim><<<1, kBlockDim, 0, stream>>>(
-            x, d_x, mean, var, epsilon, feature_size));
-      }
-    }
-    return;
-  }
-
-  auto block_dim = GetDesiredBlockDim(batch_size);
-  switch (gradient_flag) {
-    case 1:  // d_x == nulptr, d_scale == nullptr, d_bias != nullptr
-      switch (block_dim) {
-        FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
-                             T, kBlockDim, false,
-                             false><<<feature_size, kBlockDim, 0, stream>>>(
-            x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
-            feature_size));
-      }
-      break;
-    case 2:  // d_x == nullptr, d_scale != nullptr, d_bias == nullptr
-      switch (block_dim) {
-        FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
-                             T, kBlockDim, false,
-                             true><<<feature_size, kBlockDim, 0, stream>>>(
-            x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
-            feature_size));
-      }
-      break;
-    case 3:  // d_x == nullptr, d_scale != nulptr, d_bias != nullptr
-      switch (block_dim) {
-        FIXED_BLOCK_DIM_CASE(
-            LayerNormBackwardGradientAll<
-                T, kBlockDim, false><<<feature_size, kBlockDim, 0, stream>>>(
-                x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
-                batch_size, feature_size));
-      }
-      break;
-    case 4:  // d_x != nullptr, d_scale == nullptr, d_bias == nullptr
-      switch (GetDesiredBlockDim(feature_size)) {
-        FIXED_BLOCK_DIM_CASE(
-            LayerNormBackwardGradientOnlyDX<
-                T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
-                x, d_y, d_x, mean, var, scale, epsilon, feature_size));
-      }
-      break;
-    case 5:  // d_x != nulptr, d_scale == nullptr, d_bias != nullptr
-      switch (block_dim) {
-        FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
-                             T, kBlockDim, true,
-                             false><<<feature_size, kBlockDim, 0, stream>>>(
-            x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
-            feature_size));
-      }
-      switch (GetDesiredBlockDim(feature_size)) {
-        FIXED_BLOCK_DIM_CASE(
-            LayerNormBackwardPostProcessToCalculateDX<
-                T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
-                x, d_x, mean, var, epsilon, feature_size));
-      }
-      break;
-    case 6:  // d_x != nullptr, d_scale != nullptr, d_bias == nullptr
-      switch (block_dim) {
-        FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
-                             T, kBlockDim, true,
-                             true><<<feature_size, kBlockDim, 0, stream>>>(
-            x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
-            feature_size));
-      }
-      switch (GetDesiredBlockDim(feature_size)) {
-        FIXED_BLOCK_DIM_CASE(
-            LayerNormBackwardPostProcessToCalculateDX<
-                T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
-                x, d_x, mean, var, epsilon, feature_size));
-      }
-      break;
-    case 7:  // d_x != nullptr, d_scale != nullptr, d_bias != nullptr
-      switch (block_dim) {
-        FIXED_BLOCK_DIM_CASE(
-            LayerNormBackwardGradientAll<
-                T, kBlockDim, true><<<feature_size, kBlockDim, 0, stream>>>(
-                x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
-                batch_size, feature_size));
-      }
-      switch (GetDesiredBlockDim(feature_size)) {
-        FIXED_BLOCK_DIM_CASE(
-            LayerNormBackwardPostProcessToCalculateDX<
-                T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
-                x, d_x, mean, var, epsilon, feature_size));
-      }
-      break;
-    default:
-      break;
-  }
-}
-
-template <typename T>
-class LayerNormKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto *scale = ctx.Input<Tensor>("Scale");
-    auto *bias = ctx.Input<Tensor>("Bias");
-    auto *x = ctx.Input<Tensor>("X");
-
-    auto *y = ctx.Output<Tensor>("Y");
-    auto *mean = ctx.Output<Tensor>("Mean");
-    auto *var = ctx.Output<Tensor>("Variance");
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-
-    const auto x_dims = x->dims();
-    auto *x_data = x->data<T>();
-    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
-    auto *mean_data = mean->mutable_data<T>(ctx.GetPlace());
-    auto *var_data = var->mutable_data<T>(ctx.GetPlace());
-    auto *scale_data = (scale == nullptr ? nullptr : scale->data<T>());
-    auto *bias_data = (bias == nullptr ? nullptr : bias->data<T>());
-
-    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int batch_size = static_cast<int>(matrix_dim[0]);
-    int feature_size = static_cast<int>(matrix_dim[1]);
-
-    auto stream = ctx.cuda_device_context().stream();
-
-    switch (GetDesiredBlockDim(feature_size)) {
-      FIXED_BLOCK_DIM_CASE(
-          LayerNormForward<T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
-              x_data, scale_data, bias_data, y_data, mean_data, var_data,
-              epsilon, feature_size));
-      default:
-        PADDLE_THROW(
-            "Product from begin_norm_axis to end must be larger than 1");
-        break;
-    }
-  }
-};
-
-template <typename T>
-class LayerNormGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    // d_x, d_scale, d_bias may be nullptr
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    auto *x = ctx.Input<Tensor>("X");
-    auto *mean = ctx.Input<Tensor>("Mean");
-    auto *var = ctx.Input<Tensor>("Variance");
-    auto *scale = ctx.Input<Tensor>("Scale");
-    auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-
-    auto *x_data = x->data<T>();
-    auto *d_y_data = d_y->data<T>();
-    auto *mean_data = mean->data<T>();
-    auto *var_data = var->data<T>();
-    auto *scale_data = (scale == nullptr ? nullptr : scale->data<T>());
-    auto *d_scale_data =
-        (d_scale == nullptr ? nullptr
-                            : d_scale->mutable_data<T>(ctx.GetPlace()));
-    auto *d_bias_data =
-        (d_bias == nullptr ? nullptr : d_bias->mutable_data<T>(ctx.GetPlace()));
-    auto *d_x_data =
-        (d_x == nullptr ? nullptr : d_x->mutable_data<T>(ctx.GetPlace()));
-
-    const auto &x_dims = x->dims();
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int batch_size = static_cast<int>(matrix_dim[0]);
-    int feature_size = static_cast<int>(matrix_dim[1]);
-
-    auto stream = ctx.cuda_device_context().stream();
-
-    LayerNormBackward<T>(x_data, d_y_data, scale_data, mean_data, var_data,
-                         d_x_data, d_scale_data, d_bias_data, epsilon,
-                         batch_size, feature_size, stream);
-  }
-};
-
-#undef FIXED_BLOCK_DIM_CASE_BASE
-#undef FIXED_BLOCK_DIM_CASE
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    layer_norm,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    layer_norm_grad,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h
deleted file mode 100644
index db794ed42116144f310b9d7dc529cff49ba2c405..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/layer_norm_op.h
+++ /dev/null
@@ -1,349 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/blas.h"
-#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__)
-#include "paddle/fluid/operators/jit/kernels.h"
-#endif
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-// Wrap RowwiseMean and ColwiseMean.
-// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is
-// significantly faster. Unlike the RowwiseMean and ColwiseMean, the
-// implementation only considers 2D.
-template <typename DeviceContext, typename T>
-struct RowwiseMean2D {
-  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx);
-
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* vec);
-};
-
-#ifdef PADDLE_WITH_CUDA
-template <typename T>
-class RowwiseMean2D<platform::CUDADeviceContext, T> {
- public:
-  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx)
-      : left_(left), right_(right) {
-    framework::DDim ones_dim({right_});
-    divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
-    math::set_constant(dev_ctx, &divisor_, 1.0 / right);
-  }
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
-        false, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
-        out->data<T>());
-  }
-
- private:
-  int left_;
-  int right_;
-  framework::Tensor divisor_;
-};
-#endif
-
-template <typename T>
-class RowwiseMean2D<platform::CPUDeviceContext, T> {
- public:
-  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    row_mean_(context, input, out);
-  }
-
- private:
-  math::RowwiseMean<platform::CPUDeviceContext, T> row_mean_;
-};
-
-template <typename DeviceContext, typename T>
-struct ColwiseSum2D {
-  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx);
-
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* vec);
-};
-
-#ifdef PADDLE_WITH_CUDA
-template <typename T>
-class ColwiseSum2D<platform::CUDADeviceContext, T> {
- public:
-  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx)
-      : left_(left), right_(right) {
-    framework::DDim ones_dim({left_});
-    divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
-    math::set_constant(dev_ctx, &divisor_, 1.0);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
-        true, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
-        out->data<T>());
-  }
-
- private:
-  int left_;
-  int right_;
-  framework::Tensor divisor_;
-};
-#endif
-
-template <typename T>
-class ColwiseSum2D<platform::CPUDeviceContext, T> {
- public:
-  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    col_wise_(context, input, out);
-  }
-
- private:
-  math::ColwiseSum<platform::CPUDeviceContext, T> col_wise_;
-};
-
-template <typename T>
-struct SubAndSquareFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
-};
-
-template <typename T>
-struct DivAndSqrtFunctor {
-  explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
-  inline HOSTDEVICE T operator()(T a, T b) const {
-    return a / (sqrt(b + epsilon_));
-  }
-
- private:
-  T epsilon_;
-};
-
-template <typename T>
-struct MulFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
-};
-
-template <typename T>
-struct AddFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
-};
-
-template <typename T>
-struct SubFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
-};
-
-template <typename T>
-struct MulInvVarFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const {
-    return a * std::sqrt(1.0 / b);
-  }
-};
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-template <typename DeviceContext, typename T>
-class LayerNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto* scale = ctx.Input<Tensor>("Scale");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto x = *ctx.Input<Tensor>("X");
-
-    auto* y = ctx.Output<Tensor>("Y");
-    auto* mean = ctx.Output<Tensor>("Mean");
-    auto* var = ctx.Output<Tensor>("Variance");
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-
-    const auto x_dims = x.dims();
-
-    y->mutable_data<T>(ctx.GetPlace());
-    mean->mutable_data<T>(ctx.GetPlace());
-    var->mutable_data<T>(ctx.GetPlace());
-
-    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-    framework::DDim matrix_shape({left, right});
-
-    x.Resize(matrix_shape);
-    Tensor out;
-    out.ShareDataWith(*y);
-    out.Resize(matrix_shape);
-
-#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \
-    defined(__OSX__)
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context());
-
-    // get mean
-    row_mean(dev_ctx, x, mean);
-
-    // get variance
-    ElementwiseComputeEx<SubAndSquareFunctor<T>, DeviceContext, T>(
-        ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor<T>(), &out);
-    row_mean(dev_ctx, out, var);
-
-    // get x_norm
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-        ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &out);
-    ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
-        ctx, &out, var, /*axis*/ 0,
-        DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &out);
-
-    if (scale) {
-      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-          ctx, &out, scale, /*axis*/ 1, MulFunctor<T>(), &out);
-    }
-    if (bias) {
-      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
-          ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out);
-    }
-#else
-    PADDLE_ENFORCE_EQ(mean->numel(), left);
-    PADDLE_ENFORCE_EQ(var->numel(), left);
-    PADDLE_ENFORCE_EQ(scale->numel(), right);
-    PADDLE_ENFORCE_EQ(bias->numel(), right);
-
-    auto ker =
-        jit::KernelFuncs<jit::LayerNormTuple<T>, platform::CPUPlace>::Cache()
-            .At(right);
-    ker(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
-        scale->data<T>(), bias->data<T>(), static_cast<int>(left),
-        static_cast<const float>(epsilon), right);
-#endif
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LayerNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto x = *ctx.Input<Tensor>("X");
-    auto* mean = ctx.Input<Tensor>("Mean");
-    auto* var = ctx.Input<Tensor>("Variance");
-    auto* scale = ctx.Input<Tensor>("Scale");
-    auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-
-    // init output
-    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    const auto& x_dims = x.dims();
-    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-    framework::DDim matrix_shape({left, right});
-
-    d_y.Resize(matrix_shape);
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    ColwiseSum2D<DeviceContext, T> colwise_sum(left, right,
-                                               ctx.device_context());
-
-    Tensor temp;
-    Tensor temp_norm;
-    if (d_scale || d_x) {
-      x.Resize(matrix_shape);
-      temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
-
-      temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
-      // get x_norm
-      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
-      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
-          ctx, &temp_norm, var, /*axis*/ 0,
-          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
-    }
-
-    if (d_bias) {
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      colwise_sum(dev_ctx, d_y, d_bias);
-    }
-    if (d_scale) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-          ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor<T>(), &temp);
-      colwise_sum(dev_ctx, temp, d_scale);
-    }
-
-    if (d_x) {
-      framework::DDim vec_shape({left});
-      d_x->mutable_data<T>(ctx.GetPlace());
-      auto dx_dim = d_x->dims();
-      Tensor temp_vec;
-      temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
-
-      RowwiseMean2D<DeviceContext, T> row_mean(left, right,
-                                               ctx.device_context());
-
-      if (d_scale) {
-        // dy_dx
-        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-            ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
-        framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
-
-        // dy_dmean_dx
-        row_mean(dev_ctx, temp, &temp_vec);
-        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
-
-        // dy_var_dx
-        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-            ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
-      } else {
-        // dy_dx
-        framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
-
-        // dy_dmean_dx
-        row_mean(dev_ctx, d_y, &temp_vec);
-        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
-
-        // dy_var_dx
-        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-            ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
-      }
-      // dy_var_dx
-      row_mean(dev_ctx, temp, &temp_vec);
-      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-          ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor<T>(), &temp);
-      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, d_x, &temp, /*axis*/ 0, SubFunctor<T>(), d_x);
-
-      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
-          ctx, d_x, var, /*axis*/ 0,
-          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), d_x);
-      d_x->Resize(dx_dim);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
deleted file mode 100644
index ed09c64ffda684a097c7ab6043d8e04b381c2f96..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ /dev/null
@@ -1,356 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/linear_chain_crf_op.h"
-
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Emission",
-             "(LoDTensor/Tensor<float>). When a LoDTensor input,A 2-D LoDTensor"
-             " with shape [N x D], where N is the size of the "
-             "mini-batch and D is the total tag number. The unscaled emission "
-             "weight matrix for the linear chain CRF. When a Tensor input,"
-             "A Tensor with shape [N x S x D], where N is batch number,"
-             "S is max length of sequences, D is the total tag number.");
-    AddInput("Transition",
-             "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
-             "[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
-             "operator. See more details in the operator's comments.");
-    AddInput("Label",
-             "(LoDTensor/Tensor<int64_t>), when a LoDTensor input,  "
-             "[N x 1], where N is the total element number in a mini-batch. "
-             "when a Tensor input, [N x S], where N is batch number. "
-             "S is max length of sequences. The ground truth.");
-    AddInput("length",
-             "(Tensor, default Tensor<int64_t>) A Tensor with shape "
-             "[M x 1], where M is the sequence number in a mini-batch.")
-        .AsDispensable();
-    AddOutput(
-        "Alpha",
-        "(Tensor, default Tensor<float>), the same shape with Emission. "
-        "The forward vectors for the entire batch. Denote it as $\alpha$. "
-        "$\alpha$ is a memo table used to calculate the normalization "
-        "factor in CRF. $\alpha[k, v]$ stores the unnormalized "
-        "probabilites of all possible unfinished sequences of tags that end at "
-        "position $k$ with tag $v$. For each $k$, "
-        "$\alpha[k, v]$ is a vector of length $D$ with a component for "
-        "each tag value $v$. This vector is called a forward vecotr and "
-        "will also be used in backward computations.")
-        .AsIntermediate();
-    AddOutput(
-        "EmissionExps",
-        "(Tensor, default Tensor<float>), the same shape with Emission. "
-        "The exponentials of Input(Emission). This is an intermediate "
-        "computational result in forward computation, and will be reused in "
-        "backward computation.")
-        .AsIntermediate();
-    AddOutput(
-        "TransitionExps",
-        "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
-        "[(D + 2) x D]. The exponentials of Input(Transition). This is an "
-        "intermediate computational result in forward computation, and "
-        "will be reused in backward computation.")
-        .AsIntermediate();
-    AddOutput(
-        "LogLikelihood",
-        "(Tensor, default Tensor<float>) The logarithm of the conditional "
-        "likelihood of each training sample in a mini-batch. This is a 2-D "
-        "tensor with shape [S x 1], where S is the sequence number in a "
-        "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
-        "The output is no longer a LoDTensor.");
-    AddComment(R"DOC(
-Conditional Random Field defines an undirected probabilistic graph with nodes
-denoting random variables and edges denoting dependencies between these
-variables. CRF learns the conditional probability $P(Y|X)$, where
-$X = (x_1, x_2, ... , x_n)$ are structured inputs and
-$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs.
-
-Linear chain CRF is a special case of CRF that is useful for sequence labeling
-task. Sequence labeling tasks do not assume a lot of conditional
-independences among inputs. The only constraint they impose is that the input
-and output must be linear sequences. Thus, the graph of such a CRF is a simple
-chain or a line, which results in the linear chain CRF.
-
-This operator implements the Forward-Backward algorithm for the linear chain
-CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
-http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
-
-Equation:
-
-1. Denote Input(Emission) to this operator as $x$ here.
-2. The first D values of Input(Transition) to this operator are for starting
-weights, denoted as $a$ here.
-3. The next D values of Input(Transition) of this operator are for ending
-weights, denoted as $b$ here.
-4. The remaning values of Input(Transition) are for transition weights,
-denoted as $w$ here.
-5. Denote Input(Label) as $s$ here.
-
-The probability of a sequence $s$ of length $L$ is defined as:
-$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
-                + \sum_{l=1}^L x_{s_l}
-                + \sum_{l=2}^L w_{s_{l-1},s_l})$$
-
-where $Z$ is a normalization value so that the sum of $P(s)$ over
-all possible sequences is 1, and $x$ is the emission feature weight
-to the linear chain CRF.
-
-Finally, the linear chain CRF operator outputs the logarithm of the conditional
-likelihood of each training sample in a mini-batch.
-
-NOTE:
-
-1. The feature function for a CRF is made up of the emission features and the
-transition features. The emission feature weights are NOT computed in
-this operator. They MUST be computed first before this operator is called.
-
-2. Because this operator performs global normalization over all possible
-sequences internally, it expects UNSCALED emission feature weights.
-Please do not call this op with the emission feature being output of any
-nonlinear activation.
-
-3. The 2nd dimension of Input(Emission) MUST be equal to the tag number.
-
-)DOC");
-  }
-};
-
-class LinearChainCRFOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Emission"),
-                   "Input(Emission) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Transition"),
-                   "Input(Transition) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Alpha"),
-                   "Output(Alpha) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("EmissionExps"),
-                   "Output(EmissionExps) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("TransitionExps"),
-                   "Output(TransitionExps) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("LogLikelihood"),
-                   "Output(LogLikelihood) should be not null.");
-
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
-                      "The Input(Transition) should be a 2-D tensor.");
-    bool check = true;
-    if ((!ctx->IsRuntime()) &&
-        (transition_dims[0] <= 0 || transition_dims[1] <= 0)) {
-      check = false;
-    }
-    if (check) {
-      PADDLE_ENFORCE_EQ(
-          transition_dims[0] - 2, transition_dims[1],
-          "An invalid dimension for the Input(Transition), which should "
-          "be a 2-D tensor with shape [(D + 2) x D].");
-    }
-    auto emission_dims = ctx->GetInputDim("Emission");
-    PADDLE_ENFORCE_NE(emission_dims[0], 0,
-                      "An empty mini-batch is not allowed.");
-    if (ctx->HasInput("length")) {
-      PADDLE_ENFORCE_EQ(emission_dims.size(), 3,
-                        "The Input(Emission) should be a 3-D tensor.");
-      auto label_dims = ctx->GetInputDim("Label");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 3,
-                        "The Input(Label) should be a 3-D tensor");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_dims[0], label_dims[0],
-          "The batch size of Input(Emission) and Input(Label) "
-          "should be the same.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_dims[1], label_dims[1],
-          "The max length of Input(Emission) and Input(Label) "
-          "should be the same.");
-    } else {
-      PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
-                        "The Input(Emission) should be a 2-D tensor.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_dims[1], transition_dims[1],
-          "The 2nd dimension of the Input(Emission) and the Input(Transition) "
-          "should be equal to the tag number.");
-
-      auto label_dims = ctx->GetInputDim("Label");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 2,
-                        "The Input(Label) should be a 2-D tensor with the 2nd "
-                        "dimensions fixed to 1.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_dims[0], label_dims[0],
-          "The height of Input(Emission) and the height of Input(Label) "
-          "should be the same.");
-    }
-    ctx->SetOutputDim("Alpha", emission_dims);
-    ctx->SetOutputDim("EmissionExps", emission_dims);
-    ctx->SetOutputDim("TransitionExps", transition_dims);
-    // TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood)
-    // is the sequence number in a mini-batch. The dimension set here should be
-    // resized to its correct size in the function Compute. Fix this once we can
-    // get LoD information in the InferShape interface.
-    ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1});
-  }
-
- protected:
-  // Explicitly set that the data type of computation kernel of linear_chain_crf
-  // is determined by its input "Emission".
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<LoDTensor>("Emission")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class LinearChainCRFGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("EmissionExps"),
-                   "Input(EmissionExps) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("TransitionExps"),
-                   "Input(TransitionExps) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("LogLikelihood")),
-                   "Input(LogLikelihood@GRAD) shoudl be not null.");
-
-    auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
-    PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2,
-                      "The Input(TransitionExps) should be a 2-D tensor.");
-    bool check = true;
-    if ((!ctx->IsRuntime()) &&
-        (transition_exps_dims[0] <= 0 || transition_exps_dims[1] <= 0)) {
-      check = false;
-    }
-    if (check) {
-      PADDLE_ENFORCE_EQ(
-          transition_exps_dims[0] - 2, transition_exps_dims[1],
-          "An invalid dimension for the Input(TransitionExps), which should "
-          "be a 2-D tensor with shape [(D + 2) x D].");
-    }
-
-    auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
-    auto label_dims = ctx->GetInputDim("Label");
-    if (ctx->HasInput("length")) {
-      PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 3,
-                        "The Input(EmissionExps) should be a 3-D tensor.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_exps_dims[2], transition_exps_dims[1],
-          "The 3nd dimension of the Input(EmissionExps) and the "
-          "Input(TransitionExps) should be equal to the tag number.");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 3,
-                        "The Input(Label) should be a 3-D tensor with the 3nd "
-                        "dimensions fixed to 1.");
-    } else {
-      PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2,
-                        "The Input(EmissionExps) should be a 2-D tensor.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_exps_dims[1], transition_exps_dims[1],
-          "The 2nd dimension of the Input(EmissionExps) and the "
-          "Input(TransitionExps) should be equal to the tag number.");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 2,
-                        "The Input(Label) should be a 2-D tensor");
-      PADDLE_ENFORCE_EQ(label_dims[1], 1,
-                        "The Input(Label) 2nd dimensions fixed to 1.");
-    }
-    PADDLE_ENFORCE_NE(emission_exps_dims[0], 0,
-                      "An empty mini-batch is not allowed.");
-
-    PADDLE_INFERSHAPE_ENFORCE_EQ(
-        ctx, emission_exps_dims[0], label_dims[0],
-        "The height of Input(EmissionExps) and the height of Input(Label) "
-        "should be the same.");
-
-    if (ctx->HasOutput(framework::GradVarName("Emission"))) {
-      ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims);
-      if (ctx->HasInput("length") == false) {
-        ctx->ShareLoD("Emission", framework::GradVarName("Emission"));
-      }
-    }
-    // ctx->SetOutputDim(framework::GradVarName("Emission"),
-    // emission_exps_dims);
-    if (ctx->HasOutput(framework::GradVarName("Transition"))) {
-      ctx->SetOutputDim(framework::GradVarName("Transition"),
-                        transition_exps_dims);
-      ctx->ShareLoD("Transition", framework::GradVarName("Transition"));
-    }
-  }
-
- protected:
-  // Explicitly set that the data type of output of the linear_chain_crf_grad
-  // operator is determined by its input: gradients of LogLikelihood.
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))->type(),
-        platform::CPUPlace());
-  }
-};
-
-class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("linear_chain_crf_grad");
-    op->SetAttrMap(Attrs());
-    op->SetInput("Emission", Input("Emission"));
-    op->SetInput("Transition", Input("Transition"));
-    op->SetInput("Label", Input("Label"));
-    op->SetInput("Alpha", Output("Alpha"));
-    op->SetInput("EmissionExps", Output("EmissionExps"));
-    op->SetInput("TransitionExps", Output("TransitionExps"));
-    if (ForwardOp().Inputs().count("length") > 0) {
-      op->SetInput("length", Input("length"));
-    }
-    op->SetInput(framework::GradVarName("LogLikelihood"),
-                 OutputGrad("LogLikelihood"));
-
-    op->SetOutput(framework::GradVarName("Emission"), InputGrad("Emission"));
-    op->SetOutput(framework::GradVarName("Transition"),
-                  InputGrad("Transition"));
-
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    LinearChainCRFGradNoNeedBufferVarsInference, "Transition", "Emission");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp,
-                  ops::LinearChainCRFOpMaker, ops::LinearChainCRFGradDescMaker);
-REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp,
-                  ops::LinearChainCRFGradNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(
-    linear_chain_crf,
-    ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    linear_chain_crf_grad,
-    ops::LinearChainCRFGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LinearChainCRFGradOpKernel<paddle::platform::CPUDeviceContext,
-                                    double>);
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
deleted file mode 100755
index 8cd3cdadc91ff302f17f4e0b8b8f104b7f57e0fa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ /dev/null
@@ -1,427 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static inline T NormalizeL1(T* x, size_t len) {
-  T sum = 0.;
-  for (size_t i = 0; i < len; ++i) sum += x[i];
-  // (This comment is from the old LinearChainCRFLayer.)
-  // Right now, we just bet that sum won't be zero. If this really happens, we
-  // will figure out what should be done then.
-  PADDLE_ENFORCE(sum,
-                 "The unnormalized probabilities of all possible unfinished "
-                 "sequences must be greater than 0.");
-  T s = 1. / sum;
-  for (size_t i = 0; i < len; ++i) x[i] *= s;
-  return sum;
-}
-
-template <typename T>
-struct ScalarMul {
-  explicit ScalarMul(const T& scalar) : scalar(scalar) {}
-  T operator()(const T& val) const { return val * scalar; }
-
-  T scalar;
-};
-
-using framework::LoDTensor;
-using framework::LoD;
-using framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class LinearChainCRFOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* emission_weights = ctx.Input<framework::Tensor>("Emission");
-    const Tensor* transition_weights =
-        ctx.Input<framework::Tensor>("Transition");
-
-    Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
-    Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
-    Tensor* alpha = ctx.Output<Tensor>("Alpha");
-    Tensor* ll = ctx.Output<Tensor>("LogLikelihood");
-
-    // Because the computation codes only runs on CPU, here the memory for all
-    // the outputs is FIXED to be allocated on the CPU memory.
-    auto* emission_exps_data =
-        emission_exps->mutable_data<T>(platform::CPUPlace());
-    auto* alpha_data = alpha->mutable_data<T>(platform::CPUPlace());
-    transition_exps->mutable_data<T>(platform::CPUPlace());
-    // Resize the output tensor to its correct dimension.
-    memset(emission_exps_data, 0, emission_exps->numel() * sizeof(T));
-    memset(alpha_data, 0, alpha->numel() * sizeof(T));
-    auto emission_dims = emission_weights->dims();
-
-    const Tensor* label = ctx.Input<framework::Tensor>("Label");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    Tensor emission_weights_tmp = ctx.AllocateTmpTensor<T, DeviceContext>(
-        emission_weights->dims(), dev_ctx);
-    emission_weights_tmp.ShareDataWith(*emission_weights);
-    Tensor label_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(label->dims(), dev_ctx);
-    label_tmp.ShareDataWith(*label);
-    Tensor emission_exps_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(emission_exps->dims(), dev_ctx);
-    emission_exps_tmp.ShareDataWith(*emission_exps);
-    Tensor alpha_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(alpha->dims(), dev_ctx);
-    alpha_tmp.ShareDataWith(*alpha);
-    size_t seq_num = 0;
-    size_t batch_size;
-    size_t tag_num;
-    const int64_t* length_data = nullptr;
-    framework::Vector<size_t> in_lod;
-    if (ctx.HasInput("length")) {
-      const Tensor* label_length = ctx.Input<framework::Tensor>("length");
-      length_data = label_length->data<int64_t>();
-      seq_num = label_length->numel();
-      batch_size = emission_dims[0] * emission_dims[1];
-      tag_num = emission_dims[2];
-      emission_weights_tmp.Resize(
-          {emission_dims[0] * emission_dims[1], emission_dims[2]});
-      auto label_dims = label->dims();
-      label_tmp.Resize({label_dims[0] * label_dims[1], label_dims[2]});
-      alpha_tmp.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]});
-      emission_exps_tmp.Resize(
-          {emission_dims[0] * emission_dims[1], emission_dims[2]});
-      PADDLE_ENFORCE_EQ(seq_num, emission_dims[0],
-                        "the size of Input(length) must be equal to "
-                        "emission_dims[0].");
-      PADDLE_ENFORCE_EQ(seq_num, label_dims[0],
-                        "the size of Input(length) must be equal to "
-                        "label_dims[0].");
-    } else {
-      seq_num = ctx.Input<LoDTensor>("Label")->lod()[0].size() - 1;
-      batch_size = emission_dims[0];
-      tag_num = emission_dims[1];
-      in_lod = ctx.Input<LoDTensor>("Label")->lod()[0];
-      PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence.");
-    }
-
-    ll->Resize({static_cast<int>(seq_num), 1});
-    ll->mutable_data<T>(platform::CPUPlace());
-    // Now, all the inputs and outputs should be on the CPU memory.
-    Tensor emission_row_max;
-    emission_row_max.mutable_data<T>(
-        framework::make_ddim({static_cast<int64_t>(batch_size), 1}),
-        platform::CPUPlace());
-    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    auto x = EigenMatrix<T>::From(emission_weights_tmp);
-    auto x_row_max = EigenMatrix<T>::From(emission_row_max);
-    x_row_max.device(place) =
-        x.maximum(Eigen::DSizes<int, 1>(1))
-            .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
-    auto x_exps = EigenMatrix<T>::From(emission_exps_tmp);
-    x_exps.device(place) =
-        (x - x_row_max.broadcast(Eigen::DSizes<int, 2>(1, tag_num))).exp();
-    auto w = EigenMatrix<T>::From(*transition_weights);
-    auto w_exps = EigenMatrix<T>::From(*transition_exps);
-    w_exps.device(place) = w.exp();
-    T* log_likelihood = ll->data<T>();
-    for (size_t i = 0; i < seq_num; ++i) {
-      int start_pos = 0;
-      int end_pos = 0;
-      if (ctx.HasInput("length")) {
-        if (length_data[i] == 0) continue;
-        start_pos = i * emission_dims[1];
-        end_pos = start_pos + static_cast<int>(length_data[i]);
-      } else {
-        start_pos = static_cast<int>(in_lod[i]);
-        end_pos = static_cast<int>(in_lod[i + 1]);
-      }
-      if (end_pos == start_pos) {
-        // If an empty input sequence is given, pad 0 for its cost.
-        log_likelihood[i] = 0.;
-        continue;
-      }
-      const Tensor one_seq = emission_weights_tmp.Slice(start_pos, end_pos);
-      Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos);
-      Tensor one_seq_exps = emission_exps_tmp.Slice(start_pos, end_pos);
-      const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos);
-      Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos);
-      log_likelihood[i] = ForwardOneSequence(
-          one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
-          *transition_exps, one_seq_label, &one_seq_alpha);
-    }
-  };
-
- private:
-  T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max,
-                       const Tensor& emission_exps, const Tensor& trans_weights,
-                       const Tensor& trans_weight_exps, const Tensor& label,
-                       Tensor* alpha) const {
-    const T* x = emission.data<T>();
-    const T* x_row_max = emission_row_max.data<T>();
-    const T* x_exps = emission_exps.data<T>();
-    const T* w = trans_weights.data<T>();
-    const T* w_exps = trans_weight_exps.data<T>();
-    T* alpha_value = alpha->data<T>();
-
-    auto x_dims = emission.dims();
-    const size_t seq_length = x_dims[0];
-    const size_t tag_num = x_dims[1];
-    // The 1st row of w are transition weights for start mask.
-    // The 2nd row of w are transition weights for end mask.
-    // Transition weights between other tags begin from the 3rd row of w.
-    const size_t state_trans_base_idx = 2;
-
-    for (size_t i = 0; i < tag_num; ++i) {
-      alpha_value[i] = w_exps[i] * x_exps[i];
-    }
-    T ll = -x_row_max[0] - std::log(NormalizeL1<T>(alpha_value, tag_num));
-
-    for (size_t k = 1; k < seq_length; ++k) {
-      for (size_t i = 0; i < tag_num; ++i) {
-        T sum = 0.;
-        for (size_t j = 0; j < tag_num; ++j) {
-          sum += alpha_value[(k - 1) * tag_num + j] *  // (*)
-                 w_exps[(j + state_trans_base_idx) * tag_num + i];
-        }
-        alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum;
-      }
-      // NormalizeL1 is to avoid underflow or overflow at (*).
-      ll -= x_row_max[k] +
-            std::log(NormalizeL1<T>(alpha_value + k * tag_num, tag_num));
-    }
-    T sum = 0.;
-    for (size_t i = 0; i < tag_num; ++i) {
-      sum += alpha_value[(seq_length - 1) * tag_num + i] * w_exps[tag_num + i];
-    }
-    ll -= std::log(sum);
-    // Now ll is equal to -log(Z).
-
-    const int64_t* lbl = label.data<int64_t>();
-    PADDLE_ENFORCE_LT(
-        static_cast<size_t>(*std::max_element(lbl, lbl + seq_length)), tag_num,
-        "An invalid tag label that execesses the largest tag number.");
-
-    // Calculate the nominator part, which depends on the label sequence.
-    ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] +
-          w[tag_num + lbl[seq_length - 1]] /*end transition*/;
-    for (size_t k = 1; k < seq_length; ++k) {
-      ll += x[k * tag_num + lbl[k]] +
-            w[(lbl[k - 1] + state_trans_base_idx) * tag_num + lbl[k]];
-    }
-    return -ll;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* label = ctx.Input<Tensor>("Label");
-    const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
-    const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
-    const Tensor* alpha = ctx.Input<Tensor>("Alpha");
-    const T* ll_grad =
-        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    Tensor* emission_grad =
-        ctx.Output<Tensor>(framework::GradVarName("Emission"));
-    auto* emission_grad_data =
-        emission_grad->mutable_data<T>(platform::CPUPlace());
-    memset(emission_grad_data, 0, emission_grad->numel() * sizeof(T));
-    Tensor alpha_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(alpha->dims(), dev_ctx);
-    alpha_tmp.ShareDataWith(*alpha);
-    Tensor label_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(label->dims(), dev_ctx);
-    label_tmp.ShareDataWith(*label);
-    Tensor emission_exps_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(emission_exps->dims(), dev_ctx);
-    emission_exps_tmp.ShareDataWith(*emission_exps);
-    Tensor emission_grad_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(emission_grad->dims(), dev_ctx);
-    emission_grad_tmp.ShareDataWith(*emission_grad);
-    // getting seq_num  using padding or not
-    size_t seq_num = 0;
-    framework::Vector<size_t> lod;
-    const int64_t* length_data = nullptr;
-    if (ctx.HasInput("length")) {
-      const Tensor* label_length = ctx.Input<framework::Tensor>("length");
-      length_data = label_length->data<int64_t>();
-      seq_num = label_length->numel();
-      auto emission_dims = emission_grad->dims();
-      auto label_dims = label->dims();
-      emission_grad_tmp.Resize(
-          {emission_dims[0] * emission_dims[1], emission_dims[2]});
-      label_tmp.Resize({label_dims[0] * label_dims[1], label_dims[2]});
-      alpha_tmp.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]});
-      emission_exps_tmp.Resize(
-          {emission_dims[0] * emission_dims[1], emission_dims[2]});
-    } else {
-      seq_num = ctx.Input<LoDTensor>("Label")->lod()[0].size() - 1;
-      lod = ctx.Input<LoDTensor>("Label")->lod()[0];
-      PADDLE_ENFORCE_NE(lod.size(), 0, "Input(Label) must be a sequence.");
-    }
-
-    Tensor* transition_grad =
-        ctx.Output<Tensor>(framework::GradVarName("Transition"));
-
-    // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
-    // data reader operator, it can have no gradients.
-    if (transition_grad) {
-      transition_grad->mutable_data<T>(platform::CPUPlace());
-      math::set_constant(ctx.device_context(), transition_grad, 0.);
-    }
-    // Now, all the inputs and outputs should be on the CPU memory.
-    auto emission_dims = emission_exps->dims();
-    // Beta is the memo table used in dynamic programming to calculate the
-    // backwark vectors. For a backward vector i (the i-th row of beta), it
-    // captures the unnormalized probabilities of partial sequences starting
-    // at position i.
-    Tensor beta;
-    auto* beta_data = beta.mutable_data<T>(emission_dims, platform::CPUPlace());
-    memset(beta_data, 0, beta.numel() * sizeof(T));
-    if (ctx.HasInput("length")) {
-      beta.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]});
-    }
-    for (size_t i = 0; i < seq_num; ++i) {
-      int start_pos = 0;
-      int end_pos = 0;
-      if (ctx.HasInput("length")) {
-        if (length_data[i] == 0) continue;
-        start_pos = i * emission_dims[1];
-        end_pos = start_pos + static_cast<int>(length_data[i]);
-      } else {
-        start_pos = static_cast<int>(lod[i]);
-        end_pos = static_cast<int>(lod[i + 1]);
-      }
-      const Tensor one_seq_emission_exps =
-          emission_exps_tmp.Slice(start_pos, end_pos);
-      const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos);
-      const Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos);
-      Tensor one_seq_beta = beta.Slice(start_pos, end_pos);
-      Tensor one_seq_emission_grad =
-          emission_grad_tmp.Slice(start_pos, end_pos);
-      BackwardOneSequence(
-          ctx.template device_context<platform::CPUDeviceContext>(), ll_grad[i],
-          one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
-          &one_seq_beta, transition_grad, &one_seq_emission_grad);
-    }
-  };
-
- private:
-  void BackwardOneSequence(const platform::CPUDeviceContext& ctx,
-                           const T ll_grad, const Tensor& emission_exps,
-                           const Tensor& transition_exps, const Tensor& alpha,
-                           const Tensor& label, Tensor* beta,
-                           Tensor* transition_grad,
-                           Tensor* emission_grad) const {
-    const T* w_exps = transition_exps.data<T>();
-    const T* x_exps = emission_exps.data<T>();
-    const int64_t* label_value = label.data<int64_t>();
-    T* beta_value = beta->data<T>();
-    auto x_dims = emission_exps.dims();
-    const size_t seq_length = x_dims[0];
-    const size_t tag_num = x_dims[1];
-    const size_t state_trans_base_idx = 2;
-
-    // Calculate the backward vectors: beta.
-    // First, calculate the initialition state.
-    for (size_t i = 0; i < tag_num; ++i) {
-      beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i];
-    }
-    NormalizeL1<T>(beta_value + (seq_length - 1) * tag_num, tag_num);
-    for (int k = static_cast<int>(seq_length) - 2; k >= 0; --k) {
-      for (size_t i = 0; i < tag_num; ++i) {
-        T sum = 0.;
-        for (size_t j = 0; j < tag_num; ++j) {
-          sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *  // (**)
-                 x_exps[(k + 1) * tag_num + j] *
-                 beta_value[(k + 1) * tag_num + j];
-        }
-        beta_value[k * tag_num + i] = sum;
-      }
-      // NormalizeL1 is to avoid underflow or overflow at (**).
-      NormalizeL1<T>(beta_value + k * tag_num, tag_num);
-    }
-
-    auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
-    auto alpha_mat = EigenMatrix<T>::From(alpha);
-    auto beta_mat = EigenMatrix<T>::From(*beta);
-
-    auto* place = ctx.eigen_device();
-    auto prob = alpha_mat * beta_mat;
-    auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
-                       .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
-                       .broadcast(Eigen::DSizes<int, 2>(1, tag_num));
-    x_grad_mat.device(*place) =
-        (prob / row_sum).unaryExpr(ScalarMul<T>(ll_grad));
-
-    for (size_t k = 0; k < seq_length; ++k) {
-      x_grad_mat(k, label_value[k]) -= static_cast<T>(ll_grad);
-    }
-
-    if (transition_grad) {
-      T* trans_grad = transition_grad->data<T>();
-      for (size_t k = 0; k < tag_num; ++k) {
-        // Do not multiply by the output gradient here, because x_grad_mat has
-        // alrealy done this.
-        trans_grad[k] += x_grad_mat(/*from start state*/ 0, k);
-        trans_grad[tag_num + k] +=
-            x_grad_mat(/*to end state*/ seq_length - 1, k);
-      }
-
-      auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
-
-      // TODO(caoying): Fix this to avoid using this local variable if we can
-      // profile the training process.
-      Tensor tmp;
-      tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
-      auto tmp_mat = EigenMatrix<T>::From(tmp);
-      auto prob = beta_mat * x_exps_mat;
-      auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
-                         .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
-                         .broadcast(Eigen::DSizes<int, 2>(1, tag_num));
-      tmp_mat.device(*place) = prob / row_sum;
-
-      for (size_t k = 1; k < seq_length; ++k) {
-        T sum = 0.;
-        for (size_t i = 0; i < tag_num; ++i) {
-          for (size_t j = 0; j < tag_num; ++j) {
-            sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *  // (**)
-                   alpha_mat(k - 1, i) * tmp_mat(k, j);
-          }
-        }
-        sum = 1. / sum;
-        for (size_t i = 0; i < tag_num; ++i) {
-          for (size_t j = 0; j < tag_num; ++j) {
-            trans_grad[(i + state_trans_base_idx) * tag_num + j] +=
-                sum * w_exps[(i + state_trans_base_idx) * tag_num + j] *
-                alpha_mat(k - 1, i) * tmp_mat(k, j) * ll_grad;
-          }
-        }
-        trans_grad[(label_value[k - 1] + state_trans_base_idx) * tag_num +
-                   label_value[k]] -= static_cast<T>(ll_grad);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc
deleted file mode 100644
index f4aeb062d8dfae31a72b8ebccb3d377276662da6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/linspace_op.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/linspace_op.h"
-
-namespace paddle {
-namespace operators {
-
-class LinspaceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Start"),
-                   "Input(Start) of LinspaceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Stop"),
-                   "Input(Stop) of LinspaceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Num"),
-                   "Input(Num) of LinspaceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(OUt) of LinspaceOp should not be null.");
-
-    auto s_dims = ctx->GetInputDim("Start");
-    PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1),
-                   "The shape of Input(Start) should be [1].");
-
-    auto e_dims = ctx->GetInputDim("Stop");
-    PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1),
-                   "The shape of Input(Stop) should be [1].");
-
-    auto step_dims = ctx->GetInputDim("Num");
-    PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1),
-                   "The shape of Input(Num) should be [1].");
-
-    ctx->SetOutputDim("Out", {-1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>("Start")->type(), ctx.device_context(),
-        layout_, library_);
-  }
-};
-
-class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Start",
-             "First entry in the sequence. It is a tensor of shape [1], should "
-             "be of type float32 or float64.");
-    AddInput("Stop",
-             "Last entry in the sequence. It is a tensor of shape [1], should "
-             "be of type float32 or float64.");
-    AddInput("Num",
-             "Number of entry in the sequence. It is a tensor of shape [1], "
-             "should be of type int32.");
-    AddOutput("Out", "A sequence of numbers.");
-    AddComment(R"DOC(
-    Return fixed number of evenly spaced values within a given interval. First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker);
-REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel<float>,
-                       ops::CPULinspaceKernel<double>);
diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu
deleted file mode 100644
index 90bd17cda0e0d1f78810233537bb502f9115fbd0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/linspace_op.cu
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/linspace_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-template <typename T>
-__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
-  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
-}
-
-template <typename T>
-__global__ void LinspaceSpecialKernel(T start, T* out) {
-  out[0] = start;
-}
-
-template <typename T>
-class CUDALinspaceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* start_t = context.Input<framework::Tensor>("Start");
-    auto* stop_t = context.Input<framework::Tensor>("Stop");
-    auto* num_t = context.Input<framework::Tensor>("Num");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    framework::Tensor n;
-    framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
-    T start = n.data<T>()[0];
-    framework::TensorCopy(*stop_t, platform::CPUPlace(), &n);
-    T stop = n.data<T>()[0];
-    framework::TensorCopy(*num_t, platform::CPUPlace(), &n);
-    int32_t num = n.data<int32_t>()[0];
-
-    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
-
-    out->Resize(framework::make_ddim({num}));
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    T step = 0;
-    if (num != 1) {
-      step = (stop - start) / (num - 1);
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    int block = 512;
-    int grid = (num + block - 1) / block;
-    LinspaceKernel<T><<<grid, block, 0, stream>>>(start, step, num, out_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel<float>,
-                        ops::CUDALinspaceKernel<double>);
diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h
deleted file mode 100644
index b1fcac73b0ad249aa19859bde770a8554cdb7408..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/linspace_op.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <functional>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CPULinspaceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
-    T stop = context.Input<framework::Tensor>("Stop")->data<T>()[0];
-    int32_t num = context.Input<framework::Tensor>("Num")->data<int32_t>()[0];
-    auto* out = context.Output<framework::Tensor>("Out");
-    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
-
-    out->Resize(framework::make_ddim({num}));
-
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    if (num > 1) {
-      T step = (stop - start) / (num - 1);
-      T value = start;
-      for (int i = 0; i < num; ++i) {
-        out_data[i] = value;
-        value += step;
-      }
-    } else {
-      out_data[0] = start;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
deleted file mode 100644
index 63d3f809f263588bc1fbcd9ee4305e2ce9321e38..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/load_combine_op.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/load_combine_op.h"
-
-namespace paddle {
-namespace operators {
-
-class LoadCombineOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {}
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        framework::proto::VarType::FP32, ctx.GetPlace());
-    return kt;
-  }
-};
-
-class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput(
-        "Out",
-        "(vector) The output LoDTensors that will be read from the input file.")
-        .AsDuplicable();
-    AddAttr<bool>(
-        "load_as_fp16",
-        "(boolean, default false)"
-        "If true, the tensor will be first loaded and then "
-        "converted to float16 data type. Otherwise, the tensor will be "
-        "directly loaded without data type conversion.")
-        .SetDefault(false);
-    AddAttr<std::string>("file_path",
-                         "(string) "
-                         "LoDTensors will be loaded from \"file_path\".")
-        .AddCustomChecker(
-            [](const std::string &path) { return !path.empty(); });
-    AddAttr<bool>("model_from_memory",
-                  "(boolean, default false)"
-                  "If true, file_path is in memory, and LoDTensors will be "
-                  "loaded directly from memory")
-        .SetDefault(false);
-    AddComment(R"DOC(
-LoadCombine Operator.
-
-LoadCombine operator loads LoDTensor variables from a file, which could be
-loaded in memory already. The file should contain one or more LoDTensors
-serialized using the SaveCombine operator. The
-LoadCombine operator applies a deserialization strategy to appropriately load
-the LodTensors, and this strategy complements the serialization strategy used
-in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
-with the SaveCombine operator, and can only deserialize one or more LoDTensors
-that were saved using the SaveCombine operator.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
-                  ops::LoadCombineOpProtoMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    load_combine,
-    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_combine_op.cu b/paddle/fluid/operators/load_combine_op.cu
deleted file mode 100644
index 2a42c0daa7fc58165e85d851c602a65ec287c905..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/load_combine_op.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/load_combine_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    load_combine,
-    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h
deleted file mode 100644
index 9f6565ddf3aa71db4e7504e91f72c852984a39b7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/load_combine_op.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <fstream>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class LoadCombineOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto place = ctx.GetPlace();
-    auto filename = ctx.Attr<std::string>("file_path");
-    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
-    auto model_from_memory = ctx.Attr<bool>("model_from_memory");
-    auto &out_var_names = ctx.Outputs("Out");
-
-    PADDLE_ENFORCE_GT(
-        static_cast<int>(out_var_names.size()), 0,
-        "The number of output variables should be greater than 0.");
-    if (!model_from_memory) {
-      std::ifstream fin(filename, std::ios::binary);
-      PADDLE_ENFORCE(static_cast<bool>(fin),
-                     "OP(LoadCombine) fail to open file %s, please check "
-                     "whether the model file is complete or damaged.",
-                     filename);
-      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
-    } else {
-      PADDLE_ENFORCE(!filename.empty(),
-                     "OP(LoadCombine) fail to open file %s, please check "
-                     "whether the model file is complete or damaged.",
-                     filename);
-      std::stringstream fin(filename, std::ios::in | std::ios::binary);
-      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
-    }
-  }
-
-  void LoadParamsFromBuffer(
-      const framework::ExecutionContext &context, const platform::Place &place,
-      std::istream *buffer, bool load_as_fp16,
-      const std::vector<std::string> &out_var_names) const {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    auto out_vars = context.MultiOutputVar("Out");
-
-    for (size_t i = 0; i < out_var_names.size(); i++) {
-      PADDLE_ENFORCE(out_vars[i] != nullptr,
-                     "Output variable %s cannot be found", out_var_names[i]);
-
-      auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
-
-      // Error checking
-      PADDLE_ENFORCE(
-          static_cast<bool>(*buffer),
-          "There is a problem with loading model parameters. "
-          "Please check whether the model file is complete or damaged.");
-
-      // Get data from fin to tensor
-      DeserializeFromStream(*buffer, tensor, dev_ctx);
-
-      auto in_dtype = tensor->type();
-      auto out_dtype =
-          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-      if (in_dtype != out_dtype) {
-        // convert to float16 tensor
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor fp16_tensor;
-        // copy LoD info to the new tensor
-        fp16_tensor.set_lod(tensor->lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                                 &fp16_tensor);
-
-        // reset output tensor
-        out_vars[i]->Clear();
-        tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(fp16_tensor.lod());
-        tensor->ShareDataWith(fp16_tensor);
-      }
-    }
-    buffer->peek();
-    PADDLE_ENFORCE(buffer->eof(),
-                   "You are not allowed to load partial data via "
-                   "load_combine_op, use load_op instead.");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
deleted file mode 100644
index 435c755df3642ae0ba5144a89ed30ed6e0b63258..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/load_op.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-
-#include "paddle/fluid/operators/load_op.h"
-
-namespace paddle {
-namespace operators {
-
-class LoadOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {}
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        framework::proto::VarType::FP32, ctx.GetPlace());
-    return kt;
-  }
-};
-
-class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput("Out", "The LoDTensor / SelectedRows need to be loaded");
-    AddAttr<bool>(
-        "load_as_fp16",
-        "If true, the tensor will be first loaded and then "
-        "converted to float16 data type. Otherwise, the tensor will be "
-        "directly loaded without data type conversion. Default is false.")
-        .SetDefault(false);
-    AddAttr<std::string>("file_path",
-                         R"(Variable will be loaded from "file_path")")
-        .AddCustomChecker(
-            [](const std::string &path) { return !path.empty(); });
-    AddComment(
-        "Load operator will load a LoDTensor / SelectedRows variable from disk "
-        "file.");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op.cu b/paddle/fluid/operators/load_op.cu
deleted file mode 100644
index 90f78110f8f349ebc834570c4fb9f15af24b144d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/load_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/load_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    load, ops::LoadOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h
deleted file mode 100644
index 3bf3c6bed2f0ddf352a2bad65b0d710097016b28..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/load_op.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <fstream>
-#include <string>
-
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class LoadOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto place = ctx.GetPlace();
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    auto filename = ctx.Attr<std::string>("file_path");
-    std::ifstream fin(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
-                   filename);
-
-    auto out_var_name = ctx.Outputs("Out").data();
-    auto *out_var = ctx.OutputVar("Out");
-
-    PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found ",
-                   out_var_name);
-
-    PADDLE_ENFORCE(out_var != nullptr, "Output variable cannot be found ");
-
-    if (out_var->IsType<framework::LoDTensor>()) {
-      LoadLodTensor(fin, place, out_var, ctx);
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      LoadSelectedRows(fin, place, out_var);
-    } else {
-      PADDLE_ENFORCE(
-          false,
-          "Load only support LoDTensor and SelectedRows, %s has wrong type",
-          out_var_name);
-    }
-  }
-
-  void LoadLodTensor(std::istream &fin, const platform::Place &place,
-                     framework::Variable *var,
-                     const framework::ExecutionContext &ctx) const {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-    DeserializeFromStream(fin, tensor, dev_ctx);
-
-    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
-    auto in_dtype = tensor->type();
-    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-    if (in_dtype != out_dtype) {
-      // convert to float16 tensor
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor fp16_tensor;
-      // copy LoD info to the new tensor
-      fp16_tensor.set_lod(tensor->lod());
-      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                               &fp16_tensor);
-
-      // reset output tensor
-      var->Clear();
-      tensor = var->GetMutable<framework::LoDTensor>();
-      tensor->set_lod(fp16_tensor.lod());
-      tensor->ShareDataWith(fp16_tensor);
-    }
-  }
-
-  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
-                        framework::Variable *var) const {
-    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
-    selectedRows->SyncIndex();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc
deleted file mode 100644
index e4551b8ba681fe92ac5f21bb0b509f43439f6b66..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lod_array_length_op.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class LoDArrayLengthOp : public framework::OperatorBase {
- public:
-  LoDArrayLengthOp(const std::string &type,
-                   const framework::VariableNameMap &inputs,
-                   const framework::VariableNameMap &outputs,
-                   const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
-    auto &out =
-        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-    out.Resize({1});
-    auto cpu = platform::CPUPlace();
-    *out.mutable_data<int64_t>(cpu) = static_cast<int64_t>(x.size());
-  }
-};
-
-class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(LoDTensorArray) The input tensor array.");
-    AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t");
-    AddComment(R"DOC(
-LoDArrayLength Operator.
-
-This operator obtains the length of lod tensor array:
-
-$$Out = len(X)$$
-
-NOTE: The output is a CPU Tensor since the control variable should be only in
-CPU and the length of LoDTensorArray should be used as control variables.
-
-)DOC");
-  }
-};
-
-class LoDArrayLengthInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"));
-    PADDLE_ENFORCE(context->HasOutput("Out"));
-    context->SetOutputDim("Out", {1});
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lod_array_length, ops::LoDArrayLengthOp,
-                  ops::LoDArrayLengthInferShape, ops::LoDArrayLengthProtoMaker,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
deleted file mode 100644
index 0a43ac0c52f9bc98eacf743480166682482cc3c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace operators {
-
-class LoDRankTableOp : public framework::OperatorBase {
- public:
-  LoDRankTableOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
-    auto *out =
-        scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
-    VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
-    out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
-    VLOG(10) << Input("X") << "'s lod information is " << *out;
-  }
-};
-
-class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor) input lod tensor, must contain lod information.");
-    AddOutput("Out", "(LoDRankTable) The rank table of specific level.");
-    AddAttr<int>("level", "(int) the specific lod level to rank.")
-        .SetDefault(0)
-        .EqualGreaterThan(0);
-    AddComment(R"DOC(Create LoDRanTable by LoDTensor
-
-LoD Rank Table stores the `level` of `lod` which is ordered by sequence
-length in descending order. It is useful when implement dynamic RNN and is
-shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice
-output operators.
-)DOC");
-  }
-};
-
-class LoDRankTableInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"), "LoDRankTable must has input X");
-  }
-};
-
-class LoDRankTableInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    for (auto &o : ctx->Output("Out")) {
-      ctx->SetType(o, framework::proto::VarType::LOD_RANK_TABLE);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(lod_rank_table, paddle::operators::LoDRankTableOp,
-                  paddle::operators::LoDRankTableOpProtoMaker,
-                  paddle::operators::LoDRankTableInferShape,
-                  paddle::operators::LoDRankTableInferVarType,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
deleted file mode 100644
index 409f8397eb22cfcf11d7485d86b5b4b9bdddd81e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ /dev/null
@@ -1,218 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lod_reset_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class LoDResetOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of LoDResetOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LoDResetOp should not be null.");
-
-    if (!ctx->HasInput("Y")) {
-      auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
-      PADDLE_ENFORCE_GT(level0.size(), 0,
-                        "If Input(Y) not provided, the target lod should be "
-                        "specified by attribute `target_lod`.");
-    } else if (ctx->IsRuntime()) {
-      ctx->ShareLoD("Y", "Out");
-    }
-    auto append = ctx->Attrs().Get<bool>("append");
-    if (append) {
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class LoDResetOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto x_var_name = ctx->Input("X").front();
-    auto out_var_name = ctx->Output("Out").front();
-    bool append = boost::get<bool>(ctx->GetAttr("append"));
-    if (ctx->HasInput("Y")) {
-      auto y_var_name = ctx->Input("Y").front();
-      auto y_lod_level = std::max(ctx->GetLoDLevel(y_var_name), 1);
-      ctx->SetLoDLevel(out_var_name, y_lod_level);
-    } else if (append) {
-      auto x_lod_level = std::max(ctx->GetLoDLevel(x_var_name), 1);
-      ctx->SetLoDLevel(out_var_name, x_lod_level);
-    } else {
-      ctx->SetLoDLevel(out_var_name, 1);
-    }
-    ctx->SetDataType(out_var_name, ctx->GetDataType(x_var_name));
-    ctx->SetType(out_var_name, paddle::framework::proto::VarType::LOD_TENSOR);
-  }
-};
-
-class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, LoDTensor) Input variable of LoDResetOp which "
-             "could be a Tensor or LoDTensor, where the data of output "
-             "variable inherits from.");
-    AddInput("Y",
-             "(Tensor, LoDTensor, optional) If provided and Y is LoDTensor, "
-             "lod of Input(Y) would be considered as the target lod first, "
-             "otherwise data of Input(Y) would be considered as the "
-             "target lod.")
-        .AsDispensable();
-    AddOutput("Out",
-              "(LoDTensor) Output variable of LoDResetOp which should be a "
-              "LoDTensor.");
-    AddAttr<std::vector<int>>("target_lod",
-                              "The target level 0 LoD from Attr().")
-        .SetDefault(std::vector<int>{});
-    AddAttr<bool>("append", "Append data to lod vector.").SetDefault(false);
-    AddComment(R"DOC(LoDReset operator
-
-Set LoD of `X` to a new one specified by `Y` or attribute `target_lod`. When `Y`
-provided and `Y` is a LoDTensor, `Y.lod` would be considered as target LoD
-first, otherwise `Y.data` would be considered as target LoD. If `Y` is not
-provided, target LoD should be specified by attribute `target_lod`.
-If target LoD is specified by `Y.data` or `target_lod`, only one level LoD
-is supported.
-
-Example 1:
-
-Given a 1-level LoDTensor input(X):
-    X.lod =  [[ 0,     2,                   5      6 ]]
-    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-    X.dims = [6, 1]
-
-attr(target_lod): [0, 4, 6]
-
-then we get a 1-level LoDTensor:
-    Out.lod =  [[ 0,                   4,            6 ]]
-    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-    Out.dims = [6, 1]
-
-Example 2:
-
-Given a 1-level LoDTensor input(X):
-    X.lod =  [[ 0,     2,                   5      6 ]]
-    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-    X.dims = [6, 1]
-
-input(Y) is a Tensor:
-    Y.data = [[0, 2, 6]]
-    Y.dims = [1, 3]
-
-then we get a 1-level LoDTensor:
-    Out.lod =  [[ 0,     2,                          6 ]]
-    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-    Out.dims = [6, 1]
-
-Example 3:
-
-Given a 1-level LoDTensor input(X):
-    X.lod =  [[ 0,      2,                   5     6 ]]
-    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-    X.dims = [6, 1]
-
-input(Y) is a 2-level LoDTensor:
-    Y.lod =  [[0, 2, 4], [0, 2, 5, 6]]
-    Y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]]
-    Y.dims = [6, 1]
-
-then we get a 2-level LoDTensor:
-    Out.lod =  [[0, 2, 4], [0, 2, 5, 6]]
-    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-    Out.dims = [6, 1]
-
-)DOC");
-  }
-};
-
-class LoDResetGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of LoDResetGradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@Grad) of LoDResetGradOp should not be null.");
-
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
-      ctx->ShareLoD("X", /*->*/ x_grad_name);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class LoDResetGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("lod_reset_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("X", Input("X"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
-                                      "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
-                  ops::LoDResetGradDescMaker, ops::LoDResetOpVarTypeInference);
-REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
-                  ops::LoDResetGradNoNeedBufferVarInference);
-
-REGISTER_OP_CPU_KERNEL(
-    lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
-    ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
-    ops::LoDResetKernel<paddle::platform::CPUPlace, int>,
-    ops::LoDResetKernel<paddle::platform::CPUPlace, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
-    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>,
-    ops::LoDResetGradKernel<paddle::platform::CPUPlace, int>,
-    ops::LoDResetGradKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/fluid/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu
deleted file mode 100644
index 888d4c12eb4e3f4fd94d8dd4178c59acd0abb23b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lod_reset_op.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lod_reset_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    lod_reset, ops::LoDResetKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    lod_reset_grad,
-    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h
deleted file mode 100644
index d827f2a2ebbe99183b801ab1b9787c69a924c293..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lod_reset_op.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class LoDResetKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-    auto* in = ctx.Input<framework::LoDTensor>("X");
-    auto* lod_t = ctx.Input<framework::LoDTensor>("Y");
-    bool append = ctx.Attr<bool>("append");
-
-    out->ShareDataWith(*in);
-
-    std::vector<int> level0;
-    if (lod_t) {
-      if (lod_t->lod().size() > 0) {
-        auto y_lod = lod_t->lod();
-        auto last_level = y_lod[y_lod.size() - 1];
-        PADDLE_ENFORCE_EQ((int64_t)(last_level.back()), in->dims()[0],
-                          "Last value of `Y`'s last level LoD should be equal "
-                          "to the first dimension of `X`");
-        out->set_lod(y_lod);
-        return;  // early return, since lod already set
-      } else {
-        auto* lod = lod_t->data<int>();
-        if (platform::is_gpu_place(ctx.GetPlace())) {
-          framework::Tensor lod_cpu;
-          framework::TensorCopySync(*lod_t, platform::CPUPlace(), &lod_cpu);
-          lod = lod_cpu.data<int>();
-        }
-        level0 = std::vector<int>(lod, lod + lod_t->numel());
-      }
-    } else {
-      level0 = ctx.Attr<std::vector<int>>("target_lod");
-    }
-
-    PADDLE_ENFORCE_GT(level0.size(), 1UL,
-                      "Size of target LoD should be greater than 1.");
-    PADDLE_ENFORCE_EQ(level0[0], 0,
-                      "Target LoD should be a vector starting from 0.");
-    PADDLE_ENFORCE_EQ(level0.back(), in->dims()[0],
-                      "Target LoD should be a vector end with the "
-                      "first dimension of Input(X).");
-    for (size_t i = 0; i < level0.size() - 1; ++i) {
-      PADDLE_ENFORCE(level0[i + 1] >= level0[i],
-                     "Target LoD should be an ascending vector.");
-    }
-
-    // cast level0 to size_t
-    std::vector<size_t> ulevel0(level0.size(), 0);
-    std::transform(level0.begin(), level0.end(), ulevel0.begin(),
-                   [](int a) { return static_cast<size_t>(a); });
-    if (append) {
-      auto* out_lod = out->mutable_lod();
-      out_lod->push_back(ulevel0);
-    } else {
-      framework::LoD target_lod;
-      target_lod.push_back(ulevel0);
-      out->set_lod(target_lod);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LoDResetGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    d_x->ShareDataWith(*d_out);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
deleted file mode 100644
index 962822f33e6e69bfd1ee90a100473ba1c8185495..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <algorithm>
-#include <map>
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace operators {
-
-struct CopyRange {
-  size_t begin;
-  size_t end;
-};
-
-struct LoDTensorToArrayFunctor;
-
-template <typename DeviceContext>
-struct LoDTensorToArrayFunctorImpl {
-  const LoDTensorToArrayFunctor *prev_functor_;
-  DeviceContext *dev_ctx_;
-  template <typename T>
-  void apply();
-};
-
-struct LoDTensorToArrayFunctor : public boost::static_visitor<void> {
-  std::vector<const framework::Tensor *> ref_inputs_;
-  mutable std::vector<framework::Tensor *> outputs_;
-  const framework::Tensor &input_;
-
-  explicit LoDTensorToArrayFunctor(const framework::Tensor &input)
-      : input_(input) {}
-
-  void AddOutput(framework::Tensor *t) {
-    outputs_.emplace_back(t);
-    ref_inputs_.emplace_back(t);
-  }
-
-  template <typename Place>
-  void operator()(Place place) const {
-    auto &pool = platform::DeviceContextPool::Instance();
-    auto *dev_ctx = pool.Get(place);
-    if (std::is_same<Place, platform::CPUPlace>::value) {
-      Apply(static_cast<platform::CPUDeviceContext *>(dev_ctx));
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      Apply(static_cast<platform::CUDADeviceContext *>(dev_ctx));
-#else
-      PADDLE_THROW("Not compiled with cuda");
-#endif
-    }
-  }
-
-  template <typename DeviceContext>
-  void Apply(DeviceContext *dev_ctx) const {
-    LoDTensorToArrayFunctorImpl<DeviceContext> func;
-    func.prev_functor_ = this;
-    func.dev_ctx_ = dev_ctx;
-    framework::VisitDataType(input_.type(), func);
-  }
-};
-
-template <typename DeviceContext>
-template <typename T>
-void LoDTensorToArrayFunctorImpl<DeviceContext>::apply() {
-  math::SplitFunctor<DeviceContext, T> func;
-  func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0,
-       &prev_functor_->outputs_);
-}
-
-class LoDTensorToArrayOp : public framework::OperatorBase {
- public:
-  LoDTensorToArrayOp(const std::string &type,
-                     const framework::VariableNameMap &inputs,
-                     const framework::VariableNameMap &outputs,
-                     const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
-                          Input("X"))
-                  .Get<framework::LoDTensor>();
-    auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")))
-                           .Get<framework::LoDRankTable>();
-    auto &out = *detail::Ref(scope.FindVar(Output("Out")))
-                     .GetMutable<framework::LoDTensorArray>();
-    auto &items = rank_table.items();
-    auto max_seq_len = items[0].length;
-    auto rank_level = rank_table.level();
-
-    PADDLE_ENFORCE_LT(rank_level, x.lod().size(),
-                      "Input should be a LOD tensor, and size is at least %d",
-                      rank_level + 1);
-    out.resize(max_seq_len);
-    std::vector<std::vector<CopyRange>> copy_ranges(max_seq_len);
-
-    // set out[i] lod
-    for (size_t t = 0; t < max_seq_len; t++) {
-      auto &lod = *out[t].mutable_lod();
-      lod.clear();
-      for (auto &item : items) {
-        if (t >= item.length) {
-          break;
-        }
-        size_t start_idx = x.lod()[rank_level][item.index] + t;
-        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
-            x.lod(), start_idx, start_idx + 1, rank_level + 1);
-        auto &lod_length = lod_and_offset.first;
-        framework::AppendLoD(&lod, lod_length);
-        size_t start_offset = lod_and_offset.second.first;
-        size_t end_offset = lod_and_offset.second.second;
-        copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
-      }
-    }
-
-    std::map<size_t, framework::Tensor> outputs;
-
-    for (size_t i = 0; i < max_seq_len; ++i) {
-      auto &ranges = copy_ranges[i];
-      size_t height = std::accumulate(
-          ranges.begin(), ranges.end(), 0UL,
-          [](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
-      auto x_dim = x.dims();
-      x_dim[0] = static_cast<int64_t>(height);
-      out[i].Resize(x_dim);
-      out[i].mutable_data(x.place(), x.type());
-      size_t offset = 0;
-      for (auto &each_range : ranges) {
-        size_t len = each_range.end - each_range.begin;
-        if (len == 0) {
-          continue;
-        }
-        // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
-        auto slice = out[i].Slice(static_cast<int>(offset),
-                                  static_cast<int>(offset + len));
-        outputs.insert({each_range.begin, slice});
-        offset += len;
-      }
-    }
-
-    LoDTensorToArrayFunctor functor(x);
-    for (auto &out_pair : outputs) {
-      functor.AddOutput(&out_pair.second);
-    }
-    platform::VisitPlace(place, functor);
-  }
-};
-
-class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "");
-    AddInput("RankTable", "");
-    AddOutput("Out", "");
-    AddComment("");
-  }
-};
-
-class LoDTensorToArrayInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"),
-                   "Input(X) of LoDTensorToArrayOp should not be null.");
-    PADDLE_ENFORCE(
-        context->HasInput("RankTable"),
-        "Input(RankTable) of LoDTensorToArrayOp should not be null.");
-
-    PADDLE_ENFORCE(context->HasOutput("Out"),
-                   "Output(Out) of LoDTensorToArrayOp should not be null.");
-
-    auto x_dim = context->GetInputDim("X");
-    // The first dim of each LoDTensor in Output can only be set at run-time.;
-    // We still have to Resize each LoDTensor in Output.
-    context->SetOutputDim("Out", x_dim);
-    // The lod level should be passed to out in compile time.
-    if (!context->IsRuntime()) {
-      context->DecreaseLoDLevel("X", /*->*/ "Out");
-    }
-  }
-};
-
-class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    for (auto &out_var : ctx->Output("Out")) {
-      ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
-    }
-  }
-};
-
-class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("array_to_lod_tensor");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetInput("RankTable", Input("RankTable"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lod_tensor_to_array, ops::LoDTensorToArrayOp,
-                  ops::LoDTensorToArrayOpProtoMaker,
-                  ops::LoDTensorToArrayInferShape,
-                  ops::LoDTensorToArrayInferVarType,
-                  ops::LoDTensorToArrayGradMaker);
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
deleted file mode 100644
index 0048c75ccf04687b42f990dc5aa79541359645c1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/log_loss_op.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/log_loss_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class LogLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
-                   "Input(Predicted) must be initialized.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) must be initialized.");
-
-    auto pred_dims = ctx->GetInputDim("Predicted");
-    auto label_dims = ctx->GetInputDim("Labels");
-
-    if (ctx->IsRuntime() || (framework::product(pred_dims) > 0 &&
-                             framework::product(label_dims) > 0)) {
-      PADDLE_ENFORCE_EQ(pred_dims, label_dims);
-    }
-    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
-                      "The rank of Input(Predicted) must be 2 and the shape is "
-                      "[batch_size, 1].");
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(pred_dims[1], 1,
-                        "Each row of Input(Predicted) contains a real value, "
-                        "so the 2nd dimension of Input(X) must be 1.");
-    }
-    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
-    ctx->ShareLoD("Predicted", "Loss");
-  }
-};
-
-template <typename AttrType>
-class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Predicted",
-             "The input value (Predicted) of Log loss op."
-             "Predicted is a 2-D tensor with shape [batch_size, 1].");
-    AddInput("Labels",
-             "The target value (Labels) of Log loss op."
-             "Labels is a 2-D tensor with shape [batch_size, 1].");
-    AddOutput("Loss",
-              "The output tensor with shape [batch_size, 1] "
-              "which represents the log loss.");
-    AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
-    AddComment(R"DOC(
-LogLoss Operator.
-
-Log loss is a loss function used for binary classification. Log Loss quantifies
-the accuracy of a classifier by penalising false classifications. Minimising the
-Log Loss is equivalent to maximising the accuracy of the classifier. We define
-Predicted as the values predicted by our model and Labels as the target ground
-truth value. Log loss can evaluate how close the predicted values are to the
-target. The shapes of Predicted and Labels are both [batch_size, 1].
-The equation is:
-
-$$
-Loss = - Labels * log(Predicted + \epsilon) -
-        (1 - Labels) * log(1 - Predicted + \epsilon)
-$$
-
-)DOC");
-  }
-};
-
-class LogLossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
-                   "Input(Predicted) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
-                   "Input(Loss@GRAD) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
-                   "Output(Predicted@GRAD) should not be null.");
-
-    auto pred_dims = ctx->GetInputDim("Predicted");
-    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
-    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
-
-    auto pred_grad_name = framework::GradVarName("Predicted");
-    ctx->SetOutputDim(pred_grad_name, pred_dims);
-  }
-};
-
-class LogLossGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("log_loss_grad");
-    op->SetInput("Predicted", Input("Predicted"));
-    op->SetInput("Labels", Input("Labels"));
-    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
-    op->SetOutput(framework::GradVarName("Predicted"), InputGrad("Predicted"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
-                  ops::LogLossGradDescMaker);
-REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp);
-REGISTER_OP_CPU_KERNEL(
-    log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    log_loss_grad,
-    ops::LogLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu
deleted file mode 100644
index 280913c43a2749ddd5fbd3ae1905f1b823dd525d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/log_loss_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/log_loss_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    log_loss, ops::LogLossKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    log_loss_grad,
-    ops::LogLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h
deleted file mode 100644
index e62de17a98603109786e49725537867c3fe7831a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/log_loss_op.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class LogLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* loss_out = ctx.Output<Tensor>("Loss");
-
-    loss_out->mutable_data<T>(ctx.GetPlace());
-
-    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
-
-    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
-    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
-
-    auto loss = EigenVector<T>::Flatten(*loss_out);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    loss.device(place) = (-(label * (prediction + epsilon).log()) -
-                          ((static_cast<T>(1) - label) *
-                           (static_cast<T>(1) - prediction + epsilon).log()));
-  }
-};
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class LogLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
-
-    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
-    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
-
-    auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-    auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
-
-    auto dl = EigenVector<T>::Flatten(*dloss);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    if (dpred) {
-      dpred->mutable_data<T>(ctx.GetPlace());
-      auto dx = framework::EigenVector<T>::Flatten(*dpred);
-      dx.device(place) = dl * (-(label / (prediction + epsilon)) +
-                               ((static_cast<T>(1) - label) /
-                                (static_cast<T>(1) - prediction + epsilon)));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc
deleted file mode 100644
index 4840a7ac1e79840f314cb40365d25332a1292c46..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lookup_sparse_table_op.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-constexpr int64_t kNoPadding = -1;
-
-class LookupSparseTableInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LookupSparseTableOp should not be null.");
-    auto shape_w = ctx->GetInputDim("W");
-    auto shape_ids = ctx->GetInputDim("Ids");
-    shape_w[0] = shape_ids.size();
-    ctx->SetOutputDim("Out", shape_w);
-  }
-};
-
-class LookupSparseTableOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto out_var = scope.FindVar(Output("Out"));
-    auto w_var = scope.FindVar(Input("W"));
-    auto ids_var = scope.FindVar(Input("Ids"));
-    auto is_test = Attr<bool>("is_test");
-
-    PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
-                   "The type of Out var should be LodTensor.");
-    PADDLE_ENFORCE(w_var->IsType<framework::SelectedRows>(),
-                   "The type of W var should be SelectedRows.");
-    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
-                   "The type of Ids var should be LoDTensor.");
-    auto &ids_t = ids_var->Get<framework::LoDTensor>();
-    auto out_t = out_var->GetMutable<framework::LoDTensor>();
-    auto w_t = w_var->GetMutable<framework::SelectedRows>();
-
-    // TODO(Yancey1989): support CUDA Place for the sparse table
-    platform::CPUPlace cpu;
-    auto out_shape = w_t->value().dims();
-    out_shape[0] = ids_t.numel();
-    out_t->Resize(out_shape);
-    out_t->mutable_data(cpu, w_t->value().type());
-    PADDLE_ENFORCE_EQ(w_t->value().type(), framework::proto::VarType::FP32,
-                      "The sparse table only support FP32");
-    w_t->Get(ids_t, out_t, true, is_test);
-    out_t->set_lod(ids_t.lod());
-  }
-};
-
-class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("W",
-             "(SelectedRows) The input represents embedding table, "
-             "which is a learnable parameter.");
-    AddInput("Ids",
-             "(LoDTensor) Ids's type should be LoDTensor"
-             "THe ids to be looked up in W.");
-    AddOutput("Out",
-              "(LoDTensor) The lookup results, which have the "
-              "same type as W.");
-    AddAttr<int64_t>("padding_idx",
-                     "(int64, default -1) "
-                     "If the value is -1, it makes no effect to lookup. "
-                     "Otherwise the given value indicates padding the output "
-                     "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(kNoPadding);
-    AddAttr<bool>("auto_grown_table",
-                  "(bool default false)"
-                  "Whether create new value if for nonexistent key.")
-        .SetDefault(true);
-    AddAttr<bool>("is_test",
-                  "In test mode, lookup_sparse_table will "
-                  "return a 0 for unknown id")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Lookup Sprase Tablel Operator.
-
-This operator is used to perform lookup on parameter W,
-then concatenated into a sparse tensor.
-
-The type of Ids(Input) is SelectedRows, the rows of Ids contains
-the ids to be looked up in W;
-if the Id is not in the sparse table, this operator will return a
-random value and set the value into the table for the next looking up.
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lookup_sparse_table, ops::LookupSparseTableOp,
-                  ops::LookupSparseTableInferShape,
-                  ops::LookupSparseTableOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
deleted file mode 100644
index 8b7d7a52704d5452487373d38d75626ea2b239c8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lookup_table_op.h"
-
-#include <memory>
-
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-
-namespace paddle {
-namespace operators {
-
-class LookupTableOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input(W) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Ids"),
-                   "Input(Ids) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LookupTableOp should not be null.");
-
-    auto table_dims = ctx->GetInputDim("W");
-    auto ids_dims = ctx->GetInputDim("Ids");
-    int ids_rank = ids_dims.size();
-    VLOG(5) << "ids rank is " << ids_rank << std::endl;
-    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
-    PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
-                      "The last dimension of the 'Ids' tensor must be 1.");
-
-    auto output_dims =
-        framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
-    output_dims.push_back(table_dims[1]);
-    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
-
-    if (ctx->GetOutputsVarType("Out")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      ctx->ShareLoD("Ids", /*->*/ "Out");
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("W",
-             "(Tensor) The input represents embedding tensors, "
-             "which is a learnable parameter.");
-    AddInput("Ids",
-             "An input with type int32 or int64 "
-             "contains the ids to be looked up in W. "
-             "The last dimension size must be 1.");
-    AddOutput("Out", "The lookup results, which have the same type as W.");
-    AddAttr<bool>("is_sparse",
-                  "(boolean, default false) "
-                  "Sparse update.")
-        .SetDefault(false);
-    AddAttr<bool>("is_distributed",
-                  "(boolean, default false) distributed lookup table.")
-        .SetDefault(false);
-    AddAttr<int64_t>("padding_idx",
-                     "(int64, default -1) "
-                     "If the value is -1, it makes no effect to lookup. "
-                     "Otherwise the given value indicates padding the output "
-                     "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(kNoPadding);
-    // NOTE(minqiyang): grad_inplace is an temporal attribute,
-    // please do NOT set this attribute in python layer.
-    AddAttr<bool>("grad_inplace",
-                  "(boolean, default false) "
-                  "If the grad op reuse the input's variable.")
-        .SetDefault(false);
-
-    // for parameter prefetch
-    AddAttr<bool>("remote_prefetch", "").SetDefault(false);
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<int64_t>>("height_sections",
-                                  "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int64_t>({}));
-    AddAttr<std::vector<std::string>>(
-        "epmap",
-        "(string vector, default 127.0.0.1:6164)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "table_names",
-        "(string vector, the splited table names that will be fetched from "
-        "parameter server)"
-        "in the order of input variables for mapping")
-        .SetDefault({});
-
-    AddComment(R"DOC(
-Lookup Table Operator.
-
-This operator is used to perform lookups on the parameter W,
-then concatenated into a dense tensor.
-
-The input Ids can carry the LoD (Level of Details) information,
-or not. And the output only shares the LoD information with input Ids.
-
-)DOC");
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LookupTableGradOpNoBuffer, "W");
-
-class LookupTableGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("lookup_table_grad");
-
-    op->SetInput("W", Input("W"));
-    op->SetInput("Ids", Input("Ids"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-
-    op->SetOutput(framework::GradVarName("W"), InputGrad("W"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class LookupTableOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto table_dims = ctx->GetInputDim("W");
-    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(
-        ctx.InputVar(framework::GradVarName("Out")));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
-    auto attr = ctx->GetAttr("is_sparse");
-    bool is_sparse = boost::get<bool>(attr);
-    if (is_sparse) {
-      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
-              << " is set to SelectedRows";
-      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
-    } else {
-      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
-              << " is set to LoDTensor";
-      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
-    }
-    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker,
-                  ops::LookupTableGradOpDescMaker);
-
-REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad,
-                  ops::LookupTableGradOpNoBuffer,
-                  ops::LookupTableOpGradVarTypeInference);
-
-REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>,
-                       ops::LookupTableKernel<double>);
-REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>,
-                       ops::LookupTableGradKernel<double>);
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
deleted file mode 100644
index f9e12e01c13d2a9da2668c80e8479b09a31f280b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/lookup_table_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
-          bool PaddingFlag>
-__global__ void LookupTable(T *output, const T *table, const int64_t *ids,
-                            const int64_t N, const int64_t K, const int64_t D,
-                            const int64_t padding_idx) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
-
-  while (idy < K) {
-    int64_t id = ids[idy];
-    PADDLE_ENFORCE(
-        id >= 0,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    PADDLE_ENFORCE(
-        id < N,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    T *out = output + idy * D;
-    const T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
-      if (PaddingFlag) {
-        if (id == padding_idx)
-          out[i] = static_cast<T>(0);
-        else
-          out[i] = tab[i];
-      } else {
-        out[i] = tab[i];
-      }
-    }
-    idy += BlockDimY * GridDimX;
-  }
-}
-
-template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
-__global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
-                                const int64_t N, const int64_t K,
-                                const int64_t D) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
-
-  while (idy < K) {
-    int64_t id = ids[idy];
-    PADDLE_ENFORCE(
-        id >= 0,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    PADDLE_ENFORCE(
-        id < N,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    const T *out = output + idy * D;
-    T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
-      paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
-    }
-    idy += BlockDimY * GridDimX;
-  }
-}
-
-template <typename T>
-class LookupTableCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *table_t = context.Input<LoDTensor>("W");
-    auto *ids_t = context.Input<LoDTensor>("Ids");
-    auto *output_t = context.Output<LoDTensor>("Out");
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-
-    auto id_name = context.Inputs("Ids").front();
-    auto out_name = context.Outputs("Out").front();
-
-    size_t N = table_t->dims()[0];
-    size_t D = table_t->dims()[1];
-    size_t K = ids_t->numel();
-
-    auto *ids = ids_t->data<int64_t>();
-    auto *table = table_t->data<T>();
-    auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-    dim3 threads(128, 8);
-    dim3 grids(8, 1);
-
-    if (padding_idx == -1)
-      LookupTable<
-          T, 128, 8, 8,
-          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
-    else
-      LookupTable<
-          T, 128, 8, 8,
-          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
-  }
-};
-
-template <typename T>
-class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto &dev_ctx =
-        context.template device_context<platform::CUDADeviceContext>();
-    bool is_sparse = context.Attr<bool>("is_sparse");
-
-    // Since paddings are not trainable and fixed in forward, the gradient of
-    // paddings makes no sense and we don't deal with it in backward.
-    if (is_sparse) {
-      auto *ids = context.Input<LoDTensor>("Ids");
-      auto *table = context.Input<LoDTensor>("W");
-      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
-
-      auto *ids_data = ids->data<int64_t>();
-      int64_t ids_num = ids->numel();
-
-      auto stream = dev_ctx.stream();
-      // copy GPU memory to CPU pinned memory
-      framework::Vector<int64_t> new_rows;
-      new_rows.resize(ids_num);
-      auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
-
-      // TODO(yuyang18): Strange code here.
-      memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
-                   gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
-      d_table->set_rows(new_rows);
-
-      auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_num, table->dims()[1]});
-      d_table_value->mutable_data<T>(context.GetPlace());
-
-      auto *d_table_data = d_table_value->data<T>();
-      auto *d_output_data = d_output->data<T>();
-      auto d_output_dims = d_output->dims();
-      PADDLE_ENFORCE_EQ(
-          d_table_value->dims(),
-          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
-      memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
-                   d_output->numel() * sizeof(T), stream);
-
-    } else {
-      auto ids_t = context.Input<LoDTensor>("Ids");
-      auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto d_table_t = context.Output<LoDTensor>(framework::GradVarName("W"));
-
-      int N = d_table_t->dims()[0];
-      int D = d_table_t->dims()[1];
-      int K = ids_t->numel();
-      const int64_t *ids = ids_t->data<int64_t>();
-      const T *d_output = d_output_t->data<T>();
-      T *d_table = d_table_t->mutable_data<T>(context.GetPlace());
-
-      auto t = framework::EigenVector<T>::Flatten(*d_table_t);
-      t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
-
-      dim3 threads(128, 8);
-      dim3 grids(8, 1);
-      LookupTableGrad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
-          d_table, d_output, ids, N, K, D);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>,
-                        ops::LookupTableCUDAKernel<double>,
-                        ops::LookupTableCUDAKernel<plat::float16>);
-REGISTER_OP_CUDA_KERNEL(lookup_table_grad,
-                        ops::LookupTableGradCUDAKernel<float>,
-                        ops::LookupTableGradCUDAKernel<double>,
-                        ops::LookupTableGradCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
deleted file mode 100644
index 4863ed17424cdcc1bece27770722cc8359be2f92..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lookup_table_op.h
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-
-constexpr int64_t kNoPadding = -1;
-
-template <typename T>
-class LookupTableKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *ids_t = context.Input<LoDTensor>("Ids");      // int tensor
-    auto *output_t = context.Output<LoDTensor>("Out");  // float tensor
-    auto *table_var = context.InputVar("W");
-
-    auto id_name = context.Inputs("Ids").front();
-    auto embedding_name = context.Inputs("W").front();
-    auto out_name = context.Outputs("Out").front();
-
-    // for remote prefetch
-    auto epmap = context.Attr<std::vector<std::string>>("epmap");
-    auto remote_prefetch = context.Attr<bool>("remote_prefetch");
-    auto height_sections =
-        context.Attr<std::vector<int64_t>>("height_sections");
-    auto table_names = context.Attr<std::vector<std::string>>("table_names");
-
-    if (remote_prefetch && !epmap.empty()) {
-// if epmap is not empty, then the parameter will be fetched from remote
-// parameter server
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-      operators::distributed::prefetch(id_name, out_name, embedding_name, false,
-                                       table_names, epmap, height_sections,
-                                       context, context.scope());
-#else
-      PADDLE_THROW(
-          "paddle is not compiled with distribute support, can not do "
-          "parameter prefetch!");
-#endif
-    } else {
-      int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-      int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
-      int64_t ids_numel = ids_t->numel();
-
-      if (table_var->IsType<LoDTensor>()) {
-        auto *table_t = context.Input<LoDTensor>("W");
-        int64_t row_number = table_t->dims()[0];
-        int64_t row_width = table_t->dims()[1];
-
-        auto *table = table_t->data<T>();
-        auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-        for (int64_t i = 0; i < ids_numel; ++i) {
-          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-            memset(output + i * row_width, 0, row_width * sizeof(T));
-          } else {
-            PADDLE_ENFORCE_LT(
-                ids[i], row_number,
-                "Variable value (input) of OP(fluid.layers.embedding) "
-                "expected >= 0 and < %ld, but got %ld. Please check input "
-                "value.",
-                row_number, ids[i]);
-            PADDLE_ENFORCE_GE(
-                ids[i], 0,
-                "Variable value (input) of OP(fluid.layers.embedding) "
-                "expected >= 0 and < %ld, but got %ld. Please check input "
-                "value.",
-                row_number, ids[i]);
-            memcpy(output + i * row_width, table + ids[i] * row_width,
-                   row_width * sizeof(T));
-          }
-        }
-      } else if (table_var->IsType<SelectedRows>()) {
-        const auto &table_t = table_var->Get<SelectedRows>();
-        int64_t row_width = table_t.value().dims()[1];
-        const auto *table = table_t.value().data<T>();
-        auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-        auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-        for (int64_t i = 0; i < ids_numel; ++i) {
-          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-            memset(output + i * row_width, 0, row_width * sizeof(T));
-          } else {
-            PADDLE_ENFORCE_GE(ids[i], 0);
-            auto id_index = table_t.Index(ids[i]);
-            PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
-            blas.VCOPY(row_width, table + id_index * row_width,
-                       output + i * row_width);
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class LookupTableGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *table_var = context.InputVar("W");
-    DDim table_dim;
-    if (table_var->IsType<LoDTensor>()) {
-      table_dim = context.Input<LoDTensor>("W")->dims();
-    } else if (table_var->IsType<SelectedRows>()) {
-      auto *table_t = context.Input<SelectedRows>("W");
-      table_dim = table_t->value().dims();
-    } else {
-      PADDLE_THROW(
-          "The parameter W of a LookupTable "
-          "must be either LoDTensor or SelectedRows");
-    }
-
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-    bool is_sparse = context.Attr<bool>("is_sparse");
-    // Since paddings are not trainable and fixed in forward, the gradient of
-    // paddings makes no sense and we don't deal with it in backward.
-    if (is_sparse) {
-      auto *ids = context.Input<LoDTensor>("Ids");
-      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
-
-      auto *ids_data = ids->data<int64_t>();
-      int64_t ids_num = ids->numel();
-
-      std::vector<int64_t> new_rows;
-      new_rows.resize(ids_num);
-      std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
-      d_table->set_rows(new_rows);
-
-      auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_num, table_dim[1]});
-      // FIXME(minqiyang):
-      // memory optimization will NOT reuse Tensor with SelectedRows
-      // so we could just share the tensor here directly.
-      // However, the InferVarType method will infer the output SelectedRows
-      // to Tensor sometimes, which is a bug, so we will add an attribute
-      // here to indicate the inplace and remove this attribute after
-      // the InferVarType's bug was fixed
-      bool grad_inplace = context.Attr<bool>("grad_inplace");
-      if (grad_inplace) {
-        d_table_value->ShareDataWith(*d_output);
-      } else {
-        d_table_value->mutable_data<T>(context.GetPlace());
-
-        d_table->set_height(table_dim[0]);
-
-        auto *d_output_data = d_output->data<T>();
-        auto *d_table_data = d_table_value->data<T>();
-
-        auto d_output_dims = d_output->dims();
-        PADDLE_ENFORCE_EQ(
-            d_table_value->dims(),
-            framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
-        memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
-      }
-    } else {
-      auto *ids = context.Input<LoDTensor>("Ids");
-      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
-
-      auto *ids_data = ids->data<int64_t>();
-
-      int64_t N = table_dim[0];
-      int64_t D = table_dim[1];
-
-      auto *d_output_data = d_output->data<T>();
-      auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
-
-      memset(d_table_data, 0, d_table->numel() * sizeof(T));
-
-      for (int64_t i = 0; i < ids->numel(); ++i) {
-        if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
-          // the gradient of padding_idx should be 0, already done by memset, so
-          // do nothing.
-        } else {
-          PADDLE_ENFORCE_LT(
-              ids_data[i], N,
-              "Variable value (input) of OP(fluid.layers.embedding) "
-              "expected >= 0 and < %ld, but got %ld. Please check input value.",
-              N, ids_data[i]);
-          PADDLE_ENFORCE_GE(
-              ids_data[i], 0,
-              "Variable value (input) of OP(fluid.layers.embedding) "
-              "expected >= 0 and < %ld, but got %ld. Please check input value.",
-              N, ids_data[i]);
-          for (int j = 0; j < D; ++j) {
-            d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc
deleted file mode 100644
index f1b982356a80ff299b16550b7f7eb57122ced418..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lookup_table_v2_op.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lookup_table_v2_op.h"
-
-#include <memory>
-
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-
-namespace paddle {
-namespace operators {
-
-class LookupTableV2Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
-                      "Input(W) of LookupTableV2Op should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Ids"), true,
-                      "Input(Ids) of LookupTableV2Op should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of LookupTableV2Op should not be null.");
-
-    auto table_dims = ctx->GetInputDim("W");
-    auto ids_dims = ctx->GetInputDim("Ids");
-    int ids_rank = ids_dims.size();
-    VLOG(5) << "ids rank is " << ids_rank << std::endl;
-    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
-
-    auto output_dims = framework::vectorize(ids_dims);
-    output_dims.push_back(table_dims[1]);
-    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
-
-    if (ctx->GetOutputsVarType("Out")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      ctx->ShareLoD("Ids", /*->*/ "Out");
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class LookupTableV2OpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("W",
-             "(Tensor) The input represents embedding tensors, "
-             "which is a learnable parameter.");
-    AddInput("Ids",
-             "An input with type int32 or int64 "
-             "contains the ids to be looked up in W. "
-             "The last dimension size must be 1.");
-    AddOutput("Out", "The lookup results, which have the same type as W.");
-    AddAttr<bool>("is_sparse",
-                  "(boolean, default false) "
-                  "Sparse update.")
-        .SetDefault(false);
-    AddAttr<bool>("is_distributed",
-                  "(boolean, default false) distributed lookup table.")
-        .SetDefault(false);
-    AddAttr<int64_t>("padding_idx",
-                     "(int64, default -1) "
-                     "If the value is -1, it makes no effect to lookup. "
-                     "Otherwise the given value indicates padding the output "
-                     "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(kNoPadding);
-
-    // for parameter prefetch
-    AddAttr<bool>("remote_prefetch", "").SetDefault(false);
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<int64_t>>("height_sections",
-                                  "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int64_t>({}));
-    AddAttr<std::vector<std::string>>(
-        "epmap",
-        "(string vector, default 127.0.0.1:6164)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "table_names",
-        "(string vector, the splited table names that will be fetched from "
-        "parameter server)"
-        "in the order of input variables for mapping")
-        .SetDefault({});
-
-    AddComment(R"DOC(
-Lookup Table V2 Operator.
-
-This operator is used to perform lookups on the parameter W,
-then concatenated into a dense tensor.
-
-The input Ids can carry the LoD (Level of Details) information,
-or not. And the output only shares the LoD information with input Ids.
-
-)DOC");
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LookupTableV2GradOpNoBuffer, "W");
-
-class LookupTableV2GradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("lookup_table_v2_grad");
-
-    op->SetInput("W", Input("W"));
-    op->SetInput("Ids", Input("Ids"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-
-    op->SetOutput(framework::GradVarName("W"), InputGrad("W"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class LookupTableV2OpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto table_dims = ctx->GetInputDim("W");
-    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(
-        ctx.InputVar(framework::GradVarName("Out")));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class LookupTableV2OpGradVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
-    auto attr = ctx->GetAttr("is_sparse");
-    bool is_sparse = boost::get<bool>(attr);
-    if (is_sparse) {
-      VLOG(3) << "lookup_table_v2_grad op " << framework::GradVarName("W")
-              << " is set to SelectedRows";
-      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
-    } else {
-      VLOG(3) << "lookup_table_v2_grad op " << framework::GradVarName("W")
-              << " is set to LoDTensor";
-      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
-    }
-    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lookup_table_v2, ops::LookupTableV2Op,
-                  ops::LookupTableV2OpMaker, ops::LookupTableV2GradOpDescMaker);
-
-REGISTER_OPERATOR(lookup_table_v2_grad, ops::LookupTableV2OpGrad,
-                  ops::LookupTableV2GradOpNoBuffer,
-                  ops::LookupTableV2OpGradVarTypeInference);
-
-REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel<float>,
-                       ops::LookupTableV2Kernel<double>);
-REGISTER_OP_CPU_KERNEL(lookup_table_v2_grad,
-                       ops::LookupTableV2GradKernel<float>,
-                       ops::LookupTableV2GradKernel<double>);
diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu
deleted file mode 100644
index e7f580c5fdbbde74bd739a81d8a5abed80788fd2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lookup_table_v2_op.cu
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/lookup_table_v2_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
-          bool PaddingFlag>
-__global__ void LookupTableV2(T *output, const T *table, const int64_t *ids,
-                              const int64_t N, const int64_t K, const int64_t D,
-                              const int64_t padding_idx) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
-
-  while (idy < K) {
-    int64_t id = ids[idy];
-    PADDLE_ENFORCE(
-        id >= 0,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    PADDLE_ENFORCE(
-        id < N,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    T *out = output + idy * D;
-    const T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
-      if (PaddingFlag) {
-        if (id == padding_idx)
-          out[i] = static_cast<T>(0);
-        else
-          out[i] = tab[i];
-      } else {
-        out[i] = tab[i];
-      }
-    }
-    idy += BlockDimY * GridDimX;
-  }
-}
-
-template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
-__global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids,
-                                  const int64_t N, const int64_t K,
-                                  const int64_t D) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
-
-  while (idy < K) {
-    int64_t id = ids[idy];
-    PADDLE_ENFORCE(
-        id >= 0,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    PADDLE_ENFORCE(
-        id < N,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    const T *out = output + idy * D;
-    T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
-      paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
-    }
-    idy += BlockDimY * GridDimX;
-  }
-}
-
-template <typename T>
-class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *table_t = context.Input<LoDTensor>("W");
-    auto *ids_t = context.Input<LoDTensor>("Ids");
-    auto *output_t = context.Output<LoDTensor>("Out");
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-
-    auto id_name = context.Inputs("Ids").front();
-    auto out_name = context.Outputs("Out").front();
-
-    size_t N = table_t->dims()[0];
-    size_t D = table_t->dims()[1];
-    size_t K = ids_t->numel();
-
-    auto *ids = ids_t->data<int64_t>();
-    auto *table = table_t->data<T>();
-    auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-    dim3 threads(128, 8);
-    dim3 grids(8, 1);
-
-    if (padding_idx == -1)
-      LookupTableV2<
-          T, 128, 8, 8,
-          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
-    else
-      LookupTableV2<
-          T, 128, 8, 8,
-          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
-  }
-};
-
-template <typename T>
-class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto &dev_ctx =
-        context.template device_context<platform::CUDADeviceContext>();
-    bool is_sparse = context.Attr<bool>("is_sparse");
-
-    // Since paddings are not trainable and fixed in forward, the gradient of
-    // paddings makes no sense and we don't deal with it in backward.
-    if (is_sparse) {
-      auto *ids = context.Input<LoDTensor>("Ids");
-      auto *table = context.Input<LoDTensor>("W");
-      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
-
-      auto *ids_data = ids->data<int64_t>();
-      int64_t ids_num = ids->numel();
-
-      auto stream = dev_ctx.stream();
-      // copy GPU memory to CPU pinned memory
-      framework::Vector<int64_t> new_rows;
-      new_rows.resize(ids_num);
-      auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
-
-      // TODO(yuyang18): Strange code here.
-      memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
-                   gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
-      d_table->set_rows(new_rows);
-
-      auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_num, table->dims()[1]});
-      d_table_value->mutable_data<T>(context.GetPlace());
-
-      auto *d_table_data = d_table_value->data<T>();
-      auto *d_output_data = d_output->data<T>();
-      auto d_output_dims = d_output->dims();
-      PADDLE_ENFORCE_EQ(
-          d_table_value->dims(),
-          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
-      memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
-                   d_output->numel() * sizeof(T), stream);
-
-    } else {
-      auto ids_t = context.Input<LoDTensor>("Ids");
-      auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto d_table_t = context.Output<LoDTensor>(framework::GradVarName("W"));
-
-      int N = d_table_t->dims()[0];
-      int D = d_table_t->dims()[1];
-      int K = ids_t->numel();
-      const int64_t *ids = ids_t->data<int64_t>();
-      const T *d_output = d_output_t->data<T>();
-      T *d_table = d_table_t->mutable_data<T>(context.GetPlace());
-
-      auto t = framework::EigenVector<T>::Flatten(*d_table_t);
-      t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
-
-      dim3 threads(128, 8);
-      dim3 grids(8, 1);
-      LookupTableV2Grad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
-          d_table, d_output, ids, N, K, D);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(lookup_table_v2, ops::LookupTableV2CUDAKernel<float>,
-                        ops::LookupTableV2CUDAKernel<double>,
-                        ops::LookupTableV2CUDAKernel<plat::float16>);
-REGISTER_OP_CUDA_KERNEL(lookup_table_v2_grad,
-                        ops::LookupTableV2GradCUDAKernel<float>,
-                        ops::LookupTableV2GradCUDAKernel<double>,
-                        ops::LookupTableV2GradCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h
deleted file mode 100644
index 16f4d7c4171b0b160d32352deeb0fa0f460a3291..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lookup_table_v2_op.h
+++ /dev/null
@@ -1,218 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-
-constexpr int64_t kNoPadding = -1;
-
-template <typename T>
-class LookupTableV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *ids_t = context.Input<LoDTensor>("Ids");      // int tensor
-    auto *output_t = context.Output<LoDTensor>("Out");  // float tensor
-    auto *table_var = context.InputVar("W");
-
-    auto id_name = context.Inputs("Ids").front();
-    auto embedding_name = context.Inputs("W").front();
-    auto out_name = context.Outputs("Out").front();
-
-    // for remote prefetch
-    auto epmap = context.Attr<std::vector<std::string>>("epmap");
-    auto remote_prefetch = context.Attr<bool>("remote_prefetch");
-    auto height_sections =
-        context.Attr<std::vector<int64_t>>("height_sections");
-    auto table_names = context.Attr<std::vector<std::string>>("table_names");
-
-    if (remote_prefetch && !epmap.empty()) {
-// if epmap is not empty, then the parameter will be fetched from remote
-// parameter server
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-      operators::distributed::prefetch(id_name, out_name, embedding_name, false,
-                                       table_names, epmap, height_sections,
-                                       context, context.scope());
-#else
-      PADDLE_THROW(
-          "paddle is not compiled with distribute support, can not do "
-          "parameter prefetch!");
-#endif
-    } else {
-      int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-      int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
-      int64_t ids_numel = ids_t->numel();
-
-      if (table_var->IsType<LoDTensor>()) {
-        auto *table_t = context.Input<LoDTensor>("W");
-        int64_t row_number = table_t->dims()[0];
-        int64_t row_width = table_t->dims()[1];
-
-        auto *table = table_t->data<T>();
-        auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-        for (int64_t i = 0; i < ids_numel; ++i) {
-          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-            memset(output + i * row_width, 0, row_width * sizeof(T));
-          } else {
-            PADDLE_ENFORCE_LT(
-                ids[i], row_number,
-                "Variable value (input) of OP(fluid.layers.embedding) "
-                "expected >= 0 and < %ld, but got %ld. Please check input "
-                "value.",
-                row_number, ids[i]);
-            PADDLE_ENFORCE_GE(
-                ids[i], 0,
-                "Variable value (input) of OP(fluid.layers.embedding) "
-                "expected >= 0 and < %ld, but got %ld. Please check input "
-                "value.",
-                row_number, ids[i]);
-            memcpy(output + i * row_width, table + ids[i] * row_width,
-                   row_width * sizeof(T));
-          }
-        }
-      } else if (table_var->IsType<SelectedRows>()) {
-        const auto &table_t = table_var->Get<SelectedRows>();
-        int64_t row_width = table_t.value().dims()[1];
-        const auto *table = table_t.value().data<T>();
-        auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-        auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-        for (int64_t i = 0; i < ids_numel; ++i) {
-          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-            memset(output + i * row_width, 0, row_width * sizeof(T));
-          } else {
-            PADDLE_ENFORCE_GE(ids[i], 0);
-            auto id_index = table_t.Index(ids[i]);
-            PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
-            blas.VCOPY(row_width, table + id_index * row_width,
-                       output + i * row_width);
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class LookupTableV2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *table_var = context.InputVar("W");
-    DDim table_dim;
-    if (table_var->IsType<LoDTensor>()) {
-      table_dim = context.Input<LoDTensor>("W")->dims();
-    } else if (table_var->IsType<SelectedRows>()) {
-      auto *table_t = context.Input<SelectedRows>("W");
-      table_dim = table_t->value().dims();
-    } else {
-      PADDLE_THROW(
-          "The parameter W of a LookupTableV2 "
-          "must be either LoDTensor or SelectedRows");
-    }
-
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-    bool is_sparse = context.Attr<bool>("is_sparse");
-    // Since paddings are not trainable and fixed in forward, the gradient of
-    // paddings makes no sense and we don't deal with it in backward.
-    if (is_sparse) {
-      auto *ids = context.Input<LoDTensor>("Ids");
-      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
-
-      auto *ids_data = ids->data<int64_t>();
-      int64_t ids_num = ids->numel();
-
-      std::vector<int64_t> new_rows;
-      new_rows.resize(ids_num);
-      std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
-      d_table->set_rows(new_rows);
-
-      auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_num, table_dim[1]});
-
-      d_table_value->mutable_data<T>(context.GetPlace());
-
-      d_table->set_height(table_dim[0]);
-
-      auto *d_output_data = d_output->data<T>();
-      auto *d_table_data = d_table_value->data<T>();
-
-      auto d_output_dims = d_output->dims();
-      PADDLE_ENFORCE_EQ(
-          d_table_value->dims(),
-          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
-      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
-
-    } else {
-      auto *ids = context.Input<LoDTensor>("Ids");
-      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
-
-      auto *ids_data = ids->data<int64_t>();
-
-      int64_t N = table_dim[0];
-      int64_t D = table_dim[1];
-
-      auto *d_output_data = d_output->data<T>();
-      auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
-
-      memset(d_table_data, 0, d_table->numel() * sizeof(T));
-
-      for (int64_t i = 0; i < ids->numel(); ++i) {
-        if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
-          // the gradient of padding_idx should be 0, already done by memset, so
-          // do nothing.
-        } else {
-          PADDLE_ENFORCE_LT(
-              ids_data[i], N,
-              "Variable value (input) of OP(fluid.layers.embedding) "
-              "expected >= 0 and < %ld, but got %ld. Please check input value.",
-              N, ids_data[i]);
-          PADDLE_ENFORCE_GE(
-              ids_data[i], 0,
-              "Variable value (input) of OP(fluid.layers.embedding) "
-              "expected >= 0 and < %ld, but got %ld. Please check input value.",
-              N, ids_data[i]);
-          for (int j = 0; j < D; ++j) {
-            d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
deleted file mode 100644
index 5ad94cfde901bedae4af28e5b2a43bad08e28cf9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lrn_op.cc
+++ /dev/null
@@ -1,297 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lrn_op.h"
-#include <string>
-#include "paddle/fluid/operators/math/blas.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-template <typename T>
-struct LRNFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& input, framework::Tensor* out,
-                  framework::Tensor* mid, int N, int C, int H, int W, int n,
-                  T k, T alpha, T beta) {
-    const T* idata = input.data<T>();
-    auto place = ctx.GetPlace();
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-    T* odata = out->mutable_data<T>(place);
-    T* mdata = mid->mutable_data<T>(place);
-    Tensor squared;
-    T* sdata = squared.mutable_data<T>({1, C + n - 1, H, W}, place);
-    std::memset(sdata, 0, sizeof(T) * squared.numel());
-    for (int i = 0; i < mid->numel(); ++i) {
-      mdata[i] = k;
-    }
-    int img_size = H * W;
-    int fea_size = C * img_size;
-    int pre_pad = (n - 1) / 2;
-    // compute batches one by one
-    for (int i = 0; i < N; ++i) {
-      blas.VSQUARE(fea_size, idata + i * fea_size, sdata + pre_pad * img_size);
-      // init the first channel of mid
-      for (int c = 0; c < n; ++c) {
-        blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size);
-      }
-      for (int c = 1; c < C; ++c) {
-        // copy previous scale
-        int mid_offset = i * fea_size + c * img_size;
-        std::memcpy(mdata + mid_offset, mdata + mid_offset - img_size,
-                    img_size * sizeof(T));
-        // add last
-        blas.AXPY(img_size, alpha, sdata + (c + n - 1) * img_size,
-                  mdata + mid_offset);
-        // sub rest
-        blas.AXPY(img_size, -alpha, sdata + (c - 1) * img_size,
-                  mdata + mid_offset);
-      }
-    }
-    // compute the final output
-    blas.VPOW(mid->numel(), mdata, -beta, odata);
-    blas.VMUL(mid->numel(), odata, idata, odata);
-  }
-};
-template struct LRNFunctor<platform::CPUDeviceContext, float>;
-template struct LRNFunctor<platform::CPUDeviceContext, double>;
-
-template <typename T>
-struct LRNGradFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& x, const framework::Tensor& out,
-                  const framework::Tensor& mid, framework::Tensor* x_g,
-                  const framework::Tensor& out_g, int N, int C, int H, int W,
-                  int n, T alpha, T beta) {
-    T ratio = -2 * alpha * beta;
-    auto x_g_e = framework::EigenVector<T>::Flatten(*x_g);
-    x_g_e = x_g_e.constant(0.0);
-
-    auto e_x = framework::EigenTensor<T, 4>::From(x);
-    auto e_x_g = framework::EigenTensor<T, 4>::From(*x_g);
-    auto e_out = framework::EigenTensor<T, 4>::From(out);
-    auto e_out_g = framework::EigenTensor<T, 4>::From(out_g);
-    auto e_mid = framework::EigenTensor<T, 4>::From(mid);
-
-    const int start = -(n - 1) / 2;
-    const int end = start + n;
-    for (int m = 0; m < N; m++) {
-      for (int i = 0; i < C; i++) {
-        auto i_x = e_x.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                             Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_x_g = e_x_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                     Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_mid = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        i_x_g = i_mid.pow(-beta) * i_out_g;
-        for (int c = start; c < end; c++) {
-          int ch = i + c;
-          if (ch < 0 || ch >= C) {
-            continue;
-          }
-
-          auto c_out = e_out.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                   Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          auto c_mid = e_mid.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                   Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          auto c_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                       Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          i_x_g += ratio * c_out_g * c_out * i_x / c_mid;
-        }
-      }
-    }
-  }
-};
-template struct LRNGradFunctor<platform::CPUDeviceContext, float>;
-template struct LRNGradFunctor<platform::CPUDeviceContext, double>;
-
-namespace {
-framework::OpKernelType GetExpectedLRNKernel(
-    const framework::ExecutionContext& ctx) {
-  framework::LibraryType library_{framework::LibraryType::kPlain};
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-#ifdef PADDLE_WITH_MKLDNN
-  if (library_ == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kMKLDNN;
-    layout_ = framework::DataLayout::kMKLDNN;
-  }
-#endif
-
-  return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
-                                 layout_, library_);
-}
-}  // namespace
-
-class LRNOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LRNOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LRNOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MidOut"),
-                   "MidOut(Out) of LRNOp should not be null.");
-
-    auto x_dim = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4.");
-
-    int n = ctx->Attrs().Get<int>("n");
-    PADDLE_ENFORCE(n > 0 && n % 2 == 1, "n should be positive odd value");
-
-    ctx->SetOutputDim("Out", x_dim);
-    ctx->ShareLoD("X", /*->*/ "Out");
-    ctx->SetOutputDim("MidOut", x_dim);
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetExpectedLRNKernel(ctx);
-  }
-};
-
-template <typename T>
-class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) The input of LRN operator. "
-             "It must be a 4D tenor with NCHW format.");
-    AddOutput("Out",
-              "(Tensor) The output of LRN operator, which is also the 4D "
-              "tensor with NCHW format.");
-    AddOutput("MidOut",
-              "(Tensor) Middle result of LRN operator. It's computed in "
-              "forward process and also used in backward process.");
-
-    AddAttr<int>("n",
-                 "(int default 5) "
-                 "n is the \"adjacent\" kernel that maps "
-                 "at the same spatial position.")
-        .SetDefault(5)
-        .GreaterThan(0);
-
-    AddAttr<T>("k",
-               "(float, default 2.0) "
-               "k is the bias.")
-        .SetDefault(2.0)
-        .GreaterThan(0.0);
-
-    AddAttr<T>("alpha",
-               "(float, default 0.0001) "
-               "alpha is the scale number.")
-        .SetDefault(0.0001)
-        .GreaterThan(0.0);
-
-    AddAttr<T>("beta",
-               "(float, default 0.75) "
-               "beta is the power number.")
-        .SetDefault(0.75)
-        .GreaterThan(0.0);
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddAttr<std::string>(
-        "data_format",
-        "(string, default NCHW) Only used in "
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\". Specify the data format of the output data, "
-        "the input will be transformed automatically. ")
-        .SetDefault("AnyLayout");
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-
-    AddComment(R"DOC(
-Local Response Normalization Operator.
-
-This operator comes from the paper:
-<<ImageNet Classification with Deep Convolutional Neural Networks>>.
-
-The original formula is:
-
-$$
-Output(i, x, y) = Input(i, x, y) / \left(
-k + \alpha \sum\limits^{\min(C-1, i + n/2)}_{j = \max(0, i - n/2)}
-(Input(j, x, y))^2
-\right)^{\beta}
-$$
-
-Function implementation:
-
-Inputs and outpus are in NCHW format, while input.shape.ndims() equals 4.
-And dimensions 0 ~ 3 represent batch size, feature maps, rows,
-and columns, respectively.
-
-Input and Output in the formula above is for each map(i) of one image, and
-Input(i, x, y), Output(i, x, y) represents an element in an image.
-
-C is the number of feature maps of one image. n is a hyper-parameter
-configured when operator is initialized. The sum in the denominator
-is the sum of the same positions in the neighboring maps.
-
-)DOC");
-  }
-};
-
-class LRNOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-
-    auto x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetExpectedLRNKernel(ctx);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lrn, ops::LRNOp, ops::LRNOpMaker<float>,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    lrn, ops::LRNKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    lrn_grad, ops::LRNGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu
deleted file mode 100644
index 64f3fea6be24e60c6fcef3190c0df4e01953b133..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lrn_op.cu
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lrn_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C,
-                                   int H, int W, int size, T k, T alpha) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < img_size) {
-    const int w = idx % W;
-    const int h = (idx / W) % H;
-    const int n = idx / W / H;
-    const int offset = (n * C * H + h) * W + w;
-
-    in += offset;
-    mid += offset;
-    const int step = H * W;
-    const int pre_pad = (size - 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-
-    T accum = 0;
-    int index = 0;
-    while (index < C + post_pad) {
-      if (index < C) {
-        T val = in[index * step];
-        accum += val * val;
-      }
-      if (index >= size) {
-        T val = in[(index - size) * step];
-        accum -= val * val;
-      }
-      if (index >= post_pad) {
-        mid[(index - post_pad) * step] = k + accum * alpha;
-      }
-      ++index;
-    }
-  }
-}
-
-template <typename T>
-__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid,
-                                T negative_beta, T* out) {
-  const int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < input_size) {
-    out[index] = in[index] * pow(mid[index], negative_beta);
-  }
-}
-
-template <typename T>
-void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs,
-                    T* outputs, T* mid, int N, int C, int H, int W, int n, T k,
-                    T alpha, T beta) {
-  int img_size = N * H * W;
-  const int block_size = 1024;
-  int grid_size = (img_size + block_size - 1) / block_size;
-
-  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-  KeCMRNormFillScale<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-      img_size, inputs, mid, C, H, W, n, k, alpha);
-
-  int input_size = N * H * W * C;
-  grid_size = (input_size + block_size - 1) / block_size;
-  KeCMRNormOutput<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-      input_size, inputs, mid, -beta, outputs);
-}
-
-template <typename T>
-struct LRNFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& input, framework::Tensor* out,
-                  framework::Tensor* mid, int N, int C, int H, int W, int n,
-                  T k, T alpha, T beta) {
-    CrossMapNormal<T>(
-        ctx, input.data<T>(), out->mutable_data<T>(ctx.GetPlace()),
-        mid->mutable_data<T>(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta);
-  }
-};
-
-template struct LRNFunctor<platform::CUDADeviceContext, float>;
-template struct LRNFunctor<platform::CUDADeviceContext, double>;
-
-template <typename T>
-__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out,
-                              const T* mid, T* x_g, const T* out_g, int C,
-                              int H, int W, int size, T negative_beta,
-                              T ratio) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < img_size) {
-    const int w = idx % W;
-    const int h = (idx / W) % H;
-    const int n = idx / W / H;
-    const int offset = (n * C * H + h) * W + w;
-    x += offset;
-    out += offset;
-    mid += offset;
-    out_g += offset;
-    x_g += offset;
-
-    const int step = H * W;
-    const int pre_pad = size - (size + 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-
-    int index = 0;
-    T accum = 0;
-    // TODO(gongwb): optimize this with thread shared array.
-    while (index < C + post_pad) {
-      if (index < C) {
-        x_g[index * step] = 0.0;
-        accum += out_g[index * step] * out[index * step] / mid[index * step];
-      }
-      if (index >= size) {
-        accum -= out_g[(index - size) * step] * out[(index - size) * step] /
-                 mid[(index - size) * step];
-      }
-      if (index >= post_pad) {
-        x_g[(index - post_pad) * step] +=
-            out_g[(index - post_pad) * step] *
-                pow(mid[(index - post_pad) * step], negative_beta) -
-            ratio * x[(index - post_pad) * step] * accum;
-      }
-      ++index;
-    }
-  }
-}
-
-template <typename T>
-void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x,
-                        const T* out, const T* mid, T* x_g, const T* out_g,
-                        int N, int C, int H, int W, int n, T alpha, T beta) {
-  int img_size = N * H * W;
-
-  const int block_size = 1024;
-  int grid_size = (img_size + block_size - 1) / block_size;
-
-  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-  KeCMRNormDiff<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-      img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta,
-      2.0f * alpha * beta);
-}
-
-template <typename T>
-struct LRNGradFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& x, const framework::Tensor& out,
-                  const framework::Tensor& mid, framework::Tensor* x_g,
-                  const framework::Tensor& out_g, int N, int C, int H, int W,
-                  int n, T alpha, T beta) {
-    CrossMapNormalGrad<T>(ctx, x.data<T>(), out.data<T>(), mid.data<T>(),
-                          x_g->mutable_data<T>(ctx.GetPlace()), out_g.data<T>(),
-                          N, C, H, W, n, alpha, beta);
-  }
-};
-
-template struct LRNGradFunctor<platform::CUDADeviceContext, float>;
-template struct LRNGradFunctor<platform::CUDADeviceContext, double>;
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    lrn, ops::LRNKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    lrn_grad, ops::LRNGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h
deleted file mode 100644
index 12d39c3815395896343238b536110aecac66a376..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lrn_op.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename place, typename T>
-struct LRNFunctor {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& input, framework::Tensor* out,
-                  framework::Tensor* mid, int N, int C, int H, int W, int n,
-                  T k, T alpha, T beta);
-};
-
-template <typename DeviceContext, typename T>
-class LRNKernel : public framework::OpKernel<T> {
- public:
-  using Tensor = framework::Tensor;
-
-  // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta)
-  // x represents inputs
-  // f(x) represents outputs
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // input
-    const Tensor& x = *ctx.Input<Tensor>("X");
-    auto x_dims = x.dims();
-
-    // NCHW
-    int N = x_dims[0];
-    int C = x_dims[1];
-    int H = x_dims[2];
-    int W = x_dims[3];
-
-    Tensor* out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    // MidOut save the intermediate result for backward
-    Tensor* mid = ctx.Output<Tensor>("MidOut");
-    mid->mutable_data<T>(ctx.GetPlace());
-
-    int n = ctx.Attr<int>("n");
-    T alpha = ctx.Attr<float>("alpha");
-    T beta = ctx.Attr<float>("beta");
-    T k = ctx.Attr<float>("k");
-
-    PADDLE_ENFORCE(alpha >= 0.0, "alpha should >= 0.0");
-    PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0");
-    PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0");
-
-    LRNFunctor<DeviceContext, T> f;
-    f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta);
-  }
-};
-
-template <typename DeviceContext, typename T>
-struct LRNGradFunctor {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& x, const framework::Tensor& out,
-                  const framework::Tensor& mid, framework::Tensor* x_g,
-                  const framework::Tensor& out_g, int N, int C, int H, int W,
-                  int n, T alpha, T beta);
-};
-
-/**
- * \brief Backward calculation for normalization with across maps.
- *
- * Function implementation:
- *
- * The implementation of this Function is derived from the
- * CrossMapNormalFunc implementation.
- *
- * InputGrad = OutputGrad * MidOut ^ (-beta)
- *    -- upper
- *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue
- *    -- lower
- *
- * The data of inputs/outputs format is the same as the forward interface
- * and is NCHW.
- *
- * The upper and lower is the same as forward. The logic of the sum
- * is also the same as forward.
- */
-template <typename DeviceContext, typename T>
-class LRNGradKernel : public framework::OpKernel<T> {
- public:
-  using Tensor = framework::Tensor;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor& x = *ctx.Input<Tensor>("X");
-    const Tensor& out = *ctx.Input<Tensor>("Out");
-    const Tensor& out_g = *ctx.Input<Tensor>(framework::GradVarName("Out"));
-    const Tensor& mid = *ctx.Input<Tensor>("MidOut");
-
-    auto x_g = ctx.Output<Tensor>(framework::GradVarName("X"));
-    x_g->mutable_data<T>(ctx.GetPlace());
-
-    auto x_dims = x.dims();
-    int N = x_dims[0];
-    int C = x_dims[1];
-    int H = x_dims[2];
-    int W = x_dims[3];
-
-    int n = ctx.Attr<int>("n");
-    T alpha = ctx.Attr<T>("alpha");
-    T beta = ctx.Attr<T>("beta");
-
-    PADDLE_ENFORCE(
-        !ctx.Attr<bool>("is_test"),
-        "is_test attribute should be set to False in training phase.");
-
-    LRNGradFunctor<DeviceContext, T> f;
-    f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
deleted file mode 100644
index bf68c57e67fbff9216f51d805c78e49714fdb736..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lstm_op.cc
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lstm_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class LSTMOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(Weight) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                   "Input(Bias) of LSTM should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                   "Output(Hidden) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                   "Output(Cell) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
-                   "Output(BatchGate) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
-                   "Output(BatchGate) of LSTM should not be null.");
-
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2.");
-
-    if (ctx->HasInput("H0")) {
-      PADDLE_ENFORCE(ctx->HasInput("C0"),
-                     "Input(Cell) and Input(Hidden) of LSTM should not "
-                     "be null at the same time.");
-      auto h_dims = ctx->GetInputDim("H0");
-      auto c_dims = ctx->GetInputDim("C0");
-      PADDLE_ENFORCE(h_dims == c_dims,
-                     "The dimension of Input(H0) and Input(C0) "
-                     "should be the same.");
-    }
-
-    int frame_size = in_dims[1] / 4;
-    auto w_dims = ctx->GetInputDim("Weight");
-    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
-                      "The rank of Input(Weight) should be 2.");
-    PADDLE_ENFORCE_EQ(w_dims[0], frame_size,
-                      "The first dimension of Input(Weight) "
-                      "should be %d.",
-                      frame_size);
-    PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
-                      "The second dimension of Input(Weight) "
-                      "should be 4 * %d.",
-                      frame_size);
-
-    auto b_dims = ctx->GetInputDim("Bias");
-    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
-    PADDLE_ENFORCE_EQ(b_dims[0], 1,
-                      "The first dimension of Input(Bias) should be 1.");
-
-    if (ctx->Attrs().Get<bool>("use_peepholes")) {
-      PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
-                        "The second dimension of Input(Bias) should be "
-                        "7 * %d if enable peepholes connection",
-                        frame_size);
-    } else {
-      PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
-                        "The second dimension of Input(Bias) should be "
-                        "4 * %d if disable peepholes connection",
-                        frame_size);
-    }
-
-    framework::DDim out_dims({in_dims[0], frame_size});
-    ctx->SetOutputDim("Hidden", out_dims);
-    ctx->SetOutputDim("Cell", out_dims);
-    ctx->SetOutputDim("BatchGate", in_dims);
-    ctx->SetOutputDim("BatchCellPreAct", out_dims);
-    ctx->ShareLoD("Input", "Hidden");
-    ctx->ShareLoD("Input", "Cell");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
-  }
-};
-
-class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(LoDTensor) the first input is a LodTensor, which support "
-             "variable-time length input sequence. The underlying tensor in "
-             "this LoDTensor is a matrix with shape (T X 4D), where T is the "
-             "total time steps in this mini-batch, D is the hidden size.");
-    AddInput("H0",
-             "(Tensor, optional) the initial hidden state is an optional "
-             "input. This is a tensor with shape (N x D), where N is the "
-             "batch size and D is the hidden size.")
-        .AsDispensable();
-    AddInput("C0",
-             "(Tensor, optional) the initial cell state is an optional "
-             "input. This is a tensor with shape (N x D), where N is the "
-             "batch size. `H0` and `C0` can be NULL but only at the same time.")
-        .AsDispensable();
-    AddInput("Weight",
-             "(Tensor) the learnable hidden-hidden weights."
-             " - The shape is (D x 4D), where D is the hidden size. "
-             " - Weight = {W_ch, W_ih, W_fh, W_oh}");
-    AddInput("Bias",
-             "(Tensor) the learnable weights, which contains two parts: "
-             "input-hidden bias weight and peephole connections weight if "
-             "setting `use_peepholes` True. "
-             "1. `use_peepholes = False` "
-             " - The shape is (1 x 4D). "
-             " - Bias = {b_c, b_i, b_f, b_o}."
-             "2. `use_peepholes = True` "
-             " - The shape is (1 x 7D). "
-             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
-    AddOutput("Hidden",
-              "(LoDTensor) the hidden state of LSTM operator. "
-              "The shape is (T x D), and lod is the same with the `Input`.");
-    AddOutput("Cell",
-              "(LoDTensor) the cell state of LSTM operator. "
-              "The shape is (T x D), and lod is the same with the `Input`.");
-    AddOutput("BatchGate",
-              "(LoDTensor) This LoDTensor contains input gate, forget gate "
-              "and output gate after the nonlinear computation. This "
-              "LoDTensor has the same shape as the reorganized input, which "
-              "is also be called batch input. The LoD size is 2. The first "
-              "LoD is the batch offsets and the second LoD contains the "
-              "indexes, which denote the position of reorganized sequence "
-              "in the raw input.")
-        .AsIntermediate();
-    AddOutput("BatchCellPreAct",
-              "(LoDTensor) This LoDTensor is obtained in the forward and used "
-              "in the backward.")
-        .AsIntermediate();
-    AddAttr<bool>("use_peepholes",
-                  "(bool, default: True) "
-                  "whether to enable diagonal/peephole connections.")
-        .SetDefault(true);
-    AddAttr<bool>("is_reverse",
-                  "(bool, default: False) "
-                  "whether to compute reversed LSTM.")
-        .SetDefault(false);
-    AddAttr<std::string>(
-        "gate_activation",
-        "(string, default: sigmoid)"
-        "The activation for input gate, forget gate and output "
-        "gate, `sigmoid` by default.")
-        .SetDefault("sigmoid")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddAttr<std::string>("cell_activation",
-                         "(string, default: tanh)"
-                         "The activation for cell output, `tanh` by default.")
-        .SetDefault("tanh")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddAttr<std::string>("candidate_activation",
-                         "(string, default: tanh)"
-                         "The activation for candidate hidden state, "
-                         "`tanh` by default.")
-        .SetDefault("tanh")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddComment(R"DOC(
-Long-Short Term Memory (LSTM) Operator.
-
-The default implementation is diagonal/peephole connection
-(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
-
-$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$
-
-$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$
-
-$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$
-
-$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$
-
-$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
-
-$$ h_t = o_t \\odot act_h(c_t) $$
-
-- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
-  of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
-  are diagonal weight matrices for peephole connections. In our implementation,
-  we use vectors to represent these diagonal weight matrices.
-- The b terms denote bias vectors ($b_i$ is the input gate bias vector).
-- $\sigma$ is the non-line activations, such as logistic sigmoid function.
-- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-  and cell activation vectors, respectively, all of which have the same size as
-  the cell output activation vector $h$.
-- The $\odot$ is the element-wise product of the vectors.
-- $act_g$ and $act_h$ are the cell input and cell output activation functions
-  and `tanh` is usually used for them.
-- $\tilde{c_t}$ is also called candidate hidden state,
-  which is computed based on the current input and the previous hidden state.
-
-Set `use_peepholes` False to disable peephole connection. The formula
-is omitted here, please refer to the paper
-http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-
-Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
-operations on the input $x_{t}$ are NOT included in this operator.
-Users can choose to use fully-connect operator before LSTM operator.
-
-)DOC");
-  }
-};
-
-class LSTMGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
-                   "Input(Hidden) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Cell"),
-                   "Input(Cell) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(Weight) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                   "Input(Bias) of LSTM should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
-                   "Input(BatchGate) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
-                   "Input(BatchGate) of LSTM should not be null.");
-
-    auto SetOutGradDim = [&ctx](const std::string& name) {
-      auto g_name = framework::GradVarName(name);
-      if (ctx->HasOutput(g_name))
-        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
-    };
-
-    SetOutGradDim("Input");
-    SetOutGradDim("Weight");
-    SetOutGradDim("Bias");
-    SetOutGradDim("H0");
-    SetOutGradDim("C0");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
-  }
-};
-
-class LSTMGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("lstm_grad");
-    op->SetAttrMap(Attrs());
-    op->SetInput("Input", Input("Input"));
-    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-
-    if (ForwardOp().Inputs().count("H0") > 0) {
-      op->SetInput("H0", Input("H0"));
-      op->SetOutput(framework::GradVarName("H0"), InputGrad("H0"));
-    }
-
-    if (ForwardOp().Inputs().count("C0") > 0) {
-      op->SetInput("C0", Input("C0"));
-      op->SetOutput(framework::GradVarName("C0"), InputGrad("C0"));
-    }
-
-    op->SetInput("Weight", Input("Weight"));
-    op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
-
-    op->SetInput("Bias", Input("Bias"));
-    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-
-    op->SetInput("Cell", Output("Cell"));
-
-    op->SetInput("Hidden", Output("Hidden"));
-    op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden"));
-
-    op->SetInput("BatchGate", Output("BatchGate"));
-    op->SetInput("BatchCellPreAct", Output("BatchCellPreAct"));
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker,
-                  ops::LSTMGradOpDescMaker);
-REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp);
-REGISTER_OP_CPU_KERNEL(
-    lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    lstm_grad, ops::LSTMGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LSTMGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc
deleted file mode 100644
index c1cbfada41d04b52faf74dcfe76bcbb35edfa71b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lstm_op.cu.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lstm_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    lstm, ops::LSTMKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LSTMKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    lstm_grad, ops::LSTMGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LSTMGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
deleted file mode 100644
index ca998826dd0118ab4b1ecc23bed8ef882f1bcc92..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lstm_op.h
+++ /dev/null
@@ -1,379 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/operators/math/sequence2batch.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src,
-                             framework::Vector<size_t> index_lod,
-                             framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
-  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, dst, indexed_src);
-}
-
-template <typename DeviceContext, typename T>
-class LSTMKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<LoDTensor>("Input");
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* bias = ctx.Input<Tensor>("Bias");
-
-    auto* hidden_t0 = ctx.Input<Tensor>("H0");
-    auto* cell_t0 = ctx.Input<Tensor>("C0");
-
-    auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
-    batch_gate->mutable_data<T>(ctx.GetPlace());
-    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
-    hidden_out->mutable_data<T>(ctx.GetPlace());
-    auto* cell_out = ctx.Output<LoDTensor>("Cell");
-    cell_out->mutable_data<T>(ctx.GetPlace());
-
-    bool is_reverse = ctx.Attr<bool>("is_reverse");
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& device_ctx = ctx.template device_context<DeviceContext>();
-    to_batch(device_ctx, *input, batch_gate, true, is_reverse);
-
-    auto in_dims = input->dims();
-    int frame_size = static_cast<int>(in_dims[1] / 4);
-    framework::DDim dims({in_dims[0], frame_size});
-
-    if (bias) {
-      Tensor b = *bias;
-      b.Resize({bias->numel(), 1});
-      Tensor gate_bias = b.Slice(0, 4 * frame_size);
-      math::RowwiseAdd<DeviceContext, T> add_bias;
-      add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
-    }
-
-    math::LstmMetaValue<T> lstm_value;
-    if (bias && ctx.Attr<bool>("use_peepholes")) {
-      T* bias_data = const_cast<T*>(bias->data<T>());
-      // the code style in LstmMetaValue will be updated later.
-
-      lstm_value.check_ig = bias_data + 4 * frame_size;
-      lstm_value.check_fg = lstm_value.check_ig + frame_size;
-      lstm_value.check_og = lstm_value.check_fg + frame_size;
-    } else {
-      lstm_value.check_ig = nullptr;
-      lstm_value.check_fg = nullptr;
-      lstm_value.check_og = nullptr;
-    }
-    lstm_value.prev_state_value = nullptr;
-    Tensor ordered_c0;
-
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (cell_t0) {
-      // Since the batch computing for LSTM reorders the input sequence
-      // according to their length. The initialized cell state also needs
-      // to reorder.
-      ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order,
-                                         &ordered_c0, true);
-      lstm_value.prev_state_value = ordered_c0.data<T>();
-    }
-
-    // Use the local variable as here.
-    LoDTensor batch_hidden, batch_cell;
-    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
-    batch_hidden.mutable_data<T>(dims, ctx.GetPlace());
-    batch_cell.mutable_data<T>(dims, ctx.GetPlace());
-    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
-
-    auto batch_starts = batch_gate->lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto gate_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-    auto cell_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("cell_activation"));
-    auto cand_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("candidate_activation"));
-
-    auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
-    for (size_t n = 0; n < num_batch; n++) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-
-      Tensor gate_t = batch_gate->Slice(bstart, bend);
-      Tensor out_t = batch_hidden.Slice(bstart, bend);
-      Tensor cell_t = batch_cell.Slice(bstart, bend);
-      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
-
-      int cur_batch_size = bend - bstart;
-
-      if (n > 0) {
-        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-        int pre_h_end = pre_h_start + cur_batch_size;
-        auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
-        blas.MatMul(pre_hidden_t, false, *weight, false, static_cast<T>(1.0),
-                    &gate_t, static_cast<T>(1.0));
-      } else if (hidden_t0) {
-        // If n == 0 and there is no initialized hidden state, that is to say
-        // the H0 is zeros, the calculation W_h * H0 will be skiped.
-        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
-
-        // Since the batch computing for LSTM reorders the input sequence
-        // according to their length. The initialized hidden state also needs
-        // to reorder.
-        Tensor ordered_h0;
-        ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
-                                           &ordered_h0, true);
-        blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
-                    &gate_t, static_cast<T>(1.0));
-      }
-
-      lstm_value.gate_value = gate_t.data<T>();
-      lstm_value.output_value = out_t.data<T>();
-      lstm_value.state_value = cell_t.data<T>();
-      lstm_value.state_active_value = cell_pre_act_t.data<T>();
-      T cell_clip = 0.0;
-      math::LstmUnitFunctor<DeviceContext, T>::compute(
-          device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip,
-          gate_act, cell_act, cand_act);
-      lstm_value.prev_state_value = lstm_value.state_value;
-    }
-
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden.set_lod(batch_gate->lod());
-    // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(device_ctx, batch_hidden, hidden_out);
-
-    batch_cell.set_lod(batch_gate->lod());
-    // restore the output cell state in LoDTensor from the batch cell
-    to_seq(device_ctx, batch_cell, cell_out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LSTMGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<LoDTensor>("Input");
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* bias = ctx.Input<Tensor>("Bias");
-
-    auto* hidden_out = ctx.Input<LoDTensor>("Hidden");
-    auto* cell_out = ctx.Input<LoDTensor>("Cell");
-
-    auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
-    auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
-
-    auto* hidden_g = ctx.Input<LoDTensor>(framework::GradVarName("Hidden"));
-
-    auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
-    auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
-    auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    auto* h0 = ctx.Input<Tensor>("H0");
-    auto* c0 = ctx.Input<Tensor>("C0");
-
-    auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
-    auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
-
-    auto& device_ctx = ctx.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> zero;
-    if (weight_g) {
-      weight_g->mutable_data<T>(ctx.GetPlace());
-      zero(device_ctx, weight_g, static_cast<T>(0.0));
-    }
-
-    // ordered_h0/c0 is the reordered hidden/cell initialization.
-    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
-    // initialization.
-    Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (c0) {
-      ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
-                                         true);
-    }
-    if (c0 && c0_g) {
-      ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
-    }
-
-    auto in_dims = input->dims();
-    auto out_dims = hidden_g->dims();
-    int frame_size = static_cast<int>(in_dims[1] / 4);
-    PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
-
-    math::LstmMetaValue<T> lstm_value;
-    if (bias && ctx.Attr<bool>("use_peepholes")) {
-      T* bias_data = const_cast<T*>(bias->data<T>());
-      lstm_value.check_ig = bias_data + 4 * frame_size;
-      lstm_value.check_fg = lstm_value.check_ig + frame_size;
-      lstm_value.check_og = lstm_value.check_fg + frame_size;
-    } else {
-      lstm_value.check_ig = nullptr;
-      lstm_value.check_fg = nullptr;
-      lstm_value.check_og = nullptr;
-    }
-
-    math::LstmMetaGrad<T> lstm_grad;
-
-    if (bias && bias_g) {
-      bias_g->mutable_data<T>(ctx.GetPlace());
-      zero(device_ctx, bias_g, static_cast<T>(0.0));
-    }
-    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
-      T* bias_g_data = bias_g->data<T>();
-      lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
-      lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
-      lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
-    } else {
-      lstm_grad.check_ig_grad = nullptr;
-      lstm_grad.check_fg_grad = nullptr;
-      lstm_grad.check_og_grad = nullptr;
-    }
-
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-
-    auto ToBatch = [&batch_gate, &to_batch](
-        const DeviceContext& ctx, const framework::LoDTensor& src,
-        const framework::DDim& dims, framework::LoDTensor& dst) {
-      dst.mutable_data<T>(dims, ctx.GetPlace());
-      dst.set_lod(batch_gate->lod());
-      to_batch(ctx, src, &dst, false);
-    };
-
-    LoDTensor batch_hidden, batch_hidden_g, batch_cell;
-    ToBatch(device_ctx, *hidden_out, out_dims, batch_hidden);
-    ToBatch(device_ctx, *hidden_g, out_dims, batch_hidden_g);
-    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);
-
-    LoDTensor batch_cell_g, batch_gate_g;
-    batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
-    // TODO(qingqing) support the case output cell has gradient.
-    // to_batch(device_ctx, *cell_g, batch_cell_g, false);
-    zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
-    batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
-    batch_gate_g.set_lod(batch_gate->lod());
-
-    auto gate_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-    auto cell_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("cell_activation"));
-    auto cand_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("candidate_activation"));
-
-    auto batch_starts = batch_gate->lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
-    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-
-      Tensor gate = batch_gate->Slice(bstart, bend);
-      Tensor cell = batch_cell.Slice(bstart, bend);
-      Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
-      lstm_value.gate_value = gate.data<T>();
-      lstm_value.state_value = cell.data<T>();
-      lstm_value.state_active_value = cell_pre_act.data<T>();
-
-      Tensor out_g = batch_hidden_g.Slice(bstart, bend);
-      Tensor gate_g = batch_gate_g.Slice(bstart, bend);
-      Tensor cell_g = batch_cell_g.Slice(bstart, bend);
-      lstm_grad.state_grad = cell_g.data<T>();
-      lstm_grad.gate_grad = gate_g.data<T>();
-      lstm_grad.output_grad = out_g.data<T>();
-
-      if (n > 0) {
-        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
-        Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
-        Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
-        lstm_value.prev_state_value = cell_pre.data<T>();
-        lstm_grad.prev_state_grad = cell_pre_g.data<T>();
-      } else {
-        lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
-        lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
-      }
-
-      // lstm_value.output_value not used in bp, set to nullptr
-      // lstm_grad.state_active_grad not used in bp, set to nullptr
-      lstm_value.output_value = nullptr;
-      lstm_grad.state_active_grad = nullptr;
-      int cur_batch_size = bend - bstart;
-      T cell_clip = 0.0;
-      math::LstmUnitGradFunctor<DeviceContext, T>::compute(
-          device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
-          cell_clip, gate_act, cell_act, cand_act);
-
-      if (n > 0) {
-        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-        int pre_h_end = pre_h_start + cur_batch_size;
-        auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
-        blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
-                    &pre_hidden_g, static_cast<T>(1.0));
-        if (weight_g) {
-          /* backward weight */
-          auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end);
-          blas.MatMul(pre_hidden, true, gate_g, false, static_cast<T>(1.0),
-                      weight_g, static_cast<T>(1.0));
-        }
-      } else {
-        if (h0 && weight_g) {
-          ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
-                                             &ordered_h0, true);
-          blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
-                      weight_g, static_cast<T>(1.0));
-        }
-        if (h0 && h0_g) {
-          ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
-          blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
-                      &ordered_h0_g, static_cast<T>(0.0));
-        }
-      }
-    }
-
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    if (in_g) {
-      /* backward data */
-      in_g->mutable_data<T>(ctx.GetPlace());
-      to_seq(device_ctx, batch_gate_g, in_g);
-    }
-    if (bias && bias_g) {
-      /* backward bias */
-      Tensor b_g = *bias_g;
-      b_g.Resize({bias_g->numel(), 1});
-      Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
-      math::ColwiseSum<DeviceContext, T> col_sum;
-      col_sum(device_ctx, batch_gate_g, &gate_bias_g);
-    }
-
-    if (h0 && h0_g) {
-      ReorderInitState<DeviceContext, T>(device_ctx, ordered_h0_g, order, h0_g,
-                                         false);
-    }
-    if (c0 && c0_g) {
-      ReorderInitState<DeviceContext, T>(device_ctx, ordered_c0_g, order, c0_g,
-                                         false);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc
deleted file mode 100644
index 47d695475c2e240d273fe873352cf5c213e2026e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lstm_unit_op.h"
-
-namespace paddle {
-namespace operators {
-
-class LstmUnitOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("C_prev"),
-                   "Input(C_prev) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("C"),
-                   "Output(C) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("H"),
-                   "Output(H) of LSTM should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto c_prev_dims = ctx->GetInputDim("C_prev");
-
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0],
-                        "Batch size of inputs and states must be equal");
-      PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4,
-                        "Dimension of FC should equal to prev state * 4");
-    }
-
-    int b_size = c_prev_dims[0];  // batch size
-    int s_dim = c_prev_dims[1];   // state dim
-    ctx->SetOutputDim("C", {b_size, s_dim});
-    ctx->SetOutputDim("H", {b_size, s_dim});
-  }
-};
-
-class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "Lstm unit only applies non-linear activations, please make sure"
-             "that linear tranformation has already been applied to `X`. "
-             "Linear tranformation can be applied by adding a `fc` layer");
-    AddInput(
-        "C_prev",
-        "The cell state tensor of last time-step in the Lstm Unit operator.");
-    AddOutput("C", "The cell tensor of Lstm Unit operator.");
-    AddOutput("H", "The hidden state tensor of Lstm Unit operator.");
-    AddAttr<float>("forget_bias",
-                   "(float, default 0.0) "
-                   "The forget bias of Lstm Unit.")
-        .SetDefault(0.0);
-    AddComment(R"DOC(
-Lstm Unit Operator
-
-Equation:
-
-$$
-i, f, o, j = split(X) \\
-C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\
-H = C * sigm(o)
-$$
-
-)DOC");
-  }
-};
-
-class LstmUnitGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")),
-                   "Input(C@GRAD) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")),
-                   "Input(H@GRAD) should not be null");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->SetOutputDim(framework::GradVarName("C_prev"),
-                      ctx->GetInputDim("C_prev"));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp);
-REGISTER_OP_CPU_KERNEL(lstm_unit,
-                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
-                       ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(
-    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>,
-    ops::LstmUnitGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu
deleted file mode 100644
index 87451cb1271cb68e6fc5e2f969e8028fac27d0c3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/* Acknowledgement: the following code is strongly inspired by
-https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu
-*/
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/cross_entropy_op.h"
-#include "paddle/fluid/operators/lstm_unit_op.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-template <typename Dtype>
-__device__ Dtype cuda_sigmoid(const Dtype x) {
-  return Dtype(1) / (Dtype(1) + exp(-x));
-}
-
-template <typename Dtype>
-__device__ Dtype cuda_tanh(const Dtype x) {
-  return Dtype(1 - exp(-2. * x)) / (Dtype(1) + exp(-2. * x));
-}
-
-template <typename T>
-__global__ void LSTMUnitKernel(const int nthreads, const int dim,
-                               const T* C_prev, const T* X, T* C, T* H,
-                               const T forget_bias) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    const int n = index / dim;
-    const int d = index % dim;
-
-    const T* X_offset = X + 4 * dim * n;
-    const T i = cuda_sigmoid(X_offset[d]);
-    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
-    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
-    const T g = cuda_tanh(X_offset[3 * dim + d]);
-    const T c_prev = C_prev[index];
-    const T c = f * c_prev + i * g;
-    C[index] = c;
-    const T tanh_c = cuda_tanh(c);
-    H[index] = o * tanh_c;
-  }
-}
-
-template <typename T>
-__global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
-                                       const T* C_prev, const T* X, const T* C,
-                                       const T* H, const T* C_diff,
-                                       const T* H_diff, T* C_prev_diff,
-                                       T* X_diff, const T forget_bias) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    const int n = index / dim;
-    const int d = index % dim;
-    const T* X_offset = X + 4 * dim * n;
-    T* c_prev_diff = C_prev_diff + index;
-    T* X_diff_offset = X_diff + 4 * dim * n;
-    T* i_diff = X_diff_offset + d;
-    T* f_diff = X_diff_offset + 1 * dim + d;
-    T* o_diff = X_diff_offset + 2 * dim + d;
-    T* g_diff = X_diff_offset + 3 * dim + d;
-
-    const T i = cuda_sigmoid(X_offset[d]);
-    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
-    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
-    const T g = cuda_tanh(X_offset[3 * dim + d]);
-    const T c_prev = C_prev[index];
-    const T c = C[index];
-    const T tanh_c = cuda_tanh(c);
-    const T c_term_diff =
-        C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c);
-    *c_prev_diff = c_term_diff * f;
-    *i_diff = c_term_diff * g * i * (1 - i);
-    *f_diff = c_term_diff * c_prev * f * (1 - f);
-    *o_diff = H_diff[index] * tanh_c * o * (1 - o);
-    *g_diff = c_term_diff * i * (1 - g * g);
-  }
-}
-
-template <typename T>
-class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-
-    auto* x_tensor = ctx.Input<framework::Tensor>("X");
-    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
-    auto* c_tensor = ctx.Output<framework::Tensor>("C");
-    auto* h_tensor = ctx.Output<framework::Tensor>("H");
-
-    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
-
-    int b_size = c_tensor->dims()[0];
-    int D = c_tensor->dims()[1];
-
-    const T* X = x_tensor->data<T>();
-    const T* C_prev = c_prev_tensor->data<T>();
-
-    T* C = c_tensor->mutable_data<T>(ctx.GetPlace());
-    T* H = h_tensor->mutable_data<T>(ctx.GetPlace());
-
-    int block = 512;
-    int n = b_size * D;
-    int grid = (n + block - 1) / block;
-
-    LSTMUnitKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, forget_bias);
-  }
-};
-
-template <typename T>
-class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-
-    auto x_tensor = ctx.Input<Tensor>("X");
-    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
-    auto c_tensor = ctx.Input<Tensor>("C");
-    auto h_tensor = ctx.Input<Tensor>("H");
-
-    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
-    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
-
-    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto c_prev_diff_tensor =
-        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
-
-    auto* X = x_tensor->data<T>();
-    auto* C_prev = c_prev_tensor->data<T>();
-    auto* C = c_tensor->data<T>();
-    auto* H = h_tensor->data<T>();
-
-    auto* H_diff = hdiff_tensor->data<T>();
-    auto* C_diff = cdiff_tensor->data<T>();
-
-    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
-    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
-
-    int N = c_tensor->dims()[0];
-    int D = c_tensor->dims()[1];
-
-    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
-
-    int block = 512;
-    int n = N * D;
-    int grid = (n + block - 1) / block;
-
-    LSTMUnitGradientKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, C_diff,
-                                               H_diff, C_prev_diff, X_diff,
-                                               forget_bias);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>,
-                        ops::LstmUnitOpCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>,
-                        ops::LstmUnitGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h
deleted file mode 100644
index 4ead9c22934dde6e42f9ede47cc1ddf502948fc4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lstm_unit_op.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/* Acknowledgement: the following code is strongly inspired by
-https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h
-*/
-
-#pragma once
-#include "glog/logging.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-template <typename T>
-inline T sigmoid(T x) {
-  return 1. / (1. + exp(-x));
-}
-
-template <typename T>
-inline T tanh(T x) {
-  return 2. * sigmoid(2. * x) - 1.;
-}
-
-template <typename DeviceContext, typename T>
-class LstmUnitKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto* x_tensor = ctx.Input<framework::Tensor>("X");
-    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
-    auto* c_tensor = ctx.Output<framework::Tensor>("C");
-    auto* h_tensor = ctx.Output<framework::Tensor>("H");
-
-    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
-
-    int b_size = c_tensor->dims()[0];
-    int D = c_tensor->dims()[1];
-
-    T* C = c_tensor->mutable_data<T>(ctx.GetPlace());
-    T* H = h_tensor->mutable_data<T>(ctx.GetPlace());
-
-    const T* X = x_tensor->data<T>();
-    const T* C_prev = c_prev_tensor->data<T>();
-
-    for (int n = 0; n < b_size; ++n) {
-      for (int d = 0; d < D; ++d) {
-        const T i = sigmoid(X[d]);
-        const T f = sigmoid(X[1 * D + d] + forget_bias);
-        const T o = sigmoid(X[2 * D + d]);
-        const T g = tanh(X[3 * D + d]);
-        const T c_prev = C_prev[d];
-        const T c = f * c_prev + i * g;
-        C[d] = c;
-        const T tanh_c = tanh(c);
-        H[d] = o * tanh_c;
-      }
-      C_prev += D;
-      X += 4 * D;
-      C += D;
-      H += D;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LstmUnitGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto x_tensor = ctx.Input<Tensor>("X");
-    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
-    auto c_tensor = ctx.Input<Tensor>("C");
-    auto h_tensor = ctx.Input<Tensor>("H");
-
-    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
-    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
-
-    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto c_prev_diff_tensor =
-        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
-
-    auto* X = x_tensor->data<T>();
-    auto* C_prev = c_prev_tensor->data<T>();
-    auto* C = c_tensor->data<T>();
-    auto* H = h_tensor->data<T>();
-
-    auto* H_diff = hdiff_tensor->data<T>();
-    auto* C_diff = cdiff_tensor->data<T>();
-
-    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
-    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
-
-    int N = c_tensor->dims()[0];
-    int D = c_tensor->dims()[1];
-
-    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
-
-    for (int n = 0; n < N; ++n) {
-      for (int d = 0; d < D; ++d) {
-        T* c_prev_diff = C_prev_diff + d;
-        T* i_diff = X_diff + d;
-        T* f_diff = X_diff + 1 * D + d;
-        T* o_diff = X_diff + 2 * D + d;
-        T* g_diff = X_diff + 3 * D + d;
-
-        const T i = sigmoid(X[d]);
-        const T f = sigmoid(X[1 * D + d] + forget_bias);
-        const T o = sigmoid(X[2 * D + d]);
-        const T g = tanh(X[3 * D + d]);
-        const T c_prev = C_prev[d];
-        const T c = C[d];
-        const T tanh_c = tanh(c);
-        const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c);
-        *c_prev_diff = c_term_diff * f;
-        *i_diff = c_term_diff * g * i * (1 - i);
-        *f_diff = c_term_diff * c_prev * f * (1 - f);
-        *o_diff = H_diff[d] * tanh_c * o * (1 - o);
-        *g_diff = c_term_diff * i * (1 - g * g);
-      }
-      C_prev += D;
-      X += 4 * D;
-      C += D;
-      H += D;
-      C_diff += D;
-      H_diff += D;
-      X_diff += 4 * D;
-      C_prev_diff += D;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
deleted file mode 100644
index b9f42237180007eecc8b558c6939a7156dfc6e45..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lstmp_op.cc
+++ /dev/null
@@ -1,366 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lstmp_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class LSTMPOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(Weight) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
-                   "Input(ProjWeight) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                   "Input(Bias) of LSTMP operator should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Projection"),
-                   "Output(Projection) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                   "Output(Cell) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
-                   "Output(BatchGate) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
-                   "Output(BatchCellPreAct) of LSTMP operator should not be "
-                   "null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
-                   "Output(BatchHidden) of LSTMP operator should not be null.");
-
-    auto in_dims = ctx->GetInputDim("Input");
-
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
-                      "Input(X)'s rank of LSTMP operator must be 2.");
-
-    int frame_size = in_dims[1] / 4;
-    auto w_dims = ctx->GetInputDim("Weight");
-    auto proj_dims = ctx->GetInputDim("ProjWeight");
-    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
-                      "The rank of Input(Weight) should be 2.");
-    PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1],
-                      "The first dimension of Input(Weight) "
-                      "should be %d.",
-                      proj_dims[1]);
-    PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
-                      "The second dimension of Input(Weight) "
-                      "should be 4 * %d.",
-                      frame_size);
-
-    PADDLE_ENFORCE_EQ(proj_dims.size(), 2,
-                      "The rank of Input(ProjWeight) should be 2.");
-    PADDLE_ENFORCE_EQ(proj_dims[0], frame_size,
-                      "The first dimension of Input(ProjWeight) "
-                      "should be %d.",
-                      frame_size);
-
-    if (ctx->HasInput("H0")) {
-      PADDLE_ENFORCE(ctx->HasInput("C0"),
-                     "Input(C0) of LSTMP operator should not be null after "
-                     "Input(H0) provided.");
-    }
-
-    auto b_dims = ctx->GetInputDim("Bias");
-    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
-    PADDLE_ENFORCE_EQ(b_dims[0], 1,
-                      "The first dimension of Input(Bias) should be 1.");
-
-    if (ctx->Attrs().Get<bool>("use_peepholes")) {
-      PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
-                        "The second dimension of Input(Bias) should be "
-                        "7 * %d if enable peepholes connection",
-                        frame_size);
-    } else {
-      PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
-                        "The second dimension of Input(Bias) should be "
-                        "4 * %d if disable peepholes connection",
-                        frame_size);
-    }
-
-    framework::DDim out_dims({in_dims[0], frame_size});
-    framework::DDim proj_out_dims({in_dims[0], proj_dims[1]});
-    ctx->SetOutputDim("Projection", proj_out_dims);
-    ctx->SetOutputDim("Cell", out_dims);
-    ctx->SetOutputDim("BatchGate", in_dims);
-    ctx->SetOutputDim("BatchCellPreAct", out_dims);
-    ctx->SetOutputDim("BatchHidden", out_dims);
-    ctx->ShareLoD("Input", "Projection");
-    ctx->ShareLoD("Input", "Cell");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
-  }
-};
-
-class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input",
-             "(LoDTensor) the input for sequence data, which supports "
-             "variable-time length input sequence. The underlying tensor in "
-             "this LoDTensor is a matrix with shape (T X 4D), where T is the "
-             "total time steps in this mini-batch, D is the hidden size.");
-    AddInput("H0",
-             "(Tensor, optional) the initial hidden state is an optional "
-             "input. This is a tensor with shape (N x D), where N is the "
-             "batch size and D is the hidden size.")
-        .AsDispensable();
-    AddInput("C0",
-             "(Tensor, optional) the initial cell state is an optional "
-             "input. This is a tensor with shape (N x D), where N is the "
-             "batch size. `C0` should not be null if `H0` provided.")
-        .AsDispensable();
-    AddInput("Weight",
-             "(Tensor) the learnable hidden-hidden weights."
-             " - The shape is (P x 4D), where P is the projection layer size "
-             "and  D is the hidden size."
-             " - Weight = {W_cr, W_ir, W_fr, W_or}");
-    AddInput("ProjWeight",
-             "(Tensor) the learnable weight of the projection layer."
-             " - The shape is (D x P), where P is the recurrent projection "
-             "layer size and  D is the hidden size."
-             " - ProjWeight = {W_rh}");
-    AddInput("Bias",
-             "(Tensor) the learnable biases, which contains two parts: "
-             "input-hidden biases and peephole connections weights if "
-             "setting `use_peepholes` to `True`. "
-             "1. `use_peepholes = False` "
-             " - The shape is (1 x 4D). "
-             " - Bias = {b_c, b_i, b_f, b_o}."
-             "2. `use_peepholes = True` "
-             " - The shape is (1 x 7D). "
-             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
-    AddOutput("Projection",
-              "(LoDTensor) the projection of the hidden state of LSTMP "
-              "operator. The shape is (T x P), and LoD is the same with the "
-              "`Input`.");
-    AddOutput("Cell",
-              "(LoDTensor) the cell state of LSTMP operator. "
-              "The shape is (T x D), and lod is the same with the `Input`.");
-    AddOutput("BatchGate",
-              "(LoDTensor) This LoDTensor contains input gate, forget gate "
-              "and output gate after the activations. This LoDTensor has the "
-              "same shape as the reorganized input, which is also be called "
-              "batch input. The LoD size is 2. The first-level LoD is the "
-              "batch offsets and the second contains the indices, which "
-              "denotes the position of reorganized sequence in the raw input.")
-        .AsIntermediate();
-    AddOutput("BatchCellPreAct",
-              "(LoDTensor) the pre-activation cell state reorganized in batch. "
-              "This LoDTensor is obtained in the forward and used in the "
-              "backward.")
-        .AsIntermediate();
-    AddOutput("BatchHidden",
-              "(LoDTensor) the hidden state reorganized in batch. "
-              "This LoDTensor is obtained in the forward and used in the "
-              "backward.")
-        .AsIntermediate();
-    AddAttr<bool>("use_peepholes",
-                  "(bool, default: True) "
-                  "whether to enable diagonal/peephole connections.")
-        .SetDefault(true);
-    AddAttr<bool>("is_reverse",
-                  "(bool, default: False) "
-                  "whether to compute reversed LSTMP.")
-        .SetDefault(false);
-    AddAttr<float>("cell_clip",
-                   "(float, default: 0.0) "
-                   "Clip for Tensor for cell state tensor when clip value is "
-                   "greater than 0.0")
-        .SetDefault(0.0);
-    AddAttr<float>("proj_clip",
-                   "(float, default: 0.0) "
-                   "Clip for Tensor for projection tensor when clip value is "
-                   "greater than 0.0")
-        .SetDefault(0.0);
-    AddAttr<std::string>(
-        "gate_activation",
-        "(string, default: sigmoid)"
-        "The activation for input gate, forget gate and output "
-        "gate, `sigmoid` by default.")
-        .SetDefault("sigmoid")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddAttr<std::string>("cell_activation",
-                         "(string, default: tanh)"
-                         "The activation for cell output, `tanh` by default.")
-        .SetDefault("tanh")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddAttr<std::string>("candidate_activation",
-                         "(string, default: tanh)"
-                         "The activation for candidate hidden state, "
-                         "`tanh` by default.")
-        .SetDefault("tanh")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddAttr<std::string>("proj_activation",
-                         "(string, default: tanh)"
-                         "The activation for projection output, "
-                         "`tanh` by default.")
-        .SetDefault("tanh")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddComment(R"DOC(
-Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator.
-
-LSTMP has a separate projection layer after the LSTM layer, projecting the 
-original hidden state to a lower-dimensional one, which is proposed to reduce 
-the number of total parameters and furthermore computational complexity for 
-the LSTM, espeacially for the case that the size of output units is relative 
-large (https://research.google.com/pubs/archive/43905.pdf). 
-
-The formula is as follows:
-
-$$
-i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
-
-f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
-
-\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\
-
-o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\
-
-c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
-
-h_t = o_t \odot act_h(c_t) \\
-
-r_t = \overline{act_h}(W_{rh}h_t)
-$$
-
-where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
-of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
-are diagonal weight matrices for peephole connections. In our implementation,
-we use vectors to represent these diagonal weight matrices. The b terms
-denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
-is the activation, such as logistic sigmoid function, and
-$i, f, o$ and $c$ are the input gate, forget gate, output gate,
-and cell activation vectors, respectively, all of which have the same size as
-the cell output activation vector $h$. Here $h$ is usually called the hidden 
-state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also 
-called the candidate hidden state, whose computation is based on the current 
-input and previous hidden state.
-
-The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
-are the cell input and cell output activation functions and `tanh` is usually
-used for them. $\overline{act_h}$ is the activation function for the 
-projection output, usually using `identity` or same as $act_h$.
-
-Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
-operations on the input $x_{t}$ are NOT included in this operator.
-Users can choose to use fully-connected operator before LSTMP operator.
-
-)DOC");
-  }
-};
-
-class LSTMPGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* grad_op = new framework::OpDesc();
-    grad_op->SetType("lstmp_grad");
-    grad_op->SetInput("Weight", Input("Weight"));
-    grad_op->SetInput("ProjWeight", Input("ProjWeight"));
-    grad_op->SetInput("Bias", Input("Bias"));
-
-    grad_op->SetInput("Projection", Output("Projection"));
-    grad_op->SetInput("Cell", Output("Cell"));
-    grad_op->SetInput("BatchGate", Output("BatchGate"));
-    grad_op->SetInput("BatchCellPreAct", Output("BatchCellPreAct"));
-    grad_op->SetInput("BatchHidden", Output("BatchHidden"));
-    grad_op->SetInput("H0", Input("H0"));
-    grad_op->SetInput("C0", Input("C0"));
-
-    grad_op->SetInput(framework::GradVarName("Projection"),
-                      OutputGrad("Projection"));
-
-    grad_op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    grad_op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
-    grad_op->SetOutput(framework::GradVarName("ProjWeight"),
-                       InputGrad("ProjWeight"));
-    grad_op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
-    grad_op->SetOutput(framework::GradVarName("H0"), InputGrad("H0"));
-    grad_op->SetOutput(framework::GradVarName("C0"), InputGrad("C0"));
-
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-class LSTMPGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Projection"),
-                   "Input(Projection) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Cell"),
-                   "Input(Cell) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(Weight) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
-                   "Input(ProjWeight) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                   "Input(Bias) of LSTMP operator should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
-                   "Input(BatchGate) of LSTMP operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
-                   "Input(BatchGate) of LSTMP operator should not be null.");
-
-    auto SetOutGradDim = [&ctx](const std::string& name) {
-      auto g_name = framework::GradVarName(name);
-      if (ctx->HasOutput(g_name))
-        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
-    };
-
-    ctx->SetOutputDim(framework::GradVarName("Input"),
-                      ctx->GetInputDim("BatchGate"));
-    SetOutGradDim("Weight");
-    SetOutGradDim("ProjWeight");
-    SetOutGradDim("Bias");
-    SetOutGradDim("H0");
-    SetOutGradDim("C0");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("BatchGate")->type(),
-        ctx.device_context());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, ops::LSTMPGradMaker);
-REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp);
-REGISTER_OP_CPU_KERNEL(
-    lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    lstmp_grad, ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu
deleted file mode 100644
index f601b897af69a3e9f3b1d86d5873e99cca4d988f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lstmp_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lstmp_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    lstmp, ops::LSTMPKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LSTMPKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    lstmp_grad,
-    ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
deleted file mode 100644
index 36da882639a235f27b4e5a9e77bf0813ea9c0ee3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lstmp_op.h
+++ /dev/null
@@ -1,522 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-using platform::Transform;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-class _ClipFunctor {
- public:
-  explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
-  HOSTDEVICE T operator()(const T& x) const {
-    if (x < min_)
-      return min_;
-    else if (x > max_)
-      return max_;
-    else
-      return x;
-  }
-
- private:
-  T min_;
-  T max_;
-};
-
-template <typename T>
-class _ClipGradFunctor {
- public:
-  explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
-  HOSTDEVICE T operator()(const T& x, const T& y) const {
-    return (y > min_ && y < max_) ? x : 0;
-  }
-
- private:
-  T min_;
-  T max_;
-};
-
-template <typename DeviceContext, typename T>
-inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src,
-                             framework::Vector<size_t> index,
-                             framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
-  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, dst, indexed_src);
-}
-
-template <typename DeviceContext, typename T>
-class LSTMPKernel : public framework::OpKernel<T> {
- public:
-  template <typename Device, typename X, typename Y>
-  void ActCompute(const math::detail::ActivationType act_type, const Device& d,
-                  X x, Y y) const {
-    if (act_type == math::detail::ActivationType::kIdentity)
-      y.device(d) = x;
-    else if (act_type == math::detail::ActivationType::kSigmoid)
-      SigmoidFunctor<T>()(d, x, y);
-    else if (act_type == math::detail::ActivationType::kTanh)
-      TanhFunctor<T>()(d, x, y);
-    else if (act_type == math::detail::ActivationType::kReLU)
-      ReluFunctor<T>()(d, x, y);
-    else
-      PADDLE_THROW("unsupported activation type");
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<LoDTensor>("Input");
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
-    auto* bias = ctx.Input<Tensor>("Bias");
-
-    auto* hidden_t0 = ctx.Input<Tensor>("H0");
-    auto* cell_t0 = ctx.Input<Tensor>("C0");
-
-    auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
-    auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
-
-    auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
-    batch_gate->mutable_data<T>(ctx.GetPlace());
-    auto* proj_out = ctx.Output<LoDTensor>("Projection");
-    proj_out->mutable_data<T>(ctx.GetPlace());
-    auto* cell_out = ctx.Output<LoDTensor>("Cell");
-    cell_out->mutable_data<T>(ctx.GetPlace());
-
-    bool is_reverse = ctx.Attr<bool>("is_reverse");
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& device_ctx = ctx.template device_context<DeviceContext>();
-    to_batch(device_ctx, *input, batch_gate, true, is_reverse);
-
-    auto in_dims = input->dims();
-    int frame_size = static_cast<int>(in_dims[1] / 4);
-    framework::DDim dims({in_dims[0], frame_size});
-    framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
-
-    if (bias) {
-      Tensor b = *bias;
-      b.Resize({bias->numel(), 1});
-      Tensor gate_bias = b.Slice(0, 4 * frame_size);
-      math::RowwiseAdd<DeviceContext, T> add_bias;
-      add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
-    }
-
-    math::LstmMetaValue<T> lstmp_value;
-    if (bias && ctx.Attr<bool>("use_peepholes")) {
-      T* bias_data = const_cast<T*>(bias->data<T>());
-      // the code style in LstmpMetaValue will be updated later.
-
-      lstmp_value.check_ig = bias_data + 4 * frame_size;
-      lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
-      lstmp_value.check_og = lstmp_value.check_fg + frame_size;
-    } else {
-      lstmp_value.check_ig = nullptr;
-      lstmp_value.check_fg = nullptr;
-      lstmp_value.check_og = nullptr;
-    }
-    lstmp_value.prev_state_value = nullptr;
-    Tensor ordered_c0;
-    Tensor ordered_h0;
-
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (cell_t0) {
-      // Since the batch computing for LSTMP reorders the input sequence
-      // according to their length. The initialized cell state also needs
-      // to reorder.
-      ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order,
-                                         &ordered_c0, true);
-      lstmp_value.prev_state_value = ordered_c0.data<T>();
-    }
-
-    // Use the local variable as here.
-    LoDTensor batch_proj, batch_cell;
-    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
-    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
-    auto* batch_hidden = ctx.Output<LoDTensor>("BatchHidden");
-    batch_hidden->mutable_data<T>(dims, ctx.GetPlace());    // T x D
-    batch_proj.mutable_data<T>(proj_dims, ctx.GetPlace());  // T x P
-    batch_cell.mutable_data<T>(dims, ctx.GetPlace());       // T x D
-
-    auto batch_starts = batch_gate->lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto gate_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-    auto cell_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("cell_activation"));
-    auto cand_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("candidate_activation"));
-    auto proj_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("proj_activation"));
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
-    for (size_t n = 0; n < num_batch; n++) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-
-      Tensor gate_t = batch_gate->Slice(bstart, bend);
-      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-      Tensor proj_t = batch_proj.Slice(bstart, bend);
-      Tensor cell_t = batch_cell.Slice(bstart, bend);
-      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
-
-      int cur_batch_size = bend - bstart;
-
-      if (n > 0) {
-        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-        int pre_h_end = pre_h_start + cur_batch_size;
-        auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end);
-        blas.MatMul(pre_proj_t, false, *weight, false, static_cast<T>(1.0),
-                    &gate_t, static_cast<T>(1.0));
-      } else if (hidden_t0) {
-        // If n == 0 and there is no initialized hidden state, that is to say
-        // the H0 is zeros, the calculation W_h * H0 will be skiped.
-        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
-
-        // Since the batch computing for LSTMP reorders the input sequence
-        // according to their length. The initialized hidden state also needs
-        // to reorder.
-        ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
-                                           &ordered_h0, true);
-        blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
-                    &gate_t, static_cast<T>(1.0));
-      }
-
-      lstmp_value.gate_value = gate_t.data<T>();
-      lstmp_value.output_value = hidden_t.data<T>();
-      lstmp_value.state_value = cell_t.data<T>();
-      lstmp_value.state_active_value = cell_pre_act_t.data<T>();
-      math::LstmUnitFunctor<DeviceContext, T>::compute(
-          device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip,
-          gate_act, cell_act, cand_act);
-      lstmp_value.prev_state_value = lstmp_value.state_value;
-      blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
-                  &proj_t, static_cast<T>(0.0));
-      if (proj_act != math::detail::ActivationType::kIdentity) {
-        auto proj_t_dev = EigenMatrix<T>::From(proj_t);
-        ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
-      }
-      if (proj_clip && proj_clip > 0.0) {
-        T* x_data = proj_t.data<T>();
-        int64_t numel = proj_t.numel();
-        Transform<DeviceContext> trans;
-        trans(ctx.template device_context<DeviceContext>(), x_data,
-              x_data + numel, x_data,
-              _ClipFunctor<T>(-1.0 * proj_clip, proj_clip));
-      }
-    }
-
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_proj.set_lod(batch_gate->lod());
-    // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(device_ctx, batch_proj, proj_out);
-
-    batch_cell.set_lod(batch_gate->lod());
-    // restore the output cell state in LoDTensor from the batch cell
-    to_seq(device_ctx, batch_cell, cell_out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LSTMPGradKernel : public framework::OpKernel<T> {
- public:
-  template <typename Device, typename X, typename Y, typename DX, typename DY>
-  void ActGradCompute(const math::detail::ActivationType act_type,
-                      const Device& d, X x, Y y, DX dx, DY dy) const {
-    // x is dummy and won't be used even in Relu(use y instead)
-    if (act_type == math::detail::ActivationType::kIdentity)
-      dx.device(d) = dy;
-    else if (act_type == math::detail::ActivationType::kSigmoid)
-      SigmoidGradFunctor<T>()(d, x, y, dy, dx);
-    else if (act_type == math::detail::ActivationType::kTanh)
-      TanhGradFunctor<T>()(d, x, y, dy, dx);
-    else if (act_type == math::detail::ActivationType::kReLU)
-      ReluGradFunctor<T>()(d, x, y, dy, dx);
-    else
-      PADDLE_THROW("unsupported activation type");
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
-    auto* bias = ctx.Input<Tensor>("Bias");
-
-    auto* proj_out = ctx.Input<LoDTensor>("Projection");
-    auto* cell_out = ctx.Input<LoDTensor>("Cell");
-
-    auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
-    auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
-
-    auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
-    auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
-    auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
-
-    auto* projection_g =
-        ctx.Input<LoDTensor>(framework::GradVarName("Projection"));
-
-    auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
-    auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
-    auto* proj_weight_g =
-        ctx.Output<Tensor>(framework::GradVarName("ProjWeight"));
-    auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    auto* h0 = ctx.Input<Tensor>("H0");
-    auto* c0 = ctx.Input<Tensor>("C0");
-
-    auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
-    auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
-
-    auto& device_ctx = ctx.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> zero;
-    if (weight_g) {
-      weight_g->mutable_data<T>(ctx.GetPlace());
-      zero(device_ctx, weight_g, static_cast<T>(0.0));
-    }
-    if (proj_weight_g) {
-      proj_weight_g->mutable_data<T>(ctx.GetPlace());
-      zero(device_ctx, proj_weight_g, static_cast<T>(0.0));
-    }
-
-    // ordered_h0/c0 is the reordered hidden/cell initialization.
-    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
-    // initialization.
-    Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (c0) {
-      ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
-                                         true);
-    }
-    if (c0 && c0_g) {
-      ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
-    }
-
-    // batch_gate dims equal to input dims
-    auto in_dims = batch_gate->dims();
-    auto out_dims = cell_out->dims();
-    framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
-    int frame_size = static_cast<int>(in_dims[1] / 4);
-    PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
-
-    math::LstmMetaValue<T> lstmp_value;
-    if (bias && ctx.Attr<bool>("use_peepholes")) {
-      T* bias_data = const_cast<T*>(bias->data<T>());
-      lstmp_value.check_ig = bias_data + 4 * frame_size;
-      lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
-      lstmp_value.check_og = lstmp_value.check_fg + frame_size;
-    } else {
-      lstmp_value.check_ig = nullptr;
-      lstmp_value.check_fg = nullptr;
-      lstmp_value.check_og = nullptr;
-    }
-
-    math::LstmMetaGrad<T> lstmp_grad;
-
-    if (bias && bias_g) {
-      bias_g->mutable_data<T>(ctx.GetPlace());
-      zero(device_ctx, bias_g, static_cast<T>(0.0));
-    }
-    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
-      T* bias_g_data = bias_g->data<T>();
-      lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size;
-      lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size;
-      lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size;
-    } else {
-      lstmp_grad.check_ig_grad = nullptr;
-      lstmp_grad.check_fg_grad = nullptr;
-      lstmp_grad.check_og_grad = nullptr;
-    }
-
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-
-    auto ToBatch = [&batch_gate, &to_batch](
-        const DeviceContext& ctx, const framework::LoDTensor& src,
-        const framework::DDim& dims, framework::LoDTensor& dst) {
-      dst.mutable_data<T>(dims, ctx.GetPlace());
-      dst.set_lod(batch_gate->lod());
-      to_batch(ctx, src, &dst, false);
-    };
-
-    LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
-    batch_hidden_g.mutable_data<T>(out_dims, ctx.GetPlace());
-    ToBatch(device_ctx, *proj_out, proj_dims, batch_proj);        // T x P
-    ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g);  // T x P
-    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);         // T x D
-
-    LoDTensor batch_cell_g, batch_gate_g;
-    batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
-    // TODO(qingqing) support the case output cell has gradient.
-    // to_batch(device_ctx, *cell_g, batch_cell_g, false);
-    zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
-    batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
-    batch_gate_g.set_lod(batch_gate->lod());
-
-    auto gate_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-    auto cell_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("cell_activation"));
-    auto cand_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("candidate_activation"));
-    auto proj_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("proj_activation"));
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    auto batch_starts = batch_gate->lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
-    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-
-      Tensor cur_proj = batch_proj.Slice(bstart, bend);
-      Tensor proj_g = batch_proj_g.Slice(bstart, bend);
-
-      if (proj_clip && proj_clip > 0.0) {
-        T* dx_data = proj_g.data<T>();
-        T* x_data = cur_proj.data<T>();
-        int64_t numel = proj_g.numel();
-        Transform<DeviceContext> trans;
-        trans(ctx.template device_context<DeviceContext>(), dx_data,
-              dx_data + numel, x_data, dx_data,
-              _ClipGradFunctor<T>(-1.0 * proj_clip, proj_clip));
-      }
-
-      if (proj_act != math::detail::ActivationType::kIdentity) {
-        auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
-        auto proj_g_dev = EigenMatrix<T>::From(proj_g);
-        ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev,
-                       proj_g_dev);
-      }
-      /* hidden state backwarad */
-      Tensor out_g = batch_hidden_g.Slice(bstart, bend);
-      blas.MatMul(proj_g, false, *proj_weight, true, static_cast<T>(1.0),
-                  &out_g, static_cast<T>(0.0));
-      /* projection weight backward*/
-      if (proj_weight_g) {
-        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        blas.MatMul(hidden_t, true, proj_g, false, static_cast<T>(1.0),
-                    proj_weight_g, static_cast<T>(1.0));
-      }
-
-      Tensor gate = batch_gate->Slice(bstart, bend);
-      Tensor cell = batch_cell.Slice(bstart, bend);
-      Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
-      lstmp_value.gate_value = gate.data<T>();
-      lstmp_value.state_value = cell.data<T>();
-      lstmp_value.state_active_value = cell_pre_act.data<T>();
-
-      Tensor gate_g = batch_gate_g.Slice(bstart, bend);
-      Tensor cell_g = batch_cell_g.Slice(bstart, bend);
-      lstmp_grad.state_grad = cell_g.data<T>();
-      lstmp_grad.gate_grad = gate_g.data<T>();
-      lstmp_grad.output_grad = out_g.data<T>();
-
-      if (n > 0) {
-        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
-        Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
-        Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
-        lstmp_value.prev_state_value = cell_pre.data<T>();
-        lstmp_grad.prev_state_grad = cell_pre_g.data<T>();
-      } else {
-        lstmp_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
-        lstmp_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
-      }
-
-      int cur_batch_size = bend - bstart;
-      // lstmp_value.output_value not used in bp, set to null
-      // lstmp_grad.state_active_grad not used in bp, set to null
-      lstmp_value.output_value = nullptr;
-      lstmp_grad.state_active_grad = nullptr;
-
-      math::LstmUnitGradFunctor<DeviceContext, T>::compute(
-          device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
-          cell_clip, gate_act, cell_act, cand_act);
-
-      if (n > 0) {
-        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-        int pre_h_end = pre_h_start + cur_batch_size;
-        auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end);
-        blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
-                    &pre_proj_g, static_cast<T>(1.0));
-        if (weight_g) {
-          /* weight backward*/
-          auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end);
-          blas.MatMul(pre_proj, true, gate_g, false, static_cast<T>(1.0),
-                      weight_g, static_cast<T>(1.0));
-        }
-      } else {
-        if (h0 && weight_g) {
-          ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
-                                             &ordered_h0, true);
-          if (weight_g) {
-            blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
-                        weight_g, static_cast<T>(1.0));
-          }
-        }
-        if (h0 && (h0_g || proj_weight_g)) {
-          ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
-          blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
-                      &ordered_h0_g, static_cast<T>(0.0));
-        }
-      }
-    }
-
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    if (in_g) {
-      /* backward data */
-      in_g->mutable_data<T>(ctx.GetPlace());
-      to_seq(device_ctx, batch_gate_g, in_g);
-    }
-    if (bias && bias_g) {
-      /* backward bias */
-      Tensor b_g = *bias_g;
-      b_g.Resize({bias_g->numel(), 1});
-      Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
-      math::ColwiseSum<DeviceContext, T> col_sum;
-      col_sum(device_ctx, batch_gate_g, &gate_bias_g);
-    }
-
-    if (h0 && h0_g) {
-      ReorderInitState<DeviceContext, T>(device_ctx, ordered_h0_g, order, h0_g,
-                                         false);
-    }
-    if (c0 && c0_g) {
-      ReorderInitState<DeviceContext, T>(device_ctx, ordered_c0_g, order, c0_g,
-                                         false);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
deleted file mode 100644
index fca3532551730a39bda7cfad60151de97ef881de..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/margin_rank_loss_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class MarginRankLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    // input check
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null.");
-    auto label_dims = ctx->GetInputDim("Label");
-    auto x1_dims = ctx->GetInputDim("X1");
-    auto x2_dims = ctx->GetInputDim("X2");
-    PADDLE_ENFORCE(
-        (label_dims == x1_dims) && (x1_dims == x2_dims) &&
-            (label_dims.size() == 2) && (label_dims[1] == 1),
-        "All inputs must be 2-D tensor with shape [batch_size x 1].");
-    ctx->SetOutputDim("Activated", label_dims);
-    ctx->SetOutputDim("Out", label_dims);
-  }
-};
-
-template <typename T>
-class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X1",
-             "(2-D tensor with shape [batch_size x 1]) The score for "
-             "one item X1 to be ranked, from pairwise ranking model.");
-    AddInput("X2",
-             "(2-D tensor with shape [batch_size x 1]) The score for "
-             "another item X2 to be ranked, from pairwise ranking model.");
-    AddInput("Label",
-             "(2-D tensor with shape [batch_size x 1]) "
-             "The label indicating X1 ranked higher than X2 or not, "
-             "can only be +1 or -1.");
-    AddOutput("Activated",
-              "(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
-              "to indicate whether each element of Output(Out) is activated.")
-        .AsIntermediate();
-    AddOutput("Out",
-              "(2-D tensor with shape [batch_size x 1]) "
-              "The output loss of MarginRankLoss operator.");
-    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
-        .SetDefault(static_cast<T>(0));
-    AddComment(R"DOC(
-MarginRankLoss Operator.
-
-This operator measures the loss given a pair of training sample
-{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` 
-indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss 
-is calculated as:
-
-$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$
-
-The attribute `margin` here helps make the predictions more robust.
-Denote the item ranked higher as the positive sample, otherwise the negative 
-sample. If the score of the two samples satisfies 
-
-$positive sample - negative sample < margin$
-
-the pair of samples will contribute to the final loss, which will backpropagate 
-and train the ranking model to enlarge the difference between the two scores.
-
-For batch input with size `batch_size`, `X1`, `X2` and `Label`
-all have the same shape [batch_size x 1].
-
-)DOC");
-  }
-};
-
-class MarginRankLossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Activated"),
-                   "Intermediate(Activated) shouldn't be null.");
-    auto dims = ctx->GetInputDim("Label");
-    ctx->SetOutputDim(framework::GradVarName("X1"), dims);
-    ctx->SetOutputDim(framework::GradVarName("X2"), dims);
-  }
-};
-
-class MarginRankLossGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("margin_rank_loss_grad");
-    op->SetInput("Activated", Output("Activated"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("Label", Input("Label"));
-    op->SetOutput(framework::GradVarName("X1"), InputGrad("X1"));
-    op->SetOutput(framework::GradVarName("X2"), InputGrad("X2"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp,
-                  ops::MarginRankLossOpMaker<float>,
-                  ops::MarginRankLossGradDescMaker);
-REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp);
-REGISTER_OP_CPU_KERNEL(
-    margin_rank_loss,
-    ops::MarginRankLossKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    margin_rank_loss_grad,
-    ops::MarginRankLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu
deleted file mode 100644
index d7e77e923029ea65bcf7bdb40f98b693107ba3a9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/margin_rank_loss_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/margin_rank_loss_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    margin_rank_loss,
-    ops::MarginRankLossKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    margin_rank_loss_grad,
-    ops::MarginRankLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/margin_rank_loss_op.h b/paddle/fluid/operators/margin_rank_loss_op.h
deleted file mode 100644
index c1bf44510766b03935c2290916d21979b4aab7fd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/margin_rank_loss_op.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct ReLU {
-  HOSTDEVICE T operator()(const T& val) const {
-    return val > 0 ? val : static_cast<T>(0);
-  }
-};
-
-template <typename T>
-struct Heaviside {
-  HOSTDEVICE T operator()(const T& val) const {
-    return static_cast<T>(val > 0 ? 1 : 0);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MarginRankLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out_t = ctx.Output<framework::Tensor>("Out");
-    auto* act_t = ctx.Output<framework::Tensor>("Activated");
-
-    auto* label_t = ctx.Input<framework::Tensor>("Label");
-    auto* x1_t = ctx.Input<framework::Tensor>("X1");
-    auto* x2_t = ctx.Input<framework::Tensor>("X2");
-
-    out_t->mutable_data<T>(ctx.GetPlace());
-    act_t->mutable_data<T>(ctx.GetPlace());
-
-    auto margin = static_cast<T>(ctx.Attr<T>("margin"));
-    auto out = framework::EigenVector<T>::Flatten(*out_t);
-    auto act = framework::EigenVector<T>::Flatten(*act_t);
-
-    auto label = framework::EigenVector<T>::Flatten(*label_t);
-    auto x1 = framework::EigenVector<T>::Flatten(*x1_t);
-    auto x2 = framework::EigenVector<T>::Flatten(*x2_t);
-
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU<T>());
-    act.device(dev) = out.unaryExpr(Heaviside<T>());
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MarginRankLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_x1_t =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X1"));
-    auto* d_x2_t =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X2"));
-
-    auto* act_t = ctx.Input<framework::Tensor>("Activated");
-    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* label_t = ctx.Input<framework::Tensor>("Label");
-
-    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
-    auto act = framework::EigenVector<T>::Flatten(*act_t);
-    auto label = framework::EigenVector<T>::Flatten(*label_t);
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    // compute d_x1
-    if (d_x1_t) {
-      d_x1_t->mutable_data<T>(ctx.GetPlace());
-      auto d_x1 = framework::EigenVector<T>::Flatten(*d_x1_t);
-      d_x1.device(dev) = -d_out * act * label;
-    }
-    // compute d_x2
-    if (d_x2_t) {
-      d_x2_t->mutable_data<T>(ctx.GetPlace());
-      auto d_x2 = framework::EigenVector<T>::Flatten(*d_x2_t);
-      d_x2.device(dev) = d_out * act * label;
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc
deleted file mode 100644
index e9a645d2e0b7b6ac2f4b204f5150161cda9d7d39..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/match_matrix_tensor_op.cc
+++ /dev/null
@@ -1,334 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <vector>
-
-#include "paddle/fluid/operators/match_matrix_tensor_op.h"
-#include "paddle/fluid/operators/search_compute.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    "X(Input) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true,
-                    "Y(Input) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
-                    "W(Input) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                    "Out(Output) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("Tmp"), true,
-                    "Tmp(Output) of MatchMatrix should not be null.");
-
-  auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2,
-                    "The rank of Input(X) can't be less than 2.");
-
-  auto y_dims = ctx->GetInputDim("Y");
-  PADDLE_ENFORCE_EQ(y_dims.size(), 2,
-                    "The rank of Input(Y) can't be less than 2.");
-
-  auto w_dims = ctx->GetInputDim("W");
-  PADDLE_ENFORCE_EQ(w_dims.size(), 3UL, "W should be 3-D tensor");
-
-  int dim_t = ctx->Attrs().Get<int>("dim_t");
-  PADDLE_ENFORCE_EQ(w_dims[0], x_dims[1],
-                    "W 's shape must satisfy: W[0] = X[1]");
-  PADDLE_ENFORCE_EQ(w_dims[1], dim_t, "W 's shape must satisfy: W[1] = dim_t");
-  PADDLE_ENFORCE_EQ(w_dims[2], y_dims[1],
-                    "W 's shape must satisfy: W[2] = Y[1]");
-
-  int out_dim_0 = -1;
-  int tmp_dim_0 = -1;
-  if (ctx->IsRuntime()) {
-    framework::Variable* x_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
-    const auto& x_lod = x_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE_EQ(x_lod.empty(), false, "The Input(X) must hold lod info.");
-    const auto& x_lod_0 = x_lod[0];
-    PADDLE_ENFORCE_GE(x_lod_0.size(), 2,
-                      "The Input(X)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[0], static_cast<int64_t>(x_lod_0.back()),
-        "The Input(X)'s lod info mismatches the actual tensor shape.");
-
-    framework::Variable* y_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
-    const auto& y_lod = y_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE_EQ(y_lod.empty(), false, "The Input(Y) must hold lod info.");
-    const auto& y_lod_0 = y_lod[0];
-    PADDLE_ENFORCE_GE(y_lod_0.size(), 2,
-                      "The Input(Y)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        y_dims[0], static_cast<int64_t>(y_lod_0.back()),
-        "The Input(Y)'s lod info mismatches the actual tensor shape.");
-
-    PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(),
-                      "The Length of X and Y must be equal.");
-
-    out_dim_0 = 0;
-    for (size_t i = 1; i < x_lod_0.size(); i++) {
-      int x_len = x_lod_0[i] - x_lod_0[i - 1];
-      int y_len = y_lod_0[i] - y_lod_0[i - 1];
-      out_dim_0 += (x_len * y_len);
-    }
-    out_dim_0 *= dim_t;
-
-    tmp_dim_0 = x_dims[0] * dim_t * x_dims[1];
-  } else {
-    // compile time
-    framework::VarDesc* x_desc =
-        boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]);
-    PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1);
-    framework::VarDesc* y_desc =
-        boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Y")[0]);
-    PADDLE_ENFORCE_GE(y_desc->GetLoDLevel(), 1);
-  }
-
-  std::vector<int64_t> out_dims_vec{out_dim_0};
-  out_dims_vec.push_back(1);
-  std::vector<int64_t> tmp_dims_vec{tmp_dim_0};
-  tmp_dims_vec.push_back(1);
-  ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
-  ctx->SetOutputDim("Tmp", framework::make_ddim(tmp_dims_vec));
-}
-
-void MatchMatrixTensorOpGrad::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    "Input(X) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true,
-                    "Input(Y) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
-                    "Input(W) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                    "Input(Out@GRAD) of SequencePadGradOp should not be null.");
-
-  if (ctx->HasOutput(framework::GradVarName("X"))) {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-  if (ctx->HasOutput(framework::GradVarName("Y"))) {
-    ctx->SetOutputDim(framework::GradVarName("Y"), ctx->GetInputDim("Y"));
-    ctx->ShareLoD("Y", /*->*/ framework::GradVarName("Y"));
-  }
-  if (ctx->HasOutput(framework::GradVarName("W"))) {
-    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
-  }
-}
-
-void MatchMatrixTensorOpMaker::Make() {
-  AddInput("X",
-           "X (LoDTensor, default LoDTensor<float>) Input variable which "
-           "should contain lod information.");
-  AddInput("Y",
-           "Y (LoDTensor, default LoDTensor<float>) Input variable which "
-           "should contain lod information.");
-  AddInput("W", "W (Tensor), The weight of X and Y.");
-  AddAttr<int>("dim_t", "the dim of W").SetDefault(1);
-  AddOutput("Out",
-            "(LoDTensor, default LoDTensor<float>) Output variable which "
-            "is X * W * Y");
-  AddOutput("Tmp",
-            "(LoDTensor, default LoDTensor<float>) tmp variable which is "
-            "used for X * W");
-  AddComment(R"DOC(
-      Match Matrix Tensor Operator
-
-      This operator calculate X * W * Y, only support 2-D for X and Y.
-      the output is a level-1 LodTensor: 
-        level_0: dim_t
-      
-      NOTE: only support 'float32' data type now.
-
-    )DOC");
-}
-
-template <typename DeviceContext, typename T>
-class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* y = ctx.Input<LoDTensor>("Y");
-    auto* w = ctx.Input<Tensor>("W");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    auto* tmp = ctx.Output<LoDTensor>("Tmp");
-
-    int dim_t = ctx.Attr<int>("dim_t");
-    int dim_in = x->dims()[1];
-
-    const auto& offset_l = x->lod()[0];
-    const auto& offset_r = y->lod()[0];
-
-    std::vector<size_t> top_offset;
-    int top_size = 0;
-    top_offset.push_back(top_size);
-    for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
-      int len_l = offset_l[b + 1] - offset_l[b];
-      int len_r = offset_r[b + 1] - offset_r[b];
-      top_size += dim_t * len_l * len_r;
-      top_offset.push_back(top_size);
-    }
-    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
-    memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T));
-
-    auto* bottom_l_data = x->data<T>();
-    auto* bottom_r_data = y->data<T>();
-    auto* t_data = w->data<T>();
-    auto* bottom_l_trans_data = tmp->mutable_data<T>(ctx.GetPlace());
-    memset(bottom_l_trans_data, 0.0,
-           tmp->dims()[0] * tmp->dims()[1] * sizeof(T));
-
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-
-    call_gemm(blas, CblasNoTrans, CblasNoTrans, x->dims()[0], dim_t * dim_in,
-              dim_in, 1.0f, bottom_l_data, t_data, 0.0f, bottom_l_trans_data);
-
-    for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
-      for (int t = 0; t < dim_t; t++) {
-        int len_l = offset_l[b + 1] - offset_l[b];
-        int len_r = offset_r[b + 1] - offset_r[b];
-        auto* top_data = out_data + top_offset[b] + t * len_l * len_r;
-        const auto* l_t_data =
-            bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in;
-        const auto* r_data = bottom_r_data + offset_r[b] * dim_in;
-        auto blas_2 = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-        call_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, len_l, len_r,
-                           dim_in, 1.0f, l_t_data, r_data, 0.0f, top_data,
-                           dim_t * dim_in);
-      }
-    }
-
-    framework::LoD out_lod;
-    out_lod.push_back(top_offset);
-
-    out->set_lod(out_lod);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* y = ctx.Input<LoDTensor>("Y");
-    auto* w = ctx.Input<Tensor>("W");
-    auto* tmp = ctx.Input<LoDTensor>("Tmp");
-
-    int dim_t = ctx.Attr<int>("dim_t");
-    int dim_in = x->dims()[1];
-
-    const auto& offset_l = x->lod()[0];
-    const auto& offset_r = y->lod()[0];
-    std::vector<int> top_offset;
-    int top_size = 0;
-    top_offset.push_back(top_size);
-    for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
-      int len_l = offset_l[b + 1] - offset_l[b];
-      int len_r = offset_r[b + 1] - offset_r[b];
-      top_size += dim_t * len_l * len_r;
-      top_offset.push_back(top_size);
-    }
-
-    auto* bottom_l_data = x->data<T>();
-    auto* bottom_r_data = y->data<T>();
-    auto* bottom_l_trans_data = tmp->data<T>();
-
-    auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* d_y = ctx.Output<LoDTensor>(framework::GradVarName("Y"));
-
-    Tensor tmp_grad;
-    tmp_grad.Resize(tmp->dims());
-    auto* d_tmp_data = tmp_grad.mutable_data<T>(ctx.GetPlace());
-    auto* top_diff = d_out->data<T>();
-    auto* bottom_l_diff = d_x->mutable_data<T>(ctx.GetPlace());
-    auto* bottom_r_diff = d_y->mutable_data<T>(ctx.GetPlace());
-    auto* bottom_l_trans_diff = const_cast<T*>(d_tmp_data);
-    memset(bottom_l_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T));
-    memset(bottom_r_diff, 0.0, y->dims()[0] * y->dims()[1] * sizeof(T));
-    memset(bottom_l_trans_diff, 0.0,
-           tmp->dims()[0] * tmp->dims()[1] * sizeof(T));
-
-    for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
-      for (int t = 0; t < dim_t; t++) {
-        int len_l = offset_l[b + 1] - offset_l[b];
-        int len_r = offset_r[b + 1] - offset_r[b];
-
-        for (int i = 0; i < len_l; i++) {
-          for (int j = 0; j < len_r; j++) {
-            auto diff =
-                top_diff[top_offset[b] + t * len_l * len_r + i * len_r + j];
-            auto* l_trans_data = bottom_l_trans_data +
-                                 (offset_l[b] + i) * dim_in * dim_t +
-                                 t * dim_in;
-            auto* l_trans_diff = bottom_l_trans_diff +
-                                 (offset_l[b] + i) * dim_in * dim_t +
-                                 t * dim_in;
-            auto* r_data = bottom_r_data + (offset_r[b] + j) * dim_in;
-            auto* r_diff = bottom_r_diff + (offset_r[b] + j) * dim_in;
-            if (diff != 0.0) {
-              sse_axpy(r_data, l_trans_diff, dim_in, diff);
-              sse_axpy(l_trans_data, r_diff, dim_in, diff);
-            }
-          }
-        }
-      }
-    }
-
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-
-    auto* t_data = w->data<T>();
-    auto* d_w = ctx.Output<Tensor>(framework::GradVarName("W"));
-    auto* t_diff = d_w->mutable_data<T>(ctx.GetPlace());
-    memset(t_diff, 0.0, w->dims()[0] * w->dims()[1] * w->dims()[2] * sizeof(T));
-    // bottom_diff
-    call_gemm(blas, CblasNoTrans, CblasTrans, x->dims()[0], dim_in,
-              dim_t * dim_in, 1.0f, bottom_l_trans_diff, t_data, 1.0f,
-              bottom_l_diff);
-
-    // t_diff
-    call_gemm(blas, CblasTrans, CblasNoTrans, dim_in, dim_t * dim_in,
-              x->dims()[0], 1.0f, bottom_l_data, bottom_l_trans_diff, 1.0f,
-              t_diff);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(match_matrix_tensor, ops::MatchMatrixTensorOP,
-                  ops::MatchMatrixTensorOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(match_matrix_tensor_grad, ops::MatchMatrixTensorOpGrad);
-
-REGISTER_OP_CPU_KERNEL(match_matrix_tensor,
-                       ops::CPUMatchMatrixTensorOPKernel<
-                           paddle::platform::CPUDeviceContext, float>);
-//     ops::CPUMatchMatrixTensorOPKernel<paddle::platform::CPUDeviceContext,
-//                                       double>
-
-REGISTER_OP_CPU_KERNEL(match_matrix_tensor_grad,
-                       ops::CPUMatchMatrixTensorOPGradKernel<
-                           paddle::platform::CPUDeviceContext, float>);
-//     ops::CPUMatchMatrixTensorOPGradKernel<paddle::platform::CPUDeviceContext,
-//                                           double>
diff --git a/paddle/fluid/operators/match_matrix_tensor_op.h b/paddle/fluid/operators/match_matrix_tensor_op.h
deleted file mode 100644
index b067d1c028bd3efed1d32e04579744c529c15424..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/match_matrix_tensor_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-class MatchMatrixTensorOP : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class MatchMatrixTensorOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class MatchMatrixTensorOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h
deleted file mode 100644
index 3b28928a52892db865523c71ea72b234bd1a5edc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-#include "math.h"  // NOLINT
-
-namespace paddle {
-namespace operators {
-
-inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) {
-  return static_cast<platform::float16>(::expf(static_cast<float>(x)));
-}
-
-inline HOSTDEVICE float real_exp(float x) { return ::expf(x); }
-
-inline HOSTDEVICE double real_exp(double x) { return ::exp(x); }
-
-inline HOSTDEVICE platform::float16 real_log(platform::float16 x) {
-  return static_cast<platform::float16>(::logf(static_cast<float>(x)));
-}
-
-inline HOSTDEVICE float real_log(float x) { return ::logf(x); }
-
-inline HOSTDEVICE double real_log(double x) { return ::log(x); }
-
-inline HOSTDEVICE float real_min(float x, float y) { return ::fminf(x, y); }
-
-inline HOSTDEVICE double real_min(double x, double y) { return ::fmin(x, y); }
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
deleted file mode 100644
index ca0c92b4fbee1f0275dd2a02a5b7bbfcd496b9ae..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ /dev/null
@@ -1,80 +0,0 @@
-add_subdirectory(detail)
-
-function(math_library TARGET)
-    # math_library is a function to create math library.
-    # The interface is the same as cc_library.
-    # But it handle split GPU/CPU code and link some common library.
-    set(cc_srcs)
-    set(cu_srcs)
-    set(hip_srcs)
-    set(math_common_deps device_context framework_proto enforce)
-    set(multiValueArgs DEPS)
-    cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
-
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
-        list(APPEND cc_srcs ${TARGET}.cc)
-    endif()
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
-        list(APPEND cu_srcs ${TARGET}.cu)
-    endif()
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
-        list(APPEND hip_srcs ${TARGET}.hip.cu)
-    endif()
-
-    list(LENGTH cc_srcs cc_srcs_len)
-    if (WITH_GPU)
-        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-    elseif (WITH_AMD_GPU)
-        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-    elseif(${cc_srcs_len} GREATER 0)
-        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-    endif()
-endfunction()
-
-# please add new math_library in alphabetical order
-math_library(concat_and_split)
-math_library(context_project DEPS im2col math_function)
-math_library(cross_entropy)
-math_library(cos_sim_functor)
-math_library(depthwise_conv DEPS cub)
-math_library(im2col)
-math_library(sample_prob)
-math_library(sampler)
-
-math_library(gru_compute DEPS activation_functions math_function)
-math_library(lstm_compute DEPS activation_functions)
-
-cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
-math_library(math_function DEPS blas)
-math_library(maxouting)
-math_library(pooling)
-math_library(selected_rows_functor DEPS selected_rows math_function blas)
-math_library(sequence2batch)
-math_library(sequence_padding)
-math_library(sequence_pooling DEPS math_function jit_kernel_helper)
-math_library(sequence_scale)
-math_library(softmax DEPS math_function jit_kernel_helper)
-math_library(beam_search DEPS math_function)
-math_library(fc DEPS blas)
-
-math_library(matrix_bit_code)
-
-math_library(unpooling)
-math_library(vol2col)
-math_library(prelu)
-math_library(tree2col DEPS math_function)
-
-cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
-cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
-cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
-cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
-cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
-cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling)
-cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search)
-if(WITH_GPU)
-    nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
-    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
-endif()
-cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
-cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h
deleted file mode 100644
index 2e75b6abce5e1f43742ee15bff1dac4801186cd4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/algorithm.h
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>  // for int64_t
-#include <numeric>
-
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
-  int64_t beg = 0, end = num - 1;
-  while (beg <= end) {
-    auto mid = ((beg + end) >> 1);
-    if (x[mid] == val)
-      return mid;
-    else if (x[mid] < val)
-      beg = mid + 1;
-    else
-      end = mid - 1;
-  }
-  return -1;
-}
-
-template <typename T>
-HOSTDEVICE inline size_t LowerBound(const T *x, size_t num, const T &val) {
-#ifdef __CUDA_ARCH__
-  // The following code is from
-  // https://en.cppreference.com/w/cpp/algorithm/lower_bound
-  auto *first = x;
-  int64_t count = static_cast<int64_t>(num);
-  while (count > 0) {
-    int64_t step = (count >> 1);
-    auto *it = first + step;
-    if (*it < val) {
-      first = ++it;
-      count -= (step + 1);
-    } else {
-      count = step;
-    }
-  }
-  return static_cast<size_t>(first - x);
-#else
-  return static_cast<size_t>(std::lower_bound(x, x + num, val) - x);
-#endif
-}
-
-template <typename T>
-HOSTDEVICE inline size_t UpperBound(const T *x, size_t num, const T &val) {
-#ifdef __CUDA_ARCH__
-  // The following code is from
-  // https://en.cppreference.com/w/cpp/algorithm/upper_bound
-  auto *first = x;
-  int64_t count = static_cast<int64_t>(num);
-  while (count > 0) {
-    auto step = (count >> 1);
-    auto *it = first + step;
-    if (val < *it) {
-      count = step;
-    } else {
-      first = ++it;
-      count -= (step + 1);
-    }
-  }
-  return static_cast<size_t>(first - x);
-#else
-  return static_cast<size_t>(std::upper_bound(x, x + num, val) - x);
-#endif
-}
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc
deleted file mode 100644
index 0155ef188ef967fbf67505d28beeeaf956bb3a70..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/beam_search.cc
+++ /dev/null
@@ -1,289 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/beam_search.h"
-#include <algorithm>
-#include <map>
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-class BeamSearchFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext &context,
-                  const framework::LoDTensor *pre_ids,
-                  const framework::LoDTensor *pre_scores,
-                  const framework::LoDTensor *ids,
-                  const framework::LoDTensor *scores,
-                  framework::LoDTensor *selected_ids,
-                  framework::LoDTensor *selected_scores,
-                  framework::Tensor *parent_idx, size_t level, size_t beam_size,
-                  int end_id, bool is_accumulated) {
-    auto abs_lod = framework::ToAbsOffset(scores->lod());
-    auto &high_level = abs_lod[level];
-
-    auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores, level,
-                                        beam_size, end_id, is_accumulated);
-    auto selected_items = ToMap(items, high_level.back());
-    if (FLAGS_v == 3) {
-      VLOG(3) << "selected_items:";
-      for (size_t i = 0; i < selected_items.size(); ++i) {
-        VLOG(3) << "offset: " << i;
-        for (auto &item : selected_items[i]) {
-          VLOG(3) << item.ToString();
-        }
-      }
-    }
-
-    PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
-    // calculate the output tensor's height
-    size_t num_instances = std::accumulate(
-        std::begin(selected_items), std::end(selected_items), 0,
-        [](size_t a, std::vector<Item> &b) { return a + b.size(); });
-    // the output tensor shape should be [num_instances, 1]
-    auto dims = framework::make_ddim(
-        std::vector<int64_t>({static_cast<int>(num_instances), 1}));
-    auto *selected_ids_data =
-        selected_ids->mutable_data<int64_t>(dims, platform::CPUPlace());
-    auto *selected_scores_data =
-        selected_scores->mutable_data<float>(dims, platform::CPUPlace());
-    auto *parent_idx_data =
-        parent_idx
-            ? parent_idx->mutable_data<int>(
-                  {static_cast<int64_t>(num_instances)}, platform::CPUPlace())
-            : nullptr;
-
-    // fill in data
-    std::vector<size_t> low_level;
-    size_t low_offset = 0;
-    for (auto &items : selected_items) {
-      low_level.push_back(low_offset);
-      for (auto &item : items) {
-        if (parent_idx) {
-          parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
-        }
-        selected_ids_data[low_offset] = item.id;
-        selected_scores_data[low_offset] = item.score;
-        low_offset++;
-      }
-    }
-    low_level.push_back(low_offset);
-
-    // fill lod
-    framework::LoD lod(2);
-    lod[0].assign(high_level.begin(), high_level.end());
-    lod[1].assign(low_level.begin(), low_level.end());
-    if (!framework::CheckLoD(lod)) {
-      PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
-    }
-    selected_ids->set_lod(lod);
-    selected_scores->set_lod(lod);
-  }
-
-  /*
-   * The basic items help to sort.
-   */
-  struct Item {
-    Item() {}
-    Item(size_t offset, size_t id, float score)
-        : offset(offset), id(id), score(score) {}
-    // offset in the higher lod level.
-    size_t offset;
-    // prefix id in the lower lod level.
-    // size_t prefix;
-    // the candidate id
-    size_t id;
-    // the corresponding score
-    float score;
-
-    inline bool operator<(const Item &in) const {
-      return (score < in.score) ||
-             ((score == in.score) && (offset < in.offset));
-    }
-
-    inline void operator=(const Item &in) {
-      offset = in.offset;
-      id = in.id;
-      score = in.score;
-    }
-
-    std::string ToString() {
-      std::ostringstream os;
-      os << "{";
-      os << "offset: " << offset << ", ";
-      os << "id: " << id << ", ";
-      os << "score: " << score << "";
-      os << "}";
-      return os.str();
-    }
-  };
-
- protected:
-  /*
-   * Prune the source sentences all branchs finished, and it is optional.
-   * Pruning must one step later than finishing (thus pre_ids is needed here),
-   * since the end tokens must be writed out.
-   */
-  void PruneEndBeams(const framework::LoDTensor *pre_ids,
-                     const framework::LoD &abs_lod,
-                     std::vector<std::vector<Item>> *items, size_t lod_level,
-                     int end_id) {
-    auto *pre_ids_data = pre_ids->data<int64_t>();
-    auto &high_level = abs_lod[lod_level];
-    for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
-      size_t src_prefix_start = high_level[src_idx];
-      size_t src_prefix_end = high_level[src_idx + 1];
-      bool finish_flag = true;
-      for (size_t offset = src_prefix_start; offset < src_prefix_end;
-           offset++) {
-        for (auto &item : items->at(offset)) {
-          if (item.id != static_cast<size_t>(end_id) ||
-              pre_ids_data[offset] != end_id) {
-            finish_flag = false;
-            break;
-          }
-        }
-        if (!finish_flag) break;
-      }
-      if (finish_flag) {  // all branchs of the beam (source sentence) end and
-                          // prune this beam
-        for (size_t offset = src_prefix_start; offset < src_prefix_end;
-             offset++)
-          items->at(offset).clear();
-      }
-    }
-  }
-
-  /*
-   * Transform the items into a map whose key is offset, value is the items.
-   * NOTE low performance.
-   */
-  std::vector<std::vector<Item>> ToMap(
-      const std::vector<std::vector<Item>> &items, size_t element_num) {
-    std::vector<std::vector<Item>> result;
-    result.resize(element_num);
-    for (auto &entries : items) {
-      for (const auto &item : entries) {
-        result[item.offset].push_back(item);
-      }
-    }
-    return result;
-  }
-
-  void Insert(std::vector<Item> *top_beam_ptr, const Item &item,
-              size_t beam_size) {
-    std::vector<Item> &top_beam = *top_beam_ptr;
-
-    size_t num_beams = top_beam.size();
-    if (num_beams < beam_size) {
-      top_beam.resize(num_beams + 1);
-      num_beams++;
-    } else {
-      if (item < top_beam[beam_size - 1]) {
-        return;
-      }
-    }
-
-    for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
-      if (top_beam[k] < item) {
-        top_beam[k + 1] = top_beam[k];
-      } else {
-        top_beam[k + 1] = item;
-        return;
-      }
-    }
-    top_beam[0] = item;
-  }
-
-  /*
-   * For each source, select top beam_size records.
-   */
-  std::vector<std::vector<Item>> SelectTopBeamSizeItems(
-      const framework::LoDTensor *pre_ids,
-      const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids,
-      const framework::LoDTensor *scores, size_t lod_level, size_t beam_size,
-      int end_id, bool is_accumulated) {
-    std::vector<std::vector<Item>> result;
-
-    // find the current candidates
-    auto abs_lod = framework::ToAbsOffset(scores->lod());
-
-    auto *pre_ids_data = pre_ids->data<int64_t>();
-    auto *pre_scores_data = pre_scores->data<float>();
-
-    auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
-    auto *scores_data = scores->data<float>();
-
-    size_t num_seqs = scores->NumElements(lod_level);
-    size_t seq_width = 1;
-    for (int i = 1; i < scores->dims().size(); i++) {
-      seq_width *= scores->dims()[i];
-    }
-
-    for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
-      size_t seq_offset_start = abs_lod[lod_level][seq_id];
-      size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
-
-      std::vector<Item> top_beam;
-      top_beam.reserve(beam_size);
-
-      for (size_t offset = seq_offset_start; offset < seq_offset_end;
-           ++offset) {
-        auto pre_id = pre_ids_data[offset];
-        auto pre_score = pre_scores_data[offset];
-        if (pre_id == end_id) {
-          // Allocate all probability mass to end_id for finished branchs and
-          // the other candidate ids can be ignored.
-          Item item(offset, end_id, pre_score);
-          Insert(&top_beam, item, beam_size);
-        } else {
-          size_t index = offset * seq_width;
-          for (size_t d = 0; d < seq_width; d++, index++) {
-            int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
-            float score = is_accumulated
-                              ? scores_data[index]
-                              : pre_score + std::log(scores_data[index]);
-            Item item(offset, id, score);
-            Insert(&top_beam, item, beam_size);
-          }
-        }
-      }
-
-      result.emplace_back(top_beam);
-    }
-
-    if (FLAGS_v == 3) {
-      VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
-      for (auto &items : result) {
-        VLOG(3) << "item set:";
-        for (auto &item : items) {
-          VLOG(3) << item.ToString();
-        }
-      }
-    }
-
-    return result;
-  }
-};
-
-template class BeamSearchFunctor<platform::CPUDeviceContext, int>;
-template class BeamSearchFunctor<platform::CPUDeviceContext, int64_t>;
-template class BeamSearchFunctor<platform::CPUDeviceContext, float>;
-template class BeamSearchFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
deleted file mode 100644
index ecfeba338482a99735488fec08be8c3adcf4d0f4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/beam_search.cu
+++ /dev/null
@@ -1,426 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/beam_search.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-struct Triple {
-  __device__ __forceinline__ Triple() {}
-  __device__ __forceinline__ Triple(int o, int i, float s)
-      : offset(o), id(i), score(s) {}
-
-  __device__ __forceinline__ void set(int o, int i, float s) {
-    offset = o;
-    id = i;
-    score = s;
-  }
-
-  __device__ __forceinline__ void operator=(const Triple& in) {
-    offset = in.offset;
-    id = in.id;
-    score = in.score;
-  }
-
-  __device__ __forceinline__ bool operator<(const float s) const {
-    return score < s;
-  }
-
-  __device__ __forceinline__ bool operator<(const Triple& in) const {
-    return (score < in.score) || ((score == in.score) && (offset < in.offset));
-  }
-
-  int offset;
-  int id;
-  float score;
-};
-
-__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p,
-                                       int beam_size) {
-  if (p < top_beam[beam_size - 1]) {
-    return;
-  }
-  for (int k = beam_size - 2; k >= 0; --k) {
-    if (top_beam[k] < p) {
-      top_beam[k + 1] = top_beam[k];
-    } else {
-      top_beam[k + 1] = p;
-      return;
-    }
-  }
-  top_beam[0] = p;
-}
-
-template <int MaxThreadsPerSeq, bool IsAccumulated = true>
-__device__ __forceinline__ int SelectTopBeam(
-    Triple* top_beam, const int64_t* pre_ids, const float* pre_scores,
-    const int64_t* ids, const float* scores, const int seq_offset_start,
-    const int seq_offset_end, const int seq_width, int beam_size, int end_id,
-    int used_threads) {
-  // top_beam is shared memory
-  const int tid = threadIdx.x;
-  const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq;
-
-  int num_used_threads = used_threads;
-
-  Triple* top_beam_local = top_beam + tid * beam_size;
-  if (tid_of_seq < num_used_threads) {
-    for (int i = 0; i < beam_size; ++i) {
-      top_beam_local[i].set(-1, -1, -INFINITY);
-    }
-
-    for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) {
-      int pre_id = static_cast<int>(pre_ids[offset]);
-      if (pre_id == end_id) {
-        if (tid_of_seq == 0) {
-          Triple tmp(offset, end_id, pre_scores[offset]);
-          Insert(top_beam_local, tmp, beam_size);
-        }
-      } else {
-        int index = offset * seq_width + tid_of_seq;
-        if (!IsAccumulated) {
-          float pre_score = pre_scores[offset];
-          for (int i = tid_of_seq; i < seq_width; i += num_used_threads) {
-            float score = pre_score + __logf(scores[index]);
-            int id = ids ? static_cast<int>(ids[index]) : i;
-            Triple tmp(offset, id, score);
-            Insert(top_beam_local, tmp, beam_size);
-            index += num_used_threads;
-          }
-        } else {
-          for (int i = tid_of_seq; i < seq_width; i += num_used_threads) {
-            int id = ids ? static_cast<int>(ids[index]) : i;
-            float score = scores[index];
-            Triple tmp(offset, id, score);
-            Insert(top_beam_local, tmp, beam_size);
-            index += num_used_threads;
-          }
-        }
-      }
-    }
-  }
-
-  while (num_used_threads > 1) {
-    if (num_used_threads > 16) {
-      __syncthreads();
-    }
-
-    if ((num_used_threads & 0x1) != 0) {
-      // If num_used_threads is a odd number, merge local top_beam of thread 0
-      // and num_used_threads - 1
-      if (tid_of_seq == 0) {
-        int index_in_sh = (num_used_threads - 1 + tid) * beam_size;
-        for (int i = 0; i < beam_size; i++) {
-          Insert(top_beam_local, top_beam[index_in_sh], beam_size);
-          index_in_sh++;
-        }
-      }
-    }
-
-    num_used_threads = num_used_threads >> 1;
-    if (tid_of_seq < num_used_threads) {
-      int index_in_sh = (num_used_threads + tid) * beam_size;
-      for (int i = 0; i < beam_size; i++) {
-        Insert(top_beam_local, top_beam[index_in_sh], beam_size);
-        index_in_sh++;
-      }
-    }
-  }
-
-  if (tid_of_seq == 0) {
-    int num_items = 0;
-    for (int i = 0; i < beam_size; ++i) {
-      num_items =
-          (top_beam_local[i].score > -INFINITY) ? num_items + 1 : num_items;
-    }
-    return num_items;
-  }
-
-  return 0;
-}
-
-__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local,
-                                              const int64_t* pre_ids,
-                                              const int end_id, int num_items) {
-  bool finish_flag = true;
-  for (int i = 0; i < num_items; ++i) {
-    int offset = top_beam_local[i].offset;
-    if (top_beam_local[i].id != end_id ||
-        static_cast<int>(pre_ids[offset]) != end_id) {
-      finish_flag = false;
-      break;
-    }
-  }
-  return finish_flag;
-}
-
-template <bool ReturnParentIdx = false>
-__device__ __forceinline__ void WriteBack(
-    int64_t* selected_ids, float* selected_scores, int* parent_idx,
-    size_t* selected_offsets, Triple* top_beam_local,
-    const int seq_offset_start, const int seq_offset_end,
-    const int selected_seq_start, const int selected_seq_length) {
-  const int tid = threadIdx.x;  // use 1 thread only for each sequence
-  int global_index = selected_seq_start;
-  for (int global_offset = seq_offset_start; global_offset < seq_offset_end;
-       ++global_offset) {
-    for (int local_index = 0; local_index < selected_seq_length;
-         ++local_index) {
-      if (top_beam_local[local_index].offset == global_offset) {
-        selected_ids[global_index] =
-            static_cast<int64_t>(top_beam_local[local_index].id);
-        selected_scores[global_index] = top_beam_local[local_index].score;
-        if (ReturnParentIdx) {
-          parent_idx[global_index] = static_cast<int>(global_offset);
-        }
-        global_index++;
-      }
-    }
-    selected_offsets[global_offset + 1] = static_cast<size_t>(global_index);
-  }
-}
-
-template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
-__device__ void BeamSearchDetails(
-    int64_t* selected_ids, float* selected_scores, int* parent_idx,
-    size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
-    const int64_t* ids, const float* scores, const int seq_offset_start,
-    const int seq_offset_end, const int seq_width, int beam_size, int end_id,
-    bool is_accumulated, int num_used_threads) {
-  __shared__ Triple top_beam[MaxLength];
-
-  int num_items = 0;
-  if (is_accumulated) {
-    num_items = SelectTopBeam<MaxThreadsPerSeq, true>(
-        top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start,
-        seq_offset_end, seq_width, beam_size, end_id, num_used_threads);
-  } else {
-    num_items = SelectTopBeam<MaxThreadsPerSeq, false>(
-        top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start,
-        seq_offset_end, seq_width, beam_size, end_id, num_used_threads);
-  }
-
-  const int tid = threadIdx.x;  // use 1 thread only for each sequence
-  const int tid_of_seq = tid % MaxThreadsPerSeq;
-  if (tid_of_seq == 0) {
-    // Use 1 thread for each sequence.
-    Triple* top_beam_local = top_beam + tid * beam_size;
-    bool finish_flag =
-        PruneEndBeams(top_beam_local, pre_ids, end_id, num_items);
-
-    int selected_seq_start = 0;
-    int selected_seq_length = finish_flag ? 0 : num_items;
-
-    if (MaxSeqs > 1) {
-      const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid;
-      __shared__ int shared_mem[MaxSeqs];
-
-      // [0, MaxSeqs - 1], length of each sequences
-      shared_mem[seq_id] = selected_seq_length;
-      __syncthreads();
-
-      for (int s = 0; s < seq_id; ++s) {
-        selected_seq_start += shared_mem[s];
-      }
-
-      if (seq_id == 0) {
-        selected_offsets[0] = 0;
-      }
-    } else {
-      selected_offsets[0] = 0;
-    }
-
-    if (parent_idx) {
-      WriteBack<true>(selected_ids, selected_scores, parent_idx,
-                      selected_offsets, top_beam_local, seq_offset_start,
-                      seq_offset_end, selected_seq_start, selected_seq_length);
-    } else {
-      WriteBack<false>(selected_ids, selected_scores, parent_idx,
-                       selected_offsets, top_beam_local, seq_offset_start,
-                       seq_offset_end, selected_seq_start, selected_seq_length);
-    }
-  }
-}
-
-template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
-__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
-                                 int* parent_idx, size_t* selected_offsets,
-                                 const int64_t* pre_ids,
-                                 const float* pre_scores, const int64_t* ids,
-                                 const float* scores, const size_t* seq_offsets,
-                                 const int num_seqs, const int seq_width,
-                                 int beam_size, int end_id, bool is_accumulated,
-                                 int num_used_threads) {
-  const int tid = threadIdx.x;
-  const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid;
-
-  int seq_offset_start = static_cast<int>(seq_offsets[seq_id]);
-  int seq_offset_end = static_cast<int>(seq_offsets[seq_id + 1]);
-
-  BeamSearchDetails<MaxLength, MaxThreadsPerSeq, MaxSeqs>(
-      selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
-      pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
-      beam_size, end_id, is_accumulated, num_used_threads);
-}
-
-template <int MaxLength, int MaxThreadsPerSeq>
-__global__ void BeamSearchKernelSingle(
-    int64_t* selected_ids, float* selected_scores, int* parent_idx,
-    size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
-    const int64_t* ids, const float* scores, const int seq_length,
-    const int seq_width, int beam_size, int end_id, bool is_accumulated,
-    int num_used_threads) {
-  const int seq_offset_start = 0;
-  const int seq_offset_end = seq_length;
-
-  BeamSearchDetails<MaxLength, MaxThreadsPerSeq, 1>(
-      selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
-      pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
-      beam_size, end_id, is_accumulated, num_used_threads);
-}
-
-static inline int GetNumUsedThreads(const int max_threads_per_seq,
-                                    const int seq_width, int beam_size) {
-  int num_used_threads = (seq_width + beam_size - 1) / beam_size;
-  num_used_threads = max_threads_per_seq < num_used_threads
-                         ? max_threads_per_seq
-                         : num_used_threads;
-
-  num_used_threads =
-      num_used_threads > 32
-          ? (num_used_threads >> 5) << 5
-          : (num_used_threads > 16
-                 ? 32
-                 : (num_used_threads > 8
-                        ? 16
-                        : (num_used_threads > 4
-                               ? 8
-                               : (num_used_threads > 2 ? 4
-                                                       : num_used_threads))));
-  return num_used_threads;
-}
-
-template <typename T>
-class BeamSearchFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor* pre_ids,
-                  const framework::LoDTensor* pre_scores,
-                  const framework::LoDTensor* ids,
-                  const framework::LoDTensor* scores,
-                  framework::LoDTensor* selected_ids,
-                  framework::LoDTensor* selected_scores,
-                  framework::Tensor* parent_idx, size_t level, size_t beam_size,
-                  int end_id, bool is_accumulated) {
-    auto abs_lod = framework::ToAbsOffset(scores->lod());
-
-    const int64_t* pre_ids_data = pre_ids->data<int64_t>();
-    const float* pre_scores_data = pre_scores->data<float>();
-    const int64_t* ids_data = ids ? ids->data<int64_t>() : nullptr;
-    const float* scores_data = scores->data<float>();
-
-    const size_t num_seqs = abs_lod[level].size() - 1;
-    size_t seq_width = 1;
-    for (int i = 1; i < scores->dims().size(); i++) {
-      seq_width *= scores->dims()[i];
-    }
-
-    // Reserve a big enough memory.
-    auto selected_dims =
-        framework::make_ddim({static_cast<int64_t>(num_seqs * beam_size), 1});
-    int64_t* selected_ids_data =
-        selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace());
-    float* selected_scores_data =
-        selected_scores->mutable_data<float>(selected_dims, context.GetPlace());
-    int* parent_idx_data =
-        parent_idx
-            ? parent_idx->mutable_data<int>(
-                  {static_cast<int64_t>(num_seqs * beam_size)},
-                  context.GetPlace())
-            : nullptr;
-
-    framework::LoD selected_lod(2);
-    selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
-    selected_lod[1].resize(scores->dims()[0] + 1);
-    size_t* selected_offsets =
-        selected_lod[1].CUDAMutableData(context.GetPlace());
-
-    if (num_seqs == 1) {
-      const int seq_length = static_cast<int>(abs_lod[level][1]);
-      const int kMaxThreadsPerSeq = 1024;
-      int num_used_threads =
-          GetNumUsedThreads(kMaxThreadsPerSeq, static_cast<int>(seq_width),
-                            static_cast<int>(beam_size));
-      switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) {
-        CUDA_LAUNCH_KERNEL_HELPER(
-            BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq><<<
-                1, kMaxThreadsPerSeq, 0, context.stream()>>>(
-                selected_ids_data, selected_scores_data, parent_idx_data,
-                selected_offsets, pre_ids_data, pre_scores_data, ids_data,
-                scores_data, seq_length, static_cast<int>(seq_width),
-                static_cast<int>(beam_size), static_cast<int>(end_id),
-                is_accumulated, num_used_threads));
-      }
-    } else if (num_seqs <= 4) {
-      const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace());
-      // Use only 1 block
-      const int kMaxThreadsPerSeq = 32;
-      const int kMaxSeqs = 4;
-      int num_used_threads =
-          GetNumUsedThreads(kMaxThreadsPerSeq, static_cast<int>(seq_width),
-                            static_cast<int>(beam_size));
-      switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) {
-        CUDA_LAUNCH_KERNEL_HELPER(
-            BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs><<<
-                1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
-                selected_ids_data, selected_scores_data, parent_idx_data,
-                selected_offsets, pre_ids_data, pre_scores_data, ids_data,
-                scores_data, seq_offsets, static_cast<int>(num_seqs),
-                static_cast<int>(seq_width), static_cast<int>(beam_size),
-                end_id, is_accumulated, num_used_threads));
-      }
-    } else {
-      LOG(FATAL) << "Not implemented.";
-    }
-
-    context.Wait();
-    if (!framework::CheckLoD(selected_lod)) {
-      PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod));
-    }
-
-    selected_ids->set_lod(selected_lod);
-    selected_scores->set_lod(selected_lod);
-    if (selected_lod[1].back() < num_seqs * beam_size) {
-      auto final_selected_dims = framework::make_ddim(
-          {static_cast<int64_t>(selected_lod[1].back()), 1});
-      selected_ids->Resize(final_selected_dims);
-      selected_scores->Resize(final_selected_dims);
-      if (parent_idx) {
-        parent_idx->Resize({static_cast<int64_t>(selected_lod[1].back())});
-      }
-    }
-  }
-};
-
-template class BeamSearchFunctor<platform::CUDADeviceContext, int>;
-template class BeamSearchFunctor<platform::CUDADeviceContext, int64_t>;
-template class BeamSearchFunctor<platform::CUDADeviceContext, float>;
-template class BeamSearchFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h
deleted file mode 100644
index 4474e7ea52affed792572d02202ec2577c471e50..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/beam_search.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * This is an implementation of beam search.
- *
- * To explain the details, lets take machine translation task for example, in
- * this task, one source sentence is translated to multiple target sentences,
- * during this period, one sentence will be translated to multiple translation
- * prefixes(target sentence that have not ended), in each time step a prefix
- * will have some candidates, input the candidate ids and their corresponding
- * scores (probabilities), it will sort and select the top beam_size candidates
- * for each source sentence, and store the selected candidates's score and their
- * corresponding ids to LoDTensors.
- *
- * A detailed example:
- *
- *  Input
- *
- *    ids:
- *      - LoD (should have 2 levels)
- *        - first level: [0, 1, 4]
- *        - second level: [0, 1, 2, 3, 4]
- *      - tensor's data:
- *          [[4, 2, 5]
- *           [2, 1, 3]
- *           [3, 5, 2]
- *           [8, 2, 1]]
- *
- *    scores:
- *      - LoD same as `ids`
- *      - tensor's data
- *          [[0.5, 0.3, 0.2]
- *           [0.6, 0.3, 0.1]
- *           [0.9, 0.5, 0.1]
- *           [0.7, 0.5, 0.1]]
- *
- * The inputs means that there are 2 source sentences to translate, and the
- * first source has 1 prefix, the second source has 2 prefix.
- *
- * Lets assume beam size is 2, and the beam search's output should be
- *      - LoD
- *        - first level: [0, 1, 2]
- *        - second level: [0, 2, 4]
- *      - id tensor's data
- *          [[4,
- *            1,
- *            3,
- *            8]]
- *      - score tensor's data
- *          [[0.5,
- *            0.3,
- *            0.9,
- *            0.7]]
- *
- * TODO all the prune operations should be in the beam search, so it is better
- * to split the beam search algorithm into a sequence of smaller operators, and
- * the prune operators can be inserted in this sequence.
- */
-template <typename DeviceContext, typename T>
-class BeamSearchFunctor {
- public:
-  /*
-   * The main function of beam search.
-   *
-   * @selected_ids: a [None, 1]-shaped tensor with LoD.
-   *   In a machine translation model, it might be the candidate term id sets,
-   *   each set stored as a varience-length sequence.
-   *   The format might be described with a two-level LoD
-   *   - [[0 1],
-   *      [0 1 2]]
-   *   - [[]
-   *      [0 1]]
-   *   the first level of LoD tells that there are two source sentences. The
-   *   second level describes the details of the candidate id set's offsets in
-   * the source sentences.
-   *
-   *  @selected_scores: a LoD tensor with the same shape and LoD with
-   * selected_ids.
-   *   It stores the corresponding scores of candidate ids in selected_ids.
-   *
-   * Return false if all the input tensor is empty, in machine translation task
-   * that means no candidates is provided, and the task will stop running.
-   */
-  void operator()(
-      const DeviceContext& context, const framework::LoDTensor* pre_ids,
-      const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids,
-      const framework::LoDTensor* scores, framework::LoDTensor* selected_ids,
-      framework::LoDTensor* selected_scores, framework::Tensor* parent_idx,
-      size_t level, size_t beam_size, int end_id, bool is_accumulated);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc
deleted file mode 100644
index 7ea8eb8b00db328ca13d3d33d751aca4eac66dae..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/beam_search_test.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/beam_search.h"
-#include <gtest/gtest.h>
-#include <vector>
-
-void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
-                       paddle::framework::LoDTensor* scores,
-                       paddle::framework::LoDTensor* pre_ids,
-                       paddle::framework::LoDTensor* pre_scores) {
-  // lod
-  paddle::framework::LoD lod;
-  std::vector<size_t> level0({0, 2, 4});
-  std::vector<size_t> level1({0, 1, 2, 3, 4});
-  lod.push_back(level0);
-  lod.push_back(level1);
-  ids->set_lod(lod);
-  scores->set_lod(lod);
-
-  auto dims = paddle::framework::make_ddim({4, 3});
-  ids->Resize(dims);
-  scores->Resize(dims);
-
-  paddle::platform::CPUPlace place;
-  auto* ids_data = ids->mutable_data<int64_t>(place);
-  auto* scores_data = scores->mutable_data<float>(place);
-  std::vector<int64_t> ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
-  std::vector<float> scores_vec_data(
-      {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});
-
-  CHECK_EQ(static_cast<size_t>(ids->numel()), ids_vec_data.size());
-  CHECK_EQ(static_cast<size_t>(ids->numel()), scores_vec_data.size());
-
-  for (int i = 0; i < ids->numel(); i++) {
-    ids_data[i] = ids_vec_data[i];
-    scores_data[i] = scores_vec_data[i];
-  }
-
-  // pre_ids
-  pre_ids->Resize(paddle::framework::make_ddim({4, 1}));
-  for (int i = 0; i < 4; i++) {
-    pre_ids->mutable_data<int64_t>(place)[i] = i + 1;
-  }
-
-  // pre_scores
-  pre_scores->Resize(paddle::framework::make_ddim({4, 1}));
-  for (int i = 0; i < 4; i++) {
-    pre_scores->mutable_data<float>(place)[i] = 0.1 * (i + 1);
-  }
-}
-
-template <typename DeviceContext, typename Place>
-void TestBeamSearch() {
-  paddle::framework::LoDTensor ids;
-  paddle::framework::LoDTensor scores;
-  paddle::framework::LoDTensor pre_ids;
-  paddle::framework::LoDTensor pre_scores;
-
-  auto* place = new Place();
-  DeviceContext* context = new DeviceContext(*place);
-  if (paddle::platform::is_cpu_place(*place)) {
-    PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
-  } else {
-    paddle::framework::LoDTensor cpu_ids;
-    paddle::framework::LoDTensor cpu_scores;
-    paddle::framework::LoDTensor cpu_pre_ids;
-    paddle::framework::LoDTensor cpu_pre_scores;
-
-    PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
-
-    TensorCopySync(cpu_ids, *place, &ids);
-    TensorCopySync(cpu_scores, *place, &scores);
-    TensorCopySync(cpu_pre_ids, *place, &pre_ids);
-    TensorCopySync(cpu_pre_scores, *place, &pre_scores);
-
-    ids.set_lod(cpu_ids.lod());
-    scores.set_lod(cpu_scores.lod());
-    pre_ids.set_lod(cpu_pre_ids.lod());
-    pre_scores.set_lod(cpu_pre_scores.lod());
-  }
-
-  paddle::framework::LoDTensor selected_ids;
-  paddle::framework::LoDTensor selected_scores;
-  paddle::framework::LoDTensor parent_idx;
-
-  size_t level = 0;
-  size_t beam_size = 2;
-  int end_id = 0;
-  paddle::operators::math::BeamSearchFunctor<DeviceContext, float> beamsearch;
-  beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
-             &selected_scores, &parent_idx, level, beam_size, end_id, true);
-
-  ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
-
-  paddle::framework::LoDTensor cpu_selected_ids;
-  paddle::framework::LoDTensor cpu_selected_scores;
-  if (paddle::platform::is_cpu_place(*place)) {
-    cpu_selected_ids = selected_ids;
-    cpu_selected_scores = selected_scores;
-  } else {
-    TensorCopySync(selected_ids, paddle::platform::CPUPlace(),
-                   &cpu_selected_ids);
-    TensorCopySync(selected_scores, paddle::platform::CPUPlace(),
-                   &cpu_selected_scores);
-    cpu_selected_ids.set_lod(selected_ids.lod());
-    cpu_selected_scores.set_lod(selected_scores.lod());
-  }
-
-  std::vector<int64_t> expected_ids({4, 5, 3, 8});
-  std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
-  for (int i = 0; i < 4; i++) {
-    ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
-    ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
-  }
-
-  delete place;
-  delete context;
-}
-
-TEST(BeamSearch, CPU) {
-  TestBeamSearch<paddle::platform::CPUDeviceContext,
-                 paddle::platform::CPUPlace>();
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(BeamSearch, GPU) {
-  TestBeamSearch<paddle::platform::CUDADeviceContext,
-                 paddle::platform::CUDAPlace>();
-}
-#endif
diff --git a/paddle/fluid/operators/math/blas.cc b/paddle/fluid/operators/math/blas.cc
deleted file mode 100644
index 6a143b3c056455595fdedc131b0c5f4ee756e1e0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/blas.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/math/blas.h"
-
-#include <utility>
-namespace paddle {
-namespace operators {
-namespace math {
-MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim,
-                                     int num_flatten_cols, bool trans) {
-  PADDLE_ENFORCE_GT(tensor_dim.size(), 1);
-  MatDescriptor retv;
-  if (num_flatten_cols > 1) {
-    auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols);
-    retv.height_ = flatten_dim[0];
-    retv.width_ = flatten_dim[1];
-  } else {
-    if (tensor_dim.size() == 2) {
-      retv.height_ = tensor_dim[0];
-      retv.width_ = tensor_dim[1];
-    } else {
-      auto dim_vec = framework::vectorize(tensor_dim);
-      retv.batch_size_ = 1;
-      for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
-        retv.batch_size_ *= dim_vec[i];
-      }
-      retv.height_ = dim_vec[dim_vec.size() - 2];
-      retv.width_ = dim_vec[dim_vec.size() - 1];
-      retv.stride_ = retv.height_ * retv.width_;
-    }
-  }
-  if (trans) {
-    std::swap(retv.width_, retv.height_);
-  }
-  retv.trans_ = trans;
-  return retv;
-}
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
deleted file mode 100644
index a15dab935552c6e93fd9c0d9963985d6ea024f35..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/blas.h
+++ /dev/null
@@ -1,363 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-
-#ifdef PADDLE_WITH_LIBXSMM
-#include <libxsmm.h>
-#endif
-
-#ifdef PADDLE_USE_OPENBLAS
-#include <cblas.h>
-#endif
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/**
- * Matrix Descriptor of a memory buffer.
- *
- * It is used for Blas::MatMul. MatMul operator can be batched.
- * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a
- * `batch_size` times of GEMM. The batched GEMM could be faster base on the
- * implementation of the blas library. The batch size could be zero. If any
- * matrix of `matmul` has a batch size, the will be a batched GEMM, too. e.g.,
- * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be
- * [BatchSize, H1, W2]
- *
- * The boolean flag, `trans`, describe the memory is the transpose of matrix or
- * not. If the trans is true, the last two dims of matrix are transposed. The
- * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height].
- *
- * The MatDescriptor is not only the dimension or shape of a matrix, it also
- * contains the layout, stride of matrix. It is clearer to have a structure than
- * reuse `DDim`.
- */
-struct MatDescriptor {
-  int64_t height_;
-  int64_t width_;
-  int64_t stride_{0};
-  int64_t batch_size_{0};
-  bool trans_;
-};
-
-/**
- * Create Matrix Descriptor from a tensor dim, num_flatten_cols, and transpose
- * flag
- *
- * @param tensor_dim: The dimension of the tensor. The rank of this dimension
- * must larger than 1.
- *
- * @param num_flatten_cols:  Reshape a tensor to a matrix. The matrix's first
- * dimension(column length) will be the product of tensor's first `num_col_dims`
- * dimensions. If num_flatten_cols is zero, the first N-2 dimension will be the
- * batch_size of descriptor.
- *
- * @param trans: True if the matrix is transposed.
- */
-extern MatDescriptor CreateMatrixDescriptor(const framework::DDim& tensor_dim,
-                                            int num_flatten_cols, bool trans);
-
-template <typename DeviceContext>
-class Blas {
- public:
-  explicit Blas(const DeviceContext& context) : context_(context) {}
-
-  template <typename T>
-  void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-            T alpha, const T* A, const T* B, T beta, T* C) const;
-
-  template <typename T>
-  void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A,
-            int lda, const T* B, int ldb, T beta, T* C, int ldc) const;
-
-  template <typename T>
-  void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-            T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C,
-            int ldc) const;
-
-#ifdef PADDLE_WITH_MKLML
-  template <typename T>
-  T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N,
-                const int K) const;
-
-  template <typename T>
-  void GEMM_PACK(const CBLAS_IDENTIFIER id, const CBLAS_TRANSPOSE trans, int M,
-                 int N, int K, const T alpha, const T* src, const int ld,
-                 T* dst) const;
-
-  template <typename T>
-  void GEMM_COMPUTE(int transA, int transB, int M, int N, int K, const T* A,
-                    const int lda, const T* B, const int ldb, T beta, T* C,
-                    const int ldc) const;
-
-  template <typename T>
-  void GEMM_FREE(T* data) const;
-
-  template <typename T>
-  void CSRMM(const char* transa, const int* m, const int* n, const int* k,
-             const T* alpha, const char* matdescra, const T* val,
-             const int* indx, const int* pntrb, const int* pntre, const T* b,
-             const int* ldb, const T* beta, T* c, const int* ldc) const;
-
-#if !defined(PADDLE_WITH_CUDA)
-  template <typename T>
-  void MatMulWithHead(const framework::Tensor& mat_a,
-                      const MatDescriptor& dim_a,
-                      const framework::Tensor& mat_b,
-                      const MatDescriptor& dim_b, T alpha, int head_number,
-                      framework::Tensor* mat_out, T beta,
-                      bool mat_y_split_vertical) const;
-#endif
-#endif
-
-  template <typename T>
-  void MatMul(const int M, const int N, const int K, const T* A, const T* B,
-              T* C) const;
-
-  template <typename T>
-  void MatMul(const framework::Tensor& mat_a, bool trans_a,
-              const framework::Tensor& mat_b, bool trans_b, T alpha,
-              framework::Tensor* mat_out, T beta) const;
-
-  template <typename T>
-  void MatMul(const framework::Tensor& mat_a, bool trans_a,
-              const framework::Tensor& mat_b, bool trans_b,
-              framework::Tensor* mat_out) const {
-    MatMul(mat_a, trans_a, mat_b, trans_b, static_cast<T>(1.0), mat_out,
-           static_cast<T>(0.0));
-  }
-
-  template <typename T>
-  void MatMul(const framework::Tensor& mat_a, const framework::Tensor& mat_b,
-              framework::Tensor* mat_out) const {
-    this->template MatMul<T>(mat_a, false, mat_b, false, mat_out);
-  }
-
-  template <typename T>
-  void AXPY(int n, T alpha, const T* x, T* y) const;
-
-  template <typename T>
-  void VADD(int n, const T* x, const T* y, T* z) const;
-
-  template <typename T>
-  void VMUL(int n, const T* x, const T* y, T* z) const;
-
-  template <typename T>
-  void VCOPY(int n, const T* x, T* y) const;
-
-  template <typename T>
-  void VEXP(int n, const T* x, T* y) const;
-
-  template <typename T>
-  void VSQUARE(int n, const T* x, T* y) const;
-
-  template <typename T>
-  void VPOW(int n, const T* x, T alpha, T* y) const;
-
-  template <typename T>
-  void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
-            T* C) const;
-
-  template <typename T>
-  T DOT(int n, const T* x, const T* y) const;
-
-  template <typename T>
-  void SCAL(int n, const T a, T* x) const;
-
-  template <typename T>
-  T ASUM(int n, T* x, int inc) const;
-
-  template <typename T>
-  void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
-                   int K, T alpha, const T* A, const T* B, T beta, T* C,
-                   int batchCount, int64_t strideA, int64_t strideB) const;
-
-#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
-  template <typename T>
-  void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
-                           int W1, int H1, int W2, int H2, T alpha, const T* A,
-                           const T* B, T beta, T* C, int batchCount,
-                           int64_t strideA, int64_t strideB,
-                           int64_t head_number, bool split_b_vertical) const;
-#endif
-
-  template <typename T>
-  void MatMul(const framework::Tensor& mat_a, const MatDescriptor& dim_a,
-              const framework::Tensor& mat_b, const MatDescriptor& dim_b,
-              T alpha, framework::Tensor* mat_out, T beta) const;
-
-  template <typename T>
-  void VINV(int n, const T* a, T* y) const;
-
-  template <typename T>
-  void VMERF(int n, const T* a, T* y, int64_t mode) const;
-
- private:
-  const DeviceContext& context_;
-};
-
-template <typename DeviceContext, typename T>
-class BlasT : private Blas<DeviceContext> {
- public:
-  using Blas<DeviceContext>::Blas;
-
-  template <typename... ARGS>
-  void GEMM(ARGS... args) const {
-    Base()->template GEMM<T>(args...);
-  }
-
-#ifdef PADDLE_WITH_MKLML
-  template <typename... ARGS>
-  T* GEMM_ALLOC(ARGS... args) const {
-    return Base()->template GEMM_ALLOC<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void GEMM_PACK(ARGS... args) const {
-    Base()->template GEMM_PACK<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void GEMM_COMPUTE(ARGS... args) const {
-    Base()->template GEMM_COMPUTE<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void GEMM_FREE(ARGS... args) const {
-    Base()->template GEMM_FREE<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void CSRMM(ARGS... args) const {
-    Base()->template CSRMM<T>(args...);
-  }
-
-#if !defined(PADDLE_WITH_CUDA)
-  template <typename... ARGS>
-  void MatMulWithHead(ARGS... args) const {
-    Base()->template MatMulWithHead<T>(args...);
-  }
-#endif
-#endif
-
-  template <typename... ARGS>
-  void MatMul(ARGS... args) const {
-    Base()->template MatMul<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void AXPY(ARGS... args) const {
-    Base()->template AXPY<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void VADD(ARGS... args) const {
-    Base()->template VADD<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void VMUL(ARGS... args) const {
-    Base()->template VMUL<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void VCOPY(ARGS... args) const {
-    Base()->template VCOPY<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void VEXP(ARGS... args) const {
-    Base()->template VEXP<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void VSQUARE(ARGS... args) const {
-    Base()->template VSQUARE<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void VPOW(ARGS... args) const {
-    Base()->template VPOW<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void GEMV(ARGS... args) const {
-    Base()->template GEMV<T>(args...);
-  }
-
-  template <typename... ARGS>
-  T DOT(ARGS... args) const {
-    return Base()->template DOT<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void SCAL(ARGS... args) const {
-    Base()->template SCAL<T>(args...);
-  }
-
-  template <typename... ARGS>
-  T ASUM(ARGS... args) const {
-    return Base()->template ASUM<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void BatchedGEMM(ARGS... args) const {
-    Base()->template BatchedGEMM<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void VINV(ARGS... args) const {
-    Base()->template VINV<T>(args...);
-  }
-
-  template <typename... ARGS>
-  void VMERF(ARGS... args) const {
-    Base()->template VMERF<T>(args...);
-  }
-
- private:
-  const Blas<DeviceContext>* Base() const {
-    return static_cast<const Blas<DeviceContext>*>(this);
-  }
-};
-
-template <typename DeviceContext, typename T>
-inline BlasT<DeviceContext, T> GetBlas(
-    const framework::ExecutionContext& exe_ctx) {
-  return BlasT<DeviceContext, T>(
-      exe_ctx.template device_context<DeviceContext>());
-}
-
-template <typename DeviceContext, typename T>
-inline BlasT<DeviceContext, T> GetBlas(const DeviceContext& dev_ctx) {
-  return BlasT<DeviceContext, T>(dev_ctx);
-}
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
-
-#include "paddle/fluid/operators/math/blas_impl.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/operators/math/blas_impl.cu.h"
-#endif
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
deleted file mode 100644
index 4188e26fc9830e63381c040d17670931045b2630..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ /dev/null
@@ -1,382 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/dynload/cublas.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-DECLARE_bool(enable_cublas_tensor_op_math);
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-struct CUBlas;
-
-template <>
-struct CUBlas<float> {
-  template <typename... ARGS>
-  static void GEMM(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemm(args...));
-  }
-
-  template <typename... ARGS>
-  static void AXPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSaxpy(args...));
-  }
-
-  template <typename... ARGS>
-  static void GEMV(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemv(args...));
-  }
-
-  template <typename... ARGS>
-  static void GEMM_STRIDED_BATCH(ARGS... args) {
-#if CUDA_VERSION >= 8000
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasSgemmStridedBatched(args...));
-#else
-    PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5");
-#endif
-  }
-
-  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
-  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
-  template <typename... ARGS>
-  static void GEMM_EX(platform::CUDADeviceContext *dev_ctx,
-                      cublasOperation_t transa, cublasOperation_t transb, int m,
-                      int n, int k, const float *alpha, const void *A,
-                      cudaDataType_t Atype, int lda, const void *B,
-                      cudaDataType_t Btype, int ldb, const float *beta, void *C,
-                      cudaDataType_t Ctype, int ldc) {
-// Because the gcc 4.8 doesn't expand template parameter pack that
-// appears in a lambda-expression, I can not use template parameter pack
-// here.
-#if CUDA_VERSION >= 8000
-    VLOG(5) << "use_tensor_op_math: "
-            << (dev_ctx->tensor_core_available() ? "True" : "False");
-    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemmEx(
-          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
-          beta, C, Ctype, ldc));
-    });
-#else
-    PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0");
-#endif
-  }
-};
-
-template <>
-struct CUBlas<double> {
-  template <typename... ARGS>
-  static void GEMM(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemm(args...));
-  }
-
-  template <typename... ARGS>
-  static void AXPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDaxpy(args...));
-  }
-
-  template <typename... ARGS>
-  static void GEMV(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemv(args...));
-  }
-
-  template <typename... ARGS>
-  static void GEMM_STRIDED_BATCH(ARGS... args) {
-#if CUDA_VERSION >= 8000
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasDgemmStridedBatched(args...));
-#else
-    PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5");
-#endif
-  }
-
-  template <typename... ARGS>
-  static void GEMM_EX(ARGS... args) {
-    PADDLE_THROW("Currently there are not cublasDgemmEx.");
-  }
-};
-
-template <>
-struct CUBlas<platform::float16> {
-  using float16 = platform::float16;
-
-  static void GEMM(cublasHandle_t handle, cublasOperation_t transa,
-                   cublasOperation_t transb, int m, int n, int k,
-                   const float16 *alpha, const float16 *A, int lda,
-                   const float16 *B, int ldb, const float16 *beta, float16 *C,
-                   int ldc) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasHgemm(handle, transa, transb, m, n, k,
-                                       reinterpret_cast<const __half *>(alpha),
-                                       reinterpret_cast<const __half *>(A), lda,
-                                       reinterpret_cast<const __half *>(B), ldb,
-                                       reinterpret_cast<const __half *>(beta),
-                                       reinterpret_cast<__half *>(C), ldc));
-  }
-
-  static void GEMM_STRIDED_BATCH(cublasHandle_t handle,
-                                 cublasOperation_t transa,
-                                 cublasOperation_t transb, int m, int n, int k,
-                                 const float16 *alpha, const float16 *A,
-                                 int lda, long long int strideA,  // NOLINT
-                                 const float16 *B,                // NOLINT
-                                 int ldb, long long int strideB,  // NOLINT
-                                 const float16 *beta, float16 *C, int ldc,
-                                 long long int strideC,  // NOLINT
-                                 int batchCount) {
-#if CUDA_VERSION >= 8000
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasHgemmStridedBatched(
-        handle, transa, transb, m, n, k,
-        reinterpret_cast<const __half *>(alpha),
-        reinterpret_cast<const __half *>(A), lda, strideA,
-        reinterpret_cast<const __half *>(B), ldb, strideB,
-        reinterpret_cast<const __half *>(beta), reinterpret_cast<__half *>(C),
-        ldc, strideC, batchCount));
-#else
-    PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5");
-#endif
-  }
-
-  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
-  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
-  template <typename... ARGS>
-  static void GEMM_EX(platform::CUDADeviceContext *dev_ctx,
-                      cublasOperation_t transa, cublasOperation_t transb, int m,
-                      int n, int k, const void *alpha, const void *A,
-                      cudaDataType_t Atype, int lda, const void *B,
-                      cudaDataType_t Btype, int ldb, const void *beta, void *C,
-                      cudaDataType_t Ctype, int ldc,
-                      cudaDataType_t computeType) {
-#if CUDA_VERSION >= 8000
-    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-#if CUDA_VERSION >= 9000
-    bool use_tensor_op_math = dev_ctx->tensor_core_available();
-    if (use_tensor_op_math) {
-      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-    }
-    VLOG(5) << "use_tensor_op_math: "
-            << (use_tensor_op_math ? "True" : "False");
-#endif  // CUDA_VERSION >= 9000
-
-    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx(
-          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
-          beta, C, Ctype, ldc, computeType, algo));
-    });
-#else
-    PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0");
-#endif
-  }
-};
-
-template <>
-template <typename T>
-void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                             CBLAS_TRANSPOSE transB, int M,
-                                             int N, int K, T alpha, const T *A,
-                                             const T *B, T beta, T *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-#if CUDA_VERSION >= 8000
-  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
-    auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
-    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
-                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
-                       CUDA_R_32F, N);
-  } else {
-#endif  // CUDA_VERSION >= 8000
-    context_.CublasCall([&](cublasHandle_t handle) {
-      CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
-                      lda, &beta, C, N);
-    });
-
-#if CUDA_VERSION >= 8000
-  }
-#endif  // CUDA_VERSION >= 8000
-}
-
-template <>
-template <>
-inline void Blas<platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    platform::float16 alpha, const platform::float16 *A,
-    const platform::float16 *B, platform::float16 beta,
-    platform::float16 *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
-                    "cublas fp16 gemm requires GPU compute capability >= 53");
-
-  float h_alpha = static_cast<float>(alpha);
-  float h_beta = static_cast<float>(beta);
-
-#if CUDA_VERSION >= 8000
-  // cublasHgemm does true FP16 computation which is slow for non-Volta
-  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
-  // input/output in fp16, computation in fp32, which can also be accelerated
-  // using tensor cores in volta GPUs.
-  auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
-  CUBlas<platform::float16>::GEMM_EX(
-      &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A,
-      CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F);
-#else
-  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
-
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
-                                    &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C,
-                                    N);
-  });
-#endif  // CUDA_VERSION >= 8000
-}
-
-template <>
-template <typename T>
-void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
-                                             int N, int K, T alpha, const T *A,
-                                             int lda, const T *B, int ldb,
-                                             T beta, T *C, int ldc) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-#if CUDA_VERSION >= 8000
-  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
-    auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
-    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
-                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
-                       CUDA_R_32F, ldc);
-  } else {
-#endif  // CUDA_VERSION >= 8000
-
-    context_.CublasCall([&](cublasHandle_t handle) {
-      CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
-                      lda, &beta, C, ldc);
-    });
-
-#if CUDA_VERSION >= 8000
-  }
-#endif  // CUDA_VERSION >= 8000
-}
-
-template <>
-template <>
-inline void Blas<platform::CUDADeviceContext>::GEMM(
-    bool transA, bool transB, int M, int N, int K, platform::float16 alpha,
-    const platform::float16 *A, int lda, const platform::float16 *B, int ldb,
-    platform::float16 beta, platform::float16 *C, int ldc) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha,
-                                    B, ldb, A, lda, &beta, C, ldc);
-  });
-}
-
-template <>
-template <typename T>
-void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
-                                             T *y) const {
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
-  });
-}
-
-template <>
-template <typename T>
-void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
-                                             T alpha, const T *A, const T *B,
-                                             T beta, T *C) const {
-  cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
-  });
-}
-
-template <>
-template <typename T>
-void Blas<platform::CUDADeviceContext>::BatchedGEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
-    int64_t strideA, int64_t strideB) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  const int64_t strideC = M * N;
-
-#if CUDA_VERSION >= 9010
-  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
-    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-    bool use_tensor_op_math = context_.tensor_core_available();
-    if (use_tensor_op_math) {
-      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-    }
-    VLOG(5) << "use_tensor_op_math: "
-            << (use_tensor_op_math ? "True" : "False");
-
-    context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx(
-          handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb,
-          strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc,
-          strideC, batchCount, CUDA_R_32F, algo));
-    });
-  } else {
-#endif  // CUDA_VERSION >= 9010
-
-    context_.CublasCall([&](cublasHandle_t handle) {
-      CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
-                                    B, ldb, strideB, A, lda, strideA, &beta, C,
-                                    ldc, strideC, batchCount);
-    });
-
-#if CUDA_VERSION >= 9010
-  }
-#endif  // CUDA_VERSION >= 9010
-}
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
deleted file mode 100644
index e2620bcfd9298f38f887f8a5b35aa8efba6b7053..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/blas_impl.h
+++ /dev/null
@@ -1,848 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <cmath>
-#include <limits>
-#include <vector>
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-struct CBlas;
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-struct CBlas<float> {
-  template <typename... ARGS>
-  static void GEMM(ARGS... args) {
-    platform::dynload::cblas_sgemm(args...);
-  }
-
-  template <typename... ARGS>
-  static float *GEMM_ALLOC(ARGS... args) {
-    return platform::dynload::cblas_sgemm_alloc(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMM_PACK(ARGS... args) {
-    platform::dynload::cblas_sgemm_pack(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMM_COMPUTE(ARGS... args) {
-    platform::dynload::cblas_sgemm_compute(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMM_FREE(ARGS... args) {
-    platform::dynload::cblas_sgemm_free(args...);
-  }
-
-#ifdef PADDLE_WITH_LIBXSMM
-  template <typename... ARGS>
-  static void SMM_GEMM(ARGS... args) {
-    libxsmm_sgemm(args...);
-  }
-#endif
-
-  template <typename... ARGS>
-  static void AXPY(ARGS... args) {
-    platform::dynload::cblas_saxpy(args...);
-  }
-
-  template <typename... ARGS>
-  static void VCOPY(ARGS... args) {
-    platform::dynload::cblas_scopy(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMV(ARGS... args) {
-    platform::dynload::cblas_sgemv(args...);
-  }
-
-  template <typename... ARGS>
-  static float DOT(ARGS... args) {
-    return platform::dynload::cblas_sdot(args...);
-  }
-
-  template <typename... ARGS>
-  static void SCAL(ARGS... args) {
-    platform::dynload::cblas_sscal(args...);
-  }
-
-  template <typename... ARGS>
-  static float ASUM(ARGS... args) {
-    return platform::dynload::cblas_sasum(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMM_BATCH(ARGS... args) {
-    platform::dynload::cblas_sgemm_batch(args...);
-  }
-
-  template <typename... ARGS>
-  static void VADD(ARGS... args) {
-    platform::dynload::vsAdd(args...);
-  }
-
-  template <typename... ARGS>
-  static void VMUL(ARGS... args) {
-    platform::dynload::vsMul(args...);
-  }
-
-  template <typename... ARGS>
-  static void VEXP(ARGS... args) {
-    platform::dynload::vsExp(args...);
-  }
-
-  template <typename... ARGS>
-  static void VSQUARE(ARGS... args) {
-    platform::dynload::vsSqr(args...);
-  }
-
-  template <typename... ARGS>
-  static void VPOW(ARGS... args) {
-    platform::dynload::vsPowx(args...);
-  }
-
-  template <typename... ARGS>
-  static void VINV(ARGS... args) {
-    platform::dynload::vsInv(args...);
-  }
-
-  template <typename... ARGS>
-  static void VMERF(ARGS... args) {
-    platform::dynload::vmsErf(args...);
-  }
-#if !defined(_WIN32)
-  template <typename... ARGS>
-  static void CSRMM(ARGS... args) {
-    platform::dynload::mkl_scsrmm(args...);
-  }
-#endif
-};
-
-template <>
-struct CBlas<double> {
-  template <typename... ARGS>
-  static void GEMM(ARGS... args) {
-    platform::dynload::cblas_dgemm(args...);
-  }
-
-  template <typename... ARGS>
-  static double *GEMM_ALLOC(ARGS... args) {
-    return platform::dynload::cblas_dgemm_alloc(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMM_PACK(ARGS... args) {
-    platform::dynload::cblas_dgemm_pack(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMM_COMPUTE(ARGS... args) {
-    platform::dynload::cblas_dgemm_compute(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMM_FREE(ARGS... args) {
-    platform::dynload::cblas_dgemm_free(args...);
-  }
-
-#ifdef PADDLE_WITH_LIBXSMM
-  template <typename... ARGS>
-  static void SMM_GEMM(ARGS... args) {
-    libxsmm_dgemm(args...);
-  }
-#endif
-
-  template <typename... ARGS>
-  static void AXPY(ARGS... args) {
-    platform::dynload::cblas_daxpy(args...);
-  }
-
-  template <typename... ARGS>
-  static void VCOPY(ARGS... args) {
-    platform::dynload::cblas_dcopy(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMV(ARGS... args) {
-    platform::dynload::cblas_dgemv(args...);
-  }
-
-  template <typename... ARGS>
-  static double DOT(ARGS... args) {
-    return platform::dynload::cblas_ddot(args...);
-  }
-
-  template <typename... ARGS>
-  static void SCAL(ARGS... args) {
-    platform::dynload::cblas_dscal(args...);
-  }
-
-  template <typename... ARGS>
-  static double ASUM(ARGS... args) {
-    return platform::dynload::cblas_dasum(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMM_BATCH(ARGS... args) {
-    platform::dynload::cblas_dgemm_batch(args...);
-  }
-
-  template <typename... ARGS>
-  static void VADD(ARGS... args) {
-    platform::dynload::vdAdd(args...);
-  }
-
-  template <typename... ARGS>
-  static void VMUL(ARGS... args) {
-    platform::dynload::vdMul(args...);
-  }
-
-  template <typename... ARGS>
-  static void VEXP(ARGS... args) {
-    platform::dynload::vdExp(args...);
-  }
-
-  template <typename... ARGS>
-  static void VSQUARE(ARGS... args) {
-    platform::dynload::vdSqr(args...);
-  }
-
-  template <typename... ARGS>
-  static void VPOW(ARGS... args) {
-    platform::dynload::vdPowx(args...);
-  }
-
-  template <typename... ARGS>
-  static void VINV(ARGS... args) {
-    platform::dynload::vdInv(args...);
-  }
-
-  template <typename... ARGS>
-  static void VMERF(ARGS... args) {
-    platform::dynload::vmdErf(args...);
-  }
-#if !defined(_WIN32)
-  template <typename... ARGS>
-  static void CSRMM(ARGS... args) {
-    platform::dynload::mkl_dcsrmm(args...);
-  }
-#endif
-};
-
-#else
-
-template <>
-struct CBlas<float> {
-  template <typename... ARGS>
-  static void GEMM(ARGS... args) {
-    cblas_sgemm(args...);
-  }
-
-  template <typename... ARGS>
-  static void AXPY(ARGS... args) {
-    cblas_saxpy(args...);
-  }
-
-  template <typename... ARGS>
-  static void VCOPY(ARGS... args) {
-    cblas_scopy(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMV(ARGS... args) {
-    cblas_sgemv(args...);
-  }
-};
-
-template <>
-struct CBlas<double> {
-  template <typename... ARGS>
-  static void GEMM(ARGS... args) {
-    cblas_dgemm(args...);
-  }
-
-  template <typename... ARGS>
-  static void AXPY(ARGS... args) {
-    cblas_daxpy(args...);
-  }
-
-  template <typename... ARGS>
-  static void VCOPY(ARGS... args) {
-    cblas_dcopy(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMV(ARGS... args) {
-    cblas_dgemv(args...);
-  }
-};
-#endif
-
-template <>
-struct CBlas<platform::float16> {
-  static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
-  static void SMM_GEMM(...) {
-    PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
-  }
-  static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
-  static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
-  static void VSQUARE(...) {
-    PADDLE_THROW("float16 VSQUARE not supported on CPU");
-  }
-  static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
-  static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
-  static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
-  static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); };
-#ifdef PADDLE_WITH_MKLML
-  static void GEMM_BATCH(...) {
-    PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
-  }
-#endif
-};
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-template <typename T>
-T *Blas<platform::CPUDeviceContext>::GEMM_ALLOC(const CBLAS_IDENTIFIER id,
-                                                const int M, const int N,
-                                                const int K) const {
-  return CBlas<T>::GEMM_ALLOC(id, M, N, K);
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMM_PACK(const CBLAS_IDENTIFIER id,
-                                                 const CBLAS_TRANSPOSE trans,
-                                                 int M, int N, int K,
-                                                 const T alpha, const T *src,
-                                                 const int ld, T *dst) const {
-  CBlas<T>::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst);
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMM_COMPUTE(
-    int transA, int transB, int M, int N, int K, const T *A, const int lda,
-    const T *B, const int ldb, T beta, T *C, const int ldc) const {
-  CBlas<T>::GEMM_COMPUTE(CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb,
-                         beta, C, ldc);
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMM_FREE(T *data) const {
-  CBlas<T>::GEMM_FREE(data);
-}
-#endif
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                            CBLAS_TRANSPOSE transB, int M,
-                                            int N, int K, T alpha, const T *A,
-                                            const T *B, T beta, T *C) const {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-                 beta, C, ldc);
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
-                                            int N, int K, T alpha, const T *A,
-                                            int lda, const T *B, int ldb,
-                                            T beta, T *C, int ldc) const {
-  CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-                 transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-                 lda, B, ldb, beta, C, ldc);
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                            CBLAS_TRANSPOSE transB, int M,
-                                            int N, int K, T alpha, const T *A,
-                                            int lda, const T *B, int ldb,
-                                            T beta, T *C, int ldc) const {
-  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-                 beta, C, ldc);
-}
-
-template <typename DeviceContext>
-template <typename T>
-void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
-                                 const framework::Tensor &mat_b, bool trans_b,
-                                 T alpha, framework::Tensor *mat_out,
-                                 T beta) const {
-  auto dim_a = mat_a.dims();
-  auto dim_b = mat_b.dims();
-  auto dim_out = mat_out->dims();
-  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-                 "The input and output of matmul be matrix");
-  PADDLE_ENFORCE(
-      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(),
-      "The places of matrices must be same");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = !trans_a ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = !trans_b ? CblasNoTrans : CblasTrans;
-
-  this->GEMM(transA, transB, M, N, K, alpha, mat_a.data<T>(), mat_b.data<T>(),
-             beta, mat_out->data<T>());
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::AXPY(int n, T alpha, const T *x,
-                                            T *y) const {
-  CBlas<T>::AXPY(n, alpha, x, 1, y, 1);
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::VCOPY(int n, const T *x, T *y) const {
-  CBlas<T>::VCOPY(n, x, 1, y, 1);
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
-                                            T *z) const {
-#ifdef PADDLE_WITH_MKLML
-  CBlas<T>::VADD(n, x, y, z);
-#else
-  if (x == z) {
-    this->template AXPY<T>(n, 1., y, z);
-  } else {
-    this->template VCOPY<T>(n, y, z);
-    this->template AXPY<T>(n, 1., x, z);
-  }
-#endif
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
-                                            T *z) const {
-#ifdef PADDLE_WITH_MKLML
-  CBlas<T>::VMUL(n, x, y, z);
-#else
-  // try to find if openblas support vmul
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] * y[i];
-  }
-#endif
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
-#ifdef PADDLE_WITH_MKLML
-  CBlas<T>::VEXP(n, x, y);
-#else
-  // try to find if openblas support vexp
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::exp(x[i]);
-  }
-#endif
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::VSQUARE(int n, const T *x, T *y) const {
-#ifdef PADDLE_WITH_MKLML
-  CBlas<T>::VSQUARE(n, x, y);
-#else
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] * x[i];
-  }
-#endif
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::VPOW(int n, const T *x, T a,
-                                            T *y) const {
-#ifdef PADDLE_WITH_MKLML
-  CBlas<T>::VPOW(n, x, a, y);
-#else
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::pow(x[i], a);
-  }
-#endif
-}
-
-template <>
-template <typename T>
-T Blas<platform::CPUDeviceContext>::DOT(int n, const T *x, const T *y) const {
-#ifdef PADDLE_WITH_MKLML
-  return CBlas<T>::DOT(n, x, 1, y, 1);
-#else
-  // try to find if openblas support cblas_dot
-  T sum = 0;
-  for (int i = 0; i < n; ++i) {
-    sum += x[i] * y[i];
-  }
-  return sum;
-#endif
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::SCAL(int n, const T a, T *x) const {
-#ifdef PADDLE_WITH_MKLML
-  CBlas<T>::SCAL(n, a, x, 1);
-#else
-  // try to find if openblas support cblas_scal
-  for (int i = 0; i < n; ++i) {
-    x[i] = a * x[i];
-  }
-#endif
-}
-
-template <>
-template <typename T>
-T Blas<platform::CPUDeviceContext>::ASUM(int n, T *x, int inc) const {
-  auto sum = static_cast<T>(0.0);
-#ifdef PADDLE_WITH_MKLML
-  sum = CBlas<T>::ASUM(n, x, inc);
-#else
-  // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum
-  for (int c = 0; c < n; ++c) {
-    sum += x[c];
-  }
-#endif
-  return sum;
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
-                                            const T *A, const T *B, T beta,
-                                            T *C) const {
-  CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
-  CBlas<T>::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::BatchedGEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
-    int64_t strideA, int64_t strideB) const {
-#ifdef PADDLE_WITH_MKLML
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  auto a_array = std::vector<const T *>(batchCount);
-  auto b_array = std::vector<const T *>(batchCount);
-  auto c_array = std::vector<T *>(batchCount);
-  for (int k = 0; k < batchCount; ++k) {
-    a_array[k] = &A[k * strideA];
-    b_array[k] = &B[k * strideB];
-    c_array[k] = &C[k * M * N];
-  }
-
-  CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
-                       a_array.data(), &lda, b_array.data(), &ldb, &beta,
-                       c_array.data(), &ldc, 1 /* group_count */, &batchCount);
-#else
-  for (int k = 0; k < batchCount; ++k) {
-    auto *Ak = &A[k * strideA];
-    auto *Bk = &B[k * strideB];
-    auto *Ck = &C[k * M * N];
-    this->template GEMM<T>(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck);
-  }
-#endif
-}
-
-#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::BatchedGEMMWithHead(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int W1, int H1, int W2,
-    int H2, T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
-    int64_t strideA, int64_t strideB, int64_t head_number,
-    bool split_b_vertical) const {
-  int lda = (transA == CblasNoTrans) ? W1 : H1;
-  int ldb = (transB == CblasNoTrans) ? W2 : H2;
-  auto a_array = std::vector<const T *>(batchCount);
-  auto b_array = std::vector<const T *>(batchCount);
-  auto c_array = std::vector<T *>(batchCount);
-
-  if (split_b_vertical) {
-    int ldc = W2;
-    int sub_width = W2 / head_number;
-
-    for (int i = 0; i < head_number; i++) {
-      int sub_matA_offset = (transA == CblasNoTrans)
-                                ? i * (W1 / head_number)
-                                : i * (W1 / head_number) * H1;
-      int sub_matB_offset = (transB == CblasNoTrans)
-                                ? i * (W2 / head_number)
-                                : i * (W2 / head_number) * H2;
-      int sub_matC_offset = i * W2 / head_number;
-      for (int k = 0; k < batchCount; ++k) {
-        a_array[k] = &A[k * strideA] + sub_matA_offset;
-        b_array[k] = &B[k * strideB] + sub_matB_offset;
-        c_array[k] = &C[k * H1 * W2] + sub_matC_offset;
-      }
-
-      CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &H1, &sub_width,
-                           &H2, &alpha, a_array.data(), &lda, b_array.data(),
-                           &ldb, &beta, c_array.data(), &ldc,
-                           1 /* group_count */, &batchCount);
-    }
-
-  } else {
-    PADDLE_ENFORCE_EQ(W1, H2);
-    int ldc = W2 * head_number;
-    int sub_width = W1 / head_number;
-
-    for (int i = 0; i < head_number; i++) {
-      int sub_matA_offset = (transA == CblasNoTrans)
-                                ? i * (W1 / head_number)
-                                : i * (W1 / head_number) * H1;
-      int sub_matB_offset = (transB == CblasNoTrans)
-                                ? i * (W1 / head_number) * W2
-                                : i * (W1 / head_number);
-      int sub_matC_offset = i * W2;
-      for (int k = 0; k < batchCount; ++k) {
-        a_array[k] = &A[k * strideA] + sub_matA_offset;
-        b_array[k] = &B[k * strideB] + sub_matB_offset;
-        c_array[k] = &C[k * H1 * head_number * W2] + sub_matC_offset;
-      }
-
-      CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &H1, &W2,
-                           &sub_width, &alpha, a_array.data(), &lda,
-                           b_array.data(), &ldb, &beta, c_array.data(), &ldc,
-                           1 /* group_count */, &batchCount);
-    }
-  }
-}
-#endif
-
-template <typename DeviceContext>
-template <typename T>
-void Blas<DeviceContext>::MatMul(const int M, const int N, const int K,
-                                 const T *A, const T *B, T *C) const {
-  this->template GEMM<T>(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
-                         static_cast<T>(1), A, K, B, N, static_cast<T>(0), C,
-                         N);
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::MatMul(const int M, const int N,
-                                              const int K, const T *A,
-                                              const T *B, T *C) const {
-#ifdef PADDLE_WITH_LIBXSMM
-  // Refer to https://github.com/hfp/libxsmm/blob/master/README.md
-  // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
-
-  // Since the matrix is very small,
-  // so the unit of calculation is already very fast,
-  // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead,
-  // use xsmm directly.
-  // Note: SMM use ColMajor
-  const char transa = 'N';
-  const char transb = 'N';
-  const T alpha = static_cast<T>(1);
-  const T beta = static_cast<T>(0);
-  CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta,
-                     C, &N);
-  return;
-#endif
-
-  CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
-                 static_cast<T>(1), A, K, B, N, static_cast<T>(0), C, N);
-}
-
-template <typename DeviceContext>
-template <typename T>
-void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
-                                 const MatDescriptor &dim_a,
-                                 const framework::Tensor &mat_b,
-                                 const MatDescriptor &dim_b, T alpha,
-                                 framework::Tensor *mat_out, T beta) const {
-  PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
-  CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
-  if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
-    this->template GEMM<T>(transA, transB, dim_a.height_, dim_b.width_,
-                           dim_a.width_, alpha, mat_a.data<T>(),
-                           mat_b.data<T>(), beta, mat_out->data<T>());
-  } else {
-    PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
-                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0,
-                   "dim_a.batch_size should be equal to dim_b.batch_size, or "
-                   "one of dim_a.batch_size and dim_b.batch_size should be 0. "
-                   "But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
-                   dim_a.batch_size_, dim_b.batch_size_);
-    this->template BatchedGEMM<T>(
-        transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
-        mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
-        dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
-        dim_a.stride_, dim_b.stride_);
-  }
-}
-
-#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
-/*
- * Multiple two matrixes with multiple heads
- *
- * A new parameter, i.e head_number is added compared to normal MatMul.
- * The head_number describes the number of heads a matrix is vertically
- * split.
- *
- * When user calls this API, the multiplication of two big matrixes is split
- * into multiplication of several (head_number_) small matrixes. e.g. if Mat A
- * is [3, 24] and Mat B is [24, 4], when multiple A and B with head_number as
- * 4, Mat A will be splitted as 4 matrix of [3, 6] and Mat B will be
- * (horizontally) splitted as 4 matrix of [6, 4]. The result of final matrix
- * will be 4 matrix of [3, 4], i.e. [3, 16].
- * Another example is A is [3, 8], B is [2, 16], head_number is 4. In this
- * case, A will be splitted as [3, 2], B will be (vertically) splitted as
- * [2, 4]. The final result will be 4 matrix of 4 matrix of [3,4], i.e. [3, 16]
- */
-template <typename DeviceContext>
-template <typename T>
-void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
-                                         const MatDescriptor &dim_a,
-                                         const framework::Tensor &mat_b,
-                                         const MatDescriptor &dim_b, T alpha,
-                                         int head_number,
-                                         framework::Tensor *mat_out, T beta,
-                                         bool mat_b_split_vertical) const {
-  PADDLE_ENFORCE_EQ(dim_a.width_ % head_number, 0);
-  PADDLE_ENFORCE_GE(head_number, 1);
-  PADDLE_ENFORCE_LE(head_number, dim_a.width_);
-  CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
-
-  if (mat_b_split_vertical) {
-    PADDLE_ENFORCE_EQ(dim_b.height_, dim_a.width_ / head_number);
-    PADDLE_ENFORCE_EQ(dim_b.width_ % head_number, 0);
-  }
-
-  if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
-    int lda = !dim_a.trans_ ? dim_a.width_ : dim_a.height_;
-    int ldb = !dim_b.trans_ ? dim_b.width_ : dim_b.height_;
-    int sub_matA_offset;
-    int sub_matB_offset;
-    int sub_matC_offset;
-    int sub_mat_M = dim_a.height_;
-    int sub_mat_N;
-    int sub_mat_K;
-    int ldc;
-
-    for (int i = 0; i < head_number; i++) {
-      sub_matA_offset = dim_a.trans_
-                            ? i * (dim_a.width_ / head_number) * dim_a.height_
-                            : i * (dim_a.width_ / head_number);
-      if (mat_b_split_vertical) {
-        sub_matB_offset = dim_b.trans_
-                              ? i * (dim_b.width_ / head_number) * dim_b.height_
-                              : i * (dim_b.width_ / head_number);
-        sub_matC_offset = i * dim_b.width_ / head_number;
-
-        sub_mat_N = dim_b.width_ / head_number;
-        sub_mat_K = dim_b.height_;
-
-        ldc = dim_b.width_;
-      } else {
-        sub_matB_offset =
-            dim_b.trans_ ? i * (dim_b.height_ / head_number)
-                         : i * (dim_b.height_ / head_number) * dim_b.width_;
-        sub_matC_offset = i * dim_b.width_;
-
-        sub_mat_N = dim_b.width_;
-        sub_mat_K = dim_a.width_ / head_number;
-
-        ldc = head_number * dim_b.width_;
-      }
-
-      this->template GEMM<T>(transA, transB, sub_mat_M, sub_mat_N, sub_mat_K,
-                             alpha, mat_a.data<T>() + sub_matA_offset, lda,
-                             mat_b.data<T>() + sub_matB_offset, ldb, beta,
-                             mat_out->data<T>() + sub_matC_offset, ldc);
-    }
-  } else {
-    PADDLE_ENFORCE_EQ((dim_a.batch_size_ == dim_b.batch_size_ ||
-                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0),
-                      true);
-
-    this->template BatchedGEMMWithHead<T>(
-        transA, transB, dim_a.width_, dim_a.height_, dim_b.width_,
-        dim_b.height_, alpha, mat_a.data<T>(), mat_b.data<T>(), beta,
-        mat_out->data<T>(),
-        dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
-        dim_a.stride_, dim_b.stride_, head_number, mat_b_split_vertical);
-  }
-}
-#endif
-
-template <typename DeviceContext>
-template <typename T>
-void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
-#ifdef PADDLE_WITH_MKLML
-  CBlas<T>::VINV(n, a, y);
-#else
-  for (int i = 0; i < n; ++i) {
-    y[i] = 1.0 / a[i];
-  }
-#endif
-}
-
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::VMERF(int n, const T *a, T *y,
-                                             int64_t mode) const {
-#ifdef PADDLE_WITH_MKLML
-  CBlas<T>::VMERF(n, a, y, mode);
-#else
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::erf(a[i]);
-  }
-#endif
-}
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::CSRMM(
-    const char *transa, const int *m, const int *n, const int *k,
-    const T *alpha, const char *matdescra, const T *val, const int *indx,
-    const int *pntrb, const int *pntre, const T *b, const int *ldb,
-    const T *beta, T *c, const int *ldc) const {
-  CBlas<T>::CSRMM(transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntre, b,
-                  ldb, beta, c, ldc);
-}
-#endif
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/compound_functors.h b/paddle/fluid/operators/math/compound_functors.h
deleted file mode 100644
index 6a43215bf52a9b231a47241d1bb27695da031957..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/compound_functors.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-// Z = BinaryFunctor(X, UnaryFunctor(Y))
-template <typename T, typename BinaryFunctor, typename UnaryFunctor>
-struct BinaryCompoundFunctor {
-  BinaryCompoundFunctor(const BinaryFunctor func1, const UnaryFunctor func2)
-      : func1_(func1), func2_(func2) {}
-
-  inline HOSTDEVICE T GetOut(T x, T y) { return func1_(x, func2_(y)); }
-
-  inline HOSTDEVICE T GetOutUseIntermediateOut(T x, T intermediat_out) {
-    return func1_(x, intermediat_out);
-  }
-
-  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return func2_(y); }
-
-  BinaryFunctor func1_;
-  UnaryFunctor func2_;
-};
-
-// Z = UnaryFunctor(BinaryFunctor(X, Y))
-template <typename T, typename UnaryFunctor, typename BinaryFunctor>
-struct UnaryCompoundFunctor {
-  UnaryCompoundFunctor(const UnaryFunctor func1, const BinaryFunctor func2)
-      : func1_(func1), func2_(func2) {}
-
-  inline HOSTDEVICE T GetOut(T x, T y) { return func1_(func2_(x, y)); }
-
-  inline HOSTDEVICE T GetOutUseIntermediateOut(T x, T intermediat_out) {
-    return func1_(intermediat_out);
-  }
-
-  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return func2_(x, y); }
-
-  UnaryFunctor func1_;
-  BinaryFunctor func2_;
-};
-
-// Z = BinaryFunctor(X, UnaryFunctor(Y))
-template <typename T, typename DBinaryFun, typename UnaryFun>
-struct BinaryCompoundGradDxFunctor {
-  BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun,
-                              const UnaryFun &unary_fun)
-      : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
-
-  inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
-    return dout * d_binary_fun_.Dx(x, unary_fun_(y));
-  }
-
-  inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out,
-                                         T dout) {
-    return dout * d_binary_fun_.Dx(x, intermediate_out);
-  }
-
-  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return unary_fun_(y); }
-
- private:
-  DBinaryFun d_binary_fun_;
-  UnaryFun unary_fun_;
-};
-
-// Z = BinaryFunctor(X, UnaryFunctor(Y))
-template <typename T, typename DBinaryFun, typename UnaryFun,
-          typename DUnaryFun, bool InPlace>
-struct BinaryCompoundGradDyFunctor {
-  BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun,
-                              const UnaryFun &unary_fun,
-                              const DUnaryFun &d_unary_fun)
-      : d_binary_fun_(d_binary_fun),
-        unary_fun_(unary_fun),
-        d_unary_fun_(d_unary_fun) {}
-
-  inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
-    return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_.UseX(y);
-  }
-
-  inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out,
-                                         T dout) {
-    if (InPlace) {
-      return dout * d_binary_fun_.Dy(x, intermediate_out) *
-             d_unary_fun_.UseOut(intermediate_out);
-    } else {
-      return dout * d_binary_fun_.Dy(x, intermediate_out) *
-             d_unary_fun_.UseXAndOut(y, intermediate_out);
-    }
-  }
-
-  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return unary_fun_(y); }
-
- private:
-  DBinaryFun d_binary_fun_;
-  UnaryFun unary_fun_;
-  DUnaryFun d_unary_fun_;
-};
-
-// Z = UnaryFunctor(BinaryFunctor(X, Y))
-template <typename T, typename DUnaryFun, typename BinaryFun,
-          typename DBinaryFun, bool InPlace>
-struct UnaryCompoundGradDxFunctor {
-  UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun,
-                             const BinaryFun &binary_fun,
-                             const DBinaryFun &d_binary_fun)
-      : d_unary_fun_(d_unary_fun),
-        binary_fun_(binary_fun),
-        d_binary_fun_(d_binary_fun) {}
-
-  inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
-    T base;
-    if (InPlace) {
-      base = dout * d_unary_fun_.UseOut(out);
-    } else {
-      base = dout * d_unary_fun_.UseXAndOut(binary_fun_(x, y), out);
-    }
-    return base * d_binary_fun_.Dx(x, y);
-  }
-
-  inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out,
-                                         T dout) {
-    T base;
-    if (InPlace) {
-      base = dout * d_unary_fun_.UseOut(out);
-    } else {
-      base = dout * d_unary_fun_.UseXAndOut(intermediate_out, out);
-    }
-    return base * d_binary_fun_.Dx(x, y);
-  }
-
-  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return binary_fun_(x, y); }
-
- private:
-  DUnaryFun d_unary_fun_;
-  BinaryFun binary_fun_;
-  DBinaryFun d_binary_fun_;
-};
-
-// Z = UnaryFunctor(BinaryFunctor(X, Y))
-template <typename T, typename DUnaryFun, typename BinaryFun,
-          typename DBinaryFun, bool InPlace>
-struct UnaryCompoundGradDyFunctor {
-  UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun,
-                             const BinaryFun &binary_fun,
-                             const DBinaryFun &d_binary_fun)
-      : d_unary_fun_(d_unary_fun),
-        binary_fun_(binary_fun),
-        d_binary_fun_(d_binary_fun) {}
-
-  inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
-    T base;
-    if (InPlace) {
-      base = dout * d_unary_fun_.UseOut(out);
-    } else {
-      base = dout * d_unary_fun_.UseXAndOut(binary_fun_(x, y), out);
-    }
-    return base * d_binary_fun_.Dy(x, y);
-  }
-
-  inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out,
-                                         T dout) {
-    T base;
-    if (InPlace) {
-      base = dout * d_unary_fun_.UseOut(out);
-    } else {
-      base = dout * d_unary_fun_.UseXAndOut(intermediate_out, out);
-    }
-    return base * d_binary_fun_.Dy(x, y);
-  }
-
-  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return binary_fun_(x, y); }
-
- private:
-  DUnaryFun d_unary_fun_;
-  BinaryFun binary_fun_;
-  DBinaryFun d_binary_fun_;
-};
-
-// Z = BinaryFunctor(X, UnaryFunctor(Y))
-template <typename T, typename DBinaryFun, typename UnaryFun>
-struct BinaryCompoundGradDIntermedaiteOutFunctor {
-  BinaryCompoundGradDIntermedaiteOutFunctor(const DBinaryFun &d_binary_fun,
-                                            const UnaryFun &unary_fun)
-      : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
-
-  inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
-    return dout * d_binary_fun_.Dy(x, unary_fun_(y));
-  }
-
-  inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out,
-                                         T dout) {
-    return dout * d_binary_fun_.Dy(x, intermediate_out);
-  }
-
-  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return unary_fun_(y); }
-
- private:
-  DBinaryFun d_binary_fun_;
-  UnaryFun unary_fun_;
-};
-
-// Z = UnaryFunctor(BinaryFunctor(X, Y))
-template <typename T, typename DUnaryFun, typename BinaryFun, bool InPlace>
-struct UnaryCompoundGradDIntermediateFunctor {
-  UnaryCompoundGradDIntermediateFunctor(const DUnaryFun &d_unary_fun,
-                                        const BinaryFun &binary_fun)
-      : d_unary_fun_(d_unary_fun), binary_fun_(binary_fun) {}
-
-  inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
-    if (InPlace) {
-      return dout * d_unary_fun_.UseOut(out);
-    } else {
-      return dout * d_unary_fun_.UseXAndOut(binary_fun_(x, y), out);
-    }
-  }
-
-  inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out,
-                                         T dout) {
-    if (InPlace) {
-      return dout * d_unary_fun_.UseOut(out);
-    } else {
-      return dout * d_unary_fun_.UseXAndOut(intermediate_out, out);
-    }
-  }
-
-  inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return binary_fun_(x, y); }
-
- private:
-  DUnaryFun d_unary_fun_;
-  BinaryFun binary_fun_;
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/concat.hip.cu b/paddle/fluid/operators/math/concat.hip.cu
deleted file mode 100644
index eacef0438883891671fec6e4001f862f619723cb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/concat.hip.cu
+++ /dev/null
@@ -1,15 +0,0 @@
-/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <hip/hip_runtime.h>
diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc
deleted file mode 100644
index c6e17fd042f19bbeee3507e4cd64f49cff369682..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/concat_and_split.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * All tensors' dimension should be the same and the values of
- * each dimension must be the same, except the axis dimension.
- */
-template <typename T>
-class ConcatFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const std::vector<framework::Tensor>& input, int axis,
-                  framework::Tensor* output) {
-    // TODO(zcd): Add input data validity checking
-    int num = input.size();
-
-    int rows = 1;
-    auto dim_0 = input[0].dims();
-    for (int i = 0; i < axis; ++i) {
-      rows *= dim_0[i];
-    }
-    int out_rows = rows, out_cols = 0;
-
-    std::vector<int64_t> input_cols(input.size());
-    for (int i = 0; i < num; ++i) {
-      int t_cols = input[i].numel() / rows;
-      out_cols += t_cols;
-      input_cols[i] = t_cols;
-    }
-    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
-
-    // computation
-    auto output_data = output->data<T>();
-    int col_idx = 0;
-    for (int j = 0; j < num; ++j) {
-      int col_len = input_cols[j];
-      auto input_data = input[j].data<T>();
-      for (int k = 0; k < out_rows; ++k) {
-        memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place,
-                     input_data + k * col_len, sizeof(T) * col_len);
-      }
-      col_idx += col_len;
-    }
-  }
-};
-
-/*
- * All tensors' dimension should be the same and the values of
- * each dimension must be the same, except the axis dimension.
- */
-template <typename T>
-class SplitFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const std::vector<const framework::Tensor*>& ref_inputs,
-                  const int axis, std::vector<framework::Tensor*>* outputs) {
-    // TODO(zcd): Add input data validity checking
-    size_t num = outputs->size();
-
-    int input_rows = 1;
-    auto dim_0 = ref_inputs[0]->dims();
-    for (int i = 0; i < axis; ++i) {
-      input_rows *= dim_0[i];
-    }
-
-    int input_cols = 0;
-
-    std::vector<int64_t> output_cols(outputs->size());
-    for (size_t i = 0; i < num; ++i) {
-      int t_cols = ref_inputs[i]->numel() / input_rows;
-      input_cols += t_cols;
-      output_cols[i] = t_cols;
-    }
-    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
-
-    // computation
-    for (int k = 0; k < input_rows; ++k) {
-      const T* src_ptr = input.data<T>() + k * input_cols;
-      int col_idx = 0;
-      for (size_t j = 0; j < num; ++j) {
-        int col_len = output_cols[j];
-        auto* out_tensor = outputs->at(j);
-        if (out_tensor != nullptr) {
-          T* dst_ptr = out_tensor->data<T>() + k * col_len;
-          memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
-                       sizeof(T) * col_len);
-        }
-        col_idx += col_len;
-      }
-    }
-  }
-};
-#define DEFINE_FUNCTOR(type)                                      \
-  template class ConcatFunctor<platform::CPUDeviceContext, type>; \
-  template class SplitFunctor<platform::CPUDeviceContext, type>;
-
-FOR_ALL_TYPES(DEFINE_FUNCTOR);
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu
deleted file mode 100644
index 5a7cd602c857b1345f4f48f3e799403130782a48..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-__global__ void ConcatKernel(const T** inputs, const int* input_cols,
-                             int col_size, const int output_rows,
-                             const int output_cols, T* output) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int curr_segment = 0;
-  int curr_offset = input_cols[0];
-  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
-    int curr_col_offset = input_cols[curr_segment + 1];
-    while (curr_col_offset <= tid_x) {
-      curr_offset = curr_col_offset;
-      ++curr_segment;
-      curr_col_offset = input_cols[curr_segment + 1];
-    }
-
-    int local_col = tid_x - curr_offset;
-    int segment_width = curr_col_offset - curr_offset;
-
-    const T* input_ptr = inputs[curr_segment];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
-      output[tid_y * output_cols + tid_x] =
-          input_ptr[tid_y * segment_width + local_col];
-  }
-}
-
-template <typename T>
-__device__ void ConcatKernelDetail(const T** inputs_data,
-                                   const int fixed_in_col, const int out_rows,
-                                   const int out_cols, T* output_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x * 1.0 / fixed_in_col;
-    int in_offset = tid_x - split * fixed_in_col;
-    const T* input_ptr = inputs_data[split];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
-      output_data[tid_y * out_cols + tid_x] =
-          input_ptr[tid_y * fixed_in_col + in_offset];
-    }
-  }
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1,
-                             const int fixed_in_col, const int out_rows,
-                             const int out_cols, T* output_data) {
-  const T* inputs_data[2];
-  inputs_data[0] = input_addr0;
-  inputs_data[1] = input_addr1;
-  ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
-                        output_data);
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1,
-                             const T* input_addr2, const int fixed_in_col,
-                             const int out_rows, const int out_cols,
-                             T* output_data) {
-  const T* inputs_data[3];
-  inputs_data[0] = input_addr0;
-  inputs_data[1] = input_addr1;
-  inputs_data[2] = input_addr2;
-  ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
-                        output_data);
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1,
-                             const T* input_addr2, const T* input_addr3,
-                             const int fixed_in_col, const int out_rows,
-                             const int out_cols, T* output_data) {
-  const T* inputs_data[4];
-  inputs_data[0] = input_addr0;
-  inputs_data[1] = input_addr1;
-  inputs_data[2] = input_addr2;
-  inputs_data[3] = input_addr3;
-  ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
-                        output_data);
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T** inputs_data, const int in_num,
-                             const int fixed_in_col, const int out_rows,
-                             const int out_cols, T* output_data) {
-  ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
-                        output_data);
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int* out_cols,
-                            int out_cols_size, T** outputs_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int curr_segment = 0;
-  int curr_offset = out_cols[0];
-  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    int curr_col_offset = out_cols[curr_segment + 1];
-    while (curr_col_offset <= tid_x) {
-      curr_offset = curr_col_offset;
-      ++curr_segment;
-      curr_col_offset = out_cols[curr_segment + 1];
-    }
-
-    int local_col = tid_x - curr_offset;
-    int segment_width = curr_col_offset - curr_offset;
-    T* output_ptr = outputs_data[curr_segment];
-    if (output_ptr != nullptr) {
-      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-        output_ptr[tid_y * segment_width + local_col] =
-            input_data[tid_y * in_col + tid_x];
-    }
-  }
-}
-
-template <typename T>
-__device__ void SplitKernelDetail(const T* input_data, const int in_row,
-                                  const int in_col, const int fixed_out_col,
-                                  T** outputs_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x / fixed_out_col;
-    int in_offset = tid_x - split * fixed_out_col;
-    T* output_ptr = outputs_data[split];
-    if (output_ptr != nullptr) {
-      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-        output_ptr[tid_y * fixed_out_col + in_offset] =
-            input_data[tid_y * in_col + tid_x];
-    }
-  }
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
-                            T** outputs_data) {
-  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
-                            T* outputs_addr0, T* outputs_addr1) {
-  T* outputs_data[2];
-  outputs_data[0] = outputs_addr0;
-  outputs_data[1] = outputs_addr1;
-  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
-                            T* outputs_addr0, T* outputs_addr1,
-                            T* outputs_addr2) {
-  T* outputs_data[3];
-  outputs_data[0] = outputs_addr0;
-  outputs_data[1] = outputs_addr1;
-  outputs_data[2] = outputs_addr2;
-  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
-                            T* outputs_addr0, T* outputs_addr1,
-                            T* outputs_addr2, T* outputs_addr3) {
-  T* outputs_data[4];
-  outputs_data[0] = outputs_addr0;
-  outputs_data[1] = outputs_addr1;
-  outputs_data[2] = outputs_addr2;
-  outputs_data[3] = outputs_addr3;
-  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
-}
-
-static inline void GetBlockDims(const platform::CUDADeviceContext& context,
-                                int num_rows, int num_cols, dim3* block_dims,
-                                dim3* grid_dims) {
-  // Set the thread block and grid according to CurrentDeviceId
-  const int kThreadsPerBlock = 1024;
-  int block_cols = kThreadsPerBlock;
-  if (num_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
-    block_cols = ((num_cols + 31) >> 5) << 5;
-  }
-  int block_rows = kThreadsPerBlock / block_cols;
-  *block_dims = dim3(block_cols, block_rows, 1);
-
-  int max_threads = context.GetMaxPhysicalThreadCount();
-  int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-
-  int grid_cols =
-      std::min((num_cols + block_cols - 1) / block_cols, max_blocks);
-  int grid_rows =
-      std::min(max_blocks / grid_cols, std::max(num_rows / block_rows, 1));
-  *grid_dims = dim3(grid_cols, grid_rows, 1);
-}
-
-/*
- * All tensors' dimension should be the same and the values of
- * each dimension must be the same, except the axis dimension.
- */
-template <typename T>
-class ConcatFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::vector<framework::Tensor>& input, int axis,
-                  framework::Tensor* output) {
-    // TODO(zcd): Add input data validity checking
-    int in_num = input.size();
-    int in_row = 1;
-    auto dim_0 = input[0].dims();
-    for (int i = 0; i < axis; ++i) {
-      in_row *= dim_0[i];
-    }
-    int in_col = input[0].numel() / in_row;
-    int out_row = in_row, out_col = 0;
-
-    std::vector<const T*> inputs_data(in_num);
-    std::vector<int> inputs_col(in_num + 1);
-
-    inputs_col[0] = 0;
-    bool has_same_shape = true;
-    for (int i = 0; i < in_num; ++i) {
-      int t_cols = input[i].numel() / in_row;
-      if (has_same_shape) {
-        if (t_cols != in_col) has_same_shape = false;
-      }
-      out_col += t_cols;
-      inputs_col[i + 1] = out_col;
-      inputs_data[i] = input[i].data<T>();
-    }
-
-    dim3 block_dims;
-    dim3 grid_dims;
-    GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims);
-
-    memory::allocation::AllocationPtr tmp_dev_ins_data;
-    const T** dev_ins_data = nullptr;
-    if (!has_same_shape || in_num < 2 || in_num > 4) {
-      tmp_dev_ins_data =
-          memory::Alloc(context, inputs_data.size() * sizeof(T*));
-      memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
-                   tmp_dev_ins_data->ptr(), platform::CPUPlace(),
-                   static_cast<void*>(inputs_data.data()),
-                   inputs_data.size() * sizeof(T*), context.stream());
-      dev_ins_data = reinterpret_cast<const T**>(tmp_dev_ins_data->ptr());
-    }
-
-    if (has_same_shape) {
-      if (in_num == 2) {
-        ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            inputs_data[0], inputs_data[1], in_col, out_row, out_col,
-            output->data<T>());
-      } else if (in_num == 3) {
-        ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            inputs_data[0], inputs_data[1], inputs_data[2], in_col, out_row,
-            out_col, output->data<T>());
-      } else if (in_num == 4) {
-        ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            inputs_data[0], inputs_data[1], inputs_data[2], inputs_data[3],
-            in_col, out_row, out_col, output->data<T>());
-      } else {
-        ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            dev_ins_data, in_num, in_col, out_row, out_col, output->data<T>());
-      }
-    } else {
-      auto tmp_dev_ins_col_data =
-          memory::Alloc(context, inputs_col.size() * sizeof(int));
-      memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
-                   tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
-                   static_cast<void*>(inputs_col.data()),
-                   inputs_col.size() * sizeof(int), context.stream());
-      int* dev_ins_col_data = static_cast<int*>(tmp_dev_ins_col_data->ptr());
-
-      ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
-          out_row, out_col, output->data<T>());
-    }
-  }
-};
-
-/*
- * All tensors' dimension should be the same and the values of
- * each dimension must be the same, except the axis dimension.
- */
-template <typename T>
-class SplitFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const std::vector<const framework::Tensor*>& ref_inputs,
-                  int axis, std::vector<framework::Tensor*>* outputs) {
-    // TODO(zcd): Add input data validity checking
-    int o_num = outputs->size();
-    int out_row = 1;
-    auto dim_0 = ref_inputs[0]->dims();
-    for (int i = 0; i < axis; ++i) {
-      out_row *= dim_0[i];
-    }
-
-    int out0_col = ref_inputs[0]->numel() / out_row;
-    int in_col = 0, in_row = out_row;
-    bool has_same_shape = true;
-
-    std::vector<T*> outputs_data(o_num);
-    std::vector<int> outputs_cols(o_num + 1);
-
-    outputs_cols[0] = 0;
-    for (int i = 0; i < o_num; ++i) {
-      int t_col = ref_inputs.at(i)->numel() / out_row;
-      if (has_same_shape) {
-        if (t_col != out0_col) has_same_shape = false;
-      }
-      in_col += t_col;
-      outputs_cols[i + 1] = in_col;
-      if (outputs->at(i) != nullptr) {
-        outputs_data[i] = outputs->at(i)->data<T>();
-      } else {
-        outputs_data[i] = nullptr;
-      }
-    }
-
-    dim3 block_dims;
-    dim3 grid_dims;
-    GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims);
-
-    memory::allocation::AllocationPtr tmp_dev_outs_data;
-    T** dev_out_gpu_data = nullptr;
-    if (!has_same_shape || o_num < 2 || o_num > 4) {
-      tmp_dev_outs_data =
-          memory::Alloc(context, outputs_data.size() * sizeof(T*));
-      memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
-                   tmp_dev_outs_data->ptr(), platform::CPUPlace(),
-                   reinterpret_cast<void*>(outputs_data.data()),
-                   outputs_data.size() * sizeof(T*), context.stream());
-      dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr());
-    }
-
-    if (has_same_shape) {
-      if (o_num == 2) {
-        SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            input.data<T>(), in_row, in_col, out0_col, outputs_data[0],
-            outputs_data[1]);
-      } else if (o_num == 3) {
-        SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            input.data<T>(), in_row, in_col, out0_col, outputs_data[0],
-            outputs_data[1], outputs_data[2]);
-      } else if (o_num == 4) {
-        SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            input.data<T>(), in_row, in_col, out0_col, outputs_data[0],
-            outputs_data[1], outputs_data[2], outputs_data[3]);
-      } else {
-        SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
-      }
-    } else {
-      auto tmp_dev_ins_col_data =
-          memory::Alloc(context,
-
-                        outputs_cols.size() * sizeof(int));
-      memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
-                   tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
-                   reinterpret_cast<void*>(outputs_cols.data()),
-                   outputs_cols.size() * sizeof(int), context.stream());
-      int* dev_outs_col_data =
-          reinterpret_cast<int*>(tmp_dev_ins_col_data->ptr());
-
-      SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-          input.data<T>(), in_row, in_col, dev_outs_col_data,
-          static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
-    }
-  }
-};
-
-#define DEFINE_FUNCTOR(type)                                       \
-  template class ConcatFunctor<platform::CUDADeviceContext, type>; \
-  template class SplitFunctor<platform::CUDADeviceContext, type>
-
-FOR_ALL_TYPES(DEFINE_FUNCTOR);
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h
deleted file mode 100644
index 3a5eddcbf4af699a89ae1a21571337155699a1f3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/concat_and_split.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * \brief Concatenate the input tensors along the dimension axis.
- *  TODO(zcd): maybe it needs to be more detailed.
- *  Examples:
- *     Input[0] = [[1,2],[3,4]]
- *     Input[1] = [[5,6]]
- *     axis = 0
- *
- *     Output = [[1,2],
- *               [3,4],
- *               [5,6]]
- */
-template <typename DeviceContext, typename T>
-class ConcatFunctor {
- public:
-  void operator()(const DeviceContext& context,
-                  const std::vector<framework::Tensor>& input, int axis,
-                  framework::Tensor* output);
-};
-
-/*
- * \brief Split the input tensors along the dimension axis into outputs.
- *  TODO(zcd): maybe it needs to be more detailed.
- *  Examples:
- *     Input = [[1,2],
- *              [3,4],
- *              [5,6]]
- *     axis = 0
- *
- *     Output[0] = [[1,2],[3,4]]
- *     Output[1] = [[5,6]]
- */
-template <typename DeviceContext, typename T>
-class SplitFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<const framework::Tensor*>& ref_inputs,
-                  int axis, std::vector<framework::Tensor*>* outputs);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
-
-#define FOR_ALL_TYPES(macro) \
-  macro(int);                \
-  macro(float);              \
-  macro(double);             \
-  macro(bool);               \
-  macro(int64_t);            \
-  macro(int16_t);            \
-  macro(uint8_t);            \
-  macro(int8_t);             \
-  macro(::paddle::platform::float16)
diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc
deleted file mode 100644
index 411dbca25bb48c99dfd16779f54e46a3e80d0d4e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/concat_test.cc
+++ /dev/null
@@ -1,395 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <vector>
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-
-/**
- * case 1:
- *    inputs:
- *        t_a.shape: [2, 3, 4]
- *        t_b.shape: [3, 3, 4]
- *    output:
- *        out.shape: [5, 3, 4]
- */
-template <typename DeviceContext, typename Place>
-void ConcatCase1(DeviceContext* context) {
-  paddle::framework::Tensor input_a_cpu;
-  paddle::framework::Tensor input_b_cpu;
-  paddle::framework::Tensor out_cpu;
-
-  paddle::framework::Tensor input_a;
-  paddle::framework::Tensor input_b;
-  paddle::framework::Tensor out;
-
-  auto dim_a = paddle::framework::make_ddim({2, 3, 4});
-  auto dim_b = paddle::framework::make_ddim({3, 3, 4});
-  auto dim_out = paddle::framework::make_ddim({5, 3, 4});
-
-  input_a.mutable_data<int>(dim_a, Place());
-  input_b.mutable_data<int>(dim_b, Place());
-  out.mutable_data<int>(dim_out, Place());
-
-  if (paddle::platform::is_gpu_place(Place())) {
-    input_a_cpu.mutable_data<int>(dim_a, paddle::platform::CPUPlace());
-    input_b_cpu.mutable_data<int>(dim_b, paddle::platform::CPUPlace());
-    out_cpu.mutable_data<int>(dim_out, paddle::platform::CPUPlace());
-  }
-
-  int* a_ptr = nullptr;
-  int* b_ptr = nullptr;
-  if (paddle::platform::is_gpu_place(Place())) {
-    a_ptr = input_a_cpu.data<int>();
-    b_ptr = input_b_cpu.data<int>();
-  } else {
-    a_ptr = input_a.data<int>();
-    b_ptr = input_b.data<int>();
-  }
-
-  for (int i = 0; i < 2 * 3 * 4; ++i) {
-    a_ptr[i] = i;
-  }
-  for (int i = 0; i < 3 * 3 * 4; ++i) {
-    b_ptr[i] = i;
-  }
-
-  if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
-    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
-  }
-
-  std::vector<paddle::framework::Tensor> input;
-  input.push_back(input_a);
-  input.push_back(input_b);
-
-  paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
-  concat_functor(*context, input, 0, &out);
-
-  // check the dim of input_a, input_b
-  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
-  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
-
-  int* out_ptr = nullptr;
-  if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
-                                      &out_cpu);
-    out_ptr = out_cpu.data<int>();
-  } else {
-    out_ptr = out.data<int>();
-  }
-
-  int cols = 2 * 3 * 4;
-  int idx_a = 0, idx_b = 0;
-  for (int j = 0; j < 5 * 3 * 4; ++j) {
-    if (j >= cols) {
-      PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]);
-      ++idx_b;
-    } else {
-      PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]);
-      ++idx_a;
-    }
-  }
-}
-
-/**
-  * case 2:
-  *    inputs:
-  *        t_a.shape: [2, 3, 4]
-  *        t_b.shape: [2, 4, 4]
-  *    output:
-  *        out.shape: [2, 7, 4]
-  */
-template <typename DeviceContext, typename Place>
-void ConcatCase2(DeviceContext* context) {
-  paddle::framework::Tensor input_a_cpu;
-  paddle::framework::Tensor input_b_cpu;
-  paddle::framework::Tensor out_cpu;
-
-  paddle::framework::Tensor input_a;
-  paddle::framework::Tensor input_b;
-  paddle::framework::Tensor out;
-
-  auto dim_a = paddle::framework::make_ddim({2, 3, 4});
-  auto dim_b = paddle::framework::make_ddim({2, 4, 4});
-  auto dim_out = paddle::framework::make_ddim({2, 7, 4});
-
-  input_a.mutable_data<int>(dim_a, Place());
-  input_b.mutable_data<int>(dim_b, Place());
-  out.mutable_data<int>(dim_out, Place());
-
-  if (paddle::platform::is_gpu_place(Place())) {
-    input_a_cpu.mutable_data<int>(dim_a, paddle::platform::CPUPlace());
-    input_b_cpu.mutable_data<int>(dim_b, paddle::platform::CPUPlace());
-    out_cpu.mutable_data<int>(dim_out, paddle::platform::CPUPlace());
-  }
-
-  int* a_ptr = nullptr;
-  int* b_ptr = nullptr;
-  if (paddle::platform::is_gpu_place(Place())) {
-    a_ptr = input_a_cpu.data<int>();
-    b_ptr = input_b_cpu.data<int>();
-  } else {
-    a_ptr = input_a.data<int>();
-    b_ptr = input_b.data<int>();
-  }
-
-  for (int i = 0; i < 2 * 3 * 4; ++i) {
-    a_ptr[i] = i;
-  }
-  for (int i = 0; i < 2 * 4 * 4; ++i) {
-    b_ptr[i] = i;
-  }
-
-  if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
-    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
-  }
-
-  std::vector<paddle::framework::Tensor> input;
-  input.push_back(input_a);
-  input.push_back(input_b);
-
-  paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
-  concat_functor(*context, input, 1, &out);
-
-  // check the dim of input_a, input_b
-  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
-  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
-
-  int* out_ptr = nullptr;
-  if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
-                                      &out_cpu);
-    out_ptr = out_cpu.data<int>();
-  } else {
-    out_ptr = out.data<int>();
-  }
-
-  int cols = 3 * 4;
-  int idx_a = 0, idx_b = 0;
-  for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 28; ++j) {
-      if (j >= cols) {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]);
-        ++idx_b;
-      } else {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]);
-        ++idx_a;
-      }
-    }
-  }
-}
-
-/**
-  * case 3:
-  *    inputs:
-  *        t_a.shape: [2, 3, 5]
-  *        t_b.shape: [2, 3, 4]
-  *    output:
-  *        out.shape: [2, 3, 9]
-  */
-template <typename DeviceContext, typename Place>
-void ConcatCase3(DeviceContext* context) {
-  paddle::framework::Tensor input_a_cpu;
-  paddle::framework::Tensor input_b_cpu;
-  paddle::framework::Tensor out_cpu;
-
-  paddle::framework::Tensor input_a;
-  paddle::framework::Tensor input_b;
-  paddle::framework::Tensor out;
-
-  auto dim_a = paddle::framework::make_ddim({2, 3, 4});
-  auto dim_b = paddle::framework::make_ddim({2, 3, 5});
-  auto dim_out = paddle::framework::make_ddim({2, 3, 9});
-
-  input_a.mutable_data<int>(dim_a, Place());
-  input_b.mutable_data<int>(dim_b, Place());
-  out.mutable_data<int>(dim_out, Place());
-
-  if (paddle::platform::is_gpu_place(Place())) {
-    input_a_cpu.mutable_data<int>(dim_a, paddle::platform::CPUPlace());
-    input_b_cpu.mutable_data<int>(dim_b, paddle::platform::CPUPlace());
-    out_cpu.mutable_data<int>(dim_out, paddle::platform::CPUPlace());
-  }
-
-  int* a_ptr = nullptr;
-  int* b_ptr = nullptr;
-  if (paddle::platform::is_gpu_place(Place())) {
-    a_ptr = input_a_cpu.data<int>();
-    b_ptr = input_b_cpu.data<int>();
-  } else {
-    a_ptr = input_a.data<int>();
-    b_ptr = input_b.data<int>();
-  }
-
-  for (int i = 0; i < 2 * 3 * 4; ++i) {
-    a_ptr[i] = i;
-  }
-  for (int i = 0; i < 2 * 3 * 5; ++i) {
-    b_ptr[i] = i;
-  }
-
-  if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
-    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
-  }
-
-  std::vector<paddle::framework::Tensor> input;
-  input.push_back(input_a);
-  input.push_back(input_b);
-
-  paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
-  concat_functor(*context, input, 2, &out);
-
-  // check the dim of input_a, input_b
-  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
-  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
-
-  int* out_ptr = nullptr;
-  if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
-                                      &out_cpu);
-    out_ptr = out_cpu.data<int>();
-  } else {
-    out_ptr = out.data<int>();
-  }
-
-  // check the data
-  int cols = 4;
-  int idx_a = 0, idx_b = 0;
-  for (int i = 0; i < 6; ++i) {
-    for (int j = 0; j < 9; ++j) {
-      if (j >= cols) {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]);
-        ++idx_b;
-      } else {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]);
-        ++idx_a;
-      }
-    }
-  }
-}
-
-/**
-  * case 4:
-  *    inputs:
-  *        axis = 1
-  *        t_a.shape: [2, 3, 4]
-  *        t_b.shape: [2, 3, 4]
-  *    output:
-  *        out.shape: [2, 6, 4]
-  */
-template <typename DeviceContext, typename Place>
-void ConcatCase4(DeviceContext* context) {
-  paddle::framework::Tensor input_a_cpu;
-  paddle::framework::Tensor input_b_cpu;
-  paddle::framework::Tensor out_cpu;
-
-  paddle::framework::Tensor input_a;
-  paddle::framework::Tensor input_b;
-  paddle::framework::Tensor out;
-
-  auto dim_a = paddle::framework::make_ddim({2, 3, 4});
-  auto dim_b = paddle::framework::make_ddim({2, 3, 4});
-  auto dim_out = paddle::framework::make_ddim({2, 6, 4});
-
-  input_a.mutable_data<int>(dim_a, Place());
-  input_b.mutable_data<int>(dim_b, Place());
-  out.mutable_data<int>(dim_out, Place());
-
-  if (paddle::platform::is_gpu_place(Place())) {
-    input_a_cpu.mutable_data<int>(dim_a, paddle::platform::CPUPlace());
-    input_b_cpu.mutable_data<int>(dim_b, paddle::platform::CPUPlace());
-    out_cpu.mutable_data<int>(dim_out, paddle::platform::CPUPlace());
-  }
-
-  int* a_ptr = nullptr;
-  int* b_ptr = nullptr;
-  if (paddle::platform::is_gpu_place(Place())) {
-    a_ptr = input_a_cpu.data<int>();
-    b_ptr = input_b_cpu.data<int>();
-  } else {
-    a_ptr = input_a.data<int>();
-    b_ptr = input_b.data<int>();
-  }
-
-  for (int i = 0; i < 2 * 3 * 4; ++i) {
-    a_ptr[i] = i;
-  }
-  for (int i = 0; i < 2 * 3 * 4; ++i) {
-    b_ptr[i] = i;
-  }
-
-  if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
-    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
-  }
-
-  std::vector<paddle::framework::Tensor> input;
-  input.push_back(input_a);
-  input.push_back(input_b);
-
-  paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
-  concat_functor(*context, input, 1, &out);
-  context->Wait();
-
-  // check the dim of input_a, input_b
-  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
-  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
-
-  int* out_ptr = nullptr;
-  if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
-                                      &out_cpu);
-    out_ptr = out_cpu.data<int>();
-  } else {
-    out_ptr = out.data<int>();
-  }
-
-  // check the data
-  int cols = 12;
-  int idx_a = 0, idx_b = 0;
-  for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 24; ++j) {
-      if (j >= cols) {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]);
-        ++idx_b;
-      } else {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]);
-        ++idx_a;
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename Place>
-void TestConcatMain() {
-  DeviceContext* context = new DeviceContext(Place());
-
-  ConcatCase1<DeviceContext, Place>(context);
-  ConcatCase2<DeviceContext, Place>(context);
-  ConcatCase3<DeviceContext, Place>(context);
-  ConcatCase4<DeviceContext, Place>(context);
-}
-
-TEST(math, concat) {
-  TestConcatMain<paddle::platform::CPUDeviceContext,
-                 paddle::platform::CPUPlace>();
-#ifdef PADDLE_WITH_CUDA
-  TestConcatMain<paddle::platform::CUDADeviceContext,
-                 paddle::platform::CUDAPlace>();
-#endif
-}
diff --git a/paddle/fluid/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc
deleted file mode 100644
index 537d0b47868fb68d59e3b3556a54ba85d5f06960..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/context_project.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/context_project.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template class ContextProjectFunctor<platform::CPUDeviceContext, float>;
-template class ContextProjectFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu
deleted file mode 100644
index f04b2d15349be329ee228fc8903c9b38a5349634..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/context_project.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/math/context_project.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template class ContextProjectFunctor<platform::CUDADeviceContext, float>;
-template class ContextProjectFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h
deleted file mode 100644
index e9019c6d2fe6890ee92cb5a3b047666e3c2a7e04..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/context_project.h
+++ /dev/null
@@ -1,324 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/im2col.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-/*
- * \brief Context projection concatenates features in adjacent time-steps in
- * a sequence. The i-th row of the output is the concatenation of
- * context_length rows of the input. The context_length rows are the
- * consecutive rows from the i+shift_start row.
- * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor.
- *
- * \param in            Input data.
- * \param Shape         The shape of Input data:
- *                        [mini-batch, input_hidden_size].
- *
- * \param padding_data  Padding data.
- * \param Shape         The shape of Padding data:
- *                        [up_pad + down_pad, input_hidden_size].
- *
- * \param col           Col data.
- * \param Shape         The shape of Col data:
- *                        [mini-batch, context_length * input_hidden_size].
- *
- * For a mini-batch of 2 variable lengths sentences, containing 3, and 1
- * time-steps:
- *
- * Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3,
- * 4].
- * Besides, for the sake of simplicity, we assume M=1 and N=2.
- *
- * X = [[a1, a2;
- *       b1, b2;
- *       c1, c2]
- *      [d1, d2]]
- *
- * This is to say that input (X) has 4 words and the dimension of each word
- * representation is 2.
- *
- * - Case1:
- *   If context_start is -1 and padding_trainable is false, we use zero to pad
- *   instead of learned weight to pad,
- *   and the context_length is 3, the output (Out) is:
- *
- *   Out =[[0,  0,  a1, a2, b1, b2;
- *          a1, a2, b1, b2, c1, c2;
- *          b1, b2, c1, c2, 0,  0 ]
- *          [0,  0, d1, d2, 0,  0 ]]
- *
- * - Case2:
- *   If context_start is -1 and padding_trainable is true, we use learned weight
- *   to pad,
- *   and the context_length is 3, the output (Out) is:
- *
- *   Out = [[w1, w2, a1, a2, b1, b2;
- *           a1, a2, b1, b2, c1, c2;
- *           b1, b2, c1, c2, w3, w4]
- *          [w1, w2, d1, d2, w3, w4]]
- *
- */
-
-template <typename DeviceContext, typename T>
-class ContextProjectFunctor {
- public:
-  void operator()(const DeviceContext& context, const LoDTensor& in,
-                  const Tensor* padding_data, bool padding_trainable,
-                  const int context_start, const int context_length,
-                  const int context_stride, const int up_pad,
-                  const int down_pad, Tensor* col) {
-    auto lod_level_0 = in.lod()[0];
-
-    math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, float> im2col_ocf;
-
-    std::vector<int> dilation({1, 1});
-    std::vector<int> padding({up_pad, 0, down_pad, 0});
-    std::vector<int> stride({context_stride, 1});
-
-    int input_row_begin, input_row_end;
-    int sequence_height, sequence_width;
-    sequence_width = in.dims()[1];
-
-    for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-      if (lod_level_0[i] == lod_level_0[i + 1]) continue;
-
-      input_row_begin = (context_start > 0)
-                            ? static_cast<int>(lod_level_0[i]) + context_start
-                            : static_cast<int>(lod_level_0[i]);
-      input_row_end = static_cast<int>(lod_level_0[i + 1]);
-
-      Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
-                                static_cast<int>(lod_level_0[i + 1]));
-
-      sequence_height = static_cast<int>(out_t.dims()[0]);
-
-      if (input_row_begin < input_row_end) {
-        Tensor in_t = in.Slice(input_row_begin, input_row_end);
-
-        std::vector<int64_t> output_shape(
-            {sequence_height, 1, 1, context_length,
-             sequence_width});  // output_height, output_width,
-        // input_channels, filter_height, filter_width
-        out_t.Resize(framework::make_ddim(output_shape));
-
-        std::vector<int64_t> input_shape(
-            {1, input_row_end - input_row_begin,
-             sequence_width});  // input_channels, input_height, input_width
-        in_t.Resize(framework::make_ddim(input_shape));
-        im2col_ocf(context, in_t, dilation, stride, padding, &out_t);
-        out_t.Resize({sequence_height, context_length * sequence_width});
-      }
-    }
-    if (padding_trainable) {
-      PADDLE_ENFORCE_NOT_NULL(padding_data);
-      for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-        if (lod_level_0[i] == lod_level_0[i + 1]) continue;
-
-        Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
-                                  static_cast<int>(lod_level_0[i + 1]));
-
-        sequence_height = static_cast<int>(out_t.dims()[0]);
-
-        // add up trainable data
-        out_t.Resize({static_cast<int64_t>(sequence_height) * context_length,
-                      sequence_width});
-
-        if (up_pad > 0) {  // add up pad
-          int padding_rows = std::min(
-              up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
-
-          for (int k = 0; k < padding_rows; ++k) {
-            int padding_size =
-                k + context_length < up_pad ? context_length : up_pad - k;
-            Tensor out_t_sub = out_t.Slice(k * context_length,
-                                           k * context_length + padding_size);
-            Tensor w_sub = padding_data->Slice(k, k + padding_size);
-            framework::TensorCopy(w_sub, context.GetPlace(), context,
-                                  &out_t_sub);
-          }
-        }
-        if (down_pad > 0) {  // add down pad
-          int down_pad_begin_row =
-              std::max(0,
-                       (sequence_height - context_start - context_length) + 1) +
-              1;
-          int padding_begin = std::max(0, context_start - sequence_height);
-          int padding_size =
-              sequence_height - context_start >= context_length
-                  ? 1
-                  : context_length - (sequence_height - context_start);
-          if (context_start >= sequence_height) padding_size = context_length;
-          int padding_idx = padding_begin;
-          for (int t = 0; t + down_pad_begin_row <= sequence_height;
-               ++t, ++padding_size) {
-            if (context_start >= sequence_height) padding_size = context_length;
-            if (padding_size > context_length) {
-              padding_size = context_length;
-              padding_idx++;
-            }
-            if (padding_begin > 0 || sequence_height == context_start)
-              padding_idx = padding_begin + t;
-
-            Tensor out_t_sub = out_t.Slice(
-                (down_pad_begin_row + t) * context_length - padding_size,
-                (down_pad_begin_row + t) * context_length);
-            Tensor w_sub = padding_data->Slice(
-                up_pad + padding_idx, up_pad + padding_idx + padding_size);
-            framework::TensorCopy(w_sub, context.GetPlace(), context,
-                                  &out_t_sub);
-          }
-        }
-        out_t.Resize({sequence_height,
-                      static_cast<int64_t>(context_length) * sequence_width});
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ContextProjectGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const LoDTensor& in,
-                  bool padding_trainable, const int context_start,
-                  const int context_length, const int context_stride,
-                  const int up_pad, const int down_pad, bool pad_grad,
-                  bool input_grad, Tensor* padding_data, Tensor* col) {
-    auto lod_level_0 = in.lod()[0];
-
-    math::Col2ImFunctor<math::ColFormat::kOCF, DeviceContext, float> col2im_ocf;
-
-    std::vector<int> dilation({1, 1});
-    std::vector<int> padding({up_pad, 0, down_pad, 0});
-    std::vector<int> stride({context_stride, 1});
-
-    int input_row_begin, input_row_end;
-    int sequence_height, sequence_width;
-    sequence_width = in.dims()[1];
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-
-    if (input_grad) {
-      for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-        if (lod_level_0[i] == lod_level_0[i + 1]) continue;
-
-        input_row_begin = (context_start > 0)
-                              ? static_cast<int>(lod_level_0[i]) + context_start
-                              : static_cast<int>(lod_level_0[i]);
-        input_row_end = static_cast<int>(lod_level_0[i + 1]);
-
-        Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
-                                  static_cast<int>(lod_level_0[i + 1]));
-
-        sequence_height = static_cast<int>(out_t.dims()[0]);
-
-        if (input_row_begin < input_row_end) {
-          Tensor in_t = in.Slice(input_row_begin, input_row_end);
-
-          std::vector<int64_t> output_shape(
-              {sequence_height, 1, 1, context_length,
-               sequence_width});  // output_height, output_width,
-          // input_channels, filter_height, filter_width
-          out_t.Resize(framework::make_ddim(output_shape));
-
-          std::vector<int64_t> input_shape(
-              {1, input_row_end - input_row_begin,
-               sequence_width});  // input_channels, input_height, input_width
-          in_t.Resize(framework::make_ddim(input_shape));
-
-          col2im_ocf(context, out_t, dilation, stride, padding, &in_t);
-          out_t.Resize({sequence_height, context_length * sequence_width});
-        }
-      }
-    }
-    if (pad_grad) {
-      if (padding_trainable) {
-        for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-          if (lod_level_0[i] == lod_level_0[i + 1]) continue;
-
-          Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
-                                    static_cast<int>(lod_level_0[i + 1]));
-
-          sequence_height = static_cast<int>(out_t.dims()[0]);
-          out_t.Resize({static_cast<int64_t>(sequence_height) * context_length,
-                        sequence_width});
-
-          if (up_pad > 0) {
-            int padding_rows = std::min(
-                up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
-
-            for (int k = 0; k < padding_rows; ++k) {
-              int padding_size =
-                  k + context_length < up_pad ? context_length : up_pad - k;
-              Tensor out_t_sub = out_t.Slice(k * context_length,
-                                             k * context_length + padding_size);
-              Tensor w_sub = padding_data->Slice(k, k + padding_size);
-              blas.AXPY(w_sub.numel(), static_cast<T>(1), out_t_sub.data<T>(),
-                        w_sub.data<T>());
-            }
-          }
-          if (down_pad > 0) {
-            int down_pad_begin_row =
-                std::max(
-                    0, (sequence_height - context_start - context_length) + 1) +
-                1;
-            int padding_begin = std::max(0, context_start - sequence_height);
-            int padding_size =
-                sequence_height - context_start >= context_length
-                    ? 1
-                    : context_length - (sequence_height - context_start);
-            if (context_start >= sequence_height) padding_size = context_length;
-            int padding_idx = padding_begin;
-            for (int t = 0; t + down_pad_begin_row <= sequence_height;
-                 ++t, ++padding_size) {
-              if (context_start >= sequence_height)
-                padding_size = context_length;
-              if (padding_size > context_length) {
-                padding_size = context_length;
-                padding_idx++;
-              }
-              if (padding_begin > 0 || sequence_height == context_start)
-                padding_idx = padding_begin + t;
-
-              Tensor out_t_sub = out_t.Slice(
-                  (down_pad_begin_row + t) * context_length - padding_size,
-                  (down_pad_begin_row + t) * context_length);
-              Tensor w_sub = padding_data->Slice(
-                  up_pad + padding_idx, up_pad + padding_idx + padding_size);
-              blas.AXPY(w_sub.numel(), static_cast<T>(1), out_t_sub.data<T>(),
-                        w_sub.data<T>());
-            }
-          }
-          out_t.Resize({sequence_height,
-                        static_cast<int64_t>(context_length) * sequence_width});
-        }
-      }
-    }
-  }
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc
deleted file mode 100644
index cbe16999124a67ac764afade92f7320a12129cd1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cos_sim_functor.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/cos_sim_functor.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-struct CosSimDyFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm,
-                  const T* y_norm, const T* x, const T* y, const T* z,
-                  const T* dz, const size_t rows, const size_t cols,
-                  T* dy) const {
-    for (size_t row_id = 0; row_id < rows; ++row_id) {
-      auto xy_norm_prod = x_norm[row_id] * y_norm[0];
-      auto dz_data = dz[row_id];
-      auto z_data = z[row_id];
-      auto* x_data = x + cols * row_id;
-      auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
-
-      auto y_norm_square = y_norm[0] * y_norm[0];
-      auto reciprocal_y_norm_square = 1 / y_norm_square;
-      for (size_t i = 0; i < cols; ++i) {
-        dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod -
-                            z_data * y[i] * reciprocal_y_norm_square);
-      }
-    }
-  }
-};
-
-template struct CosSimDyFunctor<platform::CPUDeviceContext, float>;
-template struct CosSimDyFunctor<platform::CPUDeviceContext, double>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu
deleted file mode 100644
index 537c7e47155fe9a12196869ceaed84fca198335b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cos_sim_functor.cu
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/cos_sim_functor.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x,
-                               const T* y, const T* z, const T* dz,
-                               const size_t rows, const size_t cols, T* dy) {
-  int grid_size = blockDim.x * gridDim.x;
-  T y_norm_data = y_norm[0];
-  for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows;
-       row_id += grid_size) {
-    T xy_norm_prod = x_norm[row_id] * y_norm_data;
-    T dz_data = dz[row_id];
-    T z_data = z[row_id];
-    const T* x_data = x + cols * row_id;
-    T reciprocal_xy_norm_prod = 1 / xy_norm_prod;
-
-    T y_norm_square = y_norm_data * y_norm_data;
-    T reciprocal_y_norm_square = 1 / y_norm_square;
-    for (size_t i = 0; i < cols; ++i) {
-      T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod -
-                             z_data * y[i] * reciprocal_y_norm_square);
-      platform::CudaAtomicAdd(dy + i, dy_data);
-    }
-  }
-}
-
-template <typename T>
-struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm,
-                  const T* y_norm, const T* x, const T* y, const T* z,
-                  const T* dz, const size_t rows, const size_t cols,
-                  T* dy) const {
-    const int block_size = 512;
-    dim3 threads(block_size, 1);
-    dim3 grid((rows + block_size - 1) / block_size, 1);
-    CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
-        x_norm, y_norm, x, y, z, dz, rows, cols, dy);
-  }
-};
-
-template struct CosSimDyFunctor<platform::CUDADeviceContext, float>;
-template struct CosSimDyFunctor<platform::CUDADeviceContext, double>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cos_sim_functor.h b/paddle/fluid/operators/math/cos_sim_functor.h
deleted file mode 100644
index 30ea5e60e87b5b7838d840c4d5b53d904281ab2a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cos_sim_functor.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include <stdlib.h>
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T, bool same_row>
-struct CosSimFunctor {
-  CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols)
-      : x_norm_(x_norm),
-        y_norm_(y_norm),
-        x_(x),
-        y_(y),
-        z_(z),
-        cols_(static_cast<size_t>(cols)) {}
-
-  inline HOSTDEVICE void operator()(size_t row_id) const {
-    auto* x = x_ + cols_ * row_id;
-    T xx = 0, xy = 0, yy = 0;
-    if (same_row) {
-      auto* y = y_ + cols_ * row_id;
-      T tep_x, tep_y;
-      for (size_t i = 0; i < cols_; ++i) {
-        tep_x = x[i];
-        tep_y = y[i];
-        xx += tep_x * tep_x;
-        yy += tep_y * tep_y;
-        xy += tep_x * tep_y;
-      }
-      xx = sqrt(xx);
-      yy = sqrt(yy);
-      y_norm_[row_id] = yy;
-      x_norm_[row_id] = xx;
-      z_[row_id] = xy / (xx * yy);
-    } else {  // This can be wrote in a better way.
-      T tep_x, tep_y;
-      for (size_t i = 0; i < cols_; ++i) {
-        tep_x = x[i];
-        tep_y = y_[i];
-        xx += tep_x * tep_x;
-        yy += tep_y * tep_y;
-        xy += tep_x * tep_y;
-      }
-      xx = sqrt(xx);
-      yy = sqrt(yy);
-      if (row_id == 0) y_norm_[0] = yy;
-      x_norm_[row_id] = xx;
-      z_[row_id] = xy / (xx * yy);
-    }
-  }
-
-  T* x_norm_;
-  T* y_norm_;
-  const T* x_;
-  const T* y_;
-  T* z_;
-  const size_t cols_;
-};
-
-template <typename T>
-struct CosSimGradFunctor {
-  CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
-                    const T* z, const T* dz, T* dx, int cols)
-      : x_norm_(x_norm),
-        y_norm_(y_norm),
-        x_(x),
-        y_(y),
-        z_(z),
-        dz_(dz),
-        dx_(dx),
-        cols_(static_cast<size_t>(cols)) {}
-
-  inline HOSTDEVICE void operator()(size_t row_id) const {
-    auto x_norm_square = x_norm_[row_id] * x_norm_[row_id];
-    auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id];
-    auto dz = dz_[row_id];
-    auto z = z_[row_id];
-
-    auto* dx = dx_ + cols_ * row_id;
-    auto* x = x_ + cols_ * row_id;
-    auto* y = y_ + cols_ * row_id;
-
-    auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
-    auto reciprocal_x_norm_square = 1 / x_norm_square;
-    for (size_t i = 0; i < cols_; ++i) {
-      dx[i] = dz * (y[i] * reciprocal_xy_norm_prod -
-                    z * x[i] * reciprocal_x_norm_square);
-    }
-  }
-
-  const T* x_norm_;
-  const T* y_norm_;
-  const T* x_;
-  const T* y_;
-  const T* z_;
-  const T* dz_;
-  T* dx_;
-  const size_t cols_;
-};
-
-template <typename T>
-struct CosSimDxFunctor {
-  CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
-                  const T* z, const T* dz, T* dx, int cols)
-      : x_norm_(x_norm),
-        y_norm_(y_norm),
-        x_(x),
-        y_(y),
-        z_(z),
-        dz_(dz),
-        dx_(dx),
-        cols_(static_cast<size_t>(cols)) {}
-
-  inline HOSTDEVICE void operator()(size_t row_id) const {
-    auto xy_norm_prod = x_norm_[row_id] * y_norm_[0];
-    auto dz = dz_[row_id];
-    auto z = z_[row_id];
-    auto* x = x_ + cols_ * row_id;
-    auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
-    auto x_norm_square = x_norm_[row_id] * x_norm_[row_id];
-    auto* dx = dx_ + cols_ * row_id;
-    auto reciprocal_x_norm_square = 1 / x_norm_square;
-
-    for (size_t i = 0; i < cols_; ++i) {
-      dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod -
-                    z * x[i] * reciprocal_x_norm_square);
-    }
-  }
-  const T* x_norm_;
-  const T* y_norm_;
-  const T* x_;
-  const T* y_;
-  const T* z_;
-  const T* dz_;
-  T* dx_;
-  const size_t cols_;
-};
-
-template <typename DeviceContext, typename T>
-struct CosSimDyFunctor {
-  void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm,
-                  const T* x, const T* y, const T* z, const T* dz,
-                  const size_t rows, const size_t cols, T* dy) const;
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
deleted file mode 100644
index 8940a41424b01c975f1264ca309cc09fc3c7ae85..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ /dev/null
@@ -1,630 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cmath>
-#include <functional>
-#include <string>
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-
-#define YMM_FLOAT_BLOCK 8
-#define AVX_DOUBLE_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define AVX2_DOUBLE_BLOCK 4
-#define ZMM_FLOAT_BLOCK 16
-#define AVX512_DOUBLE_BLOCK 8
-
-template <typename T>
-inline void vec_exp(const int n, const T* x, T* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::exp(x[i]);
-  }
-}
-
-template <typename T>
-inline void vec_scal(const int n, const T a, T* x) {
-  for (int i = 0; i < n; ++i) {
-    x[i] = a * x[i];
-  }
-}
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-inline void vec_exp<float>(const int n, const float* x, float* y) {
-  constexpr int small_enough = 128;
-  if (n < small_enough) {
-    for (int i = 0; i < n; ++i) {
-      y[i] = std::exp(x[i]);
-    }
-  } else {
-    platform::dynload::vsExp(n, x, y);
-  }
-}
-
-template <>
-inline void vec_exp<double>(const int n, const double* x, double* y) {
-  platform::dynload::vdExp(n, x, y);
-}
-
-template <>
-inline void vec_scal<float>(const int n, const float a, float* x) {
-  platform::dynload::cblas_sscal(n, a, x, 1);
-}
-
-template <>
-inline void vec_scal<double>(const int n, const double a, double* x) {
-  platform::dynload::cblas_dscal(n, a, x, 1);
-}
-#endif
-
-// MKL scal only support inplace, choose this if src and dst are not equal
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_scal(const int n, const T a, const T* x, T* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a * x[i];
-  }
-}
-
-template <>
-inline void vec_scal<float, platform::avx>(const int n, const float a,
-                                           const float* x, float* y) {
-#ifdef __AVX__
-  constexpr int block = YMM_FLOAT_BLOCK;
-  if (n < block) {
-    vec_scal<float, platform::isa_any>(n, a, x, y);
-    return;
-  }
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-  __m256 scalar = _mm256_set1_ps(a);
-  __m256 tmp;
-#define MOVE_ONE_STEP               \
-  tmp = _mm256_loadu_ps(x + i);     \
-  tmp = _mm256_mul_ps(tmp, scalar); \
-  _mm256_storeu_ps(y + i, tmp)
-  for (i = 0; i < end; i += block) {
-    MOVE_ONE_STEP;
-  }
-#undef MOVE_ONE_STEP
-  if (rest == 0) {
-    return;
-  }
-  // can not continue move step if src and dst are inplace
-  for (i = n - rest; i < n; ++i) {
-    y[i] = a * x[i];
-  }
-#else
-  vec_scal<float, platform::isa_any>(n, a, x, y);
-#endif
-}
-
-template <>
-inline void vec_scal<float, platform::avx2>(const int n, const float a,
-                                            const float* x, float* y) {
-  vec_scal<float, platform::avx>(n, a, x, y);
-}
-
-template <>
-inline void vec_scal<float, platform::avx512f>(const int n, const float a,
-                                               const float* x, float* y) {
-  // TODO(TJ): enable me
-  vec_scal<float, platform::avx2>(n, a, x, y);
-}
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_sum(const size_t n, const T* x, T* s) {
-  s[0] = x[0];
-  for (size_t i = 1; i < n; ++i) {
-    s[0] += x[i];
-  }
-}
-
-template <>
-inline void vec_sum<float, platform::avx>(const size_t n, const float* x,
-                                          float* s) {
-#ifdef __AVX__
-  constexpr unsigned int block = YMM_FLOAT_BLOCK;
-  if (n < block) {
-    vec_sum<float, platform::isa_any>(n, x, s);
-    return;
-  }
-
-  unsigned int i, end;
-  i = end = 0;
-  s[0] = 0.f;
-
-  end = n & ~(block - 1);
-  __m256 tmp = _mm256_setzero_ps();
-  for (i = 0; i < end; i += block) {
-    tmp = _mm256_add_ps(tmp, _mm256_loadu_ps(x + i));
-  }
-
-  __m256 hsum = _mm256_hadd_ps(tmp, tmp);
-  hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
-  _mm_store_ss(s, _mm_hadd_ps(_mm256_castps256_ps128(hsum),
-                              _mm256_castps256_ps128(hsum)));
-
-  for (; i < n; i++) {
-    s[0] += x[i];
-  }
-#else
-  vec_sum<float, platform::isa_any>(n, x, s);
-#endif
-}
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
-  for (size_t i = 0; i < n; ++i) {
-    z[i] = x[i] * y[i];
-  }
-}
-
-template <>
-inline void vec_mul<float, platform::avx>(const size_t n, const float* x,
-                                          const float* y, float* z) {
-#ifdef __AVX__
-  constexpr unsigned int block = YMM_FLOAT_BLOCK;
-  if (n < block) {
-    vec_mul<float, platform::isa_any>(n, x, y, z);
-    return;
-  }
-
-  unsigned int i = 0, end = 0;
-  end = n & ~(block - 1);
-  for (i = 0; i < end; i += block) {
-    _mm256_storeu_ps(
-        z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
-  }
-
-  for (; i < n; i++) {
-    z[i] = x[i] * y[i];
-  }
-#else
-  vec_mul<float, platform::isa_any>(n, x, y, z);
-#endif
-}
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
-  z[0] = x[0] * y[0];
-  for (size_t i = 1; i < n; ++i) {
-    z[0] += x[i] * y[i];
-  }
-}
-
-template <>
-inline void vec_mul_reduce<float, platform::avx>(const size_t n, const float* x,
-                                                 const float* y, float* z) {
-#ifdef __AVX__
-  constexpr unsigned int block = YMM_FLOAT_BLOCK;
-  if (n < block) {
-    vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
-    return;
-  }
-
-  unsigned int i = 0, end = 0;
-  z[0] = 0.f;
-
-  end = n & ~(block - 1);
-  __m256 tmp = _mm256_setzero_ps();
-  for (i = 0; i < end; i += block) {
-    tmp = _mm256_add_ps(
-        tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
-  }
-
-  __m256 hsum = _mm256_hadd_ps(tmp, tmp);
-  hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
-  _mm_store_ss(z, _mm_hadd_ps(_mm256_castps256_ps128(hsum),
-                              _mm256_castps256_ps128(hsum)));
-
-  for (; i < n; i++) {
-    z[0] += x[i] * y[i];
-  }
-#else
-  vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
-#endif
-}
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a - x[i];
-  }
-}
-
-template <>
-inline void vec_bias_sub<float, platform::avx>(const int n, const float a,
-                                               const float* x, float* y) {
-#ifdef __AVX__
-  constexpr int block = YMM_FLOAT_BLOCK;
-  if (n < block) {
-    vec_bias_sub<float, platform::isa_any>(n, a, x, y);
-    return;
-  }
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-  __m256 bias = _mm256_set1_ps(a);
-  __m256 tmp;
-#define MOVE_ONE_STEP             \
-  tmp = _mm256_loadu_ps(x + i);   \
-  tmp = _mm256_sub_ps(bias, tmp); \
-  _mm256_storeu_ps(y + i, tmp)
-  for (i = 0; i < end; i += block) {
-    MOVE_ONE_STEP;
-  }
-#undef MOVE_ONE_STEP
-  if (rest == 0) {
-    return;
-  }
-  // can not continue move step if src and dst are inplace
-  for (i = n - rest; i < n; ++i) {
-    y[i] = a - x[i];
-  }
-#else
-  vec_bias_sub<float, platform::isa_any>(n, a, x, y);
-#endif
-}
-
-template <>
-inline void vec_bias_sub<float, platform::avx2>(const int n, const float a,
-                                                const float* x, float* y) {
-  vec_bias_sub<float, platform::avx>(n, a, x, y);
-}
-
-template <>
-inline void vec_bias_sub<float, platform::avx512f>(const int n, const float a,
-                                                   const float* x, float* y) {
-  // TODO(TJ): enable me
-  vec_bias_sub<float, platform::avx2>(n, a, x, y);
-}
-
-// out = x*y + (1-x)*z
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
-  for (int i = 0; i < n; ++i) {
-    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
-  }
-}
-
-template <>
-inline void vec_cross<float, platform::avx>(const int n, const float* x,
-                                            const float* y, const float* z,
-                                            float* out) {
-#ifdef __AVX__
-  constexpr int block = YMM_FLOAT_BLOCK;
-  if (n < block) {
-    vec_cross<float, platform::isa_any>(n, x, y, z, out);
-    return;
-  }
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-  __m256 bias = _mm256_set1_ps(1.f);
-  __m256 tmpx, tmpy, tmpz;
-  for (i = 0; i < end; i += block) {
-    tmpx = _mm256_loadu_ps(x + i);
-    tmpy = _mm256_loadu_ps(y + i);
-    tmpz = _mm256_loadu_ps(z + i);
-    tmpy = _mm256_mul_ps(tmpx, tmpy);
-    tmpx = _mm256_sub_ps(bias, tmpx);
-    tmpz = _mm256_mul_ps(tmpx, tmpz);
-    tmpz = _mm256_add_ps(tmpy, tmpz);
-    _mm256_storeu_ps(out + i, tmpz);
-  }
-  if (rest == 0) {
-    return;
-  }
-  // can not continue move step if src and dst are inplace
-  for (i = n - rest; i < n; ++i) {
-    out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
-  }
-#else
-  vec_cross<float, platform::isa_any>(n, x, y, z, out);
-#endif
-}
-
-template <>
-inline void vec_cross<float, platform::avx2>(const int n, const float* x,
-                                             const float* y, const float* z,
-                                             float* out) {
-  vec_cross<float, platform::avx>(n, x, y, z, out);
-}
-
-template <>
-inline void vec_cross<float, platform::avx512f>(const int n, const float* x,
-                                                const float* y, const float* z,
-                                                float* out) {
-  // TODO(TJ): enable me
-  vec_cross<float, platform::avx>(n, x, y, z, out);
-}
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_clip(const size_t n, const T a, const T* x, T* y) {
-  for (size_t i = 0; i < n; ++i) {
-    y[i] = x[i] < a ? a : x[i];
-  }
-}
-
-template <>
-inline void vec_clip<float, platform::avx>(const size_t n, const float a,
-                                           const float* x, float* y) {
-#ifdef __AVX__
-  constexpr unsigned int block = YMM_FLOAT_BLOCK;
-  if (n < block) {
-    vec_clip<float, platform::isa_any>(n, a, x, y);
-    return;
-  }
-
-  unsigned int i = 0, end = 0;
-  end = n & ~(block - 1);
-  __m256 threshold = _mm256_set1_ps(a);
-
-  for (i = 0; i < end; i += block) {
-    _mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold));
-  }
-
-  for (; i < n; i++) {
-    y[i] = x[i] < a ? a : x[i];
-  }
-#else
-  vec_clip<float, platform::isa_any>(n, a, x, y);
-#endif
-}
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] + a;
-  }
-}
-
-template <>
-inline void vec_add_bias<float, platform::avx>(const int n, const float a,
-                                               const float* x, float* y) {
-#ifdef __AVX__
-  constexpr int block = YMM_FLOAT_BLOCK;
-  if (n < block) {
-    vec_add_bias<float, platform::isa_any>(n, a, x, y);
-    return;
-  }
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-  __m256 bias = _mm256_set1_ps(a);
-  __m256 tmp;
-#define MOVE_ONE_STEP             \
-  tmp = _mm256_loadu_ps(x + i);   \
-  tmp = _mm256_add_ps(tmp, bias); \
-  _mm256_storeu_ps(y + i, tmp)
-  for (i = 0; i < end; i += block) {
-    MOVE_ONE_STEP;
-  }
-#undef MOVE_ONE_STEP
-  if (rest == 0) {
-    return;
-  }
-  // can not continue move step if src and dst are inplace
-  for (i = n - rest; i < n; ++i) {
-    y[i] = x[i] + a;
-  }
-#else
-  vec_add_bias<float, platform::isa_any>(n, a, x, y);
-#endif
-}
-
-template <>
-inline void vec_add_bias<float, platform::avx2>(const int n, const float a,
-                                                const float* x, float* y) {
-  vec_add_bias<float, platform::avx>(n, a, x, y);
-}
-
-template <>
-inline void vec_add_bias<float, platform::avx512f>(const int n, const float a,
-                                                   const float* x, float* y) {
-  // TODO(TJ): enable me
-  vec_add_bias<float, platform::avx2>(n, a, x, y);
-}
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_identity(const int n, const T* x, T* y) {
-  // do nothing
-  return;
-}
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_sigmoid(const int n, const T* x, T* y) {
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = static_cast<T>(0) - y[i];
-  }
-  vec_exp<T>(n, y, y);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
-  }
-}
-
-template <>
-inline void vec_sigmoid<float, platform::avx>(const int n, const float* x,
-                                              float* y) {
-#ifdef __AVX__
-  constexpr int block = YMM_FLOAT_BLOCK;
-  if (n < block) {
-    vec_sigmoid<float, platform::isa_any>(n, x, y);
-    return;
-  }
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
-  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
-  __m256 zeros = _mm256_setzero_ps();
-  __m256 tmp;
-#define MOVE_ONE_STEP              \
-  tmp = _mm256_loadu_ps(x + i);    \
-  tmp = _mm256_max_ps(tmp, min);   \
-  tmp = _mm256_min_ps(tmp, max);   \
-  tmp = _mm256_sub_ps(zeros, tmp); \
-  _mm256_storeu_ps(y + i, tmp)
-  for (i = 0; i < end; i += block) {
-    MOVE_ONE_STEP;
-  }
-#undef MOVE_ONE_STEP
-  if (rest != 0) {
-    // can not continue move step since the src and dst address could be equal
-    const float xmin = SIGMOID_THRESHOLD_MIN;
-    const float xmax = SIGMOID_THRESHOLD_MAX;
-    for (i = n - rest; i < n; ++i) {
-      y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i]));
-    }
-  }
-
-  vec_exp<float>(n, y, y);
-
-  __m256 ones = _mm256_set1_ps(1.0f);
-#define MOVE_ONE_STEP             \
-  tmp = _mm256_loadu_ps(y + i);   \
-  tmp = _mm256_add_ps(ones, tmp); \
-  tmp = _mm256_div_ps(ones, tmp); \
-  _mm256_storeu_ps(y + i, tmp)
-  for (i = 0; i < end; i += block) {
-    MOVE_ONE_STEP;
-  }
-#undef MOVE_ONE_STEP
-  if (rest == 0) {
-    return;
-  }
-  // can not continue move step
-  for (i = n - rest; i < n; ++i) {
-    y[i] = 1.f / (1.f + y[i]);
-  }
-#else
-  vec_sigmoid<float, platform::isa_any>(n, x, y);
-#endif
-}
-
-template <>
-inline void vec_sigmoid<float, platform::avx2>(const int n, const float* x,
-                                               float* y) {
-  vec_sigmoid<float, platform::avx>(n, x, y);
-}
-
-template <>
-inline void vec_sigmoid<float, platform::avx512f>(const int n, const float* x,
-                                                  float* y) {
-  // TODO(TJ): enable me
-  vec_sigmoid<float, platform::avx2>(n, x, y);
-}
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_tanh(const int n, const T* x, T* y) {
-  vec_scal<T, isa>(n, static_cast<T>(2), x, y);
-  vec_sigmoid<T, isa>(n, y, y);
-  vec_scal<T>(n, static_cast<T>(2), y);
-  vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
-}
-
-// TODO(TJ): make relu clip
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-inline void vec_relu(const int n, const T* x, T* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
-}
-
-template <>
-inline void vec_relu<float, platform::avx>(const int n, const float* x,
-                                           float* y) {
-#ifdef __AVX__
-  constexpr int block = YMM_FLOAT_BLOCK;
-  if (n < block * 4) {
-    vec_relu<float, platform::isa_any>(n, x, y);
-    return;
-  }
-
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-  __m256 zeros = _mm256_setzero_ps();
-  __m256 tmp;
-#define MOVE_ONE_STEP              \
-  tmp = _mm256_loadu_ps(x + i);    \
-  tmp = _mm256_max_ps(tmp, zeros); \
-  _mm256_storeu_ps(y + i, tmp)
-  for (i = 0; i < end; i += block) {
-    MOVE_ONE_STEP;
-  }
-  if (rest == 0) {
-    return;
-  }
-  i = n - block;
-  MOVE_ONE_STEP;
-#undef MOVE_ONE_STEP
-
-#else
-  vec_relu<float, platform::isa_any>(n, x, y);
-#endif
-}
-
-template <>
-inline void vec_relu<float, platform::avx2>(const int n, const float* x,
-                                            float* y) {
-  vec_relu<float, platform::avx>(n, x, y);
-}
-
-template <>
-inline void vec_relu<float, platform::avx512f>(const int n, const float* x,
-                                               float* y) {
-  // TODO(TJ): enable me
-  vec_relu<float, platform::avx2>(n, x, y);
-}
-
-// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
-
-template <typename T, platform::cpu_isa_t isa = platform::isa_any>
-class VecActivations {
- public:
-  std::function<void(const int, const T*, T*)> operator()(
-      const std::string& type) {
-    if (type == "sigmoid") {
-      return vec_sigmoid<T, isa>;
-    } else if (type == "relu") {
-      return vec_relu<T, isa>;
-    } else if (type == "tanh") {
-      return vec_tanh<T, isa>;
-    } else if (type == "identity" || type == "") {
-      return vec_identity<T, isa>;
-    }
-    PADDLE_THROW("Not support type: %s", type);
-  }
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
deleted file mode 100644
index f2f80f836fdce21e4f41ef11472805253cd6ec57..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ /dev/null
@@ -1,324 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cmath>
-#include <cstring>
-#include <random>
-#include <vector>
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/platform/port.h"
-
-inline double GetCurrentUS() {
-  struct timeval time;
-  gettimeofday(&time, NULL);
-  return 1e+6 * time.tv_sec + time.tv_usec;
-}
-constexpr int repeat = 1000;
-
-template <typename T>
-inline T _sigmoid(T x) {
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  T tmp = (x < min) ? min : ((x > max) ? max : x);
-  return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
-}
-
-template <typename T>
-inline T _tanh(T x) {
-  return static_cast<T>(2) * _sigmoid<T>(static_cast<T>(2) * x) -
-         static_cast<T>(1);
-}
-
-template <typename T>
-void ref_sigmoid(const int n, const T* x, T* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = _sigmoid(x[i]);
-  }
-}
-
-template <typename T>
-void ref_tanh(const int n, const T* x, T* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = _tanh(x[i]);
-  }
-}
-template <typename T>
-void ref_relu(const int n, const T* x, T* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
-}
-
-template <typename T>
-void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
-               const T upper = static_cast<T>(20.f)) {
-  static unsigned int seed = 100;
-  std::mt19937 rng(seed++);
-  std::uniform_real_distribution<double> uniform_dist(0, 1);
-  for (int i = 0; i < n; ++i) {
-    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-  }
-}
-
-template <typename T>
-void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
-                  std::function<void(const int, const T*, T*)> ref) {
-  std::vector<T> x(n);
-  std::vector<T> ytgt(n), yref(n);
-  RandomVec<T>(n, x.data());
-
-  const T* x_data = x.data();
-  T* ytgt_data = ytgt.data();
-  T* yref_data = yref.data();
-  auto st = GetCurrentUS();
-  for (int i = 0; i < repeat; ++i) {
-    tgt(n, x_data, ytgt_data);
-  }
-  auto mt = GetCurrentUS();
-  for (int i = 0; i < repeat; ++i) {
-    ref(n, x_data, yref_data);
-  }
-  auto et = GetCurrentUS();
-
-  VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat
-          << " us, tgt takes: " << (mt - st) / repeat;
-  for (int i = 0; i < n; ++i) {
-    EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
-  }
-}
-
-TEST(CpuVecTest, sigmoid) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx>,
-                        ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx2>,
-                        ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx512f>,
-                        ref_sigmoid<float>);
-  }
-  TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
-}
-
-TEST(CpuVecTest, tanh) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, platform::avx512f>,
-                        ref_tanh<float>);
-  }
-  TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
-}
-
-TEST(CpuVecTest, relu) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, platform::avx512f>,
-                        ref_relu<float>);
-  }
-  TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
-}
-
-template <typename T>
-void compare_sum(size_t n, std::function<void(const size_t, const T*, T*)> tgt,
-                 std::function<void(const size_t, const T*, T*)> ref) {
-  std::vector<T> x(n);
-  T ytgt_data, yref_data;
-  RandomVec<T>(n, x.data(), static_cast<T>(-2), static_cast<T>(2));
-
-  const T* x_data = x.data();
-  tgt(n, x_data, &ytgt_data);
-  ref(n, x_data, &yref_data);
-  EXPECT_NEAR(ytgt_data, yref_data, 1e-3);
-}
-
-TEST(CpuVecTest, vec_sum) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    compare_sum<float>(sz, vec_sum<float>, vec_sum<float, platform::isa_any>);
-    compare_sum<float>(sz, vec_sum<float, platform::avx>,
-                       vec_sum<float, platform::isa_any>);
-  }
-  compare_sum<double>(30U, vec_sum<double>, vec_sum<double, platform::isa_any>);
-}
-
-template <typename T>
-void compare_clip(
-    size_t n, T threshold,
-    std::function<void(const size_t, const T, const T*, T*)> tgt,
-    std::function<void(const size_t, const T, const T*, T*)> ref) {
-  std::vector<T> x(n);
-  std::vector<T> ytgt(n), yref(n);
-  RandomVec<T>(n, x.data(), static_cast<T>(-2), static_cast<T>(2));
-
-  const T* x_data = x.data();
-  T* yref_data = yref.data();
-  T* ytgt_data = ytgt.data();
-  tgt(n, threshold, x_data, ytgt_data);
-  ref(n, threshold, x_data, yref_data);
-  for (int i = 0; i < n; ++i) {
-    EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
-  }
-}
-
-TEST(CpuVecTest, vec_clip) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    compare_clip<float>(sz, -4.f, vec_clip<float>,
-                        vec_clip<float, platform::isa_any>);
-    compare_clip<float>(sz, -1.1f, vec_clip<float, platform::avx>,
-                        vec_clip<float, platform::isa_any>);
-  }
-  compare_clip<double>(30U, 1.0, vec_clip<double>,
-                       vec_clip<double, platform::isa_any>);
-}
-
-template <typename T>
-void compare_mul(
-    size_t n, std::function<void(const size_t, const T*, const T*, T*)> tgt,
-    std::function<void(const size_t, const T*, const T*, T*)> ref) {
-  std::vector<T> x(n), y(n);
-  std::vector<T> ztgt(n), zref(n);
-
-  RandomVec<T>(n, x.data(), static_cast<T>(-2), static_cast<T>(2));
-  RandomVec<T>(n, y.data(), static_cast<T>(-2), static_cast<T>(2));
-
-  const T* x_data = x.data();
-  const T* y_data = y.data();
-  T* ztgt_data = ztgt.data();
-  T* zref_data = zref.data();
-
-  tgt(n, x_data, y_data, ztgt_data);
-  ref(n, x_data, y_data, zref_data);
-  for (size_t i = 0; i < n; ++i) {
-    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-  }
-}
-
-TEST(CpuVecTest, vec_mul) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    compare_mul<float>(sz, vec_mul<float>, vec_mul<float, platform::isa_any>);
-    compare_mul<float>(sz, vec_mul<float, platform::avx>,
-                       vec_mul<float, platform::isa_any>);
-  }
-  compare_mul<double>(30U, vec_mul<double>, vec_mul<double, platform::isa_any>);
-}
-
-template <typename T>
-void compare_mul_reduce(
-    size_t n, std::function<void(const size_t, const T*, const T*, T*)> tgt,
-    std::function<void(const size_t, const T*, const T*, T*)> ref) {
-  std::vector<T> x(n), y(n);
-  T ztgt_data, zref_data;
-
-  RandomVec<T>(n, x.data(), static_cast<T>(-2), static_cast<T>(2));
-  RandomVec<T>(n, y.data(), static_cast<T>(-2), static_cast<T>(2));
-
-  const T* x_data = x.data();
-  const T* y_data = y.data();
-
-  tgt(n, x_data, y_data, &ztgt_data);
-  ref(n, x_data, y_data, &zref_data);
-  EXPECT_NEAR(ztgt_data, zref_data, 1e-3);
-}
-
-TEST(CpuVecTest, vec_mul_reduce) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    compare_mul_reduce<float>(sz, vec_mul_reduce<float>,
-                              vec_mul_reduce<float, platform::isa_any>);
-    compare_mul_reduce<float>(sz, vec_mul_reduce<float, platform::avx>,
-                              vec_mul_reduce<float, platform::isa_any>);
-  }
-  compare_mul_reduce<double>(30U, vec_mul_reduce<double>,
-                             vec_mul_reduce<double, platform::isa_any>);
-}
-
-template <typename T>
-void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
-                 std::function<void(const int, const T*, T*)> ref) {
-  std::vector<T> x(n);
-  std::vector<T> ytgt(n), yref(n);
-  RandomVec<T>(n, x.data());
-
-  const T* x_data = x.data();
-  T* yref_data = yref.data();
-  T* ytgt_data = ytgt.data();
-  std::memcpy(yref_data, x_data, sizeof(T) * n);
-  std::memcpy(ytgt_data, x_data, sizeof(T) * n);
-
-  ref(n, yref_data, yref_data);
-  tgt(n, ytgt_data, ytgt_data);
-
-  for (int i = 0; i < n; ++i) {
-    EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
-  }
-}
-
-TEST(CpuVecTest, inplace_sigmoid) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, platform::avx>,
-                       ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, platform::avx2>,
-                       ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, platform::avx512f>,
-                       ref_sigmoid<float>);
-  }
-  TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
-}
-
-TEST(CpuVecTest, inplace_tanh) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, platform::avx512f>, ref_tanh<float>);
-  }
-  TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
-}
-
-TEST(CpuVecTest, inplace_relu) {
-  namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
-  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, platform::avx512f>, ref_relu<float>);
-  }
-  TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
-}
diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc
deleted file mode 100644
index 9f7884fe05f2f446b1fb6eb7dfd53e293d8e19aa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/cross_entropy.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out,
-                  const framework::Tensor* prob,
-                  const framework::Tensor* labels, const bool softLabel,
-                  const int ignore_index, const int axis_dim) {
-    const int batch_size = prob->dims()[0];
-    const int num_classes = prob->dims()[1];
-    const int num_remain = num_classes / axis_dim;
-
-    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
-
-    if (softLabel) {
-      auto in = EigenMatrix<T>::From(*prob);
-      auto lbl = EigenMatrix<T>::From(*labels);
-      auto loss = EigenMatrix<T>::From(*out);
-
-      loss.device(*ctx.eigen_device()) =
-          -((lbl * in.log().unaryExpr(math::TolerableValue<T>()))
-                .reshape(batch_axis_remain)
-                .sum(Eigen::DSizes<int, 1>(1)));
-    } else {
-      const T* prob_data = prob->data<T>();
-      T* loss_data = out->data<T>();
-
-      const int64_t* label_data = labels->data<int64_t>();
-      for (int i = 0; i < batch_size; ++i) {
-        for (int j = 0; j < num_remain; j++) {
-          int lbl = label_data[i * num_remain + j];
-          PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index);
-          int index = i * num_classes + lbl * num_remain + j;
-          int loss_idx = i * num_remain + j;
-          loss_data[loss_idx] =
-              lbl == ignore_index
-                  ? 0
-                  : -math::TolerableValue<T>()(std::log(prob_data[index]));
-        }
-      }
-    }
-  }
-};
-
-template class CrossEntropyFunctor<platform::CPUDeviceContext, float>;
-template class CrossEntropyFunctor<platform::CPUDeviceContext, double>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
deleted file mode 100644
index 2d871c6e14b855c01b7783bd90103a1e49c71ac2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math.h"
-#include "paddle/fluid/operators/math/cross_entropy.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
-                                   const int N, const int D,
-                                   const int ignore_index) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    PADDLE_ENFORCE(label[i] >= 0 && label[i] < D || label[i] == ignore_index,
-                   "label[%d] expected >= 0 and < %ld, or == %ld, but got "
-                   "%ld. Please check input value.",
-                   i, D, ignore_index, label[i]);
-    Y[i] = ignore_index == label[i]
-               ? static_cast<T>(0)
-               : -math::TolerableValue<T>()(real_log(X[i * D + label[i]]));
-  }
-}
-
-template <typename T>
-__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
-                                       const int class_num) {
-  int tid = threadIdx.x;
-  T val(0);
-
-  int idx = blockIdx.x * class_num + tid;
-  int end = blockIdx.x * class_num + class_num;
-  for (; idx < end; idx += blockDim.x) {
-    val += math::TolerableValue<T>()(real_log(X[idx])) * label[idx];
-  }
-
-  val = paddle::platform::reduceSum(val, tid, blockDim.x);
-  if (threadIdx.x == 0) {
-    Y[blockIdx.x] = -val;
-  }
-}
-
-template <typename T>
-class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  framework::Tensor* out, const framework::Tensor* prob,
-                  const framework::Tensor* labels, const bool softLabel,
-                  const int ignore_index, const int axis_dim) {
-    const T* prob_data = prob->data<T>();
-    T* loss_data = out->mutable_data<T>(ctx.GetPlace());
-
-    int batch_size = prob->dims()[0];
-    int class_num = prob->dims()[1];
-
-    if (softLabel) {
-      const T* label_data = labels->data<T>();
-      int block = class_num > 512
-                      ? 512
-                      : pow(2, static_cast<int>(std::log2(class_num)));
-
-      SoftCrossEntropyKernel<T><<<batch_size, block, 0, ctx.stream()>>>(
-          loss_data, prob_data, label_data, class_num);
-    } else {
-      const int64_t* label_data = labels->data<int64_t>();
-      int block = 512;
-      int grid = (batch_size + block - 1) / block;
-      CrossEntropyKernel<T><<<grid, block, 0, ctx.stream()>>>(
-          loss_data, prob_data, label_data, batch_size, class_num,
-          ignore_index);
-    }
-  }
-};
-
-template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext,
-                                   platform::float16>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
deleted file mode 100644
index db19818951d7c9f7e55f8acf2f2de7e3e3819694..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <limits>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-struct TolerableValue {
-  HOSTDEVICE T operator()(const T& x) const {
-    PADDLE_ENFORCE(std::is_floating_point<T>::value,
-                   "TolerableValue should be float in cross_entropy.");
-    const T kApproInf = 1e20;
-
-    if (x == INFINITY) return kApproInf;
-    if (x == -INFINITY) return -kApproInf;
-    return x;
-  }
-};
-
-// NOTE(dzh): float16 value clip behave different.
-// 1. Our ValueClipping has a  hardcore threshold 1e20
-// for float number. 1e20 will resulting in overflow in float16.
-// 2. float16 should expose the the real number overflow to python.
-// because mixed-training depends the inf/nan value to determine
-// if the scale value will be adjusted.
-// Also. In standard implementation of cross entropy, other
-// framework not has the ValueClipping.
-template <>
-struct TolerableValue<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& x) const {
-    if (platform::isfinite(x))
-      return x;
-    else if (x > static_cast<platform::float16>(0))
-      return std::numeric_limits<platform::float16>::max();
-    else
-      return std::numeric_limits<platform::float16>::min();
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CrossEntropyFunctor {
- public:
-  void operator()(const DeviceContext& context, framework::Tensor* out,
-                  const framework::Tensor* prob,
-                  const framework::Tensor* labels, const bool softLabel,
-                  const int ignore_index, const int axis_dim);
-};
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu
deleted file mode 100644
index a372f6fa718e5db6b8de5d77391e0171d82f18dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ /dev/null
@@ -1,669 +0,0 @@
-/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <vector>
-#include "cub/cub.cuh"
-#include "paddle/fluid/operators/math/depthwise_conv.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
-  typedef cub::WarpReduce<T> WarpReduce;
-  typename WarpReduce::TempStorage temp_storage;
-  value = WarpReduce(temp_storage).Sum(value);
-  if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
-}
-
-#define ARG_DEFINE_KernelDepthwiseConv                                         \
-  const T *const input_data, const T *const filter_data, const int batch_size, \
-      const int output_channels, const int output_height,                      \
-      const int output_width, const int input_channels,                        \
-      const int input_height, const int input_width,                           \
-      const int filter_multiplier, const int filter_height,                    \
-      const int filter_width, const int stride_height, const int stride_width, \
-      const int padding_height, const int padding_width,                       \
-      const int dilate_height, const int dilate_width, T *const output_data
-
-// A Cuda kernel to compute the depthwise convolution forward pass
-// in NCHW format.
-template <typename T, bool fuse_relu_before_conv>
-__device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) {
-  for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
-    for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
-      const int batch = blockIdx.y;
-      const int c_out = blockIdx.x;
-
-      const int c_in = c_out / filter_multiplier;
-      const T* weight = filter_data + c_out * filter_height * filter_width;
-      T value = 0;
-      const int h_in_start = -padding_height + h_out * stride_height;
-      const int w_in_start = -padding_width + w_out * stride_width;
-      const int h_in_end = h_in_start + filter_height * dilate_height;
-      const int w_in_end = w_in_start + filter_width * dilate_width;
-
-      const int in_offset =
-          ((batch * input_channels + c_in) * input_height) * input_width;
-
-      const int h_end = h_in_end < input_height ? h_in_end : input_height;
-      const int w_end = w_in_end < input_width ? w_in_end : input_width;
-      const int h_start = h_in_start > 0 ? h_in_start : 0;
-      const int w_start = w_in_start > 0 ? w_in_start : 0;
-      int weight_offset = 0;
-
-      for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) {
-        for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) {
-          if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
-              w_in < w_end) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            if (fuse_relu_before_conv) {
-              value += weight[weight_offset] * max(0.0f, input_data[offset]);
-            } else {
-              value += weight[weight_offset] * input_data[offset];
-            }
-          }
-          weight_offset++;
-        }
-      }
-      int index =
-          ((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
-          w_out;
-      output_data[index] = value;
-    }
-  }
-}
-
-template <typename T, int c_filter, bool fuse_relu_before_conv>
-__device__ __inline__ void KernelDepthwiseConvCFilter(
-    ARG_DEFINE_KernelDepthwiseConv) {
-  const int kWeghtSize = c_filter * c_filter;
-  T r_weight[kWeghtSize];
-  const int batch = blockIdx.y;
-  const int c_out = blockIdx.x;
-  const T* weight = filter_data + c_out * c_filter * c_filter;
-  for (int i = 0; i < c_filter * c_filter; i++) r_weight[i] = weight[i];
-
-  for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
-    for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
-      const int batch = blockIdx.y;
-      const int c_out = blockIdx.x;
-
-      const int c_in = c_out / filter_multiplier;
-      T value = 0;
-      const int h_in_start = -padding_height + h_out * stride_height;
-      const int w_in_start = -padding_width + w_out * stride_width;
-      const int h_in_end = h_in_start + c_filter * dilate_height;
-      const int w_in_end = w_in_start + c_filter * dilate_width;
-
-      const int in_offset =
-          ((batch * input_channels + c_in) * input_height) * input_width;
-
-      const int h_end = h_in_end < input_height ? h_in_end : input_height;
-      const int w_end = w_in_end < input_width ? w_in_end : input_width;
-      const int h_start = h_in_start > 0 ? h_in_start : 0;
-      const int w_start = w_in_start > 0 ? w_in_start : 0;
-
-      for (int h_in = h_in_start, h_f = 0; h_f < c_filter;
-           h_in += dilate_height, h_f++) {
-        for (int w_in = w_in_start, w_f = 0; w_f < c_filter;
-             w_in += dilate_width, w_f++) {
-          if (h_in >= 0 && h_in < input_height && w_in >= 0 &&
-              w_in < input_width) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            if (fuse_relu_before_conv) {
-              value += r_weight[h_f * c_filter + w_f] *
-                       max(0.0f, input_data[offset]);
-            } else {
-              value += r_weight[h_f * c_filter + w_f] * input_data[offset];
-            }
-          }
-        }
-      }
-      int index =
-          ((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
-          w_out;
-      output_data[index] = value;
-    }
-  }
-}
-
-template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
-          bool fuse_relu_before_conv>
-__global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
-  if (c_filter_multiplier == 0) {
-    if (c_filter == -1)
-      KernelDepthwiseConv<T, fuse_relu_before_conv>(
-          input_data, filter_data, batch_size, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          filter_multiplier, filter_height, filter_width, stride_height,
-          stride_width, padding_height, padding_width, dilate_height,
-          dilate_width, output_data);
-    else
-      KernelDepthwiseConvCFilter<T, c_filter, fuse_relu_before_conv>(
-          input_data, filter_data, batch_size, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          filter_multiplier, filter_height, filter_width, stride_height,
-          stride_width, padding_height, padding_width, dilate_height,
-          dilate_width, output_data);
-  } else {
-    if (c_filter == -1)
-      KernelDepthwiseConv<T, fuse_relu_before_conv>(
-          input_data, filter_data, batch_size, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          c_filter_multiplier, filter_height, filter_height, c_stride, c_stride,
-          padding_height, padding_width, dilate_height, dilate_width,
-          output_data);
-    else
-      KernelDepthwiseConvCFilter<T, c_filter, fuse_relu_before_conv>(
-          input_data, filter_data, batch_size, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          c_filter_multiplier, filter_height, filter_height, c_stride, c_stride,
-          padding_height, padding_width, dilate_height, dilate_width,
-          output_data);
-  }
-}
-
-// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
-#define ARG_DEFINE_KernelDepthwiseConvInputGrad                                \
-  const T *const input_data, const T *const output_grad_data,                  \
-      const T *const filter_data, const int batch_size,                        \
-      const int output_channels, const int output_height,                      \
-      const int output_width, const int input_channels,                        \
-      const int input_height, const int input_width,                           \
-      const int filter_multiplier, const int filter_height,                    \
-      const int filter_width, const int stride_height, const int stride_width, \
-      const int padding_height, const int padding_width,                       \
-      const int dilate_height, const int dilate_width,                         \
-      T *const input_grad_data
-
-template <typename T, bool fuse_relu_before_conv>
-__device__ __inline__ void KernelDepthwiseConvInputGrad(
-    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
-  for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
-    for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
-      const int batch = blockIdx.y;
-      const int c_in = blockIdx.x;
-
-      const int c_out_start = c_in * filter_multiplier;
-
-      int h_out_start =
-          h_in - (filter_height - 1) * dilate_height + padding_height;
-
-      int h_out_end = h_in + padding_height;
-
-      int w_out_start =
-          w_in - (filter_width - 1) * dilate_width + padding_width;
-
-      int w_out_end = w_in + padding_width;
-
-      T value = 0;
-      int index =
-          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
-          w_in;
-      if (fuse_relu_before_conv) {
-        if (input_data[index] <= 0) {
-          input_grad_data[index] = 0;
-          continue;
-        }
-      }
-
-      for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
-           c_out++) {
-        int filter_offset = (c_out + 1) * filter_height * filter_width;
-        for (int h_out = h_out_start; h_out <= h_out_end;
-             h_out += dilate_height) {
-          for (int w_out = w_out_start; w_out <= w_out_end;
-               w_out += dilate_width) {
-            filter_offset--;
-            int s_h_out = h_out / stride_height;
-            int s_w_out = w_out / stride_width;
-            if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
-                s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
-                s_w_out < output_width) {
-              const int output_grad_offset =
-                  ((batch * output_channels + c_out) * output_height +
-                   s_h_out) *
-                      output_width +
-                  s_w_out;
-              value += output_grad_data[output_grad_offset] *
-                       filter_data[filter_offset];
-            }
-          }
-        }
-      }
-      input_grad_data[index] = value;
-    }
-  }
-}
-
-template <typename T, int c_filter, int c_filter_multiplier,
-          bool fuse_relu_before_conv>
-__device__ __inline__ void KernelDepthwiseConvInputGradCFilter(
-    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
-  const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1;
-  T r_weight[kWeghtSize];
-  const int batch = blockIdx.y;
-  const int c_in = blockIdx.x;
-
-  for (int c_i = 0; c_i < filter_multiplier; c_i++) {
-    int c_out = c_in * filter_multiplier + c_i;
-    const T* weight = filter_data + c_out * c_filter * c_filter;
-    for (int i = 0; i < c_filter * c_filter; i++)
-      r_weight[i + c_i * c_filter * c_filter] =
-          weight[c_filter * c_filter - i - 1];
-  }
-
-  for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
-    for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
-      const int batch = blockIdx.y;
-      const int c_in = blockIdx.x;
-
-      int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height;
-
-      int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width;
-
-      T value = 0;
-      int index =
-          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
-          w_in;
-      if (fuse_relu_before_conv) {
-        if (input_data[index] <= 0) {
-          input_grad_data[index] = 0;
-          continue;
-        }
-      }
-
-      for (int c_i = 0; c_i < filter_multiplier; c_i++) {
-        int c_out = c_in * filter_multiplier + c_i;
-        for (int h_out = h_out_start, h_f = 0; h_f < c_filter;
-             h_out += dilate_height, h_f++) {
-          for (int w_out = w_out_start, w_f = 0; w_f < c_filter;
-               w_out += dilate_width, w_f++) {
-            int s_h_out = h_out / stride_height;
-            int s_w_out = w_out / stride_width;
-            if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
-                s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
-                s_w_out < output_width) {
-              const int output_grad_offset =
-                  ((batch * output_channels + c_out) * output_height +
-                   s_h_out) *
-                      output_width +
-                  s_w_out;
-              value +=
-                  output_grad_data[output_grad_offset] *
-                  r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter];
-            }
-          }
-        }
-      }
-      input_grad_data[index] = value;
-    }
-  }
-}
-
-template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
-          bool fuse_relu_before_conv>
-__global__ void KernelDepthwiseConvInputGradSp(
-    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
-  if (c_filter_multiplier == 0)
-    KernelDepthwiseConvInputGrad<T, fuse_relu_before_conv>(
-        input_data, output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        filter_multiplier, filter_height, filter_width, stride_height,
-        stride_width, padding_height, padding_width, dilate_height,
-        dilate_width, input_grad_data);
-  else if (c_filter == -1)
-    KernelDepthwiseConvInputGrad<T, fuse_relu_before_conv>(
-        input_data, output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
-        padding_height, padding_width, dilate_height, dilate_width,
-        input_grad_data);
-  else
-    KernelDepthwiseConvInputGradCFilter<T, c_filter, c_filter_multiplier,
-                                        fuse_relu_before_conv>(
-        input_data, output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
-        padding_height, padding_width, dilate_height, dilate_width,
-        input_grad_data);
-}
-
-// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
-template <typename T, bool fuse_relu_before_conv>
-__device__ __inline__ void KernelDepthwiseConvFilterGrad(
-    const T* output_grad_data, const T* input_data, const int num,
-    const int output_channels, const int output_height, const int output_width,
-    const int input_channels, const int input_height, const int input_width,
-    const int filter_multiplier, const int filter_height,
-    const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, const int dilate_height,
-    const int dilate_width, T* filter_grad_data) {
-  T s = 0;
-
-  int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
-
-  for (int image_w = threadIdx.x; image_w < output_width;
-       image_w += blockDim.x) {
-    for (int bid = 0; bid < num; bid++) {
-      for (int image_h = threadIdx.y; image_h < output_height;
-           image_h += blockDim.y) {
-        int kernel_id = blockIdx.z;
-        int kernel_h = blockIdx.y * dilate_height - padding_height;
-        int kernel_w = blockIdx.x * dilate_width - padding_width;
-
-        int image_hk = image_h * stride_height + kernel_h;
-        int image_wk = image_w * stride_width + kernel_w;
-        if (image_hk < 0 || image_hk >= input_height) continue;
-        if (image_wk < 0 || image_wk >= input_width) continue;
-#define gaid(N, C, H, W) \
-  ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
-        int input_id = ((bid * (gridDim.z / filter_multiplier) +
-                         kernel_id / filter_multiplier) *
-                            input_height +
-                        image_hk) *
-                           input_width +
-                       image_wk;
-        if (fuse_relu_before_conv) {
-          s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
-               max(0.0f, input_data[input_id]);
-        } else {
-          s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
-               input_data[input_id];
-        }
-
-#undef gaid
-      }
-    }
-  }
-  CudaAtomicAddWithWarp(&filter_grad_data[gbid], s);
-}
-
-template <typename T, int c_filter_multiplier, bool fuse_relu_before_conv>
-__global__ void KernelDepthwiseConvFilterGradSp(
-    const T* output_grad_data, const T* input_data, const int num,
-    const int output_channels, const int output_height, const int output_width,
-    const int input_channels, const int input_height, const int input_width,
-    const int filter_multiplier, const int filter_height,
-    const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, const int dilate_height,
-    const int dilate_width, T* filter_grad_data) {
-  if (c_filter_multiplier == 0)
-    KernelDepthwiseConvFilterGrad<T, fuse_relu_before_conv>(
-        output_grad_data, input_data, num, output_channels, output_height,
-        output_width, input_channels, input_height, input_width,
-        filter_multiplier, filter_height, filter_width, stride_height,
-        stride_width, padding_height, padding_width, dilate_height,
-        dilate_width, filter_grad_data);
-  else
-    KernelDepthwiseConvFilterGrad<T, fuse_relu_before_conv>(
-        output_grad_data, input_data, num, output_channels, output_height,
-        output_width, input_channels, input_height, input_width,
-        c_filter_multiplier, filter_height, filter_width, stride_height,
-        stride_width, padding_height, padding_width, dilate_height,
-        dilate_width, filter_grad_data);
-}
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <class T, bool fuse_relu_before_conv>
-class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
-                           fuse_relu_before_conv> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& filter,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
-                  framework::Tensor* output) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    const int ksize_height = filter.dims()[2];
-    const int ksize_width = filter.dims()[3];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-    const int dilate_height = dilations[0];
-    const int dilate_width = dilations[1];
-
-    const T* input_data = input.data<T>();
-    const T* filter_data = filter.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-
-    int thread = 512;
-    if (output_width > 1024 && output_width <= 2048)
-      thread = (output_width - 1) / 2 + 1;
-    else if (output_width > 512 && output_width <= 1024)
-      thread = output_width;
-    int blocks = std::min(std::max(thread / output_width, 1), output_height);
-    dim3 threads(std::min(output_width, thread), blocks, 1);
-    dim3 grid(output_channels, batch_size, 1);
-    int filter_multiplier = output_channels / input_channels;
-#define check_case(c_filter_multiplier, c_stride, c_filter)                  \
-  if (c_filter_multiplier == 0 ||                                            \
-      filter_multiplier == c_filter_multiplier &&                            \
-          stride_height == stride_width && stride_height == c_stride &&      \
-          (ksize_height == ksize_width && ksize_height == c_filter ||        \
-           c_filter == -1)) {                                                \
-    KernelDepthwiseConvSp<                                                   \
-        T, c_filter_multiplier, c_stride, c_filter,                          \
-        fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
-        input_data, filter_data, batch_size, output_channels, output_height, \
-        output_width, input_channels, input_height, input_width,             \
-        filter_multiplier, ksize_height, ksize_width, stride_height,         \
-        stride_width, padding_height, padding_width, dilate_height,          \
-        dilate_width, output_data);                                          \
-    return;                                                                  \
-  }
-    check_case(1, 1, 3);
-    check_case(1, 1, 5);
-    check_case(1, 1, -1);
-    check_case(1, 2, 3);
-    check_case(1, 2, 5);
-    check_case(1, 2, -1);
-    check_case(2, 1, 3);
-    check_case(2, 1, 5);
-    check_case(2, 1, -1);
-    check_case(2, 2, 3);
-    check_case(2, 2, 5);
-    check_case(2, 2, -1);
-    check_case(0, 0, -1);
-// NOTE(liangdun): 0,0 for other case
-// add other case if needed, e.g. check_case(2^n,1)
-#undef check_case
-  }
-};
-
-template <typename T, bool fuse_relu_before_conv>
-class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
-                                    fuse_relu_before_conv> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& filter,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output_grad.dims()[1];
-    const int output_height = output_grad.dims()[2];
-    const int output_width = output_grad.dims()[3];
-    const int ksize_height = filter.dims()[2];
-    const int ksize_width = filter.dims()[3];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-    const int dilate_height = dilations[0];
-    const int dilate_width = dilations[1];
-
-    const T* input_data = input.data<T>();
-    const T* filter_data = filter.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    int thread = 512;
-    if (input_width > 1024 && input_width <= 2048)
-      thread = (input_width - 1) / 2 + 1;
-    else if (input_width > 512 && input_width <= 1024)
-      thread = input_width;
-    int blocks = std::min(std::max(thread / input_width, 1), input_height);
-    dim3 threads(std::min(input_width, thread), blocks, 1);
-    dim3 grid(input_channels, batch_size, 1);
-    int filter_multiplier = output_channels / input_channels;
-
-#define check_case(c_filter_multiplier, c_stride, c_filter)             \
-  if (c_filter_multiplier == 0 ||                                       \
-      filter_multiplier == c_filter_multiplier &&                       \
-          stride_height == stride_width && stride_height == c_stride && \
-          (ksize_height == ksize_width && ksize_height == c_filter ||   \
-           c_filter == -1)) {                                           \
-    KernelDepthwiseConvInputGradSp<                                     \
-        T, c_filter_multiplier, c_stride, c_filter,                     \
-        fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
-        input_data, output_grad_data, filter_data, batch_size,          \
-        output_channels, output_height, output_width, input_channels,   \
-        input_height, input_width, filter_multiplier, ksize_height,     \
-        ksize_width, stride_height, stride_width, padding_height,       \
-        padding_width, dilate_height, dilate_width, input_grad_data);   \
-    return;                                                             \
-  }
-    check_case(1, 1, 3);
-    check_case(1, 1, 5);
-    check_case(1, 1, -1);
-    check_case(1, 2, 3);
-    check_case(1, 2, 5);
-    check_case(1, 2, -1);
-    check_case(2, 1, 3);
-    check_case(2, 1, 5);
-    check_case(2, 1, -1);
-    check_case(2, 2, 3);
-    check_case(2, 2, 5);
-    check_case(2, 2, -1);
-    check_case(0, 0, -1);
-// NOTE(liangdun): 0,0 for other case
-// add other case if needed, e.g. check_case(2^n,1)
-#undef check_case
-  }
-};
-
-template <typename T, bool fuse_relu_before_conv>
-class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
-                                     fuse_relu_before_conv> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
-                  framework::Tensor* filter_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output_grad.dims()[1];
-    const int output_height = output_grad.dims()[2];
-    const int output_width = output_grad.dims()[3];
-    const int ksize_height = filter_grad->dims()[2];
-    const int ksize_width = filter_grad->dims()[3];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-    const int dilate_height = dilations[0];
-    const int dilate_width = dilations[1];
-
-    const T* input_data = input.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
-
-    int block_size = 512;
-    if (output_width > 1024 && output_width <= 2048)
-      block_size = (output_width - 1) / 2 + 1;
-    else if (output_width > 512 && output_width <= 1024)
-      block_size = output_width;
-    int crop_output_height =
-        std::min(std::max(block_size / output_width, 1), output_height);
-    dim3 grid(ksize_width, ksize_height, output_channels);
-    dim3 threads(std::min(output_width, block_size), crop_output_height, 1);
-    int filter_multiplier = output_channels / input_channels;
-
-#define check_case(c_filter_multiplier)                                       \
-  if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
-    KernelDepthwiseConvFilterGradSp<                                          \
-        T, c_filter_multiplier,                                               \
-        fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(       \
-        output_grad_data, input_data, batch_size, output_channels,            \
-        output_height, output_width, input_channels, input_height,            \
-        input_width, filter_multiplier, ksize_height, ksize_width,            \
-        stride_height, stride_width, padding_height, padding_width,           \
-        dilate_height, dilate_width, filter_grad_data);                       \
-    return;                                                                   \
-  }
-    check_case(1);
-    check_case(0);
-#undef check_case
-  }
-};
-
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, float, false>;
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, double, false>;
-
-template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, float,
-                                             false>;
-template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
-                                             double, false>;
-
-template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              float, false>;
-template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              double, false>;
-
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, float, true>;
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, double, true>;
-
-template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, float,
-                                             true>;
-template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
-                                             double, true>;
-
-template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              float, true>;
-template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              double, true>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h
deleted file mode 100644
index 56648e4125bf1f5ef3eb128efa7849bbf5d4ad71..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * \brief Compute the depthwise convolution which include
- * forward process and backpropagation process
- */
-template <typename DeviceContext, typename T,
-          bool fuse_relu_before_conv = false>
-class DepthwiseConvFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& filter,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations, framework::Tensor* output);
-};
-
-template <typename DeviceContext, typename T,
-          bool fuse_relu_before_conv = false>
-class DepthwiseConvInputGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& filter,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
-                  framework::Tensor* input_grad);
-};
-
-template <typename DeviceContext, typename T,
-          bool fuse_relu_before_conv = false>
-class DepthwiseConvFilterGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
-                  framework::Tensor* filter_grad);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/detail/CMakeLists.txt b/paddle/fluid/operators/math/detail/CMakeLists.txt
deleted file mode 100644
index 0df1c060f9042067b655d987560a278f9fc46a5b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-cc_library(activation_functions SRCS avx_functions.cc)
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
deleted file mode 100644
index 24df1f93edd85145d703ed3277b0d1ca06e67009..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include <string>
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace detail {
-
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-
-enum ActivationType {
-  kSigmoid,
-  kReLU,
-  kTanh,
-  kIdentity,
-};
-
-inline ActivationType GetActivationType(const std::string &type) {
-  if (type == "sigmoid") {
-    return ActivationType::kSigmoid;
-  } else if (type == "relu") {
-    return ActivationType::kReLU;
-  } else if (type == "tanh") {
-    return ActivationType::kTanh;
-  } else if (type == "identity" || type == "") {
-    return ActivationType::kIdentity;
-  }
-  PADDLE_THROW("Not support type %s.", type);
-}
-
-namespace forward {
-
-template <typename T>
-DEVICE T Identity(const T a) {
-  return a;
-}
-
-template <typename T>
-DEVICE T Relu(const T a) {
-  return a > static_cast<T>(0.0) ? a : static_cast<T>(0.0);
-}
-
-template <typename T>
-DEVICE T Sigmoid(const T a) {
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  T tmp = (a < min) ? min : ((a > max) ? max : a);
-  return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-tmp));
-}
-
-template <typename T>
-DEVICE T Tanh(const T a) {
-  T tmp = -2.0 * a;
-  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-  return (2.0 / (1.0 + exp(tmp))) - 1.0;
-}
-
-}  // namespace forward
-
-namespace backward {
-
-template <typename T>
-DEVICE T Identity(const T a, const T b) {
-  return a;
-}
-
-template <typename T>
-DEVICE T Relu(const T a, const T b) {
-  return a * (b > 0.0 ? 1.0 : 0.0);
-}
-
-template <typename T>
-DEVICE T Sigmoid(const T a, const T b) {
-  return a * b * (1.0 - b);
-}
-
-template <typename T>
-DEVICE T Tanh(const T a, const T b) {
-  return a * (1.0 - b * b);
-}
-
-}  // namespace backward
-
-template <typename T>
-struct Active {
-  typedef T (*Act)(T);
-  typedef T (*ActGrad)(T, T);
-};
-
-static DEVICE Active<float>::Act kActFloat[] = {
-    &forward::Sigmoid<float>, &forward::Relu<float>, &forward::Tanh<float>,
-    &forward::Identity<float>};
-
-static DEVICE Active<float>::ActGrad kActGradFloat[] = {
-    &backward::Sigmoid<float>, &backward::Relu<float>, &backward::Tanh<float>,
-    &backward::Identity<float>};
-
-static DEVICE Active<double>::Act kActDouble[] = {
-    &forward::Sigmoid<double>, &forward::Relu<double>, &forward::Tanh<double>,
-    &forward::Identity<double>};
-
-static DEVICE Active<double>::ActGrad kActGradDouble[] = {
-    &backward::Sigmoid<double>, &backward::Relu<double>,
-    &backward::Tanh<double>, &backward::Identity<double>};
-
-namespace forward {
-inline DEVICE float activation(float a, int index) {
-  return kActFloat[index](a);
-}
-
-inline DEVICE double activation(double a, int index) {
-  return kActDouble[index](a);
-}
-
-}  // namespace forward
-
-namespace backward {
-inline DEVICE float activation(float a, float b, int index) {
-  return kActGradFloat[index](a, b);
-}
-
-inline DEVICE double activation(double a, double b, int index) {
-  return kActGradDouble[index](a, b);
-}
-}  // namespace backward
-
-#ifdef __AVX__
-namespace forward {
-namespace avx {
-__m256 Relu(const __m256 a);
-__m256 Sigmoid(const __m256 a);
-__m256 Tanh(const __m256 a);
-__m256 Identity(const __m256 a);
-}  // namespace avx
-}  // namespace forward
-
-namespace backward {
-namespace avx {
-__m256 Relu(const __m256 a, const __m256 b);
-__m256 Sigmoid(const __m256 a, const __m256 b);
-__m256 Tanh(const __m256 a, const __m256 b);
-__m256 Identity(const __m256 a, const __m256 b);
-}  // namespace avx
-}  // namespace backward
-
-static Active<__m256>::Act kActAvx[] = {
-    &forward::avx::Sigmoid, &forward::avx::Relu, &forward::avx::Tanh,
-    &forward::avx::Identity};
-
-static Active<__m256>::ActGrad kActGradAvx[] = {
-    &backward::avx::Sigmoid, &backward::avx::Relu, &backward::avx::Tanh,
-    &backward::avx::Identity};
-
-namespace forward {
-inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); }
-}  // namespace forward
-
-namespace backward {
-inline __m256 activation(__m256 a, __m256 b, int index) {
-  return kActGradAvx[index](a, b);
-}
-}  // namespace backward
-
-#endif
-
-}  // namespace detail
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc
deleted file mode 100644
index 022ffc533779363b08404b8715ac37194a4be392..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/avx_functions.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef __AVX__
-
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/detail/avx_mathfun.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace detail {
-
-__m256 Exp(__m256 a) { return exp256_ps(a); }
-
-namespace forward {
-namespace avx {
-__m256 Relu(const __m256 a) {
-  __m256 tmp = _mm256_set1_ps(0.0f);
-  return _mm256_max_ps(a, tmp);
-}
-
-__m256 Sigmoid(const __m256 a) {
-  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
-  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
-  __m256 tmp = _mm256_max_ps(a, min);
-  tmp = _mm256_min_ps(tmp, max);
-  tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
-  tmp = Exp(tmp);
-  tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
-  tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
-  return tmp;
-}
-
-__m256 Tanh(const __m256 a) {
-  __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
-  __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
-  tmp = _mm256_min_ps(tmp, max);
-  tmp = Exp(tmp);
-  return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
-                                     _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
-                       _mm256_set1_ps(1.0f));
-}
-
-__m256 Identity(const __m256 a) { return a; }
-
-}  // namespace avx
-}  // namespace forward
-
-namespace backward {
-namespace avx {
-__m256 Relu(const __m256 a, const __m256 b) {
-  return _mm256_mul_ps(
-      a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
-                       _mm256_set1_ps(1.0f)));
-}
-
-__m256 Sigmoid(const __m256 a, const __m256 b) {
-  return _mm256_mul_ps(_mm256_mul_ps(a, b),
-                       _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
-}
-
-__m256 Tanh(const __m256 a, const __m256 b) {
-  return _mm256_mul_ps(
-      a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
-}
-
-__m256 Identity(const __m256 a, const __m256 b) { return a; }
-}  // namespace avx
-}  // namespace backward
-
-}  // namespace detail
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/operators/math/detail/avx_mathfun.h b/paddle/fluid/operators/math/detail/avx_mathfun.h
deleted file mode 100644
index d7cf91134e4553dfcd935a31993e06dfa74650ac..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/avx_mathfun.h
+++ /dev/null
@@ -1,731 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/*
-   AVX implementation of sin, cos, sincos, exp and log
-
-   Based on "sse_mathfun.h", by Julien Pommier
-   http://gruntthepeon.free.fr/ssemath/
-
-   Copyright (C) 2012 Giovanni Garberoglio
-   Interdisciplinary Laboratory for Computational Science (LISC)
-   Fondazione Bruno Kessler and University of Trento
-   via Sommarive, 18
-   I-38123 Trento (Italy)
-
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-
-  (this is the zlib license)
-*/
-
-#include "paddle/fluid/platform/cpu_info.h"
-
-/* __m128 is ugly to write */
-typedef __m256 v8sf;   // vector of 8 float (avx)
-typedef __m256i v8si;  // vector of 8 int   (avx)
-typedef __m128i v4si;  // vector of 8 int   (avx)
-
-#define _PI32AVX_CONST(Name, Val)                                          \
-  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, \
-                                                                 Val, Val}
-
-_PI32AVX_CONST(1, 1);
-_PI32AVX_CONST(inv1, ~1);
-_PI32AVX_CONST(2, 2);
-_PI32AVX_CONST(4, 4);
-
-/* declare some AVX constants -- why can't I figure a better way to do that? */
-#define _PS256_CONST(Name, Val)                                   \
-  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
-      Val, Val, Val, Val, Val, Val, Val, Val}
-#define _PI32_CONST256(Name, Val)                                  \
-  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
-      Val, Val, Val, Val, Val, Val, Val, Val}
-#define _PS256_CONST_TYPE(Name, Type, Val)                       \
-  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
-      Val, Val, Val, Val, Val, Val, Val, Val}
-
-_PS256_CONST(1, 1.0f);
-_PS256_CONST(0p5, 0.5f);
-/* the smallest non denormalized float number */
-_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
-_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
-_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
-
-_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
-_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
-
-_PI32_CONST256(0, 0);
-_PI32_CONST256(1, 1);
-_PI32_CONST256(inv1, ~1);
-_PI32_CONST256(2, 2);
-_PI32_CONST256(4, 4);
-_PI32_CONST256(0x7f, 0x7f);
-
-_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
-_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
-_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
-_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
-_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
-_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
-_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
-_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
-_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
-_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
-_PS256_CONST(cephes_log_q1, -2.12194440e-4);
-_PS256_CONST(cephes_log_q2, 0.693359375);
-
-#ifndef __AVX2__
-
-typedef union imm_xmm_union {
-  v8si imm;
-  v4si xmm[2];
-} imm_xmm_union;
-
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_)  \
-  {                                          \
-    imm_xmm_union ALIGN32_BEG u ALIGN32_END; \
-    u.imm = imm_;                            \
-    xmm0_ = u.xmm[0];                        \
-    xmm1_ = u.xmm[1];                        \
-  }
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_)  \
-  {                                          \
-    imm_xmm_union ALIGN32_BEG u ALIGN32_END; \
-    u.xmm[0] = xmm0_;                        \
-    u.xmm[1] = xmm1_;                        \
-    imm_ = u.imm;                            \
-  }
-
-#define AVX2_BITOP_USING_SSE2(fn)                        \
-  static inline v8si avx2_mm256_##fn(v8si x, int a) {    \
-    /* use SSE2 instruction to perform the bitop AVX2 */ \
-    v4si x1, x2;                                         \
-    v8si ret;                                            \
-    COPY_IMM_TO_XMM(x, x1, x2);                          \
-    x1 = _mm_##fn(x1, a);                                \
-    x2 = _mm_##fn(x2, a);                                \
-    COPY_XMM_TO_IMM(x1, x2, ret);                        \
-    return (ret);                                        \
-  }
-
-//#warning "Using SSE2 to perform AVX2 bitshift ops"
-AVX2_BITOP_USING_SSE2(slli_epi32)
-AVX2_BITOP_USING_SSE2(srli_epi32)
-
-#define AVX2_INTOP_USING_SSE2(fn)                                     \
-  static inline v8si avx2_mm256_##fn(v8si x, v8si y) {                \
-    /* use SSE2 instructions to perform the AVX2 integer operation */ \
-    v4si x1, x2;                                                      \
-    v4si y1, y2;                                                      \
-    v8si ret;                                                         \
-    COPY_IMM_TO_XMM(x, x1, x2);                                       \
-    COPY_IMM_TO_XMM(y, y1, y2);                                       \
-    x1 = _mm_##fn(x1, y1);                                            \
-    x2 = _mm_##fn(x2, y2);                                            \
-    COPY_XMM_TO_IMM(x1, x2, ret);                                     \
-    return (ret);                                                     \
-  }
-
-//#warning "Using SSE2 to perform AVX2 integer ops"
-AVX2_INTOP_USING_SSE2(and_si128)
-AVX2_INTOP_USING_SSE2(andnot_si128)
-AVX2_INTOP_USING_SSE2(cmpeq_epi32)
-AVX2_INTOP_USING_SSE2(sub_epi32)
-AVX2_INTOP_USING_SSE2(add_epi32)
-#define avx2_mm256_and_si256 avx2_mm256_and_si128
-#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128
-#else
-#define avx2_mm256_slli_epi32 _mm256_slli_epi32
-#define avx2_mm256_srli_epi32 _mm256_srli_epi32
-#define avx2_mm256_and_si256 _mm256_and_si256
-#define avx2_mm256_andnot_si256 _mm256_andnot_si256
-#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32
-#define avx2_mm256_sub_epi32 _mm256_sub_epi32
-#define avx2_mm256_add_epi32 _mm256_add_epi32
-#endif /* __AVX2__ */
-
-/* natural logarithm computed for 8 simultaneous float
-   return NaN for x <= 0
-*/
-v8sf log256_ps(v8sf x) {
-  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
-
-  // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
-  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
-
-  x = _mm256_max_ps(
-      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
-
-  // can be done with AVX2
-  imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
-
-  /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
-
-  // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
-  v8sf e = _mm256_cvtepi32_ps(imm0);
-
-  e = _mm256_add_ps(e, one);
-
-  /* part2:
-     if( x < SQRTHF ) {
-       e -= 1;
-       x = x + x - 1.0;
-     } else { x = x - 1.0; }
-  */
-  // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
-  v8sf tmp = _mm256_and_ps(x, mask);
-  x = _mm256_sub_ps(x, one);
-  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
-  x = _mm256_add_ps(x, tmp);
-
-  v8sf z = _mm256_mul_ps(x, x);
-
-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
-  y = _mm256_mul_ps(y, x);
-
-  y = _mm256_mul_ps(y, z);
-
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
-  y = _mm256_add_ps(y, tmp);
-
-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
-  x = _mm256_add_ps(x, y);
-  x = _mm256_add_ps(x, tmp);
-  x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
-  return x;
-}
-
-_PS256_CONST(exp_hi, 88.3762626647949f);
-_PS256_CONST(exp_lo, -88.3762626647949f);
-
-_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
-_PS256_CONST(cephes_exp_C1, 0.693359375);
-_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
-
-_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
-_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
-_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
-_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
-_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
-_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
-
-v8sf exp256_ps(v8sf x) {
-  v8sf tmp = _mm256_setzero_ps(), fx;
-  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
-
-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
-
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
-
-  /* how to perform a floorf with SSE: just below */
-  // imm0 = _mm256_cvttps_epi32(fx);
-  // tmp  = _mm256_cvtepi32_ps(imm0);
-
-  tmp = _mm256_floor_ps(fx);
-
-  /* if greater, substract 1 */
-  // v8sf mask = _mm256_cmpgt_ps(tmp, fx);
-  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
-  mask = _mm256_and_ps(mask, one);
-  fx = _mm256_sub_ps(tmp, mask);
-
-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
-  x = _mm256_sub_ps(x, tmp);
-  x = _mm256_sub_ps(x, z);
-
-  z = _mm256_mul_ps(x, x);
-
-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, x);
-  y = _mm256_add_ps(y, one);
-
-  /* build 2^n */
-  imm0 = _mm256_cvttps_epi32(fx);
-  // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
-  imm0 = avx2_mm256_slli_epi32(imm0, 23);
-  v8sf pow2n = _mm256_castsi256_ps(imm0);
-  y = _mm256_mul_ps(y, pow2n);
-  return y;
-}
-
-_PS256_CONST(minus_cephes_DP1, -0.78515625);
-_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
-_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
-_PS256_CONST(sincof_p0, -1.9515295891E-4);
-_PS256_CONST(sincof_p1, 8.3321608736E-3);
-_PS256_CONST(sincof_p2, -1.6666654611E-1);
-_PS256_CONST(coscof_p0, 2.443315711809948E-005);
-_PS256_CONST(coscof_p1, -1.388731625493765E-003);
-_PS256_CONST(coscof_p2, 4.166664568298827E-002);
-_PS256_CONST(cephes_FOPI, 1.27323954473516);  // 4 / M_PI
-
-/* evaluation of 8 sines at onces using AVX intrisics
-
-   The code is the exact rewriting of the cephes sinf function.
-   Precision is excellent as long as x < 8192 (I did not bother to
-   take into account the special handling they have for greater values
-   -- it does not return garbage for arguments over 8192, though, but
-   the extra precision is missing).
-
-   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
-   surprising but correct result.
-
-*/
-v8sf sin256_ps(v8sf x) {  // any x
-  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
-  v8si imm0, imm2;
-
-#ifndef __AVX2__
-  v4si imm0_1, imm0_2;
-  v4si imm2_1, imm2_2;
-#endif
-
-  sign_bit = x;
-  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
-
-  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
-
-/*
-  Here we start a series of integer operations, which are in the
-  realm of AVX2.
-  If we don't have AVX, let's perform them using SSE2 directives
-*/
-
-#ifdef __AVX2__
-  /* store the integer part of y in mm0 */
-  imm2 = _mm256_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
-  imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  /* get the polynom selection mask
-     there is one polynom for 0 <= x <= Pi/4
-     and another one for Pi/4<x<=Pi/2
-
-     Both branches will be computed.
-  */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
-#else
-  /* we use SSE2 routines to perform the integer ops */
-  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
-
-  imm0_1 = _mm_slli_epi32(imm0_1, 29);
-  imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-#endif
-
-  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
-  v8sf poly_mask = _mm256_castsi256_ps(imm2);
-  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
-  xmm1 = _mm256_mul_ps(y, xmm1);
-  xmm2 = _mm256_mul_ps(y, xmm2);
-  xmm3 = _mm256_mul_ps(y, xmm3);
-  x = _mm256_add_ps(x, xmm1);
-  x = _mm256_add_ps(x, xmm2);
-  x = _mm256_add_ps(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
-  v8sf z = _mm256_mul_ps(x, x);
-
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_mul_ps(y2, x);
-  y2 = _mm256_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  y2 = _mm256_and_ps(xmm3, y2);  //, xmm3);
-  y = _mm256_andnot_ps(xmm3, y);
-  y = _mm256_add_ps(y, y2);
-  /* update the sign */
-  y = _mm256_xor_ps(y, sign_bit);
-
-  return y;
-}
-
-/* almost the same as sin_ps */
-v8sf cos256_ps(v8sf x) {  // any x
-  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
-  v8si imm0, imm2;
-
-#ifndef __AVX2__
-  v4si imm0_1, imm0_2;
-  v4si imm2_1, imm2_2;
-#endif
-
-  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
-
-  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
-
-#ifdef __AVX2__
-  /* store the integer part of y in mm0 */
-  imm2 = _mm256_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
-  y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
-
-  /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
-  imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
-#else
-
-  /* we use SSE2 routines to perform the integer ops */
-  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
-
-  imm0_1 = _mm_slli_epi32(imm0_1, 29);
-  imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-#endif
-
-  v8sf sign_bit = _mm256_castsi256_ps(imm0);
-  v8sf poly_mask = _mm256_castsi256_ps(imm2);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
-  xmm1 = _mm256_mul_ps(y, xmm1);
-  xmm2 = _mm256_mul_ps(y, xmm2);
-  xmm3 = _mm256_mul_ps(y, xmm3);
-  x = _mm256_add_ps(x, xmm1);
-  x = _mm256_add_ps(x, xmm2);
-  x = _mm256_add_ps(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
-  v8sf z = _mm256_mul_ps(x, x);
-
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_mul_ps(y2, x);
-  y2 = _mm256_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  y2 = _mm256_and_ps(xmm3, y2);  //, xmm3);
-  y = _mm256_andnot_ps(xmm3, y);
-  y = _mm256_add_ps(y, y2);
-  /* update the sign */
-  y = _mm256_xor_ps(y, sign_bit);
-
-  return y;
-}
-
-/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could
-   replace both of them..
-   it is almost as fast, and gives you a free cosine with your sine */
-void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
-  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
-  v8si imm0, imm2, imm4;
-
-#ifndef __AVX2__
-  v4si imm0_1, imm0_2;
-  v4si imm2_1, imm2_2;
-  v4si imm4_1, imm4_2;
-#endif
-
-  sign_bit_sin = x;
-  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
-  /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
-
-  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
-
-#ifdef __AVX2__
-  /* store the integer part of y in imm2 */
-  imm2 = _mm256_cvttps_epi32(y);
-
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
-
-  y = _mm256_cvtepi32_ps(imm2);
-  imm4 = imm2;
-
-  /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
-  imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
-
-  /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
-// v8sf poly_mask = _mm256_castsi256_ps(imm2);
-#else
-  /* we use SSE2 routines to perform the integer ops */
-  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  imm4_1 = imm2_1;
-  imm4_2 = imm2_2;
-
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
-
-  imm0_1 = _mm_slli_epi32(imm0_1, 29);
-  imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-#endif
-  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
-  v8sf poly_mask = _mm256_castsi256_ps(imm2);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
-  xmm1 = _mm256_mul_ps(y, xmm1);
-  xmm2 = _mm256_mul_ps(y, xmm2);
-  xmm3 = _mm256_mul_ps(y, xmm3);
-  x = _mm256_add_ps(x, xmm1);
-  x = _mm256_add_ps(x, xmm2);
-  x = _mm256_add_ps(x, xmm3);
-
-#ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
-  imm4 = avx2_mm256_slli_epi32(imm4, 29);
-#else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
-
-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
-
-  imm4_1 = _mm_slli_epi32(imm4_1, 29);
-  imm4_2 = _mm_slli_epi32(imm4_2, 29);
-
-  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
-#endif
-
-  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);
-
-  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
-
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_mul_ps(y2, x);
-  y2 = _mm256_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
-  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
-  y2 = _mm256_sub_ps(y2, ysin2);
-  y = _mm256_sub_ps(y, ysin1);
-
-  xmm1 = _mm256_add_ps(ysin1, ysin2);
-  xmm2 = _mm256_add_ps(y, y2);
-
-  /* update the sign */
-  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
-  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
-}
diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
deleted file mode 100644
index c6dd972e12b763283a4212d4c56844afb1c2fd7a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+++ /dev/null
@@ -1,499 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <type_traits>
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/gru_compute.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace detail {
-
-#ifndef __NVCC__
-
-template <class OpResetOutput, typename T>
-void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
-                                       T *gate_value, T *reset_output_value,
-                                       T *prev_output_value, int frame_size,
-                                       ActivationType active_gate) {
-  T r_value_update_gate;
-  T r_value_reset_gate;
-  T r_value_reset_output;
-  T r_prev_out = 0;
-  T *update_gate = gate_value;
-  T *reset_gate = gate_value + frame_size;
-
-  for (int i = 0; i < frame_size; i++) {
-    r_value_update_gate = update_gate[i];
-    r_value_reset_gate = reset_gate[i];
-    if (prev_output_value) {
-      r_prev_out = prev_output_value[i];
-    }
-
-    op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
-                    &r_value_reset_output, active_gate);
-
-    update_gate[i] = r_value_update_gate;
-    reset_gate[i] = r_value_reset_gate;
-    reset_output_value[i] = r_value_reset_output;
-  }
-}
-
-template <class OpFinalOutput, typename T>
-void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
-                                       T *gate_value, T *prev_output_value,
-                                       T *output_value, int frame_size,
-                                       ActivationType active_node,
-                                       bool origin_mode) {
-  T r_value_update_gate;
-  T r_value_frame_state;
-  T r_prev_out = 0;
-  T r_output;
-  T *update_gate = gate_value;
-  T *frame_state = gate_value + frame_size * 2;
-
-  for (int i = 0; i < frame_size; i++) {
-    r_value_update_gate = update_gate[i];
-    r_value_frame_state = frame_state[i];
-    if (prev_output_value) {
-      r_prev_out = prev_output_value[i];
-    }
-
-    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                    &r_output, active_node, origin_mode);
-
-    frame_state[i] = r_value_frame_state;
-    output_value[i] = r_output;
-  }
-}
-
-template <class OpResetOutput, typename T>
-void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
-                                     T *gate_value, T *reset_output_value,
-                                     T *prev_output_value, int frame_size,
-                                     ActivationType active_gate) {
-#ifdef __AVX__
-  __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
-  __m256 r_value_reset_gate, r_value_reset_gate_last = _mm256_set1_ps(0.0f);
-  __m256 r_value_reset_output;
-  __m256 r_prev_out = _mm256_set1_ps(0.0f),
-         r_prev_out_last = _mm256_set1_ps(0.0f);
-  T *update_gate = gate_value;
-  T *reset_gate = gate_value + frame_size;
-  int block = 8;
-  const int n = frame_size;
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-
-  if (rest > 0) {
-    i = n - block;
-    r_value_update_gate_last =
-        _mm256_loadu_ps((const float *)(update_gate + i));
-    r_value_reset_gate_last = _mm256_loadu_ps((const float *)(reset_gate + i));
-    if (prev_output_value) {
-      r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i));
-    }
-  }
-
-  for (i = 0; i < end; i += block) {
-    r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i));
-    r_value_reset_gate = _mm256_loadu_ps((const float *)(reset_gate + i));
-    if (prev_output_value) {
-      r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i));
-    }
-
-    op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
-                    &r_value_reset_output, active_gate);
-
-    _mm256_storeu_ps(reinterpret_cast<float *>(update_gate + i),
-                     r_value_update_gate);
-    _mm256_storeu_ps(reinterpret_cast<float *>(reset_gate + i),
-                     r_value_reset_gate);
-    _mm256_storeu_ps(reinterpret_cast<float *>(reset_output_value + i),
-                     r_value_reset_output);
-  }
-
-  if (rest > 0) {
-    i = n - block;
-
-    op_reset_output(&r_value_update_gate_last, &r_value_reset_gate_last,
-                    &r_prev_out_last, &r_value_reset_output, active_gate);
-
-    _mm256_storeu_ps(reinterpret_cast<float *>(update_gate + i),
-                     r_value_update_gate_last);
-    _mm256_storeu_ps(reinterpret_cast<float *>(reset_gate + i),
-                     r_value_reset_gate_last);
-    _mm256_storeu_ps(reinterpret_cast<float *>(reset_output_value + i),
-                     r_value_reset_output);
-  }
-#endif
-}
-
-template <class OpFinalOutput, typename T>
-void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
-                                     T *gate_value, T *prev_output_value,
-                                     T *output_value, int frame_size,
-                                     ActivationType active_node,
-                                     bool origin_mode) {
-#ifdef __AVX__
-  __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
-  __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f);
-  __m256 r_prev_out = _mm256_set1_ps(0.0f),
-         r_prev_out_last = _mm256_set1_ps(0.0f);
-  __m256 r_output;
-  T *update_gate = gate_value;
-  T *frame_state = gate_value + frame_size * 2;
-  int block = 8;
-  const int n = frame_size;
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-
-  if (rest > 0) {
-    i = n - block;
-    r_value_update_gate_last =
-        _mm256_loadu_ps((const float *)(update_gate + i));
-    r_value_frame_state_last =
-        _mm256_loadu_ps((const float *)(frame_state + i));
-    if (prev_output_value) {
-      r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i));
-    }
-  }
-
-  for (i = 0; i < end; i += block) {
-    r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i));
-    r_value_frame_state = _mm256_loadu_ps((const float *)(frame_state + i));
-    if (prev_output_value) {
-      r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i));
-    }
-
-    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                    &r_output, active_node, origin_mode);
-
-    _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
-                     r_value_frame_state);
-    _mm256_storeu_ps(reinterpret_cast<float *>(output_value + i), r_output);
-  }
-
-  if (rest > 0) {
-    i = n - block;
-    op_final_output(&r_value_update_gate_last, &r_value_frame_state_last,
-                    &r_prev_out_last, &r_output, active_node, origin_mode);
-
-    _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
-                     r_value_frame_state_last);
-    _mm256_storeu_ps(reinterpret_cast<float *>(output_value + i), r_output);
-  }
-
-#endif
-}
-
-template <class OpResetOutput, typename T>
-inline void forward_reset_output(OpResetOutput op_reset_output,
-                                 GRUMetaValue<T> value, int frame_size,
-                                 int batch_size, ActivationType active_gate) {
-  for (int b = 0; b < batch_size; b++) {
-    if (OpResetOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
-        (sizeof(T) == 4)) {
-      hl_avx_gru_forward_reset_output(
-          op_reset_output, value.gate_value, value.reset_output_value,
-          value.prev_out_value, frame_size, active_gate);
-    } else {
-      hl_naive_gru_forward_reset_output(
-          op_reset_output, value.gate_value, value.reset_output_value,
-          value.prev_out_value, frame_size, active_gate);
-    }
-
-    value.gate_value += frame_size * 3;
-    value.reset_output_value += frame_size;
-    if (value.prev_out_value) {
-      value.prev_out_value += frame_size;
-    }
-  }
-}
-
-template <class OpFinalOutput, typename T>
-inline void forward_final_output(OpFinalOutput op_final_output,
-                                 GRUMetaValue<T> value, int frame_size,
-                                 int batch_size, ActivationType active_node,
-                                 bool origin_mode) {
-  for (int b = 0; b < batch_size; b++) {
-    if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
-        (sizeof(T) == 4)) {
-      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
-                                      value.prev_out_value, value.output_value,
-                                      frame_size, active_node, origin_mode);
-    } else {
-      hl_naive_gru_forward_final_output(
-          op_final_output, value.gate_value, value.prev_out_value,
-          value.output_value, frame_size, active_node, origin_mode);
-    }
-
-    value.gate_value += frame_size * 3;
-    value.output_value += frame_size;
-    if (value.prev_out_value) {
-      value.prev_out_value += frame_size;
-    }
-  }
-}
-
-template <class OpStateGrad, typename T>
-void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
-                                      T *gate_grad, T *prev_out_value,
-                                      T *prev_out_grad, T *output_grad,
-                                      int frame_size,
-                                      ActivationType active_node,
-                                      bool origin_mode) {
-  T r_update_gate_value;
-  T r_update_gate_grad;
-  T r_frame_state_value;
-  T r_frame_state_grad;
-  T r_out_grad;
-  T r_prev_out_value = 0;
-  T r_prev_out_grad = 0;
-  T *update_gate_value = gate_value;
-  T *update_gate_grad = gate_grad;
-  T *frame_state_value = gate_value + frame_size * 2;
-  T *frame_state_grad = gate_grad + frame_size * 2;
-
-  for (int i = 0; i < frame_size; i++) {
-    r_update_gate_value = update_gate_value[i];
-    r_frame_state_value = frame_state_value[i];
-    r_out_grad = output_grad[i];
-    if (prev_out_value) {
-      r_prev_out_value = prev_out_value[i];
-    }
-    if (prev_out_grad) {
-      r_prev_out_grad = prev_out_grad[i];
-    }
-
-    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
-                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
-                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);
-
-    update_gate_grad[i] = r_update_gate_grad;
-    frame_state_grad[i] = r_frame_state_grad;
-    if (prev_out_grad) {
-      prev_out_grad[i] = r_prev_out_grad;
-    }
-  }
-}
-
-template <class OpResetGrad, typename T>
-void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
-                                      T *gate_grad, T *prev_out_value,
-                                      T *prev_out_grad, T *reset_output_grad,
-                                      int frame_size,
-                                      ActivationType active_gate) {
-  T r_update_gate_value;
-  T r_update_gate_grad;
-  T r_reset_gate_value;
-  T r_reset_gate_grad;
-  T r_reset_output_grad = 0;
-  T r_prev_out_value = 0;
-  T r_prev_out_grad = 0;
-  T *update_gate_value = gate_value;
-  T *update_gate_grad = gate_grad;
-  T *reset_gate_value = gate_value + frame_size;
-  T *reset_gate_grad = gate_grad + frame_size;
-
-  for (int i = 0; i < frame_size; i++) {
-    r_update_gate_value = update_gate_value[i];
-    r_update_gate_grad = update_gate_grad[i];
-    r_reset_gate_value = reset_gate_value[i];
-
-    if (prev_out_value && prev_out_grad) {
-      r_reset_output_grad = reset_output_grad[i];
-    }
-    if (prev_out_value) {
-      r_prev_out_value = prev_out_value[i];
-    }
-    if (prev_out_grad) {
-      r_prev_out_grad = prev_out_grad[i];
-    }
-
-    op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
-                  &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
-                  &r_prev_out_grad, &r_reset_output_grad, active_gate);
-
-    update_gate_grad[i] = r_update_gate_grad;
-    reset_gate_grad[i] = r_reset_gate_grad;
-    if (prev_out_grad) {
-      prev_out_grad[i] = r_prev_out_grad;
-    }
-  }
-}
-
-template <class OpStateGrad, typename T>
-void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
-                                    T *gate_grad, T *prev_out_value,
-                                    T *prev_out_grad, T *output_grad,
-                                    int frame_size, ActivationType active_node,
-                                    bool origin_mode) {
-#ifdef __AVX__
-  __m256 r_update_gate_value;
-  __m256 r_update_gate_grad;
-  __m256 r_frame_state_value;
-  __m256 r_frame_state_grad;
-  __m256 r_out_grad;
-  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
-  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
-  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
-  __m256 *frame_state_value =
-      reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
-  __m256 *frame_state_grad =
-      reinterpret_cast<__m256 *>(gate_grad + frame_size * 2);
-
-  for (int i = 0; i < frame_size / 8; i++) {
-    r_update_gate_value = update_gate_value[i];
-    r_frame_state_value = frame_state_value[i];
-    r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i];
-    if (prev_out_value) {
-      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
-    }
-    if (prev_out_grad) {
-      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
-    }
-
-    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
-                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
-                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);
-
-    update_gate_grad[i] = r_update_gate_grad;
-    frame_state_grad[i] = r_frame_state_grad;
-    if (prev_out_grad) {
-      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
-    }
-  }
-#endif
-}
-
-template <class OpResetGrad, typename T>
-void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
-                                    T *gate_grad, T *prev_out_value,
-                                    T *prev_out_grad, T *reset_output_grad,
-                                    int frame_size,
-                                    ActivationType active_gate) {
-#ifdef __AVX__
-  __m256 r_update_gate_value;
-  __m256 r_update_gate_grad;
-  __m256 r_reset_gate_value;
-  __m256 r_reset_gate_grad;
-  __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
-  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
-  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
-  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
-  __m256 *reset_gate_value =
-      reinterpret_cast<__m256 *>(gate_value + frame_size);
-  __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);
-
-  for (int i = 0; i < frame_size / 8; i++) {
-    r_update_gate_value = update_gate_value[i];
-    r_update_gate_grad = update_gate_grad[i];
-    r_reset_gate_value = reset_gate_value[i];
-
-    if (prev_out_value && prev_out_grad) {
-      r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
-    }
-    if (prev_out_value) {
-      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
-    }
-    if (prev_out_grad) {
-      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
-    }
-
-    op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
-                  &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
-                  &r_prev_out_grad, &r_reset_output_grad, active_gate);
-
-    update_gate_grad[i] = r_update_gate_grad;
-    reset_gate_grad[i] = r_reset_gate_grad;
-    if (prev_out_grad) {
-      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
-    }
-  }
-#endif
-}
-
-template <class OpStateGrad, typename T>
-inline void backward_state_grad(OpStateGrad op_state_grad,
-                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
-                                int frame_size, int batch_size,
-                                ActivationType active_node, bool origin_mode) {
-  for (int b = 0; b < batch_size; b++) {
-    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
-      hl_avx_gru_backward_state_grad(op_state_grad, value.gate_value,
-                                     grad.gate_grad, value.prev_out_value,
-                                     grad.prev_out_grad, grad.output_grad,
-                                     frame_size, active_node, origin_mode);
-    } else {
-      hl_naive_gru_backward_state_grad(op_state_grad, value.gate_value,
-                                       grad.gate_grad, value.prev_out_value,
-                                       grad.prev_out_grad, grad.output_grad,
-                                       frame_size, active_node, origin_mode);
-    }
-
-    value.gate_value += frame_size * 3;
-    if (value.prev_out_value) {
-      value.prev_out_value += frame_size;
-    }
-
-    grad.gate_grad += frame_size * 3;
-    grad.output_grad += frame_size;
-    if (grad.prev_out_grad) {
-      grad.prev_out_grad += frame_size;
-    }
-  }
-}
-
-template <class OpResetGrad, typename T>
-inline void backward_reset_grad(OpResetGrad op_reset_grad,
-                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
-                                int frame_size, int batch_size,
-                                ActivationType active_gate) {
-  for (int b = 0; b < batch_size; b++) {
-    if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
-      hl_avx_gru_backward_reset_grad(
-          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
-    } else {
-      hl_naive_gru_backward_reset_grad(
-          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
-    }
-
-    value.gate_value += frame_size * 3;
-    if (value.prev_out_value) {
-      value.prev_out_value += frame_size;
-    }
-
-    grad.gate_grad += frame_size * 3;
-    grad.reset_output_grad += frame_size;
-    if (grad.prev_out_grad) {
-      grad.prev_out_grad += frame_size;
-    }
-  }
-}
-
-#endif
-
-}  // namespace detail
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
deleted file mode 100644
index 77d7ff57cda7416705bed7eb393366e1f87232a0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <type_traits>
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace detail {
-
-/*
- * threads(frame_per_block, batch_per_block)
- * grid(frame_blocks, batch_blocks)
- */
-template <class OpResetOutput, bool is_batch, typename T>
-__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
-                                        T *gate_value, T *reset_output_value,
-                                        T *prev_output_value, int frame_size,
-                                        int batch_size,
-                                        ActivationType active_gate) {
-  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frame_idx >= frame_size) return;
-
-  int batch_idx = 0;
-  if (is_batch) {
-    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batch_idx >= batch_size) return;
-    gate_value += batch_idx * 3 * frame_size;
-    reset_output_value += batch_idx * frame_size;
-  }
-
-  T r_prev_out = 0;
-  T r_value_reset_output;
-  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
-  T r_value_reset_gate = gate_value[frame_idx + frame_size * 1];
-
-  if (prev_output_value) {
-    if (is_batch) prev_output_value += batch_idx * frame_size;
-    r_prev_out = prev_output_value[frame_idx];
-  }
-
-  op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
-                  &r_value_reset_output, active_gate);
-
-  gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
-  gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
-  reset_output_value[frame_idx] = r_value_reset_output;
-}
-
-/*
- * threads(frame_per_block, batch_per_block)
- * grid(frame_blocks, batch_blocks)
- */
-template <class OpFinalOutput, bool is_batch, typename T>
-__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
-                                        T *gate_value, T *prev_output_value,
-                                        T *output_value, int frame_size,
-                                        int batch_size,
-                                        ActivationType active_node,
-                                        bool origin_mode) {
-  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frame_idx >= frame_size) return;
-  int batch_idx = 0;
-  if (is_batch) {
-    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batch_idx >= batch_size) return;
-    gate_value += batch_idx * 3 * frame_size;
-    output_value += batch_idx * frame_size;
-  }
-
-  T r_output;
-  T r_prev_out = 0;
-  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
-  T r_value_frame_state = gate_value[frame_idx + frame_size * 2];
-
-  if (prev_output_value) {
-    if (is_batch) prev_output_value += batch_idx * frame_size;
-    r_prev_out = prev_output_value[frame_idx];
-  }
-
-  op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                  &r_output, active_node, origin_mode);
-
-  gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
-  output_value[frame_idx] = r_output;
-}
-
-/*
- * threads(tile_size, 1)
- * grid(frame_blocks, 1)
- */
-template <class T, int Tiled_size>
-__global__ void KeFastCollectiveGruGate(T *gate_value, T *prev_output_value,
-                                        T *gate_weight, T *reset_output,
-                                        int frame_size,
-                                        ActivationType active_node) {
-  T xt_0 = 0.0f;
-  T a0 = 0.0f;
-  T c0 = 0.0f;
-  T b0[Tiled_size];
-
-  int COL = blockIdx.x * blockDim.x + threadIdx.x;
-  int Tiled_mask = ((1 << Tiled_size) - 1);
-  // Tiled  matrix multiply using register shift, faster than sm.
-  if (prev_output_value) {
-    for (int k = 0; k < (((frame_size - 1) / Tiled_size) + 1); ++k) {
-      a0 = 0;
-      if ((threadIdx.x + k * Tiled_size) < frame_size) {
-        a0 = prev_output_value[threadIdx.x + (k * Tiled_size)];
-      }
-      for (int i = 0; i < Tiled_size; i++) {
-        if (COL < frame_size * 2 && (i + k * Tiled_size) < frame_size) {
-          b0[i] = gate_weight[(i + k * Tiled_size) * frame_size * 2 + COL];
-        }
-      }
-
-      for (int i = 0; i < Tiled_size; ++i) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-        c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i];
-#else
-        c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i];
-#endif
-      }
-    }
-  }
-
-  __syncthreads();
-
-  if (COL < frame_size * 2) {
-    xt_0 = gate_value[COL];
-    c0 += xt_0;
-    c0 = forward::activation(c0, active_node);
-    gate_value[COL] = c0;
-    if (frame_size <= COL && COL < frame_size * 2) {
-      T htp_0 = 0.0;
-      if (prev_output_value) {
-        htp_0 = prev_output_value[COL - frame_size];
-      }
-      reset_output[COL - frame_size] = c0 * htp_0;
-    } else if (COL < frame_size) {
-      gate_value[COL] = c0;
-    }
-  }
-}
-
-/*
- * threads(tile_size, 1)
- * grid(frame_blocks, 1)
- */
-template <class T, int Tiled_size>
-__global__ void KeFastCollectiveGruOut(T *gate_weight, T *prev_out_value,
-                                       T *output_value, T *gate_value,
-                                       T *reset_value, int frame_size,
-                                       ActivationType act_node,
-                                       bool origin_mode) {
-  int COL = blockIdx.x * blockDim.x + threadIdx.x;
-
-  T a0 = 0.0f;
-  T b0[Tiled_size];
-  T c0 = 0.0f;
-
-  int Tiled_mask = ((1 << Tiled_size) - 1);
-  //- Tiled  matrix multiply with register shift
-  if (prev_out_value) {
-    for (int k = 0; k < (((frame_size - 1) / Tiled_size) + 1); ++k) {
-      a0 = 0;
-      if ((threadIdx.x + k * Tiled_size) < frame_size) {
-        a0 = reset_value[threadIdx.x + (k * Tiled_size)];
-      }
-      for (int i = 0; i < Tiled_size; i++) {
-        if (COL < frame_size && (i + k * Tiled_size) < frame_size) {
-          b0[i] = gate_weight[(i + k * Tiled_size) * frame_size + COL];
-        }
-      }
-
-      for (int i = 0; i < Tiled_size; ++i) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-        c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i];
-#else
-        c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i];
-#endif
-      }
-    }
-  }
-
-  __syncthreads();
-
-  if (COL < frame_size) {
-    T xt_0 = gate_value[COL + 2 * frame_size];
-    T gta_0 = gate_value[COL];
-    T htp_0 = 0;
-    if (prev_out_value) htp_0 = prev_out_value[COL];
-    c0 += xt_0;
-    c0 = forward::activation(c0, act_node);
-    gate_value[COL + 2 * frame_size] = c0;
-    if (origin_mode) {
-      output_value[COL] = htp_0 * gta_0 + (1 - gta_0) * c0;
-    } else {
-      output_value[COL] = c0 * gta_0 + (1 - gta_0) * htp_0;
-    }
-  }
-}
-
-/*
- * threads(frame_per_block, batch_per_block)
- * grid(frame_blocks, batch_blocks)
- */
-template <class OpStateGrad, bool is_batch, typename T>
-__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
-                                       T *gate_grad, T *prev_out_value,
-                                       T *prev_out_grad, T *output_grad,
-                                       int frame_size, int batch_size,
-                                       ActivationType active_node,
-                                       bool origin_mode) {
-  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frame_idx >= frame_size) return;
-  int batch_idx = 0;
-  if (is_batch) {
-    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batch_idx >= batch_size) return;
-    gate_value += batch_idx * 3 * frame_size;
-    gate_grad += batch_idx * 3 * frame_size;
-    output_grad += batch_idx * frame_size;
-  }
-
-  T r_update_gate_grad;
-  T r_frame_state_grad;
-  T r_prev_out_value = 0;
-  T r_prev_out_grad = 0;
-  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
-  T r_frame_state_value = gate_value[frame_idx + frame_size * 2];
-  T r_out_grad = output_grad[frame_idx];
-
-  if (prev_out_value && prev_out_grad) {
-    if (is_batch) prev_out_value += batch_idx * frame_size;
-    r_prev_out_value = prev_out_value[frame_idx];
-
-    if (is_batch) prev_out_grad += batch_idx * frame_size;
-    r_prev_out_grad = prev_out_grad[frame_idx];
-  }
-
-  op_state_grad(&r_update_gate_value, &r_update_gate_grad, &r_frame_state_value,
-                &r_frame_state_grad, &r_prev_out_value, &r_prev_out_grad,
-                &r_out_grad, active_node, origin_mode);
-
-  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
-  gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
-  if (prev_out_grad) {
-    prev_out_grad[frame_idx] = r_prev_out_grad;
-  }
-}
-
-/*
- * threads(frame_per_block, batch_per_block)
- * grid(frame_blocks, batch_blocks)
- */
-template <class OpResetGrad, bool is_batch, typename T>
-__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
-                                       T *gate_grad, T *prev_out_value,
-                                       T *prev_out_grad, T *reset_output_grad,
-                                       int frame_size, int batch_size,
-                                       ActivationType active_gate) {
-  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frame_idx >= frame_size) return;
-  int batch_idx = 0;
-  if (is_batch) {
-    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batch_idx >= batch_size) return;
-    gate_value += batch_idx * 3 * frame_size;
-    gate_grad += batch_idx * 3 * frame_size;
-    reset_output_grad += batch_idx * frame_size;
-  }
-
-  T r_reset_gate_grad;
-  T r_prev_out_value = 0;
-  T r_prev_out_grad = 0;
-  T r_reset_output_grad = 0;
-  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
-  T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0];
-  T r_reset_gate_value = gate_value[frame_idx + frame_size * 1];
-
-  if (prev_out_value && prev_out_grad) {
-    if (is_batch) prev_out_value += batch_idx * frame_size;
-    if (is_batch) prev_out_grad += batch_idx * frame_size;
-    r_prev_out_value = prev_out_value[frame_idx];
-    r_prev_out_grad = prev_out_grad[frame_idx];
-    r_reset_output_grad = reset_output_grad[frame_idx];
-  }
-
-  op_reset_grad(&r_update_gate_value, &r_update_gate_grad, &r_reset_gate_value,
-                &r_reset_gate_grad, &r_prev_out_value, &r_prev_out_grad,
-                &r_reset_output_grad, active_gate);
-
-  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
-  gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
-  if (prev_out_grad) {
-    prev_out_grad[frame_idx] = r_prev_out_grad;
-  }
-}
-}  // namespace detail
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h
deleted file mode 100644
index 894f5f04d2451151964965bd721ff35e353ff2b5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/gru_kernel.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <type_traits>
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-// TODO(guosheng): refine code style in gru_kernel
-namespace paddle {
-namespace operators {
-namespace math {
-namespace detail {
-
-namespace forward {
-
-template <typename T>
-class gru_resetOutput {
- public:
-  HOSTDEVICE void operator()(T *value_update_gate, T *value_reset_gate,
-                             T *prev_out, T *value_reset_output,
-                             ActivationType act_gate) {
-    *value_update_gate = activation(*value_update_gate, act_gate);
-    *value_reset_gate = activation(*value_reset_gate, act_gate);
-    *value_reset_output = (*prev_out) * (*value_reset_gate);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 *value_update_gate,
-                             __m256 *value_reset_gate, __m256 *prev_out,
-                             __m256 *value_reset_output,
-                             ActivationType act_gate) {
-    *value_update_gate = activation(*value_update_gate, act_gate);
-    *value_reset_gate = activation(*value_reset_gate, act_gate);
-    *value_reset_output = _mm256_mul_ps(*prev_out, *value_reset_gate);
-  }
-#endif
-#endif
-};
-
-template <typename T>
-class gru_finalOutput {
- public:
-  HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state,
-                             T *prev_out, T *value_output,
-                             ActivationType act_input, bool origin_mode) {
-    *value_frame_state = activation(*value_frame_state, act_input);
-    if (origin_mode) {
-      *value_output = ((*value_update_gate) * (*prev_out)) +
-                      *value_frame_state -
-                      ((*value_update_gate) * (*value_frame_state));
-    } else {
-      *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
-                      ((*value_update_gate) * (*value_frame_state));
-    }
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 *value_update_gate,
-                             __m256 *value_frame_state, __m256 *prev_out,
-                             __m256 *value_output, ActivationType act_input,
-                             bool origin_mode) {
-    *value_frame_state = activation(*value_frame_state, act_input);
-    if (origin_mode) {
-      *value_output = _mm256_sub_ps(
-          _mm256_add_ps(_mm256_mul_ps(*value_update_gate, *prev_out),
-                        *value_frame_state),
-          _mm256_mul_ps(*value_update_gate, *value_frame_state));
-    } else {
-      *value_output = _mm256_add_ps(
-          _mm256_sub_ps(*prev_out,
-                        _mm256_mul_ps(*value_update_gate, *prev_out)),
-          _mm256_mul_ps(*value_update_gate, *value_frame_state));
-    }
-  }
-#endif
-#endif
-};
-}  // namespace forward
-
-namespace backward {
-
-template <typename T>
-class gru_stateGrad {
- public:
-  HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate,
-                             T *value_frame_state, T *grad_frame_state,
-                             T *value_prev_out, T *grad_prev_out,
-                             T *grad_output, ActivationType act_input,
-                             bool origin_mode) {
-    if (origin_mode) {
-      *grad_update_gate =
-          (*grad_output) * ((*value_prev_out) - (*value_frame_state));
-      *grad_prev_out += (*grad_output * (*value_update_gate));
-      *grad_frame_state = activation(
-          *grad_output * (static_cast<T>(1.0) - (*value_update_gate)),
-          *value_frame_state, act_input);
-    } else {
-      *grad_update_gate =
-          (*grad_output) * ((*value_frame_state) - (*value_prev_out));
-      *grad_prev_out +=
-          (*grad_output * (static_cast<T>(1.0) - *value_update_gate));
-      *grad_frame_state = activation(*grad_output * (*value_update_gate),
-                                     *value_frame_state, act_input);
-    }
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 *value_update_gate,
-                             __m256 *grad_update_gate,
-                             __m256 *value_frame_state,
-                             __m256 *grad_frame_state, __m256 *value_prev_out,
-                             __m256 *grad_prev_out, __m256 *grad_output,
-                             ActivationType act_input, bool origin_mode) {
-    if (origin_mode) {
-      *grad_update_gate = _mm256_mul_ps(
-          *grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state));
-      *grad_prev_out = _mm256_add_ps(
-          *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate));
-      *grad_frame_state = activation(
-          _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f),
-                                                    *value_update_gate)),
-          *value_frame_state, act_input);
-    } else {
-      *grad_update_gate = _mm256_mul_ps(
-          *grad_output, _mm256_sub_ps(*value_frame_state, *value_prev_out));
-      *grad_prev_out = _mm256_add_ps(
-          *grad_prev_out,
-          _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f),
-                                                    *value_update_gate)));
-      *grad_frame_state =
-          activation(_mm256_mul_ps(*grad_output, *value_update_gate),
-                     *value_frame_state, act_input);
-    }
-  }
-#endif
-#endif
-};
-
-template <typename T>
-class gru_resetGrad {
- public:
-  HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate,
-                             T *value_reset_gate, T *grad_reset_gate,
-                             T *value_prev_out, T *grad_prev_out,
-                             T *grad_reset_output, ActivationType act_gate) {
-    *grad_reset_gate = (*grad_reset_output * (*value_prev_out));
-    *grad_prev_out += (*grad_reset_output * (*value_reset_gate));
-    *grad_update_gate =
-        activation(*grad_update_gate, *value_update_gate, act_gate);
-    *grad_reset_gate =
-        activation(*grad_reset_gate, *value_reset_gate, act_gate);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 *value_update_gate,
-                             __m256 *grad_update_gate, __m256 *value_reset_gate,
-                             __m256 *grad_reset_gate, __m256 *value_prev_out,
-                             __m256 *grad_prev_out, __m256 *grad_reset_output,
-                             ActivationType act_gate) {
-    *grad_reset_gate = _mm256_mul_ps(*grad_reset_output, *value_prev_out);
-    *grad_prev_out = _mm256_add_ps(
-        *grad_prev_out, _mm256_mul_ps(*grad_reset_output, *value_reset_gate));
-    *grad_update_gate =
-        activation(*grad_update_gate, *value_update_gate, act_gate);
-    *grad_reset_gate =
-        activation(*grad_reset_gate, *value_reset_gate, act_gate);
-  }
-#endif
-#endif
-};
-
-}  // namespace backward
-
-}  // namespace detail
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
deleted file mode 100644
index ad79c58063a8a12c703979fe32a8e671a5ade857..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ /dev/null
@@ -1,327 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <type_traits>
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/lstm_compute.h"
-
-#if defined(_WIN32)
-#if defined(__AVX2__) || defined(__AVX__)
-inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }
-#endif
-#endif
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace detail {
-
-#ifndef __NVCC__
-
-template <class T, class Op>
-void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                     int frame_size, T cell_clip,
-                                     ActivationType active_node,
-                                     ActivationType active_gate,
-                                     ActivationType active_state) {
-  T r_value_in;
-  T r_value_ig;
-  T r_value_fg;
-  T r_value_og;
-  T r_checkI;
-  T r_checkF;
-  T r_checkO;
-  T r_state;
-  T r_prev_state = 0;
-  T r_state_atv;
-  T r_out;
-
-  T *value_in = value.gate_value;
-  T *value_ig = value.gate_value + frame_size;
-  T *value_fg = value.gate_value + frame_size * 2;
-  T *value_og = value.gate_value + frame_size * 3;
-
-  for (int i = 0; i < frame_size; i++) {
-    r_value_in = value_in[i];
-    r_value_ig = value_ig[i];
-    r_value_fg = value_fg[i];
-    r_value_og = value_og[i];
-    r_checkI = value.check_ig ? value.check_ig[i] : 0;
-    r_checkF = value.check_fg ? value.check_fg[i] : 0;
-    r_checkO = value.check_og ? value.check_og[i] : 0;
-
-    if (value.prev_state_value) {
-      r_prev_state = value.prev_state_value[i];
-    }
-
-    op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
-       &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-       &cell_clip, active_node, active_gate, active_state);
-
-    value_in[i] = r_value_in;
-    value_ig[i] = r_value_ig;
-    value_fg[i] = r_value_fg;
-    value_og[i] = r_value_og;
-    value.state_value[i] = r_state;
-    value.state_active_value[i] = r_state_atv;
-    value.output_value[i] = r_out;
-  }
-}
-
-template <class T, class Op>
-void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
-                                      LstmMetaGrad<T> grad, int frame_size,
-                                      T cell_clip, ActivationType active_node,
-                                      ActivationType active_gate,
-                                      ActivationType active_state) {
-  T r_value_in;
-  T r_value_ig;
-  T r_value_fg;
-  T r_value_og;
-  T r_grad_in;
-  T r_grad_ig;
-  T r_grad_fg;
-  T r_grad_og;
-  T r_prev_state = 0;
-  T r_prev_state_grad;
-  T r_state;
-  T r_state_grad;
-  T r_state_atv;
-  T r_output_grad;
-  T r_checkI;
-  T r_checkF;
-  T r_checkO;
-  T r_checkIGrad;
-  T r_checkFGrad;
-  T r_checkOGrad;
-
-  T *value_in = value.gate_value;
-  T *value_ig = value.gate_value + frame_size;
-  T *value_fg = value.gate_value + frame_size * 2;
-  T *value_og = value.gate_value + frame_size * 3;
-  T *grad_in = grad.gate_grad;
-  T *grad_ig = grad.gate_grad + frame_size;
-  T *grad_fg = grad.gate_grad + frame_size * 2;
-  T *grad_og = grad.gate_grad + frame_size * 3;
-
-  for (int i = 0; i < frame_size; i++) {
-    r_value_in = value_in[i];
-    r_value_ig = value_ig[i];
-    r_value_fg = value_fg[i];
-    r_value_og = value_og[i];
-    r_checkI = value.check_ig ? value.check_ig[i] : 0;
-    r_checkF = value.check_fg ? value.check_fg[i] : 0;
-    r_checkO = value.check_og ? value.check_og[i] : 0;
-    r_state = value.state_value[i];
-    r_state_atv = value.state_active_value[i];
-    r_output_grad = grad.output_grad[i];
-    r_state_grad = grad.state_grad[i];
-    if (value.prev_state_value) {
-      r_prev_state = value.prev_state_value[i];
-    }
-
-    op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in,
-       &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
-       &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
-       &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
-       &cell_clip, active_node, active_gate, active_state);
-
-    grad_in[i] = r_grad_in;
-    grad_ig[i] = r_grad_ig;
-    grad_fg[i] = r_grad_fg;
-    grad_og[i] = r_grad_og;
-    grad.state_grad[i] = r_state_grad;
-
-    if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad;
-    if (value.prev_state_value) {
-      if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad;
-      if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad;
-    }
-    if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad;
-  }
-}
-
-template <class T, class Op>
-void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                   int frame_size, T cell_clip,
-                                   ActivationType active_node,
-                                   ActivationType active_gate,
-                                   ActivationType active_state) {
-#ifdef __AVX__
-  __m256 r_value_in;
-  __m256 r_value_ig;
-  __m256 r_value_fg;
-  __m256 r_value_og;
-  __m256 r_checkI = _mm256_set1_ps(0.0f);
-  __m256 r_checkF = _mm256_set1_ps(0.0f);
-  __m256 r_checkO = _mm256_set1_ps(0.0f);
-  __m256 r_state;
-  __m256 r_prev_state = _mm256_set1_ps(0.0f);
-  __m256 r_state_atv;
-  __m256 r_out;
-
-  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
-  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
-  __m256 *value_fg =
-      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
-  __m256 *value_og =
-      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
-
-  for (int i = 0; i < frame_size / 8; i++) {
-    r_value_in = value_in[i];
-    r_value_ig = value_ig[i];
-    r_value_fg = value_fg[i];
-    r_value_og = value_og[i];
-    if (value.check_ig) {
-      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
-      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
-      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
-    }
-
-    if (value.prev_state_value) {
-      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
-    }
-
-    op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
-       &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-       &cell_clip, active_node, active_gate, active_state);
-
-    value_in[i] = r_value_in;
-    value_ig[i] = r_value_ig;
-    value_fg[i] = r_value_fg;
-    value_og[i] = r_value_og;
-    (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state;
-    (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv;
-    (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out;
-  }
-#endif
-}
-
-template <class T, class Op>
-void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
-                                    LstmMetaGrad<T> grad, int frame_size,
-                                    T cell_clip, ActivationType active_node,
-                                    ActivationType active_gate,
-                                    ActivationType active_state) {
-#ifdef __AVX__
-  __m256 r_value_in;
-  __m256 r_value_ig;
-  __m256 r_value_fg;
-  __m256 r_value_og;
-  __m256 r_grad_in;
-  __m256 r_grad_ig;
-  __m256 r_grad_fg;
-  __m256 r_grad_og;
-  __m256 r_prev_state = _mm256_set1_ps(0.0f);
-  __m256 r_prev_state_grad;
-  __m256 r_state_grad;
-  __m256 r_state;
-  __m256 r_state_atv;
-  __m256 r_output_grad;
-  __m256 r_checkI = _mm256_set1_ps(0.0f);
-  __m256 r_checkF = _mm256_set1_ps(0.0f);
-  __m256 r_checkO = _mm256_set1_ps(0.0f);
-  __m256 r_checkIGrad;
-  __m256 r_checkFGrad;
-  __m256 r_checkOGrad;
-
-  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
-  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
-  __m256 *value_fg =
-      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
-  __m256 *value_og =
-      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
-  __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad);
-  __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size);
-  __m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2);
-  __m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3);
-
-  for (int i = 0; i < frame_size / 8; i++) {
-    r_value_in = value_in[i];
-    r_value_ig = value_ig[i];
-    r_value_fg = value_fg[i];
-    r_value_og = value_og[i];
-    if (value.check_ig) {
-      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
-      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
-      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
-    }
-    r_state = (reinterpret_cast<__m256 *>(value.state_value))[i];
-    r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i];
-    r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i];
-    r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i];
-    if (value.prev_state_value) {
-      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
-    }
-
-    op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in,
-       &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
-       &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
-       &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
-       &cell_clip, active_node, active_gate, active_state);
-
-    grad_in[i] = r_grad_in;
-    grad_ig[i] = r_grad_ig;
-    grad_fg[i] = r_grad_fg;
-    grad_og[i] = r_grad_og;
-    (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad;
-
-    if (grad.prev_state_grad)
-      (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad;
-    if (value.prev_state_value) {
-      if (grad.check_ig_grad)
-        (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad;
-      if (grad.check_fg_grad)
-        (reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad;
-    }
-    if (grad.check_og_grad)
-      (reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad;
-  }
-#endif
-}
-
-template <class T, class Op>
-void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
-                      T cell_clip, ActivationType active_node,
-                      ActivationType active_gate, ActivationType active_state) {
-  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
-                                     active_node, active_gate, active_state);
-  } else {
-    naive_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
-                                       active_node, active_gate, active_state);
-  }
-}
-
-template <class T, class Op>
-void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frame_size, T cell_clip, ActivationType active_node,
-                       ActivationType active_gate,
-                       ActivationType active_state) {
-  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
-                                      active_node, active_gate, active_state);
-  } else {
-    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
-                                        active_node, active_gate, active_state);
-  }
-}
-
-#endif
-
-}  // namespace detail
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
deleted file mode 100644
index 24885d37020dc94a67063ff4a9d142550904a97b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
+++ /dev/null
@@ -1,257 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <type_traits>
-
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace detail {
-
-/*
- * threads(frame_per_block, batch_per_block)
- * grid(frame_blocks, batch_blocks)
- */
-template <class T, class Op, bool is_batch>
-__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
-                              int batch_size, T cell_clip,
-                              ActivationType active_node,
-                              ActivationType active_gate,
-                              ActivationType active_state) {
-  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frame_idx >= frame_size) return;
-
-  int batch_idx = 0;
-  if (is_batch) {
-    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batch_idx >= batch_size) return;
-    value.gate_value += batch_idx * frame_size * 4;
-    value.output_value += batch_idx * frame_size;
-    value.state_value += batch_idx * frame_size;
-    value.state_active_value += batch_idx * frame_size;
-  }
-
-  T r_state;
-  T r_prev_state = 0;
-  T r_state_atv;
-  T r_out;
-  T r_value_in;
-  T r_value_ig;
-  T r_value_fg;
-  T r_value_og;
-
-  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
-  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
-  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
-
-  r_value_in = value.gate_value[frame_idx];
-  r_value_ig = value.gate_value[frame_idx + frame_size];
-  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
-  r_value_og = value.gate_value[frame_idx + frame_size * 3];
-
-  if (value.prev_state_value) {
-    if (is_batch) value.prev_state_value += batch_idx * frame_size;
-    r_prev_state = value.prev_state_value[frame_idx];
-  }
-
-  op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
-     &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-     &cell_clip, active_node, active_gate, active_state);
-
-  value.gate_value[frame_idx] = r_value_in;
-  value.gate_value[frame_idx + frame_size] = r_value_ig;
-  value.gate_value[frame_idx + frame_size * 2] = r_value_fg;
-  value.gate_value[frame_idx + frame_size * 3] = r_value_og;
-
-  value.state_value[frame_idx] = r_state;
-  value.state_active_value[frame_idx] = r_state_atv;
-  value.output_value[frame_idx] = r_out;
-}
-
-/*
- * threads(frame_per_block, batch_per_block)
- * grid(frame_blocks, batch_blocks)
- */
-template <class T, class Op, bool is_batch>
-__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
-                               LstmMetaGrad<T> grad, int frame_size,
-                               int batch_size, T cell_clip,
-                               ActivationType active_node,
-                               ActivationType active_gate,
-                               ActivationType active_state) {
-  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frame_idx >= frame_size) return;
-
-  int batch_idx = 0;
-  if (is_batch) {
-    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batch_idx >= batch_size) return;
-    value.gate_value += batch_idx * frame_size * 4;
-    value.state_value += batch_idx * frame_size;
-    value.state_active_value += batch_idx * frame_size;
-    grad.gate_grad += batch_idx * frame_size * 4;
-    grad.state_grad += batch_idx * frame_size;
-    grad.output_grad += batch_idx * frame_size;
-  }
-
-  T r_value_in;
-  T r_value_ig;
-  T r_value_fg;
-  T r_value_og;
-  T r_grad_in;
-  T r_grad_ig;
-  T r_grad_fg;
-  T r_grad_og;
-  T r_prev_state = 0;
-  T r_prev_state_grad;
-  T r_state;
-  T r_state_grad;
-  T r_state_atv;
-  T r_output_grad;
-  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
-  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
-  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
-
-  T r_checkIGrad;
-  T r_checkFGrad;
-  T r_checkOGrad;
-
-  r_value_in = value.gate_value[frame_idx];
-  r_value_ig = value.gate_value[frame_idx + frame_size];
-  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
-  r_value_og = value.gate_value[frame_idx + frame_size * 3];
-  r_state = value.state_value[frame_idx];
-  r_state_atv = value.state_active_value[frame_idx];
-  r_output_grad = grad.output_grad[frame_idx];
-  r_state_grad = grad.state_grad[frame_idx];
-
-  if (value.prev_state_value) {
-    if (is_batch) value.prev_state_value += batch_idx * frame_size;
-    r_prev_state = value.prev_state_value[frame_idx];
-  }
-
-  op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig,
-     &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state,
-     &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF,
-     &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip,
-     active_node, active_gate, active_state);
-
-  grad.gate_grad[frame_idx] = r_grad_in;
-  grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
-  grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg;
-  grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og;
-  grad.state_grad[frame_idx] = r_state_grad;
-  if (grad.prev_state_grad) {
-    if (is_batch) grad.prev_state_grad += batch_idx * frame_size;
-    grad.prev_state_grad[frame_idx] = r_prev_state_grad;
-  }
-
-  if (is_batch) {
-    if (value.prev_state_value) {
-      if (grad.check_ig_grad)
-        paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx,
-                                        r_checkIGrad);
-      if (grad.check_fg_grad)
-        paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx,
-                                        r_checkFGrad);
-    }
-    if (grad.check_og_grad)
-      paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx,
-                                      r_checkOGrad);
-  } else {
-    if (value.prev_state_value) {
-      if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;
-      if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad;
-    }
-    if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad;
-  }
-}
-
-template <class T, class Op>
-void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
-                      LstmMetaValue<T> value, int frame_size, int batch_size,
-                      T cell_clip, ActivationType active_node,
-                      ActivationType active_gate, ActivationType active_state) {
-  dim3 threads;
-  dim3 grid;
-  if (batch_size == 1) {
-    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
-    int frame_blocks = (frame_size + 1024 - 1) / 1024;
-    threads = dim3(frame_per_block, 1);
-    grid = dim3(frame_blocks, 1);
-  } else {
-    /* frame_per_block = 32 batch_per_block = 16 */
-    threads = dim3(32, 16);
-    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
-  }
-
-  auto stream =
-      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
-  if (batch_size == 1) {
-    KeLstmForward<T, Op,
-                  /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
-        active_state);
-  } else {
-    KeLstmForward<T, Op,
-                  /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
-        active_state);
-  }
-}
-
-template <class T, class Op>
-void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
-                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frame_size, int batch_size, T cell_clip,
-                       ActivationType active_node, ActivationType active_gate,
-                       ActivationType active_state) {
-  dim3 threads;
-  dim3 grid;
-  if (batch_size == 1) {
-    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
-    int frame_blocks = (frame_size + 1024 - 1) / 1024;
-    threads = dim3(frame_per_block, 1);
-    grid = dim3(frame_blocks, 1);
-  } else {
-    /* frame_per_block = 32 batch_per_block = 16 */
-    threads = dim3(32, 16);
-    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
-  }
-
-  auto stream =
-      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
-  if (batch_size == 1) {
-    KeLstmBackward<T, Op,
-                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, grad, frame_size, batch_size, cell_clip, active_node,
-        active_gate, active_state);
-  } else {
-    KeLstmBackward<T, Op,
-                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, grad, frame_size, batch_size, cell_clip, active_node,
-        active_gate, active_state);
-  }
-}
-
-}  // namespace detail
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h
deleted file mode 100644
index 8149686c97a030b91e0c4de708b9abf07f83203d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detail/lstm_kernel.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <type_traits>
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace detail {
-
-namespace forward {
-
-template <class T>
-class lstm {
- public:
-  HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
-                             T *prev_state, T *state, T *state_atv, T *output,
-                             T *checkI, T *checkF, T *checkO, T *cell_clip,
-                             ActivationType active_node,
-                             ActivationType active_gate,
-                             ActivationType active_state) {
-    *value_in = activation(*value_in, active_node);
-    *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
-    *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
-    *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
-
-    if (*cell_clip > 0.0) {
-      if (*state < -1.0 * (*cell_clip)) {
-        *state = -1.0 * (*cell_clip);
-      }
-      if (*state > *cell_clip) {
-        *state = *cell_clip;
-      }
-    }
-    *value_og = activation(*value_og + (*state) * (*checkO), active_gate);
-    *state_atv = activation(*state, active_state);
-    *output = (*value_og) * (*state_atv);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
-  static const bool avx = false;
-#else
-  // Only float support AVX optimization
-  static const bool avx = std::is_same<T, float>::value;
-
-  HOSTDEVICE void operator()(__m256 *value_in, __m256 *value_ig,
-                             __m256 *value_fg, __m256 *value_og,
-                             __m256 *prev_state, __m256 *state,
-                             __m256 *state_atv, __m256 *output, __m256 *checkI,
-                             __m256 *checkF, __m256 *checkO, T *cell_clip,
-                             ActivationType active_node,
-                             ActivationType active_gate,
-                             ActivationType active_state) {
-    *value_in = activation(*value_in, active_node);
-    *value_ig = activation(
-        _mm256_add_ps(*value_ig, _mm256_mul_ps(*prev_state, *checkI)),
-        active_gate);
-    *value_fg = activation(
-        _mm256_add_ps(*value_fg, _mm256_mul_ps(*prev_state, *checkF)),
-        active_gate);
-    *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
-                           _mm256_mul_ps(*prev_state, *value_fg));
-
-    if (*cell_clip > 0.0f) {
-      __m256 min = _mm256_set1_ps(0.0f - *cell_clip);
-      __m256 max = _mm256_set1_ps(*cell_clip);
-      *state = _mm256_min_ps(max, *state);
-      *state = _mm256_max_ps(min, *state);
-    }
-    *value_og = activation(
-        _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
-    *state_atv = activation(*state, active_state);
-    *output = _mm256_mul_ps(*value_og, *state_atv);
-  }
-#endif
-#endif
-};
-
-}  // namespace forward
-
-namespace backward {
-
-template <class T>
-class lstm {
- public:
-  HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
-                             T *grad_in, T *grad_ig, T *grad_fg, T *grad_og,
-                             T *prev_state, T *prev_state_grad, T *state,
-                             T *state_grad, T *state_atv, T *output_grad,
-                             T *checkI, T *checkF, T *checkO, T *checkIGrad,
-                             T *checkFGrad, T *checkOGrad, T *cell_clip,
-                             ActivationType active_node,
-                             ActivationType active_gate,
-                             ActivationType active_state) {
-    *grad_og =
-        activation((*output_grad) * (*state_atv), *value_og, active_gate);
-    if (*cell_clip > 0.0f) {
-      if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) {
-        *state_grad = 0.0f;
-      } else {
-        *state_grad +=
-            activation((*output_grad) * (*value_og), *state_atv, active_state) +
-            (*grad_og) * (*checkO);
-      }
-    } else {
-      *state_grad +=
-          activation((*output_grad) * (*value_og), *state_atv, active_state) +
-          (*grad_og) * (*checkO);
-    }
-
-    *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
-    *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
-    *grad_fg =
-        activation((*state_grad) * (*prev_state), *value_fg, active_gate);
-    *prev_state_grad = (*grad_ig) * (*checkI) + (*grad_fg) * (*checkF) +
-                       (*state_grad) * (*value_fg);
-    *checkIGrad = (*grad_ig) * (*prev_state);
-    *checkFGrad = (*grad_fg) * (*prev_state);
-    *checkOGrad = (*grad_og) * (*state);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
-  static const bool avx = false;
-#else
-  // Only float support AVX optimization
-  static const bool avx = std::is_same<T, float>::value;
-  HOSTDEVICE void operator()(
-      __m256 *value_in, __m256 *value_ig, __m256 *value_fg, __m256 *value_og,
-      __m256 *grad_in, __m256 *grad_ig, __m256 *grad_fg, __m256 *grad_og,
-      __m256 *prev_state, __m256 *prev_state_grad, __m256 *state,
-      __m256 *state_grad, __m256 *state_atv, __m256 *output_grad,
-      __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad,
-      __m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip,
-      ActivationType active_node, ActivationType active_gate,
-      ActivationType active_state) {
-    *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og,
-                          active_gate);
-    if (*cell_clip > 0.0f) {
-      T *state_ = reinterpret_cast<T *>(state);
-      if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) {
-        *state_grad = _mm256_set1_ps(0.0f);
-      } else {
-        *state_grad =
-            _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
-                                     *state_atv, active_state),
-                          *state_grad);
-        *state_grad =
-            _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
-      }
-    }
-    *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in,
-                          active_node);
-    *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig,
-                          active_gate);
-    *grad_fg = activation(_mm256_mul_ps(*state_grad, *prev_state), *value_fg,
-                          active_gate);
-    *prev_state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_ig, *checkI),
-                                     _mm256_mul_ps(*grad_fg, *checkF));
-    *prev_state_grad =
-        _mm256_add_ps(_mm256_mul_ps(*state_grad, *value_fg), *prev_state_grad);
-    *checkIGrad = _mm256_mul_ps(*grad_ig, *prev_state);
-    *checkFGrad = _mm256_mul_ps(*grad_fg, *prev_state);
-    *checkOGrad = _mm256_mul_ps(*grad_og, *state);
-  }
-#endif
-#endif
-};
-
-}  // namespace backward
-
-}  // namespace detail
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/fc.cc b/paddle/fluid/operators/math/fc.cc
deleted file mode 100644
index b5479a1b435682384e555c6607a097c9e0c82bd8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/fc.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/fc.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-class FCFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context, const int M,
-                  const int N, const int K, const T* X, const T* W, T* Y,
-                  const T* B = nullptr, bool relu = false) {
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    blas.MatMul(M, N, K, X, W, Y);
-    if (B == NULL) {
-      return;
-    }
-    if (relu) {
-      auto compute =
-          jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache()
-              .At(N);
-      for (int i = 0; i < M; i++) {
-        T* dst = Y + i * N;
-        compute(B, dst, dst, N);
-      }
-    } else {
-      auto compute =
-          jit::KernelFuncs<jit::VAddTuple<T>, platform::CPUPlace>::Cache().At(
-              N);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-      for (int i = 0; i < M; i++) {
-        T* dst = Y + i * N;
-        compute(B, dst, dst, N);
-      }
-    }
-  }
-};
-
-template class FCFunctor<platform::CPUDeviceContext, float>;
-template class FCFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/fc.cu b/paddle/fluid/operators/math/fc.cu
deleted file mode 100644
index 1b22b81039954bfcf8ea0f6819d778d3fa126cab..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/fc.cu
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/fc.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T, bool DoRelu>
-__global__ void InplaceAddReluKernel(const T* bias, T* data, int M, int N) {
-  for (int i = blockIdx.x; i < M; i += gridDim.x) {
-    int index = i * N + threadIdx.x;
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      T tmp = data[index] + bias[j];
-      if (DoRelu) {
-        data[index] = (tmp > 0) ? tmp : 0;
-      } else {
-        data[index] = tmp;
-      }
-      index += blockDim.x;
-    }
-  }
-}
-
-template <typename T>
-class FCFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context, const int M,
-                  const int N, const int K, const T* X, const T* W, T* Y,
-                  const T* B = nullptr, bool relu = false) {
-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
-    blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), X, K, W, N,
-              static_cast<T>(0.0), Y, N);
-    if (B == NULL) {
-      return;
-    }
-
-    const int kThreadsPerBlock = 1024;
-    int max_threads = context.GetMaxPhysicalThreadCount();
-    int num_threads = std::min(kThreadsPerBlock, (((N + 31) >> 5) << 5));
-    int num_blocks = std::max(max_threads / num_threads, 1);
-    if (relu) {
-      InplaceAddReluKernel<
-          T, true><<<num_blocks, num_threads, 0, context.stream()>>>(B, Y, M,
-                                                                     N);
-    } else {
-      InplaceAddReluKernel<
-          T, false><<<num_blocks, num_threads, 0, context.stream()>>>(B, Y, M,
-                                                                      N);
-    }
-  }
-};
-
-template class FCFunctor<platform::CUDADeviceContext, float>;
-template class FCFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/fc.h b/paddle/fluid/operators/math/fc.h
deleted file mode 100644
index 9bef496fb9d3977b286338a79f641fde514d8303..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/fc.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename DeviceContext, typename T>
-class FCFunctor {
- public:
-  void operator()(const DeviceContext& context, const int M, const int N,
-                  const int K, const T* X, const T* W, T* Y,
-                  const T* B = nullptr, bool relu = false);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h
deleted file mode 100644
index e98bf82169aae0f541e51a6a30f02f602272bb34..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/functors.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-// MulFunctor
-template <typename T>
-struct MulFunctor {
-  // out = x * y;
-  inline HOSTDEVICE T operator()(T x, T y) { return x * y; }
-};
-
-template <typename T>
-struct MulGradFunctor {
-  inline HOSTDEVICE T Dx(T x, T y) { return y; }
-  inline HOSTDEVICE T Dy(T x, T y) { return x; }
-};
-
-// AddFunctor
-template <typename T>
-struct AddFunctor {
-  // out = x + y;
-  inline HOSTDEVICE T operator()(T x, T y) { return x + y; }
-};
-
-template <typename T>
-struct AddGradFunctor {
-  inline HOSTDEVICE T Dx(T x, T y) { return 1; }
-  inline HOSTDEVICE T Dy(T x, T y) { return 1; }
-};
-
-template <typename T>
-struct ScaleFunctor {
-  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
-
-  inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
-
- private:
-  T coeff_;
-};
-
-template <typename T>
-struct ScaleGradFunctor {
-  explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {}
-
-  inline HOSTDEVICE T UseX(T x) { return coeff_; }
-  inline HOSTDEVICE T UseOut(T out) { return coeff_; }
-  inline HOSTDEVICE T UseXAndOut(T x, T out) { return coeff_; }
-
- private:
-  T coeff_;
-};
-
-template <typename T>
-struct ReluFunctor {
-  inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
-};
-
-template <typename T>
-struct ReluGradFunctor {
-  inline HOSTDEVICE T UseX(T x) { return x > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseOut(T out) { return out > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseXAndOut(T x, T out) { return out > 0 ? 1 : 0; }
-};
-
-template <typename T>
-struct TanhFunctor {
-  const T kMin = static_cast<T>(-40);
-  const T kMax = static_cast<T>(13);
-  inline HOSTDEVICE T operator()(T x) {
-    // y = 2 / (1 + e^-2x) - 1
-    T t0 = 2 * x;
-    T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0);
-    return static_cast<T>(2) / (static_cast<T>(1) + std::exp(-t1)) -
-           static_cast<T>(1);
-  }
-};
-
-template <typename T>
-struct TanhGradFunctor {
-  inline HOSTDEVICE T UseX(T x) { return static_cast<T>(1) - x * x; }
-  inline HOSTDEVICE T UseOut(T out) { return static_cast<T>(1) - out * out; }
-  inline HOSTDEVICE T UseXAndOut(T x, T out) {
-    return static_cast<T>(1) - out * out;
-  }
-};
-
-template <typename T>
-struct SigmoidFunctor {
-  const T kMin = static_cast<T>(-40);
-  const T kMax = static_cast<T>(13);
-  inline HOSTDEVICE T operator()(T x) {
-    // y = 1 / (1 + e^-x)
-    T tmp = (x < kMin) ? kMin : ((x > kMax) ? kMax : x);
-    return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
-  }
-};
-
-template <typename T>
-struct SigmoidGradFunctor {
-  inline HOSTDEVICE T UseX(T x) { return x * (static_cast<T>(1) - x); }
-  inline HOSTDEVICE T UseOut(T out) { return out * (static_cast<T>(1) - out); }
-  inline HOSTDEVICE T UseXAndOut(T x, T out) {
-    return out * (static_cast<T>(1) - out);
-  }
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc
deleted file mode 100644
index 07c5cbf33378e6f6cee8a82448f55399966a2574..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/gru_compute.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
-#include "paddle/fluid/operators/math/detail/gru_kernel.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
-  static void compute(const platform::CPUDeviceContext &context,
-                      GRUMetaValue<T> value, int frame_size, int batch_size,
-                      const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate,
-                      bool origin_mode) {
-#ifndef __NVCC__
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    if (value.prev_out_value) {
-      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
-                value.prev_out_value, frame_size, value.gate_weight,
-                frame_size * 2, 1, value.gate_value, frame_size * 3);
-    }
-
-    detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
-                                 frame_size, batch_size, active_gate);
-
-    if (value.prev_out_value) {
-      blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
-                value.reset_output_value, frame_size, value.state_weight,
-                frame_size, 1, value.gate_value + frame_size * 2,
-                frame_size * 3);
-    }
-
-    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
-                                 frame_size, batch_size, active_node,
-                                 origin_mode);
-#endif
-  }
-};
-
-template <typename T>
-struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
-  static void compute(const platform::CPUDeviceContext &context,
-                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
-                      int frame_size, int batch_size,
-                      const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate,
-                      bool origin_mode) {
-#ifndef __NVCC__
-    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
-                                grad, frame_size, batch_size, active_node,
-                                origin_mode);
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    if (value.prev_out_value && grad.prev_out_grad) {
-      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
-                grad.gate_grad + frame_size * 2, frame_size * 3,
-                value.state_weight, frame_size, 0, grad.reset_output_grad,
-                frame_size);
-
-      if (grad.state_weight_grad) {
-        blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
-                  value.reset_output_value, frame_size,
-                  grad.gate_grad + frame_size * 2, frame_size * 3, 1,
-                  grad.state_weight_grad, frame_size);
-      }
-    }
-
-    detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
-                                grad, frame_size, batch_size, active_gate);
-    if (grad.prev_out_grad && value.prev_out_value) {
-      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
-                grad.gate_grad, frame_size * 3, value.gate_weight,
-                frame_size * 2, 1, grad.prev_out_grad, frame_size);
-
-      if (grad.gate_weight_grad) {
-        blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
-                  value.prev_out_value, frame_size, grad.gate_grad,
-                  frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2);
-      }
-    }
-#endif
-  }
-};
-
-template struct GRUUnitFunctor<platform::CPUDeviceContext, float>;
-template struct GRUUnitFunctor<platform::CPUDeviceContext, double>;
-template struct GRUUnitGradFunctor<platform::CPUDeviceContext, float>;
-template struct GRUUnitGradFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu
deleted file mode 100644
index b564f990b4920a3a01b6ce0dd53e8f5e5d0464aa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/gru_compute.cu
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/fluid/platform/device_context.h>
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
-#include "paddle/fluid/operators/math/detail/gru_kernel.h"
-#include "paddle/fluid/operators/math/gru_compute.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
-  static void compute(const platform::CUDADeviceContext &context,
-                      GRUMetaValue<T> value, int frame_size, int batch_size,
-                      const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate,
-                      bool origin_mode) {
-    auto stream = context.stream();
-    dim3 threads;
-    dim3 grid;
-    if (batch_size == 1) {
-      if (context.GetComputeCapability() >= 70) {
-        constexpr int tiled_size = 16;
-        int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
-        threads = dim3(tiled_size, 1);
-        grid = dim3(frame_blocks, 1);
-        detail::KeFastCollectiveGruGate<
-            T, tiled_size><<<grid, threads, 0, stream>>>(
-            value.gate_value, value.prev_out_value, value.gate_weight,
-            value.reset_output_value, frame_size, active_gate);
-
-        frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
-        grid = dim3(frame_blocks, 1);
-        detail::KeFastCollectiveGruOut<
-            T, tiled_size><<<grid, threads, 0, stream>>>(
-            value.state_weight, value.prev_out_value, value.output_value,
-            value.gate_value, value.reset_output_value, frame_size, active_node,
-            origin_mode);
-
-        return;
-      } else {
-        int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
-        int frame_blocks = (frame_size + 1024 - 1) / 1024;
-        threads = dim3(frame_per_block, 1);
-        grid = dim3(frame_blocks, 1);
-      }
-    } else {
-      threads = dim3(32, 32);
-      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
-    }
-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
-    if (value.prev_out_value) {
-      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
-                value.prev_out_value, frame_size, value.gate_weight,
-                frame_size * 2, 1, value.gate_value, frame_size * 3);
-    }
-
-    if (batch_size == 1) {
-      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* is_batch= */ false,
-                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gate_value,
-          value.reset_output_value, value.prev_out_value, frame_size,
-          batch_size, active_gate);
-    } else {
-      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* is_batch= */ true,
-                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gate_value,
-          value.reset_output_value, value.prev_out_value, frame_size,
-          batch_size, active_gate);
-    }
-
-    if (value.prev_out_value) {
-      blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
-                value.reset_output_value, frame_size, value.state_weight,
-                frame_size, 1, value.gate_value + frame_size * 2,
-                frame_size * 3);
-    }
-
-    if (batch_size == 1) {
-      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* is_batch= */ false,
-                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gate_value,
-          value.prev_out_value, value.output_value, frame_size, batch_size,
-          active_node, origin_mode);
-    } else {
-      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* is_batch= */ true,
-                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gate_value,
-          value.prev_out_value, value.output_value, frame_size, batch_size,
-          active_node, origin_mode);
-    }
-  }
-};
-
-template <typename T>
-struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
-  static void compute(const platform::CUDADeviceContext &context,
-                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
-                      int frame_size, int batch_size,
-                      const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate,
-                      bool origin_mode) {
-    auto stream = context.stream();
-    dim3 threads;
-    dim3 grid;
-    if (batch_size == 1) {
-      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
-      int frame_blocks = (frame_size + 1024 - 1) / 1024;
-      threads = dim3(frame_per_block, 1);
-      grid = dim3(frame_blocks, 1);
-    } else {
-      threads = dim3(32, 32);
-      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
-    }
-
-    if (batch_size == 1) {
-      detail::KeGruBackwardStateGrad<
-          detail::backward::gru_stateGrad<T>,
-          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gate_value,
-          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
-    } else {
-      detail::KeGruBackwardStateGrad<
-          detail::backward::gru_stateGrad<T>,
-          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gate_value,
-          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
-    }
-
-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
-
-    if (value.prev_out_value && grad.prev_out_grad) {
-      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
-                grad.gate_grad + frame_size * 2, frame_size * 3,
-                value.state_weight, frame_size, 0, grad.reset_output_grad,
-                frame_size);
-
-      if (grad.state_weight_grad) {
-        blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
-                  value.reset_output_value, frame_size,
-                  grad.gate_grad + frame_size * 2, frame_size * 3, 1,
-                  grad.state_weight_grad, frame_size);
-      }
-    }
-
-    if (batch_size == 1) {
-      detail::KeGruBackwardResetGrad<
-          detail::backward::gru_resetGrad<T>,
-          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gate_value,
-          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.reset_output_grad, frame_size, batch_size, active_gate);
-    } else {
-      detail::KeGruBackwardResetGrad<
-          detail::backward::gru_resetGrad<T>,
-          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gate_value,
-          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.reset_output_grad, frame_size, batch_size, active_gate);
-    }
-
-    if (grad.prev_out_grad && value.prev_out_value) {
-      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
-                grad.gate_grad, frame_size * 3, value.gate_weight,
-                frame_size * 2, 1, grad.prev_out_grad, frame_size);
-
-      if (grad.gate_weight_grad) {
-        blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
-                  value.prev_out_value, frame_size, grad.gate_grad,
-                  frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2);
-      }
-    }
-  }
-};
-
-template struct GRUUnitFunctor<platform::CUDADeviceContext, float>;
-template struct GRUUnitFunctor<platform::CUDADeviceContext, double>;
-template struct GRUUnitGradFunctor<platform::CUDADeviceContext, float>;
-template struct GRUUnitGradFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h
deleted file mode 100644
index f5ddec0aaa275a32a5a9937699066a170edc0825..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/gru_compute.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-struct GRUMetaValue {
-  T *gate_weight;
-  T *state_weight;
-  T *gate_value;
-  T *reset_output_value;
-  T *output_value;
-  T *prev_out_value;
-};
-
-template <typename T>
-struct GRUMetaGrad {
-  T *gate_weight_grad;
-  T *state_weight_grad;
-  T *gate_grad;
-  T *reset_output_grad;
-  T *output_grad;
-  T *prev_out_grad;
-};
-
-template <typename DeviceContext, typename T>
-struct GRUUnitFunctor {
-  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
-                      int frame_size, int batch_size,
-                      const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate,
-                      bool origin_mode);
-};
-
-template <typename DeviceContext, typename T>
-struct GRUUnitGradFunctor {
-  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
-                      GRUMetaGrad<T> grad, int frame_size, int batch_size,
-                      const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate,
-                      bool origin_mode);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
deleted file mode 100644
index 1472edbbf47e3e4d6b22c65349713904b13647d2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/im2col.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/im2col.h"
-#include <vector>
-#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [input_channels, filter_height, filter_width, output_height, output_width]
- */
-template <class T>
-class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& im, const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* col) {
-    PADDLE_ENFORCE(im.dims().size() == 3);
-    PADDLE_ENFORCE(col->dims().size() == 5);
-
-    if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
-        dilation[1] == 1) {
-      if (padding[0] == 0 && padding[1] == 0) {
-        im2col_sh1sw1dh1dw1ph0pw0<T>(im, col);
-        return;
-      } else if (padding[0] == 1 && padding[1] == 1) {
-        im2col_sh1sw1dh1dw1ph1pw1<T>(im, col);
-        return;
-      }
-      // TODO(TJ): complete padding >=2
-    }
-    im2col_common<T>(im, dilation, stride, padding, col);
-  }
-};
-
-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [input_channels, filter_height, filter_width, output_height, output_width]
- */
-template <class T>
-class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& col,
-                  const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* im) {
-    PADDLE_ENFORCE(im->dims().size() == 3);
-    PADDLE_ENFORCE(col.dims().size() == 5);
-    int im_channels = im->dims()[0];
-    int im_height = im->dims()[1];
-    int im_width = im->dims()[2];
-    int filter_height = col.dims()[1];
-    int filter_width = col.dims()[2];
-    int col_height = col.dims()[3];
-    int col_width = col.dims()[4];
-
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       ((dilation[0] * (filter_height - 1) + 1))) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       ((dilation[1] * (filter_width - 1) + 1))) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-
-    int channels_col = im_channels * filter_height * filter_width;
-
-    T* im_data = im->data<T>();
-    const T* col_data = col.data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < col_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < col_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
-              (im_col_idx) >= 0 && (im_col_idx) < im_width) {
-            im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] +=
-                col_data[(c * col_height + h) * col_width + w];
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CPUDeviceContext, float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CPUDeviceContext, double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CPUDeviceContext, float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CPUDeviceContext, double>;
-
-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [output_height, output_width, input_channels, filter_height, filter_width]
- */
-template <class T>
-class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& im, const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* col) {
-    PADDLE_ENFORCE(im.dims().size() == 3);
-    PADDLE_ENFORCE(col->dims().size() == 5);
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[3];
-    int filter_width = col->dims()[4];
-    int col_height = col->dims()[0];
-    int col_width = col->dims()[1];
-
-    const T* im_data = im.data<T>();
-    T* col_data = col->data<T>();
-
-    for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
-      for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
-        for (int channel = 0; channel < im_channels; ++channel) {
-          for (int filter_row_idx = 0; filter_row_idx < filter_height;
-               ++filter_row_idx) {
-            int im_row_offset =
-                col_row_idx * stride[0] + filter_row_idx - padding[0];
-            for (int filter_col_idx = 0; filter_col_idx < filter_width;
-                 ++filter_col_idx) {
-              int im_col_offset =
-                  col_col_idx * stride[1] + filter_col_idx - padding[1];
-
-              int col_offset =
-                  ((((col_row_idx)*col_width + col_col_idx) * im_channels +
-                    channel) *
-                       filter_height +
-                   filter_row_idx) *
-                      filter_width +
-                  filter_col_idx;
-
-              int im_offset = (channel * im_height + im_row_offset) * im_width +
-                              im_col_offset;
-              col_data[col_offset] =
-                  (im_row_offset < 0 || im_row_offset >= im_height ||
-                   im_col_offset < 0 || im_col_offset >= im_width)
-                      ? static_cast<T>(0)
-                      : im_data[im_offset];
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [output_height, output_width, input_channels, filter_height, filter_width]
- */
-template <class T>
-class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& col,
-                  const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* im) {
-    PADDLE_ENFORCE(im->dims().size() == 3);
-    PADDLE_ENFORCE(col.dims().size() == 5);
-    int im_channels = im->dims()[0];
-    int im_height = im->dims()[1];
-    int im_width = im->dims()[2];
-    int filter_height = col.dims()[3];
-    int filter_width = col.dims()[4];
-    int col_height = col.dims()[0];
-    int col_width = col.dims()[1];
-
-    PADDLE_ENFORCE_EQ(
-        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
-        col_height,
-        "Output_height and padding(padding_up, padding_down) are "
-        "inconsistent.");
-    PADDLE_ENFORCE_EQ(
-        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
-        col_width,
-        "col_width and padding(padding_left, padding_right) are "
-        "inconsistent.");
-
-    T* im_data = im->data<T>();
-    const T* col_data = col.data<T>();
-
-    for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
-      for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
-        for (int channel = 0; channel < im_channels; ++channel) {
-          for (int filter_row_idx = 0; filter_row_idx < filter_height;
-               ++filter_row_idx) {
-            int im_row_offset =
-                col_row_idx * stride[0] + filter_row_idx - padding[0];
-            for (int filter_col_idx = 0; filter_col_idx < filter_width;
-                 ++filter_col_idx) {
-              int im_col_offset =
-                  col_col_idx * stride[1] + filter_col_idx - padding[1];
-
-              int col_offset =
-                  (((col_row_idx * col_width + col_col_idx) * im_channels +
-                    channel) *
-                       filter_height +
-                   filter_row_idx) *
-                      filter_width +
-                  filter_col_idx;
-
-              if (im_row_offset >= 0 && im_row_offset < im_height &&
-                  im_col_offset >= 0 && im_col_offset < im_width) {
-                int im_offset =
-                    (channel * im_height + im_row_offset) * im_width +
-                    im_col_offset;
-                im_data[im_offset] += col_data[col_offset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CPUDeviceContext, float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CPUDeviceContext, double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CPUDeviceContext, float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu
deleted file mode 100644
index 809014ea3d6ce51fd0dae478e7f0bedca2420412..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/im2col.cu
+++ /dev/null
@@ -1,396 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <class T>
-__global__ void im2col(const T* data_im, int num_outs, int im_height,
-                       int im_width, int dilation_h, int dilation_w,
-                       int filter_height, int filter_width, int stride_height,
-                       int stride_width, int padding_height, int padding_width,
-                       int col_height, int col_width, T* data_col) {
-  const int index =
-      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < num_outs) {
-    int w_out = index % col_width;
-    int h_out = (index / col_width) % col_height;
-    int channel_in = index / col_width / col_height;
-    int channel_out = channel_in * filter_height * filter_width;
-    int h_in = h_out * stride_height - padding_height;
-    int w_in = w_out * stride_width - padding_width;
-
-    data_col += (channel_out * col_height + h_out) * col_width + w_out;
-    data_im += (channel_in * im_height + h_in) * im_width + w_in;
-    for (int i = 0; i < filter_height; ++i) {
-      for (int j = 0; j < filter_width; ++j) {
-        int rIdx = h_in + i * dilation_h;
-        int cIdx = w_in + j * dilation_w;
-        *data_col =
-            (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0)
-                ? 0
-                : data_im[i * dilation_h * im_width + j * dilation_w];
-        data_col += col_height * col_width;
-      }
-    }
-  }
-}
-
-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [input_channels, filter_height, filter_width, output_height, output_width]
- */
-template <class T>
-class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& im, const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* col) {
-    PADDLE_ENFORCE_EQ(im.dims().size(), 3);
-    PADDLE_ENFORCE_EQ(col->dims().size(), 5);
-
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int col_height = col->dims()[3];
-    int col_width = col->dims()[4];
-
-    int num_outputs = im_channels * col_height * col_width;
-    int blocks = (num_outputs + 1024 - 1) / 1024;
-    int block_x = 512;
-    int block_y = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(block_x, block_y);
-    im2col<T><<<grid, threads, 0, context.stream()>>>(
-        im.data<T>(), num_outputs, im_height, im_width, dilation[0],
-        dilation[1], filter_height, filter_width, stride[0], stride[1],
-        padding[0], padding[1], col_height, col_width, col->data<T>());
-  }
-};
-
-template <class T>
-__global__ void col2im(int n, const T* data_col, int im_height, int im_width,
-                       int dilation_h, int dilation_w, int filter_height,
-                       int filter_width, int stride_height, int stride_width,
-                       int padding_height, int padding_width, int col_height,
-                       int col_width, T* data_im) {
-  const int index =
-      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-
-  const int d_filter_height = dilation_h * (filter_height - 1) + 1;
-  const int d_filter_width = dilation_w * (filter_width - 1) + 1;
-
-  if (index < n) {
-    T val = 0;
-    int w = index % im_width + padding_width;
-    int h = (index / im_width) % im_height + padding_height;
-    int c = index / (im_width * im_height);
-
-    // compute the start and end of the output
-    int w_col_start =
-        (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1;
-    int w_col_end = min(w / stride_width + 1, col_width);
-    int h_col_start =
-        (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1;
-    int h_col_end = min(h / stride_height + 1, col_height);
-
-    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-        int h_off = (h - h_col * stride_height);
-        int w_off = (w - w_col * stride_width);
-        if (h_off % dilation_h == 0 && w_off % dilation_w == 0) {
-          h_off /= dilation_h;
-          w_off /= dilation_w;
-          int data_col_index =
-              (((c * filter_height + h_off) * filter_width + w_off) *
-                   col_height +
-               h_col) *
-                  col_width +
-              w_col;
-
-          val += data_col[data_col_index];
-        }
-      }
-    }
-    data_im[index] = val;
-  }
-}
-
-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [input_channels, filter_height, filter_width, output_height, output_width]
- */
-template <class T>
-class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& col,
-                  const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* im) {
-    PADDLE_ENFORCE_EQ(im->dims().size(), 3);
-    PADDLE_ENFORCE_EQ(col.dims().size(), 5);
-
-    int im_channels = im->dims()[0];
-    int im_height = im->dims()[1];
-    int im_width = im->dims()[2];
-    int filter_height = col.dims()[1];
-    int filter_width = col.dims()[2];
-    int col_height = col.dims()[3];
-    int col_width = col.dims()[4];
-
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
-    size_t num_kernels = im_channels * im_height * im_width;
-
-    size_t blocks = (num_kernels + 1024 - 1) / 1024;
-    size_t block_x = 512;
-    size_t block_y = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(block_x, block_y);
-
-    // To avoid involving atomic operations, we will launch one kernel per
-    // bottom dimension, and then in the kernel add up the top dimensions.
-    col2im<T><<<grid, threads, 0, context.stream()>>>(
-        num_kernels, col.data<T>(), im_height, im_width, dilation[0],
-        dilation[1], filter_height, filter_width, stride[0], stride[1],
-        padding[0], padding[2], col_height, col_width, im->data<T>());
-  }
-};
-
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext, float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext, double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext, float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext, double>;
-
-template <class T>
-__global__ void im2colOCF(const T* im_data, int im_channels, int im_height,
-                          int im_width, int filter_height, int filter_width,
-                          int stride_height, int stride_width,
-                          int padding_height, int padding_width, int col_height,
-                          int col_width, T* col_data) {
-  int swid = blockIdx.x;
-  int shid = blockIdx.y;
-  for (int channelid = threadIdx.z; channelid < im_channels;
-       channelid += blockDim.z) {
-    for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
-      for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
-        int width_offset = idx + swid * stride_width - padding_width;
-        int height_offset = idy + shid * stride_height - padding_height;
-        int im_offset = width_offset + height_offset * im_width +
-                        channelid * im_height * im_width;
-
-        int col_offset = idx + idy * filter_width +
-                         channelid * filter_height * filter_width +
-                         (shid * col_width + swid) *
-                             (im_channels * filter_height * filter_width);
-
-        col_data[col_offset] =
-            (height_offset >= im_height || height_offset < 0 ||
-             width_offset >= im_width || width_offset < 0)
-                ? T(0)
-                : im_data[im_offset];
-      }
-    }
-  }
-}
-
-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [output_height, output_width, input_channels, filter_height, filter_width]
- */
-template <class T>
-class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& im, const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* col) {
-    PADDLE_ENFORCE_EQ(im.dims().size(), 3);
-    PADDLE_ENFORCE_EQ(col->dims().size(), 5);
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[3];
-    int filter_width = col->dims()[4];
-    int col_height = col->dims()[0];
-    int col_width = col->dims()[1];
-
-    int block_dim_x = 0;
-    int block_dim_y = 0;
-    if (filter_height <= 4 && filter_width <= 4) {
-      block_dim_x = 4;
-      block_dim_y = 4;
-    } else if (filter_height <= 8 && filter_width <= 8) {
-      block_dim_x = 8;
-      block_dim_y = 8;
-    } else if (filter_height <= 16 && filter_width <= 16) {
-      block_dim_x = 16;
-      block_dim_y = 16;
-    } else {
-      block_dim_x = 32;
-      block_dim_y = 32;
-    }
-
-    int block_dim_z = 1024 / block_dim_x / block_dim_y;
-    dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
-    dim3 grid(col_width, col_height);
-    im2colOCF<T><<<grid, threads, 0, context.stream()>>>(
-        im.data<T>(), im_channels, im_height, im_width, filter_height,
-        filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
-        col_width, col->data<T>());
-  }
-};
-
-template <class T>
-__global__ void col2imOCF(const T* col_data, int im_channels, int im_height,
-                          int im_width, int filter_height, int filter_width,
-                          int stride_height, int stride_width,
-                          int padding_height, int padding_width, int col_height,
-                          int col_width, T* im_data) {
-  int swid = blockIdx.x;
-  int shid = blockIdx.y;
-  for (int channelid = threadIdx.z; channelid < im_channels;
-       channelid += blockDim.z) {
-    for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
-      for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
-        int width_offset = idx + swid * stride_width - padding_width;
-        int height_offset = idy + shid * stride_height - padding_height;
-        int im_offset = width_offset + height_offset * im_width +
-                        channelid * im_height * im_width;
-
-        int col_offset = idx + idy * filter_width +
-                         channelid * filter_height * filter_width +
-                         (shid * col_width + swid) *
-                             (im_channels * filter_height * filter_width);
-
-        if (height_offset >= 0 && height_offset < im_height &&
-            width_offset >= 0 && width_offset < im_width) {
-          paddle::platform::CudaAtomicAdd(im_data + im_offset,
-                                          col_data[col_offset]);
-        }
-      }
-    }
-  }
-}
-
-/*
- * im = [input_channels, input_height, input_width]
- * col =
- *   [output_height, output_width, input_channels, filter_height, filter_width]
- */
-template <class T>
-class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& col,
-                  const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* im) {
-    PADDLE_ENFORCE_EQ(im->dims().size(), 3);
-    PADDLE_ENFORCE_EQ(col.dims().size(), 5);
-    int im_channels = im->dims()[0];
-    int im_height = im->dims()[1];
-    int im_width = im->dims()[2];
-    int filter_height = col.dims()[3];
-    int filter_width = col.dims()[4];
-    int col_height = col.dims()[0];
-    int col_width = col.dims()[1];
-
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
-    int block_dim_x = 0;
-    int block_dim_y = 0;
-    if (filter_height <= 4 && filter_width <= 4) {
-      block_dim_x = 4;
-      block_dim_y = 4;
-    } else if (filter_height <= 8 && filter_width <= 8) {
-      block_dim_x = 8;
-      block_dim_y = 8;
-    } else if (filter_height <= 16 && filter_width <= 16) {
-      block_dim_x = 16;
-      block_dim_y = 16;
-    } else {
-      block_dim_x = 32;
-      block_dim_y = 32;
-    }
-
-    int block_dim_z = 1024 / block_dim_x / block_dim_y;
-    dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
-    dim3 grid(col_width, col_height);
-    col2imOCF<T><<<grid, threads, 0, context.stream()>>>(
-        col.data<T>(), im_channels, im_height, im_width, filter_height,
-        filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
-        col_width, im->data<T>());
-  }
-};
-
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext, float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext, double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext, float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col.h b/paddle/fluid/operators/math/im2col.h
deleted file mode 100644
index 26d94e0f2e6163eb7452cf1fbea5966b4344ace1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/im2col.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
-enum class ColFormat { kCFO = 0, kOCF = 1 };
-
-/*
- * \brief Converts the image data of three dimensions(CHW) into a colData of
- *        five dimensions in the Im2ColFunctor calculation,
- *        And in the Col2ImFunctor calculation, it is reversed.
- *
- * \param imData   Image data.
- * \param imShape  The shape of imData,
- *                 [input_channels, input_height, input_width].
- * \param colData  Column data.
- * \param colShape The shape of colData.
- *
- * \param dilations    dilation data.
- * \param 2-dimension  [dilation_height, dilation_width].
- *
- * \param strides      stride data.
- * \param 2-dimension  [stride_height, stride_width].
- *
- * \param paddings     padding data.
- * \param 4-dimension  [up_pad, left_pad, down_pad, right_pad].
- *
- * If the template argument Format is kCFO, the shape of colData is:
- * [input_channels, filter_height, filter_width, output_height, output_width]
- * So, it is easy to reshape into a convolution matrix for convolution
- * calculation based on matrix multiplication.
- * The shape of convolution matrix is [height, width], where the height is equal
- * input_channels * filter_height * filter_width, and the width is equal
- * output_height * output_width.
- *
- * Reshape:
- *     shape of colData           shape of convolution matrix
- *     [input_channels,
- *      filter_height,
- *      filter_width,      ======>      [height, width]
- *      output_height,
- *      output_width]
- *
- * If the template argument Format is kOCF, the shape of colData is:
- * [output_height, output_width, input_channels, filter_height, filter_width]
- * So, it is easy to reshape into a sequence matrix for rnn calculation.
- * The shape of sequence matrix is [seq_length, step_size], where the seq_length
- * is equal output_height * output_width, and the step_size is equal
- * input_channels * filter_height * filter_width.
- *
- * Reshape:
- *     shape of colData             shape of sequence matrix
- *     [output_height,
- *      output_width,
- *      input_channels,    ======>    [seqLength, stepSize]
- *      filter_height,
- *      filter_width]
- *
- * \note The caller needs to ensure that imShape.inputChannels is equal to
- *       colShape.inputChannels.
- */
-template <ColFormat Format, typename DeviceContext, typename T>
-class Im2ColFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& im,
-                  const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* col);
-};
-
-template <ColFormat Format, typename DeviceContext, typename T>
-class Col2ImFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& col,
-                  const std::vector<int>& dilation,
-                  const std::vector<int>& stride,
-                  const std::vector<int>& padding, framework::Tensor* im);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
deleted file mode 100644
index 0d32bc5bd0d7f25479370959cabeb9b9c9e7e2d6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/im2col_cfo_cpu.h
+++ /dev/null
@@ -1,252 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/**
- * The most common im2col algorithm.
- * Support dilation, stride and padding.
- */
-template <typename T>
-inline void im2col_common(const framework::Tensor& im,
-                          const std::vector<int>& dilation,
-                          const std::vector<int>& stride,
-                          const std::vector<int>& padding,
-                          framework::Tensor* col) {
-  int im_channels = im.dims()[0];
-  int im_height = im.dims()[1];
-  int im_width = im.dims()[2];
-  int filter_height = col->dims()[1];
-  int filter_width = col->dims()[2];
-  int output_height = col->dims()[3];
-  int output_width = col->dims()[4];
-  int channels_col = im_channels * filter_height * filter_width;
-
-  const T* im_data = im.data<T>();
-  T* col_data = col->data<T>();
-  for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % filter_width;
-    int h_offset = (c / filter_width) % filter_height;
-    int c_im = c / (filter_width * filter_height);
-    for (int h = 0; h < output_height; ++h) {
-      int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-      for (int w = 0; w < output_width; ++w) {
-        int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-        int col_idx = (c * output_height + h) * output_width + w;
-        int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-        col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                             im_col_idx < 0 || im_col_idx >= im_width)
-                                ? static_cast<T>(0)
-                                : im_data[im_idx];
-      }
-    }
-  }
-}
-
-/**
- * im2col algorithm with strides == 1, dilations == 1, paddings == 0
- */
-template <typename T>
-inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
-                                      framework::Tensor* col) {
-  int im_channels = im.dims()[0];
-  int im_height = im.dims()[1];
-  int im_width = im.dims()[2];
-  int filter_height = col->dims()[1];
-  int filter_width = col->dims()[2];
-  int output_height = col->dims()[3];
-  int output_width = col->dims()[4];
-
-  const T* im_data = im.data<T>();
-  T* col_data = col->data<T>();
-  int col_matrix_width = output_width * output_height;
-  int im_size = im_height * im_width;
-  size_t copy_size = sizeof(T) * output_width;
-  const T* im_data_oh = im_data;
-  T* dst_data_oh = col_data;
-  for (int oh = 0; oh < output_height; ++oh) {
-    const T* src_data_ic = im_data_oh;
-    T* dst_data = dst_data_oh;
-    for (int ic = 0; ic < im_channels; ++ic) {
-      const T* src_data = src_data_ic;
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          std::memcpy(dst_data, src_data + kw, copy_size);
-          dst_data = dst_data + col_matrix_width;
-        }
-        src_data = src_data + im_width;
-      }
-      src_data_ic = src_data_ic + im_size;
-    }
-    im_data_oh = im_data_oh + im_width;
-    dst_data_oh = dst_data_oh + output_width;
-  }
-}
-
-/**
- * im2col algorithm with strides == 1, dilations == 1, paddings == 1
- * and filter_width == 1 have a special implementation
- */
-template <typename T>
-inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
-                                      framework::Tensor* col) {
-  int im_channels = im.dims()[0];
-  int im_height = im.dims()[1];
-  int im_width = im.dims()[2];
-  int filter_height = col->dims()[1];
-  int filter_width = col->dims()[2];
-  int output_height = col->dims()[3];
-  int output_width = col->dims()[4];
-
-  constexpr int plh = 1;
-  constexpr int prh = 1;
-  constexpr int plw = 1;
-  constexpr int prw = 1;
-
-  const T* im_data = im.data<T>();
-  T* col_data = col->data<T>();
-  int im_size = im_height * im_width;
-  int col_matrix_width = output_width * output_height;
-  int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
-  int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow
-
-  // fill height padding
-  {
-    size_t copy_size = sizeof(T) * output_width;
-    T* col_start_l = col_data;
-    T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
-                     col_matrix_width - output_width;
-    for (int ic = 0; ic < im_channels; ++ic) {
-      T* dst_data_l = col_start_l;
-      T* dst_data_r = col_start_r;
-      for (int kw = 0; kw < filter_width; ++kw) {
-        std::memset(dst_data_l, 0, copy_size);
-        std::memset(dst_data_r, 0, copy_size);
-        dst_data_l = dst_data_l + col_matrix_width;
-        dst_data_r = dst_data_r + col_matrix_width;
-      }
-      col_start_l = col_start_l + col_block_ic;
-      col_start_r = col_start_r + col_block_ic;
-    }
-  }
-
-  auto pad = static_cast<T>(0);
-  if (filter_width == 1) {
-    // fill width padding
-    T* dst_data_ic = col_data;
-    for (int ic = 0; ic < im_channels; ++ic) {
-      T* dst_data_kh = dst_data_ic;
-      for (int kh = 0; kh < filter_height; ++kh) {
-        T* dst_data = dst_data_kh;
-        for (int oh = 0; oh < output_height; ++oh) {
-          *dst_data = pad;
-          dst_data = dst_data + output_width - 1;
-          *dst_data = pad;
-          ++dst_data;
-        }
-        dst_data_kh = dst_data_kh + col_block_fh;
-      }
-      dst_data_ic = dst_data_ic + col_block_ic;
-    }
-    // fill core
-    size_t copy_size = sizeof(T) * (output_width - plw - prw);
-    for (int oh = 0; oh < output_height; ++oh) {
-      const T* im_data_start =
-          im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
-      T* dst_data = col_data + oh * output_width;
-      for (int ic = 0; ic < im_channels; ++ic) {
-        const T* src_data = im_data_start + ic * im_size;
-        for (int kh = 0; kh < filter_height; ++kh) {
-          if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
-                                         kh > (filter_height - prh - 1))) {
-            dst_data = dst_data + col_matrix_width;
-            continue;
-          }
-          std::memcpy(dst_data + plw, src_data, copy_size);
-          dst_data = dst_data + col_matrix_width;
-          src_data = src_data + im_width;
-        }
-      }
-    }
-    return;
-  }
-
-  // filter_width != 1
-  // fill width padding
-  T* dst_data_ic = col_data;
-  for (int ic = 0; ic < im_channels; ++ic) {
-    T* dst_data_kh = dst_data_ic;
-    for (int kh = 0; kh < filter_height; ++kh) {
-      for (T* dst_data :
-           {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
-                             output_width - 1}) {
-        // TODO(TJ): from plh, saving repeated assignment
-        for (int oh = 0; oh < output_height; ++oh) {
-          *dst_data = pad;
-          dst_data = dst_data + output_width;
-        }
-      }
-      dst_data_kh = dst_data_kh + col_block_fh;
-    }
-    dst_data_ic = dst_data_ic + col_block_ic;
-  }
-
-  // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
-  // (output_width-1)}
-  // length of copy_size is equal kw.
-  for (int oh = 0; oh < output_height; ++oh) {
-    const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
-    T* dst_data = col_data + oh * output_width;
-    for (int ic = 0; ic < im_channels; ++ic) {
-      const T* src_data = im_data_start + ic * im_size;
-      for (int kh = 0; kh < filter_height; ++kh) {
-        if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
-                                       kh > (filter_height - prh - 1))) {
-          dst_data = dst_data + filter_width * col_matrix_width;
-          continue;
-        }
-        // TODO(TJ): reuse plw-kw outside this for
-        // try to unify
-        for (int kw = 0; kw < plw; ++kw) {
-          std::memcpy(dst_data + (plw - kw), src_data,
-                      sizeof(T) * (output_width - (plw - kw)));
-          dst_data = dst_data + col_matrix_width;
-        }
-        for (int kw = plw; kw < filter_width - prw; ++kw) {
-          std::memcpy(dst_data, src_data + (kw - plw),
-                      sizeof(T) * output_width);
-          dst_data = dst_data + col_matrix_width;
-        }
-        int i = 1;
-        for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
-          std::memcpy(dst_data, src_data + (kw - plw),
-                      sizeof(T) * (output_width - i));
-          dst_data = dst_data + col_matrix_width;
-        }
-        src_data = src_data + im_width;
-      }
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
deleted file mode 100644
index 521cd7801abd6bea0c4f27422141742921dddf53..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ /dev/null
@@ -1,272 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/im2col.h"
-#include <gtest/gtest.h>
-#include <vector>
-#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
-#include "paddle/fluid/platform/port.h"
-
-template <typename DeviceContext, typename Place>
-void testIm2col() {
-  paddle::framework::Tensor input_tmp;
-  paddle::framework::Tensor input;
-  paddle::framework::Tensor output_cfo;
-  paddle::framework::Tensor output_ocf;
-  paddle::framework::Tensor output_tmp;
-
-  /**
-   * input = [0, 1, 2,
-   *          3, 4, 5]
-   *
-   * output_cfo = [0, 1
-   *               1, 2
-   *               3, 4
-   *               4, 5]
-   *
-   * output_ocf = [0, 1, 3, 4
-   *               1, 2, 4, 5]
-   *
-   * col2im_cfo = [0, 2, 2
-   *               3, 4, 5]
-   *
-   * col2im_ocf = [0, 2, 2
-   *               3, 4, 5]
-   */
-  int input_height = 2;
-  int input_width = 3;
-  int filter_size = 2;
-  std::vector<int> stride({1, 1});  // stride_y, stride_x
-  std::vector<int> padding(
-      {0, 0, 0, 0});                  // up_pad, left_pad, down_pad, right_pad
-  std::vector<int> dilation({1, 1});  // dilation_y, dilation_x
-  int output_height =
-      (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1;
-  int output_width =
-      (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1;
-  float* input_ptr = input_tmp.mutable_data<float>(
-      {1, input_height, input_width}, paddle::platform::CPUPlace());
-  float arr[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input_ptr, arr, 6 * sizeof(float));
-
-  auto* place = new Place();
-  DeviceContext* context = new DeviceContext(*place);
-  if (paddle::platform::is_cpu_place(*place)) {
-    input = input_tmp;
-  } else {
-    TensorCopySync(input_tmp, *place, &input);
-  }
-  output_cfo.mutable_data<float>(
-      {1, filter_size, filter_size, output_height, output_width}, *place);
-  output_ocf.mutable_data<float>(
-      {output_height, output_width, 1, filter_size, filter_size}, *place);
-
-  // Im2Col
-  paddle::operators::math::Im2ColFunctor<
-      paddle::operators::math::ColFormat::kCFO, DeviceContext, float>
-      im2col;
-  paddle::operators::math::Im2ColFunctor<
-      paddle::operators::math::ColFormat::kOCF, DeviceContext, float>
-      im2col_ocf;
-
-  im2col(*context, input, dilation, stride, padding, &output_cfo);
-  im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
-
-  float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
-  float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
-
-  float* out_cfo_ptr;
-  if (paddle::platform::is_cpu_place(*place)) {
-    out_cfo_ptr = output_cfo.data<float>();
-  } else {
-    TensorCopySync(output_cfo, paddle::platform::CPUPlace(), &output_tmp);
-    out_cfo_ptr = output_tmp.data<float>();
-  }
-  for (int i = 0; i < 6; ++i) {
-    EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
-  }
-
-  float* out_ocf_ptr;
-  if (paddle::platform::is_cpu_place(*place)) {
-    out_ocf_ptr = output_ocf.data<float>();
-  } else {
-    TensorCopySync(output_ocf, paddle::platform::CPUPlace(), &output_tmp);
-    out_ocf_ptr = output_tmp.data<float>();
-  }
-
-  for (int i = 0; i < 6; ++i) {
-    EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
-  }
-
-  // Col2Im: kCFO
-  paddle::operators::math::Col2ImFunctor<
-      paddle::operators::math::ColFormat::kCFO, DeviceContext, float>
-      col2im;
-  paddle::operators::math::Col2ImFunctor<
-      paddle::operators::math::ColFormat::kOCF, DeviceContext, float>
-      col2im_ocf;
-  float col2im_data[] = {0, 2, 2, 3, 8, 5};
-
-  memset(input_ptr, 0, 6 * sizeof(float));
-  if (paddle::platform::is_cpu_place(*place)) {
-    input = input_tmp;
-  } else {
-    TensorCopySync(input_tmp, *place, &input);
-  }
-
-  col2im(*context, output_cfo, dilation, stride, padding, &input);
-
-  float* in_ptr;
-  if (paddle::platform::is_cpu_place(*place)) {
-    in_ptr = input.data<float>();
-  } else {
-    TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
-    in_ptr = input_tmp.data<float>();
-  }
-  for (int i = 0; i < 6; ++i) {
-    EXPECT_EQ(in_ptr[i], col2im_data[i]);
-  }
-
-  // Col2Im: kOCF
-  memset(input_ptr, 0, 6 * sizeof(float));
-  if (paddle::platform::is_cpu_place(*place)) {
-    input = input_tmp;
-  } else {
-    TensorCopySync(input_tmp, *place, &input);
-  }
-
-  col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
-
-  if (paddle::platform::is_cpu_place(*place)) {
-    in_ptr = input.data<float>();
-  } else {
-    TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
-    in_ptr = input_tmp.data<float>();
-  }
-  for (int i = 0; i < 6; ++i) {
-    EXPECT_EQ(in_ptr[i], col2im_data[i]);
-  }
-
-  delete place;
-  delete context;
-}
-
-TEST(math, im2col) {
-  testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
-#ifdef PADDLE_WITH_CUDA
-  testIm2col<paddle::platform::CUDADeviceContext,
-             paddle::platform::CUDAPlace>();
-#endif
-}
-
-#define PREPARE_IM2COL_CPU                                                   \
-  paddle::platform::CPUPlace place;                                          \
-  paddle::platform::CPUDeviceContext context(place);                         \
-  paddle::framework::Tensor input;                                           \
-  paddle::framework::Tensor out;                                             \
-  paddle::framework::Tensor ref;                                             \
-  std::vector<int> padding({ph, pw});                                        \
-  std::vector<int> stride({1, 1});                                           \
-  std::vector<int> dilation({1, 1});                                         \
-  float* input_ptr = input.mutable_data<float>({ic, ih, iw}, place);         \
-  for (int i = 0; i < input.numel(); ++i) {                                  \
-    input_ptr[i] = static_cast<float>(i + 1);                                \
-  }                                                                          \
-  int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1;            \
-  int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1;             \
-  out.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
-  ref.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
-  paddle::operators::math::Im2ColFunctor<                                    \
-      paddle::operators::math::ColFormat::kCFO,                              \
-      paddle::platform::CPUDeviceContext, float>                             \
-      im2col
-
-void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
-  PREPARE_IM2COL_CPU;
-
-  im2col(context, input, dilation, stride, padding, &out);
-  paddle::operators::math::im2col_common<float>(input, dilation, stride,
-                                                padding, &ref);
-
-  float* ref_data = ref.data<float>();
-  float* out_data = out.data<float>();
-  for (int i = 0; i < out.numel(); ++i) {
-    EXPECT_EQ(out_data[i], ref_data[i]);
-  }
-}
-
-void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
-  PREPARE_IM2COL_CPU;
-  constexpr int repeat = 100;
-  auto GetCurrentMs = []() -> double {
-    struct timeval time;
-    gettimeofday(&time, NULL);
-    return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
-  };
-  auto t1 = GetCurrentMs();
-  for (int i = 0; i < repeat; ++i) {
-    im2col(context, input, dilation, stride, padding, &out);
-  }
-  auto t2 = GetCurrentMs();
-
-  for (int i = 0; i < repeat; ++i) {
-    paddle::operators::math::im2col_common<float>(input, dilation, stride,
-                                                  padding, &ref);
-  }
-  auto t3 = GetCurrentMs();
-
-  LOG(INFO) << "before: " << (t3 - t2) / repeat
-            << ",after: " << (t2 - t1) / repeat
-            << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%";
-}
-
-TEST(math, im2col_cputest) {
-  // padding_h == padding_w
-  for (int p = 0; p < 4; ++p) {
-    // width == height
-    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p,
-                  /*pw*/ p);
-    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p,
-                  /*pw*/ p);
-    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p,
-                  /*pw*/ p);
-
-    // height != width
-    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p,
-                  /*pw*/ p);
-    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p,
-                  /*pw*/ p);
-    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p,
-                  /*pw*/ p);
-
-    // filter == 1
-    testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
-                  /*pw*/ p);
-    testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
-                  /*pw*/ p);
-  }
-
-  // padding_h != padding_w
-  testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1,
-                /*pw*/ 2);
-
-  // benchmark
-  for (int p : {0, 1}) {
-    for (int k : {1, 3, 5}) {
-      LOG(INFO) << "padding == " << p << ", filter == " << k;
-      benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k,
-                  /*ph*/ p, /*pw*/ p);
-    }
-  }
-}
diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc
deleted file mode 100644
index 94bbcbb50670d9f0b11b77cf6a54a99c227521bf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/lstm_compute.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h"
-#include "paddle/fluid/operators/math/detail/lstm_kernel.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <class T>
-struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
-  static void compute(const platform::CPUDeviceContext& context,
-                      LstmMetaValue<T> value, int frame_size, int batch_size,
-                      T cell_clip, const detail::ActivationType& gate_act,
-                      const detail::ActivationType& cell_act,
-                      const detail::ActivationType& cand_act) {
-    for (int b = 0; b < batch_size; b++) {
-      detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
-                               cell_clip, cand_act, gate_act, cell_act);
-      value.gate_value += frame_size * 4;
-      value.state_value += frame_size;
-      value.state_active_value += frame_size;
-      value.output_value += frame_size;
-      if (value.prev_state_value) {
-        value.prev_state_value += frame_size;
-      }
-    }
-  }
-};
-
-template <class T>
-struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
-  static void compute(const platform::CPUDeviceContext& context,
-                      LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                      int frame_size, int batch_size, T cell_clip,
-                      const detail::ActivationType& gate_act,
-                      const detail::ActivationType& cell_act,
-                      const detail::ActivationType& cand_act) {
-    for (int b = 0; b < batch_size; b++) {
-      detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
-                                frame_size, cell_clip, cand_act, gate_act,
-                                cell_act);
-
-      value.gate_value += frame_size * 4;
-      value.state_value += frame_size;
-      value.state_active_value += frame_size;
-      value.output_value += frame_size;
-      if (value.prev_state_value) {
-        value.prev_state_value += frame_size;
-      }
-
-      grad.gate_grad += frame_size * 4;
-      grad.state_grad += frame_size;
-      grad.state_active_grad += frame_size;
-      grad.output_grad += frame_size;
-      if (grad.prev_state_grad) {
-        grad.prev_state_grad += frame_size;
-      }
-    }
-  }
-};
-
-template class LstmUnitFunctor<platform::CPUDeviceContext, float>;
-template class LstmUnitFunctor<platform::CPUDeviceContext, double>;
-template class LstmUnitGradFunctor<platform::CPUDeviceContext, float>;
-template class LstmUnitGradFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu
deleted file mode 100644
index e7445d3d40ae92ff66e7d33a38bfdebfc8455f0a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/lstm_compute.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/detail/lstm_gpu_kernel.h"
-#include "paddle/fluid/operators/math/detail/lstm_kernel.h"
-#include "paddle/fluid/operators/math/lstm_compute.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <class T>
-struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
-  static void compute(const platform::CUDADeviceContext& context,
-                      LstmMetaValue<T> value, int frame_size, int batch_size,
-                      T cell_clip, const detail::ActivationType& gate_act,
-                      const detail::ActivationType& cell_act,
-                      const detail::ActivationType& cand_act) {
-    detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
-                                frame_size, batch_size, cell_clip, cand_act,
-                                gate_act, cell_act);
-  }
-};
-
-template <class T>
-struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
-  static void compute(const platform::CUDADeviceContext& context,
-                      LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                      int frame_size, int batch_size, T cell_clip,
-                      const detail::ActivationType& gate_act,
-                      const detail::ActivationType& cell_act,
-                      const detail::ActivationType& cand_act) {
-    detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
-                              frame_size, batch_size, cell_clip, cand_act,
-                              gate_act, cell_act);
-  }
-};
-
-template class LstmUnitFunctor<platform::CUDADeviceContext, float>;
-template class LstmUnitFunctor<platform::CUDADeviceContext, double>;
-template class LstmUnitGradFunctor<platform::CUDADeviceContext, float>;
-template class LstmUnitGradFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/lstm_compute.h b/paddle/fluid/operators/math/lstm_compute.h
deleted file mode 100644
index 80af5639387aaf6a983365e13c3478353c27a617..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/lstm_compute.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <class T>
-struct LstmMetaValue {
-  T *gate_value;
-  T *prev_state_value;
-  T *state_value;
-  T *state_active_value;
-  T *output_value;
-  T *check_ig;
-  T *check_fg;
-  T *check_og;
-};
-
-template <class T>
-struct LstmMetaGrad {
-  T *gate_grad;
-  T *prev_state_grad;
-  T *state_grad;
-  T *state_active_grad;
-  T *output_grad;
-  T *check_ig_grad;
-  T *check_fg_grad;
-  T *check_og_grad;
-};
-
-template <typename DeviceContext, typename T>
-class LstmUnitFunctor {
- public:
-  static void compute(const DeviceContext &context, LstmMetaValue<T> value,
-                      int frame_size, int batch_size, T cell_clip,
-                      const detail::ActivationType &gate_act,
-                      const detail::ActivationType &cell_act,
-                      const detail::ActivationType &cand_act);
-};
-
-template <typename DeviceContext, typename T>
-class LstmUnitGradFunctor {
- public:
-  static void compute(const DeviceContext &context, LstmMetaValue<T> value,
-                      LstmMetaGrad<T> grad, int frame_size, int batch_size,
-                      T cell_clip, const detail::ActivationType &gate_act,
-                      const detail::ActivationType &cell_act,
-                      const detail::ActivationType &cand_act);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
deleted file mode 100644
index e1491a8156ca3de96d7ab669d11d886833b73a5b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/math_function.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/math_function.h"
-
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-
-#ifdef PADDLE_USE_OPENBLAS
-#include <cblas.h>
-#endif
-
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/math/math_function_impl.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using float16 = paddle::platform::float16;
-
-template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
-template struct SetConstant<platform::CPUDeviceContext, float>;
-template struct SetConstant<platform::CPUDeviceContext, double>;
-template struct SetConstant<platform::CPUDeviceContext, int>;
-template struct SetConstant<platform::CPUDeviceContext, int64_t>;
-template struct SetConstant<platform::CPUDeviceContext, bool>;
-template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
-
-#define DEFINE_CPU_TRANS(RANK)                                             \
-  template struct Transpose<platform::CPUDeviceContext, platform::float16, \
-                            RANK>;                                         \
-  template struct Transpose<platform::CPUDeviceContext, float, RANK>;      \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;     \
-  template struct Transpose<platform::CPUDeviceContext, int, RANK>;        \
-  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;    \
-  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;       \
-  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>;    \
-  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;    \
-  template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;
-
-DEFINE_CPU_TRANS(1);
-DEFINE_CPU_TRANS(2);
-DEFINE_CPU_TRANS(3);
-DEFINE_CPU_TRANS(4);
-DEFINE_CPU_TRANS(5);
-DEFINE_CPU_TRANS(6);
-
-struct TensorSetConstantCPU {
-  TensorSetConstantCPU(framework::Tensor* tensor, float value)
-      : tensor_(tensor), value_(value) {}
-  template <typename T>
-  void apply() const {
-    auto cpu = platform::CPUPlace();
-    auto* begin = tensor_->mutable_data<T>(cpu);
-    std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
-  }
-  framework::Tensor* tensor_;
-  float value_;
-};
-
-template <>
-void set_constant_with_place<platform::CPUPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
-}
-
-template <>
-void set_constant_with_place<platform::CUDAPinnedPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
-}
-
-struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
-  TensorSetConstantWithPlace(const platform::DeviceContext& context,
-                             framework::Tensor* tensor, float value)
-      : context_(context), tensor_(tensor), value_(value) {}
-
-  template <typename Place>
-  void operator()(Place place) const {
-    set_constant_with_place<Place>(context_, tensor_, value_);
-  }
-
-  const platform::DeviceContext& context_;
-  framework::Tensor* tensor_;
-  float value_;
-};
-
-void set_constant(const platform::DeviceContext& context,
-                  framework::Tensor* tensor, float value) {
-  TensorSetConstantWithPlace func(context, tensor, value);
-#ifdef PADDLE_WITH_CUDA
-  tensor->place().apply_visitor(func);
-#else
-  func(platform::CPUPlace());
-#endif
-}
-
-template <typename T>
-struct RowwiseAdd<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& vector, framework::Tensor* output) {
-    auto in_dims = input.dims();
-    auto size = input.numel() / in_dims[0];
-    PADDLE_ENFORCE_EQ(vector.numel(), size);
-    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
-
-    auto in = framework::EigenMatrix<T>::From(input);
-    auto vec = framework::EigenVector<T>::Flatten(vector);
-    auto out = framework::EigenMatrix<T>::From(*output);
-
-    for (int64_t i = 0; i < in_dims[0]; ++i) {
-      out.chip(i, 0) = in.chip(i, 0) + vec;
-    }
-  }
-};
-
-template struct RowwiseAdd<platform::CPUDeviceContext, float>;
-template struct RowwiseAdd<platform::CPUDeviceContext, double>;
-
-template struct ColwiseSum<platform::CPUDeviceContext, float>;
-template struct ColwiseSum<platform::CPUDeviceContext, double>;
-template struct ColwiseSum<platform::CPUDeviceContext, int>;
-template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
-
-template struct RowwiseSum<platform::CPUDeviceContext, float>;
-template struct RowwiseSum<platform::CPUDeviceContext, double>;
-
-template struct RowwiseMean<platform::CPUDeviceContext, float>;
-template struct RowwiseMean<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
deleted file mode 100644
index fdbd77a5c8ff4a588d15104b277fa5714ae51753..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/math_function.cu
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/math_function_impl.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using float16 = paddle::platform::float16;
-
-template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
-template struct SetConstant<platform::CUDADeviceContext, float>;
-template struct SetConstant<platform::CUDADeviceContext, double>;
-template struct SetConstant<platform::CUDADeviceContext, int>;
-template struct SetConstant<platform::CUDADeviceContext, int64_t>;
-template struct SetConstant<platform::CUDADeviceContext, bool>;
-
-#define DEFINE_GPU_TRANS(RANK)                                           \
-  template struct Transpose<platform::CUDADeviceContext, float, RANK>;   \
-  template struct Transpose<platform::CUDADeviceContext, double, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>;
-
-DEFINE_GPU_TRANS(1);
-DEFINE_GPU_TRANS(2);
-DEFINE_GPU_TRANS(3);
-DEFINE_GPU_TRANS(4);
-DEFINE_GPU_TRANS(5);
-DEFINE_GPU_TRANS(6);
-
-struct TensorSetConstantGPU {
-  TensorSetConstantGPU(const platform::DeviceContext& context,
-                       framework::Tensor* tensor, float value)
-      : context_(context), tensor_(tensor), value_(value) {}
-
-  template <typename T>
-  void apply() const {
-    SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(reinterpret_cast<const platform::CUDADeviceContext&>(context_),
-            tensor_, static_cast<T>(value_));
-  }
-
-  const platform::DeviceContext& context_;
-  framework::Tensor* tensor_;
-  float value_;
-};
-
-template <>
-void set_constant_with_place<platform::CUDAPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  framework::VisitDataType(tensor->type(),
-                           TensorSetConstantGPU(context, tensor, value));
-}
-
-template <typename T>
-__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
-                                 int num) {
-  T tmp = 1.0 / width;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    int h = i * tmp;
-    int w = i - h * width;
-    c[i] = a[i] + b[w];
-  }
-}
-
-template <typename T>
-struct RowwiseAdd<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& vector, framework::Tensor* output) {
-    auto in_dims = input.dims();
-    auto size = input.numel() / in_dims[0];
-    PADDLE_ENFORCE_EQ(vector.numel(), size);
-    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
-    int blocks = 512;
-    int grids = (input.numel() + blocks - 1) / blocks;
-    RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
-        input.data<T>(), vector.data<T>(), output->data<T>(),
-        static_cast<int>(in_dims[1]), static_cast<int>(input.numel()));
-  }
-};
-
-template struct RowwiseAdd<platform::CUDADeviceContext, float>;
-template struct RowwiseAdd<platform::CUDADeviceContext, double>;
-template struct ColwiseSum<platform::CUDADeviceContext, float>;
-template struct ColwiseSum<platform::CUDADeviceContext, int>;
-template struct ColwiseSum<platform::CUDADeviceContext, int64_t>;
-// template struct ColwiseSum<platform::CUDADeviceContext, double>;
-// The ColwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
-// and only failed for this case. So reimplemented it.
-template <>
-void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
-    const platform::CUDADeviceContext& context, const framework::Tensor& input,
-    framework::Tensor* vector) {
-  auto in_dims = input.dims();
-  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector->numel(), size);
-  framework::Tensor one;
-  one.mutable_data<double>({in_dims[0]}, context.GetPlace());
-  SetConstant<platform::CUDADeviceContext, double> set;
-  set(context, &one, static_cast<double>(1.0));
-  GetBlas<platform::CUDADeviceContext, double>(context).GEMV(
-      true, static_cast<int>(in_dims[0]), static_cast<int>(in_dims[1]), 1.0,
-      input.data<double>(), one.data<double>(), 0.0, vector->data<double>());
-}
-
-template struct RowwiseSum<platform::CUDADeviceContext, float>;
-// template struct RowwiseSum<platform::CUDADeviceContext, double>;
-// TODO(zcd): Following ColwiseSum format, need to confirm.
-// The RowwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
-// and only failed for this case. So reimplemented it.
-template <>
-void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
-    const platform::CUDADeviceContext& context, const framework::Tensor& input,
-    framework::Tensor* vector) {
-  auto in_dims = input.dims();
-  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]);
-  framework::Tensor one;
-  one.mutable_data<double>({size}, context.GetPlace());
-  SetConstant<platform::CUDADeviceContext, double> set;
-  set(context, &one, static_cast<double>(1.0));
-  GetBlas<platform::CUDADeviceContext, double>(context).GEMV(
-      true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]), 1.0,
-      one.data<double>(), input.data<double>(), 0.0, vector->data<double>());
-}
-
-template struct RowwiseMean<platform::CUDADeviceContext, float>;
-template struct RowwiseMean<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
deleted file mode 100644
index b4f19417b6eabf24805c5c8128c2a6d423ddac69..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/math_function.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cmath>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-template <typename DeviceContext, typename T, int Rank>
-struct Transpose {
-  void operator()(const DeviceContext& context, const framework::Tensor& in,
-                  framework::Tensor* out, const std::vector<int>& axis);
-};
-
-template <typename DeviceContext, typename T>
-struct SetConstant {
-  void operator()(const DeviceContext& context, framework::Tensor* tensor,
-                  T num);
-};
-
-template <typename Place>
-void set_constant_with_place(const platform::DeviceContext& context,
-                             framework::Tensor* tensor, float value);
-
-void set_constant(const platform::DeviceContext& context,
-                  framework::Tensor* tensor, float value);
-
-template <typename DeviceContext, typename T>
-struct RowwiseAdd {
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& vec, framework::Tensor* output);
-};
-
-template <typename DeviceContext, typename T>
-struct ColwiseSum {
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  framework::Tensor* vec);
-};
-
-template <typename DeviceContext, typename T>
-struct RowwiseSum {
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  framework::Tensor* vec);
-};
-
-template <typename DeviceContext, typename T>
-struct RowwiseMean {
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  framework::Tensor* vec);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h
deleted file mode 100644
index d1127ce4a246136cdd1385ef09d905efe63178d8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/math_function_impl.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename DeviceContext, typename T>
-void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
-                                               framework::Tensor* tensor,
-                                               T num) {
-  auto t = framework::EigenVector<T>::Flatten(*tensor);
-  t.device(*context.eigen_device()) = t.constant(static_cast<T>(num));
-}
-
-template <typename DeviceContext, typename T, int Rank>
-void Transpose<DeviceContext, T, Rank>::operator()(
-    const DeviceContext& context, const framework::Tensor& in,
-    framework::Tensor* out, const std::vector<int>& axis) {
-  Eigen::array<int, Rank> permute;
-  for (int i = 0; i < Rank; i++) {
-    permute[i] = axis[i];
-  }
-  auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
-  auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
-  auto* dev = context.eigen_device();
-  eigen_out.device(*dev) = eigen_in.shuffle(permute);
-}
-
-template <typename DeviceContext, typename T>
-void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
-                                              const framework::Tensor& input,
-                                              framework::Tensor* out) {
-  auto in_dims = input.dims();
-  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(out->numel(), size);
-
-  auto in = framework::EigenMatrix<T>::From(input);
-  auto vec = framework::EigenVector<T>::Flatten(*out);
-
-  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
-}
-
-// Specialize for CPU, since Eigen implement a general reduce. However,
-// colwise-sum can be easily implemented. General reduce has a huge overhead in
-// CPU
-template <typename T>
-class ColwiseSum<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    auto& in_dims = input.dims();
-    auto height = in_dims[0];
-    auto size = in_dims[1];
-    PADDLE_ENFORCE_EQ(out->numel(), size);
-
-    T* out_buf = out->mutable_data<T>(out->place());
-    const T* in_buf = input.data<T>();
-
-    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
-      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
-        if (i == 0) {
-          out_buf[j] = in_buf[i * size + j];
-        } else {
-          out_buf[j] += in_buf[i * size + j];
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
-                                               const framework::Tensor& input,
-                                               framework::Tensor* out) {
-  auto in_dims = input.dims();
-  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
-  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
-
-  auto in = framework::EigenMatrix<T>::From(input);
-  auto vec = framework::EigenVector<T>::Flatten(*out);
-
-  vec.device(*context.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
-}
-// TODO(zcd): Following ColwiseSum format, need to confirm.
-// Specialize for CPU, since Eigen implement a general reduce. However,
-// rowwise-sum can be easily implemented. General reduce has a huge overhead in
-// CPU
-template <typename T>
-class RowwiseMean<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    auto& in_dims = input.dims();
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
-    auto height = in_dims[0];
-    auto size = in_dims[1];
-    PADDLE_ENFORCE_EQ(out->numel(), height);
-    auto inv_size = 1.0 / size;
-    T* out_buf = out->mutable_data<T>(out->place());
-    const T* in_buf = input.data<T>();
-
-    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
-      T sum = 0;
-      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
-        sum += in_buf[i * size + j];
-      }
-      out_buf[i] = sum * inv_size;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
-                                              const framework::Tensor& input,
-                                              framework::Tensor* out) {
-  auto in_dims = input.dims();
-  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
-  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
-
-  auto in = framework::EigenMatrix<T>::From(input);
-  auto vec = framework::EigenVector<T>::Flatten(*out);
-
-  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
-}
-// TODO(zcd): Following ColwiseSum format, need to confirm.
-// Specialize for CPU, since Eigen implement a general reduce. However,
-// rowwise-sum can be easily implemented. General reduce has a huge overhead in
-// CPU
-template <typename T>
-class RowwiseSum<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    auto& in_dims = input.dims();
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
-    auto height = in_dims[0];
-    auto size = in_dims[1];
-    PADDLE_ENFORCE_EQ(out->numel(), height);
-
-    T* out_buf = out->mutable_data<T>(out->place());
-    const T* in_buf = input.data<T>();
-
-    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
-      T sum = 0;
-      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
-        sum += in_buf[i * size + j];
-      }
-      out_buf[i] = sum;
-    }
-  }
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
deleted file mode 100644
index 2343e0ee965303c9fdb2ad3faf9ddf6e5bb7782f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/math/math_function.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-template <typename T>
-inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
-GetBlas(const paddle::platform::CPUDeviceContext& context) {
-  return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
-                                          T>(context);
-}
-
-TEST(math_function, gemm_notrans_cblas) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
-  float arr1[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
-  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
-  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<float>(context).GEMM(false, false, m, n, k, 1, input1_ptr, 3,
-                               input2_ptr + 1, 4, 1, input3_ptr + 1, 4);
-
-  EXPECT_EQ(input3_ptr[0], 0);
-  EXPECT_EQ(input3_ptr[1], 24);
-  EXPECT_EQ(input3_ptr[2], 28);
-  EXPECT_EQ(input3_ptr[3], 32);
-  EXPECT_EQ(input3_ptr[4], 4);
-  EXPECT_EQ(input3_ptr[5], 73);
-  EXPECT_EQ(input3_ptr[6], 86);
-  EXPECT_EQ(input3_ptr[7], 99);
-}
-#ifdef PADDLE_WITH_LIBXSMM
-template <typename T>
-void MklSmmCompare(int m, int n, int k) {
-  paddle::framework::Tensor mat_a;
-  paddle::framework::Tensor mat_b;
-  paddle::framework::Tensor mat_c_smm;
-  paddle::framework::Tensor mat_c_mkl;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-
-  T* A = mat_a.mutable_data<T>({m, k}, *cpu_place);
-  T* B = mat_b.mutable_data<T>({k, n}, *cpu_place);
-  T* CSMM = mat_c_smm.mutable_data<T>({m, n}, *cpu_place);
-  T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, *cpu_place);
-  T alpha = static_cast<T>(1);
-  T beta = static_cast<T>(0);
-  for (int i = 0; i < mat_a.numel(); ++i) {
-    A[i] = static_cast<T>(i);
-  }
-  for (int i = 0; i < mat_b.numel(); ++i) {
-    B[i] = static_cast<T>(i);
-  }
-  // lda,ldb,ldc follow RowMajor
-  int lda = k;
-  int ldb = n;
-  int ldc = n;
-
-  auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() {
-    const char transa = 'N';
-    const char transb = 'N';
-    paddle::operators::math::CBlas<T>::SMM_GEMM(&transa, &transb, &n, &m, &k,
-                                                &alpha, B, &ldb, A, &lda, &beta,
-                                                CSMM, &ldc);
-  };
-
-  auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() {
-    paddle::operators::math::CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans,
-                                            CblasNoTrans, m, n, k, alpha, A,
-                                            lda, B, ldb, beta, CMKL, ldc);
-  };
-
-  smm();
-  mkl();
-  ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel());
-  for (int i = 0; i < mat_c_mkl.numel(); ++i) {
-    EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]);
-  }
-}
-TEST(math_function, gemm_mkl_vs_smm) {
-  MklSmmCompare<float>(1, 2, 3);
-  MklSmmCompare<double>(1, 2, 3);
-  MklSmmCompare<float>(3, 2, 1);
-  MklSmmCompare<double>(3, 2, 1);
-  MklSmmCompare<float>(3, 8, 5);
-  MklSmmCompare<double>(3, 8, 5);
-}
-#endif
-
-TEST(math_function, gemm_trans_cblas) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
-  float arr1[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
-  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
-  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3,
-                               input2_ptr + 3, 3, 1, input3_ptr + 1, 4);
-  delete cpu_place;
-  cpu_place = NULL;
-
-  EXPECT_EQ(input3_ptr[0], 0);
-  EXPECT_EQ(input3_ptr[1], 24);
-  EXPECT_EQ(input3_ptr[2], 28);
-  EXPECT_EQ(input3_ptr[3], 32);
-  EXPECT_EQ(input3_ptr[4], 4);
-  EXPECT_EQ(input3_ptr[5], 73);
-  EXPECT_EQ(input3_ptr[6], 86);
-  EXPECT_EQ(input3_ptr[7], 99);
-}
-
-TEST(math_function, zero) {
-  paddle::framework::Tensor tensor;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* t = tensor.mutable_data<float>({2, 2}, *cpu_place);
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      functor;
-  functor(context, &tensor, 0);
-  EXPECT_EQ(t[0], 0);
-  EXPECT_EQ(t[1], 0);
-  EXPECT_EQ(t[2], 0);
-  EXPECT_EQ(t[3], 0);
-
-  functor(context, &tensor, 1);
-
-  EXPECT_EQ(t[0], 1);
-  EXPECT_EQ(t[1], 1);
-  EXPECT_EQ(t[2], 1);
-  EXPECT_EQ(t[3], 1);
-}
-
-template <typename T>
-void GemvTest(int m, int n, bool trans) {
-  paddle::framework::Tensor mat_a;
-  paddle::framework::Tensor vec_b;
-  paddle::framework::Tensor vec_c;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  int b_num = trans ? m : n;
-  int c_num = trans ? n : m;
-
-  T* data_a = mat_a.mutable_data<T>({m, n}, *cpu_place);
-  T* data_b = vec_b.mutable_data<T>({b_num}, *cpu_place);
-  T* data_c = vec_c.mutable_data<T>({c_num}, *cpu_place);
-  for (int i = 0; i < mat_a.numel(); ++i) {
-    data_a[i] = static_cast<T>(i);
-  }
-  for (int i = 0; i < vec_b.numel(); ++i) {
-    data_b[i] = static_cast<T>(i);
-  }
-
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<T>(context).GEMV(trans, static_cast<int>(m), static_cast<int>(n), 1.,
-                           data_a, data_b, 0., data_c);
-
-  if (!trans) {
-    for (int i = 0; i < m; ++i) {
-      T sum = 0.0;
-      for (int j = 0; j < n; ++j) {
-        sum += data_a[i * n + j] * data_b[j];
-      }
-      ASSERT_FLOAT_EQ(data_c[i], sum);
-    }
-  } else {
-    for (int i = 0; i < n; ++i) {
-      T sum = 0.0;
-      for (int j = 0; j < m; ++j) {
-        sum += data_a[j * n + i] * data_b[j];
-      }
-      ASSERT_FLOAT_EQ(data_c[i], sum);
-    }
-  }
-}
-
-TEST(math_function, gemv) {
-  GemvTest<float>(3, 13, false);
-  GemvTest<double>(4, 5, false);
-  GemvTest<float>(12, 7, true);
-  GemvTest<double>(7, 9, true);
-}
-
-TEST(math_funciton, set_constant) {
-  paddle::framework::Tensor t;
-  t.Resize({10, 10});
-  t.mutable_data<int>(paddle::platform::CPUPlace());
-  auto* ctx = new paddle::platform::CPUDeviceContext();
-  paddle::operators::math::set_constant(*ctx, &t, 10);
-  for (int64_t i = 0; i < t.numel(); ++i) {
-    PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
-  }
-  delete ctx;
-}
-
-template <typename T>
-void GemmWarpTest(int m, int n, int k, T alpha, T beta) {
-  paddle::framework::Tensor mat_a;
-  paddle::framework::Tensor mat_b;
-  paddle::framework::Tensor mat_c_ref;
-  paddle::framework::Tensor mat_c_mkl;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-
-  T* A = mat_a.mutable_data<T>({m, k}, *cpu_place);
-  T* B = mat_b.mutable_data<T>({k, n}, *cpu_place);
-  T* CREF = mat_c_ref.mutable_data<T>({m, n}, *cpu_place);
-  T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, *cpu_place);
-
-  ASSERT_EQ(mat_c_mkl.numel(), mat_c_ref.numel());
-  for (int i = 0; i < mat_a.numel(); ++i) {
-    A[i] = static_cast<T>(i);
-  }
-  for (int i = 0; i < mat_b.numel(); ++i) {
-    B[i] = static_cast<T>(i + 1);
-  }
-  for (int i = 0; i < mat_c_ref.numel(); ++i) {
-    CREF[i] = static_cast<T>(i + 2);
-    CMKL[i] = CREF[i];
-  }
-
-  // this would call gemm_warp
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<T>(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B,
-                           beta, CREF);
-
-  // lda,ldb,ldc follow RowMajor
-  int lda = k;
-  int ldb = n;
-  int ldc = n;
-  paddle::operators::math::CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans,
-                                          CblasNoTrans, m, n, k, alpha, A, lda,
-                                          B, ldb, beta, CMKL, ldc);
-
-  for (int i = 0; i < mat_c_mkl.numel(); ++i) {
-    EXPECT_FLOAT_EQ(CREF[i], CMKL[i]);
-  }
-}
-
-TEST(math_function, gemm_warp) {
-  GemmWarpTest<float>(3, 2, 5, 1.f, 0.f);
-  GemmWarpTest<float>(3, 2, 5, 2.f, 1.f);
-  GemmWarpTest<float>(8, 5, 6, 1.f, 0.f);
-  GemmWarpTest<float>(8, 5, 6, 2.f, 1.f);
-  GemmWarpTest<double>(3, 2, 5, 1.0, 0.0);
-  GemmWarpTest<double>(3, 2, 5, 2.0, 1.0);
-  GemmWarpTest<double>(8, 5, 6, 1.0, 0.0);
-  GemmWarpTest<double>(8, 5, 6, 2.0, 1.0);
-}
diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
deleted file mode 100644
index bcbb4a8274f149240b9f0990f38d9f38bdd0e5b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ /dev/null
@@ -1,467 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "gtest/gtest.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
-
-void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
-                    const std::vector<float>& data) {
-  PADDLE_ENFORCE_EQ(size, data.size());
-  for (size_t i = 0; i < data.size(); ++i) {
-    in_ptr[i] = paddle::platform::float16(data[i]);
-  }
-}
-
-template <typename T>
-inline paddle::operators::math::BlasT<paddle::platform::CUDADeviceContext, T>
-GetBlas(const paddle::platform::CUDADeviceContext& context) {
-  return paddle::operators::math::GetBlas<paddle::platform::CUDADeviceContext,
-                                          T>(context);
-}
-
-TEST(math_function, notrans_mul_trans_fp32) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor out_gpu;
-  paddle::framework::Tensor out;
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
-  float arr[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr, 6 * sizeof(float));
-
-  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
-  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
-
-  out_gpu.mutable_data<float>({2, 2}, gpu_place);
-  GetBlas<float>(context).MatMul(input1_gpu, false, input2_gpu, true, 1,
-                                 &out_gpu, 0);
-
-  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
-
-  float* out_ptr = out.data<float>();
-  context.Wait();
-  EXPECT_EQ(out_ptr[0], 5);
-  EXPECT_EQ(out_ptr[1], 14);
-  EXPECT_EQ(out_ptr[2], 14);
-  EXPECT_EQ(out_ptr[3], 50);
-}
-
-TEST(math_function, notrans_mul_trans_fp16) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor out_gpu;
-  paddle::framework::Tensor out;
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
-    return;
-  }
-
-  paddle::platform::float16* input1_ptr =
-      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
-  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-
-  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
-  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
-
-  out_gpu.mutable_data<paddle::platform::float16>({2, 2}, gpu_place);
-
-  GetBlas<paddle::platform::float16>(context).MatMul(
-      input1_gpu, false, input2_gpu, true, paddle::platform::float16(1),
-      &out_gpu, paddle::platform::float16(0));
-
-  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
-
-  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
-  context.Wait();
-  EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
-  EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
-  EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);
-  EXPECT_EQ(static_cast<float>(out_ptr[3]), 50);
-}
-
-TEST(math_function, trans_mul_notrans_fp32) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor out_gpu;
-  paddle::framework::Tensor out;
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
-  float arr[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr, 6 * sizeof(float));
-
-  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
-  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
-
-  out_gpu.mutable_data<float>({3, 3}, gpu_place);
-
-  GetBlas<float>(context).MatMul(input1_gpu, true, input2_gpu, false, 1,
-                                 &out_gpu, 0);
-
-  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
-
-  float* out_ptr = out.data<float>();
-  context.Wait();
-  EXPECT_EQ(out_ptr[0], 9);
-  EXPECT_EQ(out_ptr[1], 12);
-  EXPECT_EQ(out_ptr[2], 15);
-  EXPECT_EQ(out_ptr[3], 12);
-  EXPECT_EQ(out_ptr[4], 17);
-  EXPECT_EQ(out_ptr[5], 22);
-  EXPECT_EQ(out_ptr[6], 15);
-  EXPECT_EQ(out_ptr[7], 22);
-  EXPECT_EQ(out_ptr[8], 29);
-}
-
-TEST(math_function, trans_mul_notrans_fp16) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor out_gpu;
-  paddle::framework::Tensor out;
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
-    return;
-  }
-
-  paddle::platform::float16* input1_ptr =
-      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
-  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-
-  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
-  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
-
-  out_gpu.mutable_data<paddle::platform::float16>({3, 3}, gpu_place);
-
-  GetBlas<paddle::platform::float16>(context).MatMul(
-      input1_gpu, true, input2_gpu, false, paddle::platform::float16(1),
-      &out_gpu, paddle::platform::float16(0));
-
-  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
-
-  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
-  context.Wait();
-  EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
-  EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
-  EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);
-  EXPECT_EQ(static_cast<float>(out_ptr[3]), 12);
-  EXPECT_EQ(static_cast<float>(out_ptr[4]), 17);
-  EXPECT_EQ(static_cast<float>(out_ptr[5]), 22);
-  EXPECT_EQ(static_cast<float>(out_ptr[6]), 15);
-  EXPECT_EQ(static_cast<float>(out_ptr[7]), 22);
-  EXPECT_EQ(static_cast<float>(out_ptr[8]), 29);
-}
-
-TEST(math_function, gemm_notrans_cublas_fp32) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor input3_gpu;
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
-  float arr1[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({3, 4}, cpu_place);
-  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
-  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-
-  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
-  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
-  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
-  float* a = input1_gpu.data<float>();
-  float* b = input2_gpu.data<float>();
-  float* c = input3_gpu.mutable_data<float>(gpu_place);
-
-  GetBlas<float>(context).GEMM(false, false, m, n, k, 1, a, 3, b + 1, 4, 1,
-                               c + 1, 4);
-
-  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
-
-  // numpy code:
-  // a = np.arange(6).reshape(2, 3)
-  // b = np.arange(12).reshape(3, 4)[:, 1:]
-  // c = np.arange(8).reshape(2, 4)[:, 1:]
-  // out = np.arange(8).reshape(2, 4)
-  // out[:, 1:] = np.dot(a, b) + c
-  context.Wait();
-  EXPECT_EQ(input3_ptr[0], 0);
-  EXPECT_EQ(input3_ptr[1], 24);
-  EXPECT_EQ(input3_ptr[2], 28);
-  EXPECT_EQ(input3_ptr[3], 32);
-  EXPECT_EQ(input3_ptr[4], 4);
-  EXPECT_EQ(input3_ptr[5], 73);
-  EXPECT_EQ(input3_ptr[6], 86);
-  EXPECT_EQ(input3_ptr[7], 99);
-}
-
-TEST(math_function, gemm_notrans_cublas_fp16) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor input3_gpu;
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
-    return;
-  }
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  paddle::platform::float16* input1_ptr =
-      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
-  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  paddle::platform::float16* input2_ptr =
-      input2.mutable_data<paddle::platform::float16>({3, 4}, cpu_place);
-  fill_fp16_data(input2_ptr, input2.numel(),
-                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
-  paddle::platform::float16* input3_ptr =
-      input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
-  fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
-
-  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
-  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
-  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
-  paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
-  paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
-  paddle::platform::float16* c =
-      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
-
-  GetBlas<paddle::platform::float16>(context).GEMM(
-      false, false, m, n, k, static_cast<paddle::platform::float16>(1), a, 3,
-      b + 1, 4, static_cast<paddle::platform::float16>(1), c + 1, 4);
-
-  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
-
-  // numpy code:
-  // a = np.arange(6).reshape(2, 3)
-  // b = np.arange(12).reshape(3, 4)[:, 1:]
-  // c = np.arange(8).reshape(2, 4)[:, 1:]
-  // out = np.arange(8).reshape(2, 4)
-  // out[:, 1:] = np.dot(a, b) + c
-  context.Wait();
-  EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
-  EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
-  EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
-  EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
-  EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
-  EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
-  EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
-  EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
-}
-
-TEST(math_function, gemm_trans_cublas_fp32) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor input3_gpu;
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
-  float arr1[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({4, 3}, cpu_place);
-  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
-  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-
-  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
-  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
-  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
-  float* a = input1_gpu.data<float>();
-  float* b = input2_gpu.data<float>();
-  float* c = input3_gpu.mutable_data<float>(gpu_place);
-
-  GetBlas<float>(context).GEMM(false, true, m, n, k, 1, a, 3, b + 3, 3, 1,
-                               c + 1, 4);
-
-  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
-
-  context.Wait();
-  EXPECT_EQ(input3_ptr[0], 0);
-  EXPECT_EQ(input3_ptr[1], 24);
-  EXPECT_EQ(input3_ptr[2], 28);
-  EXPECT_EQ(input3_ptr[3], 32);
-  EXPECT_EQ(input3_ptr[4], 4);
-  EXPECT_EQ(input3_ptr[5], 73);
-  EXPECT_EQ(input3_ptr[6], 86);
-  EXPECT_EQ(input3_ptr[7], 99);
-}
-
-TEST(math_function, gemm_trans_cublas_fp16) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor input3_gpu;
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  // fp16 GEMM in cublas requires GPU compute capability >= 53
-  if (context.GetComputeCapability() < 53) {
-    return;
-  }
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  paddle::platform::float16* input1_ptr =
-      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
-  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  paddle::platform::float16* input2_ptr =
-      input2.mutable_data<paddle::platform::float16>({4, 3}, cpu_place);
-  fill_fp16_data(input2_ptr, input2.numel(),
-                 {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
-  paddle::platform::float16* input3_ptr =
-      input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
-  fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
-
-  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
-  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
-  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
-  paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
-  paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
-  paddle::platform::float16* c =
-      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
-
-  GetBlas<paddle::platform::float16>(context).GEMM(
-      false, true, m, n, k, static_cast<paddle::platform::float16>(1), a, 3,
-      b + 3, 3, static_cast<paddle::platform::float16>(1), c + 1, 4);
-
-  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
-
-  context.Wait();
-  EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
-  EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
-  EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
-  EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
-  EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
-  EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
-  EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
-  EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
-}
-
-template <typename T>
-void GemvTest(int m, int n, bool trans) {
-  paddle::framework::Tensor mat_a;
-  paddle::framework::Tensor vec_b;
-  paddle::framework::Tensor vec_c;
-
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
-
-  T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
-  T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
-  T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
-
-  paddle::framework::Tensor g_mat_a;
-  paddle::framework::Tensor g_vec_b;
-  paddle::framework::Tensor g_vec_c;
-  T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
-  T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
-  T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
-
-  for (int i = 0; i < mat_a.numel(); ++i) {
-    data_a[i] = static_cast<T>(i);
-  }
-  for (int i = 0; i < vec_b.numel(); ++i) {
-    data_b[i] = static_cast<T>(i);
-  }
-
-  paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
-  paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
-
-  GetBlas<T>(context).GEMV(trans, static_cast<int>(m), static_cast<int>(n), 1.,
-                           g_data_a, g_data_b, 0., g_data_c);
-
-  paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
-
-  if (!trans) {
-    for (int i = 0; i < m; ++i) {
-      T sum = 0.0;
-      for (int j = 0; j < n; ++j) {
-        sum += data_a[i * n + j] * data_b[j];
-      }
-      ASSERT_FLOAT_EQ(data_c[i], sum);
-    }
-  } else {
-    for (int i = 0; i < n; ++i) {
-      T sum = 0.0;
-      for (int j = 0; j < m; ++j) {
-        sum += data_a[j * n + i] * data_b[j];
-      }
-      ASSERT_FLOAT_EQ(data_c[i], sum);
-    }
-  }
-}
-
-TEST(math_function, gemv) {
-  GemvTest<float>(3, 13, false);
-  GemvTest<double>(3, 13, false);
-  GemvTest<float>(3, 13, true);
-  GemvTest<double>(3, 13, true);
-}
diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
deleted file mode 100644
index d6f51c6e5c693becb14ff0bac0088bb9dc2b2f55..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ /dev/null
@@ -1,363 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/matrix_bit_code.h"
-#include <iostream>
-#include <map>
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-struct MatrixBitCodeFunctorAdd : public boost::static_visitor<void> {
-  const framework::Tensor &vec_;
-  framework::Tensor *tmat_;
-
-  MatrixBitCodeFunctorAdd(const framework::Tensor &vec, framework::Tensor *tmat)
-      : vec_(vec), tmat_(tmat) {}
-
-  template <typename CodeTable>
-  void operator()(const CodeTable &code_table) {
-    size_t batch_size = tmat_->dims()[0];
-    size_t width = tmat_->dims()[1];
-    auto *tmat_data = tmat_->data<T>();
-    auto *vec_data = vec_.data<T>();
-    for (size_t i = 0; i < batch_size; ++i) {
-      auto code = code_table.get_code(i);
-      int code_length = code.get_length();
-      for (int j = 0; j < code_length; ++j) {
-        size_t index = code.calc_index(j);
-        tmat_data[i * width + j] += vec_data[index];
-      }
-    }
-  }
-};
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::Add(const framework::Tensor &vec,
-                                  framework::Tensor *tmat) {
-  MatrixBitCodeFunctorAdd<T> func(vec, tmat);
-  code_table_.apply_visitor(func);
-}
-
-template <typename T>
-struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor<void> {
-  const framework::Tensor &tmat_;
-  framework::Tensor *vec_;
-  MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat,
-                              framework::Tensor *vec)
-      : tmat_(tmat), vec_(vec) {}
-
-  template <typename CodeTable>
-  void operator()(const CodeTable &table) {
-    size_t batch_size = tmat_.dims()[0];
-    size_t width = tmat_.dims()[1];
-    auto *vec_data = vec_->data<T>();
-    auto *tmat_data = tmat_.data<T>();
-    for (size_t i = 0; i < batch_size; ++i) {
-      auto code = table.get_code(i);
-      int code_length = code.get_length();
-      for (int j = 0; j < code_length; ++j) {
-        size_t index = code.calc_index(j);
-        vec_data[index] += tmat_data[i * width + j];
-      }
-    }
-  }
-};
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
-                                      framework::Tensor *vec) {
-  MatrixBitCodeFunctorAddGrad<T> func(tmat, vec);
-  code_table_.apply_visitor(func);
-}
-
-template <typename T>
-struct MatrixBitCodeFunctorSum : public boost::static_visitor<void> {
-  const framework::Tensor &tmat_;
-  framework::Tensor *sum_;
-  T scale_sum_;
-
-  MatrixBitCodeFunctorSum(const framework::Tensor &tmat, framework::Tensor *sum,
-                          T scale_sum)
-      : tmat_(tmat), sum_(sum), scale_sum_(scale_sum) {}
-
-  template <typename CodeTable>
-  void operator()(const CodeTable &code_table) {
-    size_t num_samples = tmat_.dims()[0];
-    size_t o_width = tmat_.dims()[1];
-    auto *tmat_data = tmat_.data<T>();
-    auto *sum_data = sum_->data<T>();
-    for (size_t i = 0; i < num_samples; ++i) {
-      T sm = static_cast<T>(0.0);
-      auto code = code_table.get_code(i);
-      int code_length = code.get_length();
-      for (int j = 0; j < code_length; ++j) {
-        if (code.calc_bit(j)) {
-          // calc_bit starts from right most bit, while data in tmat[i] is in
-          // the
-          // reverse order.
-          sm += tmat_data[i * o_width + j];
-        }
-      }
-      sum_data[i] = scale_sum_ * sm;
-    }
-  }
-};
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor &tmat,
-                                  framework::Tensor *sum, T scale_sum) {
-  MatrixBitCodeFunctorSum<T> func(tmat, sum, scale_sum);
-  code_table_.apply_visitor(func);
-}
-
-template <typename T>
-struct MatrixBitCodeFunctorMul : public boost::static_visitor<void> {
-  framework::Tensor *tmat_;
-  const framework::Tensor &weight_;
-  const framework::Tensor &input_;
-
-  MatrixBitCodeFunctorMul(framework::Tensor *tmat,
-                          const framework::Tensor &weight,
-                          const framework::Tensor &input)
-      : tmat_(tmat), weight_(weight), input_(input) {}
-
-  template <typename CodeTable>
-  void operator()(const CodeTable &code_table) {
-    auto blas =
-        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-    size_t num_samples = tmat_->dims()[0];
-    size_t tmat_width = tmat_->dims()[1];
-    size_t input_width = input_.dims()[1];
-    size_t weight_width = weight_.dims()[1];
-    auto tmat_value = tmat_->data<T>();
-    auto weight_value = weight_.data<T>();
-    auto input_value = input_.data<T>();
-    for (size_t i = 0; i < num_samples; ++i) {
-      auto code = code_table.get_code(i);
-      int code_length = code.get_length();
-      const T *input_row = input_value + input_width * i;
-      for (int j = 0; j < code_length; ++j) {
-        size_t index = code.calc_index(j);
-        const T *weight_row = weight_value + weight_width * index;
-        T sum = blas.DOT(input_width, weight_row, input_row);
-        tmat_value[i * tmat_width + j] += sum;
-      }
-    }
-  }
-};
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::Mul(framework::Tensor *tmat,
-                                  const framework::Tensor &weight,
-                                  const framework::Tensor &input) {
-  MatrixBitCodeFunctorMul<T> func(tmat, weight, input);
-  code_table_.apply_visitor(func);
-}
-
-template <typename T, size_t N>
-class ReservedVector : public std::vector<T> {
- public:
-  ReservedVector() { this->reserve(N); }
-};
-
-template <typename T>
-struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor<void> {
-  const framework::Tensor &tmat_;
-  framework::Tensor *weight_;
-  const framework::Tensor &input_;
-  MatrixBitCodeFunctorMulGradWeight(const framework::Tensor &tmat,
-                                    framework::Tensor *weight,
-                                    const framework::Tensor &input)
-      : tmat_(tmat), weight_(weight), input_(input) {}
-  template <typename CodeTable>
-  void operator()(const CodeTable &code_table) {
-    auto blas =
-        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-    size_t num_samples = tmat_.dims()[0];
-    size_t input_width = input_.dims()[1];
-    size_t tmat_width = tmat_.dims()[1];
-    size_t weight_width = weight_->dims()[1];
-    auto tmat_value = tmat_.data<T>();
-    auto weight_value = weight_->data<T>();
-    auto input_value = input_.data<T>();
-
-    std::map<int, ReservedVector<std::pair<T, const T *>, 8u>> ops;
-    for (size_t i = 0; i < num_samples; ++i) {
-      auto code = code_table.get_code(i);
-      int code_length = code.get_length();
-      const T *input_value_row = input_value + input_width * i;
-      const T *tmat_row = tmat_value + i * tmat_width;
-      for (int j = 0; j < code_length; ++j) {
-        ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row);
-      }
-    }
-    for (auto &op : ops) {
-      auto &op_in_row = op.second;
-      for (auto &pair : op_in_row) {
-        auto &scale = pair.first;
-        auto *input_row = pair.second;
-        T *weight_row = weight_value + op.first * weight_width;
-        blas.AXPY(input_width, scale, input_row, weight_row);
-      }
-    }
-  }
-};
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor &tmat,
-                                            framework::Tensor *weight,
-                                            const framework::Tensor &input) {
-  MatrixBitCodeFunctorMulGradWeight<T> func(tmat, weight, input);
-  code_table_.apply_visitor(func);
-}
-
-template <typename T>
-struct MatrixBitCodeFunctorMulGradWeightSR
-    : public boost::static_visitor<void> {
-  const framework::Tensor &tmat_;
-  framework::SelectedRows *weight_;
-  const framework::Tensor &input_;
-
-  MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat,
-                                      framework::SelectedRows *weight,
-                                      const framework::Tensor &input)
-      : tmat_(tmat), weight_(weight), input_(input) {}
-
-  template <typename CodeTable>
-  void operator()(const CodeTable &code_table) {
-    auto blas =
-        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-    size_t num_samples = tmat_.dims()[0];
-    size_t input_width = input_.dims()[1];
-    size_t tmat_width = tmat_.dims()[1];
-    size_t weight_width = weight_->value().dims()[1];
-    auto tmat_value = tmat_.data<T>();
-    auto weight_value = weight_->mutable_value()->data<T>();
-    auto input_value = input_.data<T>();
-
-    std::unordered_map<int, std::vector<std::pair<T, const T *>>> ops;
-    ops.reserve(weight_->rows().size());
-
-    for (size_t i = 0; i < num_samples; ++i) {
-      auto code = code_table.get_code(i);
-      int code_length = code.get_length();
-      const T *input_value_row = input_value + input_width * i;
-      const T *tmat_row = tmat_value + i * tmat_width;
-      for (int j = 0; j < code_length; ++j) {
-        ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row);
-      }
-    }
-
-    for (auto &row : weight_->rows()) {
-      auto &op_in_row = ops[row];
-      for (auto &pair : op_in_row) {
-        auto &scale = pair.first;
-        auto *input_row = pair.second;
-        blas.AXPY(input_width, scale, input_row, weight_value);
-      }
-      weight_value += weight_width;
-    }
-  }
-};
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor &tmat,
-                                            framework::SelectedRows *weight,
-                                            const framework::Tensor &input) {
-  MatrixBitCodeFunctorMulGradWeightSR<T> func(tmat, weight, input);
-  code_table_.apply_visitor(func);
-}
-
-template <typename T>
-struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor<void> {
-  const framework::Tensor &tmat_;
-  const framework::Tensor &weight_;
-  framework::Tensor *input_;
-
-  MatrixBitCodeFunctorMulGradError(const framework::Tensor &tmat,
-                                   const framework::Tensor &weight,
-                                   framework::Tensor *input)
-      : tmat_(tmat), weight_(weight), input_(input) {}
-  template <typename CodeTable>
-  void operator()(const CodeTable &code_table) {
-    size_t num_samples = tmat_.dims()[0];
-    size_t tmat_width = tmat_.dims()[1];
-    size_t input_width = input_->dims()[1];
-    size_t weight_width = weight_.dims()[1];
-    auto tmat_value = tmat_.data<T>();
-    auto weight_value = weight_.data<T>();
-    auto input_value = input_->data<T>();
-
-    for (size_t i = 0; i < num_samples; ++i) {
-      auto code = code_table.get_code(i);
-      int code_length = code.get_length();
-      for (int j = 0; j < code_length; ++j) {
-        size_t index = code.calc_index(j);
-
-        for (size_t k = 0; k < input_width; ++k) {
-          input_value[input_width * i + k] +=
-              tmat_value[i * tmat_width + j] *
-              weight_value[weight_width * index + k];
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor &tmat,
-                                           const framework::Tensor &weight,
-                                           framework::Tensor *input) {
-  MatrixBitCodeFunctorMulGradError<T> func(tmat, weight, input);
-  code_table_.apply_visitor(func);
-}
-
-template <typename T>
-struct MatrixBitCodeFunctorSub : public boost::static_visitor<void> {
-  framework::Tensor *tmat_;
-
-  explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {}
-
-  template <typename CodeTable>
-  void operator()(const CodeTable &code_table) {
-    size_t num_samples = tmat_->dims()[0];
-    size_t o_width = tmat_->dims()[1];
-    auto *tmat_data = tmat_->data<T>();
-    for (size_t i = 0; i < num_samples; ++i) {
-      auto code = code_table.get_code(i);
-      int code_length = code.get_length();
-      for (int j = 0; j < code_length; ++j) {
-        if (code.calc_bit(j)) {
-          tmat_data[i * o_width + j] -= 1;
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::Sub(framework::Tensor *tmat) {
-  MatrixBitCodeFunctorSub<T> func(tmat);
-  code_table_.apply_visitor(func);
-}
-
-template class MatrixBitCodeFunctor<float>;
-template class MatrixBitCodeFunctor<double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
deleted file mode 100644
index c399cb5d44aaa50fab00fd170c021c8c70eee990..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <map>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/variant.h"
-
-#if defined(_WIN32)
-#include <intrin.h>
-#include <windows.h>
-#endif  // _WIN32
-
-namespace paddle {
-namespace operators {
-namespace math {
-/**
- * SimpleCodeTable class should support 3 functions:
- *
- * size_t size()
- *   return the number of ids
- *
- * int get_max_code_length()
- *   return the maximal code length
- *
- * SimpleCode operator()(size_t i)
- *   return the i-th code. Code class is descriebed below.
- *
- * SimpleCode class should support 3 functions:
- *
- * int get_length()
- *   return the length of the code
- *
- * size_t cal_index(int bit)
- *   bit ranges from 0 to get_length() - 1
- *   return the index for the (1+bit) level parent
- *
- * bool calc_bit(int bit)
- *   return true if the bit level parent is the right child of (1+bit) level
- *   parent
- *
- */
-
-/**
- * return the 1-based index of the highest bit set
- *
- * for x > 0:
- * \f[
- *    FindLastSet(x) = 1 + \floor*{\log_{2}x}
- * \f]
- */
-#if !defined(_WIN32)
-inline constexpr size_t FindLastSet(size_t x) {
-  return std::is_same<size_t, unsigned int>::value
-             ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
-             : (std::is_same<size_t, unsigned long>::value  // NOLINT
-                    ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
-                    : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
-}
-#else
-// windows don't have built-in clz, ctz function
-template <typename T>
-inline int ctz(const T& value) {
-  DWORD trailing_zero = 0;
-  if (_BitScanForward(&trailing_zero, value)) {
-    return static_cast<int>(trailing_zero);
-  } else {
-    return static_cast<int>(0);
-  }
-}
-
-template <typename T>
-inline int clz(const T& value) {
-  DWORD leadning_zero = 0;
-  if (_BitScanReverse(&leadning_zero, value)) {
-    return static_cast<int>(sizeof(T) * 8 - leadning_zero);
-  } else {
-    return static_cast<int>(0);
-  }
-}
-
-inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
-#endif  // !_WIN32
-class SimpleCode {
- public:
-  SimpleCode(size_t code, size_t num_classes, const int64_t* ids)
-      : c_(static_cast<size_t>(ids[code]) + num_classes) {}
-  /**
-   * Here the id of root shoud be 1 rather than 0, thus the encoding of class c
-   * is `c + num_classes` and all siblings can get the same weight indice using
-   * prefixes.
-   * Weight index is the prefixes of encoding, thus leave out the right most
-   * bit in calc_index.
-   * Binary classification path is the suffixes of encoding, thus leave out the
-   * left most bit in calc_bit.
-   */
-  size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; }
-  bool calc_bit(int bit) const { return c_ & (1 << bit); }
-  int get_length() const { return FindLastSet(c_) - 1; }
-
- private:
-  size_t c_;
-};
-
-template <typename T>
-class CustomCode {
- public:
-  CustomCode(const framework::Tensor& path_table,
-             const framework::Tensor& path_code, const int64_t* ids,
-             int index) {
-    seq_len_ = path_table.dims()[1];
-    path_table_data_ = path_table.data<T>() + seq_len_ * index;
-    path_code_data_ = path_code.data<T>() + seq_len_ * index;
-  }
-  /**
-   * Here the id of root should be 1 rather than 0, thus the encoding of class c
-   * is `c + num_classes` and all siblings can get the same weight indice using
-   * prefixes.
-   * Weight index is the prefixes of encoding, thus leave out the right most
-   * bit in calc_index.
-   * Binary classification path is the suffixes of encoding, thus leave out the
-   * left most bit in calc_bit.
-   */
-  size_t calc_index(int bit) const { return path_table_data_[bit]; }
-  bool calc_bit(int bit) const { return path_code_data_[bit]; }
-
-  // NOTE: this function is not thread-safe.
-  int get_length() const {
-    if (length_ < 0) {
-      auto len = seq_len_;
-      length_ = static_cast<int>(
-          std::find_if(path_table_data_, path_table_data_ + len,
-                       [](const T& val) { return val < 0; }) -
-          path_table_data_);
-    }
-    return length_;
-  }
-
- private:
-  int64_t seq_len_;
-  const T* path_table_data_;
-  const T* path_code_data_;
-  mutable int length_{-1};
-};
-
-class SimpleCodeTable {
- public:
-  SimpleCodeTable(size_t num_classes, const int64_t* ids)
-      : num_classes_(num_classes), ids_(ids) {}
-
-  SimpleCode get_code(int64_t code) const {
-    return SimpleCode(code, num_classes_, ids_);
-  }
-
-  size_t size() const { return num_classes_; }
-  int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }
-
- private:
-  size_t num_classes_;
-  const int64_t* ids_;
-};
-
-template <typename T>
-class CustomCodeTable {
- public:
-  CustomCodeTable(const framework::Tensor& path_table,
-                  const framework::Tensor& path_code, const int64_t* ids)
-      : ptable_(path_table), pcode_(path_code), ids_(ids) {}
-
-  CustomCode<T> get_code(int64_t code) const {
-    return CustomCode<T>(ptable_, pcode_, ids_, code);
-  }
-
-  size_t size() const { return static_cast<size_t>(ptable_.dims()[1]); }
-  int get_max_code_length() const {
-    return static_cast<size_t>(ptable_.dims()[1]);
-  }
-
- private:
-  const framework::Tensor& ptable_;
-  const framework::Tensor& pcode_;
-  const int64_t* ids_;
-};
-
-using CodeTable = boost::variant<SimpleCodeTable, CustomCodeTable<int64_t>>;
-
-template <typename T>
-class MatrixBitCodeFunctor {
- public:
-  MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
-      : num_classes_(num_classes),
-        ids_(ids),
-        code_table_(SimpleCodeTable(num_classes, ids)) {}
-
-  MatrixBitCodeFunctor(const framework::Tensor& path_table,
-                       const framework::Tensor& path_code, const int64_t* ids)
-      : num_classes_(static_cast<size_t>(path_table.dims()[1])),
-        ids_(ids),
-        code_table_(CustomCodeTable<int64_t>(path_table, path_code, ids)) {}
-  /* For j < code_length
-       tmat(i, j) += vec(0, index(i, j))
-  */
-  void Add(const framework::Tensor& vec, framework::Tensor* tmat);
-
-  /* For j < code_length
-       vec(0, index(i, j)) += tmat(i, j)
-  */
-  void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec);
-
-  /* For j < code_length
-    sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
-  */
-  void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum);
-
-  /* For j < code_length
-       tmat(i, j) -= bit(i, j)
-  */
-  void Sub(framework::Tensor* tmat);
-  /* For j < code_length
-       input.row(i) += tmat(i, j) * weight.row(index(i, j))
-  */
-  void Mul(framework::Tensor* tmat, const framework::Tensor& weight,
-           const framework::Tensor& input);
-
-  /* For index(i, j) >= 0:
-      weight.row(index(i, j)) += tmat(i, j) * input.row(i)
-  */
-  void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight,
-                     const framework::Tensor& input);
-  /* For SelectedRows Weight, For index(i, j) >= 0:
-      weight.row(index(i, j)) += tmat(i, j) * input.row(i)
-  */
-  void MulGradWeight(const framework::Tensor& tmat,
-                     framework::SelectedRows* weight,
-                     const framework::Tensor& input);
-  /* For j < code_length
-    input.row(i) += tmat(i, j) * weight.row(index(i, j))
-  */
-  void MulGradError(const framework::Tensor& tmat,
-                    const framework::Tensor& weight, framework::Tensor* input);
-
-  size_t num_classes_;
-  const int64_t* ids_;
-  CodeTable code_table_;
-};
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc
deleted file mode 100644
index 730f71e96b63d16b2da0bd1412d7e07fe1861d0a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/maxouting.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/maxouting.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-// All tensors are in NCHW format, and the groups must be greater than 1
-template <typename T>
-class MaxOutFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* output,
-                  int groups) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    int fea_size = input_height * input_width;
-    // c_size means the output size of each sample
-    int c_size = fea_size * output_channels;
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-
-    for (int i = 0; i < batch_size; ++i) {
-      int new_bindex = c_size * i;
-      for (int c = 0; c < output_channels; ++c) {
-        int new_cindex = fea_size * c;
-        for (int f = 0; f < fea_size; ++f) {
-          T ele = static_cast<T>(-FLT_MAX);
-          for (int ph = 0; ph < groups; ++ph) {
-            T x = input_data[(new_bindex + new_cindex) * groups +
-                             ph * fea_size + f];
-            ele = ele > x ? ele : x;
-          }
-          output_data[(new_bindex + new_cindex + f)] = ele;
-        }
-      }
-    }
-  }
-};
-
-template <class T>
-class MaxOutGradFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* input_grad,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, int groups) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    int fea_size = input_height * input_width;
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    for (int i = 0; i < batch_size; ++i) {
-      int blen = fea_size * output_channels * i;
-      for (int c = 0; c < output_channels; ++c) {
-        int clen = fea_size * c;
-        for (int f = 0; f < fea_size; ++f) {
-          int input_idx0 = (blen + clen) * groups + f;
-          bool continue_match = true;
-          int output_idx = blen + clen + f;
-          for (int g = 0; g < groups && continue_match; ++g) {
-            int input_idx = input_idx0 + fea_size * g;
-            if (input_data[input_idx] == output_data[output_idx]) {
-              input_grad_data[input_idx] += output_grad_data[output_idx];
-              continue_match = false;
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template class MaxOutGradFunctor<platform::CPUDeviceContext, float>;
-template class MaxOutGradFunctor<platform::CPUDeviceContext, double>;
-template class MaxOutFunctor<platform::CPUDeviceContext, float>;
-template class MaxOutFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu
deleted file mode 100644
index d9a23299a4d5750fc8c7fe3e5d1f8cd94bcb9cae..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/maxouting.cu
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/maxouting.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-__global__ void KernelMaxOut(const int nthreads, const T* input_data,
-                             const int channels, const int input_height,
-                             const int input_width, int groups,
-                             T* output_data) {
-  const int size = input_height * input_width * channels / groups;
-  const int feat_len = input_height * input_width;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (int i = index; i < nthreads; i += offset) {
-    int batch_idx = i / size;
-    int batch_offset = i % size;
-    int channel_idx = batch_offset / feat_len;
-    int feat_idx = batch_offset % feat_len;
-    int data_idx =
-        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
-    T ele = static_cast<T>(-FLT_MAX);
-    for (int g = 0; g < groups; ++g) {
-      T x = input_data[data_idx + g * feat_len];
-      ele = ele > x ? ele : x;
-    }
-    output_data[i] = ele;
-  }
-}
-template <typename T>
-__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
-                                 const T* output_data, const T* output_grad,
-                                 T* input_grad, const int channels,
-                                 const int input_height, const int input_width,
-                                 int groups) {
-  const int size = input_height * input_width * channels / groups;
-  const int feat_len = input_height * input_width;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (int i = index; i < nthreads; i += offset) {
-    int batch_idx = i / size;
-    int batch_offset = i % size;
-    int channel_idx = batch_offset / feat_len;
-    int feat_idx = batch_offset % feat_len;
-    int data_idx =
-        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
-    int max_index = -1;
-    bool continue_match = true;
-    for (int g = 0; g < groups && continue_match; ++g) {
-      if (input_data[data_idx + g * feat_len] == output_data[i]) {
-        max_index = data_idx + g * feat_len;
-        continue_match = false;
-        break;
-      }
-    }
-    if (max_index != -1) {
-      input_grad[max_index] += output_grad[index];
-    }
-  }
-}
-/*
- * All tensors are in NCHW format.
- */
-template <typename T>
-class MaxOutFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* output,
-                  int groups) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int nthreads = output->numel();
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxOut<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_height, input_width, groups,
-        output_data);
-  }
-};
-/*
- * All tensors are in NCHW format.
- */
-template <typename T>
-class MaxOutGradFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* input_grad,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, int groups) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-    int nthreads = output.numel();
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxoutGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, groups);
-  }
-};
-
-template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
-
-template class MaxOutFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h
deleted file mode 100644
index e4d378dc23210e95605c6e09eda8a190cc5c6b4f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/maxouting.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/hostdevice.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename DeviceContext, typename T>
-class MaxOutFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  framework::Tensor* output, int groups);
-};
-
-template <typename DeviceContext, class T>
-class MaxOutGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  framework::Tensor* input_grad,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, int groups);
-};
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h
deleted file mode 100644
index 3ae25eae98b25bca015ec4383c7126eb81e52b8a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/padding.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T, size_t D>
-void PadFunction(const framework::ExecutionContext& context,
-                 const std::vector<int>& pads, const framework::Tensor& src,
-                 T pad_value, framework::Tensor* out) {
-  Eigen::array<std::pair<int, int>, D> paddings;
-
-  for (size_t i = 0; i < paddings.size(); ++i) {
-    paddings[i].first = pads[i * 2];
-    paddings[i].second = pads[i * 2 + 1];
-  }
-
-  auto src_tensor = EigenTensor<T, D>::From(src);
-  auto out_tensor = EigenTensor<T, D>::From(*out);
-
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
-  out_tensor.device(place) = src_tensor.pad(paddings, pad_value);
-}
-
-template <typename DeviceContext, typename T, size_t D>
-void PadGradFunction(const framework::ExecutionContext& context,
-                     const std::vector<int>& pads, const framework::Tensor& src,
-                     framework::Tensor* d_out) {
-  Eigen::array<std::pair<int, int>, D> paddings;
-  for (size_t i = 0; i < paddings.size(); ++i) {
-    paddings[i].first = -pads[i * 2];
-    paddings[i].second = -pads[i * 2 + 1];
-  }
-
-  auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
-  auto src_tensor = EigenTensor<T, D>::From(src);
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
-  d_out_tensor.device(place) = src_tensor.pad(paddings, 0);
-}
-
-template <typename DeviceContext, typename T>
-void PaddingFunctor(int rank, const framework::ExecutionContext& context,
-                    const std::vector<int>& pads, T pad_value,
-                    const framework::Tensor& src, framework::Tensor* out) {
-  switch (rank) {
-    case 1:
-      PadFunction<DeviceContext, T, 1>(context, pads, src, pad_value, out);
-      break;
-    case 2:
-      PadFunction<DeviceContext, T, 2>(context, pads, src, pad_value, out);
-      break;
-    case 3:
-      PadFunction<DeviceContext, T, 3>(context, pads, src, pad_value, out);
-      break;
-    case 4:
-      PadFunction<DeviceContext, T, 4>(context, pads, src, pad_value, out);
-      break;
-    case 5:
-      PadFunction<DeviceContext, T, 5>(context, pads, src, pad_value, out);
-      break;
-    case 6:
-      PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
-      break;
-    default:
-      PADDLE_THROW(
-          "PadOp only support tensors with no more than 6 dimensions.");
-  }
-}
-
-template <typename DeviceContext, typename T>
-void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
-                        const std::vector<int>& pads,
-                        const framework::Tensor& src, framework::Tensor* out) {
-  switch (rank) {
-    case 1:
-      PadGradFunction<DeviceContext, T, 1>(context, pads, src, out);
-      break;
-    case 2:
-      PadGradFunction<DeviceContext, T, 2>(context, pads, src, out);
-      break;
-    case 3:
-      PadGradFunction<DeviceContext, T, 3>(context, pads, src, out);
-      break;
-    case 4:
-      PadGradFunction<DeviceContext, T, 4>(context, pads, src, out);
-      break;
-    case 5:
-      PadGradFunction<DeviceContext, T, 5>(context, pads, src, out);
-      break;
-    case 6:
-      PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
-      break;
-    default:
-      PADDLE_THROW(
-          "PadOp only support tensors with no more than 6 dimensions.");
-  }
-}
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc
deleted file mode 100644
index 30873e9f87f22fa5b39cbf519760a9ec3979f98b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/pooling.cc
+++ /dev/null
@@ -1,860 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/math/pooling.h"
-#include <algorithm>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <typename PoolProcess, typename T>
-class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, bool adaptive, framework::Tensor* output) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const int input_stride = input_height * input_width;
-    const int output_stride = output_height * output_width;
-
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-
-    int hstart, hend;
-    int wstart, wend;
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int ph = 0; ph < output_height; ++ph) {
-          if (adaptive) {
-            hstart = AdaptStartIndex(ph, input_height, output_height);
-            hend = AdaptEndIndex(ph, input_height, output_height);
-          } else {
-            hstart = ph * stride_height - padding_height;
-            hend = std::min(hstart + ksize_height, input_height);
-            hstart = std::max(hstart, 0);
-          }
-          for (int pw = 0; pw < output_width; ++pw) {
-            if (adaptive) {
-              wstart = AdaptStartIndex(pw, input_width, output_width);
-              wend = AdaptEndIndex(pw, input_width, output_width);
-            } else {
-              wstart = pw * stride_width - padding_width;
-              wend = std::min(wstart + ksize_width, input_width);
-              wstart = std::max(wstart, 0);
-            }
-
-            T ele = pool_process.initial();
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                pool_process.compute(input_data[h * input_width + w], &ele);
-              }
-            }
-            int pool_size = (exclusive || adaptive)
-                                ? (hend - hstart) * (wend - wstart)
-                                : ksize_height * ksize_width;
-            pool_process.finalize(static_cast<T>(pool_size), &ele);
-            output_data[ph * output_width + pw] = ele;
-          }
-        }
-        input_data += input_stride;
-        output_data += output_stride;
-      }
-    }
-  }
-};
-
-/*
-* All tensors are in NCHW format.
-* Ksize, strides, paddings are two elements. These two elements represent height
-* and width, respectively.
-*/
-template <typename PoolProcess, class T>
-class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
- public:
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, PoolProcess pool_grad_process,
-      bool exclusive, bool adaptive, framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-    const int input_stride = input_height * input_width;
-    const int output_stride = output_height * output_width;
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    int hstart, hend;
-    int wstart, wend;
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int ph = 0; ph < output_height; ++ph) {
-          if (adaptive) {
-            hstart = AdaptStartIndex(ph, input_height, output_height);
-            hend = AdaptEndIndex(ph, input_height, output_height);
-          } else {
-            hstart = ph * stride_height - padding_height;
-            hend = std::min(hstart + ksize_height, input_height);
-            hstart = std::max(hstart, 0);
-          }
-          for (int pw = 0; pw < output_width; ++pw) {
-            if (adaptive) {
-              wstart = AdaptStartIndex(pw, input_width, output_width);
-              wend = AdaptEndIndex(pw, input_width, output_width);
-            } else {
-              wstart = pw * stride_width - padding_width;
-              wend = std::min(wstart + ksize_width, input_width);
-              wstart = std::max(wstart, 0);
-            }
-            int pool_size = (exclusive || adaptive)
-                                ? (hend - hstart) * (wend - wstart)
-                                : ksize_height * ksize_width;
-            float scale = 1.0 / pool_size;
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                pool_grad_process.compute(
-                    input_data[h * input_width + w],
-                    output_data[ph * output_width + pw],
-                    output_grad_data[ph * output_width + pw],
-                    static_cast<T>(scale),
-                    input_grad_data + h * input_width + w);
-              }
-            }
-          }
-        }
-        input_data += input_stride;
-        output_data += output_stride;
-        input_grad_data += input_stride;
-        output_grad_data += output_stride;
-      }
-    }
-  }
-};
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <class T>
-class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-    const int input_stride = input_height * input_width;
-    const int output_stride = output_height * output_width;
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int ph = 0; ph < output_height; ++ph) {
-          int hstart = ph * stride_height - padding_height;
-          int hend = std::min(hstart + ksize_height, input_height);
-          hstart = std::max(hstart, 0);
-          for (int pw = 0; pw < output_width; ++pw) {
-            int wstart = pw * stride_width - padding_width;
-            int wend = std::min(wstart + ksize_width, input_width);
-            wstart = std::max(wstart, 0);
-
-            bool stop = false;
-            for (int h = hstart; h < hend && !stop; ++h) {
-              for (int w = wstart; w < wend && !stop; ++w) {
-                int input_idx = h * input_width + w;
-                int output_idx = ph * output_width + pw;
-                if (input_data[input_idx] == output_data[output_idx]) {
-                  input_grad_data[input_idx] += output_grad_data[output_idx];
-                  stop = true;
-                }
-              }
-            }
-          }
-        }
-        input_data += input_stride;
-        output_data += output_stride;
-        input_grad_data += input_stride;
-        output_grad_data += output_stride;
-      }
-    }
-  }
-};
-
-template class MaxPool2dGradFunctor<platform::CPUDeviceContext, float>;
-template class MaxPool2dGradFunctor<platform::CPUDeviceContext, double>;
-
-template class Pool2dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::MaxPool<float>, float>;
-template class Pool2dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::AvgPool<float>, float>;
-template class Pool2dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
-                                 float>;
-template class Pool2dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<float>,
-                                 float>;
-template class Pool2dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::MaxPool<double>, double>;
-template class Pool2dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::AvgPool<double>, double>;
-template class Pool2dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
-                                 double>;
-template class Pool2dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<double>,
-                                 double>;
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <typename PoolProcess, class T>
-class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, bool adaptive, framework::Tensor* output) {
-    const int batch_size = input.dims()[0];
-    const int input_depth = input.dims()[2];
-    const int input_height = input.dims()[3];
-    const int input_width = input.dims()[4];
-    const int output_channels = output->dims()[1];
-    const int output_depth = output->dims()[2];
-    const int output_height = output->dims()[3];
-    const int output_width = output->dims()[4];
-    const int ksize_depth = ksize[0];
-    const int ksize_height = ksize[1];
-    const int ksize_width = ksize[2];
-    const int stride_depth = strides[0];
-    const int stride_height = strides[1];
-    const int stride_width = strides[2];
-    const int padding_depth = paddings[0];
-    const int padding_height = paddings[1];
-    const int padding_width = paddings[2];
-
-    const int input_stride = input_depth * input_height * input_width;
-    const int output_stride = output_depth * output_height * output_width;
-
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-
-    int dstart, dend;
-    int hstart, hend;
-    int wstart, wend;
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int pd = 0; pd < output_depth; ++pd) {
-          if (adaptive) {
-            dstart = AdaptStartIndex(pd, input_depth, output_depth);
-            dend = AdaptEndIndex(pd, input_depth, output_depth);
-          } else {
-            dstart = pd * stride_depth - padding_depth;
-            dend = std::min(dstart + ksize_depth, input_depth);
-            dstart = std::max(dstart, 0);
-          }
-          for (int ph = 0; ph < output_height; ++ph) {
-            if (adaptive) {
-              hstart = AdaptStartIndex(ph, input_height, output_height);
-              hend = AdaptEndIndex(ph, input_height, output_height);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              hend = std::min(hstart + ksize_height, input_height);
-              hstart = std::max(hstart, 0);
-            }
-            for (int pw = 0; pw < output_width; ++pw) {
-              if (adaptive) {
-                wstart = AdaptStartIndex(pw, input_width, output_width);
-                wend = AdaptEndIndex(pw, input_width, output_width);
-              } else {
-                wstart = pw * stride_width - padding_width;
-                wend = std::min(wstart + ksize_width, input_width);
-                wstart = std::max(wstart, 0);
-              }
-              int output_idx = (pd * output_height + ph) * output_width + pw;
-              T ele = pool_process.initial();
-              for (int d = dstart; d < dend; ++d) {
-                for (int h = hstart; h < hend; ++h) {
-                  for (int w = wstart; w < wend; ++w) {
-                    pool_process.compute(
-                        input_data[(d * input_height + h) * input_width + w],
-                        &ele);
-                  }
-                }
-              }
-              int pool_size =
-                  (exclusive || adaptive)
-                      ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                      : ksize_depth * ksize_height * ksize_width;
-              pool_process.finalize(static_cast<T>(pool_size), &ele);
-              output_data[output_idx] = ele;
-            }
-          }
-        }
-        input_data += input_stride;
-        output_data += output_stride;
-      }
-    }
-  }
-};
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <typename PoolProcess, class T>
-class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
- public:
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, PoolProcess pool_grad_process,
-      bool exclusive, bool adaptive, framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_depth = input.dims()[2];
-    const int input_height = input.dims()[3];
-    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
-    const int ksize_depth = ksize[0];
-    const int ksize_height = ksize[1];
-    const int ksize_width = ksize[2];
-    const int stride_depth = strides[0];
-    const int stride_height = strides[1];
-    const int stride_width = strides[2];
-    const int padding_depth = paddings[0];
-    const int padding_height = paddings[1];
-    const int padding_width = paddings[2];
-    const int input_stride = input_depth * input_height * input_width;
-    const int output_stride = output_depth * output_height * output_width;
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    int dstart, dend;
-    int hstart, hend;
-    int wstart, wend;
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int pd = 0; pd < output_depth; ++pd) {
-          if (adaptive) {
-            dstart = AdaptStartIndex(pd, input_depth, output_depth);
-            dend = AdaptEndIndex(pd, input_depth, output_depth);
-          } else {
-            dstart = pd * stride_depth - padding_depth;
-            dend = std::min(dstart + ksize_depth, input_depth);
-            dstart = std::max(dstart, 0);
-          }
-          for (int ph = 0; ph < output_height; ++ph) {
-            if (adaptive) {
-              hstart = AdaptStartIndex(ph, input_height, output_height);
-              hend = AdaptEndIndex(ph, input_height, output_height);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              hend = std::min(hstart + ksize_height, input_height);
-              hstart = std::max(hstart, 0);
-            }
-            for (int pw = 0; pw < output_width; ++pw) {
-              if (adaptive) {
-                wstart = AdaptStartIndex(pw, input_width, output_width);
-                wend = AdaptEndIndex(pw, input_width, output_width);
-              } else {
-                wstart = pw * stride_width - padding_width;
-                wend = std::min(wstart + ksize_width, input_width);
-                wstart = std::max(wstart, 0);
-              }
-
-              int pool_size =
-                  (exclusive || adaptive)
-                      ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                      : ksize_depth * ksize_height * ksize_width;
-              float scale = 1.0 / pool_size;
-              for (int d = dstart; d < dend; ++d) {
-                for (int h = hstart; h < hend; ++h) {
-                  for (int w = wstart; w < wend; ++w) {
-                    int input_idx = (d * input_height + h) * input_width + w;
-                    int output_idx =
-                        (pd * output_height + ph) * output_width + pw;
-                    pool_grad_process.compute(
-                        input_data[input_idx], output_data[output_idx],
-                        output_grad_data[output_idx], static_cast<T>(scale),
-                        input_grad_data + input_idx);
-                  }
-                }
-              }
-            }
-          }
-        }
-        input_data += input_stride;
-        output_data += output_stride;
-        input_grad_data += input_stride;
-        output_grad_data += output_stride;
-      }
-    }
-  }
-};
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <class T>
-class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_depth = input.dims()[2];
-    const int input_height = input.dims()[3];
-    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
-    const int ksize_depth = ksize[0];
-    const int ksize_height = ksize[1];
-    const int ksize_width = ksize[2];
-    const int stride_depth = strides[0];
-    const int stride_height = strides[1];
-    const int stride_width = strides[2];
-    const int padding_depth = paddings[0];
-    const int padding_height = paddings[1];
-    const int padding_width = paddings[2];
-    const int input_stride = input_depth * input_height * input_width;
-    const int output_stride = output_depth * output_height * output_width;
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int pd = 0; pd < output_depth; ++pd) {
-          int dstart = pd * stride_depth - padding_depth;
-          int dend = std::min(dstart + ksize_depth, input_depth);
-          dstart = std::max(dstart, 0);
-          for (int ph = 0; ph < output_height; ++ph) {
-            int hstart = ph * stride_height - padding_height;
-            int hend = std::min(hstart + ksize_height, input_height);
-            hstart = std::max(hstart, 0);
-            for (int pw = 0; pw < output_width; ++pw) {
-              int wstart = pw * stride_width - padding_width;
-              int wend = std::min(wstart + ksize_width, input_width);
-              wstart = std::max(wstart, 0);
-              bool stop = false;
-              for (int d = dstart; d < dend && !stop; ++d) {
-                for (int h = hstart; h < hend && !stop; ++h) {
-                  for (int w = wstart; w < wend && !stop; ++w) {
-                    int input_idx = (d * input_height + h) * input_width + w;
-                    int output_idx =
-                        (pd * output_height + ph) * output_width + pw;
-
-                    if (input_data[input_idx] == output_data[output_idx]) {
-                      input_grad_data[input_idx] +=
-                          output_grad_data[output_idx];
-                      stop = true;
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-        input_data += input_stride;
-        output_data += output_stride;
-        input_grad_data += input_stride;
-        output_grad_data += output_stride;
-      }
-    }
-  }
-};
-
-template class MaxPool3dGradFunctor<platform::CPUDeviceContext, float>;
-template class MaxPool3dGradFunctor<platform::CPUDeviceContext, double>;
-
-template class Pool3dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::MaxPool<float>, float>;
-template class Pool3dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::AvgPool<float>, float>;
-template class Pool3dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
-                                 float>;
-template class Pool3dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<float>,
-                                 float>;
-template class Pool3dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::MaxPool<double>, double>;
-template class Pool3dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::AvgPool<double>, double>;
-template class Pool3dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
-                                 double>;
-template class Pool3dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<double>,
-                                 double>;
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <typename T1, typename T2>
-class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-    const int input_stride = input_height * input_width;
-    const int output_stride = output_height * output_width;
-
-    const T1* input_data = input.data<T1>();
-    T1* output_data = output->mutable_data<T1>(context.GetPlace());
-    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
-
-    int hstart, hend;
-    int wstart, wend;
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int ph = 0; ph < output_height; ++ph) {
-          if (adaptive) {
-            hstart = AdaptStartIndex(ph, input_height, output_height);
-            hend = AdaptEndIndex(ph, input_height, output_height);
-          } else {
-            hstart = ph * stride_height - padding_height;
-            hend = std::min(hstart + ksize_height, input_height);
-            hstart = std::max(hstart, 0);
-          }
-          for (int pw = 0; pw < output_width; ++pw) {
-            if (adaptive) {
-              wstart = AdaptStartIndex(pw, input_width, output_width);
-              wend = AdaptEndIndex(pw, input_width, output_width);
-            } else {
-              wstart = pw * stride_width - padding_width;
-              wend = std::min(wstart + ksize_width, input_width);
-              wstart = std::max(wstart, 0);
-            }
-
-            T1 ele = static_cast<T1>(-FLT_MAX);
-            int index = -1;
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                if (ele < input_data[h * input_width + w]) {
-                  ele = input_data[h * input_width + w];
-                  index = h * input_width + w;
-                }
-              }
-            }
-            output_data[ph * output_width + pw] = ele;
-            mask_data[ph * output_width + pw] = index;
-          }
-        }
-        // offset
-        input_data += input_stride;
-        output_data += output_stride;
-        mask_data += output_stride;
-      }
-    }
-  }
-};
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <typename T1, typename T2>
-class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input_grad->dims()[0];
-    const int input_height = input_grad->dims()[2];
-    const int input_width = input_grad->dims()[3];
-    const int output_channels = output_grad.dims()[1];
-    const int output_height = output_grad.dims()[2];
-    const int output_width = output_grad.dims()[3];
-    const int input_stride = input_height * input_width;
-    const int output_stride = output_height * output_width;
-
-    const T2* mask_data = mask.data<T2>();
-    const T1* output_grad_data = output_grad.data<T1>();
-    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
-
-    for (int n = 0; n < batch_size; ++n) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int ph = 0; ph < output_height; ++ph) {
-          for (int pw = 0; pw < output_width; ++pw) {
-            const int output_idx = ph * output_width + pw;
-            const int input_idx = static_cast<int>(mask_data[output_idx]);
-            input_grad_data[input_idx] += output_grad_data[output_idx];
-          }
-        }
-        // offset
-        input_grad_data += input_stride;
-        output_grad_data += output_stride;
-        mask_data += output_stride;
-      }
-    }
-  }
-};
-
-template class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, float,
-                                         int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, float,
-                                             int>;
-template class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, double,
-                                         int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, double,
-                                             int>;
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <typename T1, typename T2>
-class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask) {
-    const int batch_size = input.dims()[0];
-    const int input_depth = input.dims()[2];
-    const int input_height = input.dims()[3];
-    const int input_width = input.dims()[4];
-    const int output_channels = output->dims()[1];
-    const int output_depth = output->dims()[2];
-    const int output_height = output->dims()[3];
-    const int output_width = output->dims()[4];
-    const int ksize_depth = ksize[0];
-    const int ksize_height = ksize[1];
-    const int ksize_width = ksize[2];
-    const int stride_depth = strides[0];
-    const int stride_height = strides[1];
-    const int stride_width = strides[2];
-    const int padding_depth = paddings[0];
-    const int padding_height = paddings[1];
-    const int padding_width = paddings[2];
-    const int input_stride = input_depth * input_height * input_width;
-    const int output_stride = output_depth * output_height * output_width;
-
-    const T1* input_data = input.data<T1>();
-    T1* output_data = output->mutable_data<T1>(context.GetPlace());
-    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
-
-    int dstart, dend;
-    int hstart, hend;
-    int wstart, wend;
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int pd = 0; pd < output_depth; ++pd) {
-          if (adaptive) {
-            dstart = AdaptStartIndex(pd, input_depth, output_depth);
-            dend = AdaptEndIndex(pd, input_depth, output_depth);
-          } else {
-            dstart = pd * stride_depth - padding_depth;
-            dend = std::min(dstart + ksize_depth, input_depth);
-            dstart = std::max(dstart, 0);
-          }
-          for (int ph = 0; ph < output_height; ++ph) {
-            if (adaptive) {
-              hstart = AdaptStartIndex(ph, input_height, output_height);
-              hend = AdaptEndIndex(ph, input_height, output_height);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              hend = std::min(hstart + ksize_height, input_height);
-              hstart = std::max(hstart, 0);
-            }
-            for (int pw = 0; pw < output_width; ++pw) {
-              if (adaptive) {
-                wstart = AdaptStartIndex(pw, input_width, output_width);
-                wend = AdaptEndIndex(pw, input_width, output_width);
-              } else {
-                wstart = pw * stride_width - padding_width;
-                wend = std::min(wstart + ksize_width, input_width);
-                wstart = std::max(wstart, 0);
-              }
-
-              int output_idx = (pd * output_height + ph) * output_width + pw;
-              T1 ele = static_cast<T1>(-FLT_MAX);
-              int index = -1;
-              for (int d = dstart; d < dend; ++d) {
-                for (int h = hstart; h < hend; ++h) {
-                  for (int w = wstart; w < wend; ++w) {
-                    int input_idx = (d * input_height + h) * input_width + w;
-                    if (ele < input_data[input_idx]) {
-                      index = input_idx;
-                      ele = input_data[input_idx];
-                    }
-                  }
-                }
-              }
-              output_data[output_idx] = ele;
-              mask_data[output_idx] = index;
-            }
-          }
-        }
-        // offset
-        input_data += input_stride;
-        output_data += output_stride;
-        mask_data += output_stride;
-      }
-    }
-  }
-};
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <typename T1, typename T2>
-class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input_grad->dims()[0];
-    const int input_depth = input_grad->dims()[2];
-    const int input_height = input_grad->dims()[3];
-    const int input_width = input_grad->dims()[4];
-    const int output_channels = output_grad.dims()[1];
-    const int output_depth = output_grad.dims()[2];
-    const int output_height = output_grad.dims()[3];
-    const int output_width = output_grad.dims()[4];
-    const int input_stride = input_depth * input_height * input_width;
-    const int output_stride = output_depth * output_height * output_width;
-
-    const T2* mask_data = mask.data<T2>();
-    const T1* output_grad_data = output_grad.data<T1>();
-    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
-
-    for (int n = 0; n < batch_size; ++n) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int pd = 0; pd < output_depth; ++pd) {
-          for (int ph = 0; ph < output_height; ++ph) {
-            for (int pw = 0; pw < output_width; ++pw) {
-              const int output_idx =
-                  (pd * output_height + ph) * output_width + pw;
-              const int input_idx = static_cast<int>(mask_data[output_idx]);
-              input_grad_data[input_idx] += output_grad_data[output_idx];
-            }
-          }
-        }
-        // offset
-        input_grad_data += input_stride;
-        output_grad_data += output_stride;
-        mask_data += output_stride;
-      }
-    }
-  }
-};
-
-template class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, float,
-                                         int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, float,
-                                             int>;
-template class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, double,
-                                         int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, double,
-                                             int>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu
deleted file mode 100644
index efce3f899a449c72ae3298f7ce0defb166ee8329..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/pooling.cu
+++ /dev/null
@@ -1,1219 +0,0 @@
-/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename PoolProcess, typename T>
-__global__ void KernelPool2D(const int nthreads, const T* input_data,
-                             const int channels, const int input_height,
-                             const int input_width, const int output_height,
-                             const int output_width, const int ksize_height,
-                             const int ksize_width, const int stride_height,
-                             const int stride_width, const int padding_height,
-                             const int padding_width, PoolProcess pool_process,
-                             bool exclusive, bool adaptive, T* output_data) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int pw = index % output_width;
-    int ph = (index / output_width) % output_height;
-    int c = (index / output_width / output_height) % channels;
-    int batch_idx = index / output_width / output_height / channels;
-
-    int hstart, hend;
-    int wstart, wend;
-    if (adaptive) {
-      hstart = AdaptStartIndex(ph, input_height, output_height);
-      hend = AdaptEndIndex(ph, input_height, output_height);
-
-      wstart = AdaptStartIndex(pw, input_width, output_width);
-      wend = AdaptEndIndex(pw, input_width, output_width);
-    } else {
-      hstart = ph * stride_height - padding_height;
-      hend = min(hstart + ksize_height, input_height);
-      hstart = max(hstart, 0);
-
-      wstart = pw * stride_width - padding_width;
-      wend = min(wstart + ksize_width, input_width);
-      wstart = max(wstart, 0);
-    }
-
-    input_data += (batch_idx * channels + c) * input_height * input_width;
-    T ele = pool_process.initial();
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        pool_process.compute(input_data[h * input_width + w], &ele);
-      }
-    }
-    int pool_size = (exclusive || adaptive) ? (hend - hstart) * (wend - wstart)
-                                            : ksize_height * ksize_width;
-    pool_process.finalize(static_cast<T>(pool_size), &ele);
-    output_data[index] = ele;
-  }
-}
-
-template <typename PoolProcess, typename T>
-__global__ void KernelPool2DGrad(
-    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, const int channels, const int input_height,
-    const int input_width, const int output_height, const int output_width,
-    const int ksize_height, const int ksize_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int w_offset = index % input_width + padding_width;
-    int h_offset = (index / input_width) % input_height + padding_height;
-    int offsetC = (index / input_width / input_height) % channels;
-    int batch_idx = index / input_width / input_height / channels;
-
-    int phstart, phend;
-    int pwstart, pwend;
-    if (adaptive) {
-      phstart = h_offset * output_height / input_height;
-      phend =
-          min((h_offset + 1) * output_height / input_height + 1, output_height);
-      pwstart = w_offset * output_width / input_width;
-      pwend =
-          min((w_offset + 1) * output_width / input_width + 1, output_width);
-    } else {
-      phstart = (h_offset < ksize_height)
-                    ? 0
-                    : (h_offset - ksize_height) / stride_height + 1;
-      pwstart = (w_offset < ksize_width)
-                    ? 0
-                    : (w_offset - ksize_width) / stride_width + 1;
-      phend = min(h_offset / stride_height + 1, output_height);
-      pwend = min(w_offset / stride_width + 1, output_width);
-    }
-    T gradient = 0;
-    T input = input_data[index];
-    int output_idx =
-        (batch_idx * channels + offsetC) * output_height * output_width;
-    output_data += output_idx;
-    output_grad += output_idx;
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        int pool_size;
-        if (adaptive) {
-          pool_size = static_cast<int>(ceil(static_cast<double>(input_height) /
-                                            ksize_height)) *
-                      static_cast<int>(
-                          ceil(static_cast<double>(input_width) / ksize_width));
-        } else {
-          int hstart = ph * stride_height - padding_height;
-          int wstart = pw * stride_width - padding_width;
-          int hend = min(hstart + ksize_height, input_height);
-          int wend = min(wstart + ksize_width, input_width);
-          hstart = max(hstart, 0);
-          wstart = max(wstart, 0);
-          pool_size = exclusive ? (hend - hstart) * (wend - wstart)
-                                : ksize_height * ksize_width;
-        }
-        int output_sub_idx = ph * output_width + pw;
-        pool_process.compute(input, output_data[output_sub_idx],
-                             output_grad[output_sub_idx],
-                             static_cast<T>(1.0 / pool_size), &gradient);
-      }
-    }
-    input_grad[index] = gradient;
-  }
-}
-
-template <typename T>
-__global__ void KernelMaxPool2DGrad(
-    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, const int channels, const int input_height,
-    const int input_width, const int output_height, const int output_width,
-    const int ksize_height, const int ksize_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    T* input_grad) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int pw = index % output_width;
-    int ph = (index / output_width) % output_height;
-    int c = (index / output_width / output_height) % channels;
-    int batch_idx = index / output_width / output_height / channels;
-
-    int hstart = ph * stride_height - padding_height;
-    int hend = min(hstart + ksize_height, input_height);
-    hstart = max(hstart, 0);
-
-    int wstart = pw * stride_width - padding_width;
-    int wend = min(wstart + ksize_width, input_width);
-    wstart = max(wstart, 0);
-
-    input_data += (batch_idx * channels + c) * input_height * input_width;
-    input_grad += (batch_idx * channels + c) * input_height * input_width;
-
-    T ele = output_data[index];
-    int maxIndex = -1;
-    bool stop = false;
-    for (int h = hstart; h < hend && !stop; ++h) {
-      for (int w = wstart; w < wend && !stop; ++w) {
-        if (ele == input_data[h * input_width + w]) {
-          maxIndex = h * input_width + w;
-          stop = true;
-        }
-      }
-    }
-
-    if (maxIndex != -1) {
-      // atomic add
-      platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]);
-    }
-  }
-}
-
-template <typename PoolProcess, typename T>
-void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
-    const T* input, const std::vector<int>& input_shape,
-    const std::vector<int>& output_shape, const std::vector<int>& ksize,
-    const std::vector<int>& strides, const std::vector<int>& paddings,
-    PoolProcess pool_compute, bool exclusive, T* output, cudaStream_t stream) {
-  const int batch_size = input_shape[0];
-  const int input_channels = input_shape[1];
-  const int input_height = input_shape[2];
-  const int input_width = input_shape[3];
-  const int output_channels = output_shape[1];
-  const int output_height = output_shape[2];
-  const int output_width = output_shape[3];
-  const int ksize_height = ksize[0];
-  const int ksize_width = ksize[1];
-  const int stride_height = strides[0];
-  const int stride_width = strides[1];
-  const int padding_height = paddings[0];
-  const int padding_width = paddings[1];
-
-  int nthreads = batch_size * output_channels * output_height * output_width;
-  int blocks = (nthreads + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-
-  KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(
-      nthreads, input, input_channels, input_height, input_width, output_height,
-      output_width, ksize_height, ksize_width, stride_height, stride_width,
-      padding_height, padding_width, pool_compute, exclusive, false, output);
-}
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <typename PoolProcess, typename T>
-class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, bool adaptive, framework::Tensor* output) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-
-    int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_height, input_width,
-        output_height, output_width, ksize_height, ksize_width, stride_height,
-        stride_width, padding_height, padding_width, pool_process, exclusive,
-        adaptive, output_data);
-  }
-};
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <typename PoolProcess, typename T>
-class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, bool adaptive,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    int nthreads = batch_size * input_channels * input_height * input_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelPool2DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_height, input_width, output_height, output_width, ksize_height,
-        ksize_width, stride_height, stride_width, padding_height, padding_width,
-        pool_process, exclusive, adaptive, input_grad_data);
-  }
-};
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <typename T>
-class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_height, input_width, output_height, output_width, ksize_height,
-        ksize_width, stride_height, stride_width, padding_height, padding_width,
-        input_grad_data);
-  }
-};
-
-template class Pool2dDirectCUDAFunctor<paddle::operators::math::MaxPool<float>,
-                                       float>;
-template class Pool2dDirectCUDAFunctor<paddle::operators::math::AvgPool<float>,
-                                       float>;
-
-template class MaxPool2dGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxPool2dGradFunctor<platform::CUDADeviceContext, double>;
-
-template class Pool2dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::MaxPool<float>, float>;
-template class Pool2dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::AvgPool<float>, float>;
-template class Pool2dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
-                                 float>;
-template class Pool2dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<float>,
-                                 float>;
-template class Pool2dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::MaxPool<double>, double>;
-template class Pool2dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::AvgPool<double>, double>;
-template class Pool2dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
-                                 double>;
-template class Pool2dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<double>,
-                                 double>;
-
-template <typename PoolProcess, typename T>
-__global__ void KernelPool3D(
-    const int nthreads, const T* input_data, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int pw = index % output_width;
-    int ph = (index / output_width) % output_height;
-    int pd = (index / output_width / output_height) % output_depth;
-    int c = (index / output_width / output_height / output_depth) % channels;
-    int batch_idx =
-        index / output_width / output_height / output_depth / channels;
-
-    int dstart, dend;
-    int hstart, hend;
-    int wstart, wend;
-    if (adaptive) {
-      dstart = AdaptStartIndex(pd, input_depth, output_depth);
-      dend = AdaptEndIndex(pd, input_depth, output_depth);
-
-      hstart = AdaptStartIndex(ph, input_height, output_height);
-      hend = AdaptEndIndex(ph, input_height, output_height);
-
-      wstart = AdaptStartIndex(pw, input_width, output_width);
-      wend = AdaptEndIndex(pw, input_width, output_width);
-    } else {
-      dstart = pd * stride_depth - padding_depth;
-      hstart = ph * stride_height - padding_height;
-      wstart = pw * stride_width - padding_width;
-      dend = min(dstart + ksize_depth, input_depth);
-      hend = min(hstart + ksize_height, input_height);
-      wend = min(wstart + ksize_width, input_width);
-      dstart = max(dstart, 0);
-      hstart = max(hstart, 0);
-      wstart = max(wstart, 0);
-    }
-    T ele = pool_process.initial();
-    input_data +=
-        (batch_idx * channels + c) * input_depth * input_height * input_width;
-    for (int d = dstart; d < dend; ++d) {
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          pool_process.compute(
-              input_data[(d * input_height + h) * input_width + w], &ele);
-        }
-      }
-    }
-    int pool_size = (exclusive || adaptive)
-                        ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                        : ksize_depth * ksize_height * ksize_width;
-    pool_process.finalize(static_cast<T>(pool_size), &ele);
-    output_data[index] = ele;
-  }
-}
-
-template <typename PoolProcess, typename T>
-__global__ void KernelPool3DGrad(
-    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, const int channels, const int input_depth,
-    const int input_height, const int input_width, const int output_depth,
-    const int output_height, const int output_width, const int ksize_depth,
-    const int ksize_height, const int ksize_width, const int stride_depth,
-    const int stride_height, const int stride_width, const int padding_depth,
-    const int padding_height, const int padding_width, PoolProcess pool_process,
-    bool exclusive, bool adaptive, T* input_grad) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int w_offset = index % input_width + padding_width;
-    int h_offset = (index / input_width) % input_height + padding_height;
-    int d_offset =
-        (index / input_width / input_height) % input_depth + padding_depth;
-    int offsetC = (index / input_width / input_height / input_depth) % channels;
-    int batch_idx = index / input_width / input_height / input_depth / channels;
-
-    int pdstart, pdend;
-    int phstart, phend;
-    int pwstart, pwend;
-    if (adaptive) {
-      pdstart = d_offset * output_depth / input_depth;
-      pdend =
-          min((d_offset + 1) * output_depth / input_depth + 1, output_depth);
-      phstart = h_offset * output_height / input_height;
-      phend =
-          min((h_offset + 1) * output_height / input_height + 1, output_height);
-      pwstart = w_offset * output_width / input_width;
-      pwend =
-          min((w_offset + 1) * output_width / input_width + 1, output_width);
-    } else {
-      pdstart = (d_offset < ksize_depth)
-                    ? 0
-                    : (d_offset - ksize_depth) / stride_depth + 1;
-      phstart = (h_offset < ksize_height)
-                    ? 0
-                    : (h_offset - ksize_height) / stride_height + 1;
-      pwstart = (w_offset < ksize_width)
-                    ? 0
-                    : (w_offset - ksize_width) / stride_width + 1;
-      pdend = min((d_offset) / stride_depth + 1, output_depth);
-      phend = min((h_offset) / stride_height + 1, output_height);
-      pwend = min((w_offset) / stride_width + 1, output_width);
-    }
-
-    T gradient = 0;
-    T input = input_data[index];
-    int output_idx = (batch_idx * channels + offsetC) * output_depth *
-                     output_height * output_width;
-    output_data += output_idx;
-    output_grad += output_idx;
-
-    for (int pd = pdstart; pd < pdend; ++pd) {
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          // figure out the pooling size
-          int pool_size;
-          if (adaptive) {
-            pool_size =
-                static_cast<int>(
-                    ceil(static_cast<double>(input_depth) / ksize_depth)) *
-                static_cast<int>(
-                    ceil(static_cast<double>(input_height) / ksize_height)) *
-                static_cast<int>(
-                    ceil(static_cast<double>(input_width) / ksize_width));
-          } else {
-            int dstart = pd * stride_depth - padding_depth;
-            int hstart = ph * stride_height - padding_height;
-            int wstart = pw * stride_width - padding_width;
-            int dend = min(dstart + ksize_depth, input_depth);
-            int hend = min(hstart + ksize_height, input_height);
-            int wend = min(wstart + ksize_width, input_width);
-            dstart = max(dstart, 0);
-            hstart = max(hstart, 0);
-            wstart = max(wstart, 0);
-            pool_size =
-                exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                          : ksize_depth * ksize_height * ksize_width;
-          }
-          int output_sub_idx = (pd * output_height + ph) * output_width + pw;
-          pool_process.compute(input, output_data[output_sub_idx],
-                               output_grad[output_sub_idx],
-                               static_cast<T>(1.0 / pool_size), &gradient);
-        }
-      }
-    }
-    input_grad[index] = gradient;
-  }
-}
-
-template <typename T>
-__global__ void KernelMaxPool3DGrad(
-    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, const int channels, const int input_depth,
-    const int input_height, const int input_width, const int output_depth,
-    const int output_height, const int output_width, const int ksize_depth,
-    const int ksize_height, const int ksize_width, const int stride_depth,
-    const int stride_height, const int stride_width, const int padding_depth,
-    const int padding_height, const int padding_width, T* input_grad) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int pw = index % output_width;
-    int ph = (index / output_width) % output_height;
-    int pd = (index / output_width / output_height) % output_depth;
-    int c = (index / output_width / output_height / output_depth) % channels;
-    int batch_idx =
-        index / output_width / output_height / output_depth / channels;
-    int dstart = pd * stride_depth - padding_depth;
-    int hstart = ph * stride_height - padding_height;
-    int wstart = pw * stride_width - padding_width;
-    int dend = min(dstart + ksize_depth, input_depth);
-    int hend = min(hstart + ksize_height, input_height);
-    int wend = min(wstart + ksize_width, input_width);
-    dstart = max(dstart, 0);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    T ele = output_data[index];
-    bool stop = false;
-    int maxIdx = -1;
-    input_data +=
-        (batch_idx * channels + c) * input_depth * input_height * input_width;
-    input_grad +=
-        (batch_idx * channels + c) * input_depth * input_height * input_width;
-
-    for (int d = dstart; d < dend && !stop; ++d) {
-      for (int h = hstart; h < hend && !stop; ++h) {
-        for (int w = wstart; w < wend && !stop; ++w) {
-          if (ele == input_data[(d * input_height + h) * input_width + w]) {
-            stop = true;
-            maxIdx = (d * input_height + h) * input_width + w;
-          }
-        }
-      }
-    }
-    if (maxIdx != -1) {
-      // atomic add
-      platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
-    }
-  }
-}
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <typename PoolProcess, class T>
-class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, bool adaptive, framework::Tensor* output) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_depth = input.dims()[2];
-    const int input_height = input.dims()[3];
-    const int input_width = input.dims()[4];
-    const int output_channels = output->dims()[1];
-    const int output_depth = output->dims()[2];
-    const int output_height = output->dims()[3];
-    const int output_width = output->dims()[4];
-    const int ksize_depth = ksize[0];
-    const int ksize_height = ksize[1];
-    const int ksize_width = ksize[2];
-    const int stride_depth = strides[0];
-    const int stride_height = strides[1];
-    const int stride_width = strides[2];
-    const int padding_depth = paddings[0];
-    const int padding_height = paddings[1];
-    const int padding_width = paddings[2];
-
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-
-    int nthreads = batch_size * output_channels * output_depth * output_height *
-                   output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_depth, input_height,
-        input_width, output_depth, output_height, output_width, ksize_depth,
-        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
-        padding_depth, padding_height, padding_width, pool_process, exclusive,
-        adaptive, output_data);
-  }
-};
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <typename PoolProcess, class T>
-class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, bool adaptive,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_depth = input.dims()[2];
-    const int input_height = input.dims()[3];
-    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
-    const int ksize_depth = ksize[0];
-    const int ksize_height = ksize[1];
-    const int ksize_width = ksize[2];
-    const int stride_depth = strides[0];
-    const int stride_height = strides[1];
-    const int stride_width = strides[2];
-    const int padding_depth = paddings[0];
-    const int padding_height = paddings[1];
-    const int padding_width = paddings[2];
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    int nthreads =
-        batch_size * input_channels * input_depth * input_height * input_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelPool3DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width, pool_process, exclusive, adaptive, input_grad_data);
-  }
-};
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <class T>
-class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_depth = input.dims()[2];
-    const int input_height = input.dims()[3];
-    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
-    const int ksize_depth = ksize[0];
-    const int ksize_height = ksize[1];
-    const int ksize_width = ksize[2];
-    const int stride_depth = strides[0];
-    const int stride_height = strides[1];
-    const int stride_width = strides[2];
-    const int padding_depth = paddings[0];
-    const int padding_height = paddings[1];
-    const int padding_width = paddings[2];
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    int nthreads = batch_size * output_channels * output_depth * output_height *
-                   output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxPool3DGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width, input_grad_data);
-  }
-};
-
-template class MaxPool3dGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxPool3dGradFunctor<platform::CUDADeviceContext, double>;
-
-template class Pool3dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::MaxPool<float>, float>;
-template class Pool3dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::AvgPool<float>, float>;
-template class Pool3dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
-                                 float>;
-template class Pool3dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<float>,
-                                 float>;
-template class Pool3dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::MaxPool<double>, double>;
-template class Pool3dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::AvgPool<double>, double>;
-template class Pool3dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
-                                 double>;
-template class Pool3dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<double>,
-                                 double>;
-
-template <typename T1, typename T2>
-__global__ void KernelMaxPool2dWithIdx(
-    const int nthreads, const T1* input_data, const int channels,
-    const int input_height, const int input_width, const int output_height,
-    const int output_width, const int ksize_height, const int ksize_width,
-    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width, bool adaptive, T1* output_data, T2* mask_data) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int pw = index % output_width;
-    int ph = (index / output_width) % output_height;
-    int c = (index / output_width / output_height) % channels;
-    int batch_idx = index / output_width / output_height / channels;
-
-    int hstart, hend;
-    int wstart, wend;
-    if (adaptive) {
-      hstart = AdaptStartIndex(ph, input_height, output_height);
-      hend = AdaptEndIndex(ph, input_height, output_height);
-
-      wstart = AdaptStartIndex(pw, input_width, output_width);
-      wend = AdaptEndIndex(pw, input_width, output_width);
-    } else {
-      hstart = ph * stride_height - padding_height;
-      hend = min(hstart + ksize_height, input_height);
-      hstart = max(hstart, 0);
-
-      wstart = pw * stride_width - padding_width;
-      wend = min(wstart + ksize_width, input_width);
-      wstart = max(wstart, 0);
-    }
-
-    input_data += (batch_idx * channels + c) * input_height * input_width;
-    T1 ele = -FLT_MAX;
-    int max_index = -1;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        int input_index = h * input_width + w;
-        if (ele < input_data[input_index]) {
-          max_index = input_index;
-          ele = input_data[input_index];
-        }
-      }
-    }
-    output_data[index] = ele;
-    mask_data[index] = max_index;
-  }
-}
-
-template <typename T1, typename T2>
-__global__ void KernelMaxPool2DWithIdxGrad(
-    const int nthreads, const T1* output_grad, const T2* mask_data,
-    const int channels, const int input_height, const int input_width,
-    const int output_height, const int output_width, const int ksize_height,
-    const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, bool adaptive,
-    T1* input_grad) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int w_offset = index % input_width;
-    int h_offset = (index / input_width) % input_height;
-    int offsetC = (index / input_width / input_height) % channels;
-    int batch_idx = index / input_width / input_height / channels;
-
-    int phstart, phend;
-    int pwstart, pwend;
-    if (adaptive) {
-      phstart = h_offset * output_height / input_height;
-      phend =
-          min((h_offset + 1) * output_height / input_height + 1, output_height);
-      pwstart = w_offset * output_width / input_width;
-      pwend =
-          min((w_offset + 1) * output_width / input_width + 1, output_width);
-    } else {
-      phstart =
-          (h_offset + padding_height < ksize_height)
-              ? 0
-              : (h_offset + padding_height - ksize_height) / stride_height + 1;
-      pwstart =
-          (w_offset + padding_width < ksize_width)
-              ? 0
-              : (w_offset + padding_width - ksize_width) / stride_width + 1;
-      phend =
-          min((h_offset + padding_height) / stride_height + 1, output_height);
-      pwend = min((w_offset + padding_width) / stride_width + 1, output_width);
-    }
-
-    T1 gradient = 0;
-    int input_current_featuremap_idx = h_offset * input_width + w_offset;
-    int output_idx =
-        (batch_idx * channels + offsetC) * output_height * output_width;
-
-    mask_data += output_idx;
-    output_grad += output_idx;
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        if (mask_data[ph * output_width + pw] == input_current_featuremap_idx)
-          gradient += output_grad[ph * output_width + pw];
-      }
-    }
-    input_grad[index] = gradient;
-  }
-}
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <typename T1, typename T2>
-class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const T1* input_data = input.data<T1>();
-    T1* output_data = output->mutable_data<T1>(context.GetPlace());
-    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
-
-    int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_height, input_width,
-        output_height, output_width, ksize_height, ksize_width, stride_height,
-        stride_width, padding_height, padding_width, adaptive, output_data,
-        mask_data);
-  }
-};
-
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <typename T1, typename T2>
-class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input_grad->dims()[0];
-    const int input_channels = input_grad->dims()[1];
-    const int input_height = input_grad->dims()[2];
-    const int input_width = input_grad->dims()[3];
-    const int output_height = output_grad.dims()[2];
-    const int output_width = output_grad.dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const T2* mask_data = mask.data<T2>();
-    const T1* output_grad_data = output_grad.data<T1>();
-    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
-
-    int nthreads = batch_size * input_channels * input_height * input_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxPool2DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, mask_data, input_channels, input_height,
-        input_width, output_height, output_width, ksize_height, ksize_width,
-        stride_height, stride_width, padding_height, padding_width, adaptive,
-        input_grad_data);
-  }
-};
-
-template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, float,
-                                         int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, float,
-                                             int>;
-template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, double,
-                                         int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext,
-                                             double, int>;
-
-template <typename T1, typename T2>
-__global__ void KernelMaxPool3DWithIdx(
-    const int nthreads, const T1* input_data, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    bool adaptive, T1* output_data, T2* mask_data) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int pw = index % output_width;
-    int ph = (index / output_width) % output_height;
-    int pd = (index / output_width / output_height) % output_depth;
-    int c = (index / output_width / output_height / output_depth) % channels;
-    int batch_idx =
-        index / output_width / output_height / output_depth / channels;
-
-    int dstart, dend;
-    int hstart, hend;
-    int wstart, wend;
-    if (adaptive) {
-      dstart = AdaptStartIndex(pd, input_depth, output_depth);
-      dend = AdaptEndIndex(pd, input_depth, output_depth);
-
-      hstart = AdaptStartIndex(ph, input_height, output_height);
-      hend = AdaptEndIndex(ph, input_height, output_height);
-
-      wstart = AdaptStartIndex(pw, input_width, output_width);
-      wend = AdaptEndIndex(pw, input_width, output_width);
-    } else {
-      dstart = pd * stride_depth - padding_depth;
-      hstart = ph * stride_height - padding_height;
-      wstart = pw * stride_width - padding_width;
-      dend = min(dstart + ksize_depth, input_depth);
-      hend = min(hstart + ksize_height, input_height);
-      wend = min(wstart + ksize_width, input_width);
-      dstart = max(dstart, 0);
-      hstart = max(hstart, 0);
-      wstart = max(wstart, 0);
-    }
-
-    T1 ele = -FLT_MAX;
-    int max_index = -1;
-    input_data +=
-        (batch_idx * channels + c) * input_depth * input_height * input_width;
-
-    for (int d = dstart; d < dend; ++d) {
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          if (ele < input_data[(d * input_height + h) * input_width + w]) {
-            max_index = (d * input_height + h) * input_width + w;
-            ele = input_data[max_index];
-          }
-        }
-      }
-    }
-    output_data[index] = ele;
-    mask_data[index] = max_index;
-  }
-}
-
-template <typename T1, typename T2>
-__global__ void KernelMaxPool3DWithIdxGrad(
-    const int nthreads, const T1* output_grad, const T2* mask,
-    const int channels, const int input_depth, const int input_height,
-    const int input_width, const int output_depth, const int output_height,
-    const int output_width, const int ksize_depth, const int ksize_height,
-    const int ksize_width, const int stride_depth, const int stride_height,
-    const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width, bool adaptive, T1* input_grad) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-       index += blockDim.x * gridDim.x) {
-    int w_offset = index % input_width;
-    int h_offset = (index / input_width) % input_height;
-    int d_offset = (index / input_width / input_height) % input_depth;
-    int offsetC = (index / input_width / input_height / input_depth) % channels;
-    int batch_idx = index / input_width / input_height / input_depth / channels;
-
-    int pdstart, pdend;
-    int phstart, phend;
-    int pwstart, pwend;
-    if (adaptive) {
-      pdstart = d_offset * output_depth / input_depth;
-      pdend =
-          min((d_offset + 1) * output_depth / input_depth + 1, output_depth);
-      phstart = h_offset * output_height / input_height;
-      phend =
-          min((h_offset + 1) * output_height / input_height + 1, output_height);
-      pwstart = w_offset * output_width / input_width;
-      pwend =
-          min((w_offset + 1) * output_width / input_width + 1, output_width);
-    } else {
-      pdstart =
-          (d_offset + padding_depth < ksize_depth)
-              ? 0
-              : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
-      phstart =
-          (h_offset + padding_height < ksize_height)
-              ? 0
-              : (h_offset + padding_height - ksize_height) / stride_height + 1;
-      pwstart =
-          (w_offset + padding_width < ksize_width)
-              ? 0
-              : (w_offset + padding_width - ksize_width) / stride_width + 1;
-      pdend = min((d_offset + padding_depth) / stride_depth + 1, output_depth);
-      phend =
-          min((h_offset + padding_height) / stride_height + 1, output_height);
-      pwend = min((w_offset + padding_width) / stride_width + 1, output_width);
-    }
-
-    T1 gradient = 0;
-    int input_current_feature_map_idx =
-        (d_offset * input_height + h_offset) * input_width + w_offset;
-    int output_idx = (batch_idx * channels + offsetC) * output_depth *
-                     output_height * output_width;
-    mask += output_idx;
-    output_grad += output_idx;
-
-    for (int pd = pdstart; pd < pdend; ++pd) {
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (mask[(pd * output_height + ph) * output_width + pw] ==
-              input_current_feature_map_idx)
-            gradient +=
-                output_grad[(pd * output_height + ph) * output_width + pw];
-        }
-      }
-    }
-    input_grad[index] = gradient;
-  }
-}
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <typename T1, typename T2>
-class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_depth = input.dims()[2];
-    const int input_height = input.dims()[3];
-    const int input_width = input.dims()[4];
-    const int output_channels = output->dims()[1];
-    const int output_depth = output->dims()[2];
-    const int output_height = output->dims()[3];
-    const int output_width = output->dims()[4];
-    const int ksize_depth = ksize[0];
-    const int ksize_height = ksize[1];
-    const int ksize_width = ksize[2];
-    const int stride_depth = strides[0];
-    const int stride_height = strides[1];
-    const int stride_width = strides[2];
-    const int padding_depth = paddings[0];
-    const int padding_height = paddings[1];
-    const int padding_width = paddings[2];
-
-    const T1* input_data = input.data<T1>();
-    T1* output_data = output->mutable_data<T1>(context.GetPlace());
-    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
-
-    int nthreads = batch_size * output_channels * output_depth * output_height *
-                   output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxPool3DWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_depth, input_height,
-        input_width, output_depth, output_height, output_width, ksize_depth,
-        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
-        padding_depth, padding_height, padding_width, adaptive, output_data,
-        mask_data);
-  }
-};
-
-/*
- * All tensors are in NCDHW format.
- * Ksize, strides, paddings are three elements. These three elements represent
- * depth, height and width, respectively.
- */
-template <typename T1, typename T2>
-class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input_grad->dims()[0];
-    const int input_channels = input_grad->dims()[1];
-    const int input_depth = input_grad->dims()[2];
-    const int input_height = input_grad->dims()[3];
-    const int input_width = input_grad->dims()[4];
-    const int output_depth = output_grad.dims()[2];
-    const int output_height = output_grad.dims()[3];
-    const int output_width = output_grad.dims()[4];
-    const int ksize_depth = ksize[0];
-    const int ksize_height = ksize[1];
-    const int ksize_width = ksize[2];
-    const int stride_depth = strides[0];
-    const int stride_height = strides[1];
-    const int stride_width = strides[2];
-    const int padding_depth = paddings[0];
-    const int padding_height = paddings[1];
-    const int padding_width = paddings[2];
-
-    const T1* output_grad_data = output_grad.data<T1>();
-    const T2* mask_data = mask.data<T2>();
-    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
-
-    int nthreads =
-        batch_size * input_channels * input_depth * input_height * input_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxPool3DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, mask_data, input_channels, input_depth,
-        input_height, input_width, output_depth, output_height, output_width,
-        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
-        stride_width, padding_depth, padding_height, padding_width, adaptive,
-        input_grad_data);
-  }
-};
-
-template class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, float,
-                                         int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, float,
-                                             int>;
-template class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, double,
-                                         int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext,
-                                             double, int>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
deleted file mode 100644
index e1f8e6df1d19b519e48bff326bc1aa9548c96905..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/pooling.h
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/hostdevice.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * \brief Extracting simple operations from pooling.
- *        Both MaxPool and AvgPool need "initial", "compute" and "finalize"
- * operation.
- *        MaxPool initializes temp variable to the negative maximum to find the
- * maximum value in the pooling field.
- *        AvgPool initializes temp variable to the zero to accumulate all values
- * in pool pooling, and finally takes the average.
- *        MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
- */
-template <class T>
-class MaxPool {
- public:
-  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
-  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
-  DEVICE inline void finalize(const T& pool_field, T* y) {}
-};
-
-template <class T>
-class AvgPool {
- public:
-  DEVICE inline T initial() { return static_cast<T>(0); }
-  DEVICE inline void compute(const T& x, T* y) { *y += x; }
-  DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; }
-};
-
-template <class T>
-class MaxPoolGrad {
- public:
-  DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
-                             T* dx) {
-    *dx += dy * (x == y);
-  }
-};
-
-template <class T>
-class AvgPoolGrad {
- public:
-  DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
-                             T* dx) {
-    *dx += (scale * dy);
-  }
-};
-
-/* used for adaptive pool to calculate start and end index of each divided grid
- */
-HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) {
-  return static_cast<int>(
-      floor(static_cast<double>(ph * input_size) / output_size));
-}
-
-HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
-  return static_cast<int>(
-      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
-}
-
-/*
- * \brief Getting pooling results, and calculating gradient.
- *
- * In pool2d, all tensors are in NCHW format. Where N is batch size, C is the
- * number of channels, H and W is the height and width of feature.
- * In pool3d, all tensors are in NCDHW format. Where N is batch size, C is the
- * number of channels, D, H and W is the depth, height and width of feature.
- *
- * In max pooling, it is possible that the pooling region has multiple maximum
- * elements. In this case, we should compute the gradient of the first maximum
- * element.
- * This is different from average pooling. So we rewrite the max_pool_grad:
- * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
- */
-#ifdef PADDLE_WITH_CUDA
-template <typename PoolProcess, typename T>
-class Pool2dDirectCUDAFunctor {
- public:
-  void operator()(const T* input, const std::vector<int>& input_shape,
-                  const std::vector<int>& output_shape,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, T* output, cudaStream_t stream);
-};
-#endif
-
-template <typename DeviceContext, typename PoolProcess, typename T>
-class Pool2dFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, bool adaptive, framework::Tensor* output);
-};
-
-template <typename DeviceContext, typename PoolProcess, typename T>
-class Pool2dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, bool adaptive, framework::Tensor* input_grad);
-};
-
-template <typename DeviceContext, class T>
-class MaxPool2dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* input_grad);
-};
-
-template <typename DeviceContext, typename PoolProcess, typename T>
-class Pool3dFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, bool adaptive, framework::Tensor* output);
-};
-
-template <typename DeviceContext, typename PoolProcess, typename T>
-class Pool3dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, bool adaptive, framework::Tensor* input_grad);
-};
-
-template <typename DeviceContext, class T>
-class MaxPool3dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* input_grad);
-};
-
-/*
- * \brief Getting max pooling results and corresponding max index, and
- * calculating gradient.
- * In up-sampling-pooling, it is necessary to know max element index.
- * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
- * NCDHW format.
- */
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPool2dWithIndexFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask);
-};
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPool2dWithIndexGradFunctor {
- public:
-  void operator()(const DeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad);
-};
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPool3dWithIndexFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask);
-};
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPool3dWithIndexGradFunctor {
- public:
-  void operator()(const DeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu
deleted file mode 100644
index 701a802080f65ea32b95402682dc46362ccf0966..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/prelu.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/prelu.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-static const int CUDA_NUM_THREADS = 1024;
-static const int CUDA_MAX_NUM_BLOCKS = 65535;
-inline static int GET_NUM_BLOCKS(const int N) {
-  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
-}
-
-template <typename T>
-__global__ void PReluChannelWiseKernel(const T *input, const T *alpha,
-                                       T *output, int channel,
-                                       size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  const T *in = input + offset;
-  T *out = output + offset;
-  T scale = alpha[blockIdx.x % channel];
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    T x = in[i];
-    out[i] = (x > 0) ? x : scale * x;
-  }
-}
-
-template <typename T>
-__global__ void PReluElementWiseKernel(const T *input, const T *alpha,
-                                       T *output, size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  const T *in = input + offset;
-  const T *scale = alpha + offset;
-  T *out = output + offset;
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    T x = in[i];
-    out[i] = (x > 0) ? x : scale[i] * x;
-  }
-}
-
-template <typename T>
-__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
-                                  size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  const T *in = input + offset;
-  T scale = *alpha;
-  T *out = output + offset;
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    T x = in[i];
-    out[i] = (x > 0) ? x : scale * x;
-  }
-}
-
-template <typename T>
-static inline void PReluChannelWise(cudaStream_t stream, const T *input,
-                                    const T *alpha, T *output,
-                                    std::vector<int> input_shape) {
-  size_t unroll = input_shape[0] * input_shape[1];
-  size_t spatial_size = input_shape[2] * input_shape[3];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, input_shape[1], spatial_size);
-}
-
-template <typename T>
-static inline void PReluElementWise(cudaStream_t stream, const T *input,
-                                    const T *alpha, T *output,
-                                    std::vector<int> input_shape) {
-  size_t unroll = input_shape[0] * input_shape[1];
-  size_t spatial_size = input_shape[2] * input_shape[3];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, spatial_size);
-}
-
-template <typename T>
-static inline void PReluScalar(cudaStream_t stream, const T *input,
-                               const T *alpha, T *output,
-                               std::vector<int> input_shape) {
-  size_t unroll = input_shape[0] * input_shape[1];
-  size_t spatial_size = input_shape[2] * input_shape[3];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, spatial_size);
-}
-
-template <typename T>
-void PreluChannelWiseDirectCUDAFunctor<T>::operator()(
-    cudaStream_t stream, const T *input, const T *alpha, T *output,
-    std::vector<int> input_shape) {
-  size_t unroll = input_shape[0] * input_shape[1];
-  size_t spatial_size = input_shape[2] * input_shape[3];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, input_shape[1], spatial_size);
-}
-
-template <typename T>
-void PreluElementWiseDirectCUDAFunctor<T>::operator()(
-    cudaStream_t stream, const T *input, const T *alpha, T *output,
-    std::vector<int> input_shape) {
-  size_t unroll = input_shape[0] * input_shape[1];
-  size_t spatial_size = input_shape[2] * input_shape[3];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, spatial_size);
-}
-
-template <typename T>
-void PreluScalarDirectCUDAFunctor<T>::operator()(cudaStream_t stream,
-                                                 const T *input, const T *alpha,
-                                                 T *output,
-                                                 std::vector<int> input_shape) {
-  size_t unroll = input_shape[0] * input_shape[1];
-  size_t spatial_size = input_shape[2] * input_shape[3];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, spatial_size);
-}
-
-template class PreluChannelWiseDirectCUDAFunctor<float>;
-template class PreluChannelWiseDirectCUDAFunctor<double>;
-
-template class PreluElementWiseDirectCUDAFunctor<float>;
-template class PreluElementWiseDirectCUDAFunctor<double>;
-
-template class PreluScalarDirectCUDAFunctor<float>;
-template class PreluScalarDirectCUDAFunctor<double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h
deleted file mode 100644
index 3237c6d4cbf956aafb4046ea2ffa42efe62e7b28..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/prelu.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-#ifdef PADDLE_WITH_CUDA
-template <typename T>
-class PreluChannelWiseDirectCUDAFunctor {
- public:
-  void operator()(cudaStream_t stream, const T *input, const T *alpha,
-                  T *output, std::vector<int> input_shape);
-};
-
-template <typename T>
-class PreluElementWiseDirectCUDAFunctor {
- public:
-  void operator()(cudaStream_t stream, const T *input, const T *alpha,
-                  T *output, std::vector<int> input_shape);
-};
-
-template <typename T>
-class PreluScalarDirectCUDAFunctor {
- public:
-  void operator()(cudaStream_t stream, const T *input, const T *alpha,
-                  T *output, std::vector<int> input_shape);
-};
-#endif
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sample_prob.cc b/paddle/fluid/operators/math/sample_prob.cc
deleted file mode 100644
index 99aa318453eae161807353198a78e11085cd6237..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sample_prob.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sample_prob.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template class SampleWithProb<platform::CPUDeviceContext, float>;
-template class SampleWithProb<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu
deleted file mode 100644
index 6aabfb069454e8cd5ad14855bc9b1f08bcfe5db4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sample_prob.cu
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <thrust/random.h>
-#include <thrust/sort.h>
-#include <iostream>
-#include <vector>
-
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sample_prob.h"
-#include "paddle/fluid/operators/math/sampler.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-__device__ T gpu_adjust_prob(const T prob, const int num_samples,
-                             const int num_tries) {
-  if (num_samples == num_tries) {
-    return prob * num_samples;
-  } else {
-    return -expm1(num_tries * log1p(-prob));
-  }
-}
-
-class GPULogUniformSampler {
- public:
-  __device__ int64_t Sample(float random, const int range,
-                            const float log_range) const;
-  __device__ float Probability(int64_t value, const float log_range) const;
-};
-
-__device__ int64_t GPULogUniformSampler::Sample(float random, const int range,
-                                                const float log_range) const {
-  // Got Log Uniform distribution from uniform distribution by
-  // inverse_transform_sampling method
-  const int64_t value = static_cast<int64_t>(exp(random * log_range)) - 1;
-  // Mathematically, value should be <= range_, but might not be due to some
-  // floating point roundoff, so we mod by range_.
-  return value % range;
-}
-
-__device__ float GPULogUniformSampler::Probability(
-    int64_t value, const float log_range) const {
-  // Given f(x) = 1/[(x+1) * log_range_]
-  // The value's  probability  is integral of f(x) from value to (value + 1)
-  return (log((value + 2.0) / (value + 1.0))) / log_range;
-}
-
-template <typename T>
-__global__ void SamplingCondidate(
-    const size_t n, const int num_tries, const int range, const float log_range,
-    const int num_true, const std::size_t num_samples,
-    const int64_t* label_data, int64_t* samples_data, T* probabilities_data) {
-  const int num_sampled_classes = num_true + num_samples;
-
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = 0;
-  GPULogUniformSampler sampler;
-
-  for (; idx < n; idx += blockDim.x * gridDim.x) {
-    int col_idx = idx % num_sampled_classes;
-    int row_idx = idx / num_sampled_classes;
-    if (col_idx < num_true) {
-      samples_data[idx] = label_data[row_idx * num_true + col_idx];
-    } else {
-      samples_data[idx] = samples_data[col_idx];
-    }
-    probabilities_data[idx] = sampler.Probability(samples_data[idx], log_range);
-    probabilities_data[idx] =
-        gpu_adjust_prob(probabilities_data[idx], num_samples, num_tries);
-  }
-}
-
-template <typename T>
-int UniqSampler(const Sampler& sampler, const std::size_t num_samples,
-                int64_t* samples_data) {
-  // sample num_samles unique samples for an example, note that they are not
-  // all negative samples
-  std::unordered_set<int64_t> tmp_samples;
-  tmp_samples.clear();
-  int num_tries = 0;
-  int j = 0;
-  while (j < num_samples) {
-    ++num_tries;
-    auto v = sampler.Sample();
-    auto insert_ok = tmp_samples.insert(v).second;
-    if (!insert_ok) {
-      continue;
-    }
-    samples_data[j] = v;
-    ++j;
-  }
-  return num_tries;
-}
-
-template <typename T>
-void GPUSampleWithProb<T>::operator()(
-    const platform::CUDADeviceContext& context, const int seed,
-    const int dict_size, const bool uniq, const std::size_t num_samples,
-    const Tensor* L, Tensor* S, Tensor* P) {
-  // UNDERSTAND: dimension issues
-  const auto lbl_dim = L->dims();
-  const int batch_size = lbl_dim[0];
-  const int num_true = lbl_dim[1];
-  const int num_sampled_classes = num_true + num_samples;
-  framework::DDim ret_dim{batch_size, num_sampled_classes};
-
-  // UNDERSTAND: raw data view
-  const int64_t* label_data = L->data<int64_t>();
-  int64_t* samples_data = S->data<int64_t>();
-  T* probabilities_data = P->data<T>();
-
-  int s_size = num_samples;
-  framework::DDim s_dim{s_size};
-  Tensor s;
-  int64_t* s_data = s.mutable_data<int64_t>(s_dim, platform::CPUPlace());
-
-  math::LogUniformSampler sampler(dict_size, seed);
-
-  int range = dict_size;
-  float log_range = log(range + 1);
-
-  int num_tries = UniqSampler<T>(sampler, num_samples, s_data);
-  VLOG(1) << "num_tries: " << num_tries;
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(samples_data + num_true, s_data,
-                                         sizeof(int64_t) * num_samples,
-                                         cudaMemcpyHostToDevice));
-
-  int threads = 512;
-  const size_t size = batch_size * num_sampled_classes;
-  int grid = (batch_size * num_sampled_classes + threads - 1) / threads;
-  SamplingCondidate<T><<<grid, threads, 0, context.stream()>>>(
-      size, num_tries, range, log_range, num_true, num_samples, label_data,
-      samples_data, probabilities_data);
-}
-
-template class GPUSampleWithProb<float>;
-template class GPUSampleWithProb<double>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h
deleted file mode 100644
index e5a6d84cb2b0527c606e62a19ef02d669945ecb1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sample_prob.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <iostream>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/sampler.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using Tensor = framework::Tensor;
-
-/* UNDERSTAND: utility function to adjust probability for unique sampling,
-return whatever as it is if not using unique samping */
-template <typename T>
-static T adjust_prob(const T prob, const int num_samples, const int num_tries) {
-  if (num_samples == num_tries) {
-    return prob * num_samples;
-  } else {
-    return -expm1(num_tries * log1p(-prob));
-  }
-}
-
-template <typename DeviceContext, typename T>
-class SampleWithProb {
- public:
-  void operator()(const DeviceContext& context, const Sampler& sampler,
-                  const std::size_t num_samples, const Tensor* L, Tensor* S,
-                  Tensor* P) {
-    // UNDERSTAND: dimension issues
-    const auto lbl_dim = L->dims();
-    const int batch_size = lbl_dim[0];
-    const int num_true = lbl_dim[1];
-    const int num_sampled_classes = num_true + num_samples;
-    framework::DDim ret_dim{batch_size, num_sampled_classes};
-
-    // UNDERSTAND: raw data view
-    const int64_t* label_data = L->data<int64_t>();
-    int64_t* samples_data =
-        S->mutable_data<int64_t>(ret_dim, context.GetPlace());
-    T* probabilities_data = P->mutable_data<T>(ret_dim, context.GetPlace());
-
-    // temp sets for unique sampling
-    std::unordered_set<int64_t> tmp_samples;
-    int j = 0;  // column index
-    // add true labels, not that efficient
-    while (j < num_true) {
-      for (int i = 0; i < batch_size; ++i) {
-        auto samples_index = i * num_sampled_classes + j;
-        auto v = label_data[i * num_true + j];
-        samples_data[samples_index] = v;
-        probabilities_data[samples_index] = sampler.Probability(v);
-      }
-      ++j;
-    }
-
-    // sample num_samles unique samples for an example, note that they are not
-    // all negative samples
-    tmp_samples.clear();
-    int num_tries = 0;
-    while (j < num_sampled_classes) {
-      ++num_tries;
-      auto v = sampler.Sample();
-      auto insert_ok = tmp_samples.insert(v).second;
-      if (!insert_ok) {
-        continue;
-      }
-      auto p = sampler.Probability(v);
-      for (int i = 0; i < batch_size; ++i) {
-        auto samples_index = i * num_sampled_classes + j;
-        samples_data[samples_index] = v;
-        probabilities_data[samples_index] = p;
-      }
-      ++j;
-    }
-
-    // compute Q(y|x), because of unique sampling, probabilities need to be
-    // adjusted
-    for (int k = 0; k < num_sampled_classes; ++k) {
-      for (int i = 0; i < batch_size; ++i) {
-        auto samples_index = i * num_sampled_classes + k;
-        probabilities_data[samples_index] = adjust_prob(
-            probabilities_data[samples_index], num_samples, num_tries);
-      }
-    }
-  }
-};
-
-#ifdef PADDLE_WITH_CUDA
-template <typename T>
-class GPUSampleWithProb {
- public:
-  void operator()(const platform::CUDADeviceContext& context, const int seed,
-                  const int dict_size, const bool uniq,
-                  const std::size_t num_samples, const Tensor* L, Tensor* S,
-                  Tensor* P);
-};
-#endif
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc
deleted file mode 100644
index 238d9f2905058d267ffbee0669594920d7a9e031..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sampler.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sampler.h"
-#include <glog/logging.h>
-#include <iostream>
-#include <queue>
-#include <utility>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-Sampler::~Sampler() {}
-
-UniformSampler::UniformSampler(int64_t range, unsigned int seed)
-    : Sampler(range, seed), inv_range_(1.0 / (range + 1)) {
-  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
-  dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
-}
-
-int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); }
-
-float UniformSampler::Probability(int64_t value) const { return inv_range_; }
-
-LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed)
-    : Sampler(range, seed), log_range_(log(range + 1)) {
-  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
-  dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
-}
-
-int64_t LogUniformSampler::Sample() const {
-  // Got Log Uniform distribution from uniform distribution by
-  // inverse_transform_sampling method
-  // More details:
-  // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/
-  const int64_t value =
-      static_cast<int64_t>(exp((*dist_)(*random_engine_) * log_range_)) - 1;
-  // Mathematically, value should be <= range_, but might not be due to some
-  // floating point roundoff, so we mod by range_.
-  return value % range_;
-}
-
-float LogUniformSampler::Probability(int64_t value) const {
-  // Given f(x) = 1/[(x+1) * log_range_]
-  // The value's  probability  is integral of f(x) from value to (value + 1)
-  // More details:
-  // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler
-  return (log((value + 2.0) / (value + 1.0))) / log_range_;
-}
-
-CustomSampler::CustomSampler(int64_t range, const float *probabilities,
-                             const int *alias, const float *alias_probabilities,
-                             unsigned int seed)
-    : Sampler(range, seed) {
-  random_engine_ = std::make_shared<std::mt19937>(seed_);
-  real_dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
-  int_dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
-
-  alias_probs_ = alias_probabilities;
-  probs_ = probabilities;
-  alias_ = alias;
-}
-
-int64_t CustomSampler::Sample() const {
-  auto index = (*int_dist_)(*random_engine_);
-  auto p = (*real_dist_)(*random_engine_);
-  if (p > alias_probs_[index]) {
-    int alias = alias_[index];
-
-    if (alias == exceptional_val) {
-      LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val;
-      return index;
-    }
-
-    return alias;
-  } else {
-    return index;
-  }
-}
-
-float CustomSampler::Probability(int64_t value) const { return probs_[value]; }
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h
deleted file mode 100644
index 3fa5a7ae336a9be984324411b88570aea99c2c78..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sampler.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <random>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-// TODO(wanghaoshuang): Support for GPU
-
-/**
-* Sample integers from [0, range).
-*/
-class Sampler {
- public:
-  explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
-    //    PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0.");
-    if (seed == 0) {
-      std::random_device r;
-      seed_ = r();
-    } else {
-      seed_ = seed;
-    }
-  }
-
-  virtual ~Sampler();
-
-  // Sample a single value
-  virtual int64_t Sample() const = 0;
-
-  // The probability that a single call to Sample() returns the given value.
-  virtual float Probability(int64_t value) const = 0;
-
-  int64_t range() { return range_; }
-
- protected:
-  const int64_t range_;
-  unsigned int seed_;
-};
-
-/**
- * Sample integers from [0, range).
- * And the distribution function is:
- * P(x) = 1 / range
- */
-class UniformSampler : public Sampler {
- public:
-  explicit UniformSampler(int64_t range, unsigned int seed = 0UL);
-
-  ~UniformSampler() override {}
-
-  int64_t Sample() const override;
-
-  float Probability(int64_t value) const override;
-
- private:
-  const float inv_range_;
-  std::shared_ptr<std::mt19937_64> random_engine_;
-  std::shared_ptr<std::uniform_int_distribution<>> dist_;
-};
-
-/**
- * Sample integers from [0, range).
- * And the distribution function is:
- * P(x) = (1/ln(range+1)) * ln(1 + 1/(x + 1))
- */
-class LogUniformSampler : public Sampler {
- public:
-  explicit LogUniformSampler(int64_t range, unsigned int seed = 0UL);
-
-  ~LogUniformSampler() override {}
-
-  int64_t Sample() const override;
-
-  float Probability(int64_t value) const override;
-
- private:
-  const float log_range_;
-  std::shared_ptr<std::mt19937_64> random_engine_;
-  std::shared_ptr<std::uniform_real_distribution<>> dist_;
-};
-
-/**
- * Sample integers from [0, range) from custom distribution.
- */
-class CustomSampler : public Sampler {
- public:
-  explicit CustomSampler(int64_t range, const float* probabilities,
-                         const int* alias, const float* alias_probabilities,
-                         unsigned int seed = 0UL);
-
-  ~CustomSampler() override {}
-
-  int64_t Sample() const override;
-
-  float Probability(int64_t value) const override;
-
- private:
-  const float* alias_probs_;
-  const int* alias_;
-  const float* probs_;
-  const int exceptional_val = -1;
-  std::shared_ptr<std::mt19937> random_engine_;
-  std::shared_ptr<std::uniform_real_distribution<>> real_dist_;
-  std::shared_ptr<std::uniform_int_distribution<>> int_dist_;
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
deleted file mode 100644
index f73c9bb9dc57345c74678813cc5a7656ca29e134..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ /dev/null
@@ -1,552 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <set>
-#include <unordered_map>
-
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-template <typename T>
-struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  const framework::SelectedRows& input2,
-                  framework::SelectedRows* output) {
-    auto in1_height = input1.height();
-    PADDLE_ENFORCE_EQ(in1_height, input2.height());
-    output->set_height(in1_height);
-
-    auto& in1_rows = input1.rows();
-    auto& in2_rows = input2.rows();
-    std::vector<int64_t> out_rows;
-    out_rows.reserve(in1_rows.size() + in2_rows.size());
-
-    // concat rows
-    out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end());
-    out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end());
-    output->set_rows(out_rows);
-
-    auto* out_value = output->mutable_value();
-    auto& in1_value = input1.value();
-    auto& in2_value = input2.value();
-
-    auto in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
-    PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
-
-    auto in1_place = input1.place();
-    PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
-    auto in2_place = input2.place();
-    PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
-    auto out_place = context.GetPlace();
-    PADDLE_ENFORCE(platform::is_cpu_place(out_place));
-
-    auto* out_data = out_value->data<T>();
-    auto* in1_data = in1_value.data<T>();
-    memory::Copy(boost::get<platform::CPUPlace>(out_place), out_data,
-                 boost::get<platform::CPUPlace>(in1_place), in1_data,
-                 in1_value.numel() * sizeof(T));
-
-    auto* in2_data = in2_value.data<T>();
-    memory::Copy(boost::get<platform::CPUPlace>(out_place),
-                 out_data + in1_value.numel(),
-                 boost::get<platform::CPUPlace>(in2_place), in2_data,
-                 in2_value.numel() * sizeof(T));
-  }
-};
-
-template struct SelectedRowsAdd<platform::CPUDeviceContext, float>;
-template struct SelectedRowsAdd<platform::CPUDeviceContext, double>;
-
-template <typename T>
-struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  const framework::Tensor& input2, framework::Tensor* output) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
-    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
-    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
-
-    SetConstant<platform::CPUDeviceContext, T> functor;
-    functor(context, output, 0.0);
-
-    auto* in1_data = in1_value.data<T>();
-    auto* out_data = output->data<T>();
-
-    for (size_t i = 0; i < in1_rows.size(); i++) {
-      for (int64_t j = 0; j < in1_row_numel; j++) {
-        out_data[in1_rows[i] * in1_row_numel + j] +=
-            in1_data[i * in1_row_numel + j];
-      }
-    }
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
-    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
-  }
-};
-
-template struct SelectedRowsAddTensor<platform::CPUDeviceContext, float>;
-template struct SelectedRowsAddTensor<platform::CPUDeviceContext, double>;
-
-template <typename T>
-struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  const int64_t input2_offset,
-                  framework::SelectedRows* input2) {
-    auto in1_height = input1.height();
-    PADDLE_ENFORCE_EQ(in1_height, input2->height());
-
-    auto& in1_rows = input1.rows();
-    auto& in2_rows = *(input2->mutable_rows());
-
-    auto& in1_value = input1.value();
-    auto* in2_value = input2->mutable_value();
-
-    // concat rows
-    in2_rows.Extend(in1_rows.begin(), in1_rows.end());
-
-    auto in1_place = input1.place();
-    PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
-    auto in2_place = input2->place();
-    PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = in2_value->data<T>();
-    memory::Copy(boost::get<platform::CPUPlace>(in2_place),
-                 in2_data + input2_offset,
-                 boost::get<platform::CPUPlace>(in1_place), in1_data,
-                 in1_value.numel() * sizeof(T));
-  }
-};
-
-template struct SelectedRowsAddTo<platform::CPUDeviceContext, float>;
-template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>;
-template struct SelectedRowsAddTo<platform::CPUDeviceContext, int>;
-template struct SelectedRowsAddTo<platform::CPUDeviceContext, int64_t>;
-
-template <typename T>
-struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const std::vector<framework::SelectedRows*>& input1,
-                  const std::vector<int64_t>& input2_offsets,
-                  framework::SelectedRows* input2) {
-    // Ensure all selected rows have the same height
-    size_t size = 0u;
-    for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
-      auto& in_rows = (*iter)->rows();
-      size += in_rows.end() - in_rows.begin();
-      auto in1_height = (*iter)->height();
-      PADDLE_ENFORCE_EQ(in1_height, input2->height());
-    }
-    // concat rows
-    std::vector<int64_t> in2_rows;
-    in2_rows.reserve(in2_rows.size() + size);
-    for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
-      const framework::Vector<int64_t>& in_rows = (*iter)->rows();
-      in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end());
-    }
-    input2->set_rows(in2_rows);
-
-    auto* in2_value = input2->mutable_value();
-    auto* in2_data = in2_value->data<T>();
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    size_t offset = 0u;
-    for (size_t i = 0u; i != input1.size(); ++i) {
-      auto& in_value = input1[i]->value();
-      const auto* in_data = in_value.data<T>();
-      offset += input2_offsets[i];
-      blas.VCOPY(in_value.numel(), in_data, in2_data + offset);
-    }
-  }
-};
-
-template struct SelectedRowsSumTo<platform::CPUDeviceContext, float>;
-template struct SelectedRowsSumTo<platform::CPUDeviceContext, double>;
-
-template <typename T>
-struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    if (UNLIKELY(input1.rows().size() == 0)) {
-      LOG(WARNING) << "input selected rows is empty!";
-      return;
-    }
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
-
-    auto* in1_data = in1_value.data<T>();
-    auto* input2_data = input2->data<T>();
-
-    for (size_t i = 0; i < in1_rows.size(); i++) {
-      for (int64_t j = 0; j < in1_row_numel; j++) {
-        input2_data[in1_rows[i] * in1_row_numel + j] +=
-            in1_data[i * in1_row_numel + j];
-      }
-    }
-  }
-};
-
-template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
-template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
-template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
-template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
-
-// This is a separated namespace for manipulate SelectedRows typed
-// data. Like merge duplicated rows, adding two SelectedRows etc.
-//
-// Another group of functors is called "scatter updates", which means
-// use SelectedRows to update a dense tensor with different Ops, like
-// add or mul.
-namespace scatter {
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
-                   size_t data_len, const T* in, T* out) {
-  blas->AXPY(data_len, 1., in, out);
-}
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    !std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
-                   size_t data_len, const T* in, T* out) {
-  for (size_t i = 0; i < data_len; i++) {
-    out[i] += in[i];
-  }
-}
-
-template <typename T>
-struct MergeAdd<platform::CPUDeviceContext, T> {
-  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
-                                     const framework::SelectedRows& input,
-                                     const bool sorted_result = false) {
-    framework::SelectedRows out;
-    (*this)(context, input, &out, sorted_result);
-    return out;
-  }
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input,
-                  framework::SelectedRows* output,
-                  const bool sorted_result = false) {
-    std::vector<const framework::SelectedRows*> inputs;
-    inputs.push_back(&input);
-    (*this)(context, inputs, output, sorted_result);
-  }
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output,
-                  const bool sorted_result = false) {
-    if (inputs.size() == 0) {
-      VLOG(3) << "no input! return";
-      return;
-    }
-    const framework::SelectedRows* has_value_input = nullptr;
-    for (auto* in : inputs) {
-      if (in->rows().size() > 0) {
-        has_value_input = in;
-        break;
-      }
-    }
-    if (has_value_input == nullptr) {
-      VLOG(3) << "no input has value! just return" << std::endl;
-      return;
-    }
-    auto input_width = has_value_input->value().dims()[1];
-    auto input_height = has_value_input->height();
-    framework::SelectedRows& out = *output;
-    std::set<int64_t> merged_row_set;
-    size_t row_num = 0;
-    for (auto* input : inputs) {
-      if (input->rows().size() == 0) {
-        continue;
-      }
-      PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
-                        "all input should have same "
-                        "dimension except for the first one");
-      PADDLE_ENFORCE_EQ(input_height, input->height(),
-                        "all input should have same height");
-      row_num += input->rows().size();
-      merged_row_set.insert(input->rows().begin(), input->rows().end());
-    }
-
-    out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merged_row_set.size()), input_width}),
-        context.GetPlace());
-    auto* out_data = out.mutable_value()->data<T>();
-
-    if (merged_row_set.size() == row_num && !sorted_result) {
-      // no duplicated ids, just concat the result together
-      std::vector<int64_t> merge_rows;
-      merge_rows.reserve(row_num);
-      // concat rows
-      for (auto* in : inputs) {
-        merge_rows.insert(merge_rows.end(), in->rows().begin(),
-                          in->rows().end());
-      }
-      out.set_rows(merge_rows);
-      auto in_place = inputs[0]->place();
-      auto out_place = out.place();
-      int64_t copied_numel = 0;
-      for (auto* in : inputs) {
-        auto* in_data = in->value().data<T>();
-        auto in_numel = in->value().numel();
-        memory::Copy(boost::get<platform::CPUPlace>(out_place),
-                     out_data + copied_numel,
-                     boost::get<platform::CPUPlace>(in_place), in_data,
-                     in_numel * sizeof(T));
-        copied_numel += in_numel;
-      }
-    } else {
-      std::vector<int64_t> merge_rows(merged_row_set.begin(),
-                                      merged_row_set.end());
-
-      if (sorted_result) {
-        std::sort(merge_rows.begin(), merge_rows.end());
-      }
-
-      out.set_rows(merge_rows);
-
-      math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
-      constant_functor(context, out.mutable_value(), 0.0);
-
-      std::unordered_map<int64_t, size_t> rows_to_id;
-      for (size_t i = 0; i < merge_rows.size(); ++i) {
-        rows_to_id[merge_rows[i]] = i;
-      }
-
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-      for (auto* input : inputs) {
-        if (input->rows().size() == 0) {
-          continue;
-        }
-        auto* input_data = input->value().data<T>();
-        auto& input_rows = input->rows();
-
-        for (size_t i = 0; i < input_rows.size(); i++) {
-          size_t out_i = rows_to_id[input_rows[i]];
-          elementwise_add_to<platform::CPUDeviceContext, T>(
-              context, &blas, static_cast<size_t>(input_width),
-              &input_data[i * input_width], &out_data[out_i * input_width]);
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-struct MergeAverage<platform::CPUDeviceContext, T> {
-  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
-                                     const framework::SelectedRows& input) {
-    framework::SelectedRows out;
-    (*this)(context, input, &out);
-    return out;
-  }
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input,
-                  framework::SelectedRows* output) {
-    std::vector<const framework::SelectedRows*> inputs;
-    inputs.push_back(&input);
-    (*this)(context, inputs, output);
-  }
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output) {
-    if (inputs.size() == 0) {
-      VLOG(3) << "no input! return";
-      return;
-    }
-    const framework::SelectedRows* has_value_input = nullptr;
-    for (auto* in : inputs) {
-      if (in->rows().size() > 0) {
-        has_value_input = in;
-        break;
-      }
-    }
-    if (has_value_input == nullptr) {
-      VLOG(3) << "no input has value! just return" << std::endl;
-      return;
-    }
-    auto input_width = has_value_input->value().dims()[1];
-    auto input_height = has_value_input->height();
-    framework::SelectedRows& out = *output;
-    std::set<int64_t> merged_row_set;
-    size_t row_num = 0;
-    for (auto* input : inputs) {
-      if (input->rows().size() == 0) {
-        continue;
-      }
-      PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
-                        "all input should have same "
-                        "dimension except for the first one");
-      PADDLE_ENFORCE_EQ(input_height, input->height(),
-                        "all input should have same height");
-      row_num += input->rows().size();
-      merged_row_set.insert(input->rows().begin(), input->rows().end());
-    }
-
-    out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merged_row_set.size()), input_width}),
-        context.GetPlace());
-    auto* out_data = out.mutable_value()->data<T>();
-
-    std::vector<int64_t> merge_rows(merged_row_set.begin(),
-                                    merged_row_set.end());
-    std::sort(merge_rows.begin(), merge_rows.end());
-
-    out.set_rows(merge_rows);
-
-    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
-    constant_functor(context, out.mutable_value(), 0.0);
-
-    std::unordered_map<int64_t, size_t> rows_to_id;
-    for (size_t i = 0; i < merge_rows.size(); ++i) {
-      rows_to_id[merge_rows[i]] = i;
-    }
-
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    for (auto* input : inputs) {
-      if (input->rows().size() == 0) {
-        continue;
-      }
-      auto* input_data = input->value().data<T>();
-      auto& input_rows = input->rows();
-
-      for (size_t i = 0; i < input_rows.size(); i++) {
-        size_t out_i = rows_to_id[input_rows[i]];
-        elementwise_add_to<platform::CPUDeviceContext, T>(
-            context, &blas, static_cast<size_t>(input_width),
-            &input_data[i * input_width], &out_data[out_i * input_width]);
-      }
-    }
-    size_t input_width_cast = static_cast<size_t>(input_width);
-    T count = static_cast<T>(inputs.size());
-    for (size_t i = 0; i < merge_rows.size(); i++) {
-      for (size_t j = 0; j < input_width_cast; j++) {
-        out_data[i * input_width + j] = out_data[i * input_width + j] / count;
-      }
-    }
-  }
-};
-
-template struct MergeAdd<platform::CPUDeviceContext, int>;
-template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
-template struct MergeAdd<platform::CPUDeviceContext, float>;
-template struct MergeAdd<platform::CPUDeviceContext, double>;
-
-template struct MergeAverage<platform::CPUDeviceContext, int>;
-template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
-template struct MergeAverage<platform::CPUDeviceContext, float>;
-template struct MergeAverage<platform::CPUDeviceContext, double>;
-
-template <typename T>
-struct UpdateToTensor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const ScatterOps& op, const framework::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
-
-    auto* in1_data = in1_value.data<T>();
-    auto* input2_data = input2->data<T>();
-
-    // FIXME(typhoonzero): use macro fix the below messy code.
-    switch (op) {
-      case ScatterOps::ASSIGN:
-        INLINE_FOR2(in1_rows.size(), in1_row_numel)
-        input2_data[in1_rows[i] * in1_row_numel + j] =
-            in1_data[i * in1_row_numel + j];
-        break;
-      case ScatterOps::ADD:
-        INLINE_FOR2(in1_rows.size(), in1_row_numel)
-        input2_data[in1_rows[i] * in1_row_numel + j] +=
-            in1_data[i * in1_row_numel + j];
-        break;
-      case ScatterOps::SUB:
-        INLINE_FOR2(in1_rows.size(), in1_row_numel)
-        input2_data[in1_rows[i] * in1_row_numel + j] -=
-            in1_data[i * in1_row_numel + j];
-        break;
-      case ScatterOps::SUBBY:
-        INLINE_FOR2(in1_rows.size(), in1_row_numel)
-        input2_data[in1_rows[i] * in1_row_numel + j] =
-            in1_data[i * in1_row_numel + j] -
-            input2_data[in1_rows[i] * in1_row_numel + j];
-        break;
-      case ScatterOps::MUL:
-        INLINE_FOR2(in1_rows.size(), in1_row_numel)
-        input2_data[in1_rows[i] * in1_row_numel + j] *=
-            in1_data[i * in1_row_numel + j];
-        break;
-      case ScatterOps::DIV:
-        INLINE_FOR2(in1_rows.size(), in1_row_numel)
-        input2_data[in1_rows[i] * in1_row_numel + j] /=
-            in1_data[i * in1_row_numel + j];
-        break;
-      case ScatterOps::DIVBY:
-        INLINE_FOR2(in1_rows.size(), in1_row_numel)
-        input2_data[in1_rows[i] * in1_row_numel + j] =
-            in1_data[i * in1_row_numel + j] /
-            input2_data[in1_rows[i] * in1_row_numel + j];
-        break;
-    }
-  }
-};
-
-}  // namespace scatter
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
deleted file mode 100644
index b3e2c8a6674d65f0c3ac3aa2109bcc9908aa2e8e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ /dev/null
@@ -1,472 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <set>
-#include <vector>
-
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-template <typename T>
-struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  const framework::SelectedRows& input2,
-                  framework::SelectedRows* output) {
-    auto in1_height = input1.height();
-    PADDLE_ENFORCE_EQ(in1_height, input2.height());
-    output->set_height(in1_height);
-
-    framework::Vector<int64_t> in1_rows(input1.rows());
-    auto& in2_rows = input2.rows();
-    std::vector<int64_t> out_rows;
-    out_rows.reserve(in1_rows.size() + in2_rows.size());
-
-    // concat rows
-    out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end());
-    out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end());
-    output->set_rows(out_rows);
-
-    auto* out_value = output->mutable_value();
-    auto& in1_value = input1.value();
-    auto& in2_value = input2.value();
-
-    auto in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
-    PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
-
-    auto* out_data = out_value->data<T>();
-    auto* in1_data = in1_value.data<T>();
-
-    auto in1_place = input1.place();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
-    auto in2_place = input2.place();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
-    auto out_place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true);
-
-    memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
-                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
-                 in1_value.numel() * sizeof(T), context.stream());
-
-    auto* in2_data = in2_value.data<T>();
-    memory::Copy(boost::get<platform::CUDAPlace>(out_place),
-                 out_data + in1_value.numel(),
-                 boost::get<platform::CUDAPlace>(in2_place), in2_data,
-                 in2_value.numel() * sizeof(T), context.stream());
-  }
-};
-
-template struct SelectedRowsAdd<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext, double>;
-
-namespace {
-template <typename T, int block_size>
-__global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
-                                            const int64_t* rows, T* tensor_out,
-                                            int64_t row_numel) {
-  const int ty = blockIdx.x;
-  int tid = threadIdx.x;
-
-  selected_rows += ty * row_numel;
-  tensor_out += rows[ty] * row_numel;
-
-  for (int index = tid; index < row_numel; index += block_size) {
-    // Since index in rows of SelectedRows can be duplicate, we can not use
-    // tensor_out[index] += selected_rows[index]; Instead, we have to use
-    // AtomicAdd to avoid concurrent write error.
-    paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
-  }
-}
-}  // namespace
-
-template <typename T>
-struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  const framework::Tensor& input2, framework::Tensor* output) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
-    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
-    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2.data<T>();
-    auto* out_data = output->data<T>();
-
-    SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, static_cast<T>(0));
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    SelectedRowsAddTensorKernel<
-        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.CUDAData(context.GetPlace()), out_data,
-        in1_row_numel);
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
-    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
-  }
-};
-
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
-                                      platform::float16>;
-
-template <typename T>
-struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  const int64_t input2_offset,
-                  framework::SelectedRows* input2) {
-    auto in1_height = input1.height();
-    PADDLE_ENFORCE_EQ(in1_height, input2->height());
-
-    auto& in1_rows = input1.rows();
-    auto& in2_rows = *(input2->mutable_rows());
-
-    auto& in1_value = input1.value();
-    auto* in2_value = input2->mutable_value();
-
-    // concat rows
-    if (in1_rows.size()) {
-      in2_rows.Extend(in1_rows.begin(), in1_rows.end());
-    }
-
-    auto in1_place = input1.place();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
-    auto in2_place = input2->place();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = in2_value->data<T>();
-    memory::Copy(boost::get<platform::CUDAPlace>(in2_place),
-                 in2_data + input2_offset,
-                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
-                 in1_value.numel() * sizeof(T), context.stream());
-  }
-};
-
-template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
-template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddTo<platform::CUDADeviceContext,
-                                  platform::float16>;
-
-namespace {
-template <typename T, int block_size>
-__global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
-                                              const int64_t* rows,
-                                              T* tensor_out,
-                                              int64_t row_numel) {
-  const int ty = blockIdx.x;
-  int tid = threadIdx.x;
-
-  selected_rows += ty * row_numel;
-  tensor_out += rows[ty] * row_numel;
-
-  for (int index = tid; index < row_numel; index += block_size) {
-    // Since index in rows of SelectedRows can be duplicate, we have to use
-    // Atomic Operation to avoid concurrent write error.
-    paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
-  }
-}
-}  // namespace
-
-template <typename T>
-struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2->data<T>();
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    SelectedRowsAddToTensorKernel<
-        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data,
-        in1_row_numel);
-  }
-};
-
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
-                                        platform::float16>;
-
-namespace scatter {
-
-template <typename T, int block_size>
-__global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
-                               T* out, const int64_t* out_rows,
-                               size_t out_rows_size, int64_t row_numel) {
-  const int ty = blockIdx.x;
-  int tid = threadIdx.x;
-  __shared__ size_t out_idx;
-
-  if (tid == 0) {
-    for (size_t i = 0; i < out_rows_size; i++) {
-      if (input_rows[ty] == out_rows[i]) {
-        out_idx = i;
-      }
-    }
-  }
-
-  __syncthreads();
-
-  input += ty * row_numel;
-  out += out_idx * row_numel;
-  for (int index = tid; index < row_numel; index += block_size) {
-    paddle::platform::CudaAtomicAdd(out + index, input[index]);
-  }
-}
-
-template <typename T>
-struct MergeAdd<platform::CUDADeviceContext, T> {
-  framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
-                                     const framework::SelectedRows& input,
-                                     const bool sorted_result = false) {
-    framework::SelectedRows out;
-    (*this)(context, input, &out);
-    return out;
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::SelectedRows& input,
-                  framework::SelectedRows* output,
-                  const bool sorted_result = false) {
-    framework::Vector<int64_t> input_rows(input.rows());
-    if (input_rows.size() == 0) {
-      return;
-    }
-
-    framework::SelectedRows& out = *output;
-    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
-    std::vector<int64_t> merge_rows_cpu(row_set.begin(), row_set.end());
-    framework::Vector<int64_t> merge_rows(merge_rows_cpu);
-
-    auto input_width = input.value().dims()[1];
-
-    out.set_rows(merge_rows);
-    out.set_height(input.height());
-    out.mutable_value()->mutable_data<T>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
-
-    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
-    constant_functor(context, out.mutable_value(), static_cast<T>(0));
-
-    auto* out_data = out.mutable_value()->data<T>();
-    auto* input_data = input.value().data<T>();
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid1(input_rows.size(), 1);
-
-    MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
-        input_data, input_rows.CUDAData(context.GetPlace()), out_data,
-        out.mutable_rows()->CUDAMutableData(context.GetPlace()),
-        out.rows().size(), input_width);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output,
-                  const bool sorted_result = false) {
-    if (inputs.size() == 0) {
-      VLOG(3) << "no input! return";
-      return;
-    }
-    const framework::SelectedRows* has_value_input = nullptr;
-    for (auto* in : inputs) {
-      if (in->rows().size() > 0) {
-        has_value_input = in;
-        break;
-      }
-    }
-    if (has_value_input == nullptr) {
-      VLOG(3) << "no input has value! just return" << std::endl;
-      return;
-    }
-    auto input_width = has_value_input->value().dims()[1];
-    auto input_height = has_value_input->height();
-    framework::SelectedRows& out = *output;
-    std::set<int64_t> merged_row_set;
-    for (auto* input : inputs) {
-      if (input->rows().size() == 0) {
-        continue;
-      }
-      PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
-                        "all input should have same "
-                        "dimension except for the first one");
-      PADDLE_ENFORCE_EQ(input_height, input->height(),
-                        "all input should have same height");
-      merged_row_set.insert(input->rows().begin(), input->rows().end());
-    }
-    std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
-                                        merged_row_set.end());
-    framework::Vector<int64_t> merge_rows(merge_rows_cpu);
-
-    out.set_rows(merge_rows);
-    out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
-
-    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
-    constant_functor(context, out.mutable_value(), static_cast<T>(0));
-
-    auto* out_data = out.mutable_value()->data<T>();
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-
-    for (auto* input : inputs) {
-      if (input->rows().size() == 0) {
-        continue;
-      }
-      auto* input_data = input->value().data<T>();
-      auto& input_rows = input->rows();
-      dim3 grid1(input_rows.size(), 1);
-
-      MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
-          input_data, input_rows.CUDAData(context.GetPlace()), out_data,
-          out.mutable_rows()->CUDAMutableData(context.GetPlace()),
-          out.rows().size(), input_width);
-    }
-  }
-};
-
-template struct MergeAdd<platform::CUDADeviceContext, float>;
-template struct MergeAdd<platform::CUDADeviceContext, double>;
-template struct MergeAdd<platform::CUDADeviceContext, int>;
-template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
-template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
-
-template <typename T, int block_size>
-__global__ void UpdateToTensorKernel(const T* selected_rows,
-                                     const int64_t* rows, const ScatterOps& op,
-                                     T* tensor_out, int64_t row_numel) {
-  const int ty = blockIdx.x;
-  int tid = threadIdx.x;
-
-  selected_rows += ty * row_numel;
-  tensor_out += rows[ty] * row_numel;
-  // FIXME(typhoonzero): use macro fix the below messy code.
-  switch (op) {
-    case ScatterOps::ASSIGN:
-      for (int index = tid; index < row_numel; index += block_size) {
-        tensor_out[index] = selected_rows[index];
-      }
-      break;
-    case ScatterOps::ADD:
-      for (int index = tid; index < row_numel; index += block_size) {
-        tensor_out[index] += selected_rows[index];
-      }
-      break;
-    case ScatterOps::SUB:
-      for (int index = tid; index < row_numel; index += block_size) {
-        tensor_out[index] -= selected_rows[index];
-      }
-      break;
-    case ScatterOps::SUBBY:
-      for (int index = tid; index < row_numel; index += block_size) {
-        tensor_out[index] = selected_rows[index] - tensor_out[index];
-      }
-      break;
-    case ScatterOps::MUL:
-      for (int index = tid; index < row_numel; index += block_size) {
-        tensor_out[index] *= selected_rows[index];
-      }
-      break;
-    case ScatterOps::DIV:
-      for (int index = tid; index < row_numel; index += block_size) {
-        tensor_out[index] /= selected_rows[index];
-      }
-      break;
-    case ScatterOps::DIVBY:
-      for (int index = tid; index < row_numel; index += block_size) {
-        tensor_out[index] = selected_rows[index] / tensor_out[index];
-      }
-      break;
-  }
-}
-
-template <typename T>
-struct UpdateToTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const ScatterOps& op, const framework::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    // NOTE: Use SelectedRowsAddToTensor for better performance
-    //       no additional MergeAdd called.
-    MergeAdd<platform::CUDADeviceContext, T> merge_func;
-    auto merged_in1 = merge_func(context, input1);
-
-    auto in1_height = merged_in1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
-
-    auto& in1_value = merged_in1.value();
-    auto& in1_rows = merged_in1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
-
-    auto* in1_data = in1_value.template data<T>();
-    auto* in2_data = input2->data<T>();
-
-    dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
-    dim3 grid(in1_rows.size(), 1);
-    UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
-        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
-                                              op, in2_data, in1_row_numel);
-  }
-};
-}  // namespace scatter
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
deleted file mode 100644
index a1eb69db7cfce0ec11aa09180fbc73c4bd0a23f6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <map>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
-
-#define INLINE_FOR2(sizei, sizej)     \
-  for (int64_t i = 0; i < sizei; i++) \
-    for (int64_t j = 0; j < sizej; j++)
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-// SelectedRows + SelectedRows will simplely concat value and rows.
-// The real computation happens in dealing with LoDTensor.
-template <typename DeviceContext, typename T>
-struct SelectedRowsAdd {
-  void operator()(const DeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  const framework::SelectedRows& input2,
-                  framework::SelectedRows* output);
-};
-
-template <typename DeviceContext, typename T>
-struct SelectedRowsAddTensor {
-  void operator()(const DeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  const framework::Tensor& input2, framework::Tensor* output);
-};
-
-// input2 = input1 + input2
-template <typename DeviceContext, typename T>
-struct SelectedRowsAddTo {
-  void operator()(const DeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  const int64_t input2_offset, framework::SelectedRows* input2);
-};
-
-// input2 = [all input in input1] + input2
-template <typename DeviceContext, typename T>
-struct SelectedRowsSumTo {
-  void operator()(const DeviceContext& context,
-                  const std::vector<framework::SelectedRows*>& input1,
-                  const std::vector<int64_t>& input2_offsets,
-                  framework::SelectedRows* input2);
-};
-
-// FIXME: The result of SelectedRowsAddToTensor maybe non deterministic,
-// because it uses CudaAtomicAdd.
-// input2 = input1 + input2
-template <typename DeviceContext, typename T>
-struct SelectedRowsAddToTensor {
-  void operator()(const DeviceContext& context,
-                  const framework::SelectedRows& input1,
-                  framework::Tensor* input2);
-};
-
-namespace scatter {
-// functors for manuplating SelectedRows data
-template <typename DeviceContext, typename T>
-struct MergeAdd {
-  // unary functor, merge by adding duplicated rows in
-  // the input SelectedRows object.
-  framework::SelectedRows operator()(const DeviceContext& context,
-                                     const framework::SelectedRows& input,
-                                     const bool sorted_result = false);
-  void operator()(const DeviceContext& context,
-                  const framework::SelectedRows& input,
-                  framework::SelectedRows* output,
-                  const bool sorted_result = false);
-  void operator()(const DeviceContext& context,
-                  const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output,
-                  const bool sorted_result = false);
-};
-
-template <typename DeviceContext, typename T>
-struct MergeAverage {
-  framework::SelectedRows operator()(const DeviceContext& context,
-                                     const framework::SelectedRows& input);
-  void operator()(const DeviceContext& context,
-                  const framework::SelectedRows& input,
-                  framework::SelectedRows* output);
-  void operator()(const DeviceContext& context,
-                  const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output);
-};
-
-enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
-
-// out = selected_rows_in / tensor
-template <typename DeviceContext, typename T>
-struct UpdateToTensor {
-  void operator()(const DeviceContext& context, const ScatterOps& op,
-                  const framework::SelectedRows& input1,
-                  framework::Tensor* input2);
-};
-
-}  // namespace scatter
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc
deleted file mode 100644
index b7a499aa968035a46c4632d7a575594e1cb4ebcd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ /dev/null
@@ -1,558 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-
-#include <memory>
-#include <vector>
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/operators/math/math_function.h"
-
-TEST(selected_rows_functor, cpu_add) {
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CPUDeviceContext ctx(cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      functor;
-  int64_t height = 10;
-  int64_t row_numel = 10;
-
-  std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
-      new paddle::framework::SelectedRows(rows1, height)};
-  auto* in1_value = selected_rows1->mutable_value();
-  in1_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows1.size()), row_numel}),
-      cpu_place);
-  functor(ctx, in1_value, 1.0);
-
-  std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
-      new paddle::framework::SelectedRows(rows2, height)};
-  auto* in2_value = selected_rows2->mutable_value();
-  in2_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows2.size()), row_numel}),
-      cpu_place);
-  functor(ctx, in2_value, 2.0);
-
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-  auto* out_value = output->mutable_value();
-
-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
-                                 cpu_place);
-
-  paddle::operators::math::SelectedRowsAdd<paddle::platform::CPUDeviceContext,
-                                           float>
-      add_functor;
-  add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
-
-  auto out_height = output->height();
-  EXPECT_EQ(out_height, height);
-
-  auto& out_rows = output->rows();
-
-  // input1 rows
-  EXPECT_EQ(out_rows[0], 0);
-  EXPECT_EQ(out_rows[1], 4);
-  EXPECT_EQ(out_rows[2], 7);
-  // input2 rows
-  EXPECT_EQ(out_rows[3], 0);
-  EXPECT_EQ(out_rows[4], 5);
-  EXPECT_EQ(out_rows[5], 7);
-  EXPECT_EQ(out_rows[6], 9);
-
-  auto* out_data = output->value().data<float>();
-  // input1 value
-  EXPECT_EQ(out_data[0 * row_numel + 0], 1.0);
-  EXPECT_EQ(out_data[0 * row_numel + 8], 1.0);
-  EXPECT_EQ(out_data[1 * row_numel + 1], 1.0);
-  EXPECT_EQ(out_data[2 * row_numel + 6], 1.0);
-  // input2 value
-  EXPECT_EQ(out_data[3 * row_numel + 3], 2.0);
-  EXPECT_EQ(out_data[3 * row_numel + 8], 2.0);
-  EXPECT_EQ(out_data[4 * row_numel + 4], 2.0);
-  EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
-  EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);
-
-  std::unique_ptr<paddle::framework::Tensor> tensor1{
-      new paddle::framework::Tensor()};
-  tensor1->mutable_data<float>(
-      paddle::framework::make_ddim({height, row_numel}), cpu_place);
-  functor(ctx, tensor1.get(), 3.0);
-
-  std::unique_ptr<paddle::framework::Tensor> tensor2{
-      new paddle::framework::Tensor()};
-  tensor2->mutable_data<float>(
-      paddle::framework::make_ddim({height, row_numel}), cpu_place);
-
-  paddle::operators::math::SelectedRowsAddTensor<
-      paddle::platform::CPUDeviceContext, float>
-      add_tensor_functor;
-  add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
-
-  auto* tensor2_data = tensor2->data<float>();
-  // row0: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor2_data[0 * row_numel + 0], 6.0);
-  // row1: 3.0
-  EXPECT_EQ(tensor2_data[1 * row_numel + 1], 3.0);
-  // row4 : 1.0 + 3.0
-  EXPECT_EQ(tensor2_data[4 * row_numel + 6], 4.0);
-  // row5: 2.0 + 3.0
-  EXPECT_EQ(tensor2_data[5 * row_numel + 7], 5.0);
-  // row6: 3.0
-  EXPECT_EQ(tensor2_data[6 * row_numel + 1], 3.0);
-  // row7: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor2_data[7 * row_numel + 3], 6.0);
-  // row9: 2.0 + 3.0
-  EXPECT_EQ(tensor2_data[9 * row_numel + 6], 5.0);
-}
-
-TEST(selected_rows_functor, cpu_add_to) {
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CPUDeviceContext ctx(cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      functor;
-  int64_t height = 10;
-  int64_t row_numel = 10;
-
-  std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
-      new paddle::framework::SelectedRows(rows1, height)};
-  auto* in1_value = selected_rows1->mutable_value();
-  in1_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows1.size()), row_numel}),
-      cpu_place);
-  functor(ctx, in1_value, 1.0);
-
-  std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
-      new paddle::framework::SelectedRows(rows2, height)};
-  auto* in2_value = selected_rows2->mutable_value();
-  in2_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows2.size()), row_numel}),
-      cpu_place);
-  functor(ctx, in2_value, 2.0);
-
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-  output->set_height(height);
-  auto* out_value = output->mutable_value();
-
-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
-                                 cpu_place);
-
-  paddle::operators::math::SelectedRowsAddTo<paddle::platform::CPUDeviceContext,
-                                             float>
-      add_to_functor;
-  add_to_functor(ctx, *selected_rows1, 0, output.get());
-  add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
-
-  auto out_height = output->height();
-  EXPECT_EQ(out_height, height);
-
-  auto& out_rows = output->rows();
-
-  // input1 rows
-  EXPECT_EQ(out_rows[0], 0);
-  EXPECT_EQ(out_rows[1], 4);
-  EXPECT_EQ(out_rows[2], 7);
-  // input2 rows
-  EXPECT_EQ(out_rows[3], 0);
-  EXPECT_EQ(out_rows[4], 5);
-  EXPECT_EQ(out_rows[5], 7);
-  EXPECT_EQ(out_rows[6], 9);
-
-  auto* out_data = output->value().data<float>();
-  // input1 value
-  EXPECT_EQ(out_data[0 * row_numel + 0], 1.0);
-  EXPECT_EQ(out_data[0 * row_numel + 8], 1.0);
-  EXPECT_EQ(out_data[1 * row_numel + 1], 1.0);
-  EXPECT_EQ(out_data[2 * row_numel + 6], 1.0);
-  // input2 value
-  EXPECT_EQ(out_data[3 * row_numel + 3], 2.0);
-  EXPECT_EQ(out_data[3 * row_numel + 8], 2.0);
-  EXPECT_EQ(out_data[4 * row_numel + 4], 2.0);
-  EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
-  EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);
-
-  std::unique_ptr<paddle::framework::Tensor> tensor1{
-      new paddle::framework::Tensor()};
-  tensor1->mutable_data<float>(
-      paddle::framework::make_ddim({height, row_numel}), cpu_place);
-  functor(ctx, tensor1.get(), 3.0);
-
-  paddle::operators::math::SelectedRowsAddToTensor<
-      paddle::platform::CPUDeviceContext, float>
-      add_to_tensor_functor;
-  add_to_tensor_functor(ctx, *output, tensor1.get());
-
-  auto* tensor1_data = tensor1->data<float>();
-  // row0: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0);
-  // row1: 3.0
-  EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0);
-  // row4 : 1.0 + 3.0
-  EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0);
-  // row5: 2.0 + 3.0
-  EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0);
-  // row6: 3.0
-  EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0);
-  // row7: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0);
-  // row9: 2.0 + 3.0
-  EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0);
-}
-
-TEST(selected_rows_functor, cpu_merge_average_float) {
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CPUDeviceContext ctx(cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      functor;
-  int64_t height = 10;
-  int64_t row_numel = 10;
-
-  std::vector<int64_t> rows{0, 4, 4, 7};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows{
-      new paddle::framework::SelectedRows(rows, height)};
-  auto* in_value = selected_rows->mutable_value();
-  in_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows.size()), row_numel}),
-      cpu_place);
-  functor(ctx, in_value, 1.0);
-
-  paddle::operators::math::scatter::MergeAverage<
-      paddle::platform::CPUDeviceContext, float>
-      merge_average_functor;
-  paddle::framework::SelectedRows output =
-      merge_average_functor(ctx, *selected_rows);
-
-  auto out_height = output.height();
-  EXPECT_EQ(out_height, height);
-
-  auto& out_rows = output.rows();
-  EXPECT_EQ(out_rows[0], 0);
-  EXPECT_EQ(out_rows[1], 4);
-  EXPECT_EQ(out_rows[2], 7);
-
-  auto* out_data = output.value().data<float>();
-
-  EXPECT_EQ(out_data[0 * row_numel], 1.0);
-  EXPECT_EQ(out_data[1 * row_numel], 2.0);
-  EXPECT_EQ(out_data[2 * row_numel], 1.0);
-}
-
-TEST(selected_rows_functor, cpu_merge_add_float) {
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CPUDeviceContext ctx(cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      functor;
-  int64_t height = 10;
-  int64_t row_numel = 10;
-
-  std::vector<int64_t> rows{0, 4, 4, 7};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows{
-      new paddle::framework::SelectedRows(rows, height)};
-  auto* in_value = selected_rows->mutable_value();
-  in_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows.size()), row_numel}),
-      cpu_place);
-  functor(ctx, in_value, 1.0);
-
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-
-  paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
-                                             float>
-      merge_add_functor;
-  merge_add_functor(ctx, *selected_rows, output.get());
-
-  auto out_height = output->height();
-  EXPECT_EQ(out_height, height);
-
-  auto& out_rows = output->rows();
-  EXPECT_EQ(out_rows[0], 0);
-  EXPECT_EQ(out_rows[1], 4);
-  EXPECT_EQ(out_rows[2], 7);
-
-  auto* out_data = output->value().data<float>();
-
-  EXPECT_EQ(out_data[0 * row_numel], 1.0);
-  EXPECT_EQ(out_data[1 * row_numel], 2.0);
-  EXPECT_EQ(out_data[2 * row_numel], 1.0);
-}
-
-TEST(selected_rows_functor, cpu_merge_add_int) {
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CPUDeviceContext ctx(cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, int>
-      functor;
-  int64_t height = 10;
-  int64_t row_numel = 10;
-
-  std::vector<int64_t> rows{0, 4, 4, 7};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows{
-      new paddle::framework::SelectedRows(rows, height)};
-  auto* in_value = selected_rows->mutable_value();
-  in_value->mutable_data<int>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows.size()), row_numel}),
-      cpu_place);
-  functor(ctx, in_value, 1);
-
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-
-  paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
-                                             int>
-      merge_add_functor;
-  merge_add_functor(ctx, *selected_rows, output.get());
-
-  auto out_height = output->height();
-  EXPECT_EQ(out_height, height);
-
-  auto& out_rows = output->rows();
-  EXPECT_EQ(out_rows[0], 0);
-  EXPECT_EQ(out_rows[1], 4);
-  EXPECT_EQ(out_rows[2], 7);
-
-  auto* out_data = output->value().data<int>();
-
-  EXPECT_EQ(out_data[0 * row_numel], 1);
-  EXPECT_EQ(out_data[1 * row_numel], 2);
-  EXPECT_EQ(out_data[2 * row_numel], 1);
-}
-
-TEST(selected_rows_functor, cpu_merge_add_multi) {
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CPUDeviceContext ctx(cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      set_const;
-
-  int64_t height = 10;
-  int64_t row_numel = 8;
-
-  std::vector<int64_t> rows1{5, 2, 5, 3, 5};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
-      new paddle::framework::SelectedRows(rows1, height)};
-  auto* in1_value = selected_rows1->mutable_value();
-  in1_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows1.size()), row_numel}),
-      cpu_place);
-  set_const(ctx, in1_value, 1.0);
-
-  std::vector<int64_t> rows2{2, 5, 3, 5, 3};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
-      new paddle::framework::SelectedRows(rows2, height)};
-  auto* in2_value = selected_rows2->mutable_value();
-  in2_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows2.size()), row_numel}),
-      cpu_place);
-  set_const(ctx, in2_value, 1.0);
-
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-  output->set_height(height);
-  paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
-                                             float>
-      merge_add_functor;
-
-  std::vector<const paddle::framework::SelectedRows*> inputs;
-  inputs.push_back(selected_rows1.get());
-  inputs.push_back(selected_rows2.get());
-  merge_add_functor(ctx, inputs, output.get());
-
-  EXPECT_EQ(output->height(), height);
-  EXPECT_EQ(output->value().dims(),
-            paddle::framework::make_ddim({3, row_numel}));
-
-  std::vector<int64_t> ret_rows{2, 3, 5};
-  EXPECT_EQ(output->rows(), ret_rows);
-
-  auto* out_data = output->value().data<float>();
-  for (size_t i = 0; i < ret_rows.size(); ++i) {
-    for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
-      EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
-    }
-  }
-}
-
-TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) {
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CPUDeviceContext ctx(cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      set_const;
-
-  int64_t height = 10;
-  int64_t row_numel = 8;
-
-  std::vector<int64_t> rows1{1, 3, 5, 7, 9};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
-      new paddle::framework::SelectedRows(rows1, height)};
-  auto* in1_value = selected_rows1->mutable_value();
-  in1_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows1.size()), row_numel}),
-      cpu_place);
-  set_const(ctx, in1_value, 1.0);
-
-  std::vector<int64_t> rows2{0, 2, 4, 6, 8};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
-      new paddle::framework::SelectedRows(rows2, height)};
-  auto* in2_value = selected_rows2->mutable_value();
-  in2_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows2.size()), row_numel}),
-      cpu_place);
-  set_const(ctx, in2_value, 2.0);
-
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-  output->set_height(height);
-  paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
-                                             float>
-      merge_add_functor;
-
-  std::vector<const paddle::framework::SelectedRows*> inputs;
-  inputs.push_back(selected_rows1.get());
-  inputs.push_back(selected_rows2.get());
-  merge_add_functor(ctx, inputs, output.get());
-
-  EXPECT_EQ(output->height(), height);
-  EXPECT_EQ(output->value().dims(),
-            paddle::framework::make_ddim({10, row_numel}));
-
-  std::vector<int64_t> ret_rows{1, 3, 5, 7, 9, 0, 2, 4, 6, 8};
-  EXPECT_EQ(output->rows(), ret_rows);
-
-  auto* out_data = output->value().data<float>();
-  for (size_t i = 0; i < ret_rows.size(); ++i) {
-    float data_value = 0;
-    if (i < 5) {
-      data_value = 1.0;
-    } else {
-      data_value = 2.0;
-    }
-    for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
-      EXPECT_EQ(out_data[i * row_numel + j], data_value);
-    }
-  }
-}
-
-TEST(selected_rows_functor, cpu_sum_to) {
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CPUDeviceContext ctx(cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      functor;
-  int64_t height = 10;
-  int64_t row_numel = 10;
-  std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
-      new paddle::framework::SelectedRows(rows1, height)};
-  auto* in1_value = selected_rows1->mutable_value();
-  in1_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows1.size()), row_numel}),
-      cpu_place);
-
-  functor(ctx, in1_value, 1.0);
-  std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
-      new paddle::framework::SelectedRows(rows2, height)};
-  auto* in2_value = selected_rows2->mutable_value();
-  in2_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows2.size()), row_numel}),
-      cpu_place);
-
-  functor(ctx, in2_value, 2.0);
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-  output->set_height(height);
-  auto* out_value = output->mutable_value();
-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
-                                 cpu_place);
-  paddle::operators::math::SelectedRowsSumTo<paddle::platform::CPUDeviceContext,
-                                             float>
-      sum_to_functor;
-  sum_to_functor(ctx, std::vector<paddle::framework::SelectedRows*>(
-                          {selected_rows1.get(), selected_rows2.get()}),
-                 std::vector<int64_t>({0, in1_value->numel()}), output.get());
-  auto out_height = output->height();
-  EXPECT_EQ(out_height, height);
-  auto& out_rows = output->rows();
-  // input1 rows
-  EXPECT_EQ(out_rows[0], 0);
-  EXPECT_EQ(out_rows[1], 4);
-  EXPECT_EQ(out_rows[2], 7);
-  // input2 rows
-  EXPECT_EQ(out_rows[3], 0);
-  EXPECT_EQ(out_rows[4], 5);
-  EXPECT_EQ(out_rows[5], 7);
-  EXPECT_EQ(out_rows[6], 9);
-  auto* out_data = output->value().data<float>();
-  // input1 value
-  EXPECT_EQ(out_data[0 * row_numel + 0], 1.0);
-  EXPECT_EQ(out_data[0 * row_numel + 8], 1.0);
-  EXPECT_EQ(out_data[1 * row_numel + 1], 1.0);
-  EXPECT_EQ(out_data[2 * row_numel + 6], 1.0);
-  // input2 value
-  EXPECT_EQ(out_data[3 * row_numel + 3], 2.0);
-  EXPECT_EQ(out_data[3 * row_numel + 8], 2.0);
-  EXPECT_EQ(out_data[4 * row_numel + 4], 2.0);
-  EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
-  EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);
-  std::unique_ptr<paddle::framework::Tensor> tensor1{
-      new paddle::framework::Tensor()};
-  tensor1->mutable_data<float>(
-      paddle::framework::make_ddim({height, row_numel}), cpu_place);
-  functor(ctx, tensor1.get(), 3.0);
-  paddle::operators::math::SelectedRowsAddToTensor<
-      paddle::platform::CPUDeviceContext, float>
-      add_to_tensor_functor;
-  add_to_tensor_functor(ctx, *output, tensor1.get());
-  auto* tensor1_data = tensor1->data<float>();
-  // row0: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0);
-  // row1: 3.0
-  EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0);
-  // row4 : 1.0 + 3.0
-  EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0);
-  // row5: 2.0 + 3.0
-  EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0);
-  // row6: 3.0
-  EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0);
-  // row7: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0);
-  // row9: 2.0 + 3.0
-  EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0);
-}
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
deleted file mode 100644
index 74892316e6decdeab3a08396fa2f4bdeb8eb7b73..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
+++ /dev/null
@@ -1,308 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-TEST(selected_rows_functor, gpu_add) {
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDADeviceContext& ctx =
-      *reinterpret_cast<paddle::platform::CUDADeviceContext*>(
-          paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
-  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
-                                       float>
-      functor;
-  int64_t height = 10;
-  int64_t row_numel = 10;
-
-  std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
-      new paddle::framework::SelectedRows(rows1, height)};
-  auto* in1_value = selected_rows1->mutable_value();
-  in1_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows1.size()), row_numel}),
-      gpu_place);
-  functor(ctx, in1_value, 1.0);
-  PADDLE_ENFORCE(cudaDeviceSynchronize());
-
-  std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
-      new paddle::framework::SelectedRows(rows2, height)};
-  auto* in2_value = selected_rows2->mutable_value();
-  in2_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows2.size()), row_numel}),
-      gpu_place);
-  functor(ctx, in2_value, 2.0);
-
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-  auto* out_value = output->mutable_value();
-
-  // simply concat two SelectedRows
-  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
-                                 gpu_place);
-
-  paddle::operators::math::SelectedRowsAdd<paddle::platform::CUDADeviceContext,
-                                           float>
-      add_functor;
-  add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
-
-  auto out_height = output->height();
-  EXPECT_EQ(out_height, height);
-
-  auto& out_rows = output->rows();
-
-  // input1 rows
-  EXPECT_EQ(out_rows[0], 0);
-  EXPECT_EQ(out_rows[1], 4);
-  EXPECT_EQ(out_rows[2], 7);
-  // input2 rows
-  EXPECT_EQ(out_rows[3], 0);
-  EXPECT_EQ(out_rows[4], 5);
-  EXPECT_EQ(out_rows[5], 7);
-  EXPECT_EQ(out_rows[6], 9);
-
-  paddle::framework::Tensor out_cpu;
-  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
-  ctx.Wait();
-
-  auto* out_cpu_data = out_cpu.data<float>();
-  // input1 value
-  EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0);
-  EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0);
-  EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0);
-  EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0);
-  // input2 value
-  EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0);
-  EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0);
-  EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0);
-  EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
-  EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
-
-  std::unique_ptr<paddle::framework::Tensor> tensor1{
-      new paddle::framework::Tensor()};
-  tensor1->mutable_data<float>(
-      paddle::framework::make_ddim({height, row_numel}), gpu_place);
-  functor(ctx, tensor1.get(), 3.0);
-
-  std::unique_ptr<paddle::framework::Tensor> tensor2{
-      new paddle::framework::Tensor()};
-  tensor2->mutable_data<float>(
-      paddle::framework::make_ddim({height, row_numel}), gpu_place);
-
-  paddle::operators::math::SelectedRowsAddTensor<
-      paddle::platform::CUDADeviceContext, float>
-      add_tensor_functor;
-  add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
-
-  paddle::framework::Tensor tensor2_cpu;
-  paddle::framework::TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
-  ctx.Wait();
-
-  auto* tensor2_cpu_data = tensor2_cpu.data<float>();
-  // row0: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor2_cpu_data[0 * row_numel + 0], 6.0);
-  // row1: 3.0
-  EXPECT_EQ(tensor2_cpu_data[1 * row_numel + 1], 3.0);
-  // row4 : 1.0 + 3.0
-  EXPECT_EQ(tensor2_cpu_data[4 * row_numel + 6], 4.0);
-  // row5: 2.0 + 3.0
-  EXPECT_EQ(tensor2_cpu_data[5 * row_numel + 7], 5.0);
-  // row6: 3.0
-  EXPECT_EQ(tensor2_cpu_data[6 * row_numel + 1], 3.0);
-  // row7: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor2_cpu_data[7 * row_numel + 3], 6.0);
-  // row9: 2.0 + 3.0
-  EXPECT_EQ(tensor2_cpu_data[9 * row_numel + 6], 5.0);
-}
-
-TEST(selected_rows_functor, gpu_add_to) {
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDADeviceContext& ctx =
-      *reinterpret_cast<paddle::platform::CUDADeviceContext*>(
-          paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
-  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
-                                       float>
-      functor;
-  int64_t height = 10;
-  int64_t row_numel = 10;
-
-  std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
-      new paddle::framework::SelectedRows(rows1, height)};
-  auto* in1_value = selected_rows1->mutable_value();
-  in1_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows1.size()), row_numel}),
-      gpu_place);
-  functor(ctx, in1_value, 1.0);
-
-  std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
-      new paddle::framework::SelectedRows(rows2, height)};
-  auto* in2_value = selected_rows2->mutable_value();
-  in2_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows2.size()), row_numel}),
-      gpu_place);
-  functor(ctx, in2_value, 2.0);
-
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-  output->set_height(height);
-  auto* out_value = output->mutable_value();
-
-  // simply concat two SelectedRows
-  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
-                                 gpu_place);
-
-  paddle::operators::math::SelectedRowsAddTo<
-      paddle::platform::CUDADeviceContext, float>
-      add_to_functor;
-  add_to_functor(ctx, *selected_rows1, 0, output.get());
-  add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
-
-  auto out_height = output->height();
-  EXPECT_EQ(out_height, height);
-
-  auto& out_rows = output->rows();
-
-  // input1 rows
-  EXPECT_EQ(out_rows[0], 0);
-  EXPECT_EQ(out_rows[1], 4);
-  EXPECT_EQ(out_rows[2], 7);
-  // input2 rows
-  EXPECT_EQ(out_rows[3], 0);
-  EXPECT_EQ(out_rows[4], 5);
-  EXPECT_EQ(out_rows[5], 7);
-  EXPECT_EQ(out_rows[6], 9);
-
-  paddle::framework::Tensor out_cpu;
-  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
-  ctx.Wait();
-
-  auto* out_cpu_data = out_cpu.data<float>();
-  // input1 value
-  EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0);
-  EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0);
-  EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0);
-  EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0);
-  // input2 value
-  EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0);
-  EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0);
-  EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0);
-  EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
-  EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
-
-  std::unique_ptr<paddle::framework::Tensor> tensor1{
-      new paddle::framework::Tensor()};
-  tensor1->mutable_data<float>(
-      paddle::framework::make_ddim({height, row_numel}), gpu_place);
-  functor(ctx, tensor1.get(), 3.0);
-
-  paddle::operators::math::SelectedRowsAddToTensor<
-      paddle::platform::CUDADeviceContext, float>
-      add_to_tensor_functor;
-  add_to_tensor_functor(ctx, *output, tensor1.get());
-
-  paddle::framework::Tensor tensor1_cpu;
-  paddle::framework::TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
-  ctx.Wait();
-
-  auto* tensor1_cpu_data = tensor1_cpu.data<float>();
-  // row0: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor1_cpu_data[0 * row_numel + 0], 6.0);
-  // row1: 3.0
-  EXPECT_EQ(tensor1_cpu_data[1 * row_numel + 1], 3.0);
-  // row4 : 1.0 + 3.0
-  EXPECT_EQ(tensor1_cpu_data[4 * row_numel + 6], 4.0);
-  // row5: 2.0 + 3.0
-  EXPECT_EQ(tensor1_cpu_data[5 * row_numel + 7], 5.0);
-  // row6: 3.0
-  EXPECT_EQ(tensor1_cpu_data[6 * row_numel + 1], 3.0);
-  // row7: 1.0 + 2.0 + 3.0
-  EXPECT_EQ(tensor1_cpu_data[7 * row_numel + 3], 6.0);
-  // row9: 2.0 + 3.0
-  EXPECT_EQ(tensor1_cpu_data[9 * row_numel + 6], 5.0);
-}
-
-TEST(selected_rows_functor, gpu_merge_add) {
-  paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDADeviceContext& ctx =
-      *reinterpret_cast<paddle::platform::CUDADeviceContext*>(
-          paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
-  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
-                                       float>
-      set_const;
-
-  int64_t height = 10;
-  int64_t row_numel = 8;
-
-  std::vector<int64_t> rows1{5, 2, 5, 3, 5};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
-      new paddle::framework::SelectedRows(rows1, height)};
-  auto* in1_value = selected_rows1->mutable_value();
-  in1_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows1.size()), row_numel}),
-      gpu_place);
-  set_const(ctx, in1_value, 1.0);
-
-  std::vector<int64_t> rows2{2, 5, 3, 5, 3};
-  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
-      new paddle::framework::SelectedRows(rows2, height)};
-  auto* in2_value = selected_rows2->mutable_value();
-  in2_value->mutable_data<float>(
-      paddle::framework::make_ddim(
-          {static_cast<int64_t>(rows2.size()), row_numel}),
-      gpu_place);
-  set_const(ctx, in2_value, 1.0);
-
-  std::unique_ptr<paddle::framework::SelectedRows> output{
-      new paddle::framework::SelectedRows()};
-  output->set_height(height);
-  paddle::operators::math::scatter::MergeAdd<
-      paddle::platform::CUDADeviceContext, float>
-      merge_add_functor;
-
-  std::vector<const paddle::framework::SelectedRows*> inputs;
-  inputs.push_back(selected_rows1.get());
-  inputs.push_back(selected_rows2.get());
-  merge_add_functor(ctx, inputs, output.get());
-
-  paddle::framework::Tensor output_cpu;
-  paddle::framework::TensorCopy(output->value(), cpu_place, ctx, &output_cpu);
-  ctx.Wait();
-
-  EXPECT_EQ(output->height(), height);
-  EXPECT_EQ(output->value().dims(),
-            paddle::framework::make_ddim({3, row_numel}));
-
-  std::vector<int64_t> ret_rows{2, 3, 5};
-  EXPECT_EQ(output->rows(), ret_rows);
-
-  auto* out_data = output_cpu.data<float>();
-  for (size_t i = 0; i < ret_rows.size(); ++i) {
-    for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
-      EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
-    }
-  }
-}
diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc
deleted file mode 100644
index e4ffeedb5a0061dd60ca3a30aa9928ef8b05887c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sequence2batch.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
-                  bool is_src_index) {
-    size_t* index = index_lod.data();
-    auto src_dims = src.dims();
-    auto dst_dims = dst->dims();
-    PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
-                      "The src must be matrix with rank 2.");
-    PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
-                      "The dst must be matrix with rank 2.");
-    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
-                      "The width of src and dst must be same.");
-    auto height = dst_dims[0];
-    auto width = dst_dims[1];
-    auto* src_data = src.data<T>();
-    auto* dst_data = dst->data<T>();
-    const int sz = width * sizeof(T);
-    if (is_src_index) {
-      for (int i = 0; i < height; ++i) {
-        memcpy(dst_data + i * width, src_data + index[i] * width, sz);
-      }
-    } else {
-      for (int i = 0; i < height; ++i) {
-        memcpy(dst_data + index[i] * width, src_data + i * width, sz);
-      }
-    }
-  }
-};
-
-template class CopyMatrixRowsFunctor<platform::CPUDeviceContext, float>;
-template class CopyMatrixRowsFunctor<platform::CPUDeviceContext, double>;
-
-template class LoDTensor2BatchFunctor<platform::CPUDeviceContext, float>;
-template class LoDTensor2BatchFunctor<platform::CPUDeviceContext, double>;
-template class Batch2LoDTensorFunctor<platform::CPUDeviceContext, float>;
-template class Batch2LoDTensorFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu
deleted file mode 100644
index 9ab13659c1cc5b59d28395bcebcfb43fac5b4544..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/math/sequence2batch.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
-__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index,
-                                     int64_t height, int64_t width,
-                                     bool is_src_index) {
-  int idx = threadIdx.x;
-  int idy = threadIdx.y;
-  int id = blockIdx.x + idy * GridDimX;
-  while (id < height) {
-    int src_idx = is_src_index ? index[id] : id;
-    int dst_idx = is_src_index ? id : index[id];
-    const T* src_data = src + src_idx * width;
-    T* dst_data = dst + dst_idx * width;
-    for (int i = idx; i < width; i += BlockDimX) {
-      dst_data[i] = src_data[i];
-    }
-    id += BlockDimY * GridDimX;
-  }
-}
-
-template <typename T>
-class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
-                  bool is_src_index) {
-    auto src_dims = src.dims();
-    auto dst_dims = dst->dims();
-    PADDLE_ENFORCE_EQ(src_dims.size(), 2,
-                      "The src must be matrix with rank 2.");
-    PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
-                      "The dst must be matrix with rank 2.");
-    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
-                      "The width of src and dst must be same.");
-    auto height = dst_dims[0];
-    auto width = dst_dims[1];
-    auto* src_data = src.data<T>();
-    auto* dst_data = dst->data<T>();
-
-    dim3 threads(128, 8);
-    dim3 grid(8, 1);
-    auto stream = context.stream();
-    CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
-        src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height,
-        width, is_src_index);
-  }
-};
-
-template class CopyMatrixRowsFunctor<platform::CUDADeviceContext, float>;
-template class CopyMatrixRowsFunctor<platform::CUDADeviceContext, double>;
-
-template class LoDTensor2BatchFunctor<platform::CUDADeviceContext, float>;
-template class LoDTensor2BatchFunctor<platform::CUDADeviceContext, double>;
-template class Batch2LoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class Batch2LoDTensorFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
deleted file mode 100644
index a3186f82d0c0cc6c9585735ddf7e9bb4db7126cb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class CopyMatrixRowsFunctor {
- public:
-  // If is_src_index is true,
-  // copy the indexed rows of input src to the output dst.
-  // If is_src_index is false,
-  // copy the input src to the indexed rows of output dst.
-  // The indexed rows are based on the input index.
-  void operator()(const DeviceContext& context, const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
-                  bool is_src_index);
-};
-
-template <typename DeviceContext, typename T>
-class LoDTensor2BatchFunctor {
-  // Calculate the length of each sequence and
-  // sort sequence index by the length.
-  // example:  sequences = {s0, s1, s2}
-  //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-  //           seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
-  //
-  struct SeqInfo {
-    SeqInfo(int start, int length, int seq_idx)
-        : start(start), length(length), seq_idx(seq_idx) {}
-    int start;
-    int length;
-    int seq_idx;
-  };
-
- public:
-  void operator()(const DeviceContext& context,
-                  const framework::LoDTensor& lod_tensor,
-                  framework::LoDTensor* batch, bool is_cal_batch_lod,
-                  bool is_reverse = false) const {
-    if (!is_cal_batch_lod) {
-      auto lods = batch->lod();
-      PADDLE_ENFORCE_GT(lods.size(), 2UL,
-                        "The LoD of LoDTensor should inlcude at least 2-level "
-                        "sequence information.");
-      PADDLE_ENFORCE_EQ(
-          lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]),
-          "The LoD information should be consistent with the dims.");
-      CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
-      to_batch(context, lod_tensor, lods[1], batch, true);
-      return;
-    }
-
-    auto lods = lod_tensor.lod();
-    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
-
-    const auto& lod = lods[0];
-
-    std::vector<SeqInfo> seq_info;
-    for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
-      int length = lod[seq_id + 1] - lod[seq_id];
-      seq_info.emplace_back(lod[seq_id], length, seq_id);
-    }
-
-    std::sort(seq_info.begin(), seq_info.end(),
-              [](SeqInfo a, SeqInfo b) { return a.length > b.length; });
-
-    // Calculate the start position of each batch.
-    // example:  sequences = {s0, s1, s2}
-    //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-    //           max_seqlen = 5,
-    //           batchIndex = {b0, b1, b2, b3, b4}
-    //           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
-    //           batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
-    //              batch_start_positions[0] = len(b0)
-    //              batch_start_positions[1] = len(b0) + len(b1)
-    //              batch_start_positions[2] = len(b0) + len(b1) + len(b2)
-    //              ...
-    //           seq2batch_idx[12] = {4, 0, 9,
-    //                                5, 1, 10,
-    //                                6, 2, 11,
-    //                                7, 3,
-    //                                8}
-    //           seq_order = {1, 0, 2}, the sort order.
-    //               where 1 is the second sequence,
-    //                     0 is the first sequence,
-    //                     2 is the third sequence.
-    // The max_seqlen represents batch size after rearranging the
-    // input LodTensor. It is also the maximum length of input sequence.
-
-    paddle::framework::LoD batch_lods;
-    batch_lods.emplace_back(std::vector<size_t>{0});
-    batch_lods.emplace_back(std::vector<size_t>{0});
-    batch_lods.emplace_back(std::vector<size_t>{0});
-
-    // batch_lods[0] is the start positions for batch LoDTensor
-    int max_seqlen = seq_info[0].length;
-    batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
-    // batch_lods[1] is the raw index in the input LoDTensor
-    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
-    // batch_lods[2] is the sort order for the input LoDTensor.
-    batch_lods[2].resize(seq_info.size());
-
-    size_t* batch_starts = batch_lods[0].data();
-    size_t* seq2batch_idx = batch_lods[1].data();
-    batch_starts[0] = 0;
-    for (int n = 0; n < max_seqlen; n++) {
-      auto batch_id = static_cast<int>(batch_starts[n]);
-      for (size_t i = 0; i < seq_info.size(); ++i) {
-        int seq_len = seq_info[i].length;
-        int start = seq_info[i].start;
-        if (n < seq_len) {
-          seq2batch_idx[batch_id] =
-              is_reverse ? start + seq_len - 1 - n : start + n;
-          batch_id++;
-        } else {
-          break;
-        }
-      }
-      batch_starts[n + 1] = static_cast<size_t>(batch_id);
-    }
-    size_t* seq_order = batch_lods[2].data();
-    for (size_t i = 0; i < seq_info.size(); ++i) {
-      seq_order[i] = seq_info[i].seq_idx;
-    }
-    batch->set_lod(batch_lods);
-
-    CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
-    to_batch(context, lod_tensor, batch_lods[1], batch, true);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Batch2LoDTensorFunctor {
- public:
-  void operator()(const DeviceContext& context,
-                  const framework::LoDTensor& batch,
-                  framework::LoDTensor* lod_tensor) const {
-    auto in_lod = batch.lod();
-    PADDLE_ENFORCE_GT(in_lod.size(), 2UL,
-                      "The LoD of LoDTensor should inlcude at least 2-level "
-                      "sequence information.");
-    PADDLE_ENFORCE_EQ(
-        in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]),
-        "The LoD information should be consistent with the dims.");
-    CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
-    to_seq(context, batch, in_lod[1], lod_tensor, false);
-  }
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc
deleted file mode 100644
index 4630689dec160da145e607f662a802444ac98b55..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sequence_padding.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-void CopyValidData(framework::Tensor* dst_tensor,
-                   const framework::Tensor* src_tensor,
-                   const framework::Vector<size_t>& seq_offsets,
-                   int pad_seq_len, int step_width, bool norm_by_len,
-                   CopyType type, PadLayout layout) {
-  int seq_num = seq_offsets.size() - 1;
-  const T* src_data = src_tensor->data<T>();
-  T* dst_data = dst_tensor->data<T>();
-
-  int seq_cpy_gap = step_width;
-  int pad_cpy_gap =
-      layout == kBatchLengthWidth ? step_width : seq_num * step_width;
-  for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) {
-    int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
-    PADDLE_ENFORCE_GE(
-        pad_seq_len, valid_seq_len,
-        "The padded sequence length can not be less than its original length.");
-    int seq_data_offset = seq_offsets[seq_idx] * step_width;
-    int pad_data_offset = layout == kBatchLengthWidth
-                              ? seq_idx * pad_seq_len * step_width
-                              : seq_idx * step_width;
-    float scale = 1.0f / static_cast<float>(valid_seq_len);
-
-    for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
-      const T* src =
-          src_data + (type == kSeqToPad ? seq_data_offset : pad_data_offset);
-      T* dst =
-          dst_data + (type == kSeqToPad ? pad_data_offset : seq_data_offset);
-      memcpy(dst, src, step_width * sizeof(T));
-      if (norm_by_len) {
-        for (int i = 0; i < step_width; ++i) {
-          *(dst + i) *= scale;
-        }
-      }
-      seq_data_offset += seq_cpy_gap;
-      pad_data_offset += pad_cpy_gap;
-    }
-  }
-}
-
-template <typename T>
-static void fast_mem_init(void* dest, size_t dest_size, const T* src,
-                          size_t num_bytes) {
-  if (dest == nullptr || dest_size == 0 || src == nullptr) return;
-
-  memcpy(dest, src, num_bytes);
-
-  dest_size *= num_bytes;
-  while (dest_size > num_bytes) {
-    size_t remaining = dest_size - num_bytes;
-    size_t count = (remaining > num_bytes) ? num_bytes : remaining;
-    memcpy((unsigned char*)dest + num_bytes, dest, count);
-    num_bytes += count;
-  }
-}
-
-template <typename T>
-class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& seq_tensor,
-                  framework::LoDTensor* pad_tensor,
-                  const framework::LoDTensor& pad_value, int pad_seq_len = -1,
-                  int lod_level = 0, bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_lod = seq_tensor.lod();
-    const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
-    const auto& seq_tensor_dims = seq_tensor.dims();
-    const auto& pad_tensor_dims = pad_tensor->dims();
-    if (pad_seq_len == -1) {
-      pad_seq_len = MaximumSequenceLength(seq_offsets);
-    }
-    int step_width = seq_tensor.numel() / seq_tensor_dims[0];
-
-    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
-              step_width, layout);
-    PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
-                   "The numel of 'pad_value' can only be 1 or be equal to the "
-                   "'step_width'.");
-
-    // fill padding value
-    T* pad_data = pad_tensor->data<T>();
-    const T* pad_value_data = pad_value.data<T>();
-    if (pad_value.numel() == 1) {
-      fast_mem_init<T>(pad_data, pad_tensor->numel(), pad_value_data,
-                       sizeof(T));
-    } else {
-      for (int i = 0; i < pad_tensor->numel(); i += step_width) {
-        memcpy(pad_data + i, pad_value_data, step_width * sizeof(T));
-      }
-    }
-
-    CopyValidData<T>(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len,
-                     step_width, norm_by_times, kSeqToPad, layout);
-  }
-};
-
-template <typename T>
-class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& pad_tensor,
-                  framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
-                  int lod_level = 0, bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
-    const auto& seq_tensor_dims = seq_tensor->dims();
-    const auto& pad_tensor_dims = pad_tensor.dims();
-    if (pad_seq_len == -1) {
-      pad_seq_len = MaximumSequenceLength(seq_offsets);
-    }
-    int step_width = seq_tensor->numel() / seq_tensor_dims[0];
-
-    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
-              step_width, layout);
-
-    CopyValidData<T>(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len,
-                     step_width, norm_by_times, kPadToSeq, layout);
-  }
-};
-
-template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
-template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
-template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
-template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
-
-template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
-template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
-template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
-template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu
deleted file mode 100644
index 1b433067900af71bb8a6833cef019d41f9c76858..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ /dev/null
@@ -1,170 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/operators/math/sequence_padding.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T, CopyType Type>
-__global__ void SequencePaddingKernel(
-    T* dst, const T* src, const T* pad_value, bool is_constant_pad,
-    const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len,
-    const size_t step_width, bool norm_by_len, const PadLayout layout) {
-  size_t seq_idx = blockIdx.y;
-  size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
-
-  size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y;
-  size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width;
-  size_t pad_data_offset = layout == kBatchLengthWidth
-                               ? (seq_idx * pad_seq_len + step_idx) * step_width
-                               : (step_idx * seq_num + seq_idx) * step_width;
-
-  T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset);
-  const T* src_data =
-      src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);
-
-  if (step_idx < seq_len) {
-    float scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
-    for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
-      dst_data[i] = scale * src_data[i];
-    }
-  } else if (step_idx < pad_seq_len && Type == kSeqToPad) {
-    for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
-      dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i];
-    }
-  }
-}
-
-template <typename T>
-class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& seq_tensor,
-                  framework::LoDTensor* pad_tensor,
-                  const framework::LoDTensor& pad_value, int pad_seq_len = -1,
-                  int lod_level = 0, bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_lod = seq_tensor.lod();
-    const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
-    const auto& seq_tensor_dims = seq_tensor.dims();
-    const auto& pad_tensor_dims = pad_tensor->dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len,
-                      "The pad_seq_len must be equal to or greater than the "
-                      "original max sequence length.");
-    int step_width = seq_tensor.numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-
-    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
-              step_width, layout);
-    PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
-                   "The numel of 'pad_value' can only be 1 or be equal to the "
-                   "'step_width'.");
-
-    const int kBlockSize = 512;
-
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-
-    const T* seq_data = seq_tensor.data<T>();
-    T* pad_data = pad_tensor->data<T>();
-    const T* pad_value_data = pad_value.data<T>();
-
-    SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
-        pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
-        seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
-        step_width, norm_by_times, layout);
-  }
-};
-
-template <typename T>
-class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& pad_tensor,
-                  framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
-                  int lod_level = 0, bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
-    const auto& seq_tensor_dims = seq_tensor->dims();
-    const auto& pad_tensor_dims = pad_tensor.dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    int step_width = seq_tensor->numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-
-    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
-              step_width, layout);
-    /*
-    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
-      TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor);
-      seq_tensor->Resize(seq_tensor_dims);
-      return;
-    }
-    */
-
-    const int kBlockSize = 512;
-
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-
-    const T* pad_data = pad_tensor.data<T>();
-    T* seq_data = seq_tensor->data<T>();
-
-    SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
-        seq_data, pad_data, nullptr, false,
-        seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
-        step_width, norm_by_times, layout);
-  }
-};
-
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h
deleted file mode 100644
index e752aa58979dddba4d010071d2c4b5dc3e0c6756..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_padding.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
-
-enum CopyType { kSeqToPad, kPadToSeq };
-
-inline static size_t MaximumSequenceLength(
-    const framework::Vector<size_t>& seq_offset) {
-  size_t seq_num = seq_offset.size() - 1;
-  size_t max_seq_len = 0;
-  for (size_t i = 0; i < seq_num; ++i) {
-    max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
-  }
-  return max_seq_len;
-}
-
-inline static void CheckDims(const framework::DDim& seq_tensor_dims,
-                             const framework::DDim& pad_tensor_dims,
-                             const framework::Vector<size_t>& seq_offset,
-                             int64_t padded_seq_len, int64_t step_width,
-                             const PadLayout& layout) {
-  PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
-                    "Value of 1st dimension of the sequence tensor should be "
-                    "equal to sum of lengths of all sequences.");
-
-  PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
-                     seq_tensor_dims.size() == pad_tensor_dims.size(),
-                 "pad_tensor's rank should be 1 greater than seq_tensor's "
-                 "rank, or be equal with it.");
-}
-
-/*
- * \brief   Padding/Unpadding LoDTensor to/from normal Tensor of the shape
- *          [max_sequence_length, num_sequences, sequence_width].
- *
- *  Padding sequence:
- *        padding[i] = seq[lod[level][i]]
- *  Unpadding sequence:
- *        seq[lod[level][i]] = padding[i]
- *
- *  All sequences will be padded to the same length and stored in a transposed
- * shape.
- *  Example:
- *    seq     (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
- *    padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
- *
- * \param context       device context of this functor.
- * \param seq           LoDTensor which is stored in sequence format, the shape
- *                      is [total_sequence_length, sequence_width] where
- *                      total_sequence_length is the sum of all sequences'
- *                      length.
- * \param padding       Tensor which is padded to the same length, the shape is
- *                      [max_sequence_length, num_sequences, sequence_width].
- * \param norm_by_times whether dividing sequence's length.
- *
- * \note  transposition is also done in this functor.
- */
-template <typename DeviceContext, typename T>
-class PaddingLoDTensorFunctor {
- public:
-  void operator()(const DeviceContext& context,
-                  const framework::LoDTensor& seq_tensor,
-                  framework::LoDTensor* pad_tensor,
-                  const framework::LoDTensor& pad_value, int pad_seq_len = -1,
-                  int lod_level = 0, bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth);
-};
-
-template <typename DeviceContext, typename T>
-class UnpaddingLoDTensorFunctor {
- public:
-  void operator()(const DeviceContext& context,
-                  const framework::LoDTensor& pad_tensor,
-                  framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
-                  int lod_level = 0, bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc
deleted file mode 100644
index 4f61b1029c65aedaf4fce771866964fe1d0d6112..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_padding_test.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sequence_padding.h"
-#include <gtest/gtest.h>
-#include <vector>
-
-template <typename DeviceContext, typename Place, typename T>
-void TestSequencePadding(const paddle::framework::LoD& lod,
-                         const size_t sequence_width) {
-  paddle::framework::LoDTensor cpu_seq;
-  paddle::framework::LoDTensor cpu_seq_back;
-  paddle::framework::LoDTensor seq;
-  paddle::framework::LoDTensor seq_back;
-  paddle::framework::LoDTensor padding;
-  paddle::framework::LoDTensor cpu_pad_value;
-  paddle::framework::LoDTensor pad_value;
-
-  const size_t level = lod.size() - 1;
-  auto seq_dims =
-      paddle::framework::make_ddim({static_cast<int64_t>(lod[level].back()),
-                                    static_cast<int64_t>(sequence_width)});
-
-  cpu_seq.set_lod(lod);
-  cpu_seq.mutable_data<T>(seq_dims, paddle::platform::CPUPlace());
-  for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
-    cpu_seq.data<T>()[i] = static_cast<T>(i);
-  }
-
-  auto* place = new Place();
-  DeviceContext* context = new DeviceContext(*place);
-  if (paddle::platform::is_cpu_place(*place)) {
-    seq = cpu_seq;
-  } else {
-    TensorCopySync(cpu_seq, *place, &seq);
-    seq.set_lod(lod);
-  }
-
-  const size_t max_sequence_length =
-      paddle::operators::math::MaximumSequenceLength(lod[level]);
-  const size_t num_sequences = lod[level].size() - 1;
-  auto padding_dims =
-      paddle::framework::make_ddim({static_cast<int64_t>(max_sequence_length),
-                                    static_cast<int64_t>(num_sequences),
-                                    static_cast<int64_t>(sequence_width)});
-
-  padding.mutable_data<T>(padding_dims, *place);
-
-  T* pad_value_data =
-      cpu_pad_value.mutable_data<T>({1}, paddle::platform::CPUPlace());
-  *pad_value_data = static_cast<T>(0);
-  if (paddle::platform::is_cpu_place(*place)) {
-    pad_value = cpu_pad_value;
-  } else {
-    TensorCopySync(cpu_pad_value, *place, &pad_value);
-  }
-
-  paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-      *context, seq, &padding, pad_value, -1, 0, false,
-      paddle::operators::math::kLengthBatchWidth);
-
-  seq_back.set_lod(lod);
-  seq_back.mutable_data<T>(seq_dims, *place);
-  paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-      *context, padding, &seq_back, -1, 0, false,
-      paddle::operators::math::kLengthBatchWidth);
-
-  if (paddle::platform::is_cpu_place(*place)) {
-    cpu_seq_back = seq_back;
-  } else {
-    TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back);
-    cpu_seq_back.set_lod(lod);
-  }
-
-  EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel());
-  EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims());
-  for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
-    EXPECT_EQ(cpu_seq.data<T>()[i], cpu_seq_back.data<T>()[i]);
-  }
-
-  delete place;
-  delete context;
-}
-
-TEST(Seq2BatchPadding, CPU) {
-  paddle::framework::LoD lod1;
-  lod1.push_back(std::vector<size_t>{0, 10});
-  TestSequencePadding<paddle::platform::CPUDeviceContext,
-                      paddle::platform::CPUPlace, float>(lod1, 16);
-
-  paddle::framework::LoD lod2;
-  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
-  TestSequencePadding<paddle::platform::CPUDeviceContext,
-                      paddle::platform::CPUPlace, float>(lod2, 128);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(SequencePadding, CUDA) {
-  paddle::framework::LoD lod1;
-  lod1.push_back(std::vector<size_t>{0, 10});
-  TestSequencePadding<paddle::platform::CUDADeviceContext,
-                      paddle::platform::CUDAPlace, float>(lod1, 16);
-
-  paddle::framework::LoD lod2;
-  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
-  TestSequencePadding<paddle::platform::CUDADeviceContext,
-                      paddle::platform::CUDAPlace, float>(lod2, 128);
-}
-#endif
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
deleted file mode 100644
index cc3fbd587668b17b7edde50b157adca83e81eddc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ /dev/null
@@ -1,416 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sequence_pooling.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, bool is_test>
-class MaxSeqPoolFunctor {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& input, T pad_value,
-                  framework::LoDTensor* output, framework::Tensor* index) {
-    auto in_dims = input.dims();
-    auto out_dims = output->dims();
-    auto idx_dims = index->dims();
-    PADDLE_ENFORCE_GT(in_dims.size(), 1,
-                      "The rank of input shall be greater than 1.");
-    PADDLE_ENFORCE_GT(out_dims.size(), 1,
-                      "The rank of output shall be greater than 1.");
-    for (int64_t i = 1; i < in_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i],
-                        "The dimension of input and output shall be same.");
-    }
-    PADDLE_ENFORCE_EQ(idx_dims, out_dims,
-                      "The dimension of index and output shall be same.");
-
-    auto lod_level = input.lod().size();
-    auto starts = input.lod()[lod_level - 1];
-    const T* in_data = input.data<T>();
-    T* out_data = output->data<T>();
-    int* max_index = index->data<int>();
-
-    int64_t num_seq = out_dims[0];
-    int64_t dim = output->numel() / num_seq;
-    for (int64_t i = 0; i < num_seq; ++i) {
-      if (starts[i] == starts[i + 1]) {
-        for (int64_t k = 0; k < dim; ++k) {
-          out_data[i * dim + k] = pad_value;
-          max_index[i * dim + k] = -1;
-        }
-        continue;
-      }
-      for (int64_t k = 0; k < dim; ++k) {
-        out_data[i * dim + k] = in_data[starts[i] * dim + k];
-        max_index[i * dim + k] = starts[i];
-      }
-      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
-        for (int64_t k = 0; k < dim; ++k) {
-          if (in_data[j * dim + k] > out_data[i * dim + k]) {
-            out_data[i * dim + k] = in_data[j * dim + k];
-            max_index[i * dim + k] = j;
-          }
-        }
-      }
-    }
-  }
-};
-// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
-// index buffer
-template <typename T>
-class MaxSeqPoolFunctor<T, true> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& input, T pad_value,
-                  framework::LoDTensor* output, framework::Tensor* index) {
-    auto in_dims = input.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_GT(in_dims.size(), 1,
-                      "The rank of input shall be greater than 1.");
-    PADDLE_ENFORCE_GT(out_dims.size(), 1,
-                      "The rank of output shall be greater than 1.");
-    for (int64_t i = 1; i < in_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i],
-                        "The dimension of input and output shall be same.");
-    }
-
-    auto lod_level = input.lod().size();
-    auto starts = input.lod()[lod_level - 1];
-    const T* in_data = input.data<T>();
-    T* out_data = output->data<T>();
-
-    int64_t num_seq = out_dims[0];
-    int64_t dim = output->numel() / num_seq;
-    for (int64_t i = 0; i < num_seq; ++i) {
-      if (starts[i] == starts[i + 1]) {
-        for (int64_t k = 0; k < dim; ++k) {
-          out_data[i * dim + k] = pad_value;
-        }
-        continue;
-      }
-      std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim],
-                  dim * sizeof(T));
-      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
-        for (int64_t k = 0; k < dim; ++k) {
-          if (in_data[j * dim + k] > out_data[i * dim + k]) {
-            out_data[i * dim + k] = in_data[j * dim + k];
-          }
-        }
-      }
-    }
-  }
-};
-template <typename T>
-class MaxSeqPoolGradFunctor {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& out_grad,
-                  const framework::Tensor& index,
-                  framework::LoDTensor* in_grad) {
-    auto og_dims = out_grad.dims();
-    auto ig_dims = in_grad->dims();
-    auto idx_dims = index.dims();
-    PADDLE_ENFORCE_GT(og_dims.size(), 1,
-                      "The rank of output@Grad shall be greater than 1.");
-    PADDLE_ENFORCE_GT(ig_dims.size(), 1,
-                      "The rank of input@Grad shall be greater than 1.");
-    for (int64_t i = 1; i < og_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(
-          og_dims[i], ig_dims[i],
-          "The dimension of input@Grad and output@Grad shall be same.");
-    }
-    PADDLE_ENFORCE_EQ(idx_dims, og_dims,
-                      "The dimension of index and output@Grad shall be same.");
-
-    const T* og_data = out_grad.data<T>();
-    const int* max_index = index.data<int>();
-    T* ig_data = in_grad->data<T>();
-
-    SetConstant<platform::CPUDeviceContext, T> set_zero;
-    set_zero(context, in_grad, static_cast<T>(0.0));
-    int64_t num_seq = og_dims[0];
-    int64_t dim = out_grad.numel() / num_seq;
-    for (int64_t i = 0; i < num_seq; ++i) {
-      for (int64_t j = 0; j < dim; ++j) {
-        int step_id = max_index[i * dim + j];
-        if (step_id == -1) continue;
-        ig_data[step_id * dim + j] = og_data[i * dim + j];
-      }
-    }
-  }
-};
-
-template <typename T>
-class LastSeqPoolFunctor {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& input, T pad_value,
-                  framework::LoDTensor* output) {
-    // Create pointers to input and output data
-    auto* in_data = input.data<T>();
-    auto* out_data = output->data<T>();
-
-    // Calculate the size of each item in sequence
-    int64_t item_size = input.numel() / input.dims()[0];
-    auto lod_level = input.lod().size();
-    auto lod = input.lod()[lod_level - 1];
-    int seq_num = static_cast<int>(lod.size()) - 1;
-    for (int i = 0; i < seq_num; ++i) {
-      // Calculate the length of each sequence
-      int64_t seq_len = static_cast<int64_t>(lod[i + 1] - lod[i]);
-      if (seq_len == 0) {
-        for (int j = 0; j < item_size; ++j) {
-          out_data[j] = pad_value;
-        }
-      } else {
-        // Point to the begin of next sequence
-        in_data += seq_len * item_size;
-        // Copy the last item of sequence to output
-        std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T));
-      }
-      out_data += item_size;
-    }
-  }
-};
-
-template <typename T>
-class FirstSeqPoolFunctor {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& input, T pad_value,
-                  framework::LoDTensor* output) {
-    // Create pointers to input and output data
-    auto* in_data = input.data<T>();
-    auto* out_data = output->data<T>();
-
-    // Calculate the size of each item in sequence
-    int64_t item_size = input.numel() / input.dims()[0];
-    auto lod_level = input.lod().size();
-    auto lod = input.lod()[lod_level - 1];
-    int seq_num = static_cast<int>(lod.size()) - 1;
-    for (int i = 0; i < seq_num; ++i) {
-      // Calculate the length of each sequence
-      int64_t seq_len = static_cast<int64_t>(lod[i + 1] - lod[i]);
-      if (seq_len == 0) {
-        for (int j = 0; j < item_size; ++j) {
-          out_data[j] = pad_value;
-        }
-      } else {
-        // Copy the first item of sequence to output
-        std::memcpy(out_data, in_data, item_size * sizeof(T));
-        // Point to the next sequence
-        in_data += seq_len * item_size;
-      }
-      out_data += item_size;
-    }
-  }
-};
-
-template <typename T>
-class SumSeqPoolGradFunctor {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& out_grad,
-                  framework::LoDTensor* in_grad) {
-    auto lod_level = in_grad->lod().size();
-    auto lod = in_grad->lod()[lod_level - 1];
-    int64_t out_w = out_grad.numel() / out_grad.dims()[0];
-    int64_t in_w = in_grad->numel() / in_grad->dims()[0];
-    PADDLE_ENFORCE_EQ(
-        in_w, out_w,
-        "The feature size of input@Grad and output@Grad shall be same.");
-    const T* out_g_data = out_grad.data<T>();
-    T* in_g_data = in_grad->mutable_data<T>(context.GetPlace());
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
-      if (h == 0) continue;
-      int64_t in_offset = lod[i] * in_w;
-      const T* out_pos = out_g_data + i * out_w;
-      T* in_pos = in_g_data + in_offset;
-      for (int r = 0; r != h; ++r) {
-        blas.VCOPY(in_w, out_pos, in_pos + r * in_w);
-      }
-    }
-  }
-};
-
-template <typename T>
-class SequencePoolFunctor<platform::CPUDeviceContext, T> {
- public:
-  /* max pool has index output */
-  void operator()(const platform::CPUDeviceContext& context,
-                  const std::string pooltype, T pad_value,
-                  const framework::LoDTensor& input,
-                  framework::LoDTensor* output, bool is_test,
-                  framework::Tensor* index = nullptr) {
-    if (pooltype == "MAX") {
-      if (is_test) {
-        math::MaxSeqPoolFunctor<T, true> max_pool;
-        max_pool(context, input, pad_value, output, index);
-      } else {
-        math::MaxSeqPoolFunctor<T, false> max_pool;
-        max_pool(context, input, pad_value, output, index);
-      }
-      return;
-    }
-    if (pooltype == "LAST") {
-      math::LastSeqPoolFunctor<T> last_pool;
-      last_pool(context, input, pad_value, output);
-      return;
-    }
-    if (pooltype == "FIRST") {
-      math::FirstSeqPoolFunctor<T> first_pool;
-      first_pool(context, input, pad_value, output);
-      return;
-    }
-    auto lod_level = input.lod().size();
-    auto lod = input.lod()[lod_level - 1];
-    if (pooltype == "SUM") {
-      auto place = context.GetPlace();
-      PADDLE_ENFORCE_EQ(
-          platform::is_cpu_place(place), true,
-          "Sequence_pool should run on CPU Device when pooltype is SUM");
-      const T* src = input.data<T>();
-      T* dst = output->mutable_data<T>(place);
-      jit::seq_pool_attr_t attr(
-          static_cast<int>(input.numel() / input.dims()[0]),
-          jit::SeqPoolType::kSum);
-      auto seqpool =
-          jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache()
-              .At(attr);
-      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-        attr.h = static_cast<int>(lod[i + 1] - lod[i]);
-        if (attr.h == 0) {
-          for (int j = 0; j < attr.w; ++j) {
-            dst[j] = pad_value;
-          }
-        } else {
-          seqpool(src, dst, &attr);
-        }
-        dst += attr.w;
-        src += attr.h * attr.w;
-      }
-      return;
-    }
-    auto& place = *context.eigen_device();
-    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-      Tensor out_t = output->Slice(i, i + 1);
-      int64_t w = input.numel() / input.dims()[0];
-      if (lod[i] == lod[i + 1]) {
-        for (int j = 0; j < w; ++j) {
-          out_t.data<T>()[j] = pad_value;
-        }
-        continue;
-      }
-      Tensor in_t =
-          input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
-      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
-      auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
-      auto out_e = EigenVector<T>::Flatten(out_t);
-      if (pooltype == "AVERAGE") {
-        out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
-      } else if (pooltype == "SQRT") {
-        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
-                              std::sqrt(static_cast<T>(h));
-      } else {
-        PADDLE_THROW("unsupported pooling pooltype");
-      }
-    }
-  }
-};
-
-template <typename T>
-class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const std::string pooltype,
-                  const framework::LoDTensor& out_grad,
-                  framework::LoDTensor* in_grad,
-                  /* max pool has index */
-                  const framework::Tensor* index = nullptr) {
-    if (pooltype == "MAX") {
-      math::MaxSeqPoolGradFunctor<T> max_pool_grad;
-      max_pool_grad(context, out_grad, *index, in_grad);
-      return;
-    }
-
-    if (pooltype == "LAST" || pooltype == "FIRST") {
-      // set X@Grad be zero at first when pooltype is LAST/FIRST
-      math::SetConstant<platform::CPUDeviceContext, T> functor;
-      functor(context, in_grad, 0);
-    }
-
-    if (pooltype == "SUM") {
-      math::SumSeqPoolGradFunctor<T> sum_pool_grad;
-      sum_pool_grad(context, out_grad, in_grad);
-      return;
-    }
-
-    auto lod_level = in_grad->lod().size();
-    auto lod = in_grad->lod()[lod_level - 1];
-    auto& place = *context.eigen_device();
-    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-      if (lod[i] == lod[i + 1]) continue;
-      auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
-                                   static_cast<int>(lod[i + 1]));
-      auto out_g_t = out_grad.Slice(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
-      int64_t w = in_grad->numel() / in_grad->dims()[0];
-      auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
-      auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
-      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
-      Eigen::DSizes<int, 2> bcast(h, 1);
-
-      if (pooltype == "AVERAGE") {
-        in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
-      } else if (pooltype == "SQRT") {
-        in_g_e.device(place) =
-            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
-      } else if (pooltype == "LAST") {
-        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
-      } else if (pooltype == "FIRST") {
-        in_g_e.chip(0, 0).device(place) = out_g_e_v;
-      } else {
-        PADDLE_THROW("unsupported pooling pooltype");
-      }
-    }
-  }
-};
-
-template class SequencePoolFunctor<platform::CPUDeviceContext, float>;
-template class SequencePoolFunctor<platform::CPUDeviceContext, double>;
-template class SequencePoolGradFunctor<platform::CPUDeviceContext, float>;
-template class SequencePoolGradFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
deleted file mode 100644
index 91545131e4cbb5d6dcae9c111e97598ee54cc898..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ /dev/null
@@ -1,385 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sequence_pooling.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-struct MaxPoolFunctor {
-  HOSTDEVICE void operator()(const T* input, const T pad_value,
-                             const size_t start, const size_t end,
-                             const size_t item_dim, T* output, int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      T max_val = static_cast<T>(-FLT_MAX);
-      int max_index = -1;
-      if (start == end) {
-        output[tid] = pad_value;
-        index[tid] = -1;
-      } else {
-        for (int i = start; i < end; ++i) {
-          if (max_val < input[item_dim * i + tid]) {
-            max_val = input[item_dim * i + tid];
-            max_index = i;
-          }
-        }
-        output[tid] = max_val;
-        index[tid] = max_index;
-      }
-    }
-  }
-};
-
-template <typename T>
-struct AvgPoolFunctor {
-  HOSTDEVICE void operator()(const T* input, const T pad_value,
-                             const size_t start, const size_t end,
-                             const size_t item_dim, T* output, int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      if (start == end) {
-        output[tid] = pad_value;
-      } else {
-        T val = static_cast<T>(0);
-        for (int i = start; i < end; ++i) {
-          val += input[item_dim * i + tid];
-        }
-        // end, start is lod, so end - start != 0
-        output[tid] = val / static_cast<T>(end - start);
-      }
-    }
-  }
-};
-
-template <typename T>
-struct SumPoolFunctor {
-  HOSTDEVICE void operator()(const T* input, const T pad_value,
-                             const size_t start, const size_t end,
-                             const size_t item_dim, T* output, int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      if (start == end) {
-        output[tid] = pad_value;
-      } else {
-        T val = static_cast<T>(0);
-        for (int i = start; i < end; ++i) {
-          val += input[item_dim * i + tid];
-        }
-        output[tid] = val;
-      }
-    }
-  }
-};
-
-template <typename T>
-struct SqrtPoolFunctor {
-  HOSTDEVICE void operator()(const T* input, const T pad_value,
-                             const size_t start, const size_t end,
-                             const size_t item_dim, T* output, int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      if (start == end) {
-        output[tid] = pad_value;
-      } else {
-        T val = static_cast<T>(0);
-        for (int i = start; i < end; ++i) {
-          val += input[item_dim * i + tid];
-        }
-        // end, start is lod, so end - start != 0
-        output[tid] = val / sqrt(end - start);
-      }
-    }
-  }
-};
-
-template <typename T>
-struct LastPoolFunctor {
-  HOSTDEVICE void operator()(const T* input, const T pad_value,
-                             const size_t start, const size_t end,
-                             const size_t item_dim, T* output, int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      if (start == end) {
-        output[tid] = pad_value;
-      } else {
-        output[tid] = input[item_dim * (end - 1) + tid];
-      }
-    }
-  }
-};
-
-template <typename T>
-struct FirstPoolFunctor {
-  HOSTDEVICE void operator()(const T* input, const T pad_value,
-                             const size_t start, const size_t end,
-                             const size_t item_dim, T* output, int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      if (start == end) {
-        output[tid] = pad_value;
-      } else {
-        output[tid] = input[item_dim * start + tid];
-      }
-    }
-  }
-};
-
-template <typename T, typename Range_OP>
-__global__ void sequence_pool_kernel(Range_OP op, const T* input,
-                                     const T pad_value, const size_t* lod,
-                                     const size_t lod_size,
-                                     const size_t item_dim, T* output,
-                                     int* index) {
-  int bid = blockIdx.x;
-  if (bid >= lod_size - 1) return;
-  size_t start = lod[bid];
-  size_t end = lod[bid + 1];
-  int* index_offset = nullptr;
-  if (index != nullptr) {
-    index_offset = &index[bid * item_dim];
-  }
-  op(input, pad_value, start, end, item_dim, &output[bid * item_dim],
-     index_offset);
-}
-
-template <typename T>
-class SequencePoolFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::string pooltype, T pad_value,
-                  const framework::LoDTensor& input,
-                  framework::LoDTensor* output, bool is_test,
-                  framework::Tensor* index = nullptr) {
-    auto lod_level = input.lod().size();
-    auto& lod = input.lod()[lod_level - 1];
-    const size_t item_dim = output->numel() / output->dims()[0];
-    dim3 threads(1024, 1);
-    dim3 grid(lod.size(), 1);
-    if (pooltype == "MAX") {
-      sequence_pool_kernel<
-          T, MaxPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          MaxPoolFunctor<T>(), input.data<T>(), pad_value,
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), index->data<int>());
-    } else if (pooltype == "AVERAGE") {
-      sequence_pool_kernel<
-          T, AvgPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          AvgPoolFunctor<T>(), input.data<T>(), pad_value,
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
-    } else if (pooltype == "SUM") {
-      sequence_pool_kernel<
-          T, SumPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          SumPoolFunctor<T>(), input.data<T>(), pad_value,
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
-    } else if (pooltype == "SQRT") {
-      sequence_pool_kernel<
-          T, SqrtPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          SqrtPoolFunctor<T>(), input.data<T>(), pad_value,
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
-    } else if (pooltype == "LAST") {
-      sequence_pool_kernel<
-          T, LastPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          LastPoolFunctor<T>(), input.data<T>(), pad_value,
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
-    } else if (pooltype == "FIRST") {
-      sequence_pool_kernel<
-          T, FirstPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          FirstPoolFunctor<T>(), input.data<T>(), pad_value,
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
-    } else {
-      PADDLE_THROW("unsupported pooling pooltype");
-    }
-  }
-};
-
-template <typename T>
-struct MaxPoolGradFunctor {
-  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
-                             const size_t end, const size_t item_dim,
-                             T* in_grad, const int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      for (int i = start; i < end; ++i) {
-        if (i == index[tid]) {
-          in_grad[item_dim * i + tid] = out_grad[tid];
-        } else {
-          in_grad[item_dim * i + tid] = static_cast<T>(0);
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-struct AvgPoolGradFunctor {
-  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
-                             const size_t end, const size_t item_dim,
-                             T* in_grad, const int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      for (int i = start; i < end; ++i) {
-        in_grad[item_dim * i + tid] = out_grad[tid] / (end - start);
-      }
-    }
-  }
-};
-
-template <typename T>
-struct SumPoolGradFunctor {
-  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
-                             const size_t end, const size_t item_dim,
-                             T* in_grad, const int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      for (int i = start; i < end; ++i) {
-        in_grad[item_dim * i + tid] = out_grad[tid];
-      }
-    }
-  }
-};
-
-template <typename T>
-struct SqrtPoolGradFunctor {
-  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
-                             const size_t end, const size_t item_dim,
-                             T* in_grad, const int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      for (int i = start; i < end; ++i) {
-        in_grad[item_dim * i + tid] =
-            out_grad[tid] / (sqrt(static_cast<T>(end - start)));
-      }
-    }
-  }
-};
-
-template <typename T>
-struct LastPoolGradFunctor {
-  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
-                             const size_t end, const size_t item_dim,
-                             T* in_grad, const int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      for (int i = start; i < end; ++i) {
-        if (i == end - 1) {
-          in_grad[item_dim * i + tid] = out_grad[tid];
-        } else {
-          in_grad[item_dim * i + tid] = static_cast<T>(0);
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-struct FirstPoolGradFunctor {
-  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
-                             const size_t end, const size_t item_dim,
-                             T* in_grad, const int* index) {
-    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-      for (int i = start; i < end; ++i) {
-        if (i == start) {
-          in_grad[item_dim * i + tid] = out_grad[tid];
-        } else {
-          in_grad[item_dim * i + tid] = static_cast<T>(0);
-        }
-      }
-    }
-  }
-};
-
-template <typename T, typename Range_OP>
-__global__ void sequence_pool_grad_kernel(Range_OP op, const T* out_grad,
-                                          const size_t* lod,
-                                          const size_t lod_size,
-                                          const size_t item_dim, T* in_grad,
-                                          const int* index) {
-  int bid = blockIdx.x;
-  if (bid >= lod_size - 1) return;
-  size_t start = lod[bid];
-  size_t end = lod[bid + 1];
-  const int* index_offset = nullptr;
-  if (index != nullptr) {
-    index_offset = &index[bid * item_dim];
-  }
-  op(&out_grad[bid * item_dim], start, end, item_dim, in_grad, index_offset);
-}
-
-template <typename T>
-class SequencePoolGradFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::string pooltype,
-                  const framework::LoDTensor& out_grad,
-                  framework::LoDTensor* in_grad,
-                  /* max pool has index */
-                  const framework::Tensor* index = nullptr) {
-    auto lod_level = in_grad->lod().size();
-    auto& lod = in_grad->lod()[lod_level - 1];
-    const size_t item_dim = in_grad->numel() / in_grad->dims()[0];
-    dim3 threads(1024, 1);
-    dim3 grid(lod.size(), 1);
-    if (pooltype == "MAX") {
-      sequence_pool_grad_kernel<
-          T, MaxPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          MaxPoolGradFunctor<T>(), out_grad.data<T>(),
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), index->data<int>());
-    } else if (pooltype == "AVERAGE") {
-      sequence_pool_grad_kernel<
-          T, AvgPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          AvgPoolGradFunctor<T>(), out_grad.data<T>(),
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
-    } else if (pooltype == "SUM") {
-      sequence_pool_grad_kernel<
-          T, SumPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          SumPoolGradFunctor<T>(), out_grad.data<T>(),
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
-    } else if (pooltype == "SQRT") {
-      sequence_pool_grad_kernel<
-          T, SqrtPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          SqrtPoolGradFunctor<T>(), out_grad.data<T>(),
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
-    } else if (pooltype == "LAST") {
-      sequence_pool_grad_kernel<
-          T, LastPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          LastPoolGradFunctor<T>(), out_grad.data<T>(),
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
-    } else if (pooltype == "FIRST") {
-      sequence_pool_grad_kernel<
-          T, FirstPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          FirstPoolGradFunctor<T>(), out_grad.data<T>(),
-          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
-
-    } else {
-      PADDLE_THROW("unsupported pooling pooltype");
-    }
-  }
-};
-
-// sequence pooling
-template class SequencePoolFunctor<platform::CUDADeviceContext, float>;
-template class SequencePoolFunctor<platform::CUDADeviceContext, double>;
-template class SequencePoolGradFunctor<platform::CUDADeviceContext, float>;
-template class SequencePoolGradFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h
deleted file mode 100644
index 847d0bca951a7e54a74a6c803a4f24d50672228f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename DeviceContext, typename T>
-class SequencePoolFunctor {
- public:
-  /* max pool has index output */
-  void operator()(const DeviceContext& context, const std::string pooltype,
-                  T pad_value, const framework::LoDTensor& input,
-                  framework::LoDTensor* output, bool is_test = false,
-                  framework::Tensor* index = nullptr);
-};
-
-template <typename DeviceContext, typename T>
-class SequencePoolGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const std::string pooltype,
-                  const framework::LoDTensor& out_grad,
-                  framework::LoDTensor* in_grad,
-                  /* max pool has index */
-                  const framework::Tensor* index = nullptr);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc
deleted file mode 100644
index cf6e89b3d9f11f2b68322ef15ddf026625f6a5a5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_pooling_test.cc
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sequence_pooling.h"
-#include <gtest/gtest.h>
-#include <vector>
-
-template <typename DeviceContext, typename Place, typename T>
-void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
-  paddle::framework::LoDTensor cpu_out_grad;
-  paddle::framework::LoDTensor cpu_in_grad;
-  paddle::framework::LoDTensor out_grad;
-  paddle::framework::LoDTensor in_grad;
-  const size_t second_dim = 128u;
-
-  // construct out_grad's tensor in cpu
-  const size_t out_first_dim = lod[0].size() - 1;
-  auto out_dims = paddle::framework::make_ddim(
-      {static_cast<int64_t>(out_first_dim), static_cast<int64_t>(second_dim)});
-
-  cpu_out_grad.mutable_data<T>(out_dims, paddle::platform::CPUPlace());
-  for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) {
-    cpu_out_grad.data<T>()[i] = static_cast<T>(i);
-  }
-
-  // copy to dst out_grad
-  auto* place = new Place();
-  DeviceContext* context = new DeviceContext(*place);
-  if (paddle::platform::is_cpu_place(*place)) {
-    out_grad = cpu_out_grad;
-  } else {
-    TensorCopySync(cpu_out_grad, *place, &out_grad);
-  }
-
-  // construct in_grad
-  in_grad.set_lod(lod);
-  auto in_dims = paddle::framework::make_ddim(
-      {static_cast<int64_t>(lod[0].back()), static_cast<int64_t>(second_dim)});
-  in_grad.mutable_data<T>(in_dims, context->GetPlace());
-
-  // check tensor contruction result
-  PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size());
-  for (int64_t i = 1; i < out_grad.dims().size(); ++i) {
-    PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]);
-  }
-
-  // call functor
-  paddle::operators::math::SequencePoolGradFunctor<DeviceContext, T>()(
-      *context, "SUM", out_grad, &in_grad);
-
-  if (paddle::platform::is_cpu_place(*place)) {
-    cpu_in_grad = in_grad;
-  } else {
-    TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad);
-    cpu_in_grad.set_lod(in_grad.lod());
-  }
-
-  EXPECT_EQ(in_grad.numel(), static_cast<int64_t>(lod[0].back() * second_dim));
-  EXPECT_EQ(in_grad.lod(), lod);
-
-  if (paddle::platform::is_cpu_place(*place)) {
-    for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) {
-      int64_t begin = in_grad.lod()[0][i];
-      int64_t end = in_grad.lod()[0][i + 1];
-      paddle::framework::Tensor tmp = in_grad.Slice(begin, end);
-      for (size_t j = 0; j != tmp.numel() / second_dim; ++j) {
-        for (int64_t m = 0; m != second_dim; ++m) {
-          EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
-                    out_grad.data<T>()[m + i * second_dim]);
-        }
-      }
-    }
-  } else {
-    for (size_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) {
-      int64_t begin = cpu_in_grad.lod()[0][i];
-      int64_t end = cpu_in_grad.lod()[0][i + 1];
-      paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end);
-      for (size_t j = 0; j != tmp.numel() / second_dim; ++j) {
-        for (int64_t m = 0; m != second_dim; ++m) {
-          EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
-                    cpu_out_grad.data<T>()[m + i * second_dim]);
-        }
-      }
-    }
-  }
-
-  delete place;
-  delete context;
-}
-
-TEST(SequencePoolingGrad, CPU_SUM) {
-  paddle::framework::LoD lod1;
-  lod1.push_back(std::vector<size_t>{0, 10});
-  TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
-                         paddle::platform::CPUPlace, float>(lod1);
-
-  paddle::framework::LoD lod2;
-  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
-  TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
-                         paddle::platform::CPUPlace, float>(lod2);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(SequencePoolingGrad, CUDA_SUM) {
-  paddle::framework::LoD lod1;
-  lod1.push_back(std::vector<size_t>{0, 10});
-  TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
-                         paddle::platform::CUDAPlace, float>(lod1);
-
-  paddle::framework::LoD lod2;
-  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
-  TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
-                         paddle::platform::CUDAPlace, float>(lod2);
-}
-#endif
diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc
deleted file mode 100644
index ee5b22ca855b4fa26e9626aadb84fa9b93b72952..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_scale.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sequence_scale.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context, const T* scales,
-                  framework::LoDTensor* seq) {
-    const size_t level = 0;
-    auto lod = seq->lod();
-    const size_t num_seq = lod[level].size() - 1;
-    size_t seq_width = seq->dims()[1];
-    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
-
-    T* seq_data = seq->mutable_data<T>(context.GetPlace());
-    for (size_t i = 0; i < num_seq; ++i) {
-      for (size_t j = lod[level][i] * seq_width;
-           j < lod[level][i + 1] * seq_width; ++j) {
-        seq_data[j] *= scales[i];
-      }
-    }
-  }
-};
-
-template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, float>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu
deleted file mode 100644
index 079338c1d3dac6a9403c5871f3face9f1f8e77d2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_scale.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sequence_scale.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename T, int BlockSize>
-__global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales,
-                                    const size_t seq_width) {
-  for (int i = threadIdx.x;
-       i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width;
-       i += BlockSize) {
-    int idx = lod[blockIdx.x] * seq_width + i;
-    seq[idx] *= scales[blockIdx.x];
-  }
-}
-
-template <typename T>
-class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context, const T* scales,
-                  framework::LoDTensor* seq) {
-    const size_t level = 0;
-    auto lod = seq->lod();
-    const size_t num_seq = lod[level].size() - 1;
-    const size_t seq_width = seq->numel() / seq->dims()[0];
-    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
-    T* seq_data = seq->mutable_data<T>(context.GetPlace());
-
-    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
-        num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-        seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()),
-        scales, seq_width);
-  }
-};
-
-template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h
deleted file mode 100644
index 202243985c125cd518a27477eb370bf1a325fe16..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/sequence_scale.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * \brief   Scale a sequence.
- *
- *  All sequences will be padded to the same length and stored in a transposed
- * shape.
- *  Example:
- *    Given:
- *      seq = (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
- *      scales = (2, 3, 4, 5)
- *    then:
- *      result = (2*s0, 2*s0, 2*s0, 2*s0; 3*s1, 3*s1; 4*s2, 4*s2, 4*s2; 5*s3)
-
- *
- * \param context       Device context of this functor.
- * \param seq           LoDTensor which is stored in sequence format, the shape
- *                      is [total_sequence_length, sequence_width] where
- *                      total_sequence_length is the sum of all sequences'
- *                      length.
- * \param scales        Array<T>. The i-th sequence will be scaled by scales[i].
- * \param num_seq       Number of sequence
- *
- */
-template <typename DeviceContext, typename T>
-class ScaleLoDTensorFunctor {
- public:
-  void operator()(const DeviceContext& context, const T* scales,
-                  framework::LoDTensor* seq);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc
deleted file mode 100644
index fa2018178f44ff4e3b14937c1f508fa8a698e20e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/softmax.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template class SoftmaxFunctor<platform::CPUDeviceContext, float, true>;
-template class SoftmaxFunctor<platform::CPUDeviceContext, float, false>;
-template class SoftmaxFunctor<platform::CPUDeviceContext, double, true>;
-template class SoftmaxFunctor<platform::CPUDeviceContext, double, false>;
-template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
-template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
deleted file mode 100644
index 1c0970c0aa4692bcc51bd69b025b82ecff5bec65..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/softmax.cu
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <vector>
-
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using Tensor = framework::Tensor;
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using DataLayout = platform::DataLayout;
-template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
-
-template <typename T>
-void SoftmaxCUDNNFunctor<T>::operator()(
-    const platform::CUDADeviceContext& context, const framework::Tensor* X,
-    framework::Tensor* Y) {
-  // ------------------- cudnn descriptors ---------------------
-  ScopedTensorDescriptor xDesc;
-  ScopedTensorDescriptor yDesc;
-  std::vector<int> cudnn_tensor_dims = framework::vectorize<int>(X->dims());
-  DataLayout layout = DataLayout::kNCHW;
-  if (cudnn_tensor_dims.size() == 5) {
-    layout = DataLayout::kNCDHW;
-  }
-  // NOTE(*) : cudnn softmax only support >= 4D Tensor,
-  // fill 1 at unused dims
-  if (cudnn_tensor_dims.size() <= 2) {
-    cudnn_tensor_dims.resize(4, 1);
-  }
-  cudnnTensorDescriptor_t cudnn_x_desc =
-      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  cudnnTensorDescriptor_t cudnn_y_desc =
-      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward(
-      context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
-      CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
-      X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
-      Y->mutable_data<T>(context.GetPlace())));
-}
-
-template <typename T>
-void SoftmaxGradCUDNNFunctor<T>::operator()(
-    const platform::CUDADeviceContext& context, const framework::Tensor* Y,
-    const framework::Tensor* YGrad, framework::Tensor* XGrad) {
-  // ------------------- cudnn descriptors ---------------------
-  ScopedTensorDescriptor yDesc;
-  ScopedTensorDescriptor dyDesc;
-  ScopedTensorDescriptor dxDesc;
-  std::vector<int> cudnn_tensor_dims = framework::vectorize<int>(Y->dims());
-  DataLayout layout = DataLayout::kNCHW;
-  if (cudnn_tensor_dims.size() == 5) {
-    layout = DataLayout::kNCDHW;
-  }
-  // NOTE(*) : cudnn softmax only support >= 4D Tensor,
-  // fill 1 at unused dims
-  if (cudnn_tensor_dims.size() <= 2) {
-    cudnn_tensor_dims.resize(4, 1);
-  }
-  cudnnTensorDescriptor_t cudnn_y_desc =
-      yDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  cudnnTensorDescriptor_t cudnn_xgrad_desc =
-      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  cudnnTensorDescriptor_t cudnn_ygrad_desc =
-      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
-      context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
-      CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
-      Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
-      CudnnDataType<T>::kZero(), cudnn_xgrad_desc,
-      XGrad->mutable_data<T>(context.GetPlace())));
-}
-
-template class SoftmaxCUDNNFunctor<platform::float16>;
-template class SoftmaxCUDNNFunctor<float>;
-template class SoftmaxCUDNNFunctor<double>;
-template class SoftmaxGradCUDNNFunctor<float>;
-template class SoftmaxGradCUDNNFunctor<double>;
-template class SoftmaxGradCUDNNFunctor<platform::float16>;
-
-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
-                              false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
-                              true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext,
-                                  platform::float16>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h
deleted file mode 100644
index 7a4306efef97ea572f90929cd79f4b9092a64d1f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/softmax.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename DeviceContext, typename T, bool is_test,
-          typename Enable = void>
-class SoftmaxFunctor {
- public:
-  void operator()(const DeviceContext& context, const int axis_dim,
-                  const framework::Tensor* X, framework::Tensor* Y);
-};
-
-template <typename DeviceContext, typename T, typename Enable = void>
-class SoftmaxGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const int axis_dim,
-                  const framework::Tensor* y, const framework::Tensor* y_grad,
-                  framework::Tensor* x_grad);
-};
-
-#ifdef PADDLE_WITH_CUDA
-template <typename T>
-class SoftmaxCUDNNFunctor {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor* X, framework::Tensor* Y);
-};
-
-template <typename T>
-class SoftmaxGradCUDNNFunctor {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor* Y, const framework::Tensor* y_grad,
-                  framework::Tensor* x_grad);
-};
-
-#endif
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
deleted file mode 100644
index fae5160cc82cd9a93c8a80b74293da82175c9a43..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ /dev/null
@@ -1,238 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-struct ValueClip {
-  HOSTDEVICE T operator()(const T& x) const {
-    const T kThreshold = static_cast<T>(-64.);
-    return x < kThreshold ? kThreshold : x;
-  }
-};
-
-template <typename DeviceContext, typename T, bool is_test>
-void SoftmaxEigen(const DeviceContext& context, const int axis_dim,
-                  const framework::Tensor* X, framework::Tensor* Y) {
-  constexpr int kBatchDim = 0;
-  constexpr int kClassDim = 1;
-  constexpr int kAxisDim = 1;
-
-  auto logits = EigenMatrix<T>::From(*X);
-  auto softmax = EigenMatrix<T>::From(*Y);
-
-  const int batch_size = logits.dimension(kBatchDim);
-  const int num_classes = logits.dimension(kClassDim);
-  const int num_remain = num_classes / axis_dim;
-
-  Eigen::DSizes<int, 1> along_axis(kAxisDim);
-  Eigen::DSizes<int, 2> batch_classes(batch_size, num_classes);
-  Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-  Eigen::DSizes<int, 2> one_by_class(1, num_classes);
-  Eigen::DSizes<int, 3> batch_one_remain(batch_size, 1, num_remain);
-  Eigen::DSizes<int, 3> one_axis_one(1, axis_dim, 1);
-  Eigen::DSizes<int, 2> one_axis(1, axis_dim);
-  Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
-
-  // For numerical stability, logits should be shifted by maximum number along
-  // axis, calculate shifted_logits into softmax tensor for memory reuse.
-  if (num_remain == 1) {
-    // axis == -1, axis and class in same dimension, calculate along
-    // class dimension directly for higher performance
-    softmax.device(*context.eigen_device()) = (logits -
-                                               logits.maximum(along_axis)
-                                                   .eval()
-                                                   .reshape(batch_by_one)
-                                                   .broadcast(one_by_class))
-                                                  .unaryExpr(ValueClip<T>());
-  } else {
-    // axis != -1, class dimension split into (axis, remain), max and sum
-    // should be calculated along axis dimension
-    softmax.device(*context.eigen_device()) =
-        (logits.reshape(batch_axis_remain) -
-         logits.reshape(batch_axis_remain)
-             .maximum(along_axis)
-             .eval()
-             .reshape(batch_one_remain)
-             .broadcast(one_axis_one)
-             .reshape(batch_classes))
-            .unaryExpr(ValueClip<T>());
-  }
-
-  softmax.device(*context.eigen_device()) = softmax.exp();
-  softmax.device(*context.eigen_device()) = (softmax *
-                                             softmax.reshape(batch_axis_remain)
-                                                 .sum(along_axis)
-                                                 .inverse()
-                                                 .eval()
-                                                 .broadcast(one_axis));
-}
-
-template <typename DeviceContext, typename T, bool is_test, typename Enable>
-void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
-    const DeviceContext& context, const int axis_dim,
-    const framework::Tensor* X, framework::Tensor* Y) {
-  SoftmaxEigen<DeviceContext, T, is_test>(context, axis_dim, X, Y);
-}
-
-template <class DeviceContext>
-using enable_if_CPU = typename std::enable_if<
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type;
-
-template <typename DeviceContext, typename T, bool is_test>
-class SoftmaxFunctor<DeviceContext, T, is_test, enable_if_CPU<DeviceContext>> {
- public:
-  void operator()(const DeviceContext& context, const int axis_dim,
-                  const framework::Tensor* X, framework::Tensor* Y) {
-    auto in_dims = X->dims();
-    constexpr int kBatchDim = 0;
-    constexpr int kClassDim = 1;
-
-    const int num_classes = in_dims[kClassDim];
-    const int batch_size = in_dims[kBatchDim];
-    const int num_remain = num_classes / axis_dim;
-
-    if (num_remain == 1 && platform::MayIUse(platform::avx)) {
-      const T* in_data = X->data<T>();
-      T* out_data = Y->data<T>();
-      for (int bs = 0; bs < batch_size; ++bs) {
-        T max_val = *std::max_element(in_data, in_data + num_classes);
-        max_val *= static_cast<T>(-1);
-        vec_add_bias<T, platform::avx>(num_classes, max_val, in_data, out_data);
-        vec_clip<T, platform::avx>(num_classes, static_cast<T>(-64), out_data,
-                                   out_data);
-        vec_exp<T>(num_classes, out_data, out_data);
-
-        T sum = 0;
-        vec_sum<T, platform::avx>(num_classes, out_data, &sum);
-        sum = static_cast<T>(1) / sum;
-        vec_scal<T, platform::avx>(num_classes, sum, out_data, out_data);
-
-        in_data += num_classes;
-        out_data += num_classes;
-      }
-    } else {
-      SoftmaxEigen<DeviceContext, T, is_test>(context, axis_dim, X, Y);
-    }
-  }
-};
-
-template <typename DeviceContext>
-class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
- public:
-  void operator()(const DeviceContext& context, const int axis_dim,
-                  const framework::Tensor* X, framework::Tensor* Y) {
-    auto in_dims = X->dims();
-    const float* in_data = X->data<float>();
-    float* out_data = Y->data<float>();
-    const int kBatchDim = 0;
-    const int kClassDim = 1;
-    // 2D data. Batch x C
-    auto compute_softmax =
-        jit::KernelFuncs<jit::SoftmaxTuple<float>, platform::CPUPlace>::Cache()
-            .At(in_dims[kClassDim]);
-    compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim],
-                    in_dims[kClassDim] / axis_dim);
-  }
-};
-
-template <typename DeviceContext, typename T>
-void SoftmaxGradEigen(const DeviceContext& context, const int axis_dim,
-                      const framework::Tensor* y,
-                      const framework::Tensor* y_grad,
-                      framework::Tensor* x_grad) {
-  auto softmax = EigenMatrix<T>::From(*y);
-  auto softmax_grad = EigenMatrix<T>::From(*y_grad);
-  auto logits_grad = EigenMatrix<T>::From(*x_grad);
-
-  constexpr int kBatchDim = 0;
-  constexpr int kClassDim = 1;
-
-  const int batch_size = softmax.dimension(kBatchDim);
-  const int num_classes = softmax.dimension(kClassDim);
-  const int num_remain = num_classes / axis_dim;
-
-  Eigen::DSizes<int, 1> along_class(kClassDim);
-  Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-  Eigen::DSizes<int, 2> one_by_class(1, num_classes);
-  Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
-  Eigen::DSizes<int, 2> one_axis(1, axis_dim);
-
-  auto dot = (softmax * softmax_grad)
-                 .reshape(batch_axis_remain)
-                 .sum(along_class)
-                 .eval()
-                 .broadcast(one_axis);
-  logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax;
-}
-
-template <typename DeviceContext, typename T, typename Enable>
-void SoftmaxGradFunctor<DeviceContext, T, Enable>::operator()(
-    const DeviceContext& context, const int axis_dim,
-    const framework::Tensor* y, const framework::Tensor* y_grad,
-    framework::Tensor* x_grad) {
-  SoftmaxGradEigen<DeviceContext, T>(context, axis_dim, y, y_grad, x_grad);
-}
-
-template <typename DeviceContext, typename T>
-class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
- public:
-  void operator()(const DeviceContext& context, const int axis_dim,
-                  const framework::Tensor* y, const framework::Tensor* y_grad,
-                  framework::Tensor* x_grad) {
-    auto out_dims = y->dims();
-    constexpr int kBatchDim = 0;
-    constexpr int kClassDim = 1;
-    const int num_classes = out_dims[kClassDim];
-    const int batch_size = out_dims[kBatchDim];
-    const int num_remain = num_classes / axis_dim;
-
-    if (num_remain == 1 && platform::MayIUse(platform::avx)) {
-      const T* out_data = y->data<T>();
-      const T* out_grad = y_grad->data<T>();
-      T* in_grad = x_grad->data<T>();
-      for (int bs = 0; bs < batch_size; ++bs) {
-        T scalar;
-        vec_mul_reduce<T, platform::avx>(num_classes, out_grad, out_data,
-                                         &scalar);
-        scalar *= static_cast<T>(-1);
-        vec_add_bias<T, platform::avx>(num_classes, scalar, out_grad, in_grad);
-        vec_mul<T, platform::avx>(num_classes, out_data, in_grad, in_grad);
-        out_data += num_classes;
-        out_grad += num_classes;
-        in_grad += num_classes;
-      }
-    } else {
-      SoftmaxGradEigen<DeviceContext, T>(context, axis_dim, y, y_grad, x_grad);
-    }
-  }
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc
deleted file mode 100644
index 05ce5bc7a205ae51ae147450e7c0f23ee0fe28e2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/tree2col.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/math/tree2col.h"
-#include <deque>
-#include <stack>
-
-namespace paddle {
-namespace operators {
-namespace math {
-using Tensor = framework::Tensor;
-std::vector<TreeNode> Tree2ColUtil::construct_patch(
-    size_t root, int max_depth, const std::vector<std::vector<int>> &tr) {
-  std::stack<TreeNode, std::deque<TreeNode>> stack;
-  std::unordered_map<int, bool> visited;
-  std::vector<TreeNode> patch;
-
-  stack.push(TreeNode(root, 1, 1, 0));
-  patch.emplace_back(TreeNode(root, 1, 1, 0));
-  visited[root] = true;
-
-  while (!stack.empty()) {
-    TreeNode &u = stack.top();
-    bool end = true;
-    size_t node = u.get_node(), sz = tr[node].size();
-    visited[node] = true;
-    for (size_t i = 0; i < sz; i++) {
-      size_t v = tr[node][i];
-      if (!visited[v] && static_cast<int>(u.get_depth()) + 1 < max_depth) {
-        visited[v] = true;
-        stack.push(TreeNode(v, i, sz, u.get_depth() + 1));
-        patch.push_back(TreeNode(v, i + 1, sz, u.get_depth() + 1));
-        end = false;
-      }
-    }
-    if (end) {
-      stack.pop();
-    }
-  }
-  return patch;
-}
-
-void Tree2ColUtil::construct_tree(const paddle::Tensor &EdgeSet,
-                                  std::vector<std::vector<int>> *tr,
-                                  size_t *node_count) {
-  auto edge_set_dims = EdgeSet.dims();
-  PADDLE_ENFORCE_EQ(edge_set_dims[1], 2);
-  int64_t edge_count = EdgeSet.numel();
-
-  const int *edge_data = EdgeSet.data<int>();
-
-  for (int64_t i = 0; i < edge_count; i += 2) {
-    int u = edge_data[i], v = edge_data[i + 1];
-    if (u != 0 && v != 0) (*node_count)++;
-  }
-  (*node_count)++;
-
-  tr->resize(static_cast<size_t>(*node_count + 1));
-
-  for (int64_t i = 0; i < edge_count; i += 2) {
-    int u = edge_data[i], v = edge_data[i + 1];
-    if (u != 0 && v != 0) {
-      tr->at(u).push_back(v);
-    } else {
-      break;
-    }
-  }
-}
-
-template <typename T>
-class Tree2ColFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext &context,
-                  const framework::Tensor &EdgeSet,
-                  const framework::Tensor &node_features,
-                  framework::Tensor *patch, int max_depth) {
-    std::vector<std::vector<int>> tr;
-    auto feature_dims = node_features.dims();
-    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
-    math::SetConstant<platform::CPUDeviceContext, T> constant;
-    int64_t feature_size = feature_dims[1];
-    size_t patch_elem_size = 3 * static_cast<size_t>(feature_size);
-    size_t node_count = 0, patch_count = 0, patch_size;
-    Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count);
-    std::vector<std::vector<TreeNode>> processing_list;
-    for (size_t u = 1; u <= node_count; u++) {
-      std::vector<TreeNode> temp_patch =
-          Tree2ColUtil::construct_patch(u, max_depth, tr);
-      if (!temp_patch.empty()) {
-        processing_list.emplace_back(temp_patch);
-      }
-    }
-    patch_size = processing_list.size();
-
-    T *patch_data =
-        patch->mutable_data<T>({static_cast<int64_t>(patch_size),
-                                static_cast<int64_t>(patch_elem_size)},
-                               cpu_place);
-    constant(context, patch, 0);
-    const T *features = node_features.data<T>();
-
-    for (auto &patch_item : processing_list) {
-      size_t pointer_base = patch_count * patch_elem_size;
-      for (auto &v : patch_item) {
-        T eta_l = v.eta_l<T>(max_depth), eta_r = v.eta_r<T>(max_depth),
-          eta_t = v.eta_t<T>(max_depth);
-        size_t id = v.get_node() - 1;
-        for (int i = 0; i < feature_size; i++) {
-          patch_data[pointer_base + i * 3] +=
-              eta_l * features[id * feature_size + i];
-          patch_data[pointer_base + i * 3 + 1] +=
-              eta_r * features[id * feature_size + i];
-          patch_data[pointer_base + i * 3 + 2] +=
-              eta_t * features[id * feature_size + i];
-        }
-      }
-      patch_count++;
-    }
-    patch->Resize({static_cast<int64_t>(patch_count),
-                   static_cast<int64_t>(patch_elem_size)});
-  }
-};
-template <typename T>
-class Col2TreeFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext &context,
-                  const framework::Tensor &EdgeSet,
-                  const framework::Tensor &out_grad, framework::Tensor *in_grad,
-                  int max_depth) {
-    std::vector<std::vector<int>> tr;
-    auto output_dims = out_grad.dims();
-    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
-    math::SetConstant<platform::CPUDeviceContext, T> constant;
-    int64_t output_size = output_dims[1];
-    size_t grad_elem_size = 3 * static_cast<size_t>(output_size);
-    size_t node_count = 0, grad_count = 0;
-    Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count);
-    std::vector<std::vector<TreeNode>> processing_list;
-    std::vector<std::vector<TreeNode>> grad_list;
-    grad_list.resize(node_count);
-    for (size_t u = 1; u <= node_count; u++) {
-      std::vector<TreeNode> tmp =
-          Tree2ColUtil::construct_patch(u, max_depth, tr);
-      if (!tmp.empty()) {
-        processing_list.push_back(tmp);
-      }
-    }
-    for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) {
-      for (auto v : processing_list[patch_id]) {
-        grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1));
-      }
-    }
-    T *grad_data =
-        in_grad->mutable_data<T>({static_cast<int64_t>(node_count),
-                                  static_cast<int64_t>(grad_elem_size)},
-                                 cpu_place);
-
-    constant(context, in_grad, 0);
-    const T *out_g = out_grad.data<T>();
-    for (auto &patch_item : grad_list) {
-      size_t pointer_base = grad_count * grad_elem_size;
-      for (auto &v : patch_item) {
-        T eta_l = v.eta_l<T>(max_depth), eta_r = v.eta_r<T>(max_depth),
-          eta_t = v.eta_t<T>(max_depth);
-        size_t id = v.get_node() - 1;
-        for (int i = 0; i < output_size; i++) {
-          grad_data[pointer_base + i * 3] +=
-              eta_l * out_g[id * output_size + i];
-          grad_data[pointer_base + i * 3 + 1] +=
-              eta_r * out_g[id * output_size + i];
-          grad_data[pointer_base + i * 3 + 2] +=
-              eta_t * out_g[id * output_size + i];
-        }
-      }
-      grad_count++;
-    }
-  }
-};
-
-template class Tree2ColFunctor<platform::CPUDeviceContext, float>;
-template class Tree2ColFunctor<platform::CPUDeviceContext, double>;
-template class Col2TreeFunctor<platform::CPUDeviceContext, float>;
-template class Col2TreeFunctor<platform::CPUDeviceContext, double>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu
deleted file mode 100644
index 3c50a525c2eac440ace3cb1d87af6abb3c5a9628..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/tree2col.cu
+++ /dev/null
@@ -1,208 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stack>
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/tree2col.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-using Tensor = framework::Tensor;
-using Node = paddle::operators::math::TreeNode;
-template <typename T>
-__global__ void tree2col(const T* eta, const int* node, const int* index,
-                         const T* vectors, T* result, int feature_size, int n) {
-  const int thread_id =
-      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  const int patch_id = thread_id / feature_size;
-  const int j = thread_id % feature_size;
-  if (patch_id < n) {
-    const int begin_o = patch_id * 3 * feature_size;
-    const int begin = index[patch_id * 2], end = index[patch_id * 2 + 1];
-    T res_l = 0, res_r = 0, res_t = 0;
-    for (int i = begin; i < end; i++) {
-      const int id = node[i];
-      const T vec = vectors[id * feature_size + j];
-      res_l += eta[i * 3] * vec;
-      res_r += eta[i * 3 + 1] * vec;
-      res_t += eta[i * 3 + 2] * vec;
-    }
-    result[begin_o + j * 3] = res_l;
-    result[begin_o + j * 3 + 1] = res_r;
-    result[begin_o + j * 3 + 2] = res_t;
-  }
-}
-template <typename T>
-class Tree2ColFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const paddle::platform::CUDADeviceContext& context,
-                  const framework::Tensor& EdgeSet,
-                  const framework::Tensor& node_features,
-                  framework::Tensor* patch, int max_depth) {
-    std::vector<std::vector<int>> tr;
-    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
-    auto cpu_place = platform::CPUPlace();
-    auto stream = context.stream();
-    auto feature_dims = node_features.dims();
-    math::SetConstant<platform::CUDADeviceContext, T> constant;
-
-    Tensor EdgeSet_cpu;
-    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
-    int64_t feature_size = feature_dims[1];
-    size_t patch_elem_size = 3 * static_cast<size_t>(feature_size);
-    size_t node_count = 0, patch_count = 0, total_size = 0;
-    size_t max_size = feature_dims[0];
-    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);
-
-    std::vector<std::vector<Node>> processing_list;
-    for (size_t u = 1; u <= node_count; u++) {
-      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
-      if (!tmp.empty()) {
-        processing_list.push_back(tmp);
-        total_size += tmp.size();
-      }
-    }
-
-    size_t patch_size = processing_list.size();
-    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
-    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
-                                           cpu_place);
-    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
-                                     cpu_place);
-    int* index = index_cpu.mutable_data<int>(
-        {static_cast<int64_t>(patch_size * 2)}, cpu_place);
-
-    int idx = 0, index_idx = 0;
-    for (auto& tmp : processing_list) {
-      index[index_idx++] = idx;
-      for (auto& v : tmp) {
-        node[idx] = static_cast<int>(v.node - 1);
-        eta[idx * 3] = v.eta_l<T>(max_depth);
-        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
-        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
-        idx++;
-      }
-      index[index_idx++] = idx;
-    }
-    framework::TensorCopy(node_cpu, gpu_place, context, &node_gpu);
-    framework::TensorCopy(eta_cpu, gpu_place, context, &eta_gpu);
-    framework::TensorCopy(index_cpu, gpu_place, context, &index_gpu);
-
-    int elem_size = patch_size * feature_size;
-    int blocks = (elem_size + 1024 - 1) / 1024;
-    int block_x = 512;
-    int block_y = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(block_x, block_y);
-
-    patch->mutable_data<T>(
-        {static_cast<int64_t>(max_size), static_cast<int64_t>(patch_elem_size)},
-        gpu_place);
-    constant(context, patch, 0);
-    tree2col<T><<<grid, threads, 0, stream>>>(
-        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
-        node_features.data<T>(), patch->data<T>(), feature_size, patch_size);
-  }
-};
-template <typename T>
-class Col2TreeFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& EdgeSet,
-                  const framework::Tensor& patch_grad,
-                  framework::Tensor* embedding_grad, int max_depth) {
-    std::vector<std::vector<int>> tr;
-    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
-    auto cpu_place = platform::CPUPlace();
-    auto stream = context.stream();
-    auto output_dims = patch_grad.dims();
-    math::SetConstant<platform::CUDADeviceContext, T> constant;
-
-    Tensor EdgeSet_cpu;
-    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
-    int64_t output_size = output_dims[1];
-    size_t patch_elem_size = 3 * static_cast<size_t>(output_size);
-    size_t node_count = 0, patch_count = 0;
-    size_t max_size = output_dims[0];
-    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);
-    std::vector<std::vector<Node>> processing_list;
-    std::vector<std::vector<Node>> grad_list;
-    grad_list.resize(node_count);
-    size_t total_size = 0, grad_size = node_count;
-    for (size_t u = 1; u <= node_count; u++) {
-      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
-      if (!tmp.empty()) {
-        processing_list.push_back(tmp);
-      }
-    }
-    for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) {
-      for (auto v : processing_list[patch_id]) {
-        grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1));
-      }
-    }
-    for (auto& tmp : grad_list) {
-      total_size += tmp.size();
-    }
-
-    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
-    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
-                                           cpu_place);
-    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
-                                     cpu_place);
-    int* index = index_cpu.mutable_data<int>(
-        {static_cast<int64_t>(grad_size * 2)}, cpu_place);
-
-    size_t idx = 0, index_idx = 0;
-    for (auto& tmp : grad_list) {
-      index[index_idx++] = idx;
-      for (auto& v : tmp) {
-        node[idx] = static_cast<int>(v.node - 1);
-        eta[idx * 3] = v.eta_l<T>(max_depth);
-        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
-        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
-        idx++;
-      }
-      index[index_idx++] = idx;
-    }
-    framework::TensorCopy(node_cpu, gpu_place, &node_gpu);
-    framework::TensorCopy(eta_cpu, gpu_place, &eta_gpu);
-    framework::TensorCopy(index_cpu, gpu_place, &index_gpu);
-
-    int elem_size = output_size * grad_size;
-    int blocks = (elem_size + 1024 - 1) / 1024;
-    int block_x = 512;
-    int block_y = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(block_x, block_y);
-
-    embedding_grad->mutable_data<T>(
-        {static_cast<int64_t>(max_size), static_cast<int64_t>(patch_elem_size)},
-        gpu_place);
-
-    constant(context, embedding_grad, 0);
-    tree2col<T><<<grid, threads, 0, stream>>>(
-        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
-        patch_grad.data<T>(), embedding_grad->data<T>(), output_size,
-        grad_size);
-  }
-};
-
-template class Tree2ColFunctor<platform::CUDADeviceContext, float>;
-template class Tree2ColFunctor<platform::CUDADeviceContext, double>;
-template class Col2TreeFunctor<platform::CUDADeviceContext, float>;
-template class Col2TreeFunctor<platform::CUDADeviceContext, double>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/tree2col.h b/paddle/fluid/operators/math/tree2col.h
deleted file mode 100644
index 478ba78e259d327cc440f34161c8cf476109bb8c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/tree2col.h
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <array>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-namespace operators {
-namespace math {
-class TreeNode {
- public:
-  size_t node;
-  explicit TreeNode(size_t node = 0, size_t index = 0, size_t pclen = 0,
-                    size_t depth = 0)
-      : node(node), index(index), pclen(pclen), depth(depth) {}
-  template <typename T>
-  T eta_t(T filter_depth) {
-    return ((filter_depth - this->depth) / filter_depth);
-  }
-  template <typename T>
-  T eta_l(T filter_depth) {
-    T temp;
-    if (this->pclen == 1) {
-      temp = 0.5;
-    } else {
-      temp = (this->index - 1.0) / (this->pclen - 1.0);
-    }
-    return (1.0 - this->eta_t<T>(filter_depth)) * temp;
-  }
-  template <typename T>
-  T eta_r(T filter_depth) {
-    return (1.0 - this->eta_t<T>(filter_depth)) *
-           (1.0 - this->eta_l<T>(filter_depth));
-  }
-  TreeNode change_node(size_t v) {
-    return TreeNode(v, this->index, this->pclen, this->depth);
-  }
-  size_t get_node() { return this->node; }
-  size_t get_depth() { return this->depth; }
-
- private:
-  size_t index, pclen, depth;
-};
-class Tree2ColUtil {
- public:
-  static std::vector<TreeNode> construct_patch(
-      size_t root, int max_depth, const std::vector<std::vector<int>> &tr);
-
-  static void construct_tree(const Tensor &EdgeSet,
-                             std::vector<std::vector<int>> *tr,
-                             size_t *node_count);
-};
-
-template <typename DeviceContext, typename T>
-class Tree2ColFunctor {
- public:
-  void operator()(const DeviceContext &context,
-                  const framework::Tensor &EdgeSet,
-                  const framework::Tensor &node_features,
-                  framework::Tensor *patch, int max_depth);
-};
-template <typename DeviceContext, typename T>
-class Col2TreeFunctor {
- public:
-  void operator()(const DeviceContext &context,
-                  const framework::Tensor &EdgeSet,
-                  const framework::Tensor &out_grad, framework::Tensor *in_grad,
-                  int max_depth);
-};
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc
deleted file mode 100644
index 13f0845bb8579615381c06072eb1e32507ffd3cf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/unpooling.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/unpooling.h"
-namespace paddle {
-namespace operators {
-namespace math {
-template <typename T>
-class Unpool2dMaxFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& indices, framework::Tensor* output) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    int input_feasize = input_height * input_width;
-    int output_feasize = output_height * output_width;
-    const T* input_data = input.data<T>();
-    const int* indices_data = indices.data<int>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    for (int b = 0; b < batch_size; ++b) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int i = 0; i < input_feasize; ++i) {
-          int index = indices_data[i];
-          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
-          output_data[index] = input_data[i];
-        }
-        input_data += input_feasize;
-        indices_data += input_feasize;
-        output_data += output_feasize;
-      }
-    }
-  }
-};
-template <class T>
-class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& indices,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
-    int input_feasize = input_height * input_width;
-    int output_feasize = output_height * output_width;
-    const int* indices_data = indices.data<int>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-
-    for (int b = 0; b < batch_size; ++b) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int i = 0; i < input_feasize; ++i) {
-          int index = indices_data[i];
-          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
-          input_grad_data[i] = output_grad_data[index];
-        }
-        input_grad_data += input_feasize;
-        indices_data += input_feasize;
-        output_grad_data += output_feasize;
-      }
-    }
-  }
-};
-template class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, float>;
-template class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, double>;
-template class Unpool2dMaxFunctor<platform::CPUDeviceContext, float>;
-template class Unpool2dMaxFunctor<platform::CPUDeviceContext, double>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu
deleted file mode 100644
index d78e3385efb29cbba540d50433bf0fe35cedd448..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/unpooling.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/unpooling.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-template <typename T>
-__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
-                                  const int* indices_data,
-                                  const int input_height, const int input_width,
-                                  const int channels, T* output_data,
-                                  const int output_height,
-                                  const int output_width) {
-  int in_n_stride = input_height * input_width * channels;
-  int in_c_stride = input_height * input_width;
-  int out_n_stride = output_height * output_width * channels;
-  int out_c_stride = output_height * output_width;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (int i = index; i < nthreads; i += offset) {
-    int bidx = i / in_n_stride;
-    int boffset = i % in_n_stride;
-    int cidx = boffset / in_c_stride;
-    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
-    int out_index = indices_data[i];
-    PADDLE_ENFORCE(out_index < out_c_stride,
-                   "out_index < out_c_stride. Expected %ld < %ld, but got "
-                   "%ld >= %ld. Please check input value.",
-                   out_index, out_c_stride, out_index, out_c_stride);
-    output_data[out_offset + out_index] = input_data[i];
-  }
-}
-template <typename T>
-__global__ void KernelUnpool2dMaxGrad(
-    const int nthreads, const T* input_data, const int* indices_data,
-    const int input_height, const int input_width, const int channels,
-    const T* output_data, const T* output_grad, const int output_height,
-    const int output_width, T* input_grad) {
-  int in_n_stride = input_height * input_width * channels;
-  int in_c_stride = input_height * input_width;
-  int out_n_stride = output_height * output_width * channels;
-  int out_c_stride = output_height * output_width;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (int i = index; i < nthreads; i += offset) {
-    int bidx = i / in_n_stride;
-    int boffset = i % in_n_stride;
-    int cidx = boffset / in_c_stride;
-    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
-    int out_index = indices_data[i];
-    PADDLE_ENFORCE(out_index < out_c_stride,
-                   "out_index < out_c_stride. Expected %ld < %ld, but got "
-                   "%ld >= %ld. Please check input value.",
-                   out_index, out_c_stride, out_index, out_c_stride);
-    input_grad[i] = output_grad[out_offset + out_index];
-  }
-}
-/*
- * All tensors are in NCHW format.
- */
-template <typename T>
-class Unpool2dMaxFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& indices, framework::Tensor* output) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    const T* input_data = input.data<T>();
-    const int* indices_data = indices.data<int>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int threads = 1024;
-    int grid = (input.numel() + threads - 1) / threads;
-    KernelUnpool2dMax<T><<<grid, threads, 0, context.stream()>>>(
-        input.numel(), input_data, indices_data, input_height, input_width,
-        output_channels, output_data, output_height, output_width);
-  }
-};
-/*
- * All tensors are in NCHW format.
- */
-template <typename T>
-class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& indices,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  framework::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
-    const T* input_data = input.data<T>();
-    const int* indices_data = indices.data<int>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-    int threads = 1024;
-    int grid = (input.numel() + threads - 1) / threads;
-    KernelUnpool2dMaxGrad<T><<<grid, threads, 0, context.stream()>>>(
-        input.numel(), input_data, indices_data, input_height, input_width,
-        output_channels, output_data, output_grad_data, output_height,
-        output_width, input_grad_data);
-  }
-};
-template class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, float>;
-template class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, double>;
-template class Unpool2dMaxFunctor<platform::CUDADeviceContext, float>;
-template class Unpool2dMaxFunctor<platform::CUDADeviceContext, double>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/unpooling.h b/paddle/fluid/operators/math/unpooling.h
deleted file mode 100644
index 74ca39d114e266be3ed8278f5e941d685c7b0d3d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/unpooling.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-template <typename DeviceContext, typename T>
-class Unpool2dMaxFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& indices, framework::Tensor* output);
-};
-template <typename DeviceContext, class T>
-class Unpool2dMaxGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& indices,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  framework::Tensor* input_grad);
-};
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc
deleted file mode 100644
index e92adc09ba01b032aba8eba94bcb4ba96524c641..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/vol2col.cc
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/vol2col.h"
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * vol = [input_channels, input_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *                    output_depth, output_height, output_width]
- */
-template <class T>
-class Vol2ColFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& vol,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* col) const {
-    PADDLE_ENFORCE(vol.dims().size() == 4);
-    PADDLE_ENFORCE(col->dims().size() == 7);
-
-    int input_channels = vol.dims()[0];
-    int input_depth = vol.dims()[1];
-    int input_height = vol.dims()[2];
-    int input_width = vol.dims()[3];
-    int filter_depth = col->dims()[1];
-    int filter_height = col->dims()[2];
-    int filter_width = col->dims()[3];
-    int output_depth = col->dims()[4];
-    int output_height = col->dims()[5];
-    int output_width = col->dims()[6];
-    int channels_col =
-        input_channels * filter_depth * filter_height * filter_width;
-
-    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
-
-    const T* vol_data = vol.data<T>();
-    T* col_data = col->data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset = (c / filter_width / filter_height) % filter_depth;
-      int c_in = c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
-
-            int col_idx =
-                ((c * output_depth + d) * output_height + h) * output_width + w;
-            int vol_idx =
-                ((c_in * input_depth + d_pad) * input_height + h_pad) *
-                    input_width +
-                w_pad;
-            col_data[col_idx] =
-                (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
-                 w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
-                    ? static_cast<T>(0)
-                    : vol_data[vol_idx];
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * vol = [input_channels,input_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *                    output_depth, output_height, output_width]
- */
-template <class T>
-class Col2VolFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& col,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* vol) const {
-    PADDLE_ENFORCE(vol->dims().size() == 4);
-    PADDLE_ENFORCE(col.dims().size() == 7);
-
-    int input_channels = vol->dims()[0];
-    int input_depth = vol->dims()[1];
-    int input_height = vol->dims()[2];
-    int input_width = vol->dims()[3];
-    int filter_depth = col.dims()[1];
-    int filter_height = col.dims()[2];
-    int filter_width = col.dims()[3];
-    int output_depth = col.dims()[4];
-    int output_height = col.dims()[5];
-    int output_width = col.dims()[6];
-    int channels_col =
-        input_channels * filter_depth * filter_height * filter_width;
-
-    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
-    T* vol_data = vol->data<T>();
-    const T* col_data = col.data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset = (c / filter_width / filter_height) % filter_depth;
-      int cIm = c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
-
-            if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
-                w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
-              int vol_idx =
-                  ((cIm * input_depth + d_pad) * input_height + h_pad) *
-                      input_width +
-                  w_pad;
-
-              int col_idx =
-                  ((c * output_depth + d) * output_height + h) * output_width +
-                  w;
-              vol_data[vol_idx] += col_data[col_idx];
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
-template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
-template class Col2VolFunctor<platform::CPUDeviceContext, float>;
-template class Col2VolFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu
deleted file mode 100644
index 25d8a247bca3ed7b77722421456bf434f396e8d1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/vol2col.cu
+++ /dev/null
@@ -1,264 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/operators/math/vol2col.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <class T>
-__global__ void vol2col(int num_kernels, const T* data_vol, int depth,
-                        int height, int width, int dilation_d, int dilation_h,
-                        int dilation_w, int filter_depth, int filter_height,
-                        int filter_width, int stride_depth, int stride_height,
-                        int stride_width, int padding_depth, int padding_height,
-                        int padding_width, int output_detph, int output_height,
-                        int output_width, T* data_col) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
-       index += blockDim.x * gridDim.x) {
-    int w_out = index % output_width;
-    int h_out = (index / output_width) % output_height;
-    int d_out = (index / output_width / output_height) % output_detph;
-    int channel_in = index / output_width / output_height / output_detph;
-    int channel_out = channel_in * filter_depth * filter_height * filter_width;
-    int w_in = w_out * stride_width - padding_width;
-    int h_in = h_out * stride_height - padding_height;
-    int d_in = d_out * stride_depth - padding_depth;
-
-    data_col += ((channel_out * output_detph + d_out) * output_height + h_out) *
-                    output_width +
-                w_out;
-    data_vol += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
-    for (int k = 0; k < filter_depth; ++k) {
-      for (int i = 0; i < filter_height; ++i) {
-        for (int j = 0; j < filter_width; ++j) {
-          int d = d_in + k * dilation_d;
-          int h = h_in + i * dilation_h;
-          int w = w_in + j * dilation_w;
-          int col_idx = (k * dilation_d * height + i * dilation_h) * width +
-                        j * dilation_w;
-          *data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
-                       w < width)
-                          ? data_vol[col_idx]
-                          : 0;
-          data_col += output_detph * output_height * output_width;
-        }
-      }
-    }
-  }
-}
-
-/*
- * im = [input_channels,intpu_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *                    output_depth, output_height, output_width]
- */
-template <class T>
-class Vol2ColFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& vol,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* col) const {
-    PADDLE_ENFORCE_EQ(vol.dims().size(), 4);
-    PADDLE_ENFORCE_EQ(col->dims().size(), 7);
-
-    int input_channels = vol.dims()[0];
-    int input_depth = vol.dims()[1];
-    int input_height = vol.dims()[2];
-    int input_width = vol.dims()[3];
-    int filter_depth = col->dims()[1];
-    int filter_height = col->dims()[2];
-    int filter_width = col->dims()[3];
-    int output_depth = col->dims()[4];
-    int output_height = col->dims()[5];
-    int output_width = col->dims()[6];
-
-    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "Mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "Mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "Mismatching.");
-
-    int num_outputs =
-        input_channels * output_depth * output_height * output_width;
-
-    const int threads = 1024;
-    const int blocks = (num_outputs + 1024 - 1) / 1024;
-    vol2col<T><<<blocks, threads, 0, context.stream()>>>(
-        num_outputs, vol.data<T>(), input_depth, input_height, input_width,
-        dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
-        filter_width, strides[0], strides[1], strides[2], paddings[0],
-        paddings[1], paddings[2], output_depth, output_height, output_width,
-        col->data<T>());
-  }
-};
-
-template <class T>
-__global__ void col2vol(int num_kernels, const T* data_col, int depth,
-                        int height, int width, int dilation_d, int dilation_h,
-                        int dilation_w, int filter_depth, int filter_height,
-                        int filter_width, int stride_depth, int stride_height,
-                        int stride_width, int padding_depth, int padding_height,
-                        int padding_width, int output_detph, int output_height,
-                        int output_width, T* data_vol) {
-  const int d_filter_depth = dilation_d * (filter_depth - 1) + 1;
-  const int d_filter_height = dilation_h * (filter_height - 1) + 1;
-  const int d_filter_width = dilation_w * (filter_width - 1) + 1;
-
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
-       index += blockDim.x * gridDim.x) {
-    T src_val = 0;
-    int w = index % width + padding_width;
-    int h = (index / width) % height + padding_height;
-    int d = (index / width / height) % depth + padding_depth;
-    int c = index / width / height / depth;
-
-    // compute the start and end of the output
-    int w_col_start =
-        (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1;
-    int w_col_end = min(w / stride_width + 1, output_width);
-    int h_col_start =
-        (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1;
-    int h_col_end = min(h / stride_height + 1, output_height);
-    int d_col_start =
-        (d < d_filter_depth) ? 0 : (d - d_filter_depth) / stride_depth + 1;
-    int d_col_end = min(d / stride_depth + 1, output_detph);
-
-    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          int d_off = (d - d_col * stride_depth);
-          int h_off = (h - h_col * stride_height);
-          int w_off = (w - w_col * stride_width);
-          if (d_off % dilation_d == 0 && h_off % dilation_h == 0 &&
-              w_off % dilation_w == 0) {
-            d_off /= dilation_d;
-            h_off /= dilation_h;
-            w_off /= dilation_w;
-
-            int data_col_index =
-                (((((c * filter_depth + d_off) * filter_height + h_off) *
-                       filter_width +
-                   w_off)));
-            data_col_index =
-                ((data_col_index * output_detph + d_col) * output_height +
-                 h_col) *
-                    output_width +
-                w_col;
-            src_val += data_col[data_col_index];
-          }
-        }
-      }
-    }
-    data_vol[index] = src_val;
-  }
-}
-
-/*
- * im = [input_channels, input_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *                    output_depth, output_height, output_width]
- */
-template <class T>
-class Col2VolFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& col,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* vol) const {
-    PADDLE_ENFORCE_EQ(vol->dims().size(), 4);
-    PADDLE_ENFORCE_EQ(col.dims().size(), 7);
-
-    int input_channels = vol->dims()[0];
-    int input_depth = vol->dims()[1];
-    int input_height = vol->dims()[2];
-    int input_width = vol->dims()[3];
-    int filter_depth = col.dims()[1];
-    int filter_height = col.dims()[2];
-    int filter_width = col.dims()[3];
-    int output_depth = col.dims()[4];
-    int output_height = col.dims()[5];
-    int output_width = col.dims()[6];
-
-    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "Mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "Mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "Mismatching.");
-
-    int num_kernels = input_channels * input_depth * input_height * input_width;
-
-    const int threads = 1024;
-    const int blocks = (num_kernels + 1024 - 1) / 1024;
-
-    col2vol<T><<<blocks, threads, 0, context.stream()>>>(
-        num_kernels, col.data<T>(), input_depth, input_height, input_width,
-        dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
-        filter_width, strides[0], strides[1], strides[2], paddings[0],
-        paddings[1], paddings[2], output_depth, output_height, output_width,
-        vol->data<T>());
-  }
-};
-
-template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
-template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
-template class Col2VolFunctor<platform::CUDADeviceContext, float>;
-template class Col2VolFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h
deleted file mode 100644
index 5f59de8f02a52209a3901ca03680eb2d0dbc2658..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/vol2col.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-/*
- * \brief Converts the feature data of four dimensions(CDHW) into a colData of
- *        seven dimensions in the Vol2ColFunctor calculation,
- *        And in the Col2VolFunctor calculation, it is reversed.
- *
- * \param volData   Vol data.
- * \param volShape  The shape of volData,
- *                 [input_channels, input_depth, input_height, input_width].
- * \param colData  Column data.
- * \param colShape The shape of colData.
- *
- * \param dilations    dilation data.
- * \param 3-dimension  [dilation_depth, dilation_height, dilation_width].
- *
- * \param strides      stride data.
- * \param 3-dimension  [stride_depth, stride_height, stride_width].
- *
- * \param paddings     padding data.
- * \param 3-dimension  [d_pad, h_pad, w_pad].
- *
- * The shape of colData is:
- * [input_channels, filter_depth, filter_height, filter_width, output_depth,
- * output_height, output_width]
- * So, it is easy to reshape into a convolution matrix for convolution
- * calculation based on matrix multiplication.
- * The shape of convolution matrix is [height, width], where the height is equal
- * input_channels * filter_depth * filter_height * filter_width, and the width
- * is equal output_depth * output_height * output_width.
- *
- * Reshape:
- *     shape of colData           shape of convolution matrix
- *     [input_channels,
- *      filter_depth,
- *      filter_height,
- *      filter_width,      ======>      [height, width]
- *      output_depth,
- *      output_height,
- *      output_width]
- *
- * \note The caller needs to ensure that volShape.inputChannels is equal to
- *       colShape.inputChannels.
- */
-template <typename DeviceContext, typename T>
-class Vol2ColFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& vol,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* col) const;
-};
-
-template <typename DeviceContext, typename T>
-class Col2VolFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& col,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* vol) const;
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc
deleted file mode 100644
index aa979c4f10907e604758c3e2cfb776cb994c9ceb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/vol2col_test.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/vol2col.h"
-#include <gtest/gtest.h>
-#include <iostream>
-#include <vector>
-
-template <typename DeviceContext, typename Place>
-void testVol2col() {
-  paddle::framework::Tensor input;
-  paddle::framework::Tensor input_tmp;
-  paddle::framework::Tensor output;
-  paddle::framework::Tensor output_tmp;
-
-  auto* place = new Place();
-  DeviceContext* context = new DeviceContext(*place);
-
-  /**
-   * input = [[0, 1, 2,
-   *          3, 4, 5]
-   *          [6, 7, 8,
-   *          9, 10, 11]]
-   *
-   * output = [0, 1
-   *           1, 2
-   *           3, 4
-   *           4, 5
-   *           6, 7
-   *           7, 8
-   *           9, 10
-   *           10, 11]
-   *
-   * col2vol = [[0, 2, 2,
-   *             3, 8, 5]
-   *            [6, 14, 8,
-   *             9, 20, 11]]
-   *
-   */
-  int input_depth = 2;
-  int input_height = 2;
-  int input_width = 3;
-  int filter_size = 2;
-  std::vector<int> strides({1, 1, 1});
-  std::vector<int> paddings({0, 0, 0});
-  std::vector<int> dilations({1, 1, 1});
-  int output_depth =
-      (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
-  int output_height =
-      (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
-  int output_width =
-      (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
-
-  // Vol2Col test
-  float* input_ptr =
-      input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
-                                    paddle::platform::CPUPlace());
-  float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  memcpy(input_ptr, arr, 12 * sizeof(float));
-
-  if (paddle::platform::is_cpu_place(*place)) {
-    input = input_tmp;
-  } else {
-    paddle::framework::TensorCopySync(input_tmp, *place, &input);
-  }
-  output.mutable_data<float>({1, filter_size, filter_size, filter_size,
-                              output_depth, output_height, output_width},
-                             *place);
-
-  paddle::operators::math::Vol2ColFunctor<DeviceContext, float> vol2col;
-  vol2col(*context, input, dilations, strides, paddings, &output);
-
-  float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
-  float* out_cfo_ptr;
-  if (paddle::platform::is_cpu_place(*place)) {
-    out_cfo_ptr = output.data<float>();
-  } else {
-    TensorCopySync(output, paddle::platform::CPUPlace(), &output_tmp);
-    out_cfo_ptr = output_tmp.data<float>();
-  }
-
-  for (int i = 0; i < 16; ++i) {
-    EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
-  }
-
-  // Col2Vol test
-  float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
-  memset(input_ptr, 0, 12 * sizeof(float));
-  if (paddle::platform::is_cpu_place(*place)) {
-    input = input_tmp;
-  } else {
-    TensorCopySync(input_tmp, *place, &input);
-  }
-
-  paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
-  col2vol(*context, output, dilations, strides, paddings, &input);
-
-  float* in_ptr;
-  if (paddle::platform::is_cpu_place(*place)) {
-    in_ptr = input.data<float>();
-  } else {
-    TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
-    in_ptr = input_tmp.data<float>();
-  }
-
-  for (int i = 0; i < 12; ++i) {
-    EXPECT_EQ(in_ptr[i], col_2_vol[i]);
-  }
-}
-
-TEST(math, vol2col) {
-  testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
-#ifdef PADDLE_WITH_CUDA
-  testVol2col<paddle::platform::CUDADeviceContext,
-              paddle::platform::CUDAPlace>();
-#endif  // PADDLE_WITH_CUDA
-}
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
deleted file mode 100644
index eb43f43daf446a4b8ca872f89e4dcb18ff7323d8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/matmul_op.cc
+++ /dev/null
@@ -1,486 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-/**
- * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
- * original x_dim is returned.
- */
-static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
-  if (x_dim.size() > 1) {
-    return x_dim;
-  }
-  return framework::make_ddim({1, x_dim[0]});
-}
-
-/**
- * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the
- * original y_dim is returned.
- */
-static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
-  if (y_dim.size() > 1) {
-    return y_dim;
-  }
-  return framework::make_ddim({y_dim[0], 1});
-}
-
-template <typename DeviceContext, typename T>
-class MatMulKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto &x =
-        detail::Ref(context.Input<framework::Tensor>("X"), "Cannot find X");
-    auto &y =
-        detail::Ref(context.Input<framework::Tensor>("Y"), "Cannot find Y");
-    auto *out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-    auto mat_dim_a = math::CreateMatrixDescriptor(
-        RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
-    auto mat_dim_b = math::CreateMatrixDescriptor(
-        ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y"));
-    auto scale = static_cast<T>(context.Attr<float>("alpha"));
-
-#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
-    int head_number = context.Attr<int>("head_number");
-    bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_);
-
-    if (head_number > 1) {
-      blas.MatMulWithHead(x, mat_dim_a, y, mat_dim_b, scale, head_number, out,
-                          T(0), split_vertical_y);
-    } else {
-      blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0));
-    }
-#else
-    blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0));
-#endif
-  }
-};
-
-// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
-// Identity op if the tensor is not of rank 3.
-static framework::Tensor FoldInitDims(const framework::Tensor &input) {
-  auto output = input;
-  auto in_dims = input.dims();
-  if (in_dims.size() == 3) {
-    output.Resize({in_dims[0] * in_dims[1], in_dims[2]});
-  }
-  return output;
-}
-
-// Reshape a rank-3 tensor from P x M x N to M x (P * N).
-// (Warning: This requires transposing data and writes into new memory.)
-// Identity op if the tensor is not of rank 3.
-template <typename DeviceContext, typename T>
-static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context,
-                                             const framework::Tensor &input) {
-  auto in_dims = input.dims();
-  if (in_dims.size() != 3) {
-    return input;
-  }
-  framework::Tensor output;
-  output.Resize({in_dims[1], in_dims[0], in_dims[2]});
-  output.mutable_data<T>(context.GetPlace());
-  std::vector<int> axis = {1, 0, 2};
-  math::Transpose<DeviceContext, T, 3> trans;
-  trans(context, input, &output, axis);
-  output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
-
-  return output;
-}
-
-/**
- * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor.
- *
- * The shape would be [BatchSize, H, W] or [H, W].
- * If transposed, `H,W` will be swapped.
- */
-static void ReshapeTensorIntoMatrixSequence(
-    framework::Tensor *x, const math::MatDescriptor &descriptor) {
-  int64_t h, w;
-  h = descriptor.height_;
-  w = descriptor.width_;
-  if (descriptor.trans_) {
-    std::swap(w, h);
-  }
-  if (descriptor.batch_size_) {
-    x->Resize({descriptor.batch_size_, h, w});
-  } else {
-    x->Resize({h, w});
-  }
-}
-
-/**
- * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor
- * Out = matmul(x, y)
- *
- * This method will first calculate X,Y matrix sequence, and then calculate
- * the out shape.
- *
- * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2]
- * The out = [BatchSize, H1, W2]
- *
- * If there is no batch size in `X` and `Y`, the out will be [H1, W2]
- * If any of `X` and `Y` has batch size BatchSize, the out will have the
- * BatchSize.
- */
-static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
-                                           framework::Tensor *y,
-                                           framework::Tensor *out, bool trans_x,
-                                           bool trans_y) {
-  auto x_dim = RowMatrixFromVector(x->dims());
-  auto y_dim = ColumnMatrixFromVector(y->dims());
-  auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x);
-  auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y);
-  if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
-    out->Resize({mat_dim_x.height_, mat_dim_y.width_});
-  } else {
-    out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
-                 mat_dim_x.height_, mat_dim_y.width_});
-  }
-
-  ReshapeTensorIntoMatrixSequence(x, mat_dim_x);
-  ReshapeTensorIntoMatrixSequence(y, mat_dim_y);
-}
-
-// Using dimensional constraints on matrix multiplication, it is
-// straight-forward to check the following table for when X and Y
-// are both matrices.
-//
-// transpose_X | False    | True     | False    | True
-// transpose_Y | False    | False    | True     | True
-// -----------+----------+----------+----------+-----------
-//        dX = | dOut Y^T | Y dOut^T | dOut Y   | Y^T dOut^T
-//        dY = | X^T dOut | X dOut   | dOut^T X | dOut^T X^T
-//
-// When X is a vector of size K, we treat it instead as a matrix of shape
-// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
-// a matrix of shape (K, 1).
-//
-// When X and Y are both 3-dimensional tensors, then the first dimension
-// the batch dimension can be ignored and the exact same formulas apply
-// as for two matrices.
-//
-// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
-// up with formulas like
-//
-//   dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
-//
-// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
-// to X: (P * M) x K, dOut: (P * M) x N.
-template <typename DeviceContext, typename T>
-class MatMulGradKernel : public framework::OpKernel<T> {
- public:
-  void MatMul(const framework::ExecutionContext &context,
-              const framework::Tensor &a, bool trans_a,
-              const framework::Tensor &b, bool trans_b,
-              framework::Tensor *out) const {
-    out->mutable_data<T>(context.GetPlace());
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-    auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
-    auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
-    blas.MatMul(a, mat_dim_a, b, mat_dim_b,
-                static_cast<T>(context.Attr<float>("alpha")), out, T(0));
-  }
-
-  void CalcInputGrad(const framework::ExecutionContext &context,
-                     const framework::Tensor &a, bool trans_a,
-                     bool is_fold_init_dims_a, const framework::Tensor &b,
-                     bool trans_b, bool is_fold_init_dims_b,
-                     framework::Tensor *out) const {
-    if (out == nullptr) return;
-    bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
-                        out->dims().size() == 2;
-    if (!need_combine) {
-      MatMul(context, a, trans_a, b, trans_b, out);
-    } else {
-      auto &ctx = context.template device_context<DeviceContext>();
-      MatMul(context, is_fold_init_dims_a
-                          ? FoldInitDims(a)
-                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, a),
-             trans_a, is_fold_init_dims_b
-                          ? FoldInitDims(b)
-                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, b),
-             trans_b, out);
-    }
-  }
-
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto x = *context.Input<framework::Tensor>("X");
-    auto y = *context.Input<framework::Tensor>("Y");
-    auto dout =
-        *context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto *dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
-    bool transpose_x = context.Attr<bool>("transpose_X");
-    bool transpose_y = context.Attr<bool>("transpose_Y");
-
-    ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
-    framework::DDim dx_dims;
-    if (dx) {
-      dx_dims = dx->dims();
-      if (dx_dims != x.dims()) {
-        dx->Resize(x.dims());
-      }
-    }
-
-    framework::DDim dy_dims;
-    if (dy) {
-      dy_dims = dy->dims();
-      if (dy_dims != y.dims()) {
-        dy->Resize(y.dims());
-      }
-    }
-
-    if (transpose_x && transpose_y) {
-      CalcInputGrad(context, y, true, true, dout, true, false, dx);
-      CalcInputGrad(context, dout, true, true, x, true, false, dy);
-    } else if (transpose_x) {
-      CalcInputGrad(context, y, false, false, dout, true, false, dx);
-      CalcInputGrad(context, x, false, false, dout, false, true, dy);
-    } else if (transpose_y) {
-      CalcInputGrad(context, dout, false, false, y, false, true, dx);
-      CalcInputGrad(context, dout, true, true, x, false, true, dy);
-    } else {
-      CalcInputGrad(context, dout, false, false, y, true, false, dx);
-      CalcInputGrad(context, x, true, true, dout, false, true, dy);
-    }
-
-    if (dx) {
-      if (dx_dims != x.dims()) {
-        dx->Resize(dx_dims);
-      }
-    }
-    if (dy) {
-      if (dy_dims != y.dims()) {
-        dy->Resize(dy_dims);
-      }
-    }
-  }
-};
-
-class MatMulOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"),
-                   "Input(X) of MatMulOp should not be null.");
-    PADDLE_ENFORCE(context->HasInput("Y"),
-                   "Input(Y) of MatMulOp should not be null.");
-    PADDLE_ENFORCE(context->HasOutput("Out"),
-                   "Output(Out) of MatMulOp should not be null.");
-
-    auto dim_x = context->GetInputDim("X");
-    auto dim_y = context->GetInputDim("Y");
-
-    auto mat_dim_x =
-        math::CreateMatrixDescriptor(RowMatrixFromVector(dim_x), 0,
-                                     context->Attrs().Get<bool>("transpose_X"));
-    auto mat_dim_y =
-        math::CreateMatrixDescriptor(ColumnMatrixFromVector(dim_y), 0,
-                                     context->Attrs().Get<bool>("transpose_Y"));
-
-    if (context->IsRuntime()) {
-      PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
-                     mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
-    }
-    std::vector<int64_t> dim_out;
-    int64_t dim_out_y = mat_dim_y.width_;
-#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
-    int head_number = context->Attrs().Get<int>("head_number");
-    bool split_vertical_y = (mat_dim_x.width_ != mat_dim_y.height_);
-    PADDLE_ENFORCE_LE(head_number, mat_dim_x.width_);
-
-    if (!split_vertical_y && head_number > 0) {
-      dim_out_y = head_number * mat_dim_y.width_;
-    }
-#else
-    PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
-#endif
-
-    if (mat_dim_x.batch_size_ != 0) {
-      dim_out = framework::vectorize(dim_x);
-      dim_out[dim_out.size() - 2] = mat_dim_x.height_;
-      dim_out[dim_out.size() - 1] = dim_out_y;
-    } else if (mat_dim_y.batch_size_ != 0) {
-      dim_out = framework::vectorize(dim_y);
-      dim_out[dim_out.size() - 2] = mat_dim_x.height_;
-      dim_out[dim_out.size() - 1] = dim_out_y;
-    } else {
-      dim_out = {mat_dim_x.height_, dim_out_y};
-    }
-
-    if (dim_x.size() == 1 && dim_out[dim_out.size() - 2] == 1) {
-      std::swap(dim_out[dim_out.size() - 2], dim_out[dim_out.size() - 1]);
-      dim_out.resize(dim_out.size() - 1);
-    }
-
-    if (dim_y.size() == 1 && dim_out[dim_out.size() - 1] == 1) {
-      dim_out.resize(dim_out.size() - 1);
-    }
-
-    if (dim_out.empty()) {
-      dim_out = {1};
-    }
-    context->SetOutputDim("Out", framework::make_ddim(dim_out));
-    context->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The first input of MatMul op");
-    AddInput("Y", "The second input of MatMul op");
-    AddOutput("Out", "The output of MatMul op");
-    AddAttr<bool>("transpose_X",
-                  R"DOC(If true, use the transpose of `X`.
-        )DOC")
-        .SetDefault(false);
-    AddAttr<bool>("transpose_Y",
-                  R"DOC(If true, use the transpose of `Y`.
-        )DOC")
-        .SetDefault(false);
-    AddAttr<float>("alpha", "The scale of Out").SetDefault(1.0f);
-#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
-    AddAttr<int>("head_number", "The number of heads of the matrix")
-        .SetDefault(1);
-#endif
-    AddComment(R"DOC(
-MatMul Operator.
-
-
-This operator is used to perform (batched) matrix multiplication
-over the last two dimensions of the input tensors `X` and `Y`.
-
-If a transpose flag is specified, the last two dimensions of the
-tensor are transposed. If the tensor is rank-1 of shape [D], then
-for `X` it is treated as [1, D] in nontransposed form and as [D, 1]
-in transposed form, whereas for `Y` it is the opposite: It is treated
-as [D, 1] in nontransposed form and as [1, D] in transposed form.
-
-Examples without transpose:
-- X: [K], Y: [K] => Out: [1]
-- X: [K], Y: [K, N] => Out: [N]
-- X: [B, M, K], Y: [K] => Out: [B, M]
-- X: [M, K], Y: [B, K, N] => Out: [B, M, N]
-- X: [B, M, K], Y: [B, K, N] => Out: [B, M, N]
-- X: [B, ..., M, K], Y: [B, ..., K, N] => Out: [B, ..., M, N]
-
-Example of matrix multiplication with head_number of H
-- X: [B, M, K], Y: [B, K, N] => Out: [B, M, H * N]
-
-The behavior is designed to be similar to the `numpy.matmul` function.
-The differences are:
-- When the rank of the input data is less than or equal to 3, it
-  is similar to the `numpy.matmul` function.
-- When the rank of the input is greater than 3, the rank of X and
-  Y must be equal, and the first `rank - 2` dimensions must be equal.
-- We add `transpose_X` and `transpose_Y` flags.
-- We add `head_number` attribute, which is used to multiple two matrixes head
-  by head, and eventually concatenates the output of several (head_number)
-  small matrixes multiplication.
-
-Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input `X`.
-
-)DOC");
-  }
-};
-
-class MatMulOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = context->GetInputDim("X");
-    auto y_dims = context->GetInputDim("Y");
-
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-
-    if (context->HasOutput(x_grad_name)) {
-      context->SetOutputDim(x_grad_name, x_dims);
-    }
-    if (context->HasOutput(y_grad_name)) {
-      context->SetOutputDim(y_grad_name, y_dims);
-    }
-  }
-};
-
-class MatMulOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *retv = new framework::OpDesc();
-    retv->SetType("matmul_grad");
-    retv->SetInput("X", Input("X"));
-    retv->SetInput("Y", Input("Y"));
-    retv->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    retv->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    retv->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    retv->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(retv);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker,
-                  ops::MatMulOpGradMaker);
-REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MatMulKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MatMulKernel<paddle::platform::CPUDeviceContext,
-                      paddle::platform::float16>);
-REGISTER_OP_CPU_KERNEL(
-    matmul_grad,
-    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext,
-                          paddle::platform::float16>);
-
-#ifdef PADDLE_WITH_CUDA
-REGISTER_OP_CUDA_KERNEL(
-    matmul, ops::MatMulKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MatMulKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MatMulKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    matmul_grad,
-    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext,
-                          paddle::platform::float16>);
-#endif
diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc
deleted file mode 100644
index b1e69f375d3274aade3184af02f7f914dba5db71..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-class MaxSeqenceLenOp : public framework::OperatorBase {
- public:
-  MaxSeqenceLenOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto &rank_table =
-        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
-    auto *out =
-        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-    int64_t *out_ptr = out->mutable_data<int64_t>({1}, platform::CPUPlace());
-    *out_ptr = rank_table.items()[0].length;
-  }
-};
-
-class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("RankTable", "Input variable which is a LoDRankTable object");
-    AddOutput("Out", "The max sequence length");
-    AddComment(R"DOC(
-    Given a LoDRankTable object, this layer returns the max length of
-    a batch of sequences. In fact, a LoDRankTable object contains a list of
-    tuples(<sequence index, sequence length>) and the list is already sorted by
-    sequence length in descending order, so the operator just returns the
-    sequence length of the first tuple element
-)DOC");
-  }
-};
-
-class MaxSeqenceLenInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("RankTable"));
-    context->SetOutputDim("Out", {1});
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(max_sequence_len, paddle::operators::MaxSeqenceLenOp,
-                  paddle::operators::MaxSeqenceLenOpProtoMaker,
-                  paddle::operators::MaxSeqenceLenInferShape,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
deleted file mode 100644
index 078d7bade7e0fdf088bd1bd84714bacc237b971e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/maxout_op.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- *     Unless required by applicable law or agreed to in writing, software
- *     distributed under the License is distributed on an "AS IS" BASIS,
- *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *     See the License for the specific language governing permissions and
- *     limitations under the License. */
-
-#include "paddle/fluid/operators/maxout_op.h"
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor of maxout operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of feature.");
-    AddOutput("Out",
-              "(Tensor) The output tensor of maxout operator."
-              "The format of output tensor is also NCHW."
-              "Where N is batch size, C is "
-              "the number of channels, H and W is the height and "
-              "width of feature.");
-    AddAttr<int>(
-        "groups",
-        "(int),"
-        "Specifies how many groups the input tensor will be split"
-        "in the channel dimension. And the number of output channel is "
-        "the number of channels divided by groups.");
-    AddComment(R"DOC(
-MaxOut Operator.
-
-Assumed the input shape is (N, Ci, H, W).
-The output shape is (N, Co, H, W).
-Then $Co = Ci / groups$ and the operator formula is as follows:
-
-$$ y_{si+j} = \max_{k} x_{gsi + sk + j} $$
-$$ g = groups $$
-$$ s = \\frac{input.size}{num\\_channels} $$
-$$ 0 \\le i < \\frac{num\\_channels}{groups} $$
-$$ 0 \\le j < s $$
-$$ 0 \\le k < groups $$
-
-Please refer to Paper:
-  - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
-  - Multi-digit Number Recognition from Street View \
-    Imagery using Deep Convolutional Neural Networks: \
-    https://arxiv.org/pdf/1312.6082v4.pdf
-
-)DOC");
-  }
-};
-
-class MaxOutOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of MaxoutOpshould not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of MaxoutOp should not be null.");
-    auto in_x_dims = ctx->GetInputDim("X");
-    int groups = ctx->Attrs().Get<int>("groups");
-    // check groups > 1
-    PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop");
-    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
-    output_shape.push_back(in_x_dims[2]);
-    output_shape.push_back(in_x_dims[3]);
-    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-  }
-};
-
-class MaxOutOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of MaxOutOpGrad must not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(Grad@X) of MaxOutOpGrad should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(maxout, ops::MaxOutOp, ops::MaxOutOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    maxout_grad,
-    ops::MaxOutGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/maxout_op.cu.cc b/paddle/fluid/operators/maxout_op.cu.cc
deleted file mode 100644
index be1e81bb869a3a5144b72ef54af22f75b2146bc5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/maxout_op.cu.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/maxout_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    maxout, ops::MaxOutKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MaxOutKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    maxout_grad,
-    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h
deleted file mode 100644
index 5b9e003cb0978d400d8386f2ff492a83332d5e78..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/maxout_op.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/maxouting.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class MaxOutKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-    int groups = context.template Attr<int>("groups");
-
-    math::MaxOutFunctor<DeviceContext, T> maxout_forward;
-    maxout_forward(context.template device_context<DeviceContext>(), *in_x, out,
-                   groups);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MaxOutGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    const Tensor* out = context.Input<Tensor>("Out");
-    const Tensor* out_grad =
-        context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-    int groups = context.template Attr<int>("groups");
-    auto& device_ctx = context.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> zero;
-    if (in_x_grad) {
-      in_x_grad->mutable_data<T>(context.GetPlace());
-      zero(device_ctx, in_x_grad, static_cast<T>(0.0));
-      math::MaxOutGradFunctor<DeviceContext, T> maxout_backward;
-      maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc
deleted file mode 100644
index bb290046f3a62d971dccd95f8550acdd5f68c847..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mean_iou_op.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/mean_iou_op.h"
-
-namespace paddle {
-namespace operators {
-
-class MeanIoUOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Predictions"),
-                   "Input (Predictions) of MeanIoU op should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input (labels) of MeanIoU op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"),
-                   "Output (OutMeanIou) of MeanIoU op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutWrong"),
-                   "Output (OutWrong) of MeanIoU op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"),
-                   "Output (OutWrong) of MeanIoU op should not be null.");
-
-    int64_t num_classes =
-        static_cast<int64_t>(ctx->Attrs().Get<int>("num_classes"));
-
-    ctx->SetOutputDim("OutMeanIou", {1});
-    ctx->SetOutputDim("OutWrong", {num_classes});
-    ctx->SetOutputDim("OutCorrect", {num_classes});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Predictions")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Predictions",
-             "(Tensor), A Tensor of prediction results for semantic labels"
-             " with type int32 or int64. The rank should be greater than 1.");
-    AddInput(
-        "Labels",
-        "(Tensor), A Tensor of ground truth labels with type int32 or int64."
-        "Its shape should be the same as Input(Predictions).");
-    AddInput("InWrongs",
-             "(vector<Tensor>), A list of Tensor with shape "
-             "[num_classes]. They are used to collect wrong number among "
-             "batches. Empty list is also valid here.")
-        .AsDuplicable()
-        .AsDispensable();
-    AddInput(
-        "InCorrects",
-        "(vector<Tensor>), A list of Tensor with shape "
-        "[num_classes]. They are used to collect correct number among batches. "
-        "Empty list is also valid here.")
-        .AsDuplicable()
-        .AsDispensable();
-    AddInput("InMeanIou",
-             "(vector<Tensor>), A list of Tensor that Output(mean_iou) should "
-             "be added to. Empty list is also valid here.")
-        .AsDuplicable()
-        .AsDispensable();
-    AddOutput("OutMeanIou",
-              "(vector<Tensor>), A Tensor representing the"
-              " mean intersection-over-union with shape [1].");
-    AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. ");
-    AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. ");
-    AddAttr<int>("num_classes", "(int), The possible number of labels.");
-
-    AddComment(R"DOC(
-mean-IOU Operator.
-Mean Intersection-Over-Union is a common evaluation metric for
-semantic image segmentation, which first computes the IOU for each
-semantic class and then computes the average over classes. 
-IOU is defined as follows: 
-    IOU = true_positive / (true_positive + false_positive + false_negative).
-It is based on pixel level area while "IOU Similarity Operator" 
-is based on area of rectangle.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel<int>,
-                       ops::MeanIoUKernel<int32_t>,
-                       ops::MeanIoUKernel<int64_t>);
diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu
deleted file mode 100644
index ada1892f43dcf33cf4db64215732189947f03579..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mean_iou_op.cu
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/mean_iou_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-template <typename T>
-__global__ void CountCUDAKernel(const int num_classes, const int count,
-                                const T* predictions, const T* labels,
-                                int* wrong, int* correct) {
-  extern __shared__ int blcok_cache[];
-  int* wrong_c = blcok_cache;
-  int* correct_c = blcok_cache + num_classes;
-  // init cache
-  for (int i = threadIdx.x; i < num_classes * 2; i += blockDim.x) {
-    blcok_cache[i] = 0;
-  }
-  __syncthreads();
-
-  T pred;
-  T label;
-  CUDA_1D_KERNEL_LOOP(i, count) {
-    pred = predictions[i];
-    label = labels[i];
-    if (pred == label) {
-      atomicAdd(correct_c + pred, 1);
-    } else {
-      atomicAdd(wrong_c + pred, 1);
-      atomicAdd(wrong_c + label, 1);
-    }
-  }
-
-  __syncthreads();
-
-  for (int i = threadIdx.x; i < num_classes; i += blockDim.x) {
-    atomicAdd(wrong + i, wrong_c[i]);
-    atomicAdd(correct + i, correct_c[i]);
-  }
-}
-
-__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
-                                     int* correct, float* ious, float* iou) {
-  __shared__ int valid_count_c;
-  if (threadIdx.x == 0) {
-    valid_count_c = 0;
-  }
-  __syncthreads();
-  CUDA_1D_KERNEL_LOOP(i, num_classes) {
-    int wrong_n = wrong[i];
-    int correct_n = correct[i];
-    int denominator = wrong_n + correct_n;
-    if (denominator > 0) {
-      atomicAdd(&valid_count_c, 1);
-      ious[i] = static_cast<float>(correct_n) / denominator;
-    } else {
-      ious[i] = 0;
-    }
-  }
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    float iou_sum = 0;
-    for (int i = 0; i < num_classes; ++i) {
-      iou_sum += ious[i];
-    }
-    iou[0] += iou_sum / valid_count_c;
-  }
-}
-
-template <typename T>
-class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto& place = *dev_ctx.eigen_device();
-    // get input and output tensor
-    auto* predictions = ctx.Input<Tensor>("Predictions");
-    auto* labels = ctx.Input<Tensor>("Labels");
-    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
-    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
-    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
-    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
-
-    // Get data ptr
-    const T* predictions_data = predictions->data<T>();
-    const T* labels_data = labels->data<T>();
-    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
-    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
-    float* out_mean_iou_data =
-        out_mean_iou->mutable_data<float>(ctx.GetPlace());
-
-    // Get Eigen tensor
-    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
-    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
-    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
-
-    // Temporary memory
-    auto tmp_ious_data = memory::Alloc(dev_ctx, num_classes * sizeof(float));
-    float* ious_data = static_cast<float*>(tmp_ious_data->ptr());
-
-    // Init out_wrong, out_correct and out_mean_iou
-    out_wrong_t.device(place) = out_wrong_t.constant(0);
-    out_correct_t.device(place) = out_correct_t.constant(0);
-    out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f);
-
-    // collect pre wrong, correct and mean_iou
-    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
-    for (int i = 0; i < in_mean_ious.size(); ++i) {
-      out_mean_iou_t.device(place) +=
-          EigenTensor<float, 1>::From(*in_mean_ious[i]);
-    }
-    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
-    for (int i = 0; i < in_wrongs.size(); ++i) {
-      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
-    }
-    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
-    for (int i = 0; i < in_corrects.size(); ++i) {
-      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
-    }
-    // compute
-    auto stream = ctx.cuda_device_context().stream();
-    int block = PADDLE_CUDA_NUM_THREADS;
-    int grid = (predictions->numel() + block - 1) / block;
-    int cache_size = (num_classes * 2 + 1) * sizeof(int);
-    CountCUDAKernel<T><<<grid, block, cache_size, stream>>>(
-        num_classes, predictions->numel(), predictions_data, labels_data,
-        out_wrong_data, out_correct_data);
-
-    ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data,
-                                                  out_correct_data, ious_data,
-                                                  out_mean_iou_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel<int>,
-                        ops::MeanIoUCUDAOpKernel<int64_t>,
-                        ops::MeanIoUCUDAOpKernel<int32_t>);
diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h
deleted file mode 100644
index 9fa00e60e05504e0bb8658c6908e4d4ac46b2ca4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mean_iou_op.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-
-template <typename T, int D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-
-template <typename T>
-class MeanIoUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    // get input and output tensor
-    auto* predictions = ctx.Input<Tensor>("Predictions");
-    auto* labels = ctx.Input<Tensor>("Labels");
-    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
-    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
-    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
-    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
-
-    // get data ptr
-    const T* predictions_data = predictions->data<T>();
-    const T* labels_data = labels->data<T>();
-    float* out_mean_iou_data =
-        out_mean_iou->mutable_data<float>(ctx.GetPlace());
-    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
-    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
-
-    // get eigen tensor
-    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
-    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
-    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
-
-    // Tmp tensor
-    Tensor denominator;
-    Tensor valid_count;
-    Tensor iou_sum;
-
-    // get data ptr of tmp tensor
-    int* denominator_data = denominator.mutable_data<int>(
-        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
-    int* valid_count_data = valid_count.mutable_data<int>({1}, ctx.GetPlace());
-    float* iou_sum_data = iou_sum.mutable_data<float>({1}, ctx.GetPlace());
-
-    // get eigen tensor of tmp tensor
-    auto denominator_t = EigenTensor<int, 1>::From(denominator);
-    auto valid_count_t = EigenTensor<int, 1>::From(valid_count);
-    auto iou_sum_t = EigenTensor<float, 1>::From(iou_sum);
-
-    // init out_wrong, out_correct and out_mean_iou
-    out_wrong_t = out_wrong_t.constant(0);
-    out_correct_t = out_correct_t.constant(0);
-    out_mean_iou_t = out_mean_iou_t.constant(0);
-
-    // collect pre wrong, correct and mean_iou
-    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
-    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
-      out_mean_iou_t.device(place) +=
-          EigenTensor<float, 1>::From(*in_mean_ious[i]);
-    }
-    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
-    for (size_t i = 0; i < in_wrongs.size(); ++i) {
-      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
-    }
-    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
-    for (size_t i = 0; i < in_corrects.size(); ++i) {
-      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
-    }
-
-    // compute
-    for (int64_t i = 0; i < predictions->numel(); ++i) {
-      if (predictions_data[i] == labels_data[i]) {
-        out_correct_data[predictions_data[i]] += 1;
-      } else {
-        out_wrong_data[labels_data[i]] += 1;
-        out_wrong_data[predictions_data[i]] += 1;
-      }
-    }
-
-    denominator_t = out_wrong_t + out_correct_t;
-    valid_count_t =
-        (denominator_t > denominator_t.constant(0.0f)).cast<int>().sum();
-
-    for (int i = 0; i < num_classes; ++i) {
-      if (denominator_data[i] == 0) {
-        denominator_data[i] = 1;
-      }
-    }
-
-    iou_sum_t =
-        (out_correct_t.cast<float>() / denominator_t.cast<float>()).sum();
-    out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
deleted file mode 100644
index 2b2f8450768b9885381f10b19631a6a200c7f703..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mean_op.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/mean_op.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-namespace paddle {
-namespace operators {
-
-class MeanOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of MeanOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of MeanOp should not be null.");
-    ctx->SetOutputDim("Out", {1});
-  }
-};
-
-class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) The input of mean op");
-    AddOutput("Out", "(Tensor) The output of mean op");
-    AddComment(R"DOC(
-Mean Operator calculates the mean of all elements in X.
-
-)DOC");
-  }
-};
-
-class MeanOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
-  }
-};
-
-class MeanGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", framework::GradVarName("X"));
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type();
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
-class MeanGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* grad_op = new framework::OpDesc();
-    grad_op->SetType("mean_grad");
-    grad_op->SetInput("X", Input("X"));
-    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MeanGradNoNeedBufferVarsInference, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
-                  ops::MeanGradMaker);
-REGISTER_OPERATOR(mean_grad, ops::MeanGradOp,
-                  ops::MeanGradNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(
-    mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MeanKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
deleted file mode 100644
index 921c2e1298906655767c1e7f30dc34b2c564c671..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mean_op.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/mean_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
deleted file mode 100644
index 360b2f68a749f630d3c7ed009c16cb51ec150581..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mean_op.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class MeanKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    output->mutable_data<T>(context.GetPlace());
-
-    auto X = EigenVector<T>::Flatten(*input);
-    auto y = EigenScalar<T>::From(*output);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    y.device(place) = X.mean();
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MeanGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE(OG->numel() == 1, "Mean Gradient should be scalar");
-    auto IG = context.Output<Tensor>(framework::GradVarName("X"));
-    IG->mutable_data<T>(context.GetPlace());
-
-    T ig_size = static_cast<T>(IG->numel());
-    Eigen::DSizes<int, 1> bcast(static_cast<int>(ig_size));
-    EigenVector<T>::Flatten(*IG).device(
-        *context.template device_context<DeviceContext>().eigen_device()) =
-        (EigenVector<T>::From(*OG) / ig_size).broadcast(bcast);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
deleted file mode 100644
index 6a9d8222c4435c470460fbf3564cdc8d668783ce..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-
-namespace paddle {
-namespace operators {
-
-using LoD = framework::LoD;
-
-class MergeLoDTensorOp : public framework::OperatorBase {
- public:
-  MergeLoDTensorOp(const std::string &type,
-                   const framework::VariableNameMap &inputs,
-                   const framework::VariableNameMap &outputs,
-                   const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- protected:
-  void RunBase(const framework::Scope &scope,
-               const platform::Place &dev_place) const {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-
-    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
-    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
-    auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
-    auto &in_false =
-        scope.FindVar(Input("InFalse"))->Get<framework::LoDTensor>();
-    auto *out =
-        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-    auto level = static_cast<size_t>(Attr<int>("level"));
-
-    PADDLE_ENFORCE(in_true.numel() || in_false.numel(),
-                   "Input(InTrue) or Input(InFalse) should be initialized.");
-
-    auto &mask_dim = mask.dims();
-    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
-    if (platform::is_cpu_place(mask.place())) {
-      cpu_mask->ShareDataWith(mask);
-    } else if (platform::is_gpu_place(mask.place())) {
-#ifdef PADDLE_WITH_CUDA
-      framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
-                            cpu_mask.get());
-#else
-      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
-#endif
-    }
-    auto *mask_data = cpu_mask->data<bool>();
-
-    platform::Place place = dev_place;
-    int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
-    auto data_type = in_true.IsInitialized() ? in_true.type() : in_false.type();
-    int rank;
-    framework::DDim in_dims;
-    if (in_true.IsInitialized()) {
-      rank = in_true.dims().size();
-      in_dims = framework::slice_ddim(in_true.dims(), 1, rank);
-    } else {
-      rank = in_false.dims().size();
-      in_dims = framework::slice_ddim(in_false.dims(), 1, rank);
-    }
-
-    auto in_dim_vec = framework::vectorize(in_dims);
-    in_dim_vec.insert(in_dim_vec.begin(), batch_size);
-
-    framework::DDim out_dims = framework::make_ddim(in_dim_vec);
-    out->Resize(out_dims);
-
-    out->mutable_data(place, data_type);
-
-    auto *out_lod = out->mutable_lod();
-    out_lod->clear();
-    size_t out_offset = 0;
-
-    // Build LoDTensor `out`
-
-    size_t in_true_idx = 0;
-    size_t in_false_idx = 0;
-    for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
-      const framework::LoDTensor *input = nullptr;
-      size_t *in_idx = nullptr;
-      if (static_cast<int>(mask_data[i]) == 0) {
-        input = &in_false;
-        in_idx = &in_false_idx;
-      } else {
-        input = &in_true;
-        in_idx = &in_true_idx;
-      }
-      auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
-          input->lod(), *in_idx, (*in_idx) + 1, 0);
-      auto &lod_length = lod_and_offset.first;
-
-      framework::AppendLoD(out_lod, lod_length);
-
-      size_t start_offset = lod_and_offset.second.first;
-      size_t end_offset = lod_and_offset.second.second;
-
-      PADDLE_ENFORCE_GE(end_offset, start_offset);
-      size_t len = end_offset - start_offset;
-      if (len == 0) {
-        continue;
-      }
-      auto slice = out->Slice(out_offset, out_offset + len);
-      framework::TensorCopy(input->Slice(start_offset, end_offset), place,
-                            dev_ctx, &slice);
-      out_offset += len;
-      (*in_idx) += 1;
-    }
-
-    for (size_t i = 0; i < level; i++) {
-      out_lod->insert(out_lod->begin(), x.lod()[i]);
-    }
-  }
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    RunBase(scope, dev_place);
-  }
-};
-
-class MergeLoDTensorInferOp : public MergeLoDTensorOp {
- public:
-  MergeLoDTensorInferOp(const std::string &type,
-                        const framework::VariableNameMap &inputs,
-                        const framework::VariableNameMap &outputs,
-                        const framework::AttributeMap &attrs)
-      : MergeLoDTensorOp(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    RunBase(scope, dev_place);
-    framework::Variable *in_true_var = scope.FindVar(Input("InTrue"));
-    framework::Variable *in_false_var = scope.FindVar(Input("InFalse"));
-    in_true_var->Clear();
-    in_false_var->Clear();
-    in_true_var->GetMutable<framework::LoDTensor>();
-    in_false_var->GetMutable<framework::LoDTensor>();
-  }
-};
-
-class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input LoDTensor, contains complete lod information to "
-             "construct the output");
-    AddInput("Mask", "A bool column vector which mask the input");
-    AddInput("InTrue", "The True branch to be merged");
-    AddInput("InFalse", "The False branch to be merged");
-    AddOutput("Out", "The merged output LoDTensor");
-    AddAttr<int>("level", "(int) the specific lod level to rank.")
-        .SetDefault(0)
-        .EqualGreaterThan(0);
-    AddComment(
-        R"DOC(
-        Merge True and False branches of LoDTensor into a single Output,
-        with a mask at certain lod level. X is used to obtain complete
-        lod information. Please refer to SplitLoDTensorOp.)DOC");
-  }
-};
-
-class MergeLoDTensorInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"),
-                   "MergeLoDTensorOp must has input X.");
-    PADDLE_ENFORCE(context->HasInput("Mask"),
-                   "MergeLoDTensorOp must has input Mask.");
-    PADDLE_ENFORCE(context->HasInput("InTrue"),
-                   "MergeLoDTensorOp must has input InTrue.");
-    PADDLE_ENFORCE(context->HasInput("InFalse"),
-                   "MergeLoDTensorOp must has input InFalse.");
-    PADDLE_ENFORCE(context->HasOutput("Out"),
-                   "MergeLoDTensorOp must has output Out");
-
-    auto mask_dim = context->GetInputDim("Mask");
-    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
-    if (context->IsRuntime() || mask_dim[1] > 0) {
-      PADDLE_ENFORCE_EQ(mask_dim[1], 1);
-    }
-
-    context->SetOutputDim("Out", context->GetInputDim("InTrue"));
-  }
-};
-
-class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("split_lod_tensor");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetInput("Mask", Input("Mask"));
-    grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
-    grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp,
-                  ops::MergeLoDTensorOpProtoMaker,
-                  ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker);
-REGISTER_OPERATOR(merge_lod_tensor_infer, ops::MergeLoDTensorInferOp,
-                  ops::MergeLoDTensorOpProtoMaker,
-                  ops::MergeLoDTensorInferShape,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc
deleted file mode 100644
index 50f44c7fc5ec90420d7c38f0f536ff7adb8f9ec4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/merge_selected_rows_op.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/merge_selected_rows_op.h"
-
-namespace paddle {
-namespace operators {
-
-class MergeSelectedRowsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of MergeSelectedRowsOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of MergeSelectedRowsOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X").front(),
-                      framework::proto::VarType::SELECTED_ROWS,
-                      "Input X only should be SelectedRows.");
-    PADDLE_ENFORCE_EQ(ctx->GetOutputsVarType("Out").front(),
-                      framework::proto::VarType::SELECTED_ROWS,
-                      "Output Y only should be SelectedRows.");
-
-    ctx->ShareDim("X", /*->*/ "Out");
-  }
-};
-
-class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input type is SelectedRows, and the selected rows may be "
-             "duplicated.");
-    AddOutput("Out",
-              "The output type is SelectedRows, and the selected rows are not "
-              "duplicated.");
-    AddComment(
-        R"DOC(
-MergeSelectedRows Operator.
-
-MergeSelectedRows is used to merge the duplicated rows of the input. The
-output's row has no duplicated, and it's order is incremental.
-
-Example:
-  Input:
-    X.rows is [0, 5, 5, 4, 19]
-    X.height is 20
-    X.value is:
-        [[1, 1]
-         [2, 2]
-         [3, 3]
-         [4, 4]
-         [6, 6]]
-
-   Output:
-    Out.row is [0, 4, 5, 19]
-    Out.height is 20
-    Out.value is:
-        [[1, 1]
-         [4, 4]
-         [5, 5]
-         [6, 6]]
-)DOC");
-  }
-};
-
-class MergeSelectedRowsOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOp,
-                  ops::MergeSelectedRowsOpMaker,
-                  ops::MergeSelectedRowsOpInferVarType);
-
-REGISTER_OP_CPU_KERNEL(
-    merge_selected_rows,
-    ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, float>,
-    ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/merge_selected_rows_op.cu.cc b/paddle/fluid/operators/merge_selected_rows_op.cu.cc
deleted file mode 100644
index 90d5fb3eaeb1f155eeea29ea0cf3f5ecd610f5f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/merge_selected_rows_op.cu.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/merge_selected_rows_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    merge_selected_rows,
-    ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, float>,
-    ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h
deleted file mode 100644
index 4c977e94b175c988e4253b273365b0cabc4b87aa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/merge_selected_rows_op.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class MergeSelectedRowsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::SelectedRows>("X");
-    auto* out = context.Output<framework::SelectedRows>("Out");
-
-    math::scatter::MergeAdd<DeviceContext, T> merge_func;
-    merge_func(context.template device_context<DeviceContext>(), *x, out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/metrics/CMakeLists.txt b/paddle/fluid/operators/metrics/CMakeLists.txt
deleted file mode 100644
index 5d468316e8eacb73c4a4ce81c784880bb5e46c2d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-include(operators)
-register_operators()
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc
deleted file mode 100644
index 26e6ab1568d15362c7793fe1eb1e970e4a8946d7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/accuracy_op.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/metrics/accuracy_op.h"
-
-namespace paddle {
-namespace operators {
-
-class AccuracyOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"),
-                   "Input (Out) of accuracy op should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input (Indices) of accuracy op should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input (Label) of accuracy op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Accuracy"),
-                   "Output (Accuracy) of AccuracyOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Correct"),
-                   "Output (Correct) of AccuracyOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Total"),
-                   "Output (Total) of AccuracyOp should not be null.");
-
-    auto inference_dim = ctx->GetInputDim("Out");
-    auto label_dim = ctx->GetInputDim("Label");
-    // Assume indices has same shape as inference, because
-    // it's the output of topk.
-
-    PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2.");
-    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, label_dim[1], 1,
-                                 "label's second dimension must be 1");
-    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, inference_dim[0], label_dim[0],
-                                 "the inference tensor's num_rows must be"
-                                 " the same as label.");
-
-    ctx->SetOutputDim("Accuracy", {1});
-    ctx->SetOutputDim("Correct", {1});
-    ctx->SetOutputDim("Total", {1});
-    ctx->ShareLoD("Out", /*->*/ "Accuracy");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Out")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    // TODO(typhoonzero): support both inference value and indices.
-    AddInput("Out", "The network output of topk (inferences)");
-    AddInput("Indices", "The the network output of topk (indices)");
-    AddInput("Label", "Label of the training data");
-    // TODO(typhoonzero): AddInput("Weight", ...
-    AddOutput("Accuracy", "The accuracy of current batch");
-    AddOutput("Correct", "The correct samples count of current batch");
-    AddOutput("Total", "The samples count of current batch");
-
-    AddComment(R"DOC(
-Accuracy Operator. 
-
-It will print accuracy rate for classification.
-The accuracy is calculated as follows:
-
-$$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$
-
-Both the input Out and Label can carry the LoD (Level of Details)
-information, or not. But the output only shares the LoD information 
-with the input Out(Inference).
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-// FIXME(typhoonzero): types of T is for infernece data.
-// label data is always int.
-REGISTER_OP_CPU_KERNEL(accuracy,
-                       ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
-                       ops::AccuracyKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu
deleted file mode 100644
index 4682940f7e15bc8af5dcda24ea058ac7351887c6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/accuracy_op.cu
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/execution_policy.h>
-#include <thrust/reduce.h>
-#include "paddle/fluid/operators/metrics/accuracy_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <int BlockSize>
-__global__ void AccuracyCudaKernel(const int N, const int D,
-                                   const int64_t* Xdata,
-                                   const int64_t* labeldata, int* correct_data,
-                                   float* accuracy, int* total_data) {
-  int count = 0;
-  __shared__ int total[BlockSize];
-
-  // support only 1 block
-  for (int i = threadIdx.x; i < (N); i += BlockSize) {
-    for (int j = 0; j < D; ++j) {
-      if (Xdata[i * D + j] == labeldata[i]) {
-        ++count;
-        break;
-      }
-    }
-  }
-  total[threadIdx.x] = count;
-  __syncthreads();
-
-  // reduce the count with init value 0, and output accuracy.
-  int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
-  if (threadIdx.x == 0) {
-    *correct_data = result;
-    *accuracy = static_cast<float>(result) / static_cast<float>(N);
-    *total_data = N;
-  }
-}
-
-template <typename T>
-class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto* inference = ctx.Input<Tensor>("Out");
-    auto* indices = ctx.Input<Tensor>("Indices");
-    auto* label = ctx.Input<Tensor>("Label");
-
-    auto* accuracy = ctx.Output<Tensor>("Accuracy");
-    auto* correct = ctx.Output<Tensor>("Correct");
-    auto* total = ctx.Output<Tensor>("Total");
-    // FIXME(typhoonzero): only support indices currently
-    // if add support for output values, how to detect the data type?
-    const int64_t* indices_data = indices->data<int64_t>();
-    const int64_t* label_data = label->data<int64_t>();
-
-    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
-    int* total_data = total->mutable_data<int>(ctx.GetPlace());
-    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
-
-    int num_samples = static_cast<int>(inference->dims()[0]);
-    size_t infer_width = inference->dims()[1];
-    auto stream = ctx.cuda_device_context().stream();
-    platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream);
-
-    if (num_samples == 0) {
-      return;
-    }
-
-    AccuracyCudaKernel<
-        PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        num_samples, infer_width, indices_data, label_data, correct_data,
-        accuracy_data, total_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-// FIXME(typhoonzero): types of T is for inference data.
-// label data is always int64
-REGISTER_OP_CUDA_KERNEL(
-    accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
-    paddle::operators::AccuracyOpCUDAKernel<double>,
-    paddle::operators::AccuracyOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h
deleted file mode 100644
index 803244dd48efc634bf5e654a35cb3dd572842882..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/accuracy_op.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class AccuracyKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* inference = ctx.Input<Tensor>("Out");
-    auto* indices = ctx.Input<Tensor>("Indices");
-    auto* label = ctx.Input<Tensor>("Label");
-    auto* accuracy = ctx.Output<Tensor>("Accuracy");
-    auto* correct = ctx.Output<Tensor>("Correct");
-    auto* total = ctx.Output<Tensor>("Total");
-
-    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
-    int* total_data = total->mutable_data<int>(ctx.GetPlace());
-    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
-
-    const int64_t* indices_data = indices->data<int64_t>();
-    const int64_t* label_data = label->data<int64_t>();
-
-    size_t num_samples = inference->dims()[0];
-    size_t class_dim = inference->dims()[1];
-    *accuracy_data = 0.0f;
-
-    if (num_samples == 0) {
-      return;
-    }
-
-    int num_correct = 0;
-    // assume inference is already the topk of the output
-    for (size_t i = 0; i < num_samples; ++i) {
-      PADDLE_ENFORCE_GE(label_data[i], 0, "label must >= 0");
-      for (size_t j = 0; j < class_dim; ++j) {
-        if (indices_data[i * class_dim + j] == label_data[i]) {
-          ++num_correct;
-          break;
-        }
-      }
-    }
-
-    *correct_data = num_correct;
-    *total_data = num_samples;
-    *accuracy_data =
-        static_cast<float>(num_correct) / static_cast<float>(num_samples);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc
deleted file mode 100644
index e0eebad08bb6b9a15d9c0f356215404884bee0e9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/auc_op.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/metrics/auc_op.h"
-
-namespace paddle {
-namespace operators {
-
-class AucOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Predict"),
-                   "Input of Out should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input of Label should not be null.");
-    auto predict_width = ctx->GetInputDim("Predict")[1];
-    PADDLE_INFERSHAPE_ENFORCE_LE(ctx, predict_width, 2,
-                                 "Only support binary classification,"
-                                 "prediction dims[1] should be 1 or 2");
-    auto predict_height = ctx->GetInputDim("Predict")[0];
-    auto label_height = ctx->GetInputDim("Label")[0];
-
-    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, predict_height, label_height,
-                                 "Out and Label should have same height.");
-
-    int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
-    int slide_steps = ctx->Attrs().Get<int>("slide_steps");
-
-    PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1");
-    PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number");
-
-    ctx->SetOutputDim("AUC", {1});
-
-    slide_steps = slide_steps == 0 ? 1 : slide_steps;
-    ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
-    ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Predict")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class AucOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Predict",
-             "A floating point 2D tensor with shape [batch_size, 2], values "
-             "are in the range [0, 1]."
-             "Typically, this tensor indicates the probability of each label");
-    AddInput("Label",
-             "A 2D int tensor indicating the label of the training data. "
-             "shape: [batch_size, 1]");
-
-    // TODO(typhoonzero): support weight input
-    AddInput("StatPos", "Statistic value when label = 1");
-    AddInput("StatNeg", "Statistic value when label = 0");
-
-    AddOutput("AUC",
-              "A scalar representing the "
-              "current area-under-the-curve.");
-
-    AddOutput("StatPosOut", "Statistic value when label = 1");
-    AddOutput("StatNegOut", "Statistic value when label = 0");
-
-    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
-        .SetDefault("ROC");
-
-    AddAttr<int>(
-        "num_thresholds",
-        "The number of thresholds to use when discretizing the roc curve.")
-        .SetDefault((2 << 12) - 1);
-    AddAttr<int>("slide_steps", "Use slide steps to calc batch auc.")
-        .SetDefault(1);
-    AddComment(R"DOC(
-Area Under The Curve (AUC) Operator.
-
-This implementation computes the AUC according to forward output and label.
-It is used very widely in binary classification evaluation. As a note:
-If input label contains values other than 0 and 1, it will be cast
-to bool. You can find the relevant definitions here:
-https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
-
-There are two types of possible curves:
-1. ROC: Receiver operating characteristic
-2. PR: Precision Recall
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker);
-REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h
deleted file mode 100644
index 6fb4749b35a37dfbb18d322920b2744d7a0882d4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/auc_op.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class AucKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *predict = ctx.Input<Tensor>("Predict");
-    auto *label = ctx.Input<Tensor>("Label");
-
-    std::string curve = ctx.Attr<std::string>("curve");
-    int num_thresholds = ctx.Attr<int>("num_thresholds");
-    // buckets contain numbers from 0 to num_thresholds
-    int num_pred_buckets = num_thresholds + 1;
-    int slide_steps = ctx.Attr<int>("slide_steps");
-
-    // Only use output var for now, make sure it's persistable and
-    // not cleaned up for each batch.
-    auto *auc = ctx.Output<Tensor>("AUC");
-    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
-    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
-
-    auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
-    auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
-
-    std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
-    std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
-
-    auto stat_pos_calc = stat_pos_data.data();
-    auto stat_neg_calc = stat_neg_data.data();
-
-    statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
-            origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
-
-    calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
-  }
-
- private:
-  inline static double trapezoidArea(double X1, double X2, double Y1,
-                                     double Y2) {
-    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
-  }
-
-  inline static void statAuc(const framework::Tensor *label,
-                             const framework::Tensor *predict,
-                             const int num_pred_buckets,
-                             const int num_thresholds, const int slide_steps,
-                             int64_t *origin_stat_pos, int64_t *origin_stat_neg,
-                             int64_t **stat_pos, int64_t **stat_neg) {
-    size_t batch_size = predict->dims()[0];
-    size_t inference_width = predict->dims()[1];
-    const T *inference_data = predict->data<T>();
-    const auto *label_data = label->data<int64_t>();
-
-    for (size_t i = 0; i < batch_size; i++) {
-      // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob
-      // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob
-      auto predict_data =
-          inference_data[i * inference_width + (inference_width - 1)];
-      PADDLE_ENFORCE_LE(predict_data, 1,
-                        "The predict data must less or equal 1.");
-      PADDLE_ENFORCE_GE(predict_data, 0,
-                        "The predict data must gather or equal 0.");
-
-      uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
-      if (label_data[i]) {
-        (*stat_pos)[binIdx] += 1.0;
-      } else {
-        (*stat_neg)[binIdx] += 1.0;
-      }
-    }
-
-    int bucket_length = num_pred_buckets * sizeof(int64_t);
-
-    // will stat auc unlimited.
-    if (slide_steps == 0) {
-      for (int slide = 0; slide < num_pred_buckets; ++slide) {
-        origin_stat_pos[slide] += (*stat_pos)[slide];
-        origin_stat_neg[slide] += (*stat_neg)[slide];
-      }
-
-      *stat_pos = origin_stat_pos;
-      *stat_neg = origin_stat_neg;
-
-    } else {
-      for (int slide = 1; slide < slide_steps; ++slide) {
-        int dst_idx = (slide - 1) * num_pred_buckets;
-        int src_inx = slide * num_pred_buckets;
-        std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx,
-                    bucket_length);
-        std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx,
-                    bucket_length);
-      }
-
-      std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets,
-                  *stat_pos, bucket_length);
-      std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets,
-                  *stat_neg, bucket_length);
-
-      std::memset(*stat_pos, 0, bucket_length);
-      std::memset(*stat_neg, 0, bucket_length);
-
-      for (int slide = 0; slide < num_pred_buckets; ++slide) {
-        int stat_pos_steps = 0;
-        int stat_neg_steps = 0;
-        for (int step = 0; step < slide_steps; ++step) {
-          stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets];
-          stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets];
-        }
-        (*stat_pos)[slide] += stat_pos_steps;
-        (*stat_neg)[slide] += stat_neg_steps;
-      }
-    }
-  }
-
-  inline static void calcAuc(const framework::ExecutionContext &ctx,
-                             int64_t *stat_pos, int64_t *stat_neg,
-                             int num_thresholds,
-                             framework::Tensor *auc_tensor) {
-    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
-
-    *auc = 0.0f;
-
-    double totPos = 0.0;
-    double totNeg = 0.0;
-    double totPosPrev = 0.0;
-    double totNegPrev = 0.0;
-
-    int idx = num_thresholds;
-
-    while (idx >= 0) {
-      totPosPrev = totPos;
-      totNegPrev = totNeg;
-      totPos += stat_pos[idx];
-      totNeg += stat_neg[idx];
-      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
-      --idx;
-    }
-
-    if (totPos > 0.0 && totNeg > 0.0) {
-      *auc = *auc / totPos / totNeg;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc
deleted file mode 100644
index f6d6ffc668c9aaa40e12e7289d4f97fc656e2c70..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/precision_recall_op.cc
+++ /dev/null
@@ -1,190 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/metrics/precision_recall_op.h"
-
-namespace paddle {
-namespace operators {
-
-class PrecisionRecallOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("MaxProbs"),
-                   "Input(MaxProbs) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input(Indices) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchMetrics"),
-                   "Output(BatchMetrics) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("AccumMetrics"),
-                   "Output(AccumMetrics) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("AccumStatesInfo"),
-                   "Output(AccumStatesInfo) should not be null.");
-
-    int64_t cls_num =
-        static_cast<int64_t>(ctx->Attrs().Get<int>("class_number"));
-    auto max_probs_dims = ctx->GetInputDim("MaxProbs");
-    auto labels_dims = ctx->GetInputDim("Labels");
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(max_probs_dims[1], 1,
-                        "Each instance contains one max probability, so the "
-                        "shape of Input(MaxProbs) should be [batch_size, 1].");
-      PADDLE_ENFORCE_EQ(
-          ctx->GetInputDim("Indices"), max_probs_dims,
-          "The shape of Input(Indices) should bes same with max_probs_dims");
-      PADDLE_ENFORCE_EQ(
-          max_probs_dims[0], labels_dims[0],
-          "The 1st dimension of Input(MaxProbs) and "
-          "Input(Labels) both are batch_size and the shape should "
-          "be the same.");
-      PADDLE_ENFORCE_EQ(labels_dims[1], 1,
-                        "The 2nd dimension of Input(Labels) contains instance "
-                        "label and the shape should be equal to 1.");
-    }
-    if (ctx->HasInput("Weights")) {
-      auto weights_dims = ctx->GetInputDim("Weights");
-
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(weights_dims,
-                          framework::make_ddim({max_probs_dims[0], 1}),
-                          "The shape of Input(Weights) should be "
-                          "[batch_size, 1].");
-      }
-    }
-    if (ctx->HasInput("StatesInfo")) {
-      auto states_dims = ctx->GetInputDim("StatesInfo");
-
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}),
-                          "The shape of Input(StatesInfo) should be "
-                          "[class_number, 4].");
-      }
-    }
-
-    // Layouts of BatchMetrics and AccumMetrics both are:
-    // [
-    //  macro average precision, macro average recall, macro average F1 score,
-    //  micro average precision, micro average recall, micro average F1 score
-    // ]
-    ctx->SetOutputDim("BatchMetrics", {6});
-    ctx->SetOutputDim("AccumMetrics", {6});
-    // Shape of AccumStatesInfo is [class_number, 4]
-    // The layout of each row is:
-    // [ TP, FP, TN, FN ]
-    ctx->SetOutputDim("AccumStatesInfo", {cls_num, 4});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("MaxProbs")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("MaxProbs",
-             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
-             "where N is the batch size. Each row contains the max probability "
-             "of an instance which computed by the previous top_k (k=1) "
-             "operator.");
-    AddInput("Indices",
-             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
-             "where N is the batch size. Each row contains the corresponding "
-             "index which computed by the previous top_k (k=1) operator.");
-    AddInput("Labels",
-             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
-             "where N is the batch size. Each element is a label and the "
-             "value should be in [0, class_number - 1].");
-    AddInput("Weights",
-             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
-             "where N is the batch size. This input is optional. If provided, "
-             "weight of instance would be considered when computing metrics.")
-        .AsDispensable();
-    AddInput("StatesInfo",
-             "(Tensor, default Tensor<int>) A 2-D tensor with shape D x 4, "
-             "where D is the number of classes. This input is optional. If "
-             "provided, current state will be accumulated to this state and "
-             "the accumulation state will be the output state.")
-        .AsDispensable();
-    AddOutput("BatchMetrics",
-              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
-              "This output tensor contains metrics for current batch data. "
-              "The layout is [macro average precision, macro average recall, "
-              "macro f1 score, micro average precision, micro average recall, "
-              "micro f1 score].");
-    AddOutput("AccumMetrics",
-              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
-              "This output tensor contains metrics for accumulated data. "
-              "The layout is [macro average precision, macro average recall, "
-              "macro f1 score, micro average precision, micro average recall, "
-              "micro f1 score].");
-    AddOutput("AccumStatesInfo",
-              "(Tensor, default Tensor<float>) A 2-D tensor with shape D x 4, "
-              "where D is equal to class number. This output tensor contains "
-              "accumulated state variables used to compute metrics. The layout "
-              "for each class is [true positives, false positives, "
-              "true negatives, false negatives].");
-    AddAttr<int>("class_number", "(int) Number of classes to be evaluated.");
-    AddComment(R"DOC(
-Precision Recall Operator.
-
-When given Input(Indices) and Input(Labels), this operator can be used
-to compute various metrics including:
-1. macro average precision
-2. macro average recall
-3. macro f1 score
-4. micro average precision
-5. micro average recall
-6. micro f1 score
-
-To compute the above metrics, we need to do statistics for true positives,
-false positives and false negatives. Here the count of true negatives is not
-necessary, but counting it may provide potential usage and the cost is
-trivial, so the operator also provides the count of true negatives.
-
-We define state as a 2-D tensor with shape [class_number, 4]. Each row of a
-state contains statistic variables for corresponding class. Layout of each row
-is: TP(true positives), FP(false positives), TN(true negatives),
-FN(false negatives). If Input(Weights) is provided, TP, FP, TN, FN will be
-calculated by given weight instead of the instance count.
-
-This operator also supports metrics computing for cross-batch situation. To
-achieve this, Input(StatesInfo) should be provided. State of current batch
-data will be accumulated to Input(StatesInfo) and Output(AccumStatesInfo)
-is the accumulation state.
-
-Output(BatchMetrics) is metrics of current batch data while
-Output(AccumStatesInfo) is metrics of accumulation data.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(precision_recall, ops::PrecisionRecallOp,
-                             ops::PrecisionRecallOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    precision_recall,
-    ops::PrecisionRecallKernel<paddle::platform::CPUPlace, float>,
-    ops::PrecisionRecallKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/metrics/precision_recall_op.h b/paddle/fluid/operators/metrics/precision_recall_op.h
deleted file mode 100644
index d6d4a5adc3ed0760f2a1356c70eda275c3195969..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/precision_recall_op.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-enum StateVariable { TP = 0, FP, TN, FN };
-
-template <typename DeviceContext, typename T>
-class PrecisionRecallKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in0 = ctx.Input<Tensor>("Indices");
-    auto* in1 = ctx.Input<Tensor>("Labels");
-    auto* in2 = ctx.Input<Tensor>("Weights");
-    auto* in3 = ctx.Input<Tensor>("StatesInfo");
-    auto* out0 = ctx.Output<Tensor>("BatchMetrics");
-    auto* out1 = ctx.Output<Tensor>("AccumMetrics");
-    auto* out2 = ctx.Output<Tensor>("AccumStatesInfo");
-
-    const int* ids_data = in0->data<int>();
-    const int* labels_data = in1->data<int>();
-    size_t cls_num = static_cast<size_t>(ctx.Attr<int>("class_number"));
-    const T* weights_data = in2 ? in2->data<T>() : nullptr;
-    const T* states_data = in3 ? in3->data<T>() : nullptr;
-    double* batch_metrics_data = out0->mutable_data<double>(ctx.GetPlace());
-    double* accum_metrics_data = out1->mutable_data<double>(ctx.GetPlace());
-    out2->mutable_data<T>(ctx.GetPlace());
-    auto accum_states = EigenMatrix<T>::From(*out2);
-    accum_states.setZero();
-    T* accum_states_data = out2->data<T>();
-
-    size_t sample_num = in0->dims()[0];
-    size_t state_var_num = 4;  // TP FP TN FN
-
-    // get states info for current batch
-    for (size_t i = 0; i < sample_num; ++i) {
-      size_t idx = ids_data[i];
-      size_t label = labels_data[i];
-
-      PADDLE_ENFORCE(idx >= 0 && idx < cls_num,
-                     "Class index of each instance should be in "
-                     "[0, class_number).");
-      PADDLE_ENFORCE(label >= 0 && label < cls_num,
-                     "Label of each instance should be in [0, class_number).");
-
-      T w = weights_data ? weights_data[i] : 1.0;
-      if (idx == label) {
-        accum_states_data[idx * state_var_num + TP] += w;
-        for (size_t j = 0; j < cls_num; ++j) {
-          accum_states_data[j * state_var_num + TN] += w;
-        }
-        accum_states_data[idx * state_var_num + TN] -= w;
-      } else {
-        accum_states_data[label * state_var_num + FN] += w;
-        accum_states_data[idx * state_var_num + FP] += w;
-        for (size_t j = 0; j < cls_num; ++j) {
-          accum_states_data[j * state_var_num + TN] += w;
-        }
-        accum_states_data[idx * state_var_num + TN] -= w;
-        accum_states_data[label * state_var_num + TN] -= w;
-      }
-    }
-
-    ComputeMetrics(accum_states_data, batch_metrics_data, state_var_num,
-                   cls_num);
-
-    if (states_data) {
-      for (size_t i = 0; i < cls_num; ++i) {
-        for (size_t j = 0; j < state_var_num; ++j) {
-          size_t idx = i * state_var_num + j;
-          accum_states_data[idx] += states_data[idx];
-        }
-      }
-    }
-
-    ComputeMetrics(accum_states_data, accum_metrics_data, state_var_num,
-                   cls_num);
-  }
-
-  // expose to be reused
-  static inline T CalcPrecision(T tp_count, T fp_count) {
-    if (tp_count > 0.0 || fp_count > 0.0) {
-      return tp_count / (tp_count + fp_count);
-    }
-    return 1.0;
-  }
-
-  static inline T CalcRecall(T tp_count, T fn_count) {
-    if (tp_count > 0.0 || fn_count > 0.0) {
-      return tp_count / (tp_count + fn_count);
-    }
-    return 1.0;
-  }
-
-  static inline T CalcF1Score(T precision, T recall) {
-    if (precision > 0.0 || recall > 0.0) {
-      return 2 * precision * recall / (precision + recall);
-    }
-    return 0.0;
-  }
-
- protected:
-  void ComputeMetrics(const T* states_data, double* metrics_data,
-                      size_t state_var_num, size_t cls_num) const {
-    T total_tp_count = 0;
-    T total_fp_count = 0;
-    T total_fn_count = 0;
-    T macro_avg_precision = 0.0;
-    T macro_avg_recall = 0.0;
-
-    for (size_t i = 0; i < cls_num; ++i) {
-      T tp_count = states_data[i * state_var_num + TP];
-      T fp_count = states_data[i * state_var_num + FP];
-      T fn_count = states_data[i * state_var_num + FN];
-      total_tp_count += tp_count;
-      total_fp_count += fp_count;
-      total_fn_count += fn_count;
-      macro_avg_precision += CalcPrecision(tp_count, fp_count);
-      macro_avg_recall += CalcRecall(tp_count, fn_count);
-    }
-    macro_avg_precision /= cls_num;
-    macro_avg_recall /= cls_num;
-    T macro_f1_score = CalcF1Score(macro_avg_precision, macro_avg_recall);
-
-    T micro_avg_precision = CalcPrecision(total_tp_count, total_fp_count);
-    T micro_avg_recall = CalcRecall(total_tp_count, total_fn_count);
-    T micro_f1_score = CalcF1Score(micro_avg_precision, micro_avg_recall);
-
-    // fill metrics data
-    metrics_data[0] = macro_avg_precision;
-    metrics_data[1] = macro_avg_recall;
-    metrics_data[2] = macro_f1_score;
-    metrics_data[3] = micro_avg_precision;
-    metrics_data[4] = micro_avg_recall;
-    metrics_data[5] = micro_f1_score;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
deleted file mode 100644
index 02a90d77b6e54475f4e722266d0a3b2046ea33ed..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/minus_op.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/minus_op.h"
-
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class MinusOp : public framework::OperatorWithKernel {
- public:
-  MinusOp(const std::string &type, const framework::VariableNameMap &inputs,
-          const framework::VariableNameMap &outputs,
-          const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of MinusOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of MinusOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of MinusOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    if (ctx->IsRuntime() ||
-        (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) {
-      PADDLE_ENFORCE_EQ(
-          x_dims, y_dims,
-          "Minus operator must take two tensor with same num of elements");
-    }
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The left tensor of minus operator.");
-    AddInput("Y", "The right tensor of minus operator.");
-    AddOutput("Out", "The output tensor of minus operator.");
-
-    AddComment(R"DOC(
-Minus Operator.
-
-Equation:
-
-    $Out = X - Y$
-
-Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input `X`.
-
-)DOC");
-  }
-};
-
-class MinusGradMaker : public framework::GradOpDescMakerBase {
- public:
-  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
-
-  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
-    std::vector<std::unique_ptr<framework::OpDesc>> ops;
-    auto x_g = InputGrad("X");
-    if (!x_g.empty()) {
-      auto *x_g_op = new framework::OpDesc();
-      x_g_op->SetType("scale");
-      x_g_op->SetInput("X", OutputGrad("Out"));
-      x_g_op->SetOutput("Out", x_g);
-      x_g_op->SetAttr("scale", 1.0f);
-      ops.emplace_back(x_g_op);
-    }
-
-    auto y_g = InputGrad("Y");
-    if (!y_g.empty()) {
-      auto *y_g_op = new framework::OpDesc();
-      y_g_op->SetType("scale");
-      y_g_op->SetInput("X", OutputGrad("Out"));
-      y_g_op->SetOutput("Out", y_g);
-      y_g_op->SetAttr("scale", -1.0f);
-      ops.emplace_back(y_g_op);
-    }
-
-    return ops;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker);
-REGISTER_OP_CPU_KERNEL(
-    minus, ops::MinusKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/minus_op.cu b/paddle/fluid/operators/minus_op.cu
deleted file mode 100644
index 956d935da9b96696e9148fc4dfab23a6a6c29016..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/minus_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/minus_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    minus,
-    paddle::operators::MinusKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/minus_op.h b/paddle/fluid/operators/minus_op.h
deleted file mode 100644
index 7791b1456a81516e48db645501c717d9c4cf8749..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/minus_op.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class MinusKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* left_tensor = context.Input<framework::Tensor>("X");
-    auto* right_tensor = context.Input<framework::Tensor>("Y");
-    auto* out_tensor = context.Output<framework::Tensor>("Out");
-
-    out_tensor->mutable_data<T>(context.GetPlace());
-    auto& dev =
-        *context.template device_context<DeviceContext>().eigen_device();
-    framework::EigenVector<T>::Flatten(*out_tensor).device(dev) =
-        framework::EigenVector<T>::Flatten(*left_tensor) -
-        framework::EigenVector<T>::Flatten(*right_tensor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
deleted file mode 100644
index 414576f1a29d54b5a467be507ef34c76cff3b7ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-using framework::Tensor;
-using mkldnn::memory;
-using mkldnn::primitive;
-using mkldnn::stream;
-using platform::GetMKLDNNFormat;
-using platform::MKLDNNDeviceContext;
-using platform::to_void_cast;
-
-template <typename Functor>
-class MKLDNNActivationKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<Tensor>("X");
-    PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for X tensor");
-    PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for X tensor");
-
-    Functor functor;
-    functor(ctx);
-  }
-};
-
-template <typename Functor>
-class MKLDNNActivationGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input OutGrad tensor");
-    PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input OutGrad tensor");
-
-    PADDLE_ENFORCE_EQ(
-        ctx.Attr<bool>("is_test"), false,
-        "is_test attribute should be set to False in training phase.");
-
-    Functor functor;
-    functor(ctx);
-  }
-};
-
-template <typename T>
-void eltwise_forward(const framework::ExecutionContext &ctx,
-                     mkldnn::algorithm algorithm) {
-  PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                 "It must use CPUPlace.");
-  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-
-  const auto *x = ctx.Input<Tensor>("X");
-  auto *y = ctx.Output<Tensor>("Out");
-
-  const T alpha = ctx.op().HasAttr("alpha") ? ctx.Attr<T>("alpha") : 0;
-  const T beta = ctx.op().HasAttr("beta") ? ctx.Attr<T>("beta") : 0;
-
-  PADDLE_ENFORCE(
-      x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4,
-      "Input dim must be with 2, 3 or 4");
-
-  auto src_tz = framework::vectorize<int>(x->dims());
-
-  auto src_format = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : x->format();
-
-  bool is_test = ctx.Attr<bool>("is_test");
-
-  platform::ActivationMKLDNNHandler<T> handler(
-      src_tz, algorithm, alpha, beta, src_format, is_test, dev_ctx,
-      ctx.GetPlace(), ctx.op().Input("X"));
-
-  auto src_memory_p = handler.AcquireSrcMemory(x);
-  auto dst_memory_p = handler.AcquireDstMemory(y);
-  auto activation_p =
-      handler.AcquireForwardPrimitive(*src_memory_p, *dst_memory_p);
-
-  // push primitive to stream and wait until it's executed
-  std::vector<primitive> pipeline;
-  pipeline.push_back(*activation_p);
-  stream(stream::kind::eager).submit(pipeline).wait();
-
-  y->set_layout(DataLayout::kMKLDNN);
-  y->set_format(GetMKLDNNFormat(*dst_memory_p));
-}
-
-template <typename T>
-void eltwise_grad(const framework::ExecutionContext &ctx,
-                  mkldnn::algorithm algorithm) {
-  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-
-  const auto *x = ctx.Input<Tensor>("X");
-  const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
-  auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-  const T alpha = ctx.op().HasAttr("alpha") ? ctx.Attr<T>("alpha") : 0;
-  const T beta = ctx.op().HasAttr("beta") ? ctx.Attr<T>("beta") : 0;
-
-  auto diff_dst_tz = framework::vectorize<int>(diff_y->dims());
-
-  // diff_dst and src dims should be the same
-  auto src_format =
-      diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : x->format();
-
-  auto diff_y_format =
-      diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : diff_y->format();
-
-  platform::ActivationMKLDNNHandler<T> handler(
-      diff_dst_tz, algorithm, alpha, beta, src_format, diff_y_format, dev_ctx,
-      ctx.GetPlace(), ctx.op().Input("X"));
-
-  auto src_memory_p = handler.AcquireBackwardSrcMemory(x);
-  auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y);
-  auto diff_src_memory_p = handler.AcquireDiffSrcMemory(diff_x);
-  auto activation_backward_p = handler.AcquireBackwardPrimitive(
-      *src_memory_p, *diff_dst_memory_p, *diff_src_memory_p);
-
-  // push primitive to stream and wait until it's executed
-  std::vector<primitive> pipeline;
-  pipeline.push_back(*activation_backward_p);
-  stream(stream::kind::eager).submit(pipeline).wait();
-
-  diff_x->set_layout(DataLayout::kMKLDNN);
-  diff_x->set_format(GetMKLDNNFormat(*diff_src_memory_p));
-}
-
-template <typename T, mkldnn::algorithm algorithm>
-struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
-  void operator()(const framework::ExecutionContext &ctx) const {
-    eltwise_forward<T>(ctx, algorithm);
-  }
-};
-
-template <typename T, mkldnn::algorithm algorithm>
-struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
-  void operator()(const framework::ExecutionContext &ctx) const {
-    eltwise_grad<T>(ctx, algorithm);
-  }
-};
-
-template <typename T>
-using ReluMKLDNNFunctor =
-    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
-
-template <typename T>
-using TanhMKLDNNFunctor =
-    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
-
-template <typename T>
-using SqrtMKLDNNFunctor =
-    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
-
-template <typename T>
-using AbsMKLDNNFunctor =
-    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
-
-template <typename T>
-using ReluMKLDNNGradFunctor =
-    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
-
-template <typename T>
-using TanhMKLDNNGradFunctor =
-    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
-
-template <typename T>
-using SqrtMKLDNNGradFunctor =
-    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
-
-template <typename T>
-using AbsMKLDNNGradFunctor =
-    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
-  REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace,       \
-                     ops::MKLDNNActivationKernel<ops::functor<float>>);    \
-  REGISTER_OP_KERNEL(                                                      \
-      act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace,               \
-      ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
-
-#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro)                  \
-  __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor);       \
-  __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
-  __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor);       \
-  __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor);       \
-  __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor);
-
-FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
deleted file mode 100644
index f3209151b359aaba52d8bd5259013d79f130096d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ /dev/null
@@ -1,526 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "mkldnn.hpp"
-#include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using batch_norm_bwd = mkldnn::batch_normalization_backward;
-using batch_norm_fwd = mkldnn::batch_normalization_forward;
-using mkldnn::memory;
-using mkldnn::primitive;
-using mkldnn::reorder;
-using mkldnn::stream;
-using paddle::platform::MKLDNNDeviceContext;
-using paddle::platform::MKLDNNMemDesc;
-using platform::to_void_cast;
-
-namespace {
-template <typename T>
-struct bn_type_traits {
-  using op_type = T;
-  using op_desc = typename op_type::desc;
-  using op_prim = typename op_type::primitive_desc;
-};
-
-class BatchNormMKLDNNHandler : public platform::MKLDNNHandler {
- public:
-  BatchNormMKLDNNHandler(const platform::MKLDNNDeviceContext &dev_ctx,
-                         mkldnn::engine engine, const std::string &base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
-
-  std::shared_ptr<memory> AcquireScaleshiftMemoryFromPrimitive(void *ptr) {
-    return this->AcquireMemoryFromPrimitive(
-        batch_norm_pd_->weights_primitive_desc(), ptr, "@scaleshift_mem_p");
-  }
-
-  std::shared_ptr<memory> AcquireMeanMemoryFromPrimitive(void *ptr) {
-    return this->AcquireMemoryFromPrimitive(
-        batch_norm_pd_->mean_primitive_desc(), ptr, "@mean_mem_p");
-  }
-
-  std::shared_ptr<memory> AcquireVarianceMemoryFromPrimitive(void *ptr) {
-    return this->AcquireMemoryFromPrimitive(
-        batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p");
-  }
-
-  template <typename T>
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(
-      framework::Tensor *output, platform::Place place) {
-    T *ptr = output->mutable_data<T>(
-        place, batch_norm_pd_->dst_primitive_desc().get_size());
-    return this->AcquireMemoryFromPrimitive(
-        batch_norm_pd_->dst_primitive_desc(), ptr, "@dst_mem_p");
-  }
-
-  std::shared_ptr<batch_norm_fwd::primitive_desc>
-  AcquireBatchNormPrimitiveDescriptor(const batch_norm_fwd::desc &bn_fwd_desc,
-                                      const mkldnn::engine &engine) {
-    // BatchNorm PD has to be passed to Grad op that
-    // may be executed by diffrent thread, hence
-    // for that one we use key that does not contain TID
-    const std::string key_batch_norm_fwd_pd = key_common_ + "@bn_fwd_pd";
-    batch_norm_pd_ = std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
-        dev_ctx_.GetBlob(key_batch_norm_fwd_pd));
-
-    if (batch_norm_pd_ == nullptr) {
-      static std::mutex acquire_barrier;
-      std::lock_guard<std::mutex> block_threads_until_finish_this_job(
-          acquire_barrier);
-      batch_norm_pd_ = std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
-          dev_ctx_.GetBlob(key_batch_norm_fwd_pd));
-      if (batch_norm_pd_ == nullptr) {
-        batch_norm_pd_.reset(
-            new batch_norm_fwd::primitive_desc(bn_fwd_desc, engine));
-        dev_ctx_.SetBlob(key_batch_norm_fwd_pd, batch_norm_pd_);
-      }
-    }
-    return batch_norm_pd_;
-  }
-
-  std::shared_ptr<batch_norm_fwd> AcquireTestTrainingBatchNormFwd(
-      std::shared_ptr<memory> src_memory,
-      std::shared_ptr<memory> scaleshift_memory,
-      std::shared_ptr<memory> dst_memory, std::shared_ptr<memory> mean_memory,
-      std::shared_ptr<memory> variance_memory, bool is_test) {
-    auto prim_key = key_ + "@batch_norm_p";
-    auto batch_norm_p =
-        std::static_pointer_cast<batch_norm_fwd>(dev_ctx_.GetBlob(prim_key));
-
-    if (batch_norm_p == nullptr) {
-      if (is_test) {
-        batch_norm_p = std::make_shared<batch_norm_fwd>(
-            *batch_norm_pd_, *src_memory,
-            (const mkldnn::primitive::at &)*mean_memory,
-            (const mkldnn::primitive::at &)*variance_memory, *scaleshift_memory,
-            *dst_memory);
-      } else {
-        batch_norm_p = std::make_shared<batch_norm_fwd>(
-            *batch_norm_pd_, *src_memory, *scaleshift_memory, *dst_memory,
-            *mean_memory, *variance_memory);
-      }
-
-      dev_ctx_.SetBlob(prim_key, batch_norm_p);
-    }
-
-    return batch_norm_p;
-  }
-
- private:
-  std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_pd_;
-};
-
-std::shared_ptr<memory> UpdateMemoryData(
-    const platform::MKLDNNDeviceContext &dev_ctx, const std::string &key,
-    void *new_ptr) {
-  auto mem = std::static_pointer_cast<memory>(dev_ctx.GetBlob(key));
-  PADDLE_ENFORCE(
-      mem != nullptr,
-      (std::string("Fail to find memory in device context [key: ") + key + "]")
-          .c_str());
-  mem->set_data_handle(new_ptr);
-  return mem;
-}
-
-template <typename T, typename Container>
-void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end,
-                     Container *c) {
-  auto it = std::begin(*c);
-
-  std::copy(scale_begin, scale_end, std::inserter(*c, it));
-  std::copy(
-      shift_begin, shift_end,
-      std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end))));
-}
-
-}  // namespace
-
-template <typename T>
-class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    const float momentum = ctx.Attr<float>("momentum");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
-    bool global_stats = is_test || use_global_stats;
-
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *mean = ctx.Input<Tensor>("Mean");
-    const auto *variance = ctx.Input<Tensor>("Variance");
-
-    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    auto mkldnn_engine = dev_ctx.GetEngine();
-
-    auto *y = ctx.Output<Tensor>("Y");
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    auto *batch_mean = ctx.Output<Tensor>("SavedMean");
-    auto *batch_variance = ctx.Output<Tensor>("SavedVariance");
-
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *shift = ctx.Input<Tensor>("Bias");
-
-    PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for X tensor");
-    PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for X tensor");
-
-    const T *x_data = x->data<T>();
-    const T *mean_data = mean->data<T>();
-    const T *variance_data = variance->data<T>();
-    T *mean_out_data = mean_out->mutable_data<T>(ctx.GetPlace());
-    T *variance_out_data = variance_out->mutable_data<T>(ctx.GetPlace());
-    T *batch_mean_data = nullptr;
-    T *batch_variance_data = nullptr;
-
-    if (!global_stats) {
-      batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
-      batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
-    }
-
-    auto propagation = global_stats == true
-                           ? mkldnn::prop_kind::forward_scoring
-                           : mkldnn::prop_kind::forward_training;
-
-    auto src_tz = paddle::framework::vectorize<int>(x->dims());
-    auto scale_tz = paddle::framework::vectorize<int>(scale->dims());
-    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
-    const unsigned int ic = scale_tz[0];
-
-    // MKLDNN requires a single piece of memory for scale and shift/bias data
-    const size_t scaleshift_size = 2 * ic;
-    std::vector<T> scaleshift_data;
-    scaleshift_data.reserve(scaleshift_size);
-
-    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
-                    shift->data<T>() + ic, &scaleshift_data);
-
-    unsigned flags = mkldnn::use_scale_shift;
-    if (global_stats) flags |= mkldnn::use_global_stats;
-    if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
-
-    // create mkldnn memory from input x tensor
-    MKLDNNMemoryFormat input_format =
-        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
-
-    // keys for backward pass
-    const std::string key =
-        platform::CreateKey(src_tz, epsilon, flags, global_stats, input_format,
-                            ctx.op().Output("SavedMean"));
-    BatchNormMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
-
-    auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input_format);
-
-    // create primitive descriptor for batch norm forward
-    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
-    auto batch_norm_fwd_desc =
-        bn_fwd_types::op_desc{propagation, user_src_md, epsilon, flags};
-
-    auto batch_norm_fwd_pd = handler.AcquireBatchNormPrimitiveDescriptor(
-        batch_norm_fwd_desc, mkldnn_engine);
-
-    auto src_memory =
-        handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data));
-
-    // crate mkldnn memory for weights(scale/shift)
-    auto scaleshift_memory =
-        handler.AcquireScaleshiftMemoryFromPrimitive(scaleshift_data.data());
-
-    // create mkldnn memory for output y tensor
-    auto dst_memory =
-        handler.AcquireDstMemoryFromPrimitive<T>(y, ctx.GetPlace());
-
-    std::shared_ptr<batch_norm_fwd> batch_norm_p;
-    if (global_stats) {
-      // create mkldnn memory for stats (as input)
-      std::shared_ptr<memory> mean_memory =
-          handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data));
-      std::shared_ptr<memory> variance_memory =
-          handler.AcquireVarianceMemoryFromPrimitive(
-              to_void_cast(variance_data));
-
-      batch_norm_p = handler.AcquireTestTrainingBatchNormFwd(
-          src_memory, scaleshift_memory, dst_memory, mean_memory,
-          variance_memory, true);
-    } else {
-      // create mkldnn memory for stats (as output)
-      std::shared_ptr<memory> mean_memory =
-          handler.AcquireMeanMemoryFromPrimitive(batch_mean_data);
-      std::shared_ptr<memory> variance_memory =
-          handler.AcquireVarianceMemoryFromPrimitive(batch_variance_data);
-
-      batch_norm_p = handler.AcquireTestTrainingBatchNormFwd(
-          src_memory, scaleshift_memory, dst_memory, mean_memory,
-          variance_memory, false);
-    }
-
-    y->set_layout(DataLayout::kMKLDNN);
-    y->set_format(platform::GetMKLDNNFormat(*dst_memory));
-
-    std::vector<mkldnn::primitive> pipeline;
-    pipeline.push_back(*batch_norm_p);
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-
-    if (!global_stats) {
-      // mkldnn only compute stats for current batch
-      // so we need compute momentum stats via Eigen lib
-      EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
-      EigenVectorArrayMap<T> batch_variance_e(batch_variance_data, ic);
-      ConstEigenVectorArrayMap<T> mean_e(mean_data, ic);
-      ConstEigenVectorArrayMap<T> variance_e{variance_data, ic};
-
-      EigenVectorArrayMap<T> running_mean_e(mean_out_data, ic);
-      EigenVectorArrayMap<T> running_variance_e(variance_out_data, ic);
-
-      auto one_minus_momentum = 1. - momentum;
-      running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum;
-      running_variance_e =
-          variance_e * momentum + batch_variance_e * one_minus_momentum;
-    }
-  }
-};
-
-template <typename T>
-class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    auto mkldnn_engine = dev_ctx.GetEngine();
-
-    const float epsilon = ctx.Attr<float>("epsilon");
-
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *shift = ctx.Input<Tensor>("Bias");
-    const auto *batch_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *batch_variance = ctx.Input<Tensor>("SavedVariance");
-
-    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input diff_y tensor");
-    PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input diff_y tensor");
-
-    const T *x_data = x->data<T>();
-    const T *diff_y_data = diff_y->data<T>();
-    const T *batch_mean_data = batch_mean->data<T>();
-    const T *batch_variance_data = batch_variance->data<T>();
-    const T *scale_data = scale->data<T>();
-    const T *shift_data = shift->data<T>();
-    T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
-
-    T *diff_scale_data = diff_scale->mutable_data<T>(ctx.GetPlace());
-    T *diff_shift_data = diff_shift->mutable_data<T>(ctx.GetPlace());
-
-    auto src_tz = paddle::framework::vectorize<int>(x->dims());
-    auto diff_src_tz = src_tz;
-    auto dst_tz = src_tz;
-    auto diff_dst_tz = dst_tz;
-    auto scale_tz = paddle::framework::vectorize<int>(scale->dims());
-    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
-
-    const unsigned int ic = scale_tz[0];
-
-    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
-
-    MKLDNNMemoryFormat dst_format =
-        platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
-
-    MKLDNNMemoryFormat input_format =
-        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
-
-    unsigned flags = mkldnn::use_scale_shift;
-
-    // keys from forward pass
-    const std::string key =
-        platform::CreateKey(src_tz, epsilon, flags, false, input_format,
-                            ctx.op().Input("SavedMean"));
-    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
-
-    // keys for primitives reuse
-    const std::string key_with_hash =
-        key + platform::CreateKey(src_tz, epsilon, flags, false, input_format);
-    const std::string key_batch_norm_bwd_p =
-        key_with_hash + "@batch_norm_bwd_p";
-    const std::string key_batch_norm_src_mem_p =
-        key_with_hash + "@batch_norm_bwd_src_mem_p";
-    const std::string key_batch_norm_mean_mem_p =
-        key_with_hash + "@batch_norm_bwd_mean_mem_p";
-    const std::string key_batch_norm_variance_mem_p =
-        key_with_hash + "@batch_norm_bwd_variance_mem_p";
-    const std::string key_batch_norm_scaleshift_mem_p =
-        key_with_hash + "@batch_norm_bwd_scaleshift_mem_p";
-    const std::string key_batch_norm_diff_scaleshift_mem_p =
-        key_with_hash + "@batch_norm_bwd_diff_scaleshift_mem_p";
-    const std::string key_batch_norm_diff_src_mem_p =
-        key_with_hash + "@batch_norm_bwd_diff_src_mem_p";
-    const std::string key_batch_norm_diff_dst_mem_p =
-        key_with_hash + "@batch_norm_bwd_diff_dst_mem_p";
-
-    primitive reorder_diff_dst;
-    bool is_diff_dst_reordered = false;
-    auto user_diff_dst_memory = memory(
-        {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
-        to_void_cast(diff_y_data));
-
-    // MKLDNN requires a single piece of memory for scale and shift/bias data
-    const size_t scaleshift_size = 2 * ic;
-
-    std::vector<T> scaleshift_data;
-    scaleshift_data.reserve(scaleshift_size);
-    copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic,
-                    &scaleshift_data);
-
-    std::vector<T> diff_scaleshift_data;
-    diff_scaleshift_data.reserve(scaleshift_size);
-
-    auto batch_norm_fwd_pd =
-        std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
-            dev_ctx.GetBlob(key_batch_norm_fwd_pd));
-    PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
-                   "Fail to find batch_norm_fwd_pd in device context");
-
-    auto batch_norm_bwd_p = std::static_pointer_cast<batch_norm_bwd>(
-        dev_ctx.GetBlob(key_batch_norm_bwd_p));
-
-    if (batch_norm_bwd_p == nullptr) {
-      auto src_memory = std::shared_ptr<memory>(new memory(
-          {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
-          to_void_cast(x_data)));
-
-      // for diff_dst, try to use same format as dst in forward pass
-      auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
-      auto diff_dst_md = diff_dst_pd.desc();
-
-      // create primitive descriptor for batch norm backward
-      auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
-          mkldnn::prop_kind::backward, diff_dst_md,
-          src_memory->get_primitive_desc().desc(), epsilon, flags};
-      auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
-          batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
-
-      // reorder user_diff_dst if it's not in preferred format
-      auto diff_dst_memory = std::make_shared<memory>(user_diff_dst_memory);
-      if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) {
-        diff_dst_memory = std::make_shared<memory>(diff_dst_pd);
-        reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
-        is_diff_dst_reordered = true;
-      }
-
-      // create mkldnn memory for input tensors (src/mean/variance)
-      auto mean_memory =
-          std::make_shared<memory>(batch_norm_bwd_pd.mean_primitive_desc(),
-                                   to_void_cast(batch_mean_data));
-      auto variance_memory =
-          std::make_shared<memory>(batch_norm_bwd_pd.variance_primitive_desc(),
-                                   to_void_cast(batch_variance_data));
-
-      // create mkldnn memory for input tensors (scale/shift)
-      auto scaleshift_memory = std::make_shared<memory>(
-          batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data());
-
-      // create mkldnn memory for output diff weights (combined scale/shift)
-      auto diff_scaleshift_memory = std::make_shared<memory>(
-          batch_norm_bwd_pd.diff_weights_primitive_desc(),
-          diff_scaleshift_data.data());
-
-      // here assume diff_src is in the same format of src
-      auto diff_src_memory = std::make_shared<memory>(
-          src_memory->get_primitive_desc(), diff_x_data);
-
-      // finally create batch_norm backward primitive
-      batch_norm_bwd_p = std::make_shared<batch_norm_bwd>(
-          batch_norm_bwd_pd, *src_memory, *mean_memory, *variance_memory,
-          *diff_dst_memory, *scaleshift_memory, *diff_src_memory,
-          *diff_scaleshift_memory);
-
-      dev_ctx.SetBlob(key_batch_norm_bwd_p, batch_norm_bwd_p);
-      dev_ctx.SetBlob(key_batch_norm_src_mem_p, src_memory);
-      dev_ctx.SetBlob(key_batch_norm_mean_mem_p, mean_memory);
-      dev_ctx.SetBlob(key_batch_norm_variance_mem_p, variance_memory);
-      dev_ctx.SetBlob(key_batch_norm_scaleshift_mem_p, scaleshift_memory);
-      dev_ctx.SetBlob(key_batch_norm_diff_scaleshift_mem_p,
-                      diff_scaleshift_memory);
-      dev_ctx.SetBlob(key_batch_norm_diff_src_mem_p, diff_src_memory);
-      dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory);
-
-      // set layout/format of output tensors
-      diff_x->set_layout(DataLayout::kMKLDNN);
-      diff_x->set_format(
-          (MKLDNNMemoryFormat)diff_src_memory->get_primitive_desc()
-              .desc()
-              .data.format);
-    } else {
-      // primitives already exist
-      UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data));
-      UpdateMemoryData(dev_ctx, key_batch_norm_mean_mem_p,
-                       to_void_cast(batch_mean_data));
-      UpdateMemoryData(dev_ctx, key_batch_norm_variance_mem_p,
-                       to_void_cast(batch_variance_data));
-      UpdateMemoryData(dev_ctx, key_batch_norm_scaleshift_mem_p,
-                       scaleshift_data.data());
-      UpdateMemoryData(dev_ctx, key_batch_norm_diff_scaleshift_mem_p,
-                       diff_scaleshift_data.data());
-      auto diff_src_memory = UpdateMemoryData(
-          dev_ctx, key_batch_norm_diff_src_mem_p, to_void_cast(diff_x_data));
-      auto diff_dst_memory = UpdateMemoryData(
-          dev_ctx, key_batch_norm_diff_dst_mem_p, to_void_cast(diff_y_data));
-
-      // reorder user_diff_dst if it's not in preferred format
-      if (diff_dst_memory->get_primitive_desc() !=
-          user_diff_dst_memory.get_primitive_desc()) {
-        reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
-        is_diff_dst_reordered = true;
-      }
-
-      // set layout/format of output tensors
-      diff_x->set_layout(DataLayout::kMKLDNN);
-      diff_x->set_format(
-          (MKLDNNMemoryFormat)diff_src_memory->get_primitive_desc()
-              .desc()
-              .data.format);
-    }
-
-    // execute optional reorder and batch_norm backward primitive
-    std::vector<primitive> pipeline;
-    if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst);
-    pipeline.push_back(*batch_norm_bwd_p);
-    stream(stream::kind::eager).submit(pipeline).wait();
-
-    // copy back diff sacle/shift to output tensors (diff scale/shift)
-    diff_scaleshift_data.resize(scaleshift_size);
-    auto it = std::begin(diff_scaleshift_data);
-    std::copy(it, std::next(it, ic), diff_scale_data);
-    std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
-              diff_shift_data);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::BatchNormMKLDNNOpKernel<float>);
-REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::BatchNormMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
deleted file mode 100644
index 8823e086555ced4b738b2da8971e716a0540b354..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ /dev/null
@@ -1,212 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include "paddle/fluid/operators/concat_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-using framework::Tensor;
-using mkldnn::memory;
-using mkldnn::primitive;
-using mkldnn::concat;
-using mkldnn::stream;
-using platform::to_void_cast;
-
-static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
-  for (auto* input : inputs) {
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-  }
-}
-
-static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
-                                                const mkldnn::engine& engine,
-                                                const memory::data_type& dt) {
-  const auto dims = paddle::framework::vectorize<int>(input.dims());
-  const auto format = input.format();
-  auto description = memory::desc(dims, dt, format);
-  auto mem_prim_desc = memory::primitive_desc(description, engine);
-  return mem_prim_desc;
-}
-
-static MKLDNNMemoryFormat GetDstMemFormat(
-    const concat::primitive_desc& concat_pd) {
-  return (MKLDNNMemoryFormat)concat_pd.dst_primitive_desc().desc().data.format;
-}
-
-static platform::CPUPlace GetCpuPlace(
-    const paddle::framework::ExecutionContext& ctx) {
-  auto place = ctx.GetPlace();
-  PADDLE_ENFORCE(paddle::platform::is_cpu_place(place),
-                 "It must use CPUPlace.");
-  return boost::get<platform::CPUPlace>(place);
-}
-
-static const mkldnn::engine& GetMKLDNNEngine(
-    const paddle::framework::ExecutionContext& ctx) {
-  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
-  return dev_ctx.GetEngine();
-}
-
-template <typename T>
-class ConcatPrimitiveFactory {
- public:
-  concat::primitive_desc CreateConcatPrimDescriptor(
-      const std::vector<const Tensor*> multi_input, Tensor* output,
-      int concat_axis, const mkldnn::engine& mkldnn_engine,
-      const memory::data_type& dt = memory::data_type::f32) {
-    CreateSourcesDescriptors(multi_input, mkldnn_engine, dt);
-    auto dst_desc = CreateDstMemDescriptor(output, dt);
-    return concat::primitive_desc(dst_desc, concat_axis, srcs_pd);
-  }
-
-  concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd,
-                               Tensor* output, platform::CPUPlace place) {
-    CreateSourcePrimitiveAts();
-    dst_mem = CreateDstMemory(concat_pd, output, place);
-    return concat(concat_pd, inputs, dst_mem.get());
-  }
-
-  void SetSrcDataHandleByIndex(const std::vector<memory>& srcs, const size_t& i,
-                               void* handler) {
-    srcs[i].set_data_handle(handler);
-  }
-
-  void SetDstDataHandle(const memory& dst_mem, void* handler) {
-    dst_mem.set_data_handle(handler);
-  }
-
-  std::vector<memory> GetSrcs() { return srcs; }
-
-  memory GetDst() { return dst_mem.get(); }
-
- private:
-  memory::desc CreateDstMemDescriptor(Tensor* output,
-                                      const memory::data_type& dt) {
-    auto dst_dims = paddle::framework::vectorize<int>(output->dims());
-    return memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
-  }
-
-  mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd,
-                                 Tensor* output,
-                                 const platform::CPUPlace& place) {
-    return memory(concat_pd.dst_primitive_desc(),
-                  output->mutable_data<T>(place));
-  }
-
-  void CreateSourcesDescriptors(const std::vector<const Tensor*> multi_input,
-                                const mkldnn::engine& mkldnn_engine,
-                                const memory::data_type& dt) {
-    for (size_t i = 0; i < multi_input.size(); i++) {
-      auto mem_prim_desc =
-          CreateMemPrimDesc(*multi_input[i], mkldnn_engine, dt);
-      srcs_pd.push_back(mem_prim_desc);
-      srcs.push_back(
-          memory(mem_prim_desc, to_void_cast(multi_input[i]->data<T>())));
-    }
-  }
-
-  void CreateSourcePrimitiveAts() {
-    inputs.reserve(srcs.size());
-    for (size_t i = 0; i < srcs.size(); i++) {
-      inputs.push_back(srcs[i]);
-    }
-  }
-
- private:
-  std::vector<memory::primitive_desc> srcs_pd;
-  std::vector<memory> srcs;
-  std::vector<primitive::at> inputs;
-  boost::optional<memory> dst_mem;
-};
-
-template <typename T>
-class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    auto multi_input = ctx.MultiInput<Tensor>("X");
-    EnforceLayouts(multi_input);
-    Tensor* output = ctx.Output<Tensor>("Out");
-    int64_t concat_axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    auto place = GetCpuPlace(ctx);
-
-    memory::data_type dt =
-        paddle::framework::ToMKLDNNDataType(multi_input[0]->type());
-
-    ConcatPrimitiveFactory<T> prim_creator;
-    std::string key = platform::CreateKey(
-        paddle::framework::vectorize<int>(multi_input[0]->dims()), concat_axis,
-        ctx.op().Output("Out"), dt, multi_input[0]->format(),
-        platform::ThreadIDasStr());
-    const std::string key_prim = key + "@concat_p";
-    const std::string key_concat_pd = key + "@concat_pd";
-    const std::string key_srcs = key + "@concat_srcs";
-    const std::string key_dst = key + "@concat_dst";
-
-    std::shared_ptr<concat::primitive_desc> concat_pd;
-    std::shared_ptr<std::vector<memory>> srcs;
-    std::shared_ptr<memory> dst_mem;
-    auto concat_p = std::static_pointer_cast<concat>(dev_ctx.GetBlob(key_prim));
-
-    if (concat_p == nullptr) {
-      const auto& mkldnn_engine = dev_ctx.GetEngine();
-      concat_pd = std::make_shared<concat::primitive_desc>(
-          prim_creator.CreateConcatPrimDescriptor(multi_input, output,
-                                                  static_cast<int>(concat_axis),
-                                                  mkldnn_engine, dt));
-      concat_p = std::make_shared<concat>(
-          prim_creator.CreateConcatPrimitive(*concat_pd, output, place));
-      srcs = std::make_shared<std::vector<memory>>(prim_creator.GetSrcs());
-      dst_mem = std::make_shared<memory>(prim_creator.GetDst());
-      dev_ctx.SetBlob(key_prim, concat_p);
-      dev_ctx.SetBlob(key_concat_pd, concat_pd);
-      dev_ctx.SetBlob(key_srcs, srcs);
-      dev_ctx.SetBlob(key_dst, dst_mem);
-    } else {
-      srcs = std::static_pointer_cast<std::vector<memory>>(
-          dev_ctx.GetBlob(key_srcs));
-      dst_mem = std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_dst));
-      concat_pd = std::static_pointer_cast<concat::primitive_desc>(
-          dev_ctx.GetBlob(key_concat_pd));
-      for (size_t i = 0; i < multi_input.size(); i++) {
-        prim_creator.SetSrcDataHandleByIndex(
-            *srcs, i, to_void_cast<T>(multi_input[i]->data<T>()));
-      }
-      prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data<T>(place));
-    }
-
-    stream(stream::kind::eager).submit({*concat_p}).wait();
-
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetDstMemFormat(*concat_pd));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConcatMKLDNNOpKernel<float>,
-                   ops::ConcatMKLDNNOpKernel<int8_t>,
-                   ops::ConcatMKLDNNOpKernel<uint8_t>);
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
deleted file mode 100644
index 86c7d7a5cc624c37519245c0db47eeaeffed6375..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ /dev/null
@@ -1,826 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <unordered_map>
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/conv_op.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-using mkldnn::memory;
-using mkldnn::primitive;
-using mkldnn::reorder;
-using mkldnn::stream;
-using platform::to_void_cast;
-using platform::GetMKLDNNFormat;
-
-constexpr int same_scale_mask = 0;
-constexpr int o_slice_mask = 1 << 0;                         // 1
-constexpr int g_slice_mask = 1 << 1;                         // 2
-constexpr int g_o_slice_mask = g_slice_mask | o_slice_mask;  // 3
-
-static int ComputeMask(bool is_multi_channel, int multi_channel_mask) {
-  return is_multi_channel ? multi_channel_mask : same_scale_mask;
-}
-
-static int ComputeWeightsMask(int is_multi_channel, int g) {
-  int multi_channel_mask = g > 1 ? g_o_slice_mask : o_slice_mask;
-  return ComputeMask(is_multi_channel, multi_channel_mask);
-}
-
-static int ComputeBiasMask(int is_multi_channel) {
-  return ComputeMask(is_multi_channel, o_slice_mask);
-}
-
-inline void GetWeightsTz(std::vector<int>& weights_tz, int groups) {  // NOLINT
-  if (groups > 1) {
-    // if (is_conv3d) [o, i, dimension, h, w]->[g, o/g, i, dimension, h, w]
-    // else [o, i, h, w] -> [g, o/g, i, h, w]
-    weights_tz.push_back(0);
-    std::rotate(weights_tz.begin(), weights_tz.end() - 1, weights_tz.end());
-    weights_tz[0] = groups;
-    weights_tz[1] = weights_tz[1] / groups;
-  }
-}
-
-inline MKLDNNMemoryFormat GetWeightsFormat(MKLDNNMemoryFormat format,
-                                           int groups, bool is_conv3d) {
-  if (is_conv3d) {
-    return (groups == 1) ? format : MKLDNNMemoryFormat::goidhw;
-  } else {
-    return (groups == 1) ? format : MKLDNNMemoryFormat::goihw;
-  }
-}
-
-static std::vector<float> ComputeOutputShiftScale(
-    const float scale_out_data, const float scale_in_data,
-    const std::vector<float>& scale_weights_data) {
-  int count = scale_weights_data.size();
-  std::vector<float> output_shift_scale(count);
-#pragma omp parallel for
-  for (int i = 0; i < count; i++) {
-    if (scale_weights_data[i] == 0.0) {
-      output_shift_scale[i] = scale_out_data;
-    } else {
-      output_shift_scale[i] =
-          static_cast<float>(static_cast<double>(scale_out_data) /
-                             (static_cast<double>(scale_in_data) *
-                              static_cast<double>(scale_weights_data[i])));
-    }
-  }
-  return output_shift_scale;
-}
-
-static std::vector<float> ComputeBiasScale(
-    const float scale_in_data, const std::vector<float>& scale_weights_data) {
-  int count = scale_weights_data.size();
-  std::vector<float> scale_bias_data(count);
-#pragma omp parallel for if (count > 1)
-  for (int i = 0; i < count; i++) {
-    scale_bias_data[i] = scale_in_data * scale_weights_data[i];
-  }
-  return scale_bias_data;
-}
-
-static mkldnn::memory::data_type GetDstType(bool is_int8,
-                                            bool force_fp32_output,
-                                            std::string fuse_activation,
-                                            bool fuse_residual_conn,
-                                            const Tensor* residual_param) {
-  auto dst_dt = mkldnn::memory::data_type::f32;  // uint8_t, int8_t, float
-  if (is_int8 && !force_fp32_output) {
-    if (fuse_residual_conn && residual_param) {
-      // when residual exists, dst_dt will follow the residual_param type,
-      // but output will to be set to u8 if relu exists
-      auto residual_dt = framework::ToMKLDNNDataType(residual_param->type());
-      dst_dt = residual_dt;
-    } else {
-      // when residual does not exist, if (b)relu exist s8 else s8
-      dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6")
-                   ? mkldnn::memory::data_type::u8
-                   : mkldnn::memory::data_type::s8;
-    }
-  }
-  return dst_dt;
-}
-
-template <typename T>
-class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-    bool is_INT8 =
-        std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
-    if (!is_INT8) {
-      ComputeFP32(ctx);
-    } else {
-      std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
-      bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
-      bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
-      auto residual_param = ctx.Input<Tensor>("ResidualData");
-      auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation,
-                               fuse_residual_conn, residual_param);
-      if (dst_dt == mkldnn::memory::data_type::f32) {
-        ComputeINT8<float>(ctx);
-      } else if (dst_dt == mkldnn::memory::data_type::u8) {
-        ComputeINT8<uint8_t>(ctx);
-      } else if (dst_dt == mkldnn::memory::data_type::s8) {
-        ComputeINT8<int8_t>(ctx);
-      }
-    }
-  }
-
-  void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const {
-    const bool is_test = ctx.Attr<bool>("is_test");
-
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* filter = ctx.Input<Tensor>("Filter");
-    auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
-    auto* output = ctx.Output<Tensor>("Output");
-
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-
-    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Filter tensor");
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Filter tensor");
-
-    PADDLE_ENFORCE_GE(
-        input->dims().size(), 4,
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
-    PADDLE_ENFORCE_LE(
-        input->dims().size(), 5,
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
-
-    PADDLE_ENFORCE_GE(
-        filter->dims().size(), 4,
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
-    PADDLE_ENFORCE_LE(
-        filter->dims().size(), 5,
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
-
-    if (bias) {
-      PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for Bias tensor");
-      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for Bias tensor");
-
-      PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
-                        "Bias must only have 1 dimension, i.e. X");
-    }
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
-    float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-    float fuse_beta = ctx.Attr<float>("fuse_beta");
-    bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
-    int groups = ctx.Attr<int>("groups");
-    bool is_conv3d = strides.size() == 3U;
-
-    PADDLE_ENFORCE(
-        is_conv3d
-            ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
-                  dilations[2] == 1
-            : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
-        "dilation in convolution is not implemented yet");
-
-    const T* input_data = input->data<T>();
-    const T* filter_data = filter->data<T>();
-
-    auto src_tz = paddle::framework::vectorize<int>(input->dims());
-    auto weights_tz = paddle::framework::vectorize<int>(filter->dims());
-    int g = std::max(groups, 1);
-    GetWeightsTz(weights_tz, g);
-    auto dst_tz = paddle::framework::vectorize<int>(output->dims());
-
-    // Get unique name for storing MKLDNN primitives
-    const std::string key = platform::CreateKey(
-        src_tz, weights_tz, fuse_activation, strides, paddings, dilations,
-        groups, ctx.op().Input("Input") + ctx.op().Input("Filter"));
-
-    std::vector<primitive> pipeline;
-
-    auto src_format = input->format();
-    MKLDNNMemoryFormat weights_format =
-        GetWeightsFormat(filter->format(), g, is_conv3d);
-
-    auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
-    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
-
-    /* create memory descriptor for convolution without specified format
-     * ('any') which lets a primitive (convolution in this case) choose
-     * the memory format preferred for best performance
-     */
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    auto chosen_memory_format =
-        platform::data_format_to_memory_format(data_format);
-
-    weights_format = MKLDNNMemoryFormat::any;
-    // Check the format for user's special output
-    if (chosen_memory_format != MKLDNNMemoryFormat::any) {
-      if (is_conv3d) {
-        chosen_memory_format =
-            platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
-      }
-    }
-
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
-    std::vector<int> bias_tz;
-    auto dst_md = platform::MKLDNNMemDesc(
-        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-
-    platform::ConvMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
-
-    // create a conv primitive descriptor and save it for usage in backward
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
-    auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
-                                 : mkldnn::prop_kind::forward_training;
-    if (bias) {
-      bias_tz = paddle::framework::vectorize<int>(bias->dims());
-      auto bias_md = platform::MKLDNNMemDesc(
-          bias_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
-      conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
-          src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
-          fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn,
-          fwd_prop_kind);
-    } else {
-      conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
-          src_md, weights_md, boost::none, dst_md, strides, paddings,
-          mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta,
-          fuse_residual_conn, fwd_prop_kind);
-    }
-
-    // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory_p =
-        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
-    auto user_weights_memory_p = handler.AcquireWeightsMemory(
-        user_weights_md, to_void_cast<T>(filter_data));
-
-    // create reorder primitive if the input format is not the preferred one
-    auto src_memory_p =
-        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
-    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
-        user_weights_memory_p, pipeline, is_test);
-
-    std::shared_ptr<mkldnn::memory> dst_memory_p, user_residual_memory_p;
-
-    if (fuse_residual_conn) {
-      auto residual_param = ctx.Input<Tensor>("ResidualData");
-      auto residual_param_data = residual_param->data<T>();
-
-      PADDLE_ENFORCE(
-          residual_param_data != nullptr,
-          "Provide data if you want MKLDNN conv+elementwise_add fusion");
-      PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
-                        "Output and elementwise parameter need to have the "
-                        "same dimension sizes");
-
-      if (residual_param->format() != handler.GetDstFormat()) {
-        auto output_data =
-            output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
-        auto residual_data_tz =
-            paddle::framework::vectorize<int>(residual_param->dims());
-        auto residual_data_type =
-            paddle::framework::ToMKLDNNDataType(residual_param->type());
-
-        auto user_residual_md = platform::MKLDNNMemDesc(
-            residual_data_tz, residual_data_type, residual_param->format());
-        user_residual_memory_p = handler.AcquireResidualDataMemory(
-            user_residual_md, to_void_cast<T>(residual_param_data));
-
-        dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory(
-            user_residual_memory_p, to_void_cast<T>(output_data), pipeline);
-      } else {
-        output->ShareDataWith(*residual_param);
-        auto output_data = output->mutable_data<T>(ctx.GetPlace());
-        dst_memory_p =
-            handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
-      }
-    } else {
-      auto output_data =
-          output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
-      dst_memory_p =
-          handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
-    }
-
-    // create convolution op primitive
-    std::shared_ptr<mkldnn::convolution_forward> conv_p;
-    std::shared_ptr<mkldnn::memory> user_bias_memory_p, bias_memory_p;
-    if (bias) {
-      const T* bias_data = bias->data<T>();
-      auto user_bias_md = platform::MKLDNNMemDesc(
-          {bias_tz}, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
-      user_bias_memory_p =
-          handler.AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
-
-      bias_memory_p =
-          handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
-      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
-                                          bias_memory_p, dst_memory_p);
-    } else {
-      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
-                                          dst_memory_p);
-    }
-
-    // push primitive to stream and wait until it's executed
-    pipeline.push_back(*conv_p);
-    stream(stream::kind::eager).submit(pipeline).wait();
-
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(*dst_memory_p));
-  }
-
-  template <typename T_out>
-  void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
-    const bool is_test = ctx.Attr<bool>("is_test");
-
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* filter = ctx.Input<Tensor>("Filter");
-    auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
-    auto* output = ctx.Output<Tensor>("Output");
-
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-
-    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Filter tensor");
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Filter tensor");
-
-    PADDLE_ENFORCE_GE(
-        input->dims().size(), 4,
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
-    PADDLE_ENFORCE_LE(
-        input->dims().size(), 5,
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
-
-    PADDLE_ENFORCE_GE(
-        filter->dims().size(), 4,
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
-    PADDLE_ENFORCE_LE(
-        filter->dims().size(), 5,
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
-
-    if (bias) {
-      PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for Bias tensor");
-      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for Bias tensor");
-
-      PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
-                        "Bias must only have 1 dimension, i.e. X");
-    }
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-    std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
-    float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-    float fuse_beta = ctx.Attr<float>("fuse_beta");
-    bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
-    bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
-    bool unsigned_output =
-        (fuse_activation == "relu" || fuse_activation == "relu6");
-    auto scale_in_data = ctx.Attr<float>("Scale_in");
-    auto scale_in_eltwise_data = ctx.Attr<float>("Scale_in_eltwise");
-    auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
-    auto scale_out_data =
-        force_fp32_output ? 1.0f : ctx.Attr<float>("Scale_out");
-
-    PADDLE_ENFORCE(!fuse_residual_conn || !force_fp32_output,
-                   "residual fusion does not support force output with fp32");
-
-    bool is_conv3d = strides.size() == 3U;
-    PADDLE_ENFORCE(
-        is_conv3d
-            ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
-                  dilations[2] == 1
-            : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
-        "dilation in convolution is not implemented yet");
-
-    PADDLE_ENFORCE_NE(is_conv3d, true,
-                      "int8 does not support conv3d currently");
-
-    const T* input_data = input->data<T>();
-
-    auto src_tz = paddle::framework::vectorize<int>(input->dims());
-    auto weights_tz = paddle::framework::vectorize<int>(filter->dims());
-    int g = std::max(groups, 1);
-    GetWeightsTz(weights_tz, g);
-    auto dst_tz = paddle::framework::vectorize<int>(output->dims());
-
-    mkldnn::memory::data_type src_dt =
-        paddle::framework::ToMKLDNNDataType(input->type());
-
-    std::string key = platform::CreateKey(
-        src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
-        input->format(), fuse_activation, fuse_residual_conn,
-        ctx.op().Input("Input") + ctx.op().Input("Filter"));
-
-    std::shared_ptr<mkldnn::convolution_forward> conv_p;
-    std::shared_ptr<mkldnn::memory> src_memory_p;
-    std::shared_ptr<mkldnn::memory> user_src_memory_p;
-    std::vector<primitive> pipeline;
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
-    std::shared_ptr<mkldnn::memory> dst_memory_p, user_residual_memory_p;
-
-    const float* filter_data = filter->data<float>();
-    bool is_multi_channel = scale_weights_data.size() > 1;
-
-    auto output_shift_scale = ComputeOutputShiftScale(
-        scale_out_data, scale_in_data, scale_weights_data);
-
-    float scale_residual =
-        fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;
-    auto user_src_md =
-        platform::MKLDNNMemDesc({src_tz}, src_dt, input->format());
-    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<float>(),
-        ((g) == 1) ? mkldnn::memory::format::oihw
-                   : mkldnn::memory::format::goihw);
-
-    /* create memory descriptor for convolution without specified format
-    * ('any') which lets a primitive (convolution in this case) choose
-    * the memory format preferred for best performance
-    */
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    auto chosen_memory_format =
-        platform::data_format_to_memory_format(data_format);
-
-    auto src_md = platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format);
-    auto weights_md = platform::MKLDNNMemDesc(weights_tz, memory::data_type::s8,
-                                              chosen_memory_format);
-    auto dst_md = platform::MKLDNNMemDesc(
-        dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
-
-    platform::ConvMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
-    auto propagation = is_test ? mkldnn::prop_kind::forward_scoring
-                               : mkldnn::prop_kind::forward_training;
-
-    std::vector<int> bias_tz;
-
-    if (bias) {
-      bias_tz = paddle::framework::vectorize<int>(bias->dims());
-      auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32,
-                                             mkldnn::memory::format::x);
-      conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
-          src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
-          fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn,
-          propagation, output_shift_scale, scale_residual);
-    } else {
-      conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
-          src_md, weights_md, boost::none, dst_md, strides, paddings,
-          mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta,
-          fuse_residual_conn, propagation, output_shift_scale, scale_residual);
-    }
-
-    // create mkldnn memory from input tensors (data/weights)
-    user_src_memory_p =
-        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
-    auto user_weights_memory_p = handler.AcquireWeightsMemory(
-        user_weights_md, to_void_cast<float>(filter_data));
-
-    // create reorder primitive if the input format is not the preferred one
-    src_memory_p =
-        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
-
-    std::shared_ptr<mkldnn::memory> weights_memory_p;
-
-    int mask_reorder = ComputeWeightsMask(is_multi_channel, g);
-
-    weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
-        user_weights_memory_p, pipeline, is_test, true, scale_weights_data,
-        mask_reorder);
-
-    if (fuse_residual_conn) {
-      auto residual_param = ctx.Input<Tensor>("ResidualData");
-      auto residual_param_data = residual_param->data<T_out>();
-      PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
-                        "Output and elementwise parameter need to have the "
-                        "same dimension sizes");
-      auto residual_dt =
-          paddle::framework::ToMKLDNNDataType(residual_param->type());
-      if (residual_param->format() != handler.GetDstFormat()) {
-        auto residual_data_tz =
-            paddle::framework::vectorize<int>(residual_param->dims());
-        auto user_residual_md = platform::MKLDNNMemDesc(
-            residual_data_tz, residual_dt, residual_param->format());
-
-        user_residual_memory_p = handler.AcquireResidualDataMemory(
-            user_residual_md, to_void_cast<T_out>(residual_param_data));
-
-        T_out* output_data = output->mutable_data<T_out>(ctx.GetPlace());
-        dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory(
-            user_residual_memory_p, to_void_cast<T_out>(output_data), pipeline);
-
-      } else {
-        output->ShareDataWith(*residual_param);
-        auto output_data = output->mutable_data<T_out>(ctx.GetPlace());
-        dst_memory_p = handler.AcquireDstMemoryFromPrimitive(
-            to_void_cast<T_out>(output_data));
-      }
-    } else {
-      T_out* output_data = output->mutable_data<T_out>(
-          ctx.GetPlace(), handler.GetDstMemorySize());
-      dst_memory_p = handler.AcquireDstMemoryFromPrimitive(
-          to_void_cast<T_out>(output_data));
-    }
-
-    // create convolution op primitive
-    if (bias) {
-      const float* bias_data = bias->data<float>();
-      auto user_bias_md = platform::MKLDNNMemDesc(
-          {bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
-      auto user_bias_memory_p = handler.AcquireBiasMemory(
-          user_bias_md, to_void_cast<float>(bias_data));
-      std::shared_ptr<mkldnn::memory> bias_memory_p;
-
-      auto scale_bias_data =
-          ComputeBiasScale(scale_in_data, scale_weights_data);
-      int mask_bias_reorder = ComputeBiasMask(is_multi_channel);
-      bias_memory_p = handler.AcquireBiasMemoryFromPrimitive(
-          user_bias_memory_p, pipeline, is_test, true, scale_bias_data,
-          mask_bias_reorder);
-      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
-                                          bias_memory_p, dst_memory_p);
-    } else {
-      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
-                                          dst_memory_p);
-    }
-    // push primitive to stream and wait until it's executed
-    pipeline.push_back(*conv_p);
-
-    // push primitive to stream and wait until it's executed
-    stream(stream::kind::eager).submit(pipeline).wait();
-    if (platform::MKLDNNGetDataType<T_out>() == memory::data_type::s8 &&
-        unsigned_output) {
-      output->mutable_data<uint8_t>(ctx.GetPlace());
-    }
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(*dst_memory_p));
-  }
-};
-
-template <typename T>
-class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const Tensor* filter = ctx.Input<Tensor>("Filter");
-    const Tensor* output_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-
-    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Filter tensor");
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Filter tensor");
-
-    PADDLE_ENFORCE_EQ(output_grad->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for output_grad tensor");
-    PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for output_grad tensor");
-
-    PADDLE_ENFORCE_EQ(
-        ctx.Attr<bool>("is_test"), false,
-        "is_test attribute should be set to False in training phase.");
-
-    if (!input_grad && !filter_grad) return;
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-
-    bool is_conv3d = strides.size() == 3U;
-    const T* input_data = input->data<T>();
-    const T* filter_data = filter->data<T>();
-    const T* output_grad_data = output_grad->data<T>();
-    T* input_grad_data = nullptr;
-    T* filter_grad_data = nullptr;
-
-    auto src_tz = paddle::framework::vectorize<int>(input->dims());
-    auto weights_tz = paddle::framework::vectorize<int>(filter->dims());
-    int g = std::max(groups, 1);
-    GetWeightsTz(weights_tz, g);
-    auto dst_tz = paddle::framework::vectorize<int>(output_grad->dims());
-    auto src_format = input->format();
-    MKLDNNMemoryFormat weights_format =
-        GetWeightsFormat(filter->format(), g, is_conv3d);
-
-    // Get an unique name from "argument" name of "input" and "Filter" variable
-    // as well as attributes of primitive to be created
-    // This name will be used as key when saving info into device context
-    const std::string key = platform::CreateKey(
-        src_tz, weights_tz, "", strides, paddings, dilations, groups,
-        ctx.op().Input("Input") + ctx.op().Input("Filter"));
-
-    const std::string key_conv_pd = key + "@conv_pd";
-    std::vector<primitive> pipeline;
-
-    // Create user memory descriptors
-    auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
-    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
-    auto user_diff_dst_md = platform::MKLDNNMemDesc(
-        {dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format());
-
-    /* create memory descriptor for conv backward without specified format
-     * ('any') which lets a primitive (conv backward in this case) choose
-     * the memory format preferred for best performance
-     */
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    auto chosen_memory_format =
-        platform::data_format_to_memory_format(data_format);
-
-    weights_format = MKLDNNMemoryFormat::any;
-    // Check the format for user's special output
-    if (chosen_memory_format != MKLDNNMemoryFormat::any) {
-      if (is_conv3d) {
-        chosen_memory_format =
-            platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
-      }
-    }
-
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    auto diff_src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
-    auto diff_weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
-    auto diff_dst_md = platform::MKLDNNMemDesc(
-        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-
-    // Retrieve conv_pd from device context
-    auto conv_pd =
-        std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
-            dev_ctx.GetBlob(key_conv_pd));
-    PADDLE_ENFORCE(conv_pd != nullptr,
-                   "Fail to find conv_pd in device context");
-
-    // create backward convolution weights primitive descriptor
-    auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
-        mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md,
-        strides, paddings, paddings, mkldnn::padding_kind::zero);
-    auto conv_bwd_weights_pd =
-        std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
-            conv_bwd_weights_desc, mkldnn_engine, *conv_pd);
-
-    // create backward convolution data primitive descriptor
-    auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
-        mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md,
-        strides, paddings, paddings, mkldnn::padding_kind::zero);
-    auto conv_bwd_data_pd =
-        std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
-            conv_bwd_data_desc, mkldnn_engine, *conv_pd);
-
-    platform::ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd,
-                                        conv_bwd_weights_pd, dev_ctx,
-                                        mkldnn_engine, key);
-
-    // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory_p =
-        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
-    auto user_weights_memory_p = handler.AcquireWeightsMemory(
-        user_weights_md, to_void_cast<T>(filter_data));
-    auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory(
-        user_diff_dst_md, to_void_cast<T>(output_grad_data));
-
-    // create backward conv primitive for weights
-    if (filter_grad) {
-      auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive(
-          user_src_memory_p, pipeline);
-
-      auto diff_dst_memory_4filter_p =
-          handler.AcquireDiffDstMemoryFromWeightsPrimitive(
-              user_diff_dst_memory_p, pipeline);
-
-      const size_t size = handler.GetDiffWeightsMemorySize();
-      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size);
-
-      auto diff_weights_memory_p =
-          handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
-              reinterpret_cast<void*>(filter_grad_data));
-
-      auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights(
-          src_memory_p, diff_dst_memory_4filter_p, diff_weights_memory_p);
-
-      // push primitive to stream and wait until it's executed
-      pipeline.push_back(*conv_bwd_weights_p);
-
-      filter_grad->set_layout(DataLayout::kMKLDNN);
-      filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
-    }
-
-    if (input_grad) {
-      auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive(
-          user_weights_memory_p, pipeline);
-
-      auto diff_dst_memory_4data_p =
-          handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p,
-                                                        pipeline);
-
-      const size_t size = handler.GetDiffSourceMemorySize();
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size);
-
-      auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
-          reinterpret_cast<void*>(input_grad_data));
-
-      auto conv_bwd_data_p = handler.AcquireConvolutionBackwardData(
-          diff_dst_memory_4data_p, weights_memory_p, diff_src_memory_p);
-
-      pipeline.push_back(*conv_bwd_data_p);
-
-      input_grad->set_layout(DataLayout::kMKLDNN);
-      input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
-    }
-    stream(stream::kind::eager).submit(pipeline).wait();
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
-                                    ::paddle::platform::CPUPlace, FP32,
-                                    ops::kConvMKLDNNFP32,
-                                    ops::ConvMKLDNNOpKernel<float>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
-                                    ::paddle::platform::CPUPlace, U8,
-                                    ops::kConvMKLDNNINT8,
-                                    ops::ConvMKLDNNOpKernel<uint8_t>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
-                                    ::paddle::platform::CPUPlace, S8,
-                                    ops::kConvMKLDNNINT8,
-                                    ops::ConvMKLDNNOpKernel<int8_t>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
-                                    ::paddle::platform::CPUPlace, FP32,
-                                    ops::kConvMKLDNNFP32,
-                                    ops::ConvMKLDNNGradOpKernel<float>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
-                                    ::paddle::platform::CPUPlace, FP32,
-                                    ops::kConvMKLDNNFP32,
-                                    ops::ConvMKLDNNOpKernel<float>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN,
-                                    ::paddle::platform::CPUPlace, FP32,
-                                    ops::kConvMKLDNNFP32,
-                                    ops::ConvMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
deleted file mode 100644
index 84240d30fe1d1aa6d1e04b2c103d22887b79f4ca..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "boost/optional.hpp"
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using framework::DataLayout;
-
-template <typename T>
-class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    const bool is_test = ctx.Attr<bool>("is_test");
-    PADDLE_ENFORCE(
-        is_test == true,
-        "ConvTransposeMKLDNN works only for inference!. Set is_test = True");
-
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* filter = ctx.Input<Tensor>("Filter");
-    auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
-    auto* output = ctx.Output<Tensor>("Output");
-
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-
-    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Filter tensor");
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Filter tensor");
-
-    PADDLE_ENFORCE_EQ(input->dims().size(), 4,
-                      "Input must be with 4 dimensions, i.e. NCHW");
-    PADDLE_ENFORCE_EQ(filter->dims().size(), 4,
-                      "Filter must be with 4 dimensions, i.e. OIHW");
-
-    if (bias) {
-      PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for Bias tensor");
-      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for Bias tensor");
-
-      PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
-                        "Bias must only have 1 dimension, i.e. X");
-    }
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-
-    PADDLE_ENFORCE(
-        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
-        "dilation in convolution is not implemented yet");
-
-    const T* input_data = input->data<T>();
-    const T* filter_data = filter->data<T>();
-
-    auto src_tz = paddle::framework::vectorize<int>(input->dims());
-    auto iohw_weights_tz = paddle::framework::vectorize<int>(filter->dims());
-    auto weights_tz = iohw_weights_tz;
-
-    // IOHW -> OIHW
-    weights_tz[0] = iohw_weights_tz[1];
-    weights_tz[1] = iohw_weights_tz[0];
-
-    // Custom Reorder from IOHW to OIHW
-    auto iohw2oihw_reorder =
-        [&iohw_weights_tz](const T* filter_data) -> std::shared_ptr<T> {
-      int o = iohw_weights_tz[1];
-      int c = iohw_weights_tz[0];
-      int h = iohw_weights_tz[2];
-      int w = iohw_weights_tz[3];
-      std::shared_ptr<T> reordered_filter_data(new T[o * c * h * w](),
-                                               std::default_delete<T[]>());
-      for (int i = 0; i < c; ++i) {
-        for (int j = 0; j < o; ++j) {
-          int in_offset = j * h * w + i * o * h * w;
-          int out_offset = j * c * h * w + i * h * w;
-          std::memcpy(&(reordered_filter_data.get())[out_offset],
-                      &filter_data[in_offset], h * w * sizeof(T));
-        }
-      }
-
-      return reordered_filter_data;
-    };
-
-    int g = std::max(groups, 1);
-    if (g > 1) {
-      int o = weights_tz[0];
-      int i = weights_tz[1];
-      int h = weights_tz[2];
-      int w = weights_tz[3];
-      weights_tz.resize(5);
-      weights_tz[0] = g;
-      weights_tz[1] = o / g;
-      weights_tz[2] = i;
-      weights_tz[3] = h;
-      weights_tz[4] = w;
-    }
-    auto dst_tz = paddle::framework::vectorize<int>(output->dims());
-
-    // Get unique name for storing MKLDNN primitives
-    const std::string key =
-        platform::CreateKey(src_tz, weights_tz, strides, paddings, dilations,
-                            groups, ctx.op().Output("Output"));
-
-    std::vector<mkldnn::primitive> pipeline;
-
-    auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
-    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(),
-        (g == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw);
-
-    /* create memory descriptor for convolution without specified format
-     * ('any') which lets a primitive (convolution in this case) choose
-     * the memory format preferred for best performance
-     */
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    auto chosen_memory_format =
-        platform::data_format_to_memory_format(data_format);
-    std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
-    float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-    float fuse_beta = ctx.Attr<float>("fuse_beta");
-
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    std::vector<int> bias_tz;
-    auto dst_md = platform::MKLDNNMemDesc(
-        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-
-    platform::ConvTransposeMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
-    // create a deconv(conv transpose) primitive descriptor and save it for
-    // usage in backward
-    std::shared_ptr<mkldnn::deconvolution_forward::primitive_desc>
-        conv_transpose_pd;
-    auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
-                                 : mkldnn::prop_kind::forward_training;
-    if (bias) {
-      bias_tz = paddle::framework::vectorize<int>(bias->dims());
-      auto bias_md = platform::MKLDNNMemDesc(
-          bias_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
-      conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor(
-          src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
-          fuse_activation, fuse_alpha, fuse_beta, false, fwd_prop_kind);
-    } else {
-      conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor(
-          src_md, weights_md, boost::none, dst_md, strides, paddings,
-          mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, false,
-          fwd_prop_kind);
-    }
-
-    // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory_p = handler.AcquireSrcMemory(
-        user_src_md, platform::to_void_cast<T>(input_data));
-    auto user_weights_memory_p = handler.AcquireWeightsMemory(
-        user_weights_md, platform::to_void_cast<T>(filter_data),
-        is_test ? iohw2oihw_reorder : platform::user_function());
-
-    // create reorder primitive if the input format is not the preferred one
-    auto src_memory_p =
-        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
-    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
-        user_weights_memory_p, pipeline, is_test);
-
-    std::shared_ptr<mkldnn::memory> dst_memory_p;
-
-    auto output_data =
-        output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
-    dst_memory_p = handler.AcquireDstMemoryFromPrimitive(
-        platform::to_void_cast<T>(output_data));
-
-    // create convolution op primitive
-    std::shared_ptr<mkldnn::deconvolution_forward> conv_p;
-    if (bias) {
-      const T* bias_data = bias->data<T>();
-      auto user_bias_md = platform::MKLDNNMemDesc(
-          {bias_tz}, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
-      auto user_bias_memory_p = handler.AcquireBiasMemory(
-          user_bias_md, platform::to_void_cast<T>(bias_data));
-
-      auto bias_memory_p =
-          handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
-      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
-                                          bias_memory_p, dst_memory_p);
-    } else {
-      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
-                                          dst_memory_p);
-    }
-
-    // push primitive to stream and wait until it's executed
-    pipeline.push_back(*conv_p);
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(conv2d_transpose, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConvTransposeMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
deleted file mode 100644
index b74e7127ea3314695f866e0c2f2693cc3489b145..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "mkldnn.hpp"
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/dequantize_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using mkldnn::memory;
-using mkldnn::primitive;
-using mkldnn::reorder;
-using platform::to_void_cast;
-using Tensor = framework::Tensor;
-using framework::DataLayout;
-using mkldnn::stream;
-using platform::GetMKLDNNFormat;
-
-template <typename T>
-class DeQuantOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("Input");
-    auto scale_data = ctx.Attr<float>("Scale");
-    auto* output = ctx.Output<Tensor>("Output");
-    auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    const auto& engine = dev_ctx.GetEngine();
-
-    const T* input_data = input->data<T>();
-    float* output_data = output->mutable_data<float>(ctx.GetPlace());
-    std::vector<float> reorder_scale = {1.0f / scale_data};
-
-    std::vector<primitive> pipeline;
-    auto src_tz = paddle::framework::vectorize<int>(input->dims());
-    auto dst_tz = paddle::framework::vectorize<int>(output->dims());
-    mkldnn::memory::data_type src_dt =
-        paddle::framework::ToMKLDNNDataType(input->type());
-    MKLDNNMemoryFormat src_fmt = input->format();
-    std::string key = platform::CreateKey(src_dt, src_tz, reorder_scale[0],
-                                          ctx.op().Output("Output"));
-    const std::string key_prim = key + "@reorder_p";
-    const std::string key_src_mem = key + "@src_mem";
-    const std::string key_dst_mem = key + "@dst_mem";
-
-    std::shared_ptr<mkldnn::memory> src_memory;
-    std::shared_ptr<mkldnn::memory> dst_memory;
-    std::shared_ptr<reorder> reorder_p;
-    reorder_p = std::static_pointer_cast<reorder>(dev_ctx.GetBlob(key_prim));
-
-    if (reorder_p == nullptr) {
-      mkldnn::primitive_attr attri;
-      int mask = 0;
-      attri.set_output_scales(mask, reorder_scale);
-
-      auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
-      auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
-      src_memory =
-          std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
-      std::shared_ptr<primitive::at> src_memory_p =
-          std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
-
-      auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32,
-                                            MKLDNNMemoryFormat::nchw);
-      auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
-      dst_memory = std::make_shared<mkldnn::memory>(
-          dst_pd, to_void_cast<float>(output_data));
-
-      auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
-          new reorder::primitive_desc(src_pd, dst_pd, attri));
-      reorder_p = std::shared_ptr<reorder>(
-          new reorder(*reorder_pd, *src_memory_p, *dst_memory));
-      dev_ctx.SetBlob(key_prim, reorder_p);
-      dev_ctx.SetBlob(key_src_mem, src_memory);
-      dev_ctx.SetBlob(key_dst_mem, dst_memory);
-    } else {
-      src_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_src_mem));
-      src_memory->set_data_handle(to_void_cast<T>(input_data));
-
-      dst_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_dst_mem));
-      dst_memory->set_data_handle(output->mutable_data<float>(ctx.GetPlace()));
-    }
-
-    pipeline.push_back(*reorder_p);
-    stream(stream::kind::eager).submit(pipeline).wait();
-
-    output->set_format(GetMKLDNNFormat(*dst_memory));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::DeQuantOpKernel<uint8_t>, ops::DeQuantOpKernel<int8_t>);
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
deleted file mode 100644
index 01837cfe36bdfe7215af8379c43f13963215bfe5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ /dev/null
@@ -1,269 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <mkldnn/include/mkldnn_types.h>
-#include <memory>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/fc_op.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-using framework::Tensor;
-using framework::LoDTensor;
-using framework::DDim;
-using framework::ExecutionContext;
-using platform::MKLDNNDeviceContext;
-using platform::to_void_cast;
-using platform::GetMKLDNNFormat;
-using mkldnn::memory;
-using mkldnn::inner_product_forward;
-using mkldnn::primitive;
-using mkldnn::stream;
-using mkldnn::prop_kind;
-
-template <typename T>
-class FCPrimitiveFactory {
- public:
-  explicit FCPrimitiveFactory(const mkldnn::engine& engine) : engine_(engine) {}
-
-  inner_product_forward CreateFcPrimitive(const LoDTensor* input,
-                                          const Tensor* weights,
-                                          const Tensor* bias, LoDTensor* output,
-                                          const ExecutionContext& ctx) {
-    RecomputeOutputDims(ctx, input, weights, output);
-    if (fc_) {
-      UpdateDataPointers(ctx, output, input);
-      return *fc_;
-    }
-    auto src_desc = CreateMemDescriptor(input, input->format());
-    input_ = CreateMemory(src_desc, input);
-
-    weights_ = TransposeWeights(weights);
-    if (src_desc.data.ndims == 4) {
-      weights_ = CreateFourDimWeightsMemory(input, weights);
-    }
-
-    auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any);
-
-    fc_ = CreateFcPrimitive(*input_, *weights_, dst_desc, bias, output, ctx);
-    return *fc_;
-  }
-
- private:
-  void UpdateDataPointers(const ExecutionContext& ctx, Tensor* out,
-                          const Tensor* in) {
-    input_->set_data_handle(const_cast<T*>(in->data<T>()));
-    output_->set_data_handle(out->mutable_data<T>(ctx.GetPlace()));
-    if (out->format() == MKLDNNMemoryFormat::format_undef) {
-      auto output_format = output_->get_primitive_desc().desc().data.format;
-      out->set_format((MKLDNNMemoryFormat)output_format);
-    }
-  }
-
-  MKLDNNMemoryFormat MatchWeightFormat(MKLDNNMemoryFormat fmt) {
-    using format = MKLDNNMemoryFormat;
-    switch (fmt) {
-      case format::nChw16c:
-        return format::oIhw16i;
-      case format::nChw8c:
-        return format::oIhw8i;
-      case format::nchw:
-        return format::oihw;
-      default:
-        return format::format_undef;
-    }
-  }
-
-  mkldnn::memory Reorder(const memory::desc& src_desc,
-                         const memory::desc& dst_desc, const void* src_data) {
-    auto src_mem = memory({src_desc, engine_}, const_cast<void*>(src_data));
-    auto dst_mem = memory({dst_desc, engine_});
-
-    auto reorder = mkldnn::reorder(src_mem, dst_mem);
-    stream(stream::kind::eager).submit({reorder}).wait();
-
-    return dst_mem;
-  }
-
-  static mkldnn::memory::desc CreateMemDescriptor(const std::vector<int>& dims,
-                                                  MKLDNNMemoryFormat format) {
-    return platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType<T>(),
-                                   format);
-  }
-
-  static mkldnn::memory::desc CreateMemDescriptor(const Tensor* tensor,
-                                                  MKLDNNMemoryFormat format) {
-    auto dims = framework::vectorize<int>(tensor->dims());
-    return CreateMemDescriptor(dims, format);
-  }
-
-  mkldnn::memory CreateMemory(const mkldnn::memory::desc& desc,
-                              const Tensor* tensor) {
-    return CreateMemory(desc, tensor->data<T>());
-  }
-
-  mkldnn::memory CreateMemory(const mkldnn::memory::desc& desc,
-                              const void* data) {
-    return memory({desc, engine_}, const_cast<void*>(data));
-  }
-
-  mkldnn::memory TransposeWeights(const Tensor* weights) {
-    auto dims = framework::vectorize<int>(weights->dims());
-    std::swap(dims[0], dims[1]);  // Correct output dimensions
-    auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io);
-    auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi);
-    return Reorder(src_desc, dst_desc, weights->data<T>());
-  }
-
-  inner_product_forward CreateFcPrimitive(const memory& src_memory,
-                                          const memory& weights_memory,
-                                          const memory::desc& dst_desc,
-                                          const Tensor* bias, Tensor* output,
-                                          const ExecutionContext& ctx) {
-    const auto weights_desc = weights_memory.get_primitive_desc().desc();
-    const auto src_desc = src_memory.get_primitive_desc().desc();
-    if (bias) {
-      auto bias_desc = CreateMemDescriptor(bias, bias->format());
-      bias_ = CreateMemory(bias_desc, bias);
-      auto fc_prim_desc =
-          CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc);
-
-      output_ = CreateDstMemory(fc_prim_desc, ctx, output);
-
-      return inner_product_forward(fc_prim_desc, src_memory, weights_memory,
-                                   *bias_, *output_);
-    } else {
-      auto fc_prim_desc = CreateFcPrimDesc(src_desc, weights_desc, dst_desc);
-
-      output_ = CreateDstMemory(fc_prim_desc, ctx, output);
-
-      return inner_product_forward(fc_prim_desc, src_memory, weights_memory,
-                                   *output_);
-    }
-  }
-
-  mkldnn::inner_product_forward::primitive_desc CreateFcPrimDesc(
-      const mkldnn::memory::desc& input_desc,
-      const mkldnn::memory::desc& weights_desc,
-      const mkldnn::memory::desc& bias_desc,
-      const mkldnn::memory::desc& dst_desc) {
-    auto fc_desc =
-        inner_product_forward::desc(prop_kind::forward_scoring, input_desc,
-                                    weights_desc, bias_desc, dst_desc);
-
-    return inner_product_forward::primitive_desc(fc_desc, engine_);
-  }
-
-  mkldnn::inner_product_forward::primitive_desc CreateFcPrimDesc(
-      const mkldnn::memory::desc& input_desc,
-      const mkldnn::memory::desc& weights_desc,
-      const mkldnn::memory::desc& dst_desc) {
-    auto fc_desc = inner_product_forward::desc(prop_kind::forward, input_desc,
-                                               weights_desc, dst_desc);
-
-    return inner_product_forward::primitive_desc(fc_desc, engine_);
-  }
-
-  mkldnn::memory CreateFourDimWeightsMemory(const Tensor* input,
-                                            const Tensor* weights) {
-    auto input_dims = framework::vectorize<int>(input->dims());
-    auto weight_dims = framework::vectorize<int>(weights->dims());
-    auto dims = {weight_dims[1], input_dims[1], input_dims[2], input_dims[3]};
-
-    auto dst_format = MatchWeightFormat(input->format());
-    auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oihw);
-    auto dst_desc = CreateMemDescriptor(dims, dst_format);
-
-    return Reorder(src_desc, dst_desc, weights_->get_data_handle());
-  }
-
-  mkldnn::memory CreateDstMemory(
-      const mkldnn::inner_product_forward::primitive_desc& fc_prim_desc,
-      const ExecutionContext& ctx, Tensor* output) {
-    auto dst_prim_desc = fc_prim_desc.dst_primitive_desc();
-    auto buffer_size = dst_prim_desc.get_size();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace(), buffer_size);
-    output->set_format((MKLDNNMemoryFormat)dst_prim_desc.desc().data.format);
-    return memory(dst_prim_desc, to_void_cast<T>(output_data));
-  }
-
-  void RecomputeOutputDims(const ExecutionContext& ctx, const LoDTensor* input,
-                           const Tensor* w, LoDTensor* output) {
-    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
-    std::vector<int64_t> output_dims;
-    FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims);
-    output->Resize(framework::make_ddim(output_dims));
-    output->set_lod(input->lod());
-  }
-
- private:
-  const mkldnn::engine& engine_;
-  boost::optional<memory> bias_;
-  boost::optional<memory> input_;
-  boost::optional<memory> output_;
-  boost::optional<memory> weights_;
-  boost::optional<inner_product_forward> fc_;
-};
-
-template <typename T>
-std::shared_ptr<FCPrimitiveFactory<T>> GetPrimitiveFactory(
-    const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx,
-    const Tensor* input, const Tensor* weights,
-    const mkldnn::engine& mkldnn_engine) {
-  const std::string key = platform::CreateKey(
-      input->format(), framework::vectorize<int>(weights->dims()),
-      ctx.op().Output("Out"));
-
-  auto prim_creator =
-      std::static_pointer_cast<FCPrimitiveFactory<T>>(dev_ctx.GetBlob(key));
-  if (prim_creator == nullptr) {
-    prim_creator = std::make_shared<FCPrimitiveFactory<T>>(mkldnn_engine);
-    dev_ctx.SetBlob(key, prim_creator);
-  }
-
-  return prim_creator;
-}
-
-template <typename T>
-class FCMKLDNNOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    auto input = ctx.Input<LoDTensor>("Input");
-    auto w = ctx.Input<Tensor>("W");
-    auto bias = ctx.Input<Tensor>("Bias");
-    auto output = ctx.Output<LoDTensor>("Out");
-
-    auto prim_creator =
-        GetPrimitiveFactory<T>(dev_ctx, ctx, input, w, mkldnn_engine);
-    auto fc = prim_creator->CreateFcPrimitive(input, w, bias, output, ctx);
-    stream(stream::kind::eager).submit({fc}).wait();
-
-    output->set_layout(DataLayout::kMKLDNN);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_KERNEL(fc, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::FCMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
deleted file mode 100644
index d992765ce91b128984a5544d61f5b600ae38ef69..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include "paddle/fluid/operators/mean_op.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-template <typename T>
-class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.Attr<float>("mean");
-    float std = context.Attr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
-    std::normal_distribution<T> dist(mean, std);
-    int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
-    }
-
-    tensor->set_layout(DataLayout::kMKLDNN);
-    tensor->set_format(mkldnn::memory::format::oihw);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::GaussianMKLDNNKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
deleted file mode 100644
index fe1ead8fed6a7dc075be6afb57b815a49e90fb4e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/lrn_op.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using paddle::framework::Tensor;
-using paddle::platform::MKLDNNDeviceContext;
-
-template <typename T>
-class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    const bool is_float_type = std::is_same<T, float>::value;
-    PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data.");
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "MKLDNN LRN must use CPUPlace.");
-
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-
-    auto x = ctx.Input<Tensor>("X");
-    auto out = ctx.Output<Tensor>("Out");
-    auto mid = ctx.Output<Tensor>("MidOut");
-
-    const int n = ctx.Attr<int>("n");
-    // MKL-DNN implements LRN in a caffe way:
-    // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html
-    // Where sum of squares is divided by size of normalization window
-    // this is not the case for PaddlePaddle LRN.
-    // Hence we need to compensate for this diffrence by
-    // multipliing alpha by size of window(n)
-    const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
-    const float beta = ctx.Attr<float>("beta");
-    const float k = ctx.Attr<float>("k");
-    bool is_test = ctx.Attr<bool>("is_test");
-
-    auto dims = paddle::framework::vectorize<int>(x->dims());
-
-    platform::LRNMKLDNNHandler<T> handler(dims, n, alpha, beta, k, x->format(),
-                                          is_test, dev_ctx, ctx.GetPlace(),
-                                          ctx.op().Output("Out"));
-
-    auto src_memory = handler.AcquireSrcMemory(x);
-    auto dst_memory = handler.AcquireDstMemory(out);
-
-    std::shared_ptr<mkldnn::memory> workspace_memory;
-    std::shared_ptr<mkldnn::lrn_forward> lrn_p;
-    if (is_test == false) {
-      workspace_memory = handler.AcquireWorkspaceMemory(mid);
-      lrn_p = handler.AcquireForwardPrimitive(*src_memory, *workspace_memory,
-                                              *dst_memory);
-    } else {
-      // mid has to be allocated and filled
-      // k to pass LRN unit tests
-      // TODO(jczaja): Disable checking mid in unit tests (Require API change)
-      mid->mutable_data<T>(ctx.GetPlace());
-      auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
-      e_mid = e_mid.constant(k);
-      lrn_p = handler.AcquireForwardPrimitive(*src_memory, *dst_memory);
-    }
-
-    std::vector<mkldnn::primitive> pipeline = {*lrn_p};
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-
-    auto output_format =
-        (mkldnn::memory::format)dst_memory->get_primitive_desc()
-            .desc()
-            .data.format;
-
-    out->set_layout(framework::DataLayout::kMKLDNN);
-    out->set_format(output_format);
-  }
-};
-
-template <typename T>
-class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    const bool is_float_type = std::is_same<T, float>::value;
-    PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data.");
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "MKLDNN LRN must use CPUPlace.");
-    PADDLE_ENFORCE(
-        !ctx.Attr<bool>("is_test"),
-        "is_test attribute should be set to False in training phase.");
-
-    auto x = ctx.Input<Tensor>("X");
-    auto mid = ctx.Input<Tensor>("MidOut");
-
-    auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    const int n = ctx.Attr<int>("n");
-    const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
-    const float beta = ctx.Attr<float>("beta");
-    const float k = ctx.Attr<float>("k");
-
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-
-    auto dims = paddle::framework::vectorize<int>(x->dims());
-
-    platform::LRNMKLDNNHandler<T> handler(
-        dims, n, alpha, beta, k, x->format(), out_grad->format(), dev_ctx,
-        ctx.GetPlace(), ctx.op().Input("Out"));
-
-    auto src_memory = handler.AcquireSrcMemory(x);
-    auto workspace = handler.AcquireBackwardWorkspaceMemory(mid);
-    auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad);
-    auto diff_src_memory = handler.AcquireDiffSrcMemory(x_grad);
-
-    auto lrn_bwd = handler.AcquireBackwardPrimitive(
-        *src_memory, *diff_dst_memory, *workspace, *diff_src_memory);
-
-    std::vector<mkldnn::primitive> pipeline = {*lrn_bwd};
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-
-    auto output_format =
-        (mkldnn::memory::format)diff_src_memory->get_primitive_desc()
-            .desc()
-            .data.format;
-
-    x_grad->set_layout(framework::DataLayout::kMKLDNN);
-    x_grad->set_format(output_format);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(lrn, MKLDNN, paddle::platform::CPUPlace,
-                   ops::LRNMKLDNNOpKernel<float>);
-REGISTER_OP_KERNEL(lrn_grad, MKLDNN, paddle::platform::CPUPlace,
-                   ops::LRNMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h
deleted file mode 100644
index 85664623d7330e9473286d995bec67879510dbd7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename Functor>
-class MKLDNNActivationKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(context.Input<framework::Tensor>("X") != nullptr,
-                   "Cannot get input tensor X, variable name = %s",
-                   context.op().Input("X"));
-    PADDLE_ENFORCE(context.Output<framework::Tensor>("Out") != nullptr,
-                   "Cannot find output tensor Out, variable name = %s",
-                   context.op().Output("Out"));
-    Functor functor;
-
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = context.Attr<float>(attr.first);
-    }
-    functor(context);
-  }
-};
-
-template <typename Functor>
-class MKLDNNActivationGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    Functor functor;
-
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = context.Attr<float>(attr.first);
-    }
-    functor(context);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
deleted file mode 100644
index 5c635e58ecbd4e78159cf50c3797eb126cb3ac7e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
+++ /dev/null
@@ -1,426 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/mul_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-using framework::DDim;
-using framework::ExecutionContext;
-using framework::Tensor;
-using mkldnn::inner_product_forward;
-using mkldnn::memory;
-using mkldnn::prop_kind;
-using mkldnn::stream;
-using platform::MKLDNNDeviceContext;
-using platform::to_void_cast;
-
-template <typename XT, typename YT, typename OT>
-class MulPrimitiveFactory {
- public:
-  explicit MulPrimitiveFactory(const mkldnn::engine &engine)
-      : engine_(engine) {}
-
-  virtual ~MulPrimitiveFactory() {}
-
-  virtual inner_product_forward CreateMulPrimitive(
-      const Tensor *input_x, const Tensor *input_y, Tensor *output,
-      const ExecutionContext &ctx) {
-    /* check format and reorder if need */
-    int x_num_col_dims = ctx.Attr<int>("x_num_col_dims");
-    int y_num_col_dims = ctx.Attr<int>("y_num_col_dims");
-
-    auto x_matrix = UpdateDataFormat<XT>(input_x, x_num_col_dims, ctx);
-    auto y_matrix = UpdateDataFormat<YT>(input_y, y_num_col_dims, ctx);
-
-    auto output_dim = output->dims();
-    if (output_dim.size() != 2) {
-      output->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
-    }
-
-    if (mul_) {
-      UpdateDataPointers(ctx, output, &x_matrix);
-      return *mul_;
-    }
-
-    auto src_desc = CreateMemDescriptor<XT>(&x_matrix, MKLDNNMemoryFormat::nc);
-    x_input_ = CreateMemory<XT>(src_desc, &x_matrix);
-    y_input_ = TransposeInputY(&y_matrix);
-    auto dst_desc = CreateMemDescriptor<OT>(output, MKLDNNMemoryFormat::any);
-
-    mul_ = CreateMulPrimitive(*x_input_, *y_input_, dst_desc, output, ctx);
-    return *mul_;
-  }
-
- protected:
-  template <typename T>
-  Tensor UpdateDataFormat(const Tensor *data, int num_col_dims,
-                          const ExecutionContext &ctx) {
-    Tensor x_tmp;
-    Tensor data_matrix;
-    MKLDNNMemoryFormat src_fmt = data->format();
-    MKLDNNMemoryFormat dst_fmt;
-    auto src_mdesc = CreateMemDescriptor<T>(data, src_fmt);
-
-    if ((data->dims().size() == 4 &&
-         src_fmt != (dst_fmt = MKLDNNMemoryFormat::nchw)) ||
-        (data->dims().size() == 5 &&
-         src_fmt != (dst_fmt = MKLDNNMemoryFormat::ncdhw))) {
-      auto dst_mdesc = CreateMemDescriptor<T>(data, dst_fmt);
-      x_tmp.mutable_data<T>(ctx.GetPlace(), data->memory_size());
-
-      Reorder(src_mdesc, dst_mdesc, to_void_cast<T>(data->data<T>()),
-              to_void_cast<T>(x_tmp.data<T>()));
-
-      x_tmp.Resize(data->dims());
-      x_tmp.set_format((MKLDNNMemoryFormat)dst_mdesc.data.format);
-      data_matrix = framework::ReshapeToMatrix(x_tmp, num_col_dims);
-    } else {
-      data_matrix = framework::ReshapeToMatrix(*data, num_col_dims);
-    }
-
-    return data_matrix;
-  }
-
-  void UpdateDataPointers(const ExecutionContext &ctx, Tensor *out,
-                          const Tensor *in) {
-    x_input_->set_data_handle(to_void_cast<XT>(in->data<XT>()));
-    output_->set_data_handle(out->mutable_data<OT>(ctx.GetPlace()));
-
-    if (out->format() == MKLDNNMemoryFormat::format_undef) {
-      auto output_format = output_->get_primitive_desc().desc().data.format;
-      out->set_format((MKLDNNMemoryFormat)output_format);
-    }
-  }
-
-  template <typename T>
-  memory::desc CreateMemDescriptor(
-      const Tensor *tensor, MKLDNNMemoryFormat format,
-      memory::data_type type = platform::MKLDNNGetDataType<T>()) {
-    auto dims = framework::vectorize<int>(tensor->dims());
-    return platform::MKLDNNMemDesc(dims, type, format);
-  }
-
-  template <typename T>
-  memory::desc CreateMemDescriptor(
-      const std::vector<int> &dims, MKLDNNMemoryFormat format,
-      memory::data_type type = platform::MKLDNNGetDataType<T>()) {
-    return platform::MKLDNNMemDesc(dims, type, format);
-  }
-
-  template <typename T>
-  memory CreateMemory(const memory::desc &desc, const Tensor *tensor) {
-    return memory({desc, engine_}, to_void_cast<T>(tensor->data<T>()));
-  }
-
-  memory CreateDstMemory(
-      const inner_product_forward::primitive_desc &mul_prim_desc,
-      const ExecutionContext &ctx, Tensor *output) {
-    auto dst_prim_desc = mul_prim_desc.dst_primitive_desc();
-    auto buffer_size = dst_prim_desc.get_size();
-
-    OT *output_data = output->mutable_data<OT>(ctx.GetPlace(), buffer_size);
-    output->set_format((MKLDNNMemoryFormat)dst_prim_desc.desc().data.format);
-    return memory(dst_prim_desc, to_void_cast<OT>(output_data));
-  }
-
-  memory Reorder(const memory::desc &src_desc, const memory::desc &dst_desc,
-                 void *src_data, void *dst_data = NULL) {
-    auto src_mem = memory({src_desc, engine_}, src_data);
-    auto dst_mem = dst_data ? memory({dst_desc, engine_}, dst_data)
-                            : memory({dst_desc, engine_});
-
-    auto reorder = mkldnn::reorder(src_mem, dst_mem);
-    stream(stream::kind::eager).submit({reorder}).wait();
-
-    return dst_mem;
-  }
-
-  memory TransposeInputY(const Tensor *input_y) {
-    auto dims = framework::vectorize<int>(input_y->dims());
-    std::swap(dims[0], dims[1]);  // Correct output dimensions
-    auto src_desc = CreateMemDescriptor<YT>(dims, MKLDNNMemoryFormat::io);
-    auto dst_desc = CreateMemDescriptor<YT>(dims, MKLDNNMemoryFormat::oi);
-    return Reorder(src_desc, dst_desc, to_void_cast<YT>(input_y->data<YT>()));
-  }
-
-  inner_product_forward CreateMulPrimitive(const memory &x_memory,
-                                           const memory &y_memory,
-                                           const memory::desc &dst_desc,
-                                           Tensor *output,
-                                           const ExecutionContext &ctx) {
-    const auto y_desc = y_memory.get_primitive_desc().desc();
-    const auto x_desc = x_memory.get_primitive_desc().desc();
-
-    auto mul_prim_desc = CreateMulPrimDesc(x_desc, y_desc, dst_desc);
-    output_ = CreateDstMemory(mul_prim_desc, ctx, output);
-
-    return inner_product_forward(mul_prim_desc, x_memory, y_memory, *output_);
-  }
-
-  inner_product_forward::primitive_desc CreateMulPrimDesc(
-      const memory::desc &x_desc, const memory::desc &y_desc,
-      const memory::desc &dst_desc) {
-    auto mul_desc = inner_product_forward::desc(prop_kind::forward, x_desc,
-                                                y_desc, dst_desc);
-
-    return inner_product_forward::primitive_desc(mul_desc, engine_);
-  }
-
- protected:
-  const mkldnn::engine &engine_;
-  boost::optional<memory> x_input_;
-  boost::optional<memory> y_input_;
-  boost::optional<memory> output_;
-  boost::optional<inner_product_forward> mul_;
-};  // namespace operators
-
-template <typename XT, typename YT, typename OT>
-class QuantMulPrimitiveFactory : public MulPrimitiveFactory<XT, YT, OT> {
- public:
-  using MulPrimitiveFactory<XT, YT, OT>::MulPrimitiveFactory;
-
-  virtual inner_product_forward CreateMulPrimitive(
-      const Tensor *x_input, const Tensor *y_input, Tensor *output,
-      const ExecutionContext &ctx) {
-    /* check data format and reorder if need */
-    int x_num_col_dims = ctx.Attr<int>("x_num_col_dims");
-    int y_num_col_dims = ctx.Attr<int>("y_num_col_dims");
-    auto scale_y = ctx.Attr<std::vector<float>>("scale_y");
-
-    // TODO(intel-minghui) : Remove the restriction that only supports Input(Y)
-    // as weights
-    bool enforce = std::is_same<YT, float>::value;
-    PADDLE_ENFORCE(
-        enforce == true,
-        "Input(Y) supposed to be fp32 data type since only fp32 data type is "
-        "supported in the current design of MKLDNN INT8.");
-
-    auto x_matrix =
-        this->template UpdateDataFormat<XT>(x_input, x_num_col_dims, ctx);
-    auto y_matrix =
-        this->template UpdateDataFormat<YT>(y_input, y_num_col_dims, ctx);
-
-    auto output_dim = output->dims();
-    if (output_dim.size() != 2) {
-      output->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
-    }
-
-    if (this->mul_) {
-      this->UpdateDataPointers(ctx, output, &x_matrix);
-      return *(this->mul_);
-    }
-
-    auto src_desc = this->template CreateMemDescriptor<XT>(
-        &x_matrix, MKLDNNMemoryFormat::nc);
-    this->x_input_ = this->template CreateMemory<XT>(src_desc, &x_matrix);
-
-    const auto trans_y = this->TransposeInputY(&y_matrix);
-    this->y_input_ = QuantInputY(trans_y, scale_y);
-
-    auto dst_desc =
-        this->template CreateMemDescriptor<OT>(output, MKLDNNMemoryFormat::any);
-
-    this->mul_ = CreateMulPrimitive(*(this->x_input_), *(this->y_input_),
-                                    dst_desc, output, ctx);
-    return *(this->mul_);
-  }
-
-  memory ReorderWithScale(const memory::desc &src_desc,
-                          const memory::desc &dst_desc, void *src_data,
-                          const std::vector<float> &scale) {
-    auto mask = scale.size() > 1 ? 1 : 0;
-    mkldnn::primitive_attr attr;
-    attr.set_output_scales(mask, scale);
-
-    auto src_mem = memory({src_desc, this->engine_}, src_data);
-    auto dst_mem = memory({dst_desc, this->engine_});
-
-    auto reorder_pd = mkldnn::reorder::primitive_desc(
-        src_mem.get_primitive_desc(), dst_mem.get_primitive_desc(), attr);
-
-    auto reorder = mkldnn::reorder(reorder_pd, src_mem, dst_mem);
-    stream(stream::kind::eager).submit({reorder}).wait();
-
-    return dst_mem;
-  }
-
-  memory QuantInputY(memory input_y, const std::vector<float> &scale_y) {
-    const auto &dims = input_y.get_primitive_desc().desc().data.dims;
-    auto ndims = input_y.get_primitive_desc().desc().data.ndims;
-    auto y_dims = std::vector<int>(dims, dims + ndims);
-
-    auto user_y_desc =
-        this->template CreateMemDescriptor<YT>(y_dims, MKLDNNMemoryFormat::oi);
-    auto y_desc = this->template CreateMemDescriptor<int8_t>(
-        y_dims, MKLDNNMemoryFormat::oi);
-
-    return ReorderWithScale(user_y_desc, y_desc, input_y.get_data_handle(),
-                            scale_y);
-  }
-
-  mkldnn::primitive_attr CreateMulAttr(const ExecutionContext &ctx,
-                                       bool force_fp32_output) {
-    mkldnn::primitive_attr mul_attr;
-
-    auto scale_y_data = ctx.Attr<std::vector<float>>("scale_y");
-    auto scale_x_data = ctx.Attr<float>("scale_x");
-    auto scale_out_data =
-        force_fp32_output ? 1.0f : ctx.Attr<float>("scale_out");
-
-    bool is_multi_channel = scale_y_data.size() > 1;
-    int count = is_multi_channel ? scale_y_data.size() : 1;
-    std::vector<float> output_shift_scale(count);
-    for (int i = 0; i < count; i++) {
-      if (scale_y_data[i] == 0.0)
-        output_shift_scale[i] = scale_out_data;
-      else
-        output_shift_scale[i] =
-            scale_out_data / (scale_x_data * scale_y_data[i]);
-    }
-    int mul_mask = is_multi_channel ? 1 : 0;
-    mul_attr.set_output_scales(mul_mask, output_shift_scale);
-
-    return mul_attr;
-  }
-
-  inner_product_forward CreateMulPrimitive(const memory &x_memory,
-                                           const memory &y_memory,
-                                           const memory::desc &dst_desc,
-                                           Tensor *output,
-                                           const ExecutionContext &ctx) {
-    const auto x_desc = x_memory.get_primitive_desc().desc();
-    const auto y_desc = y_memory.get_primitive_desc().desc();
-    bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
-
-    mkldnn::primitive_attr mul_attr = CreateMulAttr(ctx, force_fp32_output);
-    auto mul_prim_desc = CreateMulPrimDesc(x_desc, y_desc, dst_desc, mul_attr);
-
-    this->output_ = this->CreateDstMemory(mul_prim_desc, ctx, output);
-
-    return inner_product_forward(mul_prim_desc, x_memory, y_memory,
-                                 *(this->output_));
-  }
-
-  inner_product_forward::primitive_desc CreateMulPrimDesc(
-      const memory::desc &x_desc, const memory::desc &y_desc,
-      const memory::desc &dst_desc, const mkldnn::primitive_attr &mul_attr) {
-    const auto &mul_desc = inner_product_forward::desc(
-        prop_kind::forward, x_desc, y_desc, dst_desc);
-
-    return inner_product_forward::primitive_desc(mul_desc, mul_attr,
-                                                 this->engine_);
-  }
-};
-
-/* OT: output data type */
-template <typename XT, typename YT, typename OT>
-std::shared_ptr<MulPrimitiveFactory<XT, YT, OT>> GetPrimitiveFactory(
-    const MKLDNNDeviceContext &dev_ctx, const ExecutionContext &ctx,
-    const Tensor *input_x, const Tensor *input_y,
-    const mkldnn::engine &mkldnn_engine, bool enable_quant) {
-  const std::string key = platform::CreateKey(
-      input_x->format(), input_x->type(),
-      framework::vectorize<int>(input_x->dims()), input_y->format(),
-      input_y->type(), framework::vectorize<int>(input_y->dims()),
-      ctx.op().Output("Out"));
-
-  auto prim_creator = std::static_pointer_cast<MulPrimitiveFactory<XT, YT, OT>>(
-      dev_ctx.GetBlob(key));
-
-  if (prim_creator == nullptr) {
-    prim_creator =
-        enable_quant
-            ? std::make_shared<QuantMulPrimitiveFactory<XT, YT, OT>>(
-                  mkldnn_engine)
-            : std::make_shared<MulPrimitiveFactory<XT, YT, OT>>(mkldnn_engine);
-    dev_ctx.SetBlob(key, prim_creator);
-  }
-
-  return prim_creator;
-}
-
-template <typename XT, typename YT>
-inner_product_forward GetMulPrimitive(const MKLDNNDeviceContext &dev_ctx,
-                                      const ExecutionContext &ctx,
-                                      const Tensor *input_x,
-                                      const Tensor *input_y, Tensor *output,
-                                      const mkldnn::engine &mkldnn_engine) {
-  bool enable_quant =
-      std::is_same<XT, int8_t>::value || std::is_same<XT, uint8_t>::value;
-  bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
-
-  if (enable_quant && !force_fp32_output) {
-    return GetPrimitiveFactory<XT, YT, int8_t>(dev_ctx, ctx, input_x, input_y,
-                                               mkldnn_engine, enable_quant)
-        ->CreateMulPrimitive(input_x, input_y, output, ctx);
-
-  } else {
-    return GetPrimitiveFactory<XT, YT, float>(dev_ctx, ctx, input_x, input_y,
-                                              mkldnn_engine, enable_quant)
-        ->CreateMulPrimitive(input_x, input_y, output, ctx);
-  }
-}
-
-/* XT: input x data type, YT: input y data type */
-template <typename XT, typename YT>
-class MulMKLDNNKernel : public framework::OpKernel<XT> {
- public:
-  void Compute(const ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    const auto &mkldnn_engine = dev_ctx.GetEngine();
-
-    const Tensor *x = ctx.Input<Tensor>("X");
-    const Tensor *y = ctx.Input<Tensor>("Y");
-    Tensor *out = ctx.Output<Tensor>("Out");
-    auto out_dims = out->dims();
-
-    auto mul = GetMulPrimitive<XT, YT>(dev_ctx, ctx, x, y, out, mkldnn_engine);
-
-    stream(stream::kind::eager).submit({mul}).wait();
-
-    if (out_dims.size() != 2) {
-      out->Resize(out_dims);
-    }
-    out->set_layout(DataLayout::kMKLDNN);
-    out->set_format(platform::MKLDNNFormatForSize(
-        out_dims.size(), mkldnn::memory::format::nchw));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace,
-                                    U8, ops::kMULMKLDNNINT8,
-                                    ops::MulMKLDNNKernel<uint8_t, float>);
-
-REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace,
-                                    S8, ops::kMULMKLDNNINT8,
-                                    ops::MulMKLDNNKernel<int8_t, float>);
-
-REGISTER_OP_KERNEL(mul, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::MulMKLDNNKernel<uint8_t, float>);
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
deleted file mode 100644
index 83e9cfd90a8515c8fd15842c114e6b2c59f45d18..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/operators/pool_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-using mkldnn::memory;
-using mkldnn::pooling_backward;
-using mkldnn::pooling_forward;
-using mkldnn::primitive;
-using mkldnn::reorder;
-using mkldnn::stream;
-using platform::to_void_cast;
-
-template <typename T>
-class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-    auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-
-    const Tensor* input = ctx.Input<Tensor>("X");
-    Tensor* output = ctx.Output<Tensor>("Out");
-
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-
-    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-
-    if (ctx.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(input->dims()[i + 2]);
-      }
-    }
-
-    // Only 2D pooling is supported now
-    PADDLE_ENFORCE(ksize.size() == 2, "ksize must be 2D, i.e. 2D pooling");
-    PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg",
-                   "pooling_type must be 'max' or 'avg'");
-    PADDLE_ENFORCE(input->dims().size() == 4,
-                   "Input dim must be with 4, i.e. NCHW");
-
-    auto src_tz = paddle::framework::vectorize<int>(input->dims());
-    auto dst_tz = paddle::framework::vectorize<int>(output->dims());
-
-    auto is_test = ctx.Attr<bool>("is_test");
-
-    platform::PoolingMKLDNNHandler<T> handler(
-        src_tz, dst_tz, ksize, strides, paddings, pooling_type,
-        ctx.Attr<bool>("ceil_mode"), input->format(),
-        paddle::framework::ToMKLDNNDataType(input->type()), is_test, dev_ctx,
-        ctx.GetPlace(), ctx.op().Output("Out"));
-
-    auto src_memory = handler.AcquireSrcMemory(input);
-    auto dst_memory = handler.AcquireDstMemory(output);
-
-    std::shared_ptr<mkldnn::pooling_forward> pool_p;
-    std::shared_ptr<mkldnn::memory> workspace_memory;
-    if ((is_test == false) && (pooling_type == "max")) {
-      // Training
-      workspace_memory = handler.AcquireWorkspaceMemory();
-      pool_p = handler.AcquireForwardPrimitive(*src_memory, *dst_memory,
-                                               *workspace_memory);
-    } else {
-      // Inference
-      pool_p = handler.AcquireForwardPrimitive(*src_memory, *dst_memory);
-    }
-
-    // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{*pool_p};
-    stream(stream::kind::eager).submit(pipeline).wait();
-
-    auto output_format =
-        (MKLDNNMemoryFormat)dst_memory->get_primitive_desc().desc().data.format;
-
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(output_format);
-  }
-};
-
-template <typename T>
-class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    const Tensor* in_x = ctx.Input<Tensor>("X");
-    const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-
-    PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input output_grad tensor");
-    PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input output_grad tensor");
-
-    PADDLE_ENFORCE_EQ(
-        ctx.Attr<bool>("is_test"), false,
-        "is_test attribute should be set to False in training phase.");
-
-    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-
-    if (ctx.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
-      }
-    }
-
-    auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-
-    std::vector<mkldnn::primitive> pipeline;
-
-    auto diff_src_tz = paddle::framework::vectorize<int>(in_x_grad->dims());
-    auto diff_dst_tz = paddle::framework::vectorize<int>(out_grad->dims());
-
-    // Get an unique name from "argument" name of "Out" variable
-    // This name will be used as key when referring info from device context
-    const std::string key = platform::CreateKey(
-        diff_src_tz, pooling_type, ksize, strides, paddings,
-        memory::data_type::f32, in_x->format(), ctx.op().Input("Out"));
-
-    platform::PoolingMKLDNNHandler<T> handler(
-        diff_dst_tz, diff_src_tz, ksize, strides, paddings, pooling_type,
-        ctx.Attr<bool>("ceil_mode"), in_x->format(), out_grad->format(),
-        paddle::framework::ToMKLDNNDataType(out_grad->type()), dev_ctx,
-        ctx.GetPlace(), ctx.op().Input("Out"));
-
-    auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad);
-    auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad);
-
-    std::shared_ptr<mkldnn::pooling_backward> pool_bwd_p;
-    std::shared_ptr<mkldnn::memory> workspace_memory;
-    if (pooling_type == "max") {
-      // Max - pooling needs Workspace
-      workspace_memory = handler.AcquireWorkspaceMemory();
-      pool_bwd_p = handler.AcquireBackwardPrimitive(
-          *diff_dst_memory, *workspace_memory, *diff_src_memory);
-    } else {
-      // Average Pooling
-      pool_bwd_p =
-          handler.AcquireBackwardPrimitive(*diff_dst_memory, *diff_src_memory);
-    }
-
-    pipeline.push_back(*pool_bwd_p);
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-
-    auto in_x_grad_format =
-        (MKLDNNMemoryFormat)diff_src_memory->get_primitive_desc()
-            .desc()
-            .data.format;
-    in_x_grad->set_layout(DataLayout::kMKLDNN);
-    in_x_grad->set_format(in_x_grad_format);
-  }  // Compute()
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::PoolMKLDNNOpKernel<float>,
-                   ops::PoolMKLDNNOpKernel<int8_t>,
-                   ops::PoolMKLDNNOpKernel<uint8_t>);
-
-REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::PoolMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
deleted file mode 100644
index 788e3f279318f34f85e9b1f006595beb24bdd88e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "mkldnn.hpp"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/quantize_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using mkldnn::memory;
-using mkldnn::primitive;
-using mkldnn::reorder;
-using platform::to_void_cast;
-using Tensor = framework::Tensor;
-using framework::DataLayout;
-using mkldnn::stream;
-using platform::GetMKLDNNFormat;
-
-template <typename T>
-class QuantOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("Input");
-    auto scale_data = ctx.Attr<float>("Scale");
-    auto* output = ctx.Output<Tensor>("Output");
-    auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    const auto& engine = dev_ctx.GetEngine();
-
-    std::vector<primitive> pipeline;
-    auto src_tz = paddle::framework::vectorize<int>(input->dims());
-    auto dst_tz = paddle::framework::vectorize<int>(output->dims());
-
-    const T* input_data = input->data<T>();
-
-    bool is_negative = ctx.Attr<bool>("is_negative_input");
-    std::string key = platform::CreateKey(src_tz, scale_data, is_negative,
-                                          ctx.op().Output("Output"));
-    const std::string key_prim = key + "@reorder_p";
-    const std::string key_src_mem = key + "@src_mem";
-    const std::string key_dst_mem = key + "@dst_mem";
-
-    std::shared_ptr<mkldnn::memory> src_memory;
-    std::shared_ptr<mkldnn::memory> dst_memory;
-    std::shared_ptr<reorder> reorder_p;
-    reorder_p = std::static_pointer_cast<reorder>(dev_ctx.GetBlob(key_prim));
-
-    if (reorder_p == nullptr) {
-      mkldnn::primitive_attr attri;
-      int mask = 0;
-      attri.set_output_scales(mask, {scale_data});
-
-      auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
-                                            input->format());
-      auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
-      src_memory =
-          std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
-      std::shared_ptr<primitive::at> src_memory_p =
-          std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
-
-      std::shared_ptr<mkldnn::memory::primitive_desc> dst_pd;
-      if (is_negative) {
-        platform::SetDstMemoryQuantized<int8_t>(ctx, output, dst_tz, engine,
-                                                dst_pd, dst_memory);
-      } else {
-        platform::SetDstMemoryQuantized<uint8_t>(ctx, output, dst_tz, engine,
-                                                 dst_pd, dst_memory);
-      }
-      auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
-          new reorder::primitive_desc(src_pd, *dst_pd, attri));
-      reorder_p = std::shared_ptr<reorder>(
-          new reorder(*reorder_pd, *src_memory_p, *dst_memory));
-
-      dev_ctx.SetBlob(key_prim, reorder_p);
-      dev_ctx.SetBlob(key_src_mem, src_memory);
-      dev_ctx.SetBlob(key_dst_mem, dst_memory);
-    } else {
-      src_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_src_mem));
-      src_memory->set_data_handle(to_void_cast<T>(input_data));
-
-      dst_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_dst_mem));
-      auto place = ctx.GetPlace();
-      if (is_negative) {
-        dst_memory->set_data_handle(output->mutable_data<int8_t>(place));
-      } else {
-        dst_memory->set_data_handle(output->mutable_data<uint8_t>(place));
-      }
-    }
-
-    pipeline.push_back(*reorder_p);
-    stream(stream::kind::eager).submit(pipeline).wait();
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(*dst_memory));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(quantize, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::QuantOpKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
deleted file mode 100644
index a5e1e5041fb1e4503cd1aa36b8785be3148a7e75..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "mkldnn.hpp"
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/requantize_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using mkldnn::memory;
-using mkldnn::primitive;
-using mkldnn::reorder;
-using platform::to_void_cast;
-using Tensor = framework::Tensor;
-using framework::DataLayout;
-using mkldnn::stream;
-using platform::GetMKLDNNFormat;
-
-template <typename T>
-class ReQuantOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("Input");
-    auto scale_in = ctx.Attr<float>("Scale_in");
-    auto scale_out = ctx.Attr<float>("Scale_out");
-    auto* output = ctx.Output<Tensor>("Output");
-    auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    const auto& engine = dev_ctx.GetEngine();
-
-    std::vector<primitive> pipeline;
-    auto src_tz = paddle::framework::vectorize<int>(input->dims());
-    auto dst_tz = paddle::framework::vectorize<int>(output->dims());
-    mkldnn::memory::data_type src_dt =
-        paddle::framework::ToMKLDNNDataType(input->type());
-    mkldnn::memory::data_type dst_dt = src_dt;
-    MKLDNNMemoryFormat src_fmt = MKLDNNMemoryFormat::nhwc;
-    MKLDNNMemoryFormat dst_fmt = MKLDNNMemoryFormat::nhwc;
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    float scale_shift = scale_out / scale_in;
-
-    mkldnn::primitive_attr attri;
-    int mask = 0;
-    attri.set_output_scales(mask, {scale_shift});
-
-    auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
-    auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
-    auto src_memory =
-        std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
-    std::shared_ptr<primitive::at> src_memory_p =
-        std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
-
-    auto dst_md = platform::MKLDNNMemDesc({dst_tz}, dst_dt, dst_fmt);
-    auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
-    auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<T>(output_data));
-
-    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
-        new reorder::primitive_desc(src_pd, dst_pd, attri));
-
-    auto reorder_p = std::shared_ptr<reorder>(
-        new reorder(*reorder_pd, *src_memory_p, dst_memory));
-    pipeline.push_back(*reorder_p);
-    stream(stream::kind::eager).submit(pipeline).wait();
-
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(dst_memory));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(requantize, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ReQuantOpKernel<int8_t>, ops::ReQuantOpKernel<uint8_t>);
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
deleted file mode 100644
index 690f9271fb7cb17032ef56d4904855a0ec115e6a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ /dev/null
@@ -1,167 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <iostream>
-#include <numeric>
-#include "mkldnn.hpp"
-#include "paddle/fluid/operators/softmax_op.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using paddle::framework::Tensor;
-using paddle::platform::MKLDNNDeviceContext;
-using paddle::platform::MKLDNNMemDesc;
-
-using mkldnn::memory;  // Note: paddle has also "memory" namespace
-using mkldnn::primitive;
-using mkldnn::prop_kind;
-using mkldnn::softmax_backward;
-using mkldnn::softmax_forward;
-using mkldnn::stream;
-using platform::to_void_cast;
-
-template <typename T>
-class SoftmaxMKLDNNHandler
-    : public platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
-                                      mkldnn::softmax_backward> {
- public:
-  SoftmaxMKLDNNHandler(const std::vector<int>& dims,
-                       const MKLDNNMemoryFormat fmt, const int& axis,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
-                       platform::Place cpu_place, const std::string& uniq_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
-                                 mkldnn::softmax_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, axis, uniq_name)) {
-    auto md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
-
-    this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md,
-                                            axis);
-  }
-
-  SoftmaxMKLDNNHandler(const std::vector<int>& dims,
-                       const MKLDNNMemoryFormat fmt,
-                       const MKLDNNMemoryFormat diff_fmt, const int& axis,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
-                       platform::Place cpu_place, const std::string& uniq_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
-                                 mkldnn::softmax_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, axis, uniq_name)) {
-    auto data_softmax_md =
-        mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
-    auto diff_softmax_md =
-        mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), diff_fmt);
-
-    this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md,
-                                             axis);
-  }
-};
-
-template <typename T>
-class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    const Tensor* input = ctx.Input<Tensor>("X");
-    Tensor* output = ctx.Output<Tensor>("Out");
-    PADDLE_ENFORCE_EQ(
-        input->dims(), output->dims(),
-        "The shape of softmax's input and output must be identical.");
-
-    auto dims = input->dims();  // input and output share the same shape
-    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());
-
-    auto softmax_tz = paddle::framework::vectorize<int>(dims);
-
-    SoftmaxMKLDNNHandler<T> handler(softmax_tz, input->format(), axis, dev_ctx,
-                                    ctx.GetPlace(), ctx.op().Output("Out"));
-
-    auto softmax_src_memory_p = handler.AcquireSrcMemory(input);
-    auto softmax_dst_memory_p = handler.AcquireDstMemory(output);
-    auto softmax_p = handler.AcquireForwardPrimitive(*softmax_src_memory_p,
-                                                     *softmax_dst_memory_p);
-
-    std::vector<primitive> pipeline{*softmax_p};
-    stream(stream::kind::eager).submit(pipeline).wait();
-
-    const bool is_test = ctx.Attr<bool>("is_test");
-    if (!is_test) {
-      T* output_data = output->mutable_data<T>(ctx.GetPlace());
-      int size = std::accumulate(begin(softmax_tz), end(softmax_tz), 1,
-                                 std::multiplies<int>());
-      std::for_each(output_data, &output_data[size], [](T& val) {
-        val = std::max(val, static_cast<T>(exp(-64)));
-      });
-    }
-
-    output->set_layout(framework::DataLayout::kMKLDNN);
-    // Softmax output format is the same as input one
-    output->set_format(input->format());
-  }
-};
-
-template <typename T>
-class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    const Tensor* output = ctx.Input<Tensor>("Out");
-    auto* dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx =
-        ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
-
-    PADDLE_ENFORCE_EQ(
-        dout->dims(), dx->dims(),
-        "The shape of softmax_grad's input and output must be identical.");
-
-    auto dims = dout->dims();  // input and output share the same shape
-    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());
-
-    std::vector<int> softmax_tz = paddle::framework::vectorize<int>(dims);
-
-    SoftmaxMKLDNNHandler<T> handler(softmax_tz, output->format(),
-                                    dout->format(), axis, dev_ctx,
-                                    ctx.GetPlace(), ctx.op().Input("Out"));
-
-    auto dst_memory_p = handler.AcquireDstMemory(output);
-    auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout);
-    auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx);
-
-    auto softmax_bwd_p = handler.AcquireBackwardPrimitive(
-        *dst_memory_p, *diff_dst_memory_p, *diff_src_memory_p);
-
-    std::vector<primitive> pipeline{*softmax_bwd_p};
-    stream(stream::kind::eager).submit(pipeline).wait();
-
-    dx->set_layout(framework::DataLayout::kMKLDNN);
-    dx->set_format(dout->format());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::SoftmaxMKLDNNKernel<float>);
-REGISTER_OP_KERNEL(softmax_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::SoftmaxMKLDNNGradKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
deleted file mode 100644
index 1a8e9d6911dc4756f8c7d3338d58c6893caa47bd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*Licensed under the Apache License, Version 2.0(the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License. */
-
-#include "mkldnn.hpp"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/operators/sum_op.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::DataLayout;
-using mkldnn::memory;
-using mkldnn::primitive;
-using mkldnn::reorder;
-using mkldnn::stream;
-using mkldnn::sum;
-using paddle::framework::Tensor;
-using paddle::platform::CPUDeviceContext;
-using paddle::platform::MKLDNNDeviceContext;
-using platform::to_void_cast;
-
-template <typename T>
-class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-    auto in_vars = ctx.MultiInputVar("X");
-
-    const int N = in_vars.size();
-    auto out_var = ctx.OutputVar("Out");
-    bool in_place = out_var == in_vars[0];
-
-    if (out_var->IsType<framework::LoDTensor>()) {
-      LoDTensor* output = ctx.Output<LoDTensor>("Out");
-      T* output_data = output->mutable_data<T>(ctx.GetPlace());
-
-      auto dst_tz = framework::vectorize<int>(output->dims());
-      auto src_tz = dst_tz;
-      MKLDNNMemoryFormat output_format{MKLDNNMemoryFormat::format_undef};
-      std::vector<float> scales;
-      std::vector<memory::primitive_desc> srcs_mpd;
-      std::vector<mkldnn::memory> srcs_mem;
-
-      PADDLE_ENFORCE_EQ(in_vars[0]->IsType<LoDTensor>(), true,
-                        "Input[0] must be LoDTensors");
-      auto& input0 = in_vars[0]->Get<LoDTensor>();
-      PADDLE_ENFORCE_EQ(input0.layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for inputs[0] tensor");
-      PADDLE_ENFORCE_NE(input0.format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for inputs[0] tensor");
-
-      MKLDNNMemoryFormat input_format = input0.format();
-
-      for (int i = 0; i < N; i++) {
-        PADDLE_ENFORCE_EQ(in_vars[i]->IsType<LoDTensor>(), true,
-                          "all inputs must be all LoDTensors");
-        auto& input = in_vars[i]->Get<LoDTensor>();
-        PADDLE_ENFORCE_EQ(input.layout(), DataLayout::kMKLDNN,
-                          "Wrong layout set for inputs");
-        PADDLE_ENFORCE_NE(input.format(), MKLDNNMemoryFormat::format_undef,
-                          "Wrong format set for inputs");
-
-        if (input.numel() == 0) {
-          continue;
-        }
-
-        const T* input_data = input.data<T>();
-
-        auto src_md =
-            memory::desc(src_tz, memory::data_type::f32, input_format);
-        auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine);
-        auto src_mem = memory(src_mpd, to_void_cast(input_data));
-        srcs_mpd.push_back(src_mpd);
-        srcs_mem.push_back(src_mem);
-        scales.push_back(1.0);
-      }
-
-      auto dst_md =
-          memory::desc(dst_tz, memory::data_type::f32, MKLDNNMemoryFormat::any);
-
-      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
-
-      std::shared_ptr<memory> dst_mem;
-      if (in_place) {
-        dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
-      } else {
-        dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
-      }
-      std::vector<mkldnn::primitive::at> inputs;
-      for (size_t i = 0; i < srcs_mem.size(); ++i) {
-        inputs.push_back(srcs_mem[i]);
-      }
-
-      auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem);
-      output_format = (MKLDNNMemoryFormat)platform::GetMKLDNNFormat(sum_pd);
-
-      primitive reorder_prim;
-      std::shared_ptr<memory> target_mem;
-      if (in_place) {
-        output_format = input_format;
-        target_mem.reset(new memory(
-            {{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine},
-            output_data));
-        reorder_prim = reorder(*dst_mem, *target_mem);
-      }
-
-      std::vector<primitive> pipeline;
-      pipeline.push_back(sum_prim);
-      if (in_place) pipeline.push_back(reorder_prim);
-      stream(stream::kind::eager).submit(pipeline).wait();
-
-      output->set_layout(DataLayout::kMKLDNN);
-      output->set_format(output_format);
-    } else {  // Fallback to naive version
-      SumKernel<CPUDeviceContext, T> reference_kernel;
-      reference_kernel.Compute(ctx);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::SumMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
deleted file mode 100644
index bcf919fadcf5e164136c57f3cd640bea288bda1f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using framework::DataLayout;
-
-template <typename T>
-class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
-    int ndims = axis.size();
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    const T* input_data = input->data<T>();
-
-    if (ndims == 1) {
-      output->ShareDataWith(*input);
-      return;
-    }
-
-    auto nchw_tz = paddle::framework::vectorize<int>(input->dims());
-
-    const std::string key =
-        platform::CreateKey(nchw_tz, axis, ctx.op().Output("Out") +
-                                               std::to_string(input->format()));
-
-    platform::TransposeMKLDNNHandler handler(nchw_tz, axis, dev_ctx,
-                                             mkldnn_engine, key);
-
-    auto transpose_src_memory_p = handler.AcquireSrcMemory(
-        input->format(), platform::to_void_cast<T>(input_data));
-    auto transpose_dst_memory_p =
-        handler.AcquireDstMemory(output, ctx.GetPlace());
-    auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
-                                                transpose_src_memory_p);
-
-    std::vector<mkldnn::primitive> pipeline;
-    pipeline.push_back(*transpose_p);
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-
-    output->set_layout(DataLayout::kNCHW);
-    output->set_format(MKLDNNMemoryFormat::format_undef);
-  }
-};
-
-template <typename T>
-class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    if (!x_grad) return;
-
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
-    std::vector<int> reversed_axis(axis);
-    int ndims = axis.size();
-    if (ndims == 1) {
-      x_grad->ShareDataWith(*out_grad);
-      return;
-    }
-
-    for (size_t i = 0; i < axis.size(); i++) {
-      reversed_axis[axis[i]] = i;
-    }
-
-    const T* out_grad_data = out_grad->data<T>();
-    x_grad->mutable_data<T>(ctx.GetPlace());
-
-    auto nchw_tz = paddle::framework::vectorize<int>(out_grad->dims());
-
-    const std::string key = platform::CreateKey(
-        nchw_tz, axis, ctx.op().Output(framework::GradVarName("X")));
-
-    platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx,
-                                             mkldnn_engine, key);
-
-    auto transpose_src_memory_p = handler.AcquireSrcMemory(
-        out_grad->format(), platform::to_void_cast<T>(out_grad_data));
-    auto transpose_dst_memory_p =
-        handler.AcquireDstMemory(x_grad, ctx.GetPlace());
-    auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
-                                                transpose_src_memory_p);
-
-    std::vector<mkldnn::primitive> pipeline;
-    pipeline.push_back(*transpose_p);
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::TransposeMKLDNNOpKernel<float>);
-
-REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::TransposeMKLDNNOpKernel<float>);
-
-REGISTER_OP_KERNEL(transpose_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::TransposeMKLDNNGradOpKernel<float>);
-REGISTER_OP_KERNEL(transpose2_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::TransposeMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
deleted file mode 100644
index 14d75aee754bc3d5b951a4f53a34ea8661c08cca..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/modified_huber_loss_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ModifiedHuberLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The tensor rank of X must be 2.");
-    if (ctx->IsRuntime() ||
-        (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) {
-      PADDLE_ENFORCE_EQ(x_dims, y_dims,
-                        "The shape of X and Y must be the same.");
-    }
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1.");
-    }
-
-    ctx->SetOutputDim("IntermediateVal", x_dims);
-    ctx->SetOutputDim("Out", {x_dims[0], 1});
-  }
-};
-
-class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of modified huber loss op. "
-             "X is 2-D tensor with shape [batch_size, 1].");
-    AddInput("Y",
-             "The target labels of modified huber loss op. "
-             "The shape of Y is the same as X. Values of Y must be 0 or 1.");
-    AddOutput("IntermediateVal",
-              "Variable to save intermediate result which will be reused in "
-              "backward processing.")
-        .AsIntermediate();
-    AddOutput("Out", "Classification loss for X.");
-    AddComment(R"DOC(
-Modified Huber Loss Operator.
-
-This operator is used in binary classification problem. The shape of
-input X and target Y are both [N, 1] and so is the shape of the output loss.
-Since target Y is not differentiable, calculating gradient for Y is illegal.
-The formula of modified huber loss is:
-
-$$
-L(y, f(x)) = 
-\begin{cases}
-(\max(0, 1 - yf(x)))^2,  \text{if} \  yf(x) >= -1    \\
-             -4yf(x),    \quad \text{otherwise}
-\end{cases}
-$$
-
-Make sure the values of target label Y are in {0, 1} here. This operator will
-scale values of Y to {-1, +1} when computing losses and gradients.
-
-)DOC");
-  }
-};
-
-class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
-    PADDLE_ENFORCE(ctx->HasInput("IntermediateVal"),
-                   "Intermediate value must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@Grad) must not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
-    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          intermediate_dims, x_dims,
-          "The shape of X and intermediate value must be the same.");
-      PADDLE_ENFORCE_EQ(out_grad_dims, x_dims,
-                        "The shape of Input(Out@Grad) and X must be the same.");
-    }
-
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(modified_huber_loss, ops::ModifiedHuberLossOp,
-                  ops::ModifiedHuberLossOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    modified_huber_loss,
-    ops::ModifiedHuberLossKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad,
-                       ops::ModifiedHuberLossGradCPUKernel<float>);
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu
deleted file mode 100644
index 71bfacb9283850d4fb2939a6380594fcf1c0cfbb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/modified_huber_loss_op.cu
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <thrust/for_each.h>
-#include <thrust/tuple.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/modified_huber_loss_op.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-struct ModifiedHuberLossBackward {
-  template <typename Tuple>
-  HOSTDEVICE void operator()(Tuple t) const {
-    auto inter_val = thrust::get<1>(t);
-    auto y_val = thrust::get<2>(t);
-    auto out_grad = thrust::get<3>(t);
-    if (inter_val < -1) {
-      thrust::get<0>(t) = -4 * (2 * y_val - 1) * out_grad;
-    } else if (inter_val < 1) {
-      thrust::get<0>(t) = -2 * (1 - inter_val) * (2 * y_val - 1) * out_grad;
-    } else {
-      thrust::get<0>(t) = 0;
-    }
-  }
-};
-
-template <typename T>
-class ModifiedHuberLossGradGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("Y");
-    auto* in1 = context.Input<Tensor>("IntermediateVal");
-    auto* in2 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-
-    if (out0) {
-      auto counts = framework::product(in1->dims());
-      auto y_ptr = thrust::device_pointer_cast(in0->data<T>());
-      auto inter_val_ptr = thrust::device_pointer_cast(in1->data<T>());
-      auto out_grad_ptr = thrust::device_pointer_cast(in2->data<T>());
-      thrust::device_ptr<T> x_grad_ptr(
-          out0->mutable_data<T>(context.GetPlace()));
-
-      auto iter_begin = thrust::make_zip_iterator(
-          thrust::make_tuple(x_grad_ptr, inter_val_ptr, y_ptr, out_grad_ptr));
-
-      auto iter_end = thrust::make_zip_iterator(
-          thrust::make_tuple(x_grad_ptr + counts, inter_val_ptr + counts,
-                             y_ptr + counts, out_grad_ptr + counts));
-
-      thrust::for_each(iter_begin, iter_end, ModifiedHuberLossBackward());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    modified_huber_loss,
-    ops::ModifiedHuberLossKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad,
-                        ops::ModifiedHuberLossGradGPUKernel<float>);
diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h
deleted file mode 100644
index d6dd5539af018c171c7ae00b945f7cfc858b7903..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/modified_huber_loss_op.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename T>
-struct CheckLabelValue {
-  HOSTDEVICE T operator()(const T& val) const {
-    PADDLE_ENFORCE(val == static_cast<T>(0) || val == static_cast<T>(1),
-                   "LabelValue of modified_huber_loss_op expected to be 0 "
-                   "or 1, but got %ld. Please check input value.",
-                   val);
-  }
-};
-
-template <typename T>
-struct ModifiedHuberLossForward {
-  HOSTDEVICE T operator()(const T& val) const {
-    if (val < -1) {
-      return -4 * val;
-    } else if (val < 1) {
-      return (1 - val) * (1 - val);
-    } else {
-      return static_cast<T>(0);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ModifiedHuberLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("X");
-    auto* in1 = context.Input<Tensor>("Y");
-    auto* out0 = context.Output<framework::Tensor>("IntermediateVal");
-    auto* out1 = context.Output<framework::Tensor>("Out");
-
-    out0->mutable_data<T>(context.GetPlace());
-    out1->mutable_data<T>(context.GetPlace());
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    auto x = EigenVector<T>::Flatten(*in0);
-    auto y = EigenVector<T>::Flatten(*in1);
-    // make sure value's of Y in {0, 1}
-    y.unaryExpr(CheckLabelValue<T>());
-    auto inter_val = EigenVector<T>::Flatten(*out0);
-    // scale y to {-1, +1} and compute x * y
-    inter_val.device(place) = x * (2 * y - static_cast<T>(1));
-    auto loss = EigenVector<T>::Flatten(*out1);
-    loss.device(place) = inter_val.unaryExpr(ModifiedHuberLossForward<T>());
-  }
-};
-
-// CPU backward kernel
-template <typename T>
-class ModifiedHuberLossGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("Y");
-    auto* in1 = context.Input<framework::Tensor>("IntermediateVal");
-    auto* in2 = context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* out0 = context.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    if (out0) {
-      const T* y_ptr = in0->data<T>();
-      const T* inter_val_ptr = in1->data<T>();
-      const T* out_grad_ptr = in2->data<T>();
-      size_t counts = static_cast<size_t>(framework::product(in1->dims()));
-      T* x_grad_ptr = out0->mutable_data<T>(context.GetPlace());
-      for (size_t i = 0; i < counts; ++i) {
-        if (inter_val_ptr[i] < -1) {
-          x_grad_ptr[i] = -4 * (2 * y_ptr[i] - 1) * out_grad_ptr[i];
-        } else if (inter_val_ptr[i] < 1) {
-          x_grad_ptr[i] = -2 * (1 - inter_val_ptr[i]) * (2 * y_ptr[i] - 1) *
-                          out_grad_ptr[i];
-        } else {
-          x_grad_ptr[i] = 0;
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
deleted file mode 100644
index 0823ea8f4d3a95be6637e71bff818dfe9490ed1b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mul_op.cc
+++ /dev/null
@@ -1,307 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/mul_op.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using framework::OpKernelType;
-using framework::Tensor;
-
-class MulOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of MulOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
-    int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");
-
-    VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims
-            << " x_num_col_dims=" << x_num_col_dims
-            << " y_num_col_dims=" << y_num_col_dims;
-
-    PADDLE_ENFORCE_GT(
-        x_dims.size(), x_num_col_dims,
-        "The input tensor X's rank of MulOp should be larger than "
-        "x_num_col_dims.");
-    PADDLE_ENFORCE_GT(
-        y_dims.size(), y_num_col_dims,
-        "The input tensor Y's rank of MulOp should be larger than "
-        "y_num_col_dims: %ld vs %ld",
-        y_dims.size(), y_num_col_dims);
-
-    auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
-    auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
-
-    PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0],
-                      "First matrix's width must be equal with second matrix's "
-                      "height. %s, %s",
-                      x_mat_dims[1], y_mat_dims[0]);
-    std::vector<int64_t> output_dims;
-    output_dims.reserve(
-        static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
-
-    for (int i = 0; i < x_num_col_dims; ++i) {
-      output_dims.push_back(x_dims[i]);
-    }
-
-    for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
-      output_dims.push_back(y_dims[i]);
-    }
-
-    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const {
-    framework::LibraryType library = framework::LibraryType::kPlain;
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-    int customized_type_value =
-        framework::OpKernelType::kDefaultCustomizedTypeValue;
-    auto input_data_type = ctx.Input<Tensor>("X")->type();
-#ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
-
-      if (input_data_type == framework::DataTypeTrait<int8_t>::DataType() ||
-          input_data_type == framework::DataTypeTrait<uint8_t>::DataType()) {
-        customized_type_value = kMULMKLDNNINT8;
-      }
-    }
-#endif
-
-    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                   library, customized_type_value);
-  }
-};
-
-class MulOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor), The first input tensor of mul op.");
-    AddInput("Y", "(Tensor), The second input tensor of mul op.");
-    AddOutput("Out", "(Tensor), The output tensor of mul op.");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddAttr<int>(
-        "x_num_col_dims",
-        R"DOC((int, default 1), The mul_op can take tensors with more than two
-              dimensions as its inputs. If the input $X$ is a tensor with more
-              than two dimensions, $X$ will be flattened into a two-dimensional
-              matrix first. The flattening rule is: the first `num_col_dims`
-              will be flattened to form the first dimension of the final matrix
-              (the height of the matrix), and the rest `rank(X) - num_col_dims`
-              dimensions are flattened to form the second dimension of the final
-              matrix (the width of the matrix). As a result, height of the
-              flattened matrix is equal to the product of $X$'s first
-              `x_num_col_dims` dimensions' sizes, and width of the flattened
-              matrix is equal to the product of $X$'s last `rank(x) - num_col_dims`
-              dimensions' size. For example, suppose $X$ is a 6-dimensional
-              tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3.
-              Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] =
-              [24, 30].
-        )DOC")
-        .SetDefault(1)
-        .EqualGreaterThan(1);
-    AddAttr<int>(
-        "y_num_col_dims",
-        R"DOC((int, default 1), The mul_op can take tensors with more than two,
-              dimensions as its inputs. If the input $Y$ is a tensor with more
-              than two dimensions, $Y$ will be flattened into a two-dimensional
-              matrix first. The attribute `y_num_col_dims` determines how $Y$ is
-              flattened. See comments of `x_num_col_dims` for more details.
-        )DOC")
-        .SetDefault(1)
-        .EqualGreaterThan(1);
-    AddAttr<float>(
-        "scale_x",
-        "scale_x to be used for int8 mul input data x. scale_x has the"
-        "same purpose as scale_in in OPs that support quantization."
-        "Only to be used with MKL-DNN INT8")
-        .SetDefault(1.0f);
-    AddAttr<std::vector<float>>(
-        "scale_y",
-        "scale_y to be used for int8 mul input data y. scale_y has the"
-        "same purpose as scale_weights in OPs that support quantization."
-        "Only to be used with MKL-DNN INT8")
-        .SetDefault({1.0f});
-    AddAttr<float>("scale_out",
-                   "scale_out to be used for int8 output data."
-                   "Only used with MKL-DNN INT8")
-        .SetDefault(1.0f);
-    AddAttr<bool>(
-        "force_fp32_output",
-        "(bool, default false) Force quantize kernel output FP32, only "
-        "used in quantized MKL-DNN.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Mul Operator.
-
-This operator is used to perform matrix multiplication for input $X$ and $Y$.
-
-The equation is:
-
-$$Out = X * Y$$
-
-Both the input $X$ and $Y$ can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input $X$.
-
-)DOC");
-  }
-};
-
-class MulOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
-  }
-};
-
-class MulGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, y_dims);
-    }
-  }
-};
-
-class MulOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
-    retv->SetType("mul_grad");
-    retv->SetInput("X", Input("X"));
-    retv->SetInput("Y", Input("Y"));
-    retv->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    retv->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    retv->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    retv->SetAttrMap(Attrs());
-    return retv;
-  }
-};
-
-class MulDoubleGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("DOut"), "Input(DOut) should not be null");
-
-    if (ctx->HasOutput("DDOut") && ctx->HasInput("DDX")) {
-      ctx->ShareDim("DOut", "DDOut");
-    }
-    if (ctx->HasOutput("DX") && ctx->HasInput("DDY")) {
-      ctx->ShareDim("X", "DX");
-    }
-    if (ctx->HasOutput("DY") && ctx->HasInput("DDX")) {
-      ctx->ShareDim("Y", "DY");
-    }
-  }
-};
-
-class MulDoubleGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
-    retv->SetType("mul_grad_grad");
-
-    retv->SetInput("X", Input("X"));
-    retv->SetInput("Y", Input("Y"));
-    retv->SetInput("DOut", Input(framework::GradVarName("Out")));
-    retv->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
-    retv->SetInput("DDY", OutputGrad(framework::GradVarName("Y")));
-
-    auto ddx = OutputGrad(framework::GradVarName("X"));
-    auto ddw = OutputGrad(framework::GradVarName("Y"));
-    std::vector<std::string> empty_str = {};
-
-    retv->SetOutput("DDOut", (ddx.empty())
-                                 ? empty_str
-                                 : InputGrad(framework::GradVarName("Out")));
-    retv->SetOutput("DX", ddw.empty() ? empty_str : InputGrad("X"));
-    retv->SetOutput("DY", ddx.empty() ? empty_str : InputGrad("Y"));
-
-    retv->SetAttrMap(Attrs());
-    return retv;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpInferVarType,
-                  ops::MulOpGradMaker);
-
-REGISTER_OPERATOR(mul_grad, ops::MulGradOp, ops::MulDoubleGradMaker);
-
-REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    mul_grad_grad,
-    ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc
deleted file mode 100644
index 6e841712b9bffc06ca56afddcb866af8b3f9b0d8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/mul_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
-                        ops::MulKernel<plat::CUDADeviceContext, double>,
-                        ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
-    ops::MulGradKernel<plat::CUDADeviceContext, double>,
-    ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad_grad,
-    ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h
deleted file mode 100644
index 3a13e0576e3472700908ebb6884e7306760a24c6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mul_op.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-constexpr int kMULMKLDNNINT8 = 1;
-
-template <typename DeviceContext, typename T>
-class MulKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    const Tensor* y = context.Input<Tensor>("Y");
-    Tensor* z = context.Output<Tensor>("Out");
-    const Tensor x_matrix =
-        x->dims().size() > 2
-            ? framework::ReshapeToMatrix(
-                  *x, context.template Attr<int>("x_num_col_dims"))
-            : *x;
-    const Tensor y_matrix =
-        y->dims().size() > 2
-            ? framework::ReshapeToMatrix(
-                  *y, context.template Attr<int>("y_num_col_dims"))
-            : *y;
-
-    z->mutable_data<T>(context.GetPlace());
-    auto z_dim = z->dims();
-    if (z_dim.size() != 2) {
-      z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
-    }
-
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-
-    blas.MatMul(x_matrix, y_matrix, z);
-    if (z_dim.size() != 2) {
-      z->Resize(z_dim);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MulGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
-    int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto x_matrix = x->dims().size() > 2
-                        ? framework::ReshapeToMatrix(*x, x_num_col_dims)
-                        : static_cast<const Tensor&>(*x);
-    auto y_matrix = y->dims().size() > 2
-                        ? framework::ReshapeToMatrix(*y, y_num_col_dims)
-                        : static_cast<const Tensor&>(*y);
-    auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-
-    Tensor dout_mat;
-    dout_mat.ShareDataWith(*dout);
-    dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
-                     framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
-
-    auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
-
-    if (dx != nullptr) {
-      dx->set_lod(x->lod());
-    }
-    if (dy != nullptr) {
-      dy->set_lod(y->lod());
-    }
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    if (dx) {
-      dx->mutable_data<T>(ctx.GetPlace());
-      Tensor dx_matrix = dx->dims().size() > 2
-                             ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
-                             : *dx;
-
-      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-      blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
-    }
-    if (dy) {
-      dy->mutable_data<T>(ctx.GetPlace());
-      Tensor dy_matrix = dy->dims().size() > 2
-                             ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
-                             : *dy;
-      // dy = x' * dout. dy K x N, dout : M x N, x : M x K
-      blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MulDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
-    int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto x_mat = x->dims().size() > 2
-                     ? framework::ReshapeToMatrix(*x, x_num_col_dims)
-                     : static_cast<const Tensor&>(*x);
-    auto y_mat = y->dims().size() > 2
-                     ? framework::ReshapeToMatrix(*y, y_num_col_dims)
-                     : static_cast<const Tensor&>(*y);
-
-    const int m = framework::flatten_to_2d(x->dims(), x_num_col_dims)[0];
-    const int n = framework::flatten_to_2d(y->dims(), y_num_col_dims)[1];
-
-    auto* dout = ctx.Input<framework::LoDTensor>("DOut");
-    Tensor dout_mat;
-    dout_mat.ShareDataWith(*dout);
-    dout_mat.Resize({m, n});
-
-    auto* ddx = ctx.Input<framework::LoDTensor>("DDX");
-    auto* ddy = ctx.Input<framework::LoDTensor>("DDY");
-
-    auto* dx = ctx.Output<framework::LoDTensor>("DX");
-    auto* dy = ctx.Output<framework::LoDTensor>("DY");
-    auto* ddout = ctx.Output<framework::LoDTensor>("DDOut");
-
-    Tensor ddout_mat;
-    if (ddout) {
-      ddout->set_lod(dout->lod());
-      // allocate and reshape ddout
-      ddout->mutable_data<T>(ctx.GetPlace());
-      ddout_mat.ShareDataWith(*ddout);
-      ddout_mat.Resize({m, n});
-    }
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    // a flag to specify whether ddout value has been set, if flag
-    // is false, MatMul beta should be 0 to set ddout, if flag is
-    // true, MatMul beta should be 1 to add result to ddout.
-    bool ddout_flag = false;
-    if (ddx) {
-      auto ddx_mat = ddx->dims().size() > 2
-                         ? framework::ReshapeToMatrix(*ddx, x_num_col_dims)
-                         : static_cast<const Tensor&>(*ddx);
-
-      // dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N
-      if (dy) {
-        dy->set_lod(y->lod());
-        // allocate and reshape dy
-        dy->mutable_data<T>(ctx.GetPlace());
-        Tensor dy_mat = dy->dims().size() > 2
-                            ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
-                            : *dy;
-        blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat);
-      }
-      // ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N
-      if (ddout) {
-        blas.MatMul(ddx_mat, false, y_mat, false, static_cast<T>(1.0),
-                    &ddout_mat, static_cast<T>(ddout_flag));
-        ddout_flag = true;
-      }
-    }
-    if (ddy) {
-      auto ddy_mat = ddy->dims().size() > 2
-                         ? framework::ReshapeToMatrix(*ddy, y_num_col_dims)
-                         : static_cast<const Tensor&>(*ddy);
-      // dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K
-      if (dx) {
-        dx->set_lod(x->lod());
-        // allocate and reshape dx
-        dx->mutable_data<T>(ctx.GetPlace());
-        Tensor dx_mat = dx->dims().size() > 2
-                            ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
-                            : *dx;
-        blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat);
-      }
-      // ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N
-      if (ddout) {
-        blas.MatMul(x_mat, false, ddy_mat, false, static_cast<T>(1.0),
-                    &ddout_mat, static_cast<T>(ddout_flag));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
deleted file mode 100644
index 7cb213e89958e017c62d7cded261570307d3e64b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/multiplex_op.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/multiplex_op.h"
-#include <memory>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class MultiplexOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) shouldn't be null.");
-    PADDLE_ENFORCE(!ctx->Inputs("X").empty(),
-                   "MultiInput(X) shouldn't be empty.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null.");
-    auto ids_dim = ctx->GetInputDim("Ids");
-    PADDLE_ENFORCE(
-        ids_dim.size() == 2 && ids_dim[1] == 1,
-        "The index tensor must be a vector with size batchSize x 1.");
-
-    auto ins_dims = ctx->GetInputsDim("X");
-    auto num_ins = ins_dims.size();
-    PADDLE_ENFORCE(num_ins > 1,
-                   "multiplex operator should have more than "
-                   "one candidate input tensors.");
-
-    auto in_dim = ins_dims[0];
-    PADDLE_ENFORCE(in_dim.size() >= 2,
-                   "The rank of candidate tensors must be not less than 2.");
-    for (size_t i = 1; i < num_ins; i++) {
-      auto dim = ins_dims[i];
-      PADDLE_ENFORCE(in_dim == dim,
-                     "All the candidate tensors must have the same size.");
-    }
-    ctx->SetOutputDim("Out", in_dim);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.MultiInput<Tensor>("X")[0]->type(),
-                                   ctx.device_context());
-  }
-};
-
-class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ids",
-             "Tensor<int32>, index variable which is a 2-D tensor with shape "
-             "[M, 1] where M is the batch size.");
-    AddInput("X",
-             "A list of variables to gather from. All variables have the same "
-             "shape and the rank is at least 2.")
-        .AsDuplicable();
-    AddOutput("Out", "The output tensor of multiplex operator.");
-    AddComment(R"DOC(
-Referring to the given index variable, this layer selects rows from the
-input variables to construct a multiplex variable. Assuming that there are
-:math:`m` input variables and :math:`I_i` represents the i-th input
-variable and :math:`i` is in [0, :math:`m`). All input variables are
-tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
-Please note that rank of the input tensor should be at least 2. Each input
-variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
-where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
-* ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
-variable. The given index variable should be a 2-D tensor with shape
-[:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
-Then the output variable will be a tensor with shape [:math:`d_0`,
-:math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
-matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
-row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
-
-* Ids: the index tensor.
-
-* X[0 : N - 1]: the candidate tensors for output (N >= 2).
-
-* For each index i from 0 to batchSize - 1, the output is the i-th row of the
-the (Ids[i])-th tensor.
-
-For i-th row of the output tensor:
-
-$$
-y[i] = x_{k}[i]
-$$
-
-where $y$ is the output tensor, $x_{k}$ is the k-th input tensor,
-and $k = Ids[i]$.
-
-)DOC");
-  }
-};
-
-class MultiplexGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto& dxs = ctx->Outputs(framework::GradVarName("X"));
-    PADDLE_ENFORCE(!dxs.empty(), "Output(X@Grad) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-    auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out"));
-    ctx->SetOutputsDim(framework::GradVarName("X"),
-                       std::vector<framework::DDim>(dxs.size(), dout_dim));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class MultiplexGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("multiplex_grad");
-    op->SetInput("Ids", Input("Ids"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
-                  ops::MultiplexGradDescMaker);
-REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
-REGISTER_OP_CPU_KERNEL(
-    multiplex,
-    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    multiplex_grad,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu
deleted file mode 100644
index 1ef54ecc732f3d2098ed51d955f8feed4cb1a821..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/multiplex_op.cu
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/multiplex_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename Place, typename T>
-class MultiplexGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto ins = ctx.MultiInput<Tensor>("X");
-    auto* ids = ctx.Input<Tensor>("Ids");
-    auto* out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    auto rows = ins[0]->dims()[0];
-    auto cols = ins[0]->numel() / rows;
-    // copy index to cpu
-    Tensor index_t_cpu;
-    TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
-    auto* index = index_t_cpu.data<int32_t>();
-    auto stream = ctx.cuda_device_context().stream();
-    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    for (auto i = 0; i < rows; i++) {
-      int32_t k = index[i];
-      PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
-      PADDLE_ENFORCE_LT((size_t)k, ins.size(),
-                        "index exceeds the number of candidate tensors.");
-      memory::Copy(place, out->data<T>() + i * cols, place,
-                   ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
-    }
-  }
-};
-
-template <typename Place, typename T>
-class MultiplexGradGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* ids = ctx.Input<Tensor>("Ids");
-    auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
-
-    size_t idx = -1UL;
-    for (size_t i = 0; i < d_ins.size(); i++) {
-      if (d_ins[i]) {
-        d_ins[i]->mutable_data<T>(ctx.GetPlace());
-        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
-        t.device(*ctx.template device_context<Place>().eigen_device()) =
-            t.constant(static_cast<T>(0));
-
-        idx = i;
-      }
-    }
-
-    if (idx == -1UL) return;
-
-    auto rows = d_ins[idx]->dims()[0];
-    auto cols = d_ins[idx]->numel() / rows;
-    // copy index to cpu
-    Tensor index_t_cpu;
-    TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
-    auto* index = index_t_cpu.data<int32_t>();
-
-    auto stream = ctx.cuda_device_context().stream();
-    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    for (auto i = 0; i < rows; i++) {
-      size_t k = static_cast<size_t>(index[i]);
-      if (d_ins[k]) {
-        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
-                     d_out->data<T>() + i * cols, cols * sizeof(T), stream);
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    multiplex,
-    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    multiplex_grad,
-    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h
deleted file mode 100644
index 44d6cc84a6493a326257d96f19b43c83c62f7b31..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/multiplex_op.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class MultiplexCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto ids = ctx.Input<framework::Tensor>("Ids");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    out->mutable_data<T>(ctx.GetPlace());
-
-    auto rows = ins[0]->dims()[0];
-    auto cols = ins[0]->numel() / rows;
-    auto index = ids->data<int32_t>();
-    platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
-    for (auto i = 0; i < rows; i++) {
-      int32_t k = index[i];
-      PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
-      PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
-                        "index exceeds the number of candidate tensors.");
-      memory::Copy(place, out->data<T>() + i * cols, place,
-                   ins[k]->data<T>() + i * cols, cols * sizeof(T));
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MultiplexGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* ids = ctx.Input<framework::Tensor>("Ids");
-    auto d_ins =
-        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
-
-    size_t idx = -1UL;
-    for (size_t i = 0; i < d_ins.size(); i++) {
-      if (d_ins[i]) {
-        d_ins[i]->mutable_data<T>(ctx.GetPlace());
-        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
-        t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
-            t.constant(static_cast<T>(0));
-
-        idx = i;
-      }
-    }
-
-    if (idx == -1UL) return;
-
-    auto rows = d_ins[idx]->dims()[0];
-    auto cols = d_ins[idx]->numel() / rows;
-    auto* index = ids->data<int32_t>();
-    platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
-    for (auto i = 0; i < rows; i++) {
-      size_t k = static_cast<size_t>(index[i]);
-      if (d_ins[k]) {
-        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
-                     d_out->data<T>() + i * cols, cols * sizeof(T));
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt
deleted file mode 100644
index 9b26e19cc7ed05038e05308f9277b200a885dc10..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-if(WITH_GPU AND NOT WIN32)
-  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
-endif()
-
-if(WITH_GPU)
-    op_library(nccl_op DEPS nccl_common)
-    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
-    set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
-endif()
-
-if(NOT WIN32)
-    nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
-endif()
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
deleted file mode 100644
index 08b61765c2f0fb90056c97618c0ce345155a274c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace platform {
-namespace {
-// TODO(panyx0718): Where to destroy them.
-std::unique_ptr<std::vector<ncclComm_t>> global_comms;
-std::unique_ptr<std::unordered_map<int, int>> comm_id_map;
-bool inited = false;
-size_t last_num_gpus = -1;
-// TODO(panyx0718): Need to decide whether Paddle supports parallel
-// runs with different number GPUs. If true, current solution is not enough.
-std::mutex comm_mu;
-}
-
-int Communicator::GetCommId(int device_id) const {
-  std::lock_guard<std::mutex> guard(comm_mu);
-  return comm_id_map->at(device_id);
-}
-
-void Communicator::InitAll(const std::vector<int>& gpus) {
-  std::lock_guard<std::mutex> guard(comm_mu);
-  if (inited && last_num_gpus == gpus.size()) {
-    return;
-  }
-  last_num_gpus = gpus.size();
-  if (global_comms) {
-    for (size_t i = 0; i < global_comms->size(); ++i) {
-      // FIXME(dzh) : PADDLE_ENFORCE return void
-      dynload::ncclCommDestroy((*global_comms)[i]);
-    }
-  }
-  global_comms.reset(new std::vector<ncclComm_t>());
-  comm_id_map.reset(new std::unordered_map<int, int>());
-  global_comms->resize(gpus.size());
-  for (size_t i = 0; i < gpus.size(); ++i) {
-    (*comm_id_map)[gpus[i]] = i;
-  }
-  PADDLE_ENFORCE(
-      dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data()));
-  inited = true;
-}
-
-const std::vector<ncclComm_t>& Communicator::comms() const {
-  std::lock_guard<std::mutex> guard(comm_mu);
-  return *global_comms;
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h
deleted file mode 100644
index 558ff4cc09603eebbcd95a234ff1aa63ada7fbb2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <condition_variable>  // NOLINT
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/dynload/nccl.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/macros.h"
-
-namespace paddle {
-namespace platform {
-constexpr int kInvalidGPUId = -1;
-
-struct Communicator {
-  Communicator() {}
-
-  int GetCommId(int device_id) const;
-
-  void InitAll(const std::vector<int>& gpus);
-
-  const std::vector<ncclComm_t>& comms() const;
-};
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc
deleted file mode 100644
index 6a0ae0dede695d80508bcc92a7a13ae9f73c3c57..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/nccl/nccl_op.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-
-namespace paddle {
-namespace operators {
-
-static constexpr char kParallelScopes[] = "parallel_scopes";
-
-// NCCLinitOp
-class NCCLInitOp : public framework::OperatorBase {
- public:
-  NCCLInitOp(const std::string &type, const framework::VariableNameMap &inputs,
-             const framework::VariableNameMap &outputs,
-             const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kParallelScopes)),
-                            "Can not find variable '%s' in the scope.",
-                            kParallelScopes);
-    const auto &name = Output("Communicator");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
-                            "Can not find variable '%s' in the scope.", name);
-    // A parallel do may not use all the gpus. For example, the batch size is 7
-    // in the last batch while we have 8 gpu. In this case, parallel_do will
-    // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers
-    auto &parallel_scopes = scope.FindVar(Input(kParallelScopes))
-                                ->Get<std::vector<framework::Scope *>>();
-    std::vector<int> gpus(parallel_scopes.size());
-    for (int i = 0; i < static_cast<int>(parallel_scopes.size()); ++i) {
-      gpus[i] = i;
-    }
-    PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus.");
-
-    if (scope.FindVar(name) == nullptr) {
-      PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
-    }
-
-    platform::Communicator *comm =
-        scope.FindVar(name)->GetMutable<platform::Communicator>();
-    comm->InitAll(gpus);
-  }
-};
-
-class NCCLInitOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto out_var_name = ctx->Output("Communicator").front();
-    ctx->SetType(out_var_name, framework::proto::VarType::RAW);
-  }
-};
-
-class NCCLInitOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {}
-};
-
-class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kParallelScopes, "The working place of parallel do.");
-    AddOutput("Communicator",
-              "Create Communicator for communicating between gpus");
-    AddComment(R"DOC(
-NCCLInit Operator.
-
-Create communicator.
-
-)DOC");
-  }
-};
-
-// AllReduceOp
-class NCCLAllReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of AllReduce op output should not be NULL");
-    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-                    reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
-
-    auto x_dims = ctx->GetInputsDim("X");
-    ctx->SetOutputsDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-// AllReduceOp
-class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of AllReduce op");
-    AddInput("Communicator", "Communicator for communicating between gpus");
-    AddOutput("Out", "The output of AllReduce op");
-    AddAttr<std::string>("reduction",
-                         "(string, default 'ncclSum') "
-                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
-        .SetDefault("ncclSum");
-    AddComment(R"DOC(
-NCCLAllReduce Operator.
-
-AllReduce the input tensors.
-
-)DOC");
-  }
-};
-
-// ReduceOp
-class NCCLReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Input(X) of Reduce op input should not be NULL");
-
-    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-                    reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
-
-    auto x_dims = ctx->GetInputsDim("X");
-    ctx->SetOutputsDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-// ReduceOp
-class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of Reduce op");
-    AddInput("Communicator", "Communicator for communicating between gpus");
-    AddOutput("Out", "The output of Reduce op");
-    AddAttr<std::string>("reduction",
-                         "(string, default 'ncclSum') "
-                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
-        .SetDefault("ncclSum");
-    AddAttr<int>("root",
-                 "(int, default kInvalidGPUId) "
-                 "Root gpu of the parameter. If not, "
-                 "set(platform::kInvalidGPUId). Hashed by name.")
-        .SetDefault(platform::kInvalidGPUId);
-    AddComment(R"DOC(
-NCCLReduce Operator.
-
-Reduce the tensors.
-
-)DOC");
-  }
-};
-
-// BcastOp
-class NCCLBcastOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
-                   " Input(Communicator) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of Bcast op output should not be NULL");
-
-    int root = ctx->Attrs().Get<int>("root");
-    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
-
-    auto x_dims = ctx->GetInputsDim("X");
-    ctx->SetOutputsDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-// BcastOp
-class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of BcastSend op");
-    AddInput("Communicator", "Communicator for communicating between gpus");
-    AddOutput("Out", "The output of Bcast");
-    AddAttr<int>("root",
-                 "(int, default kInvalidGPUId) "
-                 "Root gpu of the parameter. If not, "
-                 "set(platform::kInvalidGPUId). Hashed by name.")
-        .SetDefault(platform::kInvalidGPUId);
-    AddComment(R"DOC(
-NCCLBcast Operator.
-
-Bcast the tensors.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp,
-                  paddle::framework::EmptyGradOpMaker, ops::NCCLInitOpMaker,
-                  ops::NCCLInitOpVarTypeInference,
-                  ops::NCCLInitOpShapeInference);
-
-REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp,
-                             ops::NCCLAllReduceOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp,
-                             ops::NCCLBcastOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp,
-                             ops::NCCLReduceOpMaker);
diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc
deleted file mode 100644
index 8de974bc2b333fb6ccc5b5f0bb1af86533139925..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/nccl/nccl_op.cu.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenseshashernless required by applicable law or agreed
-to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <functional>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using platform::Communicator;
-using framework::LoDTensor;
-
-template <typename Type>
-class NCCLTypeWrapper;
-
-template <>
-class NCCLTypeWrapper<float> {
- public:
-  static const ncclDataType_t type = ncclFloat;
-};
-
-template <>
-class NCCLTypeWrapper<double> {
- public:
-  static const ncclDataType_t type = ncclDouble;
-};
-
-template <typename T>
-class NCCLAllReduceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    auto* comm = ctx.Input<Communicator>("Communicator");
-    std::string reduction = ctx.Attr<std::string>("reduction");
-
-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
-    // device id
-    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
-    int idx = comm->GetCommId(gpu_id);
-    VLOG(3) << "gpu : "
-            << " invoke allreduce. send " << x->numel() << " recv "
-            << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-        x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
-        NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
-        ctx.cuda_device_context().stream()));
-    VLOG(3) << "gpu : "
-            << " finished allreduce. send " << x->numel() << " recv "
-            << out->numel();
-  }
-};
-
-template <typename T>
-class NCCLReduceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto x = ctx.Input<LoDTensor>("X");  // x0, x1, x2
-    auto out = ctx.Output<LoDTensor>("Out");
-    auto* comm = ctx.Input<Communicator>("Communicator");
-    int root = ctx.Attr<int>("root");
-    std::string reduction = ctx.Attr<std::string>("reduction");
-
-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
-    // device id
-    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
-    int idx = comm->GetCommId(gpu_id);
-    T* recvbuffer = nullptr;
-    if (root == gpu_id) {
-      recvbuffer = out->mutable_data<T>(ctx.GetPlace());
-    } else {
-      out->Resize(framework::make_ddim({0}));
-    }
-    VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
-            << " recv " << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclReduce(
-        x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
-        reduction_op_, root, comm->comms().at(idx),
-        ctx.cuda_device_context().stream()));
-    VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
-            << " recv " << out->numel();
-  }
-};
-
-template <typename T>
-class NCCLBcastKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    int root = ctx.Attr<int>("root");
-    auto* comm = ctx.Input<Communicator>("Communicator");
-    // device id
-    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
-    int idx = comm->GetCommId(gpu_id);
-    if (idx == root) {
-      auto* x = ctx.Input<LoDTensor>("X");
-      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
-          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
-          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
-          ctx.cuda_device_context().stream()));
-      VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
-    } else {
-      auto* out = ctx.Output<LoDTensor>("Out");
-      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
-              << framework::product(out->dims());
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
-          out->mutable_data<T>(ctx.GetPlace()), out->numel(),
-          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
-          ctx.cuda_device_context().stream()));
-      VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel();
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
-REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel<float>);
-REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
deleted file mode 100644
index d5fb7a12e5d9757f3e639f6de7f0129bd531e2a1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
+++ /dev/null
@@ -1,287 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include <memory>
-#include <mutex>   // NOLINT
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-
-USE_NO_KERNEL_OP(ncclInit);
-USE_CUDA_ONLY_OP(ncclAllReduce);
-USE_CUDA_ONLY_OP(ncclReduce);
-USE_CUDA_ONLY_OP(ncclBcast);
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-// test data amount
-const f::DDim kDims = {20, 20};
-
-// nccl op common tester, init communicator.
-class NCCLTester : public ::testing::Test {
- public:
-  void SetUp() override {
-    int count = p::GetCUDADeviceCount();
-    if (count <= 1) {
-      LOG(WARNING)
-          << "Cannot test multi-gpu nccl, because the CUDA device count is "
-          << count;
-      exit(0);
-    }
-    for (int i = 0; i < count; ++i) {
-      gpu_list_.emplace_back(i);
-    }
-
-    paddle::platform::CPUPlace cpu_place;
-    for (size_t i = 0; i < gpu_list_.size(); ++i) {
-      p::CUDAPlace place(i);
-      dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
-    }
-
-    NCCLInitOp();
-  }
-
-  void TearDown() override {
-    for (auto &device_context : dev_ctxs_) {
-      delete device_context;
-    }
-  }
-
-  void NCCLInitOp() {
-    paddle::platform::CPUPlace cpu_place;
-    std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
-
-    op1->SetType("ncclInit");
-    op1->SetInput("parallel_scopes", {"p_scopes"});
-    op1->SetOutput("Communicator", {"comm"});
-
-    auto *var = g_scope_.Var("comm");
-    var->GetMutable<p::Communicator>();
-
-    auto *scope_var = g_scope_.Var("p_scopes");
-    auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
-    (*p_scopes).resize(gpu_list_.size());
-
-    auto op = f::OpRegistry::CreateOp(*op1);
-    VLOG(1) << "invoke NCCLInitOp.";
-    op->Run(g_scope_, cpu_place);
-    VLOG(1) << "NCCLInitOp finished.";
-  }
-
-  int GetGPUData(int gpu_id) { return gpu_id + 42; }
-
-  template <class T>
-  void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
-    std::unique_lock<std::mutex> lk(mu_);
-    const f::OpDesc *op1 = &op_desc;
-
-    p::CUDAPlace place(gpu_id);
-    auto &ctx = dev_ctxs_.at(gpu_id);
-
-    auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
-    auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();
-
-    if (!send_tensor->numel()) {
-      send_tensor->mutable_data<T>(kDims, place);
-
-      std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
-      paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
-      VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
-    }
-
-    lk.unlock();
-
-    PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
-                   "Tensor numel not match!");
-
-    auto op = f::OpRegistry::CreateOp(*op1);
-
-    VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
-    VLOG(1) << " send_tensor : " << send_tensor->numel()
-            << " recv_tensor : " << recv_tensor->numel();
-    op->Run(*scope, place);
-    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
-  }
-
- public:
-  std::vector<p::DeviceContext *> dev_ctxs_;
-  f::Scope g_scope_;
-  std::mutex mu_;
-  std::vector<int> gpu_list_;
-};
-
-// ncclInitOp with desc
-TEST_F(NCCLTester, ncclInitOp) {}
-
-// ncclAllReduceOp with desc
-// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
-/*
-TEST_F(NCCLTester, ncclAllReduceOp) {
-  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
-  op2->SetType("ncclAllReduce");
-  op2->SetInput("X", {"st"});
-  op2->SetInput("Communicator", {"comm"});
-  op2->SetOutput("Out", {"rt"});
-
-  std::vector<f::Scope *> dev_scopes;
-
-  std::vector<std::thread> ths;
-
-  for (size_t i = 0; i < gpu_list_.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope_.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
-                   *op2.get(), dev_scopes[i]);
-    ths.emplace_back(std::move(th));
-  }
-
-  for (size_t i = 0; i < gpu_list_.size(); ++i) {
-    ths[i].join();
-  }
-
-  float expected_result = 0.0;
-  for (int gpu_id : gpu_list_) {
-    expected_result = expected_result + GetGPUData(gpu_id);
-  }
-
-  for (size_t i = 0; i < dev_scopes.size(); ++i) {
-    p::CPUPlace cpu_place;
-    p::CUDAPlace gpu_place(gpu_list_[i]);
-
-    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
-    auto *rt = recv_tensor.data<float>();
-    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
-    result_tensor->Resize(kDims);
-    auto *ct = result_tensor->mutable_data<float>(cpu_place);
-
-    paddle::memory::Copy(
-        cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
-        recv_tensor.numel() * sizeof(float),
-        static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());
-
-    for (int64_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], expected_result, 1e-5);
-    }
-  }
-}
-*/
-
-// ncclReduceOp with desc
-TEST_F(NCCLTester, ncclReduceOp) {
-  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
-  const int kRoot = 0;
-  op2->SetType("ncclReduce");
-  op2->SetInput("X", {"st"});
-  op2->SetInput("Communicator", {"comm"});
-  op2->SetOutput("Out", {"rt"});
-  op2->SetAttr("root", kRoot);
-
-  std::vector<f::Scope *> dev_scopes;
-
-  std::vector<std::thread> ths;
-
-  for (size_t i = 0; i < gpu_list_.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope_.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
-                   *op2.get(), dev_scopes[i]);
-    ths.emplace_back(std::move(th));
-  }
-
-  for (size_t i = 0; i < gpu_list_.size(); ++i) {
-    ths[i].join();
-  }
-
-  float expected_result = 0.0;
-  for (int gpu_id : gpu_list_) {
-    expected_result = expected_result + GetGPUData(gpu_id);
-  }
-
-  p::CPUPlace cpu_place;
-  p::CUDAPlace gpu_place(gpu_list_[kRoot]);
-
-  auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
-  auto *rt = recv_tensor.data<float>();
-  auto *result_tensor =
-      dev_scopes[kRoot]->Var("ct")->GetMutable<f::LoDTensor>();
-  result_tensor->Resize(kDims);
-  auto *ct = result_tensor->mutable_data<float>(cpu_place);
-
-  paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
-                       recv_tensor.numel() * sizeof(float), nullptr);
-
-  for (int64_t j = 0; j < f::product(kDims); ++j) {
-    ASSERT_NEAR(ct[j], expected_result, 1e-5);
-  }
-}
-
-// ncclBcastOp with desc
-// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
-/*
-TEST_F(NCCLTester, ncclBcastOp) {
-  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
-  const int kRoot = 0;
-  op2->SetType("ncclBcast");
-  op2->SetInput("X", {"st"});
-  op2->SetInput("Communicator", {"comm"});
-  op2->SetOutput("Out", {"rt"});
-  op2->SetAttr("root", kRoot);
-
-  std::vector<f::Scope *> dev_scopes;
-
-  std::vector<std::thread> ths;
-
-  for (size_t i = 0; i < gpu_list_.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope_.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
-                   *op2.get(), dev_scopes[i]);
-    ths.emplace_back(std::move(th));
-  }
-
-  for (size_t i = 0; i < gpu_list_.size(); ++i) {
-    ths[i].join();
-  }
-
-  const int idx = 1;
-  float result = GetGPUData(kRoot);
-
-  p::CPUPlace cpu_place;
-  p::CUDAPlace gpu_place(gpu_list_[idx]);
-
-  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
-  auto *rt = recv_tensor.data<float>();
-  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
-  result_tensor->Resize(kDims);
-  auto *ct = result_tensor->mutable_data<float>(cpu_place);
-
-  paddle::memory::Copy(
-      cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
-      recv_tensor.numel() * sizeof(float),
-      static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
-
-  for (int64_t j = 0; j < f::product(kDims); ++j) {
-    ASSERT_NEAR(ct[j], result, 1e-5);
-  }
-}
-*/
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
deleted file mode 100644
index 0ccc5d30b3141b029b157fd8a046c4dbeab22c23..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/nce_op.cc
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/nce_op.h"
-
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class NCEOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"));
-    PADDLE_ENFORCE(ctx->HasInput("Label"));
-    PADDLE_ENFORCE(ctx->HasInput("Weight"));
-    PADDLE_ENFORCE(ctx->HasOutput("Cost"));
-    PADDLE_ENFORCE(ctx->HasOutput("SampleLogits"));
-    PADDLE_ENFORCE(ctx->HasOutput("SampleLabels"));
-
-    auto x_dims = ctx->GetInputDim("Input");
-    auto label_dims = ctx->GetInputDim("Label");
-    if (ctx->IsRuntime() || (x_dims[0] > 0 && label_dims[0] > 0)) {
-      PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
-    }
-    int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
-    if (ctx->HasInput("Bias")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0],
-                        ctx->GetInputDim("Bias")[0]);
-    }
-    auto num_neg_samples = ctx->Attrs().Get<int>("num_neg_samples");
-    auto num_total_classes = ctx->Attrs().Get<int>("num_total_classes");
-    std::vector<int> custom_neg_classes =
-        ctx->Attrs().Get<std::vector<int>>("custom_neg_classes");
-    PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]);
-    if (custom_neg_classes.size() > 0) {
-      PADDLE_ENFORCE_EQ(custom_neg_classes.size(),
-                        static_cast<size_t>(num_neg_samples));
-    }
-    // set dims of output(Out)
-    std::vector<int64_t> out_dims;
-    out_dims.push_back(x_dims[0]);
-    out_dims.push_back(1);
-    ctx->SetOutputDim("Cost", framework::make_ddim(out_dims));
-
-    // set dims of output(SampleOut)
-    std::vector<int64_t> sample_out_dims;
-    sample_out_dims.push_back(x_dims[0]);
-    sample_out_dims.push_back(
-        (num_true_classes == -1) ? -1 : (num_neg_samples + num_true_classes));
-    ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims));
-    ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
-    AddInput(
-        "Label",
-        "(Tensor) A tensor of shape [batch_size, num_true_class]. "
-        "'num_true_class' is the number of target classes in each sample."
-        "The number of target classes per sample should be same. "
-        "If you have a variable number of target classes, "
-        "you can pad them out to a constant number by either repeating them"
-        " or by padding with an otherwise unused class.)");
-    AddInput("Weight",
-             "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the "
-             "total number of class.");
-    AddInput(
-        "Bias",
-        "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total "
-        "number of class. It is a dispensable input.")
-        .AsDispensable();
-    AddInput("SampleWeight",
-             "(Tensor) A tensor of shape [batch_size, 1] storing a weight for "
-             "each sample. And it is a dispensable input. The default value of "
-             "sample is 1.")
-        .AsDispensable();
-
-    AddInput(
-        "CustomDistProbs",
-        "(Tensor) It is used in 'CostumDist' sampler. "
-        "It is a tensor with shape [num_total_classes]."
-        "The i-th element is the probsbility of the i-th class being sampled.")
-        .AsDispensable();
-    AddInput(
-        "CustomDistAlias",
-        "(Tensor) It is used in 'CostumDist' sampler. "
-        "It is a tensor with shape [num_total_classes]."
-        "The i-th element is the probsbility of the i-th class being sampled.")
-        .AsDispensable();
-    AddInput(
-        "CustomDistAliasProbs",
-        "(Tensor) It is used in 'CostumDist' sampler. "
-        "It is a tensor with shape [num_total_classes]."
-        "The i-th element is the probsbility of the i-th class being sampled.")
-        .AsDispensable();
-
-    AddOutput("Cost",
-              "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples.");
-    AddOutput("SampleLogits",
-              "An intermediate tensor of shape[batch_size, num_neg_samples + "
-              "num_pos_samples]."
-              "This tensor is output of forward kernel and used in backward "
-              "kernel to compute grads."
-              "Given X is  the dot product of input tensor and sampled labels' "
-              "weights."
-              "Then 'SampleLogits' is sigmoid(X).")
-        .AsIntermediate();
-    AddOutput("SampleLabels",
-              "An intermediate tensor of shape[batch_size, num_neg_samples + "
-              "num_pos_samples]."
-              "This tensor is output of forward kernel and used in backward "
-              "kernel to compute grads."
-              "")
-        .AsIntermediate();
-
-    AddAttr<int>("num_total_classes",
-                 "Total number of classes in all samples.");
-    AddAttr<int>("num_neg_samples",
-                 "The number of negative classes. The default value is 10.")
-        .SetDefault(10);
-    AddAttr<int>("sampler",
-                 "(int) Which sampler to be used to sample negative class."
-                 "0: Uniform; 1: LogUniform; 2: CostumDist.")
-        .SetDefault(0);
-    AddAttr<int>("seed",
-                 "(int) The seed used in sampler. If it is 0, "
-                 "the sampler will generate a seed randomly.")
-        .SetDefault(0);
-    AddAttr<bool>("is_sparse", "(boolean, default false) Sparse update.")
-        .SetDefault(false);
-
-    // for parameter prefetch
-    AddAttr<bool>("remote_prefetch", "").SetDefault(false);
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<int64_t>>("height_sections",
-                                  "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int64_t>({}));
-    AddAttr<std::vector<std::string>>(
-        "epmap",
-        "(string vector, default 127.0.0.1:6164)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "table_names",
-        "(string vector, the splited table names that will be fetched from "
-        "parameter server)"
-        "in the order of input variables for mapping")
-        .SetDefault({});
-
-    AddAttr<std::vector<int>>("custom_neg_classes",
-                              "This attribute only be used in unitest. Classes "
-                              "in this list wiil be used as negative classes "
-                              "for every samples. Under normal conditions, "
-                              "user should avoid setting this attribute.")
-        .SetDefault({});
-    AddComment(R"DOC(
-Compute and return the noise-contrastive estimation training loss. See
-`Noise-contrastive estimation: A new estimation principle for unnormalized
-statistical models
- <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_.
-By default this operator uses a uniform distribution for sampling.
-)DOC");
-  }
-};
-
-class NCEOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"));
-    PADDLE_ENFORCE(ctx->HasInput("Weight"));
-    PADDLE_ENFORCE(ctx->HasInput("Cost"));
-    PADDLE_ENFORCE(ctx->HasInput("SampleLogits"));
-    PADDLE_ENFORCE(ctx->HasInput("SampleLabels"));
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")),
-                   "The input(Out@GRAD) should not be null.");
-
-    auto x_dims = ctx->GetInputDim("Input");
-    auto x_grad_name = framework::GradVarName("Input");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-
-    auto w_dims = ctx->GetInputDim("Weight");
-    auto w_grad_name = framework::GradVarName("Weight");
-    if (ctx->HasOutput(w_grad_name)) {
-      ctx->SetOutputDim(w_grad_name, w_dims);
-    }
-
-    auto bias_grad_name = framework::GradVarName("Bias");
-    if (ctx->HasOutput(bias_grad_name)) {
-      auto bias_dims = ctx->GetInputDim("Bias");
-      ctx->SetOutputDim(bias_grad_name, bias_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class NCEOpGradVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto weight_grad = ctx->Output(framework::GradVarName("Weight")).front();
-
-    auto attr = ctx->GetAttr("is_sparse");
-    bool is_sparse = boost::get<bool>(attr);
-    if (is_sparse) {
-      VLOG(3) << "nce_op_grad op " << weight_grad << " and "
-              << " is set to SelectedRows";
-      ctx->SetType(weight_grad, framework::proto::VarType::SELECTED_ROWS);
-    } else {
-      VLOG(3) << "nce_op_grad op " << weight_grad << " and "
-              << " is set to LoDTensor";
-      ctx->SetType(weight_grad, framework::proto::VarType::LOD_TENSOR);
-    }
-    ctx->SetDataType(weight_grad, ctx->GetDataType(ctx->Input("Input")[0]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(nce, ops::NCEOp,
-                  paddle::framework::DefaultGradOpDescMaker<true>,
-                  ops::NCEOpMaker);
-REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad, ops::NCEOpGradVarTypeInference);
-REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
-                       ops::NCEKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(nce_grad,
-                       ops::NCEGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::NCEGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
deleted file mode 100644
index 1f2f778bcd75d083e33ae43ed66f5ba345356003..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/nce_op.h
+++ /dev/null
@@ -1,407 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <math.h>
-#include <iterator>
-#include <random>
-#include <set>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/math/sampler.h"
-#include "unsupported/Eigen/CXX11/Tensor"
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using Sampler = math::Sampler;
-using DDim = framework::DDim;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-void PrepareSamples(const framework::ExecutionContext &context,
-                    Sampler *sampler) {
-  auto label = context.Input<Tensor>("Label");
-  const int64_t *label_data = label->data<int64_t>();
-  auto label_dims = label->dims();
-  // for unitest
-  std::vector<int> custom_neg_classes =
-      context.Attr<std::vector<int>>("custom_neg_classes");
-
-  auto sample_labels = context.Output<Tensor>("SampleLabels");
-  auto sample_labels_dims = sample_labels->dims();
-  int64_t *sample_labels_data =
-      sample_labels->mutable_data<int64_t>(context.GetPlace());
-
-  int num_label = label_dims.size() == 2 ? label_dims[1] : 1;
-  int index = 0;
-  for (int64_t i = 0; i < label_dims[0]; ++i) {
-    int j = 0;
-    for (; j < num_label; ++j) {
-      sample_labels_data[index++] = label_data[i * num_label + j];
-    }
-    if (custom_neg_classes.size() > 0) {
-      for (auto label : custom_neg_classes) {
-        sample_labels_data[index++] = label;
-      }
-    } else {
-      for (; j < sample_labels_dims[1]; ++j) {
-        // TODO(wanghaoshuang): support more distribution sampling
-        sample_labels_data[index++] = sampler->Sample();
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class NCEKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    int sampler_type = context.Attr<int>("sampler");
-    int seed = context.Attr<int>("seed");
-    int num_total_classes = context.Attr<int>("num_total_classes");
-    int num_neg_samples = context.Attr<int>("num_neg_samples");
-
-    Sampler *sampler;
-    switch (sampler_type) {
-      case 0: {
-        sampler = new math::UniformSampler(num_total_classes - 1, seed);
-        break;
-      }
-      case 1: {
-        sampler = new math::LogUniformSampler(num_total_classes - 1, seed);
-        break;
-      }
-      case 2: {
-        auto dist_probs = context.Input<Tensor>("CustomDistProbs");
-        auto dist_alias = context.Input<Tensor>("CustomDistAlias");
-        auto dist_alias_probs = context.Input<Tensor>("CustomDistAliasProbs");
-
-        PADDLE_ENFORCE_EQ(dist_probs->numel(), num_total_classes);
-        PADDLE_ENFORCE_EQ(dist_alias->numel(), num_total_classes);
-        PADDLE_ENFORCE_EQ(dist_alias_probs->numel(), num_total_classes);
-
-        const float *probs_data = dist_probs->data<float>();
-        const int *alias_data = dist_alias->data<int>();
-        const float *alias_probs_data = dist_alias_probs->data<float>();
-        sampler = new math::CustomSampler(num_total_classes - 1, probs_data,
-                                          alias_data, alias_probs_data, seed);
-        break;
-      }
-      default: { PADDLE_THROW("Unsupported SamplerType."); }
-    }
-
-    PrepareSamples<DeviceContext, T>(context, sampler);
-    auto sample_labels = context.Output<Tensor>("SampleLabels");
-    const int64_t *sample_labels_data = sample_labels->data<int64_t>();
-
-    for (int x = 0; x < sample_labels->numel(); x++) {
-      PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x);
-    }
-
-    auto sample_out = context.Output<Tensor>("SampleLogits");
-    T *sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
-    auto label = context.Input<Tensor>("Label");
-    auto sample_weight = context.Input<Tensor>("SampleWeight");
-    const T *sample_weight_data = nullptr;
-    if (sample_weight != nullptr) {
-      sample_weight_data = sample_weight->data<T>();
-    }
-    auto out = context.Output<Tensor>("Cost");
-    T *out_data = out->mutable_data<T>(context.GetPlace());
-    int64_t num_true_class = 1;
-    if (label != nullptr) {
-      num_true_class = label->dims()[1];
-    }
-    int64_t sampled_labels_num = sample_labels->dims()[1];
-    //    T b = 1. / num_total_classes * num_neg_samples;
-    // forward bias
-    auto bias = context.Input<Tensor>("Bias");
-    if (bias != nullptr) {
-      const T *bias_data = bias->data<T>();
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        sample_out_data[i] = bias_data[sample_labels_data[i]];
-      }
-    } else {
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        sample_out_data[i] = 0;
-      }
-    }
-    // forward mul
-    auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
-
-    // for remote prefetch
-    auto remote_prefetch = context.Attr<bool>("remote_prefetch");
-    auto epmap = context.Attr<std::vector<std::string>>("epmap");
-
-    if (remote_prefetch && !epmap.empty()) {
-      // if epmap is not empty, then the parameter will be fetched from remote
-      // parameter
-      // server
-
-      std::vector<int64_t> labels;
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        labels.push_back(sample_labels_data[i]);
-      }
-      std::set<T> st(labels.begin(), labels.end());
-      labels.assign(st.begin(), st.end());
-
-      framework::Scope &local_scope = context.scope().NewScope();
-
-      auto height_sections =
-          context.Attr<std::vector<int64_t>>("height_sections");
-      auto table_names = context.Attr<std::vector<std::string>>("table_names");
-
-      auto *ids = local_scope.Var("Ids@Prefetch");
-      auto *x_tensor = ids->GetMutable<framework::LoDTensor>();
-      x_tensor->mutable_data<int64_t>(
-          framework::make_ddim({static_cast<int64_t>(labels.size()), 1}),
-          context.GetPlace());
-      // copy.
-      std::memcpy(x_tensor->data<int64_t>(), labels.data(),
-                  labels.size() * sizeof(int64_t));
-
-      std::vector<int> w_dims = paddle::framework::vectorize<int>(
-          context.Input<Tensor>("Weight")->dims());
-      w_dims[0] = static_cast<int>(labels.size());
-
-      auto *w_tensor = local_scope.Var("Weight@Prefetch")
-                           ->GetMutable<framework::LoDTensor>();
-      w_tensor->Resize(framework::make_ddim(w_dims));
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-      auto weight = context.Inputs("Weight").front();
-      operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch",
-                                       weight, false, table_names, epmap,
-                                       height_sections, context, local_scope);
-#else
-      PADDLE_THROW(
-          "paddle is not compiled with distribute support, can not do "
-          "parameter prefetch!");
-#endif
-
-      auto weight_mat = EigenMatrix<T>::From(
-          (local_scope.Var("Weight@Prefetch")->Get<framework::LoDTensor>()));
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        std::vector<int64_t>::iterator it =
-            std::find(labels.begin(), labels.end(), sample_labels_data[i]);
-        int idx = std::distance(labels.begin(), it);
-
-        Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
-            (input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
-             weight_mat.chip(idx, 0))
-                .sum();
-        sample_out_data[i] += result(0);
-        sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
-      }
-      context.scope().DeleteScope(&local_scope);
-    } else {
-      auto weight_mat =
-          EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
-            (input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
-             weight_mat.chip(sample_labels_data[i], 0))
-                .sum();
-        sample_out_data[i] += result(0);
-        sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
-      }
-    }
-
-    // forward cost
-    for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) {
-      out_data[i] = 0;
-      T w = sample_weight == nullptr ? 1. : sample_weight_data[i];
-      for (int64_t j = 0; j < sampled_labels_num; ++j) {
-        int64_t target = sample_labels_data[i * sampled_labels_num + j];
-        T o = sample_out_data[i * sampled_labels_num + j];
-        float b = sampler->Probability(target) * num_neg_samples;
-        T cost = (j < num_true_class) ? -log(o / (o + b)) : -log(b / (o + b));
-        out_data[i] += w * cost;
-      }
-    }
-    delete sampler;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class NCEGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto d_out = context.Input<Tensor>(framework::GradVarName("Cost"));
-    const T *d_out_data = d_out->data<T>();
-    auto label = context.Input<Tensor>("Label");
-    auto sample_out = context.Input<Tensor>("SampleLogits");
-    const T *sample_out_data = sample_out->data<T>();
-    auto sample_labels = context.Input<Tensor>("SampleLabels");
-    const int64_t *sample_labels_data = sample_labels->data<int64_t>();
-    auto sample_weight = context.Input<Tensor>("SampleWeight");
-    const T *sample_weight_data = nullptr;
-    if (sample_weight != nullptr) {
-      sample_weight_data = sample_weight->data<T>();
-    }
-    int num_neg_samples = context.Attr<int>("num_neg_samples");
-    int num_total_classes = context.Attr<int>("num_total_classes");
-    int num_true_class = 1;
-    if (label != nullptr) {
-      num_true_class = label->dims()[1];
-    }
-
-    int sampler_type = context.Attr<int>("sampler");
-    int seed = context.Attr<int>("seed");
-    Sampler *sampler;
-    switch (sampler_type) {
-      case 0: {
-        sampler = new math::UniformSampler(num_total_classes - 1, seed);
-        break;
-      }
-      case 1: {
-        sampler = new math::LogUniformSampler(num_total_classes - 1, seed);
-        break;
-      }
-      case 2: {
-        auto dist_probs = context.Input<Tensor>("CustomDistProbs");
-        auto dist_alias = context.Input<Tensor>("CustomDistAlias");
-        auto dist_alias_probs = context.Input<Tensor>("CustomDistAliasProbs");
-
-        PADDLE_ENFORCE_EQ(dist_probs->numel(), num_total_classes);
-        PADDLE_ENFORCE_EQ(dist_alias->numel(), num_total_classes);
-        PADDLE_ENFORCE_EQ(dist_alias_probs->numel(), num_total_classes);
-
-        const float *probs_data = dist_probs->data<float>();
-        const int *alias_data = dist_alias->data<int>();
-        const float *alias_probs_data = dist_alias_probs->data<float>();
-        sampler = new math::CustomSampler(num_total_classes - 1, probs_data,
-                                          alias_data, alias_probs_data, seed);
-        break;
-      }
-      default: { PADDLE_THROW("Unsupported SamplerType."); }
-    }
-
-    //    T b = 1. / num_total_classes * num_neg_samples;
-    Tensor sample_grad;  // tmp tensor
-    T *sample_grad_data =
-        sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace());
-    // backward cost
-    for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-      int64_t label_idx = i % sample_labels->dims()[1];
-      int64_t sample_idx = i / sample_labels->dims()[1];
-      float b = sampler->Probability(sample_labels_data[i]) * num_neg_samples;
-      T o = sample_out_data[i];
-      T w = sample_weight == nullptr ? 1 : sample_weight_data[sample_idx];
-      sample_grad_data[i] = label_idx < num_true_class
-                                ? w * (b / (o + b)) * (o - 1)
-                                : w * (o * (1 - o) / (o + b));
-      sample_grad_data[i] *= d_out_data[sample_idx];
-    }
-
-    // get d_bias
-    auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
-    if (d_bias != nullptr) {
-      T *d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
-      std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
-      }
-    }
-
-    bool is_sparse = context.Attr<bool>("is_sparse");
-
-    if (!is_sparse) {
-      // get d_w
-      auto d_w = context.Output<Tensor>(framework::GradVarName("Weight"));
-      if (d_w != nullptr) {
-        auto d_w_data = d_w->mutable_data<T>(context.GetPlace());
-        std::fill(d_w_data, d_w_data + d_w->numel(), 0.0);
-        auto d_w_matrix = EigenMatrix<T>::From(*d_w);
-        auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
-        for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-          d_w_matrix.chip(sample_labels_data[i], 0) +=
-              x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
-              sample_grad_data[i];
-        }
-      }
-    } else {
-      std::vector<int64_t> labels;
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        labels.push_back(sample_labels_data[i]);
-      }
-      std::set<T> st(labels.begin(), labels.end());
-      labels.assign(st.begin(), st.end());
-
-      auto *table_var = context.InputVar("Weight");
-      DDim table_dim;
-      if (table_var->IsType<LoDTensor>()) {
-        table_dim = context.Input<LoDTensor>("Weight")->dims();
-      } else if (table_var->IsType<SelectedRows>()) {
-        auto *table_t = context.Input<SelectedRows>("Weight");
-        table_dim = table_t->value().dims();
-      } else {
-        PADDLE_THROW(
-            "The parameter Weight of a NCE_OP "
-            "must be either LoDTensor or SelectedRows");
-      }
-
-      auto d_w = context.Output<SelectedRows>(framework::GradVarName("Weight"));
-
-      d_w->set_rows(labels);
-      d_w->set_height(table_dim[0]);
-
-      auto *d_table_value = d_w->mutable_value();
-      d_table_value->Resize(
-          {static_cast<int64_t>(labels.size()), table_dim[1]});
-      auto d_w_data = d_table_value->mutable_data<T>(context.GetPlace());
-      std::fill(d_w_data, d_w_data + d_table_value->numel(), 0.0);
-
-      auto d_w_matrix = EigenMatrix<T>::From(*d_table_value);
-      auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        d_w_matrix.chip(d_w->Index(sample_labels_data[i]), 0) +=
-            x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
-            sample_grad_data[i];
-      }
-    }
-
-    // get d_x
-    auto d_x = context.Output<Tensor>(framework::GradVarName("Input"));
-    if (d_x != nullptr) {
-      auto *d_x_data = d_x->mutable_data<T>(context.GetPlace());
-      std::fill(d_x_data, d_x_data + d_x->numel(), 0.0);
-      auto d_x_matrix = EigenMatrix<T>::From(*d_x);
-      auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        d_x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) +=
-            w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
-      }
-    }
-
-    delete sampler;
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt
deleted file mode 100644
index 7559d29ce233dfcebf8b3118b4c700c35fe15d32..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-if(WITH_NGRAPH)
-  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-  cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
-  op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
-  add_subdirectory(ops)
-endif()
diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
deleted file mode 100644
index 9ea7db2a6769dfd9840bcbaae9b369c42efac84a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <unordered_set>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
-#include "paddle/fluid/operators/ngraph/ngraph_ops.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-constexpr int64_t kNoPadding = -1;
-
-namespace paddle {
-namespace operators {
-
-bool NgraphBridge::isRegister(const std::string& str) {
-  return ops::NgraphSingleton::Lookup(str);
-}
-
-bool NgraphBridge::isSupported(
-    const std::unique_ptr<framework::OperatorBase>& op) {
-  static std::unordered_set<std::string> skip_op_list{
-      "reshape", "reshape2", "lookup_table", "lookup_table_grad"};
-  bool result = true;
-  auto& op_type = op->Type();
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  if (!isRegister(op_type)) {
-    if (skip_op_list.count(op_type)) {
-      if (op_type == "lookup_table" || op_type == "lookup_table_grad") {
-        if (op_attrs.Get<bool>("is_sparse")) {
-          result = false;
-        }
-      } else if ((op_type == "reshape") || (op_type == "reshape2")) {
-        if (op->Input("Shape") != paddle::framework::kEmptyVarName) {
-          result = false;
-        }
-      } else {
-        result = false;
-      }
-    }
-  } else {
-    result = false;
-  }
-  return result;
-}
-
-void NgraphBridge::BuildNgNode(
-    const std::shared_ptr<framework::OperatorBase>& op) {
-  auto& op_type = op->Type();
-  ops::NgraphSingleton::BuildNode(ngb_node_map_, op, op_type);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h
deleted file mode 100644
index 0b43ec53874d962699abef3cf843c5518d6f072d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/node.hpp"
-
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-class NgraphBridge {
- public:
-  explicit NgraphBridge(
-      std::shared_ptr<
-          std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-          var_node_map)
-      : ngb_node_map_(var_node_map) {}
-
-  void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);
-
-  static bool isRegister(const std::string& str);
-
-  static bool isSupported(const std::unique_ptr<framework::OperatorBase>& op);
-
- private:
-  std::shared_ptr<
-      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      ngb_node_map_;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc
deleted file mode 100644
index 3c53c87c6ff4795c28be9eedc2f3e870e0a20916..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ngraph_engine.cc
+++ /dev/null
@@ -1,631 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
-#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
-
-namespace paddle {
-namespace operators {
-
-static ngraph::Shape Ddim2Shape(const framework::DDim& dims) {
-  ngraph::Shape sp;
-  for (int i = 0; i < dims.size(); ++i) {
-    sp.emplace_back(dims[i]);
-  }
-  return sp;
-}
-
-static framework::DDim Shape2Ddim(const ngraph::Shape& shape) {
-  std::vector<int64_t> dims;
-  for (size_t i = 0; i < shape.size(); ++i) {
-    int64_t k = shape[i];
-    dims.emplace_back(k);
-  }
-  return framework::make_ddim(dims);
-}
-
-static std::map<framework::proto::VarType::Type, ngraph::element::Type>
-    pd2ng_type_map = {
-        {framework::proto::VarType::FP32, ngraph::element::f32},
-        {framework::proto::VarType::FP64, ngraph::element::f64},
-        {framework::proto::VarType::INT32, ngraph::element::i32},
-        {framework::proto::VarType::INT64, ngraph::element::i64},
-        {framework::proto::VarType::UINT8, ngraph::element::u8},
-        {framework::proto::VarType::BOOL, ngraph::element::boolean}};
-
-static std::map<ngraph::element::Type, framework::proto::VarType::Type>
-    ng2pd_type_map = {
-        {ngraph::element::f32, framework::proto::VarType::FP32},
-        {ngraph::element::f64, framework::proto::VarType::FP64},
-        {ngraph::element::i32, framework::proto::VarType::INT32},
-        {ngraph::element::i64, framework::proto::VarType::INT64},
-        {ngraph::element::u8, framework::proto::VarType::UINT8},
-        {ngraph::element::boolean, framework::proto::VarType::BOOL}};
-
-std::vector<std::string> NgraphEngine::feed_vars = {};
-
-std::weak_ptr<ngraph::runtime::Backend> NgraphEngine::wp_backend_;
-
-std::mutex NgraphEngine::ng_mutex_;
-
-static std::vector<std::vector<int>> NgraphOpIntervals(
-    std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
-  NgraphEngine::feed_vars.clear();
-  std::vector<std::vector<int>> intervals;
-
-  int size = ops->size();
-  int left = 0, feed_idx = -1;
-  while (left < size && ops->at(left)->Type() != framework::kFeedOpType &&
-         ops->at(left)->Type() != "read" &&
-         ops->at(left)->Type() != framework::kFetchOpType) {
-    ++left;
-  }
-
-  if (left < size) {
-    auto op_type = ops->at(left)->Type();
-    if (op_type == framework::kFeedOpType || op_type == "read") {
-      feed_idx = left;
-    }
-  }
-
-  while (left < size && (ops->at(left)->Type() == framework::kFeedOpType ||
-                         ops->at(left)->Type() == "read")) {
-    for (auto& var_name_item : ops->at(left)->Outputs()) {
-      for (auto& var_name : var_name_item.second) {
-        NgraphEngine::feed_vars.emplace_back(var_name);
-      }
-    }
-    ++left;
-  }
-
-  int right = left;
-  while (right < size && ops->at(right)->Type() != framework::kFetchOpType) {
-    ++right;
-  }
-
-  int index = right;
-  while (index < size && ops->at(index)->Type() == framework::kFetchOpType) {
-    ++index;
-  }
-
-  if (left == size || ops->at(left)->Type() == framework::kFetchOpType) {
-    left = 0;
-  }
-
-  // (left, right - 1) represents indices between feed and fetch
-  int pivot = left;
-  while (pivot < right) {
-    auto op_type = ops->at(pivot)->Type();
-    if (!NgraphBridge::isSupported(ops->at(pivot))) {
-      ++pivot;
-    } else {
-      int start = pivot, end = start;
-      while (pivot < right && (NgraphBridge::isSupported(ops->at(pivot)))) {
-        ++pivot;
-        ++end;
-      }
-      std::vector<int> interval = {start, end};
-      if (feed_idx != -1 && start > feed_idx) {
-        intervals.emplace_back(interval);
-      }
-    }
-  }  // end while
-  return intervals;
-}
-
-static void SubstituteNgraphOp(
-    std::vector<std::unique_ptr<framework::OperatorBase>>* ops,
-    std::string engine_key, std::string block_str, std::vector<int> interval) {
-  framework::OpDesc ng_op_desc(nullptr);
-  ng_op_desc.SetType("ngraph_engine");
-  ng_op_desc.SetAttr("interval", interval);
-  ng_op_desc.SetAttr("engine_key", engine_key);
-  ng_op_desc.SetAttr("graph", block_str);
-  ng_op_desc.SetInput("Xs", std::vector<std::string>(0));
-  ng_op_desc.SetOutput("Ys", std::vector<std::string>(0));
-
-  ops->erase(ops->begin() + interval[0], ops->begin() + interval[1]);
-  ops->insert(ops->begin() + interval[0],
-              framework::OpRegistry::CreateOp(ng_op_desc));
-}
-
-std::string SerializedBlock(const framework::BlockDesc& bdesc) {
-  framework::proto::BlockDesc block_proto;
-  framework::BlockDesc block_desc(nullptr, &block_proto);
-  block_desc.Proto()->set_parent_idx(-1);
-  block_desc.Proto()->set_idx(0);
-
-  for (auto& op_desc : bdesc.AllOps()) {
-    auto* op = block_desc.AppendOp();
-    *op->Proto() = *op_desc->Proto();
-  }
-
-  auto* vars = block_desc.Proto()->mutable_vars();
-  for (auto& var_desc : bdesc.AllVars()) {
-    *vars->Add() = *var_desc->Proto();
-  }
-
-  return block_desc.Proto()->SerializeAsString();
-}
-
-std::string GenerateEngineKey(const framework::BlockDesc& bdesc) {
-  framework::proto::BlockDesc block_proto;
-  framework::BlockDesc block_desc(nullptr, &block_proto);
-  block_desc.Proto()->set_parent_idx(-1);
-  block_desc.Proto()->set_idx(0);
-
-  for (auto& op_desc : bdesc.AllOps()) {
-    auto* op = block_desc.AppendOp();
-    *op->Proto() = *op_desc->Proto();
-  }
-  auto engine_key = std::to_string(
-      std::hash<std::string>()(block_desc.Proto()->SerializeAsString()));
-  return engine_key;
-}
-
-std::string GenerateEngineKey(const std::vector<std::string>& engine_inputs,
-                              const std::vector<std::string>& engine_outputs,
-                              int size) {
-  std::string engine_hash_key = "";
-  for (auto name : engine_inputs) {
-    engine_hash_key += name;
-  }
-  for (auto name : engine_outputs) {
-    engine_hash_key += name;
-  }
-  engine_hash_key += std::to_string(size);
-  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
-  return engine_key;
-}
-
-void NgraphEngine::FuseNgraphOps(
-    const framework::BlockDesc& block_desc,
-    std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
-  auto intervals = NgraphOpIntervals(ops);
-  std::string serialized_block = SerializedBlock(block_desc);
-  std::string engine_key =
-      std::to_string(std::hash<std::string>()(serialized_block));
-  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
-    SubstituteNgraphOp(ops, engine_key, serialized_block, *it);
-  }
-}
-
-NgraphEngine::NgraphEngine(const framework::Scope& scope,
-                           const platform::Place& place,
-                           const framework::ExecutionContext& ctx)
-    : scope_(scope), place_(place) {
-  var_in_node_map_ = std::make_shared<
-      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
-
-  var_node_map_ = std::make_shared<
-      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
-
-  std::lock_guard<std::mutex> lock(ng_mutex_);
-
-  if (!wp_backend_.lock()) {
-    try {
-      VLOG(3) << "ngraph creating CPU  backend.";
-      backend_ = ngraph::runtime::Backend::create("CPU");
-    } catch (...) {
-      PADDLE_THROW("Unsupported nGraph backend");
-    }
-    wp_backend_ = backend_;
-  } else {
-    backend_ = wp_backend_.lock();
-  }
-
-  GetNgFunction(ctx);
-}
-
-void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) {
-  auto interval = ctx.Attr<std::vector<int>>("interval");
-  std::string serialized_graph = ctx.Attr<std::string>("graph");
-
-  framework::proto::BlockDesc block_proto;
-  if (!serialized_graph.empty()) block_proto.ParseFromString(serialized_graph);
-  framework::BlockDesc block_desc(nullptr, &block_proto);
-
-  for (auto& var : block_desc.AllVars()) {
-    if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
-          var->GetType() == framework::proto::VarType::LOD_TENSOR ||
-          var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) {
-      continue;
-    }
-
-    auto var_name = var->Name();
-    if (var->Name() == framework::kEmptyVarName) {
-      continue;
-    }
-
-    if (var_name != framework::kFeedOpType &&
-        var_name != framework::kFetchOpType) {
-      auto pd_type = var->GetDataType();
-      if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
-        PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
-                     var_name);
-      }
-      var_type_map_[var_name] = pd2ng_type_map[pd_type];
-    }
-
-    if (var->Persistable()) {
-      persistables_.insert(var->Name());
-    }
-  }
-
-  std::vector<paddle::framework::OpDesc*> ops_desc;
-  for (auto op_desc : block_desc.AllOps()) {
-    ops_desc.emplace_back(op_desc);
-    if (op_desc->Type().find("_grad") != std::string::npos) {
-      this->is_test_ = false;
-    }
-  }
-
-  int idx = interval[0];
-  while (idx < interval[1]) {
-    this->fused_ops_.emplace_back(
-        framework::OpRegistry::CreateOp(*(ops_desc[idx])));
-    ++idx;
-  }
-  while (idx < static_cast<int>(ops_desc.size())) {
-    auto op_desc = ops_desc.at(idx);
-    for (auto& var_name_item : op_desc->Inputs()) {
-      for (auto& var_name : var_name_item.second) {
-        this->post_op_inputs_.insert(var_name);
-      }
-    }
-    ++idx;
-  }
-
-  auto input_vars = ctx.Inputs("Xs");
-  if (!input_vars.empty()) {
-    feed_vars = input_vars;
-    var_in_ = input_vars;
-  }
-
-  auto output_vars = ctx.Outputs("Ys");
-  if (!output_vars.empty()) {
-    var_out_ = output_vars;
-  }
-
-  if (var_in_.empty() && var_out_.empty()) {
-    BuildNgIO(ops_desc, interval);
-  }
-
-  for (size_t i = 0; i < var_in_.size(); ++i) {
-    auto var_name = var_in_[i];
-    if (persistables_.find(var_name) == persistables_.end()) {
-      var_in_updates_.emplace_back(i);
-    }
-  }
-}
-
-void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
-                             const std::vector<int>& interval) {
-  std::unordered_set<std::string> inputs;
-  std::unordered_set<std::string> outputs;
-
-  for (int i = interval[0]; i < interval[1]; ++i) {
-    auto op = ops_desc[i];
-    for (auto& var_name_item : op->Inputs()) {
-      for (auto& var_name : var_name_item.second) {
-        inputs.insert(var_name);
-        const bool is_output = outputs.find(var_name) != outputs.end();
-        if (!is_output &&
-            std::find(var_in_.begin(), var_in_.end(), var_name) ==
-                var_in_.end() &&
-            scope_.FindVar(var_name)) {
-          // fill var_in here to keep lhs and rhs order
-          this->var_in_.emplace_back(var_name);
-        }
-      }
-    }
-
-    for (auto& var_name_item : op->Outputs()) {
-      PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
-                        "op %s has more than 1 output - Not handling yet",
-                        op->Type());
-      for (auto& var_name : var_name_item.second) {
-        outputs.insert(var_name);
-      }
-    }
-  }
-
-  // var_out.clear();
-  for (int i = interval[0]; i < interval[1]; ++i) {
-    auto op = ops_desc[i];
-    for (auto& var_name_item : op->Outputs()) {
-      PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
-                        "op %s has more than 1 output - Not handling yet",
-                        op->Type());
-      for (auto& var_name : var_name_item.second) {
-        if (this->is_test_) {
-          if (post_op_inputs_.find(var_name) != post_op_inputs_.end()) {
-            this->var_out_.emplace_back(var_name);
-          }
-        } else {
-          if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
-              persistables_.find(var_name) != persistables_.end()) {
-            this->var_out_.emplace_back(var_name);
-          }
-        }
-      }
-    }
-  }
-  // remove output duplicates
-  std::unordered_set<std::string> var_out_set;
-  for (int i = static_cast<int>(var_out_.size()) - 1; i >= 0; --i) {
-    std::string var_name = var_out_.at(i);
-    if (var_out_set.count(var_name)) {
-      var_out_.erase(var_out_.begin() + i);
-    }
-    var_out_set.insert(var_name);
-  }
-}
-
-void NgraphEngine::GetNgInputShape() {
-  for (auto& var_name : var_in_) {
-    auto* var = scope_.FindVar(var_name);
-    if (var && var->IsType<framework::LoDTensor>()) {
-      auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-      auto sp = Ddim2Shape(tensor_pd->dims());
-      auto ng_type = var_type_map_[var_name];
-      auto prm = std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
-      (*var_node_map_)[var_name] = prm;
-      (*var_in_node_map_)[var_name] = prm;
-    }
-  }
-}
-
-void NgraphEngine::BuildNgNodes() {
-  for (auto& op : fused_ops_) {
-    for (auto& var_name_item : op->Outputs()) {
-      for (auto& var_name : var_name_item.second) {
-        if (var_node_map_->find(var_name) == var_node_map_->end()) {
-          auto* var = scope_.FindVar(var_name);
-          if (var && var->IsType<framework::LoDTensor>()) {
-            auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-            auto& ddim = tensor_pd->dims();
-            auto ng_shape = Ddim2Shape(ddim);
-            auto ng_type = var_type_map_[var_name];
-            auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
-                                                               ng_shape, true);
-            (*var_node_map_)[var_name] = prm;
-          }
-        }
-      }
-    }
-  }
-  NgraphBridge ngb(var_node_map_);
-  for (auto& op : fused_ops_) {
-    ngb.BuildNgNode(op);
-  }
-}
-
-std::shared_ptr<ngraph::Function> NgraphEngine::BuildNgFunction(
-    const framework::ExecutionContext& ctx) {
-  Prepare(ctx);
-  GetNgInputShape();
-  BuildNgNodes();
-  ngraph::NodeVector func_outputs;
-  ngraph::ParameterVector func_inputs;
-
-  for (auto& vo : var_out_) {
-    PADDLE_ENFORCE_GT(var_node_map_->count(vo), 0,
-                      "Cannot find vo %s in var_node_map_", vo);
-    func_outputs.emplace_back(var_node_map_->at(vo));
-  }
-
-  for (auto& vi : var_in_) {
-    PADDLE_ENFORCE_GT(var_node_map_->count(vi), 0,
-                      "Cannot find vi %s in var_node_map_", vi);
-    std::shared_ptr<ngraph::op::Parameter> prm =
-        std::dynamic_pointer_cast<ngraph::op::Parameter>(
-            var_in_node_map_->at(vi));
-    func_inputs.emplace_back(prm);
-  }
-
-  return std::make_shared<ngraph::Function>(func_outputs, func_inputs);
-}
-
-void NgraphEngine::ClearNgCache() {
-  auto& engine_cache = main_engine_cache::fetch();
-  auto& t_in_cache_ = main_t_in_cache::fetch();
-
-  auto it = engine_cache.begin();
-  while (it != engine_cache.end()) {
-    auto ng_engine = it->second;
-    ng_engine.ngraph_backend->remove_compiled_function(ng_engine.ngraph_handle);
-    ng_engine.ngraph_backend.reset();
-    ++it;
-  }
-  engine_cache.clear();
-  auto it_tensor = t_in_cache_.begin();
-  while (it_tensor != t_in_cache_.end()) {
-    auto t_vec = it_tensor->second;
-    for (auto t_in : t_vec) {
-      t_in.reset();
-    }
-    ++it_tensor;
-  }
-  t_in_cache_.clear();
-}
-
-void NgraphEngine::GetNgFunction(const framework::ExecutionContext& ctx) {
-  auto interval = ctx.Attr<std::vector<int>>("interval");
-  std::string engine_key = ctx.Attr<std::string>("engine_key");
-
-  // set to flase, to debug cache or recompile everytime.
-  bool use_cache = true;
-  if (!use_cache) ClearNgCache();
-
-  this->func_cache_key_ = "";
-  for (int i = 0; i < static_cast<int>(feed_vars.size()); ++i) {
-    auto* var = scope_.FindVar(feed_vars[i]);
-    if (var && var->IsType<framework::LoDTensor>()) {
-      auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-      auto dims = tensor_pd->dims();
-      for (int j = 0; j < dims.size(); ++j) {
-        func_cache_key_ += std::to_string(dims[j]);
-      }
-    }
-  }
-  func_cache_key_ += std::to_string(interval[0]) + "_" +
-                     std::to_string(interval[1]) + engine_key;
-  func_cache_key_ = std::to_string(std::hash<std::string>()(func_cache_key_));
-
-  auto& engine_cache = main_engine_cache::fetch();
-
-  if (engine_cache.find(func_cache_key_) != engine_cache.end()) {
-    if (engine_cache[func_cache_key_].persistables.size() == 0) {
-      ClearNgCache();
-    }
-  }
-
-  if (engine_cache.find(func_cache_key_) == engine_cache.end()) {
-    if (engine_cache.size() > 5) ClearNgCache();
-    auto func = BuildNgFunction(ctx);
-    // Due to optimization backend may produce results in other layouts,
-    // make sure we get default layout for results.
-    for (auto& r : func->get_results()) {
-      r->set_needs_default_layout(true);
-    }
-    engine_cache[func_cache_key_].ngraph_backend = backend_;
-    engine_cache[func_cache_key_].ngraph_handle = backend_->compile(func);
-    engine_cache[func_cache_key_].persistables = this->persistables_;
-    engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_;
-    engine_cache[func_cache_key_].var_in = this->var_in_;
-    engine_cache[func_cache_key_].var_out = this->var_out_;
-    engine_cache[func_cache_key_].is_test = this->is_test_;
-  }
-}
-
-void NgraphEngine::Run(const framework::Scope& scope,
-                       const platform::Place& place) const {
-  VLOG(3) << "NgraphEngine Run ...";
-  std::shared_ptr<ngraph::runtime::Executable> ng_handle;
-  std::shared_ptr<ngraph::runtime::Backend> ng_backend;
-  const std::set<std::string>* p_persistables;
-  const std::vector<size_t>* p_var_in_updates;
-  const std::vector<std::string>* p_var_in;
-  const std::vector<std::string>* p_var_out;
-
-  auto& engine_cache = main_engine_cache::fetch();
-  auto& t_in_cache_ = main_t_in_cache::fetch();
-
-  PADDLE_ENFORCE_GT(engine_cache.count(func_cache_key_), 0,
-                    "Cannot find cached data to run ngraph function");
-  ng_handle = engine_cache[func_cache_key_].ngraph_handle;
-  ng_backend = engine_cache[func_cache_key_].ngraph_backend;
-  p_persistables = &(engine_cache[func_cache_key_].persistables);
-  p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates);
-  p_var_in = &(engine_cache[func_cache_key_].var_in);
-  p_var_out = &(engine_cache[func_cache_key_].var_out);
-
-  std::vector<std::shared_ptr<ngraph::runtime::Tensor>>* p_t_in;
-  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in = {};
-
-  auto m_parameters = ng_handle->get_parameters();
-  auto m_results = ng_handle->get_results();
-  if (is_inference_ && t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) {
-    p_t_in = &(t_in_cache_[func_cache_key_]);
-    for (size_t i = 0; i < p_var_in_updates->size(); ++i) {
-      int index = p_var_in_updates->at(i);
-      auto vi = p_var_in->at(index);
-      auto sp = m_parameters[index]->get_shape();
-      auto ng_type = m_parameters[index]->get_element_type();
-      std::shared_ptr<ngraph::runtime::Tensor> ti;
-      auto* var = scope.FindVar(vi);
-      if (var && var->IsType<framework::LoDTensor>()) {
-        auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
-        void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
-        ti = ng_backend->create_tensor(ng_type, sp, pd_arr);
-        (*p_t_in)[index] = ti;
-      } else {
-        PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
-      }
-    }
-  } else {
-    if (is_inference_) {
-      p_t_in = &(t_in_cache_[func_cache_key_]);
-    } else {
-      p_t_in = &t_in;
-    }
-
-    for (size_t i = 0; i < p_var_in->size(); ++i) {
-      auto vi = p_var_in->at(i);
-      auto sp = m_parameters[i]->get_shape();
-      auto ng_type = m_parameters[i]->get_element_type();
-      std::shared_ptr<ngraph::runtime::Tensor> ti;
-      auto* var = scope.FindVar(vi);
-      if (var && var->IsType<framework::LoDTensor>()) {
-        auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
-        void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
-        ti = ng_backend->create_tensor(ng_type, sp, pd_arr);
-      } else {
-        PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
-      }
-      bool is_persistable =
-          (p_persistables->find(vi) != p_persistables->end()) ? true : false;
-      if (is_inference_ && is_persistable) {
-        ti->set_stale(false);
-      }
-      (*p_t_in).emplace_back(ti);
-    }
-  }
-
-  for (auto& op : fused_ops_) {
-    framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
-    op->RuntimeInferShape(scope_, place_, ctx);
-  }
-
-  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out = {};
-  for (size_t i = 0; i < p_var_out->size(); ++i) {
-    auto vo = p_var_out->at(i);
-    auto* var = scope.FindVar(vo);
-    if (var && var->IsType<framework::LoDTensor>()) {
-      auto sp = m_results[i]->get_shape();
-      var->GetMutable<framework::LoDTensor>()->Resize(Shape2Ddim(sp));
-      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
-      auto ng_type = m_results[i]->get_element_type();
-      void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
-      std::shared_ptr<ngraph::runtime::Tensor> to =
-          ng_backend->create_tensor(ng_type, sp, pd_arr);
-      t_out.emplace_back(to);
-    } else {
-      PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
-    }
-  }
-
-  ng_handle->call(t_out, *p_t_in);
-}  // NgraphEngine::Run
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h
deleted file mode 100644
index 0fb2d167496b3eabd8e840fe18adb8900d5fb527..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ngraph_engine.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <list>
-#include <memory>
-#include <mutex>  //NOLINT
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/var_desc.h"
-
-#include "ngraph/ngraph.hpp"
-
-namespace paddle {
-namespace operators {
-
-// cache engine repetitives
-struct EngineCache {
-  std::shared_ptr<ngraph::runtime::Executable> ngraph_handle = nullptr;
-  std::shared_ptr<ngraph::runtime::Backend> ngraph_backend = nullptr;
-  std::set<std::string> persistables;
-  std::vector<std::string> var_in;
-  std::vector<std::string> var_out;
-  std::vector<size_t> var_in_updates;
-  bool is_test = true;
-};
-
-template <class T, class Engine, int separator = 0>
-class NgraphThreadCache {
- public:
-  typedef decltype(Engine::getMutex()) mutex_type;
-  typedef std::lock_guard<mutex_type> guard_type;
-  typedef T& ref_type;
-  enum class type_of_thread { unknown, forward, backward };
-
-  template <class S>
-  struct MetaInfo {
-    std::thread::id owner_tid;   // owner of the cache, future use;
-    type_of_thread worker_type;  // future use
-    S real_content;
-    MetaInfo()
-        : owner_tid{std::this_thread::get_id()},
-          worker_type{type_of_thread::unknown} {}
-  };
-
-  typedef std::unique_ptr<MetaInfo<T>> content_type;
-  typedef std::list<content_type> storage_type;
-
- protected:
-  static storage_type l;
-  static mutex_type getMutex() { return Engine::getMutex(); }
-  static void remove_from_list(const T* raw_ptr) {
-    guard_type guard(getMutex());
-    l.remove_if([raw_ptr](const content_type& sh) {
-      return &(sh->real_content) == raw_ptr;
-    });
-  }
-
-  template <class TRaw>
-  struct TLSDescriptor {
-    TRaw* raw_ptr;
-    TLSDescriptor() : raw_ptr{nullptr} {}
-    ~TLSDescriptor() {
-      // if thread die
-      NgraphThreadCache::remove_from_list(raw_ptr);
-
-      /* TODO : Parallel executor swap */
-      // FastMultiThreadCache::keep_alive_for_backward_thread(raw_ptr);
-    }
-  };
-
- public:
-  NgraphThreadCache() = delete;
-  NgraphThreadCache(const NgraphThreadCache& copy) = delete;
-
-  static T& fetch() {
-    thread_local TLSDescriptor<T> tls;
-    if (!tls.raw_ptr) {
-      using elem_type = typename content_type::element_type;
-      content_type _p(new elem_type());
-      if (!_p) PADDLE_THROW("Cannot alloc memory for thread-cache ");
-      guard_type guard(getMutex());
-      l.push_back(std::move(_p));
-      tls.raw_ptr = &l.back()->real_content;
-    }
-    return *(tls.raw_ptr);
-  }
-  auto getSize() -> decltype(l.size()) {
-    guard_type guard(getMutex());
-    return l.size();
-  }
-
-  template <class F>
-  void for_each_cache(F f) {
-    guard_type guard(getMutex());
-    std::for_each(l.begin(), l.end(), f);
-  }
-};
-
-template <class T, class Engine, int separator>
-typename NgraphThreadCache<T, Engine, separator>::storage_type
-    NgraphThreadCache<T, Engine, separator>::l;
-
-// perform graph build through bridge and execute computation
-class NgraphEngine {
- public:
-  explicit NgraphEngine(const framework::Scope& scope,
-                        const platform::Place& place,
-                        const framework::ExecutionContext& ctx);
-
-  void Run(const framework::Scope& scope, const platform::Place& place) const;
-
-  static std::vector<std::string> feed_vars;
-
-  static void FuseNgraphOps(
-      const framework::BlockDesc& prog,
-      std::vector<std::unique_ptr<framework::OperatorBase>>* ops);
-
-  static std::recursive_mutex& getMutex() {
-    static std::recursive_mutex mx;
-    return mx;
-  }
-
- private:
-  template <class T>
-  using ThCache =
-      NgraphThreadCache<std::unordered_map<std::string, T>, NgraphEngine>;
-
-  using main_engine_cache = ThCache<EngineCache>;
-  using main_t_in_cache =
-      ThCache<std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>;
-
-  const framework::Scope& scope_;
-  const platform::Place& place_;
-  std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
-  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
-  std::set<std::string> persistables_;
-  std::unordered_set<std::string> post_op_inputs_;
-  // it is test for a single run, it can be a validation during training
-  bool is_test_{true};
-  // inference only. eg. CAPI inference
-  bool is_inference_{false};
-  std::string func_cache_key_;
-  // use a weak pointer to keep backend_ alive
-  // to avoid it to be destropyed too earlier
-  static std::weak_ptr<ngraph::runtime::Backend> wp_backend_;
-  // use mutex to keep it thread safe
-  static std::mutex ng_mutex_;
-  // ngraph backend eg. CPU
-  std::shared_ptr<ngraph::runtime::Backend> backend_;
-  // var_name of inputs
-  std::vector<std::string> var_in_;
-  // var_name of outputs from  fetch in order
-  std::vector<std::string> var_out_;
-  // non-persitable var_in
-  std::vector<size_t> var_in_updates_;
-  // map input vars to nodes
-  std::shared_ptr<
-      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      var_in_node_map_;
-  // map each var name with a ngraph node
-  std::shared_ptr<
-      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      var_node_map_;
-  // prepare info for ngraph engine need
-  void Prepare(const framework::ExecutionContext& ctx);
-  // get ngraph engine input and output list
-  void BuildNgIO(const std::vector<framework::OpDesc*>& op_descs,
-                 const std::vector<int>& interval);
-  // get ngraph input and define ngraph input parameters
-  void GetNgInputShape();
-  // Call ngraph bridge to map ops
-  void BuildNgNodes();
-  // build ngraph function call
-  std::shared_ptr<ngraph::Function> BuildNgFunction(
-      const framework::ExecutionContext& ctx);
-  // clear ngraph engine cache and t_in cache
-  void ClearNgCache();
-  // Check cache for ngraph function or otherwise build the function
-  void GetNgFunction(const framework::ExecutionContext& ctx);
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
deleted file mode 100644
index 479c95ba08c316be3d1d983ea736fcc505332d6e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <string>
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h"
-
-namespace paddle {
-namespace operators {
-
-class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Xs", "A list of inputs.").AsDispensable();
-    AddOutput("Ys", "A list of outputs").AsDispensable();
-    AddAttr<std::string>("graph", "the graph.");
-    AddAttr<std::string>("engine_key", "the engine hash key.");
-    AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
-    AddComment("ngraph engine operator.");
-  }
-};
-
-class NgraphEngineInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker,
-                  ops::NgraphEngineOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    ngraph_engine,
-    ops::NgraphEngineKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h
deleted file mode 100644
index c9b2a3970e17c1a06fa0cc67aa15df304a30656e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ngraph_engine_op.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-
-class NgraphEngineOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        framework::proto::VarType::FP32, platform::CPUPlace());
-    return kt;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class NgraphEngineKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& scope = ctx.scope();
-    auto place = ctx.GetPlace();
-
-    NgraphEngine ngraph_engine(scope, place, ctx);
-    ngraph_engine.Run(scope, place);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/CMakeLists.txt b/paddle/fluid/operators/ngraph/ops/CMakeLists.txt
deleted file mode 100644
index 7dee3308b74a70a2daf35055d3ac80a14de99ac1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-file(GLOB LIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
-set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/ngraph/ngraph_ops.h)
-file(APPEND ${pass_file} "\#pragma once\n")
-file(WRITE ${pass_file} "// Generated by the /paddle/fluid/operators/ngraph/ops/CMakeLists.txt.  DO NOT EDIT!\n\n")
-
-foreach(OPS_NAME ${LIST_OPS})
-    file(APPEND ${pass_file} "\#include \"paddle/fluid/operators/ngraph/ops/${OPS_NAME}\"\n")
-endforeach(OPS_NAME)
diff --git a/paddle/fluid/operators/ngraph/ops/accuracy_op.h b/paddle/fluid/operators/ngraph/ops/accuracy_op.h
deleted file mode 100644
index 0da57517a733985ce1208732f13b08cd7bb8ca30..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/accuracy_op.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildAccuracyNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto indices = platform::GetInputNode(op, "Indices", ngb_node_map);
-  auto label = platform::GetInputNode(op, "Label", ngb_node_map);
-  auto inference = platform::GetInputNode(op, "Out", ngb_node_map);
-  auto inference_shape = inference->get_shape();
-  size_t num_samples = inference_shape.at(0);
-  size_t k = inference_shape.at(1);
-
-  std::shared_ptr<ngraph::Node> label_k = label;
-  if (k > 1) {
-    auto label_1d = std::make_shared<ngraph::op::Reshape>(
-        label, ngraph::AxisVector{0, 1}, ngraph::Shape{num_samples});
-    label_k = std::make_shared<ngraph::op::Broadcast>(label_1d, inference_shape,
-                                                      ngraph::AxisSet{1});
-  }
-
-  auto node_equal = std::make_shared<ngraph::op::Equal>(indices, label_k);
-  auto node_eq_int =
-      std::make_shared<ngraph::op::Convert>(node_equal, ngraph::element::i64);
-  auto num_correct_0d =
-      std::make_shared<ngraph::op::Sum>(node_eq_int, ngraph::AxisSet{0, 1});
-  std::shared_ptr<ngraph::Node> num_correct =
-      platform::NgReshaper(num_correct_0d, ngraph::Shape{1});
-  std::shared_ptr<ngraph::Node> n_samples = ngraph::op::Constant::create(
-      ngraph::element::i64, ngraph::Shape{1}, {num_samples});
-  std::shared_ptr<ngraph::Node> accuracy = std::make_shared<ngraph::op::Divide>(
-      std::make_shared<ngraph::op::Convert>(num_correct, ngraph::element::f32),
-      std::make_shared<ngraph::op::Convert>(n_samples, ngraph::element::f32));
-
-  platform::SetOutputNode(op, "Accuracy", accuracy, ngb_node_map);
-  platform::SetOutputNode(op, "Correct", num_correct, ngb_node_map);
-  platform::SetOutputNode(op, "Total", n_samples, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(accuracy, BuildAccuracyNode);
diff --git a/paddle/fluid/operators/ngraph/ops/activation_op.h b/paddle/fluid/operators/ngraph/ops/activation_op.h
deleted file mode 100644
index 884ec659267a5386b5715b9f8b38be8900123823..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/activation_op.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildGeluNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "X", ngb_node_map);
-  auto half = paddle::platform::CreateConstant(input->get_element_type(),
-                                               input->get_shape(), {0.5});
-  auto one = paddle::platform::CreateConstant(input->get_element_type(),
-                                              input->get_shape(), {1});
-  auto sqrt_two =
-      std::make_shared<ngraph::op::Sqrt>(paddle::platform::CreateConstant(
-          input->get_element_type(), input->get_shape(), {2}));
-  auto out = half * input *
-             (one + std::make_shared<ngraph::op::Erf>(input / sqrt_two));
-  platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-void BuildGeluGradNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "X", ngb_node_map);
-  auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto half = paddle::platform::CreateConstant(input->get_element_type(),
-                                               input->get_shape(), {0.5});
-  auto minus_half = paddle::platform::CreateConstant(
-      input->get_element_type(), input->get_shape(), {-0.5});
-  auto one = paddle::platform::CreateConstant(input->get_element_type(),
-                                              input->get_shape(), {1});
-  auto two = paddle::platform::CreateConstant(input->get_element_type(),
-                                              input->get_shape(), {2});
-  auto pi = paddle::platform::CreateConstant(
-      input->get_element_type(), input->get_shape(), {3.14159265359});
-  auto sqrt_two = std::make_shared<ngraph::op::Sqrt>(two);
-  auto sqrt_pi = std::make_shared<ngraph::op::Sqrt>(pi);
-
-  auto first =
-      half * (one + std::make_shared<ngraph::op::Erf>(input * one / sqrt_two));
-  auto second = half * (two / sqrt_pi) * (one / sqrt_two) * input *
-                std::make_shared<ngraph::op::Exp>(minus_half * input * input);
-  auto gelu_grad = dout * (first + second);
-  platform::SetOutputNode(op, "X@GRAD", gelu_grad, ngb_node_map);
-}
-
-void BuildReluGradNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto out = platform::GetInputNode(op, "Out", ngb_node_map);
-  auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto relu_grad = std::make_shared<ngraph::op::ReluBackprop>(out, dout);
-  platform::SetOutputNode(op, "X@GRAD", relu_grad, ngb_node_map);
-}
-
-void BuildSquareNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "X", ngb_node_map);
-  auto out = input * input;
-  platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-void BuildTanhGradNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto out = platform::GetInputNode(op, "Out", ngb_node_map);
-  auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto shape = out->get_shape();
-  auto node_const =
-      ngraph::op::Constant::create(ngraph::element::f32, shape, {1});
-  auto result = dout * (node_const - out * out);
-  platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(gelu, BuildGeluNode);
-REGISTER_NG_OP(gelu_grad, BuildGeluGradNode);
-REGISTER_NG_OP(relu_grad, BuildReluGradNode);
-REGISTER_NG_OP(square, BuildSquareNode);
-REGISTER_NG_OP(tanh_grad, BuildTanhGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/adam_op.h b/paddle/fluid/operators/ngraph/ops/adam_op.h
deleted file mode 100644
index beba5d3d237d4dea578651f440b65a15251d5ad2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/adam_op.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildAdamNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto beta1pow = platform::GetInputNode(op, "Beta1Pow", ngb_node_map);
-  auto beta2pow = platform::GetInputNode(op, "Beta2Pow", ngb_node_map);
-  auto grad = platform::GetInputNode(op, "Grad", ngb_node_map);
-  auto learning_rate = platform::GetInputNode(op, "LearningRate", ngb_node_map);
-  auto moment1 = platform::GetInputNode(op, "Moment1", ngb_node_map);
-  auto moment2 = platform::GetInputNode(op, "Moment2", ngb_node_map);
-  auto param = platform::GetInputNode(op, "Param", ngb_node_map);
-
-  auto epsilon = op_attrs.Get<float>("epsilon");
-  auto beta2 = op_attrs.Get<float>("beta2");
-  auto beta1 = op_attrs.Get<float>("beta1");
-
-  auto moment1_shape = moment1->get_shape();
-  auto grad_shape = grad->get_shape();
-
-  auto moment1out = std::make_shared<ngraph::op::Add>(
-      ElementwiseScalar<ngraph::op::Multiply>(beta1, moment1),
-      ElementwiseScalar<ngraph::op::Multiply>(1. - beta1, grad));
-
-  auto grad_square = std::make_shared<ngraph::op::Multiply>(grad, grad);
-  auto moment2out = std::make_shared<ngraph::op::Add>(
-      ElementwiseScalar<ngraph::op::Multiply>(beta2, moment2),
-      ElementwiseScalar<ngraph::op::Multiply>(1. - beta2, grad_square));
-  auto node_sqrt = std::make_shared<ngraph::op::Sqrt>(
-      ElementwiseScalar<ngraph::op::Subtract>(1., beta2pow));
-  auto lr = std::make_shared<ngraph::op::Divide>(
-      node_sqrt, ElementwiseScalar<ngraph::op::Subtract>(1., beta1pow));
-  auto updated_lr = std::make_shared<ngraph::op::Multiply>(learning_rate, lr);
-
-  auto moment2_sqrt = std::make_shared<ngraph::op::Sqrt>(moment2out);
-  auto param_grad = std::make_shared<ngraph::op::Divide>(
-      moment1out, ElementwiseScalar<ngraph::op::Add>(epsilon, moment2_sqrt));
-  auto delta = ElementwiseScalar<ngraph::op::Multiply>(updated_lr, param_grad);
-  auto param_out = std::make_shared<ngraph::op::Subtract>(param, delta);
-
-  platform::SetOutputNode(op, "Moment1Out", moment1out, ngb_node_map);
-  platform::SetOutputNode(op, "Moment2Out", moment2out, ngb_node_map);
-  platform::SetOutputNode(op, "ParamOut", param_out, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(adam, BuildAdamNode);
diff --git a/paddle/fluid/operators/ngraph/ops/assign_op.h b/paddle/fluid/operators/ngraph/ops/assign_op.h
deleted file mode 100644
index 1815c2ee2d551d199f388f6d70ba8064ba0709da..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/assign_op.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-static void BuildAssignNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "X", ngb_node_map);
-  auto out = input;
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(assign, BuildAssignNode);
diff --git a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
deleted file mode 100644
index 01fe78cdb24652429f713d09ea2abb8c73bbddf5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildBatchNormNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  auto& data_layout = op_attrs.Get<std::string>("data_layout");
-
-  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
-  auto mean = paddle::platform::GetInputNode(op, "Mean", ngb_node_map);
-  auto variance = paddle::platform::GetInputNode(op, "Variance", ngb_node_map);
-  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-
-  const bool is_test = op_attrs.Get<bool>("is_test");
-  const float epsilon = op_attrs.Get<float>("epsilon");
-  const float momentum = op_attrs.Get<float>("momentum");
-
-  PADDLE_ENFORCE(
-      data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC",
-      "The BatchNorm operator only supports NHWC/NCHW/NC data format");
-
-  if (data_layout == "NHWC") {
-    x = paddle::platform::Nhwc2Nchw(x);
-  }
-
-  std::shared_ptr<ngraph::Node> mean_out, saved_mean, saved_variance,
-      variance_out, y;
-
-  if (!is_test) {
-    auto BN = std::make_shared<ngraph::op::BatchNormTraining>(epsilon, scale,
-                                                              bias, x);
-    y = std::make_shared<ngraph::op::GetOutputElement>(BN, 0);
-    saved_mean = std::make_shared<ngraph::op::GetOutputElement>(BN, 1);
-    saved_variance = std::make_shared<ngraph::op::GetOutputElement>(BN, 2);
-
-    mean_out = std::make_shared<ngraph::op::Add>(
-        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
-            momentum, mean),
-        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
-            1. - momentum, saved_mean));
-    variance_out = std::make_shared<ngraph::op::Add>(
-        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
-            momentum, variance),
-        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
-            1. - momentum, saved_variance));
-
-    if (data_layout == "NHWC") {
-      y = paddle::platform::Nchw2Nhwc(y);
-    }
-
-    paddle::platform::SetOutputNode(op, "MeanOut", mean_out, ngb_node_map);
-    paddle::platform::SetOutputNode(op, "VarianceOut", variance_out,
-                                    ngb_node_map);
-    paddle::platform::SetOutputNode(op, "SavedMean", saved_mean, ngb_node_map);
-    paddle::platform::SetOutputNode(op, "SavedVariance", saved_variance,
-                                    ngb_node_map);
-    paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
-  } else {
-    y = std::make_shared<ngraph::op::BatchNormInference>(epsilon, scale, bias,
-                                                         x, mean, variance);
-    paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
-  }
-}
-
-void BuildBatchNormGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  auto& data_layout = op_attrs.Get<std::string>("data_layout");
-
-  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
-  auto saved_mean =
-      paddle::platform::GetInputNode(op, "SavedMean", ngb_node_map);
-  auto saved_variance =
-      paddle::platform::GetInputNode(op, "SavedVariance", ngb_node_map);
-  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
-  auto x_shape = x->get_shape();
-  auto dy_shape = dy->get_shape();
-
-  PADDLE_ENFORCE(x_shape.size() == 2 || x_shape.size() == 4,
-                 "BN grap input size needs to be 2 or 4");
-  PADDLE_ENFORCE_EQ(x_shape.size(), dy_shape.size(),
-                    "BN grap input and delta size needs to be equal");
-  PADDLE_ENFORCE(
-      data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC",
-      "The BatchNorm operator only supports NHWC/NCHW/NC data format");
-
-  if (x_shape.size() == 2) {
-    x = std::make_shared<ngraph::op::Reshape>(
-        x, ngraph::AxisVector{0, 1},
-        ngraph::Shape{x_shape.at(0), x_shape.at(1), 1, 1});
-    dy = std::make_shared<ngraph::op::Reshape>(
-        dy, ngraph::AxisVector{0, 1},
-        ngraph::Shape{dy_shape.at(0), dy_shape.at(1), 1, 1});
-  }
-
-  if (data_layout == "NHWC") {
-    x = paddle::platform::Nhwc2Nchw(dy);
-    dy = paddle::platform::Nhwc2Nchw(dy);
-  }
-  const float epsilon = op_attrs.Get<float>("epsilon");
-
-  auto bn_bprop = std::make_shared<ngraph::op::BatchNormTrainingBackprop>(
-      epsilon, scale, bias, x, saved_mean, saved_variance, dy);
-
-  std::shared_ptr<ngraph::Node> dx =
-      std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 0);
-  auto dscale = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 1);
-  auto dbias = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 2);
-  paddle::platform::SetOutputNode(op, "Bias@GRAD", dbias, ngb_node_map);
-  paddle::platform::SetOutputNode(op, "Scale@GRAD", dscale, ngb_node_map);
-  if (x_shape.size() == 2) {
-    paddle::platform::SetOutputNode(
-        op, "X@GRAD", paddle::platform::NgReshaper(dx, x_shape), ngb_node_map);
-  } else {
-    if (data_layout == "NHWC") {
-      dx = paddle::platform::Nchw2Nhwc(dx);
-    }
-    paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-  }
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(batch_norm, BuildBatchNormNode);
-REGISTER_NG_OP(batch_norm_grad, BuildBatchNormGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
deleted file mode 100644
index b8e9f3d85847e2441057a1041c55a1046ff15cee..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-template <typename T>
-static void BuildBinaryNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
-  auto out = std::make_shared<T>(x, y);
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-template <typename T>
-static void BuildUnaryNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto out = std::make_shared<T>(input);
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(abs, BuildUnaryNode<ngraph::op::Abs>);
-REGISTER_NG_OP(relu, BuildUnaryNode<ngraph::op::Relu>);
-REGISTER_NG_OP(tanh, BuildUnaryNode<ngraph::op::Tanh>);
-REGISTER_NG_OP(sigmoid, BuildUnaryNode<ngraph::op::Sigmoid>);
-
-REGISTER_NG_OP(logical_and, BuildBinaryNode<ngraph::op::And>);
-REGISTER_NG_OP(logical_or, BuildBinaryNode<ngraph::op::Or>);
-REGISTER_NG_OP(logical_not, BuildUnaryNode<ngraph::op::Not>);
diff --git a/paddle/fluid/operators/ngraph/ops/cast_op.h b/paddle/fluid/operators/ngraph/ops/cast_op.h
deleted file mode 100644
index 8e385f61bee10b8d4dfb2fdcc723637a6f3c2a07..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/cast_op.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-static void BuildCastNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "X", ngb_node_map);
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto ng_dtype =
-      platform::GetNgType(static_cast<paddle::framework::proto::VarType::Type>(
-          op_attrs.Get<int>("out_dtype")));
-  auto out = std::make_shared<ngraph::op::Convert>(input, ng_dtype);
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-static void BuildCastGradNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto ng_dtype =
-      platform::GetNgType(static_cast<paddle::framework::proto::VarType::Type>(
-          op_attrs.Get<int>("out_dtype")));
-  auto out = std::make_shared<ngraph::op::Convert>(input, ng_dtype);
-  platform::SetOutputNode(op, "X@GRAD", out, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(cast, BuildCastNode);
-REGISTER_NG_OP(cast_grad, BuildCastGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/concat_op.h b/paddle/fluid/operators/ngraph/ops/concat_op.h
deleted file mode 100644
index f34e161177bdb0d08d9bcdf0afd2a29ce604ff92..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/concat_op.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildConcatNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  std::vector<std::shared_ptr<ngraph::Node>> args;
-  for (auto& var_name_item : op->Inputs()) {
-    for (auto& var_name : var_name_item.second) {
-      auto& node0 = ngb_node_map->at(var_name);
-      args.push_back(node0);
-    }
-  }
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  int axis = op_attrs.Get<int>("axis");
-  if (axis < 0) {
-    axis = axis + args[0]->get_shape().size();
-  }
-  auto out = std::make_shared<ngraph::op::Concat>(args, axis);
-  platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(concat, BuildConcatNode);
diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
deleted file mode 100644
index ab88d870c4762ce3b2a76ca5e32326222f479b55..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/conv2d_op.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-std::shared_ptr<ngraph::Node> GroupedConvolution(
-    const std::shared_ptr<ngraph::Node>& data_batch,
-    const std::shared_ptr<ngraph::Node>& filters, const ngraph::Strides strides,
-    const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings,
-    size_t groups) {
-  auto& data_shape = data_batch->get_shape();
-  auto& filter_shape = filters->get_shape();
-  ngraph::NodeVector ng_slices;
-
-  for (size_t i = 0; i < groups; ++i) {
-    size_t channel_step = filter_shape.at(1);
-    const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0};
-    const std::vector<size_t> upper_bound{data_shape.at(0),
-                                          (i + 1) * channel_step,
-                                          data_shape.at(2), data_shape.at(3)};
-    auto data_slice = std::make_shared<ngraph::op::Slice>(
-        data_batch, lower_bound, upper_bound);
-
-    size_t filter_step = filter_shape.at(0) / groups;
-    const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
-    const std::vector<size_t> filter_upper_bound{
-        (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2),
-        filter_shape.at(3)};
-    auto filter_slice = std::make_shared<ngraph::op::Slice>(
-        filters, filter_lower_bound, filter_upper_bound);
-    auto ng_conv = std::make_shared<ngraph::op::Convolution>(
-        data_slice, filter_slice, strides, dilations, paddings, paddings);
-    ng_slices.push_back(ng_conv);
-  }
-
-  size_t concat_axis = 1;
-  return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis);
-}
-
-std::shared_ptr<ngraph::Node> GroupedGradConvolutionFilter(
-    const std::shared_ptr<ngraph::Node>& data_batch,
-    const std::shared_ptr<ngraph::Node>& filters,
-    const std::shared_ptr<ngraph::Node>& doutput, const ngraph::Strides strides,
-    const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings,
-    size_t groups) {
-  auto& data_shape = data_batch->get_shape();
-  auto& filter_shape = filters->get_shape();
-  auto& out_shape = doutput->get_shape();
-  ngraph::NodeVector ng_slices;
-
-  for (size_t i = 0; i < groups; ++i) {
-    size_t channel_step = filter_shape.at(1);
-    const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0};
-    const std::vector<size_t> upper_bound{data_shape.at(0),
-                                          (i + 1) * channel_step,
-                                          data_shape.at(2), data_shape.at(3)};
-    auto data_slice = std::make_shared<ngraph::op::Slice>(
-        data_batch, lower_bound, upper_bound);
-
-    size_t filter_step = filter_shape.at(0) / groups;
-
-    const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
-    const std::vector<size_t> filter_upper_bound{
-        (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2),
-        filter_shape.at(3)};
-    auto filter_slice = std::make_shared<ngraph::op::Slice>(
-        filters, filter_lower_bound, filter_upper_bound);
-
-    const std::vector<size_t> olower_bound{0, i * filter_step, 0, 0};
-    const std::vector<size_t> oupper_bound{out_shape.at(0),
-                                           (i + 1) * filter_step,
-                                           out_shape.at(2), out_shape.at(3)};
-    auto out_slice = std::make_shared<ngraph::op::Slice>(doutput, olower_bound,
-                                                         oupper_bound);
-
-    auto ng_conv = std::make_shared<ngraph::op::ConvolutionBackpropFilters>(
-        data_slice, filter_slice->get_shape(), out_slice, strides, dilations,
-        paddings, paddings, ngraph::Strides{1, 1});
-
-    ng_slices.push_back(ng_conv);
-  }
-
-  size_t concat_axis = 0;
-  return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis);
-}
-
-std::shared_ptr<ngraph::Node> GroupedGradConvolutionData(
-    const std::shared_ptr<ngraph::Node>& data_batch,
-    const std::shared_ptr<ngraph::Node>& filters,
-    const std::shared_ptr<ngraph::Node>& doutput, const ngraph::Strides strides,
-    const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings,
-    size_t groups) {
-  auto& data_shape = data_batch->get_shape();
-  auto& filter_shape = filters->get_shape();
-  auto& out_shape = doutput->get_shape();
-  ngraph::NodeVector ng_slices;
-
-  for (size_t i = 0; i < groups; ++i) {
-    size_t channel_step = filter_shape.at(1);
-    const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0};
-    const std::vector<size_t> upper_bound{data_shape.at(0),
-                                          (i + 1) * channel_step,
-                                          data_shape.at(2), data_shape.at(3)};
-    auto data_slice = std::make_shared<ngraph::op::Slice>(
-        data_batch, lower_bound, upper_bound);
-
-    size_t filter_step = filter_shape.at(0) / groups;
-
-    const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
-    const std::vector<size_t> filter_upper_bound{
-        (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2),
-        filter_shape.at(3)};
-    auto filter_slice = std::make_shared<ngraph::op::Slice>(
-        filters, filter_lower_bound, filter_upper_bound);
-
-    const std::vector<size_t> olower_bound{0, i * filter_step, 0, 0};
-    const std::vector<size_t> oupper_bound{out_shape.at(0),
-                                           (i + 1) * filter_step,
-                                           out_shape.at(2), out_shape.at(3)};
-    auto out_slice = std::make_shared<ngraph::op::Slice>(doutput, olower_bound,
-                                                         oupper_bound);
-
-    auto ng_conv = std::make_shared<ngraph::op::ConvolutionBackpropData>(
-        data_slice->get_shape(), filter_slice, out_slice, strides, dilations,
-        paddings, paddings, ngraph::Strides{1, 1});
-    ng_slices.push_back(ng_conv);
-  }
-
-  size_t concat_axis = 1;
-  return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis);
-}
-
-void BuildConv2dNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  auto filters = paddle::platform::GetInputNode(op, "Filter", ngb_node_map);
-  auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map);
-
-  std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
-  std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
-  std::vector<int> dilations = op_attrs.Get<std::vector<int>>("dilations");
-
-  const ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
-                                   static_cast<size_t>(strides.at(1))};
-  const ngraph::Strides ng_dilations{static_cast<size_t>(dilations.at(0)),
-                                     static_cast<size_t>(dilations.at(1))};
-  const ngraph::CoordinateDiff ng_paddings{
-      static_cast<std::ptrdiff_t>(paddings.at(0)),
-      static_cast<std::ptrdiff_t>(paddings.at(1))};
-
-  int groups = static_cast<size_t>(op_attrs.Get<int>("groups"));
-  PADDLE_ENFORCE_GE(groups, 1, "conv groups needs be no less than 1");
-
-  std::shared_ptr<ngraph::Node> result;
-  if (groups == 1) {
-    result = std::make_shared<ngraph::op::Convolution>(
-        input, filters, ng_strides, ng_dilations, ng_paddings, ng_paddings);
-  } else {
-    result = GroupedConvolution(input, filters, ng_strides, ng_dilations,
-                                ng_paddings, groups);
-  }
-  paddle::platform::SetOutputNode(op, "Output", result, ngb_node_map);
-}
-
-void BuildConv2dGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  auto filter = paddle::platform::GetInputNode(op, "Filter", ngb_node_map);
-  auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map);
-  auto doutput =
-      paddle::platform::GetInputNode(op, "Output@GRAD", ngb_node_map);
-
-  int groups = op_attrs.Get<int>("groups");
-  std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
-  std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
-  std::vector<int> dilations = op_attrs.Get<std::vector<int>>("dilations");
-
-  const ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
-                                   static_cast<size_t>(strides.at(1))};
-  const ngraph::Strides ng_dilations{static_cast<size_t>(dilations.at(0)),
-                                     static_cast<size_t>(dilations.at(1))};
-  const ngraph::CoordinateDiff ng_paddings{
-      static_cast<std::ptrdiff_t>(paddings.at(0)),
-      static_cast<std::ptrdiff_t>(paddings.at(1))};
-
-  std::shared_ptr<ngraph::Node> dfilter;
-  std::shared_ptr<ngraph::Node> dinput;
-  if (groups == 1) {
-    dfilter = std::make_shared<ngraph::op::ConvolutionBackpropFilters>(
-        input, filter->get_shape(), doutput, ng_strides, ng_dilations,
-        ng_paddings, ng_paddings, ngraph::Strides{1, 1});
-
-    dinput = std::make_shared<ngraph::op::ConvolutionBackpropData>(
-        input->get_shape(), filter, doutput, ng_strides, ng_dilations,
-        ng_paddings, ng_paddings, ngraph::Strides{1, 1});
-
-  } else {
-    dfilter = GroupedGradConvolutionFilter(input, filter, doutput, ng_strides,
-                                           ng_dilations, ng_paddings, groups);
-    dinput = GroupedGradConvolutionData(input, filter, doutput, ng_strides,
-                                        ng_dilations, ng_paddings, groups);
-  }
-
-  paddle::platform::SetOutputNode(op, "Filter@GRAD", dfilter, ngb_node_map);
-  paddle::platform::SetOutputNode(op, "Input@GRAD", dinput, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(conv2d, BuildConv2dNode);
-REGISTER_NG_OP(conv2d_grad, BuildConv2dGradNode);
-REGISTER_NG_OP(depthwise_conv2d, BuildConv2dNode);
diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
deleted file mode 100644
index e06446aca90f6fc02918680253fa72ae03dc2ad4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-std::shared_ptr<ngraph::Node> remove_trailing_one(
-    const std::shared_ptr<ngraph::Node>& input) {
-  auto shape = input->get_shape();
-  if (shape.back() == 1 && shape.size() > 1) {
-    shape.pop_back();
-    return platform::NgReshaper(input, shape);
-  } else {
-    return input;
-  }
-}
-
-std::shared_ptr<ngraph::Node> flatten_node(
-    const std::shared_ptr<ngraph::Node>& input) {
-  auto shape = input->get_shape();
-  auto rank = shape.size();
-  auto output = input;
-  if (rank > 2) {
-    auto shape_2d = paddle::platform::FlattenTo2d(shape, rank - 1);
-    output = paddle::platform::NgReshaper(input, shape_2d);
-  }
-  return output;
-}
-
-std::shared_ptr<ngraph::Node> convert_to_node_type(
-    const std::shared_ptr<ngraph::Node>& input,
-    const std::shared_ptr<ngraph::Node>& ref) {
-  auto output = input;
-  if (input->get_element_type() != ref->get_element_type()) {
-    output =
-        std::make_shared<ngraph::op::Convert>(input, ref->get_element_type());
-  }
-  return output;
-}
-
-std::shared_ptr<ngraph::Node> create_xe(
-    const std::shared_ptr<ngraph::Node>& one_hot,
-    const std::shared_ptr<ngraph::Node>& x) {
-  auto node_log = std::make_shared<ngraph::op::Log>(x);
-
-  auto node_mul = one_hot * node_log;
-  auto node_sum = std::make_shared<ngraph::op::Sum>(
-      node_mul, ngraph::AxisSet{x->get_shape().size() - 1});
-
-  auto shape = x->get_shape();
-  shape.back() = 1;
-  return platform::NgReshaper(-node_sum, shape);
-}
-
-std::shared_ptr<ngraph::Node> create_mask(
-    const std::shared_ptr<ngraph::Node>& label, int ignore_index) {
-  auto ignore_node = paddle::platform::CreateConstant(
-      label->get_element_type(), label->get_shape(), {ignore_index});
-  auto not_equal_node =
-      std::make_shared<ngraph::op::NotEqual>(label, ignore_node);
-  return not_equal_node;
-}
-
-std::shared_ptr<ngraph::Node> create_one_hot(
-    const std::shared_ptr<ngraph::Node>& label,
-    const std::shared_ptr<ngraph::Node>& x) {
-  auto label_shape = label->get_shape();
-  return std::make_shared<ngraph::op::OneHot>(
-      remove_trailing_one(label), x->get_shape(), x->get_shape().size() - 1);
-}
-
-std::shared_ptr<ngraph::Node> GetCrossEntropy(
-    std::shared_ptr<ngraph::Node> x, std::shared_ptr<ngraph::Node> label,
-    const bool is_soft_label, int ignore_index) {
-  std::shared_ptr<ngraph::Node> node_1_hot = label;
-  if (!is_soft_label) {
-    node_1_hot = create_one_hot(label, x);
-  }
-  node_1_hot = convert_to_node_type(node_1_hot, x);
-
-  auto xe = create_xe(node_1_hot, x);
-  if (!is_soft_label) {
-    auto mask = convert_to_node_type(create_mask(label, ignore_index), xe);
-    xe = xe * mask;
-  }
-  return xe;
-}
-
-std::shared_ptr<ngraph::Node> GetCrossEntropyGrad(
-    std::shared_ptr<ngraph::Node> x, std::shared_ptr<ngraph::Node> label,
-    std::shared_ptr<ngraph::Node> dy, const bool is_soft_label,
-    int ignore_index) {
-  auto x_shape = x->get_shape();
-  auto rank = x_shape.size();
-
-  std::shared_ptr<ngraph::Node> mask;
-  if (!is_soft_label) {
-    mask = convert_to_node_type(create_mask(label, ignore_index), x);
-    mask = std::make_shared<ngraph::op::Broadcast>(
-        remove_trailing_one(mask), x_shape, ngraph::AxisSet{rank - 1});
-    label = create_one_hot(label, x);
-  }
-
-  auto dy_reshape = remove_trailing_one(dy);
-  auto dy_bcast = std::make_shared<ngraph::op::Broadcast>(
-      dy_reshape, x_shape, ngraph::AxisSet{rank - 1});
-
-  label = convert_to_node_type(label, x);
-
-  auto xe_grad = -label * dy_bcast / x;
-
-  if (!is_soft_label) {
-    xe_grad = xe_grad * mask;
-  }
-  return xe_grad;
-}
-
-void BuildCrossEntropyNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
-  int ignore_index = op_attrs.Get<int>("ignore_index");
-  auto xe = GetCrossEntropy(x, label, is_soft_label, ignore_index);
-  paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
-}
-
-void BuildCrossEntropyGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
-  int ignore_index = op_attrs.Get<int>("ignore_index");
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
-  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
-  auto xe_grad = GetCrossEntropyGrad(x, label, dy, is_soft_label, ignore_index);
-  paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map);
-}
-
-void BuildCrossEntropy2Node(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  int ignore_index = op_attrs.Get<int>("ignore_index");
-
-  auto rank = x->get_shape().size();
-
-  auto one_hot = convert_to_node_type(create_one_hot(label, x), x);
-  auto xe = create_xe(one_hot, x);
-  auto mask = convert_to_node_type(create_mask(label, ignore_index), xe);
-
-  xe = xe * mask;
-
-  std::shared_ptr<ngraph::Node> node_sum =
-      std::make_shared<ngraph::op::Sum>(one_hot * x, ngraph::AxisSet{rank - 1});
-  node_sum = paddle::platform::NgReshaper(node_sum, mask->get_shape());
-  auto matchx = mask * node_sum;
-
-  paddle::platform::SetOutputNode(op, "MatchX", matchx, ngb_node_map);
-  platform::SetOutputNode(op, "XShape", x, ngb_node_map);
-  paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
-}
-
-void BuildCrossEntropyGrad2Node(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  int ignore_index = op_attrs.Get<int>("ignore_index");
-  auto matchx = paddle::platform::GetInputNode(op, "MatchX", ngb_node_map);
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
-  auto x = paddle::platform::GetInputNode(op, "XShape", ngb_node_map);
-  auto dy = paddle::platform::GetInputNode(op, framework::GradVarName("Y"),
-                                           ngb_node_map);
-
-  matchx = remove_trailing_one(matchx);
-  label = remove_trailing_one(label);
-  x = remove_trailing_one(x);
-  dy = remove_trailing_one(dy);
-
-  auto x_shape = x->get_shape();
-  auto rank = x_shape.size();
-
-  auto one_hot = convert_to_node_type(create_one_hot(label, x), x);
-  auto mask = convert_to_node_type(create_mask(label, ignore_index), x);
-
-  auto zero = paddle::platform::CreateConstant(matchx->get_element_type(),
-                                               matchx->get_shape(), {0});
-  auto one = paddle::platform::CreateConstant(matchx->get_element_type(),
-                                              matchx->get_shape(), {1});
-  auto is_zero = std::make_shared<ngraph::op::Equal>(matchx, zero);
-  matchx = std::make_shared<ngraph::op::Select>(is_zero, one, matchx);
-
-  auto dy_bcast = std::make_shared<ngraph::op::Broadcast>(
-      mask * dy, x_shape, ngraph::AxisSet{rank - 1});
-  auto matchx_bcast = std::make_shared<ngraph::op::Broadcast>(
-      matchx, x_shape, ngraph::AxisSet{rank - 1});
-
-  auto xe_grad = -dy_bcast * one_hot / matchx_bcast;
-  paddle::platform::SetOutputNode(op, framework::GradVarName("X"), xe_grad,
-                                  ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(cross_entropy, BuildCrossEntropyNode);
-REGISTER_NG_OP(cross_entropy_grad, BuildCrossEntropyGradNode);
-REGISTER_NG_OP(cross_entropy2, BuildCrossEntropy2Node);
-REGISTER_NG_OP(cross_entropy_grad2, BuildCrossEntropyGrad2Node);
diff --git a/paddle/fluid/operators/ngraph/ops/dropout_op.h b/paddle/fluid/operators/ngraph/ops/dropout_op.h
deleted file mode 100644
index 3fb55980d76c126c8db88d9f52866e4d667ef6da..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/dropout_op.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "ngraph/op/experimental/generate_mask.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-static void BuildDropoutNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "X", ngb_node_map);
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto dropout_prob = op_attrs.Get<float>("dropout_prob");
-  auto dropout_implementation =
-      op_attrs.Get<std::string>("dropout_implementation");
-  auto is_test = op_attrs.Get<bool>("is_test");
-  auto seed = op_attrs.Get<int>("seed");
-  auto fix_seed = op_attrs.Get<bool>("fix_seed");
-  float value = 1.0f - dropout_prob;
-  bool upscale_in_train = (dropout_implementation == "upscale_in_train");
-
-  if (is_test) {
-    if (upscale_in_train) {
-      platform::SetOutputNode(op, "Out", input, ngb_node_map);
-    } else {
-      auto mask_val = paddle::platform::CreateConstant(
-          input->get_element_type(), input->get_shape(), {value});
-      auto out = input * mask_val;
-      platform::SetOutputNode(op, "Out", out, ngb_node_map);
-    }
-  } else {
-    auto one = paddle::platform::CreateConstant(input->get_element_type(),
-                                                ngraph::Shape{}, {1});
-
-    auto gen_mask = std::make_shared<ngraph::op::GenerateMask>(
-        one, input->get_shape(), input->get_element_type(), seed, value,
-        fix_seed);
-
-    if (upscale_in_train) {
-      auto mask_val = paddle::platform::CreateConstant(
-          input->get_element_type(), input->get_shape(), {value});
-
-      auto out = value ? input * gen_mask / mask_val : input * gen_mask;
-      platform::SetOutputNode(op, "Mask", gen_mask, ngb_node_map);
-      platform::SetOutputNode(op, "Out", out, ngb_node_map);
-    } else {
-      auto out = input * gen_mask;
-      platform::SetOutputNode(op, "Mask", gen_mask, ngb_node_map);
-      platform::SetOutputNode(op, "Out", out, ngb_node_map);
-    }
-  }
-}
-
-static void BuildDropoutGradNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto dy = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto mask = platform::GetInputNode(op, "Mask", ngb_node_map);
-  if (dy->get_element_type() != mask->get_element_type()) {
-    mask = std::make_shared<ngraph::op::Convert>(mask, dy->get_element_type());
-  }
-
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto dropout_prob = op_attrs.Get<float>("dropout_prob");
-  auto dropout_implementation =
-      op_attrs.Get<std::string>("dropout_implementation");
-  auto dx = dy * mask;
-
-  if (dropout_implementation == "upscale_in_train") {
-    if (dropout_prob == 1.0f) {
-      dx = ElementwiseScalar<ngraph::op::Multiply>(0., dy);
-    } else {
-      dx =
-          ElementwiseScalar<ngraph::op::Multiply>(1. / (1. - dropout_prob), dx);
-    }
-  }
-  platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(dropout, BuildDropoutNode);
-REGISTER_NG_OP(dropout_grad, BuildDropoutGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
deleted file mode 100644
index d7485a706a193a52113cb993a3604c444b4303c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildElementwiseAddNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  BuildElementwiseBinaryNode<ngraph::op::Add>(op, ngb_node_map);
-}
-
-void BuildElementwiseAddGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  int axis = op_attrs.Get<int>("axis");
-
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
-  auto dout_shape = dout->get_shape();
-  auto y_shape = y->get_shape();
-
-  if (dout_shape == y_shape) {
-    paddle::platform::SetOutputNode(op, "X@GRAD", dout, ngb_node_map);
-    paddle::platform::SetOutputNode(op, "Y@GRAD", dout, ngb_node_map);
-  } else {
-    axis = (axis == -1 ? dout_shape.size() - y_shape.size() : axis);
-    paddle::platform::TrimTrailingSingularDims(&y_shape);
-    axis = (y_shape.size() == 0 ? dout_shape.size() : axis);
-
-    int pre, n, post;
-    paddle::platform::GetMidDims(dout_shape, y_shape, axis, &pre, &n, &post);
-
-    ngraph::Shape lhs_shape{};
-    lhs_shape.push_back(pre);
-    lhs_shape.push_back(n);
-    if (post != 1) {
-      lhs_shape.push_back(post);
-    }
-
-    std::vector<size_t> lhs_order(dout_shape.size());
-    std::iota(std::begin(lhs_order), std::end(lhs_order), 0);
-    auto dout_reshape = std::make_shared<ngraph::op::Reshape>(
-        dout, ngraph::AxisVector(lhs_order), lhs_shape);
-
-    ngraph::AxisSet axis_set{0};
-    if (post != 1) {
-      axis_set.insert(2);
-    }
-
-    auto dout_sum = std::make_shared<ngraph::op::Sum>(dout_reshape, axis_set);
-    auto dy = std::make_shared<ngraph::op::Reshape>(
-        dout_sum, ngraph::AxisVector{0}, y->get_shape());
-
-    paddle::platform::SetOutputNode(op, "X@GRAD", dout, ngb_node_map);
-    paddle::platform::SetOutputNode(op, "Y@GRAD", dy, ngb_node_map);
-  }
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(elementwise_add, BuildElementwiseAddNode);
-REGISTER_NG_OP(elementwise_add_grad, BuildElementwiseAddGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h b/paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h
deleted file mode 100644
index e4e17f5bb219bdf82db99fce2ea4fe5dbcb6e0c9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-ngraph::NodeVector ElementwiseBinaryNodePrepare(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  int axis = op_attrs.Get<int>("axis");
-  auto lhs = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto rhs = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
-
-  auto lhs_shape = lhs->get_shape();
-  auto rhs_shape = rhs->get_shape();
-
-  PADDLE_ENFORCE_GE(lhs_shape.size(), rhs_shape.size(),
-                    "Rank of first input must >= rank of second input.");
-  if (lhs_shape == rhs_shape) {
-    return ngraph::NodeVector{lhs, rhs};
-  }
-  axis = (rhs_shape.size() == 0) ? lhs_shape.size() - 1 : axis;
-  axis = (axis == -1 ? lhs_shape.size() - rhs_shape.size() : axis);
-  PADDLE_ENFORCE(axis >= 0 && axis < (int)(lhs_shape.size()),
-                 "Axis should be in range [0, lhs_shape)");
-  paddle::platform::TrimTrailingSingularDims(&rhs_shape);
-
-  int pre, n, post;
-  paddle::platform::GetMidDims(lhs_shape, rhs_shape, axis, &pre, &n, &post);
-
-  ngraph::Shape l_shape{};
-  l_shape.push_back(pre);
-  l_shape.push_back(n);
-  l_shape.push_back(post);
-
-  std::vector<size_t> rhs_order(rhs->get_shape().size());
-  std::iota(std::begin(rhs_order), std::end(rhs_order), 0);
-  ngraph::Shape r_shape{};
-  r_shape.push_back(n);
-  auto rhs_reshape = std::make_shared<ngraph::op::Reshape>(
-      rhs, ngraph::AxisVector(rhs_order), r_shape);
-  auto rhs_bcast = std::make_shared<ngraph::op::Broadcast>(
-      rhs_reshape, l_shape, ngraph::AxisSet{0, 2});
-  std::vector<size_t> bcast_order(rhs_bcast->get_shape().size());
-  std::iota(std::begin(bcast_order), std::end(bcast_order), 0);
-  std::shared_ptr<ngraph::Node> rhs_bcast_reshape =
-      std::make_shared<ngraph::op::Reshape>(
-          rhs_bcast, ngraph::AxisVector(bcast_order), lhs_shape);
-  return ngraph::NodeVector{lhs, rhs_bcast_reshape};
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_div_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_div_op.h
deleted file mode 100644
index b4cc2f862ba8cfbb26c21d41f061dfdd10f11903..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/elementwise_div_op.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildElementwiseDivGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  int axis = op_attrs.Get<int>("axis");
-
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
-  auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
-  auto dout_shape = dout->get_shape();
-  auto y_shape = y->get_shape();
-  if (dout->get_element_type() != y->get_element_type()) {
-    y = std::make_shared<ngraph::op::Convert>(y, dout->get_element_type());
-  }
-  auto dy_hd = std::make_shared<ngraph::op::Multiply>(out, dout);
-  if (dout_shape == y_shape) {
-    auto dx = std::make_shared<ngraph::op::Divide>(dout, y);
-    auto dy = std::make_shared<ngraph::op::Divide>(dy_hd, -y);
-    paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-    paddle::platform::SetOutputNode(op, "Y@GRAD", dy, ngb_node_map);
-  } else {
-    auto dy_hd_shape = dy_hd->get_shape();
-    axis = (axis == -1 ? dy_hd_shape.size() - y_shape.size() : axis);
-    paddle::platform::TrimTrailingSingularDims(&y_shape);
-    axis = (y_shape.size() == 0 ? dy_hd_shape.size() : axis);
-    int pre, n, post;
-    paddle::platform::GetMidDims(dy_hd_shape, y_shape, axis, &pre, &n, &post);
-    ngraph::Shape lhs_shape{};
-    lhs_shape.push_back(pre);
-    lhs_shape.push_back(n);
-    if (post != 1) {
-      lhs_shape.push_back(post);
-    }
-
-    std::vector<size_t> dy_order(dout_shape.size());
-    std::iota(std::begin(dy_order), std::end(dy_order), 0);
-    auto dy_hd_reshape = std::make_shared<ngraph::op::Reshape>(
-        dy_hd, ngraph::AxisVector(dy_order), lhs_shape);
-
-    ngraph::AxisSet axis_set{0};
-    if (post != 1) {
-      axis_set.insert(2);
-    }
-
-    auto dy_sum = std::make_shared<ngraph::op::Sum>(dy_hd_reshape, axis_set);
-    auto dy_sum_yshape = std::make_shared<ngraph::op::Reshape>(
-        dy_sum, ngraph::AxisVector{0}, y->get_shape());
-    auto dy_ = std::make_shared<ngraph::op::Divide>(dy_sum_yshape, -y);
-    paddle::platform::SetOutputNode(op, "Y@GRAD", dy_, ngb_node_map);
-
-    y_shape = y->get_shape();
-    std::vector<size_t> y_order(y_shape.size() == 0 ? 1 : y_shape.size());
-    std::iota(std::begin(y_order), std::end(y_order), 0);
-    auto y_reshape = std::make_shared<ngraph::op::Reshape>(
-        y, ngraph::AxisVector(y_order), ngraph::Shape{(size_t)n});
-    auto y_broadcast =
-        std::make_shared<ngraph::op::Broadcast>(y_reshape, lhs_shape, axis_set);
-    std::vector<size_t> lhs_order(lhs_shape.size());
-    std::iota(std::begin(lhs_order), std::end(lhs_order), 0);
-    auto y_broadcast_reshape = std::make_shared<ngraph::op::Reshape>(
-        y_broadcast, ngraph::AxisVector(lhs_order), dout_shape);
-    auto dx = std::make_shared<ngraph::op::Divide>(dout, y_broadcast_reshape);
-
-    paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-  }
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(elementwise_div_grad, BuildElementwiseDivGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_mul_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_mul_op.h
deleted file mode 100644
index c74b103ebf3bc4390500a5f872e8f24860ada6f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/elementwise_mul_op.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildElementwiseMulNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  BuildElementwiseBinaryNode<ngraph::op::Multiply>(op, ngb_node_map);
-}
-
-void BuildElementwiseMulGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  int axis = op_attrs.Get<int>("axis");
-
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto dout_shape = dout->get_shape();
-  auto y_shape = y->get_shape();
-  auto x_shape = x->get_shape();
-  if (dout->get_element_type() != y->get_element_type()) {
-    y = std::make_shared<ngraph::op::Convert>(y, dout->get_element_type());
-  }
-  if (dout_shape == y_shape) {
-    auto dx = std::make_shared<ngraph::op::Multiply>(dout, y);
-    auto dy = std::make_shared<ngraph::op::Multiply>(dout, x);
-    paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-    paddle::platform::SetOutputNode(op, "Y@GRAD", dy, ngb_node_map);
-  } else {
-    auto dy_hd = std::make_shared<ngraph::op::Multiply>(dout, x);
-    auto dy_hd_shape = dy_hd->get_shape();
-    axis = (axis == -1 ? dy_hd_shape.size() - y_shape.size() : axis);
-    paddle::platform::TrimTrailingSingularDims(&y_shape);
-    axis = (y_shape.size() == 0 ? dy_hd_shape.size() : axis);
-    int pre, n, post;
-    paddle::platform::GetMidDims(dy_hd_shape, y_shape, axis, &pre, &n, &post);
-    ngraph::Shape lhs_shape{};
-    lhs_shape.push_back(pre);
-    lhs_shape.push_back(n);
-    if (post != 1) {
-      lhs_shape.push_back(post);
-    }
-
-    std::vector<size_t> dy_order(dout_shape.size());
-    std::iota(std::begin(dy_order), std::end(dy_order), 0);
-    auto dy_hd_reshape = std::make_shared<ngraph::op::Reshape>(
-        dy_hd, ngraph::AxisVector(dy_order), lhs_shape);
-
-    ngraph::AxisSet axis_set{0};
-    if (post != 1) {
-      axis_set.insert(2);
-    }
-
-    auto dy_sum = std::make_shared<ngraph::op::Sum>(dy_hd_reshape, axis_set);
-    auto dy_sum_yshape = std::make_shared<ngraph::op::Reshape>(
-        dy_sum, ngraph::AxisVector{0}, y->get_shape());
-    paddle::platform::SetOutputNode(op, "Y@GRAD", dy_sum_yshape, ngb_node_map);
-
-    y_shape = y->get_shape();
-    std::vector<size_t> y_order(y_shape.size() == 0 ? 1 : y_shape.size());
-    std::iota(std::begin(y_order), std::end(y_order), 0);
-    auto y_reshape = std::make_shared<ngraph::op::Reshape>(
-        y, ngraph::AxisVector(y_order), ngraph::Shape{(size_t)n});
-    auto y_broadcast =
-        std::make_shared<ngraph::op::Broadcast>(y_reshape, lhs_shape, axis_set);
-    std::vector<size_t> lhs_order(lhs_shape.size());
-    std::iota(std::begin(lhs_order), std::end(lhs_order), 0);
-    auto y_broadcast_reshape = std::make_shared<ngraph::op::Reshape>(
-        y_broadcast, ngraph::AxisVector(lhs_order), dout_shape);
-    auto dx = std::make_shared<ngraph::op::Multiply>(y_broadcast_reshape, dout);
-    paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-  }
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(elementwise_mul, BuildElementwiseMulNode);
-REGISTER_NG_OP(elementwise_mul_grad, BuildElementwiseMulGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_node.h b/paddle/fluid/operators/ngraph/ops/elementwise_node.h
deleted file mode 100644
index 2b10af4588c350e8581e304cdfdd075f56be53fd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/elementwise_node.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-template <typename T>
-void BuildElementwiseBinaryNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto nodes = ElementwiseBinaryNodePrepare(op, ngb_node_map);
-  std::shared_ptr<ngraph::Node>& x = nodes.at(0);
-  std::shared_ptr<ngraph::Node>& y = nodes.at(1);
-
-  if (x->get_element_type() != y->get_element_type()) {
-    y = std::make_shared<ngraph::op::Convert>(y, x->get_element_type());
-  }
-  auto out = std::make_shared<T>(x, y);
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-template <typename T>
-void BuildElementwiseCompareNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto nodes = ElementwiseBinaryNodePrepare(op, ngb_node_map);
-  std::shared_ptr<ngraph::Node>& x = nodes.at(0);
-  std::shared_ptr<ngraph::Node>& y = nodes.at(1);
-
-  if (x->get_element_type() != y->get_element_type()) {
-    x = std::make_shared<ngraph::op::Convert>(x, ngraph::element::f64);
-    y = std::make_shared<ngraph::op::Convert>(y, ngraph::element::f64);
-  }
-  auto out = std::make_shared<T>(x, y);
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(elementwise_max,
-               BuildElementwiseBinaryNode<ngraph::op::Maximum>);
-REGISTER_NG_OP(elementwise_pow, BuildElementwiseBinaryNode<ngraph::op::Power>);
-REGISTER_NG_OP(elementwise_sub,
-               BuildElementwiseBinaryNode<ngraph::op::Subtract>);
-REGISTER_NG_OP(elementwise_min,
-               BuildElementwiseBinaryNode<ngraph::op::Minimum>);
-REGISTER_NG_OP(less_than, BuildElementwiseCompareNode<ngraph::op::Less>);
-REGISTER_NG_OP(elementwise_div, BuildElementwiseBinaryNode<ngraph::op::Divide>);
diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
deleted file mode 100644
index 8f5092963c8b79501ea68c1f521c4678977635ea..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-template <typename T>
-std::shared_ptr<ngraph::Node> ElementwiseScalar(
-    float scale, std::shared_ptr<ngraph::Node> node) {
-  auto node_shape = node->get_shape();
-  auto scale_const = ngraph::op::Constant::create(node->get_element_type(),
-                                                  node_shape, {scale});
-  return std::make_shared<T>(scale_const, node);
-}
-
-template <typename T>
-std::shared_ptr<ngraph::Node> ElementwiseScalar(
-    std::shared_ptr<ngraph::Node> scale_1d,
-    std::shared_ptr<ngraph::Node> node) {
-  auto scale_shape = scale_1d->get_shape();
-  PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node");
-  PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d in in shape {1}");
-
-  auto node_shape = node->get_shape();
-  ngraph::AxisSet axis_set;
-  for (size_t i = 0; i < node_shape.size(); ++i) {
-    axis_set.insert(i);
-  }
-  node_shape.push_back(1);
-
-  auto scale_bcast =
-      std::make_shared<ngraph::op::Broadcast>(scale_1d, node_shape, axis_set);
-
-  auto scale_reshape =
-      paddle::platform::NgReshaper(scale_bcast, node->get_shape());
-
-  return std::make_shared<T>(scale_reshape, node);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
deleted file mode 100644
index fee5f57e4862a8a033a28885a01a0dafea35f7f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildFillConstantNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  auto vsp = op_attrs.Get<std::vector<int64_t>>("shape");
-  ngraph::Shape shape;
-  for (auto& sp : vsp) {
-    shape.push_back(sp);
-  }
-  float value = op_attrs.Get<float>("value");
-  auto ng_dtype =
-      platform::GetNgType(static_cast<paddle::framework::proto::VarType::Type>(
-          op_attrs.Get<int>("dtype")));
-  auto out = ngraph::op::Constant::create(ng_dtype, shape, {value});
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(fill_constant, BuildFillConstantNode);
diff --git a/paddle/fluid/operators/ngraph/ops/fill_zeros_like_op.h b/paddle/fluid/operators/ngraph/ops/fill_zeros_like_op.h
deleted file mode 100644
index 163734be877737dbf5633b2a45763bfb62f887da..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/fill_zeros_like_op.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-static void BuildFillZerosLikeNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = platform::GetInputNode(op, "X", ngb_node_map);
-  auto out = paddle::platform::CreateConstant(x->get_element_type(),
-                                              x->get_shape(), {0});
-  platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(fill_zeros_like, BuildFillZerosLikeNode);
diff --git a/paddle/fluid/operators/ngraph/ops/gather_op.h b/paddle/fluid/operators/ngraph/ops/gather_op.h
deleted file mode 100644
index 7d369b27d3dd2873f7d73bafd5eb08187736959f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/gather_op.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildGatherNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = platform::GetInputNode(op, "X", ngb_node_map);
-  PADDLE_ENFORCE_NOT_NULL(x);
-
-  auto index = platform::GetInputNode(op, "Index", ngb_node_map);
-  auto& index_shape = index->get_shape();
-  PADDLE_ENFORCE(index_shape.size() == 1 ||
-                 (index_shape.size() == 2 && index_shape[1] == 1));
-  if (index_shape.size() == 2) {
-    index = platform::NgReshaper(index, ngraph::Shape{index_shape[0]});
-  }
-
-  auto out = std::make_shared<ngraph::op::Gather>(x, index);
-
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-void BuildGatherGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  PADDLE_ENFORCE_NOT_NULL(dout);
-  auto x = platform::GetInputNode(op, "X", ngb_node_map);
-
-  auto index = platform::GetInputNode(op, "Index", ngb_node_map);
-  auto& index_shape = index->get_shape();
-  PADDLE_ENFORCE(index_shape.size() == 1 ||
-                 (index_shape.size() == 2 && index_shape[1] == 1));
-  if (index_shape.size() == 2) {
-    index = platform::NgReshaper(index, ngraph::Shape{index_shape[0]});
-  }
-
-  std::shared_ptr<ngraph::Node> x0 = paddle::platform::CreateConstant(
-      dout->get_element_type(), x->get_shape(), {0});
-  auto dx = std::make_shared<ngraph::op::ScatterAdd>(x0, index, dout);
-  paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(gather, BuildGatherNode);
-REGISTER_NG_OP(gather_grad, BuildGatherGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/increment_op.h b/paddle/fluid/operators/ngraph/ops/increment_op.h
deleted file mode 100644
index 4c4287e274be09c425c0b991f42b7f8d5ac56691..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/increment_op.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildIncrementNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  float step = op_attrs.Get<float>("step");
-  auto step_op = std::make_shared<ngraph::op::Constant>(
-      x->get_element_type(), x->get_shape(), std::vector<float>{step});
-  std::shared_ptr<ngraph::Node> out =
-      std::make_shared<ngraph::op::Add>(x, step_op);
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(increment, BuildIncrementNode);
diff --git a/paddle/fluid/operators/ngraph/ops/layer_norm_op.h b/paddle/fluid/operators/ngraph/ops/layer_norm_op.h
deleted file mode 100644
index f56110f969747553ee10e43d91cf4cc5107fadab..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/layer_norm_op.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-std::shared_ptr<ngraph::Node> reshape_reduction(
-    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
-    int begin_norm_axis) {
-  ngraph::Shape keepdims_shape(shape.begin(), shape.begin() + begin_norm_axis);
-  return paddle::platform::NgReshaper(node, keepdims_shape);
-}
-
-std::shared_ptr<ngraph::Node> broadcast_reduction(
-    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
-    int begin_norm_axis) {
-  ngraph::AxisSet axis_set;
-  for (size_t i = begin_norm_axis; i < shape.size(); ++i) axis_set.insert(i);
-  auto reshape = reshape_reduction(node, shape, begin_norm_axis);
-  return std::make_shared<ngraph::op::Broadcast>(reshape, shape, axis_set);
-}
-
-std::shared_ptr<ngraph::Node> reshape_bias_scale(
-    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
-    int begin_norm_axis) {
-  ngraph::Shape keepdims_shape(shape.begin() + begin_norm_axis, shape.end());
-  return paddle::platform::NgReshaper(node, keepdims_shape);
-}
-
-std::shared_ptr<ngraph::Node> broadcast_bias_scale(
-    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
-    int begin_norm_axis) {
-  auto reshape = reshape_bias_scale(node, shape, begin_norm_axis);
-  ngraph::AxisSet axis_set;
-  for (int i = 0; i < begin_norm_axis; ++i) axis_set.insert(i);
-  return std::make_shared<ngraph::op::Broadcast>(reshape, shape, axis_set);
-}
-
-std::shared_ptr<ngraph::Node> flatten(const std::shared_ptr<ngraph::Node>& node,
-                                      bool insert_leading_one = false) {
-  size_t out = 1;
-  for (auto s : node->get_shape()) out *= s;
-  if (insert_leading_one) {
-    return paddle::platform::NgReshaper(node, ngraph::Shape{1, out});
-  } else {
-    return paddle::platform::NgReshaper(node, ngraph::Shape{out});
-  }
-}
-
-static void BuildLayerNormNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const auto begin_norm_axis = op_attrs.Get<int>("begin_norm_axis");
-  const auto epsilon = op_attrs.Get<float>("epsilon");
-
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
-  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
-
-  auto shape = x->get_shape();
-  std::vector<size_t> reduction_axes(shape.size() - begin_norm_axis);
-  std::iota(reduction_axes.begin(), reduction_axes.end(), begin_norm_axis);
-
-  auto mean = ngraph::builder::mean(x, reduction_axes);
-  auto broadcast_mean = broadcast_reduction(mean, shape, begin_norm_axis);
-
-  auto delta = x - broadcast_mean;
-  auto variance = ngraph::builder::mean(delta * delta, reduction_axes);
-
-  auto eps = paddle::platform::CreateConstant(variance->get_element_type(),
-                                              variance->get_shape(), {epsilon});
-
-  auto stddev = std::make_shared<ngraph::op::Sqrt>(variance + eps);
-  auto broadcast_stddev = broadcast_reduction(stddev, shape, begin_norm_axis);
-
-  auto norm = delta / broadcast_stddev;
-
-  if (scale) {
-    auto broadcast_scale = broadcast_bias_scale(scale, shape, begin_norm_axis);
-    norm = norm * broadcast_scale;
-  }
-  if (bias) {
-    auto broadcast_bias = broadcast_bias_scale(bias, shape, begin_norm_axis);
-    norm = norm + broadcast_bias;
-  }
-  mean = flatten(mean);
-  variance = flatten(variance);
-  paddle::platform::SetOutputNode(op, "Y", norm, ngb_node_map);
-  paddle::platform::SetOutputNode(op, "Mean", mean, ngb_node_map);
-  paddle::platform::SetOutputNode(op, "Variance", variance, ngb_node_map);
-}
-
-static void BuildLayerNormGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const auto begin_norm_axis = op_attrs.Get<int>("begin_norm_axis");
-  const auto epsilon = op_attrs.Get<float>("epsilon");
-
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto mean = paddle::platform::GetInputNode(op, "Mean", ngb_node_map);
-  auto variance = paddle::platform::GetInputNode(op, "Variance", ngb_node_map);
-  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
-  auto dy = paddle::platform::GetInputNode(op, framework::GradVarName("Y"),
-                                           ngb_node_map);
-
-  auto dx = paddle::platform::GetOutputNode(op, framework::GradVarName("X"),
-                                            ngb_node_map);
-  auto dscale = paddle::platform::GetOutputNode(
-      op, framework::GradVarName("Scale"), ngb_node_map);
-  auto dbias = paddle::platform::GetOutputNode(
-      op, framework::GradVarName("Bias"), ngb_node_map);
-
-  auto shape = x->get_shape();
-
-  auto broadcast_mean = broadcast_reduction(mean, shape, begin_norm_axis);
-
-  auto delta = x - broadcast_mean;
-  auto eps = paddle::platform::CreateConstant(variance->get_element_type(),
-                                              variance->get_shape(), {epsilon});
-
-  auto stddev = std::make_shared<ngraph::op::Sqrt>(variance + eps);
-  auto broadcast_stddev = broadcast_reduction(stddev, shape, begin_norm_axis);
-
-  auto norm = delta / broadcast_stddev;
-
-  if (dbias) {
-    std::vector<size_t> reduction_axes(begin_norm_axis);
-    std::iota(reduction_axes.begin(), reduction_axes.end(), 0);
-    auto sum_dy = std::make_shared<ngraph::op::Sum>(dy, reduction_axes);
-    paddle::platform::SetOutputNode(op, framework::GradVarName("Bias"),
-                                    flatten(sum_dy), ngb_node_map);
-  }
-  if (dscale) {
-    std::vector<size_t> reduction_axes(begin_norm_axis);
-    std::iota(reduction_axes.begin(), reduction_axes.end(), 0);
-    auto sum_dy = std::make_shared<ngraph::op::Sum>(dy * norm, reduction_axes);
-    paddle::platform::SetOutputNode(op, framework::GradVarName("Scale"),
-                                    flatten(sum_dy), ngb_node_map);
-  }
-
-  if (dx) {
-    std::shared_ptr<ngraph::Node> dx_end = dy / broadcast_stddev;
-    if (dscale)
-      dx_end = dx_end * broadcast_bias_scale(scale, shape, begin_norm_axis);
-
-    std::vector<size_t> reduction_axes(shape.size() - begin_norm_axis);
-    std::iota(reduction_axes.begin(), reduction_axes.end(), begin_norm_axis);
-
-    auto dx_mean = broadcast_reduction(
-        ngraph::builder::mean(-dx_end, reduction_axes), shape, begin_norm_axis);
-
-    auto dx_std =
-        norm * broadcast_reduction(
-                   ngraph::builder::mean(-dx_end * norm, reduction_axes), shape,
-                   begin_norm_axis);
-
-    paddle::platform::SetOutputNode(op, framework::GradVarName("X"),
-                                    dx_end + dx_mean + dx_std, ngb_node_map);
-  }
-}
-
-REGISTER_NG_OP(layer_norm, BuildLayerNormNode);
-REGISTER_NG_OP(layer_norm_grad, BuildLayerNormGradNode);
-
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/lookup_table_op.h b/paddle/fluid/operators/ngraph/ops/lookup_table_op.h
deleted file mode 100644
index 45bb31599b017b8de07c521dc8494c91e4b0edd9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/lookup_table_op.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "ngraph/op/embedding_lookup.hpp"
-#include "paddle/fluid/operators/lookup_table_op.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildLookupTableNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const bool is_sparse = op_attrs.Get<bool>("is_sparse");
-  const int64_t padding_idx = op_attrs.Get<int64_t>("padding_idx");
-
-  auto ng_ids = paddle::platform::GetInputNode(op, "Ids", ngb_node_map);
-  PADDLE_ENFORCE_NOT_NULL(ng_ids);
-
-  const auto ng_w = paddle::platform::GetInputNode(op, "W", ngb_node_map);
-  PADDLE_ENFORCE_NOT_NULL(ng_w);
-
-  if (is_sparse) {
-    PADDLE_THROW("Sparsity is not yet supported in nGraph lookup_table op.");
-  }
-  auto ng_w_mask = ng_w;
-  if (padding_idx != kNoPadding) {
-    auto w_shape = ng_w->get_shape();
-
-    std::vector<int> maskV(w_shape[0], 1);
-    maskV[padding_idx] = 0;
-    auto maskV_node = std::make_shared<ngraph::op::Constant>(
-        ng_w->get_element_type(), ngraph::Shape{w_shape[0]}, maskV);
-    ngraph::AxisSet axis_set;
-    for (unsigned int i = 1; i < w_shape.size(); ++i) axis_set.insert(i);
-    auto maskV_bd =
-        std::make_shared<ngraph::op::Broadcast>(maskV_node, w_shape, axis_set);
-    ng_w_mask = std::make_shared<ngraph::op::Multiply>(ng_w, maskV_bd);
-  }
-  auto shape = ng_ids->get_shape();
-  if (shape.back() == 1) {
-    shape.pop_back();
-    ng_ids = platform::NgReshaper(ng_ids, shape);
-  }
-
-  auto ng_lookup = std::make_shared<ngraph::op::Gather>(ng_w_mask, ng_ids);
-  platform::SetOutputNode(op, "Out", ng_lookup, ngb_node_map);
-}
-
-void BuildLookupTableGradNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  const bool is_sparse = op_attrs.Get<bool>("is_sparse");
-  auto ng_ids = paddle::platform::GetInputNode(op, "Ids", ngb_node_map);
-  PADDLE_ENFORCE_NOT_NULL(ng_ids);
-
-  const auto ng_w = paddle::platform::GetInputNode(op, "W", ngb_node_map);
-  PADDLE_ENFORCE_NOT_NULL(ng_w);
-
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-
-  if (is_sparse) {
-    PADDLE_THROW("Sparsity is not yet supported in nGraph lookup_table op.");
-  }
-
-  auto shape = ng_ids->get_shape();
-  if (shape.back() == 1) {
-    shape.pop_back();
-    ng_ids = platform::NgReshaper(ng_ids, shape);
-  }
-
-  std::shared_ptr<ngraph::Node> W0 = paddle::platform::CreateConstant(
-      dout->get_element_type(), ng_w->get_shape(), {0});
-  auto dW = std::make_shared<ngraph::op::ScatterAdd>(W0, ng_ids, dout);
-  platform::SetOutputNode(op, "W@GRAD", dW, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(lookup_table, BuildLookupTableNode);
-REGISTER_NG_OP(lookup_table_grad, BuildLookupTableGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/lrn_op.h b/paddle/fluid/operators/ngraph/ops/lrn_op.h
deleted file mode 100644
index 68a0eea08928ae6bfdae50bef9a3f5c2fddde9c8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/lrn_op.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-static void BuildLrnNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "X", ngb_node_map);
-
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  const int n = op_attrs.Get<int>("n");
-  const float alpha = op_attrs.Get<float>("alpha") * static_cast<float>(n);
-  const float beta = op_attrs.Get<float>("beta");
-  const float k = op_attrs.Get<float>("k");
-
-  auto lrn_out = std::make_shared<ngraph::op::LRN>(input, alpha, beta, k, n);
-  std::shared_ptr<ngraph::Node> mid_out = paddle::platform::CreateConstant(
-      input->get_element_type(), input->get_shape(), {k});
-
-  platform::SetOutputNode(op, "MidOut", mid_out, ngb_node_map);
-  platform::SetOutputNode(op, "Out", lrn_out, ngb_node_map);
-}
-
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(lrn, BuildLrnNode);
diff --git a/paddle/fluid/operators/ngraph/ops/matmul_op.h b/paddle/fluid/operators/ngraph/ops/matmul_op.h
deleted file mode 100644
index fe239afa0bef4a01dfdf73d11629e21e71f6582f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/matmul_op.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-std::shared_ptr<ngraph::Node> transposeAndFlat3D(
-    const std::shared_ptr<ngraph::Node>& input, const bool transpose,
-    bool x = true) {
-  auto shape = input->get_shape();
-  size_t n = shape.size();
-  std::shared_ptr<ngraph::Node> output;
-  if (n >= 3) {
-    std::vector<size_t> order(n);
-    std::iota(std::begin(order), std::end(order), 0);
-    size_t outer = 1;
-    for (size_t i = 0; i < n - 2; i++) {
-      outer = outer * shape[i];
-    }
-    std::vector<size_t> reshape{outer, shape[n - 2], shape[n - 1]};
-
-    if (transpose == true) {
-      order[n - 2] = n - 1;
-      order[n - 1] = n - 2;
-      reshape[2] = shape[n - 2];
-      reshape[1] = shape[n - 1];
-    }
-    output = std::make_shared<ngraph::op::Reshape>(
-        input, ngraph::AxisVector(order), ngraph::Shape(reshape));
-  } else {
-    std::shared_ptr<ngraph::Node> temp;
-    if (n == 1 && x == true) {
-      temp = std::make_shared<ngraph::op::Reshape>(input, ngraph::AxisVector{0},
-                                                   ngraph::Shape{1, shape[0]});
-    } else if (n == 1 && x == false) {
-      temp = std::make_shared<ngraph::op::Reshape>(input, ngraph::AxisVector{0},
-                                                   ngraph::Shape{shape[0], 1});
-    } else {
-      temp = input;
-    }
-    auto temp_shape = temp->get_shape();
-    if (transpose == true) {
-      output = std::make_shared<ngraph::op::Reshape>(
-          temp, ngraph::AxisVector{1, 0},
-          ngraph::Shape{temp_shape[1], temp_shape[0]});
-    } else {
-      output = temp;
-    }
-  }
-  return output;
-}
-std::shared_ptr<ngraph::Node> broadcast3D(
-    const std::shared_ptr<ngraph::Node>& input, size_t axis0) {
-  auto shape = input->get_shape();
-  size_t n = shape.size();
-  if (n == 2) {
-    auto output = std::make_shared<ngraph::op::Broadcast>(
-        input, ngraph::Shape{axis0, shape[0], shape[1]}, ngraph::AxisSet{0});
-    return output;
-  }
-  return input;
-}
-std::shared_ptr<ngraph::Node> dotOp(const std::shared_ptr<ngraph::Node>& a,
-                                    const std::shared_ptr<ngraph::Node>& b) {
-  std::shared_ptr<ngraph::Node> out;
-  auto a_shape = a->get_shape();
-  auto na = a_shape.size();
-  auto b_shape = b->get_shape();
-  auto nb = b_shape.size();
-  if (na > 2 && nb > 2) {
-    out = std::make_shared<ngraph::op::BatchMatMul>(a, b);
-  } else {
-    out = std::make_shared<ngraph::op::Dot>(a, b);
-  }
-  return out;
-}
-std::shared_ptr<ngraph::Node> reshapeToOriginal(
-    std::shared_ptr<ngraph::Node> input, const ngraph::Shape& shape) {
-  auto input_shape = input->get_shape();
-  std::vector<size_t> axis(input_shape.size());
-  std::iota(axis.begin(), axis.end(), 0);
-  auto out = std::make_shared<ngraph::op::Reshape>(input, axis, shape);
-  return out;
-}
-void BuildMatMulNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  bool transpose_x = op_attrs.Get<bool>("transpose_X");
-  bool transpose_y = op_attrs.Get<bool>("transpose_Y");
-  float alpha = op_attrs.Get<float>("alpha");
-
-  std::shared_ptr<ngraph::Node> out;
-  auto x_shape = x->get_shape();
-  auto y_shape = y->get_shape();
-  size_t nx = x_shape.size();
-  size_t ny = y_shape.size();
-  x = transposeAndFlat3D(x, transpose_x, true);
-  y = transposeAndFlat3D(y, transpose_y, false);
-  auto y_shape3 = y->get_shape();
-  auto x_shape3 = x->get_shape();
-  if (nx > 2 || ny > 2) {
-    ngraph::Shape out_shape = x_shape;
-    if (nx != 3) {
-      x = broadcast3D(x, y_shape3[0]);
-      out_shape = y_shape;
-    }
-    if (ny != 3) {
-      y = broadcast3D(y, x_shape3[0]);
-      out_shape = x_shape;
-    }
-    auto nout = out_shape.size();
-    auto out3 = std::make_shared<ngraph::op::BatchMatMul>(x, y);
-    auto out3_shape = out3->get_shape();
-    out_shape[nout - 1] = out3_shape[2];
-    out_shape[nout - 2] = out3_shape[1];
-    out = std::make_shared<ngraph::op::Reshape>(
-        out3, ngraph::AxisVector{0, 1, 2}, out_shape);
-  } else {
-    out = std::make_shared<ngraph::op::Dot>(x, y);
-  }
-  auto out_shape = out->get_shape();
-  std::vector<size_t> axis(out_shape.size());
-  std::iota(axis.begin(), axis.end(), 0);
-  for (size_t i = out_shape.size() - 1; i > 0; i--) {
-    if (out_shape[i] == 1) {
-      out_shape.erase(out_shape.begin() + i);
-    }
-  }
-  auto out_ = std::make_shared<ngraph::op::Reshape>(
-      out, ngraph::AxisVector(axis), out_shape);
-  auto out_alpha = ElementwiseScalar<ngraph::op::Multiply>(alpha, out_);
-  paddle::platform::SetOutputNode(op, "Out", out_alpha, ngb_node_map);
-}
-
-void BuildMatMulGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-
-  bool is_dx = paddle::platform::HasOutput(op, "X@GRAD") ? true : false;
-  bool is_dy = paddle::platform::HasOutput(op, "Y@GRAD") ? true : false;
-  bool transpose_x = op_attrs.Get<bool>("transpose_X");
-  bool transpose_y = op_attrs.Get<bool>("transpose_Y");
-  float alpha = op_attrs.Get<float>("alpha");
-  auto dout_shape = dout->get_shape();
-  auto x_shape = x->get_shape();
-  auto y_shape = y->get_shape();
-  size_t nx = x_shape.size();
-  size_t ny = y_shape.size();
-  size_t ndout = dout_shape.size();
-  std::shared_ptr<ngraph::Node> x2, y2;
-  std::shared_ptr<ngraph::Node> dout2;
-
-  x2 = transposeAndFlat3D(x, false);
-  y2 = transposeAndFlat3D(y, false, false);
-  dout2 = transposeAndFlat3D(dout, false);
-  auto x2_shape = x2->get_shape();
-  auto y2_shape = y2->get_shape();
-  if (nx >= 3 || ny >= 3) {
-    std::shared_ptr<ngraph::Node> dout_temp;
-    if (ndout == 2) {
-      dout_temp = std::make_shared<ngraph::op::Reshape>(
-          dout, ngraph::AxisVector{0, 1},
-          ngraph::Shape{dout_shape[0], dout_shape[1], 1});
-      if (ny < 3) {
-        dout2 = dout_temp;
-      } else {
-        dout2 = transposeAndFlat3D(dout_temp, true);
-      }
-    }
-    x2 = broadcast3D(x2, y_shape[0]);
-    y2 = broadcast3D(y2, x_shape[0]);
-
-  } else {
-    dout2 = transposeAndFlat3D(dout, false, nx == 1 && transpose_x == false);
-  }
-
-  if (transpose_y == false) {
-    y2 = transposeAndFlat3D(y2, true);
-  }
-  if (transpose_x == false) {
-    x2 = transposeAndFlat3D(x2, true);
-  }
-  auto dx = dotOp(dout2, y2);
-  auto dy = dotOp(x2, dout2);
-  if (transpose_x == true) {
-    dx = transposeAndFlat3D(dx, true);
-  }
-  if (transpose_y == true) {
-    dy = transposeAndFlat3D(dy, true);
-  }
-
-  if (nx < 3 && ny >= 3) {
-    dx = std::make_shared<ngraph::op::Sum>(dx, ngraph::AxisSet{0});
-  }
-  if (ny < 3 && nx >= 3) {
-    dy = std::make_shared<ngraph::op::Sum>(dy, ngraph::AxisSet{0});
-  }
-  auto dx_t = reshapeToOriginal(dx, x_shape);
-  auto dy_t = reshapeToOriginal(dy, y_shape);
-  auto dx_scale = ElementwiseScalar<ngraph::op::Multiply>(1 / alpha, dx_t);
-  auto dy_scale = ElementwiseScalar<ngraph::op::Multiply>(1 / alpha, dy_t);
-  if (is_dx)
-    paddle::platform::SetOutputNode(op, "X@GRAD", dx_scale, ngb_node_map);
-  if (is_dy)
-    paddle::platform::SetOutputNode(op, "Y@GRAD", dy_scale, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(matmul, BuildMatMulNode);
-REGISTER_NG_OP(matmul_grad, BuildMatMulGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h
deleted file mode 100644
index 86e697d260eb0f26428258b5faea958a7319948c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/mean_op.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildMeanNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  ngraph::AxisSet axes;
-  for (size_t i = 0; i < input->get_shape().size(); ++i) {
-    axes.insert(i);
-  }
-
-  auto mean = ngraph::builder::mean(input, axes);
-  auto mean_1d = std::make_shared<ngraph::op::Reshape>(
-      mean, ngraph::AxisVector{}, ngraph::Shape{1});
-  paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map);
-}
-
-void BuildMeanGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto x_shape = x->get_shape();
-  float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1,
-                                 std::multiplies<float>());
-  auto node_const = ngraph::op::Constant::create(og->get_element_type(),
-                                                 ngraph::Shape{1}, {x_size});
-  auto node_div = std::make_shared<ngraph::op::Divide>(og, node_const);
-
-  auto result = ElementwiseScalar<ngraph::op::Add>(
-      og / node_const,
-      ngraph::op::Constant::create(og->get_element_type(), x_shape, {0}));
-  paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(mean, BuildMeanNode);
-REGISTER_NG_OP(mean_grad, BuildMeanGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/momentum_op.h b/paddle/fluid/operators/ngraph/ops/momentum_op.h
deleted file mode 100644
index 84bddacba89d2921bca4915af7f64dcfbfdd42db..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/momentum_op.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildMomentumNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  auto param = paddle::platform::GetInputNode(op, "Param", ngb_node_map);
-  auto grad = paddle::platform::GetInputNode(op, "Grad", ngb_node_map);
-  auto velocity = paddle::platform::GetInputNode(op, "Velocity", ngb_node_map);
-  auto learning_rate =
-      paddle::platform::GetInputNode(op, "LearningRate", ngb_node_map);
-
-  auto mu = op_attrs.Get<float>("mu");
-  bool use_nesterov = op_attrs.Get<bool>("use_nesterov");
-
-  auto param_shape = param->get_shape();
-  auto velocity_shape = velocity->get_shape();
-  auto grad_shape = grad->get_shape();
-  auto lr_shape = learning_rate->get_shape();
-
-  auto shape_velocity = ngraph::Shape{velocity_shape};
-  auto mu_create =
-      ngraph::op::Constant::create(ngraph::element::f32, shape_velocity, {mu});
-
-  auto vel_mul = std::make_shared<ngraph::op::Multiply>(velocity, mu_create);
-  auto vel_out = std::make_shared<ngraph::op::Add>(vel_mul, grad);
-
-  ngraph::NodeVector result;
-  if (use_nesterov) {
-    auto mul_res = std::make_shared<ngraph::op::Multiply>(vel_out, mu_create);
-    auto add_res = std::make_shared<ngraph::op::Add>(grad, mul_res);
-
-    auto add_2d = paddle::platform::FlattenTo2d(add_res->get_shape(), 0);
-    auto vel_reshape = paddle::platform::NgReshaper(vel_out, add_2d);
-
-    auto lr_bcast = std::make_shared<ngraph::op::Broadcast>(
-        learning_rate, vel_reshape->get_shape(),
-        ngraph::AxisSet{vel_reshape->get_shape().size() - 1});
-
-    auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0);
-    auto lr_reshape = std::make_shared<ngraph::op::Reshape>(
-        lr_bcast, ngraph::AxisVector{0, 1}, lr_1d);
-
-    lr_reshape = std::make_shared<ngraph::op::Reshape>(
-        lr_reshape, ngraph::AxisVector{0}, param->get_shape());
-
-    auto mul_res1 = std::make_shared<ngraph::op::Multiply>(add_res, lr_reshape);
-    auto res = std::make_shared<ngraph::op::Subtract>(param, mul_res1);
-    paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
-  } else {
-    auto vel_2d = paddle::platform::FlattenTo2d(vel_out->get_shape(), 0);
-    auto vel_reshape = paddle::platform::NgReshaper(vel_out, vel_2d);
-
-    auto lr_bcast = std::make_shared<ngraph::op::Broadcast>(
-        learning_rate, vel_reshape->get_shape(),
-        ngraph::AxisSet{vel_reshape->get_shape().size() - 1});
-
-    auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0);
-    auto lr_reshape = std::make_shared<ngraph::op::Reshape>(
-        lr_bcast, ngraph::AxisVector{0, 1}, lr_1d);
-
-    lr_reshape = std::make_shared<ngraph::op::Reshape>(
-        lr_reshape, ngraph::AxisVector{0}, param->get_shape());
-
-    auto mul_result =
-        std::make_shared<ngraph::op::Multiply>(lr_reshape, vel_out);
-
-    auto res = std::make_shared<ngraph::op::Subtract>(param, mul_result);
-    paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
-  }
-  paddle::platform::SetOutputNode(op, "VelocityOut", vel_out, ngb_node_map);
-}
-
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(momentum, BuildMomentumNode);
diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h
deleted file mode 100644
index cb46478ee8ad4f4c51a6ff9d6f5de4e66f6a505f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/mul_op.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-static void BuildMulNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
-  int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
-  int y_rank = y->get_shape().size();
-
-  auto x_reshape = x;
-  auto y_reshape = y;
-
-  if (x->get_shape().size() > 2) {
-    auto x_2d = paddle::platform::FlattenTo2d(x->get_shape(), x_num_col_dims);
-    x_reshape = paddle::platform::NgReshaper(x, x_2d);
-  }
-
-  if (y->get_shape().size() > 2) {
-    auto y_2d = paddle::platform::FlattenTo2d(y->get_shape(), y_num_col_dims);
-    y_reshape = paddle::platform::NgReshaper(y, y_2d);
-  }
-
-  std::shared_ptr<ngraph::Node> out =
-      std::make_shared<ngraph::op::Dot>(x_reshape, y_reshape);
-
-  ngraph::Shape out_shape;
-  for (int i = 0; i < x_num_col_dims; ++i) {
-    out_shape.push_back(x->get_shape()[i]);
-  }
-  for (int i = y_num_col_dims; i < y_rank; ++i) {
-    out_shape.push_back(y->get_shape()[i]);
-  }
-  out = paddle::platform::NgReshaper(out, out_shape);
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-static void BuildMulGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
-  int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-
-  bool is_dx = paddle::platform::HasOutput(op, "X@GRAD") ? true : false;
-  bool is_dy = paddle::platform::HasOutput(op, "Y@GRAD") ? true : false;
-
-  auto x_shape = x->get_shape();
-  auto y_shape = y->get_shape();
-
-  auto x_reshape = x;
-  auto y_reshape = y;
-
-  if (x_shape.size() > 2) {
-    auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_num_col_dims);
-    x_reshape = paddle::platform::NgReshaper(x, x_2d_shape);
-  }
-
-  if (y_shape.size() > 2) {
-    auto y_2d_shape = paddle::platform::FlattenTo2d(y_shape, y_num_col_dims);
-    y_reshape = paddle::platform::NgReshaper(y, y_2d_shape);
-  }
-
-  auto x_reshape_shape = x_reshape->get_shape();
-  std::reverse(x_reshape_shape.begin(), x_reshape_shape.end());
-  auto x_transpose = std::make_shared<ngraph::op::Reshape>(
-      x_reshape, ngraph::AxisVector{1, 0}, x_reshape_shape);
-
-  auto y_reshape_shape = y_reshape->get_shape();
-  std::reverse(y_reshape_shape.begin(), y_reshape_shape.end());
-  auto y_transpose = std::make_shared<ngraph::op::Reshape>(
-      y_reshape, ngraph::AxisVector{1, 0}, y_reshape_shape);
-
-  if (is_dx) {
-    if (dout->get_shape().size() > 2) {
-      auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
-      dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
-    }
-    auto dx = std::make_shared<ngraph::op::Dot>(dout, y_transpose);
-
-    if (dx->get_shape() == x_shape) {
-      paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-    } else {
-      auto dx_reshape = paddle::platform::NgReshaper(dx, x_shape);
-      paddle::platform::SetOutputNode(op, "X@GRAD", dx_reshape, ngb_node_map);
-    }
-  }
-
-  if (is_dy) {
-    if (dout->get_shape().size() > 2) {
-      auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
-      dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
-    }
-    auto dy = std::make_shared<ngraph::op::Dot>(x_transpose, dout);
-
-    if (dy->get_shape() == y_shape) {
-      paddle::platform::SetOutputNode(op, "Y@GRAD", dy, ngb_node_map);
-    } else {
-      auto dy_reshape = paddle::platform::NgReshaper(dy, y_shape);
-      paddle::platform::SetOutputNode(op, "Y@GRAD", dy_reshape, ngb_node_map);
-    }
-  }
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(mul, BuildMulNode);
-REGISTER_NG_OP(mul_grad, BuildMulGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/op_bridge.h b/paddle/fluid/operators/ngraph/ops/op_bridge.h
deleted file mode 100644
index 93df0ad8062745380d9cd4ca5027bef1425083bf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/op_bridge.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <map>
-#include <string>
-#include <unordered_map>
-
-#include "ngraph/node.hpp"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace ops {
-
-class NgraphSingleton {
-  NgraphSingleton() = default;
-  NgraphSingleton(NgraphSingleton const&) = delete;
-  void operator=(NgraphSingleton const) = delete;
-
-  ~NgraphSingleton() = default;
-
-  static std::map<
-      std::string,
-      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
-                         std::shared_ptr<std::unordered_map<
-                             std::string, std::shared_ptr<ngraph::Node>>>)>>
-      ng_node_maps_;
-
- public:
-  template <typename TF>
-  static void Register(TF&& tf, const std::string& name) {
-    ng_node_maps_[name] = tf;
-  }
-
-  static bool Lookup(const std::string& name) {
-    auto it = ng_node_maps_.find(name);
-    if (it == ng_node_maps_.end()) {
-      return true;
-    }
-    return false;
-  }
-
-  static void BuildNode(
-      const std::shared_ptr<std::unordered_map<
-          std::string, std::shared_ptr<ngraph::Node>>>& ng_maps,
-      const std::shared_ptr<framework::OperatorBase>& op,
-      const std::string& name) {
-    ng_node_maps_[name](op, ng_maps);
-  }
-};
-
-std::map<std::string,
-         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
-                            std::shared_ptr<std::unordered_map<
-                                std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphSingleton::ng_node_maps_;
-
-}  // namespace ops
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_NG_OP(op_type__, Converter__)                  \
-  struct ng_##op_type__##_converter {                           \
-    ng_##op_type__##_converter() {                              \
-      paddle::operators::ops::NgraphSingleton::Register(        \
-          paddle::operators::ngraphs::Converter__, #op_type__); \
-    }                                                           \
-  };                                                            \
-  ng_##op_type__##_converter ng_##op_type__##_converter__;
diff --git a/paddle/fluid/operators/ngraph/ops/pool2d_op.h b/paddle/fluid/operators/ngraph/ops/pool2d_op.h
deleted file mode 100644
index e5542d4715740ad9f2ab7315dcfa20434a08f3fa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/pool2d_op.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildPool2dNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto x_shape = x->get_shape();
-
-  std::string pooling_type = op_attrs.Get<std::string>("pooling_type");
-  std::vector<int> ksize = op_attrs.Get<std::vector<int>>("ksize");
-  std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
-  std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
-
-  PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(),
-                    "Handling 2d pooling only");
-
-  if (op_attrs.Get<bool>("global_pooling")) {
-    for (size_t i = 0; i < ksize.size(); ++i) {
-      paddings[i] = 0;
-      ksize[i] = static_cast<int>(x_shape.at(i + 2));
-    }
-  }
-
-  ngraph::Shape ng_padding_below{static_cast<size_t>(paddings.at(0)),
-                                 static_cast<size_t>(paddings.at(1))};
-  ngraph::Shape ng_padding_above{static_cast<size_t>(paddings.at(0)),
-                                 static_cast<size_t>(paddings.at(1))};
-  ngraph::Shape ng_ksize_shape{static_cast<size_t>(ksize.at(0)),
-                               static_cast<size_t>(ksize.at(1))};
-  ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
-                             static_cast<size_t>(strides.at(1))};
-
-  auto ComputeFlooredOutput = [](size_t in, size_t k, size_t p, size_t s) {
-    return (in - k + 2 * p) / s + 1;
-  };
-  auto ComputeCeiledOutput = [](size_t in, size_t k, size_t p, size_t s) {
-    return ceil(static_cast<float>(in - k + 2 * p) / s) + 1;
-  };
-
-  if (op_attrs.Get<bool>("ceil_mode")) {
-    for (size_t i = 0; i < ng_padding_above.size(); ++i) {
-      auto ceiled_size = ComputeCeiledOutput(x_shape[i + 2], ksize[i],
-                                             paddings[i], strides[i]);
-      auto floored_size = ComputeFlooredOutput(x_shape[i + 2], ksize[i],
-                                               paddings[i], strides[i]);
-      if (ceiled_size != floored_size) {
-        ng_padding_above[i] += strides[i];
-      }
-    }
-  }
-
-  bool padding_exclusive = op_attrs.Get<bool>("exclusive");
-  if (pooling_type == "max") {
-    auto pool2d = std::make_shared<ngraph::op::MaxPool>(
-        x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above);
-    paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map);
-  } else if (pooling_type == "avg") {
-    std::shared_ptr<ngraph::Node> pool2d;
-    if (op_attrs.Get<bool>("adaptive")) {
-      auto ComputeAdaptive = [](size_t in, size_t k) {
-        return std::floor(in / k);
-      };
-      ng_strides[0] = x_shape.size() == 4
-                          ? ComputeAdaptive(x_shape[3], ksize[0])
-                          : ng_strides[0];
-      ng_strides[1] = x_shape.size() == 4
-                          ? ComputeAdaptive(x_shape[3], ksize[0])
-                          : ng_strides[1];
-      pool2d =
-          std::make_shared<ngraph::op::AvgPool>(x, ng_ksize_shape, ng_strides);
-    } else {
-      if ((ng_padding_below[0] == 0) && (ng_padding_below[1] == 0) &&
-          (ng_padding_above[0] == 0) && (ng_padding_above[1] == 0)) {
-        padding_exclusive = false;
-      }
-      pool2d = std::make_shared<ngraph::op::AvgPool>(
-          x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above,
-          !padding_exclusive);
-    }
-    paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map);
-  } else {
-    PADDLE_THROW("Support max and avg pooling only");
-  }
-}
-
-void BuildPool2dGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto x_shape = x->get_shape();
-
-  std::string pooling_type = op_attrs.Get<std::string>("pooling_type");
-  std::vector<int> ksize = op_attrs.Get<std::vector<int>>("ksize");
-  std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
-  std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
-
-  PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(),
-                    "Handling 2d pooling only");
-
-  if (op_attrs.Get<bool>("global_pooling")) {
-    for (size_t i = 0; i < ksize.size(); ++i) {
-      paddings[i] = 0;
-      ksize[i] = static_cast<int>(x_shape.at(i + 2));
-    }
-  }
-
-  ngraph::Shape ng_padding_below{static_cast<size_t>(paddings.at(0)),
-                                 static_cast<size_t>(paddings.at(1))};
-  ngraph::Shape ng_padding_above{static_cast<size_t>(paddings.at(0)),
-                                 static_cast<size_t>(paddings.at(1))};
-  ngraph::Shape ng_ksize_shape{static_cast<size_t>(ksize.at(0)),
-                               static_cast<size_t>(ksize.at(1))};
-  ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
-                             static_cast<size_t>(strides.at(1))};
-
-  bool padding_exclusive = op_attrs.Get<bool>("exclusive");
-  if (pooling_type == "max") {
-    auto pool2d_grad = std::make_shared<ngraph::op::MaxPoolBackprop>(
-        x, dout, out, ng_ksize_shape, ng_strides, ng_padding_below,
-        ng_padding_above);
-    paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map);
-  } else if (pooling_type == "avg") {
-    std::shared_ptr<ngraph::Node> pool2d_grad;
-    if (op_attrs.Get<bool>("adaptive")) {
-      auto ComputeAdaptive = [](size_t in, size_t k) {
-        return std::floor(in / k);
-      };
-      ng_strides[0] = x_shape.size() == 4
-                          ? ComputeAdaptive(x_shape[3], ksize[0])
-                          : ng_strides[0];
-      ng_strides[1] = x_shape.size() == 4
-                          ? ComputeAdaptive(x_shape[3], ksize[0])
-                          : ng_strides[1];
-      pool2d_grad = std::make_shared<ngraph::op::AvgPoolBackprop>(
-          x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below,
-          ng_padding_above, !padding_exclusive);
-    } else {
-      if ((ng_padding_below[0] == 0) && (ng_padding_below[1] == 0) &&
-          (ng_padding_above[0] == 0) && (ng_padding_above[1] == 0)) {
-        padding_exclusive = false;
-      }
-      pool2d_grad = std::make_shared<ngraph::op::AvgPoolBackprop>(
-          x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below,
-          ng_padding_above, !padding_exclusive);
-    }
-    paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map);
-  } else {
-    PADDLE_THROW("Support max and avg pooling only");
-  }
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(pool2d, BuildPool2dNode);
-REGISTER_NG_OP(pool2d_grad, BuildPool2dGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/reduce_sum_op.h b/paddle/fluid/operators/ngraph/ops/reduce_sum_op.h
deleted file mode 100644
index ad89052880e372b271d4b12cddd985cb86d25d65..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/reduce_sum_op.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildReduceSumNode(
-    const std::shared_ptr<paddle::framework::OperatorBase> &op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  bool reduce_all = op_attrs.Get<bool>("reduce_all");
-  bool keep_dim = op_attrs.Get<bool>("keep_dim");
-  std::vector<int> dim = op_attrs.Get<std::vector<int>>("dim");
-  auto input_shape = input->get_shape();
-  ngraph::AxisSet axes;
-  if (reduce_all == true) {
-    for (size_t i = 0; i < input_shape.size(); ++i) {
-      axes.insert(i);
-    }
-  } else {
-    for (auto &i : dim) {
-      if (i < 0) {
-        axes.insert(input_shape.size() + i);
-      } else {
-        axes.insert(i);
-      }
-    }
-  }
-  std::shared_ptr<ngraph::Node> reduce_sum =
-      std::make_shared<ngraph::op::Sum>(input, axes);
-
-  if (keep_dim == true) {
-    std::vector<size_t> dim_shape;
-    std::copy(input_shape.begin(), input_shape.end(),
-              std::back_inserter(dim_shape));
-    for (auto &i : dim) {
-      if (i < 0) {
-        i = input_shape.size() + i;
-      }
-      dim_shape[i] = 1;
-    }
-
-    std::vector<size_t> axis_vector(input_shape.size() - dim.size());
-    std::iota(axis_vector.begin(), axis_vector.end(), 0);
-
-    auto reduce_sum_dim = std::make_shared<ngraph::op::Reshape>(
-        reduce_sum, ngraph::AxisVector(axis_vector), ngraph::Shape(dim_shape));
-
-    paddle::platform::SetOutputNode(op, "Out", reduce_sum_dim, ngb_node_map);
-  } else {
-    if (reduce_sum->get_shape() == ngraph::Shape{}) {
-      reduce_sum = paddle::platform::NgReshaper(reduce_sum, ngraph::Shape{1});
-    }
-    paddle::platform::SetOutputNode(op, "Out", reduce_sum, ngb_node_map);
-  }
-}
-
-void BuildReduceSumGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase> &op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  std::vector<int> dim = op_attrs.Get<std::vector<int>>("dim");
-  bool reduce_all = op_attrs.Get<bool>("reduce_all");
-  bool keep_dim = op_attrs.Get<bool>("keep_dim");
-
-  auto og_shape = og->get_shape();
-  auto x_shape = x->get_shape();
-  float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1,
-                                 std::multiplies<float>());
-  float og_size = std::accumulate(std::begin(og_shape), std::end(og_shape), 1,
-                                  std::multiplies<float>());
-  ngraph::AxisSet axes;
-
-  if (reduce_all == true) {
-    for (size_t i = 0; i < x_shape.size(); i++) {
-      axes.insert(i);
-    }
-  } else {
-    for (auto &i : dim) {
-      if (i < 0) {
-        axes.insert(x_shape.size() + i);
-      } else {
-        axes.insert(i);
-      }
-    }
-  }
-  std::vector<size_t> axis_vector(og_shape.size());
-  std::iota(axis_vector.begin(), axis_vector.end(), 0);
-  std::vector<size_t> dim_shape;
-
-  for (size_t i = 0; i < x_shape.size(); i++) {
-    if (std::find(dim.begin(), dim.end(), i) == dim.end() &&
-        std::find(dim.begin(), dim.end(), i - x_shape.size()) == dim.end()) {
-      dim_shape.push_back(x_shape[i]);
-    }
-  }
-
-  if (keep_dim == true) {
-    // reshape
-    if (x_size == og_size) {
-      paddle::platform::SetOutputNode(op, "X@GRAD", og, ngb_node_map);
-      return;
-    }
-    auto og_dim = std::make_shared<ngraph::op::Reshape>(
-        og, ngraph::AxisVector(axis_vector), ngraph::Shape(dim_shape));
-    auto result =
-        std::make_shared<ngraph::op::Broadcast>(og_dim, x_shape, axes);
-    paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
-
-  } else {
-    if (x_size == og_size) {
-      auto og_dim = std::make_shared<ngraph::op::Reshape>(
-          og, ngraph::AxisVector(axis_vector), x_shape);
-      paddle::platform::SetOutputNode(op, "X@GRAD", og_dim, ngb_node_map);
-    } else {
-      if (og->get_shape().size() == 1 && og->get_shape()[0] == 1) {
-        og = std::make_shared<ngraph::op::Reshape>(og, ngraph::AxisVector{0},
-                                                   ngraph::Shape{});
-      }
-      auto result = std::make_shared<ngraph::op::Broadcast>(og, x_shape, axes);
-      paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
-    }
-  }
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(reduce_sum, BuildReduceSumNode);
-REGISTER_NG_OP(reduce_sum_grad, BuildReduceSumGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/reshape_op.h b/paddle/fluid/operators/ngraph/ops/reshape_op.h
deleted file mode 100644
index 89ad04f06f61ba0b91c06965ed985c84842ee634..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/reshape_op.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-ngraph::Shape calc_output_shape(const ngraph::Shape& input_shape,
-                                const std::vector<int>& v_shape) {
-  auto out_shape = v_shape;
-  for (size_t i = 0; i < v_shape.size(); ++i) {
-    if (v_shape[i] == 0) {
-      out_shape[i] = input_shape[i];
-    }
-  }
-  int size_input = ngraph::shape_size(input_shape);
-  int size_out = 1;
-  for (auto o : out_shape) {
-    if (o > 0) size_out *= o;
-  }
-  for (auto& o : out_shape) {
-    if (o == -1) o = size_input / size_out;
-  }
-  return ngraph::Shape(out_shape.begin(), out_shape.end());
-}
-
-template <bool is_v2>
-static void BuildReshapeNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  std::shared_ptr<ngraph::Node> input =
-      platform::GetInputNode(op, "X", ngb_node_map);
-  auto input_shape = input->get_shape();
-
-  std::shared_ptr<ngraph::Node> shape =
-      platform::GetInputNode(op, "Shape", ngb_node_map);
-
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  std::vector<int> v_shape = op_attrs.Get<std::vector<int>>("shape");
-  auto out = input;
-  if (shape != nullptr) {
-    ngraph::Shape new_shape;
-    for (auto& it : shape->get_shape()) {
-      new_shape.push_back(it);
-    }
-    out = platform::NgReshaper(input, shape->get_shape());
-  } else {
-    auto out_shape = calc_output_shape(input_shape, v_shape);
-    out = platform::NgReshaper(input, out_shape);
-  }
-
-  if (is_v2) {
-    ngraph::Shape input_xshape(input_shape.size() + 1);
-    input_xshape[0] = 0;
-    std::copy(input_shape.begin(), input_shape.end(), input_xshape.begin() + 1);
-    auto xshape_node = std::make_shared<ngraph::op::Constant>(
-        input->get_element_type(), input_xshape, std::vector<std::string>{});
-    platform::SetOutputNode(op, "XShape", xshape_node, ngb_node_map);
-  }
-  platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-template <bool is_v2>
-void BuildReshapeGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  ngraph::Shape out_shape;
-  if (is_v2) {
-    auto& xshape =
-        platform::GetInputNode(op, "XShape", ngb_node_map)->get_shape();
-    out_shape.resize(xshape.size() - 1);
-    std::copy(xshape.begin() + 1, xshape.end(), out_shape.begin());
-  } else {
-    auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-    out_shape = input->get_shape();
-  }
-  auto dx = platform::NgReshaper(dout, out_shape);
-  paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(reshape, BuildReshapeNode<false>);
-REGISTER_NG_OP(reshape2, BuildReshapeNode<true>);
-REGISTER_NG_OP(reshape_grad, BuildReshapeGradNode<false>);
-REGISTER_NG_OP(reshape2_grad, BuildReshapeGradNode<true>);
diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h
deleted file mode 100644
index 1461b85b16ece79548f3ca95be811fb31136c610..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/scale_op.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildScaleNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  float scale = op_attrs.Get<float>("scale");
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto out = ElementwiseScalar<ngraph::op::Multiply>(scale, x);
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(scale, BuildScaleNode);
diff --git a/paddle/fluid/operators/ngraph/ops/slice_op.h b/paddle/fluid/operators/ngraph/ops/slice_op.h
deleted file mode 100644
index f5ab413540891e6b8c7f684751b2d701ec423d2d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/slice_op.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildSliceNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map);
-  auto input_shape = input->get_shape();
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto axes = op_attrs.Get<std::vector<int>>("axes");
-  auto starts = op_attrs.Get<std::vector<int>>("starts");
-  auto ends = op_attrs.Get<std::vector<int>>("ends");
-  ngraph::Coordinate ng_start, ng_end;
-  int axis, start, end;
-  for (size_t i = 0; i < input_shape.size(); ++i) {
-    ng_start.push_back(0);
-    ng_end.push_back(input_shape[i]);
-  }
-  for (size_t i = 0; i < axes.size(); ++i) {
-    axis = input_shape[axes[i]];
-    start = starts[i] < 0 ? (starts[i] + axis) : starts[i];
-    end = ends[i] < 0 ? (ends[i] + axis) : ends[i];
-    start = std::max(start, 0);
-    end = std::max(end, 0);
-    start = std::min(start, axis);
-    end = std::min(end, axis);
-    start = std::min(start, end);
-    ng_start[axes[i]] = start;
-    ng_end[axes[i]] = end;
-  }
-  auto out = std::make_shared<ngraph::op::Slice>(input, ng_start, ng_end);
-  auto out_shape = out->get_shape();
-
-  std::vector<size_t> out_axis_vec(out_shape.size());
-  std::iota(out_axis_vec.begin(), out_axis_vec.end(), 0);
-
-  paddle::platform::TrimTrailingSingularDims(&out_shape);
-  auto out_dim = std::make_shared<ngraph::op::Reshape>(
-      out, ngraph::AxisVector(out_axis_vec), ngraph::Shape(out_shape));
-
-  platform::SetOutputNode(op, "Out", out_dim, ngb_node_map);
-}
-
-void BuildSliceGradNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map);
-  auto input_shape = input->get_shape();
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto axes = op_attrs.Get<std::vector<int>>("axes");
-  auto starts = op_attrs.Get<std::vector<int>>("starts");
-  auto ends = op_attrs.Get<std::vector<int>>("ends");
-  auto reshape = input_shape;
-  ngraph::Coordinate ng_start, ng_end;
-  int axis, start, end;
-  for (size_t i = 0; i < input_shape.size(); ++i) {
-    ng_start.push_back(0);
-    ng_end.push_back(input_shape[i]);
-  }
-  for (size_t i = 0; i < axes.size(); ++i) {
-    axis = input_shape[axes[i]];
-    start = starts[i] < 0 ? (starts[i] + axis) : starts[i];
-    end = ends[i] < 0 ? (ends[i] + axis) : ends[i];
-    start = std::max(start, 0);
-    end = std::max(end, 0);
-    start = std::min(start, axis);
-    end = std::min(end, axis);
-    start = std::min(start, end);
-    ng_start[axes[i]] = start;
-    ng_end[axes[i]] = end;
-    reshape[axes[i]] = end - start;
-  }
-  std::vector<size_t> axisVec(dout->get_shape().size());
-  std::iota(axisVec.begin(), axisVec.end(), 0);
-  auto dout_reshape = std::make_shared<ngraph::op::Reshape>(
-      dout, ngraph::AxisVector(axisVec), reshape);
-
-  std::shared_ptr<ngraph::Node> input0 = paddle::platform::CreateConstant(
-      dout->get_element_type(), input_shape, {0});
-
-  auto din = std::make_shared<ngraph::op::ReplaceSlice>(input0, dout_reshape,
-                                                        ng_start, ng_end);
-  platform::SetOutputNode(op, "Input@GRAD", din, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(slice, BuildSliceNode);
-REGISTER_NG_OP(slice_grad, BuildSliceGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/softmax_op.h b/paddle/fluid/operators/ngraph/ops/softmax_op.h
deleted file mode 100644
index e1f6e8d3cfdc56c00229bbe1c3b183c309d0394e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/softmax_op.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-std::shared_ptr<ngraph::Node> GetSoftmax(std::shared_ptr<ngraph::Node> x,
-                                         int axis = -1) {
-  auto x_shape = x->get_shape();
-  size_t rank = x_shape.size();
-  size_t softmax_axis = axis;
-  if (axis < 0) softmax_axis = rank + axis;
-
-  auto x_max =
-      std::make_shared<ngraph::op::Max>(x, ngraph::AxisSet{softmax_axis});
-  auto x_max_bcast = std::make_shared<ngraph::op::Broadcast>(
-      x_max, x_shape, ngraph::AxisSet{softmax_axis});
-  auto x_shifted = x - x_max_bcast;
-  auto x_clipped =
-      paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Maximum>(
-          -64., x_shifted);
-  auto softmax = std::make_shared<ngraph::op::Softmax>(
-      x_clipped, ngraph::AxisSet{softmax_axis});
-  return softmax;
-}
-
-std::shared_ptr<ngraph::Node> GetSoftmaxGrad(std::shared_ptr<ngraph::Node> out,
-                                             std::shared_ptr<ngraph::Node> dout,
-                                             int axis = -1) {
-  auto out_shape = out->get_shape();
-  size_t rank = out_shape.size();
-  size_t softmax_axis = axis;
-  if (axis < 0) softmax_axis = rank + axis;
-
-  auto node_sum = std::make_shared<ngraph::op::Sum>(
-      out * dout, ngraph::AxisSet{softmax_axis});
-  auto node_bcast = std::make_shared<ngraph::op::Broadcast>(
-      node_sum, out_shape, ngraph::AxisSet{softmax_axis});
-  auto dx = (dout - node_bcast) * out;
-  return dx;
-}
-
-void BuildSoftmaxNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto softmax = GetSoftmax(x, op_attrs.Get<int>("axis"));
-  paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map);
-}
-
-void BuildSoftmaxGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
-  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto dx = GetSoftmaxGrad(out, dout, op_attrs.Get<int>("axis"));
-  paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(softmax, BuildSoftmaxNode);
-REGISTER_NG_OP(softmax_grad, BuildSoftmaxGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h
deleted file mode 100644
index a6bdf4de9522e08caf4a9ae606db8277f98cdab3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/cross_entropy_op.h"
-#include "paddle/fluid/operators/ngraph/ops/softmax_op.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildSoftmaxWithCrossEntropyNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto logits = paddle::platform::GetInputNode(op, "Logits", ngb_node_map);
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
-  auto softmax = paddle::operators::ngraphs::GetSoftmax(logits);
-
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
-  int ignore_index = op_attrs.Get<int>("ignore_index");
-  auto xe = paddle::operators::ngraphs::GetCrossEntropy(
-      softmax, label, is_soft_label, ignore_index);
-
-  paddle::platform::SetOutputNode(op, "Softmax", softmax, ngb_node_map);
-  paddle::platform::SetOutputNode(op, "Loss", xe, ngb_node_map);
-}
-
-void BuildSoftmaxWithCrossEntropyGradNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
-  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
-  auto softmax = paddle::platform::GetInputNode(op, "Softmax", ngb_node_map);
-  auto loss_grad =
-      paddle::platform::GetInputNode(op, "Loss@GRAD", ngb_node_map);
-  auto softmax_shape = softmax->get_shape();
-  auto rank = softmax_shape.size();
-  if (!is_soft_label) {
-    auto label_shape = label->get_shape();
-    label_shape.pop_back();
-    label = platform::NgReshaper(label, label_shape);
-
-    label =
-        std::make_shared<ngraph::op::OneHot>(label, softmax_shape, rank - 1);
-  }
-
-  auto loss_grad_shape = loss_grad->get_shape();
-  loss_grad_shape.pop_back();
-  auto loss_grad_reshape = platform::NgReshaper(loss_grad, loss_grad_shape);
-  auto loss_grad_bcast = std::make_shared<ngraph::op::Broadcast>(
-      loss_grad_reshape, softmax_shape, ngraph::AxisSet{rank - 1});
-  if (softmax->get_element_type() != label->get_element_type()) {
-    label = std::make_shared<ngraph::op::Convert>(label,
-                                                  softmax->get_element_type());
-  }
-
-  auto logits_grad = loss_grad_bcast * (softmax - label);
-  paddle::platform::SetOutputNode(op, "Logits@GRAD", logits_grad, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(softmax_with_cross_entropy, BuildSoftmaxWithCrossEntropyNode);
-REGISTER_NG_OP(softmax_with_cross_entropy_grad,
-               BuildSoftmaxWithCrossEntropyGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/stack_op.h b/paddle/fluid/operators/ngraph/ops/stack_op.h
deleted file mode 100644
index d0e9545fd7669235e0b929403e2473835fe14c4c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/stack_op.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildStackNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  auto axis = op_attrs.Get<int>("axis");
-  std::vector<std::shared_ptr<ngraph::Node>> args;
-  for (auto& var_name_item : op->Inputs()) {
-    for (auto& var_name : var_name_item.second) {
-      auto& node = ngb_node_map->at(var_name);
-      auto shape = node->get_shape();
-      axis = (axis < 0) ? axis + shape.size() + 1 : axis;
-      shape.insert(shape.begin() + axis, 1);
-      std::vector<size_t> input_order(shape.size() - 1);
-      std::iota(std::begin(input_order), std::end(input_order), 0);
-      args.push_back(std::make_shared<ngraph::op::Reshape>(
-          node, ngraph::AxisVector(input_order), shape));
-    }
-  }
-  auto out = std::make_shared<ngraph::op::Concat>(args, axis);
-  platform::SetOutputNode(op, "Y", out, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(stack, BuildStackNode);
diff --git a/paddle/fluid/operators/ngraph/ops/sum_op.h b/paddle/fluid/operators/ngraph/ops/sum_op.h
deleted file mode 100644
index ab8cdb8f4d847c0acb60b39d07dc83f085b60bbd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/sum_op.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildSumNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  std::vector<std::string> op_inputs;
-  for (auto& var_name_item : op->Inputs()) {
-    for (auto& var_name : var_name_item.second) {
-      op_inputs.push_back(var_name);
-      if (ngb_node_map->find(var_name) == ngb_node_map->end()) {
-        PADDLE_THROW("op % input varname %s is not found in var_node_map",
-                     op->Type(), var_name);
-      }
-    }
-  }
-  std::shared_ptr<ngraph::Node>& sum = ngb_node_map->at(op_inputs[0]);
-  for (size_t k = 1; k < op_inputs.size(); ++k) {
-    std::shared_ptr<ngraph::Node>& nodek = ngb_node_map->at(op_inputs[k]);
-    if (nodek->get_element_type() != sum->get_element_type()) {
-      nodek =
-          std::make_shared<ngraph::op::Convert>(nodek, sum->get_element_type());
-    }
-    sum = sum + nodek;
-  }
-  platform::SetOutputNode(op, "Out", sum, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(sum, BuildSumNode);
diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h
deleted file mode 100644
index cdc26f6afd58700c3a1f57fa955d60bc8925d2d1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/top_k_op.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-void BuildTopKNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
-  int k = op_attrs.Get<int>("k");
-  auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
-  auto top_k = std::make_shared<ngraph::op::TopK>(
-      input, input->get_shape().size() - 1, ngraph::element::i64, k);
-  std::shared_ptr<ngraph::Node> indices =
-      std::make_shared<ngraph::op::GetOutputElement>(top_k, 0);
-  std::shared_ptr<ngraph::Node> out =
-      std::make_shared<ngraph::op::GetOutputElement>(top_k, 1);
-  paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map);
-  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
-}
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(top_k, BuildTopKNode);
diff --git a/paddle/fluid/operators/ngraph/ops/transpose_op.h b/paddle/fluid/operators/ngraph/ops/transpose_op.h
deleted file mode 100644
index 7d9428977ab3c6c35cae37afd68eb746afc926a7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ops/transpose_op.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ngraph/ngraph.hpp"
-#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
-#include "paddle/fluid/platform/ngraph_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace ngraphs {
-
-template <bool is_v2>
-static void BuildTransposeNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "X", ngb_node_map);
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  std::vector<int> axis = op_attrs.Get<std::vector<int>>("axis");
-
-  auto input_shape = input->get_shape();
-  ngraph::Shape x_reshape_shape;
-  ngraph::AxisVector axis_vec;
-  for (auto& v : axis) {
-    axis_vec.push_back(v);
-    x_reshape_shape.push_back(input_shape[v]);
-  }
-  std::shared_ptr<ngraph::Node> x_transpose =
-      std::make_shared<ngraph::op::Reshape>(input, axis_vec, input_shape);
-  x_transpose = platform::NgReshaper(x_transpose, x_reshape_shape);
-  platform::SetOutputNode(op, "Out", x_transpose, ngb_node_map);
-  if (is_v2) {
-    ngraph::Shape input_xshape(input_shape.size() + 1);
-    input_xshape[0] = 0;
-    std::copy(input_shape.begin(), input_shape.end(), input_xshape.begin() + 1);
-    auto xshape_node = std::make_shared<ngraph::op::Constant>(
-        input->get_element_type(), input_xshape, std::vector<std::string>{});
-    platform::SetOutputNode(op, "XShape", xshape_node, ngb_node_map);
-  }
-}
-
-template <bool is_v2>
-static void BuildTransposeGradNode(
-    const std::shared_ptr<framework::OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
-  auto op_attrs = framework::AttrReader(op->Attrs());
-  std::vector<int> axis = op_attrs.Get<std::vector<int>>("axis");
-
-  ngraph::AxisVector axis_vec(axis.size());
-  for (size_t i = 0; i < axis.size(); ++i) {
-    axis_vec[axis.at(i)] = i;
-  }
-
-  ngraph::Shape out_shape;
-  if (is_v2) {
-    auto& xshape =
-        platform::GetInputNode(op, "XShape", ngb_node_map)->get_shape();
-    out_shape.resize(xshape.size() - 1);
-    std::copy(xshape.begin() + 1, xshape.end(), out_shape.begin());
-  } else {
-    out_shape = platform::GetInputNode(op, "X", ngb_node_map)->get_shape();
-  }
-
-  std::shared_ptr<ngraph::Node> x_transpose =
-      std::make_shared<ngraph::op::Reshape>(input, axis_vec, out_shape);
-
-  platform::SetOutputNode(op, "X@GRAD", x_transpose, ngb_node_map);
-}
-
-}  // namespace ngraphs
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_NG_OP(transpose, BuildTransposeNode<false>);
-REGISTER_NG_OP(transpose_grad, BuildTransposeGradNode<false>);
-REGISTER_NG_OP(transpose2, BuildTransposeNode<true>);
-REGISTER_NG_OP(transpose2_grad, BuildTransposeGradNode<true>);
diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
deleted file mode 100644
index 81fbe3e514241ecdd2832141eba4250ced2017a9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/norm_op.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/norm_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class NormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) A tensor of rank >= axis.");
-    AddAttr<int>("axis",
-                 "The axis on which to apply normalization. If axis < 0, "
-                 "the dimension to normalization is rank(X) + axis. -1 is "
-                 "the last dimension.");
-    AddAttr<float>("epsilon",
-                   "(float, default 1e-10) The epsilon value is used "
-                   "to avoid division by zero.")
-        .SetDefault(1.0e-10f);
-    AddOutput("Norm",
-              "(Tensor) A tensor saved the `sqrt(sum(x) + epsion)` will "
-              "be used in backward kernel.")
-        .AsIntermediate();
-    AddOutput("Out", "(Tensor) A tensor of the same shape as X.");
-    AddComment(R"DOC(
-
-Given a tensor, apply 2-normalization along the provided axis.
-
-$$
-y = \frac{x}{ \sqrt{\sum {x^2} + epsion }}
-$$
-
-where, $\sum {x^2}$ is calculated along the `axis` dimension.
-        
-)DOC");
-  }
-};
-
-class NormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of NormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of NormOp should not be null.");
-    auto xdim = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", xdim);
-    int axis = ctx->Attrs().Get<int>("axis");
-    if (axis < 0) axis = xdim.size() + axis;
-    xdim[axis] = 1;
-    ctx->SetOutputDim("Norm", xdim);
-  }
-};
-
-class NormOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Input(X@GRAD) should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-};
-
-class NormOpGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("norm_grad");
-    op->SetAttrMap(Attrs());
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("Norm", Output("Norm"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-
-REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker,
-                  ops::NormOpGradOpDescMaker);
-REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
-REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel<CPU, float>,
-                       ops::NormKernel<CPU, double>);
-REGISTER_OP_CPU_KERNEL(norm_grad, ops::NormGradKernel<CPU, float>,
-                       ops::NormGradKernel<CPU, double>);
diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu
deleted file mode 100644
index 67449aa4c67bee6606928ef3a2d986a1bdec038f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/norm_op.cu
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "cub/cub.cuh"
-#include "paddle/fluid/operators/norm_op.h"
-
-namespace paddle {
-namespace operators {
-
-__device__ __forceinline__ float square_root(float x) { return sqrtf(x); }
-
-__device__ __forceinline__ double square_root(double x) { return sqrt(x); }
-
-template <typename T, int BlockDim>
-__global__ void Normalize(const T* x, const int pre,
-                          const int axis_n,  // dim in axis
-                          const int post, const T eps, T* y, T* out_norm) {
-  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  int num = pre * post;
-  for (int i = blockIdx.x; i < num; i += gridDim.x) {
-    int base = (i / post) * post * axis_n + (i % post);
-
-    T sum = 0.0;
-    __shared__ T norm;
-    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
-      const T x_ij = x[base + j * post];
-      sum += x_ij * x_ij;
-    }
-    T reduce_result = BlockReduce(temp_storage).Sum(sum);
-
-    if (threadIdx.x == 0) {
-      norm = square_root(reduce_result + eps);
-      out_norm[i] = norm;
-    }
-    __syncthreads();
-    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
-      const int index = base + j * post;
-      y[index] = x[index] / norm;
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class NormCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_x = ctx.Input<framework::Tensor>("X");
-    auto* out_y = ctx.Output<framework::Tensor>("Out");
-    auto* out_norm = ctx.Output<framework::Tensor>("Norm");
-    const T* x = in_x->data<T>();
-    T* y = out_y->mutable_data<T>(ctx.GetPlace());
-    T* norm = out_norm->mutable_data<T>(ctx.GetPlace());
-
-    auto xdim = in_x->dims();
-    auto ndim = out_norm->dims();
-    int axis = ctx.Attr<int>("axis");
-    T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
-    if (axis < 0) axis = xdim.size() + axis;
-    int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
-
-    auto& dev_ctx = ctx.cuda_device_context();
-
-    const int block = 512;
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    int grid = std::min(max_blocks, pre * post);
-    Normalize<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
-                                                              eps, y, norm);
-  }
-};
-
-template <typename T, int BlockDim>
-__global__ void NormalizeGradient(const T* x, const T* x_norm, const T* y_grad,
-                                  const int pre, const int axis_n,
-                                  const int post, T* x_grad) {
-  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage_sum;
-  int num = pre * post;
-  for (int i = blockIdx.x; i < num; i += gridDim.x) {
-    T sum = 0.0;
-    __shared__ T row_sum;
-    __shared__ T row_sqrt_norm;
-    __shared__ T row_norm;
-
-    auto base = (i / post) * post * axis_n + (i % post);
-
-    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
-      int index = base + j * post;
-      sum += x[index] * y_grad[index];
-    }
-    T reduce_result = BlockReduce(temp_storage_sum).Sum(sum);
-
-    if (threadIdx.x == 0) {
-      row_sum = reduce_result;
-      row_sqrt_norm = x_norm[i];
-      row_norm = row_sqrt_norm * row_sqrt_norm;
-    }
-    __syncthreads();
-    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
-      int index = base + j * post;
-      const T x_ij = x[index];
-      const T dy_ij = y_grad[index];
-      x_grad[index] = (dy_ij - x_ij * row_sum / row_norm) / row_sqrt_norm;
-    }
-  }
-}
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class NormGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_x = ctx.Input<framework::Tensor>("X");
-    auto* in_norm = ctx.Input<framework::Tensor>("Norm");
-    auto* in_dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* out_dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    T* dx = out_dx->mutable_data<T>(ctx.GetPlace());
-    const T* x = in_x->data<T>();
-    const T* x_norm = in_norm->data<T>();
-    const T* dy = in_dy->data<T>();
-
-    auto xdim = in_x->dims();
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis = xdim.size() + axis;
-    int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
-
-    auto& dev_ctx = ctx.cuda_device_context();
-
-    const int block = 512;
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    int grid = std::min(max_blocks, pre * post);
-    NormalizeGradient<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-        x, x_norm, dy, pre, n, post, dx);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-
-REGISTER_OP_CUDA_KERNEL(norm, ops::NormCUDAKernel<CUDA, float>,
-                        ops::NormCUDAKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradCUDAKernel<CUDA, float>,
-                        ops::NormGradCUDAKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h
deleted file mode 100644
index f81cbc2c733af2a42f27e2ecb05ee2f8e2f8c17b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/norm_op.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
-                    int* post) {
-  *pre = 1;
-  *post = 1;
-  *n = dim[axis];
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= dim[i];
-  }
-  for (int i = axis + 1; i < dim.size(); ++i) {
-    (*post) *= dim[i];
-  }
-}
-
-template <typename DeviceContext, typename T>
-class NormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_x = ctx.Input<framework::Tensor>("X");
-    auto* out_y = ctx.Output<framework::Tensor>("Out");
-    auto* out_norm = ctx.Output<framework::Tensor>("Norm");
-    out_y->mutable_data<T>(ctx.GetPlace());
-    out_norm->mutable_data<T>(ctx.GetPlace());
-
-    auto xdim = in_x->dims();
-    T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis = xdim.size() + axis;
-    int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
-
-    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 3> shape(pre, n, post);
-    Eigen::DSizes<int, 2> norm_shape(pre, post);
-
-    auto x_e = framework::EigenVector<T>::Flatten(*in_x);
-    auto y_e = framework::EigenVector<T>::Flatten(*out_y);
-    auto norm_e = framework::EigenVector<T>::Flatten(*out_norm);
-    auto x = x_e.reshape(shape);
-    auto y = y_e.reshape(shape);
-    auto norm = norm_e.reshape(norm_shape);
-
-    Eigen::DSizes<int, 1> rdim(1);
-    // y = x / sqrt((sum(x * x) + epsilon))
-    // norm = sqrt(sum(x * x) + epsilon)
-    auto x2 = x * x;
-    auto sum = x2.sum(rdim) + eps;
-    norm.device(*place) = sum.sqrt();
-
-    // y = x / norm
-    Eigen::DSizes<int, 3> rshape(pre, 1, post);
-    Eigen::DSizes<int, 3> bcast(1, n, 1);
-    y.device(*place) = x / norm.reshape(rshape).broadcast(bcast);
-  }
-};
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class NormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_x = ctx.Input<framework::Tensor>("X");
-    auto* in_norm = ctx.Input<framework::Tensor>("Norm");
-    auto* in_dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* out_dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    out_dx->mutable_data<T>(ctx.GetPlace());
-
-    auto xdim = in_x->dims();
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis = xdim.size() + axis;
-    int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
-
-    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-
-    auto x_e = framework::EigenVector<T>::Flatten(*in_x);
-    auto dy_e = framework::EigenVector<T>::Flatten(*in_dy);
-    auto norm_e = framework::EigenVector<T>::Flatten(*in_norm);
-    auto dx_e = framework::EigenVector<T>::Flatten(*out_dx);
-
-    Eigen::DSizes<int, 3> shape(pre, n, post);
-    Eigen::DSizes<int, 3> rshape(pre, 1, post);
-    auto x = x_e.reshape(shape);
-    auto dy = dy_e.reshape(shape);
-    auto norm = norm_e.reshape(rshape);
-    auto dx = dx_e.reshape(shape);
-
-    framework::Tensor rsum;
-    rsum.mutable_data<T>({pre, post}, ctx.GetPlace());
-    auto sum = framework::EigenTensor<T, 2>::From(rsum);
-
-    Eigen::DSizes<int, 1> rdim(1);
-    Eigen::DSizes<int, 3> bcast(1, n, 1);
-
-    // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)]
-    //    = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x))
-    //    = [dy - x * sum(x*dy) / (sum(x*x) + e)] / sqrt(sum(x*x))
-    // 1. sum = sum(x*dy)
-    sum.device(*place) = (x * dy).sum(rdim);
-    // 2. dx = x * sum
-    dx.device(*place) = sum.reshape(rshape).broadcast(bcast) * x;
-    // 3. dx / (sum(x*x) + e)
-    // where, norm.pow(2) = sum(x*x) + e, which is calculated in forward.
-    dx.device(*place) = dx / norm.pow(2).broadcast(bcast);
-    // 4. [dy - dx] / sqrt(sum(x*x))
-    dx.device(*place) = (dy - dx) / norm.broadcast(bcast);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/norm_utils.h b/paddle/fluid/operators/norm_utils.h
deleted file mode 100644
index fee06fe5dd4fae2e843bc639bba4afc259b78ea5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/norm_utils.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using DataLayout = framework::DataLayout;
-
-inline void ExtractNCWHD(const framework::DDim &dims,
-                         const DataLayout &data_layout, int *N, int *C, int *H,
-                         int *W, int *D) {
-  *N = dims[0];
-  if (dims.size() == 2) {
-    *C = dims[1];
-    *H = 1;
-    *W = 1;
-    *D = 1;
-  } else {
-    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
-    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
-    *W = dims.size() > 3
-             ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
-             : 1;
-    *D = dims.size() > 4
-             ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
-             : 1;
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc
deleted file mode 100644
index 6042b97bf57a699b566ec2cee955f7db3bb7b2de..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/one_hot_op.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/one_hot_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/framework.pb.h"
-
-namespace paddle {
-namespace operators {
-
-class OneHotOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of OneHotOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of OneHotOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "Rank of Input(X) should be at least 2.");
-    if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) {
-      PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U,
-                        "Last dimension of Input(X) should be 1.");
-    }
-
-    framework::DDim out_dims(x_dims);
-    int depth = ctx->Attrs().Get<int>("depth");
-    if (ctx->HasInput("depth_tensor")) {
-      depth = -1;
-    }
-
-    out_dims[out_dims.size() - 1] = depth;
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("X", /* --> */ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "depth_tensor") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class OneHotOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, LoDTensor<int>) Input variable with rank at least 2. "
-             "The last dimension of X should be 1. Each value of X is an index "
-             "to indicate the position.");
-    AddInput("depth_tensor", "(Tensor, Tensor<int>), Length of one-hot vector")
-        .AsDispensable();
-    AddOutput("Out",
-              "(Tensor, Tensor<float>) Output tensor with same rank as X. "
-              "The tensor consists of one-hot representations of values in X.");
-
-    AddAttr<int>("depth",
-                 "A positive integer to specify the length of one-hot vector.")
-        .SetDefault(-1);
-    AddAttr<int>("dtype",
-                 "An integer to specify the data type of one-hot "
-                 "vector. The default value is FP32.")
-        .SetDefault(paddle::framework::proto::VarType::FP32);
-    AddAttr<bool>("allow_out_of_range",
-                  "If it is set true and the input data is out of range, "
-                  "the output tensor will be filled zeros. The default value "
-                  "is false.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-One Hot Operator. This operator creates the one-hot representations for input
-index values. The following example will help to explain the function of this
-operator:
-
-X is a LoDTensor:
-  X.lod = [[0, 1, 4]]
-  X.shape = [4, 1]
-  X.data = [[1], [1], [3], [0]]
-
-set depth = 4
-
-Out is a LoDTensor:
-  Out.lod = [[0, 1, 4]]
-  Out.shape = [4, 4]
-  Out.data = [[0., 1., 0., 0.],
-              [0., 1., 0., 0.],
-              [0., 0., 0., 1.],
-              [1., 0., 0., 0.]]
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(one_hot, ops::OneHotOp, ops::OneHotOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    one_hot, ops::OneHotKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::OneHotKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu
deleted file mode 100644
index bffd1d5305127cab29f39ac1f51566325ab20612..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/one_hot_op.cu
+++ /dev/null
@@ -1,98 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/one_hot_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename InT, typename OutT>
-__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data,
-                                 const int64_t numel, const int depth) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) {
-    *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0;
-  }
-}
-
-template <typename DeviceContext, typename InT>
-struct OneHotOpCUDAFunctor {
-  const framework::LoDTensor* in_;
-  framework::LoDTensor* out_;
-  const DeviceContext& ctx_;
-  int depth_;
-
-  OneHotOpCUDAFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
-                      int depth, const DeviceContext& ctx)
-      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
-
-  template <typename OutT>
-  void apply() const {
-    auto* p_in_data = in_->data<InT>();
-    auto numel = in_->numel();
-    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
-    auto stream = ctx_.stream();
-    math::set_constant(ctx_, out_, 0.0);
-
-    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        p_in_data, p_out_data, numel, depth_);
-  }
-};
-
-using LoDTensor = framework::LoDTensor;
-template <typename DeviceContext, typename T>
-class OneHotCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-
-    int depth = -1;
-    if (context.HasInput("depth_tensor")) {
-      auto* depth_tensor = context.Input<framework::Tensor>("depth_tensor");
-      if (platform::is_gpu_place(depth_tensor->place())) {
-        framework::Tensor temp;
-        TensorCopySync(*depth_tensor, platform::CPUPlace(), &temp);
-        depth = *temp.data<int32_t>();
-      } else {
-        depth = *depth_tensor->data<int32_t>();
-      }
-
-      auto in_dims = in->dims();
-      framework::DDim out_dims(in_dims);
-      out_dims[out_dims.size() - 1] = depth;
-      out->Resize(out_dims);
-    } else {
-      depth = context.Attr<int>("depth");
-    }
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("dtype")),
-        OneHotOpCUDAFunctor<DeviceContext, T>(
-            in, out, depth, context.template device_context<DeviceContext>()));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    one_hot, ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h
deleted file mode 100644
index 0e2284941b41ffee66f0bebc8693a724b218de4a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/one_hot_op.h
+++ /dev/null
@@ -1,95 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename InT>
-struct OneHotOpFunctor {
-  const framework::LoDTensor* in_;
-  framework::LoDTensor* out_;
-  int depth_;
-  const DeviceContext& ctx_;
-  bool allow_out_of_range_;
-
-  OneHotOpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
-                  int depth, const DeviceContext& ctx,
-                  bool allow_out_of_range = false)
-      : in_(in),
-        out_(out),
-        depth_(depth),
-        ctx_(ctx),
-        allow_out_of_range_(allow_out_of_range) {}
-
-  template <typename OutT>
-  void apply() const {
-    auto* p_in_data = in_->data<InT>();
-    auto numel = in_->numel();
-    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
-    math::set_constant(ctx_, out_, 0.0);
-
-    if (allow_out_of_range_) {
-      for (int i = 0; i < numel; ++i) {
-        if (p_in_data[i] >= 0 && p_in_data[i] < depth_) {
-          *(p_out_data + i * depth_ + p_in_data[i]) = 1.0;
-        }
-      }
-    } else {
-      for (int i = 0; i < numel; ++i) {
-        PADDLE_ENFORCE_GE(p_in_data[i], 0,
-                          "Illegal index value, should be at least 0.");
-        PADDLE_ENFORCE_LT(
-            p_in_data[i], depth_,
-            "Illegal index value, should be less than depth (%d).", depth_);
-        *(p_out_data + i * depth_ + p_in_data[i]) = 1.0;
-      }
-    }
-  }
-};
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-template <typename DeviceContext, typename T>
-class OneHotKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int depth = context.Attr<int>("depth");
-    bool allow_out_of_range = context.Attr<bool>("allow_out_of_range");
-    if (context.HasInput("depth_tensor")) {
-      auto* depth_tensor = context.Input<Tensor>("depth_tensor");
-      auto* depth_data = depth_tensor->data<int32_t>();
-      depth = depth_data[0];
-      auto in_dims = in->dims();
-      framework::DDim out_dims(in_dims);
-      out_dims[out_dims.size() - 1] = depth;
-      out->Resize(out_dims);
-    }
-
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("dtype")),
-        OneHotOpFunctor<DeviceContext, T>(
-            in, out, depth, context.template device_context<DeviceContext>(),
-            allow_out_of_range));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc
deleted file mode 100644
index 7a75afca09cea13eb07749eb565ea880f8a5acf0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/one_hot_v2_op.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/one_hot_v2_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/framework.pb.h"
-
-namespace paddle {
-namespace operators {
-
-class OneHotV2Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of OneHotOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of OneHotOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dims.size(), 1,
-                      "Rank of Input(X) should be at least 1.");
-
-    int depth = ctx->Attrs().Get<int>("depth");
-    if (ctx->HasInput("depth_tensor")) {
-      depth = -1;
-    }
-
-    auto out_dims_vec = framework::vectorize(x_dims);
-    out_dims_vec.push_back(depth);
-    auto out_dims = framework::make_ddim(out_dims_vec);
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("X", /* --> */ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "depth_tensor") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class OneHotV2OpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, LoDTensor<int>) Input variable with rank at least 2. "
-             "The last dimension of X should be 1. Each value of X is an index "
-             "to indicate the position.");
-    AddInput("depth_tensor", "(Tensor, Tensor<int>), Length of one-hot vector")
-        .AsDispensable();
-    AddOutput("Out",
-              "(Tensor, Tensor<float>) Output tensor with same rank as X. "
-              "The tensor consists of one-hot representations of values in X.");
-
-    AddAttr<int>("depth",
-                 "A positive integer to specify the length of one-hot vector.")
-        .SetDefault(-1);
-    AddAttr<int>("dtype",
-                 "An integer to specify the data type of one-hot "
-                 "vector. The default value is FP32.")
-        .SetDefault(paddle::framework::proto::VarType::FP32);
-    AddAttr<bool>("allow_out_of_range",
-                  "If it is set true and the input data is out of range, "
-                  "the output tensor will be filled zeros. The default value "
-                  "is false.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-One Hot Operator. This operator creates the one-hot representations for input
-index values. The following example will help to explain the function of this
-operator:
-
-X is a LoDTensor:
-  X.lod = [[0, 1, 4]]
-  X.shape = [4]
-  X.data = [1, 1, 3, 0]
-
-set depth = 4
-
-Out is a LoDTensor:
-  Out.lod = [[0, 1, 4]]
-  Out.shape = [4, 4]
-  Out.data = [[0., 1., 0., 0.],
-              [0., 1., 0., 0.],
-              [0., 0., 0., 1.],
-              [1., 0., 0., 0.]]
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    one_hot_v2, ops::OneHotV2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::OneHotV2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu
deleted file mode 100644
index 2366f1422244e34deaf9ba019eba1fde2620f7ad..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/one_hot_v2_op.cu
+++ /dev/null
@@ -1,99 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/one_hot_v2_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename InT, typename OutT>
-__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data,
-                                 const int64_t numel, const int depth) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) {
-    *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0;
-  }
-}
-
-template <typename DeviceContext, typename InT>
-struct OneHotV2OpCUDAFunctor {
-  const framework::LoDTensor* in_;
-  framework::LoDTensor* out_;
-  const DeviceContext& ctx_;
-  int depth_;
-
-  OneHotV2OpCUDAFunctor(const framework::LoDTensor* in,
-                        framework::LoDTensor* out, int depth,
-                        const DeviceContext& ctx)
-      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
-
-  template <typename OutT>
-  void apply() const {
-    auto* p_in_data = in_->data<InT>();
-    auto numel = in_->numel();
-    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
-    auto stream = ctx_.stream();
-    math::set_constant(ctx_, out_, 0.0);
-
-    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        p_in_data, p_out_data, numel, depth_);
-  }
-};
-
-using LoDTensor = framework::LoDTensor;
-template <typename DeviceContext, typename T>
-class OneHotV2CUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-
-    int depth = -1;
-    if (context.HasInput("depth_tensor")) {
-      auto* depth_tensor = context.Input<framework::Tensor>("depth_tensor");
-      if (platform::is_gpu_place(depth_tensor->place())) {
-        framework::Tensor temp;
-        TensorCopySync(*depth_tensor, platform::CPUPlace(), &temp);
-        depth = *temp.data<int32_t>();
-      } else {
-        depth = *depth_tensor->data<int32_t>();
-      }
-
-      auto out_dims = out->dims();
-      out_dims[out_dims.size() - 1] = depth;
-      out->Resize(out_dims);
-    } else {
-      depth = context.Attr<int>("depth");
-    }
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("dtype")),
-        OneHotV2OpCUDAFunctor<DeviceContext, T>(
-            in, out, depth, context.template device_context<DeviceContext>()));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    one_hot_v2,
-    ops::OneHotV2CUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::OneHotV2CUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/one_hot_v2_op.h b/paddle/fluid/operators/one_hot_v2_op.h
deleted file mode 100644
index 7cfe2d61d17f32bce87e6428add0a9654dcba778..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/one_hot_v2_op.h
+++ /dev/null
@@ -1,94 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename InT>
-struct OneHotV2OpFunctor {
-  const framework::LoDTensor* in_;
-  framework::LoDTensor* out_;
-  int depth_;
-  const DeviceContext& ctx_;
-  bool allow_out_of_range_;
-
-  OneHotV2OpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
-                    int depth, const DeviceContext& ctx,
-                    bool allow_out_of_range = false)
-      : in_(in),
-        out_(out),
-        depth_(depth),
-        ctx_(ctx),
-        allow_out_of_range_(allow_out_of_range) {}
-
-  template <typename OutT>
-  void apply() const {
-    auto* p_in_data = in_->data<InT>();
-    auto numel = in_->numel();
-    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
-    math::set_constant(ctx_, out_, 0.0);
-
-    if (allow_out_of_range_) {
-      for (int i = 0; i < numel; ++i) {
-        if (p_in_data[i] >= 0 && p_in_data[i] < depth_) {
-          *(p_out_data + i * depth_ + p_in_data[i]) = 1.0;
-        }
-      }
-    } else {
-      for (int i = 0; i < numel; ++i) {
-        PADDLE_ENFORCE_GE(p_in_data[i], 0,
-                          "Illegal index value, should be at least 0.");
-        PADDLE_ENFORCE_LT(
-            p_in_data[i], depth_,
-            "Illegal index value, should be less than depth (%d).", depth_);
-        *(p_out_data + i * depth_ + p_in_data[i]) = 1.0;
-      }
-    }
-  }
-};
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-template <typename DeviceContext, typename T>
-class OneHotV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int depth = context.Attr<int>("depth");
-    bool allow_out_of_range = context.Attr<bool>("allow_out_of_range");
-    if (context.HasInput("depth_tensor")) {
-      auto* depth_tensor = context.Input<Tensor>("depth_tensor");
-      auto* depth_data = depth_tensor->data<int32_t>();
-      depth = depth_data[0];
-      auto out_dims = out->dims();
-      out_dims[out_dims.size() - 1] = depth;
-      out->Resize(out_dims);
-    }
-
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("dtype")),
-        OneHotV2OpFunctor<DeviceContext, T>(
-            in, out, depth, context.template device_context<DeviceContext>(),
-            allow_out_of_range));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/CMakeLists.txt b/paddle/fluid/operators/optimizers/CMakeLists.txt
deleted file mode 100644
index 5d468316e8eacb73c4a4ce81c784880bb5e46c2d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-include(operators)
-register_operators()
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc
deleted file mode 100644
index 01c0f1bb2d4778c3ba4980b9e7d4faef77901c0b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adadelta_op.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/adadelta_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class AdadeltaOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"),
-                   "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
-                   "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Grad").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AvgSquaredGradOut"),
-        "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AvgSquaredUpdateOut"),
-        "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.");
-
-    auto param_dim = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dim, ctx->GetInputDim("Grad"),
-        "param and grad input of AdadeltaOp should have same dimension");
-    PADDLE_ENFORCE_NE(framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
-                      "Maybe the Input variable AvgSquaredGrad has not "
-                      "been initialized. You may need to confirm if you put "
-                      "exe.run(startup_program) after optimizer.minimize "
-                      "function.");
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
-                      "Param and AvgSquaredGrad input of AdadeltaOp "
-                      "should have same dimension");
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
-                      "Param and AvgSquaredUpdate input of AdadeltaOp "
-                      "should have same dimension");
-
-    ctx->SetOutputDim("ParamOut", param_dim);
-    ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
-    ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param", "(Tensor) Input parameter");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
-    AddInput("AvgSquaredUpdate",
-             "(Tensor) Input average of squared parameter updates");
-
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-    AddOutput("AvgSquaredGradOut",
-              "(Tensor) Output average of squared gradient");
-    AddOutput("AvgSquaredUpdateOut",
-              "(Tensor) Output average of squared parameter updates");
-
-    AddAttr<float>("rho",
-                   "(float, default 0.95) Exponential decay rate "
-                   "for squared gradients.")
-        .SetDefault(0.95f);
-    AddAttr<float>("epsilon",
-                   "(float, default 1.0e-6) Constant for "
-                   "numerical stability")
-        .SetDefault(1.0e-6f);
-    AddComment(R"DOC(
-Adadelta Optimizer.
-
-Adadelta optimizer is implemented as explained in:
-https://arxiv.org/abs/1212.5701
-Adadelta is a per-dimension adaptive learning rate method used
-for gradient descent.
-
-Adadelta updates are as follows:
-
-$$
-avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\
-param\_update =  - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\
-avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\
-param\_out = param + param\_update
-$$
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cu b/paddle/fluid/operators/optimizers/adadelta_op.cu
deleted file mode 100644
index 562a157f063b44d65254d556d44439eee3636c4c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adadelta_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/optimizers/adadelta_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h
deleted file mode 100644
index 3f51bb0b3d6ddf41a08a64f254f76c88b60ced22..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adadelta_op.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class AdadeltaOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
-    const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Grad").front(),
-                   framework::ToTypeName(grad_var->Type()));
-
-    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
-    auto avg_squared_grad_out_tensor =
-        ctx.Output<framework::Tensor>("AvgSquaredGradOut");
-    auto avg_squared_update_out_tensor =
-        ctx.Output<framework::Tensor>("AvgSquaredUpdateOut");
-
-    param_out_tensor->mutable_data<T>(ctx.GetPlace());
-    avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
-    avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());
-
-    T rho = static_cast<T>(ctx.Attr<float>("rho"));
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-
-    auto param = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Param"));
-    auto grad = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Grad"));
-    // Squared gradient accumulator
-    auto avg_squared_grad = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("AvgSquaredGrad"));
-    // Squared updates accumulator
-    auto avg_squared_update = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("AvgSquaredUpdate"));
-    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
-    auto avg_squared_grad_out =
-        framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
-    auto avg_squared_update_out =
-        framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    avg_squared_grad_out.device(place) =
-        rho * avg_squared_grad + (1 - rho) * grad.square();
-    auto update =
-        -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon))
-             .sqrt() *
-        grad;
-    avg_squared_update_out.device(place) =
-        rho * avg_squared_update + (1 - rho) * update.square();
-    param_out.device(place) = param + update;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc
deleted file mode 100644
index 0310fe2eba8e9fcd02ac6c229f90a1d75ddea63e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adagrad_op.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/adagrad_op.h"
-#include <vector>
-
-#include <cmath>
-
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-class AdagradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment"),
-                   "Input(Moment) of AdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of AdagradOp should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(MomentOut) of AdagradOp should not be null.");
-
-    auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "LearningRate should have one element");
-    auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Grad"),
-        "Param and Grad input of AdagradOp should have the same dimension.");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment"),
-        "Param and Moment input of AdagradOp should have the same dimension.");
-
-    ctx->SetOutputDim("ParamOut", param_dims);
-    ctx->SetOutputDim("MomentOut", param_dims);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param", "(Tensor) Input parameter");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("Moment", "(Tensor) Second moment");
-    AddInput("LearningRate", "(Tensor) Learning rate");
-
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-    AddOutput("MomentOut", "(Tensor) Output second moment");
-
-    AddAttr<float>("epsilon",
-                   "(float, default 1.0e-6) "
-                   "Constant for numerical stability")
-        .SetDefault(1.0e-6f);
-    AddComment(R"DOC(
-
-Adaptive Gradient Algorithm (Adagrad).
-
-The update is done as follows:
-
-$$moment\_out = moment + grad * grad \\
-param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
-$$
-
-The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-does not have the epsilon attribute. It is added here in our implementation
-as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
-for numerical stability to avoid the division by zero error.
-
-)DOC");
-  }
-};
-
-namespace {
-size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
-  return std::find(rows.begin(), rows.end(), value) - rows.begin();
-}
-}  // namespace
-
-template <typename T>
-struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& grad,
-                  const framework::Tensor& learning_rate, T epsilon,
-                  framework::Tensor* moment, framework::Tensor* param) {
-    // 1. g_m.rows = set(g.rows)
-    auto grad_width = grad.value().dims()[1];
-    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
-    auto grad_merge = merge_func(context, grad);
-    auto& merge_rows = grad_merge.rows();
-    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
-
-    // 2. m += g_m * g_m
-    auto grad_square =
-        SquareSelectedRows<platform::CPUDeviceContext, T>(context, grad_merge);
-
-    math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
-    functor(context, grad_square, moment);
-
-    // 3. update parameter
-    auto* lr = learning_rate.data<T>();
-    auto* param_data = param->data<T>();
-    auto* moment_data = moment->data<T>();
-
-    for (size_t i = 0; i < merge_rows.size(); i++) {
-      for (int64_t j = 0; j < grad_width; j++) {
-        param_data[merge_rows[i] * grad_width + j] -=
-            lr[0] * grad_merge_data[i * grad_width + j] /
-            (std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon);
-      }
-    }
-  }
-};
-
-template struct SparseAdagradFunctor<platform::CPUDeviceContext, float>;
-template struct SparseAdagradFunctor<platform::CPUDeviceContext, double>;
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    adagrad, ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu
deleted file mode 100644
index 5043468d4c5f721ae0906b1a319eb3ec10b26580..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adagrad_op.cu
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/operators/optimizers/adagrad_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-namespace {
-
-template <typename T, int block_size>
-__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows,
-                                T* grad_merge, const int64_t* grad_merge_rows,
-                                size_t grad_merge_rows_size,
-                                int64_t row_numel) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-  __shared__ size_t grad_merge_idx;
-
-  if (tid == 0) {
-    for (size_t i = 0; i < grad_merge_rows_size; i++) {
-      if (grad_rows[ty] == grad_merge_rows[i]) {
-        grad_merge_idx = i;
-      }
-    }
-  }
-
-  __syncthreads();
-
-  grad += ty * row_numel;
-  grad_merge += grad_merge_idx * row_numel;
-  for (int index = tid; index < row_numel; index += block_size) {
-    paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]);
-  }
-}
-
-template <typename T, int block_size>
-__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
-                                           const T* learning_rate, T* param,
-                                           T* moment, int64_t row_numel,
-                                           T epsilon) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  grad += ty * row_numel;
-  param += rows[ty] * row_numel;
-  moment += rows[ty] * row_numel;
-
-  for (int index = tid; index < row_numel; index += block_size) {
-    // Since index in rows of SelectedRows can be duplicate, we have to use
-    // Atomic Operation to avoid concurrent write error.
-    paddle::platform::CudaAtomicAdd(param + index,
-                                    -1.0 * learning_rate[0] * grad[index] /
-                                        (sqrt(moment[index]) + epsilon));
-  }
-}
-}  // namespace
-
-template <typename T>
-struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::SelectedRows& grad,
-                  const framework::Tensor& learning_rate, T epsilon,
-                  framework::Tensor* moment, framework::Tensor* param) {
-    // 1. g_m.rows = set(g.rows)
-    auto grad_width = grad.value().dims()[1];
-    math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
-    auto grad_merge = merge_func(context, grad);
-    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
-    framework::Vector<int64_t> merge_rows(grad_merge.rows());
-    // 2. m += g_m * g_m
-    auto grad_square =
-        SquareSelectedRows<platform::CUDADeviceContext, T>(context, grad_merge);
-
-    math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
-    functor(context, grad_square, moment);
-
-    // 3. update parameter
-    auto* lr = learning_rate.data<T>();
-    auto* param_data = param->data<T>();
-    auto* moment_data = moment->data<T>();
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid2(1, merge_rows.size());
-    SparseAdagradFunctorKernel<
-        T, 256><<<grid2, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(
-        grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr,
-        param_data, moment_data, grad_width, epsilon);
-  }
-};
-
-template struct SparseAdagradFunctor<platform::CUDADeviceContext, float>;
-template struct SparseAdagradFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    adagrad, ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h
deleted file mode 100644
index 13455fc42cdc72a8ebfcac3dc0c94b79497d91f6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adagrad_op.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-struct SparseAdagradFunctor {
-  void operator()(const DeviceContext &context,
-                  const framework::SelectedRows &grad,
-                  const framework::Tensor &learning_rate, T epsilon,
-                  framework::Tensor *moment, framework::Tensor *param);
-};
-
-template <typename DeviceContext, typename T>
-framework::SelectedRows SquareSelectedRows(
-    const DeviceContext &context, const framework::SelectedRows &input) {
-  framework::SelectedRows out;
-  out.set_rows(input.rows());
-  out.set_height(input.height());
-  out.mutable_value()->mutable_data<T>(input.value().dims(),
-                                       context.GetPlace());
-  auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
-  auto e_in = framework::EigenVector<T>::Flatten(input.value());
-  e_out.device(*context.eigen_device()) = e_in.square();
-  return out;
-}
-
-template <typename DeviceContext, typename T>
-class AdagradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
-
-    auto *param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
-    auto *moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
-
-    param_out_tensor->mutable_data<T>(ctx.GetPlace());
-    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
-
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-
-    auto *grad_var = ctx.InputVar("Grad");
-    if (grad_var->IsType<framework::LoDTensor>()) {
-      auto param = framework::EigenVector<T>::Flatten(
-          *ctx.Input<framework::Tensor>("Param"));
-      auto grad = framework::EigenVector<T>::Flatten(
-          *ctx.Input<framework::Tensor>("Grad"));
-      auto moment = framework::EigenVector<T>::Flatten(
-          *ctx.Input<framework::Tensor>("Moment"));
-      auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-
-      auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
-      auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-      auto *place = ctx.template device_context<DeviceContext>().eigen_device();
-
-      moment_out.device(*place) = moment + grad * grad;
-      Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-      if (platform::is_cpu_place(ctx.GetPlace())) {
-        auto *lr = learning_rate->data<T>();
-        param_out.device(*place) =
-            param - lr[0] * grad / (moment_out.sqrt() + epsilon);
-      } else {
-        auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
-        param_out.device(*place) =
-            param -
-            lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
-      }
-    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      auto *param_tensor = ctx.Input<framework::Tensor>("Param");
-      PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
-
-      auto *moment_tensor = ctx.Input<framework::Tensor>("Moment");
-      PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
-
-      SparseAdagradFunctor<DeviceContext, T> functor;
-      functor(ctx.template device_context<DeviceContext>(),
-              *ctx.Input<framework::SelectedRows>("Grad"),
-              *ctx.Input<framework::Tensor>("LearningRate"), epsilon,
-              moment_out_tensor, param_out_tensor);
-    } else {
-      PADDLE_THROW("Unsupported Variable Type of Grad");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
deleted file mode 100644
index fc851e56cbfd2ab6780a3c812309bced2b693acd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/adam_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-void AdamOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Param"),
-                 "Input(Param) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                 "Input(Grad) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Moment1"),
-                 "Input(Moment1) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Moment2"),
-                 "Input(Moment2) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                 "Input(LearningRate) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
-                 "Input(Beta1Pow) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
-                 "Input(Beta2Pow) of AdamOp should not be null.");
-
-  PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                 "Output(ParamOut) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
-                 "Output(Moment1Out) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
-                 "Output(Moment2Out) of AdamOp should not be null.");
-
-  auto lr_dims = ctx->GetInputDim("LearningRate");
-  PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                    "Maybe the Input variable LearningRate has not "
-                    "been initialized. You may need to confirm "
-                    "if you put exe.run(startup_program) "
-                    "after optimizer.minimize function.");
-  PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                    "Learning rate should have 1 dimension");
-  auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
-  PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                    "Beta1 power accumulator should have 1 dimension");
-  auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-  PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
-                    "Beta2 power accumulator should have 1 dimension");
-
-  auto param_dims = ctx->GetInputDim("Param");
-  if (ctx->GetInputsVarType("Grad")[0] ==
-      framework::proto::VarType::LOD_TENSOR) {
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Grad"),
-        "Param and Grad input of AdamOp should have same dimension");
-  }
-  PADDLE_ENFORCE_EQ(
-      param_dims, ctx->GetInputDim("Moment1"),
-      "Param and Moment1 input of AdamOp should have same dimension");
-  PADDLE_ENFORCE_EQ(
-      param_dims, ctx->GetInputDim("Moment2"),
-      "Param and Moment2 input of AdamOp should have same dimension");
-
-  ctx->SetOutputDim("ParamOut", param_dims);
-  ctx->SetOutputDim("Moment1Out", param_dims);
-  ctx->SetOutputDim("Moment2Out", param_dims);
-}
-
-framework::OpKernelType AdamOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  auto input_data_type = ctx.Input<framework::Tensor>("Param")->type();
-  return framework::OpKernelType(input_data_type, ctx.GetPlace());
-}
-
-class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param", "(Tensor) Input parameter");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("LearningRate", "(Tensor) Learning rate");
-    AddInput("Moment1", "(Tensor) Input first moment");
-    AddInput("Moment2", "(Tensor) Input second moment");
-    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
-    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
-
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-    AddOutput("Moment1Out", "(Tensor) Output first moment");
-    AddOutput("Moment2Out", "(Tensor) Output second moment");
-
-    AddAttr<float>("beta1",
-                   "(float, default 0.9) "
-                   "Exponential decay rate for the "
-                   "first moment estimates.")
-        .SetDefault(0.9f);
-    AddAttr<float>("beta2",
-                   "(float, default 0.999) "
-                   "exponential decay rate for the "
-                   "second moment estimates.")
-        .SetDefault(0.999f);
-    AddAttr<float>("epsilon",
-                   "(float, default 1.0e-8) "
-                   "Constant for numerical stability")
-        .SetDefault(1.0e-8f);
-    AddAttr<bool>(
-        "lazy_mode",
-        "(bool, default false) "
-        "only update the parameter that has gradient in sparse update")
-        .SetDefault(false);
-    AddAttr<int64_t>("min_row_size_to_use_multithread",
-                     "(int64_t, default 0) "
-                     "when not zero, if param row size is larger then "
-                     "min_row_size_to_use_multithread and "
-                     "inner_op_parallelism is larger then 0, sparse update "
-                     "will run in multithread mode")
-        .SetDefault(1000);
-
-    AddComment(R"DOC(
-Adam Optimizer.
-
-This implements the Adam optimizer from Section 2 of the Adam
-paper : https://arxiv.org/abs/1412.6980.
-Adam is a first-order gradient-based optimization method based on
-adaptive estimates of lower-order moments.
-
-Adam updates:
-
-$$
-moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
-moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
-learning\_rate = learning\_rate *
-                  \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
-param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
-$$
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    adam, ops::AdamOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AdamOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu
deleted file mode 100644
index 4eb2db717d45a730798eef48d3d10bce9d387c4b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adam_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/optimizers/adam_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    adam, ops::AdamOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AdamOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
deleted file mode 100644
index 1cc34f11d09e9ec1868249f20fcc1b189efb0589..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ /dev/null
@@ -1,566 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>  // for sqrt in CPU and CUDA
-#include <Eigen/Dense>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/algorithm.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-namespace scatter = paddle::operators::math::scatter;
-
-class AdamOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-struct GPUAdam;
-struct CPUAdam;
-
-template <typename T, typename Flavour>
-struct AdamFunctor;
-
-template <typename T>
-struct AdamFunctor<T, GPUAdam> {
-  T beta1_;
-  T beta2_;
-  T epsilon_;
-
-  const T* beta1_pow_;
-  const T* beta2_pow_;
-  const T* moment1_;
-  T* moment1_out_;
-  const T* moment2_;
-  T* moment2_out_;
-  const T* lr_;
-  const T* grad_;
-  const T* param_;
-  T* param_out_;
-
-  AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
-              const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2,
-              T* mom2_out, const T* lr, const T* grad, const T* param,
-              T* param_out)
-      : beta1_(beta1),
-        beta2_(beta2),
-        epsilon_(epsilon),
-        beta1_pow_(beta1_pow),
-        beta2_pow_(beta2_pow),
-        moment1_(mom1),
-        moment1_out_(mom1_out),
-        moment2_(mom2),
-        moment2_out_(mom2_out),
-        lr_(lr),
-        grad_(grad),
-        param_(param),
-        param_out_(param_out) {}
-
-  inline HOSTDEVICE void operator()(size_t i) const {
-    // Merge all memory access together.
-    T g = grad_[i];
-    T mom1 = moment1_[i];
-    T mom2 = moment2_[i];
-    T lr = *lr_;
-    T beta1_pow = *beta1_pow_;
-    T beta2_pow = *beta2_pow_;
-    T p = param_[i];
-
-    // Calculation
-    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-
-    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
-    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
-
-    // Write back to global memory
-    moment1_out_[i] = mom1;
-    moment2_out_[i] = mom2;
-    param_out_[i] = p;
-  }
-};
-
-template <typename T>
-struct AdamFunctor<T, CPUAdam> {
-  T beta1_;
-  T beta2_;
-  T epsilon_;
-
-  const T* beta1_pow_;
-  const T* beta2_pow_;
-  const T* moment1_;
-  T* moment1_out_;
-  const T* moment2_;
-  T* moment2_out_;
-  const T* lr_;
-  const T* grad_;
-  const T* param_;
-  T* param_out_;
-
-  AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
-              const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2,
-              T* mom2_out, const T* lr, const T* grad, const T* param,
-              T* param_out)
-      : beta1_(beta1),
-        beta2_(beta2),
-        epsilon_(epsilon),
-        beta1_pow_(beta1_pow),
-        beta2_pow_(beta2_pow),
-        moment1_(mom1),
-        moment1_out_(mom1_out),
-        moment2_(mom2),
-        moment2_out_(mom2_out),
-        lr_(lr),
-        grad_(grad),
-        param_(param),
-        param_out_(param_out) {}
-
-  void operator()(size_t numel) const {
-    Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>> g{
-        grad_, static_cast<Eigen::Index>(numel)};
-    Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>> mom1{
-        moment1_, static_cast<Eigen::Index>(numel)};
-    Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>> mom2{
-        moment2_, static_cast<Eigen::Index>(numel)};
-    Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>> param{
-        param_, static_cast<Eigen::Index>(numel)};
-
-    Eigen::Map<Eigen::Array<T, 1, Eigen::Dynamic>> param_out{
-        param_out_, static_cast<Eigen::Index>(numel)};
-    Eigen::Map<Eigen::Array<T, 1, Eigen::Dynamic>> moment1_out{
-        moment1_out_, static_cast<Eigen::Index>(numel)};
-    Eigen::Map<Eigen::Array<T, 1, Eigen::Dynamic>> moment2_out{
-        moment2_out_, static_cast<Eigen::Index>(numel)};
-
-    T lr = *lr_;
-    T beta1_pow = *beta1_pow_;
-    T beta2_pow = *beta2_pow_;
-
-    // Calculation
-    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-
-    moment1_out = beta1_ * mom1 + (1 - beta1_) * g;
-    moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g;
-    param_out = param - lr * (moment1_out / (moment2_out.sqrt() + epsilon_));
-  }
-};
-
-template <typename T, typename Flavour>
-struct SparseAdamFunctor;
-
-template <typename T>
-struct SparseAdamFunctor<T, GPUAdam> {
-  T beta1_;
-  T beta2_;
-  T epsilon_;
-
-  const T* beta1_pow_;
-  const T* beta2_pow_;
-  const T* moment1_;
-  T* moment1_out_;
-  const T* moment2_;
-  T* moment2_out_;
-  const T* lr_;
-  const T* grad_;
-  const T* param_;
-  T* param_out_;
-
-  const int64_t* rows_;
-  int64_t row_numel_;
-  int64_t row_count_;
-  bool lazy_mode_;
-
-  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
-                    const T* beta2_pow, const T* mom1, T* mom1_out,
-                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
-                    const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel, int64_t row_count, bool lazy_mode)
-      : beta1_(beta1),
-        beta2_(beta2),
-        epsilon_(epsilon),
-        beta1_pow_(beta1_pow),
-        beta2_pow_(beta2_pow),
-        moment1_(mom1),
-        moment1_out_(mom1_out),
-        moment2_(mom2),
-        moment2_out_(mom2_out),
-        lr_(lr),
-        grad_(grad),
-        param_(param),
-        param_out_(param_out),
-        rows_(rows),
-        row_numel_(row_numel),
-        row_count_(row_count),
-        lazy_mode_(lazy_mode) {}
-
-  inline HOSTDEVICE void adam_update(size_t i, T g) const {
-    // The following code is the same as dense
-    T mom1 = moment1_[i];
-    T mom2 = moment2_[i];
-    T lr = *lr_;
-    T beta1_pow = *beta1_pow_;
-    T beta2_pow = *beta2_pow_;
-    T p = param_[i];
-
-    // Calculation
-    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-
-    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
-    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
-
-    // Write back to global memory
-    moment1_out_[i] = mom1;
-    moment2_out_[i] = mom2;
-    param_out_[i] = p;
-  }
-
-  inline HOSTDEVICE void operator()(size_t i) const {
-    auto row_idx =
-        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
-    if (lazy_mode_ && row_idx < 0) {
-      return;
-    } else {
-      T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
-      adam_update(i, g);
-    }
-  }
-};
-
-template <typename T>
-struct SparseAdamFunctor<T, CPUAdam> {
-  T beta1_;
-  T beta2_;
-  T epsilon_;
-
-  const T* beta1_pow_;
-  const T* beta2_pow_;
-  const T* moment1_;
-  T* moment1_out_;
-  const T* moment2_;
-  T* moment2_out_;
-  const T* lr_;
-  const T* grad_;
-  const T* param_;
-  T* param_out_;
-
-  const int64_t* rows_;
-  int64_t row_numel_;
-  int64_t row_count_;
-
-  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
-                    const T* beta2_pow, const T* mom1, T* mom1_out,
-                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
-                    const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel, int64_t row_count, bool lazy_mode)
-      : beta1_(beta1),
-        beta2_(beta2),
-        epsilon_(epsilon),
-        beta1_pow_(beta1_pow),
-        beta2_pow_(beta2_pow),
-        moment1_(mom1),
-        moment1_out_(mom1_out),
-        moment2_(mom2),
-        moment2_out_(mom2_out),
-        lr_(lr),
-        grad_(grad),
-        param_(param),
-        param_out_(param_out),
-        rows_(rows),
-        row_numel_(row_numel),
-        row_count_(row_count) {}
-
-  inline HOSTDEVICE void adam_update(size_t i, T g) const {
-    // The following code is the same as dense
-    T mom1 = moment1_[i];
-    T mom2 = moment2_[i];
-    T lr = *lr_;
-    T beta1_pow = *beta1_pow_;
-    T beta2_pow = *beta2_pow_;
-    T p = param_[i];
-
-    // Calculation
-    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-
-    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
-    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
-
-    // Write back to global memory
-    moment1_out_[i] = mom1;
-    moment2_out_[i] = mom2;
-    param_out_[i] = p;
-  }
-
-  inline void operator()(size_t numel) const {
-    // lr could be reuse
-    T lr = *lr_;
-    T beta1_pow = *beta1_pow_;
-    T beta2_pow = *beta2_pow_;
-    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-    int64_t row_count = static_cast<int64_t>(numel / row_numel_);
-
-    for (int64_t i = 0, j = 0; i != row_count; ++i) {
-      if (i == *(rows_ + j)) {
-        for (int64_t k = 0; k != row_numel_; ++k) {
-          T g = grad_[j * row_numel_ + k];
-          adam_update(i * row_numel_ + k, g);
-        }
-        ++j;
-      } else {
-        for (int64_t k = 0; k != row_numel_; ++k) {
-          T mom1 = moment1_[i * row_numel_ + k];
-          T mom2 = moment2_[i * row_numel_ + k];
-          T p = param_[i * row_numel_ + k];
-
-          mom1 = beta1_ * mom1;
-          mom2 = beta2_ * mom2;
-
-          p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
-          // Write back to global memory
-          moment1_out_[i * row_numel_ + k] = mom1;
-          moment2_out_[i * row_numel_ + k] = mom2;
-          param_out_[i * row_numel_ + k] = p;
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class AdamOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
-
-    using paddle::framework::LoDTensor;
-    using paddle::operators::detail::Ref;
-
-    int64_t min_row_size_to_use_multithread =
-        ctx.Attr<int64_t>("min_row_size_to_use_multithread");
-    bool lazy_mode = ctx.Attr<bool>("lazy_mode");
-    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
-    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-    auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
-    // auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
-    auto* grad_var = ctx.InputVar("Grad");
-    auto& mom1 = Ref(ctx.Input<LoDTensor>("Moment1"), "Must set Moment1");
-    auto& mom2 = Ref(ctx.Input<LoDTensor>("Moment2"), "Must set Moment2");
-    auto& lr =
-        Ref(ctx.Input<LoDTensor>("LearningRate"), "Must set LearningRate");
-
-    auto& beta1_pow =
-        Ref(ctx.Input<LoDTensor>("Beta1Pow"), "Must set Beta1Pow");
-    auto& beta2_pow =
-        Ref(ctx.Input<LoDTensor>("Beta2Pow"), "Must set Beta2Pow");
-
-    auto& param_out =
-        Ref(ctx.Output<LoDTensor>("ParamOut"), "Must set ParamOut");
-    auto& mom1_out =
-        Ref(ctx.Output<LoDTensor>("Moment1Out"), "Must set Moment1Out");
-    auto& mom2_out =
-        Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");
-
-    if (grad_var->IsType<framework::LoDTensor>()) {
-      auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
-
-      if (platform::is_cpu_place(ctx.GetPlace())) {
-        AdamFunctor<T, CPUAdam> functor(
-            beta1, beta2, epsilon, beta1_pow.template data<T>(),
-            beta2_pow.template data<T>(), mom1.template data<T>(),
-            mom1_out.template mutable_data<T>(ctx.GetPlace()),
-            mom2.template data<T>(),
-            mom2_out.template mutable_data<T>(ctx.GetPlace()),
-            lr.template data<T>(), grad.template data<T>(),
-            param.template data<T>(),
-            param_out.template mutable_data<T>(ctx.GetPlace()));
-        functor(param.numel());
-      } else if (platform::is_gpu_place(ctx.GetPlace())) {
-        AdamFunctor<T, GPUAdam> functor(
-            beta1, beta2, epsilon, beta1_pow.template data<T>(),
-            beta2_pow.template data<T>(), mom1.template data<T>(),
-            mom1_out.template mutable_data<T>(ctx.GetPlace()),
-            mom2.template data<T>(),
-            mom2_out.template mutable_data<T>(ctx.GetPlace()),
-            lr.template data<T>(), grad.template data<T>(),
-            param.template data<T>(),
-            param_out.template mutable_data<T>(ctx.GetPlace()));
-
-        platform::ForRange<DeviceContext> for_range(
-            static_cast<const DeviceContext&>(ctx.device_context()),
-            param.numel());
-        for_range(functor);
-      }
-    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      auto& grad =
-          Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
-      if (grad.rows().size() == 0) {
-        VLOG(3) << "grad row size is 0!!";
-        return;
-      }
-
-      std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
-      bool is_strict_sorted = true;
-      for (size_t i = 1; i < cpu_rows.size(); ++i) {
-        if (cpu_rows[i - 1] >= cpu_rows[i]) {
-          is_strict_sorted = false;
-          break;
-        }
-      }
-
-      framework::SelectedRows tmp_grad_merge;
-      const framework::SelectedRows* grad_merge_ptr;
-      if (is_strict_sorted) {
-        grad_merge_ptr = &grad;
-      } else {
-        // merge duplicated rows if any.
-        // The rows of grad_merge have been sorted inside MergeAdd functor
-        scatter::MergeAdd<DeviceContext, T> merge_func;
-        merge_func(ctx.template device_context<DeviceContext>(), grad,
-                   &tmp_grad_merge, true);
-        grad_merge_ptr = &tmp_grad_merge;
-      }
-
-      auto& grad_merge = *grad_merge_ptr;
-      auto& grad_tensor = grad_merge.value();
-      const T* grad_data = grad_tensor.template data<T>();
-      const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
-      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
-
-      if (platform::is_cpu_place(ctx.GetPlace())) {
-        SparseAdamFunctor<T, CPUAdam> functor(
-            beta1, beta2, epsilon, beta1_pow.template data<T>(),
-            beta2_pow.template data<T>(), mom1.template data<T>(),
-            mom1_out.template mutable_data<T>(ctx.GetPlace()),
-            mom2.template data<T>(),
-            mom2_out.template mutable_data<T>(ctx.GetPlace()),
-            lr.template data<T>(), grad_data, param.template data<T>(),
-            param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
-            grad_merge.rows().size(), lazy_mode);
-        if (lazy_mode) {
-          VLOG(3) << "run cpu lazy mode";
-          size_t row_count = grad_merge.rows().size();
-          std::vector<int64_t> cpu_rows(grad_merge.rows());
-          for (size_t row_index = 0; row_index < row_count; ++row_index) {
-            for (size_t offset = 0; offset < row_numel; ++offset) {
-              size_t i = cpu_rows[row_index] * row_numel + offset;
-              functor.adam_update(i, grad_data[row_index * row_numel + offset]);
-            }
-          }
-        }
-#ifndef _WIN32
-        else if (FLAGS_inner_op_parallelism > 1 &&  // NOLINT
-                 min_row_size_to_use_multithread > 0 &&
-                 param.dims()[0] > min_row_size_to_use_multithread) {
-          VLOG(3) << "use multi thread, inner_op_parallelism="
-                  << FLAGS_inner_op_parallelism
-                  << " min_row_size_to_use_multithread="
-                  << min_row_size_to_use_multithread;
-          if (FLAGS_inner_op_parallelism > 10) {
-            VLOG(1) << "FLAGS_inner_op_parallelism "
-                    << FLAGS_inner_op_parallelism << " is two large!";
-          }
-          auto& grad_rows = grad_merge.rows();
-          std::unordered_map<size_t, int> row_id_to_grad_row_offset;
-          size_t param_row_count = param.numel() / row_numel;
-          if (param_row_count < 1000) {
-            VLOG(1) << "param_row_count should be larger then 1000 to use "
-                       "multi thread, currently "
-                    << param_row_count;
-          }
-          for (size_t i = 0; i < grad_rows.size(); ++i) {
-            row_id_to_grad_row_offset[grad_rows[i]] = i;
-          }
-          std::vector<std::future<void>> fs;
-          int64_t line_in_each_thread =
-              param_row_count / FLAGS_inner_op_parallelism + 1;
-          for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
-            int64_t start = i * line_in_each_thread;
-            int64_t end = (i + 1) * line_in_each_thread;
-            if (start >= static_cast<int64_t>(param_row_count)) {
-              break;
-            }
-            if (end > static_cast<int64_t>(param_row_count)) {
-              end = static_cast<int64_t>(param_row_count);
-            }
-            fs.push_back(
-                framework::Async([&functor, &row_id_to_grad_row_offset,
-                                  &grad_data, row_numel, start, end]() {
-                  for (int64_t row_id = start; row_id < end; ++row_id) {
-                    auto iter = row_id_to_grad_row_offset.find(row_id);
-                    if (iter != row_id_to_grad_row_offset.end()) {
-                      for (size_t row_offset = 0U; row_offset < row_numel;
-                           ++row_offset) {
-                        functor.adam_update(
-                            row_id * row_numel + row_offset,
-                            grad_data[iter->second * row_numel + row_offset]);
-                      }
-                    } else {
-                      for (size_t row_offset = 0U; row_offset < row_numel;
-                           ++row_offset) {
-                        functor.adam_update(row_id * row_numel + row_offset, 0);
-                      }
-                    }
-                  }
-                }));
-          }
-          for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
-        }
-#endif          // !_WIN32
-        else {  // NOLINT
-          functor(param.numel());
-        }
-      } else if (platform::is_gpu_place(ctx.GetPlace())) {
-        SparseAdamFunctor<T, GPUAdam> functor(
-            beta1, beta2, epsilon, beta1_pow.template data<T>(),
-            beta2_pow.template data<T>(), mom1.template data<T>(),
-            mom1_out.template mutable_data<T>(ctx.GetPlace()),
-            mom2.template data<T>(),
-            mom2_out.template mutable_data<T>(ctx.GetPlace()),
-            lr.template data<T>(), grad_data, param.template data<T>(),
-            param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
-            grad_merge.rows().size(), lazy_mode);
-
-        // FIXME(minqiyang): remove BinarySearch in GPU later
-        platform::ForRange<DeviceContext> for_range(
-            static_cast<const DeviceContext&>(ctx.device_context()),
-            param.numel());
-        for_range(functor);
-      }
-    } else {
-      PADDLE_THROW("Variable type not supported by adam_op");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc
deleted file mode 100644
index a0152906235cbc8a870a05da990409f661338f6e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adamax_op.cc
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/adamax_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-class AdamaxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment"),
-                   "Input(Moment) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("InfNorm"),
-                   "Input(InfNorm) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
-                   "Input(Beta1Pow) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Grad").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(MomentOut) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"),
-                   "Output(InfNormOut) of AdamaxOp should not be null.");
-
-    auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 dimension");
-    auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
-    auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Grad"),
-        "Param and Grad input of AdamaxOp should have same dimension");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment"),
-        "Param and Moment input of AdamaxOp should have same dimension");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("InfNorm"),
-        "Param and InfNorm input of AdamaxOp should have same dimension");
-
-    ctx->SetOutputDim("ParamOut", param_dims);
-    ctx->SetOutputDim("MomentOut", param_dims);
-    ctx->SetOutputDim("InfNormOut", param_dims);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param", "(Tensor) Input parameter");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("LearningRate", "(Tensor) Learning rate");
-    AddInput("Moment", "(Tensor) First moment");
-    AddInput("InfNorm",
-             "(Tensor) "
-             "Input exponentially weighted infinity norm");
-    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
-
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-    AddOutput("MomentOut", "(Tensor) Output first moment");
-    AddOutput("InfNormOut",
-              "(Tensor) "
-              "Output exponentially weighted infinity norm");
-
-    AddAttr<float>("beta1",
-                   "(float, default 0.9) "
-                   "Exponential decay rate for the "
-                   "1st moment estimates.")
-        .SetDefault(0.9f);
-    AddAttr<float>("beta2",
-                   "(float, default 0.999) "
-                   "exponential decay rate for the weighted "
-                   "infinity norm estimates.")
-        .SetDefault(0.999f);
-    AddAttr<float>("epsilon",
-                   "(float, default 1.0e-8) "
-                   "Constant for numerical stability")
-        .SetDefault(1.0e-8f);
-    AddComment(R"DOC(
-Adamax Optimizer.
-
-We implement the Adamax optimizer from Section 7 of the Adam
-paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
-Adam algorithm based on the infinity norm.
-
-Adamax updates:
-
-$$
-moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\
-inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\
-learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\
-param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out}
-$$
-
-The original paper does not have an epsilon attribute.
-However, it is added here for numerical stability to prevent the
-division by 0 error.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    adamax, ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/adamax_op.cu b/paddle/fluid/operators/optimizers/adamax_op.cu
deleted file mode 100644
index 80e0219d4414db2909b5babc22599d8c0d906c7d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adamax_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/optimizers/adamax_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    adamax, ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h
deleted file mode 100644
index 55d25ecbddf175c0c9ba2c68ef2f6c7b83dcf32e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adamax_op.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class AdamaxOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
-    const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Grad").front(),
-                   framework::ToTypeName(grad_var->Type()));
-
-    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
-    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
-    auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
-
-    param_out_tensor->mutable_data<T>(ctx.GetPlace());
-    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
-    inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
-
-    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
-    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-
-    auto param = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Param"));
-    auto grad = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Grad"));
-    auto moment = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Moment"));
-    auto inf_norm = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("InfNorm"));
-    auto lr = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("LearningRate"));
-    auto beta1_pow = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Beta1Pow"));
-    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
-    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-    auto inf_norm_out =
-        framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
-    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-
-    moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad;
-    inf_norm_out.device(*place) =
-        grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
-    auto lr_t = lr / (1 - beta1_pow);
-    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-    param_out.device(*place) =
-        param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
deleted file mode 100644
index b44a84ccf71b574663ba5e425c4537d3769fdffe..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-class DecayedAdagradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment"),
-                   "Input(Moment) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("LearningRate"),
-        "Input(LearningRate) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Grad").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(MomentOut) of DecayedAdagradOp should not be null.");
-
-    auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "LearningRate should have one element");
-    auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"),
-                      "Param and Grad input of DecayedAdagradOp should have "
-                      "the same dimension.");
-    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"),
-                      "Param and Moment input of DecayedAdagradOp should have "
-                      "the same dimension.");
-
-    ctx->SetOutputDim("ParamOut", param_dims);
-    ctx->SetOutputDim("MomentOut", param_dims);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param", "(Tensor) Input parameter");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("Moment", "(Tensor) Second moment");
-    AddInput("LearningRate", "(Tensor) Learning rate");
-
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-    AddOutput("MomentOut", "(Tensor) Output second moment");
-
-    AddAttr<float>("decay",
-                   "(float, default 0.95) "
-                   "Discounting factor for coming gradient")
-        .SetDefault(0.95);
-    AddAttr<float>("epsilon",
-                   "(float, default 1.0e-6) "
-                   "Constant for numerical stability")
-        .SetDefault(1.0e-6f);
-    AddComment(R"DOC(
-Decayed Adagrad Optimizer.
-
-The update is done as follows:
-
-$$
-moment\_out = decay * moment + (1 - decay) * grad * grad \\
-param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + epsilon}
-$$
-
-The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-does not have an epsilon attribute. It is added here for numerical
-stability to avoid the division by zero error.
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp,
-                             ops::DecayedAdagradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    decayed_adagrad,
-    ops::DecayedAdagradOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
deleted file mode 100644
index dc568802a2b19fee5c8d7fd8d07c929cba8ab4e3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    decayed_adagrad,
-    ops::DecayedAdagradOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
deleted file mode 100644
index 4abd436927707f1a18039c9104a92b2a0bf3c982..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class DecayedAdagradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
-    const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Grad").front(),
-                   framework::ToTypeName(grad_var->Type()));
-
-    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
-    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
-
-    param_out_tensor->mutable_data<T>(ctx.GetPlace());
-    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
-
-    float decay = ctx.Attr<float>("decay");
-    float epsilon = ctx.Attr<float>("epsilon");
-
-    auto param = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Param"));
-    auto grad = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Grad"));
-    auto moment = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Moment"));
-    auto lr = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("LearningRate"));
-
-    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
-    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    moment_out.device(place) = decay * moment + (1 - decay) * grad * grad;
-    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-    param_out.device(place) =
-        param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc
deleted file mode 100644
index f263e67593bbd15f062648e5f09627d5fea64f0d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/dpsgd_op.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/dpsgd_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-class DpsgdOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
-                      "Input(Param) of DpsgdOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
-                      "Input(Grad) of DpsgdOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
-                      "Input(LearningRate) of DpsgdOp should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Param").front(),
-        framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Grad").front(),
-        framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
-                      "Output(ParamOut) of DpsgdOp should not be null.");
-
-    auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 dimension");
-    auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Grad"),
-        "Param and Grad input of DpsgdOp should have same dimension");
-
-    ctx->SetOutputDim("ParamOut", param_dims);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class DpsgdOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param", "(Tensor) Input parameter");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("LearningRate", "(Tensor) Learning rate");
-
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-
-    AddAttr<float>("clip",
-                   "(float, default 0.9) "
-                   "Exponential decay rate for the "
-                   "1st moment estimates.")
-        .SetDefault(10.0f);
-    AddAttr<float>("batch_size",
-                   "(float, default 0.999) "
-                   "exponential decay rate for the weighted "
-                   "infinity norm estimates.")
-        .SetDefault(16.0f);
-    AddAttr<float>("sigma",
-                   "(float, default 1.0e-8) "
-                   "Constant for numerical stability")
-        .SetDefault(1.0f);
-    AddComment(R"DOC(
-Dpsgd Optimizer.
-
-We implement the Dpsgd optimizer according to CCS16 paper - 
-Deep Learning with Differential Privacy.
-
-Dpsgd updates:
-CCS16 - Deep Learning with Differential Privacy.
-[https://arxiv.org/abs/1607.00133]
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(dpsgd, ops::DpsgdOp, ops::DpsgdOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    dpsgd, ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h
deleted file mode 100644
index 4eba7fed7e98cdb2065ed8245eca898388f23d0f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/dpsgd_op.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include <stdlib.h>
-#include <iostream>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class DpsgdOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
-                      "The Var(%s)'s type should be LoDTensor, "
-                      "but the received is %s",
-                      ctx.Inputs("Param").front(),
-                      framework::ToTypeName(param_var->Type()));
-
-    const auto *grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
-                      "The Var(%s)'s type should be LoDTensor, "
-                      "but the received is %s",
-                      ctx.Inputs("Grad").front(),
-                      framework::ToTypeName(grad_var->Type()));
-
-    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-
-    const auto *param = ctx.Input<framework::Tensor>("Param");
-    const auto *grad = ctx.Input<framework::Tensor>("Grad");
-
-    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
-
-    auto sz = param_out->numel();
-    PADDLE_ENFORCE_EQ(param->numel(), sz);
-    PADDLE_ENFORCE_EQ(grad->numel(), sz);
-
-    const T *lr = learning_rate->data<T>();
-    const T *param_data = param->data<T>();
-    const T *grad_data = grad->data<T>();
-
-    T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
-
-    T clip = static_cast<T>(ctx.Attr<float>("clip"));
-    T batch_size = static_cast<T>(ctx.Attr<float>("batch_size"));
-    T sigma = static_cast<T>(ctx.Attr<float>("sigma"));
-
-    // compute clipping
-    float l2_norm = 0.0;
-    for (int64_t i = 0; i < grad->numel(); ++i) {
-      l2_norm = l2_norm + grad_data[i] * grad_data[i];
-    }
-    l2_norm = std::sqrt(l2_norm);
-
-    float scale = 1.0;
-    if (l2_norm > clip) {
-      scale = l2_norm / clip;
-    }
-
-    // generate gaussian noise.
-    // [https://en.wikipedia.org/wiki/Box-Muller_transform]
-    float V1, V2, S;
-    float X;
-    float mu = 0.0;
-    float U1, U2;
-    unsigned seed = (unsigned int)(time(NULL));
-    std::minstd_rand engine;
-    engine.seed(seed);
-    std::uniform_real_distribution<T> dist(0.0, 1.0);
-    do {
-      // srand((unsigned int)(time(NULL)));
-      // U1 = (rand() * 1.0) / RAND_MAX;
-      // U2 = (rand() * 1.0) / RAND_MAX;
-      // U1 = rand_rr(&seed) * (1.0 / RAND_MAX);
-      // U2 = rand_rr(&seed) * (1.0 / RAND_MAX);
-      U1 = dist(engine);
-      U2 = dist(engine);
-      V1 = 2 * U1 - 1;
-      V2 = 2 * U2 - 1;
-      S = V1 * V1 + V2 * V2;
-    } while (S >= 1 || S == 0);
-
-    X = V1 * sqrt(-2 * log(S) / S);
-
-    float gaussian_noise = mu + X * sigma;
-
-    // update parameters
-    for (int64_t i = 0; i < grad->numel(); ++i) {
-      out_data[i] =
-          param_data[i] -
-          lr[0] * (grad_data[i] / scale + gaussian_noise / batch_size);
-    }
-    // CCS16 - Deep Learning with Differential Privacy.
-    // [https://arxiv.org/abs/1607.00133]
-  }  // Compute
-};   // class
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc
deleted file mode 100644
index 98b71175624e77bf3ea1d402b9ab13c84d93c8a5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/ftrl_op.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/ftrl_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-class FTRLOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of FTRL should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("SquaredAccumulator"),
-                   "Input(SquaredAccumulator) of FTRL should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LinearAccumulator"),
-                   "Input(LinearAccumulator) of FTRL should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of FTRL should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of FTRL should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Grad").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of FTRL should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("SquaredAccumOut"),
-                   "Output(SquaredAccumOut) of FTRL should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("LinearAccumOut"),
-                   "Output(LinearAccumOut) of FTRL should not be null.");
-
-    auto param_dim = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
-                      "Two input of FTRL Op's dimension must be same.");
-
-    auto lr_dim = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_NE(framework::product(lr_dim), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
-                      "Learning Rate should be a scalar.");
-
-    ctx->SetOutputDim("ParamOut", param_dim);
-    ctx->SetOutputDim("SquaredAccumOut", param_dim);
-    ctx->SetOutputDim("LinearAccumOut", param_dim);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type = ctx.Input<Tensor>("Param")->type();
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
-class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param",
-             "(Tensor, default Tensor<float>) "
-             "Input parameter value that has to be updated.");
-    AddInput("SquaredAccumulator",
-             "(Tensor, default Tensor<float>) "
-             "Accumulator that accumulates squared gradients.");
-    AddInput("LinearAccumulator",
-             "(Tensor, default Tensor<float>) "
-             "Accumulator that accumulates linear gradients.");
-    AddInput("Grad",
-             "(Tensor, default Tensor<float>) "
-             "Input gradient of the parameter.");
-    AddInput("LearningRate",
-             "(Tensor, default Tensor<float>) "
-             "The learning rate should be a tensor of size 1.");
-
-    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
-    AddOutput("SquaredAccumOut",
-              "(Tensor) Output accumulated squared"
-              " gradients.");
-    AddOutput("LinearAccumOut",
-              "(Tensor) Output accumulated linear"
-              " gradients.");
-
-    AddAttr<float>("l1",
-                   "(float, default 0.0) "
-                   "L1 regularization strength.")
-        .SetDefault(0.0f);
-    AddAttr<float>("l2",
-                   "(float, default 0.0) "
-                   "L2 regularization strength.")
-        .SetDefault(0.0f);
-    AddAttr<float>("lr_power",
-                   "(float, default -0.5f) "
-                   "Learning Rate Power.")
-        .SetDefault(-0.5f);
-    AddComment(R"DOC(
-FTRL (Follow The Regularized Leader) Operator.
-
-Optimizer that implements the FTRL algorithm:
-
-$$
-new\_accum = squared\_accum + grad^2 \\
-if (lr\_power == -0.5) {
-   linear\_accum += grad - (\surd(new\_accum) - \surd(squared\_accum)) /
-                   (learning\_rate * param) \\
-} else {
-   linear\_accum += grad -
-                  (new\_accum^{-lr\_power} - accum^{-lr\_power}) /
-                  (learning\_rate * param) \\
-}
-
-x = (l1 * sign(linear\_accum) - linear\_accum)
-if (lr\_power == -0.5) {
-   y = \frac{\surd(new\_accum)}{learning\_rate} + (2 * l2) \\
-   pre\_shrink = \frac{x}{y} \\
-   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\
-} else {
-   y = \frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) \\
-   pre\_shrink = \frac{x}{y} \\
-   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\
-}
-squared\_accum += grad^2;
-$$
-
-The paper that proposed Follow The Regularized Leader (FTRL):
-(https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    ftrl, ops::FTRLOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cu b/paddle/fluid/operators/optimizers/ftrl_op.cu
deleted file mode 100644
index acf8e38ca0f5a3cf9899f4898898013e8a2afdd2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/ftrl_op.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-You may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed
-under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. */
-#include "paddle/fluid/operators/optimizers/ftrl_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    ftrl, ops::FTRLOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h
deleted file mode 100644
index bbf34d8316b09a78c334b0d79b132639be8af4f7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/ftrl_op.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class FTRLOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
-    const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Grad").front(),
-                   framework::ToTypeName(grad_var->Type()));
-
-    auto* param_out = ctx.Output<Tensor>("ParamOut");
-    auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
-    auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut");
-
-    param_out->mutable_data<T>(ctx.GetPlace());
-    sq_accum_out->mutable_data<T>(ctx.GetPlace());
-    lin_accum_out->mutable_data<T>(ctx.GetPlace());
-
-    auto grad = ctx.Input<Tensor>("Grad");
-
-    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
-    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
-    auto lr_power = static_cast<T>(ctx.Attr<float>("lr_power"));
-
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto sq_accum =
-        EigenVector<T>::Flatten(*ctx.Input<Tensor>("SquaredAccumulator"));
-    auto lin_accum =
-        EigenVector<T>::Flatten(*ctx.Input<Tensor>("LinearAccumulator"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
-
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto s_acc_out = EigenVector<T>::Flatten(*sq_accum_out);
-    auto l_acc_out = EigenVector<T>::Flatten(*lin_accum_out);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
-
-    auto new_accum = sq_accum + g * g;
-    // Special case for lr_power = -0.5
-    if (lr_power == static_cast<T>(-0.5)) {
-      l_acc_out.device(place) =
-          lin_accum + g -
-          ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p;
-    } else {
-      l_acc_out.device(place) =
-          lin_accum + g -
-          ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) /
-           lr.broadcast(grad_dsize)) *
-              p;
-    }
-
-    auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out);
-    if (lr_power == static_cast<T>(-0.5)) {
-      auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) +
-               l_acc_out.constant(static_cast<T>(2) * l2);
-      auto pre_shrink = x / y;
-      p_out.device(place) =
-          (l_acc_out.abs() > l_acc_out.constant(l1))
-              .select(pre_shrink, p.constant(static_cast<T>(0)));
-    } else {
-      auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) +
-               l_acc_out.constant(static_cast<T>(2) * l2);
-      auto pre_shrink = x / y;
-      p_out.device(place) =
-          (l_acc_out.abs() > l_acc_out.constant(l1))
-              .select(pre_shrink, p.constant(static_cast<T>(0)));
-    }
-
-    s_acc_out.device(place) = sq_accum + g * g;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc
deleted file mode 100644
index f8f97c56c4b823afcf919c34d7551332d93da0ba..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/lamb_op.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/lamb_op.h"
-#include "paddle/fluid/operators/optimizers/adam_op.h"
-
-namespace paddle {
-namespace operators {
-
-class LambOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param",
-             "(LoDTensor, default LoDTensor<float>) "
-             "Input parameter that has to be updated.");
-    AddInput("Grad",
-             "(LoDTensor, default LoDTensor<float>) "
-             "Input gradient of the parameter.");
-    AddInput("LearningRate", "(Tensor) Learning rate.");
-    AddInput("Moment1", "(Tensor) Input first moment.");
-    AddInput("Moment2", "(Tensor) Input second moment.");
-    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator.");
-    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator.");
-
-    AddOutput("ParamOut", "(Tensor) Output parameter.");
-    AddOutput("Moment1Out", "(Tensor) Output first moment.");
-    AddOutput("Moment2Out", "(Tensor) Output second moment.");
-    AddAttr<float>("weight_decay", "(float) Weight decay rate.");
-    AddAttr<float>("beta1",
-                   "(float, default 0.9) The exponential decay rate for the "
-                   "1st moment estimates.")
-        .SetDefault(0.9);
-    AddAttr<float>("beta2",
-                   "(float, default 0.999) The exponential decay rate for the "
-                   "2nd moment estimates.")
-        .SetDefault(0.999);
-    AddAttr<float>("epsilon",
-                   "(float, default 1.0e-6) "
-                   "Constant for numerical stability.")
-        .SetDefault(1.0e-6f);
-
-    AddComment(R"DOC(
-LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.
-
-LAMB Optimizer is designed to scale up the batch size of training without losing 
-accuracy, which supports adaptive element-wise updating and accurate layer-wise 
-correction. For more information, please refer to https://arxiv.org/abs/1904.00962.
-
-The updating of parameters follows:
-
-$$
-m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t \\
-
-v_t &= \beta_2 v_{t - 1}  + (1 - \beta_2)g_t^2 \\
-
-r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon} \\
-
-w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1})
-$$
-
-where $m$ is the 1st moment, and $v$ the 2nd moment, $\eta$ the 
-learning rate, $\lambda$ the weight decay rate.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(lamb, ops::AdamOp, ops::LambOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    lamb, ops::LambOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LambOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/lamb_op.cu b/paddle/fluid/operators/optimizers/lamb_op.cu
deleted file mode 100644
index 9ffb62926a4fffd95ca014947282a7a32e92e4b8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/lamb_op.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/lamb_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    lamb, ops::LambOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LambOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h
deleted file mode 100644
index 082235599015dfef272e5a830e41e225ecc13a66..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/lamb_op.h
+++ /dev/null
@@ -1,305 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>  // for sqrt in CPU and CUDA
-#include <Eigen/Dense>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/algorithm.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-namespace scatter = paddle::operators::math::scatter;
-
-template <typename T>
-struct LambMomentUpdateFunctor {
-  T weight_decay_;
-  T beta1_;
-  T beta2_;
-  T epsilon_;
-
-  const T* beta1_pow_;
-  const T* beta2_pow_;
-  const T* moment1_;
-  T* moment1_out_;
-  const T* moment2_;
-  T* moment2_out_;
-  const T* grad_;
-  const T* param_;
-  T* trust_ratio_div_;
-
-  LambMomentUpdateFunctor(T weight_decay, T beta1, T beta2, T epsilon,
-                          const T* beta1_pow, const T* beta2_pow, const T* mom1,
-                          T* mom1_out, const T* mom2, T* mom2_out,
-                          const T* grad, const T* param, T* trust_ratio_div)
-      : weight_decay_(weight_decay),
-        beta1_(beta1),
-        beta2_(beta2),
-        epsilon_(epsilon),
-        beta1_pow_(beta1_pow),
-        beta2_pow_(beta2_pow),
-        moment1_(mom1),
-        moment1_out_(mom1_out),
-        moment2_(mom2),
-        moment2_out_(mom2_out),
-        grad_(grad),
-        param_(param),
-        trust_ratio_div_(trust_ratio_div) {}
-
-  inline HOSTDEVICE void operator()(size_t i) const {
-    T g = grad_[i];
-    T mom1 = moment1_[i];
-    T mom2 = moment2_[i];
-    T p = param_[i];
-
-    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
-    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-
-    moment1_out_[i] = mom1;
-    moment2_out_[i] = mom2;
-    trust_ratio_div_[i] = mom1 / (sqrt(mom2) + epsilon_) + weight_decay_ * p;
-  }
-};
-
-template <typename T>
-struct SparseLambMomentUpdateFunctor {
-  T weight_decay_;
-  T beta1_;
-  T beta2_;
-  T epsilon_;
-
-  const T* beta1_pow_;
-  const T* beta2_pow_;
-  const T* moment1_;
-  T* moment1_out_;
-  const T* moment2_;
-  T* moment2_out_;
-  const T* grad_;
-  const T* param_;
-  T* trust_ratio_div_;
-
-  const int64_t* rows_;
-  int64_t row_numel_;
-  int64_t row_count_;
-
-  SparseLambMomentUpdateFunctor(T weight_decay, T beta1, T beta2, T epsilon,
-                                const T* beta1_pow, const T* beta2_pow,
-                                const T* mom1, T* mom1_out, const T* mom2,
-                                T* mom2_out, const T* grad, const T* param,
-                                T* trust_ratio_div, const int64_t* rows,
-                                int64_t row_numel, int64_t row_count)
-      : weight_decay_(weight_decay),
-        beta1_(beta1),
-        beta2_(beta2),
-        epsilon_(epsilon),
-        beta1_pow_(beta1_pow),
-        beta2_pow_(beta2_pow),
-        moment1_(mom1),
-        moment1_out_(mom1_out),
-        moment2_(mom2),
-        moment2_out_(mom2_out),
-        grad_(grad),
-        param_(param),
-        trust_ratio_div_(trust_ratio_div),
-        rows_(rows),
-        row_numel_(row_numel),
-        row_count_(row_count) {}
-
-  inline HOSTDEVICE void update(size_t i, T g) const {
-    // The following code is same as dense
-    T mom1 = moment1_[i];
-    T mom2 = moment2_[i];
-    T p = param_[i];
-
-    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
-    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-
-    moment1_out_[i] = mom1;
-    moment2_out_[i] = mom2;
-    trust_ratio_div_[i] = mom1 / (sqrt(mom2) + epsilon_) + weight_decay_ * p;
-  }
-
-  inline HOSTDEVICE void operator()(size_t i) const {
-    auto row_idx =
-        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
-    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
-    update(i, g);
-  }
-};
-
-template <typename T>
-struct LambParamUpateFunctor {
-  const T* lr_;
-  const T* param_;
-  const T* param_norm_;
-  const T* trust_ratio_div_;
-  const T* trust_ratio_div_norm_;
-  T* param_out_;
-
-  LambParamUpateFunctor(const T* lr, const T* param, const T* param_norm,
-                        const T* trust_ratio_div, const T* trust_ratio_div_norm,
-                        T* param_out)
-      : lr_(lr),
-        param_(param),
-        param_norm_(param_norm),
-        trust_ratio_div_(trust_ratio_div),
-        trust_ratio_div_norm_(trust_ratio_div_norm),
-        param_out_(param_out) {}
-
-  inline HOSTDEVICE void operator()(size_t i) const {
-    T lr = *lr_;
-    T p = *param_norm_;
-    T t = *trust_ratio_div_norm_;
-
-    T r = (p > 0 && t > 0) ? p / t : 1.0;
-    lr *= r;
-    param_out_[i] = param_[i] - lr * trust_ratio_div_[i];
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LambOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
-
-    using paddle::framework::LoDTensor;
-    using paddle::operators::detail::Ref;
-
-    T weight_decay = static_cast<T>(ctx.Attr<float>("weight_decay"));
-    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
-    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-    auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param.");
-    auto* grad_var = ctx.InputVar("Grad");
-    auto& mom1 = Ref(ctx.Input<LoDTensor>("Moment1"), "Must set Moment1.");
-    auto& mom2 = Ref(ctx.Input<LoDTensor>("Moment2"), "Must set Moment2.");
-    auto& lr =
-        Ref(ctx.Input<LoDTensor>("LearningRate"), "Must set LearningRate.");
-
-    auto& beta1_pow =
-        Ref(ctx.Input<LoDTensor>("Beta1Pow"), "Must set Beta1Pow.");
-    auto& beta2_pow =
-        Ref(ctx.Input<LoDTensor>("Beta2Pow"), "Must set Beta2Pow.");
-
-    auto& param_out =
-        Ref(ctx.Output<LoDTensor>("ParamOut"), "Must set ParamOut.");
-    auto& mom1_out =
-        Ref(ctx.Output<LoDTensor>("Moment1Out"), "Must set Moment1Out.");
-    auto& mom2_out =
-        Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out.");
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    platform::ForRange<DeviceContext> for_range(dev_ctx, param.numel());
-    framework::Tensor trust_ratio_div =
-        ctx.AllocateTmpTensor<T, DeviceContext>(param.dims(), dev_ctx);
-
-    // Update moments
-    if (grad_var->IsType<framework::LoDTensor>()) {
-      auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad.");
-
-      LambMomentUpdateFunctor<T> moment_update_functor(
-          weight_decay, beta1, beta2, epsilon, beta1_pow.template data<T>(),
-          beta2_pow.template data<T>(), mom1.template data<T>(),
-          mom1_out.template mutable_data<T>(ctx.GetPlace()),
-          mom2.template data<T>(),
-          mom2_out.template mutable_data<T>(ctx.GetPlace()),
-          grad.template data<T>(), param.template data<T>(),
-          trust_ratio_div.template data<T>());
-      for_range(moment_update_functor);
-    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      auto& grad =
-          Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad.");
-      if (grad.rows().size() == 0) {
-        VLOG(3) << "grad row size is 0!!";
-        return;
-      }
-
-      std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
-      bool is_strict_sorted = true;
-      for (size_t i = 1; i < cpu_rows.size(); ++i) {
-        if (cpu_rows[i - 1] >= cpu_rows[i]) {
-          is_strict_sorted = false;
-          break;
-        }
-      }
-
-      framework::SelectedRows tmp_grad_merge;
-      const framework::SelectedRows* grad_merge_ptr;
-      if (is_strict_sorted) {
-        grad_merge_ptr = &grad;
-      } else {
-        // merge duplicated rows if any.
-        // The rows of grad_merge have been sorted inside MergeAdd functor
-        scatter::MergeAdd<DeviceContext, T> merge_func;
-        merge_func(dev_ctx, grad, &tmp_grad_merge, true);
-        grad_merge_ptr = &tmp_grad_merge;
-      }
-
-      auto& grad_merge = *grad_merge_ptr;
-      auto& grad_tensor = grad_merge.value();
-      const T* grad_data = grad_tensor.template data<T>();
-      const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
-      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
-
-      SparseLambMomentUpdateFunctor<T> moment_update_functor(
-          weight_decay, beta1, beta2, epsilon, beta1_pow.template data<T>(),
-          beta2_pow.template data<T>(), mom1.template data<T>(),
-          mom1_out.template mutable_data<T>(ctx.GetPlace()),
-          mom2.template data<T>(),
-          mom2_out.template mutable_data<T>(ctx.GetPlace()), grad_data,
-          param.template data<T>(), trust_ratio_div.template data<T>(), rows,
-          row_numel, grad_merge.rows().size());
-      for_range(moment_update_functor);
-    } else {
-      PADDLE_THROW("Variable type not supported by lamb_op.");
-    }
-
-    // Update parameter
-    framework::Tensor p_norm_t =
-        ctx.AllocateTmpTensor<T, DeviceContext>({1}, dev_ctx);
-    framework::Tensor trust_ratio_div_norm_t =
-        ctx.AllocateTmpTensor<T, DeviceContext>({1}, dev_ctx);
-    auto p_norm = framework::EigenScalar<T>::From(p_norm_t);
-    auto trust_ratio_div_norm =
-        framework::EigenScalar<T>::From(trust_ratio_div_norm_t);
-
-    auto p = framework::EigenVector<T>::Flatten(param);
-    auto t = framework::EigenVector<T>::Flatten(trust_ratio_div);
-
-    auto* place = dev_ctx.eigen_device();
-    p_norm.device(*place) = p.square().sum().sqrt();
-    trust_ratio_div_norm.device(*place) = t.square().sum().sqrt();
-
-    LambParamUpateFunctor<T> param_update_functor(
-        lr.template data<T>(), param.template data<T>(),
-        p_norm_t.template data<T>(), trust_ratio_div.template data<T>(),
-        trust_ratio_div_norm_t.template data<T>(),
-        param_out.template mutable_data<T>(ctx.GetPlace()));
-    for_range(param_update_functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
deleted file mode 100644
index 126b665dd4d9301ae67346afa45a250accfec656..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
-#include "paddle/fluid/operators/optimizers/momentum_op.h"
-
-namespace paddle {
-namespace operators {
-
-class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param",
-             "(LoDTensor, default LoDTensor<float>) "
-             "Input parameter that has to be updated");
-    AddInput("Grad",
-             "(LoDTensor, default LoDTensor<float>) "
-             "Input gradient of the parameter");
-    AddInput("Velocity",
-             "(LoDTensor, default LoDTensor<float>) "
-             "Input velocity (corresponding to the parameter) "
-             "that has to be updated");
-    AddInput("LearningRate",
-             "(LoDTensor, default LoDTensor<float>) "
-             "Input learning rate");
-
-    AddOutput("ParamOut",
-              "(LoDTensor) This output is updated parameter. "
-              "It shared memory with Input(Param).");
-    AddOutput("VelocityOut",
-              "(LoDTensor) This output is updated velocity. "
-              "It shared memory with Input(Velocity).");
-
-    AddAttr<float>("mu", "(float) Momentum coefficient");
-    AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
-        .SetDefault(0.001);
-    AddAttr<float>("lars_weight_decay",
-                   "(float, default 0.0005) LARS weight decay")
-        .SetDefault(0.0005);
-
-    AddComment(R"DOC(
-Lars Momentum Optimizer.
-
-This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each
-weight using a local learning rate:
-
-$$
-local\_lr = \eta  *
-    \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
-velocity = mu * velocity +
-    local\_lr * (grad + \beta * param) \\
-param = param - velocity. \\
-$$
-
-Note that we use lars_weight_decay here to decay weights, you may need not to
-use L2 regularizers in case of using LARS.
-
-)DOC");
-  }
-};
-
-class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {}
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::LarsMomentumOpVarTypeInference);
-REGISTER_OP_CPU_KERNEL(lars_momentum, ops::LarsMomentumOpKernel<float>,
-                       ops::LarsMomentumOpKernel<double>);
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
deleted file mode 100644
index a277d6ff2bea917addac8c6ea4b24b63dcbc8dba..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
-                                   const T* learning_rate, const T mu,
-                                   const int64_t num, const T lars_coeff,
-                                   const T lars_weight_decay, const T* p_norm,
-                                   const T* g_norm, T* p_out, T* v_out) {
-  T lr = learning_rate[0];
-  T local_lr = learning_rate[0];
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    if (p_norm[0] > 0 && g_norm[0] > 0) {
-      local_lr = lr * lars_coeff * p_norm[0] /
-                 (g_norm[0] + lars_weight_decay * p_norm[0]);
-    }
-    T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
-    v_out[i] = v_new;
-    p_out[i] = p[i] - v_new;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
-    auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
-    auto param = ctx.Input<framework::LoDTensor>("Param");
-    auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
-    auto grad = ctx.Input<framework::LoDTensor>("Grad");
-    auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
-
-    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
-    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
-
-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    T lars_coeff = ctx.Attr<float>("lars_coeff");
-    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
-
-    auto* p = param->data<T>();
-    auto* v = velocity->data<T>();
-    auto* g = grad->data<T>();
-    auto* lr = learning_rate->data<T>();
-
-    int block = 512;
-    int grid = (param->numel() + block - 1) / block;
-
-    auto eigen_p = framework::EigenVector<T>::Flatten(*param);
-    auto eigen_g = framework::EigenVector<T>::Flatten(*grad);
-    // calculate norms using eigein and launch the kernel.
-    framework::Tensor p_norm_t, g_norm_t;
-    p_norm_t.Resize({1});
-    g_norm_t.Resize({1});
-    auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
-    auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
-    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
-    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
-
-    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-    ep_norm.device(*place) = eigen_p.square().sum().sqrt();
-    eg_norm.device(*place) = eigen_g.square().sum().sqrt();
-    MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
-        p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
-        p_norm_data, g_norm_data, p_out, v_out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    lars_momentum,
-    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h
deleted file mode 100644
index e0064c201825b1f074eb53c591dc3abdd7bc1e1b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class LarsMomentumOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
-    auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
-    auto param = ctx.Input<framework::LoDTensor>("Param");
-    auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
-    auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
-    auto* grad_var = ctx.InputVar("Grad");
-    // only support dense for now.
-    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true);
-    auto grad = ctx.Input<framework::LoDTensor>("Grad");
-
-    param_out->mutable_data<T>(ctx.GetPlace());
-    velocity_out->mutable_data<T>(ctx.GetPlace());
-
-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    T lars_coeff = ctx.Attr<float>("lars_coeff");
-    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
-
-    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
-    auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
-
-    auto p = framework::EigenVector<T>::Flatten(*param);
-    auto v = framework::EigenVector<T>::Flatten(*velocity);
-    auto g = framework::EigenVector<T>::Flatten(*grad);
-    auto* lr = learning_rate->data<T>();
-
-    framework::Tensor p_norm_t, g_norm_t;
-    p_norm_t.Resize({1});
-    g_norm_t.Resize({1});
-    p_norm_t.mutable_data<T>(ctx.GetPlace());
-    g_norm_t.mutable_data<T>(ctx.GetPlace());
-    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
-    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
-
-    ep_norm = p.square().sum().sqrt();
-    eg_norm = g.square().sum().sqrt();
-    T local_lr = lr[0];
-    if (ep_norm(0) > 0 && eg_norm(0) > 0) {
-      local_lr = lr[0] * lars_coeff * ep_norm(0) /
-                 (eg_norm(0) + lars_weight_decay * ep_norm(0));
-    }
-    v_out = v * mu + local_lr * (g + lars_weight_decay * p);
-    p_out = p - v_out;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc
deleted file mode 100644
index 7cf218c20f4c8a22aefc8cd8ce8e1cca36dee3bf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/momentum_op.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/momentum_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class MomentumOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto& input_var = ctx->Input("Param")[0];
-    for (auto& out_var : ctx->Output("ParamOut")) {
-      if (ctx->GetType(input_var) == framework::proto::VarType::SELECTED_ROWS) {
-        ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
-      } else if (ctx->GetType(input_var) ==
-                 framework::proto::VarType::LOD_TENSOR) {
-        ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR);
-      } else {
-        PADDLE_THROW(
-            "Only support LodTensor and SelectedRows, Unexpected Input Type.");
-      }
-    }
-  }
-};
-
-class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param",
-             "(Tensor, default Tensor<float>) "
-             "Input parameter that has to be updated");
-    AddInput("Grad",
-             "(Tensor, default Tensor<float>) "
-             "Input gradient of the parameter");
-    AddInput("Velocity",
-             "(Tensor, default Tensor<float>) "
-             "Input velocity (corresponding to the parameter) "
-             "that has to be updated");
-    AddInput("LearningRate",
-             "(Tensor, default Tensor<float>) "
-             "Input learning rate");
-
-    AddOutput("ParamOut",
-              "(Tensor) This output is updated parameter. "
-              "It shared memory with Input(Param).");
-    AddOutput("VelocityOut",
-              "(Tensor) This output is updated velocity. "
-              "It shared memory with Input(Velocity).");
-
-    AddAttr<float>("mu", "(float) Momentum coefficient");
-    AddAttr<bool>("use_nesterov",
-                  "(bool, default false) "
-                  "Use Nesterov Momentum")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Momentum Optimizer.
-
-This optimizer has a flag for Nestrov Momentum.
-The update equations are as follows:
-
-$$
-velocity = mu * velocity + gradient \\
-if (use\_nesterov):   \\
-  param = param - (gradient + mu * velocity) * learning\_rate \\
-else:   \\
-  param = param - learning\_rate * velocity. \\
-$$
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(momentum, ops::MomentumOp, ops::MomentumOpMaker,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::MomentumOpInferVarType);
-REGISTER_OP_CPU_KERNEL(
-    momentum, ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/momentum_op.cu b/paddle/fluid/operators/optimizers/momentum_op.cu
deleted file mode 100644
index 7f9e7246401bc3c765e539ac4395c4feef3c9508..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/momentum_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/optimizers/momentum_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MomentumOpKernel<paddle::platform::CUDADeviceContext,
-                          paddle::platform::float16>);
diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h
deleted file mode 100644
index f56f5b6bbe372f9e38b93f00c89fa99d5a58544a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ /dev/null
@@ -1,404 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/algorithm.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using framework::SelectedRows;
-struct NoNesterov;
-struct UseNesterov;
-
-class MomentumOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(param) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(grad) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Velocity"),
-                   "Input(velocity) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of Momentum should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
-                   "Output(VelocityOut) of Momentum should not be null.");
-
-    auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning_rate should be a scalar");
-
-    auto param_dim = ctx->GetInputDim("Param");
-    if (ctx->GetInputsVarType("Grad")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(
-          param_dim, ctx->GetInputDim("Grad"),
-          "Param and Grad input of MomentumOp should have the same dimension.");
-      PADDLE_ENFORCE_EQ(
-          param_dim, ctx->GetInputDim("Velocity"),
-          "Param and Velocity of MomentumOp should have the same dimension.");
-    }
-
-    ctx->SetOutputDim("ParamOut", param_dim);
-    ctx->SetOutputDim("VelocityOut", param_dim);
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
-template <typename T>
-class CPUDenseMomentumFunctor {
- private:
-  const Tensor* param;
-  const Tensor* grad;
-  const Tensor* velocity;
-  const Tensor* learning_rate;
-  const T mu;
-  const T use_nesterov;
-  Tensor* param_out;
-  Tensor* velocity_out;
-
- public:
-  CPUDenseMomentumFunctor(const Tensor* param, const Tensor* grad,
-                          const Tensor* velocity, const Tensor* learning_rate,
-                          const T mu, const bool use_nesterov,
-                          Tensor* param_out, Tensor* velocity_out)
-      : param(param),
-        grad(grad),
-        velocity(velocity),
-        learning_rate(learning_rate),
-        mu(mu),
-        use_nesterov(use_nesterov),
-        param_out(param_out),
-        velocity_out(velocity_out) {}
-
-  inline void operator()() {
-    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
-    auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
-
-    auto p = framework::EigenVector<T>::Flatten(*param);
-    auto v = framework::EigenVector<T>::Flatten(*velocity);
-    auto g = framework::EigenVector<T>::Flatten(*grad);
-    auto* lr = learning_rate->data<T>();
-
-    v_out = v * mu + g;
-    if (use_nesterov) {
-      p_out = p - (g + v_out * mu) * lr[0];
-    } else {
-      p_out = p - lr[0] * v_out;
-    }
-  }
-};
-
-template <typename T, typename UpdateMethod>
-class DenseMomentumFunctor;
-
-// NOTE(dzh) for performance.
-// avoid if/else in inside kernel, implement GPU UseNesterov/NoNesterov as two
-// functor.
-template <typename T>
-class DenseMomentumFunctor<T, UseNesterov> {
- private:
-  const T* p_;
-  const T* g_;
-  const T* v_;
-  const T* lr_;
-  const T mu_;
-  const int64_t num_;
-  T* p_out_;
-  T* v_out_;
-
- public:
-  DenseMomentumFunctor(const T* p, const T* g, const T* v,
-                       const T* learning_rate, const T mu, const int64_t num,
-                       T* p_out, T* v_out)
-      : p_(p),
-        g_(g),
-        v_(v),
-        lr_(learning_rate),
-        mu_(mu),
-        num_(num),
-        p_out_(p_out),
-        v_out_(v_out) {}
-  inline HOSTDEVICE void operator()(size_t i) const {
-    // put memory access in register
-    const T p = p_[i];
-    const T g = g_[i];
-    const T lr = lr_[0];
-    const T v = v_[i];
-    T v_out = v * mu_ + g;
-    T p_out = p - (g + v_out * mu_) * lr;
-    // write reigster to memory
-    v_out_[i] = v_out;
-    p_out_[i] = p_out;
-  }
-};
-
-template <typename T>
-class DenseMomentumFunctor<T, NoNesterov> {
- private:
-  const T* p_;
-  const T* g_;
-  const T* v_;
-  const T* lr_;
-  const T mu_;
-  const int64_t num_;
-  T* p_out_;
-  T* v_out_;
-
- public:
-  DenseMomentumFunctor(const T* p, const T* g, const T* v,
-                       const T* learning_rate, const T mu, const int64_t num,
-                       T* p_out, T* v_out)
-      : p_(p),
-        g_(g),
-        v_(v),
-        lr_(learning_rate),
-        mu_(mu),
-        num_(num),
-        p_out_(p_out),
-        v_out_(v_out) {}
-  inline HOSTDEVICE void operator()(size_t i) const {
-    // put memory access in register
-    const T p = p_[i];
-    const T g = g_[i];
-    const T lr = lr_[0];
-    const T v = v_[i];
-    T v_out = v * mu_ + g;
-    T p_out = p - lr * v_out;
-    // write reigster to memory
-    v_out_[i] = v_out;
-    p_out_[i] = p_out;
-  }
-};
-
-template <typename T, typename UpdateMethod>
-class SparseMomentumFunctor;
-
-template <typename T>
-class SparseMomentumFunctor<T, UseNesterov> {
- private:
-  const T* p_;
-  const T* g_;
-  const T* v_;
-  const T* lr_;
-  const T mu_;
-  const int64_t* rows_;
-  const int64_t row_numel_;
-  const int64_t row_height_;
-  T* p_out_;
-  T* v_out_;
-
- public:
-  SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
-                        const T mu, const int64_t* rows, int64_t row_numel,
-                        int64_t row_height, T* p_out, T* v_out)
-      : p_(p),
-        g_(g),
-        v_(v),
-        lr_(lr),
-        mu_(mu),
-        rows_(rows),
-        row_numel_(row_numel),
-        row_height_(row_height),
-        p_out_(p_out),
-        v_out_(v_out) {}
-
-  inline HOSTDEVICE void operator()(size_t i) {
-    auto row_idx =
-        math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
-    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
-                       : static_cast<T>(0);
-    // put memory access in register
-    const T p = p_[i];
-    const T lr = lr_[0];
-    const T v = v_[i];
-    T v_out = v * mu_ + g;
-    T p_out = p - (g + v_out * mu_) * lr;
-    // write reigster to memory
-    v_out_[i] = v_out;
-    p_out_[i] = p_out;
-  }
-};
-
-template <typename T>
-class SparseMomentumFunctor<T, NoNesterov> {
- private:
-  const T* p_;
-  const T* g_;
-  const T* v_;
-  const T* lr_;
-  const T mu_;
-  const int64_t* rows_;
-  const int64_t row_numel_;
-  const int64_t row_height_;
-  T* p_out_;
-  T* v_out_;
-
- public:
-  SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
-                        const T mu, const int64_t* rows, int64_t row_numel,
-                        int64_t row_height, T* p_out, T* v_out)
-      : p_(p),
-        g_(g),
-        v_(v),
-        lr_(lr),
-        mu_(mu),
-        rows_(rows),
-        row_numel_(row_numel),
-        row_height_(row_height),
-        p_out_(p_out),
-        v_out_(v_out) {}
-
-  inline HOSTDEVICE void operator()(size_t i) {
-    auto row_idx =
-        math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
-    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
-                       : static_cast<T>(0);
-    // put memory access in register
-    const T p = p_[i];
-    const T lr = lr_[0];
-    const T v = v_[i];
-    T v_out = v * mu_ + g;
-    T p_out = p - v_out * lr;
-    // write reigster to memory
-    v_out_[i] = v_out;
-    p_out_[i] = p_out;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MomentumOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
-
-    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-    auto param = ctx.Input<framework::Tensor>("Param");
-    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
-    auto* velocity = ctx.Input<framework::Tensor>("Velocity");
-    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
-    param_out->mutable_data<T>(ctx.GetPlace());
-    velocity_out->mutable_data<T>(ctx.GetPlace());
-
-    auto* grad_var = ctx.InputVar("Grad");
-    if (grad_var->IsType<framework::LoDTensor>()) {
-      auto grad = ctx.Input<framework::Tensor>("Grad");
-      if (platform::is_cpu_place(ctx.GetPlace())) {
-        CPUDenseMomentumFunctor<T> functor(param, grad, velocity, learning_rate,
-                                           mu, use_nesterov, param_out,
-                                           velocity_out);
-        functor();
-      } else if (platform::is_gpu_place(ctx.GetPlace())) {
-        platform::ForRange<DeviceContext> for_range(
-            static_cast<const DeviceContext&>(ctx.device_context()),
-            param->numel());
-        if (use_nesterov) {
-          DenseMomentumFunctor<T, UseNesterov> functor(
-              param->data<T>(), grad->data<T>(), velocity->data<T>(),
-              learning_rate->data<T>(), mu, param->numel(),
-              param_out->mutable_data<T>(ctx.GetPlace()),
-              velocity_out->mutable_data<T>(ctx.GetPlace()));
-          for_range(functor);
-
-        } else {
-          DenseMomentumFunctor<T, NoNesterov> functor(
-              param->data<T>(), grad->data<T>(), velocity->data<T>(),
-              learning_rate->data<T>(), mu, param->numel(),
-              param_out->mutable_data<T>(ctx.GetPlace()),
-              velocity_out->mutable_data<T>(ctx.GetPlace()));
-          for_range(functor);
-        }
-      }
-
-    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      // sparse update embedding with selectedrows
-      auto grad = ctx.Input<framework::SelectedRows>("Grad");
-
-      // sparse update maybe empty.
-      if (grad->rows().size() == 0) {
-        VLOG(3) << "Grad SelectedRows contains no data!";
-        return;
-      }
-
-      framework::SelectedRows tmp_merged_grad;
-      framework::SelectedRows* merged_grad = &tmp_merged_grad;
-      math::scatter::MergeAdd<DeviceContext, T> merge_func;
-      merge_func(ctx.template device_context<DeviceContext>(), *grad,
-                 merged_grad);
-
-      const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace());
-      int64_t row_numel =
-          merged_grad->value().numel() / merged_grad->rows().size();
-      platform::ForRange<DeviceContext> for_range(
-          static_cast<const DeviceContext&>(ctx.device_context()),
-          param->numel());
-      if (use_nesterov) {
-        SparseMomentumFunctor<T, UseNesterov> functor(
-            param->data<T>(), merged_grad->value().data<T>(),
-            velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
-            static_cast<int64_t>(merged_grad->rows().size()),
-            param_out->mutable_data<T>(ctx.GetPlace()),
-            velocity_out->mutable_data<T>(ctx.GetPlace()));
-        for_range(functor);
-
-      } else {
-        SparseMomentumFunctor<T, NoNesterov> functor(
-            param->data<T>(), merged_grad->value().data<T>(),
-            velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
-            static_cast<int64_t>(merged_grad->rows().size()),
-            param_out->mutable_data<T>(ctx.GetPlace()),
-            velocity_out->mutable_data<T>(ctx.GetPlace()));
-        for_range(functor);
-      }
-    } else {
-      PADDLE_THROW(
-          string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows "
-                          "gradient, but the received Variable Type is %s",
-                          framework::ToTypeName(grad_var->Type())));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
deleted file mode 100644
index 9dd9b8afbd4915202df120b02f7e62de79e9e224..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-class ProximalAdagradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of ProximalAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment"),
-                   "Input(Moment) of ProximalAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of ProximalAdagradOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("LearningRate"),
-        "Input(LearningRate) of ProximalAdagradOp should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of ProximalAdagradOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("MomentOut"),
-        "Output(MomentOut) of ProximalAdagradOp should not be null.");
-
-    auto param_dim = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dim, ctx->GetInputDim("Grad"),
-        "Param and Grad of ProximalAdagrad Op must have same dimension.");
-
-    PADDLE_ENFORCE_EQ(
-        param_dim, ctx->GetInputDim("Moment"),
-        "Param and Moment of ProximalAdagrad Op must have same dimension.");
-
-    auto lr_dim = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
-                      "Learning Rate should be a scalar.");
-
-    ctx->SetOutputDim("ParamOut", param_dim);
-    ctx->SetOutputDim("MomentOut", param_dim);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param",
-             "(Tensor, default Tensor<float>) "
-             "Input parameter that has to be updated.");
-    AddInput("Moment",
-             "(Tensor, default Tensor<float>) "
-             "Moment parameter that has to be updated.");
-    AddInput("Grad",
-             "(Tensor, default Tensor<float>) "
-             "Input gradient of the parameter.");
-    AddInput("LearningRate",
-             "(Tensor, default Tensor<float>) "
-             "The learning rate should be a tensor of size 1.");
-
-    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
-    AddOutput("MomentOut", "(Tensor) Output updated moment value.");
-
-    AddAttr<float>("l1",
-                   "(float, default 0.0) "
-                   "L1 regularization strength.")
-        .SetDefault(0.0f);
-    AddAttr<float>("l2",
-                   "(float, default 0.0) "
-                   "L2 regularization strength.")
-        .SetDefault(0.0f);
-    AddComment(R"DOC(
-Proximal Adagrad Optimizer.
-
-Optimizer that implements the proximal adagrad algorithm:
-
-$$
-moment = moment + grad * grad \\
-prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\
-param = sign(prox\_param) / (1 + learning\_rate * l2) *
-        \max(|prox\_param| - learning\_rate * l1 , 0)
-$$
-
-The paper that proposed Proximal GD: 
-(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
-Here, we use the adagrad learning rate as specified here: 
-(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp,
-                             ops::ProximalAdagradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    proximal_adagrad,
-    ops::ProximalAdagradOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
deleted file mode 100644
index 591dead3b12763e4cd1b9c390a87816ab121fbf8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-You may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed
-under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. */
-#include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    proximal_adagrad,
-    ops::ProximalAdagradOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
deleted file mode 100644
index 91416450a60d683eeb33462258ff8325dae76e6e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class ProximalAdagradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* param_out = ctx.Output<Tensor>("ParamOut");
-    auto* moment_out = ctx.Output<Tensor>("MomentOut");
-
-    param_out->mutable_data<T>(ctx.GetPlace());
-    moment_out->mutable_data<T>(ctx.GetPlace());
-
-    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
-    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
-
-    auto grad = ctx.Input<Tensor>("Grad");
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto m = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
-
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto m_out = EigenVector<T>::Flatten(*moment_out);
-    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
-
-    m_out.device(*place) = m + g * g;
-    auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt();
-    if (l1 > static_cast<T>(0)) {
-      p_out.device(*place) =
-          prox_param.sign() *
-          (((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
-                .cwiseMax(static_cast<T>(0.0))) /
-           (static_cast<T>(1.0) + (lr * l2).broadcast(grad_dsize)));
-    } else {
-      p_out.device(*place) =
-          prox_param / (static_cast<T>(1.0) + (lr * l2).broadcast(grad_dsize));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc
deleted file mode 100644
index fccfc2b4584a25e5f703750393464bbc3026de42..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/proximal_gd_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-class ProximalGDOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of ProximalGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of ProximalGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of ProximalGDOp should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of ProximalGDOp should not be null.");
-
-    auto param_dim = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
-                      "Two input of ProximalGD Op's dimension must be same.");
-
-    auto lr_dim = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
-                      "Learning Rate should be a scalar.");
-
-    ctx->SetOutputDim("ParamOut", param_dim);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param",
-             "(Tensor, default Tensor<float>) "
-             "Input parameter value that has to be updated.");
-    AddInput("Grad",
-             "(Tensor, default Tensor<float>) "
-             "Input gradient of the parameter.");
-    AddInput("LearningRate",
-             "(Tensor, default Tensor<float>) "
-             "The learning rate should be a tensor of size 1.");
-
-    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
-
-    AddAttr<float>("l1",
-                   "(float, default 0.0) "
-                   "L1 regularization strength.")
-        .SetDefault(0.0f);
-    AddAttr<float>("l2",
-                   "(float, default 0.0) "
-                   "L2 regularization strength.")
-        .SetDefault(0.0f);
-    AddComment(R"DOC(
-ProximalGD Operator.
-
-Optimizer that implements the proximal gradient descent algorithm:
-
-$$
-prox\_param = param - learning\_rate * grad \\
-param = sign(prox\_param) / (1 + learning\_rate * l2) *
-        \max(|prox\_param| - learning\_rate * l1, 0)
-$$        
-
-The paper that proposed Proximal Gradient Descent:
-(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp,
-                             ops::ProximalGDOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    proximal_gd,
-    ops::ProximalGDOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cu b/paddle/fluid/operators/optimizers/proximal_gd_op.cu
deleted file mode 100644
index d556fa74f19529d0e2f80d4c6dbfca62498c9dcc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-You may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed
-under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. */
-#include "paddle/fluid/operators/optimizers/proximal_gd_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    proximal_gd,
-    ops::ProximalGDOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.h b/paddle/fluid/operators/optimizers/proximal_gd_op.h
deleted file mode 100644
index d49badf16d510c13847b8921360544bbb6078e05..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class ProximalGDOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* param_out = ctx.Output<Tensor>("ParamOut");
-
-    param_out->mutable_data<T>(ctx.GetPlace());
-
-    auto grad = ctx.Input<Tensor>("Grad");
-
-    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
-    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
-
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
-
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
-
-    auto prox_param = p - lr.broadcast(grad_dsize) * g;
-    if (l1 > 0) {
-      p_out.device(place) =
-          prox_param.sign() *
-          (((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
-                .cwiseMax(T(0.0))) /
-           (1.0 + (lr * l2).broadcast(grad_dsize)));
-    } else {
-      p_out.device(place) =
-          prox_param / (1.0 + (lr * l2).broadcast(grad_dsize));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc
deleted file mode 100644
index 99d1156ee6d5fc88161e25bfa581a265707e6f92..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/rmsprop_op.h"
-
-namespace paddle {
-namespace operators {
-
-class RmspropOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),
-                   "Input(MeanSquare) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment"),
-                   "Input(Moment) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(param_out) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(MomentOut) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
-                   "Output(MeanSquareOut) of RmspropOp should not be null.");
-    if (ctx->Attrs().Get<bool>("centered")) {
-      PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
-                     "Output(MeanGradOut) of RmspropOp should not be null.");
-    }
-
-    auto param_dim = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dim, ctx->GetInputDim("Grad"),
-        "Param and grad input of RmspropOp should have the same dimension.");
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
-                      "Param and Momentum input of RmspropOp "
-                      "should have the same dimension.");
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
-                      "Param and Momentum input of RmspropOp "
-                      "should have the same dimension.");
-
-    auto lr_dim = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
-                      "Learning Rate should be a scalar.");
-
-    ctx->SetOutputDim("ParamOut", param_dim);
-    ctx->SetOutputDim("MomentOut", param_dim);
-    ctx->SetOutputDim("MeanSquareOut", param_dim);
-    if (ctx->Attrs().Get<bool>("centered")) {
-      ctx->SetOutputDim("MeanGradOut", param_dim);
-    }
-  }
-};
-
-class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param",
-             "(Tensor, default Tensor<float>) "
-             "Input parameter value that has to be updated.");
-    AddInput("MeanSquare",
-             "(Tensor, default Tensor<float>)"
-             " The mean square value that gets updated.");
-    AddInput("MeanGrad",
-             "(Tensor, default Tensor<float>)"
-             " The moving average of gradient")
-        .AsDispensable();
-    AddInput("LearningRate",
-             "(Tensor, default Tensor<float>) "
-             "The learning rate should be a tensor of size 1.");
-    AddInput("Grad",
-             "(Tensor, default Tensor<float>) "
-             "Input gradient of the parameter.");
-    AddInput("Moment",
-             "(Tensor, default Tensor<float>) The moment that gets updated.");
-
-    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
-    AddOutput("MomentOut", "(Tensor) Output updated moment.");
-    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
-    AddOutput("MeanGradOut",
-              "(Tensor) Output moving average of gradient updated value.");
-
-    AddAttr<float>("epsilon",
-                   "(float, default 1e-10) Constant "
-                   "for numerical stability.")
-        .SetDefault(1.0e-10f);
-    AddAttr<float>("decay",
-                   "(float, default 0.9) "
-                   "Discounting factor for coming gradient.")
-        .SetDefault(0.9f);
-    AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
-        .SetDefault(0.0f);
-    AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Rmsprop Optimizer. 
-
-$$
-MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\
-MomentOut = momentum * Moment +
-            \frac{LearningRate * Grad}{\sqrt{MeanSquareOut + epsilon}} \\
-ParamOut = Param -  MomentOut
-$$
-
-if centered is true:
-
-mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
-mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
-mom = momentum * mom{t-1} + learning_rate * g_t /
-    sqrt(mean_square - mean_grad**2 + epsilon)
-param -= mom
-
-The original slides that proposed Rmsprop: Slide 29 of
-http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cu b/paddle/fluid/operators/optimizers/rmsprop_op.cu
deleted file mode 100644
index 8b17d6a0204045a9b20adb79dbad72dff5ba267e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cu
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/optimizers/rmsprop_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h
deleted file mode 100644
index 4550052b2d614ccbbb09f4a2b9e747708b2a2baa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/rmsprop_op.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/algorithm.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename T>
-struct DenseRmspropGradFunctor {
-  inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
-
-  HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; }
-
-  const T *grad_;
-};
-
-template <typename T>
-struct SparseRmspropGradFunctor {
-  inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows,
-                                  int64_t row_numel, int64_t row_count)
-      : grad_(grad),
-        rows_(rows),
-        row_numel_(row_numel),
-        row_count_(row_count) {}
-
-  HOSTDEVICE inline T operator()(int64_t idx) const {
-    auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_);
-    return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0;
-  }
-
-  const T *grad_;
-  const int64_t *rows_;
-  int64_t row_numel_;
-  int64_t row_count_;
-};
-
-template <typename T, typename GradFunctor>
-struct UncenteredRmspropFunctor {
-  UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho,
-                           T epsilon, T momentum,
-                           const GradFunctor &grad_functor)
-      : param_(param),
-        ms_(ms),
-        mom_(mom),
-        lr_(lr),
-        rho_(rho),
-        epsilon_(epsilon),
-        momentum_(momentum),
-        grad_functor_(grad_functor) {}
-
-  HOSTDEVICE inline void operator()(int64_t idx) const {
-    T g = grad_functor_(idx);
-    T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
-    T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_);
-    param_[idx] -= mom_out;
-    ms_[idx] = ms_out;
-    mom_[idx] = mom_out;
-  }
-
-  T *param_;
-  T *ms_;
-  T *mom_;
-  const T *lr_;
-  T rho_;
-  T epsilon_;
-  T momentum_;
-  GradFunctor grad_functor_;
-};
-
-template <typename T, typename GradFunctor>
-struct CenteredRmspropFunctor {
-  CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr,
-                         T rho, T epsilon, T momentum,
-                         const GradFunctor &grad_functor)
-      : param_(param),
-        ms_(ms),
-        mom_(mom),
-        mean_grad_(mean_grad),
-        lr_(lr),
-        rho_(rho),
-        epsilon_(epsilon),
-        momentum_(momentum),
-        grad_functor_(grad_functor) {}
-
-  HOSTDEVICE inline void operator()(int64_t idx) const {
-    T g = grad_functor_(idx);
-    T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
-    T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g;
-    T mom_out = momentum_ * mom_[idx] +
-                lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
-    param_[idx] -= mom_out;
-    ms_[idx] = ms_out;
-    mom_[idx] = mom_out;
-    mean_grad_[idx] = mg_out;
-  }
-
-  T *param_;
-  T *ms_;
-  T *mom_;
-  T *mean_grad_;
-  const T *lr_;
-  T rho_;
-  T epsilon_;
-  T momentum_;
-  GradFunctor grad_functor_;
-};
-
-template <typename DeviceContext, typename T>
-class RmspropOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using LoDTensor = framework::LoDTensor;
-    auto *grad_var = ctx.InputVar("Grad");
-    auto *param_out = ctx.Output<LoDTensor>("ParamOut");
-    auto *moment_out = ctx.Output<LoDTensor>("MomentOut");
-    auto *mean_square_out = ctx.Output<LoDTensor>("MeanSquareOut");
-
-    auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-    auto rho = static_cast<T>(ctx.Attr<float>("decay"));
-    auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
-    bool centered = ctx.Attr<bool>("centered");
-
-    auto &p_tensor = *ctx.Input<LoDTensor>("Param");
-    auto &ms_tensor = *ctx.Input<LoDTensor>("MeanSquare");
-    auto &lr_tensor = *ctx.Input<LoDTensor>("LearningRate");
-    auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
-
-    PADDLE_ENFORCE_EQ(&p_tensor, param_out,
-                      "Param and ParamOut must be the same Tensor");
-    PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
-                      "Moment and MomentOut must be the same Tensor");
-    PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
-                      "MeanSquare and MeanSquareOut must be the same Tensor");
-
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    size_t limit = static_cast<size_t>(ms_tensor.numel());
-
-    if (grad_var->IsType<LoDTensor>()) {
-      auto &grad_tensor = grad_var->Get<LoDTensor>();
-
-      if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value) {
-        auto &place =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        auto lr_value = lr_tensor.data<T>()[0];
-
-        auto p = EigenVector<T>::Flatten(p_tensor);
-        auto ms = EigenVector<T>::Flatten(ms_tensor);
-        auto g = EigenVector<T>::Flatten(grad_tensor);
-        auto mom = EigenVector<T>::Flatten(mom_tensor);
-
-        auto p_out = EigenVector<T>::Flatten(*param_out);
-        auto mom_out = EigenVector<T>::Flatten(*moment_out);
-        auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
-
-        ms_out.device(place) = rho * ms + (1 - rho) * g * g;
-        if (centered) {
-          auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
-          auto mg = EigenVector<T>::Flatten(mg_tensor);
-          auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-          PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
-                            "MeanGrad and MeanGradOut must be the same Tensor");
-          auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
-
-          mg_out.device(place) = rho * mg + (1 - rho) * g;
-          mom_out.device(place) =
-              momentum * mom +
-              lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
-        } else {
-          mom_out.device(place) =
-              momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
-        }
-        p_out.device(place) = p - mom_out;
-      } else {
-        DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
-        platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
-        if (centered) {
-          auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
-          auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-          PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
-                            "MeanGrad and MeanGradOut must be the same Tensor");
-          for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
-              param_out->mutable_data<T>(ctx.GetPlace()),
-              mean_square_out->mutable_data<T>(ctx.GetPlace()),
-              moment_out->mutable_data<T>(ctx.GetPlace()),
-              mean_grad_out->mutable_data<T>(ctx.GetPlace()),
-              lr_tensor.data<T>(), rho, epsilon, momentum, grad_func));
-        } else {
-          for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
-              param_out->mutable_data<T>(ctx.GetPlace()),
-              mean_square_out->mutable_data<T>(ctx.GetPlace()),
-              moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
-              rho, epsilon, momentum, grad_func));
-        }
-      }
-    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      auto &grad = grad_var->Get<framework::SelectedRows>();
-      framework::SelectedRows tmp_merged_grad;
-      framework::SelectedRows *merged_grad = &tmp_merged_grad;
-      math::scatter::MergeAdd<DeviceContext, T> merge_func;
-      merge_func(dev_ctx, grad, merged_grad);
-
-      platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
-      const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace());
-
-      auto &merged_tensor = merged_grad->value();
-      int64_t row_count = merged_grad->rows().size();
-      int64_t row_numel = merged_tensor.numel() / row_count;
-      SparseRmspropGradFunctor<T> grad_func(merged_tensor.data<T>(), rows,
-                                            row_numel, row_count);
-
-      if (centered) {
-        auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
-        auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-        PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
-                          "MeanGrad and MeanGradOut must be the same Tensor");
-        for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
-            param_out->mutable_data<T>(ctx.GetPlace()),
-            mean_square_out->mutable_data<T>(ctx.GetPlace()),
-            moment_out->mutable_data<T>(ctx.GetPlace()),
-            mean_grad_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
-            rho, epsilon, momentum, grad_func));
-      } else {
-        for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
-            param_out->mutable_data<T>(ctx.GetPlace()),
-            mean_square_out->mutable_data<T>(ctx.GetPlace()),
-            moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
-            rho, epsilon, momentum, grad_func));
-      }
-    } else {
-      PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
deleted file mode 100644
index 9ccf3d9364635ad0bc4423d09776e5a8f253993c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/sgd_op.h"
-#include <string>
-namespace paddle {
-namespace operators {
-
-class SGDOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of SGDOp should not be null.");
-
-    auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 element");
-    auto param_dim = ctx->GetInputDim("Param");
-    // TODO(qijun): check dimensions of Param and Grad at compile
-    // and runtime.
-    ctx->SetOutputDim("ParamOut", param_dim);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string &var_name, const framework::Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const {
-    if (var_name == "LearningRate") {
-      return framework::OpKernelType(tensor.type(), tensor.place(),
-                                     tensor.layout());
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class SGDOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto &input_var_n = ctx->Input("Param")[0];
-    auto in_var_type = ctx->GetType(input_var_n);
-    PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
-                       in_var_type == framework::proto::VarType::LOD_TENSOR,
-                   "The input Var's type should be LoDtensor or SelectedRows,"
-                   " but the received var(%s)'s type is %s",
-                   input_var_n, in_var_type);
-
-    for (auto &out_var_n : ctx->Output("ParamOut")) {
-      if (ctx->GetType(out_var_n) != in_var_type) {
-        ctx->SetType(out_var_n, in_var_type);
-      }
-    }
-  }
-};
-
-class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param", "(Tensor or SelectedRows) Input parameter");
-    AddInput("LearningRate", "(Tensor) Learning rate of SGD");
-    AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
-    AddOutput("ParamOut",
-              "(Tensor or SelectedRows, same with Param) "
-              "Output parameter, should share the same memory with Param");
-    AddComment(R"DOC(
-
-SGD operator
-
-This operator implements one step of the stochastic gradient descent algorithm.
-
-$$param\_out = param - learning\_rate * grad$$
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sgd, ops::SGDOp, ops::SGDOpMaker,
-                  paddle::framework::EmptyGradOpMaker, ops::SGDOpInferVarType);
-REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<float>, ops::SGDOpKernel<double>);
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu
deleted file mode 100644
index fca982821aea957b1261d54410c47f8f3b442c58..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/sgd_op.cu
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/operators/optimizers/sgd_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-namespace {
-
-template <typename T>
-__global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
-                          const int num, T* p_out) {
-  T lr = learning_rate[0];
-  int grid_size = blockDim.x * gridDim.x;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) {
-    T g_data = g[i];
-    T p_data = p[i];
-    p_out[i] = p_data - lr * g_data;
-  }
-}
-
-template <typename T>
-__global__ void SparseSGDFunctorKernel(const T* selected_rows,
-                                       const int64_t* rows,
-                                       const T* learning_rate, T* tensor_out,
-                                       int64_t row_numel, int64_t limit) {
-  for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
-    const T* selected_rows_ptr = selected_rows + i * row_numel;
-    T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
-    for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
-      // Since index in rows of SelectedRows can be duplicate, we have to use
-      // Atomic Operation to avoid concurrent write error.
-      paddle::platform::CudaAtomicAdd(
-          tensor_out_ptr + index,
-          -static_cast<T>(1.0) * learning_rate[0] * selected_rows_ptr[index]);
-    }
-  }
-}
-}  // namespace
-
-template <typename T>
-class SGDOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
-
-    auto* param = ctx.Input<framework::Tensor>("Param");
-    auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
-    auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-
-    auto* grad_var = ctx.InputVar("Grad");
-    // Actually, all tensors are LoDTensor except SelectedRows.
-    if (grad_var->IsType<framework::LoDTensor>()) {
-      param_out->mutable_data<T>(ctx.GetPlace());
-      auto* grad = ctx.Input<framework::Tensor>("Grad");
-      auto* grad_data = grad->data<T>();
-      auto* param_data = param->data<T>();
-      auto* param_out_data = param_out->data<T>();
-
-      int block = 512;
-      int grid = (param->numel() + block - 1) / block;
-
-      SGDKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
-          grad_data, param_data, learning_rate->data<T>(), param->numel(),
-          param_out_data);
-
-    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
-      // This manual optimization brings difficulty to track data dependency.
-      // It's better to find a more elegant solution.
-      PADDLE_ENFORCE_EQ(param, param_out);
-      auto* grad = ctx.Input<framework::SelectedRows>("Grad");
-
-      auto in_height = grad->height();
-      auto out_dims = param_out->dims();
-      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
-
-      auto& in_value = grad->value();
-      auto& in_rows = grad->rows();
-
-      int64_t in_row_numel = in_value.numel() / in_rows.size();
-      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
-
-      auto* in_data = in_value.data<T>();
-      auto* out_data = param_out->data<T>();
-
-      const int kThreadsPerBlock = 256;
-      int thread_x = kThreadsPerBlock;
-      int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
-      int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-
-      SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
-                               ctx.cuda_device_context().stream()>>>(
-          in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
-          out_data, in_row_numel, in_rows.size());
-
-    } else {
-      PADDLE_THROW("Unsupported Variable Type of Grad");
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel<float>,
-                        ops::SGDOpCUDAKernel<double>,
-                        ops::SGDOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
deleted file mode 100644
index 5dd5f67e004c63e294152239ab7bd3db26542eed..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class SGDOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-
-    const auto *param_var = ctx.InputVar("Param");
-    const auto *grad_var = ctx.InputVar("Grad");
-
-    if (param_var->IsType<framework::LoDTensor>()) {
-      const auto *param = ctx.Input<framework::Tensor>("Param");
-      auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
-      // Actually, all tensors are LoDTensor except SelectedRows.
-      if (grad_var->IsType<framework::LoDTensor>()) {
-        const auto *grad = ctx.Input<framework::Tensor>("Grad");
-        auto sz = param_out->numel();
-        PADDLE_ENFORCE_EQ(param->numel(), sz);
-        PADDLE_ENFORCE_EQ(grad->numel(), sz);
-
-        jit::sgd_attr_t attr(1, sz, 1, sz, 1);
-        const T *lr = learning_rate->data<T>();
-        const T *param_data = param->data<T>();
-        const T *grad_data = grad->data<T>();
-        int64_t rows_idx = 0;
-        T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
-
-        auto sgd =
-            jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
-                attr);
-        sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
-      } else if (grad_var->IsType<framework::SelectedRows>()) {
-        // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
-        // This manual optimization brings difficulty to track data dependency.
-        // It's better to find a more elegant solution.
-        PADDLE_ENFORCE_EQ(param, param_out);
-        const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
-        auto &grad_rows = grad->rows();
-
-        // for distributed training, a sparse var may be empty,
-        // just skip updating.
-        if (grad_rows.size() == 0) {
-          return;
-        }
-
-        auto out_dims = param_out->dims();
-        PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]);
-        auto &grad_value = grad->value();
-        const T *param_data = param->data<T>();
-        const T *grad_data = grad_value.data<T>();
-        const T *lr = learning_rate->data<T>();
-        const int64_t *rows_data = grad_rows.data();
-        T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
-
-        jit::sgd_attr_t attr;
-        attr.param_height = out_dims[0];
-        attr.param_width = param_out->numel() / attr.param_height;
-        attr.grad_height = grad_rows.size();  // note: it is not grad->height()
-        attr.grad_width = grad_value.numel() / attr.grad_height;
-        attr.selected_rows_size = grad_rows.size();
-        PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
-
-        auto sgd =
-            jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
-                attr);
-        sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
-      } else {
-        PADDLE_THROW("Unsupported Variable Type of Grad");
-      }
-    } else if (param_var->IsType<framework::SelectedRows>()) {
-      PADDLE_ENFORCE(grad_var->IsType<framework::SelectedRows>(),
-                     "when param "
-                     "is SelectedRows, gradient should also be SelectedRows");
-      const auto &param = param_var->Get<framework::SelectedRows>();
-      auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
-      const auto &grad = grad_var->Get<framework::SelectedRows>();
-
-      // for distributed training, a sparse var may be empty,
-      // just skip updating.
-      if (grad.rows().size() == 0) {
-        return;
-      }
-
-      auto param_row_width = param.value().dims()[1];
-      auto grad_row_width = grad.value().dims()[1];
-      VLOG(4) << " param rows: " << param.rows().size()
-              << " param memory rows: " << param.value().dims()[0]
-              << " grad rows: " << grad.rows().size()
-              << " grad memory rows: " << grad.value().dims()[0];
-      PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
-                        "param_row should have the same size with grad_row");
-
-      const auto *lr = learning_rate->data<T>();
-      const auto *grad_data = grad.value().data<T>();
-      auto *out_data = param_out->mutable_value()->data<T>();
-      for (size_t i = 0; i < grad.rows().size(); i++) {
-        int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
-        PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
-                          "id should be in the table");
-        for (int64_t j = 0; j < grad_row_width; j++) {
-          out_data[id_index * grad_row_width + j] -=
-              lr[0] * grad_data[i * grad_row_width + j];
-        }
-      }
-    } else {
-      PADDLE_THROW("Unsupported Variable Type of Parameter");
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc
deleted file mode 100644
index 3069d5601442c8bc1fb1dc0a4d08558da4dfd9f1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pad2d_op.cc
+++ /dev/null
@@ -1,663 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-template <typename T>
-void Pad2DConstNCHW(const T* in_data, const int num, const int channels,
-                    const int in_height, const int in_width,
-                    const int out_height, const int out_width,
-                    const int pad_top, const int pad_left, T value,
-                    T* out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int out_h = 0; out_h < out_height; ++out_h) {
-        for (int out_w = 0; out_w < out_width; ++out_w) {
-          int in_h = out_h - pad_top;
-          int in_w = out_w - pad_left;
-          out_data[out_h * out_width + out_w] =
-              (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
-                  ? value
-                  : in_data[in_h * in_width + in_w];
-        }
-      }
-      in_data += in_height * in_width;
-      out_data += out_height * out_width;
-    }
-  }
-}
-
-template <typename T>
-void Pad2DConstNHWC(const T* in_data, const int num, const int channels,
-                    const int in_height, const int in_width,
-                    const int out_height, const int out_width,
-                    const int pad_top, const int pad_left, T value,
-                    T* out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int out_h = 0; out_h < out_height; ++out_h) {
-      for (int out_w = 0; out_w < out_width; ++out_w) {
-        int in_h = out_h - pad_top;
-        int in_w = out_w - pad_left;
-        const int out_index = (out_h * out_width + out_w) * channels;
-        if (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) {
-          for (int c = 0; c < channels; ++c) {
-            out_data[out_index + c] = value;
-          }
-        } else {
-          const int in_index = (in_h * in_width + in_w) * channels;
-          for (int c = 0; c < channels; ++c) {
-            out_data[out_index + c] = in_data[in_index + c];
-          }
-        }
-      }
-    }
-    in_data += in_height * in_width * channels;
-    out_data += out_height * out_width * channels;
-  }
-}
-
-template <typename T>
-void Pad2DReflectNCHW(const T* in_data, const int num, const int channels,
-                      const int in_height, const int in_width,
-                      const int out_height, const int out_width,
-                      const int pad_top, const int pad_left, T* out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int out_h = 0; out_h < out_height; ++out_h) {
-        for (int out_w = 0; out_w < out_width; ++out_w) {
-          int in_h = out_h - pad_top;
-          int in_w = out_w - pad_left;
-          in_h = std::max(in_h, -in_h);  // reflect by 0
-          in_h =
-              std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
-          in_w = std::max(in_w, -in_w);                  // reflect by 0
-          in_w =
-              std::min(in_w, 2 * in_width - in_w - 2);  // reflect by in_width
-          out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w];
-        }
-      }
-      in_data += in_height * in_width;
-      out_data += out_height * out_width;
-    }
-  }
-}
-
-template <typename T>
-void Pad2DReflectNHWC(const T* in_data, const int num, const int channels,
-                      const int in_height, const int in_width,
-                      const int out_height, const int out_width,
-                      const int pad_top, const int pad_left, T* out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int out_h = 0; out_h < out_height; ++out_h) {
-      for (int out_w = 0; out_w < out_width; ++out_w) {
-        const int out_index = (out_h * out_width + out_w) * channels;
-        int in_h = out_h - pad_top;
-        int in_w = out_w - pad_left;
-        in_h = std::max(in_h, -in_h);
-        in_h = std::min(in_h, 2 * in_height - in_h - 2);
-        in_w = std::max(in_w, -in_w);
-        in_w = std::min(in_w, 2 * in_width - in_w - 2);
-        const int in_index = (in_h * in_width + in_w) * channels;
-
-        for (int c = 0; c < channels; ++c) {
-          out_data[out_index + c] = in_data[in_index + c];
-        }
-      }
-    }
-    in_data += in_height * in_width * channels;
-    out_data += out_height * out_width * channels;
-  }
-}
-
-template <typename T>
-void Pad2DEdgeNCHW(const T* in_data, const int num, const int channels,
-                   const int in_height, const int in_width,
-                   const int out_height, const int out_width, const int pad_top,
-                   const int pad_left, T* out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int out_h = 0; out_h < out_height; ++out_h) {
-        for (int out_w = 0; out_w < out_width; ++out_w) {
-          int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
-          int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
-          out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w];
-        }
-      }
-      in_data += in_height * in_width;
-      out_data += out_height * out_width;
-    }
-  }
-}
-
-template <typename T>
-void Pad2DEdgeNHWC(const T* in_data, const int num, const int channels,
-                   const int in_height, const int in_width,
-                   const int out_height, const int out_width, const int pad_top,
-                   const int pad_left, T* out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int out_h = 0; out_h < out_height; ++out_h) {
-      for (int out_w = 0; out_w < out_width; ++out_w) {
-        const int out_index = (out_h * out_width + out_w) * channels;
-        int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
-        int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
-        const int in_index = (in_h * in_width + in_w) * channels;
-        for (int c = 0; c < channels; ++c) {
-          out_data[out_index + c] = in_data[in_index + c];
-        }
-      }
-    }
-    in_data += in_height * in_width * channels;
-    out_data += out_height * out_width * channels;
-  }
-}
-
-template <typename T>
-void Pad2DGradConstNCHW(T* d_in_data, const int num, const int channels,
-                        const int in_height, const int in_width,
-                        const int out_height, const int out_width,
-                        const int pad_top, const int pad_left,
-                        const T* d_out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int out_h = 0; out_h < out_height; ++out_h) {
-        for (int out_w = 0; out_w < out_width; ++out_w) {
-          int in_h = out_h - pad_top;
-          int in_w = out_w - pad_left;
-          if (!(in_h < 0 || in_w < 0 || in_h >= in_height ||
-                in_w >= in_width)) {
-            d_in_data[in_h * in_width + in_w] =
-                d_out_data[out_h * out_width + out_w];
-          }
-        }
-      }
-      d_in_data += in_height * in_width;
-      d_out_data += out_height * out_width;
-    }
-  }
-}
-
-template <typename T>
-void Pad2DGradConstNHWC(T* d_in_data, const int num, const int channels,
-                        const int in_height, const int in_width,
-                        const int out_height, const int out_width,
-                        const int pad_top, const int pad_left,
-                        const T* d_out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int out_h = 0; out_h < out_height; ++out_h) {
-      for (int out_w = 0; out_w < out_width; ++out_w) {
-        int in_h = out_h - pad_top;
-        int in_w = out_w - pad_left;
-        const int out_index = (out_h * out_width + out_w) * channels;
-        if (!(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)) {
-          const int in_index = (in_h * in_width + in_w) * channels;
-          for (int c = 0; c < channels; ++c) {
-            d_in_data[in_index + c] = d_out_data[out_index + c];
-          }
-        }
-      }
-    }
-    d_in_data += in_height * in_width * channels;
-    d_out_data += out_height * out_width * channels;
-  }
-}
-
-template <typename T>
-void Pad2DGradReflectNCHW(T* d_in_data, const int num, const int channels,
-                          const int in_height, const int in_width,
-                          const int out_height, const int out_width,
-                          const int pad_top, const int pad_left,
-                          const T* d_out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int out_h = 0; out_h < out_height; ++out_h) {
-        for (int out_w = 0; out_w < out_width; ++out_w) {
-          int in_h = out_h - pad_top;
-          int in_w = out_w - pad_left;
-          in_h = std::max(in_h, -in_h);  // reflect over 0
-          in_h = std::min(in_h,
-                          2 * in_height - in_h - 2);  // reflect over in_height
-          in_w = std::max(in_w, -in_w);               // reflect over 0
-          in_w =
-              std::min(in_w, 2 * in_width - in_w - 2);  // reflect over in_width
-          d_in_data[in_h * in_width + in_w] +=
-              d_out_data[out_h * out_width + out_w];
-        }
-      }
-      d_in_data += in_height * in_width;
-      d_out_data += out_height * out_width;
-    }
-  }
-}
-
-template <typename T>
-void Pad2DGradReflectNHWC(T* d_in_data, const int num, const int channels,
-                          const int in_height, const int in_width,
-                          const int out_height, const int out_width,
-                          const int pad_top, const int pad_left,
-                          const T* d_out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int out_h = 0; out_h < out_height; ++out_h) {
-      for (int out_w = 0; out_w < out_width; ++out_w) {
-        const int out_index = (out_h * out_width + out_w) * channels;
-        int in_h = out_h - pad_top;
-        int in_w = out_w - pad_left;
-        in_h = std::max(in_h, -in_h);
-        in_h = std::min(in_h, 2 * in_height - in_h - 2);
-        in_w = std::max(in_w, -in_w);
-        in_w = std::min(in_w, 2 * in_width - in_w - 2);
-        const int in_index = (in_h * in_width + in_w) * channels;
-        for (int c = 0; c < channels; ++c) {
-          d_in_data[in_index + c] += d_out_data[out_index + c];
-        }
-      }
-    }
-    d_in_data += in_height * in_width * channels;
-    d_out_data += out_height * out_width * channels;
-  }
-}
-
-template <typename T>
-void Pad2DGradEdgeNCHW(T* d_in_data, const int num, const int channels,
-                       const int in_height, const int in_width,
-                       const int out_height, const int out_width,
-                       const int pad_top, const int pad_left,
-                       const T* d_out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int out_h = 0; out_h < out_height; ++out_h) {
-        for (int out_w = 0; out_w < out_width; ++out_w) {
-          int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
-          int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
-          d_in_data[in_h * in_width + in_w] +=
-              d_out_data[out_h * out_width + out_w];
-        }
-      }
-      d_in_data += in_height * in_width;
-      d_out_data += out_height * out_width;
-    }
-  }
-}
-
-template <typename T>
-void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels,
-                       const int in_height, const int in_width,
-                       const int out_height, const int out_width,
-                       const int pad_top, const int pad_left,
-                       const T* d_out_data) {
-  for (int n = 0; n < num; ++n) {
-    for (int out_h = 0; out_h < out_height; ++out_h) {
-      for (int out_w = 0; out_w < out_width; ++out_w) {
-        const int out_index = (out_h * out_width + out_w) * channels;
-        int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
-        int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
-        const int in_index = (in_h * in_width + in_w) * channels;
-        for (int c = 0; c < channels; ++c) {
-          d_in_data[in_index + c] += d_out_data[out_index + c];
-        }
-      }
-    }
-    d_in_data += in_height * in_width * channels;
-    d_out_data += out_height * out_width * channels;
-  }
-}
-
-static inline void GetPaddings(int* paddings,
-                               const framework::ExecutionContext& context) {
-  auto* paddings_t = context.Input<Tensor>("Paddings");
-  if (paddings_t) {
-    auto paddings_data = paddings_t->data<int>();
-    paddings[0] = paddings_data[0];
-    paddings[1] = paddings_data[1];
-    paddings[2] = paddings_data[2];
-    paddings[3] = paddings_data[3];
-  } else {
-    auto pads = context.Attr<std::vector<int>>("paddings");
-    std::copy(pads.begin(), pads.end(), paddings);
-  }
-}
-
-template <typename T>
-class Pad2dCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    int pads[4];
-    GetPaddings(pads, context);
-    auto mode = context.Attr<std::string>("mode");
-    auto data_format = context.Attr<std::string>("data_format");
-    T value = context.Attr<T>("pad_value");
-
-    auto* x = context.Input<Tensor>("X");
-    auto in_dims = x->dims();
-    const T* in_data = x->data<T>();
-
-    auto* out = context.Output<Tensor>("Out");
-    if (data_format == "NCHW") {
-      out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[0] + pads[1],
-                   in_dims[3] + pads[2] + pads[3]});
-    } else {
-      out->Resize({in_dims[0], in_dims[1] + pads[0] + pads[1],
-                   in_dims[2] + pads[2] + pads[3], in_dims[3]});
-    }
-    auto out_dims = out->dims();
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    const int pad_top = pads[0];
-    const int pad_left = pads[2];
-    const int num = in_dims[0];
-    if (data_format == "NCHW") {
-      const int channels = in_dims[1];
-      const int in_height = in_dims[2];
-      const int in_width = in_dims[3];
-      const int out_height = out_dims[2];
-      const int out_width = out_dims[3];
-      if (mode == "reflect") {
-        Pad2DReflectNCHW(in_data, num, channels, in_height, in_width,
-                         out_height, out_width, pad_top, pad_left, out_data);
-      } else if (mode == "edge") {
-        Pad2DEdgeNCHW(in_data, num, channels, in_height, in_width, out_height,
-                      out_width, pad_top, pad_left, out_data);
-      } else {
-        Pad2DConstNCHW(in_data, num, channels, in_height, in_width, out_height,
-                       out_width, pad_top, pad_left, value, out_data);
-      }
-    } else {
-      const int channels = in_dims[3];
-      const int in_height = in_dims[1];
-      const int in_width = in_dims[2];
-      const int out_height = out_dims[1];
-      const int out_width = out_dims[2];
-      if (mode == "reflect") {
-        Pad2DReflectNHWC(in_data, num, channels, in_height, in_width,
-                         out_height, out_width, pad_top, pad_left, out_data);
-      } else if (mode == "edge") {
-        Pad2DEdgeNHWC(in_data, num, channels, in_height, in_width, out_height,
-                      out_width, pad_top, pad_left, out_data);
-      } else {
-        Pad2DConstNHWC(in_data, num, channels, in_height, in_width, out_height,
-                       out_width, pad_top, pad_left, value, out_data);
-      }
-    }
-  }
-};
-
-template <typename T>
-class Pad2dGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    int pads[4];
-    GetPaddings(pads, context);
-    auto mode = context.Attr<std::string>("mode");
-    auto data_format = context.Attr<std::string>("data_format");
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
-    auto d_in_dims = d_in->dims();
-    auto d_out_dims = d_out->dims();
-    const T* d_out_data = d_out->data<T>();
-    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
-    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
-    set_zero(context.template device_context<platform::CPUDeviceContext>(),
-             d_in, static_cast<T>(0));
-    const int pad_top = pads[0];
-    const int pad_left = pads[2];
-    const int num = d_in_dims[0];
-    if (data_format == "NCHW") {
-      const int channels = d_in_dims[1];
-      const int in_height = d_in_dims[2];
-      const int in_width = d_in_dims[3];
-      const int out_height = d_out_dims[2];
-      const int out_width = d_out_dims[3];
-      if (mode == "reflect") {
-        Pad2DGradReflectNCHW(d_in_data, num, channels, in_height, in_width,
-                             out_height, out_width, pad_top, pad_left,
-                             d_out_data);
-      } else if (mode == "edge") {
-        Pad2DGradEdgeNCHW(d_in_data, num, channels, in_height, in_width,
-                          out_height, out_width, pad_top, pad_left, d_out_data);
-      } else {
-        Pad2DGradConstNCHW(d_in_data, num, channels, in_height, in_width,
-                           out_height, out_width, pad_top, pad_left,
-                           d_out_data);
-      }
-    } else {
-      const int channels = d_in_dims[3];
-      const int in_height = d_in_dims[1];
-      const int in_width = d_in_dims[2];
-      const int out_height = d_out_dims[1];
-      const int out_width = d_out_dims[2];
-      if (mode == "reflect") {
-        Pad2DGradReflectNHWC(d_in_data, num, channels, in_height, in_width,
-                             out_height, out_width, pad_top, pad_left,
-                             d_out_data);
-      } else if (mode == "edge") {
-        Pad2DGradEdgeNHWC(d_in_data, num, channels, in_height, in_width,
-                          out_height, out_width, pad_top, pad_left, d_out_data);
-      } else {
-        Pad2DGradConstNHWC(d_in_data, num, channels, in_height, in_width,
-                           out_height, out_width, pad_top, pad_left,
-                           d_out_data);
-      }
-    }
-  }
-};
-
-class Pad2dOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of Pad2dOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of Pad2dOp should not be null.");
-
-    auto x_dim = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(x_dim.size(), 4,
-                      "The size of input(X)'s dimension should be equal to 4.");
-
-    std::vector<int64_t> out_dims(x_dim.size());
-    auto data_format = ctx->Attrs().Get<std::string>("data_format");
-    out_dims[0] = x_dim[0];
-    if (ctx->HasInput("Paddings")) {
-      auto paddings_dim = ctx->GetInputDim("Paddings");
-      PADDLE_ENFORCE_EQ(
-          paddings_dim.size(), 1,
-          "Size of Input(Paddings)'s dimension should be equal to 1.");
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(paddings_dim[0], 4,
-                          "Shape of Input(Paddings) should be equal to [4].");
-      }
-      out_dims[1] = x_dim[1];
-      out_dims[2] = x_dim[2];
-      out_dims[3] = x_dim[3];
-    } else {
-      auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-      PADDLE_ENFORCE_EQ(paddings.size(), 4,
-                        "Size of paddings should be equal to 4.");
-      if (data_format == "NCHW") {
-        out_dims[1] = x_dim[1];  // channel
-        out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0))
-                          ? x_dim[2]
-                          : (x_dim[2] + paddings[0] + paddings[1]);  // height
-        out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0))
-                          ? x_dim[3]
-                          : (x_dim[3] + paddings[2] + paddings[3]);  // width
-      } else {                                                       // NHWC
-        out_dims[3] = x_dim[3];                                      // channel
-        out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0))
-                          ? x_dim[1]
-                          : (x_dim[1] + paddings[0] + paddings[1]);  // height
-        out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0))
-                          ? x_dim[2]
-                          : (x_dim[2] + paddings[2] + paddings[3]);  // width
-      }
-    }
-
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input of pad2d op. "
-             "The input should be a 4-D tensor with formate NCHW or NHWC.");
-    AddOutput("Out",
-              "The output of pad2d op. "
-              "A tensor with the same shape as X.");
-    AddInput("Paddings",
-             "A 1-D tensor to describe the padding rules."
-             "paddings=[0, 1, 2, 3] means "
-             "padding 0 row to top, 1 row to bottom, 2 columns to left "
-             "and 3 columns to right. Size of paddings must be 4.")
-        .AsDispensable();
-    AddAttr<std::vector<int>>(
-        "paddings",
-        "(vector<int>) "
-        "A list<int> to describe the padding rules."
-        "paddings=[0, 1, 2, 3] means "
-        "padding 0 row to top, 1 row to bottom, 2 columns to left "
-        "and 3 columns to right. Size of paddings must be 4.");
-    AddAttr<float>("pad_value",
-                   "(float, default 0.0) "
-                   "The value to fill the padded areas in constant mode.")
-        .SetDefault(0.0f);
-    AddAttr<std::string>("mode",
-                         "(float, default constant) "
-                         "Three modes: constant(default), reflect, edge.")
-        .SetDefault("constant");
-    AddAttr<std::string>(
-        "data_format",
-        "(string, default NCHW) Only used in "
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\". Specify the data format of the input data.")
-        .SetDefault("NCHW");
-    AddComment(R"DOC(
-Pad2d Operator.
-Pad 2-d images accordding to 'paddings' and 'mode'. 
-If mode is 'reflect', paddings[0] and paddings[1] must be no greater
-than height-1. And the width dimension has the same condition.
-
-Given that X is a channel of image from input:
-
-X = [[1, 2, 3],
-     [4, 5, 6]]
-
-Case 0:
-
-paddings = [0, 1, 2, 3],
-mode = 'constant'
-pad_value = 0
-
-Out = [[0, 0, 1, 2, 3, 0, 0, 0]
-       [0, 0, 4, 5, 6, 0, 0, 0]
-       [0, 0, 0, 0, 0, 0, 0, 0]]
-
-Case 1:
-
-paddings = [0, 1, 2, 1],
-mode = 'reflect'
-
-Out = [[3, 2, 1, 2, 3, 2]
-       [6, 5, 4, 5, 6, 5]
-       [3, 2, 1, 2, 3, 2]]
-
-Case 2:
-
-paddings = [0, 1, 2, 1],
-mode = 'edge'
-
-Out = [[1, 1, 1, 2, 3, 3]
-       [4, 4, 4, 5, 6, 6]
-       [4, 4, 4, 5, 6, 6]]
-)DOC");
-  }
-};
-
-class Pad2dOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-
-class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* bind = new framework::OpDesc();
-    bind->SetInput("X", Input("X"));
-    if (ForwardOp().Inputs().count("Paddings") > 0) {
-      bind->SetInput("Paddings", Input("Paddings"));
-    }
-    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    bind->SetAttrMap(Attrs());
-    bind->SetType("pad2d_grad");
-    return std::unique_ptr<framework::OpDesc>(bind);
-  }
-};
-
-// TODO(zjl): Paddings can also be skipped!
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(Pad2dOpGradNoNeedBufferVarsInference,
-                                      "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker,
-                  ops::Pad2dOpGradMaker);
-REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad,
-                  ops::Pad2dOpGradNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel<float>);
-REGISTER_OP_CPU_KERNEL(pad2d_grad, ops::Pad2dGradCPUKernel<float>);
diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu
deleted file mode 100644
index 72eca08b06b144335424a669241b5754beda758d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pad2d_op.cu
+++ /dev/null
@@ -1,463 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-using framework::Tensor;
-
-template <typename T>
-__global__ void Pad2DConstNCHW(const int nthreads, const T* in_data,
-                               const int num, const int channels,
-                               const int in_height, const int in_width,
-                               const int out_height, const int out_width,
-                               const int pad_top, const int pad_left, T value,
-                               T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int nc = index / out_width;
-    const int out_w = index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-    out_data[index] =
-        (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
-            ? value
-            : in_data[(nc * in_height + in_h) * in_width + in_w];
-  }
-}
-
-template <typename T>
-__global__ void Pad2DConstNHWC(const int nthreads, const T* in_data,
-                               const int num, const int channels,
-                               const int in_height, const int in_width,
-                               const int out_height, const int out_width,
-                               const int pad_top, const int pad_left, T value,
-                               T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int n = index / channels;
-    const int c = index % channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    const int in_h = out_h - pad_top;
-    const int in_w = out_w - pad_left;
-    out_data[index] =
-        (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
-            ? value
-            : in_data[((n * in_height + in_h) * in_width + in_w) * channels +
-                      c];
-  }
-}
-
-template <typename T>
-__global__ void Pad2DReflectNCHW(const int nthreads, const T* in_data,
-                                 const int num, const int channels,
-                                 const int in_height, const int in_width,
-                                 const int out_height, const int out_width,
-                                 const int pad_top, const int pad_left,
-                                 T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int nc = index / out_width;
-    const int out_w = index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-    in_h = max(in_h, -in_h);                     // reflect by 0
-    in_h = min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
-    in_w = max(in_w, -in_w);                     // reflect by 0
-    in_w = min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
-    out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w];
-  }
-}
-
-template <typename T>
-__global__ void Pad2DReflectNHWC(const int nthreads, const T* in_data,
-                                 const int num, const int channels,
-                                 const int in_height, const int in_width,
-                                 const int out_height, const int out_width,
-                                 const int pad_top, const int pad_left,
-                                 T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int n = index / channels;
-    const int c = index % channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-    in_h = max(in_h, -in_h);
-    in_h = min(in_h, 2 * in_height - in_h - 2);
-    in_w = max(in_w, -in_w);
-    in_w = min(in_w, 2 * in_width - in_w - 2);
-    out_data[index] =
-        in_data[((n * in_height + in_h) * in_width + in_w) * channels + c];
-  }
-}
-
-template <typename T>
-__global__ void Pad2DEdgeNCHW(const int nthreads, const T* in_data,
-                              const int num, const int channels,
-                              const int in_height, const int in_width,
-                              const int out_height, const int out_width,
-                              const int pad_top, const int pad_left,
-                              T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int nc = index / out_width;
-    const int out_w = index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
-    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-    out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w];
-  }
-}
-
-template <typename T>
-__global__ void Pad2DEdgeNHWC(const int nthreads, const T* in_data,
-                              const int num, const int channels,
-                              const int in_height, const int in_width,
-                              const int out_height, const int out_width,
-                              const int pad_top, const int pad_left,
-                              T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int n = index / channels;
-    const int c = index % channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
-    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-    out_data[index] =
-        in_data[((n * in_height + in_h) * in_width + in_w) * channels + c];
-  }
-}
-
-template <typename T>
-__global__ void Pad2DGradConstNCHW(const int in_size, T* d_in_data,
-                                   const int num, const int channels,
-                                   const int in_height, const int in_width,
-                                   const int out_height, const int out_width,
-                                   const int pad_top, const int pad_left,
-                                   const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(in_index, in_size) {
-    int nc = in_index / in_width;
-    const int out_w = in_index % in_width + pad_left;
-    const int out_h = nc % in_height + pad_top;
-    nc /= in_height;
-    d_in_data[in_index] =
-        d_out_data[(nc * out_height + out_h) * out_width + out_w];
-  }
-}
-
-template <typename T>
-__global__ void Pad2DGradConstNHWC(const int in_size, T* d_in_data,
-                                   const int num, const int channels,
-                                   const int in_height, const int in_width,
-                                   const int out_height, const int out_width,
-                                   const int pad_top, const int pad_left,
-                                   const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(in_index, in_size) {
-    int n = in_index / channels;
-    const int c = in_index % channels;
-    const int out_w = n % in_width + pad_left;
-    n /= in_width;
-    const int out_h = n % in_height + pad_top;
-    n /= in_height;
-    d_in_data[in_index] =
-        d_out_data[((n * out_height + out_h) * out_width + out_w) * channels +
-                   c];
-  }
-}
-
-template <typename T>
-__global__ void Pad2DGradReflectNCHW(const int out_size, T* d_in_data,
-                                     const int num, const int channels,
-                                     const int in_height, const int in_width,
-                                     const int out_height, const int out_width,
-                                     const int pad_top, const int pad_left,
-                                     const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
-    int nc = out_index / out_width;
-    const int out_w = out_index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-    in_h = max(in_h, -in_h);
-    in_w = max(in_w, -in_w);
-    in_h = min(in_h, 2 * in_height - in_h - 2);
-    in_w = min(in_w, 2 * in_width - in_w - 2);
-    atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
-              d_out_data[out_index]);
-  }
-}
-
-template <typename T>
-__global__ void Pad2DGradReflectNHWC(const int out_size, T* d_in_data,
-                                     const int num, const int channels,
-                                     const int in_height, const int in_width,
-                                     const int out_height, const int out_width,
-                                     const int pad_top, const int pad_left,
-                                     const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
-    const int c = out_index % channels;
-    int n = out_index / channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-    in_h = max(in_h, -in_h);
-    in_w = max(in_w, -in_w);
-    in_h = min(in_h, in_height * 2 - in_h - 2);
-    in_w = min(in_w, in_width * 2 - in_w - 2);
-    atomicAdd(
-        &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
-        d_out_data[out_index]);
-  }
-}
-
-template <typename T>
-__global__ void Pad2DGradEdgeNCHW(const int out_size, T* d_in_data,
-                                  const int num, const int channels,
-                                  const int in_height, const int in_width,
-                                  const int out_height, const int out_width,
-                                  const int pad_top, const int pad_left,
-                                  const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
-    int nc = out_index / out_width;
-    const int out_w = out_index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
-    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-    atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
-              d_out_data[out_index]);
-  }
-}
-
-template <typename T>
-__global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
-                                  const int num, const int channels,
-                                  const int in_height, const int in_width,
-                                  const int out_height, const int out_width,
-                                  const int pad_top, const int pad_left,
-                                  const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
-    const int c = out_index % channels;
-    int n = out_index / channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
-    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-    atomicAdd(
-        &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
-        d_out_data[out_index]);
-  }
-}
-
-static inline void GetPaddings(int* paddings,
-                               const framework::ExecutionContext& context) {
-  auto* paddings_t = context.Input<Tensor>("Paddings");
-  if (paddings_t) {
-    Tensor pads;
-    framework::TensorCopySync(*paddings_t, platform::CPUPlace(), &pads);
-    auto pads_data = pads.data<int>();
-    paddings[0] = pads_data[0];
-    paddings[1] = pads_data[1];
-    paddings[2] = pads_data[2];
-    paddings[3] = pads_data[3];
-  } else {
-    auto pads = context.Attr<std::vector<int>>("paddings");
-    std::copy(pads.begin(), pads.end(), paddings);
-  }
-}
-
-template <typename T>
-class Pad2dCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    int pads[4];
-    GetPaddings(pads, context);
-    auto mode = context.Attr<std::string>("mode");
-    auto data_format = context.Attr<std::string>("data_format");
-    T value = context.Attr<T>("pad_value");
-
-    auto* x = context.Input<Tensor>("X");
-    auto in_dims = x->dims();
-    const T* in_data = x->data<T>();
-    auto* out = context.Output<Tensor>("Out");
-    auto out_dims = out->dims();
-    if (data_format == "NCHW") {
-      out_dims[0] = in_dims[0];
-      out_dims[1] = in_dims[1];
-      out_dims[2] = in_dims[2] + pads[0] + pads[1];
-      out_dims[3] = in_dims[3] + pads[2] + pads[3];
-    } else {
-      out_dims[0] = in_dims[0];
-      out_dims[1] = in_dims[1] + pads[0] + pads[1];
-      out_dims[2] = in_dims[2] + pads[2] + pads[3];
-      out_dims[3] = in_dims[3];
-    }
-    T* out_data = out->mutable_data<T>(out_dims, context.GetPlace());
-    const int pad_top = pads[0];
-    const int pad_left = pads[2];
-    const int num = in_dims[0];
-
-    auto stream = context.cuda_device_context().stream();
-    int block = PADDLE_CUDA_NUM_THREADS;
-    const int out_size = out->numel();
-    int grid = (out_size + block - 1) / block;
-
-    if (data_format == "NCHW") {
-      const int channels = in_dims[1];
-      const int in_height = in_dims[2];
-      const int in_width = in_dims[3];
-      const int out_height = out_dims[2];
-      const int out_width = out_dims[3];
-      if (mode == "reflect") {
-        Pad2DReflectNCHW<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, out_data);
-      } else if (mode == "edge") {
-        Pad2DEdgeNCHW<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, out_data);
-      } else {
-        Pad2DConstNCHW<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, value, out_data);
-      }
-    } else {
-      const int channels = in_dims[3];
-      const int in_height = in_dims[1];
-      const int in_width = in_dims[2];
-      const int out_height = out_dims[1];
-      const int out_width = out_dims[2];
-      if (mode == "reflect") {
-        Pad2DReflectNHWC<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, out_data);
-      } else if (mode == "edge") {
-        Pad2DEdgeNHWC<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, out_data);
-      } else {
-        Pad2DConstNHWC<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, value, out_data);
-      }
-    }
-  }
-};
-
-template <typename T>
-class Pad2dGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    int pads[4];
-    GetPaddings(pads, context);
-    auto mode = context.Attr<std::string>("mode");
-    auto data_format = context.Attr<std::string>("data_format");
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
-    auto d_in_dims = d_in->dims();
-    auto d_out_dims = d_out->dims();
-    const T* d_out_data = d_out->data<T>();
-    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
-
-    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    set_zero(context.template device_context<platform::CUDADeviceContext>(),
-             d_in, static_cast<T>(0));
-
-    const int pad_top = pads[0];
-    const int pad_left = pads[2];
-    const int num = d_in_dims[0];
-
-    auto stream = context.cuda_device_context().stream();
-    int block = PADDLE_CUDA_NUM_THREADS;
-    const int out_size = d_out->numel();
-    const int in_size = d_in->numel();
-    int grid = (out_size + block - 1) / block;
-
-    if (data_format == "NCHW") {
-      const int channels = d_in_dims[1];
-      const int in_height = d_in_dims[2];
-      const int in_width = d_in_dims[3];
-      const int out_height = d_out_dims[2];
-      const int out_width = d_out_dims[3];
-      if (mode == "reflect") {
-        Pad2DGradReflectNCHW<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, d_out_data);
-      } else if (mode == "edge") {
-        Pad2DGradEdgeNCHW<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, d_out_data);
-      } else {
-        grid = (in_size + block - 1) / block;
-        Pad2DGradConstNCHW<T><<<grid, block, 0, stream>>>(
-            in_size, d_in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, d_out_data);
-      }
-    } else {
-      const int channels = d_in_dims[3];
-      const int in_height = d_in_dims[1];
-      const int in_width = d_in_dims[2];
-      const int out_height = d_out_dims[1];
-      const int out_width = d_out_dims[2];
-      if (mode == "reflect") {
-        Pad2DGradReflectNHWC<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, d_out_data);
-      } else if (mode == "edge") {
-        Pad2DGradEdgeNHWC<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, d_out_data);
-      } else {
-        grid = (in_size + block - 1) / block;
-        Pad2DGradConstNHWC<T><<<grid, block, 0, stream>>>(
-            in_size, d_in_data, num, channels, in_height, in_width, out_height,
-            out_width, pad_top, pad_left, d_out_data);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(pad2d, ops::Pad2dCUDAKernel<float>);
-REGISTER_OP_CUDA_KERNEL(pad2d_grad, ops::Pad2dGradCUDAKernel<float>);
diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc
deleted file mode 100644
index 31ed0a686f712bd286b4accda68716b156037dbc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pad_constant_like_op.cc
+++ /dev/null
@@ -1,226 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pad_constant_like_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class PadConstantLikeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of PadConstantLikeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of PadConstantLikeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of PadConstantLikeOp should not be null.");
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto y_dim = ctx->GetInputDim("Y");
-
-    PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(),
-                      "The dimention of X and Y should be the same.");
-
-    for (int i = 0; i < x_dim.size(); ++i) {
-      if ((!ctx->IsRuntime()) && ((x_dim[i] == -1) || (y_dim[i] == -1))) {
-        continue;
-      } else {
-        PADDLE_ENFORCE_GE(
-            x_dim[i], y_dim[i],
-            "expected X_dim[i] >= Y_dim[i], but received %d < %d for dim %d",
-            x_dim[i], y_dim[i], i);
-      }
-    }
-
-    ctx->SetOutputDim("Out", x_dim);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Y")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input of pad_constant_like op. "
-             "The input should be a k-D tensor(k > 0 and k < 7)");
-    AddInput("Y",
-             "The input of pad_constant_like op. "
-             "The input should be a k-D tensor(k > 0 and k < 7)");
-    AddOutput("Out",
-              "The output of pad_constant_like op. "
-              "A tensor with the same shape as X.");
-    AddAttr<float>("pad_value",
-                   "(float, default 0.0) "
-                   "The value to fill the padded areas.")
-        .SetDefault(0.0f);
-    AddComment(R"DOC(
-PadConstantLikeOp Operator.
-
-Pad input(Y) with a pad_value, the number of values padded to the edges of each
-axis is specified by the difference of the shape of X and Y.
-((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) unique pad widths for
-each axis.
-The input should be a k-D tensor(k > 0 and k < 7). As an example:
-
-case1:
-    Given:
-        X = [[1, 2],
-             [3, 4],
-             [1, 2],
-             [3, 4]]],
-        X.shape = (4, 2)
-
-        Y = [[5, 6],
-            [7, 8]],
-        Y.shape = (2, 2)
-
-    And
-        pad_value = 0,
-
-    Return:
-        Out = [[5, 6],
-               [7, 8],
-               [0, 0],
-               [0, 0]]
-        Out.shape = (4, 2)
-
-case2:
-    Given:
-        X = [[[[ 0,  1,  2],
-               [ 3,  4,  5]],
-              [[ 6,  7,  8],
-               [ 9, 10, 11]],
-              [[12, 13, 14],
-               [15, 16, 17]]],
-             [[[18, 19, 20],
-               [21, 22, 23]],
-              [[24, 25, 26],
-               [27, 28, 29]],
-              [[30, 31, 32],
-               [33, 34, 35]]]]
-        X.shape = (2, 3, 2, 3)
-
-        Y = [[[[35, 36, 37]],
-              [[38, 39, 40]],
-              [[41, 42, 43]]]]
-        Y.shape = (1, 3, 1, 3)
-
-    And
-        pad_value = -1,
-
-    Return:
-
-        Out = [[[[35, 36, 37],
-                 [-1, -1, -1]],
-                [[38, 39, 40],
-                 [-1, -1, -1]],
-                [[41, 42, 43],
-                 [-1, -1, -1]]],
-               [[[-1, -1, -1],
-                 [-1, -1, -1]],
-                [[-1, -1, -1],
-                 [-1, -1, -1]],
-                [[-1, -1, -1],
-                 [-1, -1, -1]]]]
-        Out.shape = (2, 3, 2, 3)
-)DOC");
-  }
-};
-
-class PadConstantLikeOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto y_dim = ctx->GetInputDim("Y");
-    auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(),
-                      "The dimention of X and Y should be the same.");
-
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, y_dim);
-      ctx->ShareLoD("Y", /*->*/ y_grad_name);
-
-      for (int i = 0; i < y_dim.size(); ++i) {
-        if ((!ctx->IsRuntime()) && ((dout_dim[i] == -1) || (y_dim[i] == -1))) {
-          continue;
-        } else {
-          PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i],
-                            "expected Out_dim[i] >= Y_dim[i], but received %d "
-                            "< %d for dim %d",
-                            dout_dim[i], y_dim[i], i);
-        }
-      }
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Y")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class PadConstantLikeOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *bind = new framework::OpDesc();
-    bind->SetType("pad_constant_like_grad");
-    bind->SetInput("Y", Input("Y"));
-    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    bind->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    bind->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(bind);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOp,
-                  ops::PadConstantLikeOpMaker, ops::PadConstantLikeOpGradMaker);
-REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    pad_constant_like,
-    ops::PadConstantLikeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PadConstantLikeKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pad_constant_like_grad,
-    ops::PadConstantLikeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PadConstantLikeGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/pad_constant_like_op.cu b/paddle/fluid/operators/pad_constant_like_op.cu
deleted file mode 100644
index 9e62a6dc9d34a96c59a08d0e5fd6cdd9f0d6d51d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pad_constant_like_op.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/pad_constant_like_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    pad_constant_like,
-    ops::PadConstantLikeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PadConstantLikeKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    pad_constant_like_grad,
-    ops::PadConstantLikeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PadConstantLikeGradKernel<paddle::platform::CUDADeviceContext,
-                                   double>);
diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h
deleted file mode 100644
index 01d66901afc49a487c344b039b65f547967e95ff..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pad_constant_like_op.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/padding.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class PadConstantLikeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto in_x = context.Input<framework::Tensor>("X");
-    auto in_y = context.Input<framework::Tensor>("Y");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    if (in_x->dims() == in_y->dims()) {
-      // TensorCopy(in_y, context.GetPlace(), context, out);
-      out->ShareDataWith(*in_y);
-      return;
-    }
-
-    T pad_value = context.Attr<T>("pad_value");
-    out->mutable_data<T>(context.GetPlace());
-
-    int rank = context.Input<framework::Tensor>("X")->dims().size();
-
-    std::vector<int> pads(rank * 2, 0);
-
-    for (int j = 0; j < rank; ++j) {
-      pads[j * 2] = 0;
-      pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
-    }
-
-    math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value,
-                                           *in_y, out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class PadConstantLikeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto in_y = context.Input<framework::Tensor>("Y");
-    auto in_dout =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_y = context.Output<framework::Tensor>(framework::GradVarName("Y"));
-
-    if (d_y == nullptr) {
-      return;
-    }
-
-    if (in_dout->dims() == in_y->dims()) {
-      // TensorCopy(in_dout, context.GetPlace(), context, d_y);
-      d_y->ShareDataWith(*in_dout);
-      return;
-    }
-
-    d_y->mutable_data<T>(context.GetPlace());
-    int rank = in_dout->dims().size();
-
-    std::vector<int> pads(static_cast<size_t>(rank) * 2, 0);
-    for (int j = 0; j < rank; ++j) {
-      pads[j * 2] = 0;
-      pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]);
-    }
-
-    math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *in_dout,
-                                               d_y);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
deleted file mode 100644
index 36dc8b0dbb3d3b6537b6395f4c831ac25b03a4c6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pad_op.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pad_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class PadOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of PadOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of PadOp should not be null.");
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
-                      "Size of paddings should be equal to 2 * dimension size "
-                      "of input tensor.");
-    for (size_t i = 0; i < paddings.size(); ++i) {
-      PADDLE_ENFORCE_GE(paddings[i], 0, "paddings should >= 0.");
-    }
-    std::vector<int64_t> out_dims(x_dim.size());
-    for (int i = 0; i < x_dim.size(); ++i) {
-      if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) {
-        out_dims[i] = -1;
-      } else {
-        out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
-      }
-    }
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
-    if (out_dims[0] == x_dim[0]) {
-      // Only pass LoD when the first dimension is equal between
-      // output and input.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-};
-
-class PadOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input of pad op. "
-             "The input should be a k-D tensor(k > 0 and k < 7)");
-    AddOutput("Out",
-              "The output of pad op. "
-              "A tensor with the same shape as X.");
-    AddAttr<std::vector<int>>(
-        "paddings",
-        "(vector<int>) "
-        "A list<int> to describe the padding rules for each dimension. "
-        "For 2-D image tensor, paddings=[0, 1, 2, 3] means "
-        "padding 0 row to top, 1 row to bottom, 2 columns to left "
-        "and 3 columns to right. Size of paddings should be equal to "
-        "2 * dimension size of the input tensor.");
-    AddAttr<float>("pad_value",
-                   "(float, default 0.0) "
-                   "The value to fill the padded areas.")
-        .SetDefault(0.0f);
-    AddComment(R"DOC(
-Pad Operator.
-
-Pad input into output, as specified by paddings and pad_value. 
-The input should be a k-D tensor(k > 0 and k < 7). As an example:
-
-Given:
-
-X = [[1, 2],
-     [3, 4]],
-
-paddings = [0, 1, 1, 2],
-
-and
-
-pad_value = 0,
-
-we have:
-
-Out = [[0, 1, 2, 0, 0]
-       [0, 3, 4, 0, 0]
-       [0, 0, 0, 0, 0]]
-
-)DOC");
-  }
-};
-
-class PadOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-      auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-      for (int i = 0; i < dout_dims.size(); ++i) {
-        if (ctx->IsRuntime() || (dout_dims[i] != -1)) {
-          dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
-        }
-      }
-      ctx->SetOutputDim(x_grad_name, dout_dims);
-    }
-  }
-};
-
-class PadOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* bind = new framework::OpDesc();
-    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    bind->SetAttrMap(Attrs());
-    bind->SetType("pad_grad");
-    return std::unique_ptr<framework::OpDesc>(bind);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker);
-REGISTER_OPERATOR(pad_grad, ops::PadOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    pad, ops::PadKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    pad_grad, ops::PadGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu
deleted file mode 100644
index 95098a8dca36594c3af60ad8488217e71c673a75..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pad_op.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/pad_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    pad, ops::PadKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    pad_grad, ops::PadGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h
deleted file mode 100644
index 32698dac4917e183cfe36c831787b049985b19b3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pad_op.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/padding.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class PadKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
-    T pad_value = context.Attr<T>("pad_value");
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    int rank = x->dims().size();
-    math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value, *x,
-                                           out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class PadGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
-    if (d_x == nullptr) {
-      return;
-    }
-
-    d_x->mutable_data<T>(context.GetPlace());
-    int rank = d_out->dims().size();
-    math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *d_out,
-                                               d_x);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc
deleted file mode 100644
index 59ba660af79bff02cd350afb3eb7675bfe8ac498..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pixel_shuffle_op.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pixel_shuffle_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class PixelShuffleOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of PixelShuffleOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of PixelShuffleOp should not be null.");
-
-    auto input_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
-    auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
-
-    PADDLE_ENFORCE(input_dims[1] % (upscale_factor * upscale_factor) == 0,
-                   "Upscale_factor should devide the number of channel");
-
-    auto output_dims = input_dims;
-    output_dims[0] = input_dims[0];
-    output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
-    output_dims[2] = input_dims[2] * upscale_factor;
-    output_dims[3] = input_dims[3] * upscale_factor;
-    ctx->SetOutputDim("Out", output_dims);
-  }
-};
-
-class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensor, default Tensor<float>), "
-        "the input feature data of PixelShuffleOp, the layout is [N C H W].");
-    AddOutput(
-        "Out",
-        "(Tensor, default Tensor<float>), the output of "
-        "PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor].");
-    AddAttr<int>("upscale_factor",
-                 "the factor to increase spatial resolution by.")
-        .SetDefault(1)
-        .AddCustomChecker([](const int& upscale_factor) {
-          PADDLE_ENFORCE_GE(upscale_factor, 1,
-                            "upscale_factor should be larger than 0.");
-        });
-
-    AddComment(R"DOC(
-		Pixel Shuffle operator
-		This operator rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)`
-    		to a tensor of shape :math:`(C, H \times r, W \times r)`.
-
-		This is useful for implementing efficient sub-pixel convolution
-    		with a stride of :math:`1/r`.
-
-		Please refer to the paper:
-		 `Real-Time Single Image and Video Super-Resolution Using an Efficient 
-		 Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
-    		by Shi et. al (2016) for more details. 
-
-        )DOC");
-  }
-};
-
-class PixelShuffleGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType("pixel_shuffle_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetAttrMap(Attrs());
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-class PixelShuffleGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@Grad) should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@Grad) should not be null");
-
-    auto do_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    PADDLE_ENFORCE(do_dims.size() == 4, "The layout of input is NCHW.");
-
-    auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
-
-    auto dx_dims = do_dims;
-    dx_dims[0] = do_dims[0];
-    dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor);
-    dx_dims[2] = do_dims[2] / upscale_factor;
-    dx_dims[3] = do_dims[3] / upscale_factor;
-    ctx->SetOutputDim(framework::GradVarName("X"), dx_dims);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker,
-                  ops::PixelShuffleGradMaker);
-
-REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    pixel_shuffle,
-    ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    pixel_shuffle_grad,
-    ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/pixel_shuffle_op.cu b/paddle/fluid/operators/pixel_shuffle_op.cu
deleted file mode 100644
index 6faf91079e1dac00b3516ccde8dc82cec73a79e6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pixel_shuffle_op.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pixel_shuffle_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    pixel_shuffle, ops::PixelShuffleOpKernel<plat::CUDADeviceContext, float>,
-    ops::PixelShuffleOpKernel<plat::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    pixel_shuffle_grad,
-    ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, float>,
-    ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h
deleted file mode 100644
index 1ae1c7e9d50cb9d701fd0e79337a1906f2f5d545..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pixel_shuffle_op.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class PixelShuffleOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    int factor = ctx.Attr<int>("upscale_factor");
-
-    auto in_dims = in->dims();
-    auto o_dims = out->dims();
-
-    framework::Tensor t;
-    t.ShareDataWith(*in);
-    t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]});
-
-    std::vector<int> axis = {0, 1, 4, 2, 5, 3};
-
-    framework::Tensor o;
-    o.ShareDataWith(*out);
-    o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor});
-
-    math::Transpose<DeviceContext, T, 6> trans;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    trans(dev_ctx, t, &o, axis);
-    out->Resize(o_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class PixelShuffleGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    dx->mutable_data<T>(ctx.GetPlace());
-
-    int factor = ctx.Attr<int>("upscale_factor");
-
-    auto do_dims = dout->dims();
-    auto dx_dims = dx->dims();
-
-    framework::Tensor t;
-    t.ShareDataWith(*dout);
-    t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor});
-
-    std::vector<int> axis = {0, 1, 3, 5, 2, 4};
-
-    framework::Tensor o;
-    o.ShareDataWith(*dx);
-    o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]});
-
-    math::Transpose<DeviceContext, T, 6> trans;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    trans(dev_ctx, t, &o, axis);
-    dx->Resize(dx_dims);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
deleted file mode 100644
index b26f127026804239afd2ebffcb1f93eb3011e238..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ /dev/null
@@ -1,190 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/pool_op.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
-using DataLayout = platform::DataLayout;
-using PoolingMode = platform::PoolingMode;
-template <typename T>
-using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
-
-template <typename T>
-class PoolCUDNNOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-
-    const Tensor *input = ctx.Input<Tensor>("X");
-    Tensor *output = ctx.Output<Tensor>("Out");
-
-    const T *input_data = input->data<T>();
-    T *output_data = output->mutable_data<T>(ctx.GetPlace());
-
-    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-    bool exclusive = ctx.Attr<bool>("exclusive");
-    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    if (ctx.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(input->dims()[i + 2]);
-      }
-    }
-
-    // ------------------- cudnn descriptors ---------------------
-    ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor output_desc;
-    ScopedPoolingDescriptor pool_desc;
-    DataLayout layout;
-
-    if (strides.size() == 2U) {
-      layout = DataLayout::kNCHW;
-    } else {
-      layout = DataLayout::kNCDHW;
-    }
-
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize<int>(input->dims()));
-    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize<int>(output->dims()));
-
-    PoolingMode pooling_mode;
-    if (pooling_type == "max") {
-      pooling_mode = PoolingMode::kMaximum;
-    } else {
-      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
-                               : PoolingMode::kAverageInclusive;
-    }
-
-    cudnnPoolingDescriptor_t cudnn_pool_desc =
-        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
-
-    // ------------------- cudnn pool algorithm ---------------------
-    auto handle = ctx.cuda_device_context().cudnn_handle();
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
-        handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
-        cudnn_output_desc, output_data));
-  }
-};
-
-template <typename T>
-class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-
-    const Tensor *input = ctx.Input<Tensor>("X");
-    const Tensor *output = ctx.Input<Tensor>("Out");
-    const Tensor *output_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-    bool exclusive = ctx.Attr<bool>("exclusive");
-    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-
-    if (ctx.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(input->dims()[i + 2]);
-      }
-    }
-
-    const T *input_data = input->data<T>();
-    const T *output_data = output->data<T>();
-    const T *output_grad_data = output_grad->data<T>();
-
-    // ------------------- cudnn descriptors ---------------------
-    ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor output_desc;
-    ScopedPoolingDescriptor pool_desc;
-    DataLayout layout;
-
-    if (strides.size() == 2U) {
-      layout = DataLayout::kNCHW;
-    } else {
-      layout = DataLayout::kNCDHW;
-    }
-
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize<int>(input->dims()));
-    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize<int>(output->dims()));
-
-    PoolingMode pooling_mode;
-    if (pooling_type == "max") {
-      if (FLAGS_cudnn_deterministic) {
-        pooling_mode = PoolingMode::kMaximumDeterministic;
-      } else {
-        pooling_mode = PoolingMode::kMaximum;
-      }
-    } else {
-      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
-                               : PoolingMode::kAverageInclusive;
-    }
-
-    cudnnPoolingDescriptor_t cudnn_pool_desc =
-        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
-
-    // ------------------- cudnn pool algorithm ---------------------
-    auto handle = ctx.cuda_device_context().cudnn_handle();
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    if (input_grad) {
-      T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      // Because beta is zero, it is unnecessary to reset input_grad.
-
-      CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward(
-          handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
-          cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
-          &beta, cudnn_input_desc, input_grad_data));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<double>,
-                   ops::PoolCUDNNOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>,
-                   ops::PoolCUDNNGradOpKernel<plat::float16>);
-
-REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<double>,
-                   ops::PoolCUDNNOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>);
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
deleted file mode 100644
index af0665f4a12acfa9fd9c0642da671af57f9e3f89..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_op.cc
+++ /dev/null
@@ -1,504 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pool_op.h"
-#include <unordered_map>
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
-                   bool ceil_mode) {
-  int output_size;
-  if (!ceil_mode) {
-    output_size = (input_size - filter_size + 2 * padding) / stride + 1;
-  } else {
-    output_size =
-        (input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
-  }
-  PADDLE_ENFORCE(output_size > 0,
-                 "Due to the settings of padding(%d), filter_size(%d) and "
-                 "stride(%d), the output size is less than 0, please check "
-                 "again. Input_size:%d",
-                 padding, filter_size, stride, input_size);
-  return output_size;
-}
-
-void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "Out(Output) of Pooling should not be null.");
-
-  auto in_x_dims = ctx->GetInputDim("X");
-
-  std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
-  std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
-  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-  bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
-  bool adaptive = ctx->Attrs().Get<bool>("adaptive");
-
-  PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
-                 "Pooling intput should be 4-D or 5-D tensor.");
-
-  if (ctx->Attrs().Get<bool>("global_pooling")) {
-    ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
-    for (size_t i = 0; i < ksize.size(); ++i) {
-      paddings[i] = 0;
-      ksize[i] = static_cast<int>(in_x_dims[i + 2]);
-    }
-  }
-
-  PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
-                 "Input size and pooling size should be consistent.");
-  PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
-                    "Strides size and pooling size should be the same.");
-  PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
-                    "Paddings size and pooling size should be the same.");
-
-  std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
-  if (adaptive) {
-    output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
-  } else {
-    for (size_t i = 0; i < ksize.size(); ++i) {
-      output_shape.push_back(PoolOutputSize(
-          in_x_dims[i + 2], ksize[i], paddings[i], strides[i], ceil_mode));
-    }
-  }
-  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-  ctx->ShareLoD("X", "Out");
-}
-
-framework::OpKernelType PoolOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library_{framework::LibraryType::kPlain};
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-
-#ifdef PADDLE_WITH_CUDA
-  if (platform::CanCUDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kCUDNN;
-  }
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-  if (library_ == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kMKLDNN;
-    layout_ = framework::DataLayout::kMKLDNN;
-  }
-#endif
-
-  return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
-                                 layout_, library_);
-}
-
-void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                 "Input(X@GRAD) should not be null.");
-  ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-}
-
-framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library_{framework::LibraryType::kPlain};
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-
-#ifdef PADDLE_WITH_CUDA
-  if (platform::CanCUDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kCUDNN;
-  }
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-  if (library_ == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kMKLDNN;
-    layout_ = framework::DataLayout::kMKLDNN;
-  }
-#endif
-
-  auto input_data_type = ctx.Input<Tensor>("X")->type();
-  if (input_data_type == framework::proto::VarType::FP16) {
-    PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
-                      "float16 can only be used when CUDNN is used");
-  }
-  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
-                                 library_);
-}
-
-void Pool2dOpMaker::Make() {
-  AddInput(
-      "X",
-      "(Tensor) The input tensor of pooling operator. "
-      "The format of input tensor is NCHW, where N is batch size, C is the "
-      "number of channels, H is the height of the feature, "
-      "and W is the width of the feature.");
-  AddOutput("Out",
-            "(Tensor) The output tensor of pooling operator. "
-            "The format of output tensor is also NCHW, "
-            "where N is batch size, C is the number of channels, "
-            "H is the height of the feature, "
-            "and W is the width of the feature.");
-
-  AddAttr<std::string>("pooling_type",
-                       "(string), pooling type, can be \"max\" for max-pooling "
-                       "and \"avg\" for average-pooling.")
-      .InEnum({"max", "avg"});
-  AddAttr<std::vector<int>>("ksize",
-                            "(vector<int>) The pooling window "
-                            "size(height, width) of the pooling operator. "
-                            "If global_pooling = true, ksize and paddings will "
-                            "be ignored.");  // TODO(Chengduo): Add checker.
-                                             // (Currently,
-  // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>(
-      "global_pooling",
-      "(bool, default false) Whether to use the global pooling. "
-      "If global_pooling = true, kernel size and paddings will be ignored.")
-      .SetDefault(false);
-  AddAttr<std::vector<int>>("strides",
-                            "(vector<int>, default {1, 1}), strides(height, "
-                            "width) of pooling operator.")
-      .SetDefault({1, 1});
-  // TODO(Chengduo): Add checker. (Currently,
-  // TypedAttrChecker don't support vector type.)
-  AddAttr<std::vector<int>>(
-      "paddings",
-      "(vector<int>, default {0,0}), paddings(height, width) of pooling "
-      "operator."
-      "If global_pooling = true, paddings and kernel size will be ignored.")
-      .SetDefault({0, 0});
-  AddAttr<bool>(
-      "exclusive",
-      "(bool, default True) When true, will exclude the zero-padding in the "
-      "averaging calculating, otherwise, include the zero-padding. Note, it "
-      "is only used when pooling_type is avg. The default is True.")
-      .SetDefault(true);
-  AddAttr<bool>(
-      "adaptive",
-      "(bool, default False) When true, will perform adaptive pooling instead, "
-      "output shape in H and W dimensions will be same as ksize, input data "
-      "will be divided into grids specify by ksize averagely and perform "
-      "pooling in each grid area to get output pooling value.")
-      .SetDefault(false);
-
-  AddAttr<bool>(
-      "use_cudnn",
-      "(bool, default false) Only used in cudnn kernel, need install cudnn")
-      .SetDefault(false);
-  AddAttr<bool>(
-      "ceil_mode",
-      "(bool, default false) Whether to use the ceil function to calculate "
-      "output height and width. False is the default. If it is set to False, "
-      "the floor function will be used.")
-      .SetDefault(false);
-  AddAttr<bool>("use_mkldnn",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<bool>("use_quantizer",
-                "(bool, default false) "
-                "Set to true for operators that should be quantized and use "
-                "int8 kernel. "
-                "Only used on CPU.")
-      .SetDefault(false);
-  AddAttr<std::string>(
-      "data_format",
-      "(string, default NCHW) Only used in "
-      "An optional string from: \"NHWC\", \"NCHW\". "
-      "Defaults to \"NHWC\". Specify the data format of the output data, "
-      "the input will be transformed automatically. ")
-      .SetDefault("AnyLayout");
-  AddAttr<bool>("is_test",
-                "(bool, default false) Set to true for inference only, false "
-                "for training. Some layers may run faster when this is true.")
-      .SetDefault(false);
-
-  // TODO(dzhwinter): need to registered layout transform function
-
-  AddComment(R"DOC(
-The pooling2d operation calculates the output based on
-the input, pooling_type and ksize, strides, paddings parameters.
-Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
-number of channels, H is the height of the feature, and W is the width of the feature.
-Parameters(ksize, strides, paddings) are two elements.
-These two elements represent height and width, respectively.
-The input(X) size and output(Out) size may be different.
-
-Example:
-
-  Input:
-
-       X shape: $(N, C, H_{in}, W_{in})$
-
-  Output:
-
-       Out shape: $(N, C, H_{out}, W_{out})$
-
-  For ceil_mode = false:
-       $$
-       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
-       $$
-       $$
-       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
-       $$
-  For ceil_mode = true:
-       $$
-       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1
-       $$
-       $$
-       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
-       $$
-
-  For exclusive = false:
-       $$
-       hstart = i * strides[0] - paddings[0]
-       $$
-       $$
-       hend = hstart + ksize[0]
-       $$
-       $$
-       wstart = j * strides[1] - paddings[1]
-       $$
-       $$
-       wend = wstart + ksize[1]
-       $$
-       $$
-       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
-       $$
-
-  For exclusive = true:
-       $$
-       hstart = max(0, i * strides[0] - paddings[0])
-       $$
-       $$
-       hend = min(H, hstart + ksize[0])
-       $$
-       $$
-       wstart = max(0, j * strides[1] - paddings[1])
-       $$
-       $$
-       wend = min(W, wstart + ksize[1])
-       $$
-       $$
-       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
-       $$
-
-)DOC");
-}
-
-class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
-  }
-};
-
-void Pool3dOpMaker::Make() {
-  AddInput("X",
-           "(Tensor) The input tensor of pooling operator. "
-           "The format of input tensor is NCDHW, where N is batch size, C is "
-           "the number of channels, and D, H and W is the depth, height and "
-           "width of "
-           "the feature, respectively.");
-  AddOutput("Out",
-            "(Tensor) The output tensor of pooling operator."
-            "The format of output tensor is also NCDHW, "
-            "where N is batch size, C is "
-            "the number of channels, and D, H and W is the depth, height and "
-            "width of the feature, respectively.");
-
-  AddAttr<std::string>("pooling_type",
-                       "(string) Pooling type, can be \"max\" for max-pooling "
-                       "and \"avg\" for average-pooling.")
-      .InEnum({"max", "avg"});
-  AddAttr<std::vector<int>>(
-      "ksize",
-      "(vector<int>) The pooling window size(depth, height, "
-      "width) of pooling operator. "
-      "If global_pooling = true, ksize and paddings will "
-      "be ignored.");  // TODO(Chengduo): Add checker.
-                       // (Currently,
-  // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>(
-      "global_pooling",
-      "(bool, default false) Whether to use the global pooling. "
-      "If global_pooling = true, kernel size and paddings will be ignored.")
-      .SetDefault(false);
-  AddAttr<std::vector<int>>(
-      "strides",
-      "(vector<int>, default {1,1,1}) Strides(depth, height, "
-      "width) of the pooling operator.")
-      .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
-                               // TypedAttrChecker don't support vector type.)
-  AddAttr<std::vector<int>>(
-      "paddings",
-      "(vector<int>, default {0,0,0}), paddings(depth, height, "
-      "width) of pooling operator. "
-      "If global_pooling = true, ksize and paddings will be ignored.")
-      .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
-                               // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>(
-      "exclusive",
-      "(bool, default True) When true, will exclude the zero-padding in the "
-      "averaging calculating, otherwise, include the zero-padding. Note, it "
-      "is only used when pooling_type is avg. The default is True.")
-      .SetDefault(true);
-  AddAttr<bool>(
-      "adaptive",
-      "(bool, default False) When true, will perform adaptive pooling instead, "
-      "output shape in H and W dimensions will be same as ksize, input data "
-      "will be divided into grids specify by ksize averagely and perform "
-      "pooling in each grid area to get output pooling value.")
-      .SetDefault(false);
-
-  AddAttr<bool>(
-      "use_cudnn",
-      "(bool, default false) Only used in cudnn kernel, need install cudnn")
-      .SetDefault(false);
-  AddAttr<bool>(
-      "ceil_mode",
-      "(bool, default false) Whether to use the ceil function to calculate "
-      "output height and width. False is the default. If it is set to False, "
-      "the floor function will be used.")
-      .SetDefault(false);
-  AddAttr<bool>("use_mkldnn",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
-  AddAttr<std::string>(
-      "data_format",
-      "(string, default NCHW) Only used in "
-      "An optional string from: \"NHWC\", \"NCHW\". "
-      "Defaults to \"NHWC\". Specify the data format of the output data, "
-      "the input will be transformed automatically. ")
-      .SetDefault("AnyLayout");
-  // TODO(dzhwinter): need to registered layout transform function
-
-  AddComment(R"DOC(
-Pool3d Operator.
-
-The pooling3d operation calculates the output based on
-the input, pooling_type, ksize, strides, and paddings parameters.
-Input(X) and output(Out) are in NCDHW format, where N is batch
-size, C is the number of channels, and D, H and W are the depth, height and
-width of the feature, respectively. Parameters(ksize, strides, paddings)
-are three elements. These three elements represent depth, height and
-width, respectively. The input(X) size and output(Out) size may be different.
-
-Example:
-  Input:
-       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
-  Output:
-       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
-  For ceil_mode = false:
-       $$
-       D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
-       $$
-       $$
-       H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1
-       $$
-       $$
-       W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
-       $$
-  For ceil_mode = true:
-       $$
-       D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
-       $$
-       $$
-       H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
-       $$
-       $$
-       W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
-       $$
-
-  For exclusive = false:
-       $$
-       dstart = i * strides[0] - paddings[0]
-       $$
-       $$
-       dend = dstart + ksize[0]
-       $$
-       $$
-       hstart = j * strides[1] - paddings[1]
-       $$
-       $$
-       hend = hstart + ksize[1]
-       $$
-       $$
-       wstart = k * strides[2] - paddings[2]
-       $$
-       $$
-       wend = wstart + ksize[2]
-       $$
-       $$
-       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
-       $$
-
-  For exclusive = true:
-       $$
-       dstart = max(0, i * strides[0] - paddings[0])
-       $$
-       $$
-       dend = min(D, dstart + ksize[0])
-       $$
-       $$
-       hend = min(H, hstart + ksize[1])
-       $$
-       $$
-       wstart = max(0, k * strides[2] - paddings[2])
-       $$
-       $$
-       wend = min(W, wstart + ksize[2])
-       $$
-       $$
-       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
-       $$
-
-)DOC");
-}
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
-                  ops::PoolOpInferVarType,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
-                  ops::PoolOpInferVarType,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool3d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/pool_op.cu.cc b/paddle/fluid/operators/pool_op.cu.cc
deleted file mode 100644
index 37bc14e2cbb3437a750c416d39b7b914370961b0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_op.cu.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pool_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    pool2d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    pool2d_grad,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    pool3d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    pool3d_grad,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
deleted file mode 100644
index 6c5900bd0f55bb817834de6d1f3c5e4eb7f282b9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_op.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/pooling.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class PoolOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class PoolOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-template <typename DeviceContext, typename T>
-class PoolKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-
-    std::string pooling_type = context.Attr<std::string>("pooling_type");
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    bool exclusive = context.Attr<bool>("exclusive");
-    bool adaptive = context.Attr<bool>("adaptive");
-    if (context.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
-      }
-    }
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    switch (ksize.size()) {
-      case 2: {
-        if (pooling_type == "max") {
-          paddle::operators::math::Pool2dFunctor<
-              DeviceContext, paddle::operators::math::MaxPool<T>, T>
-              pool2d_forward;
-          paddle::operators::math::MaxPool<T> pool_process;
-          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         true, false, out);
-
-        } else if (pooling_type == "avg") {
-          paddle::operators::math::Pool2dFunctor<
-              DeviceContext, paddle::operators::math::AvgPool<T>, T>
-              pool2d_forward;
-          paddle::operators::math::AvgPool<T> pool_process;
-          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         exclusive, adaptive, out);
-        }
-      } break;
-      case 3: {
-        if (pooling_type == "max") {
-          paddle::operators::math::Pool3dFunctor<
-              DeviceContext, paddle::operators::math::MaxPool<T>, T>
-              pool3d_forward;
-          paddle::operators::math::MaxPool<T> pool_process;
-          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         true, false, out);
-        } else if (pooling_type == "avg") {
-          paddle::operators::math::Pool3dFunctor<
-              DeviceContext, paddle::operators::math::AvgPool<T>, T>
-              pool3d_forward;
-          paddle::operators::math::AvgPool<T> pool_process;
-          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         exclusive, adaptive, out);
-        }
-      } break;
-      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class PoolGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    const Tensor* out = context.Input<Tensor>("Out");
-    const Tensor* out_grad =
-        context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-
-    std::string pooling_type = context.Attr<std::string>("pooling_type");
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    bool exclusive = context.Attr<bool>("exclusive");
-    bool adaptive = context.Attr<bool>("adaptive");
-
-    if (context.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
-      }
-    }
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    if (in_x_grad) {
-      in_x_grad->mutable_data<T>(context.GetPlace());
-      paddle::operators::math::SetConstant<DeviceContext, T> set_constant;
-      set_constant(dev_ctx, in_x_grad, 0.0);
-
-      switch (ksize.size()) {
-        case 2: {
-          if (pooling_type == "max") {
-            paddle::operators::math::MaxPool2dGradFunctor<DeviceContext, T>
-                pool2d_backward;
-            pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, in_x_grad);
-          } else if (pooling_type == "avg") {
-            paddle::operators::math::Pool2dGradFunctor<
-                DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
-                pool2d_backward;
-            paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, pool_process, exclusive, adaptive,
-                            in_x_grad);
-          }
-        } break;
-        case 3: {
-          if (pooling_type == "max") {
-            paddle::operators::math::MaxPool3dGradFunctor<DeviceContext, T>
-                pool3d_backward;
-            pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, in_x_grad);
-          } else if (pooling_type == "avg") {
-            paddle::operators::math::Pool3dGradFunctor<
-                DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
-                pool3d_backward;
-            paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, pool_process, exclusive, adaptive,
-                            in_x_grad);
-          }
-        } break;
-        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
deleted file mode 100644
index 91bd2a902f7cc53f76682d99195ed0d2c08352a3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ /dev/null
@@ -1,321 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pool_with_index_op.h"
-
-namespace paddle {
-namespace operators {
-
-inline int MaxPoolOutputSize(int input_size, int filter_size, int padding,
-                             int stride) {
-  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
-  return output_size;
-}
-
-class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of Pooling should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of Pooling should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Mask"),
-                   "Output(Mask) of Pooling should not be null.");
-
-    auto in_x_dims = ctx->GetInputDim("X");
-
-    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    bool adaptive = ctx->Attrs().Get<bool>("adaptive");
-
-    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
-                   "Pooling intput should be 4-D or 5-D tensor.");
-
-    if (ctx->Attrs().Get<bool>("global_pooling")) {
-      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
-      }
-    }
-
-    PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
-                   "Input size and pooling size should be consistent.");
-    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
-                      "Strides size and pooling size should be the same.");
-    PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
-                      "Paddings size and pooling size should be the same.");
-
-    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
-    if (adaptive) {
-      output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
-    } else {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                                 paddings[i], strides[i]));
-      }
-    }
-    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-    ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Input(X@GRAD) should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor of pooling operator. "
-        "The format of input tensor is NCHW, where N is batch size, C is the "
-        "number of channels, H is the height of the image, "
-        "and W is the width of the image.");
-    AddOutput("Out",
-              "(Tensor) The output tensor of pooling operator. "
-              "The format of output tensor is also NCHW, "
-              "where N is batch size, C is "
-              "the number of channels, H is the height of the image "
-              "and W is the width of the image.");
-    AddOutput("Mask",
-              "(Tensor) The Mask tensor of pooling operator."
-              "The format of output tensor is also NCHW, "
-              "where N is batch size, C is the number of channels, "
-              "H is the height of the image, "
-              "and W is the width of the image. "
-              "It represents the index in the current feature map.");
-
-    AddAttr<std::vector<int>>("ksize",
-                              "(vector<int>) The pooling window size(height, "
-                              "width) of pooling operator. "
-                              "If global_pooling = true, ksize and paddings "
-                              "will be ignored.");  // TODO(Chengduo): Add
-                                                    // checker. (Currently,
-    // TypedAttrChecker don't support vector type.)
-    AddAttr<bool>(
-        "global_pooling",
-        "(bool, default:false) Whether to use the global pooling. "
-        "If global_pooling = true, ksize and paddings will be ignored.")
-        .SetDefault(false);
-    AddAttr<bool>(
-        "adaptive",
-        "(bool, default False) When true, will perform adaptive pooling "
-        "instead, "
-        "output shape in H and W dimensions will be same as ksize, input data "
-        "will be divided into grids specify by ksize averagely and perform "
-        "pooling in each grid area to get output pooling value.")
-        .SetDefault(false);
-    AddAttr<std::vector<int>>("strides",
-                              "(vector<int>, default {1, 1}), strides(height, "
-                              "width) of pooling operator.")
-        .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
-    // TypedAttrChecker don't support vector type.)
-    AddAttr<std::vector<int>>(
-        "paddings",
-        "(vector<int>, default:{0, 0}), paddings(height, width) of pooling "
-        "operator. "
-        "If global_pooling = true, paddings and will be ignored.")
-        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
-    // TypedAttrChecker don't support vector type.)
-
-    AddComment(R"DOC(
-MaxPool2d Operator.
-
-The maxPooling2d with index operation calculates the output and the mask
-based on the input, ksize, strides, and paddings parameters. Input(X) and
-output(Out, Mask) are in NCHW format, where N is batch size, C is the
-number of channels, H is the height of the feature, 
-and W is the width of the feature.
-Parameters(ksize, strides, paddings) are two elements.
-These two elements represent height and width, respectively.
-The input(X) size and output(Out, Mask) size may be different.
-
-Example:
-  Input:
-       X shape: $(N, C, H_{in}, W_{in})$
-  Output:
-       Out shape: $(N, C, H_{out}, W_{out})$
-       Mask shape: $(N, C, H_{out}, W_{out})$
-  Where
-       $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
-       $$
-  
-  For adaptive = true:
-       $$
-       H_{out} = ksize[0]   W_{out} = ksize[1]
-       $$
-      
-
-)DOC");
-  }
-};
-
-class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor) The input tensor of pooling operator. "
-             "The format of input tensor is NCDHW, where N is batch size, C is "
-             "the number of channels, and D, H and W are the depth, height and "
-             "width of "
-             "the image, respectively");
-    AddOutput("Out",
-              "(Tensor) The output tensor of pooling operator. "
-              "The format of output tensor is also NCDHW, "
-              "where N is the batch size, C is the number of channels, "
-              "and D, H and W are the depth, height and "
-              "width of the image, respectively.");
-    AddOutput("Mask",
-              "(Tensor) The Mask tensor of pooling operator. "
-              "The format of output tensor is also NCDHW, "
-              "where N is the batch size, C is the number of channels, and "
-              "D, H and W are the depth, height and width "
-              "of the image, respectively. "
-              "It represents the index in the current feature map.");
-
-    AddAttr<std::vector<int>>("ksize",
-                              "(vector<int>) The pooling window size(depth, "
-                              "height, width) of pooling operator. "
-                              "If global_pooling = true, ksize and paddings "
-                              "will be ignored.");  // TODO(Chengduo): Add
-                                                    // checker. (Currently,
-    // TypedAttrChecker don't support vector type.)
-    AddAttr<bool>(
-        "global_pooling",
-        "(bool, default false) Whether to use the global pooling. "
-        "If global_pooling = true, ksize and paddings will be ignored.")
-        .SetDefault(false);
-    AddAttr<bool>(
-        "adaptive",
-        "(bool, default False) When true, will perform adaptive pooling "
-        "instead, "
-        "output shape in H and W dimensions will be same as ksize, input data "
-        "will be divided into grids specify by ksize averagely and perform "
-        "pooling in each grid area to get output pooling value.")
-        .SetDefault(false);
-    AddAttr<std::vector<int>>("strides",
-                              "(vector<int>, default {1,1,1}), strides(depth, "
-                              "height, width) of pooling operator.")
-        .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
-    // TypedAttrChecker don't support vector type.)
-    AddAttr<std::vector<int>>(
-        "paddings",
-        "(vector, default {0,0,0}), paddings(depth, "
-        "height, width) of pooling operator. "
-        "If global_pooling = true, paddings and ksize will be ignored.")
-        .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
-    // TypedAttrChecker don't support vector type.)
-
-    AddComment(R"DOC(
-MaxPool3d Operator.
-
-The maxpooling3d with index operation calculates the output and the mask
-based on the input and ksize, strides, paddings parameters.
-Input(X) and output(Out, Mask) are in NCDHW format, where N is batch
-size, C is the number of channels, and D, H and W are the depth, height and
-width of the feature, respectively. 
-Parameters(ksize, strides, paddings) are three elements.
-These three elements represent depth, height and width, respectively.
-The input(X) size and output(Out, Mask) size may be different.
-
-Example:
-  Input:
-       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
-  Output:
-       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
-       Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
-  Where
-       $$
-       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
-       $$
-  
-  For adaptive = true:
-       $$
-       D_{out} = ksize[0]   H_{out} = ksize[1]   W_{out} = ksize[2]
-       $$
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
-                  ops::MaxPool2dWithIndexOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, float, int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, double,
-                                int>);
-REGISTER_OP_CPU_KERNEL(
-    max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
-                                    int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>);
-
-REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
-                  ops::MaxPool3dWithIndexOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, float, int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, double,
-                                int>);
-REGISTER_OP_CPU_KERNEL(
-    max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
-                                    int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>);
diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc
deleted file mode 100644
index 5497dcbd9ce255f833df24989d7a76c40bcbca06..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_with_index_op.cu.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pool_with_index_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, float,
-                                int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, double,
-                                int>);
-REGISTER_OP_CUDA_KERNEL(
-    max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
-                                    int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>);
-
-REGISTER_OP_CUDA_KERNEL(
-    max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, float,
-                                int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, double,
-                                int>);
-REGISTER_OP_CUDA_KERNEL(
-    max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
-                                    int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>);
diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h
deleted file mode 100644
index a6bec121d4ff002ec80a0f47510e4431176e0ddc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_with_index_op.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/pooling.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-    Tensor* mask = context.Output<Tensor>("Mask");
-
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    bool adaptive = context.Attr<bool>("adaptive");
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    if (context.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
-      }
-    }
-
-    switch (ksize.size()) {
-      case 2: {
-        paddle::operators::math::MaxPool2dWithIndexFunctor<DeviceContext, T1,
-                                                           T2>
-            pool2d_forward;
-        pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
-                       mask);
-      } break;
-      case 3: {
-        paddle::operators::math::MaxPool3dWithIndexFunctor<DeviceContext, T1,
-                                                           T2>
-            pool3d_forward;
-        pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
-                       mask);
-      } break;
-      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* mask = context.Input<Tensor>("Mask");
-    const Tensor* out_grad =
-        context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    bool adaptive = context.Attr<bool>("adaptive");
-    if (context.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(in_x_grad->dims()[i + 2]);
-      }
-    }
-
-    if (in_x_grad) {
-      in_x_grad->mutable_data<T1>(context.GetPlace());
-      auto& device_ctx = context.template device_context<DeviceContext>();
-      math::set_constant(device_ctx, in_x_grad, 0);
-
-      switch (ksize.size()) {
-        case 2: {
-          paddle::operators::math::MaxPool2dWithIndexGradFunctor<DeviceContext,
-                                                                 T1, T2>
-              pool2d_backward;
-          pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                          paddings, adaptive, in_x_grad);
-        } break;
-        case 3: {
-          paddle::operators::math::MaxPool3dWithIndexGradFunctor<DeviceContext,
-                                                                 T1, T2>
-              pool3d_backward;
-          pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                          paddings, adaptive, in_x_grad);
-        } break;
-        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc
deleted file mode 100644
index e917e778e41ff8994f248e905635da702b428fc2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/positive_negative_pair_op.h"
-
-namespace paddle {
-namespace operators {
-
-class PositiveNegativePairOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("Score"),
-        "Input(Score) of PositiveNegativePairOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Label"),
-        "Input(Label) of PositiveNegativePairOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("QueryID"),
-        "Input(QueryID) of PositiveNegativePairOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("PositivePair"),
-        "Output(PositivePair) of PositiveNegativePairOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("NegativePair"),
-        "Output(NegativePair) of PositiveNegativePairOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("NeutralPair"),
-        "Output(NeutralPair) of PositiveNegativePairOp should not be null.");
-    auto scalar_dim = framework::make_ddim({1});
-    if (ctx->HasInput("AccumulatePositivePair") ||
-        ctx->HasInput("AccumulateNegativePair") ||
-        ctx->HasInput("AccumulateNeutralPair")) {
-      PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") &&
-                         ctx->HasInput("AccumulateNegativePair") &&
-                         ctx->HasInput("AccumulateNeutralPair"),
-                     "All optional inputs(AccumulatePositivePair, "
-                     "AccumulateNegativePair, AccumulateNeutralPair) of "
-                     "PositiveNegativePairOp are required if one of them is "
-                     "specified.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulatePositivePair"), scalar_dim,
-                        "Shape of AccumulatePositivePair should be {1}.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNegativePair"), scalar_dim,
-                        "Shape of AccumulateNegativePair should be {1}.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim,
-                        "Shape of AccumulateNeutralPair should be {1}.");
-    }
-
-    auto score_dim = ctx->GetInputDim("Score");
-    auto label_dim = ctx->GetInputDim("Label");
-    auto query_dim = ctx->GetInputDim("QueryID");
-    PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor.");
-
-    if (ctx->IsRuntime() ||
-        (score_dim[0] > 0 && label_dim[0] > 0 && query_dim[0] > 0)) {
-      PADDLE_ENFORCE_EQ(
-          label_dim[0], score_dim[0],
-          "Tensor Score and Label should have the same height (batch size).");
-
-      PADDLE_ENFORCE_EQ(label_dim[1], 1,
-                        "The width of Label should be 1, i.e. each item should "
-                        "have a scalar label.");
-
-      PADDLE_ENFORCE(query_dim == label_dim,
-                     "QueryID should have the same shape as Label.");
-
-      if (ctx->HasInput("Weight")) {
-        PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
-                       "Weight should have the same shape as Label.");
-      }
-
-      int column = ctx->Attrs().Get<int>("column");
-      auto depth = score_dim[1];
-      PADDLE_ENFORCE(column < depth && column >= -depth,
-                     "Attribute column should be in the range of [-%l, %l)",
-                     depth, depth);
-    }
-
-    ctx->SetOutputDim("PositivePair", scalar_dim);
-    ctx->SetOutputDim("NegativePair", scalar_dim);
-    ctx->SetOutputDim("NeutralPair", scalar_dim);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Score")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Score",
-             "(Tensor, float) Model Score on an item (with "
-             "respect to QueryID). It's a 2-D tensor with shape [batch_size, "
-             "depth], where the column specified by the attribute \"column\" "
-             "is used as item score.");
-    AddInput("Label",
-             "(Tensor, float) Label of an item (with repsect to "
-             "QueryId). It's a 2-D tensor with shape [batch_size, 1].");
-    AddInput("QueryID",
-             "(Tensor, int64) Query ID that indicates the context. Its shape "
-             "should be the same as Label.");
-    AddInput(
-        "AccumulatePositivePair",
-        "(float) Optional. The accumulated number of positive pairs over a "
-        "stream of data. If provided, the output PositivePair will be "
-        "initialized with this number rather than 0. it won't be modified "
-        "in place.")
-        .AsDispensable();
-    AddInput(
-        "AccumulateNegativePair",
-        "(float) Optional. The accumulated number of negative pairs over a "
-        "stream of data. If provided, the output NegativePair will be "
-        "initialized with this number rather than 0. it won't be modified "
-        "in place.")
-        .AsDispensable();
-    AddInput("AccumulateNeutralPair",
-             "(float) Optional. The accumulated number of neutral pairs over a "
-             "stream of data. If provided, the output NeutralPair will be "
-             "initialized with this number rather than 0. it won't be modified "
-             "in place.")
-        .AsDispensable();
-    AddInput("Weight",
-             "(float) Optional. Weight of current item. If specified, its "
-             "shape should be the same as Label, and the meaning of the output "
-             "changes from numbers of pairs to the total sum of pairs' "
-             "weights. Weight of a pair of items is the average of their "
-             "weights.")
-        .AsDispensable();
-    AddOutput("PositivePair",
-              "(float) Number of positive pairs, i.e. the pairs of "
-              "items that are ranked correctly.");
-    AddOutput("NegativePair",
-              "(float) Number of negative pairs, i.e. the pairs of "
-              "items that are ranked incorrectly.");
-    AddOutput("NeutralPair",
-              "(float) Number of neutral pairs, i.e. the pairs of items "
-              "that have the same score.")
-        .AsDispensable();
-    AddAttr<int>(
-        "column",
-        "(int, default -1) The column position of Score used to rank items in "
-        "descending order. It must be in the range of [-rank(Score), "
-        "rank(Score)). "
-        "If `dim < 0`, the dim to reduce is `rank + dim`. "
-        "Noting that reducing on the first dim will make the LoD info lost.")
-        .SetDefault(0);
-    AddComment(R"DOC(
-PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) model's
-performance.
-
-Within some context, e.g. the "query", a LTR model generates scores for a list
-of items, which gives a partial order of the items. PositiveNegativePairOp
-takes a list of reference rank order (Input("Label")) and the model generated
-scores (Input(Score)) as inputs and counts the pairs that ranked correctly
-and incorrectly.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair,
-                             ops::PositiveNegativePairOp,
-                             ops::PositiveNegativePairOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    positive_negative_pair,
-    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, float>,
-    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h
deleted file mode 100644
index a47deb18b6fccae672c9cfe19d91b151e43c92da..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/positive_negative_pair_op.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-class PositiveNegativePairKernel : public framework::OpKernel<T> {
- public:
-  struct PredictionResult {
-    PredictionResult(T score, T label, T weight)
-        : score(score), label(label), weight(weight) {}
-    T score;
-    T label;
-    T weight;
-  };
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto score_t = context.Input<Tensor>("Score");
-    auto label_t = context.Input<Tensor>("Label");
-    auto query_t = context.Input<Tensor>("QueryID");
-    auto acc_positive_t = context.Input<Tensor>("AccumulatePositivePair");
-    auto acc_negative_t = context.Input<Tensor>("AccumulateNegativePair");
-    auto acc_neutral_t = context.Input<Tensor>("AccumulateNeutralPair");
-    auto positive_t = context.Output<Tensor>("PositivePair");
-    auto negative_t = context.Output<Tensor>("NegativePair");
-    auto neutral_t = context.Output<Tensor>("NeutralPair");
-    auto weight_t = context.Input<Tensor>("Weight");
-
-    auto score = score_t->data<T>();
-    auto label = label_t->data<T>();
-    auto query = query_t->data<int64_t>();
-    const T* weight = nullptr;
-    if (weight_t != nullptr) {
-      weight = weight_t->data<T>();
-    }
-    T* positive = positive_t->mutable_data<T>(context.GetPlace());
-    T* negative = negative_t->mutable_data<T>(context.GetPlace());
-    T* neutral = neutral_t->mutable_data<T>(context.GetPlace());
-
-    auto score_dim = score_t->dims();
-    auto batch_size = score_dim[0];
-    auto width = score_dim[1];
-    auto column = context.Attr<int32_t>("column");
-    if (column < 0) {
-      column += width;
-    }
-
-    // construct document instances for each query: Query => List[<score#0,
-    // label#0, weight#0>, ...]
-    std::unordered_map<int64_t, std::vector<PredictionResult>> predictions;
-    for (auto i = 0; i < batch_size; ++i) {
-      if (predictions.find(query[i]) == predictions.end()) {
-        predictions.emplace(
-            std::make_pair(query[i], std::vector<PredictionResult>()));
-      }
-      predictions[query[i]].emplace_back(score[i * width + column], label[i],
-                                         weight_t != nullptr ? weight[i] : 1.0);
-    }
-
-    // for each query, accumulate pair counts
-    T pos = 0, neg = 0, neu = 0;
-    if (acc_positive_t != nullptr && acc_negative_t != nullptr &&
-        acc_neutral_t != nullptr) {
-      pos = acc_positive_t->data<T>()[0];
-      neg = acc_negative_t->data<T>()[0];
-      neu = acc_neutral_t->data<T>()[0];
-    }
-    auto evaluate_one_list = [&pos, &neg,
-                              &neu](std::vector<PredictionResult> vec) {
-      for (auto ite1 = vec.begin(); ite1 != vec.end(); ++ite1) {
-        for (auto ite2 = ite1 + 1; ite2 != vec.end(); ++ite2) {
-          if (ite1->label == ite2->label) {  // labels are equal, ignore.
-            continue;
-          }
-          T w = (ite1->weight + ite2->weight) * 0.5;
-          if (ite1->score == ite2->score) {
-            neu += w;
-          }
-          (ite1->score - ite2->score) * (ite1->label - ite2->label) > 0.0
-              ? pos += w
-              : neg += w;
-        }
-      }
-    };
-    for (auto prediction : predictions) {
-      evaluate_one_list(prediction.second);
-    }
-    *positive = pos;
-    *negative = neg;
-    *neutral = neu;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
deleted file mode 100644
index ccb08b245a4696865b46f555b1ef2500bd39aadd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/prelu_op.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/prelu_op.h"
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class PReluOp : public framework::OperatorWithKernel {
- public:
-  PReluOp(const std::string &type, const framework::VariableNameMap &inputs,
-          const framework::VariableNameMap &outputs,
-          const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    std::string mode = ctx->Attrs().Get<std::string>("mode");
-
-    auto x_dim = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of PreluOp should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Alpha"),
-                   "Input(Alpha) of PreluOp should not be null");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of PreluOp should not be null");
-    if (mode == "all") {
-      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
-                     "For mode 'all', size of weight Alpha must be one.");
-    } else if (mode == "channel") {
-      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == x_dim[1],
-                     "For channel-wise mode, size of weight Alpha must be "
-                     "equal to the number of channels, should be %d",
-                     x_dim[1]);
-    } else if (mode == "element") {
-      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == product(x_dim),
-                     "For element-wise mode, size of weight Alpha must be "
-                     "equal to the number of input, should be %d",
-                     product(x_dim));
-    } else {
-      PADDLE_THROW("Unkown mode %s", mode);
-    }
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input tensor of prelu operator.");
-    AddInput("Alpha", "The alpha weight of prelu operator.");
-    AddOutput("Out", "The output tensor of prelu operator.");
-    AddComment(R"DOC(
-PRelu Operator.
-The equation is:
-$$
-f(x) =
-\begin{cases}
-\alpha * x, \quad  \text{if} \ x < 0 \\
-x,         \qquad  \text{if} \ x >= 0
-\end{cases}
-$$
-The input `X` can carry the LoD (Level of Details) information,
-or not. And the output shares the LoD information with input `X`.
-There are modes:
-  all: all elements share same weight
-  channel: elements in a channel share same weight
-  element: each element has a weight
-)DOC");
-    AddAttr<std::string>("mode", "The mode for inputs to share weights.")
-        .SetDefault("all");
-  }
-};
-
-// The operator to calculate gradients of a prelu operator.
-class PReluGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_grad_name = framework::GradVarName("X");
-    auto alpha_grad_name = framework::GradVarName("Alpha");
-
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
-    }
-    if (ctx->HasOutput(alpha_grad_name)) {
-      ctx->SetOutputDim(alpha_grad_name, ctx->GetInputDim("Alpha"));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp);
-REGISTER_OP_CPU_KERNEL(
-    prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    prelu_grad,
-    ops::PReluGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu
deleted file mode 100644
index 4a26c98af8814a500e35cb2168097a43b16cef44..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/prelu_op.cu
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/prelu.h"
-#include "paddle/fluid/operators/prelu_op.h"
-#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-static const int CUDA_NUM_THREADS = 1024;
-static const int CUDA_MAX_NUM_BLOCKS = 65535;
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class CUDAPReluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* alpha = context.Input<Tensor>("Alpha");
-    auto* out = context.Output<Tensor>("Out");
-
-    const T* x_ptr = x->data<T>();
-    T* o_ptr = out->mutable_data<T>(context.GetPlace());
-
-    const T* alpha_ptr = alpha->data<T>();
-    auto& mode = context.Attr<std::string>("mode");
-
-    int numel = x->numel();
-    auto dim = x->dims();
-    std::vector<int> input_shape = framework::vectorize<int>(dim);
-
-    if (mode == "channel") {
-      math::PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
-      prelu_channel_wise(context.cuda_device_context().stream(), x_ptr,
-                         alpha_ptr, o_ptr, input_shape);
-    } else if (mode == "element") {
-      math::PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
-      prelu_element_wise(context.cuda_device_context().stream(), x_ptr,
-                         alpha_ptr, o_ptr, input_shape);
-    } else {
-      math::PreluScalarDirectCUDAFunctor<T> prelu_scalar;
-      prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr,
-                   o_ptr, input_shape);
-    }
-  }
-};
-
-namespace prelu {
-struct ElementWiseMode {};
-struct ChannelMode {};
-struct ScalarMode {};
-} /* namespace prelu */
-
-template <typename T, typename M>
-struct AlphaFunctor {
-  HOSTDEVICE inline T operator()(const T* alpha, size_t channel,
-                                 size_t spatial_size, size_t idx) const {}
-};
-
-template <typename T>
-struct AlphaFunctor<T, prelu::ElementWiseMode> {
-  HOSTDEVICE inline T operator()(const T* alpha, size_t channel,
-                                 size_t spatial_size, size_t idx) const {
-    return alpha[blockIdx.x * spatial_size + idx];
-  }
-};
-
-template <typename T>
-struct AlphaFunctor<T, prelu::ChannelMode> {
-  HOSTDEVICE inline T operator()(const T* alpha, size_t channel,
-                                 size_t spatial_size, size_t idx) const {
-    return alpha[blockIdx.x % channel];
-  }
-};
-
-template <typename T>
-struct AlphaFunctor<T, prelu::ScalarMode> {
-  HOSTDEVICE inline T operator()(const T* alpha, size_t channel,
-                                 size_t spatial_size, size_t idx) const {
-    return alpha[0];
-  }
-};
-
-template <typename T, typename M>
-__global__ void PReluGradElementWiseKernel(const T* x_ptr, const T* y_ptr,
-                                           const T* alpha_ptr, const T* dy_ptr,
-                                           T* dx_ptr, T* dalpha_ptr,
-                                           size_t channel,
-                                           size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  AlphaFunctor<T, M> alpha_func;
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    T y = y_ptr[offset + i];
-    T x = x_ptr[offset + i];
-    T dy = dy_ptr[offset + i];
-    T alpha = alpha_func(alpha_ptr, channel, spatial_size, i);
-    if (dx_ptr != nullptr) dx_ptr[offset + i] = (y > 0) ? dy : alpha * dy;
-    if (dalpha_ptr != nullptr) dalpha_ptr[offset + i] = (x > 0) ? 0 : x * dy;
-  }
-}
-
-template <typename T, typename M>
-class PreluGradElementwiseFunctor {
- public:
-  void operator()(cudaStream_t stream, const T* x, const T* y, const T* alpha,
-                  const T* dy, T* dx, T* dalpha, std::vector<int> input_shape) {
-    size_t unroll = input_shape[0] * input_shape[1];
-    size_t spatial_size = input_shape[2] * input_shape[3];
-    CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-    PReluGradElementWiseKernel<T, M><<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-        x, y, alpha, dy, dx, dalpha, input_shape[1], spatial_size);
-  }
-};
-
-template <typename T>
-struct IdentityFunctor {
-  HOSTDEVICE inline T operator()(const T& x) const { return x; }
-};
-
-template <typename DeviceContext, typename T>
-class CUDAPReluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Out");
-    auto* alpha = context.Input<Tensor>("Alpha");
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
-
-    const T* x_ptr = x->data<T>();
-    const T* y_ptr = y->data<T>();
-    const T* alpha_ptr = alpha->data<T>();
-    const T* dy_ptr = dy->data<T>();
-    T* dx_ptr = dx ? dx->mutable_data<T>(context.GetPlace()) : nullptr;
-    T* dalpha_ptr =
-        dalpha ? dalpha->mutable_data<T>(context.GetPlace()) : nullptr;
-
-    if (!dx && !dalpha) return;
-
-    auto& mode = context.Attr<std::string>("mode");
-
-    int numel = x->numel();
-    auto dim = x->dims();
-    std::vector<int> input_shape = framework::vectorize<int>(dim);
-    auto stream = context.cuda_device_context().stream();
-
-    T* dalpha_tmp_ptr;
-    Tensor dalpha_tmp;
-    if (mode == "element" || dalpha_ptr == nullptr) {
-      dalpha_tmp_ptr = dalpha_ptr;
-    } else {
-      auto& dev_ctx = context.template device_context<DeviceContext>();
-      dalpha_tmp = context.AllocateTmpTensor<T, DeviceContext>(dim, dev_ctx);
-      dalpha_tmp_ptr = dalpha_tmp.mutable_data<T>(context.GetPlace());
-    }
-
-    if (mode == "element") {
-      PreluGradElementwiseFunctor<T, prelu::ElementWiseMode> prelu_grad;
-      prelu_grad(stream, x_ptr, y_ptr, alpha_ptr, dy_ptr, dx_ptr,
-                 dalpha_tmp_ptr, input_shape);
-    } else if (mode == "channel") {
-      PreluGradElementwiseFunctor<T, prelu::ChannelMode> prelu_grad;
-      prelu_grad(stream, x_ptr, y_ptr, alpha_ptr, dy_ptr, dx_ptr,
-                 dalpha_tmp_ptr, input_shape);
-    } else {
-      PreluGradElementwiseFunctor<T, prelu::ScalarMode> prelu_grad;
-      prelu_grad(stream, x_ptr, y_ptr, alpha_ptr, dy_ptr, dx_ptr,
-                 dalpha_tmp_ptr, input_shape);
-    }
-
-    if (mode == "element" || dalpha_tmp_ptr == nullptr) return;
-
-    std::vector<int> reduce_dims;
-    for (size_t i = 0; i < input_shape.size(); i++) {
-      if (mode == "channel" && i == 1) continue;
-      reduce_dims.push_back(i);
-    }
-
-    TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
-        dalpha_tmp, dalpha, reduce_dims, static_cast<T>(0), cub::Sum(),
-        IdentityFunctor<T>(), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    prelu, ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    prelu_grad,
-    ops::CUDAPReluGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CUDAPReluGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h
deleted file mode 100644
index 594f1cb3abe49c61ad7c490ebcd100a5c9ea6fb9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/prelu_op.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/transform.h"
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using platform::Transform;
-
-template <typename DeviceContext, typename T>
-class PReluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* alpha = context.Input<Tensor>("Alpha");
-    auto* out = context.Output<Tensor>("Out");
-
-    const T* x_ptr = x->data<T>();
-    T* o_ptr = out->mutable_data<T>(context.GetPlace());
-
-    const T* alpha_ptr = alpha->data<T>();
-    auto& mode = context.Attr<std::string>("mode");
-
-    int numel = x->numel();
-    auto dim = x->dims();
-    int index = 0;
-    int i = 0;
-    if (mode == "channel") {
-      int temp = numel / (dim[0] * dim[1]);
-      for (i = 0; i < numel; i++) {
-        index = (i / temp) % dim[1];
-        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
-      }
-    } else if (mode == "element") {
-      for (i = 0; i < numel; i++) {
-        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[i] * x_ptr[i];
-      }
-    } else {
-      for (i = 0; i < numel; i++) {
-        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i];
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class PReluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
-    auto* out = context.Input<Tensor>("Out");
-    auto* alpha = context.Input<Tensor>("Alpha");
-    const T* alpha_ptr = alpha->data<T>();
-    const T* x_ptr = x->data<T>();
-    const T* dout_ptr = dout->data<T>();
-    const T* out_ptr = out->data<T>();
-    std::string mode = context.Attr<std::string>("mode");
-    int numel = x->numel();
-    auto dim = x->dims();
-    int index = 0;
-    int i = 0;
-    int temp = 0;
-    if (dx) {
-      T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
-      if (mode == "channel") {
-        for (i = 0; i < numel; i++) {
-          temp = numel / (dim[0] * dim[1]);
-          index = (i / temp) % dim[1];
-          dx_ptr[i] =
-              out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
-        }
-      } else if (mode == "element") {
-        for (i = 0; i < numel; i++) {
-          dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[i] * dout_ptr[i];
-        }
-      } else {
-        for (i = 0; i < numel; i++) {
-          dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[0] * dout_ptr[i];
-        }
-      }
-    }
-
-    index = 0;
-    if (dalpha) {
-      T* dalpha_ptr = dalpha->mutable_data<T>(context.GetPlace());
-      memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel());
-
-      if (mode == "channel") {
-        for (i = 0; i < numel; i++) {
-          temp = numel / (dim[0] * dim[1]);
-          index = (i / temp) % dim[1];
-          dalpha_ptr[index] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
-        }
-      } else if (mode == "element") {
-        for (i = 0; i < numel; i++) {
-          dalpha_ptr[i] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
-        }
-      } else {
-        for (i = 0; i < numel; i++) {
-          dalpha_ptr[0] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
-        }
-      }
-    }
-
-    // TODO(Guanzhong): add GPU kernels
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
deleted file mode 100644
index f686e5293b0f504863e228d37db56c6df4954c24..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/print_op.cc
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-
-namespace paddle {
-namespace operators {
-using framework::GradVarName;
-
-#define CLOG std::cout
-
-const char kForward[] = "FORWARD";
-const char kBackward[] = "BACKWARD";
-const char kBoth[] = "BOTH";
-
-class LogGuard {
- public:
-  inline LogGuard() { LogMutex().lock(); }
-
-  inline ~LogGuard() { LogMutex().unlock(); }
-
- private:
-  static std::mutex &LogMutex() {
-    static std::mutex mtx;
-    return mtx;
-  }
-};
-
-struct Formater {
-  std::string message;
-  std::string name;
-  std::vector<int> dims;
-  std::type_index dtype{typeid(const char)};
-  framework::LoD lod;
-  int summarize;
-  void *data{nullptr};
-  platform::Place place;
-  std::stringstream logs;
-
-  void operator()(size_t size) {
-    PrintMessage();
-    PrintPlaceInfo();
-    PrintName();
-    PrintDims();
-    PrintDtype();
-    PrintLod();
-    PrintData(size);
-    LogGuard guard;
-    CLOG << logs.str();
-  }
-
- private:
-  void PrintPlaceInfo() { logs << "The place is:" << place << std::endl; }
-  void PrintMessage() { logs << std::time(nullptr) << "\t" << message << "\t"; }
-  void PrintName() {
-    if (!name.empty()) {
-      logs << "Tensor[" << name << "]" << std::endl;
-    }
-  }
-  void PrintDims() {
-    if (!dims.empty()) {
-      logs << "\tshape: [";
-      for (auto i : dims) {
-        logs << i << ",";
-      }
-      logs << "]" << std::endl;
-    }
-  }
-  void PrintDtype() {
-    if (!framework::IsType<const char>(dtype)) {
-      logs << "\tdtype: " << dtype.name() << std::endl;
-    }
-  }
-  void PrintLod() {
-    if (!lod.empty()) {
-      logs << "\tLoD: [";
-      for (auto level : lod) {
-        logs << "[ ";
-        for (auto i : level) {
-          logs << i << ",";
-        }
-        logs << " ]";
-      }
-      logs << "]" << std::endl;
-    }
-  }
-
-  void PrintData(size_t size) {
-    PADDLE_ENFORCE_NOT_NULL(data);
-    // print float
-    if (framework::IsType<const float>(dtype)) {
-      Display<float>(size);
-    } else if (framework::IsType<const double>(dtype)) {
-      Display<double>(size);
-    } else if (framework::IsType<const int>(dtype)) {
-      Display<int>(size);
-    } else if (framework::IsType<const int64_t>(dtype)) {
-      Display<int64_t>(size);
-    } else if (framework::IsType<const bool>(dtype)) {
-      Display<bool>(size);
-    } else {
-      logs << "\tdata: unprintable type: " << dtype.name() << std::endl;
-    }
-  }
-
-  template <typename T>
-  void Display(size_t size) {
-    auto *d = reinterpret_cast<T *>(data);
-    logs << "\tdata: ";
-    if (summarize != -1) {
-      summarize = std::min(size, (size_t)summarize);
-      for (int i = 0; i < summarize; i++) {
-        logs << d[i] << ",";
-      }
-    } else {
-      for (size_t i = 0; i < size; i++) {
-        logs << d[i] << ",";
-      }
-    }
-    logs << std::endl;
-  }
-};
-
-// TODO(ChunweiYan) there should be some other printers for TensorArray
-class PrintOp : public framework::OperatorBase {
- public:
-  PrintOp(const std::string &type, const framework::VariableNameMap &inputs,
-          const framework::VariableNameMap &outputs,
-          const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    const auto in_var = scope.FindVar(Input("In"));
-    auto out_var = scope.FindVar(Output("Out"));
-    PADDLE_ENFORCE_NOT_NULL(in_var, "The input should not be found in scope",
-                            Input("In"));
-    PADDLE_ENFORCE_NOT_NULL(out_var, "The output should not be found in scope",
-                            Output("Out"));
-    auto &in_tensor = in_var->Get<framework::LoDTensor>();
-    framework::LoDTensor *out_tensor =
-        out_var->GetMutable<framework::LoDTensor>();
-
-    PrintValue(place, Inputs("In").front(), in_tensor);
-    framework::TensorCopy(in_tensor, place, out_tensor);
-    out_tensor->set_lod(in_tensor.lod());
-  }
-
-  void PrintValue(const platform::Place &place,
-                  const std::string &printed_var_name,
-                  const framework::LoDTensor &in_tensor) const {
-    std::string print_phase = Attr<std::string>("print_phase");
-    bool is_forward = Attr<bool>("is_forward");
-
-    if ((is_forward && print_phase == kBackward) ||
-        (!is_forward && print_phase == kForward)) {
-      return;
-    }
-
-    int first_n = Attr<int>("first_n");
-    if (first_n > 0 && ++times_ > first_n) return;
-
-    framework::LoDTensor printed_tensor;
-    printed_tensor.set_lod(in_tensor.lod());
-    printed_tensor.Resize(in_tensor.dims());
-
-    if (is_cpu_place(in_tensor.place())) {
-      printed_tensor.ShareDataWith(in_tensor);
-    } else {
-      // copy data to cpu to print
-      platform::CPUPlace place;
-      TensorCopy(in_tensor, place, &printed_tensor);
-    }
-
-    Formater formater;
-    formater.place = place;
-    formater.message = Attr<std::string>("message");
-    if (Attr<bool>("print_tensor_name")) {
-      formater.name = printed_var_name;
-    }
-    if (Attr<bool>("print_tensor_type")) {
-      formater.dtype = framework::ToTypeIndex(printed_tensor.type());
-    }
-    if (Attr<bool>("print_tensor_shape")) {
-      auto &dims = printed_tensor.dims();
-      formater.dims.resize(dims.size());
-      for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i];
-    }
-    if (Attr<bool>("print_tensor_lod")) {
-      formater.lod = printed_tensor.lod();
-    }
-    formater.summarize = Attr<int>("summarize");
-    formater.data = reinterpret_cast<void *>(printed_tensor.data<void>());
-    formater(printed_tensor.numel());
-  }
-
- private:
-  mutable int times_{0};
-};
-
-class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("In", "Input tensor to be displayed.");
-    AddOutput("Out", "The output tensor.");
-    AddAttr<int>("first_n", "Only log `first_n` number of times.");
-    AddAttr<std::string>("message", "A string message to print as a prefix.");
-    AddAttr<int>("summarize", "Number of elements printed.");
-    AddAttr<bool>("print_tensor_name", "Whether to print the tensor name.");
-    AddAttr<bool>("print_tensor_type", "Whether to print the tensor's dtype.");
-    AddAttr<bool>("print_tensor_shape", "Whether to print the tensor's shape.");
-    AddAttr<bool>("print_tensor_lod", "Whether to print the tensor's lod.");
-    AddAttr<std::string>("print_phase",
-                         "(string, default 'FORWARD') Which phase to display "
-                         "including 'FORWARD' "
-                         "'BACKWARD' and 'BOTH'.")
-        .SetDefault(std::string(kBoth))
-        .InEnum({std::string(kForward), std::string(kBackward),
-                 std::string(kBoth)});
-    AddAttr<bool>("is_forward", "Whether is forward or not").SetDefault(true);
-    AddComment(R"DOC(
-Creates a print op that will print when a tensor is accessed.
-
-Wraps the tensor passed in so that whenever that a tensor is accessed,
-the message `message` is printed, along with the current value of the
-tensor `t`.)DOC");
-  }
-};
-
-class PrintOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    VLOG(10) << "PrintOpInferShape";
-    PADDLE_ENFORCE(ctx->HasInput("In"), "Input(In) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
-    ctx->ShareDim("In", /*->*/ "Out");
-    ctx->ShareLoD("In", /*->*/ "Out");
-  }
-};
-
-class PrintOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto input_type = ctx->GetType(ctx->Input("In")[0]);
-    auto out_name = ctx->Output("Out").front();
-    ctx->SetType(out_name, input_type);
-  }
-};
-
-class PrintOpGradientMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *op_desc_ptr = new framework::OpDesc();
-    op_desc_ptr->SetType("print");
-    op_desc_ptr->SetInput("In", OutputGrad("Out"));
-    op_desc_ptr->SetOutput("Out", InputGrad("In"));
-    op_desc_ptr->SetAttrMap(Attrs());
-    op_desc_ptr->SetAttr("is_forward", false);
-    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(print, ops::PrintOp, ops::PrintOpProtoAndCheckMaker,
-                  ops::PrintOpGradientMaker, ops::PrintOpInferShape,
-                  ops::PrintOpVarTypeInference);
diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc
deleted file mode 100644
index 6d5129f8d603088551ca2f7dcf01edf7e5b0ffc4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/prroi_pool_op.cc
+++ /dev/null
@@ -1,188 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/prroi_pool_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-class PRROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor), "
-             "the input of PRROIPoolOp. "
-             "The format of input tensor is NCHW. Where N is the batch size, "
-             "C is the number of input channels, "
-             "H is the height of the input feature map, and "
-             "W is the width.");
-    AddInput("ROIs",
-             "(LoDTensor), "
-             "ROIs (Regions of Interest) to pool over. "
-             "should be a 2-D LoDTensor of shape (num_rois, 4) "
-             "given as [(x1, y1, x2, y2), ...]. "
-             "where (x1, y1) is the top left coordinates, and "
-             "(x2, y2) is the bottom right coordinates. "
-             "The roi batch index can be calculated from LoD.");
-    AddOutput("Out",
-              "(Tensor), "
-              "the output of PRROIPoolOp is a 4-D Tensor with shape "
-              "(num_rois, output_channels, pooled_h, pooled_w).");
-    AddAttr<int>(
-        "output_channels",
-        "(int), "
-        "the number of channels of the output feature map. "
-        "For a task of C classes of objects, output_channels should be "
-        "(C + 1) for classification only.");
-    AddAttr<float>("spatial_scale",
-                   "(float, default 1.0), "
-                   "Multiplicative spatial scale factor "
-                   "to translate ROI coords from their input scale "
-                   "to the scale used when pooling.")
-        .SetDefault(1.0);
-    AddAttr<int>("pooled_height",
-                 "(int, default 1), "
-                 "the pooled output height.")
-        .SetDefault(1);
-    AddAttr<int>("pooled_width",
-                 "(int, default 1), "
-                 "the pooled output width.")
-        .SetDefault(1);
-    AddComment(R"Doc(
-**PRROIPool Operator**
-
-Precise region of interest pooling (also known as PRROIPooling) is to perform
- bilinear interpolation average pooling method for RoI Pooling.
-
-Please refer to https://arxiv.org/abs/1807.11590 for more details.
-
-    )Doc");
-  }
-};
-
-class PRROIPoolOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of op(PRROIPool) should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true,
-                      "Input(ROIs) of op(PRROIPool) should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of op(PRROIPool) should not be null.");
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-
-    PADDLE_ENFORCE_EQ(input_dims.size(), 4,
-                      "The format of input tensor is NCHW");
-    PADDLE_ENFORCE_EQ(rois_dims.size(), 2,
-                      "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
-                      "given as [(x1, y1, x2, y2), ...]");
-    PADDLE_ENFORCE_EQ(rois_dims[1], 4,
-                      "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
-                      "given as [(x1, y1, x2, y2), ...]");
-
-    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
-    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
-    int output_channels = ctx->Attrs().Get<int>("output_channels");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-
-    PADDLE_ENFORCE_EQ(
-        input_dims[1], output_channels * pooled_height * pooled_width,
-        "the channel of X(%d) should be equal to the product of "
-        "output_channels(%d), pooled_height(%d) and pooled_width(%d)",
-        input_dims[1], output_channels, pooled_height, pooled_width);
-
-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled output height must be greater than 0");
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      "The pooled output width must be greater than 0");
-    PADDLE_ENFORCE_GT(output_channels, 1,
-                      "The pooled output channels must greater than 1");
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      "The spatial scale must greater than 0.");
-
-    auto out_dims = input_dims;
-    out_dims[0] = rois_dims[0];
-    out_dims[1] =
-        output_channels;  // input_dims[1] / (pooled_height * pooled_width);
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-    ctx->SetOutputDim("Out", out_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class PRROIPoolGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "The gradient of Out should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
-                      "The gradient of X should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class PRROIPoolGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("prroi_pool_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("ROIs", Input("ROIs"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(prroi_pool, ops::PRROIPoolOp, ops::PRROIPoolOpMaker,
-                  ops::PRROIPoolGradDescMaker);
-REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp);
-REGISTER_OP_CPU_KERNEL(
-    prroi_pool,
-    ops::CPUPRROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUPRROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    prroi_pool_grad,
-    ops::CPUPRROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUPRROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu
deleted file mode 100644
index 915e3daae538f00434edb26a354252895520a21f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/prroi_pool_op.cu
+++ /dev/null
@@ -1,309 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/prroi_pool_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaximumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaximumNumBlocks);
-}
-
-template <typename T>
-DEVICE void PrRoIPoolingDistributeDiffCUDA(T* diff, const T top_diff,
-                                           const int h, const int w,
-                                           const int height, const int width,
-                                           const T coeff) {
-  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
-  if (!overflow) {
-    paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
-  }
-}
-
-template <typename T>
-__global__ void GPUPRROIPoolForward(
-    const int nthreads, const T* input_data, const T* input_rois,
-    const float spatial_scale, const int input_channels, const int height,
-    const int width, const int output_channels, const int pooled_height,
-    const int pooled_width, const int* rois_batch_id_data, T* output_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    // The output is in order (n, c, ph, pw)
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % output_channels;
-    int n = i / pooled_width / pooled_height / output_channels;
-
-    // set roi_batch_id
-    int roi_batch_id = rois_batch_id_data[n];
-
-    // [start, end) interval for spatial sampling
-    const T* offset_input_rois = input_rois + n * 4;
-    T roi_start_w = static_cast<T>(offset_input_rois[0]) * spatial_scale;
-    T roi_start_h = static_cast<T>(offset_input_rois[1]) * spatial_scale;
-    T roi_end_w = static_cast<T>(offset_input_rois[2]) * spatial_scale;
-    T roi_end_h = static_cast<T>(offset_input_rois[3]) * spatial_scale;
-
-    T roi_width = max(roi_end_w - roi_start_w, static_cast<T>(0.0));
-    T roi_height = max(roi_end_h - roi_start_h, static_cast<T>(0.0));
-
-    // Compute w and h at input feature map
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    T win_start_w = roi_start_w + bin_size_w * pw;
-    T win_start_h = roi_start_h + bin_size_h * ph;
-    T win_end_w = win_start_w + bin_size_w;
-    T win_end_h = win_start_h + bin_size_h;
-
-    T win_size = max(static_cast<T>(0.0), bin_size_w * bin_size_h);
-    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-    const T* offset_input_data =
-        input_data +
-        (roi_batch_id * input_channels + input_channel) * height * width;
-
-    if (win_size > static_cast<T>(0.0)) {
-      int s_w = floor(win_start_w);
-      int e_w = ceil(win_end_w);
-      int s_h = floor(win_start_h);
-      int e_h = ceil(win_end_h);
-      T sum_out = 0;
-
-      for (int w_iter = s_w; w_iter < e_w; ++w_iter) {
-        for (int h_iter = s_h; h_iter < e_h; ++h_iter) {
-          sum_out += PrRoIPoolingMatCalculation(
-              offset_input_data, h_iter, w_iter, h_iter + 1, w_iter + 1,
-              max(win_start_h, static_cast<T>(h_iter)),
-              max(win_start_w, static_cast<T>(w_iter)),
-              min(win_end_h, static_cast<T>(h_iter) + static_cast<T>(1.0)),
-              min(win_end_w, static_cast<T>(w_iter) + static_cast<T>(1.0)),
-              height, width);
-        }
-      }
-      output_data[i] = sum_out / win_size;
-    } else {
-      output_data[i] = 0.;
-    }
-  }
-}
-
-template <typename T>
-__global__ void GPUPRROIPoolBackward(
-    const int nthreads, const T* input_rois, const T* output_grad_data,
-    const float spatial_scale, const int input_channels, const int height,
-    const int width, const int output_channels, const int pooled_height,
-    const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (int i = index; i < nthreads; i += offset) {
-    // The output is in order (n, c, ph, pw)
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % output_channels;
-    int n = i / pooled_width / pooled_height / output_channels;
-
-    // set roi_batch_id
-    int roi_batch_id = rois_batch_id_data[n];
-    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-    int input_offset =
-        (roi_batch_id * input_channels + input_channel) * height * width;
-    T* offset_input_grad_data = input_grad_data + input_offset;
-    const T* offset_output_grad_data = output_grad_data + i;
-
-    // [start, end) interval for spatial sampling
-    const T* offset_input_rois = input_rois + n * 4;
-    T roi_start_w = static_cast<T>(offset_input_rois[0]) * spatial_scale;
-    T roi_start_h = static_cast<T>(offset_input_rois[1]) * spatial_scale;
-    T roi_end_w = static_cast<T>(offset_input_rois[2]) * spatial_scale;
-    T roi_end_h = static_cast<T>(offset_input_rois[3]) * spatial_scale;
-
-    T roi_width = max(roi_end_w - roi_start_w, static_cast<T>(0.0));
-    T roi_height = max(roi_end_h - roi_start_h, static_cast<T>(0.0));
-
-    // Compute w and h at input feature map
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    T win_start_w = roi_start_w + bin_size_w * pw;
-    T win_start_h = roi_start_h + bin_size_h * ph;
-    T win_end_w = win_start_w + bin_size_w;
-    T win_end_h = win_start_h + bin_size_h;
-
-    T win_size = max(static_cast<T>(0.0), bin_size_w * bin_size_h);
-    int s_w = floor(win_start_w);
-    int e_w = ceil(win_end_w);
-    int s_h = floor(win_start_h);
-    int e_h = ceil(win_end_h);
-
-    T sum_out = win_size == static_cast<T>(0.)
-                    ? static_cast<T>(0.)
-                    : *offset_output_grad_data / win_size;
-
-    for (int w_iter = s_w; w_iter < e_w; ++w_iter) {
-      for (int h_iter = s_h; h_iter < e_h; ++h_iter) {
-        PrRoIPoolingMatDistributeDiff(
-            offset_input_grad_data, sum_out, h_iter, w_iter, h_iter + 1,
-            w_iter + 1, max(win_start_h, static_cast<T>(h_iter)),
-            max(win_start_w, static_cast<T>(w_iter)),
-            min(win_end_h, static_cast<T>(h_iter) + static_cast<T>(1.0)),
-            min(win_end_w, static_cast<T>(w_iter) + static_cast<T>(1.0)),
-            height, width, PrRoIPoolingDistributeDiffCUDA<T>);
-      }
-    }
-  }
-}
-
-template <typename T>
-class GPUPRROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto output_channels = ctx.Attr<int>("output_channels");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int input_channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-
-    PADDLE_ENFORCE_EQ(input_channels,
-                      output_channels * pooled_height * pooled_width,
-                      "the channels of input X should equal the product of "
-                      "output_channels x pooled_height x pooled_width");
-
-    int rois_num = rois->dims()[0];
-    if (rois_num == 0) return;
-
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "The rois_batch_size and input(X) batch_size must be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-
-    // set rois batch id
-    framework::Tensor rois_batch_id_list;
-    rois_batch_id_list.Resize({rois_num});
-    int* rois_batch_id_data =
-        rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        rois_batch_id_data[i] = n;
-      }
-    }
-
-    framework::Tensor rois_batch_id_list_gpu;
-    framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
-                          ctx.device_context(), &rois_batch_id_list_gpu);
-
-    int output_size = out->numel();
-    int blocks = NumBlocks(output_size);
-    int threads = kNumCUDAThreads;
-
-    // call cuda kernel function
-    GPUPRROIPoolForward<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        output_size, in->data<T>(), rois->data<T>(), spatial_scale,
-        input_channels, height, width, output_channels, pooled_height,
-        pooled_width, rois_batch_id_list_gpu.data<int>(),
-        out->mutable_data<T>(ctx.GetPlace()));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto output_channels = ctx.Attr<int>("output_channels");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    int rois_num = rois->dims()[0];
-    int input_channels = in->dims()[1];
-    int height = in->dims()[2];
-    int width = in->dims()[3];
-
-    if (input_grad) {
-      // set roi batch id
-      framework::Tensor rois_batch_id_list;
-      rois_batch_id_list.Resize({rois_num});
-      int* rois_batch_id_data =
-          rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          rois_batch_id_data[i] = n;
-        }
-      }
-
-      framework::Tensor rois_batch_id_list_gpu;
-      framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
-                            ctx.device_context(), &rois_batch_id_list_gpu);
-
-      input_grad->mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<DeviceContext, T> set_zero;
-      set_zero(ctx.cuda_device_context(), input_grad, static_cast<T>(0));
-
-      int output_grad_size = output_grad->numel();
-      int blocks = NumBlocks(output_grad_size);
-      int threads = kNumCUDAThreads;
-
-      if (output_grad_size > 0) {
-        GPUPRROIPoolBackward<
-            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-            output_grad_size, rois->data<T>(), output_grad->data<T>(),
-            spatial_scale, input_channels, height, width, output_channels,
-            pooled_height, pooled_width, rois_batch_id_list_gpu.data<int>(),
-            input_grad->mutable_data<T>(ctx.GetPlace()));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(prroi_pool, ops::GPUPRROIPoolOpKernel<float>,
-                        ops::GPUPRROIPoolOpKernel<double>);
-REGISTER_OP_CUDA_KERNEL(
-    prroi_pool_grad,
-    ops::GPUPRROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUPRROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h
deleted file mode 100644
index 621e543fab5539df15bab65ab7552ae7cf2f2196..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/prroi_pool_op.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-HOSTDEVICE T PrRoIPoolingGetData(const T* data, const int h, const int w,
-                                 const int height, const int width) {
-  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
-  T retVal = overflow ? 0.0f : data[h * width + w];
-  return retVal;
-}
-
-template <typename T>
-HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data, const int s_h,
-                                        const int s_w, const int e_h,
-                                        const int e_w, const T y0, const T x0,
-                                        const T y1, const T x1, const int h0,
-                                        const int w0) {
-  T alpha, beta, lim_alpha, lim_beta, tmp;
-  T sum_out = 0;
-
-  alpha = x0 - static_cast<T>(s_w);
-  beta = y0 - static_cast<T>(s_h);
-  lim_alpha = x1 - static_cast<T>(s_w);
-  lim_beta = y1 - static_cast<T>(s_h);
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;
-
-  alpha = static_cast<T>(e_w) - x1;
-  lim_alpha = static_cast<T>(e_w) - x0;
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;
-
-  alpha = x0 - static_cast<T>(s_w);
-  beta = static_cast<T>(e_h) - y1;
-  lim_alpha = x1 - static_cast<T>(s_w);
-  lim_beta = static_cast<T>(e_h) - y0;
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;
-
-  alpha = static_cast<T>(e_w) - x1;
-  lim_alpha = static_cast<T>(e_w) - x0;
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;
-
-  return sum_out;
-}
-
-template <typename T>
-HOSTDEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff,
-                                           const int h, const int w,
-                                           const int height, const int width,
-                                           const T coeff) {
-  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
-  if (!overflow) {
-    *(diff + h * width + w) = top_diff * coeff;
-  }
-}
-
-template <typename T, typename Functor>
-HOSTDEVICE void PrRoIPoolingMatDistributeDiff(
-    T* diff, const T top_diff, const int s_h, const int s_w, const int e_h,
-    const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,
-    const int w0, Functor functor) {
-  T alpha, beta, lim_alpha, lim_beta, tmp;
-
-  alpha = x0 - static_cast<T>(s_w);
-  beta = y0 - static_cast<T>(s_h);
-  lim_alpha = x1 - static_cast<T>(s_w);
-  lim_beta = y1 - static_cast<T>(s_h);
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  functor(diff, top_diff, s_h, s_w, h0, w0, tmp);
-
-  alpha = static_cast<T>(e_w) - x1;
-  lim_alpha = static_cast<T>(e_w) - x0;
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  functor(diff, top_diff, s_h, e_w, h0, w0, tmp);
-
-  alpha = x0 - static_cast<T>(s_w);
-  beta = static_cast<T>(e_h) - y1;
-  lim_alpha = x1 - static_cast<T>(s_w);
-  lim_beta = static_cast<T>(e_h) - y0;
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  functor(diff, top_diff, e_h, s_w, h0, w0, tmp);
-
-  alpha = static_cast<T>(e_w) - x1;
-  lim_alpha = static_cast<T>(e_w) - x0;
-  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
-         0.5f * alpha * alpha) *
-        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  functor(diff, top_diff, e_h, e_w, h0, w0, tmp);
-}
-
-template <typename DeviceContext, typename T>
-class CPUPRROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto output_channels = ctx.Attr<int>("output_channels");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int input_channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    auto in_stride = framework::stride(in_dims);
-    auto out_stride = framework::stride(out->dims());
-
-    const T* input_data = in->data<T>();
-
-    framework::Tensor rois_batch_id_list;
-    rois_batch_id_list.Resize({rois_num});
-    int* rois_batch_id_data =
-        rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "the rois_batch_size and input(X) batch_size should be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num,
-                      "the rois_num from input and lod must be the same");
-
-    PADDLE_ENFORCE_EQ(input_channels,
-                      output_channels * pooled_height * pooled_width,
-                      "the channels of input X should equal the product of "
-                      "output_channels x pooled_height x pooled_width");
-
-    // calculate batch id index for each roi according to LoD
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        rois_batch_id_data[i] = n;
-      }
-    }
-
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    const T* input_rois = rois->data<T>();
-
-    // calculate prroipooling, parallel processing can be implemented per ROI
-    for (int n = 0; n < rois_num; ++n) {
-      // set roi batch id
-      int roi_batch_id = rois_batch_id_data[n];
-
-      // [start, end) interval for spatial sampling
-      const T* offset_input_rois = input_rois + n * 4;
-      T roi_start_w = static_cast<T>(offset_input_rois[0]) * spatial_scale;
-      T roi_start_h = static_cast<T>(offset_input_rois[1]) * spatial_scale;
-      T roi_end_w = static_cast<T>(offset_input_rois[2]) * spatial_scale;
-      T roi_end_h = static_cast<T>(offset_input_rois[3]) * spatial_scale;
-
-      T roi_width = std::max(roi_end_w - roi_start_w, static_cast<T>(0.0));
-      T roi_height = std::max(roi_end_h - roi_start_h, static_cast<T>(0.0));
-
-      // Compute w and h at input feature map
-      T bin_size_h = roi_height / static_cast<T>(pooled_height);
-      T bin_size_w = roi_width / static_cast<T>(pooled_width);
-      T win_size = std::max(static_cast<T>(0.0), bin_size_w * bin_size_h);
-
-      // calculate each pixel of the output feature map.
-      int out_roi_offset = n * out_stride[0];
-      for (int c = 0; c < output_channels; ++c) {
-        // per category
-        int out_plane_offset = out_roi_offset + c * out_stride[1];
-        for (int ph = 0; ph < pooled_height; ++ph) {
-          int out_row_offset = out_plane_offset + ph * out_stride[2];
-          for (int pw = 0; pw < pooled_width; ++pw) {
-            // calculate w and h at input feature map
-            T win_start_h = static_cast<T>(ph) * bin_size_h + roi_start_h;
-            T win_start_w = static_cast<T>(pw) * bin_size_w + roi_start_w;
-            T win_end_h = win_start_h + bin_size_h;
-            T win_end_w = win_start_w + bin_size_w;
-            //  Add roi offsets and clip to input boundaries
-            int s_w = std::floor(win_start_w);
-            int e_w = std::ceil(win_end_w);
-            int s_h = std::floor(win_start_h);
-            int e_h = std::ceil(win_end_h);
-
-            int output_index = out_row_offset + pw;
-            int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-            int input_plane_offset =
-                roi_batch_id * in_stride[0] + input_channel * in_stride[1];
-            const T* offset_input_data = input_data + input_plane_offset;
-            T sum_out = 0.;
-
-            if (win_size > static_cast<T>(0.0)) {
-              for (int w_iter = s_w; w_iter < e_w; ++w_iter) {
-                for (int h_iter = s_h; h_iter < e_h; ++h_iter) {
-                  sum_out += PrRoIPoolingMatCalculation(
-                      offset_input_data, h_iter, w_iter, h_iter + 1, w_iter + 1,
-                      std::max(win_start_h, static_cast<T>(h_iter)),
-                      std::max(win_start_w, static_cast<T>(w_iter)),
-                      std::min(win_end_h,
-                               static_cast<T>(h_iter) + static_cast<T>(1.0)),
-                      std::min(win_end_w,
-                               static_cast<T>(w_iter) + static_cast<T>(1.0)),
-                      height, width);
-                }
-              }
-
-              output_data[output_index] = sum_out / win_size;
-            } else {
-              output_data[output_index] = 0.;
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* output_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* input_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto output_channels = ctx.Attr<int>("output_channels");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    if (input_grad) {
-      auto in_dims = in->dims();
-      int input_channels = in_dims[1];
-      int height = in_dims[2];
-      int width = in_dims[3];
-      int rois_num = rois->dims()[0];
-
-      // set roi batch id
-      framework::Tensor rois_batch_id_list;
-      rois_batch_id_list.Resize({rois_num});
-      int* rois_batch_id_data =
-          rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      // calculate batch id index for each roi according to LoD
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          rois_batch_id_data[i] = n;
-        }
-      }
-
-      const T* input_rois = rois->data<T>();
-      const T* output_grad_data = output_grad->data<T>();
-      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-
-      // set gradient of X to be 0. before backpropagate.
-      math::SetConstant<DeviceContext, T> set_zero;
-      set_zero(ctx.template device_context<DeviceContext>(), input_grad,
-               static_cast<T>(0));
-
-      // backpropagate gradient per output pixel
-      int output_grad_size = output_grad->numel();
-      for (int i = 0; i < output_grad_size; ++i) {
-        // The output is in order (n, c, ph, pw)
-        int pw = i % pooled_width;
-        int ph = (i / pooled_width) % pooled_height;
-        int c = (i / pooled_width / pooled_height) % output_channels;
-        int n = i / pooled_width / pooled_height / output_channels;
-
-        // set roi_batch_id
-        int roi_batch_id = rois_batch_id_data[n];
-        int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-        int input_offset =
-            (roi_batch_id * input_channels + input_channel) * height * width;
-        T* offset_input_grad_data = input_grad_data + input_offset;
-        const T* offset_output_grad_data = output_grad_data + i;
-
-        // [start, end) interval for spatial sampling
-        const T* offset_input_rois = input_rois + n * 4;
-        T roi_start_w = static_cast<T>(offset_input_rois[0]) * spatial_scale;
-        T roi_start_h = static_cast<T>(offset_input_rois[1]) * spatial_scale;
-        T roi_end_w = static_cast<T>(offset_input_rois[2]) * spatial_scale;
-        T roi_end_h = static_cast<T>(offset_input_rois[3]) * spatial_scale;
-
-        T roi_width = std::max(roi_end_w - roi_start_w, static_cast<T>(0.0));
-        T roi_height = std::max(roi_end_h - roi_start_h, static_cast<T>(0.0));
-
-        // Compute w and h at input feature map
-        T bin_size_h = roi_height / static_cast<T>(pooled_height);
-        T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-        T win_start_w = roi_start_w + bin_size_w * pw;
-        T win_start_h = roi_start_h + bin_size_h * ph;
-        T win_end_w = win_start_w + bin_size_w;
-        T win_end_h = win_start_h + bin_size_h;
-
-        T win_size = std::max(static_cast<T>(0.0), bin_size_w * bin_size_h);
-
-        T sum_out = win_size == static_cast<T>(0.)
-                        ? static_cast<T>(0.)
-                        : *offset_output_grad_data / win_size;
-
-        int s_w = std::floor(win_start_w);
-        int e_w = std::ceil(win_end_w);
-        int s_h = std::floor(win_start_h);
-        int e_h = std::ceil(win_end_h);
-
-        for (int w_iter = s_w; w_iter < e_w; ++w_iter) {
-          for (int h_iter = s_h; h_iter < e_h; ++h_iter) {
-            PrRoIPoolingMatDistributeDiff(
-                offset_input_grad_data, sum_out, h_iter, w_iter, h_iter + 1,
-                w_iter + 1, std::max(win_start_h, static_cast<T>(h_iter)),
-                std::max(win_start_w, static_cast<T>(w_iter)),
-                std::min(win_end_h,
-                         static_cast<T>(h_iter) + static_cast<T>(1.0)),
-                std::min(win_end_w,
-                         static_cast<T>(w_iter) + static_cast<T>(1.0)),
-                height, width, PrRoIPoolingDistributeDiff<T>);
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc
deleted file mode 100644
index dce9108eb17d76cfdf1c1b2313d975fd9fbdf9a7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/psroi_pool_op.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/psroi_pool_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor), "
-             "the input of PSROIPoolOp. "
-             "The format of input tensor is NCHW. Where N is the batch size, "
-             "C is the number of input channels, "
-             "H is the height of the input feature map, and "
-             "W is the width.");
-    AddInput("ROIs",
-             "(LoDTensor), "
-             "ROIs (Regions of Interest) to pool over. "
-             "should be a 2-D LoDTensor of shape (num_rois, 4) "
-             "given as [(x1, y1, x2, y2), ...]. "
-             "where (x1, y1) is the top left coordinates, and "
-             "(x2, y2) is the bottom right coordinates. "
-             "The roi batch index can be calculated from LoD.");
-    AddOutput("Out",
-              "(Tensor), "
-              "the output of PSROIPoolOp is a 4-D Tensor with shape "
-              "(num_rois, output_channels, pooled_h, pooled_w).");
-    AddAttr<int>(
-        "output_channels",
-        "(int), "
-        "the number of channels of the output feature map. "
-        "For a task of C classes of objects, output_channels should be "
-        "(C + 1) for classification only.");
-    AddAttr<float>("spatial_scale",
-                   "(float, default 1.0), "
-                   "Multiplicative spatial scale factor "
-                   "to translate ROI coords from their input scale "
-                   "to the scale used when pooling.")
-        .SetDefault(1.0);
-    AddAttr<int>("pooled_height",
-                 "(int, default 1), "
-                 "the pooled output height.")
-        .SetDefault(1);
-    AddAttr<int>("pooled_width",
-                 "(int, default 1), "
-                 "the pooled output width.")
-        .SetDefault(1);
-    AddComment(R"Doc(
-**PSROIPool Operator**
-
-Position sensitive region of interest pooling (also known as PSROIPooling) is to perform
-position-sensitive average pooling on regions of interest specified by input, takes as 
-input N position-sensitive score maps and a list of num_rois regions of interest. 
-
-PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details.
-    )Doc");
-  }
-};
-
-class PSROIPoolOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of PSROIPoolOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
-                   "Input(ROIs) of PSROIPoolOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of PSROIPoolOp should not be null.");
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-
-    PADDLE_ENFORCE(input_dims.size() == 4,
-                   "The format of input tensor is NCHW");
-    PADDLE_ENFORCE(rois_dims.size() == 2,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
-                   "given as [(x1, y1, x2, y2), ...]");
-    PADDLE_ENFORCE(rois_dims[1] == 4,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
-                   "given as [(x1, y1, x2, y2), ...]");
-
-    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
-    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
-    int output_channels = ctx->Attrs().Get<int>("output_channels");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-
-    PADDLE_ENFORCE(
-        input_dims[1] == output_channels * pooled_height * pooled_width,
-        "the channel of X(%d) should be equal to the product of "
-        "output_channels(%d), pooled_height(%d) and pooled_width(%d)",
-        input_dims[1], output_channels, pooled_height, pooled_width);
-
-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled output height must be greater than 0");
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      "The pooled output width must be greater than 0");
-    PADDLE_ENFORCE_GT(output_channels, 1,
-                      "The pooled output channels must greater than 1");
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      "The spatial scale must greater than 0.");
-
-    auto out_dims = input_dims;
-    out_dims[0] = rois_dims[0];
-    out_dims[1] =
-        output_channels;  // input_dims[1] / (pooled_height * pooled_width);
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-    ctx->SetOutputDim("Out", out_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class PSROIPoolGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "The gradient of Out should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "The gradient of X should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class PSROIPoolGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("psroi_pool_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("ROIs", Input("ROIs"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker,
-                  ops::PSROIPoolGradDescMaker);
-REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp);
-REGISTER_OP_CPU_KERNEL(
-    psroi_pool,
-    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    psroi_pool_grad,
-    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu
deleted file mode 100644
index 22fec3244fabe5ca466202784c0cce372d0bf6e5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/psroi_pool_op.cu
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/psroi_pool_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaximumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaximumNumBlocks);
-}
-
-template <typename T>
-__global__ void GPUPSROIPoolForward(
-    const int nthreads, const T* input_data, const T* input_rois,
-    const float spatial_scale, const int input_channels, const int height,
-    const int width, const int output_channels, const int pooled_height,
-    const int pooled_width, const int* rois_batch_id_data, T* output_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    // The output is in order (n, c, ph, pw)
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % output_channels;
-    int n = i / pooled_width / pooled_height / output_channels;
-
-    // set roi_batch_id
-    int roi_batch_id = rois_batch_id_data[n];
-
-    // [start, end) interval for spatial sampling
-    const T* offset_input_rois = input_rois + n * 4;
-    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
-    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
-    T roi_end_w =
-        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-    T roi_end_h =
-        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-
-    // Force too small ROIs to be 1x1
-    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
-    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);
-
-    // Compute w and h at input feature map
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
-    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
-    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
-    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
-
-    // Add roi offsets and clip to input boundaries
-    hstart = min(max(hstart, 0), height);
-    hend = min(max(hend, 0), height);
-    wstart = min(max(wstart, 0), width);
-    wend = min(max(wend, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-    const T* offset_input_data =
-        input_data +
-        (roi_batch_id * input_channels + input_channel) * height * width;
-    T outsum = 0;
-
-    for (int ih = hstart; ih < hend; ++ih) {
-      for (int iw = wstart; iw < wend; ++iw) {
-        int input_index = ih * width + iw;
-        outsum += offset_input_data[input_index];
-      }
-    }
-
-    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
-    output_data[i] = is_empty ? 0. : outsum / bin_area;
-  }
-}
-
-template <typename T>
-__global__ void GPUPSROIPoolBackward(
-    const int nthreads, const T* input_rois, const T* output_grad_data,
-    const float spatial_scale, const int input_channels, const int height,
-    const int width, const int output_channels, const int pooled_height,
-    const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (int i = index; i < nthreads; i += offset) {
-    // The output is in order (n, c, ph, pw)
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % output_channels;
-    int n = i / pooled_width / pooled_height / output_channels;
-
-    // set roi_batch_id
-    int roi_batch_id = rois_batch_id_data[n];
-    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-    int input_offset =
-        (roi_batch_id * input_channels + input_channel) * height * width;
-    T* offset_input_grad_data = input_grad_data + input_offset;
-
-    // [start, end) interval for spatial sampling
-    const T* offset_input_rois = input_rois + n * 4;
-    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
-    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
-    T roi_end_w =
-        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-    T roi_end_h =
-        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-
-    // Force too small ROIs to be 1x1
-    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
-    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);
-
-    // Compute w and h at input feature map
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
-    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
-    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
-    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
-
-    // Add roi offsets and clip to input boundaries
-    hstart = min(max(hstart, 0), height);
-    hend = min(max(hend, 0), height);
-    wstart = min(max(wstart, 0), width);
-    wend = min(max(wend, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    // Accumulate diff_val into input data
-    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
-    T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
-    for (int ih = hstart; ih < hend; ++ih) {
-      for (int iw = wstart; iw < wend; ++iw) {
-        int input_index = ih * width + iw;
-        platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val);
-      }
-    }
-  }
-}
-
-template <typename Place, typename T>
-class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto output_channels = ctx.Attr<int>("output_channels");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int input_channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-
-    PADDLE_ENFORCE_EQ(input_channels,
-                      output_channels * pooled_height * pooled_width,
-                      "the channels of input X should equal the product of "
-                      "output_channels x pooled_height x pooled_width");
-
-    int rois_num = rois->dims()[0];
-    if (rois_num == 0) return;
-
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "The rois_batch_size and input(X) batch_size must be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-
-    // set rois batch id
-    framework::Tensor rois_batch_id_list;
-    rois_batch_id_list.Resize({rois_num});
-    int* rois_batch_id_data =
-        rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        rois_batch_id_data[i] = n;
-      }
-    }
-
-    framework::Tensor rois_batch_id_list_gpu;
-    framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
-                          ctx.device_context(), &rois_batch_id_list_gpu);
-
-    int output_size = out->numel();
-    int blocks = NumBlocks(output_size);
-    int threads = kNumCUDAThreads;
-
-    // call cuda kernel function
-    GPUPSROIPoolForward<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        output_size, in->data<T>(), rois->data<T>(), spatial_scale,
-        input_channels, height, width, output_channels, pooled_height,
-        pooled_width, rois_batch_id_list_gpu.data<int>(),
-        out->mutable_data<T>(ctx.GetPlace()));
-  }
-};
-
-template <typename Place, typename T>
-class GPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto output_channels = ctx.Attr<int>("output_channels");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    int rois_num = rois->dims()[0];
-    int input_channels = in->dims()[1];
-    int height = in->dims()[2];
-    int width = in->dims()[3];
-
-    if (input_grad) {
-      // set roi batch id
-      framework::Tensor rois_batch_id_list;
-      rois_batch_id_list.Resize({rois_num});
-      int* rois_batch_id_data =
-          rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          rois_batch_id_data[i] = n;
-        }
-      }
-
-      framework::Tensor rois_batch_id_list_gpu;
-      framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
-                            ctx.device_context(), &rois_batch_id_list_gpu);
-
-      input_grad->mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<Place, T> set_zero;
-      set_zero(ctx.cuda_device_context(), input_grad, static_cast<T>(0));
-
-      int output_grad_size = output_grad->numel();
-      int blocks = NumBlocks(output_grad_size);
-      int threads = kNumCUDAThreads;
-
-      if (output_grad_size > 0) {
-        GPUPSROIPoolBackward<
-            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-            output_grad_size, rois->data<T>(), output_grad->data<T>(),
-            spatial_scale, input_channels, height, width, output_channels,
-            pooled_height, pooled_width, rois_batch_id_list_gpu.data<int>(),
-            input_grad->mutable_data<T>(ctx.GetPlace()));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    psroi_pool,
-    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    psroi_pool_grad,
-    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h
deleted file mode 100644
index 5666613f6efb99ec484d110857b8067a8f3b2ae5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/psroi_pool_op.h
+++ /dev/null
@@ -1,252 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class CPUPSROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto output_channels = ctx.Attr<int>("output_channels");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int input_channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    auto in_stride = framework::stride(in_dims);
-    auto out_stride = framework::stride(out->dims());
-
-    const T* input_data = in->data<T>();
-
-    framework::Tensor rois_batch_id_list;
-    rois_batch_id_list.Resize({rois_num});
-    int* rois_batch_id_data =
-        rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "the rois_batch_size and input(X) batch_size should be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num,
-                      "the rois_num from input and lod must be the same");
-
-    PADDLE_ENFORCE_EQ(input_channels,
-                      output_channels * pooled_height * pooled_width,
-                      "the channels of input X should equal the product of "
-                      "output_channels x pooled_height x pooled_width");
-
-    // calculate batch id index for each roi according to LoD
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        rois_batch_id_data[i] = n;
-      }
-    }
-
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    const T* input_rois = rois->data<T>();
-
-    // calculate psroipooling, parallel processing can be implemented per ROI
-    for (int n = 0; n < rois_num; ++n) {
-      // set roi batch id
-      int roi_batch_id = rois_batch_id_data[n];
-
-      // [start, end) interval for spatial sampling
-      const T* offset_input_rois = input_rois + n * 4;
-      T roi_start_w =
-          static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
-      T roi_start_h =
-          static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
-      T roi_end_w =
-          static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-      T roi_end_h =
-          static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-
-      // Force too small rois to be 1 x 1
-      T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
-      T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);
-
-      // Compute bin size w and h at input feature map
-      T bin_size_h = roi_height / static_cast<T>(pooled_height);
-      T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-      // calculate each pixel of the output feature map.
-      int out_roi_offset = n * out_stride[0];
-      for (int c = 0; c < output_channels; ++c) {
-        // per category
-        int out_plane_offset = out_roi_offset + c * out_stride[1];
-        for (int ph = 0; ph < pooled_height; ++ph) {
-          int out_row_offset = out_plane_offset + ph * out_stride[2];
-          for (int pw = 0; pw < pooled_width; ++pw) {
-            // calculate w and h at input feature map
-            int hstart = floor(static_cast<T>(ph) * bin_size_h + roi_start_h);
-            int wstart = floor(static_cast<T>(pw) * bin_size_w + roi_start_w);
-            int hend = ceil(static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
-            int wend = ceil(static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
-            //  Add roi offsets and clip to input boundaries
-            hstart = std::min(std::max(hstart, 0), height);
-            wstart = std::min(std::max(wstart, 0), width);
-            hend = std::min(std::max(hend, 0), height);
-            wend = std::min(std::max(wend, 0), width);
-
-            int output_index = out_row_offset + pw;
-            int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-            int input_plane_offset =
-                roi_batch_id * in_stride[0] + input_channel * in_stride[1];
-            const T* offset_input_data = input_data + input_plane_offset;
-            T out_sum = 0.;
-            bool is_empty = (hend <= hstart) || (wend <= wstart);
-            for (int ih = hstart; ih < hend; ++ih) {
-              for (int iw = wstart; iw < wend; ++iw) {
-                int input_index = ih * in_stride[2] + iw;
-                out_sum += offset_input_data[input_index];
-              }
-            }
-            T bin_area = (hend - hstart) * (wend - wstart);
-            output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
-          }
-        }
-      }
-    }
-    return;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* output_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* input_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto output_channels = ctx.Attr<int>("output_channels");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    if (input_grad) {
-      auto in_dims = in->dims();
-      int input_channels = in_dims[1];
-      int height = in_dims[2];
-      int width = in_dims[3];
-      int rois_num = rois->dims()[0];
-
-      // set roi batch id
-      framework::Tensor rois_batch_id_list;
-      rois_batch_id_list.Resize({rois_num});
-      int* rois_batch_id_data =
-          rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      // calculate batch id index for each roi according to LoD
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          rois_batch_id_data[i] = n;
-        }
-      }
-
-      const T* input_rois = rois->data<T>();
-      const T* output_grad_data = output_grad->data<T>();
-      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-
-      // set gradient of X to be 0. before backpropagate.
-      math::SetConstant<DeviceContext, T> set_zero;
-      set_zero(ctx.template device_context<DeviceContext>(), input_grad,
-               static_cast<T>(0));
-
-      // backpropagate gradient per output pixel
-      int output_grad_size = output_grad->numel();
-      for (int i = 0; i < output_grad_size; ++i) {
-        // The output is in order (n, c, ph, pw)
-        int pw = i % pooled_width;
-        int ph = (i / pooled_width) % pooled_height;
-        int c = (i / pooled_width / pooled_height) % output_channels;
-        int n = i / pooled_width / pooled_height / output_channels;
-
-        // set roi_batch_id
-        int roi_batch_id = rois_batch_id_data[n];
-        int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-        int input_offset =
-            (roi_batch_id * input_channels + input_channel) * height * width;
-        T* offset_input_grad_data = input_grad_data + input_offset;
-
-        // [start, end) interval for spatial sampling
-        const T* offset_input_rois = input_rois + n * 4;
-        T roi_start_w =
-            static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
-        T roi_start_h =
-            static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
-        T roi_end_w =
-            static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-        T roi_end_h =
-            static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-
-        // Force too small ROIs to be 1x1
-        T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
-        T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);
-
-        // Compute w and h at input feature map
-        T bin_size_h = roi_height / static_cast<T>(pooled_height);
-        T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-        int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
-        int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
-        int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
-        int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
-
-        // Add roi offsets and clip to input boundaries
-        hstart = std::min(std::max(hstart, 0), height);
-        hend = std::min(std::max(hend, 0), height);
-        wstart = std::min(std::max(wstart, 0), width);
-        wend = std::min(std::max(wend, 0), width);
-        bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-        // Accumulate diff_val into input data
-        T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
-        T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
-        for (int ih = hstart; ih < hend; ++ih) {
-          for (int iw = wstart; iw < wend; ++iw) {
-            int input_index = ih * width + iw;
-            offset_input_grad_data[input_index] += diff_val;
-          }
-        }
-      }
-    }
-    return;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc
deleted file mode 100644
index 8532649614c867a860774378e4ffd9b251dd76d5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pull_box_sparse_op.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/pull_box_sparse_op.h"
-
-namespace paddle {
-namespace operators {
-
-class PullBoxSparseOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_GE(ctx->Inputs("Ids").size(), 1UL,
-                      "Inputs(Ids) of PullBoxSparseOp should not be empty.");
-    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
-                      "Outputs(Out) of PullBoxSparseOp should not be empty.");
-    auto hidden_size = static_cast<int64_t>(ctx->Attrs().Get<int>("size"));
-    auto all_ids_dim = ctx->GetInputsDim("Ids");
-    const size_t n_ids = all_ids_dim.size();
-    std::vector<framework::DDim> outs_dims;
-    outs_dims.resize(n_ids);
-    for (size_t i = 0; i < n_ids; ++i) {
-      const auto ids_dims = all_ids_dim[i];
-      int ids_rank = ids_dims.size();
-      PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
-                        "Shape error in %lu id, the last dimension of the "
-                        "'Ids' tensor must be 1.",
-                        i);
-      auto out_dim = framework::vectorize(
-          framework::slice_ddim(ids_dims, 0, ids_rank - 1));
-      out_dim.push_back(hidden_size);
-      outs_dims[i] = framework::make_ddim(out_dim);
-    }
-    ctx->SetOutputsDim("Out", outs_dims);
-    for (size_t i = 0; i < n_ids; ++i) {
-      ctx->ShareLoD("Ids", "Out", i, i);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(framework::proto::VarType::FP32,
-                                   ctx.device_context());
-  }
-};
-
-class PullBoxSparseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ids",
-             "Input tensors with type int32 or int64 "
-             "contains the ids to be looked up in BoxPS. "
-             "The last dimension size must be 1.")
-        .AsDuplicable();
-    AddOutput("Out", "The lookup results tensors.").AsDuplicable();
-    AddAttr<int>("size", "(int, the embedding hidden size").SetDefault(1);
-    AddComment(R"DOC(
-Pull Box Sparse Operator.
-
-This operator is used to perform lookups on the BoxPS,
-then concatenated into a dense tensor.
-
-The input Ids can carry the LoD (Level of Details) information,
-or not. And the output only shares the LoD information with input Ids.
-
-)DOC");
-  }
-};
-
-class PushBoxSparseOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("push_box_sparse");
-    op->SetInput("Ids", Input("Ids"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class PushBoxSparseOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.MultiInput<framework::Tensor>(framework::GradVarName("Out"))[0]
-            ->type(),
-        ctx.device_context());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(pull_box_sparse, ops::PullBoxSparseOp,
-                  ops::PullBoxSparseOpMaker, ops::PushBoxSparseOpDescMaker);
-REGISTER_OPERATOR(push_box_sparse, ops::PushBoxSparseOp);
-REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseCPUKernel<float>)
-REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseCPUKernel<float>)
diff --git a/paddle/fluid/operators/pull_box_sparse_op.cu b/paddle/fluid/operators/pull_box_sparse_op.cu
deleted file mode 100644
index 8bba9db5426b7055dce03ee2f5e87c11a38aef1b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pull_box_sparse_op.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/pull_box_sparse_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-class PullBoxSparseCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PullBoxSparseFunctor<T>(ctx);
-  }
-};
-
-template <typename T>
-class PushBoxSparseCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PushBoxSparseFunctor<T>(ctx);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseCUDAKernel<float>)
-REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseCUDAKernel<float>)
diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h
deleted file mode 100644
index 48a9e4d9313640b90d1ba7278703a217e31feb46..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pull_box_sparse_op.h
+++ /dev/null
@@ -1,90 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/fleet/box_wrapper.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) {
-  auto inputs = ctx.MultiInput<framework::Tensor>("Ids");
-  auto outputs = ctx.MultiOutput<framework::Tensor>("Out");
-  auto hidden_size = ctx.Attr<int>("size");
-  const auto slot_size = inputs.size();
-  std::vector<const uint64_t *> all_keys(slot_size);
-  // BoxPS only supports float now
-  std::vector<float *> all_values(slot_size);
-  std::vector<int64_t> slot_lengths(slot_size);
-  for (size_t i = 0; i < slot_size; i++) {
-    const auto *slot = inputs[i];
-    const uint64_t *single_slot_keys =
-        reinterpret_cast<const uint64_t *>(slot->data<int64_t>());
-    all_keys[i] = single_slot_keys;
-    slot_lengths[i] = slot->numel();
-    auto *output = outputs[i]->mutable_data<T>(ctx.GetPlace());
-    all_values[i] = output;
-  }
-  auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
-  box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths,
-                      hidden_size);
-}
-
-template <typename T>
-static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) {
-  auto inputs = ctx.MultiInput<framework::Tensor>("Ids");
-  auto d_output =
-      ctx.MultiInput<framework::Tensor>(framework::GradVarName("Out"));
-  auto hidden_size = ctx.Attr<int>("size");
-  const auto slot_size = inputs.size();
-  std::vector<const uint64_t *> all_keys(slot_size);
-  std::vector<const float *> all_grad_values(slot_size);
-  std::vector<int64_t> slot_lengths(slot_size);
-  for (size_t i = 0; i < slot_size; i++) {
-    const auto *slot = inputs[i];
-    const uint64_t *single_slot_keys =
-        reinterpret_cast<const uint64_t *>(slot->data<int64_t>());
-    all_keys[i] = single_slot_keys;
-    slot_lengths[i] = slot->numel();
-    const float *grad_value = d_output[i]->data<float>();
-    all_grad_values[i] = grad_value;
-  }
-  auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
-  box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values,
-                          slot_lengths, hidden_size);
-}
-
-using LoDTensor = framework::LoDTensor;
-template <typename T>
-class PullBoxSparseCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PullBoxSparseFunctor<T>(ctx);
-  }
-};
-
-template <typename T>
-class PushBoxSparseCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PushBoxSparseFunctor<T>(ctx);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
deleted file mode 100644
index 5300e807472d3bb243dc198c0bfd1bc572538015..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/py_func_op.cc
+++ /dev/null
@@ -1,312 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/py_func_op.h"
-
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-namespace py = ::pybind11;
-
-static std::vector<py::object> g_py_callables;
-
-const char kForwardPythonCallableId[] = "forward_callable_id";
-const char kBackwardPythonCallableId[] = "backward_callable_id";
-const char kPyFuncBackwardSkipVars[] = "backward_skip_vars";
-
-size_t AppendPythonCallableObjectAndReturnId(const py::object &py_obj) {
-  g_py_callables.emplace_back(py_obj);
-  return g_py_callables.size() - 1;
-}
-
-// Return py::object* instead of py::object
-// Returning py::object would cause reference count increasing
-// but without GIL, reference count in Python may not be safe
-static py::object *GetPythonCallableObject(size_t i) {
-  PADDLE_ENFORCE_LT(i, g_py_callables.size(), "Invalid python callable id");
-  return &g_py_callables[i];
-}
-
-static std::string PythonFuncDebugString(const py::object &py_callable) {
-  py::gil_scoped_acquire guard;
-  std::string wrapper_func_str = py::str(py_callable);
-  auto inner_func = py_callable.attr("_func");
-  std::string inner_func_str = py::str(inner_func);
-  return inner_func_str + " wrapped by " + wrapper_func_str;
-}
-
-static void CallPythonFunc(py::object *callable,
-                           const std::vector<framework::LoDTensor> &ins,
-                           std::vector<framework::LoDTensor *> *outs) {
-  py::gil_scoped_acquire guard;
-  py::tuple in_args(ins.size());
-  for (size_t i = 0; i < ins.size(); ++i) {
-    in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr);
-  }
-
-  auto ret = (*callable)(*in_args);
-  auto ret_tuple = py::cast<py::tuple>(ret);
-  size_t ret_num = py::len(ret_tuple);
-  size_t out_num = outs->size();
-  if (UNLIKELY(ret_num != out_num)) {
-    // Python function has no return values or returns None
-    // In this case, ret_num = 1 && ret[0] == None && out_num should be 0
-    // Otherwise, ret_num must be equal to out_num
-    PADDLE_ENFORCE(
-        ret_num == 1 && out_num == 0 &&
-            py::cast<framework::LoDTensor *>(ret_tuple[0]) == nullptr,
-        "Output number not match. Expected %d, actual %d", out_num, ret_num);
-  }
-
-  for (size_t i = 0; i < out_num; ++i) {
-    auto *out = (*outs)[i];
-    if (out == nullptr) {
-      continue;
-    }
-    try {
-      auto *py_out_tensor = py::cast<framework::LoDTensor *>(ret_tuple[i]);
-      PADDLE_ENFORCE_NOT_NULL(py_out_tensor,
-                              "Output tensor %d should not be nullptr", i);
-      out->set_lod(py_out_tensor->lod());
-      out->ShareDataWith(*py_out_tensor);
-    } catch (py::cast_error &) {
-      PADDLE_THROW("The %d-th output must be LoDTensor", i);
-    }
-  }
-}
-
-class PyFuncOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    bool has_out = (ctx->HasOutput("Out") && !ctx->Output("Out").empty());
-
-    bool has_in = (ctx->HasInput("X") && !ctx->Input("X").empty());
-
-    /**
-     * X or Out can be empty, so that py_func can be more flexible
-     * to support Python functions with no input or no output
-     */
-    PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist");
-
-    PADDLE_ENFORCE_GE(boost::get<int>(ctx->GetAttr(kForwardPythonCallableId)),
-                      0, "Function id cannot be less than 0");
-
-    if (!has_out) return;
-
-    /**
-     * Traverse all outputs, check if name of any output ends with @GRAD.
-     * If found, set its shape, dtype, lod_level, type to be the same as
-     * the corresponding forward variable
-     */
-    const std::string kGradVarSuffix = framework::kGradVarSuffix;
-    auto &out_var_names = ctx->Output("Out");
-    for (auto &out_var_name : out_var_names) {
-      if (out_var_name == framework::kEmptyVarName ||
-          out_var_name.size() < kGradVarSuffix.size()) {
-        continue;
-      }
-
-      size_t len = out_var_name.size() - kGradVarSuffix.size();
-      if (out_var_name.substr(len) == kGradVarSuffix) {
-        auto fwd_var_name = out_var_name.substr(0, len);
-        PADDLE_ENFORCE(ctx->HasVar(out_var_name),
-                       "Backward variable %s not found", out_var_name);
-        PADDLE_ENFORCE(ctx->HasVar(fwd_var_name),
-                       "Backward variable %s not found", fwd_var_name);
-        VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input("
-                 << fwd_var_name << ")";
-
-        ctx->SetShape(out_var_name, ctx->GetShape(fwd_var_name));
-        ctx->SetDataType(out_var_name, ctx->GetDataType(fwd_var_name));
-        ctx->SetLoDLevel(out_var_name, ctx->GetLoDLevel(fwd_var_name));
-        ctx->SetType(out_var_name, ctx->GetType(fwd_var_name));
-      }
-    }
-  }
-};
-
-class PyFuncOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(!ctx->IsRuntime(),
-                   "Infer shape cannot be called in runtime.");
-  }
-};
-
-class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Inputs of py_func op.").AsDuplicable();
-    AddOutput("Out", "Outputs of py_func op").AsDuplicable();
-    AddAttr<int>(kForwardPythonCallableId,
-                 "Index of registered forward Python function.")
-        .SetDefault(0);
-    AddAttr<int>(kBackwardPythonCallableId,
-                 "Index of registered backward Python function.")
-        .SetDefault(-1);
-    AddAttr<std::vector<std::string>>(kPyFuncBackwardSkipVars,
-                                      "Unused forward in/out in backward op")
-        .SetDefault(std::vector<std::string>());
-    AddComment(R"DOC("PyFunc Op")DOC");
-  }
-};
-
-/**
- * There are several benefits when backward op of py_func op is
- * still py_func op.
- *
- *  - Less codes are needed, since codes of backward is almost
- *    the same as forward.
- *
- *  - To support high order derivative, so that py_func is
- *    infinite-order differentiable
- */
-class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase {
- private:
-  static std::string DebugString(const std::vector<std::string> &strs) {
-    if (strs.empty()) return "";
-    std::string ret = strs[0];
-    for (size_t i = 1; i < strs.size(); ++i) {
-      ret += " ";
-      ret += strs[i];
-    }
-    return ret;
-  }
-
- public:
-  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
-
-  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
-    auto &fwd_attrs = Attrs();
-    // no backward op when backward_id is less than 0
-    if (boost::get<int>(fwd_attrs.at(kBackwardPythonCallableId)) < 0) {
-      return {};
-    }
-
-    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
-    grad_op->SetType("py_func");
-
-    framework::AttributeMap bwd_attrs;
-    bwd_attrs[kForwardPythonCallableId] =
-        fwd_attrs.at(kBackwardPythonCallableId);
-    bwd_attrs[kBackwardPythonCallableId] = -1;
-    grad_op->SetAttrMap(bwd_attrs);
-
-    // All forward inputs
-    auto fwd_ins = Input("X");
-    // All forward outputs
-    auto fwd_outs = Output("Out");
-
-    // For memory reused, some inputs/output in forward part may be not needed
-    // in backward part. Skipping these vars helps to save memory
-    auto &backward_skip_var_list = boost::get<std::vector<std::string>>(
-        fwd_attrs.at(kPyFuncBackwardSkipVars));
-    std::unordered_set<std::string> backward_skip_var_set(
-        backward_skip_var_list.begin(), backward_skip_var_list.end());
-    std::vector<std::string> bwd_ins;
-    bwd_ins.reserve(fwd_ins.size() + fwd_outs.size());
-    for (auto &fwd_in : fwd_ins) {
-      if (backward_skip_var_set.count(fwd_in) == 0) {
-        bwd_ins.emplace_back(fwd_in);
-      }
-    }
-
-    for (auto &fwd_out : fwd_outs) {
-      if (backward_skip_var_set.count(fwd_out) == 0) {
-        bwd_ins.emplace_back(fwd_out);
-      }
-    }
-
-    // Backward OG cannot be skipped
-    // But in Python side, if OG is kEmptyVarName, input tensor would be None
-    auto fwd_out_grads = OutputGrad("Out");
-    bwd_ins.reserve(bwd_ins.size() + fwd_out_grads.size());
-    bwd_ins.insert(bwd_ins.end(), fwd_out_grads.begin(), fwd_out_grads.end());
-
-    // Backward IG cannot be skipped
-    // But in Python side, if IG is not needed, users can just return None
-    auto bwd_outs = InputGrad("X", false);
-
-    VLOG(10) << "PyFunc Grad Input: " << DebugString(bwd_ins);
-    VLOG(10) << "PyFunc Grad Output: " << DebugString(bwd_outs);
-
-    grad_op->SetInput("X", bwd_ins);
-    grad_op->SetOutput("Out", bwd_outs);
-
-    std::vector<std::unique_ptr<framework::OpDesc>> ret(1);
-    ret[0] = std::move(grad_op);
-    return ret;
-  }
-};
-
-class PyFuncOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- protected:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto &in_arg_names = Inputs("X");
-    auto &out_arg_names = Outputs("Out");
-
-    std::vector<framework::LoDTensor> inputs(in_arg_names.size());
-    for (size_t i = 0; i < in_arg_names.size(); ++i) {
-      auto in_var = scope.FindVar(in_arg_names[i]);
-      // When py_func op is called in backward, in_var may be null
-      if (in_var == nullptr) {
-        continue;
-      }
-      auto &in_tensor = in_var->Get<framework::LoDTensor>();
-      if (!in_tensor.IsInitialized()) {
-        continue;
-      }
-      if (platform::is_gpu_place(in_tensor.place())) {
-        framework::TensorCopySync(in_tensor, platform::CPUPlace(), &inputs[i]);
-      } else {
-        inputs[i].ShareDataWith(in_tensor);
-      }
-      inputs[i].set_lod(in_tensor.lod());
-    }
-
-    std::vector<framework::LoDTensor *> outputs(out_arg_names.size());
-    for (size_t i = 0; i < out_arg_names.size(); ++i) {
-      auto *out_var = scope.FindVar(out_arg_names[i]);
-      outputs[i] =
-          out_var ? out_var->GetMutable<framework::LoDTensor>() : nullptr;
-    }
-
-    auto callable_id = static_cast<size_t>(Attr<int>(kForwardPythonCallableId));
-    auto *py_callable = GetPythonCallableObject(callable_id);
-    VLOG(10) << "Call Python function with id " << callable_id << ": "
-             << PythonFuncDebugString(*py_callable);
-    CallPythonFunc(py_callable, inputs, &outputs);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker,
-                  ops::PyFuncOpVarTypeInference, ops::PyFuncOpShapeInference,
-                  ops::PyFuncOpGradDescMaker);
diff --git a/paddle/fluid/operators/py_func_op.h b/paddle/fluid/operators/py_func_op.h
deleted file mode 100644
index 5cebcd8dc0d2dcd537c7d2ce48508a6a5ee6a77d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/py_func_op.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/python_headers.h"
-
-namespace paddle {
-namespace operators {
-
-size_t AppendPythonCallableObjectAndReturnId(const ::pybind11::object &py_obj);
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc
deleted file mode 100644
index d8e20f4c4ae6059551bfff3603a2ad6c0a7aa86d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/quantize_op.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- *     Unless required by applicable law or agreed to in writing, software
- *     distributed under the License is distributed on an "AS IS" BASIS,
- *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *     See the License for the specific language governing permissions and
- *     limitations under the License. */
-
-#include "paddle/fluid/operators/quantize_op.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-framework::OpKernelType QuantOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
-  framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;
-
-  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                 ctx.GetPlace(), layout_, library_);
-}
-
-void QuantOpMaker::Make() {
-  AddInput("Input", "input data");
-  AddOutput("Output", "output data");
-  AddAttr<bool>("is_negative_input",
-                "(bool, default false) Only used in mkldnn INT8 kernel")
-      .SetDefault(false);
-  AddAttr<float>("Scale", "scale data").SetDefault({1.0f});
-  AddComment(R"DOC(This op will quantize data from FP32 to INT8)DOC");
-}
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker);
diff --git a/paddle/fluid/operators/quantize_op.h b/paddle/fluid/operators/quantize_op.h
deleted file mode 100644
index 091306e4637c7e2393b6736f0e1edf9dd7fd2c8a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/quantize_op.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::OpKernelType;
-using framework::Tensor;
-
-class QuantOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
-    ctx->ShareLoD("Input", /*->*/ "Output");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class QuantOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc
deleted file mode 100644
index 65a8d603fcee27223182984769df221e3f519b05..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/random_crop_op.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/random_crop_op.h"
-
-namespace paddle {
-namespace operators {
-
-class RandomCropOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "A batch of instances to random crop.");
-    AddInput("Seed", "The random seed.");
-    AddOutput("Out", "The cropped instance batch.");
-    AddOutput("SeedOut", "The random seed after random cropping.")
-        .AsIntermediate();
-    AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
-    AddAttr<int>("startup_seed",
-                 "If the input 'Seed' is not initialized, the 'startup_seed' "
-                 "will be used to replace it. Even so, the seed after random "
-                 "crop will also be outputed to the 'SeedOut'.")
-        .SetDefault(0);
-    AddComment(R"DOC(
-      This operator takes a batch of instance, and do random cropping on each instance.
-      It means that cropping positions differs on each instance, which is determined
-      by an uniform random generator. All cropped instances have the same shape, which 
-      is determined by the operator's attribute 'shape'.
-    )DOC");
-  }
-};
-
-class RandomCropOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    auto x_dim = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));
-    auto out_dim = framework::vectorize<int>(x_dim);
-    for (size_t i = 1; i <= shape.size(); ++i) {
-      size_t x_i = x_dim.size() - i;
-      size_t shape_i = shape.size() - i;
-      if (ctx->IsRuntime() || (x_dim[x_i] > 0 && shape[shape_i] > 0)) {
-        PADDLE_ENFORCE_GE(x_dim[x_i], shape[shape_i]);
-      }
-      out_dim[x_i] = shape[shape_i];
-    }
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace f = paddle::framework;
-REGISTER_OPERATOR(random_crop, ops::RandomCropOp, ops::RandomCropOpMaker,
-                  ops::RandomCropOpInferShape, f::EmptyGradOpMaker);
-
-template <typename T>
-using Kernel = ops::RandomCropKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(random_crop, Kernel<float>, Kernel<int>, Kernel<double>,
-                       Kernel<uint8_t>, Kernel<int16_t>);
diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/fluid/operators/random_crop_op.cu
deleted file mode 100644
index 6fc9bedc55b4d349ddf3d109c7f9049113235f0c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/random_crop_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/random_crop_op.h"
-
-namespace ops = paddle::operators;
-template <typename T>
-using Kernel = ops::RandomCropKernel<paddle::platform::CUDADeviceContext, T>;
-REGISTER_OP_CUDA_KERNEL(random_crop, Kernel<float>, Kernel<int>, Kernel<double>,
-                        Kernel<uint8_t>, Kernel<int16_t>);
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
deleted file mode 100644
index ae58358cbb202c229cb8a96e20e4c48d926c5bf3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/random_crop_op.h
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/for_range.h"
-#ifdef PADDLE_WITH_CUDA
-#include <thrust/random.h>
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext>
-struct Random;
-
-template <>
-struct Random<platform::CPUDeviceContext> {
-  using Engine = std::minstd_rand;
-
-  template <typename T>
-  using UniformIntDist = std::uniform_int_distribution<T>;
-};
-
-#ifdef PADDLE_WITH_CUDA
-template <>
-struct Random<platform::CUDADeviceContext> {
-  using Engine = thrust::minstd_rand;
-
-  template <typename T>
-  using UniformIntDist = thrust::uniform_int_distribution<T>;
-};
-#endif
-
-template <typename T>
-HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out,
-                                     const size_t* out_dims, int i, int rank,
-                                     size_t prod_x_remain,
-                                     size_t prod_out_remain,
-                                     const size_t* offsets) {
-  size_t x_dim_i = x_dims[i];
-  size_t out_dim_i = out_dims[i];
-  size_t x_stride = prod_x_remain / x_dim_i;
-  size_t out_stride = prod_out_remain / out_dim_i;
-  size_t offset_i = offsets[i];
-
-  if (i == rank - 1) {
-    PADDLE_ENFORCE(x_stride == 1,
-                   "When i:%d == rank:%d - 1, x_stride of random_crop_op "
-                   "expected to be 1, but got %ld. Please check input "
-                   "value.",
-                   i, rank, x_stride);
-    PADDLE_ENFORCE(out_stride == 1,
-                   "When i:%d == rank:%d - 1, out_stride of random_crop_op "
-                   "expected to be 1, but got %ld. Please check input "
-                   "value.",
-                   i, rank, out_stride);
-    x += offset_i;
-    for (size_t j = 0; j < out_dim_i; ++j) {
-      *out++ = *x++;
-    }
-  } else {
-    x += offset_i * x_stride;
-    for (size_t j = 0; j < out_dim_i; ++j) {
-      StridedMemcpy<T>(x, x_dims, out, out_dims, i + 1, rank, x_stride,
-                       out_stride, offsets);
-      x += x_stride;
-      out += out_stride;
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-struct RandomCropFunctor {
-  const T* x_;
-  T* out_;
-  size_t x_dims_[9];
-  size_t out_dims_[9];
-  int num_batchsize_dims_;
-  int rank_;
-  int64_t seed_;
-
-  size_t prod_batchsize_dims_;
-  size_t prod_x_ins_dims_;
-  size_t prod_out_ins_dims_;
-
-  RandomCropFunctor(const T* x, T* out, const framework::DDim& x_dims,
-                    const framework::DDim& out_dims, int num_batchsize_dims,
-                    int64_t seed)
-      : x_(x),
-        out_(out),
-        num_batchsize_dims_(num_batchsize_dims),
-        rank_(x_dims.size()),
-        seed_(seed) {
-    PADDLE_ENFORCE_EQ(x_dims.size(), out_dims.size());
-    PADDLE_ENFORCE_GT(rank_, num_batchsize_dims_);
-    prod_batchsize_dims_ = 1;
-    prod_x_ins_dims_ = 1;
-    prod_out_ins_dims_ = 1;
-    for (size_t i = 0; i < static_cast<size_t>(rank_); ++i) {
-      size_t x_dim_i = x_dims[i];
-      size_t out_dim_i = out_dims[i];
-      x_dims_[i] = x_dim_i;
-      out_dims_[i] = out_dim_i;
-      if (i < static_cast<size_t>(num_batchsize_dims_)) {
-        PADDLE_ENFORCE_EQ(x_dim_i, out_dim_i);
-        prod_batchsize_dims_ *= x_dim_i;
-      } else {
-        prod_x_ins_dims_ *= x_dim_i;
-        prod_out_ins_dims_ *= out_dim_i;
-      }
-    }
-  }
-
-  HOSTDEVICE void operator()(size_t ins_idx) {
-    typename Random<DeviceContext>::Engine engine(seed_);
-    engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
-    size_t offsets[9] = {};
-    for (int i = num_batchsize_dims_; i < rank_; ++i) {
-      typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
-          0, x_dims_[i] - out_dims_[i]);
-      offsets[i - num_batchsize_dims_] = dist(engine);
-    }
-
-    const T* x = x_ + ins_idx * prod_x_ins_dims_;
-    T* out = out_ + ins_idx * prod_out_ins_dims_;
-
-    StridedMemcpy<T>(x, x_dims_ + num_batchsize_dims_, out,
-                     out_dims_ + num_batchsize_dims_, 0,
-                     rank_ - num_batchsize_dims_, prod_x_ins_dims_,
-                     prod_out_ins_dims_, offsets);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class RandomCropKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    int64_t seed = 0;
-    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
-    if (seed_tensor.IsInitialized()) {
-      if (platform::is_cpu_place(seed_tensor.place())) {
-        seed = *seed_tensor.data<int64_t>();
-      } else {
-        LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
-                        "your program";
-        framework::LoDTensor cpu_seed;
-        framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
-        seed = *cpu_seed.data<int64_t>();
-      }
-    } else {
-      VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
-                 "'startup_seed' instead.";
-      seed = ctx.Attr<int>("startup_seed");
-    }
-    auto shape = ctx.Attr<std::vector<int>>("shape");
-    auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
-    auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out"));
-
-    int num_batchsize_dims = x.dims().size() - shape.size();
-    RandomCropFunctor<DeviceContext, T> functor(
-        x.data<T>(), out.mutable_data<T>(ctx.GetPlace()), x.dims(), out.dims(),
-        num_batchsize_dims, seed);
-    platform::ForRange<DeviceContext> for_range(
-        ctx.template device_context<DeviceContext>(),
-        functor.prod_batchsize_dims_);
-
-    for_range(functor);
-
-    Random<platform::CPUDeviceContext>::Engine engine(seed);
-    engine.discard(functor.prod_batchsize_dims_ *
-                   (functor.rank_ - functor.num_batchsize_dims_));
-    *ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
-        framework::make_ddim({1}), platform::CPUPlace()) = engine();
-  }
-};
-
-// TODO(fengjiayi): Backward of random crop op
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc
deleted file mode 100644
index ee8c68fd008c8c9764e9ef74dc37fa08cf31be19..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/range_op.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/range_op.h"
-
-namespace paddle {
-namespace operators {
-
-class RangeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    if (ctx->HasInput("Start")) {
-      auto s_dims = ctx->GetInputDim("Start");
-      PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1),
-                     "The shape of Input(Start) should be [1].");
-    }
-    if (ctx->HasInput("End")) {
-      auto e_dims = ctx->GetInputDim("End");
-      PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1),
-                     "The shape of Input(End) should be [1].");
-    }
-    if (ctx->HasInput("Step")) {
-      auto step_dims = ctx->GetInputDim("Step");
-      PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1),
-                     "The shape of Input(Step) should be [1].");
-    }
-    ctx->SetOutputDim("Out", {-1});
-  }
-};
-
-class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Start",
-             "Start of interval. The interval includes this value. It is a "
-             "tensor with shape=[1].");
-    AddInput("End",
-             "End of interval. The interval does not include this value, "
-             "except in some cases where step is not an integer and floating "
-             "point round-off affects the length of out. It is a tensor with "
-             "shape=[1].");
-    AddInput("Step", "Spacing between values. It is a tensor with shape=[1].");
-    AddOutput("Out", "A sequence of numbers.");
-    AddComment(R"DOC(
-    Return evenly spaced values within a given interval. Values are generated within the half-open interval [start, stop) (in other words, the interval including start but excluding stop). Like arange function of numpy.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker);
-REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel<int>,
-                       ops::CPURangeKernel<float>, ops::CPURangeKernel<double>,
-                       ops::CPURangeKernel<int64_t>);
diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu
deleted file mode 100644
index e2c03716d55ee41ce3a9053b48b5c6d4c70e391f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/range_op.cu
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/range_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-template <typename T>
-__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
-  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
-}
-
-template <typename T>
-class CUDARangeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* start_t = context.Input<framework::Tensor>("Start");
-    auto* end_t = context.Input<framework::Tensor>("End");
-    auto* step_t = context.Input<framework::Tensor>("Step");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    framework::Tensor n;
-    framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
-    T start = n.data<T>()[0];
-    framework::TensorCopy(*end_t, platform::CPUPlace(), &n);
-    T end = n.data<T>()[0];
-    framework::TensorCopy(*step_t, platform::CPUPlace(), &n);
-    T step = n.data<T>()[0];
-
-    int64_t size = 0;
-    GetSize(start, end, step, &size);
-    out->Resize(framework::make_ddim({size}));
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    auto stream = context.cuda_device_context().stream();
-    int block = 512;
-    int grid = (size + block - 1) / block;
-    RangeKernel<T><<<grid, block, 0, stream>>>(start, step, size, out_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel<int>,
-                        ops::CUDARangeKernel<int64_t>,
-                        ops::CUDARangeKernel<float>,
-                        ops::CUDARangeKernel<double>);
diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h
deleted file mode 100644
index fce58b45c96ad76dfdd4ed7f54becde327070002..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/range_op.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <functional>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-void GetSize(T start, T end, T step, int64_t* size) {
-  PADDLE_ENFORCE(!std::equal_to<T>()(step, 0),
-                 "The step of range op should not be 0.");
-  PADDLE_ENFORCE(((start < end) && (step > 0)) || ((start > end) && (step < 0)),
-                 "The step should be greater than 0 while start < end. And the "
-                 "step should be less than 0 while start > end.");
-  *size = std::is_integral<T>::value
-              ? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step))
-              : std::ceil(std::abs((end - start) / step));
-}
-
-template <typename T>
-class CPURangeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
-    T end = context.Input<framework::Tensor>("End")->data<T>()[0];
-    T step = context.Input<framework::Tensor>("Step")->data<T>()[0];
-    auto* out = context.Output<framework::Tensor>("Out");
-    int64_t size = 0;
-    GetSize(start, end, step, &size);
-    out->Resize(framework::make_ddim({size}));
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    T value = start;
-    for (int64_t i = 0; i < size; ++i) {
-      out_data[i] = value;
-      value += step;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
deleted file mode 100644
index 45daa6b955639e3695211c1032869c743ede9b2c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/rank_loss_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class RankLossOp : public framework::OperatorWithKernel {
- public:
-  RankLossOp(const std::string &type, const framework::VariableNameMap &inputs,
-             const framework::VariableNameMap &outputs,
-             const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    // input check
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");
-
-    auto label_dims = ctx->GetInputDim("Label");
-    auto left_dims = ctx->GetInputDim("Left");
-    auto right_dims = ctx->GetInputDim("Right");
-
-    PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims),
-                   "All inputs must have the same size.");
-    PADDLE_ENFORCE(
-        (label_dims.size() == 2) && (label_dims[1] == 1),
-        "All inputs must be 2-D tensors with shape [batch_size x 1].");
-    ctx->SetOutputDim("Out", label_dims);
-  }
-};
-
-class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Label",
-             "(2-D Tensor with shape [batch_size x 1]) "
-             "The label indicating A ranked higher than B or not.");
-    AddInput("Left",
-             "(2-D Tensor with shape [batch_size x 1]) "
-             "The output of RankNet for doc A.");
-    AddInput("Right",
-             "(2-D Tensor with shape [batch_size x 1]) "
-             "The output of RankNet for doc B.");
-    AddOutput("Out",
-              "(2-D Tensor with shape [batch_size x 1]) "
-              "The output loss of RankLoss operator.");
-    AddComment(R"DOC(
-RankLoss Operator.
-
-RankLoss operator for RankNet
-(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). 
-RankNet is a pairwise ranking model with
-one training sample consisting of a pair of doc A and B, and the label P
-indicating that A is ranked higher than B or not:
-
-P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of
-the input pair.
-
-The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
-(P_{i,j}), which represent the output score of RankNet for the two docs and 
-the label respectively, and yields the rank loss C_{i,j} using the following 
-equation:
-
-$$
-  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
-  o_{i,j} =  o_i - o_j  \\
-  \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
-$$
-
-The operator can take batch inputs with size batch_size (batch_size >= 1).
-
-)DOC");
-  }
-};
-
-class RankLossGradOp : public framework::OperatorWithKernel {
- public:
-  RankLossGradOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) shouldn't be null.");
-    auto dims = ctx->GetInputDim("Left");
-    auto left_grad_name = framework::GradVarName("Left");
-    auto right_grad_name = framework::GradVarName("Right");
-
-    if (ctx->HasOutput(left_grad_name)) {
-      ctx->SetOutputDim(left_grad_name, dims);
-    }
-
-    if (ctx->HasOutput(right_grad_name)) {
-      ctx->SetOutputDim(right_grad_name, dims);
-    }
-  }
-};
-
-class RankLossGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("rank_loss_grad");
-    op->SetInput("Label", Input("Label"));
-    op->SetInput("Left", Input("Left"));
-    op->SetInput("Right", Input("Right"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("Left"), InputGrad("Left"));
-    op->SetOutput(framework::GradVarName("Right"), InputGrad("Right"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(rank_loss, ops::RankLossOp, ops::RankLossOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp);
-REGISTER_OP_CPU_KERNEL(
-    rank_loss, ops::RankLossKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    rank_loss_grad,
-    ops::RankLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/rank_loss_op.cu b/paddle/fluid/operators/rank_loss_op.cu
deleted file mode 100644
index ed805279892d0f045fdde94b30c9bc7b19348a9a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/rank_loss_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/rank_loss_op.h"
-
-REGISTER_OP_CUDA_KERNEL(rank_loss,
-                        paddle::operators::RankLossKernel<
-                            paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(rank_loss_grad,
-                        paddle::operators::RankLossGradKernel<
-                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h
deleted file mode 100644
index 28626c0e2e697cef29db3b969fa55a1da78dd8a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/rank_loss_op.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class RankLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out_t = ctx.Output<framework::Tensor>("Out");
-    auto* label_t = ctx.Input<framework::Tensor>("Label");
-    auto* left_t = ctx.Input<framework::Tensor>("Left");
-    auto* right_t = ctx.Input<framework::Tensor>("Right");
-    out_t->mutable_data<T>(ctx.GetPlace());
-
-    auto out = framework::EigenVector<T>::Flatten(*out_t);
-    auto label = framework::EigenVector<T>::Flatten(*label_t);
-    auto left = framework::EigenVector<T>::Flatten(*left_t);
-    auto right = framework::EigenVector<T>::Flatten(*right_t);
-
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    out.device(dev) =
-        (1. + (left - right).exp()).log() - label * (left - right);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class RankLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_left_t =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Left"));
-    auto* d_right_t =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Right"));
-
-    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* label_t = ctx.Input<framework::Tensor>("Label");
-    auto* left_t = ctx.Input<framework::Tensor>("Left");
-    auto* right_t = ctx.Input<framework::Tensor>("Right");
-
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
-    auto label = framework::EigenVector<T>::Flatten(*label_t);
-    auto left = framework::EigenVector<T>::Flatten(*left_t);
-    auto right = framework::EigenVector<T>::Flatten(*right_t);
-
-    // compute d_left
-    if (d_left_t) {
-      d_left_t->mutable_data<T>(ctx.GetPlace());
-      auto d_left = framework::EigenVector<T>::Flatten(*d_left_t);
-      d_left.device(dev) = d_out * (1. / (1. + (right - left).exp()) - label);
-    }
-    // compute d_right
-    if (d_right_t) {
-      d_right_t->mutable_data<T>(ctx.GetPlace());
-      auto d_right = framework::EigenVector<T>::Flatten(*d_right_t);
-      d_right.device(dev) =
-          -d_out * (1.0 / (1. + (right - left).exp()) - label);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
deleted file mode 100644
index f61af3332911b6115853ed8d382b3ca35161d5b8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-include(operators)
-
-cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader)
-set(LOCAL_READER_LIBS)
-
-function(reader_library TARGET_NAME)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    set(options "")
-    set(common_deps reader_op_registry)
-    cmake_parse_arguments(reader_library "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
-    op_library(${TARGET_NAME} SRCS ${reader_library_SRCS} DEPS ${common_deps} ${reader_library_DEPS})
-    set(LOCAL_READER_LIBS
-            ${TARGET_NAME}
-            ${LOCAL_READER_LIBS}
-        PARENT_SCOPE)
-endfunction()
-
-cc_library(py_reader SRCS py_reader.cc DEPS reader)
-cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
-
-reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
-reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader)
-
-cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
-# Export local libraries to parent
-# set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
-
-op_library(read_op DEPS py_reader buffered_reader)
-
-foreach(src ${LOCAL_READER_LIBS})
-    set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
-endforeach()
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
deleted file mode 100644
index b23105916bcef4759c5a212ef019e33e21f2a1b7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ /dev/null
@@ -1,133 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <condition_variable>  // NOLINT
-#include <deque>
-#include <utility>
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-template <typename T>
-class BlockingQueue {
-  // BlockingQueue is for buffered reading and is supposed to use only the
-  // reader package. It is true that we could and we should have been using
-  // framework::Channel, but which has currently a deadlock bug. BlockingQueue
-  // is a workaround and a simplified version of framework::Channel as it
-  // doesn't support GPU and it implements on buffered blocking queue.
- public:
-  explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
-      : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
-    PADDLE_ENFORCE_GT(
-        capacity_, static_cast<size_t>(0),
-        "The capacity of a reader::BlockingQueue must be greater than 0.");
-  }
-
-  bool Send(const T& elem) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
-    if (closed_) {
-      VLOG(5)
-          << "WARNING: Sending an element to a closed reader::BlokcingQueue.";
-      return false;
-    }
-    PADDLE_ENFORCE_LT(queue_.size(), capacity_);
-    queue_.push_back(elem);
-    receive_cv_.notify_one();
-    return true;
-  }
-
-  bool Send(T&& elem) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
-    if (closed_) {
-      VLOG(5)
-          << "WARNING: Sending an element to a closed reader::BlokcingQueue.";
-      return false;
-    }
-    PADDLE_ENFORCE_LT(queue_.size(), capacity_);
-    queue_.emplace_back(std::move(elem));
-    receive_cv_.notify_one();
-    return true;
-  }
-
-  bool Receive(T* elem) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; });
-    if (!queue_.empty()) {
-      PADDLE_ENFORCE_NOT_NULL(elem);
-      *elem = queue_.front();
-      if (LIKELY(!speed_test_mode_)) {
-        queue_.pop_front();
-      }
-      send_cv_.notify_one();
-      return true;
-    } else {
-      PADDLE_ENFORCE(closed_);
-      VLOG(3) << "queue is closed! return nothing.";
-      return false;
-    }
-  }
-
-  void ReOpen() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    VLOG(1) << "reopen queue";
-    closed_ = false;
-    std::deque<T> new_deque;
-    queue_.swap(new_deque);
-    send_cv_.notify_all();
-    receive_cv_.notify_all();
-  }
-
-  void Close() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    VLOG(1) << "close queue";
-    closed_ = true;
-    send_cv_.notify_all();
-    receive_cv_.notify_all();
-  }
-
-  bool IsClosed() const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return closed_;
-  }
-
-  size_t Cap() const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return capacity_;
-  }
-
-  size_t Size() const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return queue_.size();
-  }
-
- private:
-  size_t capacity_;
-  bool speed_test_mode_;
-  bool closed_;
-  std::deque<T> queue_;
-
-  mutable std::mutex mutex_;
-  mutable std::condition_variable receive_cv_;
-  mutable std::condition_variable send_cv_;
-};
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
deleted file mode 100644
index b332450c252e0c5799591fd1a8f23009685be5ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reader/buffered_reader.h"
-#include <memory>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-
-#include "paddle/fluid/platform/profiler.h"
-namespace paddle {
-namespace operators {
-namespace reader {
-BufferedReader::~BufferedReader() {
-  VLOG(1) << "~BufferedReader";
-  reader_->Shutdown();
-  while (!position_.empty()) {
-    position_.front().wait();
-    position_.pop();
-  }
-#ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(place_)) {
-    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
-    for (auto &event : events_) {
-      PADDLE_ENFORCE(cudaEventDestroy(event));
-    }
-  }
-#endif
-}
-
-BufferedReader::BufferedReader(
-    const std::shared_ptr<framework::ReaderBase> &reader,
-    const platform::Place &place, size_t buffer_size)
-    : framework::DecoratedReader(reader),
-      thread_pool_(1),
-      place_(place),
-      buffer_size_(buffer_size) {
-  VLOG(1) << "BufferedReader";
-#ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(place_)) {
-    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    compute_stream_ =
-        ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
-                                             .Get(place_)))
-            ->stream();
-    events_.resize(buffer_size);
-    for (auto &event : events_) {
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-    }
-    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
-  }
-#endif
-  cpu_buffer_.resize(buffer_size);
-  gpu_buffer_.resize(buffer_size);
-  ReadTillBufferFullAsync();
-}
-
-void BufferedReader::ReadTillBufferFullAsync() {
-  PADDLE_ENFORCE_EQ(position_.size(), 0U);
-  for (size_t i = 0; i < buffer_size_; ++i) {
-    ReadAsync(i);
-  }
-}
-
-void BufferedReader::ReadAsync(size_t i) {
-  position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
-    TensorVec &cpu = cpu_buffer_[i];
-    reader_->ReadNext(&cpu);
-
-    if (cpu.empty()) {
-      return -1UL;
-    }
-
-#ifdef PADDLE_WITH_CUDA
-    // NOTE(liangdun): using async copy instead of TensorCopySync
-    // TensorCopySync would block other stream, because TensorCopySync
-    // issues the copying command to the default stream, it will make two
-    // commands from different streams cannot run concurrently.
-    if (platform::is_gpu_place(place_)) {
-      TensorVec &gpu = gpu_buffer_[i];
-      if (gpu.empty()) {
-        gpu.resize(cpu.size());
-      } else {
-        PADDLE_ENFORCE_EQ(gpu.size(), cpu.size(),
-                          "Input tensor number not matched");
-      }
-
-      std::vector<void *> gpu_ptrs;
-      gpu_ptrs.reserve(cpu.size());
-      for (size_t i = 0; i < cpu.size(); ++i) {
-        gpu[i].Resize(cpu[i].dims());
-        gpu[i].set_layout(cpu[i].layout());
-        gpu_ptrs.emplace_back(gpu[i].mutable_data(place_, cpu[i].type()));
-      }
-
-      // NOTE(zjl): cudaStreamWaitEvent() must be called after all
-      // gpu[i].mutable_data() is called, since some ops release
-      // gpu memory immediately without waiting gpu kernel ends
-      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-      PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
-      PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
-
-      platform::RecordEvent record_event("BufferedReader:MemoryCopy");
-      for (size_t i = 0; i < cpu.size(); ++i) {
-        auto cpu_place = cpu[i].place();
-        auto cpu_ptr = cpu[i].data<void>();
-        auto gpu_ptr = gpu_ptrs[i];
-        auto size =
-            cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
-        if (platform::is_cuda_pinned_place(cpu_place)) {
-          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
-                       boost::get<platform::CUDAPinnedPlace>(cpu_place),
-                       cpu_ptr, size, stream_);
-        } else if ((platform::is_gpu_place(cpu_place))) {
-          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
-                       boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
-                       size, stream_);
-        } else {
-          platform::CUDAPinnedPlace cuda_pinned_place;
-          framework::LoDTensor cuda_pinned_tensor;
-          cuda_pinned_tensor.Resize(cpu[i].dims());
-          auto cuda_pinned_ptr =
-              cuda_pinned_tensor.mutable_data(cuda_pinned_place, cpu[i].type());
-          memory::Copy(cuda_pinned_place, cuda_pinned_ptr,
-                       boost::get<platform::CPUPlace>(cpu_place), cpu_ptr,
-                       size);
-          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
-                       cuda_pinned_place, cuda_pinned_ptr, size, stream_);
-          PADDLE_ENFORCE(cudaStreamSynchronize(stream_),
-                         "cuda stream sync error.");
-        }
-        gpu[i].set_lod(cpu[i].lod());
-      }
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-    }
-#endif
-    return i;
-  }));
-}
-
-void BufferedReader::ShutdownImpl() {
-  VLOG(1) << "ShutdownImpl";
-  reader_->Shutdown();
-  while (!position_.empty()) {
-    position_.pop();
-  }
-  prev_pos_ = -1UL;
-}
-
-void BufferedReader::StartImpl() {
-  reader_->Start();
-  ReadTillBufferFullAsync();
-}
-
-void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
-  if (position_.empty()) {
-    out->clear();
-    return;
-  }
-  size_t i = position_.front().get();
-  position_.pop();
-
-  if (i == -1UL) {
-    ReadNextImpl(out);
-    return;
-  }
-
-  *out = std::move(platform::is_gpu_place(place_) ? gpu_buffer_[i]
-                                                  : cpu_buffer_[i]);
-
-  // Do not push current position into ReadAsync. Push the previous position
-  // Since all computation in fluid are async, change the data of
-  // current position may cause data error.
-  if (prev_pos_ != -1Ul) {
-    ReadAsync(prev_pos_);
-  }
-  prev_pos_ = i;
-}
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
deleted file mode 100644
index 5f8b2d47c22d0a15d53c8d30d39608fd64d4bddd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <list>
-#include <memory>
-#include <queue>
-#include <vector>
-#include "ThreadPool.h"
-#include "paddle/fluid/framework/reader.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/gpu_info.h"
-#endif
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-class BufferedReader : public framework::DecoratedReader {
-  using TensorVec = std::vector<framework::LoDTensor>;
-  using VecFuture = std::future<TensorVec>;
-
- public:
-  BufferedReader(const std::shared_ptr<framework::ReaderBase>& reader,
-                 const platform::Place& place, size_t buffer_size);
-
-  ~BufferedReader() override;
-
- private:
-  void ReadTillBufferFullAsync();
-
-  void ReadAsync(size_t i);
-
- protected:
-  void ShutdownImpl() override;
-  void StartImpl() override;
-  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
-
- private:
-  ThreadPool thread_pool_;
-  platform::Place place_;
-  const size_t buffer_size_;
-
-  std::queue<std::future<size_t>> position_;
-
-  // The buffer for reading data.
-  // NOTE: the simplest way to implement buffered reader is do not use any
-  // buffer, just read async and create futures as buffer size. However, to
-  // malloc tensors every time is extremely slow. Here we store all data in
-  // buffers and prevent alloc every time.
-  std::vector<TensorVec> cpu_buffer_;
-  std::vector<TensorVec> gpu_buffer_;
-  size_t prev_pos_{-1UL};
-#ifdef PADDLE_WITH_CUDA
-  cudaStream_t stream_;
-  cudaStream_t compute_stream_;
-  std::vector<cudaEvent_t> events_;
-#endif
-};
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
deleted file mode 100644
index 2a3e80c9152b5550631f8c5669283b782f975d4e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reader/ctr_reader.h"
-
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
-#include "paddle/fluid/operators/reader/reader_op_registry.h"
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-class CreateCTRReaderOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    auto* out = scope.FindVar(Output("Out"))
-                    ->template GetMutable<framework::ReaderHolder>();
-    if (out->Get() != nullptr) return;
-
-    const std::string& queue_name = Input("blocking_queue");
-    auto* queue_holder_var = scope.FindVar(queue_name);
-    PADDLE_ENFORCE_NOT_NULL(
-        queue_holder_var,
-        "No LoDTensorBlockingQueueHolder variable with name %s found",
-        queue_name);
-    auto* queue_holder =
-        queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();
-
-    auto thread_num = Attr<int>("thread_num");
-    auto sparse_slots = Attr<std::vector<std::string>>("sparse_slots");
-    auto dense_slot_index = Attr<std::vector<int>>("dense_slot_index");
-    auto sparse_slot_index = Attr<std::vector<int>>("sparse_slot_index");
-    auto batch_size = Attr<int>("batch_size");
-    auto file_type = Attr<std::string>("file_type");
-    auto file_format = Attr<std::string>("file_format");
-    auto file_list = Attr<std::vector<std::string>>("file_list");
-    DataDesc data_desc(batch_size, file_list, file_type, file_format,
-                       dense_slot_index, sparse_slot_index, sparse_slots);
-    VLOG(1) << data_desc;
-    out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(), thread_num,
-                                           data_desc));
-  }
-};
-
-class CreateCTRReaderOpMaker : public FileReaderMakerBase {
- protected:
-  void Apply() override {
-    AddInput("blocking_queue",
-             "Name of the `LoDTensorBlockingQueueHolder` variable");
-    AddAttr<int>("thread_num", "the thread num to read data");
-    AddAttr<int>("batch_size", "the batch size of read data");
-    AddAttr<std::string>("file_type", "plain or gzip").SetDefault("plain");
-    AddAttr<std::string>("file_format", "svm or csv").SetDefault("csv");
-    AddAttr<std::vector<std::string>>("file_list",
-                                      "The list of files that need to read");
-    AddAttr<std::vector<int>>(
-        "dense_slot_index",
-        "the dense slots id that should be extract from file")
-        .SetDefault({});
-    AddAttr<std::vector<int>>(
-        "sparse_slot_index",
-        "the sparse slots id that should be extract from file")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>("sparse_slots",
-                                      "the sparse slots id that should be "
-                                      "extract from file, used when file "
-                                      "format is svm");
-
-    AddComment(R"DOC(
-			Create CTRReader to support read ctr data with cpp.
-      )DOC");
-  }
-};
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
-
-namespace reader = ::paddle::operators::reader;
-
-REGISTER_FILE_READER_OPERATOR(create_ctr_reader, reader::CreateCTRReaderOp,
-                              reader::CreateCTRReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc
deleted file mode 100644
index 975f7b991f80ee292aa7eb02109ab5e518331726..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/reader/reader_op_registry.h"
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-class CustomReader : public framework::DecoratedReader {
- public:
-  CustomReader(const std::shared_ptr<ReaderBase>& reader,
-               const framework::BlockDesc& sub_block,
-               const std::vector<std::string>& source_var_names,
-               const std::vector<std::string>& sink_var_names)
-      : DecoratedReader(reader),
-        program_(*sub_block.Program()),
-        sub_block_id_(sub_block.ID()),
-        exe_(framework::Executor(platform::CPUPlace())),
-        source_var_names_(source_var_names),
-        sink_var_names_(sink_var_names) {}
-
-  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
-
- private:
-  const framework::ProgramDesc program_;
-  int sub_block_id_;
-  framework::Executor exe_;
-  framework::Scope scope_;
-
-  std::vector<std::string> source_var_names_;
-  std::vector<std::string> sink_var_names_;
-};
-
-class CreateCustomReaderOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    auto* out = scope.FindVar(Output("Out"))
-                    ->template GetMutable<framework::ReaderHolder>();
-    auto* sub_block = Attr<framework::BlockDesc*>("sub_block");
-    if (out->Get() != nullptr) {
-      return;
-    }
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
-    out->Reset(framework::MakeDecoratedReader<CustomReader>(
-        underlying_reader, *sub_block,
-        Attr<std::vector<std::string>>("source_var_names"),
-        Attr<std::vector<std::string>>("sink_var_names")));
-  }
-};
-
-class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
- protected:
-  void Apply() override {
-    AddAttr<framework::BlockDesc*>(
-        "sub_block", "The block to hold all preprocessing operators.");
-    AddAttr<std::vector<std::string>>(
-        "source_var_names",
-        "Source variables are starting points of data preprocessing. They hold "
-        "preprocessing's input tensors. Each source variable corresponds to "
-        "one of underlying reader's output datas.");
-    AddAttr<std::vector<std::string>>(
-        "sink_var_names",
-        "Sink variables are ending points of data preprocessing. They hold "
-        "preprocessing's output tensors. Each sink variable corresponds to "
-        "one of custom reader's output datas.");
-    AddComment(R"DOC(
-      CreateCustomReader Operator
-
-      A custom reader can be used for input data preprocessing.
-      A custom reader holds its own sub-block, which will be executed in CPU
-      in its 'ReadNext()' function. Users can configurate their own
-      preprocessing pipelines by inserting operators into custom reader's
-      sub-block.
-    )DOC");
-  }
-};
-
-class CustomReaderInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(!ctx->IsRuntime(),
-                   "'CustomReaderInferShape' should only be invoked during "
-                   "compile time.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "The output decorated reader should not be null.");
-    const auto* sub_block =
-        ctx->Attrs().Get<framework::BlockDesc*>("sub_block");
-    const auto sink_var_names =
-        ctx->Attrs().Get<std::vector<std::string>>("sink_var_names");
-    std::vector<std::vector<int64_t>> res_dims;
-    std::vector<int32_t> res_lod_levels;
-    for (const std::string& var_name : sink_var_names) {
-      auto* sink_var = sub_block->FindVar(var_name);
-      PADDLE_ENFORCE_NOT_NULL(sink_var);
-      res_dims.emplace_back(sink_var->GetShape());
-      res_lod_levels.push_back(sink_var->GetLoDLevel());
-    }
-    auto* out_reader =
-        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
-    out_reader->SetShapes(res_dims);
-    out_reader->SetLoDLevels(res_lod_levels);
-  }
-};
-
-class CustomReaderInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto& out_var_name = ctx->Output("Out")[0];
-    PADDLE_ENFORCE(ctx->HasVar(out_var_name));
-    ctx->SetType(out_var_name, framework::proto::VarType::READER);
-
-    auto sink_var_names =
-        boost::get<std::vector<std::string>>(ctx->GetAttr("sink_var_names"));
-    const auto* sub_block =
-        boost::get<framework::BlockDesc*>(ctx->GetAttr("sub_block"));
-    std::vector<framework::proto::VarType::Type> res_data_types;
-    for (const std::string& var_name : sink_var_names) {
-      framework::VarDesc* var = sub_block->FindVar(var_name);
-      PADDLE_ENFORCE_NOT_NULL(var);
-      res_data_types.emplace_back(var->GetDataType());
-    }
-    ctx->SetDataTypes(out_var_name, res_data_types);
-  }
-};
-
-void CustomReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
-  out->clear();
-  std::vector<framework::LoDTensor> underlying_outs;
-  reader_->ReadNext(&underlying_outs);
-  if (underlying_outs.empty()) {
-    // There is not next data.
-    return;
-  }
-  PADDLE_ENFORCE(source_var_names_.size() == underlying_outs.size(),
-                 "The size of source_var_names(%d) and the size of "
-                 "underlying_outs(%d) are not consistent. Each feeding element "
-                 "must have its own source variable.",
-                 source_var_names_.size(), underlying_outs.size());
-  // The scope for CustomReader's sub-block should be independent and shouldn't
-  // be any other computation scope's child. Otherwise, data preprocessing and
-  // compution cannot be concurrent.
-  framework::Scope* exe_scope = &scope_.NewScope();
-  // 1. Copy LoDTensors from underlying reader's output to source variables.
-  for (size_t i = 0; i < source_var_names_.size(); ++i) {
-    framework::Variable* var = exe_scope->Var(source_var_names_[i]);
-    framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
-    tensor->ShareDataWith(underlying_outs[i]);
-    tensor->set_lod(underlying_outs[i].lod());
-  }
-  // 2. Run the sub-block.
-  exe_.Run(program_, exe_scope, sub_block_id_, false, true, {}, true);
-  // 3. Copy LoDTensors from sink variables to out.
-  out->resize(sink_var_names_.size());
-  for (size_t i = 0; i < sink_var_names_.size(); ++i) {
-    const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i]))
-                             .Get<framework::LoDTensor>();
-    framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
-  }
-  scope_.DeleteScope(exe_scope);
-}
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators::reader;
-REGISTER_OPERATOR(create_custom_reader, ops::CreateCustomReaderOp,
-                  ops::CreateCustomReaderOpMaker, ops::CustomReaderInferShape,
-                  ops::CustomReaderInferVarType,
-                  paddle::framework::EmptyGradOpMaker)
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
deleted file mode 100644
index ed719f91d0980480aa62a5cd3c1f819e6c0e7475..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reader/buffered_reader.h"
-#include "paddle/fluid/operators/reader/reader_op_registry.h"
-
-namespace paddle {
-namespace operators {
-namespace reader {
-class CreateDoubleBufferReaderOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    auto* out = scope.FindVar(Output("Out"))
-                    ->template GetMutable<framework::ReaderHolder>();
-    if (out->Get() != nullptr) {
-      return;
-    }
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
-
-    auto place_str = Attr<std::string>("place");
-    platform::Place place;
-    if (place_str == "AUTO") {
-      place = dev_place;
-    } else if (place_str == "CPU") {
-      place = platform::CPUPlace();
-    } else {
-      std::istringstream sin(place_str);
-      sin.seekg(std::string("CUDA:").size(), std::ios::beg);
-      size_t num;
-      sin >> num;
-      place = platform::CUDAPlace(static_cast<int>(num));
-    }
-
-    out->Reset(framework::MakeDecoratedReader<BufferedReader>(underlying_reader,
-                                                              place, 2));
-  }
-};
-
-class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
- protected:
-  void Apply() override {
-    AddComment(R"DOC(
-      CreateDoubleBufferReader Operator
-
-      A double buffer reader takes another reader as its 'underlying reader'.
-      It launches another thread to execute the 'underlying reader' asynchronously,
-      which prevents reading process from blocking subsequent training.
-    )DOC");
-    std::unordered_set<std::string> enum_range;
-    constexpr size_t kMaxCUDADevs = 128;
-    for (size_t i = 0; i < kMaxCUDADevs; ++i) {
-      enum_range.insert(string::Sprintf("CUDA:%d", i));
-    }
-    enum_range.insert("CPU");
-    enum_range.insert("AUTO");
-    AddAttr<std::string>("place", "The double buffer place")
-        .SetDefault("AUTO")
-        .InEnum({enum_range});
-  }
-};
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators::reader;
-REGISTER_DECORATED_READER_OPERATOR(create_double_buffer_reader,
-                                   ops::CreateDoubleBufferReaderOp,
-                                   ops::CreateDoubleBufferReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc
deleted file mode 100644
index 4a6581bbbd00019db33896371adac6d4e420e48c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reader/py_reader.h"
-#include "paddle/fluid/operators/reader/reader_op_registry.h"
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-class CreatePyReaderOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    auto* out = scope.FindVar(Output("Out"))
-                    ->template GetMutable<framework::ReaderHolder>();
-    if (out->Get() != nullptr) return;
-
-    const std::string& queue_name = Input("blocking_queue");
-    auto* queue_holder_var = scope.FindVar(queue_name);
-    PADDLE_ENFORCE_NOT_NULL(
-        queue_holder_var,
-        "No LoDTensorBlockingQueueHolder variable with name %s found",
-        queue_name);
-    auto* queue_holder =
-        queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();
-
-    out->Reset(std::make_shared<PyReader>(queue_holder->GetQueue()));
-  }
-};
-
-class CreatePyReaderOpMaker : public FileReaderMakerBase {
- protected:
-  void Apply() override {
-    AddInput("blocking_queue",
-             "Name of the `LoDTensorBlockingQueueHolder` variable");
-
-    AddComment(R"DOC(
-      Create PyReader to support LoDTensor data feeding in Python side.
-      )DOC");
-  }
-};
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
-
-namespace reader = ::paddle::operators::reader;
-
-REGISTER_FILE_READER_OPERATOR(create_py_reader, reader::CreatePyReaderOp,
-                              reader::CreatePyReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
deleted file mode 100644
index be044085f1435089b3fb736df684358136ea7c10..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+++ /dev/null
@@ -1,91 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/reader/blocking_queue.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-class LoDTensorBlockingQueueHolder;
-
-class LoDTensorBlockingQueue {
-  friend class LoDTensorBlockingQueueHolder;
-
- private:
-  explicit LoDTensorBlockingQueue(size_t capacity, bool speed_test_mode = false)
-      : queue_(capacity, speed_test_mode) {}
-
- public:
-  bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
-    return queue_.Send(lod_tensor_vec);
-  }
-
-  bool Push(std::vector<framework::LoDTensor>&& lod_tensor_vec) {
-    return queue_.Send(std::move(lod_tensor_vec));
-  }
-
-  std::vector<framework::LoDTensor> Pop(bool* ok = nullptr) {
-    std::vector<framework::LoDTensor> lod_tensor_vec;
-    bool success = queue_.Receive(&lod_tensor_vec);
-    if (ok != nullptr) *ok = success;
-    return lod_tensor_vec;
-  }
-
-  inline size_t Cap() const { return queue_.Cap(); }
-
-  inline size_t Size() const { return queue_.Size(); }
-
-  inline void ReOpen() { queue_.ReOpen(); }
-
-  inline void Close() {
-    VLOG(1) << "LoDTensorBlockingQueue close";
-    queue_.Close();
-  }
-
-  inline bool IsClosed() const { return queue_.IsClosed(); }
-
- private:
-  BlockingQueue<std::vector<framework::LoDTensor>> queue_;
-};
-
-class LoDTensorBlockingQueueHolder {
- public:
-  void InitOnce(size_t capacity, bool speed_test_mode = false) {
-    PADDLE_ENFORCE(
-        queue_ == nullptr,
-        "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
-    queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode));
-  }
-
-  inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
-    return queue_;
-  }
-
- private:
-  std::shared_ptr<LoDTensorBlockingQueue> queue_;
-};
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc
deleted file mode 100644
index 155ae859defcf20a5e226a4abfb99dc308dfb23c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/py_reader.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reader/py_reader.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-PyReader::PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
-    : framework::FileReader() {
-  PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
-  queue_ = queue;
-}
-
-void PyReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  bool success;
-  *out = queue_->Pop(&success);
-  if (!success) out->clear();
-}
-
-PyReader::~PyReader() { queue_->Close(); }
-
-void PyReader::Shutdown() { queue_->Close(); }
-
-void PyReader::Start() { queue_->ReOpen(); }
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/py_reader.h b/paddle/fluid/operators/reader/py_reader.h
deleted file mode 100644
index 43079075142e8db22c0e3b7c86de4249d447f961..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/py_reader.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <atomic>
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-class PyReader : public framework::FileReader {
- public:
-  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
-
-  void ReadNext(std::vector<framework::LoDTensor>* out) override;
-
-  ~PyReader();
-
-  void Shutdown() override;
-
-  void Start() override;
-
- private:
-  std::shared_ptr<LoDTensorBlockingQueue> queue_;
-};
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc
deleted file mode 100644
index 5fa6bf73aa839ffcd307a15eb4fc8dd5a2ea16af..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/read_op.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace operators {
-
-class ReadInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Reader"),
-                   "The ReadOp must take a reader as input.");
-    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
-                   "The ReadOp should be assigned with output.");
-    if (!ctx->IsRuntime() && ctx->Attrs().Get<bool>("infer_out")) {
-      std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
-      std::vector<std::string> out_names = ctx->Outputs("Out");
-      PADDLE_ENFORCE_EQ(
-          reader_dims.size(), out_names.size(),
-          "The reader's dim number doesn't match the output number.");
-      ctx->SetOutputsDim("Out", reader_dims);
-      auto in_desc =
-          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Reader")[0]);
-      auto in_lod_levels = in_desc->GetLoDLevels();
-      auto out_var_ptrs = ctx->GetOutputVarPtrs("Out");
-      PADDLE_ENFORCE_EQ(in_lod_levels.size(), out_var_ptrs.size(),
-                        "LoDLevels of Input(Reader) must be the same as the "
-                        "number of Outputs(Out).");
-      for (size_t i = 0; i < out_var_ptrs.size(); ++i) {
-        auto* out_desc = boost::get<framework::VarDesc*>(out_var_ptrs[i]);
-        out_desc->SetLoDLevel(in_lod_levels[i]);
-      }
-    }
-  }
-};
-
-class ReadInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    bool infer_out = boost::get<bool>(ctx->GetAttr("infer_out"));
-    if (infer_out) {
-      std::string reader_name = ctx->Input("Reader")[0];
-      std::vector<std::string> out_names = ctx->Output("Out");
-      auto dtypes = ctx->GetDataTypes(reader_name);
-      PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
-      for (size_t i = 0; i < dtypes.size(); ++i) {
-        ctx->SetType(out_names[i], framework::proto::VarType::LOD_TENSOR);
-        ctx->SetDataType(out_names[i], dtypes[i]);
-      }
-    }
-  }
-};
-
-class ReadOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    VLOG(3) << "read op in";
-    framework::ReaderHolder* reader =
-        detail::Ref(scope.FindVar(Input("Reader")),
-                    "Cannot find reader variable %s", Input("Reader"))
-            .GetMutable<framework::ReaderHolder>();
-    std::vector<std::string> out_arg_names = Outputs("Out");
-    std::vector<framework::LoDTensor> ins;
-
-    // For profiling
-    platform::RecordEvent record_event(Type());
-
-    reader->ReadNext(&ins);
-    if (ins.empty()) {
-      VLOG(3) << "throw_eof_exp";
-      PADDLE_THROW_EOF();
-    }
-    PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
-    for (size_t i = 0; i < out_arg_names.size(); ++i) {
-      auto* out =
-          scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>();
-      out->ShareDataWith(ins[i]);
-      out->set_lod(ins[i].lod());
-    }
-  }
-};
-
-class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Reader", "(ReaderHolder) The executed reader.");
-    AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
-    AddAttr<bool>(
-        "throw_eof_exp",
-        "If set true, an exception will be thrown when the Reader "
-        "yields empty (which means there is no next data).\n"
-        "NOTES: This flag must be true always. It will be set to false"
-        " only when the data-balance is enabled in ParallelExecutor"
-        " and it is set by ParallelExecutor instance, not users.")
-        .SetDefault(true);
-    AddAttr<bool>("infer_out", "").SetDefault(true);
-    AddComment(R"DOC(
-      Read Operator
-
-      Execute a given reader once and output data.
-    )DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(read, ops::ReadOp, ops::ReadInferShape, ops::ReadOpMaker,
-                  paddle::framework::EmptyGradOpMaker, ops::ReadInferVarType);
diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
deleted file mode 100644
index dc0940ac0b78d295b5088cb6ae26300da1dc883d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <chrono>  // NOLINT
-#include <set>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/operators/reader/blocking_queue.h"
-
-using paddle::operators::reader::BlockingQueue;
-
-TEST(BlockingQueue, CapacityTest) {
-  size_t cap = 10;
-  BlockingQueue<int> q(cap);
-  EXPECT_EQ(q.Cap(), cap);
-}
-
-void FirstInFirstOut(size_t queue_cap, size_t elem_num, size_t send_time_gap,
-                     size_t receive_time_gap) {
-  BlockingQueue<size_t> q(queue_cap);
-  std::thread sender([&]() {
-    for (size_t i = 0; i < elem_num; ++i) {
-      std::this_thread::sleep_for(std::chrono::milliseconds(send_time_gap));
-      EXPECT_TRUE(q.Send(i));
-    }
-    q.Close();
-  });
-  size_t count = 0;
-  while (true) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(receive_time_gap));
-    size_t elem;
-    if (!q.Receive(&elem)) {
-      break;
-    }
-    EXPECT_EQ(elem, count++);
-  }
-  sender.join();
-  EXPECT_EQ(count, elem_num);
-  EXPECT_TRUE(q.IsClosed());
-}
-
-TEST(BlockingQueue, FirstInFirstOutTest) {
-  FirstInFirstOut(2, 5, 2, 50);
-  FirstInFirstOut(2, 5, 50, 2);
-  FirstInFirstOut(10, 3, 50, 2);
-  FirstInFirstOut(10, 3, 2, 50);
-}
-
-TEST(BlockingQueue, SenderBlockingTest) {
-  const size_t queue_cap = 2;
-  BlockingQueue<size_t> q(queue_cap);
-  size_t send_count = 0;
-  std::thread sender([&]() {
-    for (size_t i = 0; i < 5; ++i) {
-      if (!q.Send(i)) {
-        break;
-      }
-      ++send_count;
-    }
-  });
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));
-  q.Close();
-  sender.join();
-  EXPECT_EQ(send_count, queue_cap);
-  std::vector<size_t> res;
-  while (true) {
-    size_t elem;
-    if (!q.Receive(&elem)) {
-      break;
-    }
-    res.push_back(elem);
-  }
-  EXPECT_EQ(res.size(), queue_cap);
-  for (size_t i = 0; i < res.size(); ++i) {
-    EXPECT_EQ(res[i], i);
-  }
-}
-
-TEST(BlockingQueue, ReceiverBlockingTest) {
-  const size_t queue_cap = 5;
-  BlockingQueue<size_t> q(queue_cap);
-  std::vector<size_t> receive_res;
-  std::thread receiver([&]() {
-    size_t elem;
-    while (true) {
-      if (!q.Receive(&elem)) {
-        break;
-      }
-      receive_res.push_back(elem);
-    }
-  });
-  std::vector<size_t> to_send{2, 1, 7};
-  for (auto e : to_send) {
-    q.Send(e);
-  }
-  q.Close();
-  receiver.join();
-  EXPECT_EQ(receive_res.size(), to_send.size());
-  for (size_t i = 0; i < to_send.size(); ++i) {
-    EXPECT_EQ(receive_res[i], to_send[i]);
-  }
-}
-
-void CheckIsUnorderedSame(const std::vector<std::vector<size_t>>& v1,
-                          const std::vector<std::vector<size_t>>& v2) {
-  std::set<size_t> s1;
-  std::set<size_t> s2;
-  for (auto vec : v1) {
-    for (size_t elem : vec) {
-      s1.insert(elem);
-    }
-  }
-  for (auto vec : v2) {
-    for (size_t elem : vec) {
-      s2.insert(elem);
-    }
-  }
-  EXPECT_EQ(s1.size(), s2.size());
-  auto it1 = s1.begin();
-  auto it2 = s2.begin();
-  while (it1 != s1.end()) {
-    EXPECT_EQ(*it1, *it2);
-    ++it1;
-    ++it2;
-  }
-}
-
-void MultiSenderMultiReceiver(const size_t queue_cap,
-                              const std::vector<std::vector<size_t>>& to_send,
-                              size_t receiver_num, size_t send_time_gap,
-                              size_t receive_time_gap) {
-  BlockingQueue<size_t> q(queue_cap);
-  size_t sender_num = to_send.size();
-  std::vector<std::thread> senders;
-  for (size_t s_idx = 0; s_idx < sender_num; ++s_idx) {
-    senders.emplace_back(std::thread([&, s_idx] {
-      for (size_t elem : to_send[s_idx]) {
-        std::this_thread::sleep_for(std::chrono::milliseconds(send_time_gap));
-        EXPECT_TRUE(q.Send(elem));
-      }
-    }));
-  }
-  std::vector<std::thread> receivers;
-  std::mutex mu;
-  std::vector<std::vector<size_t>> res;
-  for (size_t r_idx = 0; r_idx < receiver_num; ++r_idx) {
-    receivers.emplace_back(std::thread([&] {
-      std::vector<size_t> receiver_res;
-      while (true) {
-        std::this_thread::sleep_for(
-            std::chrono::milliseconds(receive_time_gap));
-        size_t elem;
-        if (!q.Receive(&elem)) {
-          break;
-        }
-        receiver_res.push_back(elem);
-      }
-      std::lock_guard<std::mutex> lock(mu);
-      res.push_back(receiver_res);
-    }));
-  }
-  for (auto& t : senders) {
-    t.join();
-  }
-  q.Close();
-  for (auto& t : receivers) {
-    t.join();
-  }
-  CheckIsUnorderedSame(to_send, res);
-}
-
-TEST(BlockingQueue, MultiSenderMultiReaderTest) {
-  std::vector<std::vector<size_t>> to_send_1{{2, 3, 4}, {9}, {0, 7, 15, 6}};
-  MultiSenderMultiReceiver(2, to_send_1, 2, 0, 0);
-  MultiSenderMultiReceiver(10, to_send_1, 2, 0, 0);
-  MultiSenderMultiReceiver(2, to_send_1, 20, 0, 0);
-  MultiSenderMultiReceiver(2, to_send_1, 2, 50, 0);
-  MultiSenderMultiReceiver(2, to_send_1, 2, 0, 50);
-
-  std::vector<std::vector<size_t>> to_send_2{
-      {2, 3, 4}, {}, {0, 7, 15, 6, 9, 32}};
-  MultiSenderMultiReceiver(2, to_send_2, 3, 0, 0);
-  MultiSenderMultiReceiver(20, to_send_2, 3, 0, 0);
-  MultiSenderMultiReceiver(2, to_send_2, 30, 0, 0);
-  MultiSenderMultiReceiver(2, to_send_2, 3, 50, 0);
-  MultiSenderMultiReceiver(2, to_send_2, 3, 0, 50);
-}
-
-struct MyClass {
-  MyClass() : val_(0) {}
-  explicit MyClass(int val) : val_(val) {}
-  MyClass(const MyClass& b) { val_ = b.val_; }
-  MyClass(MyClass&& b) { val_ = b.val_; }
-  void operator=(const MyClass& b) { val_ = b.val_; }
-
-  int val_;
-};
-
-TEST(BlockingQueue, MyClassTest) {
-  BlockingQueue<MyClass> q(2);
-  MyClass a(200);
-  q.Send(std::move(a));
-  MyClass b;
-  q.Receive(&b);
-  EXPECT_EQ(a.val_, b.val_);
-}
-
-TEST(BlockingQueue, speed_test_mode) {
-  size_t queue_size = 10;
-  BlockingQueue<size_t> q1(queue_size, false);
-  for (size_t i = 0; i < queue_size; ++i) {
-    q1.Send(i);
-  }
-  size_t b;
-  for (size_t i = 0; i < queue_size; ++i) {
-    q1.Receive(&b);
-    EXPECT_EQ(b, i);
-  }
-  EXPECT_EQ(q1.Size(), 0UL);
-
-  BlockingQueue<size_t> q2(queue_size, true);
-  for (size_t i = 0; i < queue_size; ++i) {
-    q2.Send(i);
-  }
-  for (size_t i = 0; i < queue_size; ++i) {
-    q2.Receive(&b);
-    EXPECT_EQ(b, 0UL);
-  }
-  EXPECT_EQ(q2.Size(), queue_size);
-}
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
deleted file mode 100644
index 6a9506b5cd91b893540e07302d7305e11774ca74..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reader/reader_op_registry.h"
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-std::vector<framework::DDim> RestoreShapes(const std::vector<int>& shape_concat,
-                                           const std::vector<int>& ranks) {
-  std::vector<framework::DDim> res;
-  int offset = 0;
-  for (int len : ranks) {
-    auto start_it = shape_concat.begin() + offset;
-    auto end_it = start_it + len;
-    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
-    offset += len;
-  }
-  return res;
-}
-
-std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry() {
-  static std::unordered_map<std::string, FileReaderCreator> regs;
-  return regs;
-}
-
-void FileReaderMakerBase::Make() {
-  AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable();
-  AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
-  AddAttr<std::vector<int>>(
-      "ranks",
-      "The ranks of each data."
-      "e.g."
-      "shape_concat = [2,3,4,5,6]"
-      "ranks = [3,2]"
-      "It means the reader will generate two data each time,"
-      "whose shapes are [2,3,4] and [5,6] respectively.");
-  AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
-  AddAttr<bool>(
-      "use_data_config",
-      "Use the config of all datas like shape_concat/ranks/lod_levels")
-      .SetDefault(true);
-  Apply();
-}
-
-void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(
-      !ctx->IsRuntime(),
-      "'FileReaderInferShape' should only be invoked during compile time.");
-
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "The output file reader should not be null.");
-  bool use_data_config = ctx->Attrs().Get<bool>("use_data_config");
-  if (use_data_config) {
-    const auto shape_concat =
-        ctx->Attrs().Get<std::vector<int>>("shape_concat");
-    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
-    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
-    ctx->SetReaderDims("Out", shapes);
-
-    const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
-    PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
-                      "The number of 'lod_levels'(%d) doesn't match the number "
-                      "of 'shapes'(%d).",
-                      lod_levels.size(), shapes.size());
-    framework::VarDesc* reader =
-        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
-    reader->SetLoDLevels(lod_levels);
-  }
-}
-
-void FileReaderInferVarType::operator()(
-    framework::InferVarTypeContext* ctx) const {
-  std::string reader_name = ctx->Output("Out")[0];
-  ctx->SetType(reader_name, framework::proto::VarType::READER);
-}
-
-void DecoratedReaderInferShape::operator()(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(!ctx->IsRuntime(),
-                 "'DecoratedReaderInferShape' should only be invoked during "
-                 "compile time.");
-
-  PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
-                 "Input(UnderlyingReader) should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "The output decorated reader should not be null.");
-  ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
-
-  framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
-      ctx->GetInputVarPtrs("UnderlyingReader")[0]);
-  framework::VarDesc* out_reader =
-      boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
-  out_reader->SetLoDLevels(in_reader->GetLoDLevels());
-}
-
-void DecoratedReaderInferVarType::operator()(
-    framework::InferVarTypeContext* ctx) const {
-  const std::string& in_reader_name = ctx->Input("UnderlyingReader")[0];
-  const std::string& out_reader_name = ctx->Output("Out")[0];
-  ctx->SetType(out_reader_name, framework::proto::VarType::READER);
-  ctx->SetDataTypes(out_reader_name, ctx->GetDataTypes(in_reader_name));
-}
-
-void DecoratedReaderMakerBase::Make() {
-  AddInput("UnderlyingReader",
-           "(ReaderHolder) The underlying reader for creating a batch reader.");
-  AddOutput("Out", "(ReaderHolder) The created batch reader.");
-  Apply();
-}
-
-}  // namespace reader
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
deleted file mode 100644
index de0c34ad32e226cacc998767bf824e4a7c8a28ef..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ /dev/null
@@ -1,110 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-static constexpr char kFileFormatSeparator[] = ".";
-
-using FileReaderCreator =
-    std::function<framework::ReaderBase*(const std::string&)>;
-
-std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry();
-
-template <typename Reader>
-int RegisterFileReader(const std::string& filetype) {
-  FileReaderRegistry()[filetype] = [](const std::string& fn) {
-    return new Reader(fn);
-  };
-  return 0;
-}
-
-extern std::vector<framework::DDim> RestoreShapes(
-    const std::vector<int>& shape_concat, const std::vector<int>& ranks);
-
-class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final;
-
- protected:
-  virtual void Apply() = 0;
-};
-
-class FileReaderInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override;
-};
-
-class FileReaderInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override;
-};
-
-// general infershape for decorated reader
-class DecoratedReaderInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override;
-};
-
-// general var type inference for decorated reader
-class DecoratedReaderInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override;
-};
-
-class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final;
-
- protected:
-  virtual void Apply() = 0;
-};
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_FILE_READER_OPERATOR(op_name, ...)                  \
-  REGISTER_OPERATOR(op_name, __VA_ARGS__,                            \
-                    paddle::operators::reader::FileReaderInferShape, \
-                    paddle::framework::EmptyGradOpMaker,             \
-                    paddle::operators::reader::FileReaderInferVarType)
-
-#define REGISTER_DECORATED_READER_OPERATOR(op_name, ...)                  \
-  REGISTER_OPERATOR(op_name, __VA_ARGS__,                                 \
-                    paddle::operators::reader::DecoratedReaderInferShape, \
-                    paddle::framework::EmptyGradOpMaker,                  \
-                    paddle::operators::reader::DecoratedReaderInferVarType)
-
-#define REGISTER_FILE_READER(_filetype, _reader)            \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                           \
-      _reg_file_reader_##_filetype,                         \
-      "Must use REGISTER_FILE_READER in global namespace"); \
-  int TouchFileReader##_filetype() { return 0; }            \
-  int _reg_file_reader_entry_##filetype =                   \
-      paddle::operators::reader::RegisterFileReader<_reader>(#_filetype)
-
-#define USE_FILE_READER(filetype)         \
-  extern int TouchFileReader##filetype(); \
-  static int _use_##filetype = TouchFileReader##filetype()
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
deleted file mode 100644
index 91615a1b43fccaf330b05ffffe06b4bc428737c7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/recurrent_op.cc
+++ /dev/null
@@ -1,670 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/recurrent_op.h"
-
-#include <algorithm>
-#include "paddle/fluid/string/string_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using StepScopeVar = std::vector<framework::Scope *>;
-
-const char RecurrentBase::kInputs[] = "inputs";
-const char RecurrentBase::kInitialStates[] = "initial_states";
-const char RecurrentBase::kParameters[] = "parameters";
-const char RecurrentBase::kOutputs[] = "outputs";
-const char RecurrentBase::kStepScopes[] = "step_scopes";
-const char RecurrentBase::kHasStates[] = "has_states";
-const char RecurrentBase::kExStates[] = "ex_states";
-const char RecurrentBase::kStates[] = "states";
-const char RecurrentBase::kStepBlock[] = "sub_block";
-const char RecurrentBase::kReverse[] = "reverse";
-const char RecurrentBase::kIsTrain[] = "is_train";
-const char RecurrentBase::kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
-#define GRAD_SUFFIX "@GRAD"
-const char RecurrentBase::kInputGrads[] = "inputs" GRAD_SUFFIX;
-const char RecurrentBase::kOutputGrads[] = "outputs" GRAD_SUFFIX;
-const char RecurrentBase::kParamGrads[] = "parameters" GRAD_SUFFIX;
-const char RecurrentBase::kInitStateGrads[] = "initial_states" GRAD_SUFFIX;
-
-static void ClearStepScopes(const platform::DeviceContext &dev_ctx,
-                            framework::Scope *parent_scope,
-                            StepScopeVar *step_scopes) {
-  if (step_scopes->empty()) return;
-
-  dev_ctx.Wait();
-
-  for (auto *sub_scope : *step_scopes) {
-    parent_scope->DeleteScope(sub_scope);
-  }
-
-  step_scopes->clear();
-}
-
-StepScopes::StepScopes(const platform::DeviceContext &dev_ctx,
-                       const framework::Scope &parent, StepScopeVar *scopes,
-                       bool is_train, size_t seq_len, bool is_backward)
-    : counter_(is_backward ? seq_len - 1 : 0UL),
-      scopes_(scopes),
-      is_train_(is_train),
-      is_backward_(is_backward) {
-  size_t num_step_scopes = is_train ? seq_len : 2;
-  PADDLE_ENFORCE_EQ(is_train || !is_backward, true,
-                    "Cannot backward when is not training");
-  if (!is_backward_) {
-    ClearStepScopes(dev_ctx, const_cast<framework::Scope *>(&parent), scopes);
-    scopes->reserve(static_cast<size_t>(num_step_scopes));
-    for (size_t i = 0; i < num_step_scopes; ++i) {
-      scopes->emplace_back(&parent.NewScope());
-    }
-  }
-}
-
-framework::Scope &StepScopes::CurScope() { return GetScope(counter_); }
-
-framework::Scope &StepScopes::ExScope() {
-  auto &scope = GetScope(is_backward_ ? counter_ + 1 : counter_ - 1);
-  return scope;
-}
-
-void StepScopes::BackwardNext(const platform::DeviceContext &dev_ctx,
-                              framework::Scope *parent_scope) {
-  PADDLE_ENFORCE_EQ(is_backward_, true,
-                    "Cannot get backward next scope when is forward");
-  if (counter_ + 2 == scopes_->size()) {
-    parent_scope->DeleteScope((*scopes_)[counter_ + 1]);
-    scopes_->pop_back();
-    VLOG(3) << "Deleted scope at " << counter_ + 1;
-  }
-  --counter_;
-}
-
-void StepScopes::ForwardNext() {
-  PADDLE_ENFORCE_EQ(is_backward_, false,
-                    "Cannot get forward next scope when is backward");
-  ++counter_;
-}
-
-framework::Scope &StepScopes::GetScope(size_t scope_id) const {
-  if (!is_train_) {
-    scope_id %= 2;
-  }
-  PADDLE_ENFORCE_LT(scope_id, scopes_->size());
-  return *(*scopes_)[scope_id];
-}
-
-RecurrentBase::RecurrentBase(const std::string &type,
-                             const framework::VariableNameMap &inputs,
-                             const framework::VariableNameMap &outputs,
-                             const framework::AttributeMap &attrs)
-    : OperatorBase(type, inputs, outputs, attrs) {}
-
-// Get SequenceLength from Scope
-//   The sequence length is got from input tensor. The input tensor's
-//   dimension should be [SEQ_LEN, ..., ...]. The first of the tensor's shape
-//   is SEQ_LEN. The second of the tensor's shape could be the batch size or
-//   nested sequence length.
-int64_t RecurrentBase::GetSequenceLength(const framework::Scope &scope) const {
-  // Dim format SEQ_LEN, BATCH_SIZE, ...
-  int64_t seq_len = -1;
-  auto &all_inputs = Inputs(kInputs);
-  PADDLE_ENFORCE_EQ(all_inputs.empty(), false);
-  for (auto &iname : all_inputs) {
-    auto *var = scope.FindVar(iname);
-    PADDLE_ENFORCE_NOT_NULL(var);
-    PADDLE_ENFORCE_EQ(var->IsType<framework::LoDTensor>(), true);
-    auto &dim = var->Get<framework::LoDTensor>().dims();
-    if (seq_len == -1) {
-      seq_len = dim[0];
-    } else {
-      PADDLE_ENFORCE_EQ(seq_len, dim[0]);
-    }
-  }
-  return seq_len;
-}
-
-// for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
-//                                   map(dst_scope.Var, dst_vars)):
-//   dst_tensor.ShareDataWith(src_tensor)
-void RecurrentBase::LinkTensor(const framework::Scope &src_scope,
-                               const std::vector<std::string> &src_vars,
-                               framework::Scope *dst_scope,
-                               const std::vector<std::string> &dst_vars) {
-  LinkTensorWithCallback(
-      src_scope, src_vars, dst_scope, dst_vars,
-      [&](const framework::Tensor &src, framework::Tensor *dst) {
-        dst->ShareDataWith(src);
-      });
-}
-
-// (seq_len, shape) -> return [seq_len] + list(shape)
-framework::DDim RecurrentBase::PrependDims(size_t seq_len,
-                                           const framework::DDim &src) {
-  auto dims = framework::vectorize(src);
-  dims.insert(dims.begin(), static_cast<int64_t>(seq_len));
-  return framework::make_ddim(dims);
-}
-
-RecurrentOp::RecurrentOp(const std::string &type,
-                         const framework::VariableNameMap &inputs,
-                         const framework::VariableNameMap &outputs,
-                         const framework::AttributeMap &attrs)
-    : RecurrentBase(type, inputs, outputs, attrs) {}
-
-void RecurrentOp::RunImpl(const framework::Scope &scope,
-                          const platform::Place &place) const {
-  bool has_state = Attr<bool>(kHasStates);
-  auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
-
-  // get device context from pool
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(place);
-
-  VLOG(3) << "Static RNN input sequence length = " << seq_len;
-  StepScopes scopes = CreateStepScopes(dev_ctx, scope, seq_len);
-  auto reverse = Attr<bool>(kReverse);
-
-  framework::Executor executor(place);
-  auto *block = Attr<framework::BlockDesc *>(kStepBlock);
-
-  auto *program = block->Program();
-  auto ctx = executor.Prepare(
-      *program, block->ID(), Attr<std::vector<std::string>>(
-                                 kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);
-
-  for (size_t i = 0; i < seq_len; ++i) {
-    size_t seq_offset = reverse ? seq_len - i - 1 : i;
-    VLOG(3) << "Recurrent operate at the time step " << seq_offset;
-
-    auto &cur_scope = scopes.CurScope();
-
-    // Link outside::input --> inside::input
-    //   inside::input = outside::input[seq_offset: seq_offset+1]
-    LinkTensorWithCallback(
-        scope, Inputs(kInputs), &cur_scope, Inputs(kInputs),
-        [&seq_offset](const framework::Tensor &outside,
-                      framework::Tensor *inside) {
-          inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1));
-          auto dims = framework::vectorize(inside->dims());
-          dims.erase(dims.begin());
-          inside->Resize(framework::make_ddim(dims));
-        });
-
-    if (has_state) {
-      if (i == 0) {
-        // Link initial states  --> ex_states
-        LinkTensor(scope, Inputs(kInitialStates), &cur_scope,
-                   Attr<std::vector<std::string>>(kExStates));
-      } else {
-        auto &ex_scope = scopes.ExScope();
-        // Link ex_scope::state --> cur_scope::ex_state
-        LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates),
-                   &cur_scope, Attr<std::vector<std::string>>(kExStates));
-      }
-    }
-
-    // Link inside::output -> outside::output
-    //   outside::output[seq_offset: seq_offset + 1] = inside::output
-    executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
-    if (i > 0) {
-      LinkTensorWithCallback(scope, Outputs(kOutputs), cur_scope,
-                             Outputs(kOutputs),
-                             [&](const framework::LoDTensor &src_tensor,
-                                 framework::LoDTensor *dst_tensor) {
-                               framework::Tensor src_slice =
-                                   src_tensor.Slice(seq_offset, seq_offset + 1);
-                               dst_tensor->ShareDataWith(src_slice);
-                             });
-    }
-
-    // Linked now, execute!
-    executor.RunPreparedContext(ctx.get(), &cur_scope,
-                                false /*create_local_scope*/,
-                                false /*create_vars*/, true /* keep_kids */);
-    if (i == 0) {
-      LinkTensorWithCallback(
-          cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
-          [&](const framework::LoDTensor &src_tensor,
-              framework::LoDTensor *dst_tensor) {
-            // create output tensor at begin
-            dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
-            dst_tensor->mutable_data(place, src_tensor.type());
-
-            auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
-            // Explicit copy output since the local RNN scope can be destroyed
-            // early.
-            framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
-          });
-    }
-
-    scopes.ForwardNext();
-  }
-}
-
-StepScopes RecurrentOp::CreateStepScopes(const platform::DeviceContext &dev_ctx,
-                                         const framework::Scope &scope,
-                                         size_t seq_len) const {
-  auto *var = scope.FindVar(Output(kStepScopes));
-  PADDLE_ENFORCE_NOT_NULL(var);
-  return StepScopes(dev_ctx, scope, var->GetMutable<StepScopeVar>(),
-                    Attr<bool>(kIsTrain), seq_len);
-}
-
-RecurrentGradOp::RecurrentGradOp(const std::string &type,
-                                 const framework::VariableNameMap &inputs,
-                                 const framework::VariableNameMap &outputs,
-                                 const framework::AttributeMap &attrs)
-    : RecurrentBase(type, inputs, outputs, attrs) {}
-
-void RecurrentGradOp::RunImpl(const framework::Scope &scope,
-                              const platform::Place &place) const {
-  bool has_state = Attr<bool>(kHasStates);
-  const size_t seq_len = static_cast<size_t>(GetSequenceLength(scope));
-
-  // get device context from pool
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(place);
-
-  StepScopes scopes = CreateStepScopes(dev_ctx, scope, seq_len);
-  auto reverse = Attr<bool>(kReverse);
-
-  framework::Executor executor(place);
-  auto *block = Attr<framework::BlockDesc *>(kStepBlock);
-  auto *program = block->Program();
-  auto ctx = executor.Prepare(
-      *program, block->ID(), Attr<std::vector<std::string>>(
-                                 kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);
-
-  for (size_t step_id = 0; step_id < seq_len; ++step_id) {
-    size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
-    VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
-    auto &cur_scope = scopes.CurScope();
-
-    // Link outside::output_grads --> inside::output_grads
-    //   inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
-    LinkTensorWithCallback(
-        scope, Inputs(kOutputGrads), &cur_scope, Inputs(kOutputGrads),
-        [&](const framework::Tensor &outside, framework::Tensor *inside) {
-          inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1));
-          auto dims = framework::vectorize(inside->dims());
-          dims.erase(dims.begin());
-          inside->Resize(framework::make_ddim(dims));
-        },
-        true /*is_backward*/);
-    auto og_set = List2Set(Inputs(kOutputGrads));
-
-    if (VLOG_IS_ON(10)) {
-      std::ostringstream sout;
-      std::copy(og_set.begin(), og_set.end(),
-                std::ostream_iterator<std::string>(sout, ","));
-      VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
-    }
-
-    if (has_state) {
-      // Link states
-      //   if cur_scope::cur_state_grad in out_grads:
-      //     cur_scope::cur_state_grad += ex_scope::ex_state_grad
-      //   else:
-      //     ex_scope::ex_state_grad --> cur_scope::cur_state_grad
-      if (step_id != 0) {  // not at beginning
-        auto &ex_scope = scopes.ExScope();
-        auto ex_state_grads =
-            GradVarLists(Attr<std::vector<std::string>>(kExStates));
-        auto cur_state_grads =
-            GradVarLists(Attr<std::vector<std::string>>(kStates));
-
-        PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
-        for (size_t i = 0; i < ex_state_grads.size(); ++i) {
-          auto &cur_grad = cur_state_grads[i];
-          auto &ex_grad = ex_state_grads[i];
-          auto &ex_grad_tensor =
-              ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
-
-          VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
-          auto *cur_grad_var = cur_scope.Var(cur_grad);
-          framework::LoDTensor *cur_grad_tensor =
-              cur_grad_var->GetMutable<framework::LoDTensor>();
-          cur_grad_tensor->ShareDataWith(ex_grad_tensor);
-        }
-      }
-    }
-
-    // Link inside::output -> outside::output
-    //   outside::output[seq_offset: seq_offset + 1] = inside::output
-    executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
-    if (step_id > 0) {
-      LinkTensorWithCallback(scope, Outputs(kInputGrads), cur_scope,
-                             GradVarLists(Inputs(kInputs)),
-                             [&](const framework::LoDTensor &src_tensor,
-                                 framework::LoDTensor *dst_tensor) {
-                               if (src_tensor.memory_size() ==
-                                   0) {  // Inside Gradient is not created.
-                                 return;
-                               }
-                               framework::Tensor src_slice =
-                                   src_tensor.Slice(seq_offset, seq_offset + 1);
-                               dst_tensor->ShareDataWith(src_slice);
-                             },
-                             true /*is_backward*/);
-    }
-
-    VLOG(5) << "Recurrent memory linking finished ";
-    // Run step block with cur_scope
-    executor.RunPreparedContext(ctx.get(), &cur_scope,
-                                false /*create_local_scope*/,
-                                false /*create_vars*/, true /* keep_kids */);
-
-    VLOG(5) << "executor.Run finished ";
-
-    auto local_var_names = LocalVarNames(cur_scope);
-
-    // Accumulate params
-    //   if (step == 0):
-    //      outside::param_grad = 0.0
-    //   outside::param_grad += inside::param_grad
-    {
-      auto &pg_names = Outputs(kParamGrads);
-      auto &p_names = Inputs(kParameters);
-      PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
-
-      for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
-        auto inside_grad_name = framework::GradVarName(p_names[param_id]);
-
-        // If does not compute gradient of that variable inside rnn, just
-        // continue
-        if (local_var_names.find(inside_grad_name) == local_var_names.end()) {
-          continue;
-        }
-
-        // zero gradient variable in step 0
-        if (step_id == 0) {
-          auto &inside_tensor =
-              cur_scope.FindVar(inside_grad_name)->Get<framework::LoDTensor>();
-          framework::AttributeMap attrs;
-          attrs["dtype"] = inside_tensor.type();
-          attrs["shape"] = framework::vectorize<int>(inside_tensor.dims());
-          attrs["value"] = 0.0f;
-
-          auto zero_op = framework::OpRegistry::CreateOp(
-              "fill_constant", framework::VariableNameMap{},
-              {{"Out", {pg_names[param_id]}}}, attrs);
-          zero_op->Run(scope, place);
-        }
-
-        auto new_inside_name = cur_scope.Rename(inside_grad_name);
-
-        // sum gradient
-        auto sum_op = framework::OpRegistry::CreateOp(
-            "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-            {{"Out", {pg_names[param_id]}}},
-            framework::AttributeMap{{"use_mkldnn", {false}}});
-        sum_op->Run(cur_scope, place);
-
-        cur_scope.Rename(new_inside_name, inside_grad_name);
-      }
-    }
-    VLOG(5) << "Accumulate Parameter finished ";
-
-    // Copy input gradient from inside to outside
-    //   outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
-    if (step_id == 0) {
-      LinkTensorWithCallback(
-          cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads),
-          [&](const framework::LoDTensor &inside,
-              framework::LoDTensor *outside) {
-            if (inside.memory_size() == 0) {  // IG is not created.
-              return;
-            }
-            // Alloc outside memory
-            outside->Resize(PrependDims(seq_len, inside.dims()));
-            outside->mutable_data(place, inside.type());
-
-            auto dst = outside->Slice(seq_offset, seq_offset + 1);
-            framework::TensorCopy(inside, place, dev_ctx, &dst);
-          },
-          true /*is_backward*/);
-    }
-    VLOG(5) << "Link outside gradient finished ";
-
-    if (has_state) {
-      if (step_id + 1 == seq_len) {  // at_end
-        // copy initialize states gradient from inside to outside
-        LinkTensorWithCallback(
-            cur_scope, GradVarLists(Attr<std::vector<std::string>>(kExStates)),
-            scope, Outputs(kInitStateGrads),
-            [&](const framework::LoDTensor &inside,
-                framework::LoDTensor *outside) {
-              outside->Resize(inside.dims());
-              outside->mutable_data(place, inside.type());
-              framework::TensorCopy(inside, place, dev_ctx, outside);
-            },
-            true /*is_backward*/);
-        VLOG(5) << "Link initialize state gradient finished ";
-      }
-    }
-    scopes.BackwardNext(dev_ctx, const_cast<framework::Scope *>(&scope));
-  }
-  // Delete the scope of StepScopes
-  auto *var = scope.FindVar(Input(kStepScopes));
-  PADDLE_ENFORCE_NOT_NULL(var);
-  auto *step_scopes = var->GetMutable<StepScopeVar>();
-  ClearStepScopes(dev_ctx, const_cast<framework::Scope *>(&scope), step_scopes);
-}
-
-StepScopes RecurrentGradOp::CreateStepScopes(
-    const platform::DeviceContext &dev_ctx, const framework::Scope &scope,
-    size_t seq_len) const {
-  auto *var = scope.FindVar(Input(kStepScopes));
-  PADDLE_ENFORCE_NOT_NULL(var);
-  return StepScopes(dev_ctx, scope, var->GetMutable<StepScopeVar>(),
-                    Attr<bool>(kIsTrain), seq_len, true /*is_backward*/);
-}
-
-std::unordered_set<std::string> RecurrentGradOp::List2Set(
-    const std::vector<std::string> &list) const {
-  std::unordered_set<std::string> local_var_name_set;
-  local_var_name_set.reserve(list.size());
-  for (auto &each : list) {
-    local_var_name_set.insert(each);
-  }
-  return local_var_name_set;
-}
-
-std::unordered_set<std::string> RecurrentGradOp::LocalVarNames(
-    const framework::Scope &scope) const {
-  return this->List2Set(scope.LocalVarNames());
-}
-
-std::vector<std::string> RecurrentGradOp::GradVarLists(
-    const std::vector<std::string> &var_names) {
-  std::vector<std::string> retv;
-  retv.reserve(var_names.size());
-  std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv),
-                 framework::GradVarName);
-  return retv;
-}
-
-class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(RecurrentBase::kInputs, "rnn inputs").AsDuplicable();
-    AddInput(RecurrentBase::kInitialStates, "rnn initial states")
-        .AsDuplicable();
-    AddInput(RecurrentBase::kParameters,
-             "Parameters are used by step block as its input. However, the "
-             "input is not a sequence tensor. Every time step, each operator "
-             "in step block just use the parameter directly.")
-        .AsDuplicable();
-    AddOutput(RecurrentBase::kOutputs,
-              "The output sequence of RNN. The sequence length must be same.")
-        .AsDuplicable();
-    AddOutput(RecurrentBase::kStepScopes,
-              "StepScopes contain all local variables in each time step.");
-    AddAttr<bool>(RecurrentBase::kHasStates, "Whether has states.")
-        .SetDefault(false);
-    AddAttr<std::vector<std::string>>(
-        RecurrentBase::kExStates,
-        string::Sprintf(
-            R"DOC(The ex-state variable names.
-The ex-state means the state value in the ex-timestep or the previous time step
-[%s, %s, %s] must be the same order)DOC",
-            RecurrentBase::kExStates, RecurrentBase::kStates,
-            RecurrentBase::kInitStateGrads));
-    AddAttr<std::vector<std::string>>(
-        RecurrentBase::kStates,
-        string::Sprintf(
-            "The state variable names. [%s, %s, %s] must be the same order",
-            RecurrentBase::kExStates, RecurrentBase::kStates,
-            RecurrentBase::kInitStateGrads));
-    AddAttr<framework::BlockDesc *>(RecurrentBase::kStepBlock,
-                                    "The step block inside RNN");
-    AddAttr<bool>(RecurrentBase::kReverse, R"DOC(Calculate RNN reversely or not.
-By default reverse=False
-
-Assume the input data is [A, B, C, D]
-
-if reverse is False:
-  the computation of RNN is like
-      A          B          C         D
-      |          |          |         |
-      v          v          v         v
-     rnn -----> rnn -----> rnn ----> rnn
-      |          |          |         |
-      v          v          v         v
-      o          o          o         o
-
-if reverse is True
-  the computation of RNN is like
-      A          B          C         D
-      |          |          |         |
-      v          v          v         v
-     rnn <----- rnn <----- rnn <---- rnn
-      |          |          |         |
-      v          v          v         v
-      o          o          o         o
-)DOC").SetDefault(false);
-    AddAttr<bool>(RecurrentBase::kIsTrain, "").SetDefault(true);
-    AddAttr<std::vector<std::string>>(RecurrentBase::kSkipEagerDeletionVars,
-                                      "Vars that would skip eager deletion."
-                                      "Users should not set this manually.")
-        .SetDefault(std::vector<std::string>());
-
-    AddComment(R"DOC(
-Static Length Recurrent Operator.
-
-The static length recurrent operator can only operate on fixed size sequence
-data, i.e. in each mini-batch, the sequence length of all inputs are the same.
-
-)DOC");
-  }
-};
-
-class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  virtual std::unique_ptr<framework::OpDesc> Apply() const {
-    auto *grad = new framework::OpDesc();
-    grad->SetType("recurrent_grad");
-    for (auto &input_param : this->InputNames()) {
-      grad->SetInput(input_param, this->Input(input_param));
-      grad->SetOutput(framework::GradVarName(input_param),
-                      this->InputGrad(input_param, false));
-    }
-
-    for (auto &output_param : this->OutputNames()) {
-      if (output_param == RecurrentBase::kStepScopes) {
-        grad->SetInput(output_param, this->Output(output_param));
-        grad->SetInput(framework::GradVarName(output_param),
-                       this->Output(output_param));
-      } else {
-        grad->SetInput(output_param, this->Output(output_param));
-        grad->SetInput(framework::GradVarName(output_param),
-                       this->OutputGrad(output_param));
-      }
-    }
-    grad->SetAttrMap(this->Attrs());
-    grad->SetBlockAttr(RecurrentBase::kStepBlock, grad_block_[0]);
-
-    return std::unique_ptr<framework::OpDesc>(grad);
-  }
-};
-
-class RecurrentGradOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    std::vector<std::string> output{RecurrentBase::kOutputs};
-
-    // In some case the kInitialStates is empty.
-    // If the kInitialStates is empty, all the states should be empty.
-    if (!ctx->HasInputs(RecurrentBase::kInitialStates)) {
-      PADDLE_ENFORCE_EQ(
-          ctx->Attrs()
-              .Get<std::vector<std::string>>(RecurrentBase::kExStates)
-              .size(),
-          0, "The Attr(%s) should be empty.", RecurrentBase::kExStates);
-      PADDLE_ENFORCE_EQ(
-          ctx->Attrs()
-              .Get<std::vector<std::string>>(RecurrentBase::kStates)
-              .size(),
-          0, "The Attr(%s) should be empty.", RecurrentBase::kStates);
-    }
-
-    PADDLE_ENFORCE_EQ(ctx->HasInputs(RecurrentBase::kInputs), true,
-                      "The input(%s) should not be empty.",
-                      RecurrentBase::kInputs);
-    PADDLE_ENFORCE_EQ(ctx->HasInputs(RecurrentBase::kOutputs), true,
-                      "The input(%s) should not be empty.",
-                      RecurrentBase::kOutputs);
-
-    // In some case the kInitialStates is empty.
-    if (ctx->HasInputs(RecurrentBase::kInitialStates)) {
-      PADDLE_ENFORCE_EQ(ctx->HasOutputs(framework::GradVarName(
-                            RecurrentBase::kInitialStates)),
-                        true, "The output of(%s) should not be empty.",
-                        framework::GradVarName(RecurrentBase::kInitialStates));
-      ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kInitialStates),
-                         ctx->GetInputsDim(RecurrentBase::kInitialStates));
-    }
-
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutputs(framework::GradVarName(RecurrentBase::kInputs)), true,
-        "The output of(%s) should not be empty.",
-        framework::GradVarName(RecurrentBase::kInputs));
-    ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kInputs),
-                       ctx->GetInputsDim(RecurrentBase::kInputs));
-
-    // In some case the kParameters is empty.
-    if (ctx->HasInputs(RecurrentBase::kParameters)) {
-      PADDLE_ENFORCE_EQ(
-          ctx->HasOutputs(framework::GradVarName(RecurrentBase::kParameters)),
-          true, "The output of(%s) should not be empty.",
-          framework::GradVarName(RecurrentBase::kParameters));
-      ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kParameters),
-                         ctx->GetInputsDim(RecurrentBase::kParameters));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(recurrent, paddle::operators::RecurrentOp,
-                  paddle::operators::RecurrentOpProtoMaker,
-                  paddle::operators::RecurrentGradOpDescMaker);
-REGISTER_OPERATOR(recurrent_grad, paddle::operators::RecurrentGradOp,
-                  paddle::operators::RecurrentGradOpShapeInference);
diff --git a/paddle/fluid/operators/recurrent_op.h b/paddle/fluid/operators/recurrent_op.h
deleted file mode 100644
index a4b21448a6057054d1520ce660758cb037667315..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/recurrent_op.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-// StepScopes manages the scopes inside Recurrent Op.
-//
-// if is_train = False, then
-//   there are two scopes for the RNN and just support forward
-// else
-//   the len(scopes) == seq_len
-//
-// if is_backward = True, then
-//   reversely access scopes, delete useless ex-scope
-// else
-//   access scopes from beginning to end
-class StepScopes {
- public:
-  StepScopes(const platform::DeviceContext &dev_ctx,
-             const framework::Scope &parent,
-             std::vector<framework::Scope *> *scopes, bool is_train,
-             size_t seq_len, bool is_backward = false);
-
-  // Get the current scope
-  framework::Scope &CurScope();
-
-  // Get the ex-scope, which is the scope in previous time step
-  framework::Scope &ExScope();
-
-  // Move to next time step when forwarding
-  void ForwardNext();
-
-  // Delete ex-scope after using it, then move to next time step when
-  // backwarding
-  void BackwardNext(const platform::DeviceContext &dev_ctx,
-                    framework::Scope *parent_scope);
-
- private:
-  framework::Scope &GetScope(size_t scope_id) const;
-
-  size_t counter_;
-  std::vector<framework::Scope *> *scopes_;
-  bool is_train_;
-  bool is_backward_;
-};
-
-// Base class for RecurrentOp/RecurrentGradOp
-//    Some common protected functions for RecurrentOp/RecurrentGradOp
-class RecurrentBase : public framework::OperatorBase {
- public:
-  static const char kInputs[];
-  static const char kInitialStates[];
-  static const char kParameters[];
-  static const char kOutputs[];
-  static const char kStepScopes[];
-  static const char kHasStates[];
-  static const char kExStates[];
-  static const char kStates[];
-  static const char kStepBlock[];
-  static const char kReverse[];
-  static const char kIsTrain[];
-  static const char kSkipEagerDeletionVars[];
-  static const char kInputGrads[];
-  static const char kOutputGrads[];
-  static const char kParamGrads[];
-  static const char kInitStateGrads[];
-
-  RecurrentBase(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs);
-
- protected:
-  // Get SequenceLength from Scope
-  //   The sequence length is got from input tensor. The input tensor's
-  //   dimension should be [SEQ_LEN, ..., ...]. The first of the tensor's shape
-  //   is SEQ_LEN. The second of the tensor's shape could be the batch size or
-  //   nested sequence length.
-  int64_t GetSequenceLength(const framework::Scope &scope) const;
-
-  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
-  //                                   map(dst_scope.Var, dst_vars)):
-  //   dst_tensor.ShareDataWith(src_tensor)
-  static void LinkTensor(const framework::Scope &src_scope,
-                         const std::vector<std::string> &src_vars,
-                         framework::Scope *dst_scope,
-                         const std::vector<std::string> &dst_vars);
-
-  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
-  //                                   map(dst_scope.Var, dst_vars)):
-  //   callback(src_tensor, &dst_tensor)
-  template <typename Callback>
-  static void LinkTensorWithCallback(const framework::Scope &src_scope,
-                                     const std::vector<std::string> &src_vars,
-                                     framework::Scope *dst_scope,
-                                     const std::vector<std::string> &dst_vars,
-                                     Callback callback,
-                                     bool is_backward = false) {
-    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
-    for (size_t i = 0; i < dst_vars.size(); ++i) {
-      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
-      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
-                   is_backward);
-    }
-  }
-
-  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
-  //                                   map(dst_scope.FindVar, dst_vars)):
-  //   callback(src_tensor, &dst_tensor)
-  template <typename Callback>
-  static void LinkTensorWithCallback(const framework::Scope &src_scope,
-                                     const std::vector<std::string> &src_vars,
-                                     const framework::Scope &dst_scope,
-                                     const std::vector<std::string> &dst_vars,
-                                     Callback callback,
-                                     bool is_backward = false) {
-    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
-    for (size_t i = 0; i < dst_vars.size(); ++i) {
-      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
-      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
-                   is_backward);
-    }
-  }
-
-  // (seq_len, shape) -> return [seq_len] + list(shape)
-  static framework::DDim PrependDims(size_t seq_len,
-                                     const framework::DDim &src);
-
- private:
-  template <typename Callback>
-  static void AccessTensor(const framework::Scope &src_scope,
-                           const std::string &src_var_name,
-                           framework::Scope *dst_scope,
-                           const std::string &dst_var_name, Callback callback,
-                           bool is_backward = false) {
-    auto *src_var = src_scope.FindVar(src_var_name);
-    if (is_backward && src_var == nullptr) {
-      return;
-    }
-    PADDLE_ENFORCE_NOT_NULL(src_var, "%s is not found.", src_var_name);
-    auto &src_tensor = src_var->Get<framework::LoDTensor>();
-
-    auto *dst_var = dst_scope->Var(dst_var_name);
-    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
-    callback(src_tensor, dst_tensor);
-  }
-
-  template <typename Callback>
-  static void AccessTensor(const framework::Scope &src_scope,
-                           const std::string &src_var_name,
-                           const framework::Scope &dst_scope,
-                           const std::string &dst_var_name, Callback callback,
-                           bool is_backward = false) {
-    auto *dst_var = dst_scope.FindVar(dst_var_name);
-    if (is_backward && dst_var == nullptr) {
-      return;
-    }
-    auto *src_var = src_scope.FindVar(src_var_name);
-    PADDLE_ENFORCE_NOT_NULL(src_var, "%s is not found.", src_var_name);
-    auto &src_tensor = src_var->Get<framework::LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(dst_var, "%s is not found.", dst_var_name);
-    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
-    callback(src_tensor, dst_tensor);
-  }
-};
-
-class RecurrentOp : public RecurrentBase {
- public:
-  RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs);
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override;
-
- private:
-  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
-                              const framework::Scope &scope,
-                              size_t seq_len) const;
-};
-
-class RecurrentGradOp : public RecurrentBase {
- public:
-  RecurrentGradOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs);
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override;
-
-  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
-                              const framework::Scope &scope,
-                              size_t seq_len) const;
-
-  std::unordered_set<std::string> List2Set(
-      const std::vector<std::string> &list) const;
-
-  std::unordered_set<std::string> LocalVarNames(
-      const framework::Scope &scope) const;
-
-  static std::vector<std::string> GradVarLists(
-      const std::vector<std::string> &var_names);
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
deleted file mode 100644
index ebd07d90ebe6b0ba008ac89c01c4f054f96a6da9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-include(operators)
-if(WITH_GPU)
-    register_operators(DEPS cub)
-else()
-    register_operators()
-endif()
-
-if(WITH_GPU)
-    file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu")
-    string(REPLACE ".part.cu" "" OPS "${OPS}")
-
-    foreach(src ${OPS})
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.part.cu)
-            set(CUDA_KERNEL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${src}.part.cu)
-            file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT)
-            string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT})
-            if (MATCHED)
-                string(STRIP ${CMAKE_MATCH_1} MATCHED)
-                file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n")
-            endif()
-
-        endif()
-    endforeach()
-endif()
diff --git a/paddle/fluid/operators/reduce_ops/cub_reduce.h b/paddle/fluid/operators/reduce_ops/cub_reduce.h
deleted file mode 100644
index af56e85e9c6f5e0cfb5e03587fbace4665d9e5fb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/cub_reduce.h
+++ /dev/null
@@ -1,328 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <numeric>
-#include <set>
-#include <vector>
-
-#include <cub/cub.cuh>  // NOLINT
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-
-namespace paddle {
-namespace operators {
-
-namespace detail {
-template <typename T, size_t ElementCount>
-struct Array {
- public:
-  HOSTDEVICE inline Array() {}
-
-  HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
-
-  HOSTDEVICE inline const T& operator[](size_t index) const {
-    return data_[index];
-  }
-
-  HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
-
-  template <typename VectorLikeType>
-  static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
-    PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
-    size_t n = static_cast<size_t>(vec.size());
-    Array<T, ElementCount> ret;
-    for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
-    return ret;
-  }
-
- private:
-  T data_[ElementCount];
-};
-
-// reduce the last axis of 2d array
-template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
-          int BlockDim>
-__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
-                               TransformOp transformer, Ty init,
-                               int reduce_num) {
-  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-  int idx_x = blockIdx.x * reduce_num;
-  int idx_y = threadIdx.x;
-  Ty reduce_var = init;
-  for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
-    reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
-
-  reduce_var =
-      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-
-  if (threadIdx.x == 0) {
-    y[blockIdx.x] = reduce_var;
-  }
-}
-
-template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
-          int BlockDim, int Rank, int ReduceRank>
-__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
-                             TransformOp transformer, Ty init, int reduce_num,
-                             Array<int, Rank> x_strides,
-                             Array<int, ReduceRank> reduce_dim,
-                             Array<int, ReduceRank> reduce_strides,
-                             Array<int, Rank - ReduceRank> left_dim,
-                             Array<int, Rank - ReduceRank> left_strides) {
-  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-  Array<int, Rank> sub_index;
-  int left_idx = blockIdx.x;
-  for (int i = 0; i < Rank - ReduceRank; ++i) {
-    sub_index[left_dim[i]] = left_idx / left_strides[i];
-    left_idx %= left_strides[i];
-  }
-
-  int reduce_idx = threadIdx.x;
-  for (int j = 0; j < ReduceRank; ++j) {
-    sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
-    reduce_idx %= reduce_strides[j];
-  }
-
-  int idx_x = 0;
-  for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
-  Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
-
-  for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
-    int reduce_idx = i;
-    for (int j = 0; j < ReduceRank; ++j) {
-      sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
-      reduce_idx %= reduce_strides[j];
-    }
-
-    int idx_x = 0;
-    for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
-    reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
-  }
-
-  reduce_var =
-      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-
-  if (threadIdx.x == 0) {
-    y[blockIdx.x] = reduce_var;
-  }
-}
-
-static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
-  int n = static_cast<int>(dims.size());
-  if (n == 0) return std::vector<int>();
-  std::vector<int> strides(n);
-  strides.back() = 1;
-  for (int i = n - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * dims[i + 1];
-  }
-  return strides;
-}
-
-static inline std::vector<int> GetStrides(const std::vector<int>& dims,
-                                          const std::vector<int>& idx) {
-  int n = static_cast<int>(idx.size());
-  if (n == 0) return std::vector<int>();
-  std::vector<int> strides(n);
-  strides.back() = 1;
-  for (int i = n - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * dims[idx[i + 1]];
-  }
-  return strides;
-}
-
-constexpr int kMaxBlockDim = 512;
-
-static inline int GetDesiredBlockDim(int block_dim) {
-  return block_dim >= kMaxBlockDim
-             ? kMaxBlockDim
-             : (1 << static_cast<int>(std::log2(block_dim)));
-}
-
-template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
-          typename TransformOp>
-static void TensorReduceImpl(
-    const Tx* x_data, Ty* y_data, const platform::Place& place,
-    const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
-    int left_num, int reduce_num, const std::vector<int>& x_strides,
-    const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
-    const std::vector<int>& left_dim, const std::vector<int>& left_strides,
-    cudaStream_t stream) {
-#define CUB_RANK_CASE(i, ...)             \
-  case i: {                               \
-    constexpr auto kRank = i;             \
-    switch (reduce_rank) { __VA_ARGS__; } \
-  } break
-
-#define CUB_REDUCE_RANK_CASE(i, ...)                              \
-  case i: {                                                       \
-    constexpr auto kReduceRank = i;                               \
-    ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank,  \
-                 kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
-        x_data, y_data, reducer, transformer, init, reduce_num,   \
-        Array<int, kRank>::From(x_strides),                       \
-        Array<int, kReduceRank>::From(reduce_dim),                \
-        Array<int, kReduceRank>::From(reduce_strides),            \
-        Array<int, kRank - kReduceRank>::From(left_dim),          \
-        Array<int, kRank - kReduceRank>::From(left_strides));     \
-  } break
-
-  int rank = x_strides.size();
-  int reduce_rank = reduce_strides.size();
-  if (rank == reduce_rank) {
-    cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
-        x_data, transformer);
-    size_t temp_storage_bytes = 0;
-    cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
-                              reduce_num, reducer, init, stream);
-    framework::Tensor tmp;
-    auto* temp_storage = tmp.mutable_data<uint8_t>(
-        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
-        place);
-    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
-                              reduce_num, reducer, init, stream);
-    return;
-  }
-  if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
-    ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
-                   BlockDim><<<left_num, BlockDim, 0, stream>>>(
-        x_data, y_data, reducer, transformer, init, reduce_num);
-    return;
-  }
-  /*
-  if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
-    // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
-    // Currently, it is handled by code below, but inefficient
-    return;
-  }
-  */
-
-  switch (rank) {
-    CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
-
-    CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
-
-    CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3););
-
-    CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
-
-    CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
-                  CUB_REDUCE_RANK_CASE(5););
-
-    CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
-                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
-
-    CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
-                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
-
-    CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
-                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
-                  CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
-  }
-
-#undef CUB_REDUCE_RANK_CASE
-#undef CUB_RANK_CASE
-}
-
-}  // namespace detail
-
-template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
-void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
-                  std::vector<int> origin_reduce_dims, const Ty& init,
-                  const ReduceOp& reducer, const TransformOp& transformer,
-                  cudaStream_t stream) {
-  auto x_dim = framework::vectorize<int>(x.dims());
-  std::vector<int> new_x_dim, new_reduce_dims;
-  int is_reduced = 0;
-  for (auto e : origin_reduce_dims) {
-    auto pos = e >= 0 ? e : e + x_dim.size();
-    is_reduced |= 1 << e;
-  }
-  for (int i = 0; i < x_dim.size(); i++) {
-    if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
-      new_x_dim.push_back(x_dim[i]);
-      if ((is_reduced >> i) & 1)
-        new_reduce_dims.push_back(new_x_dim.size() - 1);
-    } else {
-      new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
-    }
-  }
-  x_dim = new_x_dim;
-  origin_reduce_dims = new_reduce_dims;
-  int x_rank = static_cast<int>(x_dim.size());
-  std::set<int> left_set, reduce_set;
-  for (int i = 0; i < x_rank; ++i) left_set.insert(i);
-
-  for (auto e : origin_reduce_dims) {
-    left_set.erase(e);
-    reduce_set.insert(e);
-  }
-
-  std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
-  std::vector<int> left_dim(left_set.begin(), left_set.end());
-
-  std::vector<int> x_strides = detail::GetStrides(x_dim);
-  std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
-  std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
-  int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
-  int left_num = 1;
-  if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
-
-  std::vector<int> y_dim(left_dim.size());
-  for (int i = 0; i < left_dim.size(); ++i) {
-    y_dim[i] = x_dim[left_dim[i]];
-  }
-  auto x_data = x.data<Tx>();
-  auto y_data = y->mutable_data<Ty>(x.place());
-  if (reduce_num == 1) {
-    auto out_dims = y->dims();
-    framework::TensorCopy(x, y->place(), y);
-    y->Resize(out_dims);
-    return;
-  }
-
-#define CUB_BLOCK_DIM_CASE(block_dim)                                    \
-  case block_dim: {                                                      \
-    constexpr auto kBlockDim = block_dim;                                \
-    detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>(  \
-        x_data, y_data, x.place(), reducer, transformer, init, left_num, \
-        reduce_num, x_strides, reduce_dim, reduce_strides, left_dim,     \
-        left_strides, stream);                                           \
-  } break
-
-  switch (detail::GetDesiredBlockDim(reduce_num)) {
-    CUB_BLOCK_DIM_CASE(512);
-    CUB_BLOCK_DIM_CASE(256);
-    CUB_BLOCK_DIM_CASE(128);
-    CUB_BLOCK_DIM_CASE(64);
-    CUB_BLOCK_DIM_CASE(32);
-    CUB_BLOCK_DIM_CASE(16);
-    CUB_BLOCK_DIM_CASE(8);
-    CUB_BLOCK_DIM_CASE(4);
-    CUB_BLOCK_DIM_CASE(2);
-  }
-#undef CUB_BLOCK_DIM_CASE
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
deleted file mode 100644
index a3ca9ae0675472cb4f0bcd6f404f39004e7cc62f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
-
-REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all);
-REGISTER_OP_CPU_KERNEL(reduce_all,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         bool, ops::AllFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
deleted file mode 100644
index bd94ba263d957d0d65506ecd802bf43add6e2fb4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
-
-REGISTER_OP_CUDA_KERNEL(reduce_all,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          bool, ops::AllFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.h b/paddle/fluid/operators/reduce_ops/reduce_all_op.h
deleted file mode 100644
index ba159dd703c8904784546eda262bf7be77967d48..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.h
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-struct AllFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->all(dim);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
deleted file mode 100644
index 34f0fffc9adef240c6fa222540710537587010c5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
-
-REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any);
-REGISTER_OP_CPU_KERNEL(reduce_any,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         bool, ops::AnyFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
deleted file mode 100644
index 66f0c9997ea1e27cf172a6839a68d2eb23395c4d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
-
-REGISTER_OP_CUDA_KERNEL(reduce_any,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          bool, ops::AnyFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.h b/paddle/fluid/operators/reduce_ops/reduce_any_op.h
deleted file mode 100644
index b36bad9cada259932d2bd77c2426fbb46790de76..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.h
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-struct AnyFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->any(dim);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
deleted file mode 100644
index cb438b4a8057267015c8b3c15dd8468fca5a4b44..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_REDUCE_OP(reduce_max);
-REGISTER_OP_CPU_KERNEL(
-    reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
-                                  ops::MaxFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
-                      ops::MaxFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
-                      ops::MaxFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
deleted file mode 100644
index 832112ede833a06e053dcff5139e82f054b127c4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_OP_CUDA_KERNEL(reduce_max,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::MaxFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::MaxFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::MaxFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::MaxFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu
deleted file mode 100644
index 5ee38b8fa46290c86cd44ef1bcc71bd2fcd9bcd4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
deleted file mode 100644
index e549d2bddfef07ab438f72ab7273418ef0f97728..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-// NOTE(dengkaipeng): Input(Out) is unnecessary in reduce_mean_grad
-// calcualtion, but will incur a reduce_mean_grad op after
-// reduce_mean_grad_grad, delete Input(Out) here.
-// This change has no effect on reduce_mean_grad calculations.
-class ReduceMeanOpGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("reduce_mean_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetAttrMap(Attrs());
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return op;
-  }
-};
-
-class ReduceMeanDoubleGradMaker : public framework::GradOpDescMakerBase {
- public:
-  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
-
-  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
-    std::vector<std::unique_ptr<framework::OpDesc>> ops;
-    auto x_gg = OutputGrad(framework::GradVarName("X"));  // input ddx
-    auto out_grads = InputGrad(framework::GradVarName("Out"));
-    if (!out_grads.empty()) {
-      auto* out_grad_op = new framework::OpDesc();
-      out_grad_op->SetType("reduce_mean");
-      out_grad_op->SetInput("X", x_gg);
-      out_grad_op->SetAttrMap(Attrs());
-      out_grad_op->SetOutput("Out", out_grads);
-      ops.emplace_back(out_grad_op);
-    }
-
-    return ops;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ReduceMeanGradNoNeedBufferVarInference,
-                                      "X");
-}  // namespace operators
-}  // namespace paddle
-
-class __reduce_meanMaker__ : public ops::ReduceOpMaker {
- protected:
-  virtual std::string GetName() const { return "reduce_mean"; }
-  virtual std::string GetOpType() const { return "Reduce reduce_mean"; }
-};
-
-REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__,
-                  ops::ReduceMeanOpGradDescMaker);
-REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp,
-                  ops::ReduceMeanDoubleGradMaker,
-                  ops::ReduceMeanGradNoNeedBufferVarInference);
-REGISTER_OP_CPU_KERNEL(reduce_mean,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         float, ops::MeanFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         double, ops::MeanFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         int, ops::MeanFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         int64_t, ops::MeanFunctor>);
-
-template <typename T>
-using CPUReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, T,
-                          ops::MeanGradFunctor, true>;
-
-REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel<float>,
-                       CPUReduceMeanGradKernel<double>,
-                       CPUReduceMeanGradKernel<int>,
-                       CPUReduceMeanGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
deleted file mode 100644
index 4d3bce8fdd05e536baa5fecb4fc5a117e2031224..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
-template <typename T>
-class ReduceMeanKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      reduce_dims.resize(input->dims().size());
-      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
-    } else {
-      for (auto e : dims) {
-        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
-      }
-    }
-
-    int reduce_num = 1;
-    for (int i = 0; i < reduce_dims.size(); ++i) {
-      reduce_num *= input->dims()[reduce_dims[i]];
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
-        DivideFunctor<T>(reduce_num), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
-                        ops::ReduceMeanKernel<double>,
-                        ops::ReduceMeanKernel<int>,
-                        ops::ReduceMeanKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h b/paddle/fluid/operators/reduce_ops/reduce_mean_op.h
deleted file mode 100644
index 240c43bc6d0af266e3500c14f894fe30abab728e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-struct MeanFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->mean(dim);
-  }
-};
-
-struct MeanGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
deleted file mode 100644
index 12eceb33ec27298d60713e72c9cc2cf91a5e7cfb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// .part used to speed up nvcc compile
-#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
-
-template <typename T>
-using CUDAReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
-                          ops::MeanGradFunctor, true>;
-
-REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<float>,
-                        CUDAReduceMeanGradKernel<double>,
-                        CUDAReduceMeanGradKernel<int>,
-                        CUDAReduceMeanGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h b/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h
deleted file mode 100644
index 2557e8dd488618dd4998845b0e6e3ba823b96986..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-struct MaxFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->maximum(dim);
-  }
-};
-
-struct MinFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->minimum(dim);
-  }
-};
-
-struct MaxOrMinGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    auto equals = (*x) == y->broadcast(dim);
-    auto ones = dx->constant(1);
-    auto zeros = dx->constant(0);
-    // If there are multiple minimum or maximum elements, the subgradient of
-    // each is the set [0, 1], and we pass gradient to all of them here.
-    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
deleted file mode 100644
index 11aa78382e319331dc65ec22927f0d5762adfb43..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_REDUCE_OP(reduce_min);
-REGISTER_OP_CPU_KERNEL(
-    reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
-                                  ops::MinFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
-                      ops::MinFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
-                      ops::MinFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
deleted file mode 100644
index 7b2706866f594228cbceb084e99d83aa8f345dfd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_OP_CUDA_KERNEL(reduce_min,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::MinFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::MinFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::MinFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::MinFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu
deleted file mode 100644
index bf886063786a8c36884ed20fef41c99468156c01..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
deleted file mode 100644
index 838ac895e5d570999742e39bd23a74a2bf6616c1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ /dev/null
@@ -1,305 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
-
-namespace paddle {
-namespace operators {
-
-#define HANDLE_DIM(NDIM, RDIM)                                            \
-  if (ndim == NDIM && rdim == RDIM) {                                     \
-    ReduceFunctor<DeviceContext, T, NDIM, RDIM, Functor>(                 \
-        context.template device_context<DeviceContext>(), *input, output, \
-        dims, keep_dim);                                                  \
-  }
-
-template <typename DeviceContext, typename T, typename Functor>
-class ReduceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    if (reduce_all) {
-      // Flatten and reduce 1-D tensor
-      auto x = EigenVector<T>::Flatten(*input);
-      auto out = EigenScalar<T>::From(*output);
-      auto& place =
-          *context.template device_context<DeviceContext>().eigen_device();
-      auto reduce_dim = Eigen::array<int, 1>({{0}});
-      Functor functor;
-      functor(place, &x, &out, reduce_dim);
-    } else {
-      int ndim = input->dims().size();
-      int rdim = dims.size();
-      // comments for accelerating compiling temporarily.
-      //      HANDLE_DIM(6, 5);
-      //      HANDLE_DIM(6, 4);
-      //      HANDLE_DIM(6, 3);
-      //      HANDLE_DIM(6, 2);
-      //      HANDLE_DIM(6, 1);
-      //      HANDLE_DIM(5, 4);
-      //      HANDLE_DIM(5, 3);
-      //      HANDLE_DIM(5, 2);
-      //      HANDLE_DIM(5, 1);
-      HANDLE_DIM(4, 3);
-      HANDLE_DIM(4, 2);
-      HANDLE_DIM(4, 1);
-      HANDLE_DIM(3, 2);
-      HANDLE_DIM(3, 1);
-      HANDLE_DIM(2, 1);
-      HANDLE_DIM(1, 1);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T, typename Functor,
-          bool kNoNeedBufferX = false, bool kNoNeedBufferY = false>
-class ReduceGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto dims = context.Attr<std::vector<int>>("dim");
-
-    auto* input0 = context.Input<Tensor>("X");
-    auto* input1 = context.Input<Tensor>("Out");
-    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
-    output->mutable_data<T>(context.GetPlace());
-
-    // NOTE: EigenTensor::From() uses tensor->data()
-    // if op has NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX or
-    // kNoNeedBufferY should set true
-    // and use fake var that has same dims.
-    if (kNoNeedBufferX) {
-      input0 = output;
-    }
-    if (kNoNeedBufferY) {
-      input1 = input2;
-    }
-
-    // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and
-    // not be set as Input in grad Maker, use Out_grad to replace here
-    if (!input1) input1 = input2;
-
-    if (reduce_all) {
-      auto x = EigenVector<T>::Flatten(*input0);
-      auto x_reduce = EigenVector<T>::From(*input1);
-      auto x_reduce_grad = EigenVector<T>::From(*input2);
-      auto x_grad = EigenVector<T>::Flatten(*output);
-      auto& place =
-          *context.template device_context<DeviceContext>().eigen_device();
-      auto broadcast_dim =
-          Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
-      Functor functor;
-      functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
-              broadcast_dim[0]);
-    } else {
-      int rank = input0->dims().size();
-      switch (rank) {
-        case 1:
-          ReduceGradFunctor<DeviceContext, T, 1, Functor>(
-              context.template device_context<DeviceContext>(), *input0,
-              *input1, *input2, output, dims);
-          break;
-        case 2:
-          ReduceGradFunctor<DeviceContext, T, 2, Functor>(
-              context.template device_context<DeviceContext>(), *input0,
-              *input1, *input2, output, dims);
-          break;
-        case 3:
-          ReduceGradFunctor<DeviceContext, T, 3, Functor>(
-              context.template device_context<DeviceContext>(), *input0,
-              *input1, *input2, output, dims);
-          break;
-        case 4:
-          ReduceGradFunctor<DeviceContext, T, 4, Functor>(
-              context.template device_context<DeviceContext>(), *input0,
-              *input1, *input2, output, dims);
-          break;
-        case 5:
-          ReduceGradFunctor<DeviceContext, T, 5, Functor>(
-              context.template device_context<DeviceContext>(), *input0,
-              *input1, *input2, output, dims);
-          break;
-        case 6:
-          ReduceGradFunctor<DeviceContext, T, 6, Functor>(
-              context.template device_context<DeviceContext>(), *input0,
-              *input1, *input2, output, dims);
-          break;
-      }
-    }
-  }
-};
-
-class ReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReduceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReduceOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
-    for (size_t i = 0; i < dims.size(); ++i) {
-      if (dims[i] < 0) dims[i] = x_rank + dims[i];
-      PADDLE_ENFORCE_LT(
-          dims[i], x_rank,
-          "The dim should be in the range [-rank(input), rank(input)).");
-    }
-    sort(dims.begin(), dims.end());
-    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
-    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
-    if (reduce_all) {
-      if (keep_dim)
-        ctx->SetOutputDim(
-            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
-      else
-        ctx->SetOutputDim("Out", {1});
-    } else {
-      auto dims_vector = vectorize(x_dims);
-      if (keep_dim) {
-        for (size_t i = 0; i < dims.size(); ++i) {
-          dims_vector[dims[i]] = 1;
-        }
-      } else {
-        const int kDelFlag = -2;
-        for (size_t i = 0; i < dims.size(); ++i) {
-          dims_vector[dims[i]] = kDelFlag;
-        }
-        dims_vector.erase(
-            remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-            dims_vector.end());
-      }
-      auto out_dims = framework::make_ddim(dims_vector);
-      ctx->SetOutputDim("Out", out_dims);
-      if (dims[0] != 0) {
-        // Only pass LoD when not reducing on the first dim.
-        ctx->ShareLoD("X", /*->*/ "Out");
-      }
-    }
-  }
-};
-
-class ReduceGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
-    for (size_t i = 0; i < dims.size(); ++i) {
-      if (dims[i] < 0) dims[i] = x_rank + dims[i];
-      PADDLE_ENFORCE_LT(
-          dims[i], x_rank,
-          "The dim should be in the range [-rank(input), rank(input)).");
-    }
-    sort(dims.begin(), dims.end());
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-      ctx->ShareLoD("X", /*->*/ x_grad_name);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-
-class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final {
-    AddInput("X",
-             "(Tensor) The input tensor. Tensors with rank at most 6 are "
-             "supported.");
-    AddOutput("Out", "(Tensor) The result tensor.");
-    AddAttr<std::vector<int>>(
-        "dim",
-        "(list<int>, default {0}) The dimensions to reduce. "
-        "Must be in the range [-rank(input), rank(input)). "
-        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
-        "Note that reducing on the first dim will make the LoD info lost.")
-        .SetDefault({0});
-    AddAttr<bool>("keep_dim",
-                  "(bool, default false) "
-                  "If true, retain the reduced dimension with length 1.")
-        .SetDefault(false);
-    AddAttr<bool>("reduce_all",
-                  "(bool, default false) "
-                  "If true, output a scalar reduced along all dimensions.")
-        .SetDefault(false);
-    AddComment(string::Sprintf(R"DOC(
-%s Operator.
-
-This operator computes the %s of input tensor along the given dimension.
-The result tensor has 1 fewer dimension than the input unless keep_dim is true.
-If reduce_all is true, just reduce along all dimensions and output a scalar.
-
-)DOC",
-                               GetOpType(), GetName()));
-  }
-
- protected:
-  virtual std::string GetName() const = 0;
-  virtual std::string GetOpType() const = 0;
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-#define REGISTER_REDUCE_OP(op_name)                                      \
-  class __##op_name##Maker__ : public ops::ReduceOpMaker {               \
-   protected:                                                            \
-    virtual std::string GetName() const { return #op_name; }             \
-    virtual std::string GetOpType() const { return "Reduce " #op_name; } \
-  };                                                                     \
-  REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__,        \
-                    paddle::framework::DefaultGradOpDescMaker<true>);    \
-  REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
-
-#define REGISTER_REDUCE_OP_WITHOUT_GRAD(op_name)                         \
-  class __##op_name##Maker__ : public ops::ReduceOpMaker {               \
-   protected:                                                            \
-    virtual std::string GetName() const { return #op_name; }             \
-    virtual std::string GetOpType() const { return "Reduce " #op_name; } \
-  };                                                                     \
-  REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__,        \
-                    paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h
deleted file mode 100644
index 3da27bc8ac8d448471b9ff3779ac6aca59fac523..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T, size_t D, size_t R_D,
-          typename Functor>
-void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input,
-                   framework::Tensor* output, const std::vector<int>& dims,
-                   bool keep_dim) {
-  auto x = EigenTensor<T, D>::From(input);
-  auto x_rank = static_cast<int>(x.dimensions().size());
-  auto reduce_dim = Eigen::array<int, R_D>();
-  std::vector<int> dims_ref = dims;
-  for (size_t i = 0; i < dims_ref.size(); ++i) {
-    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
-    reduce_dim[i] = dims_ref[i];
-  }
-  // construct the squeezed output tensor
-  DDim out_dims = output->dims();
-  if (keep_dim && x_rank > 1) {
-    const int kDelFlag = -2;
-    auto dims_vector = framework::vectorize(out_dims);
-    for (size_t i = 0; i < dims_ref.size(); ++i) {
-      dims_vector[dims_ref[i]] = kDelFlag;
-    }
-    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-                      dims_vector.end());
-    out_dims = framework::make_ddim(dims_vector);
-  }
-  auto& place = *context.eigen_device();
-  Functor functor;
-
-  if (D == 1) {
-    auto out = EigenScalar<T>::From(*output);
-    functor(place, &x, &out, reduce_dim);
-  } else {
-    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
-    functor(place, &x, &out, reduce_dim);
-  }
-}
-
-template <typename DeviceContext, typename T, size_t D, typename Functor>
-void ReduceGradFunctor(const DeviceContext& context,
-                       const framework::Tensor& input0,
-                       const framework::Tensor& input1,
-                       const framework::Tensor& input2,
-                       framework::Tensor* output,
-                       const std::vector<int>& dims) {
-  auto x = EigenTensor<T, D>::From(input0);
-  auto x_grad = EigenTensor<T, D>::From(*output);
-  auto x_rank = static_cast<int>(x.dimensions().size());
-  auto x_dims = input0.dims();
-  auto reduced_dims_v = framework::vectorize(x_dims);
-  std::vector<int> dims_ref = dims;
-  Eigen::array<int, D> broadcast_dim;
-  for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
-
-  int broad_cats_times = 1;
-  for (size_t i = 0; i < dims_ref.size(); ++i) {
-    if (dims_ref[i] < 0) {
-      dims_ref[i] = x_rank + dims_ref[i];
-    }
-    reduced_dims_v[dims_ref[i]] = 1;
-    broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
-    broad_cats_times *= x_dims[dims_ref[i]];
-  }
-  auto reduced_dims = framework::make_ddim(reduced_dims_v);
-  auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
-  auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);
-
-  auto& place = *context.eigen_device();
-
-  Functor functor;
-  functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
-          broad_cats_times);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
deleted file mode 100644
index 88935107df187da731e5b77bb6c24cd692d2994f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
-
-REGISTER_REDUCE_OP(reduce_prod);
-REGISTER_OP_CPU_KERNEL(reduce_prod,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         float, ops::ProdFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         double, ops::ProdFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         int, ops::ProdFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         int64_t, ops::ProdFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             float, ops::ProdGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             double, ops::ProdGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int, ops::ProdGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int64_t, ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
deleted file mode 100644
index 4434937f75397d8d5340a94abbd41efa7e7a8d4b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
-
-REGISTER_OP_CUDA_KERNEL(reduce_prod,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::ProdFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::ProdFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::ProdFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::ProdFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h
deleted file mode 100644
index 103e108e4bda1c33434ec0c5d6c58f24fa725f57..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-struct ProdFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->prod(dim);
-  }
-};
-
-struct ProdGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu
deleted file mode 100644
index 0610cdd94f89c0371988fac7955d07fc5498a69f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
deleted file mode 100644
index 14bb2cf0013a25239c0166e4abb8ca27645bf681..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-// NOTE: Input(Out) is unnecessary in reduce_sum_grad, and Input(X) needs no
-// buffer
-class ReduceSumOpGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("reduce_sum_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetAttrMap(Attrs());
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ReduceSumGradNoNeedBufferVarInference,
-                                      "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-class ReduceSumOpMaker : public ops::ReduceOpMaker {
- protected:
-  virtual std::string GetName() const { return "reduce_sum"; }
-  virtual std::string GetOpType() const { return "Reduce reduce_sum"; }
-};
-
-REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker,
-                  ops::ReduceSumOpGradDescMaker);
-REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp,
-                  ops::ReduceSumGradNoNeedBufferVarInference);
-
-REGISTER_OP_CPU_KERNEL(
-    reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
-                                  ops::SumFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
-                      ops::SumFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
-                      ops::SumFunctor>);
-
-template <typename T>
-using CPUReduceSumGradKernel =
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, T,
-                             ops::SumGradFunctor, true>;
-
-REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel<float>,
-                       CPUReduceSumGradKernel<double>,
-                       CPUReduceSumGradKernel<int>,
-                       CPUReduceSumGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu
deleted file mode 100644
index 9051740e83aabd783750e8f415da09921608e470..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct IdentityFunctor {
-  HOSTDEVICE explicit inline IdentityFunctor() {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x; }
-};
-
-template <typename T>
-class ReduceSumKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      reduce_dims.resize(input->dims().size());
-      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
-    } else {
-      for (auto e : dims) {
-        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
-      }
-    }
-
-    int reduce_num = 1;
-    for (int i = 0; i < reduce_dims.size(); ++i) {
-      reduce_num *= input->dims()[reduce_dims[i]];
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
-        IdentityFunctor<T>(), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
-                        ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
-                        ops::ReduceSumKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
deleted file mode 100644
index 7343d01e29d9983f546d7c6fd6b4be837cc1dcc5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vector>
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-// use for loop to speed up Eigen broadcast. 4 timer faster then broadcast
-template <typename DeviceContext, typename T, typename Functor,
-          bool kNoNeedBufferX = false>
-class ReduceSumGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto dims = context.Attr<std::vector<int>>("dim");
-    if (context.GetPlace().type() == typeid(platform::CPUPlace) &&
-        dims.size() == 1) {
-      auto* input0 = context.Input<Tensor>("X");
-      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
-      output->mutable_data<T>(context.GetPlace());
-      const auto* input2_d = input2->data<T>();
-      auto* output_d = output->data<T>();
-
-      // handle reduce_all
-      if (input2->dims().size() == 1 && input2->dims()[0] == 1) {
-        for (int64_t i = 0; i < framework::product(input0->dims()); ++i) {
-          output_d[i] = input2_d[0];
-        }
-        return;
-      }
-
-      // handle reduce by one dimension
-      int reduce_dim_index = dims[0];
-      if (reduce_dim_index < 0) {
-        reduce_dim_index += input0->dims().size();
-      }
-
-      auto& input_dim = input0->dims();
-      int64_t before_dim = 1;
-      for (int i = 0; i < reduce_dim_index; ++i) {
-        before_dim *= input_dim[i];
-      }
-      int64_t reduce_dim = input_dim[reduce_dim_index];
-      int64_t after_dim = 1;
-      for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) {
-        after_dim *= input_dim[i];
-      }
-      for (int64_t i = 0; i < before_dim; ++i) {
-        for (int64_t j = 0; j < reduce_dim; ++j) {
-          for (int64_t k = 0; k < after_dim; ++k) {
-            output_d[i * reduce_dim * after_dim + j * after_dim + k] =
-                input2_d[i * after_dim + k];
-          }
-        }
-      }
-      return;
-    }
-
-    // default use Eigen broadcast
-    ReduceGradKernel<DeviceContext, T, Functor, kNoNeedBufferX> kernel;
-    kernel.Compute(context);
-  }
-};
-
-struct SumFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->sum(dim);
-  }
-};
-
-struct SumGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->eval().broadcast(dim);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
deleted file mode 100644
index 0d689d710a19103cf667a76e592dfba9571cae5c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
-
-template <typename T>
-using CUDAReduceSumGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
-                          ops::SumGradFunctor, true>;
-
-REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel<float>,
-                        CUDAReduceSumGradKernel<double>,
-                        CUDAReduceSumGradKernel<int>,
-                        CUDAReduceSumGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
deleted file mode 100644
index 7ceb5b58465bcdfa22345944bf8140793f187498..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ /dev/null
@@ -1,273 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-class ReorderLoDTensorByRankTableOpProtoMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor), the input lod tensor to be reordered according to "
-             "Input(RankTable).");
-    AddInput("RankTable",
-             "(LoDRankTable), the rank table according to which Input(X) is "
-             "reordered.");
-    AddOutput("Out", "(LoDTensor), the reordered lod tensor.");
-    AddComment(R"DOC(ReorderLoDTensorByRankTable operator.
-
-Input(X) is a batch of sequences. Input(RankTable) stores new orders of the
-input sequence batch. The reorder_lod_tensor_by_rank operator reorders the
-Input(X) according to the information provided by Input(RankTable).
-
-For example:
-
-If the indices stored in the Input(RankTable) are [3, 0, 2, 1], the
-Input(X) will be reordered that the fourth sequence in Input(X) will become the
-first one, and then followed by the original first, third, and the second one.
-
-This is:
-X = [Seq0, Seq1, Seq2, Seq3]. The indices in RankTable are [3, 0, 2, 1].
-Out =  [Seq3, Seq0, Seq2, Seq1] with a new LoD information.
-
-If the LoD information of Input(X) is empty, this means Input(X) is not sequence
-data. This is also identical to a batch of sequences where each sequence has a
-fixed length 1. In this case, the reorder_lod_tensor_by_rank operator reorders
-each slice of Input(X) along the first axis according to Input(RankTable).
-
-This is:
-X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is empty. The
-indices in RankTable are [3, 0, 2, 1].
-Out = [Slice3, Slice0, Slice2, Slice1] with no LoD information is appended.
-
-NOTE: This operator sorts Input(X) according to a given LoDRankTable which does
-not need to be calculated according to Input(X). It can be calculated according
-to another different sequence, and then this operator sorts Input(X) according
-to the given LoDRankTable.
-
-)DOC");
-  }
-};
-
-class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
- public:
-  ReorderLoDTensorByRankTableBase(const std::string &type,
-                                  const framework::VariableNameMap &inputs,
-                                  const framework::VariableNameMap &outputs,
-                                  const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto &x =
-        detail::Ref(scope.FindVar(Input("X")),
-                    "Cannot find input lod tensor variable %s", Input("X"))
-            .Get<framework::LoDTensor>();
-    auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")),
-                                   "Cannot find input rank table variable %s",
-                                   Input("RankTable"))
-                           .Get<framework::LoDRankTable>();
-    auto &out =
-        *detail::Ref(scope.FindVar(Output("Out")),
-                     "Cannot find output lod tensor variable %s", Output("Out"))
-             .GetMutable<framework::LoDTensor>();
-
-    out.Resize(x.dims());
-    out.mutable_data(x.place(), x.type());
-    this->process(place, x, rank_table, &out);
-  }
-
- protected:
-  virtual void process(const platform::Place &place,
-                       const framework::LoDTensor &x,
-                       const framework::LoDRankTable &rank_table,
-                       framework::LoDTensor *out) const = 0;
-
-  struct AbsoluteRankTableItem {
-    size_t offset;  // the absolute/accumulated offset.
-    size_t length;  // the length
-    framework::LoD lod;
-  };
-
-  std::vector<AbsoluteRankTableItem> GetAbsoluteOffsetAndLengthByLoDRankTable(
-      const framework::LoDTensor &x) const {
-    std::vector<AbsoluteRankTableItem> absolute_table;
-
-    if (x.lod().empty()) {
-      // For Tensor without lod, such as the output of sequence_pool_op
-      size_t size = x.dims()[0];
-      absolute_table.reserve(size);
-      for (size_t i = 0; i < size; ++i) {
-        absolute_table.emplace_back();
-        absolute_table.back().length = 1;
-        absolute_table.back().offset = i;
-      }
-    } else {
-      size_t level = 0;
-      size_t size = x.lod()[level].size();
-
-      for (size_t i = 0; i < size - 1; ++i) {
-        auto lod_offset =
-            framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level);
-
-        auto &offset = lod_offset.second;
-
-        absolute_table.emplace_back();
-        absolute_table.back().length = offset.second - offset.first;
-        absolute_table.back().offset = offset.first;
-        absolute_table.back().lod = lod_offset.first;
-      }
-    }
-
-    return absolute_table;
-  }
-
-  size_t CopyTensorAndLod(const platform::Place &place,
-                          const AbsoluteRankTableItem &item,
-                          const framework::LoDTensor &x,
-                          framework::LoDTensor *out, size_t out_offset) const {
-    auto &out_lod = *out->mutable_lod();
-    auto len = item.length;
-    auto x_offset = item.offset;
-
-    if (out_lod.empty()) {
-      for (size_t i = 0; i < item.lod.size(); ++i) {
-        out_lod.push_back(std::vector<size_t>({0}));
-      }
-    }
-
-    for (size_t i = 0; i < out_lod.size(); ++i) {
-      auto &out_v = out_lod[i];
-      auto &new_lod_v = item.lod[i];
-
-      for (auto &detail : new_lod_v) {
-        out_v.push_back(out_v.back() + detail);
-      }
-    }
-
-    auto x_sliced = x.Slice(x_offset, x_offset + len);
-    auto out_sliced = out->Slice(out_offset, out_offset + len);
-
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    framework::TensorCopy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
-    out_offset += len;
-    return out_offset;
-  }
-};
-
-class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase {
- public:
-  ReorderLoDTensorByRankTableOp(const std::string &type,
-                                const framework::VariableNameMap &inputs,
-                                const framework::VariableNameMap &outputs,
-                                const framework::AttributeMap &attrs)
-      : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
-
- protected:
-  void process(const platform::Place &place, const framework::LoDTensor &x,
-               const framework::LoDRankTable &rank_table,
-               framework::LoDTensor *out) const override {
-    auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
-    size_t out_offset = 0;
-    out->mutable_lod()->clear();
-    for (auto &item : rank_table.items()) {
-      PADDLE_ENFORCE_LT(item.index, absolute_table.size());
-      out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out,
-                                    out_offset);
-    }
-  }
-};
-
-class IdentityInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-    if (!context->IsRuntime()) {
-      context->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-};
-
-class ReorderLodTensorByRankGradOpMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("reorder_lod_tensor_by_rank_grad");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetInput("RankTable", Input("RankTable"));
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase {
- public:
-  ReorderLoDTensorByRankGradOp(const std::string &type,
-                               const framework::VariableNameMap &inputs,
-                               const framework::VariableNameMap &outputs,
-                               const framework::AttributeMap &attrs)
-      : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
-
- protected:
-  void process(const platform::Place &place, const framework::LoDTensor &x,
-               const framework::LoDRankTable &rank_table,
-               framework::LoDTensor *out) const override {
-    auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
-
-    // offsets = enumerate([item.index for item in rank_table.items()])
-    std::vector<std::pair<size_t, size_t>> offsets;
-    offsets.reserve(rank_table.items().size());
-    for (size_t i = 0; i < rank_table.items().size(); ++i) {
-      offsets.push_back({i, rank_table.items()[i].index});
-    }
-
-    // offsets.sort(key=lambda x: x[1])
-    std::sort(
-        offsets.begin(), offsets.end(),
-        [](const std::pair<size_t, size_t> &a,
-           const std::pair<size_t, size_t> &b) { return a.second < b.second; });
-
-    // Copy TensorAndLod
-    size_t out_offset = 0;
-    for (auto &offset : offsets) {
-      out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first],
-                                          x, out, out_offset);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(reorder_lod_tensor_by_rank,
-                  ops::ReorderLoDTensorByRankTableOp,
-                  ops::ReorderLodTensorByRankGradOpMaker,
-                  ops::ReorderLoDTensorByRankTableOpProtoMaker,
-                  ops::IdentityInferShape);
-REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad,
-                  ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape);
diff --git a/paddle/fluid/operators/requantize_op.cc b/paddle/fluid/operators/requantize_op.cc
deleted file mode 100644
index d156ae207763433ea2ed7fb97a08cbe5880da3cd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/requantize_op.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- *     Unless required by applicable law or agreed to in writing, software
- *     distributed under the License is distributed on an "AS IS" BASIS,
- *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *     See the License for the specific language governing permissions and
- *     limitations under the License. */
-
-#include "paddle/fluid/operators/requantize_op.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-framework::OpKernelType ReQuantOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
-  framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;
-
-  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                 ctx.GetPlace(), layout_, library_);
-}
-
-void ReQuantOpMaker::Make() {
-  AddInput("Input", "input data");
-  AddOutput("Output", "output data");
-  AddAttr<float>("Scale_in", "scale in data").SetDefault({1.0f});
-  AddAttr<float>("Scale_out", "scale out data").SetDefault({1.0f});
-  AddComment(
-      R"DOC(This op will re-quantize data from INT8 with scale_in to INT8 with scale_out)DOC");
-}
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker);
diff --git a/paddle/fluid/operators/requantize_op.h b/paddle/fluid/operators/requantize_op.h
deleted file mode 100644
index c2b154db11dc713fdce1b9ef2f2616428bc09202..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/requantize_op.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::OpKernelType;
-using framework::Tensor;
-
-class ReQuantOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
-    ctx->ShareLoD("Input", /*->*/ "Output");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class ReQuantOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
deleted file mode 100644
index 0059921c0466af35349363df139362a33a791eeb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reshape_op.cc
+++ /dev/null
@@ -1,478 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-inline std::vector<int> get_new_shape(
-    const std::vector<const Tensor *> &list_new_shape_tensor) {
-  // get tensor from
-  std::vector<int> vec_new_shape;
-  for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
-    auto tensor = list_new_shape_tensor[i];
-    PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}),
-                      "shape of dim tensor should be [1]");
-    if (platform::is_gpu_place(tensor->place())) {
-      framework::Tensor temp;
-      TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-
-      vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
-    } else {
-      vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
-    }
-  }
-
-  return vec_new_shape;
-}
-
-class ReshapeOp : public framework::OperatorWithKernel {
- public:
-  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of ReshapeOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of ReshapeOp should not be null.");
-
-    if (ctx->HasInputs("ShapeTensor")) {
-      // top prority shape
-      auto ShapeTensor = ctx->Inputs("ShapeTensor");
-      PADDLE_ENFORCE_GT(ShapeTensor.size(), 0,
-                        "The size of Input(ShapeTensor) can't be zero");
-      auto infer_shape = ctx->Attrs().Get<std::vector<int>>("shape");
-      const int64_t copy_dim_val = 0;
-      auto in_dims = ctx->GetInputDim("X");
-      for (size_t i = 0; i < infer_shape.size(); ++i) {
-        if (infer_shape[i] == copy_dim_val) {
-          PADDLE_ENFORCE_LT(
-              static_cast<int>(i), in_dims.size(),
-              "The dimension of data to copy from input must be less "
-              "than the dimension of input.");
-          infer_shape[i] = in_dims[i];
-        }
-      }
-      auto infer_out_dims = framework::make_ddim(infer_shape);
-      ctx->SetOutputDim("Out", infer_out_dims);
-      return;
-    }
-
-    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    if (ctx->HasInput("Shape") && shape.empty()) {
-      auto shape_dims = ctx->GetInputDim("Shape");
-      int num_ele = 1;
-      for (int i = 0; i < shape_dims.size(); ++i) {
-        num_ele *= shape_dims[i];
-      }
-      auto vec_dims = std::vector<int>(num_ele, -1);
-      auto out_dims = framework::make_ddim(vec_dims);
-      ctx->SetOutputDim("Out", out_dims);
-      ctx->ShareLoD("X", /*->*/ "Out");
-      return;
-    }
-
-    if (ctx->HasInput("Shape") && !shape.empty() && ctx->IsRuntime()) {
-      // If true, set the shape of Output(Out) according to Input(Shape) in
-      // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel.
-      ctx->ShareLoD("X", /*->*/ "Out");
-      return;
-    }
-
-    PADDLE_ENFORCE_EQ(!shape.empty(), true,
-                      "The shape information must be set by Attr(shape).");
-    auto x_dims = ctx->GetInputDim("X");
-    auto out_dims = ValidateShape(shape, x_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-
-  static framework::DDim ValidateShape(const std::vector<int> shape,
-                                       const framework::DDim &in_dims) {
-    const int64_t in_size = framework::product(in_dims);
-    auto in_dims_vec = framework::vectorize(in_dims);
-    bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(),
-                                    [](int64_t i) { return i > 0; });
-    // only one dimension can be set to -1, whose size will be automatically
-    // infered.
-    const int64_t unk_dim_val = -1;
-    const int64_t copy_dim_val = 0;
-
-    std::vector<int64_t> output_shape(shape.size(), 0);
-    int64_t capacity = 1;
-    int unk_dim_idx = -1;
-    for (size_t i = 0; i < shape.size(); ++i) {
-      if (shape[i] == unk_dim_val) {
-        PADDLE_ENFORCE_EQ(
-            unk_dim_idx, -1,
-            "Only one input dimension of Attr(shape) can be unknown.");
-        unk_dim_idx = i;
-      } else if (shape[i] == copy_dim_val) {
-        PADDLE_ENFORCE_LT(
-            static_cast<int>(i), in_dims.size(),
-            "The index of dimension to copy from input shape must be less "
-            "than the size of input shape.");
-      } else {
-        PADDLE_ENFORCE_GT(
-            shape[i], 0,
-            "Each input dimension of Attr(shape) must not be negtive except "
-            "one unknown dimension.");
-      }
-
-      capacity *= (shape[i] ? shape[i] : in_dims[i]);
-      output_shape[i] =
-          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
-    }
-
-    if (unk_dim_idx != -1) {
-      if (all_positive) {
-        // in_size < 0 and is un-determinate in compile time, skip the check,
-        // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
-        // capacity = -24, in_size = -8, output_shape[0] = 0
-        // the following check will fail.
-        output_shape[unk_dim_idx] = -in_size / capacity;
-        PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
-                          "Invalid shape is given.");
-      } else {
-        output_shape[unk_dim_idx] = -1;
-      }
-    } else {
-      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
-    }
-    return framework::make_ddim(output_shape);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string &var_name, const Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const override {
-    if (var_name == "ShapeTensor") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor). The input tensor of reshape operator.");
-    AddInput("Shape",
-             "(Tensor<int32>, optional). If provided, reshape according to "
-             "this given shape. That is to say it has a higher priority than "
-             "the shape attribute, while the shape attribute still should be "
-             "set correctly to gurantee shape inference in compile time.")
-        .AsDispensable();
-    AddInput(
-        "ShapeTensor",
-        "(vector<Tensor<int32>>, optional). If provided, reshape will use this"
-        "The shape of the tensor in vector MUST BE [1]"
-        "it has the highest priority compare with Input(Shape) and "
-        "attr(shape).")
-        .AsDuplicable()
-        .AsDispensable();
-    AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
-    AddAttr<std::vector<int>>(
-        "shape", "(std::vector<int>) Target shape of reshape operator.")
-        .SetDefault({});
-    AddComment(R"DOC(
-Reshape Operator.
-
-Reshape Input(X) into the shape specified by Attr(shape) or Input(Shape). The
-data in Input(X) are unchanged.
-
-Examples:
-
-1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
-specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X)
-into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged.
-
-2. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
-specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform
-Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data
-unchanged. In this case, one and only dimension of Attr(shape) can be set to -1,
-the value of this dimension is inferred from the total element number of
-Input(X) and remaining dimensions.
-
-3. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
-specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform
-Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data
-unchanged. In this case, besides -1, 0 means the actual dimension value is going
-to be copied from the corresponding dimension of Input(X).
-
-Note:
-
-1. One and only one dimension in Attr(shape) can be set -1. In this case,
-the actual dimension value will be infered from the total element number of
-Input(X) and remaining dimensions.
-
-2. More than one dimensions in Attr(shape) can be set to 0, which means the real
-dimension value will be copied from Input(X) at runtime. Note that the index of
-0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape
-[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
-
-3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
-Attr(shape) still should be set correctly to gurantee shape inference in
-compile-time.
-
-)DOC");
-  }
-};
-
-class ReshapeGradOp : public framework::OperatorWithKernel {
- public:
-  ReshapeGradOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) shouldn't be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) shouldn't be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ReshapeKernel {
- public:
-  void operator()(const framework::ExecutionContext &ctx) const {
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
-    auto *in = ctx.Input<framework::LoDTensor>("X");
-
-    framework::DDim out_dims = out->dims();
-
-    auto list_new_shape_tensor =
-        ctx.MultiInput<framework::Tensor>("ShapeTensor");
-    if (list_new_shape_tensor.size() > 0) {
-      // have shape tensor
-      auto new_shape = get_new_shape(list_new_shape_tensor);
-      out_dims = ReshapeOp::ValidateShape(new_shape, in->dims());
-
-    } else {
-      auto *shape_tensor = ctx.HasInput("Shape")
-                               ? ctx.Input<framework::LoDTensor>("Shape")
-                               : nullptr;
-
-      if (shape_tensor) {
-        auto *shape_data = shape_tensor->data<int>();
-        framework::Tensor cpu_shape_tensor;
-        if (platform::is_gpu_place(shape_tensor->place())) {
-          TensorCopySync(*shape_tensor, platform::CPUPlace(),
-                         &cpu_shape_tensor);
-          shape_data = cpu_shape_tensor.data<int>();
-        }
-        auto shape =
-            std::vector<int>(shape_data, shape_data + shape_tensor->numel());
-        out_dims = ReshapeOp::ValidateShape(shape, in->dims());
-      }
-    }
-
-    out->Resize(out_dims);
-    out->mutable_data(ctx.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, ctx.GetPlace(),
-        ctx.template device_context<platform::DeviceContext>(), out);
-    out->Resize(out_dims);
-  }
-};
-
-class ReshapeGradKernel {
- public:
-  void operator()(const framework::ExecutionContext &ctx) const {
-    auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto in_dims = d_x->dims();
-
-    d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
-    d_x->Resize(in_dims);
-  }
-};
-
-// FIXME(zcd): reshape2 adds an intermediate output(XShape) based on reshape,
-// the XShape is used to carry the shape and lod of X which will be used in
-// reshape_grad, in this way, the framework can reuse the memory of X
-// immediately the reshape_op is finished.
-// Considering compatibility issues, we could not fix reshape_op
-class Reshape2Op : public ReshapeOp {
- public:
-  Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs,
-             const framework::VariableNameMap &outputs,
-             const framework::AttributeMap &attrs)
-      : ReshapeOp(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true,
-                      "Output(XShape) of ReshapeOp should not be null.");
-    const auto &x_dims = ctx->GetInputDim("X");
-    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
-    xshape_dims[0] = 0;
-    for (int i = 0; i < x_dims.size(); ++i) {
-      xshape_dims[i + 1] = x_dims[i];
-    }
-    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
-    ctx->ShareLoD("X", /*->*/ "XShape");
-
-    ReshapeOp::InferShape(ctx);
-  }
-};
-
-class Reshape2OpMaker : public ReshapeOpMaker {
- public:
-  void Make() override {
-    ReshapeOpMaker::Make();
-    AddOutput("XShape",
-              "XShape is just used to store the shape and lod of X, which will "
-              "be used in FlattenGradOp.")
-        .AsIntermediate();
-  }
-};
-
-class Reshape2GradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("reshape2_grad");
-    grad_op->SetInput("XShape", Output("XShape"));
-    grad_op->SetInput("ShapeTensor", Input("ShapeTensor"));
-    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-class Reshape2GradOp : public framework::OperatorWithKernel {
- public:
-  Reshape2GradOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("XShape"), true,
-                      "Input(XShape) shouldn't be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) shouldn't be null.");
-    auto xshape_dims = ctx->GetInputDim("XShape");
-    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->ShareLoD("XShape", framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string &var_name, const Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const override {
-    if (var_name == "ShapeTensor") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(ReshapeOpInplaceInToOut, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(ReshapeGradInplaceInToOut,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>,
-                  ops::ReshapeOpInplaceInToOut);
-REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp,
-                  ops::ReshapeGradInplaceInToOut);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
-                               ops::ReshapeKernel, int, ops::ReshapeKernel,
-                               int64_t, ops::ReshapeKernel);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
-                               double, ops::ReshapeGradKernel, int,
-                               ops::ReshapeGradKernel, int64_t,
-                               ops::ReshapeGradKernel);
-
-REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
-                  ops::Reshape2GradMaker, ops::ReshapeOpInplaceInToOut);
-REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp,
-                  ops::ReshapeGradInplaceInToOut);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                               ops::ReshapeKernel, int, ops::ReshapeKernel,
-                               int64_t, ops::ReshapeKernel);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
-                               double, ops::ReshapeGradKernel, int,
-                               ops::ReshapeGradKernel, int64_t,
-                               ops::ReshapeGradKernel);
-
-#ifdef PADDLE_WITH_CUDA
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
-                                ops::ReshapeKernel, int, ops::ReshapeKernel,
-                                int64_t, ops::ReshapeKernel, plat::float16,
-                                ops::ReshapeKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
-                                double, ops::ReshapeGradKernel, int,
-                                ops::ReshapeGradKernel, int64_t,
-                                ops::ReshapeGradKernel, plat::float16,
-                                ops::ReshapeGradKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                                ops::ReshapeKernel, int, ops::ReshapeKernel,
-                                int64_t, ops::ReshapeKernel, plat::float16,
-                                ops::ReshapeKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
-                                double, ops::ReshapeGradKernel, int,
-                                ops::ReshapeGradKernel, int64_t,
-                                ops::ReshapeGradKernel, plat::float16,
-                                ops::ReshapeGradKernel);
-#endif
diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc
deleted file mode 100644
index a20f7d231fa9ea313581ac0629a87fa5f4a88ce5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reverse_op.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reverse_op.h"
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class ReverseOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
-    const auto& x_dims = ctx->GetInputDim("X");
-    const auto& axis = ctx->Attrs().Get<std::vector<int>>("axis");
-    PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty.");
-    for (int a : axis) {
-      PADDLE_ENFORCE_LT(a, x_dims.size(),
-                        "The axis must be less than input tensor's rank.");
-    }
-    ctx->SetOutputDim("Out", x_dims);
-  }
-};
-
-class ReverseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The LoDTensor to be flipped.");
-    AddOutput("Out", "The LoDTensor after flipping.");
-    AddAttr<std::vector<int>>(
-        "axis", "The axises that along which order of elements is reversed.");
-    AddComment(R"DOC(
-      Reverse Operator.
-
-      Reverse the order of elements in the input LoDTensor along given axises.
-
-      Case 1:
-        Given
-            X = [[1, 2, 3, 4, 5]
-                 [6, 7, 8, 9, 10]
-                 [11, 12, 13, 14, 15]],
-        and
-            axis = [0],
-        we get:
-            Out = [[11, 12, 13, 14, 15]
-                   [6, 7, 8, 9, 10]
-                   [1, 2, 3, 4, 5]].
-        
-      Case 2:
-        Given
-            X = [[[1, 2, 3, 4]
-                  [5, 6, 7, 8]]
-                 [[9, 10, 11, 12]
-                  [13, 14, 15, 16]]],
-        and
-            axis = [0, 2],
-        we get:
-            Out = [[[12, 11, 10, 9]
-                    [16, 15, 14, 13]]
-                   [[4, 3, 2, 1]
-                    [8, 7, 6, 5]]],
-    )DOC");
-  }
-};
-
-class ReverseGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* grad_op = new framework::OpDesc();
-    grad_op->SetType("reverse");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttr("axis", GetAttr("axis"));
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker,
-                  ops::ReverseGradMaker);
-REGISTER_OPERATOR(reverse_grad, ops::ReverseOp);
-REGISTER_OP_CPU_KERNEL(
-    reverse, ops::ReverseKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/operators/reverse_op.cu
deleted file mode 100644
index 635c41529b38f2dd287b00ed2e5659e11f619e78..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reverse_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reverse_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    reverse, ops::ReverseKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h
deleted file mode 100644
index 9063cd59bba5c6307b55a500455908a5fd278390..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reverse_op.h
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T, int Rank>
-struct ReverseFunctor {
-  void operator()(const DeviceContext& context, const framework::LoDTensor& in,
-                  framework::LoDTensor* out, const std::vector<int>& axis) {
-    Eigen::array<bool, Rank> reverse_axis;
-    for (int i = 0; i < Rank; ++i) {
-      reverse_axis[i] = false;
-    }
-    for (int a : axis) {
-      reverse_axis[a] = true;
-    }
-
-    auto in_eigen = framework::EigenTensor<T, Rank>::From(in);
-    auto out_eigen = framework::EigenTensor<T, Rank>::From(*out);
-    auto* dev = context.eigen_device();
-
-    out_eigen.device(*dev) = in_eigen.reverse(reverse_axis);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ReverseKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::LoDTensor>("X");
-    auto* out = context.Output<framework::LoDTensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    const auto& axis = context.Attr<std::vector<int>>("axis");
-    int rank = x->dims().size();
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    switch (rank) {
-      case 1:
-        ReverseFunctor<DeviceContext, T, 1> functor1;
-        functor1(dev_ctx, *x, out, axis);
-        break;
-      case 2:
-        ReverseFunctor<DeviceContext, T, 2> functor2;
-        functor2(dev_ctx, *x, out, axis);
-        break;
-      case 3:
-        ReverseFunctor<DeviceContext, T, 3> functor3;
-        functor3(dev_ctx, *x, out, axis);
-        break;
-      case 4:
-        ReverseFunctor<DeviceContext, T, 4> functor4;
-        functor4(dev_ctx, *x, out, axis);
-        break;
-      case 5:
-        ReverseFunctor<DeviceContext, T, 5> functor5;
-        functor5(dev_ctx, *x, out, axis);
-        break;
-      case 6:
-        ReverseFunctor<DeviceContext, T, 6> functor6;
-        functor6(dev_ctx, *x, out, axis);
-        break;
-      default:
-        PADDLE_THROW(
-            "Reserve operator doesn't supports tensors whose ranks are greater "
-            "than 6.");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
deleted file mode 100644
index f360ae3cbf4264f655a9d517708a4f9a1c1a15da..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-class RNNMemoryHelperOp : public framework::OperatorBase {
- public:
-  RNNMemoryHelperOp(const std::string &type,
-                    const framework::VariableNameMap &inputs,
-                    const framework::VariableNameMap &outputs,
-                    const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto mem_var_name = Input("X");
-    auto *mem_var = scope.FindVar(mem_var_name);
-    PADDLE_ENFORCE(mem_var != nullptr,
-                   "Cannot find mem_var in scope, mem_var_name is %s",
-                   mem_var_name);
-
-    auto out_name = this->Output("Out");
-    auto *out_var = scope.FindVar(out_name);
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Cannot find out_var in scope, out_var_name is %s",
-                   out_name);
-
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-
-    auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
-    auto &mem_tensor = mem_var->Get<framework::LoDTensor>();
-    framework::TensorCopy(mem_tensor, dev_place, dev_ctx, out_tensor);
-    out_tensor->set_lod(mem_tensor.lod());
-  }
-};
-
-class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of rnn_memory_helper op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output of rnn_memory_helper op should not be null.");
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "");
-    AddOutput("Out", "");
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
-        .SetDefault(framework::proto::VarType::FP32);
-    AddComment("");
-  }
-};
-
-class RNNMemoryHelperGradOp : public framework::OperatorBase {
- public:
-  RNNMemoryHelperGradOp(const std::string &type,
-                        const framework::VariableNameMap &inputs,
-                        const framework::VariableNameMap &outputs,
-                        const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto out_grad_var_name = Input(framework::GradVarName("Out"));
-    auto *out_grad_var = scope.FindVar(out_grad_var_name);
-
-    auto in_grad_var_name = Output(framework::GradVarName("X"));
-    auto *in_grad_var = scope.FindVar(in_grad_var_name);
-
-    PADDLE_ENFORCE(in_grad_var != nullptr,
-                   "Cannot find in_grad_var in scope, name is %s",
-                   in_grad_var_name);
-
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-
-    if (out_grad_var == nullptr) {
-      VLOG(5) << "Using fill constant 0 as starting gradient";
-      auto in_var_name = Input("X");
-      auto *in_var = scope.FindVar(in_var_name);
-      auto &in_var_tensor = in_var->Get<framework::LoDTensor>();
-
-      framework::AttributeMap attrs;
-      attrs["dtype"] = in_var_tensor.type();
-      attrs["shape"] = framework::vectorize<int>(in_var_tensor.dims());
-      attrs["value"] = 0.0f;
-
-      auto zero_op = framework::OpRegistry::CreateOp(
-          "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs);
-      zero_op->Run(scope, dev_place);
-    } else {
-      auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
-      auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
-      framework::TensorCopy(out_grad_tensor, dev_place, dev_ctx,
-                            in_grad_tensor);
-      in_grad_tensor->set_lod(out_grad_tensor.lod());
-    }
-  }
-};
-
-class RNNMemoryHelperGradOpInfoMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(framework::GradVarName("Out"), "");
-    AddInput("X", "");
-    AddInput("Out", "");
-    AddOutput(framework::GradVarName("X"), "");
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
-        .SetDefault(framework::proto::VarType::FP32);
-    AddComment("");
-  }
-};
-
-class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    auto x_grad_name = framework::GradVarName("X");
-    PADDLE_ENFORCE(ctx->HasOutput(x_grad_name),
-                   "Gradient of Input(X) in rnn_memory_helper_grad of should "
-                   "not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of rnn_memory_helper_grad of should not be null.");
-    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ x_grad_name);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(rnn_memory_helper, paddle::operators::RNNMemoryHelperOp,
-                  paddle::operators::RNNMemoryHelperOpInfoMaker,
-                  paddle::operators::RNNMemoryHelperOpShapeInference,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(rnn_memory_helper_grad,
-                  paddle::operators::RNNMemoryHelperGradOp,
-                  paddle::operators::RNNMemoryHelperGradOpInfoMaker,
-                  paddle::operators::RNNMemoryHelperGradOpShapeInference);
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
deleted file mode 100644
index 21c3dd27f02b18fa78108f6a291dbf4c12724786..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/roi_align_op.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/roi_align_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-class ROIAlignOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ROIAlignOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
-                   "Input(ROIs) of ROIAlignOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ROIAlignOp should not be null.");
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-
-    PADDLE_ENFORCE(input_dims.size() == 4,
-                   "The format of input tensor is NCHW.");
-    PADDLE_ENFORCE(rois_dims.size() == 2,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], ...].");
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE(rois_dims[1] == 4,
-                     "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                     "given as [[x1, y1, x2, y2], ...].");
-    }
-    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
-    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-
-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled output height must greater than 0");
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      "The pooled output width must greater than 0");
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      "The spatial scale must greater than 0");
-
-    auto out_dims = input_dims;
-    out_dims[0] = rois_dims[0];
-    out_dims[1] = input_dims[1];
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-
-    ctx->SetOutputDim("Out", out_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ROIAlignGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "The GRAD@Out of ROIAlignGradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
-                   "The GRAD@X of ROIAlignGradOp should not be null.");
-    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("ROIs")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor), "
-             "The input of ROIAlignOp. "
-             "The format of input tensor is NCHW. Where N is batch size, "
-             "C is the number of input channels, "
-             "H is the height of the feature, and "
-             "W is the width of the feature.");
-    AddInput("ROIs",
-             "(LoDTensor), "
-             "ROIs (Regions of Interest) to pool over. "
-             "should be a 2-D LoDTensor of shape (num_rois, 4)"
-             "given as [[x1, y1, x2, y2], ...]. "
-             "(x1, y1) is the top left coordinates, and "
-             "(x2, y2) is the bottom right coordinates.");
-    AddOutput("Out",
-              "(Tensor), "
-              "The output of ROIAlignOp is a 4-D tensor with shape "
-              "(num_rois, channels, pooled_h, pooled_w).");
-    AddAttr<float>("spatial_scale",
-                   "(float, default 1.0), "
-                   "Multiplicative spatial scale factor "
-                   "to translate ROI coords from their input scale "
-                   "to the scale used when pooling.")
-        .SetDefault(1.0);
-    AddAttr<int>("pooled_height",
-                 "(int, default 1), "
-                 "The pooled output height.")
-        .SetDefault(1);
-    AddAttr<int>("pooled_width",
-                 "(int, default 1), "
-                 "The pooled output width.")
-        .SetDefault(1);
-    AddAttr<int>("sampling_ratio",
-                 "(int,default -1),"
-                 "number of sampling points in the interpolation grid"
-                 "If <=0, then grid points are adaptive to roi_width "
-                 "and pooled_w, likewise for height")
-        .SetDefault(-1);
-    AddComment(R"DOC(
-**RoIAlign Operator**
-
-Region of interest align (also known as RoI align) is to perform
-bilinear interpolation on inputs of nonuniform sizes to obtain 
-fixed-size feature maps (e.g. 7*7)
-
-Dividing each region proposal into equal-sized sections with
-the pooled_width and pooled_height. Location remains the origin
-result.
-
-In each ROI bin, the value of the four regularly sampled locations 
-are computed directly through bilinear interpolation. The output is
-the mean of four locations.
-Thus avoid the misaligned problem.   
-    )DOC");
-  }
-};
-
-class ROIAlignGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("roi_align_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("ROIs", Input("ROIs"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(RoiAlignGradNoNeedBufVarsInferer, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
-                  ops::ROIAlignGradDescMaker);
-REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp,
-                  ops::RoiAlignGradNoNeedBufVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    roi_align,
-    ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    roi_align_grad,
-    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
deleted file mode 100644
index 943c5c81dc47a99f6e2489757b1b15a6ae41bde8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/roi_align_op.cu
+++ /dev/null
@@ -1,359 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/roi_align_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaxinumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-template <class T>
-__device__ T BilinearInterpolate(const T* input_data, const int height,
-                                 const int width, T y, T x) {
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    return 0;
-  }
-  y = y <= 0 ? 0 : y;
-  x = x <= 0 ? 0 : x;
-  int y_low = static_cast<int>(y);
-  int x_low = static_cast<int>(x);
-  int y_high;
-  int x_high;
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = static_cast<T>(y_low);
-  } else {
-    y_high = y_low + 1;
-  }
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = static_cast<T>(x_low);
-  } else {
-    x_high = x_low + 1;
-  }
-  T ly = y - y_low, lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-
-  T v1 = input_data[y_low * width + x_low];
-  T v2 = input_data[y_low * width + x_high];
-  T v3 = input_data[y_high * width + x_low];
-  T v4 = input_data[y_high * width + x_high];
-  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  return val;
-}
-
-template <class T>
-__device__ void BilinearInterpolateGradient(const int height, const int width,
-                                            T y, T x, T* w1, T* w2, T* w3,
-                                            T* w4, int* x_low, int* x_high,
-                                            int* y_low, int* y_high) {
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    return;
-  }
-
-  y = y <= 0 ? 0 : y;
-  x = x <= 0 ? 0 : x;
-  *y_low = static_cast<int>(y);
-  *x_low = static_cast<int>(x);
-  if (*y_low >= height - 1) {
-    *y_high = *y_low = height - 1;
-    y = static_cast<T>(*y_low);
-  } else {
-    *y_high = *y_low + 1;
-  }
-  if (*x_low >= width - 1) {
-    *x_high = *x_low = width - 1;
-    x = static_cast<T>(*x_low);
-  } else {
-    *x_high = *x_low + 1;
-  }
-  T ly = y - *y_low, lx = x - *x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
-
-  return;
-}
-
-template <class T>
-__global__ void GPUROIAlignForward(
-    const int nthreads, const T* input_data, const T* input_rois,
-    const float spatial_scale, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int sampling_ratio, int* roi_batch_id_data, T* output_data) {
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % channels;
-    int n = i / pooled_width / pooled_height / channels;
-
-    const T* offset_input_rois = input_rois + n * kROISize;
-    int roi_batch_ind = roi_batch_id_data[n];
-
-    T roi_xmin = offset_input_rois[0] * spatial_scale;
-    T roi_ymin = offset_input_rois[1] * spatial_scale;
-    T roi_xmax = offset_input_rois[2] * spatial_scale;
-    T roi_ymax = offset_input_rois[3] * spatial_scale;
-
-    T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
-    T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    const T* offset_input_data =
-        input_data + (roi_batch_ind * channels + c) * height * width;
-
-    int roi_bin_grid_h = (sampling_ratio > 0)
-                             ? sampling_ratio
-                             : ceil(roi_height / pooled_height);
-    int roi_bin_grid_w =
-        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
-    const T count = roi_bin_grid_h * roi_bin_grid_w;
-    T output_val = 0;
-    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-      const T y = roi_ymin + ph * bin_size_h +
-                  static_cast<T>(iy + .5f) * bin_size_h /
-                      static_cast<T>(roi_bin_grid_h);
-      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-        const T x = roi_xmin + pw * bin_size_w +
-                    static_cast<T>(ix + .5f) * bin_size_w /
-                        static_cast<T>(roi_bin_grid_w);
-        T val = BilinearInterpolate(offset_input_data, height, width, y, x);
-        output_val += val;
-      }
-    }
-    output_val /= count;
-    output_data[i] = output_val;
-  }
-}
-
-template <typename T>
-__global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
-                                    const T* out_grad, const int num_rois,
-                                    const float spatial_scale,
-                                    const int channels, const int height,
-                                    const int width, const int pooled_height,
-                                    const int pooled_width,
-                                    const int sampling_ratio,
-                                    int* roi_batch_id_data, T* input_grad) {
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % channels;
-    int n = i / pooled_width / pooled_height / channels;
-    const T* offset_input_rois = input_rois + n * kROISize;
-    int roi_batch_ind = roi_batch_id_data[n];
-
-    T roi_xmin = offset_input_rois[0] * spatial_scale;
-    T roi_ymin = offset_input_rois[1] * spatial_scale;
-    T roi_xmax = offset_input_rois[2] * spatial_scale;
-    T roi_ymax = offset_input_rois[3] * spatial_scale;
-
-    T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
-    T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    T* offset_input_grad =
-        input_grad + (roi_batch_ind * channels + c) * height * width;
-
-    const T* offset_out_grad =
-        out_grad + (n * channels + c) * pooled_height * pooled_width;
-    const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw];
-
-    int roi_bin_grid_h = (sampling_ratio > 0)
-                             ? sampling_ratio
-                             : ceil(roi_height / pooled_height);
-    int roi_bin_grid_w =
-        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
-
-    const T count = roi_bin_grid_h * roi_bin_grid_w;
-    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-      const T y = roi_ymin + ph * bin_size_h +
-                  static_cast<T>(iy + .5f) * bin_size_h /
-                      static_cast<T>(roi_bin_grid_h);
-      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-        const T x = roi_xmin + pw * bin_size_w +
-                    static_cast<T>(ix + .5f) * bin_size_w /
-                        static_cast<T>(roi_bin_grid_w);
-        T w1 = 0, w2 = 0, w3 = 0, w4 = 0;
-        int x_low = -1, x_high = -1, y_low = -1, y_high = -1;
-        BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4,
-                                    &x_low, &x_high, &y_low, &y_high);
-        T diff1 = out_grad_this_bin * w1 / count;
-        T diff2 = out_grad_this_bin * w2 / count;
-        T diff3 = out_grad_this_bin * w3 / count;
-        T diff4 = out_grad_this_bin * w4 / count;
-        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-          platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low,
-                                  diff1);
-          platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high,
-                                  diff2);
-          platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low,
-                                  diff3);
-          platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high,
-                                  diff4);
-        }
-      }
-    }
-  }
-}
-
-template <typename Place, typename T>
-class GPUROIAlignOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-
-    int rois_num = rois->dims()[0];
-
-    if (rois_num == 0) return;
-
-    int output_size = out->numel();
-    int blocks = NumBlocks(output_size);
-    int threads = kNumCUDAThreads;
-
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    auto cplace = platform::CPUPlace();
-    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "The rois_batch_size and imgs batch_size must be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-    auto& dev_ctx = ctx.cuda_device_context();
-    int bytes = roi_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
-    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                 dev_ctx.stream());
-    GPUROIAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
-        height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data,
-        out->mutable_data<T>(ctx.GetPlace()));
-  }
-};
-
-template <typename Place, typename T>
-class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-
-    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
-
-    int rois_num = rois->dims()[0];
-    int channels = in->dims()[1];
-    int height = in->dims()[2];
-    int width = in->dims()[3];
-
-    if (!in_grad) {
-      return;
-    }
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    auto cplace = platform::CPUPlace();
-    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-    auto& dev_ctx = ctx.cuda_device_context();
-    auto roi_ptr =
-        memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int));
-    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-    int bytes = roi_batch_id_list.numel() * sizeof(int);
-    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                 dev_ctx.stream());
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    math::SetConstant<Place, T> set_zero;
-    set_zero(dev_ctx, in_grad, static_cast<T>(0));
-
-    int output_grad_size = out_grad->numel();
-    int blocks = NumBlocks(output_grad_size);
-    int threads = kNumCUDAThreads;
-
-    if (output_grad_size > 0) {
-      GPUROIAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-          output_grad_size, rois->data<T>(), out_grad->data<T>(), rois_num,
-          spatial_scale, channels, height, width, pooled_height, pooled_width,
-          sampling_ratio, roi_id_data,
-          in_grad->mutable_data<T>(ctx.GetPlace()));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    roi_align,
-    ops::GPUROIAlignOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIAlignOpKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    roi_align_grad,
-    ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h
deleted file mode 100644
index 78befea2f87302769b1ddee51152ff98daff911c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/roi_align_op.h
+++ /dev/null
@@ -1,346 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <limits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kROISize = 4;
-
-template <class T>
-void PreCalcForBilinearInterpolate(
-    const platform::DeviceContext& ctx, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int iy_upper,
-    const int ix_upper, T roi_ymin, T roi_xmin, T bin_size_h, T bin_size_w,
-    int roi_bin_grid_h, int roi_bin_grid_w, Tensor* pre_pos, Tensor* pre_w) {
-  int pre_calc_index = 0;
-  int* pre_pos_data = pre_pos->mutable_data<int>(ctx.GetPlace());
-  T* pre_w_data = pre_w->mutable_data<T>(ctx.GetPlace());
-  for (int ph = 0; ph < pooled_height; ph++) {
-    for (int pw = 0; pw < pooled_width; pw++) {
-      for (int iy = 0; iy < iy_upper; iy++) {
-        // calculate y of sample points
-        T y = roi_ymin + ph * bin_size_h +
-              static_cast<T>(iy + .5f) * bin_size_h /
-                  static_cast<T>(roi_bin_grid_h);
-        // calculate x of samle points
-        for (int ix = 0; ix < ix_upper; ix++) {
-          T x = roi_xmin + pw * bin_size_w +
-                static_cast<T>(ix + .5f) * bin_size_w /
-                    static_cast<T>(roi_bin_grid_w);
-          // deal with elements out of map
-          if (y < -1.0 || y > height || x < -1.0 || x > width) {
-            for (int i = 0; i < kROISize; ++i) {
-              pre_pos_data[i + pre_calc_index * kROISize] = 0;
-              pre_w_data[i + pre_calc_index * kROISize] = 0;
-            }
-            pre_calc_index += 1;
-            continue;
-          }
-          y = y <= 0 ? 0 : y;
-          x = x <= 0 ? 0 : x;
-
-          int y_low = static_cast<int>(y);
-          int x_low = static_cast<int>(x);
-          int y_high;
-          int x_high;
-          if (y_low >= height - 1) {
-            y_high = y_low = height - 1;
-            y = static_cast<T>(y_low);
-          } else {
-            y_high = y_low + 1;
-          }
-          if (x_low >= width - 1) {
-            x_high = x_low = width - 1;
-            x = static_cast<T>(x_low);
-          } else {
-            x_high = x_low + 1;
-          }
-          T ly = y - y_low, lx = x - x_low;
-          T hy = 1. - ly, hx = 1. - lx;
-          pre_pos_data[pre_calc_index * kROISize] = y_low * width + x_low;
-          pre_pos_data[pre_calc_index * kROISize + 1] = y_low * width + x_high;
-          pre_pos_data[pre_calc_index * kROISize + 2] = y_high * width + x_low;
-          pre_pos_data[pre_calc_index * kROISize + 3] = y_high * width + x_high;
-          pre_w_data[pre_calc_index * kROISize] = hy * hx;
-          pre_w_data[pre_calc_index * kROISize + 1] = hy * lx;
-          pre_w_data[pre_calc_index * kROISize + 2] = ly * hx;
-          pre_w_data[pre_calc_index * kROISize + 3] = ly * lx;
-          pre_calc_index += 1;
-        }
-      }
-    }
-  }
-}
-
-template <class T>
-void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
-                                   const T out_grad_this_bin, const T count,
-                                   T* batch_grad_data) {
-  int x_low, y_low, x_high, y_high;
-  T w1, w2, w3, w4;
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    w1 = w2 = w3 = w4 = 0;
-    x_low = x_high = y_low = y_high = -1;
-    return;
-  }
-  y = y <= 0 ? 0 : y;
-  x = x <= 0 ? 0 : x;
-  y_low = static_cast<int>(y);
-  x_low = static_cast<int>(x);
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = static_cast<T>(y_low);
-  } else {
-    y_high = y_low + 1;
-  }
-
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = static_cast<T>(x_low);
-  } else {
-    x_high = x_low + 1;
-  }
-
-  T ly = y - y_low, lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-  T diff1 = out_grad_this_bin * w1 / count;
-  T diff2 = out_grad_this_bin * w2 / count;
-  T diff3 = out_grad_this_bin * w3 / count;
-  T diff4 = out_grad_this_bin * w4 / count;
-  if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-    *(batch_grad_data + y_low * width + x_low) += diff1;
-    *(batch_grad_data + y_low * width + x_high) += diff2;
-    *(batch_grad_data + y_high * width + x_low) += diff3;
-    *(batch_grad_data + y_high * width + x_high) += diff4;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CPUROIAlignOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    if (rois_num == 0) return;
-
-    auto in_stride = framework::stride(in_dims);
-    auto roi_stride = framework::stride(rois->dims());
-    auto out_stride = framework::stride(out->dims());
-
-    const T* input_data = in->data<T>();
-    framework::Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "The rois_batch_size and imgs batch_size must be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    const T* rois_data = rois->data<T>();
-    for (int n = 0; n < rois_num; ++n) {
-      int roi_batch_id = roi_batch_id_data[n];
-      T roi_xmin = rois_data[0] * spatial_scale;
-      T roi_ymin = rois_data[1] * spatial_scale;
-      T roi_xmax = rois_data[2] * spatial_scale;
-      T roi_ymax = rois_data[3] * spatial_scale;
-
-      T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
-      T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
-      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-      const T* batch_data = input_data + roi_batch_id * in_stride[0];
-
-      int roi_bin_grid_h = (sampling_ratio > 0)
-                               ? sampling_ratio
-                               : ceil(roi_height / pooled_height);
-      int roi_bin_grid_w = (sampling_ratio > 0)
-                               ? sampling_ratio
-                               : ceil(roi_width / pooled_width);
-      const T count = roi_bin_grid_h * roi_bin_grid_w;
-      Tensor pre_pos;
-      Tensor pre_w;
-      int pre_size = count * out_stride[1];
-      pre_pos.Resize({pre_size, kROISize});
-      pre_w.Resize({pre_size, kROISize});
-
-      PreCalcForBilinearInterpolate(
-          dev_ctx, height, width, pooled_height, pooled_width, roi_bin_grid_h,
-          roi_bin_grid_w, roi_ymin, roi_xmin, bin_size_h, bin_size_w,
-          roi_bin_grid_h, roi_bin_grid_w, &pre_pos, &pre_w);
-      const int* pre_pos_data = pre_pos.data<int>();
-      const T* pre_w_data = pre_w.data<T>();
-      for (int c = 0; c < channels; c++) {
-        int pre_calc_index = 0;
-        for (int ph = 0; ph < pooled_height; ph++) {
-          for (int pw = 0; pw < pooled_width; pw++) {
-            const int pool_index = ph * pooled_width + pw;
-            T output_val = 0;
-            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-                for (int i = 0; i < kROISize; i++) {
-                  int pos = pre_pos_data[pre_calc_index * kROISize + i];
-                  T w = pre_w_data[pre_calc_index * kROISize + i];
-                  output_val += w * batch_data[pos];
-                }
-                pre_calc_index += 1;
-              }
-            }
-            output_val /= count;
-            output_data[pool_index] = output_val;
-          }
-        }
-        batch_data += in_stride[1];
-        output_data += out_stride[1];
-      }
-      rois_data += roi_stride[0];
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
-    auto in_dims = in->dims();
-
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    if (!in_grad) {
-      return;
-    }
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, in_grad, static_cast<T>(0));
-
-    int output_grad_size = out_grad->numel();
-
-    if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) {
-      return;
-    }
-
-    const T* rois_data = rois->data<T>();
-    const T* out_grad_data = out_grad->data<T>();
-    T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
-
-    auto in_stride = framework::stride(in->dims());
-    auto roi_stride = framework::stride(rois->dims());
-    auto out_stride = framework::stride(out_grad->dims());
-
-    for (int n = 0; n < rois_num; ++n) {
-      int roi_batch_idx = roi_batch_id_data[n];
-      T roi_xmin = rois_data[0] * spatial_scale;
-      T roi_ymin = rois_data[1] * spatial_scale;
-      T roi_xmax = rois_data[2] * spatial_scale;
-      T roi_ymax = rois_data[3] * spatial_scale;
-      T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
-      T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
-      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-      for (int c = 0; c < channels; ++c) {
-        T* batch_grad_data =
-            in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1];
-        const T* batch_out_grad_data =
-            out_grad_data + n * out_stride[0] + c * out_stride[1];
-        for (int ph = 0; ph < pooled_height; ++ph) {
-          for (int pw = 0; pw < pooled_width; ++pw) {
-            int pool_index = ph * pooled_width + pw;
-            T out_grad_this_bin = batch_out_grad_data[pool_index];
-            int roi_bin_grid_h = (sampling_ratio > 0)
-                                     ? sampling_ratio
-                                     : ceil(roi_height / pooled_height);
-            int roi_bin_grid_w = (sampling_ratio > 0)
-                                     ? sampling_ratio
-                                     : ceil(roi_width / pooled_width);
-            T count = roi_bin_grid_h * roi_bin_grid_w;
-            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-              const T y = roi_ymin + ph * bin_size_h +
-                          static_cast<T>(iy + .5f) * bin_size_h /
-                              static_cast<T>(roi_bin_grid_h);
-              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-                const T x = roi_xmin + pw * bin_size_w +
-                            static_cast<T>(ix + .5f) * bin_size_w /
-                                static_cast<T>(roi_bin_grid_w);
-                bilinear_interpolate_gradient(height, width, y, x,
-                                              out_grad_this_bin, count,
-                                              batch_grad_data);
-              }
-            }
-          }
-        }
-      }
-      rois_data += roi_stride[0];
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
deleted file mode 100644
index cfac7e09e123c43204454adacb87a7c3c158690e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/roi_pool_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-class ROIPoolOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ROIPoolOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
-                   "Input(ROIs) of ROIPoolOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ROIPoolOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Argmax"),
-                   "Output(Argmax) of ROIPoolOp should not be null.");
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-
-    PADDLE_ENFORCE(input_dims.size() == 4,
-                   "The format of input tensor is NCHW.");
-    PADDLE_ENFORCE(rois_dims.size() == 2,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], ...].");
-    PADDLE_ENFORCE(rois_dims[1] == kROISize,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], ...].");
-
-    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
-    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-
-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled output height must greater than 0");
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      "The pooled output width must greater than 0");
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      "The spatial scale must greater than 0");
-
-    auto out_dims = input_dims;
-    out_dims[0] = rois_dims[0];
-    out_dims[1] = input_dims[1];
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->SetOutputDim("Argmax", out_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ROIPoolGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "The gradient of Out should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
-                   "The gradient of X should not be null.");
-    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor), "
-             "the input of ROIPoolOp. "
-             "The format of input tensor is NCHW. Where N is batch size, "
-             "C is the number of input channels, "
-             "H is the height of the feature, and "
-             "W is the width of the feature.");
-    AddInput("ROIs",
-             "(LoDTensor), "
-             "ROIs (Regions of Interest) to pool over. "
-             "should be a 2-D LoDTensor of shape (num_rois, 4)"
-             "given as [[x1, y1, x2, y2], ...]. "
-             "Where batch_id is the id of the data, "
-             "(x1, y1) is the top left coordinates, and "
-             "(x2, y2) is the bottom right coordinates.");
-    AddOutput("Out",
-              "(Tensor), "
-              "The output of ROIPoolOp is a 4-D tensor with shape "
-              "(num_rois, channels, pooled_h, pooled_w).");
-    AddOutput("Argmax",
-              "(Tensor), "
-              "Argmaxes corresponding to indices in X used "
-              "for gradient computation. Only output "
-              "if arg \"is_test\" is false.")
-        .AsIntermediate();
-    AddAttr<float>("spatial_scale",
-                   "(float, default 1.0), "
-                   "Multiplicative spatial scale factor "
-                   "to translate ROI coords from their input scale "
-                   "to the scale used when pooling.")
-        .SetDefault(1.0);
-    AddAttr<int>("pooled_height",
-                 "(int, default 1), "
-                 "The pooled output height.")
-        .SetDefault(1);
-    AddAttr<int>("pooled_width",
-                 "(int, default 1), "
-                 "The pooled output width.")
-        .SetDefault(1);
-    AddComment(R"DOC(
-**ROIPool Operator**
-
-Region of interest pooling (also known as RoI pooling) is to perform
-is to perform max pooling on inputs of nonuniform sizes to obtain
-fixed-size feature maps (e.g. 7*7).
-
-The operator has three steps:
-
-1. Dividing each region proposal into equal-sized sections with
-   the pooled_width and pooled_height
-
-2. Finding the largest value in each section
-
-3. Copying these max values to the output buffer
-
-ROI Pooling for Faster-RCNN. The link below is a further introduction: 
-https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
-    )DOC");
-  }
-};
-
-class ROIPoolGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("roi_pool_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("ROIs", Input("ROIs"));
-    op->SetInput("Argmax", Output("Argmax"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
-                  ops::ROIPoolGradDescMaker);
-REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp);
-REGISTER_OP_CPU_KERNEL(
-    roi_pool,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    roi_pool_grad,
-    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
deleted file mode 100644
index da8088d2ea70f589b6a5b8a443f16429cd0d1034..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ /dev/null
@@ -1,259 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/roi_pool_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaxinumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-
-template <typename T>
-__global__ void GPUROIPoolForward(
-    const int nthreads, const T* input_data, const T* input_rois,
-    const float spatial_scale, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % channels;
-    int n = i / pooled_width / pooled_height / channels;
-
-    const T* offset_input_rois = input_rois + n * kROISize;
-    int roi_batch_ind = roi_batch_id_data[n];
-    int roi_start_w = round(offset_input_rois[0] * spatial_scale);
-    int roi_start_h = round(offset_input_rois[1] * spatial_scale);
-    int roi_end_w = round(offset_input_rois[2] * spatial_scale);
-    int roi_end_h = round(offset_input_rois[3] * spatial_scale);
-
-    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
-    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-
-    int hstart = static_cast<int>(floor(static_cast<double>(ph) *
-                                        static_cast<double>(roi_height) /
-                                        static_cast<double>(pooled_height)));
-    int wstart = static_cast<int>(floor(static_cast<double>(pw) *
-                                        static_cast<double>(roi_width) /
-                                        static_cast<double>(pooled_width)));
-    int hend = static_cast<int>(ceil(static_cast<double>(ph + 1) *
-                                     static_cast<double>(roi_height) /
-                                     static_cast<double>(pooled_height)));
-    int wend = static_cast<int>(ceil(static_cast<double>(pw + 1) *
-                                     static_cast<double>(roi_width) /
-                                     static_cast<double>(pooled_width)));
-    hstart = min(max(hstart + roi_start_h, 0), height);
-    hend = min(max(hend + roi_start_h, 0), height);
-    wstart = min(max(wstart + roi_start_w, 0), width);
-    wend = min(max(wend + roi_start_w, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
-    int maxidx = -1;
-    const T* offset_input_data =
-        input_data + (roi_batch_ind * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        int input_data_index = h * width + w;
-        if (offset_input_data[input_data_index] > maxval) {
-          maxval = offset_input_data[input_data_index];
-          maxidx = input_data_index;
-        }
-      }
-    }
-    output_data[i] = maxval;
-    if (argmax_data) {
-      argmax_data[i] = maxidx;
-    }
-  }
-}
-
-template <typename T>
-__global__ void GPUROIPoolBackward(
-    const int nthreads, const T* input_rois, const T* output_grad,
-    const int64_t* argmax_data, const int num_rois, const float spatial_scale,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, int* roi_batch_id_data,
-    T* input_grad) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (int i = index; i < nthreads; i += offset) {
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % channels;
-    int n = i / pooled_width / pooled_height / channels;
-
-    int roi_batch_ind = roi_batch_id_data[n];
-    int input_offset = (roi_batch_ind * channels + c) * height * width;
-    int output_offset = (n * channels + c) * pooled_height * pooled_width;
-    const T* offset_output_grad = output_grad + output_offset;
-    T* offset_input_grad = input_grad + input_offset;
-    const int64_t* offset_argmax_data = argmax_data + output_offset;
-
-    int argmax = offset_argmax_data[ph * pooled_width + pw];
-    if (argmax != -1) {
-      platform::CudaAtomicAdd(
-          offset_input_grad + argmax,
-          static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
-    }
-  }
-}
-
-template <typename Place, typename T>
-class GPUROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* out = ctx.Output<Tensor>("Out");
-    auto* argmax = ctx.Output<Tensor>("Argmax");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    auto in_stride = framework::stride(in_dims);
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-
-    int rois_num = rois->dims()[0];
-
-    if (rois_num == 0) return;
-
-    int output_size = out->numel();
-    int blocks = NumBlocks(output_size);
-    int threads = kNumCUDAThreads;
-
-    framework::Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    auto cplace = platform::CPUPlace();
-    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "The rois_batch_size and imgs batch_size must be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-
-    auto& dev_ctx = ctx.cuda_device_context();
-    int bytes = roi_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
-    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                 dev_ctx.stream());
-
-    GPUROIPoolForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
-        height, width, pooled_height, pooled_width, roi_id_data,
-        out->mutable_data<T>(ctx.GetPlace()),
-        argmax->mutable_data<int64_t>(ctx.GetPlace()));
-  }
-};
-
-template <typename Place, typename T>
-class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* argmax = ctx.Input<Tensor>("Argmax");
-
-    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    int rois_num = rois->dims()[0];
-    int channels = in->dims()[1];
-    int height = in->dims()[2];
-    int width = in->dims()[3];
-
-    if (x_grad) {
-      framework::Tensor roi_batch_id_list;
-      roi_batch_id_list.Resize({rois_num});
-      auto cplace = platform::CPUPlace();
-      int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-      }
-
-      auto& dev_ctx = ctx.cuda_device_context();
-      int bytes = roi_batch_id_list.numel() * sizeof(int);
-      auto roi_ptr = memory::Alloc(dev_ctx, bytes);
-      int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-      const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-      memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                   dev_ctx.stream());
-
-      x_grad->mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<Place, T> set_zero;
-      set_zero(dev_ctx, x_grad, static_cast<T>(0));
-
-      int output_grad_size = out_grad->numel();
-      int blocks = NumBlocks(output_grad_size);
-      int threads = kNumCUDAThreads;
-
-      if (output_grad_size > 0) {
-        GPUROIPoolBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-            output_grad_size, rois->data<T>(), out_grad->data<T>(),
-            argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
-            width, pooled_height, pooled_width, roi_id_data,
-            x_grad->mutable_data<T>(ctx.GetPlace()));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    roi_pool,
-    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    roi_pool_grad,
-    ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
deleted file mode 100644
index 07de7c9f0e070cef7c6f38f8d564ab76910842db..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/roi_pool_op.h
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <limits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-static constexpr int kROISize = 4;
-
-template <typename DeviceContext, typename T>
-class CPUROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto* argmax = ctx.Output<framework::Tensor>("Argmax");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    auto in_stride = framework::stride(in_dims);
-    auto argmax_stride = framework::stride(argmax->dims());
-    auto roi_stride = framework::stride(rois->dims());
-    auto out_stride = framework::stride(out->dims());
-
-    const T* input_data = in->data<T>();
-
-    framework::Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "The rois_batch_size and imgs batch_size must be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        roi_batch_id_data[i] = n;
-      }
-    }
-
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
-
-    const T* rois_data = rois->data<T>();
-    for (int n = 0; n < rois_num; ++n) {
-      int roi_batch_id = roi_batch_id_data[n];
-      int roi_start_w = round(rois_data[0] * spatial_scale);
-      int roi_start_h = round(rois_data[1] * spatial_scale);
-      int roi_end_w = round(rois_data[2] * spatial_scale);
-      int roi_end_h = round(rois_data[3] * spatial_scale);
-
-      // Force malformed ROIs to be 1x1
-      int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
-      int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
-
-      const float bin_size_h =
-          static_cast<float>(roi_height) / static_cast<float>(pooled_height);
-      const float bin_size_w =
-          static_cast<float>(roi_width) / static_cast<float>(pooled_width);
-
-      const T* batch_data = input_data + roi_batch_id * in_stride[0];
-
-      for (int c = 0; c < channels; ++c) {
-        for (int ph = 0; ph < pooled_height; ++ph) {
-          for (int pw = 0; pw < pooled_width; ++pw) {
-            //  Compute pooling region for this output unit:
-            //  start (included) = floor(ph * roi_height / pooled_height_)
-            //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
-            int hstart =
-                static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
-            int wstart =
-                static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
-            int hend =
-                static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
-            int wend =
-                static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
-
-            hstart = std::min(std::max(hstart + roi_start_h, 0), height);
-            hend = std::min(std::max(hend + roi_start_h, 0), height);
-            wstart = std::min(std::max(wstart + roi_start_w, 0), width);
-            wend = std::min(std::max(wend + roi_start_w, 0), width);
-
-            const int pool_index = ph * pooled_width + pw;
-
-            // Define an empty pooling region to be zero
-            bool is_empty = (hend <= hstart) || (wend <= wstart);
-            output_data[pool_index] =
-                is_empty ? 0 : -std::numeric_limits<T>::max();
-            argmax_data[pool_index] = -1;
-
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                const int index = h * width + w;
-                if (batch_data[index] > output_data[pool_index]) {
-                  output_data[pool_index] = batch_data[index];
-                  argmax_data[pool_index] = index;
-                }
-              }
-            }
-          }
-        }
-
-        batch_data += in_stride[1];
-        output_data += out_stride[1];
-        argmax_data += argmax_stride[1];
-      }
-      // Increment ROI data pointer
-      rois_data += roi_stride[0];
-    }
-    return;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* argmax = ctx.Input<framework::Tensor>("Argmax");
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-
-    if (in_grad) {
-      int rois_num = rois->dims()[0];
-      framework::Tensor roi_batch_id_list;
-      roi_batch_id_list.Resize({rois_num});
-      int* roi_batch_id_data =
-          roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-      }
-
-      const T* rois_data = rois->data<T>();
-      const T* out_grad_data = out_grad->data<T>();
-      const int64_t* argmax_data = argmax->data<int64_t>();
-      T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<DeviceContext, T> set_zero;
-      set_zero(ctx.template device_context<DeviceContext>(), in_grad,
-               static_cast<T>(0));
-
-      auto in_stride = framework::stride(in->dims());
-      auto argmax_stride = framework::stride(argmax->dims());
-      auto roi_stride = framework::stride(rois->dims());
-      auto out_stride = framework::stride(out_grad->dims());
-
-      int channels = in->dims()[1];
-
-      for (int n = 0; n < rois_num; ++n) {
-        int roi_batch_idx = roi_batch_id_data[n];
-        T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
-        for (int c = 0; c < channels; ++c) {
-          for (int ph = 0; ph < pooled_height; ++ph) {
-            for (int pw = 0; pw < pooled_width; ++pw) {
-              int pool_index = ph * pooled_width + pw;
-              if (argmax_data[pool_index] >= 0) {
-                auto index = argmax_data[pool_index];
-                batch_grad_data[index] += out_grad_data[pool_index];
-              }
-            }
-          }
-          batch_grad_data += in_stride[1];
-          out_grad_data += out_stride[1];
-          argmax_data += argmax_stride[1];
-        }
-        rois_data += roi_stride[0];
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
deleted file mode 100644
index 1645c47e9660faa4d211c1fb05167a582e0fbc46..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/row_conv_op.cc
+++ /dev/null
@@ -1,354 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/row_conv_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using framework::Tensor;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-class RowConvOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of RowConvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                   "Input(Filter) of RowConvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of RowConvOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2.");
-
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", "Out");
-  }
-};
-
-class RowConvGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                   "Input(Filter) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Gradient of output(Out) should not be null.");
-
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-      ctx->SetOutputDim(x_grad_name, dout_dims);
-    }
-
-    auto filter_grad_name = framework::GradVarName("Filter");
-    if (ctx->HasOutput(filter_grad_name)) {
-      auto filter_dims = ctx->GetInputDim("Filter");
-      ctx->SetOutputDim(filter_grad_name, filter_dims);
-    }
-  }
-};
-
-class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "the input(X) is a LodTensor or tensor, LodTensor(X) supports "
-             "variable time-length input sequences. The underlying tensor "
-             "in this LoDTensor is a matrix with shape (T x N), where T "
-             "is the total time steps in this mini-batch and N is the input "
-             "data dimension. the shape of Tensor input(X) has shape "
-             "(B x T x N), B is batch size;");
-    AddInput("Filter",
-             "the input(Filter) is a learnable parameter. It "
-             "is a 2-D tensor with shape (future_context x N), where, "
-             "future_context is the future context length and N is the data "
-             "dimension.");
-    AddOutput("Out",
-              "the output(Out) is a LodTensor, which supports "
-              "variable time-length input sequences. The underlying tensor "
-              "in this LodTensor is a matrix with shape T x N, i.e., the "
-              "same shape as X.");
-    AddComment(R"DOC(
-:strong:`Row-convolution operator`
-
-The row convolution is called lookahead convolution.  This operator was 
-introduced in the following paper for DeepSpeech2:
-http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf 
-
-The main motivation is that a bidirectional RNN, useful in DeepSpeech 
-like speech models, learns representation for a sequence by performing a 
-forward and a backward pass through the entire sequence. However, unlike 
-unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
-and low-latency setting. The lookahead convolution incorporates information 
-from future subsequences in a computationally efficient manner to improve 
-unidirectional recurrent neural networks. The row convolution operator is 
-different from the 1D sequence convolution, and is computed as follows:
-
-Given an input sequence $X$ of length $t$ and input dimension $D$, 
-and a filter ($W$) of size $context \times D$,
-the output sequence is convolved as:
-
-$$
-out_{i} = \\sum_{j=i}^{i + context - 1} X_{j} \\cdot W_{j-i}
-$$
-
-In the above equation:
-
-* $Out_{i}$: The i-th row of output variable with shape [1, D].
-
-* $context$: Future context size.
-
-* $X_{j}$: The j-th row of input variable with shape [1, D].
-
-* $W_{j-i}$: The (j-i)-th row of parameters with shape [1, D].
-
-More details about row_conv please refer to
-the design document
-https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
-
-)DOC");
-  }
-};
-
-template <typename T>
-class RowConvKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *x = context.Input<LoDTensor>("X");
-    auto *filter = context.Input<Tensor>("Filter");
-    auto *out = context.Output<LoDTensor>("Out");
-
-    out->mutable_data<T>(context.GetPlace());
-
-    bool is_tensor = x->lod().empty();
-    int batch_size = 0;
-    if (is_tensor) {
-      batch_size = x->dims()[0];
-    } else {
-      batch_size = x->lod()[0].size() - 1;
-    }
-    framework::Vector<size_t> batch_indices(batch_size + 1);
-    int input_dim = 0;
-    int timesteps = 0;
-    if (is_tensor) {
-      for (int i = 0; i < batch_size + 1; i++) {
-        batch_indices[i] = i;
-      }
-      input_dim = x->dims()[2];
-      timesteps = x->dims()[1];
-    } else {
-      batch_indices = x->lod()[0];
-      input_dim = x->dims()[1];
-    }
-    size_t num_sequence = batch_indices.size() - 1;
-
-    auto future_context = filter->dims()[0];
-    auto weights = EigenMatrix<T>::From(*filter);
-
-    for (size_t i = 0; i < num_sequence; i++) {
-      int start = static_cast<int>(batch_indices[i]);
-      int end = static_cast<int>(batch_indices[i + 1]);
-      int current_timesteps = 0;
-      if (is_tensor) {
-        current_timesteps = timesteps;
-      } else {
-        current_timesteps = end - start;
-      }
-      // int current_timesteps = end - start;
-      Tensor cur_input_sequence =
-          x->Slice(start, end);  // Current input sequence
-      cur_input_sequence =
-          cur_input_sequence.Resize({current_timesteps, input_dim});
-
-      Tensor cur_output_sequence =
-          out->Slice(start, end);  // Current output sequence
-      cur_output_sequence =
-          cur_output_sequence.Resize({current_timesteps, input_dim});
-
-      auto cip_seq = EigenMatrix<T>::From(cur_input_sequence);
-      auto cot_seq = EigenMatrix<T>::From(cur_output_sequence);
-
-      for (int k = 0; k < current_timesteps;
-           k++) {  // For different time steps in the same sequence
-        for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
-             w++) {
-          for (int d = 0; d < input_dim; d++) {
-            if (w == 0) {
-              cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d);
-            } else {
-              cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d);
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class RowConvGradKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *x = context.Input<LoDTensor>("X");
-    auto *filter = context.Input<Tensor>("Filter");
-    auto *d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto *dx = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto *d_filter = context.Output<Tensor>(framework::GradVarName("Filter"));
-
-    auto &x_lod = x->lod();
-    bool is_tensor = x_lod.empty();
-    int batch_size = 0;
-    if (is_tensor) {
-      batch_size = x->dims()[0];
-    } else {
-      batch_size = x->lod()[0].size() - 1;
-    }
-    framework::Vector<size_t> batch_indices(batch_size + 1);
-    int timesteps = 0;
-    int input_dim = 0;
-    if (is_tensor) {
-      for (int i = 0; i < batch_size + 1; i++) {
-        batch_indices[i] = i;
-      }
-      input_dim = x->dims()[2];
-      timesteps = x->dims()[1];
-    } else {
-      batch_indices = x->lod()[0];
-      input_dim = x->dims()[1];
-    }
-
-    size_t num_sequence = batch_indices.size() - 1;
-    auto future_context = filter->dims()[0];
-    if (d_filter) {
-      d_filter->mutable_data<T>(context.GetPlace());
-      auto dweights =
-          EigenMatrix<T>::From(*d_filter);  // Gradient of weight matrix
-      dweights.setZero();
-
-      for (size_t i = 0; i < num_sequence; i++) {  // For different sequences
-        int start = static_cast<int>(batch_indices[i]);
-        int end = static_cast<int>(batch_indices[i + 1]);
-
-        int current_timesteps = 0;
-        if (is_tensor) {
-          current_timesteps = timesteps;
-        } else {
-          current_timesteps = end - start;
-        }
-        Tensor cur_input = x->Slice(start, end);  // Current input sequence
-        cur_input = cur_input.Resize({current_timesteps, input_dim});
-        Tensor cur_doutput =
-            d_out->Slice(start, end);  // Current output grad sequence
-        cur_doutput = cur_doutput.Resize({current_timesteps, input_dim});
-        auto cur_ip = EigenMatrix<T>::From(cur_input);
-        auto cur_dout = EigenMatrix<T>::From(cur_doutput);
-        for (int k = 0; k < current_timesteps;
-             k++) {  // For different time steps in the same sequence
-          for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
-               w++) {
-            // For dweights (Updating the gradient of weight matrix)
-            for (int d = 0; d < input_dim; d++) {
-              dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d);
-            }
-          }
-        }
-      }
-    }
-
-    if (dx) {
-      dx->mutable_data<T>(context.GetPlace());
-      auto weights = EigenMatrix<T>::From(*filter);
-      for (size_t i = 0; i < num_sequence; i++) {  // For different sequences
-        int start = static_cast<int>(batch_indices[i]);
-        int end = static_cast<int>(batch_indices[i + 1]);
-
-        int current_timesteps = 0;
-        if (is_tensor) {
-          current_timesteps = timesteps;
-        } else {
-          current_timesteps = end - start;
-        }
-
-        Tensor cur_doutput =
-            d_out->Slice(start, end);  // Current output grad sequence
-        cur_doutput = cur_doutput.Resize({current_timesteps, input_dim});
-        Tensor cur_dinput =
-            dx->Slice(start, end);  // Current input grad sequence
-        cur_dinput = cur_dinput.Resize({current_timesteps, input_dim});
-
-        auto cur_dout = EigenMatrix<T>::From(cur_doutput);
-        auto cur_dip = EigenMatrix<T>::From(cur_dinput);
-        cur_dip.setZero();
-
-        for (int k = 0; k < current_timesteps;
-             k++) {  // For different time steps in the same sequence
-          for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
-               w++) {
-            // For dinput (Updating the gradient wrt input)
-            for (int d = 0; d < input_dim; d++) {
-              cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d);
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-class RowConvGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("row_conv_grad");
-    op->SetAttrMap(Attrs());
-    op->SetInput("X", Input("X"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker,
-                  ops::RowConvGradOpDescMaker);
-REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp);
-REGISTER_OP_CPU_KERNEL(
-    row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    row_conv_grad,
-    ops::RowConvGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu
deleted file mode 100644
index a712878854298bc2eb372be155e1bd512aba7037..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/row_conv_op.cu
+++ /dev/null
@@ -1,457 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/row_conv_op.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using framework::Tensor;
-
-namespace {
-
-inline int DivUp(int x, int y) { return (x + y - 1) / y; }
-
-// Forward prop (shared memory version, for small future_context)
-template <typename T>
-__global__ void RowConvForwardSharedMemory(const T *in, const T *wt,
-                                           int num_sequence, int input_dim,
-                                           int future_context,
-                                           const size_t *batch_indices,
-                                           T *out) {
-  int blx = blockDim.x;
-  int bly = blockDim.y;
-  int thx = threadIdx.x;
-  int thy = threadIdx.y;
-  int d = blockIdx.x * blx + thx;  // index along input dim
-
-  extern __shared__ T mem[];
-  T *sw = mem;
-
-  if (thy < future_context) {
-    sw[thy * blx + thx] =
-        (d < input_dim) ? wt[thy * input_dim + d] : static_cast<T>(0);
-  }
-  __syncthreads();
-  for (size_t i = 0; i < num_sequence; i++) {
-    int start = static_cast<int>(batch_indices[i]);
-    int end = static_cast<int>(batch_indices[i + 1]);
-    int current_timesteps = end - start;
-
-    for (int k = thy; k < current_timesteps; k += bly) {
-      T sum = 0;
-      for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
-           w++) {
-        sum += (d < input_dim)
-                   ? sw[w * blx + thx] * in[(start + k + w) * input_dim + d]
-                   : static_cast<T>(0);
-      }
-      if (d < input_dim) {
-        out[(start + k) * input_dim + d] = sum;
-      }
-    }
-  }
-}
-
-// Forward prop (naive version)
-template <typename T>
-__global__ void RowConvForward(const T *in, const T *wt, int num_sequence,
-                               int input_dim, int future_context,
-                               const size_t *batch_indices, T *out) {
-  int d = blockIdx.x * blockDim.x + threadIdx.x;  // index along input_dim
-  int bly = blockDim.y;
-  int thy = threadIdx.y;
-
-  if (d >= input_dim) return;
-  for (size_t i = 0; i < num_sequence; i++) {
-    int start = static_cast<int>(batch_indices[i]);
-    int end = static_cast<int>(batch_indices[i + 1]);
-    int current_timesteps = end - start;
-
-    for (int k = thy; k < current_timesteps; k += bly) {
-      T sum = 0;
-      for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
-           w++) {
-        sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]);
-      }
-      out[(start + k) * input_dim + d] = sum;
-    }
-  }
-}
-
-// Compute input gradient (shared memory version, for small future_context)
-template <typename T>
-__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt,
-                                             int num_sequence, int input_dim,
-                                             int future_context,
-                                             const size_t *batch_indices,
-                                             T *din) {
-  int blx = blockDim.x;
-  int bly = blockDim.y;
-  int thx = threadIdx.x;
-  int thy = threadIdx.y;
-  int d = blockIdx.x * blx + thx;  // index along input dim
-
-  extern __shared__ T mem[];
-  T *sw = mem;
-  if (thy < future_context) {
-    sw[thy * blx + thx] =
-        (d < input_dim) ? wt[thy * input_dim + d] : static_cast<T>(0);
-  }
-  __syncthreads();
-
-  int current_timesteps = 0;
-  for (int i = 0; i < num_sequence; i++) {
-    int start = static_cast<int>(batch_indices[i]);
-    int end = static_cast<int>(batch_indices[i + 1]);
-    current_timesteps = end - start;
-
-    for (int k = thy; k < current_timesteps; k += bly) {
-      T sum = 0;
-      for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) {
-        sum += (d < input_dim)
-                   ? (sw[w * blx + thx] * dout[(k + start - w) * input_dim + d])
-                   : static_cast<T>(0);
-      }
-      if (d < input_dim) {
-        din[(k + start) * input_dim + d] = sum;
-      }
-    }
-  }
-}
-
-// Compute input gradient (Naive version)
-template <typename T>
-__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence,
-                                 int input_dim, int future_context,
-                                 const size_t *batch_indices, T *din) {
-  int d = blockIdx.x * blockDim.x + threadIdx.x;  // index along input_dim
-  int bly = blockDim.y;
-  int thy = threadIdx.y;
-
-  if (d >= input_dim) return;
-  int current_timesteps = 0;
-
-  for (int i = 0; i < num_sequence; i++) {
-    int start = static_cast<int>(batch_indices[i]);
-    int end = static_cast<int>(batch_indices[i + 1]);
-    current_timesteps = end - start;
-
-    for (int k = thy; k < current_timesteps; k += bly) {
-      T sum = 0;
-      for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) {
-        sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]);
-      }
-      din[(k + start) * input_dim + d] = sum;
-    }
-  }
-}
-
-// Compute W gradient (small future_context version)
-template <typename T>
-__global__ void RowConvGradFilterImproved(const T *in, const T *dout,
-                                          int num_sequence, int input_dim,
-                                          int future_context, int block_x,
-                                          int block_y,
-                                          const size_t *batch_indices,
-                                          T *dfilter) {
-  int blx = blockDim.x;
-  int bly = blockDim.y;
-  int thx = threadIdx.x;
-  int thy = threadIdx.y;
-  int gx = blockIdx.x * blx;
-  int d = gx + thx;  // index along input dim
-
-  extern __shared__ T mem[];
-
-  int xdim_sh_in = block_y;
-  int xdim_sh_dout = block_y;
-  int ydim_sh_in = block_x;
-  int ydim_sh_dout = block_x + future_context - 1;
-  int ydim_sh_dfilter = block_y;
-
-  T *sh_in = mem;
-  T *sh_dout = &mem[xdim_sh_in * ydim_sh_in];
-  T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout];
-
-  if (thy < future_context) {
-    sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast<T>(0);
-  }
-  __syncthreads();
-
-  // NOTE(zcd): temporary solution
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, true);
-
-  for (int i = 0; i < num_sequence; i++) {
-    int start = static_cast<int>(batch_indices[i]);
-    int end = static_cast<int>(batch_indices[i + 1]);
-    int current_timesteps = end - start;
-
-    int scaled_cur_steps =
-        ((current_timesteps + block_x - 1) / block_x) * block_x;
-
-    for (int k = thy; k < scaled_cur_steps; k += block_x) {
-      int pos = start + k;
-      sh_in[thx * ydim_sh_in + thy] =
-          (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0);
-      sh_dout[thx * ydim_sh_dout + thy + future_context - 1] =
-          (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0);
-      __syncthreads();
-
-      if (thy < future_context - 1) {
-        int pos_offset = pos - future_context + 1;
-        sh_dout[thx * ydim_sh_dout + thy] =
-            (d < input_dim && pos_offset >= start)
-                ? dout[pos_offset * input_dim + d]
-                : T(0);
-      }
-      __syncthreads();
-
-      for (int w = 0; w < future_context; w++) {
-        T val = sh_in[thy * ydim_sh_in + thx] *
-                sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w];
-        __syncthreads();
-
-        for (int offset = 16; offset > 0;
-             offset = offset / 2) {  // blockDim.x is 32.
-          val += platform::CudaShuffleDownSync(mask, val, offset);
-        }
-        __syncthreads();
-
-        if (thx == 0) {
-          sh_dfilter[w * ydim_sh_dfilter + thy] += val;
-        }
-        __syncthreads();
-      }
-    }
-  }
-  for (int w = thy; (w < future_context) && (d < input_dim); w += bly) {
-    dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx];
-  }
-}
-
-// Compute weight(filter) gradient
-template <typename T>
-__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
-                                  int input_dim, int future_context,
-                                  int block_x, int block_y,
-                                  const size_t *batch_indices, T *dfilter) {
-  int blx = blockDim.x;
-  int thx = threadIdx.x;
-  int thy = threadIdx.y;
-  int gx = blockIdx.x * blx;
-  int d = gx + thx;  // index along input dim
-  extern __shared__ T mem[];
-  T *sh_in = mem;
-  T *sh_dout = &mem[block_x * block_y];
-
-  // NOTE(zcd): temporary solution
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, true);
-  for (int i = 0; i < num_sequence; i++) {
-    int start = static_cast<int>(batch_indices[i]);
-    int end = static_cast<int>(batch_indices[i + 1]);
-    int current_timesteps = end - start;
-
-    int scaled_cur_steps =
-        ((current_timesteps + block_x - 1) / block_x) * block_x;
-
-    for (int k = thy; k < scaled_cur_steps; k += block_x) {
-      int pos = start + k;
-      sh_in[thx * block_y + thy] =
-          (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0;
-      __syncthreads();
-
-      for (int w = 0; w < future_context; w++) {
-        sh_dout[thx * block_y + thy] =
-            (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps)
-                ? dout[(pos - w) * input_dim + d]
-                : 0.0;
-        __syncthreads();
-
-        T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx];
-        __syncthreads();
-
-        for (int offset = 16; offset > 0;
-             offset = offset / 2) {  // blockDim.x is 32.
-          val += platform::CudaShuffleDownSync(mask, val, offset);
-        }
-        __syncthreads();
-
-        if (thx == 0 && (gx + thy) < input_dim) {
-          dfilter[w * input_dim + gx + thy] += val;
-        }
-      }
-    }
-  }
-}
-
-}  // namespace
-
-template <typename T>
-class RowConvKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *X = context.Input<LoDTensor>("X");
-    auto *Filter = context.Input<Tensor>("Filter");
-    auto *Out = context.Output<LoDTensor>("Out");
-
-    const T *in = X->data<T>();
-    const T *weight = Filter->data<T>();
-    T *out = Out->mutable_data<T>(context.GetPlace());
-    bool is_tensor = X->lod().empty();
-    int batch_size = 0;
-    if (is_tensor) {
-      batch_size = X->dims()[0];
-    } else {
-      batch_size = X->lod()[0].size() - 1;
-    }
-    int input_dim = 0;
-    framework::Vector<size_t> batch_indices(batch_size + 1);
-    int timesteps = X->dims()[1];
-    if (is_tensor) {
-      for (int i = 0; i < batch_size + 1; i++) {
-        batch_indices[i] = i * timesteps;
-      }
-      input_dim = X->dims()[2];
-    } else {
-      batch_indices = X->lod()[0];
-      input_dim = X->dims()[1];
-    }
-
-    int num_sequence = batch_indices.size() - 1;
-    int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
-    auto stream = context.cuda_device_context().stream();
-
-    if (future_context <= 32) {
-      dim3 block_dim = dim3(32, 32);
-      dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
-      int mem_per_block = (future_context * block_dim.x) * sizeof(T);
-      RowConvForwardSharedMemory<
-          T><<<grid_dim, block_dim, mem_per_block, stream>>>(
-          in, weight, num_sequence, input_dim, future_context, idx, out);
-    } else {
-      dim3 block_dim = dim3(32, 32);
-      dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
-      RowConvForward<T><<<grid_dim, block_dim, 0, stream>>>(
-          in, weight, num_sequence, input_dim, future_context, idx, out);
-    }
-  }
-};
-
-template <typename T>
-class RowConvGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *X = context.Input<LoDTensor>("X");
-    auto *Filter = context.Input<Tensor>("Filter");
-    auto *dOut = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    const T *in = X->data<T>();
-    const T *weights = Filter->data<T>();
-    const T *dout = dOut->data<T>();
-
-    Tensor *dX = context.Output<LoDTensor>(framework::GradVarName("X"));
-    Tensor *dFilter = context.Output<Tensor>(framework::GradVarName("Filter"));
-    int batch_size = 0;
-    bool is_tensor = X->lod().empty();
-    if (is_tensor) {
-      batch_size = X->dims()[0];
-    } else {
-      batch_size = X->lod()[0].size() - 1;
-    }
-
-    int input_dim = 0;
-    framework::Vector<size_t> batch_indices(batch_size + 1);
-    int timesteps = X->dims()[1];
-    if (is_tensor) {
-      for (int i = 0; i < batch_size + 1; i++) {
-        batch_indices[i] = i * timesteps;
-      }
-      input_dim = X->dims()[2];
-    } else {
-      batch_indices = X->lod()[0];
-      input_dim = X->dims()[1];
-    }
-    // int input_dim = X->dims()[1];
-    int num_sequence = batch_indices.size() - 1;
-    int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
-
-    auto &device_ctx = context.cuda_device_context();
-    math::SetConstant<platform::CUDADeviceContext, T> zero;
-
-    if (dFilter) {
-      T *dfilter = dFilter->mutable_data<T>(context.GetPlace());
-      zero(device_ctx, dFilter, static_cast<T>(0.0));
-
-      if (future_context <= 32) {
-        dim3 block_dim = dim3(32, 32);
-        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
-        int block_x = block_dim.x;
-        int block_y = block_dim.y;
-        int mem_per_block =
-            (block_y * block_x + block_y * (block_x + future_context - 1) +
-             future_context * block_y) *
-            sizeof(T);
-        RowConvGradFilterImproved<
-            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
-            in, dout, num_sequence, input_dim, future_context, block_x, block_y,
-            idx, dfilter);
-      } else {
-        dim3 block_dim = dim3(32, 32);
-        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
-        int block_x = block_dim.x;
-        int block_y = block_dim.y;
-        int mem_per_block =
-            (block_x * block_y * 2) * sizeof(T);  // For 2 arrays of size 32x32
-        RowConvGradFilter<
-            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
-            in, dout, num_sequence, input_dim, future_context, block_x, block_y,
-            idx, dfilter);
-      }
-    }
-
-    if (dX) {
-      T *din = dX->mutable_data<T>(context.GetPlace());
-      if (future_context <= 32) {
-        dim3 block_dim = dim3(32, 32);
-        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
-        int mem_per_block = (future_context * block_dim.x) * sizeof(T);
-        RowConvGradInputSharedMemory<
-            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
-            dout, weights, num_sequence, input_dim, future_context, idx, din);
-      } else {
-        dim3 block_dim = dim3(32, 32);
-        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
-        RowConvGradInput<T><<<grid_dim, block_dim, 0, device_ctx.stream()>>>(
-            dout, weights, num_sequence, input_dim, future_context, idx, din);
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    row_conv, ops::RowConvKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    row_conv_grad,
-    ops::RowConvGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/row_conv_op.h b/paddle/fluid/operators/row_conv_op.h
deleted file mode 100644
index fb999568f81b4034a683e2bc155692dffbf82f82..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/row_conv_op.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class RowConvKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override;
-};
-
-template <typename DeviceContext, typename T>
-class RowConvGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override;
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc
deleted file mode 100644
index 8ce2d52273d7cc3d523e5d77c2c79b9989b9227f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sample_logits_op.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/sample_logits_op.h"
-#include <memory>
-#include "paddle/fluid/operators/math/sample_prob.h"
-
-namespace paddle {
-namespace operators {
-
-class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Logits",
-             "(Tensor, default: Tensor<float>), The unscaled log probabilities "
-             "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
-             "and K is the class number.");
-    AddInput("Labels",
-             "(Tensor) The ground truth which is a 2-D tensor. Labels is a "
-             "Tensor<int64> with shape [N x NT], where NT is the number of"
-             "true labels for each example.");
-    AddInput("CustomizedSamples",
-             "(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
-             "NT + S],"
-             " where N is the batch size, NT is the number of true labels "
-             "and S is the number of negtive sample for each example."
-             "The first NT elements of each row should be the same with true "
-             "labels, "
-             "followed by S custom negtive samples. This tensor"
-             "is only used when use_customized_samples is true.")
-        .AsDispensable();
-    AddInput(
-        "CustomizedProbabilities",
-        "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
-        "The tensor has the same shape with CustomSamples,"
-        "and each element represents probability of element in CustomSamples. "
-        "This "
-        "tensor is only used when use_customized_samples is true.")
-        .AsDispensable();
-    AddOutput("Samples",
-              "(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
-              "NT + S]."
-              "The outputs value of sampler, including NT true lables and S "
-              "negetive samples "
-              "for each example. This will be used in"
-              "backward calculation.")
-        .AsIntermediate();
-    AddOutput(
-        "Probabilities",
-        "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
-        "The probabilites of sampled positive and negtive labels.")
-        .AsIntermediate();
-    AddOutput("LogitsDim", "Store dim information of Logits for gradient op")
-        .AsIntermediate();
-    AddOutput("LabelsDim", "Store dim information of Logits for gradient op")
-        .AsIntermediate();
-    AddOutput("SampledLogits",
-              "(Tensor, default: Tensor<float>), A 2-D tensor with shape"
-              "[N, NT + S]. The outputs value of sampled logits, which will be"
-              "used in backward propagation.")
-        .AsIntermediate();
-    AddOutput(
-        "SampledLabels",
-        "(Tensor, default: Tensor<int64>), A 2-D tensor. The sampled labels"
-        "with shape [N, NT]. The tonsor contains hard labels as input to "
-        " softmax op, that is 0, 1, ..., NT-1 because of the first NT elements"
-        " of Sampels are positive lables.");
-    AddAttr<bool>(
-        "use_customized_samples",
-        "An indicator whether to use customized samples with probabilities, if "
-        "True"
-        "the operator will use customized samples and customized probabilities"
-        "otherwise, the operator will generate them by itself.")
-        .SetDefault(false);
-    AddAttr<bool>(
-        "uniq",
-        "An indicator whether to sample non-repetitive negtive labels, if True"
-        "the operator will sample negtive labels without replacement."
-        "Otherwise, the operator will sample negtive labels with replacement.")
-        .SetDefault(true);
-    AddAttr<bool>(
-        "remove_accidental_hits",
-        "An indicator whether to remove accidental hits when samples hits true"
-        "labels, the removal is implemented by subtracting the corresponding"
-        "logits by float_max to subpress their softmax to be zero.")
-        .SetDefault(true);
-    AddAttr<int>("num_samples", "The number of negative samples.");
-    AddAttr<int>("seed", "Random seed for generating samples").SetDefault(0);
-
-    AddComment(R"DOC(
-  """
-  Computes sampled output training logits and labels suitable for implementing
-  sampled softmax.        
-  """
-
-)DOC");
-  }
-};
-
-class SampleLogitsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Logits"),
-                   "Input(Logits) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) should be not null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Samples"),
-                   "Output(Samples) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Probabilities"),
-                   "Output(Probabilities) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("SampledLogits"),
-                   "Output(SampledLogits) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"),
-                   "Output(SampledLabels) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("LogitsDim"),
-                   "Output(LogitsDim) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("LabelsDim"),
-                   "Output(LabelsDim) should be not null.");
-
-    auto logits_dims = ctx->GetInputDim("Logits");
-    auto labels_dims = ctx->GetInputDim("Labels");
-
-    PADDLE_ENFORCE_EQ(
-        logits_dims.size(), 2UL,
-        "The logits of softmax_with_cross_entropy should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
-                      "The labels should be a 2-D tensor.");
-
-    const int num_samples = ctx->Attrs().Get<int>("num_samples");
-    int num_sampled_classes = labels_dims[1] + num_samples;
-    if ((!ctx->IsRuntime()) && labels_dims[1] <= 0) {
-      num_sampled_classes = -1;
-    }
-    ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes});
-    ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes});
-    ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes});
-    ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]});
-
-    // append 0 to shape variable to avoid optimized by memory optimize pass
-    auto logits_dim_vec = framework::vectorize(logits_dims);
-    logits_dim_vec.push_back(0);
-    ctx->SetOutputDim("LogitsDim", framework::make_ddim(logits_dim_vec));
-
-    auto labels_dim_vec = framework::vectorize(labels_dims);
-    labels_dim_vec.push_back(0);
-    ctx->SetOutputDim("LabelsDim", framework::make_ddim(labels_dim_vec));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Logits"));
-    framework::OpKernelType kt =
-        framework::OpKernelType(data_type, ctx.device_context());
-    return kt;
-  }
-};
-
-// UNDERSTAND: InferShape for Grad
-class SampleLogitsOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("LogitsDim"),
-                   "Input(LogitsDim) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LabelsDim"),
-                   "Input(LabelsDim) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Samples"),
-                   "Input(Samples) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")),
-                   "Input(SampledLogits@Grad) should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
-                   "Output(Logits@Grad) should be not null.");
-
-    auto logits_dims = ctx->GetInputDim("LogitsDim");
-    logits_dims = framework::DDim(logits_dims.Get(), logits_dims.size() - 1);
-    auto labels_dims = ctx->GetInputDim("LabelsDim");
-    labels_dims = framework::DDim(labels_dims.Get(), labels_dims.size() - 1);
-    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
-                      "The label should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(logits_dims.size(), 2UL,
-                      "The logits should be a 2-D tensor.");
-
-    ctx->SetOutputDim(framework::GradVarName("Logits"), logits_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(
-        ctx.InputVar(framework::GradVarName("SampledLogits")));
-    framework::OpKernelType kt =
-        framework::OpKernelType(data_type, ctx.device_context());
-    return kt;
-  }
-};
-
-// UNDERSTAND: what's the rule for making a GradMaker TODO
-class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* grad_op = new framework::OpDesc();
-    grad_op->SetType("sample_logits_grad");
-    grad_op->SetInput("LogitsDim", Output("LogitsDim"));
-    grad_op->SetInput("LabelsDim", Output("LabelsDim"));
-    grad_op->SetInput("Samples", Output("Samples"));
-    grad_op->SetInput(framework::GradVarName("SampledLogits"),
-                      OutputGrad("SampledLogits"));
-    grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(sample_logits, ops::SampleLogitsOp, ops::SampleLogitsOpMaker,
-                  ops::SampleLogitsGradMaker);
-REGISTER_OPERATOR(sample_logits_grad, ops::SampleLogitsOpGrad);
-REGISTER_OP_CPU_KERNEL(sample_logits, ops::SampleLogitsKernel<float>,
-                       ops::SampleLogitsKernel<double>);
-REGISTER_OP_CPU_KERNEL(sample_logits_grad, ops::SampleLogitsGradKernel<float>,
-                       ops::SampleLogitsGradKernel<double>);
diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu
deleted file mode 100644
index fb49793b730f72d66dc846f233bd95ebdab37c52..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sample_logits_op.cu
+++ /dev/null
@@ -1,257 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sample_prob.h"
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/sample_logits_op.h"
-
-namespace paddle {
-namespace operators {
-
-// UNDERSTAND: something like take_along_axis in numpy.
-template <typename T>
-__global__ void GPUTakeAlongD1(size_t size, const int batch_size,
-                               const int array_slice_size,
-                               const int idx_slice_size, const T* p_array,
-                               const int64_t* p_index, T* p_value) {
-  const auto value_slice_size = idx_slice_size;
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = blockDim.x * gridDim.x;
-
-  for (; idx < size; idx += step_size) {
-    int i = idx / idx_slice_size;
-    auto array_index = p_index[idx];
-    p_value[idx] = p_array[i * array_slice_size + array_index];
-  }
-}
-
-// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
-// indices, scatter is done in += way.
-template <typename T>
-__global__ void GPUPutAlongD1(size_t size, const int batch_size,
-                              const int array_slice_size,
-                              const int idx_slice_size, T* p_array,
-                              const int64_t* p_index, const T* p_value) {
-  const auto value_slice_size = idx_slice_size;
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = blockDim.x * gridDim.x;
-
-  // size == batch_size
-  for (; idx < size; idx += step_size) {
-    int i = idx;
-    for (int j = 0; j < idx_slice_size; ++j) {
-      auto array_index = p_index[i * idx_slice_size + j];
-      p_array[i * array_slice_size + array_index] +=
-          p_value[i * idx_slice_size + j];
-    }
-  }
-}
-
-// UNDERSTAND: set label as 0,1,...,num_true-1
-template <typename T>
-__global__ void GPUSetLabel(size_t size, const int num_true, int64_t* p_array) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = blockDim.x * gridDim.x;
-
-  for (; idx < size; idx += step_size) {
-    p_array[idx] = idx % num_true;
-  }
-}
-
-// UNDERSTAND: compute accidentdal hits from samples and minus corresponding
-// logits by a float max, here 1e20
-template <typename T>
-__global__ void gpu_compute_remove_accidental_hits(const int size,
-                                                   const int num_true,
-                                                   const int idx_slice_size,
-                                                   const int64_t* p_index,
-                                                   T* p_value) {
-  const auto value_slice_size = idx_slice_size;
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = blockDim.x * gridDim.x;
-
-  for (; idx < size; idx += step_size) {
-    int i = idx / idx_slice_size;
-    if (idx % idx_slice_size < num_true) continue;
-    for (int j = 0; j < num_true; ++j) {
-      const auto true_idx = i * idx_slice_size + j;
-      if (p_index[true_idx] == p_index[idx]) {
-        p_value[idx] -= 1e20;
-        break;
-      }
-    }
-  }
-}
-
-template <typename T>
-class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
- public:
-  using Tensor = framework::Tensor;
-  void Compute(const framework::ExecutionContext& context) const override {
-    // get necessary inputs
-    const Tensor* logits = context.Input<Tensor>("Logits");
-    const Tensor* labels = context.Input<Tensor>("Labels");
-    VLOG(3) << "Enter SampleLogitsCUDAKernel";
-
-    // get necessary outputs
-    Tensor* samples = context.Output<Tensor>("Samples");
-    Tensor* probabilities = context.Output<Tensor>("Probabilities");
-    Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
-    Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");
-
-    // shapes
-    const auto batch_size = logits->dims()[0];
-    const auto num_classes = logits->dims()[1];
-    const auto labels_dim = labels->dims();
-    const auto num_true = labels_dim[1];
-    const auto samples_dim = samples->dims();
-
-    // attrs
-    const auto num_samples = context.Attr<int>("num_samples");
-    const bool use_customized_samples =
-        context.Attr<bool>("use_customized_samples");
-    const bool uniq = context.Attr<bool>("uniq");
-    const bool remove_accidental_hits =
-        context.Attr<bool>("remove_accidental_hits");
-
-    // device contexts
-    auto& dev_ctx = context.cuda_device_context();
-
-    // UNDERSTAND: allocate memories for temporaries
-    sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
-    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    set_zero(dev_ctx, sampled_logits, static_cast<T>(0));
-
-    auto sampled_labels_data =
-        sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
-    int threads = 512;
-    size_t size = batch_size * num_true;
-    int grid = (size + threads - 1) / threads;
-    GPUSetLabel<
-        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-        size, num_true, sampled_labels_data);
-
-    if (use_customized_samples) {
-      const Tensor* customized_samples =
-          context.Input<Tensor>("CustomizedSamples");
-      const Tensor* customized_probabilities =
-          context.Input<Tensor>("CustomizedProbabilities");
-      samples->ShareDataWith(*customized_samples);
-      probabilities->ShareDataWith(*customized_probabilities);
-    } else {
-      samples->mutable_data<int64_t>(context.GetPlace());
-      probabilities->mutable_data<T>(samples_dim, context.GetPlace());
-      // UNDERSTAND: sampling
-      const auto seed = context.Attr<int>("seed");
-      auto sampler_with_prob = math::GPUSampleWithProb<T>();
-      sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq,
-                        num_samples, labels, samples, probabilities);
-    }
-
-    // UNDERSTAND: gather sampled logits and remove accidental hits if needed
-    const auto num_take = samples->dims()[1];
-    const auto array_dims = logits->dims();
-    const auto idx_dims = samples->dims();
-
-    const T* p_array = logits->data<T>();
-    const int64_t* p_index = samples->data<int64_t>();
-    T* p_value = sampled_logits->data<T>();
-
-    // src slice size
-    const auto array_slice_size = array_dims[1];
-    // index slice size
-    const auto idx_slice_size = idx_dims[1];
-
-    size = batch_size * num_take;
-    grid = (size + threads - 1) / threads;
-    GPUTakeAlongD1<
-        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
-        p_value);
-
-    if (remove_accidental_hits) {
-      const size_t size = batch_size * (num_true + num_samples);
-      int grid = (size + threads - 1) / threads;
-      gpu_compute_remove_accidental_hits<
-          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-          size, num_true, idx_slice_size, p_index, p_value);
-    }
-
-    // subtracted sampled logits with logQ(y|x)
-    auto probs = EigenMatrix<T>::From(*probabilities);
-    auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
-    smp_logits.device(*dev_ctx.eigen_device()) =
-        (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
-            .unaryExpr(TolerableValue<T>());
-  }
-};
-
-template <typename T>
-class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  using Tensor = framework::Tensor;
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
-    const Tensor* samples = context.Input<Tensor>("Samples");
-    const Tensor* sampled_logits_grad =
-        context.Input<Tensor>(framework::GradVarName("SampledLogits"));
-    logits_grad->mutable_data<T>(context.GetPlace());
-
-    auto& dev_ctx = context.cuda_device_context();
-    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    set_zero(dev_ctx, logits_grad, static_cast<T>(0));
-
-    // UNDERSTAND: scatter it back to logit_grad
-    const auto batch_size = samples->dims()[0];
-    const auto num_put = samples->dims()[1];
-    const auto array_dims = logits_grad->dims();
-    const auto idx_dims = samples->dims();
-
-    T* p_array = logits_grad->data<T>();
-    const int64_t* p_index = samples->data<int64_t>();
-    const T* p_value = sampled_logits_grad->data<T>();
-
-    // src slice size
-    const auto array_slice_size = array_dims[1];
-    // index slice size
-    const auto idx_slice_size = idx_dims[1];
-
-    int threads = 128;
-    const size_t size = batch_size;
-    int grid = (size + threads - 1) / threads;
-
-    GPUPutAlongD1<
-        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
-        p_value);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(sample_logits, ops::SampleLogitsCUDAKernel<float>,
-                        ops::SampleLogitsCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(sample_logits_grad,
-                        ops::SampleLogitsGradCUDAKernel<float>,
-                        ops::SampleLogitsGradCUDAKernel<double>);
diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h
deleted file mode 100644
index 18ef6c9d3fe62a913413cf8c84e23b7c6accfc5c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sample_logits_op.h
+++ /dev/null
@@ -1,249 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sample_prob.h"
-#include "paddle/fluid/operators/math/softmax.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-struct TolerableValue {
-  HOSTDEVICE T operator()(const T& x) const {
-    PADDLE_ENFORCE(std::is_floating_point<T>::value,
-                   "TolerableValue should be float in sample_logits_op.");
-    const T kApproInf = 1e20;
-    if (x == INFINITY) return kApproInf;
-    if (x == -INFINITY) return -kApproInf;
-    return x;
-  }
-};
-
-// UNDERSTAND: something like take_along_axis in numpy.
-template <typename T>
-static void CPUTakeAlongD1(const platform::DeviceContext& ctx,
-                           const framework::Tensor& array,
-                           const framework::Tensor& index,
-                           framework::Tensor* value) {
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true);
-  // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
-  PADDLE_ENFORCE_EQ(index.dims().size(), 2);
-  PADDLE_ENFORCE_EQ(array.dims().size(), 2);
-  PADDLE_ENFORCE_EQ(index.dims()[0], array.dims()[0]);
-  PADDLE_ENFORCE_EQ(index.dims(), value->dims());
-
-  const auto batch_size = index.dims()[0];
-  const auto num_take = index.dims()[1];
-  const auto array_dims = array.dims();
-  const auto idx_dims = index.dims();
-
-  // UNDERSTAND: no allocations here
-  const T* p_array = array.data<T>();
-  const int64_t* p_index = index.data<int64_t>();
-  T* p_value = value->data<T>();
-
-  // src slice size
-  const auto array_slice_size = array_dims[1];
-
-  // index slice size
-  const auto idx_slice_size = idx_dims[1];
-  const auto value_slice_size = idx_slice_size;
-
-  for (int i = 0; i < batch_size; ++i) {
-    for (int j = 0; j < num_take; ++j) {
-      auto array_index = p_index[i * idx_slice_size + j];
-      p_value[i * value_slice_size + j] =
-          p_array[i * array_slice_size + array_index];
-    }
-  }
-}
-
-// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
-// indices, scatter is done in += way.
-template <typename T>
-static void CPUPutAlongD1(const platform::DeviceContext& ctx,
-                          framework::Tensor* array,
-                          const framework::Tensor& index,
-                          const framework::Tensor& value) {
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true);
-  // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
-  PADDLE_ENFORCE_EQ(index.dims().size(), 2);
-  PADDLE_ENFORCE_EQ(array->dims().size(), 2);
-  PADDLE_ENFORCE_EQ(index.dims()[0], array->dims()[0]);
-  PADDLE_ENFORCE_EQ(index.dims(), value.dims());
-  const auto batch_size = index.dims()[0];
-  const auto num_put = index.dims()[1];
-  auto array_dims = array->dims();
-  auto idx_dims = index.dims();
-
-  // UNDERSTAND: no allocations here
-  T* p_array = array->data<T>();
-  const int64_t* p_index = index.data<int64_t>();
-  const T* p_value = value.data<T>();
-
-  // slice sizes
-  const auto array_slice_size = array_dims[1];
-  const auto idx_slice_size = idx_dims[1];
-  const auto value_slice_size = idx_slice_size;
-
-  for (int i = 0; i < batch_size; ++i) {
-    for (int j = 0; j < num_put; ++j) {
-      auto array_index = p_index[i * idx_slice_size + j];
-      p_array[i * array_slice_size + array_index] +=
-          p_value[i * value_slice_size + j];
-    }
-  }
-}
-
-// UNDERSTAND: compute accidentdal hits from samples and minus corresponding
-// logits by a float max, here 1e20
-template <typename T>
-static void compute_remove_accidental_hits(const platform::DeviceContext& ctx,
-                                           framework::Tensor* sampled_logits,
-                                           const framework::Tensor& samples,
-                                           const int num_true) {
-  const auto batch_size = sampled_logits->dims()[0];
-  const auto num_sampled_classes = sampled_logits->dims()[1];
-  T* sampled_logits_data = sampled_logits->data<T>();
-  const auto samples_data = samples.data<int64_t>();
-
-  std::unordered_set<int64_t> tmp_true_labels;
-  for (int i = 0; i < batch_size; ++i) {
-    tmp_true_labels.clear();
-    tmp_true_labels.insert(samples_data + i * num_sampled_classes,
-                           samples_data + i * num_sampled_classes + num_true);
-    for (int j = num_true; j < num_sampled_classes; ++j) {
-      const auto idx = i * num_sampled_classes + j;
-      if (tmp_true_labels.find(samples_data[idx]) != tmp_true_labels.end())
-        sampled_logits_data[idx] -= 1e20;
-    }
-  }
-}
-
-template <typename T>
-class SampleLogitsKernel : public framework::OpKernel<T> {
- public:
-  using Tensor = framework::Tensor;
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(context.GetPlace()), true,
-                      "This kernel only runs on CPU.");
-    VLOG(3) << "Enter SampleLogitsKernel";
-    // get necessary inputs
-    const Tensor* logits = context.Input<Tensor>("Logits");
-    const Tensor* labels = context.Input<Tensor>("Labels");
-
-    // get necessary outputs
-    Tensor* samples = context.Output<Tensor>("Samples");
-    Tensor* probabilities = context.Output<Tensor>("Probabilities");
-    Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
-    Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");
-
-    // shapes
-    const auto batch_size = logits->dims()[0];
-    const auto num_classes = logits->dims()[1];
-    const auto labels_dim = labels->dims();
-    const auto num_true = labels_dim[1];
-    const auto samples_dim = samples->dims();
-
-    // attrs
-    const auto num_samples = context.Attr<int>("num_samples");
-    const bool use_customized_samples =
-        context.Attr<bool>("use_customized_samples");
-    const bool remove_accidental_hits =
-        context.Attr<bool>("remove_accidental_hits");
-
-    // device contexts
-    auto& dev_ctx =
-        context.template device_context<platform::CPUDeviceContext>();
-
-    // UNDERSTAND: allocate memories for temporaries
-    sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
-    auto sampled_labels_data =
-        sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
-    for (int i = 0; i < batch_size; ++i) {
-      for (int j = 0; j < num_true; ++j) {
-        sampled_labels_data[i * num_true + j] = j;
-      }
-    }
-
-    if (use_customized_samples) {
-      const Tensor* customized_samples =
-          context.Input<Tensor>("CustomizedSamples");
-      const Tensor* customized_probabilities =
-          context.Input<Tensor>("CustomizedProbabilities");
-      samples->ShareDataWith(*customized_samples);
-      probabilities->ShareDataWith(*customized_probabilities);
-    } else {
-      samples->mutable_data<int64_t>(context.GetPlace());
-      probabilities->mutable_data<T>(samples_dim, context.GetPlace());
-      // UNDERSTAND: sampling
-      const auto seed = context.Attr<int>("seed");
-      auto sampler_with_prob =
-          math::SampleWithProb<platform::CPUDeviceContext, T>();
-      sampler_with_prob(dev_ctx, math::LogUniformSampler(num_classes, seed),
-                        num_samples, labels, samples, probabilities);
-    }
-
-    // UNDERSTAND: gather sampled logits and remove accidental hits if needed
-    CPUTakeAlongD1<T>(dev_ctx, *logits, *samples, sampled_logits);
-    if (remove_accidental_hits) {
-      compute_remove_accidental_hits<T>(dev_ctx, sampled_logits, *samples,
-                                        num_true);
-    }
-
-    // subtracted sampled logits with logQ(y|x)
-    auto probs = EigenMatrix<T>::From(*probabilities);
-    auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
-    smp_logits.device(*dev_ctx.eigen_device()) =
-        (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
-            .unaryExpr(TolerableValue<T>());
-  }
-};
-
-template <typename T>
-class SampleLogitsGradKernel : public framework::OpKernel<T> {
- public:
-  using Tensor = framework::Tensor;
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
-    const Tensor* samples = context.Input<Tensor>("Samples");
-    const Tensor* sampled_logits_grad =
-        context.Input<Tensor>(framework::GradVarName("SampledLogits"));
-    logits_grad->mutable_data<T>(context.GetPlace());
-
-    auto& dev_ctx =
-        context.template device_context<platform::CPUDeviceContext>();
-    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
-    set_zero(dev_ctx, logits_grad, static_cast<T>(0));
-
-    // UNDERSTAND: scatter it back to logit_grad
-    CPUPutAlongD1<T>(dev_ctx, logits_grad, *samples, *sampled_logits_grad);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc
deleted file mode 100644
index 36712a8d06d3e9a6f582f8296e2c0c4b4b302eb1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sampling_id_op.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sampling_id_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class SamplingIdOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SamplingIdOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SamplingIdOp should not be null.");
-    PADDLE_ENFORCE_LT(ctx->Attrs().Get<float>("min"),
-                      ctx->Attrs().Get<float>("max"), "min must less then max");
-
-    auto input_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(input_dims.size() == 2,
-                   "Input(X, Filter) should be 2-D tensor.");
-
-    auto dim0 = input_dims[0];
-    framework::DDim dims = framework::make_ddim({dim0});
-    ctx->SetOutputDim("Out", dims);
-    ctx->ShareLoD("X", "Out");
-  }
-};
-
-class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of softmax. "
-             "2-D with shape [batch_size, input_feature_dimensions].");
-    AddOutput("Out", "SamplingId data tensor.");
-    AddComment(R"DOC(
-SamplingId Operator.
-A layer for sampling id from multinomial distribution from the
- input. Sampling one id for one sample.)DOC");
-    AddAttr<float>("min", "Minimum value of random. (float, default 0.0).")
-        .SetDefault(0.0f);
-    AddAttr<float>("max", "Maximun value of random. (float, default 1.0).")
-        .SetDefault(1.0f);
-    AddAttr<int>(
-        "seed",
-        "Random seed used for the random number engine. "
-        "0 means use a seed generated by the system."
-        "Note that if seed is not 0, this operator will always "
-        "generate the same random numbers every time. (int, default 0).")
-        .SetDefault(0);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sampling_id, ops::SamplingIdOp, ops::SamplingIdOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
-                       paddle::operators::SamplingIdKernel<double>);
diff --git a/paddle/fluid/operators/sampling_id_op.cu b/paddle/fluid/operators/sampling_id_op.cu
deleted file mode 100644
index a4f0470314d00b5e370fd478736b54579c88448c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sampling_id_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-
-#include "paddle/fluid/operators/sampling_id_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
-                        paddle::operators::SamplingIdKernel<double>);
diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h
deleted file mode 100644
index 133d3f72dbd6ab13c98d124369038309c94cba5b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sampling_id_op.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <iostream>
-#include <iterator>
-#include <random>
-#include <sstream>
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class SamplingIdKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("X");
-    const int batch_size = static_cast<int>(input->dims()[0]);
-    const int width = static_cast<int>(input->dims()[1]);
-
-    PADDLE_ENFORCE_GE(batch_size, 0,
-                      "batch_size(dims[0]) must be nonnegative.");
-    PADDLE_ENFORCE_GE(width, 0, "width(dims[1]) must be nonnegative.");
-
-    std::vector<T> ins_vector;
-    framework::TensorToVector(*input, context.device_context(), &ins_vector);
-
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
-    std::uniform_real_distribution<T> dist(
-        static_cast<T>(context.Attr<float>("min")),
-        static_cast<T>(context.Attr<float>("max")));
-
-    std::vector<int64_t> ids(batch_size);
-    for (int i = 0; i < batch_size; ++i) {
-      T r = dist(engine);
-      int idx = width - 1;
-      for (int j = 0; j < width; ++j) {
-        if ((r -= ins_vector[i * width + j]) < 0) {
-          idx = j;
-          break;
-        }
-      }
-      ids[i] = int64_t(idx);
-    }
-
-    std::vector<int64_t> out_dim;
-    out_dim.push_back(static_cast<int64_t>(batch_size));
-
-    Tensor* output = context.Output<Tensor>("Out");
-    output->Resize(framework::make_ddim(out_dim));
-    output->mutable_data<T>(context.GetPlace());
-    framework::TensorFromVector(ids, context.device_context(), output);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
deleted file mode 100644
index 953e2655d13328b986a67398dca54f8a5e3aedcf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/save_combine_op.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-
-#include "paddle/fluid/operators/save_combine_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class SaveCombineOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(framework::proto::VarType::FP32,
-                                   ctx.GetPlace());
-  }
-  // TODO(lujun): The override here is just to bypass transform
-  //  in operator impl, which is not elegant enough.
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    return expected_kernel_type;
-  }
-};
-
-class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(vector) Input LoDTensors that need to be saved together in a file.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-SaveCombine operator
-
-This operator will serialize and write a list of input LoDTensor variables
-to a file on disk.
-)DOC");
-    AddAttr<bool>("overwrite",
-                  "(boolean, default true)"
-                  "Overwrite the output file if it exists.")
-        .SetDefault(true);
-    AddAttr<bool>("save_as_fp16",
-                  "(boolean, default false)"
-                  "If true, the tensor will be converted to float16 data "
-                  "type and then saved. Otherwise, the tensor will be "
-                  "directly saved without data type conversion.")
-        .SetDefault(false);
-    AddAttr<std::string>(
-        "file_path",
-        "(string)"
-        "The \"file_path\" where the LoDTensor variables will be saved.")
-        .AddCustomChecker(
-            [](const std::string& path) { return !path.empty(); });
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
-                  ops::SaveCombineOpProtoMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    save_combine,
-    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/save_combine_op.cu b/paddle/fluid/operators/save_combine_op.cu
deleted file mode 100644
index 78607823a0368d216310bbbb390fd7face002839..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/save_combine_op.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/save_combine_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    save_combine,
-    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h
deleted file mode 100644
index 4ee82e17dd5e8173ce7dfb5c248890912d2cc7ef..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/save_combine_op.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <fstream>
-#include <numeric>
-#include <sstream>
-#include <string>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class SaveCombineOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto place = ctx.GetPlace();
-    auto filename = ctx.Attr<std::string>("file_path");
-    auto overwrite = ctx.Attr<bool>("overwrite");
-    auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
-
-    bool is_present = FileExists(filename);
-    if (is_present && !overwrite) {
-      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
-                   filename, overwrite);
-    }
-
-    MkDirRecursively(DirName(filename).c_str());
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-
-    auto &inp_var_names = ctx.Inputs("X");
-    auto &inp_vars = ctx.MultiInputVar("X");
-    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
-                      "The number of input variables should be greater than 0");
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    for (size_t i = 0; i < inp_var_names.size(); i++) {
-      PADDLE_ENFORCE(inp_vars[i] != nullptr,
-                     "Cannot find variable %s for save_combine_op",
-                     inp_var_names[i]);
-      PADDLE_ENFORCE(inp_vars[i]->IsType<framework::LoDTensor>(),
-                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
-                     inp_var_names[i]);
-
-      auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
-      // Serialize tensors one by one
-
-      // Check types to see if a fp16 transformation is required
-      auto in_dtype = tensor.type();
-      auto out_dtype =
-          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-      if (in_dtype != out_dtype) {
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor out;
-        // copy LoD info to the new tensor
-        out.set_lod(tensor.lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-        framework::SerializeToStream(fout, out, dev_ctx);
-      } else {
-        framework::SerializeToStream(fout, tensor, dev_ctx);
-      }
-    }
-    fout.close();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
deleted file mode 100644
index 5594de16b6789e99d5c4cc6828889eb0e311624a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ /dev/null
@@ -1,346 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <iostream>
-#include <string>
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-
-USE_CPU_ONLY_OP(save_combine);
-USE_CPU_ONLY_OP(load_combine);
-
-template <typename T, typename U>
-T* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
-                          std::string var_name,
-                          const paddle::platform::CPUPlace& place,
-                          paddle::framework::Scope* scope,
-                          paddle::framework::LoD* expect_lod) {
-  auto var = scope->Var(var_name);
-  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({x, y});
-  expect_lod->resize(1);
-  for (size_t i = 0; i < lod_info.size(); i++) {
-    (*expect_lod)[0].push_back(lod_info[i]);
-  }
-  tensor->set_lod(*expect_lod);
-  T* expect = tensor->mutable_data<T>(place);
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<T>(
-        static_cast<U>(i));  // For FP16, we intend to do float(float16(i))
-  }
-  return expect;
-}
-
-paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
-    const std::string out_var_name, paddle::framework::Scope* scope) {
-  auto load_var = scope->Var(out_var_name);
-  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
-  return target;
-}
-
-template <typename T>
-T* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
-                               const paddle::framework::Scope& scope,
-                               paddle::framework::LoD* actual_lod) {
-  T* actual = target->data<T>();
-  *actual_lod = target->lod();
-  return actual;
-}
-
-template <typename T, typename U>
-void CheckValues(T* expect, U* actual, const paddle::framework::LoD& expect_lod,
-                 const paddle::framework::LoD& actual_lod, const int& numel) {
-  for (int i = 0; i < numel; ++i) {
-    EXPECT_EQ(expect[i], static_cast<T>(actual[i]));
-  }
-  EXPECT_EQ(expect_lod.size(), actual_lod.size());
-  for (size_t i = 0; i < expect_lod.size(); ++i) {
-    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
-      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
-    }
-  }
-}
-
-// Here, we create 4 LoDTensors and use save_combine_op to first save these
-// in a single file. Then, we use load_combine_op to load these sequentially
-TEST(SaveLoadCombineOp, CPU) {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-
-  std::vector<int> lod1 = {0, 1, 2, 3, 10};
-  int numel1 = 100;
-  paddle::framework::LoD expect_lod1;
-  int* expect1 = CreateForSaveCombineOp<int, int>(10, 10, lod1, "test_var1",
-                                                  place, &scope, &expect_lod1);
-
-  std::vector<int> lod2 = {0, 2, 5, 10};
-  int numel2 = 200;
-  paddle::framework::LoD expect_lod2;
-  int* expect2 = CreateForSaveCombineOp<int, int>(10, 20, lod2, "test_var2",
-                                                  place, &scope, &expect_lod2);
-
-  std::vector<int> lod3 = {0, 2, 3, 20};
-  int numel3 = 4000;
-  paddle::framework::LoD expect_lod3;
-  int* expect3 = CreateForSaveCombineOp<int, int>(20, 200, lod3, "test_var3",
-                                                  place, &scope, &expect_lod3);
-
-  std::vector<int> lod4 = {0, 1, 20};
-  int numel4 = 1000;
-  paddle::framework::LoD expect_lod4;
-  int* expect4 = CreateForSaveCombineOp<int, int>(20, 50, lod4, "test_var4",
-                                                  place, &scope, &expect_lod4);
-
-  // Set attributes
-  std::string filename = "check_tensor.ls";
-  paddle::framework::AttributeMap attrs;
-  attrs.insert({"file_path", std::string(filename)});
-
-  // Run the save_combine_op
-  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
-      "save_combine",
-      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
-  save_combine_op->Run(scope, place);
-
-  // Set up output vars
-  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", &scope);
-  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", &scope);
-  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", &scope);
-  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", &scope);
-
-  // Run the load_combine_op
-  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
-      "load_combine", {},
-      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
-  load_combine_op->Run(scope, place);
-
-  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
-  int* actual1 = GetValuesAfterLoadCombineOp<int>(target1, scope, &actual_lod1);
-  int* actual2 = GetValuesAfterLoadCombineOp<int>(target2, scope, &actual_lod2);
-  int* actual3 = GetValuesAfterLoadCombineOp<int>(target3, scope, &actual_lod3);
-  int* actual4 = GetValuesAfterLoadCombineOp<int>(target4, scope, &actual_lod4);
-
-  CheckValues<int, int>(expect1, actual1, expect_lod1, actual_lod1, numel1);
-  CheckValues<int, int>(expect2, actual2, expect_lod2, actual_lod2, numel2);
-  CheckValues<int, int>(expect3, actual3, expect_lod3, actual_lod3, numel3);
-  CheckValues<int, int>(expect4, actual4, expect_lod4, actual_lod4, numel4);
-}
-
-// FP16 version of SaveLoadCombineOp Test, only altering the saving aspect
-// to save as FP16.
-TEST(SaveCombineFP16Op, CPU) {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-
-  std::vector<int> lod1 = {0, 1, 2, 3, 10};
-  int numel1 = 100;
-  paddle::framework::LoD expect_lod1;
-  float* expect1 = CreateForSaveCombineOp<float, paddle::platform::float16>(
-      10, 10, lod1, "test_var1", place, &scope, &expect_lod1);
-
-  std::vector<int> lod2 = {0, 2, 5, 10};
-  int numel2 = 200;
-  paddle::framework::LoD expect_lod2;
-  float* expect2 = CreateForSaveCombineOp<float, paddle::platform::float16>(
-      10, 20, lod2, "test_var2", place, &scope, &expect_lod2);
-
-  std::vector<int> lod3 = {0, 20};
-  int numel3 = 4000;
-  paddle::framework::LoD expect_lod3;
-  float* expect3 = CreateForSaveCombineOp<float, paddle::platform::float16>(
-      20, 200, lod3, "test_var3", place, &scope, &expect_lod3);
-
-  std::vector<int> lod4 = {0, 1, 20};
-  int numel4 = 1000;
-  paddle::framework::LoD expect_lod4;
-  float* expect4 = CreateForSaveCombineOp<float, paddle::platform::float16>(
-      20, 50, lod4, "test_var4", place, &scope, &expect_lod4);
-
-  // Set attributes
-  std::string filename = "check_tensor_fp16_save.ls";
-  paddle::framework::AttributeMap attrs;
-  attrs.insert({"file_path", std::string(filename)});
-  attrs.insert({"save_as_fp16", true});
-
-  // Run the save_combine_op
-  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
-      "save_combine",
-      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
-  save_combine_op->Run(scope, place);
-
-  // Set up output vars
-  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", &scope);
-  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", &scope);
-  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", &scope);
-  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", &scope);
-
-  // Run the load_combine_op
-  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
-      "load_combine", {},
-      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
-  load_combine_op->Run(scope, place);
-
-  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
-  paddle::platform::float16* actual1 =
-      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target1, scope,
-                                                             &actual_lod1);
-  paddle::platform::float16* actual2 =
-      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target2, scope,
-                                                             &actual_lod2);
-  paddle::platform::float16* actual3 =
-      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target3, scope,
-                                                             &actual_lod3);
-  paddle::platform::float16* actual4 =
-      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target4, scope,
-                                                             &actual_lod4);
-
-  CheckValues<float, paddle::platform::float16>(expect1, actual1, expect_lod1,
-                                                actual_lod1, numel1);
-  CheckValues<float, paddle::platform::float16>(expect2, actual2, expect_lod2,
-                                                actual_lod2, numel2);
-  CheckValues<float, paddle::platform::float16>(expect3, actual3, expect_lod3,
-                                                actual_lod3, numel3);
-  CheckValues<float, paddle::platform::float16>(expect4, actual4, expect_lod4,
-                                                actual_lod4, numel4);
-}
-
-// FP16 version of SaveLoadCombineOp Test, only altering the loading aspect
-// to load tensors with FP16 precision.
-TEST(LoadCombineFP16Op, CPU) {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-
-  std::vector<int> lod1 = {0, 1, 2, 3, 10};
-  int numel1 = 100;
-  paddle::framework::LoD expect_lod1;
-  float* expect1 = CreateForSaveCombineOp<float, paddle::platform::float16>(
-      10, 10, lod1, "test_var1", place, &scope, &expect_lod1);
-
-  std::vector<int> lod2 = {0, 2, 5, 10};
-  int numel2 = 200;
-  paddle::framework::LoD expect_lod2;
-  float* expect2 = CreateForSaveCombineOp<float, paddle::platform::float16>(
-      10, 20, lod2, "test_var2", place, &scope, &expect_lod2);
-
-  std::vector<int> lod3 = {0, 20};
-  int numel3 = 4000;
-  paddle::framework::LoD expect_lod3;
-  float* expect3 = CreateForSaveCombineOp<float, paddle::platform::float16>(
-      20, 200, lod3, "test_var3", place, &scope, &expect_lod3);
-
-  std::vector<int> lod4 = {0, 1, 20};
-  int numel4 = 1000;
-  paddle::framework::LoD expect_lod4;
-  float* expect4 = CreateForSaveCombineOp<float, paddle::platform::float16>(
-      20, 50, lod4, "test_var4", place, &scope, &expect_lod4);
-
-  // Set attributes
-  std::string filename = "check_tensor_fp16_load.ls";
-  paddle::framework::AttributeMap attrs;
-  attrs.insert({"file_path", std::string(filename)});
-
-  // Run the save_combine_op
-  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
-      "save_combine",
-      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
-  save_combine_op->Run(scope, place);
-
-  // Set up output vars
-  auto load_var1 = scope.Var("out_var1");
-  auto load_var2 = scope.Var("out_var2");
-  auto load_var3 = scope.Var("out_var3");
-  auto load_var4 = scope.Var("out_var4");
-
-  attrs.insert({"load_as_fp16", true});
-  // Run the load_combine_op
-  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
-      "load_combine", {},
-      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
-  load_combine_op->Run(scope, place);
-
-  auto* target1 = load_var1->GetMutable<paddle::framework::LoDTensor>();
-  auto* target2 = load_var2->GetMutable<paddle::framework::LoDTensor>();
-  auto* target3 = load_var3->GetMutable<paddle::framework::LoDTensor>();
-  auto* target4 = load_var4->GetMutable<paddle::framework::LoDTensor>();
-
-  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
-  paddle::platform::float16* actual1 =
-      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target1, scope,
-                                                             &actual_lod1);
-  paddle::platform::float16* actual2 =
-      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target2, scope,
-                                                             &actual_lod2);
-  paddle::platform::float16* actual3 =
-      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target3, scope,
-                                                             &actual_lod3);
-  paddle::platform::float16* actual4 =
-      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target4, scope,
-                                                             &actual_lod4);
-
-  CheckValues<float, paddle::platform::float16>(expect1, actual1, expect_lod1,
-                                                actual_lod1, numel1);
-  CheckValues<float, paddle::platform::float16>(expect2, actual2, expect_lod2,
-                                                actual_lod2, numel2);
-  CheckValues<float, paddle::platform::float16>(expect3, actual3, expect_lod3,
-                                                actual_lod3, numel3);
-  CheckValues<float, paddle::platform::float16>(expect4, actual4, expect_lod4,
-                                                actual_lod4, numel4);
-}
-
-// Test with original SaveLoadTest
-TEST(SaveLoadTestWithCombineOp, CPU) {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-
-  auto var = scope.Var("test_var");
-  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({3, 4000});
-  paddle::framework::LoD expect_lod;
-  expect_lod.resize(1);
-  expect_lod[0].push_back(0);
-  expect_lod[0].push_back(1);
-  expect_lod[0].push_back(2);
-  expect_lod[0].push_back(3);
-
-  tensor->set_lod(expect_lod);
-  int* expect = tensor->mutable_data<int>(place);
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<int>(i);
-  }
-  paddle::framework::AttributeMap attrs;
-  attrs.insert({"file_path", std::string("check_t.save")});
-
-  auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "save_combine", {{"X", {"test_var"}}}, {}, attrs);
-  save_op->Run(scope, place);
-
-  auto load_var = scope.Var("out_var");
-  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
-  auto load_op = paddle::framework::OpRegistry::CreateOp(
-      "load_combine", {}, {{"Out", {"out_var"}}}, attrs);
-  load_op->Run(scope, place);
-  int* actual = target->data<int>();
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    EXPECT_EQ(expect[i], actual[i]);
-  }
-  auto& actual_lod = target->lod();
-  EXPECT_EQ(expect_lod.size(), actual_lod.size());
-  for (size_t i = 0; i < expect_lod.size(); ++i) {
-    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
-      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
-    }
-  }
-}
diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc
deleted file mode 100644
index d277198a2f92c426586e774873c6770b93660e85..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/save_load_op_test.cc
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-
-USE_CPU_ONLY_OP(save);
-USE_CPU_ONLY_OP(load);
-
-TEST(SaveLoadOp, CPU) {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-
-  auto var = scope.Var("test_var");
-  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({3, 10});
-  paddle::framework::LoD expect_lod;
-  expect_lod.resize(1);
-  expect_lod[0].push_back(0);
-  expect_lod[0].push_back(1);
-  expect_lod[0].push_back(2);
-  expect_lod[0].push_back(3);
-
-  tensor->set_lod(expect_lod);
-  int* expect = tensor->mutable_data<int>(place);
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<int>(i);
-  }
-  paddle::framework::AttributeMap attrs;
-  attrs.insert({"file_path", std::string("tensor.save")});
-
-  auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "save", {{"X", {"test_var"}}}, {}, attrs);
-  save_op->Run(scope, place);
-
-  auto load_var = scope.Var("out_var");
-  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
-  auto load_op = paddle::framework::OpRegistry::CreateOp(
-      "load", {}, {{"Out", {"out_var"}}}, attrs);
-  load_op->Run(scope, place);
-  int* actual = target->data<int>();
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    EXPECT_EQ(expect[i], actual[i]);
-  }
-  auto& actual_lod = target->lod();
-  EXPECT_EQ(expect_lod.size(), actual_lod.size());
-  for (size_t i = 0; i < expect_lod.size(); ++i) {
-    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
-      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
-    }
-  }
-}
-
-TEST(SaveFP16Op, CPU) {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-
-  auto var = scope.Var("test_var");
-  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({3, 10});
-  paddle::framework::LoD expect_lod;
-  expect_lod.resize(1);
-  expect_lod[0].push_back(0);
-  expect_lod[0].push_back(1);
-  expect_lod[0].push_back(2);
-  expect_lod[0].push_back(3);
-
-  tensor->set_lod(expect_lod);
-  float* expect = tensor->mutable_data<float>(place);
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<float>(paddle::platform::float16(i));
-  }
-
-  paddle::framework::AttributeMap attrs;
-  attrs.insert({"file_path", std::string("tensor.save")});
-  attrs.insert({"save_as_fp16", true});
-
-  auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "save", {{"X", {"test_var"}}}, {}, attrs);
-  save_op->Run(scope, place);
-
-  auto load_var = scope.Var("out_var");
-  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
-  auto load_op = paddle::framework::OpRegistry::CreateOp(
-      "load", {}, {{"Out", {"out_var"}}}, attrs);
-  load_op->Run(scope, place);
-  paddle::platform::float16* actual = target->data<paddle::platform::float16>();
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
-  }
-  auto& actual_lod = target->lod();
-  EXPECT_EQ(expect_lod.size(), actual_lod.size());
-  for (size_t i = 0; i < expect_lod.size(); ++i) {
-    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
-      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
-    }
-  }
-}
-
-TEST(LoadFP16Op, CPU) {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-
-  auto var = scope.Var("test_var");
-  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({3, 10});
-
-  paddle::framework::LoD expect_lod;
-  expect_lod.resize(1);
-  expect_lod[0].push_back(0);
-  expect_lod[0].push_back(1);
-  expect_lod[0].push_back(2);
-  expect_lod[0].push_back(3);
-
-  tensor->set_lod(expect_lod);
-  float* expect = tensor->mutable_data<float>(place);
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<float>(paddle::platform::float16(i));
-  }
-
-  paddle::framework::AttributeMap attrs;
-  attrs.insert({"file_path", std::string("tensor.save")});
-  attrs.insert({"load_as_fp16", true});
-
-  auto save_op = paddle::framework::OpRegistry::CreateOp(
-      "save", {{"X", {"test_var"}}}, {}, attrs);
-  save_op->Run(scope, place);
-
-  auto load_var = scope.Var("out_var");
-  load_var->GetMutable<paddle::framework::LoDTensor>();
-  auto load_op = paddle::framework::OpRegistry::CreateOp(
-      "load", {}, {{"Out", {"out_var"}}}, attrs);
-  load_op->Run(scope, place);
-
-  auto target = load_var->Get<paddle::framework::LoDTensor>();
-  paddle::platform::float16* actual = target.data<paddle::platform::float16>();
-  for (int64_t i = 0; i < tensor->numel(); ++i) {
-    EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
-  }
-
-  auto& actual_lod = target.lod();
-  EXPECT_EQ(expect_lod.size(), actual_lod.size());
-  for (size_t i = 0; i < expect_lod.size(); ++i) {
-    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
-      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
-    }
-  }
-}
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
deleted file mode 100644
index c660bbb8ed9a4caf564fd75d3c248827ea46d35a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/save_op.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdint.h>
-#include <fstream>
-#include <numeric>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/save_op.h"
-
-namespace paddle {
-namespace operators {
-class SaveOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {}
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor ) Input LoDTensor and SelectedRows to be saved");
-    AddComment(R"DOC(
-Save operator
-
-This operator will serialize and write LoDTensor / SelectedRows variable to file on disk.
-)DOC");
-    AddAttr<bool>("overwrite",
-                  "(boolean, default true)"
-                  "Overwrite the output file if exist")
-        .SetDefault(true);
-    AddAttr<bool>("save_as_fp16",
-                  "(boolean, default false)"
-                  "If true, the tensor will be converted to float16 data "
-                  "type and then saved. Otherwise, the tensor will be "
-                  "directly saved without data type conversion.")
-        .SetDefault(false);
-    AddAttr<std::string>("file_path",
-                         "(string)"
-                         "The \"file_path\" where the variable will be saved.")
-        .AddCustomChecker(
-            [](const std::string &path) { return !path.empty(); });
-    AddOutput(LOOKUP_TABLE_PATH,
-              "(string)"
-              "for pserver: The \"kLookupTablePath\" where checkpoint notify "
-              "to save lookup table variables"
-              " to directory specified.")
-        .AsDispensable();
-  }
-};
-
-class SaveOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto var_type = framework::proto::VarType::RAW;
-    ctx->SetType(LOOKUP_TABLE_PATH, var_type);
-  }
-};
-
-class SaveOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker,
-                  ops::SaveOpVarTypeInference, ops::SaveOpShapeInference);
-
-REGISTER_OP_CPU_KERNEL(
-    save, ops::SaveOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_op.cu b/paddle/fluid/operators/save_op.cu
deleted file mode 100644
index 0a778a694e52f146b6cceddb969b8af08f40ef9e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/save_op.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/save_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    save, ops::SaveOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SaveOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>);
diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h
deleted file mode 100644
index b59421cb9e08e343a507210316be0d9b06192c49..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/save_op.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <fstream>
-#include <numeric>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace operators {
-// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
-// to directory specified.
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
-template <typename DeviceContext, typename T>
-class SaveOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto place = ctx.GetPlace();
-
-    auto *input_var = ctx.InputVar("X");
-    auto iname = ctx.Inputs("X").data();
-    PADDLE_ENFORCE(input_var != nullptr, "Cannot find variable %s for save_op",
-                   iname);
-
-    if (input_var->IsType<framework::LoDTensor>()) {
-      SaveLodTensor(ctx, place, input_var);
-    } else if (input_var->IsType<framework::SelectedRows>()) {
-      SaveSelectedRows(ctx, place, input_var);
-    } else {
-      PADDLE_ENFORCE(
-          false,
-          "SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
-          iname);
-    }
-  }
-
-  void SaveLodTensor(const framework::ExecutionContext &ctx,
-                     const platform::Place &place,
-                     const framework::Variable *var) const {
-    auto filename = ctx.Attr<std::string>("file_path");
-    auto overwrite = ctx.Attr<bool>("overwrite");
-
-    if (FileExists(filename) && !overwrite) {
-      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
-                   filename, overwrite);
-    }
-
-    MkDirRecursively(DirName(filename).c_str());
-
-    auto &tensor = var->Get<framework::LoDTensor>();
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-
-    auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
-    auto in_dtype = tensor.type();
-    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-    if (in_dtype != out_dtype) {
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor out;
-      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-      // copy LoD info to the new tensor
-      out.set_lod(tensor.lod());
-      framework::SerializeToStream(fout, out, dev_ctx);
-    } else {
-      framework::SerializeToStream(fout, tensor, dev_ctx);
-    }
-    fout.close();
-  }
-
-  void SaveSelectedRows(const framework::ExecutionContext &ctx,
-                        const platform::Place &place,
-                        const framework::Variable *var) const {
-    auto file_path = ctx.Attr<std::string>("file_path");
-    auto overwrite = ctx.Attr<bool>("overwrite");
-
-    std::string filename = file_path;
-    VLOG(4) << "SaveSelectedRows output file_path: " << file_path;
-
-    framework::Variable *out_put_var = ctx.scope().FindVar(LOOKUP_TABLE_PATH);
-    if (out_put_var != nullptr) {
-      auto *lt_var = out_put_var->GetMutable<std::string>();
-      if (lt_var->length() > 0) {
-        VLOG(4) << "SaveSelectedRows output var name: " << *lt_var;
-        filename = *lt_var;
-      }
-    }
-
-    if (FileExists(filename) && !overwrite) {
-      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
-                   filename, overwrite);
-    }
-
-    VLOG(4) << "SaveSelectedRows get File name: " << filename;
-
-    MkDirRecursively(DirName(filename).c_str());
-
-    auto &selectedRows = var->Get<framework::SelectedRows>();
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-    framework::SerializeToStream(fout, selectedRows, dev_ctx);
-    fout.close();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
deleted file mode 100644
index 383e7940fa56586d17cf8819c463c201fbd24050..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scale_op.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/scale_op.h"
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/operators/detail/safe_ref.h"
-
-namespace paddle {
-namespace operators {
-
-class ScaleOp : public framework::OperatorWithKernel {
- public:
-  ScaleOp(const std::string &type, const framework::VariableNameMap &inputs,
-          const framework::VariableNameMap &outputs,
-          const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ScaleOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ScaleOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input tensor of scale operator.");
-    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
-    AddComment(R"DOC(
-**Scale operator**
-
-Apply scaling and bias addition to the input tensor.
-
-if bias_after_scale=True:
-
-$$Out = scale*X + bias$$
-
-else:
-
-$$Out = scale*(X + bias)$$
-)DOC");
-    AddAttr<float>("scale", "The scaling factor of the scale operator.")
-        .SetDefault(1.0);
-    AddAttr<float>("bias", "The bias of the scale operator.").SetDefault(0.0);
-    AddAttr<bool>(
-        "bias_after_scale",
-        "Apply bias addition after or before scaling. It is useful for "
-        "numeric stability in some circumstances.")
-        .SetDefault(true);
-  }
-};
-
-class ScaleOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto &in_var_name = ctx->Input("X").front();
-    auto out_var_name = ctx->Output("Out").front();
-
-    if (in_var_name != out_var_name) {
-      ctx->SetType(out_var_name, ctx->GetType(in_var_name));
-      ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
-    }
-  }
-};
-
-class ScaleGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("scale");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttr("scale", GetAttr("scale"));
-    grad_op->SetAttr("bias", 0.0f);
-    grad_op->SetAttr("bias_after_scale", true);
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-using ScaleOpInplace = framework::SingleOpInplaceInToOut;
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker,
-                  ops::ScaleOpVarTypeInference, ops::ScaleOpInplace);
-REGISTER_OP_CPU_KERNEL(
-    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu
deleted file mode 100644
index e1f20a73b20fc23ec8b99ba0e5154eb184718ca3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scale_op.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/scale_op.h"
-#include "paddle/fluid/platform/float16.h"
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    scale,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   uint8_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int16_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   plat::float16>);
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
deleted file mode 100644
index 96b8b00b429df72569ef2a292c8a600c56159f19..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scale_op.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class ScaleKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in_var = ctx.InputVar("X");
-    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
-
-    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
-    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
-    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
-
-    auto* out_var = ctx.OutputVar("Out");
-    if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
-      auto& in_slr = in_var->Get<framework::SelectedRows>();
-      auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
-      out_slr->set_rows(in_slr.rows());
-      out_slr->set_height(in_slr.height());
-    }
-
-    auto* out =
-        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
-    out->mutable_data<T>(in->place());
-
-    PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
-                      "in and out should have the same dim");
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    if (bias_after_scale) {
-      eigen_out.device(dev) = scale * eigen_in + bias;
-    } else {
-      eigen_out.device(dev) = scale * (eigen_in + bias);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h
deleted file mode 100644
index 0e83219ded28d561b2bf7ef03154632503b75ea4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter.cu.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <unordered_set>
-#include <vector>
-#include "math/math_function.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-template <typename T, typename IndexT = int>
-__global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output,
-                                      size_t index_size, size_t slice_size,
-                                      bool overwrite) {
-  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
-    int indices_i = i / slice_size;
-    int slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT scatter_i = indices[indices_i];
-    IndexT out_i = scatter_i * slice_size + slice_i;
-    *(output + out_i) = static_cast<T>(0);
-  }
-}
-
-template <typename T, typename IndexT = int>
-__global__ void ScatterCUDAKernel(const T* params, const IndexT* indices,
-                                  T* output, size_t index_size,
-                                  size_t slice_size, bool overwrite) {
-  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
-    int indices_i = i / slice_size;
-    int slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT scatter_i = indices[indices_i];
-    IndexT out_i = scatter_i * slice_size + slice_i;
-    if (overwrite) {
-      *(output + out_i) = *(params + i);
-    } else {
-      paddle::platform::CudaAtomicAdd(output + out_i, *(params + i));
-    }
-  }
-}
-
-template <typename T, typename IndexT = int>
-__global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,
-                                    T* output, const int* output_dims,
-                                    size_t remain_size, size_t slice_size,
-                                    size_t end_size) {
-  CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
-    int indices_i = i / slice_size;
-    int slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT gather_i = 0;
-    int64_t temp = slice_size;
-    for (int64_t j = end_size - 1; j >= 0; --j) {
-      IndexT index_value = indices[indices_i * end_size + j];
-      gather_i += (index_value * temp);
-      temp *= output_dims[j];
-    }
-    IndexT output_i = gather_i + slice_i;
-    paddle::platform::CudaAtomicAdd(output + output_i, *(update + i));
-  }
-}
-
-/**
- * A thin wrapper on gpu tensor
- * Return a new updated tensor from source tensor, scatter-assigned according to
- * index
- * input[src]: type-T source Tensor
- * input[index]: type-IndexT index Tensor (1-D)
- * return: output tensor
- */
-template <typename T, typename IndexT = int>
-void GPUScatterAssign(const framework::ExecutionContext& context,
-                      const Tensor& src, const Tensor& index, Tensor* output,
-                      bool overwrite = true) {
-  // check index of shape 1-D
-  const auto& ctx = context.device_context();
-  if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(index.dims()[1], 1,
-                      "index.dims()[1] should be 1 when index.dims().size() == "
-                      "2 in scatter_op.");
-  } else {
-    PADDLE_ENFORCE_EQ(index.dims().size(), 1,
-                      "index.dims().size() should be 1 or 2 in scatter_op.");
-  }
-  int index_size = index.dims()[0];
-
-  auto src_dims = src.dims();
-  framework::DDim output_dims(src_dims);
-  output_dims[0] = index_size;
-
-  // slice size
-  int slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-
-  const T* p_src = src.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-  const size_t& slice_bytes = slice_size * sizeof(T);
-
-  // set block and grid num
-  int block = 512;
-  int n = slice_size * index_size;
-  int grid = (n + block - 1) / block;
-
-  // if not overwrite mode, init data
-  if (!overwrite) {
-    ScatterInitCUDAKernel<T, IndexT><<<
-        grid, block, 0,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-        p_index, p_output, index_size, slice_size, overwrite);
-  }
-
-  ScatterCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_src, p_index, p_output, index_size, slice_size, overwrite);
-}
-
-template <typename DeviceContext, typename T, typename IndexT = int>
-void GPUScatterNdAdd(const framework::ExecutionContext& context,
-                     const Tensor& update, const Tensor& index,
-                     Tensor* output) {
-  auto index_dims = index.dims();
-  auto index_dims_size = index_dims.size();
-
-  auto output_dims = output->dims();
-  auto output_dims_size = output_dims.size();
-
-  const T* p_update = update.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  // final dim
-  int64_t end_size = index_dims[index_dims_size - 1];
-  // remain dim
-  auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = framework::product(remain_ddim);
-  // slice size
-  int64_t slice_size = 1;
-  for (int64_t i = end_size; i < output_dims_size; ++i) {
-    slice_size *= output_dims[i];
-  }
-  const size_t slice_bytes = slice_size * sizeof(T);
-  // put output_dims int CUDA
-  // gplace and cplace
-  const auto& ctx = context.template device_context<DeviceContext>();
-  const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  auto cplace = platform::CPUPlace();
-
-  std::vector<int> v_output_dims(output_dims_size);
-  for (int i = 0; i < output_dims_size; ++i) {
-    v_output_dims[i] = static_cast<int>(output_dims[i]);
-  }
-  auto& dev_ctx = context.cuda_device_context();
-  int bytes = output_dims_size * sizeof(int);
-  auto output_dims_ptr = memory::Alloc(dev_ctx, bytes);
-  int* g_output_dims = reinterpret_cast<int*>(output_dims_ptr->ptr());
-  memory::Copy(gplace, g_output_dims, cplace, v_output_dims.data(), bytes,
-               ctx.stream());
-
-  int block = 512;
-  int n = slice_size * remain_numel;
-  int grid = (n + block - 1) / block;
-
-  ScatterNdCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_update, p_index, p_output, g_output_dims, remain_numel, slice_size,
-      end_size);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h
deleted file mode 100644
index 2a88b96dd8b4b17327f5e727b4e23bf9d707efa3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cstring>
-#include <string>
-
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/platform/place.h"
-#include "unordered_set"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-/**
-  * Return the updated array pointer, use blas or eigen lib to optimize time
- * cost
- */
-template <typename T, typename IndexT = int>
-typename std::enable_if<std::is_floating_point<T>::value>::type
-elementwise_inner_add(const framework::ExecutionContext& ctx,
-                      const T* src_pointer, const T* dist_pointer,
-                      T* result_dist_pointer, const framework::Tensor& src,
-                      framework::Tensor* dist, const int& src_index,
-                      const IndexT& dist_index, const int& slice_size,
-                      const size_t& slice_bytes) {
-  auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-
-  blas.VADD(slice_size, src_pointer + src_index * slice_size,
-            dist_pointer + dist_index * slice_size,
-            result_dist_pointer + dist_index * slice_size);
-}
-
-template <typename T, typename IndexT = int>
-typename std::enable_if<!std::is_floating_point<T>::value>::type
-elementwise_inner_add(const framework::ExecutionContext& ctx,
-                      const T* src_pointer, const T* dist_pointer,
-                      T* result_dist_pointer, const framework::Tensor& src,
-                      framework::Tensor* dist, const int& src_index,
-                      const IndexT& dist_index, const int& slice_size,
-                      const size_t& slice_bytes) {
-  auto src_slice = src.Slice(src_index, src_index + 1);
-  auto dist_slice = dist->Slice(dist_index, dist_index + 1);
-
-  auto eigen_src = framework::EigenVector<T>::Flatten(src_slice);
-  auto eigen_dist = framework::EigenVector<T>::Flatten(dist_slice);
-
-  eigen_dist += eigen_src;
-}
-/**
- * Return an updated tensor from source tensor, scattered according to index:
- * dst[i] = src[index[i]]
- * input[src]: type-T source Tensor
- * input[index]: type-IndexT index Tensor (1-D)
- * return: output tensor
- */
-template <typename T, typename IndexT = int>
-void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
-                   const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true);
-  // check index of shape 1-D
-  if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(index.dims()[1], 1,
-                      "index.dims()[1] should be 1 when index.dims().size() == "
-                      "2 in scatter_op.");
-  } else {
-    PADDLE_ENFORCE_EQ(index.dims().size(), 1,
-                      "index.dims().size() should be 1 or 2 in scatter_op.");
-  }
-  int index_size = index.dims()[0];
-
-  auto src_dims = src.dims();
-  auto dst_dims = output->dims();
-
-  const T* p_src = src.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  // check src shape and dst shape should match
-  for (int i = 1; i < src_dims.size(); i++)
-    PADDLE_ENFORCE_EQ(src_dims[i], dst_dims[i]);
-
-  // slice size
-  size_t slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int i = 0; i < index_size; ++i) {
-    IndexT index_ = p_index[i];
-    memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
-  }
-}
-
-template <typename T, typename IndexT = int>
-void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src,
-                      const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.device_context().GetPlace()),
-                    true);
-  // check index of shape 1-D
-  PADDLE_ENFORCE(index.dims().size() == 1 ||
-                     (index.dims().size() == 2 && index.dims()[1] == 1),
-                 "");
-  int index_size = index.dims()[0];
-
-  auto src_dims = src.dims();
-  auto dst_dims = output->dims();
-
-  const T* p_src = src.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-
-  const T* p_output = output->data<T>();
-  T* result_p_output = output->data<T>();
-
-  // check src shape and dst shape should match
-  for (int i = 1; i < src_dims.size(); i++)
-    PADDLE_ENFORCE_EQ(src_dims[i], dst_dims[i]);
-
-  // slice size
-  size_t slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-
-  const size_t& slice_bytes = slice_size * sizeof(T);
-
-  // if not in overwrite mode, need to init output data
-  for (int i = 0; i < index_size; ++i) {
-    const IndexT& index_ = p_index[i];
-    memset(result_p_output + slice_size * index_, 0, slice_bytes);
-  }
-
-  // if not in overwrite mode, need to init output data
-  for (int i = 0; i < index_size; ++i) {
-    const IndexT& index_ = p_index[i];
-    elementwise_inner_add<T, IndexT>(ctx, p_src, p_output, result_p_output, src,
-                                     output, i, index_, slice_size,
-                                     slice_bytes);
-  }
-}
-
-template <typename T, typename IndexT = int>
-void ScatterNdAdd(const framework::ExecutionContext& ctx, const Tensor& update,
-                  const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.device_context().GetPlace()),
-                    true, "It should be running on the CPU");
-
-  // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:]
-  auto index_dims = index.dims();
-  auto index_dims_size = index_dims.size();
-
-  auto output_dims = output->dims();
-  auto output_dims_size = output_dims.size();
-
-  const T* p_update = update.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* result_p_output = output->data<T>();
-  const T* p_output = output->data<T>();
-
-  // final dim
-  int64_t end_size = index_dims[index_dims_size - 1];
-  // remain dim
-  auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = framework::product(remain_ddim);
-  // slice size
-  int64_t slice_size = 1;
-  for (int64_t i = end_size; i < output_dims_size; ++i) {
-    slice_size *= output_dims[i];
-  }
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int64_t i = 0; i < remain_numel; ++i) {
-    IndexT index_ = 0;
-    IndexT temp = 1;
-    for (int64_t j = end_size - 1; j >= 0; --j) {
-      IndexT index_value = p_index[i * end_size + j];
-      index_ += (index_value * temp);
-      temp *= output_dims[j];
-    }
-    elementwise_inner_add<T, IndexT>(ctx, p_update, p_output, result_p_output,
-                                     update, output, i, index_, slice_size,
-                                     slice_bytes);
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc
deleted file mode 100644
index 41f18eaeaf8bd894282929321a483ef5859c5895..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_nd_add_op.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/scatter_nd_add_op.h"
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-class ScatterNdAddOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of ScatterNdAddOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
-                      "Input(Index) of ScatterNdAddOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Updates"), true,
-                      "Input(Updates) of ScatterNdAddOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of ScatterNdAddOp should not be null.");
-
-    auto ref_dims = ctx->GetInputDim("X");
-    auto ref_dims_size = ref_dims.size();
-    auto index_dims = ctx->GetInputDim("Index");
-    auto index_dims_size = index_dims.size();
-    auto updates_dims = ctx->GetInputDim("Updates");
-    auto updates_dims_size = updates_dims.size();
-
-    PADDLE_ENFORCE_LE(
-        index_dims[index_dims_size - 1], ref_dims_size,
-        "Input(Index).shape[-1] should be no greater than Input(X).rank");
-    PADDLE_ENFORCE_GE(index_dims_size, 2UL,
-                      "The rank of Input(Index) should be greater than 1");
-
-    // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:]
-    std::vector<int64_t> r_updates_dims;
-    for (int64_t i = 0; i < index_dims_size - 1; ++i) {
-      r_updates_dims.emplace_back(index_dims[i]);
-    }
-    for (int64_t i = index_dims[index_dims_size - 1]; i < ref_dims_size; ++i) {
-      r_updates_dims.emplace_back(ref_dims[i]);
-    }
-
-    PADDLE_ENFORCE_EQ(r_updates_dims.size(), updates_dims_size,
-                      "Updates has wrong shape");
-
-    for (int64_t i = 0; i < updates_dims_size; ++i) {
-      PADDLE_ENFORCE_EQ(r_updates_dims[i], updates_dims[i],
-                        "Updates has wrong shape");
-    }
-    ctx->SetOutputDim("Out", ref_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->type(),
-                      ctx.Input<Tensor>("Updates")->type(),
-                      "Ref and Updates must have same type");
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ScatterNdAddGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->HasOutput(framework::GradVarName("Updates"))) {
-      ctx->SetOutputDim(framework::GradVarName("Updates"),
-                        ctx->GetInputDim("Updates"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"),
-                        ctx->GetInputDim(framework::GradVarName("Out")));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class ScatterNdAddOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The source input of scatter_nd_add op");
-    AddInput("Index",
-             "The index input of scatter_nd_add op where X will be updated");
-    AddInput("Updates", "The updated value of scatter_nd_add op");
-    AddOutput("Out", "The output of scatter_nd_add op");
-    AddComment(R"DOC(
-Scatter_nd_add Operator.
-
-Output is obtained by applying sparse addition to a single value or slice in a Variable.
-
-      Given:
-        * Case 1:
-            ref = [0, 1, 2, 3, 4, 5]
-            index = [[1], [2], [3], [1]]
-            updates = [9, 10, 11, 12]
-
-          we get:
-
-            output = [0, 22, 12, 14, 4, 5]
-
-        * Case 2:
-            ref = [[65, 17], [-14, -25]]
-            index = [[], []]
-            updates = [[[-1, -2], [1, 2]],
-                       [[3, 4], [-3, -4]]]
-            ref.shape = (2, 2)
-            index.shape = (2, 0)
-            updates.shape = (2, 2, 2)
-
-          we get:
-
-            output = [[67, 19], [-16, -27]]
-)DOC");
-  }
-};
-
-class ScatterNdAddGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("scatter_nd_add_grad");
-    op->SetInput("Index", Input("Index"));
-    op->SetInput("Updates", Input("Updates"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ScatterNdAddGradNoNeedBufferVarsInference,
-                                      "Updates");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker,
-                  ops::ScatterNdAddGradDescMaker);
-
-REGISTER_OPERATOR(scatter_nd_add_grad, ops::ScatterNdAddGradOp,
-                  ops::ScatterNdAddGradNoNeedBufferVarsInference);
-
-REGISTER_OP_CPU_KERNEL(scatter_nd_add, ops::ScatterNdAddOpKernel<float>,
-                       ops::ScatterNdAddOpKernel<double>,
-                       ops::ScatterNdAddOpKernel<int64_t>,
-                       ops::ScatterNdAddOpKernel<int>,
-                       ops::ScatterNdAddOpKernel<uint8_t>);
-
-REGISTER_OP_CPU_KERNEL(scatter_nd_add_grad,
-                       ops::ScatterNdAddGradientOpKernel<float>,
-                       ops::ScatterNdAddGradientOpKernel<double>,
-                       ops::ScatterNdAddGradientOpKernel<int64_t>,
-                       ops::ScatterNdAddGradientOpKernel<int>,
-                       ops::ScatterNdAddGradientOpKernel<uint8_t>);
diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu
deleted file mode 100644
index ecd9beb10cf9d73e014510ff8c628e5d5b6a2a73..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_nd_add_op.cu
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
-#include "paddle/fluid/operators/scatter_nd_add_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ScatterNdAddOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on GPU device.");
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Ids = ctx.Input<Tensor>("Index");
-    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *Out = ctx.Output<Tensor>("Out");
-
-    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-    const auto &index_type = Ids->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUScatterNdAdd<DeviceContext, T, int32_t>(ctx, *Updates, *Ids, Out);
-    } else {
-      GPUScatterNdAdd<DeviceContext, T, int64_t>(ctx, *Updates, *Ids, Out);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on GPU device.");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Ids = ctx.Input<Tensor>("Index");
-    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    if (dX) {
-      // In place gradient: dX = dO
-      framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
-    }
-    if (dUpdates) {
-      dUpdates->mutable_data<T>(ctx.GetPlace());
-      // Gradient by Gather
-      const auto &index_type = Ids->type();
-      if (index_type == framework::proto::VarType::INT32) {
-        GPUGatherNd<DeviceContext, T, int32_t>(ctx, *dOut, *Ids, dUpdates);
-      } else {
-        GPUGatherNd<DeviceContext, T, int64_t>(ctx, *dOut, *Ids, dUpdates);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(scatter_nd_add,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, float>,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, double>,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, int64_t>,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, int>,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(scatter_nd_add_grad,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, float>,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, double>,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, int64_t>,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, int>,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, plat::float16>);
diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h
deleted file mode 100644
index 4b90fa1cf50b003fe32148ccb65185bae71c5fa0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_nd_add_op.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class ScatterNdAddOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on CPU.");
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Ids = ctx.Input<Tensor>("Index");
-    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *Out = ctx.Output<Tensor>("Out");
-
-    // In place output: Out = X
-    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-    const auto &index_type = Ids->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-
-    if (index_type == framework::proto::VarType::INT32) {
-      ScatterNdAdd<T, int32_t>(ctx, *Updates, *Ids, Out);
-    } else {
-      ScatterNdAdd<T, int64_t>(ctx, *Updates, *Ids, Out);
-    }
-  }
-};
-
-template <typename T>
-class ScatterNdAddGradientOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on CPU.");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Ids = ctx.Input<Tensor>("Index");
-    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    if (dX) {
-      // In place gradient: dX = dO
-      framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
-    }
-    if (dUpdates) {
-      dUpdates->mutable_data<T>(ctx.GetPlace());
-      // Gradient by Gather: dUpdates = dO[Ids]
-      const auto &index_type = Ids->type();
-      if (index_type == framework::proto::VarType::INT32) {
-        CPUGatherNd<T, int32_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
-      } else {
-        CPUGatherNd<T, int64_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
deleted file mode 100644
index 4eb5b7ad9d1fe128ade904cf61e0178d59b374b8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_op.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/scatter_op.h"
-#include <memory>
-#include "paddle/fluid/framework/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-class ScatterOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Ids"),
-                   "Input(Ids) of ScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Updates"),
-                   "Input(Updates) of ScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ScatterOp should not be null.");
-
-    auto updates_dims = ctx->GetInputDim("Updates");
-    auto ref_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Ids").size(), 1,
-                      "Update Ids should be 1-D.");
-    PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
-                      "Xerence and Updates should have the same shape size");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
-                      ctx->GetInputDim("Ids")[0],
-                      "Updates and Ids should have same batch-size.");
-    ctx->SetOutputDim("Out", ref_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ScatterGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->HasOutput(framework::GradVarName("Updates"))) {
-      ctx->SetOutputDim(framework::GradVarName("Updates"),
-                        ctx->GetInputDim("Updates"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"),
-                        ctx->GetInputDim(framework::GradVarName("Out")));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The source input of scatter op");
-    AddInput("Ids", "The index input of scatter op where X will be updated");
-    AddInput("Updates", "The updated value of scatter op");
-    AddOutput("Out", "The output of scatter op");
-    AddAttr<bool>("overwrite",
-                  "(bool, defalut: True) "
-                  "The mode that updating the output when has same index,"
-                  "If True, use the overwrite mode to update the output"
-                  "of the same index, if False, use the accumulate mode to"
-                  "update the output of the same index,Default value is True."
-                  "You can set overwrite=False to implement scatter_add.")
-        .SetDefault(true);
-    AddComment(R"DOC(
-Scatter Operator.
-
-This operator obtains output by updating the input on selected indices on the first axis:
-
-$$
-Out = X \\
-Out[Ids] = Updates
-$$
-
-)DOC");
-  }
-};
-
-class ScatterGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("scatter_grad");
-    op->SetInput("Ids", Input("Ids"));
-    op->SetInput("Updates", Input("Updates"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ScatterGradNoNeedBufferVarsInference,
-                                      "Updates");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
-                  ops::ScatterGradDescMaker);
-REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp,
-                  ops::ScatterGradNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
-REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
deleted file mode 100644
index 6c4da760ce828e49b55c5d488958e1039fe62702..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_op.cu
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
-#include "paddle/fluid/operators/scatter_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class ScatterOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Ids = ctx.Input<Tensor>("Ids");
-    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *Out = ctx.Output<Tensor>("Out");
-    bool overwrite = ctx.Attr<bool>("overwrite");
-
-    Out->ShareDataWith(*X);
-    // use template class to support int32_t and int64_t
-    const auto &index_type = Ids->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "scatter_op Index holds the wrong type, it holds %s, but desires to be "
-        "%s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUScatterAssign<T, int32_t>(ctx, *Updates, *Ids, Out, overwrite);
-    } else {
-      GPUScatterAssign<T, int64_t>(ctx, *Updates, *Ids, Out, overwrite);
-    }
-  }
-};
-
-template <typename T>
-class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Ids = ctx.Input<Tensor>("Ids");
-    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    if (dX) {
-      // In place gradient: dX = dO
-      framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
-    }
-    if (dUpdates) {
-      dUpdates->mutable_data<T>(ctx.GetPlace());
-      // Gradient by Gather: dUpdates = dO[Ids]
-      const auto &index_type = Ids->type();
-      bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                              index_type == framework::proto::VarType::INT64;
-      PADDLE_ENFORCE_EQ(
-          index_type_match, true,
-          "scatter_op Index holds the wrong type, it holds %s, but desires to "
-          "be %s or %s",
-          paddle::framework::DataTypeToString(index_type),
-          paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-          paddle::framework::DataTypeToString(
-              framework::proto::VarType::INT64));
-      // Gradient by Gather: dUpdates = dO[Ids]
-      if (index_type == framework::proto::VarType::INT32) {
-        GPUGather<T, int32_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
-      } else {
-        GPUGather<T, int64_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>);
-REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel<float>);
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
deleted file mode 100644
index 97254f817d9856aca9ffe1a101551b902541d9cf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_op.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class ScatterOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "This kernel only runs on CPU.");
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Ids = ctx.Input<Tensor>("Ids");
-    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *Out = ctx.Output<Tensor>("Out");
-    double overwrite = ctx.Attr<bool>("overwrite");
-
-    // In place output: Out = X, Out[Ids] = Updates
-    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-    // Apply ScatterUpdate: Out[index] = Updates[:]
-    const auto &index_type = Ids->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE(
-        index_type_match,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (overwrite) {
-      if (index_type == framework::proto::VarType::INT32) {
-        ScatterAssign<T, int32_t>(ctx.device_context(), *Updates, *Ids, Out);
-      } else {
-        ScatterAssign<T, int64_t>(ctx.device_context(), *Updates, *Ids, Out);
-      }
-    } else {
-      if (index_type == framework::proto::VarType::INT32) {
-        ScatterAssignAdd<T, int32_t>(ctx, *Updates, *Ids, Out);
-      } else {
-        ScatterAssignAdd<T, int64_t>(ctx, *Updates, *Ids, Out);
-      }
-    }
-  }
-};
-
-template <typename T>
-class ScatterGradientOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "This kernel only runs on CPU.");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Ids = ctx.Input<Tensor>("Ids");
-    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    if (dX) {
-      // In place gradient: dX = dO
-      framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
-    }
-    if (dUpdates) {
-      dUpdates->mutable_data<T>(ctx.GetPlace());
-      // Gradient by Gather: dUpdates = dO[Ids]
-      const auto &index_type = Ids->type();
-      bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                              index_type == framework::proto::VarType::INT64;
-      PADDLE_ENFORCE_EQ(
-          index_type_match, true,
-          "scatter_op index holds the wrong type, it holds %s, but desires to "
-          "be %s or %s",
-          paddle::framework::DataTypeToString(index_type),
-          paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-          paddle::framework::DataTypeToString(
-              framework::proto::VarType::INT64));
-      if (index_type == framework::proto::VarType::INT32) {
-        CPUGather<T, int32_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
-      } else {
-        CPUGather<T, int64_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc
deleted file mode 100644
index eb248e59b6ce6e5c9c04f94b21e4bc14207c39b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_test.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/scatter.h"
-#include <gtest/gtest.h>
-#include <iostream>
-#include <string>
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
-
-TEST(scatter, ScatterUpdate) {
-  paddle::framework::Tensor src;
-  paddle::framework::Tensor index;
-  paddle::framework::Tensor output;
-
-  auto* p_src = src.mutable_data<float>(paddle::framework::make_ddim({1, 4}),
-                                        paddle::platform::CPUPlace());
-  auto* p_index = index.mutable_data<int>(paddle::framework::make_ddim({1}),
-                                          paddle::platform::CPUPlace());
-
-  for (size_t i = 0; i < 4; ++i) {
-    p_src[i] = static_cast<float>(i);
-  }
-  p_index[0] = 1;
-
-  auto* p_output = output.mutable_data<float>(
-      paddle::framework::make_ddim({4, 4}), paddle::platform::CPUPlace());
-
-  for (int64_t i = 0; i < output.numel(); ++i) {
-    p_output[i] = 0;
-  }
-
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  paddle::operators::ScatterAssign<float>(ctx, src, index, &output);
-
-  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f);
-  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data<float>()[i], 0.0f);
-  for (size_t i = 4; i < 8; ++i) {
-    EXPECT_EQ(p_output[i], static_cast<float>(i - 4));
-  }
-  for (size_t i = 4; i < 8; ++i)
-    EXPECT_EQ(output.data<float>()[i], static_cast<float>(i - 4));
-  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f);
-  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output.data<float>()[i], 0.0f);
-}
diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h
deleted file mode 100644
index c795f1e390b8a38407b856321eeda75c9ff57895..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/search_compute.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <immintrin.h>
-#include <cfloat>
-#include <cmath>
-#include <cstring>
-
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/dynload/mklml.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-template <typename DeviceContext, typename T>
-void call_gemm(const math::BlasT<DeviceContext, T>& blas,
-               const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
-               const int M, const int N, const int K, const T alpha, const T* A,
-               const T* B, const T beta, T* C) {
-  int lda = (TransA == CblasNoTrans) ? K : M;
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-  blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
-}
-
-template <typename T>
-void call_gemm(const framework::ExecutionContext& ctx,
-               const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
-               const int M, const int N, const int K, const T alpha, const T* A,
-               const T* B, const T beta, T* C) {
-  int lda = (TransA == CblasNoTrans) ? K : M;
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-  auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-  blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
-}
-
-template <typename DeviceContext, typename T>
-void call_gemm_with_lda(const math::BlasT<DeviceContext, T>& blas,
-                        const CBLAS_TRANSPOSE TransA,
-                        const CBLAS_TRANSPOSE TransB, const int M, const int N,
-                        const int K, const T alpha, const T* A, const T* B,
-                        const T beta, T* C, int lda) {
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-
-  blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
-}
-
-template <typename T>
-void call_gemm_batched(const framework::ExecutionContext& ctx,
-                       const CBLAS_TRANSPOSE TransA,
-                       const CBLAS_TRANSPOSE TransB, const int M, const int N,
-                       const int K, const T alpha, const T** A, const T** B,
-                       const T beta, T** C, const int batch) {
-  for (int i = 0; i < batch; ++i) {
-    call_gemm(ctx, TransA, TransB, M, N, K, alpha, A[i], B[i], beta, C[i]);
-  }
-}
-
-#ifndef TYPE_USE_FLOAT
-#define TYPE_USE_FLOAT
-#endif
-#ifndef USE_SSE
-#define USE_SSE
-#endif
-
-#if defined(TYPE_USE_FLOAT)
-
-#define __m256x __m256
-#define __m128x __m128
-
-static const unsigned int AVX_STEP_SIZE = 8;
-static const unsigned int SSE_STEP_SIZE = 4;
-static const unsigned int AVX_CUT_LEN_MASK = 7U;
-static const unsigned int SSE_CUT_LEN_MASK = 3U;
-
-#define _mm256_mul_px _mm256_mul_ps
-#define _mm256_add_px _mm256_add_ps
-#define _mm256_load_px _mm256_loadu_ps
-#define _mm256_store_px _mm256_storeu_ps
-#define _mm256_broadcast_sx _mm256_broadcast_ss
-
-#define _mm_add_px _mm_add_ps
-#define _mm_mul_px _mm_mul_ps
-#define _mm_load_px _mm_loadu_ps
-#define _mm_store_px _mm_storeu_ps
-#define _mm_load1_px _mm_load1_ps
-
-#endif
-
-template <typename T>
-inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) {
-  unsigned int jjj, lll;
-  jjj = lll = 0;
-
-#if defined(USE_AVX)
-  lll = len & ~AVX_CUT_LEN_MASK;
-  __m256x mm_alpha = _mm256_broadcast_sx(&alpha);
-  for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
-    _mm256_store_px(
-        y + jjj,
-        _mm256_add_px(_mm256_load_px(y + jjj),
-                      _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))));
-  }
-
-#elif defined(USE_SSE)
-  lll = len & ~SSE_CUT_LEN_MASK;
-  __m128x mm_alpha = _mm_load1_px(&alpha);
-  for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) {
-    _mm_store_px(y + jjj,
-                 _mm_add_px(_mm_load_px(y + jjj),
-                            _mm_mul_px(mm_alpha, _mm_load_px(x + jjj))));
-  }
-
-#endif
-  for (; jjj < len; jjj++) {
-    y[jjj] += alpha * x[jjj];
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc
deleted file mode 100644
index 67fca18000a4fac1e2ca39fc26ebe67649a51bc3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/selu_op.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/selu_op.h"
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class SeluOp : public framework::OperatorWithKernel {
- public:
-  SeluOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SeluOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SeluOp should not be null.");
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.GetPlace());
-  }
-};
-
-class SeluOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
-  }
-};
-
-class SeluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input tensor of selu operator.");
-    AddOutput("Out", "The output tensor of selu operator.");
-    AddAttr<float>("scale",
-                   "(float) the default value is 1.0507~. For more "
-                   "information about this value, please refer to:"
-                   "https://arxiv.org/abs/1706.02515.")
-        .SetDefault(1.0507009873554804934193349852946);
-    AddAttr<float>("alpha",
-                   "(float) the default value is 1.6732~. For more "
-                   "information about this value, please refer to:"
-                   "https://arxiv.org/abs/1706.02515.")
-        .SetDefault(1.6732632423543772848170429916717);
-    AddComment(R"DOC(
-Selu Operator.
-
-The equation is:
-$$
-f(x) =\lambda*
-\begin{cases}
- \quad \quad   x,  \quad \quad \quad \text{if} \ x > 0 \\
- \alpha * e^x - \alpha,  \qquad  \text{if} \ x <= 0
-\end{cases}
-$$
-
-The input `X` can carry the LoD (Level of Details) information,
-or not. And the output shares the LoD information with input `X`.
-)DOC");
-  }
-};
-
-class SeluGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("selu_grad");
-    grad_op->SetInput("Out", Output("Out"));
-    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    grad_op->SetAttrMap(this->Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-class SeluGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null");
-    auto x_grad_name = framework::GradVarName("X");
-    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("Out"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::GetDataTypeOfVar(ctx.InputVar("Out")), ctx.GetPlace());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType,
-                  ops::SeluGradMaker);
-REGISTER_OPERATOR(selu_grad, ops::SeluGradOp);
-REGISTER_OP_CPU_KERNEL(
-    selu, ops::SeluKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SeluKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    selu_grad, ops::SeluGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SeluGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/selu_op.cu b/paddle/fluid/operators/selu_op.cu
deleted file mode 100644
index fb3245ab7609ea9067709134a3713e9871dbb4d4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/selu_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/selu_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    selu, ops::SeluKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SeluKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    selu_grad, ops::SeluGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SeluGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/selu_op.h b/paddle/fluid/operators/selu_op.h
deleted file mode 100644
index b2fc834c42f65ff3521b6267ed2f32fabbab4e4d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/selu_op.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct SeluFunctor {
-  SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr)
-      : x_data_ptr_(x_data_ptr),
-        alpha_(alpha),
-        scale_(scale),
-        y_data_ptr_(y_data_ptr) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    T x_ele = x_data_ptr_[idx];
-    if (x_ele <= 0) {
-      x_ele = alpha_ * real_exp(x_ele) - alpha_;
-    }
-    y_data_ptr_[idx] = scale_ * x_ele;
-  }
-  const T* x_data_ptr_;
-  const float alpha_;
-  const float scale_;
-  T* y_data_ptr_;
-};
-
-template <typename T>
-struct SeluGradFunctor {
-  SeluGradFunctor(const T* y_data_ptr, const T* dy_data_ptr, float alpha,
-                  float scale, T* dx_data_ptr)
-      : y_data_ptr_(y_data_ptr),
-        dy_data_ptr_(dy_data_ptr),
-        alpha_(alpha),
-        scale_(scale),
-        la_(alpha * scale),
-        dx_data_ptr_(dx_data_ptr) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    T y_ele = y_data_ptr_[idx];
-    T dy_ele = dy_data_ptr_[idx];
-
-    float tmp = scale_;
-    if (y_ele <= 0) {
-      tmp = y_ele + la_;
-    }
-    dx_data_ptr_[idx] = dy_ele * tmp;
-  }
-  const T* y_data_ptr_;
-  const T* dy_data_ptr_;
-  const float alpha_;
-  const float scale_;
-  const float la_;
-  T* dx_data_ptr_;
-};
-
-template <typename DeviceContext, typename T>
-class SeluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using Tensor = framework::Tensor;
-
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
-
-    float alpha = context.Attr<float>("alpha");
-    float scale = context.Attr<float>("scale");
-
-    auto out_ptr = out->mutable_data<T>(context.GetPlace());
-
-    SeluFunctor<T> functor(x->data<T>(), alpha, scale, out_ptr);
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    size_t limit = static_cast<size_t>(x->numel());
-    platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
-    for_range(functor);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SeluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using Tensor = framework::Tensor;
-
-    auto* out = context.Input<Tensor>("Out");
-    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-
-    float alpha = context.Attr<float>("alpha");
-    float scale = context.Attr<float>("scale");
-
-    auto dx_ptr = dx->mutable_data<T>(context.GetPlace());
-
-    SeluGradFunctor<T> functor(out->data<T>(), dout->data<T>(), alpha, scale,
-                               dx_ptr);
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    size_t limit = static_cast<size_t>(out->numel());
-    platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
-    for_range(functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/CMakeLists.txt b/paddle/fluid/operators/sequence_ops/CMakeLists.txt
deleted file mode 100644
index 5d468316e8eacb73c4a4ce81c784880bb5e46c2d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-include(operators)
-register_operators()
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
deleted file mode 100644
index d652f9216f8faf53deeac2c9ce1f737651c3939b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h"
-#include <memory>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class SeqConcatOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The inputs of sequence concat op").AsDuplicable();
-    AddOutput("Out", "The output of sequence concat op");
-    AddComment(
-        "Sequence Concat Op\n"
-        "It will concat LoD tensors by its sequence information.\n"
-        "For example:\n"
-        "  LoD of X1 = [0, 3, 7]\n"
-        "  LoD of X2 = [0, 7, 9]\n"
-        "  Result LoD is [0, (3+7), (7+9)]\n"
-        "            i.e.[0, 10, 16]\n");
-  }
-};
-
-class SeqConcatShapeInferer : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInputs("X"),
-                   "Input(X) of Sequence Concat Op should not be null.");
-    PADDLE_ENFORCE(context->HasOutput("Out"),
-                   "Output(Out) of Sequence Concat Op should not be null.");
-
-    PADDLE_ENFORCE_GT(context->Inputs("X").size(), 1,
-                      "The number of input sequences is at least two.");
-    auto x_dims = context->GetInputsDim("X");
-    int64_t batch_size = 0;
-    int64_t feature_size = 0;
-    std::vector<int64_t> out_dims;
-    for (auto &x_dim : x_dims) {
-      if (out_dims.empty()) {
-        out_dims = framework::vectorize(x_dim);
-      }
-      batch_size += x_dim[0];
-      if (feature_size == 0) {
-        feature_size = framework::product(x_dim) / x_dim[0];
-      } else {
-        PADDLE_ENFORCE_EQ(
-            feature_size, framework::product(x_dim) / x_dim[0],
-            "Inputs of sequence concat must have same feature size");
-      }
-    }
-    if (batch_size < 0) {
-      batch_size = -1;  // Normalize batch size for compile time.
-    }
-    out_dims[0] = batch_size;
-    context->SetOutputDim("Out", framework::make_ddim(out_dims));
-    if (!context->IsRuntime()) {  // Runtime LoD infershape will be computed
-      // in Kernel.
-      context->ShareLoD("X", "Out");
-    }
-  }
-};
-
-class SeqConcatGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sequence_concat_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class SeqConcatGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *context) const override {
-    context->SetOutputsDim(framework::GradVarName("X"),
-                           context->GetInputsDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SeqConcatGradNoNeedBufferVarsInference,
-                                      "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace op = paddle::operators;
-
-REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel,
-                  op::SeqConcatOpMaker, op::SeqConcatShapeInferer,
-                  op::SeqConcatGradOpDescMaker);
-template <typename T>
-using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
-                       Kernel<int64_t>);
-
-REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp,
-                  op::SeqConcatGradNoNeedBufferVarsInference);
-template <typename T>
-using GradKernel =
-    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>,
-                       GradKernel<double>, GradKernel<int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
deleted file mode 100644
index 7b8043bc4538b486bb73e005769e1585e5c4817e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h"
-
-template <typename T>
-using Kernel =
-    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext, T>;
-REGISTER_OP_CUDA_KERNEL(sequence_concat, Kernel<float>, Kernel<double>);
-template <typename T>
-using GradKernel =
-    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
-                                           T>;
-REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, GradKernel<float>,
-                        GradKernel<double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h
deleted file mode 100644
index dd31f9f17265a0a3df1f4a4e1d84378fd0889206..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <utility>
-#include <vector>
-#include "boost/optional.hpp"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-
-namespace paddle {
-namespace operators {
-
-namespace detail {
-template <typename Container>
-inline framework::LoD ConcatLoD(const Container &xs,
-                                std::vector<framework::Tensor> *xs_in_order) {
-  std::vector<size_t> result;
-  result.resize(xs[0].get().lod()[0].size());
-
-  for (size_t i = 1; i < result.size(); ++i) {
-    size_t sum = 0;
-    for (size_t j = 0; j < xs.size(); ++j) {
-      auto &x_lod = xs[j].get().lod()[0];
-      const framework::Tensor &tensor = xs[j].get();
-      if (x_lod[i - 1] < x_lod[i]) {
-        xs_in_order->emplace_back(tensor.Slice(x_lod[i - 1], x_lod[i]));
-      }
-      sum += x_lod[i];
-    }
-    result[i] = sum;
-  }
-  framework::LoD lod;
-  lod.emplace_back(result);
-  return lod;
-}
-}  // namespace detail
-
-template <typename DeviceContext, typename T>
-class SeqConcatKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto xs = detail::VectorRef(context.MultiInput<framework::LoDTensor>("X"),
-                                "Cannot find multiple input X");
-    auto &out = detail::Ref(context.Output<framework::LoDTensor>("Out"),
-                            "Cannot find output");
-
-    size_t lod_size = 0;
-    for (auto &x : xs) {
-      if (lod_size == 0) {
-        lod_size = x.get().lod()[0].size();
-      } else {
-        PADDLE_ENFORCE_EQ(
-            lod_size, x.get().lod()[0].size(),
-            "The number of sequence must be same between each input");
-      }
-    }
-    PADDLE_ENFORCE_NE(lod_size, 0, "Each input must have sequence information");
-
-    std::vector<framework::Tensor> x_in_order;
-    out.set_lod(detail::ConcatLoD(xs, &x_in_order));
-    out.mutable_data<T>(context.GetPlace());
-    math::ConcatFunctor<DeviceContext, T> functor;
-    functor(context.template device_context<DeviceContext>(), x_in_order, 0,
-            &out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SeqConcatGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto xs = context.MultiInput<framework::LoDTensor>("X");
-    auto dxs =
-        context.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
-    PADDLE_ENFORCE_EQ(xs.size(), dxs.size());
-    for (size_t i = 0; i < dxs.size(); ++i) {
-      if (dxs[i] != nullptr) {
-        dxs[i]->set_lod(xs[i]->lod());
-        dxs[i]->mutable_data<T>(context.GetPlace());
-      }
-    }
-
-    std::vector<framework::Tensor> sliced_x;
-    std::vector<boost::optional<framework::Tensor>> sliced_dx;
-
-    for (size_t i = 1; i < xs[0]->lod()[0].size(); ++i) {
-      for (size_t j = 0; j < xs.size(); ++j) {
-        const framework::LoDTensor *x = xs[j];
-        framework::DDim x_dims = x->dims();
-
-        framework::LoDTensor *dx = dxs[j];
-        auto &x_lod = x->lod()[0];
-        if (x_lod[i - 1] == x_lod[i]) continue;
-
-        auto prev_lod = x_lod[i - 1];
-        auto next_lod = x_lod[i];
-
-        x_dims[0] = next_lod - prev_lod;
-
-        sliced_x.emplace_back();
-        sliced_x.back().Resize(x_dims);
-
-        if (dx) {
-          sliced_dx.emplace_back(dx->Slice(prev_lod, next_lod));
-        } else {
-          sliced_dx.emplace_back(boost::none);
-        }
-      }
-    }
-
-    std::vector<const framework::Tensor *> sliced_x_ptr;
-    sliced_x_ptr.reserve(sliced_x.size());
-    for (auto &x : sliced_x) {
-      sliced_x_ptr.emplace_back(&x);
-    }
-
-    std::vector<framework::Tensor *> sliced_dx_ptr;
-    sliced_dx_ptr.reserve(sliced_dx.size());
-    for (auto &dx : sliced_dx) {
-      if (dx) {
-        sliced_dx_ptr.emplace_back(&dx.get());
-      }
-    }
-
-    math::SplitFunctor<DeviceContext, T> functor;
-    functor(context.template device_context<DeviceContext>(),
-            detail::Ref(
-                context.Input<framework::Tensor>(framework::GradVarName("Out")),
-                "Sequence Concat OG must be set"),
-            sliced_x_ptr, 0, &sliced_dx_ptr);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc
deleted file mode 100644
index 89c1fe834832802cc86dacd5a2d8c22bafa6072b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc
+++ /dev/null
@@ -1,236 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h"
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_set>
-
-namespace paddle {
-namespace operators {
-
-class SequenceConvOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceConvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                   "Input(Filter) of SequenceConvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceConvOp should not be null.");
-
-    int context_length = ctx->Attrs().Get<int>("contextLength");
-    int context_start = ctx->Attrs().Get<int>("contextStart");
-
-    auto in_dims = ctx->GetInputDim("X");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    PADDLE_ENFORCE(ctx->Attrs().Get<int>("contextStride") == 1,
-                   "Currently, SequenceConvOp only supports contextStride=1.");
-    PADDLE_ENFORCE(in_dims.size() == 2 && filter_dims.size() == 2,
-                   "Input(X, Filter) should be 2-D tensor.");
-    PADDLE_ENFORCE(filter_dims[0] == context_length * in_dims[1],
-                   "Filter's height should be context_length * "
-                   "input_hidden_size .");
-
-    if (ctx->Attrs().Get<bool>("paddingTrainable")) {
-      PADDLE_ENFORCE(
-          ctx->HasInput("PaddingData"),
-          "Input(PaddingData) of SequenceConvOp should not be null.");
-      framework::DDim padding_dim = ctx->GetInputDim("PaddingData");
-      int up_pad = std::max(0, -context_start);
-      int down_pad = std::max(0, context_start + context_length - 1);
-      int total_pad = up_pad + down_pad;
-      int input_width = static_cast<int>(in_dims[1]);
-
-      if (context_start == 0 && context_length == 1) {
-        PADDLE_THROW(
-            "If context_start is 0 and context_length is 1, paddingTrainable "
-            "should be false.");
-      }
-      PADDLE_ENFORCE(padding_dim.size() == 2,
-                     "Input(PaddingData) should be 2-D tensor.");
-      PADDLE_ENFORCE(
-          padding_dim[0] == total_pad && padding_dim[1] == input_width,
-          "Input(PaddingData)'s shape is not consistent with 'context_start' "
-          "and 'context_length'.");
-    }
-
-    in_dims[1] = filter_dims[1];
-    ctx->SetOutputDim("Out", in_dims);
-    ctx->ShareLoD("X", "Out");
-  }
-};
-
-class SequenceConvGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Gradient of output(Out) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("X"), "The input(X) should not be null.");
-
-    if (ctx->Attrs().Get<bool>("paddingTrainable") &&
-        ctx->HasOutput(framework::GradVarName("PaddingData"))) {
-      ctx->SetOutputDim(framework::GradVarName("PaddingData"),
-                        ctx->GetInputDim("PaddingData"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
-      ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-      ctx->SetOutputDim(framework::GradVarName("Filter"),
-                        ctx->GetInputDim("Filter"));
-    }
-  }
-};
-
-class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(LoDTensor) the input(X) is a LodTensor, which supports "
-        "variable-time length input sequence. The underlying tensor in "
-        "this LoDTensor is a matrix with shape (T, N), where T is the "
-        "total time steps in this mini-batch and N is the input_hidden_size.");
-    AddInput("PaddingData",
-             "(Tensor, optional) the input(PaddingData) is an optional "
-             "parameter, and it is learnable. "
-             "This is a tensor with shape (P, N), where P is the "
-             "top_pad + bottom_pad, N is the input_hidden_size. In order to "
-             "ensure the equal length of sequence before and after "
-             "convolution, it is necessary to fill the top and bottom of each "
-             "sequence according to context_length, context_stride and "
-             "context_start")
-        .AsDispensable();
-    AddInput(
-        "Filter",
-        "(Tensor) the input(Filter) is an learnable parameter."
-        "This is a tensor with shape (K, M), where K is the "
-        "context_length * input_hidden_size, M is the output feature size.");
-    AddOutput(
-        "Out",
-        "(LoDTensor) the output(Out) is a LodTensor, which support "
-        "variable-time length output sequence. The underlying tensor in "
-        "this LoDTensor is a matrix with shape (T, M), where, T is the "
-        "total time steps in this mini-batch, M is the output feature size.");
-
-    AddAttr<bool>("paddingTrainable",
-                  "(bool, default:false) the padding data of SequenceConvOp "
-                  "is trainable or not.")
-        .SetDefault(false);
-    AddAttr<int>("contextLength",
-                 "(int) the contextLength of SequenceConvOp is the "
-                 "height of the convolution kernel.")
-        .GreaterThan(0);
-    AddAttr<int>("contextStart",
-                 "(int, default:0) the contextStart of SequenceConvOp "
-                 "represents the beginning of the convolution of the number of "
-                 "rows of sequence, which can be negative. The negative number "
-                 "means to pad contextStart time-steps of zeros or learnable "
-                 "parameters at the beginning of each instance. The positive "
-                 "number means to skip contextStart time-steps of each "
-                 "instance.")
-        .SetDefault(0);
-    AddAttr<int>("contextStride",
-                 "(int, default:1) the contextStride of SequenceConvOp "
-                 "represents the stride length of convolution kernel. "
-                 "Currently, SequenceConvOp only supports"
-                 "contextStride=1.")
-        .SetDefault(1)
-        .GreaterThan(0);
-
-    AddComment(R"DOC(
-Sequence Conv Operator.
-
-SequenceConvOp performs convolution operation on features of contextLength
-time-steps of each instance. The convolution operation calculates the output
-based on the input, filter, strides and paddings parameters.
-The size of each dimension of the parameters is checked during infer-shape.
-In order to ensure the equal length of sequence before and after convolution,
-it is necessary to fill the top and bottom of each sequence based on
-context_length, context_stride and context_start.
-
-    )DOC");
-  }
-};
-
-class SequenceConvGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sequence_conv_grad");
-    op->SetAttrMap(Attrs());
-
-    if (boost::get<bool>(Attrs().at("paddingTrainable")) &&
-        ForwardOp().Inputs().count("PaddingData") > 0) {
-      op->SetInput("PaddingData", Input("PaddingData"));
-      op->SetOutput(framework::GradVarName("PaddingData"),
-                    InputGrad("PaddingData"));
-    }
-
-    op->SetInput("X", Input("X"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
-
-    return op;
-  }
-};
-
-class SequenceConvGradNoNeedBufferVarsInference
-    : public framework::NoNeedBufferVarsInference {
- public:
-  using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference;
-
-  std::unordered_set<std::string> operator()() const override {
-    if (!boost::get<bool>(Attrs().at("paddingTrainable"))) {
-      return {"PaddingData"};
-    } else {
-      return {};
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
-                  ops::SequenceConvGradOpDescMaker);
-
-REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp,
-                  ops::SequenceConvGradNoNeedBufferVarsInference);
-
-REGISTER_OP_CPU_KERNEL(
-    sequence_conv,
-    ops::SequenceConvKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceConvKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceConvGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc
deleted file mode 100644
index 600981b5e96c279329a67b608a8dd94dee7d88ef..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sequence_conv,
-    ops::SequenceConvKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceConvKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
deleted file mode 100644
index 3a2c9e3f734c8c302e3e4b1c6718b3a236fe897a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/context_project.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-class SequenceConvKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    auto filter = *context.Input<Tensor>("Filter");
-
-    out->mutable_data<T>(context.GetPlace());
-
-    int context_start = context.Attr<int>("contextStart");
-    int context_length = context.Attr<int>("contextLength");
-    int context_stride = context.Attr<int>("contextStride");
-    bool padding_trainable = context.Attr<bool>("paddingTrainable");
-
-    PADDLE_ENFORCE_EQ(in->lod().size(), 1UL,
-                      "Only support one level sequence now.");
-
-    const Tensor* padding_data = nullptr;
-    if (padding_trainable) {
-      padding_data = context.Input<Tensor>("PaddingData");
-    }
-
-    int up_pad = std::max(0, -context_start);
-    int down_pad = std::max(0, context_start + context_length - 1);
-    auto sequence_width = static_cast<int64_t>(in->dims()[1]);
-
-    framework::DDim col_shape = {in->dims()[0],
-                                 context_length * sequence_width};
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // Because if padding_trainable is false, padding data should be zeros.
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    set_zero(dev_ctx, &col, static_cast<T>(0));
-    math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
-
-    seq_project_functor(dev_ctx, *in, padding_data, padding_trainable,
-                        context_start, context_length, context_stride, up_pad,
-                        down_pad, &col);
-
-    blas.MatMul(col, filter, out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceConvGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* filter_g = context.Output<Tensor>(framework::GradVarName("Filter"));
-    auto* padding_data_g =
-        context.Output<Tensor>(framework::GradVarName("PaddingData"));
-    auto* in = context.Input<LoDTensor>("X");
-    auto* filter = context.Input<Tensor>("Filter");
-
-    int context_start = context.Attr<int>("contextStart");
-    int context_length = context.Attr<int>("contextLength");
-    int context_stride = context.Attr<int>("contextStride");
-    bool padding_trainable = context.Attr<bool>("paddingTrainable");
-
-    PADDLE_ENFORCE_EQ(in->lod().size(), 1UL,
-                      "Only support one level sequence now.");
-    auto lod_g_level_0 = in->lod()[0];
-
-    int up_pad = std::max(0, -context_start);
-    int down_pad = std::max(0, context_start + context_length - 1);
-    auto sequence_width = static_cast<int64_t>(in->dims()[1]);
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    // use col_shape in the im2col calculation
-    framework::DDim col_shape = {in->dims()[0],
-                                 sequence_width * context_length};
-    Tensor col;
-
-    if (in_g || filter_g || (padding_trainable && padding_data_g)) {
-      col.mutable_data<T>(col_shape, context.GetPlace());
-      // Because if padding_trainable is false, padding data should be zeros.
-      set_zero(dev_ctx, &col, static_cast<T>(0));
-      blas.MatMul(*out_g, false, *filter, true, &col);
-    }
-    math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
-    math::ContextProjectGradFunctor<DeviceContext, T> seq_project_grad_functor;
-
-    if (in_g) {
-      in_g->mutable_data<T>(context.GetPlace());
-      in_g->set_lod(in->lod());
-      set_zero(dev_ctx, in_g, static_cast<T>(0));
-
-      seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start,
-                               context_length, context_stride, up_pad, down_pad,
-                               false, true, padding_data_g, &col);
-    }
-
-    if (padding_trainable && padding_data_g) {
-      padding_data_g->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, padding_data_g, static_cast<T>(0));
-
-      LoDTensor* input = const_cast<LoDTensor*>(in);
-      seq_project_grad_functor(
-          dev_ctx, *input, padding_trainable, context_start, context_length,
-          context_stride, up_pad, down_pad, true, false, padding_data_g, &col);
-    }
-
-    if (filter_g) {
-      filter_g->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, filter_g, static_cast<T>(0));
-
-      Tensor filter_grad = *filter_g;
-      LoDTensor out_grad = *out_g;
-
-      const Tensor* padding_data = nullptr;
-      if (padding_trainable) {
-        padding_data = context.Input<Tensor>("PaddingData");
-      }
-
-      seq_project_functor(dev_ctx, *in, padding_data, padding_trainable,
-                          context_start, context_length, context_stride, up_pad,
-                          down_pad, &col);
-
-      blas.MatMul(col, true, out_grad, false, &filter_grad);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
deleted file mode 100644
index cc4eedbf4de2272caac75eb1e5a1d51feaf8cb38..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h"
-
-namespace paddle {
-namespace operators {
-
-class SequenceEnumerateOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("X"),
-        "Input(X) of SequecceEnumerate operator should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(X) of SequenceEnumerate operator should not be null.");
-
-    const auto x_dims = ctx->GetInputDim("X");
-    const auto win_size = ctx->Attrs().Get<int>("win_size");
-    ctx->SetOutputDim("Out", {x_dims[0], win_size});
-    ctx->ShareLoD("X", "Out");
-  }
-};
-
-class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(2-D LoDTensor with the 2nd dimension equal to 1) "
-             "Input LoDTensor of SequenceEnumerate operator.");
-    AddOutput("Out",
-              "(2-D LoDTensor with the 2nd dimension equal to win_size) "
-              "Output LoDTensor of SequenceEnumerate operator.");
-    AddAttr<int>("win_size", "(int) The enumerate sequence window size.")
-        .AddCustomChecker([](const int& win_size) {
-          PADDLE_ENFORCE(win_size >= 2,
-                         "The window size should be not less than 2.");
-        });
-    AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
-        .SetDefault(0);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
-                  "Skip calling InferShape() function in the runtime.")
-        .SetDefault(true);
-    AddComment(R"DOC(
-Sequence Enumerate Operator.
-
-Generate a new sequence for the input index sequence, which enumerates all the
-sub-sequences with length `win_size` of the input. 
-The enumerated sequence has the same 1st dimension with variable `input`, and
-the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
-    
-Examples:
-Case 1:
-  Input:
-    X.lod = [[0, 3, 5]]
-    X.data = [[1], [2], [3], [4], [5]]
-    X.dims = [5, 1]
-  Attrs:
-    win_size = 2
-    pad_value = 0
-  Output:
-    Out.lod = [[0, 3, 5]]
-    Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
-    Out.dims = [5, 2]
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, ops::SequenceEnumerateOp,
-                             ops::SequenceEnumerateOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    sequence_enumerate,
-    ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int32_t>,
-    ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
deleted file mode 100644
index d5deb7582c7c00f3102ea568a716b715611212ce..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
+++ /dev/null
@@ -1,86 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
-#include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-__global__ void CalcOutPut(const T* in_data, const size_t* in_lod,
-                           const size_t lod_len, const int64_t win_size,
-                           const int64_t pad_value, T* out_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < in_lod[lod_len - 1]) {
-    int end_idx = 0;
-    // Get LoD interval of index
-    for (int i = 1; i < lod_len; ++i) {
-      if (index < in_lod[i]) {
-        end_idx = in_lod[i];
-        break;
-      }
-    }
-    for (size_t i = 0; i < win_size; ++i) {
-      int word_pos = index + i;
-      out_data[index * win_size + i] =
-          word_pos < end_idx ? in_data[word_pos] : pad_value;
-    }
-  }
-}
-
-template <typename T>
-class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int win_size = context.Attr<int>("win_size");
-    int pad_value = context.Attr<int>("pad_value");
-
-    auto in_dims = in->dims();
-    auto in_lod = in->lod();
-
-    PADDLE_ENFORCE_EQ(
-        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
-        "The actual input data's size mismatched with LoD information.");
-
-    /* Generate enumerate sequence set */
-    auto stream = context.cuda_device_context().stream();
-    auto lod0 = in_lod[0];
-    auto in_len = in->numel();
-    auto in_data = in->data<T>();
-    out->Resize({in_dims[0], win_size});
-    auto out_data = out->mutable_data<T>(context.GetPlace());
-    // Copy LoD to GPU
-    const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace());
-    // Calc output tensor
-    CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
-                 PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data);
-    out->set_lod(in->lod());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    sequence_enumerate,
-    paddle::operators::SequenceEnumerateOpCUDAKernel<int32_t>,
-    paddle::operators::SequenceEnumerateOpCUDAKernel<int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
deleted file mode 100644
index 6c5a2e968086bd4c3d0d56ae5c81a09dda91ab86..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
+++ /dev/null
@@ -1,77 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-class SequenceEnumerateKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int win_size = context.Attr<int>("win_size");
-    auto pad_value = static_cast<T>(context.Attr<int>("pad_value"));
-
-    auto in_dims = in->dims();
-    auto lod0 = in->lod()[0];
-    PADDLE_ENFORCE_EQ(
-        static_cast<uint64_t>(in_dims[0]), lod0.back(),
-        "The actual input data's size mismatched with LoD information.");
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(), 2UL,
-        "Input(X) of SequenceEnumerate operator's rank should be 2.");
-    PADDLE_ENFORCE_EQ(in_dims[1], 1,
-                      "Input(X) of SequenceEnumerate operator's 2nd "
-                      "dimension should be 1.");
-
-    // Generate enumerate sequence set
-    auto in_data = in->data<T>();
-    out->Resize({in_dims[0], win_size});
-    out->set_lod(in->lod());
-    auto out_data = out->mutable_data<T>(context.GetPlace());
-    for (size_t i = 0; i < lod0.size() - 1; ++i) {
-      if (lod0[i] == lod0[i + 1]) continue;
-      int start = lod0[i];
-      int end = lod0[i + 1];
-
-      int copy_size = win_size < end - start + 1 ? win_size : end - start + 1;
-      int mid = end + 1 - copy_size;
-      int pad_num = win_size - copy_size;
-      copy_size *= sizeof(T);
-      for (int idx = start; idx < mid; ++idx) {
-        std::memcpy(out_data, in_data + idx, copy_size);
-        out_data += win_size;
-      }
-      for (int idx = mid; idx < end; ++idx) {
-        copy_size -= sizeof(T);
-        pad_num++;
-        std::memcpy(out_data, in_data + idx, copy_size);
-        T* pdata = out_data + copy_size / sizeof(T);
-        for (int i = 0; i < pad_num; ++i) {
-          pdata[i] = pad_value;
-        }
-        out_data += win_size;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc
deleted file mode 100644
index ddda80ee0824e261b0d737f86e03866d5fdfd77a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class SequenceEraseOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceErase operator should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceErase operator should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1,
-                   "Input(X) of SequenceEraseOp should be a 2-D LoDTensor "
-                   "with the 2nd dimension equal to 1.");
-    ctx->SetOutputDim("Out", x_dims);
-  }
-};
-
-class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(2-D LoDTensor with the 2nd dim. equal to 1) "
-             "Input LoDTensor of SequenceEraseOp.");
-    AddOutput("Out",
-              "(2-D LoDTensor with the 2nd dim. equal to 1) "
-              "Output LoDTensor of SequenceEraseOp.");
-    AddAttr<std::vector<int>>("tokens",
-                              "(vector<int>) Tokens need to be erased from "
-                              "input sequences.");
-    AddComment(R"DOC(
-Sequence Erase Operator.
-
-Sequence erase operator erases tokens specified by Attr(tokens) from the input 
-sequences Input(X), and outputs the remaining data and modifies the LoD 
-information at the same time. For example, given a 2-D LoDTensor
-
-    X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T
-
-with lod = [[0, 3, 6, 10]], there are three sequences in the input:
-   
-     X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T.
-
-If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing 
-operation, the three sequences become
-
-    X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T.
-
-Hence the LoDTensor Output(Out) should be
-
-    Out = [[6, 1, 9, 6, 1, 0, 1]]^T,
-
-with lod = [[0, 1, 3, 7]].
-
-An example usage for this operator is to remove the special tokens when 
-computing the edit distance between two strings, such as blank, start token, 
-and end token.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp,
-                             ops::SequenceEraseOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    sequence_erase,
-    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int32_t>,
-    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
deleted file mode 100644
index 0401c22c92e1a9be35c2ff6b2c7e95924afe3f1b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
-#include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-__global__ void LabelErasedIdx(const T* in_dat, const int64_t in_len,
-                               const int* tokens, const size_t tokens_len,
-                               size_t* num_erased) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < in_len) {
-    for (size_t i = 0; i < tokens_len; ++i) {
-      if (in_dat[index] == tokens[i]) {
-        num_erased[index + 1] = 1;
-        break;
-      }
-    }
-  }
-}
-
-__global__ void GetOutLod(const size_t* num_erased, const size_t* in_lod,
-                          const size_t lod_len, size_t* out_lod0) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < lod_len) {
-    out_lod0[index] = in_lod[index] - num_erased[in_lod[index]];
-  }
-}
-
-template <typename T>
-__global__ void SetOutput(const T* in_dat, const int64_t in_len,
-                          const size_t* num_erased, T* out_dat) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < in_len) {
-    if (num_erased[index] == num_erased[index + 1]) {
-      out_dat[index - num_erased[index]] = in_dat[index];
-    }
-  }
-}
-
-template <typename T>
-class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<LoDTensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
-
-    auto lod = in->lod();
-    PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(),
-                      "The actual size mismatches with the LoD information.");
-    auto tokens = ctx.Attr<std::vector<int>>("tokens");
-    auto in_len = in->numel();
-    auto in_dat = in->data<T>();
-    // Copy tokens to GPU
-    thrust::device_vector<int> dev_tokens(tokens.begin(), tokens.end());
-    int* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data());
-
-    // Count number of elements to be erased
-    thrust::device_vector<size_t> num_erased(in_len + 1, 0);
-    size_t* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data());
-    auto stream = ctx.cuda_device_context().stream();
-    LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
-                     PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        in_dat, in_len, dev_tokens_ptr, tokens.size(), num_erased_ptr);
-    thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(),
-                           num_erased.begin() + 1);
-
-    // Copy LoD to GPU
-    auto last_lod = lod[lod.size() - 1];
-    auto lod_len = last_lod.size();
-    const size_t* dev_in_lod_ptr = last_lod.CUDAData(ctx.GetPlace());
-    // Calc output LoD
-    thrust::device_vector<size_t> dev_out_lod(lod_len);
-    size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
-    GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
-                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
-    // Set LoD for output
-    std::vector<size_t> out_last_lod(dev_out_lod.begin(), dev_out_lod.end());
-    framework::LoD out_lod;
-    for (size_t i = 0; i < lod.size() - 1; ++i) {
-      out_lod.push_back(lod[i]);
-    }
-    out_lod.push_back(out_last_lod);
-    out->set_lod(out_lod);
-
-    // Set output
-    out->Resize({static_cast<int64_t>(out_last_lod.back()), 1});
-    auto out_dat = out->mutable_data<T>(ctx.GetPlace());
-    SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
-                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len,
-                                                      num_erased_ptr, out_dat);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(sequence_erase,
-                        paddle::operators::SequenceEraseOpCUDAKernel<int32_t>,
-                        paddle::operators::SequenceEraseOpCUDAKernel<int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h
deleted file mode 100644
index af5a64dce5d2484ad9006f0c30e8851746794f38..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class SequenceEraseKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::LoDTensor>("X");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-
-    auto lod = in->lod();
-    PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(),
-                      "The actual size mismatches with the LoD information.");
-    auto tokens = ctx.Attr<std::vector<int>>("tokens");
-    auto in_len = in->numel();
-    auto in_dat = in->data<T>();
-    auto last_lod = lod[lod.size() - 1];
-
-    std::vector<size_t> num_erased(in_len + 1, 0);
-    std::vector<size_t> out_last_lod(1, 0);
-    for (size_t i = 0; i < last_lod.size() - 1; ++i) {
-      size_t num_out = 0;
-      for (auto j = last_lod[i] + 1; j <= last_lod[i + 1]; ++j) {
-        num_erased[j] = num_erased[j - 1];
-        if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) !=
-            tokens.end()) {
-          num_erased[j] += 1;
-        } else {
-          num_out += 1;
-        }
-      }
-      out_last_lod.push_back(out_last_lod.back() + num_out);
-    }
-
-    auto out_len = in_len - num_erased[in_len];
-    out->Resize({static_cast<int64_t>(out_len), 1});
-    auto out_dat = out->mutable_data<T>(ctx.GetPlace());
-
-    for (int64_t i = 0; i < in_len; ++i) {
-      if (num_erased[i] == num_erased[i + 1]) {
-        out_dat[i - num_erased[i]] = in_dat[i];
-      }
-    }
-    framework::LoD out_lod;
-    for (size_t i = 0; i < lod.size() - 1; ++i) {
-      out_lod.push_back(lod[i]);
-    }
-    out_lod.push_back(out_last_lod);
-    out->set_lod(out_lod);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc
deleted file mode 100644
index e1f6c3e3d599340acfa9bb5b47017b003721e4a3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-using framework::LoDTensor;
-
-class SequenceExpandAsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceExpandAsOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of SequenceExpandAsOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceExpandAsOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto out_dims = x_dims;
-
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "Dimension number of Input(X) should be at least 2.");
-
-    if (ctx->IsRuntime()) {
-      framework::Variable* x_var =
-          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
-      framework::Variable* y_var =
-          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
-
-      auto& x_dim = x_var->Get<LoDTensor>().dims();
-      auto& y_lod = y_var->Get<LoDTensor>().lod();
-
-      PADDLE_ENFORCE_EQ(y_lod.size(), 1,
-                        "Level number of Input(Y)'s lod should be 1.");
-
-      PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dim[0]), y_lod[0].size() - 1,
-                        "The first dimension of Input(X) should be equal "
-                        "to the size of Input(Y)'s 0 level lod.");
-
-      int64_t out_first_dim = 0;
-      if (y_lod[0].size() <= 1) {
-        out_first_dim = x_dims[0];
-      } else {
-        for (size_t i = 1; i < y_lod[0].size(); ++i) {
-          out_first_dim += (y_lod[0][i] - y_lod[0][i - 1]);
-        }
-      }
-      out_dims[0] = out_first_dim;
-    } else {
-      out_dims[0] = -1;
-    }
-
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("Y", /*->*/ "Out");
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor whose lod "
-             "level is at most 1.");
-    AddInput("Y",
-             "(LoDTensor, default LoDTensor<float>) Referred LoDTensor whose "
-             "lod (specified level) is referred by Input(X).");
-    AddOutput("Out",
-              "(LodTensor, default LoDTensor<float>) Output LoDTensor which is "
-              "generated from Input(X) by referring lod of Input(Y).");
-    AddComment(R"DOC(
-Sequence Expand As Operator.
-
-This operator expands `X` according to the zeroth level lod of `Y`. Current
-implementation requires the level number of Input(Y)'s lod should be 1, and
-the first dimension of Input(X) should be equal to the size of Input(Y)'s zeroth
-level lod, and lod of Input(X) is not considered.
-
-Following are cases to better explain how this works:
-
-Case 1:
-
-Given a 1-level LoDTensor input(X)
-    X.data = [[a], [b], [c], [d]]
-    X.dims = [4, 1]
-and input(Y)
-    Y.lod = [[0, 3, 6, 7, 8]]
-ref_level: 0
-then we get 1-level LoDTensor
-    Out.lod =  [[0,            3,              6,  7,  8]]
-    Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
-    Out.dims = [8, 1]
-
-Case 2:
-
-Given a common Tensor input(X)
-    X.data = [[a, b], [c, d], [e, f]]
-    X.dims = [3, 2]
-and input(Y)
-    Y.lod = [[0, 2, 3, 6]]
-ref_level: 0
-then we get a common LoDTensor
-    Out.lod =  [[0,             2,     3,                    6]]
-    Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]]
-    Out.dims = [6, 2]
-
-)DOC");
-  }
-};
-
-class SequenceExpandAsOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_grad_name = framework::GradVarName("X");
-
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-      ctx->ShareLoD("X", x_grad_name);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-
-class SequenceExpandAsOpGradOpDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sequence_expand_as_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Y", Input("Y"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    SequenceExpandAsOpNoNeedBufferVarsInference, "Y");
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    SequenceExpandAsGradOpNoNeedBufferVarsInference, "X", "Y");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_expand_as, ops::SequenceExpandAsOp,
-                  ops::SequenceExpandAsOpMaker,
-                  ops::SequenceExpandAsOpGradOpDescMaker,
-                  ops::SequenceExpandAsOpNoNeedBufferVarsInference);
-REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad,
-                  ops::SequenceExpandAsGradOpNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(
-    sequence_expand_as,
-    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    sequence_expand_as_grad,
-    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext,
-                                    int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
deleted file mode 100644
index 998bf82ab1ddcd815491de95a3f7cf987036ee65..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-static __global__ void sequence_expand_as_kernel(const T *in_data,
-                                                 const size_t *expand_offset,
-                                                 const size_t src_hight,
-                                                 const size_t src_widht,
-                                                 T *out_data) {
-  for (int h_id = blockIdx.x; h_id < src_hight; h_id += gridDim.x) {
-    int span = expand_offset[h_id + 1] - expand_offset[h_id];
-    if (span == 0) continue;
-    const T *src = in_data + h_id * src_widht;
-    for (int w_id = threadIdx.x; w_id < src_widht; w_id += blockDim.x) {
-      T ele = src[w_id];
-      int offset = expand_offset[h_id] * src_widht;
-      for (int k = 0; k < span; ++k) {
-        out_data[offset + k * src_widht + w_id] = ele;
-      }
-    }
-  }
-}
-
-template <typename T>
-static __global__ void sequence_expand_as_grad_kernel(
-    const T *dout_data, const size_t *expand_offset, const size_t dst_hight,
-    const size_t dst_width, T *dx_data) {
-  for (int h_id = blockIdx.x; h_id < dst_hight; h_id += gridDim.x) {
-    T *dst = dx_data + h_id * dst_width;
-    int span = expand_offset[h_id + 1] - expand_offset[h_id];
-
-    for (int w_id = threadIdx.x; w_id < dst_width; w_id += blockDim.x) {
-      T result = 0;
-      for (int k = 0; k < span; ++k) {
-        int offset = (expand_offset[h_id] + k) * dst_width;
-        const T *src = dout_data + offset;
-        result += src[w_id];
-      }
-      dst[w_id] = result;
-    }
-  }
-}
-
-template <typename T>
-struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
-  void operator()(
-      const platform::CUDADeviceContext &context, const LoDTensor &x,
-      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-      LoDTensor *out) {
-    int hight = x.dims()[0];
-    int width = framework::product(x.dims()) / hight;
-
-    const int kThreadsPerBlock = 1024;
-    int thread_x = kThreadsPerBlock;
-    if (width < kThreadsPerBlock) {  // block_cols is aligned by 32.
-      thread_x = ((width + 31) >> 5) << 5;
-    }
-
-    int max_threads = context.GetMaxPhysicalThreadCount();
-    int block_x = std::max(max_threads / thread_x, 1);
-
-    dim3 block_size(thread_x);
-    dim3 grid_size(block_x);
-    sequence_expand_as_kernel<<<grid_size, block_size, 0, context.stream()>>>(
-        x.data<T>(), ref_lod.CUDAData(context.GetPlace()), hight, width,
-        out->mutable_data<T>(context.GetPlace()));
-  }
-};
-
-template <typename T>
-struct SequenceExpandAsGradFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext &context,
-                  const LoDTensor &dout,
-                  const framework::Vector<size_t> &ref_lod, /*expand based lod*/
-                  LoDTensor *dx) {
-    int hight = dx->dims()[0];
-    int width = framework::product(dx->dims()) / hight;
-
-    const int kThreadsPerBlock = 1024;
-    int thread_x = kThreadsPerBlock;
-    if (width < kThreadsPerBlock) {  // block_cols is aligned by 32.
-      thread_x = ((width + 31) >> 5) << 5;
-    }
-
-    int max_threads = context.GetMaxPhysicalThreadCount();
-    int block_x = std::max(max_threads / thread_x, 1);
-
-    dim3 block_size(thread_x);
-    dim3 grid_size(block_x);
-    sequence_expand_as_grad_kernel<<<grid_size, block_size, 0,
-                                     context.stream()>>>(
-        dout.data<T>(), ref_lod.CUDAData(context.GetPlace()), hight, width,
-        dx->mutable_data<T>(context.GetPlace()));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sequence_expand_as,
-    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    sequence_expand_as_grad,
-    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext,
-                                    double>,
-    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
deleted file mode 100644
index 42c90d01c05e369efc276498aa94debb367a6bfa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <numeric>  // std::iota
-#include <sstream>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-struct SequenceExpandFunctor {
-  void operator()(
-      const DeviceContext &ctx, const framework::LoDTensor &x,
-      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-      framework::LoDTensor *out);
-};
-
-template <typename DeviceContext, typename T>
-struct SequenceExpandAsGradFunctor {
-  void operator()(
-      const DeviceContext &ctx, const framework::LoDTensor &dout,
-      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-      framework::LoDTensor *dx);
-};
-
-template <typename T>
-struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
-  void operator()(
-      const platform::CPUDeviceContext &context, const framework::LoDTensor &x,
-      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-      framework::LoDTensor *out) {
-    int64_t hight = x.dims()[0];
-    int64_t width = framework::product(x.dims()) / hight;
-
-    const T *in_data = x.data<T>();
-    T *out_data = out->mutable_data<T>(context.GetPlace());
-
-    for (int h_id = 0; h_id < hight; ++h_id) {
-      size_t span = ref_lod[h_id + 1] - ref_lod[h_id];
-      if (span == 0) continue;
-      const T *src = in_data + h_id * width;
-      for (int64_t w_id = 0; w_id < width; ++w_id) {
-        T ele = src[w_id];
-        size_t offset = ref_lod[h_id] * width;
-        for (size_t k = 0; k < span; ++k) {
-          out_data[offset + k * width + w_id] = ele;
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceExpandAsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *x = context.Input<framework::LoDTensor>("X");
-    auto *y = context.Input<framework::LoDTensor>("Y");
-    auto *out = context.Output<framework::LoDTensor>("Out");
-
-    auto &y_lod = y->lod();
-    PADDLE_ENFORCE_EQ(y_lod.size(), 1, "LoD of Y should be 1.");
-    PADDLE_ENFORCE_GT(y_lod[0].size(), 1, ".");
-
-    out->mutable_data<T>(context.GetPlace());
-
-    auto &dev_ctx = context.template device_context<DeviceContext>();
-    SequenceExpandFunctor<DeviceContext, T> seq_espand_functor;
-    seq_espand_functor(dev_ctx, *x, y_lod[0], out);
-  }
-};
-
-/*
- *Given Grad(Out)
- *
- *    Grad(Out).lod = [[0,              3,            6]]
- *    Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
- * Then
- *    Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)]
- *                 = [0.6, 1.5]
- *    Grad(X).lod = Input(X).lod
- *
- * */
-template <typename T>
-struct SequenceExpandAsGradFunctor<platform::CPUDeviceContext, T> {
-  void operator()(
-      const platform::CPUDeviceContext &context,
-      const framework::LoDTensor &dout,
-      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-      framework::LoDTensor *dx) {
-    int64_t hight = dx->dims()[0];
-    int64_t width = framework::product(dx->dims()) / hight;
-
-    const T *dout_data = dout.data<T>();
-    T *dx_data = dx->mutable_data<T>(context.GetPlace());
-
-    for (int64_t h_id = 0; h_id < hight; ++h_id) {
-      T *dst = dx_data + h_id * width;
-      size_t span = ref_lod[h_id + 1] - ref_lod[h_id];
-      for (int64_t w_id = 0; w_id < width; ++w_id) {
-        T result = 0;
-        for (size_t k = 0; k < span; ++k) {
-          size_t offset = (ref_lod[h_id] + k) * width;
-          result += dout_data[offset + w_id];
-        }
-        dst[w_id] = result;
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceExpandAsGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *g_out =
-        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto *y = context.Input<framework::LoDTensor>("Y");
-    auto *g_x =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
-
-    g_x->mutable_data<T>(context.GetPlace());
-
-    SequenceExpandAsGradFunctor<DeviceContext, T> functor;
-    functor(context.template device_context<DeviceContext>(), *g_out,
-            y->lod()[0], g_x);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
deleted file mode 100644
index b7c0420636ab60e8a3e0a9332cbd3858aacda1b0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
+++ /dev/null
@@ -1,260 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using framework::LoDTensor;
-
-class SequenceExpandOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceExpandOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of SequenceExpandOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceExpandOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto out_dims = x_dims;
-    int ref_level = ctx->Attrs().Get<int>("ref_level");
-
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "Dimension number of Input(X) should be at least 2.");
-
-    if (ctx->IsRuntime()) {
-      framework::Variable* x_var =
-          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
-      framework::Variable* y_var =
-          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
-
-      auto& x_lod = x_var->Get<LoDTensor>().lod();
-      auto& y_lod = y_var->Get<LoDTensor>().lod();
-
-      PADDLE_ENFORCE_LE(x_lod.size(), 1UL,
-                        "Level number of Input(X)'s lod should not be "
-                        "greater than 1.");
-      PADDLE_ENFORCE_GT(y_lod.size(), 0UL,
-                        "Level number of Input(Y)'s lod should be "
-                        "greater than 0.");
-      PADDLE_ENFORCE(
-          ref_level == -1 ||
-              (ref_level >= 0 && ref_level < static_cast<int>(y_lod.size())),
-          "Invlid `ref_level`, which should be either equal to -1 "
-          "or in [0, %d)",
-          y_lod.size());
-
-      if (ref_level == -1) ref_level = y_lod.size() - 1;
-
-      if (x_lod.size() > 0) {
-        PADDLE_ENFORCE(x_lod[0].size() == y_lod[ref_level].size(),
-                       "Level number of Input(X)'s lod could be 0. Otherwise "
-                       "size of Input(X)'s first level lod should be equal to "
-                       "size of Input(Y)'s referred level lod.");
-      } else {
-        PADDLE_ENFORCE_EQ(x_dims[0],
-                          static_cast<int64_t>(y_lod[ref_level].size()) - 1,
-                          "When Input(X)'s lod is null, the dims[0] of "
-                          "Input(X) should match the "
-                          "size of Input(Y)'s referred level lod.");
-      }
-
-      int64_t out_first_dim = 0;
-      if (y_lod[ref_level].size() <= 1) {
-        out_first_dim = x_dims[0];
-      } else {
-        for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
-          int x_seq_len = 1;
-          if (x_lod.size() == 1) {
-            x_seq_len = x_lod[0][i] - x_lod[0][i - 1];
-          }
-          out_first_dim +=
-              (y_lod[ref_level][i] - y_lod[ref_level][i - 1]) * x_seq_len;
-        }
-      }
-      out_dims[0] = out_first_dim;
-    } else {
-      out_dims[0] = -1;
-    }
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor whose lod "
-             "level is at most 1.");
-    AddInput("Y",
-             "(LoDTensor, default LoDTensor<float>) Referred LoDTensor whose "
-             "lod (specified level) is referred by Input(X).");
-    AddOutput("Out",
-              "(LodTensor, default LoDTensor<float>) Output LoDTensor which is "
-              "generated from Input(X) by referring lod of Input(Y).");
-    AddAttr<int>("ref_level", "Specify lod level of Input(Y).").SetDefault(-1);
-    AddComment(R"DOC(
-Sequence Expand Operator.
-
-This operator expands `X` according to specified level lod of `Y`. Current
-implementation constaints that lod level of `X` should be at most 1. Attribute
-`ref_level` is used to specify which level lod of `Y` is referred to expand `X`.
-If set `ref_level` to -1, then last level lod of `Y` would be referred.
-Please note, rank of `X` should be at least 2, when the rank exceeds 2, `X`
-would be viewed as a 2-D tensor.
-
-Following are cases to better explain how this works:
-
-Case 1:
-
-Given a 1-level LoDTensor input(X)
-    X.lod =  [[0,   2,        4]]
-    X.data = [[a], [b], [c], [d]]
-    X.dims = [4, 1]
-and input(Y)
-    Y.lod = [[0,    2,    4],
-             [0, 3, 6, 7, 8]]
-ref_level: 0
-then we get 1-level LoDTensor
-    Out.lod =  [[0,   2,        4,        6,        8]]
-    Out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
-    Out.dims = [8, 1]
-
-Case 2:
-
-Given 1-level LoDTensor input(X)
-    X.lod =  [[0,   1,        4]]
-    X.data = [[a], [b], [c], [d]]
-    X.dims = [4, 1]
-and input(Y)
-    Y.lod = [[0,    2,    4],
-             [0, 3, 6, 6, 8]]
-ref_level: 0
-then we get 1-level LoDTensor
-    Out.lod =  [[0,   1,   2,        5,             8]]
-    Out.data = [[a], [a], [b], [c], [d], [b], [c], [d]]
-    Out.dims = [8, 1]
-
-Case 3:
-
-Given a common Tensor input(X)
-    X.data = [[a], [b], [c]]
-    X.dims = [3, 1]
-and input(Y)
-    Y.lod = [[0, 2, 3, 6]]
-ref_level: -1
-then we get a common Tensor
-    Out.data = [[a], [a], [b], [c], [c], [c]]
-    Out.dims = [6, 1]
-
-Case 4:
-
-Given a common Tensor input(X)
-    X.data = [[a, b], [c, d], [e, f]]
-    X.dims = [3, 2]
-and input(Y)
-    Y.lod = [[0, 2, 3, 6]]
-ref_level: 0
-then we get a common LoDTensor
-    Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]]
-    Out.dims = [6, 2]
-
-)DOC");
-  }
-};
-
-class SequenceExpandOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_grad_name = framework::GradVarName("X");
-
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-
-class SequenceExpandOpGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sequence_expand_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Y", Input("Y"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SequenceExpandOpNoNeedBufferVarsInference,
-                                      "Y");
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    SequenceExpandGradOpNoNeedBufferVarsInference, "X", "Y");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp,
-                  ops::SequenceExpandOpMaker,
-                  ops::SequenceExpandOpGradDescMaker,
-                  ops::SequenceExpandOpNoNeedBufferVarsInference);
-REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad,
-                  ops::SequenceExpandGradOpNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(
-    sequence_expand,
-    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    sequence_expand_grad,
-    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
deleted file mode 100644
index 888d1a12e6751eeb91f0af04b50cf6d5bea74162..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-__global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
-                                       const size_t* ref_lod,
-                                       const size_t* offset,
-                                       const size_t lod_size,
-                                       /* default=1,
-                                          the instance length*/
-                                       const int x_item_length, T* out_data) {
-  int bid = blockIdx.x;
-  if (bid >= lod_size - 1) return;
-
-  int x_item_count = x_lod[bid + 1] - x_lod[bid];
-  int repeats = ref_lod[bid + 1] - ref_lod[bid];
-  int out_offset = static_cast<int>(offset[bid]);
-  int x_offset = x_lod[bid];
-  for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
-    for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
-      for (int tid_x = threadIdx.x; tid_x < x_item_length;
-           tid_x += blockDim.x) {
-        out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length +
-                 tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x];
-      }
-    }
-  }
-}
-
-template <typename T>
-__global__ void sequence_expand_grad_kernel(
-    const T* dout_data, const size_t* ref_lod, const size_t* dx_lod,
-    const size_t* offset, const size_t lod_size,
-    /* default=1,
-       the instance length*/
-    const int x_item_length, T* dx_data) {
-  int bid = blockIdx.x;
-  if (bid >= lod_size - 1) return;
-  int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
-  int repeats = ref_lod[bid + 1] - ref_lod[bid];
-  int out_offset = static_cast<int>(offset[bid]);
-  int x_offset = dx_lod[bid];
-
-  for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
-    for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
-      for (int tid_x = threadIdx.x; tid_x < x_item_length;
-           tid_x += blockDim.x) {
-        platform::CudaAtomicAdd(
-            &dx_data[(x_offset + tid_y) * x_item_length + tid_x],
-            dout_data[(out_offset + tid_z * x_item_count + tid_y) *
-                          x_item_length +
-                      tid_x]);
-      }
-    }
-  }
-}
-
-void GetOutputOffset(const framework::Vector<size_t>& x_lod,
-                     const framework::Vector<size_t>& ref_lod,
-                     framework::Vector<size_t>* out_offset) {
-  size_t offset = 0;
-  int lod_size = static_cast<int>(x_lod.size());
-  for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
-    (*out_offset)[i] = offset;
-    if (i < lod_size - 1) {
-      offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
-    }
-  }
-}
-
-template <typename T>
-static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context,
-                              const LoDTensor& x, LoDTensor* out,
-                              const framework::Vector<size_t>& x_lod,
-                              const framework::Vector<size_t>& ref_lod,
-                              bool do_copy) {
-  auto out_data = out->data<T>();
-  auto x_data = x.data<T>();
-
-  auto& gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
-
-  int x_item_length = x.numel() / x.dims()[0];
-  int out_offset = 0;
-  int num_copys = 0;
-  for (size_t i = 1; i < ref_lod.size(); ++i) {
-    int repeat_num = ref_lod[i] - ref_lod[i - 1];
-    int x_start = x_lod[i - 1];
-    int x_end = x_lod[i];
-    int x_seq_len = x_end - x_start;
-    if (repeat_num > 0) {
-      if (do_copy) {
-        int out_start = out_offset;
-        if (out->lod().size() == 1) {
-          out_start = out->lod()[0][out_offset];
-        }
-        for (int j = 0; j < repeat_num; j++) {
-          for (int k = 0; k < x_seq_len; k++) {
-            memory::Copy(
-                gpu_place,
-                out_data + (out_start + j * x_seq_len + k) * x_item_length,
-                gpu_place, x_data + (x_start + k) * x_item_length,
-                sizeof(T) * x_item_length, context.stream());
-          }
-        }
-      } else {
-        num_copys += repeat_num * x_seq_len;
-      }
-    }
-    out_offset += repeat_num;
-  }
-  return num_copys;
-}
-
-template <typename T>
-struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
-  void operator()(
-      const platform::CUDADeviceContext& context, const LoDTensor& x,
-      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-      LoDTensor* out) {
-    int num_copys =
-        ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
-    // Sometimes direct copies will be faster, this maybe need deeply analysis.
-    if (num_copys < 5) {
-      ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
-    } else {
-      int x_item_length = x.numel() / x.dims()[0];
-      size_t x_lod_size = x_lod.size();
-      framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
-      GetOutputOffset(x_lod, ref_lod, &out_offset);
-
-      for (size_t i = 0; i < x_lod_size; ++i) {
-        out_offset[x_lod_size + i] = x_lod[i];
-      }
-      for (size_t i = 0; i < ref_lod.size(); ++i) {
-        out_offset[2 * x_lod_size + i] = ref_lod[i];
-      }
-
-      const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
-      const size_t* x_lod_data = out_offset_data + x_lod_size;
-      const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
-
-      int thread_x =
-          std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
-      int thread_y = 16;
-      int thread_z = 1024 / thread_x / thread_y;
-      int block_x = static_cast<int>(ref_lod.size());
-      dim3 block_size(thread_x, thread_y, thread_z);
-      dim3 grid_size(block_x, 1);
-
-      sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
-          x.data<T>(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size,
-          x_item_length, out->mutable_data<T>(context.GetPlace()));
-    }
-  }
-};
-
-template <typename T>
-struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const LoDTensor& dout,
-                  const framework::Vector<size_t>& x_lod, /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand based lod*/
-                  LoDTensor* dx) {
-    int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
-    framework::Vector<size_t> out_offset(x_lod.size());
-    GetOutputOffset(x_lod, ref_lod, &out_offset);
-
-    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
-    int thread_y = 16;
-    int thread_z = 1024 / thread_x / thread_y;
-    int block_x = static_cast<int>(ref_lod.size());
-    dim3 block_size(thread_x, thread_y, thread_z);
-    dim3 grid_size(block_x, 1);
-    sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
-        dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
-        x_lod.CUDAData(context.GetPlace()),
-        out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
-        dx->mutable_data<T>(context.GetPlace()));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sequence_expand,
-    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    sequence_expand_grad,
-    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
deleted file mode 100644
index fac63f3fa0791ca04b5743891ddb829cb9c448fa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
+++ /dev/null
@@ -1,218 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <numeric>  // std::iota
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-struct SequenceExpandFunctor {
-  void operator()(
-      const DeviceContext& ctx, const LoDTensor& x,
-      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-      LoDTensor* out);
-};
-
-template <typename DeviceContext, typename T>
-struct SequenceExpandGradFunctor {
-  void operator()(
-      const DeviceContext& ctx, const LoDTensor& dout,
-      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-      LoDTensor* dx);
-};
-
-template <typename T>
-struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
-  void operator()(
-      const platform::CPUDeviceContext& context, const LoDTensor& x,
-      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-      LoDTensor* out) {
-    int out_offset = 0;
-    int x_item_length = x.numel() / x.dims()[0];
-    auto out_data = out->data<T>();
-    auto x_data = x.data<T>();
-    for (size_t i = 1; i < ref_lod.size(); ++i) {
-      int repeat_num = ref_lod[i] - ref_lod[i - 1];
-      int x_start = x_lod[i - 1];
-      int x_end = x_lod[i];
-      int x_seq_len = x_end - x_start;
-      if (repeat_num > 0) {
-        int out_start = out_offset;
-        if (out->lod().size() == 1) {
-          out_start = out->lod()[0][out_offset];
-        }
-        for (int j = 0; j < repeat_num; j++) {
-          for (int k = 0; k < x_seq_len; k++) {
-            for (int l = 0; l < x_item_length; l++) {
-              out_data[(out_start + j * x_seq_len + k) * x_item_length + l] =
-                  x_data[(x_start + k) * x_item_length + l];
-            }
-          }
-        }
-      }
-      out_offset += repeat_num;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceExpandKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<LoDTensor>("X");
-    auto* y = context.Input<LoDTensor>("Y");
-    auto* out = context.Output<LoDTensor>("Out");
-
-    int ref_level = context.Attr<int>("ref_level");
-    auto& x_lod = x->lod();
-    auto& y_lod = y->lod();
-
-    if (ref_level == -1) ref_level = y_lod.size() - 1;
-
-    out->mutable_data<T>(context.GetPlace());
-
-    if (y_lod[ref_level].size() <= 1) {
-      framework::TensorCopy(*x, context.GetPlace(), out);
-      return;
-    }
-
-    // x lod level is at most 1.
-    framework::Vector<size_t> out_lod;
-    if (x_lod.size() == 1) {
-      out_lod.push_back(0);
-      int out_offset = 0;
-      for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
-        int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
-        int x_start = x_lod[0][i - 1];
-        int x_end = x_lod[0][i];
-        int x_seq_len = x_end - x_start;
-        for (int j = 0; j < repeat_num; ++j) {
-          out_lod.push_back(out_lod.back() + x_seq_len);
-          out_offset++;
-        }
-      }
-      // write lod to out if x has lod
-      auto& ref_lod = *out->mutable_lod();
-      ref_lod[0] = out_lod;
-    }
-    framework::Vector<size_t> ref_x_lod;
-    if (x->lod().size() == 1) {
-      ref_x_lod = x->lod()[0];
-    } else {
-      // x_lod doesn't has lod, use fake x lod, level = 0
-      ref_x_lod.resize(x->dims()[0] + 1);
-      std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
-    }
-    SequenceExpandFunctor<DeviceContext, T> functor;
-    functor(context.template device_context<DeviceContext>(), *x, ref_x_lod,
-            y_lod[ref_level], out);
-  }
-};
-
-/*
- *Given Grad(Out)
- *
- *    Grad(Out).lod = [[0,                            2],
- *                     [0,              3,            6]]
- *    Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
- * Then
- *    Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)]
- *                 = [0.6, 1.5]
- *    Grad(X).lod = Input(X).lod
- *
- * */
-template <typename T>
-struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
-  void operator()(
-      const platform::CPUDeviceContext& context, const LoDTensor& dout,
-      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-      LoDTensor* dx) {
-    int dout_offset = 0;
-    for (size_t i = 1; i < ref_lod.size(); ++i) {
-      int repeat_num = ref_lod[i] - ref_lod[i - 1];
-      if (repeat_num > 0) {
-        int x_start = x_lod[i - 1];
-        int x_end = x_lod[i];
-        int x_seq_len = x_end - x_start;
-        if (x_seq_len == 0) continue;
-        auto dx_sub = dx->Slice(x_start, x_end);
-        dx_sub.Resize(flatten_to_1d(dx_sub.dims()));
-        int dout_end = dout_offset + repeat_num * x_seq_len;
-        auto dout_sub = dout.Slice(dout_offset, dout_end);
-        dout_sub.Resize({repeat_num, dx_sub.dims()[0]});
-        math::ColwiseSum<platform::CPUDeviceContext, T> col_sum;
-        col_sum(context, dout_sub, &dx_sub);
-        dout_offset += repeat_num * x_seq_len;
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceExpandGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* g_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* x = context.Input<LoDTensor>("X");
-    auto* y = context.Input<LoDTensor>("Y");
-    auto* g_x = context.Output<LoDTensor>(framework::GradVarName("X"));
-    int ref_level = context.Attr<int>("ref_level");
-
-    g_x->mutable_data<T>(context.GetPlace());
-    g_x->set_lod(x->lod());
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, g_x, static_cast<T>(0));
-
-    auto& y_lod = y->lod();
-    if (ref_level == -1) ref_level = y_lod.size() - 1;
-    // just copy the gradient
-    if (y_lod[ref_level].size() <= 1) {
-      framework::TensorCopy(*g_out, context.GetPlace(), g_x);
-      return;
-    }
-
-    framework::Vector<size_t> ref_x_lod;
-    framework::Vector<size_t> ref_lod = y_lod[ref_level];
-    if (x->lod().size() == 1) {
-      ref_x_lod = x->lod()[0];
-    } else {
-      // x_lod doesn't has lod, use fake x lod, level = 0
-      ref_x_lod.resize(x->dims()[0] + 1);
-      std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
-    }
-    SequenceExpandGradFunctor<DeviceContext, T> functor;
-    functor(context.template device_context<DeviceContext>(), *g_out, ref_x_lod,
-            ref_lod, g_x);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
deleted file mode 100644
index a7225adbf9fcafdff30ecf0f6c7a5f6a73c4f3e8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h"
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class SequenceMaskOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
-
-    int maxlen = ctx->Attrs().Get<int>("maxlen");
-    auto dim = framework::vectorize<int>(ctx->GetInputDim("X"));
-
-    if (ctx->HasInputs("MaxLenTensor")) {
-      dim.push_back(-1);
-    } else {
-      dim.push_back(maxlen > 0 ? maxlen : -1);
-    }
-    ctx->SetOutputDim("Y", framework::make_ddim(dim));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
-      const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "depth_tensor") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input tensor of sequence_mask op.");
-    AddOutput("Y", "The output mask of sequence_mask op.");
-    AddInput("MaxLenTensor",
-             "Max length tensor"
-             "have higher priority than maxlen attribute")
-        .AsDispensable();
-    AddAttr<int>("maxlen",
-                 "The maximum length of the sequence. If maxlen < 0, maxlen "
-                 "= max(Input(X)).")
-        .SetDefault(-1)
-        .AddCustomChecker([](const int& v) {
-          PADDLE_ENFORCE(v < 0 || v >= 1,
-                         "Attr(maxlen) must be less than 0 or larger than 1");
-        });
-    AddAttr<int>("out_dtype", "Output data type");
-    AddComment(R"DOC(
-SequenceMask Operator
-
-This operator outputs a Mask according to Input(X) and Attr(maxlen).
-Supposing Input(X) is a Tensor with shape [d_1, d_2, ..., d_n], the
-Output(Y) is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
-
-Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) 
-
-If maxlen < 0, maxlen = max(X)
-    )DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(sequence_mask, paddle::operators::SequenceMaskOp,
-                  paddle::operators::SequenceMaskOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    sequence_mask,
-    paddle::operators::SequenceMaskKernel<paddle::platform::CPUDeviceContext,
-                                          int>,
-    paddle::operators::SequenceMaskKernel<paddle::platform::CPUDeviceContext,
-                                          int64_t>,
-    paddle::operators::SequenceMaskKernel<paddle::platform::CPUDeviceContext,
-                                          float>,
-    paddle::operators::SequenceMaskKernel<paddle::platform::CPUDeviceContext,
-                                          double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu
deleted file mode 100644
index e963ce610e2c147d66087a1df59f67a04d899ccc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    sequence_mask,
-    paddle::operators::SequenceMaskKernel<paddle::platform::CUDADeviceContext,
-                                          int>,
-    paddle::operators::SequenceMaskKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t>,
-    paddle::operators::SequenceMaskKernel<paddle::platform::CUDADeviceContext,
-                                          float>,
-    paddle::operators::SequenceMaskKernel<paddle::platform::CUDADeviceContext,
-                                          double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
deleted file mode 100644
index abddc6859fe737ca610c694577c94e530803c931..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef __NVCC__
-#include <thrust/device_ptr.h>
-#include <thrust/functional.h>
-#include <thrust/reduce.h>
-#else
-#include <algorithm>
-#endif
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-template <typename Tx, typename Ty>
-struct SequenceMaskForRangeFunctor {
-  HOSTDEVICE SequenceMaskForRangeFunctor(const Tx *x, Ty *y, int maxlen)
-      : x_(x), y_(y), maxlen_(maxlen) {}
-
-  HOSTDEVICE void operator()(int y_idx) const {
-    int x_idx = y_idx / maxlen_;
-    int j = y_idx % maxlen_;
-    y_[y_idx] = static_cast<Ty>(j < x_[x_idx] ? 1 : 0);
-  }
-
- private:
-  const Tx *x_;
-  Ty *y_;
-  int maxlen_;
-};
-
-template <typename DeviceContext, typename Tx>
-struct SequenceMaskFunctor {
-  SequenceMaskFunctor(const DeviceContext &ctx, const Tx *x, Tensor *y,
-                      int limits, int maxlen)
-      : ctx_(ctx), x_(x), y_(y), limits_(limits), maxlen_(maxlen) {}
-
-  template <typename Ty>
-  void apply() const {
-    auto *y_data = y_->mutable_data<Ty>(ctx_.GetPlace());
-    platform::ForRange<DeviceContext> for_range(ctx_, limits_);
-    for_range(SequenceMaskForRangeFunctor<Tx, Ty>(x_, y_data, maxlen_));
-  }
-
- private:
-  const DeviceContext &ctx_;
-  const Tx *x_;
-  Tensor *y_;
-  int limits_;
-  int maxlen_;
-};
-
-template <typename DeviceContext, typename Tx>
-class SequenceMaskKernel : public framework::OpKernel<Tx> {
-  using Tensor = framework::LoDTensor;
-
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<Tensor>("X");
-    auto *y = ctx.Output<Tensor>("Y");
-    int maxlen = ctx.Attr<int>("maxlen");
-    if (ctx.HasInput("MaxLenTensor")) {
-      auto max_len_tensor = ctx.Input<Tensor>("MaxLenTensor");
-      PADDLE_ENFORCE(max_len_tensor != NULL, "MaxLenTensor is NULL");
-      if (platform::is_gpu_place(max_len_tensor->place())) {
-        framework::Tensor temp;
-        TensorCopySync(*max_len_tensor, platform::CPUPlace(), &temp);
-        maxlen = *temp.data<int32_t>();
-      } else {
-        maxlen = *max_len_tensor->data<int32_t>();
-      }
-
-      auto y_dim = framework::vectorize<int>(x->dims());
-      y_dim.push_back(maxlen);
-      y->Resize(framework::make_ddim(y_dim));
-
-      PADDLE_ENFORCE_GT(maxlen, 0,
-                        "MaxLenTensor value should be greater than 0");
-    }
-
-    auto *x_data = x->data<Tx>();
-    auto x_numel = x->numel();
-    if (maxlen < 0) {
-#ifdef __NVCC__
-      VLOG(10)
-          << "SequenceMaskOp on GPU may be slow when maxlen is not provided.";
-      maxlen = static_cast<int>(
-          thrust::reduce(thrust::device_pointer_cast(x_data),
-                         thrust::device_pointer_cast(x_data) + x_numel,
-                         static_cast<Tx>(0), thrust::maximum<Tx>()));
-#else
-      maxlen = static_cast<int>(*std::max_element(x_data, x_data + x_numel));
-#endif
-      auto y_dim = framework::vectorize<int>(x->dims());
-      y_dim.push_back(maxlen);
-      y->Resize(framework::make_ddim(y_dim));
-    }
-
-    auto out_dtype = static_cast<framework::proto::VarType::Type>(
-        ctx.Attr<int>("out_dtype"));
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    framework::VisitDataType(out_dtype,
-                             SequenceMaskFunctor<DeviceContext, Tx>(
-                                 dev_ctx, x_data, y, x_numel * maxlen, maxlen));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc
deleted file mode 100644
index fcc49096e2c48c264179e95133c9f9b4ec973e1f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc
+++ /dev/null
@@ -1,246 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class SequencePadOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of SequencePadOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("PadValue"), true,
-                      "Input(PadValue) of SequencePadOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of SequencePadOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Length"), true,
-                      "Output(Length) of SequencePadOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "The rank of Input(X) can't be less than 2.");
-    auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size());
-    auto pad_value_dims = ctx->GetInputDim("PadValue");
-    PADDLE_ENFORCE_EQ(pad_value_dims == framework::make_ddim({1}) ||
-                          pad_value_dims == time_step_dims,
-                      true,
-                      "The Input(PadValue) must be a scalar or a tensor whose "
-                      "shape equals to time steps in sequences");
-
-    int out_dim_0 = -1;
-
-    int padded_length = ctx->Attrs().Get<int>("padded_length");
-    if (ctx->IsRuntime()) {
-      // run time
-      framework::Variable* x_var =
-          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
-      const auto& x_lod = x_var->Get<LoDTensor>().lod();
-      PADDLE_ENFORCE_EQ(x_lod.empty(), false,
-                        "The Input(X) must hold lod info.");
-      const auto& x_lod_0 = x_lod[0];
-      PADDLE_ENFORCE_GE(x_lod_0.size(), 2,
-                        "The Input(X)'s lod info is corrupted.");
-      PADDLE_ENFORCE_EQ(
-          x_dims[0], static_cast<int64_t>(x_lod_0.back()),
-          "The Input(X)'s lod info mismatches the actual tensor shape.");
-
-      int seq_num = x_lod_0.size() - 1;
-      int max_seq_len = math::MaximumSequenceLength(x_lod_0);
-      if (padded_length == -1) {
-        padded_length = max_seq_len;
-      }
-      PADDLE_ENFORCE_GE(padded_length, max_seq_len,
-                        "The Attr(padded_length) must be -1 or an int greater "
-                        "than the length of the longest original sequence.");
-      out_dim_0 = seq_num;
-    } else {
-      // compile time
-      if (padded_length == -1) {
-        padded_length = 1;
-      }
-      framework::VarDesc* x_desc =
-          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]);
-      PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1);
-    }
-
-    std::vector<int> out_dims_vec{out_dim_0, padded_length};
-    std::vector<int> len_dims_vec{out_dim_0};
-    auto time_step_dims_vec = framework::vectorize<int>(time_step_dims);
-    out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(),
-                        time_step_dims_vec.end());
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
-    ctx->SetOutputDim("Length", framework::make_ddim(len_dims_vec));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, default LoDTensor<float>) Input variable which "
-             "should contain lod information.");
-    AddInput("PadValue",
-             "(LoDTensor), this Tensor holds values that will be fill into "
-             "padded steps. It can be a scalar or a tensor whose shape equals "
-             "to time steps in sequences. If it's a scalar, it will be "
-             "automatically broadcasted to the shape of time step.");
-    AddOutput(
-        "Out",
-        "(LoDTensor) The output vairable, which contains padded sequences.");
-    AddOutput(
-        "Length",
-        "(LoDTensor) The output vairable, which contains the actual length of "
-        "sequences before padding.");
-    AddAttr<int>(
-        "padded_length",
-        "The length of padded sequences. It can be setted to -1 or "
-        "any positive int. When it is -1, all sequences will be padded up to "
-        "the length of the longest one among them; when it a certain positive "
-        "value, it must be greater than the length of the longest original "
-        "sequence.")
-        .SetDefault(-1);
-    AddComment(R"DOC(
-      Sequence Pad Operator
-
-      This operator pads sequences in a same batch to a consistent length. 
-      The length is specified by attribute 'padded_length'. New elements, 
-      whose values are specified by input 'PadValue', will be appended to 
-      the end of each sequence, to make their final lengths consistent.
-
-      Following are cases to better explain how this works:
-
-      Case 1:
-
-      Given a 1-level LoDTensor input(X):
-          X.lod = [[0, 2,       5]]
-          X.data = [a, b, c, d, e]
-      and Input(PadValue):
-          PadValue.data = [0]
-      and attribite 'padded_length' = 4,
-      then we get LoDTensor:
-          Out.data = [[a, b, 0, 0], 
-                      [c, d, e, 0]]
-          Length.data = [2, 3]
-      
-      Case 2:
-
-      Given a 1-level LoDTensor input(X):
-          X.lod = [[0,               2,                           5]]
-          X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
-      and Input(PadValue):
-          PadValue.data = [0]
-      and attribite 'padded_length' = -1, which mean using the length 
-      of longest input sequence(3 in this case),
-      then we get LoDTensor:
-          Out.data = [[[a1, a2], [b1, b2], [0, 0]], 
-                      [[c1, c2], [d1, d2], [e1, e2]]]
-          Length.data = [2, 3]
- 
-      Case 3:
-
-      Given a 1-level LoDTensor input(X):
-          X.lod = [[0,               2,                           5]]
-          X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
-      and Input(PadValue):
-          PadValue.data = [p1, p2]
-      and attribite 'padded_length' = -1, which mean using the length 
-      of longest input sequence(3 in this case),
-      then we get LoDTensor:
-          Out.data = [[[a1, a2], [b1, b2], [p1, p2]], 
-                      [[c1, c2], [d1, d2], [e1, e2]]]
-          Length.data = [2, 3]
-
-    )DOC");
-  }
-};
-
-class SequencePadGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of SequencePadGradOp should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput(framework::GradVarName("Out")), true,
-        "Input(Out@GRAD) of SequencePadGradOp should not be null.");
-
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-      ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(
-        ctx.InputVar(framework::GradVarName("Out")));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class SequencePadGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sequence_pad_grad");
-    op->SetAttrMap(Attrs());
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    SequencePadGradOpNoNeedBufferVarsInference, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker,
-                  ops::SequencePadGradOpDescMaker);
-REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp,
-                  ops::SequencePadGradOpNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(
-    sequence_pad,
-    ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    sequence_pad_grad,
-    ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu
deleted file mode 100644
index 7fc64a530ef5442ae927faac96ad92a4126febcd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sequence_pad,
-    ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    sequence_pad_grad,
-    ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
deleted file mode 100644
index 840bd39a7f3eaca6cb03bca59016fc032e9a3068..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sequence_padding.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-template <typename DeviceContext, typename T>
-class SequencePadOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* x = ctx.Input<LoDTensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    auto* len_t = ctx.Output<LoDTensor>("Length");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    const auto* pad_value = ctx.Input<LoDTensor>("PadValue");
-
-    int padded_length = ctx.Attr<int>("padded_length");
-
-    math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), *x, out, *pad_value,
-        padded_length, 0, false, math::kBatchLengthWidth);
-
-    LoDTensor seq_len;
-    seq_len.Resize(len_t->dims());
-    int64_t* len_data = seq_len.mutable_data<int64_t>(platform::CPUPlace());
-    for (size_t i = 1; i < x->lod()[0].size(); ++i) {
-      len_data[i - 1] = x->lod()[0][i] - x->lod()[0][i - 1];
-    }
-    framework::TensorCopy(seq_len, ctx.GetPlace(),
-                          ctx.template device_context<DeviceContext>(), len_t);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequencePadGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    if (d_x) {
-      const auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-      d_x->mutable_data<T>(ctx.GetPlace());
-
-      int padded_length = ctx.Attr<int>("padded_length");
-
-      math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), *d_out, d_x,
-          padded_length, 0, false, math::kBatchLengthWidth);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
deleted file mode 100644
index 51e354dcd175845c3db2cce78dac6039361aed08..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class SequencePoolOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of SequencePoolOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    if (ctx->Attrs().Get<std::string>("pooltype") == "MAX") {
-      PADDLE_ENFORCE_EQ(
-          ctx->HasOutput("MaxIndex"), true,
-          "Output(MaxIndex) of SequencePoolOp should not be null.");
-      ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X"));
-    }
-  }
-};
-
-class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
-    AddOutput("Out",
-              "(Tensor) The output of SequencePoolOp does not contain LoD "
-              "infomation.");
-    AddOutput("MaxIndex",
-              "(Tensor<int>) This tensor is used for the sequence max-pooling "
-              "to record the max indexes.")
-        .AsIntermediate();
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    AddAttr<std::string>(
-        "pooltype",
-        "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
-        .SetDefault("AVERAGE")
-        .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"});
-    AddAttr<float>("pad_value",
-                   "(float, default 0.0) The value to pad for empty sequence.")
-        .SetDefault(0.0);
-    AddComment(R"DOC(
-Sequence Pool Operator.
-
-The SequencePoolOp pools features of all time-steps of each instance.
-It supports six pooling types:
-1. AVERAGE: $$Out[i] = \frac{\sum_i X_i}{N}$$
-2. SUM:     $$Out[i] = \sum_jX_{ij}$$
-3. SQRT:    $$Out[i] = \frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$
-4. LAST:    Out[i] = last instance in i-th sequence X[i]
-5. FIRST:   Out[i] = first instance in i-th sequence X[i]
-6. MAX:     $$Out[i] = max(X_i)$$
-
-and for the empty sequence Out[i] = attr(pad_value).
-
-The following example explains how this works:
-For a mini-batch of 3 variable-length sentences,
-containing 2, 3, and 2 time-steps:
-
-Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
-Besides, for the sake of simplicity, we assume M=1 and N=1,
-and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
-
-Thus, Out is a [3,1,1] Tensor without LoD infomation.
-And for different pooltype, the value of Out is as follows:
-
-- AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
-- SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
-- SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
-           6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
-- MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
-- LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
-- FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
-
-    )DOC");
-  }
-};
-
-class SequencePoolGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Gradient of Out should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "The input X should not be null.");
-    auto og_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(),
-                      "The rank of output grad must equal to Input(X).");
-    for (int64_t i = 1; i < og_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
-    }
-
-    ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op_desc_ptr = new framework::OpDesc();
-    op_desc_ptr->SetType("sequence_pool_grad");
-    op_desc_ptr->SetInput("X", Input("X"));
-    if (boost::get<std::string>(GetAttr("pooltype")) == "MAX") {
-      op_desc_ptr->SetInput("MaxIndex", Output("MaxIndex"));
-    }
-    op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op_desc_ptr->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    SequencePoolGradOpNoNeedBufferVarsInference, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker,
-                  ops::SequencePoolGradOpMaker);
-REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp,
-                  ops::SequencePoolGradOpNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(
-    sequence_pool,
-    ops::SequencePoolKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    sequence_pool_grad,
-    ops::SequencePoolGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu
deleted file mode 100644
index 4897474a485d8417854ffb53aa8ee64321c78ae7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sequence_pool,
-    ops::SequencePoolKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    sequence_pool_grad,
-    ops::SequencePoolGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
deleted file mode 100644
index 3eec4df121046e6c269cd950234c06b31b57d5a2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sequence_pooling.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-class SequencePoolKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    std::string pooltype = context.Attr<std::string>("pooltype");
-    T pad_value = static_cast<T>(context.Attr<float>("pad_value"));
-
-    auto dims = in->dims();
-    auto lod = in->lod();
-    auto lod_level = lod.size();
-    // InferShape by lod
-    PADDLE_ENFORCE_GE(lod_level, 1UL,
-                      "The lod level of input shall be 1 at least.");
-    PADDLE_ENFORCE_LE(lod_level, 2UL,
-                      "The lod level of input shall be no more than 2.");
-    PADDLE_ENFORCE_GE(
-        dims[0],
-        /*batch size = */ static_cast<int64_t>(lod[lod_level - 1].size() - 1),
-        "The first dimension of Input(X) must be large than batch size.");
-    if (lod_level > 1UL) {
-      PADDLE_ENFORCE_EQ(lod[0][lod[0].size() - 1], lod[1].size() - 1,
-                        "The input lod information is illegal.");
-      framework::LoD out_lod;
-      out_lod.push_back(lod[0]);
-      out->set_lod(out_lod);
-    }
-    dims[0] = lod[lod_level - 1].size() - 1;
-    out->Resize({dims});
-    out->mutable_data<T>(context.GetPlace());
-    Tensor* index = nullptr;
-
-    const bool is_test = context.Attr<bool>("is_test");
-
-    // Do not create index buffer for inference (is_test) mode
-    // TODO(jczaja): Skip index buffer creation for other devices eg. GPU
-    if (pooltype == "MAX" &&
-        (is_test == false ||
-         platform::is_cpu_place(context.GetPlace()) == false)) {
-      index = context.Output<Tensor>("MaxIndex");
-      index->Resize({dims});
-      index->mutable_data<int>(context.GetPlace());
-    }
-    math::SequencePoolFunctor<DeviceContext, T> pool;
-    pool(context.template device_context<DeviceContext>(), pooltype, pad_value,
-         *in, out, is_test, index);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequencePoolGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
-    std::string pooltype = context.Attr<std::string>("pooltype");
-    const Tensor* index = nullptr;
-    if (pooltype == "MAX") {
-      index = context.Input<Tensor>("MaxIndex");
-    }
-    in_g->mutable_data<T>(context.GetPlace());
-    math::SequencePoolGradFunctor<DeviceContext, T> pool;
-    pool(context.template device_context<DeviceContext>(), pooltype, *out_g,
-         in_g, index);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc
deleted file mode 100644
index 5421f35662b3b0a6a61748ac0b6b5f718d213b73..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h"
-#include "paddle/fluid/framework/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-class SequenceReshapeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceReshapeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceReshapeOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_numel = product(x_dims);
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2.");
-    int new_dim = ctx->Attrs().Get<int>("new_dim");
-    if (ctx->IsRuntime()) {
-      ctx->SetOutputDim("Out",
-                        {x_numel / new_dim, static_cast<int64_t>(new_dim)});
-    } else {
-      // when compiling, the batch size is undetermined, just set to -1
-      ctx->SetOutputDim("Out", {-1, static_cast<int64_t>(new_dim)});
-    }
-  }
-};
-
-class SequenceReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with shape "
-             "being [N, M].");
-    AddOutput("Out",
-              "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with "
-              "shape [T, new_dim] where T is calculated based on X.lod, M and "
-              "new_dim.");
-    AddAttr<int>("new_dim", "Sequence dimension of the output LoDTensor.");
-    AddComment(R"DOC(
-Sequence Reshape Operator.
-
-This operator will rearrange the input sequences. The new dimension is set by
-attribute and length of each sequence may change longer or shorter which is
-decided by original length, original dimension and new dimension. The following
-example will help to illustrate the function of this operator:
-
-x is a LoDTensor:
-    x.lod  = [[0, 2, 6]]
-    x.data = [[1, 2], [3, 4],
-              [5, 6], [7, 8], [9, 10], [11, 12]]
-    x.dims = [6, 2]
-
-set new_dim = 4
-
-then out is a LoDTensor:
-    out.lod  = [[0, 1, 3]]
-    out.data = [[1, 2, 3, 4],
-                [5, 6, 7, 8], [9, 10, 11, 12]]
-    out.dims = [3, 4]
-
-Currently, only 1-level LoDTensor is supported and please make sure (original
-length * original dimension) can be divided by new_dim with no remainder for
-each sequence.
-
-)DOC");
-  }
-};
-
-class SequenceReshapeGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput(framework::GradVarName("Out")),
-        "Input(Out@GRAD) of SequenceReshapeGradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceReshapeGradOp should  not be null.");
-
-    ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-};
-
-class SequenceReshapeGradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op_desc_ptr = new framework::OpDesc();
-    op_desc_ptr->SetType("sequence_reshape_grad");
-    op_desc_ptr->SetInput("X", Input("X"));
-    op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op_desc_ptr->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_reshape, ops::SequenceReshapeOp,
-                  ops::SequenceReshapeOpMaker, ops::SequenceReshapeGradOpMaker);
-REGISTER_OPERATOR(sequence_reshape_grad, ops::SequenceReshapeGradOp);
-REGISTER_OP_CPU_KERNEL(
-    sequence_reshape,
-    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    sequence_reshape_grad,
-    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu
deleted file mode 100644
index 38bc599165d5f84f67e2fe08bf96ebef4b03d8a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sequence_reshape,
-    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    sequence_reshape_grad,
-    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>,
-    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h
deleted file mode 100644
index 2893808ee9ceca12924efb4884d7fd6783e94e35..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h
+++ /dev/null
@@ -1,86 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-template <typename DeviceContext, typename T>
-class SequenceReshapeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int out_width = context.Attr<int>("new_dim");
-
-    auto in_dims = in->dims();
-    int64_t in_width = in_dims[1];
-    auto& in_lod = in->lod();
-
-    PADDLE_ENFORCE_EQ(in_lod.size(), 1UL,
-                      "Only support one level sequence now.");
-    PADDLE_ENFORCE_EQ(
-        (uint64_t)in_dims[0], in_lod[0].back(),
-        "Inconsistent size between X.shape[0] and X.lod()[0].back().");
-
-    auto in_lod_l0 = in_lod[0];
-    int seq_num = in_lod_l0.size() - 1;
-
-    if (in_width == out_width) {
-      out->set_lod(in->lod());
-    } else {
-      auto& out_lod = *out->mutable_lod();
-      out_lod.resize(1);
-      out_lod[0].resize(seq_num + 1);
-      out_lod[0][0] = 0;
-      for (int i = 0; i < seq_num; ++i) {
-        size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i];
-        size_t offset = 0;
-        offset = (seq_len * in_width) / out_width;
-        PADDLE_ENFORCE_EQ(offset * out_width, seq_len * in_width,
-                          "Please make sure (sequence_length * dimension) can "
-                          "be divided by new_dim with no remainder for each "
-                          "sequence. The %dth sequence is invalid.",
-                          i + 1);
-        out_lod[0][i + 1] = out_lod[0][i] + offset;
-      }
-    }
-
-    framework::TensorCopy(*in, context.GetPlace(), out);
-    out->Resize({static_cast<int64_t>(out->lod()[0].back()), out_width});
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceReshapeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x_tensor_ptr = context.Input<LoDTensor>("X");
-    auto* outg_tensor_ptr =
-        context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* xg_tensor_ptr =
-        context.Output<LoDTensor>(framework::GradVarName("X"));
-
-    xg_tensor_ptr->mutable_data<T>(context.GetPlace());
-    framework::TensorCopy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr);
-    xg_tensor_ptr->Resize(x_tensor_ptr->dims());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc
deleted file mode 100644
index dfbbf5f156983189ac1ab82fbff51d7eb4844f9a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/sequence_ops/sequence_reverse_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(sequence_reverse, ops::SequenceReverseOp,
-                  ops::SequenceReverseOpMaker,
-                  ops::SequenceReverseGradOpDescMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    sequence_reverse,
-    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu
deleted file mode 100644
index 0a59ed7f9fee07bc3b12909973535f31ef049a4a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/sequence_ops/sequence_reverse_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    sequence_reverse,
-    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h
deleted file mode 100644
index 14e4fc9b0dd0561a2c3630165f73234f35fa024d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h
+++ /dev/null
@@ -1,171 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/algorithm.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-class SequenceReverseOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
-
-    auto x_dim = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dim.size(), 2,
-                      "Rank of Input(X) must be not less than 2.");
-
-    ctx->SetOutputDim("Y", x_dim);
-    ctx->ShareLoD("X", "Y");
-  }
-};
-
-class SequenceReverseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input LoDTensor of sequence_reverse op.");
-    AddOutput("Y", "The output LoDTensor of sequence_reverse op.");
-    AddComment(R"DOC(
-SequenceReverse Operator.
-
-Reverse each sequence in input X along dim 0.
-
-Assuming X is a LoDTensor with dims [5, 4] and lod [[0, 2, 5]], where:
-
-X.data() = [
-  [1, 2, 3, 4],
-  [5, 6, 7, 8], # the 0-th sequence with length 2
-  [9, 10, 11, 12],
-  [13, 14, 15, 16],
-  [17, 18, 19, 20] # the 1-st sequence with length 3
-]
-
-The output Y would be a LoDTensor sharing the same dims and lod with input X,
-and:
-
-Y.data() = [
-  [5, 6, 7, 8],
-  [1, 2, 3, 4], # the reversed 0-th sequence with length 2
-  [17, 18, 19, 20],
-  [13, 14, 15, 16],
-  [9, 10, 11, 12] # the reversed 1-st sequence with length 3
-]
-
-This Operator is useful to build a reverse dynamic RNN network.
-
-This Operator only supports one-level lod currently.
-    )DOC");
-  }
-};
-
-template <typename T>
-struct SequenceReverseFunctor {
-  SequenceReverseFunctor(const T *x, T *y, const size_t *lod, size_t lod_count,
-                         size_t row_numel)
-      : x_(x), y_(y), lod_(lod), lod_count_(lod_count), row_numel_(row_numel) {}
-
-  HOSTDEVICE void operator()(size_t idx_x) const {
-    auto row_idx_x = idx_x / row_numel_;
-    auto lod_idx = math::UpperBound(lod_, lod_count_, row_idx_x);
-    auto row_idx_y = lod_[lod_idx - 1] + (lod_[lod_idx] - 1 - row_idx_x);
-    auto idx_y = row_idx_y * row_numel_ + idx_x % row_numel_;
-    y_[idx_y] = x_[idx_x];
-  }
-
-  const T *x_;
-  T *y_;
-  const size_t *lod_;
-  size_t lod_count_;
-  size_t row_numel_;
-};
-
-template <typename DeviceContext, typename T>
-class SequenceReverseOpKernel : public framework::OpKernel<T> {
-  using LoDTensor = framework::LoDTensor;
-
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto &x = *ctx.Input<LoDTensor>("X");
-    auto *y = ctx.Output<LoDTensor>("Y");
-
-    PADDLE_ENFORCE_EQ(x.lod().size(), 1,
-                      "SequenceReverse Op only support one level lod.");
-
-    const size_t *lod;
-    size_t lod_count = x.lod()[0].size();
-
-#ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      lod = x.lod()[0].CUDAData(ctx.GetPlace());
-    } else {
-#endif
-      lod = x.lod()[0].data();
-#ifdef PADDLE_WITH_CUDA
-    }
-#endif
-
-    size_t limit = static_cast<size_t>(x.numel());
-    size_t row_numel = static_cast<size_t>(limit / x.dims()[0]);
-    auto *x_data = x.data<T>();
-    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
-
-    PADDLE_ENFORCE_NE(x_data, y_data,
-                      "SequenceReverse Op does not support in-place operation");
-
-    if (platform::is_cpu_place(ctx.GetPlace())) {
-      for (size_t idx = 0; idx < lod_count - 1; idx++) {
-        auto start_pos = lod[idx];
-        auto end_pos = lod[idx + 1];
-        for (auto pos = start_pos; pos < end_pos; pos++) {
-          auto cur_pos = end_pos - pos - 1 + start_pos;
-          std::memcpy(y_data + pos * row_numel, x_data + cur_pos * row_numel,
-                      row_numel * sizeof(T));
-        }
-      }
-    } else {
-      auto &dev_ctx = ctx.template device_context<DeviceContext>();
-
-      SequenceReverseFunctor<T> functor(x_data, y_data, lod, lod_count,
-                                        row_numel);
-      platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
-      for_range(functor);
-    }
-  }
-};
-
-class SequenceReverseGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sequence_reverse");
-    op->SetInput("X", OutputGrad("Y"));
-    op->SetOutput("Y", InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
deleted file mode 100644
index 5a22212edf29cc79d28b12029dc7595ae5f1aab3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h"
-#include <memory>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-class SequenceScatterOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) The source input of sequence scatter op");
-    AddInput("Ids",
-             "(LoDTensor) The index input of sequence scatter op where X"
-             " will be  updated, must be a LoDTensor");
-    AddInput("Updates",
-             "(LoDTensor) The values to scatter to the input tensor "
-             "X, must be a LoDTensor with the same LoD information as Ids");
-    AddOutput("Out",
-              "(Tensor) The output tensor of sequence scatter op, which "
-              "has the same dims as X");
-    AddComment(R"DOC(
-Sequence Scatter Operator.
-
-This operator scatters the Updates tensor to the input X. It uses the LoD
-information of Ids to select the rows to update, and use the values in Ids as
-the columns to update in each row of X.
-
-Following are cases to better explain how this works:
-
-Example 1:
-Given an all-ones Tensor input(X)
-    X.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
-              [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
-              [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
-    X.dims = [3, 6]
-a LoDTensor input(Ids)
-    Ids.data = [[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], [5], [4]]
-    Ids.lod =  [[0,        3,                       8,                 12]]
-and a Tensor input(Updates)
-    Updates.data = [[0.3], [0.3], [0.4], [0.1], [0.2], [0.3], [0.4], [0.0], [0.2], [0.3], [0.1], [0.4]]
-    Updates.lod =  [[  0,            3,                                 8,                         12]]
-then we get an output Tensor
-    Out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0],
-                [1.0, 1.0, 1.4, 1.3, 1.2, 1.1],
-                [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]]
-    Out.dims = X.dims = [3, 6]
-)DOC");
-  }
-};
-
-class SequenceScatterOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    // Enforce has inputs and outputs
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Ids"),
-                   "Input(Ids) of SequenceScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Updates"),
-                   "Input(Updates) of SequenceScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceScatterOp should not be null.");
-
-    // Set output dim the same as input
-    auto ref_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", ref_dims);
-
-    // Enforce the Updates and Ids are the same shape
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
-                      ctx->GetInputDim("Ids")[0],
-                      "Updates and Ids should have same shape.");
-
-    // Enforce LoD of ids and updates be the same
-    if (ctx->IsRuntime()) {
-      framework::Variable* ids_var =
-          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Ids")[0]);
-      framework::Variable* updates_var =
-          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Updates")[0]);
-
-      auto& ids_lod = ids_var->Get<LoDTensor>().lod();
-      auto& updates_lod = updates_var->Get<LoDTensor>().lod();
-      PADDLE_ENFORCE_EQ(ids_lod.size(), 1,
-                        "Currently only level 1 LoD could be"
-                        " processed by sequence scatter op.");
-      PADDLE_ENFORCE_EQ(updates_lod.size(), 1,
-                        "Currently only level 1 LoD "
-                        "could be processed by sequence scatter op.");
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-class SequenceScatterGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("Updates"),
-                      ctx->GetInputDim("Updates"));
-    ctx->SetOutputDim(framework::GradVarName("X"),
-                      ctx->GetInputDim(framework::GradVarName("Out")));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        platform::CPUPlace());
-  }
-};
-
-class SequenceScatterGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sequence_scatter_grad");
-    op->SetInput("Ids", Input("Ids"));
-    op->SetInput("Updates", Input("Updates"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    SequenceScatterGradNoNeedBufferVarsInference, "Updates");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_scatter, ops::SequenceScatterOp,
-                  ops::SequenceScatterOpMaker,
-                  ops::SequenceScatterGradDescMaker);
-REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp,
-                  ops::SequenceScatterGradNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(sequence_scatter, ops::SequenceScatterOpKernel<float>,
-                       ops::SequenceScatterOpKernel<double>,
-                       ops::SequenceScatterOpKernel<int>,
-                       ops::SequenceScatterOpKernel<int64_t>);
-REGISTER_OP_CPU_KERNEL(sequence_scatter_grad,
-                       ops::SequenceScatterGradientOpKernel<float>,
-                       ops::SequenceScatterGradientOpKernel<double>,
-                       ops::SequenceScatterGradientOpKernel<int>,
-                       ops::SequenceScatterGradientOpKernel<int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h
deleted file mode 100644
index d9b681b7aa76849a40d50e3348418d7604641c10..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-class SequenceScatterOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* ids = ctx.Input<LoDTensor>("Ids");
-    auto* updates = ctx.Input<LoDTensor>("Updates");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    auto& ids_lod = ids->lod();
-
-    // Initialize out as same as x
-    out->mutable_data<T>(ctx.GetPlace());
-    framework::TensorCopySync(*x, ctx.GetPlace(), out);
-
-    auto x_dims = x->dims();
-    auto out_dims = out->dims();
-
-    for (int i = 0; i < x_dims.size(); ++i)
-      PADDLE_ENFORCE(x_dims[i] == out_dims[i],
-                     "Input and output shape of "
-                     "sequence scatter op must exactly be the same.");
-
-    size_t slice_size = 1;
-    for (int i = 1; i < x_dims.size(); ++i) slice_size *= x_dims[i];
-
-    auto lod_vec = ids_lod[0];
-    unsigned int seg = 0;
-    for (int i = 0; i < ids->dims()[0]; ++i) {
-      PADDLE_ENFORCE_LT(seg, lod_vec.size() - 1,
-                        "Segment num must not exceed batch size.\n");
-      int lower_bound = lod_vec[seg];
-      int upper_bound = lod_vec[seg + 1];
-      if (i >= lower_bound && i < upper_bound) {
-        T* p_out = out->data<T>();
-        const T* p_updates = updates->data<T>();
-        const int64_t* p_index = ids->data<int64_t>();
-        p_out[seg * slice_size + p_index[i]] += p_updates[i];
-      } else {
-        ++seg;
-        --i;
-      }
-    }
-  }
-};
-
-template <typename T>
-class SequenceScatterGradientOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "This kernel only runs on CPU.");
-    auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dUpdates = ctx.Output<LoDTensor>(framework::GradVarName("Updates"));
-    auto* ids = ctx.Input<LoDTensor>("Ids");
-    auto* dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    auto& ids_lod = ids->lod();
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
-    dUpdates->mutable_data<T>(ctx.GetPlace());
-
-    auto dx_dims = dX->dims();
-    auto dout_dims = dOut->dims();
-
-    for (int i = 0; i < dx_dims.size(); ++i)
-      PADDLE_ENFORCE(dx_dims[i] == dout_dims[i],
-                     "Input and output shape of "
-                     "sequence scatter grad op must exactly be the same.");
-
-    size_t slice_size = 1;
-    for (int i = 1; i < dx_dims.size(); ++i) slice_size *= dx_dims[i];
-
-    auto lod_vec = ids_lod[0];
-    unsigned int seg = 0;
-
-    for (int i = 0; i < ids->dims()[0]; ++i) {
-      PADDLE_ENFORCE_LT(seg, lod_vec.size() - 1,
-                        "Segment num must not exceed batch size.\n");
-      int lower_bound = lod_vec[seg];
-      int upper_bound = lod_vec[seg + 1];
-      if (i >= lower_bound && i < upper_bound) {
-        const T* p_dOut = dOut->data<T>();
-        const int64_t* p_index = ids->data<int64_t>();
-        T* p_dUpdates = dUpdates->data<T>();
-        p_dUpdates[i] = p_dOut[seg * slice_size + p_index[i]];
-      } else {
-        ++seg;
-        --i;
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
deleted file mode 100644
index 4b2ec6e7cad7c04e248c0ffbb117951fba1ec877..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class SequenceSliceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceSliceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Offset"),
-                   "Input(Offset) of SequenceSliceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Length"),
-                   "Input(Length) of SequenceSliceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceSliceOp should not be null.");
-    auto input_dims = ctx->GetInputDim("X");
-
-    auto offset_dim = ctx->GetInputDim("Offset");
-    auto length_dim = ctx->GetInputDim("Length");
-
-    PADDLE_ENFORCE_EQ(
-        offset_dim.size(), 2UL,
-        "Only support one level sequence now, The rank of offset must be 2.");
-    PADDLE_ENFORCE_EQ(
-        length_dim.size(), 2UL,
-        "Only support one level sequence now, The rank of Length must be 2.");
-
-    // Initialize the output's dims to maximum,
-    // and re-set to real dims by the value of Offset and Length at kernel
-    ctx->SetOutputDim("Out", input_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class SequenceSliceGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "The gradient of Out should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
-                   "The gradient of X should not be null.");
-    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor), "
-             "the input of SequenceSliceOp.");
-    AddInput("Offset",
-             "(Tensor), "
-             "a vector<int> to describe the offset of every input sequence for "
-             "sub sequence item.");
-    AddInput("Length",
-             "(Tensor), "
-             "a vector<int> to describe the length of every input sequence for "
-             "sub sequence item.");
-    AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp.");
-    AddComment(R"DOC(
-Sequence slice operator
-
-The operator crops a subsequence from given sequence with given start offset and subsequence length.
-It only supports sequence (LoD Tensor with level number is 1).
-- Case:
-    X = [[a1, a2;
-        b1, b2;
-        c1, c2]
-       [d1, d2;
-        e1, e2]]
-    LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 2)
-    Offset = [[0], [1]]; Length = [[2], [1]]
-
-    Out = [[a1, a2;
-            b1, b2]
-            [e1, e2]]
-    LoD(Out) = {{0, 2, 3}}; Dims(Out) = (3, 2)
-NOTE: The first dimension size of input, the size of offset and Length, should be equal. The offset start from 0.
-    )DOC");
-  }
-};
-
-class SequenceSliceGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sequence_slice_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Offset", Input("Offset"));
-    op->SetInput("Length", Input("Length"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    SequenceSliceGradNoNeedBufferVarsInference, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp,
-                  ops::SequenceSliceOpMaker, ops::SequenceSliceGradOpDescMaker);
-REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp,
-                  ops::SequenceSliceGradNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(
-    sequence_slice,
-    ops::SequenceSliceOpKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    sequence_slice_grad,
-    ops::SequenceSliceGradOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu
deleted file mode 100644
index 1e4a1b8323dbaacdf3f74c33e7aa4484d9be2478..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sequence_slice,
-    ops::SequenceSliceOpKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    sequence_slice_grad,
-    ops::SequenceSliceGradOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
deleted file mode 100644
index a07fc54090d755114b878623104a8ac14f8cce8d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-template <typename T>
-inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data,
-                            const int64_t* length_data) {
-  auto out_lod = in.lod();
-  size_t lod_offset = 0;
-
-  auto n = in.lod()[0].size() - 1;
-  out_lod[0][0] = 0;
-  for (size_t i = 0; i < n; ++i) {
-    lod_offset += length_data[i];
-    out_lod[0][i + 1] = lod_offset;
-  }
-  return out_lod;
-}
-
-template <typename DeviceContext, typename T>
-class SequenceSliceOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<LoDTensor>("X");
-    auto* offset = ctx.Input<Tensor>("Offset");
-    auto* length = ctx.Input<Tensor>("Length");
-    auto* out = ctx.Output<LoDTensor>("Out");
-
-    auto lod = in->lod();
-    auto n = lod[0].size() - 1;
-
-    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
-    PADDLE_ENFORCE_EQ(
-        n, static_cast<size_t>(length->dims()[0]),
-        "The size of input-sequence and length-array should be the same");
-    PADDLE_ENFORCE_EQ(
-        n, static_cast<size_t>(offset->dims()[0]),
-        "The size of input-sequence and offset-array should be the same");
-
-    const int64_t* offset_data = offset->data<int64_t>();
-    const int64_t* length_data = length->data<int64_t>();
-    framework::Tensor offset_cpu;
-    framework::Tensor length_cpu;
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-      framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu);
-      offset_data = offset_cpu.data<int64_t>();
-
-      length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-      framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu);
-      length_data = length_cpu.data<int64_t>();
-    }
-
-    for (size_t i = 0; i < n; ++i) {
-      PADDLE_ENFORCE_LE(0, offset_data[i],
-                        "The offset[%d] must be nonnegative.", i);
-      PADDLE_ENFORCE_LE(0, length_data[i],
-                        "The length[%d] must be nonnegative.", i);
-      PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
-                        lod[0][i + 1], "The target tensor's length overflow.");
-    }
-
-    out->mutable_data<T>(ctx.GetPlace());
-    auto out_lod = SequenceSliceLoD(*in, offset_data, length_data);
-    auto out_dims = in->dims();
-    out_dims[0] = out_lod[0][out_lod[0].size() - 1];
-    out->Resize(out_dims);
-    out->set_lod(out_lod);
-
-    auto in_stride = framework::stride(in->dims());
-    auto out_stride = framework::stride(out->dims());
-
-    size_t out_offset = 0;
-    for (size_t i = 0; i < n; ++i) {
-      if (length_data[i] == 0) continue;
-      Tensor in_t = in->Slice(
-          static_cast<int>(lod[0][i] + offset_data[i]),
-          static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
-
-      StridedMemcpy<T>(ctx.device_context(), in_t.data<T>(), in_stride,
-                       in_t.dims(), out_stride, out->data<T>() + out_offset);
-      out_offset += length_data[i] * in_stride[0];
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<LoDTensor>("X");
-    auto* offset = ctx.Input<Tensor>("Offset");
-    auto* length = ctx.Input<Tensor>("Length");
-    auto* out_grad =
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-
-    const int64_t* offset_data = offset->data<int64_t>();
-    const int64_t* length_data = length->data<int64_t>();
-    framework::Tensor offset_cpu;
-    framework::Tensor length_cpu;
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-      framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu);
-      offset_data = offset_cpu.data<int64_t>();
-
-      length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-      framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu);
-      length_data = length_cpu.data<int64_t>();
-    }
-
-    auto lod = in->lod();
-    // to avoid out_grad missing lod, compute lod again
-    auto out_lod = SequenceSliceLoD(*in, offset_data, length_data);
-
-    if (x_grad) {
-      x_grad->mutable_data<T>(ctx.GetPlace());
-      x_grad->set_lod(in->lod());
-      math::SetConstant<DeviceContext, T> set_zero;
-      set_zero(ctx.template device_context<DeviceContext>(), x_grad,
-               static_cast<T>(0));
-
-      for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
-        if (length_data[i] == 0) continue;
-        Tensor out_grad_t =
-            out_grad->Slice(static_cast<int>(out_lod[0][i]),
-                            static_cast<int>(out_lod[0][i + 1]));
-        auto out_grad_stride = framework::stride(out_grad_t.dims());
-
-        auto x_grad_stride = framework::stride(x_grad->dims());
-
-        Tensor x_grad_t = x_grad->Slice(
-            static_cast<int>(lod[0][i] + offset_data[i]),
-            static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
-
-        StridedMemcpy<T>(ctx.device_context(), out_grad_t.data<T>(),
-                         out_grad_stride, out_grad_t.dims(), x_grad_stride,
-                         x_grad_t.data<T>());
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc
deleted file mode 100644
index 585363958696fa0d8ed1ffdc7b6fdaab26349b08..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/softmax.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
-
-    auto& lod = x->lod();
-    auto& dims = x->dims();
-
-    const size_t level = lod.size() - 1;
-    PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
-                      "The first dimension of Input(X) should be equal to the "
-                      "sum of all sequences' lengths.");
-    PADDLE_ENFORCE_EQ(dims[0], x->numel(),
-                      "The width of each timestep in Input(X) of "
-                      "SequenceSoftmaxOp should be 1.");
-
-    out->mutable_data<T>(ctx.GetPlace());
-    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
-      int start_pos = static_cast<int>(lod[level][i]);
-      int end_pos = static_cast<int>(lod[level][i + 1]);
-      Tensor x_i = x->Slice(start_pos, end_pos);
-      Tensor out_i = out->Slice(start_pos, end_pos);
-
-      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
-      framework::DDim dims_i =
-          // framework::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL});
-          framework::make_ddim({1UL, end_pos - start_pos});
-      x_i.Resize(dims_i);
-      out_i.Resize(dims_i);
-      math::SoftmaxCUDNNFunctor<T>()(
-          ctx.template device_context<platform::CUDADeviceContext>(), &x_i,
-          &out_i);
-    }
-  }
-};
-
-template <typename T>
-class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* out = ctx.Input<LoDTensor>("Out");
-    auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    if (x_grad) {
-      x_grad->set_lod(x->lod());
-    }
-    auto& lod = x->lod();
-    const size_t level = lod.size() - 1;
-
-    x_grad->mutable_data<T>(ctx.GetPlace());
-    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
-      int start_pos = static_cast<int>(lod[level][i]);
-      int end_pos = static_cast<int>(lod[level][i + 1]);
-
-      Tensor out_i = out->Slice(start_pos, end_pos);
-      Tensor out_grad_i = out_grad->Slice(start_pos, end_pos);
-      Tensor x_grad_i = x_grad->Slice(start_pos, end_pos);
-
-      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
-      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
-      out_i.Resize(dims_i);
-      out_grad_i.Resize(dims_i);
-      x_grad_i.Resize(dims_i);
-      math::SoftmaxGradCUDNNFunctor<T>()(
-          ctx.template device_context<platform::CUDADeviceContext>(), &out_i,
-          &out_grad_i, &x_grad_i);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::SequenceSoftmaxCUDNNKernel<float>,
-                   ops::SequenceSoftmaxCUDNNKernel<double>);
-REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::SequenceSoftmaxGradCUDNNKernel<float>,
-                   ops::SequenceSoftmaxGradCUDNNKernel<double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
deleted file mode 100644
index 027073e5d7d6c767ebb02662c6fd8b2cf9306904..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class SequenceSoftmaxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceSoftmaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceSoftmaxOp should not be null.");
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-    bool runtime_cudnn_support = false;
-#ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
-    }
-#endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
-    if (use_cudnn && runtime_cudnn_support) {
-      library_ = framework::LibraryType::kCUDNN;
-    }
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(
-        ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
-        framework::StringToDataLayout(data_format), library_);
-  }
-};
-
-class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
-             "of length 1.");
-    AddOutput("Out",
-              "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
-              "of length 1.");
-    AddAttr<bool>(
-        "use_cudnn",
-        "(bool, default false) Only used in cudnn kernel, need install cudnn")
-        .SetDefault(false);
-    AddAttr<std::string>(
-        "data_format",
-        "(string, default NCHW) Only used in "
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\". Specify the data format of the output data, "
-        "the input will be transformed automatically. ")
-        .SetDefault("AnyLayout");
-    AddComment(R"DOC(
-Sequence Softmax Operator.
-
-SequenceSoftmaxOp computes the softmax activation among all time-steps for each
-sequence. The dimension of each time-step should be 1. Thus, the shape of
-input Tensor can be either [N, 1] or [N], where N is the sum of the length
-of all sequences.
-
-The algorithm works as follows:
-
-    for i-th sequence in a mini-batch:
-
-$$
-Out(X[lod[i]:lod[i+1]], :) = \
-\frac{\exp(X[lod[i]:lod[i+1], :])} \
-{\sum(\exp(X[lod[i]:lod[i+1], :]))}
-$$
-
-For example, for a mini-batch of 3 sequences with variable-length,
-each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
-then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :]
-and N turns out to be 7.
-
-)DOC");
-  }
-};
-
-class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"),
-                   "Input(Out) of SequenceSoftmaxGradOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput(framework::GradVarName("Out")),
-        "Input(Out@GRAD) of SequenceSoftmaxGradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceSoftmaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@GRAD) of SequenceSoftmaxOp should not be null.");
-
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputDim("Out"),
-        ctx->GetInputDim(framework::GradVarName("Out")),
-        "Input(Out) and Input(Out@GRAD) of SequenceSoftmaxGradOp should be of "
-        "the same shape.");
-
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-    bool runtime_cudnn_support = false;
-#ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
-    }
-#endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
-    if (use_cudnn && runtime_cudnn_support) {
-      library_ = framework::LibraryType::kCUDNN;
-    }
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(
-        ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
-        framework::StringToDataLayout(data_format), library_);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_softmax, ops::SequenceSoftmaxOp,
-                  ops::SequenceSoftmaxOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(sequence_softmax_grad, ops::SequenceSoftmaxGradOp);
-REGISTER_OP_CPU_KERNEL(
-    sequence_softmax,
-    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    sequence_softmax_grad,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
deleted file mode 100644
index a9dc0a4fda253db9bb0d33c4a25fbba36492f35b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <cub/cub.cuh>  // NOLINT
-#include "paddle/fluid/operators/math.h"
-#include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-
-template <typename T, int BlockDim>
-using BlockReduce = cub::BlockReduce<T, BlockDim>;
-
-template <typename T, int BlockDim>
-using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
-
-template <typename T, int BlockDim>
-__global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod,
-                                        const size_t src_hight, T *out_data) {
-  __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
-  __shared__ T shared_max_data;
-  __shared__ T shared_sum_data;
-
-  for (int i = blockIdx.x; i < src_hight; i += gridDim.x) {
-    size_t start = ref_lod[i];
-    size_t span = ref_lod[i + 1] - start;
-
-    // Find the max ele
-    T max_ele = -FLT_MAX;
-    for (int tid = threadIdx.x; tid < span; tid += blockDim.x) {
-      T ele = in_data[start + tid];
-      max_ele = max_ele > ele ? max_ele : ele;
-    }
-    max_ele =
-        BlockReduce<T, BlockDim>(temp_storage).Reduce(max_ele, cub::Max());
-    if (threadIdx.x == 0) {
-      shared_max_data = max_ele;
-    }
-    __syncthreads();
-
-    // sum
-    T sum_data = 0;
-    for (int tid = threadIdx.x; tid < span; tid += blockDim.x) {
-      T ele = in_data[start + tid];
-      sum_data += real_exp(ele - shared_max_data);
-    }
-    sum_data =
-        BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, cub::Sum());
-    if (threadIdx.x == 0) {
-      shared_sum_data = sum_data;
-    }
-    __syncthreads();
-
-    // get final resit
-    for (int tid = threadIdx.x; tid < span; tid += blockDim.x) {
-      T ele = in_data[start + tid];
-      ele = real_exp(ele - shared_max_data) / shared_sum_data;
-      out_data[start + tid] = ele;
-    }
-  }
-}
-
-template <typename T, int BlockDim>
-__global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data,
-                                             const T *softmax_data,
-                                             const size_t *ref_lod,
-                                             const size_t src_hight,
-                                             T *dx_data) {
-  __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
-  __shared__ T shared_data;
-
-  for (int i = blockIdx.x; i < src_hight; i += gridDim.x) {
-    size_t start = ref_lod[i];
-    size_t span = ref_lod[i + 1] - start;
-
-    T result = 0;
-    for (int tid = threadIdx.x; tid < span; tid += blockDim.x) {
-      size_t idx = start + tid;
-      T s_g_d = softmax_grad_data[idx];
-      T s_d = softmax_data[idx];
-      result += s_g_d * s_d;
-    }
-    result = BlockReduce<T, BlockDim>(temp_storage).Reduce(result, cub::Sum());
-    if (threadIdx.x == 0) {
-      shared_data = result;
-    }
-    __syncthreads();
-
-    for (int tid = threadIdx.x; tid < span; tid += blockDim.x) {
-      size_t idx = start + tid;
-      T s_g_d = softmax_grad_data[idx];
-      T s_d = softmax_data[idx];
-      dx_data[idx] = (s_g_d - shared_data) * s_d;
-    }
-  }
-}
-
-template <typename T>
-struct SequenceSoftmaxFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext &context,
-                  const LoDTensor &x,
-                  const framework::Vector<size_t> &ref_lod, /*referenced lod*/
-                  LoDTensor *out) {
-    int hight = ref_lod.size() - 1;
-
-    const int kThreadsPerBlock = 32;
-    int thread_x = kThreadsPerBlock;
-    int max_threads = context.GetMaxPhysicalThreadCount();
-    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-
-    dim3 block_size(thread_x);
-    dim3 grid_size(max_blocks);
-    sequence_softmax_kernel<
-        T, kThreadsPerBlock><<<grid_size, block_size, 0, context.stream()>>>(
-        x.data<T>(), ref_lod.CUDAData(context.GetPlace()), hight,
-        out->mutable_data<T>(context.GetPlace()));
-  }
-};
-
-template <typename T>
-struct SequenceSoftmaxGradFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext &context,
-                  const LoDTensor &dout, const LoDTensor &out,
-                  const framework::Vector<size_t> &ref_lod, /*referenced lod*/
-                  LoDTensor *dx) {
-    size_t hight = ref_lod.size() - 1;
-
-    const int kThreadsPerBlock = 32;
-    int thread_x = kThreadsPerBlock;
-    int max_threads = context.GetMaxPhysicalThreadCount();
-    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-
-    dim3 block_size(thread_x);
-    dim3 grid_size(max_blocks);
-
-    sequence_softmax_grad_kernel<
-        T, kThreadsPerBlock><<<grid_size, block_size, 0, context.stream()>>>(
-        dout.data<T>(), out.data<T>(), ref_lod.CUDAData(context.GetPlace()),
-        hight, dx->mutable_data<T>(context.GetPlace()));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sequence_softmax,
-    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    sequence_softmax_grad,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext,
-                                   double>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h
deleted file mode 100644
index 0555e4ee003e64e834320c23246dbfb900f445ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-struct SequenceSoftmaxFunctor {
-  void operator()(
-      const DeviceContext &ctx, const LoDTensor &x,
-      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-      LoDTensor *out);
-};
-
-template <typename DeviceContext, typename T>
-struct SequenceSoftmaxGradFunctor {
-  void operator()(const DeviceContext &ctx, const LoDTensor &dout,
-                  const LoDTensor &out,
-                  const framework::Vector<size_t> &ref_lod, /*referenced lod*/
-                  LoDTensor *dx);
-};
-
-template <typename T>
-struct SequenceSoftmaxFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext &ctx, const LoDTensor &x,
-                  const framework::Vector<size_t> &ref_lod, /*referenced lod*/
-                  LoDTensor *out) {
-    size_t hight = ref_lod.size() - 1;
-    const T *in_data = x.data<T>();
-    T *out_data = out->mutable_data<T>(ctx.GetPlace());
-    for (size_t i = 0; i < hight; ++i) {
-      size_t span = ref_lod[i + 1] - ref_lod[i];
-      T result = 0;
-      for (size_t j = 0; j < span; ++j) {
-        result += exp(in_data[ref_lod[i] + j]);
-      }
-      for (size_t j = 0; j < span; ++j) {
-        out_data[ref_lod[i] + j] = exp(in_data[ref_lod[i] + j]) / result;
-      }
-    }
-  }
-};
-
-template <typename T>
-struct SequenceSoftmaxGradFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext &ctx, const LoDTensor &dout,
-                  const LoDTensor &out,
-                  const framework::Vector<size_t> &ref_lod, /*referenced lod*/
-                  LoDTensor *dx) {
-    size_t hight = ref_lod.size() - 1;
-
-    const T *softmax_grad_data = dout.data<T>();
-    const T *softmax = out.data<T>();
-    T *dx_data = dx->mutable_data<T>(ctx.GetPlace());
-
-    for (size_t i = 0; i < hight; ++i) {
-      size_t span = ref_lod[i + 1] - ref_lod[i];
-      T result = 0;
-      for (size_t j = 0; j < span; ++j) {
-        result += softmax_grad_data[ref_lod[i] + j] * softmax[ref_lod[i] + j];
-      }
-
-      for (size_t j = 0; j < span; ++j) {
-        dx_data[ref_lod[i] + j] = (softmax_grad_data[ref_lod[i] + j] - result) *
-                                  softmax[ref_lod[i] + j];
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceSoftmaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<LoDTensor>("X");
-    auto *out = ctx.Output<LoDTensor>("Out");
-
-    auto lod = x->lod();
-    auto dims = x->dims();
-
-    const size_t level = lod.size() - 1;
-    PADDLE_ENFORCE_GT(
-        lod.size(), 0U,
-        "The LoD level of Input X should be larger than 0 (lod.size() > 0).");
-    PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
-                      "The first dimension of Input(X) should be equal to the "
-                      "sum of all sequences' lengths.");
-    PADDLE_ENFORCE_EQ(dims[0], x->numel(),
-                      "The width of each timestep in Input(X) of "
-                      "SequenceSoftmaxOp should be 1.");
-
-    out->mutable_data<T>(ctx.GetPlace());
-
-    SequenceSoftmaxFunctor<DeviceContext, T> seq_softmax_functor;
-    seq_softmax_functor(ctx.template device_context<DeviceContext>(), *x,
-                        lod[level], out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *out = ctx.Input<LoDTensor>("Out");
-    auto *out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto *x = ctx.Input<LoDTensor>("X");
-    auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    if (!x_grad) {
-      return;
-    }
-
-    x_grad->set_lod(x->lod());
-    auto lod = x->lod();
-    const size_t level = lod.size() - 1;
-    x_grad->mutable_data<T>(ctx.GetPlace());
-
-    SequenceSoftmaxGradFunctor<DeviceContext, T> seq_softmax_grad_functor;
-    seq_softmax_grad_functor(ctx.template device_context<DeviceContext>(),
-                             *out_grad, *out, lod[level], x_grad);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
deleted file mode 100644
index 232f324de77e4808a0731c9ca7d79906d6b69cde..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("ROW"), true,
-                      "Input(ROW) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("COLUMN"), true,
-                      "Input(COLUMN) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("pos"), true,
-                      "pos(out) should not be null");
-
-    auto attr = ctx->Attrs();
-    auto channel_num = attr.Get<int>("channel_num");
-    auto topks = attr.Get<std::vector<int>>("topks");
-
-    auto row_dim = ctx->GetInputDim("ROW");
-
-    auto num_k = topks.size();
-    auto row_shape_0 = row_dim[0];
-
-    std::vector<int> vec_out_shape;
-    vec_out_shape.push_back(row_shape_0);
-    vec_out_shape.push_back(channel_num * num_k);
-
-    ctx->SetOutputDim("Out", framework::make_ddim(vec_out_shape));
-    ctx->ShareLoD("X", "Out");
-  }
-};
-
-class SequenceTopkAvgPoolingOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor) The variable-length input of SequenceTopkPoolingOp");
-    AddInput("ROW", "(LoDTensor) the row info");
-    AddInput("COLUMN", "(LoDTensor) the column info");
-    AddOutput(
-        "Out",
-        "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD "
-        "infomation.");
-    AddOutput("pos", "(Tensor<int>) store the topk index ").AsIntermediate();
-    AddAttr<std::vector<int>>("topks", "topks");
-    AddAttr<int>("channel_num", "channel number");
-    AddComment(R"DOC(
-    sequecen topk average pooling op
-    )DOC");
-  }
-};
-
-class SequenceTopkAvgPoolingGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Gradient of Out should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "The input X should not be null.");
-
-    ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class SequenceTopkAvgPoolGradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op_desc_ptr = new framework::OpDesc();
-    op_desc_ptr->SetType("sequence_topk_avg_pooling_grad");
-    op_desc_ptr->SetInput("X", Input("X"));
-    op_desc_ptr->SetInput("ROW", Input("ROW"));
-    op_desc_ptr->SetInput("COLUMN", Input("COLUMN"));
-    op_desc_ptr->SetInput("pos", Output("pos"));
-    op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op_desc_ptr->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_topk_avg_pooling, ops::SequenceTopkAvgPoolingOp,
-                  ops::SequenceTopkAvgPoolingOpMaker,
-                  ops::SequenceTopkAvgPoolGradOpMaker);
-REGISTER_OPERATOR(sequence_topk_avg_pooling_grad,
-                  ops::SequenceTopkAvgPoolingGradOp);
-REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling,
-                       ops::SequenceTopkAvgPoolingKernel<
-                           paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling_grad,
-                       ops::SequenceTopkAvgPoolingGradKernel<
-                           paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
deleted file mode 100644
index c6bfdea8bedd79fe17dae14f0ed73824e59f3ca8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <limits>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-template <typename T>
-void get_topk_pos(const T* data, int length, int k, int* pos) {
-  size_t real_k = k < length ? k : length;
-
-  std::vector<T> v(data, data + length);
-
-  std::vector<int> topk_pos;
-  T min_val = std::numeric_limits<T>::lowest();
-  while (topk_pos.size() < real_k) {
-    T max_val = min_val;
-    int max_pos = -1;
-    for (int i = 0; i < length; ++i) {
-      if (v[i] > max_val) {
-        max_pos = i;
-        max_val = v[i];
-      }
-    }
-
-    assert(max_pos >= 0);
-
-    topk_pos.push_back(max_pos);
-    v[max_pos] = min_val;
-  }
-
-  assert(topk_pos.size() > 0);
-  while (topk_pos.size() < (size_t)k) {
-    topk_pos.push_back(-1);
-  }
-
-  for (size_t i = 0; i < topk_pos.size(); ++i) {
-    pos[i] = topk_pos[i];
-  }
-}
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* row = context.Input<LoDTensor>("ROW");
-    auto* col = context.Input<LoDTensor>("COLUMN");
-    auto* out = context.Output<LoDTensor>("Out");
-    auto* pos = context.Output<Tensor>("pos");
-
-    auto channel_num = context.Attr<int>("channel_num");
-    auto topks = context.Attr<std::vector<int>>("topks");
-    auto k_num = topks.size();
-    auto max_k = topks[topks.size() - 1];
-    std::vector<int> vec_pos_shape;
-    auto in_lod = in->lod()[0];
-
-    auto row_lod = row->lod()[0];
-    auto col_lod = col->lod()[0];
-    int batch_size = row_lod.size() - 1;
-    int pos_total_size = row_lod[batch_size] * channel_num * max_k;
-    vec_pos_shape.push_back(pos_total_size);
-    pos->Resize({framework::make_ddim(vec_pos_shape)});
-    auto pos_data = pos->mutable_data<int>(context.GetPlace());
-
-    int offset = 0;
-    framework::Vector<size_t> vec_out_lod;
-    vec_out_lod.reserve(batch_size + 1);
-    for (int i = 0; i <= batch_size; ++i) {
-      offset = row_lod[i];
-      vec_out_lod.push_back(offset);
-    }
-
-    framework::LoD lod_temp;
-    lod_temp.push_back(vec_out_lod);
-    out->set_lod(lod_temp);
-
-    auto din_data = in->data<T>();
-    auto dout_data = out->mutable_data<T>(context.GetPlace());
-
-    T* sum_data = new T[max_k];
-    for (int i = 0; i < batch_size; ++i) {
-      int total_size = in_lod[i + 1] - in_lod[i];
-      int row_size = row_lod[i + 1] - row_lod[i];
-      int col_size = col_lod[i + 1] - col_lod[i];
-      PADDLE_ENFORCE_EQ(total_size, channel_num * row_size * col_size,
-                        "size wrong in sequence_topk_avg_pooling_op!");
-
-      int feature_num = row_size * col_size;
-      for (int j = 0; j < channel_num; ++j) {
-        auto input_offset_feature_data = din_data + in_lod[i] + j * feature_num;
-
-        for (int r = 0; r < row_size; ++r) {
-          auto row_data = input_offset_feature_data + r * col_size;
-
-          auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k +
-                                r * channel_num * max_k + j * max_k;
-          auto out_slice_data = dout_data + row_lod[i] * channel_num * k_num +
-                                r * channel_num * k_num + j * k_num;
-
-          get_topk_pos<T>(row_data, col_size, max_k, pos_slice_data);
-          if (pos_slice_data[0] == -1) {
-            sum_data[0] = 0.0;
-          } else {
-            sum_data[0] = row_data[pos_slice_data[0]];
-          }
-          for (int k = 1; k < max_k; ++k) {
-            if (pos_slice_data[k] == -1) {
-              sum_data[k] = sum_data[k - 1];
-            } else {
-              sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]];
-            }
-          }
-          for (size_t k = 0; k < k_num; ++k) {
-            out_slice_data[k] = sum_data[topks[k] - 1] / topks[k];
-          }
-        }
-      }
-    }
-    delete[] sum_data;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* d_in = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* pos_input = context.Input<Tensor>("pos");
-    auto* row_input = context.Input<LoDTensor>("ROW");
-    auto* col_input = context.Input<LoDTensor>("COLUMN");
-    auto* forward_input = context.Input<LoDTensor>("X");
-
-    int batch_size = row_input->lod()[0].size() - 1;
-    auto channel_num = context.Attr<int>("channel_num");
-    auto topks = context.Attr<std::vector<int>>("topks");
-    auto k_num = topks.size();
-    auto max_k = topks[k_num - 1];
-
-    auto out_lod = forward_input->lod();
-    d_in->set_lod(out_lod);
-
-    d_in->mutable_data<T>(context.GetPlace());
-    auto pos_data = pos_input->data<int>();
-    auto dout_data = d_out->data<T>();
-
-    auto& dev_ctx =
-        context.template device_context<platform::CPUDeviceContext>();
-    math::SetConstant<paddle::platform::CPUDeviceContext, T> zero;
-    zero(dev_ctx, d_in, static_cast<T>(0.0));
-
-    auto din_data = d_in->data<T>();
-
-    auto out_offset = out_lod[0];
-    auto row_lod = row_input->lod()[0];
-    auto col_lod = col_input->lod()[0];
-
-    for (int i = 0; i < batch_size; ++i) {
-      int row_size = row_lod[i + 1] - row_lod[i];
-      int col_size = col_lod[i + 1] - col_lod[i];
-      int feature_num = row_size * col_size;
-
-      for (int j = 0; j < channel_num; ++j) {
-        auto in_offset_feature_data =
-            din_data + out_offset[i] + j * feature_num;
-
-        for (int r = 0; r < row_size; r++) {
-          auto row_data = dout_data + row_lod[i] * channel_num * k_num +
-                          r * channel_num * k_num + j * k_num;
-          auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k +
-                                r * channel_num * max_k + j * max_k;
-          auto in_slice_data = in_offset_feature_data + r * col_size;
-
-          for (size_t m = 0; m < k_num; ++m) {
-            for (int k = 0; k < topks[m]; ++k) {
-              if (pos_slice_data[k] == -1) {
-                break;
-              } else {
-                in_slice_data[pos_slice_data[k]] += row_data[m] / topks[m];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc
deleted file mode 100644
index 23581c360ff541e6d9d1000e9b2bae3712e15dd9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class SequenceUnpadOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of SequenceUnpadOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Length"), true,
-                      "Input(Length) of SequenceUnpadOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of SequenceUnpadOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "The rank of Input(X) can't be less than 2.");
-
-    auto len_dims = ctx->GetInputDim("Length");
-    PADDLE_ENFORCE_EQ(len_dims.size(), 1,
-                      "The shape of Input(Length) should be [batch_size].");
-    PADDLE_ENFORCE_EQ(
-        len_dims[0], x_dims[0],
-        "Input(X) and Input(Length) should have the same first dimension.");
-
-    int64_t out_dim_0 = -1;
-    if (ctx->IsRuntime()) {
-      out_dim_0 = x_dims[0] * x_dims[1];
-    }
-
-    std::vector<int64_t> out_dims_vec{out_dim_0};
-    if (x_dims.size() == 2) {
-      out_dims_vec.push_back(1);
-    } else {
-      for (int i = 2; i < x_dims.size(); ++i) {
-        out_dims_vec.push_back(x_dims[i]);
-      }
-    }
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, default LoDTensor<float>) Input tensor which "
-             "contains the padded sequences with equal length.");
-    AddInput("Length",
-             "(LoDTensor) The input tensor which specifies the actual ength of "
-             "sequences after unpadding.");
-    AddOutput(
-        "Out",
-        "(LoDTensor) The output tensor which contains unpadded sequences.");
-    AddComment(R"DOC(
-      Sequence Unpad Operator
-
-      This operator removes the padding data in the input sequences and convert 
-      them into sequences with actual length as output, identitied by lod 
-      information.
-
-      Example:
-
-      Given input tensor Input(X):
-          X.data = [[ 1.0,  2.0,  3.0,  4.0,  5.0],
-                    [ 6.0,  7.0,  8.0,  9.0, 10.0],
-                    [11.0, 12.0, 13.0, 14.0, 15.0]], 
-`     
-      in which there are 3 sequences padded to length 5, and the acutal length 
-      specified by Input(Length):
-
-          Length.data = [2, 3, 4],
-
-      after unpadding, Output(Out) will be:
-
-          Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
-          Out.lod = [[0, 2, 5, 9]]      
-
-    )DOC");
-  }
-};
-
-class SequenceUnpadGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of SequenceUnpadGradOp should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput(framework::GradVarName("Out")), true,
-        "Input(Out@GRAD) of SequenceUnpadGradOp should not be null.");
-
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-      ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(
-        ctx.InputVar(framework::GradVarName("Out")));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class SequenceUnpadGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sequence_unpad_grad");
-    op->SetAttrMap(Attrs());
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    SequenceUnpadGradOpNoNeedBufferVarsInference, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp,
-                  ops::SequenceUnpadOpMaker, ops::SequenceUnpadGradOpDescMaker);
-REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp,
-                  ops::SequenceUnpadGradOpNoNeedBufferVarsInference);
-REGISTER_OP_CPU_KERNEL(
-    sequence_unpad,
-    ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    sequence_unpad_grad,
-    ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext,
-                                   int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu
deleted file mode 100644
index bf54f77f5b55cf7eb19873e352359c028207308a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    sequence_unpad,
-    ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    sequence_unpad_grad,
-    ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h
deleted file mode 100644
index 60ba4797db1e2af267a37715c715fb7107ac8500..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sequence_padding.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-template <typename DeviceContext, typename T>
-class SequenceUnpadOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x_t = ctx.Input<LoDTensor>("X");
-    auto* len_t = ctx.Input<LoDTensor>("Length");
-    auto* out_t = ctx.Output<LoDTensor>("Out");
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    framework::Tensor seq_len_cpu =
-        ctx.AllocateTmpTensor<T, DeviceContext>(len_t->dims(), dev_ctx);
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      seq_len_cpu.mutable_data<int64_t>(platform::CPUPlace());
-      framework::TensorCopySync(*len_t, platform::CPUPlace(), &seq_len_cpu);
-    } else {
-      seq_len_cpu = *len_t;
-    }
-
-    const int64_t* seq_len_ptr = seq_len_cpu.data<int64_t>();
-    int64_t batch_size = len_t->dims()[0];
-    std::vector<size_t> out_lod0(batch_size + 1, 0);
-    for (int64_t i = 0; i < batch_size; ++i) {
-      out_lod0[i + 1] = out_lod0[i] + static_cast<size_t>(seq_len_ptr[i]);
-    }
-
-    framework::LoD out_lod;
-    out_lod.push_back(out_lod0);
-    out_t->set_lod(out_lod);
-    std::vector<int64_t> out_dims_vec{static_cast<int64_t>(out_lod0.back())};
-    if (x_t->dims().size() == 2) {
-      out_dims_vec.push_back(1);
-    } else {
-      for (int i = 2; i < x_t->dims().size(); ++i) {
-        out_dims_vec.push_back(x_t->dims()[i]);
-      }
-    }
-    out_t->Resize(framework::make_ddim(out_dims_vec));
-
-    // after set the lod of output, allocate the memory
-    out_t->mutable_data<T>(ctx.GetPlace());
-
-    int64_t padded_length = x_t->dims()[1];
-    math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-        dev_ctx, *x_t, out_t, padded_length, 0, false, math::kBatchLengthWidth);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceUnpadGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    if (d_x) {
-      const auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-      d_x->mutable_data<T>(ctx.GetPlace());
-
-      int padded_length = d_x->dims()[1];
-
-      LoDTensor zero_pads;
-      zero_pads.Resize({1, 1});
-      zero_pads.mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<DeviceContext, T> set_zero;
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      set_zero(dev_ctx, &zero_pads, static_cast<T>(0));
-
-      math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), *d_out, d_x, zero_pads,
-          padded_length, 0, false, math::kBatchLengthWidth);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
deleted file mode 100644
index 3715dd114d6363ca48a0ef6a4c162b1ce4486f55..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shape_op.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/shape_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class ShapeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input (Input) of get_shape op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output (Out) of get_shape op should not be null.");
-    auto in_dim = ctx->GetInputDim("Input");
-    ctx->SetOutputDim("Out", {in_dim.size()});
-  }
-};
-
-class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input", "(LoDTensor), The input tensor.");
-    AddOutput(
-        "Out",
-        "(LoDTensor), The shape of input tensor, the data type of the shape"
-        " is int32_t, will be on the same device with the input Tensor.");
-    AddComment(R"DOC(
-Shape Operator.
-
-Return the shape of the input.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int32_t>,
-                       ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>,
-                       ops::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu
deleted file mode 100644
index 2df4ad13399735f5384cbbecd1fbb3a97ec37870..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shape_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/shape_op.h"
-
-REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel<int>,
-                        paddle::operators::ShapeKernel<int32_t>,
-                        paddle::operators::ShapeKernel<int64_t>,
-                        paddle::operators::ShapeKernel<float>,
-                        paddle::operators::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h
deleted file mode 100644
index 0d510a505583c55e26a26bfc6e5d6192899b3d9e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shape_op.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class ShapeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_t = ctx.Input<Tensor>("Input");
-    auto* out_t = ctx.Output<Tensor>("Out");
-    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
-    auto in_dims = in_t->dims();
-    for (int i = 0; i < in_dims.size(); ++i) {
-      out_data[i] = in_dims[i];
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc
deleted file mode 100644
index 578dcd37bb42bdc4c69020c2cf500d4a6c203a55..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shard_index_op.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/shard_index_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ShardIndexOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ShardIndexOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ShardIndexOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "Rank of Input(X) should be at least 2.");
-    if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) {
-      PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U,
-                        "Last dimension of Input(X) should be 1.");
-    }
-
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /* --> */ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ShardIndexOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor, LoDTensor<int|int64>) Input variable. Each value "
-             "of X is an index.");
-    AddOutput(
-        "Out",
-        "(Tensor, Tensor<int|int64>) Output tensor with same shape as X. "
-        "The tensor consists of sharding representations of values in X.");
-    AddAttr<int>("index_num",
-                 "A positive integer to specify the range of the input X.");
-
-    AddAttr<int>("nshards",
-                 "A positive integer to specify the number of shards.");
-    AddAttr<int>("shard_id", "The current shard id");
-    AddAttr<int>("ignore_value", "An ingeter value out of sharded range")
-        .SetDefault(-1);
-    AddComment(R"DOC(
-This layer creates the sharded index for input. This layers is used in
-model- and data- parallel mixed training generally, in which the index
-data (usually the label) should be recaculated in each trainer according
-to 
-
-.. math::
-    
-    assert index_num % nshards == 0
-
-    shard_size = index_num / nshards
-
-    y = x % shard_size if x / shard_size == shard_id else ignore_value
-
-We take the distributed one-hot representation to show what this layer is
-used for. The distributed one-hot representation is seperated into multiple
-shards, and each shard is filling zeros except the one with the index
-inside. In order to create these sharded representation in each trainer,
-the original index should be recalculated (i.e. sharded) before.
-
-Examples:
-
-    X is a Tensor of integer values:
-      X.shape = [4, 1]
-      X.data = [[1], [6], [12], [19]]
-    
-    suppose index_num = 20 and nshards = 2, then we get shard_size = 10
-    
-    if shard_id == 0, we get the Out:
-      Out.shape = [4, 1]
-      Out.data = [[1], [6], [-1], [-1]]
-    
-    if shard_id == 1, we get the Out:
-      Out.shape = [4, 1]
-      Out.data = [[-1], [-1], [2], [9]]
-
-    the default `ignore_value` -1 is used in this example.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(shard_index, ops::ShardIndexOp,
-                             ops::ShardIndexOpMaker);
-REGISTER_OP_CPU_KERNEL(shard_index, ops::ShardIndexCPUKernel<int>,
-                       ops::ShardIndexCPUKernel<int64_t>);
diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu
deleted file mode 100644
index 08503e3e1a8fe66b20f1e23012c584f9e32b4a01..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shard_index_op.cu
+++ /dev/null
@@ -1,77 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/shard_index_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename T>
-__global__ void ShardIndexInner(const T* in_data, T* out_data,
-                                const int64_t numel, const int index_num,
-                                const int nshards, const int shard_id,
-                                const int ignore_value) {
-  int shard_size = index_num / nshards;
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel) {
-    assert(in_data[idx] >= 0 && in_data[idx] < index_num);
-    if (in_data[idx] / shard_size == shard_id) {
-      out_data[idx] = in_data[idx] % shard_size;
-    } else {
-      out_data[idx] = ignore_value;
-    }
-  }
-}
-
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-class ShardIndexCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int index_num = context.Attr<int>("index_num");
-    int nshards = context.Attr<int>("nshards");
-    int shard_id = context.Attr<int>("shard_id");
-    int ignore_value = context.Attr<int>("ignore_value");
-    PADDLE_ENFORCE_GT(index_num, 0);
-    PADDLE_ENFORCE_GT(nshards, 0);
-    PADDLE_ENFORCE(shard_id >= 0 && shard_id < nshards,
-                   "shard_id(%d) is not in range [0, %d)", shard_id, nshards);
-
-    out->Resize(in->dims());
-    out->set_lod(in->lod());
-    auto* in_data = in->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    int64_t numel = in->numel();
-    auto stream =
-        context.template device_context<platform::CUDADeviceContext>().stream();
-    ShardIndexInner<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                          PADDLE_CUDA_NUM_THREADS,
-                      PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        in_data, out_data, numel, index_num, nshards, shard_id, ignore_value);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(shard_index, ops::ShardIndexCUDAKernel<int>,
-                        ops::ShardIndexCUDAKernel<int64_t>);
diff --git a/paddle/fluid/operators/shard_index_op.h b/paddle/fluid/operators/shard_index_op.h
deleted file mode 100644
index f060b3fdf182a2bf7fe03b1d86db41c4d1cfb340..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shard_index_op.h
+++ /dev/null
@@ -1,58 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-template <typename T>
-class ShardIndexCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int index_num = context.Attr<int>("index_num");
-    int nshards = context.Attr<int>("nshards");
-    int shard_id = context.Attr<int>("shard_id");
-    int ignore_value = context.Attr<int>("ignore_value");
-    PADDLE_ENFORCE_GT(index_num, 0);
-    PADDLE_ENFORCE_GT(nshards, 0);
-    PADDLE_ENFORCE(shard_id >= 0 && shard_id < nshards,
-                   "shard_id(%d) is not in range [0, %d)", shard_id, nshards);
-
-    int shard_size = index_num / nshards;
-
-    out->Resize(in->dims());
-    out->set_lod(in->lod());
-    auto* in_data = in->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    int64_t numel = in->numel();
-    for (int64_t i = 0; i < numel; ++i) {
-      PADDLE_ENFORCE(in_data[i] >= 0 && in_data[i] < index_num,
-                     "Input index(%d) is out of range [0,%d)", in_data[i],
-                     index_num);
-      if (in_data[i] / shard_size == shard_id) {
-        out_data[i] = in_data[i] % shard_size;
-      } else {
-        out_data[i] = ignore_value;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
deleted file mode 100644
index 2e2aea2c632d8e4e0abbcd2cac562e492e0f552f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/array_operator.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-class ShrinkRNNMemoryOp : public ArrayOp {
- public:
-  ShrinkRNNMemoryOp(const std::string &type,
-                    const framework::VariableNameMap &inputs,
-                    const framework::VariableNameMap &outputs,
-                    const framework::AttributeMap &attrs)
-      : ArrayOp(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto *x_var = scope.FindVar(Input("X"));
-    PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
-    auto &x_tensor = x_var->Get<framework::LoDTensor>();
-    size_t offset = this->GetOffset(scope, place);
-    auto *rank_table_var = scope.FindVar(Input("RankTable"));
-    PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set");
-    auto &rank_table = rank_table_var->Get<framework::LoDRankTable>();
-
-    auto &rank_items = rank_table.items();
-    int dst_num_rows =
-        std::lower_bound(rank_items.begin(), rank_items.end(), offset,
-                         [](const framework::LoDRankTable::TableItem &a,
-                            size_t b) { return a.length > b; }) -
-        rank_items.begin();
-
-    auto *out_var = scope.FindVar(Output("Out"));
-    PADDLE_ENFORCE(out_var != nullptr, "Output(Out) must be set.");
-    auto &out_tensor = *out_var->GetMutable<framework::LoDTensor>();
-
-    size_t height = dst_num_rows;
-
-    // do shrink for the top level LoD
-    if (x_tensor.lod().size() > 0 &&
-        x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
-      auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0,
-                                                              dst_num_rows, 0);
-      height = lod_offset.second.second;
-      auto out_lod = out_tensor.mutable_lod();
-      framework::AppendLoD(out_lod, lod_offset.first);
-    }
-
-    if (dst_num_rows != 0) {
-      out_tensor.mutable_data(place, x_tensor.type());
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,
-                            &out_tensor);
-    }
-  }
-};
-
-class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
-    AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
-    AddInput("I",
-             "(LoDTensor) The step index. The RNN step memory 'X' will be "
-             "shrinked to match the size of the input of the index'th step.");
-    AddOutput("Out", "(LoDTensor) The shrinked RNN step memory.");
-    AddComment(R"DOC(
-This operator is used to shrink output batch of memory defined in dynamic RNN.
-
-Dynamic RNN is able to handle variable-length sequences, in which, sequences in
-a mini-batch are sorted by their lengths first. After that, the longest sequence
-becomes the first one in the sorted batch, followed by the second longest, the
-third longest, and so on. Dynamic RNN then slices a batch input timestep by
-timestep from the sorted input. Once any sequence in the input batch reaches its
-end, memory defined in dynamicRNN has to shrink its outputs to adapt to the input
-batch size for the next time step.
-)DOC");
-  }
-};
-
-class ShrinkRNNMemoryInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"));
-    PADDLE_ENFORCE(context->HasInput("I"));
-    PADDLE_ENFORCE(context->HasInput("RankTable"));
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-    if (!context->IsRuntime()) {
-      context->DecreaseLoDLevel("X", /*->*/ "Out");
-    }
-  }
-};
-
-class ShrinkRNNMemoryGradOp : public ArrayOp {
- public:
-  ShrinkRNNMemoryGradOp(const std::string &type,
-                        const framework::VariableNameMap &inputs,
-                        const framework::VariableNameMap &outputs,
-                        const framework::AttributeMap &attrs)
-      : ArrayOp(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
-    auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
-    PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
-    auto *x_var = scope.FindVar(Input("X"));
-    PADDLE_ENFORCE(x_var != nullptr);
-
-    auto &x_tensor = x_var->Get<framework::LoDTensor>();
-    auto &dx_tensor = *dx_var->GetMutable<framework::LoDTensor>();
-    dx_tensor.Resize(x_tensor.dims());
-    dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    if (dout_var == nullptr) {  // dx_tensor fill zero
-      math::set_constant(dev_ctx, &dx_tensor, 0.0f);
-    } else {
-      auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
-      auto height = dout_tensor.dims()[0];
-      auto slice = dx_tensor.Slice(0, static_cast<int>(height));
-      framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
-      if (dx_tensor.dims()[0] > height) {
-        auto rest_tensor = dx_tensor.Slice(
-            static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
-        math::set_constant(dev_ctx, &rest_tensor, 0.0f);
-      }
-    }
-    dx_tensor.set_lod(x_tensor.lod());
-  }
-};
-
-class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"));
-    PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X")));
-
-    context->ShareDim("X", /*->*/ framework::GradVarName("X"));
-    context->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-};
-
-class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *op = new framework::OpDesc();
-    op->SetType("shrink_rnn_memory_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(shrink_rnn_memory, ops::ShrinkRNNMemoryOp,
-                  ops::ShrinkRNNMemoryInferShape,
-                  ops::ShrinkRNNMemoryOpProtoMaker, ops::ShrinkRNNGradOpMaker);
-REGISTER_OPERATOR(shrink_rnn_memory_grad, ops::ShrinkRNNMemoryGradOp,
-                  ops::ShrinkRNNMemoryGradInferShape);
diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc
deleted file mode 100644
index ad6fb3510f02ae783c8ae4318f559a8db74a59d1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shuffle_channel_op.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/shuffle_channel_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class ShuffleChannelOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ShuffleChannelOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ShuffleChannelOp should not be null.");
-
-    auto input_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
-
-    ctx->SetOutputDim("Out", input_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), "
-             "the input feature data of ShuffleChannelOp, the layout is NCHW.");
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>), the output of "
-              "ShuffleChannelOp. The layout is NCHW.");
-    AddAttr<int>("group", "the number of groups.")
-        .SetDefault(1)
-        .AddCustomChecker([](const int& group) {
-          PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0.");
-        });
-
-    AddComment(R"DOC(
-		Shuffle Channel operator
-		This opearator shuffles the channels of input x.
-		It  divide the input channels in each group into several subgroups,
-		and obtain a new order by selecting element from every subgroup one by one.
-
-		Shuffle channel operation makes it possible to build more powerful structures
-		with multiple group convolutional layers.
-		please get more information from the following paper:
-		https://arxiv.org/pdf/1707.01083.pdf
-        )DOC");
-  }
-};
-
-class ShuffleChannelGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto input_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
-
-    ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("shuffle_channel_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp,
-                  ops::ShuffleChannelOpMaker, ops::ShuffleChannelGradDescMaker);
-
-REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    shuffle_channel,
-    ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    shuffle_channel_grad,
-    ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext,
-                                    double>);
diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu
deleted file mode 100644
index dbc3e1a7ebe26ffccd24d1749093d014751d866f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shuffle_channel_op.cu
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/shuffle_channel_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaximumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaximumNumBlocks);
-}
-
-template <typename T>
-__global__ void ShuffleChannel(const int nthreads, const int feature_map_size,
-                               T* output, const T* input, int group_row,
-                               int group_column, int len) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t ii = index; ii < nthreads; ii += offset) {
-    const int n = index / group_row / group_column / len;
-    const int i = (index / group_column / len) % group_row;
-    const int j = index / len % group_column;
-    const int k = index - (n * feature_map_size + (i * group_column + j) * len);
-    T* p_o = output + n * feature_map_size + (j * group_row + i) * len;
-    p_o[k] = input[index];
-  }
-}
-template <typename DeviceContext, typename T>
-class ShuffleChannelOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<framework::Tensor>("X");
-    auto* output = ctx.Output<framework::Tensor>("Out");
-    int group = ctx.Attr<int>("group");
-
-    auto input_dims = input->dims();
-    auto num = input_dims[0];
-    auto channel = input_dims[1];
-    auto height = input_dims[2];
-    auto weight = input_dims[3];
-
-    auto feature_map_size = channel * height * weight;
-    auto sp_sz = height * weight;
-    int group_row = group;
-    int group_column = channel / group_row;
-    // count is the product of NCHW same as numel()
-    int count = num * group_column * group_row * sp_sz;
-
-    int blocks = NumBlocks(output->numel());
-    int threads = kNumCUDAThreads;
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-
-    ShuffleChannel<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        count, feature_map_size, output_data, input_data, group_row,
-        group_column, sp_sz);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* output_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* input_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    int group = ctx.Attr<int>("group");
-
-    const auto& input_dims = input_grad->dims();
-    auto num = input_dims[0];
-    auto channel = input_dims[1];
-    auto height = input_dims[2];
-    auto weight = input_dims[3];
-    auto feature_map_size = channel * height * weight;
-    auto sp_sz = height * weight;
-
-    int group_row = group;
-    int group_column = channel / group_row;
-
-    T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-    const T* output_grad_data = output_grad->data<T>();
-
-    int blocks = NumBlocks(output_grad->numel());
-    int threads = kNumCUDAThreads;
-    int count = num * group_column * group_row * sp_sz;
-
-    ShuffleChannel<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        count, feature_map_size, input_grad_data, output_grad_data, group_row,
-        group_column, sp_sz);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    shuffle_channel,
-    ops::ShuffleChannelOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ShuffleChannelOpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                    double>);
-REGISTER_OP_CUDA_KERNEL(
-    shuffle_channel_grad,
-    ops::ShuffleChannelGradOpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                        float>,
-    ops::ShuffleChannelGradOpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                        double>);
diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h
deleted file mode 100644
index 3ce1e0c770bb3fe6c4b0a54dad14e47f372958af..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shuffle_channel_op.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ShuffleChannelOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<framework::Tensor>("X");
-    auto* output = ctx.Output<framework::Tensor>("Out");
-    int group = ctx.Attr<int>("group");
-
-    auto input_dims = input->dims();
-    auto num = input_dims[0];
-    auto channel = input_dims[1];
-    auto height = input_dims[2];
-    auto weight = input_dims[3];
-
-    auto feature_map_size = channel * height * weight;
-    auto sp_sz = height * weight;
-    int group_row = group;
-    int group_column = channel / group_row;
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    for (int n = 0; n < num; ++n) {
-      for (int i = 0; i < group_row; ++i) {
-        for (int j = 0; j < group_column; ++j) {
-          const T* p_i = input_data + n * feature_map_size +
-                         (i * group_column + j) * sp_sz;
-          T* p_o =
-              output_data + n * feature_map_size + (j * group_row + i) * sp_sz;
-          memcpy(p_o, p_i, sizeof(int) * sp_sz);
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ShuffleChannelGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* output_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* input_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    int group = ctx.Attr<int>("group");
-
-    const auto& input_dims = input_grad->dims();
-    auto num = input_dims[0];
-    auto channel = input_dims[1];
-    auto height = input_dims[2];
-    auto weight = input_dims[3];
-    auto feature_map_size = channel * height * weight;
-    auto sp_sz = height * weight;
-
-    int group_row = group;
-    int group_column = channel / group_row;
-
-    T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-    const T* output_grad_data = output_grad->data<T>();
-    for (int n = 0; n < num; ++n) {
-      for (int i = 0; i < group_row; ++i) {
-        for (int j = 0; j < group_column; ++j) {
-          const T* p_i = output_grad_data + n * feature_map_size +
-                         (i * group_column + j) * sp_sz;
-          T* p_o = input_grad_data + n * feature_map_size +
-                   (j * group_row + i) * sp_sz;
-          memcpy(p_o, p_i, sizeof(int) * sp_sz);
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
deleted file mode 100644
index c453b03dddf68a7f4638aa0eceaa2aa70dc3d5f4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-const int kIgnoreIndex = -100;
-
-class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Label");
-
-    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
-                      "Input(X) and Input(Label) shall have the same rank.");
-    bool check = true;
-    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
-                                framework::product(labels_dims) <= 0)) {
-      check = false;
-    }
-
-    if (check) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank),
-                        framework::slice_ddim(labels_dims, 0, rank),
-                        "Input(X) and Input(Label) shall have the same shape "
-                        "except the last dimension.");
-    }
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class SigmoidCrossEntropyWithLogitsGradOp
-    : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) shoudl be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@GRAD) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Label");
-    auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    int rank = x_dims.size();
-    bool check = true;
-    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
-                                framework::product(labels_dims) <= 0)) {
-      check = false;
-    }
-
-    if (check) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank),
-                        framework::slice_ddim(labels_dims, 0, rank),
-                        "Input(X) and Input(Label) shall have the same shape.");
-
-      PADDLE_ENFORCE_EQ(
-          framework::slice_ddim(x_dims, 0, rank),
-          framework::slice_ddim(dout_dims, 0, rank),
-          "Input(X) and Input(Out@Grad) shall have the same shape.");
-    }
-
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-  }
-};
-
-class SigmoidCrossEntropyWithLogitsOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
-             "where N is the batch size and D is the number of classes. "
-             "This input is a tensor of logits computed by the previous "
-             " operator. Logits are unscaled log probabilities given as "
-             "log(p/(1-p)).");
-    AddInput("Label",
-             "(Tensor, default Tensor<float>), a 2-D tensor of the same type "
-             "and shape as X. This input is a tensor of probabalistic labels "
-             "for each logit");
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
-              " of elementwise logistic losses.");
-    AddAttr<bool>("normalize",
-                  "if true, divide the loss by the number of "
-                  "targets != ignore_index.")
-        .SetDefault(false);
-    AddAttr<int>("ignore_index",
-                 "(int, default kIgnoreIndex), Specifies a target value that "
-                 "is ignored and"
-                 "does not contribute to the input gradient.")
-        .SetDefault(kIgnoreIndex);
-    AddComment(R"DOC(
-SigmoidCrossEntropyWithLogits Operator.
-
-This measures the element-wise probability error in classification tasks
-in which each class is independent. This can be thought of as predicting labels
-for a data-point, where labels are not mutually exclusive.
-For example, a news article can be about politics, technology or sports
-at the same time or none of these.
-
-The logistic loss is given as follows:
-
-       $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$
-
-We know that $$\sigma(X) = \\frac{1}{1 + \exp(-X)}$$. By substituting this we get:
-
-       $$loss = X - X * Labels + \log(1 + \exp(-X))$$
-
-For stability and to prevent overflow of $$\exp(-X)$$ when X < 0,
-we reformulate the loss as follows:
-
-       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-\|X\|))$$
-
-Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
-However the output only shares the LoD with input `X`.
-
-)DOC");
-  }
-};
-
-class SigmoidCrossEntropyWithLogitsGradOpDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("sigmoid_cross_entropy_with_logits_grad");
-    op->SetInput("X", Input("X"));
-    op->SetInput("Label", Input("Label"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsInplaceInferer,
-                           {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits,
-                  ops::SigmoidCrossEntropyWithLogitsOp,
-                  ops::SigmoidCrossEntropyWithLogitsOpMaker,
-                  ops::SigmoidCrossEntropyWithLogitsGradOpDescMaker,
-                  ops::SigmoidCrossEntropyWithLogitsInplaceInferer);
-REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad,
-                  ops::SigmoidCrossEntropyWithLogitsGradOp,
-                  ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer);
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_cross_entropy_with_logits,
-    ops::SigmoidCrossEntropyWithLogitsKernel<paddle::platform::CPUDeviceContext,
-                                             float>,
-    ops::SigmoidCrossEntropyWithLogitsKernel<paddle::platform::CPUDeviceContext,
-                                             double>);
-REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
-                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
-                           paddle::platform::CPUDeviceContext, float>,
-                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
-                           paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
deleted file mode 100644
index 7c3a0ecba02a5d16dcb45025284680ba933ce9d5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "cub/cub.cuh"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/math.h"
-#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaxinumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-template <typename T>
-__global__ void GPUSigmoidForward(const T *x_data, const T *label_data,
-                                  const int ignore_index, const int limit,
-                                  T *out_data, T *counts) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
-    T x = x_data[i];
-    T label = label_data[i];
-    T eps = static_cast<T>(1e-5);
-    T diff = label - static_cast<T>(ignore_index);
-    if ((diff > -eps) && (diff < eps)) {
-      out_data[i] = static_cast<T>(0.);
-      counts[i] = 0;
-    } else {
-      T term1 = (x > 0) ? x : 0;
-      T term2 = x * label;
-      T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x))));
-      out_data[i] = term1 - term2 + term3;
-      counts[i] = 1;
-    }
-  }
-}
-
-template <typename T, int BlockDim>
-__global__ void Sum(const T *counts, int num, const T eps, T *sum) {
-  typedef cub::BlockReduce<double, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  T in = 0;
-  for (int i = threadIdx.x; i < num; i += BlockDim) {
-    in += counts[i];
-  }
-  __syncthreads();
-  auto out =
-      BlockReduce(temp_storage).Reduce(static_cast<double>(in), cub::Sum());
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    T a = out > eps ? out : eps;
-    sum[0] = a;
-  }
-}
-
-template <typename T>
-__global__ void Div(T *loss, const int num, const T *norm) {
-  CUDA_1D_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; }
-}
-
-template <typename T>
-__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data,
-                                   const int ignore_index, const T *dout_data,
-                                   const int limit, T *dx_data, T *counts) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
-    T x = x_data[i];
-    T label = label_data[i];
-    T dout = dout_data[i];
-    T eps = static_cast<T>(1e-5);
-    T diff = label - static_cast<T>(ignore_index);
-    if ((diff > -eps) && (diff < eps)) {
-      dx_data[i] = static_cast<T>(0.);
-      counts[i] = 0;
-    } else {
-      T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x));
-      T diff = simoid_x - label;
-      dx_data[i] = dout * diff;
-      counts[i] = 1;
-    }
-  }
-}
-
-// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
-template <typename DeviceContext, typename T>
-class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    Tensor *Out = context.Output<Tensor>("Out");
-    int ignore_index = context.Attr<int>("ignore_index");
-    auto out_data = Out->mutable_data<T>(context.GetPlace());
-
-    auto &dev_ctx = context.cuda_device_context();
-    bool normalize = context.Attr<bool>("normalize");
-
-    // Temporary memory
-    auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T));
-    T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
-
-    int limit = Out->numel();
-    int blocks = NumBlocks(limit);
-    int threads = kNumCUDAThreads;
-    GPUSigmoidForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        X->data<T>(), Labels->data<T>(), ignore_index, limit, out_data, counts);
-    if (normalize) {
-      auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T));
-      T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
-      Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
-          counts, limit, static_cast<T>(1e-5), norm);
-      Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data, limit, norm);
-    }
-  }
-};
-
-// dX = sigmoid(X) - labels
-template <typename DeviceContext, typename T>
-class GPUSigmoidCrossEntropyWithLogitsGradKernel
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    auto dx_data = dX->mutable_data<T>(context.GetPlace());
-
-    int ignore_index = context.Attr<int>("ignore_index");
-
-    auto &dev_ctx = context.cuda_device_context();
-    // Temporary memory
-    auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T));
-    T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
-
-    int limit = dX->numel();
-    int blocks = NumBlocks(limit);
-    int threads = kNumCUDAThreads;
-    GPUSigmoidBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        X->data<T>(), Labels->data<T>(), ignore_index, dOut->data<T>(), limit,
-        dx_data, counts);
-    bool normalize = context.Attr<bool>("normalize");
-    if (normalize) {
-      auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T));
-      T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
-      Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
-          counts, limit, static_cast<T>(1e-5), norm);
-      Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data, limit, norm);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits,
-                        ops::GPUSigmoidCrossEntropyWithLogitsKernel<
-                            paddle::platform::CUDADeviceContext, float>,
-                        ops::GPUSigmoidCrossEntropyWithLogitsKernel<
-                            paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad,
-                        ops::GPUSigmoidCrossEntropyWithLogitsGradKernel<
-                            paddle::platform::CUDADeviceContext, float>,
-                        ops::GPUSigmoidCrossEntropyWithLogitsGradKernel<
-                            paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
deleted file mode 100644
index 8f459d573ae5930c27a97c39ac79231384c3d12f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <limits>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
-template <typename DeviceContext, typename T>
-class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    Tensor *Out = context.Output<Tensor>("Out");
-    int ignore_index = context.Attr<int>("ignore_index");
-    auto out_data = Out->mutable_data<T>(context.GetPlace());
-    int limit = Out->numel();
-    auto x_data = X->data<T>();
-    auto label_data = Labels->data<T>();
-    for (int idx = 0; idx < limit; ++idx) {
-      T x = x_data[idx];
-      T label = label_data[idx];
-      if (static_cast<int>(label) == ignore_index) {
-        out_data[idx] = static_cast<T>(0.);
-      } else {
-        T term1 = (x > 0) ? x : 0;
-        T term2 = x * label;
-        T term3 = std::log(static_cast<T>(1) + std::exp(-std::abs(x)));
-        out_data[idx] = term1 - term2 + term3;
-      }
-    }
-    bool normalize = context.Attr<bool>("normalize");
-    if (normalize) {
-      int norm = 0;
-      T eps = static_cast<T>(1e-6);
-      for (int idx = 0; idx < limit; ++idx) {
-        T diff = label_data[idx] - static_cast<T>(ignore_index);
-        if ((diff < -eps) || (diff > eps)) {
-          norm += 1;
-        }
-      }
-      eps = static_cast<T>(1e-5);
-      norm = norm > eps ? norm : eps;
-      std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; });
-    }
-  }
-};
-
-// dX = sigmoid(X) - labels
-template <typename DeviceContext, typename T>
-class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    auto dx_data = dX->mutable_data<T>(context.GetPlace());
-
-    int ignore_index = context.Attr<int>("ignore_index");
-    int limit = dX->numel();
-    auto x_data = X->data<T>();
-    auto label_data = Labels->data<T>();
-    auto dout_data = dOut->data<T>();
-    for (int idx = 0; idx < limit; ++idx) {
-      T x = x_data[idx];
-      T label = label_data[idx];
-      T dout = dout_data[idx];
-      if (static_cast<int>(label) == ignore_index) {
-        dx_data[idx] = static_cast<T>(0.);
-      } else {
-        T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
-        T diff = simoid_x - label;
-        dx_data[idx] = dout * diff;
-      }
-    }
-    bool normalize = context.Attr<bool>("normalize");
-    if (normalize) {
-      int norm = 0;
-      T eps = static_cast<T>(1e-6);
-      for (int idx = 0; idx < limit; ++idx) {
-        T diff = label_data[idx] - static_cast<T>(ignore_index);
-        if ((diff < -eps) || (diff > eps)) {
-          norm += 1;
-        }
-      }
-      eps = static_cast<T>(1e-5);
-      norm = norm > eps ? norm : eps;
-      std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; });
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
deleted file mode 100644
index 6837856a6da804e27af2cd6c83052c04f17140d8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sign_op.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sign_op.h"
-
-namespace paddle {
-namespace operators {
-
-class SignOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SignOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SignOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-template <typename AttrType>
-class SignOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input tensor of sign operator.");
-    AddOutput("Out", "(Tensor) Output tensor of sign operator.");
-    AddComment(R"DOC(
-Sign operator
-
-$$Out = X.sign()$$
-)DOC");
-  }
-};
-
-class SignGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("scale");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttr("scale", 0.0f);
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker<float>,
-                  ops::SignGradMaker);
-REGISTER_OP_CPU_KERNEL(
-    sign, ops::SignKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SignKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sign_op.cu b/paddle/fluid/operators/sign_op.cu
deleted file mode 100644
index 817e0fbbd511462f161633242d28e63062676eb9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sign_op.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sign_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    sign,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, double>,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::float16>);
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
deleted file mode 100644
index b99934daee17e2b8a9295b488c0483e47187a009..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sign_op.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class SignKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    out->mutable_data<T>(in->place());
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    eigen_out.device(place) = eigen_in.sign();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc
deleted file mode 100644
index 21871d76569d0ce410824cf4760cb22529535094..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/similarity_focus_op.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/similarity_focus_op.h"
-
-namespace paddle {
-namespace operators {
-class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), a 4-D tensor with shape,"
-             " [BatchSize, X, Y, Z]");
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>), the similarity focus mask"
-              " with the same shape of input X.");
-    AddAttr<int>("axis",
-                 "(int32), indicating the dimension to be select. It can"
-                 " only be 1, 2, or 3.");
-    AddAttr<std::vector<int>>("indexes",
-                              "(std::vector<int32>), indicating the indexes"
-                              " of the selected dimension.");
-    AddComment(R"DOC(
-SimilarityFocus Operator.
-
-Generate a similarity focus mask with the same shape of input using the following method:
-1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding 
-   to the axis according to the indexes. For example, if axis=1 and indexes=[a], 
-   it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X 
-   is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
-2. For each index, find the largest numbers in the tensor T, so that the same 
-   row and same column has at most one number(what it means is that if the 
-   largest number has been found in the i-th row and the j-th column, then 
-   the numbers in the i-th row or j-th column will be skipped. And then the 
-   next largest number will be selected from the remaining numbers. Obviously 
-   there will be min(B, C) numbers), and mark the corresponding position of the 
-   3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for 
-   each index.
-3. Broadcast the 3-D similarity focus mask to the same shape of input X.
-
-Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
-)DOC");
-  }
-};
-
-class SimilarityFocusOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 4, "Input(X)'s rank should be 4.");
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   platform::CPUPlace());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(similarity_focus, ops::SimilarityFocusOp,
-                  ops::SimilarityFocusOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(similarity_focus, ops::SimilarityFocusKernel<float>,
-                       ops::SimilarityFocusKernel<double>);
diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h
deleted file mode 100644
index bf3fed2aaf2cf92d5619ae5bce6dd70d9dfe9621..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/similarity_focus_op.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <cstring>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-
-template <typename T>
-class SimilarityFocusKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    Tensor* out = context.Output<Tensor>("Out");
-    const Tensor* x = context.Input<Tensor>("X");
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    const T* x_data = x->data<T>();
-
-    int axis = context.Attr<int>("axis");
-    std::vector<int> indexes = context.Attr<std::vector<int>>("indexes");
-
-    int64_t batch_size = x->dims()[0];
-    int64_t dim[4];
-    for (int i = 1; i <= 3; ++i) {
-      dim[i] = x->dims()[i];
-    }
-
-    if (indexes.size() < 1) {
-      PADDLE_THROW("Indexes' size can not be 0.");
-    }
-    for (auto index : indexes) {
-      if (dim[axis] < index) {
-        PADDLE_THROW("Index exceeds tensor shape limit.");
-      }
-    }
-
-    int64_t array_size = 1;
-    for (int i = 1; i <= 3; ++i) {
-      if (i != axis) {
-        array_size *= dim[i];
-      }
-    }
-
-    std::vector<std::pair<T, int64_t>> array(array_size);
-
-    bool (*cmp)(std::pair<T, int64_t>, std::pair<T, int64_t>) = [](
-        std::pair<T, int64_t> x, std::pair<T, int64_t> y) {
-      return x.first > y.first;
-    };
-
-    int64_t (*compute_index)(int64_t*, int, int, int, int) = [](
-        int64_t* dim, int d1, int d2, int d3, int d4) {
-      return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] +
-             d3 * dim[3] + d4;
-    };
-
-    memset(out_data, 0, sizeof(T) * batch_size * dim[1] * dim[2] * dim[3]);
-    for (int i = 0; i < batch_size; ++i) {
-      for (auto index : indexes) {
-        if (axis == 1) {
-          for (int j = 0; j < dim[2]; ++j) {
-            for (int k = 0; k < dim[3]; ++k) {
-              array[j * dim[3] + k] = std::make_pair(
-                  x_data[compute_index(dim, i, index, j, k)], j * dim[3] + k);
-            }
-          }
-
-          std::sort(array.begin(), array.end(), cmp);
-          int tag_num = 0;
-          std::vector<bool> tag2(dim[2]), tag3(dim[3]);
-          for (auto x : array) {
-            int idx2 = x.second / dim[3];
-            int idx3 = x.second % dim[3];
-            if (tag2[idx2] || tag3[idx3]) {
-              continue;
-            }
-            tag_num++;
-            tag2[idx2] = true;
-            tag3[idx3] = true;
-            for (int j = 0; j < dim[1]; ++j) {
-              out_data[compute_index(dim, i, j, idx2, idx3)] = 1;
-            }
-            if (tag_num == std::min(dim[2], dim[3])) {
-              break;
-            }
-          }
-        } else if (axis == 2) {
-          for (int j = 0; j < dim[1]; ++j) {
-            for (int k = 0; k < dim[3]; ++k) {
-              array[j * dim[3] + k] = std::make_pair(
-                  x_data[compute_index(dim, i, j, index, k)], j * dim[3] + k);
-            }
-          }
-
-          std::sort(array.begin(), array.end(), cmp);
-          int tag_num = 0;
-          std::vector<bool> tag1(dim[1]), tag3(dim[3]);
-          for (auto x : array) {
-            int idx1 = x.second / dim[3];
-            int idx3 = x.second % dim[3];
-            if (tag1[idx1] || tag3[idx3]) {
-              continue;
-            }
-            tag_num++;
-            tag1[idx1] = true;
-            tag3[idx3] = true;
-            for (int j = 0; j < dim[2]; ++j) {
-              out_data[compute_index(dim, i, idx1, j, idx3)] = 1;
-            }
-            if (tag_num == std::min(dim[1], dim[3])) {
-              break;
-            }
-          }
-        } else if (axis == 3) {
-          for (int j = 0; j < dim[1]; ++j) {
-            for (int k = 0; k < dim[2]; ++k) {
-              array[j * dim[2] + k] = std::make_pair(
-                  x_data[compute_index(dim, i, j, k, index)], j * dim[2] + k);
-            }
-          }
-
-          std::sort(array.begin(), array.end(), cmp);
-          int tag_num = 0;
-          std::vector<bool> tag1(dim[1]), tag2(dim[2]);
-          for (auto x : array) {
-            int idx1 = x.second / dim[2];
-            int idx2 = x.second % dim[2];
-            if (tag1[idx1] || tag2[idx2]) {
-              continue;
-            }
-            tag_num++;
-            tag1[idx1] = true;
-            tag2[idx2] = true;
-            for (int j = 0; j < dim[3]; ++j) {
-              out_data[compute_index(dim, i, idx1, idx2, j)] = 1;
-            }
-            if (tag_num == std::min(dim[1], dim[2])) {
-              break;
-            }
-          }
-        } else {
-          PADDLE_THROW("Axis must be 1 or 2 or 3");
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc
deleted file mode 100644
index 0eda6fbef324111ec902b5695750e910b3aad792..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/size_op.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/size_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class SizeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input (Input) of Size op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output (Out) of Size op should not be null.");
-    ctx->SetOutputDim("Out", {1});
-  }
-};
-
-class SizeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input", "The input tensor.");
-    AddOutput("Out",
-              "The returned tensor, the data type "
-              "is int64_t, will be on the same device with the input Tensor.");
-    AddComment(R"DOC(
-Size Operator.
-
-Return the number of elements in the input.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(size, ops::SizeOp, ops::SizeOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel<int>, ops::SizeKernel<int32_t>,
-                       ops::SizeKernel<float>, ops::SizeKernel<double>,
-                       ops::SizeKernel<bool>);
diff --git a/paddle/fluid/operators/size_op.cu b/paddle/fluid/operators/size_op.cu
deleted file mode 100644
index 4e5846660e62543638b669d586a92fc36b0c8e87..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/size_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/size_op.h"
-
-REGISTER_OP_CUDA_KERNEL(size, paddle::operators::SizeKernel<int>,
-                        paddle::operators::SizeKernel<int32_t>,
-                        paddle::operators::SizeKernel<float>,
-                        paddle::operators::SizeKernel<bool>,
-                        paddle::operators::SizeKernel<double>);
diff --git a/paddle/fluid/operators/size_op.h b/paddle/fluid/operators/size_op.h
deleted file mode 100644
index fb44070897156ef88062231322e28a2db1f244a7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/size_op.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class SizeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_t = ctx.Input<Tensor>("Input");
-    auto* out_t = ctx.Output<Tensor>("Out");
-    auto out_data = out_t->mutable_data<int64_t>(platform::CPUPlace());
-    out_data[0] = in_t->numel();
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
deleted file mode 100644
index 4cd7b33a4a83eeee1977a94afaf90d91b7edb766..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/slice_op.cc
+++ /dev/null
@@ -1,306 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/slice_op.h"
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class SliceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
-                      "Input (Input) of slice op should not be null.");
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output (Out) of slice op should not be null.");
-
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_LT(in_dims.size(), 7,
-                      "The rank of input should be less than 7.");
-    framework::DDim out_dims(in_dims);
-
-    auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
-    auto starts = ctx->Attrs().Get<std::vector<int>>("starts");
-    auto ends = ctx->Attrs().Get<std::vector<int>>("ends");
-    auto infer_flags = ctx->Attrs().Get<std::vector<int>>("infer_flags");
-    auto decrease_axis = ctx->Attrs().Get<std::vector<int>>("decrease_axis");
-
-    auto starts_size = starts.size();
-    auto ends_size = ends.size();
-    if (infer_flags.empty()) {
-      // Initialize infer_flags with 1.
-      // To be compatible with other op tests in which infer_flags is not set.
-      infer_flags = std::vector<int>(axes.size(), 1);
-    }
-
-    if (ctx->HasInputs("StartsTensorList")) {
-      auto StartsTensorList = ctx->Inputs("StartsTensorList");
-      PADDLE_ENFORCE_GT(StartsTensorList.size(), 0,
-                        "StartsTensorList size can't be zero");
-      starts_size = StartsTensorList.size();
-    }
-    if (ctx->HasInputs("EndsTensorList")) {
-      auto EndsTensorList = ctx->Inputs("EndsTensorList");
-      PADDLE_ENFORCE_GT(EndsTensorList.size(), 0,
-                        "EndsTensorList size can't be zero");
-      ends_size = EndsTensorList.size();
-    }
-
-    if (ctx->HasInput("StartsTensor") == false) {
-      PADDLE_ENFORCE_EQ(
-          starts_size, axes.size(),
-          "The size of starts must be equal to the size of axes.");
-    }
-    if (ctx->HasInput("EndsTensor") == false) {
-      PADDLE_ENFORCE_EQ(ends_size, axes.size(),
-                        "The size of ends must be equal to the size of axes.");
-    }
-
-    int dim_value, start, end;
-    for (size_t i = 0; i < axes.size(); ++i) {
-      PADDLE_ENFORCE_LT(static_cast<int>(axes[i]), in_dims.size(),
-                        "The index of dimension in axes must be less "
-                        "than the size of input shape.");
-      if (infer_flags[i] == -1) {
-        out_dims[axes[i]] = -1;
-      } else {
-        // infer out_dim shape
-        dim_value = out_dims[axes[i]];
-        if (dim_value > 0) {
-          start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
-          end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
-          start = std::max(start, 0);
-          end = std::max(end, 0);
-          end = std::min(end, dim_value);
-          PADDLE_ENFORCE_GT(end, start, "end should greater than start");
-          out_dims[axes[i]] = end - start;
-        }
-      }
-    }
-    // generate new shape
-    if (decrease_axis.size() > 0) {
-      std::vector<int> new_out_shape;
-      for (size_t i = 0; i < decrease_axis.size(); ++i) {
-        if (ctx->IsRuntime() && infer_flags[i] != -1) {
-          PADDLE_ENFORCE_EQ(out_dims[decrease_axis[i]], 1,
-                            "decrease dim should be 1");
-        }
-        out_dims[decrease_axis[i]] = 0;
-      }
-
-      for (int i = 0; i < out_dims.size(); ++i) {
-        if (out_dims[i] != 0) {
-          new_out_shape.push_back(out_dims[i]);
-        }
-      }
-      if (new_out_shape.size() == 0) {
-        new_out_shape.push_back(1);
-      }
-
-      out_dims = framework::make_ddim(new_out_shape);
-    }
-    ctx->SetOutputDim("Out", out_dims);
-    if (axes[0] != 0) {
-      ctx->ShareLoD("Input", /*->*/ "Out");
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   ctx.device_context());
-  }
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string &var_name, const Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const override {
-    if (var_name == "StartsTensor" || var_name == "EndsTensor") {
-      return expected_kernel_type;
-    }
-    if (var_name == "StartsTensorList" || var_name == "EndsTensorList") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class SliceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input", "(Tensor) Tensor of data to extract slices from.");
-    AddInput("StartsTensor",
-             "(Tensor<int32>, optional) If provided, slice will use this."
-             "It has the highest priority of StartsTensor, StartsTensorList "
-             "and attr(starts).")
-        .AsDispensable();
-    AddInput("EndsTensor",
-             "(Tensor<int32>, optional) If provided, slice will use this."
-             "It has the highest priority of EndsTensor, EndsTensorList and "
-             "attr(ends).")
-        .AsDispensable();
-    AddInput(
-        "StartsTensorList",
-        "(vector<Tensor<int32>>, optional) If provided, slice will use this."
-        "The shape of the tensor in vector MUST BE [1]."
-        "It has higher priority compare with attr(starts).")
-        .AsDuplicable()
-        .AsDispensable();
-    AddInput(
-        "EndsTensorList",
-        "(vector<Tensor<int32>>, optional) If provided, slice will use this."
-        "The shape of the tensor in vector MUST BE [1]."
-        "It has higher priority compare with attr(ends).")
-        .AsDuplicable()
-        .AsDispensable();
-    AddOutput("Out", "Sliced data tensor.");
-    AddAttr<std::vector<int>>(
-        "axes",
-        "(list<int>) Axes that `starts` and `ends` apply to. It's optional."
-        "If not present, will be treated as [0, 1, ..., len(`starts`) - 1].");
-    AddAttr<std::vector<int>>(
-        "starts",
-        "(list<int>) Starting indices of corresponding axis in `axes`")
-        .SetDefault({});
-    AddAttr<std::vector<int>>(
-        "ends", "(list<int>) Ending indices of corresponding axis in `axes`.")
-        .SetDefault({});
-    AddAttr<std::vector<int>>(
-        "infer_flags", "(list<int>) Flags of inferring dims in attributes.")
-        .SetDefault({});
-    AddAttr<std::vector<int>>("decrease_axis", "(list<int>) decrease_axis")
-        .SetDefault({});
-    AddComment(R"DOC(
-Slice Operator.
-
-Produces a slice of the input tensor along multiple axes. Similar to numpy:
-https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
-Slice uses `axes`, `starts` and `ends` attributes to specify the start and
-end dimension for each axis in the list of axes, it uses this information
-to slice the input data tensor. If a negative value is passed for any of
-the start or end indices, it represents number of elements before the end
-of that dimension. If the value passed to start or end is larger than
-the n (the number of elements in this dimension), it represents n.
-For slicing to the end of a dimension with unknown size, it is recommended
-to pass in INT_MAX. The size of axes must be equal to starts\' and ends\'.
-Following examples will explain how slice works:
-
-.. code-block:: text
-
-    Case1:
-        Given:
-            data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-            axes = [0, 1]
-            starts = [1, 0]
-            ends = [2, 3]
-        Then:
-            result = [ [5, 6, 7], ]
-
-    Case2:
-        Given:
-            data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-            starts = [0, 1]
-            ends = [-1, 1000]
-        Then:
-            result = [ [2, 3, 4], ]
-)DOC");
-  }
-};
-
-class SliceOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, "Input should not be null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("Input");
-    auto x_grad_name = framework::GradVarName("Input");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string &var_name, const Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const override {
-    if (var_name == "StartsTensor" || var_name == "EndsTensor") {
-      return expected_kernel_type;
-    }
-    if (var_name == "StartsTensorList" || var_name == "EndsTensorList") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class SliceOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *bind = new framework::OpDesc();
-    bind->SetInput("Input", Input("Input"));
-    bind->SetInput("StartsTensor", Input("StartsTensor"));
-    bind->SetInput("EndsTensor", Input("EndsTensor"));
-    bind->SetInput("StartsTensorList", Input("StartsTensorList"));
-    bind->SetInput("EndsTensorList", Input("EndsTensorList"));
-    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    bind->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    bind->SetAttrMap(Attrs());
-    bind->SetType("slice_grad");
-    return std::unique_ptr<framework::OpDesc>(bind);
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SliceOpGradNoNeedBufferVarsInference,
-                                      "Input");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker,
-                  ops::SliceOpGradMaker);
-REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad,
-                  ops::SliceOpGradNoNeedBufferVarsInference);
-
-REGISTER_OP_CPU_KERNEL(
-    slice, ops::SliceKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SliceKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::SliceKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SliceKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    slice_grad, ops::SliceGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SliceGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::SliceGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SliceGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu
deleted file mode 100644
index 4db8b837869bcefacd230f7d6a767187985837b0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/slice_op.cu
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/device_vector.h>
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/slice_op.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <size_t D>
-__global__ void Padding(const paddle::platform::float16* d_out,
-                        const int* out_dims, const int* in_dims,
-                        const int* offsets, int64_t n,
-                        paddle::platform::float16* d_in) {
-  int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
-  if (out_idx < n) {
-    int64_t out_idx_tmp = out_idx;
-    int coords[D] = {0};
-    for (int i = D - 1; i >= 0; --i) {
-      coords[i] = out_idx_tmp % out_dims[i];
-      out_idx_tmp /= out_dims[i];
-      coords[i] += offsets[i];
-    }
-
-    int64_t in_idx = 0;
-    for (int i = 0; i < D; ++i) {
-      in_idx = in_idx * in_dims[i] + coords[i];
-    }
-
-    d_in[in_idx] = d_out[out_idx];
-  }
-}
-
-template <>
-class SliceGradKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>
-    : public framework::OpKernel<paddle::platform::float16> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
-    d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());
-
-    auto out_dims = d_out->dims();
-    auto in_dims = d_in->dims();
-    int rank = out_dims.size();
-    std::vector<int> offsets(rank, 0);
-    auto axes = ctx.Attr<std::vector<int>>("axes");
-    auto starts = ctx.Attr<std::vector<int>>("starts");
-
-    auto list_new_starts_tensor =
-        ctx.MultiInput<framework::Tensor>("StartsTensorList");
-
-    if (list_new_starts_tensor.size() > 0) {
-      starts = get_new_data_from_tensorlist(list_new_starts_tensor);
-    } else if (ctx.HasInput("StartsTensor")) {
-      auto* starts_tensor = ctx.Input<framework::Tensor>("StartsTensor");
-      starts = get_new_data_from_tensor(starts_tensor);
-    }
-
-    for (size_t i = 0; i < starts.size(); ++i) {
-      if (starts[i] < 0) {
-        starts[i] += in_dims[axes[i]];
-      }
-      offsets[axes[i]] = std::max(starts[i], 0);
-    }
-
-    math::SetConstant<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>
-        set_zero;
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::CUDADeviceContext>();
-    set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
-
-    int64_t numel = d_out->numel();
-    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
-    dim3 threads(PADDLE_CUDA_NUM_THREADS);
-    auto stream = ctx.cuda_device_context().stream();
-
-    auto out_shape = framework::vectorize<int>(out_dims);
-    thrust::device_vector<int> out_dims_vec(out_shape.begin(), out_shape.end());
-    auto in_shape = framework::vectorize<int>(in_dims);
-    thrust::device_vector<int> in_dims_vec(in_shape.begin(), in_shape.end());
-    thrust::device_vector<int> offsets_vec(offsets.begin(), offsets.end());
-    const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data());
-    const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data());
-    const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data());
-
-    switch (rank) {
-      case 1:
-        Padding<1><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 2:
-        Padding<2><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 3:
-        Padding<3><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 4:
-        Padding<4><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 5:
-        Padding<5><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 6:
-        Padding<6><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SliceKernel<paddle::platform::CUDADeviceContext, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    slice_grad,
-    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h
deleted file mode 100644
index 5f687fedf546f0548c421680151ef972be3dea19..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/slice_op.h
+++ /dev/null
@@ -1,350 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-
-inline std::vector<int> get_new_data_from_tensorlist(
-    const std::vector<const Tensor*>& list_new_data_tensor) {
-  // get tensor from
-  std::vector<int> vec_new_data;
-  for (size_t i = 0; i < list_new_data_tensor.size(); ++i) {
-    auto tensor = list_new_data_tensor[i];
-    PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}),
-                      "shape of dim tensor should be [1]");
-    if (platform::is_gpu_place(tensor->place())) {
-      framework::Tensor temp;
-      TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-      vec_new_data.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
-    } else {
-      vec_new_data.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
-    }
-  }
-  return vec_new_data;
-}
-inline std::vector<int> get_new_data_from_tensor(
-    const Tensor* new_data_tensor) {
-  std::vector<int> vec_new_data;
-  auto* new_data = new_data_tensor->data<int>();
-  framework::Tensor cpu_starts_tensor;
-  if (platform::is_gpu_place(new_data_tensor->place())) {
-    TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor);
-    new_data = cpu_starts_tensor.data<int>();
-  }
-  vec_new_data =
-      std::vector<int>(new_data, new_data + new_data_tensor->numel());
-  return vec_new_data;
-}
-
-template <typename DeviceContext, typename T>
-class SliceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    int rank = ctx.Input<framework::Tensor>("Input")->dims().size();
-    switch (rank) {
-      case 1:
-        SliceCompute<1>(ctx);
-        break;
-      case 2:
-        SliceCompute<2>(ctx);
-        break;
-      case 3:
-        SliceCompute<3>(ctx);
-        break;
-      case 4:
-        SliceCompute<4>(ctx);
-        break;
-      case 5:
-        SliceCompute<5>(ctx);
-        break;
-      case 6:
-        SliceCompute<6>(ctx);
-        break;
-    }
-  }
-
- private:
-  template <size_t D>
-  void SliceCompute(const framework::ExecutionContext& context) const {
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto in = context.Input<framework::Tensor>("Input");
-    auto out = context.Output<framework::Tensor>("Out");
-    auto out_dims = out->dims();
-    auto in_dims = in->dims();
-
-    auto axes = context.Attr<std::vector<int>>("axes");
-    auto starts = context.Attr<std::vector<int>>("starts");
-    auto ends = context.Attr<std::vector<int>>("ends");
-    auto decrease_axis = context.Attr<std::vector<int>>("decrease_axis");
-    auto infer_flags = context.Attr<std::vector<int>>("infer_flags");
-
-    auto list_new_ends_tensor =
-        context.MultiInput<framework::Tensor>("EndsTensorList");
-    auto list_new_starts_tensor =
-        context.MultiInput<framework::Tensor>("StartsTensorList");
-
-    bool need_infer = false;
-    if (context.HasInput("StartsTensor") || context.HasInput("EndsTensor")) {
-      need_infer = true;
-    }
-    if (list_new_starts_tensor.size() > 0 || list_new_ends_tensor.size() > 0) {
-      need_infer = true;
-    }
-
-    if (need_infer) {
-      if (context.HasInput("StartsTensor")) {
-        auto* starts_tensor = context.Input<framework::Tensor>("StartsTensor");
-        starts = get_new_data_from_tensor(starts_tensor);
-      } else if (list_new_starts_tensor.size() > 0) {
-        starts = get_new_data_from_tensorlist(list_new_starts_tensor);
-      }
-      PADDLE_ENFORCE_EQ(
-          starts.size(), axes.size(),
-          "The size of starts must be equal to the size of axes.");
-      if (context.HasInput("EndsTensor")) {
-        auto* ends_tensor = context.Input<framework::Tensor>("EndsTensor");
-        ends = get_new_data_from_tensor(ends_tensor);
-      } else if (list_new_ends_tensor.size() > 0) {
-        ends = get_new_data_from_tensorlist(list_new_ends_tensor);
-      }
-      PADDLE_ENFORCE_EQ(ends.size(), axes.size(),
-                        "The size of ends must be equal to the size of axes.");
-      out_dims = in_dims;
-      int dim_value, start, end;
-      for (size_t i = 0; i < axes.size(); ++i) {
-        dim_value = out_dims[axes[i]];
-        if (dim_value > 0) {
-          // when end = start+1 and start == -1
-          if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
-            auto ret =
-                std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
-            if (ret != decrease_axis.end()) {
-              ends[i] = 10000000;
-            }
-          }
-
-          start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
-          end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
-          start = std::max(start, 0);
-          end = std::max(end, 0);
-          end = std::min(end, dim_value);
-          PADDLE_ENFORCE_GT(end, start, "end should greater than start");
-          out_dims[axes[i]] = end - start;
-        }
-      }
-      out->Resize(out_dims);
-      // generate new shape
-      if (decrease_axis.size() > 0) {
-        std::vector<int> new_out_shape;
-        for (size_t i = 0; i < decrease_axis.size(); ++i) {
-          PADDLE_ENFORCE_EQ(out_dims[decrease_axis[i]], 1,
-                            "decrease dim should be 1");
-          out_dims[decrease_axis[i]] = 0;
-        }
-
-        for (int i = 0; i < out_dims.size(); ++i) {
-          if (out_dims[i] != 0) {
-            new_out_shape.push_back(out_dims[i]);
-          }
-        }
-        if (new_out_shape.size() == 0) {
-          new_out_shape.push_back(1);
-        }
-
-        out_dims = framework::make_ddim(new_out_shape);
-      }
-    }
-
-    // resize out_dims
-    if (decrease_axis.size() > 0) {
-      if (decrease_axis.size() == (size_t)in_dims.size()) {
-        std::vector<int> vec_origin_out_shape(decrease_axis.size(), 1);
-        out->Resize(framework::make_ddim(vec_origin_out_shape));
-      } else {
-        std::vector<int> vec_origin_out_shape(
-            out_dims.size() + decrease_axis.size(), -1);
-
-        for (size_t i = 0; i < decrease_axis.size(); ++i) {
-          vec_origin_out_shape[decrease_axis[i]] = 1;
-        }
-
-        int index = 0;
-        for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) {
-          if (vec_origin_out_shape[i] == -1) {
-            vec_origin_out_shape[i] = out_dims[index];
-            ++index;
-          }
-        }
-
-        out->Resize(framework::make_ddim(vec_origin_out_shape));
-      }
-    }
-
-    out->mutable_data<T>(context.GetPlace());
-
-    auto new_out_dims = out->dims();
-    auto offsets = Eigen::array<int, D>();
-    auto extents = Eigen::array<int, D>();
-    for (size_t i = 0; i < D; ++i) {
-      offsets[i] = 0;
-      extents[i] = new_out_dims[i];
-    }
-    int start;
-    for (size_t i = 0; i < axes.size(); ++i) {
-      start = starts[i];
-      if (start < 0) {
-        start = (start + in_dims[axes[i]]);
-      }
-      start = std::max(start, 0);
-      offsets[axes[i]] = start;
-    }
-    auto in_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *in);
-    auto out_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *out, new_out_dims);
-    out_t.device(place) = in_t.slice(offsets, extents);
-
-    out->Resize(out_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SliceGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    size_t rank = ctx.Input<framework::Tensor>("Input")->dims().size();
-    switch (rank) {
-      case 1:
-        SliceCompute<1>(ctx);
-        break;
-      case 2:
-        SliceCompute<2>(ctx);
-        break;
-      case 3:
-        SliceCompute<3>(ctx);
-        break;
-      case 4:
-        SliceCompute<4>(ctx);
-        break;
-      case 5:
-        SliceCompute<5>(ctx);
-        break;
-      case 6:
-        SliceCompute<6>(ctx);
-        break;
-    }
-  }
-
- private:
-  template <size_t D>
-  void SliceCompute(const framework::ExecutionContext& context) const {
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto* d_out =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_input =
-        context.Output<framework::Tensor>(framework::GradVarName("Input"));
-    d_input->mutable_data<T>(context.GetPlace());
-    auto out_dims = d_out->dims();
-    auto in_dims = d_input->dims();
-    auto axes = context.Attr<std::vector<int>>("axes");
-    auto starts = context.Attr<std::vector<int>>("starts");
-    auto ends = context.Attr<std::vector<int>>("ends");
-
-    auto list_new_ends_tensor =
-        context.MultiInput<framework::Tensor>("EndsTensorList");
-    auto list_new_starts_tensor =
-        context.MultiInput<framework::Tensor>("StartsTensorList");
-
-    if (list_new_starts_tensor.size() > 0) {
-      starts = get_new_data_from_tensorlist(list_new_starts_tensor);
-    } else if (context.HasInput("StartsTensor")) {
-      auto* starts_tensor = context.Input<framework::Tensor>("StartsTensor");
-      starts = get_new_data_from_tensor(starts_tensor);
-    }
-
-    if (list_new_ends_tensor.size() > 0) {
-      ends = get_new_data_from_tensorlist(list_new_ends_tensor);
-    } else if (context.HasInput("EndsTensor")) {
-      auto* ends_tensor = context.Input<framework::Tensor>("EndsTensor");
-      ends = get_new_data_from_tensor(ends_tensor);
-    }
-
-    auto decrease_axis = context.Attr<std::vector<int>>("decrease_axis");
-    if (decrease_axis.size() > 0) {
-      if (decrease_axis.size() == (size_t)in_dims.size()) {
-        // all dims decrease
-        std::vector<int> vec_origin_out_shape(decrease_axis.size(), 1);
-        out_dims = framework::make_ddim(vec_origin_out_shape);
-      } else {
-        std::vector<int> vec_origin_out_shape(
-            out_dims.size() + decrease_axis.size(), -1);
-
-        for (size_t i = 0; i < decrease_axis.size(); ++i) {
-          vec_origin_out_shape[decrease_axis[i]] = 1;
-        }
-
-        int index = 0;
-        for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) {
-          if (vec_origin_out_shape[i] == -1) {
-            vec_origin_out_shape[i] = out_dims[index];
-            ++index;
-          }
-        }
-
-        out_dims = framework::make_ddim(vec_origin_out_shape);
-      }
-    }
-
-    auto offsets = Eigen::array<int, D>();
-    auto extents = Eigen::array<int, D>();
-    for (size_t i = 0; i < D; ++i) {
-      offsets[i] = 0;
-      extents[i] = out_dims[i];
-    }
-    int start;
-    for (size_t i = 0; i < axes.size(); ++i) {
-      start = starts[i];
-      if (start < 0) {
-        start = (start + in_dims[axes[i]]);
-      }
-      start = std::max(start, 0);
-      offsets[axes[i]] = start;
-    }
-    Eigen::array<std::pair<int, int>, D> paddings;
-    for (size_t i = 0; i < paddings.size(); ++i) {
-      paddings[i].first = offsets[i];
-      paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i];
-    }
-    auto d_in_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *d_input);
-    auto d_out_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *d_out, out_dims);
-    d_in_t.device(place) = d_out_t.pad(paddings, 0);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
deleted file mode 100644
index 22b621248d69811898e418b5a0ae609319583e43..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ /dev/null
@@ -1,188 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/smooth_l1_loss_op.h"
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class SmoothL1LossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    bool check = true;
-    if ((!ctx->IsRuntime()) &&
-        (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) {
-      check = false;
-    }
-    if (check) {
-      PADDLE_ENFORCE_EQ(x_dims, y_dims);
-    }
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "The tensor rank of Input(X) should not be less than 2.");
-    if (ctx->HasInput("InsideWeight")) {
-      PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
-                     "If weights are provided, must specify both "
-                     "inside and outside weights.");
-      auto dims = ctx->GetInputDim("InsideWeight");
-      bool check = true;
-      if ((!ctx->IsRuntime()) &&
-          (framework::product(dims) <= 0 || framework::product(x_dims) <= 0)) {
-        check = false;
-      }
-      if (check) {
-        PADDLE_ENFORCE_EQ(dims, x_dims);
-      }
-
-      dims = ctx->GetInputDim("OutsideWeight");
-      check = true;
-      if ((!ctx->IsRuntime()) &&
-          (framework::product(dims) <= 0 || framework::product(x_dims) <= 0)) {
-        check = false;
-      }
-      if (check) {
-        PADDLE_ENFORCE_EQ(dims, x_dims);
-      }
-    }
-
-    ctx->SetOutputDim("Diff", x_dims);
-    // loss is a two-rank tensor
-    ctx->SetOutputDim("Out", {x_dims[0], 1});
-  }
-};
-
-class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
-             "The input value of smooth l1 loss op with shape "
-             "[batch_size, dim1, ..., dimN].");
-    AddInput("Y",
-             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
-             "The target value of smooth l1 loss op with same shape as X.");
-    AddInput("InsideWeight",
-             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
-             "This input is optional and should have same shape with X. "
-             "If provided, the result of (X - Y) will be multiplied "
-             "by this tensor element by element.")
-        .AsDispensable();
-    AddInput("OutsideWeight",
-             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
-             "This input is optional and should have same shape with X. "
-             "If provided, the out smooth l1 loss will be multiplied by this "
-             "tensor element by element.")
-        .AsDispensable();
-    AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
-        .AsIntermediate();
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>) A tensor with rank be 2. "
-              "The output smooth l1 loss with shape [batch_size, 1].");
-    AddAttr<float>("sigma",
-                   "Hyper parameter of smooth l1 loss op."
-                   "A float scalar with default value 3.0.")
-        .SetDefault(1.0);
-    AddComment(R"DOC(
-Smooth L1 Loss Operator.
-
-This operator computes the smooth l1 loss for X and Y.
-The operator takes the first dimension of X and Y as batch size.
-For each instance, it computes the smooth l1 loss element by element first
-and then sums all the losses. So the shape of Out is [batch_size, 1].
-
-The equation is:
-$$
-Out_{\sigma}(X, Y)_i = \begin{cases}
-0.5 * (\sigma * (X_i - Y_i)) ^ 2
-\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\
-\frac{|X_i - Y_i| - 0.5}{{\sigma}^2},
-\quad otherwise
-\end{cases}
-$$
-
-In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
-element of Out, X and Y.
-
-)DOC");
-  }
-};
-
-class SmoothL1LossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim("Diff");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    PADDLE_ENFORCE_GE(out_dims.size(), 2,
-                      "The tensor rank of Input(Out@Grad) should be 2.");
-    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[0], in_dims[0],
-                                 "The 1st dimension of Input(Out@Grad) must be "
-                                 "same as input.");
-    PADDLE_INFERSHAPE_ENFORCE_EQ(
-        ctx, out_dims[1], 1, "The 2nd dimension of Input(Out@Grad) must be 1.");
-
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, in_dims);
-    }
-    if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, in_dims);
-    }
-  }
-};
-
-class SmoothL1LossGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType("smooth_l1_loss_grad");
-    op->SetInput("InsideWeight", Input("InsideWeight"));
-    op->SetInput("OutsideWeight", Input("OutsideWeight"));
-    op->SetInput("Diff", Output("Diff"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
-                  ops::SmoothL1LossGradMaker);
-REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
-REGISTER_OP_CPU_KERNEL(
-    smooth_l1_loss,
-    ops::SmoothL1LossKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    smooth_l1_loss_grad,
-    ops::SmoothL1LossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu
deleted file mode 100644
index e5df479090fabe926f65f58e2300e3ee2027e54d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/smooth_l1_loss_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/smooth_l1_loss_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    smooth_l1_loss,
-    ops::SmoothL1LossKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    smooth_l1_loss_grad,
-    ops::SmoothL1LossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h
deleted file mode 100644
index efe3afba18e8f3368f2e21f91adc5aa935bf713a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/smooth_l1_loss_op.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-struct SmoothL1LossForward {
-  HOSTDEVICE SmoothL1LossForward(const T& sigma2) : sigma2(sigma2) {}
-
-  HOSTDEVICE T operator()(const T& val) const {
-    T abs_val = std::abs(val);
-    if (abs_val < 1.0 / sigma2) {
-      return 0.5 * val * val * sigma2;
-    } else {
-      return abs_val - 0.5 / sigma2;
-    }
-  }
-
-  T sigma2;
-};
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class SmoothL1LossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("X");
-    auto* in1 = context.Input<Tensor>("Y");
-    auto* in2 = context.Input<Tensor>("InsideWeight");
-    auto* in3 = context.Input<Tensor>("OutsideWeight");
-    auto* out0 = context.Output<Tensor>("Diff");
-    auto* out1 = context.Output<Tensor>("Out");
-
-    out0->mutable_data<T>(context.GetPlace());
-    out1->mutable_data<T>(context.GetPlace());
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-
-    auto sigma = static_cast<T>(context.Attr<AttrType>("sigma"));
-    T sigma2 = sigma * sigma;
-    bool has_weight = (in2 != nullptr) && (in3 != nullptr);
-
-    auto x = EigenVector<T>::Flatten(*in0);
-    auto y = EigenVector<T>::Flatten(*in1);
-    auto diff = EigenVector<T>::Flatten(*out0);
-
-    diff.device(*place) = x - y;
-    // multiply inside weight
-    if (has_weight) {
-      auto inside_weight = EigenVector<T>::Flatten(*in2);
-      // cache diff, reused in bp
-      diff.device(*place) = diff * inside_weight;
-    }
-
-    auto in_counts = in0->numel();
-    Tensor ptensor_errors;
-    ptensor_errors.mutable_data<T>({static_cast<int>(in_counts)},
-                                   context.GetPlace());
-    auto errors = EigenVector<T>::Flatten(ptensor_errors);
-    // apply smooth l1 forward
-    errors.device(*place) = diff.unaryExpr(SmoothL1LossForward<T>(sigma2));
-
-    // multiply outside weight
-    if (has_weight) {
-      auto outside_weight = EigenVector<T>::Flatten(*in3);
-      errors.device(*place) = errors * outside_weight;
-    }
-    auto loss = EigenVector<T>::Flatten(*out1);
-    // first dimension of 'X' is the number of samples
-    auto mat_dims =
-        framework::make_ddim({static_cast<int>(in0->dims()[0]),
-                              static_cast<int>(in_counts / in0->dims()[0])});
-    auto errors_mat_view = EigenMatrix<T>::From(ptensor_errors, mat_dims);
-    loss.device(*place) = errors_mat_view.sum(Eigen::array<int, 1>({{1}}));
-  }
-};
-
-template <typename T>
-struct SmoothL1LossBackward {
-  HOSTDEVICE SmoothL1LossBackward(const T& sigma2) : sigma2(sigma2) {}
-
-  HOSTDEVICE T operator()(const T& val) const {
-    T abs_val = std::abs(val);
-    if (abs_val < 1.0 / sigma2) {
-      return sigma2 * val;
-    } else {
-      return (0 < val) - (val < 0);
-    }
-  }
-
-  T sigma2;
-};
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class SmoothL1LossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("InsideWeight");
-    auto* in1 = context.Input<Tensor>("OutsideWeight");
-    auto* in2 = context.Input<Tensor>("Diff");
-    auto* og = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto sigma = static_cast<T>(context.Attr<AttrType>("sigma"));
-    T sigma2 = sigma * sigma;
-    bool has_weight = (in0 != nullptr) && (in1 != nullptr);
-
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-
-    auto in_dims = in2->dims();
-    auto counts = in2->numel();
-    auto cols = counts / in_dims[0];
-    auto mat_dims = framework::make_ddim(
-        {static_cast<int>(in_dims[0]), static_cast<int>(cols)});
-
-    Tensor ptensor_diff;
-    ptensor_diff.mutable_data<T>({static_cast<int>(counts)},
-                                 context.GetPlace());
-    auto diff = EigenVector<T>::Flatten(ptensor_diff);
-    // apply smooth l1 backwoard
-    diff.device(*place) = EigenVector<T>::Flatten(*in2).unaryExpr(
-        SmoothL1LossBackward<T>(sigma2));
-
-    // compute weights
-    Tensor ptensor_weights;
-    ptensor_weights.mutable_data<T>(mat_dims, context.GetPlace());
-    auto weights = EigenMatrix<T>::From(ptensor_weights);
-    // initialize to 1.0
-    weights.device(*place) = weights.constant(static_cast<T>(1.0));
-    if (has_weight) {
-      auto inside_weight = EigenMatrix<T>::From(*in0, mat_dims);
-      auto outside_weight = EigenMatrix<T>::From(*in1, mat_dims);
-      weights.device(*place) = inside_weight * outside_weight;
-    }
-
-    // compute gradients
-    auto out_grad = EigenMatrix<T>::From(*og);
-    auto diff_mat_view = EigenMatrix<T>::From(ptensor_diff, mat_dims);
-    auto gradients = out_grad.broadcast(
-                         Eigen::array<int, 2>({{1, static_cast<int>(cols)}})) *
-                     weights * diff_mat_view;
-
-    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* out1 = context.Output<Tensor>(framework::GradVarName("Y"));
-
-    if (out0) {
-      out0->mutable_data<T>(context.GetPlace());
-      auto x_grad = EigenMatrix<T>::From(*out0, mat_dims);
-      x_grad.device(*place) = gradients;
-    }
-
-    if (out1) {
-      out1->mutable_data<T>(context.GetPlace());
-      auto y_grad = EigenMatrix<T>::From(*out1, mat_dims);
-      y_grad.device(*place) = -1 * gradients;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
deleted file mode 100644
index ad3e5543f10ae05865565110ba2231c897c205b8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<Tensor>("X");
-    auto* Out = context.Output<Tensor>("Out");
-
-    // allocate memory on device.
-    Out->mutable_data<T>(context.GetPlace());
-
-    auto dims = X->dims();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
-    framework::LoDTensor flattened_x;
-    framework::LoDTensor flattened_out;
-    flattened_x.ShareDataWith(*X).Resize(flattened_dims);
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
-
-    math::SoftmaxCUDNNFunctor<T>()(
-        context.template device_context<platform::CUDADeviceContext>(),
-        &flattened_x, &flattened_out);
-  }
-};
-
-template <typename T>
-class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* Out = context.Input<Tensor>("Out");
-    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
-
-    // allocate memory on device.
-    dX->mutable_data<T>(context.GetPlace());
-
-    auto dims = Out->dims();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
-    framework::LoDTensor flattened_out;
-    framework::LoDTensor flattened_d_out;
-    framework::LoDTensor flattened_d_x;
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
-    flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
-    flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
-
-    math::SoftmaxGradCUDNNFunctor<T>()(
-        context.template device_context<platform::CUDADeviceContext>(),
-        &flattened_out, &flattened_d_out, &flattened_d_x);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
-                   ops::SoftmaxCUDNNKernel<float>,
-                   ops::SoftmaxCUDNNKernel<double>,
-                   ops::SoftmaxCUDNNKernel<plat::float16>);
-REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
-                   ops::SoftmaxGradCUDNNKernel<float>,
-                   ops::SoftmaxGradCUDNNKernel<double>,
-                   ops::SoftmaxGradCUDNNKernel<plat::float16>);
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
deleted file mode 100644
index 9d73a19197c29fae29728cd6ab770bc0cc7a3ab1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/softmax_op.cc
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/softmax_op.h"
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class SoftmaxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SoftmaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SoftmaxOp should not be null.");
-
-    auto dim_x = ctx->GetInputDim("X");
-    auto rank_x = dim_x.size();
-    auto axis = ctx->Attrs().Get<int>("axis");
-    PADDLE_ENFORCE(axis >= -rank_x && axis < rank_x,
-                   "Attr(axis) value should be in range [-R, R-1], "
-                   "R is the rank of Input(X).");
-
-    auto use_cudnn = ctx->Attrs().Get<bool>("use_cudnn");
-    if (axis != rank_x - 1 && axis != -1) {
-      PADDLE_ENFORCE(!use_cudnn, "CUDNN kernel only support axis as -1.");
-    }
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    // choose cudnn kernel if the runtime supported.
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-
-#ifdef PADDLE_WITH_CUDA
-    if (platform::CanCUDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kCUDNN;
-    }
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
-    }
-#endif
-
-    auto input_data_type = ctx.Input<Tensor>("X")->type();
-    if (input_data_type == framework::proto::VarType::FP16) {
-      PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                     "float16 can only be used on GPU place");
-    }
-
-    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
-                                   library_);
-  }
-};
-
-class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of softmax, "
-             "whose dimension :attr:`axis` is the input_feature_dimensions.");
-    AddOutput("Out", "The normalized values with the same shape as X.");
-    AddAttr<int>("axis",
-                 "The dimension index of Input(x) to perform softmax,"
-                 "default -1 for last dimension")
-        .SetDefault(-1);
-    AddAttr<bool>(
-        "use_cudnn",
-        "(bool, default false) Only used in cudnn kernel, need install cudnn")
-        .SetDefault(false);
-    AddAttr<std::string>(
-        "data_format",
-        "(string, default NCHW) Only used in "
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\". Specify the data format of the output data, "
-        "the input will be transformed automatically. ")
-        .SetDefault("AnyLayout");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Softmax Operator.
-
-The input of the softmax operator is a tensor of any rank. The output tensor
-has the same shape as the input.
-
-The dimension :attr:`axis` of the input tensor will be permuted to the last.
-Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
-second dimension(row length) is as same as the dimension :attr:`axis` of the input
-tensor, and the first dimension(column length) is the product of all other
-dimensions of the input tensor. For each row of the matrix, the softmax operator
-squashes the K-dimensional(K is the width of the matrix, which is also the size
-of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
-K-dimensional vector of real values in the range [0, 1] that add up to 1.
-It computes the exponential of the given dimension and the sum of exponential
-values of all the other dimensions in the K-dimensional vector input.
-Then the ratio of the exponential of the given dimension and the sum of
-exponential values of all the other dimensions is the output of the softmax
-operator.
-
-For each row $i$ and each column $j$ in the matrix, we have:
-    $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$
-
-)DOC");
-  }
-};
-
-class SoftmaxOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
-      const override {
-    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
-  }
-};
-
-class SoftmaxOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Out"),
-                      ctx->GetInputDim(framework::GradVarName("Out")),
-                      "Input(Out) and its gradients should have a same shape.");
-
-    ctx->SetOutputDim(framework::GradVarName("X"),
-                      ctx->GetInputDim(framework::GradVarName("Out")));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    // choose cudnn kernel if the runtime supported.
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-
-#ifdef PADDLE_WITH_CUDA
-    if (platform::CanCUDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kCUDNN;
-    }
-#endif
-#ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
-    }
-#endif
-    auto input_data_type =
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type();
-    if (input_data_type == framework::proto::VarType::FP16) {
-      PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                     "float16 can only be used on GPU place");
-    }
-
-    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
-                                   library_);
-  }
-};
-
-class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op = new framework::OpDesc();
-    op->SetType("softmax_grad");
-
-    op->SetInput("Out", Output("Out"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-
-    op->SetAttrMap(Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"});
-
-class SoftmaxGradInplaceInferer final : public framework::InplaceOpInference {
- public:
-  using framework::InplaceOpInference::InplaceOpInference;
-
-  std::unordered_map<std::string, std::string> operator()(
-      const framework::OpDesc& op_desc, bool use_cuda) const final {
-    if (use_cuda) {
-      return {{"Out", framework::GradVarName("X")}};
-    } else {
-      // NOTE(zjl): AVX implementation of SoftmaxGrad does not support in-place
-      return {};
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
-                  ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker,
-                  ops::SoftmaxInplaceInferer);
-REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad,
-                  ops::SoftmaxGradInplaceInferer);
-REGISTER_OP_CPU_KERNEL(
-    softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    softmax_grad,
-    ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc
deleted file mode 100644
index 19359b7eef5126d84f0707d39095a74ae4561186..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/softmax_op.cu.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/softmax_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    softmax, ops::SoftmaxKernel<plat::CUDADeviceContext, float>,
-    ops::SoftmaxKernel<plat::CUDADeviceContext, double>,
-    ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
-    ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>,
-    ops::SoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h
deleted file mode 100644
index a964c3b57a635b3e5f0a4c163e3b3c13d465102b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/softmax_op.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/softmax.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-
-static inline int CanonicalAxis(const int axis, const int rank) {
-  if (axis < 0) {
-    return axis + rank;
-  }
-  return axis;
-}
-
-static inline int SizeToAxis(const int axis, DDim dims) {
-  int size = 1;
-  for (int i = 0; i < axis; i++) {
-    size *= dims[i];
-  }
-  return size;
-}
-
-static inline int SizeFromAxis(const int axis, DDim dims) {
-  int size = 1;
-  for (int i = axis; i < dims.size(); i++) {
-    size *= dims[i];
-  }
-  return size;
-}
-
-template <typename DeviceContext, typename T>
-class SoftmaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<Tensor>("X");
-    auto* Out = context.Output<Tensor>("Out");
-    const int rank = X->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-    int axis_dim = X->dims()[axis];
-
-    // allocate memory on device.
-    Out->mutable_data<T>(context.GetPlace());
-
-    const int n = SizeToAxis(axis, X->dims());
-    const int d = SizeFromAxis(axis, X->dims());
-    Tensor X_2d, Out_2d;
-    X_2d.ShareDataWith(*X).Resize({n, d});
-    Out_2d.ShareDataWith(*Out).Resize({n, d});
-
-#ifdef PADDLE_ON_INFERENCE
-    math::SoftmaxFunctor<DeviceContext, T, true>()(
-        context.template device_context<DeviceContext>(), axis_dim, &X_2d,
-        &Out_2d);
-#else
-    math::SoftmaxFunctor<DeviceContext, T, false>()(
-        context.template device_context<DeviceContext>(), axis_dim, &X_2d,
-        &Out_2d);
-#endif
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SoftmaxGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* Out = context.Input<Tensor>("Out");
-    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
-    const int rank = dX->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-    int axis_dim = dX->dims()[axis];
-
-    // allocate memory on device.
-    dX->mutable_data<T>(context.GetPlace());
-
-    const int n = SizeToAxis(axis, dX->dims());
-    const int d = SizeFromAxis(axis, dX->dims());
-    Tensor dX_2d, Out_2d, dOut_2d;
-    dX_2d.ShareDataWith(*dX).Resize({n, d});
-    Out_2d.ShareDataWith(*Out).Resize({n, d});
-    dOut_2d.ShareDataWith(*dOut).Resize({n, d});
-
-    math::SoftmaxGradFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), axis_dim, &Out_2d,
-        &dOut_2d, &dX_2d);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
deleted file mode 100644
index 8cde72921cb10bd6cbd7522e32bc5fafcaf46bb9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ /dev/null
@@ -1,280 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class SoftmaxWithCrossEntropyOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Logits",
-             "(Tensor, default: Tensor<float>), The input tensor of unscaled "
-             "log probabilities, whose dimension :attr:`axis` should be scaled "
-             "by softmax.");
-    AddInput(
-        "Label",
-        "(Tensor) The input tesnor of groud truth label. If :attr:`soft_label` "
-        "is set to false, Label is a Tensor<int64> in same shape with "
-        "Input(Logits) except the shape in dimension :attr:`axis` as 1. If "
-        "soft_label is set to true, Label is a Tensor<float/double> in same "
-        "shape with Input(Logits).");
-    AddOutput(
-        "Softmax",
-        "(Tensor, default: Tensor<float>), A tensor in same shape with "
-        "Input(Logits). "
-        "The outputs value of softmax activation by given the input batch, "
-        "which will be used in backward calculation.")
-        .AsIntermediate();
-    AddOutput("Loss",
-              "(Tensor, default: Tensor<float>), A tensor in same shape with "
-              "Input(Logits) "
-              "except the shape in dimension :attr:`axis` as 1. The cross "
-              "entropy loss.");
-    AddAttr<bool>(
-        "soft_label",
-        "(bool, default: false), A flag to indicate whether to interpretate "
-        "the given labels as soft labels.")
-        .SetDefault(false);
-    AddAttr<bool>(
-        "numeric_stable_mode",
-        "(bool, default: true), A flag to indicate whether to use more "
-        "numerically stable algorithm. This flag is only valid when "
-        "soft_label is false and GPU is used.")
-        .SetDefault(true);
-    AddAttr<int>(
-        "ignore_index",
-        "(int, default -100), Specifies a target value that is ignored and"
-        "does not contribute to the input gradient. Only valid if soft_label"
-        "is set to False")
-        .SetDefault(-100);
-    AddAttr<int>("axis",
-                 "The dimension index of Input(Logits) to perform softmax,"
-                 "default -1 for last dimension")
-        .SetDefault(-1);
-    AddComment(R"DOC(
-Softmax With Cross Entropy Operator.
-
-Cross entropy loss with softmax is used as the output layer extensively. This
-operator computes the softmax normalized values for each row of the input
-tensor, after which cross-entropy loss is computed. This provides a more
-numerically stable gradient.
-
-Because this operator performs a softmax on logits internally, it expects
-unscaled logits. This operator should not be used with the output of
-softmax operator since that would produce incorrect results.
-
-When the attribute soft_label is set false, this operators expects mutually
-exclusive hard labels, each sample in a batch is in exactly one class with a
-probability of 1.0. Each sample in the batch will have a single label.
-
-The equation is as follows:
-
-1) Hard label (one-hot label, so every sample has exactly one class)
-
-$$Loss_j =  -\text{Logit}_{Label_j} +
-\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right),
-j = 1,..., K$$
-
-2) Soft label (each sample can have a distribution over all classes)
-
-$$Loss_j =  -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i -
-\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right),
-j = 1,...,K$$
-
-)DOC");
-  }
-};
-
-class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Logits"),
-                   "Input(Logits) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Softmax"),
-                   "Output(Softmax) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) should be not null.");
-
-    auto axis = ctx->Attrs().Get<int>("axis");
-    auto logits_dims = ctx->GetInputDim("Logits");
-    auto labels_dims = ctx->GetInputDim("Label");
-    auto logits_rank = logits_dims.size();
-    PADDLE_ENFORCE(axis >= -logits_rank && axis < logits_rank,
-                   "Attr(axis) value should be in range [-R, R-1], "
-                   "R is the rank of Input(Logits).");
-
-    axis = CanonicalAxis(axis, logits_rank);
-    for (int i = 0; i < logits_rank; i++) {
-      if (i != axis) {
-        if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) {
-          PADDLE_ENFORCE_EQ(
-              logits_dims[i], labels_dims[i],
-              "Input(Logits) and Input(Label) should in same shape in "
-              "dimensions except axis.");
-        }
-      }
-    }
-
-    auto numeric_stable_mode = ctx->Attrs().Get<bool>("numeric_stable_mode");
-    if (axis != logits_rank - 1) {
-      PADDLE_ENFORCE(
-          numeric_stable_mode,
-          "Attr(axis) can only be -1 when not in numeric_stable_mode.");
-    }
-
-    bool soft_label = ctx->Attrs().Get<bool>("soft_label");
-    if (soft_label) {
-      if (ctx->IsRuntime() ||
-          (logits_dims[axis] > 0 && labels_dims[axis] > 0)) {
-        PADDLE_ENFORCE_EQ(logits_dims[axis], labels_dims[axis],
-                          "If Attr(soft_label) == true, the axis dimension of "
-                          "Input(X) and Input(Label) should be equal.");
-      }
-    } else {
-      if (ctx->IsRuntime() || labels_dims[axis] > 0) {
-        PADDLE_ENFORCE_EQ(labels_dims[axis], 1UL,
-                          "If Attr(soft_label) == false, the axis dimension of "
-                          "Input(Label) should be 1.");
-      }
-    }
-
-    ctx->SetOutputDim("Softmax", logits_dims);
-
-    logits_dims[axis] = 1;
-    ctx->SetOutputDim("Loss", logits_dims);
-
-    ctx->ShareLoD("Logits", /*->*/ "Softmax");
-    ctx->ShareLoD("Logits", /*->*/ "Loss");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Logits")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
-                   "Input(Loss@Grad) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Softmax"),
-                   "Input(Softmax) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
-                   "Output(Logits@Grad) should be not null.");
-
-    auto axis = ctx->Attrs().Get<int>("axis");
-    auto softmax_dims = ctx->GetInputDim("Softmax");
-    auto labels_dims = ctx->GetInputDim("Label");
-    auto softmax_rank = softmax_dims.size();
-    PADDLE_ENFORCE(axis >= -softmax_rank && axis < softmax_rank,
-                   "Attr(axis) value should be in range [-R, R-1], "
-                   "R is the rank of Input(Logits).");
-
-    axis = CanonicalAxis(axis, softmax_rank);
-    for (int i = 0; i < softmax_rank; i++) {
-      if (i != axis) {
-        if (ctx->IsRuntime() || (softmax_dims[i] > 0 && labels_dims[i] > 0)) {
-          PADDLE_ENFORCE_EQ(
-              softmax_dims[i], labels_dims[i],
-              "Input(Logits) and Input(Label) should in same shape in "
-              "dimensions except axis.");
-        }
-      }
-    }
-
-    bool soft_label = ctx->Attrs().Get<bool>("soft_label");
-    if (soft_label) {
-      if (ctx->IsRuntime() ||
-          (softmax_dims[axis] > 0 && labels_dims[axis] > 0)) {
-        PADDLE_ENFORCE_EQ(softmax_dims[axis], labels_dims[axis],
-                          "If Attr(soft_label) == true, the axis dimension of "
-                          "Input(X) and Input(Label) should be equal.");
-      }
-    } else {
-      if (ctx->IsRuntime() || labels_dims[axis] > 0) {
-        PADDLE_ENFORCE_EQ(labels_dims[axis], 1UL,
-                          "If Attr(soft_label) == false, the axis dimension of "
-                          "Input(Label) should be 1.");
-      }
-    }
-
-    ctx->SetOutputDim(framework::GradVarName("Logits"),
-                      ctx->GetInputDim("Softmax"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Loss"))->type(),
-        ctx.device_context());
-  }
-};
-
-class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* grad_op = new framework::OpDesc();
-    grad_op->SetType("softmax_with_cross_entropy_grad");
-    grad_op->SetInput("Label", Input("Label"));
-    grad_op->SetInput("Softmax", Output("Softmax"));
-    grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
-    grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyInplaceInference,
-                           {"Logits", "Softmax"});
-
-DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyGradInplaceInference,
-                           {"Softmax", framework::GradVarName("Logits")});
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
-                  ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker,
-                  ops::SoftmaxWithCrossEntropyInplaceInference);
-REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
-                  ops::SoftmaxWithCrossEntropyOpGrad,
-                  ops::SoftmaxWithCrossEntropyGradInplaceInference);
-REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyKernel<float>,
-                       ops::SoftmaxWithCrossEntropyKernel<double>);
-REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradKernel<float>,
-                       ops::SoftmaxWithCrossEntropyGradKernel<double>);
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
deleted file mode 100644
index 12b64052a7cd63be5bcd6be7c313111fb0727b5f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ /dev/null
@@ -1,520 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <cub/cub.cuh>
-#include "paddle/fluid/operators/math/cross_entropy.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-namespace {
-template <typename T>
-__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
-                                 const int n, const int d, const int remain,
-                                 const int ignore_index) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n * remain;
-       i += blockDim.x * gridDim.x) {
-    int idx_n = i / remain;
-    int idx_remain = i % remain;
-    int idx = idx_n * d + labels[i] * remain + idx_remain;
-    logit_grad[idx] -=
-        ignore_index == labels[i] ? static_cast<T>(0.) : static_cast<T>(1.);
-  }
-}
-
-template <typename T>
-__global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
-                      const int d, const int remain) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    int idx_n = i / d;
-    int idx_remain = i % remain;
-    logit_grad[i] *= loss_grad[idx_n * remain + idx_remain];
-  }
-}
-
-template <typename T>
-__global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
-                                               const T* loss_grad,
-                                               const T* labels, const int n,
-                                               const int d, const int remain) {
-  int ids = blockIdx.x * blockDim.x + threadIdx.x;
-  if (ids < n * d) {
-    int idx_n = ids / d;
-    int idx_remain = ids % remain;
-    int idx_loss = idx_n * remain + idx_remain;
-    logit_grad[ids] = loss_grad[idx_loss] * (logit_grad[ids] - labels[ids]);
-  }
-}
-
-}  // namespace
-
-static __device__ __forceinline__ platform::float16 exp_on_device(
-    platform::float16 x) {
-  return ::Eigen::numext::exp(x);
-}
-static __device__ __forceinline__ float exp_on_device(float x) {
-  return expf(x);
-}
-static __device__ __forceinline__ double exp_on_device(double x) {
-  return exp(x);
-}
-static __device__ __forceinline__ platform::float16 log_on_device(
-    platform::float16 x) {
-  return math::TolerableValue<platform::float16>()(::Eigen::numext::log(x));
-}
-static __device__ __forceinline__ float log_on_device(float x) {
-  return math::TolerableValue<float>()(logf(x));
-}
-static __device__ __forceinline__ double log_on_device(double x) {
-  return math::TolerableValue<double>()(log(x));
-}
-
-/** In the following codes, 3 CUDA kernels are implemented to calculate softmax
- * and loss **/
-/*
-  Supposing the x is `logits` and y is `labels`, the equations are as
-followings:
-  cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
-        = \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
-        = \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
-        = \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
-        = \sum_{j}(-y_i_j * tmp_i_j)
-  softmax_i_j = e^{tmp_i_j}
-where:
-  max_i = \max_{j}{x_i_j}
-  logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
-  tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
-Therefore, the calculation can be separated into 3 steps:
-Step 1: row-wise operation to calculate max_i
-Step 2: row-wise operation to calculate logDiffMaxSum_i
-Step 3: caculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
-To save memory, we can share memory among max_i, logDiffMaxSum_i and
-cross\_entropy_i.
-In this way, the 3 steps should be changed to:
-Step 1 (RowReductionForMax): row-wise operation to calculate max_i
-Step 2 (RowReductionForDiffMaxSum): calculate immediate result of softmax'_i_j =
-x_i_j - max_i, and row-wise operation to calculate logDiffMaxSum_i
-Step 3 (RowReductionForSoftmaxAndCrossEntropy): calculate tmp_i_j = softmax'_i_j
-- logDiffMaxSum_i, and finally get softmax_i_j and cross\_entropy_i
-*/
-
-// There are 3 kinds of reduce algorithms in cub:
-// BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
-// BLOCK_REDUCE_RAKING
-// BLOCK_REDUCE_WARP_REDUCTIONS (default)
-template <typename T, int BlockDim>
-using BlockReduce =
-    cub::BlockReduce<T, BlockDim /*, cub::BLOCK_REDUCE_WARP_REDUCTIONS*/>;
-
-template <typename T, int BlockDim>
-using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
-
-// Make sure that BlockDim <= axis_dim
-// This kernel is used to calculate the max element of each row
-template <typename T, int BlockDim>
-static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
-                                          int d, int axis_dim) {
-  __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
-
-  // logits_data view as [n, axis_dim, remain]
-  // max_data view as [n, 1, remain]
-  // blockDim = n * remain, split blockIdx to idx_n and idx_remain
-  int remain = d / axis_dim;
-  int idx_n = blockIdx.x / remain;
-  int idx_remain = blockIdx.x % remain;
-  int beg_idx = idx_n * d + threadIdx.x * remain + idx_remain;
-  int end_idx = (idx_n + 1) * d;
-
-  int step = BlockDim * remain;
-  T cur_max = logits_data[beg_idx];
-  beg_idx += step;
-  while (beg_idx < end_idx) {
-    if (cur_max < logits_data[beg_idx]) {
-      cur_max = logits_data[beg_idx];
-    }
-    beg_idx += step;
-  }
-
-  cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
-
-  if (threadIdx.x == 0) {
-    max_data[blockIdx.x] =
-        cur_max < static_cast<T>(-64) ? static_cast<T>(-64) : cur_max;
-  }
-}
-
-// Make sure that BlockDim <= axis_dim
-template <typename T, int BlockDim, bool CalculateLogSoftmax = false>
-static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
-                                                 T* max_data, T* softmax, int d,
-                                                 int axis_dim) {
-  __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
-
-  // logits, softmax data view as [n, axis_dim, remain]
-  // max_data view as [n, 1, remain]
-  // blockDim = n * remain, split blockIdx to idx_n and idx_remain
-  int remain = d / axis_dim;
-  int idx_n = blockIdx.x / remain;
-  int idx_remain = blockIdx.x % remain;
-  int beg_idx = idx_n * d + threadIdx.x * remain + idx_remain;
-  int end_idx = (idx_n + 1) * d;
-
-  auto block_max = max_data[blockIdx.x];
-  int step = BlockDim * remain;
-
-  softmax[beg_idx] = logits_data[beg_idx] - block_max;
-  T diff_max_sum = exp_on_device(softmax[beg_idx]);
-  auto idx = beg_idx + step;
-  while (idx < end_idx) {
-    softmax[idx] = logits_data[idx] - block_max;
-    diff_max_sum += exp_on_device(softmax[idx]);
-    idx += step;
-  }
-
-  diff_max_sum =
-      BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
-  if (threadIdx.x == 0) max_data[blockIdx.x] = log_on_device(diff_max_sum);
-
-  if (!CalculateLogSoftmax) return;
-  __syncthreads();
-  diff_max_sum = max_data[blockIdx.x];
-  softmax[beg_idx] -= diff_max_sum;
-  beg_idx += step;
-  while (beg_idx < end_idx) {
-    softmax[beg_idx] -= diff_max_sum;
-    beg_idx += step;
-  }
-  if (threadIdx.x == 0) max_data[blockIdx.x] = 0;
-}
-
-// Make sure that BlockDim <= axis_dim
-template <typename T, int BlockDim>
-static __global__ void RowReductionForSoftmaxAndCrossEntropy(
-    const T* logits_data, const T* labels_data, T* loss_data, T* softmax, int d,
-    int axis_dim) {
-  __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
-
-  // logits, softmax, labels data view as [n, axis_dim, remain]
-  // loss_data view as [n, 1, remain]
-  // blockDim = n * remain, split blockIdx to idx_n and idx_remain
-  int remain = d / axis_dim;
-  int idx_n = blockIdx.x / remain;
-  int idx_remain = blockIdx.x % remain;
-  int beg_idx = idx_n * d + threadIdx.x * remain + idx_remain;
-  int end_idx = (idx_n + 1) * d;
-
-  // log_diff_max_sum shares memory with loss
-  auto block_log_diff_max_sum = loss_data[blockIdx.x];
-  auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
-  softmax[beg_idx] = exp_on_device(tmp);
-  auto loss = -labels_data[beg_idx] * tmp;
-  int step = BlockDim * remain;
-  beg_idx += step;
-  while (beg_idx < end_idx) {
-    tmp = softmax[beg_idx] - block_log_diff_max_sum;
-    softmax[beg_idx] = exp_on_device(tmp);
-    loss -= (labels_data[beg_idx] * tmp);
-    beg_idx += step;
-  }
-
-  loss = BlockReduce<T, BlockDim>(temp_storage).Reduce(loss, cub::Sum());
-  if (threadIdx.x == 0) loss_data[blockIdx.x] = loss;
-}
-
-template <typename T>
-struct HardLabelSoftmaxWithCrossEntropyFunctor {
- public:
-  HardLabelSoftmaxWithCrossEntropyFunctor(const int64_t* labels, T* loss,
-                                          T* log_softmax, int d, int axis_dim)
-      : labels_(labels),
-        loss_(loss),
-        log_softmax_(log_softmax),
-        d_(d),
-        axis_dim_(axis_dim) {}
-
-  __device__ void operator()(int idx) const {
-    // logits view as [n, axis_dim, remain], where d = axis_dim * remain
-    int remain = d_ / axis_dim_;
-    int idx_n = idx / d_;
-    int idx_axis = (idx % d_) / remain;
-    int idx_remain = idx % remain;
-    // labels, loss view as [n, remain]
-    int idx_lbl = idx_n * remain + idx_remain;
-    if (idx_axis != labels_[idx_lbl]) {
-      log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
-    } else {
-      auto softmax = log_softmax_[idx];
-      log_softmax_[idx] = exp_on_device(softmax);
-      loss_[idx_lbl] = -softmax;
-    }
-  }
-
- private:
-  const int64_t* labels_;
-  T* loss_;
-  T* log_softmax_;
-  int d_;
-  int axis_dim_;
-};
-
-template <typename T>
-struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
- public:
-  HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels,
-                                                       T* loss, T* log_softmax,
-                                                       int d, int axis_dim,
-                                                       int ignore_idx)
-      : labels_(labels),
-        loss_(loss),
-        log_softmax_(log_softmax),
-        d_(d),
-        axis_dim_(axis_dim),
-        ignore_idx_(ignore_idx) {}
-
-  __device__ void operator()(int idx) const {
-    // logits view as [n, axis_dim, remain], where d = axis_dim * remain
-    int remain = d_ / axis_dim_;
-    int idx_n = idx / d_;
-    int idx_axis = (idx % d_) / remain;
-    int idx_remain = idx % remain;
-    // labels, loss view as [n, remain]
-    int idx_lbl = idx_n * remain + idx_remain;
-    if (idx_axis != labels_[idx_lbl] || idx_axis == ignore_idx_) {
-      log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
-    } else {
-      auto softmax = log_softmax_[idx];
-      log_softmax_[idx] = exp_on_device(softmax);
-      loss_[idx_lbl] = -softmax;
-    }
-  }
-
- private:
-  const int64_t* labels_;
-  T* loss_;
-  T* log_softmax_;
-  int d_;
-  int axis_dim_;
-  int ignore_idx_;
-};
-
-template <typename T>
-static void HardLabelSoftmaxWithCrossEntropy(
-    const platform::CUDADeviceContext& ctx, const T* logits_data,
-    const int64_t* labels_data, T* loss_data, T* softmax_data, int n, int d,
-    int axis_dim, int ignore_idx) {
-  constexpr int kMaxBlockDim = 512;
-  int block_dim = axis_dim >= kMaxBlockDim
-                      ? kMaxBlockDim
-                      : (1 << static_cast<int>(std::log2(axis_dim)));
-  int grid_dim = n * d / axis_dim;
-  auto stream = ctx.stream();
-
-#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim)  \
-  case BlockDim: {                                                         \
-    RowReductionForMax<T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>(    \
-        logits_data, loss_data, d, axis_dim);                              \
-    RowReductionForDiffMaxSum<T, BlockDim,                                 \
-                              true><<<grid_dim, BlockDim, 0, stream>>>(    \
-        logits_data, loss_data, softmax_data, d, axis_dim);                \
-    platform::ForRange<platform::CUDADeviceContext> for_range(ctx, n* d);  \
-    if (ignore_idx >= 0 && ignore_idx < axis_dim) {                        \
-      for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx<T>(   \
-          labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \
-    } else {                                                               \
-      for_range(HardLabelSoftmaxWithCrossEntropyFunctor<T>(                \
-          labels_data, loss_data, softmax_data, d, axis_dim));             \
-    }                                                                      \
-  } break
-
-  switch (block_dim) {
-    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
-    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256);
-    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128);
-    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64);
-    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32);
-    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16);
-    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8);
-    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
-    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
-    default:
-      PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
-      break;
-  }
-#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
-}
-
-template <typename T>
-static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
-                                               const T* labels_data,
-                                               T* softmax_data, T* loss_data,
-                                               int n, int d, int axis_dim,
-                                               cudaStream_t stream) {
-  constexpr int kMaxBlockDim = 512;
-  int block_dim = axis_dim >= kMaxBlockDim
-                      ? kMaxBlockDim
-                      : (1 << static_cast<int>(std::log2(axis_dim)));
-  int grid_dim = n * d / axis_dim;
-
-#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim)                 \
-  case BlockDim:                                                               \
-    RowReductionForMax<T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>(        \
-        logits_data, loss_data, d, axis_dim);                                  \
-    RowReductionForDiffMaxSum<T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>( \
-        logits_data, loss_data, softmax_data, d, axis_dim);                    \
-    RowReductionForSoftmaxAndCrossEntropy<                                     \
-        T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>(                       \
-        logits_data, labels_data, loss_data, softmax_data, d, axis_dim);       \
-    break
-
-  switch (block_dim) {
-    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
-    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256);
-    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128);
-    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64);
-    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32);
-    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16);
-    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8);
-    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
-    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
-    default:
-      PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
-      break;
-  }
-
-#undef CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
-}
-
-template <typename T>
-class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    const Tensor* logits = context.Input<Tensor>("Logits");
-    const Tensor* labels = context.Input<Tensor>("Label");
-    Tensor* softmax = context.Output<Tensor>("Softmax");
-    Tensor* loss = context.Output<Tensor>("Loss");
-
-    const int rank = logits->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-    int axis_dim = logits->dims()[axis];
-
-    if (axis_dim == 1) {
-      math::SetConstant<platform::CUDADeviceContext, T> set_constant;
-      set_constant(context.cuda_device_context(), softmax, static_cast<T>(1));
-      set_constant(context.cuda_device_context(), loss, static_cast<T>(0));
-      return;
-    }
-
-    const int n = SizeToAxis(axis, logits->dims());
-    const int d = SizeFromAxis(axis, logits->dims());
-
-    auto* softmax_data = softmax->mutable_data<T>(context.GetPlace());
-    auto* loss_data = loss->mutable_data<T>(context.GetPlace());
-
-    auto soft_label = context.Attr<bool>("soft_label");
-    auto ignore_index = context.Attr<int>("ignore_index");
-
-    if (soft_label) {
-      auto* logits_data = logits->data<T>();
-      auto* labels_data = labels->data<T>();
-      SoftmaxWithCrossEntropyFusedKernel(
-          logits_data, labels_data, softmax_data, loss_data, n, d, axis_dim,
-          context.cuda_device_context().stream());
-    } else {
-      if (!context.Attr<bool>("numeric_stable_mode")) {
-        // CUDNN kernel only suppoer 2-D tensor and perfome softmax on last dim
-        Tensor logits_2d, softmax_2d, labels_2d, loss_2d;
-        logits_2d.ShareDataWith(*logits).Resize({n, d});
-        softmax_2d.ShareDataWith(*softmax).Resize({n, d});
-        labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n});
-        loss_2d.ShareDataWith(*loss).Resize({n, 1});
-        math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(),
-                                       &logits_2d, &softmax_2d);
-        math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-            context.cuda_device_context(), &loss_2d, &softmax_2d, &labels_2d,
-            false, ignore_index, axis_dim);
-      } else {
-        auto* logits_data = logits->data<T>();
-        auto* labels_data = labels->data<int64_t>();
-        HardLabelSoftmaxWithCrossEntropy<T>(
-            context.cuda_device_context(), logits_data, labels_data, loss_data,
-            softmax_data, n, d, axis_dim, ignore_index);
-      }
-    }
-  }
-};
-
-template <typename T>
-class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    const Tensor* labels = context.Input<Tensor>("Label");
-    const T* loss_grad_data =
-        context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
-    Tensor* logit_grad =
-        context.Output<Tensor>(framework::GradVarName("Logits"));
-    const Tensor* softmax = context.Input<Tensor>("Softmax");
-    if (logit_grad != softmax) {
-      framework::TensorCopy(*softmax, context.GetPlace(),
-                            context.device_context(), logit_grad);
-    }
-    T* logit_grad_data = logit_grad->data<T>();
-
-    const int rank = logit_grad->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-    int axis_dim = logit_grad->dims()[axis];
-
-    const int n = SizeToAxis(axis, logit_grad->dims());
-    const int d = SizeFromAxis(axis, logit_grad->dims());
-    const int remain = d / axis_dim;
-
-    int block = 512;
-    auto stream = context.cuda_device_context().stream();
-    auto ignore_index = context.Attr<int>("ignore_index");
-    if (context.Attr<bool>("soft_label")) {
-      int grid = (n * d + block - 1) / block;
-      const T* label_data = labels->data<T>();
-      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
-          logit_grad_data, loss_grad_data, label_data, n, d, remain);
-    } else {
-      int grid = (n * remain + block - 1) / block;
-      const int64_t* label_data = labels->data<int64_t>();
-      CrossEntropyGrad<T><<<grid, block, 0, stream>>>(
-          logit_grad_data, label_data, n, d, remain, ignore_index);
-      int num = n * d;
-      grid = (num + block - 1) / block;
-      Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
-                                           d, remain);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
-    ops::SoftmaxWithCrossEntropyCUDAKernel<paddle::platform::float16>,
-    ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(
-    softmax_with_cross_entropy_grad,
-    ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
-    ops::SoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>,
-    ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
deleted file mode 100644
index 4533295a8d8c0d7f36522143adc2820020179ace..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/cross_entropy.h"
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/softmax_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
-                   "This kernel only runs on CPU.");
-    const Tensor* logits = context.Input<Tensor>("Logits");
-    const Tensor* labels = context.Input<Tensor>("Label");
-    Tensor* softmax = context.Output<Tensor>("Softmax");
-    Tensor* loss = context.Output<Tensor>("Loss");
-    const bool soft_label = context.Attr<bool>("soft_label");
-
-    const int rank = logits->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-    int axis_dim = logits->dims()[axis];
-
-    softmax->mutable_data<T>(context.GetPlace());
-    loss->mutable_data<T>(context.GetPlace());
-
-    const int n = SizeToAxis(axis, logits->dims());
-    const int d = SizeFromAxis(axis, logits->dims());
-    Tensor logits_2d, softmax_2d, labels_2d, loss_2d;
-    logits_2d.ShareDataWith(*logits).Resize({n, d});
-    softmax_2d.ShareDataWith(*softmax).Resize({n, d});
-    labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n});
-    loss_2d.ShareDataWith(*loss).Resize({n, d / axis_dim});
-
-    auto& dev_ctx =
-        context.template device_context<platform::CPUDeviceContext>();
-    math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
-        dev_ctx, axis_dim, &logits_2d, &softmax_2d);
-    math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
-        dev_ctx, &loss_2d, &softmax_2d, &labels_2d, soft_label,
-        context.Attr<int>("ignore_index"), axis_dim);
-  }
-};
-
-template <typename T>
-class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* out_grad =
-        context.Input<Tensor>(framework::GradVarName("Loss"));
-    const Tensor* labels = context.Input<Tensor>("Label");
-    Tensor* logit_grad =
-        context.Output<Tensor>(framework::GradVarName("Logits"));
-
-    const Tensor* softmax = context.Input<Tensor>("Softmax");
-    if (logit_grad != softmax) {
-      framework::TensorCopy(*softmax, context.GetPlace(),
-                            context.device_context(), logit_grad);
-    }
-
-    const bool soft_label = context.Attr<bool>("soft_label");
-
-    const int rank = logit_grad->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-    int axis_dim = logit_grad->dims()[axis];
-
-    const int n = SizeToAxis(axis, logit_grad->dims());
-    const int d = SizeFromAxis(axis, logit_grad->dims());
-    Tensor logit_grad_2d, labels_2d, out_grad_2d;
-    logit_grad_2d.ShareDataWith(*logit_grad).Resize({n, d});
-    labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n});
-    out_grad_2d.ShareDataWith(*out_grad).Resize({n, d / axis_dim});
-
-    auto out_grad_mat = EigenMatrix<T>::From(out_grad_2d);
-    auto logit_grad_mat = EigenMatrix<T>::From(logit_grad_2d);
-    auto& place = *context.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    if (soft_label) {
-      auto lbl_mat = EigenMatrix<T>::From(labels_2d);
-      logit_grad_mat.device(place) =
-          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, axis_dim)) *
-          (logit_grad_mat - lbl_mat);
-    } else {
-      logit_grad_mat.device(place) =
-          logit_grad_mat *
-          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, axis_dim));
-
-      const int64_t* label_data = labels->data<int64_t>();
-      T* logit_grad_data = logit_grad->data<T>();
-      const T* out_grad_data = out_grad->data<T>();
-      const int remain = d / axis_dim;
-      for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < remain; j++) {
-          int idx = i * remain + j;
-          logit_grad_data[i * d + label_data[idx] * remain + j] -=
-              out_grad_data[idx];
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc
deleted file mode 100644
index 3d66613248c27f683faf6e3f075c495ed6e71b06..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/space_to_depth_op.cc
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/space_to_depth_op.h"
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class SpaceToDepthOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SpaceToDepthOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SpaceToDepthOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 4, "input should be a 4D tensor");
-    auto blocksize = ctx->Attrs().Get<int64_t>("blocksize");
-
-    PADDLE_ENFORCE_GT(blocksize, 1, "The blocksize should be Greater than 1");
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0");
-      PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0");
-      PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0");
-
-      PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
-                        "input channel should be divisible of the square of "
-                        "SpaceToDepthOp blocksize");
-      PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
-                        "input Height should be divisible of the square of "
-                        "SpaceToDepthOp blocksize");
-      PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
-                        "input Width should be divisible of the square of "
-                        "SpaceToDepthOp blocksize");
-    } else {
-      if (x_dims[1] != -1) {
-        PADDLE_ENFORCE_GT(x_dims[1], 0,
-                          "input channel should be Greater than 0");
-        PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
-                          "input channel should be divisible of the square of "
-                          "SpaceToDepthOp blocksize");
-      }
-      if (x_dims[2] != -1) {
-        PADDLE_ENFORCE_GT(x_dims[2], 0,
-                          "input Height should be Greater than 0");
-        PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
-                          "input Height should be divisible of the square of "
-                          "SpaceToDepthOp blocksize");
-      }
-
-      if (x_dims[3] != -1) {
-        PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0");
-
-        PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
-                          "input Width should be divisible of the square of "
-                          "SpaceToDepthOp blocksize");
-      }
-    }
-
-    VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims
-            << "Attribute blocksize" << blocksize << std::endl;
-
-    std::vector<int64_t> output_shape(4, 0);  // [B,C,H,W]
-    output_shape[0] = x_dims[0];
-    output_shape[1] = x_dims[1] * blocksize * blocksize;
-    output_shape[2] = x_dims[2] / blocksize;
-    output_shape[3] = x_dims[3] / blocksize;
-
-    auto out_dims = framework::make_ddim(output_shape);
-
-    ctx->SetOutputDim("Out", out_dims);
-
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-};
-
-class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor). The input should be a 4D tensor B * C * W * H of "
-             "SpaceToDepthOp "
-             "operator.");
-    AddOutput("Out",
-              "(Tensor), The output should be a 4D tensor B * C2 * W2 * H2 of "
-              "SpaceToDepthOp operator.");
-    AddAttr<int64_t>(
-        "blocksize",
-        "(int64_t, default 2) blocksize used to do change Space To Depth.")
-        .SetDefault(2)
-        .GreaterThan(1);
-    AddComment(R"DOC(
-        reorg operator used in Yolo v2.
-        The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize,
-
-        Reshape Input(X) into the shape according to Attr(blocksize). The
-        data in Input(X) are unchanged.
-
-        Examples:
-
-            1. Given a 4-D tensor Input(X) with a shape [128, 2048, 26, 26], and the blocksize is 2, the reorg operator will transform Input(X)
-            into a 4-D tensor with shape [128, 2048, 13, 13] and leaving Input(X)'s data unchanged.
-
-    )DOC");
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SpaceToDepthGradOpNoBuffer, "X");
-
-class SpaceToDepthGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("space_to_depth_grad");
-
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("X", Input("X"));
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class SpaceToDepthGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) shouldn't be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(space_to_depth, ops::SpaceToDepthOp, ops::SpaceToDepthOpMaker,
-                  ops::SpaceToDepthGradOpDescMaker);
-REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp,
-                  ops::SpaceToDepthGradOpNoBuffer);
-REGISTER_OP_CPU_KERNEL(
-    space_to_depth,
-    ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    space_to_depth_grad,
-    ops::SpaceToDepthGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SpaceToDepthGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SpaceToDepthGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/space_to_depth_op.cu b/paddle/fluid/operators/space_to_depth_op.cu
deleted file mode 100644
index 38d0a662733222386b8ecd68d064f3d1abe56c3b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/space_to_depth_op.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/space_to_depth_op.h"
-
-namespace plat = paddle::platform;
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    space_to_depth,
-    ops::SpaceToDepthKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SpaceToDepthKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SpaceToDepthKernel<paddle::platform::CUDADeviceContext, int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    space_to_depth_grad,
-    ops::SpaceToDepthGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SpaceToDepthGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SpaceToDepthGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/space_to_depth_op.h b/paddle/fluid/operators/space_to_depth_op.h
deleted file mode 100644
index a71662b4813ab27b65f5c7a918e2bb6fb15a1993..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/space_to_depth_op.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_
-#define PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_
-#endif  // PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class space_to_depth_compute {
- public:
-  HOSTDEVICE space_to_depth_compute(const T *x, int64_t w, int64_t h, int64_t c,
-                                    int64_t batch, int64_t blocksize,
-                                    int64_t forward, T *out)
-      : x_(x),
-        w_(w),
-        h_(h),
-        c_(c),
-        batch_(batch),
-        blocksize_(blocksize),
-        forward_(forward),
-        out_(out) {}
-
-  HOSTDEVICE void operator()(int64_t in_index) {
-    int64_t out_c = c_ / (blocksize_ * blocksize_);
-    // calculate each dim position with index of tensor
-    int64_t b = in_index / (c_ * h_ * w_);
-    int64_t k = (in_index % (c_ * h_ * w_)) / (h_ * w_);
-    int64_t j = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) / w_;
-    int64_t i = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) % w_;
-
-    int64_t c2 = k % out_c;
-    int64_t offset = k / out_c;
-    int64_t w2 = i * blocksize_ + offset % blocksize_;
-    int64_t h2 = j * blocksize_ + offset / blocksize_;
-    int64_t out_index =
-        w2 + w_ * blocksize_ * (h2 + h_ * blocksize_ * (c2 + out_c * b));
-    if (forward_)
-      out_[out_index] = x_[in_index];
-    else
-      out_[in_index] = x_[out_index];
-  }
-
- private:
-  const T *x_;
-  int64_t w_, h_, c_, batch_, blocksize_, forward_;
-  T *out_;
-};
-
-template <typename DeviceContext, typename T>
-class SpaceToDepthKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *out = context.Output<framework::LoDTensor>("Out");
-    auto *x = context.Input<framework::LoDTensor>("X");
-    auto blocksize = context.Attr<int64_t>("blocksize");
-    auto in_dims = x->dims();
-    out->mutable_data(context.GetPlace(), x->type());
-
-    auto out_dims = out->dims();
-    auto B = in_dims[0];
-    auto C = in_dims[1];
-    auto H = in_dims[2];
-    auto W = in_dims[3];
-    platform::ForRange<DeviceContext> for_range(
-        context.template device_context<DeviceContext>(),
-        static_cast<size_t>(x->numel()));
-
-    auto *x_data = x->data<T>();
-    auto *out_data = out->data<T>();
-    paddle::operators::space_to_depth_compute<T> computer(
-        x_data, W, H, C, B, blocksize, 1, out_data);
-    for_range(computer);
-
-    out->Resize(out_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SpaceToDepthGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *d_out =
-        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto *d_x =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto blocksize = context.Attr<int64_t>("blocksize");
-    auto in_dims = d_x->dims();
-    d_x->mutable_data(context.GetPlace(), d_out->type());
-
-    auto B = in_dims[0];
-    auto C = in_dims[1];
-    auto H = in_dims[2];
-    auto W = in_dims[3];
-
-    platform::ForRange<DeviceContext> for_range(
-        context.template device_context<DeviceContext>(),
-        static_cast<size_t>(d_x->numel()));
-
-    auto *dx_data = d_x->data<T>();
-    auto *dout_data = d_out->data<T>();
-
-    paddle::operators::space_to_depth_compute<T> computer(
-        dout_data, W, H, C, B, blocksize, 0, dx_data);
-    for_range(computer);
-
-    d_x->Resize(in_dims);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc
deleted file mode 100644
index ec5ee487729d0650983d553dbffe14b63c16b26a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/spectral_norm_op.cc
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/spectral_norm_op.h"
-
-#include <memory>
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class SpectralNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(Weight) of SpectralNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("U"),
-                   "Input(U) of SpectralNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("V"),
-                   "Input(V) of SpectralNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SpectralNormOp should not be null.");
-
-    auto dim_weight = ctx->GetInputDim("Weight");
-    auto rank_weight = dim_weight.size();
-    PADDLE_ENFORCE(rank_weight >= 2 && rank_weight <= 5,
-                   "The rank of Input(Weights) can only be 2, 3,"
-                   "4, 5 for fc, conv1d, conv2d, conv3d layers.");
-
-    int dim = ctx->Attrs().Get<int>("dim");
-    int power_iters = ctx->Attrs().Get<int>("power_iters");
-    PADDLE_ENFORCE(dim == 0 || dim == 1, "Attr(dim) can only be 0 or 1");
-    PADDLE_ENFORCE(power_iters >= 0,
-                   "Attr(power_iters) should be larger equal then 0");
-
-    int h = dim_weight[dim];
-    int w = 1;
-    for (int i = 0; i < rank_weight; i++) {
-      if (i != dim) {
-        w *= dim_weight[i];
-      }
-    }
-    auto dim_u = ctx->GetInputDim("U");
-    auto dim_v = ctx->GetInputDim("V");
-
-    if (ctx->IsRuntime() || (dim_u[0] > 0 && h > 0)) {
-      PADDLE_ENFORCE_EQ(dim_u[0], h,
-                        "Input(U) dims[0] should be equal to "
-                        "Input(Weight) dims[Attr(dim)]");
-    }
-
-    if (ctx->IsRuntime() || (dim_v[0] > 0 && w > 0)) {
-      PADDLE_ENFORCE_EQ(
-          dim_v[0], w,
-          "Input(V) dims[0] should be equal to "
-          "the product of Input(Weight) dims except dims[Attr(dim)]");
-    }
-
-    ctx->SetOutputDim("Out", dim_weight);
-    ctx->ShareLoD("Weight", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Weight")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Weight",
-             "The input weight tensor of spectral_norm operator, "
-             "This can be a 2-D, 3-D, 4-D, 5-D tensor which is the "
-             "weights of fc, conv1d, conv2d, conv3d layer.");
-    AddInput("U",
-             "The weight_u tensor of spectral_norm operator, "
-             "This can be a 1-D tensor in shape [H, 1],"
-             "H is the 1st dimentions of Weight after reshape"
-             "corresponding by Attr(dim). As for Attr(dim) = 1"
-             "in conv2d layer with weight shape [M, C, K1, K2]"
-             "Weight will be reshape to [C, M*K1*K2], U will"
-             "be in shape [C, 1].");
-    AddInput("V",
-             "The weight_v tensor of spectral_norm operator, "
-             "This can be a 1-D tensor in shape [W, 1], "
-             "W is the 2nd dimentions of Weight after reshape "
-             "corresponding by Attr(dim). As for Attr(dim) = 1 "
-             "in conv2d layer with weight shape [M, C, K1, K2] "
-             "Weight will be reshape to [C, M*K1*K2], V will "
-             "be in shape [M*K1*K2, 1].");
-    AddOutput("Out",
-              "The output weight tensor of spectral_norm operator, "
-              "This tensor is in same shape with Input(Weight).");
-
-    AddAttr<int>("dim",
-                 "The index of dimension which should be permuted "
-                 "to the first before reshaping Input(Weight) to "
-                 "matrix, it should be set as 0 if Input(Weight) is "
-                 "the weight of fc layer, and should be set as 1 if "
-                 "Input(Weight) is the weight of conv layer, "
-                 "default 0.")
-        .SetDefault(0);
-    AddAttr<int>("power_iters",
-                 "number of power iterations to calculate "
-                 "spectral norm, default 1.")
-        .SetDefault(1);
-    AddAttr<float>("eps",
-                   "epsilon for numerical stability in "
-                   "calculating norms")
-        .SetDefault(1e-12);
-
-    AddComment(R"DOC(
-          This layer calculates the spectral normalization value of weight of
-          fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D
-          tensor.
-
-          Spectral normalization stabilizes the training of critic in GANs
-          (Generative Adversarial Networks). This layer rescaling weight tensor
-          with spectral normalize value.
-
-          For spectral normalization calculations, we rescaling weight
-          tensor with :math:`\sigma`, while :math:`\sigma{\mathbf{W}}` is
-
-            $$\sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \\frac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2}$$
-
-          We calculate :math:`\sigma{\mathbf{W}}` through power iterations as
-
-            $$
-            \mathbf{v} = \mathbf{W}^{T} \mathbf{u}
-            $$
-            $$
-            \mathbf{v} = \\frac{\mathbf{v}}{\|\mathbf{v}\|_2}
-            $$
-            $$
-            \mathbf{u} = \mathbf{W}^{T} \mathbf{v}
-            $$
-            $$
-            \mathbf{u} = \\frac{\mathbf{u}}{\|\mathbf{u}\|_2}
-            $$
-
-          And :math:`\sigma` should be
-
-            $$\sigma{\mathbf{W}} = \mathbf{u}^{T} \mathbf{W} \mathbf{v}$$
-
-          For details of spectral normalization, please refer to paper: 
-          `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ .
-         )DOC");
-  }
-};
-
-class SpectralNormGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("spectral_norm_grad");
-
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("Weight", Input("Weight"));
-    op->SetInput("U", Input("U"));
-    op->SetInput("V", Input("V"));
-
-    op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
-
-    op->SetAttrMap(Attrs());
-
-    return op;
-  }
-};
-
-class SpectralNormOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Weight"), "Input(Weight) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto dim_x = ctx->GetInputDim("Weight");
-    if (ctx->HasOutput(framework::GradVarName("Weight"))) {
-      ctx->SetOutputDim(framework::GradVarName("Weight"), dim_x);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Weight")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(spectral_norm, ops::SpectralNormOp, ops::SpectralNormOpMaker,
-                  ops::SpectralNormGradOpDescMaker);
-REGISTER_OPERATOR(spectral_norm_grad, ops::SpectralNormOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    spectral_norm,
-    ops::SpectralNormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SpectralNormKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    spectral_norm_grad,
-    ops::SpectralNormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SpectralNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/spectral_norm_op.cu b/paddle/fluid/operators/spectral_norm_op.cu
deleted file mode 100644
index ea90e3b4c122b00d5bfe13617e48a9bbe0ee8395..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/spectral_norm_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/spectral_norm_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    spectral_norm,
-    ops::SpectralNormKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SpectralNormKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    spectral_norm_grad,
-    ops::SpectralNormGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SpectralNormGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/spectral_norm_op.h b/paddle/fluid/operators/spectral_norm_op.h
deleted file mode 100644
index eb48e3b7840e18efe809540dd697f243a0a63a52..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/spectral_norm_op.h
+++ /dev/null
@@ -1,273 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using Tensor = framework::Tensor;
-
-using Array1 = Eigen::DSizes<int64_t, 1>;
-using Array2 = Eigen::DSizes<int64_t, 2>;
-using IndexPair = Eigen::IndexPair<int>;
-
-template <typename DeviceContext, typename T>
-static inline void TransCompute(const int rank, const Tensor& in, Tensor* out,
-                                const std::vector<int>& perm,
-                                const DeviceContext& dev_ctx) {
-  if (rank <= 1 || rank > 5) {
-    PADDLE_THROW("Invalid weight rank.");
-  }
-
-  switch (rank) {
-    case 2:
-      math::Transpose<DeviceContext, T, 2> trans2;
-      trans2(dev_ctx, in, out, perm);
-      break;
-    case 3:
-      math::Transpose<DeviceContext, T, 3> trans3;
-      trans3(dev_ctx, in, out, perm);
-      break;
-    case 4:
-      math::Transpose<DeviceContext, T, 4> trans4;
-      trans4(dev_ctx, in, out, perm);
-      break;
-    case 5:
-      math::Transpose<DeviceContext, T, 5> trans5;
-      trans5(dev_ctx, in, out, perm);
-      break;
-    default:
-      break;
-  }
-}
-
-template <typename DeviceContext, typename T>
-static inline void CalcMatrixSigmaAndNormWeight(
-    Tensor* sigma, Tensor* u, Tensor* v, Tensor* weight, const int power_iters,
-    const float eps, const framework::ExecutionContext& ctx) {
-  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-  auto blas = math::GetBlas<DeviceContext, T>(ctx);
-  auto sigma_t = EigenTensor<T, 2>::From(*sigma);
-  auto weight_t = EigenTensor<T, 2>::From(*weight);
-  auto u_t = EigenTensor<T, 2>::From(*u);
-  auto v_t = EigenTensor<T, 2>::From(*v);
-
-  const int h = weight->dims()[0];
-  const int w = weight->dims()[1];
-
-  for (int i = 0; i < power_iters; i++) {
-    // V = W^T * U / ||W^T * U||_2
-    blas.MatMul(*weight, true, *u, false, T(1), v, T(0));
-    auto v_t_norm =
-        v_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast(
-            Array1(w));
-    v_t.device(place) = v_t / (v_t_norm + v_t_norm.constant(eps));
-    // U = W^T * V / ||W^T * V||_2
-    blas.MatMul(*weight, false, *v, false, T(1), u, T(0));
-    auto u_t_norm =
-        u_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast(
-            Array1(h));
-    u_t.device(place) = u_t / (u_t_norm + u_t_norm.constant(eps));
-  }
-  Tensor weight_v;
-  weight_v.mutable_data<T>({h, 1}, ctx.GetPlace());
-  blas.MatMul(*weight, false, *v, false, T(1), &weight_v, T(0));
-  auto weight_v_t = EigenTensor<T, 2>::From(weight_v);
-  sigma_t.device(place) = (u_t * weight_v_t)
-                              .sum()
-                              .eval()
-                              .reshape(Array2(1, 1))
-                              .broadcast(Array2(h, w));
-  weight_t.device(place) = weight_t / sigma_t;
-}
-
-template <typename DeviceContext, typename T>
-class SpectralNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto weight = ctx.Input<Tensor>("Weight");
-    auto u = ctx.Input<Tensor>("U");
-    auto v = ctx.Input<Tensor>("V");
-    auto out = ctx.Output<Tensor>("Out");
-
-    int dim = ctx.Attr<int>("dim");
-    int power_iters = ctx.Attr<int>("power_iters");
-    float eps = ctx.Attr<float>("eps");
-
-    const int h = u->dims()[0];
-    const int w = v->dims()[0];
-
-    Tensor weight_mat;
-    auto dims = weight->dims();
-    const int rank = dims.size();
-    std::vector<int> real_dims;
-    if (dim != 0) {
-      std::vector<int> perm;
-      perm.push_back(dim);
-      real_dims.push_back(dims[dim]);
-      for (int i = 0; i < rank; i++) {
-        if (i != dim) {
-          perm.push_back(i);
-          real_dims.push_back(dims[i]);
-        }
-      }
-      weight_mat.mutable_data<T>(framework::make_ddim(real_dims),
-                                 ctx.GetPlace());
-      TransCompute<DeviceContext, T>(rank, *weight, &weight_mat, perm, dev_ctx);
-    } else {
-      for (int i = 0; i < rank; i++) {
-        real_dims.push_back(i);
-      }
-      TensorCopySync(*weight, ctx.GetPlace(), &weight_mat);
-    }
-    weight_mat = weight_mat.Resize({h, w});
-
-    Tensor sigma;
-    sigma.mutable_data<T>(weight_mat.dims(), ctx.GetPlace());
-    Tensor uu, vv;
-    TensorCopySync(*u, ctx.GetPlace(), &uu);
-    TensorCopySync(*v, ctx.GetPlace(), &vv);
-    CalcMatrixSigmaAndNormWeight<DeviceContext, T>(
-        &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat,
-        power_iters, eps, ctx);
-
-    if (dim != 0) {
-      std::vector<int> perm;
-      for (int i = 0; i < rank; i++) {
-        if (i < dim) {
-          perm.push_back(i + 1);
-        } else if (i == dim) {
-          perm.push_back(0);
-        } else {
-          perm.push_back(i);
-        }
-      }
-      out->mutable_data<T>(dims, ctx.GetPlace());
-      TransCompute<DeviceContext, T>(
-          rank, weight_mat.Resize(framework::make_ddim(real_dims)), out, perm,
-          dev_ctx);
-    } else {
-      TensorCopySync(weight_mat.Resize(dims), ctx.GetPlace(), out);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SpectralNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-    auto weight = ctx.Input<Tensor>("Weight");
-    auto u = ctx.Input<Tensor>("U");
-    auto v = ctx.Input<Tensor>("V");
-    auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto weight_grad = ctx.Output<Tensor>(framework::GradVarName("Weight"));
-
-    int dim = ctx.Attr<int>("dim");
-    int power_iters = ctx.Attr<int>("power_iters");
-    float eps = ctx.Attr<float>("eps");
-
-    const int h = u->dims()[0];
-    const int w = v->dims()[0];
-
-    Tensor weight_mat, out_grad_mat;
-    auto dims = weight->dims();
-    const int rank = dims.size();
-    std::vector<int> real_dims;
-    if (dim != 0) {
-      std::vector<int> perm;
-      perm.push_back(dim);
-      real_dims.push_back(dims[dim]);
-      for (int i = 0; i < rank; i++) {
-        if (i != dim) {
-          perm.push_back(i);
-          real_dims.push_back(dims[i]);
-        }
-      }
-      weight_mat.mutable_data<T>(framework::make_ddim(real_dims),
-                                 ctx.GetPlace());
-      out_grad_mat.mutable_data<T>(framework::make_ddim(real_dims),
-                                   ctx.GetPlace());
-      TransCompute<DeviceContext, T>(rank, *weight, &weight_mat, perm, dev_ctx);
-      TransCompute<DeviceContext, T>(rank, *out_grad, &out_grad_mat, perm,
-                                     dev_ctx);
-    } else {
-      for (int i = 0; i < rank; i++) {
-        real_dims.push_back(i);
-      }
-      TensorCopySync(*weight, ctx.GetPlace(), &weight_mat);
-      TensorCopySync(*out_grad, ctx.GetPlace(), &out_grad_mat);
-    }
-    weight_mat = weight_mat.Resize({h, w});
-    out_grad_mat = out_grad_mat.Resize({h, w});
-
-    Tensor sigma;
-    sigma.mutable_data<T>(weight_mat.dims(), ctx.GetPlace());
-    Tensor uu, vv;
-    TensorCopySync(*u, ctx.GetPlace(), &uu);
-    TensorCopySync(*v, ctx.GetPlace(), &vv);
-    CalcMatrixSigmaAndNormWeight<DeviceContext, T>(
-        &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat,
-        power_iters, eps, ctx);
-
-    Tensor uv;
-    uv.mutable_data<T>({h, w}, ctx.GetPlace());
-    blas.MatMul(uu.Resize({h, 1}), false, vv.Resize({w, 1}), false, T(1), &uv,
-                T(0));
-
-    Tensor weight_grad_mat;
-    weight_grad_mat.mutable_data<T>({h, w}, ctx.GetPlace());
-    auto weight_grad_mat_t = EigenTensor<T, 2>::From(weight_grad_mat);
-    auto weight_mat_t = EigenTensor<T, 2>::From(weight_mat);
-    auto out_grad_mat_t = EigenTensor<T, 2>::From(out_grad_mat);
-    auto sigma_t = EigenTensor<T, 2>::From(sigma);
-    auto uv_t = EigenTensor<T, 2>::From(uv);
-    weight_mat_t.device(place) =
-        weight_mat_t.sum().eval().reshape(Array2(1, 1)).broadcast(Array2(h, w));
-    weight_grad_mat_t.device(place) =
-        out_grad_mat_t * (out_grad_mat_t.constant(1.0) - uv_t * weight_mat_t) /
-        sigma_t;
-
-    if (dim != 0) {
-      std::vector<int> perm;
-      for (int i = 0; i < rank; i++) {
-        if (i < dim) {
-          perm.push_back(i + 1);
-        } else if (i == dim) {
-          perm.push_back(0);
-        } else {
-          perm.push_back(i);
-        }
-      }
-      weight_grad->mutable_data<T>(dims, ctx.GetPlace());
-      TransCompute<DeviceContext, T>(
-          rank, weight_grad_mat.Resize(framework::make_ddim(real_dims)),
-          weight_grad, perm, dev_ctx);
-    } else {
-      TensorCopySync(weight_grad_mat.Resize(dims), ctx.GetPlace(), weight_grad);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
deleted file mode 100644
index c89e683d766580113d160eecf4f04d4ead4594f6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-struct CopyRange {
-  size_t begin;
-  size_t end;
-};
-
-using LoD = framework::LoD;
-
-class SplitLoDTensorOp : public framework::OperatorBase {
- public:
-  SplitLoDTensorOp(const std::string &type,
-                   const framework::VariableNameMap &inputs,
-                   const framework::VariableNameMap &outputs,
-                   const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
-    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
-    auto *out_true =
-        scope.FindVar(Output("OutTrue"))->GetMutable<framework::LoDTensor>();
-    auto *out_false =
-        scope.FindVar(Output("OutFalse"))->GetMutable<framework::LoDTensor>();
-    auto level = static_cast<size_t>(Attr<int>("level"));
-    auto &x_lod = x.lod();
-    auto &mask_dim = mask.dims();
-
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-
-    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
-    if (platform::is_cpu_place(mask.place())) {
-      cpu_mask->ShareDataWith(mask);
-    } else if (platform::is_gpu_place(mask.place())) {
-#ifdef PADDLE_WITH_CUDA
-      framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
-                            cpu_mask.get());
-#else
-      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
-#endif
-    }
-    auto *mask_data = cpu_mask->data<bool>();
-
-    std::vector<std::vector<CopyRange>> copy_ranges(2);
-
-    // set out_true/out_false lod
-    for (size_t t = 0; t < 2; t++) {
-      LoD *lod = nullptr;
-      if (t == 0) {
-        lod = out_false->mutable_lod();
-      } else {
-        lod = out_true->mutable_lod();
-      }
-      lod->clear();
-      for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
-        if (static_cast<size_t>(mask_data[i]) == t) {
-          size_t start_idx = i;
-          auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
-              x_lod, start_idx, start_idx + 1, level);
-
-          auto &lod_length = lod_and_offset.first;
-          framework::AppendLoD(lod, lod_length);
-
-          size_t start_offset = lod_and_offset.second.first;
-          size_t end_offset = lod_and_offset.second.second;
-          copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
-        }
-      }
-    }
-
-    for (size_t t = 0; t < 2; ++t) {
-      framework::LoDTensor *out;
-      if (t == 0) {
-        out = out_false;
-      } else {
-        out = out_true;
-      }
-      auto &ranges = copy_ranges[t];
-      size_t height = std::accumulate(
-          ranges.begin(), ranges.end(), 0UL,
-          [](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
-      auto x_dim = x.dims();
-      x_dim[0] = static_cast<int64_t>(height);
-      out->Resize(x_dim);
-      out->mutable_data(x.place(), x.type());
-      size_t offset = 0;
-      for (auto &each_range : ranges) {
-        size_t len = each_range.end - each_range.begin;
-        if (len == 0) {
-          continue;
-        }
-        // out[offset: offset+len] = x[each_range.begin: each_range.end]
-        auto slice = out->Slice(static_cast<int>(offset),
-                                static_cast<int>(offset + len));
-        framework::TensorCopy(x.Slice(static_cast<int>(each_range.begin),
-                                      static_cast<int>(each_range.end)),
-                              x.place(), dev_ctx, &slice);
-        offset += len;
-      }
-    }
-  }
-};
-
-class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input LoDTensor");
-    AddInput("Mask", "A bool column vector which mask the input");
-    AddOutput("OutTrue", "True branch of input LoDTensor");
-    AddOutput("OutFalse", "False branch of input LoDTensor");
-    AddAttr<int>("level", "(int) the specific lod level to split.")
-        .SetDefault(0)
-        .EqualGreaterThan(0);
-    AddComment(
-        R"DOC(
-        Split a LoDTensor with a Mask at certain level. The input LoDTensor
-        has 3 sequence at certain lod level. The Mask is a bool column vector,
-        such as [0, 1, 0] at the same level. The first and third sequence will
-        be send to False Output LoDTensor; whereas the second sequence will
-        be send to True Output LoDTensor. Please refer to MergeLoDTensorOp.)DOC");
-  }
-};
-
-class SplitLoDTensorInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("X"),
-                   "SplitLoDTensorOp must has input X.");
-    PADDLE_ENFORCE(context->HasInput("Mask"),
-                   "SplitLoDTensorOp must has input Mask.");
-    PADDLE_ENFORCE(context->HasOutput("OutTrue"),
-                   "SplitLoDTensorOp must has output OutTrue.");
-    PADDLE_ENFORCE(context->HasOutput("OutFalse"),
-                   "SplitLoDTensorOp must has output OutFalse.");
-
-    auto mask_dim = context->GetInputDim("Mask");
-    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
-    if (context->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(mask_dim[1], 1);
-    }
-
-    context->SetOutputDim("OutTrue", context->GetInputDim("X"));
-    context->SetOutputDim("OutFalse", context->GetInputDim("X"));
-  }
-};
-
-class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("merge_lod_tensor");
-    grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
-    grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
-    grad_op->SetInput("Mask", Input("Mask"));
-    grad_op->SetInput("X", Input("X"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp,
-                  ops::SplitLoDTensorOpProtoMaker,
-                  ops::SplitLoDTensorInferShape,
-                  ops::SplitLoDTensorArrayGradMaker);
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
deleted file mode 100644
index a43bad878179d02c41d8c8bcd6b43eaffaa6e9a2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/split_op.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/split_op.h"
-
-namespace paddle {
-namespace operators {
-using framework::Tensor;
-
-class SplitOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SplitOp should not be null.");
-    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
-                      "Outputs(Out) of SplitOp should not be empty.");
-    auto in_dims = ctx->GetInputDim("X");
-    auto outs_names = ctx->Outputs("Out");
-    size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
-    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
-    std::vector<int> sections = static_cast<std::vector<int>>(
-        ctx->Attrs().Get<std::vector<int>>("sections"));
-    const size_t outs_number = outs_names.size();
-    std::vector<framework::DDim> outs_dims;
-    outs_dims.reserve(outs_number);
-
-    if (num > 0) {
-      int64_t in_axis_dim = in_dims[axis];
-      if (ctx->IsRuntime() || in_axis_dim > 0) {
-        PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
-                          "tensor split does not result"
-                          " in an equal division");
-        size_t out_axis_dim = in_axis_dim / num;
-        for (size_t i = 0; i < outs_number; ++i) {
-          auto dim = in_dims;
-          dim[axis] = out_axis_dim;
-          outs_dims.push_back(dim);
-        }
-      } else {
-        for (size_t i = 0; i < outs_number; ++i) {
-          auto dim = in_dims;
-          dim[axis] = -1;
-          outs_dims.push_back(dim);
-        }
-      }
-    } else if (sections.size() > 0) {
-      PADDLE_ENFORCE_EQ(sections.size(), outs_number,
-                        "tensor split sections size"
-                        "should be equal to output size.");
-      for (size_t i = 0; i < outs_number; ++i) {
-        auto dim = in_dims;
-        dim[axis] = sections[i];
-        outs_dims.push_back(dim);
-      }
-    }
-    ctx->SetOutputsDim("Out", outs_dims);
-    if (axis != 0) {
-      // Only pass LoD when not spliting along the first dim.
-      for (size_t i = 0; i < outs_number; ++i) {
-        ctx->ShareLoD("X", "Out", 0, i);
-      }
-    }
-  }
-};
-
-class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input tensor of the split operator.");
-    AddOutput("Out", "(Tensor) Output tensors of the split operator.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-Split operator
-
-This operator splits the input tensor into multiple sub-tensors.
-
-Example:
-  Input = [[1,2],
-           [3,4],
-           [5,6]]
-  sections = [2,1]
-  axis = 0
-  Output[0] = [[1,2],
-               [3,4]]
-  Output[1] = [[5,6]]
-
-    )DOC");
-    AddAttr<std::vector<int>>("sections",
-                              "(vector<int>) "
-                              "the length of each output along the "
-                              "specified axis.")
-        .SetDefault(std::vector<int>{});
-    AddAttr<int>("num",
-                 "(int, default 0)"
-                 "Number of sub-tensors. This must evenly divide "
-                 "Input.dims()[axis]")
-        .SetDefault(0);
-    AddAttr<int>("axis",
-                 "(int, default 0) "
-                 "The axis which the input will be splited on.")
-        .SetDefault(0);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
-REGISTER_OP_CPU_KERNEL(
-    split, ops::SplitOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SplitOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SplitOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::SplitOpKernel<paddle::platform::CPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc
deleted file mode 100644
index 18e0904681753aff7f3deac96efb6d62f389a031..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/split_op.cu.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/split_op.h"
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
deleted file mode 100644
index 6f4a25ab5ed86937f2f5db532a9eba22b5a2c5be..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/split_op.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <chrono>  // NOLINT
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class SplitOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
-    int axis = ctx.Attr<int>("axis");
-    auto place = ctx.GetPlace();
-
-    std::vector<const framework::Tensor*> shape_refer;
-    for (size_t j = 0; j < outs.size(); ++j) {
-      outs[j]->mutable_data<T>(ctx.GetPlace());
-      shape_refer.emplace_back(outs[j]);
-    }
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    // Sometimes direct copies will be faster, this maybe need deeply analysis.
-    if (axis == 0 && outs.size() < 10) {
-      StridedMemcpyWithAxis0<T>(dev_ctx, *in, shape_refer, &outs);
-    } else {
-      math::SplitFunctor<DeviceContext, T> functor;
-      functor(dev_ctx, *in, shape_refer, axis, &outs);
-    }
-  }
-};
-
-class SplitGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto op = new framework::OpDesc();
-    op->SetType("concat");
-    op->SetInput("X", OutputGrad("Out"));
-    op->SetOutput("Out", InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
deleted file mode 100644
index 88dfebc0cff0d0f7752c372780f1d952667ec630..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/split_selected_rows_op.h"
-
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input SelectedRows.");
-    AddOutput("Out", "The outputs of the input SelectedRows.").AsDuplicable();
-    AddAttr<std::vector<int64_t>>("height_sections",
-                                  "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int64_t>({}));
-
-    AddComment(R"DOC(
-Split a SelectedRows with a specified rows section.
-height_sections is only needed when need to split the dims of the original tensor.
-
-Example:
-  Input:
-    X.rows = {7, 5}
-    X.height = 12
-  Attr:
-    height_sections = {4, 8}
-  Out:
-    out0.rows = {}
-    out0.height = 4
-
-    out1.rows = {5, 7}
-    out2.height = 8
-
-)DOC");
-  }
-};
-
-class SplitSelectedRowsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "SplitSelectedRowsOp must has input X.");
-    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
-                   "SplitSelectedRowsOp must has output Out.");
-  }
-};
-
-class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    for (auto &out_var : ctx->Output("Out")) {
-      ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
-    }
-  }
-};
-
-class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("sum");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp,
-                  ops::SplitSelectedRowsOpMaker,
-                  ops::SplitSelectedRowsGradMaker,
-                  ops::SplitSelectedRowsOpInferVarType);
-REGISTER_OP_CPU_KERNEL(
-    split_selected_rows,
-    ops::SplitSelectedRowsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/split_selected_rows_op.cu b/paddle/fluid/operators/split_selected_rows_op.cu
deleted file mode 100644
index 7250917036f611816a457e250b08f028f50f769d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/split_selected_rows_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/split_selected_rows_op.h"
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    split_selected_rows,
-    ops::SplitSelectedRowsOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h
deleted file mode 100644
index 9ec459e2a68d85af526e741d7fd9ecd858383132..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::SelectedRows>("X");
-    auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
-    auto height_sections = ctx.Attr<std::vector<int64_t>>("height_sections");
-
-    auto abs_sections = ToAbsoluteSection(height_sections);
-
-    auto& x_rows = x->rows();
-    auto height = x->height();
-    std::vector<std::vector<int>> outs_rows_idx;
-    std::vector<std::vector<int>> outs_dense_idx;
-
-    outs_rows_idx.resize(outs.size());
-    outs_dense_idx.resize(outs.size());
-
-    auto row_numel = x->value().numel() / x->value().dims()[0];
-    auto src = x->value().data<T>();
-
-    // split rows index into output sparse vars
-    for (size_t i = 0; i < x_rows.size(); ++i) {
-      auto& id = x_rows[i];
-      PADDLE_ENFORCE_LT(id, height);
-      int out_idx = GetSectionIndex(id, abs_sections);
-      outs_rows_idx[out_idx].push_back(id);
-      outs_dense_idx[out_idx].push_back(i);
-    }
-    auto place = ctx.GetPlace();
-
-    for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
-      auto rows_idx = outs_rows_idx[i];
-      outs[i]->set_height(height_sections[i]);
-      auto dims = x->GetCompleteDims();
-      dims[0] = rows_idx.size();
-      outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
-      outs[i]->mutable_rows()->clear();
-      if (rows_idx.size() > 0) {
-        for (auto idx : rows_idx) {
-          auto id_offset = idx - abs_sections[i];
-          PADDLE_ENFORCE_LT(id_offset, height_sections[i]);
-          outs[i]->mutable_rows()->push_back(id_offset);
-        }
-        auto dst = outs[i]->mutable_value()->mutable_data<T>(ctx.GetPlace());
-        for (size_t j = 0; j < rows_idx.size(); j++) {
-          if (platform::is_cpu_place(place)) {
-            memory::Copy(
-                platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(),
-                src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel);
-          } else {
-#ifdef PADDLE_WITH_CUDA
-            auto stream = ctx.cuda_device_context().stream();
-            memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
-                         platform::CUDAPlace(),
-                         src + outs_dense_idx[i][j] * row_numel,
-                         sizeof(T) * row_numel, stream);
-#else
-            PADDLE_THROW("Paddle is not compiled with GPU");
-#endif
-          }
-        }
-      }
-      PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
-                        "rows should has the same size with tensor dim 0");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc
deleted file mode 100644
index a2a96b72f09df86790ad1f90ead9189ff9bd581c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/spp_op.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/spp_op.h"
-#include <string>
-#include <vector>
-namespace paddle {
-namespace operators {
-
-class SppOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor of spp operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of feature.");
-    AddOutput("Out",
-              "(Tensor) The output tensor of spp operator."
-              "N * M."
-              "M = C * H * W");
-    AddAttr<int>("pyramid_height", "(int), multi level pooling");
-    AddAttr<std::string>(
-        "pooling_type",
-        "(string), pooling type, can be \"max\" for max-pooling "
-        "and \"avg\" for average-pooling.")
-        .InEnum({"max", "avg"});
-    AddComment(R"DOC(
-        "With spatial pyramid pooling, the input image can
-        be of any sizes. This not only allows arbitrary aspect
-        ratios, but also allows arbitrary scales. We can resize
-        the input image to any scale (e.g., min(w, h)=180, 224,
-        ...) and apply the same deep network. When the
-        input image is at different scales, the network (with
-        the same filter sizes) will extract features at different
-        scales. The scales play important roles in traditional
-        methods.
-        Input shape: $(N, C_{in}, H_{in}, W_{in})$
-        Output shape: $(H_{out}, W_{out})$
-        Where
-          $$
-            H_{out} = N \\
-            W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in}
-          $$
-        paper https://arxiv.org/pdf/1406.4729v4.pdf
-        )DOC");
-  }
-};
-
-class SppOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SppOp"
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SppOp should not be null.");
-    auto in_x_dims = ctx->GetInputDim("X");
-    int pyramid_height = ctx->Attrs().Get<int>("pyramid_height");
-    PADDLE_ENFORCE(in_x_dims.size() == 4,
-                   "Spping intput must be of 4-dimensional.");
-    int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1];
-    std::vector<int64_t> output_shape({in_x_dims[0], outlen});
-    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-  }
-};
-
-class SppOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Input(X@GRAD) should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(spp, ops::SppOp, ops::SppOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(spp_grad, ops::SppOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    spp_grad, ops::SppGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SppGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc
deleted file mode 100644
index 7fe63d17c0dcc21f307dfc3089a84732c1c86889..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/spp_op.cu.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/spp_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    spp, ops::SppKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SppKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    spp_grad, ops::SppGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SppGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
deleted file mode 100644
index 3c2d51ec9111e649632dda89290f21a0988db6dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/spp_op.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class SppKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    int pyramid_height = context.template Attr<int>("pyramid_height");
-    std::string pooling_type =
-        context.template Attr<std::string>("pooling_type");
-    out->mutable_data<T>(context.GetPlace());
-    auto out_stride = framework::stride(out->dims());
-    int input_h = in_x->dims()[2];
-    int input_w = in_x->dims()[3];
-    size_t output_offset = 0;
-    for (int p = 0; p < pyramid_height; ++p) {
-      int bins = std::pow(2, p);
-      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
-      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
-      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
-      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
-      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
-      std::vector<int> strides({kernel_size_h, kernel_size_w});
-      std::vector<int> paddings({padding_h, padding_w});
-      // pooling output shape
-      framework::Tensor out_level;
-      std::vector<int64_t> output_shape_vec(
-          {in_x->dims()[0], in_x->dims()[1], bins, bins});
-      framework::DDim output_shape(framework::make_ddim(output_shape_vec));
-      out_level.mutable_data<T>(output_shape, context.GetPlace());
-      // pooling
-      if (pooling_type == "max") {
-        math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
-        math::MaxPool<T> max_process;
-        pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                     kernel_size, strides, paddings, max_process, true, false,
-                     &out_level);
-      } else if (pooling_type == "avg") {
-        math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
-        math::AvgPool<T> avg_process;
-        pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                     kernel_size, strides, paddings, avg_process, true, false,
-                     &out_level);
-      }
-      // flatten pooling output shape
-      int output_flatten_w = in_x->dims()[1] * bins * bins;
-      std::vector<int64_t> output_flatten_shape_vec(
-          {in_x->dims()[0], output_flatten_w});
-      framework::DDim output_flatten_shape(
-          framework::make_ddim(output_flatten_shape_vec));
-      out_level.Resize(output_flatten_shape);
-      // concat
-      auto out_level_stride = framework::stride(out_level.dims());
-      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
-                       out_level.data<T>(), out_level_stride, out_level.dims(),
-                       out_stride, out->data<T>() + output_offset);
-      output_offset += out_level.dims()[1] * out_level_stride[1];
-    }
-  }
-};
-template <typename DeviceContext, typename T>
-class SppGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
-    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
-    const framework::Tensor* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    framework::Tensor* in_x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    int pyramid_height = context.template Attr<int>("pyramid_height");
-    std::string pooling_type =
-        context.template Attr<std::string>("pooling_type");
-    auto& device_ctx = context.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> zero;
-    in_x_grad->mutable_data<T>(context.GetPlace());
-    zero(device_ctx, in_x_grad, static_cast<T>(0));
-    auto out_stride = framework::stride(out->dims());
-    int input_h = in_x->dims()[2];
-    int input_w = in_x->dims()[3];
-    size_t out_offset = 0;
-    for (int p = 0; p < pyramid_height; ++p) {
-      int bins = std::pow(2, p);
-      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
-      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
-      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
-      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
-      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
-      std::vector<int> strides({kernel_size_h, kernel_size_w});
-      std::vector<int> paddings({padding_h, padding_w});
-      // split out and outgrad  ...  to flatten
-      framework::Tensor out_level;
-      framework::Tensor outgrad_level;
-      int out_flatten_w = in_x->dims()[1] * bins * bins;
-      std::vector<int64_t> out_flatten_shape_vec(
-          {in_x->dims()[0], out_flatten_w});
-      framework::DDim out_flatten_shape(
-          framework::make_ddim(out_flatten_shape_vec));
-      out_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
-      outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
-      auto flatten_stride = framework::stride(out_level.dims());
-      // memcpy
-      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
-                       out->data<T>() + out_offset, out_stride,
-                       out_level.dims(), flatten_stride, out_level.data<T>());
-
-      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
-                       out_grad->data<T>() + out_offset, out_stride,
-                       outgrad_level.dims(), flatten_stride,
-                       outgrad_level.data<T>());
-      out_offset += out_level.dims()[1] * out_stride[1];
-      // flatten backward to nchw
-
-      std::vector<int64_t> out_shape_vec({in_x->dims()[0], in_x->dims()[1]});
-      out_shape_vec.push_back(
-          (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1);
-      out_shape_vec.push_back(
-          (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1);
-      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
-      out_level.ShareDataWith(out_level);
-      out_level.Resize(out_shape);
-      outgrad_level.ShareDataWith(outgrad_level);
-      outgrad_level.Resize(out_shape);
-      // pooling backward
-      if (pooling_type == "max") {
-        math::MaxPool2dGradFunctor<DeviceContext, T> pool2d_backward;
-        pool2d_backward(context.template device_context<DeviceContext>(), *in_x,
-                        *&out_level, *&outgrad_level, kernel_size, strides,
-                        paddings, in_x_grad);
-      } else if (pooling_type == "avg") {
-        math::Pool2dGradFunctor<DeviceContext, math::AvgPoolGrad<T>, T>
-            pool_backward;
-        math::AvgPoolGrad<T> avg_process;
-        pool_backward(context.template device_context<DeviceContext>(), *in_x,
-                      *&out_level, *&outgrad_level, kernel_size, strides,
-                      paddings, avg_process, true, false, in_x_grad);
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc
deleted file mode 100644
index 6e82bf407496ab2d37d3fe81aacccfc128d57aec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/squared_l2_distance_op.cc
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/squared_l2_distance_op.h"
-
-#include <memory>
-
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-
-namespace paddle {
-namespace operators {
-
-class SquaredL2DistanceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SquaredL2DistanceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of SquaredL2DistanceOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("sub_result"),
-        "Output(sub_result) of SquaredL2DistanceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SquaredL2DistanceOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    PADDLE_ENFORCE_EQ(framework::arity(x_dims), framework::arity(y_dims),
-                      "Tensor rank of both SquaredL2DistanceOp's "
-                      "inputs must be same.");
-
-    int rank = framework::arity(x_dims);
-    PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2.");
-    bool check = true;
-    if ((!ctx->IsRuntime()) &&
-        (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) {
-      check = false;
-    }
-    if (check) {
-      PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0],
-                        product(y_dims) / y_dims[0],
-                        "Product of dimensions expcet the first dimension of "
-                        "input and target must be equal.");
-    }
-    check = true;
-    if ((!ctx->IsRuntime()) && (y_dims[0] <= 0 || x_dims[0] <= 0)) {
-      check = false;
-    }
-    if (check) {
-      PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0],
-                     "First dimension of target must be equal to input "
-                     "or to 1.");
-    }
-    ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]});
-    ctx->SetOutputDim("Out", {x_dims[0], 1});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SquaredL2DistanceGradOpNoBuffer, "X",
-                                      "Y");
-
-class SquaredL2DistanceGradOpDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("squared_l2_distance_grad");
-
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("sub_result", Output("sub_result"));
-    op->SetInput("X", Input("X"));
-    op->SetInput("Y", Input("Y"));
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
-
-    op->SetAttrMap(Attrs());
-
-    return op;
-  }
-};
-
-class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input of SquaredL2DistanceOp.");
-    AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp.");
-    AddOutput("sub_result",
-              "(Tensor) Buffering subtraction result which "
-              "will be reused in backward.")
-        .AsIntermediate();
-    AddOutput("Out", "(Tensor) Squared l2 distance between input and target.");
-    AddComment(R"DOC(
-SquaredL2Distance operator
-
-This operator will cacluate the squared L2 distance for the input and 
-the target. Number of distance value will be equal to the first dimension 
-of input. First dimension of the target could be equal to the input or to 1. 
-If the first dimension of target is 1, the operator will broadcast target's 
-first dimension to input's first dimension. During backward propagation, 
-the user can decide whether to calculate the gradient of the input or 
-the target or both.
-
-Both the input X and Y can carry the LoD (Level of Details) information. 
-However, the output only shares the LoD information with input X.
-    )DOC");
-  }
-};
-
-class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Gradient of Out should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("sub_result"), "SubResult should not be null");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[0], x_dims[0],
-                                 "First dimension of output gradient and "
-                                 "input value must be equal.");
-    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[1], 1,
-                                 "Second dimension of output gradient "
-                                 "must be 1.");
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims);
-    if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("sub_result")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(squared_l2_distance, ops::SquaredL2DistanceOp,
-                  ops::SquaredL2DistanceOpMaker,
-                  ops::SquaredL2DistanceGradOpDescMaker);
-REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp,
-                  ops::SquaredL2DistanceGradOpNoBuffer);
-REGISTER_OP_CPU_KERNEL(
-    squared_l2_distance,
-    ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad,
-                       ops::SquaredL2DistanceGradKernel<
-                           paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu
deleted file mode 100644
index c9264da838246efded7d9f85664faf0dc1cec282..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/squared_l2_distance_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/squared_l2_distance_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    squared_l2_distance,
-    ops::SquaredL2DistanceKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad,
-                        ops::SquaredL2DistanceGradKernel<
-                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h
deleted file mode 100644
index 12a8f05b5a603417ead8ebd250ff7951f928f4a1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/squared_l2_distance_op.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class SquaredL2DistanceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("X");
-    auto* in1 = context.Input<Tensor>("Y");
-    auto* out0 = context.Output<Tensor>("sub_result");
-    auto* out1 = context.Output<Tensor>("Out");
-
-    auto in0_dims = in0->dims();
-    auto in1_dims = in1->dims();
-
-    int cols = in0->numel() / in0_dims[0];
-    // reduce dimensions except the first
-    auto x =
-        EigenMatrix<T>::From(*in0, framework::make_ddim({in0_dims[0], cols}));
-    auto y =
-        EigenMatrix<T>::From(*in1, framework::make_ddim({in1_dims[0], cols}));
-
-    out0->mutable_data<T>(context.GetPlace());
-    out1->mutable_data<T>(context.GetPlace());
-    auto sub_result = EigenMatrix<T>::From(*out0);
-    auto z = EigenVector<T>::Flatten(*out1);
-
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto x_dims = x.dimensions();
-    auto y_dims = y.dimensions();
-    // buffer the substraction result
-    if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) {
-      sub_result.device(place) =
-          x -
-          y.broadcast(Eigen::array<int, 2>({{static_cast<int>(x_dims[0]), 1}}));
-    } else {
-      sub_result.device(place) = x - y;
-    }
-    auto sub_res_pow2 = sub_result * sub_result;
-    z.device(place) = sub_res_pow2.sum(Eigen::array<int, 1>({{1}}));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("sub_result");
-    auto* in1 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* x_g = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* y_g = context.Output<Tensor>(framework::GradVarName("Y"));
-
-    PADDLE_ENFORCE_NOT_NULL(x_g);
-    PADDLE_ENFORCE_NOT_NULL(y_g);
-
-    auto sub_result = EigenMatrix<T>::From(*in0);
-    auto out_grad = EigenMatrix<T>::From(*in1);
-
-    auto x_dims = x_g->dims();
-    auto y_dims = y_g->dims();
-
-    int cols = x_g->numel() / x_dims[0];
-    // calculate gradient
-    auto grad_mat = 2 *
-                    (out_grad.broadcast(Eigen::array<int, 2>({{1, cols}}))) *
-                    sub_result;
-
-    // propagate back to input
-    auto& eigen_place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    x_g->mutable_data<T>(context.GetPlace());
-    // eigen matrix
-    auto x_grad =
-        EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
-    // dimensions are same with subResult
-    x_grad.device(eigen_place) = grad_mat;
-
-    y_g->mutable_data<T>(context.GetPlace());
-
-    PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0],
-                      "First dimension of gradient must be greater or "
-                      "equal than first dimension of target.");
-
-    if (sub_result.dimensions()[0] == y_dims[0]) {
-      auto y_grad =
-          EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols}));
-      y_grad.device(eigen_place) = -1 * grad_mat;
-    } else {
-      auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
-      auto y_grad = EigenVector<T>::Flatten(*y_g);
-      y_grad.device(eigen_place) = col_sum_res;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc
deleted file mode 100644
index 9d2deb678ecf714421f507af88e7eabade7ecb68..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/squared_l2_norm_op.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/squared_l2_norm_op.h"
-
-#include <memory>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class SquaredL2NormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
-
-    ctx->SetOutputDim("Out", {1});
-  }
-};
-
-class SquaredL2NormGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("squared_l2_norm_grad");
-
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("X", Input("X"));
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class SquaredL2NormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@GRAD) should be not null.");
-
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-};
-
-class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) The input of squared_l2_norm op.");
-    AddOutput("Out", "(Scalar) The output of squared_l2_norm op.");
-    AddComment(R"DOC(
-SquaredL2Norm Operator.
-
-Computes the squared L2 norm of a tensor.
-
-$$Out = \sum_{i} X_{i}^2$$
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp,
-                  ops::SquaredL2NormOpMaker, ops::SquaredL2NormGradOpDescMaker);
-REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp);
-REGISTER_OP_CPU_KERNEL(
-    squared_l2_norm,
-    ops::SquaredL2NormKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    squared_l2_norm_grad,
-    ops::SquaredL2NormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/squared_l2_norm_op.cu b/paddle/fluid/operators/squared_l2_norm_op.cu
deleted file mode 100644
index e31cfeb78ab8a8d1b55a198fe7a2c647a3dce665..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/squared_l2_norm_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/squared_l2_norm_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    squared_l2_norm,
-    ops::SquaredL2NormKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    squared_l2_norm_grad,
-    ops::SquaredL2NormGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/squared_l2_norm_op.h b/paddle/fluid/operators/squared_l2_norm_op.h
deleted file mode 100644
index b32db4569e7ac0008074300c31e094de95afee30..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/squared_l2_norm_op.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-// Out = sum(square(X))
-template <typename DeviceContext, typename T>
-class SquaredL2NormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
-    Out->mutable_data<T>(context.GetPlace());
-
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenScalar<T>::From(*Out);
-    auto *place =
-        context.template device_context<DeviceContext>().eigen_device();
-
-    out.device(*place) = x.square().sum();
-  }
-};
-
-// dX = X
-template <typename DeviceContext, typename T>
-class SquaredL2NormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *dOut =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE(dOut->numel() == 1,
-                   "Squared L2 Norm Gradient should be scalar");
-    framework::Tensor *dX =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    dX->mutable_data<T>(context.GetPlace());
-
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto dout = framework::EigenVector<T>::Flatten(*dOut);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto *place =
-        context.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 1> x_dsize(X->numel());
-    dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast<T>(2.0);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
deleted file mode 100644
index 7aeb1d961b1b53105131336eea9ef2a798c65213..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/squeeze_op.cc
+++ /dev/null
@@ -1,302 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/squeeze_op.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class SqueezeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of Squeeze operator should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of Squeeze operator should not be null.");
-
-    const auto &x_dims = ctx->GetInputDim("X");
-    // Check input tensor dims (<6) Eigen limit.
-    PADDLE_ENFORCE_LE(x_dims.size(), 6,
-                      "Invalid dimnesions, the rank of Input(X) "
-                      "should be in the range of [1, 6] (Eigen limit).");
-
-    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
-    for (int a : axes) {
-      PADDLE_ENFORCE_LT(a, x_dims.size(),
-                        "The squeeze axis should be less than input "
-                        "tensor's rank.");
-    }
-
-    auto out_dims = GetOutputShape(axes, x_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", "Out");
-    }
-  }
-
-  static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
-                                        const framework::DDim &in_dims) {
-    size_t num_squeeze_dims = squeeze_dims.size();
-    int cnt_squeezed_dims = 0;
-    bool should_squeeze[9] = {false};
-
-    // Determines number of dimensions of output tensor after squeeze.
-    // Mark and count the dimensions need to be squeezed
-    if (num_squeeze_dims == 0) {
-      for (int idx = 0; idx < in_dims.size(); ++idx) {
-        if (in_dims[idx] == 1) {
-          should_squeeze[idx] = true;
-          ++cnt_squeezed_dims;
-        }
-      }
-    } else {
-      for (size_t idx = 0; idx < num_squeeze_dims; ++idx) {
-        int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
-                                            : squeeze_dims[idx];
-        // Check current index, the upper limit has beed checked in line 36.
-        PADDLE_ENFORCE_GE(current, 0,
-                          "Invalid axis, the negative axis is out of range.");
-
-        if (!(should_squeeze[current])) {
-          ++cnt_squeezed_dims;
-        }
-        should_squeeze[current] = true;
-      }
-    }
-
-    // Make output dimensions
-    std::vector<int64_t> output_shape(in_dims.size() - cnt_squeezed_dims, 0);
-    for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) {
-      if (!should_squeeze[in_idx]) {
-        output_shape[out_idx++] = in_dims[in_idx];
-      }
-    }
-
-    return framework::make_ddim(output_shape);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class SqueezeGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *context) const override {
-    context->SetOutputDim(framework::GradVarName("X"),
-                          context->GetInputDim("X"));
-    context->ShareLoD("X", framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor). The input tensor of squeeze operator.");
-    AddOutput("Out", "(Tensor). The output tensor of squeeze operator.");
-    AddAttr<std::vector<int>>("axes",
-                              "(std::vector<int>). List of integers,"
-                              " indicating the dimensions to squeeze.")
-        .SetDefault({});
-    AddComment(R"DOC(
-        Squeeze Operator.
-
-        Remove single-dimensional entries from the shape of a tensor.
-        Takes a parameter axes with a list of axes to squeeze.
-        If axes is not provided, all the single dimensions will be removed from the shape.
-        If an axis is selected with shape entry not equal to one, an error is raised.
-
-        Examples:
-        Case 1:
-          Given
-            X.shape = (1, 3, 1, 5)
-          and
-            axes = [0]
-          we get:
-            Out.shape = (3, 1, 5)
-
-        Case 2:
-          Given
-            X.shape = (1, 3, 1, 5)
-          and
-            axes = []
-          we get:
-            Out.shape = (3, 5)
-    )DOC");
-  }
-};
-
-class Squeeze2Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of Squeeze operator should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of Squeeze operator should not be null.");
-
-    const auto &x_dims = ctx->GetInputDim("X");
-    // Check input tensor dims (<6) Eigen limit.
-    PADDLE_ENFORCE_LE(x_dims.size(), 6,
-                      "Invalid dimnesions, the rank of Input(X) "
-                      "should be in the range of [1, 6] (Eigen limit).");
-
-    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
-    for (int a : axes) {
-      PADDLE_ENFORCE_LT(a, x_dims.size(),
-                        "The squeeze axis should be less than input "
-                        "tensor's rank.");
-    }
-
-    auto out_dims = SqueezeOp::GetOutputShape(axes, x_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", "Out");
-    }
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true,
-                      "Output(XShape) of Squeeze operator should not be null.");
-    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
-    xshape_dims[0] = 0;
-    for (int i = 0; i < x_dims.size(); ++i) {
-      xshape_dims[i + 1] = x_dims[i];
-    }
-    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
-    ctx->ShareLoD("X", /*->*/ "XShape");
-  }
-};
-
-class Squeeze2GradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE_EQ(context->HasInput("XShape"), true,
-                      "Input(XShape) shouldn't be null.");
-    PADDLE_ENFORCE_EQ(context->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) shouldn't be null.");
-    auto xshape_dims = context->GetInputDim("XShape");
-    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
-    context->SetOutputDim(framework::GradVarName("X"), x_dims);
-    context->ShareLoD("XShape", framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze,
-// the XShape is used to carry the shape and lod of X which will be used in
-// squeeze_grad, in this way, the framework can reuse the memory of X
-// immediately the squeeze2_op is finished.
-// Considering compatibility issues, we could not fix squeeze2_op
-class Squeeze2OpMaker : public SqueezeOpMaker {
- public:
-  void Make() override {
-    SqueezeOpMaker::Make();
-    AddOutput("XShape",
-              "XShape is just used to store the shape and lod of X, which will "
-              "be used in SqueezeGradOp.")
-        .AsIntermediate();
-  }
-};
-
-class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("squeeze2_grad");
-    grad_op->SetInput("XShape", Output("XShape"));
-    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(SequeezeInplaceInferer, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(SequeezeGradInplaceInferer,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp);
-
-REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
-                  ops::Squeeze2GradOpMaker, ops::SequeezeInplaceInferer);
-REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
-                  ops::SequeezeGradInplaceInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    squeeze, ops::SqueezeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    squeeze_grad,
-    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    squeeze2_grad,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc
deleted file mode 100644
index 50fee1497e989fc2df93292253010a212c78a54f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/squeeze_op.cu.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/squeeze_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    squeeze, ops::SqueezeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    squeeze_grad,
-    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    squeeze2, ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    squeeze2_grad,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h
deleted file mode 100644
index 5aae186527543dfe6e36de59fac058524e66bf59..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/squeeze_op.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class SqueezeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *in = context.Input<framework::LoDTensor>("X");
-    auto *out = context.Output<framework::LoDTensor>("Out");
-
-    auto &axes = context.Attr<std::vector<int>>("axes");
-    auto x_dims = in->dims();
-    auto out_dims = GetOutputShape(axes, x_dims);
-
-    out->mutable_data(context.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, context.GetPlace(),
-        context.template device_context<platform::DeviceContext>(), out);
-    out->Resize(out_dims);
-  }
-
-  static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
-                                        const framework::DDim &in_dims) {
-    size_t num_squeeze_dims = squeeze_dims.size();
-    int cnt_squeezed_dims = 0;
-    bool should_squeeze[9] = {false};
-
-    // Determines number of dimensions of output tensor after squeeze.
-    // Mark and count the dimensions need to be squeezed
-    if (num_squeeze_dims == 0) {
-      for (int idx = 0; idx < in_dims.size(); ++idx) {
-        if (in_dims[idx] == 1) {
-          should_squeeze[idx] = true;
-          ++cnt_squeezed_dims;
-        }
-      }
-    } else {
-      for (size_t idx = 0; idx < num_squeeze_dims; ++idx) {
-        int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
-                                            : squeeze_dims[idx];
-        // Check current index, the upper limit has beed checked in line 36.
-        PADDLE_ENFORCE_GE(current, 0,
-                          "Invalid axis, the negative axis is out of range.");
-
-        PADDLE_ENFORCE_EQ(in_dims[current], 1,
-                          "Invalid axis index, the axis that will be squeezed "
-                          "should be equal to 1.");
-
-        if (!(should_squeeze[current])) {
-          ++cnt_squeezed_dims;
-        }
-        should_squeeze[current] = true;
-      }
-    }
-
-    // Make output dimensions
-    std::vector<int64_t> output_shape(in_dims.size() - cnt_squeezed_dims, 0);
-    for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) {
-      if (!should_squeeze[in_idx]) {
-        output_shape[out_idx++] = in_dims[in_idx];
-      }
-    }
-
-    return framework::make_ddim(output_shape);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SqueezeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *d_out =
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto in_dims = ctx.Input<framework::LoDTensor>("X")->dims();
-
-    d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
-    d_x->Resize(in_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Squeeze2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *out = context.Output<framework::LoDTensor>("Out");
-    auto *in = context.Input<framework::LoDTensor>("X");
-
-    auto &axes = context.Attr<std::vector<int>>("axes");
-
-    auto x_dims = in->dims();
-    auto out_dims =
-        SqueezeKernel<DeviceContext, T>::GetOutputShape(axes, x_dims);
-
-    out->mutable_data(context.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, context.GetPlace(),
-        context.template device_context<platform::DeviceContext>(), out);
-    out->Resize(out_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Squeeze2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *d_out =
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    // auto in_dims = d_x->dims();
-
-    auto xshape_dims = ctx.Input<framework::LoDTensor>("XShape")->dims();
-    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
-
-    d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
-    d_x->Resize(x_dims);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc
deleted file mode 100644
index 9345b495415d203728238c19621a20f446c40bf5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/stack_op.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/stack_op.h"
-
-namespace plat = paddle::platform;
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker,
-                  ops::StackGradOpDescMaker);
-REGISTER_OPERATOR(stack_grad, ops::StackOpGrad);
-
-REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel<plat::CPUDeviceContext, float>,
-                       ops::StackKernel<plat::CPUDeviceContext, double>,
-                       ops::StackKernel<plat::CPUDeviceContext, int>,
-                       ops::StackKernel<plat::CPUDeviceContext, int64_t>);
-
-REGISTER_OP_CPU_KERNEL(stack_grad,
-                       ops::StackGradKernel<plat::CPUDeviceContext, float>,
-                       ops::StackGradKernel<plat::CPUDeviceContext, double>,
-                       ops::StackGradKernel<plat::CPUDeviceContext, int>,
-                       ops::StackGradKernel<plat::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu
deleted file mode 100644
index 24d0b2f906a8e0b360c3f477c9290ebe5d57a3ff..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/stack_op.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/stack_op.h"
-
-namespace plat = paddle::platform;
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    stack, ops::StackKernel<plat::CUDADeviceContext, float>,
-    ops::StackKernel<plat::CUDADeviceContext, double>,
-    ops::StackKernel<plat::CUDADeviceContext, int>,
-    ops::StackKernel<plat::CUDADeviceContext, int64_t>,
-    ops::StackKernel<plat::CUDADeviceContext, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    stack_grad, ops::StackGradKernel<plat::CUDADeviceContext, float>,
-    ops::StackGradKernel<plat::CUDADeviceContext, double>,
-    ops::StackGradKernel<plat::CUDADeviceContext, int>,
-    ops::StackGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::StackGradKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h
deleted file mode 100644
index 9ebf166d7034cd55d9f920128580d06e53cfac18..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/stack_op.h
+++ /dev/null
@@ -1,263 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-
-#ifdef __NVCC__
-#include <thrust/device_vector.h>
-#include "paddle/fluid/framework/array.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class StackOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_GT(ctx->Inputs("X").size(), 0,
-                      "Number of Inputs(X) must be larger than 0");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist.");
-
-    auto input_dims = ctx->GetInputsDim("X");
-    for (size_t i = 1; i < input_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0],
-                        "Dims of all Inputs(X) must be the same");
-    }
-
-    // Only lod of X[0] would be shared with Y
-    ctx->ShareLoD("X", /*->*/ "Y");
-
-    int axis = ctx->Attrs().Get<int>("axis");
-    int rank = input_dims[0].size();
-    PADDLE_ENFORCE(
-        axis >= -(rank + 1) && axis < rank + 1,
-        "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank);
-    if (axis < 0) axis += (rank + 1);
-
-    auto vec = framework::vectorize<int>(input_dims[0]);
-    vec.insert(vec.begin() + axis, input_dims.size());
-    ctx->SetOutputDim("Y", framework::make_ddim(vec));
-  }
-};
-
-class StackOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of stack op.").AsDuplicable();
-    AddOutput("Y", "The output of stack op.");
-    AddAttr<int>("axis",
-                 "The axis along which all of the Inputs(X) should be stacked.")
-        .SetDefault(0);
-    AddComment(R"DOC(
-      Stack Operator.
-
-      Stack all of the Inputs(X) into one tensor along Attr(axis). The dims of all Inputs(X) must be the same.
-    )DOC");
-  }
-};
-
-template <typename VecXType, typename T>
-struct StackFunctor {
-  HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post)
-      : x_(x), y_(y), n_(n), post_(post) {}
-
-  HOSTDEVICE void operator()(int idx) {
-    int i = idx / (n_ * post_);
-    int which_x = idx / post_ - i * n_;
-    int x_index = i * post_ + idx % post_;
-    y_[idx] = x_[which_x][x_index];
-  }
-
- private:
-  VecXType x_;
-  T *y_;
-  int n_;
-  int post_;
-};
-
-template <typename VecDxType, typename T>
-struct StackGradFunctor {
-  HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post)
-      : dx_(dx), dy_(dy), n_(n), post_(post) {}
-
-  HOSTDEVICE void operator()(int idx) {
-    int i = idx / (n_ * post_);
-    int which_x = idx / post_ - i * n_;
-    int x_index = i * post_ + idx % post_;
-    dx_[which_x][x_index] = dy_[idx];
-  }
-
- private:
-  VecDxType dx_;
-  const T *dy_;
-  int n_;
-  int post_;
-};
-
-template <typename DeviceContext, typename VecXType, typename T>
-static inline void StackFunctorForRange(const DeviceContext &ctx,
-                                        const VecXType &x, T *y, int total_num,
-                                        int n, int post) {
-  platform::ForRange<DeviceContext> for_range(ctx, total_num);
-  for_range(StackFunctor<VecXType, T>(x, y, n, post));
-}
-
-template <typename DeviceContext, typename VecDxType, typename T>
-static inline void StackGradFunctorForRange(const DeviceContext &ctx,
-                                            const VecDxType &dx, const T *dy,
-                                            int total_num, int n, int post) {
-  platform::ForRange<DeviceContext> for_range(ctx, total_num);
-  for_range(StackGradFunctor<VecDxType, T>(dx, dy, n, post));
-}
-
-template <typename DeviceContext, typename T>
-class StackKernel : public framework::OpKernel<T> {
-  using Tensor = framework::LoDTensor;
-
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto x = ctx.MultiInput<Tensor>("X");
-    auto *y = ctx.Output<Tensor>("Y");
-
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis += (x[0]->dims().size() + 1);
-
-    int n = static_cast<int>(x.size());
-    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
-    std::vector<const T *> x_datas(n);
-    for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
-
-    int pre = 1, post = 1;
-    auto &dim = x[0]->dims();
-    for (auto i = 0; i < axis; ++i) pre *= dim[i];
-    for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
-
-#ifdef __NVCC__
-    int total_num = pre * n * post;
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-
-    thrust::device_vector<const T *> device_x_vec(x_datas);
-    auto x_data_arr = device_x_vec.data().get();
-
-    StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
-
-    // Wait() must be called because device_x_vec may be destructed before
-    // kernel ends
-    dev_ctx.Wait();
-#else
-    auto x_data_arr = x_datas.data();
-
-    size_t x_offset = 0;
-    size_t y_offset = 0;
-    for (int i = 0; i < pre; i++) {
-      for (int j = 0; j < n; j++) {
-        std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
-                    post * sizeof(T));
-        y_offset += post;
-      }
-      x_offset += post;
-    }
-#endif
-  }
-};
-
-class StackOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@Grad) must exist.");
-
-    int axis = ctx->Attrs().Get<int>("axis");
-    auto dy_dim = ctx->GetInputDim(framework::GradVarName("Y"));
-    int rank = dy_dim.size();
-    PADDLE_ENFORCE(axis >= -rank && axis < rank,
-                   "Attr(axis) must be inside [-rank, rank), where rank = %d",
-                   rank);
-    if (axis < 0) axis += rank;
-
-    PADDLE_ENFORCE_EQ(ctx->Outputs(framework::GradVarName("X")).size(),
-                      static_cast<size_t>(dy_dim[axis]),
-                      "Number of Outputs(X@Grad) is wrong");
-    auto vec = framework::vectorize<int>(dy_dim);
-    vec.erase(vec.begin() + axis);
-    ctx->SetOutputsDim(
-        framework::GradVarName("X"),
-        std::vector<framework::DDim>(dy_dim[axis], framework::make_ddim(vec)));
-  }
-};
-
-class StackGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("stack_grad");
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class StackGradKernel : public framework::OpKernel<T> {
-  using Tensor = framework::LoDTensor;
-
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto dx = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis += dy->dims().size();
-
-    int n = dy->dims()[axis];
-    std::vector<T *> dx_datas(n);  // NOLINT
-    for (int i = 0; i < n; i++) {
-      dx_datas[i] = dx[i]->mutable_data<T>(ctx.GetPlace());
-    }
-    auto dy_data = dy->data<T>();
-
-    int pre = 1;
-    for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
-    int total_num = dy->numel();
-    int post = total_num / (n * pre);
-
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-#ifdef __NVCC__
-    thrust::device_vector<T *> device_dx_vec(dx_datas);
-    auto dx_data_arr = device_dx_vec.data().get();
-#else
-    auto dx_data_arr = dx_datas.data();
-#endif
-    StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
-#ifdef __NVCC__
-    // Wait() must be called because device_dx_vec may be destructed before
-    // kernel ends
-    dev_ctx.Wait();
-#endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
deleted file mode 100644
index 6a99ad9a90f69ba3c96fc18dc46dfcadcb6ac631..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/strided_memcpy.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/detail/strided_memcpy.h"
-namespace paddle {
-namespace operators {
-
-// Strided memory copy from src to dst.
-//
-// The src and dst should be both on dev_ctx.GetPlace(), otherwise, there will
-// be a segment fault.
-//
-// The stride of an array (also referred to as increment, pitch or step size) is
-// the number of locations in memory between beginnings of successive array
-// elements
-//
-// For example, for tensor like [1, 3, 300, 300]. If there is no padding, the
-// stride is [270000, 90000, 300, 1].
-//
-// NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke
-// `dev_ctx.Wait()`.
-template <typename T>
-inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
-                          const framework::DDim& src_stride,
-                          const framework::DDim& dst_dim,
-                          const framework::DDim& dst_stride, T* dst) {
-  paddle::operators::detail::StridedCopyDimVisitor<T> func(
-      dev_ctx, src, src_stride, dst_stride, dst);
-  dst_dim.apply_visitor(func);
-}
-
-// Strided numel memory copy from src to dst by the specified axis
-//
-// For example, for a tensor dims [4, 20, 100], the strieded numel is
-// [8000, 2000, 100]
-//
-// NOTE: The src and dst tensor should have the same elements
-// except the specified axis.
-template <typename T>
-inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
-                                     int64_t axis, T* dst,
-                                     const framework::DDim& dst_stride_numel,
-                                     const T* src,
-                                     const framework::DDim& src_stride_numel,
-                                     int64_t size) {
-  int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
-  int64_t src_after = src_stride_numel[axis];
-  int64_t dst_after = dst_stride_numel[axis];
-  auto place = ctx.GetPlace();
-
-  PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(),
-                    "src and dst tensor should have the same dims size.");
-
-  for (int64_t i = 0; i < axis; ++i) {
-    if (i < axis) {
-      PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis],
-                        dst_stride_numel[i] / dst_stride_numel[axis],
-                        "src and dst should have the same elements "
-                        "except the specified axis.");
-    } else if (i == axis) {
-      continue;
-    } else {
-      PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i],
-                        "src and dst should have the same elements "
-                        "except the specified axis.");
-    }
-  }
-
-  for (int64_t i = 0; i < before; ++i) {
-    if (platform::is_cpu_place(place)) {
-      auto& cpu_place = boost::get<platform::CPUPlace>(place);
-      memory::Copy(cpu_place, dst + i * dst_after, cpu_place,
-                   src + i * src_after, sizeof(T) * size);
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
-      auto& cuda_ctx =
-          reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
-      memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
-                   src + i * src_after, sizeof(T) * size, cuda_ctx.stream());
-#else
-      PADDLE_THROW("Paddle is not compiled with GPU");
-#endif
-    }
-  }
-}
-
-template <typename T>
-inline void StridedMemcpyWithAxis0(
-    const platform::DeviceContext& dev_ctx, const framework::Tensor& input,
-    const std::vector<const framework::Tensor*>& shape_refer,
-    std::vector<framework::Tensor*>* outputs) {
-  const framework::DDim in_stride = stride_numel(input.dims());
-  const int axis = 0;
-  size_t input_offset = 0;
-
-  for (size_t i = 0; i < outputs->size(); ++i) {
-    auto out_stride = stride_numel(shape_refer[i]->dims());
-    auto out = outputs->at(i);
-    if (out != nullptr) {
-      StridedNumelCopyWithAxis<T>(dev_ctx, axis, out->data<T>(), out_stride,
-                                  input.data<T>() + input_offset, in_stride,
-                                  out_stride[axis]);
-    }
-    input_offset += out_stride[axis];
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc
deleted file mode 100644
index 3a450773a9d749eb3f73baa46e681e588e1fbd0f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/strided_memcpy_test.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/strided_memcpy.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/memory.h"
-
-namespace paddle {
-namespace operators {
-
-TEST(StridedMemcpy, CPUCrop) {
-  // clang-format off
-  int src[] = {
-      0, 1, 2, 0, 0,
-      0, 3, 4, 0, 0,
-      0, 0, 0, 0, 0,
-  };
-  // clang-format on
-
-  framework::DDim src_stride({5, 1});
-
-  int dst[4];
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({2, 1});
-
-  platform::CPUDeviceContext ctx;
-  StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
-
-  ASSERT_EQ(1, dst[0]);
-  ASSERT_EQ(2, dst[1]);
-  ASSERT_EQ(3, dst[2]);
-  ASSERT_EQ(4, dst[3]);
-}
-
-TEST(StridedMemcpy, CPUConcat) {
-  // clang-format off
-  int src[] = {
-      1, 2,
-      3, 4
-  };
-  // clang-format on
-
-  int dst[8];
-
-  framework::DDim src_stride({2, 1});
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({4, 1});
-  platform::CPUDeviceContext ctx;
-
-  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst);
-  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
-
-  // clang-format off
-  int expect_dst[] = {
-      1, 2, 1, 2,
-      3, 4, 3, 4
-  };
-  // clang-format on
-  for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
-    ASSERT_EQ(expect_dst[i], dst[i]);
-  }
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(StridedMemcpy, GPUCrop) {
-  // clang-format off
-  int src[] = {
-      0, 1, 2, 0, 0,
-      0, 3, 4, 0, 0,
-      0, 0, 0, 0, 0,
-  };
-  // clang-format on
-
-  platform::CUDAPlace gpu0(0);
-  platform::CPUPlace cpu;
-
-  platform::CUDADeviceContext ctx(gpu0);
-
-  auto src_allocation = memory::Alloc(gpu0, sizeof(src));
-
-  int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
-
-  framework::DDim src_stride({5, 1});
-
-  int dst[4];
-  auto dst_allocation = memory::Alloc(gpu0, sizeof(dst));
-  int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
-
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({2, 1});
-
-  StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
-                     gpu_dst);
-
-  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-  ctx.Wait();
-
-  ASSERT_EQ(1, dst[0]);
-  ASSERT_EQ(2, dst[1]);
-  ASSERT_EQ(3, dst[2]);
-  ASSERT_EQ(4, dst[3]);
-}
-
-TEST(StridedMemcpy, GPUConcat) {
-  // clang-format off
-  int src[] = {
-      1, 2,
-      3, 4
-  };
-  // clang-format on
-
-  platform::CUDAPlace gpu0(0);
-  platform::CPUPlace cpu;
-  platform::CUDADeviceContext ctx(gpu0);
-  auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
-  int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
-
-  int dst[8];
-  auto gpu_dst_allocation = memory::Alloc(gpu0, sizeof(dst));
-  int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
-
-  framework::DDim src_stride({2, 1});
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({4, 1});
-
-  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
-  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
-                     gpu_dst + 2);
-
-  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
-  ctx.Wait();
-
-  // clang-format off
-  int expect_dst[] = {
-      1, 2, 1, 2,
-      3, 4, 3, 4
-  };
-  // clang-format on
-  for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
-    ASSERT_EQ(expect_dst[i], dst[i]);
-  }
-}
-
-#endif
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc
deleted file mode 100644
index b6bbb071acc9ad8cda52f55dc9ac7700eba8a34c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/strided_slice_op.cc
+++ /dev/null
@@ -1,272 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/strided_slice_op.h"
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/slice_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class StridedSliceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
-                      "Input (Input) of slice op should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output (Out) of slice op should not be null.");
-
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_LT(in_dims.size(), 7,
-                      "The rank of input should be less than 7.");
-    auto starts = ctx->Attrs().Get<std::vector<int>>("starts");
-    auto ends = ctx->Attrs().Get<std::vector<int>>("ends");
-    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
-    auto infer_flags = ctx->Attrs().Get<std::vector<int>>("infer_flags");
-
-    auto starts_size = starts.size();
-    auto ends_size = ends.size();
-    auto strides_size = strides.size();
-
-    if (ctx->HasInputs("StartsTensorList")) {
-      auto StartsTensorList = ctx->Inputs("StartsTensorList");
-      PADDLE_ENFORCE_GT(StartsTensorList.size(), 0,
-                        "StartsTensorList size can't be zero");
-      starts_size = StartsTensorList.size();
-    }
-    if (ctx->HasInputs("EndsTensorList")) {
-      auto EndsTensorList = ctx->Inputs("EndsTensorList");
-      PADDLE_ENFORCE_GT(EndsTensorList.size(), 0,
-                        "EndsTensorList size can't be zero");
-      ends_size = EndsTensorList.size();
-    }
-    if (ctx->HasInputs("StridesTensorList")) {
-      auto StridesTensorList = ctx->Inputs("StridesTensorList");
-      PADDLE_ENFORCE_GT(StridesTensorList.size(), 0,
-                        "StridesTensorList size can't be zero");
-      strides_size = StridesTensorList.size();
-    }
-
-    auto tensor_input = false;
-    if (ctx->HasInput("EndsTensor") || ctx->HasInput("StartsTensor") ||
-        ctx->HasInput("StridesTensor")) {
-      tensor_input = true;
-    }
-    if (ctx->HasInput("EndsTensor") == false) {
-      PADDLE_ENFORCE_EQ(ends_size, axes.size(),
-                        "The size of ends must be equal to the size of axes.");
-    }
-    if (ctx->HasInput("StartsTensor") == false) {
-      PADDLE_ENFORCE_EQ(
-          starts_size, axes.size(),
-          "The size of starts must be equal to the size of axes.");
-    }
-    if (ctx->HasInput("StridesTensor") == false) {
-      PADDLE_ENFORCE_EQ(
-          strides_size, axes.size(),
-          "The size of strides must be equal to the size of axes.");
-    }
-    // we need to analysis strided slice op is valid for
-    // the parameter that we get from python front
-    std::vector<int> out_dims_vector(in_dims.size(), -1);
-    if (!tensor_input) {
-      StridedSliceOutDims(starts, ends, strides, axes, infer_flags, in_dims,
-                          out_dims_vector.data(), axes.size(), true);
-    }
-    framework::DDim out_dims(framework::make_ddim(out_dims_vector));
-
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("Input", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                   ctx.Input<Tensor>("Input")->place());
-  }
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string &var_name, const Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const override {
-    if (var_name == "StartsTensor" || var_name == "EndsTensor" ||
-        var_name == "StridesTensor") {
-      return expected_kernel_type;
-    }
-    if (var_name == "StartsTensorList" || var_name == "EndsTensorList" ||
-        var_name == "StridesTensorList") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class StridedSliceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Input", "Tensor of data to extract slices from.");
-    AddOutput("Out", "Strided Sliced data tensor.");
-
-    AddInput("StartsTensor",
-             "(Tensor<int32>, optional) If provided, slice will use this."
-             "It has the highest priority of StartsTensor, StartsTensorList "
-             "and attr(starts).")
-        .AsDispensable();
-    AddInput("EndsTensor",
-             "(Tensor<int32>, optional) If provided, slice will use this."
-             "It has the highest priority of EndsTensor, EndsTensorList and "
-             "attr(ends).")
-        .AsDispensable();
-    AddInput(
-        "StridesTensor",
-        "(Tensor<int32>, optional) If provided, slice will use this."
-        "It has the highest priority of StridesTensor, StridesTensorList and "
-        "attr(ends).")
-        .AsDispensable();
-    AddInput(
-        "StartsTensorList",
-        "(vector<Tensor<int32>>, optional) If provided, slice will use this."
-        "The shape of the tensor in vector MUST BE [1]."
-        "It has higher priority compare with attr(starts).")
-        .AsDuplicable()
-        .AsDispensable();
-    AddInput(
-        "EndsTensorList",
-        "(vector<Tensor<int32>>, optional) If provided, slice will use this."
-        "The shape of the tensor in vector MUST BE [1]."
-        "It has higher priority compare with attr(ends).")
-        .AsDuplicable()
-        .AsDispensable();
-    AddInput(
-        "StridesTensorList",
-        "(vector<Tensor<int32>>, optional) If provided, slice will use this."
-        "The shape of the tensor in vector MUST BE [1]."
-        "It has higher priority compare with attr(strides).")
-        .AsDuplicable()
-        .AsDispensable();
-    AddAttr<std::vector<int>>(
-        "axes", "(list<int>) Axes that `starts` and `ends` apply to.");
-    AddAttr<std::vector<int>>(
-        "starts", "(list<int>) Start indices for the strided slice start.")
-        .SetDefault({});
-    AddAttr<std::vector<int>>("ends",
-                              "(list<int>) End indices the tensor slice end")
-        .SetDefault({});
-    AddAttr<std::vector<int>>(
-        "strides", "(list<int> Stride step from the start to the end)")
-        .SetDefault({});
-    AddAttr<std::vector<int>>(
-        "infer_flags", "(list<int>) Flags of inferring dims in attributes.")
-        .SetDefault({});
-    AddComment(R"DOC(
-Strided Slice Operator.
-Instead of calling this op directly most users will want to use the
-NumPy-style slicing syntax.
-For Example:
-data = fluid.layers.fill_constant(shape=[3, 3], value=0, dtype='int64')
-y = fluid.layers.strided_slice(data, [0, 1], [1,0], [2, 3], [1, 1])
-)DOC");
-  }
-};
-
-class StridedSliceOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, "Input should not be null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("Input");
-    auto x_grad_name = framework::GradVarName("Input");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-  framework::OpKernelType GetKernelTypeForVar(
-      const std::string &var_name, const Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const override {
-    if (var_name == "StartsTensor" || var_name == "EndsTensor") {
-      return expected_kernel_type;
-    }
-    if (var_name == "StartsTensorList" || var_name == "EndsTensorList") {
-      return expected_kernel_type;
-    }
-    return framework::OpKernelType(expected_kernel_type.data_type_,
-                                   tensor.place(), tensor.layout());
-  }
-};
-
-class StridedSliceOpGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *bind = new framework::OpDesc();
-    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    bind->SetInput("Input", Input("Input"));
-    bind->SetInput("StartsTensor", Input("StartsTensor"));
-    bind->SetInput("EndsTensor", Input("EndsTensor"));
-    bind->SetInput("StridesTensor", Input("StridesTensor"));
-    bind->SetInput("StartsTensorList", Input("StartsTensorList"));
-    bind->SetInput("EndsTensorList", Input("EndsTensorList"));
-    bind->SetInput("StridesTensorList", Input("StridesTensorList"));
-    bind->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    bind->SetAttrMap(Attrs());
-    bind->SetType("strided_slice_grad");
-    return std::unique_ptr<framework::OpDesc>(bind);
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
-    StridedSliceOpGradNoNeedBufferVarsInference, "Input");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(strided_slice, ops::StridedSliceOp, ops::StridedSliceOpMaker,
-                  ops::StridedSliceOpGradMaker);
-REGISTER_OPERATOR(strided_slice_grad, ops::StridedSliceOpGrad,
-                  ops::StridedSliceOpGradNoNeedBufferVarsInference);
-
-REGISTER_OP_CPU_KERNEL(
-    strided_slice,
-    ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    strided_slice_grad,
-    ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/strided_slice_op.cu b/paddle/fluid/operators/strided_slice_op.cu
deleted file mode 100644
index f0c9d557b9a81ce80ef66f72638c9729a89d7be4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/strided_slice_op.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/strided_slice_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    strided_slice,
-    ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    strided_slice_grad,
-    ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/strided_slice_op.h b/paddle/fluid/operators/strided_slice_op.h
deleted file mode 100644
index 57d33f29d80fedc6bac06b60708268e41c725d26..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/strided_slice_op.h
+++ /dev/null
@@ -1,350 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <cstdlib>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/slice_op.h"
-namespace paddle {
-namespace operators {
-
-static void StridedSliceOutDims(
-    const std::vector<int>& starts, const std::vector<int>& ends,
-    const std::vector<int>& strides, const std::vector<int>& axes,
-    const std::vector<int>& infer_flags, const framework::DDim in_dims,
-    int* out_dims_vector, const size_t size, bool infer_shape) {
-  for (int i = 0; i < in_dims.size(); i++) {
-    out_dims_vector[i] = in_dims[i];
-  }
-  int stride_index, start_index, end_index;
-  for (size_t i = 0; i < size; i++) {
-    int axes_index = axes[i];
-    if (infer_shape && infer_flags[i] == -1) {
-      out_dims_vector[axes_index] = -1;
-      continue;
-    }
-
-    PADDLE_ENFORCE_NE(strides[i], 0, "stride must not to be zero");
-    start_index = starts[i];
-    end_index = ends[i];
-    stride_index = strides[i];
-    int axis_size = in_dims[axes_index];
-    if (axis_size < 0) {
-      continue;
-    }
-
-    if (start_index < 0) {
-      start_index = start_index + axis_size;
-    }
-    if (end_index < 0) {
-      end_index = end_index + axis_size;
-    }
-
-    if (stride_index < 0) {
-      start_index = start_index + 1;
-      end_index = end_index + 1;
-    }
-
-    bool zero_dim_condition =
-        ((stride_index < 0 && (start_index <= end_index)) ||
-         (stride_index > 0 && (start_index >= end_index)));
-    PADDLE_ENFORCE_EQ(zero_dim_condition, false,
-                      "starts and end must meet requirement in different "
-                      "stride conditiont");
-    int left = std::max(0, std::min(start_index, end_index));
-    int right = std::min(axis_size, std::max(start_index, end_index));
-    int step = std::abs(stride_index);
-    auto out_dims_index = (std::abs(right - left) + step - 1) / step;
-
-    out_dims_vector[axes_index] = out_dims_index;
-  }
-}
-
-static void StridedSliceFunctor(int* starts, int* ends, int* strides, int* axes,
-                                int* reverse_axis, const framework::DDim dims,
-                                const size_t size) {
-  for (size_t axis = 0; axis < size; axis++) {
-    int axis_size = dims[axes[axis]];
-    int axis_index = axis;
-    if (axis_size < 0) {
-      starts[axis_index] = 0;
-      ends[axis_index] = 1;
-      strides[axis_index] = 1;
-    }
-    // stride must not be zero
-    if (starts[axis_index] < 0) {
-      starts[axis_index] = starts[axis_index] + axis_size;
-    }
-
-    if (ends[axis_index] < 0) {
-      ends[axis_index] = ends[axis_index] + axis_size;
-    }
-    if (strides[axis_index] < 0) {
-      reverse_axis[axis_index] = 1;
-      strides[axis_index] = -strides[axis_index];
-      if (starts[axis_index] > ends[axis_index]) {
-        // swap the reverse
-        starts[axis_index] = starts[axis_index] + 1;
-        ends[axis_index] = ends[axis_index] + 1;
-      }
-      std::swap(starts[axis_index], ends[axis_index]);
-    } else {
-      reverse_axis[axis_index] = 0;
-      strides[axis_index] = strides[axis_index];
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class StridedSliceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    int rank = ctx.Input<framework::Tensor>("Input")->dims().size();
-    switch (rank) {
-      case 1:
-        StridedSliceCompute<1>(ctx);
-        break;
-      case 2:
-        StridedSliceCompute<2>(ctx);
-        break;
-      case 3:
-        StridedSliceCompute<3>(ctx);
-        break;
-      case 4:
-        StridedSliceCompute<4>(ctx);
-        break;
-      case 5:
-        StridedSliceCompute<5>(ctx);
-        break;
-      case 6:
-        StridedSliceCompute<6>(ctx);
-        break;
-    }
-  }
-
- private:
-  template <size_t D>
-  void StridedSliceCompute(const framework::ExecutionContext& context) const {
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto in = context.Input<framework::Tensor>("Input");
-    auto out = context.Output<framework::Tensor>("Out");
-    auto in_dims = in->dims();
-
-    auto starts = context.Attr<std::vector<int>>("starts");
-    auto ends = context.Attr<std::vector<int>>("ends");
-    auto strides = context.Attr<std::vector<int>>("strides");
-    auto axes = context.Attr<std::vector<int>>("axes");
-    auto infer_flags = context.Attr<std::vector<int>>("infer_flags");
-
-    auto starts_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto ends_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto strides_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto reverse_axis = Eigen::array<bool, D>();
-
-    auto list_new_ends_tensor =
-        context.MultiInput<framework::Tensor>("EndsTensorList");
-    auto list_new_starts_tensor =
-        context.MultiInput<framework::Tensor>("StartsTensorList");
-    auto list_new_strides_tensor =
-        context.MultiInput<framework::Tensor>("StridesTensorList");
-
-    if (list_new_starts_tensor.size() > 0) {
-      starts = get_new_data_from_tensorlist(list_new_starts_tensor);
-    } else if (context.HasInput("StartsTensor")) {
-      auto* starts_tensor = context.Input<framework::Tensor>("StartsTensor");
-      starts = get_new_data_from_tensor(starts_tensor);
-    }
-
-    if (list_new_ends_tensor.size() > 0) {
-      ends = get_new_data_from_tensorlist(list_new_ends_tensor);
-    } else if (context.HasInput("EndsTensor")) {
-      auto* ends_tensor = context.Input<framework::Tensor>("EndsTensor");
-      ends = get_new_data_from_tensor(ends_tensor);
-    }
-
-    if (list_new_strides_tensor.size() > 0) {
-      strides = get_new_data_from_tensorlist(list_new_strides_tensor);
-    } else if (context.HasInput("StridesTensor")) {
-      auto* strides_tensor = context.Input<framework::Tensor>("StridesTensor");
-      strides = get_new_data_from_tensor(strides_tensor);
-    }
-
-    std::vector<int> out_dims_vector(in_dims.size(), -1);
-    StridedSliceOutDims(starts, ends, strides, axes, infer_flags, in_dims,
-                        out_dims_vector.data(), axes.size(), false);
-    framework::DDim out_dims(framework::make_ddim(out_dims_vector));
-
-    std::vector<int> reverse_vector(starts.size(), 0);
-    StridedSliceFunctor(starts.data(), ends.data(), strides.data(), axes.data(),
-                        reverse_vector.data(), in_dims, starts.size());
-
-    for (size_t axis = 0; axis < D; axis++) {
-      starts_indices[axis] = 0;
-      ends_indices[axis] = out_dims[axis];
-      strides_indices[axis] = 1;
-      reverse_axis[axis] = false;
-    }
-    for (size_t axis = 0; axis < axes.size(); axis++) {
-      int axis_index = axes[axis];
-      starts_indices[axis_index] = starts[axis];
-      ends_indices[axis_index] = ends[axis];
-      strides_indices[axis_index] = strides[axis];
-      reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false;
-    }
-
-    framework::Tensor tmp;
-    tmp.mutable_data<T>(out_dims, context.GetPlace());
-
-    out->Resize(out_dims);
-    out->mutable_data<T>(context.GetPlace());
-    auto in_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *in);
-    auto tmp_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            tmp);
-    auto out_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *out, out_dims);
-    tmp_t.device(place) =
-        in_t.stridedSlice(starts_indices, ends_indices, strides_indices);
-    out_t.device(place) = tmp_t.reverse(reverse_axis);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class StridedSliceGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    size_t rank = ctx.Input<framework::Tensor>("Input")->dims().size();
-    switch (rank) {
-      case 1:
-        StridedSliceGradCompute<1>(ctx);
-        break;
-      case 2:
-        StridedSliceGradCompute<2>(ctx);
-        break;
-      case 3:
-        StridedSliceGradCompute<3>(ctx);
-        break;
-      case 4:
-        StridedSliceGradCompute<4>(ctx);
-        break;
-      case 5:
-        StridedSliceGradCompute<5>(ctx);
-        break;
-      case 6:
-        StridedSliceGradCompute<6>(ctx);
-        break;
-    }
-  }
-
- private:
-  template <size_t D>
-  void StridedSliceGradCompute(
-      const framework::ExecutionContext& context) const {
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto* d_input =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_out =
-        context.Output<framework::Tensor>(framework::GradVarName("Input"));
-    d_out->mutable_data<T>(context.GetPlace());
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, d_out, static_cast<T>(0));
-    auto out_dims = d_out->dims();
-    auto in_dims = d_input->dims();
-    auto starts = context.Attr<std::vector<int>>("starts");
-    auto ends = context.Attr<std::vector<int>>("ends");
-    auto strides = context.Attr<std::vector<int>>("strides");
-    auto axes = context.Attr<std::vector<int>>("axes");
-
-    auto list_new_ends_tensor =
-        context.MultiInput<framework::Tensor>("EndsTensorList");
-    auto list_new_starts_tensor =
-        context.MultiInput<framework::Tensor>("StartsTensorList");
-    auto list_new_strides_tensor =
-        context.MultiInput<framework::Tensor>("StridesTensorList");
-
-    if (list_new_starts_tensor.size() > 0) {
-      starts = get_new_data_from_tensorlist(list_new_starts_tensor);
-    } else if (context.HasInput("StartsTensor")) {
-      auto* starts_tensor = context.Input<framework::Tensor>("StartsTensor");
-      starts = get_new_data_from_tensor(starts_tensor);
-    }
-
-    if (list_new_ends_tensor.size() > 0) {
-      ends = get_new_data_from_tensorlist(list_new_ends_tensor);
-    } else if (context.HasInput("EndsTensor")) {
-      auto* ends_tensor = context.Input<framework::Tensor>("EndsTensor");
-      ends = get_new_data_from_tensor(ends_tensor);
-    }
-
-    if (list_new_strides_tensor.size() > 0) {
-      strides = get_new_data_from_tensorlist(list_new_strides_tensor);
-    } else if (context.HasInput("StridesTensor")) {
-      auto* strides_tensor = context.Input<framework::Tensor>("StridesTensor");
-      strides = get_new_data_from_tensor(strides_tensor);
-    }
-
-    auto starts_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto ends_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto strides_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-
-    auto reverse_axis = Eigen::array<bool, D>();
-    std::vector<int> reverse_vector(starts.size(), 0);
-
-    StridedSliceFunctor(starts.data(), ends.data(), strides.data(), axes.data(),
-                        reverse_vector.data(), out_dims, starts.size());
-
-    for (size_t axis = 0; axis < D; axis++) {
-      starts_indices[axis] = 0;
-      ends_indices[axis] = out_dims[axis];
-      strides_indices[axis] = 1;
-    }
-    for (size_t axis = 0; axis < axes.size(); axis++) {
-      int axis_index = axes[axis];
-      starts_indices[axis_index] = starts[axis];
-      ends_indices[axis_index] = ends[axis];
-      strides_indices[axis_index] = strides[axis];
-      reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false;
-    }
-
-    framework::Tensor reverse_input;
-    reverse_input.mutable_data<T>(in_dims, context.GetPlace());
-
-    auto in_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *d_input);
-    auto reverse_in_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            reverse_input);
-    auto out_t =
-        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *d_out, out_dims);
-
-    reverse_in_t.device(place) = in_t.reverse(reverse_axis);
-    out_t.stridedSlice(starts_indices, ends_indices, strides_indices)
-        .device(place) = reverse_in_t;
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
deleted file mode 100644
index 37204fd72aef0f29e3b399bf06123b8f7715358e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sum_op.cc
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sum_op.h"
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/var_type_inference.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-using framework::Tensor;
-
-class SumOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SumOp should not be null.");
-    if (ctx->IsRuntime() &&
-        ctx->GetOutputsVarType("Out")[0] ==
-            framework::proto::VarType::LOD_TENSOR_ARRAY) {
-      return;  // skip runtime infershape when is tensor array;
-    }
-
-    auto x_var_types = ctx->GetInputsVarType("X");
-    auto x_dims = ctx->GetInputsDim("X");
-
-    size_t N = x_dims.size();
-    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
-    if (N == 1) {
-      VLOG(3) << "Warning: sum have only one input, may waste memory";
-    }
-
-    framework::DDim in_dim({0});
-    for (size_t i = 0; i < x_dims.size(); ++i) {
-      auto& x_dim = x_dims[i];
-      // x_dim.size() == 1 means the real dim of selected rows is [0]
-      if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS &&
-          x_dim.size() == 1) {
-        continue;
-      }
-      if (framework::product(x_dim) == 0) {
-        continue;
-      }
-      if (framework::product(in_dim) == 0) {
-        in_dim = x_dim;
-      } else {
-        if (ctx->IsRuntime()) {
-          PADDLE_ENFORCE_EQ(in_dim, x_dim,
-                            "Input tensors must have same shape");
-        } else {
-          PADDLE_ENFORCE_EQ(in_dim.size(), x_dim.size(),
-                            "Input tensors must have same shape size");
-          // if in_dim or x_dim has -1, not check equal
-          for (int i = 0; i < x_dim.size(); ++i) {
-            if (x_dim[i] == -1 || in_dim[i] == -1) {
-              continue;
-            }
-            PADDLE_ENFORCE_EQ(in_dim[i], x_dim[i],
-                              "Input tensors must have same shape if not -1");
-          }
-        }
-      }
-    }
-    ctx->SetOutputDim("Out", in_dim);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto x_vars = ctx.MultiInputVar("X");
-    auto x_vars_name = ctx.Inputs("X");
-
-    framework::LibraryType library{framework::LibraryType::kPlain};
-    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
-    }
-#endif
-
-    if (x_vars[0]->IsType<framework::LoDTensor>()) {
-      int dtype = -1;
-      for (size_t idx = 0; idx < x_vars.size(); ++idx) {
-        PADDLE_ENFORCE(x_vars[idx] != nullptr,
-                       "Input var[%s] should not be nullptr", x_vars_name[idx]);
-        auto tensor =
-            framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_vars[idx]);
-        if (tensor->numel() <= 0 || (!tensor->IsInitialized())) {
-          continue;
-        }
-        if (dtype == -1) {
-          dtype = tensor->type();
-        } else {
-          PADDLE_ENFORCE_EQ(dtype, tensor->type());
-        }
-      }
-      PADDLE_ENFORCE_NE(dtype, -1,
-                        "Sum operator should have at least one tensor");
-
-      return framework::OpKernelType(
-          static_cast<framework::proto::VarType::Type>(dtype), ctx.GetPlace(),
-          layout, library);
-    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
-      for (auto& var : x_vars) {
-        auto& value = var->Get<framework::SelectedRows>().value();
-        if (value.IsInitialized()) {
-          return framework::OpKernelType(value.type(), ctx.device_context(),
-                                         layout, library);
-        }
-      }
-      // if input sparse vars are not initialized, use an default kernel type.
-      return framework::OpKernelType(framework::proto::VarType::FP32,
-                                     ctx.device_context(), layout, library);
-    } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
-      for (auto& x_var : x_vars) {
-        auto& array = x_var->Get<framework::LoDTensorArray>();
-        for (auto& each : array) {
-          if (each.numel() != 0 && each.IsInitialized()) {
-            return framework::OpKernelType(each.type(), ctx.device_context(),
-                                           layout, library);
-          }
-        }
-      }
-      PADDLE_THROW("Cannot find the input data type by all input data");
-    }
-    PADDLE_THROW("Unexpected branch. Input type is %s",
-                 framework::ToTypeName(x_vars[0]->Type()));
-  }
-};
-
-class SumOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
-        .AsDuplicable();
-    AddOutput("Out", "(Tensor) The output tensor of sum operator.");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Sum operator.
-
-This operators sums the input tensors. All the inputs can carry the
-LoD (Level of Details) information. However, the output only shares
-the LoD information with the first input.
-)DOC");
-  }
-};
-
-class SumOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto& inputs = ctx->Input("X");
-    auto var_type = framework::proto::VarType::SELECTED_ROWS;
-    for (auto& name : ctx->Input("X")) {
-      VLOG(10) << name << " " << ctx->GetType(name);
-    }
-
-    bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [ctx](const std::string& name) {
-          return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR;
-        });
-
-    auto is_tensor_array = [ctx](const std::string& name) {
-      return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY;
-    };
-
-    bool any_input_is_tensor_array =
-        std::any_of(inputs.begin(), inputs.end(), is_tensor_array);
-    bool all_inputs_are_tensor_array =
-        std::all_of(inputs.begin(), inputs.end(), is_tensor_array);
-
-    if (any_input_is_tensor_array) {
-      if (!all_inputs_are_tensor_array) {
-        std::ostringstream os;
-        for (auto& each : inputs) {
-          os << "    " << each << " type is " << ctx->GetType(each) << "\n";
-        }
-        PADDLE_ENFORCE(all_inputs_are_tensor_array,
-                       "Not all inputs are tensor array:\n%s", os.str());
-      }
-      var_type = framework::proto::VarType::LOD_TENSOR_ARRAY;
-    } else if (any_input_is_lod_tensor) {
-      var_type = framework::proto::VarType::LOD_TENSOR;
-    }
-
-    auto out_var_name = ctx->Output("Out").front();
-    ctx->SetType(out_var_name, var_type);
-    ctx->SetDataType(out_var_name, ctx->GetDataType(inputs.front()));
-  }
-};
-
-class SumGradMaker : public framework::GradOpDescMakerBase {
- public:
-  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
-
-  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
-    auto x_grads = InputGrad("X", false);
-    std::vector<std::unique_ptr<framework::OpDesc>> grad_ops;
-    grad_ops.reserve(x_grads.size());
-    auto og = OutputGrad("Out");
-    std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops),
-                   [&og](const std::string& x_grad) {
-                     auto* grad_op = new framework::OpDesc();
-                     grad_op->SetType("scale");
-                     grad_op->SetInput("X", og);
-                     grad_op->SetOutput("Out", {x_grad});
-                     grad_op->SetAttr("scale", 1.0f);
-                     return std::unique_ptr<framework::OpDesc>(grad_op);
-                   });
-    return grad_ops;
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(SumInplace, {"X", "Out"});
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
-                  ops::SumOpVarTypeInference, ops::SumInplace);
-
-REGISTER_OP_CPU_KERNEL(
-    sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SumKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SumKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
deleted file mode 100644
index 3564ed0c4f0faf45461374ba1faa68c1c7992cb6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sum_op.cu
+++ /dev/null
@@ -1,263 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/fluid/platform/device_context.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/sum_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace plat = paddle::platform;
-
-namespace paddle {
-namespace operators {
-
-#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
-
-using LoDTensor = framework::LoDTensor;
-
-template <class T>
-__global__ void Sum2CUDAKernel(const T *in_0, const T *in_1, T *out,
-                               int64_t N) {
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
-  while (id < N) {
-    out[id] = in_0[id] + in_1[id];
-    id += blockDim.x * gridDim.x;
-  }
-}
-
-template <class T>
-__global__ void SumArrayCUDAKernel(T **in, T *out, int64_t N, size_t in_size,
-                                   bool read_dst) {
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
-  while (id < N) {
-    T total(read_dst ? out[id] : static_cast<T>(0));
-    for (int i = 0; i < in_size; ++i) {
-      const T *tmp = in[i];
-      if (tmp) {
-        total += tmp[id];
-      }
-    }
-    out[id] = total;
-    id += blockDim.x * gridDim.x;
-  }
-}
-
-template <class T>
-__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out, int64_t N,
-                                          size_t rows) {
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
-  while (id < N) {
-    for (int i = 0; i < 2 * rows; i += 2) {
-      const T *tmp = sr_in_out[i];
-      T *tmp_out = sr_in_out[i + 1];
-      if (tmp && tmp_out) {
-        tmp_out[id] += tmp[id];
-      }
-    }
-    id += blockDim.x * gridDim.x;
-  }
-}
-
-template <class T>
-__global__ void SumAlign4CUDAKernel(const T *in_0, const T *in_1, T *out,
-                                    int64_t N) {
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = id; i < N / 4; i += blockDim.x * gridDim.x) {
-    const float4 *in0_4 = reinterpret_cast<float4 *>(in_0);
-    const float4 *in1_4 = reinterpret_cast<float4 *>(in_1);
-    float4 tmp;
-    tmp.x = in0_4[i].x + in1_4[i].x;
-    tmp.y = in0_4[i].y + in1_4[i].y;
-    tmp.z = in0_4[i].z + in1_4[i].z;
-    tmp.w = in0_4[i].w + in1_4[i].w;
-    reinterpret_cast<float4 *>(out)[i] = tmp;
-  }
-}
-
-template <class T>
-void SumToLoDTensor(const framework::ExecutionContext &context) {
-  auto in_vars = context.MultiInputVar("X");
-  const size_t in_num = in_vars.size();
-
-  constexpr size_t theory_sm_threads = 1024;
-  auto &dev_ctx =
-      context.template device_context<platform::CUDADeviceContext>();
-  auto stream = dev_ctx.stream();
-
-  auto max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-  auto sm_count = max_threads / theory_sm_threads;
-  size_t tile_size = 0;
-  dim3 grids;
-  dim3 blocks;
-
-  auto ComputeKernelParameter = [&](size_t length) {
-    if (length >= max_threads)
-      tile_size = 1024;
-    else if (length < max_threads && length > sm_count * 128)
-      tile_size = 512;
-    else if (length <= sm_count * 128)
-      tile_size = 256;
-    grids = dim3(CEIL_DIV(length, tile_size), 1, 1);
-    blocks = dim3(tile_size, 1, 1);
-  };
-
-  auto *out = context.Output<LoDTensor>("Out");
-  bool in_place = in_vars[0] == context.OutputVar("Out");
-
-  if (!in_place) {
-    auto *out_ptr = out->mutable_data<T>(context.GetPlace());
-    if (in_num >= 1 && in_vars[0]->IsType<framework::LoDTensor>()) {
-      auto &in_0_tensor = in_vars[0]->Get<framework::LoDTensor>();
-      if (in_0_tensor.numel() > 0) {
-        in_place = (in_0_tensor.data<T>() == out_ptr);
-      }
-    }
-  }
-
-  // Sum of two tensors
-  if (in_num == 2 && in_vars[0]->IsType<framework::LoDTensor>() &&
-      in_vars[1]->IsType<framework::LoDTensor>()) {
-    auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
-    auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
-
-    auto length = in_0.numel();
-    if (length && in_0.IsInitialized() && in_1.IsInitialized()) {
-      auto result = EigenVector<T>::Flatten(*out);
-      auto &place = *dev_ctx.eigen_device();
-      auto in_0_e = EigenVector<T>::Flatten(in_0);
-      auto in_1_e = EigenVector<T>::Flatten(in_1);
-      result.device(place) = in_0_e + in_1_e;
-    } else if (length && in_0.IsInitialized()) {
-      auto result = EigenVector<T>::Flatten(*out);
-      auto &place = *dev_ctx.eigen_device();
-      result.device(place) = EigenVector<T>::Flatten(in_0);
-    } else if (length && in_1.IsInitialized()) {
-      auto result = EigenVector<T>::Flatten(*out);
-      auto &place = *dev_ctx.eigen_device();
-      result.device(place) = EigenVector<T>::Flatten(in_1);
-    }
-    return;
-  }
-
-  int start = in_place ? 1 : 0;
-  if (!in_place) {
-    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
-    constant_functor(
-        context.template device_context<platform::CUDADeviceContext>(), out,
-        static_cast<T>(0));
-  }
-
-  std::vector<const T *> in_data;
-  std::vector<int> selectrow_index;
-  int64_t lod_length = 0;
-  bool dst_write = false;
-  for (int i = start; i < in_num; ++i) {
-    if (in_vars[i]->IsType<framework::LoDTensor>()) {
-      auto &in_i = in_vars[i]->Get<framework::LoDTensor>();
-      in_data.emplace_back(in_i.data<T>());
-      lod_length = in_i.numel();
-    } else if (in_vars[i]->IsType<framework::SelectedRows>()) {
-      selectrow_index.push_back(i);
-    }
-  }
-
-  // compute select rows seperately.
-  if (!selectrow_index.empty()) {
-    std::vector<const T *> sr_in_out_data;
-    size_t rows = 0;
-    int64_t length = 0;
-    for (auto index : selectrow_index) {
-      auto &sr = in_vars[index]->Get<framework::SelectedRows>();
-      auto &sr_value = sr.value();
-      auto &sr_rows = sr.rows();
-
-      auto row_numel = sr_value.numel() / sr_rows.size();
-      auto out_dims = out->dims();
-
-      PADDLE_ENFORCE_EQ(sr.height(), out_dims[0]);
-      PADDLE_ENFORCE_EQ(row_numel, out->numel() / sr.height());
-
-      auto *sr_data = sr_value.data<T>();
-      auto *sr_out_data = out->data<T>();
-      rows += sr_rows.size();
-      length = row_numel;
-
-      for (size_t i = 0; i < sr_rows.size(); ++i) {
-        sr_in_out_data.emplace_back(&sr_data[i * row_numel]);
-        sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]);
-      }
-    }
-    if (!sr_in_out_data.empty()) {
-      auto tmp_sr_in_out_array =
-          memory::Alloc(dev_ctx, sr_in_out_data.size() * sizeof(T *));
-
-      memory::Copy(boost::get<platform::CUDAPlace>(dev_ctx.GetPlace()),
-                   tmp_sr_in_out_array->ptr(), platform::CPUPlace(),
-                   reinterpret_cast<void *>(sr_in_out_data.data()),
-                   sr_in_out_data.size() * sizeof(T *), dev_ctx.stream());
-
-      T **sr_in_out_array_data =
-          reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());
-
-      ComputeKernelParameter(length);
-      SumSelectedRowsCUDAKernel<T><<<grids, blocks, 0, stream>>>(
-          sr_in_out_array_data, length, rows);
-      dst_write = true;
-    }
-  }
-  // if indata not null, merge into one kernel call.
-  if (!in_data.empty()) {
-    auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *));
-
-    memory::Copy(boost::get<platform::CUDAPlace>(dev_ctx.GetPlace()),
-                 tmp_in_array->ptr(), platform::CPUPlace(),
-                 reinterpret_cast<void *>(in_data.data()),
-                 in_data.size() * sizeof(T *), dev_ctx.stream());
-
-    T **in_array_data = reinterpret_cast<T **>(tmp_in_array->ptr());
-    ComputeKernelParameter(lod_length);
-    SumArrayCUDAKernel<T><<<grids, blocks, 0, stream>>>(
-        in_array_data, out->data<T>(), lod_length, in_data.size(),
-        dst_write | in_place);
-  }
-}
-
-template <typename T>
-class SumKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto out_var = context.OutputVar("Out");
-
-    if (out_var->IsType<framework::LoDTensor>()) {
-      SumToLoDTensor<T>(context);
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      SelectedRowsCompute<platform::CUDADeviceContext, T>(context);
-    } else if (out_var->IsType<framework::LoDTensorArray>()) {
-      LodTensorArrayCompute<platform::CUDADeviceContext, T>(context);
-    } else {
-      PADDLE_THROW("Unexpected branch, output variable type is %s",
-                   framework::ToTypeName(out_var->Type()));
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
deleted file mode 100644
index 3b7f42927917cce74b0c0965b4b182db7ce545e4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sum_op.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using SelectedRows = framework::SelectedRows;
-using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-void SelectedRowsCompute(const framework::ExecutionContext &context) {
-  auto in_vars = context.MultiInputVar("X");
-  auto out_var = context.OutputVar("Out");
-  bool in_place = out_var == in_vars[0];
-
-  if (in_place && in_vars.size() < 2) {
-    return;
-  }
-
-  std::vector<const paddle::framework::SelectedRows *> inputs;
-  SelectedRows temp_in0;
-
-  if (in_place) {
-    auto &in0 = in_vars[0]->Get<SelectedRows>();
-    temp_in0.set_height(in0.height());
-    temp_in0.set_rows(in0.rows());
-    framework::TensorCopy(in0.value(), in0.place(), context.device_context(),
-                          temp_in0.mutable_value());
-    inputs.push_back(&temp_in0);
-    for (size_t i = 1; i < in_vars.size(); ++i) {
-      auto &in = in_vars[i]->Get<SelectedRows>();
-      if (in.rows().size() > 0) {
-        inputs.push_back(&in);
-      }
-    }
-  } else {
-    for (auto &in_var : in_vars) {
-      auto &in = in_var->Get<SelectedRows>();
-      if (in.rows().size() > 0) {
-        inputs.push_back(&in_var->Get<SelectedRows>());
-      }
-    }
-  }
-
-  auto *out = context.Output<SelectedRows>("Out");
-  out->mutable_rows()->clear();
-
-  bool has_data = false;
-  for (auto &in : inputs) {
-    if (in->rows().size() > 0) {
-      has_data = true;
-      break;
-    }
-  }
-  if (has_data) {
-    math::scatter::MergeAdd<DeviceContext, T> merge_add;
-    merge_add(context.template device_context<DeviceContext>(), inputs, out);
-
-    out->SyncIndex();
-
-  } else {
-    // no data, just set a empty out tensor.
-    out->mutable_value()->mutable_data<T>(framework::make_ddim({0}),
-                                          context.GetPlace());
-  }
-}
-
-template <typename DeviceContext, typename T>
-void LodTensorArrayCompute(const framework::ExecutionContext &context) {
-  auto in_vars = context.MultiInputVar("X");
-  auto out_var = context.OutputVar("Out");
-  bool in_place = out_var == in_vars[0];
-  auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
-  for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
-    PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(), true,
-                      "Only support all inputs are TensorArray");
-    auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
-
-    for (size_t i = 0; i < in_array.size(); ++i) {
-      if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
-        if (i >= out_array.size()) {
-          out_array.resize(i + 1);
-        }
-        if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
-          framework::TensorCopy(in_array[i], in_array[i].place(),
-                                context.device_context(), &out_array[i]);
-          out_array[i].set_lod(in_array[i].lod());
-        } else {
-          PADDLE_ENFORCE_EQ(out_array[i].lod(), in_array[i].lod());
-          auto in = EigenVector<T>::Flatten(in_array[i]);
-          auto result = EigenVector<T>::Flatten(out_array[i]);
-          result.device(*context.template device_context<DeviceContext>()
-                             .eigen_device()) = result + in;
-        }
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class SumKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto in_vars = context.MultiInputVar("X");
-    size_t in_num = in_vars.size();
-    auto out_var = context.OutputVar("Out");
-
-    bool in_place = out_var == in_vars[0];
-
-    if (out_var->IsType<framework::LoDTensor>()) {
-      auto *out = out_var->GetMutable<framework::LoDTensor>();
-      auto *out_ptr = out->mutable_data<T>(context.GetPlace());
-      if (in_num >= 1 && in_vars[0]->IsType<framework::LoDTensor>()) {
-        auto &in_0_tensor = in_vars[0]->Get<framework::LoDTensor>();
-        if (in_0_tensor.numel() > 0) {
-          in_place = (in_0_tensor.data<T>() == out_ptr);
-        }
-      }
-
-      auto result = EigenVector<T>::Flatten(*out);
-      auto &place =
-          *context.template device_context<DeviceContext>().eigen_device();
-      int start = in_place ? 1 : 0;
-      if (!in_place) {
-        if ((in_num >= 2) && in_vars[0]->IsType<framework::LoDTensor>() &&
-            in_vars[1]->IsType<framework::LoDTensor>()) {
-          auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
-          auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
-          if (in_0.numel() && in_1.numel()) {
-            auto in_0_e = EigenVector<T>::Flatten(in_0);
-            auto in_1_e = EigenVector<T>::Flatten(in_1);
-            result.device(place) = in_0_e + in_1_e;
-            start = 2;
-          }
-        }
-        if (start != 2) {
-          math::SetConstant<DeviceContext, T> constant_functor;
-          constant_functor(context.template device_context<DeviceContext>(),
-                           out, static_cast<T>(0));
-        }
-      }
-
-      math::SelectedRowsAddToTensor<DeviceContext, T> functor;
-      // If in_place, just skip the first tensor
-      for (size_t i = start; i < in_num; i++) {
-        if (in_vars[i]->IsType<framework::LoDTensor>()) {
-          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
-          if (in_t.numel() == 0) {
-            continue;
-          }
-          auto in = EigenVector<T>::Flatten(in_t);
-          result.device(place) = result + in;
-        } else if (in_vars[i]->IsType<framework::SelectedRows>()) {
-          auto &in_t = in_vars[i]->Get<framework::SelectedRows>();
-          functor(context.template device_context<DeviceContext>(), in_t, out);
-        } else {
-          PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
-        }
-      }
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      SelectedRowsCompute<DeviceContext, T>(context);
-    } else if (out_var->IsType<framework::LoDTensorArray>()) {
-      LodTensorArrayCompute<DeviceContext, T>(context);
-    } else {
-      PADDLE_THROW("Unexpected branch, output variable type is %s",
-                   framework::ToTypeName(out_var->Type()));
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc
deleted file mode 100644
index d6cf27fd779eeddc94c1839e46892a99f61bd1bf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sync_batch_norm_op.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/batch_norm_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
-REGISTER_OPERATOR(sync_batch_norm_grad, ops::BatchNormGradOp);
diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu
deleted file mode 100644
index fb4ae48eb07c425f79fa37a3b9374059149f1a77..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sync_batch_norm_op.cu
+++ /dev/null
@@ -1,473 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// clang-format off
-#include <algorithm>
-#include <cfloat>
-#include <cmath>
-#include <string>
-#include <vector>
-#include "cub/cub.cuh"
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/operators/norm_utils.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using DataLayout = framework::DataLayout;
-template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
-template <typename T>
-using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
-
-template <typename T, int BlockDim, framework::DataLayout layout>
-__global__ void KeLocalStats(const T *x, int N, int M, int C,
-                             BatchNormParamType<T> *mean_var) {
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  for (int k = blockIdx.x; k < C; k += gridDim.x) {
-    BatchNormParamType<T> x_sum = 0.;
-    BatchNormParamType<T> x2_sum = 0.;
-    for (int i = threadIdx.x; i < N * M; i += BlockDim) {
-      int id = layout == framework::DataLayout::kNCHW
-                   ? (i / M) * C * M + k * M + i % M
-                   : i * C + k;
-      auto x_in = static_cast<BatchNormParamType<T>>(x[id]);
-      x_sum += x_in;
-      x2_sum += x_in * x_in;
-    }
-    __syncthreads();
-    auto out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum());
-    __syncthreads();
-    if (threadIdx.x == 0) {
-      mean_var[k] = out / (N * M);
-    }
-    out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum());
-    __syncthreads();
-    if (threadIdx.x == 0) {
-      mean_var[k + C] = out / (N * M);
-    }
-  }
-  if (blockIdx.x == 0 && threadIdx.x == 0) {
-    mean_var[2 * C] = static_cast<BatchNormParamType<T>>(1.0);
-  }
-}
-
-template <typename T>
-__global__ void KeSyncAndMovingStats(
-    BatchNormParamType<T> *means, BatchNormParamType<T> *variances,
-    BatchNormParamType<T> *num_dev, const int C,
-    const BatchNormParamType<T> momentum, const double epsilon,
-    BatchNormParamType<T> *sv_mean_data, BatchNormParamType<T> *sv_inv_var_data,
-    BatchNormParamType<T> *moving_means,
-    BatchNormParamType<T> *moving_variances) {
-  // sync stats across multi-devices
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = gid; i < C; i += stride) {
-    auto mean = means[i] / (*num_dev);
-    auto var = variances[i] / (*num_dev);
-    var = var - mean * mean;
-
-    // sync stats
-    sv_mean_data[i] = mean;
-    sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon);
-    variances[i] = var;
-
-    // moving stats
-    moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum);
-    moving_variances[i] =
-        moving_variances[i] * momentum + var * (1. - momentum);
-  }
-}
-
-template <typename T, framework::DataLayout layout>
-static __global__ void KeNormAffine(const T *x,
-                                    const BatchNormParamType<T> *scale,
-                                    const BatchNormParamType<T> *bias,
-                                    const BatchNormParamType<T> *mean,
-                                    const BatchNormParamType<T> *variance,
-                                    const double epsilon, const int C,
-                                    const int M, const int num, T *y) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
-    auto x_i = static_cast<BatchNormParamType<T>>(x[i]);
-    auto y_i =
-        (x_i - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c];
-    y[i] = static_cast<T>(y_i);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class SyncBatchNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const float momentum = ctx.Attr<float>("momentum");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    const std::string layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout layout = framework::StringToDataLayout(layout_str);
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    PADDLE_ENFORCE(
-        !use_global_stats,
-        "sync_batch_norm doesn't support to set use_global_stats True. ",
-        "Please use batch_norm in this case.");
-
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 2 and 5");
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
-    int x_numel = x->numel();
-
-    const T *x_d = x->data<T>();
-    const auto *s_d = ctx.Input<Tensor>("Scale")->data<BatchNormParamType<T>>();
-    const auto *b_d = ctx.Input<Tensor>("Bias")->data<BatchNormParamType<T>>();
-
-    auto *y = ctx.Output<Tensor>("Y");
-    T *y_d = y->mutable_data<T>(ctx.GetPlace());
-
-    const BatchNormParamType<T> *mean_data = nullptr;
-    const BatchNormParamType<T> *var_data = nullptr;
-
-    auto &dev_ctx = ctx.cuda_device_context();
-    auto stream = dev_ctx.stream();
-    auto *comm = dev_ctx.nccl_comm();
-    const int block = 512;
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-
-    paddle::memory::AllocationPtr alloc_ptr{nullptr};
-
-    if (is_test) {
-      const auto *est_mean = ctx.Input<Tensor>("Mean");
-      const auto *est_var = ctx.Input<Tensor>("Variance");
-      mean_data = est_mean->data<BatchNormParamType<T>>();
-      var_data = est_var->data<BatchNormParamType<T>>();
-    } else {
-      // x, x^2, 1, here 1 is used to calc device num
-      // device num also can be got from platform::DeviceContextPool
-      const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
-      alloc_ptr = memory::Alloc(dev_ctx, bytes);
-
-      auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());
-      const int threads = 256;
-      int grid = std::min(C, (max_threads + threads - 1) / threads);
-      if (layout == framework::DataLayout::kNCHW) {
-        KeLocalStats<T, threads, framework::DataLayout::kNCHW>
-            <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
-      } else {
-        KeLocalStats<T, threads, framework::DataLayout::kNHWC>
-            <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
-      }
-
-      // moving mean/variance
-      auto *mean_out = ctx.Output<Tensor>("MeanOut");
-      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-      auto *est_mean_data =
-          mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      auto *est_var_data =
-          variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-
-      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-      auto *saved_inv_variance = ctx.Output<Tensor>("SavedVariance");
-      auto *sv_mean_data =
-          saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      auto *sv_inv_var_data =
-          saved_inv_variance->mutable_data<BatchNormParamType<T>>(
-              ctx.GetPlace());
-
-      Tensor c_g_st;
-      auto *c_g_st_d = c_g_st.mutable_data<BatchNormParamType<T>>(
-          {2 * C + 1}, platform::CPUPlace());
-      auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-      memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0);
-
-      int dtype = platform::ToNCCLDataType(mean_out->type());
-      // In-place operation
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
-          stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
-          comm, stream));
-
-      // Note, Input('Mean')/Input('Variance') share variable with
-      // Output('MeanOut')/Output('VarianceOut')
-      KeSyncAndMovingStats<T><<<(C + block - 1) / block, block, 0, stream>>>(
-          stats, stats + C, stats + 2 * C, C, momentum, epsilon, sv_mean_data,
-          sv_inv_var_data, est_mean_data, est_var_data);
-
-      mean_data = sv_mean_data;
-      var_data = stats + C;
-    }
-
-    int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
-    if (layout == framework::DataLayout::kNCHW) {
-      KeNormAffine<T, framework::DataLayout::kNCHW>
-          <<<grid2, block, 0, stream>>>(x_d, s_d, b_d, mean_data, var_data,
-                                        epsilon, C, H * W * D, x_numel, y_d);
-    } else {
-      KeNormAffine<T, framework::DataLayout::kNHWC>
-          <<<grid2, block, 0, stream>>>(x_d, s_d, b_d, mean_data, var_data,
-                                        epsilon, C, H * W * D, x_numel, y_d);
-    }
-  }
-};
-
-template <typename T, const int BlockDim, framework::DataLayout layout>
-__global__ void KeBackwardLocalStats(const T *dy, const T *x,
-                                     const BatchNormParamType<T> *means, int N,
-                                     int M, int C,
-                                     BatchNormParamType<T> *sum_dy_prod) {
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  for (int k = blockIdx.x; k < C; k += gridDim.x) {
-    BatchNormParamType<T> sum1 = 0.;
-    BatchNormParamType<T> sum2 = 0.;
-    auto mean = means[k];
-    for (int i = threadIdx.x; i < N * M; i += blockDim.x) {
-      int id = layout == framework::DataLayout::kNCHW
-                   ? (i / M) * C * M + k * M + i % M
-                   : i * C + k;
-      auto g = static_cast<BatchNormParamType<T>>(dy[id]);
-      sum1 += g;
-      auto x_i = static_cast<BatchNormParamType<T>>(x[id]);
-      sum2 += g * (x_i - mean);
-    }
-
-    __syncthreads();
-    auto out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum());
-    __syncthreads();
-    if (threadIdx.x == 0) {
-      sum_dy_prod[k] = out;
-    }
-    out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum());
-    __syncthreads();
-    if (threadIdx.x == 0) {
-      sum_dy_prod[k + C] = out;
-    }
-  }
-  if (blockIdx.x == 0 && threadIdx.x == 0) {
-    sum_dy_prod[2 * C] = 1.0;
-  }
-}
-
-template <typename T, int BlockDim, framework::DataLayout layout>
-static __global__ void KeBNBackwardScaleBias(
-    const T *dy, const T *x, const BatchNormParamType<T> *mean,
-    const BatchNormParamType<T> *inv_variance, const double epsilon,
-    const int N, const int C, const int HxW, BatchNormParamType<T> *dscale,
-    BatchNormParamType<T> *dbias) {
-  const int outer_size = C;
-  const int inner_size = N * HxW;
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
-    BatchNormParamType<T> ds_sum = 0.;
-    BatchNormParamType<T> db_sum = 0.;
-
-    auto inv_var_i = inv_variance[i];
-    auto mean_i = mean[i];
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int id = layout == framework::DataLayout::kNCHW
-                         ? ((j / HxW) * C + i) * HxW + (j % HxW)
-                         : j * outer_size + i;
-      auto x_i = static_cast<BatchNormParamType<T>>(x[id]);
-      auto dy_i = static_cast<BatchNormParamType<T>>(dy[id]);
-      ds_sum += dy_i * (x_i - mean_i);
-      db_sum += dy_i;
-    }
-    __syncthreads();
-    auto os = BlockReduce(temp_storage).Reduce(ds_sum, cub::Sum());
-    __syncthreads();
-    auto ob = BlockReduce(temp_storage).Reduce(db_sum, cub::Sum());
-    __syncthreads();
-    if (threadIdx.x == 0) {
-      dscale[i] = os * inv_var_i;
-      dbias[i] = ob;
-    }
-    __syncthreads();
-  }
-}
-
-template <typename T, framework::DataLayout layout>
-static __global__ void KeBNBackwardData(
-    const T *dy, const T *x, const BatchNormParamType<T> *gamma,
-    const BatchNormParamType<T> *mean,
-    const BatchNormParamType<T> *inv_variance,
-    const BatchNormParamType<T> *g_sum_dy,
-    const BatchNormParamType<T> *g_sum_dy_prod,
-    const BatchNormParamType<T> *num_dev, const double epsilon, const int C,
-    const int HxW, const int num, T *dx) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  auto scale = static_cast<BatchNormParamType<T>>(C) / num;
-  auto dev_num = num_dev[0];
-  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
-    auto inv_var = inv_variance[c];
-    auto s_d = gamma[c];
-    auto gvar =
-        -((g_sum_dy_prod[c] / dev_num) * s_d * inv_var * (inv_var * inv_var));
-    auto gmean = -((g_sum_dy[c] / dev_num) * s_d * inv_var);
-
-    auto x_i = static_cast<BatchNormParamType<T>>(x[i]);
-    auto dy_i = static_cast<BatchNormParamType<T>>(dy[i]);
-    auto dx_i =
-        dy_i * s_d * inv_var + gmean * scale + gvar * scale * (x_i - mean[c]);
-    dx[i] = static_cast<T>(dx_i);
-  }
-}
-
-// Deriving the Gradient for the Backward Pass of Batch Normalization
-// https://kevinzakka.github.io/2016/09/14/batch_normalization/
-template <typename DeviceContext, typename T>
-class SyncBatchNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const std::string layout_str = ctx.Attr<std::string>("data_layout");
-
-    const DataLayout layout = framework::StringToDataLayout(layout_str);
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto *scale = ctx.Input<Tensor>("Scale");
-
-    const auto &x_dims = x->dims();
-
-    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 2 and 5");
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
-
-    // init output
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    d_x->mutable_data<T>(ctx.GetPlace());
-    if (d_scale && d_bias) {
-      d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    }
-    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
-    PADDLE_ENFORCE_EQ(scale->dims()[0], C);
-
-    std::vector<int> dims;
-    std::vector<int> strides;
-    if (layout == DataLayout::kNCHW) {
-      dims = {N, C, H, W, D};
-      strides = {C * H * W * D, H * W * D, W * D, D, 1};
-    } else {
-      dims = {N, C, H, W, D};
-      strides = {H * W * C * D, 1, W * D * C, D * C, C};
-    }
-
-    const T *x_d = x->data<T>();
-    const T *dy_d = d_y->data<T>();
-
-    auto &dev_ctx = ctx.cuda_device_context();
-    auto stream = dev_ctx.stream();
-    auto *comm = dev_ctx.nccl_comm();
-
-    const auto *saved_mean =
-        ctx.Input<Tensor>("SavedMean")->data<BatchNormParamType<T>>();
-    const auto *saved_inv_var =
-        ctx.Input<Tensor>("SavedVariance")->data<BatchNormParamType<T>>();
-    const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
-    auto alloc_ptr = memory::Alloc(dev_ctx, bytes);
-    auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());
-
-    const int threads = 256;
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    int grid = std::min(C, (max_threads + threads - 1) / threads);
-    int x_numel = x->numel();
-    int fsize = H * W * D;
-
-    if (layout == framework::DataLayout::kNCHW) {
-      KeBackwardLocalStats<T, threads, framework::DataLayout::kNCHW>
-          <<<grid, threads, 0, stream>>>(dy_d, x_d, saved_mean, N, fsize, C,
-                                         stats);
-    } else {
-      KeBackwardLocalStats<T, threads, framework::DataLayout::kNHWC>
-          <<<grid, threads, 0, stream>>>(dy_d, x_d, saved_mean, N, fsize, C,
-                                         stats);
-    }
-    int dtype = platform::ToNCCLDataType(scale->type());
-    // In-place operation
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
-        stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
-        comm, stream));
-
-    const int block = 512;
-    int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
-    if (layout == framework::DataLayout::kNCHW) {
-      if (d_scale && d_bias) {
-        KeBNBackwardScaleBias<T, threads, framework::DataLayout::kNCHW>
-            <<<grid, threads, 0, stream>>>(
-                dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
-                d_scale->data<BatchNormParamType<T>>(),
-                d_bias->data<BatchNormParamType<T>>());
-      }
-      if (d_x) {
-        KeBNBackwardData<T, framework::DataLayout::kNCHW>
-            <<<grid2, block, 0, stream>>>(
-                dy_d, x_d, scale->data<BatchNormParamType<T>>(), saved_mean,
-                saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C,
-                fsize, x->numel(), d_x->data<T>());
-      }
-    } else {
-      if (d_scale && d_bias) {
-        KeBNBackwardScaleBias<T, threads, framework::DataLayout::kNHWC>
-            <<<grid, threads, 0, stream>>>(
-                dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
-                d_scale->data<BatchNormParamType<T>>(),
-                d_bias->data<BatchNormParamType<T>>());
-      }
-      if (d_x) {
-        KeBNBackwardData<T, framework::DataLayout::kNHWC>
-            <<<grid2, block, 0, stream>>>(
-                dy_d, x_d, scale->data<BatchNormParamType<T>>(), saved_mean,
-                saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C,
-                fsize, x->numel(), d_x->data<T>());
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm, ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, double>,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm_grad,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, double>,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
-
-// clang-format on
diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
deleted file mode 100644
index 7f95d16f09b5182e4da33763751ac87b53f41cf3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h"
-
-#include <memory>
-
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
-                      "Input(Label)'s rank should be 2.");
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
-                        "The 1st dimension of Input(X) and Input(Label) should "
-                        "be equal.");
-      PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
-                        "The 2nd dimension of "
-                        "Input(Label) should be 1.");
-    }
-    ctx->SetOutputDim("Y", {x_dims[0], 1});
-    ctx->ShareLoD("X", /*->*/ "Y");
-  }
-
- protected:
-  // Explicitly set that the data type of computation kernel of
-  // teacher_student_sigmoid_loss
-  // is determined by its input "X".
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class TeacherStudentSigmoidLossGradOpDescMaker
-    : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("teacher_student_sigmoid_loss_grad");
-
-    op->SetInput("X", Input("X"));
-    op->SetInput("Label", Input("Label"));
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class TeacherStudentSigmoidLossGradientOp
-    : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@GRAD) should be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@GRAD) should be not null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto label_dims = ctx->GetInputDim("Label");
-    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
-                        "The 1st dimension of Input(X) and Input(Label) should "
-                        "be equal.");
-      PADDLE_ENFORCE_EQ(
-          x_dims[0], dy_dims[0],
-          "The 1st dimension of Input(X) and Input(Y@Grad) should "
-          "be equal.");
-      PADDLE_ENFORCE_EQ(dy_dims[1], 1,
-                        "The 2nd dimension of Input(Y@Grad) should be 1.");
-      PADDLE_ENFORCE_EQ(label_dims[1], 1,
-                        "When Attr(soft_label) == false, the 2nd dimension of "
-                        "Input(Label) should be 1.");
-    }
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->ShareLoD("X", framework::GradVarName("X"));
-  }
-
- protected:
-  // Explicitly set that the data type of computation kernel of
-  // teacher_student_sigmoid_loss
-  // is determined by its input "X".
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class TeacherStudentSigmoidLossOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x 1],"
-             " where N is the batch size and D is the output. "
-             "This input is a probability computed by the previous operator, "
-             "which is almost always the result of a softmax operator.");
-    AddInput("Label",
-             "(Tensor), the ground truth which is a 2-D tensor. "
-             "Label is a Tensor<float> with shape [N x 1]. ");
-    AddOutput("Y",
-              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
-              "[N x 1]. The teacher student sigmoid loss.");
-    AddAttr<float>(
-        "soft_max_up_bound",
-        "fp32, if input > soft_max_up_bound, input will be bound, default 15.0")
-        .SetDefault(15.0);
-    AddAttr<float>("soft_max_lower_bound",
-                   "fp32, if input < soft_max_lower_bound, input will be "
-                   "bound, default -15.0")
-        .SetDefault(-15.0);
-    AddComment(R"DOC(
-TeacherStudentSigmoidLoss Operator.
-
-It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that
-we add another label(z') to original.
-        loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
-        z is click or not
-        z' is teacher value 
-        label = {-2, -1, [0, 2]}
-        when z' is not exist, clk = 0 : label = -2;
-        when z' is not exist, clk = 1 : label = -1;
-        when z' is exist , clk = 0 : label = 0 + z';
-        when z' is exist    , clk = 1 : label = 1 + z';
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(teacher_student_sigmoid_loss,
-                  ops::TeacherStudentSigmoidLossOp,
-                  ops::TeacherStudentSigmoidLossOpMaker,
-                  ops::TeacherStudentSigmoidLossGradOpDescMaker);
-
-REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad,
-                  ops::TeacherStudentSigmoidLossGradientOp);
-
-REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss,
-                       ops::TeacherStudentSigmoidLossOpKernel<float>,
-                       ops::TeacherStudentSigmoidLossOpKernel<double>);
-
-REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss_grad,
-                       ops::TeacherStudentSigmoidLossGradOpKernel<float>,
-                       ops::TeacherStudentSigmoidLossGradOpKernel<double>);
diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h
deleted file mode 100644
index 41d2662ae2a4d37222323d6a536ed3af1ab7e056..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T>
-class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    Tensor* y = context.Output<Tensor>("Y");
-    const Tensor* x = context.Input<Tensor>("X");
-    const Tensor* labels = context.Input<Tensor>("Label");
-    T* y_data = y->mutable_data<T>(context.GetPlace());
-    const T* x_data = x->data<T>();
-    const T* label_data = labels->data<T>();
-    int64_t batch_size = x->dims()[0];
-    // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' +
-    // log(1 + exp(-abs(x)))
-    // z is click or not
-    // z' is value q of feed_fine
-    // label = {-2, -1, [0, 2]}
-    // when z' is not exist, clk = 0 : label = -2;
-    // when z' is not exist, clk = 1 : label = -1;
-    // when z' is exist    , clk = 0 : label = 0 + z';
-    // when z' is exist    , clk = 1 : label = 1 + z';
-    for (int i = 0; i < batch_size; ++i) {
-      if (label_data[i] < -1.0) {
-        y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
-                    log(1.0 + exp(-fabs(x_data[i])));
-      } else if (label_data[i] < 0.0) {
-        y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
-                    log(1.0 + exp(-fabs(x_data[i])));
-      } else if (label_data[i] < 1.0) {
-        y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
-                    log(1.0 + exp(-fabs(x_data[i]))) +
-                    (x_data[i] > 0 ? x_data[i] : 0.0) -
-                    x_data[i] * label_data[i] +
-                    log(1.0 + exp(-fabs(x_data[i])));
-      } else {
-        y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
-                    log(1.0 + exp(-fabs(x_data[i]))) +
-                    (x_data[i] > 0 ? x_data[i] : 0.0) -
-                    x_data[i] * (label_data[i] - 1.0) +
-                    log(1.0 + exp(-fabs(x_data[i])));
-      }
-    }
-  }
-};
-
-template <typename T>
-class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    const T* x_data = x->data<T>();
-
-    Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    T* dx_data = dx->mutable_data<T>(context.GetPlace());
-
-    const Tensor* labels = context.Input<Tensor>("Label");
-    const T* label_data = labels->data<T>();
-
-    T soft_max_up_bound =
-        static_cast<T>(context.Attr<float>("soft_max_up_bound"));
-    T soft_max_lower_bound =
-        static_cast<T>(context.Attr<float>("soft_max_lower_bound"));
-
-    int64_t batch_size = x->dims()[0];
-
-    const framework::Tensor* dOut =
-        context.Input<framework::Tensor>(framework::GradVarName("Y"));
-
-    const T* dout_data = dOut->data<T>();
-
-    for (int i = 0; i < batch_size; ++i) {
-      T sum_val = x_data[i];
-      if (sum_val > soft_max_up_bound) {
-        sum_val = soft_max_up_bound;
-      } else {
-        if (sum_val < soft_max_lower_bound) {
-          sum_val = soft_max_lower_bound;
-        }
-      }
-
-      T pred = 1.0 / (1.0 + exp(-sum_val));
-      if (label_data[i] < -1.0) {
-        dx_data[i] = 0.0 - pred;
-      } else if (label_data[i] < 0.0) {
-        dx_data[i] = 1.0 - pred;
-      } else {
-        dx_data[i] = label_data[i] - 2.0 * pred;
-      }
-      if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) {
-        dx_data[i] = 0;
-      }
-      dx_data[i] *= dout_data[i] * -1;
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc
deleted file mode 100644
index f2a8ae9a411c34ce3f18884d6c2eab45eae5d5ab..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/temporal_shift_op.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/temporal_shift_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class TemporalShiftOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of TemporalShiftOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of TemporalShiftOp should not be null.");
-
-    auto dim_x = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(dim_x.size(), 4,
-                      "Input(X) rank should be 4 in shape of [N*T, C, H, W].");
-
-    int seg_num = ctx->Attrs().Get<int>("seg_num");
-    float shift_ratio = ctx->Attrs().Get<float>("shift_ratio");
-    PADDLE_ENFORCE_GT(seg_num, 0, "Attr(seg_num) should be greater than 0.");
-    PADDLE_ENFORCE_GT(shift_ratio, 0.,
-                      "Attr(shift_ratio) should be greater than 0");
-    PADDLE_ENFORCE_LT(shift_ratio, 0.5,
-                      "Attr(shift_ratio) should be less than 0.5");
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          dim_x[0] % seg_num, 0,
-          "Input(X) dims[0] should be divided exactly by Attr(seg_num).");
-    }
-
-    ctx->SetOutputDim("Out", dim_x);
-    ctx->ShareLoD("X", "Out");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of temporal shift operator. "
-             "This is a 4-D tensor with shape of [N*T,  C, H, W]. "
-             "While N is the batch size, T is the temporal segment "
-             "number, C is the channel number, H is the height of "
-             "features and W is the width of features.");
-    AddOutput("Out",
-              "The output tensor of temporal shift operator. "
-              "This is a 4-D tensor in the same shape with Input(X).");
-
-    AddAttr<int>("seg_num",
-                 "The temporal segment number, this should be a positive "
-                 "integer.");
-    AddAttr<float>(
-        "shift_ratio",
-        "The shift ratio of the channels, the first :attr:`shift_ratio` part "
-        "of channels will be shifted by -1 along the temporal dimension, "
-        "and the second :attr:`shift_ratio` part of channels will be shifted "
-        "by 1 along the temporal dimension. Default 0.25.")
-        .SetDefault(0.25);
-
-    AddComment(R"DOC(
-          This operator calculates the temporal shifting features for Input(X).
-
-          Input(X) should be in shape of [N*T, C, H, W], while N is the batch
-          size, T is the temporal segment number specified by :attr:`seg_num`, 
-          C is the channel number, H and W is the height and width of features.
-
-          Temporal Shifting is calculated as follows:
-          
-          Step 1: Reshape Input(X) to [N, T, C, H, W].
-
-          Step 2: Pad 0 to reshaping result in the 2nd(T) dimension with 
-          padding width as 1 on each side, padding result will be in shape 
-          of [N, T+2, C, H, W].
-
-          Step 3: Assume :attr:`shift_ratio` is :math:`1/4`, slice padding 
-          result as follows:
-
-          $$
-          slice1 = x[:, :T, :C/4, :, :]
-          $$
-          $$
-          slice2 = x[:, 2:T+2, C/4:C/2, :, :]
-          $$
-          $$
-          slice3 = x[:, 1:T+1, C/2:, :, :]
-          $$
-
-          Step 4: Concatenate three slices along the 3rd(C) dimension and 
-          reshape result to [N*T, C, H, W].
-
-          For details of temporal shifting, please refer to paper: 
-          `Temporal Shift Module <http://arxiv.org/abs/1811.08383>`_ .
-
-         )DOC");
-  }
-};
-
-class TemporalShiftOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"),
-                        ctx->GetInputDim(framework::GradVarName("Out")));
-    }
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace());
-  }
-};
-
-class TemporalShiftGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("temporal_shift_grad");
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(temporal_shift, ops::TemporalShiftOp,
-                  ops::TemporalShiftOpMaker, ops::TemporalShiftGradOpDescMaker);
-REGISTER_OPERATOR(temporal_shift_grad, ops::TemporalShiftOpGrad);
-REGISTER_OP_CPU_KERNEL(temporal_shift, ops::TemporalShiftKernel<float>,
-                       ops::TemporalShiftKernel<double>);
-REGISTER_OP_CPU_KERNEL(temporal_shift_grad, ops::TemporalShiftGradKernel<float>,
-                       ops::TemporalShiftGradKernel<double>);
diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu
deleted file mode 100644
index 24f1f8e178eb51aa7230d6c8c8f69d5beb728940..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/temporal_shift_op.cu
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/temporal_shift_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-template <typename T>
-__global__ void KeTemporalShiftFw(const T* input, T* output, const int ntchw,
-                                  const int tchw, const int chw, const int hw,
-                                  const int w, const int t, const int c,
-                                  const float shift_ratio) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  int src_it = 0;
-  for (; tid < ntchw; tid += stride) {
-    int in = tid / tchw;
-    int it = (tid % tchw) / chw;
-    int ic = (tid % chw) / hw;
-    int ih = (tid % hw) / w;
-    int iw = tid % w;
-
-    const int c1 = static_cast<T>(c * shift_ratio);
-    const int c2 = static_cast<T>(c * 2 * shift_ratio);
-
-    if (ic < c1) {
-      src_it = it - 1;
-    } else if (ic < c2) {
-      src_it = it + 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it < 0 || src_it >= t) {
-      output[tid] = 0;
-    } else {
-      int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
-      output[tid] = input[src_idx];
-    }
-  }
-}
-
-template <typename T>
-__global__ void KeTemporalShiftBw(const T* output_grad, T* input_grad,
-                                  const int ntchw, const int tchw,
-                                  const int chw, const int hw, const int w,
-                                  const int t, const int c,
-                                  const float shift_ratio) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  int src_it = 0;
-  for (; tid < ntchw; tid += stride) {
-    int in = tid / tchw;
-    int it = (tid % tchw) / chw;
-    int ic = (tid % chw) / hw;
-    int ih = (tid % hw) / w;
-    int iw = tid % w;
-
-    const int c1 = static_cast<T>(c * shift_ratio);
-    const int c2 = static_cast<T>(c * 2 * shift_ratio);
-
-    if (ic < c1) {
-      src_it = it - 1;
-    } else if (ic < c2) {
-      src_it = it + 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it >= 0 && src_it < t) {
-      int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
-      input_grad[src_idx] = output_grad[tid];
-    }
-  }
-}
-
-template <typename T>
-class TemporalShiftOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    int t = ctx.Attr<int>("seg_num");
-    float shift_ratio = ctx.Attr<float>("shift_ratio");
-
-    const int nt = input->dims()[0];
-    const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    const int hw = h * w;
-    const int chw = c * hw;
-    const int tchw = t * chw;
-    const int ntchw = nt * chw;
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
-
-    int pixelNum = nt * chw;
-    int grid_dim = (pixelNum + 512 - 1) / 512;
-    grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-    KeTemporalShiftFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_data, output_data, ntchw, tchw, chw, hw, w, t, c, shift_ratio);
-  }
-};
-
-template <typename T>
-class TemporalShiftGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    int t = ctx.Attr<int>("seg_num");
-    float shift_ratio = ctx.Attr<float>("shift_ratio");
-
-    const int nt = output_grad->dims()[0];
-    const int c = output_grad->dims()[1];
-    const int h = output_grad->dims()[2];
-    const int w = output_grad->dims()[3];
-
-    const int hw = h * w;
-    const int chw = c * hw;
-    const int tchw = t * chw;
-    const int ntchw = nt * chw;
-
-    const T* output_grad_data = output_grad->data<T>();
-    T* input_grad_data =
-        input_grad->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
-    math::SetConstant<platform::CUDADeviceContext, T>()(
-        ctx.template device_context<platform::CUDADeviceContext>(), input_grad,
-        static_cast<T>(0));
-
-    int pixelNum = nt * chw;
-    int grid_dim = (pixelNum + 512 - 1) / 512;
-    grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-    KeTemporalShiftBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        output_grad_data, input_grad_data, ntchw, tchw, chw, hw, w, t, c,
-        shift_ratio);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(temporal_shift, ops::TemporalShiftOpCUDAKernel<float>,
-                        ops::TemporalShiftOpCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(temporal_shift_grad,
-                        ops::TemporalShiftGradOpCUDAKernel<float>,
-                        ops::TemporalShiftGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h
deleted file mode 100644
index 4c7eed5af471a18768eda6597472c0ad592ccbd0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/temporal_shift_op.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-static HOSTDEVICE inline int GetEntryIndex(int in, int it, int ic, int ih,
-                                           int iw, const int tchw,
-                                           const int chw, const int hw,
-                                           const int w) {
-  return in * tchw + it * chw + ic * hw + ih * w + iw;
-}
-
-template <typename T>
-class TemporalShiftKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    int t = ctx.Attr<int>("seg_num");
-    float shift_ratio = ctx.Attr<float>("shift_ratio");
-
-    const int nt = input->dims()[0];
-    const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    const int c1 = static_cast<int>(c * shift_ratio);
-    const int c2 = static_cast<int>(c * 2 * shift_ratio);
-
-    const int hw = h * w;
-    const int chw = c * hw;
-    const int tchw = t * chw;
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
-
-    int src_it = 0;
-    for (int i = 0; i < output->numel(); i++) {
-      int in = i / tchw;
-      int it = (i % tchw) / chw;
-      int ic = (i % chw) / hw;
-      int ih = (i % hw) / w;
-      int iw = i % w;
-
-      if (ic < c1) {
-        src_it = it - 1;
-      } else if (ic < c2) {
-        src_it = it + 1;
-      } else {
-        src_it = it;
-      }
-
-      if (src_it < 0 || src_it >= t) {
-        output_data[i] = 0;
-      } else {
-        int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
-        output_data[i] = input_data[src_idx];
-      }
-    }
-  }
-};
-
-template <typename T>
-class TemporalShiftGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    int t = ctx.Attr<int>("seg_num");
-    float shift_ratio = ctx.Attr<float>("shift_ratio");
-
-    const int nt = output_grad->dims()[0];
-    const int c = output_grad->dims()[1];
-    const int h = output_grad->dims()[2];
-    const int w = output_grad->dims()[3];
-
-    const int c1 = static_cast<int>(c * shift_ratio);
-    const int c2 = static_cast<int>(c * 2 * shift_ratio);
-
-    const int hw = h * w;
-    const int chw = c * hw;
-    const int tchw = t * chw;
-
-    const T* output_grad_data = output_grad->data<T>();
-    T* input_grad_data =
-        input_grad->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
-    memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
-
-    int src_it = 0;
-    for (int i = 0; i < output_grad->numel(); i++) {
-      int in = i / tchw;
-      int it = (i % tchw) / chw;
-      int ic = (i % chw) / hw;
-      int ih = (i % hw) / w;
-      int iw = i % w;
-
-      if (ic < c1) {
-        src_it = it - 1;
-      } else if (ic < c2) {
-        src_it = it + 1;
-      } else {
-        src_it = it;
-      }
-
-      if (src_it >= 0 && src_it < t) {
-        int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
-        input_grad_data[src_idx] = output_grad_data[i];
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
deleted file mode 100644
index 8cba4961153cf40424885ea4798a4ae534cc8ea1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ /dev/null
@@ -1,236 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace operators {
-using framework::Tensor;
-
-void LodTensorArray2LodTensorVector(const framework::Scope &scope,
-                                    const std::string &base_name,
-                                    const std::string &lod_tensor_array_name,
-                                    std::vector<std::string> *res_names) {
-  auto &inx =
-      scope.FindVar(lod_tensor_array_name)->Get<framework::LoDTensorArray>();
-  for (size_t i = 0; i < inx.size(); i++) {
-    std::string var_name = base_name + std::to_string(i);
-    framework::Variable *g_feed_value =
-        const_cast<framework::Scope &>(scope).Var(var_name);
-    auto &feed_input =
-        *(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
-    feed_input.ShareDataWith(inx[i]);
-    res_names->push_back(var_name);
-  }
-}
-
-void LodTensorVectorResizeFromLodTensorArray(
-    const framework::Scope &scope, const std::string &base_name,
-    const std::string &lod_tensor_array_name,
-    std::vector<std::string> *res_names) {
-  auto &inx =
-      scope.FindVar(lod_tensor_array_name)->Get<framework::LoDTensorArray>();
-  for (size_t i = 0; i < inx.size(); i++) {
-    std::string var_name = base_name + std::to_string(i);
-    framework::Variable *g_feed_value =
-        const_cast<framework::Scope &>(scope).Var(var_name);
-    auto &feed_input =
-        *(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
-    auto dims = inx[i].dims();
-    feed_input.Resize(dims);
-    res_names->push_back(var_name);
-  }
-}
-
-void LodTensorArrayCreateFromLodTensorArray(
-    const framework::Scope &scope,
-    const std::string &input_lod_tensor_array_name,
-    const std::string &output_lod_tensor_array_name) {
-  auto &inx = scope.FindVar(input_lod_tensor_array_name)
-                  ->Get<framework::LoDTensorArray>();
-  auto &grad_inx = *scope.FindVar(output_lod_tensor_array_name)
-                        ->GetMutable<framework::LoDTensorArray>();
-
-  for (size_t i = 0; i < inx.size(); i++) {
-    std::string var_name = output_lod_tensor_array_name + std::to_string(i);
-    framework::Variable *g_feed_value =
-        const_cast<framework::Scope &>(scope).Var(var_name);
-    auto &feed_input =
-        *(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
-    grad_inx.push_back(feed_input);
-  }
-}
-
-class LoDTensorArray2TensorOp : public framework::OperatorBase {
- public:
-  using OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto axis = Attr<int>("axis");
-
-    framework::AttributeMap attrs;
-    attrs["axis"] = axis;
-
-    auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
-    auto &out =
-        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-    auto &out_inx =
-        *scope.FindVar(Output("OutIndex"))->GetMutable<framework::LoDTensor>();
-
-    const size_t n = inx.size();
-    PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0.");
-
-    std::string base_name = Inputs("X")[0];
-    std::vector<std::string> names;
-
-    // get the input tensorarray items' dim in out_inx
-    auto out_inx_dim = out_inx.dims();
-    out_inx_dim[0] = inx.size();
-    out_inx.Resize(out_inx_dim);
-    int *tmp_index_data = out_inx.mutable_data<int>(platform::CPUPlace());
-
-    auto out_dims = inx[0].dims();
-    size_t out_dim_sum = 0;
-    for (size_t index = 0; index < inx.size(); index++) {
-      auto inx_dims = inx[index].dims();
-      out_dim_sum += inx_dims[axis];
-      tmp_index_data[index] = inx_dims[axis];
-    }
-
-    // get input array items' dims
-    out_dims[axis] = out_dim_sum;
-    out.Resize(out_dims);
-
-    LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
-    // Invoke concat Op
-    auto concat_op = framework::OpRegistry::CreateOp(
-        "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);
-
-    concat_op->Run(scope, place);
-  }
-};
-
-class LoDTensorArray2TensorOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input LoDTensorArray of tensor_array_to_tensor operator.");
-    AddOutput("Out", "Output tensor of tensor_array_to_tensor operator.");
-    AddOutput("OutIndex",
-              "Output input LoDTensorArray items' dims of "
-              "tensor_array_to_tensor operator.");
-    AddAttr<int>("axis",
-                 "The axis along which the input tensors will be concatenated.")
-        .SetDefault(0);
-    AddComment(R"DOC(
-tensor_array_to_tensor Operator.
-
-Concatenate the input LoDTensorArray along dimension axis to the output Tensor.
-Examples:
-  Input = {[1,2], [3,4], [5,6]}
-  axis = 0
-  Output = [[1,2],
-            [3,4],
-            [5,6]]
-  OutputIndex = [1,1,1]
-
-)DOC");
-  }
-};
-
-class LoDTensorArray2TensorOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {}
-};
-
-class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {}
-};
-
-class LoDTensorArray2TensorGradInferVarType
-    : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    for (auto &out_var : ctx->Output(framework::GradVarName("X"))) {
-      ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
-    }
-  }
-};
-
-class LoDTensorArray2TensorGradOp : public framework::OperatorBase {
- public:
-  using OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto axis = Attr<int>("axis");
-    framework::AttributeMap attrs;
-    attrs["axis"] = axis;
-
-    auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
-    const size_t n = inx.size();
-    PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0.");
-
-    std::string base_name = Inputs("X")[0];
-    std::vector<std::string> names;
-
-    LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
-
-    // grad
-    auto dx_name = Output(framework::GradVarName("X"));
-    auto dout_name = Input(framework::GradVarName("Out"));
-
-    std::vector<std::string> grad_names;
-
-    LodTensorVectorResizeFromLodTensorArray(scope, "grad_name", Input("X"),
-                                            &grad_names);
-
-    auto concat_grad_op = framework::OpRegistry::CreateOp(
-        "concat_grad", {{"X", names}, {"Out@GRAD", {dout_name}}},
-        {{"X@GRAD", grad_names}}, attrs);
-
-    concat_grad_op->Run(scope, place);
-
-    LodTensorArrayCreateFromLodTensorArray(scope, Input("X"), dx_name);
-    auto &grad_inx =
-        *scope.FindVar(dx_name)->GetMutable<framework::LoDTensorArray>();
-
-    for (size_t i = 0; i < grad_names.size(); i++) {
-      std::string var_name = grad_names[i];
-      auto &feed_input = scope.FindVar(var_name)->Get<framework::LoDTensor>();
-      grad_inx[i].ShareDataWith(feed_input);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-USE_OP(concat);
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(tensor_array_to_tensor, ops::LoDTensorArray2TensorOp,
-                  ops::LoDTensorArray2TensorOpMaker,
-                  ops::LoDTensorArray2TensorOpInferShape,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(tensor_array_to_tensor_grad, ops::LoDTensorArray2TensorGradOp,
-                  ops::LoDTensorArray2TensorGradInferShape,
-                  ops::LoDTensorArray2TensorGradInferVarType);
diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt
deleted file mode 100644
index 6b551d13f1dc5cd1c82a15a8347b278e8f795c1c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tensorrt/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
-file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
-nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
-  DEPS tensorrt_engine_op
-  analysis)
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
deleted file mode 100644
index 6cf3e65e00ff6dd6a87d2b699ae89b9bde5d5462..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
-
-namespace paddle {
-
-namespace operators {
-
-class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Xs", "A list of inputs.").AsDuplicable();
-    AddOutput("Ys", "A list of outputs").AsDuplicable();
-    AddAttr<std::string>("subgraph", "the subgraph.");
-    AddAttr<std::string>("calibration_data", "the calibration data for int8");
-    AddAttr<std::string>(
-        "engine_serialized_data",
-        "the serialized data contains the all info of the ICUDAEngine");
-    AddAttr<std::string>(
-        "engine_key",
-        "The engine_key here is used to distinguish different TRT Engines");
-    AddAttr<int>("max_batch_size", "the maximum batch size.");
-    AddAttr<int>("workspace_size", "the workspace size.");
-    AddAttr<framework::BlockDesc *>("sub_block", "the trt block");
-    AddAttr<bool>("enable_int8", "whether swith to int8 mode");
-    AddComment("TensorRT engine operator.");
-  }
-};
-
-class TensorRTEngineInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
-                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
-
-#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
deleted file mode 100644
index 22c0c9e9d4af3233e3cc712d26d13eb38b302abf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ /dev/null
@@ -1,297 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
-
-namespace paddle {
-
-namespace operators {
-
-using inference::Singleton;
-using inference::tensorrt::TensorRTEngine;
-using inference::tensorrt::TRTInt8Calibrator;
-using inference::tensorrt::TRTCalibratorEngine;
-using inference::tensorrt::TRTCalibratorEngineManager;
-
-class TensorRTEngineOp : public framework::OperatorBase {
- private:
-  std::vector<std::string> input_names_;
-  std::unordered_set<std::string> param_names_;
-  mutable TensorRTEngine *trt_engine_{nullptr};
-  int max_batch_size_;
-  int workspace_size_;
-  std::unique_ptr<TRTInt8Calibrator> calibrator_;
-  bool enable_int8_;
-  bool enable_fp16_;
-  bool use_calib_mode_;
-  std::string calibration_data_;
-  std::string engine_key_;
-  bool calibration_mode_;
-  int predictor_id_;
-  int device_id_;
-  AnalysisConfig::Precision precision_mode_;
-
- public:
-  TensorRTEngineOp(const std::string &type,
-                   const framework::VariableNameMap &inputs,
-                   const framework::VariableNameMap &outputs,
-                   const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {
-    input_names_ = Inputs("Xs");
-    max_batch_size_ = Attr<int>("max_batch_size");
-    workspace_size_ = Attr<int>("workspace_size");
-    device_id_ = Attr<int>("gpu_id");
-    enable_int8_ = Attr<bool>("enable_int8");
-    enable_fp16_ = Attr<bool>("enable_fp16");
-    use_calib_mode_ = Attr<bool>("use_calib_mode");
-    calibration_data_ = Attr<std::string>("calibration_data");
-    engine_key_ = Attr<std::string>("engine_key");
-    predictor_id_ = Attr<int>("predictor_id");
-
-    auto params = Attr<std::vector<std::string>>("parameters");
-    for (const auto &param : params) {
-      param_names_.insert(param);
-    }
-    // calibration_mode is ture represents we need to
-    // generate the calibration table data.
-    calibration_mode_ =
-        (enable_int8_ && calibration_data_.size() == 0 && use_calib_mode_);
-
-    VLOG(4) << "calibration_mode: " << calibration_mode_;
-    if (enable_int8_ && calibration_data_.size()) {
-      calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
-    }
-    bool has_engine =
-        inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-            .Has(engine_key_ + std::to_string(predictor_id_));
-
-    if (!calibration_mode_ && has_engine) {
-      trt_engine_ =
-          inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-              .Get(engine_key_ + std::to_string(predictor_id_));
-    }
-    precision_mode_ = AnalysisConfig::Precision::kFloat32;
-    if (enable_int8_) {
-      precision_mode_ = AnalysisConfig::Precision::kInt8;
-    }
-    if (enable_fp16_) {
-      precision_mode_ = AnalysisConfig::Precision::kHalf;
-    }
-  }
-
- protected:
-  void RunNativeImpl(const framework::Scope &scope,
-                     const platform::Place &dev_place) const {
-    framework::Executor executor(dev_place);
-    auto *block = Attr<framework::BlockDesc *>("sub_block");
-    auto *program = block->Program();
-    auto &current_scope = scope.NewScope();
-    auto ctx = executor.Prepare(*program, block->ID());
-    executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
-  }
-
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    if (calibration_mode_ == true) {
-      RunCalibration(scope, dev_place);
-      return;
-    }
-    auto *trt_engine = GetEngine(scope, dev_place);
-    RunTrt(scope, dev_place, trt_engine);
-  }
-
-  void RunCalibration(const framework::Scope &scope,
-                      const platform::Place &dev_place) const {
-    // This process will builds a 32-bit trt engine, runs it on the calibration
-    // set, and records a histogram for each
-    // tensor of the distribution of activation values.
-    LOG_FIRST_N(INFO, 1) << "This process is generating calibration table for "
-                            "Paddle TRT int8...";
-
-    int runtime_batch = 1;
-    if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
-      TRTCalibratorEngine *calib_res =
-          Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
-      std::unordered_map<std::string, size_t> calib_buffers;
-      for (auto &x : input_names_) {
-        if (param_names_.count(x)) continue;
-        auto &t =
-            inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
-        calib_buffers[x] = t.memory_size();
-        auto t_shape = framework::vectorize(t.dims());
-        runtime_batch = t_shape[0];
-      }
-      calib_res->calib_.reset(new TRTInt8Calibrator(
-          calib_buffers, runtime_batch, engine_key_, dev_place));
-      calib_res->thr_.reset(new std::thread([&]() {
-        calib_res->engine_.reset(new TensorRTEngine(
-            max_batch_size_, workspace_size_, precision_mode_,
-            calib_res->calib_.get(),
-            boost::get<platform::CUDAPlace>(dev_place).device));
-        VLOG(3) << "start the calib trt engine thread";
-        PrepareTRTEngine(scope, calib_res->engine_.get());
-      }));
-    }
-
-    TRTInt8Calibrator *temp_calibrator =
-        Singleton<TRTCalibratorEngineManager>::Global()
-            .Get(engine_key_)
-            ->calib_.get();
-    std::unordered_map<std::string, void *> calib_data;
-
-    for (auto &x : Inputs("Xs")) {
-      if (param_names_.count(x)) continue;
-      auto &t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
-      calib_data.emplace(x, t.data<void>());
-    }
-    temp_calibrator->setBatch(calib_data);
-    RunNativeImpl(scope, dev_place);
-  }
-
-  void RunTrt(const framework::Scope &scope, const platform::Place &dev_place,
-              TensorRTEngine *engine) const {
-    int runtime_batch = 1;
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
-
-    PADDLE_ENFORCE_EQ(input_names_.empty(), false,
-                      "should pass at least one input");
-
-    std::vector<std::string> output_maps =
-        Attr<std::vector<std::string>>("output_name_mapping");
-
-    int num_inputs = 0;
-
-    for (const auto &x : Inputs("Xs")) {
-      if (param_names_.count(x)) continue;
-      num_inputs += 1;
-    }
-    const int num_bindings = num_inputs + Outputs("Ys").size();
-    std::vector<void *> buffers(num_bindings);
-
-    // Bind input tensor to TRT.
-    for (const auto &x : Inputs("Xs")) {
-      if (param_names_.count(x)) continue;
-      // convert input and copy to TRT engine's buffer
-      auto &t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
-      auto t_shape = framework::vectorize<int64_t>(t.dims());
-      // check if the input shapes are consistent with model.
-      if (HasAttr(x + "_shape")) {
-        std::vector<int64_t> i_shape = Attr<std::vector<int64_t>>(x + "_shape");
-        std::vector<int64_t> model_input_shape(i_shape.begin() + 1,
-                                               i_shape.end());
-        std::vector<int64_t> runtime_input_shape(t_shape.begin() + 1,
-                                                 t_shape.end());
-        PADDLE_ENFORCE_EQ(model_input_shape == runtime_input_shape, true,
-                          "Input shapes are inconsistent with the model. TRT 5 "
-                          "or lower version "
-                          "does not support dynamic input shapes. Please check "
-                          "your input shapes.");
-      }
-
-      runtime_batch = t_shape[0];
-
-      const int bind_index = engine->engine()->getBindingIndex(x.c_str());
-      PADDLE_ENFORCE(bind_index < num_bindings,
-                     "The bind index should be less than num_bindings");
-      buffers[bind_index] = static_cast<void *>(t.data<float>());
-    }
-
-    // Bind output tensor to TRT.
-    int output_index = 0;
-    VLOG(4) << "TensorRT Engine Op Outputs:";
-    for (const auto &y : Outputs("Ys")) {
-      const int bind_index =
-          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
-      auto dims = engine->engine()->getBindingDimensions(bind_index);
-      // Use the output ITensor's dims to reshape the Fluid Tensor.
-      // The ITensor doesn't contain the batch size dim.
-      std::vector<int> ddim;
-      ddim.push_back(runtime_batch);
-      for (int i = 0; i < dims.nbDims; i++) {
-        ddim.push_back(dims.d[i]);
-      }
-      auto *fluid_v = scope.FindVar(y);
-      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
-      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      fluid_t->Resize(framework::make_ddim(ddim));
-
-      PADDLE_ENFORCE(bind_index < num_bindings,
-                     "The bind index should be less than num_bindings");
-      buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data<float>(
-          boost::get<platform::CUDAPlace>(dev_place)));
-
-      output_index += 1;
-    }
-
-    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
-    // Execute the engine.
-    engine->Execute(runtime_batch, &buffers, stream);
-    cudaStreamSynchronize(stream);
-  }
-
-  TensorRTEngine *GetEngine(const framework::Scope &scope,
-                            const platform::Place &dev_place) const {
-    if (!trt_engine_) {
-      trt_engine_ =
-          inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-              .Create(engine_key_ + std::to_string(predictor_id_),
-                      max_batch_size_, workspace_size_, precision_mode_,
-                      calibrator_.get(), device_id_);
-      PrepareTRTEngine(scope, trt_engine_);
-    }
-    return trt_engine_;
-  }
-
-  void PrepareTRTEngine(const framework::Scope &scope,
-                        TensorRTEngine *engine) const {
-    LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
-                 "kernel etc). This process may cost a lot of time.";
-    framework::proto::BlockDesc block_proto;
-    block_proto.ParseFromString(Attr<std::string>("subgraph"));
-    framework::BlockDesc block_desc(nullptr, &block_proto);
-
-    std::vector<std::string> inputs = Inputs("Xs");
-    std::vector<std::string> outputs =
-        Attr<std::vector<std::string>>("output_name_mapping");
-
-    inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
-                                 outputs, engine);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
deleted file mode 100644
index e813e9ca7579f154b91db851c70286e5f4405820..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ /dev/null
@@ -1,230 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-
-USE_NO_KERNEL_OP(tensorrt_engine);
-namespace paddle {
-namespace operators {
-
-namespace {
-void CreateCUDATensor(framework::Scope* scope, const std::string& name,
-                      const std::vector<int64_t>& shape) {
-  auto* var = scope->Var(name);
-  auto* tensor = var->GetMutable<framework::LoDTensor>();
-  auto dims = framework::make_ddim(shape);
-  tensor->Resize(dims);
-  platform::CUDAPlace place;
-  platform::CUDADeviceContext ctx(place);
-  inference::tensorrt::RandomizeTensor(tensor, place, ctx);
-}
-
-void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
-                          const std::string& name,
-                          const std::vector<int64_t>& shape) {
-  using framework::proto::VarType;
-  auto* var = block->add_vars();
-  framework::VarDesc desc(name);
-  desc.SetType(VarType::LOD_TENSOR);
-  desc.SetDataType(VarType::FP32);
-  desc.SetShape(shape);
-  *var = *desc.Proto();
-}
-
-}  // namespace
-
-using inference::analysis::SetAttr;
-
-TEST(TensorRTEngineOp, manual) {
-  framework::ProgramDesc program;
-  auto* block_ = program.Proto()->add_blocks();
-  block_->set_idx(0);
-  block_->set_parent_idx(-1);
-
-  LOG(INFO) << "create block desc";
-  framework::BlockDesc block_desc(&program, block_);
-  LOG(INFO) << "create fc op";
-  auto* fc0 = block_desc.AppendOp();
-  fc0->SetType("fc");
-  fc0->SetInput("X", std::vector<std::string>({"x"}));     // 4 x 1 x 1
-  fc0->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
-  fc0->SetOutput("Out", std::vector<std::string>({"z"}));  // 6 x 1 x 1
-
-  LOG(INFO) << "create fc op";
-  auto* fc1 = block_desc.AppendOp();
-  fc1->SetType("fc");
-  fc1->SetInput("X", std::vector<std::string>({"z"}));
-  fc1->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
-  fc1->SetOutput("Out", std::vector<std::string>({"z0"}));  // 8 x 1 x 1
-
-  // Set inputs' variable shape in BlockDesc
-  // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1}
-  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4, 1, 1}));
-  AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
-  AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
-  AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
-
-  // It is wired, need to copy manually.
-  *block_->add_ops() = *fc0->Proto();
-  *block_->add_ops() = *fc1->Proto();
-
-  ASSERT_EQ(block_->ops_size(), 2);
-
-  LOG(INFO) << "create tensorrt desc";
-  framework::OpDesc engine_op_desc(nullptr);
-  engine_op_desc.SetType("tensorrt_engine");
-  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
-  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
-
-  engine_op_desc.SetBlockAttr("sub_block", &block_desc);
-  engine_op_desc.SetAttr("max_batch_size", static_cast<int>(2));
-  engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
-  engine_op_desc.SetAttr("parameters", std::vector<std::string>({}));
-  engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
-  engine_op_desc.SetAttr("predictor_id", 1);
-  engine_op_desc.SetAttr("calibration_data", std::string(""));
-  engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
-  engine_op_desc.SetAttr("enable_fp16", static_cast<bool>(false));
-  engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
-  engine_op_desc.SetAttr("output_name_mapping",
-                         std::vector<std::string>({"z0"}));
-  engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
-  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
-  int device_id = 0;
-  engine_op_desc.SetAttr("gpu_id", device_id);
-
-  LOG(INFO) << "create engine op";
-  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
-  LOG(INFO) << "engine_op " << engine_op.get();
-
-  framework::Scope scope;
-  platform::CUDAPlace place;
-  platform::CUDADeviceContext ctx(place);
-  // Prepare variables.
-  CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4}));
-  CreateCUDATensor(&scope, "y", std::vector<int64_t>({4, 6}));
-  CreateCUDATensor(&scope, "z", std::vector<int64_t>({2, 6}));
-
-  CreateCUDATensor(&scope, "y0", std::vector<int64_t>({6, 8}));
-  CreateCUDATensor(&scope, "z0", std::vector<int64_t>({2, 8}));
-
-  // Execute them.
-  LOG(INFO) << "engine_op run";
-  engine_op->Run(scope, place);
-}
-
-void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
-  framework::ProgramDesc program;
-  framework::Scope scope;
-  platform::CUDAPlace place;
-  platform::CUDADeviceContext ctx(place);
-
-  auto* block_ = program.Proto()->add_blocks();
-  block_->set_idx(0);
-  block_->set_parent_idx(-1);
-
-  using shape_t = std::vector<int64_t>;
-
-  LOG(INFO) << "create block desc";
-  framework::BlockDesc block_desc(&program, block_);
-
-  auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name,
-                        const std::string& z_name, bool x_created,
-                        const shape_t& x_shape, const shape_t& y_shape,
-                        const shape_t& z_shape) {
-    LOG(INFO) << "create fc op";
-    auto* fc = block_desc.AppendOp();
-    fc->SetType("mul");
-    fc->SetInput("X", std::vector<std::string>({x_name}));
-    fc->SetInput("Y", std::vector<std::string>({y_name}));
-    fc->SetOutput("Out", std::vector<std::string>({z_name}));
-
-    // Set inputs' variable shape in BlockDesc
-    if (!x_created) {
-      AddTensorToBlockDesc(block_, x_name,
-                           std::vector<int64_t>({batch_size, input_dim, 1, 1}));
-    }
-    AddTensorToBlockDesc(block_, y_name,
-                         std::vector<int64_t>({input_dim, output_dim}));
-    AddTensorToBlockDesc(block_, z_name,
-                         std::vector<int64_t>({batch_size, output_dim}));
-
-    // Prepare variables.
-    if (!x_created) {
-      CreateCUDATensor(&scope, x_name, std::vector<int64_t>(x_shape));
-    }
-    CreateCUDATensor(&scope, y_name, std::vector<int64_t>(y_shape));
-    CreateCUDATensor(&scope, z_name, std::vector<int64_t>(z_shape));
-
-    // It is wired, need to copy manually.
-    *block_->add_ops() = *fc->Proto();
-  };
-
-  // Test with 4 layer FC
-  AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim},
-             {input_dim, output_dim}, {batch_size, output_dim});
-  AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim},
-             {batch_size, output_dim});
-  AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim},
-             {batch_size, output_dim});
-  AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim},
-             {batch_size, output_dim});
-
-  LOG(INFO) << "create tensorrt desc";
-  framework::OpDesc engine_op_desc(nullptr);
-  engine_op_desc.SetType("tensorrt_engine");
-  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
-  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
-
-  engine_op_desc.SetBlockAttr("sub_block", &block_desc);
-  engine_op_desc.SetAttr("max_batch_size", static_cast<int>(batch_size));
-  engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
-  engine_op_desc.SetAttr("parameters",
-                         std::vector<std::string>({"y0", "y1", "y2", "y3"}));
-  engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
-  engine_op_desc.SetAttr("predictor_id", 1);
-  engine_op_desc.SetAttr("calibration_data", std::string(""));
-  engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
-  engine_op_desc.SetAttr("enable_fp16", static_cast<bool>(false));
-  engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
-  engine_op_desc.SetAttr("output_name_mapping",
-                         std::vector<std::string>({"z3"}));
-  engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
-  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
-  int device_id = 0;
-  engine_op_desc.SetAttr("gpu_id", device_id);
-
-  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
-
-  // Execute them.
-  engine_op->Run(scope, place);
-}
-
-// Test with a larger FC layer.
-TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
-
-}  // namespace operators
-}  // namespace paddle
-
-USE_TRT_CONVERTER(fc)
diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cc b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cc
deleted file mode 100644
index 9a06a9a27620300081c7b4847e6b1a91cd08515d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h"
-
-namespace paddle {
-namespace operators {
-
-TEST(leaky_relu_grad_grad, test_cpu) {
-  ASSERT_TRUE(
-      TestLeakyReluGradGradMain<float>({32, 64}, platform::CPUPlace(), 0.02));
-}
-
-TEST(leaky_relu_grad_grad, test_cpu_zero_alpha) {
-  ASSERT_TRUE(
-      TestLeakyReluGradGradMain<float>({32, 64}, platform::CPUPlace(), 0.0));
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cu b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cu
deleted file mode 100644
index 6f0f840b8c5d4ddda4c3fc5e8a525905cbce4850..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h"
-
-namespace paddle {
-namespace operators {
-
-TEST(leaky_relu_grad_grad, test_gpu) {
-  ASSERT_TRUE(
-      TestLeakyReluGradGradMain<float>({32, 64}, platform::CUDAPlace(0), 0.15));
-}
-
-TEST(leaky_relu_grad_grad, test_gpu_zero_alpha) {
-  ASSERT_TRUE(
-      TestLeakyReluGradGradMain<float>({32, 64}, platform::CUDAPlace(0), 0.0));
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
deleted file mode 100644
index f416aa6e00f5a4a82c2562c36f9d32bb1a6843aa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <random>
-#include "gtest/gtest.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static void InitRandom(framework::Tensor *tensor,
-                       const platform::Place &place) {
-  framework::Tensor cpu_tensor;
-  auto *cpu_ptr =
-      cpu_tensor.mutable_data<T>(tensor->dims(), platform::CPUPlace());
-  int64_t numel = cpu_tensor.numel();
-  std::mt19937 engine;
-  std::uniform_real_distribution<T> dist(static_cast<T>(-2.0),
-                                         static_cast<T>(2.0));
-  for (int64_t i = 0; i < numel; ++i) {
-    cpu_ptr[i] = dist(engine);
-  }
-  framework::TensorCopySync(cpu_tensor, place, tensor);
-}
-
-template <typename T>
-struct LeakyReluGradGradEachElementFunctor {
-  LeakyReluGradGradEachElementFunctor(const T *ddx, const T *out, T alpha,
-                                      T *ddout)
-      : ddx_(ddx), out_(out), alpha_(alpha), ddout_(ddout) {}
-
-  HOSTDEVICE void operator()(int idx) {
-    if (out_[idx] > 0) {
-      ddout_[idx] = ddx_[idx];
-    } else {
-      ddout_[idx] = ddx_[idx] * alpha_;
-    }
-  }
-
-  const T *ddx_;
-  const T *out_;
-  T alpha_;
-  T *ddout_;
-};
-
-template <typename T>
-static bool TestLeakyReluGradGradMain(const framework::DDim &dim,
-                                      const platform::Place &place,
-                                      float alpha) {
-  LeakyReluGradGradFunctor<T> functor;
-  functor.alpha = alpha;
-  auto &dev_ctx = *platform::DeviceContextPool::Instance().Get(place);
-  framework::Tensor *x = nullptr;
-  framework::Tensor *dout = nullptr;
-  framework::Tensor *dx = nullptr;
-
-  framework::Tensor out;
-  out.Resize(dim);
-  InitRandom<T>(&out, place);
-
-  framework::Tensor ddx;
-  ddx.Resize(dim);
-  InitRandom<T>(&ddx, place);
-
-  framework::Tensor ddout;
-  ddout.Resize(dim);
-  InitRandom<T>(&ddout, place);
-
-  framework::Tensor ddout_actual;
-  ddout_actual.mutable_data<T>(dim, place);
-  LeakyReluGradGradEachElementFunctor<T> actual_functor(
-      ddx.data<T>(), out.data<T>(), static_cast<T>(alpha),
-      ddout_actual.data<T>());
-
-  int64_t limit = out.numel();
-
-#ifdef __NVCC__
-  if (platform::is_gpu_place(place)) {
-    auto &cuda_dev_ctx = dynamic_cast<platform::CUDADeviceContext &>(dev_ctx);
-    functor(cuda_dev_ctx, x, &out, &ddx, &ddout, dout, dx);
-    platform::ForRange<platform::CUDADeviceContext> for_range(cuda_dev_ctx,
-                                                              limit);
-    for_range(actual_functor);
-  } else {
-#endif
-    auto &cpu_dev_ctx = dynamic_cast<platform::CPUDeviceContext &>(dev_ctx);
-    functor(cpu_dev_ctx, x, &out, &ddx, &ddout, dout, dx);
-    platform::ForRange<platform::CPUDeviceContext> for_range(cpu_dev_ctx,
-                                                             limit);
-    for_range(actual_functor);
-#ifdef __NVCC__
-  }
-#endif
-
-  dev_ctx.Wait();
-
-  framework::Tensor ddout_cpu, ddout_actual_cpu;
-  framework::TensorCopySync(ddout, platform::CPUPlace(), &ddout_cpu);
-  framework::TensorCopySync(ddout_actual, platform::CPUPlace(),
-                            &ddout_actual_cpu);
-
-  bool is_equal = std::equal(ddout_cpu.data<T>(), ddout_cpu.data<T>() + limit,
-                             ddout_actual_cpu.data<T>());
-  return is_equal;
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
deleted file mode 100644
index db763a051d1e08b962a40913d290c69e7c61ec32..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/top_k_op.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/top_k_op.h"
-
-namespace paddle {
-namespace operators {
-
-class TopkOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of TopkOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of TopkOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Indices"),
-                   "Output(Indices) of TopkOp should not be null.");
-
-    auto input_dims = ctx->GetInputDim("X");
-    const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
-
-    PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
-    PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape");
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
-                        "input must have >= k columns");
-    }
-
-    framework::DDim dims = input_dims;
-    dims[dims.size() - 1] = k;
-    ctx->SetOutputDim("Out", dims);
-    ctx->SetOutputDim("Indices", dims);
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context(), layout_, library_);
-  }
-};
-
-class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) The input of Topk op");
-    AddInput("K",
-             "(Tensor)  Number of top elements to look for along "
-             "the last dimension (along each row for matrices).")
-        .AsDispensable();
-    AddOutput("Out", "(Tensor) The output tensor of Topk op");
-    AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
-    AddComment(R"DOC(
-Top K operator
-
-If the input is a vector (1d tensor), this operator finds the k largest 
-entries in the vector and outputs their values and indices as vectors. 
-Thus values[j] is the j-th largest entry in input, and its index is indices[j].
-
-For matrices, this operator computes the top k entries in each row. )DOC");
-    AddAttr<int>("k",
-                 "(int, default 1) Number of top elements to look for along "
-                 "the last dimension (along each row for matrices).")
-        .SetDefault(1);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(top_k,
-                       ops::TopkKernel<paddle::platform::CPUPlace, float>,
-                       ops::TopkKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
deleted file mode 100644
index fe243a3b87d20c1741f1a4cbbe8c7466e6428456..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/top_k_op.cu
+++ /dev/null
@@ -1,379 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/top_k_op.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-struct Pair {
-  __device__ __forceinline__ Pair() {}
-  __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
-
-  __device__ __forceinline__ void set(T value, int64_t id) {
-    v = value;
-    id = id;
-  }
-
-  __device__ __forceinline__ void operator=(const Pair<T>& in) {
-    v = in.v;
-    id = in.id;
-  }
-
-  __device__ __forceinline__ bool operator<(const T value) const {
-    return (v < value);
-  }
-
-  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {
-    return (v < in.v) || ((v == in.v) && (id > in.id));
-  }
-
-  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {
-    return (v > in.v) || ((v == in.v) && (id < in.id));
-  }
-
-  T v;
-  int64_t id;
-};
-
-template <typename T>
-__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
-                                      int beam_size) {
-  for (int k = beam_size - 2; k >= 0; k--) {
-    if (topk[k] < p) {
-      topk[k + 1] = topk[k];
-    } else {
-      topk[k + 1] = p;
-      return;
-    }
-  }
-  topk[0] = p;
-}
-
-template <typename T, int beam_size>
-__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p) {
-  for (int k = beam_size - 2; k >= 0; k--) {
-    if (topk[k] < p) {
-      topk[k + 1] = topk[k];
-    } else {
-      topk[k + 1] = p;
-      return;
-    }
-  }
-  topk[0] = p;
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
-                                        int dim, int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < src[idx]) {
-      Pair<T> tmp(src[idx], idx);
-      AddTo<T>(topk, tmp, beam_size);
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
-                                        int dim, const Pair<T>& max,
-                                        int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < src[idx]) {
-      Pair<T> tmp(src[idx], idx);
-      if (tmp < max) {
-        AddTo<T>(topk, tmp, beam_size);
-      }
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
-                                        int idx, int dim, int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < val[idx]) {
-      Pair<T> tmp(val[idx], col[idx]);
-      AddTo<T>(topk, tmp, beam_size);
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
-                                        int idx, int dim, const Pair<T>& max,
-                                        int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < val[idx]) {
-      Pair<T> tmp(val[idx], col[idx]);
-      if (tmp < max) {
-        AddTo<T>(topk, tmp, beam_size);
-      }
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
-                                              int beam_size, const T* src,
-                                              bool* firstStep, bool* is_empty,
-                                              Pair<T>* max, int dim,
-                                              const int tid) {
-  if (*beam > 0) {
-    int length = (*beam) < beam_size ? *beam : beam_size;
-    if (*firstStep) {
-      *firstStep = false;
-      GetTopK<T, BlockSize>(topk, src, tid, dim, length);
-    } else {
-      for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - (*beam)) {
-          topk[k] = topk[k + *beam];
-        } else {
-          topk[k].set(-static_cast<T>(INFINITY), -1);
-        }
-      }
-      if (!(*is_empty)) {
-        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
-                              length);
-      }
-    }
-
-    *max = topk[MaxLength - 1];
-    if ((*max).v == -static_cast<T>(1)) *is_empty = true;
-    *beam = 0;
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
-                                              int beam_size, const T* val,
-                                              int* col, bool* firstStep,
-                                              bool* is_empty, Pair<T>* max,
-                                              int dim, const int tid) {
-  if (*beam > 0) {
-    int length = (*beam) < beam_size ? *beam : beam_size;
-    if (*firstStep) {
-      *firstStep = false;
-      GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
-    } else {
-      for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - *beam) {
-          topk[k] = topk[k + *beam];
-        } else {
-          topk[k].set(-static_cast<T>(INFINITY), -1);
-        }
-      }
-      if (!(*is_empty)) {
-        GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
-                              length);
-      }
-    }
-
-    *max = topk[MaxLength - 1];
-    if ((*max).v == -1) *is_empty = true;
-    *beam = 0;
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
-                                            Pair<T> topk[], T** topVal,
-                                            int64_t** topIds, int* beam, int* k,
-                                            const int tid, const int warp) {
-  while (true) {
-    __syncthreads();
-    if (tid < BlockSize / 2) {
-      if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
-        maxid[tid] = tid + BlockSize / 2;
-      } else {
-        maxid[tid] = tid;
-      }
-    }
-    __syncthreads();
-    for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
-      if (tid < stride) {
-        if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
-          maxid[tid] = maxid[tid + stride];
-        }
-      }
-      __syncthreads();
-    }
-    __syncthreads();
-
-    if (tid == 0) {
-      **topVal = sh_topk[maxid[0]].v;
-      **topIds = sh_topk[maxid[0]].id;
-      (*topVal)++;
-      (*topIds)++;
-    }
-    if (tid == maxid[0]) (*beam)++;
-    if (--(*k) == 0) break;
-    __syncthreads();
-
-    if (tid == maxid[0]) {
-      if (*beam < MaxLength) {
-        sh_topk[tid] = topk[*beam];
-      }
-    }
-    // NOTE(zcd): temporary solution
-    unsigned mask = 0u;
-    CREATE_SHFL_MASK(mask, true);
-
-    if (maxid[0] / 32 == warp) {
-      if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
-          MaxLength)
-        break;
-    }
-  }
-}
-
-/**
- * Each block compute one sample.
- * In a block:
- * 1. every thread get top MaxLength value;
- * 2. merge to sh_topk, block reduce and get max value;
- * 3. go to the second setp, until one thread's topk value is null;
- * 4. go to the first setp, until get the topk value.
- */
-
-template <typename T, int MaxLength, int BlockSize>
-__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
-                             const T* src, int lds, int dim, int k,
-                             int grid_dim, int num) {
-  __shared__ Pair<T> sh_topk[BlockSize];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-
-  const int bid = blockIdx.x;
-  for (int i = bid; i < num; i += grid_dim) {
-    int top_num = k;
-    __shared__ int maxid[BlockSize / 2];
-    T* out = output + i * output_stride;
-    int64_t* inds = indices + i * k;
-    Pair<T> topk[MaxLength];
-    int beam = MaxLength;
-    Pair<T> max;
-    bool is_empty = false;
-    bool firststep = true;
-
-    for (int j = 0; j < MaxLength; j++) {
-      topk[j].set(-static_cast<T>(INFINITY), -1);
-    }
-    while (top_num) {
-      ThreadGetTopK<T, MaxLength, BlockSize>(
-          topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
-
-      sh_topk[tid] = topk[0];
-      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
-                                           &beam, &top_num, tid, warp);
-    }
-  }
-}
-
-inline static int GetDesiredBlockDim(int dim) {
-  if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
-  }
-}
-
-#define FIXED_BLOCK_DIM_BASE(dim, ...) \
-  case (dim): {                        \
-    constexpr auto kBlockDim = (dim);  \
-    __VA_ARGS__;                       \
-  } break
-
-#define FIXED_BLOCK_DIM(...)                \
-  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
-
-template <typename T>
-class TopkOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
-    size_t k = static_cast<int>(ctx.Attr<int>("k"));
-
-    auto* k_t = ctx.Input<Tensor>("K");
-    if (k_t) {
-      Tensor k_host;
-      framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
-      k = k_host.data<int>()[0];
-      framework::DDim output_dims = output->dims();
-      output_dims[output_dims.size() - 1] = k;
-      output->Resize(output_dims);
-      indices->Resize(output_dims);
-    }
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    // FIXME(typhoonzero): data is always converted to type T?
-    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    framework::DDim inputdims = input->dims();
-    const size_t input_height = framework::product(
-        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
-    const size_t input_width = inputdims[inputdims.size() - 1];
-
-    if (k > input_width) k = input_width;
-
-    // NOTE: pass lds and dim same to input width.
-    // NOTE: old matrix implementation of stride is different to eigen.
-    // TODO(typhoonzero): refine this kernel.
-    const int kMaxHeight = 2048;
-    int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-    auto& dev_ctx = ctx.cuda_device_context();
-    switch (GetDesiredBlockDim(input_width)) {
-      FIXED_BLOCK_DIM(
-          KeMatrixTopK<T, 5,
-                       kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-              output_data, k, indices_data, input_data, input_width,
-              input_width, static_cast<int>(k), gridx, input_height));
-      default:
-        PADDLE_THROW("Error");
-    }
-  }
-};
-
-#undef FIXED_BLOCK_DIM_BASE
-#undef FIXED_BLOCK_DIM
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    top_k, paddle::operators::TopkOpCUDAKernel<float>,
-    paddle::operators::TopkOpCUDAKernel<double>,
-    paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
deleted file mode 100644
index 6b9db260605c13af97325366c217e81fdfae08c7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/top_k_op.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class TopkKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // Get the top k elements of each row of input tensor
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
-
-    size_t k = static_cast<int>(ctx.Attr<int>("k"));
-    auto* k_t = ctx.Input<Tensor>("K");
-    if (k_t) {
-      k = k_t->data<int>()[0];
-      framework::DDim output_dims = output->dims();
-      output_dims[output_dims.size() - 1] = k;
-      output->Resize(output_dims);
-      indices->Resize(output_dims);
-    }
-
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    // reshape input to a flattern matrix(like flat_inner_dims)
-    framework::DDim inputdims = input->dims();
-    const size_t row = framework::product(
-        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
-    const size_t col = inputdims[inputdims.size() - 1];
-    Eigen::DSizes<int, 2> flat2dims(row, col);
-// NOTE: eigen shape doesn't affect paddle tensor.
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-    for (size_t i = 0; i < row; i++) {
-      std::vector<std::pair<T, size_t>> vec;
-      vec.reserve(col);
-      // 1D vector
-      if (inputdims.size() == 1) {
-        auto eg_input = EigenVector<T>::Flatten(*input);
-        for (size_t j = 0; j < col; j++) {
-          vec.push_back(std::pair<T, size_t>(eg_input(j), j));
-        }
-      } else {
-        auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
-        for (size_t j = 0; j < col; j++) {
-          vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
-        }
-      }
-
-      std::partial_sort(
-          vec.begin(), vec.begin() + k, vec.end(),
-          [](const std::pair<T, size_t>& l, const std::pair<T, size_t>& r) {
-            return l.first > r.first;
-          });
-      for (size_t j = 0; j < k; j++) {
-        output_data[i * k + j] = vec[j].first;
-        indices_data[i * k + j] = int64_t(vec[j].second);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
deleted file mode 100644
index 47840d71a3d1fddced1fc9f37174915a89f17aa4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/transpose_op.cc
+++ /dev/null
@@ -1,301 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/transpose_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class TransposeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    std::vector<int> axis = ctx->Attrs().Get<std::vector<int>>("axis");
-    size_t x_rank = x_dims.size();
-    size_t axis_size = axis.size();
-
-    PADDLE_ENFORCE_EQ(x_rank, axis_size,
-                      "The input tensor's rank(%d) "
-                      "should be equal to the axis's size(%d)",
-                      x_rank, axis_size);
-
-    std::vector<int> count(axis_size, 0);
-    for (size_t i = 0; i < axis_size; i++) {
-      PADDLE_ENFORCE(
-          axis[i] < static_cast<int>(axis_size) && ++count[axis[i]] == 1,
-          "Each element of Attribute axis should be a unique value "
-          "range from 0 to (dims - 1), "
-          "where the dims is the axis's size");
-    }
-
-    framework::DDim out_dims(x_dims);
-    for (size_t i = 0; i < axis_size; i++) {
-      out_dims[i] = x_dims[axis[i]];
-    }
-    ctx->SetOutputDim("Out", out_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-#ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
-    }
-#endif
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace(), layout_, library_);
-  }
-};
-
-class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
-    AddOutput("Out", "(Tensor)The output tensor.");
-    AddAttr<std::vector<int>>(
-        "axis",
-        "(vector<int>) A list of values, and the size of the list should be "
-        "the same with the input tensor rank. This operator permutes the input "
-        "tensor's axes according to the values given.");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddAttr<std::string>(
-        "data_format",
-        "(string, default NCHW) Only used in "
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\". Specify the data format of the output data, "
-        "the input will be transformed automatically. ")
-        .SetDefault("AnyLayout");
-    AddComment(R"DOC(
-Transpose Operator.
-
-The input tensor will be permuted according to the axes given.
-The behavior of this operator is similar to how `numpy.transpose` works.
-
-- suppose the input `X` is a 2-D tensor:
-    $$
-    X = \begin{pmatrix}
-    0 &1 &2 \\
-    3 &4 &5
-    \end{pmatrix}$$
-
-    the given `axes` is: $[1, 0]$, and $Y$ = transpose($X$, axis)
-
-    then the output $Y$ is:
-
-    $$
-    Y = \begin{pmatrix}
-         0 &3 \\
-         1 &4  \\
-         2 &5
-    \end{pmatrix}$$
-
-- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is
-$[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$.
-
-)DOC");
-  }
-};
-
-class TransposeOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-#ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
-    }
-#endif
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace(), layout_, library_);
-  }
-};
-
-// FIXME(zcd): transpose2 adds an intermediate output(XShape) based on
-// transpose, the XShape is used to carry the shape and lod of X which
-// will be used in transpose_grad, in this way, the framework can reuse
-// the memory of X immediately the transpose2_op is finished.
-// Considering compatibility issues, we could not fix transpose2_op
-class Transpose2Op : public TransposeOp {
- public:
-  Transpose2Op(const std::string &type,
-               const framework::VariableNameMap &inputs,
-               const framework::VariableNameMap &outputs,
-               const framework::AttributeMap &attrs)
-      : TransposeOp(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    TransposeOp::InferShape(ctx);
-    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
-                   "Output(XShape) should not be null");
-    const auto &in_dims = ctx->GetInputDim("X");
-    std::vector<int64_t> x_shape_dim(in_dims.size() + 1);
-    x_shape_dim[0] = 0;
-    for (int i = 0; i < in_dims.size(); ++i) {
-      x_shape_dim[i + 1] = in_dims[i];
-    }
-    ctx->SetOutputDim("XShape", framework::make_ddim(x_shape_dim));
-    ctx->ShareLoD("X", /*->*/ "XShape");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-#ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
-    }
-#endif
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.GetPlace(), layout_, library_);
-  }
-};
-
-class Transpose2OpMaker : public TransposeOpMaker {
- public:
-  void Make() override {
-    TransposeOpMaker::Make();
-    AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate();
-  }
-};
-
-class Transpose2GradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("transpose2_grad");
-    grad_op->SetInput("XShape", Output("XShape"));
-    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-class Transpose2OpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      auto xshape_dim = ctx->GetInputDim("XShape");
-      auto x_shape_dim =
-          framework::slice_ddim(xshape_dim, 1, xshape_dim.size());
-      ctx->SetOutputDim(framework::GradVarName("X"), x_shape_dim);
-      ctx->ShareLoD("XShape", framework::GradVarName("X"));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-#ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
-    }
-#endif
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.GetPlace(), layout_, library_);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    transpose_grad,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
-                  ops::Transpose2GradMaker);
-REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    transpose2, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TransposeKernel<paddle::platform::CPUDeviceContext, int32_t>,
-    ops::TransposeKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    transpose2_grad,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, int32_t>,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc
deleted file mode 100644
index debf9bce55f4e6265253eb466c921a0eed619c8a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/transpose_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    transpose, ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    transpose_grad,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext,
-                             plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    transpose2,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, int32_t>,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    transpose2_grad,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, int32_t>,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext,
-                             plat::float16>);
diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
deleted file mode 100644
index 895d1ce2cca19c0c1e4aa03cc64eb1425e8bab1a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/transpose_op.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-inline void TransCompute(const int dim, const DeviceContext& dev_ctx,
-                         const framework::Tensor& in, framework::Tensor* out,
-                         const std::vector<int>& axis) {
-  switch (dim) {
-    case 1:
-      math::Transpose<DeviceContext, T, 1> trans1;
-      trans1(dev_ctx, in, out, axis);
-      break;
-    case 2:
-      math::Transpose<DeviceContext, T, 2> trans2;
-      trans2(dev_ctx, in, out, axis);
-      break;
-    case 3:
-      math::Transpose<DeviceContext, T, 3> trans3;
-      trans3(dev_ctx, in, out, axis);
-      break;
-    case 4:
-      math::Transpose<DeviceContext, T, 4> trans4;
-      trans4(dev_ctx, in, out, axis);
-      break;
-    case 5:
-      math::Transpose<DeviceContext, T, 5> trans5;
-      trans5(dev_ctx, in, out, axis);
-      break;
-    case 6:
-      math::Transpose<DeviceContext, T, 6> trans6;
-      trans6(dev_ctx, in, out, axis);
-      break;
-    default:
-      PADDLE_THROW("Tensors with rank at most 6 are supported");
-  }
-}
-
-template <typename DeviceContext, typename T>
-class TransposeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    std::vector<int> axis = context.Attr<std::vector<int>>("axis");
-    int ndims = axis.size();
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    TransCompute<DeviceContext, T>(ndims, dev_ctx, *x, out, axis);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TransposeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    if (!x_grad) return;
-
-    x_grad->mutable_data<T>(context.GetPlace());
-    std::vector<int> axis = context.Attr<std::vector<int>>("axis");
-    std::vector<int> reversed_axis(axis);
-
-    for (size_t i = 0; i < axis.size(); i++) {
-      reversed_axis[axis[i]] = i;
-    }
-
-    int ndims = axis.size();
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    TransCompute<DeviceContext, T>(ndims, dev_ctx, *out_grad, x_grad,
-                                   reversed_axis);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/tree_conv_op.cc b/paddle/fluid/operators/tree_conv_op.cc
deleted file mode 100644
index 566939afaa4b435c58717a49cfdec69d6c616587..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tree_conv_op.cc
+++ /dev/null
@@ -1,176 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/tree_conv_op.h"
-
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-class TreeConvOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("NodesVector",
-             "(Tensor) The feature vector of every node on the tree. "
-             "The shape of the feature vector must be "
-             "[max_tree_node_size, feature_size].");
-    AddInput("EdgeSet",
-             "(Tensor) The Edges of Tree. The edge must be directional. "
-             "The shape of the edge set must be [max_tree_node_size, 2].");
-    AddInput("Filter",
-             "(Tensor) The feature detector. "
-             "The shape of the filter is "
-             "[feature_size, 3, output_size, num_filters].");
-    AddOutput("Out",
-              "(Tensor) The feature vector of subtrees. "
-              "The shape of the output tensor is [max_tree_node_size, "
-              "output_size, num_filters]. "
-              "The output tensor could be a new feature "
-              "vector for next tree convolution layers.");
-    AddAttr<int>("max_depth",
-                 "(int, default: 2) The depth of feature detector.")
-        .SetDefault(2)
-        .GreaterThan(1);
-    AddComment(R"DOC(
-**Tree-Based Convolution Operator**
-
-Tree-Based Convolution is a kind of convolution based on tree structure.
-Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN),
-which is used to classify tree structures, such as Abstract Syntax Tree.
-Tree-Based Convolution proposed a kind of data structure called continuous binary tree,
-which regards multiway tree as binary tree.
-The paper of Tree-Based Convolution Operator is here:
-https://arxiv.org/abs/1409.5718v1
-)DOC");
-  }
-};
-class TreeConvOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"));
-    auto edge_dims = ctx->GetInputDim("EdgeSet");
-    auto vector_dims = ctx->GetInputDim("NodesVector");
-    auto filter_dims = ctx->GetInputDim("Filter");
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
-    } else {
-      if (edge_dims[2] != -1) {
-        PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
-      }
-    }
-    PADDLE_ENFORCE_EQ(edge_dims.size(), 3,
-                      "The dimension of EdgeSet Tensor should be 3");
-    PADDLE_ENFORCE_EQ(vector_dims.size(), 3,
-                      "The dimension of NodesVector Tensor should be 3");
-    PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
-                      "The dimension of Filter Tensor should be 4");
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3");
-      PADDLE_ENFORCE_EQ(
-          filter_dims[0], vector_dims[2],
-          "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
-    } else {
-      if (filter_dims[1] != -1) {
-        PADDLE_ENFORCE_EQ(filter_dims[1], 3,
-                          "Input(Filter) dim[1] should be 3");
-      }
-
-      if (filter_dims[0] != -1 && vector_dims[2] != -1) {
-        PADDLE_ENFORCE_EQ(
-            filter_dims[0], vector_dims[2],
-            "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
-      }
-    }
-    auto output_dims = framework::make_ddim(
-        {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]});
-    ctx->SetOutputDim("Out", output_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("NodesVector")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class TreeConvGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("tree_conv_grad");
-
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetInput("Filter", Input("Filter"));
-    op->SetInput("EdgeSet", Input("EdgeSet"));
-    op->SetInput("NodesVector", Input("NodesVector"));
-
-    op->SetOutput(framework::GradVarName("NodesVector"),
-                  InputGrad("NodesVector"));
-    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class TreeConvGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    auto vectors_dims = ctx->GetInputDim("NodesVector");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "the gradient of output(Out) must not be null");
-    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-      ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("NodesVector"))) {
-      ctx->SetOutputDim(framework::GradVarName("NodesVector"), vectors_dims);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("NodesVector")->type(),
-                                   ctx.device_context());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(tree_conv, ops::TreeConvOp, ops::TreeConvOpMaker,
-                  ops::TreeConvGradOpDescMaker);
-
-REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    tree_conv, ops::TreeConvKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TreeConvKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    tree_conv_grad,
-    ops::TreeConvGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TreeConvGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/tree_conv_op.cu b/paddle/fluid/operators/tree_conv_op.cu
deleted file mode 100644
index eebfe412bdd65139d9657aae78288f66d9d7bc06..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tree_conv_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/tree_conv_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    tree_conv, ops::TreeConvKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TreeConvKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    tree_conv_grad,
-    ops::TreeConvGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TreeConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h
deleted file mode 100644
index a84589b32fd0016e0372c50aac8156b2dce883ba..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tree_conv_op.h
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <iostream>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/tree2col.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-template <typename DeviceContext, typename T>
-class TreeConvKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    math::Tree2ColFunctor<DeviceContext, T> tree2col;
-    math::SetConstant<DeviceContext, T> constant;
-
-    auto *Edges = ctx.Input<Tensor>("EdgeSet");
-    auto *Embeddings = ctx.Input<Tensor>("NodesVector");
-    auto *Filter = ctx.Input<Tensor>("Filter");
-    auto *output_emb = ctx.Output<Tensor>("Out");
-    int max_depth = ctx.Attr<int>("max_depth");
-
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-
-    Tensor W;
-    W.ShareDataWith(*Filter);
-    W.Resize(framework::flatten_to_2d(Filter->dims(), 2));
-
-    int batch_size = static_cast<int>(Edges->dims()[0]);
-    int n = static_cast<int>(Embeddings->dims()[1]);
-    int out_size = static_cast<int>(Filter->dims()[2]);
-    int num_filters = static_cast<int>(Filter->dims()[3]);
-    output_emb->mutable_data<T>({batch_size, n, out_size, num_filters},
-                                ctx.GetPlace());
-
-    auto edge_set_slicedim = framework::slice_ddim(
-        Edges->dims(), 1, static_cast<int>(Edges->dims().size()));
-
-    auto embedding_slicedim = framework::slice_ddim(
-        Embeddings->dims(), 1, static_cast<int>(Embeddings->dims().size()));
-
-    auto output_slicedim = framework::slice_ddim(
-        output_emb->dims(), 1, static_cast<int>(output_emb->dims().size()));
-
-    output_slicedim = framework::flatten_to_2d(output_slicedim, 1);
-
-    for (int idx = 0; idx < batch_size; idx++) {
-      auto edge_set = Edges->Slice(idx, idx + 1).Resize(edge_set_slicedim);
-      auto embeddings =
-          Embeddings->Slice(idx, idx + 1).Resize(embedding_slicedim);
-      auto out_vec = output_emb->Slice(idx, idx + 1).Resize(output_slicedim);
-      Tensor patch;
-      tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth);
-      constant(dev_ctx, &out_vec, 0);
-      blas.MatMul(patch, W, &out_vec);
-    }
-  }
-};
-template <typename DeviceContext, typename T>
-class TreeConvGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *out_g = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *in_g = ctx.Output<Tensor>(framework::GradVarName("NodesVector"));
-    auto *filter_g = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-    int max_depth = ctx.Attr<int>("max_depth");
-    auto *Embeddings = ctx.Input<Tensor>("NodesVector");
-    auto *edges = ctx.Input<Tensor>("EdgeSet");
-    auto *Filter = ctx.Input<Tensor>("Filter");
-    math::Tree2ColFunctor<DeviceContext, T> tree2col;
-    math::Col2TreeFunctor<DeviceContext, T> col2tree;
-    math::SetConstant<DeviceContext, T> constant;
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-
-    Tensor W;
-    W.ShareDataWith(*Filter);
-    W.Resize(framework::flatten_to_2d(Filter->dims(), 1));
-
-    int batch_size = static_cast<int>(Embeddings->dims()[0]);
-
-    auto edge_set_slicedim = framework::slice_ddim(
-        edges->dims(), 1, static_cast<int>(edges->dims().size()));
-
-    auto embedding_slicedim = framework::slice_ddim(
-        Embeddings->dims(), 1, static_cast<int>(Embeddings->dims().size()));
-
-    auto out_grad_dims = framework::slice_ddim(
-        out_g->dims(), 1, static_cast<int>(out_g->dims().size()));
-    out_grad_dims = framework::flatten_to_2d(out_grad_dims, 1);
-    if (filter_g) {
-      filter_g->mutable_data<T>(Filter->dims(), ctx.GetPlace());
-      Tensor f_g;
-      f_g.ShareDataWith(*filter_g);
-      f_g.Resize(framework::flatten_to_2d(Filter->dims(), 2));
-      constant(dev_ctx, filter_g, 0);
-      for (int batch_id = 0; batch_id < batch_size; batch_id++) {
-        auto edge_set =
-            edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim);
-        auto embeddings = Embeddings->Slice(batch_id, batch_id + 1)
-                              .Resize(embedding_slicedim);
-        auto out_grad =
-            out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims);
-        Tensor patch;
-        tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth);
-        blas.MatMul(patch, true, out_grad, false, T(1.0), &f_g, T(1.0));
-      }
-    }
-    if (in_g) {
-      auto input_grad_dims = framework::slice_ddim(
-          in_g->dims(), 1, static_cast<int>(in_g->dims().size()));
-      in_g->mutable_data<T>(Embeddings->dims(), ctx.GetPlace());
-      constant(dev_ctx, in_g, 0);
-      for (int batch_id = 0; batch_id < batch_size; batch_id++) {
-        auto edge_set =
-            edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim);
-        auto out_grad =
-            out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims);
-        auto in_grad =
-            in_g->Slice(batch_id, batch_id + 1).Resize(input_grad_dims);
-        Tensor in_grad_temp;
-        col2tree(dev_ctx, edge_set, out_grad, &in_grad_temp, max_depth);
-        blas.MatMul(in_grad_temp, false, W, true, &in_grad);
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
deleted file mode 100644
index 1e8708f2648d7dd3c10319bd0a4be193d2458d53..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <limits>
-#include <random>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-// reference: https://gist.github.com/lakshayg/d80172fe5ae3c5d2c2aedb53c250320e
-template <typename T>
-T Erfinv(T x) {
-  if (x < -1 || x > 1) {
-    return std::numeric_limits<T>::quiet_NaN();
-  } else if (x == 1.0) {
-    return std::numeric_limits<T>::infinity();
-  } else if (x == -1.0) {
-    return -std::numeric_limits<T>::infinity();
-  }
-
-  const T LN2 = 6.931471805599453094172321214581e-1;
-
-  const T A0 = 1.1975323115670912564578e0;
-  const T A1 = 4.7072688112383978012285e1;
-  const T A2 = 6.9706266534389598238465e2;
-  const T A3 = 4.8548868893843886794648e3;
-  const T A4 = 1.6235862515167575384252e4;
-  const T A5 = 2.3782041382114385731252e4;
-  const T A6 = 1.1819493347062294404278e4;
-  const T A7 = 8.8709406962545514830200e2;
-
-  const T B0 = 1.0000000000000000000e0;
-  const T B1 = 4.2313330701600911252e1;
-  const T B2 = 6.8718700749205790830e2;
-  const T B3 = 5.3941960214247511077e3;
-  const T B4 = 2.1213794301586595867e4;
-  const T B5 = 3.9307895800092710610e4;
-  const T B6 = 2.8729085735721942674e4;
-  const T B7 = 5.2264952788528545610e3;
-
-  const T C0 = 1.42343711074968357734e0;
-  const T C1 = 4.63033784615654529590e0;
-  const T C2 = 5.76949722146069140550e0;
-  const T C3 = 3.64784832476320460504e0;
-  const T C4 = 1.27045825245236838258e0;
-  const T C5 = 2.41780725177450611770e-1;
-  const T C6 = 2.27238449892691845833e-2;
-  const T C7 = 7.74545014278341407640e-4;
-
-  const T D0 = 1.4142135623730950488016887e0;
-  const T D1 = 2.9036514445419946173133295e0;
-  const T D2 = 2.3707661626024532365971225e0;
-  const T D3 = 9.7547832001787427186894837e-1;
-  const T D4 = 2.0945065210512749128288442e-1;
-  const T D5 = 2.1494160384252876777097297e-2;
-  const T D6 = 7.7441459065157709165577218e-4;
-  const T D7 = 1.4859850019840355905497876e-9;
-
-  const T E0 = 6.65790464350110377720e0;
-  const T E1 = 5.46378491116411436990e0;
-  const T E2 = 1.78482653991729133580e0;
-  const T E3 = 2.96560571828504891230e-1;
-  const T E4 = 2.65321895265761230930e-2;
-  const T E5 = 1.24266094738807843860e-3;
-  const T E6 = 2.71155556874348757815e-5;
-  const T E7 = 2.01033439929228813265e-7;
-
-  const T F0 = 1.414213562373095048801689e0;
-  const T F1 = 8.482908416595164588112026e-1;
-  const T F2 = 1.936480946950659106176712e-1;
-  const T F3 = 2.103693768272068968719679e-2;
-  const T F4 = 1.112800997078859844711555e-3;
-  const T F5 = 2.611088405080593625138020e-5;
-  const T F6 = 2.010321207683943062279931e-7;
-  const T F7 = 2.891024605872965461538222e-15;
-
-  T abs_x = abs(x);
-
-  if (abs_x <= 0.85) {
-    T r = 0.180625 - 0.25 * x * x;
-    T num =
-        (((((((A7 * r + A6) * r + A5) * r + A4) * r + A3) * r + A2) * r + A1) *
-             r +
-         A0);
-    T den =
-        (((((((B7 * r + B6) * r + B5) * r + B4) * r + B3) * r + B2) * r + B1) *
-             r +
-         B0);
-    return x * num / den;
-  }
-
-  T r = sqrt(LN2 - log(1.0 - abs_x));
-
-  T num, den;
-  if (r <= 5.0) {
-    r = r - 1.6;
-    num =
-        (((((((C7 * r + C6) * r + C5) * r + C4) * r + C3) * r + C2) * r + C1) *
-             r +
-         C0);
-    den =
-        (((((((D7 * r + D6) * r + D5) * r + D4) * r + D3) * r + D2) * r + D1) *
-             r +
-         D0);
-  } else {
-    r = r - 5.0;
-    num =
-        (((((((E7 * r + E6) * r + E5) * r + E4) * r + E3) * r + E2) * r + E1) *
-             r +
-         E0);
-    den =
-        (((((((F7 * r + F6) * r + F5) * r + F4) * r + F3) * r + F2) * r + F1) *
-             r +
-         F0);
-  }
-
-  if (x < 0) {
-    return -num / den;
-  } else {
-    return num / den;
-  }
-}
-
-template <typename T>
-struct TruncatedNormal {
-  T mean, std;
-  T a_normal_cdf;
-  T b_normal_cdf;
-  TruncatedNormal(T mean, T std) : mean(mean), std(std) {
-    auto normal_cdf = [](T x) {
-      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
-    };
-    a_normal_cdf = normal_cdf(-2.0);
-    b_normal_cdf = normal_cdf(2.0);
-  }
-
-  T operator()(T value) const {
-    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
-    return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean;
-  }
-};
-
-template <typename T>
-class CPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.Attr<float>("mean");
-    float std = context.Attr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
-    std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
-                                           1.0);
-    TruncatedNormal<T> truncated_normal(mean, std);
-    int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = truncated_normal(dist(engine));
-    }
-  }
-};
-
-class TruncatedGaussianRandomOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(Out) of TruncatedGaussianRandomOp should not be null.");
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    std::vector<int64_t> out_dim;
-    out_dim.reserve(shape.size());
-    for (auto dim : shape) {
-      out_dim.push_back(static_cast<int64_t>(dim));
-    }
-    PADDLE_ENFORCE(shape.size() > 0UL,
-                   "shape can be one int or array. shape must be set.");
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library{framework::LibraryType::kPlain};
-    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
-    return framework::OpKernelType(
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.device_context(), layout, library);
-  }
-};
-
-class TruncatedGaussianRandomOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput("Out", "Output tensor of truncated gaussian random op.");
-
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) "
-                              "The dimension of random tensor.");
-    AddAttr<float>("mean",
-                   "(float, default 0.0) "
-                   "mean of random tensor.")
-        .SetDefault(.0f);
-    AddAttr<float>("std",
-                   "(float, default 1.0) "
-                   "std of random tensor.")
-        .SetDefault(1.0f);
-    AddAttr<int>("seed",
-                 "(int, default 0) "
-                 "Random seed of generator."
-                 "0 means use system wide seed."
-                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
-        .SetDefault(0);
-    AddAttr<int>("dtype",
-                 "(int, default 5(FP32)) "
-                 "Output data type.")
-        .SetDefault(framework::proto::VarType::FP32);
-    AddComment(R"DOC(
-TruncatedGaussianRandom Operator.
-
-Used to initialize tensors with truncated gaussian random generator.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
-                             ops::TruncatedGaussianRandomOp,
-                             ops::TruncatedGaussianRandomOpMaker);
-REGISTER_OP_CPU_KERNEL(truncated_gaussian_random,
-                       ops::CPUTruncatedGaussianRandomKernel<float>);
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu
deleted file mode 100644
index 5a3510babe4d57b9e80f0e7898df98033834ca15..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cu
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/random.h>
-#include <thrust/transform.h>
-#include <limits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct TruncatedNormal {
-  T mean, std;
-  T a_normal_cdf;
-  T b_normal_cdf;
-  unsigned int seed;
-  T numeric_min;
-
-  __host__ __device__ TruncatedNormal(T mean, T std, T numeric_min, int seed)
-      : mean(mean), std(std), seed(seed), numeric_min(numeric_min) {
-    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
-    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
-  }
-
-  __host__ __device__ T operator()(const unsigned int n) const {
-    thrust::minstd_rand rng;
-    rng.seed(seed);
-    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
-    rng.discard(n);
-    T value = dist(rng);
-    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
-    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
-  }
-};
-
-template <typename T>
-class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    if (seed == 0) {
-      std::random_device rd;
-      seed = rd();
-    }
-    T mean = static_cast<T>(context.Attr<float>("mean"));
-    T std = static_cast<T>(context.Attr<float>("std"));
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-    int64_t size = tensor->numel();
-    thrust::transform(
-        index_sequence_begin, index_sequence_begin + size,
-        thrust::device_ptr<T>(data),
-        TruncatedNormal<T>(mean, std, std::numeric_limits<T>::min(), seed));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    truncated_gaussian_random,
-    paddle::operators::GPUTruncatedGaussianRandomKernel<float>);
diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc
deleted file mode 100644
index d21340b478b590259b04ce66a3db129fdb50c7e7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unfold_op.cc
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- *     Unless required by applicable law or agreed to in writing, software
- *     distributed under the License is distributed on an "AS IS" BASIS,
- *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *     See the License for the specific language governing permissions and
- *     limitations under the License. */
-
-#include "paddle/fluid/operators/unfold_op.h"
-
-namespace paddle {
-namespace operators {
-
-class UnfoldOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "Tensor, "
-             "the input of unfold op. "
-             "The format of X is [N, C_in, H, W], "
-             "where N is the batch size, C_in is the input channels, "
-             "H is the height and W is the width");
-    AddOutput(
-        "Y",
-        "Tensor, "
-        "the output of unfold op. "
-        "The format of Y is [N, C_in*filter_height*filter_width, "
-        "output_height*output_width], where N is the batch size, "
-        "C_in is the input channels of X, filter_height and filter_width is "
-        "height and width of the filtering kernel, output_height and "
-        "output_width "
-        "is the calculated height and width of output feature map.");
-    AddAttr<std::vector<int>>(
-        "kernel_sizes",
-        "vector<int>, the kernel sizes of the convolution operator.");
-    AddAttr<std::vector<int>>(
-        "strides", "vector<int>, the strides of the convolution operator.");
-    AddAttr<std::vector<int>>(
-        "paddings",
-        "vector<int>, the paddings applied to pad the feature map.");
-    AddAttr<std::vector<int>>(
-        "dilations", "vector<int>, the dilations of the convolution operator.");
-    AddComment(R"DOC(
-**Unfold Operator**
-
-This Operator is used to extract sliding local blocks from a batched input tensor, also known
-as im2col when operated on batched 2D image tensor. For each block under the convolution filter,
-all element will be rearranged as a column. While the convolution filter silding over the input
-feature map, a series of such columns will be formed. 
-    )DOC");
-  }
-};
-
-class UnfoldOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of UnfoldOp should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"),
-                   "Output(Y) of UnfoldOp should not be null");
-    auto in_dims = ctx->GetInputDim("X");
-    std::vector<int> kernel_sizes =
-        ctx->Attrs().Get<std::vector<int>>("kernel_sizes");
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    std::vector<int> dilations =
-        ctx->Attrs().Get<std::vector<int>>("dilations");
-
-    // Only [N, C, H, W] input supported now
-    PADDLE_ENFORCE(
-        in_dims.size() == 4,
-        "Input shold be 4-D tensor of format [N, C, H, W], but get %u",
-        in_dims.size());
-    PADDLE_ENFORCE(
-        in_dims.size() - kernel_sizes.size() == 2U,
-        "The dims of X should be larger than that of kernel_sizes "
-        "by a number of 2, due to the batch size and input channel dim. "
-        "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2",
-        in_dims.size(), kernel_sizes.size());
-    PADDLE_ENFORCE_EQ(
-        strides.size(), kernel_sizes.size(),
-        "The dims of strides shold be the same with that of kernel_sizes. "
-        "But recieved dims(strides: %u) != dims(kernel_sizes: %u).",
-        strides.size(), kernel_sizes.size());
-    PADDLE_ENFORCE_EQ(
-        paddings.size(), 2 * strides.size(),
-        "The dims of paddings should be 2 times of that of strides. "
-        "But recieved dims(paddings: %u) != 2*dims(strides: %u).",
-        paddings.size(), strides.size());
-    PADDLE_ENFORCE_EQ(
-        strides.size(), dilations.size(),
-        "The dims of strides shold be the same with that of dilations. "
-        "But recieved dims(strides: %u) != dims(dilations: %u).",
-        strides.size(), dilations.size());
-
-    std::vector<int> out_dims;
-    out_dims.push_back(in_dims[0]);
-
-    int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1];
-    out_dims.push_back(output_channels);
-
-    int output_height =
-        CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0],
-                       paddings[2], strides[0]);
-    int output_width = CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1],
-                                      paddings[1], paddings[3], strides[1]);
-    int output_col_length = output_height * output_width;
-    out_dims.push_back(output_col_length);
-
-    ctx->SetOutputDim("Y", framework::make_ddim(out_dims));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class UnfoldGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "The gradient of Y should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "The gradient of X should not be null");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::Tensor>(framework::GradVarName("Y"))->type(),
-        ctx.device_context());
-  }
-};
-
-class UnfoldGradDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("unfold_grad");
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-    op->SetInput("X", Input("X"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(UnfoldGradOpNoNeedBufferVarsInference,
-                                      "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker,
-                  ops::UnfoldGradDescMaker);
-REGISTER_OPERATOR(unfold_grad, ops::UnfoldGradOp,
-                  ops::UnfoldGradOpNoNeedBufferVarsInference);
-
-REGISTER_OP_CPU_KERNEL(
-    unfold, ops::UnfoldOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnfoldOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    unfold_grad,
-    ops::UnfoldGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnfoldGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/unfold_op.cu b/paddle/fluid/operators/unfold_op.cu
deleted file mode 100644
index 46584506d431564cfc7af11072eee6c544f03564..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unfold_op.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unfold_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    unfold, ops::UnfoldOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnfoldOpKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    unfold_grad,
-    ops::UnfoldGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnfoldGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/unfold_op.h b/paddle/fluid/operators/unfold_op.h
deleted file mode 100644
index 97e8143bc052b346a85b50ab26bb8563e48f30d9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unfold_op.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- *     Unless required by applicable law or agreed to in writing, software
- *     distributed under the License is distributed on an "AS IS" BASIS,
- *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *     See the License for the specific language governing permissions and
- *     limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-inline int CalcOutputSize(int input_size, int filter_size, int dilation,
-                          int padding1, int padding2, int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1;
-  PADDLE_ENFORCE(output_size > 0,
-                 "Due to the settings of padding(%d, %d), filter_size(%d), "
-                 "dilation(%d) and "
-                 "stride(%d), the output size is less than 0, please check "
-                 "again. Input_size:%d",
-                 padding1, padding2, filter_size, dilation, stride, input_size);
-
-  return output_size;
-}
-
-template <typename DeviceContext, typename T>
-class UnfoldOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* input = ctx.Input<Tensor>("X");
-    const int batch_size = static_cast<int>(input->dims()[0]);
-    Tensor* output = ctx.Output<Tensor>("Y");
-    output->mutable_data<T>(ctx.GetPlace());
-
-    std::vector<int> kernel_sizes = ctx.Attr<std::vector<int>>("kernel_sizes");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    auto input_dims = input->dims();
-
-    int output_height =
-        CalcOutputSize(input_dims[2], kernel_sizes[0], dilations[0],
-                       paddings[0], paddings[2], strides[0]);
-    int output_width =
-        CalcOutputSize(input_dims[3], kernel_sizes[1], dilations[1],
-                       paddings[1], paddings[3], strides[1]);
-
-    framework::DDim input_shape({input_dims[1], input_dims[2], input_dims[3]});
-    framework::DDim output_matrix_shape({input_dims[1], kernel_sizes[0],
-                                         kernel_sizes[1], output_height,
-                                         output_width});
-
-    for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-      im2col(dev_ctx, in_batch, dilations, strides, paddings, &out_batch);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class UnfoldGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* output_grad = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    input_grad->mutable_data<T>(ctx.GetPlace());
-
-    if ((!output_grad) || (!input_grad)) return;
-
-    std::vector<int> kernel_sizes = ctx.Attr<std::vector<int>>("kernel_sizes");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    const int batch_size = static_cast<int>(input_grad->dims()[0]);
-
-    auto input_dims = input_grad->dims();
-
-    int output_height =
-        CalcOutputSize(input_dims[2], kernel_sizes[0], dilations[0],
-                       paddings[0], paddings[2], strides[0]);
-    int output_width =
-        CalcOutputSize(input_dims[3], kernel_sizes[1], dilations[1],
-                       paddings[1], paddings[3], strides[1]);
-
-    framework::DDim input_shape({input_dims[1], input_dims[2], input_dims[3]});
-    framework::DDim output_matrix_shape({input_dims[1], kernel_sizes[0],
-                                         kernel_sizes[1], output_height,
-                                         output_width});
-
-    math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, input_grad, static_cast<T>(0));
-    for (int i = 0; i < batch_size; i++) {
-      Tensor out_grad_batch =
-          output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
-      Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
-      col2im(dev_ctx, out_grad_batch, dilations, strides, paddings,
-             &in_grad_batch);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
deleted file mode 100644
index 598c9042cfb9f308e12e270172fb0453b6b7e634..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/batch_size_like.h"
-
-namespace paddle {
-namespace operators {
-
-class UniformRandomBatchSizeLikeOp : public BatchSizeLikeOp {
- protected:
-  using BatchSizeLikeOp::BatchSizeLikeOp;
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
-  }
-};
-
-class UniformRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
- protected:
-  void Apply() override {
-    AddComment(R"DOC(
-UniformRandomBatchSizeLike operator.
-
-This operator initializes a tensor with the same batch_size as the Input tensor
-with random values sampled from a uniform distribution.
-
-)DOC");
-    AddAttr<float>("min",
-                   "(float, default -1.0) "
-                   "Minimum value of uniform random")
-        .SetDefault(-1.0f);
-    AddAttr<float>("max",
-                   "(float, default 1.0) "
-                   "Maximun value of uniform random")
-        .SetDefault(1.0f);
-    AddAttr<int>("seed",
-                 "(int, default 0) "
-                 "Random seed used for generating samples. "
-                 "0 means use a seed generated by the system."
-                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
-        .SetDefault(0);
-    AddAttr<int>("diag_num",
-                 "The number of diag elements. Note that if "
-                 "diag_num is 0, it means without diag init.[default 0].")
-        .SetDefault(0);
-    AddAttr<int>("diag_step", "The step between two diag element.[default 0].")
-        .SetDefault(0);
-    AddAttr<float>("diag_val", "The value of diag element. [default 1.0].")
-        .SetDefault(1.0f);
-    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
-        .SetDefault(framework::proto::VarType::FP32);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(uniform_random_batch_size_like,
-                  paddle::operators::UniformRandomBatchSizeLikeOp,
-                  paddle::operators::UniformRandomBatchSizeLikeOpMaker,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::BatchSizeLikeNoNeedBufferVarsInference);
-// Kernels are registered in uniform_random_op.cc and uniform_random_op.cu
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
deleted file mode 100644
index 35fa0d7fc68e50de04dd0fd5a7aa8d6f71cefa30..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
-// Use std::random and thrust::random(thrust is a std library in CUDA) to
-// implement uniform random.
-template <typename T>
-class CPUUniformRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    framework::Tensor *tensor = nullptr;
-    auto out_var = ctx.OutputVar("Out");
-    if (out_var->IsType<framework::LoDTensor>()) {
-      tensor = out_var->GetMutable<framework::LoDTensor>();
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      auto shape = ctx.Attr<std::vector<int64_t>>("shape");
-      auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
-      tensor = selected_rows->mutable_value();
-      tensor->Resize(framework::make_ddim(shape));
-      selected_rows->mutable_rows()->reserve(shape[0]);
-    } else {
-      PADDLE_THROW(
-          "uniform_random_op's output only"
-          "supports SelectedRows and LoDTensor");
-    }
-    T *data = tensor->mutable_data<T>(ctx.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
-    std::uniform_real_distribution<T> dist(
-        static_cast<T>(ctx.Attr<float>("min")),
-        static_cast<T>(ctx.Attr<float>("max")));
-    int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
-    }
-    unsigned int diag_num =
-        static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
-    unsigned int diag_step =
-        static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
-    auto diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
-    if (diag_num > 0) {
-      PADDLE_ENFORCE_GT(size, (diag_num - 1) * (diag_step + 1),
-                        "The index of diagonal elements is out of bounds");
-      for (int64_t i = 0; i < diag_num; ++i) {
-        int64_t pos = i * diag_step + i;
-        data[pos] = diag_val;
-      }
-    }
-  }
-};
-
-class UniformRandomOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of UniformRandomOp should not be null.");
-
-    PADDLE_ENFORCE_LT(ctx->Attrs().Get<float>("min"),
-                      ctx->Attrs().Get<float>("max"),
-                      "uniform_random's min must less then max");
-    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<int>("diag_num"), 0,
-                      "diag_num must greater than or equal 0");
-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<int>("diag_step"), 0,
-                      "diag_step must greater than or equal 0");
-    std::vector<int64_t> temp;
-    temp.reserve(shape.size());
-    for (auto dim : shape) {
-      temp.push_back(static_cast<int64_t>(dim));
-    }
-    ctx->SetOutputDim("Out", framework::make_ddim(temp));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
-  }
-};
-
-class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput("Out", "The output tensor of uniform random op");
-    AddComment(R"DOC(
-This operator initializes a tensor with random values sampled from a
-uniform distribution. The random result is in set [min, max].
-
-)DOC");
-    AddAttr<std::vector<int64_t>>("shape", "The shape of the output tensor");
-    AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].")
-        .SetDefault(-1.0f);
-    AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].")
-        .SetDefault(1.0f);
-    AddAttr<int>("seed",
-                 "Random seed used for generating samples. "
-                 "0 means use a seed generated by the system."
-                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time. [default 0].")
-        .SetDefault(0);
-    AddAttr<int>("diag_num",
-                 "The number of diag elements. Note that if "
-                 "diag_num is 0, it means without diag init.[default 0].")
-        .SetDefault(0);
-    AddAttr<int>("diag_step", "The step between two diag element.[default 0].")
-        .SetDefault(0);
-    AddAttr<float>("diag_val", "The value of diag element. [default 1.0].")
-        .SetDefault(1.0f);
-    AddAttr<int>("dtype", "Output tensor data type. [default 5(FP32)].")
-        .SetDefault(framework::proto::VarType::FP32);
-  }
-};
-
-class UniformRandomOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto out_var_name = ctx->Output("Out").front();
-    auto var_data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(ctx->GetAttr("dtype")));
-
-    if (ctx->GetType(out_var_name) !=
-        framework::proto::VarType::SELECTED_ROWS) {
-      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
-    }
-    ctx->SetDataType(out_var_name, var_data_type);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(uniform_random, paddle::operators::UniformRandomOp,
-                  paddle::operators::UniformRandomOpMaker,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::UniformRandomOpVarTypeInference);
-
-REGISTER_OP_CPU_KERNEL(uniform_random,
-                       paddle::operators::CPUUniformRandomKernel<float>,
-                       paddle::operators::CPUUniformRandomKernel<double>);
-REGISTER_OP_CPU_KERNEL(uniform_random_batch_size_like,
-                       paddle::operators::CPUUniformRandomKernel<float>,
-                       paddle::operators::CPUUniformRandomKernel<double>);
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
deleted file mode 100644
index a9f10d8b297791f9e725d4cf2568945bfbdb685c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <thrust/random.h>
-#include <thrust/transform.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct UniformGenerator {
-  T min_, max_;
-  unsigned int seed_;
-  T diag_val_;
-  unsigned int diag_num_;
-  unsigned int diag_step_;
-  __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num,
-                                       int diag_step, T diag_val)
-      : min_(min),
-        max_(max),
-        seed_(seed),
-        diag_num_(diag_num),
-        diag_step_(diag_step),
-        diag_val_(diag_val) {}
-
-  __host__ __device__ T operator()(const unsigned int n) const {
-    thrust::minstd_rand rng;
-    rng.seed(seed_);
-    thrust::uniform_real_distribution<T> dist(min_, max_);
-    rng.discard(n);
-    T out = dist(rng);
-    unsigned int remainder = n % (diag_step_ + 1);
-    if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) {
-      out = diag_val_;
-    }
-    return out;
-  }
-};
-
-// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
-// Use std::random and thrust::random(thrust is a std library in CUDA) to
-// implement uniform random.
-template <typename T>
-class GPUUniformRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    framework::Tensor* tensor = nullptr;
-    auto out_var = context.OutputVar("Out");
-    if (out_var->IsType<framework::LoDTensor>()) {
-      tensor = out_var->GetMutable<framework::LoDTensor>();
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      auto shape = context.Attr<std::vector<int64_t>>("shape");
-      tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
-      tensor->Resize(framework::make_ddim(shape));
-    } else {
-      PADDLE_THROW(
-          "uniform_random_op's output only"
-          "supports SelectedRows and LoDTensor");
-    }
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    if (seed == 0) {
-      std::random_device rd;
-      seed = rd();
-    }
-    T min = static_cast<T>(context.Attr<float>("min"));
-    T max = static_cast<T>(context.Attr<float>("max"));
-    unsigned int diag_num =
-        static_cast<unsigned int>(context.Attr<int>("diag_num"));
-    unsigned int diag_step =
-        static_cast<unsigned int>(context.Attr<int>("diag_step"));
-    T diag_val = static_cast<T>(context.Attr<float>("diag_val"));
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-    int64_t size = tensor->numel();
-    thrust::transform(
-        index_sequence_begin, index_sequence_begin + size,
-        thrust::device_ptr<T>(data),
-        UniformGenerator<T>(min, max, seed, diag_num, diag_step, diag_val));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(uniform_random,
-                        paddle::operators::GPUUniformRandomKernel<float>,
-                        paddle::operators::GPUUniformRandomKernel<double>);
-REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
-                        paddle::operators::GPUUniformRandomKernel<float>,
-                        paddle::operators::GPUUniformRandomKernel<double>);
diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc
deleted file mode 100644
index 08ce81d75e49c7875fddbce6a91974acd158c5cd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unique_op.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unique_op.h"
-
-namespace paddle {
-namespace operators {
-
-class UniqueOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of UniqueOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of UniqueOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Index"),
-                   "Output(Index) of UniqueOp should not be null.");
-
-    auto in_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(in_dims.size() == 1, "Input(X) should be a vector.");
-
-    ctx->SetOutputDim("Out", {-1});
-    ctx->SetOutputDim("Index", in_dims);
-  }
-};
-
-class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input tensor. It should be a 1-D tensor.");
-    AddAttr<int>("dtype", "data type for output index");
-    AddOutput("Out", "A unique subsequence for input tensor.");
-    AddOutput("Index",
-              "An index tensor pointing to unique subsequence, which has "
-              "identical shape with input tensor and int64 dtype.");
-    AddComment(R"DOC(
-    Return a unique subsequence for 1-D input tensor, and an index tensor pointing to this unique subsequence
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker);
-REGISTER_OP_CPU_KERNEL(unique, ops::UniqueKernel<float>,
-                       ops::UniqueKernel<double>, ops::UniqueKernel<int32_t>,
-                       ops::UniqueKernel<int64_t>);
diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h
deleted file mode 100644
index 4b492e9c819cadd8f36ee5e585ce388e1013b1d8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unique_op.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cmath>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename InT>
-struct UniqueOpFunctor {
-  framework::Tensor* out_;
-  framework::Tensor* index_;
-  const framework::Tensor* in_;
-  framework::Tensor* count_;
-
-  UniqueOpFunctor(framework::Tensor* out, framework::Tensor* index,
-                  const framework::Tensor* in,
-                  framework::Tensor* count = nullptr)
-      : out_(out), index_(index), in_(in), count_(count) {}
-
-  template <typename IndexT>
-  void apply() const {
-    auto* in_data = in_->data<InT>();
-    auto* index_data = index_->mutable_data<IndexT>(platform::CPUPlace());
-
-    int64_t j = 0;
-
-    // TODO(fangzeyang): Should optimize performance here.
-    std::unordered_map<InT, int64_t> dict;
-    std::vector<InT> uniq;
-
-    PADDLE_ENFORCE(in_->numel() < pow(2, 31),
-                   "numel of Unique op input should less than INT_MAX");
-
-    for (auto i = 0; i < in_->numel(); i++) {
-      auto it = dict.find(in_data[i]);
-      if (it == dict.end()) {
-        dict.emplace(std::make_pair(in_data[i], j));
-        uniq.emplace_back(in_data[i]);
-        index_data[i] = static_cast<IndexT>(j);
-        j++;
-      } else {
-        index_data[i] = static_cast<IndexT>(it->second);
-      }
-    }
-
-    if (count_ != nullptr) {
-      // Resize the count tensor dims to allocate the memory
-      count_->Resize(framework::make_ddim({static_cast<int64_t>(uniq.size())}));
-      IndexT* count_data = count_->mutable_data<IndexT>(platform::CPUPlace());
-      // init count_data to 0
-      memset(count_data, 0, uniq.size() * sizeof(IndexT));
-
-      const auto& index_type = index_->type();
-      bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                              index_type == framework::proto::VarType::INT64;
-      PADDLE_ENFORCE(
-          index_type_match,
-          "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-          paddle::framework::DataTypeToString(index_type),
-          paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-          paddle::framework::DataTypeToString(
-              framework::proto::VarType::INT64));
-
-      if (index_type == framework::proto::VarType::INT32) {
-        for (auto i = 0; i < in_->numel(); ++i) {
-          const IndexT& index = index_data[i];
-          count_data[static_cast<int32_t>(index)] += static_cast<IndexT>(1);
-        }
-      } else {
-        for (auto i = 0; i < in_->numel(); ++i) {
-          const IndexT& index = index_data[i];
-          count_data[static_cast<int64_t>(index)] += static_cast<IndexT>(1);
-        }
-      }
-    }
-
-    out_->Resize(framework::make_ddim({static_cast<int64_t>(uniq.size())}));
-    auto out_data = out_->mutable_data<InT>(platform::CPUPlace());
-    std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT));
-  }
-};
-
-template <typename T>
-class UniqueKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto data_type = static_cast<framework::proto::VarType::Type>(
-        context.Attr<int>("dtype"));
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* index = context.Output<framework::Tensor>("Index");
-
-    framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/unique_with_counts_op.cc b/paddle/fluid/operators/unique_with_counts_op.cc
deleted file mode 100644
index 770bbefea15d2111cf1ca0b98e7e51ea1dd5195b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unique_with_counts_op.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unique_with_counts_op.h"
-
-namespace paddle {
-namespace operators {
-
-class UniqueWithCountsOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of UniqueWithCountsOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of UniqueWithCountsOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Index"),
-                   "Output(Index) of UniqueWithCountsOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Count"),
-                   "Output(Count) of UniqueWithCountsOp should not be null.");
-
-    auto in_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(in_dims.size() == 1,
-                   "The op of fluid.layers.unique_with_counts, Input(X) should "
-                   "be a vector.");
-
-    ctx->SetOutputDim("Out", {-1});
-    ctx->SetOutputDim("Index", in_dims);
-    ctx->SetOutputDim("Count", {-1});
-  }
-};
-
-class UniqueWithCountsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input tensor. It should be a 1-D tensor.");
-    AddAttr<int>("dtype", "data type for output index");
-    AddOutput("Out", "A unique subsequence for input tensor.");
-    AddOutput("Index",
-              "An index tensor pointing to unique subsequence, which has "
-              "identical shape with input tensor and the data type is set by "
-              "the attr `dtype`");
-    AddOutput("Count", "A subsequence for the count of unique index");
-    AddComment(R"DOC(
-    Return a unique subsequence for 1-D input tensor, index tensor pointing to this unique subsequence, 
-    and the subsequence for the count of unique index.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(unique_with_counts, ops::UniqueWithCountsOp,
-                             ops::UniqueWithCountsOpMaker);
-REGISTER_OP_CPU_KERNEL(unique_with_counts, ops::UniqueWithCountsKernel<float>,
-                       ops::UniqueWithCountsKernel<double>,
-                       ops::UniqueWithCountsKernel<int32_t>,
-                       ops::UniqueWithCountsKernel<int64_t>);
diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h
deleted file mode 100644
index f61bac7cda003041c96de5d6abbeda4d8ee3e9bf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unique_with_counts_op.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cmath>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/unique_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class UniqueWithCountsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto data_type = static_cast<framework::proto::VarType::Type>(
-        context.Attr<int>("dtype"));
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* index = context.Output<framework::Tensor>("Index");
-    auto* count = context.Output<framework::Tensor>("Count");
-    framework::VisitDataType(data_type,
-                             UniqueOpFunctor<T>(out, index, x, count));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
deleted file mode 100644
index fae5041c9328fe48aed388c1400aefaaf8bea5e7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unpool_op.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unpool_op.h"
-#include <string>
-#include <vector>
-namespace paddle {
-namespace operators {
-
-class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor of unpool operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of feature.");
-    AddInput(
-        "Indices",
-        "(Tensor) The input tensor of the indices given out by MaxPool2d. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of feature.");
-    AddOutput("Out",
-              "(Tensor) The output tensor of unpool operator."
-              "The format of output tensor is also NCHW."
-              "Where N is batch size, C is "
-              "the number of channels, H and W is the height and "
-              "width of feature.");
-    AddAttr<std::vector<int>>(
-        "ksize",
-        "(vector), the unpooling window size(height, width) "
-        "of unpooling operator.");
-    AddAttr<std::vector<int>>("strides",
-                              "(vector, default:{1, 1}), "
-                              "strides (height, width) of unpooling operator.")
-        .SetDefault({1, 1});
-    AddAttr<std::vector<int>>("paddings",
-                              "(vector default:{0,0}), "
-                              "paddings (height, width) of unpooling operator.")
-        .SetDefault({0, 0});
-    AddAttr<std::string>(
-        "unpooling_type",
-        "(string), unpooling type, can be \"max\" for max-unpooling ")
-        .InEnum({"max"});
-    AddComment(R"DOC(
-Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is:
-$(N, C_{out}, H_{out}, W_{out})$, where
-$$
-H_{out} = (H_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\
-W_{out} = (W_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1]
-$$
-Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
-)DOC");
-  }
-};
-
-int UnpoolOutputSize(int input_size, int ksize, int padding, int stride) {
-  int output_size = (input_size - 1) * stride - 2 * padding + ksize;
-  return output_size;
-}
-
-class UnpoolOp : public framework::OperatorWithKernel {
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of UnpoolOp"
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input(Indices) of UnpoolOp"
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of UnpoolOp should not be null.");
-    auto in_x_dims = ctx->GetInputDim("X");
-    auto in_y_dims = ctx->GetInputDim("Indices");
-    std::string unpooling_type =
-        ctx->Attrs().Get<std::string>("unpooling_type");
-    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    PADDLE_ENFORCE(in_x_dims.size() == 4,
-                   "Unpooling intput must be of 4-dimensional.");
-    PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
-
-    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
-    for (size_t i = 0; i < ksize.size(); ++i) {
-      if (!ctx->IsRuntime() && in_x_dims[i + 2] <= 0) {
-        output_shape.push_back(-1);
-      } else {
-        output_shape.push_back(UnpoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                                paddings[i], strides[i]));
-      }
-    }
-    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-  }
-};
-
-class UnpoolOpGrad : public framework::OperatorWithKernel {
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Input(X@GRAD) should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    unpool, ops::UnpoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnpoolKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    unpool_grad,
-    ops::UnpoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnpoolGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc
deleted file mode 100644
index 7c59a0feaa472e57b3c9505a88de6743cbf71a74..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unpool_op.cu.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unpool_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    unpool, ops::UnpoolKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnpoolKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    unpool_grad,
-    ops::UnpoolGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnpoolGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h
deleted file mode 100644
index e388ec5ae3937aadebdcd8ecce8d82dae05be7cd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unpool_op.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/unpooling.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class UnpoolKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
-    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
-    auto* out = context.Output<framework::Tensor>("Out");
-    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    T* output_data = out->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    if (output_data) {
-      math::SetConstant<DeviceContext, T> set_zero;
-      set_zero(dev_ctx, out, static_cast<T>(0));
-    }
-    math::Unpool2dMaxFunctor<DeviceContext, T> unpool2d_max_forward;
-    unpool2d_max_forward(dev_ctx, *in_x, *in_y, out);
-  }
-};
-template <typename DeviceContext, typename T>
-class UnpoolGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
-    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
-    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
-    const framework::Tensor* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    framework::Tensor* in_x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-
-    auto& device_ctx = context.template device_context<DeviceContext>();
-    math::SetConstant<DeviceContext, T> zero;
-
-    in_x_grad->mutable_data<T>(context.GetPlace());
-    zero(device_ctx, in_x_grad, static_cast<T>(0));
-
-    math::Unpool2dMaxGradFunctor<DeviceContext, T> unpool2d_max_backward;
-    unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
deleted file mode 100644
index fc849e73c579f3457852e05dec404c001b74b19e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unsqueeze_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class UnsqueezeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of Unsqueeze operator should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of Unsqueeze operator should not be null.");
-
-    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
-    const auto &x_dims = ctx->GetInputDim("X");
-    // Validity Check: input tensor dims (<6).
-    PADDLE_ENFORCE_LE(x_dims.size(), 6,
-                      "Invalid dimensions, the rank of Input(X) "
-                      "should be in the range of [1, 6] (Eigen limit)");
-    auto out_dims = GetOutputShape(axes, x_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", "Out");
-    }
-  }
-
-  static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
-                                        const framework::DDim &in_dims) {
-    int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
-    int cur_output_size = in_dims.size();
-    std::vector<int64_t> output_shape(output_size, 0);
-
-    // Validity Check: rank range.
-    PADDLE_ENFORCE_LE(output_size, 6,
-                      "The output tensor's rank should be less than 6.");
-
-    for (int axis : unsqz_dims) {
-      int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
-      // Vaildity Check: the axis bound
-      PADDLE_ENFORCE_GE(cur, 0);
-      PADDLE_ENFORCE_LE(cur, cur_output_size);
-      // Move old axis, and insert new axis
-      for (int i = cur_output_size; i >= cur; --i) {
-        if (output_shape[i] == 1) {
-          // Move axis
-          output_shape[i + 1] = 1;
-          output_shape[i] = 0;
-        }
-      }
-      output_shape[cur] = 1;
-      // Add the output size.
-      cur_output_size++;
-    }
-
-    // Make output shape
-    for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
-      if (output_shape[out_idx] == 0) {
-        output_shape[out_idx] = in_dims[in_idx++];
-      }
-    }
-
-    return framework::make_ddim(output_shape);
-  }
-};
-
-class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor). The input tensor of unsqueeze operator.");
-    AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator.");
-    AddAttr<std::vector<int>>("axes",
-                              "(std::vector<int>). List of integers,"
-                              " indicating the dimensions to be inserted")
-        .AddCustomChecker([](const std::vector<int> &axes) {
-          PADDLE_ENFORCE_EQ(!axes.empty(), true,
-                            "Invalid axes, The unsqueeze axes is empty.");
-          // Validity Check: axes dims (<6).
-          PADDLE_ENFORCE_LT(static_cast<int>(axes.size()), 6,
-                            "Invalid dimensions, dynamic dimensions should be "
-                            "within [1, 6] dimensions (Eigen limit).");
-          // Validity Check: the range of unsqueeze aixs.
-          for (int axis : axes) {
-            PADDLE_ENFORCE_LT(axis, 6,
-                              "Invalid dimensions, input axis should be"
-                              " within [1, 6] dimensions (Eigen limit).");
-          }
-        });
-    AddComment(R"DOC(
-    Unsqueeze Operator.
-
-    Insert single-dimensional entries to the shape of a tensor.
-    Takes one required argument axes, a list of dimensions that will be inserted.
-    Dimension indices in axes are as seen in the output tensor.
-
-    For example:
-      Given a tensor such that tensor with shape [3, 4, 5],
-      then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
-    )DOC");
-  }
-};
-
-class UnsqueezeGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", framework::GradVarName("X"));
-  }
-};
-
-// FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on
-// unsqueeze, the XShape is used to carry the shape and lod of X which
-// will be used in unsqueeze_grad, in this way, the framework can reuse
-// the memory of X immediately the unsqueeze2_op is finished.
-// Considering compatibility issues, we could not fix unsqueeze2_op
-class Unsqueeze2Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of Unsqueeze operator should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of Unsqueeze operator should not be null.");
-
-    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
-    const auto &x_dims = ctx->GetInputDim("X");
-    // Validity Check: input tensor dims (<6).
-    PADDLE_ENFORCE_LE(x_dims.size(), 6,
-                      "Invalid dimensions, the rank of Input(X) "
-                      "should be in the range of [1, 6] (Eigen limit)");
-    auto out_dims = UnsqueezeOp::GetOutputShape(axes, x_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", "Out");
-    }
-
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("XShape"), true,
-        "Output(XShape) of Unsqueeze operator should not be null.");
-    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
-    xshape_dims[0] = 0;
-    for (int i = 0; i < x_dims.size(); ++i) {
-      xshape_dims[i + 1] = x_dims[i];
-    }
-    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
-    ctx->ShareLoD("X", /*->*/ "XShape");
-  }
-};
-
-class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
- public:
-  void Make() override {
-    UnsqueezeOpMaker::Make();
-    AddOutput("XShape",
-              "XShape is just used to store the shape and lod of X, which will "
-              "be used in UnsqueezeGradOp.")
-        .AsIntermediate();
-  }
-};
-
-class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto *grad_op = new framework::OpDesc();
-    grad_op->SetType("unsqueeze2_grad");
-    grad_op->SetInput("XShape", Output("XShape"));
-    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(grad_op);
-  }
-};
-
-class Unsqueeze2GradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE_EQ(context->HasInput("XShape"), true,
-                      "Input(XShape) shouldn't be null.");
-    PADDLE_ENFORCE_EQ(context->HasInput(framework::GradVarName("Out")), true,
-                      "Input(Out@GRAD) shouldn't be null.");
-    auto xshape_dims = context->GetInputDim("XShape");
-    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
-    context->SetOutputDim(framework::GradVarName("X"), x_dims);
-    context->ShareLoD("XShape", framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp);
-
-REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
-                  ops::Unsqueeze2GradOpMaker, ops::UnsqueezeInplaceInferer);
-REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
-                  ops::UnsqueezeGradInplaceInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze_grad,
-    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze2,
-    ops::Unsqueeze2Kernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Unsqueeze2Kernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Unsqueeze2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Unsqueeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Unsqueeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze2_grad,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc
deleted file mode 100644
index fbdec5af94a570f430f9c50a16fe01b69a4f2d14..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unsqueeze_op.cu.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unsqueeze_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze_grad,
-    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze2,
-    ops::Unsqueeze2Kernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Unsqueeze2Kernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Unsqueeze2Kernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Unsqueeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Unsqueeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze2_grad,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h
deleted file mode 100644
index 68f0cbe81223126c3f850a6e738c7b581910c69d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unsqueeze_op.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class UnsqueezeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto &axes = context.Attr<std::vector<int>>("axes");
-    auto *in = context.Input<framework::LoDTensor>("X");
-    auto *out = context.Output<framework::LoDTensor>("Out");
-    auto x_dims = in->dims();
-    auto out_dims = GetOutputShape(axes, x_dims);
-
-    out->mutable_data(context.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, context.GetPlace(),
-        context.template device_context<platform::DeviceContext>(), out);
-    out->Resize(out_dims);
-  }
-
-  static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
-                                        const framework::DDim &in_dims) {
-    int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
-    int cur_output_size = in_dims.size();
-    std::vector<int64_t> output_shape(output_size, 0);
-
-    // Validity Check: rank range.
-    PADDLE_ENFORCE_LE(output_size, 6,
-                      "The output tensor's rank should be less than 6.");
-
-    for (int axis : unsqz_dims) {
-      int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
-      // Vaildity Check: the axis bound
-      PADDLE_ENFORCE_GE(cur, 0);
-      PADDLE_ENFORCE_LE(cur, cur_output_size);
-      // Move old axis, and insert new axis
-      for (int i = cur_output_size; i >= cur; --i) {
-        if (output_shape[i] == 1) {
-          // Move axis
-          output_shape[i + 1] = 1;
-          output_shape[i] = 0;
-        }
-      }
-      output_shape[cur] = 1;
-      // Add the output size.
-      cur_output_size++;
-    }
-
-    // Make output shape
-    for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
-      if (output_shape[out_idx] == 0) {
-        output_shape[out_idx] = in_dims[in_idx++];
-      }
-    }
-
-    return framework::make_ddim(output_shape);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class UnsqueezeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *d_out =
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto in_dims = ctx.Input<framework::LoDTensor>("X")->dims();
-
-    d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
-    d_x->Resize(in_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Unsqueeze2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *out = context.Output<framework::LoDTensor>("Out");
-    auto *in = context.Input<framework::LoDTensor>("X");
-
-    auto &axes = context.Attr<std::vector<int>>("axes");
-
-    auto x_dims = in->dims();
-    auto out_dims =
-        UnsqueezeKernel<DeviceContext, T>::GetOutputShape(axes, x_dims);
-
-    out->mutable_data(context.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, context.GetPlace(),
-        context.template device_context<platform::DeviceContext>(), out);
-    out->Resize(out_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Unsqueeze2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *d_out =
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    // auto in_dims = d_x->dims();
-
-    auto xshape_dims = ctx.Input<framework::LoDTensor>("XShape")->dims();
-    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
-
-    d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
-    d_x->Resize(x_dims);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc
deleted file mode 100644
index 204aa1fa6709485b7f277270cd4cc8e32b757515..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unstack_op.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unstack_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-class UnStackOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must exist.");
-
-    int axis = ctx->Attrs().Get<int>("axis");
-    int num = ctx->Attrs().Get<int>("num");
-    auto x_dim = ctx->GetInputDim("X");
-    int rank = x_dim.size();
-    PADDLE_ENFORCE_GE(
-        axis, -rank, "Attr(axis) must be inside [-rank, rank), where rank = %d",
-        rank);
-    PADDLE_ENFORCE_LT(
-        axis, rank, "Attr(axis) must be inside [-rank, rank), where rank = %d",
-        rank);
-    if (axis < 0) axis += rank;
-
-    PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast<size_t>(num),
-                      "Number of Outputs(Y) is wrong");
-    if (x_dim[axis] > 0) {
-      PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong");
-    }
-    auto vec = framework::vectorize<int>(x_dim);
-    vec.erase(vec.begin() + axis);
-    ctx->SetOutputsDim("Y", std::vector<framework::DDim>(  // NOLINT
-                                x_dim[axis], framework::make_ddim(vec)));
-  }
-};
-
-class UnStackOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of unstack op.");
-    AddOutput("Y", "The output of unstack op.").AsDuplicable();
-    AddAttr<int>("axis", "The axis along which Input(X) should be unstacked.")
-        .SetDefault(0);
-    AddAttr<int>("num", "The number of outputs(Y).").GreaterThan(0);
-    AddComment(R"DOC(
-      UnStack Operator.
-
-      UnStack Input(X) into several tensors along Attr(axis).
-    )DOC");
-  }
-};
-
-class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("unstack_grad");
-    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class UnStackGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0,
-                      "Number of Inputs(Y@Grad) must be larger than 0");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
-                      "Output(X@Grad) must exist.");
-
-    auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y"));
-    for (size_t i = 1; i < input_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0],
-                        "Dims of all Inputs(Y@Grad) must be the same");
-    }
-
-    int axis = ctx->Attrs().Get<int>("axis");
-    int rank = input_dims[0].size();
-    PADDLE_ENFORCE_GE(
-        axis, -(rank + 1),
-        "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank);
-    PADDLE_ENFORCE_LT(
-        axis, rank + 1,
-        "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank);
-    if (axis < 0) axis += (rank + 1);
-
-    auto vec = framework::vectorize<int>(input_dims[0]);
-    vec.insert(vec.begin() + axis, input_dims.size());
-    ctx->SetOutputDim(framework::GradVarName("X"), framework::make_ddim(vec));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace plat = paddle::platform;
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker,
-                  ops::UnStackGradOpDescMaker);
-
-REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp);
-
-REGISTER_OP_CPU_KERNEL(unstack,
-                       ops::UnStackKernel<plat::CPUDeviceContext, float>,
-                       ops::UnStackKernel<plat::CPUDeviceContext, double>,
-                       ops::UnStackKernel<plat::CPUDeviceContext, int>,
-                       ops::UnStackKernel<plat::CPUDeviceContext, int64_t>);
-
-REGISTER_OP_CPU_KERNEL(unstack_grad,
-                       ops::UnStackGradKernel<plat::CPUDeviceContext, float>,
-                       ops::UnStackGradKernel<plat::CPUDeviceContext, double>,
-                       ops::UnStackGradKernel<plat::CPUDeviceContext, int>,
-                       ops::UnStackGradKernel<plat::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/unstack_op.cu b/paddle/fluid/operators/unstack_op.cu
deleted file mode 100644
index b591898a4d7aa3918e41118f1f1b3137f4638a18..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unstack_op.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unstack_op.h"
-
-namespace plat = paddle::platform;
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    unstack, ops::UnStackKernel<plat::CUDADeviceContext, float>,
-    ops::UnStackKernel<plat::CUDADeviceContext, double>,
-    ops::UnStackKernel<plat::CUDADeviceContext, int>,
-    ops::UnStackKernel<plat::CUDADeviceContext, int64_t>,
-    ops::UnStackKernel<plat::CUDADeviceContext, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    unstack_grad, ops::UnStackGradKernel<plat::CUDADeviceContext, float>,
-    ops::UnStackGradKernel<plat::CUDADeviceContext, double>,
-    ops::UnStackGradKernel<plat::CUDADeviceContext, int>,
-    ops::UnStackGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::UnStackGradKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h
deleted file mode 100644
index 6344ea16f81cddb1c8f4f07f28fd318f40296427..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unstack_op.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-
-#ifdef __NVCC__
-#include <thrust/device_vector.h>
-#include "paddle/fluid/framework/array.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename VecXType, typename T>
-struct StackFunctor {
-  HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post)
-      : x_(x), y_(y), n_(n), post_(post) {}
-
-  HOSTDEVICE void operator()(int idx) {
-    int i = idx / (n_ * post_);
-    int which_x = idx / post_ - i * n_;
-    int x_index = i * post_ + idx % post_;
-    y_[idx] = x_[which_x][x_index];
-  }
-
- private:
-  VecXType x_;
-  T *y_;
-  int n_;
-  int post_;
-};
-
-template <typename VecDxType, typename T>
-struct StackGradFunctor {
-  HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post)
-      : dx_(dx), dy_(dy), n_(n), post_(post) {}
-
-  HOSTDEVICE void operator()(int idx) {
-    int i = idx / (n_ * post_);
-    int which_x = idx / post_ - i * n_;
-    int x_index = i * post_ + idx % post_;
-    dx_[which_x][x_index] = dy_[idx];
-  }
-
- private:
-  VecDxType dx_;
-  const T *dy_;
-  int n_;
-  int post_;
-};
-
-template <typename DeviceContext, typename VecXType, typename T>
-static inline void StackFunctorForRange(const DeviceContext &ctx,
-                                        const VecXType &x, T *y, int total_num,
-                                        int n, int post) {
-  platform::ForRange<DeviceContext> for_range(ctx, total_num);
-  for_range(StackFunctor<VecXType, T>(x, y, n, post));
-}
-
-template <typename DeviceContext, typename VecDxType, typename T>
-static inline void StackGradFunctorForRange(const DeviceContext &ctx,
-                                            const VecDxType &dx, const T *dy,
-                                            int total_num, int n, int post) {
-  platform::ForRange<DeviceContext> for_range(ctx, total_num);
-  for_range(StackGradFunctor<VecDxType, T>(dx, dy, n, post));
-}
-
-template <typename DeviceContext, typename T>
-class UnStackGradKernel : public framework::OpKernel<T> {
-  using Tensor = framework::LoDTensor;
-
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto x = ctx.MultiInput<Tensor>(framework::GradVarName("Y"));
-    auto *y = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis += (x[0]->dims().size() + 1);
-
-    int n = static_cast<int>(x.size());
-    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
-    std::vector<const T *> x_datas(n);
-    for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
-
-    int pre = 1;
-    int post = 1;
-    auto &dim = x[0]->dims();
-    for (auto i = 0; i < axis; ++i) pre *= dim[i];
-    for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
-
-#ifdef __NVCC__
-    int total_num = pre * n * post;
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-
-    thrust::device_vector<const T *> device_x_vec(x_datas);
-    auto x_data_arr = device_x_vec.data().get();
-
-    StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
-
-    // Wait() must be called because device_x_vec may be destructed before
-    // kernel ends
-    dev_ctx.Wait();
-#else
-    auto x_data_arr = x_datas.data();
-
-    size_t x_offset = 0;
-    size_t y_offset = 0;
-    for (int i = 0; i < pre; i++) {
-      for (int j = 0; j < n; j++) {
-        std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
-                    post * sizeof(T));
-        y_offset += post;
-      }
-      x_offset += post;
-    }
-#endif
-  }
-};
-
-template <typename DeviceContext, typename T>
-class UnStackKernel : public framework::OpKernel<T> {
-  using Tensor = framework::LoDTensor;
-
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *dy = ctx.Input<Tensor>("X");
-    auto dx = ctx.MultiOutput<Tensor>("Y");
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis += dy->dims().size();
-
-    int n = dy->dims()[axis];
-    std::vector<T *> dx_datas(n);  // NOLINT
-    for (int i = 0; i < n; i++) {
-      dx_datas[i] = dx[i]->mutable_data<T>(ctx.GetPlace());
-    }
-    auto dy_data = dy->data<T>();
-
-    int pre = 1;
-    for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
-    int total_num = dy->numel();
-    int post = total_num / (n * pre);
-
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-#ifdef __NVCC__
-    thrust::device_vector<T *> device_dx_vec(dx_datas);
-    auto dx_data_arr = device_dx_vec.data().get();
-#else
-    auto dx_data_arr = dx_datas.data();
-#endif
-    StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
-#ifdef __NVCC__
-    // Wait() must be called because device_dx_vec may be destructed before
-    // kernel ends
-    dev_ctx.Wait();
-#endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc
deleted file mode 100644
index 232075203a0705ba5c68c80bae7cbf4613cbb970..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/var_conv_2d_op.cc
+++ /dev/null
@@ -1,431 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/var_conv_2d_op.h"
-#include <vector>
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/dynload/mklml.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-void VarConv2dOpMaker::Make() {
-  AddInput("X",
-           "X (LoDTensor, default LoDTensor<float>) Input variable which "
-           "should contain lod information.");
-  AddInput("ROW", "(LoDTensor) the row variable provides lod information");
-  AddInput("COLUMN",
-           "(LoDTensor) the column variable provides lod information");
-  AddInput("W", "W (Tensor), the filter.");
-  AddAttr<int>("InputChannel", "the input filter num").SetDefault(1);
-  AddAttr<int>("OutputChannel", "the output filter num").SetDefault(1);
-  AddAttr<int>("StrideH", "the height of Stride").SetDefault(1);
-  AddAttr<int>("StrideW", "the width of Stride").SetDefault(1);
-  AddAttr<int>("KernelH", "the height of Kernel").SetDefault(1);
-  AddAttr<int>("KernelW", "the width of Kernel").SetDefault(1);
-
-  AddOutput("Out", "(LoDTensor, default LoDTensor<float>) Output variable");
-  AddOutput("Col",
-            "(LoDTensor, default LoDTensor<float>) the intermediate result "
-            "variable");
-
-  AddComment(R"DOC(
-    Var Size Conv Operator
-
-    This operator calculate Out = \sigma \left ( W * X + b \right ), 
-    only support 2-D for X.
-    
-    NOTE: only support 'float32' data type now.
-
-  )DOC");
-}
-
-void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "X(Input) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("W"),
-                 "W(Input) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("ROW"),
-                 "Input(ROW) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("COLUMN"),
-                 "Input(COLUMN) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "Out(Output) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Col"),
-                 "Col(Output) of VarConv2dOP should not be null.");
-
-  auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2,
-                    "The rank of X(Input) can't be less than 2.");
-
-  auto w_dims = ctx->GetInputDim("W");
-
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor");
-  int output_channel = ctx->Attrs().Get<int>("OutputChannel");
-  int input_channel = ctx->Attrs().Get<int>("InputChannel");
-  int kernel_h = ctx->Attrs().Get<int>("KernelH");
-  int kernel_w = ctx->Attrs().Get<int>("KernelW");
-  PADDLE_ENFORCE_EQ(w_dims[0], output_channel,
-                    "W dim[0] should be equal to OutputChannel");
-  PADDLE_ENFORCE_EQ(
-      w_dims[1], input_channel * kernel_h * kernel_w,
-      "W dim[1] should be equal to InputChannel * StrideH * StrideW");
-
-  if (ctx->IsRuntime()) {
-    framework::Variable* x_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
-    const auto& x_lod = x_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info.");
-
-    PADDLE_ENFORCE_GE(x_lod.size(), 1, "The Input(X)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[0], static_cast<int64_t>(x_lod[0].back()),
-        "The Input(X)'s lod info mismatches the actual tensor shape.");
-
-    framework::Variable* row_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("ROW")[0]);
-    const auto& row_lod = row_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE(!row_lod.empty(), "The Input(ROW) must hold lod info.");
-
-    framework::Variable* col_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("COLUMN")[0]);
-    const auto& col_lod = col_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE(!col_lod.empty(), "The Input(COLUMN) must hold lod info.");
-  } else {
-    std::vector<int64_t> out_dims_vec{-1};
-    out_dims_vec.push_back(1);
-    std::vector<int64_t> col_dims_vec{-1};
-    col_dims_vec.push_back(1);
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
-    ctx->SetOutputDim("Col", framework::make_ddim(col_dims_vec));
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CPUVarConv2dOPKernel : public framework::OpKernel<T> {
- public:
-  void Im2Col(const framework::ExecutionContext& ctx, const LoDTensor& input,
-              LoDTensor* col) const {
-    int input_channel = ctx.Attr<int>("InputChannel");
-    auto* in_row = ctx.Input<LoDTensor>("ROW");
-    auto* in_col = ctx.Input<LoDTensor>("COLUMN");
-    int kernel_h = ctx.Attr<int>("KernelH");
-    int kernel_w = ctx.Attr<int>("KernelW");
-    int stride_h = ctx.Attr<int>("StrideH");
-    int stride_w = ctx.Attr<int>("StrideW");
-
-    int batch = input.lod()[0].size() - 1;
-    const auto& bottom_offset = input.lod()[0];
-    // 2-D lod info.
-    const auto& offset_x = in_col->lod()[0];
-    const auto& offset_y = in_row->lod()[0];
-
-    // top offset is the whole size of each data sample
-    std::vector<size_t> top_offset;
-    int top_size = 0;
-    top_offset.push_back(top_size);
-    for (int b = 0; b < batch; ++b) {
-      int width = offset_x[b + 1] - offset_x[b];
-      int height = offset_y[b + 1] - offset_y[b];
-      int top_im_x = 0;
-      if (width == 0) {
-        top_im_x = 0;
-      } else {
-        top_im_x = (width - 1) / stride_w + 1;
-      }
-      int top_im_y = 0;
-      if (height == 0) {
-        top_im_y = 0;
-      } else {
-        top_im_y = (height - 1) / stride_h + 1;
-      }
-      int top_x = top_im_y * top_im_x;
-      int top_y = input_channel * kernel_h * kernel_w;
-      top_size += top_y * top_x;
-      top_offset.push_back(top_size);
-    }
-    framework::LoD col_lod;
-    col_lod.push_back(top_offset);
-    col->set_lod(col_lod);
-    std::vector<int64_t> col_dims_vec{top_size};
-    col_dims_vec.push_back(1);
-    auto* top_data = col->mutable_data<T>(framework::make_ddim(col_dims_vec),
-                                          ctx.GetPlace());
-    auto* bottom_data = input.data<T>();
-
-    int kernel_win_size = kernel_h * kernel_w;
-    int half_kernel_h = kernel_h / 2;
-    int half_kernel_w = kernel_w / 2;
-    for (int b = 0; b < batch; ++b) {
-      int t_offset = top_offset[b];
-      int b_offset = bottom_offset[b];
-      int width = offset_x[b + 1] - offset_x[b];
-      int height = offset_y[b + 1] - offset_y[b];
-      if (width == 0 || height == 0) {
-        continue;
-      }
-      int top_im_x = (width - 1) / stride_w + 1;
-      int top_im_y = (height - 1) / stride_h + 1;
-      int top_x = top_im_y * top_im_x;
-      for (int z = 0; z < input_channel; ++z) {
-        int row_offset = kernel_win_size * z;
-        int im_offset = z * width * height;
-        for (int y = 0; y < height; y += stride_h) {
-          for (int x = 0; x < width; x += stride_w) {
-            int col_offset = x / stride_w + y / stride_h * top_im_x;
-            for (int ky = 0; ky < kernel_h; ++ky) {
-              for (int kx = 0; kx < kernel_w; ++kx) {
-                int im_y = y + ky - half_kernel_h;
-                int im_x = x + kx - half_kernel_w;
-                if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) {
-                  top_data[t_offset +
-                           (row_offset + ky * kernel_w + kx) * top_x +
-                           col_offset] =
-                      bottom_data[b_offset + im_offset + im_y * width + im_x];
-                } else {
-                  top_data[t_offset +
-                           (row_offset + ky * kernel_w + kx) * top_x +
-                           col_offset] = 0;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* bottom = ctx.Input<LoDTensor>("X");
-    auto* in_row = ctx.Input<LoDTensor>("ROW");
-    auto* in_col = ctx.Input<LoDTensor>("COLUMN");
-    auto* w = ctx.Input<Tensor>("W");
-    auto* top = ctx.Output<LoDTensor>("Out");
-    auto* col = ctx.Output<LoDTensor>("Col");
-
-    int output_channel = ctx.Attr<int>("OutputChannel");
-    int input_channel = ctx.Attr<int>("InputChannel");
-    int kernel_h = ctx.Attr<int>("KernelH");
-    int kernel_w = ctx.Attr<int>("KernelW");
-    int stride_h = ctx.Attr<int>("StrideH");
-    int stride_w = ctx.Attr<int>("StrideW");
-
-    Im2Col(ctx, *bottom, col);
-    int batch = bottom->lod()[0].size() - 1;
-    const auto& col_offset = col->lod()[0];
-    const auto& offset_x = in_col->lod()[0];
-    const auto& offset_y = in_row->lod()[0];
-    std::vector<size_t> top_offset;
-    int top_size = 0;
-    top_offset.push_back(top_size);
-    for (int b = 0; b < batch; ++b) {
-      int width = offset_x[b + 1] - offset_x[b];
-      int height = offset_y[b + 1] - offset_y[b];
-      int top_im_x = 0;
-      if (width == 0) {
-        top_im_x = 0;
-      } else {
-        top_im_x = (width - 1) / stride_w + 1;
-      }
-      int top_im_y = 0;
-      if (height == 0) {
-        top_im_y = 0;
-      } else {
-        top_im_y = (height - 1) / stride_h + 1;
-      }
-      int top_im_size = top_im_y * top_im_x;
-      top_size += output_channel * top_im_size;
-      top_offset.push_back(top_size);
-    }
-
-    framework::LoD top_lod;
-    top_lod.push_back(top_offset);
-
-    top->set_lod(top_lod);
-    std::vector<int64_t> top_dims_vec{top_size};
-    top_dims_vec.push_back(1);
-    auto* top_data = top->mutable_data<T>(framework::make_ddim(top_dims_vec),
-                                          ctx.GetPlace());
-
-    auto* w_data = w->data<T>();
-    auto* col_data = col->data<T>();
-
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-    for (int b = 0; b < batch; ++b) {
-      int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel;
-      if (top_im_size == 0) {
-        continue;
-      }
-
-      blas.GEMM(CblasNoTrans, CblasNoTrans, output_channel, top_im_size,
-                input_channel * kernel_h * kernel_w, 1.0, w_data,
-                col_data + col_offset[b], 0.0, top_data + top_offset[b]);
-    }
-  }
-};
-
-void VarConv2dOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "Input(X) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("W"),
-                 "Input(W) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                 "Input(Out@GRAD) of SequencePadGradOp should not be null.");
-
-  if (ctx->HasOutput(framework::GradVarName("X"))) {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-  if (ctx->HasOutput(framework::GradVarName("W"))) {
-    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CPUVarConv2dOPGradKernel : public framework::OpKernel<T> {
- public:
-  void Im2ColGrad(const framework::ExecutionContext& ctx, T* top_diff) const {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* in_row = ctx.Input<LoDTensor>("ROW");
-    auto* in_col = ctx.Input<LoDTensor>("COLUMN");
-    auto* col = ctx.Input<LoDTensor>("Col");
-
-    int input_channel = ctx.Attr<int>("InputChannel");
-    int kernel_h = ctx.Attr<int>("KernelH");
-    int kernel_w = ctx.Attr<int>("KernelW");
-    int stride_h = ctx.Attr<int>("StrideH");
-    int stride_w = ctx.Attr<int>("StrideW");
-
-    auto* dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    memset(dx_data, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T));
-
-    const auto& bottom_offset = x->lod()[0];
-    const auto& offset_x = in_col->lod()[0];
-    const auto& offset_y = in_row->lod()[0];
-    const auto& top_offset = col->lod()[0];
-    int batch = x->lod()[0].size() - 1;
-    int kernel_win_size = kernel_h * kernel_w;
-    int half_kernel_h = kernel_h / 2;
-    int half_kernel_w = kernel_w / 2;
-    for (int b = 0; b < batch; ++b) {
-      int t_offset = top_offset[b];
-      int b_offset = bottom_offset[b];
-      int width = offset_x[b + 1] - offset_x[b];
-      int height = offset_y[b + 1] - offset_y[b];
-      if (width == 0 || height == 0) {
-        continue;
-      }
-      int top_im_x = (width - 1) / stride_w + 1;
-      int top_im_y = (height - 1) / stride_h + 1;
-      int top_x = top_im_y * top_im_x;
-      for (int z = 0; z < input_channel; ++z) {
-        int row_offset = kernel_win_size * z;
-        int im_offset = z * width * height;
-        for (int y = 0; y < height; y += stride_h) {
-          for (int x = 0; x < width; x += stride_w) {
-            int col_offset = x / stride_w + y / stride_h * top_im_x;
-            for (int ky = 0; ky < kernel_h; ++ky) {
-              for (int kx = 0; kx < kernel_w; ++kx) {
-                int im_y = y + ky - half_kernel_h;
-                int im_x = x + kx - half_kernel_w;
-                if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) {
-                  dx_data[b_offset + im_offset + im_y * width + im_x] +=
-                      top_diff[t_offset +
-                               (row_offset + ky * kernel_w + kx) * top_x +
-                               col_offset];
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* w = ctx.Input<Tensor>("W");
-    auto* col = ctx.Input<LoDTensor>("Col");
-    auto* out = ctx.Input<LoDTensor>("Out");
-
-    int output_channel = ctx.Attr<int>("OutputChannel");
-    int input_channel = ctx.Attr<int>("InputChannel");
-    int kernel_h = ctx.Attr<int>("KernelH");
-    int kernel_w = ctx.Attr<int>("KernelW");
-
-    auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* d_w = ctx.Output<Tensor>(framework::GradVarName("W"));
-
-    Tensor col_grad;
-    col_grad.Resize(col->dims());
-    auto* col_diff = col_grad.mutable_data<T>(ctx.GetPlace());
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* w_diff = d_w->mutable_data<T>(ctx.GetPlace());
-
-    memset(dx_data, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T));
-    memset(w_diff, 0.0, w->dims()[0] * w->dims()[1] * sizeof(T));
-    memset(col_diff, 0.0, col->dims()[0] * col->dims()[1] * sizeof(T));
-    auto* top_diff = d_out->data<T>();
-    auto* w_data = w->data<T>();
-    auto* col_data = col->data<T>();
-    int batch = x->lod()[0].size() - 1;
-    const auto& top_offset = out->lod()[0];
-    const auto& col_offset = col->lod()[0];
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-    for (int b = 0; b < batch; ++b) {
-      int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel;
-      if (top_im_size == 0) {
-        continue;
-      }
-
-      blas.GEMM(CblasTrans, CblasNoTrans, input_channel * kernel_h * kernel_w,
-                top_im_size, output_channel, 1.0, w_data,
-                top_diff + top_offset[b], 1.0, col_diff + col_offset[b]);
-
-      blas.GEMM(CblasNoTrans, CblasTrans, output_channel,
-                input_channel * kernel_h * kernel_w, top_im_size, 1.0,
-                top_diff + top_offset[b], col_data + col_offset[b], 1.0,
-                w_diff);
-    }
-    Im2ColGrad(ctx, col_diff);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plt = paddle::platform;
-namespace frm = paddle::framework;
-REGISTER_OPERATOR(var_conv_2d, ops::VarConv2dOP, ops::VarConv2dOpMaker,
-                  frm::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(var_conv_2d_grad, ops::VarConv2dOpGrad);
-
-REGISTER_OP_CPU_KERNEL(var_conv_2d,
-                       ops::CPUVarConv2dOPKernel<plt::CPUDeviceContext, float>);
-//     ops::CPUVarConv2dOPKernel<plt::CPUDeviceContext,
-//                                       double>
-REGISTER_OP_CPU_KERNEL(
-    var_conv_2d_grad,
-    ops::CPUVarConv2dOPGradKernel<plt::CPUDeviceContext, float>);
-//     ops::CPUVarConv2dOPGradKernel<plt::CPUDeviceContext,
-//                                           double>
diff --git a/paddle/fluid/operators/var_conv_2d_op.h b/paddle/fluid/operators/var_conv_2d_op.h
deleted file mode 100644
index b8d5de060934fa7ad5157c3718ddf0cc85771870..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/var_conv_2d_op.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-class VarConv2dOP : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class VarConv2dOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class VarConv2dOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
deleted file mode 100644
index 7033d55a53b876fce34f91c05c0dc91288ae4a59..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/warpctc_op.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/warpctc_op.h"
-
-#include <memory>
-
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class WarpCTCOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Logits"),
-                   "Input(Logits) of WarpCTCOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input(Label) of WarpCTCOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("WarpCTCGrad"),
-                   "Output(WarpCTCGrad) of WarpCTCOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
-                   "Output(Loss) of WarpCTCOp should not be null.");
-
-    auto logits_dims = ctx->GetInputDim("Logits");
-    int blank = ctx->Attrs().Get<int>("blank");
-    int sequence_width = 0;
-
-    if (ctx->HasInput("LogitsLength")) {
-      sequence_width = logits_dims[2];
-    } else {
-      sequence_width =
-          static_cast<int>(framework::product(logits_dims) / logits_dims[0]);
-    }
-    PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width),
-                   "The value of Attr(blank) should be in interval [0, %d).",
-                   sequence_width);
-
-    // TODO(liuyiqun): it is tricky to set the wrong dimension here.
-    ctx->SetOutputDim("Loss", {logits_dims[0], 1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
-    return framework::OpKernelType(ctx.Input<Tensor>("Logits")->type(),
-                                   ctx.device_context(), layout_, library_);
-  }
-};
-
-class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Logits",
-             "(2-D LoDTensor<float>) or (3-D Tensor<float>), the "
-             "unscaled probabilities of variable-length sequences."
-             "When is a 2-D Tensor with LoD information, "
-             "it's shape is [Lp, num_classes + 1], "
-             "where Lp is the sum of all input sequences' length "
-             "and num_classes is the true number of classes "
-             "(not including the blank label)."
-             "When it is 3-D Tensor, it's shape is "
-             "[max_logit_length, batch_size, num_classes + 1], "
-             "where max_logit_length is the length of the longest "
-             "logit sequence.");
-    AddInput("Label",
-             "(2-D LoDTensor<int>) or (2-D Tensor<int>), the "
-             "ground truth of variable-length sequence. "
-             "When it is a 2-D Tensor with LoD information, "
-             "it is of the shape [Lg, 1], where Lg is th sum of "
-             "all labels' length."
-             "When it is a 2-D Tensor<int>, it's shape is also [Lg, 1].");
-    AddInput("LogitsLength",
-             "1-D Tensor<int64_t>. "
-             "Input sequence length for Logits when Logits is a 3-D tensor.")
-        .AsDispensable();
-    AddInput("LabelLength",
-             "1-D Tensor<int64_t>. "
-             "Target sequence length for Label when Label is a 2-D tensor.")
-        .AsDispensable();
-    AddOutput("WarpCTCGrad",
-              "(Tensor, default: Tensor<float>), a temporary "
-              "output Tensor to store the gradients of warp-ctc, which is "
-              "computed with loss together in one call. It is a 3-D Tensor of "
-              "the shape [max_sequence_length, batch_size, num_classes + 1].")
-        .AsIntermediate();
-    AddOutput("Loss",
-              "(Tensor, default: Tensor<float>), the Connectionist "
-              "Temporal Classification (CTC) loss, which is a 2-D Tensor of "
-              "the shape [batch_size, 1]");
-    AddAttr<int>("blank",
-                 "(int, default: 0), the blank label of Connectionist "
-                 "Temporal Classification (CTC) loss, which is in the "
-                 "half-opened interval [0, num_classes + 1).")
-        .SetDefault(0);
-    AddAttr<bool>("norm_by_times",
-                  "(bool, default: false), whether to "
-                  "normalize the gradients by the number of time-step, "
-                  "which is also the sequence's length.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-An operator integrating the open-source
-[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
-[Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin](
-https://arxiv.org/pdf/1512.02595v1.pdf),
-to compute Connectionist Temporal Classification (CTC) loss.
-It can be aliased as softmax with ctc, since a native softmax activation is
-interated to the warp-ctc library, to to normlize values for each row of the
-input tensor.
-
-More detail of CTC loss can be found by refering to
-[Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with
-Recurrent Neural Networks](
-http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf).
-)DOC");
-  }
-};
-
-class WarpCTCGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-
-    op->SetType("warpctc_grad");
-
-    op->SetInput("WarpCTCGrad", Output("WarpCTCGrad"));
-    op->SetInput("Logits", Input("Logits"));
-    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
-
-    op->SetInput("LogitsLength", Input("LogitsLength"));
-
-    op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
-
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class WarpCTCGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("WarpCTCGrad"),
-                   "Input(WarpCTCGrad) of WarpCTCGradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
-                   "Output(Logits@GRAD) of WarpCTCGradOp should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("Logits"),
-                      ctx->GetInputDim("Logits"));
-    ctx->ShareLoD("Logits", /*->*/ framework::GradVarName("Logits"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("Logits")->type(),
-                                   ctx.device_context());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
-                  ops::WarpCTCGradOpDescMaker);
-REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp);
-REGISTER_OP_CPU_KERNEL(
-    warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    warpctc_grad,
-    ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/warpctc_op.cu.cc b/paddle/fluid/operators/warpctc_op.cu.cc
deleted file mode 100644
index 6f8559f542f6034661b6ff346beccd9626e2370a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/warpctc_op.cu.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/warpctc_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    warpctc_grad,
-    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
deleted file mode 100644
index 1859c748d783519971ffb43cd695a9d22d09dbb6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/warpctc_op.h
+++ /dev/null
@@ -1,312 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/sequence_padding.h"
-#include "paddle/fluid/operators/math/sequence_scale.h"
-#include "paddle/fluid/platform/dynload/warpctc.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext>
-class WarpCTCFunctor {
- public:
-  /*
-   * \brief Compute the connectionist temporal classification loss,
-   *        and optionally compute the gradient with respect to the inputs.
-   *
-   * If gradient is nullptr, it only computes the ctc loss,
-   * or computes both ctc loss and gradient.
-   *
-   * \param ctx               execution context of this functor
-   * \param input             batch matrix of input probabilities, in
-   *                          max_sequence_length x num_sequences x
-   *                          sequence_width, (row-major) format
-   * \param gradient          batch matrix of gradient, with the same shape as
-   *                          input.
-   * \param cpu_labels        labels always in CPU memory.
-   * \param cpu_label_lengths length of all labels in CPU memory.
-   * \param cpu_input_lengths length of all sequences in CPU memory.
-   * \param sequence_width    number of possible output symbols.
-   * \param num_sequences     number of sequence.
-   * \param blank             blank label used in ctc loss function.
-   * \param cpu_losss         cost of each sequence in CPU memory.
-   */
-  void operator()(const framework::ExecutionContext& ctx, const float* input,
-                  float* gradient, const int* cpu_labels,
-                  const int* cpu_label_lengths, const int* cpu_input_lengths,
-                  const size_t sequence_width, const size_t num_sequences,
-                  const size_t blank, float* cpu_loss) {
-    // Init warp-ctc options
-    init(ctx, blank);
-
-    // Compute the required workspace size.
-    // There is no memory allocated operations within warp-ctc.
-    size_t workspace_bytes = 0;
-    ctcStatus_t status = platform::dynload::get_workspace_size(
-        cpu_label_lengths, cpu_input_lengths, static_cast<int>(sequence_width),
-        static_cast<int>(num_sequences), options_, &workspace_bytes);
-    PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status,
-                      "warp-ctc [version %d] Error in get_workspace_size: ",
-                      warpctc_version_,
-                      platform::dynload::ctcGetStatusString(status));
-    PADDLE_ENFORCE_GT(workspace_bytes, 0UL,
-                      "Bytes of workspace got by warp-ctc function, "
-                      "get_workspace_size(), should be larger than 0.");
-
-    Tensor workspace;
-    size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL;
-    float* workspace_data = workspace.mutable_data<float>(
-        framework::make_ddim({static_cast<int64_t>(workspace_elements)}),
-        ctx.GetPlace());
-    math::SetConstant<DeviceContext, float>()(
-        ctx.template device_context<DeviceContext>(), &workspace,
-        static_cast<float>(0));
-
-    // compute loss and gradient
-    status = platform::dynload::compute_ctc_loss(
-        input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths,
-        static_cast<int>(sequence_width), static_cast<int>(num_sequences),
-        cpu_loss, workspace_data, options_);
-    PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status,
-                      "warp-ctc [version %d] Error in compute_ctc_loss: ",
-                      warpctc_version_,
-                      platform::dynload::ctcGetStatusString(status));
-  }
-
- protected:
-  void init(const framework::ExecutionContext& ctx, const size_t blank) {
-    warpctc_version_ = platform::dynload::get_warpctc_version();
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      options_.loc = CTC_GPU;
-      options_.stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                            ctx.device_context())
-                            .stream();
-#else
-      PADDLE_THROW("[warpctc init] GPU is not enabled.");
-#endif
-    } else {
-      options_.loc = CTC_CPU;
-      options_.num_threads = 1;
-    }
-
-    options_.blank_label = blank;
-  }
-
- private:
-  int warpctc_version_;
-  ctcOptions options_;
-};
-
-template <typename DeviceContext, typename T>
-class WarpCTCKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* logits = ctx.Input<LoDTensor>("Logits");
-    auto* label = ctx.Input<LoDTensor>("Label");
-    auto* warpctc_grad = ctx.Output<Tensor>("WarpCTCGrad");
-    auto* loss = ctx.Output<Tensor>("Loss");
-
-    size_t num_sequences, sequence_width, max_sequence_length;
-    framework::Vector<size_t> logits_lod;
-    framework::Vector<size_t> label_lod;
-
-    if (ctx.HasInput("LogitsLength") && ctx.HasInput("LabelLength")) {
-      num_sequences = logits->dims()[1];
-      sequence_width = logits->dims()[2];
-      max_sequence_length = logits->dims()[0];
-
-      auto* logits_length = ctx.Input<framework::Tensor>("LogitsLength");
-      auto* labels_length = ctx.Input<framework::Tensor>("LabelLength");
-      framework::Tensor logits_length_cpu;
-      framework::Tensor labels_length_cpu;
-      framework::TensorCopy(*logits_length, platform::CPUPlace(),
-                            &logits_length_cpu);
-      framework::TensorCopy(*labels_length, platform::CPUPlace(),
-                            &labels_length_cpu);
-
-      logits_lod.push_back(0);
-      label_lod.push_back(0);
-      for (auto i = 0; i < num_sequences; i++) {
-        logits_lod.push_back(logits_lod[i] +
-                             logits_length_cpu.data<int64_t>()[i]);
-        label_lod.push_back(label_lod[i] +
-                            labels_length_cpu.data<int64_t>()[i]);
-      }
-    } else {
-      logits_lod = framework::ToAbsOffset(logits->lod())[0];
-      auto logits_dims = logits->dims();
-      PADDLE_ENFORCE_EQ(
-          logits_dims[0], static_cast<int64_t>(logits_lod.back()),
-          "The first dimension of Input(Logits) should be equal to "
-          "the sum of all sequences' lengths.");
-
-      label_lod = framework::ToAbsOffset(label->lod())[0];
-      auto label_dims = label->dims();
-      PADDLE_ENFORCE_EQ(
-          label_dims[0], label->numel(),
-          "The width of each timestep in Input(Label) should be 1.");
-
-      num_sequences = logits_lod.size() - 1;
-      PADDLE_ENFORCE_EQ(num_sequences, label_lod.size() - 1,
-                        "The number of sequences of Input(Logits) should be "
-                        "equal to that of Input(Label).");
-
-      sequence_width = logits->numel() / logits_dims[0];
-      max_sequence_length = math::MaximumSequenceLength(logits_lod);
-    }
-
-    auto loss_dims =
-        framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
-
-    // warpctc needs sequences data stored in transposed padding format
-    LoDTensor warpctc_logits;
-    auto warpctc_logits_dims =
-        framework::make_ddim({static_cast<int64_t>(max_sequence_length),
-                              static_cast<int64_t>(num_sequences),
-                              static_cast<int64_t>(sequence_width)});
-    warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
-
-    if (ctx.HasInput("LogitsLength")) {
-      TensorCopySync(*logits, ctx.GetPlace(), &warpctc_logits);
-    } else {
-      LoDTensor cpu_pad_value;
-      T* pad_value_data =
-          cpu_pad_value.mutable_data<T>({1}, platform::CPUPlace());
-      *pad_value_data = static_cast<T>(0);
-      LoDTensor pad_value;
-      if (platform::is_cpu_place(ctx.GetPlace())) {
-        pad_value = cpu_pad_value;
-      } else {
-        TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value);
-      }
-
-      math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), *logits,
-          &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */,
-          math::kLengthBatchWidth);
-    }
-    const T* warpctc_logits_data = warpctc_logits.data<T>();
-
-    std::vector<int> warpctc_label_lengths(num_sequences);
-    std::vector<int> warpctc_logits_lengths(num_sequences);
-
-    for (size_t i = 0; i < num_sequences; ++i) {
-      warpctc_label_lengths[i] = label_lod[i + 1] - label_lod[i];
-      warpctc_logits_lengths[i] = logits_lod[i + 1] - logits_lod[i];
-    }
-
-    // warpctc computes loss and gradient in one call, gradient data also stored
-    // in batch format
-    T* warpctc_grad_data =
-        warpctc_grad->mutable_data<T>(warpctc_logits.dims(), ctx.GetPlace());
-
-    math::SetConstant<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), warpctc_grad,
-        static_cast<T>(0));
-
-    // warpctc accesses labels in CPU memory
-    Tensor warpctc_label;
-    TensorCopySync(*label, platform::CPUPlace(), &warpctc_label);
-
-    const int* warpctc_label_data = warpctc_label.data<int>();
-    // warpctc stores loss in CPU memory
-    Tensor warpctc_loss;
-    T* warpctc_loss_data =
-        warpctc_loss.mutable_data<T>(loss_dims, platform::CPUPlace());
-
-    const size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
-
-    WarpCTCFunctor<DeviceContext>()(
-        ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data,
-        warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
-        sequence_width, num_sequences, blank, warpctc_loss_data);
-
-    // Copy the loss back
-    TensorCopy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class WarpCTCGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
-    auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
-    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-
-    logits_grad->mutable_data<T>(ctx.GetPlace());
-    bool norm_by_times = ctx.Attr<bool>("norm_by_times");
-
-    if (ctx.HasInput("LogitsLength")) {
-      size_t max_seq_length = warpctc_grad->dims()[0];
-      size_t num_sequences = warpctc_grad->dims()[1];
-      size_t seq_width = warpctc_grad->dims()[2];
-
-      LoDTensor logits_grad_with_lod;
-      auto logits_grad_dims =
-          framework::make_ddim({static_cast<int64_t>(max_seq_length),
-                                static_cast<int64_t>(num_sequences),
-                                static_cast<int64_t>(seq_width)});
-      T* logits_grad_cpu_data = logits_grad_with_lod.mutable_data<T>(
-          logits_grad_dims, platform::CPUPlace());
-
-      TensorCopySync(*warpctc_grad, platform::CPUPlace(),
-                     &logits_grad_with_lod);
-
-      Tensor loss_grad_cpu;
-      loss_grad_cpu.mutable_data<T>(loss_grad->dims(), platform::CPUPlace());
-      TensorCopySync(*loss_grad, platform::CPUPlace(), &loss_grad_cpu);
-
-      LoDTensor scaled_logits;
-      T* scaled_logits_data =
-          scaled_logits.mutable_data<T>(logits_grad_dims, platform::CPUPlace());
-
-      const T* loss_grad_data = loss_grad_cpu.data<T>();
-      for (size_t i = 0; i < max_seq_length; ++i) {
-        for (size_t j = 0; j < num_sequences; ++j) {
-          for (size_t k = 0; k < seq_width; ++k) {
-            size_t idx = i * (num_sequences * seq_width) + j * seq_width + k;
-            scaled_logits_data[idx] =
-                logits_grad_cpu_data[idx] * loss_grad_data[j];
-          }
-        }
-      }
-
-      TensorCopySync(scaled_logits, ctx.GetPlace(), logits_grad);
-    } else {
-      math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), *warpctc_grad,
-          logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
-
-      const T* loss_grad_data = loss_grad->data<T>();
-      math::ScaleLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), loss_grad_data,
-          logits_grad);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/where_op.cc b/paddle/fluid/operators/where_op.cc
deleted file mode 100644
index 3b53ebec0b250c7181968e37f996ec9ef5cf2a2c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/where_op.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/where_op.h"
-
-namespace paddle {
-namespace operators {
-
-class WhereOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Condition"),
-                   "Input(Condition) of WhereOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputDim("Condition").size() >= 1,
-        "Input(Condition) should have number of dimension at least 1");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(OUt) of WhereOp should not be null.");
-    ctx->SetOutputDim("Out", {-1, ctx->GetInputDim("Condition").size()});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto output_type = framework::proto::VarType::INT64;
-    return framework::OpKernelType(output_type, ctx.device_context());
-  }
-};
-
-class WhereOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Condition", "A bool tensor whose rank is at least 1");
-    AddOutput("Out", "An int64 tensor of rank 2");
-    AddComment(R"DOC(
-      Return a int64 tensor with rank 2, specifying the coordinate of true element in `Condition`.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(where, ops::WhereOp, ops::WhereOpMaker);
-REGISTER_OP_CPU_KERNEL(where, ops::CPUWhereKernel<int64_t>);
diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu
deleted file mode 100644
index 27682f869c73c760bf475489a8bdd57e39cfaea5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/where_op.cu
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/device_vector.h>
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/where_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-using CUDADeviceContext = paddle::platform::CUDADeviceContext;
-
-template <typename T>
-class CUDAWhereKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* condition = context.Input<framework::Tensor>("Condition");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    // TODO(zhoukunsheng): Should optimize to ensure GPU is faster than CPU.
-    framework::Tensor cond_cpu;
-    framework::TensorCopy(*condition, platform::CPUPlace(), &cond_cpu);
-
-    const bool* cond_data = cond_cpu.data<bool>();
-    int64_t numel = cond_cpu.numel();
-    auto dims = cond_cpu.dims();
-    int rank = dims.size();
-
-    thrust::host_vector<int> h_true_index;
-    for (int64_t i = 0; i < numel; i++) {
-      if (cond_data[i]) {
-        h_true_index.push_back(i);
-      }
-    }
-    thrust::device_vector<int> d_true_index = h_true_index;
-    int* ptr_true_index = thrust::raw_pointer_cast(d_true_index.data());
-
-    size_t true_num = h_true_index.size();
-
-    out->Resize(framework::make_ddim({static_cast<int64_t>(true_num), rank}));
-    auto out_ptr = out->mutable_data<T>(context.GetPlace());
-
-    if (true_num == 0) {
-      return;
-    }
-
-    thrust::host_vector<int> h_stride(rank, 0);
-    h_stride[rank - 1] = 1;
-    for (int i = rank - 2; i >= 0; i--) {
-      h_stride[i] = h_stride[i + 1] * dims[i + 1];
-    }
-    thrust::device_vector<int> d_stride = h_stride;
-    int* ptr_stride = thrust::raw_pointer_cast(d_stride.data());
-
-    auto& dev_ctx = context.template device_context<CUDADeviceContext>();
-    WhereFunctor<int*> functor(ptr_true_index, true_num, ptr_stride, rank,
-                               out_ptr);
-    platform::ForRange<CUDADeviceContext> for_range(dev_ctx, true_num);
-    for_range(functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(where, ops::CUDAWhereKernel<int64_t>);
diff --git a/paddle/fluid/operators/where_op.h b/paddle/fluid/operators/where_op.h
deleted file mode 100644
index 6a161a2668fa02f181ef99bfbfb501541988a333..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/where_op.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <functional>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct WhereFunctor {
-  WhereFunctor(const T& true_index, int true_num, const T& stride, int rank,
-               int64_t* out)
-      : true_index_(true_index),
-        true_num_(true_num),
-        stride_(stride),
-        rank_(rank),
-        out_ptr_(out) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    int index = true_index_[idx];
-    for (int j = 0; j < rank_; j++) {
-      out_ptr_[idx * rank_ + j] = index / stride_[j];
-      index -= out_ptr_[idx * rank_ + j] * stride_[j];
-    }
-  }
-
-  const T true_index_;
-  int true_num_;
-  const T stride_;
-  int rank_;
-  int64_t* out_ptr_;
-};
-
-using CPUDeviceContext = paddle::platform::CPUDeviceContext;
-
-template <typename T>
-class CPUWhereKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* condition = context.Input<framework::Tensor>("Condition");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    const bool* cond_data = condition->data<bool>();
-    auto numel = condition->numel();
-    auto dims = condition->dims();
-    const int rank = dims.size();
-
-    std::vector<int> true_index;
-    for (auto i = 0; i < numel; i++) {
-      if (cond_data[i]) {
-        true_index.push_back(i);
-      }
-    }
-    auto true_num = true_index.size();
-
-    out->Resize(framework::make_ddim({static_cast<int64_t>(true_num), rank}));
-    auto out_ptr = out->mutable_data<T>(context.GetPlace());
-
-    if (true_num == 0) {
-      return;
-    }
-
-    std::vector<int> stride(rank);
-    stride[rank - 1] = 1;
-    for (int i = rank - 2; i >= 0; i--) {
-      stride[i] = stride[i + 1] * dims[i + 1];
-    }
-
-    auto& dev_ctx = context.template device_context<CPUDeviceContext>();
-    WhereFunctor<int*> functor(true_index.data(), true_num, stride.data(), rank,
-                               out_ptr);
-    platform::ForRange<CPUDeviceContext> for_range(dev_ctx, true_num);
-    for_range(functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
deleted file mode 100644
index a84f521f589ab680513852fdb83e593ba4946fe5..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/CMakeLists.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
-py_proto_compile(profiler_py_proto SRCS profiler.proto)
-
-add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-
-add_dependencies(profiler_py_proto profiler_py_proto_init)
-
-if (NOT WIN32)
-add_custom_command(TARGET profiler_py_proto POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
-        COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
-        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-else(NOT WIN32)
-string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
-add_custom_command(TARGET profiler_py_proto POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
-        COMMAND copy /Y *.py ${proto_dstpath}
-        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif(NOT WIN32)
-
-cc_library(flags SRCS flags.cc DEPS gflags) 
-
-if(WITH_GPU)
-  nv_library(enforce SRCS enforce.cc DEPS flags)
-else()
-  cc_library(enforce SRCS enforce.cc DEPS flags)
-endif()
-cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
-
-set(CPU_INFO_DEPS gflags glog enforce)
-IF(WITH_XBYAK)
-    list(APPEND CPU_INFO_DEPS xbyak)
-ENDIF()
-cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
-cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
-
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
-
-cc_library(place SRCS place.cc DEPS enforce boost)
-cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
-
-add_subdirectory(dynload)
-
-cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
-cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
-
-set(dgc_deps "")
-IF(WITH_DGC)
-    set(dgc_deps dgc)
-ENDIF()
-
-IF(WITH_GPU)
-    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
-ENDIF()
-
-IF(WITH_MKLDNN)
-    set(MKLDNN_CTX_DEPS mkldnn)
-ELSE()
-    set(MKLDNN_CTX_DEPS)
-ENDIF()
-
-nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) 
-IF(WITH_GPU)
-  set(STREAM_CALLBACK_DEPS stream_callback_manager)
-ELSE()
-  set(STREAM_CALLBACK_DEPS)
-ENDIF()
-
-# memcpy depends on device_context, here add deps individually for
-# avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
-    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
-    ${dgc_deps})
-
-if (WITH_DISTRIBUTE)
-  cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto  device_context enforce)
-endif()
-
-if(WIN32)
-    if(WITH_GPU AND NOT WITH_DSO)
-        get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
-        target_link_libraries(device_context ${cuda_modules})
-    endif(WITH_GPU AND NOT WITH_DSO)
-endif(WIN32)
-
-nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
-
-cc_test(init_test SRCS init_test.cc DEPS device_context)
-
-nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
-
-cc_library(timer SRCS timer.cc)
-cc_test(timer_test SRCS timer_test.cc DEPS timer)
-
-cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto)
-cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer)
-
-cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
-if(WITH_GPU)
-  nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
-  nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
-  nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
-else()
-  cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
-  cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
-endif()
-cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-
-nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
-cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
-
-nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
-
-if(NOT APPLE AND NOT WIN32)
-  cc_library(device_code SRCS device_code.cc DEPS device_context)
-  if(WITH_GPU)
-    cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
-  endif()
-endif()
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
deleted file mode 100644
index 2025e5346f66565e9dd9fccc5a4f3051fb8467b2..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/collective_helper.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-
-#include <memory>
-#include <utility>
-
-#include "paddle/fluid/platform/dynload/nccl.h"
-
-namespace paddle {
-namespace platform {
-
-class NCCLCommImpl : public NCCLComm {
- public:
-  void set_ring_id(int ring_id) { ring_id_ = ring_id; }
-  int ring_id() const override { return ring_id_; }
-
-  void set_nranks(int nranks) { nranks_ = nranks; }
-  int nranks() const override { return nranks_; }
-
-  void set_rank(int rank) { rank_ = rank; }
-  int rank() const override { return rank_; }
-
-  int device_id() const override {
-    return boost::get<CUDAPlace>(dev_ctx_->GetPlace()).device;
-  }
-
-  ncclComm_t comm() const override { return dev_ctx_->nccl_comm(); }
-
-  cudaStream_t stream() const override { return dev_ctx_->stream(); }
-
-  void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
-    dev_ctx_ = std::move(dev_ctx);
-  }
-
- private:
-  int ring_id_;
-  int nranks_;
-  int rank_;
-  std::unique_ptr<CUDADeviceContext> dev_ctx_;
-};
-
-NCCLComm* NCCLCommContext::CreateNCCLComm(ncclUniqueId* nccl_id, int nranks,
-                                          int rank, int dev_id, int ring_id) {
-  PADDLE_ENFORCE_NOT_NULL(nccl_id);
-  PADDLE_ENFORCE_GT(nranks, 1);
-  PADDLE_ENFORCE_GE(rank, 0);
-  PADDLE_ENFORCE_LT(rank, nranks);
-  PADDLE_ENFORCE_GE(dev_id, 0);
-
-  ncclComm_t comm = nullptr;
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank));
-
-  std::unique_ptr<CUDADeviceContext> dev_ctx(
-      new CUDADeviceContext(CUDAPlace(dev_id)));
-  dev_ctx->set_nccl_comm(comm);
-
-  NCCLCommImpl* c = new NCCLCommImpl;
-  c->set_ring_id(ring_id);
-  c->set_nranks(nranks);
-  c->set_rank(rank);
-  c->set_dev_ctx(std::move(dev_ctx));
-
-  comm_map_mutex_.lock();
-  if (comm_map_.count(ring_id) == 0) {
-    comm_map_.emplace(ring_id, std::map<int, std::unique_ptr<NCCLComm>>());
-  }
-  auto& dev2comm = comm_map_[ring_id];
-
-  dev2comm.emplace(dev_id, std::unique_ptr<NCCLComm>(c));
-  comm_map_mutex_.unlock();
-
-  VLOG(1) << "nccl communicator of rank " << rank << " in ring " << ring_id
-          << " has been created";
-
-  std::call_once(once_flag_, []() {
-    std::atexit([]() { NCCLCommContext::Instance().ReleaseNCCLComms(); });
-  });
-
-  return comm_map_[ring_id][dev_id].get();
-}
-
-void NCCLCommContext::CreateAllNCCLComms(const std::vector<int>& dev_ids,
-                                         int ring_id) {
-  PADDLE_ENFORCE_GT(dev_ids.size(), 0);
-
-  const int kDevices = dev_ids.size();
-  ncclComm_t comms[kDevices];
-  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
-      comms, dev_ids.size(), dev_ids.data()));
-
-  PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0);
-  comm_map_.emplace(ring_id, std::map<int, std::unique_ptr<NCCLComm>>());
-
-  auto& dev2comm = comm_map_[ring_id];
-  for (size_t i = 0; i < dev_ids.size(); ++i) {
-    std::unique_ptr<CUDADeviceContext> dev_ctx(
-        new CUDADeviceContext(CUDAPlace(dev_ids[i])));
-    dev_ctx->set_nccl_comm(comms[i]);
-
-    NCCLCommImpl* c = new NCCLCommImpl;
-    c->set_ring_id(ring_id);
-    c->set_nranks(dev_ids.size());
-    c->set_rank(i);
-    c->set_dev_ctx(std::move(dev_ctx));
-
-    dev2comm.emplace(dev_ids[i], std::unique_ptr<NCCLComm>(c));
-  }
-
-  std::call_once(once_flag_, []() {
-    std::atexit([]() { NCCLCommContext::Instance().ReleaseNCCLComms(); });
-  });
-}
-
-void NCCLCommContext::ReleaseNCCLComms() {
-  // CUDADeviceContext maintain the lifetime of nccl_comm_t, so we should not
-  // destroy nccl_comm_t explicitly. Please refer to
-  // platform::CUDADeviceContext::~CUDADeviceContext()
-  for (auto& p : comm_map_) {
-    for (auto& q : p.second) {
-      q.second.reset();
-    }
-  }
-}
-
-}  // namespace platform
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h
deleted file mode 100644
index 747e840037ee96eba0abc8e9355c6e2a31a57338..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/collective_helper.h
+++ /dev/null
@@ -1,117 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "boost/variant.hpp"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {
-
-// In order to apply hierarchical communication with NCCL, we need
-// a communication ring contains NCCL communicators associated to a global
-// ncclUniqueId. E.g. for a hierarchical case,
-//
-//    11 - 12   21 - 22
-//     |    |    |    |
-//    13 - 14 - 23 - 24
-//          |    |
-//    31 - 32 - 41 - 42
-//     |    |    |    |
-//    33 - 34   43 - 44
-//
-// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
-// (31,32,33,34), (41,42,43,44) as bottoms respectively.
-//
-// We could also use a single communication ring for the flatten case
-//
-// The NCCLComm instance is created and reversed in the NCCLCommContext
-// singleton with a global user specified group id.
-class NCCLComm {
- public:
-  virtual int ring_id() const = 0;
-  virtual int nranks() const = 0;
-  virtual int rank() const = 0;
-  virtual int device_id() const = 0;
-  virtual ncclComm_t comm() const = 0;
-  virtual cudaStream_t stream() const = 0;
-  virtual ~NCCLComm() = default;
-};
-
-// A singleton NCCL communicator context reserves communication ring ids
-class NCCLCommContext {
- public:
-  static NCCLCommContext& Instance() {
-    static NCCLCommContext comm_ctx;
-    return comm_ctx;
-  }
-
-  NCCLComm* CreateNCCLComm(ncclUniqueId* nccl_id, int nranks, int rank,
-                           int dev_id, int ring_id = 0);
-
-  void CreateAllNCCLComms(const std::vector<int>& dev_ids, int ring_id = 0);
-
-  // retrieve a communicator by the ring id in multiprocessing mode
-  NCCLComm* Get(int ring_id) const {
-    PADDLE_ENFORCE_GT(comm_map_.count(ring_id), 0,
-                      "comunicator in ring id %d has not been initialized",
-                      ring_id);
-    PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1,
-                      "you should specify a device id to retrieve from "
-                      "multiple communicators");
-    return comm_map_.at(ring_id).begin()->second.get();
-  }
-
-  // retrieve a communicator by the ring id and the device id
-  NCCLComm* Get(int ring_id, int dev_id) const {
-    PADDLE_ENFORCE_GT(comm_map_.count(ring_id), 0,
-                      "comunicator of ring id %d has not been initialized",
-                      ring_id);
-    PADDLE_ENFORCE_GT(
-        comm_map_.at(ring_id).count(dev_id), 0,
-        "comunicator at device id %d has not been initialized in ring %d",
-        dev_id, ring_id);
-    return comm_map_.at(ring_id).at(dev_id).get();
-  }
-
-  // retrieve a communicator by the ring id and place
-  NCCLComm* Get(int ring_id, Place place) const {
-    return Get(ring_id, boost::get<CUDAPlace>(place).device);
-  }
-
- private:
-  std::once_flag once_flag_;
-  std::mutex comm_map_mutex_;
-  // ring id to dev-NCCLComm
-  std::map<int, std::map<int, std::unique_ptr<NCCLComm>>> comm_map_;
-
-  void ReleaseNCCLComms();
-
-  NCCLCommContext() = default;
-  DISABLE_COPY_AND_ASSIGN(NCCLCommContext);
-};
-
-}  // namespace platform
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
deleted file mode 100644
index b737a6c38d0441cbfcd55ed2c55969ceca68db5d..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cpu_helper.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#ifdef PADDLE_WITH_MKLML
-#include <omp.h>
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-
-#ifdef PADDLE_USE_OPENBLAS
-#include <cblas.h>
-#endif
-
-namespace paddle {
-namespace platform {
-
-void SetNumThreads(int num_threads) {
-#ifdef PADDLE_USE_OPENBLAS
-// windows has no support for openblas multi-thread
-// please refer to: https://github.com/PaddlePaddle/Paddle/issues/7234
-#ifdef _WIN32
-  if (num_threads > 1) {
-    num_threads = 1;
-  }
-#endif
-  int real_num_threads = num_threads > 1 ? num_threads : 1;
-  openblas_set_num_threads(real_num_threads);
-#elif defined(PADDLE_WITH_MKLML)
-  int real_num_threads = num_threads > 1 ? num_threads : 1;
-  platform::dynload::MKL_Set_Num_Threads(real_num_threads);
-  omp_set_num_threads(real_num_threads);
-#else
-  PADDLE_ENFORCE(false, "To be implemented.");
-#endif
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_helper.h b/paddle/fluid/platform/cpu_helper.h
deleted file mode 100644
index 78fc392b632ef92d4ae08de2051041fc0bf6778b..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cpu_helper.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>
-
-namespace paddle {
-namespace platform {
-
-//! Set the number of threads in use.
-void SetNumThreads(int num_threads);
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_helper_test.cc b/paddle/fluid/platform/cpu_helper_test.cc
deleted file mode 100644
index dc1b2b56cd98ca6259c46a76231dbc99482970c1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cpu_helper_test.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/cpu_helper.h"
-
-#include "gtest/gtest.h"
-
-TEST(CpuHelper, SetNumThread) {
-  paddle::platform::SetNumThreads(1);
-  paddle::platform::SetNumThreads(4);
-}
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
deleted file mode 100644
index b7ed66bd36369b0b31df3afbbd18e49fba8e23e1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cpu_info.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/cpu_info.h"
-
-#ifdef PADDLE_WITH_XBYAK
-#include "xbyak/xbyak.h"
-#include "xbyak/xbyak_util.h"
-#endif
-
-#ifdef __APPLE__
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#elif defined(_WIN32)
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
-#include <windows.h>
-#else
-#include <unistd.h>
-#endif  // _WIN32
-
-#include <algorithm>
-#include "gflags/gflags.h"
-
-DECLARE_double(fraction_of_cpu_memory_to_use);
-DECLARE_uint64(initial_cpu_memory_in_mb);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-
-// If use_pinned_memory is true, CPUAllocator calls mlock, which
-// returns pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the amount
-// of memory available to the system for paging.  So, by default, we
-// should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
-
-namespace paddle {
-namespace platform {
-
-size_t CpuTotalPhysicalMemory() {
-#ifdef __APPLE__
-  int mib[2];
-  mib[0] = CTL_HW;
-  mib[1] = HW_MEMSIZE;
-  int64_t size = 0;
-  size_t len = sizeof(size);
-  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
-  return 0L;
-#elif defined(_WIN32)
-  MEMORYSTATUSEX sMeminfo;
-  sMeminfo.dwLength = sizeof(sMeminfo);
-  GlobalMemoryStatusEx(&sMeminfo);
-  return sMeminfo.ullTotalPhys;
-#else
-  int64_t pages = sysconf(_SC_PHYS_PAGES);
-  int64_t page_size = sysconf(_SC_PAGE_SIZE);
-  return pages * page_size;
-#endif
-}
-
-size_t CpuMaxAllocSize() {
-  // For distributed systems, it requires configuring and limiting
-  // the fraction of memory to use.
-  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
-}
-
-size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  return 1 << 12;
-}
-
-size_t CpuMaxChunkSize() {
-  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
-  // or the initial_cpu_memory_in_mb.
-  return std::min(
-      static_cast<size_t>(CpuMaxAllocSize() / 32),
-      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
-}
-
-size_t CUDAPinnedMaxAllocSize() {
-  // For distributed systems, it requires configuring and limiting
-  // the fraction of memory to use.
-  return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
-}
-
-size_t CUDAPinnedMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 64 KB.
-  return 1 << 16;
-}
-
-size_t CUDAPinnedMaxChunkSize() {
-  // Allow to allocate the maximum chunk size is roughly 1/256 of CUDA_PINNED
-  // memory.
-  return CUDAPinnedMaxAllocSize() / 256;
-}
-
-#ifdef PADDLE_WITH_XBYAK
-static Xbyak::util::Cpu cpu;
-bool MayIUse(const cpu_isa_t cpu_isa) {
-  using namespace Xbyak::util;  // NOLINT
-  switch (cpu_isa) {
-    case sse42:
-      return cpu.has(Cpu::tSSE42);
-    case avx:
-      return cpu.has(Cpu::tAVX);
-    case avx2:
-      return cpu.has(Cpu::tAVX2);
-    case avx512f:
-      return cpu.has(Cpu::tAVX512F);
-    case avx512_core:
-      return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) &&
-             cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ);
-    case avx512_core_vnni:
-      return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) &&
-             cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ) &&
-             cpu.has(Cpu::tAVX512_VNNI);
-    case avx512_mic:
-      return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512CD) &&
-             cpu.has(Cpu::tAVX512ER) && cpu.has(Cpu::tAVX512PF);
-    case avx512_mic_4ops:
-      return true && MayIUse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) &&
-             cpu.has(Cpu::tAVX512_4VNNIW);
-    case isa_any:
-      return true;
-  }
-  return false;
-}
-#else
-bool MayIUse(const cpu_isa_t cpu_isa) {
-  if (cpu_isa == isa_any) {
-    return true;
-  } else {
-    return false;
-  }
-}
-#endif
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
deleted file mode 100644
index c70e3be858fe72f298a5e553bcca189641392cdc..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cpu_info.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>
-
-#ifdef _WIN32
-#if defined(__AVX2__)
-#include <immintrin.h>  //avx2
-#elif defined(__AVX__)
-#include <intrin.h>  //avx
-#endif               // AVX
-#else                // WIN32
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-#endif  // WIN32
-
-#if defined(_WIN32)
-#define ALIGN32_BEG __declspec(align(32))
-#define ALIGN32_END
-#else
-#define ALIGN32_BEG
-#define ALIGN32_END __attribute__((aligned(32)))
-#endif  // _WIN32
-
-namespace paddle {
-namespace platform {
-
-size_t CpuTotalPhysicalMemory();
-
-//! Get the maximum allocation size for a machine.
-size_t CpuMaxAllocSize();
-
-//! Get the maximum allocation size for a machine.
-size_t CUDAPinnedMaxAllocSize();
-
-//! Get the minimum chunk size for buddy allocator.
-size_t CpuMinChunkSize();
-
-//! Get the maximum chunk size for buddy allocator.
-size_t CpuMaxChunkSize();
-
-//! Get the minimum chunk size for buddy allocator.
-size_t CUDAPinnedMinChunkSize();
-
-//! Get the maximum chunk size for buddy allocator.
-size_t CUDAPinnedMaxChunkSize();
-
-typedef enum {
-  isa_any,
-  sse42,
-  avx,
-  avx2,
-  avx512f,
-  avx512_core,
-  avx512_core_vnni,
-  avx512_mic,
-  avx512_mic_4ops,
-} cpu_isa_t;  // Instruction set architecture
-
-// May I use some instruction
-bool MayIUse(const cpu_isa_t cpu_isa);
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_info_test.cc b/paddle/fluid/platform/cpu_info_test.cc
deleted file mode 100644
index aac882e846309f23f49f68aba805da0857c7fb2d..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cpu_info_test.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/platform/cpu_info.h"
-
-#include <ostream>
-#include <sstream>
-
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/string/printf.h"
-
-DECLARE_double(fraction_of_cpu_memory_to_use);
-
-TEST(CpuMemoryUsage, Print) {
-  std::stringstream ss;
-  size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024;
-  float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;
-
-  std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
-                                       use_percent, memory_size)
-            << std::endl;
-}
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
deleted file mode 100644
index 202613244deb02c05c39ed18abaa18d79078db33..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cuda_device_function.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cuda.h>
-// NOTE(): support float16 to half in header file.
-#define PADDLE_CUDA_FP16
-#include <cuda_fp16.h>
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace platform {
-
-#if CUDA_VERSION < 9000
-#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
-#else
-#define FULL_WARP_MASK 0xFFFFFFFF
-#define CREATE_SHFL_MASK(mask, predicate) \
-  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-#endif
-
-inline static int RoundToPowerOfTwo(int dim) {
-  if (dim > 512) {
-    return 1024;
-  } else if (dim > 256) {
-    return 512;
-  } else if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
-  }
-}
-
-#define CUDA_LAUNCH_KERNEL_BASE(dim, ...)  \
-  case (dim): {                            \
-    constexpr auto kPowerOfTwoDim = (dim); \
-    __VA_ARGS__;                           \
-  } break
-
-#define CUDA_LAUNCH_KERNEL_HELPER(...)          \
-  CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \
-  CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__);  \
-  CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__);  \
-  CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__);  \
-  CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__);   \
-  CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__);
-
-template <typename T>
-__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
-                                                 int delta,
-                                                 int width = warpSize) {
-#if CUDA_VERSION < 9000
-  return __shfl_down(val, delta, width);
-#else
-  return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
-#endif
-}
-
-template <typename T>
-__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
-                                                int width = warpSize) {
-#if CUDA_VERSION < 9000
-  return __shfl_xor(val, width);
-#else
-  return __shfl_xor_sync(mask, val, width);
-#endif
-}
-
-// CUDA 9.0 have native compatible float16 shfl_down
-#if CUDA_VERSION < 9000
-template <>
-__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
-                                                       float16 val, int delta,
-                                                       int width) {
-  return float16(
-      __shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
-}
-template <>
-__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
-                                                      float16 val, int width) {
-  return float16(__shfl_xor(static_cast<half>(val), width));
-}
-#else
-template <>
-__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
-                                                       float16 val, int delta,
-                                                       int width) {
-  return float16(__shfl_down_sync(mask, static_cast<half>(val),
-                                  static_cast<unsigned>(delta), width));
-}
-template <>
-__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
-                                                      float16 val, int width) {
-  return float16(__shfl_xor_sync(mask, static_cast<half>(val), width));
-}
-#endif
-
-template <typename T>
-__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
-                                             int width = 32) {
-#if CUDA_VERSION < 9000
-  return __shfl(val, src_line, width);
-#else
-  return __shfl_sync(mask, val, src_line, width);
-#endif
-}
-
-template <typename T>
-HOSTDEVICE T Infinity() {
-  return INFINITY;
-}
-
-template <typename T>
-__device__ T reduceSum(T val, int tid, int len) {
-  // NOTE(zcd): The warp size should be taken from the
-  // parameters of the GPU but not specified as 32 simply.
-  // To make the reduceSum more efficiently,
-  // I use Warp-Level Parallelism and assume the Warp size
-  // is 32 which may be different for different GPU,
-  // but most card's warp size is 32.
-  const int warpSize = 32;
-  __shared__ T shm[warpSize];
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, tid < len);
-
-  for (int offset = warpSize / 2; offset > 0; offset /= 2)
-    val += platform::CudaShuffleDownSync(mask, val, offset);
-
-  if (tid < warpSize) shm[tid] = 0;
-  __syncthreads();
-
-  if (tid % warpSize == 0) {
-    shm[tid / warpSize] = val;
-  }
-  __syncthreads();
-
-  CREATE_SHFL_MASK(mask, tid < warpSize);
-
-  if (tid < warpSize) {
-    val = shm[tid];
-    for (int offset = warpSize / 2; offset > 0; offset /= 2)
-      val += platform::CudaShuffleDownSync(mask, val, offset);
-  }
-  return val;
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_device_guard.cc b/paddle/fluid/platform/cuda_device_guard.cc
deleted file mode 100644
index 8582ec9f604f96b244a0f2d650aa8d669d6fc66c..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cuda_device_guard.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/platform/cuda_device_guard.h"
-
-namespace paddle {
-namespace platform {
-// Even this source file does not contains any code, it is better to keep this
-// source file for cmake dependency.
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_device_guard.h b/paddle/fluid/platform/cuda_device_guard.h
deleted file mode 100644
index a85ebf4b8136630712d39d98e2341ee919cf6e45..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cuda_device_guard.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace platform {
-
-class CUDADeviceGuard {
- public:
-  explicit inline CUDADeviceGuard(int dev_id) {
-    int prev_id = platform::GetCurrentDeviceId();
-    if (prev_id != dev_id) {
-      prev_id_ = prev_id;
-      platform::SetDeviceId(dev_id);
-    }
-  }
-
-  inline ~CUDADeviceGuard() {
-    if (prev_id_ != -1) {
-      platform::SetDeviceId(prev_id_);
-    }
-  }
-
-  CUDADeviceGuard(const CUDADeviceGuard& o) = delete;
-  CUDADeviceGuard& operator=(const CUDADeviceGuard& o) = delete;
-
- private:
-  int prev_id_{-1};
-};
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
deleted file mode 100644
index c21924ae483997e80d06d041afe2a8ff9d6e19c6..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cuda_helper.h
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <mutex>  // NOLINT
-
-#include "paddle/fluid/platform/dynload/cublas.h"
-#include "paddle/fluid/platform/macros.h"
-
-#if CUDA_VERSION < 9000
-enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
-#endif
-
-namespace paddle {
-namespace platform {
-
-class CublasHandleHolder {
- public:
-  CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
-#if CUDA_VERSION >= 9000
-    if (math_type == CUBLAS_TENSOR_OP_MATH) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
-    }
-#endif
-  }
-
-  ~CublasHandleHolder() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
-  }
-
-  template <typename Callback>
-  inline void Call(Callback &&callback) const {
-    std::lock_guard<std::mutex> guard(mtx_);
-    callback(handle_);
-  }
-
- private:
-  DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);
-
-  cublasHandle_t handle_;
-  mutable std::mutex mtx_;
-};
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
deleted file mode 100644
index 9e3025bf30b8849472e33a71228eb16814157b21..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <iostream>
-#ifdef _WIN32
-#include <numeric>
-#endif
-#include <random>
-
-#define PADDLE_CUDA_FP16
-#include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
-
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
-using paddle::platform::float16;
-
-template <typename T>
-__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
-  }
-}
-
-template <typename T>
-struct AddFunctor {
-  T operator()(const T& a, const T& b) { return a + b; }
-};
-
-template <typename T>
-void TestCase(size_t num) {
-  T *in1, *in2, *out;
-  T *d_in1, *d_in2;
-  size_t size = sizeof(T) * num;
-  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
-  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
-  in1 = reinterpret_cast<T*>(malloc(size));
-  in2 = reinterpret_cast<T*>(malloc(size));
-  out = reinterpret_cast<T*>(malloc(size));
-  std::minstd_rand engine;
-  std::uniform_real_distribution<double> dist(0.0, 1.0);
-  for (size_t i = 0; i < num; ++i) {
-    in1[i] = static_cast<T>(dist(engine));
-    in2[i] = static_cast<T>(dist(engine));
-  }
-  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
-  AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
-  cudaDeviceSynchronize();
-  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
-  for (size_t i = 0; i < num; ++i) {
-    // NOTE(dzhwinter): the float16 add has small underflow/overflow
-    // so we use EXPECT_NEAR to check the result.
-    EXPECT_NEAR(static_cast<float>(out[i]),
-                static_cast<float>(AddFunctor<T>()(in1[i], in2[i])), 0.001);
-  }
-  free(in1);
-  free(in2);
-  free(out);
-  cudaFree(d_in1);
-  cudaFree(d_in2);
-}
-
-// cuda primitives
-TEST(CudaAtomic, Add) {
-  TestCase<float>(static_cast<size_t>(10));
-  TestCase<float>(static_cast<size_t>(1024 * 1024));
-
-  TestCase<double>(static_cast<size_t>(10));
-  TestCase<double>(static_cast<size_t>(1024 * 1024));
-}
-
-TEST(CudaAtomic, float16) {
-  TestCase<float16>(static_cast<size_t>(1));
-  TestCase<float16>(static_cast<size_t>(2));
-  TestCase<float16>(static_cast<size_t>(3));
-
-  TestCase<float16>(static_cast<size_t>(10));
-  TestCase<float16>(static_cast<size_t>(1024 * 1024));
-}
-
-// unalignment of uint8
-void TestUnalign(size_t num, const int shift_bit) {
-  ASSERT_EQ(num % 2, 0);
-  float16 *in1, *in2, *out;
-  float16 *d_in1, *d_in2;
-  size_t size = sizeof(uint8_t) * (num + shift_bit);
-  size_t array_size = sizeof(float16) * (num / 2);
-
-  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
-  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
-  in1 = reinterpret_cast<float16*>(malloc(size));
-  in2 = reinterpret_cast<float16*>(malloc(size));
-  out = reinterpret_cast<float16*>(malloc(size));
-
-  // right shift 1, mimic the unalignment of address
-  float16* r_in1 =
-      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in1) + shift_bit);
-  float16* r_in2 =
-      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in2) + shift_bit);
-
-  std::minstd_rand engine;
-  std::uniform_real_distribution<double> dist(0.0, 1.0);
-  for (size_t i = 0; i < num / 2; ++i) {
-    r_in1[i] = static_cast<float16>(dist(engine));
-    r_in2[i] = static_cast<float16>(dist(engine));
-  }
-  cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
-  AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
-  cudaDeviceSynchronize();
-  cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
-  for (size_t i = 0; i < num / 2; ++i) {
-    // NOTE(dzhwinter): the float16 add has small truncate error.
-    // so we use EXPECT_NEAR to check the result.
-    EXPECT_NEAR(static_cast<float>(out[i]),
-                static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
-                0.001);
-  }
-  free(in1);
-  free(in2);
-  free(out);
-  cudaFree(d_in1);
-  cudaFree(d_in2);
-}
-
-TEST(CudaAtomic, float16Unalign) {
-  // same with float16 testcase
-  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 2);
-  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 2);
-  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 2);
-
-  // shift the address.
-  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 1);
-  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 1);
-  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 1);
-
-  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 3);
-  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
-  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
-}
-
-// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
-template <typename T>
-static __forceinline__ __device__ T WarpReduceSum(T val) {
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, true);
-  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-    val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
-  }
-  return val;
-}
-
-template <typename T>
-__forceinline__ __device__ T BlockReduce(T val) {
-  static __shared__ T shared[32];  // Shared mem for 32 partial sums
-  int lane = threadIdx.x % warpSize;
-  int wid = threadIdx.x / warpSize;
-
-  val = WarpReduceSum(val);  // Each warp performs partial reduction
-
-  if (lane == 0) shared[wid] = val;  // Write reduced value to shared memory
-
-  __syncthreads();  // Wait for all partial reductions
-
-  // read from shared memory only if that warp existed
-  val =
-      (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast<T>(0);
-
-  if (wid == 0) val = WarpReduceSum(val);  // Final reduce within first warp
-
-  return val;
-}
-
-template <typename T>
-__global__ void DeviceReduceSum(T* in, T* out, size_t N) {
-  T sum(0);
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    sum += in[i];
-  }
-  sum = BlockReduce<T>(sum);
-  __syncthreads();
-  if (threadIdx.x == 0) out[blockIdx.x] = sum;
-}
-
-template <typename T>
-void TestReduce(size_t num, float atol = 0.01) {
-  T* in1;
-  T *d_in1, *d_in2;
-  size_t size = sizeof(T) * num;
-  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
-  cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
-  in1 = reinterpret_cast<T*>(malloc(size));
-  std::minstd_rand engine;
-  std::uniform_real_distribution<double> dist(0.0, 1.0);
-  for (size_t i = 0; i < num; ++i) {
-    in1[i] = static_cast<T>(dist(engine));
-  }
-  auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
-  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
-  cudaDeviceSynchronize();
-  DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
-  cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
-  // NOTE(dzhwinter): the float16 add has small underflow/overflow
-  // so we use EXPECT_NEAR to check the result.
-  EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
-  free(in1);
-  cudaFree(d_in1);
-  cudaFree(d_in2);
-}
-
-TEST(CudaShuffleSync, float16) {
-  TestReduce<float>(10);
-  TestReduce<float>(1000);
-
-  // float16 will overflow or accumulate truncate errors in big size.
-  TestReduce<float16>(10);
-  TestReduce<float16>(100, /*atol error*/ 1.0);
-}
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
deleted file mode 100644
index 67ea64833d3b844d88a2e5996f860ef165bd8ffd..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cuda_primitives.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cuda.h>
-#include <stdio.h>
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace platform {
-
-#define CUDA_ATOMIC_WRAPPER(op, T) \
-  __device__ __forceinline__ T CudaAtomic##op(T *address, const T val)
-
-#define USE_CUDA_ATOMIC(op, T) \
-  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
-
-// Default thread count per block(or block size).
-// TODO(typhoonzero): need to benchmark against setting this value
-//                    to 1024.
-constexpr int PADDLE_CUDA_NUM_THREADS = 512;
-
-// For atomicAdd.
-USE_CUDA_ATOMIC(Add, float);
-USE_CUDA_ATOMIC(Add, int);
-USE_CUDA_ATOMIC(Add, unsigned int);
-// CUDA API uses unsigned long long int, we cannot use uint64_t here.
-// It because unsigned long long int is not necessarily uint64_t
-USE_CUDA_ATOMIC(Add, unsigned long long int);  // NOLINT
-
-CUDA_ATOMIC_WRAPPER(Add, int64_t) {
-  // Here, we check long long int must be int64_t.
-  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
-                "long long should be int64");
-  return CudaAtomicAdd(
-      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
-      static_cast<unsigned long long int>(val));            // NOLINT
-}
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
-USE_CUDA_ATOMIC(Add, double);
-#else
-CUDA_ATOMIC_WRAPPER(Add, double) {
-  unsigned long long int *address_as_ull =                  // NOLINT
-      reinterpret_cast<unsigned long long int *>(address);  // NOLINT
-  unsigned long long int old = *address_as_ull, assumed;    // NOLINT
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val + __longlong_as_double(assumed)));
-
-    // Note: uses integer comparison to avoid hang in case of NaN
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
-#endif
-
-#ifdef PADDLE_CUDA_FP16
-// NOTE(dzhwinter): cuda do not have atomicCAS for half.
-// Just use the half address as a unsigned value address and
-// do the atomicCAS. According to the value store at high 16 bits
-// or low 16 bits, then do a different sum and CAS.
-// Given most warp-threads will failed on the atomicCAS, so this
-// implemented should be avoided in high concurrency. It's will be
-// slower than the way convert value into 32bits and do a full atomicCAS.
-
-// convert the value into float and do the add arithmetic.
-// then store the result into a uint32.
-inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
-  float16 low_half;
-  // the float16 in lower 16bits
-  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
-  low_half = static_cast<float16>(static_cast<float>(low_half) + x);
-  return (val & 0xFFFF0000u) | low_half.x;
-}
-
-inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
-  float16 high_half;
-  // the float16 in higher 16bits
-  high_half.x = static_cast<uint16_t>(val >> 16);
-  high_half = static_cast<float16>(static_cast<float>(high_half) + x);
-  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
-}
-
-CUDA_ATOMIC_WRAPPER(Add, float16) {
-  // concrete packed float16 value may exsits in lower or higher 16bits
-  // of the 32bits address.
-  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
-      reinterpret_cast<char *>(address) -
-      (reinterpret_cast<uintptr_t>(address) & 0x02));
-  float val_f = static_cast<float>(val);
-  uint32_t old = *address_as_ui;
-  uint32_t sum;
-  uint32_t newval;
-  uint32_t assumed;
-  if (((uintptr_t)address & 0x02) == 0) {
-    // the float16 value stay at lower 16 bits of the address.
-    do {
-      assumed = old;
-      old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
-    } while (old != assumed);
-    float16 ret;
-    ret.x = old & 0xFFFFu;
-    return ret;
-  } else {
-    // the float16 value stay at higher 16 bits of the address.
-    do {
-      assumed = old;
-      old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f));
-    } while (old != assumed);
-    float16 ret;
-    ret.x = old >> 16;
-    return ret;
-  }
-}
-
-#endif
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
deleted file mode 100644
index 41d7c121469edd24c67b4288793cb95159fd4b62..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cuda_profiler.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <cuda_profiler_api.h>
-
-#include <string>
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {
-
-void CudaProfilerInit(std::string output_file, std::string output_mode,
-                      std::string config_file) {
-  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
-  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
-  PADDLE_ENFORCE(
-      cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
-}
-
-void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
-
-void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h
deleted file mode 100644
index f503dfb3878464fd13d977e377fbcc4e5fd4dc1a..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cudnn_desc.h
+++ /dev/null
@@ -1,205 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <functional>
-#include <iostream>
-#include <iterator>
-#include <memory>
-#include <numeric>
-#include <string>
-#include <vector>
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace platform {
-using framework::Tensor;
-
-template <typename T>
-inline cudnnDataType_t ToCudnnDataType(const T& t) {
-  auto type = framework::ToDataType(t);
-  return ToCudnnDataType(type);
-}
-
-template <>
-inline cudnnDataType_t ToCudnnDataType(
-    const framework::proto::VarType::Type& t) {
-  cudnnDataType_t type = CUDNN_DATA_FLOAT;
-  switch (t) {
-    case framework::proto::VarType::FP16:
-      type = CUDNN_DATA_HALF;
-      break;
-    case framework::proto::VarType::FP32:
-      type = CUDNN_DATA_FLOAT;
-      break;
-    case framework::proto::VarType::FP64:
-      type = CUDNN_DATA_DOUBLE;
-      break;
-    default:
-      break;
-  }
-  return type;
-}
-
-class ActivationDescriptor {
- public:
-  using T = cudnnActivationStruct;
-  struct Deleter {
-    void operator()(T* t) {
-      if (t != nullptr) {
-        CUDNN_ENFORCE(dynload::cudnnDestroyActivationDescriptor(t));
-        t = nullptr;
-      }
-    }
-  };
-  ActivationDescriptor() {
-    T* raw_ptr;
-    CUDNN_ENFORCE(dynload::cudnnCreateActivationDescriptor(&raw_ptr));
-    desc_.reset(raw_ptr);
-  }
-  template <typename T>
-  void set(cudnnActivationMode_t mode, const T& coef) {
-    CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor(
-        desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast<double>(coef)));
-  }
-
-  T* desc() { return desc_.get(); }
-  T* desc() const { return desc_.get(); }
-
- private:
-  std::unique_ptr<T, Deleter> desc_;
-};
-
-class TensorDescriptor {
- public:
-  using T = cudnnTensorStruct;
-  struct Deleter {
-    void operator()(T* t) {
-      if (t != nullptr) {
-        CUDNN_ENFORCE(dynload::cudnnDestroyTensorDescriptor(t));
-        t = nullptr;
-      }
-    }
-  };
-  TensorDescriptor() {
-    T* raw_ptr;
-    CUDNN_ENFORCE(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
-    desc_.reset(raw_ptr);
-  }
-  T* desc() { return desc_.get(); }
-  T* desc() const { return desc_.get(); }
-  void set(const Tensor& tensor, const int groups = 1) {
-    auto dims = framework::vectorize<int>(tensor.dims());
-    std::vector<int> strides(dims.size());
-    strides[dims.size() - 1] = 1;
-    for (int i = dims.size() - 2; i >= 0; i--) {
-      strides[i] = dims[i + 1] * strides[i + 1];
-    }
-    std::vector<int> dims_with_group(dims.begin(), dims.end());
-    if (groups > 1) {
-      dims_with_group[1] = dims_with_group[1] / groups;
-    }
-    CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
-        desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(),
-        dims_with_group.data(), strides.data()));
-  }
-
- private:
-  std::unique_ptr<T, Deleter> desc_;
-};
-
-class FilterDescriptor {
- public:
-  using T = cudnnFilterStruct;
-  struct Deleter {
-    void operator()(T* t) {
-      if (t != nullptr) {
-        CUDNN_ENFORCE(dynload::cudnnDestroyFilterDescriptor(t));
-        t = nullptr;
-      }
-    }
-  };
-  FilterDescriptor() {
-    T* raw_ptr;
-    CUDNN_ENFORCE(dynload::cudnnCreateFilterDescriptor(&raw_ptr));
-    desc_.reset(raw_ptr);
-  }
-  T* desc() { return desc_.get(); }
-  T* desc() const { return desc_.get(); }
-
-  void set(const Tensor& tensor, const cudnnTensorFormat_t format,
-           const int groups = 1) {
-    auto dims = framework::vectorize<int>(tensor.dims());
-    if (groups > 1) {
-      dims[1] = dims[1] / groups;
-    }
-    CUDNN_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
-        desc_.get(), ToCudnnDataType(tensor.type()), format, dims.size(),
-        dims.data()));
-  }
-
- private:
-  std::unique_ptr<T, Deleter> desc_;
-};
-
-class ConvolutionDescriptor {
- public:
-  using T = cudnnConvolutionStruct;
-  struct Deleter {
-    void operator()(T* t) {
-      if (t != nullptr) {
-        CUDNN_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(t));
-        t = nullptr;
-      }
-    }
-  };
-  ConvolutionDescriptor() {
-    T* raw_ptr;
-    CUDNN_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&raw_ptr));
-    desc_.reset(raw_ptr);
-  }
-  T* desc() { return desc_.get(); }
-  T* desc() const { return desc_.get(); }
-
-  void set(cudnnDataType_t dtype, const std::vector<int>& pads,
-           const std::vector<int>& strides, const std::vector<int>& dilations,
-           const int groups = 1) {
-    cudnnDataType_t compute_type =
-        (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
-    T* desc = desc_.get();
-    CUDNN_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
-        desc, pads.size(), pads.data(), strides.data(), dilations.data(),
-        CUDNN_CROSS_CORRELATION, compute_type));
-#if CUDNN_VERSION_MIN(7, 0, 1)
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnSetConvolutionGroupCount(desc, groups));
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-        desc, CUDNN_DEFAULT_MATH));
-    if (dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          desc, CUDNN_TENSOR_OP_MATH));
-    }
-#endif
-#endif
-  }
-
- private:
-  std::unique_ptr<T, Deleter> desc_;
-};
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cudnn_desc_test.cc b/paddle/fluid/platform/cudnn_desc_test.cc
deleted file mode 100644
index a60102a54899b25c89d8c131220dde21f77bba70..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cudnn_desc_test.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/platform/cudnn_desc.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace platform {
-
-TEST(TensorDescriptor, Empty) {
-  ActivationDescriptor a;
-  TensorDescriptor t;
-  TensorDescriptor t1;
-  TensorDescriptor *t11 = new TensorDescriptor();
-  delete t11;
-  std::unique_ptr<TensorDescriptor> tt(new TensorDescriptor());
-}
-
-TEST(TensorDescriptor, Normal) {
-  framework::Tensor tt;
-  tt.Resize({2, 3, 4});
-  tt.mutable_data<float>(platform::CPUPlace());
-
-  TensorDescriptor desc;
-  desc.set(tt);
-  EXPECT_TRUE(desc.desc() != nullptr);
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
deleted file mode 100644
index 8c124e71583ae154baae38a5441a4a07116e50da..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cudnn_helper.h
+++ /dev/null
@@ -1,496 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/dynload/cudnn.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/macros.h"
-
-DECLARE_bool(cudnn_deterministic);
-
-namespace paddle {
-namespace platform {
-
-inline const char* cudnnGetErrorString(cudnnStatus_t status) {
-  switch (status) {
-    case CUDNN_STATUS_SUCCESS:
-      return "CUDNN_STATUS_SUCCESS";
-    case CUDNN_STATUS_NOT_INITIALIZED:
-      return "CUDNN_STATUS_NOT_INITIALIZED";
-    case CUDNN_STATUS_ALLOC_FAILED:
-      return "CUDNN_STATUS_ALLOC_FAILED";
-    case CUDNN_STATUS_BAD_PARAM:
-      return "CUDNN_STATUS_BAD_PARAM";
-    case CUDNN_STATUS_INTERNAL_ERROR:
-      return "CUDNN_STATUS_INTERNAL_ERROR";
-    case CUDNN_STATUS_INVALID_VALUE:
-      return "CUDNN_STATUS_INVALID_VALUE";
-    case CUDNN_STATUS_ARCH_MISMATCH:
-      return "CUDNN_STATUS_ARCH_MISMATCH";
-    case CUDNN_STATUS_MAPPING_ERROR:
-      return "CUDNN_STATUS_MAPPING_ERROR";
-    case CUDNN_STATUS_EXECUTION_FAILED:
-      return "CUDNN_STATUS_EXECUTION_FAILED";
-    case CUDNN_STATUS_NOT_SUPPORTED:
-      return "CUDNN_STATUS_NOT_SUPPORTED";
-    case CUDNN_STATUS_LICENSE_ERROR:
-      return "CUDNN_STATUS_LICENSE_ERROR";
-    default:
-      return "Unknown cudnn error number";
-  }
-}
-
-#define CUDNN_VERSION_MIN(major, minor, patch) \
-  (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
-
-#define CUDNN_ENFORCE(condition)                                     \
-  do {                                                               \
-    auto status = condition;                                         \
-    if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) {                  \
-      PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
-    }                                                                \
-  } while (false)
-
-enum class DataLayout {  // Not use
-  kNHWC,
-  kNCHW,
-  kNCDHW,
-  kNCHW_VECT_C,
-};
-
-enum class PoolingMode {
-  kMaximum,
-  kMaximumDeterministic,
-  kAverageExclusive,
-  kAverageInclusive,
-};
-
-enum ActivationMode {
-  kNone,  // activation identity
-  kSigmoid,
-  kRelu,
-  kRelu6,
-  kReluX,
-  kTanh,
-  kBandPass,
-};
-
-#if CUDNN_VERSION < 6000
-#pragma message "CUDNN version under 6.0 is supported at best effort."
-#pragma message "We strongly encourage you to move to 6.0 and above."
-#pragma message "This message is intended to annoy you enough to update."
-#pragma message \
-    "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/"
-
-inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
-  switch (mode) {
-    case PoolingMode::kMaximumDeterministic:
-      return CUDNN_POOLING_MAX;
-    case PoolingMode::kAverageExclusive:
-      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-    case PoolingMode::kAverageInclusive:
-      return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-    case PoolingMode::kMaximum:
-      return CUDNN_POOLING_MAX;
-    default:
-      PADDLE_THROW("Unexpected pooling mode.");
-  }
-}
-#else
-
-inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
-  switch (mode) {
-    case PoolingMode::kMaximumDeterministic:
-      return CUDNN_POOLING_MAX_DETERMINISTIC;
-    case PoolingMode::kAverageExclusive:
-      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-    case PoolingMode::kAverageInclusive:
-      return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-    case PoolingMode::kMaximum:
-      return CUDNN_POOLING_MAX;
-    default:
-      PADDLE_THROW("Unexpected pooling mode.");
-  }
-}
-#endif  // CUDNN_VERSION < 6000
-
-inline ActivationMode StringToActivationMode(const std::string& str) {
-  if (str == "identity") {
-    return ActivationMode::kNone;
-  } else if (str == "sigmoid") {
-    return ActivationMode::kSigmoid;
-  } else if (str == "relu") {
-    return ActivationMode::kRelu;
-  } else if (str == "relu6") {
-    return ActivationMode::kRelu6;
-  } else if (str == "relux") {
-    return ActivationMode::kReluX;
-  } else if (str == "tanh") {
-    return ActivationMode::kTanh;
-  } else if (str == "bandpass") {
-    return ActivationMode::kBandPass;
-  } else {
-    PADDLE_THROW("Unknown activation string: %s", str);
-  }
-}
-
-template <typename T>
-class CudnnDataType;
-
-template <>
-class CudnnDataType<float16> {
- public:
-  static const cudnnDataType_t type = CUDNN_DATA_HALF;
-  // The scaling param type is float for HALF and FLOAT tensors
-  using ScalingParamType = const float;
-  using BatchNormParamType = float;
-  static ScalingParamType* kOne() {
-    static ScalingParamType v = 1.0;
-    return &v;
-  }
-  static ScalingParamType* kZero() {
-    static ScalingParamType v = 0.0;
-    return &v;
-  }
-};
-
-template <>
-class CudnnDataType<float> {
- public:
-  static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
-  using ScalingParamType = const float;
-  using BatchNormParamType = float;
-  static ScalingParamType* kOne() {
-    static ScalingParamType v = 1.0;
-    return &v;
-  }
-  static ScalingParamType* kZero() {
-    static ScalingParamType v = 0.0;
-    return &v;
-  }
-};
-
-template <>
-class CudnnDataType<double> {
- public:
-  static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
-  using ScalingParamType = const double;
-  using BatchNormParamType = double;
-  static ScalingParamType* kOne() {
-    static ScalingParamType v = 1.0;
-    return &v;
-  }
-  static ScalingParamType* kZero() {
-    static ScalingParamType v = 0.0;
-    return &v;
-  }
-};
-
-inline cudnnTensorFormat_t GetCudnnTensorFormat(
-    const DataLayout& order) {  // Not use
-  switch (order) {
-    case DataLayout::kNHWC:
-      return CUDNN_TENSOR_NHWC;
-    case DataLayout::kNCHW:
-      return CUDNN_TENSOR_NCHW;
-    case DataLayout::kNCDHW:
-      return CUDNN_TENSOR_NCHW;  // NOTE: cudnn treat NdTensor as the same
-    default:
-      PADDLE_THROW("Unknown cudnn equivalent for order");
-  }
-  return CUDNN_TENSOR_NCHW;
-}
-
-class ScopedTensorDescriptor {
- public:
-  ScopedTensorDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_));
-  }
-  ~ScopedTensorDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_));
-  }
-
-  inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format,
-                                            const cudnnDataType_t type,
-                                            const std::vector<int>& dims,
-                                            const int groups = 1) {
-    // the format is not used now, will add later
-    std::vector<int> strides(dims.size());
-    strides[dims.size() - 1] = 1;
-    for (int i = dims.size() - 2; i >= 0; i--) {
-      strides[i] = dims[i + 1] * strides[i + 1];
-    }
-    // Update tensor descriptor dims setting if groups > 1
-    // NOTE: Assume using NCHW or NCDHW order
-    std::vector<int> dims_with_group(dims.begin(), dims.end());  // copy
-    if (groups > 1) {
-      dims_with_group[1] = dims_with_group[1] / groups;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor(
-        desc_, type, dims_with_group.size(), dims_with_group.data(),
-        strides.data()));
-    return desc_;
-  }
-
-  template <typename T>
-  inline cudnnTensorDescriptor_t descriptor(const DataLayout& order,
-                                            const std::vector<int>& dims,
-                                            const int groups = 1) {
-    return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type, dims,
-                      groups);
-  }
-
- private:
-  cudnnTensorDescriptor_t desc_;
-  DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor);
-};
-
-class ScopedFilterDescriptor {
- public:
-  ScopedFilterDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_));
-  }
-  ~ScopedFilterDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_));
-  }
-
-  inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format,
-                                            const cudnnDataType_t type,
-                                            const std::vector<int>& kernel,
-                                            const int groups = 1) {
-    // filter layout: MCHW(MCDHW), where M is the number of
-    // output image channels, C is the number of input image channels,
-    // D is the depth of the filter, H is the height of the filter, and W is the
-    // width of the filter.
-    std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
-    if (groups > 1) {
-      kernel_with_group[0] /= groups;
-      // NOTE: input filter(C) of the filter is already asserted to be C/groups.
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor(
-        desc_, type, format, kernel_with_group.size(),
-        kernel_with_group.data()));
-    return desc_;
-  }
-
-  template <typename T>
-  inline cudnnFilterDescriptor_t descriptor(const DataLayout& order,
-                                            const std::vector<int>& kernel,
-                                            const int groups = 1) {
-    return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type,
-                      kernel, groups);
-  }
-
- private:
-  cudnnFilterDescriptor_t desc_;
-  DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor);
-};
-
-class ScopedConvolutionDescriptor {
- public:
-  ScopedConvolutionDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnCreateConvolutionDescriptor(&desc_));
-  }
-  ~ScopedConvolutionDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnDestroyConvolutionDescriptor(desc_));
-  }
-
-  inline cudnnConvolutionDescriptor_t descriptor(
-      cudnnDataType_t type, const std::vector<int>& pads,
-      const std::vector<int>& strides, const std::vector<int>& dilations) {
-    PADDLE_ENFORCE_EQ(pads.size(), strides.size());
-    PADDLE_ENFORCE_EQ(pads.size(), dilations.size());
-
-#if !CUDNN_VERSION_MIN(6, 0, 0)
-    // cudnn v5 does not support dilation conv, the argument is called upscale
-    // instead of dilations and it is must be one.
-    for (size_t i = 0; i < dilations.size(); ++i) {
-      PADDLE_ENFORCE_EQ(
-          dilations[i], 1,
-          "Dilations conv is not supported in this cuDNN version(%d.%d.%d).",
-          CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100,
-          CUDNN_VERSION % 100);
-    }
-#endif
-
-    cudnnDataType_t compute_type =
-        (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor(
-        desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
-        CUDNN_CROSS_CORRELATION, compute_type));
-    return desc_;
-  }
-
-  template <typename T>
-  inline cudnnConvolutionDescriptor_t descriptor(
-      const std::vector<int>& pads, const std::vector<int>& strides,
-      const std::vector<int>& dilations) {
-    return descriptor(CudnnDataType<T>::type, pads, strides, dilations);
-  }
-
- private:
-  cudnnConvolutionDescriptor_t desc_;
-  DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
-};
-
-class ScopedPoolingDescriptor {
- public:
-  ScopedPoolingDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_));
-  }
-  ~ScopedPoolingDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_));
-  }
-
-  inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode,
-                                             const std::vector<int>& kernel,
-                                             const std::vector<int>& pads,
-                                             const std::vector<int>& strides) {
-    PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
-    PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetPoolingNdDescriptor(
-        desc_, (GetPoolingMode(mode)),
-        CUDNN_PROPAGATE_NAN,  // Always propagate nans.
-        kernel.size(), kernel.data(), pads.data(), strides.data()));
-    return desc_;
-  }
-
- private:
-  cudnnPoolingDescriptor_t desc_;
-  DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
-};
-
-class ScopedSpatialTransformerDescriptor {
- public:
-  ScopedSpatialTransformerDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnCreateSpatialTransformerDescriptor(&desc_));
-  }
-  ~ScopedSpatialTransformerDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnDestroySpatialTransformerDescriptor(desc_));
-  }
-
-  template <typename T>
-  inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims,
-                                                        const int dimA[]) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor(
-        desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType<T>::type, nbDims, dimA));
-    return desc_;
-  }
-
- private:
-  cudnnSpatialTransformerDescriptor_t desc_;
-  DISABLE_COPY_AND_ASSIGN(ScopedSpatialTransformerDescriptor);
-};
-
-class ScopedActivationDescriptor {
- public:
-  ScopedActivationDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnCreateActivationDescriptor(&desc_));
-  }
-  ~ScopedActivationDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnDestroyActivationDescriptor(desc_));
-  }
-
-  template <typename T>
-  inline cudnnActivationDescriptor_t descriptor(
-      const std::string& act, double value_max = static_cast<double>(0.)) {
-    double relu_ceiling = 0.0;
-    ActivationMode activation_mode = StringToActivationMode(act);
-    cudnnActivationMode_t mode;
-    switch (activation_mode) {
-#if CUDNN_VERSION >= 7100
-      case ActivationMode::kNone:
-        mode = CUDNN_ACTIVATION_IDENTITY;
-        break;
-#endif
-      case ActivationMode::kRelu6:
-        relu_ceiling = 6.0;
-        mode = CUDNN_ACTIVATION_CLIPPED_RELU;
-        break;
-      case ActivationMode::kReluX:
-        relu_ceiling = value_max;
-        mode = CUDNN_ACTIVATION_CLIPPED_RELU;
-        break;
-      case ActivationMode::kRelu:
-        mode = CUDNN_ACTIVATION_RELU;
-        break;
-      case ActivationMode::kSigmoid:
-        mode = CUDNN_ACTIVATION_SIGMOID;
-        break;
-      case ActivationMode::kTanh:
-        mode = CUDNN_ACTIVATION_TANH;
-        break;
-      default:
-        PADDLE_THROW("unrecognized activation mode: %d .",
-                     static_cast<int>(activation_mode));
-    }
-    CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor(
-        desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling));
-    return desc_;
-  }
-
- private:
-  cudnnActivationDescriptor_t desc_;
-  DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor);
-};
-
-inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
-#ifdef PADDLE_WITH_CUDA
-  if (use_cudnn) {
-    auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
-    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
-  }
-#endif
-  return use_cudnn;
-}
-
-#if CUDNN_VERSION >= 7001
-class ScopedCTCLossDescriptor {
- public:
-  ScopedCTCLossDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_));
-  }
-  ~ScopedCTCLossDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_));
-  }
-
-  template <typename T>
-  inline cudnnCTCLossDescriptor_t descriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType<T>::type));
-    return desc_;
-  }
-
- private:
-  cudnnCTCLossDescriptor_t desc_;
-  DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor);
-};
-#endif
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/cudnn_helper_test.cc
deleted file mode 100644
index 28edfd2e50237c887dbeb7ac73e1f990ce239a9c..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cudnn_helper_test.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL
-
-#include "paddle/fluid/platform/cudnn_helper.h"
-#include <gtest/gtest.h>
-
-TEST(CudnnHelper, ScopedTensorDescriptor) {
-  using paddle::platform::ScopedTensorDescriptor;
-  using paddle::platform::DataLayout;
-
-  ScopedTensorDescriptor tensor_desc;
-  std::vector<int> shape = {2, 4, 6, 6};
-  auto desc = tensor_desc.descriptor<float>(DataLayout::kNCHW, shape);
-
-  cudnnDataType_t type;
-  int nd;
-  std::vector<int> dims(4);
-  std::vector<int> strides(4);
-  paddle::platform::dynload::cudnnGetTensorNdDescriptor(
-      desc, 4, &type, &nd, dims.data(), strides.data());
-
-  EXPECT_EQ(nd, 4);
-  for (size_t i = 0; i < dims.size(); ++i) {
-    EXPECT_EQ(dims[i], shape[i]);
-  }
-  EXPECT_EQ(strides[3], 1);
-  EXPECT_EQ(strides[2], 6);
-  EXPECT_EQ(strides[1], 36);
-  EXPECT_EQ(strides[0], 144);
-
-  // test tensor5d: ScopedTensorDescriptor
-  ScopedTensorDescriptor tensor5d_desc;
-  std::vector<int> shape_5d = {2, 4, 6, 6, 6};
-  auto desc_5d = tensor5d_desc.descriptor<float>(DataLayout::kNCDHW, shape_5d);
-
-  std::vector<int> dims_5d(5);
-  std::vector<int> strides_5d(5);
-  paddle::platform::dynload::cudnnGetTensorNdDescriptor(
-      desc_5d, 5, &type, &nd, dims_5d.data(), strides_5d.data());
-
-  EXPECT_EQ(nd, 5);
-  for (size_t i = 0; i < dims_5d.size(); ++i) {
-    EXPECT_EQ(dims_5d[i], shape_5d[i]);
-  }
-  EXPECT_EQ(strides_5d[4], 1);
-  EXPECT_EQ(strides_5d[3], 6);
-  EXPECT_EQ(strides_5d[2], 36);
-  EXPECT_EQ(strides_5d[1], 216);
-  EXPECT_EQ(strides_5d[0], 864);
-}
-
-TEST(CudnnHelper, ScopedFilterDescriptor) {
-  using paddle::platform::ScopedFilterDescriptor;
-  using paddle::platform::DataLayout;
-
-  ScopedFilterDescriptor filter_desc;
-  std::vector<int> shape = {2, 3, 3};
-  auto desc = filter_desc.descriptor<float>(DataLayout::kNCHW, shape);
-
-  cudnnDataType_t type;
-  int nd;
-  cudnnTensorFormat_t format;
-  std::vector<int> kernel(3);
-  paddle::platform::dynload::cudnnGetFilterNdDescriptor(desc, 3, &type, &format,
-                                                        &nd, kernel.data());
-
-  EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format);
-  EXPECT_EQ(nd, 3);
-  for (size_t i = 0; i < shape.size(); ++i) {
-    EXPECT_EQ(kernel[i], shape[i]);
-  }
-
-  ScopedFilterDescriptor filter_desc_4d;
-  std::vector<int> shape_4d = {2, 3, 3, 3};
-  auto desc_4d = filter_desc.descriptor<float>(DataLayout::kNCDHW, shape_4d);
-
-  std::vector<int> kernel_4d(4);
-  paddle::platform::dynload::cudnnGetFilterNdDescriptor(
-      desc_4d, 4, &type, &format, &nd, kernel_4d.data());
-
-  EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format);
-  EXPECT_EQ(nd, 4);
-  for (size_t i = 0; i < shape_4d.size(); ++i) {
-    EXPECT_EQ(kernel_4d[i], shape_4d[i]);
-  }
-}
-
-TEST(CudnnHelper, ScopedConvolutionDescriptor) {
-  using paddle::platform::ScopedConvolutionDescriptor;
-
-  ScopedConvolutionDescriptor conv_desc;
-  std::vector<int> src_pads = {2, 2, 2};
-  std::vector<int> src_strides = {1, 1, 1};
-  std::vector<int> src_dilations = {1, 1, 1};
-  auto desc = conv_desc.descriptor<float>(src_pads, src_strides, src_dilations);
-
-  cudnnDataType_t type;
-  cudnnConvolutionMode_t mode;
-  int nd;
-  std::vector<int> pads(3);
-  std::vector<int> strides(3);
-  std::vector<int> dilations(3);
-  paddle::platform::dynload::cudnnGetConvolutionNdDescriptor(
-      desc, 3, &nd, pads.data(), strides.data(), dilations.data(), &mode,
-      &type);
-
-  EXPECT_EQ(nd, 3);
-  for (size_t i = 0; i < src_pads.size(); ++i) {
-    EXPECT_EQ(pads[i], src_pads[i]);
-    EXPECT_EQ(strides[i], src_strides[i]);
-    EXPECT_EQ(dilations[i], src_dilations[i]);
-  }
-  EXPECT_EQ(mode, CUDNN_CROSS_CORRELATION);
-}
-
-TEST(CudnnHelper, ScopedPoolingDescriptor) {
-  using paddle::platform::ScopedPoolingDescriptor;
-  using paddle::platform::PoolingMode;
-
-  ScopedPoolingDescriptor pool_desc;
-  std::vector<int> src_kernel = {2, 2, 5};
-  std::vector<int> src_pads = {1, 1, 2};
-  std::vector<int> src_strides = {2, 2, 3};
-  auto desc = pool_desc.descriptor(PoolingMode::kMaximum, src_kernel, src_pads,
-                                   src_strides);
-
-  cudnnPoolingMode_t mode;
-  cudnnNanPropagation_t nan_t = CUDNN_PROPAGATE_NAN;
-  int nd;
-  std::vector<int> kernel(3);
-  std::vector<int> pads(3);
-  std::vector<int> strides(3);
-  paddle::platform::dynload::cudnnGetPoolingNdDescriptor(
-      desc, 3, &mode, &nan_t, &nd, kernel.data(), pads.data(), strides.data());
-
-  EXPECT_EQ(nd, 3);
-  for (size_t i = 0; i < src_pads.size(); ++i) {
-    EXPECT_EQ(kernel[i], src_kernel[i]);
-    EXPECT_EQ(pads[i], src_pads[i]);
-    EXPECT_EQ(strides[i], src_strides[i]);
-  }
-  EXPECT_EQ(mode, CUDNN_POOLING_MAX);
-}
diff --git a/paddle/fluid/platform/cudnn_workspace_helper.h b/paddle/fluid/platform/cudnn_workspace_helper.h
deleted file mode 100644
index 29abdc72e264b3676dec29c6be3fd8c4e1510dfe..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cudnn_workspace_helper.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace platform {
-
-static constexpr int kDefaultConvWorkspaceSizeLimitMB = 512;
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/details/cuda_transform_iterator_cast.h b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h
deleted file mode 100644
index 06afc44c257bbeb0729323e1a42e1eead23ff075..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/details/cuda_transform_iterator_cast.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef __NVCC__
-#error device_ptr_cast must be include by .cu file
-#endif
-
-#include <type_traits>  // For std::remove_pointer and std::is_pointer.
-
-#include "thrust/device_ptr.h"
-
-namespace paddle {
-namespace platform {
-namespace details {
-
-// PointerToThrustDevicePtr has two speicalizations, one casts a (CUDA
-// device) pointer into thrust::device_ptr, the other keeps rest types
-// un-casted.
-template <typename T, bool is_ptr>
-struct PointerToThrustDevicePtr;
-
-template <typename T>
-struct PointerToThrustDevicePtr<T, true> {
-  using ELEM = typename std::remove_pointer<T>::type;
-  using RTYPE = thrust::device_ptr<ELEM>;
-
-  inline thrust::device_ptr<ELEM> operator()(ELEM* ele) const {
-    return thrust::device_pointer_cast(ele);
-  }
-};
-
-template <typename T>
-struct PointerToThrustDevicePtr<T, false> {
-  using RTYPE = T;
-  inline RTYPE operator()(RTYPE it) const { return it; }
-};
-
-// CastToCUDATransformIterator casts a pointer to thrust::device_ptr
-// so it could be used as the iterator of thrust::transform.  It
-// doesn't cast other types.
-//
-// We need CastToCUDATransformIterator because it is often that we
-// want to use device memory pointers as transform iterators, e.g., to
-// transform a block of float32 to float16.  In this case, we want
-// CastToCUDATransformIterator to cast float16/32 pointers to
-// thrust::device_ptr, otherwise they cannot work as the iterator
-// required by thrust::transform.  At the same time, we don't want to
-// cast thrust::device_ptr to thrust::device_ptr repeatedly.
-template <typename T>
-auto CastToCUDATransformIterator(T t) ->
-    typename PointerToThrustDevicePtr<T, std::is_pointer<T>::value>::RTYPE {
-  PointerToThrustDevicePtr<T, std::is_pointer<T>::value> cast;
-  return cast(t);
-}
-
-}  // namespace details
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc
deleted file mode 100644
index 24421b5c3c99bd341c562f4c35df55ad749bdc50..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_code.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/device_code.h"
-#include <algorithm>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {
-
-#ifdef PADDLE_WITH_CUDA
-inline bool is_error(nvrtcResult stat) { return stat != NVRTC_SUCCESS; }
-
-inline void throw_on_error(nvrtcResult stat, const std::string& msg) {
-#ifndef REPLACE_ENFORCE_GLOG
-  throw std::runtime_error(dynload::nvrtcGetErrorString(stat) + msg);
-#else
-  LOG(FATAL) << dynload::nvrtcGetErrorString(stat) << msg;
-#endif
-}
-
-CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
-                               const std::string& kernel) {
-  if (!is_gpu_place(place)) {
-    PADDLE_THROW("CUDADeviceCode can only launch on GPU place.");
-  }
-
-  place_ = place;
-  name_ = name;
-  kernel_ = kernel;
-}
-
-void CUDADeviceCode::Compile() {
-  nvrtcProgram program;
-  PADDLE_ENFORCE_EQ(dynload::nvrtcCreateProgram(&program,
-                                                kernel_.c_str(),  // buffer
-                                                name_.c_str(),    // name
-                                                0,                // numHeaders
-                                                nullptr,          // headers
-                                                nullptr),  // includeNames
-                    NVRTC_SUCCESS,
-                    "nvrtcCreateProgram failed.");
-
-  // Compile the program for specified compute_capability
-  auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
-      DeviceContextPool::Instance().Get(place_));
-  int compute_capability = dev_ctx->GetComputeCapability();
-  std::string compute_flag =
-      "--gpu-architecture=compute_" + std::to_string(compute_capability);
-  const std::vector<const char*> options = {"--std=c++11",
-                                            compute_flag.c_str()};
-  nvrtcResult compile_result =
-      dynload::nvrtcCompileProgram(program,          // program
-                                   options.size(),   // numOptions
-                                   options.data());  // options
-  if (compile_result == NVRTC_ERROR_COMPILATION) {
-    // Obtain compilation log from the program
-    size_t log_size;
-    PADDLE_ENFORCE_EQ(dynload::nvrtcGetProgramLogSize(program, &log_size),
-                      NVRTC_SUCCESS, "nvrtcGetProgramLogSize failed.");
-    std::vector<char> log;
-    log.resize(log_size + 1);
-    PADDLE_ENFORCE_EQ(dynload::nvrtcGetProgramLog(program, log.data()),
-                      NVRTC_SUCCESS, "nvrtcGetProgramLog failed.");
-    LOG(FATAL) << "JIT compiling of CUDA code failed:\n" << log.data();
-  }
-
-  // Obtain PTX from the program
-  size_t ptx_size;
-  PADDLE_ENFORCE_EQ(dynload::nvrtcGetPTXSize(program, &ptx_size), NVRTC_SUCCESS,
-                    "nvrtcGetPTXSize failed.");
-  ptx_.resize(ptx_size + 1);
-  PADDLE_ENFORCE_EQ(dynload::nvrtcGetPTX(program, ptx_.data()), NVRTC_SUCCESS,
-                    "nvrtcGetPTX failed.");
-
-  PADDLE_ENFORCE_EQ(dynload::nvrtcDestroyProgram(&program), NVRTC_SUCCESS,
-                    "nvrtcDestroyProgram failed.");
-
-  PADDLE_ENFORCE_EQ(
-      dynload::cuModuleLoadData(&module_, ptx_.data()), CUDA_SUCCESS,
-      "Fail to load PTX of %s (in cuModuleLoadData.)", name_.c_str());
-  PADDLE_ENFORCE_EQ(
-      dynload::cuModuleGetFunction(&function_, module_, name_.c_str()),
-      CUDA_SUCCESS, "Fail to get function of %s (in cuModuleGetFunction.)",
-      name_.c_str());
-
-  max_threads_ = dev_ctx->GetMaxPhysicalThreadCount();
-}
-
-void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
-  int max_blocks = std::max(max_threads_ / num_threads_, 1);
-  int workload_per_block = workload_per_thread_ * num_threads_;
-  int num_blocks =
-      std::min(max_blocks, (static_cast<int>(n) + workload_per_block - 1) /
-                               workload_per_block);
-
-  auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
-      DeviceContextPool::Instance().Get(place_));
-  PADDLE_ENFORCE_EQ(
-      dynload::cuLaunchKernel(function_, num_blocks, 1, 1,  // grid dim
-                              num_threads_, 1, 1,           // block dim
-                              0,                            // shared memory
-                              dev_ctx->stream(),            // stream
-                              args->data(),                 // arguments
-                              nullptr),
-      CUDA_SUCCESS, "Fail to launch kernel %s (in cuLaunchKernel.)",
-      name_.c_str());
-}
-#endif
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_code.h b/paddle/fluid/platform/device_code.h
deleted file mode 100644
index 19adb0707f1742e9a41c4eaec549f7ccd5101acb..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_code.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/platform/device_context.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/dynload/cuda_driver.h"
-#include "paddle/fluid/platform/dynload/nvrtc.h"
-#endif
-
-namespace paddle {
-namespace platform {
-
-class DeviceCode {
- public:
-  virtual ~DeviceCode() {}
-  virtual void Compile() = 0;
-  virtual void Launch(const size_t n, std::vector<void*>* args) const = 0;
-
- protected:
-  Place place_;
-  std::string name_;
-  std::string kernel_;
-};
-
-#ifdef PADDLE_WITH_CUDA
-class CUDADeviceCode : public DeviceCode {
- public:
-  explicit CUDADeviceCode(const Place& place, const std::string& name,
-                          const std::string& kernel);
-  void Compile() override;
-  void Launch(const size_t n, std::vector<void*>* args) const override;
-
-  void SetNumThreads(int num_threads) { num_threads_ = num_threads; }
-  void SetWorkloadPerThread(int workload_per_thread) {
-    workload_per_thread_ = workload_per_thread;
-  }
-
- private:
-  int max_threads_{0};
-  int num_threads_{1024};
-  int workload_per_thread_{1};
-  std::vector<char> ptx_;
-  CUmodule module_;
-  CUfunction function_;
-};
-#endif
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc
deleted file mode 100644
index 3b63ed4e369c7c9ccecf8a6b7e2272973a44e266..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_code_test.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/device_code.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/init.h"
-
-constexpr auto saxpy_code = R"(
-extern "C" __global__
-void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
-  for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n;
-       tid += blockDim.x * gridDim.x) {
-    z[tid] = a * x[tid] + y[tid];
-  }
-}
-)";
-
-#ifdef PADDLE_WITH_CUDA
-TEST(device_code, cuda) {
-  paddle::framework::InitDevices(false, {0});
-  paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code);
-
-  paddle::framework::Tensor cpu_x;
-  paddle::framework::Tensor cpu_y;
-  paddle::framework::Tensor cpu_z;
-
-  float scale = 2;
-  auto dims = paddle::framework::make_ddim(
-      {static_cast<int64_t>(256), static_cast<int64_t>(1024)});
-  cpu_x.mutable_data<float>(dims, paddle::platform::CPUPlace());
-  cpu_y.mutable_data<float>(dims, paddle::platform::CPUPlace());
-
-  size_t n = cpu_x.numel();
-  for (size_t i = 0; i < n; ++i) {
-    cpu_x.data<float>()[i] = static_cast<float>(i);
-  }
-  for (size_t i = 0; i < n; ++i) {
-    cpu_y.data<float>()[i] = static_cast<float>(0.5);
-  }
-
-  paddle::framework::Tensor x;
-  paddle::framework::Tensor y;
-  paddle::framework::Tensor z;
-
-  float* x_data = x.mutable_data<float>(dims, place);
-  float* y_data = y.mutable_data<float>(dims, place);
-  float* z_data = z.mutable_data<float>(dims, place);
-
-  TensorCopySync(cpu_x, place, &x);
-  TensorCopySync(cpu_y, place, &y);
-
-  code.Compile();
-
-  std::vector<void*> args = {&scale, &x_data, &y_data, &z_data, &n};
-  code.SetNumThreads(1024);
-  code.SetWorkloadPerThread(1);
-  code.Launch(n, &args);
-
-  TensorCopySync(z, paddle::platform::CPUPlace(), &cpu_z);
-  for (size_t i = 0; i < n; i++) {
-    PADDLE_ENFORCE_EQ(cpu_z.data<float>()[i],
-                      static_cast<float>(i) * scale + 0.5);
-  }
-}
-#endif
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
deleted file mode 100644
index 3166593365404e98fad0e91a7d7b5cd7176cd9ed..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_context.cc
+++ /dev/null
@@ -1,507 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/platform/device_context.h"
-#include <set>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/memory/memory.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/rw_lock.h"
-#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#endif
-
-#include "glog/logging.h"
-
-namespace paddle {
-namespace memory {
-
-AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
-  auto place = dev_ctx.GetPlace();
-#ifdef PADDLE_WITH_CUDA
-  if (size == 0 || !platform::is_gpu_place(place)) {
-    return Alloc(place, size);
-  }
-  auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
-      platform::DeviceContextPool::Instance().Get(place));
-  auto& desired_dev_ctx =
-      static_cast<const platform::CUDADeviceContext&>(dev_ctx);
-  if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-    return Alloc(place, size);
-  } else {
-    return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
-        desired_dev_ctx, size);
-  }
-#else
-  return Alloc(place, size);
-#endif
-}
-
-}  // namespace memory
-}  // namespace paddle
-
-namespace paddle {
-namespace platform {
-
-DeviceContextPool* DeviceContextPool::pool = nullptr;
-
-platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
-  auto it = device_contexts_.find(place);
-  if (it == device_contexts_.end()) {
-    PADDLE_THROW(
-        "Place %s is not supported, Please check that your paddle compiles "
-        "with WITH_GPU "
-        "option or check that your train process hold the correct gpu_id if "
-        "you use Executor",
-        place);
-  }
-  return it->second.get().get();
-}
-
-template <typename DevCtx, typename PlaceType>
-inline void EmplaceDeviceContext(
-    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
-        map_ptr,
-    platform::Place p) {
-  using PtrType = std::unique_ptr<DeviceContext>;
-  map_ptr->emplace(p, std::async(std::launch::deferred, [=] {
-                     // lazy evaluation. i.e., only create device context at
-                     // first `Get`
-                     return PtrType(new DevCtx(boost::get<PlaceType>(p)));
-                   }));
-}
-
-DeviceContextPool::DeviceContextPool(
-    const std::vector<platform::Place>& places) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  std::set<Place> set;
-  for (auto& p : places) {
-    set.insert(p);
-  }
-  for (auto& p : set) {
-    if (platform::is_cpu_place(p)) {
-#ifdef PADDLE_WITH_MKLDNN
-      EmplaceDeviceContext<MKLDNNDeviceContext, CPUPlace>(&device_contexts_, p);
-#else
-      EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
-#endif
-    } else if (platform::is_gpu_place(p)) {
-#ifdef PADDLE_WITH_CUDA
-      EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
-#else
-      PADDLE_THROW(
-          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
-          "option");
-#endif
-    } else if (platform::is_cuda_pinned_place(p)) {
-#ifdef PADDLE_WITH_CUDA
-      EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
-          &device_contexts_, p);
-#else
-      PADDLE_THROW(
-          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
-          "option");
-#endif
-    }
-  }
-}
-
-CPUDeviceContext::CPUDeviceContext() {
-  eigen_device_.reset(new Eigen::DefaultDevice());
-}
-
-CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) {
-  eigen_device_.reset(new Eigen::DefaultDevice());
-}
-
-Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
-  return eigen_device_.get();
-}
-
-Place CPUDeviceContext::GetPlace() const { return place_; }
-
-#ifdef PADDLE_WITH_CUDA
-
-class EigenCudaStreamDevice : public Eigen::StreamInterface {
- public:
-  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
-    Eigen::initializeDeviceProp();
-  }
-  ~EigenCudaStreamDevice() override {}
-
-  void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
-    stream_ = cuda_stream;
-    place_ = place;
-    device_prop_ = &Eigen::m_deviceProperties[place.device];
-  }
-
-  const cudaStream_t& stream() const override { return *stream_; }
-
-  const cudaDeviceProp& deviceProperties() const override {
-    return *device_prop_;
-  }
-
-  void* allocate(size_t num_bytes) const override {
-    if (UNLIKELY(num_bytes == 0)) {
-      return nullptr;
-    }
-    auto buf = memory::Alloc(place_, num_bytes);
-    VLOG(4) << "Eigen allocated at " << buf->ptr() << ", size" << buf->size()
-            << " requested " << num_bytes;
-    void* retv = buf->ptr();
-    {
-      std::lock_guard<std::mutex> lock(mtx_);
-      allocations_.emplace(retv, std::move(buf));
-    }
-    return retv;
-  }
-
-  void deallocate(void* buffer) const override {
-    if (LIKELY(buffer)) {
-      std::lock_guard<std::mutex> lock(mtx_);
-      allocations_.erase(buffer);
-    }
-  }
-
-  void* scratchpad() const override {
-    if (scratch_ == NULL) {
-      scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
-    }
-    return scratch_;
-  }
-
-  unsigned int* semaphore() const override {
-    if (semaphore_ == NULL) {
-      char* scratch =
-          static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
-      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
-    }
-    return semaphore_;
-  }
-
- private:
-  CUDAPlace place_;
-  const cudaStream_t* stream_;         // not owned;
-  const cudaDeviceProp* device_prop_;  // not owned;
-  mutable void* scratch_;
-  mutable unsigned int* semaphore_;
-  mutable std::mutex mtx_;  // to protect allocations_
-  mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
-};
-
-void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
-  if (required_workspace_bytes <= WorkspaceSize()) {
-    return;
-  }
-  // reset allocation first before re-allocate to save memory
-  allocation_.reset();
-  allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
-}
-
-CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
-  CUDADeviceGuard guard(place_.device);
-  compute_capability_ = GetCUDAComputeCapability(place_.device);
-  multi_process_ = GetCUDAMultiProcessors(place_.device);
-  max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
-  eigen_stream_.reset(new EigenCudaStreamDevice());
-  eigen_stream_->Reinitialize(&stream_, place);
-  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
-  cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH));
-
-  if (TensorCoreAvailable()) {
-#if CUDA_VERSION >= 9000
-    cublas_tensor_core_handle_.reset(
-        new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH));
-#endif
-  }
-
-  driver_version_ = GetCUDADriverVersion(place_.device);
-  runtime_version_ = GetCUDARuntimeVersion(place_.device);
-
-  LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device
-                          << ", CUDA Capability: " << compute_capability_
-                          << ", Driver API Version: " << driver_version_ / 1000
-                          << "." << (driver_version_ % 100) / 10
-                          << ", Runtime API Version: "
-                          << runtime_version_ / 1000 << "."
-                          << (runtime_version_ % 100) / 10;
-  size_t cudnn_dso_ver = dynload::cudnnGetVersion();
-  LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
-                          << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
-                          << (cudnn_dso_ver % 1000) / 100 << ".";
-
-  {
-    // Check CUDA/CUDNN version compatiblity
-    auto local_cuda_version =
-        (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
-    auto compile_cuda_version =
-        (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
-    if (local_cuda_version < compile_cuda_version) {
-      LOG_FIRST_N(WARNING, 1)
-          << "WARNING: device: " << place_.device
-          << ". The installed Paddle is compiled with CUDA "
-          << compile_cuda_version / 10 << "." << compile_cuda_version % 10
-          << ", but CUDA runtime version in your machine is "
-          << local_cuda_version / 10 << "." << local_cuda_version % 10
-          << ", which may cause serious incompatible bug. "
-          << "Please recompile or reinstall Paddle with compatible CUDA "
-             "version.";
-    }
-
-    if (dynload::HasCUDNN()) {
-      auto local_cudnn_version = cudnn_dso_ver / 100;
-      auto compile_cudnn_version = CUDNN_VERSION / 100;
-      if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
-        LOG_FIRST_N(WARNING, 1)
-            << "WARNING: device: " << place_.device
-            << ". The installed Paddle is compiled with CUDNN "
-            << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
-            << ", but CUDNN version in your machine is "
-            << local_cudnn_version / 10 << "." << local_cudnn_version % 10
-            << ", which may cause serious incompatible bug. "
-            << "Please recompile or reinstall Paddle with compatible CUDNN "
-               "version.";
-      }
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnCreate(&cudnn_handle_),
-          "Failed to create Cudnn handle in DeviceContext");
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnSetStream(cudnn_handle_, stream_),
-          "Failed to set stream for Cudnn handle in DeviceContext");
-    } else {
-      cudnn_handle_ = nullptr;
-    }
-  }
-
-  callback_manager_.reset(new StreamCallbackManager(stream_));
-}
-
-CUDADeviceContext::~CUDADeviceContext() {
-  SetDeviceId(place_.device);
-  Wait();
-  WaitStreamCallback();
-  cublas_handle_.reset();
-  cublas_tensor_core_handle_.reset();
-  eigen_stream_.reset();
-  eigen_device_.reset();
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
-  if (cudnn_handle_) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
-                                "Failed to destory Cudnn handle");
-  }
-#if !defined(_WIN32)
-  if (nccl_comm_) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
-  }
-#endif
-}
-
-Place CUDADeviceContext::GetPlace() const { return place_; }
-
-void CUDADeviceContext::Wait() const {
-  cudaError_t e_sync = cudaStreamSynchronize(stream_);
-  if (e_sync != 0) {
-    LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync)
-               << " errno: " << e_sync;
-  }
-
-  cudaError_t e_get = cudaGetLastError();
-  if (e_get != 0) {
-    LOG(FATAL) << "cudaGetLastError  " << cudaGetErrorString(e_get)
-               << " errno: " << e_get;
-  }
-}
-
-int CUDADeviceContext::GetComputeCapability() const {
-  return compute_capability_;
-}
-
-int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
-  return multi_process_ * max_threads_per_mp_;
-}
-
-Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
-  return eigen_device_.get();
-}
-
-bool CUDADeviceContext::tensor_core_available() const {
-  return cublas_tensor_core_handle_ != nullptr;
-}
-
-cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
-
-CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
-  return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
-}
-
-cudaStream_t CUDADeviceContext::stream() const { return stream_; }
-
-CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
-  eigen_device_.reset(new Eigen::DefaultDevice());
-}
-
-CUDAPinnedDeviceContext::CUDAPinnedDeviceContext(CUDAPinnedPlace place)
-    : place_(place) {
-  eigen_device_.reset(new Eigen::DefaultDevice());
-}
-
-Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
-  return eigen_device_.get();
-}
-
-Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
-#endif
-
-#ifdef PADDLE_WITH_MKLDNN
-MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobmap_() {
-  p_blobmap_.reset(new BlobMap());
-  p_mutex_.reset(new std::mutex());
-}
-
-namespace {
-// Current mkldnn session id.
-thread_local size_t cur_mkldnn_session_id = kMKLDNNSessionID_Default;
-// Current data input shape string.
-// - For fixed-shape, it's a null string in default.
-// - For dynamic-shape, it's user specific.
-thread_local std::string cur_input_shape_str = "";
-// the cache capacity of different input shapes for MKLDNN.
-// Default 1 means fixed input shape, not dynamic shape.
-thread_local int cur_input_shape_cache_capacity = 1;
-}  // namespace
-
-void set_cur_mkldnn_session_id(size_t sid) { cur_mkldnn_session_id = sid; }
-size_t get_cur_mkldnn_session_id(void) { return cur_mkldnn_session_id; }
-void set_cur_input_shape_str(std::string input_shape_str) {
-  cur_input_shape_str = input_shape_str;
-}
-void set_cur_input_shape_cache_capacity(int input_shape_cache_capacity) {
-  cur_input_shape_cache_capacity = input_shape_cache_capacity;
-}
-
-void MKLDNNDeviceContext::ResetBlobMap() const { p_blobmap_->clear(); }
-
-size_t MKLDNNDeviceContext::GetShapeBlobSize() const {
-  std::lock_guard<std::mutex> lock(*p_mutex_);
-  BlobMap* pMap = p_blobmap_.get();
-  auto map_it = pMap->find(cur_mkldnn_session_id);
-  if (map_it == pMap->end()) {
-    LOG(FATAL) << "MKLDNNDeviceContext don't find cur_mkldnn_session_id : "
-               << cur_mkldnn_session_id;
-  }
-  return map_it->second->size();
-}
-
-void MKLDNNDeviceContext::SetBlob(const std::string& name,
-                                  std::shared_ptr<void> data) const {
-  BlobMap* pMap = p_blobmap_.get();
-  std::shared_ptr<ShapeBlob> sBlob = nullptr;
-  std::shared_ptr<KeyBlob> pBlob = nullptr;
-
-  int sid = platform::get_cur_mkldnn_session_id();
-
-  std::lock_guard<std::mutex> lock(*p_mutex_);
-
-  // Find ShapeBlob for current mkldnn session id.
-  auto map_it = pMap->find(sid);
-
-  if (map_it == pMap->end()) {
-    // 1st time to set blob in current thread
-    sBlob = std::shared_ptr<ShapeBlob>(new ShapeBlob());
-    (*pMap)[sid] = sBlob;
-    VLOG(2) << "SetBlob: sid=" << sid << ", add new sid\n";
-  } else {
-    sBlob = map_it->second;
-  }
-
-  // Find KeyBlob for current input shape
-  auto key_it = sBlob->find(cur_input_shape_str);
-
-  if (key_it == sBlob->end()) {
-    // In cache clearing mode, cur_input_shape_cache_capacity defines
-    // max pblob capacity
-    if ((static_cast<size_t>(sid) == kMKLDNNSessionID_CacheClearing) &&
-        sBlob->size() &&
-        (sBlob->size() >=
-         static_cast<size_t>(cur_input_shape_cache_capacity))) {
-      VLOG(2) << "sid=" << sid
-              << ", remove all blobs of shape: " << sBlob->begin()->first;
-      sBlob->erase(sBlob->begin()->first);
-    }
-    pBlob = std::shared_ptr<KeyBlob>(new KeyBlob());
-    (*sBlob)[cur_input_shape_str] = pBlob;
-  } else {
-    pBlob = key_it->second;
-  }
-
-  // Find Blob via name
-  auto blob_it = pBlob->find(name);
-  if (blob_it == pBlob->end()) {
-    (*pBlob)[name] = data;
-  } else {
-    blob_it->second = data;  // set data to existing blob
-  }
-  VLOG(2) << "SetBlob: sid=" << sid << ", add blob=" << name << "\n";
-  // lock will be automatically released when out of scope
-  return;
-}
-
-std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
-    const std::string& name) const {
-  BlobMap* pMap = p_blobmap_.get();
-  std::shared_ptr<ShapeBlob> sBlob = nullptr;
-  std::shared_ptr<KeyBlob> pBlob = nullptr;
-
-  int sid = platform::get_cur_mkldnn_session_id();
-
-  std::lock_guard<std::mutex> lock(*p_mutex_);
-
-  // Find ShapeBlob for current mkldnn session id firstly
-  auto map_it = pMap->find(sid);
-  if (map_it == pMap->end()) {
-    VLOG(2) << "GetBlob: sid=" << sid << ", miss sid\n";
-    return nullptr;
-  }
-  sBlob = map_it->second;
-
-  // Find KeyBlob for current input shape secondly
-  auto sBlob_it = sBlob->find(cur_input_shape_str);
-  if (sBlob_it == sBlob->end()) {
-    VLOG(2) << "GetBlob: sid=" << cur_input_shape_str
-            << ", miss input_shape_str\n";
-    return nullptr;
-  }
-  pBlob = sBlob_it->second;
-
-  // Find Blob via name
-  auto key_it = pBlob->find(name);
-
-  if (key_it == pBlob->end()) {
-    VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n";
-    return nullptr;
-  }
-
-  VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n";
-  // lock will be automatically released when out of scope
-  return key_it->second;
-}
-
-#endif
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
deleted file mode 100644
index 3504f62b7bdaa523deb2ae2074cf0d22cfe93851..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_context.h
+++ /dev/null
@@ -1,354 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <future>  // NOLINT
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/memory/malloc.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cuda_helper.h"
-#include "paddle/fluid/platform/dynload/cublas.h"
-#include "paddle/fluid/platform/dynload/cudnn.h"
-#if !defined(__APPLE__) && !defined(_WIN32)
-#include "paddle/fluid/platform/dynload/nccl.h"
-#endif
-#include "paddle/fluid/platform/gpu_info.h"
-#endif
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "mkldnn.hpp"
-#endif
-
-#include <map>
-#include "glog/logging.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/stream_callback_manager.h"
-#endif
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace paddle {
-namespace platform {
-
-class DeviceContext {
- public:
-  virtual ~DeviceContext() {}
-  virtual Place GetPlace() const = 0;
-
-  virtual void Wait() const {}
-};
-
-class CPUDeviceContext : public DeviceContext {
- public:
-  CPUDeviceContext();
-  explicit CPUDeviceContext(CPUPlace place);
-
-  Eigen::DefaultDevice* eigen_device() const;
-
-  Place GetPlace() const override;
-
- private:
-  CPUPlace place_;
-  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
-};
-
-template <typename Place>
-struct DefaultDeviceContextType;
-
-template <>
-struct DefaultDeviceContextType<platform::CPUPlace> {
-  using TYPE = CPUDeviceContext;
-};
-
-#ifdef PADDLE_WITH_CUDA
-
-class EigenCudaStreamDevice;
-class CudnnWorkspaceHandle;
-
-class CUDADeviceContext : public DeviceContext {
- public:
-  explicit CUDADeviceContext(CUDAPlace place);
-  virtual ~CUDADeviceContext();
-
-  /*! \brief  Wait for all operations completion in the stream. */
-  void Wait() const override;
-
-  /*! \brief  Return place in the device context. */
-  Place GetPlace() const override;
-
-  /*! \brief  Return compute capability in the device context. */
-  int GetComputeCapability() const;
-
-  /*! \brief  Return the max physical thread count in the device context */
-  int GetMaxPhysicalThreadCount() const;
-
-  /*! \brief  Return eigen device in the device context. */
-  Eigen::GpuDevice* eigen_device() const;
-
-  /*! \brief  Call cublas function safely. */
-  template <typename Callback>
-  inline void CublasCall(Callback&& callback) const {
-    cublas_handle_->Call(std::forward<Callback>(callback));
-  }
-
-  /*! \brief  Check whether tensor core is supported */
-  bool tensor_core_available() const;
-
-  /*! \brief  Call cublas function with Tensor Core safely. If
-      Tensor Core is not available, use DEFAULT_MATH instead. */
-  template <typename Callback>
-  inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
-    if (cublas_tensor_core_handle_) {
-      cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
-    } else {
-      cublas_handle_->Call(std::forward<Callback>(callback));
-    }
-  }
-
-  /*! \brief  Return cudnn  handle in the device context. */
-  cudnnHandle_t cudnn_handle() const;
-
-  /*! \brief  Return a cudnn workspace handle to call multiple cudnn
-   *  functions without interrupting by other threads.
-   *  Once the first cudnn function is called by the handle, a lock
-   *  would be acquired to prevent other threads from accessing the
-   *  workspace. Once the handle is destructed, the lock would be released.
-   *  CudnnWorkspaceHandle is an RAII object to implement thread-safe
-   *  sequential cudnn function calls. */
-  CudnnWorkspaceHandle cudnn_workspace_handle() const;
-
-  /*! \brief  Return cuda stream in the device context. */
-  cudaStream_t stream() const;
-
-#if !defined(_WIN32)
-  /*! \brief  Return nccl communicators. */
-  ncclComm_t nccl_comm() const { return nccl_comm_; }
-
-  /*! \brief  Set nccl communicators. */
-  void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
-#endif
-
-  template <typename Callback>
-  void RecordEvent(cudaEvent_t ev, Callback callback) {
-    callback();
-    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
-  }
-
-  template <typename Callback>
-  void AddStreamCallback(Callback&& callback) const {
-    callback_manager_->AddCallback(callback);
-  }
-
-  void WaitStreamCallback() const { callback_manager_->Wait(); }
-
- private:
-  CUDAPlace place_;
-
-  mutable std::once_flag init_cudnn_;
-
-  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
-  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
-  cudaStream_t stream_;
-
-  cudnnHandle_t cudnn_handle_;
-  mutable std::mutex cudnn_handle_mtx_;
-
-  std::unique_ptr<CublasHandleHolder> cublas_handle_;
-  std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
-
-#if !defined(_WIN32)
-  // NCCL communicator (single process version) for NCCL collective operations.
-  // NCCL collective operations provides fast collectives over multiple GPUs
-  // both within and across nodes.
-  // But, this collectives is used for collectives over multiple GPUs within
-  // nodes.
-  ncclComm_t nccl_comm_{nullptr};
-#endif
-
-  int compute_capability_;
-  int runtime_version_;
-  int driver_version_;
-  int multi_process_;
-  int max_threads_per_mp_;
-
-  // StreamCallbackManager is thread-safe
-  std::unique_ptr<StreamCallbackManager> callback_manager_;
-
-  DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
-};
-
-class CudnnWorkspaceHandle {
- public:
-  inline CudnnWorkspaceHandle(const CUDADeviceContext& dev_ctx, std::mutex* mtx)
-      : device_context_(dev_ctx), mtx_(mtx) {}
-
-  template <typename Callback>
-  inline void RunFunc(Callback&& cudnn_func, size_t required_workspace_bytes) {
-    if (required_workspace_bytes > WorkspaceSize()) {
-      ReallocWorkspace(required_workspace_bytes);
-    }
-    VLOG(2) << "Cudnn workspace size at RunFunc: "
-            << static_cast<double>(WorkspaceSize()) / (1 << 20) << " MB";
-    {
-      std::lock_guard<std::mutex> guard(*mtx_);
-      cudnn_func(allocation_ ? allocation_->ptr() : nullptr);
-    }
-  }
-
-  /*! \brief Thread which call RunFuncSync() would release gpu memory after
-   *  running the function. Currently this function is only used when cudnn
-   *  exhaustive searching and callers have to guarantee that the input function
-   *  is host blocking */
-  template <typename Callback>
-  inline void RunFuncSync(Callback&& cudnn_func,
-                          size_t required_workspace_bytes) {
-    RunFunc(cudnn_func, required_workspace_bytes);
-    ResetWorkspace();
-  }
-
-  void ReallocWorkspace(size_t required_workspace_bytes);
-
-  inline void ResetWorkspace() { allocation_ = nullptr; }
-
-  inline size_t WorkspaceSize() {
-    if (allocation_ == nullptr) {
-      return 0;
-    }
-    return allocation_->size();
-  }
-
-  CudnnWorkspaceHandle(CudnnWorkspaceHandle&&) = default;
-  CudnnWorkspaceHandle& operator=(CudnnWorkspaceHandle&&) = delete;
-
- private:
-  memory::allocation::AllocationPtr allocation_;
-  const CUDADeviceContext& device_context_;
-  std::mutex* mtx_;
-};
-
-template <>
-struct DefaultDeviceContextType<platform::CUDAPlace> {
-  using TYPE = CUDADeviceContext;
-};
-
-// Currently, CUDAPinnedDeviceContext is only used to data copying.
-class CUDAPinnedDeviceContext : public DeviceContext {
- public:
-  CUDAPinnedDeviceContext();
-  explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
-
-  Place GetPlace() const override;
-
-  Eigen::DefaultDevice* eigen_device() const;
-
- private:
-  CUDAPinnedPlace place_;
-  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
-};
-
-template <>
-struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
-  using TYPE = CUDAPinnedDeviceContext;
-};
-#endif
-
-#ifdef PADDLE_WITH_MKLDNN
-// Following three maps are used to cache MKLDNN primitives.
-// There relations are:
-// - BlobMap = Map<cur_thread_id, ShapeBlob>
-// - ShapeBlob = Map<cur_input_shape_str, KeyBlob>
-// - KeyBlob  = Map<blob_name, blob>
-// Where:
-using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
-using ShapeBlob = std::unordered_map<std::string, std::shared_ptr<KeyBlob>>;
-using BlobMap = std::unordered_map<int, std::shared_ptr<ShapeBlob>>;
-
-// default mkldnn session id
-constexpr size_t kMKLDNNSessionID_Default = 0;
-// mkldnn session id for cache clearing mode
-constexpr size_t kMKLDNNSessionID_CacheClearing = -1;
-
-void set_cur_mkldnn_session_id(size_t);
-size_t get_cur_mkldnn_session_id(void);
-void set_cur_input_shape_str(std::string input_shape_str);
-void set_cur_input_shape_cache_capacity(int input_shape_cache_capacity);
-
-class MKLDNNDeviceContext : public CPUDeviceContext {
- public:
-  explicit MKLDNNDeviceContext(CPUPlace place);
-
-  /* \brief  Get the active engine */
-  const mkldnn::engine& GetEngine() const { return engine_; }
-
-  // Remove all entries from the blob map
-  void ResetBlobMap() const;
-
-  // Get the ShapeBlob size in cur_mkldnn_session_id.
-  size_t GetShapeBlobSize() const;
-
-  // Set data to blob (i.e. name/data pair). Create blob if not existing
-  void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
-
-  // Find a saved blob. Return nullptr if not found
-  std::shared_ptr<void> GetBlob(const std::string& name) const;
-
- private:
-  mkldnn::engine engine_;
-  std::shared_ptr<BlobMap> p_blobmap_;
-  std::shared_ptr<std::mutex> p_mutex_;
-};
-#endif
-
-/*! \brief device context pool singleton */
-class DeviceContextPool {
- public:
-  explicit DeviceContextPool(const std::vector<platform::Place>& places);
-
-  static DeviceContextPool& Instance() {
-    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
-    return *pool;
-  }
-
-  /*! \brief  Create should only called by Init function */
-  static DeviceContextPool& Init(const std::vector<platform::Place>& places) {
-    if (pool == nullptr) {
-      pool = new DeviceContextPool(places);
-    }
-    return *pool;
-  }
-
-  /*! \brief  Return handle of single device context. */
-  platform::DeviceContext* Get(const platform::Place& place);
-
-  template <typename Place>
-  const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
-      const Place& place) {
-    return reinterpret_cast<
-        const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
-  }
-
-  size_t size() const { return device_contexts_.size(); }
-
- private:
-  static DeviceContextPool* pool;
-  std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
-      device_contexts_;
-  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
-};
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
deleted file mode 100644
index 5b3aa98efb46b51d6c3edb6d2cbd4200bd0a35c6..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_context_test.cu
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/platform/device_context.h"
-
-#include <vector>
-
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-
-TEST(Device, Init) {
-  using paddle::platform::DeviceContext;
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::CUDAPlace;
-
-  int count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
-    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
-    ASSERT_NE(nullptr, gpu_device);
-    delete device_context;
-  }
-}
-
-TEST(Device, CUDADeviceContext) {
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::CUDAPlace;
-
-  int count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
-    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
-    ASSERT_NE(nullptr, gpu_device);
-    cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
-    ASSERT_NE(nullptr, cudnn_handle);
-    delete device_context;
-  }
-}
-
-TEST(Device, DeviceContextPool) {
-  using paddle::platform::DeviceContextPool;
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::Place;
-  using paddle::platform::CPUPlace;
-  using paddle::platform::CUDAPlace;
-
-  DeviceContextPool& pool = DeviceContextPool::Instance();
-  auto cpu_dev_ctx1 = pool.Get(CPUPlace());
-  auto cpu_dev_ctx2 = pool.Get(CPUPlace());
-  ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
-
-  std::vector<Place> gpu_places;
-  int count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < count; ++i) {
-    auto dev_ctx = pool.Get(CUDAPlace(i));
-    ASSERT_NE(dev_ctx, nullptr);
-  }
-}
diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc
deleted file mode 100644
index 7b901856daa7d899074ba8659ea6cf7f36b89f01..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_memory_aligment.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/device_memory_aligment.h"
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size, const platform::Place &place) {
-  size_t alignment = 1024;
-  if (platform::is_cpu_place(place)) {
-    alignment = CpuMinChunkSize();
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    alignment = GpuMinChunkSize();
-#else
-    PADDLE_THROW("Fluid is not compiled with CUDA");
-#endif
-  }
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h
deleted file mode 100644
index 2c19a2b10626d0e312c267feba0104a23005909e..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_memory_aligment.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stddef.h>
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/gpu_info.h"
-#endif
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size, const platform::Place &place);
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
deleted file mode 100644
index 8458b17f82a976bad37df58dddc2c0d80c8eb13e..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_tracer.cc
+++ /dev/null
@@ -1,723 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <deque>
-#include <forward_list>
-#include <fstream>
-#include <list>
-#include <map>
-#include <mutex>  // NOLINT
-#include <numeric>
-#include <sstream>
-#include <string>
-#include <thread>  // NOLINT
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "glog/logging.h"
-#include "google/protobuf/text_format.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/platform/device_tracer.h"
-#include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace paddle {
-namespace platform {
-namespace {
-// Tracking the nested block stacks of each thread.
-thread_local std::deque<int> block_id_stack;
-// Tracking the nested event stacks.
-thread_local std::deque<Event *> annotation_stack;
-
-std::map<uint32_t, int32_t> system_thread_id_map;
-
-std::once_flag tracer_once_flag;
-DeviceTracer *tracer = nullptr;
-
-void PrintCuptiHint() {
-  static bool showed = false;
-  if (showed) return;
-  showed = true;
-  LOG(WARNING) << "Invalid timestamp occured. Please try increasing the "
-                  "FLAGS_multiple_of_cupti_buffer_size.";
-}
-
-}  // namespace
-#ifdef PADDLE_WITH_CUPTI
-
-namespace {
-// The experimental best performance is
-// the same size with CUPTI device buffer size(8M)
-uint64_t kBufSize = 1024 * 1024 * 8;
-uint64_t kAlignSize = 8;
-std::unordered_map<CUpti_CallbackId, std::string> runtime_cbid_str,
-    driver_cbid_str;
-
-#define ALIGN_BUFFER(buffer, align)                                 \
-  (((uintptr_t)(buffer) & ((align)-1))                              \
-       ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \
-       : (buffer))
-
-#define CUPTI_CALL(call)                                                   \
-  do {                                                                     \
-    CUptiResult _status = call;                                            \
-    if (_status != CUPTI_SUCCESS) {                                        \
-      const char *errstr;                                                  \
-      dynload::cuptiGetResultString(_status, &errstr);                     \
-      fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
-              __FILE__, __LINE__, #call, errstr);                          \
-      exit(-1);                                                            \
-    }                                                                      \
-  } while (0)
-
-std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
-  switch (kind) {
-    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
-      return "MEMCPY_HtoD";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
-      return "MEMCPY_DtoH";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
-      return "MEMCPY_HtoA";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
-      return "MEMCPY_AtoH";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
-      return "MEMCPY_AtoA";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
-      return "MEMCPY_AtoD";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
-      return "MEMCPY_DtoA";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
-      return "MEMCPY_DtoD";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
-      return "MEMCPY_HtoH";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
-      return "MEMCPY_PtoP";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
-      return "MEMCPY_FORCE_INT";
-    default:
-      break;
-  }
-  return "MEMCPY";
-}
-
-std::string DriverKind(CUpti_CallbackId cbid) {
-  auto iter = driver_cbid_str.find(cbid);
-  if (iter == driver_cbid_str.end())
-    return "Driver API " + std::to_string(cbid);
-  return iter->second;
-}
-
-std::string RuntimeKind(CUpti_CallbackId cbid) {
-  auto iter = runtime_cbid_str.find(cbid);
-  if (iter == runtime_cbid_str.end())
-    return "Runtime API " + std::to_string(cbid);
-  return iter->second;
-}
-
-void EnableActivity() {
-  // Device activity record is created when CUDA initializes, so we
-  // want to enable it before cuInit() or any CUDA runtime call.
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(
-      dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
-  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
-  // We don't track these activities for now.
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
-  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
-  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
-  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
-  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
-  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
-  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME));
-  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER));
-}
-
-void DisableActivity() {
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(
-      dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
-  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
-  // Disable all other activity record kinds.
-  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
-  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
-  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
-  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
-}
-
-void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
-                              size_t *maxNumRecords) {
-  uint8_t *buf = reinterpret_cast<uint8_t *>(malloc(kBufSize + kAlignSize));
-  *size = kBufSize;
-  *buffer = ALIGN_BUFFER(buf, kAlignSize);
-  *maxNumRecords = 0;
-}
-
-void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
-                              size_t size, size_t validSize) {
-  static std::thread::id cupti_thread_id(0);
-  if (cupti_thread_id == std::thread::id(0))
-    cupti_thread_id = std::this_thread::get_id();
-  PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
-                    "Only one thread is allowed to call bufferCompleted()");
-  CUptiResult status;
-  CUpti_Activity *record = NULL;
-  if (validSize > 0) {
-    do {
-      status = dynload::cuptiActivityGetNextRecord(buffer, validSize, &record);
-      if (status == CUPTI_SUCCESS) {
-        switch (record->kind) {
-          case CUPTI_ACTIVITY_KIND_KERNEL:
-          case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
-            auto *kernel =
-                reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
-            tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end,
-                                     kernel->deviceId, kernel->streamId,
-                                     kernel->correlationId);
-            break;
-          }
-          case CUPTI_ACTIVITY_KIND_MEMCPY: {
-            auto *memcpy =
-                reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
-            tracer->AddMemRecords(
-                MemcpyKind(
-                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
-                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
-                memcpy->correlationId, memcpy->bytes);
-            break;
-          }
-          case CUPTI_ACTIVITY_KIND_MEMCPY2: {
-            auto *memcpy =
-                reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
-            tracer->AddMemRecords(
-                MemcpyKind(
-                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
-                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
-                memcpy->correlationId, memcpy->bytes);
-            break;
-          }
-          case CUPTI_ACTIVITY_KIND_MEMSET: {
-            auto *memset =
-                reinterpret_cast<const CUpti_ActivityMemset *>(record);
-            tracer->AddKernelRecords("MEMSET", memset->start, memset->end,
-                                     memset->deviceId, memset->streamId,
-                                     memset->correlationId);
-            break;
-          }
-          case CUPTI_ACTIVITY_KIND_DRIVER: {
-            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
-            if (api->start != 0 && api->end != 0) {
-              // -1 device id represents ActiveKind api call
-              tracer->AddActiveKindRecords(
-                  DriverKind(api->cbid), api->start, api->end, -1,
-                  GetThreadIdFromSystemThreadId(api->threadId),
-                  api->correlationId);
-            }
-            break;
-          }
-          case CUPTI_ACTIVITY_KIND_RUNTIME: {
-            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
-            if (api->start != 0 && api->end != 0) {
-              // -1 device id represents ActiveKind api call
-              tracer->AddActiveKindRecords(
-                  RuntimeKind(api->cbid), api->start, api->end, -1,
-                  GetThreadIdFromSystemThreadId(api->threadId),
-                  api->correlationId);
-            }
-            break;
-          }
-          default: { break; }
-        }
-      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
-        // Seems not an error in this case.
-        break;
-      } else {
-        CUPTI_CALL(status);
-      }
-    } while (1);
-
-    size_t dropped;
-    CUPTI_CALL(
-        dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
-    if (dropped != 0) {
-      fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
-      PrintCuptiHint();
-    }
-  }
-  free(buffer);
-}
-
-void initCuptiCbidStr();
-
-}  // namespace
-
-#endif  // PADDLE_WITH_CUPTI
-
-class DeviceTracerImpl : public DeviceTracer {
- public:
-  DeviceTracerImpl() : enabled_(false) {
-#ifdef PADDLE_WITH_CUPTI
-    initCuptiCbidStr();
-#endif
-  }
-
-  void AddAnnotation(uint32_t id, Event *event) {
-    thread_local std::forward_list<std::pair<uint32_t, Event *>>
-        *local_correlations_pairs = nullptr;
-    if (local_correlations_pairs == nullptr) {
-      std::lock_guard<std::mutex> l(trace_mu_);
-      correlations_pairs.emplace_front();
-      local_correlations_pairs = &correlations_pairs.front();
-    }
-    local_correlations_pairs->push_front(std::make_pair(id, event));
-  }
-
-  void AddCPURecords(const std::string &anno, uint64_t start_ns,
-                     uint64_t end_ns, int64_t device_id, int64_t thread_id) {
-    if (anno.empty()) {
-      VLOG(1) << "Empty timeline annotation.";
-      return;
-    }
-    thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
-    if (local_cpu_records_ == nullptr) {
-      std::lock_guard<std::mutex> l(trace_mu_);
-      cpu_records_.emplace_front();
-      local_cpu_records_ = &cpu_records_.front();
-    }
-    local_cpu_records_->push_front(
-        CPURecord{anno, start_ns, end_ns, device_id, thread_id});
-  }
-
-  void AddMemRecords(const std::string &name, uint64_t start_ns,
-                     uint64_t end_ns, int64_t device_id, int64_t stream_id,
-                     uint32_t correlation_id, uint64_t bytes) {
-    // 0 means timestamp information could not be collected for the kernel.
-    if (start_ns == 0 || end_ns == 0 || start_ns == end_ns) {
-      VLOG(3) << name << " cannot be traced";
-      PrintCuptiHint();
-      return;
-    }
-    // NOTE(liangdun): lock is not needed, only one thread call this function.
-    mem_records_.push_front(MemRecord{name, start_ns, end_ns, device_id,
-                                      stream_id, correlation_id, bytes});
-  }
-
-  void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
-                        const Place &place, const std::string &alloc_in,
-                        const std::string &free_in, int64_t thread_id) {
-    if (0 == start_ns || 0 == end_ns) {
-      VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
-      return;
-    }
-    thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
-        nullptr;
-    if (local_mem_info_record == nullptr) {
-      std::lock_guard<std::mutex> l(trace_mu_);
-      mem_info_record_.emplace_front();
-      local_mem_info_record = &mem_info_record_.front();
-    }
-    local_mem_info_record->emplace_front(MemInfoRecord{
-        start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
-  }
-
-  void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
-                            uint64_t end_ns, int64_t device_id,
-                            int64_t thread_id, uint32_t correlation_id) {
-    if (anno.empty()) {
-      VLOG(1) << "Empty timeline annotation.";
-      return;
-    }
-    thread_local std::forward_list<ActiveKindRecord>
-        *local_active_kind_records = nullptr;
-    if (local_active_kind_records == nullptr) {
-      std::lock_guard<std::mutex> l(trace_mu_);
-      active_kind_records_.emplace_front();
-      local_active_kind_records = &active_kind_records_.front();
-    }
-    //  lock is not needed, only one thread call this function.
-    local_active_kind_records->push_front(ActiveKindRecord{
-        anno, start_ns, end_ns, device_id, thread_id, correlation_id});
-  }
-
-  void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
-                        int64_t device_id, int64_t stream_id,
-                        uint32_t correlation_id) {
-    // 0 means timestamp information could not be collected for the kernel.
-    if (start == 0 || end == 0 || start == end) {
-      VLOG(3) << correlation_id << " cannot be traced";
-      PrintCuptiHint();
-      return;
-    }
-    // NOTE(liangdun): lock is not needed, only one thread call this function.
-    kernel_records_.push_front(
-        KernelRecord{name, start, end, device_id, stream_id, correlation_id});
-  }
-
-  bool IsEnabled() {
-    std::lock_guard<std::mutex> l(trace_mu_);
-    return enabled_;
-  }
-
-  void Enable() {
-    std::lock_guard<std::mutex> l(trace_mu_);
-    if (enabled_) {
-      return;
-    }
-
-#ifdef PADDLE_WITH_CUPTI
-    EnableActivity();
-
-    // Register callbacks for buffer requests and completed by CUPTI.
-    CUPTI_CALL(dynload::cuptiActivityRegisterCallbacks(bufferRequested,
-                                                       bufferCompleted));
-
-    CUptiResult ret;
-    ret = dynload::cuptiSubscribe(
-        &subscriber_, static_cast<CUpti_CallbackFunc>(ApiCallback), this);
-    if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) {
-      fprintf(stderr, "CUPTI subcriber limit reached.\n");
-    } else if (ret != CUPTI_SUCCESS) {
-      fprintf(stderr, "Failed to create CUPTI subscriber.\n");
-    }
-    const std::vector<int> cbids {
-      CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
-          CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
-          CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
-          CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
-          CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
-          CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
-          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
-#if CUDA_VERSION >= 9000
-          ,
-          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
-          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000
-#endif
-    };
-    for (auto cbid : cbids)
-      CUPTI_CALL(dynload::cuptiEnableCallback(
-          1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, cbid));
-    CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
-#endif  // PADDLE_WITH_CUPTI
-    enabled_ = true;
-  }
-
-  void Reset() {
-#ifdef PADDLE_WITH_CUPTI
-    CUPTI_CALL(
-        dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
-#endif
-    std::lock_guard<std::mutex> l(trace_mu_);
-    kernel_records_.clear();
-    mem_records_.clear();
-    correlations_.clear();
-    for (auto &tmp : correlations_pairs) tmp.clear();
-    for (auto &tmp : cpu_records_) tmp.clear();
-    for (auto &tmp : mem_info_record_) tmp.clear();
-    for (auto &tmp : active_kind_records_) tmp.clear();
-  }
-
-  void GenEventKernelCudaElapsedTime() {
-#ifdef PADDLE_WITH_CUPTI
-    if (correlations_.empty())
-      for (auto &tmp : correlations_pairs)
-        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
-    for (const KernelRecord &r : kernel_records_) {
-      auto c = correlations_.find(r.correlation_id);
-      if (c != correlations_.end() && c->second != nullptr) {
-        Event *e = c->second;
-        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
-      }
-    }
-    for (const auto &r : mem_records_) {
-      auto c = correlations_.find(r.correlation_id);
-      if (c != correlations_.end() && c->second != nullptr) {
-        Event *e = c->second;
-        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
-      }
-    }
-#endif
-  }
-
-  proto::Profile GenProfile(const std::string &profile_path) {
-    int miss = 0, find = 0;
-    std::lock_guard<std::mutex> l(trace_mu_);
-    proto::Profile profile_pb;
-    profile_pb.set_start_ns(start_ns_);
-    profile_pb.set_end_ns(end_ns_);
-    if (correlations_.empty()) {
-      for (auto &tmp : correlations_pairs) {
-        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
-      }
-    }
-
-    for (const KernelRecord &r : kernel_records_) {
-      auto *event = profile_pb.add_events();
-      event->set_type(proto::Event::GPUKernel);
-      auto c = correlations_.find(r.correlation_id);
-      if (c != correlations_.end() && c->second != nullptr) {
-        event->set_name(c->second->name());
-        event->set_detail_info(r.name);
-        find++;
-      } else {
-        VLOG(10) << "Missing Kernel Event: " + r.name;
-        miss++;
-        event->set_name(r.name);
-      }
-      event->set_start_ns(r.start_ns);
-      event->set_end_ns(r.end_ns);
-      event->set_sub_device_id(r.stream_id);
-      event->set_device_id(r.device_id);
-    }
-    VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
-
-    for (auto &tmp : cpu_records_) {
-      for (const CPURecord &r : tmp) {
-        auto *event = profile_pb.add_events();
-        event->set_type(proto::Event::CPU);
-        event->set_name(r.name);
-        event->set_start_ns(r.start_ns);
-        event->set_end_ns(r.end_ns);
-        event->set_sub_device_id(r.thread_id);
-        event->set_device_id(r.device_id);
-      }
-    }
-
-    for (auto &tmp : active_kind_records_) {
-      for (const ActiveKindRecord &r : tmp) {
-        auto *event = profile_pb.add_events();
-        event->set_type(proto::Event::CPU);
-        auto c = correlations_.find(r.correlation_id);
-        if (c != correlations_.end() && c->second != nullptr) {
-          event->set_name(c->second->name());
-          event->set_detail_info(r.name);
-        } else {
-          event->set_name(r.name);
-        }
-        event->set_start_ns(r.start_ns);
-        event->set_end_ns(r.end_ns);
-        event->set_sub_device_id(r.thread_id);
-        event->set_device_id(r.device_id);
-      }
-    }
-    miss = find = 0;
-    for (const MemRecord &r : mem_records_) {
-      auto *event = profile_pb.add_events();
-      event->set_type(proto::Event::GPUKernel);
-      auto c = correlations_.find(r.correlation_id);
-      if (c != correlations_.end() && c->second != nullptr) {
-        event->set_name(c->second->name());
-        event->set_detail_info(r.name);
-        find++;
-      } else {
-        miss++;
-        event->set_name(r.name);
-      }
-      event->set_start_ns(r.start_ns);
-      event->set_end_ns(r.end_ns);
-      event->set_sub_device_id(r.stream_id);
-      event->set_device_id(r.device_id);
-      event->mutable_memcopy()->set_bytes(r.bytes);
-    }
-    VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
-
-    for (auto &tmp : mem_info_record_) {
-      for (const auto &r : tmp) {
-        auto *event = profile_pb.add_mem_events();
-        event->set_device_id(0);
-        if (platform::is_cpu_place(r.place)) {
-          event->set_place(proto::MemEvent::CPUPlace);
-        } else if (platform::is_gpu_place(r.place)) {
-          event->set_place(proto::MemEvent::CUDAPlace);
-          event->set_device_id(
-              boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
-        } else if (platform::is_cuda_pinned_place(r.place)) {
-          event->set_place(proto::MemEvent::CUDAPinnedPlace);
-        } else {
-          PADDLE_THROW("The current place is not supported.");
-        }
-        event->set_alloc_in(r.alloc_in);
-        event->set_free_in(r.free_in);
-        event->set_start_ns(r.start_ns);
-        event->set_end_ns(r.end_ns);
-        event->set_bytes(r.bytes);
-        event->set_thread_id(r.thread_id);
-      }
-    }
-
-    std::ofstream profile_f;
-    profile_f.open(profile_path,
-                   std::ios::out | std::ios::trunc | std::ios::binary);
-    profile_pb.SerializeToOstream(&profile_f);
-    profile_f.close();
-    return profile_pb;
-  }
-
-  void Disable() {
-#ifdef PADDLE_WITH_CUPTI
-    // flush might cause additional calls to DeviceTracker.
-    CUPTI_CALL(
-        dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
-#endif  // PADDLE_WITH_CUPTI
-    std::lock_guard<std::mutex> l(trace_mu_);
-#ifdef PADDLE_WITH_CUPTI
-    DisableActivity();
-    CUPTI_CALL(dynload::cuptiUnsubscribe(subscriber_));
-    CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
-#endif  // PADDLE_WITH_CUPTI
-    enabled_ = false;
-  }
-
- private:
-#ifdef PADDLE_WITH_CUPTI
-  static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
-                                   CUpti_CallbackId cbid, const void *cbdata) {
-    auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
-    DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
-    if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-      Event *event = CurAnnotation();
-      tracer->AddAnnotation(cbInfo->correlationId, event);
-    }
-  }
-  CUpti_SubscriberHandle subscriber_;
-#endif  // PADDLE_WITH_CUPTI
-  std::mutex trace_mu_;
-  bool enabled_;
-  uint64_t start_ns_;
-  uint64_t end_ns_;
-  std::forward_list<KernelRecord> kernel_records_;
-  std::forward_list<MemRecord> mem_records_;
-  std::forward_list<std::forward_list<CPURecord>> cpu_records_;
-  std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
-  std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
-  std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
-      correlations_pairs;
-  std::unordered_map<uint32_t, Event *> correlations_;
-};
-
-void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
-
-DeviceTracer *GetDeviceTracer() {
-  std::call_once(tracer_once_flag, CreateTracer, &tracer);
-  return tracer;
-}
-
-void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }
-
-void ClearCurAnnotation() { annotation_stack.pop_back(); }
-
-Event *CurAnnotation() {
-  if (annotation_stack.empty()) return nullptr;
-  return annotation_stack.back();
-}
-std::string CurAnnotationName() {
-  if (annotation_stack.empty()) return "Unknown";
-  return annotation_stack.back()->name();
-}
-
-void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
-
-void ClearCurBlock() { block_id_stack.pop_back(); }
-
-int BlockDepth() { return block_id_stack.size(); }
-
-uint32_t GetCurSystemThreadId() {
-  std::stringstream ss;
-  ss << std::this_thread::get_id();
-  uint32_t id = static_cast<uint32_t>(std::stoull(ss.str()));
-  return id;
-}
-
-void RecoreCurThreadId(int32_t id) {
-  auto gid = GetCurSystemThreadId();
-  VLOG(1) << "RecoreCurThreadId: " << gid << " -> " << id;
-  system_thread_id_map[gid] = id;
-}
-
-int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
-  auto it = system_thread_id_map.find(id);
-  if (it != system_thread_id_map.end()) return it->second;
-  // return origin id if no event is recorded in this thread.
-  return static_cast<int32_t>(id);
-}
-
-#ifdef PADDLE_WITH_CUPTI
-namespace {
-
-void initCuptiCbidStr() {
-  static bool called = false;
-  if (called) return;
-  called = true;
-#define REGISTER_RUNTIME_CBID_STR(cbid) \
-  runtime_cbid_str[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid
-
-  REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000);
-  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050);
-  REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaFree_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000);
-  REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020);
-  REGISTER_RUNTIME_CBID_STR(
-      cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000);
-  REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000);
-  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050);
-  REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050);
-  REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
-#if CUDA_VERSION >= 9000
-  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
-  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
-#endif
-
-#undef REGISTER_RUNTIME_CBID_STR
-}
-}  // namespace
-#endif  // PADDLE_WITH_CUPTI
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
deleted file mode 100644
index 85168a046fb3fa4317956737871cde56e15bedfb..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device_tracer.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <chrono>  // NOLINT
-#include <string>
-
-#include "paddle/fluid/platform/dynload/cupti.h"
-#include "paddle/fluid/platform/event.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/platform/profiler.pb.h"
-
-namespace paddle {
-namespace platform {
-
-///////////////////////
-// WARN: Under Development. Don't depend on it yet.
-//////////////////////
-inline uint64_t PosixInNsec() {
-  struct timeval tv;
-  gettimeofday(&tv, nullptr);
-  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
-}
-
-// DeviceTracer performs the following tasks:
-// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
-// 2. Collect cuda statistics: start/end ts, memory, etc.
-// 3. Generate a protobuf for further analysis.
-class DeviceTracer {
- public:
-  struct KernelRecord {
-    std::string name;
-    uint64_t start_ns;
-    uint64_t end_ns;
-    int64_t device_id;
-    int64_t stream_id;
-    uint32_t correlation_id;
-  };
-
-  struct CPURecord {
-    std::string name;
-    uint64_t start_ns;
-    uint64_t end_ns;
-    int64_t device_id;
-    int64_t thread_id;
-  };
-
-  struct MemRecord {
-    std::string name;
-    uint64_t start_ns;
-    uint64_t end_ns;
-    int64_t device_id;
-    int64_t stream_id;
-    uint32_t correlation_id;
-    uint64_t bytes;
-  };
-
-  struct MemInfoRecord {
-    uint64_t start_ns;
-    uint64_t end_ns;
-    size_t bytes;
-    Place place;
-    int64_t thread_id;
-    std::string alloc_in;
-    std::string free_in;
-  };
-
-  struct ActiveKindRecord {
-    std::string name;
-    uint64_t start_ns;
-    uint64_t end_ns;
-    int64_t device_id;
-    int64_t thread_id;
-    uint32_t correlation_id;
-  };
-
-  virtual ~DeviceTracer() {}
-  // Needs to be called once before use.
-  virtual void Enable() = 0;
-  // Needs to be called once after use.
-  virtual void Disable() = 0;
-  // Needs to be called once before reuse.
-  virtual void Reset() = 0;
-
-  // Add a pair to correlate internal cuda id with high level
-  // annotation event(with string). So cuda statistics can be represented by
-  // human-readable annotations.
-  virtual void AddAnnotation(uint32_t id, Event* event) = 0;
-
-  virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
-                             uint64_t end_ns, int64_t device_id,
-                             int64_t stream_id, uint32_t correlation_id,
-                             uint64_t bytes) = 0;
-
-  virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
-                             uint64_t end_ns, int64_t device_id,
-                             int64_t thread_id) = 0;
-  virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
-                                    uint64_t end_ns, int64_t device_id,
-                                    int64_t thread_id,
-                                    uint32_t correlation_id) = 0;
-
-  virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
-                                size_t bytes, const Place& place,
-                                const std::string& alloc_in,
-                                const std::string& free_in,
-                                int64_t thread_id) = 0;
-
-  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
-  // added before for human readability.
-  virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
-                                int64_t device_id, int64_t stream_id,
-                                uint32_t correlation_id) = 0;
-
-  // Generate a proto after done (Disabled).
-  virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
-
-  // generate kernel elapsed time into Event
-  virtual void GenEventKernelCudaElapsedTime() = 0;
-
-  virtual bool IsEnabled() = 0;
-};
-
-// Get a DeviceTracer.
-DeviceTracer* GetDeviceTracer();
-
-// Set a name for the cuda kernel operation being launched by the thread.
-void SetCurAnnotation(Event* event);
-// Clear the name after the operation is done.
-void ClearCurAnnotation();
-// Current name of the operation being run in the thread.
-std::string CurAnnotationName();
-Event* CurAnnotation();
-
-void SetCurBlock(int block_id);
-void ClearCurBlock();
-int BlockDepth();
-
-// Set current thread id, so we can map the system thread id to thread id.
-void RecoreCurThreadId(int32_t id);
-int32_t GetThreadIdFromSystemThreadId(uint32_t id);
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
deleted file mode 100644
index 81312111aebac24d7c2854f0b192269860af0db1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
-
-list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
-
-# There is no macOS version of NCCL.
-# Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux.
-if (NOT APPLE AND NOT WIN32)
-  list(APPEND CUDA_SRCS nccl.cc nvrtc.cc cuda_driver.cc)
-endif()
-
-if (TENSORRT_FOUND)
-  list(APPEND CUDA_SRCS tensorrt.cc)
-endif()
-
-configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
-if (CUPTI_FOUND)
-    list(APPEND CUDA_SRCS cupti.cc)
-endif(CUPTI_FOUND)
-nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
-cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
-if (WITH_MKLML)
-    cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
-endif()
-# TODO(TJ): add iomp, mkldnn?
diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc
deleted file mode 100644
index 41648c32fe6f98bb0b78ea7891065e5586f70463..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/cublas.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/cublas.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-std::once_flag cublas_dso_flag;
-void *cublas_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
-
-#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2
-CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
-#endif
-
-#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3
-CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
-#endif
-
-#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
-CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
-#endif
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
deleted file mode 100644
index ed9b9133c6a0d7597d73a7090c41b4dc56062e24..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/cublas.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cublasXt.h>
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <mutex>  // NOLINT
-#include <type_traits>
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag cublas_dso_flag;
-extern void *cublas_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cublas routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                 \
-    using FUNC_TYPE = decltype(&::__name);                                   \
-    template <typename... Args>                                              \
-    inline cublasStatus_t operator()(Args... args) {                         \
-      std::call_once(cublas_dso_flag, []() {                                 \
-        cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
-      });                                                                    \
-      static void *p_##__name = dlsym(cublas_dso_handle, #__name);           \
-      return reinterpret_cast<FUNC_TYPE>(p_##__name)(args...);               \
-    }                                                                        \
-  };                                                                         \
-  extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)     \
-  struct DynLoad__##__name {                         \
-    template <typename... Args>                      \
-    inline cublasStatus_t operator()(Args... args) { \
-      return ::__name(args...);                      \
-    }                                                \
-  };                                                 \
-  extern DynLoad__##__name __name
-#endif
-
-#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
-  __macro(cublasSaxpy_v2);                \
-  __macro(cublasDaxpy_v2);                \
-  __macro(cublasSgemv_v2);                \
-  __macro(cublasDgemv_v2);                \
-  __macro(cublasSgemm_v2);                \
-  __macro(cublasDgemm_v2);                \
-  __macro(cublasHgemm);                   \
-  __macro(cublasSgemmEx);                 \
-  __macro(cublasSgeam);                   \
-  __macro(cublasDgeam);                   \
-  __macro(cublasCreate_v2);               \
-  __macro(cublasDestroy_v2);              \
-  __macro(cublasSetStream_v2);            \
-  __macro(cublasSetPointerMode_v2);       \
-  __macro(cublasGetPointerMode_v2);       \
-  __macro(cublasSgemmBatched);            \
-  __macro(cublasDgemmBatched);            \
-  __macro(cublasCgemmBatched);            \
-  __macro(cublasZgemmBatched);            \
-  __macro(cublasSgetrfBatched);           \
-  __macro(cublasSgetriBatched);           \
-  __macro(cublasDgetrfBatched);           \
-  __macro(cublasDgetriBatched);
-
-CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
-
-// APIs available after CUDA 8.0
-#if CUDA_VERSION >= 8000
-#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
-  __macro(cublasGemmEx);                     \
-  __macro(cublasSgemmStridedBatched);        \
-  __macro(cublasDgemmStridedBatched);        \
-  __macro(cublasCgemmStridedBatched);        \
-  __macro(cublasZgemmStridedBatched);        \
-  __macro(cublasHgemmStridedBatched);
-
-CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
-#endif
-
-// APIs available after CUDA 9.0
-#if CUDA_VERSION >= 9000
-#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
-  __macro(cublasSetMathMode);                \
-  __macro(cublasGetMathMode);
-
-CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
-#endif
-
-// APIs available after CUDA 9.1
-#if CUDA_VERSION >= 9010
-#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
-  __macro(cublasGemmBatchedEx);              \
-  __macro(cublasGemmStridedBatchedEx);
-
-CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
-#endif
-
-#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cuda_driver.cc b/paddle/fluid/platform/dynload/cuda_driver.cc
deleted file mode 100644
index 2c2edb2ccef9720f0b31b3734c3a775337b5e1ce..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/cuda_driver.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/cuda_driver.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-std::once_flag cuda_dso_flag;
-void* cuda_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-CUDA_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h
deleted file mode 100644
index 894797728bb1c3794082bc0ba3094a6748c5a0c4..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/cuda_driver.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cuda.h>
-#include <mutex>  // NOLINT
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag cuda_dso_flag;
-extern void* cuda_dso_handle;
-
-#ifdef PADDLE_USE_DSO
-
-#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name)                           \
-  struct DynLoad__##__name {                                             \
-    template <typename... Args>                                          \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {     \
-      using cuda_func = decltype(&::__name);                             \
-      std::call_once(cuda_dso_flag, []() {                               \
-        cuda_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
-      });                                                                \
-      static void* p_##__name = dlsym(cuda_dso_handle, #__name);         \
-      return reinterpret_cast<cuda_func>(p_##__name)(args...);           \
-    }                                                                    \
-  };                                                                     \
-  extern struct DynLoad__##__name __name
-
-#else
-
-#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
-  struct DynLoad__##__name {                   \
-    template <typename... Args>                \
-    inline auto operator()(Args... args) {     \
-      return ::__name(args...);                \
-    }                                          \
-  };                                           \
-  extern DynLoad__##__name __name
-
-#endif
-
-/**
- * include all needed cuda driver functions
- **/
-#define CUDA_ROUTINE_EACH(__macro)                      \
-  __macro(cuGetErrorString);                            \
-  __macro(cuModuleLoadData);                            \
-  __macro(cuModuleGetFunction);                         \
-  __macro(cuModuleUnload);                              \
-  __macro(cuOccupancyMaxActiveBlocksPerMultiprocessor); \
-  __macro(cuLaunchKernel);                              \
-  __macro(cuCtxCreate);                                 \
-  __macro(cuCtxGetCurrent);                             \
-  __macro(cuDeviceGet);                                 \
-  __macro(cuDevicePrimaryCtxGetState)
-
-CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
-
-#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
deleted file mode 100644
index 91d9a1ef013449e83f2540a6646c96e34347ccc1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/cudnn.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-std::once_flag cudnn_dso_flag;
-void* cudnn_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP);
-CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
-
-#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
-#endif
-
-#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
-CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
-#endif
-
-#ifdef CUDNN_DNN_ROUTINE_EACH_R5
-CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
-#endif
-
-#ifdef CUDNN_DNN_ROUTINE_EACH_R6
-CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
-#endif
-
-#ifdef CUDNN_DNN_ROUTINE_EACH_R7
-CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
-#endif
-
-#ifdef PADDLE_USE_DSO
-bool HasCUDNN() {
-  std::call_once(cudnn_dso_flag,
-                 []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
-  return cudnn_dso_handle != nullptr;
-}
-
-void EnforceCUDNNLoaded(const char* fn_name) {
-  PADDLE_ENFORCE(cudnn_dso_handle != nullptr,
-                 "Cannot load cudnn shared library. Cannot invoke method %s",
-                 fn_name);
-}
-#else
-bool HasCUDNN() { return true; }
-#endif
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
deleted file mode 100644
index 67e2a18dd372243e5e7b5ddf128ec9e4e383484f..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <glog/logging.h>
-
-#include <cudnn.h>
-#include <mutex>  // NOLINT
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag cudnn_dso_flag;
-extern void* cudnn_dso_handle;
-extern bool HasCUDNN();
-
-#ifdef PADDLE_USE_DSO
-
-extern void EnforceCUDNNLoaded(const char* fn_name);
-#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
-      using cudnn_func = decltype(&::__name);                              \
-      std::call_once(cudnn_dso_flag, []() {                                \
-        cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
-      });                                                                  \
-      EnforceCUDNNLoaded(#__name);                                         \
-      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);          \
-      return reinterpret_cast<cudnn_func>(p_##__name)(args...);            \
-    }                                                                      \
-  };                                                                       \
-  extern struct DynLoad__##__name __name
-
-#else
-
-#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
-  struct DynLoad__##__name {                    \
-    template <typename... Args>                 \
-    inline auto operator()(Args... args) {      \
-      return ::__name(args...);                 \
-    }                                           \
-  };                                            \
-  extern DynLoad__##__name __name
-
-#endif
-
-/**
- * include all needed cudnn functions in HPPL
- * different cudnn version has different interfaces
- **/
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnSetTensor4dDescriptor);                    \
-  __macro(cudnnSetTensor4dDescriptorEx);                  \
-  __macro(cudnnSetTensorNdDescriptor);                    \
-  __macro(cudnnGetTensorNdDescriptor);                    \
-  __macro(cudnnGetConvolutionNdForwardOutputDim);         \
-  __macro(cudnnGetConvolutionForwardAlgorithm);           \
-  __macro(cudnnCreateTensorDescriptor);                   \
-  __macro(cudnnDestroyTensorDescriptor);                  \
-  __macro(cudnnCreateFilterDescriptor);                   \
-  __macro(cudnnSetFilter4dDescriptor);                    \
-  __macro(cudnnSetFilterNdDescriptor);                    \
-  __macro(cudnnGetFilterNdDescriptor);                    \
-  __macro(cudnnSetPooling2dDescriptor);                   \
-  __macro(cudnnSetPoolingNdDescriptor);                   \
-  __macro(cudnnGetPoolingNdDescriptor);                   \
-  __macro(cudnnDestroyFilterDescriptor);                  \
-  __macro(cudnnCreateConvolutionDescriptor);              \
-  __macro(cudnnCreatePoolingDescriptor);                  \
-  __macro(cudnnDestroyPoolingDescriptor);                 \
-  __macro(cudnnSetConvolution2dDescriptor);               \
-  __macro(cudnnDestroyConvolutionDescriptor);             \
-  __macro(cudnnSetConvolutionNdDescriptor);               \
-  __macro(cudnnGetConvolutionNdDescriptor);               \
-  __macro(cudnnDeriveBNTensorDescriptor);                 \
-  __macro(cudnnCreateSpatialTransformerDescriptor);       \
-  __macro(cudnnSetSpatialTransformerNdDescriptor);        \
-  __macro(cudnnDestroySpatialTransformerDescriptor);      \
-  __macro(cudnnSpatialTfGridGeneratorForward);            \
-  __macro(cudnnSpatialTfGridGeneratorBackward);           \
-  __macro(cudnnSpatialTfSamplerForward);                  \
-  __macro(cudnnSpatialTfSamplerBackward);                 \
-  __macro(cudnnCreate);                                   \
-  __macro(cudnnDestroy);                                  \
-  __macro(cudnnSetStream);                                \
-  __macro(cudnnActivationForward);                        \
-  __macro(cudnnActivationBackward);                       \
-  __macro(cudnnConvolutionForward);                       \
-  __macro(cudnnConvolutionBackwardBias);                  \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize);       \
-  __macro(cudnnTransformTensor);                          \
-  __macro(cudnnPoolingForward);                           \
-  __macro(cudnnPoolingBackward);                          \
-  __macro(cudnnSoftmaxBackward);                          \
-  __macro(cudnnSoftmaxForward);                           \
-  __macro(cudnnGetVersion);                               \
-  __macro(cudnnFindConvolutionForwardAlgorithmEx);        \
-  __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
-  __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);   \
-  __macro(cudnnGetErrorString);                           \
-  __macro(cudnnCreateDropoutDescriptor);                  \
-  __macro(cudnnDropoutGetStatesSize);                     \
-  __macro(cudnnSetDropoutDescriptor);                     \
-  __macro(cudnnCreateRNNDescriptor);                      \
-  __macro(cudnnSetRNNDescriptor);                         \
-  __macro(cudnnGetRNNParamsSize);                         \
-  __macro(cudnnGetRNNWorkspaceSize);                      \
-  __macro(cudnnGetRNNTrainingReserveSize);                \
-  __macro(cudnnRNNForwardTraining);                       \
-  __macro(cudnnRNNBackwardData);                          \
-  __macro(cudnnRNNBackwardWeights);                       \
-  __macro(cudnnRNNForwardInference);                      \
-  __macro(cudnnDestroyDropoutDescriptor);                 \
-  __macro(cudnnDestroyRNNDescriptor);
-
-CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
-  __macro(cudnnAddTensor);                 \
-  __macro(cudnnConvolutionBackwardData);   \
-  __macro(cudnnConvolutionBackwardFilter);
-CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-
-// APIs available after R3:
-#if CUDNN_VERSION >= 3000
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)           \
-  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm);       \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm);     \
-  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-#endif
-
-// APIs available after R4:
-#if CUDNN_VERSION >= 4007
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)    \
-  __macro(cudnnBatchNormalizationForwardTraining);  \
-  __macro(cudnnBatchNormalizationForwardInference); \
-  __macro(cudnnBatchNormalizationBackward);
-CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-#endif
-
-// APIs in R5
-#if CUDNN_VERSION >= 5000
-#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)  \
-  __macro(cudnnCreateActivationDescriptor); \
-  __macro(cudnnSetActivationDescriptor);    \
-  __macro(cudnnGetActivationDescriptor);    \
-  __macro(cudnnDestroyActivationDescriptor);
-CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-#endif
-
-// APIs in R6
-#if CUDNN_VERSION >= 6000
-#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6);
-CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-#endif
-
-#if CUDNN_VERSION >= 7001
-#define CUDNN_DNN_ROUTINE_EACH_R7(__macro)                \
-  __macro(cudnnSetConvolutionGroupCount);                 \
-  __macro(cudnnSetConvolutionMathType);                   \
-  __macro(cudnnConvolutionBiasActivationForward);         \
-  __macro(cudnnCreateCTCLossDescriptor);                  \
-  __macro(cudnnDestroyCTCLossDescriptor);                 \
-  __macro(cudnnGetCTCLossDescriptor);                     \
-  __macro(cudnnSetCTCLossDescriptor);                     \
-  __macro(cudnnGetCTCLossWorkspaceSize);                  \
-  __macro(cudnnCTCLoss);                                  \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7);   \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
-  __macro(cudnnGetConvolutionForwardAlgorithm_v7);
-CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-#endif
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cupti.cc b/paddle/fluid/platform/dynload/cupti.cc
deleted file mode 100644
index a25660c6ed411bbe444ac8aa10a324cbed9c9d4f..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/cupti.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUPTI
-
-#include "paddle/fluid/platform/dynload/cupti.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-std::once_flag cupti_dso_flag;
-void *cupti_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-CUPTI_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
-
-#endif  // PADDLE_WITH_CUPTI
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
deleted file mode 100644
index b946f46e82af4b09fafff54765b899254a4ec1df..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/cupti.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#ifdef PADDLE_WITH_CUPTI
-
-#include <cuda.h>
-#include <cupti.h>
-#include <mutex>  // NOLINT
-
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag cupti_dso_flag;
-extern void *cupti_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cupti routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                            \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    inline CUptiResult CUPTIAPI operator()(Args... args) {                 \
-      using cuptiFunc = decltype(&::__name);                               \
-      std::call_once(cupti_dso_flag, []() {                                \
-        cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
-      });                                                                  \
-      static void *p_##__name = dlsym(cupti_dso_handle, #__name);          \
-      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);             \
-    }                                                                      \
-  };                                                                       \
-  extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)            \
-  struct DynLoad__##__name {                               \
-    template <typename... Args>                            \
-    inline CUptiResult CUPTIAPI operator()(Args... args) { \
-      return __name(args...);                              \
-    }                                                      \
-  };                                                       \
-  extern DynLoad__##__name __name
-#endif
-
-#define CUPTI_ROUTINE_EACH(__macro)           \
-  __macro(cuptiActivityEnable);               \
-  __macro(cuptiActivityDisable);              \
-  __macro(cuptiActivityRegisterCallbacks);    \
-  __macro(cuptiActivityGetAttribute);         \
-  __macro(cuptiActivitySetAttribute);         \
-  __macro(cuptiGetTimestamp);                 \
-  __macro(cuptiActivityGetNextRecord);        \
-  __macro(cuptiGetResultString);              \
-  __macro(cuptiActivityGetNumDroppedRecords); \
-  __macro(cuptiActivityFlushAll);             \
-  __macro(cuptiSubscribe);                    \
-  __macro(cuptiUnsubscribe);                  \
-  __macro(cuptiEnableCallback);               \
-  __macro(cuptiEnableDomain);
-
-CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
-
-#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
-
-#endif  // PADDLE_WITH_CUPTI
diff --git a/paddle/fluid/platform/dynload/cupti_lib_path.h.in b/paddle/fluid/platform/dynload/cupti_lib_path.h.in
deleted file mode 100644
index 017384bfbb7eb6db3927894f652c11ddb07cebc5..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/cupti_lib_path.h.in
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
diff --git a/paddle/fluid/platform/dynload/curand.cc b/paddle/fluid/platform/dynload/curand.cc
deleted file mode 100644
index ce83ebc84fe7bbd91e1c7e46f98a7f0d8b4a7394..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/curand.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/curand.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-std::once_flag curand_dso_flag;
-void *curand_dso_handle;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
deleted file mode 100644
index 0bb300ec33076d9ddfaf69190f14131279cc888e..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/curand.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <curand.h>
-
-#include <mutex>  // NOLINT
-#include "paddle/fluid/platform/port.h"
-
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-extern std::once_flag curand_dso_flag;
-extern void *curand_dso_handle;
-#ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                             \
-  struct DynLoad__##__name {                                                 \
-    template <typename... Args>                                              \
-    curandStatus_t operator()(Args... args) {                                \
-      using curandFunc = decltype(&::__name);                                \
-      std::call_once(curand_dso_flag, []() {                                 \
-        curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
-      });                                                                    \
-      static void *p_##__name = dlsym(curand_dso_handle, #__name);           \
-      return reinterpret_cast<curandFunc>(p_##__name)(args...);              \
-    }                                                                        \
-  };                                                                         \
-  extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
-  struct DynLoad__##__name {                     \
-    template <typename... Args>                  \
-    curandStatus_t operator()(Args... args) {    \
-      return ::__name(args...);                  \
-    }                                            \
-  };                                             \
-  extern DynLoad__##__name __name
-#endif
-
-#define CURAND_RAND_ROUTINE_EACH(__macro)      \
-  __macro(curandCreateGenerator);              \
-  __macro(curandSetStream);                    \
-  __macro(curandSetPseudoRandomGeneratorSeed); \
-  __macro(curandGenerateUniform);              \
-  __macro(curandGenerateUniformDouble);        \
-  __macro(curandGenerateNormal);               \
-  __macro(curandDestroyGenerator);
-
-CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
deleted file mode 100644
index 4a1cd5a8db7fa9b8f8fdd9427c7a26e5c90cc95f..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ /dev/null
@@ -1,285 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/port.h"
-
-DEFINE_string(cudnn_dir, "",
-              "Specify path for loading libcudnn.so. For instance, "
-              "/usr/local/cudnn/lib. If empty [default], dlopen "
-              "will search cudnn from LD_LIBRARY_PATH");
-
-DEFINE_string(cuda_dir, "",
-              "Specify path for loading cuda library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
-              "dlopen will search cuda from LD_LIBRARY_PATH");
-
-DEFINE_string(nccl_dir, "",
-              "Specify path for loading nccl library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
-              "dlopen will search cuda from LD_LIBRARY_PATH");
-
-DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
-
-DEFINE_string(
-    tensorrt_dir, "",
-    "Specify path for loading tensorrt library, such as libnvinfer.so.");
-
-DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-struct PathNode {
-  PathNode() {}
-  std::string path = "";
-};
-
-static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
-
-static PathNode s_py_site_pkg_path;
-
-#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll";
-static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll";
-static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll";
-#endif
-
-static inline std::string join(const std::string& part1,
-                               const std::string& part2) {
-  // directory separator
-  const char sep = '/';
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-void SetPaddleLibPath(const std::string& py_site_pkg_path) {
-  s_py_site_pkg_path.path = py_site_pkg_path;
-  VLOG(3) << "Set paddle lib path : " << py_site_pkg_path;
-}
-
-static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
-                                                int dynload_flags) {
-  VLOG(3) << "Try to find library: " << dso_path
-          << " from default system path.";
-  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
-  // and /usr/local/lib path
-  void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-
-// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
-// bring System Integrity Projection (SIP), if dso_handle
-// is null, search from default package path in Mac OS.
-#if defined(__APPLE__) || defined(__OSX__)
-  if (nullptr == dso_handle) {
-    dso_handle =
-        dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
-    if (nullptr == dso_handle) {
-      if (dso_path == "libcudnn.dylib") {
-        LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
-                        "For instance, sudo tar -xzf "
-                        "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
-                        "chmod a+r /usr/local/cuda/include/cudnn.h "
-                        "/usr/local/cuda/lib/libcudnn*";
-      }
-    }
-  }
-#endif
-
-  if (nullptr == dso_handle) {
-    LOG(WARNING) << "Can not find library: " << dso_path
-                 << ". The process maybe hang. Please try to add the lib path "
-                    "to LD_LIBRARY_PATH.";
-  }
-  return dso_handle;
-}
-
-static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
-                                               const std::string& dso_name,
-                                               bool throw_on_error = true) {
-#if !defined(_WIN32)
-  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
-#else
-  int dynload_flags = 0;
-#endif  // !_WIN32
-  void* dso_handle = nullptr;
-
-  std::string dlPath = dso_name;
-  if (search_root.empty()) {
-    dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
-  } else {
-    // search xxx.so from custom path
-    dlPath = join(search_root, dso_name);
-    dso_handle = dlopen(dlPath.c_str(), dynload_flags);
-#if !defined(_WIN32)
-    auto errorno = dlerror();
-#else
-    auto errorno = GetLastError();
-#endif  // !_WIN32
-    // if not found, search from default path
-    if (nullptr == dso_handle) {
-      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
-                   << errorno << ")";
-      if (dlPath.find("nccl") != std::string::npos) {
-        std::cout
-            << "You may need to install 'nccl2' from NVIDIA official website: "
-            << "https://developer.nvidia.com/nccl/nccl-download"
-            << "before install PaddlePaddle" << std::endl;
-      }
-      dlPath = dso_name;
-      dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
-    }
-  }
-  auto error_msg =
-      "Failed to find dynamic library: %s ( %s ) \n Please specify "
-      "its path correctly using following ways: \n Method. set "
-      "environment variable LD_LIBRARY_PATH on Linux or "
-      "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
-      "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
-      "using the DYLD_LIBRARY_PATH is impossible unless System "
-      "Integrity Protection (SIP) is disabled.";
-#if !defined(_WIN32)
-  auto errorno = dlerror();
-#else
-  auto errorno = GetLastError();
-#endif  // !_WIN32
-  if (throw_on_error) {
-    PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno);
-  } else if (nullptr == dso_handle) {
-    LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno);
-  }
-
-  return dso_handle;
-}
-
-void* GetCublasDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
-#endif
-}
-
-void* GetCUDNNDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
-#endif
-}
-
-void* GetCUPTIDsoHandle() {
-  std::string cupti_path = cupti_lib_path;
-  if (!FLAGS_cupti_dir.empty()) {
-    cupti_path = FLAGS_cupti_dir;
-  }
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
-#else
-  return GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", false);
-#endif
-}
-
-void* GetCurandDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
-#endif
-}
-
-void* GetNVRTCDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so");
-#endif
-}
-
-void* GetCUDADsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so");
-#endif
-}
-
-void* GetWarpCTCDsoHandle() {
-  std::string warpctc_dir = "";
-  if (!s_py_site_pkg_path.path.empty()) {
-    warpctc_dir = s_py_site_pkg_path.path;
-  }
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll");
-#else
-  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so");
-#endif
-}
-
-void* GetNCCLDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
-#endif
-}
-
-void* GetTensorRtDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
-#endif
-}
-
-void* GetMKLMLDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
-#endif
-}
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
deleted file mode 100644
index df101474aa4e158198baf92ca389d23239ba6f47..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-#ifndef _WIN32
-#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
-#else
-#define DECLARE_TYPE(__name, ...) decltype(auto)
-#endif
-
-void* GetCublasDsoHandle();
-void* GetCUDNNDsoHandle();
-void* GetCUPTIDsoHandle();
-void* GetCurandDsoHandle();
-void* GetNVRTCDsoHandle();
-void* GetCUDADsoHandle();
-void* GetWarpCTCDsoHandle();
-void* GetNCCLDsoHandle();
-void* GetTensorRtDsoHandle();
-void* GetMKLMLDsoHandle();
-
-void SetPaddleLibPath(const std::string&);
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc
deleted file mode 100644
index 020c02d9baadabc061c52e8d33b3bc8ebb74248f..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/mklml.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/mklml.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-std::once_flag mklml_dso_flag;
-void* mklml_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-MKLML_ROUTINE_EACH(DEFINE_WRAP);
-
-#if !defined(_WIN32)
-DEFINE_WRAP(mkl_scsrmm);
-DEFINE_WRAP(mkl_dcsrmm);
-#endif
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
deleted file mode 100644
index 5070be43756fa0a0a08a410fcfcdbadaf751c424..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/mklml.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mkl.h>
-#include <mutex>  // NOLINT
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag mklml_dso_flag;
-extern void* mklml_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load mklml routine
- * via operator overloading.
- */
-#define DYNAMIC_LOAD_MKLML_WRAP(__name)                                    \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
-      using mklmlFunc = decltype(&::__name);                               \
-      std::call_once(mklml_dso_flag, []() {                                \
-        mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
-      });                                                                  \
-      static void* p_##_name = dlsym(mklml_dso_handle, #__name);           \
-      return reinterpret_cast<mklmlFunc>(p_##_name)(args...);              \
-    }                                                                      \
-  };                                                                       \
-  extern DynLoad__##__name __name
-
-#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
-
-#define MKLML_ROUTINE_EACH(__macro) \
-  __macro(cblas_sgemm);             \
-  __macro(cblas_dgemm);             \
-  __macro(cblas_saxpy);             \
-  __macro(cblas_daxpy);             \
-  __macro(cblas_scopy);             \
-  __macro(cblas_dcopy);             \
-  __macro(cblas_sgemv);             \
-  __macro(cblas_dgemv);             \
-  __macro(cblas_sgemm_alloc);       \
-  __macro(cblas_dgemm_alloc);       \
-  __macro(cblas_sgemm_pack);        \
-  __macro(cblas_dgemm_pack);        \
-  __macro(cblas_sgemm_compute);     \
-  __macro(cblas_dgemm_compute);     \
-  __macro(cblas_sgemm_free);        \
-  __macro(cblas_dgemm_free);        \
-  __macro(cblas_sgemm_batch);       \
-  __macro(cblas_dgemm_batch);       \
-  __macro(cblas_sdot);              \
-  __macro(cblas_ddot);              \
-  __macro(cblas_sasum);             \
-  __macro(cblas_dasum);             \
-  __macro(cblas_isamax);            \
-  __macro(cblas_idamax);            \
-  __macro(cblas_sscal);             \
-  __macro(cblas_dscal);             \
-  __macro(vsAdd);                   \
-  __macro(vdAdd);                   \
-  __macro(vsMul);                   \
-  __macro(vdMul);                   \
-  __macro(vsExp);                   \
-  __macro(vdExp);                   \
-  __macro(vsSqr);                   \
-  __macro(vdSqr);                   \
-  __macro(vsPowx);                  \
-  __macro(vdPowx);                  \
-  __macro(vsInv);                   \
-  __macro(vdInv);                   \
-  __macro(vmsErf);                  \
-  __macro(vmdErf);                  \
-  __macro(MKL_Set_Num_Threads)
-
-MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
-
-#if !defined(_WIN32)
-DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
-DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm);
-#endif
-
-#undef DYNAMIC_LOAD_MKLML_WRAP
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc
deleted file mode 100644
index 2c40c48ee08497f9a2a414687b9c51d87ba574aa..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/nccl.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/nccl.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-std::once_flag nccl_dso_flag;
-void *nccl_dso_handle;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
deleted file mode 100644
index 06ee478efd73e4102d9d8e6d051d26bb90bd0b24..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/nccl.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <nccl.h>
-
-#include <mutex>  // NOLINT
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag nccl_dso_flag;
-extern void* nccl_dso_handle;
-
-#ifdef PADDLE_USE_DSO
-
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                           \
-  struct DynLoad__##__name {                                             \
-    template <typename... Args>                                          \
-    auto operator()(Args... args) -> decltype(__name(args...)) {         \
-      using nccl_func = decltype(&::__name);                             \
-      std::call_once(nccl_dso_flag, []() {                               \
-        nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
-      });                                                                \
-      static void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);           \
-    }                                                                    \
-  };                                                                     \
-  extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
-  struct DynLoad__##__name {                   \
-    template <typename... Args>                \
-    ncclResult_t operator()(Args... args) {    \
-      return __name(args...);                  \
-    }                                          \
-  };                                           \
-  extern DynLoad__##__name __name
-#endif
-
-#define NCCL_RAND_ROUTINE_EACH(__macro) \
-  __macro(ncclCommInitAll);             \
-  __macro(ncclGetUniqueId);             \
-  __macro(ncclCommInitRank);            \
-  __macro(ncclCommDestroy);             \
-  __macro(ncclCommCount);               \
-  __macro(ncclCommCuDevice);            \
-  __macro(ncclCommUserRank);            \
-  __macro(ncclAllReduce);               \
-  __macro(ncclBcast);                   \
-  __macro(ncclAllGather);               \
-  __macro(ncclGroupStart);              \
-  __macro(ncclGroupEnd);                \
-  __macro(ncclReduce);                  \
-  __macro(ncclReduceScatter);           \
-  __macro(ncclGetErrorString);
-
-NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/nvrtc.cc b/paddle/fluid/platform/dynload/nvrtc.cc
deleted file mode 100644
index 793b5b8d149daa89d7a570e7d7519a3e9aebf584..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/nvrtc.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/nvrtc.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-std::once_flag nvrtc_dso_flag;
-void* nvrtc_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-NVRTC_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h
deleted file mode 100644
index 20647affabc807ed5a570f09daa241e4389007e4..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <nvrtc.h>
-#include <mutex>  // NOLINT
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag nvrtc_dso_flag;
-extern void* nvrtc_dso_handle;
-
-#ifdef PADDLE_USE_DSO
-
-#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name)                            \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
-      using nvrtc_func = decltype(&::__name);                              \
-      std::call_once(nvrtc_dso_flag, []() {                                \
-        nvrtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
-      });                                                                  \
-      static void* p_##__name = dlsym(nvrtc_dso_handle, #__name);          \
-      return reinterpret_cast<nvrtc_func>(p_##__name)(args...);            \
-    }                                                                      \
-  };                                                                       \
-  extern struct DynLoad__##__name __name
-
-#else
-
-#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
-  struct DynLoad__##__name {                    \
-    template <typename... Args>                 \
-    inline auto operator()(Args... args) {      \
-      return ::__name(args...);                 \
-    }                                           \
-  };                                            \
-  extern DynLoad__##__name __name
-
-#endif
-
-/**
- * include all needed nvrtc functions
- **/
-#define NVRTC_ROUTINE_EACH(__macro) \
-  __macro(nvrtcGetErrorString);     \
-  __macro(nvrtcCompileProgram);     \
-  __macro(nvrtcCreateProgram);      \
-  __macro(nvrtcDestroyProgram);     \
-  __macro(nvrtcGetPTX);             \
-  __macro(nvrtcGetPTXSize);         \
-  __macro(nvrtcGetProgramLog);      \
-  __macro(nvrtcGetProgramLogSize)
-
-NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
-
-#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc
deleted file mode 100644
index f3c8e27944ca9b6419de87d752df3a83751039b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/tensorrt.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/tensorrt.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-std::once_flag tensorrt_dso_flag;
-void *tensorrt_dso_handle;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
deleted file mode 100644
index 4c7ba0f054cfc80702eb4fb4127d7008f6e49c02..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <NvInfer.h>
-#if !defined(_WIN32)
-#include <dlfcn.h>
-#endif
-
-#include <mutex>  // NOLINT
-
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag tensorrt_dso_flag;
-extern void* tensorrt_dso_handle;
-
-#ifdef PADDLE_USE_DSO
-
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
-  struct DynLoad__##__name {                                            \
-    template <typename... Args>                                         \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {    \
-      using tensorrt_func = decltype(&::__name);                        \
-      std::call_once(tensorrt_dso_flag, []() {                          \
-        tensorrt_dso_handle =                                           \
-            paddle::platform::dynload::GetTensorRtDsoHandle();          \
-        PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
-      });                                                               \
-      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);    \
-      PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
-      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
-    }                                                                   \
-  };                                                                    \
-  extern DynLoad__##__name __name
-
-#else
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
-  struct DynLoad__##__name {                       \
-    template <typename... Args>                    \
-    tensorrtResult_t operator()(Args... args) {    \
-      return __name(args...);                      \
-    }                                              \
-  };                                               \
-  extern DynLoad__##__name __name
-#endif
-
-#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
-  __macro(createInferBuilder_INTERNAL);     \
-  __macro(createInferRuntime_INTERNAL);
-
-TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/warpctc.cc b/paddle/fluid/platform/dynload/warpctc.cc
deleted file mode 100644
index 4a150048959c52e88515e196390aae57a4e9c12e..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/warpctc.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/warpctc.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-std::once_flag warpctc_dso_flag;
-void* warpctc_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
deleted file mode 100644
index bc1977b05de5da062fae5662dfb51d4a74868c8a..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mutex>  // NOLINT
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-#include "warpctc/include/ctc.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag warpctc_dso_flag;
-extern void* warpctc_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load warpctc routine
- * via operator overloading.
- */
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                                      \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {           \
-      using warpctcFunc = decltype(&::__name);                                 \
-      std::call_once(warpctc_dso_flag, []() {                                  \
-        warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
-      });                                                                      \
-      static void* p_##_name = dlsym(warpctc_dso_handle, #__name);             \
-      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);                \
-    }                                                                          \
-  };                                                                           \
-  extern DynLoad__##__name __name
-
-#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
-  DYNAMIC_LOAD_WARPCTC_WRAP(__name)
-
-#define WARPCTC_ROUTINE_EACH(__macro) \
-  __macro(get_warpctc_version);       \
-  __macro(ctcGetStatusString);        \
-  __macro(compute_ctc_loss);          \
-  __macro(get_workspace_size)
-
-WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
-
-#undef DYNAMIC_LOAD_WARPCTC_WRAP
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/enforce.cc b/paddle/fluid/platform/enforce.cc
deleted file mode 100644
index 6d0c656781af474fe0e858327d223539bf8f8692..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/enforce.cc
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
deleted file mode 100644
index f9ae4113f9dac4c0fe79c29bfc06a08182645a05..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/enforce.h
+++ /dev/null
@@ -1,514 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef __GNUC__
-#include <cxxabi.h>  // for __cxa_demangle
-#endif               // __GNUC__
-
-#ifdef PADDLE_WITH_CUDA
-#include <cublas_v2.h>
-#include <cudnn.h>
-#include <curand.h>
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-#endif  // PADDLE_WITH_CUDA
-
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-#include <utility>
-
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#include "glog/logging.h"
-#include "paddle/fluid/platform/macros.h"
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/fluid/string/to_string.h"
-
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/dynload/cublas.h"
-#include "paddle/fluid/platform/dynload/cudnn.h"
-#include "paddle/fluid/platform/dynload/curand.h"
-#if !defined(__APPLE__) && !defined(_WIN32)
-#include "paddle/fluid/platform/dynload/nccl.h"
-#endif  // __APPLE__
-#endif  // PADDLE_WITH_CUDA
-
-namespace paddle {
-namespace platform {
-
-#ifdef __GNUC__
-inline std::string demangle(std::string name) {
-  int status = -4;  // some arbitrary value to eliminate the compiler warning
-  std::unique_ptr<char, void (*)(void*)> res{
-      abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free};
-  return (status == 0) ? res.get() : name;
-}
-#else
-inline std::string demangle(std::string name) { return name; }
-#endif
-
-template <typename StrType>
-inline std::string GetTraceBackString(StrType&& what, const char* file,
-                                      int line) {
-  static constexpr int TRACE_STACK_LIMIT = 100;
-  std::ostringstream sout;
-
-  sout << string::Sprintf("%s at [%s:%d]", std::forward<StrType>(what), file,
-                          line)
-       << std::endl;
-  sout << "PaddlePaddle Call Stacks: " << std::endl;
-#if !defined(_WIN32)
-  void* call_stack[TRACE_STACK_LIMIT];
-  auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
-  auto symbols = backtrace_symbols(call_stack, size);
-  Dl_info info;
-  for (int i = 0; i < size; ++i) {
-    if (dladdr(call_stack[i], &info) && info.dli_sname) {
-      auto demangled = demangle(info.dli_sname);
-      auto addr_offset = static_cast<char*>(call_stack[i]) -
-                         static_cast<char*>(info.dli_saddr);
-      sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, 2 + sizeof(void*) * 2,
-                              call_stack[i], demangled, addr_offset);
-    } else {
-      sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
-                              call_stack[i]);
-    }
-  }
-  free(symbols);
-#else
-  sout << "Windows not support stack backtrace yet.";
-#endif
-  return sout.str();
-}
-
-struct EnforceNotMet : public std::exception {
-  std::string err_str_;
-  EnforceNotMet(std::exception_ptr e, const char* file, int line) {
-    try {
-      std::rethrow_exception(e);
-    } catch (std::exception& e) {
-      err_str_ = GetTraceBackString(e.what(), file, line);
-    }
-  }
-
-  EnforceNotMet(const std::string& str, const char* file, int line)
-      : err_str_(GetTraceBackString(str, file, line)) {}
-
-  const char* what() const noexcept override { return err_str_.c_str(); }
-};
-
-struct EOFException : public std::exception {
-  std::string err_str_;
-  EOFException(const char* err_msg, const char* file, int line) {
-    err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, file, line);
-  }
-
-  const char* what() const noexcept override { return err_str_.c_str(); }
-};
-
-// Because most enforce conditions would evaluate to true, we can use
-// __builtin_expect to instruct the C++ compiler to generate code that
-// always forces branch prediction of true.
-// This generates faster binary code. __builtin_expect is since C++11.
-// For more details, please check https://stackoverflow.com/a/43870188/724872.
-#if !defined(_WIN32)
-#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
-#else
-// there is no equivalent intrinsics in msvc.
-#define UNLIKELY(condition) (condition)
-#endif
-
-#if !defined(_WIN32)
-#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
-#else
-// there is no equivalent intrinsics in msvc.
-#define LIKELY(condition) (condition)
-#endif
-
-inline bool is_error(bool stat) { return !stat; }
-
-inline void throw_on_error(bool stat, const std::string& msg) {
-#ifndef REPLACE_ENFORCE_GLOG
-  throw std::runtime_error(msg);
-#else
-  LOG(FATAL) << msg;
-#endif
-}
-
-#ifdef PADDLE_WITH_CUDA
-
-inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
-
-inline void throw_on_error(cudaError_t e, const std::string& msg) {
-#ifndef REPLACE_ENFORCE_GLOG
-  throw thrust::system_error(e, thrust::cuda_category(), msg);
-#else
-  LOG(FATAL) << msg;
-#endif
-}
-
-inline bool is_error(curandStatus_t stat) {
-  return stat != CURAND_STATUS_SUCCESS;
-}
-
-inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
-#ifndef REPLACE_ENFORCE_GLOG
-  throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
-                             msg);
-#else
-  LOG(FATAL) << msg;
-#endif
-}
-
-inline bool is_error(cudnnStatus_t stat) {
-  return stat != CUDNN_STATUS_SUCCESS;
-}
-
-inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
-#ifndef REPLACE_ENFORCE_GLOG
-  throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + msg);
-#else
-  LOG(FATAL) << platform::dynload::cudnnGetErrorString(stat) << msg;
-#endif
-}
-
-inline bool is_error(cublasStatus_t stat) {
-  return stat != CUBLAS_STATUS_SUCCESS;
-}
-
-inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
-  std::string err;
-  if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
-    err = "CUBLAS: not initialized, ";
-  } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
-    err = "CUBLAS: alloc failed, ";
-  } else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
-    err = "CUBLAS: invalid value, ";
-  } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
-    err = "CUBLAS: arch mismatch, ";
-  } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
-    err = "CUBLAS: mapping error, ";
-  } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
-    err = "CUBLAS: execution failed, ";
-  } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
-    err = "CUBLAS: internal error, ";
-  } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
-    err = "CUBLAS: not supported, ";
-  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
-    err = "CUBLAS: license error, ";
-  }
-#ifndef REPLACE_ENFORCE_GLOG
-  throw std::runtime_error(err + msg);
-#else
-  LOG(FATAL) << err << msg;
-#endif
-}
-
-#if !defined(__APPLE__) && !defined(_WIN32)
-inline bool is_error(ncclResult_t nccl_result) {
-  return nccl_result != ncclSuccess;
-}
-
-inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
-#ifndef REPLACE_ENFORCE_GLOG
-  throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + msg);
-#else
-  LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) << msg;
-#endif
-}
-#endif  // __APPLE__ and windows
-#endif  // PADDLE_WITH_CUDA
-
-#ifdef PADDLE_WITH_CUDA
-namespace details {
-
-template <typename T>
-struct CudaStatusType {};
-
-#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \
-  template <>                                        \
-  struct CudaStatusType<type> {                      \
-    using Type = type;                               \
-    static constexpr Type kSuccess = success_value;  \
-  }
-
-DEFINE_CUDA_STATUS_TYPE(cudaError_t, cudaSuccess);
-DEFINE_CUDA_STATUS_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS);
-DEFINE_CUDA_STATUS_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS);
-DEFINE_CUDA_STATUS_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS);
-
-#if !defined(__APPLE__) && !defined(_WIN32)
-DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
-#endif
-
-}  // namespace details
-#endif
-
-#define PADDLE_THROW(...)                                            \
-  do {                                                               \
-    throw ::paddle::platform::EnforceNotMet(                         \
-        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
-  } while (0)
-
-#if defined(__CUDA_ARCH__)
-// For cuda, the assertions can affect performance and it is therefore
-// recommended to disable them in production code
-// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
-#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...)                   \
-  do {                                                                 \
-    if (!(_IS_NOT_ERROR)) {                                            \
-      printf("Exception: %s:%d Assertion `%s` failed. " __FORMAT "\n", \
-             __FILE__, __LINE__, #_IS_NOT_ERROR, ##__VA_ARGS__);       \
-      asm("trap;");                                                    \
-    }                                                                  \
-  } while (0)
-#else
-#define PADDLE_ENFORCE(COND, ...)                                         \
-  do {                                                                    \
-    auto __cond__ = (COND);                                               \
-    if (UNLIKELY(::paddle::platform::is_error(__cond__))) {               \
-      try {                                                               \
-        ::paddle::platform::throw_on_error(                               \
-            __cond__, ::paddle::string::Sprintf(__VA_ARGS__));            \
-      } catch (...) {                                                     \
-        throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
-                                                __FILE__, __LINE__);      \
-      }                                                                   \
-    }                                                                     \
-  } while (0)
-#endif
-
-#ifdef PADDLE_WITH_CUDA
-#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...)                            \
-  do {                                                                    \
-    auto __cond__ = (COND);                                               \
-    using __CUDA_STATUS_TYPE__ = decltype(__cond__);                      \
-    constexpr auto __success_type__ =                                     \
-        ::paddle::platform::details::CudaStatusType<                      \
-            __CUDA_STATUS_TYPE__>::kSuccess;                              \
-    if (UNLIKELY(__cond__ != __success_type__)) {                         \
-      try {                                                               \
-        ::paddle::platform::throw_on_error(                               \
-            __cond__, ::paddle::string::Sprintf(__VA_ARGS__));            \
-      } catch (...) {                                                     \
-        throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
-                                                __FILE__, __LINE__);      \
-      }                                                                   \
-    }                                                                     \
-  } while (0)
-
-#undef DEFINE_CUDA_STATUS_TYPE
-#endif
-
-#define PADDLE_THROW_EOF()                                                     \
-  do {                                                                         \
-    throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
-                                           __LINE__);                          \
-  } while (0)
-
-#define PADDLE_THROW_BAD_ALLOC(...)                                  \
-  do {                                                               \
-    throw ::paddle::memory::allocation::BadAlloc(                    \
-        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
-  } while (0)
-
-/*
- * Some enforce helpers here, usage:
- *    int a = 1;
- *    int b = 2;
- *    PADDLE_ENFORCE_EQ(a, b);
- *
- *    will raise an expression described as follows:
- *    "Enforce failed. Expected input a == b, but received a(1) != b(2)."
- *      with detailed stack information.
- *
- *    extra messages is also supported, for example:
- *    PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
- */
-#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                 \
-  do {                                                      \
-    if (UNLIKELY(nullptr == (__VAL))) {                     \
-      PADDLE_THROW(#__VAL " should not be null\n%s",        \
-                   ::paddle::string::Sprintf(__VA_ARGS__)); \
-    }                                                       \
-  } while (0)
-
-namespace details {
-template <typename T>
-inline constexpr bool IsArithmetic() {
-  return std::is_arithmetic<T>::value;
-}
-
-template <typename T1, typename T2, bool kIsArithmetic /* = true */>
-struct TypeConverterImpl {
-  using Type1 = typename std::common_type<T1, T2>::type;
-  using Type2 = Type1;
-};
-
-template <typename T1, typename T2>
-struct TypeConverterImpl<T1, T2, false> {
-  using Type1 = T1;
-  using Type2 = T2;
-};
-
-template <typename T1, typename T2>
-struct TypeConverter {
- private:
-  static constexpr bool kIsArithmetic =
-      IsArithmetic<T1>() && IsArithmetic<T2>();
-
- public:
-  using Type1 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type1;
-  using Type2 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type2;
-};
-
-template <typename T1, typename T2>
-using CommonType1 = typename std::add_lvalue_reference<
-    typename std::add_const<typename TypeConverter<T1, T2>::Type1>::type>::type;
-
-template <typename T1, typename T2>
-using CommonType2 = typename std::add_lvalue_reference<
-    typename std::add_const<typename TypeConverter<T1, T2>::Type2>::type>::type;
-
-// Here, we use SFINAE to check whether T can be converted to std::string
-template <typename T>
-struct CanToString {
- private:
-  using YesType = uint8_t;
-  using NoType = uint16_t;
-
-  template <typename U>
-  static YesType Check(decltype(std::cout << std::declval<U>())) {
-    return 0;
-  }
-
-  template <typename U>
-  static NoType Check(...) {
-    return 0;
-  }
-
- public:
-  static constexpr bool kValue =
-      std::is_same<YesType, decltype(Check<T>(std::cout))>::value;
-};
-
-template <bool kCanToString /* = true */>
-struct BinaryCompareMessageConverter {
-  template <typename T>
-  static std::string Convert(const char* expression, const T& value) {
-    return expression + std::string(":") + string::to_string(value);
-  }
-};
-
-template <>
-struct BinaryCompareMessageConverter<false> {
-  template <typename T>
-  static const char* Convert(const char* expression, const T& value) {
-    return expression;
-  }
-};
-
-}  // namespace details
-
-#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...)         \
-  do {                                                                         \
-    auto __val1 = (__VAL1);                                                    \
-    auto __val2 = (__VAL2);                                                    \
-    using __TYPE1__ = decltype(__val1);                                        \
-    using __TYPE2__ = decltype(__val2);                                        \
-    using __COMMON_TYPE1__ =                                                   \
-        ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>;        \
-    using __COMMON_TYPE2__ =                                                   \
-        ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>;        \
-    bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP(        \
-        static_cast<__COMMON_TYPE2__>(__val2));                                \
-    if (UNLIKELY(!__is_not_error)) {                                           \
-      constexpr bool __kCanToString__ =                                        \
-          ::paddle::platform::details::CanToString<__TYPE1__>::kValue &&       \
-          ::paddle::platform::details::CanToString<__TYPE2__>::kValue;         \
-      PADDLE_THROW("Enforce failed. Expected %s " #__CMP                       \
-                   " %s, but received %s " #__INV_CMP " %s.\n%s",              \
-                   #__VAL1, #__VAL2,                                           \
-                   ::paddle::platform::details::BinaryCompareMessageConverter< \
-                       __kCanToString__>::Convert(#__VAL1, __val1),            \
-                   ::paddle::platform::details::BinaryCompareMessageConverter< \
-                       __kCanToString__>::Convert(#__VAL2, __val2),            \
-                   ::paddle::string::Sprintf(__VA_ARGS__));                    \
-    }                                                                          \
-  } while (0)
-
-#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
-#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
-#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
-#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
-#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
-#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
-
-#define __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL1, __VAL2, __CMP, \
-                                           __INV_CMP, ...)               \
-  do {                                                                   \
-    auto __val1 = (__VAL1);                                              \
-    auto __val2 = (__VAL2);                                              \
-    if (!__CTX->IsRuntime()) {                                           \
-      if (__val1 == -1 || __val2 == -1) {                                \
-        break;                                                           \
-      }                                                                  \
-    }                                                                    \
-    using __TYPE1__ = decltype(__val1);                                  \
-    using __TYPE2__ = decltype(__val2);                                  \
-    using __COMMON_TYPE1__ =                                             \
-        ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>;  \
-    using __COMMON_TYPE2__ =                                             \
-        ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>;  \
-    bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP(  \
-        static_cast<__COMMON_TYPE2__>(__val2));                          \
-    if (UNLIKELY(!__is_not_error)) {                                     \
-      PADDLE_THROW("Enforce failed. Expected %s " #__CMP                 \
-                   " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s",  \
-                   #__VAL1, #__VAL2, #__VAL1,                            \
-                   ::paddle::string::to_string(__val1), #__VAL2,         \
-                   ::paddle::string::to_string(__val2),                  \
-                   ::paddle::string::Sprintf(__VA_ARGS__));              \
-    }                                                                    \
-  } while (0)
-
-#define PADDLE_INFERSHAPE_ENFORCE_EQ(__CTX, __VAL0, __VAL1, ...) \
-  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, ==, !=, __VA_ARGS__)
-#define PADDLE_INFERSHAPE_ENFORCE_NE(__CTX, __VAL0, __VAL1, ...) \
-  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, !=, ==, __VA_ARGS__)
-#define PADDLE_INFERSHAPE_ENFORCE_GT(__CTX, __VAL0, __VAL1, ...) \
-  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, >, <=, __VA_ARGS__)
-#define PADDLE_INFERSHAPE_ENFORCE_GE(__CTX, __VAL0, __VAL1, ...) \
-  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, >=, <, __VA_ARGS__)
-#define PADDLE_INFERSHAPE_ENFORCE_LT(__CTX, __VAL0, __VAL1, ...) \
-  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, <, >=, __VA_ARGS__)
-#define PADDLE_INFERSHAPE_ENFORCE_LE(__CTX, __VAL0, __VAL1, ...) \
-  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, <=, >, __VA_ARGS__)
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
deleted file mode 100644
index 4e34f3cbf5b711d8ca639461c0ebcf1597017e7b..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/enforce_test.cc
+++ /dev/null
@@ -1,361 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <array>
-#include <iostream>
-#include <list>
-#include <memory>
-#include <set>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/string/piece.h"
-
-using StringPiece = paddle::string::Piece;
-using paddle::string::HasPrefix;
-
-TEST(ENFORCE, OK) {
-  PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
-  size_t val = 1;
-  const size_t limit = 10;
-  PADDLE_ENFORCE(val < limit, "Enforce is OK too");
-}
-
-TEST(ENFORCE, FAILED) {
-  bool caught_exception = false;
-  try {
-    PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    EXPECT_TRUE(
-        HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all"));
-  }
-  EXPECT_TRUE(caught_exception);
-
-  caught_exception = false;
-  try {
-    PADDLE_ENFORCE(false, "Enforce is not ok at all");
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    EXPECT_TRUE(
-        HasPrefix(StringPiece(error.what()), "Enforce is not ok at all"));
-  }
-  EXPECT_TRUE(caught_exception);
-
-  caught_exception = false;
-  try {
-    PADDLE_ENFORCE(false);
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    EXPECT_NE(std::string(error.what()).find("  at "), 0);
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-TEST(ENFORCE, NO_ARG_OK) {
-  int a = 2;
-  int b = 2;
-  PADDLE_ENFORCE_EQ(a, b);
-  // test enforce with extra message.
-  PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info");
-}
-
-TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
-  int a = 2;
-  bool caught_exception = false;
-  try {
-    PADDLE_ENFORCE_EQ(a, 1 + 3);
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    HasPrefix(
-        StringPiece(error.what()),
-        "Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + 3:4.");
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
-  int a = 2;
-  bool caught_exception = false;
-  try {
-    PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their");
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    HasPrefix(StringPiece(error.what()),
-              "Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + "
-              "3:4.\ntheir size not match");
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-TEST(ENFORCE_NE, OK) {
-  PADDLE_ENFORCE_NE(1, 2);
-  PADDLE_ENFORCE_NE(1.0, 2UL);
-}
-TEST(ENFORCE_NE, FAIL) {
-  bool caught_exception = false;
-
-  try {
-    // 2UL here to check data type compatible
-    PADDLE_ENFORCE_NE(1.0, 1UL);
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    EXPECT_TRUE(HasPrefix(
-        StringPiece(error.what()),
-        "Enforce failed. Expected 1.0 != 1UL, but received 1.0:1 == 1UL:1."))
-        << error.what() << " does not have expected prefix";
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
-TEST(ENFORCE_GT, FAIL) {
-  bool caught_exception = false;
-  try {
-    PADDLE_ENFORCE_GT(1, 2);
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    EXPECT_TRUE(
-        HasPrefix(StringPiece(error.what()),
-                  "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-TEST(ENFORCE_GE, OK) {
-  PADDLE_ENFORCE_GE(2, 2);
-  PADDLE_ENFORCE_GE(3, 2);
-  PADDLE_ENFORCE_GE(3.21, 2.0);
-}
-TEST(ENFORCE_GE, FAIL) {
-  bool caught_exception = false;
-  try {
-    PADDLE_ENFORCE_GE(1, 2);
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    EXPECT_TRUE(
-        HasPrefix(StringPiece(error.what()),
-                  "Enforce failed. Expected 1 >= 2, but received 1:1 < 2:2."));
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-TEST(ENFORCE_LE, OK) {
-  PADDLE_ENFORCE_LE(1, 1);
-  PADDLE_ENFORCE_LE(1UL, 1UL);
-  PADDLE_ENFORCE_LE(2, 3);
-  PADDLE_ENFORCE_LE(2UL, 3UL);
-  PADDLE_ENFORCE_LE(2.0, 3.2);
-}
-TEST(ENFORCE_LE, FAIL) {
-  bool caught_exception = false;
-  try {
-    PADDLE_ENFORCE_GT(1, 2);
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    EXPECT_TRUE(
-        HasPrefix(StringPiece(error.what()),
-                  "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-TEST(ENFORCE_LT, OK) {
-  PADDLE_ENFORCE_LT(3, 10);
-  PADDLE_ENFORCE_LT(2UL, 3UL);
-  PADDLE_ENFORCE_LT(2, 3);
-}
-TEST(ENFORCE_LT, FAIL) {
-  bool caught_exception = false;
-  try {
-    PADDLE_ENFORCE_LT(1UL, 0.12);
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
-                          "Enforce failed. Expected 1UL < 0.12, but "
-                          "received 1UL:1 >= 0.12:0.12."));
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-TEST(ENFORCE_NOT_NULL, OK) {
-  int* a = new int;
-  PADDLE_ENFORCE_NOT_NULL(a);
-  delete a;
-}
-TEST(ENFORCE_NOT_NULL, FAIL) {
-  bool caught_exception = false;
-  try {
-    int* a = nullptr;
-    PADDLE_ENFORCE_NOT_NULL(a);
-  } catch (paddle::platform::EnforceNotMet error) {
-    caught_exception = true;
-    EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null"));
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-struct Dims {
-  size_t dims_[4];
-
-  bool operator==(const Dims& o) const {
-    for (size_t i = 0; i < 4; ++i) {
-      if (dims_[i] != o.dims_[i]) return false;
-    }
-    return true;
-  }
-};
-
-std::ostream& operator<<(std::ostream& os, const Dims& d) {
-  for (size_t i = 0; i < 4; ++i) {
-    if (i == 0) {
-      os << "[";
-    }
-    os << d.dims_[i];
-    if (i == 4 - 1) {
-      os << "]";
-    } else {
-      os << ", ";
-    }
-  }
-  return os;
-}
-
-TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
-  Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}};
-  PADDLE_ENFORCE_EQ(a, b);
-}
-
-TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
-  Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
-  bool caught_exception = false;
-  try {
-    PADDLE_ENFORCE_EQ(a, b);
-  } catch (paddle::platform::EnforceNotMet&) {
-    caught_exception = true;
-  }
-  EXPECT_TRUE(caught_exception);
-}
-
-TEST(EOF_EXCEPTION, THROW_EOF) {
-  bool caught_eof = false;
-  try {
-    PADDLE_THROW_EOF();
-  } catch (paddle::platform::EOFException error) {
-    caught_eof = true;
-    EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "There is no next data."));
-  }
-  EXPECT_TRUE(caught_eof);
-}
-
-#ifdef PADDLE_WITH_CUDA
-template <typename T>
-bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") {
-  PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
-  return true;
-}
-
-template <typename T>
-bool CheckCudaStatusFailure(
-    T value, const std::string& msg = "self-defined cuda status failed") {
-  try {
-    PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
-    return false;
-  } catch (paddle::platform::EnforceNotMet& error) {
-    std::string ex_msg = error.what();
-    return ex_msg.find(msg) != std::string::npos;
-  }
-}
-
-TEST(enforce, cuda_success) {
-  EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
-  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue));
-  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation));
-
-  EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH));
-  EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED));
-
-  EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED));
-
-  EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE));
-#if !defined(__APPLE__) && !defined(_WIN32)
-  EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
-  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError));
-  EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError));
-#endif
-}
-#endif
-
-struct CannotToStringType {
-  explicit CannotToStringType(int num) : num_(num) {}
-
-  bool operator==(const CannotToStringType& other) const {
-    return num_ == other.num_;
-  }
-
-  bool operator!=(const CannotToStringType& other) const {
-    return num_ != other.num_;
-  }
-
- private:
-  int num_;
-};
-
-TEST(enforce, cannot_to_string_type) {
-  static_assert(
-      !paddle::platform::details::CanToString<CannotToStringType>::kValue,
-      "CannotToStringType must not be converted to string");
-  static_assert(paddle::platform::details::CanToString<int>::kValue,
-                "int can be converted to string");
-  CannotToStringType obj1(3), obj2(4), obj3(3);
-
-  PADDLE_ENFORCE_NE(obj1, obj2, "Object 1 is not equal to Object 2");
-  PADDLE_ENFORCE_EQ(obj1, obj3, "Object 1 is equal to Object 3");
-
-  std::string msg = "Compare obj1 with obj2";
-  try {
-    PADDLE_ENFORCE_EQ(obj1, obj2, msg);
-  } catch (paddle::platform::EnforceNotMet& error) {
-    std::string ex_msg = error.what();
-    LOG(INFO) << ex_msg;
-    EXPECT_TRUE(ex_msg.find(msg) != std::string::npos);
-    EXPECT_TRUE(
-        ex_msg.find("Expected obj1 == obj2, but received obj1 != obj2") !=
-        std::string::npos);
-  }
-
-  msg = "Compare x with y";
-  try {
-    int x = 3, y = 2;
-    PADDLE_ENFORCE_EQ(x, y, msg);
-  } catch (paddle::platform::EnforceNotMet& error) {
-    std::string ex_msg = error.what();
-    LOG(INFO) << ex_msg;
-    EXPECT_TRUE(ex_msg.find(msg) != std::string::npos);
-    EXPECT_TRUE(ex_msg.find("Expected x == y, but received x:3 != y:2") !=
-                std::string::npos);
-  }
-
-  std::set<int> set;
-  PADDLE_ENFORCE_EQ(set.begin(), set.end());
-  set.insert(3);
-  PADDLE_ENFORCE_NE(set.begin(), set.end());
-
-  std::list<float> list;
-  PADDLE_ENFORCE_EQ(list.begin(), list.end());
-  list.push_back(4);
-  PADDLE_ENFORCE_NE(list.begin(), list.end());
-}
diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h
deleted file mode 100644
index e9bdb82a50fa4166cecdaea1de01d2f458f3da9a..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/event.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#ifdef PADDLE_WITH_CUDA
-#include <cuda_runtime.h>
-#endif
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace platform {
-
-enum EventType { kMark, kPushRange, kPopRange };
-
-class Event {
- public:
-  // The DeviceContext is used to get the cuda stream.
-  // If CPU profiling mode, can pass nullptr.
-  Event(EventType type, std::string name, uint32_t thread_id);
-
-  const EventType& type() const;
-  std::string name() const { return name_; }
-  uint32_t thread_id() const { return thread_id_; }
-
-#ifdef PADDLE_WITH_CUDA
-#ifndef PADDLE_WITH_CUPTI
-  cudaEvent_t event() const { return event_; }
-  int device() const { return device_; }
-#endif
-#endif
-
-  double CpuElapsedMs(const Event& e) const;
-  double CudaElapsedMs(const Event& e) const;
-
- private:
-  EventType type_;
-  std::string name_;
-  uint32_t thread_id_;
-  int64_t cpu_ns_;
-#ifdef PADDLE_WITH_CUDA
-#ifdef PADDLE_WITH_CUPTI
-  int64_t gpu_ns_ = 0;
-
- public:
-  void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
-    gpu_ns_ += end_ns - start_ns;
-  }
-
- private:
-#else
-  cudaEvent_t event_ = nullptr;
-  int device_ = -1;
-#endif
-#endif
-};
-
-class MemEvent {
- public:
-  MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
-           Place place, int64_t thread_id, const std::string& annotation)
-      : type_(type),
-        start_ns_(start_ns),
-        end_ns_(end_ns),
-        bytes_(bytes),
-        place_(place),
-        thread_id_(thread_id),
-        annotation_(annotation) {}
-
-  const EventType& type() const { return type_; }
-  uint64_t start_ns() const { return start_ns_; }
-  uint64_t end_ns() const { return end_ns_; }
-  size_t bytes() const { return bytes_; }
-  Place place() const { return place_; }
-  int64_t thread_id() const { return thread_id_; }
-  const std::string& annotation() const { return annotation_; }
-
- private:
-  EventType type_;
-  uint64_t start_ns_ = 0;
-  uint64_t end_ns_ = 0;
-  size_t bytes_;
-  Place place_;
-  int64_t thread_id_;
-  std::string annotation_;
-};
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
deleted file mode 100644
index e40d0cf18a965dd301a6d131e1e4df5307e11966..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/flags.cc
+++ /dev/null
@@ -1,453 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gflags/gflags.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_workspace_helper.h"
-#endif
-
-/**
- * NOTE(paddle-dev): This file is designed to define all public FLAGS.
- */
-
-/**
- * Paddle initialization related FLAG
- * Name: FLAGS_paddle_num_threads
- * Since Version: 0.15.0
- * Value Range: int32, default=1
- * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
- * instance to 2
- * Note:
- */
-DEFINE_int32(paddle_num_threads, 1,
-             "Number of threads for each paddle instance.");
-
-/**
- * Operator related FLAG
- * Name: FLAGS_check_nan_inf
- * Since Version: 0.13.0
- * Value Range: bool, default=false
- * Example:
- * Note: Used to debug. Checking whether operator produce NAN/INF or not.
- */
-DEFINE_bool(check_nan_inf, false,
-            "Checking whether operator produce NAN/INF or not. It will be "
-            "extremely slow so please use this flag wisely.");
-
-#ifdef PADDLE_WITH_CUDA
-
-/**
- * CUDA related related FLAG
- * Name: FLAGS_enable_cublas_tensor_op_math
- * Since Version: 1.2.0
- * Value Range: bool, default=false
- * Example:
- * Note: whether to use Tensor Core, faster but it may loss precision.
- */
-DEFINE_bool(
-    enable_cublas_tensor_op_math, false,
-    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
-    "but it may loss precision. Currently, There are two CUDA libraries that"
-    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
-    " GEMM computations(the matrices must be either half precision or single "
-    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
-    "input and output must be half precision) and recurrent neural networks "
-    "(RNNs).");
-
-/**
- * CUDA related FLAG
- * Name: FLAGS_selected_gpus
- * Since Version: 1.3.0
- * Value Range: integer list separated by comma, default empty list
- * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu
- * cards
- * Note: A list of device ids separated by comma, like: 0,1,2,3
- */
-DEFINE_string(selected_gpus, "",
-              "A list of device ids separated by comma, like: 0,1,2,3. "
-              "This option is useful when doing multi process training and "
-              "each process have only one device (GPU). If you want to use "
-              "all visible devices, set this to empty string. NOTE: the "
-              "reason of doing this is that we want to use P2P communication"
-              "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
-              "share-memory only.");
-#endif
-
-#ifdef PADDLE_WITH_CUDA
-
-/**
- * CUDNN related FLAG
- * Name: FLAGS_cudnn_deterministic
- * Since Version: 0.13.0
- * Value Range: bool, default=false
- * Example:
- * Note: whether to use deterministic algorithm in cudnn.
- *       If true, it will slow down some operators such as conv and pooling.
- */
-DEFINE_bool(cudnn_deterministic, false,
-            "Whether allow using an autotuning algorithm for convolution "
-            "operator. The autotuning algorithm may be non-deterministic. If "
-            "true, the algorithm is deterministic.");
-
-/**
- * CUDNN related FLAG
- * Name: FLAGS_conv_workspace_size_limit
- * Since Version: 0.13.0
- * Value Range: uint64, default=512 (MB)
- * Example:
- * Note: The internal function of cuDNN obtains the fastest matching algorithm
- *       within this memory limit. Usually, faster algorithms can be chosen in
- *       larger workspaces, but memory space can also be significantly
- * increased.
- *       Users need to balance memory and speed.
- */
-DEFINE_uint64(conv_workspace_size_limit,
-              paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
-              "cuDNN convolution workspace limit in MB unit.");
-
-/**
- * CUDNN related FLAG
- * Name: FLAGS_cudnn_exhaustive_search
- * Since Version: 1.2.0
- * Value Range: bool, default=false
- * Example:
- * Note: Represents whether an exhaustive search method is used to
- *       select a convolution algorithm. There are two search methods in cuDNN,
- *       heuristic search and exhaustive search. Exhaustive search attempts
- *       all cuDNN algorithms to select the fastest. This method is very
- *       time-consuming, and the selected algorithm will be cached for a given
- *       layer specification. Once you change the layer specifications
- *       (such as batch size, feature map size), it will search again.
- */
-DEFINE_bool(cudnn_exhaustive_search, false,
-            "Whether enable exhaustive search for cuDNN convolution or "
-            "not, default is False.");
-
-/**
- * CUDNN related FLAG
- * Name: FLAGS_cudnn_exhaustive_search_times
- * Since Version:
- * Value Range:
- * Example:
- * Note: only used to predict for advanced developer
- */
-DEFINE_int64(cudnn_exhaustive_search_times, -1,
-             "Exhaustive search times for cuDNN convolution, "
-             "default is -1, not exhaustive search");
-
-/**
- * CUDNN related FLAG
- * Name: FLAGS_cudnn_batchnorm_spatial_persistent
- * Since Version: 1.4.0
- * Value Range: bool, default=false
- * Example:
- * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be
- * faster in
- *       some tasks because an optimized path may be selected for
- * CUDNN_DATA_FLOAT
- *       and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
- *       reason we set it to false by default is that this mode may use scaled
- *       atomic integer reduction that may cause a numerical overflow for
- * certain
- *       input data range.
- */
-DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
-            "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
-            "batch_norm, default is False.");
-#endif
-
-#ifdef PADDLE_WITH_CUDA
-
-/**
- * NCCL related FLAG
- * Name: FLAGS_enable_cublas_tensor_op_math
- * Since Version:
- * Value Range:
- * Example:
- * Note: asynchronous nccl allreduce or synchronous issue:
- *       https://github.com/PaddlePaddle/Paddle/issues/15049
- *       If you want to change this default value, why?(gongwb)
- */
-DEFINE_bool(
-    sync_nccl_allreduce, true,
-    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
-    "after allreduce, this mode can get better performance in some scenarios.");
-#endif
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-/**
- * Distributed related FLAG
- * Name: FLAGS_communicator_max_merge_var_num
- * Since Version: 1.5.0
- * Value Range: int32, default=20
- * Example:
- * Note: The maximum number of gradients to be merged into a gradient and
- *       sent through the communicator. The trainer puts all the gradients
- *       into the queue, and then the communicator takes the gradients out
- *       of the queue and sends them after merging.
- */
-DEFINE_int32(communicator_max_merge_var_num, 20,
-             "max var num to merge and send");
-
-/**
- * Distributed related FLAG
- * Name: FLAGS_communicator_send_queue_size
- * Since Version: 1.5.0
- * Value Range: int32, default=20
- * Example:
- * Note: Size for each gradient queue. The trainer puts the gradient into
- *       the queue, and then the communicator takes it out of the queue and
- *       sends it out. When the communicator is slow, the queue may be full,
- *       and the trainer will be continuously blocked before the queue has
- *       space. It is used to avoid training much faster than communication,
- *       so that too many gradients are not sent out in time.
- */
-DEFINE_int32(communicator_send_queue_size, 20,
-             "queue size to recv gradient before send");
-#endif
-
-/**
- * Distributed related FLAG
- * Name: FLAGS_dist_threadpool_size
- * Since Version: 1.0.0
- * Value Range: int32, default=0
- * Example:
- * Note: Control the number of threads used for distributed modules.
- *       If it is not set, it is set to a hard thread.
- */
-DEFINE_int32(dist_threadpool_size, 0,
-             "number of threads used for distributed executed.");
-
-/**
- * Garbage collector related FLAG
- * Name: FLAGS_eager_delete_tensor_gb
- * Since Version: 1.0.0
- * Value Range: double, default=kDefaultEagerDeleteTensorGB
- * Example: FLAGS_eager_delete_tensor_gb=0.0, Release memory garbage once it is
- * no longer used.
- *          FLAGS_eager_delete_tensor_gb=1.0, Release memory garbage when
- * garbage occupies 1.0GB of memory.
- *          FLAGS_eager_delete_tensor_gb=-1.0, Disable garbage collection
- * policy.
- * Note: Represents whether a garbage collection strategy is used to optimize
- * network memory usage.
- *       It is recommended that users set FLAGS_eager_delete_tensor_gb=0.0 to
- *       enable garbage collection strategy when training large networks.
- */
-// Disable gc by default when inference library is built
-#ifdef PADDLE_ON_INFERENCE
-static const double kDefaultEagerDeleteTensorGB = -1;
-#else
-static const double kDefaultEagerDeleteTensorGB = 0;
-#endif
-
-DEFINE_double(
-    eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB,
-    "Memory size threshold (GB) when the garbage collector clear tensors."
-    "Disabled when this value is less than 0");
-
-/**
- * Memory related FLAG
- * Name: FLAGS_fast_eager_deletion_mode
- * Since Version: 1.3.0
- * Value Range: bool, default=true
- * Example:
- * Note: Whether to use fast garbage collection strategy.
- *       If not set, the GPU memory is released at the end of the CUDA kernel.
- *       Otherwise, the GPU memory will be released before the CUDA kernel
- *       has finished, which will make the garbage collection strategy faster.
- *       Only works when garbage collection strategy is enabled.
- */
-DEFINE_bool(fast_eager_deletion_mode, true,
-            "Fast eager deletion mode. If enabled, memory would release "
-            "immediately without waiting GPU kernel ends.");
-
-/**
- * Memory related FLAG
- * Name: FLAGS_memory_fraction_of_eager_deletion
- * Since Version: 1.4
- * Value Range: double [0.0, 1.0], default=1.0
- * Example:
- * Note: The percentage of memory size of garbage collection policy
- *       to release variables.
- *       If FLAGS_memory_fraction_of_eager_deletion = 1.0,
- *       all temporary variables in the network will be released.
- *       If FLAGS_memory_fraction_of_eager_deletion = 0.0,
- *       no temporary variables in the network are released.
- *       If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
- *       all temporary variables will be sorted in descending order
- *       according to their memory size, and only variables with the
- *       largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
- *       The flag is only valid when running parallel data compilers.
- */
-DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
-              "Fraction of eager deletion. If less than 1.0, all variables in "
-              "the program would be sorted according to its memory size, and "
-              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
-              "variables would be deleted.");
-
-/**
- * Allocator related FLAG
- * Name: FLAGS_allocator_strategy
- * Since Version: 1.2
- * Value Range: string, {naive_best_fit, auto_groth}, default=naive_best_fit
- * Example:
- * Note: Allocator policy for selecting Paddle Paddle.
- *       The allocator strategy is under development and the non-legacy
- *       allocator is not yet stable.
- */
-DEFINE_string(allocator_strategy, "naive_best_fit",
-              "The allocation strategy. naive_best_fit means the original best "
-              "fit allocator of Fluid. "
-              "auto_growth means the experimental auto-growth allocator. "
-              "Enum in [naive_best_fit, auto_growth].");
-
-/**
- * Memory related FLAG
- * Name: FLAGS_fraction_of_cpu_memory_to_use
- * Since Version: 0.12.0
- * Value Range: double, [0.0, 1.0], default=1
- * Example:
- * Note: Represents the proportion of allocated CPU memory blocks
- *       to the total memory size of the CPU. Future CPU memory usage
- *       will be allocated from this memory block. If the memory block does
- *       not have enough CUDA pinned memory, new memory blocks of the same
- *       size as the memory block will be allocated from the CUDA pinned
- *       request util the CPU does not have enough memory.
- */
-DEFINE_double(fraction_of_cpu_memory_to_use, 1,
-              "Default use 100% of CPU memory for PaddlePaddle,"
-              "reserve the rest for page tables, etc");
-
-/**
- * Memory related FLAG
- * Name: FLAGS_initial_cpu_memory_in_mb
- * Since Version: 0.14.0
- * Value Range: uint64, default=500 (MB)
- * Example:
- * Note: The CPU memory block size of the initial allocator in MB.
- *       The allocator takes the minimum values of
- *       FLAGS_initial_cpu_memory_in_mb and
- *       FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
- *       as memory block sizes.
- */
-DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
-              "Initial CPU memory for PaddlePaddle, in MD unit.");
-
-/**
- * Memory related FLAG
- * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
- * Since Version: 0.12.0
- * Value Range: double, [0.0, 1.0], default=0.5
- * Example:
- * Note: Represents the proportion of allocated CUDA pinned memory blocks
- *       to the total memory size of the CPU. Future CUDA pinned memory usage
- *       will be allocated from this memory block. If the memory block does
- *       not have enough CPU memory, new memory blocks of the same
- *       size as the memory block will be allocated from the CPU
- *       request util the CPU does not have enough memory.
- */
-DEFINE_double(
-    fraction_of_cuda_pinned_memory_to_use, 0.5,
-    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
-    "reserve the rest for page tables, etc");
-
-#ifdef PADDLE_WITH_CUDA
-
-/**
- * Memory related FLAG
- * Name: FLAGS_fraction_of_gpu_memory_to_use
- * Since Version: 1.2.0
- * Value Range: double, default=0.5 if win32, 0.92 else
- * Example:
- * Note: Represents the proportion of allocated memory blocks to the total
- * memory size
- *       of the GPU. Future memory usage will be allocated from this memory
- * block.
- *       If the memory block does not have enough GPU memory, new memory blocks
- * of
- *       the same size as the memory block will be allocated from the GPU
- * request
- *       until the GPU does not have enough memory.
- */
-
-#ifndef _WIN32
-constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
-#else
-// fraction_of_gpu_memory_to_use cannot be too high on windows,
-// since the win32 graphic sub-system can occupy some GPU memory
-// which may lead to insufficient memory left for paddle
-constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
-#endif
-DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
-              "Allocate a trunk of gpu memory that is this fraction of the "
-              "total gpu memory size. Future memory usage will be allocated "
-              "from the trunk. If the trunk doesn't have enough gpu memory, "
-              "additional trunks of the same size will be requested from gpu "
-              "until the gpu has no memory left for another trunk.");
-
-/**
- * Memory related FLAG
- * Name: FLAGS_initial_gpu_memory_in_mb
- * Since Version: 1.4.0
- * Value Range: uint64, default=0 (MB)
- * Example:
- * Note: Allocate a specified size of GPU memory block. Later memory usage
- *       will be allocated from that memory block. If the memory block does not
- *       have enough GPU memory, the memory block with the size
- *       FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
- *       the GPU has no remaining memory.
- */
-DEFINE_uint64(
-    initial_gpu_memory_in_mb, 0ul,
-    "Allocate a trunk of gpu memory whose byte size is specified by "
-    "the flag. Future memory usage will be allocated from the "
-    "trunk. If the trunk doesn't have enough gpu memory, additional "
-    "trunks of the gpu memory will be requested from gpu with size "
-    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
-    "no memory left for the additional trunk. Note: if you set this "
-    "flag, the memory size set by "
-    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
-    "flag. If you don't set this flag, PaddlePaddle will use "
-    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
-
-/**
- * Memory related FLAG
- * Name: FLAGS_reallocate_gpu_memory_in_mb
- * Since Version: 1.4.0
- * Value Range: uint64, default=0 (MB)
- * Example:
- * Note: If the allocated GPU memory blocks are exhausted,
- *       additional GPU memory blocks are reallocated
- */
-DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
-              "If this flag is set, Paddle will reallocate the gpu memory with "
-              "size specified by this flag. Else Paddle will reallocate by "
-              "FLAGS_fraction_of_gpu_memory_to_use");
-
-#endif
-
-/**
- * Scope related FLAG
- * Name: local_exe_sub_scope_limit
- * Since Version: 1.6.0
- * Value Range: double, default=256 (MB)
- * Example:
- * Note:
- */
-DEFINE_double(local_exe_sub_scope_limit, 256.0,  // MBytes
-              "The memory up limit of sub-scopes of local execution scope for "
-              "each CUDAPlace. If you don't need to limit the memory, "
-              "you should set FLAGS_local_exe_sub_scope_limit=-1. "
-              "The default value is 256 MBytes.");
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
deleted file mode 100644
index c203f4e04a28452807a42bbdaf75e89977772a04..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/float16.h
+++ /dev/null
@@ -1,1086 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <limits>
-
-#ifdef PADDLE_WITH_CUDA
-#include <cuda.h>
-#endif  // PADDLE_WITH_CUDA
-
-#ifdef __GNUC__
-#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
-#else
-#define PADDLE_GNUC_VER 0
-#endif  // __GNUC__
-
-#ifdef __clang__
-#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__)
-#else
-#define PADDLE_CLANG_VER 0
-#endif  // __clang__
-
-#if defined(__CUDACC__) && CUDA_VERSION >= 7050
-#define PADDLE_CUDA_FP16
-#include <cuda_fp16.h>
-#endif
-
-#if defined(__arm__) || defined(__aarch64__)
-#define PADDLE_ARM
-#endif
-
-#if defined(__ARM_NEON) || defined(__ARM_NEON__)
-#define PADDLE_NEON
-#include <arm_neon.h>
-#endif
-
-#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
-    (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37)
-#define PADDLE_WITH_NATIVE_FP16
-#endif
-
-#ifndef PADDLE_ARM
-#include <immintrin.h>
-#endif  // PADDLE_ARM
-
-#if !defined(_WIN32)
-#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
-#else
-#define PADDLE_ALIGN(x) __declspec(align(x))
-#endif
-
-namespace paddle {
-namespace platform {
-
-// Forward declare float16 for eigen.h
-struct float16;
-
-}  // namespace platform
-}  // namespace paddle
-
-#include "paddle/fluid/platform/hostdevice.h"
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace paddle {
-namespace platform {
-
-// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
-// and aligned at least on a 2-byte boundary, which leads to efficient
-// memory access of float16 struct and also makes float16 compatible
-// with CUDA half, ARM float16_t, and Eigen::half data types.
-struct PADDLE_ALIGN(2) float16 {
- public:
-  uint16_t x;
-
-  // The following defaulted special class member functions
-  // are added to make float16 pass the std::is_trivial test
-  float16() = default;
-  float16(const float16& o) = default;
-  float16& operator=(const float16& o) = default;
-  float16(float16&& o) = default;
-  float16& operator=(float16&& o) = default;
-  ~float16() = default;
-
-// Constructors
-#ifdef PADDLE_CUDA_FP16
-  HOSTDEVICE inline explicit float16(const half& h) {
-#if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
-#else
-    x = h.x;
-#endif  // CUDA_VERSION >= 9000
-  }
-#endif  // PADDLE_CUDA_FP16
-
-  HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {}
-
-#ifdef PADDLE_WITH_NATIVE_FP16
-  // __fp16 is a native half precision data type for arm cpu,
-  // float16_t is an alias for __fp16
-  HOSTDEVICE inline explicit float16(const float16_t& h) {
-    x = *reinterpret_cast<const uint16_t*>(&h);
-  }
-#endif
-
-  HOSTDEVICE inline explicit float16(float val) {
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-    half tmp = __float2half(val);
-    x = *reinterpret_cast<uint16_t*>(&tmp);
-
-#elif defined(PADDLE_WITH_NATIVE_FP16)
-    float32x4_t tmp = vld1q_dup_f32(&val);
-    float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
-    x = *reinterpret_cast<uint16_t*>(&res);
-
-#elif defined(__F16C__)
-    x = _cvtss_sh(val, 0);
-
-#else
-    // Conversion routine adapted from
-    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
-    Bits v, s;
-    v.f = val;
-    uint32_t sign = v.si & sigN;
-    v.si ^= sign;
-    sign >>= shiftSign;  // logical shift
-    s.si = mulN;
-    s.si = s.f * v.f;  // correct subnormals
-    v.si ^= (s.si ^ v.si) & -(minN > v.si);
-    v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
-    v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
-    v.ui >>= shift;  // logical shift
-    v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
-    v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
-    x = v.ui | sign;
-
-#endif
-  }
-
-  HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
-
-  template <class T>
-  HOSTDEVICE inline explicit float16(const T& val)
-      : x(float16(static_cast<float>(val)).x) {}
-
-// Assignment operators
-#ifdef PADDLE_CUDA_FP16
-  HOSTDEVICE inline float16& operator=(const half& rhs) {
-#if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
-#else
-    x = rhs.x;
-#endif
-    return *this;
-  }
-#endif
-
-  HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
-    x = rhs.x;
-    return *this;
-  }
-
-#ifdef PADDLE_WITH_NATIVE_FP16
-  HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
-    x = *reinterpret_cast<const uint16_t*>(&rhs);
-    return *this;
-  }
-#endif
-
-  HOSTDEVICE inline float16& operator=(bool b) {
-    x = b ? 0x3c00 : 0;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int8_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint8_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int16_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint16_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int32_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint32_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int64_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint64_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(float val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(double val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-// Conversion opertors
-#ifdef PADDLE_CUDA_FP16
-  HOSTDEVICE inline explicit operator half() const {
-#if CUDA_VERSION >= 9000
-    __half_raw h;
-    h.x = x;
-    return half(h);
-#else
-    half h;
-    h.x = x;
-    return h;
-#endif  // CUDA_VERSION >= 9000
-  }
-#endif  // PADDLE_CUDA_FP16
-
-  HOSTDEVICE inline explicit operator Eigen::half() const {
-    Eigen::half h;
-    h.x = x;
-    return h;
-  }
-
-#ifdef PADDLE_WITH_NATIVE_FP16
-  HOSTDEVICE inline explicit operator float16_t() const {
-    return *reinterpret_cast<const float16_t*>(this);
-  }
-#endif
-
-  HOSTDEVICE inline explicit operator float() const {
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-    half tmp = *reinterpret_cast<const half*>(this);
-    return __half2float(tmp);
-
-#elif defined(PADDLE_WITH_NATIVE_FP16)
-    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
-    return vgetq_lane_f32(vcvt_f32_f16(res), 0);
-
-#elif defined(__F16C__)
-    return _cvtsh_ss(this->x);
-
-#else
-    // Conversion routine adapted from
-    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
-    Bits v;
-    v.ui = this->x;
-    int32_t sign = v.si & sigC;
-    v.si ^= sign;
-    sign <<= shiftSign;
-    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
-    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
-    Bits s;
-    s.si = mulC;
-    s.f *= v.si;
-    int32_t mask = -(norC > v.si);
-    v.si <<= shift;
-    v.si ^= (s.si ^ v.si) & mask;
-    v.si |= sign;
-    return v.f;
-
-#endif
-  }
-
-  HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
-
-  HOSTDEVICE inline explicit operator int8_t() const {
-    return static_cast<int8_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator uint8_t() const {
-    return static_cast<uint8_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator int16_t() const {
-    return static_cast<int16_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator uint16_t() const {
-    return static_cast<uint16_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator int32_t() const {
-    return static_cast<int32_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator uint32_t() const {
-    return static_cast<uint32_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator int64_t() const {
-    return static_cast<int64_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator uint64_t() const {
-    return static_cast<uint64_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator double() const {
-    return static_cast<double>(static_cast<float>(*this));
-  }
-
- private:
-  union Bits {
-    float f;
-    int32_t si;
-    uint32_t ui;
-  };
-
-  static const int shift = 13;
-  static const int shiftSign = 16;
-
-  static const int32_t infN = 0x7F800000;
-  static const int32_t maxN = 0x477FE000;  // max flt16 as flt32
-  static const int32_t minN = 0x38800000;  // min flt16 normal as flt32
-  static const int32_t sigN = 0x80000000;  // sign bit
-
-  static constexpr int32_t infC = infN >> shift;
-  static constexpr int32_t nanN = (infC + 1)
-                                  << shift;  // minimum flt16 nan as float32
-  static constexpr int32_t maxC = maxN >> shift;
-  static constexpr int32_t minC = minN >> shift;
-  static constexpr int32_t sigC = sigN >> shiftSign;
-
-  static const int32_t mulN = 0x52000000;  // (1 << 23) / minN
-  static const int32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
-  static const int32_t subC = 0x003FF;     // max flt32 subnormal downshifted
-  static const int32_t norC = 0x00400;     // min flt32 normal downshifted
-
-  static constexpr int32_t maxD = infC - maxC - 1;
-  static constexpr int32_t minD = minC - subC - 1;
-};
-
-// Arithmetic operators on GPU
-// CUDA 9.0 provides built-in arithmetic operators for half while
-// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
-// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
-// CUDA 9.0 regarding the half data type.
-#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
-
-DEVICE inline half operator+(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hadd(a, b);
-#else
-  float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
-  return half(float16(res));
-#endif
-}
-
-DEVICE inline half operator-(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hsub(a, b);
-#else
-  float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
-  return half(float16(res));
-#endif
-}
-
-DEVICE inline half operator*(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hmul(a, b);
-#else
-  float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
-  return half(float16(res));
-#endif
-}
-
-DEVICE inline half operator/(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-  float num = __half2float(a);
-  float denom = __half2float(b);
-  return __float2half(num / denom);
-#else
-  float res = static_cast<float>(float16(a)) / static_cast<float>(float16(b));
-  return half(float16(res));
-#endif
-}
-
-DEVICE inline half operator-(const half& a) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hneg(a);
-#else
-  float res = -static_cast<float>(float16(a));
-  return half(float16(res));
-#endif
-}
-
-DEVICE inline half& operator+=(half& a, const half& b) {  // NOLINT
-  a = a + b;
-  return a;
-}
-
-DEVICE inline half& operator-=(half& a, const half& b) {  // NOLINT
-  a = a - b;
-  return a;
-}
-
-DEVICE inline half& operator*=(half& a, const half& b) {  // NOLINT
-  a = a * b;
-  return a;
-}
-
-DEVICE inline half& operator/=(half& a, const half& b) {  // NOLINT
-  a = a / b;
-  return a;
-}
-
-DEVICE inline bool operator==(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __heq(a, b);
-#else
-  return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
-#endif
-}
-
-DEVICE inline bool operator!=(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hne(a, b);
-#else
-  return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
-#endif
-}
-
-DEVICE inline bool operator<(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hlt(a, b);
-#else
-  return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
-#endif
-}
-
-DEVICE inline bool operator<=(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hle(a, b);
-#else
-  return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
-#endif
-}
-
-DEVICE inline bool operator>(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hgt(a, b);
-#else
-  return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
-#endif
-}
-
-DEVICE inline bool operator>=(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hge(a, b);
-#else
-  return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
-#endif
-}
-
-#endif  // PADDLE_CUDA_FP16
-
-// Arithmetic operators for float16 on GPU
-#if defined(PADDLE_CUDA_FP16)
-HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hadd(half(a), half(b)));
-#else
-  return float16(static_cast<float>(a) + static_cast<float>(b));
-#endif
-}
-
-HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hsub(half(a), half(b)));
-#else
-  return float16(static_cast<float>(a) - static_cast<float>(b));
-#endif
-}
-
-HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hmul(half(a), half(b)));
-#else
-  return float16(static_cast<float>(a) * static_cast<float>(b));
-#endif
-}
-
-HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-  // TODO(kexinzhao): check which cuda version starts to support __hdiv
-  float num = __half2float(half(a));
-  float denom = __half2float(half(b));
-  return float16(num / denom);
-#else
-  return float16(static_cast<float>(a) / static_cast<float>(b));
-#endif
-}
-
-HOSTDEVICE inline float16 operator-(const float16& a) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hneg(half(a)));
-#else
-  float16 res;
-  res.x = a.x ^ 0x8000;
-  return res;
-#endif
-}
-
-HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {  // NOLINT
-  a = a + b;
-  return a;
-}
-
-HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {  // NOLINT
-  a = a - b;
-  return a;
-}
-
-HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {  // NOLINT
-  a = a * b;
-  return a;
-}
-
-HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
-  a = a / b;
-  return a;
-}
-
-HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __heq(half(a), half(b));
-#else
-  return static_cast<float>(a) == static_cast<float>(b);
-#endif
-}
-
-HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hne(half(a), half(b));
-#else
-  return static_cast<float>(a) != static_cast<float>(b);
-#endif
-}
-
-HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hlt(half(a), half(b));
-#else
-  return static_cast<float>(a) < static_cast<float>(b);
-#endif
-}
-
-HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hle(half(a), half(b));
-#else
-  return static_cast<float>(a) <= static_cast<float>(b);
-#endif
-}
-
-HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hgt(half(a), half(b));
-#else
-  return static_cast<float>(a) > static_cast<float>(b);
-#endif
-}
-
-HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hge(half(a), half(b));
-#else
-  return static_cast<float>(a) >= static_cast<float>(b);
-#endif
-}
-
-// Arithmetic operators for float16 on ARMv8.2-A CPU
-#elif defined(PADDLE_WITH_NATIVE_FP16)
-inline float16 operator+(const float16& a, const float16& b) {
-  float16 res;
-  asm volatile(
-      "ld1 {v0.h}[0], [%[a_ptr]]\n"
-      "ld1 {v1.h}[0], [%[b_ptr]]\n"
-      "fadd h0, h0, h1\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
-      [res_ptr] "r"(&(res.x))
-      :  // clobbers
-      "memory", "v0", "v1");
-  return res;
-}
-
-inline float16 operator-(const float16& a, const float16& b) {
-  float16 res;
-  asm volatile(
-      "ld1 {v0.h}[0], [%[a_ptr]]\n"
-      "ld1 {v1.h}[0], [%[b_ptr]]\n"
-      "fsub h0, h0, h1\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
-      [res_ptr] "r"(&(res.x))
-      :  // clobbers
-      "memory", "v0", "v1");
-  return res;
-}
-
-inline float16 operator*(const float16& a, const float16& b) {
-  float16 res;
-  asm volatile(
-      "ld1 {v0.h}[0], [%[a_ptr]]\n"
-      "ld1 {v1.h}[0], [%[b_ptr]]\n"
-      "fmul h0, h0, h1\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
-      [res_ptr] "r"(&(res.x))
-      :  // clobbers
-      "memory", "v0", "v1");
-  return res;
-}
-
-inline float16 operator/(const float16& a, const float16& b) {
-  float16 res;
-  asm volatile(
-      "ld1 {v0.h}[0], [%[a_ptr]]\n"
-      "ld1 {v1.h}[0], [%[b_ptr]]\n"
-      "fdiv h0, h0, h1\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
-      [res_ptr] "r"(&(res.x))
-      :  // clobbers
-      "memory", "v0", "v1");
-  return res;
-}
-
-inline float16 operator-(const float16& a) {
-  float16 res;
-  asm volatile(
-      "ld1 {v0.h}[0], [%[a_ptr]]\n"
-      "fneg h0, h0\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)),
-      [res_ptr] "r"(&(res.x))
-      :  // clobbers
-      "memory", "v0");
-  return res;
-}
-
-inline float16& operator+=(float16& a, const float16& b) {  // NOLINT
-  a = a + b;
-  return a;
-}
-
-inline float16& operator-=(float16& a, const float16& b) {  // NOLINT
-  a = a - b;
-  return a;
-}
-
-inline float16& operator*=(float16& a, const float16& b) {  // NOLINT
-  a = a * b;
-  return a;
-}
-
-inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
-  a = a / b;
-  return a;
-}
-
-inline bool operator==(const float16& a, const float16& b) {
-  uint16_t res;
-  asm volatile(
-      "ld1 {v0.h}[0], [%[a_ptr]]\n"
-      "ld1 {v1.h}[0], [%[b_ptr]]\n"
-      "fcmeq h0, h0, h1\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
-      [res_ptr] "r"(&res)
-      :  // clobbers
-      "memory", "v0", "v1");
-  return (res & 0xffff) != 0;
-}
-
-inline bool operator!=(const float16& a, const float16& b) { return !(a == b); }
-
-inline bool operator<(const float16& a, const float16& b) {
-  uint16_t res;
-  asm volatile(
-      "ld1 {v1.h}[0], [%[a_ptr]]\n"
-      "ld1 {v0.h}[0], [%[b_ptr]]\n"
-      "fcmgt h0, h0, h1\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
-      [res_ptr] "r"(&res)
-      :  // clobbers
-      "memory", "v0", "v1");
-  return (res & 0xffff) != 0;
-}
-
-inline bool operator<=(const float16& a, const float16& b) {
-  uint16_t res;
-  asm volatile(
-      "ld1 {v1.h}[0], [%[a_ptr]]\n"
-      "ld1 {v0.h}[0], [%[b_ptr]]\n"
-      "fcmge h0, h0, h1\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
-      [res_ptr] "r"(&res)
-      :  // clobbers
-      "memory", "v0", "v1");
-  return (res & 0xffff) != 0;
-}
-
-inline bool operator>(const float16& a, const float16& b) {
-  uint16_t res;
-  asm volatile(
-      "ld1 {v0.h}[0], [%[a_ptr]]\n"
-      "ld1 {v1.h}[0], [%[b_ptr]]\n"
-      "fcmgt h0, h0, h1\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
-      [res_ptr] "r"(&res)
-      :  // clobbers
-      "memory", "v0", "v1");
-  return (res & 0xffff) != 0;
-}
-
-inline bool operator>=(const float16& a, const float16& b) {
-  uint16_t res;
-  asm volatile(
-      "ld1 {v0.h}[0], [%[a_ptr]]\n"
-      "ld1 {v1.h}[0], [%[b_ptr]]\n"
-      "fcmge h0, h0, h1\n"
-      "st1 {v0.h}[0], [%[res_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
-      [res_ptr] "r"(&res)
-      :  // clobbers
-      "memory", "v0", "v1");
-  return (res & 0xffff) != 0;
-}
-
-// Arithmetic operators for float16, software emulated on other CPU
-#else
-inline float16 operator+(const float16& a, const float16& b) {
-  return float16(static_cast<float>(a) + static_cast<float>(b));
-}
-
-inline float16 operator-(const float16& a, const float16& b) {
-  return float16(static_cast<float>(a) - static_cast<float>(b));
-}
-
-inline float16 operator*(const float16& a, const float16& b) {
-  return float16(static_cast<float>(a) * static_cast<float>(b));
-}
-
-inline float16 operator/(const float16& a, const float16& b) {
-  return float16(static_cast<float>(a) / static_cast<float>(b));
-}
-
-inline float16 operator-(const float16& a) {
-  float16 res;
-  res.x = a.x ^ 0x8000;
-  return res;
-}
-
-inline float16& operator+=(float16& a, const float16& b) {  // NOLINT
-  a = float16(static_cast<float>(a) + static_cast<float>(b));
-  return a;
-}
-
-inline float16& operator-=(float16& a, const float16& b) {  // NOLINT
-  a = float16(static_cast<float>(a) - static_cast<float>(b));
-  return a;
-}
-
-inline float16& operator*=(float16& a, const float16& b) {  // NOLINT
-  a = float16(static_cast<float>(a) * static_cast<float>(b));
-  return a;
-}
-
-inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
-  a = float16(static_cast<float>(a) / static_cast<float>(b));
-  return a;
-}
-
-inline bool operator==(const float16& a, const float16& b) {
-  return static_cast<float>(a) == static_cast<float>(b);
-}
-
-inline bool operator!=(const float16& a, const float16& b) {
-  return static_cast<float>(a) != static_cast<float>(b);
-}
-
-inline bool operator<(const float16& a, const float16& b) {
-  return static_cast<float>(a) < static_cast<float>(b);
-}
-
-inline bool operator<=(const float16& a, const float16& b) {
-  return static_cast<float>(a) <= static_cast<float>(b);
-}
-
-inline bool operator>(const float16& a, const float16& b) {
-  return static_cast<float>(a) > static_cast<float>(b);
-}
-
-inline bool operator>=(const float16& a, const float16& b) {
-  return static_cast<float>(a) >= static_cast<float>(b);
-}
-#endif
-
-HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) {
-  float16 res;
-  res.x = a;
-  return res;
-}
-
-HOSTDEVICE inline bool(isnan)(const float16& a) {
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hisnan(half(a));
-#else
-  return (a.x & 0x7fff) > 0x7c00;
-#endif
-}
-
-HOSTDEVICE inline bool(isinf)(const float16& a) {
-  return (a.x & 0x7fff) == 0x7c00;
-}
-
-HOSTDEVICE inline bool(isfinite)(const float16& a) {
-  return !((isnan)(a)) && !((isinf)(a));
-}
-
-inline std::ostream& operator<<(std::ostream& os, const float16& a) {
-  os << static_cast<float>(a);
-  return os;
-}
-
-}  // namespace platform
-}  // namespace paddle
-
-namespace std {
-
-// Override the std::is_pod::value for float16
-// The reason is that different compilers implemented std::is_pod based on
-// different C++ standards. float16 class is a plain old data in C++11 given
-// that it is both trivial and standard_layout.
-// However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is
-// more restricted in that you cannot provide any customized
-// constructor in float16. Hence, we override is_pod here following C++11
-// so that .cu files can be successfully compiled by nvcc.
-template <>
-struct is_pod<paddle::platform::float16> {
-  static const bool value =
-      is_trivial<paddle::platform::float16>::value &&
-      is_standard_layout<paddle::platform::float16>::value;
-};
-
-template <>
-struct is_floating_point<paddle::platform::float16>
-    : std::integral_constant<
-          bool, std::is_same<paddle::platform::float16,
-                             typename std::remove_cv<
-                                 paddle::platform::float16>::type>::value> {};
-template <>
-struct is_signed<paddle::platform::float16> {
-  static const bool value = true;
-};
-
-template <>
-struct is_unsigned<paddle::platform::float16> {
-  static const bool value = false;
-};
-
-inline bool isnan(const paddle::platform::float16& a) {
-  return paddle::platform::isnan(a);
-}
-
-inline bool isinf(const paddle::platform::float16& a) {
-  return paddle::platform::isinf(a);
-}
-
-template <>
-struct numeric_limits<paddle::platform::float16> {
-  static const bool is_specialized = true;
-  static const bool is_signed = true;
-  static const bool is_integer = false;
-  static const bool is_exact = false;
-  static const bool has_infinity = true;
-  static const bool has_quiet_NaN = true;
-  static const bool has_signaling_NaN = true;
-  static const float_denorm_style has_denorm = denorm_present;
-  static const bool has_denorm_loss = false;
-  static const std::float_round_style round_style = std::round_to_nearest;
-  static const bool is_iec559 = false;
-  static const bool is_bounded = false;
-  static const bool is_modulo = false;
-  static const int digits = 11;
-  static const int digits10 = 3;
-  static const int max_digits10 = 5;
-  static const int radix = 2;
-  static const int min_exponent = -13;
-  static const int min_exponent10 = -4;
-  static const int max_exponent = 16;
-  static const int max_exponent10 = 4;
-  static const bool traps = true;
-  static const bool tinyness_before = false;
-
-  static paddle::platform::float16(min)() {
-    return paddle::platform::raw_uint16_to_float16(0x400);
-  }
-  static paddle::platform::float16 lowest() {
-    return paddle::platform::raw_uint16_to_float16(0xfbff);
-  }
-  static paddle::platform::float16(max)() {
-    return paddle::platform::raw_uint16_to_float16(0x7bff);
-  }
-  static paddle::platform::float16 epsilon() {
-    return paddle::platform::raw_uint16_to_float16(0x0800);
-  }
-  static paddle::platform::float16 round_error() {
-    return paddle::platform::float16(0.5);
-  }
-  static paddle::platform::float16 infinity() {
-    return paddle::platform::raw_uint16_to_float16(0x7c00);
-  }
-  static paddle::platform::float16 quiet_NaN() {
-    return paddle::platform::raw_uint16_to_float16(0x7e00);
-  }
-  static paddle::platform::float16 signaling_NaN() {
-    return paddle::platform::raw_uint16_to_float16(0x7e00);
-  }
-  static paddle::platform::float16 denorm_min() {
-    return paddle::platform::raw_uint16_to_float16(0x1);
-  }
-};
-
-}  // namespace std
-
-namespace Eigen {
-
-using float16 = paddle::platform::float16;
-
-template <>
-struct NumTraits<float16> : GenericNumTraits<float16> {
-  enum {
-    IsSigned = true,
-    IsInteger = false,
-    IsComplex = false,
-    RequireInitialization = false
-  };
-
-  HOSTDEVICE static inline float16 epsilon() {
-    return paddle::platform::raw_uint16_to_float16(0x0800);
-  }
-  HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); }
-  HOSTDEVICE static inline float16 highest() {
-    return paddle::platform::raw_uint16_to_float16(0x7bff);
-  }
-  HOSTDEVICE static inline float16 lowest() {
-    return paddle::platform::raw_uint16_to_float16(0xfbff);
-  }
-  HOSTDEVICE static inline float16 infinity() {
-    return paddle::platform::raw_uint16_to_float16(0x7c00);
-  }
-  HOSTDEVICE static inline float16 quiet_NaN() {
-    return paddle::platform::raw_uint16_to_float16(0x7c01);
-  }
-};
-
-namespace numext {
-
-template <>
-HOSTDEVICE inline bool(isnan)(const float16& a) {
-  return (paddle::platform::isnan)(a);
-}
-
-template <>
-HOSTDEVICE inline bool(isinf)(const float16& a) {
-  return (paddle::platform::isinf)(a);
-}
-
-template <>
-HOSTDEVICE inline bool(isfinite)(const float16& a) {
-  return (paddle::platform::isfinite)(a);
-}
-
-template <>
-HOSTDEVICE inline float16 exp(const float16& a) {
-  return float16(::expf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 erf(const float16& a) {
-  return float16(::erff(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 log(const float16& a) {
-  return float16(::logf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 tanh(const float16& a) {
-  return float16(::tanhf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 sqrt(const float16& a) {
-  return float16(::sqrtf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 ceil(const float16& a) {
-  return float16(::ceilf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 floor(const float16& a) {
-  return float16(::floorf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 round(const float16& a) {
-  return float16(::roundf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 pow(const float16& a, const float16& b) {
-  return float16(::powf(static_cast<float>(a), static_cast<float>(b)));
-}
-
-template <>
-HOSTDEVICE inline float16 abs(const float16& a) {
-  return float16(::fabs(static_cast<float>(a)));
-}
-
-}  // namespace numext
-
-}  // namespace Eigen
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
deleted file mode 100644
index f411c3863ffb39a943a863bc2b2ae0f327d51fb9..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/float16_test.cc
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/platform/float16.h"
-
-#include <vector>
-
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/init.h"
-
-namespace paddle {
-namespace platform {
-
-TEST(float16, conversion_cpu) {
-  // Explicit conversion from Eigen::half
-  EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00);
-  EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800);
-  EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555);
-  EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000);
-  EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000);
-  EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff);
-  EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00);
-
-  // Conversion from float
-  EXPECT_EQ(float16(1.0f).x, 0x3c00);
-  EXPECT_EQ(float16(0.5f).x, 0x3800);
-  EXPECT_EQ(float16(0.33333f).x, 0x3555);
-  EXPECT_EQ(float16(0.0f).x, 0x0000);
-  EXPECT_EQ(float16(-0.0f).x, 0x8000);
-  EXPECT_EQ(float16(65504.0f).x, 0x7bff);
-  EXPECT_EQ(float16(65536.0f).x, 0x7c00);
-
-  // Conversion from double
-  EXPECT_EQ(float16(1.0).x, 0x3c00);
-  EXPECT_EQ(float16(0.5).x, 0x3800);
-  EXPECT_EQ(float16(0.33333).x, 0x3555);
-  EXPECT_EQ(float16(0.0).x, 0x0000);
-  EXPECT_EQ(float16(-0.0).x, 0x8000);
-  EXPECT_EQ(float16(65504.0).x, 0x7bff);
-  EXPECT_EQ(float16(65536.0).x, 0x7c00);
-
-  // Conversion from int
-  EXPECT_EQ(float16(-1).x, 0xbc00);
-  EXPECT_EQ(float16(0).x, 0x0000);
-  EXPECT_EQ(float16(1).x, 0x3c00);
-  EXPECT_EQ(float16(2).x, 0x4000);
-  EXPECT_EQ(float16(3).x, 0x4200);
-
-  // Conversion from bool
-  EXPECT_EQ(float16(true).x, 0x3c00);
-  EXPECT_EQ(float16(false).x, 0x0000);
-
-  // Assignment operator
-  float16 v_assign;
-  v_assign = float16(0);
-  EXPECT_EQ(v_assign.x, 0x0000);
-  v_assign = Eigen::half(1.0f);
-  EXPECT_EQ(v_assign.x, 0x3c00);
-  v_assign = 0.5f;
-  EXPECT_EQ(v_assign.x, 0x3800);
-  v_assign = 0.33333;
-  EXPECT_EQ(v_assign.x, 0x3555);
-  v_assign = -1;
-  EXPECT_EQ(v_assign.x, 0xbc00);
-  v_assign = true;
-  EXPECT_EQ(v_assign.x, 0x3c00);
-
-  // Conversion operator
-  EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
-  EXPECT_EQ(static_cast<float>(float16(0.5f)), 0.5f);
-  EXPECT_NEAR(static_cast<double>(float16(0.33333)), 0.33333, 0.0001);
-  EXPECT_EQ(static_cast<int>(float16(-1)), -1);
-  EXPECT_EQ(static_cast<bool>(float16(true)), true);
-}
-
-TEST(float16, arithmetic_cpu) {
-  EXPECT_EQ(static_cast<float>(float16(1) + float16(1)), 2);
-  EXPECT_EQ(static_cast<float>(float16(5) + float16(-5)), 0);
-  EXPECT_NEAR(static_cast<float>(float16(0.33333f) + float16(0.66667f)), 1.0f,
-              0.001);
-  EXPECT_EQ(static_cast<float>(float16(3) - float16(5)), -2);
-  EXPECT_NEAR(static_cast<float>(float16(0.66667f) - float16(0.33333f)),
-              0.33334f, 0.001);
-  EXPECT_NEAR(static_cast<float>(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
-  EXPECT_NEAR(static_cast<float>(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
-  EXPECT_NEAR(static_cast<float>(float16(2.0f) / float16(3.0f)), 0.66667f,
-              0.001);
-  EXPECT_EQ(static_cast<float>(float16(1.0f) / float16(2.0f)), 0.5f);
-  EXPECT_EQ(static_cast<float>(-float16(512.0f)), -512.0f);
-  EXPECT_EQ(static_cast<float>(-float16(-512.0f)), 512.0f);
-}
-
-TEST(float16, comparison_cpu) {
-  EXPECT_TRUE(float16(1.0f) == float16(1.0f));
-  EXPECT_FALSE(float16(-1.0f) == float16(-0.5f));
-  EXPECT_TRUE(float16(1.0f) != float16(0.5f));
-  EXPECT_FALSE(float16(-1.0f) != float16(-1.0f));
-  EXPECT_TRUE(float16(1.0f) < float16(2.0f));
-  EXPECT_FALSE(float16(-1.0f) < float16(-1.0f));
-  EXPECT_TRUE(float16(1.0f) <= float16(1.0f));
-  EXPECT_TRUE(float16(2.0f) > float16(1.0f));
-  EXPECT_FALSE(float16(-2.0f) > float16(-2.0f));
-  EXPECT_TRUE(float16(2.0f) >= float16(2.0f));
-
-  EXPECT_TRUE(float16(0.0f) == float16(-0.0f));
-  EXPECT_TRUE(float16(0.0f) <= float16(-0.0f));
-  EXPECT_TRUE(float16(0.0f) >= float16(-0.0f));
-  EXPECT_FALSE(float16(0.0f) < float16(-0.0f));
-  EXPECT_FALSE(float16(-0.0f) < float16(0.0f));
-  EXPECT_FALSE(float16(0.0f) > float16(-0.0f));
-  EXPECT_FALSE(float16(-0.0f) > float16(0.0f));
-}
-
-TEST(float16, lod_tensor_cpu) {
-  framework::LoDTensor lod_tensor;
-
-  std::vector<float16> input_data = {float16(1.0f), float16(0.5f),
-                                     float16(0.33333f), float16(0.0f)};
-  EXPECT_EQ(input_data[0].x, 0x3c00);
-  EXPECT_EQ(input_data[1].x, 0x3800);
-  EXPECT_EQ(input_data[2].x, 0x3555);
-  EXPECT_EQ(input_data[3].x, 0x0000);
-
-  lod_tensor.Resize({4, 1});
-  lod_tensor.set_lod(framework::LoD({{0, 2, 4}}));
-  float16* data_ptr = lod_tensor.mutable_data<float16>(CPUPlace());
-
-  EXPECT_NE(data_ptr, nullptr);
-  EXPECT_EQ(input_data.size(), static_cast<size_t>(lod_tensor.numel()));
-  for (size_t i = 0; i < input_data.size(); ++i) {
-    data_ptr[i] = input_data[i];
-    EXPECT_EQ(data_ptr[i].x, input_data[i].x);
-  }
-}
-
-TEST(float16, floating) {
-  // compile time assert.
-  PADDLE_ENFORCE_EQ(std::is_floating_point<float16>::value, true);
-}
-
-TEST(float16, print) {
-  float16 a = float16(1.0f);
-  std::cout << a << std::endl;
-}
-
-// CPU test
-TEST(float16, isinf) {
-  float16 a;
-  a.x = 0x7c00;
-  float16 b = float16(INFINITY);
-  float16 c = static_cast<float16>(INFINITY);
-  EXPECT_EQ(std::isinf(a), true);
-  EXPECT_EQ(std::isinf(b), true);
-  EXPECT_EQ(std::isinf(c), true);
-}
-
-TEST(float16, isnan) {
-  float16 a;
-  a.x = 0x7fff;
-  float16 b = float16(NAN);
-  float16 c = static_cast<float16>(NAN);
-  EXPECT_EQ(std::isnan(a), true);
-  EXPECT_EQ(std::isnan(b), true);
-  EXPECT_EQ(std::isnan(c), true);
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
deleted file mode 100644
index bf2038419cb064c508af01a7e0cd085df9ed6d6d..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/float16_test.cu
+++ /dev/null
@@ -1,318 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/float16.h"
-
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include <bitset>
-#include <iostream>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#define ARITHMETIC_KERNEL(op_type, sign)                                 \
-  __global__ void op_type(const half* in1, const half* in2, half* out) { \
-    out[0] = in1[0] sign in2[0];                                         \
-  }
-
-#define COMPOUND_KERNEL(op_type, sign) \
-  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
-
-#define COMPARISON_KERNEL(op_type, sign)                                 \
-  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
-    out[0] = in1[0] sign in2[0];                                         \
-  }
-
-#define ARITHMETIC_KERNEL_LAUNCH(op_type)                     \
-  void Test##op_type(float v_in1, float v_in2, float v_out) { \
-    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
-    half *in1, *in2, *out;                                    \
-    half *d_in1, *d_in2, *d_out;                              \
-    int size = sizeof(half);                                  \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);       \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);       \
-    cudaMalloc(reinterpret_cast<void**>(&d_out), size);       \
-    in1 = reinterpret_cast<half*>(malloc(size));              \
-    in2 = reinterpret_cast<half*>(malloc(size));              \
-    out = reinterpret_cast<half*>(malloc(size));              \
-    in1[0] = half(float16(v_in1));                            \
-    in2[0] = half(float16(v_in2));                            \
-    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
-    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
-    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                   \
-    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);     \
-    EXPECT_EQ(static_cast<float>(float16(out[0])), v_out);    \
-    free(in1);                                                \
-    free(in2);                                                \
-    free(out);                                                \
-    cudaFree(d_in1);                                          \
-    cudaFree(d_in2);                                          \
-    cudaFree(d_out);                                          \
-  }
-
-#define COMPOUND_KERNEL_LAUNCH(op_type)                       \
-  void Test##op_type(float v_in1, float v_in2, float v_out) { \
-    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
-    half *in1, *in2;                                          \
-    half *d_in1, *d_in2;                                      \
-    int size = sizeof(half);                                  \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);       \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);       \
-    in1 = reinterpret_cast<half*>(malloc(size));              \
-    in2 = reinterpret_cast<half*>(malloc(size));              \
-    in1[0] = half(float16(v_in1));                            \
-    in2[0] = half(float16(v_in2));                            \
-    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
-    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
-    op_type<<<1, 1>>>(d_in1, d_in2);                          \
-    cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost);     \
-    EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out);    \
-    free(in1);                                                \
-    free(in2);                                                \
-    cudaFree(d_in1);                                          \
-    cudaFree(d_in2);                                          \
-  }
-
-#define COMPARISON_KERNEL_LAUNCH(op_type)                    \
-  void Test##op_type(float v_in1, float v_in2, bool v_out) { \
-    LOG(INFO) << "Test " << #op_type << " on GPU!";          \
-    half *in1, *in2;                                         \
-    half *d_in1, *d_in2;                                     \
-    bool *out, *d_out;                                       \
-    int size = sizeof(half);                                 \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);      \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);      \
-    cudaMalloc(reinterpret_cast<void**>(&d_out), 1);         \
-    in1 = reinterpret_cast<half*>(malloc(size));             \
-    in2 = reinterpret_cast<half*>(malloc(size));             \
-    out = reinterpret_cast<bool*>(malloc(1));                \
-    in1[0] = half(float16(v_in1));                           \
-    in2[0] = half(float16(v_in2));                           \
-    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);    \
-    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);    \
-    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                  \
-    cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost);       \
-    EXPECT_EQ(out[0], v_out);                                \
-    free(in1);                                               \
-    free(in2);                                               \
-    free(out);                                               \
-    cudaFree(d_in1);                                         \
-    cudaFree(d_in2);                                         \
-    cudaFree(d_out);                                         \
-  }
-
-#ifdef PADDLE_CUDA_FP16
-namespace paddle {
-namespace platform {
-
-#if CUDA_VERSION < 9000
-ARITHMETIC_KERNEL(Add, +)
-ARITHMETIC_KERNEL(Sub, -)
-ARITHMETIC_KERNEL(Mul, *)
-ARITHMETIC_KERNEL(Div, /)
-
-ARITHMETIC_KERNEL_LAUNCH(Add)
-ARITHMETIC_KERNEL_LAUNCH(Sub)
-ARITHMETIC_KERNEL_LAUNCH(Mul)
-ARITHMETIC_KERNEL_LAUNCH(Div)
-
-// Negative sign kernel
-__global__ void Neg(half* in) { in[0] = -in[0]; }
-
-void TestNeg(float v_in, float v_out) {
-  LOG(INFO) << "Test Neg on GPU!";
-  half *in, *d_in;
-  int size = sizeof(half);
-  cudaMalloc(reinterpret_cast<void**>(&d_in), size);
-  in = reinterpret_cast<half*>(malloc(size));
-  in[0] = half(float16(v_in));
-  cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
-  Neg<<<1, 1>>>(d_in);
-  cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
-  EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
-  free(in);
-  cudaFree(d_in);
-}
-
-COMPOUND_KERNEL(AddAssign, +=)
-COMPOUND_KERNEL(SubAssign, -=)
-COMPOUND_KERNEL(MulAssign, *=)
-COMPOUND_KERNEL(DivAssign, /=)
-
-COMPOUND_KERNEL_LAUNCH(AddAssign)
-COMPOUND_KERNEL_LAUNCH(SubAssign)
-COMPOUND_KERNEL_LAUNCH(MulAssign)
-COMPOUND_KERNEL_LAUNCH(DivAssign)
-
-COMPARISON_KERNEL(Equal, ==)
-COMPARISON_KERNEL(NotEqual, !=)
-COMPARISON_KERNEL(Less, <)
-COMPARISON_KERNEL(LessEqual, <=)
-COMPARISON_KERNEL(Greater, >)
-COMPARISON_KERNEL(GreaterEqual, >=)
-
-COMPARISON_KERNEL_LAUNCH(Equal)
-COMPARISON_KERNEL_LAUNCH(NotEqual)
-COMPARISON_KERNEL_LAUNCH(Less)
-COMPARISON_KERNEL_LAUNCH(LessEqual)
-COMPARISON_KERNEL_LAUNCH(Greater)
-COMPARISON_KERNEL_LAUNCH(GreaterEqual)
-
-TEST(float16, arithmetic_on_gpu) {
-  TestAdd(1, 2, 3);
-  TestSub(2, 1, 1);
-  TestMul(2, 3, 6);
-  TestDiv(6, 2, 3);
-  TestNeg(1, -1);
-}
-
-TEST(float16, compound_on_gpu) {
-  TestAddAssign(1, 2, 3);
-  TestSubAssign(2, 1, 1);
-  TestMulAssign(2, 3, 6);
-  TestDivAssign(6, 2, 3);
-}
-
-TEST(float16, comparision_on_gpu) {
-  TestEqual(1, 1, true);
-  TestEqual(1, 2, false);
-  TestNotEqual(2, 3, true);
-  TestNotEqual(2, 2, false);
-  TestLess(3, 4, true);
-  TestLess(3, 3, false);
-  TestLessEqual(3, 3, true);
-  TestLessEqual(3, 2, false);
-  TestGreater(4, 3, true);
-  TestGreater(4, 4, false);
-  TestGreaterEqual(4, 4, true);
-  TestGreaterEqual(4, 5, false);
-}
-#endif  // CUDA_VERSION
-
-TEST(float16, conversion_on_gpu) {
-  // Explicit conversion to and from cuda half
-  EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00);
-  EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800);
-  EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555);
-  EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000);
-  EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000);
-  EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
-  EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
-
-  // Assignment operator
-  float16 v_assign;
-  v_assign = half(float16(1.0f));
-  EXPECT_EQ(v_assign.x, 0x3c00);
-}
-
-TEST(float16, lod_tensor_on_gpu) {
-  framework::LoDTensor src_tensor;
-  framework::LoDTensor gpu_tensor;
-  framework::LoDTensor dst_tensor;
-
-  float16* src_ptr = src_tensor.mutable_data<float16>(
-      framework::make_ddim({2, 2}), CPUPlace());
-
-  float16 arr[4] = {float16(1.0f), float16(0.5f), float16(0.33333f),
-                    float16(0.0f)};
-  memcpy(src_ptr, arr, 4 * sizeof(float16));
-
-  // CPU LoDTensor to GPU LoDTensor
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext gpu_ctx(gpu_place);
-  framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
-
-  // GPU LoDTensor to CPU LoDTensor
-  framework::TensorCopy(gpu_tensor, CPUPlace(), gpu_ctx, &dst_tensor);
-
-  // Sync before comparing LoDTensors
-  gpu_ctx.Wait();
-  const float16* dst_ptr = dst_tensor.data<float16>();
-  ASSERT_NE(src_ptr, dst_ptr);
-  for (size_t i = 0; i < 4; ++i) {
-    EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x);
-  }
-}
-
-template <typename T>
-struct Functor {
-  bool operator()(const T& val) {
-    return std::type_index(typeid(T)) ==
-           std::type_index(typeid(platform::float16));
-  }
-};
-
-TEST(float16, typeid) {
-  // the framework heavily used typeid hash
-  Functor<float16> functor;
-  float16 a = float16(.0f);
-  Functor<int> functor2;
-  int b(0);
-
-  // compile time assert
-  PADDLE_ENFORCE_EQ(functor(a), true);
-  PADDLE_ENFORCE_EQ(functor2(b), false);
-}
-
-// GPU test
-TEST(float16, isinf) {
-  float16 a;
-  a.x = 0x7c00;
-  float16 b = float16(INFINITY);
-  // underflow to 0
-  float16 native_a(5e-40f);
-  EXPECT_EQ(std::isinf(a), true);
-  EXPECT_EQ(std::isinf(b), true);
-#ifndef _WIN32
-  // overflow to inf
-  float16 native_b(5e40f);
-  EXPECT_EQ(std::isinf(native_b), true);
-#endif
-  EXPECT_EQ(native_a, float16(0));
-}
-
-TEST(float16, isnan) {
-  float16 a;
-  a.x = 0x7fff;
-  float16 b = float16(NAN);
-  float16 c = float16(5e40);
-  // inf * +-0 will get a nan
-  float16 d = c * float16(0);
-  EXPECT_EQ(std::isnan(a), true);
-  EXPECT_EQ(std::isnan(b), true);
-  EXPECT_EQ(std::isnan(d), true);
-}
-
-TEST(float16, cast) {
-  float16 a;
-  a.x = 0x0070;
-  auto b = a;
-  {
-    // change semantic, keep the same value
-    float16 c = reinterpret_cast<float16&>(reinterpret_cast<unsigned&>(b));
-    EXPECT_EQ(b, c);
-  }
-
-  {
-    // use uint32 low 16 bit store float16
-    uint32_t c = reinterpret_cast<uint32_t&>(b);
-    float16 d;
-    d.x = c;
-    EXPECT_EQ(b, d);
-  }
-}
-
-}  // namespace platform
-}  // namespace paddle
-#endif  // PADDLE_CUDA_FP16
diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h
deleted file mode 100644
index c153e80fe42aecb33d3aa97874d2881bce9029be..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/for_range.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace platform {
-
-template <typename DeviceContext>
-struct ForRange {
-  ForRange(const DeviceContext& dev_ctx, size_t limit);
-
-  template <typename Function>
-  void operator()(Function func) const;
-};
-
-template <>
-struct ForRange<CPUDeviceContext> {
-  ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {}
-
-  template <typename Function>
-  void operator()(Function func) const {
-    for (size_t i = 0; i < limit_; ++i) {
-      func(i);
-    }
-  }
-
-  size_t limit_;
-};
-
-#ifdef __NVCC__
-template <typename Function>
-__global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
-  size_t idx = static_cast<size_t>(threadIdx.x);
-  func(idx);
-}
-
-template <typename Function>
-__global__ static void ForRangeElemwiseOp(Function func, int limit) {
-  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
-  if (idx < limit) {
-    func(idx);
-  }
-}
-
-template <>
-struct ForRange<CUDADeviceContext> {
-  ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
-
-  template <typename Function>
-  inline void operator()(Function func) const {
-    constexpr int num_threads = 1024;
-    int block_size = limit_ <= num_threads ? limit_ : num_threads;
-    int grid_size = (limit_ + num_threads - 1) / num_threads;
-
-    if (grid_size == 1) {
-      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
-          func);
-    } else {
-      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
-          func, limit_);
-    }
-  }
-
-  const CUDADeviceContext& dev_ctx_;
-  int limit_;
-};
-
-#endif
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
deleted file mode 100644
index 8191d688472a3eb0f297936f3387e77809a20e2f..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/gpu_info.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/gpu_info.h"
-#include <algorithm>
-#include <cstdlib>
-#include <string>
-
-#include "gflags/gflags.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/string/split.h"
-
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-DECLARE_bool(enable_cublas_tensor_op_math);
-DECLARE_string(selected_gpus);
-
-constexpr static float fraction_reserve_gpu_memory = 0.05f;
-
-namespace paddle {
-namespace platform {
-
-inline std::string CudaErrorWebsite() {
-  return "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api"
-         "/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824"
-         "6db0a94a430e0038";
-}
-
-static int GetCUDADeviceCountImpl() {
-  const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
-  if (cuda_visible_devices != nullptr) {
-    std::string cuda_visible_devices_str(cuda_visible_devices);
-    if (std::all_of(cuda_visible_devices_str.begin(),
-                    cuda_visible_devices_str.end(),
-                    [](char ch) { return ch == ' '; })) {
-      VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. No GPU detected.";
-      return 0;
-    }
-  }
-
-  int count;
-  auto error_code = cudaGetDeviceCount(&count);
-  PADDLE_ENFORCE(
-      error_code,
-      "cudaGetDeviceCount failed in "
-      "paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s",
-      error_code, CudaErrorWebsite());
-  return count;
-}
-
-int GetCUDADeviceCount() {
-  static auto dev_cnt = GetCUDADeviceCountImpl();
-  return dev_cnt;
-}
-
-int GetCUDAComputeCapability(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
-  cudaDeviceProp device_prop;
-  auto error_code = cudaGetDeviceProperties(&device_prop, id);
-  PADDLE_ENFORCE(
-      error_code,
-      "cudaGetDeviceProperties failed in "
-      "paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
-      error_code, CudaErrorWebsite());
-  return device_prop.major * 10 + device_prop.minor;
-}
-
-int GetCUDARuntimeVersion(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
-  int runtime_version = 0;
-  auto error_code = cudaRuntimeGetVersion(&runtime_version);
-  PADDLE_ENFORCE(error_code,
-                 "cudaRuntimeGetVersion failed in "
-                 "paddle::platform::GetCUDARuntimeVersion, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
-  return runtime_version;
-}
-
-int GetCUDADriverVersion(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
-  int driver_version = 0;
-  auto error_code = cudaDriverGetVersion(&driver_version);
-  PADDLE_ENFORCE(error_code,
-                 "cudaDriverGetVersion failed in "
-                 "paddle::platform::GetCUDADriverVersion, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
-  return driver_version;
-}
-
-bool TensorCoreAvailable() {
-#if CUDA_VERSION >= 9000
-  int device = GetCurrentDeviceId();
-  int driver_version = GetCUDAComputeCapability(device);
-  return driver_version >= 70;
-#else
-  return false;
-#endif
-}
-
-int GetCUDAMultiProcessors(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
-  int count;
-  auto error_code =
-      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id);
-  PADDLE_ENFORCE(error_code,
-                 "cudaDeviceGetAttribute failed in "
-                 "paddle::platform::GetCUDAMultiProcess, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
-  return count;
-}
-
-int GetCUDAMaxThreadsPerMultiProcessor(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
-  int count;
-  auto error_code = cudaDeviceGetAttribute(
-      &count, cudaDevAttrMaxThreadsPerMultiProcessor, id);
-  PADDLE_ENFORCE(
-      error_code,
-      "cudaDeviceGetAttribute failed in paddle::"
-      "platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s",
-      error_code, CudaErrorWebsite());
-  return count;
-}
-
-int GetCurrentDeviceId() {
-  int device_id;
-  auto error_code = cudaGetDevice(&device_id);
-  PADDLE_ENFORCE(error_code,
-                 "cudaGetDevice failed in "
-                 "paddle::platform::GetCurrentDeviceId, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
-  return device_id;
-}
-
-//! Get a list of device ids from environment variable or use all.
-std::vector<int> GetSelectedDevices() {
-  // use user specified GPUs in single-node multi-process mode.
-  std::vector<int> devices;
-  if (!FLAGS_selected_gpus.empty()) {
-    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
-    for (auto id : devices_str) {
-      devices.push_back(atoi(id.c_str()));
-    }
-  } else {
-    int count = GetCUDADeviceCount();
-    for (int i = 0; i < count; ++i) {
-      devices.push_back(i);
-    }
-  }
-  return devices;
-}
-
-void SetDeviceId(int id) {
-  // TODO(qijun): find a better way to cache the cuda device count
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
-  auto error_code = cudaSetDevice(id);
-  PADDLE_ENFORCE(error_code,
-                 "cudaSetDevice failed in "
-                 "paddle::platform::SetDeviced, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
-}
-
-void GpuMemoryUsage(size_t *available, size_t *total) {
-  auto error_code = cudaMemGetInfo(available, total);
-  PADDLE_ENFORCE(error_code,
-                 "cudaMemGetInfo failed in "
-                 "paddle::platform::GetMemoryUsage, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
-}
-
-size_t GpuAvailableMemToAlloc() {
-  size_t total = 0;
-  size_t available = 0;
-  GpuMemoryUsage(&available, &total);
-  size_t reserving =
-      static_cast<size_t>(fraction_reserve_gpu_memory * available);
-  // If available size is less than minimum chunk size, no usable memory exists
-  size_t available_to_alloc = available - reserving;
-  size_t min_chunk_size = GpuMinChunkSize();
-  if (available_to_alloc < min_chunk_size) {
-    available_to_alloc = 0;
-  }
-  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
-           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
-  return available_to_alloc;
-}
-
-size_t GpuMaxAllocSize() {
-  return std::max(GpuInitAllocSize(), GpuReallocSize());
-}
-
-static size_t GpuAllocSize(bool realloc) {
-  size_t available_to_alloc = GpuAvailableMemToAlloc();
-  PADDLE_ENFORCE_GT(available_to_alloc, 0, "No enough available GPU memory");
-  // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
-  // allocated by fraction
-  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
-                           : FLAGS_initial_gpu_memory_in_mb;
-  size_t alloc_bytes =
-      (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
-                                           FLAGS_fraction_of_gpu_memory_to_use);
-  PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
-                    "No enough available GPU memory");
-  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
-           << " MiB, is it Re-alloc: " << realloc;
-  return alloc_bytes;
-}
-
-size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
-
-size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
-
-size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
-size_t GpuMaxChunkSize() {
-  size_t max_chunk_size = GpuMaxAllocSize();
-  VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
-  return max_chunk_size;
-}
-
-void GpuMemcpyAsync(void *dst, const void *src, size_t count,
-                    enum cudaMemcpyKind kind, cudaStream_t stream) {
-  auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream);
-  PADDLE_ENFORCE(error_code,
-                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
-                 "(%p -> %p, length: %d) error code : %d, %s",
-                 src, dst, static_cast<int>(count), error_code,
-                 CudaErrorWebsite());
-}
-
-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind) {
-  auto error_code = cudaMemcpy(dst, src, count, kind);
-  PADDLE_ENFORCE(error_code,
-                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync "
-                 "(%p -> %p, length: %d) error code : %d, %s",
-                 src, dst, static_cast<int>(count), error_code,
-                 CudaErrorWebsite());
-}
-
-void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
-                        int src_device, size_t count, cudaStream_t stream) {
-  auto error_code =
-      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream);
-  PADDLE_ENFORCE(
-      error_code,
-      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync "
-      "error code : %d, %s",
-      error_code, CudaErrorWebsite());
-}
-
-void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
-                       int src_device, size_t count) {
-  auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count);
-  PADDLE_ENFORCE(error_code,
-                 "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync "
-                 "error code : %d, %s",
-                 error_code, CudaErrorWebsite());
-}
-
-void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
-  auto error_code = cudaMemsetAsync(dst, value, count, stream);
-  PADDLE_ENFORCE(error_code,
-                 "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync "
-                 "error code : %d, %s",
-                 error_code, CudaErrorWebsite());
-}
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
deleted file mode 100644
index e468c4aab0b01c19b69a2e57e794c0ad0a117c71..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/gpu_info.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <cuda_runtime.h>
-#include <stddef.h>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace platform {
-
-//! Get the total number of GPU devices in system.
-int GetCUDADeviceCount();
-
-//! Get the compute capability of the ith GPU (format: major * 10 + minor)
-int GetCUDAComputeCapability(int i);
-
-//! Get the runtime version of the ith GPU
-int GetCUDARuntimeVersion(int id);
-
-//! Get the driver version of the ith GPU
-int GetCUDADriverVersion(int id);
-
-//! Wheter the current device support TensorCore
-bool TensorCoreAvailable();
-
-//! Get the MultiProcessors of the ith GPU.
-int GetCUDAMultiProcessors(int i);
-
-//! Get the MaxThreads of each MultiProcessor of the ith GPU.
-int GetCUDAMaxThreadsPerMultiProcessor(int i);
-
-//! Get the current GPU device id in system.
-int GetCurrentDeviceId();
-
-//! Get a list of device ids from environment variable or use all.
-std::vector<int> GetSelectedDevices();
-
-//! Set the GPU device id for next execution.
-void SetDeviceId(int device_id);
-
-//! Get the memory usage of current GPU device.
-void GpuMemoryUsage(size_t *available, size_t *total);
-
-//! Get the available memory to allocate, which is the size of available gpu
-//! minus reserving.
-size_t GpuAvailableMemToAlloc();
-
-//! Get the maximum allocation size of current GPU device.
-size_t GpuMaxAllocSize();
-
-//! Get the initial allocation size of current GPU device.
-size_t GpuInitAllocSize();
-
-//! Get the re-allocation size of current GPU device.
-size_t GpuReallocSize();
-
-//! Get the minimum chunk size for GPU buddy allocator.
-size_t GpuMinChunkSize();
-
-//! Get the maximum chunk size for GPU buddy allocator.
-size_t GpuMaxChunkSize();
-
-//! Copy memory from address src to dst asynchronously.
-void GpuMemcpyAsync(void *dst, const void *src, size_t count,
-                    enum cudaMemcpyKind kind, cudaStream_t stream);
-
-//! Copy memory from address src to dst synchronously.
-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind);
-
-//! Copy memory from one device to another device asynchronously.
-void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
-                        int src_device, size_t count, cudaStream_t stream);
-
-//! Copy memory from one device to another device synchronously.
-void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
-                       int src_device, size_t count);
-
-//! Set memory dst with value count size asynchronously
-void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
-
-}  // namespace platform
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h
deleted file mode 100644
index c0dc92a52176406a0333ffb6423fb68021beb7c1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/hostdevice.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#ifdef __CUDACC__
-#define HOSTDEVICE __host__ __device__
-#define DEVICE __device__
-#define HOST __host__
-#else
-#define HOSTDEVICE
-#define DEVICE
-#define HOST
-#endif
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
deleted file mode 100644
index be6519b189011e0d2b09aa09bc2ddb173db2389e..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/init.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <string.h>  // for strdup
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <set>
-#include <stdexcept>
-#include <string>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/string/split.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/dynload/cupti.h"
-#endif
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/string/piece.h"
-#if defined(PADDLE_WITH_DGC)
-#include "dgc/dgc.h"
-#endif
-
-DECLARE_int32(paddle_num_threads);
-DEFINE_int32(multiple_of_cupti_buffer_size, 1,
-             "Multiple of the CUPTI device buffer size. If the timestamps have "
-             "been dropped when you are profiling, try increasing this value.");
-
-namespace paddle {
-namespace framework {
-
-#ifdef _WIN32
-#define strdup _strdup
-#endif
-
-std::once_flag gflags_init_flag;
-std::once_flag p2p_init_flag;
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-std::once_flag dgc_init_flag;
-#endif
-
-void InitGflags(std::vector<std::string> argv) {
-  std::call_once(gflags_init_flag, [&]() {
-    FLAGS_logtostderr = true;
-    argv.insert(argv.begin(), "dummy");
-    int argc = argv.size();
-    char **arr = new char *[argv.size()];
-    std::string line;
-    for (size_t i = 0; i < argv.size(); i++) {
-      arr[i] = &argv[i][0];
-      line += argv[i];
-      line += ' ';
-    }
-    google::ParseCommandLineFlags(&argc, &arr, true);
-    VLOG(1) << "Init commandline: " << line;
-  });
-}
-
-void InitP2P(std::vector<int> devices) {
-#ifdef PADDLE_WITH_CUDA
-  std::call_once(p2p_init_flag, [&]() {
-    int count = devices.size();
-    for (int i = 0; i < count; ++i) {
-      for (int j = 0; j < count; ++j) {
-        if (devices[i] == devices[j]) continue;
-        int can_acess = -1;
-        PADDLE_ENFORCE(
-            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]),
-            "Failed to test P2P access.");
-        if (can_acess != 1) {
-          LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
-                       << " to " << devices[j];
-        } else {
-          platform::CUDADeviceGuard guard(devices[i]);
-          cudaDeviceEnablePeerAccess(devices[j], 0);
-        }
-      }
-    }
-  });
-#endif
-}
-
-void InitCupti() {
-#ifdef PADDLE_WITH_CUPTI
-  if (FLAGS_multiple_of_cupti_buffer_size == 1) return;
-  size_t attrValue = 0, attrValueSize = sizeof(size_t);
-#define MULTIPLY_ATTR_VALUE(attr)                                 \
-  {                                                               \
-    PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute( \
-        attr, &attrValueSize, &attrValue));                       \
-    attrValue *= FLAGS_multiple_of_cupti_buffer_size;             \
-    LOG(WARNING) << "Set " #attr " " << attrValue << " byte";     \
-    PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute( \
-        attr, &attrValueSize, &attrValue));                       \
-  }
-  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE);
-  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP);
-#if CUDA_VERSION >= 9000
-  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE);
-#endif
-#undef MULTIPLY_ATTR_VALUE
-#endif
-}
-
-void InitDevices(bool init_p2p) {
-  // CUPTI attribute should be set before any CUDA context is created (see CUPTI
-  // documentation about CUpti_ActivityAttribute).
-  InitCupti();
-  /*Init all available devices by default */
-  std::vector<int> devices;
-#ifdef PADDLE_WITH_CUDA
-  try {
-    // use user specified GPUs in single-node multi-process mode.
-    devices = platform::GetSelectedDevices();
-  } catch (const std::exception &exp) {
-    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
-  }
-#endif
-  InitDevices(init_p2p, devices);
-}
-
-void InitDevices(bool init_p2p, const std::vector<int> devices) {
-  std::vector<platform::Place> places;
-
-  for (size_t i = 0; i < devices.size(); ++i) {
-    // In multi process multi gpu mode, we may have gpuid = 7
-    // but count = 1.
-    if (devices[i] < 0) {
-      LOG(WARNING) << "Invalid devices id.";
-      continue;
-    }
-
-    places.emplace_back(platform::CUDAPlace(devices[i]));
-  }
-  if (init_p2p) {
-    InitP2P(devices);
-  }
-  places.emplace_back(platform::CPUPlace());
-  platform::DeviceContextPool::Init(places);
-
-#ifndef PADDLE_WITH_MKLDNN
-  platform::SetNumThreads(FLAGS_paddle_num_threads);
-#endif
-
-#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__)
-  if (platform::MayIUse(platform::avx)) {
-#ifndef __AVX__
-    LOG(WARNING) << "AVX is available, Please re-compile on local machine";
-#endif
-  }
-
-// Throw some informations when CPU instructions mismatch.
-#define AVX_GUIDE(compiletime, runtime)                                     \
-  LOG(FATAL)                                                                \
-      << "This version is compiled on higher instruction(" #compiletime     \
-         ") system, you may encounter illegal instruction error running on" \
-         " your local CPU machine. Please reinstall the " #runtime          \
-         " version or compile from source code."
-
-#ifdef __AVX512F__
-  if (!platform::MayIUse(platform::avx512f)) {
-    if (platform::MayIUse(platform::avx2)) {
-      AVX_GUIDE(AVX512, AVX2);
-    } else if (platform::MayIUse(platform::avx)) {
-      AVX_GUIDE(AVX512, AVX);
-    } else {
-      AVX_GUIDE(AVX512, NonAVX);
-    }
-  }
-#endif
-
-#ifdef __AVX2__
-  if (!platform::MayIUse(platform::avx2)) {
-    if (platform::MayIUse(platform::avx)) {
-      AVX_GUIDE(AVX2, AVX);
-    } else {
-      AVX_GUIDE(AVX2, NonAVX);
-    }
-  }
-#endif
-
-#ifdef __AVX__
-  if (!platform::MayIUse(platform::avx)) {
-    AVX_GUIDE(AVX, NonAVX);
-  }
-#endif
-#undef AVX_GUIDE
-
-#endif
-}
-
-#ifndef _WIN32
-void SignalHandle(const char *data, int size) {
-  auto file_path = string::Sprintf("/tmp/paddle.%d.dump_info", ::getpid());
-  try {
-    LOG(WARNING) << std::string(data, size);
-    std::ofstream dump_info;
-    dump_info.open(file_path, std::ios::app);
-    dump_info << std::string(data, size);
-    dump_info.close();
-  } catch (...) {
-  }
-}
-#endif
-
-void InitGLOG(const std::string &prog_name) {
-  // glog will not hold the ARGV[0] inside.
-  // Use strdup to alloc a new string.
-  google::InitGoogleLogging(strdup(prog_name.c_str()));
-#ifndef _WIN32
-  google::InstallFailureSignalHandler();
-  google::InstallFailureWriter(&SignalHandle);
-#endif
-}
-
-#if defined(PADDLE_WITH_DGC)
-void InitDGC() {
-  std::call_once(dgc_init_flag, []() {
-    PADDLE_ENFORCE(paddle::communication::dgc::dynloadNcclLib());
-  });
-}
-#else
-void InitDGC() {}
-#endif
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h
deleted file mode 100644
index d25e79e78faa86c3105a2c901c514f7239c85c99..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/init.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <mutex>  // NOLINT
-#include <string>
-#include <vector>
-
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-
-namespace paddle {
-namespace framework {
-
-void InitGflags(std::vector<std::string> argv);
-
-void InitGLOG(const std::string &prog_name);
-
-void InitDevices(bool init_p2p);
-
-void InitDevices(bool init_p2p, const std::vector<int> devices);
-
-void InitDGC();
-
-#ifndef _WIN32
-void SignalHandle(const char *data, int size);
-#endif
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc
deleted file mode 100644
index 3f911843c57877cfbedfe47da390f1bebc8dd256..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/init_test.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/init.h"
-
-TEST(InitDevices, CPU) {
-  using paddle::framework::InitDevices;
-  using paddle::platform::DeviceContextPool;
-
-#ifndef PADDLE_WITH_CUDA
-  InitDevices(true);
-  DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_EQ(pool.size(), 1U);
-#endif
-}
-
-TEST(InitDevices, CUDA) {
-  using paddle::framework::InitDevices;
-  using paddle::platform::DeviceContextPool;
-
-#ifdef PADDLE_WITH_CUDA
-  int count = paddle::platform::GetCUDADeviceCount();
-  InitDevices(true);
-  DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
-#endif
-}
-
-#ifndef _WIN32
-TEST(SignalHandle, SignalHandle) {
-  std::string msg = "Signal raises";
-  paddle::framework::SignalHandle(msg.c_str(), msg.size());
-}
-#endif
diff --git a/paddle/fluid/platform/lock_guard_ptr.h b/paddle/fluid/platform/lock_guard_ptr.h
deleted file mode 100644
index bff24e74a7070b31d6385b2d5924bdc62d7219c9..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/lock_guard_ptr.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <stdint.h>
-#include <memory>
-#include <mutex>  // NOLINT
-namespace paddle {
-namespace platform {
-
-/**
- * LockGuard for std::unique_ptr<LockType>. It will do nothing when guarded ptr
- * is nullptr.
- *
- * The advantage of using `LockGuardPtr` instead of
- * std::unique<std::lock_guard<lock_type>> is this type is totally a stack
- * variable. There is no heap allocation at all.
- */
-template <typename LockType>
-class LockGuardPtr {
- public:
-  explicit LockGuardPtr(std::unique_ptr<LockType>& lock_ptr)  // NOLINT
-      : lock_(lock_ptr.get()) {
-    if (lock_) {
-      lock_->lock();
-    }
-  }
-  ~LockGuardPtr() {
-    if (lock_) {
-      lock_->unlock();
-    }
-  }
-
-  LockGuardPtr(const LockGuardPtr&) = delete;
-  LockGuardPtr& operator=(const LockGuardPtr&) = delete;
-  LockGuardPtr(LockGuardPtr&&) = delete;
-  LockGuardPtr& operator=(LockGuardPtr&&) = delete;
-
- private:
-  LockType* lock_;
-};
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc
deleted file mode 100644
index 33d0fe6268046b3bcbd4addc75fcf34c03d70bf9..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/lodtensor_printer.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/lodtensor_printer.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace platform {
-
-void PrintVar(framework::Scope* scope, const std::string& var_name,
-              const std::string& print_info) {
-  framework::Variable* var = scope->FindVar(var_name);
-  if (var == nullptr) {
-    VLOG(1) << "Variable Name " << var_name << " does not exist in your scope";
-    return;
-  }
-  framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
-  if (tensor == nullptr) {
-    VLOG(1) << "tensor of variable " << var_name
-            << " does not exist in your scope";
-    return;
-  }
-
-  std::ostringstream sstream;
-  sstream << print_info << "\t";
-  sstream << var_name << "\t";
-  sstream << *tensor << "\t";
-  std::cout << sstream.str() << std::endl;
-}
-
-}  // end namespace platform
-}  // end namespace paddle
diff --git a/paddle/fluid/platform/lodtensor_printer.h b/paddle/fluid/platform/lodtensor_printer.h
deleted file mode 100644
index e070e3540c996a0fe248a3b9312c18d948395426..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/lodtensor_printer.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/scope.h"
-
-namespace paddle {
-namespace platform {
-void PrintVar(framework::Scope* scope, const std::string& var_name,
-              const std::string& print_info);
-}  // end namespace platform
-}  // end namespace paddle
diff --git a/paddle/fluid/platform/lodtensor_printer_test.cc b/paddle/fluid/platform/lodtensor_printer_test.cc
deleted file mode 100644
index 19e85284b8fc8842b2e5662343c74fc451b08d9e..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/lodtensor_printer_test.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/platform/lodtensor_printer.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable.h"
-
-TEST(LodTensorPrinter, PrintVar) {
-  paddle::framework::Scope scope;
-  paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var");
-}
diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h
deleted file mode 100644
index 32b7efc04c1f2ecc22f93c08387aec69ded4930a..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/macros.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cfloat>
-
-// Disable the copy and assignment operator for a class.
-#ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname)         \
- private:                                          \
-  classname(const classname&) = delete;            \
-  classname(classname&&) = delete;                 \
-  classname& operator=(const classname&) = delete; \
-  classname& operator=(classname&&) = delete
-#endif
-
-#if defined(__FLT_MAX__)
-#define FLT_MAX __FLT_MAX__
-#endif  // __FLT_MAX__
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
deleted file mode 100644
index 1ff568cef3401724463ab129c81a522bf1aebc22..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <mkldnn.h>
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/place.h"
-namespace paddle {
-#ifdef PADDLE_WITH_MKLDNN
-using MKLDNNMemoryFormat = mkldnn::memory::format;
-#endif
-namespace platform {
-
-using MKLDNNStream = mkldnn::stream;
-using MKLDNNEngine = mkldnn::engine;
-using MKLDNNMemory = mkldnn::memory;
-using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
-using MKLDNNPrimitive = mkldnn::primitive;
-using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
-
-typedef std::unique_ptr<MKLDNNStream> MKLDNNStreamPtr;
-typedef std::unique_ptr<MKLDNNEngine> MKLDNNEnginePtr;
-typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
-typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
-typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
-
-template <typename Type>
-void* to_void_cast(const Type* t) {
-  return static_cast<void*>(const_cast<Type*>(t));
-}
-
-template <typename Type>
-void* to_void_reinterpret_cast(const Type* t) {
-  return reinterpret_cast<void*>(const_cast<Type*>(t));
-}
-
-template <class Type>
-using tf_desc = typename Type::desc;
-
-template <class Type>
-using tf_pd = typename Type::primitive_desc;
-
-template <typename Type, typename Engine, typename... Args>
-std::shared_ptr<tf_pd<Type>> MKLDNNFwdPrimitiveDesc(const Engine& e,
-                                                    Args&&... args) {
-  auto desc = tf_desc<Type>(mkldnn::prop_kind::forward, (args)...);
-  auto pd = new tf_pd<Type>(desc, e);
-  return std::shared_ptr<tf_pd<Type>>(pd);
-}
-
-template <typename Type, typename Engine, typename Primitive, typename... Args>
-tf_pd<Type> MKLDNNBwdPrimitiveDesc(const Engine& e, const Primitive& p,
-                                   Args&&... args) {
-  auto desc = tf_desc<Type>(args...);
-  return tf_pd<Type>(desc, e, p);
-}
-
-inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
-                                          mkldnn::memory::data_type data_type,
-                                          MKLDNNMemoryFormat format) {
-  mkldnn::memory::dims tz = dims;
-  return mkldnn::memory::desc({tz}, data_type, format);
-}
-
-inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
-  bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
-  return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
-}
-
-template <typename Type>
-mkldnn::memory::data_type MKLDNNGetDataType() {
-  return mkldnn::memory::data_type::data_undef;
-}
-
-template <>
-inline mkldnn::memory::data_type MKLDNNGetDataType<float>() {
-  return mkldnn::memory::data_type::f32;
-}
-template <>
-inline mkldnn::memory::data_type MKLDNNGetDataType<int32_t>() {
-  return mkldnn::memory::data_type::s32;
-}
-template <>
-inline mkldnn::memory::data_type MKLDNNGetDataType<int8_t>() {
-  return mkldnn::memory::data_type::s8;
-}
-template <>
-inline mkldnn::memory::data_type MKLDNNGetDataType<uint8_t>() {
-  return mkldnn::memory::data_type::u8;
-}
-
-inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
-  auto reorder_prim = mkldnn::reorder(src, dst);
-  std::vector<mkldnn::primitive> pipeline;
-  pipeline.push_back(reorder_prim);
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-}
-
-inline MKLDNNMemoryFormat GetMKLDNNFormat(const mkldnn::memory memory) {
-  return static_cast<MKLDNNMemoryFormat>(
-      memory.get_primitive_desc().desc().data.format);
-}
-
-inline MKLDNNMemoryFormat GetMKLDNNFormat(
-    const mkldnn::sum::primitive_desc& memory) {
-  return static_cast<MKLDNNMemoryFormat>(
-      memory.dst_primitive_desc().desc().data.format);
-}
-
-inline MKLDNNMemoryFormat MKLDNNFormatForSize(size_t dims_size,
-                                              MKLDNNMemoryFormat data_format) {
-  if (dims_size == 1) {
-    return MKLDNNMemoryFormat::x;
-  } else if (dims_size == 2) {
-    return MKLDNNMemoryFormat::nc;
-  } else if (dims_size == 3) {
-    if (data_format == MKLDNNMemoryFormat::nchw) {
-      return MKLDNNMemoryFormat::ncw;
-    } else if (data_format == MKLDNNMemoryFormat::nhwc) {
-      return MKLDNNMemoryFormat::nwc;
-    }
-  } else if (dims_size == 4) {
-    if (data_format == MKLDNNMemoryFormat::goihw) {
-      return MKLDNNMemoryFormat::oihw;
-    }
-  } else if (dims_size == 5) {
-    if (data_format == MKLDNNMemoryFormat::goidhw) {
-      return MKLDNNMemoryFormat::oidhw;
-    }
-    if (data_format == MKLDNNMemoryFormat::nchw) {
-      return MKLDNNMemoryFormat::ncdhw;
-    } else if (data_format == MKLDNNMemoryFormat::nhwc) {
-      return MKLDNNMemoryFormat::ndhwc;
-    }
-  }
-  return data_format;
-}
-
-inline MKLDNNMemoryFormat data_format_to_memory_format(
-    const std::string& data_format) {
-  switch (framework::StringToDataLayout(data_format)) {
-    case framework::DataLayout::kNHWC:
-      return MKLDNNMemoryFormat::nhwc;
-    case framework::DataLayout::kNCHW:
-      return MKLDNNMemoryFormat::nchw;
-    default:
-      return MKLDNNMemoryFormat::any;
-  }
-}
-
-inline MKLDNNMemoryFormat StringToMKLDNNFormat(std::string* format) {
-  std::transform(format->begin(), format->end(), format->begin(), ::tolower);
-
-  if (!format->compare("nchw")) {
-    return MKLDNNMemoryFormat::nchw;
-  } else if (!format->compare("nchw16c")) {
-    return MKLDNNMemoryFormat::nChw16c;
-  } else if (!format->compare("nchw8c")) {
-    return MKLDNNMemoryFormat::nChw8c;
-  } else if (!format->compare("nhwc")) {
-    return MKLDNNMemoryFormat::nhwc;
-  } else {
-    return MKLDNNMemoryFormat::any;
-  }
-}
-
-inline std::string ThreadIDasStr(void) {
-  return std::to_string(
-      std::hash<std::thread::id>()(std::this_thread::get_id()));
-}
-
-template <typename T>
-inline void AppendKey(std::string* key, const T& num) {
-  key->append(std::to_string(num));
-}
-
-inline void AppendKey(std::string* key, const std::string& str) {
-  key->append(str);
-}
-
-inline void AppendKey(std::string* key, const char* str) { key->append(str); }
-
-inline void AppendKey(std::string* key, const std::vector<int>& dims) {
-  for (size_t i = 0; i < dims.size(); i++) {
-    AppendKey(key, std::to_string(dims[i]));
-  }
-}
-
-template <typename... ArgTypes>
-inline std::string CreateKey(ArgTypes&&... args) {
-  std::string key;
-  key.reserve(256);
-  using expand_type = int[];
-  expand_type{0, (AppendKey(&key, std::forward<ArgTypes>(args)), 0)...};
-  return key;
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
deleted file mode 100644
index 7396b90ea3d0728a6c63069e8cb3089cc3c47f98..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ /dev/null
@@ -1,1159 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <memory>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-#include "boost/optional.hpp"
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace platform {
-
-// Signature of a user-supplied preprocessing callback: it receives a pointer
-// to const float data and returns a std::shared_ptr<float>.
-using user_function = std::function<std::shared_ptr<float>(const float*)>;
-// Shorthand for mkldnn::memory inside this namespace.
-using memory = mkldnn::memory;
-
-// Caching helper for an operator that has a forward primitive TForward and a
-// backward primitive TBackward working on elements of type T.
-//
-// All objects are stored in / fetched from the device context through
-// GetBlob()/SetBlob(). Two keys are used:
-//   * key_common_ - the base key exactly as passed to the constructor;
-//   * key_        - the base key, with "-t:<ThreadIDasStr()>" appended when
-//                   the current MKL-DNN session id is the default one.
-// The forward primitive descriptor is stored under key_common_; primitives,
-// memories and the backward primitive descriptor are stored under key_.
-template <typename T, typename TForward, typename TBackward>
-class MKLDNNHandlerT {
- public:
-  MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-                 platform::Place cpu_place, const std::string& base_key)
-      : dev_ctx_(dev_ctx),
-        engine_(engine),
-        place_(cpu_place),
-        key_common_(base_key),
-        fwd_pd_(nullptr),
-        bwd_pd_(nullptr) {
-    const bool default_session = platform::get_cur_mkldnn_session_id() ==
-                                 platform::kMKLDNNSessionID_Default;
-    // The thread id suffix is added only for the default session.
-    key_ = default_session ? key_common_ + "-t:" + ThreadIDasStr()
-                           : key_common_;
-  }
-
-  // Returns the forward primitive stored under "<key_>@forward_p". On a miss
-  // it is built from the forward primitive descriptor plus `args` and stored.
-  template <typename... Args>
-  std::shared_ptr<TForward> AcquireForwardPrimitive(Args&&... args) {
-    const std::string prim_key = key_ + "@forward_p";
-    auto cached_p =
-        std::static_pointer_cast<TForward>(dev_ctx_.GetBlob(prim_key));
-    if (cached_p != nullptr) {
-      return cached_p;
-    }
-    auto created_p =
-        std::make_shared<TForward>(*fwd_pd_, std::forward<Args>(args)...);
-    dev_ctx_.SetBlob(prim_key, created_p);
-    return created_p;
-  }
-
-  // Returns the backward primitive stored under "<key_>@backward_p". On a
-  // miss it is built from the backward primitive descriptor plus `args` and
-  // stored.
-  template <typename... Args>
-  std::shared_ptr<TBackward> AcquireBackwardPrimitive(Args&&... args) {
-    const std::string prim_key = key_ + "@backward_p";
-    auto cached_p =
-        std::static_pointer_cast<TBackward>(dev_ctx_.GetBlob(prim_key));
-    if (cached_p != nullptr) {
-      return cached_p;
-    }
-    auto created_p =
-        std::make_shared<TBackward>(*bwd_pd_, std::forward<Args>(args)...);
-    dev_ctx_.SetBlob(prim_key, created_p);
-    return created_p;
-  }
-
-  // Wraps the data of `input` in a memory object described by the forward
-  // primitive descriptor's source.
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const framework::Tensor* input) {
-    const T* src_data = input->data<T>();
-    return this->AcquireMemoryFromPrimitive(
-        fwd_pd_->src_primitive_desc(), to_void_cast<T>(src_data), "@src_mem_p");
-  }
-
-  // Allocates `output` with the size reported by the forward destination
-  // descriptor and wraps the buffer in a memory object.
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
-    const size_t dst_bytes = fwd_pd_->dst_primitive_desc().get_size();
-    T* dst_data = output->mutable_data<T>(place_, dst_bytes);
-    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_primitive_desc(),
-                                            dst_data, "@dst_mem_p");
-  }
-
-  // Read-only variant: wraps existing data of `output` using the backward
-  // primitive descriptor's destination.
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
-      const framework::Tensor* output) {
-    const T* dst_data = output->data<T>();
-    return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_primitive_desc(),
-                                            to_void_cast<T>(dst_data),
-                                            "@bwd-dst_mem_p");
-  }
-
-  // Wraps the data of `diffdst` using the backward primitive descriptor's
-  // diff-destination.
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
-      const framework::Tensor* diffdst) {
-    const T* diff_dst_data = diffdst->data<T>();
-    return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_primitive_desc(),
-                                            to_void_cast<T>(diff_dst_data),
-                                            "@diff_dst_mem_p");
-  }
-
-  // Allocates `diffsrc` with the size reported by the backward diff-source
-  // descriptor and wraps the buffer in a memory object.
-  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
-      framework::Tensor* diffsrc) {
-    const size_t diff_src_bytes =
-        bwd_pd_->diff_src_primitive_desc().get_size();
-    T* diff_src_data = diffsrc->mutable_data<T>(place_, diff_src_bytes);
-    return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_primitive_desc(),
-                                            diff_src_data, "@diff_src_mem_p");
-  }
-
- protected:
-  // Loads the forward primitive descriptor into fwd_pd_, creating it from
-  // `args` and engine_ when it is not stored yet.
-  template <typename... Args>
-  void AcquireForwardPrimitiveDescriptor(Args&&... args) {
-    // Forward PD has to be passed to Grad op that
-    // may be executed by different thread, hence
-    // for that one we use key that does not contain TID
-    using ForwardPD = typename TForward::primitive_desc;
-    const std::string pd_key = key_common_ + "@forward_pd";
-
-    fwd_pd_ = std::static_pointer_cast<ForwardPD>(dev_ctx_.GetBlob(pd_key));
-    if (fwd_pd_ != nullptr) {
-      return;
-    }
-
-    // Not stored yet: take the lock and look again before creating it, so
-    // that the descriptor is created and stored while holding the mutex.
-    static std::mutex acquire_barrier;
-    std::lock_guard<std::mutex> block_threads_until_finish_this_job(
-        acquire_barrier);
-    fwd_pd_ = std::static_pointer_cast<ForwardPD>(dev_ctx_.GetBlob(pd_key));
-    if (fwd_pd_ != nullptr) {
-      return;
-    }
-
-    auto forward_desc = typename TForward::desc(std::forward<Args>(args)...);
-    fwd_pd_ = std::make_shared<ForwardPD>(forward_desc, engine_);
-    dev_ctx_.SetBlob(pd_key, fwd_pd_);
-  }
-
-  // Loads the stored forward primitive descriptor (enforced to be non-null)
-  // and then loads the backward primitive descriptor into bwd_pd_, creating
-  // it from `args`, engine_ and the forward descriptor when missing.
-  template <typename... Args>
-  void AcquireBackwardPrimitiveDescriptor(Args&&... args) {
-    using ForwardPD = typename TForward::primitive_desc;
-    using BackwardPD = typename TBackward::primitive_desc;
-
-    const std::string fwd_pd_key = key_common_ + "@forward_pd";
-    fwd_pd_ =
-        std::static_pointer_cast<ForwardPD>(dev_ctx_.GetBlob(fwd_pd_key));
-    PADDLE_ENFORCE_NOT_NULL(fwd_pd_);
-
-    const std::string bwd_pd_key = key_ + "@backward_pd";
-    bwd_pd_ =
-        std::static_pointer_cast<BackwardPD>(dev_ctx_.GetBlob(bwd_pd_key));
-    if (bwd_pd_ != nullptr) {
-      return;
-    }
-
-    auto backward_desc = typename TBackward::desc(std::forward<Args>(args)...);
-    bwd_pd_ = std::make_shared<BackwardPD>(backward_desc, engine_, *fwd_pd_);
-    dev_ctx_.SetBlob(bwd_pd_key, bwd_pd_);
-  }
-
-  // Returns the memory object stored under "<key_><suffix>". A stored object
-  // only gets its data handle redirected to `ptr`; otherwise a new object is
-  // created from `mdp` and `ptr` and stored.
-  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
-      mkldnn::memory::primitive_desc mdp, void* ptr,
-      const std::string& suffix) {
-    const auto mem_key = key_ + suffix;
-    auto stored_mem =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(mem_key));
-    if (stored_mem != nullptr) {
-      stored_mem->set_data_handle(ptr);
-      return stored_mem;
-    }
-    auto fresh_mem = std::make_shared<mkldnn::memory>(mdp, ptr);
-    dev_ctx_.SetBlob(mem_key, fresh_mem);
-    return fresh_mem;
-  }
-
-  const MKLDNNDeviceContext& dev_ctx_;
-  mkldnn::engine engine_;
-  platform::Place place_;
-  // Base key, possibly extended with the thread id (see constructor).
-  std::string key_;
-  // Base key exactly as passed in.
-  std::string key_common_;
-  std::shared_ptr<typename TForward::primitive_desc> fwd_pd_;
-  std::shared_ptr<typename TBackward::primitive_desc> bwd_pd_;
-};
-
-// TODO(grygielski) this class will be deleted later.
-//
-// Non-templated caching helper. Memory objects and reorder primitives are
-// stored in / fetched from the device context through GetBlob()/SetBlob()
-// under "<key_><suffix>". key_ is the base key, with
-// "-t:<ThreadIDasStr()>" appended when the current MKL-DNN session id is the
-// default one; key_common_ is the base key as given.
-class MKLDNNHandler {
- public:
-  MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-                const std::string& base_key)
-      : dev_ctx_(dev_ctx), engine_(engine), key_common_(base_key) {
-    if (platform::get_cur_mkldnn_session_id() !=
-        platform::kMKLDNNSessionID_Default) {
-      key_ = key_common_;
-    } else {
-      key_ = key_common_ + "-t:" + ThreadIDasStr();
-    }
-  }
-
-  // The four helpers below forward to AcquireMemory(md, ptr, suffix) with a
-  // fixed suffix per role.
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
-  }
-
-  // Returns the memory object stored under "<key_><suffix>". A stored object
-  // only gets its data handle redirected to `ptr`; otherwise a new object is
-  // created from the primitive descriptor `mdp` and `ptr` and stored.
-  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
-      mkldnn::memory::primitive_desc mdp, void* ptr,
-      const std::string& suffix) {
-    auto local_key = key_ + suffix;
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    if (mem_p == nullptr) {
-      mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-    }
-    return mem_p;
-  }
-
-  // This incarnation of AcquireMemory can call user function eg. custom reorder
-  // or preprocessing routine if needed
-  //
-  // On a miss, a non-empty `custom_func` is invoked on `ptr` (viewed as
-  // const float*); its result is stored under
-  // "<key_><suffix>-custom_reorder" and becomes the data of the new memory
-  // object.
-  // NOTE(review): on a hit the data handle is set to the caller's `ptr` and
-  // `custom_func` is not invoked - confirm that callers rely on this.
-  std::shared_ptr<mkldnn::memory> AcquireMemory(
-      const mkldnn::memory::desc& md, void* ptr, const std::string& suffix,
-      user_function custom_func = {}) {
-    /*Generate key*/
-    auto local_key = key_ + suffix;
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    if (mem_p == nullptr) {
-      // Call custom reorder/preprocessing func if available
-      if (custom_func) {
-        auto reordered_data = custom_func(reinterpret_cast<const float*>(ptr));
-        dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data);
-        ptr = reinterpret_cast<void*>(reordered_data.get());
-      }
-
-      mem_p = std::make_shared<mkldnn::memory>(
-          mkldnn::memory::primitive_desc{md, engine_}, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-    }
-    return mem_p;
-  }
-
-  // Same lookup as above, but on a miss the memory descriptor is assembled
-  // from `dims`, `dtype` and `fmt`.
-  std::shared_ptr<mkldnn::memory> AcquireMemory(
-      const std::vector<int>& dims, const mkldnn::memory::data_type dtype,
-      const MKLDNNMemoryFormat& fmt, void* ptr, const std::string& suffix) {
-    /*Generate key*/
-    auto local_key = key_ + suffix;
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    if (mem_p == nullptr) {
-      auto md = mkldnn::memory::desc(dims, dtype, fmt);
-
-      mem_p = std::make_shared<mkldnn::memory>(
-          mkldnn::memory::primitive_desc{md, engine_}, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-    }
-    return mem_p;
-  }
-
-  // Appends to `pipeline` a reorder from `user_memory_p` to
-  // `target_memory_p`, reusing the reorder stored under
-  // "<key_><suffix>reorder_p" when there is one and storing a new one
-  // otherwise. Always returns `target_memory_p`.
-  std::shared_ptr<mkldnn::memory> AcquireMemory(
-      const std::shared_ptr<mkldnn::memory>& user_memory_p,
-      const std::shared_ptr<mkldnn::memory>& target_memory_p,
-      const std::string& suffix,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto key_reorder_p = key_ + suffix + "reorder_p";
-
-    auto stored_reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-        dev_ctx_.GetBlob(key_reorder_p));
-
-    if (stored_reorder_p) {
-      pipeline.push_back(*stored_reorder_p);
-    } else {
-      auto reorder_p =
-          std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
-      dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-      pipeline.push_back(*reorder_p);
-    }
-
-    return target_memory_p;
-  }
-
-  // Returns the memory to be used with the descriptor `mpd`.
-  //
-  // Miss (nothing stored under "<key_><suffix>"): when `mpd` equals
-  // `user_mpd` the user memory itself is stored and returned. Otherwise a
-  // new memory is created from `mpd`, a reorder from the user memory into it
-  // is created (with output scales `scale_data`/`mask` when `is_INT8`),
-  // stored under "<key_><suffix>reorder_p" and appended to `pipeline`.
-  // Hit: the stored memory is returned; unless `is_persistent`, a stored
-  // reorder (if any) is appended to `pipeline` again.
-  std::shared_ptr<mkldnn::memory> AcquireMemory(
-      mkldnn::memory::primitive_desc& mpd,       // NOLINT
-      mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      const std::string& suffix,
-      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
-      bool is_persistent = false, bool is_INT8 = false,
-      std::vector<float> scale_data = {1.0f}, int mask = 0) {
-    // create reorder primitive if the input format is not the preferred one
-    auto local_key = key_ + suffix;
-    auto key_reorder_p = local_key + "reorder_p";
-
-    auto target_memory_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    if (target_memory_p == nullptr) {
-      target_memory_p = user_memory_p;
-      if (mpd != user_mpd) {
-        target_memory_p = std::make_shared<mkldnn::memory>(mpd);
-        std::shared_ptr<mkldnn::reorder> reorder_p;
-        if (is_INT8) {
-          mkldnn::primitive_attr
-              attri;  // attribute for int8 weights and bias data reorder.
-          attri.set_output_scales(mask, scale_data);
-
-          mkldnn::reorder::primitive_desc reorder_pd(user_mpd, mpd, attri);
-          reorder_p = std::make_shared<mkldnn::reorder>(
-              reorder_pd, *user_memory_p, *target_memory_p);
-        } else {
-          reorder_p = std::make_shared<mkldnn::reorder>(*user_memory_p,
-                                                        *target_memory_p);
-        }
-        dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-        pipeline.push_back(*reorder_p);
-      }
-      dev_ctx_.SetBlob(local_key, target_memory_p);
-    } else if (!is_persistent) {
-      // Make reorder if needed
-      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-          dev_ctx_.GetBlob(key_reorder_p));
-      if (reorder_p != nullptr) {
-        pipeline.push_back(*reorder_p);
-      }
-    }
-    return target_memory_p;
-  }
-
- protected:
-  const MKLDNNDeviceContext& dev_ctx_;
-  mkldnn::engine engine_;
-  // Base key, possibly extended with the thread id (see constructor).
-  std::string key_;
-  // Base key exactly as passed in.
-  std::string key_common_;
-};
-
-// Handler for the MKL-DNN sum primitive; stores the sum primitive
-// descriptor, the sum primitive and its memories in the device context.
-class SumMKLDNNHandler : public MKLDNNHandler {
- public:
-  SumMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx,
-                   mkldnn::engine engine, const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
-
-  // Returns the sum primitive descriptor stored under "<key_>@sum_pd". On a
-  // miss it is built from `dst_md`, `scales` and the primitive descriptors
-  // of `src_mems`, then stored.
-  std::shared_ptr<mkldnn::sum::primitive_desc> AcquireSumPrimitiveDescriptor(
-      const std::vector<std::shared_ptr<mkldnn::memory>>& src_mems,
-      const std::vector<float>& scales, const mkldnn::memory::desc& dst_md) {
-    const std::string pd_key = key_ + "@sum_pd";
-
-    sum_pd_ = std::static_pointer_cast<mkldnn::sum::primitive_desc>(
-        dev_ctx_.GetBlob(pd_key));
-    if (sum_pd_ != nullptr) {
-      return sum_pd_;
-    }
-
-    // Collect the primitive descriptor of every input memory.
-    std::vector<mkldnn::memory::primitive_desc> input_pds;
-    for (const auto& src_mem : src_mems) {
-      input_pds.push_back(src_mem->get_primitive_desc());
-    }
-
-    sum_pd_ = std::make_shared<mkldnn::sum::primitive_desc>(dst_md, scales,
-                                                            input_pds);
-    dev_ctx_.SetBlob(pd_key, sum_pd_);
-    return sum_pd_;
-  }
-
-  // Wraps `ptr` in a memory object described by the sum descriptor's
-  // destination. The sum descriptor must have been acquired before.
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
-    auto dst_pd = sum_pd_->dst_primitive_desc();
-    return this->AcquireMemoryFromPrimitive(dst_pd, ptr, "@dst_mem_p");
-  }
-
-  // Memory object for the second input, kept under its own suffix.
-  std::shared_ptr<mkldnn::memory> AcquireSecondSrcMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_src2_mem_p");
-  }
-
-  // Returns the sum primitive stored under "<key_>@sum_p". On a miss it is
-  // built from the sum descriptor, `*inputs` and `*dst_memory`, then stored.
-  std::shared_ptr<mkldnn::sum> AcquireSum(
-      std::shared_ptr<mkldnn::memory> dst_memory,
-      std::vector<mkldnn::primitive::at>* inputs) {
-    const auto sum_key = key_ + "@sum_p";
-    auto stored_sum =
-        std::static_pointer_cast<mkldnn::sum>(dev_ctx_.GetBlob(sum_key));
-    if (stored_sum != nullptr) {
-      return stored_sum;
-    }
-    auto fresh_sum =
-        std::make_shared<mkldnn::sum>(*sum_pd_, *inputs, *dst_memory);
-    dev_ctx_.SetBlob(sum_key, fresh_sum);
-    return fresh_sum;
-  }
-
- private:
-  std::shared_ptr<mkldnn::sum::primitive_desc> sum_pd_;
-};
-
-// Handler for element-wise activations (mkldnn::eltwise_forward /
-// mkldnn::eltwise_backward). Both constructors derive the key from `dims`,
-// `algorithm`, `fmt`, `alpha`, `beta` and `unique_name`.
-template <typename T>
-class ActivationMKLDNNHandler
-    : public MKLDNNHandlerT<T, mkldnn::eltwise_forward,
-                            mkldnn::eltwise_backward> {
- public:
-  // Forward constructor: acquires the forward primitive descriptor, using
-  // forward_inference when `is_test` is set and forward_training otherwise.
-  ActivationMKLDNNHandler(const std::vector<int>& dims,
-                          mkldnn::algorithm algorithm, float alpha, float beta,
-                          const MKLDNNMemoryFormat fmt, bool is_test,
-                          const platform::MKLDNNDeviceContext& dev_ctx,
-                          platform::Place cpu_place,
-                          const std::string& unique_name)
-
-      : platform::MKLDNNHandlerT<T, mkldnn::eltwise_forward,
-                                 mkldnn::eltwise_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, algorithm, fmt, alpha, beta,
-                                unique_name)) {
-    auto data_md =
-        mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
-    auto propagation = is_test ? mkldnn::prop_kind::forward_inference
-                               : mkldnn::prop_kind::forward_training;
-
-    this->AcquireForwardPrimitiveDescriptor(propagation, algorithm, data_md,
-                                            alpha, beta);
-  }
-
-  // Backward constructor: acquires the backward primitive descriptor from a
-  // diff-destination descriptor in `diff_fmt` and a source descriptor in
-  // `fmt`.
-  ActivationMKLDNNHandler(const std::vector<int>& dims,
-                          mkldnn::algorithm algorithm, float alpha, float beta,
-                          const MKLDNNMemoryFormat fmt,
-                          const MKLDNNMemoryFormat diff_fmt,
-                          const platform::MKLDNNDeviceContext& dev_ctx,
-                          platform::Place cpu_place,
-                          const std::string& unique_name)
-
-      : platform::MKLDNNHandlerT<T, mkldnn::eltwise_forward,
-                                 mkldnn::eltwise_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, algorithm, fmt, alpha, beta,
-                                unique_name)) {
-    auto grad_out_md = platform::MKLDNNMemDesc(
-        dims, platform::MKLDNNGetDataType<T>(), diff_fmt);
-    auto input_md =
-        platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType<T>(), fmt);
-
-    this->AcquireBackwardPrimitiveDescriptor(algorithm, grad_out_md, input_md,
-                                             alpha, beta);
-  }
-
-  // Wraps the data of `input` using the backward primitive descriptor's
-  // source.
-  std::shared_ptr<mkldnn::memory> AcquireBackwardSrcMemory(
-      const framework::Tensor* input) {
-    const T* src_data = input->data<T>();
-    auto src_pd = this->bwd_pd_->src_primitive_desc();
-    return this->AcquireMemoryFromPrimitive(src_pd, to_void_cast<T>(src_data),
-                                            "@bwd-src_mem_p");
-  }
-};
-
-// Handler for local response normalization (mkldnn::lrn_forward /
-// mkldnn::lrn_backward, lrn_across_channels). Both constructors derive the
-// key from `dims`, `n`, `alpha`, `beta`, `k`, `fmt` and `unique_name`.
-template <typename T>
-class LRNMKLDNNHandler
-    : public MKLDNNHandlerT<T, mkldnn::lrn_forward, mkldnn::lrn_backward> {
- public:
-  // Forward constructor: acquires the forward primitive descriptor, using
-  // forward_inference when `is_test` is set and forward_training otherwise.
-  LRNMKLDNNHandler(const std::vector<int>& dims, const int n, const float alpha,
-                   const float beta, const float k,
-                   const MKLDNNMemoryFormat fmt, bool is_test,
-                   const platform::MKLDNNDeviceContext& dev_ctx,
-                   platform::Place cpu_place, const std::string& unique_name)
-
-      : platform::MKLDNNHandlerT<T, mkldnn::lrn_forward, mkldnn::lrn_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, n, alpha, beta, k, fmt, unique_name)) {
-    auto src_md =
-        mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
-    this->AcquireForwardPrimitiveDescriptor(
-        is_test ? mkldnn::prop_kind::forward_inference
-                : mkldnn::prop_kind::forward_training,
-        mkldnn::lrn_across_channels, src_md, n, alpha, beta, k);
-  }
-
-  // Backward constructor: acquires the backward primitive descriptor from a
-  // source descriptor in `fmt` and a diff descriptor in `diff_fmt`.
-  LRNMKLDNNHandler(const std::vector<int>& dims, const int n, const float alpha,
-                   const float beta, const float k,
-                   const MKLDNNMemoryFormat fmt,
-                   const MKLDNNMemoryFormat diff_fmt,
-                   const platform::MKLDNNDeviceContext& dev_ctx,
-                   platform::Place cpu_place, const std::string& unique_name)
-
-      : platform::MKLDNNHandlerT<T, mkldnn::lrn_forward, mkldnn::lrn_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, n, alpha, beta, k, fmt, unique_name)) {
-    auto src_md =
-        mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
-    auto diff_md =
-        mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), diff_fmt);
-
-    this->AcquireBackwardPrimitiveDescriptor(
-        mkldnn::lrn_across_channels, src_md, diff_md, n, alpha, beta, k);
-  }
-
-  // Allocates `workspace` and wraps the buffer in a memory object described
-  // by the forward primitive descriptor's workspace.
-  //
-  // The buffer is sized from the workspace descriptor itself, i.e. the same
-  // descriptor the memory object is created from, rather than from the
-  // destination descriptor, so the allocation always matches what the
-  // memory object describes.
-  std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(
-      framework::Tensor* workspace) {
-    const auto workspace_pd = this->fwd_pd_->workspace_primitive_desc();
-    T* ptr = workspace->mutable_data<T>(this->place_, workspace_pd.get_size());
-    return this->AcquireMemoryFromPrimitive(workspace_pd, ptr, "@wrk_mem_p");
-  }
-
-  // Wraps the existing data of `workspace` (read-only tensor) for use by the
-  // backward primitive, again described by the forward workspace descriptor.
-  std::shared_ptr<mkldnn::memory> AcquireBackwardWorkspaceMemory(
-      const framework::Tensor* workspace) {
-    const T* workspace_data = workspace->data<T>();
-    return this->AcquireMemoryFromPrimitive(
-        this->fwd_pd_->workspace_primitive_desc(),
-        to_void_cast<T>(workspace_data), "@bwd-wrk_mem_p");
-  }
-};
-
-// Handler for pooling (mkldnn::pooling_forward / mkldnn::pooling_backward).
-// `pooling_type` equal to "max" selects pooling_max; every other value
-// selects pooling_avg.
-template <typename T>
-class PoolingMKLDNNHandler : public MKLDNNHandlerT<T, mkldnn::pooling_forward,
-                                                   mkldnn::pooling_backward> {
- public:
-  // Forward constructor. The key is derived from `src_dims`, `pooling_type`,
-  // `ksize`, `strides`, `paddings`, `dt`, `fmt` and `unique_name`.
-  PoolingMKLDNNHandler(
-      const std::vector<int>& src_dims, const std::vector<int>& dst_dims,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, const std::string& pooling_type,
-      bool ceil_mode, const MKLDNNMemoryFormat fmt,
-      mkldnn::memory::data_type dt, bool is_test,
-      const platform::MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place,
-      const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::pooling_forward,
-                                 mkldnn::pooling_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(src_dims, pooling_type, ksize, strides,
-                                paddings, dt, fmt, unique_name)) {
-    auto input_md = mkldnn::memory::desc(src_dims, dt, fmt);
-    /* The destination descriptor is created with format 'any', which lets
-     * the pooling primitive choose the memory format it prefers for best
-     * performance.
-     */
-    auto output_md =
-        platform::MKLDNNMemDesc(dst_dims, dt, MKLDNNMemoryFormat::any);
-
-    // Both padding vectors start as copies of `paddings`; in ceil mode the
-    // right/bottom one may be enlarged.
-    std::vector<int> pad_begin(paddings);
-    std::vector<int> pad_end(paddings);
-    if (ceil_mode) {
-      CorrectOutputSize(src_dims, dst_dims, ksize, paddings, strides, pad_end);
-    }
-
-    auto propagation = is_test ? mkldnn::prop_kind::forward_inference
-                               : mkldnn::prop_kind::forward_training;
-    auto pooling_algo = pooling_type == "max" ? mkldnn::algorithm::pooling_max
-                                              : mkldnn::algorithm::pooling_avg;
-
-    this->AcquireForwardPrimitiveDescriptor(
-        propagation, pooling_algo, input_md, output_md, strides, ksize,
-        pad_begin, pad_end, mkldnn::padding_kind::zero);
-  }
-
-  // Backward constructor. The key is derived from `diff_src_dims`,
-  // `pooling_type`, `ksize`, `strides`, `paddings`, `dt`, `fmt` and
-  // `unique_name`. `paddings` is passed unchanged for both padding sides;
-  // `ceil_mode` is not consulted here.
-  PoolingMKLDNNHandler(
-      const std::vector<int>& diff_dst_dims,
-      const std::vector<int>& diff_src_dims, const std::vector<int>& ksize,
-      const std::vector<int>& strides, const std::vector<int>& paddings,
-      const std::string& pooling_type, bool ceil_mode,
-      const MKLDNNMemoryFormat fmt, const MKLDNNMemoryFormat diff_dst_fmt,
-      mkldnn::memory::data_type dt,
-      const platform::MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place,
-      const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::pooling_forward,
-                                 mkldnn::pooling_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(diff_src_dims, pooling_type, ksize, strides,
-                                paddings, dt, fmt, unique_name)) {
-    auto grad_out_md = mkldnn::memory::desc(
-        diff_dst_dims, platform::MKLDNNGetDataType<T>(), diff_dst_fmt);
-    auto grad_in_md =
-        mkldnn::memory::desc(diff_src_dims, platform::MKLDNNGetDataType<T>(),
-                             MKLDNNMemoryFormat::any);
-    auto pooling_algo = pooling_type == "max" ? mkldnn::algorithm::pooling_max
-                                              : mkldnn::algorithm::pooling_avg;
-
-    this->AcquireBackwardPrimitiveDescriptor(
-        pooling_algo, grad_in_md, grad_out_md, strides, ksize, paddings,
-        paddings, mkldnn::padding_kind::zero);
-  }
-
-  // Returns the workspace memory stored under "<key_common_>@workspace",
-  // creating it from the forward workspace descriptor when missing.
-  std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(void) {
-    mkldnn::memory::primitive_desc workspace_mpd =
-        this->fwd_pd_->workspace_primitive_desc();
-    // Pooling PD has to be passed to Grad op that
-    // may be executed by different thread, hence
-    // for that one we use key that does not contain TID
-    const auto workspace_key = this->key_common_ + "@workspace";
-    auto workspace_mem = std::static_pointer_cast<mkldnn::memory>(
-        this->dev_ctx_.GetBlob(workspace_key));
-    if (workspace_mem != nullptr) {
-      return workspace_mem;
-    }
-
-    // Not stored yet: take the lock and look again before creating it.
-    static std::mutex acquire_barrier;
-    std::lock_guard<std::mutex> block_threads_until_finish_this_job(
-        acquire_barrier);
-    workspace_mem = std::static_pointer_cast<mkldnn::memory>(
-        this->dev_ctx_.GetBlob(workspace_key));
-    if (workspace_mem == nullptr) {
-      workspace_mem = std::make_shared<mkldnn::memory>(workspace_mpd);
-      this->dev_ctx_.SetBlob(workspace_key, workspace_mem);
-    }
-    return workspace_mem;
-  }
-
- private:
-  // Output extent along one axis using truncating integer division:
-  // (input - kernel + 2 * padding) / stride + 1.
-  static inline int ComputeCeiledOutput(int input_size, int kernel_size,
-                                        int padding, int stride) {
-    const int span = input_size - kernel_size + 2 * padding;
-    return span / stride + 1;
-  }
-
-  // For every spatial axis i (tensor axis i + 2): when the extent computed
-  // by ComputeCeiledOutput differs from dst_tz[i + 2], the right/bottom
-  // padding of that axis is increased by strides[i] - 1.
-  static inline void CorrectOutputSize(
-      const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
-      const std::vector<int>& kernel_size, const std::vector<int>& paddings,
-      const std::vector<int>& strides,
-      std::vector<int>& right_bot_padding) {  // NOLINT
-    const size_t axes = right_bot_padding.size();
-    for (size_t axis = 0; axis < axes; ++axis) {
-      const int computed = ComputeCeiledOutput(
-          src_tz[axis + 2], kernel_size[axis], paddings[axis], strides[axis]);
-      if (computed == dst_tz[axis + 2]) {
-        continue;
-      }
-      right_bot_padding[axis] += strides[axis] - 1;
-    }
-  }
-};
-
-// Handler for transpose, carried out as an mkldnn::reorder from a source
-// memory into a destination memory whose strides are permuted according to
-// `axis`. Memories and the reorder are stored in the device context. Memory
-// descriptors built by Axis2MemoryDesc use the f32 data type.
-class TransposeMKLDNNHandler : public MKLDNNHandler {
- public:
-  TransposeMKLDNNHandler(std::vector<int>& dims,  // NOLINT
-                         std::vector<int>& axis,  // NOLINT
-                         const platform::MKLDNNDeviceContext& dev_ctx,
-                         mkldnn::engine engine, const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
-        dims_(dims),
-        axis_(axis),
-        logical_axis_(dims.size(), 0) {}
-
-  // Returns the source memory stored under "<key_>@user_src_mem_p". A stored
-  // object only gets its data handle redirected to `ptr`.
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const MKLDNNMemoryFormat& fmt, void* ptr) {
-    const auto src_key = key_ + "@user_src_mem_p";
-    auto src_mem =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(src_key));
-    if (src_mem != nullptr) {
-      src_mem->set_data_handle(ptr);
-      return src_mem;
-    }
-
-    // Make memory descriptor using input format, unless it
-    // cannot be trusted (nchw) then make up memory fmt manually
-    for (size_t pos = 0; pos < logical_axis_.size(); ++pos) {
-      logical_axis_[pos] = pos;  // identity permutation
-    }
-    auto src_md = fmt == MKLDNNMemoryFormat::nchw
-                      ? Axis2MemoryDesc(dims_, logical_axis_)
-                      : platform::MKLDNNMemDesc(
-                            dims_, platform::MKLDNNGetDataType<float>(), fmt);
-    src_mem = std::make_shared<mkldnn::memory>(
-        mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
-    dev_ctx_.SetBlob(src_key, src_mem);
-    return src_mem;
-  }
-
-  // Returns the destination memory stored under "<key_>@user_dst_mem_p".
-  // On a miss `output` is allocated with the size of the permuted
-  // descriptor; on a hit it is allocated without an explicit size and the
-  // stored object's data handle is redirected.
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output,
-                                                   platform::Place place) {
-    const auto dst_key = key_ + "@user_dst_mem_p";
-    auto dst_mem =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(dst_key));
-    if (dst_mem != nullptr) {
-      auto dst_buffer = output->mutable_data<float>(place);
-      dst_mem->set_data_handle(dst_buffer);
-      return dst_mem;
-    }
-
-    auto dst_mdp = mkldnn::memory::primitive_desc{
-        Axis2MemoryDesc(dims_, axis_), engine_};
-    auto dst_buffer = output->mutable_data<float>(place, dst_mdp.get_size());
-
-    dst_mem = std::make_shared<mkldnn::memory>(dst_mdp, dst_buffer);
-    dev_ctx_.SetBlob(dst_key, dst_mem);
-    return dst_mem;
-  }
-
-  // Returns the reorder stored under "<key_>@transpose_p", creating it from
-  // `*src_memory_p` to `*dst_memory_p` when missing.
-  std::shared_ptr<mkldnn::reorder> AcquireTranspose(
-      std::shared_ptr<mkldnn::memory> dst_memory_p,
-      std::shared_ptr<mkldnn::memory> src_memory_p) {
-    const auto transpose_key = key_ + "@transpose_p";
-    auto stored_reorder = std::static_pointer_cast<mkldnn::reorder>(
-        dev_ctx_.GetBlob(transpose_key));
-    if (stored_reorder != nullptr) {
-      return stored_reorder;
-    }
-    auto fresh_reorder =
-        std::make_shared<mkldnn::reorder>(*src_memory_p, *dst_memory_p);
-    dev_ctx_.SetBlob(transpose_key, fresh_reorder);
-    return fresh_reorder;
-  }
-
- protected:
-  // Fills a blocked f32 memory descriptor for the logical dimensions
-  // `nchw_tz`. Walking the positions from last to first, the stride written
-  // for dimension axis[i] is the product of nchw_tz[axis[j]] over all later
-  // positions j > i.
-  mkldnn_memory_desc_t Axis2MemoryDesc(std::vector<int>& nchw_tz,  // NOLINT
-                                       std::vector<int>& axis      // NOLINT
-                                       ) {
-    mkldnn_memory_desc_t desc;
-
-    desc.primitive_kind = mkldnn_memory;
-    desc.ndims = axis.size();
-    desc.data_type = mkldnn_f32;
-    desc.format = mkldnn_blocked;
-    // logical dimensions (nchw format, regardless physical layout)
-    for (unsigned int d = 0; d < nchw_tz.size(); ++d) {
-      desc.dims[d] = nchw_tz[d];
-    }
-
-    auto& blocking = desc.layout_desc.blocking;
-    blocking.offset_padding = 0;  // no initial offset
-
-    unsigned int running_stride = 1;
-    for (int d = nchw_tz.size() - 1; d >= 0; --d) {
-      const int permuted = axis[d];
-      // logical dimensions (nchw format, regardless physical layout)
-      blocking.padding_dims[d] = nchw_tz[d];
-      blocking.block_dims[d] = 1;
-      blocking.offset_padding_to_data[d] = 0;  // no offset
-      blocking.strides[0][permuted] = running_stride;
-      blocking.strides[1][permuted] = 1;
-      running_stride *= nchw_tz[permuted];
-    }
-    return desc;
-  }
-
- private:
-  std::vector<int> dims_;
-  std::vector<int> axis_;
-  // Filled with 0, 1, 2, ... when the source memory is first created.
-  std::vector<int> logical_axis_;
-};
-
-// Handler for a plain mkldnn::reorder between two memories that share the
-// dimensions `dims` and the MKL-DNN data type `dtype`; `vtype` is the
-// framework type used when allocating the output tensor.
-class ReorderMKLDNNHandler : public MKLDNNHandler {
- public:
-  ReorderMKLDNNHandler(std::vector<int>& dims,  // NOLINT
-                       framework::proto::VarType::Type vtype,
-                       mkldnn::memory::data_type dtype,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
-                       mkldnn::engine engine, const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
-        dims_(dims),
-        vtype_(vtype),
-        dtype_(dtype) {}
-
-  // Source memory in format `fmt` over `ptr`, stored under
-  // "<key_>@user_src_mem_p".
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const MKLDNNMemoryFormat& fmt, void* ptr) {
-    return this->AcquireMemory(dims_, dtype_, fmt, ptr, "@user_src_mem_p");
-  }
-
-  // Destination memory stored under "<key_>@user_dst_mem_p". `output` is
-  // allocated on `place` with type vtype_ in both the hit and the miss case;
-  // on a hit only the stored object's data handle is redirected.
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
-      framework::Tensor* output, const MKLDNNMemoryFormat& fmt,
-      platform::Place place) {
-    const auto dst_key = key_ + "@user_dst_mem_p";
-    auto dst_mem =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(dst_key));
-    if (dst_mem != nullptr) {
-      auto dst_buffer = output->mutable_data(place, vtype_);
-      dst_mem->set_data_handle(dst_buffer);
-      return dst_mem;
-    }
-
-    auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt);
-    auto dst_mdp = mkldnn::memory::primitive_desc{dst_md, engine_};
-    auto dst_buffer = output->mutable_data(place, vtype_);
-
-    dst_mem = std::make_shared<mkldnn::memory>(dst_mdp, dst_buffer);
-    dev_ctx_.SetBlob(dst_key, dst_mem);
-    return dst_mem;
-  }
-
-  // Returns the reorder stored under "<key_>@reorder_p", creating it from
-  // `*src_memory_p` to `*dst_memory_p` when missing.
-  std::shared_ptr<mkldnn::reorder> AcquireReorder(
-      std::shared_ptr<mkldnn::memory> dst_memory_p,
-      std::shared_ptr<mkldnn::memory> src_memory_p) {
-    const auto reorder_key = key_ + "@reorder_p";
-    auto stored_reorder = std::static_pointer_cast<mkldnn::reorder>(
-        dev_ctx_.GetBlob(reorder_key));
-    if (stored_reorder != nullptr) {
-      return stored_reorder;
-    }
-    auto fresh_reorder =
-        std::make_shared<mkldnn::reorder>(*src_memory_p, *dst_memory_p);
-    dev_ctx_.SetBlob(reorder_key, fresh_reorder);
-    return fresh_reorder;
-  }
-
- private:
-  std::vector<int> dims_;
-  framework::proto::VarType::Type vtype_;
-  mkldnn::memory::data_type dtype_;
-};
-
-template <typename T>
-struct convolutional_algorithm;
-
-template <>
-struct convolutional_algorithm<mkldnn::convolution_forward> {
-  static constexpr mkldnn::algorithm T = mkldnn::algorithm::convolution_direct;
-};
-
-template <>
-struct convolutional_algorithm<mkldnn::deconvolution_forward> {
-  static constexpr mkldnn::algorithm T =
-      mkldnn::algorithm::deconvolution_direct;
-};
-
-template <class forward_t, class backward_data_t, class backward_weights_t>
-class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
- public:
-  ConvMKLDNNTemplateHandler(const platform::MKLDNNDeviceContext& dev_ctx,
-                            mkldnn::engine engine, const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
-
-  ConvMKLDNNTemplateHandler(
-      std::shared_ptr<typename forward_t::primitive_desc> conv_pd,
-      std::shared_ptr<typename backward_data_t::primitive_desc>
-          conv_bwd_data_pd,
-      std::shared_ptr<typename backward_weights_t::primitive_desc>
-          conv_bwd_weights_pd,
-      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-      const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
-        conv_pd_(conv_pd),
-        conv_bwd_weights_pd_(conv_bwd_weights_pd),
-        conv_bwd_data_pd_(conv_bwd_data_pd) {
-    // If we are in Grad operatgor then update a key with BWD suffix to
-    // distinguish from FWD memory primitives
-    key_ += "-BWD";
-  }
-
-  size_t GetDstMemorySize() const {
-    return conv_pd_->dst_primitive_desc().get_size();
-  }
-
-  MKLDNNMemoryFormat GetDstFormat() const {
-    return static_cast<MKLDNNMemoryFormat>(
-        conv_pd_->dst_primitive_desc().desc().data.format);
-  }
-
-  size_t GetDiffWeightsMemorySize() const {
-    return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
-  }
-
-  size_t GetDiffSourceMemorySize() const {
-    return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(src_pd, user_pd, user_memory_p,
-                               "@weights-src_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
-                               "@weights-diff_dst_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemoryFromWeightsPrimitive(
-      void* ptr) {
-    return this->AcquireMemoryFromPrimitive(
-        conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr,
-        "@diff_weights_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
-                               "@data-diff_dst_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
-    auto user_pd = user_weights_memory_p->get_primitive_desc();
-    return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
-                               "@data-weights_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireResidualDataMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromResidualDataMemory(
-      const std::shared_ptr<mkldnn::memory>& user_residual_memory_p,
-      void* dst_ptr,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    return this->AcquireMemory(user_residual_memory_p,
-                               this->AcquireDstMemoryFromPrimitive(dst_ptr),
-                               "@residual_data_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
-      void* ptr) {
-    return this->AcquireMemoryFromPrimitive(
-        conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
-    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
-                                            "@dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto src_pd = conv_pd_->src_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
-                               pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
-      const mkldnn::memory::desc& md, void* ptr,
-      user_function custom_func = {}) {
-    return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireBiasMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_bias_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
-      bool is_persistent = false, bool is_INT8 = false,
-      std::vector<float> scale_data = {1.0f}, int mask = 0) {
-    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
-    auto weights_pd = conv_pd_->weights_primitive_desc();
-    return this->AcquireMemory(
-        weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p",
-        pipeline, is_persistent, is_INT8, scale_data, mask);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
-      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
-      bool is_persistent = false, bool is_INT8 = false,
-      std::vector<float> scale_data = {1.0f},
-      int mask = 0) {  // NOLINT
-    auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
-    auto bias_pd = conv_pd_->bias_primitive_desc();
-    return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
-                               "@bias_mem_p", pipeline, is_persistent, is_INT8,
-                               scale_data, mask);
-  }
-
-  mkldnn::primitive_attr CreatePostOps(
-      std::string fuse_activation, float fuse_alpha, float fuse_beta,
-      bool fuse_residual_conn, const std::vector<float> output_shift_scale = {},
-      float sum_scale = 1.0f) const {
-    mkldnn::primitive_attr conv_attr;
-    mkldnn::post_ops post_operations;
-    if (output_shift_scale.size() > 0) {
-      int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
-      conv_attr.set_output_scales(mask, output_shift_scale);
-    }
-    // Fusion with Elementwise layer relies on adding a sum post-operation with
-    // the scale parameter. It is assumed that when fuse_residual_connection is
-    // true, the output tensor contains the data coming from residual
-    // connection. The result of this post_op is:
-    // Output = scale * Output + Conv_Out.
-    if (fuse_residual_conn) {
-      post_operations.append_sum(sum_scale);
-    }
-    // Fusion with ReLU layer is executed through the PostOps feature. Create a
-    // PostOps object and configure it to execute an eltwise relu operation.
-    if (fuse_activation == "relu" || fuse_activation == "leaky_relu") {
-      constexpr float scale = 1.0f;
-      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
-                                     fuse_alpha, fuse_beta);
-    }
-
-    if (fuse_activation == "relu6") {
-      constexpr float scale = 1.0f;
-      post_operations.append_eltwise(scale,
-                                     mkldnn::algorithm::eltwise_bounded_relu,
-                                     fuse_alpha, fuse_beta);
-    }
-    conv_attr.set_post_ops(post_operations);
-    return conv_attr;
-  }
-
-  std::shared_ptr<typename forward_t::primitive_desc>
-  AcquireConvolutionPrimitiveDescriptor(
-      const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
-      boost::optional<const mkldnn::memory::desc&> bias,
-      const mkldnn::memory::desc& dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings, const mkldnn::engine& engine,
-      const std::string& fuse_activation, float fuse_alpha, float fuse_beta,
-      const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind,
-      const std::vector<float> output_shift_scale = {},
-      const float sum_scale = 1.0f) {
-    // Conv PD has to be passed to Grad op that
-    // may be exxecuted by diffrent thread, hence
-    // for that one we use key that does not contain TID
-    const std::string key_conv_pd = key_common_ + "@conv_pd";
-
-    conv_pd_ = std::static_pointer_cast<typename forward_t::primitive_desc>(
-        dev_ctx_.GetBlob(key_conv_pd));
-
-    if (conv_pd_ == nullptr) {
-      static std::mutex acquire_barrier;
-      std::lock_guard<std::mutex> block_threads_until_finish_this_job(
-          acquire_barrier);
-
-      conv_pd_ = std::static_pointer_cast<typename forward_t::primitive_desc>(
-          dev_ctx_.GetBlob(key_conv_pd));
-      if (conv_pd_ == nullptr) {
-        mkldnn::memory::dims stride_dims = strides;
-        mkldnn::memory::dims padding_dims = paddings;
-
-        auto conv_desc =
-            bias ? typename forward_t::desc(
-                       fwd_prop_kind, convolutional_algorithm<forward_t>::T,
-                       src, weights, *bias, dst, stride_dims, padding_dims,
-                       padding_dims, mkldnn::padding_kind::zero)
-                 : typename forward_t::desc(
-                       fwd_prop_kind, convolutional_algorithm<forward_t>::T,
-                       src, weights, dst, stride_dims, padding_dims,
-                       padding_dims, mkldnn::padding_kind::zero);
-
-        mkldnn::primitive_attr conv_attr =
-            CreatePostOps(fuse_activation, fuse_alpha, fuse_beta,
-                          fuse_residual_conn, output_shift_scale, sum_scale);
-
-        conv_pd_.reset(new typename forward_t::primitive_desc(
-            conv_desc, conv_attr, engine));
-        // Save conv_pd/src_memory/weights_memory for backward pass
-        dev_ctx_.SetBlob(key_conv_pd, conv_pd_);
-      }
-    }
-
-    return conv_pd_;
-  }
-
-  std::shared_ptr<forward_t> AcquireConvolution(
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> weights_memory_p,
-      std::shared_ptr<mkldnn::memory> dst_memory_p) {
-    auto prim_key = key_ + "@conv_p";
-    auto conv_p =
-        std::static_pointer_cast<forward_t>(dev_ctx_.GetBlob(prim_key));
-    if (conv_p == nullptr) {
-      conv_p = std::make_shared<forward_t>(*conv_pd_, *src_memory_p,
-                                           *weights_memory_p, *dst_memory_p);
-
-      dev_ctx_.SetBlob(prim_key, conv_p);
-    }
-    return conv_p;
-  }
-
-  std::shared_ptr<forward_t> AcquireConvolution(
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> weights_memory_p,
-      std::shared_ptr<mkldnn::memory> bias_memory_p,
-      std::shared_ptr<mkldnn::memory> dst_memory_p) {
-    auto prim_key = key_ + "@conv_p";
-    auto conv_p =
-        std::static_pointer_cast<forward_t>(dev_ctx_.GetBlob(prim_key));
-    if (conv_p == nullptr) {
-      conv_p = std::make_shared<forward_t>(*conv_pd_, *src_memory_p,
-                                           *weights_memory_p, *bias_memory_p,
-                                           *dst_memory_p);
-
-      dev_ctx_.SetBlob(prim_key, conv_p);
-    }
-    return conv_p;
-  }
-
-  std::shared_ptr<backward_weights_t> AcquireConvolutionBackwardWeights(
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
-      std::shared_ptr<mkldnn::memory> diff_weights_memory_p) {
-    auto prim_key = key_ + "@conv_bwd_weights_p";
-    auto conv_bwd_weights_p = std::static_pointer_cast<backward_weights_t>(
-        dev_ctx_.GetBlob(prim_key));
-    if (conv_bwd_weights_p == nullptr) {
-      // create backward conv primitive for weights
-      conv_bwd_weights_p = std::make_shared<backward_weights_t>(
-          *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p,
-          *diff_weights_memory_p);
-      dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p);
-    }
-    return conv_bwd_weights_p;
-  }
-
-  std::shared_ptr<backward_data_t> AcquireConvolutionBackwardData(
-      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
-      std::shared_ptr<mkldnn::memory> weights_memory_p,
-      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
-    auto prim_key = key_ + "@conv_bwd_data_p";
-    auto conv_bwd_data_p =
-        std::static_pointer_cast<backward_data_t>(dev_ctx_.GetBlob(prim_key));
-    if (conv_bwd_data_p == nullptr) {
-      conv_bwd_data_p = std::make_shared<backward_data_t>(
-          *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p,
-          *diff_src_memory_p);
-      dev_ctx_.SetBlob(prim_key, conv_bwd_data_p);
-    }
-    return conv_bwd_data_p;
-  }
-
- private:
-  std::shared_ptr<typename forward_t::primitive_desc> conv_pd_;
-  std::shared_ptr<typename backward_weights_t::primitive_desc>
-      conv_bwd_weights_pd_;
-  std::shared_ptr<typename backward_data_t::primitive_desc> conv_bwd_data_pd_;
-};
-
-using ConvMKLDNNHandler =
-    ConvMKLDNNTemplateHandler<mkldnn::convolution_forward,
-                              mkldnn::convolution_backward_data,
-                              mkldnn::convolution_backward_weights>;
-
-using ConvTransposeMKLDNNHandler =
-    ConvMKLDNNTemplateHandler<mkldnn::deconvolution_forward,
-                              mkldnn::deconvolution_backward_data,
-                              mkldnn::deconvolution_backward_weights>;
-
-template <typename T>
-static void SetDstMemoryQuantized(
-    const framework::ExecutionContext& ctx, framework::Tensor* output,
-    std::vector<int> dst_tz, const mkldnn::engine& engine,
-    std::shared_ptr<mkldnn::memory::primitive_desc>& dst_pd,  // NOLINT
-    std::shared_ptr<mkldnn::memory>& dst_memory) {            // NOLINT
-  T* output_data = output->mutable_data<T>(ctx.GetPlace());
-  const size_t dst_dims = dst_tz.size();
-  MKLDNNMemoryFormat dst_fmt;
-  PADDLE_ENFORCE_LE(dst_dims, 5,
-                    "Dst memory for quantization can not have dims > 5");
-  dst_fmt = platform::MKLDNNFormatForSize(dst_dims, MKLDNNMemoryFormat::nhwc);
-
-  auto dst_md = platform::MKLDNNMemDesc(
-      {dst_tz}, paddle::framework::ToMKLDNNDataType(
-                    framework::DataTypeTrait<T>::DataType()),
-      dst_fmt);
-  dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine));
-  dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast<T>(output_data)));
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
deleted file mode 100644
index a843a7680f37f76fbe69d90296561e9b56213e96..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/nccl_helper.h
+++ /dev/null
@@ -1,323 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef _WIN32
-#pragma once
-
-#include <stdio.h>
-#include <memory>
-#include <string>
-#include <thread>  // NOLINT
-#include <typeindex>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/platform/dynload/nccl.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-
-#define NCCL_ID_VARNAME "NCCLID"
-
-namespace paddle {
-namespace platform {
-
-inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
-  if (type == framework::proto::VarType::FP32) {
-    return ncclFloat;
-  } else if (type == framework::proto::VarType::FP64) {
-    return ncclDouble;
-  } else if (type == framework::proto::VarType::INT32) {
-    return ncclInt;
-  } else if (type == framework::proto::VarType::INT64) {
-    return ncclInt64;
-  } else if (type == framework::proto::VarType::FP16) {
-    return ncclFloat16;
-  } else {
-    PADDLE_THROW("Not supported");
-  }
-}
-
-// NOTE(minqiyang): according to the ncclGroupEnd documentations:
-// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
-// ncclGroupEnd will wait for all communicators to be initialized, which will
-// cause blocking problem when a runtime_error was thrown, so try only guard
-// NCCL actions when use it.
-class NCCLGroupGuard {
- public:
-  static std::mutex &NCCLMutex() {
-    static std::mutex mtx;
-    return mtx;
-  }
-
-  inline NCCLGroupGuard() {
-    NCCLMutex().lock();
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart());
-  }
-
-  inline ~NCCLGroupGuard() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd());
-    NCCLMutex().unlock();
-  }
-};
-
-struct NCCLContext {
-  std::unique_ptr<CUDADeviceContext> ctx_;
-  ncclComm_t comm_;
-
-  explicit NCCLContext(int dev_id)
-      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
-
-  cudaStream_t stream() const { return ctx_->stream(); }
-  ncclComm_t comm() const { return comm_; }
-
-  int device_id() const {
-    return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
-  }
-};
-
-struct NCCLContextMap {
-  std::unordered_map<int, NCCLContext> contexts_;
-  std::vector<int> order_;
-
-  explicit NCCLContextMap(const std::vector<platform::Place> &places,
-                          ncclUniqueId *nccl_id = nullptr,
-                          size_t num_trainers = 1, size_t trainer_id = 0) {
-    PADDLE_ENFORCE_EQ(!places.empty(), true);
-    order_.reserve(places.size());
-    for (auto &p : places) {
-      int dev_id = boost::get<CUDAPlace>(p).device;
-      order_.emplace_back(dev_id);
-      contexts_.emplace(dev_id, NCCLContext(dev_id));
-    }
-    PADDLE_ENFORCE_EQ(
-        order_.size(), contexts_.size(),
-        "NCCL Context Map does not support contain two or more same device");
-
-    std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
-    // if num_trainers == 1, should create a new nccl id for local comms.
-    if (num_trainers == 1 && nccl_id == nullptr) {
-      std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
-          comms.get(), static_cast<int>(order_.size()), order_.data()));
-    } else {
-      PADDLE_ENFORCE_NOT_NULL(nccl_id);
-      {
-        int nranks = num_trainers * order_.size();
-        NCCLGroupGuard gurad;
-        for (size_t i = 0; i < order_.size(); ++i) {
-          int gpu_id = order_[i];
-          int rank;
-          if (order_.size() > 1) {
-            rank = trainer_id * order_.size() + i;
-          } else {
-            rank = trainer_id;
-          }
-          VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
-                  << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
-          PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id));
-          PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
-              comms.get() + i, nranks, *nccl_id, rank));
-        }
-      }
-    }
-    int i = 0;
-    for (auto &dev_id : order_) {
-      contexts_.at(dev_id).comm_ = comms[i++];
-    }
-  }
-
-  NCCLContextMap(const NCCLContextMap &other) = delete;
-  NCCLContextMap &operator=(const NCCLContextMap &other) = delete;
-
-  CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
-
-  CUDADeviceContext *DevCtx(platform::Place p) const {
-    return DevCtx(boost::get<CUDAPlace>(p).device);
-  }
-
-  const NCCLContext &at(platform::Place p) const {
-    return this->at(boost::get<CUDAPlace>(p).device);
-  }
-
-  const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); }
-
-  void WaitAll() {
-    for (auto &p : contexts_) {
-      p.second.ctx_->Wait();
-    }
-  }
-};
-
-inline std::string GetFlatNCCLVarName(size_t pos) {
-  if (pos == 0) {
-    return NCCL_ID_VARNAME;
-  }
-  return string::Sprintf("%s_%d", NCCL_ID_VARNAME, static_cast<int>(pos));
-}
-
-inline std::string GetHierarchicalExterNCCLVarName(size_t pos) {
-  return string::Sprintf("Hierarchical_exter_%s_%d", NCCL_ID_VARNAME,
-                         static_cast<int>(pos));
-}
-inline std::string GetHierarchicalInterNCCLVarName(size_t pos) {
-  return string::Sprintf("Hierarchical_inter_%s_%d", NCCL_ID_VARNAME,
-                         static_cast<int>(pos));
-}
-
-class NCCLCommunicator {
- public:
-  NCCLCommunicator() {}
-  virtual ~NCCLCommunicator() {}
-
-  NCCLContextMap *DefaultFlatCtx() const {
-    if (flat_ctxs_.size() == 0) {
-      return nullptr;
-    }
-
-    return flat_ctxs_[0].get();
-  }
-
-  std::vector<std::unique_ptr<NCCLContextMap>> *GetFlatCtxs() {
-    return &flat_ctxs_;
-  }
-
-  NCCLContextMap *GetFlatCtx(size_t run_order) const {
-    return flat_ctxs_[run_order % flat_ctxs_.size()].get();
-  }
-
-  NCCLContextMap *GetRunEnvNCCLCtx(size_t run_order,
-                                   bool use_hierarchical_allreduce) const {
-    if (!use_hierarchical_allreduce) {
-      return GetFlatCtx(run_order);
-    }
-
-    return GetHierarchicalInterCtx(run_order);
-  }
-
-  /*
-   *When nccl inits nccl comm using ncclCommInitAll, it meets error when
-   *allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So
-   *create a new nccl comm for sync_batch_norm_op. And these codes should be
-   *polished with a unified nccl management.
-  */
-  NCCLContextMap *GetSyncBatchNormCtx(
-      framework::Scope *scope, const std::vector<platform::Place> &places) {
-    auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
-    if (nccl_id_var != nullptr) {
-      return DefaultFlatCtx();
-    }
-
-    if (sync_batch_norm_ctx_.get() == nullptr) {
-      sync_batch_norm_ctx_.reset(new NCCLContextMap(places));
-    }
-    return sync_batch_norm_ctx_.get();
-  }
-
-  void InitFlatCtxs(const std::vector<platform::Place> &places,
-                    const std::vector<ncclUniqueId *> &nccl_ids,
-                    size_t trainers_num, size_t trainer_id) {
-    if (nccl_ids.size() == 0) {
-      auto ptr = new platform::NCCLContextMap(places);
-      VLOG(1) << "init local trainer";
-      flat_ctxs_.emplace_back(ptr);
-      return;
-    }
-
-    for (size_t i = 0; i < nccl_ids.size(); i++) {
-      auto ptr = new platform::NCCLContextMap(places, nccl_ids[i], trainers_num,
-                                              trainer_id);
-      VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i;
-      flat_ctxs_.emplace_back(ptr);
-    }
-  }
-
-  void InitHierarchicalCtxs(const std::vector<platform::Place> &places,
-                            const std::vector<ncclUniqueId *> &inter_nccl_ids,
-                            const std::vector<ncclUniqueId *> &exter_nccl_ids,
-                            size_t trainers_num, size_t trainer_id,
-                            size_t inter_trainers_num,
-                            size_t exter_trainers_num) {
-    PADDLE_ENFORCE_EQ(trainers_num, inter_trainers_num * exter_trainers_num,
-                      "trainers_num:%llu != inter_trainers_num:%llu * "
-                      "exter_trainers_num:%llu",
-                      trainers_num, inter_trainers_num, exter_trainers_num);
-
-    PADDLE_ENFORCE_GT(inter_trainers_num, 1, "inter_trainers_num:%llu must > 1",
-                      inter_trainers_num);
-
-    int inter_trainer_id = trainer_id % inter_trainers_num;
-    for (size_t i = 0; i < inter_nccl_ids.size(); i++) {
-      VLOG(1) << "init inter_trainer_id:" << inter_trainer_id
-              << ", comm no:" << i;
-      auto local = new NCCLContextMap(places, inter_nccl_ids[i],
-                                      inter_trainers_num, inter_trainer_id);
-
-      h_inter_ctxs_.emplace_back(local);
-    }
-
-    int exter_trainer_id = -1;
-    if (trainer_id % inter_trainers_num == 0) {
-      exter_trainer_id = trainer_id / inter_trainers_num;
-    }
-
-    if (exter_trainer_id >= 0) {
-      for (size_t i = 0; i < exter_nccl_ids.size(); i++) {
-        auto ex = new NCCLContextMap(places, exter_nccl_ids[i],
-                                     exter_trainers_num, exter_trainer_id);
-        VLOG(1) << "init exter_trainer_id:" << exter_trainer_id
-                << ", comm no:" << i;
-        h_exter_ctxs_.emplace_back(ex);
-      }
-    }
-  }
-
-  bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; }
-
-  NCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const {
-    PADDLE_ENFORCE(h_inter_ctxs_.size() > 0,
-                   "must init hierarchical ctxs first!");
-    return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get();
-  }
-
-  NCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const {
-    PADDLE_ENFORCE(h_exter_ctxs_.size() > 0,
-                   "must init hierarchical ctxs first!");
-    return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get();
-  }
-
-  std::vector<std::unique_ptr<NCCLContextMap>> *GetHierarchicalInterCtxs() {
-    return &h_inter_ctxs_;
-  }
-
-  std::vector<std::unique_ptr<NCCLContextMap>> *GetHierarchicalExterCtxs() {
-    return &h_exter_ctxs_;
-  }
-
- protected:
-  // Support multi nccl comm on default nccl ring while NCCLContextMap can't.
-  std::vector<std::unique_ptr<NCCLContextMap>> flat_ctxs_;
-
-  // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce.
-  // And h_exter_ctxs_ can support multi comm too.
-  std::vector<std::unique_ptr<NCCLContextMap>> h_inter_ctxs_;
-  std::vector<std::unique_ptr<NCCLContextMap>> h_exter_ctxs_;
-
-  // just used for sync_batch_norm op.
-  std::unique_ptr<NCCLContextMap> sync_batch_norm_ctx_;
-};
-
-}  // namespace platform
-}  // namespace paddle
-#endif
diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h
deleted file mode 100644
index 2bacd5bd4c368f7c97b8c2af2b5ada803d19d0bd..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/ngraph_helper.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_NGRAPH
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "ngraph/ngraph.hpp"
-
-namespace paddle {
-namespace platform {
-
-std::shared_ptr<ngraph::Node> Nhwc2Nchw(std::shared_ptr<ngraph::Node> in) {
-  auto in_shape = in->get_shape();
-  in_shape[0] = in->get_shape()[0];
-  in_shape[1] = in->get_shape()[3];
-  in_shape[2] = in->get_shape()[1];
-  in_shape[3] = in->get_shape()[2];
-  ngraph::AxisVector axis_vec = {0, 3, 1, 2};
-  return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
-}
-
-std::shared_ptr<ngraph::Node> Nchw2Nhwc(std::shared_ptr<ngraph::Node> in) {
-  auto in_shape = in->get_shape();
-  in_shape[0] = in->get_shape()[0];
-  in_shape[1] = in->get_shape()[2];
-  in_shape[2] = in->get_shape()[3];
-  in_shape[3] = in->get_shape()[1];
-  ngraph::AxisVector axis_vec = {0, 2, 3, 1};
-  return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
-}
-
-ngraph::Shape FlattenTo1d(ngraph::Shape sh, int num) {
-  auto x1 = std::accumulate(std::begin(sh), std::end(sh) + num, 1,
-                            std::multiplies<size_t>());
-  size_t x1_l = (size_t)x1;
-  return ngraph::Shape{x1_l};
-}
-
-ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
-  auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
-                            std::multiplies<size_t>());
-  auto x2 = std::accumulate(std::begin(sh) + num, std::end(sh), 1,
-                            std::multiplies<size_t>());
-  size_t x1_l = static_cast<size_t>(x1);
-  size_t x2_l = static_cast<size_t>(x2);
-  return ngraph::Shape{x1_l, x2_l};
-}
-
-std::shared_ptr<ngraph::Node> NgReshaper(std::shared_ptr<ngraph::Node> input,
-                                         ngraph::Shape shape) {
-  std::vector<size_t> input_order(input->get_shape().size());
-  std::iota(std::begin(input_order), std::end(input_order), 0);
-  return std::make_shared<ngraph::op::Reshape>(
-      input, ngraph::AxisVector(input_order), shape);
-}
-
-std::shared_ptr<ngraph::Node> GetNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    const std::string name, const paddle::framework::VariableNameMap& var_map,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto& var_names = var_map.at(name);
-  if (var_names.size() == 0) return nullptr;
-  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
-    return (*ngb_node_map)[var_names[0]];
-  } else {
-    return nullptr;
-  }
-}
-
-std::shared_ptr<ngraph::Node> GetInputNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    const std::string name,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  return GetNode(op, name, op->Inputs(), ngb_node_map);
-}
-
-std::shared_ptr<ngraph::Node> GetOutputNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    const std::string name,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  return GetNode(op, name, op->Outputs(), ngb_node_map);
-}
-
-template <typename T>
-std::shared_ptr<ngraph::Node> CreateConstant(const ngraph::element::Type& type,
-                                             ngraph::Shape shape,
-                                             std::initializer_list<T> values) {
-  std::shared_ptr<ngraph::Node> result;
-  if (values.size() == 1 && shape != ngraph::Shape{} &&  // NOLINT
-      shape != ngraph::Shape{1}) {
-    result = std::make_shared<ngraph::op::Constant>(type, ngraph::Shape{},
-                                                    std::vector<T>{values});
-    ngraph::AxisSet axis_set;
-    for (size_t i = 0; i < shape.size(); ++i) axis_set.insert(i);
-    result = std::make_shared<ngraph::op::Broadcast>(result, shape, axis_set);
-  } else {
-    result = std::make_shared<ngraph::op::Constant>(type, shape,
-                                                    std::vector<T>{values});
-  }
-  return result;
-}
-
-void SetOutputNode(
-    const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    const std::string name, std::shared_ptr<ngraph::Node> node,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto& var_names = op->Outputs().at(name);
-  if (var_names.size() == 1) {
-    (*ngb_node_map)[var_names[0]] = node;
-  } else if (var_names.size() == 0) {
-    (*ngb_node_map)[""] = node;
-  } else {
-    PADDLE_THROW("name %s has more than 1 var_names.", name);
-  }
-}
-
-bool HasOutput(const std::shared_ptr<paddle::framework::OperatorBase>& op,
-               const std::string name) {
-  auto& outputs = op->Outputs();
-  if (outputs.find(name) == outputs.end()) return false;
-  return outputs.at(name).size() > 0;
-}
-
-inline void GetMidDims(const ngraph::Shape& x_shape,
-                       const ngraph::Shape& y_shape, int axis, int* pre, int* n,
-                       int* post) {
-  *pre = 1;
-  *n = 1;
-  *post = 1;
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= x_shape[i];
-  }
-
-  for (size_t i = 0; i < y_shape.size(); ++i) {
-    PADDLE_ENFORCE_EQ(x_shape[i + axis], y_shape[i],
-                      "Broadcast dimension mismatch.");
-    (*n) *= y_shape[i];
-  }
-
-  for (size_t i = axis + y_shape.size(); i < x_shape.size(); ++i) {
-    (*post) *= x_shape[i];
-  }
-}
-
-inline void TrimTrailingSingularDims(ngraph::Shape* shape) {
-  // Remove trailing dimensions of size 1 for y
-  auto actual_shape_size = shape->size();
-  for (; actual_shape_size != 0; --actual_shape_size) {
-    if ((*shape)[actual_shape_size - 1] != 1) {
-      break;
-    } else {
-      shape->pop_back();
-    }
-  }
-}
-
-ngraph::element::Type GetNgType(paddle::framework::proto::VarType::Type dtype) {
-  ngraph::element::Type ng_dtype;
-  if (dtype == paddle::framework::proto::VarType::FP32) {
-    ng_dtype = ngraph::element::f32;
-  } else if (dtype == paddle::framework::proto::VarType::FP64) {
-    ng_dtype = ngraph::element::f64;
-  } else if (dtype == paddle::framework::proto::VarType::INT64) {
-    ng_dtype = ngraph::element::i64;
-  } else if (dtype == paddle::framework::proto::VarType::INT32) {
-    ng_dtype = ngraph::element::i32;
-  } else {
-    PADDLE_THROW("unsupported data type: %s", dtype);
-  }
-  return ng_dtype;
-}
-}  // namespace platform
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
deleted file mode 100644
index 50a401bb160ce71aca4cbddb321849c328f8f686..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/place.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/place.h"
-
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
-
-namespace paddle {
-namespace platform {
-
-namespace detail {
-
-class PlacePrinter : public boost::static_visitor<> {
- public:
-  explicit PlacePrinter(std::ostream &os) : os_(os) {}
-  void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
-  void operator()(const CUDAPlace &p) {
-    os_ << "CUDAPlace(" << p.device << ")";
-  }
-  void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
-
- private:
-  std::ostream &os_;
-};
-
-}  // namespace detail
-
-bool is_gpu_place(const Place &p) {
-  return boost::apply_visitor(IsCUDAPlace(), p);
-}
-
-bool is_cpu_place(const Place &p) {
-  return boost::apply_visitor(IsCPUPlace(), p);
-}
-
-bool is_cuda_pinned_place(const Place &p) {
-  return boost::apply_visitor(IsCUDAPinnedPlace(), p);
-}
-
-bool places_are_same_class(const Place &p1, const Place &p2) {
-  return p1.which() == p2.which();
-}
-
-bool is_same_place(const Place &p1, const Place &p2) {
-  if (places_are_same_class(p1, p2)) {
-    if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
-      return true;
-    } else {
-      return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2);
-    }
-  } else {
-    return false;
-  }
-}
-
-std::ostream &operator<<(std::ostream &os, const Place &p) {
-  detail::PlacePrinter printer(os);
-  boost::apply_visitor(printer, p);
-  return os;
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
deleted file mode 100644
index daa70e943ded1909785458833349e4f8091847d1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/place.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <functional>
-#include <iostream>
-#include <vector>
-
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace platform {
-
-struct CPUPlace {
-  // WORKAROUND: for some reason, omitting this constructor
-  // causes errors with boost 1.59 and OSX
-  CPUPlace() {}
-
-  // needed for variant equality comparison
-  inline bool operator==(const CPUPlace &) const { return true; }
-  inline bool operator!=(const CPUPlace &) const { return false; }
-  inline bool operator<(const CPUPlace &) const { return false; }
-};
-
-struct CUDAPlace {
-  CUDAPlace() : CUDAPlace(0) {}
-  explicit CUDAPlace(int d) : device(d) {}
-
-  inline int GetDeviceId() const { return device; }
-  // needed for variant equality comparison
-  inline bool operator==(const CUDAPlace &o) const {
-    return device == o.device;
-  }
-  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
-  inline bool operator<(const CUDAPlace &o) const { return device < o.device; }
-
-  int device;
-};
-
-struct CUDAPinnedPlace {
-  CUDAPinnedPlace() {}
-
-  // needed for variant equality comparison
-  inline bool operator==(const CUDAPinnedPlace &) const { return true; }
-  inline bool operator!=(const CUDAPinnedPlace &) const { return false; }
-  inline bool operator<(const CUDAPinnedPlace &) const { return false; }
-};
-
-struct IsCUDAPlace : public boost::static_visitor<bool> {
-  bool operator()(const CPUPlace &) const { return false; }
-  bool operator()(const CUDAPlace &gpu) const { return true; }
-  bool operator()(const CUDAPinnedPlace &) const { return false; }
-};
-
-struct IsCPUPlace : public boost::static_visitor<bool> {
-  bool operator()(const CPUPlace &cpu) const { return true; }
-  bool operator()(const CUDAPlace &) const { return false; }
-  bool operator()(const CUDAPinnedPlace &) const { return false; }
-};
-
-struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
-  bool operator()(const CPUPlace &) const { return false; }
-  bool operator()(const CUDAPlace &) const { return false; }
-  bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
-};
-
-typedef boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> Place;
-
-using PlaceList = std::vector<Place>;
-
-bool is_gpu_place(const Place &);
-bool is_cpu_place(const Place &);
-bool is_cuda_pinned_place(const Place &);
-bool places_are_same_class(const Place &, const Place &);
-bool is_same_place(const Place &, const Place &);
-
-std::ostream &operator<<(std::ostream &, const Place &);
-
-template <typename Visitor>
-struct PlaceVisitorWrapper
-    : public boost::static_visitor<typename Visitor::result_type> {
-  const Visitor &visitor_;
-  explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {}
-
-  typename Visitor::result_type operator()(const CPUPlace &cpu) const {
-    return visitor_(cpu);
-  }
-
-  typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
-#ifdef PADDLE_WITH_CUDA
-    return visitor_(cuda);
-#else
-    PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device");
-    return typename Visitor::result_type();
-#endif
-  }
-
-  typename Visitor::result_type operator()(
-      const CUDAPinnedPlace &cuda_pinned) const {
-#ifdef PADDLE_WITH_CUDA
-    return visitor_(cuda_pinned);
-#else
-    PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda_pinned");
-    return typename Visitor::result_type();
-#endif
-  }
-};
-
-template <typename Visitor>
-typename Visitor::result_type VisitPlace(const Place &place,
-                                         const Visitor &visitor) {
-  return boost::apply_visitor(PlaceVisitorWrapper<Visitor>(visitor), place);
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc
deleted file mode 100644
index e4c1d3def90f191194b46bb9ea27dd27d69dcb8b..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/place_test.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/platform/place.h"
-#include <sstream>
-#include "gtest/gtest.h"
-
-TEST(Place, Equality) {
-  paddle::platform::CPUPlace cpu;
-  paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
-
-  EXPECT_EQ(cpu, cpu);
-  EXPECT_EQ(g0, g0);
-  EXPECT_EQ(g1, g1);
-  EXPECT_EQ(g0, gg0);
-
-  EXPECT_NE(g0, g1);
-
-  EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
-  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
-}
-
-TEST(Place, Print) {
-  {
-    std::stringstream ss;
-    ss << paddle::platform::CUDAPlace(1);
-    EXPECT_EQ("CUDAPlace(1)", ss.str());
-  }
-  {
-    std::stringstream ss;
-    ss << paddle::platform::CPUPlace();
-    EXPECT_EQ("CPUPlace", ss.str());
-  }
-}
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
deleted file mode 100644
index c1b81159aca979efe4b46777a1cef49e44b95e27..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/port.h
+++ /dev/null
@@ -1,175 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdio>
-#include <stdexcept>
-
-#include <time.h>
-#include <memory>
-#include <string>
-
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#include "glog/logging.h"
-
-#if !defined(_WIN32)
-#include <dlfcn.h>     //  dladdr
-#include <execinfo.h>  // backtrace
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <algorithm>  // std::accumulate
-#else
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
-// solve static linking error in windows
-// https://github.com/google/glog/issues/301
-#define GOOGLE_GLOG_DLL_DECL
-#include <io.h>  // _popen, _pclose
-#include <stdio.h>
-#include <windows.h>
-#include <numeric>  // std::accumulate in msvc
-#ifndef S_ISDIR     // windows port for sys/stat.h
-#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
-#endif  // S_ISDIR
-
-static void *dlsym(void *handle, const char *symbol_name) {
-  FARPROC found_symbol;
-  found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
-
-  if (found_symbol == NULL) {
-    throw std::runtime_error(std::string(symbol_name) + " not found.");
-  }
-  return reinterpret_cast<void *>(found_symbol);
-}
-
-static void *dlopen(const char *filename, int flag) {
-  std::string file_name(filename);
-  HMODULE hModule = LoadLibrary(file_name.c_str());
-  if (!hModule) {
-    throw std::runtime_error(file_name + " not found.");
-  }
-  return reinterpret_cast<void *>(hModule);
-}
-
-static int gettimeofday(struct timeval *tp, void *tzp) {
-  time_t clock;
-  struct tm tm;
-  SYSTEMTIME wtm;
-
-  GetLocalTime(&wtm);
-  tm.tm_year = wtm.wYear - 1900;
-  tm.tm_mon = wtm.wMonth - 1;
-  tm.tm_mday = wtm.wDay;
-  tm.tm_hour = wtm.wHour;
-  tm.tm_min = wtm.wMinute;
-  tm.tm_sec = wtm.wSecond;
-  tm.tm_isdst = -1;
-  clock = mktime(&tm);
-  tp->tv_sec = clock;
-  tp->tv_usec = wtm.wMilliseconds * 1000;
-
-  return (0);
-}
-#endif  // !_WIN32
-
-static void ExecShellCommand(const std::string &cmd, std::string *message) {
-  char buffer[128];
-#if !defined(_WIN32)
-  std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
-#else
-  std::shared_ptr<FILE> pipe(_popen(cmd.c_str(), "r"), _pclose);
-#endif  // _WIN32
-  if (!pipe) {
-    LOG(ERROR) << "error running command: " << cmd;
-    return;
-  }
-  while (!feof(pipe.get())) {
-    if (fgets(buffer, 128, pipe.get()) != nullptr) {
-      *message += buffer;
-    }
-  }
-}
-
-static bool PathExists(const std::string &path) {
-#if !defined(_WIN32)
-  struct stat statbuf;
-  if (stat(path.c_str(), &statbuf) != -1) {
-    if (S_ISDIR(statbuf.st_mode)) {
-      return true;
-    }
-  }
-#else
-  struct _stat statbuf;
-  if (_stat(path.c_str(), &statbuf) != -1) {
-    if (S_ISDIR(statbuf.st_mode)) {
-      return true;
-    }
-  }
-#endif  // !_WIN32
-  return false;
-}
-
-// TODO(yuyang18): If the functions below are needed by other files, move them
-// to paddle::filesystem namespace.
-#if !defined(_WIN32)
-constexpr char kSEP = '/';
-#else
-constexpr char kSEP = '\\';
-#endif  // _WIN32
-
-static bool FileExists(const std::string &filepath) {
-#if !defined(_WIN32)
-  struct stat buffer;
-  return (stat(filepath.c_str(), &buffer) == 0);
-#else
-  struct _stat buffer;
-  return (_stat(filepath.c_str(), &buffer) == 0);
-#endif  // !_WIN32
-}
-
-static std::string DirName(const std::string &filepath) {
-  auto pos = filepath.rfind(kSEP);
-  if (pos == std::string::npos) {
-    return "";
-  }
-  return filepath.substr(0, pos);
-}
-
-static void MkDir(const char *path) {
-  std::string path_error(path);
-  path_error += " mkdir failed!";
-#if !defined(_WIN32)
-  if (mkdir(path, 0755)) {
-    if (errno != EEXIST) {
-      throw std::runtime_error(path_error);
-    }
-  }
-#else
-  BOOL return_value = CreateDirectory(path, NULL);
-  if (!return_value) {
-    auto errorno = GetLastError();
-    if (errorno != ERROR_ALREADY_EXISTS) {
-      throw std::runtime_error(path_error);
-    }
-  }
-#endif  // !_WIN32
-}
-
-static void MkDirRecursively(const char *fullpath) {
-  if (*fullpath == '\0') return;  // empty string
-  if (FileExists(fullpath)) return;
-
-  MkDirRecursively(DirName(fullpath).c_str());
-  MkDir(fullpath);
-}
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
deleted file mode 100644
index 6d055a442106d88af39c771f3ddf156ba616c99f..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/profiler.cc
+++ /dev/null
@@ -1,655 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/profiler.h"
-#include <algorithm>
-#include <iomanip>
-#include <limits>
-#include <map>
-#include <mutex>  // NOLINT
-#include <random>
-#include <string>
-#include <vector>
-
-#ifdef PADDLE_WITH_CUDA
-#include <cuda.h>
-#endif  // PADDLE_WITH_CUDA
-
-#include "glog/logging.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/platform/device_tracer.h"
-#include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/string/printf.h"
-
-DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
-
-namespace paddle {
-namespace platform {
-
-static int64_t profiler_lister_id = 0;
-static bool should_send_profile_state = false;
-std::mutex profiler_mu;
-
-// The profiler state, the initial value is ProfilerState::kDisabled
-static ProfilerState g_state = ProfilerState::kDisabled;
-// The thread local event list only can be accessed by the specific thread
-// The thread index of each thread
-static thread_local int32_t g_thread_id;
-// The g_next_thread_id is a global counter for threads, by the g_thread_id and
-// g_next_thread_id, we can know how many threads have created EventList.
-static uint32_t g_next_thread_id = 0;
-// The global mutex
-static std::mutex g_all_event_lists_mutex;
-// The total event lists of all threads
-static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
-// The thread local event list only can be accessed by the specific thread
-static thread_local std::shared_ptr<EventList<Event>> g_event_list;
-
-static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
-static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
-static std::mutex g_all_mem_event_lists_mutex;
-static thread_local int32_t g_mem_thread_id;
-static uint32_t g_mem_next_thread_id = 0;
-
-inline uint64_t GetTimeInNsec() {
-  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
-                                 std::chrono::high_resolution_clock,
-                                 std::chrono::steady_clock>::type;
-  return std::chrono::duration_cast<std::chrono::nanoseconds>(
-             clock::now().time_since_epoch())
-      .count();
-}
-
-Event::Event(EventType type, std::string name, uint32_t thread_id)
-    : type_(type), name_(name), thread_id_(thread_id) {
-  cpu_ns_ = GetTimeInNsec();
-}
-
-const EventType &Event::type() const { return type_; }
-
-double Event::CpuElapsedMs(const Event &e) const {
-  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
-}
-
-double Event::CudaElapsedMs(const Event &e) const {
-#ifdef PADDLE_WITH_CUPTI
-  return gpu_ns_ / 1000000.0;
-#else
-  LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
-  return 0;
-#endif
-}
-
-inline EventList<MemEvent> &GetMemEventList() {
-  if (!g_mem_event_list) {
-    g_mem_event_list = std::make_shared<EventList<MemEvent>>();
-    std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
-    g_mem_thread_id = g_mem_next_thread_id++;
-    g_all_mem_event_lists.emplace_front(g_mem_event_list);
-  }
-  return *g_mem_event_list;
-}
-
-void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
-                  const Place &place, const std::string &annotation) {
-  GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
-                           place, g_mem_thread_id, annotation);
-}
-
-void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
-                 const Place &place, const std::string &annotation) {
-  GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
-                           g_mem_thread_id, annotation);
-}
-
-inline EventList<Event> &GetEventList() {
-  if (!g_event_list) {
-    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
-    g_event_list = std::make_shared<EventList<Event>>();
-    g_thread_id = g_next_thread_id++;
-    g_all_event_lists.emplace_front(g_event_list);
-    RecoreCurThreadId(g_thread_id);
-  }
-  return *g_event_list;
-}
-
-void Mark(const std::string &name) {
-  GetEventList().Record(EventType::kMark, name, g_thread_id);
-}
-
-Event *PushEvent(const std::string &name) {
-  return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
-}
-
-void PopEvent(const std::string &name) {
-  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
-}
-
-RecordEvent::RecordEvent(const std::string &name)
-    : is_enabled_(false), start_ns_(PosixInNsec()) {
-  if (g_state == ProfilerState::kDisabled) return;
-  // lock is not needed, the code below is thread-safe
-
-  is_enabled_ = true;
-  name_ = name;
-  Event *e = PushEvent(name_);
-  // Maybe need the same push/pop behavior.
-  SetCurAnnotation(e);
-}
-
-RecordEvent::~RecordEvent() {
-  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
-  // lock is not needed, the code below is thread-safe
-  DeviceTracer *tracer = GetDeviceTracer();
-  if (tracer) {
-    tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
-                          BlockDepth(), g_thread_id);
-  }
-  ClearCurAnnotation();
-  PopEvent(name_);
-}
-
-MemEvenRecorder MemEvenRecorder::recorder;
-
-void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
-                                    size_t size) {
-  if (g_state == ProfilerState::kDisabled) return;
-  std::lock_guard<std::mutex> guard(mtx_);
-  auto &events = address_memevent_[place];
-  PADDLE_ENFORCE(events.count(ptr) == 0, "");
-  events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
-                          new MemEvenRecorder::RecordMemEvent(place, size)));
-}
-
-void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
-  if (g_state == ProfilerState::kDisabled) return;
-  std::lock_guard<std::mutex> guard(mtx_);
-  auto &events = address_memevent_[place];
-  auto iter = events.find(ptr);
-  // The ptr maybe not in address_memevent
-  if (iter != events.end()) {
-    events.erase(iter);
-  }
-}
-
-void MemEvenRecorder::Flush() {
-  std::lock_guard<std::mutex> guard(mtx_);
-  address_memevent_.clear();
-}
-
-MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
-                                                size_t bytes)
-    : place_(place),
-      bytes_(bytes),
-      start_ns_(PosixInNsec()),
-      alloc_in_(CurAnnotationName()) {
-  PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
-}
-
-MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
-  DeviceTracer *tracer = GetDeviceTracer();
-  end_ns_ = PosixInNsec();
-
-  auto annotation_free = CurAnnotationName();
-  if (tracer) {
-    tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
-                             annotation_free, g_mem_thread_id);
-  }
-  PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
-}
-
-RecordRPCEvent::RecordRPCEvent(const std::string &name) {
-  if (FLAGS_enable_rpc_profiler) {
-    event_.reset(new platform::RecordEvent(name));
-  }
-}
-
-RecordBlock::RecordBlock(int block_id)
-    : is_enabled_(false), start_ns_(PosixInNsec()) {
-  // lock is not needed, the code below is thread-safe
-  if (g_state == ProfilerState::kDisabled) return;
-  is_enabled_ = true;
-  SetCurBlock(block_id);
-  name_ = string::Sprintf("block_%d", block_id);
-}
-
-RecordBlock::~RecordBlock() {
-  // lock is not needed, the code below is thread-safe
-  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
-  DeviceTracer *tracer = GetDeviceTracer();
-  if (tracer) {
-    // We try to put all blocks at the same nested depth in the
-    // same timeline lane. and distinguish the using thread_id.
-    tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
-                          g_thread_id);
-  }
-  ClearCurBlock();
-}
-
-void SynchronizeAllDevice() {
-#ifdef PADDLE_WITH_CUDA
-  int count = GetCUDADeviceCount();
-  for (int i = 0; i < count; i++) {
-    SetDeviceId(i);
-    PADDLE_ENFORCE(cudaDeviceSynchronize());
-  }
-#endif
-}
-
-void EnableProfiler(ProfilerState state) {
-  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
-                 "Can't enable profiling, since the input state is ",
-                 "ProfilerState::kDisabled");
-  SynchronizeAllDevice();
-  std::lock_guard<std::mutex> l(profiler_mu);
-  if (state == g_state) {
-    return;
-  }
-  g_state = state;
-  should_send_profile_state = true;
-  GetDeviceTracer()->Enable();
-#ifdef PADDLE_WITH_CUDA
-  if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
-      g_state == ProfilerState::kCPU) {
-    // Generate some dummy events first to reduce the startup overhead.
-    DummyKernelAndEvent();
-    GetDeviceTracer()->Reset();
-  }
-#endif
-  // Mark the profiling start.
-  Mark("_start_profiler_");
-}
-
-void ResetProfiler() {
-  SynchronizeAllDevice();
-  GetDeviceTracer()->Reset();
-  MemEvenRecorder::Instance().Flush();
-  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
-  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
-       ++it) {
-    (*it)->Clear();
-  }
-  for (auto it = g_all_mem_event_lists.begin();
-       it != g_all_mem_event_lists.end(); ++it) {
-    (*it)->Clear();
-  }
-}
-
-std::vector<std::vector<Event>> GetAllEvents() {
-  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
-  std::vector<std::vector<Event>> result;
-  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
-       ++it) {
-    result.emplace_back((*it)->Reduce());
-  }
-  return result;
-}
-
-std::vector<std::vector<MemEvent>> GetMemEvents() {
-  std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
-  std::vector<std::vector<MemEvent>> result;
-  for (auto &it : g_all_mem_event_lists) {
-    result.emplace_back((*it).Reduce());
-  }
-  return result;
-}
-
-// The information of each event given in the profiling report
-struct EventItem {
-  std::string name;
-  int calls;
-  double total_time;
-  double max_time;
-  double ave_time;
-  double min_time;
-  double cpu_time;
-  double gpu_time;
-  float ratio;
-};
-
-// Print results
-void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
-                   const std::string &sorted_domain, const size_t name_width,
-                   const size_t data_width, bool merge_thread) {
-  // Output header information
-  std::cout << "\n------------------------->"
-            << "     Profiling Report     "
-            << "<-------------------------\n\n";
-  std::string place;
-  if (g_state == ProfilerState::kCPU) {
-    place = "CPU";
-  } else if (g_state == ProfilerState::kCUDA) {
-    place = "CUDA";
-  } else if (g_state == ProfilerState::kAll) {
-    place = "All";
-  } else {
-    PADDLE_THROW("Invalid profiler state", g_state);
-  }
-
-  if (merge_thread) {
-    std::cout << "Note! This Report merge all thread info into one."
-              << std::endl;
-  }
-  std::cout << "Place: " << place << std::endl;
-  std::cout << "Time unit: ms" << std::endl;
-  std::cout << "Sorted by " << sorted_domain
-            << " in descending order in the same thread\n\n";
-  // Output events table
-  std::cout.setf(std::ios::left);
-  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
-            << "Calls" << std::setw(data_width) << "Total";
-  if (g_state == ProfilerState::kAll) {
-    std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
-              << std::setw(data_width * 2) << "GPU Time (Ratio)";
-  }
-  std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
-            << "Max." << std::setw(data_width) << "Ave."
-            << std::setw(data_width) << "Ratio." << std::endl;
-  for (size_t i = 0; i < events_table.size(); ++i) {
-    for (size_t j = 0; j < events_table[i].size(); ++j) {
-      const EventItem &event_item = events_table[i][j];
-      std::cout << std::setw(name_width) << event_item.name
-                << std::setw(data_width) << event_item.calls
-                << std::setw(data_width) << event_item.total_time;
-      if (g_state == ProfilerState::kAll) {
-        std::cout << std::setw(data_width * 2)
-                  << string::Sprintf(
-                         "%f (%f)", event_item.cpu_time,
-                         (event_item.cpu_time / event_item.total_time))
-                  << std::setw(data_width * 2)
-                  << string::Sprintf(
-                         "%f (%f)", event_item.gpu_time,
-                         (event_item.gpu_time / event_item.total_time));
-      }
-      std::cout << std::setw(data_width) << event_item.min_time
-                << std::setw(data_width) << event_item.max_time
-                << std::setw(data_width) << event_item.ave_time
-                << std::setw(data_width) << event_item.ratio << std::endl;
-    }
-  }
-  std::cout << std::endl;
-}
-
-// Parse the event list and output the profiling report
-void ParseEvents(const std::vector<std::vector<Event>> &events,
-                 bool merge_thread,
-                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
-  if (g_state == ProfilerState::kDisabled) return;
-  if (merge_thread && events.size() < 2) return;
-
-  std::string sorted_domain;
-  std::function<bool(const EventItem &, const EventItem &)> sorted_func;
-  switch (sorted_by) {
-    case EventSortingKey::kCalls:
-      sorted_domain = "number of calls";
-      sorted_func = [](const EventItem &a, const EventItem &b) {
-        return a.calls > b.calls;
-      };
-      break;
-    case EventSortingKey::kTotal:
-      sorted_domain = "total time";
-      sorted_func = [](const EventItem &a, const EventItem &b) {
-        return a.total_time > b.total_time;
-      };
-      break;
-    case EventSortingKey::kMin:
-      sorted_domain = "minimum time";
-      sorted_func = [](const EventItem &a, const EventItem &b) {
-        return a.min_time > b.min_time;
-      };
-      break;
-    case EventSortingKey::kMax:
-      sorted_domain = "maximum time";
-      sorted_func = [](const EventItem &a, const EventItem &b) {
-        return a.max_time > b.max_time;
-      };
-      break;
-    case EventSortingKey::kAve:
-      sorted_domain = "average time";
-      sorted_func = [](const EventItem &a, const EventItem &b) {
-        return a.ave_time > b.ave_time;
-      };
-      break;
-    case EventSortingKey::kGPUTime:
-      sorted_domain = "average time";
-      sorted_func = [](const EventItem &a, const EventItem &b) {
-        return a.gpu_time > b.gpu_time;
-      };
-      break;
-    case EventSortingKey::kCPUTime:
-      sorted_domain = "average time";
-      sorted_func = [](const EventItem &a, const EventItem &b) {
-        return a.cpu_time > b.cpu_time;
-      };
-      break;
-    default:
-      sorted_domain = "event first end time";
-  }
-
-  const std::vector<std::vector<Event>> *analyze_events;
-  std::vector<std::vector<Event>> merged_events_list;
-  if (merge_thread) {
-    std::vector<Event> merged_events;
-    for (size_t i = 0; i < events.size(); ++i) {
-      for (size_t j = 0; j < events[i].size(); ++j) {
-        merged_events.push_back(events[i][j]);
-      }
-    }
-    merged_events_list.push_back(merged_events);
-    analyze_events = &merged_events_list;
-  } else {
-    analyze_events = &events;
-  }
-
-  std::vector<std::vector<EventItem>> events_table;
-  size_t max_name_width = 0;
-  for (size_t i = 0; i < (*analyze_events).size(); i++) {
-    double total = 0.;  // the total time in one thread
-    std::list<Event> pushed_events;
-    std::vector<EventItem> event_items;
-    std::unordered_map<std::string, int> event_idx;
-
-    for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
-      if ((*analyze_events)[i][j].type() == EventType::kPushRange) {
-        pushed_events.push_back((*analyze_events)[i][j]);
-      } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) {
-        std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
-        while (rit != pushed_events.rend() &&
-               rit->name() != (*analyze_events)[i][j].name()) {
-          ++rit;
-        }
-
-        if (rit != pushed_events.rend()) {
-          double event_time = 0;
-          double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
-          double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
-          if (g_state == ProfilerState::kCUDA) {
-            event_time = gpu_time;
-          } else if (g_state == ProfilerState::kCPU) {
-            event_time = cpu_time;
-          } else {
-            event_time = gpu_time + cpu_time;
-          }
-
-          total += event_time;
-
-          std::string event_name;
-          if (merge_thread) {
-            event_name = rit->name();
-            max_name_width = std::max(max_name_width, event_name.size());
-          } else {
-            event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
-                         rit->name();
-            max_name_width = std::max(max_name_width, event_name.size());
-          }
-
-          if (event_idx.find(event_name) == event_idx.end()) {
-            event_idx[event_name] = event_items.size();
-            EventItem event_item = {event_name, 1,          event_time,
-                                    event_time, event_time, event_time,
-                                    gpu_time,   cpu_time,   0.};
-            event_items.push_back(event_item);
-          } else {
-            int index = event_idx[event_name];
-            event_items[index].calls += 1;
-            // total time
-            event_items[index].total_time += event_time;
-            // min time
-            event_items[index].min_time =
-                std::min(event_time, event_items[index].min_time);
-            // max time
-            event_items[index].max_time =
-                std::max(event_time, event_items[index].max_time);
-            event_items[index].gpu_time += gpu_time;
-            event_items[index].cpu_time += cpu_time;
-          }
-
-          // remove the push marker from the list
-          pushed_events.erase((++rit).base());
-        } else {
-          LOG(WARNING) << "Cannot find the push marker of event \'"
-                       << (*analyze_events)[i][j].name()
-                       << "\', which will be ignored in profiling report.";
-        }
-      }
-    }
-    // average time
-    for (auto &item : event_items) {
-      item.ave_time = item.total_time / item.calls;
-      item.ratio = item.total_time / total;
-    }
-    // sort
-    if (sorted_by != EventSortingKey::kDefault) {
-      std::sort(event_items.begin(), event_items.end(), sorted_func);
-    }
-
-    events_table.push_back(event_items);
-    // log warning if there are events with `push` but without `pop`
-    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
-    while (rit != pushed_events.rend()) {
-      LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
-                   << "\', which will be ignored in profiling report.";
-      ++rit;
-    }
-  }
-
-  // Print report
-  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12,
-                merge_thread);
-}
-
-struct MemoryProfierReport {
-  size_t alloc_times{0};
-  size_t alloc_size{0};
-  size_t free_times{0};
-  size_t free_size{0};
-};
-
-// Print results
-void PrintMemProfiler(
-    const std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
-        &annotation_report,
-    const size_t name_width, const size_t data_width) {
-  // Output header information
-  std::cout << "\n------------------------->"
-            << "    Memory Profiling Report     "
-            << "<-------------------------\n\n";
-
-  // Output events table
-  std::cout.setf(std::ios::left);
-  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
-            << "Alloc Calls" << std::setw(data_width) << "Size(MB)"
-            << std::setw(data_width) << "Free Calls" << std::setw(data_width)
-            << "Size(MB)" << std::endl;
-
-  for (auto &tmp : annotation_report) {
-    for (auto &e : tmp.second) {
-      auto event_name = string::Sprintf("%s:%s", tmp.first, e.first);
-      std::cout << std::setw(name_width) << event_name;
-      std::cout << std::setw(data_width) << e.second.alloc_times;
-      std::cout << std::setw(data_width)
-                << e.second.alloc_size / (1024.0 * 1024.0);
-      std::cout << std::setw(data_width) << e.second.free_times;
-      std::cout << std::setw(data_width)
-                << e.second.free_size / (1024.0 * 1024.0) << std::endl;
-    }
-  }
-  std::cout << std::endl;
-}
-
-// parse memory events
-void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
-  if (g_state == ProfilerState::kDisabled) return;
-  // place, annotation, alloc times,  alloc size
-  std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
-      annotation_report;
-
-  for (auto &tmp : events) {
-    for (auto &e : tmp) {
-      if (e.type() == EventType::kPushRange) {
-        annotation_report[e.place()][e.annotation()].alloc_times += 1;
-        annotation_report[e.place()][e.annotation()].alloc_size += e.bytes();
-      } else if (e.type() == EventType::kPopRange) {
-        annotation_report[e.place()][e.annotation()].free_times += 1;
-        annotation_report[e.place()][e.annotation()].free_size += e.bytes();
-      }
-    }
-  }
-  PrintMemProfiler(annotation_report, 55, 18);
-}
-
-void DisableProfiler(EventSortingKey sorted_key,
-                     const std::string &profile_path) {
-  SynchronizeAllDevice();
-  MemEvenRecorder::Instance().Flush();
-
-  std::lock_guard<std::mutex> l(profiler_mu);
-  if (g_state == ProfilerState::kDisabled) return;
-  // Mark the profiling stop.
-  Mark("_stop_profiler_");
-
-  DeviceTracer *tracer = GetDeviceTracer();
-  if (tracer->IsEnabled()) {
-    tracer->Disable();
-    tracer->GenProfile(profile_path);
-    tracer->GenEventKernelCudaElapsedTime();
-  }
-
-  std::vector<std::vector<Event>> all_events = GetAllEvents();
-  ParseEvents(all_events, true, sorted_key);
-  ParseEvents(all_events, false, sorted_key);
-  if (VLOG_IS_ON(5)) {
-    std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
-    ParseMemEvents(all_mem_events);
-  }
-
-  ResetProfiler();
-  g_state = ProfilerState::kDisabled;
-  should_send_profile_state = true;
-}
-
-bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
-bool ShouldSendProfileState() { return should_send_profile_state; }
-
-void SetProfileListener() {
-  std::mt19937 rng;
-  rng.seed(std::random_device()());
-  std::uniform_int_distribution<std::mt19937::result_type> dist6(
-      1, std::numeric_limits<int>::max());
-  profiler_lister_id = dist6(rng);
-}
-int64_t ListenerId() { return profiler_lister_id; }
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu
deleted file mode 100644
index d4db65060bb2e5fff1b131dcdcd9f38a39bdb444..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/profiler.cu
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cuda.h>
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace platform {
-
-__global__ void DummyKernel(int *a) { a[0] = 0; }
-
-static void ForEachDevice(std::function<void(int)> func) {
-  auto original_device = platform::GetCurrentDeviceId();
-  int count = platform::GetCUDADeviceCount();
-  for (int i = 0; i < count; i++) {
-    platform::SetDeviceId(i);
-    func(i);
-  }
-  platform::SetDeviceId(original_device);
-}
-
-void DummyKernelAndEvent() {
-  for (int i = 0; i < 5; i++) {
-    ForEachDevice([](int d) {
-      platform::SetDeviceId(d);
-      cudaStream_t stream;
-      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
-      Mark("_cuda_startup_");
-      int *ptr;
-      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&ptr, sizeof(int)));
-      DummyKernel<<<1, 1, 0, stream>>>(ptr);
-      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
-      PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr));
-    });
-  }
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
deleted file mode 100644
index 8d11855b70de824159f19f2997b876564e7719b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/profiler.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <forward_list>
-#include <list>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/event.h"
-#include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/gpu_info.h"
-#endif
-namespace paddle {
-namespace platform {
-
-enum ProfilerState {
-  kDisabled,  // disabled state
-  kCPU,       // CPU profiling state
-  kCUDA,      // GPU profiling state
-  kAll,       // Profile both CPU and GPU. (Currently experimental).
-};
-
-void Mark(const std::string& name);
-
-void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
-                  const Place& place);
-void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
-                 const Place& place);
-
-struct MemEvenRecorder {
- public:
-  void PushMemRecord(const void* ptr, const Place& place, size_t size);
-  void PopMemRecord(const void* ptr, const Place& place);
-  void Flush();
-  static MemEvenRecorder& Instance() { return recorder; }
-
- private:
-  struct RecordMemEvent {
-    RecordMemEvent(const Place& place, size_t bytes);
-    ~RecordMemEvent();
-
-    Place place_;
-    size_t bytes_;
-    uint64_t start_ns_;
-    uint64_t end_ns_;
-    std::string alloc_in_;
-    std::string free_in_;
-  };
-
-  static MemEvenRecorder recorder;
-  std::map<Place,
-           std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
-      address_memevent_;
-  std::mutex mtx_;
-  MemEvenRecorder() {}
-  DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
-};
-
-Event* PushEvent(const std::string& name);
-void PopEvent(const std::string& name);
-
-struct RecordEvent {
-  explicit RecordEvent(const std::string& name);
-
-  ~RecordEvent();
-
-  bool is_enabled_;
-  uint64_t start_ns_;
-  // Event name
-  std::string name_;
-  // Need to distinguish name by op type, block_id, program_id and perhaps
-  // different kernel invocations within an op.
-  std::string full_name_;
-};
-
-class RecordRPCEvent {
- public:
-  explicit RecordRPCEvent(const std::string& name);
-  ~RecordRPCEvent() {}
-
- private:
-  std::unique_ptr<RecordEvent> event_;
-};
-
-struct RecordBlock {
-  explicit RecordBlock(int block_id);
-  ~RecordBlock();
-
- private:
-  bool is_enabled_;
-  std::string name_;
-  uint64_t start_ns_;
-};
-
-// Return the event list of all threads. Assumed the returned value calls
-// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
-std::vector<std::vector<Event>> GetAllEvents();
-
-// Candidate keys to sort the profiling report
-enum EventSortingKey {
-  kDefault,
-  kCalls,
-  kTotal,
-  kMin,
-  kMax,
-  kAve,
-  kCPUTime,
-  kGPUTime
-};
-
-template <typename T>
-struct EventList {
-  constexpr static size_t kMB = 1024 * 1024;
-  constexpr static size_t kEventBlockSize = 16 * kMB;
-  constexpr static size_t kEventSize = sizeof(T);
-  constexpr static size_t kEventAlign = alignof(T);
-  constexpr static size_t kNumBlock =
-      kEventBlockSize /
-      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
-
-  template <typename... Args>
-  T* Record(Args&&... args) {
-    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
-      event_blocks.emplace_front();
-      event_blocks.front().reserve(kNumBlock);
-    }
-    event_blocks.front().emplace_back(std::forward<Args>(args)...);
-    return &event_blocks.front().back();
-  }
-
-  std::vector<T> Reduce() {
-    std::vector<T> result;
-    for (auto& block : event_blocks) {
-      result.insert(result.begin(), std::make_move_iterator(block.begin()),
-                    std::make_move_iterator(block.end()));
-    }
-    event_blocks.clear();
-    return result;
-  }
-
-  void Clear() { event_blocks.clear(); }
-
-  std::forward_list<std::vector<T>> event_blocks;
-};
-
-// Enable the profiling function.
-void EnableProfiler(ProfilerState state);
-
-// Clear the g_all_event_lists, which is total event lists of all threads.
-void ResetProfiler();
-
-void DisableProfiler(EventSortingKey sorted_key,
-                     const std::string& profile_path);
-
-const int kEnableProfiler = 1;
-const int kDisableProfiler = 2;
-// Test if the profiler is currently enabled.
-bool IsProfileEnabled();
-// Whether the trainer should send profiling state to PS.
-bool ShouldSendProfileState();
-// Mark current process as PS by assigning a lister id.
-void SetProfileListener();
-int64_t ListenerId();
-
-#ifdef PADDLE_WITH_CUDA
-void DummyKernelAndEvent();
-#endif
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto
deleted file mode 100644
index cfa3c6906f83f750c8d6dc654f29b8fe95ec17ac..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/profiler.proto
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto2";
-package paddle.platform.proto;
-
-message MemCopy { optional uint64 bytes = 1; }
-
-message Event {
-  enum EventType {
-    CPU = 0;
-    GPUKernel = 1;
-  }
-  optional EventType type = 8;
-  optional string name = 1;
-  optional uint64 start_ns = 2;
-  optional uint64 end_ns = 3;
-  // When positive, it represents gpu id. When -1, it represents CPU.
-  optional int64 device_id = 5;
-  optional int64 sub_device_id = 6;
-
-  optional MemCopy memcopy = 7;
-  optional string detail_info = 9;
-}
-
-message MemEvent {
-  enum Place {
-    CUDAPlace = 0;
-    CPUPlace = 1;
-    CUDAPinnedPlace = 2;
-  }
-  optional uint64 start_ns = 1;
-  optional uint64 end_ns = 2;
-  optional uint64 bytes = 3;
-  optional Place place = 4;
-  optional uint64 thread_id = 5;
-  optional uint32 device_id = 6;
-  optional string alloc_in = 7;
-  optional string free_in = 8;
-}
-
-message Profile {
-  repeated Event events = 1;
-  optional uint64 start_ns = 2;
-  optional uint64 end_ns = 3;
-  repeated MemEvent mem_events = 4;
-}
\ No newline at end of file
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
deleted file mode 100644
index a851488e72d27dfcbd04546d9b531d26257f611c..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/profiler_test.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/profiler.h"
-#include <string>
-#ifdef PADDLE_WITH_CUDA
-#include <cuda_runtime.h>
-#endif
-#include "gtest/gtest.h"
-
-TEST(Event, CpuElapsedTime) {
-  using paddle::platform::Event;
-  using paddle::platform::EventType;
-
-  Event start_event(EventType::kPushRange, "test", 0);
-  int counter = 0;
-  while (counter != 1000) {
-    counter++;
-  }
-  Event stop_event(EventType::kPopRange, "test", 0);
-  EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
-}
-
-TEST(RecordEvent, RecordEvent) {
-  using paddle::platform::Event;
-  using paddle::platform::EventType;
-  using paddle::platform::RecordEvent;
-  using paddle::platform::PushEvent;
-  using paddle::platform::PopEvent;
-  using paddle::platform::ProfilerState;
-  using paddle::platform::EventSortingKey;
-
-  ProfilerState state = ProfilerState::kCPU;
-  EnableProfiler(state);
-
-  /* Usage 1:
-  *  PushEvent(evt_name);
-  *  ...
-  *  code to be analyzed
-  *  ...
-  * PopEvent(evt_name);
-  */
-  LOG(INFO) << "Usage 1: PushEvent & PopEvent";
-  for (int loop = 0; loop < 3; ++loop) {
-    for (int i = 1; i < 5; ++i) {
-      std::string name = "op_" + std::to_string(i);
-      PushEvent(name);
-      int counter = 1;
-      while (counter != i * 1000) counter++;
-      PopEvent(name);
-    }
-  }
-
-  /* Usage 2:
-   * {
-   *   RecordEvent record_event(name);
-   *   ...
-   *   code to be analyzed
-   *   ...
-   * }
-   */
-  LOG(INFO) << "Usage 2: RecordEvent";
-  for (int i = 1; i < 5; ++i) {
-    std::string name = "evs_op_" + std::to_string(i);
-    RecordEvent record_event(name);
-    int counter = 1;
-    while (counter != i * 1000) counter++;
-  }
-
-  /* Usage 3
-   * {
-   *   RecordEvent record_event(name1, dev_ctx);
-   *   ...
-   *   code to be analyzed
-   *   ...
-   *   {
-   *     RecordEvent nested_record_event(name2, dev_ctx);
-   *     ...
-   *     code to be analyzed
-   *     ...
-   *   }
-   * }
-   */
-  LOG(INFO) << "Usage 3: nested RecordEvent";
-  for (int i = 1; i < 5; ++i) {
-    std::string name = "ano_evs_op_" + std::to_string(i);
-    RecordEvent record_event(name);
-    int counter = 1;
-    while (counter != i * 100) counter++;
-    {
-      std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
-      RecordEvent nested_record_event(nested_name);
-      int nested_counter = 1;
-      while (nested_counter != i * 100) nested_counter++;
-    }
-  }
-
-  // Bad Usage:
-  PushEvent("event_without_pop");
-  PopEvent("event_without_push");
-  std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
-
-  int cuda_startup_count = 0;
-  int start_profiler_count = 0;
-  for (size_t i = 0; i < events.size(); ++i) {
-    for (size_t j = 0; j < events[i].size(); ++j) {
-      if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
-      if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
-      if (events[i][j].name() == "push") {
-        EXPECT_EQ(events[i][j + 1].name(), "pop");
-#ifdef PADDLE_WITH_CUDA
-        EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
-#else
-        EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
-#endif
-      }
-    }
-  }
-  EXPECT_EQ(cuda_startup_count % 5, 0);
-  EXPECT_EQ(start_profiler_count, 1);
-
-  // Will remove parsing-related code from test later
-  DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(TMP, stream_wait) {
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
-  cudaStreamSynchronize(stream);
-  cudaStreamSynchronize(stream);
-  cudaStreamSynchronize(stream);
-}
-#endif
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
deleted file mode 100644
index 5a9e24374f6f777c2286b8928eae9dcbe8be6378..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/platform/stream_callback_manager.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {
-
-#if CUDA_VERSION >= 10000
-static void CUDART_CB StreamCallbackFunc(void *user_data)
-#else
-static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
-                                         cudaError_t status, void *user_data)
-#endif
-{
-  std::unique_ptr<std::function<void()>> func(
-      reinterpret_cast<std::function<void()> *>(user_data));
-  (*func)();
-}
-
-StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream)
-    : stream_(stream), thread_pool_(1) {}
-
-void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
-  auto *callback_func = new std::function<void()>(std::move(callback));
-  auto *func = new std::function<void()>([this, callback_func] {
-    std::lock_guard<std::mutex> lock(mtx_);
-    last_future_ = thread_pool_.enqueue([callback_func] {
-      std::unique_ptr<std::function<void()>> releaser(callback_func);
-      (*callback_func)();
-    });
-  });
-#if CUDA_VERSION >= 10000
-  PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
-#else
-  PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
-#endif
-}
-
-void StreamCallbackManager::Wait() const {
-  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-  {
-    std::lock_guard<std::mutex> lock(mtx_);
-    if (last_future_.valid()) {
-      last_future_.wait();
-    }
-  }
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
deleted file mode 100644
index 8668bcb1131719e882ecbccb08ad00b63409eb28..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <ThreadPool.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <functional>
-#include <future>  // NOLINT
-#include <memory>
-#include <mutex>  // NOLINT
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {
-
-// NOTE(zjl): clean StreamCallbackManager to make compilation faster
-// Make StreamCallbackManager thread-safe
-class StreamCallbackManager {
- public:
-  explicit StreamCallbackManager(const cudaStream_t stream);
-
-  ~StreamCallbackManager() = default;
-
-  void AddCallback(std::function<void()> callback) const;
-
-  void Wait() const;
-
- private:
-  const cudaStream_t stream_;
-  mutable ::ThreadPool thread_pool_;
-  mutable std::mutex mtx_;
-  mutable std::future<void> last_future_;
-};
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/timer.cc b/paddle/fluid/platform/timer.cc
deleted file mode 100644
index 75d4e5cbf90bd81c73756605eacc6b0c15a63e9d..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/timer.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/timer.h"
-
-namespace paddle {
-namespace platform {
-
-void Timer::Reset() {
-  _start.tv_sec = 0;
-  _start.tv_usec = 0;
-
-  _count = 0;
-  _elapsed = 0;
-  _paused = true;
-}
-
-void Timer::Start() {
-  Reset();
-  Resume();
-}
-
-void Timer::Pause() {
-  if (_paused) {
-    return;
-  }
-  _elapsed += Tickus();
-  ++_count;
-  _paused = true;
-}
-
-void Timer::Resume() {
-  gettimeofday(&_start, NULL);
-  _paused = false;
-}
-
-int Timer::Count() { return _count; }
-
-double Timer::ElapsedUS() { return static_cast<double>(_elapsed); }
-
-double Timer::ElapsedMS() { return _elapsed / 1000.0; }
-
-double Timer::ElapsedSec() { return _elapsed / 1000000.0; }
-
-int64_t Timer::Tickus() {
-  gettimeofday(&_now, NULL);
-  return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L +
-         (_now.tv_usec - _start.tv_usec);
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h
deleted file mode 100644
index ff0e1d95c2946b6db3ac0c05acba64ff5d3c59ef..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/timer.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stdlib.h>
-#include "paddle/fluid/platform/port.h"
-
-#ifdef _WIN32
-static unsigned sleep(unsigned seconds) {
-  Sleep(seconds * 1000);
-  return 0;
-}
-#endif
-
-namespace paddle {
-namespace platform {
-
-// A Standard Timer implementation for debugging
-class Timer {
- public:
-  // a timer class for profiling
-  // Reset() will be called during initialization
-  // all timing variables will be set 0 in Reset()
-  Timer() { Reset(); }
-  void Reset();
-  void Start();
-  void Pause();
-  // Resume will get current system time
-  void Resume();
-  int Count();
-  // return elapsed time in us
-  double ElapsedUS();
-  // return elapsed time in ms
-  double ElapsedMS();
-  // return elapsed time in sec
-  double ElapsedSec();
-
- private:
-  struct timeval _start;
-  struct timeval _now;
-  int _count;
-  int64_t _elapsed;
-  bool _paused;
-
-  // get us difference between start and now
-  int64_t Tickus();
-};
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/timer_test.cc b/paddle/fluid/platform/timer_test.cc
deleted file mode 100644
index 09edf8131ffa5c1dfe607b7d72627b225c4452fa..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/timer_test.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/platform/timer.h"
-#include "gtest/gtest.h"
-
-TEST(Timer, Reset) {
-  paddle::platform::Timer timeline;
-  timeline.Start();
-  sleep(3);
-  timeline.Pause();
-  timeline.Reset();
-}
-
-TEST(Timer, Start) {
-  paddle::platform::Timer timeline;
-  timeline.Start();
-  sleep(3);
-  timeline.Pause();
-}
-
-TEST(Timer, Pause) {
-  paddle::platform::Timer timeline;
-  timeline.Start();
-  sleep(3);
-  timeline.Pause();
-}
-
-TEST(Timer, Resume) {
-  paddle::platform::Timer timeline;
-  timeline.Start();
-  sleep(3);
-  timeline.Pause();
-  timeline.Resume();
-}
diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h
deleted file mode 100644
index 7877d3e41c1c993662f5d91b263cbcb71db74c36..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/transform.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <type_traits>
-
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/hostdevice.h"
-#include "paddle/fluid/platform/place.h"
-
-#ifdef __NVCC__
-#include <thrust/execution_policy.h>
-#include <thrust/transform.h>
-#include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
-#endif
-
-namespace paddle {
-namespace platform {
-
-// Transform applys a unary or a binary functor on each element in a
-// range defined by a pair of iterators.
-//
-// - The specialization for CPU calls std::transform.
-// - The specialization for CUDA calls thrust::tranform.
-//
-// NOTE: We need to define InputIter and OutputIter defined as
-//       different types, because the InputIter points op's inputs and
-//       OutputIter pints to op's outputs.
-//
-// NOTE: We don't assume that InputIter to be const InputType* and
-//       OutputIter to be OutputType*, because we might use a iterator
-//       class, paddle::fluid::operators::RowwiseTRansformIterator.
-template <typename DeviceContext>
-struct Transform {
-  // The unary version.
-  template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const DeviceContext& context, InputIter first, InputIter last,
-                  OutputIter result, UnaryOperation op);
-
-  // The binary version.
-  template <typename InputIter1, typename InputIter2, typename OutputIter,
-            typename BinaryOperation>
-  void operator()(const DeviceContext& context, InputIter1 first1,
-                  InputIter1 last1, InputIter2 first2, OutputIter result,
-                  BinaryOperation op);
-};
-
-template <>
-struct Transform<platform::CPUDeviceContext> {
-  template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const platform::CPUDeviceContext& context, InputIter first,
-                  InputIter last, OutputIter result, UnaryOperation op) {
-    std::transform(first, last, result, op);
-  }
-
-  template <typename InputIter1, typename InputIter2, typename OutputIter,
-            typename BinaryOperation>
-  void operator()(const platform::CPUDeviceContext& context, InputIter1 first1,
-                  InputIter1 last1, InputIter2 first2, OutputIter result,
-                  BinaryOperation op) {
-    std::transform(first1, last1, first2, result, op);
-  }
-};
-
-#ifdef __NVCC__
-template <>
-struct Transform<platform::CUDADeviceContext> {
-  template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const platform::CUDADeviceContext& context, InputIter first,
-                  InputIter last, OutputIter result, UnaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first),
-                      details::CastToCUDATransformIterator(last),
-                      details::CastToCUDATransformIterator(result), op);
-  }
-
-  template <typename InputIter1, typename InputIter2, typename OutputIter,
-            typename BinaryOperation>
-  void operator()(const platform::CUDADeviceContext& context, InputIter1 first1,
-                  InputIter1 last1, InputIter2 first2, OutputIter result,
-                  BinaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first1),
-                      details::CastToCUDATransformIterator(last1),
-                      details::CastToCUDATransformIterator(first2),
-                      details::CastToCUDATransformIterator(result), op);
-  }
-};
-#endif
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu
deleted file mode 100644
index 23f5865971246b2862f859885f5bfccd926b9697..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/transform_test.cu
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/hostdevice.h"
-#include "paddle/fluid/platform/transform.h"
-
-template <typename T>
-class Scale {
- public:
-  explicit Scale(const T& scale) : scale_(scale) {}
-  HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
-
- private:
-  T scale_;
-};
-
-template <typename T>
-class Multiply {
- public:
-  HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
-};
-
-using paddle::memory::Alloc;
-using paddle::memory::Copy;
-
-using paddle::platform::CPUPlace;
-using paddle::platform::CUDAPlace;
-using paddle::platform::CPUDeviceContext;
-using paddle::platform::CUDADeviceContext;
-
-using paddle::platform::Transform;
-
-TEST(Transform, CPUUnary) {
-  CPUDeviceContext ctx;
-  float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform<CPUDeviceContext> trans;
-  trans(ctx, buf, buf + 4, buf, Scale<float>(10));
-  for (int i = 0; i < 4; ++i) {
-    ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
-  }
-}
-
-TEST(Transform, GPUUnary) {
-  CUDAPlace gpu0(0);
-  CUDADeviceContext ctx(gpu0);
-  float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
-  auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
-  float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
-  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
-  Transform<CUDADeviceContext> trans;
-  trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
-  ctx.Wait();
-  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
-  for (int i = 0; i < 4; ++i) {
-    ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
-  }
-}
-
-TEST(Transform, CPUBinary) {
-  int buf[4] = {1, 2, 3, 4};
-  Transform<CPUDeviceContext> trans;
-  CPUDeviceContext ctx;
-  trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
-  for (int i = 0; i < 4; ++i) {
-    ASSERT_EQ((i + 1) * (i + 1), buf[i]);
-  }
-}
-
-TEST(Transform, GPUBinary) {
-  int buf[4] = {1, 2, 3, 4};
-  CUDAPlace gpu0(0);
-  CUDADeviceContext ctx(gpu0);
-  auto gpu_allocation = Alloc(gpu0, sizeof(buf));
-  int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
-  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
-  Transform<CUDADeviceContext> trans;
-  trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
-  ctx.Wait();
-  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
-  for (int i = 0; i < 4; ++i) {
-    ASSERT_EQ((i + 1) * (i + 1), buf[i]);
-  }
-}
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
deleted file mode 100644
index e9aef621acea44b0dab7a687c13223617d5603c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/variant.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// Boost 1.41.0 requires __CUDACC_VER__, but in CUDA 9 __CUDACC_VER__
-// is removed, so we have to manually define __CUDACC_VER__ instead.
-// For details, please refer to
-// https://github.com/PaddlePaddle/Paddle/issues/6626
-#if defined(__CUDACC__) && defined(__CUDACC_VER_MAJOR__)
-#undef __CUDACC_VER__
-#define __CUDACC_VER__                                  \
-  __CUDACC_VER_BUILD__ + __CUDACC_VER_MAJOR__ * 10000 + \
-      __CUDACC_VER_MINOR__ * 100
-#endif
-
-#include "boost/config.hpp"
-
-// Because Boost 1.41.0's variadic templates has bug on nvcc, boost
-// will disable variadic template support in NVCC mode.  Define
-// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
-// function symbols.  For details,
-// https://github.com/PaddlePaddle/Paddle/issues/3386
-#ifdef PADDLE_WITH_CUDA
-#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
-#define BOOST_NO_CXX11_VARIADIC_TEMPLATES
-#endif
-#endif
-
-#include <boost/any.hpp>
-#include <boost/mpl/comparison.hpp>
-#include <boost/mpl/less_equal.hpp>
-#include <boost/optional.hpp>
-#include <boost/variant.hpp>
-
-// some platform-independent defintion
-#if defined(_WIN32)
-#define UNUSED
-#define __builtin_expect(EXP, C) (EXP)
-#else
-#define UNUSED __attribute__((unused))
-#endif
diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore
deleted file mode 100644
index 8f222791edb016df65be5db75831f5f83cf63726..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-pybind.h
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
deleted file mode 100644
index cb3493b62a37b79300a511cdd93f2eb377bb20f8..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune
-  feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
-  analysis_predictor imperative_profiler nccl_context imperative_flag)
-
-if(WITH_PYTHON)
-  list(APPEND PYBIND_DEPS py_func_op)
-endif()
-
-if (WITH_DISTRIBUTE)
-  list(APPEND PYBIND_DEPS communicator)
-endif()
-
-set(PYBIND_SRCS
-  pybind.cc
-  exception.cc
-  protobuf.cc
-  const_value.cc
-  reader_py.cc
-  fleet_wrapper_py.cc
-  box_helper_py.cc
-  nccl_wrapper_py.cc
-  data_set_py.cc
-  imperative.cc
-  ir.cc
-  inference_api.cc)
-
-if (WITH_DISTRIBUTE)
-  list(APPEND PYBIND_SRCS communicator_py.cc)
-endif()
-
-if(WITH_PYTHON)
-  if(WITH_AMD_GPU)
-    hip_library(paddle_pybind SHARED
-      SRCS ${PYBIND_SRCS}
-      DEPS ARCHIVE_START ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END)
-  else()
-    cc_library(paddle_pybind SHARED
-      SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-    if(NOT APPLE AND NOT WIN32)
-      target_link_libraries(paddle_pybind rt)
-    endif(NOT APPLE AND NOT WIN32)
-  endif(WITH_AMD_GPU)
-
-  get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-  target_link_libraries(paddle_pybind ${os_dependency_modules})
-
-endif(WITH_PYTHON)
diff --git a/paddle/fluid/pybind/box_helper_py.cc b/paddle/fluid/pybind/box_helper_py.cc
deleted file mode 100644
index 13aec9aa9234c9109299136dba79c9e66ce535b0..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/box_helper_py.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <fcntl.h>
-
-#ifdef _POSIX_C_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
-
-#ifdef _XOPEN_SOURCE
-#undef _XOPEN_SOURCE
-#endif
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/data_feed.pb.h"
-#include "paddle/fluid/framework/fleet/box_wrapper.h"
-#include "paddle/fluid/pybind/box_helper_py.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-void BindBoxHelper(py::module* m) {
-  py::class_<framework::BoxHelper, std::shared_ptr<framework::BoxHelper>>(
-      *m, "BoxPS")
-      .def(py::init([](paddle::framework::Dataset* dataset) {
-        return std::make_shared<paddle::framework::BoxHelper>(dataset);
-      }))
-      .def("begin_pass", &framework::BoxHelper::BeginPass)
-      .def("end_pass", &framework::BoxHelper::EndPass)
-      .def("wait_feed_pass_done", &framework::BoxHelper::WaitFeedPassDone)
-      .def("preload_into_memory", &framework::BoxHelper::PreLoadIntoMemory)
-      .def("load_into_memory", &framework::BoxHelper::LoadIntoMemory);
-}  // end BoxHelper
-}  // end namespace pybind
-}  // end namespace paddle
diff --git a/paddle/fluid/pybind/box_helper_py.h b/paddle/fluid/pybind/box_helper_py.h
deleted file mode 100644
index 33072dd5a3a38b0a306056a7bd4b8aa5cf36b1df..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/box_helper_py.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-
-void BindBoxHelper(py::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/communicator_py.cc b/paddle/fluid/pybind/communicator_py.cc
deleted file mode 100644
index 5b576f06dab9fba4cccdff35647c8bc9cebcbdc9..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/communicator_py.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/pybind/communicator_py.h"
-
-#include <Python.h>
-#include <memory>
-
-#include "paddle/fluid/framework/program_desc.h"
-#include "pybind11/pybind11.h"
-
-#include "paddle/fluid/operators/distributed/communicator.h"
-
-namespace py = pybind11;
-
-using paddle::framework::ProgramDesc;
-using paddle::operators::distributed::Communicator;
-using paddle::framework::Scope;
-
-namespace paddle {
-namespace pybind {
-
-void BindCommunicator(py::module* m) {
-  // Communicator is already used by nccl, change to DistCommunicator
-  py::class_<Communicator, std::shared_ptr<Communicator>>(*m,
-                                                          "DistCommunicator")
-      .def(py::init([](const ProgramDesc& program, Scope* param_scope) {
-        Communicator::Init(program, param_scope);
-        return Communicator::GetInstantcePtr();
-      }))
-      .def("stop", &Communicator::Stop)
-      .def("start", &Communicator::Start)
-      .def("is_running", &Communicator::IsRunning);
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/communicator_py.h b/paddle/fluid/pybind/communicator_py.h
deleted file mode 100644
index 374c74bdafed17b30f5dff98d3f03a8726ff6049..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/communicator_py.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <Python.h>
-
-#include "pybind11/pybind11.h"
-
-namespace paddle {
-namespace pybind {
-
-void BindCommunicator(pybind11::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
deleted file mode 100644
index 71eeaf3b53acf98c9f5e43f9acd6d67d42086005..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/const_value.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/pybind/const_value.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
-
-#if defined(PADDLE_WITH_DGC)
-#include "paddle/fluid/framework/details/dgc_const_values.h"
-#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
-#endif
-
-namespace paddle {
-namespace pybind {
-
-void BindConstValue(pybind11::module* m) {
-  m->def("kEmptyVarName", [] { return framework::kEmptyVarName; });
-  m->def("kTempVarName", [] { return framework::kTempVarName; });
-  m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
-  m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
-  m->def("kControlDepVarName",
-         [] { return framework::ir::Node::kControlDepVarName; });
-  m->def("kNewGradSuffix", [] { return framework::kNewGradSuffix; });
-
-  auto op_proto_and_checker_maker =
-      m->def_submodule("op_proto_and_checker_maker");
-
-  pybind11::enum_<framework::OpRole>(op_proto_and_checker_maker, "OpRole")
-      .value("Forward", framework::OpRole::kForward)
-      .value("Backward", framework::OpRole::kBackward)
-      .value("Optimize", framework::OpRole::kOptimize)
-      .value("Loss", framework::OpRole::kLoss)
-      .value("RPC", framework::OpRole::kRPC)
-      .value("Dist", framework::OpRole::kDist)
-      .value("LRSched", framework::OpRole::kLRSched);
-
-  op_proto_and_checker_maker.def(
-      "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);
-  op_proto_and_checker_maker.def(
-      "kOpRoleVarAttrName",
-      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName);
-  op_proto_and_checker_maker.def(
-      "kOpNameScopeAttrName",
-      framework::OpProtoAndCheckerMaker::OpNamescopeAttrName);
-  op_proto_and_checker_maker.def(
-      "kOpCreationCallstackAttrName",
-      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
-#if defined(PADDLE_WITH_DGC)
-  auto dgc = m->def_submodule("dgc");
-  dgc.def("kDGCUName", [] { return framework::details::g_dgc_u; });
-  dgc.def("kDGCVName", [] { return framework::details::g_dgc_v; });
-  dgc.def("kDGCKName", [] { return framework::details::g_dgc_k; });
-  dgc.def("kDGCEncodedName", [] { return framework::details::g_dgc_encoded; });
-  dgc.def("kDGCCounterName",
-          [] { return framework::details::g_dgc_counter_name; });
-  dgc.def("kDGCRampUpBeginStepName",
-          [] { return framework::details::g_dgc_rampup_begin_step; });
-#endif
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/const_value.h b/paddle/fluid/pybind/const_value.h
deleted file mode 100644
index 2fab3160d1d95af7f6a49c472c2e211c19e67cac..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/const_value.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <Python.h>
-
-#include "paddle/fluid/platform/enforce.h"
-#include "pybind11/pybind11.h"
-
-namespace paddle {
-namespace pybind {
-
-void BindConstValue(pybind11::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc
deleted file mode 100644
index dd513d4b85ccb0d18d2641239365523c5b0b7ea4..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/data_set_py.cc
+++ /dev/null
@@ -1,279 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <fcntl.h>
-#ifdef _POSIX_C_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
-
-#ifdef _XOPEN_SOURCE
-#undef _XOPEN_SOURCE
-#endif
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/text_format.h"
-#include "paddle/fluid/framework/async_executor.h"
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/data_feed.pb.h"
-#include "paddle/fluid/framework/data_set.h"
-#include "paddle/fluid/framework/dataset_factory.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/variant.h"
-#include "paddle/fluid/pybind/data_set_py.h"
-
-namespace py = pybind11;
-namespace pd = paddle::framework;
-
-namespace paddle {
-namespace pybind {
-
-class IterableDatasetWrapper {
- public:
-  IterableDatasetWrapper(framework::Dataset *dataset,
-                         const std::vector<std::string> &slots,
-                         const std::vector<platform::Place> &places,
-                         size_t batch_size, bool drop_last)
-      : dataset_(dataset),
-        slots_(slots),
-        places_(places),
-        batch_size_(batch_size),
-        drop_last_(drop_last) {
-#if defined _WIN32
-    PADDLE_THROW("Dataset is not supported on Windows");
-#elif defined __APPLE__
-    PADDLE_THROW("Dataset is not supported on MAC");
-#else
-    size_t device_num = places_.size();
-    PADDLE_ENFORCE_GT(device_num, 0, "thread_num must be larger than 0");
-    PADDLE_ENFORCE_GT(slots_.size(), 0, "slot_num cannot be 0");
-    scopes_.reserve(device_num);
-    tensors_.reserve(device_num);
-    for (size_t i = 0; i < device_num; ++i) {
-      scopes_.emplace_back(new framework::Scope());
-      tensors_.emplace_back();
-      for (auto &var_name : slots_) {
-        auto *var = scopes_.back()->Var(var_name);
-        auto *t = var->GetMutable<framework::LoDTensor>();
-        tensors_.back().emplace_back(t);
-      }
-    }
-
-    is_exhaustive_.resize(device_num);
-    exhaustive_num_ = 0;
-#endif
-  }
-
-  void Start() {
-    PADDLE_ENFORCE_EQ(is_started_, false, "Reader has been started");
-    data_feeds_ = dataset_->GetReaders();
-    PADDLE_ENFORCE_EQ(data_feeds_.size(), places_.size(),
-                      "Device number does not match reader number");
-    for (size_t i = 0; i < places_.size(); ++i) {
-      data_feeds_[i]->AssignFeedVar(*scopes_[i]);
-      data_feeds_[i]->SetPlace(platform::CPUPlace());
-      PADDLE_ENFORCE_EQ(data_feeds_[i]->Start(), true, "Reader start failed");
-    }
-    is_started_ = true;
-
-    is_exhaustive_.assign(places_.size(), false);
-    exhaustive_num_ = 0;
-  }
-
-  std::vector<std::unordered_map<std::string, framework::LoDTensor>> Next() {
-    PADDLE_ENFORCE_EQ(is_started_, true, "Reader must be started");
-    size_t device_num = places_.size();
-
-    std::vector<std::unordered_map<std::string, framework::LoDTensor>> result(
-        device_num);
-
-    size_t read_num = 0;
-    while (read_num < device_num && exhaustive_num_ < device_num) {
-      for (size_t i = 0; i < data_feeds_.size(); ++i) {
-        if (is_exhaustive_[i]) {
-          continue;
-        }
-
-        bool is_success = (data_feeds_[i]->Next() > 0);
-        if (!is_success) {
-          is_exhaustive_[i] = true;
-          ++exhaustive_num_;
-          continue;
-        }
-
-        for (size_t j = 0; j < slots_.size(); ++j) {
-          if (!IsValidLoDTensor(*tensors_[i][j])) {
-            is_success = false;
-            break;
-          }
-
-          if (tensors_[i][j]->place() == places_[read_num]) {
-            result[read_num].emplace(slots_[j], std::move(*tensors_[i][j]));
-          } else {
-            framework::TensorCopy(std::move(*tensors_[i][j]), places_[read_num],
-                                  &result[read_num][slots_[j]]);
-          }
-        }
-
-        if (!is_success) {
-          is_exhaustive_[i] = true;
-          ++exhaustive_num_;
-          continue;
-        }
-
-        ++read_num;
-        if (read_num == device_num) {
-          break;
-        }
-      }
-    }
-
-    if (UNLIKELY(read_num != device_num)) {
-      is_started_ = false;
-      throw py::stop_iteration();
-    }
-
-    return result;
-  }
-
- private:
-  bool IsValidLoDTensor(const framework::LoDTensor &tensor) const {
-    auto &lod = tensor.lod();
-    PADDLE_ENFORCE_LE(lod.size(), 1, "lod level must be not larger than 1");
-    if (!drop_last_) return true;
-
-    if (lod.empty()) {
-      return static_cast<size_t>(tensor.dims()[0]) == batch_size_;
-    } else {
-      return lod[0].size() == batch_size_ + 1;
-    }
-  }
-
- private:
-  framework::Dataset *dataset_;
-  std::vector<std::string> slots_;
-  std::vector<platform::Place> places_;
-  size_t batch_size_;
-  bool drop_last_;
-
-  std::vector<framework::DataFeed *> data_feeds_;
-  std::vector<bool> is_exhaustive_;
-  size_t exhaustive_num_;
-
-  std::vector<std::unique_ptr<framework::Scope>> scopes_;
-  std::vector<std::vector<framework::LoDTensor *>> tensors_;
-  bool is_started_{false};
-};
-
-void BindDataset(py::module *m) {
-  py::class_<framework::Dataset, std::unique_ptr<framework::Dataset>>(*m,
-                                                                      "Dataset")
-      .def(py::init([](const std::string &name = "MultiSlotDataset") {
-        return framework::DatasetFactory::CreateDataset(name);
-      }))
-      .def("set_filelist", &framework::Dataset::SetFileList,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_thread_num", &framework::Dataset::SetThreadNum,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_trainer_num", &framework::Dataset::SetTrainerNum,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_fleet_send_batch_size",
-           &framework::Dataset::SetFleetSendBatchSize,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_hdfs_config", &framework::Dataset::SetHdfsConfig,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_data_feed_desc", &framework::Dataset::SetDataFeedDesc,
-           py::call_guard<py::gil_scoped_release>())
-      .def("get_filelist", &framework::Dataset::GetFileList,
-           py::call_guard<py::gil_scoped_release>())
-      .def("get_thread_num", &framework::Dataset::GetThreadNum,
-           py::call_guard<py::gil_scoped_release>())
-      .def("get_trainer_num", &framework::Dataset::GetTrainerNum,
-           py::call_guard<py::gil_scoped_release>())
-      .def("get_fleet_send_batch_size",
-           &framework::Dataset::GetFleetSendBatchSize,
-           py::call_guard<py::gil_scoped_release>())
-      .def("get_hdfs_config", &framework::Dataset::GetHdfsConfig,
-           py::call_guard<py::gil_scoped_release>())
-      .def("get_data_feed_desc", &framework::Dataset::GetDataFeedDesc,
-           py::call_guard<py::gil_scoped_release>())
-      .def("register_client2client_msg_handler",
-           &framework::Dataset::RegisterClientToClientMsgHandler,
-           py::call_guard<py::gil_scoped_release>())
-      .def("create_channel", &framework::Dataset::CreateChannel,
-           py::call_guard<py::gil_scoped_release>())
-      .def("create_readers", &framework::Dataset::CreateReaders,
-           py::call_guard<py::gil_scoped_release>())
-      .def("destroy_readers", &framework::Dataset::DestroyReaders,
-           py::call_guard<py::gil_scoped_release>())
-      .def("load_into_memory", &framework::Dataset::LoadIntoMemory,
-           py::call_guard<py::gil_scoped_release>())
-      .def("preload_into_memory", &framework::Dataset::PreLoadIntoMemory,
-           py::call_guard<py::gil_scoped_release>())
-      .def("wait_preload_done", &framework::Dataset::WaitPreLoadDone,
-           py::call_guard<py::gil_scoped_release>())
-      .def("release_memory", &framework::Dataset::ReleaseMemory,
-           py::call_guard<py::gil_scoped_release>())
-      .def("local_shuffle", &framework::Dataset::LocalShuffle,
-           py::call_guard<py::gil_scoped_release>())
-      .def("global_shuffle", &framework::Dataset::GlobalShuffle,
-           py::call_guard<py::gil_scoped_release>())
-      .def("get_memory_data_size", &framework::Dataset::GetMemoryDataSize,
-           py::call_guard<py::gil_scoped_release>())
-      .def("get_shuffle_data_size", &framework::Dataset::GetShuffleDataSize,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_queue_num", &framework::Dataset::SetChannelNum,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_parse_ins_id", &framework::Dataset::SetParseInsId,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_parse_content", &framework::Dataset::SetParseContent,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_merge_by_lineid", &framework::Dataset::SetMergeByInsId,
-           py::call_guard<py::gil_scoped_release>())
-      .def("merge_by_lineid", &framework::Dataset::MergeByInsId,
-           py::call_guard<py::gil_scoped_release>())
-      .def("slots_shuffle", &framework::Dataset::SlotsShuffle,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_fea_eval", &framework::Dataset::SetFeaEval,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_preload_thread_num", &framework::Dataset::SetPreLoadThreadNum,
-           py::call_guard<py::gil_scoped_release>())
-      .def("create_preload_readers", &framework::Dataset::CreatePreLoadReaders,
-           py::call_guard<py::gil_scoped_release>())
-      .def("destroy_preload_readers",
-           &framework::Dataset::DestroyPreLoadReaders,
-           py::call_guard<py::gil_scoped_release>())
-      .def("dynamic_adjust_channel_num",
-           &framework::Dataset::DynamicAdjustChannelNum,
-           py::call_guard<py::gil_scoped_release>())
-      .def("dynamic_adjust_readers_num",
-           &framework::Dataset::DynamicAdjustReadersNum,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_fleet_send_sleep_seconds",
-           &framework::Dataset::SetFleetSendSleepSeconds,
-           py::call_guard<py::gil_scoped_release>());
-
-  py::class_<IterableDatasetWrapper>(*m, "IterableDatasetWrapper")
-      .def(py::init<framework::Dataset *, const std::vector<std::string> &,
-                    const std::vector<platform::Place> &, size_t, bool>())
-      .def("_start", &IterableDatasetWrapper::Start)
-      .def("_next", &IterableDatasetWrapper::Next);
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/data_set_py.h b/paddle/fluid/pybind/data_set_py.h
deleted file mode 100644
index f60e862ce673119c7b8e8ae5981fc54e8c9bdb2e..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/data_set_py.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-
-void BindDataset(py::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc
deleted file mode 100644
index 831f30e35fd3e01ce0f0524f6f85dd59494f5353..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/exception.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/pybind/exception.h"
-
-namespace paddle {
-namespace pybind {
-
-void BindException(pybind11::module* m) {
-  static pybind11::exception<platform::EOFException> eof(*m, "EOFException");
-  static pybind11::exception<platform::EnforceNotMet> exc(*m, "EnforceNotMet");
-  pybind11::register_exception_translator([](std::exception_ptr p) {
-    try {
-      if (p) std::rethrow_exception(p);
-    } catch (const platform::EOFException& e) {
-      eof(e.what());
-    } catch (const platform::EnforceNotMet& e) {
-      exc(e.what());
-    }
-  });
-
-  m->def("__unittest_throw_exception__",
-         [] { PADDLE_THROW("test exception"); });
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h
deleted file mode 100644
index 5e054267361f2c62b3ad36581be0ad17ce0718de..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/exception.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <Python.h>
-
-#include "paddle/fluid/platform/enforce.h"
-#include "pybind11/pybind11.h"
-
-namespace paddle {
-namespace pybind {
-
-void BindException(pybind11::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc
deleted file mode 100644
index e7c7750c27de617ba8f339302ffa0fde95a794af..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/fleet_wrapper_py.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <fcntl.h>
-
-#ifdef _POSIX_C_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
-
-#ifdef _XOPEN_SOURCE
-#undef _XOPEN_SOURCE
-#endif
-
-#include <string>
-#include <vector>
-
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/text_format.h"
-#include "paddle/fluid/framework/async_executor.h"
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/data_feed.pb.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/variant.h"
-#include "paddle/fluid/pybind/fleet_wrapper_py.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-void BindFleetWrapper(py::module* m) {
-  py::class_<framework::FleetWrapper>(*m, "Fleet")
-      .def(py::init())
-      .def("push_dense", &framework::FleetWrapper::PushDenseVarsSync)
-      .def("pull_dense", &framework::FleetWrapper::PullDenseVarsSync)
-      .def("init_server", &framework::FleetWrapper::InitServer)
-      .def("run_server", &framework::FleetWrapper::RunServer)
-      .def("init_worker", &framework::FleetWrapper::InitWorker)
-      .def("init_model", &framework::FleetWrapper::PushDenseParamSync)
-      .def("save_model", &framework::FleetWrapper::SaveModel)
-      .def("get_cache_threshold", &framework::FleetWrapper::GetCacheThreshold)
-      .def("cache_shuffle", &framework::FleetWrapper::CacheShuffle)
-      .def("save_cache", &framework::FleetWrapper::SaveCache)
-      .def("load_model", &framework::FleetWrapper::LoadModel)
-      .def("clear_model", &framework::FleetWrapper::ClearModel)
-      .def("stop_server", &framework::FleetWrapper::StopServer)
-      .def("gather_servers", &framework::FleetWrapper::GatherServers)
-      .def("gather_clients", &framework::FleetWrapper::GatherClients)
-      .def("get_clients_info", &framework::FleetWrapper::GetClientsInfo)
-      .def("create_client2client_connection",
-           &framework::FleetWrapper::CreateClient2ClientConnection)
-      .def("shrink_sparse_table", &framework::FleetWrapper::ShrinkSparseTable)
-      .def("shrink_dense_table", &framework::FleetWrapper::ShrinkDenseTable)
-      .def("client_flush", &framework::FleetWrapper::ClientFlush)
-      .def("load_from_paddle_model",
-           &framework::FleetWrapper::LoadFromPaddleModel)
-      .def("load_model_one_table", &framework::FleetWrapper::LoadModelOneTable)
-      .def("set_client2client_config",
-           &framework::FleetWrapper::SetClient2ClientConfig);
-}  // end FleetWrapper
-}  // end namespace pybind
-}  // end namespace paddle
diff --git a/paddle/fluid/pybind/fleet_wrapper_py.h b/paddle/fluid/pybind/fleet_wrapper_py.h
deleted file mode 100644
index b2bfa10eecd5b79a1450ad8b9c784fa8af708602..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/fleet_wrapper_py.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-
-void BindFleetWrapper(py::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
deleted file mode 100644
index 63e3e7e8570f3fc2afabf14c95e8a558de1482a6..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/imperative.cc
+++ /dev/null
@@ -1,388 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/pybind/imperative.h"
-
-#include <Python.h>
-#include <pybind11/chrono.h>
-#include <pybind11/complex.h>
-#include <pybind11/functional.h>
-#include <pybind11/stl.h>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
-#include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/imperative/nccl_context.h"
-#include "paddle/fluid/imperative/profiler.h"
-#include "paddle/fluid/imperative/tracer.h"
-#include "paddle/fluid/imperative/type_defs.h"
-
-#include "paddle/fluid/pybind/pybind_boost_headers.h"
-
-namespace paddle {
-namespace pybind {
-
-namespace py = ::pybind11;
-
-class Layer : public imperative::Layer {
- public:
-  using imperative::Layer::Layer;  // Inherit constructors
-
-  std::vector<std::shared_ptr<imperative::VarBase>> Forward(
-      const std::vector<std::shared_ptr<imperative::VarBase>> &inputs)
-      override {
-    PYBIND11_OVERLOAD(std::vector<std::shared_ptr<imperative::VarBase>>, Layer,
-                      Forward, inputs);  // NOLINT
-  }
-};
-
-// warper for pyobject to avoid imperative module depend on python
-// TODO(jiabin) Add OpBase's pybind interface back to enable backward hook
-class PYBIND11_HIDDEN PyCallableObject {
- public:
-  PyCallableObject(std::shared_ptr<py::object> py_obj_ptr)
-      : py_obj_ptr_(std::move(py_obj_ptr)) {}
-  ~PyCallableObject() {
-    py::call_guard<py::gil_scoped_acquire>();
-    py_obj_ptr_.reset();
-  }
-  void operator()() {
-    py::call_guard<py::gil_scoped_acquire>();
-    py_obj_ptr_->operator()(this);
-  }
-
- private:
-  std::shared_ptr<py::object> py_obj_ptr_;
-};
-
-// Function like obj.attr_name in Python.
-static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) {
-  // NOTE(zjl): PyObject_GetAttrString would return nullptr when attr_name
-  // is not inside obj, but it would also set the error flag of Python.
-  // If the error flag is set in C++, C++ code would not raise Exception,
-  // but Python would raise Exception once C++ call ends.
-  // To avoid unexpected Exception raised in Python, we check whether
-  // attribute exists before calling PyObject_GetAttrString.
-  //
-  // Caution: PyObject_GetAttrString would increase reference count of PyObject.
-  // Developer should call Py_DECREF manually after the attribute is not used.
-  if (PyObject_HasAttrString(obj, attr_name)) {
-    return PyObject_GetAttrString(obj, attr_name);
-  } else {
-    return nullptr;
-  }
-}
-
-template <typename T>
-static T PyObjectCast(PyObject *obj) {
-  try {
-    return py::cast<T>(py::handle(obj));
-  } catch (py::cast_error &) {
-    PADDLE_THROW("Python object is not type of %s", typeid(T).name());
-  }
-}
-
-// NOTE(zjl): py::handle is a very light wrapper of PyObject *.
-// Unlike py::object, py::handle does not change reference count of PyObject *.
-static std::vector<std::shared_ptr<imperative::VarBase>>
-GetVarBaseListFromPyHandle(const py::handle &handle) {
-  PyObject *py_obj = handle.ptr();  // get underlying PyObject
-  // Python None is not nullptr in C++!
-  if (!py_obj || py_obj == Py_None) {
-    return {};
-  }
-
-  const char *kIVarField = "_ivar";
-  PyObject *py_ivar = GetPythonAttribute(py_obj, kIVarField);
-  std::vector<std::shared_ptr<imperative::VarBase>> result;
-
-  if (py_ivar) {  // Variable
-    result.emplace_back(
-        PyObjectCast<std::shared_ptr<imperative::VarBase>>(py_ivar));
-    Py_DECREF(py_ivar);
-  } else if (PyList_Check(py_obj)) {  // List of Variable
-    size_t len = PyList_GET_SIZE(py_obj);
-    result.reserve(len);
-    for (size_t i = 0; i < len; ++i) {
-      PyObject *py_ivar =
-          PyObject_GetAttrString(PyList_GET_ITEM(py_obj, i), kIVarField);
-      PADDLE_ENFORCE_NOT_NULL(py_ivar);
-      result.emplace_back(
-          PyObjectCast<std::shared_ptr<imperative::VarBase>>(py_ivar));
-      Py_DECREF(py_ivar);
-    }
-  } else if (PyTuple_Check(py_obj)) {  // Tuple of Variable
-    size_t len = PyTuple_GET_SIZE(py_obj);
-    result.reserve(len);
-    for (size_t i = 0; i < len; ++i) {
-      PyObject *py_ivar =
-          PyObject_GetAttrString(PyTuple_GET_ITEM(py_obj, i), kIVarField);
-      PADDLE_ENFORCE_NOT_NULL(py_ivar);
-      result.emplace_back(
-          PyObjectCast<std::shared_ptr<imperative::VarBase>>(py_ivar));
-      Py_DECREF(py_ivar);
-    }
-  } else {
-    PADDLE_THROW(
-        "unsupported type %s, must be Variable, list[Variable] or "
-        "tuple[Variable]",
-        py::str(handle));
-  }
-
-  return result;
-}
-
-using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>;
-
-static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
-    const PyNameVarBaseMap &map) {
-  imperative::NameVarBaseMap result;
-  for (auto &pair : map) {
-    auto var_vec = GetVarBaseListFromPyHandle(pair.second);
-    if (!var_vec.empty()) {
-      result.emplace(pair.first, std::move(var_vec));
-    }
-  }
-
-  PADDLE_ENFORCE_EQ(PyErr_Occurred() == nullptr, true,
-                    py::str(py::handle(PyErr_Occurred())));
-  return result;
-}
-
-static std::string GetTypeName(const imperative::VarBase &var) {
-  if (var.Type() == framework::proto::VarType::RAW) {
-    return "RAW";
-  } else if (!var.Var().IsInitialized()) {
-    return "nullptr";
-  } else {
-    return framework::ToTypeName(var.Var().Type());
-  }
-}
-
-// Bind Methods
-void BindImperative(py::module *m_ptr) {
-  auto &m = *m_ptr;
-
-  py::class_<imperative::detail::BackwardStrategy> backward_strategy(
-      m, "BackwardStrategy", R"DOC(
-
-    BackwardStrategy is a descriptor of a how to run the backward process. Now it has:
-
-    1. :code:`sort_sum_gradient`, which will sum the gradient by the reverse order of trace.
-
-    Examples:
-
-        .. code-block:: python
-
-          import numpy as np
-          import paddle.fluid as fluid
-          from paddle.fluid import FC
-
-          x = np.ones([2, 2], np.float32)
-          with fluid.dygraph.guard():
-              inputs2 = []
-              for _ in range(10):
-                  inputs2.append(fluid.dygraph.base.to_variable(x))
-              ret2 = fluid.layers.sums(inputs2)
-              loss2 = fluid.layers.reduce_sum(ret2)
-              backward_strategy = fluid.dygraph.BackwardStrategy()
-              backward_strategy.sort_sum_gradient = True
-              loss2.backward(backward_strategy)
-      )DOC");
-  backward_strategy.def(py::init())
-      .def_property("sort_sum_gradient",
-                    [](const imperative::detail::BackwardStrategy &self) {
-                      return self.sorted_sum_gradient_;
-                    },
-                    [](imperative::detail::BackwardStrategy &self,
-                       bool sorted_sum_gradient) {
-                      self.sorted_sum_gradient_ = sorted_sum_gradient;
-                    });
-
-  m.def("start_imperative_gperf_profiler",
-        []() { imperative::StartProfile(); });
-
-  m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); });
-
-  m.def("_is_dygraph_debug_enabled",
-        []() { return imperative::IsDebugEnabled(); });
-  m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); });
-
-  py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
-      m, "VarBase",
-      R"DOC()DOC")
-      .def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
-      .def("__init__",
-           [](imperative::VarBase &self, const std::string &name,
-              framework::proto::VarType::Type type,
-              framework::proto::VarType::Type dtype,
-              const std::vector<int> &dims, bool persistable) {
-             new (&self) imperative::VarBase(name);
-             self.SetPersistable(persistable);
-             self.SetType(type);
-             self.SetDataType(dtype);
-             if (type == framework::proto::VarType::LOD_TENSOR) {
-               auto *tensor =
-                   self.MutableVar()->GetMutable<framework::LoDTensor>();
-               tensor->Resize(framework::make_ddim(dims));
-             }
-           })
-      .def("_run_backward",
-           [](imperative::VarBase &self,
-              const imperative::detail::BackwardStrategy &bckst,
-              const imperative::Tracer &tracer) {
-             // TODO(jiabin): when we impl more backward execution we can select
-             // them
-
-             imperative::Engine *engine = tracer.GetDefaultEngine();
-             VLOG(3) << "Start backward";
-             engine->Init(&self, bckst);
-             engine->Execute();
-             VLOG(3) << "Finish backward";
-           },
-           py::call_guard<py::gil_scoped_release>())
-      .def("_grad_name", &imperative::VarBase::GradVarName)
-      .def("_grad_value",
-           [](imperative::VarBase &self) {
-             return self.MutableGradVar()->Get<framework::LoDTensor>();
-           },
-           py::return_value_policy::reference)
-      .def("_clear_gradient", &imperative::VarBase::ClearGradient)
-      .def("_grad_ivar",
-           [](const imperative::VarBase &self) {
-             auto &grad_var = self.GradVarBase();
-             if (grad_var && grad_var->Var().IsInitialized()) {
-               return grad_var;
-             } else {
-               return std::shared_ptr<imperative::VarBase>(nullptr);
-             }
-           },
-           py::return_value_policy::copy)
-      .def("_copy_to",
-           [](const imperative::VarBase &self, const platform::CPUPlace &place,
-              bool blocking) { return self.NewVarBase(place, blocking); },
-           py::return_value_policy::copy)
-      .def("_copy_to",
-           [](const imperative::VarBase &self, const platform::CUDAPlace &place,
-              bool blocking) { return self.NewVarBase(place, blocking); },
-           py::return_value_policy::copy)
-      .def("value", [](imperative::VarBase &self) { return self.MutableVar(); },
-           py::return_value_policy::reference)
-      .def_property("name", &imperative::VarBase::Name,
-                    &imperative::VarBase::SetName)
-      .def_property_readonly(
-          "shape",
-          [](imperative::VarBase &self) {
-            if (self.Var().IsType<framework::LoDTensor>()) {
-              return framework::vectorize<int>(
-                  self.Var().Get<framework::LoDTensor>().dims());
-            } else {
-              VLOG(2) << "It is meaningless to get shape of variable type "
-                      << GetTypeName(self);
-              return std::vector<int>();
-            }
-          })
-      .def_property_readonly("type", &imperative::VarBase::Type)
-      .def_property_readonly("dtype", &imperative::VarBase::DataType)
-      .def_property("persistable", &imperative::VarBase::Persistable,
-                    &imperative::VarBase::SetPersistable)
-      .def_property("stop_gradient",
-                    &imperative::VarBase::OverridedStopGradient,
-                    &imperative::VarBase::SetOverridedStopGradient);
-
-  py::class_<imperative::Layer, Layer /* <--- trampoline*/> layer(m, "Layer");
-  layer.def(py::init<>())
-      .def("forward",
-           [](imperative::Layer &self,
-              const std::vector<std::shared_ptr<imperative::VarBase>> &inputs) {
-             return self.Forward(inputs);
-           });
-
-  py::class_<imperative::Tracer>(m, "Tracer", "")
-      .def("__init__",
-           [](imperative::Tracer &self) { new (&self) imperative::Tracer(); })
-      .def("trace",
-           [](imperative::Tracer &self, const std::string &type,
-              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
-              framework::AttributeMap attrs, const platform::CUDAPlace &place,
-              bool trace_backward) {
-             auto ins_map = ConvertToNameVarBaseMap(ins);
-             auto outs_map = ConvertToNameVarBaseMap(outs);
-             {
-               py::gil_scoped_release release;
-               self.TraceOp(type, std::move(ins_map), std::move(outs_map),
-                            std::move(attrs), place, trace_backward);
-             }
-           })
-      .def("trace",
-           [](imperative::Tracer &self, const std::string &type,
-              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
-              framework::AttributeMap attrs, const platform::CPUPlace &place,
-              bool trace_backward) {
-             auto ins_map = ConvertToNameVarBaseMap(ins);
-             auto outs_map = ConvertToNameVarBaseMap(outs);
-             {
-               py::gil_scoped_release release;
-               self.TraceOp(type, std::move(ins_map), std::move(outs_map),
-                            std::move(attrs), place, trace_backward);
-             }
-           });
-
-  // define parallel context
-  py::class_<imperative::ParallelStrategy> parallel_strategy(
-      m, "ParallelStrategy", "");
-  parallel_strategy.def(py::init())
-      .def_property(
-          "nranks",
-          [](const imperative::ParallelStrategy &self) { return self.nranks_; },
-          [](imperative::ParallelStrategy &self, int nranks) {
-            self.nranks_ = nranks;
-          })
-      .def_property("local_rank",
-                    [](const imperative::ParallelStrategy &self) {
-                      return self.local_rank_;
-                    },
-                    [](imperative::ParallelStrategy &self, int local_rank) {
-                      self.local_rank_ = local_rank;
-                    })
-      .def_property(
-          "trainer_endpoints",
-          [](const imperative::ParallelStrategy &self) {
-            return self.trainer_endpoints_;
-          },
-          [](imperative::ParallelStrategy &self, std::vector<std::string> eps) {
-            self.trainer_endpoints_ = eps;
-          })
-      .def_property("current_endpoint",
-                    [](const imperative::ParallelStrategy &self) {
-                      return self.current_endpoint_;
-                    },
-                    [](imperative::ParallelStrategy &self,
-                       const std::string &ep) { self.current_endpoint_ = ep; });
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  py::class_<imperative::NCCLParallelContext> nccl_ctx(m,
-                                                       "NCCLParallelContext");
-
-  nccl_ctx
-      .def(py::init<const imperative::ParallelStrategy &,
-                    const platform::CUDAPlace &>())
-      .def("init", [](imperative::NCCLParallelContext &self) { self.Init(); });
-#endif
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h
deleted file mode 100644
index 0e3e98512d60fa111c94f70bf43524c36463cc05..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/imperative.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <Python.h>
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-namespace paddle {
-namespace pybind {
-
-void BindImperative(pybind11::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
deleted file mode 100644
index f7a590222854c275acbeb995aa62a36224ccab2e..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/inference_api.cc
+++ /dev/null
@@ -1,429 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/pybind/inference_api.h"
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-#include <cstring>
-#include <iostream>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-using paddle::PaddleDType;
-using paddle::PaddleBuf;
-using paddle::PaddleTensor;
-using paddle::PaddlePlace;
-using paddle::PaddlePredictor;
-using paddle::NativeConfig;
-using paddle::NativePaddlePredictor;
-using paddle::AnalysisPredictor;
-
-namespace {
-void BindPaddleDType(py::module *m);
-void BindPaddleBuf(py::module *m);
-void BindPaddleTensor(py::module *m);
-void BindPaddlePlace(py::module *m);
-void BindPaddlePredictor(py::module *m);
-void BindNativeConfig(py::module *m);
-void BindNativePredictor(py::module *m);
-void BindAnalysisConfig(py::module *m);
-void BindAnalysisPredictor(py::module *m);
-
-#ifdef PADDLE_WITH_MKLDNN
-void BindMkldnnQuantizerConfig(py::module *m);
-#endif
-
-template <typename T>
-PaddleBuf PaddleBufCreate(py::array_t<T> data) {
-  PaddleBuf buf(data.size() * sizeof(T));
-  std::copy_n(static_cast<T *>(data.mutable_data()), data.size(),
-              static_cast<T *>(buf.data()));
-  return buf;
-}
-
-template <typename T>
-void PaddleBufReset(PaddleBuf &buf, py::array_t<T> data) {  // NOLINT
-  buf.Resize(data.size() * sizeof(T));
-  std::copy_n(static_cast<T *>(data.mutable_data()), data.size(),
-              static_cast<T *>(buf.data()));
-}
-
-template <typename T>
-PaddleDType PaddleTensorGetDType();
-
-template <>
-PaddleDType PaddleTensorGetDType<int32_t>() {
-  return PaddleDType::INT32;
-}
-
-template <>
-PaddleDType PaddleTensorGetDType<int64_t>() {
-  return PaddleDType::INT64;
-}
-
-template <>
-PaddleDType PaddleTensorGetDType<float>() {
-  return PaddleDType::FLOAT32;
-}
-
-template <typename T>
-PaddleTensor PaddleTensorCreate(
-    py::array_t<T> data, const std::string name = "",
-    const std::vector<std::vector<size_t>> &lod = {}, bool copy = true) {
-  PaddleTensor tensor;
-
-  if (copy) {
-    PaddleBuf buf(data.size() * sizeof(T));
-    std::copy_n(static_cast<T *>(data.mutable_data()), data.size(),
-                static_cast<T *>(buf.data()));
-    tensor.data = std::move(buf);
-  } else {
-    tensor.data = PaddleBuf(data.mutable_data(), data.size() * sizeof(T));
-  }
-
-  tensor.dtype = PaddleTensorGetDType<T>();
-  tensor.name = name;
-  tensor.lod = lod;
-  tensor.shape.resize(data.ndim());
-  std::copy_n(data.shape(), data.ndim(), tensor.shape.begin());
-
-  return tensor;
-}
-
-py::array PaddleTensorGetData(PaddleTensor &tensor) {  // NOLINT
-  py::dtype dt;
-  switch (tensor.dtype) {
-    case PaddleDType::INT32:
-      dt = py::dtype::of<int32_t>();
-      break;
-    case PaddleDType::INT64:
-      dt = py::dtype::of<int64_t>();
-      break;
-    case PaddleDType::FLOAT32:
-      dt = py::dtype::of<float>();
-      break;
-    default:
-      LOG(FATAL) << "unsupported dtype";
-  }
-  return py::array(dt, {tensor.shape}, tensor.data.data());
-}
-}  // namespace
-
-void BindInferenceApi(py::module *m) {
-  BindPaddleDType(m);
-  BindPaddleBuf(m);
-  BindPaddleTensor(m);
-  BindPaddlePlace(m);
-  BindPaddlePredictor(m);
-  BindNativeConfig(m);
-  BindNativePredictor(m);
-  BindAnalysisConfig(m);
-  BindAnalysisPredictor(m);
-#ifdef PADDLE_WITH_MKLDNN
-  BindMkldnnQuantizerConfig(m);
-#endif
-  m->def("create_paddle_predictor",
-         &paddle::CreatePaddlePredictor<AnalysisConfig>);
-  m->def("create_paddle_predictor",
-         &paddle::CreatePaddlePredictor<NativeConfig>);
-  m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
-}
-
-namespace {
-void BindPaddleDType(py::module *m) {
-  py::enum_<PaddleDType>(*m, "PaddleDType")
-      .value("FLOAT32", PaddleDType::FLOAT32)
-      .value("INT64", PaddleDType::INT64)
-      .value("INT32", PaddleDType::INT32);
-}
-
-void BindPaddleBuf(py::module *m) {
-  py::class_<PaddleBuf>(*m, "PaddleBuf")
-      .def(py::init<size_t>())
-      .def(py::init([](std::vector<float> &data) {
-        auto buf = PaddleBuf(data.size() * sizeof(float));
-        std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
-        return buf;
-      }))
-      .def(py::init(&PaddleBufCreate<int32_t>))
-      .def(py::init(&PaddleBufCreate<int64_t>))
-      .def(py::init(&PaddleBufCreate<float>))
-      .def("resize", &PaddleBuf::Resize)
-      .def("reset",
-           [](PaddleBuf &self, std::vector<float> &data) {
-             self.Resize(data.size() * sizeof(float));
-             std::memcpy(self.data(), data.data(), self.length());
-           })
-      .def("reset", &PaddleBufReset<int32_t>)
-      .def("reset", &PaddleBufReset<int64_t>)
-      .def("reset", &PaddleBufReset<float>)
-      .def("empty", &PaddleBuf::empty)
-      .def("tolist",
-           [](PaddleBuf &self, const std::string &dtype) -> py::list {
-             py::list l;
-             if (dtype == "int32") {
-               auto *data = static_cast<int32_t *>(self.data());
-               auto size = self.length() / sizeof(int32_t);
-               l = py::cast(std::vector<int32_t>(data, data + size));
-             } else if (dtype == "int64") {
-               auto *data = static_cast<int64_t *>(self.data());
-               auto size = self.length() / sizeof(int64_t);
-               l = py::cast(std::vector<int64_t>(data, data + size));
-             } else if (dtype == "float32") {
-               auto *data = static_cast<float *>(self.data());
-               auto size = self.length() / sizeof(float);
-               l = py::cast(std::vector<float>(data, data + size));
-             } else {
-               LOG(FATAL) << "unsupported dtype";
-             }
-             return l;
-           })
-      .def("float_data",
-           [](PaddleBuf &self) -> std::vector<float> {
-             auto *data = static_cast<float *>(self.data());
-             return {data, data + self.length() / sizeof(*data)};
-           })
-      .def("int64_data",
-           [](PaddleBuf &self) -> std::vector<int64_t> {
-             int64_t *data = static_cast<int64_t *>(self.data());
-             return {data, data + self.length() / sizeof(*data)};
-           })
-      .def("int32_data",
-           [](PaddleBuf &self) -> std::vector<int32_t> {
-             int32_t *data = static_cast<int32_t *>(self.data());
-             return {data, data + self.length() / sizeof(*data)};
-           })
-      .def("length", &PaddleBuf::length);
-}
-
-void BindPaddleTensor(py::module *m) {
-  py::class_<PaddleTensor>(*m, "PaddleTensor")
-      .def(py::init<>())
-      .def(py::init(&PaddleTensorCreate<int32_t>), py::arg("data"),
-           py::arg("name") = "",
-           py::arg("lod") = std::vector<std::vector<size_t>>(),
-           py::arg("copy") = true)
-      .def(py::init(&PaddleTensorCreate<int64_t>), py::arg("data"),
-           py::arg("name") = "",
-           py::arg("lod") = std::vector<std::vector<size_t>>(),
-           py::arg("copy") = true)
-      .def(py::init(&PaddleTensorCreate<float>), py::arg("data"),
-           py::arg("name") = "",
-           py::arg("lod") = std::vector<std::vector<size_t>>(),
-           py::arg("copy") = true)
-      .def("as_ndarray", &PaddleTensorGetData)
-      .def_readwrite("name", &PaddleTensor::name)
-      .def_readwrite("shape", &PaddleTensor::shape)
-      .def_readwrite("data", &PaddleTensor::data)
-      .def_readwrite("dtype", &PaddleTensor::dtype)
-      .def_readwrite("lod", &PaddleTensor::lod);
-}
-
-void BindPaddlePlace(py::module *m) {
-  py::enum_<PaddlePlace>(*m, "PaddlePlace")
-      .value("UNK", PaddlePlace::kUNK)
-      .value("CPU", PaddlePlace::kCPU)
-      .value("GPU", PaddlePlace::kGPU);
-}
-
-void BindPaddlePredictor(py::module *m) {
-  auto paddle_predictor = py::class_<PaddlePredictor>(*m, "PaddlePredictor");
-  paddle_predictor
-      .def("run",
-           [](PaddlePredictor &self, const std::vector<PaddleTensor> &inputs) {
-             std::vector<PaddleTensor> outputs;
-             self.Run(inputs, &outputs);
-             return outputs;
-           })
-      .def("get_input_tensor", &PaddlePredictor::GetInputTensor)
-      .def("get_output_tensor", &PaddlePredictor::GetOutputTensor)
-      .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun)
-      .def("clone", &PaddlePredictor::Clone);
-
-  auto config = py::class_<PaddlePredictor::Config>(paddle_predictor, "Config");
-  config.def(py::init<>())
-      .def_readwrite("model_dir", &PaddlePredictor::Config::model_dir);
-}
-
-void BindNativeConfig(py::module *m) {
-  py::class_<NativeConfig, PaddlePredictor::Config>(*m, "NativeConfig")
-      .def(py::init<>())
-      .def_readwrite("use_gpu", &NativeConfig::use_gpu)
-      .def_readwrite("device", &NativeConfig::device)
-      .def_readwrite("fraction_of_gpu_memory",
-                     &NativeConfig::fraction_of_gpu_memory)
-      .def_readwrite("prog_file", &NativeConfig::prog_file)
-      .def_readwrite("param_file", &NativeConfig::param_file)
-      .def_readwrite("specify_input_name", &NativeConfig::specify_input_name)
-      .def("set_cpu_math_library_num_threads",
-           &NativeConfig::SetCpuMathLibraryNumThreads)
-      .def("cpu_math_library_num_threads",
-           &NativeConfig::cpu_math_library_num_threads);
-}
-
-void BindNativePredictor(py::module *m) {
-  py::class_<NativePaddlePredictor, PaddlePredictor>(*m,
-                                                     "NativePaddlePredictor")
-      .def(py::init<const NativeConfig &>())
-      .def("init", &NativePaddlePredictor::Init)
-      .def("run",
-           [](NativePaddlePredictor &self,
-              const std::vector<PaddleTensor> &inputs) {
-             std::vector<PaddleTensor> outputs;
-             self.Run(inputs, &outputs);
-             return outputs;
-           })
-      .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor)
-      .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor)
-      .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun)
-      .def("clone", &NativePaddlePredictor::Clone)
-      .def("scope", &NativePaddlePredictor::scope,
-           py::return_value_policy::reference);
-}
-
-void BindAnalysisConfig(py::module *m) {
-  py::class_<AnalysisConfig> analysis_config(*m, "AnalysisConfig");
-
-  py::enum_<AnalysisConfig::Precision>(analysis_config, "Precision")
-      .value("Float32", AnalysisConfig::Precision::kFloat32)
-      .value("Int8", AnalysisConfig::Precision::kInt8)
-      .value("Half", AnalysisConfig::Precision::kHalf)
-      .export_values();
-
-  analysis_config.def(py::init<const AnalysisConfig &>())
-      .def(py::init<const std::string &>())
-      .def(py::init<const std::string &, const std::string &>())
-      .def("set_model", (void (AnalysisConfig::*)(const std::string &)) &
-                            AnalysisConfig::SetModel)
-      .def("set_model", (void (AnalysisConfig::*)(const std::string &,
-                                                  const std::string &)) &
-                            AnalysisConfig::SetModel)
-      .def("set_prog_file", &AnalysisConfig::SetProgFile)
-      .def("set_params_file", &AnalysisConfig::SetParamsFile)
-      .def("model_dir", &AnalysisConfig::model_dir)
-      .def("prog_file", &AnalysisConfig::prog_file)
-      .def("params_file", &AnalysisConfig::params_file)
-      .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu,
-           py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0)
-      .def("disable_gpu", &AnalysisConfig::DisableGpu)
-      .def("use_gpu", &AnalysisConfig::use_gpu)
-      .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
-      .def("memory_pool_init_size_mb",
-           &AnalysisConfig::memory_pool_init_size_mb)
-      .def("fraction_of_gpu_memory_for_pool",
-           &AnalysisConfig::fraction_of_gpu_memory_for_pool)
-      .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim,
-           py::arg("x") = true)
-      .def("ir_optim", &AnalysisConfig::ir_optim)
-      .def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim)
-      .def("enable_profile", &AnalysisConfig::EnableProfile)
-      .def("set_optim_cache_dir", &AnalysisConfig::SetOptimCacheDir)
-      .def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps,
-           py::arg("x") = true)
-      .def("use_feed_fetch_ops_enabled",
-           &AnalysisConfig::use_feed_fetch_ops_enabled)
-      .def("switch_specify_input_names",
-           &AnalysisConfig::SwitchSpecifyInputNames, py::arg("x") = true)
-      .def("specify_input_name", &AnalysisConfig::specify_input_name)
-      .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
-           py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
-           py::arg("min_subgraph_size") = 3,
-           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
-           py::arg("use_static") = false, py::arg("use_calib_mode") = true)
-      .def("enable_anakin_engine", &AnalysisConfig::EnableAnakinEngine,
-           py::arg("max_batch_size") = 1,
-           py::arg("max_input_shape") =
-               std::map<std::string, std::vector<int>>(),
-           py::arg("min_subgraph_size") = 6,
-           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
-           py::arg("auto_config_layout") = false,
-           py::arg("passes_filter") = std::vector<std::string>(),
-           py::arg("ops_filter") = std::vector<std::string>())
-      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
-      .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
-           py::arg("x") = true)
-      .def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN)
-      .def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled)
-      .def("set_cpu_math_library_num_threads",
-           &AnalysisConfig::SetCpuMathLibraryNumThreads)
-      .def("cpu_math_library_num_threads",
-           &AnalysisConfig::cpu_math_library_num_threads)
-      .def("to_native_config", &AnalysisConfig::ToNativeConfig)
-      .def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer)
-#ifdef PADDLE_WITH_MKLDNN
-      .def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config,
-           py::return_value_policy::reference)
-#endif
-      .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp)
-      .def("set_model_buffer", &AnalysisConfig::SetModelBuffer)
-      .def("model_from_memory", &AnalysisConfig::model_from_memory)
-      .def("pass_builder", &AnalysisConfig::pass_builder,
-           py::return_value_policy::reference);
-}
-
-#ifdef PADDLE_WITH_MKLDNN
-void BindMkldnnQuantizerConfig(py::module *m) {
-  py::class_<MkldnnQuantizerConfig> quantizer_config(*m,
-                                                     "MkldnnQuantizerConfig");
-  quantizer_config.def(py::init<const MkldnnQuantizerConfig &>())
-      .def(py::init<>())
-      .def("set_quant_data",
-           [](MkldnnQuantizerConfig &self,
-              const std::vector<PaddleTensor> &data) {
-             auto warmup_data =
-                 std::make_shared<std::vector<PaddleTensor>>(data);
-             self.SetWarmupData(warmup_data);
-             return;
-           })
-      .def("set_quant_batch_size", &MkldnnQuantizerConfig::SetWarmupBatchSize)
-      .def(
-          "set_enabled_op_types",
-          (void (MkldnnQuantizerConfig::*)(std::unordered_set<std::string> &)) &
-              MkldnnQuantizerConfig::SetEnabledOpTypes);
-}
-#endif
-
-void BindAnalysisPredictor(py::module *m) {
-  py::class_<AnalysisPredictor, PaddlePredictor>(*m, "AnalysisPredictor")
-      .def(py::init<const AnalysisConfig &>())
-      .def("init", &AnalysisPredictor::Init)
-      .def(
-          "run",
-          [](AnalysisPredictor &self, const std::vector<PaddleTensor> &inputs) {
-            std::vector<PaddleTensor> outputs;
-            self.Run(inputs, &outputs);
-            return outputs;
-          })
-      .def("get_input_tensor", &AnalysisPredictor::GetInputTensor)
-      .def("get_output_tensor", &AnalysisPredictor::GetOutputTensor)
-      .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun)
-      .def("clone", &AnalysisPredictor::Clone)
-      .def("scope", &AnalysisPredictor::scope,
-           py::return_value_policy::reference)
-      .def("SaveOptimModel", &AnalysisPredictor::SaveOptimModel,
-           py::arg("dir"));
-}
-}  // namespace
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/inference_api.h b/paddle/fluid/pybind/inference_api.h
deleted file mode 100644
index c2adfbecf72ca0f475e526ca97adf9833b03ede4..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/inference_api.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-namespace paddle {
-namespace pybind {
-void BindInferenceApi(pybind11::module *m);
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
deleted file mode 100644
index abc10765e4a37000412534e5396b7e9ef792a00d..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/ir.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/pybind/ir.h"
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/var_desc.h"
-#include "pybind11/stl.h"
-
-namespace py = pybind11;
-using paddle::framework::ir::Graph;
-using paddle::framework::ir::Node;
-using paddle::framework::ir::GraphSafeRemoveNodes;
-using paddle::framework::ir::HasCircle;
-using paddle::framework::ir::GraphNum;
-using paddle::framework::ir::TopologySortOperations;
-using paddle::framework::ir::BuildOperationAdjList;
-using paddle::framework::OpDesc;
-using paddle::framework::ProgramDesc;
-using paddle::framework::Scope;
-using paddle::framework::VarDesc;
-using pybind11::return_value_policy;
-
-namespace paddle {
-namespace pybind {
-void BindGraph(py::module *m) {
-  m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes);
-  m->def("has_circle", HasCircle);
-  m->def("graph_num", GraphNum);
-  m->def("topology_sort", TopologySortOperations,
-         return_value_policy::reference);
-  m->def("build_adjacency_list", BuildOperationAdjList,
-         return_value_policy::reference);
-  py::class_<Graph, std::shared_ptr<Graph>>(
-      *m, "Graph",
-      "The graph is a Directed Acyclic Single Static Assignment Graph, see "
-      "`paddle::ir::Graph` for details.")
-      .def(py::init<const ProgramDesc &>())
-      .def("clone", &Graph::Clone)
-      .def("has", &Graph::Has)
-      .def("get_bool", &Graph::Get<bool>)
-      .def("get_int", &Graph::Get<int>)
-      .def("get_float", &Graph::Get<float>)
-      .def("get_double", &Graph::Get<double>)
-      .def("get_string", &Graph::Get<std::string>)
-      .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>,
-           return_value_policy::reference)
-      .def("set", [](Graph &self, const std::string &attr_name,
-                     bool attr) { return self.Set(attr_name, new bool(attr)); })
-      .def("set", [](Graph &self, const std::string &attr_name,
-                     int attr) { return self.Set(attr_name, new int(attr)); })
-      .def("set",
-           [](Graph &self, const std::string &attr_name,
-              const std::string &attr) {
-             return self.Set(attr_name, new std::string(attr));
-           })
-      .def("set",
-           [](Graph &self, const std::string &attr_name, float attr) {
-             return self.Set(attr_name, new float(attr));
-           })
-      .def("set",
-           [](Graph &self, const std::string &attr_name, double attr) {
-             return self.Set(attr_name, new double(attr));
-           })
-      .def("set",
-           [](Graph &self, const std::string &attr_name,
-              const std::unordered_set<const Node *> &attr) {
-             return self.Set(attr_name,
-                             new std::unordered_set<const Node *>(attr));
-           })
-      .def("set",
-           [](Graph &self, const std::string &attr_name,
-              const std::unordered_set<std::string> &attr) {
-             return self.Set(attr_name,
-                             new std::unordered_set<std::string>(attr));
-           })
-      .def("set_not_owned",
-           [](Graph &self, const std::string &attr_name, Scope &attr) {
-             self.SetNotOwned<Scope>(attr_name, &attr);
-           })
-      .def("erase", &Graph::Erase)
-      .def("nodes", &Graph::Nodes, return_value_policy::reference)
-      .def("create_var_node",
-           [](Graph &self, VarDesc &var_desc) {
-             return self.CreateVarNode(&var_desc);
-           },
-           return_value_policy::reference)
-      .def("create_op_node",
-           [](Graph &self, OpDesc &op_desc) {
-             return self.CreateOpNode(&op_desc);
-           },
-           return_value_policy::reference)
-      .def("create_control_dep_var", &Graph::CreateControlDepVar,
-           return_value_policy::reference)
-      .def("create_empty_node", &Graph::CreateEmptyNode,
-           return_value_policy::reference)
-      .def("release_nodes", &Graph::ReleaseNodes)
-      .def("remove_node",
-           [](Graph &self, Node &node) { return self.RemoveNode(&node); })
-      .def("retrieve_node", &Graph::RetrieveNode,
-           return_value_policy::reference)
-      .def("resolve_hazard", &Graph::ResolveHazard)
-      .def("origin_program_desc", &Graph::OriginProgram,
-           return_value_policy::reference);
-}
-
-void BindNode(py::module *m) {
-  py::class_<Node> node(*m, "Node");
-  node.def("name", &Node::Name)
-      .def("node_type", &Node::NodeType)
-      .def("var", &Node::Var, return_value_policy::reference)
-      .def("op", &Node::Op, return_value_policy::reference)
-      .def("id", &Node::id)
-      .def("is_op", &Node::IsOp)
-      .def("is_var", &Node::IsVar)
-      .def("is_ctrl_var", &Node::IsCtrlVar)
-      .def("clear_inputs", [](Node &self) { self.inputs.clear(); })
-      .def("remove_input",
-           [](Node &self, int node_id) {
-             auto pos = std::find_if(
-                 self.inputs.begin(), self.inputs.end(),
-                 [&node_id](const Node *n) { return n->id() == node_id; });
-             if (pos != self.inputs.end()) {
-               self.inputs.erase(pos);
-             }
-           })
-      .def("remove_input",
-           [](Node &self, Node &node) {
-             auto pos =
-                 std::find(self.inputs.begin(), self.inputs.end(), &node);
-             if (pos != self.inputs.end()) {
-               self.inputs.erase(pos);
-             }
-           })
-      .def("append_input",
-           [](Node &self, Node &node) { self.inputs.push_back(&node); })
-      .def("clear_outputs", [](Node &self) { self.outputs.clear(); })
-      .def("remove_output",
-           [](Node &self, int node_id) {
-             auto pos = std::find_if(
-                 self.outputs.begin(), self.outputs.end(),
-                 [&node_id](const Node *n) { return n->id() == node_id; });
-             if (pos != self.outputs.end()) {
-               self.outputs.erase(pos);
-             }
-           })
-      .def("remove_output",
-           [](Node &self, Node &node) {
-             auto pos =
-                 std::find(self.outputs.begin(), self.outputs.end(), &node);
-             if (pos != self.outputs.end()) {
-               self.outputs.erase(pos);
-             }
-           })
-      .def("append_output",
-           [](Node &self, Node &node) { self.outputs.push_back(&node); })
-      .def_readwrite("inputs", &Node::inputs)
-      .def_readwrite("outputs", &Node::outputs);
-
-  py::enum_<Node::Type>(node, "Type")
-      .value("Operation", Node::Type::kOperation)
-      .value("Variable", Node::Type::kVariable)
-      .export_values();
-}
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/ir.h b/paddle/fluid/pybind/ir.h
deleted file mode 100644
index 5bee70eba695b6d71c4df03e7ffe5d8d11384172..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/ir.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-#include "paddle/fluid/framework/ir/graph.h"
-
-namespace paddle {
-namespace pybind {
-void BindGraph(pybind11::module *m);
-void BindNode(pybind11::module *m);
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/nccl_wrapper_py.cc b/paddle/fluid/pybind/nccl_wrapper_py.cc
deleted file mode 100644
index bbba03f6660fe9ddb14764709ea81a9a82b1b386..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/nccl_wrapper_py.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <fcntl.h>
-
-#ifdef _POSIX_C_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
-
-#ifdef _XOPEN_SOURCE
-#undef _XOPEN_SOURCE
-#endif
-
-#include <string>
-#include <vector>
-
-#include "google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/text_format.h"
-#include "paddle/fluid/framework/async_executor.h"
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/data_feed.pb.h"
-#include "paddle/fluid/framework/fleet/nccl_wrapper.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/variant.h"
-#include "paddle/fluid/pybind/nccl_wrapper_py.h"
-
-namespace py = pybind11;
-namespace pd = paddle::framework;
-
-namespace paddle {
-namespace pybind {
-void BindNCCLWrapper(py::module* m) {
-  py::class_<framework::NCCLWrapper>(*m, "Nccl")
-      .def(py::init())
-      .def("init_nccl", &framework::NCCLWrapper::InitNCCL)
-      .def("set_nccl_id", &framework::NCCLWrapper::SetNCCLId)
-      .def("set_rank_info", &framework::NCCLWrapper::SetRankInfo)
-      .def("sync_var", &framework::NCCLWrapper::SyncVar);
-}  // end NCCLWrapper
-}  // end namespace pybind
-}  // end namespace paddle
diff --git a/paddle/fluid/pybind/nccl_wrapper_py.h b/paddle/fluid/pybind/nccl_wrapper_py.h
deleted file mode 100644
index 683eb4d61e00abf4e7192efb1d102ff73cb9e02e..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/nccl_wrapper_py.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-
-void BindNCCLWrapper(py::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
deleted file mode 100644
index 31b5dd5d7c053d369bec6dac2c5ba0e73d7ddd60..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/protobuf.cc
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/pybind/protobuf.h"
-
-#include <deque>
-#include <iostream>
-#include <string>
-#include <tuple>
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/var_desc.h"
-
-#include "paddle/fluid/pybind/pybind_boost_headers.h"
-
-namespace paddle {
-namespace pybind {
-
-namespace pd = paddle::framework;
-
-template <typename T>
-static pybind11::bytes SerializeMessage(
-    T &self) {  // NOLINT due to pybind11 convention.
-  // Check IsInitialized in Python
-  std::string retv;
-  PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv),
-                 "Cannot serialize message");
-  return retv;
-}
-
-// Bind Methods
-void BindProgramDesc(pybind11::module *m) {
-  pybind11::class_<pd::ProgramDesc>(*m, "ProgramDesc", "")
-      .def(pybind11::init<>())
-      .def("__init__",
-           [](pd::ProgramDesc &self, const pd::ProgramDesc &other) {
-             new (&self) pd::ProgramDesc(other);
-           })
-      .def("__init__",
-           [](pd::ProgramDesc &self, const pybind11::bytes &binary_str) {
-             std::string str(binary_str);
-             new (&self) pd::ProgramDesc(str);
-           })
-      .def("append_block", &pd::ProgramDesc::AppendBlock,
-           pybind11::return_value_policy::reference)
-      .def("block", &pd::ProgramDesc::MutableBlock,
-           pybind11::return_value_policy::reference)
-      .def("num_blocks", &pd::ProgramDesc::Size)
-      .def("flush", &pd::ProgramDesc::Flush)
-      .def("get_feed_target_names", &pd::ProgramDesc::GetFeedTargetNames)
-      .def("get_fetch_target_names", &pd::ProgramDesc::GetFetchTargetNames)
-      .def("serialize_to_string", SerializeMessage<pd::ProgramDesc>)
-      .def("parse_from_string",
-           [](pd::ProgramDesc &program_desc, const std::string &data) {
-             pd::proto::ProgramDesc *desc = program_desc.Proto();
-             PADDLE_ENFORCE(desc->ParseFromString(data),
-                            "Fail to parse ProgramDesc from string. This could "
-                            "be a bug of Paddle.");
-           })
-      .def("_version", [](pd::ProgramDesc &self) -> int64_t {
-        return self.Proto()->version().version();
-      });
-}
-
-void BindBlockDesc(pybind11::module *m) {
-  pybind11::class_<pd::BlockDesc>(*m, "BlockDesc", "")
-      .def_property_readonly("id", &pd::BlockDesc::ID)
-      .def_property_readonly("parent", &pd::BlockDesc::Parent)
-      .def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID)
-      .def("_set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
-      .def("append_op", &pd::BlockDesc::AppendOp,
-           pybind11::return_value_policy::reference)
-      .def("_prepend_op", &pd::BlockDesc::PrependOp,
-           pybind11::return_value_policy::reference)
-      .def("_insert_op", &pd::BlockDesc::InsertOp,
-           pybind11::return_value_policy::reference)
-      .def("_remove_op", &pd::BlockDesc::RemoveOp)
-      .def("var",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.Var(name);
-           },
-           pybind11::return_value_policy::reference)
-      .def("has_var",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.HasVar(name);
-           },
-           pybind11::return_value_policy::reference)
-      .def("_rename_var",
-           [](pd::BlockDesc &self, const pybind11::bytes &byte_name,
-              const pybind11::bytes &byte_name_new) {
-             std::string name = byte_name;
-             std::string new_name = byte_name_new;
-             self.RenameVar(name, new_name);
-           })
-      .def("has_var_recursive",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.HasVarRecursive(name);
-           })
-      .def("find_var",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.FindVar(name);
-           },
-           pybind11::return_value_policy::reference)
-      .def("find_var_recursive",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.FindVarRecursive(name);
-           },
-           pybind11::return_value_policy::reference)
-      .def("_remove_var",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.RemoveVar(name);
-           },
-           pybind11::return_value_policy::reference)
-      .def("all_vars", &pd::BlockDesc::AllVars,
-           pybind11::return_value_policy::reference)
-      .def("op_size", &pd::BlockDesc::OpSize)
-      .def("op", &pd::BlockDesc::Op, pybind11::return_value_policy::reference)
-      .def("serialize_to_string", SerializeMessage<pd::BlockDesc>);
-}
-
-void BindVarDsec(pybind11::module *m) {
-  pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
-  var_desc.def(pybind11::init<const std::string &>())
-      .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference)
-      .def("set_name", &pd::VarDesc::SetName)
-      .def("set_shape", &pd::VarDesc::SetShape)
-      .def("set_shapes", &pd::VarDesc::SetShapes)
-      .def("set_dtype", &pd::VarDesc::SetDataType)
-      .def("set_dtypes", &pd::VarDesc::SetDataTypes)
-      .def("shape", &pd::VarDesc::GetShape,
-           pybind11::return_value_policy::reference)
-      .def("shapes", &pd::VarDesc::GetShapes,
-           pybind11::return_value_policy::reference)
-      .def("dtype", &pd::VarDesc::GetDataType,
-           pybind11::return_value_policy::reference)
-      .def("dtypes", &pd::VarDesc::GetDataTypes,
-           pybind11::return_value_policy::reference)
-      .def("lod_level", &pd::VarDesc::GetLoDLevel)
-      .def("lod_levels", &pd::VarDesc::GetLoDLevels,
-           pybind11::return_value_policy::reference)
-      .def("set_lod_level", &pd::VarDesc::SetLoDLevel)
-      .def("set_lod_levels", &pd::VarDesc::SetLoDLevels)
-      .def("type", &pd::VarDesc::GetType)
-      .def("set_type", &pd::VarDesc::SetType)
-      .def("serialize_to_string", SerializeMessage<pd::VarDesc>)
-      .def("persistable", &pd::VarDesc::Persistable)
-      .def("set_persistable", &pd::VarDesc::SetPersistable);
-
-  pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
-      .value("BOOL", pd::proto::VarType::BOOL)
-      .value("UINT8", pd::proto::VarType::UINT8)
-      .value("INT8", pd::proto::VarType::INT8)
-      .value("INT16", pd::proto::VarType::INT16)
-      .value("INT32", pd::proto::VarType::INT32)
-      .value("INT64", pd::proto::VarType::INT64)
-      .value("FP16", pd::proto::VarType::FP16)
-      .value("FP32", pd::proto::VarType::FP32)
-      .value("FP64", pd::proto::VarType::FP64)
-      .value("LOD_TENSOR", pd::proto::VarType::LOD_TENSOR)
-      .value("SELECTED_ROWS", pd::proto::VarType::SELECTED_ROWS)
-      .value("FEED_MINIBATCH", pd::proto::VarType::FEED_MINIBATCH)
-      .value("FETCH_LIST", pd::proto::VarType::FETCH_LIST)
-      .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES)
-      .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE)
-      .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
-      .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
-      .value("READER", pd::proto::VarType::READER)
-      .value("RAW", pd::proto::VarType::RAW);
-}
-
-void BindOpDesc(pybind11::module *m) {
-  pybind11::enum_<pd::proto::AttrType>(*m, "AttrType", "")
-      .value("INT", pd::proto::AttrType::INT)
-      .value("INTS", pd::proto::AttrType::INTS)
-      .value("LONG", pd::proto::AttrType::LONG)
-      .value("LONGS", pd::proto::AttrType::LONGS)
-      .value("FLOAT", pd::proto::AttrType::FLOAT)
-      .value("FLOATS", pd::proto::AttrType::FLOATS)
-      .value("STRING", pd::proto::AttrType::STRING)
-      .value("STRINGS", pd::proto::AttrType::STRINGS)
-      .value("BOOL", pd::proto::AttrType::BOOLEAN)
-      .value("BOOLS", pd::proto::AttrType::BOOLEANS)
-      .value("BLOCK", pd::proto::AttrType::BLOCK)
-      .value("BLOCKS", pd::proto::AttrType::BLOCKS);
-
-  pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", "");
-  op_desc
-      .def("__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); },
-           pybind11::return_value_policy::reference)
-      .def("copy_from", &pd::OpDesc::CopyFrom)
-      .def("type", &pd::OpDesc::Type)
-      .def("set_type", &pd::OpDesc::SetType)
-      .def("input", &pd::OpDesc::Input)
-      .def("input_names", &pd::OpDesc::InputNames)
-      .def("output", &pd::OpDesc::Output)
-      .def("output_names", &pd::OpDesc::OutputNames)
-      .def("set_input", &pd::OpDesc::SetInput)
-      .def("set_output", &pd::OpDesc::SetOutput)
-      .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
-      .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
-      .def("_rename_input", &pd::OpDesc::RenameInput)
-      .def("_rename_output", &pd::OpDesc::RenameOutput)
-      .def("has_attr", &pd::OpDesc::HasAttr)
-      .def("attr_type", &pd::OpDesc::GetAttrType)
-      .def("attr_names", &pd::OpDesc::AttrNames)
-      .def("_set_attr", &pd::OpDesc::SetAttr)
-      .def("remove_attr", &pd::OpDesc::RemoveAttr)
-      .def("attr", &pd::OpDesc::GetAttr)
-      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
-      .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
-      .def("set_serialized_attr",
-           [](pd::OpDesc &self, const std::string &name,
-              const pybind11::bytes &seriralized) {
-             std::string ser(seriralized);
-             self.SetAttr(name, ser);
-           })
-      .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId)
-      .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
-      .def("check_attrs", &pd::OpDesc::CheckAttrs)
-      .def("infer_shape", &pd::OpDesc::InferShape)
-      .def("infer_var_type", &pd::OpDesc::InferVarType)
-      .def("set_is_target", &pd::OpDesc::SetIsTarget)
-      .def("serialize_to_string", SerializeMessage<pd::OpDesc>)
-      .def("block", [](pd::OpDesc &self) { return self.Block(); },
-           pybind11::return_value_policy::reference);
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/protobuf.h b/paddle/fluid/pybind/protobuf.h
deleted file mode 100644
index e7370672a88fcf9238cc88c6aae65c6ee643746b..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/protobuf.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <Python.h>
-
-#include <fstream>
-#include <vector>
-
-#include "paddle/fluid/platform/variant.h"
-#include "pybind11/numpy.h"
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-namespace paddle {
-namespace pybind {
-
-void BindProgramDesc(pybind11::module* m);
-void BindBlockDesc(pybind11::module* m);
-void BindVarDsec(pybind11::module* m);
-void BindOpDesc(pybind11::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
deleted file mode 100644
index 64413685c6c673708b8246c63beb39acf2bf0f69..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/pybind.cc
+++ /dev/null
@@ -1,1751 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <Python.h>
-#include <algorithm>
-#include <cstdlib>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT // for call_once
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
-#include "paddle/fluid/framework/ir/pass_builder.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/framework/prune.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/scope_pool.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/version.h"
-#include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/py_func_op.h"
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/pybind/box_helper_py.h"
-#include "paddle/fluid/pybind/const_value.h"
-#include "paddle/fluid/pybind/data_set_py.h"
-#include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/fleet_wrapper_py.h"
-#include "paddle/fluid/pybind/imperative.h"
-#include "paddle/fluid/pybind/inference_api.h"
-#include "paddle/fluid/pybind/ir.h"
-
-#ifndef _WIN32
-#include "paddle/fluid/pybind/nccl_wrapper_py.h"
-#endif
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/pybind/protobuf.h"
-#include "paddle/fluid/pybind/pybind.h"  // NOLINT
-#include "paddle/fluid/pybind/reader_py.h"
-#include "paddle/fluid/pybind/tensor_py.h"
-#include "paddle/fluid/string/to_string.h"
-#ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#endif
-#include "paddle/fluid/platform/cuda_profiler.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#endif
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/pybind/communicator_py.h"
-#endif
-
-#include "pybind11/stl.h"
-
-DEFINE_bool(reader_queue_speed_test_mode, false,
-            "If set true, the queue.pop will only get data from queue but not "
-            "remove the data from queue for speed testing");
-DECLARE_bool(use_mkldnn);
-#ifdef PADDLE_WITH_NGRAPH
-DECLARE_bool(use_ngraph);
-#endif
-
-// disable auto conversion to list in Python
-PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
-
-namespace paddle {
-namespace pybind {
-bool IsCompiledWithCUDA() {
-#ifndef PADDLE_WITH_CUDA
-  return false;
-#else
-  return true;
-#endif
-}
-
-bool IsCompiledWithMKLDNN() {
-#ifndef PADDLE_WITH_MKLDNN
-  return false;
-#else
-  return true;
-#endif
-}
-
-bool IsCompiledWithNGRAPH() {
-#ifndef PADDLE_WITH_NGRAPH
-  return false;
-#else
-  return true;
-#endif
-}
-
-bool IsCompiledWithBrpc() {
-#ifndef PADDLE_WITH_DISTRIBUTE
-  return false;
-#endif
-
-#ifdef PADDLE_WITH_GRPC
-  return false;
-#endif
-
-  return true;
-}
-
-bool IsCompiledWithDIST() {
-#ifdef PADDLE_WITH_DISTRIBUTE
-  return true;
-#else
-  return false;
-#endif
-}
-
-template <typename PlaceType1, typename PlaceType2>
-static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
-  return paddle::platform::Place(p1) == paddle::platform::Place(p2);
-}
-
-template <typename PlaceType>
-static inline int PlaceIndex(const PlaceType &p) {
-  return static_cast<int>(paddle::platform::Place(p).which());
-}
-
-#ifdef PADDLE_WITH_AVX
-PYBIND11_MODULE(core_avx, m) {
-#else
-PYBIND11_MODULE(core_noavx, m) {
-#endif
-
-  // Not used, just make sure cpu_info.cc is linked.
-  paddle::platform::CpuTotalPhysicalMemory();
-
-  paddle::memory::allocation::UseAllocatorStrategyGFlag();
-
-  m.doc() = "C++ core of PaddlePaddle";
-
-  // using framework in this function. Since it is inside a function, it will
-  // not cause namespace pollution.
-  using namespace paddle::framework;  // NOLINT
-
-  BindException(&m);
-
-  m.def("set_num_threads", &platform::SetNumThreads);
-
-  m.def(
-      "_append_python_callable_object_and_return_id",
-      [](py::object py_obj) -> size_t {
-        return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj);
-      });
-
-  m.def("_get_use_default_grad_op_desc_maker_ops",
-        [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); });
-
-  // NOTE(zjl): ctest would load environment variables at the beginning even
-  // though we have not `import paddle.fluid as fluid`. So we add this API
-  // to enable eager deletion mode in unittest.
-  m.def("_set_eager_deletion_mode", &paddle::framework::SetEagerDeletionMode);
-
-  m.def("_set_fuse_parameter_group_size",
-        &paddle::framework::ir::SetFuseParameterGroupsSize);
-  m.def("_set_fuse_parameter_memory_size",
-        &paddle::framework::ir::SetFuseParameterMemorySize);
-
-  m.add_object("_cleanup",
-               py::capsule([]() { ScopePool::Instance().Clear(); }));
-
-  m.def("_set_paddle_lib_path", &paddle::platform::dynload::SetPaddleLibPath);
-
-  BindImperative(&m);
-
-  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
-      .def("__array__", [](Tensor &self) { return TensorToPyArray(self); })
-      .def("_is_initialized",
-           [](const Tensor &self) { return self.IsInitialized(); })
-      .def("_get_dims",
-           [](const Tensor &self) { return vectorize(self.dims()); })
-      .def("_set_dims",
-           [](Tensor &self, const std::vector<int64_t> &dim) {
-             self.Resize(make_ddim(dim));
-           })
-      .def("_set_layout",
-           [](Tensor &self, const std::string &layout) {
-             self.set_layout(StringToDataLayout(layout));
-           })
-      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::CUDAPlace &place) {
-             self.mutable_data<float>(place);
-           })
-      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::CPUPlace &place) {
-             self.mutable_data<float>(place);
-           })
-      .def("_alloc_double",
-           [](Tensor &self, paddle::platform::CPUPlace &place) {
-             self.mutable_data<double>(place);
-           })
-      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::CPUPlace &place) {
-             self.mutable_data<int>(place);
-           })
-      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::CUDAPlace &place) {
-             self.mutable_data<int>(place);
-           })
-      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
-             self.mutable_data<int>(place);
-           })
-      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
-             self.mutable_data<float>(place);
-           })
-      .def("_clear", &Tensor::clear)
-      .def("set", PyCPUTensorSetFromArray<float>)
-      .def("set", PyCPUTensorSetFromArray<int>)
-      .def("set", PyCPUTensorSetFromArray<double>)
-      .def("set", PyCPUTensorSetFromArray<int64_t>)
-      .def("set", PyCPUTensorSetFromArray<bool>)
-      .def("set", PyCPUTensorSetFromArray<uint16_t>)
-      .def("set", PyCPUTensorSetFromArray<uint8_t>)
-      .def("set", PyCPUTensorSetFromArray<int8_t>)
-#ifdef PADDLE_WITH_CUDA
-      .def("set", PyCUDATensorSetFromArray<float>)
-      .def("set", PyCUDATensorSetFromArray<int>)
-      .def("set", PyCUDATensorSetFromArray<double>)
-      .def("set", PyCUDATensorSetFromArray<int64_t>)
-      .def("set", PyCUDATensorSetFromArray<bool>)
-      .def("set", PyCUDATensorSetFromArray<uint16_t>)
-      .def("set", PyCUDATensorSetFromArray<uint8_t>)
-      .def("set", PyCUDATensorSetFromArray<int8_t>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<float>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<int>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<double>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<int64_t>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<bool>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
-      .def("set", PyCUDAPinnedTensorSetFromArray<int8_t>)
-#endif
-      .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
-      .def("_set_float_element", TensorSetElement<float>)
-      .def("_get_float_element", TensorGetElement<float>)
-      .def("_set_double_element", TensorSetElement<double>)
-      .def("_get_double_element", TensorGetElement<double>)
-      .def("_place", [](Tensor &self) { return self.place(); })
-      .def("_dtype", [](Tensor &self) { return self.type(); })
-      .def("__getitem__", PySliceTensor, py::return_value_policy::reference)
-      .def("__str__", [](const Tensor &self) {
-        std::stringstream ostr;
-        ostr << self;
-        return ostr.str();
-      });
-
-  py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
-    LoDTensor is a Tensor with optional LoD information.
-
-    np.array(lod_tensor) can convert LoDTensor to numpy array.
-    lod_tensor.lod() can retrieve the LoD information.
-
-    LoD is short for Level of Details and is usually used for varied sequence
-    length. You can skip the following comment if you don't need optional LoD.
-
-    For example, a LoDTensor X can look like the example below. It contains
-    2 sequences. The first has length 2 and the second has length 3, as
-    described by x.lod.
-
-    The first tensor dimension 5=2+3 is calculated from LoD if it's available.
-    It means the total number of sequence element. In X, each element has 2
-    columns, hence [5, 2].
-
-    x.lod  = [[2, 3]]
-
-    x.data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
-
-    x.shape = [5, 2]
-
-    LoD can have multiple levels (for example, a paragraph can have multiple
-    sentences and a sentence can have multiple words). In the following
-    LodTensor Y, the lod_level is 2. It means there are 2 sequence, the
-    first sequence length is 2 (has 2 sub-sequences), the second one's
-    length is 1. The first sequence's 2 sub-sequences have length 2 and 2,
-    respectively. And the second sequence's 1 sub-sequence has length 3.
-
-    y.lod = [[2 1], [2 2 3]]
-
-    y.shape = [2+2+3, ...]
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          t = fluid.LoDTensor()
-
-  Note:
-      In above description, LoD is length-based. In Paddle internal
-      implementation, lod is offset-based. Hence, internally,
-      y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (length-based
-      equivlent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]).
-
-      Sometimes LoD is called recursive_sequence_length to be more
-      self-explanatory. In this case, it must be length-based. Due to history
-      reasons. when LoD is called lod in public API, it might be offset-based.
-      Users should be careful about it.
-        )DOC")
-      .def("__array__", [](Tensor &self) { return TensorToPyArray(self); })
-      .def("__init__",
-           [](LoDTensor &instance, const std::vector<std::vector<size_t>>
-                                       &recursive_sequence_lengths) {
-             LoD new_lod;
-             new_lod.reserve(recursive_sequence_lengths.size());
-             std::copy(recursive_sequence_lengths.begin(),
-                       recursive_sequence_lengths.end(),
-                       std::back_inserter(new_lod));
-             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
-             PADDLE_ENFORCE_EQ(
-                 CheckLoD(new_offset_lod, -1), true,
-                 "the provided recursive_sequence_lengths info is invalid");
-             new (&instance) LoDTensor(new_offset_lod);
-           })
-      .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
-      // We implement offset based LOD in C++ while we use length based with
-      // Python API. So we changed set_lod to set_recursive_sequence_lengths to
-      // avoid misuse.
-      // The discussion is here:
-      // https://github.com/PaddlePaddle/Paddle/issues/10855
-      .def("set_lod",
-           [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
-             // the input lod is offset-based level-of-detail info
-             LoD new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             PADDLE_ENFORCE_EQ(
-                 CheckLoD(new_lod, vectorize(self.dims()).front()), true,
-                 "the provided lod info is invalid");
-             self.set_lod(new_lod);
-           },
-           py::arg("lod"), R"DOC(
-           Set LoD of the LoDTensor.
-
-           Args:
-               lod (List[List[int]]): the lod to be set.
-
-           Examples:
-               .. code-block:: python
-
-                 import paddle.fluid as fluid
-                 import numpy as np
-
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_lod([[0, 2, 5]])
-           )DOC")
-      .def("set_recursive_sequence_lengths",
-           [](LoDTensor &self, const std::vector<std::vector<size_t>>
-                                   &recursive_sequence_lengths) {
-             // the input recursive_sequence_lengths is length-based
-             // level-of-detail info
-             LoD new_lod;
-             new_lod.reserve(recursive_sequence_lengths.size());
-             std::copy(recursive_sequence_lengths.begin(),
-                       recursive_sequence_lengths.end(),
-                       std::back_inserter(new_lod));
-             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
-             PADDLE_ENFORCE_EQ(
-                 CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true,
-                 "the provided recursive_sequence_lengths info is invalid");
-             self.set_lod(new_offset_lod);
-           },
-           py::arg("recursive_sequence_lengths"), R"DOC(
-           Set LoD of the LoDTensor according to recursive sequence length.
-
-           For example, if recursive_sequence_lengths=[[2, 3]], meaning that
-           there are two sequences with length 2 and 3 respectively, the
-           corresponding lod would be [[0, 2, 2+3]], i.e, [[0, 2, 5]].
-
-           Args:
-                recursive_sequence_lengths (List[List[int]]): sequence lengths.
-
-           Examples:
-               .. code-block:: python
-
-                 import paddle.fluid as fluid
-                 import numpy as np
-
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_recursive_sequence_lengths([[2, 3]])
-           )DOC")
-      .def("lod",
-           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-             // output the offset-based lod info
-             LoD lod = self.lod();
-             std::vector<std::vector<size_t>> new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             return new_lod;
-           },
-           R"DOC(
-           Return the LoD of the LoDTensor.
-
-           Returns:
-               out (List[List[int]]): the lod of the LoDTensor.
-
-           Examples:
-               .. code-block:: python
-
-                 import paddle.fluid as fluid
-                 import numpy as np
-
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_lod([[0, 2, 5]])
-                 print(t.lod()) # [[0, 2, 5]]
-           )DOC")
-      // Set above comments of set_lod.
-      .def("recursive_sequence_lengths",
-           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-             // output the length-based lod info
-             LoD lod = ConvertToLengthBasedLoD(self.lod());
-             std::vector<std::vector<size_t>> new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             return new_lod;
-           },
-           R"DOC(
-           Return the sequence length of the LoDTensor corresponding to LoD.
-
-           Returns:
-               out (List[List[int]): the sequence lengths.
-
-           Examples:
-               .. code-block:: python
-
-                 import paddle.fluid as fluid
-                 import numpy as np
-
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_recursive_sequence_lengths([[2, 3]])
-                 print(t.recursive_sequence_lengths()) # [[2, 3]]
-           )DOC")
-      .def("has_valid_recursive_sequence_lengths",
-           [](LoDTensor &self) -> bool {
-             // Check that the lod info is valid and match the outermost
-             // dimension of the LoDTensor data
-             return CheckLoD(self.lod(), vectorize(self.dims()).front());
-           },
-           R"DOC(
-           Check whether the lod of the LoDTensor is valid.
-
-           Returns:
-               out (bool): whether the lod is valid.
-
-           Examples:
-               .. code-block:: python
-
-                 import paddle.fluid as fluid
-                 import numpy as np
-
-                 t = fluid.LoDTensor()
-                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                 t.set_recursive_sequence_lengths([[2, 3]])
-                 print(t.has_valid_recursive_sequence_lengths()) # True
-           )DOC")
-      .def("__getitem__", PySliceTensor, py::return_value_policy::reference,
-           R"DOC(
-           Slice the original Tensor, and remove the LoD information.
-
-           Returns:
-               out (Tensor): new Tensor(NOT LoDTensor).
-           )DOC")
-      .def("__str__",
-           [](const LoDTensor &self) {
-             std::stringstream ostr;
-             ostr << self;
-             return ostr.str();
-           })
-      .def("_copy", [](const LoDTensor &self, const platform::Place &place) {
-        // follow fetch_op's inplementation
-        LoDTensor dst;
-        if (self.IsInitialized() && self.numel() > 0) {
-          TensorCopySync(self, place, &dst);
-        } else {
-          // Not copy, if the src tensor is empty.
-          dst.clear();
-          dst.Resize({0});
-        }
-        dst.set_lod(self.lod());
-        return dst;
-      });
-
-  py::class_<SelectedRows>(m, "SelectedRows")
-      .def("__init__",
-           [](SelectedRows &instance) { new (&instance) SelectedRows(); })
-      .def("__init__",
-           [](SelectedRows &instance, const std::vector<int64_t> rows,
-              const int64_t &height) {
-             new (&instance) SelectedRows(rows, height);
-           })
-      .def("get_tensor",
-           [](SelectedRows &self) { return self.mutable_value(); },
-           py::return_value_policy::reference)
-      .def("numel",
-           [](SelectedRows &self) -> int64_t { return self.value().numel(); })
-      .def("set_height", &SelectedRows::set_height)
-      .def("height", &SelectedRows::height)
-      .def("set_rows",
-           [](SelectedRows &self, std::vector<int64_t> rows) {
-#ifndef PADDLE_WITH_CUDA
-             self.set_rows(rows);
-#else
-        Vector<int64_t> new_rows(rows);
-        self.set_rows(new_rows);
-#endif
-           })
-      .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); })
-      .def("rows", [](SelectedRows &self) {
-        auto rows = self.rows();
-        std::vector<int64_t> new_rows;
-        new_rows.reserve(rows.size());
-        std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
-        return new_rows;
-      });
-
-  py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
-
-All parameter, weight, gradient are variables in Paddle.
-)DOC")
-      .def(py::init<>())
-      .def("is_int", [](const Variable &var) { return var.IsType<int>(); })
-      .def("set_int",
-           [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
-      .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
-      .def("is_float", [](const Variable &var) { return var.IsType<float>(); })
-      .def("set_float",
-           [](Variable &var, float val) -> void {
-             *var.GetMutable<float>() = val;
-           })
-      .def("get_float",
-           [](const Variable &var) -> float { return var.Get<float>(); })
-      .def("get_tensor",
-           [](Variable &self) -> LoDTensor * {
-             return self.GetMutable<LoDTensor>();
-           },
-           py::return_value_policy::reference)
-      .def("get_lod_rank_table",
-           [](Variable &self) { return self.GetMutable<LoDRankTable>(); },
-           py::return_value_policy::reference)
-      .def("get_selected_rows",
-           [](Variable &self) -> SelectedRows * {
-             return self.GetMutable<SelectedRows>();
-           },
-           py::return_value_policy::reference)
-      .def("get_lod_tensor_array",
-           [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
-           py::return_value_policy::reference)
-#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
-      .def("get_communicator",
-           [](Variable &self) -> platform::Communicator * {
-             return self.GetMutable<platform::Communicator>();
-           },
-           py::return_value_policy::reference)
-#endif
-      .def("get_reader",
-           [](Variable &self) -> framework::ReaderHolder * {
-             PADDLE_ENFORCE_EQ(self.IsType<framework::ReaderHolder>(), true);
-             return self.GetMutable<framework::ReaderHolder>();
-           },
-           py::return_value_policy::reference);
-
-  BindReader(&m);
-
-  using LoDTensorBlockingQueue =
-      ::paddle::operators::reader::LoDTensorBlockingQueue;
-  using LoDTensorBlockingQueueHolder =
-      ::paddle::operators::reader::LoDTensorBlockingQueueHolder;
-
-  py::class_<LoDTensorBlockingQueue, std::shared_ptr<LoDTensorBlockingQueue>>(
-      m, "LoDTensorBlockingQueue", "")
-      .def("push",
-           [](LoDTensorBlockingQueue &self,
-              const std::vector<framework::LoDTensor> &lod_tensor_vec) {
-             pybind11::gil_scoped_release release;
-             return self.Push(lod_tensor_vec);
-           })
-      .def("size", &LoDTensorBlockingQueue::Size)
-      .def("capacity", &LoDTensorBlockingQueue::Cap)
-      .def("close", &LoDTensorBlockingQueue::Close)
-      .def("is_closed", &LoDTensorBlockingQueue::IsClosed);
-
-  m.def("init_lod_tensor_blocking_queue",
-        [](Variable &var,
-           size_t capacity) -> std::shared_ptr<LoDTensorBlockingQueue> {
-          VLOG(1) << "init_lod_tensor_blocking_queue";
-          auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
-          holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode);
-          return holder->GetQueue();
-        },
-        py::return_value_policy::copy);
-
-  py::class_<Scope>(m, "_Scope", R"DOC(
-    Scope is an association of a name to Variable. All variables belong to Scope.
-
-    Variables in a parent scope can be retrieved from local scope.
-
-    You need to specify a scope to run a Net, i.e., `exe.Run(&scope)`.
-    One net can run in different scopes and update different variable in the
-    scope.
-
-    You can create var in a scope and get it from the scope.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          # create tensor from a scope and set value to it.
-          param = scope.var('Param').get_tensor()
-          param_array = np.full((height, row_numel), 5.0).astype("float32")
-          param.set(param_array, place)
-
-        )DOC")
-      .def("_remove_from_pool",
-           [](Scope &self) { ScopePool::Instance().Remove(&self); })
-      .def("var",
-           [](Scope &self, const std::string &name) -> Variable * {
-             return self.Var(name);
-           },
-           py::arg("name"),
-           R"DOC(
-           Find or create variable named :code:`name` in the current scope.
-
-           If the variable named :code:`name` does not exist in the
-           current scope, the variable would be created. Otherwise,
-           return the existing variable.
-
-           Args:
-               name (str): the variable name.
-
-           Returns:
-               out (core.Variable): the found or created variable.
-           )DOC",
-           py::return_value_policy::reference)
-      .def("find_var", &Scope::FindVar, py::arg("name"),
-           R"DOC(
-           Find variable named :code:`name` in the current scope or
-           its parent scope. Return None if not found.
-
-           Args:
-               name (str): the variable name.
-
-           Returns:
-               out (core.Variable|None): the found variable or None.
-           )DOC",
-           py::return_value_policy::reference)
-      .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
-           R"DOC(
-           Create a new sub-scope of the current scope.
-
-           Returns:
-               out (core._Scope): the created sub-scope.
-           )DOC",
-           py::return_value_policy::reference)
-      .def("drop_kids", &Scope::DropKids,
-           R"DOC(
-           Delete all sub-scopes of the current scope.
-           )DOC")
-      .def("_kids", &Scope::kids);
-
-  m.def("Scope",
-        []() -> Scope * {
-          auto *s = new Scope();
-          ScopePool::Instance().Insert(std::unique_ptr<Scope>(s));
-          return s;
-        },
-        R"DOC(
-        Create a new scope.
-
-        Returns:
-            out (core._Scope): the created scope.
-        )DOC",
-        py::return_value_policy::reference);
-
-  //! @note: Be careful! PyBind will return std::string as an unicode, not
-  //! Python str. If you want a str object, you should cast them in Python.
-  m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
-    std::vector<py::bytes> ret_values;
-    for (auto &iter : OpInfoMap::Instance().map()) {
-      auto &info = iter.second;
-      if (info.HasOpProtoAndChecker()) {
-        std::string str;
-        PADDLE_ENFORCE_EQ(
-            info.Proto().SerializeToString(&str), true,
-            "Serialize OpProto Error. This could be a bug of Paddle.");
-        ret_values.emplace_back(str);
-      }
-    }
-    return ret_values;
-  });
-  m.def(
-      "get_grad_op_desc", [](const OpDesc &op_desc,
-                             const std::unordered_set<std::string> &no_grad_set,
-                             const std::vector<BlockDesc *> &grad_sub_block) {
-        std::unordered_map<std::string, std::string> grad_to_var;
-        std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
-            framework::OpInfoMap::Instance()
-                .Get(op_desc.Type())
-                .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
-                               grad_sub_block);
-        std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
-        std::transform(grad_op_descs.begin(), grad_op_descs.end(),
-                       grad_op_desc_ptrs.begin(),
-                       [](std::unique_ptr<OpDesc> &p) { return p.release(); });
-        return std::make_pair(grad_op_desc_ptrs, grad_to_var);
-      });
-  m.def("has_grad_op_maker", [](const std::string op_type) {
-    return framework::OpInfoMap::Instance().Get(op_type).HasGradOpMaker();
-  });
-  m.def("has_infer_inplace", [](const std::string op_type) {
-    return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace();
-  });
-  m.def("get_flags_use_mkldnn", []() { return FLAGS_use_mkldnn; });
-#ifdef PADDLE_WITH_NGRAPH
-  m.def("get_flags_use_ngraph", []() { return FLAGS_use_ngraph; });
-#endif
-
-  m.def("prune", [](const ProgramDesc &origin,
-                    const std::set<std::string> &feeded_var_names,
-                    const std::vector<std::array<size_t, 2>> &targets) {
-    ProgramDesc prog_with_targets(origin);
-
-    for (const auto &t : targets) {
-      prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true);
-    }
-    proto::ProgramDesc pruned_desc;
-    Prune(*prog_with_targets.Proto(), feeded_var_names, &pruned_desc);
-    return new ProgramDesc(pruned_desc);
-  });
-  m.def("prune_backward", [](const framework::ProgramDesc &program) {
-    return PruneBackward(program);
-  });
-  m.def("empty_var_name",
-        []() { return std::string(framework::kEmptyVarName); });
-  m.def("grad_var_suffix",
-        []() { return std::string(framework::kGradVarSuffix); });
-  m.def_submodule(
-       "var_names",
-       "The module will return special predefined variable name in Paddle")
-      .def("empty", []() { return kEmptyVarName; })
-      .def("temp", []() { return kTempVarName; });
-  // clang-format off
-  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
-      .def_static("create",
-                  [](paddle::platform::CPUPlace& place)
-                      -> paddle::platform::DeviceContext* {
-                    return new paddle::platform::CPUDeviceContext();
-                  })
-      .def_static("create",
-                  [](paddle::platform::CUDAPlace& place)
-                      -> paddle::platform::DeviceContext* {
-#ifndef PADDLE_WITH_CUDA
-                    PADDLE_THROW("CUDAPlace is not supported in CPU device.");
-#else
-                    return new paddle::platform::CUDADeviceContext(place);
-#endif
-                  })
-          .def_static("create",
-                [](paddle::platform::CUDAPinnedPlace& place)
-                        -> paddle::platform::DeviceContext* {
-#ifndef PADDLE_WITH_CUDA
-                  PADDLE_THROW(
-                        "CUDAPinnedPlace is not supported in CPU device.");
-#else
-                  return new paddle::platform::CUDAPinnedDeviceContext(place);
-#endif
-                });;
-// clang-format on
-#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
-  py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
-#endif
-  py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
-    CUDAPlace is a descriptor of a device. It represents a GPU, and each CUDAPlace
-    has a dev_id to indicate the number of cards represented by the current CUDAPlace.
-    The memory of CUDAPlace with different dev_id is not accessible.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          gpu_place = fluid.CUDAPlace(0)
-
-        )DOC")
-      .def("__init__",
-           [](platform::CUDAPlace &self, int dev_id) {
-#ifdef PADDLE_WITH_CUDA
-             if (UNLIKELY(dev_id < 0)) {
-               LOG(ERROR) << string::Sprintf(
-                   "Invalid CUDAPlace(%d), device id must be 0 or "
-                   "positive integer",
-                   dev_id);
-               std::exit(-1);
-             }
-
-             if (UNLIKELY(dev_id >= platform::GetCUDADeviceCount())) {
-               if (platform::GetCUDADeviceCount() == 0) {
-                 LOG(ERROR) << "Cannot use GPU because there is no GPU "
-                               "detected on your "
-                               "machine.";
-                 std::exit(-1);
-               } else {
-                 LOG(ERROR) << string::Sprintf(
-                     "Invalid CUDAPlace(%d), must inside [0, %d), because GPU "
-                     "number on your machine is %d",
-                     dev_id, platform::GetCUDADeviceCount(),
-                     platform::GetCUDADeviceCount());
-                 std::exit(-1);
-               }
-             }
-
-             new (&self) platform::CUDAPlace(dev_id);
-#else
-             LOG(ERROR) << string::Sprintf(
-                 "Cannot use GPU because you have installed CPU version "
-                 "PaddlePaddle.\n"
-                 "If you want to use GPU, please try to install GPU version "
-                 "PaddlePaddle by: pip install paddlepaddle-gpu\n"
-                 "If you only have CPU, please change CUDAPlace(%d) to be "
-                 "CPUPlace().\n",
-                 dev_id);
-             std::exit(-1);
-#endif
-           })
-      .def("_type", &PlaceIndex<platform::CUDAPlace>)
-      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
-      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
-      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
-      .def("_equals",
-           &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
-      .def("__str__", string::to_string<const platform::CUDAPlace &>);
-
-  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
-    CPUPlace is a descriptor of a device. It represents a CPU, and the memory
-    CPUPlace can be accessed by CPU.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          cpu_place = fluid.CPUPlace()
-
-        )DOC")
-      .def(py::init<>())
-      .def("_type", &PlaceIndex<platform::CPUPlace>)
-      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
-      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
-      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
-      .def("_equals",
-           &IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
-      .def("__str__", string::to_string<const platform::CPUPlace &>);
-
-  py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace", R"DOC(
-    CUDAPinnedPlace is a descriptor of a device. The memory of CUDAPinnedPlace
-    can be accessed by GPU and CPU.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          place = fluid.CUDAPinnedPlace()
-
-        )DOC")
-      .def("__init__",
-           [](platform::CUDAPinnedPlace &self) {
-#ifndef PADDLE_WITH_CUDA
-             PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version");
-#endif
-             new (&self) platform::CUDAPinnedPlace();
-           })
-      .def("_type", &PlaceIndex<platform::CUDAPinnedPlace>)
-      .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
-      .def("_equals",
-           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
-      .def("_equals",
-           &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
-      .def("_equals",
-           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>)
-      .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
-
-  py::class_<platform::Place>(m, "Place")
-      .def(py::init<>())
-      .def("_type", &PlaceIndex<platform::Place>)
-      .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
-      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
-      .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
-      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
-      .def("is_gpu_place",
-           [](platform::Place &self) { return platform::is_gpu_place(self); })
-      .def("is_cpu_place",
-           [](platform::Place &self) { return platform::is_cpu_place(self); })
-      .def("is_cuda_pinned_place",
-           [](platform::Place &self) {
-             return platform::is_cuda_pinned_place(self);
-           })
-      .def("gpu_device_id",
-           [](platform::Place &self) {
-             return boost::get<platform::CUDAPlace>(self).device;
-           })
-      .def("set_place", [](platform::Place &self,
-                           const platform::Place &other) { self = other; })
-      .def("set_place",
-           [](platform::Place &self, const platform::CPUPlace &cpu_place) {
-             self = cpu_place;
-           })
-      .def("set_place",
-           [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
-             self = gpu_place;
-           })
-      .def("set_place", [](platform::Place &self,
-                           const platform::CUDAPinnedPlace &cuda_pinned_place) {
-        self = cuda_pinned_place;
-      });
-
-  py::class_<OperatorBase>(m, "Operator")
-      .def_static(
-          "create",
-          [](py::bytes protobin) {
-            proto::OpDesc desc;
-            PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true,
-                              "Cannot parse user input to OpDesc");
-            PADDLE_ENFORCE_EQ(desc.IsInitialized(), true,
-                              "User OpDesc is not initialized, reason %s",
-                              desc.InitializationErrorString());
-            return OpRegistry::CreateOp(desc);
-          })
-      .def("run",
-           [](OperatorBase &self, const Scope &scope,
-              const platform::CPUPlace &place) { self.Run(scope, place); })
-      .def("run",
-           [](OperatorBase &self, const Scope &scope,
-              const platform::CUDAPlace &place) { self.Run(scope, place); })
-      .def("run",
-           [](OperatorBase &self, const Scope &scope,
-              const platform::CUDAPinnedPlace &place) {
-             self.Run(scope, place);
-           })
-      .def("type",
-           [](const OperatorBase &op) -> std::string { return op.Type(); })
-      .def("outputs",
-           [](const OperatorBase &op)
-               -> std::map<std::string, std::vector<std::string>> {
-                 return op.Outputs();
-               })
-      .def("output_vars",
-           [](const OperatorBase &op) { return op.OutputVars(true); })
-      .def("inputs", [](const OperatorBase &op) { return op.Inputs(); })
-      .def("input_vars", [](const OperatorBase &op) { return op.InputVars(); })
-      .def("__str__", &OperatorBase::DebugString)
-      .def("no_intermediate_outputs",
-           [](const OperatorBase &op) { return op.OutputVars(false); })
-      .def("support_gpu", &OperatorBase::SupportGPU);
-
-  py::class_<framework::ExecutorPrepareContext>(m, "ExecutorPrepareContext")
-      .def(py::init<const ProgramDesc &, size_t>());
-
-  py::class_<framework::Executor>(m, "Executor")
-      .def(py::init<const platform::Place &>())
-      .def("close", &Executor::Close)
-      .def("run_from_dataset", &Executor::RunFromDataset,
-           py::call_guard<py::gil_scoped_release>())
-      .def("run_prepared_ctx",
-           [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
-              std::map<std::string, const LoDTensor *> *feed_targets,
-              std::map<std::string, LoDTensor *> *fetch_targets,
-              bool create_local_scope = true, bool create_vars = true,
-              const std::string &feed_holder_name = "feed",
-              const std::string &fetch_holder_name = "fetch") {
-             pybind11::gil_scoped_release release;
-             self.RunPreparedContext(ctx, scope, feed_targets, fetch_targets,
-                                     create_local_scope, create_vars,
-                                     feed_holder_name, fetch_holder_name);
-           })
-      .def("run_cached_prepared_ctx",
-           [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
-              bool create_local_scope = true, bool create_vars = true,
-              bool keep_kids = false) {
-             pybind11::gil_scoped_release release;
-             self.RunPreparedContext(ctx, scope, create_local_scope,
-                                     create_vars, keep_kids);
-           })
-      .def("prepare_ctx_cache", &Executor::PrepareCtxCache,
-           py::call_guard<py::gil_scoped_release>())
-      .def("create_variables", &Executor::CreateVariables,
-           py::call_guard<py::gil_scoped_release>())
-      .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
-                     int block_id, bool create_local_scope, bool create_vars,
-                     const std::vector<std::string> &fetch_vars) {
-        pybind11::gil_scoped_release release;
-        self.Run(prog, scope, block_id, create_local_scope, create_vars,
-                 fetch_vars);
-      });
-
-  m.def("init_gflags", framework::InitGflags);
-  m.def("init_glog", framework::InitGLOG);
-  m.def("init_dgc", framework::InitDGC);
-  m.def("init_devices",
-        [](bool init_p2p) { framework::InitDevices(init_p2p); });
-
-  m.def("is_compiled_with_ngraph", IsCompiledWithNGRAPH);
-  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
-  m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
-  m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
-  m.def("is_compiled_with_dist", IsCompiledWithDIST);
-#ifdef PADDLE_WITH_CUDA
-  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
-    // Only GPUs with Compute Capability >= 53 support float16
-    return platform::GetCUDAComputeCapability(place.device) >= 53;
-  });
-#endif
-
-  m.def("set_feed_variable", framework::SetFeedVariable);
-  m.def("get_fetch_variable", framework::GetFetchVariable);
-  m.def("get_variable_tensor", framework::GetVariableTensor);
-
-  m.def("_is_program_version_supported", IsProgramVersionSupported);
-
-  BindProgramDesc(&m);
-  BindBlockDesc(&m);
-  BindVarDsec(&m);
-  BindOpDesc(&m);
-  BindConstValue(&m);
-
-  py::class_<framework::LoDRankTable>(m, "LodRankTable")
-      .def("items", [](framework::LoDRankTable &table) {
-        std::vector<std::pair<size_t, size_t>> res;
-        for (auto &item : table.items()) {
-          res.push_back({item.index, item.length});
-        }
-        return res;
-      });
-
-  py::class_<LoDTensorArray>(m, "LoDTensorArray", R"DOC(
-    Array of LoDTensor.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          arr = fluid.LoDTensorArray()
-)DOC")
-      .def("__init__",
-           [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); })
-      .def("__getitem__",
-           [](LoDTensorArray &self, size_t i) { return &self.at(i); },
-           py::return_value_policy::reference)
-      .def("__len__", [](LoDTensorArray &self) { return self.size(); })
-      .def("__setitem__",
-           [](LoDTensorArray &self, size_t i, const LoDTensor &t) {
-             PADDLE_ENFORCE_LT(i, self.size());
-             self[i].ShareDataWith(t);
-             self[i].set_lod(t.lod());
-           })
-      .def("append",
-           [](LoDTensorArray &self, const LoDTensor &t) {
-             self.emplace_back();
-             self.back().ShareDataWith(t);
-             self.back().set_lod(t.lod());
-           },
-           py::arg("tensor"), R"DOC(
-             Append a LoDensor to LoDTensorArray.
-
-             Examples:
-                 .. code-block:: python
-
-                   import paddle.fluid as fluid
-                   import numpy as np
-
-                   arr = fluid.LoDTensorArray()
-                   t = fluid.LoDTensor()
-                   t.set(np.ndarray([5, 30]), fluid.CPUPlace())
-                   arr.append(t)
-           )DOC")
-      .def("_move_to_list",
-           [](LoDTensorArray &self) -> py::list {
-             py::list res(self.size());
-             for (size_t i = 0; i < self.size(); ++i) {
-               res[i] = py::cast(std::move(self[i]));
-             }
-             self.clear();
-             return res;
-           },
-           py::return_value_policy::take_ownership);
-
-  m.def("op_support_gpu", OpSupportGPU);
-#ifdef PADDLE_WITH_CUDA
-  m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
-
-#ifndef _WIN32
-  m.def("nvprof_init", platform::CudaProfilerInit);
-  m.def("nvprof_start", platform::CudaProfilerStart);
-  m.def("nvprof_stop", platform::CudaProfilerStop);
-#endif
-#endif
-
-  py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
-      .value("kDisabled", platform::ProfilerState::kDisabled)
-      .value("kCPU", platform::ProfilerState::kCPU)
-      .value("kCUDA", platform::ProfilerState::kCUDA)
-      .value("kAll", platform::ProfilerState::kAll)
-      .export_values();
-
-  py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
-      .value("kDefault", platform::EventSortingKey::kDefault)
-      .value("kCalls", platform::EventSortingKey::kCalls)
-      .value("kTotal", platform::EventSortingKey::kTotal)
-      .value("kMin", platform::EventSortingKey::kMin)
-      .value("kMax", platform::EventSortingKey::kMax)
-      .value("kAve", platform::EventSortingKey::kAve)
-      .export_values();
-
-  m.def("enable_profiler", platform::EnableProfiler);
-  m.def("disable_profiler", platform::DisableProfiler);
-  m.def("is_profiler_enabled", platform::IsProfileEnabled);
-  m.def("reset_profiler", platform::ResetProfiler);
-  m.def("get_pass", [](const std::string &pass_type) {
-    auto pass = framework::ir::PassRegistry::Instance().Get(pass_type);
-    return std::shared_ptr<framework::ir::Pass>(std::move(pass));
-  });
-
-  m.def("size_of_dtype", framework::SizeOfType);
-
-  using VarQuantScale =
-      std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
-
-  py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
-  pass.def(py::init())
-      .def("has", &ir::Pass::Has)
-      .def("set_not_owned",
-           [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
-             self.SetNotOwned<ProgramDesc>(attr_name, &attr);
-           })
-      .def(
-          "set",
-          [](ir::Pass &self, const std::string &name, const std::string &attr) {
-            self.Set<std::string>(name, new std::string(attr));
-          })
-      .def("set", [](ir::Pass &self, const std::string &name,
-                     int val) { self.Set<const int>(name, new int(val)); })
-      .def("set",
-           [](ir::Pass &self, const std::string &name,
-              std::unordered_set<std::string> set) {
-             self.Set(name, new std::unordered_set<std::string>(set));
-           })
-      .def("set",
-           [](ir::Pass &self, const std::string &name,
-              std::unordered_set<int> set) {
-             self.Set(name, new std::unordered_set<int>(set));
-           })
-      .def("set",
-           [](ir::Pass &self, const std::string &name, VarQuantScale scales) {
-             self.Set(name, new VarQuantScale(scales));
-           })
-      .def("type", &ir::Pass::Type)
-      .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
-        self.Apply(graph.get());
-      });
-
-  py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
-      m, "PassBuilder");
-  pb.def(py::init())
-      .def("append_pass",
-           [](ir::PassBuilder &self,
-              const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
-             return self.AppendPass(pass_type);
-           })
-      .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
-      .def("insert_pass",
-           [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
-             return self.InsertPass(idx, pass_type);
-           })
-      .def("remove_pass",
-           [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
-
-  // -- python binds for parallel executor.
-
-  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
-  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
-    ExecutionStrategy allows the user to more preciously control how to run
-    the program in ParallelExecutor by setting the property.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-          y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-          y_predict = fluid.layers.fc(input=x, size=1, act=None)
-
-          cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-          avg_loss = fluid.layers.mean(cost)
-
-          sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-          sgd_optimizer.minimize(avg_loss)
-
-          exec_strategy = fluid.ExecutionStrategy()
-          exec_strategy.num_threads = 4
-
-          train_exe = fluid.ParallelExecutor(use_cuda=False,
-                                             loss_name=avg_loss.name,
-                                             exec_strategy=exec_strategy)
-
-        )DOC");
-
-  exec_strategy.def(py::init())
-      .def_property(
-          "num_threads",
-          [](const ExecutionStrategy &self) { return self.num_threads_; },
-          [](ExecutionStrategy &self, size_t num_threads) {
-            self.num_threads_ = num_threads;
-          },
-          R"DOC(The type is INT, num_threads represents the size of thread pool that
-            used to run the operators of the current program in ParallelExecutor.
-            If :math:`num\_threads=1`, all the operators will execute one by one,
-            but the order maybe difference between iterations.
-            If it is not set, it will be set in ParallelExecutor according to the
-            device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
-            :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
-            if it is not set, ParallelExecutor will get the cpu count by calling
-            `multiprocessing.cpu_count()`. Default 0.)DOC")
-      .def_property(
-          "use_cuda",
-          [](const ExecutionStrategy &self) { return self.use_cuda_; },
-          [](ExecutionStrategy &self, bool use_cuda) {
-            self.use_cuda_ = use_cuda;
-          })  // FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may
-      // make user confuse, because ParallelExecutor has a parameter named
-      // 'use_cuda' too, in current implementation, ParallelExecutor's
-      // 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'.
-      .def_property(
-          "allow_op_delay",
-          [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
-          [](ExecutionStrategy &self, bool allow_op_delay) {
-            self.allow_op_delay_ = allow_op_delay;
-          },
-          R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
-                communication operators to run, it may make the execution faster.
-                Note that this option is invalid now, and it will be removed in
-                next version. Default False.)DOC")
-      .def_property(
-          "num_iteration_per_drop_scope",
-          [](const ExecutionStrategy &self) {
-            return self.num_iteration_per_drop_scope_;
-          },
-          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
-            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
-          },
-          R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
-                many iterations to clean up the temp variables which
-                is generated during execution. It may make the execution faster,
-                because the temp variable's shape maybe the same between two iterations.
-                Default 1.
-
-                NOTES:
-                    1. If you fetch data when calling the 'run', the ParallelExecutor
-                       will clean up the temp variables at the end of the current iteration.
-                    2. In some NLP model, it may cause the GPU memory is insufficient,
-                       in this case, you should reduce `num_iteration_per_drop_scope`.
-              )DOC")
-      .def_property(
-          "num_iteration_per_run",
-          [](const ExecutionStrategy &self) {
-            return self.num_iteration_per_run_;
-          },
-          [](ExecutionStrategy &self, size_t num_iteration_per_run) {
-            self.num_iteration_per_run_ = num_iteration_per_run;
-          },
-          R"DOC(This config that how many iteration the executor will run when
-                user call pe.run() in python
-              )DOC")
-      .def_property("_dry_run",
-                    [](const ExecutionStrategy &self) { return self.dry_run_; },
-                    [](ExecutionStrategy &self, bool dry_run) {
-                      self.dry_run_ = dry_run;
-                    });
-
-  exec_strategy.def_property(
-      "use_experimental_executor",
-      [](const ExecutionStrategy &self) {
-        return self.type_ == ExecutionStrategy::kExperimental;
-      },
-      [](ExecutionStrategy &self, bool experimental) {
-        self.type_ = experimental ? ExecutionStrategy::kExperimental
-                                  : ExecutionStrategy::kDefault;
-      });
-
-  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
-    BuildStrategy allows the user to more preciously control how to
-    build the SSA Graph in ParallelExecutor by setting the property.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-)DOC");
-
-  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
-      .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
-      .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce);
-  py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
-                                                  "GradientScaleStrategy")
-      .value("CoeffNumDevice",
-             BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
-      .value("One", BuildStrategy::GradientScaleStrategy::kOne)
-      .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
-
-  build_strategy.def(py::init())
-      .def_property(
-          "reduce_strategy",
-          [](const BuildStrategy &self) { return self.reduce_; },
-          [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.reduce_ = strategy;
-          },
-          R"DOC(The type is fluid.BuildStrategy.ReduceStrategy, there are two reduce
-                strategies in ParallelExecutor, AllReduce and Reduce. If you want
-                that all the parameters' optimization are done on all devices independently,
-                you should choose AllReduce; if you choose Reduce, all the parameters'
-                optimization will be evenly distributed to different devices, and then
-                broadcast the optimized parameter to other devices.
-                Default 'AllReduce'.
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-                  )DOC")
-      .def_property(
-          "gradient_scale_strategy",
-          [](const BuildStrategy &self) { return self.gradient_scale_; },
-          [](BuildStrategy &self,
-             BuildStrategy::GradientScaleStrategy strategy) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finalized.");
-            self.gradient_scale_ = strategy;
-          },
-          R"DOC(The type is fluid.BuildStrategy.GradientScaleStrategy, there are three
-                ways of defining :math:`loss@grad` in ParallelExecutor, CoeffNumDevice,
-                One and Customized. By default, ParallelExecutor sets the :math:`loss@grad`
-                according to the number of devices. If you want to customize :math:`loss@grad`,
-                you can choose Customized. Default 'CoeffNumDevice'.
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle.fluid as fluid
-                        import paddle.fluid.compiler as compiler
-                        import numpy
-                        import os
-
-                        use_cuda = True
-                        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-                        exe = fluid.Executor(place)
-
-                        # NOTE: If you use CPU to run the program, you need
-                        # to specify the CPU_NUM, otherwise, fluid will use
-                        # all the number of the logic core as the CPU_NUM,
-                        # in that case, the batch size of the input should be
-                        # greater than CPU_NUM, if not, the process will be
-                        # failed by an exception.
-                        if not use_cuda:
-                            os.environ['CPU_NUM'] = str(2)
-                            places = fluid.cpu_places()
-                        else:
-                            places = places = fluid.cuda_places()
-
-                        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-                        hidden = fluid.layers.fc(input=data, size=10)
-                        loss = fluid.layers.mean(hidden)
-                        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
-
-                        fluid.default_startup_program().random_seed=1
-                        exe.run(fluid.default_startup_program())
-
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.gradient_scale_strategy = \
-                                 fluid.BuildStrategy.GradientScaleStrategy.Customized
-                        compiled_prog = compiler.CompiledProgram(
-                                 fluid.default_main_program()).with_data_parallel(
-                                          loss_name=loss.name, build_strategy=build_strategy,
-                                          places = places)
-
-                        dev_count =  len(places)
-                        x = numpy.random.random(size=(10, 1)).astype('float32')
-                        loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
-                        loss_grad_name = loss.name+"@GRAD"
-                        loss_data = exe.run(compiled_prog,
-                                             feed={"X": x, loss_grad_name : loss_grad},
-                                             fetch_list=[loss.name, loss_grad_name])
-                   )DOC")
-      .def_property(
-          "debug_graphviz_path",
-          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
-          [](BuildStrategy &self, const std::string &path) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.debug_graphviz_path_ = path;
-          },
-          R"DOC(The type is STR, debug_graphviz_path indicates the path that
-                writing the SSA Graph to file in the form of graphviz.
-                It is useful for debugging. Default ""
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.debug_graphviz_path = "./graph"
-
-                    )DOC")
-      .def_property(
-          "enable_sequential_execution",
-          [](const BuildStrategy &self) {
-            return self.enable_sequential_execution_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.enable_sequential_execution_ = b;
-          },
-          R"DOC(The type is BOOL. If set True, the execution order of ops would
-                be the same as what is in the program. Default False.
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.enable_sequential_execution = True
-          )DOC")
-      .def_property(
-          "remove_unnecessary_lock",
-          [](const BuildStrategy &self) {
-            return self.remove_unnecessary_lock_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.remove_unnecessary_lock_ = b;
-          },
-          R"DOC(The type is BOOL. If set True, some locks in GPU ops would be
-                released and ParallelExecutor would run faster. Default True.
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.remove_unnecessary_lock = True
-          )DOC")
-      .def_property(
-          "num_trainers",
-          [](const BuildStrategy &self) { return self.num_trainers_; },
-          [](BuildStrategy &self, int num_trainers) {
-#ifdef WIN32
-            PADDLE_THROW("Windows has NO support to distribute mode.");
-#endif
-            self.num_trainers_ = num_trainers;
-          })
-      .def_property(
-          "trainers_endpoints",
-          [](const BuildStrategy &self) { return self.trainers_endpoints_; },
-          [](BuildStrategy &self,
-             const std::vector<std::string> &trainers_endpoints) {
-            self.trainers_endpoints_ = trainers_endpoints;
-          })
-      .def_property("trainer_id",
-                    [](const BuildStrategy &self) { return self.trainer_id_; },
-                    [](BuildStrategy &self, int trainer_id) {
-                      self.trainer_id_ = trainer_id;
-                    })
-      .def_property(
-          "nccl_comm_num",
-          [](const BuildStrategy &self) { return self.nccl_comm_num_; },
-          [](BuildStrategy &self, int nccl_comm_num) {
-            self.nccl_comm_num_ = nccl_comm_num;
-          })
-      .def_property("use_hierarchical_allreduce",
-                    [](const BuildStrategy &self) {
-                      return self.use_hierarchical_allreduce_;
-                    },
-                    [](BuildStrategy &self, bool use) {
-                      self.use_hierarchical_allreduce_ = use;
-                    })
-      .def_property("hierarchical_allreduce_inter_nranks",
-                    [](const BuildStrategy &self) {
-                      return self.hierarchical_allreduce_inter_nranks_;
-                    },
-                    [](BuildStrategy &self, int nranks) {
-                      self.hierarchical_allreduce_inter_nranks_ = nranks;
-                    })
-
-      .def_property(
-          "fuse_elewise_add_act_ops",
-          [](const BuildStrategy &self) {
-            return self.fuse_elewise_add_act_ops_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.fuse_elewise_add_act_ops_ = b;
-          },
-          R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
-                to fuse elementwise_add_op and activation_op,
-                it may make the execution faster. Default False
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.fuse_elewise_add_act_ops = True
-                     )DOC")
-      .def_property(
-          "fuse_relu_depthwise_conv",
-          [](const BuildStrategy &self) {
-            return self.fuse_relu_depthwise_conv_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.fuse_relu_depthwise_conv_ = b;
-          },
-          R"DOC(The type is BOOL, fuse_relu_depthwise_conv indicate whether
-                to fuse relu and depthwise_conv2d,
-                it will save GPU memory and may make the execution faster.
-                This options is only available in GPU devices.
-                Default False.
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.fuse_relu_depthwise_conv = True
-          )DOC")
-      .def_property("fuse_broadcast_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_broadcast_ops_ == true ||
-                             self.fuse_broadcast_ops_ == boost::none;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                                        "BuildStrategy is finlaized.");
-                      self.fuse_broadcast_ops_ = b;
-                    },
-                    R"DOC(The type is BOOL, fuse_broadcast_op indicates whether
-                      to fuse the broadcast ops. Note that, in Reduce mode,
-                      fusing broadcast ops may make the program faster. Because
-                      fusing broadcast OP equals delaying the execution of all
-                      broadcast Ops, in this case, all nccl streams are used only
-                      for NCCLReduce operations for a period of time. Default False.)DOC")
-      .def_property("fuse_all_optimizer_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_all_optimizer_ops_ == true ||
-                             self.fuse_all_optimizer_ops_ == boost::none;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                                        "BuildStrategy is finlaized.");
-                      self.fuse_all_optimizer_ops_ = b;
-                    })
-      .def_property(
-          "sync_batch_norm",
-          [](const BuildStrategy &self) { return self.sync_batch_norm_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
-                              "BuildStrategy is finlaized.");
-            self.sync_batch_norm_ = b;
-          },
-          R"DOC(The type is BOOL, sync_batch_norm indicates whether to use
-                synchronous batch normalization which synchronizes the mean
-                and variance through multi-devices in training phase.
-
-                Current implementation doesn't support FP16 training and CPU.
-                And only synchronous on one machine, not all machines.
-
-                Default False
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle.fluid as fluid
-                        build_strategy = fluid.BuildStrategy()
-                        build_strategy.sync_batch_norm = True
-                )DOC")
-      .def_property(
-          "memory_optimize",
-          [](const BuildStrategy &self) -> py::object {
-            if (self.memory_optimize_) {
-              return py::cast(self.memory_optimize_.get());
-            } else {
-              return py::cast(nullptr);
-            }
-          },
-          [](BuildStrategy &self, const py::handle &value) {
-            auto *py_obj = value.ptr();
-            if (py_obj == nullptr || py_obj == Py_None) {
-              self.memory_optimize_ = boost::none;
-            } else if (PyBool_Check(py_obj)) {
-              self.memory_optimize_ = (py_obj == Py_True);
-            } else {
-              PADDLE_THROW(
-                  "BuildStrategy.memory_optimize must be None, False or True");
-            }
-          },
-          R"DOC(The type is BOOL or None, memory opitimize aims to save total memory
-                consumption, set to True to enable it.
-
-                Default None. None means framework would choose to use or not use 
-                this strategy automatically. Currently, None means that it is 
-                enabled when GC is disabled, and disabled when GC is enabled. 
-                True means enabling and False means disabling. Default None.)DOC")
-      .def_property(
-          "is_distribution",
-          [](const BuildStrategy &self) { return self.is_distribution_; },
-          [](BuildStrategy &self, bool b) {
-#ifdef WIN32
-            if (b) {
-              PADDLE_THROW("Windows has NO support to distribute mode.");
-            }
-#else
-            self.is_distribution_ = b;
-#endif
-          })
-      .def_property("async_mode",
-                    [](const BuildStrategy &self) { return self.async_mode_; },
-                    [](BuildStrategy &self, bool b) { self.async_mode_ = b; })
-      .def_property(
-          "enable_inplace",
-          [](const BuildStrategy &self) { return self.enable_inplace_; },
-          [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
-      .def_property(
-          "fuse_all_reduce_ops",
-          [](const BuildStrategy &self) {
-            return self.fuse_all_reduce_ops_ == true ||
-                   self.fuse_all_reduce_ops_ == boost::none;
-          },
-          [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
-      .def_property("enable_backward_optimizer_op_deps",
-                    [](const BuildStrategy &self) {
-                      return self.enable_backward_optimizer_op_deps_;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      self.enable_backward_optimizer_op_deps_ = b;
-                    })
-      .def_property(
-          "cache_runtime_context",
-          [](const BuildStrategy &self) { return self.cache_runtime_context_; },
-          [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
-      .def_property(
-          "mkldnn_enabled_op_types",
-          [](const BuildStrategy &self) {
-            return self.mkldnn_enabled_op_types_;
-          },
-          [](BuildStrategy &self,
-             const std::unordered_set<std::string> &mkldnn_enabled_op_types) {
-            self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types;
-          })
-      .def("_finalize_strategy_and_create_passes",
-           [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
-             return self.CreatePassesFromStrategy(true);
-           },
-           R"DOC(Allow user to customized passes. Normally model-specific
-                optimization passes should be defined in this way. BuildStrategy
-                cannot be updated after being finalized.)DOC");
-
-  pe.def(py::init<const std::vector<platform::Place> &,
-                  const std::vector<std::string> &, const std::string &,
-                  Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
-                  const BuildStrategy &, ir::Graph *>())
-      // NOTE: even we return a vec<Scope*>* to Python use reference policy.
-      // We still cannot get local_scope from this vector, since the element
-      // of vec<Scope*> will be freed by Python GC. We can only return Scope*
-      // one by one and mark them as reference.
-      .def("local_scopes",
-           [](ParallelExecutor &self) -> std::vector<Scope *> * {
-             return &self.GetLocalScopes();
-           },
-           py::return_value_policy::reference)
-      .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes)
-      .def("_need_create_local_exe_scopes",
-           &ParallelExecutor::NeedCreateLocalExeScope)
-      .def("feed_tensors_into_local_scopes",
-           &ParallelExecutor::FeedTensorsIntoLocalScopes)
-      .def("feed_and_split_tensor_into_local_scopes",
-           &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
-      .def("run", [](ParallelExecutor &self,
-                     const std::vector<std::string> &fetch_tensors) {
-        pybind11::gil_scoped_release release;
-        return self.Run(fetch_tensors);
-      });
-
-  BindFleetWrapper(&m);
-  BindBoxHelper(&m);
-#ifndef _WIN32
-  BindNCCLWrapper(&m);
-#endif
-  BindGraph(&m);
-  BindNode(&m);
-  BindInferenceApi(&m);
-  BindDataset(&m);
-#ifdef PADDLE_WITH_DISTRIBUTE
-  BindCommunicator(&m);
-#endif
-}
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind_boost_headers.h b/paddle/fluid/pybind/pybind_boost_headers.h
deleted file mode 100644
index 3eb4db175a745c8ea7a3afaff919e4f21d430a8b..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/pybind_boost_headers.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <Python.h>
-
-#include <vector>
-
-#include "glog/logging.h"
-#include "paddle/fluid/platform/variant.h"
-#include "pybind11/numpy.h"
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-// Cast boost::variant for PyBind.
-// Copy from
-// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199
-namespace pybind11 {
-namespace detail {
-
-#if !defined(PYBIND11_HIDDEN)
-#ifdef _WIN32
-#define PYBIND11_HIDDEN __declspec(dllexport)
-#else
-#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
-#endif
-#endif
-
-// Can be replaced by a generic lambda in C++14
-struct PYBIND11_HIDDEN paddle_variant_caster_visitor
-    : public boost::static_visitor<handle> {
-  return_value_policy policy;
-  handle parent;
-
-  paddle_variant_caster_visitor(return_value_policy policy, handle parent)
-      : policy(policy), parent(parent) {}
-
-  template <class T>
-  handle operator()(T const &src) const {
-    return make_caster<T>::cast(src, policy, parent);
-  }
-};
-
-template <class Variant>
-struct paddle_variant_caster;
-
-template <template <class...> class V, class... Ts>
-struct paddle_variant_caster<V<Ts...>> {
-  using Type = V<Ts...>;
-
-  template <typename T>
-  typename std::enable_if<
-      !std::is_same<T, boost::detail::variant::void_>::value, bool>::type
-  try_load(handle src, bool convert) {
-    auto caster = make_caster<T>();
-    if (!load_success_ && caster.load(src, convert)) {
-      load_success_ = true;
-
-      if (std::is_same<T, std::vector<float>>::value) {
-        auto caster_ints = make_caster<std::vector<int64_t>>();
-        if (caster_ints.load(src, convert)) {
-          VLOG(4) << "This value are floats and int64_ts satisfy "
-                     "simultaneously, will set it's type to "
-                     "std::vector<int64_t>";
-          value = cast_op<std::vector<int64_t>>(caster_ints);
-          return true;
-        }
-      }
-
-      if (std::is_same<T, float>::value) {
-        auto caster_int64 = make_caster<int64_t>();
-        if (caster_int64.load(src, convert)) {
-          VLOG(4) << "this value are float and int64 satisfy simula.";
-          value = cast_op<int64_t>(caster_int64);
-          return true;
-        }
-      }
-
-      value = cast_op<T>(caster);
-      return true;
-    }
-    return false;
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_same<T, boost::detail::variant::void_>::value,
-                          bool>::type
-  try_load(handle src, bool convert) {
-    return false;
-  }
-
-  bool load(handle src, bool convert) {
-    auto unused = {false, try_load<Ts>(src, convert)...};
-    (void)(unused);
-    return load_success_;
-  }
-
-  static handle cast(Type const &src, return_value_policy policy,
-                     handle parent) {
-    paddle_variant_caster_visitor visitor(policy, parent);
-    return boost::apply_visitor(visitor, src);
-  }
-
-  PYBIND11_TYPE_CASTER(Type, _("Variant"));
-  bool load_success_{false};
-};
-
-// Add specialization for concrete variant type
-template <class... Args>
-struct type_caster<boost::variant<Args...>>
-    : paddle_variant_caster<boost::variant<Args...>> {};
-
-}  // namespace detail
-}  // namespace pybind11
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
deleted file mode 100644
index 4009bcf2a8b07acaf49a6bf865036ce4a650ef3d..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/reader_py.cc
+++ /dev/null
@@ -1,188 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/pybind/reader_py.h"
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "Python.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/operators/reader/buffered_reader.h"
-#include "paddle/fluid/operators/reader/py_reader.h"
-#include "paddle/fluid/platform/place.h"
-#include "pybind11/stl.h"
-
-namespace paddle {
-namespace pybind {
-
-namespace py = pybind11;
-
-static void RaiseStopIterationException() {
-  VLOG(2) << "Raise StopIteration Exception in Python";
-  py::gil_scoped_acquire guard;
-  throw py::stop_iteration();
-}
-
-class MultiDeviceFeedReader {
- public:
-  using ResultDictList =
-      std::vector<std::unordered_map<std::string, framework::LoDTensor>>;
-  using ResultList = std::vector<std::vector<framework::LoDTensor>>;
-
-  MultiDeviceFeedReader(
-      const std::shared_ptr<operators::reader::LoDTensorBlockingQueue> &queue,
-      const std::vector<std::string> &names,
-      const std::vector<platform::Place> &dst_places, bool use_double_buffer)
-      : queue_(queue),
-        names_(names),
-        pool_(new ::ThreadPool(dst_places.size())) {
-    std::shared_ptr<framework::ReaderBase> reader(
-        new operators::reader::PyReader(queue));
-
-    readers_.reserve(dst_places.size());
-    for (auto &p : dst_places) {
-      auto *holder = new framework::ReaderHolder();
-      if (use_double_buffer) {
-        holder->Reset(
-            framework::MakeDecoratedReader<operators::reader::BufferedReader>(
-                reader, p, 2));
-      } else {
-        if (platform::is_gpu_place(p)) {
-          PADDLE_THROW(
-              "Place cannot be CUDAPlace when use_double_buffer is False");
-        }
-        holder->Reset(reader);
-      }
-      readers_.emplace_back(holder);
-    }
-
-    futures_.resize(dst_places.size());
-    ret_.resize(dst_places.size());
-    ReadAsync();
-  }
-
-  ResultDictList ReadNext() {
-    bool success = WaitFutures();
-
-    if (!success) {
-      RaiseStopIterationException();
-      return {};
-    }
-
-    ResultDictList result(ret_.size());
-    for (size_t i = 0; i < ret_.size(); ++i) {
-      for (size_t j = 0; j < names_.size(); ++j) {
-        result[i].emplace(names_[j], std::move(ret_[i][j]));
-      }
-    }
-    ReadAsync();
-    return result;
-  }
-
-  ResultList ReadNextList() {
-    bool success = WaitFutures();
-    if (!success) {
-      RaiseStopIterationException();
-      return {};
-    }
-
-    ResultList result;
-    result.reserve(ret_.size());
-    for (size_t i = 0; i < ret_.size(); ++i) {
-      result.emplace_back(std::move(ret_[i]));
-    }
-    ReadAsync();
-    return result;
-  }
-
-  void Reset() {
-    Shutdown();
-    Start();
-    ReadAsync();
-  }
-
-  ~MultiDeviceFeedReader() {
-    queue_->Close();
-    pool_.reset();
-  }
-
- private:
-  bool WaitFutures() {
-    bool success = true;
-    for (auto &f : futures_) {
-      success &= f.get();
-    }
-    return success;
-  }
-
-  void Shutdown() {
-    for (auto &r : readers_) r->Shutdown();
-  }
-
-  void Start() {
-    for (auto &r : readers_) r->Start();
-  }
-
-  void ReadAsync() {
-    for (size_t i = 0; i < readers_.size(); ++i) {
-      futures_[i] = pool_->enqueue([this, i] {
-        readers_[i]->ReadNext(&ret_[i]);
-        return !ret_[i].empty();
-      });
-    }
-  }
-
-  std::shared_ptr<operators::reader::LoDTensorBlockingQueue> queue_;
-  std::vector<std::string> names_;
-  std::unique_ptr<::ThreadPool> pool_;
-
-  std::vector<std::unique_ptr<framework::ReaderHolder>> readers_;
-
-  std::vector<std::future<bool>> futures_;
-  std::vector<std::vector<framework::LoDTensor>> ret_;
-};
-
-void BindReader(py::module *module) {
-  auto &m = *module;
-
-  namespace reader = ::paddle::operators::reader;
-
-  py::class_<framework::ReaderHolder>(m, "Reader", "")
-      .def("start", &framework::ReaderHolder::Start)
-      .def("reset", &framework::ReaderHolder::ResetAll);
-
-  py::class_<MultiDeviceFeedReader>(m, "MultiDeviceFeedReader", "")
-      .def("read_next", &MultiDeviceFeedReader::ReadNext,
-           py::call_guard<py::gil_scoped_release>())
-      .def("read_next_list", &MultiDeviceFeedReader::ReadNextList,
-           py::call_guard<py::gil_scoped_release>())
-      .def("reset", &MultiDeviceFeedReader::Reset,
-           py::call_guard<py::gil_scoped_release>());
-
-  m.def("create_py_reader",
-        [](const std::shared_ptr<operators::reader::LoDTensorBlockingQueue>
-               &queue,
-           const std::vector<std::string> &names,
-           const std::vector<platform::Place> &dst_places,
-           bool use_double_buffer) {
-          return new MultiDeviceFeedReader(queue, names, dst_places,
-                                           use_double_buffer);
-        },
-        py::return_value_policy::take_ownership);
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/reader_py.h b/paddle/fluid/pybind/reader_py.h
deleted file mode 100644
index 472ff65368f3fb206ae599ae5d9d11e9ae8195ae..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/reader_py.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pybind11/pybind11.h"
-
-namespace paddle {
-namespace pybind {
-
-void BindReader(pybind11::module *module);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
deleted file mode 100644
index 08e43bf24ce1a6863f13b6334f9b3272e4414ff5..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/tensor_py.h
+++ /dev/null
@@ -1,519 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <Python.h>
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/float16.h"
-#include "pybind11/numpy.h"
-#include "pybind11/pybind11.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-
-template <typename T>
-T TensorGetElement(const framework::Tensor &self, size_t offset) {
-  PADDLE_ENFORCE_LT(offset, self.numel());
-  T b = static_cast<T>(0);
-  if (platform::is_cpu_place(self.place())) {
-    b = self.data<T>()[offset];
-#ifdef PADDLE_WITH_CUDA
-  } else {
-    const T *a = self.data<T>();
-    auto p = boost::get<platform::CUDAPlace>(self.place());
-    paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
-                         nullptr);
-#endif
-  }
-  return b;
-}
-
-template <typename T>
-void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
-  PADDLE_ENFORCE_LT(offset, self->numel());
-  if (platform::is_cpu_place(self->place())) {
-    self->mutable_data<T>(self->place())[offset] = elem;
-#ifdef PADDLE_WITH_CUDA
-  } else {
-    auto p = boost::get<platform::CUDAPlace>(self->place());
-    T *a = self->mutable_data<T>(p);
-    paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
-                         nullptr);
-#endif
-  }
-}
-
-template <typename T>
-void PyCPUTensorSetFromArray(
-    framework::Tensor *self,
-    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
-        array,
-    paddle::platform::CPUPlace place) {
-  std::vector<int64_t> dims;
-  dims.reserve(array.ndim());
-  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
-    dims.push_back(static_cast<int>(array.shape()[i]));
-  }
-
-  self->Resize(framework::make_ddim(dims));
-  auto *dst = self->mutable_data<T>(place);
-  std::memcpy(dst, array.data(), sizeof(T) * array.size());
-}
-
-template <>
-// This following specialization maps uint16_t in the parameter type to
-// platform::float16.
-inline void PyCPUTensorSetFromArray(
-    framework::Tensor *self,
-    pybind11::array_t<uint16_t,
-                      pybind11::array::c_style | pybind11::array::forcecast>
-        array,
-    paddle::platform::CPUPlace place) {
-  std::vector<int64_t> dims;
-  dims.reserve(array.ndim());
-  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
-    dims.push_back(static_cast<int>(array.shape()[i]));
-  }
-
-  self->Resize(framework::make_ddim(dims));
-  auto *dst = self->mutable_data<platform::float16>(place);
-  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
-}
-
-template <typename T, size_t D>
-void _sliceCompute(const framework::Tensor *in, framework::Tensor *out,
-                   const platform::CPUDeviceContext &ctx,
-                   const std::vector<int> &axes,
-                   const std::vector<int> &starts) {
-  auto &eigen_place = *ctx.eigen_device();
-  auto place = in->place();
-  auto out_dims = out->dims();
-  auto in_dims = in->dims();
-
-  auto offsets = Eigen::array<int, D>();
-  auto extents = Eigen::array<int, D>();
-  for (size_t i = 0; i < D; ++i) {
-    offsets[i] = 0;
-    extents[i] = out_dims[i];
-  }
-  int start;
-  for (size_t i = 0; i < axes.size(); ++i) {
-    start = starts[i];
-    if (start < 0) {
-      start = (start + in_dims[axes[i]]);
-    }
-    start = std::max(start, 0);
-    offsets[axes[i]] = start;
-  }
-  auto in_t =
-      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-          *in);
-  auto out_t =
-      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-          *out);
-  out_t.device(eigen_place) = in_t.slice(offsets, extents);
-}
-
-template <typename T>
-void _concatCompute(const std::vector<paddle::framework::Tensor> &ins,
-                    paddle::framework::Tensor *out,
-                    const platform::CPUDeviceContext &ctx, int64_t axis) {
-  if (axis == 0 && ins.size() < 10) {
-    size_t output_offset = 0;
-    for (auto &in : ins) {
-      auto in_stride = framework::stride_numel(in.dims());
-      auto out_stride = framework::stride_numel(out->dims());
-      paddle::operators::StridedNumelCopyWithAxis<T>(
-          ctx, axis, out->data<T>() + output_offset, out_stride, in.data<T>(),
-          in_stride, in_stride[axis]);
-      output_offset += in_stride[axis];
-    }
-  } else {
-    paddle::operators::math::ConcatFunctor<platform::CPUDeviceContext, T>
-        concat_functor;
-    concat_functor(ctx, ins, static_cast<int>(axis), out);
-  }
-}
-
-void _getSliceinfo(const framework::Tensor &self, py::object obj,
-                   const int64_t dim, int64_t *pstart, int64_t *pstop,
-                   int64_t *pstep, int64_t *pslicelength) {
-  auto &start = *pstart;
-  auto &stop = *pstop;
-  auto &step = *pstep;
-  auto &slicelength = *pslicelength;
-  const framework::DDim &srcDDim = self.dims();
-  if (dim < 0 || dim >= srcDDim.size()) {
-    throw py::index_error();
-  }
-  if (py::isinstance<py::slice>(obj)) {
-    size_t lstart, lstop, lstep, lslicelength;
-    py::slice s = static_cast<py::slice>(obj);
-    if (!s.compute(srcDDim[dim], &lstart, &lstop, &lstep, &lslicelength)) {
-      throw py::index_error();
-    }
-    start = static_cast<int64_t>(lstart);
-    stop = static_cast<int64_t>(lstop);
-    step = static_cast<int64_t>(lstep);
-    slicelength = static_cast<int64_t>(lslicelength);
-  } else if (py::isinstance<py::int_>(obj)) {
-    start = static_cast<int64_t>(static_cast<py::int_>(obj));
-    if (std::abs(start) >= srcDDim[dim]) {
-      throw py::index_error();
-    }
-    start = (start >= 0) ? start : srcDDim[dim] - start;
-    stop = start + 1;
-    step = 1;
-    slicelength = 1;
-  } else {
-    throw py::index_error();
-  }
-}
-
-inline framework::Tensor *_getTensor(const framework::Tensor &self,
-                                     const framework::DDim &ddim) {
-  framework::Tensor *output = new framework::Tensor();
-  output->Resize(ddim);
-  auto place = self.place();
-  if (platform::is_cpu_place(place)) {
-    output->mutable_data(boost::get<platform::CPUPlace>(place), self.type());
-#ifdef PADDLE_WITH_CUDA
-  } else {
-    if (platform::is_cuda_pinned_place(place)) {
-      output->mutable_data(boost::get<platform::CUDAPinnedPlace>(place),
-                           self.type());
-    } else if ((platform::is_gpu_place(place))) {
-      output->mutable_data(boost::get<platform::CUDAPlace>(place), self.type());
-    }
-#endif
-  }
-  return output;
-}
-
-template <typename T>
-void _sliceDapper(const framework::Tensor *in, framework::Tensor *out,
-                  const platform::CPUDeviceContext &ctx,
-                  const std::vector<int> &axes, const std::vector<int> &starts,
-                  int size) {
-  switch (size) {
-    case 1:
-      _sliceCompute<T, 1>(in, out, ctx, axes, starts);
-      break;
-    case 2:
-      _sliceCompute<T, 2>(in, out, ctx, axes, starts);
-      break;
-    case 3:
-      _sliceCompute<T, 3>(in, out, ctx, axes, starts);
-      break;
-    case 4:
-      _sliceCompute<T, 4>(in, out, ctx, axes, starts);
-      break;
-    case 5:
-      _sliceCompute<T, 5>(in, out, ctx, axes, starts);
-      break;
-    case 6:
-      _sliceCompute<T, 6>(in, out, ctx, axes, starts);
-      break;
-    case 7:
-      _sliceCompute<T, 7>(in, out, ctx, axes, starts);
-      break;
-    case 8:
-      _sliceCompute<T, 8>(in, out, ctx, axes, starts);
-      break;
-    case 9:
-      _sliceCompute<T, 9>(in, out, ctx, axes, starts);
-      break;
-    default:
-      PADDLE_THROW("dim size not exepected, current is %d", size);
-      break;
-  }
-}
-
-template <typename T>
-inline framework::Tensor *_sliceWrapper(const framework::Tensor &self,
-                                        const platform::CPUDeviceContext &ctx,
-                                        py::object obj, int dim, int64_t start,
-                                        int64_t slicelength) {
-  framework::DDim dstDDim = self.dims();
-  dstDDim[dim] = static_cast<int64_t>(slicelength);
-  std::vector<int> axes({dim});
-  std::vector<int> starts({static_cast<int>(start)});
-  framework::Tensor *output = _getTensor(self, dstDDim);
-  _sliceDapper<T>(&self, output, ctx, axes, starts, dstDDim.size());
-  return output;
-}
-
-template <typename T>
-inline framework::Tensor *_sliceAndConcat(const framework::Tensor &self,
-                                          py::object obj, int dim) {
-  platform::CPUDeviceContext ctx;
-  int64_t start, stop, step, slicelength;
-  _getSliceinfo(self, obj, dim, &start, &stop, &step, &slicelength);
-  if (step == 1 || slicelength == 1) {
-    return _sliceWrapper<T>(self, ctx, obj, dim, start, slicelength);
-  } else {
-    std::vector<framework::Tensor> ins;
-    for (auto i = 0; i < slicelength; ++i, start += step) {
-      ins.emplace_back(*_sliceWrapper<T>(self, ctx, obj, dim, start, 1));
-    }
-
-    // do the concat operation
-    framework::DDim dstDDim = self.dims();
-    dstDDim[dim] = static_cast<int64_t>(slicelength);
-    framework::Tensor *output1 = _getTensor(self, dstDDim);
-    _concatCompute<T>(ins, output1, ctx, dim);
-    return output1;
-  }
-}
-
-inline framework::Tensor *_sliceTensor(const framework::Tensor &self,
-                                       py::object obj, int dim) {
-  auto src_type = self.type();
-  switch (src_type) {
-    case framework::proto::VarType::FP16:
-      return _sliceAndConcat<paddle::platform::float16>(self, obj, dim);
-    case framework::proto::VarType::FP32:
-      return _sliceAndConcat<float>(self, obj, dim);
-    case framework::proto::VarType::FP64:
-      return _sliceAndConcat<double>(self, obj, dim);
-    case framework::proto::VarType::INT32:
-      return _sliceAndConcat<int>(self, obj, dim);
-    case framework::proto::VarType::INT64:
-      return _sliceAndConcat<int64_t>(self, obj, dim);
-    case framework::proto::VarType::BOOL:
-      return _sliceAndConcat<bool>(self, obj, dim);
-    case framework::proto::VarType::INT16:
-      return _sliceAndConcat<bool>(self, obj, dim);
-    case framework::proto::VarType::UINT8:
-      return _sliceAndConcat<bool>(self, obj, dim);
-    default:
-      PADDLE_THROW("Not support type %d", src_type);
-  }
-}
-
-inline framework::Tensor *_pySliceTensor(const framework::Tensor &self,
-                                         py::object obj) {
-  if (py::isinstance<py::tuple>(obj)) {
-    py::list l = static_cast<py::list>(obj);
-    std::unique_ptr<framework::Tensor> target;
-    framework::Tensor *src = const_cast<framework::Tensor *>(&self);
-    for (auto i = 0; i < static_cast<int>(l.size()); ++i) {
-      src = _sliceTensor(*src, l[i], i);
-      if (i + 1 == static_cast<int>(l.size())) {
-        return src;
-      } else {
-        target.reset(src);
-      }
-    }
-    return nullptr;
-  } else {
-    return _sliceTensor(self, obj, 0);
-  }
-}
-
-inline framework::Tensor *PySliceTensor(const framework::Tensor &self,
-                                        py::object obj) {
-  if (platform::is_gpu_place(self.place())) {
-    std::unique_ptr<framework::Tensor> holder;
-    framework::Tensor src;
-    framework::TensorCopySync(self, platform::CPUPlace(), &src);
-    framework::Tensor *output = _pySliceTensor(src, obj);
-    holder.reset(output);
-    framework::Tensor *dst = _getTensor(*output, output->dims());
-    framework::TensorCopySync(*output, self.place(), dst);
-    return dst;
-  } else {
-    return _pySliceTensor(self, obj);
-  }
-}
-
-#ifdef PADDLE_WITH_CUDA
-template <typename T>
-void PyCUDATensorSetFromArray(
-    framework::Tensor *self,
-    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
-        array,
-    paddle::platform::CUDAPlace place) {
-  std::vector<int64_t> dims;
-  dims.reserve(array.ndim());
-  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
-    dims.push_back(static_cast<int>(array.shape()[i]));
-  }
-
-  self->Resize(framework::make_ddim(dims));
-  auto *dst = self->mutable_data<T>(place);
-  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
-                                  cudaMemcpyHostToDevice);
-}
-
-template <>
-// This following specialization maps uint16_t in the parameter type to
-// platform::float16.
-inline void PyCUDATensorSetFromArray(
-    framework::Tensor *self,
-    pybind11::array_t<uint16_t,
-                      pybind11::array::c_style | pybind11::array::forcecast>
-        array,
-    paddle::platform::CUDAPlace place) {
-  std::vector<int64_t> dims;
-  dims.reserve(array.ndim());
-  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
-    dims.push_back(static_cast<int>(array.shape()[i]));
-  }
-
-  self->Resize(framework::make_ddim(dims));
-  auto *dst = self->mutable_data<platform::float16>(place);
-  paddle::platform::GpuMemcpySync(dst, array.data(),
-                                  sizeof(uint16_t) * array.size(),
-                                  cudaMemcpyHostToDevice);
-}
-
-template <typename T>
-void PyCUDAPinnedTensorSetFromArray(
-    framework::Tensor *self,
-    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
-        array,
-    const paddle::platform::CUDAPinnedPlace &place) {
-  std::vector<int64_t> dims;
-  dims.reserve(array.ndim());
-  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
-    dims.push_back(static_cast<int>(array.shape()[i]));
-  }
-
-  self->Resize(framework::make_ddim(dims));
-  auto *dst = self->mutable_data<T>(place);
-  std::memcpy(dst, array.data(), sizeof(T) * array.size());
-}
-
-template <>
-// This following specialization maps uint16_t in the parameter type to
-// platform::float16.
-inline void PyCUDAPinnedTensorSetFromArray(
-    framework::Tensor *self,
-    pybind11::array_t<uint16_t,
-                      pybind11::array::c_style | pybind11::array::forcecast>
-        array,
-    const paddle::platform::CUDAPinnedPlace &place) {
-  std::vector<int64_t> dims;
-  dims.reserve(array.ndim());
-  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
-    dims.push_back(static_cast<int>(array.shape()[i]));
-  }
-
-  self->Resize(framework::make_ddim(dims));
-  auto *dst = self->mutable_data<platform::float16>(place);
-  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
-}
-#endif
-
-namespace details {
-
-template <typename T>
-struct ValidDTypeToPyArrayChecker {
-  static constexpr bool kValue = false;
-};
-
-#define DECLARE_VALID_DTYPE_TO_PY_ARRAY(type) \
-  template <>                                 \
-  struct ValidDTypeToPyArrayChecker<type> {   \
-    static constexpr bool kValue = true;      \
-  }
-
-DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16);
-DECLARE_VALID_DTYPE_TO_PY_ARRAY(float);
-DECLARE_VALID_DTYPE_TO_PY_ARRAY(double);
-DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool);
-DECLARE_VALID_DTYPE_TO_PY_ARRAY(int8_t);
-DECLARE_VALID_DTYPE_TO_PY_ARRAY(uint8_t);
-DECLARE_VALID_DTYPE_TO_PY_ARRAY(int);
-DECLARE_VALID_DTYPE_TO_PY_ARRAY(int64_t);
-
-inline std::string TensorDTypeToPyDTypeStr(
-    framework::proto::VarType::Type type) {
-#define TENSOR_DTYPE_TO_PY_DTYPE(T, proto_type)                             \
-  if (type == proto_type) {                                                 \
-    if (std::is_same<T, platform::float16>::value) {                        \
-      return "e";                                                           \
-    } else {                                                                \
-      constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker<T>::kValue; \
-      PADDLE_ENFORCE(kIsValidDType,                                         \
-                     "This type of tensor cannot be expose to Python");     \
-      return py::format_descriptor<T>::format();                            \
-    }                                                                       \
-  }
-
-  _ForEachDataType_(TENSOR_DTYPE_TO_PY_DTYPE);
-#undef TENSOR_DTYPE_TO_PY_DTYPE
-  PADDLE_THROW("Unsupported data type %d", static_cast<int>(type));
-}
-
-}  // namespace details
-
-inline py::array TensorToPyArray(const framework::Tensor &tensor) {
-  if (!tensor.IsInitialized()) {
-    return py::array();
-  }
-  bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
-  const auto &tensor_dims = tensor.dims();
-  auto tensor_dtype = tensor.type();
-  size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
-
-  std::vector<size_t> py_dims(tensor_dims.size());
-  std::vector<size_t> py_strides(tensor_dims.size());
-
-  size_t numel = 1;
-  for (int i = tensor_dims.size() - 1; i >= 0; --i) {
-    py_dims[i] = (size_t)tensor_dims[i];
-    py_strides[i] = sizeof_dtype * numel;
-    numel *= py_dims[i];
-  }
-
-  const void *tensor_buf_ptr = tensor.data<void>();
-
-  std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type());
-
-  if (!is_gpu_tensor) {
-    return py::array(py::buffer_info(
-        const_cast<void *>(tensor_buf_ptr), sizeof_dtype, py_dtype_str,
-        static_cast<size_t>(tensor.dims().size()), py_dims, py_strides));
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
-  PADDLE_ENFORCE(py_arr.writeable() && py_arr.owndata(),
-                 "PyArray must be writable and own data, otherwise memory leak "
-                 "or double free would occur");
-
-  size_t copy_bytes = sizeof_dtype * numel;
-  paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr,
-                                  copy_bytes, cudaMemcpyDeviceToHost);
-  return py_arr;
-#else
-  PADDLE_THROW("CUDAPlace is not supported when not compiled with CUDA");
-#endif
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt
deleted file mode 100644
index a465f5909a7c6ee83211b8e03f1c3e7d3103022c..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-cc_library(stringpiece SRCS piece.cc DEPS flags)
-cc_library(pretty_log SRCS pretty_log.cc DEPS flags)
-cc_library(string_helper SRCS string_helper.cc DEPS boost flags)
-cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
-cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
-cc_test(to_string_test SRCS to_string_test.cc)
-cc_test(split_test SRCS split_test.cc)
diff --git a/paddle/fluid/string/piece.cc b/paddle/fluid/string/piece.cc
deleted file mode 100644
index e60eb0d614eabf624cab6003ee026c956aabab52..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/piece.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/string/piece.h"
-
-#include <string.h>
-
-#include <algorithm>
-#include <iosfwd>
-#include <stdexcept>
-
-#define CHAR_POINTER_CMP(a, b) \
-  do {                         \
-    if (!a && !b) return 0;    \
-    if (!a) return -1;         \
-    if (!b) return 1;          \
-  } while (0)
-
-namespace paddle {
-namespace string {
-
-Piece::Piece() : data_(NULL), size_(0) {}
-
-Piece::Piece(const char* d, size_t n) : data_(d), size_(n) {
-  if (d == NULL && n != 0)
-    throw std::invalid_argument("Piece requires len to be 0 for NULL data");
-}
-
-Piece::Piece(const char* s) : data_(s) { size_ = (s == NULL) ? 0 : strlen(s); }
-
-Piece::Piece(const std::string& s) : data_(s.data()), size_(s.size()) {}
-
-char Piece::operator[](size_t n) const {
-  if (n >= len()) throw std::invalid_argument("index out of Piece length");
-  return data_[n];
-}
-
-int Compare(Piece a, Piece b) {
-  CHAR_POINTER_CMP(a.data(), b.data());
-  const size_t min_len = (a.len() < b.len()) ? a.len() : b.len();
-  int r = memcmp(a.data(), b.data(), min_len);
-  if (r == 0) {
-    if (a.len() < b.len())
-      return -1;
-    else if (a.len() > b.len())
-      return 1;
-  }
-  return r;
-}
-
-bool operator==(Piece x, Piece y) {
-  return (!x.len() && !y.len()) ? true
-                                : ((x.len() == y.len()) &&
-                                   (x.data() == y.data() ||
-                                    memcmp(x.data(), y.data(), x.len()) == 0));
-}
-
-bool operator!=(Piece x, Piece y) { return !(x == y); }
-
-bool operator<(Piece x, Piece y) { return Compare(x, y) < 0; }
-bool operator>(Piece x, Piece y) { return Compare(x, y) > 0; }
-
-bool operator<=(Piece x, Piece y) { return Compare(x, y) <= 0; }
-bool operator>=(Piece x, Piece y) { return Compare(x, y) >= 0; }
-
-bool HasPrefix(Piece s, Piece x) {
-  return !x.len() ? true : ((s.len() >= x.len()) &&
-                            (memcmp(s.data(), x.data(), x.len()) == 0));
-}
-
-bool HasSuffix(Piece s, Piece x) {
-  return !x.len() ? true : ((s.len() >= x.len()) &&
-                            (memcmp(s.data() + (s.len() - x.len()), x.data(),
-                                    x.len()) == 0));
-}
-
-Piece SkipPrefix(Piece s, size_t n) {
-  if (n > s.len())
-    throw std::invalid_argument("Skip distance larger than Piece length");
-  return Piece(s.data() + n, s.len() - n);
-}
-
-Piece SkipSuffix(Piece s, size_t n) {
-  if (n > s.len())
-    throw std::invalid_argument("Skip distance larger than Piece length");
-  return Piece(s.data(), s.len() - n);
-}
-
-Piece TrimPrefix(Piece s, Piece x) {
-  return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s;
-}
-
-Piece TrimSuffix(Piece s, Piece x) {
-  return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s;
-}
-
-bool Contains(Piece s, Piece sub) {
-  return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end();
-}
-
-size_t Index(Piece s, Piece sub) {
-  auto e = std::search(s.begin(), s.end(), sub.begin(), sub.end());
-  return e != s.end() ? e - s.data() : Piece::npos;
-}
-
-size_t Find(Piece s, char c, size_t pos) {
-  if (pos >= s.len()) {
-    return Piece::npos;
-  }
-  const char* result =
-      reinterpret_cast<const char*>(memchr(s.data() + pos, c, s.len() - pos));
-  return result != nullptr ? result - s.data() : Piece::npos;
-}
-
-size_t RFind(Piece s, char c, size_t pos) {
-  if (s.len() == 0) return Piece::npos;
-  for (const char* p = s.data() + std::min(pos, s.len() - 1); p >= s.data();
-       p--) {
-    if (*p == c) {
-      return p - s.data();
-    }
-  }
-  return Piece::npos;
-}
-
-Piece SubStr(Piece s, size_t pos, size_t n) {
-  if (pos > s.len()) pos = s.len();
-  if (n > s.len() - pos) n = s.len() - pos;
-  return Piece(s.data() + pos, n);
-}
-
-std::ostream& operator<<(std::ostream& o, Piece piece) {
-  return o << piece.ToString();
-}
-
-}  // namespace string
-}  // namespace paddle
diff --git a/paddle/fluid/string/piece.h b/paddle/fluid/string/piece.h
deleted file mode 100644
index 8dda484eaac4d62b758e57ac5e81bfe68a5c60d4..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/piece.h
+++ /dev/null
@@ -1,105 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <ostream>
-#include <string>
-
-namespace paddle {
-namespace string {
-
-// Piece points into a std::string object but doesn't own the
-// string.  It is for efficient access to strings.  Like Go's string
-// type.  Not that Piece doesn't mutate the underlying string,
-// so it is thread-safe given that the underlying string doesn't
-// change.  Because Piece contains a little data members, and
-// its syntax is simple as it doesn't own/manage the string, it is
-// cheap to construct Pieces and pass them around.
-class Piece {
- public:
-  static const size_t npos = static_cast<size_t>(-1);
-
-  // We provide non-explicit singleton constructors so users can
-  // pass in a "const char*" or a "string" wherever a "Piece"
-  // is expected.  These constructors ensure that if data_ is NULL,
-  // size_ is 0.
-  Piece();
-  Piece(const char* d, size_t n);
-  Piece(const char* d);         // NOLINT: accept C string into Piece.
-  Piece(const std::string& s);  // NOLINT: accept C++ string into Piece.
-
-  const char* data() const { return data_; }
-  size_t len() const { return size_; }
-
-  char operator[](size_t n) const;
-
-  // Piece doesn't own the string, so both iterator and const
-  // iterator are const char* indeed.
-  typedef const char* const_iterator;
-  typedef const char* iterator;
-  iterator begin() const { return data_; }
-  iterator end() const { return data_ + size_; }
-
-  // Return a string that contains the copy of the referenced data.
-  std::string ToString() const { return std::string(data_, size_); }
-
- private:
-  const char* data_;
-  size_t size_;
-
-  // Intentionally copyable
-};
-
-int Compare(Piece a, Piece b);
-
-bool operator==(Piece x, Piece y);
-bool operator!=(Piece x, Piece y);
-bool operator<(Piece x, Piece y);
-bool operator>(Piece x, Piece y);
-bool operator<=(Piece x, Piece y);
-bool operator>=(Piece x, Piece y);
-
-bool HasPrefix(Piece s, Piece prefix);
-bool HasSuffix(Piece s, Piece suffix);
-
-Piece SkipPrefix(Piece s, size_t n);
-Piece SkipSuffix(Piece s, size_t n);
-
-// Skip the prefix (or suffix) if it matches with the string.
-Piece TrimPrefix(Piece s, Piece prefix);
-Piece TrimSuffix(Piece s, Piece suffix);
-
-// Returns if s contains sub.  Any s except for empty s contains an
-// empty sub.
-bool Contains(Piece s, Piece sub);
-
-// Return the first occurrence of sub in s, or npos.  If both s and
-// sub is empty, it returns npos; otherwise, if only sub is empty, it
-// returns 0.
-size_t Index(Piece s, Piece sub);
-
-// Return the first occurrence of c in s[pos:end], or npos.
-size_t Find(Piece s, char c, size_t pos);
-
-// Search range is [0..pos] inclusive.  If pos == npos, search everything.
-size_t RFind(Piece s, char c, size_t pos);
-
-Piece SubStr(Piece s, size_t pos, size_t n);
-
-// allow Piece to be logged
-std::ostream& operator<<(std::ostream& o, Piece piece);
-
-}  // namespace string
-}  // namespace paddle
diff --git a/paddle/fluid/string/piece_test.cc b/paddle/fluid/string/piece_test.cc
deleted file mode 100644
index 80b712b08ccbcc3fc33e26f8c8c5b0531a71d974..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/piece_test.cc
+++ /dev/null
@@ -1,293 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/string/piece.h"
-
-#include <sstream>
-
-#include "gtest/gtest.h"
-
-TEST(StringPiece, Construct) {
-  {
-    paddle::string::Piece s;
-    EXPECT_EQ(NULL, s.data());
-    EXPECT_EQ(0U, s.len());
-  }
-  {
-    EXPECT_THROW(paddle::string::Piece s(NULL, 10000U), std::invalid_argument);
-  }
-  {
-    paddle::string::Piece s(NULL);
-    EXPECT_EQ(0U, s.len());
-  }
-  {
-    std::string a;
-    EXPECT_EQ(0U, a.size());
-    paddle::string::Piece s(a);
-    EXPECT_EQ(0U, s.len());
-  }
-}
-
-TEST(StringPiece, CopyAndAssign) {
-  paddle::string::Piece empty;
-  EXPECT_EQ(0U, empty.len());
-
-  paddle::string::Piece a("hello");
-  paddle::string::Piece b = a;
-  EXPECT_EQ(b.len(), strlen("hello"));
-  EXPECT_EQ(a, b);
-
-  std::string storage("hello");
-  paddle::string::Piece c(storage);
-  EXPECT_EQ(a, c);
-  EXPECT_NE(a.data(), c.data());
-}
-
-TEST(StringPiece, Compare) {
-  {
-    paddle::string::Piece a("hello");
-    paddle::string::Piece b("world");
-    EXPECT_TRUE(a != b);
-    EXPECT_FALSE(a == b);
-    EXPECT_TRUE(a < b);
-    EXPECT_TRUE(a <= b);
-    EXPECT_FALSE(a > b);
-    EXPECT_FALSE(a >= b);
-    EXPECT_LT(Compare(a, b), 0);
-    EXPECT_GT(Compare(b, a), 0);
-  }
-  {
-    paddle::string::Piece a, b;
-    EXPECT_TRUE(a == b);
-    EXPECT_FALSE(a != b);
-    EXPECT_FALSE(a < b);
-    EXPECT_FALSE(a > b);
-    EXPECT_TRUE(a <= b);
-    EXPECT_TRUE(a >= b);
-    EXPECT_EQ(0, Compare(a, b));
-    EXPECT_EQ(0, Compare(b, a));
-  }
-}
-
-TEST(StringPiece, ToString) {
-  {
-    paddle::string::Piece s;
-    EXPECT_EQ(std::string(""), s.ToString());
-  }
-  {
-    paddle::string::Piece s(NULL);
-    EXPECT_EQ(std::string(""), s.ToString());
-  }
-  {
-    paddle::string::Piece s("hello");
-    EXPECT_EQ(std::string("hello"), s.ToString());
-  }
-}
-
-TEST(StringPiece, HasPrefixSuffix) {
-  using paddle::string::HasPrefix;
-  using paddle::string::HasSuffix;
-  {
-    paddle::string::Piece s;
-    EXPECT_FALSE(HasPrefix(s, "something"));
-    EXPECT_TRUE(HasPrefix(s, ""));
-    EXPECT_FALSE(HasSuffix(s, "something"));
-    EXPECT_TRUE(HasSuffix(s, ""));
-  }
-  {
-    paddle::string::Piece s("app");
-    EXPECT_TRUE(HasPrefix(s, ""));
-    EXPECT_TRUE(HasPrefix(s, "a"));
-    EXPECT_TRUE(HasPrefix(s, "ap"));
-    EXPECT_TRUE(HasPrefix(s, "app"));
-
-    EXPECT_TRUE(HasSuffix(s, ""));
-    EXPECT_TRUE(HasSuffix(s, "p"));
-    EXPECT_TRUE(HasSuffix(s, "pp"));
-    EXPECT_TRUE(HasSuffix(s, "app"));
-  }
-}
-
-TEST(StringPiece, SkipPrefixSuffix) {
-  using paddle::string::SkipPrefix;
-  using paddle::string::SkipSuffix;
-  {
-    paddle::string::Piece s;
-    EXPECT_EQ("", SkipPrefix(s, 0));
-    EXPECT_THROW(SkipPrefix(s, 1), std::invalid_argument);
-
-    EXPECT_EQ("", SkipSuffix(s, 0));
-    EXPECT_THROW(SkipSuffix(s, 1), std::invalid_argument);
-  }
-  {
-    paddle::string::Piece s("app");
-    EXPECT_EQ("app", SkipPrefix(s, 0));
-    EXPECT_EQ("pp", SkipPrefix(s, 1));
-    EXPECT_EQ("p", SkipPrefix(s, 2));
-    EXPECT_EQ("", SkipPrefix(s, 3));
-    EXPECT_THROW(SkipPrefix(s, 4), std::invalid_argument);
-
-    EXPECT_EQ("app", SkipSuffix(s, 0));
-    EXPECT_EQ("ap", SkipSuffix(s, 1));
-    EXPECT_EQ("a", SkipSuffix(s, 2));
-    EXPECT_EQ("", SkipSuffix(s, 3));
-    EXPECT_THROW(SkipSuffix(s, 4), std::invalid_argument);
-  }
-}
-
-TEST(StringPiece, TrimPrefixSuffix) {
-  using paddle::string::TrimPrefix;
-  using paddle::string::TrimSuffix;
-  {
-    paddle::string::Piece s;
-    EXPECT_EQ("", TrimPrefix(s, ""));
-    EXPECT_EQ("", TrimPrefix(s, "something"));
-
-    EXPECT_EQ("", TrimSuffix(s, ""));
-    EXPECT_EQ("", TrimSuffix(s, "something"));
-  }
-  {
-    paddle::string::Piece s("app");
-    EXPECT_EQ("app", TrimPrefix(s, ""));
-    EXPECT_EQ("pp", TrimPrefix(s, "a"));
-    EXPECT_EQ("p", TrimPrefix(s, "ap"));
-    EXPECT_EQ("", TrimPrefix(s, "app"));
-    EXPECT_EQ("app", TrimPrefix(s, "something"));
-
-    EXPECT_EQ("app", TrimSuffix(s, ""));
-    EXPECT_EQ("ap", TrimSuffix(s, "p"));
-    EXPECT_EQ("a", TrimSuffix(s, "pp"));
-    EXPECT_EQ("", TrimSuffix(s, "app"));
-    EXPECT_EQ("app", TrimSuffix(s, "something"));
-  }
-}
-
-TEST(StringPiece, Contains) {
-  using paddle::string::Contains;
-  {
-    paddle::string::Piece s;
-    EXPECT_FALSE(Contains(s, ""));
-    EXPECT_FALSE(Contains(s, "something"));
-  }
-  {
-    paddle::string::Piece s("app");
-    EXPECT_TRUE(Contains(s, ""));
-    EXPECT_TRUE(Contains(s, "a"));
-    EXPECT_TRUE(Contains(s, "p"));
-    EXPECT_TRUE(Contains(s, "ap"));
-    EXPECT_TRUE(Contains(s, "pp"));
-    EXPECT_TRUE(Contains(s, "app"));
-    EXPECT_FALSE(Contains(s, "something"));
-  }
-}
-
-TEST(StringPiece, Index) {
-  using paddle::string::Index;
-  auto npos = paddle::string::Piece::npos;
-  {
-    paddle::string::Piece s;
-    EXPECT_EQ(npos, Index(s, ""));
-    EXPECT_EQ(npos, Index(s, "something"));
-  }
-  {
-    paddle::string::Piece s("app");
-    EXPECT_EQ(0U, Index(s, ""));
-    EXPECT_EQ(0U, Index(s, "a"));
-    EXPECT_EQ(1U, Index(s, "p"));
-    EXPECT_EQ(0U, Index(s, "ap"));
-    EXPECT_EQ(1U, Index(s, "pp"));
-    EXPECT_EQ(0U, Index(s, "app"));
-    EXPECT_EQ(npos, Index(s, "something"));
-  }
-}
-
-TEST(StringPiece, Find) {
-  using paddle::string::Find;
-  auto npos = paddle::string::Piece::npos;
-  {
-    paddle::string::Piece s;
-    EXPECT_EQ(npos, Find(s, 'a', 0U));
-  }
-  {
-    paddle::string::Piece s("app");
-    EXPECT_EQ(0U, Find(s, 'a', 0U));
-    EXPECT_EQ(1U, Find(s, 'p', 0U));
-    EXPECT_EQ(1U, Find(s, 'p', 1U));
-    EXPECT_EQ(2U, Find(s, 'p', 2U));
-    EXPECT_EQ(npos, Find(s, 'z', 2U));
-  }
-}
-
-TEST(StringPiece, RFind) {
-  using paddle::string::RFind;
-  auto npos = paddle::string::Piece::npos;
-  {
-    paddle::string::Piece s;
-    EXPECT_EQ(npos, RFind(s, 'a', 0U));
-  }
-  {
-    paddle::string::Piece s("app");
-    EXPECT_EQ(2U, RFind(s, 'p', 2U));
-    EXPECT_EQ(0U, RFind(s, 'a', 2U));
-    EXPECT_EQ(1U, RFind(s, 'p', 1U));
-    EXPECT_EQ(0U, RFind(s, 'a', 0));
-    EXPECT_EQ(npos, RFind(s, 'z', 2U));
-  }
-}
-
-TEST(StringPiece, SubStr) {
-  using paddle::string::SubStr;
-  {
-    paddle::string::Piece s;
-    EXPECT_EQ("", SubStr(s, 0, 0));
-    EXPECT_EQ("", SubStr(s, 0, 1));
-    EXPECT_EQ("", SubStr(s, 1, 0));
-  }
-  {
-    paddle::string::Piece s("app");
-    EXPECT_EQ("", SubStr(s, 0, 0));
-    EXPECT_EQ("", SubStr(s, 1, 0));
-    EXPECT_EQ("", SubStr(s, 2, 0));
-    EXPECT_EQ("", SubStr(s, 3, 0));
-
-    EXPECT_EQ("a", SubStr(s, 0, 1));
-    EXPECT_EQ("p", SubStr(s, 1, 1));
-    EXPECT_EQ("p", SubStr(s, 2, 1));
-    EXPECT_EQ("", SubStr(s, 3, 1));
-
-    EXPECT_EQ("ap", SubStr(s, 0, 2));
-    EXPECT_EQ("pp", SubStr(s, 1, 2));
-    EXPECT_EQ("p", SubStr(s, 2, 2));
-    EXPECT_EQ("", SubStr(s, 3, 2));
-
-    EXPECT_EQ("app", SubStr(s, 0, 3));
-    EXPECT_EQ("pp", SubStr(s, 1, 3));
-    EXPECT_EQ("p", SubStr(s, 2, 3));
-    EXPECT_EQ("", SubStr(s, 3, 3));
-  }
-}
-
-TEST(StringPiece, StreamOutput) {
-  using paddle::string::Piece;
-
-  std::stringstream o;
-  o << paddle::string::Piece();
-  EXPECT_EQ("", o.str());
-
-  o << paddle::string::Piece("hello");
-  EXPECT_EQ("hello", o.str());
-
-  o << paddle::string::Piece();
-  EXPECT_EQ("hello", o.str());
-}
diff --git a/paddle/fluid/string/pretty_log.cc b/paddle/fluid/string/pretty_log.cc
deleted file mode 100644
index 4534fdc58b81fe03b3a1fc19b55aa62ddbf5eaf1..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/pretty_log.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/string/pretty_log.h"
-#include <gflags/gflags.h>
-
-DEFINE_bool(color, true, "Whether to turn on pretty log");
-
-namespace paddle {
-namespace string {}  // namespace string
-}  // namespace paddle
diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h
deleted file mode 100644
index da4c1f326fbc2703e639279d79acb52dc748266a..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/pretty_log.h
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <gflags/gflags.h>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <utility>
-#include "paddle/fluid/string/printf.h"
-
-DECLARE_bool(color);
-
-namespace paddle {
-
-namespace string {
-
-inline std::string black() { return FLAGS_color ? "\e[30m" : ""; }
-inline std::string red() { return FLAGS_color ? "\e[31m" : ""; }
-inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; }
-inline std::string green() { return FLAGS_color ? "\e[32m" : ""; }
-inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; }
-inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; }
-inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; }
-inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; }
-inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; }
-inline std::string white() { return FLAGS_color ? "\e[37m" : ""; }
-inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; }
-inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; }
-inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; }
-inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; }
-inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; }
-inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; }
-
-using TextBlock = std::pair<std::string, std::string>;
-
-struct Style {
-  static std::string info() { return black(); }
-  static std::string warn() { return b_red(); }
-  static std::string suc() { return green(); }
-  static std::string H1() { return bold() + purple(); }
-  static std::string H2() { return green(); }
-  static std::string H3() { return green(); }
-  static std::string detail() { return light_gray(); }
-};
-
-template <typename... Args>
-static void PrettyLogEndl(const std::string &style, const char *fmt,
-                          const Args &... args) {
-  std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
-}
-template <typename... Args>
-static void PrettyLog(const std::string &style, const char *fmt,
-                      const Args &... args) {
-  std::cerr << style << Sprintf(fmt, args...) << reset();
-}
-
-template <typename... Args>
-static void PrettyLogInfo(const char *fmt, const Args &... args) {
-  PrettyLogEndl(Style::info(), fmt, args...);
-}
-template <typename... Args>
-static void PrettyLogDetail(const char *fmt, const Args &... args) {
-  PrettyLogEndl(Style::detail(), fmt, args...);
-}
-template <typename... Args>
-static void PrettyLogH1(const char *fmt, const Args &... args) {
-  PrettyLogEndl(Style::H1(), fmt, args...);
-}
-template <typename... Args>
-static void PrettyLogH2(const char *fmt, const Args &... args) {
-  PrettyLogEndl(Style::H2(), fmt, args...);
-}
-
-}  // namespace string
-}  // namespace paddle
diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h
deleted file mode 100644
index 66b768665b6d0b97b4ca1470020132bfc9576bbb..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/printf.h
+++ /dev/null
@@ -1,124 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Compared with std::stringstream, there are primary purpose of
-// string::Printf:
-//
-// 1. Type-safe printing, with why and how explained in
-//    http://www.drdobbs.com/stringprintf-a-typesafe-printf-family-fo/184401999.
-//    Implementation includes
-//
-//    https://github.com/c42f/tinyformat
-//    boost::format
-//    std::stringstream
-//
-//    std::stringstream is not convenient enough in many cases.  For example:
-//
-//      std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n";
-//
-//    boost::format is the most convenient one.  We can have
-//
-//      std::cout << format("%2% %1%") % 36 % 77;
-//
-//    or
-//
-//      format fmter("%2% %1%");
-//      fmter % 36; fmter % 77;
-//      std::cout << fmter.c_str();
-//
-//    But the overloading of % might be overkilling and it would be
-//    more efficient if it can write to std::cout directly.
-//
-//    tinyformat has an interface compatible with the C-printf style,
-//    and it can writes to a stream or returns a std::string:
-//
-//      std::cout << tfm::printf(
-//                  "%s, %s %d, %.2d:%.2d\n",
-//                  weekday, month, day, hour, min);
-//
-//    or
-//
-//      tfm::format(std::cout,
-//                  "%s, %s %d, %.2d:%.2d\n",
-//                  weekday, month, day, hour, min);
-//
-// 2. High-performance -- most printed strings are not too long and
-//    doens't need dynamic memory allocation.  Many StringPrintf
-//    implementations doesn't enforce type-safe, but are
-//    high-performance, including
-//
-//    https://developers.google.com/optimization/reference/base/stringprintf/
-//    https://github.com/adobe/chromium/blob/master/base/stringprintf.h
-//    https://github.com/google/protobuf/blob/master/src/google/protobuf/stubs/stringprintf.h
-//
-// According to
-// https://github.com/c42f/tinyformat#compile-time-and-code-bloat,
-// boost::format runs too slow and results in large executable binary
-// files.  So here we port tinyformat.
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
-
-namespace paddle {
-namespace string {
-
-template <typename... Args>
-void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
-  tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
-}
-
-inline std::string Sprintf() { return ""; }
-
-template <typename... Args>
-std::string Sprintf(const Args&... args) {
-  std::ostringstream oss;
-  Fprintf(oss, "%s", args...);
-  return oss.str();
-}
-
-template <typename... Args>
-std::string Sprintf(const char* fmt, const Args&... args) {
-  std::ostringstream oss;
-  Fprintf(oss, fmt, args...);
-  return oss.str();
-}
-
-template <typename... Args>
-void Printf(const char* fmt, const Args&... args) {
-  Fprintf(std::cout, fmt, args...);
-}
-
-inline std::string HumanReadableSize(double f_size) {
-  size_t i = 0;
-  double orig = f_size;
-  const std::vector<std::string> units(
-      {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
-  while (f_size >= 1024) {
-    f_size /= 1024;
-    i++;
-  }
-  if (i >= units.size()) {
-    return Sprintf("%fB", orig);
-  }
-  return Sprintf("%f%s", f_size, units[i]);
-}
-
-}  // namespace string
-}  // namespace paddle
diff --git a/paddle/fluid/string/printf_test.cc b/paddle/fluid/string/printf_test.cc
deleted file mode 100644
index 544b12ef3a877a6e84c136433799301edaa4abdf..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/printf_test.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/string/printf.h"
-
-#include <string>
-
-#include "gtest/gtest.h"
-
-TEST(StringPrintf, StringPrintf) {
-  std::string weekday = "Wednesday";
-  const char* month = "July";
-  size_t day = 27;
-  int hour = 14;
-  int min = 44;
-  EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
-            paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
-                                    hour, min));
-  EXPECT_EQ(std::string(""), paddle::string::Sprintf());
-}
diff --git a/paddle/fluid/string/split.h b/paddle/fluid/string/split.h
deleted file mode 100644
index ccb96b8a9cb68f03acbca592a2149ba5001f34d2..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/split.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <sstream>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace string {
-
-static inline std::vector<std::string> Split(std::string const& original,
-                                             char separator) {
-  std::vector<std::string> results;
-  std::string token;
-  std::istringstream is(original);
-  while (std::getline(is, token, separator)) {
-    if (!token.empty()) {
-      results.push_back(token);
-    }
-  }
-  return results;
-}
-
-}  // namespace string
-}  // namespace paddle
diff --git a/paddle/fluid/string/split_test.cc b/paddle/fluid/string/split_test.cc
deleted file mode 100644
index c85dc1eed40dbe25d922c0f4810a747d1bd2d60f..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/split_test.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/string/split.h"
-
-#include <string>
-
-#include "gtest/gtest.h"
-
-TEST(StringSplit, StringSplit) {
-  std::string to_split = "0,1,2,3,4,5";
-  int i = 0;
-  for (auto s : paddle::string::Split(to_split, ',')) {
-    EXPECT_EQ(atoi(s.c_str()), i);
-    i++;
-  }
-}
diff --git a/paddle/fluid/string/string_helper.cc b/paddle/fluid/string/string_helper.cc
deleted file mode 100644
index 27708b8eebd2131ebadcc310fd3521ad5ab824f3..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/string_helper.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/string/string_helper.h"
-#include <ctype.h>
-#include <stdio.h>
-#include <cstring>
-#include <string>
-#include <vector>
-#include "boost/lexical_cast.hpp"
-#include "glog/logging.h"
-
-namespace paddle {
-namespace string {
-
-inline size_t count_spaces(const char* s) {
-  size_t count = 0;
-
-  while (*s != 0 && isspace(*s++)) {
-    count++;
-  }
-
-  return count;
-}
-
-inline size_t count_nonspaces(const char* s) {
-  size_t count = 0;
-
-  while (*s != 0 && !isspace(*s++)) {
-    count++;
-  }
-
-  return count;
-}
-
-// remove leading and tailing spaces
-std::string trim_spaces(const std::string& str) {
-  const char* p = str.c_str();
-
-  while (*p != 0 && isspace(*p)) {
-    p++;
-  }
-
-  size_t len = strlen(p);
-
-  while (len > 0 && isspace(p[len - 1])) {
-    len--;
-  }
-
-  return std::string(p, len);
-}
-
-inline int str_to_float(const char* str, float* v) {
-  const char* head = str;
-  char* cursor = NULL;
-  int index = 0;
-  while (*(head += count_spaces(head)) != 0) {
-    v[index++] = std::strtof(head, &cursor);
-    if (head == cursor) {
-      break;
-    }
-    head = cursor;
-  }
-  return index;
-}
-
-// A helper class for reading lines from file.
-// A line buffer is maintained. It
-// doesn't need to know the maximum possible length of a line.
-char* LineFileReader::getdelim(FILE* f, char delim) {
-#ifndef _WIN32
-  int32_t ret = ::getdelim(&_buffer, &_buf_size, delim, f);
-
-  if (ret >= 0) {
-    if (ret >= 1 && _buffer[ret - 1] == delim) {
-      _buffer[--ret] = 0;
-    }
-
-    _length = (size_t)ret;
-    return _buffer;
-  } else {
-    _length = 0;
-    CHECK(feof(f));
-    return NULL;
-  }
-#else
-  return NULL;
-#endif
-}
-
-}  // end namespace string
-}  // end namespace paddle
diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h
deleted file mode 100644
index cc09088c7ee8a38e3b3a7a673c9da2e21cf42485..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/string_helper.h
+++ /dev/null
@@ -1,159 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <ctype.h>
-#include <stdio.h>
-#include <cstring>
-#include <string>
-#include <utility>
-#include <vector>
-#include "boost/lexical_cast.hpp"
-#include "glog/logging.h"
-
-namespace paddle {
-namespace string {
-
-inline size_t count_spaces(const char* s);
-
-inline size_t count_nonspaces(const char* s);
-
-template <class... ARGS>
-void format_string_append(std::string& str, const char* fmt,  // NOLINT
-                          ARGS&&... args) {
-  int len = snprintf(NULL, 0, fmt, args...);
-  CHECK_GE(len, 0);
-  size_t oldlen = str.length();
-  str.resize(oldlen + len + 1);
-  CHECK(snprintf(&str[oldlen], (size_t)len + 1, fmt, args...) == len);
-  str.resize(oldlen + len);
-}
-
-template <class... ARGS>
-void format_string_append(std::string& str, const std::string& fmt,  // NOLINT
-                          ARGS&&... args) {
-  format_string_append(str, fmt.c_str(), args...);
-}
-
-template <class... ARGS>
-std::string format_string(const char* fmt, ARGS&&... args) {
-  std::string str;
-  format_string_append(str, fmt, args...);
-  return std::move(str);
-}
-
-template <class... ARGS>
-std::string format_string(const std::string& fmt, ARGS&&... args) {
-  return format_string(fmt.c_str(), args...);
-}
-
-// remove leading and tailing spaces
-std::string trim_spaces(const std::string& str);
-
-int str_to_float(const char* str, float* v);
-
-// split string by delim
-template <class T = std::string>
-std::vector<T> split_string(const std::string& str, const std::string& delim) {
-  size_t pre_pos = 0;
-  size_t pos = 0;
-  std::string tmp_str;
-  std::vector<T> res_list;
-  res_list.clear();
-  if (str.empty()) {
-    return res_list;
-  }
-  while ((pos = str.find(delim, pre_pos)) != std::string::npos) {
-    tmp_str.assign(str, pre_pos, pos - pre_pos);
-    res_list.push_back(tmp_str);
-    pre_pos = pos + 1;
-  }
-  tmp_str.assign(str, pre_pos, str.length() - pre_pos);
-  if (!tmp_str.empty()) {
-    res_list.push_back(tmp_str);
-  }
-  return res_list;
-}
-
-// split string by spaces. Leading and tailing spaces are ignored. Consecutive
-// spaces are treated as one delim.
-template <class T = std::string>
-std::vector<T> split_string(const std::string& str) {
-  std::vector<T> list;
-  const char* p;
-  int pre_pos = 0;
-  int pos = 0;
-  std::string tmp_str;
-  if (str.empty()) {
-    return list;
-  }
-  for (p = str.c_str(); *p != 0;) {
-    if (!isspace(*p)) {
-      pos = pre_pos;
-      p++;
-
-      while (*p != 0 && !isspace(*p)) {
-        pos++;
-        p++;
-      }
-      tmp_str.assign(str, pre_pos, pos - pre_pos + 1);
-      list.push_back(tmp_str);
-      pre_pos = pos + 1;
-    } else {
-      pre_pos++;
-      p++;
-    }
-  }
-  return list;
-}
-
-template <class Container>
-std::string join_strings(const Container& strs, char delim) {
-  std::string str;
-
-  int i = 0;
-  for (auto& elem : strs) {
-    if (i > 0) {
-      str += delim;
-    }
-
-    str += boost::lexical_cast<std::string>(elem);
-    ++i;
-  }
-
-  return str;
-}
-
-// A helper class for reading lines from file. A line buffer is maintained. It
-// doesn't need to know the maximum possible length of a line.
-
-class LineFileReader {
- public:
-  LineFileReader() {}
-  LineFileReader(LineFileReader&&) = delete;
-  LineFileReader(const LineFileReader&) = delete;
-  ~LineFileReader() { ::free(_buffer); }
-  char* getline(FILE* f) { return this->getdelim(f, '\n'); }
-  char* getdelim(FILE* f, char delim);
-  char* get() { return _buffer; }
-  size_t length() { return _length; }
-
- private:
-  char* _buffer = NULL;
-  size_t _buf_size = 0;
-  size_t _length = 0;
-};
-}  // end namespace string
-}  // end namespace paddle
diff --git a/paddle/fluid/string/tinyformat/tinyformat.h b/paddle/fluid/string/tinyformat/tinyformat.h
deleted file mode 100644
index a5c1798e1002759a4ea560747453b0e9836b9624..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/tinyformat/tinyformat.h
+++ /dev/null
@@ -1,892 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// tinyformat.h
-// Copyright (C) 2011, Chris Foster [chris42f (at) gmail (d0t) com]
-//
-// Boost Software License - Version 1.0
-//
-// Permission is hereby granted, free of charge, to any person or organization
-// obtaining a copy of the software and accompanying documentation covered by
-// this license (the "Software") to use, reproduce, display, distribute,
-// execute, and transmit the Software, and to prepare derivative works of the
-// Software, and to permit third-parties to whom the Software is furnished to
-// do so, all subject to the following:
-//
-// The copyright notices in the Software and this entire statement, including
-// the above license grant, this restriction and the following disclaimer,
-// must be included in all copies of the Software, in whole or in part, and
-// all derivative works of the Software, unless such copies or derivative
-// works are solely in the form of machine-executable object code generated by
-// a source language processor.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS IN THE SOFTWARE.
-
-//------------------------------------------------------------------------------
-// Tinyformat: A minimal type safe printf replacement
-//
-// tinyformat.h is a type safe printf replacement library in a single C++
-// header file.  Design goals include:
-//
-// * Type safety and extensibility for user defined types.
-// * C99 printf() compatibility, to the extent possible using std::ostream
-// * Simplicity and minimalism.  A single header file to include and distribute
-//   with your projects.
-// * Augment rather than replace the standard stream formatting mechanism
-// * C++98 support, with optional C++11 niceties
-//
-//
-// Main interface example usage
-// ----------------------------
-//
-// To print a date to std::cout:
-//
-//   std::string weekday = "Wednesday";
-//   const char* month = "July";
-//   size_t day = 27;
-//   long hour = 14;
-//   int min = 44;
-//
-//   tfm::printf("%s, %s %d, %.2d:%.2d\n", weekday, month, day, hour, min);
-//
-// The strange types here emphasize the type safety of the interface; it is
-// possible to print a std::string using the "%s" conversion, and a
-// size_t using the "%d" conversion.  A similar result could be achieved
-// using either of the tfm::format() functions.  One prints on a user provided
-// stream:
-//
-//   tfm::format(std::cerr, "%s, %s %d, %.2d:%.2d\n",
-//               weekday, month, day, hour, min);
-//
-// The other returns a std::string:
-//
-//   std::string date = tfm::format("%s, %s %d, %.2d:%.2d\n",
-//                                  weekday, month, day, hour, min);
-//   std::cout << date;
-//
-// These are the three primary interface functions.  There is also a
-// convenience function printfln() which appends a newline to the usual result
-// of printf() for super simple logging.
-//
-//
-// User defined format functions
-// -----------------------------
-//
-// Simulating variadic templates in C++98 is pretty painful since it requires
-// writing out the same function for each desired number of arguments.  To make
-// this bearable tinyformat comes with a set of macros which are used
-// internally to generate the API, but which may also be used in user code.
-//
-// The three macros TINYFORMAT_ARGTYPES(n), TINYFORMAT_VARARGS(n) and
-// TINYFORMAT_PASSARGS(n) will generate a list of n argument types,
-// type/name pairs and argument names respectively when called with an integer
-// n between 1 and 16.  We can use these to define a macro which generates the
-// desired user defined function with n arguments.  To generate all 16 user
-// defined function bodies, use the macro TINYFORMAT_FOREACH_ARGNUM.  For an
-// example, see the implementation of printf() at the end of the source file.
-//
-// Sometimes it's useful to be able to pass a list of format arguments through
-// to a non-template function.  The FormatList class is provided as a way to do
-// this by storing the argument list in a type-opaque way.  Continuing the
-// example from above, we construct a FormatList using makeFormatList():
-//
-//   FormatListRef formatList = tfm::makeFormatList(weekday, month, day, hour,
-//   min);
-//
-// The format list can now be passed into any non-template function and used
-// via a call to the vformat() function:
-//
-//   tfm::vformat(std::cout, "%s, %s %d, %.2d:%.2d\n", formatList);
-//
-//
-// Additional API information
-// --------------------------
-//
-// Error handling: Define TINYFORMAT_ERROR to customize the error handling for
-// format strings which are unsupported or have the wrong number of format
-// specifiers (calls assert() by default).
-//
-// User defined types: Uses operator<< for user defined types by default.
-// Overload formatValue() for more control.
-
-#pragma once
-
-#include <algorithm>
-#include <cassert>
-#include <iostream>
-#include <sstream>
-
-namespace paddle {
-namespace string {
-namespace tinyformat {
-
-#ifndef TINYFORMAT_ERROR
-#define TINYFORMAT_ERROR(reason) assert(0 && reason)
-#endif
-
-//------------------------------------------------------------------------------
-namespace detail {
-
-// Test whether type T1 is convertible to type T2
-template <typename T1, typename T2>
-struct is_convertible {
- private:
-  // two types of different size
-  struct fail {
-    char dummy[2];
-  };
-  struct succeed {
-    char dummy;
-  };
-  // Try to convert a T1 to a T2 by plugging into tryConvert
-  static fail tryConvert(...);
-  static succeed tryConvert(const T2 &);
-  static const T1 &makeT1();
-
- public:
-  // Standard trick: the (...) version of tryConvert will be chosen from
-  // the overload set only if the version taking a T2 doesn't match.
-  // Then we compare the sizes of the return types to check which
-  // function matched.  Very neat, in a disgusting kind of way :)
-  static const bool value = sizeof(tryConvert(makeT1())) == sizeof(succeed);
-};
-
-// Format the value by casting to type fmtT.  This default implementation
-// should never be called.
-template <typename T, typename fmtT,
-          bool convertible = is_convertible<T, fmtT>::value>
-struct formatValueAsType {
-  static void invoke(std::ostream & /*out*/, const T & /*value*/) { assert(0); }
-};
-// Specialized version for types that can actually be converted to fmtT, as
-// indicated by the "convertible" template parameter.
-template <typename T, typename fmtT>
-struct formatValueAsType<T, fmtT, true> {
-  static void invoke(std::ostream &out, const T &value) {
-    out << static_cast<fmtT>(value);
-  }
-};
-
-// Convert an arbitrary type to integer.  The version with convertible=false
-// throws an error.
-template <typename T, bool convertible = is_convertible<T, int>::value>
-struct convertToInt {
-  static int invoke(const T & /*value*/) {
-    TINYFORMAT_ERROR(
-        "tinyformat: Cannot convert from argument type to "
-        "integer for use as variable width or precision");
-    return 0;
-  }
-};
-// Specialization for convertToInt when conversion is possible
-template <typename T>
-struct convertToInt<T, true> {
-  static int invoke(const T &value) { return static_cast<int>(value); }
-};
-
-// Format at most ntrunc characters to the given stream.
-template <typename T>
-inline void formatTruncated(std::ostream &out, const T &value, int ntrunc) {
-  std::ostringstream tmp;
-  tmp << value;
-  std::string result = tmp.str();
-  out.write(result.c_str(),
-            (std::min)(ntrunc, static_cast<int>(result.size())));
-}
-#define TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(type)                       \
-  inline void formatTruncated(std::ostream &out, type *value, int ntrunc) { \
-    std::streamsize len = 0;                                                \
-    while (len < ntrunc && value[len] != 0) ++len;                          \
-    out.write(value, len);                                                  \
-  }
-// Overload for const char* and char*.  Could overload for signed & unsigned
-// char too, but these are technically unneeded for printf compatibility.
-TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(const char)
-TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char)
-#undef TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR
-
-}  // namespace detail
-
-//------------------------------------------------------------------------------
-// Variable formatting functions.  May be overridden for user-defined types if
-// desired.
-
-/// Format a value into a stream, delegating to operator<< by default.
-///
-/// Users may override this for their own types.  When this function is called,
-/// the stream flags will have been modified according to the format string.
-/// The format specification is provided in the range [fmtBegin, fmtEnd).  For
-/// truncating conversions, ntrunc is set to the desired maximum number of
-/// characters, for example "%.7s" calls formatValue with ntrunc = 7.
-///
-/// By default, formatValue() uses the usual stream insertion operator
-/// operator<< to format the type T, with special cases for the %c and %p
-/// conversions.
-template <typename T>
-inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,
-                        const char *fmtEnd, int ntrunc, const T &value) {
-  // The mess here is to support the %c and %p conversions: if these
-  // conversions are active we try to convert the type to a char or const
-  // void* respectively and format that instead of the value itself.  For the
-  // %p conversion it's important to avoid dereferencing the pointer, which
-  // could otherwise lead to a crash when printing a dangling (const char*).
-  const bool canConvertToChar = detail::is_convertible<T, char>::value;
-  const bool canConvertToVoidPtr =
-      detail::is_convertible<T, const void *>::value;
-  if (canConvertToChar && *(fmtEnd - 1) == 'c')
-    detail::formatValueAsType<T, char>::invoke(out, value);
-  else if (canConvertToVoidPtr && *(fmtEnd - 1) == 'p')
-    detail::formatValueAsType<T, const void *>::invoke(out, value);
-  else if (ntrunc >= 0) {
-    // Take care not to overread C strings in truncating conversions like
-    // "%.4s" where at most 4 characters may be read.
-    detail::formatTruncated(out, value, ntrunc);
-  } else
-    out << value;
-}
-
-// Overloaded version for char types to support printing as an integer
-#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType)                      \
-  inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,   \
-                          const char *fmtEnd, int /**/, charType value) { \
-    switch (*(fmtEnd - 1)) {                                              \
-      case 'u':                                                           \
-      case 'd':                                                           \
-      case 'i':                                                           \
-      case 'o':                                                           \
-      case 'X':                                                           \
-      case 'x':                                                           \
-        out << static_cast<int>(value);                                   \
-        break;                                                            \
-      default:                                                            \
-        out << value;                                                     \
-        break;                                                            \
-    }                                                                     \
-  }
-// per 3.9.1: char, signed char and unsigned char are all distinct types
-TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char)
-TINYFORMAT_DEFINE_FORMATVALUE_CHAR(signed char)
-TINYFORMAT_DEFINE_FORMATVALUE_CHAR(unsigned char)
-#undef TINYFORMAT_DEFINE_FORMATVALUE_CHAR
-
-//------------------------------------------------------------------------------
-// Tools for emulating variadic templates in C++98.  The basic idea here is
-// stolen from the boost preprocessor metaprogramming library and cut down to
-// be just general enough for what we need.
-
-#define TINYFORMAT_ARGTYPES(n) TINYFORMAT_ARGTYPES_##n
-#define TINYFORMAT_VARARGS(n) TINYFORMAT_VARARGS_##n
-#define TINYFORMAT_PASSARGS(n) TINYFORMAT_PASSARGS_##n
-#define TINYFORMAT_PASSARGS_TAIL(n) TINYFORMAT_PASSARGS_TAIL_##n
-
-// To keep it as transparent as possible, the macros below have been generated
-// using python via the excellent cog.py code generation script.  This avoids
-// the need for a bunch of complex (but more general) preprocessor tricks as
-// used in boost.preprocessor.
-//
-// To rerun the code generation in place, use `cog.py -r tinyformat.h`
-// (see http://nedbatchelder.com/code/cog).  Alternatively you can just create
-// extra versions by hand.
-
-/*[[[cog
-maxParams = 16
-
-def makeCommaSepLists(lineTemplate, elemTemplate, startInd=1):
-    for j in range(startInd,maxParams+1):
-        list = ', '.join([elemTemplate % {'i':i} for i in range(startInd,j+1)])
-        cog.outl(lineTemplate % {'j':j, 'list':list})
-
-makeCommaSepLists('#define TINYFORMAT_ARGTYPES_%(j)d %(list)s',
-                  'class T%(i)d')
-
-cog.outl()
-makeCommaSepLists('#define TINYFORMAT_VARARGS_%(j)d %(list)s',
-                  'const T%(i)d& v%(i)d')
-
-cog.outl()
-makeCommaSepLists('#define TINYFORMAT_PASSARGS_%(j)d %(list)s', 'v%(i)d')
-
-cog.outl()
-cog.outl('#define TINYFORMAT_PASSARGS_TAIL_1')
-makeCommaSepLists('#define TINYFORMAT_PASSARGS_TAIL_%(j)d , %(list)s',
-                  'v%(i)d', startInd = 2)
-
-cog.outl()
-cog.outl('#define TINYFORMAT_FOREACH_ARGNUM(m) \\\n    ' +
-         ' '.join(['m(%d)' % (j,) for j in range(1,maxParams+1)]))
-]]]*/
-#define TINYFORMAT_ARGTYPES_1 class T1
-#define TINYFORMAT_ARGTYPES_2 class T1, class T2
-#define TINYFORMAT_ARGTYPES_3 class T1, class T2, class T3
-#define TINYFORMAT_ARGTYPES_4 class T1, class T2, class T3, class T4
-#define TINYFORMAT_ARGTYPES_5 class T1, class T2, class T3, class T4, class T5
-#define TINYFORMAT_ARGTYPES_6 \
-  class T1, class T2, class T3, class T4, class T5, class T6
-#define TINYFORMAT_ARGTYPES_7 \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7
-#define TINYFORMAT_ARGTYPES_8 \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8
-#define TINYFORMAT_ARGTYPES_9                                           \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
-      class T8, class T9
-#define TINYFORMAT_ARGTYPES_10                                          \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
-      class T8, class T9, class T10
-#define TINYFORMAT_ARGTYPES_11                                          \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
-      class T8, class T9, class T10, class T11
-#define TINYFORMAT_ARGTYPES_12                                          \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
-      class T8, class T9, class T10, class T11, class T12
-#define TINYFORMAT_ARGTYPES_13                                          \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
-      class T8, class T9, class T10, class T11, class T12, class T13
-#define TINYFORMAT_ARGTYPES_14                                          \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
-      class T8, class T9, class T10, class T11, class T12, class T13,   \
-      class T14
-#define TINYFORMAT_ARGTYPES_15                                          \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
-      class T8, class T9, class T10, class T11, class T12, class T13,   \
-      class T14, class T15
-#define TINYFORMAT_ARGTYPES_16                                          \
-  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
-      class T8, class T9, class T10, class T11, class T12, class T13,   \
-      class T14, class T15, class T16
-
-#define TINYFORMAT_VARARGS_1 const T1 &v1
-#define TINYFORMAT_VARARGS_2 const T1 &v1, const T2 &v2
-#define TINYFORMAT_VARARGS_3 const T1 &v1, const T2 &v2, const T3 &v3
-#define TINYFORMAT_VARARGS_4 \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4
-#define TINYFORMAT_VARARGS_5 \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5
-#define TINYFORMAT_VARARGS_6                                            \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
-      const T6 &v6
-#define TINYFORMAT_VARARGS_7                                            \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
-      const T6 &v6, const T7 &v7
-#define TINYFORMAT_VARARGS_8                                            \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
-      const T6 &v6, const T7 &v7, const T8 &v8
-#define TINYFORMAT_VARARGS_9                                            \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
-      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9
-#define TINYFORMAT_VARARGS_10                                           \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
-      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10
-#define TINYFORMAT_VARARGS_11                                                 \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
-      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
-      const T11 &v11
-#define TINYFORMAT_VARARGS_12                                                 \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
-      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
-      const T11 &v11, const T12 &v12
-#define TINYFORMAT_VARARGS_13                                                 \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
-      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
-      const T11 &v11, const T12 &v12, const T13 &v13
-#define TINYFORMAT_VARARGS_14                                                 \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
-      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
-      const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14
-#define TINYFORMAT_VARARGS_15                                                 \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
-      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
-      const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14,         \
-      const T15 &v15
-#define TINYFORMAT_VARARGS_16                                                 \
-  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
-      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
-      const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14,         \
-      const T15 &v15, const T16 &v16
-
-#define TINYFORMAT_PASSARGS_1 v1
-#define TINYFORMAT_PASSARGS_2 v1, v2
-#define TINYFORMAT_PASSARGS_3 v1, v2, v3
-#define TINYFORMAT_PASSARGS_4 v1, v2, v3, v4
-#define TINYFORMAT_PASSARGS_5 v1, v2, v3, v4, v5
-#define TINYFORMAT_PASSARGS_6 v1, v2, v3, v4, v5, v6
-#define TINYFORMAT_PASSARGS_7 v1, v2, v3, v4, v5, v6, v7
-#define TINYFORMAT_PASSARGS_8 v1, v2, v3, v4, v5, v6, v7, v8
-#define TINYFORMAT_PASSARGS_9 v1, v2, v3, v4, v5, v6, v7, v8, v9
-#define TINYFORMAT_PASSARGS_10 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10
-#define TINYFORMAT_PASSARGS_11 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
-#define TINYFORMAT_PASSARGS_12 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
-#define TINYFORMAT_PASSARGS_13 \
-  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13
-#define TINYFORMAT_PASSARGS_14 \
-  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
-#define TINYFORMAT_PASSARGS_15 \
-  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
-#define TINYFORMAT_PASSARGS_16 \
-  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16
-
-#define TINYFORMAT_PASSARGS_TAIL_1
-#define TINYFORMAT_PASSARGS_TAIL_2 , v2
-#define TINYFORMAT_PASSARGS_TAIL_3 , v2, v3
-#define TINYFORMAT_PASSARGS_TAIL_4 , v2, v3, v4
-#define TINYFORMAT_PASSARGS_TAIL_5 , v2, v3, v4, v5
-#define TINYFORMAT_PASSARGS_TAIL_6 , v2, v3, v4, v5, v6
-#define TINYFORMAT_PASSARGS_TAIL_7 , v2, v3, v4, v5, v6, v7
-#define TINYFORMAT_PASSARGS_TAIL_8 , v2, v3, v4, v5, v6, v7, v8
-#define TINYFORMAT_PASSARGS_TAIL_9 , v2, v3, v4, v5, v6, v7, v8, v9
-#define TINYFORMAT_PASSARGS_TAIL_10 , v2, v3, v4, v5, v6, v7, v8, v9, v10
-#define TINYFORMAT_PASSARGS_TAIL_11 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
-#define TINYFORMAT_PASSARGS_TAIL_12 \
-  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
-#define TINYFORMAT_PASSARGS_TAIL_13 \
-  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13
-#define TINYFORMAT_PASSARGS_TAIL_14 \
-  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
-#define TINYFORMAT_PASSARGS_TAIL_15 \
-  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
-#define TINYFORMAT_PASSARGS_TAIL_16 \
-  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16
-
-#define TINYFORMAT_FOREACH_ARGNUM(m)                                         \
-  m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) m(10) m(11) m(12) m(13) m(14) \
-      m(15) m(16)
-//[[[end]]]
-
-namespace detail {
-
-// Type-opaque holder for an argument to format(), with associated actions on
-// the type held as explicit function pointers.  This allows FormatArg's for
-// each argument to be allocated as a homogenous array inside FormatList
-// whereas a naive implementation based on inheritance does not.
-class FormatArg {
- public:
-  FormatArg() {}
-
-  template <typename T>
-  FormatArg(const T &value)
-      : m_value(static_cast<const void *>(&value)),
-        m_formatImpl(&formatImpl<T>),
-        m_toIntImpl(&toIntImpl<T>) {}
-
-  void format(std::ostream &out, const char *fmtBegin, const char *fmtEnd,
-              int ntrunc) const {
-    m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value);
-  }
-
-  int toInt() const { return m_toIntImpl(m_value); }
-
- private:
-  template <typename T>
-  static void formatImpl(std::ostream &out, const char *fmtBegin,
-                         const char *fmtEnd, int ntrunc, const void *value) {
-    formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast<const T *>(value));
-  }
-
-  template <typename T>
-  static int toIntImpl(const void *value) {
-    return convertToInt<T>::invoke(*static_cast<const T *>(value));
-  }
-
-  const void *m_value;
-  void (*m_formatImpl)(std::ostream &out, const char *fmtBegin,
-                       const char *fmtEnd, int ntrunc, const void *value);
-  int (*m_toIntImpl)(const void *value);
-};
-
-// Parse and return an integer from the string c, as atoi()
-// On return, c is set to one past the end of the integer.
-inline int parseIntAndAdvance(const char *&c) {
-  int i = 0;
-  for (; *c >= '0' && *c <= '9'; ++c) i = 10 * i + (*c - '0');
-  return i;
-}
-
-// Print literal part of format string and return next format spec
-// position.
-//
-// Skips over any occurrences of '%%', printing a literal '%' to the
-// output.  The position of the first % character of the next
-// nontrivial format spec is returned, or the end of string.
-inline const char *printFormatStringLiteral(std::ostream &out,
-                                            const char *fmt) {
-  const char *c = fmt;
-  for (;; ++c) {
-    switch (*c) {
-      case '\0':
-        out.write(fmt, c - fmt);
-        return c;
-      case '%':
-        out.write(fmt, c - fmt);
-        if (*(c + 1) != '%') return c;
-        // for "%%", tack trailing % onto next literal section.
-        fmt = ++c;
-        break;
-      default:
-        break;
-    }
-  }
-}
-
-// Parse a format string and set the stream state accordingly.
-//
-// The format mini-language recognized here is meant to be the one from C99,
-// with the form "%[flags][width][.precision][length]type".
-//
-// Formatting options which can't be natively represented using the ostream
-// state are returned in spacePadPositive (for space padded positive numbers)
-// and ntrunc (for truncating conversions).  argIndex is incremented if
-// necessary to pull out variable width and precision .  The function returns a
-// pointer to the character after the end of the current format spec.
-inline const char *streamStateFromFormat(std::ostream &out,
-                                         bool &spacePadPositive, int &ntrunc,
-                                         const char *fmtStart,
-                                         const detail::FormatArg *formatters,
-                                         int &argIndex, int numFormatters) {
-  if (*fmtStart != '%') {
-    TINYFORMAT_ERROR(
-        "tinyformat: Not enough conversion specifiers in format string");
-    return fmtStart;
-  }
-  // Reset stream state to defaults.
-  out.width(0);
-  out.precision(6);
-  out.fill(' ');
-  // Reset most flags; ignore irrelevant unitbuf & skipws.
-  out.unsetf(std::ios::adjustfield | std::ios::basefield |
-             std::ios::floatfield | std::ios::showbase | std::ios::boolalpha |
-             std::ios::showpoint | std::ios::showpos | std::ios::uppercase);
-  bool precisionSet = false;
-  bool widthSet = false;
-  int widthExtra = 0;
-  const char *c = fmtStart + 1;
-  // 1) Parse flags
-  for (;; ++c) {
-    switch (*c) {
-      case '#':
-        out.setf(std::ios::showpoint | std::ios::showbase);
-        continue;
-      case '0':
-        // overridden by left alignment ('-' flag)
-        if (!(out.flags() & std::ios::left)) {
-          // Use internal padding so that numeric values are
-          // formatted correctly, eg -00010 rather than 000-10
-          out.fill('0');
-          out.setf(std::ios::internal, std::ios::adjustfield);
-        }
-        continue;
-      case '-':
-        out.fill(' ');
-        out.setf(std::ios::left, std::ios::adjustfield);
-        continue;
-      case ' ':
-        // overridden by show positive sign, '+' flag.
-        if (!(out.flags() & std::ios::showpos)) spacePadPositive = true;
-        continue;
-      case '+':
-        out.setf(std::ios::showpos);
-        spacePadPositive = false;
-        widthExtra = 1;
-        continue;
-      default:
-        break;
-    }
-    break;
-  }
-  // 2) Parse width
-  if (*c >= '0' && *c <= '9') {
-    widthSet = true;
-    out.width(parseIntAndAdvance(c));
-  }
-  if (*c == '*') {
-    widthSet = true;
-    int width = 0;
-    if (argIndex < numFormatters)
-      width = formatters[argIndex++].toInt();
-    else
-      TINYFORMAT_ERROR(
-          "tinyformat: Not enough arguments to read variable width");
-    if (width < 0) {
-      // negative widths correspond to '-' flag set
-      out.fill(' ');
-      out.setf(std::ios::left, std::ios::adjustfield);
-      width = -width;
-    }
-    out.width(width);
-    ++c;
-  }
-  // 3) Parse precision
-  if (*c == '.') {
-    ++c;
-    int precision = 0;
-    if (*c == '*') {
-      ++c;
-      if (argIndex < numFormatters)
-        precision = formatters[argIndex++].toInt();
-      else
-        TINYFORMAT_ERROR(
-            "tinyformat: Not enough arguments to read variable precision");
-    } else {
-      if (*c >= '0' && *c <= '9')
-        precision = parseIntAndAdvance(c);
-      else if (*c == '-')  // negative precisions ignored, treated as zero.
-        parseIntAndAdvance(++c);
-    }
-    out.precision(precision);
-    precisionSet = true;
-  }
-  // 4) Ignore any C99 length modifier
-  while (*c == 'l' || *c == 'h' || *c == 'L' || *c == 'j' || *c == 'z' ||
-         *c == 't')
-    ++c;
-  // 5) We're up to the conversion specifier character.
-  // Set stream flags based on conversion specifier (thanks to the
-  // boost::format class for forging the way here).
-  bool intConversion = false;
-  switch (*c) {
-    case 'u':
-    case 'd':
-    case 'i':
-      out.setf(std::ios::dec, std::ios::basefield);
-      intConversion = true;
-      break;
-    case 'o':
-      out.setf(std::ios::oct, std::ios::basefield);
-      intConversion = true;
-      break;
-    case 'X':
-      out.setf(std::ios::uppercase);
-    case 'x':
-    case 'p':
-      out.setf(std::ios::hex, std::ios::basefield);
-      intConversion = true;
-      break;
-    case 'E':
-      out.setf(std::ios::uppercase);
-    case 'e':
-      out.setf(std::ios::scientific, std::ios::floatfield);
-      out.setf(std::ios::dec, std::ios::basefield);
-      break;
-    case 'F':
-      out.setf(std::ios::uppercase);
-    case 'f':
-      out.setf(std::ios::fixed, std::ios::floatfield);
-      break;
-    case 'G':
-      out.setf(std::ios::uppercase);
-    case 'g':
-      out.setf(std::ios::dec, std::ios::basefield);
-      // As in boost::format, let stream decide float format.
-      out.flags(out.flags() & ~std::ios::floatfield);
-      break;
-    case 'a':
-    case 'A':
-      TINYFORMAT_ERROR(
-          "tinyformat: the %a and %A conversion specs "
-          "are not supported");
-      break;
-    case 'c':
-      // Handled as special case inside formatValue()
-      break;
-    case 's':
-      if (precisionSet) ntrunc = static_cast<int>(out.precision());
-      // Make %s print booleans as "true" and "false"
-      out.setf(std::ios::boolalpha);
-      break;
-    case 'n':
-      // Not supported - will cause problems!
-      TINYFORMAT_ERROR("tinyformat: %n conversion spec not supported");
-      break;
-    case '\0':
-      TINYFORMAT_ERROR(
-          "tinyformat: Conversion spec incorrectly "
-          "terminated by end of string");
-      return c;
-    default:
-      break;
-  }
-  if (intConversion && precisionSet && !widthSet) {
-    // "precision" for integers gives the minimum number of digits (to be
-    // padded with zeros on the left).  This isn't really supported by the
-    // iostreams, but we can approximately simulate it with the width if
-    // the width isn't otherwise used.
-    out.width(out.precision() + widthExtra);
-    out.setf(std::ios::internal, std::ios::adjustfield);
-    out.fill('0');
-  }
-  return c + 1;
-}
-
-//------------------------------------------------------------------------------
-inline void formatImpl(std::ostream &out, const char *fmt,
-                       const detail::FormatArg *formatters, int numFormatters) {
-  // Saved stream state
-  std::streamsize origWidth = out.width();
-  std::streamsize origPrecision = out.precision();
-  std::ios::fmtflags origFlags = out.flags();
-  char origFill = out.fill();
-
-  for (int argIndex = 0; argIndex < numFormatters; ++argIndex) {
-    // Parse the format string
-    fmt = printFormatStringLiteral(out, fmt);
-    bool spacePadPositive = false;
-    int ntrunc = -1;
-    const char *fmtEnd =
-        streamStateFromFormat(out, spacePadPositive, ntrunc, fmt, formatters,
-                              argIndex, numFormatters);
-    if (argIndex >= numFormatters) {
-      // Check args remain after reading any variable width/precision
-      TINYFORMAT_ERROR("tinyformat: Not enough format arguments");
-      return;
-    }
-    const FormatArg &arg = formatters[argIndex];
-    // Format the arg into the stream.
-    if (!spacePadPositive)
-      arg.format(out, fmt, fmtEnd, ntrunc);
-    else {
-      // The following is a special case with no direct correspondence
-      // between stream formatting and the printf() behaviour.  Simulate
-      // it crudely by formatting into a temporary string stream and
-      // munging the resulting string.
-      std::ostringstream tmpStream;
-      tmpStream.copyfmt(out);
-      tmpStream.setf(std::ios::showpos);
-      arg.format(tmpStream, fmt, fmtEnd, ntrunc);
-      std::string result = tmpStream.str();  // allocates... yuck.
-      for (size_t i = 0, iend = result.size(); i < iend; ++i)
-        if (result[i] == '+') result[i] = ' ';
-      out << result;
-    }
-    fmt = fmtEnd;
-  }
-
-  // Print remaining part of format string.
-  fmt = printFormatStringLiteral(out, fmt);
-  if (*fmt != '\0')
-    TINYFORMAT_ERROR(
-        "tinyformat: Too many conversion specifiers in format string");
-
-  // Restore stream state
-  out.width(origWidth);
-  out.precision(origPrecision);
-  out.flags(origFlags);
-  out.fill(origFill);
-}
-
-}  // namespace detail
-
-/// List of template arguments format(), held in a type-opaque way.
-///
-/// A const reference to FormatList (typedef'd as FormatListRef) may be
-/// conveniently used to pass arguments to non-template functions: All type
-/// information has been stripped from the arguments, leaving just enough of a
-/// common interface to perform formatting as required.
-class FormatList {
- public:
-  FormatList(detail::FormatArg *formatters, int N)
-      : m_formatters(formatters), m_N(N) {}
-
-  friend void vformat(std::ostream &out, const char *fmt,
-                      const FormatList &list);
-
- private:
-  const detail::FormatArg *m_formatters;
-  int m_N;
-};
-
-/// Reference to type-opaque format list for passing to vformat()
-typedef const FormatList &FormatListRef;
-
-namespace detail {
-
-// Format list subclass with fixed storage to avoid dynamic allocation
-template <int N>
-class FormatListN : public FormatList {
- public:
-  template <typename... Args>
-  FormatListN(const Args &... args)
-      : FormatList(&m_formatterStore[0], N),
-        m_formatterStore{FormatArg(args)...} {
-    static_assert(sizeof...(args) == N, "Number of args must be N");
-  }
-
- private:
-  FormatArg m_formatterStore[N];
-};
-
-// Special 0-arg version - MSVC says zero-sized C array in struct is nonstandard
-template <>
-class FormatListN<0> : public FormatList {
- public:
-  FormatListN() : FormatList(0, 0) {}
-};
-
-}  // namespace detail
-
-//------------------------------------------------------------------------------
-// Primary API functions
-
-/// Make type-agnostic format list from list of template arguments.
-///
-/// The exact return type of this function is an implementation detail and
-/// shouldn't be relied upon.  Instead it should be stored as a FormatListRef:
-///
-///   FormatListRef formatList = makeFormatList( /*...*/ );
-template <typename... Args>
-detail::FormatListN<sizeof...(Args)> makeFormatList(const Args &... args) {
-  return detail::FormatListN<sizeof...(args)>(args...);
-}
-
-/// Format list of arguments to the stream according to the given format string.
-///
-/// The name vformat() is chosen for the semantic similarity to vprintf(): the
-/// list of format arguments is held in a single function argument.
-inline void vformat(std::ostream &out, const char *fmt, FormatListRef list) {
-  detail::formatImpl(out, fmt, list.m_formatters, list.m_N);
-}
-
-/// Format list of arguments to the stream according to given format string.
-template <typename... Args>
-void format(std::ostream &out, const char *fmt, const Args &... args) {
-  vformat(out, fmt, makeFormatList(args...));
-}
-
-/// Format list of arguments according to the given format string and return
-/// the result as a string.
-template <typename... Args>
-std::string format(const char *fmt, const Args &... args) {
-  std::ostringstream oss;
-  format(oss, fmt, args...);
-  return oss.str();
-}
-
-/// Format list of arguments to std::cout, according to the given format string
-template <typename... Args>
-void printf(const char *fmt, const Args &... args) {
-  format(std::cout, fmt, args...);
-}
-
-template <typename... Args>
-void printfln(const char *fmt, const Args &... args) {
-  format(std::cout, fmt, args...);
-  std::cout << '\n';
-}
-
-}  // namespace tinyformat
-}  // namespace string
-}  // namespace paddle
diff --git a/paddle/fluid/string/to_string.h b/paddle/fluid/string/to_string.h
deleted file mode 100644
index 8caf149420393ec81131389d7787bee925f4a27d..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/to_string.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <sstream>
-#include <string>
-#include <typeindex>
-
-namespace paddle {
-namespace string {
-inline std::ostream& operator<<(std::ostream& s, const std::type_index& t) {
-  s << t.name();
-  return s;
-}
-
-template <typename T>
-inline std::string to_string(T v) {
-  std::ostringstream sout;
-  sout << v;
-  return sout.str();
-}
-
-template <>
-inline std::string to_string(std::type_index t) {
-  return t.name();
-}
-
-// Faster std::string/const char* type
-template <>
-inline std::string to_string(std::string v) {
-  return v;
-}
-
-template <>
-inline std::string to_string(const char* v) {
-  return std::string(v);
-}
-
-}  // namespace string
-}  // namespace paddle
diff --git a/paddle/fluid/string/to_string_test.cc b/paddle/fluid/string/to_string_test.cc
deleted file mode 100644
index 1d9c0e5e0c2b6e7f44c1622d2828b21b0a4380ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/to_string_test.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/string/to_string.h"
-#include <gtest/gtest.h>
-
-constexpr char kOutputString[] = "User Defined Output";
-class UserDefinedClass {
- public:
-};
-
-std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
-  s << kOutputString;
-  return s;
-}
-
-TEST(to_string, normal) {
-  using paddle::string::to_string;
-  ASSERT_EQ("10", to_string(10));
-  ASSERT_EQ("abc", to_string("abc"));
-  ASSERT_EQ("1.2", to_string(1.2));
-}
-
-TEST(to_string, user_defined) {
-  UserDefinedClass instance;
-  ASSERT_EQ(kOutputString, paddle::string::to_string(instance));
-}
diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt
deleted file mode 100644
index 7b0bc669b0731abf8f21b58b1bc748acbf994133..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/CMakeLists.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-function(train_test TARGET_NAME)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs ARGS)
-    cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    set(arg_list "")
-    if(train_test_ARGS)
-        foreach(arg ${train_test_ARGS})
-            list(APPEND arg_list "_${arg}")
-        endforeach()
-    else()
-        list(APPEND arg_list "_")
-    endif()
-    foreach(arg ${arg_list})
-        string(REGEX REPLACE "^_$" "" arg "${arg}")
-        cc_test(test_train_${TARGET_NAME}${arg}
-                SRCS test_train_${TARGET_NAME}.cc
-                DEPS paddle_fluid_origin
-                ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
-        set_tests_properties(test_train_${TARGET_NAME}${arg}
-                PROPERTIES DEPENDS test_${TARGET_NAME})
-        set_tests_properties(test_train_${TARGET_NAME}${arg}
-                PROPERTIES LABELS "RUN_TYPE=DIST")
-    endforeach()
-endfunction(train_test)
-
-
-if(WITH_TESTING)
-  train_test(recognize_digits ARGS mlp conv)
-endif()
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
deleted file mode 100644
index 289dd9869bd39911fba571cbe9bdb0b7070249d2..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ /dev/null
@@ -1,74 +0,0 @@
-cmake_minimum_required(VERSION 3.0)
-
-project(cpp_train_demo CXX C)
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-if(NOT DEFINED PADDLE_LIB)
-  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir")
-endif()
-
-option(WITH_MKLDNN     "Compile PaddlePaddle with MKLDNN"                                   OFF)
-option(WITH_MKL        "Compile PaddlePaddle with MKL support, default use openblas."       OFF)
-
-include_directories("${PADDLE_LIB}")
-include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
-include_directories("${PADDLE_LIB}/third_party/install/glog/include")
-include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
-include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
-include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
-
-include_directories("${PADDLE_LIB}/third_party/boost")
-include_directories("${PADDLE_LIB}/third_party/eigen3")
-
-link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
-link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
-link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
-link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
-
-add_executable(demo_trainer demo_trainer.cc)
-
-if(WITH_MKLDNN)
-  include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include")
-  if(WIN32)
-    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib)
-  else(WIN32)
-    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
-  endif(WIN32)
-endif(WITH_MKLDNN)
-
-if(WITH_MKL)
-  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  if(WIN32)
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib)
-  else(WIN32)
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
-  endif(WIN32)
-else()
-  if(APPLE)
-    set(MATH_LIB cblas)
-  elseif(WIN32)
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib)
-  else()
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
-  endif(APPLE)
-endif()
-
-if(APPLE)
-  set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
-else(APPLE)
-  set(ARCHIVE_START "-Wl,--whole-archive")
-  set(ARCHIVE_END "-Wl,--no-whole-archive")
-  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
-endif(APPLE)
-
-target_link_libraries(demo_trainer
-        ${MACOS_LD_FLAGS}
-        ${ARCHIVE_START}
-        ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so
-        ${ARCHIVE_END}
-        ${MATH_LIB}
-        ${MKLDNN_LIB}
-        glog gflags protobuf z xxhash
-        ${EXTERNAL_LIB})
diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md
deleted file mode 100644
index bd53ab4b0c023b2591d792b504ab496a42d2835d..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/demo/README.md
+++ /dev/null
@@ -1,65 +0,0 @@
-
-### step 1. build paddle lib
-
-```
-
-# WITH_MKL=ON|OFF
-# WITH_MKLDNN=ON|OFF
-
-PADDLE_LIB=/paddle/lib/dir
-cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
-         -DCMAKE_BUILD_TYPE=Release \
-         -DWITH_GPU=OFF \
-         -DWITH_STYLE_CHECK=OFF \
-         -DWITH_MKL=OFF \
-         -DWITH_MKLDNN=OFF
-make -j8
-make -j8 fluid_lib_dist
-```
-
-### step 2. generate program desc
-```
-# please install paddle before run this scripe
-pip install --upgrade paddlepaddle-*.whl
-python demo_network.py
-```
-
-This will generate two program desc files:
-  - startup_program: used to init all parameters
-  - main_program: main logic of the network
-
-### step 3. build demo_trainer and run it.
-
-
-```
-# Make a build dir at the same dir of this README.md document.
-# The demo dir can be put anywhere.
-mkdir build
-cd build
-
-# WITH_MKL=ON|OFF
-# WITH_MKLDNN=ON|OFF
-PADDLE_LIB=/paddle/lib/dir
-
-# PADDLE_LIB is the same with FLUID_INSTALL_DIR when building the lib
-cmake .. -DPADDLE_LIB=$PADDLE_LIB \
-         -DWITH_MKLDNN=OFF \
-         -DWITH_MKL=OFF
-make
-
-# copy startup_program and main_program to this dir
-cp ../startup_program .
-cp ../main_program .
-
-# run demo cpp trainer
-./demo_trainer
-
-```
-
-The output will be:
-```
-step: 0 loss: 1069.02
-step: 1 loss: 1069.02
-step: 2 loss: 1069.02
-....
-```
diff --git a/paddle/fluid/train/demo/demo_network.py b/paddle/fluid/train/demo/demo_network.py
deleted file mode 100644
index 41e98c6a24a750a9300b5c2a6d370303cc0e59c5..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/demo/demo_network.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import paddle.fluid.framework as framework
-
-
-def train_network(with_optimize):
-    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-
-    if with_optimize:
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.00001)
-        sgd_optimizer.minimize(avg_cost)
-    else:
-        fluid.backward.append_backward(avg_cost)
-
-
-def save_program_desc(network_func):
-    startup_program = framework.Program()
-    train_program = framework.Program()
-
-    with framework.program_guard(train_program, startup_program):
-        network_func(with_optimize=False)
-
-    with open("startup_program", "w") as f:
-        f.write(startup_program.desc.serialize_to_string())
-    with open("main_program", "w") as f:
-        f.write(train_program.desc.serialize_to_string())
-
-
-save_program_desc(train_network)
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
deleted file mode 100644
index 1087f5672459506cc7b824127cd822c0df7ba566..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <time.h>
-#include <fstream>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace train {
-
-void ReadBinaryFile(const std::string& filename, std::string* contents) {
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
-  fin.seekg(0, std::ios::end);
-  contents->clear();
-  contents->resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(contents->at(0)), contents->size());
-  fin.close();
-}
-
-std::unique_ptr<paddle::framework::ProgramDesc> Load(
-    paddle::framework::Executor* executor, const std::string& model_filename) {
-  VLOG(3) << "loading model from " << model_filename;
-  std::string program_desc_str;
-  ReadBinaryFile(model_filename, &program_desc_str);
-
-  std::unique_ptr<paddle::framework::ProgramDesc> main_program(
-      new paddle::framework::ProgramDesc(program_desc_str));
-  return main_program;
-}
-
-}  // namespace train
-}  // namespace paddle
-
-int main() {
-  paddle::framework::InitDevices(false);
-
-  const auto cpu_place = paddle::platform::CPUPlace();
-
-  paddle::framework::Executor executor(cpu_place);
-  paddle::framework::Scope scope;
-  auto startup_program = paddle::train::Load(&executor, "startup_program");
-  auto train_program = paddle::train::Load(&executor, "main_program");
-
-  std::string loss_name = "";
-  for (auto op_desc : train_program->Block(0).AllOps()) {
-    if (op_desc->Type() == "mean") {
-      loss_name = op_desc->Output("Out")[0];
-      break;
-    }
-  }
-
-  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
-
-  // init all parameters
-  executor.Run(*startup_program, &scope, 0);
-
-  // prepare data
-  auto x_var = scope.Var("x");
-  auto x_tensor = x_var->GetMutable<paddle::framework::LoDTensor>();
-  x_tensor->Resize({2, 13});
-
-  auto x_data = x_tensor->mutable_data<float>(cpu_place);
-  for (int i = 0; i < 2 * 13; ++i) {
-    x_data[i] = static_cast<float>(i);
-  }
-
-  auto y_var = scope.Var("y");
-  auto y_tensor = y_var->GetMutable<paddle::framework::LoDTensor>();
-  y_tensor->Resize({2, 1});
-  auto y_data = y_tensor->mutable_data<float>(cpu_place);
-  for (int i = 0; i < 2 * 1; ++i) {
-    y_data[i] = static_cast<float>(i);
-  }
-
-  auto loss_var = scope.Var(loss_name);
-
-  paddle::platform::ProfilerState pf_state;
-  pf_state = paddle::platform::ProfilerState::kCPU;
-  paddle::platform::EnableProfiler(pf_state);
-  clock_t t1 = clock();
-
-  for (int i = 0; i < 10; ++i) {
-    executor.Run(*train_program, &scope, 0, false, true);
-    std::cout << "step: " << i << " loss: "
-              << loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
-              << std::endl;
-  }
-
-  clock_t t2 = clock();
-  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
-                                    "run_paddle_op_profiler");
-  std::cout << "run_time = " << t2 - t1 << std::endl;
-  return 0;
-}
diff --git a/paddle/fluid/train/imdb_demo/CMakeLists.txt b/paddle/fluid/train/imdb_demo/CMakeLists.txt
deleted file mode 100644
index d12069169eb713b101a93aa60f5e14d42395fe77..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/CMakeLists.txt
+++ /dev/null
@@ -1,74 +0,0 @@
-cmake_minimum_required(VERSION 3.0)
-
-project(cpp_imdb_train_demo CXX C)
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-if(NOT DEFINED PADDLE_LIB)
-  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir")
-endif()
-
-option(WITH_MKLDNN     "Compile PaddlePaddle with MKLDNN"                                   OFF)
-option(WITH_MKL        "Compile PaddlePaddle with MKL support, default use openblas."       OFF)
-
-include_directories("${PADDLE_LIB}")
-include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
-include_directories("${PADDLE_LIB}/third_party/install/glog/include")
-include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
-include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
-include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
-
-include_directories("${PADDLE_LIB}/third_party/boost")
-include_directories("${PADDLE_LIB}/third_party/eigen3")
-
-link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
-link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
-link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
-link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
-
-add_executable(demo_trainer save_model.cc demo_trainer.cc)
-
-if(WITH_MKLDNN)
-  include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include")
-  if(WIN32)
-    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib)
-  else(WIN32)
-    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
-  endif(WIN32)
-endif(WITH_MKLDNN)
-
-if(WITH_MKL)
-  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  if(WIN32)
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib)
-  else(WIN32)
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
-  endif(WIN32)
-else()
-  if(APPLE)
-    set(MATH_LIB cblas)
-  elseif(WIN32)
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib)
-  else()
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
-  endif(APPLE)
-endif()
-
-if(APPLE)
-  set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
-else(APPLE)
-  set(ARCHIVE_START "-Wl,--whole-archive")
-  set(ARCHIVE_END "-Wl,--no-whole-archive")
-  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
-endif(APPLE)
-
-target_link_libraries(demo_trainer
-	${MACOS_LD_FLAGS}
-	${ARCHIVE_START}
-	${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so
-	${ARCHIVE_END}
-	${MATH_LIB}
-	${MKLDNN_LIB}
-	glog gflags protobuf z xxhash
-	${EXTERNAL_LIB})
diff --git a/paddle/fluid/train/imdb_demo/README.md b/paddle/fluid/train/imdb_demo/README.md
deleted file mode 100644
index 3c75a4744aba54e3dd56e13b5b4a2fd6646ac45c..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/README.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# Train with C++ inference API
-
-What is C++ inference API and how to install it:
-
-see: [PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线](https://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/deploy/inference/index_cn.html)
-
-## IMDB task
-
-see: [IMDB Dataset of 50K Movie Reviews | Kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)
-
-## Quick Start
-
-### prepare data
-
-```shell
-    wget https://fleet.bj.bcebos.com/text_classification_data.tar.gz
-    tar -zxvf text_classification_data.tar.gz
-```
-### build
-
-```shell
-    mkdir build
-    cd build
-    rm -rf *
-    PADDLE_LIB=path/to/your/fluid_inference_install_dir/
-    cmake .. -DPADDLE_LIB=$PADDLE_LIB  -DWITH_MKLDNN=OFF -DWITH_MKL=OFF
-    make
-```
-
-### generate program description
-
-```
-    python generate_program.py bow
-```
-
-### run
-
-```shell
-   # After editing train.cfg
-   sh run.sh
-```
-
-## results
-
-Below are training logs on BOW model, the losses go down as expected.
-
-```
-WARNING: Logging before InitGoogleLogging() is written to STDERR
-I0731 22:39:06.974232 10965 demo_trainer.cc:130] Start training...
-I0731 22:39:57.395229 10965 demo_trainer.cc:164] epoch: 0; average loss: 0.405706
-I0731 22:40:50.262344 10965 demo_trainer.cc:164] epoch: 1; average loss: 0.110746
-I0731 22:41:49.731079 10965 demo_trainer.cc:164] epoch: 2; average loss: 0.0475805
-I0731 22:43:31.398355 10965 demo_trainer.cc:164] epoch: 3; average loss: 0.0233249
-I0731 22:44:58.744391 10965 demo_trainer.cc:164] epoch: 4; average loss: 0.00701507
-I0731 22:46:30.451735 10965 demo_trainer.cc:164] epoch: 5; average loss: 0.00258187
-I0731 22:48:14.396687 10965 demo_trainer.cc:164] epoch: 6; average loss: 0.00113157
-I0731 22:49:56.242744 10965 demo_trainer.cc:164] epoch: 7; average loss: 0.000698234
-I0731 22:51:11.585919 10965 demo_trainer.cc:164] epoch: 8; average loss: 0.000510136
-I0731 22:52:50.573947 10965 demo_trainer.cc:164] epoch: 9; average loss: 0.000400932
-I0731 22:54:02.686152 10965 demo_trainer.cc:164] epoch: 10; average loss: 0.000329259
-I0731 22:54:55.233342 10965 demo_trainer.cc:164] epoch: 11; average loss: 0.000278644
-I0731 22:56:15.496256 10965 demo_trainer.cc:164] epoch: 12; average loss: 0.000241055
-I0731 22:57:45.015926 10965 demo_trainer.cc:164] epoch: 13; average loss: 0.000212085
-I0731 22:59:18.419997 10965 demo_trainer.cc:164] epoch: 14; average loss: 0.000189109
-I0731 23:00:15.409077 10965 demo_trainer.cc:164] epoch: 15; average loss: 0.000170465
-I0731 23:01:38.795770 10965 demo_trainer.cc:164] epoch: 16; average loss: 0.000155051
-I0731 23:02:57.289487 10965 demo_trainer.cc:164] epoch: 17; average loss: 0.000142106
-I0731 23:03:48.032507 10965 demo_trainer.cc:164] epoch: 18; average loss: 0.000131089
-I0731 23:04:51.195230 10965 demo_trainer.cc:164] epoch: 19; average loss: 0.000121605
-I0731 23:06:27.008040 10965 demo_trainer.cc:164] epoch: 20; average loss: 0.00011336
-I0731 23:07:56.568284 10965 demo_trainer.cc:164] epoch: 21; average loss: 0.000106129
-I0731 23:09:23.948290 10965 demo_trainer.cc:164] epoch: 22; average loss: 9.97393e-05
-I0731 23:10:56.062590 10965 demo_trainer.cc:164] epoch: 23; average loss: 9.40532e-05
-I0731 23:12:23.014047 10965 demo_trainer.cc:164] epoch: 24; average loss: 8.89622e-05
-I0731 23:13:21.439818 10965 demo_trainer.cc:164] epoch: 25; average loss: 8.43784e-05
-I0731 23:14:56.171597 10965 demo_trainer.cc:164] epoch: 26; average loss: 8.02322e-05
-I0731 23:16:01.513542 10965 demo_trainer.cc:164] epoch: 27; average loss: 7.64629e-05
-I0731 23:17:18.709139 10965 demo_trainer.cc:164] epoch: 28; average loss: 7.30239e-05
-I0731 23:18:41.421555 10965 demo_trainer.cc:164] epoch: 29; average loss: 6.98716e-05
-```
-
-I trained a Bow model and a CNN model on IMDB dataset using the trainer. At the same time, I also trained the same models using traditional Python training methods. 
-Results show that the two methods achieve almost the same dev accuracy:
-
-CNN:
- 
-<img src="https://user-images.githubusercontent.com/23031310/62356234-32217300-b543-11e9-89fd-a07614904a08.png" width="300">
-
-BOW:
-
-<img src="https://user-images.githubusercontent.com/23031310/62356253-39488100-b543-11e9-9fa2-a399fc1119d6.png" width="300">
-
-I also recorded the training speed of the C++ Trainer and the python training methods, C++ trainer is quicker on CNN model: 
-
-<img src="https://user-images.githubusercontent.com/23031310/62356444-af4ce800-b543-11e9-88c8-f3bde1321ea1.png" width="300">
-
-#TODO (mapingshuo): find the reason why C++ trainer is quicker on CNN model than python method.
diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc
deleted file mode 100644
index d45edd563f03d7a1b156d063d5e7296290d0eaba..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/demo_trainer.cc
+++ /dev/null
@@ -1,184 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <time.h>
-#include <fstream>
-
-#include "include/save_model.h"
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include "paddle/fluid/framework/dataset_factory.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-
-#include "gflags/gflags.h"
-
-DEFINE_string(filelist, "train_filelist.txt", "filelist for fluid dataset");
-DEFINE_string(data_proto_desc, "data.proto", "data feed protobuf description");
-DEFINE_string(startup_program_file, "startup_program",
-              "startup program description");
-DEFINE_string(main_program_file, "", "main program description");
-DEFINE_string(loss_name, "mean_0.tmp_0",
-              "loss tensor name in the main program");
-DEFINE_string(save_dir, "cnn_model", "directory to save trained models");
-DEFINE_int32(epoch_num, 30, "number of epochs to run when training");
-
-namespace paddle {
-namespace train {
-
-void ReadBinaryFile(const std::string& filename, std::string* contents) {
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
-  fin.seekg(0, std::ios::end);
-  contents->clear();
-  contents->resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(contents->at(0)), contents->size());
-  fin.close();
-}
-
-std::unique_ptr<paddle::framework::ProgramDesc> LoadProgramDesc(
-    const std::string& model_filename) {
-  VLOG(3) << "loading model from " << model_filename;
-  std::string program_desc_str;
-  ReadBinaryFile(model_filename, &program_desc_str);
-  std::unique_ptr<paddle::framework::ProgramDesc> main_program(
-      new paddle::framework::ProgramDesc(program_desc_str));
-  return main_program;
-}
-
-bool IsPersistable(const paddle::framework::VarDesc* var) {
-  if (var->Persistable() &&
-      var->GetType() != paddle::framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != paddle::framework::proto::VarType::FETCH_LIST &&
-      var->GetType() != paddle::framework::proto::VarType::RAW) {
-    return true;
-  }
-  return false;
-}
-
-}  // namespace train
-}  // namespace paddle
-
-int main(int argc, char* argv[]) {
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  std::cerr << "filelist: " << FLAGS_filelist << std::endl;
-  std::cerr << "data_proto_desc: " << FLAGS_data_proto_desc << std::endl;
-  std::cerr << "startup_program_file: " << FLAGS_startup_program_file
-            << std::endl;
-  std::cerr << "main_program_file: " << FLAGS_main_program_file << std::endl;
-  std::cerr << "loss_name: " << FLAGS_loss_name << std::endl;
-  std::cerr << "save_dir: " << FLAGS_save_dir << std::endl;
-  std::cerr << "epoch_num: " << FLAGS_epoch_num << std::endl;
-
-  std::string filelist = std::string(FLAGS_filelist);
-  std::vector<std::string> file_vec;
-  std::ifstream fin(filelist);
-  if (fin) {
-    std::string filename;
-    while (fin >> filename) {
-      file_vec.push_back(filename);
-    }
-  }
-  PADDLE_ENFORCE_GE(file_vec.size(), 1, "At least one file to train");
-  paddle::framework::InitDevices(false);
-  const auto cpu_place = paddle::platform::CPUPlace();
-  paddle::framework::Executor executor(cpu_place);
-  paddle::framework::Scope scope;
-  auto startup_program =
-      paddle::train::LoadProgramDesc(std::string(FLAGS_startup_program_file));
-  auto main_program =
-      paddle::train::LoadProgramDesc(std::string(FLAGS_main_program_file));
-
-  executor.Run(*startup_program, &scope, 0);
-
-  std::string data_feed_desc_str;
-  paddle::train::ReadBinaryFile(std::string(FLAGS_data_proto_desc),
-                                &data_feed_desc_str);
-  VLOG(3) << "load data feed desc done.";
-  std::unique_ptr<paddle::framework::Dataset> dataset_ptr;
-  dataset_ptr =
-      paddle::framework::DatasetFactory::CreateDataset("MultiSlotDataset");
-  VLOG(3) << "initialize dataset ptr done";
-
-  // find all params
-  std::vector<std::string> param_names;
-  const paddle::framework::BlockDesc& global_block = main_program->Block(0);
-  for (auto* var : global_block.AllVars()) {
-    if (paddle::train::IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
-      param_names.push_back(var->Name());
-    }
-  }
-
-  int epoch_num = FLAGS_epoch_num;
-  std::string loss_name = FLAGS_loss_name;
-  auto loss_var = scope.Var(loss_name);
-
-  LOG(INFO) << "Start training...";
-
-  for (int epoch = 0; epoch < epoch_num; ++epoch) {
-    VLOG(3) << "Epoch:" << epoch;
-    // get reader
-    dataset_ptr->SetFileList(file_vec);
-    VLOG(3) << "set file list done";
-    dataset_ptr->SetThreadNum(1);
-    VLOG(3) << "set thread num done";
-    dataset_ptr->SetDataFeedDesc(data_feed_desc_str);
-    VLOG(3) << "set data feed desc done";
-    dataset_ptr->CreateReaders();
-    const std::vector<paddle::framework::DataFeed*> readers =
-        dataset_ptr->GetReaders();
-    PADDLE_ENFORCE_EQ(readers.size(), 1,
-                      "readers num should be equal to thread num");
-    readers[0]->SetPlace(paddle::platform::CPUPlace());
-    const std::vector<std::string>& input_feed_names =
-        readers[0]->GetUseSlotAlias();
-    for (auto name : input_feed_names) {
-      readers[0]->AddFeedVar(scope.Var(name), name);
-    }
-    VLOG(3) << "get reader done";
-    readers[0]->Start();
-    VLOG(3) << "start a reader";
-    VLOG(3) << "readers size: " << readers.size();
-
-    int step = 0;
-    std::vector<float> loss_vec;
-
-    while (readers[0]->Next() > 0) {
-      executor.Run(*main_program, &scope, 0, false, true);
-      loss_vec.push_back(
-          loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]);
-    }
-    float average_loss =
-        accumulate(loss_vec.begin(), loss_vec.end(), 0.0) / loss_vec.size();
-
-    LOG(INFO) << "epoch: " << epoch << "; average loss: " << average_loss;
-    dataset_ptr->DestroyReaders();
-
-    // save model
-    std::string save_dir_root = FLAGS_save_dir;
-    std::string save_dir =
-        save_dir_root + "/epoch" + std::to_string(epoch) + ".model";
-    paddle::framework::save_model(main_program, &scope, param_names, save_dir,
-                                  false);
-  }
-}
diff --git a/paddle/fluid/train/imdb_demo/generate_program.py b/paddle/fluid/train/imdb_demo/generate_program.py
deleted file mode 100644
index a12282d94ddf9ed3e0824c9af709bd1f5b82556f..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/generate_program.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-import paddle
-import logging
-import paddle.fluid as fluid
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("fluid")
-logger.setLevel(logging.INFO)
-
-
-def load_vocab(filename):
-    vocab = {}
-    with open(filename) as f:
-        wid = 0
-        for line in f:
-            vocab[line.strip()] = wid
-            wid += 1
-    vocab["<unk>"] = len(vocab)
-    return vocab
-
-
-if __name__ == "__main__":
-    vocab = load_vocab('imdb.vocab')
-    dict_dim = len(vocab)
-    model_name = sys.argv[1]
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-    dataset = fluid.DatasetFactory().create_dataset()
-    dataset.set_batch_size(128)
-    dataset.set_pipe_command("python imdb_reader.py")
-
-    dataset.set_use_var([data, label])
-    desc = dataset.proto_desc
-
-    with open("data.proto", "w") as f:
-        f.write(dataset.desc())
-
-    from nets import *
-    if model_name == 'cnn':
-        logger.info("Generate program description of CNN net")
-        avg_cost, acc, prediction = cnn_net(data, label, dict_dim)
-    elif model_name == 'bow':
-        logger.info("Generate program description of BOW net")
-        avg_cost, acc, prediction = bow_net(data, label, dict_dim)
-    else:
-        logger.error("no such model: " + model_name)
-        exit(0)
-    # optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-    optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)
-    optimizer.minimize(avg_cost)
-
-    with open(model_name + "_main_program", "wb") as f:
-        f.write(fluid.default_main_program().desc.serialize_to_string())
-
-    with open(model_name + "_startup_program", "wb") as f:
-        f.write(fluid.default_startup_program().desc.serialize_to_string())
diff --git a/paddle/fluid/train/imdb_demo/imdb_reader.py b/paddle/fluid/train/imdb_demo/imdb_reader.py
deleted file mode 100644
index f197c95ec32171fb075bb9deeacd6fc6ae3b16e8..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/imdb_reader.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import os
-import paddle
-import re
-import paddle.fluid.incubate.data_generator as dg
-
-
-class IMDBDataset(dg.MultiSlotDataGenerator):
-    def load_resource(self, dictfile):
-        self._vocab = {}
-        wid = 0
-        with open(dictfile) as f:
-            for line in f:
-                self._vocab[line.strip()] = wid
-                wid += 1
-        self._unk_id = len(self._vocab)
-        self._pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))')
-        self.return_value = ("words", [1, 2, 3, 4, 5, 6]), ("label", [0])
-
-    def get_words_and_label(self, line):
-        send = '|'.join(line.split('|')[:-1]).lower().replace("<br />",
-                                                              " ").strip()
-        label = [int(line.split('|')[-1])]
-
-        words = [x for x in self._pattern.split(send) if x and x != " "]
-        feas = [
-            self._vocab[x] if x in self._vocab else self._unk_id for x in words
-        ]
-        return feas, label
-
-    def infer_reader(self, infer_filelist, batch, buf_size):
-        def local_iter():
-            for fname in infer_filelist:
-                with open(fname, "r") as fin:
-                    for line in fin:
-                        feas, label = self.get_words_and_label(line)
-                        yield feas, label
-
-        import paddle
-        batch_iter = paddle.batch(
-            paddle.reader.shuffle(
-                local_iter, buf_size=buf_size),
-            batch_size=batch)
-        return batch_iter
-
-    def generate_sample(self, line):
-        def memory_iter():
-            for i in range(1000):
-                yield self.return_value
-
-        def data_iter():
-            feas, label = self.get_words_and_label(line)
-            yield ("words", feas), ("label", label)
-
-        return data_iter
-
-
-if __name__ == "__main__":
-    imdb = IMDBDataset()
-    imdb.load_resource("imdb.vocab")
-    imdb.run_from_stdin()
diff --git a/paddle/fluid/train/imdb_demo/include/save_model.h b/paddle/fluid/train/imdb_demo/include/save_model.h
deleted file mode 100644
index 452052866855d294676a0792e06df7a4b6ecd76f..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/include/save_model.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <fcntl.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/message.h>
-#include <google/protobuf/text_format.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <vector>
-#include "gflags/gflags.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/prune.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-void save_model(const std::unique_ptr<ProgramDesc>& main_program, Scope* scope,
-                const std::vector<std::string>& param_names,
-                const std::string& model_name, bool save_combine);
-}
-}
diff --git a/paddle/fluid/train/imdb_demo/nets.py b/paddle/fluid/train/imdb_demo/nets.py
deleted file mode 100644
index a25e67e3b5d56d1e672915cfade1a24ff6546eeb..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/nets.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import time
-import numpy as np
-
-import paddle
-import paddle.fluid as fluid
-
-
-def bow_net(data,
-            label,
-            dict_dim,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2):
-    """
-    bow net
-    """
-    emb = fluid.layers.embedding(
-        input=data, size=[dict_dim, emb_dim], is_sparse=True)
-    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
-    bow_tanh = fluid.layers.tanh(bow)
-    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
-    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
-    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc, prediction
-
-
-def cnn_net(data,
-            label,
-            dict_dim,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2,
-            win_size=3):
-    """
-    conv net
-    """
-    emb = fluid.layers.embedding(
-        input=data, size=[dict_dim, emb_dim], is_sparse=True)
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=win_size,
-        act="tanh",
-        pool_type="max")
-
-    fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2)
-
-    prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc, prediction
-
-
-def lstm_net(data,
-             label,
-             dict_dim,
-             emb_dim=128,
-             hid_dim=128,
-             hid_dim2=96,
-             class_dim=2,
-             emb_lr=30.0):
-    """
-    lstm net
-    """
-    emb = fluid.layers.embedding(
-        input=data,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(learning_rate=emb_lr),
-        is_sparse=True)
-
-    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
-
-    lstm_h, c = fluid.layers.dynamic_lstm(
-        input=fc0, size=hid_dim * 4, is_reverse=False)
-
-    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
-    lstm_max_tanh = fluid.layers.tanh(lstm_max)
-
-    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
-
-    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
-
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc, prediction
-
-
-def gru_net(data,
-            label,
-            dict_dim,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2,
-            emb_lr=400.0):
-    """
-    gru net
-    """
-    emb = fluid.layers.embedding(
-        input=data,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
-
-    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
-    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
-    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
-    gru_max_tanh = fluid.layers.tanh(gru_max)
-    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
-    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
-
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc, prediction
diff --git a/paddle/fluid/train/imdb_demo/run.sh b/paddle/fluid/train/imdb_demo/run.sh
deleted file mode 100644
index f71b4bac602a9e6d5c7bea03f3c56043b13547d3..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/run.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-
-set -exu
-build/demo_trainer --flagfile="train.cfg"
diff --git a/paddle/fluid/train/imdb_demo/save_model.cc b/paddle/fluid/train/imdb_demo/save_model.cc
deleted file mode 100644
index 49da550dbb7f52912406663df6cf11e21e193bd9..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/save_model.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "include/save_model.h"
-#include <fcntl.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/message.h>
-#include <google/protobuf/text_format.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <fstream>
-#include <iostream>
-#include "gflags/gflags.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/prune.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/platform/place.h"
-
-using std::unique_ptr;
-
-namespace paddle {
-namespace framework {
-// Saves the variables of block 0 of `main_program` whose names appear in
-// `param_names`. The save ops are run against `scope` on a CPU place.
-// With save_combine == false each variable is written by its own "save" op
-// to `model_name`/<var name>; otherwise the names are sorted and written by
-// a single "save_combine" op to the file `model_name`.
-void save_model(const unique_ptr<ProgramDesc>& main_program, Scope* scope,
-                const std::vector<std::string>& param_names,
-                const std::string& model_name, bool save_combine) {
-  auto place = platform::CPUPlace();
-  const BlockDesc& global_block = main_program->Block(0);
-  std::vector<std::string> paralist;
-  for (auto* var : global_block.AllVars()) {
-    const std::string var_name = var->Name();
-    if (std::find(param_names.begin(), param_names.end(), var_name) ==
-        param_names.end()) {
-      continue;  // not a model parameter
-    }
-    if (!save_combine) {
-      // Stream the name directly; VLOG does not interpret "%s".
-      VLOG(3) << "model var name: " << var_name;
-      paddle::framework::AttributeMap attrs;
-      attrs.insert({"file_path", model_name + "/" + var_name});
-      auto save_op = paddle::framework::OpRegistry::CreateOp(
-          "save", {{"X", {var_name}}}, {}, attrs);
-      save_op->Run(*scope, place);
-    } else {
-      paralist.push_back(var_name);
-    }
-  }
-  if (save_combine) {
-    // Sort names so the combined file has a deterministic variable order.
-    std::sort(paralist.begin(), paralist.end());
-    paddle::framework::AttributeMap attrs;
-    attrs.insert({"file_path", model_name});
-    auto save_op = paddle::framework::OpRegistry::CreateOp(
-        "save_combine", {{"X", paralist}}, {}, attrs);
-    save_op->Run(*scope, place);
-  }
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/train/imdb_demo/train.cfg b/paddle/fluid/train/imdb_demo/train.cfg
deleted file mode 100644
index 1821498890be8c17ff749bee5a9a0be3f2138810..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/train.cfg
+++ /dev/null
@@ -1,7 +0,0 @@
---filelist=train_filelist.txt
---data_proto_desc=data.proto
---loss_name=mean_0.tmp_0
---startup_program_file=bow_startup_program
---main_program_file=bow_main_program
---save_dir=bow_model
---epoch_num=30
diff --git a/paddle/fluid/train/imdb_demo/train_filelist.txt b/paddle/fluid/train/imdb_demo/train_filelist.txt
deleted file mode 100644
index dcf088af4176196a503097b7d4e16960bbe5ae10..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/train_filelist.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-train_data/part-0
-train_data/part-1
-train_data/part-10
-train_data/part-11
-train_data/part-2
-train_data/part-3
-train_data/part-4
-train_data/part-5
-train_data/part-6
-train_data/part-7
-train_data/part-8
-train_data/part-9
diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc
deleted file mode 100644
index bd2a439f6ab5273b29010cf3599460ea8bdd68d4..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <time.h>
-#include <fstream>
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-
-DEFINE_string(dirname, "", "Directory of the train model.");
-
-namespace paddle {
-// Runs 100 steps of a loaded train program; expects the loss to decrease.
-void Train() {
-  CHECK(!FLAGS_dirname.empty());
-  framework::InitDevices(false);
-  const auto cpu_place = platform::CPUPlace();
-  framework::Executor executor(cpu_place);
-  framework::Scope scope;
-  // FLAGS_dirname is used as a raw prefix; no '/' is inserted below.
-  auto train_program = inference::Load(
-      &executor, &scope, FLAGS_dirname + "__model_combined__.main_program",
-      FLAGS_dirname + "__params_combined__");
-  // The output of the first "mean" op in block 0 is taken as the loss.
-  std::string loss_name = "";
-  for (auto op_desc : train_program->Block(0).AllOps()) {
-    if (op_desc->Type() == "mean") {
-      loss_name = op_desc->Output("Out")[0];
-      break;
-    }
-  }
-
-  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
-
-  // prepare data: "img" is a 64x1x28x28 float tensor filled with 1.0
-  auto x_var = scope.Var("img");
-  auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
-  x_tensor->Resize({64, 1, 28, 28});
-
-  auto x_data = x_tensor->mutable_data<float>(cpu_place);
-  for (int i = 0; i < 64 * 28 * 28; ++i) {
-    x_data[i] = 1.0;
-  }
-  // "label" is a 64x1 int64 tensor filled with 1
-  auto y_var = scope.Var("label");
-  auto y_tensor = y_var->GetMutable<framework::LoDTensor>();
-  y_tensor->Resize({64, 1});
-  auto y_data = y_tensor->mutable_data<int64_t>(cpu_place);
-  for (int i = 0; i < 64 * 1; ++i) {
-    y_data[i] = static_cast<int64_t>(1);
-  }
-  // Keep the loss read after run 0 and after run 99 for the final check.
-  auto loss_var = scope.Var(loss_name);
-  float first_loss = 0.0;
-  float last_loss = 0.0;
-  for (int i = 0; i < 100; ++i) {
-    executor.Run(*train_program, &scope, 0, false, true,
-                 {loss_name, "img", "label"});
-    if (i == 0) {
-      first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
-    } else if (i == 99) {
-      last_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
-    }
-  }
-  EXPECT_LT(last_loss, first_loss);
-}
-
-TEST(train, recognize_digits) { Train(); }
-
-}  // namespace paddle
diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt
deleted file mode 100644
index 68cb5a19f99ab5148b04d193eb2356588bdc5a59..0000000000000000000000000000000000000000
--- a/paddle/scripts/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-configure_file(submit_local.sh.in
-    paddle
-    @ONLY)
-
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin
-        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md
deleted file mode 100644
index 39db5a601d3d46c106a574870f02434bd4bd5cd1..0000000000000000000000000000000000000000
--- a/paddle/scripts/README.md
+++ /dev/null
@@ -1,191 +0,0 @@
-# Building PaddlePaddle
-
-## Goals
-
-We want to make the building procedures:
-
-1. Static, can reproduce easily.
-1. Generate python `whl` packages that can be widely used across many distributions.
-1. Build different binaries per release to satisfy different environments:
-    - Binaries for different CUDA and CUDNN versions, like CUDA 7.5, 8.0, 9.0
-    - Binaries containing only capi
-    - Binaries for python with wide unicode support or not.
-1. Build docker images with PaddlePaddle pre-installed, so that we can run
-PaddlePaddle applications directly in docker or on Kubernetes clusters.
-
-To achieve this, we maintain a Docker Hub repo: https://hub.docker.com/r/paddlepaddle/paddle
-which provides pre-built environment images to build PaddlePaddle and generate corresponding `whl`
-binaries. (**We strongly recommend building PaddlePaddle in our pre-specified Docker environment.**)
-
-## Development Workflow
-
-Here we describe how the workflow goes on.  We start from considering our daily development environment.
-
-Developers work on a computer, which is usually a laptop or desktop:
-
-<img src="doc/paddle-development-environment.png" width=500 />
-
-or, they might rely on a more sophisticated box (like with GPUs):
-
-<img src="doc/paddle-development-environment-gpu.png" width=500 />
-
-A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
-
-## Build With Docker
-
-### Build Environments
-
-The latest pre-built build environment images are:
-
-| Image | Tag |
-| ----- | --- |
-| paddlepaddle/paddle | latest-dev |
-
-### Start Build
-
-```bash
-git clone https://github.com/PaddlePaddle/Paddle.git
-cd Paddle
-./paddle/scripts/paddle_docker_build.sh build
-```
-
-After the build finishes, you can get output `whl` package under
-`build/python/dist`.
-
-This command will download the most recent dev image from Docker Hub, start a container in the background and then run the build script `/paddle/paddle/scripts/paddle_build.sh build` in the container.
-The container mounts the source directory on the host into `/paddle`.
-When it writes to `/paddle/build` in the container, it actually writes to `$PWD/build` on the host.
-
-### Build Options
-
-Users can specify the following Docker build arguments with either "ON" or "OFF" value:
-
-| Option | Default | Description |
-| ------ | -------- | ----------- |
-| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
-| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
-| `WITH_TESTING` | OFF | Build unit tests binaries. |
-| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
-| `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
-| `WITH_STYLE_CHECK` | ON | Check the code style when building. |
-| `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
-| `RUN_TEST` | OFF | Run unit tests immediately after the build. |
-| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
-
-## Docker Images
-
-You can get the latest PaddlePaddle docker images by
-`docker pull paddlepaddle/paddle:<version>` or build one by yourself.
-
-### Official Docker Releases
-
-Official docker images are available
-[here](https://hub.docker.com/r/paddlepaddle/paddle/tags/).
-You can choose either `latest` or an image with a release tag like `0.10.0`.
-Currently available tags are:
-
-|   Tag  | Description |
-| ------ | --------------------- |
-| latest | latest CPU only image |
-| latest-gpu | latest binary with GPU support |
-| 0.10.0 | release 0.10.0 CPU only binary image |
-| 0.10.0-gpu | release 0.10.0 with GPU support |
-
-### Build Your Own Image
-
-Building PaddlePaddle docker images is quite simple since PaddlePaddle can
-be installed by just running `pip install`. A sample `Dockerfile` is:
-
-```dockerfile
-FROM nvidia/cuda:7.5-cudnn5-runtime-centos6
-RUN yum install -y centos-release-SCL
-RUN yum install -y python27
-# This whl package is generated by previous build steps.
-ADD python/dist/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl /
-RUN pip install /paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl && rm -f /*.whl
-```
-
-Then build the image by running `docker build -t [REPO]/paddle:[TAG] .` under
-the directory containing your own `Dockerfile`.
-
-We also release a script and Dockerfile for building PaddlePaddle docker images
-across different cuda versions. To build these docker images, run:
-
-```bash
-bash ./build_docker_images.sh
-docker build -t [REPO]/paddle:tag -f [generated_docker_file] .
-```
-
-- NOTE: you can choose a different base image for your environment; all available versions are listed [here](https://hub.docker.com/r/nvidia/cuda/).
-
-### Use Docker Images
-
-Suppose that you have written an application program `train.py` using
-PaddlePaddle, we can test and run it using docker:
-
-```bash
-docker run --rm -it -v $PWD:/work paddlepaddle/paddle /work/train.py
-```
-
-But this works only if all dependencies of `train.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image with the additional dependencies installed.
-
-### Run PaddlePaddle Book In Docker
-
-Our [book repo](https://github.com/paddlepaddle/book) also provides a docker
-image to start a Jupyter notebook inside docker so that you can run this book
-using docker:
-
-```bash
-docker run -d -p 8888:8888 paddlepaddle/book
-```
-
-Please refer to https://github.com/paddlepaddle/book if you want to build this
-docker image yourself.
-
-### Run Distributed Applications
-
-In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API needs to build a PaddlePaddle application into a Docker image as above and call kubectl to run it on the cluster.  This API might need to generate a Dockerfile that looks like the one above and call `docker build`.
-
-Of course, we can manually build an application image and launch the job using the kubectl tool:
-
-```bash
-docker build -f some/Dockerfile -t myapp .
-docker tag myapp me/myapp
-docker push me/myapp
-kubectl ...
-```
-
-### Reading source code with woboq codebrowser
-
-For developers who are interested in the C++ source code, you can build C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
-
-- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
-
-```bash
-./paddle/scripts/paddle_docker_build.sh html
-```
-
-- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
-
-```
-docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
-```
-
-## More Options
-
-### Build Without Docker
-
-Follow the *Dockerfile* in the paddlepaddle repo to set up your local dev environment and run:
-
-```bash
-./paddle/scripts/paddle_build.sh build
-```
-
-### Additional Tasks
-
-You can get the help menu for the build scripts by running with no options:
-
-```bash
-./paddle/scripts/paddle_build.sh
-./paddle/scripts/paddle_docker_build.sh  # or use the Docker wrapper
-```
diff --git a/paddle/scripts/build_docker_images.sh b/paddle/scripts/build_docker_images.sh
deleted file mode 100644
index c60f42da7aa6351985acaede756d139329ef520c..0000000000000000000000000000000000000000
--- a/paddle/scripts/build_docker_images.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/sh
-set -xe
-
-REPO="${REPO:-paddlepaddle}"
-
-cp -f ../../python/requirements.txt .
-
-sed 's#FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04#FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04#g' ../../Dockerfile |
-sed 's#TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz#TensorRT_5.1_ga_cuda9_cudnnv7.5.tar.gz#g' |
-sed 's#/usr/local/TensorRT#/usr/local/TensorRT_5.1_ga_cuda9_cudnnv7.5#g' |
-sed 's#libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0#libnccl2=2.4.7-1+cuda9.0 libnccl-dev=2.4.7-1+cuda9.0#g' |
-sed 's#COPY ./paddle/scripts/docker/root/#COPY ./docker/root/#g' |
-sed 's#COPY ./python/requirements.txt#COPY ./requirements.txt#' > Dockerfile.cuda9.0-cudnn7
-# docker build -t ${REPO}/paddle:cuda9.0-cudnn7-devel-ubuntu16.04 -f Dockerfile.cuda9.0-cudnn7 .
-
-sed 's#FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04#FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04#g' ../../Dockerfile |
-sed 's#TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz#TensorRT_5.1_ga_cuda10_cudnnv7.5.tar.gz#g' |
-sed 's#/usr/local/TensorRT#/usr/local/TensorRT_5.1_ga_cuda10_cudnnv7.5#g' |
-sed 's#libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0#libnccl2=2.4.7-1+cuda10.0 libnccl-dev=2.4.7-1+cuda10.0#g' |
-sed 's#COPY ./paddle/scripts/docker/root/#COPY ./docker/root/#g' |
-sed 's#COPY ./python/requirements.txt#COPY ./requirements.txt#' > Dockerfile.cuda10.0-cudnn7
-# docker build -t ${REPO}/paddle:cuda10.0-cudnn7-devel-ubuntu16.04 -f Dockerfile.cuda10.0-cudnn7 .
diff --git a/paddle/scripts/doc/paddle-development-environment-gpu.graffle b/paddle/scripts/doc/paddle-development-environment-gpu.graffle
deleted file mode 100644
index 4629f9b9da7ababdafa0b964db18a98a819c6a9e..0000000000000000000000000000000000000000
Binary files a/paddle/scripts/doc/paddle-development-environment-gpu.graffle and /dev/null differ
diff --git a/paddle/scripts/doc/paddle-development-environment-gpu.png b/paddle/scripts/doc/paddle-development-environment-gpu.png
deleted file mode 100644
index 61a96d7198d013f08f0f9c269cc352da5f7dd2e9..0000000000000000000000000000000000000000
Binary files a/paddle/scripts/doc/paddle-development-environment-gpu.png and /dev/null differ
diff --git a/paddle/scripts/doc/paddle-development-environment.graffle b/paddle/scripts/doc/paddle-development-environment.graffle
deleted file mode 100644
index 5b164c4832809de94ead7309af49c579135d7f48..0000000000000000000000000000000000000000
Binary files a/paddle/scripts/doc/paddle-development-environment.graffle and /dev/null differ
diff --git a/paddle/scripts/doc/paddle-development-environment.png b/paddle/scripts/doc/paddle-development-environment.png
deleted file mode 100644
index 707ed45a335a981c23b3533984045f53848b55e2..0000000000000000000000000000000000000000
Binary files a/paddle/scripts/doc/paddle-development-environment.png and /dev/null differ
diff --git a/paddle/scripts/docker/root/.bashrc b/paddle/scripts/docker/root/.bashrc
deleted file mode 100755
index 4b3024e4e81a0fa206a796c12a8b9d72f1a8f5d9..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/root/.bashrc
+++ /dev/null
@@ -1,46 +0,0 @@
-# Locales
-
-export LC_ALL=en_US.UTF-8
-export LANG=en_US.UTF-8
-export LANGUAGE=en_US.UTF-8
-
-# Aliases
-
-alias rm='rm -i'
-alias cp='cp -i'
-alias mv='mv -i'
-
-alias ls='ls -hFG'
-alias l='ls -lF'
-alias ll='ls -alF'
-alias lt='ls -ltrF'
-alias ll='ls -alF'
-alias lls='ls -alSrF'
-alias llt='ls -altrF'
-
-# Colorize directory listing
-
-alias ls="ls -ph --color=auto"
-
-# Colorize grep
-
-if echo hello|grep --color=auto l >/dev/null 2>&1; then
-  export GREP_OPTIONS="--color=auto" GREP_COLOR="1;31"
-fi
-
-# Shell
-
-export CLICOLOR="1"
-
-YELLOW="\[\033[1;33m\]"
-NO_COLOUR="\[\033[0m\]"
-GREEN="\[\033[1;32m\]"
-WHITE="\[\033[1;37m\]"
-
-source ~/.scripts/git-prompt.sh
-
-export PS1="\[\033[1;33m\]λ $WHITE\h $GREEN\w$YELLOW\$(__git_ps1 \" \[\033[35m\]{\[\033[36m\]%s\[\033[35m\]}\")$NO_COLOUR "
-
-# Git
-
-source ~/.scripts/git-completion.sh
diff --git a/paddle/scripts/docker/root/.gitconfig b/paddle/scripts/docker/root/.gitconfig
deleted file mode 100755
index 6c249803a50403b9b79e36a13abe7fe88a35729d..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/root/.gitconfig
+++ /dev/null
@@ -1,43 +0,0 @@
-[user]
-  name =
-  email =
-
-[alias]
-  st = status --branch --short
-  ci = commit
-  br = branch
-  co = checkout
-  df = diff
-  l = log --pretty=format:\"%h %ad | %s%d [%an]\" --graph --date=short
-  ll = log --stat
-
-[merge]
-  tool = vimdiff
-
-[core]
-  excludesfile = ~/.gitignore
-  editor = vim
-
-[color]
-  branch = auto
-  diff = auto
-  status = auto
-
-[color "branch"]
-  current = yellow reverse
-  local = yellow
-  remote = green
-
-[color "diff"]
-  meta = yellow bold
-  frag = magenta bold
-  old = red bold
-  new = green bold
-
-[color "status"]
-  added = yellow
-  changed = green
-  untracked = cyan
-
-[push]
-  default = matching
\ No newline at end of file
diff --git a/paddle/scripts/docker/root/.scripts/git-completion.sh b/paddle/scripts/docker/root/.scripts/git-completion.sh
deleted file mode 100755
index bdddef5ac2faf50b47dd03539dae8912bec8a16c..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/root/.scripts/git-completion.sh
+++ /dev/null
@@ -1,2663 +0,0 @@
-#!bash
-#
-# bash/zsh completion support for core Git.
-#
-# Copyright (C) 2006,2007 Shawn O. Pearce <spearce@spearce.org>
-# Conceptually based on gitcompletion (http://gitweb.hawaga.org.uk/).
-# Distributed under the GNU General Public License, version 2.0.
-#
-# The contained completion routines provide support for completing:
-#
-#    *) local and remote branch names
-#    *) local and remote tag names
-#    *) .git/remotes file names
-#    *) git 'subcommands'
-#    *) tree paths within 'ref:path/to/file' expressions
-#    *) file paths within current working directory and index
-#    *) common --long-options
-#
-# To use these routines:
-#
-#    1) Copy this file to somewhere (e.g. ~/.git-completion.sh).
-#    2) Add the following line to your .bashrc/.zshrc:
-#        source ~/.git-completion.sh
-#    3) Consider changing your PS1 to also show the current branch,
-#       see git-prompt.sh for details.
-
-case "$COMP_WORDBREAKS" in
-*:*) : great ;;
-*)   COMP_WORDBREAKS="$COMP_WORDBREAKS:"
-esac
-
-# __gitdir accepts 0 or 1 arguments (i.e., location)
-# returns location of .git repo
-__gitdir ()
-{
-  if [ -z "${1-}" ]; then
-    if [ -n "${__git_dir-}" ]; then
-      echo "$__git_dir"
-    elif [ -n "${GIT_DIR-}" ]; then
-      test -d "${GIT_DIR-}" || return 1
-      echo "$GIT_DIR"
-    elif [ -d .git ]; then
-      echo .git
-    else
-      git rev-parse --git-dir 2>/dev/null
-    fi
-  elif [ -d "$1/.git" ]; then
-    echo "$1/.git"
-  else
-    echo "$1"
-  fi
-}
-
-# The following function is based on code from:
-#
-#   bash_completion - programmable completion functions for bash 3.2+
-#
-#   Copyright © 2006-2008, Ian Macdonald <ian@caliban.org>
-#             © 2009-2010, Bash Completion Maintainers
-#                     <bash-completion-devel@lists.alioth.debian.org>
-#
-#   This program is free software; you can redistribute it and/or modify
-#   it under the terms of the GNU General Public License as published by
-#   the Free Software Foundation; either version 2, or (at your option)
-#   any later version.
-#
-#   This program is distributed in the hope that it will be useful,
-#   but WITHOUT ANY WARRANTY; without even the implied warranty of
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#   GNU General Public License for more details.
-#
-#   You should have received a copy of the GNU General Public License
-#   along with this program; if not, write to the Free Software Foundation,
-#   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-#   The latest version of this software can be obtained here:
-#
-#   http://bash-completion.alioth.debian.org/
-#
-#   RELEASE: 2.x
-
-# This function can be used to access a tokenized list of words
-# on the command line:
-#
-# __git_reassemble_comp_words_by_ref '=:'
-# if test "${words_[cword_-1]}" = -w
-# then
-#   ...
-# fi
-#
-# The argument should be a collection of characters from the list of
-# word completion separators (COMP_WORDBREAKS) to treat as ordinary
-# characters.
-#
-# This is roughly equivalent to going back in time and setting
-# COMP_WORDBREAKS to exclude those characters.  The intent is to
-# make option types like --date=<type> and <rev>:<path> easy to
-# recognize by treating each shell word as a single token.
-#
-# It is best not to set COMP_WORDBREAKS directly because the value is
-# shared with other completion scripts.  By the time the completion
-# function gets called, COMP_WORDS has already been populated so local
-# changes to COMP_WORDBREAKS have no effect.
-#
-# Output: words_, cword_, cur_.
-
-__git_reassemble_comp_words_by_ref()
-{
-  local exclude i j first
-  # Which word separators to exclude?
-  exclude="${1//[^$COMP_WORDBREAKS]}"
-  cword_=$COMP_CWORD
-  if [ -z "$exclude" ]; then
-    words_=("${COMP_WORDS[@]}")
-    return
-  fi
-  # List of word completion separators has shrunk;
-  # re-assemble words to complete.
-  for ((i=0, j=0; i < ${#COMP_WORDS[@]}; i++, j++)); do
-    # Append each nonempty word consisting of just
-    # word separator characters to the current word.
-    first=t
-    while
-      [ $i -gt 0 ] &&
-      [ -n "${COMP_WORDS[$i]}" ] &&
-      # word consists of excluded word separators
-      [ "${COMP_WORDS[$i]//[^$exclude]}" = "${COMP_WORDS[$i]}" ]
-    do
-      # Attach to the previous token,
-      # unless the previous token is the command name.
-      if [ $j -ge 2 ] && [ -n "$first" ]; then
-        ((j--))
-      fi
-      first=
-      words_[$j]=${words_[j]}${COMP_WORDS[i]}
-      if [ $i = $COMP_CWORD ]; then
-        cword_=$j
-      fi
-      if (($i < ${#COMP_WORDS[@]} - 1)); then
-        ((i++))
-      else
-        # Done.
-        return
-      fi
-    done
-    words_[$j]=${words_[j]}${COMP_WORDS[i]}
-    if [ $i = $COMP_CWORD ]; then
-      cword_=$j
-    fi
-  done
-}
-
-if ! type _get_comp_words_by_ref >/dev/null 2>&1; then
-_get_comp_words_by_ref ()
-{
-  local exclude cur_ words_ cword_
-  if [ "$1" = "-n" ]; then
-    exclude=$2
-    shift 2
-  fi
-  __git_reassemble_comp_words_by_ref "$exclude"
-  cur_=${words_[cword_]}
-  while [ $# -gt 0 ]; do
-    case "$1" in
-    cur)
-      cur=$cur_
-      ;;
-    prev)
-      prev=${words_[$cword_-1]}
-      ;;
-    words)
-      words=("${words_[@]}")
-      ;;
-    cword)
-      cword=$cword_
-      ;;
-    esac
-    shift
-  done
-}
-fi
-
-__gitcompadd ()
-{
-  local i=0
-  for x in $1; do
-    if [[ "$x" == "$3"* ]]; then
-      COMPREPLY[i++]="$2$x$4"
-    fi
-  done
-}
-
-# Generates completion reply, appending a space to possible completion words,
-# if necessary.
-# It accepts 1 to 4 arguments:
-# 1: List of possible completion words.
-# 2: A prefix to be added to each possible completion word (optional).
-# 3: Generate possible completion matches for this word (optional).
-# 4: A suffix to be appended to each possible completion word (optional).
-__gitcomp ()
-{
-  local cur_="${3-$cur}"
-
-  case "$cur_" in
-  --*=)
-    ;;
-  *)
-    local c i=0 IFS=$' \t\n'
-    for c in $1; do
-      c="$c${4-}"
-      if [[ $c == "$cur_"* ]]; then
-        case $c in
-        --*=*|*.) ;;
-        *) c="$c " ;;
-        esac
-        COMPREPLY[i++]="${2-}$c"
-      fi
-    done
-    ;;
-  esac
-}
-
-# Generates completion reply from newline-separated possible completion words
-# by appending a space to all of them.
-# It accepts 1 to 4 arguments:
-# 1: List of possible completion words, separated by a single newline.
-# 2: A prefix to be added to each possible completion word (optional).
-# 3: Generate possible completion matches for this word (optional).
-# 4: A suffix to be appended to each possible completion word instead of
-#    the default space (optional).  If specified but empty, nothing is
-#    appended.
-__gitcomp_nl ()
-{
-  local IFS=$'\n'
-  __gitcompadd "$1" "${2-}" "${3-$cur}" "${4- }"
-}
-
-# Generates completion reply with compgen from newline-separated possible
-# completion filenames.
-# It accepts 1 to 3 arguments:
-# 1: List of possible completion filenames, separated by a single newline.
-# 2: A directory prefix to be added to each possible completion filename
-#    (optional).
-# 3: Generate possible completion matches for this word (optional).
-__gitcomp_file ()
-{
-  local IFS=$'\n'
-
-  # XXX does not work when the directory prefix contains a tilde,
-  # since tilde expansion is not applied.
-  # This means that COMPREPLY will be empty and Bash default
-  # completion will be used.
-  __gitcompadd "$1" "${2-}" "${3-$cur}" ""
-
-  # use a hack to enable file mode in bash < 4
-  compopt -o filenames +o nospace 2>/dev/null ||
-  compgen -f /non-existing-dir/ > /dev/null
-}
-
-# Execute 'git ls-files', unless the --committable option is specified, in
-# which case it runs 'git diff-index' to find out the files that can be
-# committed.  It return paths relative to the directory specified in the first
-# argument, and using the options specified in the second argument.
-__git_ls_files_helper ()
-{
-  (
-    test -n "${CDPATH+set}" && unset CDPATH
-    cd "$1"
-    if [ "$2" == "--committable" ]; then
-      git diff-index --name-only --relative HEAD
-    else
-      # NOTE: $2 is not quoted in order to support multiple options
-      git ls-files --exclude-standard $2
-    fi
-  ) 2>/dev/null
-}
-
-
-# __git_index_files accepts 1 or 2 arguments:
-# 1: Options to pass to ls-files (required).
-# 2: A directory path (optional).
-#    If provided, only files within the specified directory are listed.
-#    Sub directories are never recursed.  Path must have a trailing
-#    slash.
-__git_index_files ()
-{
-  local dir="$(__gitdir)" root="${2-.}" file
-
-  if [ -d "$dir" ]; then
-    __git_ls_files_helper "$root" "$1" |
-    while read -r file; do
-      case "$file" in
-      ?*/*) echo "${file%%/*}" ;;
-      *) echo "$file" ;;
-      esac
-    done | sort | uniq
-  fi
-}
-
-__git_heads ()
-{
-  local dir="$(__gitdir)"
-  if [ -d "$dir" ]; then
-    git --git-dir="$dir" for-each-ref --format='%(refname:short)' \
-      refs/heads
-    return
-  fi
-}
-
-__git_tags ()
-{
-  local dir="$(__gitdir)"
-  if [ -d "$dir" ]; then
-    git --git-dir="$dir" for-each-ref --format='%(refname:short)' \
-      refs/tags
-    return
-  fi
-}
-
-# __git_refs accepts 0, 1 (to pass to __gitdir), or 2 arguments
-# presence of 2nd argument means use the guess heuristic employed
-# by checkout for tracking branches
-__git_refs ()
-{
-  local i hash dir="$(__gitdir "${1-}")" track="${2-}"
-  local format refs
-  if [ -d "$dir" ]; then
-    case "$cur" in
-    refs|refs/*)
-      format="refname"
-      refs="${cur%/*}"
-      track=""
-      ;;
-    *)
-      for i in HEAD FETCH_HEAD ORIG_HEAD MERGE_HEAD; do
-        if [ -e "$dir/$i" ]; then echo $i; fi
-      done
-      format="refname:short"
-      refs="refs/tags refs/heads refs/remotes"
-      ;;
-    esac
-    git --git-dir="$dir" for-each-ref --format="%($format)" \
-      $refs
-    if [ -n "$track" ]; then
-      # employ the heuristic used by git checkout
-      # Try to find a remote branch that matches the completion word
-      # but only output if the branch name is unique
-      local ref entry
-      git --git-dir="$dir" for-each-ref --shell --format="ref=%(refname:short)" \
-        "refs/remotes/" | \
-      while read -r entry; do
-        eval "$entry"
-        ref="${ref#*/}"
-        if [[ "$ref" == "$cur"* ]]; then
-          echo "$ref"
-        fi
-      done | sort | uniq -u
-    fi
-    return
-  fi
-  case "$cur" in
-  refs|refs/*)
-    git ls-remote "$dir" "$cur*" 2>/dev/null | \
-    while read -r hash i; do
-      case "$i" in
-      *^{}) ;;
-      *) echo "$i" ;;
-      esac
-    done
-    ;;
-  *)
-    echo "HEAD"
-    git for-each-ref --format="%(refname:short)" -- "refs/remotes/$dir/" | sed -e "s#^$dir/##"
-    ;;
-  esac
-}
-
-# __git_refs2 requires 1 argument (to pass to __git_refs)
-__git_refs2 ()
-{
-  local i
-  for i in $(__git_refs "$1"); do
-    echo "$i:$i"
-  done
-}
-
-# __git_refs_remotes requires 1 argument (to pass to ls-remote)
-__git_refs_remotes ()
-{
-  local i hash
-  git ls-remote "$1" 'refs/heads/*' 2>/dev/null | \
-  while read -r hash i; do
-    echo "$i:refs/remotes/$1/${i#refs/heads/}"
-  done
-}
-
-__git_remotes ()
-{
-  local i IFS=$'\n' d="$(__gitdir)"
-  test -d "$d/remotes" && ls -1 "$d/remotes"
-  for i in $(git --git-dir="$d" config --get-regexp 'remote\..*\.url' 2>/dev/null); do
-    i="${i#remote.}"
-    echo "${i/.url*/}"
-  done
-}
-
-__git_list_merge_strategies ()
-{
-  git merge -s help 2>&1 |
-  sed -n -e '/[Aa]vailable strategies are: /,/^$/{
-    s/\.$//
-    s/.*://
-    s/^[  ]*//
-    s/[   ]*$//
-    p
-  }'
-}
-
-__git_merge_strategies=
-# 'git merge -s help' (and thus detection of the merge strategy
-# list) fails, unfortunately, if run outside of any git working
-# tree.  __git_merge_strategies is set to the empty string in
-# that case, and the detection will be repeated the next time it
-# is needed.
-__git_compute_merge_strategies ()
-{
-  test -n "$__git_merge_strategies" ||
-  __git_merge_strategies=$(__git_list_merge_strategies)
-}
-
-__git_complete_revlist_file ()
-{
-  local pfx ls ref cur_="$cur"
-  case "$cur_" in
-  *..?*:*)
-    return
-    ;;
-  ?*:*)
-    ref="${cur_%%:*}"
-    cur_="${cur_#*:}"
-    case "$cur_" in
-    ?*/*)
-      pfx="${cur_%/*}"
-      cur_="${cur_##*/}"
-      ls="$ref:$pfx"
-      pfx="$pfx/"
-      ;;
-    *)
-      ls="$ref"
-      ;;
-    esac
-
-    case "$COMP_WORDBREAKS" in
-    *:*) : great ;;
-    *)   pfx="$ref:$pfx" ;;
-    esac
-
-    __gitcomp_nl "$(git --git-dir="$(__gitdir)" ls-tree "$ls" 2>/dev/null \
-        | sed '/^100... blob /{
-                   s,^.*  ,,
-                   s,$, ,
-               }
-               /^120000 blob /{
-                   s,^.*  ,,
-                   s,$, ,
-               }
-               /^040000 tree /{
-                   s,^.*  ,,
-                   s,$,/,
-               }
-               s/^.*  //')" \
-      "$pfx" "$cur_" ""
-    ;;
-  *...*)
-    pfx="${cur_%...*}..."
-    cur_="${cur_#*...}"
-    __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
-    ;;
-  *..*)
-    pfx="${cur_%..*}.."
-    cur_="${cur_#*..}"
-    __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
-    ;;
-  *)
-    __gitcomp_nl "$(__git_refs)"
-    ;;
-  esac
-}
-
-
-# __git_complete_index_file requires 1 argument:
-# 1: the options to pass to ls-file
-#
-# The exception is --committable, which finds the files appropriate commit.
-__git_complete_index_file ()
-{
-  local pfx="" cur_="$cur"
-
-  case "$cur_" in
-  ?*/*)
-    pfx="${cur_%/*}"
-    cur_="${cur_##*/}"
-    pfx="${pfx}/"
-    ;;
-  esac
-
-  __gitcomp_file "$(__git_index_files "$1" "$pfx")" "$pfx" "$cur_"
-}
-
-__git_complete_file ()
-{
-  __git_complete_revlist_file
-}
-
-__git_complete_revlist ()
-{
-  __git_complete_revlist_file
-}
-
-__git_complete_remote_or_refspec ()
-{
-  local cur_="$cur" cmd="${words[1]}"
-  local i c=2 remote="" pfx="" lhs=1 no_complete_refspec=0
-  if [ "$cmd" = "remote" ]; then
-    ((c++))
-  fi
-  while [ $c -lt $cword ]; do
-    i="${words[c]}"
-    case "$i" in
-    --mirror) [ "$cmd" = "push" ] && no_complete_refspec=1 ;;
-    --all)
-      case "$cmd" in
-      push) no_complete_refspec=1 ;;
-      fetch)
-        return
-        ;;
-      *) ;;
-      esac
-      ;;
-    -*) ;;
-    *) remote="$i"; break ;;
-    esac
-    ((c++))
-  done
-  if [ -z "$remote" ]; then
-    __gitcomp_nl "$(__git_remotes)"
-    return
-  fi
-  if [ $no_complete_refspec = 1 ]; then
-    return
-  fi
-  [ "$remote" = "." ] && remote=
-  case "$cur_" in
-  *:*)
-    case "$COMP_WORDBREAKS" in
-    *:*) : great ;;
-    *)   pfx="${cur_%%:*}:" ;;
-    esac
-    cur_="${cur_#*:}"
-    lhs=0
-    ;;
-  +*)
-    pfx="+"
-    cur_="${cur_#+}"
-    ;;
-  esac
-  case "$cmd" in
-  fetch)
-    if [ $lhs = 1 ]; then
-      __gitcomp_nl "$(__git_refs2 "$remote")" "$pfx" "$cur_"
-    else
-      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
-    fi
-    ;;
-  pull|remote)
-    if [ $lhs = 1 ]; then
-      __gitcomp_nl "$(__git_refs "$remote")" "$pfx" "$cur_"
-    else
-      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
-    fi
-    ;;
-  push)
-    if [ $lhs = 1 ]; then
-      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
-    else
-      __gitcomp_nl "$(__git_refs "$remote")" "$pfx" "$cur_"
-    fi
-    ;;
-  esac
-}
-
-__git_complete_strategy ()
-{
-  __git_compute_merge_strategies
-  case "$prev" in
-  -s|--strategy)
-    __gitcomp "$__git_merge_strategies"
-    return 0
-  esac
-  case "$cur" in
-  --strategy=*)
-    __gitcomp "$__git_merge_strategies" "" "${cur##--strategy=}"
-    return 0
-    ;;
-  esac
-  return 1
-}
-
-__git_commands () {
-  if test -n "${GIT_TESTING_COMMAND_COMPLETION:-}"
-  then
-    printf "%s" "${GIT_TESTING_COMMAND_COMPLETION}"
-  else
-    git help -a|egrep '^  [a-zA-Z0-9]'
-  fi
-}
-
-__git_list_all_commands ()
-{
-  local i IFS=" "$'\n'
-  for i in $(__git_commands)
-  do
-    case $i in
-    *--*)             : helper pattern;;
-    *) echo $i;;
-    esac
-  done
-}
-
-__git_all_commands=
-__git_compute_all_commands ()
-{
-  test -n "$__git_all_commands" ||
-  __git_all_commands=$(__git_list_all_commands)
-}
-
-__git_list_porcelain_commands ()
-{
-  local i IFS=" "$'\n'
-  __git_compute_all_commands
-  for i in $__git_all_commands
-  do
-    case $i in
-    *--*)             : helper pattern;;
-    applymbox)        : ask gittus;;
-    applypatch)       : ask gittus;;
-    archimport)       : import;;
-    cat-file)         : plumbing;;
-    check-attr)       : plumbing;;
-    check-ignore)     : plumbing;;
-    check-mailmap)    : plumbing;;
-    check-ref-format) : plumbing;;
-    checkout-index)   : plumbing;;
-    commit-tree)      : plumbing;;
-    count-objects)    : infrequent;;
-    credential-cache) : credentials helper;;
-    credential-store) : credentials helper;;
-    cvsexportcommit)  : export;;
-    cvsimport)        : import;;
-    cvsserver)        : daemon;;
-    daemon)           : daemon;;
-    diff-files)       : plumbing;;
-    diff-index)       : plumbing;;
-    diff-tree)        : plumbing;;
-    fast-import)      : import;;
-    fast-export)      : export;;
-    fsck-objects)     : plumbing;;
-    fetch-pack)       : plumbing;;
-    fmt-merge-msg)    : plumbing;;
-    for-each-ref)     : plumbing;;
-    hash-object)      : plumbing;;
-    http-*)           : transport;;
-    index-pack)       : plumbing;;
-    init-db)          : deprecated;;
-    local-fetch)      : plumbing;;
-    lost-found)       : infrequent;;
-    ls-files)         : plumbing;;
-    ls-remote)        : plumbing;;
-    ls-tree)          : plumbing;;
-    mailinfo)         : plumbing;;
-    mailsplit)        : plumbing;;
-    merge-*)          : plumbing;;
-    mktree)           : plumbing;;
-    mktag)            : plumbing;;
-    pack-objects)     : plumbing;;
-    pack-redundant)   : plumbing;;
-    pack-refs)        : plumbing;;
-    parse-remote)     : plumbing;;
-    patch-id)         : plumbing;;
-    peek-remote)      : plumbing;;
-    prune)            : plumbing;;
-    prune-packed)     : plumbing;;
-    quiltimport)      : import;;
-    read-tree)        : plumbing;;
-    receive-pack)     : plumbing;;
-    remote-*)         : transport;;
-    repo-config)      : deprecated;;
-    rerere)           : plumbing;;
-    rev-list)         : plumbing;;
-    rev-parse)        : plumbing;;
-    runstatus)        : plumbing;;
-    sh-setup)         : internal;;
-    shell)            : daemon;;
-    show-ref)         : plumbing;;
-    send-pack)        : plumbing;;
-    show-index)       : plumbing;;
-    ssh-*)            : transport;;
-    stripspace)       : plumbing;;
-    symbolic-ref)     : plumbing;;
-    tar-tree)         : deprecated;;
-    unpack-file)      : plumbing;;
-    unpack-objects)   : plumbing;;
-    update-index)     : plumbing;;
-    update-ref)       : plumbing;;
-    update-server-info) : daemon;;
-    upload-archive)   : plumbing;;
-    upload-pack)      : plumbing;;
-    write-tree)       : plumbing;;
-    var)              : infrequent;;
-    verify-pack)      : infrequent;;
-    verify-tag)       : plumbing;;
-    *) echo $i;;
-    esac
-  done
-}
-
-__git_porcelain_commands=
-__git_compute_porcelain_commands ()
-{
-  __git_compute_all_commands
-  test -n "$__git_porcelain_commands" ||
-  __git_porcelain_commands=$(__git_list_porcelain_commands)
-}
-
-__git_pretty_aliases ()
-{
-  local i IFS=$'\n'
-  for i in $(git --git-dir="$(__gitdir)" config --get-regexp "pretty\..*" 2>/dev/null); do
-    case "$i" in
-    pretty.*)
-      i="${i#pretty.}"
-      echo "${i/ */}"
-      ;;
-    esac
-  done
-}
-
-__git_aliases ()
-{
-  local i IFS=$'\n'
-  for i in $(git --git-dir="$(__gitdir)" config --get-regexp "alias\..*" 2>/dev/null); do
-    case "$i" in
-    alias.*)
-      i="${i#alias.}"
-      echo "${i/ */}"
-      ;;
-    esac
-  done
-}
-
-# __git_aliased_command requires 1 argument
-__git_aliased_command ()
-{
-  local word cmdline=$(git --git-dir="$(__gitdir)" \
-    config --get "alias.$1")
-  for word in $cmdline; do
-    case "$word" in
-    \!gitk|gitk)
-      echo "gitk"
-      return
-      ;;
-    \!*)  : shell command alias ;;
-    -*) : option ;;
-    *=*)  : setting env ;;
-    git)  : git itself ;;
-    *)
-      echo "$word"
-      return
-    esac
-  done
-}
-
-# __git_find_on_cmdline requires 1 argument
-__git_find_on_cmdline ()
-{
-  local word subcommand c=1
-  while [ $c -lt $cword ]; do
-    word="${words[c]}"
-    for subcommand in $1; do
-      if [ "$subcommand" = "$word" ]; then
-        echo "$subcommand"
-        return
-      fi
-    done
-    ((c++))
-  done
-}
-
-__git_has_doubledash ()
-{
-  local c=1
-  while [ $c -lt $cword ]; do
-    if [ "--" = "${words[c]}" ]; then
-      return 0
-    fi
-    ((c++))
-  done
-  return 1
-}
-
-# Try to count non option arguments passed on the command line for the
-# specified git command.
-# When options are used, it is necessary to use the special -- option to
-# tell the implementation were non option arguments begin.
-# XXX this can not be improved, since options can appear everywhere, as
-# an example:
-# git mv x -n y
-#
-# __git_count_arguments requires 1 argument: the git command executed.
-__git_count_arguments ()
-{
-  local word i c=0
-
-  # Skip "git" (first argument)
-  for ((i=1; i < ${#words[@]}; i++)); do
-    word="${words[i]}"
-
-    case "$word" in
-      --)
-        # Good; we can assume that the following are only non
-        # option arguments.
-        ((c = 0))
-        ;;
-      "$1")
-        # Skip the specified git command and discard git
-        # main options
-        ((c = 0))
-        ;;
-      ?*)
-        ((c++))
-        ;;
-    esac
-  done
-
-  printf "%d" $c
-}
-
-__git_whitespacelist="nowarn warn error error-all fix"
-
-_git_am ()
-{
-  local dir="$(__gitdir)"
-  if [ -d "$dir"/rebase-apply ]; then
-    __gitcomp "--skip --continue --resolved --abort"
-    return
-  fi
-  case "$cur" in
-  --whitespace=*)
-    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
-    return
-    ;;
-  --*)
-    __gitcomp "
-      --3way --committer-date-is-author-date --ignore-date
-      --ignore-whitespace --ignore-space-change
-      --interactive --keep --no-utf8 --signoff --utf8
-      --whitespace= --scissors
-      "
-    return
-  esac
-}
-
-_git_apply ()
-{
-  case "$cur" in
-  --whitespace=*)
-    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
-    return
-    ;;
-  --*)
-    __gitcomp "
-      --stat --numstat --summary --check --index
-      --cached --index-info --reverse --reject --unidiff-zero
-      --apply --no-add --exclude=
-      --ignore-whitespace --ignore-space-change
-      --whitespace= --inaccurate-eof --verbose
-      "
-    return
-  esac
-}
-
-_git_add ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "
-      --interactive --refresh --patch --update --dry-run
-      --ignore-errors --intent-to-add
-      "
-    return
-  esac
-
-  # XXX should we check for --update and --all options ?
-  __git_complete_index_file "--others --modified"
-}
-
-_git_archive ()
-{
-  case "$cur" in
-  --format=*)
-    __gitcomp "$(git archive --list)" "" "${cur##--format=}"
-    return
-    ;;
-  --remote=*)
-    __gitcomp_nl "$(__git_remotes)" "" "${cur##--remote=}"
-    return
-    ;;
-  --*)
-    __gitcomp "
-      --format= --list --verbose
-      --prefix= --remote= --exec=
-      "
-    return
-    ;;
-  esac
-  __git_complete_file
-}
-
-_git_bisect ()
-{
-  __git_has_doubledash && return
-
-  local subcommands="start bad good skip reset visualize replay log run"
-  local subcommand="$(__git_find_on_cmdline "$subcommands")"
-  if [ -z "$subcommand" ]; then
-    if [ -f "$(__gitdir)"/BISECT_START ]; then
-      __gitcomp "$subcommands"
-    else
-      __gitcomp "replay start"
-    fi
-    return
-  fi
-
-  case "$subcommand" in
-  bad|good|reset|skip|start)
-    __gitcomp_nl "$(__git_refs)"
-    ;;
-  *)
-    ;;
-  esac
-}
-
-_git_branch ()
-{
-  local i c=1 only_local_ref="n" has_r="n"
-
-  while [ $c -lt $cword ]; do
-    i="${words[c]}"
-    case "$i" in
-    -d|-m)  only_local_ref="y" ;;
-    -r) has_r="y" ;;
-    esac
-    ((c++))
-  done
-
-  case "$cur" in
-  --set-upstream-to=*)
-    __gitcomp "$(__git_refs)" "" "${cur##--set-upstream-to=}"
-    ;;
-  --*)
-    __gitcomp "
-      --color --no-color --verbose --abbrev= --no-abbrev
-      --track --no-track --contains --merged --no-merged
-      --set-upstream-to= --edit-description --list
-      --unset-upstream
-      "
-    ;;
-  *)
-    if [ $only_local_ref = "y" -a $has_r = "n" ]; then
-      __gitcomp_nl "$(__git_heads)"
-    else
-      __gitcomp_nl "$(__git_refs)"
-    fi
-    ;;
-  esac
-}
-
-_git_bundle ()
-{
-  local cmd="${words[2]}"
-  case "$cword" in
-  2)
-    __gitcomp "create list-heads verify unbundle"
-    ;;
-  3)
-    # looking for a file
-    ;;
-  *)
-    case "$cmd" in
-      create)
-        __git_complete_revlist
-      ;;
-    esac
-    ;;
-  esac
-}
-
-_git_checkout ()
-{
-  __git_has_doubledash && return
-
-  case "$cur" in
-  --conflict=*)
-    __gitcomp "diff3 merge" "" "${cur##--conflict=}"
-    ;;
-  --*)
-    __gitcomp "
-      --quiet --ours --theirs --track --no-track --merge
-      --conflict= --orphan --patch
-      "
-    ;;
-  *)
-    # check if --track, --no-track, or --no-guess was specified
-    # if so, disable DWIM mode
-    local flags="--track --no-track --no-guess" track=1
-    if [ -n "$(__git_find_on_cmdline "$flags")" ]; then
-      track=''
-    fi
-    __gitcomp_nl "$(__git_refs '' $track)"
-    ;;
-  esac
-}
-
-_git_cherry ()
-{
-  __gitcomp "$(__git_refs)"
-}
-
-_git_cherry_pick ()
-{
-  local dir="$(__gitdir)"
-  if [ -f "$dir"/CHERRY_PICK_HEAD ]; then
-    __gitcomp "--continue --quit --abort"
-    return
-  fi
-  case "$cur" in
-  --*)
-    __gitcomp "--edit --no-commit --signoff --strategy= --mainline"
-    ;;
-  *)
-    __gitcomp_nl "$(__git_refs)"
-    ;;
-  esac
-}
-
-_git_clean ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "--dry-run --quiet"
-    return
-    ;;
-  esac
-
-  # XXX should we check for -x option ?
-  __git_complete_index_file "--others"
-}
-
-_git_clone ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "
-      --local
-      --no-hardlinks
-      --shared
-      --reference
-      --quiet
-      --no-checkout
-      --bare
-      --mirror
-      --origin
-      --upload-pack
-      --template=
-      --depth
-      --single-branch
-      --branch
-      "
-    return
-    ;;
-  esac
-}
-
-_git_commit ()
-{
-  case "$prev" in
-  -c|-C)
-    __gitcomp_nl "$(__git_refs)" "" "${cur}"
-    return
-    ;;
-  esac
-
-  case "$cur" in
-  --cleanup=*)
-    __gitcomp "default strip verbatim whitespace
-      " "" "${cur##--cleanup=}"
-    return
-    ;;
-  --reuse-message=*|--reedit-message=*|\
-  --fixup=*|--squash=*)
-    __gitcomp_nl "$(__git_refs)" "" "${cur#*=}"
-    return
-    ;;
-  --untracked-files=*)
-    __gitcomp "all no normal" "" "${cur##--untracked-files=}"
-    return
-    ;;
-  --*)
-    __gitcomp "
-      --all --author= --signoff --verify --no-verify
-      --edit --no-edit
-      --amend --include --only --interactive
-      --dry-run --reuse-message= --reedit-message=
-      --reset-author --file= --message= --template=
-      --cleanup= --untracked-files --untracked-files=
-      --verbose --quiet --fixup= --squash=
-      "
-    return
-  esac
-
-  if git rev-parse --verify --quiet HEAD >/dev/null; then
-    __git_complete_index_file "--committable"
-  else
-    # This is the first commit
-    __git_complete_index_file "--cached"
-  fi
-}
-
-_git_describe ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "
-      --all --tags --contains --abbrev= --candidates=
-      --exact-match --debug --long --match --always
-      "
-    return
-  esac
-  __gitcomp_nl "$(__git_refs)"
-}
-
-__git_diff_algorithms="myers minimal patience histogram"
-
-__git_diff_common_options="--stat --numstat --shortstat --summary
-      --patch-with-stat --name-only --name-status --color
-      --no-color --color-words --no-renames --check
-      --full-index --binary --abbrev --diff-filter=
-      --find-copies-harder
-      --text --ignore-space-at-eol --ignore-space-change
-      --ignore-all-space --exit-code --quiet --ext-diff
-      --no-ext-diff
-      --no-prefix --src-prefix= --dst-prefix=
-      --inter-hunk-context=
-      --patience --histogram --minimal
-      --raw --word-diff
-      --dirstat --dirstat= --dirstat-by-file
-      --dirstat-by-file= --cumulative
-      --diff-algorithm=
-"
-
-_git_diff ()
-{
-  __git_has_doubledash && return
-
-  case "$cur" in
-  --diff-algorithm=*)
-    __gitcomp "$__git_diff_algorithms" "" "${cur##--diff-algorithm=}"
-    return
-    ;;
-  --*)
-    __gitcomp "--cached --staged --pickaxe-all --pickaxe-regex
-      --base --ours --theirs --no-index
-      $__git_diff_common_options
-      "
-    return
-    ;;
-  esac
-  __git_complete_revlist_file
-}
-
-__git_mergetools_common="diffuse ecmerge emerge kdiff3 meld opendiff
-      tkdiff vimdiff gvimdiff xxdiff araxis p4merge bc3 codecompare
-"
-
-_git_difftool ()
-{
-  __git_has_doubledash && return
-
-  case "$cur" in
-  --tool=*)
-    __gitcomp "$__git_mergetools_common kompare" "" "${cur##--tool=}"
-    return
-    ;;
-  --*)
-    __gitcomp "--cached --staged --pickaxe-all --pickaxe-regex
-      --base --ours --theirs
-      --no-renames --diff-filter= --find-copies-harder
-      --relative --ignore-submodules
-      --tool="
-    return
-    ;;
-  esac
-  __git_complete_revlist_file
-}
-
-__git_fetch_options="
-  --quiet --verbose --append --upload-pack --force --keep --depth=
-  --tags --no-tags --all --prune --dry-run
-"
-
-_git_fetch ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "$__git_fetch_options"
-    return
-    ;;
-  esac
-  __git_complete_remote_or_refspec
-}
-
-__git_format_patch_options="
-  --stdout --attach --no-attach --thread --thread= --no-thread
-  --numbered --start-number --numbered-files --keep-subject --signoff
-  --signature --no-signature --in-reply-to= --cc= --full-index --binary
-  --not --all --cover-letter --no-prefix --src-prefix= --dst-prefix=
-  --inline --suffix= --ignore-if-in-upstream --subject-prefix=
-  --output-directory --reroll-count --to= --quiet --notes
-"
-
-_git_format_patch ()
-{
-  case "$cur" in
-  --thread=*)
-    __gitcomp "
-      deep shallow
-      " "" "${cur##--thread=}"
-    return
-    ;;
-  --*)
-    __gitcomp "$__git_format_patch_options"
-    return
-    ;;
-  esac
-  __git_complete_revlist
-}
-
-_git_fsck ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "
-      --tags --root --unreachable --cache --no-reflogs --full
-      --strict --verbose --lost-found
-      "
-    return
-    ;;
-  esac
-}
-
-_git_gc ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "--prune --aggressive"
-    return
-    ;;
-  esac
-}
-
-_git_gitk ()
-{
-  _gitk
-}
-
-__git_match_ctag() {
-  awk "/^${1////\\/}/ { print \$1 }" "$2"
-}
-
-_git_grep ()
-{
-  __git_has_doubledash && return
-
-  case "$cur" in
-  --*)
-    __gitcomp "
-      --cached
-      --text --ignore-case --word-regexp --invert-match
-      --full-name --line-number
-      --extended-regexp --basic-regexp --fixed-strings
-      --perl-regexp
-      --files-with-matches --name-only
-      --files-without-match
-      --max-depth
-      --count
-      --and --or --not --all-match
-      "
-    return
-    ;;
-  esac
-
-  case "$cword,$prev" in
-  2,*|*,-*)
-    if test -r tags; then
-      __gitcomp_nl "$(__git_match_ctag "$cur" tags)"
-      return
-    fi
-    ;;
-  esac
-
-  __gitcomp_nl "$(__git_refs)"
-}
-
-_git_help ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "--all --info --man --web"
-    return
-    ;;
-  esac
-  __git_compute_all_commands
-  __gitcomp "$__git_all_commands $(__git_aliases)
-    attributes cli core-tutorial cvs-migration
-    diffcore gitk glossary hooks ignore modules
-    namespaces repository-layout tutorial tutorial-2
-    workflows
-    "
-}
-
-_git_init ()
-{
-  case "$cur" in
-  --shared=*)
-    __gitcomp "
-      false true umask group all world everybody
-      " "" "${cur##--shared=}"
-    return
-    ;;
-  --*)
-    __gitcomp "--quiet --bare --template= --shared --shared="
-    return
-    ;;
-  esac
-}
-
-_git_ls_files ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "--cached --deleted --modified --others --ignored
-      --stage --directory --no-empty-directory --unmerged
-      --killed --exclude= --exclude-from=
-      --exclude-per-directory= --exclude-standard
-      --error-unmatch --with-tree= --full-name
-      --abbrev --ignored --exclude-per-directory
-      "
-    return
-    ;;
-  esac
-
-  # XXX ignore options like --modified and always suggest all cached
-  # files.
-  __git_complete_index_file "--cached"
-}
-
-_git_ls_remote ()
-{
-  __gitcomp_nl "$(__git_remotes)"
-}
-
-_git_ls_tree ()
-{
-  __git_complete_file
-}
-
-# Options that go well for log, shortlog and gitk
-__git_log_common_options="
-  --not --all
-  --branches --tags --remotes
-  --first-parent --merges --no-merges
-  --max-count=
-  --max-age= --since= --after=
-  --min-age= --until= --before=
-  --min-parents= --max-parents=
-  --no-min-parents --no-max-parents
-"
-# Options that go well for log and gitk (not shortlog)
-__git_log_gitk_options="
-  --dense --sparse --full-history
-  --simplify-merges --simplify-by-decoration
-  --left-right --notes --no-notes
-"
-# Options that go well for log and shortlog (not gitk)
-__git_log_shortlog_options="
-  --author= --committer= --grep=
-  --all-match
-"
-
-__git_log_pretty_formats="oneline short medium full fuller email raw format:"
-__git_log_date_formats="relative iso8601 rfc2822 short local default raw"
-
-_git_log ()
-{
-  __git_has_doubledash && return
-
-  local g="$(git rev-parse --git-dir 2>/dev/null)"
-  local merge=""
-  if [ -f "$g/MERGE_HEAD" ]; then
-    merge="--merge"
-  fi
-  case "$cur" in
-  --pretty=*|--format=*)
-    __gitcomp "$__git_log_pretty_formats $(__git_pretty_aliases)
-      " "" "${cur#*=}"
-    return
-    ;;
-  --date=*)
-    __gitcomp "$__git_log_date_formats" "" "${cur##--date=}"
-    return
-    ;;
-  --decorate=*)
-    __gitcomp "long short" "" "${cur##--decorate=}"
-    return
-    ;;
-  --*)
-    __gitcomp "
-      $__git_log_common_options
-      $__git_log_shortlog_options
-      $__git_log_gitk_options
-      --root --topo-order --date-order --reverse
-      --follow --full-diff
-      --abbrev-commit --abbrev=
-      --relative-date --date=
-      --pretty= --format= --oneline
-      --cherry-pick
-      --graph
-      --decorate --decorate=
-      --walk-reflogs
-      --parents --children
-      $merge
-      $__git_diff_common_options
-      --pickaxe-all --pickaxe-regex
-      "
-    return
-    ;;
-  esac
-  __git_complete_revlist
-}
-
-__git_merge_options="
-  --no-commit --no-stat --log --no-log --squash --strategy
-  --commit --stat --no-squash --ff --no-ff --ff-only --edit --no-edit
-"
-
-_git_merge ()
-{
-  __git_complete_strategy && return
-
-  case "$cur" in
-  --*)
-    __gitcomp "$__git_merge_options"
-    return
-  esac
-  __gitcomp_nl "$(__git_refs)"
-}
-
-_git_mergetool ()
-{
-  case "$cur" in
-  --tool=*)
-    __gitcomp "$__git_mergetools_common tortoisemerge" "" "${cur##--tool=}"
-    return
-    ;;
-  --*)
-    __gitcomp "--tool="
-    return
-    ;;
-  esac
-}
-
-_git_merge_base ()
-{
-  __gitcomp_nl "$(__git_refs)"
-}
-
-_git_mv ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "--dry-run"
-    return
-    ;;
-  esac
-
-  if [ $(__git_count_arguments "mv") -gt 0 ]; then
-    # We need to show both cached and untracked files (including
-    # empty directories) since this may not be the last argument.
-    __git_complete_index_file "--cached --others --directory"
-  else
-    __git_complete_index_file "--cached"
-  fi
-}
-
-_git_name_rev ()
-{
-  __gitcomp "--tags --all --stdin"
-}
-
-_git_notes ()
-{
-  local subcommands='add append copy edit list prune remove show'
-  local subcommand="$(__git_find_on_cmdline "$subcommands")"
-
-  case "$subcommand,$cur" in
-  ,--*)
-    __gitcomp '--ref'
-    ;;
-  ,*)
-    case "$prev" in
-    --ref)
-      __gitcomp_nl "$(__git_refs)"
-      ;;
-    *)
-      __gitcomp "$subcommands --ref"
-      ;;
-    esac
-    ;;
-  add,--reuse-message=*|append,--reuse-message=*|\
-  add,--reedit-message=*|append,--reedit-message=*)
-    __gitcomp_nl "$(__git_refs)" "" "${cur#*=}"
-    ;;
-  add,--*|append,--*)
-    __gitcomp '--file= --message= --reedit-message=
-        --reuse-message='
-    ;;
-  copy,--*)
-    __gitcomp '--stdin'
-    ;;
-  prune,--*)
-    __gitcomp '--dry-run --verbose'
-    ;;
-  prune,*)
-    ;;
-  *)
-    case "$prev" in
-    -m|-F)
-      ;;
-    *)
-      __gitcomp_nl "$(__git_refs)"
-      ;;
-    esac
-    ;;
-  esac
-}
-
-_git_pull ()
-{
-  __git_complete_strategy && return
-
-  case "$cur" in
-  --*)
-    __gitcomp "
-      --rebase --no-rebase
-      $__git_merge_options
-      $__git_fetch_options
-    "
-    return
-    ;;
-  esac
-  __git_complete_remote_or_refspec
-}
-
-_git_push ()
-{
-  case "$prev" in
-  --repo)
-    __gitcomp_nl "$(__git_remotes)"
-    return
-  esac
-  case "$cur" in
-  --repo=*)
-    __gitcomp_nl "$(__git_remotes)" "" "${cur##--repo=}"
-    return
-    ;;
-  --*)
-    __gitcomp "
-      --all --mirror --tags --dry-run --force --verbose
-      --receive-pack= --repo= --set-upstream
-    "
-    return
-    ;;
-  esac
-  __git_complete_remote_or_refspec
-}
-
-_git_rebase ()
-{
-  local dir="$(__gitdir)"
-  if [ -d "$dir"/rebase-apply ] || [ -d "$dir"/rebase-merge ]; then
-    __gitcomp "--continue --skip --abort"
-    return
-  fi
-  __git_complete_strategy && return
-  case "$cur" in
-  --whitespace=*)
-    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
-    return
-    ;;
-  --*)
-    __gitcomp "
-      --onto --merge --strategy --interactive
-      --preserve-merges --stat --no-stat
-      --committer-date-is-author-date --ignore-date
-      --ignore-whitespace --whitespace=
-      --autosquash
-      "
-
-    return
-  esac
-  __gitcomp_nl "$(__git_refs)"
-}
-
-_git_reflog ()
-{
-  local subcommands="show delete expire"
-  local subcommand="$(__git_find_on_cmdline "$subcommands")"
-
-  if [ -z "$subcommand" ]; then
-    __gitcomp "$subcommands"
-  else
-    __gitcomp_nl "$(__git_refs)"
-  fi
-}
-
-__git_send_email_confirm_options="always never auto cc compose"
-__git_send_email_suppresscc_options="author self cc bodycc sob cccmd body all"
-
-_git_send_email ()
-{
-  case "$cur" in
-  --confirm=*)
-    __gitcomp "
-      $__git_send_email_confirm_options
-      " "" "${cur##--confirm=}"
-    return
-    ;;
-  --suppress-cc=*)
-    __gitcomp "
-      $__git_send_email_suppresscc_options
-      " "" "${cur##--suppress-cc=}"
-
-    return
-    ;;
-  --smtp-encryption=*)
-    __gitcomp "ssl tls" "" "${cur##--smtp-encryption=}"
-    return
-    ;;
-  --thread=*)
-    __gitcomp "
-      deep shallow
-      " "" "${cur##--thread=}"
-    return
-    ;;
-  --*)
-    __gitcomp "--annotate --bcc --cc --cc-cmd --chain-reply-to
-      --compose --confirm= --dry-run --envelope-sender
-      --from --identity
-      --in-reply-to --no-chain-reply-to --no-signed-off-by-cc
-      --no-suppress-from --no-thread --quiet
-      --signed-off-by-cc --smtp-pass --smtp-server
-      --smtp-server-port --smtp-encryption= --smtp-user
-      --subject --suppress-cc= --suppress-from --thread --to
-      --validate --no-validate
-      $__git_format_patch_options"
-    return
-    ;;
-  esac
-  __git_complete_revlist
-}
-
-_git_stage ()
-{
-  _git_add
-}
-
-__git_config_get_set_variables ()
-{
-  local prevword word config_file= c=$cword
-  while [ $c -gt 1 ]; do
-    word="${words[c]}"
-    case "$word" in
-    --system|--global|--local|--file=*)
-      config_file="$word"
-      break
-      ;;
-    -f|--file)
-      config_file="$word $prevword"
-      break
-      ;;
-    esac
-    prevword=$word
-    c=$((--c))
-  done
-
-  git --git-dir="$(__gitdir)" config $config_file --list 2>/dev/null |
-  while read -r line
-  do
-    case "$line" in
-    *.*=*)
-      echo "${line/=*/}"
-      ;;
-    esac
-  done
-}
-
-_git_config ()
-{
-  case "$prev" in
-  branch.*.remote|branch.*.pushremote)
-    __gitcomp_nl "$(__git_remotes)"
-    return
-    ;;
-  branch.*.merge)
-    __gitcomp_nl "$(__git_refs)"
-    return
-    ;;
-  branch.*.rebase)
-    __gitcomp "false true"
-    return
-    ;;
-  remote.pushdefault)
-    __gitcomp_nl "$(__git_remotes)"
-    return
-    ;;
-  remote.*.fetch)
-    local remote="${prev#remote.}"
-    remote="${remote%.fetch}"
-    if [ -z "$cur" ]; then
-      __gitcomp_nl "refs/heads/" "" "" ""
-      return
-    fi
-    __gitcomp_nl "$(__git_refs_remotes "$remote")"
-    return
-    ;;
-  remote.*.push)
-    local remote="${prev#remote.}"
-    remote="${remote%.push}"
-    __gitcomp_nl "$(git --git-dir="$(__gitdir)" \
-      for-each-ref --format='%(refname):%(refname)' \
-      refs/heads)"
-    return
-    ;;
-  pull.twohead|pull.octopus)
-    __git_compute_merge_strategies
-    __gitcomp "$__git_merge_strategies"
-    return
-    ;;
-  color.branch|color.diff|color.interactive|\
-  color.showbranch|color.status|color.ui)
-    __gitcomp "always never auto"
-    return
-    ;;
-  color.pager)
-    __gitcomp "false true"
-    return
-    ;;
-  color.*.*)
-    __gitcomp "
-      normal black red green yellow blue magenta cyan white
-      bold dim ul blink reverse
-      "
-    return
-    ;;
-  diff.submodule)
-    __gitcomp "log short"
-    return
-    ;;
-  help.format)
-    __gitcomp "man info web html"
-    return
-    ;;
-  log.date)
-    __gitcomp "$__git_log_date_formats"
-    return
-    ;;
-  sendemail.aliasesfiletype)
-    __gitcomp "mutt mailrc pine elm gnus"
-    return
-    ;;
-  sendemail.confirm)
-    __gitcomp "$__git_send_email_confirm_options"
-    return
-    ;;
-  sendemail.suppresscc)
-    __gitcomp "$__git_send_email_suppresscc_options"
-    return
-    ;;
-  --get|--get-all|--unset|--unset-all)
-    __gitcomp_nl "$(__git_config_get_set_variables)"
-    return
-    ;;
-  *.*)
-    return
-    ;;
-  esac
-  case "$cur" in
-  --*)
-    __gitcomp "
-      --system --global --local --file=
-      --list --replace-all
-      --get --get-all --get-regexp
-      --add --unset --unset-all
-      --remove-section --rename-section
-      "
-    return
-    ;;
-  branch.*.*)
-    local pfx="${cur%.*}." cur_="${cur##*.}"
-    __gitcomp "remote pushremote merge mergeoptions rebase" "$pfx" "$cur_"
-    return
-    ;;
-  branch.*)
-    local pfx="${cur%.*}." cur_="${cur#*.}"
-    __gitcomp_nl "$(__git_heads)" "$pfx" "$cur_" "."
-    return
-    ;;
-  guitool.*.*)
-    local pfx="${cur%.*}." cur_="${cur##*.}"
-    __gitcomp "
-      argprompt cmd confirm needsfile noconsole norescan
-      prompt revprompt revunmerged title
-      " "$pfx" "$cur_"
-    return
-    ;;
-  difftool.*.*)
-    local pfx="${cur%.*}." cur_="${cur##*.}"
-    __gitcomp "cmd path" "$pfx" "$cur_"
-    return
-    ;;
-  man.*.*)
-    local pfx="${cur%.*}." cur_="${cur##*.}"
-    __gitcomp "cmd path" "$pfx" "$cur_"
-    return
-    ;;
-  mergetool.*.*)
-    local pfx="${cur%.*}." cur_="${cur##*.}"
-    __gitcomp "cmd path trustExitCode" "$pfx" "$cur_"
-    return
-    ;;
-  pager.*)
-    local pfx="${cur%.*}." cur_="${cur#*.}"
-    __git_compute_all_commands
-    __gitcomp_nl "$__git_all_commands" "$pfx" "$cur_"
-    return
-    ;;
-  remote.*.*)
-    local pfx="${cur%.*}." cur_="${cur##*.}"
-    __gitcomp "
-      url proxy fetch push mirror skipDefaultUpdate
-      receivepack uploadpack tagopt pushurl
-      " "$pfx" "$cur_"
-    return
-    ;;
-  remote.*)
-    local pfx="${cur%.*}." cur_="${cur#*.}"
-    __gitcomp_nl "$(__git_remotes)" "$pfx" "$cur_" "."
-    return
-    ;;
-  url.*.*)
-    local pfx="${cur%.*}." cur_="${cur##*.}"
-    __gitcomp "insteadOf pushInsteadOf" "$pfx" "$cur_"
-    return
-    ;;
-  esac
-  __gitcomp "
-    add.ignoreErrors
-    advice.commitBeforeMerge
-    advice.detachedHead
-    advice.implicitIdentity
-    advice.pushNonFastForward
-    advice.resolveConflict
-    advice.statusHints
-    alias.
-    am.keepcr
-    apply.ignorewhitespace
-    apply.whitespace
-    branch.autosetupmerge
-    branch.autosetuprebase
-    browser.
-    clean.requireForce
-    color.branch
-    color.branch.current
-    color.branch.local
-    color.branch.plain
-    color.branch.remote
-    color.decorate.HEAD
-    color.decorate.branch
-    color.decorate.remoteBranch
-    color.decorate.stash
-    color.decorate.tag
-    color.diff
-    color.diff.commit
-    color.diff.frag
-    color.diff.func
-    color.diff.meta
-    color.diff.new
-    color.diff.old
-    color.diff.plain
-    color.diff.whitespace
-    color.grep
-    color.grep.context
-    color.grep.filename
-    color.grep.function
-    color.grep.linenumber
-    color.grep.match
-    color.grep.selected
-    color.grep.separator
-    color.interactive
-    color.interactive.error
-    color.interactive.header
-    color.interactive.help
-    color.interactive.prompt
-    color.pager
-    color.showbranch
-    color.status
-    color.status.added
-    color.status.changed
-    color.status.header
-    color.status.nobranch
-    color.status.untracked
-    color.status.updated
-    color.ui
-    commit.status
-    commit.template
-    core.abbrev
-    core.askpass
-    core.attributesfile
-    core.autocrlf
-    core.bare
-    core.bigFileThreshold
-    core.compression
-    core.createObject
-    core.deltaBaseCacheLimit
-    core.editor
-    core.eol
-    core.excludesfile
-    core.fileMode
-    core.fsyncobjectfiles
-    core.gitProxy
-    core.ignoreStat
-    core.ignorecase
-    core.logAllRefUpdates
-    core.loosecompression
-    core.notesRef
-    core.packedGitLimit
-    core.packedGitWindowSize
-    core.pager
-    core.preferSymlinkRefs
-    core.preloadindex
-    core.quotepath
-    core.repositoryFormatVersion
-    core.safecrlf
-    core.sharedRepository
-    core.sparseCheckout
-    core.symlinks
-    core.trustctime
-    core.warnAmbiguousRefs
-    core.whitespace
-    core.worktree
-    diff.autorefreshindex
-    diff.external
-    diff.ignoreSubmodules
-    diff.mnemonicprefix
-    diff.noprefix
-    diff.renameLimit
-    diff.renames
-    diff.statGraphWidth
-    diff.submodule
-    diff.suppressBlankEmpty
-    diff.tool
-    diff.wordRegex
-    diff.algorithm
-    difftool.
-    difftool.prompt
-    fetch.recurseSubmodules
-    fetch.unpackLimit
-    format.attach
-    format.cc
-    format.headers
-    format.numbered
-    format.pretty
-    format.signature
-    format.signoff
-    format.subjectprefix
-    format.suffix
-    format.thread
-    format.to
-    gc.
-    gc.aggressiveWindow
-    gc.auto
-    gc.autopacklimit
-    gc.packrefs
-    gc.pruneexpire
-    gc.reflogexpire
-    gc.reflogexpireunreachable
-    gc.rerereresolved
-    gc.rerereunresolved
-    gitcvs.allbinary
-    gitcvs.commitmsgannotation
-    gitcvs.dbTableNamePrefix
-    gitcvs.dbdriver
-    gitcvs.dbname
-    gitcvs.dbpass
-    gitcvs.dbuser
-    gitcvs.enabled
-    gitcvs.logfile
-    gitcvs.usecrlfattr
-    guitool.
-    gui.blamehistoryctx
-    gui.commitmsgwidth
-    gui.copyblamethreshold
-    gui.diffcontext
-    gui.encoding
-    gui.fastcopyblame
-    gui.matchtrackingbranch
-    gui.newbranchtemplate
-    gui.pruneduringfetch
-    gui.spellingdictionary
-    gui.trustmtime
-    help.autocorrect
-    help.browser
-    help.format
-    http.lowSpeedLimit
-    http.lowSpeedTime
-    http.maxRequests
-    http.minSessions
-    http.noEPSV
-    http.postBuffer
-    http.proxy
-    http.sslCAInfo
-    http.sslCAPath
-    http.sslCert
-    http.sslCertPasswordProtected
-    http.sslKey
-    http.sslVerify
-    http.useragent
-    i18n.commitEncoding
-    i18n.logOutputEncoding
-    imap.authMethod
-    imap.folder
-    imap.host
-    imap.pass
-    imap.port
-    imap.preformattedHTML
-    imap.sslverify
-    imap.tunnel
-    imap.user
-    init.templatedir
-    instaweb.browser
-    instaweb.httpd
-    instaweb.local
-    instaweb.modulepath
-    instaweb.port
-    interactive.singlekey
-    log.date
-    log.decorate
-    log.showroot
-    mailmap.file
-    man.
-    man.viewer
-    merge.
-    merge.conflictstyle
-    merge.log
-    merge.renameLimit
-    merge.renormalize
-    merge.stat
-    merge.tool
-    merge.verbosity
-    mergetool.
-    mergetool.keepBackup
-    mergetool.keepTemporaries
-    mergetool.prompt
-    notes.displayRef
-    notes.rewrite.
-    notes.rewrite.amend
-    notes.rewrite.rebase
-    notes.rewriteMode
-    notes.rewriteRef
-    pack.compression
-    pack.deltaCacheLimit
-    pack.deltaCacheSize
-    pack.depth
-    pack.indexVersion
-    pack.packSizeLimit
-    pack.threads
-    pack.window
-    pack.windowMemory
-    pager.
-    pretty.
-    pull.octopus
-    pull.twohead
-    push.default
-    rebase.autosquash
-    rebase.stat
-    receive.autogc
-    receive.denyCurrentBranch
-    receive.denyDeleteCurrent
-    receive.denyDeletes
-    receive.denyNonFastForwards
-    receive.fsckObjects
-    receive.unpackLimit
-    receive.updateserverinfo
-    remote.pushdefault
-    remotes.
-    repack.usedeltabaseoffset
-    rerere.autoupdate
-    rerere.enabled
-    sendemail.
-    sendemail.aliasesfile
-    sendemail.aliasfiletype
-    sendemail.bcc
-    sendemail.cc
-    sendemail.cccmd
-    sendemail.chainreplyto
-    sendemail.confirm
-    sendemail.envelopesender
-    sendemail.from
-    sendemail.identity
-    sendemail.multiedit
-    sendemail.signedoffbycc
-    sendemail.smtpdomain
-    sendemail.smtpencryption
-    sendemail.smtppass
-    sendemail.smtpserver
-    sendemail.smtpserveroption
-    sendemail.smtpserverport
-    sendemail.smtpuser
-    sendemail.suppresscc
-    sendemail.suppressfrom
-    sendemail.thread
-    sendemail.to
-    sendemail.validate
-    showbranch.default
-    status.relativePaths
-    status.showUntrackedFiles
-    status.submodulesummary
-    submodule.
-    tar.umask
-    transfer.unpackLimit
-    url.
-    user.email
-    user.name
-    user.signingkey
-    web.browser
-    branch. remote.
-  "
-}
-
-_git_remote ()
-{
-  local subcommands="add rename remove set-head set-branches set-url show prune update"
-  local subcommand="$(__git_find_on_cmdline "$subcommands")"
-  if [ -z "$subcommand" ]; then
-    __gitcomp "$subcommands"
-    return
-  fi
-
-  case "$subcommand" in
-  rename|remove|set-url|show|prune)
-    __gitcomp_nl "$(__git_remotes)"
-    ;;
-  set-head|set-branches)
-    __git_complete_remote_or_refspec
-    ;;
-  update)
-    local i c='' IFS=$'\n'
-    for i in $(git --git-dir="$(__gitdir)" config --get-regexp "remotes\..*" 2>/dev/null); do
-      i="${i#remotes.}"
-      c="$c ${i/ */}"
-    done
-    __gitcomp "$c"
-    ;;
-  *)
-    ;;
-  esac
-}
-
-_git_replace ()
-{
-  __gitcomp_nl "$(__git_refs)"
-}
-
-_git_reset ()
-{
-  __git_has_doubledash && return
-
-  case "$cur" in
-  --*)
-    __gitcomp "--merge --mixed --hard --soft --patch"
-    return
-    ;;
-  esac
-  __gitcomp_nl "$(__git_refs)"
-}
-
-_git_revert ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "--edit --mainline --no-edit --no-commit --signoff"
-    return
-    ;;
-  esac
-  __gitcomp_nl "$(__git_refs)"
-}
-
-_git_rm ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "--cached --dry-run --ignore-unmatch --quiet"
-    return
-    ;;
-  esac
-
-  __git_complete_index_file "--cached"
-}
-
-_git_shortlog ()
-{
-  __git_has_doubledash && return
-
-  case "$cur" in
-  --*)
-    __gitcomp "
-      $__git_log_common_options
-      $__git_log_shortlog_options
-      --numbered --summary
-      "
-    return
-    ;;
-  esac
-  __git_complete_revlist
-}
-
-_git_show ()
-{
-  __git_has_doubledash && return
-
-  case "$cur" in
-  --pretty=*|--format=*)
-    __gitcomp "$__git_log_pretty_formats $(__git_pretty_aliases)
-      " "" "${cur#*=}"
-    return
-    ;;
-  --diff-algorithm=*)
-    __gitcomp "$__git_diff_algorithms" "" "${cur##--diff-algorithm=}"
-    return
-    ;;
-  --*)
-    __gitcomp "--pretty= --format= --abbrev-commit --oneline
-      $__git_diff_common_options
-      "
-    return
-    ;;
-  esac
-  __git_complete_revlist_file
-}
-
-_git_show_branch ()
-{
-  case "$cur" in
-  --*)
-    __gitcomp "
-      --all --remotes --topo-order --current --more=
-      --list --independent --merge-base --no-name
-      --color --no-color
-      --sha1-name --sparse --topics --reflog
-      "
-    return
-    ;;
-  esac
-  __git_complete_revlist
-}
-
-_git_stash ()
-{
-  local save_opts='--keep-index --no-keep-index --quiet --patch'
-  local subcommands='save list show apply clear drop pop create branch'
-  local subcommand="$(__git_find_on_cmdline "$subcommands")"
-  if [ -z "$subcommand" ]; then
-    case "$cur" in
-    --*)
-      __gitcomp "$save_opts"
-      ;;
-    *)
-      if [ -z "$(__git_find_on_cmdline "$save_opts")" ]; then
-        __gitcomp "$subcommands"
-      fi
-      ;;
-    esac
-  else
-    case "$subcommand,$cur" in
-    save,--*)
-      __gitcomp "$save_opts"
-      ;;
-    apply,--*|pop,--*)
-      __gitcomp "--index --quiet"
-      ;;
-    show,--*|drop,--*|branch,--*)
-      ;;
-    show,*|apply,*|drop,*|pop,*|branch,*)
-      __gitcomp_nl "$(git --git-dir="$(__gitdir)" stash list \
-          | sed -n -e 's/:.*//p')"
-      ;;
-    *)
-      ;;
-    esac
-  fi
-}
-
-_git_submodule ()
-{
-  __git_has_doubledash && return
-
-  local subcommands="add status init deinit update summary foreach sync"
-  if [ -z "$(__git_find_on_cmdline "$subcommands")" ]; then
-    case "$cur" in
-    --*)
-      __gitcomp "--quiet --cached"
-      ;;
-    *)
-      __gitcomp "$subcommands"
-      ;;
-    esac
-    return
-  fi
-}
-
-_git_svn ()
-{
-  local subcommands="
-    init fetch clone rebase dcommit log find-rev
-    set-tree commit-diff info create-ignore propget
-    proplist show-ignore show-externals branch tag blame
-    migrate mkdirs reset gc
-    "
-  local subcommand="$(__git_find_on_cmdline "$subcommands")"
-  if [ -z "$subcommand" ]; then
-    __gitcomp "$subcommands"
-  else
-    local remote_opts="--username= --config-dir= --no-auth-cache"
-    local fc_opts="
-      --follow-parent --authors-file= --repack=
-      --no-metadata --use-svm-props --use-svnsync-props
-      --log-window-size= --no-checkout --quiet
-      --repack-flags --use-log-author --localtime
-      --ignore-paths= --include-paths= $remote_opts
-      "
-    local init_opts="
-      --template= --shared= --trunk= --tags=
-      --branches= --stdlayout --minimize-url
-      --no-metadata --use-svm-props --use-svnsync-props
-      --rewrite-root= --prefix= --use-log-author
-      --add-author-from $remote_opts
-      "
-    local cmt_opts="
-      --edit --rmdir --find-copies-harder --copy-similarity=
-      "
-
-    case "$subcommand,$cur" in
-    fetch,--*)
-      __gitcomp "--revision= --fetch-all $fc_opts"
-      ;;
-    clone,--*)
-      __gitcomp "--revision= $fc_opts $init_opts"
-      ;;
-    init,--*)
-      __gitcomp "$init_opts"
-      ;;
-    dcommit,--*)
-      __gitcomp "
-        --merge --strategy= --verbose --dry-run
-        --fetch-all --no-rebase --commit-url
-        --revision --interactive $cmt_opts $fc_opts
-        "
-      ;;
-    set-tree,--*)
-      __gitcomp "--stdin $cmt_opts $fc_opts"
-      ;;
-    create-ignore,--*|propget,--*|proplist,--*|show-ignore,--*|\
-    show-externals,--*|mkdirs,--*)
-      __gitcomp "--revision="
-      ;;
-    log,--*)
-      __gitcomp "
-        --limit= --revision= --verbose --incremental
-        --oneline --show-commit --non-recursive
-        --authors-file= --color
-        "
-      ;;
-    rebase,--*)
-      __gitcomp "
-        --merge --verbose --strategy= --local
-        --fetch-all --dry-run $fc_opts
-        "
-      ;;
-    commit-diff,--*)
-      __gitcomp "--message= --file= --revision= $cmt_opts"
-      ;;
-    info,--*)
-      __gitcomp "--url"
-      ;;
-    branch,--*)
-      __gitcomp "--dry-run --message --tag"
-      ;;
-    tag,--*)
-      __gitcomp "--dry-run --message"
-      ;;
-    blame,--*)
-      __gitcomp "--git-format"
-      ;;
-    migrate,--*)
-      __gitcomp "
-        --config-dir= --ignore-paths= --minimize
-        --no-auth-cache --username=
-        "
-      ;;
-    reset,--*)
-      __gitcomp "--revision= --parent"
-      ;;
-    *)
-      ;;
-    esac
-  fi
-}
-
-_git_tag ()
-{
-  local i c=1 f=0
-  while [ $c -lt $cword ]; do
-    i="${words[c]}"
-    case "$i" in
-    -d|-v)
-      __gitcomp_nl "$(__git_tags)"
-      return
-      ;;
-    -f)
-      f=1
-      ;;
-    esac
-    ((c++))
-  done
-
-  case "$prev" in
-  -m|-F)
-    ;;
-  -*|tag)
-    if [ $f = 1 ]; then
-      __gitcomp_nl "$(__git_tags)"
-    fi
-    ;;
-  *)
-    __gitcomp_nl "$(__git_refs)"
-    ;;
-  esac
-}
-
-_git_whatchanged ()
-{
-  _git_log
-}
-
-__git_main ()
-{
-  local i c=1 command __git_dir
-
-  while [ $c -lt $cword ]; do
-    i="${words[c]}"
-    case "$i" in
-    --git-dir=*) __git_dir="${i#--git-dir=}" ;;
-    --git-dir)   ((c++)) ; __git_dir="${words[c]}" ;;
-    --bare)      __git_dir="." ;;
-    --help) command="help"; break ;;
-    -c|--work-tree|--namespace) ((c++)) ;;
-    -*) ;;
-    *) command="$i"; break ;;
-    esac
-    ((c++))
-  done
-
-  if [ -z "$command" ]; then
-    case "$cur" in
-    --*)   __gitcomp "
-      --paginate
-      --no-pager
-      --git-dir=
-      --bare
-      --version
-      --exec-path
-      --exec-path=
-      --html-path
-      --man-path
-      --info-path
-      --work-tree=
-      --namespace=
-      --no-replace-objects
-      --help
-      "
-      ;;
-    *)     __git_compute_porcelain_commands
-           __gitcomp "$__git_porcelain_commands $(__git_aliases)" ;;
-    esac
-    return
-  fi
-
-  local completion_func="_git_${command//-/_}"
-  declare -f $completion_func >/dev/null && $completion_func && return
-
-  local expansion=$(__git_aliased_command "$command")
-  if [ -n "$expansion" ]; then
-    completion_func="_git_${expansion//-/_}"
-    declare -f $completion_func >/dev/null && $completion_func
-  fi
-}
-
-__gitk_main ()
-{
-  __git_has_doubledash && return
-
-  local g="$(__gitdir)"
-  local merge=""
-  if [ -f "$g/MERGE_HEAD" ]; then
-    merge="--merge"
-  fi
-  case "$cur" in
-  --*)
-    __gitcomp "
-      $__git_log_common_options
-      $__git_log_gitk_options
-      $merge
-      "
-    return
-    ;;
-  esac
-  __git_complete_revlist
-}
-
-if [[ -n ${ZSH_VERSION-} ]]; then
-  echo "WARNING: this script is deprecated, please see git-completion.zsh" 1>&2
-
-  autoload -U +X compinit && compinit
-
-  __gitcomp ()
-  {
-    emulate -L zsh
-
-    local cur_="${3-$cur}"
-
-    case "$cur_" in
-    --*=)
-      ;;
-    *)
-      local c IFS=$' \t\n'
-      local -a array
-      for c in ${=1}; do
-        c="$c${4-}"
-        case $c in
-        --*=*|*.) ;;
-        *) c="$c " ;;
-        esac
-        array[$#array+1]="$c"
-      done
-      compset -P '*[=:]'
-      compadd -Q -S '' -p "${2-}" -a -- array && _ret=0
-      ;;
-    esac
-  }
-
-  __gitcomp_nl ()
-  {
-    emulate -L zsh
-
-    local IFS=$'\n'
-    compset -P '*[=:]'
-    compadd -Q -S "${4- }" -p "${2-}" -- ${=1} && _ret=0
-  }
-
-  __gitcomp_file ()
-  {
-    emulate -L zsh
-
-    local IFS=$'\n'
-    compset -P '*[=:]'
-    compadd -Q -p "${2-}" -f -- ${=1} && _ret=0
-  }
-
-  _git ()
-  {
-    local _ret=1 cur cword prev
-    cur=${words[CURRENT]}
-    prev=${words[CURRENT-1]}
-    let cword=CURRENT-1
-    emulate ksh -c __${service}_main
-    let _ret && _default && _ret=0
-    return _ret
-  }
-
-  compdef _git git gitk
-  return
-fi
-
-__git_func_wrap ()
-{
-  local cur words cword prev
-  _get_comp_words_by_ref -n =: cur words cword prev
-  $1
-}
-
-# Setup completion for certain functions defined above by setting common
-# variables and workarounds.
-# This is NOT a public function; use at your own risk.
-__git_complete ()
-{
-  local wrapper="__git_wrap${2}"
-  eval "$wrapper () { __git_func_wrap $2 ; }"
-  complete -o bashdefault -o default -o nospace -F $wrapper $1 2>/dev/null \
-    || complete -o default -o nospace -F $wrapper $1
-}
-
-# wrapper for backwards compatibility
-_git ()
-{
-  __git_wrap__git_main
-}
-
-# wrapper for backwards compatibility
-_gitk ()
-{
-  __git_wrap__gitk_main
-}
-
-__git_complete git __git_main
-__git_complete gitk __gitk_main
-
-# The following are necessary only for Cygwin, and only are needed
-# when the user has tab-completed the executable name and consequently
-# included the '.exe' suffix.
-#
-if [ Cygwin = "$(uname -o 2>/dev/null)" ]; then
-__git_complete git.exe __git_main
-fi
diff --git a/paddle/scripts/docker/root/.scripts/git-prompt.sh b/paddle/scripts/docker/root/.scripts/git-prompt.sh
deleted file mode 100755
index 576f4ec14c94a24ebffa9e2620acf881e6b5ddaa..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/root/.scripts/git-prompt.sh
+++ /dev/null
@@ -1,445 +0,0 @@
-# bash/zsh git prompt support
-#
-# Copyright (C) 2006,2007 Shawn O. Pearce <spearce@spearce.org>
-# Distributed under the GNU General Public License, version 2.0.
-#
-# This script allows you to see repository status in your prompt.
-#
-# To enable:
-#
-#    1) Copy this file to somewhere (e.g. ~/.git-prompt.sh).
-#    2) Add the following line to your .bashrc/.zshrc:
-#        source ~/.git-prompt.sh
-#    3a) Change your PS1 to call __git_ps1 as
-#        command-substitution:
-#        Bash: PS1='[\u@\h \W$(__git_ps1 " (%s)")]\$ '
-#        ZSH:  setopt PROMPT_SUBST ; PS1='[%n@%m %c$(__git_ps1 " (%s)")]\$ '
-#        the optional argument will be used as format string.
-#    3b) Alternatively, for a slightly faster prompt, __git_ps1 can
-#        be used for PROMPT_COMMAND in Bash or for precmd() in Zsh
-#        with two parameters, <pre> and <post>, which are strings
-#        you would put in $PS1 before and after the status string
-#        generated by the git-prompt machinery.  e.g.
-#        Bash: PROMPT_COMMAND='__git_ps1 "\u@\h:\w" "\\\$ "'
-#          will show username, at-sign, host, colon, cwd, then
-#          various status string, followed by dollar and SP, as
-#          your prompt.
-#        ZSH:  precmd () { __git_ps1 "%n" ":%~$ " "|%s" }
-#          will show username, pipe, then various status string,
-#          followed by colon, cwd, dollar and SP, as your prompt.
-#        Optionally, you can supply a third argument with a printf
-#        format string to finetune the output of the branch status
-#
-# The repository status will be displayed only if you are currently in a
-# git repository. The %s token is the placeholder for the shown status.
-#
-# The prompt status always includes the current branch name.
-#
-# In addition, if you set GIT_PS1_SHOWDIRTYSTATE to a nonempty value,
-# unstaged (*) and staged (+) changes will be shown next to the branch
-# name.  You can configure this per-repository with the
-# bash.showDirtyState variable, which defaults to true once
-# GIT_PS1_SHOWDIRTYSTATE is enabled.
-#
-# You can also see if currently something is stashed, by setting
-# GIT_PS1_SHOWSTASHSTATE to a nonempty value. If something is stashed,
-# then a '$' will be shown next to the branch name.
-#
-# If you would like to see if there're untracked files, then you can set
-# GIT_PS1_SHOWUNTRACKEDFILES to a nonempty value. If there're untracked
-# files, then a '%' will be shown next to the branch name.  You can
-# configure this per-repository with the bash.showUntrackedFiles
-# variable, which defaults to true once GIT_PS1_SHOWUNTRACKEDFILES is
-# enabled.
-#
-# If you would like to see the difference between HEAD and its upstream,
-# set GIT_PS1_SHOWUPSTREAM="auto".  A "<" indicates you are behind, ">"
-# indicates you are ahead, "<>" indicates you have diverged and "="
-# indicates that there is no difference. You can further control
-# behaviour by setting GIT_PS1_SHOWUPSTREAM to a space-separated list
-# of values:
-#
-#     verbose       show number of commits ahead/behind (+/-) upstream
-#     legacy        don't use the '--count' option available in recent
-#                   versions of git-rev-list
-#     git           always compare HEAD to @{upstream}
-#     svn           always compare HEAD to your SVN upstream
-#
-# By default, __git_ps1 will compare HEAD to your SVN upstream if it can
-# find one, or @{upstream} otherwise.  Once you have set
-# GIT_PS1_SHOWUPSTREAM, you can override it on a per-repository basis by
-# setting the bash.showUpstream config variable.
-#
-# If you would like to see more information about the identity of
-# commits checked out as a detached HEAD, set GIT_PS1_DESCRIBE_STYLE
-# to one of these values:
-#
-#     contains      relative to newer annotated tag (v1.6.3.2~35)
-#     branch        relative to newer tag or branch (master~4)
-#     describe      relative to older annotated tag (v1.6.3.1-13-gdd42c2f)
-#     default       exactly matching tag
-#
-# If you would like a colored hint about the current dirty state, set
-# GIT_PS1_SHOWCOLORHINTS to a nonempty value. The colors are based on
-# the colored output of "git status -sb" and are available only when
-# using __git_ps1 for PROMPT_COMMAND or precmd.
-
-# stores the divergence from upstream in $p
-# used by GIT_PS1_SHOWUPSTREAM
-__git_ps1_show_upstream ()
-{
-  local key value
-  local svn_remote svn_url_pattern count n
-  local upstream=git legacy="" verbose=""
-
-  svn_remote=()
-  # get some config options from git-config
-  local output="$(git config -z --get-regexp '^(svn-remote\..*\.url|bash\.showupstream)$' 2>/dev/null | tr '\0\n' '\n ')"
-  while read -r key value; do
-    case "$key" in
-    bash.showupstream)
-      GIT_PS1_SHOWUPSTREAM="$value"
-      if [[ -z "${GIT_PS1_SHOWUPSTREAM}" ]]; then
-        p=""
-        return
-      fi
-      ;;
-    svn-remote.*.url)
-      svn_remote[$((${#svn_remote[@]} + 1))]="$value"
-      svn_url_pattern+="\\|$value"
-      upstream=svn+git # default upstream is SVN if available, else git
-      ;;
-    esac
-  done <<< "$output"
-
-  # parse configuration values
-  for option in ${GIT_PS1_SHOWUPSTREAM}; do
-    case "$option" in
-    git|svn) upstream="$option" ;;
-    verbose) verbose=1 ;;
-    legacy)  legacy=1  ;;
-    esac
-  done
-
-  # Find our upstream
-  case "$upstream" in
-  git)    upstream="@{upstream}" ;;
-  svn*)
-    # get the upstream from the "git-svn-id: ..." in a commit message
-    # (git-svn uses essentially the same procedure internally)
-    local -a svn_upstream
-    svn_upstream=($(git log --first-parent -1 \
-          --grep="^git-svn-id: \(${svn_url_pattern#??}\)" 2>/dev/null))
-    if [[ 0 -ne ${#svn_upstream[@]} ]]; then
-      svn_upstream=${svn_upstream[${#svn_upstream[@]} - 2]}
-      svn_upstream=${svn_upstream%@*}
-      local n_stop="${#svn_remote[@]}"
-      for ((n=1; n <= n_stop; n++)); do
-        svn_upstream=${svn_upstream#${svn_remote[$n]}}
-      done
-
-      if [[ -z "$svn_upstream" ]]; then
-        # default branch name for checkouts with no layout:
-        upstream=${GIT_SVN_ID:-git-svn}
-      else
-        upstream=${svn_upstream#/}
-      fi
-    elif [[ "svn+git" = "$upstream" ]]; then
-      upstream="@{upstream}"
-    fi
-    ;;
-  esac
-
-  # Find how many commits we are ahead/behind our upstream
-  if [[ -z "$legacy" ]]; then
-    count="$(git rev-list --count --left-right \
-        "$upstream"...HEAD 2>/dev/null)"
-  else
-    # produce equivalent output to --count for older versions of git
-    local commits
-    if commits="$(git rev-list --left-right "$upstream"...HEAD 2>/dev/null)"
-    then
-      local commit behind=0 ahead=0
-      for commit in $commits
-      do
-        case "$commit" in
-        "<"*) ((behind++)) ;;
-        *)    ((ahead++))  ;;
-        esac
-      done
-      count="$behind  $ahead"
-    else
-      count=""
-    fi
-  fi
-
-  # calculate the result
-  if [[ -z "$verbose" ]]; then
-    case "$count" in
-    "") # no upstream
-      p="" ;;
-    "0  0") # equal to upstream
-      p="=" ;;
-    "0  "*) # ahead of upstream
-      p=">" ;;
-    *"  0") # behind upstream
-      p="<" ;;
-    *)      # diverged from upstream
-      p="<>" ;;
-    esac
-  else
-    case "$count" in
-    "") # no upstream
-      p="" ;;
-    "0  0") # equal to upstream
-      p=" u=" ;;
-    "0  "*) # ahead of upstream
-      p=" u+${count#0 }" ;;
-    *"  0") # behind upstream
-      p=" u-${count%  0}" ;;
-    *)      # diverged from upstream
-      p=" u+${count#* }-${count%  *}" ;;
-    esac
-  fi
-
-}
-
-# Helper function that is meant to be called from __git_ps1.  It
-# injects color codes into the appropriate gitstring variables used
-# to build a gitstring.
-__git_ps1_colorize_gitstring ()
-{
-  if [[ -n ${ZSH_VERSION-} ]]; then
-    local c_red='%F{red}'
-    local c_green='%F{green}'
-    local c_lblue='%F{blue}'
-    local c_clear='%f'
-  else
-    # Using \[ and \] around colors is necessary to prevent
-    # issues with command line editing/browsing/completion!
-    local c_red='\[\e[31m\]'
-    local c_green='\[\e[32m\]'
-    local c_lblue='\[\e[1;34m\]'
-    local c_clear='\[\e[0m\]'
-  fi
-  local bad_color=$c_red
-  local ok_color=$c_green
-  local flags_color="$c_lblue"
-
-  local branch_color=""
-  if [ $detached = no ]; then
-    branch_color="$ok_color"
-  else
-    branch_color="$bad_color"
-  fi
-  c="$branch_color$c"
-
-  z="$c_clear$z"
-  if [ "$w" = "*" ]; then
-    w="$bad_color$w"
-  fi
-  if [ -n "$i" ]; then
-    i="$ok_color$i"
-  fi
-  if [ -n "$s" ]; then
-    s="$flags_color$s"
-  fi
-  if [ -n "$u" ]; then
-    u="$bad_color$u"
-  fi
-  r="$c_clear$r"
-}
-
-# __git_ps1 accepts 0 or 1 arguments (i.e., format string)
-# when called from PS1 using command substitution
-# in this mode it prints text to add to bash PS1 prompt (includes branch name)
-#
-# __git_ps1 requires 2 or 3 arguments when called from PROMPT_COMMAND (pc)
-# in that case it _sets_ PS1. The arguments are parts of a PS1 string.
-# when two arguments are given, the first is prepended and the second appended
-# to the state string when assigned to PS1.
-# The optional third parameter will be used as printf format string to further
-# customize the output of the git-status string.
-# In this mode you can request colored hints using GIT_PS1_SHOWCOLORHINTS=true
-__git_ps1 ()
-{
-  local pcmode=no
-  local detached=no
-  local ps1pc_start='\u@\h:\w '
-  local ps1pc_end='\$ '
-  local printf_format=' (%s)'
-
-  case "$#" in
-    2|3)  pcmode=yes
-      ps1pc_start="$1"
-      ps1pc_end="$2"
-      printf_format="${3:-$printf_format}"
-    ;;
-    0|1)  printf_format="${1:-$printf_format}"
-    ;;
-    *)  return
-    ;;
-  esac
-
-  local repo_info rev_parse_exit_code
-  repo_info="$(git rev-parse --git-dir --is-inside-git-dir \
-    --is-bare-repository --is-inside-work-tree \
-    --short HEAD 2>/dev/null)"
-  rev_parse_exit_code="$?"
-
-  if [ -z "$repo_info" ]; then
-    if [ $pcmode = yes ]; then
-      #In PC mode PS1 always needs to be set
-      PS1="$ps1pc_start$ps1pc_end"
-    fi
-    return
-  fi
-
-  local short_sha
-  if [ "$rev_parse_exit_code" = "0" ]; then
-    short_sha="${repo_info##*$'\n'}"
-    repo_info="${repo_info%$'\n'*}"
-  fi
-  local inside_worktree="${repo_info##*$'\n'}"
-  repo_info="${repo_info%$'\n'*}"
-  local bare_repo="${repo_info##*$'\n'}"
-  repo_info="${repo_info%$'\n'*}"
-  local inside_gitdir="${repo_info##*$'\n'}"
-  local g="${repo_info%$'\n'*}"
-
-  local r=""
-  local b=""
-  local step=""
-  local total=""
-  if [ -d "$g/rebase-merge" ]; then
-    read b 2>/dev/null <"$g/rebase-merge/head-name"
-    read step 2>/dev/null <"$g/rebase-merge/msgnum"
-    read total 2>/dev/null <"$g/rebase-merge/end"
-    if [ -f "$g/rebase-merge/interactive" ]; then
-      r="|REBASE-i"
-    else
-      r="|REBASE-m"
-    fi
-  else
-    if [ -d "$g/rebase-apply" ]; then
-      read step 2>/dev/null <"$g/rebase-apply/next"
-      read total 2>/dev/null <"$g/rebase-apply/last"
-      if [ -f "$g/rebase-apply/rebasing" ]; then
-        read b 2>/dev/null <"$g/rebase-apply/head-name"
-        r="|REBASE"
-      elif [ -f "$g/rebase-apply/applying" ]; then
-        r="|AM"
-      else
-        r="|AM/REBASE"
-      fi
-    elif [ -f "$g/MERGE_HEAD" ]; then
-      r="|MERGING"
-    elif [ -f "$g/CHERRY_PICK_HEAD" ]; then
-      r="|CHERRY-PICKING"
-    elif [ -f "$g/REVERT_HEAD" ]; then
-      r="|REVERTING"
-    elif [ -f "$g/BISECT_LOG" ]; then
-      r="|BISECTING"
-    fi
-
-    if [ -n "$b" ]; then
-      :
-    elif [ -h "$g/HEAD" ]; then
-      # symlink symbolic ref
-      b="$(git symbolic-ref HEAD 2>/dev/null)"
-    else
-      local head=""
-      if ! read head 2>/dev/null <"$g/HEAD"; then
-        if [ $pcmode = yes ]; then
-          PS1="$ps1pc_start$ps1pc_end"
-        fi
-        return
-      fi
-      # is it a symbolic ref?
-      b="${head#ref: }"
-      if [ "$head" = "$b" ]; then
-        detached=yes
-        b="$(
-        case "${GIT_PS1_DESCRIBE_STYLE-}" in
-        (contains)
-          git describe --contains HEAD ;;
-        (branch)
-          git describe --contains --all HEAD ;;
-        (describe)
-          git describe HEAD ;;
-        (* | default)
-          git describe --tags --exact-match HEAD ;;
-        esac 2>/dev/null)" ||
-
-        b="$short_sha..."
-        b="($b)"
-      fi
-    fi
-  fi
-
-  if [ -n "$step" ] && [ -n "$total" ]; then
-    r="$r $step/$total"
-  fi
-
-  local w=""
-  local i=""
-  local s=""
-  local u=""
-  local c=""
-  local p=""
-
-  if [ "true" = "$inside_gitdir" ]; then
-    if [ "true" = "$bare_repo" ]; then
-      c="BARE:"
-    else
-      b="GIT_DIR!"
-    fi
-  elif [ "true" = "$inside_worktree" ]; then
-    if [ -n "${GIT_PS1_SHOWDIRTYSTATE-}" ] &&
-       [ "$(git config --bool bash.showDirtyState)" != "false" ]
-    then
-      git diff --no-ext-diff --quiet --exit-code || w="*"
-      if [ -n "$short_sha" ]; then
-        git diff-index --cached --quiet HEAD -- || i="+"
-      else
-        i="#"
-      fi
-    fi
-    if [ -n "${GIT_PS1_SHOWSTASHSTATE-}" ] &&
-       [ -r "$g/refs/stash" ]; then
-      s="$"
-    fi
-
-    if [ -n "${GIT_PS1_SHOWUNTRACKEDFILES-}" ] &&
-       [ "$(git config --bool bash.showUntrackedFiles)" != "false" ] &&
-       git ls-files --others --exclude-standard --error-unmatch -- '*' >/dev/null 2>/dev/null
-    then
-      u="%${ZSH_VERSION+%}"
-    fi
-
-    if [ -n "${GIT_PS1_SHOWUPSTREAM-}" ]; then
-      __git_ps1_show_upstream
-    fi
-  fi
-
-  local z="${GIT_PS1_STATESEPARATOR-" "}"
-
-  # NO color option unless in PROMPT_COMMAND mode
-  if [ $pcmode = yes ] && [ -n "${GIT_PS1_SHOWCOLORHINTS-}" ]; then
-    __git_ps1_colorize_gitstring
-  fi
-
-  local f="$w$i$s$u"
-  local gitstring="$c${b##refs/heads/}${f:+$z$f}$r$p"
-
-  if [ $pcmode = yes ]; then
-    if [[ -n ${ZSH_VERSION-} ]]; then
-      gitstring=$(printf -- "$printf_format" "$gitstring")
-    else
-      printf -v gitstring -- "$printf_format" "$gitstring"
-    fi
-    PS1="$ps1pc_start$gitstring$ps1pc_end"
-  else
-    printf -- "$printf_format" "$gitstring"
-  fi
-}
diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh
deleted file mode 100644
index b629a251cc5ac2ceb3d39a11cd47f169d0ef0409..0000000000000000000000000000000000000000
--- a/paddle/scripts/fast_install.sh
+++ /dev/null
@@ -1,1071 +0,0 @@
-#!/bin/bash
-
-## purple to echo
-function purple(){
-    echo -e "\033[35m$1\033[0m"
-}
-
-
-## green to echo
-function green(){
-    echo -e "\033[32m$1\033[0m"
-}
-
-## Error to warning with blink
-function bred(){
-    echo -e "\033[31m\033[01m\033[05m$1\033[0m"
-}
-
-## Error to warning with blink
-function byellow(){
-    echo -e "\033[33m\033[01m\033[05m$1\033[0m"
-}
-
-
-## Error
-function red(){
-    echo -e "\033[31m\033[01m$1\033[0m"
-}
-
-## warning
-function yellow(){
-    echo -e "\033[33m\033[01m$1\033[0m"
-}
-
-path='http://paddlepaddle.org/download?url='
-release_version=`pip show paddlepaddle|grep Version|awk '{print $NF}'`
-python_list=(
-"27"
-"35"
-"36"
-"37"
-)
-
-
-function use_cpu(){
-   while true
-    do
-     read -p "是否安装CPU版本的PaddlePaddle？(y/n)" cpu_option
-     cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'`
-     if [[ "$cpu_option" == "" || "$cpu_option" == "n" ]];then
-        echo "退出安装中..."
-        exit
-     else
-        GPU='cpu'
-        echo "将为您安装CPU版本的PaddlePaddle"
-        break
-     fi
-    done
-}
-
-function checkLinuxCUDNN(){
-   echo
-   read -n1 -p "请按回车键进行下一步..."
-   echo
-   while true
-   do
-       version_file='/usr/local/cuda/include/cudnn.h'
-       if [ -f "$version_file" ];then
-          CUDNN=`cat $version_file | grep CUDNN_MAJOR |awk 'NR==1{print $NF}'`
-       fi
-       if [ "$CUDNN" == "" ];then
-           version_file=`sudo find /usr -name "cudnn.h"|head -1`
-           if [ "$version_file" != "" ];then
-               CUDNN=`cat ${version_file} | grep CUDNN_MAJOR -A 2|awk 'NR==1{print $NF}'`
-           else
-               echo "检测结果：未在常规路径下找到cuda/include/cudnn.h文件"
-               while true
-               do
-                  read -p "请核实cudnn.h位置，并在此输入路径（请注意，路径需要输入到“cudnn.h”这一级）:" cudnn_version
-                  echo
-                  if [ "$cudnn_version" == "" ] || [ ! -f "$cudnn_version" ];then
-                        read -p "仍未找到cuDNN，输入y将安装CPU版本的PaddlePaddle，输入n可重新录入cuDNN路径，请输入（y/n）" cpu_option
-                        echo
-                        cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'`
-                        if [ "$cpu_option" == "y" -o "$cpu_option" == "" ];then
-                            GPU='cpu'
-                            break
-                        else
-                            echo "请重新输入"
-                            echo
-                        fi
-                  else
-                     CUDNN=`cat $cudnn_version | grep CUDNN_MAJOR |awk 'NR==1{print $NF}'`
-                     echo "检测结果：找到cudnn.h"
-                     break
-                  fi
-                 done
-             if [ "$GPU" == "cpu" ];then
-                break
-             fi
-           fi
-       fi
-       if [ "$CUDA" == "9" -a "$CUDNN" != "7" ];then
-           echo
-           echo "目前CUDA9下仅支持cuDNN7，暂不支持您机器上的CUDNN${CUDNN}。您可以访问NVIDIA官网下载适合版本的CUDNN，请ctrl+c退出安装进程。按回车键将为您安装CPU版本的PaddlePaddle"
-           echo
-          use_cpu()
-          if [ "$GPU"=="cpu" ];then
-             break
-          fi
-       fi
-
-       if [ "$CUDNN" == 5 ] || [ "$CUDNN" == 7 ];then
-          echo
-          echo "您的CUDNN版本是: CUDNN$CUDNN"
-          break
-       else
-          echo
-          read -n1 -p "目前支持的CUDNN版本为5和7,暂不支持您机器上的CUDNN${CUDNN}，将为您安装CPU版本的PaddlePaddle,请按回车键开始安装"
-          echo
-          use_cpu
-          if [ "$GPU"=="cpu" ];then
-             break
-          fi
-       fi
-   done
-}
-
-function checkLinuxCUDA(){
-   while true
-   do
-       CUDA=`echo ${CUDA_VERSION}|awk -F "[ .]" '{print $1}'`
-       if [ "$CUDA" == "" ];then
-         if [ -f "/usr/local/cuda/version.txt" ];then
-           CUDA=`cat /usr/local/cuda/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
-           tmp_cuda=$CUDA
-         fi
-         if [ -f "/usr/local/cuda8/version.txt" ];then
-           CUDA=`cat /usr/local/cuda8/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
-           tmp_cuda8=$CUDA
-         fi
-         if [ -f "/usr/local/cuda9/version.txt" ];then
-           CUDA=`cat /usr/local/cuda9/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
-           tmp_cuda9=$CUDA
-         fi
-         if [ -f "/usr/local/cuda10/version.txt" ];then
-           CUDA=`cat /usr/local/cuda10/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
-           tmp_cuda10=$CUDA
-         fi
-       fi
-
-       if [ "$tmp_cuda" != "" ];then
-         echo "检测结果：找到CUDA $tmp_cuda"
-       fi
-       if [ "$tmp_cudai8" != "" ];then
-         echo "检测结果：找到CUDA $tmp_cuda8"
-       fi
-       if [ "$tmp_cuda9" != "" ];then
-         echo "检测结果：找到CUDA $tmp_cuda9"
-       fi
-       if [ "$tmp_cuda10" != "" ];then
-         echo "检测结果：找到CUDA $tmp_cuda10"
-       fi
-
-       if [ "$CUDA" == "" ];then
-            echo "检测结果：没有在常规路径下找到cuda/version.txt文件"
-            while true
-            do
-                read -p "请输入cuda/version.txt的路径:" cuda_version
-                if [ "$cuda_version" == "" || ! -f "$cuda_version" ];then
-                    read -p "仍未找到CUDA，输入y将安装CPU版本的PaddlePaddle，输入n可重新录入CUDA路径，请输入（y/n）" cpu_option
-                    cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'`
-                    if [ "$cpu_option" == "y" || "$cpu_option" == "" ];then
-                        GPU='cpu'
-                        break
-                    else
-                        echo "重新输入..."
-                    fi
-                else
-                    CUDA=`cat $cuda_version | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
-                    if [ "$CUDA" == "" ];then
-                        echo "未能在version.txt中找到CUDA相关信息"
-                    else
-                        break
-                    fi
-                fi
-            done
-            if [ "$GPU" == "cpu" ];then
-                break
-            fi
-       fi
-
-       if [ "$CUDA" == "8" ] || [ "$CUDA" == "9" ] || [ "$CUDA" == "10" ];then
-          echo "您的CUDA版本是${CUDA}"
-          break
-       else
-          echo "目前支持CUDA8/9/10，暂不支持您的CUDA${CUDA}，将为您安装CPU版本的PaddlePaddle"
-          echo
-          use_cpu
-       fi
-
-       if [ "$GPU" == "cpu" ];then
-          break
-       fi
-   done
-}
-
-function checkLinuxMathLibrary(){
-  while true
-    do
-      if [ "$GPU" == "gpu" ];then
-        math='mkl'
-        echo "检测到您的机器上配备GPU，推荐您使用mkl数学库"
-        break
-      else
-        read -p "请输入您希望使用的数学库：
-            1：openblas 一个高性能多核 BLAS 库
-            2：mkl（推荐） 英特尔数学核心函数库
-            => 请输入数字1或2。如输入其他字符或直接回车，将会默认选择【 2. mkl 】 。请在这里输入并回车：" math
-          if [ "$math" == "" ];then
-            math="mkl"
-            echo "您选择了数字【2】"
-            break
-          fi
-          if [ "$math" == "1" ];then
-            math=openblas
-            echo "您选择了数字【1】"
-            break
-          elif [ "$math" == "2" ];then
-            math=mkl
-            echo "您选择了数字【2】"
-            break
-          fi
-          echo "输入错误，请再次输入"
-      fi
-    done
-}
-
-function checkLinuxPaddleVersion(){
-  read -n1 -p "请按回车键继续..."
-  while true
-    do
-      read -p "
-               1. 开发版：对应Github上develop分支，如您需要开发、或希望使用PaddlePaddle最新功能，请选用此版本
-               2. 稳定版（推荐）：如您无特殊开发需求，建议使用此版本，目前最新的版本号为 ${release_version}
-                => 请输入数字1或2。如输入其他字符或直接回车，将会默认选择【 2. 稳定版 】 。请在这里输入并回车：" paddle_version
-        if [ "$paddle_version" == "" ];then
-          paddle_version="2"
-          echo "您选择了数字【2】，为您安装release-${release_version}"
-          break
-        fi
-        if [ "$paddle_version" == "1" ];then
-          echo "您选择了数字【1】，将为您安装开发版"
-          break
-        elif [ "$paddle_version" == "2" ];then
-          echo "您选择了数字【2】，为您安装release-${release_version}"
-          break
-        fi
-        echo "输入错误，请再次输入"
-    done
-}
-
-function checkPythonVirtualenv(){
-  while true
-    do
-      read -p "
-                是否使用python  virtualenv虚环境安装(y/n)": check_virtualenv
-    case $check_virtualenv in
-      y)
-        echo "为您使用python虚环境安装"
-        ;;
-      n)
-        break
-        ;;
-      *)
-        continue
-        ;;
-    esac
-
-    virtualenv_path=`which virtualenv 2>&1`
-    if [ "$virtualenv_path" == "" ];then
-      $python_path -m pip install virtualenv
-      if [ "$?" != '0' ];then
-        echo "安装虚拟环境失败,请检查本地环境"
-      fi
-    fi
-
-    while true
-      do
-        read -p "请输入虚拟环境名字：" virtualenv_name
-        if [ "$virtualenv_name" == "" ];then
-          echo "不能为空"
-          continue
-        fi
-        break
-    done
-
-    virtualenv -p $python_path ${virtualenv_name}
-    if [ "$?" != 0 ];then
-      echo "创建虚环境失败,请检查环境"
-      exit 2
-    fi
-    cd ${virtualenv_name}
-    source ./bin/activate
-
-    if [ "$?" == 0 ];then
-      use_virtualenv=
-      python_path=`which python`
-      break
-    else
-      echo "创建虚环境失败,请检查环境"
-      exit 2
-    fi
-  done
-}
-
-function checkLinuxPython(){
-  python_path=`which python 2>/dev/null`
-  while true
-    do
-  if [ "$python_path" == '' ];then
-    while true
-      do
-        read -p "没有找到默认的python版本,请输入要安装的python路径:"  python_path
-        python_path=`$python_path -V`
-        if [ "$python_path" != "" ];then
-          break
-        else
-          echo "输入路径有误,未找到pyrhon"
-        fi
-    done
-  fi
-
-  python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
-  pip_version=`$python_path -m pip -V|awk -F '[ .]' '{print $2}'`
-  while true
-    do
-      read -p "
-                找到python版本$python_version,使用请输入y,选择其他版本请输n(y/n):"  check_python
-      case $check_python in
-        n)
-          read -p "请指定您的python路径:" new_python_path
-          python_V=`$new_python_path -V 2>/dev/null`
-          if [ "$python_V" != "" ];then
-            python_path=$new_python_path
-            python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
-            pip_version=`python -m pip -V|awk -F '[ .]' '{print $2}'`
-            echo "您的python版本为${python_version}"
-            break
-          else
-            echo 输入有误,未找到python路径
-          fi
-          ;;
-        y)
-          break
-          ;;
-        *)
-          echo "输入有误，请重新输入."
-          continue
-          ;;
-      esac
-  done
-
-  if [ "$pip_version" -lt 9 ];then
-    echo "您的pip版本小于9.0.1  请升级pip (pip install --upgrade pip)"
-    exit 0
-  fi
-
-
-  if [ "$python_version" == "27" ];then
-     python_version_all=`$python_path -V 2>&1|awk -F '[ .]' '{print $4}'`
-     if [[ $python_version_all -le 15 ]];then
-        echo "Python2版本小于2.7.15,请更新Python2版本或使用Python3"
-        exit 0
-      fi
-     uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
-     if [[ "$uncode" == "" ]];then
-        uncode=
-     else
-        uncode=u
-     fi
-  fi
-
-  version_list=`echo "${python_list[@]}" | grep "$python_version" `
-  if [ "$version_list" == "" ];then
-    echo "找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入， 或使用ctrl + c退出 "
-  else
-    break
-  fi
-  done
-}
-
-
-function PipLinuxInstall(){
-  wheel_cpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-${GPU}-${math}/paddlepaddle-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
-  wheel_gpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${math}/paddlepaddle_gpu-${release_version}.post${CUDA}${CUDNN}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
-  wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
-  wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
-
-
-  if [[ "$paddle_version" == "2" ]];then
-    if [[ "$GPU" == "gpu" ]];then
-          rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
-          wget -q $wheel_gpu_release
-          if [ "$?" == "0" ];then
-            $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
-            if [ "$?" == 0 ];then
-              echo 安装成功
-              exit 0
-            else
-              echo 安装失败
-              exit 1
-            fi
-          else
-            echo paddlepaddle whl包下载失败
-            exit 1
-          fi
-    else
-        rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
-        wget -q $wheel_cpu_release
-        if [ "$?" == "0" ];then
-          $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
-          if [ "$?" == 0 ];then
-              echo 安装成功
-              exit 0
-            else
-              echo 安装失败
-              exit 1
-            fi
-        else
-          echo paddlepaddle whl包下载失败
-          exit 1
-        fi
-    fi
-  fi
-  if [[ "$GPU" == "gpu" ]];then
-        rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'`
-        wget -q $wheel_gpu_develop
-        if [ "$?" == "0" ];then
-          $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
-          if [ "$?" == 0 ];then
-              echo 安装成功
-              exit 0
-            else
-              echo 安装失败
-              exit 1
-            fi
-        else
-          echo paddlepaddle whl包下载失败
-          exit 1
-        fi
-  else
-        rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'`
-        wget -q $wheel_cpu_develop
-        if [ "$?" == "0" ];then
-          $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
-          if [ "$?" == 0 ];then
-              echo 安装成功
-              exit 0
-            else
-              echo 安装失败
-              exit 1
-            fi
-        else
-          echo paddlepaddle whl包下载失败
-          exit 1
-        fi
-    fi
-}
-
-
-function checkLinuxGPU(){
-  read -n1 -p "即将检测您的机器是否含GPU，请按回车键继续..."
-  echo
-  which nvidia-smi >/dev/null 2>&1
-  if [ "$?" != "0" ];then
-    GPU='cpu'
-    echo "未在机器上找到GPU，或PaddlePaddle暂不支持此型号的GPU"
-  else
-    GPU='gpu'
-    echo "已在您的机器上找到GPU，即将确认CUDA和CUDNN版本..."
-    echo
-  fi
-  if [ "$GPU" == 'gpu' ];then
-    checkLinuxCUDA
-    checkLinuxCUDNN
-  fi
-}
-
-function linux(){
-gpu_list=(
-"GeForce 410M"
-"GeForce 610M"
-"GeForce 705M"
-"GeForce 710M"
-"GeForce 800M"
-"GeForce 820M"
-"GeForce 830M"
-"GeForce 840M"
-"GeForce 910M"
-"GeForce 920M"
-"GeForce 930M"
-"GeForce 940M"
-"GeForce GT 415M"
-"GeForce GT 420M"
-"GeForce GT 430"
-"GeForce GT 435M"
-"GeForce GT 440"
-"GeForce GT 445M"
-"GeForce GT 520"
-"GeForce GT 520M"
-"GeForce GT 520MX"
-"GeForce GT 525M"
-"GeForce GT 540M"
-"GeForce GT 550M"
-"GeForce GT 555M"
-"GeForce GT 610"
-"GeForce GT 620"
-"GeForce GT 620M"
-"GeForce GT 625M"
-"GeForce GT 630"
-"GeForce GT 630M"
-"GeForce GT 635M"
-"GeForce GT 640"
-"GeForce GT 640 (GDDR5)"
-"GeForce GT 640M"
-"GeForce GT 640M LE"
-"GeForce GT 645M"
-"GeForce GT 650M"
-"GeForce GT 705"
-"GeForce GT 720"
-"GeForce GT 720M"
-"GeForce GT 730"
-"GeForce GT 730M"
-"GeForce GT 735M"
-"GeForce GT 740"
-"GeForce GT 740M"
-"GeForce GT 745M"
-"GeForce GT 750M"
-"GeForce GTS 450"
-"GeForce GTX 1050"
-"GeForce GTX 1060"
-"GeForce GTX 1070"
-"GeForce GTX 1080"
-"GeForce GTX 1080 Ti"
-"GeForce GTX 460"
-"GeForce GTX 460M"
-"GeForce GTX 465"
-"GeForce GTX 470"
-"GeForce GTX 470M"
-"GeForce GTX 480"
-"GeForce GTX 480M"
-"GeForce GTX 485M"
-"GeForce GTX 550 Ti"
-"GeForce GTX 560M"
-"GeForce GTX 560 Ti"
-"GeForce GTX 570"
-"GeForce GTX 570M"
-"GeForce GTX 580"
-"GeForce GTX 580M"
-"GeForce GTX 590"
-"GeForce GTX 650"
-"GeForce GTX 650 Ti"
-"GeForce GTX 650 Ti BOOST"
-"GeForce GTX 660"
-"GeForce GTX 660M"
-"GeForce GTX 660 Ti"
-"GeForce GTX 670"
-"GeForce GTX 670M"
-"GeForce GTX 670MX"
-"GeForce GTX 675M"
-"GeForce GTX 675MX"
-"GeForce GTX 680"
-"GeForce GTX 680M"
-"GeForce GTX 680MX"
-"GeForce GTX 690"
-"GeForce GTX 750"
-"GeForce GTX 750 Ti"
-"GeForce GTX 760"
-"GeForce GTX 760M"
-"GeForce GTX 765M"
-"GeForce GTX 770"
-"GeForce GTX 770M"
-"GeForce GTX 780"
-"GeForce GTX 780M"
-"GeForce GTX 780 Ti"
-"GeForce GTX 850M"
-"GeForce GTX 860M"
-"GeForce GTX 870M"
-"GeForce GTX 880M"
-"GeForce GTX 950"
-"GeForce GTX 950M"
-"GeForce GTX 960"
-"GeForce GTX 960M"
-"GeForce GTX 965M"
-"GeForce GTX 970"
-"GeForce GTX 970M"
-"GeForce GTX 980"
-"GeForce GTX 980M"
-"GeForce GTX 980 Ti"
-"GeForce GTX TITAN"
-"GeForce GTX TITAN Black"
-"GeForce GTX TITAN X"
-"GeForce GTX TITAN Z"
-"Jetson TK1"
-"Jetson TX1"
-"Jetson TX2"
-"Mobile Products"
-"NVIDIA NVS 310"
-"NVIDIA NVS 315"
-"NVIDIA NVS 510"
-"NVIDIA NVS 810"
-"NVIDIA TITAN V"
-"NVIDIA TITAN X"
-"NVIDIA TITAN Xp"
-"NVS 4200M"
-"NVS 5200M"
-"NVS 5400M"
-"Quadro 410"
-"Quadro GP100"
-"Quadro K1100M"
-"Quadro K1200"
-"Quadro K2000"
-"Quadro K2000D"
-"Quadro K2100M"
-"Quadro K2200"
-"Quadro K2200M"
-"Quadro K3100M"
-"Quadro K4000"
-"Quadro K4100M"
-"Quadro K420"
-"Quadro K4200"
-"Quadro K4200M"
-"Quadro K5000"
-"Quadro K500M"
-"Quadro K5100M"
-"Quadro K510M"
-"Quadro K5200"
-"Quadro K5200M"
-"Quadro K600"
-"Quadro K6000"
-"Quadro K6000M"
-"Quadro K610M"
-"Quadro K620"
-"Quadro K620M"
-"Quadro M1000M"
-"Quadro M1200"
-"Quadro M2000"
-"Quadro M2000M"
-"Quadro M2200"
-"Quadro M3000M"
-"Quadro M4000"
-"Quadro M4000M"
-"Quadro M5000"
-"Quadro M5000M"
-"Quadro M500M"
-"Quadro M520"
-"Quadro M5500M"
-"Quadro M6000"
-"Quadro M6000 24GB"
-"Quadro M600M"
-"Quadro M620"
-"Quadro Mobile Products"
-"Quadro P1000"
-"Quadro P2000"
-"Quadro P3000"
-"Quadro P400"
-"Quadro P4000"
-"Quadro P5000"
-"Quadro P600"
-"Quadro P6000"
-"Quadro Plex 7000"
-"Tegra K1"
-"Tegra X1"
-"Tesla C2050/C2070"
-"Tesla C2075"
-"Tesla Data Center Products"
-"Tesla K10"
-"Tesla K20"
-"Tesla K40"
-"Tesla K80"
-"Tesla M40"
-"Tesla M60"
-"Tesla P100"
-"Tesla P4"
-"Tesla P40"
-"Tesla V100")
-
-  echo "Step 2. 检测GPU型号和CUDA/cuDNN版本"
-  echo
-  checkLinuxGPU
-  echo
-  echo "Step 3. 检测数学库"
-  echo
-  checkLinuxMathLibrary
-  echo
-  echo "Step 4. 选择要安装的PaddlePaddle版本"
-  echo
-  checkLinuxPaddleVersion
-  echo
-  echo "Step 5. 检测pip版本"
-  echo
-  checkLinuxPython
-  echo
-  echo "Step 6.是否使用Python的虚拟环境"
-  use_virtualenv="--user"
-  checkPythonVirtualenv
-  echo "*********************2. 开始安装*****************************"
-  PipLinuxInstall
-  if [ "$check_virtualenv" == 'y' ];then
-    echo "虚环境创建成功，请cd 进入${virtualenv_name}, 执行 source bin/activate　进入虚环境。退出虚环境执行 deactivate命令。
-  更多虚环境使用方法请参考virtualenv官网:https://virtualenv.pypa.io/en/latest/"
-  fi
-}
-
-function clearMacPythonEnv(){
-   python_version=""
-   python_brief_version=""
-   python_root=""
-}
-
-function checkMacPython2(){
-    while true
-       do
-          python_min="2.7.15"
-          python_version=`$python_root --version 2>&1 1>&1`
-          if [[ $? == "0" ]];then
-               if [ "$python_version" == "" ] || ( [ "$python_root" == "/usr/bin/python" ] && ( [ "$python_version" \< "$python_min" ] || ( [ "$python_version" \> "$python_min" ] && [ ${#python_version} -lt ${#python_min} ] ) ) );then
-                    clearMacPythonEnv
-               elif [[ "$python_version" < "2.7.15" ]];then
-                    echo -e "          => 在您的环境中找到 \033[32m[ $python_version ]\033[0m,此版本小于2.7.15不建议使用,请选择其他版本."
-                    exit
-               else
-                    check_python=`echo $python_version | grep "Python 2"`
-                    if [[ -n "$check_python" ]];then
-                       while true
-                         do
-                           echo -e "          => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: "
-                           read -p "" use_python
-                           echo
-                           use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-                           if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
-                                use_python="y"
-                                break
-                           elif [[ "$use_python" == "n" ]];then
-                                clearMacPythonEnv
-                                break
-                           else
-                               red "            输入错误，请重新输入(y/n)"
-                           fi
-                       done
-                       if [[ "$use_python" == "y" ]];then
-                         return 0
-                       fi
-                    else
-                       red "          您输入Python的不是Python2"
-                       clearMacPythonEnv
-                    fi
-               fi
-          else
-               clearMacPythonEnv
-               red "          => 未能在常规路径下找到可用的Python2，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载安装Python2（注意Python版本不能低于2.7.15）"
-               read -p "          如希望自定义Python路径，请输入路径
-          如果希望重新选择Python版本，请回车：" python_root
-               echo
-               if [[ "$python_root" == "" ]];then
-                     python_V=""
-                     clearMacPythonEnv
-                     return 1
-               fi
-          fi
-       done
-}
-
-function checkMacPython3(){
-    while true
-       do
-          python_min="2.7.15"
-          python_version=`$python_root --version 2>&1 1>&1`
-          if [[ $? == "0" ]];then
-               if [ "$python_version" == "" ] || ( [ "$python_root" == "/usr/bin/python" ] && ( [ "$python_version" \< "$python_min" ] || ( [ "$python_version" \> "$python_min" ] && [ ${#python_version} -lt ${#python_min} ] ) ) );then
-                    clearMacPythonEnv
-               else
-                    check_python=`echo $python_version | grep "Python 3"`
-                    if [[ -n "$check_python" ]];then
-                       while true
-                         do
-                           echo -e "          => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: "
-                           read -p "" use_python
-                           echo
-                           use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-                           if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
-                                use_python="y"
-                                break
-                           elif [[ "$use_python" == "n" ]];then
-                                clearMacPythonEnv
-                                break
-                           else
-                               red "            输入错误，请重新输入(y/n)"
-                           fi
-                       done
-                       if [[ "$use_python" == "y" ]];then
-                         return 0
-                       fi
-                    else
-                       red "          您输入Python的不是Python3"
-                       clearMacPythonEnv
-                    fi
-               fi
-          else
-               clearMacPythonEnv
-               red "          => 未能在常规路径下找到可用的Python3，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载安装Python3（注意Python版本不能低于3.5.x)"
-               read -p "          如希望自定义Python路径，请输入路径
-          如果希望重新选择Python版本，请回车：" python_root
-               echo
-               if [[ "$python_root" == "" ]];then
-                     python_V=""
-                     clearMacPythonEnv
-                     return 1
-               fi
-          fi
-       done
-}
-
-function checkMacPaddleVersion(){
-    echo
-    yellow "          目前PaddlePaddle在MacOS环境下只提供稳定版，最新的版本号为 ${release_version}"
-    echo
-    paddle_version="2"
-    echo
-    yellow "          我们将会为您安装PaddlePaddle稳定版，请按回车键继续... "
-    read -n1 -p ""
-    echo
-}
-function initCheckMacPython2(){
-   echo
-   yellow "          您选择了Python "$python_V"，正在寻找符合要求的Python 2版本"
-   echo
-   python_root=`which python2.7`
-   if [[ "$python_root" == "" ]];then
-        python_root=`which python`
-   fi
-   checkMacPython2
-   if [[ "$?" == "1" ]];then
-        return 1
-   else
-        return 0
-   fi
-}
-
-function initCheckMacPython3(){
-   echo
-   yellow "          您选择了Python "$python_V"，正在寻找符合您要求的Python 2版本"
-   echo
-   python_root=`which python3`
-   checkMacPython3
-   if [[ "$?" == "1" ]];then
-        return 1
-   else
-        return 0
-   fi
-}
-
-function checkMacPip(){
-   if [[ "$python_V" == "2" ]]||[[ "$python_V" == "3" ]];then
-
-       python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
-       if [[ ${python_brief_version} == "" ]];then
-            red "您输入的python：${python_root} 对应的pip不可用，请检查此pip或重新选择其他python"
-            echo
-            return 1
-       fi
-       pip_version=`$python_root -m pip -V |awk -F '[ .]' '{print $2}'`
-       if [[ 9 -le ${pip_version} ]];then
-            :
-       else
-            red "您的pip版本过低，请安装pip 9.0.1及以上的版本"
-            echo
-            return 1
-       fi
-       if [[ "$python_brief_version" == "" ]];then
-            clearMacPythonEnv
-            red "您的 $python_root 对应的pip存在问题，请按ctrl + c退出后重新安装pip，或切换其他python版本"
-            echo
-            return 1
-       else
-            if [[ $python_brief_version == "27" ]];then
-               uncode=`$python_root -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
-               if [[ $uncode == "" ]];then
-                  uncode="mu"
-               else
-                  uncode="m"
-               fi
-            fi
-            version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
-            if [[ "$version_list" != "" ]];then
-               return 0
-             else
-               red "未找到可用的pip或pip3。PaddlePaddle目前支持：Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入，或使用ctrl + c退出"
-               echo
-               clearMacPythonEnv
-               return 1
-            fi
-
-       fi
-   fi
-}
-
-function checkMacPythonVersion(){
-  while true
-    do
-       read -n1 -p "Step 3. 选择Python版本，请按回车键继续..."
-       echo
-       yellow "          2. 使用python 2.x"
-       yellow "          3. 使用python 3.x"
-       read -p "          => 请输入数字2或3。如输入其他字符或直接回车，将会默认使用【Python 2 】。请在这里输入并回车：" python_V
-       if [[ "$python_V" == "" ]];then
-            python_V="2"
-       fi
-       if [[ "$python_V" == "2" ]];then
-            initCheckMacPython2
-            if [[ "$?" == "0" ]];then
-                checkMacPip
-                if [[ "$?" == "0" ]];then
-                    return 0
-                else
-                    :
-                fi
-            else
-                :
-            fi
-       elif [[ "$python_V" == "3" ]];then
-            initCheckMacPython3
-            if [[ "$?" == "0" ]];then
-                checkMacPip
-                if [[ "$?" == "0" ]];then
-                    return 0
-                else
-                    :
-                fi
-            else
-                :
-            fi
-       else
-            red "输入错误，请重新输入"
-       fi
-  done
-}
-
-
-function checkMacGPU(){
-    read -n1 -p "Step 5. 选择CPU/GPU版本，请按回车键继续..."
-    echo
-    if [[ $GPU != "" ]];then
-        yellow "          MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
-    else
-        yellow "          MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
-        GPU=cpu
-    fi
-    echo
-}
-
-function macos() {
-  path='http://paddlepaddle.org/download?url='
-
-  while true
-      do
-
-        checkMacPaddleVersion
-
-        checkMacPythonVersion
-
-        checkMacGPU
-
-
-        green "*********************2. 开始安装*****************************"
-        echo
-        yellow "即将为您下载并安装PaddlePaddle，请按回车键继续..."
-        read -n1 -p ""
-        if [[ $paddle_version == "2" ]];then
-            $python_root -m pip install paddlepaddle
-            if [[ $? == "0" ]];then
-               green "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
-               break
-            else
-               rm  $whl_cpu_release
-               red "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
-               echo""
-               echo "=========================================================================================="
-               echo""
-               exit 1
-            fi
-        else
-            if [[ -f $whl_cpu_develop ]];then
-                $python_root -m pip install $whl_cpu_develop
-                if [[ $? == "0" ]];then
-                   rm -rf $whl_cpu_develop
-                   # TODO add install success check here
-                   green "安装成功！小提示：可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
-                   break
-                else
-                   red "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
-                   echo""
-                   echo "=========================================================================================="
-                   echo""
-                   exit 1
-                fi
-            else
-                wget ${path}$whl_cpu_develop -O $whl_cpu_develop
-                if [[ $? == "0" ]];then
-                    $python_root -m pip install $whl_cpu_develop
-                    if [[ $? == "0" ]];then
-                       rm  $wheel_cpu_develop
-                       green "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
-                       break
-                    else
-                       rm  $whl_cpu_release
-                       red "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
-                       echo""
-                       echo "=========================================================================================="
-                       echo""
-                       exit 1
-                    fi
-                else
-                      rm  $whl_cpu_develop
-                      red "未能正常安装PaddlePaddle，请检查您的网络 或者确认您是否安装有 wget，或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
-                      echo""
-                      echo "=========================================================================================="
-                      echo""
-                      exit 1
-                fi
-            fi
-        fi
-  done
-}
-
-function main() {
-  echo "*********************************"
-  green "欢迎使用PaddlePaddle快速安装脚本"
-  echo "*********************************"
-  echo
-  yellow "如果您在安装过程中遇到任何问题，请在https://github.com/PaddlePaddle/Paddle/issues反馈，我们的工作人员将会帮您答疑解惑"
-  echo
-  echo  "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括"
-  yellow "1）安装前的准备"
-  yellow "2）开始安装"
-  echo
-  read -n1 -p "请按回车键进行下一步..."
-  echo
-  echo
-  green "*********************1. 安装前的准备*****************************"
-  echo
-  echo "Step 1. 正在检测您的操作系统信息..."
-  echo
-  SYSTEM=`uname -s`
-  if [[ "$SYSTEM" == "Darwin" ]];then
-  	yellow "          您的系统为：MAC OSX"
-    echo
-  	macos
-  else
- 	yellow "          您的系统为：Linux"
-  echo
-	  OS=`cat /etc/issue|awk 'NR==1 {print $1}'`
-	  if [[ $OS == "\S" ]] || [[ "$OS" == "CentOS" ]] || [[ $OS == "Ubuntu" ]];then
-	    linux
-	  else
-	    red "您的系统不在本安装包的支持范围，如您需要在windows环境下安装PaddlePaddle，请您参考PaddlePaddle官网的windows安装文档"
-	  fi
-  fi
-}
-main
diff --git a/paddle/scripts/installation_validate.py b/paddle/scripts/installation_validate.py
deleted file mode 100644
index f84e2f4b176609dec28a8e29afea74d3654e9e4c..0000000000000000000000000000000000000000
--- a/paddle/scripts/installation_validate.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import paddle as pd
-
-print(pd.__version__)
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
deleted file mode 100755
index d3932a4561f4c152b2f33cba28e3c36f5e3d22e1..0000000000000000000000000000000000000000
--- a/paddle/scripts/paddle_build.sh
+++ /dev/null
@@ -1,1087 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#=================================================
-#                   Utils
-#=================================================
-
-set -ex
-
-function print_usage() {
-    echo -e "\n${RED}Usage${NONE}:
-    ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]"
-
-    echo -e "\n${RED}Options${NONE}:
-    ${BLUE}build${NONE}: run build for x86 platform
-    ${BLUE}test${NONE}: run all unit tests
-    ${BLUE}single_test${NONE}: run a single unit test
-    ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
-    ${BLUE}doc${NONE}: generate paddle documents
-    ${BLUE}gen_doc_lib${NONE}: generate paddle documents library
-    ${BLUE}html${NONE}: convert C++ source code into HTML
-    ${BLUE}dockerfile${NONE}: generate paddle release dockerfile
-    ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library
-    ${BLUE}check_style${NONE}: run code style check
-    ${BLUE}cicheck${NONE}: run CI tasks
-    ${BLUE}assert_api_not_changed${NONE}: check api compability
-    "
-}
-
-function init() {
-    RED='\033[0;31m'
-    BLUE='\033[0;34m'
-    BOLD='\033[1m'
-    NONE='\033[0m'
-
-    PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
-    if [ -z "${SCRIPT_NAME}" ]; then
-        SCRIPT_NAME=$0
-    fi
-}
-
-function cmake_base() {
-    # build script will not fail if *.deb does not exist
-    rm *.deb 2>/dev/null || true
-    # delete previous built whl packages
-    rm -rf python/dist 2>/dev/null || true
-
-    # Support build for all python versions, currently
-    # including cp27-cp27m and cp27-cp27mu.
-    PYTHON_FLAGS=""
-    SYSTEM=`uname -s`
-    if [ "$SYSTEM" == "Darwin" ]; then
-        echo "Using python abi: $1"
-        if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
-            if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
-                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
-                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
-                export PATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/:${PATH}
-                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7
-            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7
-            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib"
-            pip install --user -r ${PADDLE_ROOT}/python/requirements.txt
-            else
-                exit 1
-            fi
-        elif [ "$1" == "cp35-cp35m" ]; then
-            if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then
-                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
-                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
-                export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH}
-                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
-            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
-            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
-                pip3.5 uninstall -y protobuf
-                pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt
-            else
-                exit 1
-            fi
-        elif [ "$1" == "cp36-cp36m" ]; then
-            if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then
-                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
-                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
-                export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH}
-                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
-            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
-            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
-                pip3.6 uninstall -y protobuf
-                pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt
-            else
-                exit 1
-            fi
-        elif [ "$1" == "cp37-cp37m" ]; then
-            if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then
-                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
-                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
-                export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH}
-                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
-            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
-            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
-                pip3.7 uninstall -y protobuf
-                pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt
-            else
-                exit 1
-            fi
-        fi
-    else
-        if [ "$1" != "" ]; then
-            echo "using python abi: $1"
-            if [ "$1" == "cp27-cp27m" ]; then
-                export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
-                export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
-                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-            -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
-                pip uninstall -y protobuf
-                pip install -r ${PADDLE_ROOT}/python/requirements.txt
-            elif [ "$1" == "cp27-cp27mu" ]; then
-                export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
-                export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
-                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-            -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
-                pip uninstall -y protobuf
-                pip install -r ${PADDLE_ROOT}/python/requirements.txt
-            elif [ "$1" == "cp35-cp35m" ]; then
-                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH}
-                export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH}
-                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
-            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
-            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
-                pip3.5 uninstall -y protobuf
-                pip3.5 install -r ${PADDLE_ROOT}/python/requirements.txt
-            elif [ "$1" == "cp36-cp36m" ]; then
-                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
-                export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
-                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
-            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
-            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
-                pip3.6 uninstall -y protobuf
-                pip3.6 install -r ${PADDLE_ROOT}/python/requirements.txt
-            elif [ "$1" == "cp37-cp37m" ]; then
-                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
-                export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
-                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7
-            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
-            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
-                pip3.7 uninstall -y protobuf
-                pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
-           fi
-        else
-            pip uninstall -y protobuf
-            pip install -r ${PADDLE_ROOT}/python/requirements.txt
-        fi
-    fi
-
-    if [ "$SYSTEM" == "Darwin" ]; then
-        WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON}
-        WITH_AVX=${WITH_AVX:-ON}
-        INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-~/.cache/inference_demo}
-    else
-        INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
-    fi
-
-    distibuted_flag=${WITH_DISTRIBUTE:-OFF}
-    grpc_flag=${WITH_GRPC:-${distibuted_flag}}
-
-    cat <<EOF
-    ========================================
-    Configuring cmake in /paddle/build ...
-        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
-        ${PYTHON_FLAGS}
-        -DWITH_DSO=ON
-        -DWITH_GPU=${WITH_GPU:-OFF}
-        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-        -DWITH_DISTRIBUTE=${distibuted_flag}
-        -DWITH_MKL=${WITH_MKL:-ON}
-        -DWITH_NGRAPH=${WITH_NGRAPH:-OFF}
-        -DWITH_AVX=${WITH_AVX:-OFF}
-        -DWITH_GOLANG=${WITH_GOLANG:-OFF}
-        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-        -DCUDA_ARCH_BIN=${CUDA_ARCH_BIN}
-        -DWITH_PYTHON=${WITH_PYTHON:-ON}
-        -DCUDNN_ROOT=/usr/
-        -DWITH_TESTING=${WITH_TESTING:-ON}
-        -DWITH_COVERAGE=${WITH_COVERAGE:-OFF}
-        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
-        -DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF}
-        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
-        -DPY_VERSION=${PY_VERSION:-2.7}
-        -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
-        -DWITH_GRPC=${grpc_flag}
-    ========================================
-EOF
-    # Disable UNITTEST_USE_VIRTUALENV in docker because
-    # docker environment is fully controlled by this script.
-    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
-    cmake .. \
-        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
-        ${PYTHON_FLAGS} \
-        -DWITH_DSO=ON \
-        -DWITH_GPU=${WITH_GPU:-OFF} \
-        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-        -DWITH_DISTRIBUTE=${distibuted_flag} \
-        -DWITH_MKL=${WITH_MKL:-ON} \
-        -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} \
-        -DWITH_AVX=${WITH_AVX:-OFF} \
-        -DNOAVX_CORE_FILE=${NOAVX_CORE_FILE:-""} \
-        -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-        -DCUDA_ARCH_BIN=${CUDA_ARCH_BIN} \
-        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-        -DCUDNN_ROOT=/usr/ \
-        -DWITH_TESTING=${WITH_TESTING:-ON} \
-        -DWITH_COVERAGE=${WITH_COVERAGE:-OFF} \
-        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
-        -DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF} \
-        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
-        -DPY_VERSION=${PY_VERSION:-2.7} \
-        -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \
-        -DWITH_GRPC=${grpc_flag}
-
-}
-
-function cmake_gen() {
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    cmake_base $1
-}
-
-function abort(){
-    echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
-    echo "Please use pre-commit to check what is wrong." 1>&2
-    exit 1
-}
-
-function check_style() {
-    trap 'abort' 0
-    set -e
-
-    if [ -x "$(command -v gimme)" ]; then
-    	eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
-    fi
-
-    pip install cpplint
-    # set up go environment for running gometalinter
-    mkdir -p $GOPATH/src/github.com/PaddlePaddle/
-    ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle
-
-    export PATH=/usr/bin:$PATH
-    pre-commit install
-    clang-format --version
-
-    if ! pre-commit run -a ; then
-        git diff
-        exit 1
-    fi
-
-    trap : 0
-}
-
-#=================================================
-#              Build
-#=================================================
-
-function build_base() {
-    if [ "$SYSTEM" == "Linux" ];then
-      parallel_number=`nproc`
-    else
-      parallel_number=8
-    fi
-    if [ "$1" != "" ]; then
-      parallel_number=$1
-    fi
-    make clean
-    make -j ${parallel_number}
-    make install -j ${parallel_number}
-}
-
-
-function build() {
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    cat <<EOF
-    ============================================
-    Building in /paddle/build ...
-    ============================================
-EOF
-    build_base $1
-}
-
-function build_mac() {
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    cat <<EOF
-    ============================================
-    Building in /paddle/build ...
-    ============================================
-EOF
-    make clean
-    make -j 8
-    make install -j 8
-}
-
-function run_test() {
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
-    cat <<EOF
-    ========================================
-    Running unit tests ...
-    ========================================
-EOF
-        if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then
-            ctest -V
-        else
-            ctest --output-on-failure
-        fi
-    fi
-}
-
-
-function combine_avx_noavx_build() {
-    mkdir -p ${PADDLE_ROOT}/build.noavx
-    cd ${PADDLE_ROOT}/build.noavx
-    WITH_AVX=OFF
-    cmake_base ${PYTHON_ABI:-""}
-    build_base
-
-    # build combined one
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    NOAVX_CORE_FILE=`find ${PADDLE_ROOT}/build.noavx/python/paddle/fluid/ -name "core_noavx.*"`
-    WITH_AVX=ON
-
-    cmake_base ${PYTHON_ABI:-""}
-    build_base
-}
-
-
-function run_brpc_test() {
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    if [[ ${WITH_TESTING:-ON} == "ON" \
-        && ${WITH_DISTRIBUTE:-OFF} == "ON" \
-        && ${WITH_GRPC:-OFF} == "OFF" ]] ; then
-    cat <<EOF
-    ========================================
-    Running brpc unit tests ...
-    ========================================
-EOF
-        set +x
-        declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test" \
-        "rpc_server_test" "varhandle_test" "collective_server_test" "brpc_serde_test")
-        all_tests=`ctest -N`
-
-        for t in "${other_tests[@]}"
-        do
-            if [[ ${all_tests} != *$t* ]]; then
-                continue
-            fi
-
-            if [[ ${TESTING_DEBUG_MODE:-OFF} == "ON" ]] ; then
-                ctest -V -R $t
-            else
-                ctest --output-on-failure -R $t
-            fi
-        done
-        set -x
-
-        if [[ ${TESTING_DEBUG_MODE:-OFF} == "ON" ]] ; then
-            ctest -V -R test_dist_*
-        else
-            ctest --output-on-failure -R test_dist_*
-        fi
-    fi
-}
-
-
-
-function run_mac_test() {
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
-    cat <<EOF
-    ========================================
-    Running unit tests ...
-    ========================================
-EOF
-        #remove proxy here to fix dist error on mac
-        export http_proxy=
-        export https_proxy=
-        # make install should also be test when unittest
-        make install -j 8
-
-        set +ex
-        if [ "$1" == "cp27-cp27m" ]; then
-            pip uninstall -y paddlepaddle
-        elif [ "$1" == "cp35-cp35m" ]; then
-            pip3.5 uninstall -y paddlepaddle
-        elif [ "$1" == "cp36-cp36m" ]; then
-            pip3.6 uninstall -y paddlepaddle
-        elif [ "$1" == "cp37-cp37m" ]; then
-            pip3.7 uninstall -y paddlepaddle
-        fi
-        set -ex
-
-        if [ "$1" == "cp27-cp27m" ]; then
-            set -e
-            pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
-            python ${PADDLE_ROOT}/paddle/scripts/installation_validate.py
-        elif [ "$1" == "cp35-cp35m" ]; then
-            pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
-        elif [ "$1" == "cp36-cp36m" ]; then
-            pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
-        elif [ "$1" == "cp37-cp37m" ]; then
-            pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
-        fi
-
-        # TODO: jiabin need to refine this part when these tests fixed on mac
-        ctest --output-on-failure -j $2
-
-        paddle version
-    fi
-}
-
-function assert_api_not_changed() {
-    mkdir -p ${PADDLE_ROOT}/build/.check_api_workspace
-    cd ${PADDLE_ROOT}/build/.check_api_workspace
-    virtualenv .env
-    source .env/bin/activate
-    pip install ${PADDLE_ROOT}/build/python/dist/*whl
-    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
-
-    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
-        # Use sed to make python2 and python3 sepc keeps the same
-        sed -i 's/arg0: str/arg0: unicode/g' new.spec
-        sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
-    fi
-    # ComposeNotAligned has significant difference between py2 and py3
-    sed -i '/.*ComposeNotAligned.*/d' new.spec
-
-    python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
-
-    # Currently, we only check in PR_CI python 2.7
-    if [ "$SYSTEM" != "Darwin" ]; then
-      if [ "$1" == "" ] || [ "$1" == "cp27-cp27m" ] || [ "$1" == "cp27-cp27mu" ]; then
-        python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py ${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_op_maker.spec
-      fi
-    fi
-    deactivate
-}
-
-function assert_api_spec_approvals() {
-    /bin/bash ${PADDLE_ROOT}/tools/check_api_approvals.sh
-    if [ "$?" != 0 ];then
-       exit 1
-    fi
-}
-
-
-function single_test() {
-    TEST_NAME=$1
-    if [ -z "${TEST_NAME}" ]; then
-        echo -e "${RED}Usage:${NONE}"
-        echo -e "${BOLD}${SCRIPT_NAME}${NONE} ${BLUE}single_test${NONE} [test_name]"
-        exit 1
-    fi
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
-    cat <<EOF
-    ========================================
-    Running ${TEST_NAME} ...
-    ========================================
-EOF
-        ctest --output-on-failure -R ${TEST_NAME}
-    fi
-}
-
-function bind_test() {
-    # the number of process to run tests
-    NUM_PROC=6
-
-    # calculate and set the memory usage for each process
-    MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
-    export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
-
-    # get the CUDA device count
-    CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
-
-    for (( i = 0; i < $NUM_PROC; i++ )); do
-        cuda_list=()
-        for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
-            s=$[i+j]
-            n=$[s%CUDA_DEVICE_COUNT]
-            if [ $j -eq 0 ]; then
-                cuda_list=("$n")
-            else
-                cuda_list="$cuda_list,$n"
-            fi
-        done
-        echo $cuda_list
-        # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
-        # ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
-        env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
-    done
-    wait
-}
-
-EXIT_CODE=0;
-function caught_error() {
- for job in `jobs -p`; do
-        # echo "PID => ${job}"
-        if ! wait ${job} ; then
-            echo "At least one test failed with exit code => $?" ;
-            EXIT_CODE=1;
-        fi
-    done
-}
-
-function card_test() {
-    set -m
-
-    # get the CUDA device count
-    CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
-
-    testcases=$1
-    if (( $# > 1 )); then
-        cardnumber=$2
-        if (( $cardnumber > $CUDA_DEVICE_COUNT )); then
-            cardnumber=$CUDA_DEVICE_COUNT
-        fi
-    else
-        cardnumber=$CUDA_DEVICE_COUNT
-    fi
-
-    if [[ "$testcases" == "" ]]; then
-        return 0
-    fi
-
-    trap 'caught_error' CHLD
-
-    NUM_PROC=$[CUDA_DEVICE_COUNT/$cardnumber]
-    for (( i = 0; i < $NUM_PROC; i++ )); do
-        # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
-        # ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
-        cuda_list=()
-        for (( j = 0; j < cardnumber; j++ )); do
-            if [ $j -eq 0 ]; then
-                    cuda_list=("$[i*cardnumber]")
-                else
-                    cuda_list="$cuda_list,$[i*cardnumber+j]"
-            fi
-        done
-        if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then
-            if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then
-                ctest -I $i,,$NUM_PROC -R "($testcases)" -V &
-            else
-                env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -V &
-            fi
-        else
-            if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then
-                ctest -I $i,,$NUM_PROC -R "($testcases)" --output-on-failure &
-            else
-                # echo "env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R \"($testcases)\" --output-on-failure &"
-                env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" --output-on-failure &
-            fi
-        fi
-    done
-
-    wait; # wait for all subshells to finish
-    set +m
-}
-
-function parallel_test_base() {
-    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
-    cat <<EOF
-    ========================================
-    Running unit tests in parallel way ...
-    ========================================
-EOF
-
-set +x
-        EXIT_CODE=0;
-        test_cases=$(ctest -N -V) # get all test cases
-        exclusive_tests=''        # cases list which would be run exclusively
-        single_card_tests=''      # cases list which would take one graph card
-        multiple_card_tests=''    # cases list which would take multiple GPUs, most cases would be two GPUs
-        is_exclusive=''           # indicate whether the case is exclusive type
-        is_multicard=''           # indicate whether the case is multiple GPUs type
-        while read -r line; do
-            if [[ "$line" == "" ]]; then
-                continue
-            fi
-                read matchstr <<< $(echo "$line"|grep -oEi 'Test[ \t]+#')
-                if [[ "$matchstr" == "" ]]; then
-                    # Any test case with LABELS property would be parse here
-                    # RUN_TYPE=EXCLUSIVE mean the case would run exclusively
-                    # RUN_TYPE=DIST mean the case would take two graph GPUs during runtime
-                    read is_exclusive <<< $(echo "$line"|grep -oEi "RUN_TYPE=EXCLUSIVE")
-                    read is_multicard <<< $(echo "$line"|grep -oEi "RUN_TYPE=DIST")
-                    continue
-                fi
-                read testcase <<< $(echo "$line"|grep -oEi "\w+$")
-
-                if [[ "$is_multicard" == "" ]]; then
-                  # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs
-                  read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist")
-                fi
-
-                if [[ "$is_exclusive" != "" ]]; then
-                    if [[ "$exclusive_tests" == "" ]]; then
-                        exclusive_tests="^$testcase$"
-                    else
-                        exclusive_tests="$exclusive_tests|^$testcase$"
-                    fi
-                elif [[ "$is_multicard" != "" ]]; then
-                    if [[ "$multiple_card_tests" == "" ]]; then
-                        multiple_card_tests="^$testcase$"
-                    else
-                        multiple_card_tests="$multiple_card_tests|^$testcase$"
-                    fi
-                else
-                    if [[ "$single_card_tests" == "" ]]; then
-                        single_card_tests="^$testcase$"
-                    else
-                        single_card_tests="$single_card_tests|^$testcase$"
-                    fi
-                fi
-                is_exclusive=''
-                is_multicard=''
-                matchstr=''
-                testcase=''
-        done <<< "$test_cases";
-
-        card_test "$single_card_tests" 1    # run cases with single GPU
-        card_test "$multiple_card_tests" 2  # run cases with two GPUs
-        card_test "$exclusive_tests"        # run cases exclusively, in this cases would be run with 4/8 GPUs
-        if [[ "$EXIT_CODE" != "0" ]]; then
-            exit 8;
-        fi
-set -ex
-    fi
-}
-
-function parallel_test() {
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    parallel_test_base
-}
-
-function gen_doc_lib() {
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    cat <<EOF
-    ========================================
-    Building documentation library ...
-    In /paddle/build
-    ========================================
-EOF
-    cmake .. \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DWITH_GPU=OFF \
-        -DWITH_MKL=OFF \
-
-    local LIB_TYPE=$1
-    case $LIB_TYPE in
-      full)
-        # Build full Paddle Python module. Will timeout without caching 'copy_paddle_pybind' first
-        make -j `nproc` framework_py_proto copy_paddle_pybind paddle_python
-        ;;
-      pybind)
-        # Build paddle pybind library. Takes 49 minutes to build. Might timeout
-        make -j `nproc` copy_paddle_pybind
-        ;;
-      proto)
-        # Even smaller library.
-        make -j `nproc` framework_py_proto
-        ;;
-      *)
-        exit 0
-        ;;
-      esac
-}
-
-function gen_html() {
-    cat <<EOF
-    ========================================
-    Converting C++ source code into HTML ...
-    ========================================
-EOF
-    export WOBOQ_OUT=${PADDLE_ROOT}/build/woboq_out
-    mkdir -p $WOBOQ_OUT
-    cp -rv /woboq/data $WOBOQ_OUT/../data
-    /woboq/generator/codebrowser_generator \
-    	-b ${PADDLE_ROOT}/build \
-    	-a \
-    	-o $WOBOQ_OUT \
-    	-p paddle:${PADDLE_ROOT}
-    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-}
-
-function gen_dockerfile() {
-    # Set BASE_IMAGE according to env variables
-    CUDA_MAJOR="$(echo $CUDA_VERSION | cut -d '.' -f 1).$(echo $CUDA_VERSION | cut -d '.' -f 2)"
-    CUDNN_MAJOR=$(echo $CUDNN_VERSION | cut -d '.' -f 1)
-    if [[ ${WITH_GPU} == "ON" ]]; then
-        BASE_IMAGE="nvidia/cuda:${CUDA_MAJOR}-cudnn${CUDNN_MAJOR}-devel-ubuntu16.04"
-    else
-        BASE_IMAGE="ubuntu:16.04"
-    fi
-
-    DOCKERFILE_GPU_ENV=""
-    DOCKERFILE_CUDNN_DSO=""
-    DOCKERFILE_CUBLAS_DSO=""
-    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
-        DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
-        DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so"
-    fi
-
-    cat <<EOF
-    ========================================
-    Generate ${PADDLE_ROOT}/build/Dockerfile ...
-    ========================================
-EOF
-    
-    ref_CUDA_MAJOR="$(echo $CUDA_VERSION | cut -d '.' -f 1)"
-    if [[ ${WITH_GPU} == "ON"  ]]; then
-        ref_gpu=gpu-cuda${ref_CUDA_MAJOR}-cudnn${CUDNN_MAJOR}
-    else
-        ref_gpu=cpu
-    fi
-    if [[ ${WITH_GPU} == "ON"  ]]; then
-        install_gpu="_gpu"
-    else
-        install_gpu=""
-    fi
-    if [[ ${WITH_MKL} == "ON" ]]; then
-        ref_mkl=mkl
-    else
-        ref_mkl=openblas
-    fi
-
-    ref_web=https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}
-
-    ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
-    ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
-    ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
-    ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
-
-    ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
-    ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
-    ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
-    ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
-
-    if [[ ${PADDLE_BRANCH} != "latest" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then
-        ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
-        ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
-        ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
-        ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
-        ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
-        ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
-        ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
-        ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
-    fi
-
-    #ref_paddle2_mv1=""
-    #ref_paddle2_mv2=""
-    ref_paddle35_mv1=""
-    ref_paddle35_mv2=""
-    ref_paddle36_mv1=""
-    ref_paddle36_mv2=""
-    #ref_paddle37_mv1=""
-    #ref_paddle37_mv2=""
-    if [[ ${PADDLE_BRANCH} == "latest" && ${WITH_GPU} == "ON" ]]; then
-        #ref_paddle2_whl=paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl
-        ref_paddle35_whl=paddlepaddle_gpu-1.5.1-cp35-cp35m-linux_x86_64.whl
-        ref_paddle36_whl=paddlepaddle_gpu-1.5.1-cp36-cp36m-linux_x86_64.whl
-        #ref_paddle37_whl=paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl
-        #ref_paddle2_mv1="mv ref_paddle2 paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl &&"
-        #ref_paddle2_mv2="&& mv paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl ref_paddle2"
-        ref_paddle35_mv1="mv ${ref_paddle35} ${ref_paddle35_whl} &&"
-        ref_paddle35_mv2="&& mv ${ref_paddle35_whl} ${ref_paddle35}"
-        ref_paddle36_mv1="mv ${ref_paddle36} ${ref_paddle36_whl} &&"
-        ref_paddle36_mv2="&& mv ${ref_paddle36_whl} ${ref_paddle36}"
-        #ref_paddle37_mv1="mv ref_paddle37 paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl &&"
-        #ref_paddle37_mv2="&& mv paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl ref_paddle37"
-    fi
-    if [[ ${PADDLE_BRANCH} == "latest" && ${WITH_GPU} != "ON" ]]; then
-        #ref_paddle2_whl=paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl
-        ref_paddle35_whl=paddlepaddle-1.5.1-cp35-cp35m-linux_x86_64.whl
-        ref_paddle36_whl=paddlepaddle-1.5.1-cp36-cp36m-linux_x86_64.whl
-        #ref_paddle37_whl=paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl
-        #ref_paddle2_mv1="mv ref_paddle2 paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl &&"
-        #ref_paddle2_mv2="&& mv paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl ref_paddle2"
-        ref_paddle35_mv1="mv ${ref_paddle35} ${ref_paddle35_whl} &&"
-        ref_paddle35_mv2="&& mv ${ref_paddle35_whl} ${ref_paddle35}"
-        ref_paddle36_mv1="mv ${ref_paddle36} ${ref_paddle36_whl} &&"
-        ref_paddle36_mv2="&& mv ${ref_paddle36_whl} ${ref_paddle36}"
-        #ref_paddle37_mv1="mv ref_paddle37 paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl &&"
-        #ref_paddle37_mv2="&& mv paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl ref_paddle37"
-    fi
-    
-    cat > ${PADDLE_ROOT}/build/Dockerfile <<EOF
-    FROM ${BASE_IMAGE}
-    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-    ENV HOME /root
-EOF
-
-    if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=2.4.7-1+cuda${CUDA_MAJOR} libnccl-dev=2.4.7-1+cuda${CUDA_MAJOR} || true"
-    else
-        NCCL_DEPS="true"
-    fi
-
-    if [[ ${WITH_GPU} == "ON" && ${CUDA_MAJOR} = "8.0" ]]; then 
-        NCCL_DEPS="apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=2.2.13-1+cuda8.0 libnccl-dev=2.2.13-1+cuda8.0"
-    fi
-
-    PADDLE_VERSION="paddle version"
-    CMD='"paddle", "version"'
-    
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
-    # run paddle version to install python packages first
-    RUN apt-get update && ${NCCL_DEPS}
-    RUN apt-get install -y wget python3 python3-pip libgtk2.0-dev dmidecode python3-tk && \
-        pip3 install opencv-python py-cpuinfo==5.0.0 && wget ${ref_web}/${ref_paddle35} && ${ref_paddle35_mv1} pip3 install ${ref_paddle35_whl} ${ref_paddle35_mv2}; apt-get install -f -y && \
-        apt-get clean -y && \
-        rm -f ${ref_paddle35} && \
-        ldconfig
-    ${DOCKERFILE_CUDNN_DSO}
-    ${DOCKERFILE_CUBLAS_DSO}
-    ${DOCKERFILE_GPU_ENV}
-EOF
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
-    # run paddle version to install python packages first
-    RUN apt-get update && ${NCCL_DEPS}
-    RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
-        libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
-        xz-utils tk-dev libffi-dev liblzma-dev
-    RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \
-        tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \
-        ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \
-        wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
-        tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
-        CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
-        make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.6.0.tgz
-    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
-        pip3.6 install opencv-python && wget ${ref_web}/${ref_paddle36} && ${ref_paddle36_mv1} pip3.6 install ${ref_paddle36_whl} ${ref_paddle36_mv2}; apt-get install -f -y && \
-        apt-get clean -y && \
-        rm -f ${ref_paddle36} && \
-        ldconfig
-EOF
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
-    # run paddle version to install python packages first
-    RUN apt-get update && ${NCCL_DEPS}
-    RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
-        libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
-        xz-utils tk-dev libffi-dev liblzma-dev
-    RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
-        tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
-        CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
-        make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.7.0.tgz
-    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
-        pip3.7 install opencv-python && wget ${ref_web}/${ref_paddle37} && pip3.7 install ${ref_paddle37_whl}; apt-get install -f -y && \
-        apt-get clean -y && \
-        rm -f ${ref_paddle37} && \
-        ldconfig
-EOF
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
-    # run paddle version to install python packages first
-    RUN apt-get update && ${NCCL_DEPS}
-    RUN apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
-        wget ${ref_web}/${ref_paddle2} && pip install ${ref_paddle2_whl}; apt-get install -f -y && \
-        apt-get clean -y && \
-        rm -f ${ref_paddle2} && \
-        ${PADDLE_VERSION} && \
-        ldconfig
-EOF
-
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
-    # default command shows the paddle version and exit
-    CMD [${CMD}]
-EOF
-}
-
-function gen_fluid_lib() {
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-    cat <<EOF
-    ========================================
-    Generating fluid library for train and inference ...
-    ========================================
-EOF
-    parallel_number=`nproc`
-    if [[ "$1" != "" ]]; then
-      parallel_number=$1
-    fi
-    cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto} -DCUDA_ARCH_BIN=${CUDA_ARCH_BIN}
-
-    make -j ${parallel_number} fluid_lib_dist
-    make -j ${parallel_number} inference_lib_dist
-}
-
-function tar_fluid_lib() {
-    cat <<EOF
-    ========================================
-    Taring fluid library for train and inference ...
-    ========================================
-EOF
-    cd ${PADDLE_ROOT}/build
-    cp -r fluid_install_dir fluid
-    tar -czf fluid.tgz fluid
-    cp -r fluid_inference_install_dir fluid_inference
-    tar -czf fluid_inference.tgz fluid_inference
-}
-
-function test_fluid_lib() {
-    cat <<EOF
-    ========================================
-    Testing fluid library for inference ...
-    ========================================
-EOF
-    cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
-    ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
-             ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \
-             ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
-    ./clean.sh
-}
-
-
-function build_document_preview() {
-    sh /paddle/tools/document_preview.sh ${PORT}
-}
-
-
-function example() {
-    pip install /paddle/build/python/dist/*.whl
-    paddle version
-    cd ${PADDLE_ROOT}/python/paddle/fluid
-    python sampcd_processor.py cpu 
-    if [ "$?" != "0" ];then
-      echo "Code instance execution failed"
-      exit 1
-    fi
-}
-
-
-function main() {
-    local CMD=$1
-    local parallel_number=$2
-    init
-    case $CMD in
-      build_only)
-        cmake_gen ${PYTHON_ABI:-""}
-        build ${parallel_number}
-        ;;
-      build_and_check)
-        cmake_gen ${PYTHON_ABI:-""}
-        build ${parallel_number}
-        assert_api_not_changed ${PYTHON_ABI:-""}
-        example
-        assert_api_spec_approvals
-        ;;
-      build)
-        cmake_gen ${PYTHON_ABI:-""}
-        build ${parallel_number}
-        gen_dockerfile ${PYTHON_ABI:-""}
-        assert_api_spec_approvals
-        ;;
-      combine_avx_noavx)
-        combine_avx_noavx_build
-        gen_dockerfile ${PYTHON_ABI:-""}
-        ;;
-      combine_avx_noavx_build_and_test)
-        combine_avx_noavx_build
-        gen_dockerfile ${PYTHON_ABI:-""}
-        parallel_test_base
-        ;;
-      test)
-        parallel_test
-        ;;
-      single_test)
-        single_test $2
-        ;;
-      bind_test)
-        bind_test
-        ;;
-      gen_doc_lib)
-        gen_doc_lib $2
-        ;;
-      html)
-        gen_html
-        ;;
-      dockerfile)
-        gen_dockerfile ${PYTHON_ABI:-""}
-        ;;
-      fluid_inference_lib)
-        cmake_gen ${PYTHON_ABI:-""}
-        gen_fluid_lib ${parallel_number}
-        tar_fluid_lib
-        test_fluid_lib
-        ;;
-      check_style)
-        check_style
-        ;;
-      cicheck)
-        cmake_gen ${PYTHON_ABI:-""}
-        build ${parallel_number}
-        parallel_test
-        ;;
-      cicheck_brpc)
-        cmake_gen ${PYTHON_ABI:-""}
-        build ${parallel_number}
-        run_brpc_test
-        ;;
-      assert_api)
-        assert_api_not_changed ${PYTHON_ABI:-""}
-        assert_api_spec_approvals
-        ;;
-      test_inference)
-        gen_fluid_lib ${parallel_number}
-        test_fluid_lib
-        ;;
-      assert_api_approvals)
-        assert_api_spec_approvals
-        ;;
-      maccheck)
-        cmake_gen ${PYTHON_ABI:-""}
-        build_mac
-        run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1}
-        ;;
-      macbuild)
-        cmake_gen ${PYTHON_ABI:-""}
-        build_mac
-        ;;
-      cicheck_py35)
-        cmake_gen ${PYTHON_ABI:-""}
-        build ${parallel_number}
-        parallel_test
-        ;;
-      cmake_gen)
-        cmake_gen ${PYTHON_ABI:-""}
-        ;;
-      gen_fluid_lib)
-        gen_fluid_lib ${parallel_number}
-        ;;
-      test_fluid_lib)
-        test_fluid_lib
-        ;;
-      document)
-        cmake_gen ${PYTHON_ABI:-""}
-        build ${parallel_number}
-        build_document_preview
-        ;;
-      api_example)
-        example
-        ;;
-      *)
-        print_usage
-        exit 1
-        ;;
-      esac
-}
-
-main $@
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
deleted file mode 100755
index d6b639d0da2a54e1e31051c44bc05b333e8493ce..0000000000000000000000000000000000000000
--- a/paddle/scripts/paddle_docker_build.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-function start_build_docker() {
-    docker pull $IMG
-
-    apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g'
-    DOCKER_ENV=$(cat <<EOL
-        -e FLAGS_fraction_of_gpu_memory_to_use=0.15 \
-        -e CTEST_OUTPUT_ON_FAILURE=1 \
-        -e CTEST_PARALLEL_LEVEL=1 \
-        -e APT_MIRROR=${apt_mirror} \
-        -e WITH_GPU=ON \
-        -e CUDA_ARCH_NAME=Auto \
-        -e WITH_AVX=ON \
-        -e WITH_TESTING=ON \
-        -e WITH_COVERAGE=ON \
-        -e COVERALLS_UPLOAD=ON \
-        -e WITH_DEB=OFF \
-        -e CMAKE_BUILD_TYPE=RelWithDebInfo \
-        -e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \
-        -e CUDA_VISIBLE_DEVICES=0,1 \
-        -e WITH_DISTRIBUTE=ON \
-        -e RUN_TEST=ON
-EOL
-    )
-
-    DOCKER_CMD="nvidia-docker"
-    if ! [ -x "$(command -v ${DOCKER_CMD})" ]; then
-        DOCKER_CMD="docker"
-    fi
-    if [ ! -d "${HOME}/.ccache" ]; then
-        mkdir ${HOME}/.ccache
-    fi
-    set -ex
-    ${DOCKER_CMD} run -it \
-        ${DOCKER_ENV} \
-        -e SCRIPT_NAME=$0 \
-        -e CONTENT_DEC_PASSWD=$CONTENT_DEC_PASSWD \
-        -e TRAVIS_BRANCH=$TRAVIS_BRANCH \
-        -e TRAVIS_PULL_REQUEST=$TRAVIS_PULL_REQUEST \
-        -v $PADDLE_ROOT:/paddle \
-        -v ${HOME}/.ccache:/root/.ccache \
-        -w /paddle \
-        $IMG \
-        paddle/scripts/paddle_build.sh $@
-    set +x
-}
-
-function main() {
-    DOCKER_REPO="paddlepaddle/paddle"
-    VERSION="latest-dev"
-    PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
-    IMG=${DOCKER_REPO}:${VERSION}
-    start_build_docker $@
-}
-
-main $@
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
deleted file mode 100755
index be8bc294149216583cb75cd70f02a70c05a66ded..0000000000000000000000000000000000000000
--- a/paddle/scripts/submit_local.sh.in
+++ /dev/null
@@ -1,169 +0,0 @@
-#!/bin/bash
-
-function version(){
-        echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
-        echo "    with_avx: @WITH_AVX@"
-        echo "    with_gpu: @WITH_GPU@"
-        echo "    with_mkl: @WITH_MKL@"
-        echo "    with_mkldnn: @WITH_MKLDNN@"
-        echo "    with_python: @WITH_PYTHON@"
-}
-
-function ver2num() {
-  set -e
-  # convert version to number.
-  if [ -z "$1" ]; then # empty argument
-    printf "%03d%03d%03d%03d%03d" 0
-  else
-    local VERN=$(echo $1 | sed 's#v##g' | sed 's#\.# #g' \
-        | sed 's#a# 0 #g' | sed 's#b# 1 #g' | sed 's#rc# 2 #g')
-    if [ `echo $VERN | wc -w` -eq 3 ] ; then
-      printf "%03d%03d%03d%03d%03d" $VERN 999 999
-    else
-      printf "%03d%03d%03d%03d%03d" $VERN
-    fi
-  fi
-  set +e
-}
-
-function cpu_config() {
-  # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status
-  # only when MKL enabled
-  if [ "@WITH_MKL@" == "OFF" ]; then
-    return 0
-  fi
-  platform="`uname -s`"
-  ht=0
-  if [ $platform == "Linux" ]; then
-    ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
-  elif [ $platform == "Darwin" ]; then
-    if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then
-      # HT is OFF
-      ht=1
-    fi
-  else
-    return 0
-  fi
-  if [ $ht -eq 1 ]; then # HT is OFF
-    if [ -z "$KMP_AFFINITY" ]; then
-      export KMP_AFFINITY="granularity=fine,compact,0,0"
-    fi
-    if [ -z "$OMP_DYNAMIC" ]; then
-      export OMP_DYNAMIC="FALSE"
-    fi
-  else # HT is ON
-    if [ -z "$KMP_AFFINITY" ]; then
-      export KMP_AFFINITY="granularity=fine,compact,1,0"
-    fi
-    if [ -z "$OMP_DYNAMIC" ]; then
-      export OMP_DYNAMIC="True"
-    fi
-  fi
-}
-
-function threads_config() {
-  # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
-  # according to trainer_count and total processors
-  # only when MKL enabled
-  # auto set OPENBLAS_NUM_THREADS when do not use MKL
-  platform="`uname -s`"
-  processors=0
-  if [ $platform == "Linux" ]; then
-    processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
-  elif [ $platform == "Darwin" ]; then
-    processors=`sysctl -n hw.logicalcpu`
-  else
-    return 0
-  fi
-  trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
-  if [ -z $trainers ]; then
-    trainers=1
-  fi
-  threads=$((processors / trainers))
-  if [ $threads -eq 0 ]; then
-    threads=1
-  fi
-  if [ "@WITH_MKL@" == "ON" ]; then
-    if [ -z "$OMP_NUM_THREADS" ]; then
-      export OMP_NUM_THREADS=$threads
-    fi
-    if [ -z "$MKL_NUM_THREADS" ]; then
-      export MKL_NUM_THREADS=$threads
-    fi
-  else
-    if [ -z "$OPENBLAS_NUM_THREADS" ]; then
-      export OPENBLAS_NUM_THREADS=$threads
-    fi
-    if [ $threads -gt 1 ] && [ -z "$OPENBLAS_MAIN_FREE" ]; then
-      export OPENBLAS_MAIN_FREE=1
-    fi
-  fi
-  
-}
-
-PADDLE_CONF_HOME="$HOME/.config/paddle"
-mkdir -p ${PADDLE_CONF_HOME}
-
-if [ -z "${PADDLE_NO_STAT+x}" ]; then
-    SERVER_VER=`curl -m 5 -X POST --data content="{ \"version\": \"@PADDLE_VERSION@\" }"\
-        -b ${PADDLE_CONF_HOME}/paddle.cookie \
-        -c ${PADDLE_CONF_HOME}/paddle.cookie \
-        http://api.paddlepaddle.org/version 2>/dev/null`
-    if [ $? -eq 0 ] && [ "$(ver2num @PADDLE_VERSION@)" -lt  $(ver2num $SERVER_VER) ]; then
-      echo "Paddle release a new version ${SERVER_VER}, you can get the install package in http://www.paddlepaddle.org"
-    fi
-fi
-
-PADDLE_BIN_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-if [ ! -z "${DEBUGGER}" ]; then
-    echo "Using debug command ${DEBUGGER}"
-fi
-
-CUDNN_LIB_PATH="@CUDNN_LIB_PATH@"
-
-if [ ! -z "${CUDNN_LIB_PATH}" ]; then
-    export LD_LIBRARY_PATH=${CUDNN_LIB_PATH}:${LD_LIBRARY_PATH}
-fi
-
-export PYTHONPATH=${PWD}:${PYTHONPATH}
-
-
-# Check python lib installed or not.
-pip --help > /dev/null
-if [ $? -ne 0 ]; then
-    echo "pip should be installed to run paddle."
-    exit 1
-fi
-
-if [ "@WITH_GPU@" == "ON" ]; then
-    PADDLE_NAME="paddlepaddle-gpu"
-else 
-    PADDLE_NAME="paddlepaddle"
-fi
-
-INSTALLED_VERSION=`pip freeze 2>/dev/null | grep "^${PADDLE_NAME}==" | sed 's/.*==//g'`
-
-if [ -z "${INSTALLED_VERSION}" ]; then
-   INSTALLED_VERSION="0.0.0"  # not installed
-fi
-cat <<EOF | python -
-from distutils.version import LooseVersion
-import sys
-if LooseVersion("${INSTALLED_VERSION}") < LooseVersion("@PADDLE_VERSION@"):
-  sys.exit(1)
-else:
-  sys.exit(0)
-EOF
-
-cpu_config
-# echo $KMP_AFFINITY $OMP_DYNAMIC
-
-case "$1" in
-    "version")
-        version
-        ;;
-    *)
-        version
-        ;;
- esac
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
deleted file mode 100644
index dc6245ce6b024ba10e6631d5aea307de75dc2963..0000000000000000000000000000000000000000
--- a/paddle/testing/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# for paddle test case
-
-if(WITH_TESTING)
-  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
-endif()
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
deleted file mode 100644
index d5acff56a9aa9136b84e216f6f8b0f28b528dbc5..0000000000000000000000000000000000000000
--- a/paddle/testing/paddle_gtest_main.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cstring>
-
-#include "gflags/gflags.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/init.h"
-
-int main(int argc, char** argv) {
-  paddle::memory::allocation::UseAllocatorStrategyGFlag();
-  testing::InitGoogleTest(&argc, argv);
-  std::vector<char*> new_argv;
-  std::string gflags_env;
-  for (int i = 0; i < argc; ++i) {
-    new_argv.push_back(argv[i]);
-  }
-
-  std::vector<std::string> envs;
-  std::vector<std::string> undefok;
-#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC)
-  std::string str_max_body_size;
-  if (google::GetCommandLineOption("max_body_size", &str_max_body_size)) {
-    setenv("FLAGS_max_body_size", "2147483647", 1);
-    envs.push_back("max_body_size");
-  }
-#endif
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  envs.push_back("fraction_of_gpu_memory_to_use");
-  envs.push_back("initial_gpu_memory_in_mb");
-  envs.push_back("reallocate_gpu_memory_in_mb");
-  envs.push_back("allocator_strategy");
-#elif __clang__
-  envs.push_back("use_mkldnn");
-  envs.push_back("initial_cpu_memory_in_mb");
-  envs.push_back("allocator_strategy");
-
-  undefok.push_back("use_mkldnn");
-  undefok.push_back("initial_cpu_memory_in_mb");
-#else
-  envs.push_back("use_pinned_memory");
-  envs.push_back("use_mkldnn");
-  envs.push_back("initial_cpu_memory_in_mb");
-  envs.push_back("allocator_strategy");
-
-  undefok.push_back("use_pinned_memory");
-  undefok.push_back("use_mkldnn");
-  undefok.push_back("initial_cpu_memory_in_mb");
-#endif
-
-  char* env_str = nullptr;
-  if (envs.size() > 0) {
-    std::string env_string = "--tryfromenv=";
-    for (auto t : envs) {
-      env_string += t + ",";
-    }
-    env_string = env_string.substr(0, env_string.length() - 1);
-    env_str = strdup(env_string.c_str());
-    new_argv.push_back(env_str);
-    VLOG(1) << "gtest env_string:" << env_string;
-  }
-
-  char* undefok_str = nullptr;
-  if (undefok.size() > 0) {
-    std::string undefok_string = "--undefok=";
-    for (auto t : undefok) {
-      undefok_string += t + ",";
-    }
-    undefok_string = undefok_string.substr(0, undefok_string.length() - 1);
-    undefok_str = strdup(undefok_string.c_str());
-    new_argv.push_back(undefok_str);
-    VLOG(1) << "gtest undefok_string:" << undefok_string;
-  }
-
-  int new_argc = static_cast<int>(new_argv.size());
-  char** new_argv_address = new_argv.data();
-  google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
-  paddle::framework::InitDevices(true);
-
-  int ret = RUN_ALL_TESTS();
-
-  if (env_str) free(env_str);
-  if (undefok_str) free(undefok_str);
-
-  return ret;
-}
diff --git a/patches/grpc/completion_queue.h b/patches/grpc/completion_queue.h
deleted file mode 100644
index 6e92c60ea2db00cc6e227830228888f9a06735c4..0000000000000000000000000000000000000000
--- a/patches/grpc/completion_queue.h
+++ /dev/null
@@ -1,386 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/// A completion queue implements a concurrent producer-consumer queue, with
-/// two main API-exposed methods: \a Next and \a AsyncNext. These
-/// methods are the essential component of the gRPC C++ asynchronous API.
-/// There is also a \a Shutdown method to indicate that a given completion queue
-/// will no longer have regular events. This must be called before the
-/// completion queue is destroyed.
-/// All completion queue APIs are thread-safe and may be used concurrently with
-/// any other completion queue API invocation; it is acceptable to have
-/// multiple threads calling \a Next or \a AsyncNext on the same or different
-/// completion queues, or to call these methods concurrently with a \a Shutdown
-/// elsewhere.
-/// \remark{All other API calls on completion queue should be completed before
-/// a completion queue destructor is called.}
-#ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
-#define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
-
-#include <typeinfo>
-
-#include <grpc/impl/codegen/atm.h>
-#include <grpcpp/impl/codegen/completion_queue_tag.h>
-#include <grpcpp/impl/codegen/core_codegen_interface.h>
-#include <grpcpp/impl/codegen/grpc_library.h>
-#include <grpcpp/impl/codegen/status.h>
-#include <grpcpp/impl/codegen/time.h>
-
-struct grpc_completion_queue;
-
-namespace grpc {
-
-template <class R>
-class ClientReader;
-template <class W>
-class ClientWriter;
-template <class W, class R>
-class ClientReaderWriter;
-template <class R>
-class ServerReader;
-template <class W>
-class ServerWriter;
-namespace internal {
-template <class W, class R>
-class ServerReaderWriterBody;
-}  // namespace internal
-
-class Channel;
-class ChannelInterface;
-class ClientContext;
-class CompletionQueue;
-class Server;
-class ServerBuilder;
-class ServerContext;
-class ServerInterface;
-
-namespace internal {
-class CompletionQueueTag;
-class RpcMethod;
-template <class ServiceType, class RequestType, class ResponseType>
-class RpcMethodHandler;
-template <class ServiceType, class RequestType, class ResponseType>
-class ClientStreamingHandler;
-template <class ServiceType, class RequestType, class ResponseType>
-class ServerStreamingHandler;
-template <class ServiceType, class RequestType, class ResponseType>
-class BidiStreamingHandler;
-class UnknownMethodHandler;
-template <class Streamer, bool WriteNeeded>
-class TemplatedBidiStreamingHandler;
-template <class InputMessage, class OutputMessage>
-class BlockingUnaryCallImpl;
-}  // namespace internal
-
-extern CoreCodegenInterface* g_core_codegen_interface;
-
-/// A thin wrapper around \ref grpc_completion_queue (see \ref
-/// src/core/lib/surface/completion_queue.h).
-/// See \ref doc/cpp/perf_notes.md for notes on best practices for high
-/// performance servers.
-class CompletionQueue : private GrpcLibraryCodegen {
- public:
-  /// Default constructor. Implicitly creates a \a grpc_completion_queue
-  /// instance.
-  CompletionQueue()
-      : CompletionQueue(grpc_completion_queue_attributes{
-            GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, GRPC_CQ_DEFAULT_POLLING}) {}
-
-  /// Wrap \a take, taking ownership of the instance.
-  ///
-  /// \param take The completion queue instance to wrap. Ownership is taken.
-  explicit CompletionQueue(grpc_completion_queue* take);
-
-  /// Destructor. Destroys the owned wrapped completion queue / instance.
-  ~CompletionQueue() {
-    if (typeid(*g_core_codegen_interface).hash_code() !=
-        typeid(CoreCodegenInterface).hash_code()) {
-      g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
-    }
-  }
-
-  /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT.
-  enum NextStatus {
-    SHUTDOWN,   ///< The completion queue has been shutdown and fully-drained
-    GOT_EVENT,  ///< Got a new event; \a tag will be filled in with its
-                ///< associated value; \a ok indicating its success.
-    TIMEOUT     ///< deadline was reached.
-  };
-
-  /// Read from the queue, blocking until an event is available or the queue is
-  /// shutting down.
-  ///
-  /// \param tag[out] Updated to point to the read event's tag.
-  /// \param ok[out] true if read a successful event, false otherwise.
-  ///
-  /// Note that each tag sent to the completion queue (through RPC operations
-  /// or alarms) will be delivered out of the completion queue by a call to
-  /// Next (or a related method), regardless of whether the operation succeeded
-  /// or not. Success here means that this operation completed in the normal
-  /// valid manner.
-  ///
-  /// Server-side RPC request: \a ok indicates that the RPC has indeed
-  /// been started. If it is false, the server has been Shutdown
-  /// before this particular call got matched to an incoming RPC.
-  ///
-  /// Client-side StartCall/RPC invocation: \a ok indicates that the RPC is
-  /// going to go to the wire. If it is false, it not going to the wire. This
-  /// would happen if the channel is either permanently broken or
-  /// transiently broken but with the fail-fast option. (Note that async unary
-  /// RPCs don't post a CQ tag at this point, nor do client-streaming
-  /// or bidi-streaming RPCs that have the initial metadata corked option set.)
-  ///
-  /// Client-side Write, Client-side WritesDone, Server-side Write,
-  /// Server-side Finish, Server-side SendInitialMetadata (which is
-  /// typically included in Write or Finish when not done explicitly):
-  /// \a ok means that the data/metadata/status/etc is going to go to the
-  /// wire. If it is false, it not going to the wire because the call
-  /// is already dead (i.e., canceled, deadline expired, other side
-  /// dropped the channel, etc).
-  ///
-  /// Client-side Read, Server-side Read, Client-side
-  /// RecvInitialMetadata (which is typically included in Read if not
-  /// done explicitly): \a ok indicates whether there is a valid message
-  /// that got read. If not, you know that there are certainly no more
-  /// messages that can ever be read from this stream. For the client-side
-  /// operations, this only happens because the call is dead. For the
-  /// server-sider operation, though, this could happen because the client
-  /// has done a WritesDone already.
-  ///
-  /// Client-side Finish: \a ok should always be true
-  ///
-  /// Server-side AsyncNotifyWhenDone: \a ok should always be true
-  ///
-  /// Alarm: \a ok is true if it expired, false if it was canceled
-  ///
-  /// \return true if got an event, false if the queue is fully drained and
-  ///         shut down.
-  bool Next(void** tag, bool* ok) {
-    return (AsyncNextInternal(tag,
-                              ok,
-                              g_core_codegen_interface->gpr_inf_future(
-                                  GPR_CLOCK_REALTIME)) != SHUTDOWN);
-  }
-
-  /// Read from the queue, blocking up to \a deadline (or the queue's shutdown).
-  /// Both \a tag and \a ok are updated upon success (if an event is available
-  /// within the \a deadline).  A \a tag points to an arbitrary location usually
-  /// employed to uniquely identify an event.
-  ///
-  /// \param tag[out] Upon sucess, updated to point to the event's tag.
-  /// \param ok[out] Upon sucess, true if a successful event, false otherwise
-  ///        See documentation for CompletionQueue::Next for explanation of ok
-  /// \param deadline[in] How long to block in wait for an event.
-  ///
-  /// \return The type of event read.
-  template <typename T>
-  NextStatus AsyncNext(void** tag, bool* ok, const T& deadline) {
-    TimePoint<T> deadline_tp(deadline);
-    return AsyncNextInternal(tag, ok, deadline_tp.raw_time());
-  }
-
-  /// EXPERIMENTAL
-  /// First executes \a F, then reads from the queue, blocking up to
-  /// \a deadline (or the queue's shutdown).
-  /// Both \a tag and \a ok are updated upon success (if an event is available
-  /// within the \a deadline).  A \a tag points to an arbitrary location usually
-  /// employed to uniquely identify an event.
-  ///
-  /// \param F[in] Function to execute before calling AsyncNext on this queue.
-  /// \param tag[out] Upon sucess, updated to point to the event's tag.
-  /// \param ok[out] Upon sucess, true if read a regular event, false otherwise.
-  /// \param deadline[in] How long to block in wait for an event.
-  ///
-  /// \return The type of event read.
-  template <typename T, typename F>
-  NextStatus DoThenAsyncNext(F&& f, void** tag, bool* ok, const T& deadline) {
-    CompletionQueueTLSCache cache = CompletionQueueTLSCache(this);
-    f();
-    if (cache.Flush(tag, ok)) {
-      return GOT_EVENT;
-    } else {
-      return AsyncNext(tag, ok, deadline);
-    }
-  }
-
-  /// Request the shutdown of the queue.
-  ///
-  /// \warning This method must be called at some point if this completion queue
-  /// is accessed with Next or AsyncNext. \a Next will not return false
-  /// until this method has been called and all pending tags have been drained.
-  /// (Likewise for \a AsyncNext returning \a NextStatus::SHUTDOWN .)
-  /// Only once either one of these methods does that (that is, once the queue
-  /// has been \em drained) can an instance of this class be destroyed.
-  /// Also note that applications must ensure that no work is enqueued on this
-  /// completion queue after this method is called.
-  void Shutdown();
-
-  /// Returns a \em raw pointer to the underlying \a grpc_completion_queue
-  /// instance.
-  ///
-  /// \warning Remember that the returned instance is owned. No transfer of
-  /// owership is performed.
-  grpc_completion_queue* cq() { return cq_; }
-
- protected:
-  /// Private constructor of CompletionQueue only visible to friend classes
-  CompletionQueue(const grpc_completion_queue_attributes& attributes) {
-    cq_ = g_core_codegen_interface->grpc_completion_queue_create(
-        g_core_codegen_interface->grpc_completion_queue_factory_lookup(
-            &attributes),
-        &attributes,
-        NULL);
-    InitialAvalanching();  // reserve this for the future shutdown
-  }
-
- private:
-  // Friend synchronous wrappers so that they can access Pluck(), which is
-  // a semi-private API geared towards the synchronous implementation.
-  template <class R>
-  friend class ::grpc::ClientReader;
-  template <class W>
-  friend class ::grpc::ClientWriter;
-  template <class W, class R>
-  friend class ::grpc::ClientReaderWriter;
-  template <class R>
-  friend class ::grpc::ServerReader;
-  template <class W>
-  friend class ::grpc::ServerWriter;
-  template <class W, class R>
-  friend class ::grpc::internal::ServerReaderWriterBody;
-  template <class ServiceType, class RequestType, class ResponseType>
-  friend class ::grpc::internal::RpcMethodHandler;
-  template <class ServiceType, class RequestType, class ResponseType>
-  friend class ::grpc::internal::ClientStreamingHandler;
-  template <class ServiceType, class RequestType, class ResponseType>
-  friend class ::grpc::internal::ServerStreamingHandler;
-  template <class Streamer, bool WriteNeeded>
-  friend class ::grpc::internal::TemplatedBidiStreamingHandler;
-  friend class ::grpc::internal::UnknownMethodHandler;
-  friend class ::grpc::Server;
-  friend class ::grpc::ServerContext;
-  friend class ::grpc::ServerInterface;
-  template <class InputMessage, class OutputMessage>
-  friend class ::grpc::internal::BlockingUnaryCallImpl;
-
-  /// EXPERIMENTAL
-  /// Creates a Thread Local cache to store the first event
-  /// On this completion queue queued from this thread.  Once
-  /// initialized, it must be flushed on the same thread.
-  class CompletionQueueTLSCache {
-   public:
-    CompletionQueueTLSCache(CompletionQueue* cq);
-    ~CompletionQueueTLSCache();
-    bool Flush(void** tag, bool* ok);
-
-   private:
-    CompletionQueue* cq_;
-    bool flushed_;
-  };
-
-  NextStatus AsyncNextInternal(void** tag, bool* ok, gpr_timespec deadline);
-
-  /// Wraps \a grpc_completion_queue_pluck.
-  /// \warning Must not be mixed with calls to \a Next.
-  bool Pluck(internal::CompletionQueueTag* tag) {
-    auto deadline =
-        g_core_codegen_interface->gpr_inf_future(GPR_CLOCK_REALTIME);
-    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
-        cq_, tag, deadline, nullptr);
-    bool ok = ev.success != 0;
-    void* ignored = tag;
-    GPR_CODEGEN_ASSERT(tag->FinalizeResult(&ignored, &ok));
-    GPR_CODEGEN_ASSERT(ignored == tag);
-    // Ignore mutations by FinalizeResult: Pluck returns the C API status
-    return ev.success != 0;
-  }
-
-  /// Performs a single polling pluck on \a tag.
-  /// \warning Must not be mixed with calls to \a Next.
-  ///
-  /// TODO: sreek - This calls tag->FinalizeResult() even if the cq_ is already
-  /// shutdown. This is most likely a bug and if it is a bug, then change this
-  /// implementation to simple call the other TryPluck function with a zero
-  /// timeout. i.e:
-  ///      TryPluck(tag, gpr_time_0(GPR_CLOCK_REALTIME))
-  void TryPluck(internal::CompletionQueueTag* tag) {
-    auto deadline = g_core_codegen_interface->gpr_time_0(GPR_CLOCK_REALTIME);
-    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
-        cq_, tag, deadline, nullptr);
-    if (ev.type == GRPC_QUEUE_TIMEOUT) return;
-    bool ok = ev.success != 0;
-    void* ignored = tag;
-    // the tag must be swallowed if using TryPluck
-    GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
-  }
-
-  /// Performs a single polling pluck on \a tag. Calls tag->FinalizeResult if
-  /// the pluck() was successful and returned the tag.
-  ///
-  /// This exects tag->FinalizeResult (if called) to return 'false' i.e expects
-  /// that the tag is internal not something that is returned to the user.
-  void TryPluck(internal::CompletionQueueTag* tag, gpr_timespec deadline) {
-    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
-        cq_, tag, deadline, nullptr);
-    if (ev.type == GRPC_QUEUE_TIMEOUT || ev.type == GRPC_QUEUE_SHUTDOWN) {
-      return;
-    }
-
-    bool ok = ev.success != 0;
-    void* ignored = tag;
-    GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
-  }
-
-  /// Manage state of avalanching operations : completion queue tags that
-  /// trigger other completion queue operations. The underlying core completion
-  /// queue should not really shutdown until all avalanching operations have
-  /// been finalized. Note that we maintain the requirement that an avalanche
-  /// registration must take place before CQ shutdown (which must be maintained
-  /// elsehwere)
-  void InitialAvalanching() {
-    gpr_atm_rel_store(&avalanches_in_flight_, static_cast<gpr_atm>(1));
-  }
-  void RegisterAvalanching() {
-    gpr_atm_no_barrier_fetch_add(&avalanches_in_flight_,
-                                 static_cast<gpr_atm>(1));
-  }
-  void CompleteAvalanching();
-
-  grpc_completion_queue* cq_;  // owned
-
-  gpr_atm avalanches_in_flight_;
-};
-
-/// A specific type of completion queue used by the processing of notifications
-/// by servers. Instantiated by \a ServerBuilder.
-class ServerCompletionQueue : public CompletionQueue {
- public:
-  bool IsFrequentlyPolled() { return polling_type_ != GRPC_CQ_NON_LISTENING; }
-
- private:
-  grpc_cq_polling_type polling_type_;
-  friend class ServerBuilder;
-  /// \param is_frequently_polled Informs the GRPC library about whether the
-  /// server completion queue would be actively polled (by calling Next() or
-  /// AsyncNext()). By default all server completion queues are assumed to be
-  /// frequently polled.
-  ServerCompletionQueue(grpc_cq_polling_type polling_type)
-      : CompletionQueue(grpc_completion_queue_attributes{
-            GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, polling_type}),
-        polling_type_(polling_type) {}
-};
-
-}  // namespace grpc
-
-#endif  // GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
diff --git a/patches/grpc/grpc_library.h b/patches/grpc/grpc_library.h
deleted file mode 100644
index 4870a1cda4b2a6489bc379fe53cf3e9659fffc47..0000000000000000000000000000000000000000
--- a/patches/grpc/grpc_library.h
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
-#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
-
-#include <typeinfo>
-
-#include <grpcpp/impl/codegen/core_codegen_interface.h>
-
-namespace grpc {
-
-class GrpcLibraryInterface {
- public:
-  virtual ~GrpcLibraryInterface() = default;
-  virtual void init() = 0;
-  virtual void shutdown() = 0;
-};
-
-/// Initialized by \a grpc::GrpcLibraryInitializer from
-/// <grpcpp/impl/grpc_library.h>
-extern GrpcLibraryInterface* g_glip;
-
-/// Classes that require gRPC to be initialized should inherit from this class.
-class GrpcLibraryCodegen {
- public:
-  GrpcLibraryCodegen(bool call_grpc_init = true) : grpc_init_called_(false) {
-    if (call_grpc_init) {
-      GPR_CODEGEN_ASSERT(g_glip &&
-                         "gRPC library not initialized. See "
-                         "grpc::internal::GrpcLibraryInitializer.");
-      g_glip->init();
-      grpc_init_called_ = true;
-    }
-  }
-  virtual ~GrpcLibraryCodegen() {
-    if (grpc_init_called_ &&
-        typeid(*g_glip).hash_code() !=
-            typeid(GrpcLibraryInterface).hash_code()) {
-      GPR_CODEGEN_ASSERT(g_glip &&
-                         "gRPC library not initialized. See "
-                         "grpc::internal::GrpcLibraryInitializer.");
-      g_glip->shutdown();
-    }
-  }
-
- private:
-  bool grpc_init_called_;
-};
-
-}  // namespace grpc
-
-#endif  // GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
diff --git a/publish_include.sh b/publish_include.sh
deleted file mode 100644
index 82dcb8c548f569c45b02a9de30a06bb2e5e077c9..0000000000000000000000000000000000000000
--- a/publish_include.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!bash
-OUTPUT_PATH=../../../bc_out/baidu/feed-mlarch/paddle-trainer/output/include/
-INCLUDE_DIR=paddle/fluid/train/custom_trainer/feed/
-SUB_DIR_LIST=(common dataset accessor executor monitor process shuffler)
-rm -rf ${OUTPUT_PATH}/${INCLUDE_DIR}/*
-
-cp ${INCLUDE_DIR}/*.h ${OUTPUT_PATH}/${INCLUDE_DIR}/
-for sub_name in "${SUB_DIR_LIST[@]}"
-do
-    mkdir ${OUTPUT_PATH}/${INCLUDE_DIR}/${sub_name}
-    cp ${INCLUDE_DIR}/${sub_name}/*.h ${OUTPUT_PATH}/${INCLUDE_DIR}/${sub_name}/
-done
diff --git a/python/.gitignore b/python/.gitignore
deleted file mode 100644
index 53a2b7a76b0dd2d9095f9582540e455e2c1174e2..0000000000000000000000000000000000000000
--- a/python/.gitignore
+++ /dev/null
@@ -1,9 +0,0 @@
-*pyc
-build
-dist
-paddlepaddle.egg-info
-paddle.egg-info
-paddlepaddle_gpu.egg-info
-.idea
-paddle/proto/*.py
-paddle/proto/*.pyc
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
deleted file mode 100644
index 62fd81aa54ca289f403b2897df40add136bdfcfc..0000000000000000000000000000000000000000
--- a/python/CMakeLists.txt
+++ /dev/null
@@ -1,117 +0,0 @@
-file(GLOB UTILS_PY_FILES . ./paddle/legacy/utils/*.py)
-file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/*.py)
-set(PY_FILES paddle/__init__.py
-  ${UTILS_PY_FILES}
-  ${FLUID_PY_FILES})
-
-if(WITH_GPU)
-  SET(PACKAGE_NAME "paddlepaddle-gpu")
-else()
-  SET(PACKAGE_NAME "paddlepaddle")
-endif()
-
-set(FLUID_CORE_NAME "core")
-if(WITH_AVX AND AVX_FOUND)
-  set(FLUID_CORE_NAME "${FLUID_CORE_NAME}_avx")
-  if(NOT DEFINED NOAVX_CORE_FILE OR NOAVX_CORE_FILE STREQUAL "")
-    message(STATUS "WARNING: This is just a warning for publishing release.
-      You are building AVX version without NOAVX core.
-      So the wheel package may fail on NOAVX machine.
-      You can add -DFLUID_CORE_NAME=/path/to/your/core_noavx.* in cmake command
-      to get a full wheel package to resolve this warning.
-      While, this version will still work on local machine.")
-  endif()
-
-  if(NOAVX_CORE_FILE AND NOT EXISTS "${NOAVX_CORE_FILE}")
-    message(FATAL_ERROR "The file ${NOAVX_CORE_FILE} does not exist!")
-  endif()
-
-  set(HAS_NOAVX_CORE ON)
-else()
-  set(FLUID_CORE_NAME "${FLUID_CORE_NAME}_noavx")
-endif()
-
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
-    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
-
-set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
-
-IF(WIN32)
-    # Python would use the .pyd by default under Windows series platform
-    set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.pyd)
-    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd)
-ELSE()
-    set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.so)
-    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.so)
-ENDIF()
-
-set(FLUID_CORE_DEPS ${FLUID_CORE})
-
-if(HAS_NOAVX_CORE AND EXISTS "${NOAVX_CORE_FILE}")
-  get_filename_component(NOAVX_CORE_NAME ${NOAVX_CORE_FILE} NAME)
-  get_filename_component(NOAVX_CORE_EXT ${NOAVX_CORE_FILE} EXT)
-  if(WIN32)
-    if(NOT NOAVX_CORE_EXT STREQUAL ".pyd")
-      message(FATAL_ERROR "Wrong file ${NOAVX_CORE_NAME}, the ext does not match windows *.pyd!")
-    endif()
-  else()
-    if(NOT NOAVX_CORE_EXT STREQUAL ".so")
-      message(FATAL_ERROR "Wrong file ${NOAVX_CORE_NAME}, the ext does not match *.so!")
-    endif()
-  endif()
-  add_custom_command(OUTPUT ${FLUID_NOAVX_CORE}
-    COMMAND cmake -E copy ${NOAVX_CORE_FILE} ${FLUID_NOAVX_CORE} DEPENDS paddle_pybind)
-  list(APPEND FLUID_CORE_DEPS ${FLUID_NOAVX_CORE})
-endif()
-
-add_custom_command(OUTPUT ${FLUID_CORE}
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-        DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE_DEPS})
-
-IF(WIN32)
-    add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-            COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/
-            COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-            COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-            COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-            DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
-ELSE(WIN32)
-	add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-		COMMAND touch stub.cc
-		COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
-		COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-		COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-		COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-		COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-		DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
-ENDIF()
-
-add_custom_target(paddle_python ALL DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp)
-
-set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
-
-if (WITH_TESTING)
-  add_subdirectory(paddle/reader/tests)
-  add_subdirectory(paddle/dataset/tests)
-  add_subdirectory(paddle/fluid/tests)
-  add_subdirectory(paddle/fluid/contrib/tests)
-  add_subdirectory(paddle/fluid/contrib/slim/tests)
-endif()
-install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
-    DESTINATION opt/paddle/share/wheels
-)
-
-if(APPLE)
-  find_program(INSTALL_NAME_TOOL_EXECUTABLE install_name_tool)
-  if(NOT INSTALL_NAME_TOOL_EXECUTABLE)
-    message(FATAL_ERROR "install_name_tool not found, please check.\n")
-  endif()
-endif()
-if(LINUX)
-  find_program(PATCHELF_EXECUTABLE patchelf)
-  if(NOT PATCHELF_EXECUTABLE)
-    message(FATAL_ERROR "patchelf not found, please install it.\n"
-            "For Ubuntu, the command is: apt-get install -y patchelf.")
-  endif()
-endif(LINUX)
diff --git a/python/__init__.py b/python/__init__.py
deleted file mode 100644
index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000
--- a/python/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/.gitignore b/python/paddle/.gitignore
deleted file mode 100644
index 98527864664d32f798edc06a53131e8d5a068295..0000000000000000000000000000000000000000
--- a/python/paddle/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-version.py
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
deleted file mode 100644
index fccc4bb09951328cc5a827bdf861f5ab714feb5f..0000000000000000000000000000000000000000
--- a/python/paddle/__init__.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from paddle.check_import_scipy import check_import_scipy
-
-check_import_scipy(os.name)
-
-try:
-    from paddle.version import full_version as __version__
-    from paddle.version import commit as __git_commit__
-
-except ImportError:
-    import sys
-    sys.stderr.write('''Warning with import paddle: you should not
-     import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
-                     )
-
-import paddle.reader
-import paddle.dataset
-import paddle.batch
-import paddle.compat
-import paddle.distributed
-batch = batch.batch
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
deleted file mode 100644
index 008509660739d61245526278735064472b8b06dd..0000000000000000000000000000000000000000
--- a/python/paddle/batch.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ['batch']
-
-
-def batch(reader, batch_size, drop_last=False):
-    """
-    Create a batched reader.
-
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param batch_size: size of each mini-batch
-    :type batch_size: int
-    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
-    :type drop_last: bool
-    :return: the batched reader.
-    :rtype: callable
-    """
-
-    def batch_reader():
-        r = reader()
-        b = []
-        for instance in r:
-            b.append(instance)
-            if len(b) == batch_size:
-                yield b
-                b = []
-        if drop_last == False and len(b) != 0:
-            yield b
-
-    # Batch size check
-    batch_size = int(batch_size)
-    if batch_size <= 0:
-        raise ValueError("batch_size should be a positive integeral value, "
-                         "but got batch_size={}".format(batch_size))
-
-    return batch_reader
diff --git a/python/paddle/check_import_scipy.py b/python/paddle/check_import_scipy.py
deleted file mode 100644
index 0172d568e5b08693847495cde040054f96257785..0000000000000000000000000000000000000000
--- a/python/paddle/check_import_scipy.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def check_import_scipy(OsName):
-    print_info = ""
-    if OsName == 'nt':
-        try:
-            import scipy.io as scio
-        except ImportError as e:
-            print_info = str(e)
-        if (len(print_info) > 0):
-            if 'DLL load failed' in print_info:
-                raise ImportError(
-                    print_info +
-                    "\nplease download visual C++ Redistributable for vs 2015, https://www.microsoft.com/en-us/download/details.aspx?id=48145"
-                )
-    return
diff --git a/python/paddle/compat.py b/python/paddle/compat.py
deleted file mode 100644
index 50726b6fa1bbbde68a590c86db9344b8f02f79f2..0000000000000000000000000000000000000000
--- a/python/paddle/compat.py
+++ /dev/null
@@ -1,237 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import six
-import math
-
-__all__ = [
-    'long_type',
-    'to_text',
-    'to_bytes',
-    'round',
-    'floor_division',
-    'get_exception_message',
-]
-
-if six.PY2:
-    int_type = int
-    long_type = long
-else:
-    int_type = int
-    long_type = int
-
-
-#  str and bytes related functions
-def to_text(obj, encoding='utf-8', inplace=False):
-    """
-      All string in PaddlePaddle should be represented as a literal string.
-    This function will convert object to a literal string without any encoding.
-    Especially, if the object type is a list or set container, we will iterate
-    all items in the object and convert them to literal string.
-
-    In Python3:
-        Decode the bytes type object to str type with specific encoding
-
-    In Python2:
-        Decode the str type object to unicode type with specific encoding
-
-    Args:
-        obj(unicode|str|bytes|list|set) : The object to be decoded.
-        encoding(str) : The encoding format to decode a string
-        inplace(bool) : If we change the original object or we create a new one
-
-    Returns:
-        Decoded result of obj
-    """
-    if obj is None:
-        return obj
-
-    if isinstance(obj, list):
-        if inplace:
-            for i in six.moves.xrange(len(obj)):
-                obj[i] = _to_text(obj[i], encoding)
-            return obj
-        else:
-            return [_to_text(item, encoding) for item in obj]
-    elif isinstance(obj, set):
-        if inplace:
-            for item in obj:
-                obj.remove(item)
-                obj.add(_to_text(item, encoding))
-            return obj
-        else:
-            return set([_to_text(item, encoding) for item in obj])
-    else:
-        return _to_text(obj, encoding)
-
-
-def _to_text(obj, encoding):
-    """
-    In Python3:
-        Decode the bytes type object to str type with specific encoding
-
-    In Python2:
-        Decode the str type object to unicode type with specific encoding,
-        or we just return the unicode string of object
-
-    Args:
-        obj(unicode|str|bytes) : The object to be decoded.
-        encoding(str) : The encoding format
-
-    Returns:
-        decoded result of obj
-    """
-    if obj is None:
-        return obj
-
-    if isinstance(obj, six.binary_type):
-        return obj.decode(encoding)
-    elif isinstance(obj, six.text_type):
-        return obj
-    else:
-        return six.u(obj)
-
-
-def to_bytes(obj, encoding='utf-8', inplace=False):
-    """
-      All string in PaddlePaddle should be represented as a literal string.
-    This function will convert object to a bytes with specific encoding.
-    Especially, if the object type is a list or set container, we will iterate
-    all items in the object and convert them to bytes.
-
-    In Python3:
-        Encode the str type object to bytes type with specific encoding
-
-    In Python2:
-        Encode the unicode type object to str type with specific encoding,
-        or we just return the 8-bit string of object
-
-    Args:
-        obj(unicode|str|bytes|list|set) : The object to be encoded.
-        encoding(str) : The encoding format to encode a string
-        inplace(bool) : If we change the original object or we create a new one
-
-    Returns:
-        Decoded result of obj
-    """
-    if obj is None:
-        return obj
-
-    if isinstance(obj, list):
-        if inplace:
-            for i in six.moves.xrange(len(obj)):
-                obj[i] = _to_bytes(obj[i], encoding)
-            return obj
-        else:
-            return [_to_bytes(item, encoding) for item in obj]
-    elif isinstance(obj, set):
-        if inplace:
-            for item in obj:
-                obj.remove(item)
-                obj.add(_to_bytes(item, encoding))
-            return obj
-        else:
-            return set([_to_bytes(item, encoding) for item in obj])
-    else:
-        return _to_bytes(obj, encoding)
-
-
-def _to_bytes(obj, encoding):
-    """
-    In Python3:
-        Encode the str type object to bytes type with specific encoding
-
-    In Python2:
-        Encode the unicode type object to str type with specific encoding,
-        or we just return the 8-bit string of object
-
-    Args:
-        obj(unicode|str|bytes) : The object to be encoded.
-        encoding(str) : The encoding format
-
-    Returns:
-        encoded result of obj
-    """
-    if obj is None:
-        return obj
-
-    assert encoding is not None
-    if isinstance(obj, six.text_type):
-        return obj.encode(encoding)
-    elif isinstance(obj, six.binary_type):
-        return obj
-    else:
-        return six.b(obj)
-
-
-# math related functions
-def round(x, d=0):
-    """
-    Compatible round which act the same behaviour in Python3.
-
-    Args:
-        x(float) : The number to round halfway.
-
-    Returns:
-        round result of x
-    """
-    if six.PY3:
-        # The official walkaround of round in Python3 is incorrect
-        # we implement accroding this answer: https://www.techforgeek.info/round_python.html
-        if x > 0.0:
-            p = 10**d
-            return float(math.floor((x * p) + math.copysign(0.5, x))) / p
-        elif x < 0.0:
-            p = 10**d
-            return float(math.ceil((x * p) + math.copysign(0.5, x))) / p
-        else:
-            return math.copysign(0.0, x)
-    else:
-        import __builtin__
-        return __builtin__.round(x, d)
-
-
-def floor_division(x, y):
-    """
-    Compatible division which act the same behaviour in Python3 and Python2,
-    whose result will be a int value of floor(x / y) in Python3 and value of
-    (x / y) in Python2.
-
-    Args:
-        x(int|float) : The number to divide.
-        y(int|float) : The number to be divided
-
-    Returns:
-        division result of x // y
-    """
-    return x // y
-
-
-# exception related functions
-def get_exception_message(exc):
-    """
-    Get the error message of a specific exception
-
-    Args:
-        exec(Exception) : The exception to get error message.
-
-    Returns:
-        the error message of exec
-    """
-    assert exc is not None
-
-    if six.PY2:
-        return exc.message
-    else:
-        return str(exc)
diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py
deleted file mode 100644
index 54aa3edc51d3734633ce077a59bd86cec8d09032..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/__init__.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Dataset package.
-"""
-
-import paddle.dataset.mnist
-import paddle.dataset.imikolov
-import paddle.dataset.imdb
-import paddle.dataset.cifar
-import paddle.dataset.movielens
-import paddle.dataset.conll05
-import paddle.dataset.uci_housing
-import paddle.dataset.sentiment
-import paddle.dataset.wmt14
-import paddle.dataset.wmt16
-import paddle.dataset.mq2007
-import paddle.dataset.flowers
-import paddle.dataset.voc2012
-import paddle.dataset.image
-
-__all__ = [
-    'mnist',
-    'imikolov',
-    'imdb',
-    'cifar',
-    'movielens',
-    'conll05',
-    'sentiment',
-    'uci_housing',
-    'wmt14',
-    'wmt16',
-    'mq2007',
-    'flowers',
-    'voc2012',
-    'image',
-]
diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py
deleted file mode 100644
index 2c62d1c7d1112239020f2ea9669f4729ea3c367a..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/cifar.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-CIFAR dataset.
-
-This module will download dataset from https://dataset.bj.bcebos.com/cifar/cifar-10-python.tar.gz and https://dataset.bj.bcebos.com/cifar/cifar-100-python.tar.gz, parse train/test set into
-paddle reader creators.
-
-The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
-with 6000 images per class. There are 50000 training images and 10000 test
-images.
-
-The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
-containing 600 images each. There are 500 training images and 100 testing
-images per class.
-
-"""
-
-from __future__ import print_function
-
-import itertools
-import numpy
-import paddle.dataset.common
-import tarfile
-import six
-from six.moves import cPickle as pickle
-
-__all__ = ['train100', 'test100', 'train10', 'test10']
-
-URL_PREFIX = 'https://dataset.bj.bcebos.com/cifar/'
-CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
-CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
-CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
-CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
-
-
-def reader_creator(filename, sub_name, cycle=False):
-    def read_batch(batch):
-        data = batch[six.b('data')]
-        labels = batch.get(
-            six.b('labels'), batch.get(six.b('fine_labels'), None))
-        assert labels is not None
-        for sample, label in six.moves.zip(data, labels):
-            yield (sample / 255.0).astype(numpy.float32), int(label)
-
-    def reader():
-        while True:
-            with tarfile.open(filename, mode='r') as f:
-                names = (each_item.name for each_item in f
-                         if sub_name in each_item.name)
-
-                for name in names:
-                    if six.PY2:
-                        batch = pickle.load(f.extractfile(name))
-                    else:
-                        batch = pickle.load(
-                            f.extractfile(name), encoding='bytes')
-                    for item in read_batch(batch):
-                        yield item
-
-            if not cycle:
-                break
-
-    return reader
-
-
-def train100():
-    """
-    CIFAR-100 training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 99].
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'train')
-
-
-def test100():
-    """
-    CIFAR-100 test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 99].
-
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'test')
-
-
-def train10(cycle=False):
-    """
-    CIFAR-10 training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch',
-        cycle=cycle)
-
-
-def test10(cycle=False):
-    """
-    CIFAR-10 test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch',
-        cycle=cycle)
-
-
-def fetch():
-    paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
-    paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
deleted file mode 100644
index e8c27180ee407b2707847de74e6abb41815ef3ea..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/common.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import requests
-import hashlib
-import os
-import errno
-import shutil
-import six
-import sys
-import importlib
-import paddle.dataset
-import six.moves.cPickle as pickle
-import glob
-
-__all__ = [
-    'DATA_HOME',
-    'download',
-    'md5file',
-    'split',
-    'cluster_files_reader',
-]
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
-
-
-# When running unit tests, there could be multiple processes that
-# trying to create DATA_HOME directory simultaneously, so we cannot
-# use a if condition to check for the existence of the directory;
-# instead, we use the filesystem as the synchronization mechanism by
-# catching returned errors.
-def must_mkdirs(path):
-    try:
-        os.makedirs(DATA_HOME)
-    except OSError as exc:
-        if exc.errno != errno.EEXIST:
-            raise
-        pass
-
-
-must_mkdirs(DATA_HOME)
-
-
-def md5file(fname):
-    hash_md5 = hashlib.md5()
-    f = open(fname, "rb")
-    for chunk in iter(lambda: f.read(4096), b""):
-        hash_md5.update(chunk)
-    f.close()
-    return hash_md5.hexdigest()
-
-
-def download(url, module_name, md5sum, save_name=None):
-    dirname = os.path.join(DATA_HOME, module_name)
-    if not os.path.exists(dirname):
-        os.makedirs(dirname)
-
-    filename = os.path.join(dirname,
-                            url.split('/')[-1]
-                            if save_name is None else save_name)
-
-    if os.path.exists(filename) and md5file(filename) == md5sum:
-        return filename
-
-    retry = 0
-    retry_limit = 3
-    while not (os.path.exists(filename) and md5file(filename) == md5sum):
-        if os.path.exists(filename):
-            sys.stderr.write("file %s  md5 %s" % (md5file(filename), md5sum))
-        if retry < retry_limit:
-            retry += 1
-        else:
-            raise RuntimeError("Cannot download {0} within retry limit {1}".
-                               format(url, retry_limit))
-        sys.stderr.write("Cache file %s not found, downloading %s" %
-                         (filename, url))
-        r = requests.get(url, stream=True)
-        total_length = r.headers.get('content-length')
-
-        if total_length is None:
-            with open(filename, 'wb') as f:
-                shutil.copyfileobj(r.raw, f)
-        else:
-            with open(filename, 'wb') as f:
-                dl = 0
-                total_length = int(total_length)
-                for data in r.iter_content(chunk_size=4096):
-                    if six.PY2:
-                        data = six.b(data)
-                    dl += len(data)
-                    f.write(data)
-                    done = int(50 * dl / total_length)
-                    sys.stderr.write("\r[%s%s]" % ('=' * done,
-                                                   ' ' * (50 - done)))
-                    sys.stdout.flush()
-    sys.stderr.write("\n")
-    sys.stdout.flush()
-    return filename
-
-
-def fetch_all():
-    for module_name in [
-            x for x in dir(paddle.dataset) if not x.startswith("__")
-    ]:
-        if "fetch" in dir(
-                importlib.import_module("paddle.dataset.%s" % module_name)):
-            getattr(
-                importlib.import_module("paddle.dataset.%s" % module_name),
-                "fetch")()
-
-
-def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
-    """
-    you can call the function as:
-
-    split(paddle.dataset.cifar.train10(), line_count=1000,
-        suffix="imikolov-train-%05d.pickle")
-
-    the output files as:
-
-    |-imikolov-train-00000.pickle
-    |-imikolov-train-00001.pickle
-    |- ...
-    |-imikolov-train-00480.pickle
-
-    :param reader: is a reader creator
-    :param line_count: line count for each file
-    :param suffix: the suffix for the output files, should contain "%d"
-                means the id for each file. Default is "%05d.pickle"
-    :param dumper: is a callable function that dump object to file, this
-                function will be called as dumper(obj, f) and obj is the object
-                will be dumped, f is a file object. Default is cPickle.dump.
-    """
-    if not callable(dumper):
-        raise TypeError("dumper should be callable.")
-    lines = []
-    indx_f = 0
-    for i, d in enumerate(reader()):
-        lines.append(d)
-        if i >= line_count and i % line_count == 0:
-            with open(suffix % indx_f, "w") as f:
-                dumper(lines, f)
-                lines = []
-                indx_f += 1
-    if lines:
-        with open(suffix % indx_f, "w") as f:
-            dumper(lines, f)
-
-
-def cluster_files_reader(files_pattern,
-                         trainer_count,
-                         trainer_id,
-                         loader=pickle.load):
-    """
-    Create a reader that yield element from the given files, select
-    a file set according trainer count and trainer_id
-
-    :param files_pattern: the files which generating by split(...)
-    :param trainer_count: total trainer count
-    :param trainer_id: the trainer rank id
-    :param loader: is a callable function that load object from file, this
-                function will be called as loader(f) and f is a file object.
-                Default is cPickle.load
-    """
-
-    def reader():
-        if not callable(loader):
-            raise TypeError("loader should be callable.")
-        file_list = glob.glob(files_pattern)
-        file_list.sort()
-        my_file_list = []
-        for idx, fn in enumerate(file_list):
-            if idx % trainer_count == trainer_id:
-                print("append file: %s" % fn)
-                my_file_list.append(fn)
-        for fn in my_file_list:
-            with open(fn, "r") as f:
-                lines = loader(f)
-                for line in lines:
-                    yield line
-
-    return reader
diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py
deleted file mode 100644
index 81a8cfc2e6abbb9767eadfc25a51bb6f18b56fdc..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/conll05.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Conll05 dataset.
-Paddle semantic role labeling Book and demo use this dataset as an example.
-Because Conll05 is not free in public, the default downloaded URL is test set
-of Conll05 (which is public). Users can change URL and MD5 to their Conll
-dataset. And a pre-trained word vector model based on Wikipedia corpus is used
-to initialize SRL model.
-"""
-
-from __future__ import print_function
-
-import tarfile
-import gzip
-import itertools
-import paddle.dataset.common
-import paddle.compat as cpt
-from six.moves import zip, range
-
-__all__ = ['test, get_dict', 'get_embedding']
-
-DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
-DATA_MD5 = '387719152ae52d60422c016e92a742fc'
-WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
-WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
-VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
-VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
-TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
-TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
-EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
-EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
-
-UNK_IDX = 0
-
-
-def load_label_dict(filename):
-    d = dict()
-    tag_dict = set()
-    with open(filename, 'r') as f:
-        for i, line in enumerate(f):
-            line = line.strip()
-            if line.startswith("B-"):
-                tag_dict.add(line[2:])
-            elif line.startswith("I-"):
-                tag_dict.add(line[2:])
-        index = 0
-        for tag in tag_dict:
-            d["B-" + tag] = index
-            index += 1
-            d["I-" + tag] = index
-            index += 1
-        d["O"] = index
-    return d
-
-
-def load_dict(filename):
-    d = dict()
-    with open(filename, 'r') as f:
-        for i, line in enumerate(f):
-            d[line.strip()] = i
-    return d
-
-
-def corpus_reader(data_path, words_name, props_name):
-    """
-    Read one corpus. It returns an iterator. Each element of
-    this iterator is a tuple including sentence and labels. The sentence is
-    consist of a list of word IDs. The labels include a list of label IDs.
-    :return: a iterator of data.
-    :rtype: iterator
-    """
-
-    def reader():
-        tf = tarfile.open(data_path)
-        wf = tf.extractfile(words_name)
-        pf = tf.extractfile(props_name)
-        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
-                fileobj=pf) as props_file:
-            sentences = []
-            labels = []
-            one_seg = []
-            for word, label in zip(words_file, props_file):
-                word = cpt.to_text(word.strip())
-                label = cpt.to_text(label.strip().split())
-
-                if len(label) == 0:  # end of sentence
-                    for i in range(len(one_seg[0])):
-                        a_kind_lable = [x[i] for x in one_seg]
-                        labels.append(a_kind_lable)
-
-                    if len(labels) >= 1:
-                        verb_list = []
-                        for x in labels[0]:
-                            if x != '-':
-                                verb_list.append(x)
-
-                        for i, lbl in enumerate(labels[1:]):
-                            cur_tag = 'O'
-                            is_in_bracket = False
-                            lbl_seq = []
-                            verb_word = ''
-                            for l in lbl:
-                                if l == '*' and is_in_bracket == False:
-                                    lbl_seq.append('O')
-                                elif l == '*' and is_in_bracket == True:
-                                    lbl_seq.append('I-' + cur_tag)
-                                elif l == '*)':
-                                    lbl_seq.append('I-' + cur_tag)
-                                    is_in_bracket = False
-                                elif l.find('(') != -1 and l.find(')') != -1:
-                                    cur_tag = l[1:l.find('*')]
-                                    lbl_seq.append('B-' + cur_tag)
-                                    is_in_bracket = False
-                                elif l.find('(') != -1 and l.find(')') == -1:
-                                    cur_tag = l[1:l.find('*')]
-                                    lbl_seq.append('B-' + cur_tag)
-                                    is_in_bracket = True
-                                else:
-                                    raise RuntimeError('Unexpected label: %s' %
-                                                       l)
-
-                            yield sentences, verb_list[i], lbl_seq
-
-                    sentences = []
-                    labels = []
-                    one_seg = []
-                else:
-                    sentences.append(word)
-                    one_seg.append(label)
-
-        pf.close()
-        wf.close()
-        tf.close()
-
-    return reader
-
-
-def reader_creator(corpus_reader,
-                   word_dict=None,
-                   predicate_dict=None,
-                   label_dict=None):
-    def reader():
-        for sentence, predicate, labels in corpus_reader():
-
-            sen_len = len(sentence)
-
-            verb_index = labels.index('B-V')
-            mark = [0] * len(labels)
-            if verb_index > 0:
-                mark[verb_index - 1] = 1
-                ctx_n1 = sentence[verb_index - 1]
-            else:
-                ctx_n1 = 'bos'
-
-            if verb_index > 1:
-                mark[verb_index - 2] = 1
-                ctx_n2 = sentence[verb_index - 2]
-            else:
-                ctx_n2 = 'bos'
-
-            mark[verb_index] = 1
-            ctx_0 = sentence[verb_index]
-
-            if verb_index < len(labels) - 1:
-                mark[verb_index + 1] = 1
-                ctx_p1 = sentence[verb_index + 1]
-            else:
-                ctx_p1 = 'eos'
-
-            if verb_index < len(labels) - 2:
-                mark[verb_index + 2] = 1
-                ctx_p2 = sentence[verb_index + 2]
-            else:
-                ctx_p2 = 'eos'
-
-            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
-
-            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            pred_idx = [predicate_dict.get(predicate)] * sen_len
-            label_idx = [label_dict.get(w) for w in labels]
-
-            yield word_idx, ctx_n2_idx, ctx_n1_idx, \
-              ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
-
-    return reader
-
-
-def get_dict():
-    """
-    Get the word, verb and label dictionary of Wikipedia corpus.
-    """
-    word_dict = load_dict(
-        paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
-    verb_dict = load_dict(
-        paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
-    label_dict = load_label_dict(
-        paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
-    return word_dict, verb_dict, label_dict
-
-
-def get_embedding():
-    """
-    Get the trained word vector based on Wikipedia corpus.
-    """
-    return paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
-
-
-def test():
-    """
-    Conll05 test set creator.
-
-    Because the training dataset is not free, the test dataset is used for
-    training. It returns a reader creator, each sample in the reader is nine
-    features, including sentence sequence, predicate, predicate context,
-    predicate context flag and tagged sequence.
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    word_dict, verb_dict, label_dict = get_dict()
-    reader = corpus_reader(
-        paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
-        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
-        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
-    return reader_creator(reader, word_dict, verb_dict, label_dict)
-
-
-def fetch():
-    paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
-    paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
-    paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
-    paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
-    paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
deleted file mode 100644
index 969ad3c922f9c15b2e39f71ae4359cd3d2fcdcce..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/flowers.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module will download dataset from
-http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
-and parse train/test set intopaddle reader creators.
-
-This set contains images of flowers belonging to 102 different categories.
-The images were acquired by searching the web and taking pictures. There are a
-minimum of 40 images for each category.
-
-The database was used in:
-
-Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
- number of classes.Proceedings of the Indian Conference on Computer Vision,
-Graphics and Image Processing (2008)
-http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
-
-"""
-
-from __future__ import print_function
-
-import itertools
-import functools
-from .common import download
-import tarfile
-import scipy.io as scio
-from paddle.dataset.image import *
-from paddle.reader import *
-from paddle import compat as cpt
-import os
-import numpy as np
-from multiprocessing import cpu_count
-import six
-from six.moves import cPickle as pickle
-__all__ = ['train', 'test', 'valid']
-
-DATA_URL = 'http://paddlemodels.bj.bcebos.com/flowers/102flowers.tgz'
-LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
-SETID_URL = 'http://paddlemodels.bj.bcebos.com/flowers/setid.mat'
-DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
-LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
-SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
-# In official 'readme', tstid is the flag of test data
-# and trnid is the flag of train data. But test data is more than train data.
-# So we exchange the train data and test data.
-TRAIN_FLAG = 'tstid'
-TEST_FLAG = 'trnid'
-VALID_FLAG = 'valid'
-
-
-def default_mapper(is_train, sample):
-    '''
-    map image bytes data to type needed by model input layer
-    '''
-    img, label = sample
-    img = load_image_bytes(img)
-    img = simple_transform(
-        img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
-    return img.flatten().astype('float32'), label
-
-
-train_mapper = functools.partial(default_mapper, True)
-test_mapper = functools.partial(default_mapper, False)
-
-
-def reader_creator(data_file,
-                   label_file,
-                   setid_file,
-                   dataset_name,
-                   mapper,
-                   buffered_size=1024,
-                   use_xmap=True,
-                   cycle=False):
-    '''
-    1. read images from tar file and
-        merge images into batch files in 102flowers.tgz_batch/
-    2. get a reader to read sample from batch file
-
-    :param data_file: downloaded data file
-    :type data_file: string
-    :param label_file: downloaded label file
-    :type label_file: string
-    :param setid_file: downloaded setid file containing information
-                        about how to split dataset
-    :type setid_file: string
-    :param dataset_name: data set name (tstid|trnid|valid)
-    :type dataset_name: string
-    :param mapper: a function to map image bytes data to type
-                    needed by model input layer
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: data reader
-    :rtype: callable
-    '''
-    labels = scio.loadmat(label_file)['labels'][0]
-    indexes = scio.loadmat(setid_file)[dataset_name][0]
-    img2label = {}
-    for i in indexes:
-        img = "jpg/image_%05d.jpg" % i
-        img2label[img] = labels[i - 1]
-    file_list = batch_images_from_tar(data_file, dataset_name, img2label)
-
-    def reader():
-        while True:
-            with open(file_list, 'r') as f_list:
-                for file in f_list:
-                    file = file.strip()
-                    batch = None
-                    with open(file, 'rb') as f:
-                        if six.PY2:
-                            batch = pickle.load(f)
-                        else:
-                            batch = pickle.load(f, encoding='bytes')
-
-                        if six.PY3:
-                            batch = cpt.to_text(batch)
-                        data_batch = batch['data']
-                        labels_batch = batch['label']
-                        for sample, label in six.moves.zip(data_batch,
-                                                           labels_batch):
-                            yield sample, int(label) - 1
-            if not cycle:
-                break
-
-    if use_xmap:
-        return xmap_readers(mapper, reader, min(4, cpu_count()), buffered_size)
-    else:
-        return map_readers(mapper, reader)
-
-
-def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False):
-    '''
-    Create flowers training set reader.
-    It returns a reader, each sample in the reader is
-    image pixels in [0, 1] and label in [1, 102]
-    translated from original color image by steps:
-    1. resize to 256*256
-    2. random crop to 224*224
-    3. flatten
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: train data reader
-    :rtype: callable
-    '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5),
-        TRAIN_FLAG,
-        mapper,
-        buffered_size,
-        use_xmap,
-        cycle=cycle)
-
-
-def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
-    '''
-    Create flowers test set reader.
-    It returns a reader, each sample in the reader is
-    image pixels in [0, 1] and label in [1, 102]
-    translated from original color image by steps:
-    1. resize to 256*256
-    2. random crop to 224*224
-    3. flatten
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: test data reader
-    :rtype: callable
-    '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5),
-        TEST_FLAG,
-        mapper,
-        buffered_size,
-        use_xmap,
-        cycle=cycle)
-
-
-def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
-    '''
-    Create flowers validation set reader.
-    It returns a reader, each sample in the reader is
-    image pixels in [0, 1] and label in [1, 102]
-    translated from original color image by steps:
-    1. resize to 256*256
-    2. random crop to 224*224
-    3. flatten
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :return: test data reader
-    :rtype: callable
-    '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
-        buffered_size, use_xmap)
-
-
-def fetch():
-    download(DATA_URL, 'flowers', DATA_MD5)
-    download(LABEL_URL, 'flowers', LABEL_MD5)
-    download(SETID_URL, 'flowers', SETID_MD5)
diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
deleted file mode 100644
index 57547f1867a937d16fb2dfc9b84e1a30759a527e..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/image.py
+++ /dev/null
@@ -1,416 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This file contains some common interfaces for image preprocess.
-Many users are confused about the image layout. We introduce
-the image layout as follows.
-
-- CHW Layout
-
-  - The abbreviations: C=channel, H=Height, W=Width
-  - The default layout of image opened by cv2 or PIL is HWC.
-    PaddlePaddle only supports the CHW layout. And CHW is simply
-    a transpose of HWC. It must transpose the input image.
-
-- Color format: RGB or BGR
-
-  OpenCV use BGR color format. PIL use RGB color format. Both
-  formats can be used for training. Noted that, the format should
-  be keep consistent between the training and inference peroid.
-"""
-
-from __future__ import print_function
-
-import six
-import numpy as np
-# FIXME(minqiyang): this is an ugly fix for the numpy bug reported here
-# https://github.com/numpy/numpy/issues/12497
-if six.PY3:
-    import subprocess
-    import sys
-    import_cv2_proc = subprocess.Popen(
-        [sys.executable, "-c", "import cv2"],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE)
-    out, err = import_cv2_proc.communicate()
-    retcode = import_cv2_proc.poll()
-    if retcode != 0:
-        cv2 = None
-    else:
-        import cv2
-else:
-    try:
-        import cv2
-    except ImportError:
-        cv2 = None
-import os
-import tarfile
-import six.moves.cPickle as pickle
-
-__all__ = [
-    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
-    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
-    "batch_images_from_tar"
-]
-
-
-def _check_cv2():
-    if cv2 is None:
-        import sys
-        sys.stderr.write(
-            '''Warning with paddle image module: opencv-python should be imported,
-         or paddle image module could NOT work; please install opencv-python first.'''
-        )
-        return False
-    else:
-        return True
-
-
-def batch_images_from_tar(data_file,
-                          dataset_name,
-                          img2label,
-                          num_per_batch=1024):
-    """
-    Read images from tar file and batch them into batch file.
-
-    :param data_file: path of image tar file
-    :type data_file: string
-    :param dataset_name: 'train','test' or 'valid'
-    :type dataset_name: string
-    :param img2label: a dic with image file name as key
-                    and image's label as value
-    :type img2label: dic
-    :param num_per_batch: image number per batch file
-    :type num_per_batch: int
-    :return: path of list file containing paths of batch file
-    :rtype: string
-    """
-    batch_dir = data_file + "_batch"
-    out_path = "%s/%s" % (batch_dir, dataset_name)
-    meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
-
-    if os.path.exists(out_path):
-        return meta_file
-    else:
-        os.makedirs(out_path)
-
-    tf = tarfile.open(data_file)
-    mems = tf.getmembers()
-    data = []
-    labels = []
-    file_id = 0
-    for mem in mems:
-        if mem.name in img2label:
-            data.append(tf.extractfile(mem).read())
-            labels.append(img2label[mem.name])
-            if len(data) == num_per_batch:
-                output = {}
-                output['label'] = labels
-                output['data'] = data
-                pickle.dump(
-                    output,
-                    open('%s/batch_%d' % (out_path, file_id), 'wb'),
-                    protocol=2)
-                file_id += 1
-                data = []
-                labels = []
-    if len(data) > 0:
-        output = {}
-        output['label'] = labels
-        output['data'] = data
-        pickle.dump(
-            output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2)
-
-    with open(meta_file, 'a') as meta:
-        for file in os.listdir(out_path):
-            meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
-    return meta_file
-
-
-def load_image_bytes(bytes, is_color=True):
-    """
-    Load an color or gray image from bytes array.
-
-    Example usage:
-
-    .. code-block:: python
-
-        with open('cat.jpg') as f:
-            im = load_image_bytes(f.read())
-
-    :param bytes: the input image bytes array.
-    :type bytes: str
-    :param is_color: If set is_color True, it will load and
-                     return a color image. Otherwise, it will
-                     load and return a gray image.
-    :type is_color: bool
-    """
-    assert _check_cv2() is True
-
-    flag = 1 if is_color else 0
-    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
-    img = cv2.imdecode(file_bytes, flag)
-    return img
-
-
-def load_image(file, is_color=True):
-    """
-    Load an color or gray image from the file path.
-
-    Example usage:
-
-    .. code-block:: python
-
-        im = load_image('cat.jpg')
-
-    :param file: the input image path.
-    :type file: string
-    :param is_color: If set is_color True, it will load and
-                     return a color image. Otherwise, it will
-                     load and return a gray image.
-    :type is_color: bool
-    """
-    assert _check_cv2() is True
-
-    # cv2.IMAGE_COLOR for OpenCV3
-    # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
-    # cv2.IMAGE_GRAYSCALE for OpenCV3
-    # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version
-    # Here, use constant 1 and 0
-    # 1: COLOR, 0: GRAYSCALE
-    flag = 1 if is_color else 0
-    im = cv2.imread(file, flag)
-    return im
-
-
-def resize_short(im, size):
-    """
-    Resize an image so that the length of shorter edge is size.
-
-    Example usage:
-
-    .. code-block:: python
-
-        im = load_image('cat.jpg')
-        im = resize_short(im, 256)
-
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param size: the shorter edge size of image after resizing.
-    :type size: int
-    """
-    assert _check_cv2() is True
-
-    h, w = im.shape[:2]
-    h_new, w_new = size, size
-    if h > w:
-        h_new = size * h // w
-    else:
-        w_new = size * w // h
-    im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
-    return im
-
-
-def to_chw(im, order=(2, 0, 1)):
-    """
-    Transpose the input image order. The image layout is HWC format
-    opened by cv2 or PIL. Transpose the input image to CHW layout
-    according the order (2,0,1).
-
-    Example usage:
-
-    .. code-block:: python
-
-        im = load_image('cat.jpg')
-        im = resize_short(im, 256)
-        im = to_chw(im)
-
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param order: the transposed order.
-    :type order: tuple|list
-    """
-    assert len(im.shape) == len(order)
-    im = im.transpose(order)
-    return im
-
-
-def center_crop(im, size, is_color=True):
-    """
-    Crop the center of image with size.
-
-    Example usage:
-
-    .. code-block:: python
-
-        im = center_crop(im, 224)
-
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param size: the cropping size.
-    :type size: int
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    """
-    h, w = im.shape[:2]
-    h_start = (h - size) // 2
-    w_start = (w - size) // 2
-    h_end, w_end = h_start + size, w_start + size
-    if is_color:
-        im = im[h_start:h_end, w_start:w_end, :]
-    else:
-        im = im[h_start:h_end, w_start:w_end]
-    return im
-
-
-def random_crop(im, size, is_color=True):
-    """
-    Randomly crop input image with size.
-
-    Example usage:
-
-    .. code-block:: python
-
-        im = random_crop(im, 224)
-
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param size: the cropping size.
-    :type size: int
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    """
-    h, w = im.shape[:2]
-    h_start = np.random.randint(0, h - size + 1)
-    w_start = np.random.randint(0, w - size + 1)
-    h_end, w_end = h_start + size, w_start + size
-    if is_color:
-        im = im[h_start:h_end, w_start:w_end, :]
-    else:
-        im = im[h_start:h_end, w_start:w_end]
-    return im
-
-
-def left_right_flip(im, is_color=True):
-    """
-    Flip an image along the horizontal direction.
-    Return the flipped image.
-
-    Example usage:
-
-    .. code-block:: python
-
-        im = left_right_flip(im)
-
-    :param im: input image with HWC layout or HW layout for gray image
-    :type im: ndarray
-    :param is_color: whether input image is color or not
-    :type is_color: bool
-    """
-    if len(im.shape) == 3 and is_color:
-        return im[:, ::-1, :]
-    else:
-        return im[:, ::-1]
-
-
-def simple_transform(im,
-                     resize_size,
-                     crop_size,
-                     is_train,
-                     is_color=True,
-                     mean=None):
-    """
-    Simply data argumentation for training. These operations include
-    resizing, croping and flipping.
-
-    Example usage:
-
-    .. code-block:: python
-
-        im = simple_transform(im, 256, 224, True)
-
-    :param im: The input image with HWC layout.
-    :type im: ndarray
-    :param resize_size: The shorter edge length of the resized image.
-    :type resize_size: int
-    :param crop_size: The cropping size.
-    :type crop_size: int
-    :param is_train: Whether it is training or not.
-    :type is_train: bool
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or
-                 mean values per channel.
-    :type mean: numpy array | list
-    """
-    im = resize_short(im, resize_size)
-    if is_train:
-        im = random_crop(im, crop_size, is_color=is_color)
-        if np.random.randint(2) == 0:
-            im = left_right_flip(im, is_color)
-    else:
-        im = center_crop(im, crop_size, is_color=is_color)
-    if len(im.shape) == 3:
-        im = to_chw(im)
-
-    im = im.astype('float32')
-    if mean is not None:
-        mean = np.array(mean, dtype=np.float32)
-        # mean value, may be one value per channel
-        if mean.ndim == 1 and is_color:
-            mean = mean[:, np.newaxis, np.newaxis]
-        elif mean.ndim == 1:
-            mean = mean
-        else:
-            # elementwise mean
-            assert len(mean.shape) == len(im)
-        im -= mean
-
-    return im
-
-
-def load_and_transform(filename,
-                       resize_size,
-                       crop_size,
-                       is_train,
-                       is_color=True,
-                       mean=None):
-    """
-    Load image from the input file `filename` and transform image for
-    data argumentation. Please refer to the `simple_transform` interface
-    for the transform operations.
-
-    Example usage:
-
-    .. code-block:: python
-
-        im = load_and_transform('cat.jpg', 256, 224, True)
-
-    :param filename: The file name of input image.
-    :type filename: string
-    :param resize_size: The shorter edge length of the resized image.
-    :type resize_size: int
-    :param crop_size: The cropping size.
-    :type crop_size: int
-    :param is_train: Whether it is training or not.
-    :type is_train: bool
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or
-                 mean values per channel.
-    :type mean: numpy array | list
-    """
-    im = load_image(filename, is_color)
-    im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
-    return im
diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py
deleted file mode 100644
index 99f4adc35c128dd44844411a4634082582ce7413..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/imdb.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-IMDB dataset.
-
-This module downloads IMDB dataset from
-http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
-of 25,000 highly polar movie reviews for training, and 25,000 for testing.
-Besides, this module also provides API for building dictionary.
-"""
-
-from __future__ import print_function
-
-import paddle.dataset.common
-import collections
-import tarfile
-import re
-import string
-import six
-
-__all__ = ['build_dict', 'train', 'test']
-
-URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
-MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
-
-
-def tokenize(pattern):
-    """
-    Read files that match the given pattern.  Tokenize and yield each file.
-    """
-
-    with tarfile.open(paddle.dataset.common.download(URL, 'imdb', MD5)) as tarf:
-        # Note that we should use tarfile.next(), which does
-        # sequential access of member files, other than
-        # tarfile.extractfile, which does random access and might
-        # destroy hard disks.
-        tf = tarf.next()
-        while tf != None:
-            if bool(pattern.match(tf.name)):
-                # newline and punctuations removal and ad-hoc tokenization.
-                yield tarf.extractfile(tf).read().rstrip(six.b(
-                    "\n\r")).translate(
-                        None, six.b(string.punctuation)).lower().split()
-            tf = tarf.next()
-
-
-def build_dict(pattern, cutoff):
-    """
-    Build a word dictionary from the corpus. Keys of the dictionary are words,
-    and values are zero-based IDs of these words.
-    """
-    word_freq = collections.defaultdict(int)
-    for doc in tokenize(pattern):
-        for word in doc:
-            word_freq[word] += 1
-
-    # Not sure if we should prune less-frequent words here.
-    word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff]
-
-    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-    words, _ = list(zip(*dictionary))
-    word_idx = dict(list(zip(words, six.moves.range(len(words)))))
-    word_idx['<unk>'] = len(words)
-    return word_idx
-
-
-def reader_creator(pos_pattern, neg_pattern, word_idx):
-    UNK = word_idx['<unk>']
-    INS = []
-
-    def load(pattern, out, label):
-        for doc in tokenize(pattern):
-            out.append(([word_idx.get(w, UNK) for w in doc], label))
-
-    load(pos_pattern, INS, 0)
-    load(neg_pattern, INS, 1)
-
-    def reader():
-        for doc, label in INS:
-            yield doc, label
-
-    return reader
-
-
-def train(word_idx):
-    """
-    IMDB training set creator.
-
-    It returns a reader creator, each sample in the reader is an zero-based ID
-    sequence and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        re.compile("aclImdb/train/pos/.*\.txt$"),
-        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
-
-
-def test(word_idx):
-    """
-    IMDB test set creator.
-
-    It returns a reader creator, each sample in the reader is an zero-based ID
-    sequence and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        re.compile("aclImdb/test/pos/.*\.txt$"),
-        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
-
-
-def word_dict():
-    """
-    Build a word dictionary from the corpus.
-
-    :return: Word dictionary
-    :rtype: dict
-    """
-    return build_dict(
-        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
-
-
-def fetch():
-    paddle.dataset.common.download(URL, 'imdb', MD5)
diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py
deleted file mode 100644
index 83cde3526ea9e425581ac358cca1e9ab6d3da859..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/imikolov.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-imikolov's simple dataset.
-
-This module will download dataset from
-http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
-into paddle reader creators.
-"""
-
-from __future__ import print_function
-
-import paddle.dataset.common
-import collections
-import tarfile
-import six
-
-__all__ = ['train', 'test', 'build_dict']
-
-URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
-MD5 = '30177ea32e27c525793142b6bf2c8e2d'
-
-
-class DataType(object):
-    NGRAM = 1
-    SEQ = 2
-
-
-def word_count(f, word_freq=None):
-    if word_freq is None:
-        word_freq = collections.defaultdict(int)
-
-    for l in f:
-        for w in l.strip().split():
-            word_freq[w] += 1
-        word_freq['<s>'] += 1
-        word_freq['<e>'] += 1
-
-    return word_freq
-
-
-def build_dict(min_word_freq=50):
-    """
-    Build a word dictionary from the corpus,  Keys of the dictionary are words,
-    and values are zero-based IDs of these words.
-    """
-    train_filename = './simple-examples/data/ptb.train.txt'
-    test_filename = './simple-examples/data/ptb.valid.txt'
-    with tarfile.open(
-            paddle.dataset.common.download(paddle.dataset.imikolov.URL,
-                                           'imikolov',
-                                           paddle.dataset.imikolov.MD5)) as tf:
-        trainf = tf.extractfile(train_filename)
-        testf = tf.extractfile(test_filename)
-        word_freq = word_count(testf, word_count(trainf))
-        if '<unk>' in word_freq:
-            # remove <unk> for now, since we will set it as last index
-            del word_freq['<unk>']
-
-        word_freq = [
-            x for x in six.iteritems(word_freq) if x[1] > min_word_freq
-        ]
-
-        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-        words, _ = list(zip(*word_freq_sorted))
-        word_idx = dict(list(zip(words, six.moves.range(len(words)))))
-        word_idx['<unk>'] = len(words)
-
-    return word_idx
-
-
-def reader_creator(filename, word_idx, n, data_type):
-    def reader():
-        with tarfile.open(
-                paddle.dataset.common.download(
-                    paddle.dataset.imikolov.URL, 'imikolov',
-                    paddle.dataset.imikolov.MD5)) as tf:
-            f = tf.extractfile(filename)
-
-            UNK = word_idx['<unk>']
-            for l in f:
-                if DataType.NGRAM == data_type:
-                    assert n > -1, 'Invalid gram length'
-                    l = ['<s>'] + l.strip().split() + ['<e>']
-                    if len(l) >= n:
-                        l = [word_idx.get(w, UNK) for w in l]
-                        for i in six.moves.range(n, len(l) + 1):
-                            yield tuple(l[i - n:i])
-                elif DataType.SEQ == data_type:
-                    l = l.strip().split()
-                    l = [word_idx.get(w, UNK) for w in l]
-                    src_seq = [word_idx['<s>']] + l
-                    trg_seq = l + [word_idx['<e>']]
-                    if n > 0 and len(src_seq) > n: continue
-                    yield src_seq, trg_seq
-                else:
-                    assert False, 'Unknow data type'
-
-    return reader
-
-
-def train(word_idx, n, data_type=DataType.NGRAM):
-    """
-    imikolov training set creator.
-
-    It returns a reader creator, each sample in the reader is a word ID
-    tuple.
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :param n: sliding window size if type is ngram, otherwise max length of sequence
-    :type n: int
-    :param data_type: data type (ngram or sequence)
-    :type data_type: member variable of DataType (NGRAM or SEQ)
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n,
-                          data_type)
-
-
-def test(word_idx, n, data_type=DataType.NGRAM):
-    """
-    imikolov test set creator.
-
-    It returns a reader creator, each sample in the reader is a word ID
-    tuple.
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :param n: sliding window size if type is ngram, otherwise max length of sequence
-    :type n: int
-    :param data_type: data type (ngram or sequence)
-    :type data_type: member variable of DataType (NGRAM or SEQ)
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n,
-                          data_type)
-
-
-def fetch():
-    paddle.dataset.common.download(URL, "imikolov", MD5)
diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py
deleted file mode 100644
index f52ffa049bc4aea7f56cb16221682cedfb67fd92..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/mnist.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-MNIST dataset.
-
-This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
-parse training set and test set into paddle reader creators.
-"""
-
-from __future__ import print_function
-
-import paddle.dataset.common
-import gzip
-import numpy
-import struct
-from six.moves import range
-__all__ = ['train', 'test']
-
-URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/'
-TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
-TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
-TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
-TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
-TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
-TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
-TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
-TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
-
-
-def reader_creator(image_filename, label_filename, buffer_size):
-    def reader():
-        with gzip.GzipFile(image_filename, 'rb') as image_file:
-            img_buf = image_file.read()
-            with gzip.GzipFile(label_filename, 'rb') as label_file:
-                lab_buf = label_file.read()
-
-                step_label = 0
-
-                offset_img = 0
-                # read from Big-endian
-                # get file info from magic byte
-                # image file : 16B
-                magic_byte_img = '>IIII'
-                magic_img, image_num, rows, cols = struct.unpack_from(
-                    magic_byte_img, img_buf, offset_img)
-                offset_img += struct.calcsize(magic_byte_img)
-
-                offset_lab = 0
-                # label file : 8B
-                magic_byte_lab = '>II'
-                magic_lab, label_num = struct.unpack_from(magic_byte_lab,
-                                                          lab_buf, offset_lab)
-                offset_lab += struct.calcsize(magic_byte_lab)
-
-                while True:
-                    if step_label >= label_num:
-                        break
-                    fmt_label = '>' + str(buffer_size) + 'B'
-                    labels = struct.unpack_from(fmt_label, lab_buf, offset_lab)
-                    offset_lab += struct.calcsize(fmt_label)
-                    step_label += buffer_size
-
-                    fmt_images = '>' + str(buffer_size * rows * cols) + 'B'
-                    images_temp = struct.unpack_from(fmt_images, img_buf,
-                                                     offset_img)
-                    images = numpy.reshape(images_temp, (
-                        buffer_size, rows * cols)).astype('float32')
-                    offset_img += struct.calcsize(fmt_images)
-
-                    images = images / 255.0
-                    images = images * 2.0
-                    images = images - 1.0
-
-                    for i in range(buffer_size):
-                        yield images[i, :], int(labels[i])
-
-    return reader
-
-
-def train():
-    """
-    MNIST training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [-1, 1] and label in [0, 9].
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
-                                       TRAIN_IMAGE_MD5),
-        paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
-                                       TRAIN_LABEL_MD5), 100)
-
-
-def test():
-    """
-    MNIST test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [-1, 1] and label in [0, 9].
-
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5),
-        paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5),
-        100)
-
-
-def fetch():
-    paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
-    paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
-    paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
-    paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py
deleted file mode 100644
index eddd858ace863df7983089eee2a556193004c587..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/movielens.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Movielens 1-M dataset.
-
-Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
-movies, which was collected by GroupLens Research. This module will download
-Movielens 1-M dataset from
-http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
-set and test set into paddle reader creators.
-
-"""
-
-from __future__ import print_function
-
-import numpy as np
-import zipfile
-import paddle.dataset.common
-import re
-import random
-import functools
-import six
-import paddle.compat as cpt
-
-__all__ = [
-    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
-    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
-]
-
-age_table = [1, 18, 25, 35, 45, 50, 56]
-
-URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
-MD5 = 'c4d9eecfca2ab87c1945afe126590906'
-
-
-class MovieInfo(object):
-    """
-    Movie id, title and categories information are stored in MovieInfo.
-    """
-
-    def __init__(self, index, categories, title):
-        self.index = int(index)
-        self.categories = categories
-        self.title = title
-
-    def value(self):
-        """
-        Get information from a movie.
-        """
-        return [
-            self.index, [CATEGORIES_DICT[c] for c in self.categories],
-            [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
-        ]
-
-    def __str__(self):
-        return "<MovieInfo id(%d), title(%s), categories(%s)>" % (
-            self.index, self.title, self.categories)
-
-    def __repr__(self):
-        return self.__str__()
-
-
-class UserInfo(object):
-    """
-    User id, gender, age, and job information are stored in UserInfo.
-    """
-
-    def __init__(self, index, gender, age, job_id):
-        self.index = int(index)
-        self.is_male = gender == 'M'
-        self.age = age_table.index(int(age))
-        self.job_id = int(job_id)
-
-    def value(self):
-        """
-        Get information from a user.
-        """
-        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
-
-    def __str__(self):
-        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
-            self.index, "M"
-            if self.is_male else "F", age_table[self.age], self.job_id)
-
-    def __repr__(self):
-        return str(self)
-
-
-MOVIE_INFO = None
-MOVIE_TITLE_DICT = None
-CATEGORIES_DICT = None
-USER_INFO = None
-
-
-def __initialize_meta_info__():
-    fn = paddle.dataset.common.download(URL, "movielens", MD5)
-    global MOVIE_INFO
-    if MOVIE_INFO is None:
-        pattern = re.compile(r'^(.*)\((\d+)\)$')
-        with zipfile.ZipFile(file=fn) as package:
-            for info in package.infolist():
-                assert isinstance(info, zipfile.ZipInfo)
-                MOVIE_INFO = dict()
-                title_word_set = set()
-                categories_set = set()
-                with package.open('ml-1m/movies.dat') as movie_file:
-                    for i, line in enumerate(movie_file):
-                        line = cpt.to_text(line, encoding='latin')
-                        movie_id, title, categories = line.strip().split('::')
-                        categories = categories.split('|')
-                        for c in categories:
-                            categories_set.add(c)
-                        title = pattern.match(title).group(1)
-                        MOVIE_INFO[int(movie_id)] = MovieInfo(
-                            index=movie_id, categories=categories, title=title)
-                        for w in title.split():
-                            title_word_set.add(w.lower())
-
-                global MOVIE_TITLE_DICT
-                MOVIE_TITLE_DICT = dict()
-                for i, w in enumerate(title_word_set):
-                    MOVIE_TITLE_DICT[w] = i
-
-                global CATEGORIES_DICT
-                CATEGORIES_DICT = dict()
-                for i, c in enumerate(categories_set):
-                    CATEGORIES_DICT[c] = i
-
-                global USER_INFO
-                USER_INFO = dict()
-                with package.open('ml-1m/users.dat') as user_file:
-                    for line in user_file:
-                        line = cpt.to_text(line, encoding='latin')
-                        uid, gender, age, job, _ = line.strip().split("::")
-                        USER_INFO[int(uid)] = UserInfo(
-                            index=uid, gender=gender, age=age, job_id=job)
-    return fn
-
-
-def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
-    fn = __initialize_meta_info__()
-    np.random.seed(rand_seed)
-    with zipfile.ZipFile(file=fn) as package:
-        with package.open('ml-1m/ratings.dat') as rating:
-            for line in rating:
-                line = cpt.to_text(line, encoding='latin')
-                if (np.random.random() < test_ratio) == is_test:
-                    uid, mov_id, rating, _ = line.strip().split("::")
-                    uid = int(uid)
-                    mov_id = int(mov_id)
-                    rating = float(rating) * 2 - 5.0
-
-                    mov = MOVIE_INFO[mov_id]
-                    usr = USER_INFO[uid]
-                    yield usr.value() + mov.value() + [[rating]]
-
-
-def __reader_creator__(**kwargs):
-    return lambda: __reader__(**kwargs)
-
-
-train = functools.partial(__reader_creator__, is_test=False)
-test = functools.partial(__reader_creator__, is_test=True)
-
-
-def get_movie_title_dict():
-    """
-    Get movie title dictionary.
-    """
-    __initialize_meta_info__()
-    return MOVIE_TITLE_DICT
-
-
-def __max_index_info__(a, b):
-    if a.index > b.index:
-        return a
-    else:
-        return b
-
-
-def max_movie_id():
-    """
-    Get the maximum value of movie id.
-    """
-    __initialize_meta_info__()
-    return six.moves.reduce(__max_index_info__, list(MOVIE_INFO.values())).index
-
-
-def max_user_id():
-    """
-    Get the maximum value of user id.
-    """
-    __initialize_meta_info__()
-    return six.moves.reduce(__max_index_info__, list(USER_INFO.values())).index
-
-
-def __max_job_id_impl__(a, b):
-    if a.job_id > b.job_id:
-        return a
-    else:
-        return b
-
-
-def max_job_id():
-    """
-    Get the maximum value of job id.
-    """
-    __initialize_meta_info__()
-    return six.moves.reduce(__max_job_id_impl__,
-                            list(USER_INFO.values())).job_id
-
-
-def movie_categories():
-    """
-    Get movie categoriges dictionary.
-    """
-    __initialize_meta_info__()
-    return CATEGORIES_DICT
-
-
-def user_info():
-    """
-    Get user info dictionary.
-    """
-    __initialize_meta_info__()
-    return USER_INFO
-
-
-def movie_info():
-    """
-    Get movie info dictionary.
-    """
-    __initialize_meta_info__()
-    return MOVIE_INFO
-
-
-def unittest():
-    for train_count, _ in enumerate(train()()):
-        pass
-    for test_count, _ in enumerate(test()()):
-        pass
-
-    print(train_count, test_count)
-
-
-def fetch():
-    paddle.dataset.common.download(URL, "movielens", MD5)
-
-
-if __name__ == '__main__':
-    unittest()
diff --git a/python/paddle/dataset/mq2007.py b/python/paddle/dataset/mq2007.py
deleted file mode 100644
index d5740f30c898d5704636e1de9b2e1137d12e3c35..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/mq2007.py
+++ /dev/null
@@ -1,335 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-MQ2007 dataset
-
-MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross
-validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set,
-validation set and testing set.
-
-MQ2007 dataset from website
-http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parse training set and test set into paddle reader creators
-
-"""
-
-from __future__ import print_function
-
-import os
-import functools
-import rarfile
-from .common import download
-import numpy as np
-
-# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
-URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
-MD5 = "7be1640ae95c6408dab0ae7207bdc706"
-
-
-def __initialize_meta_info__():
-    """
-  download and extract the MQ2007 dataset
-  """
-    fn = fetch()
-    rar = rarfile.RarFile(fn)
-    dirpath = os.path.dirname(fn)
-    rar.extractall(path=dirpath)
-    return dirpath
-
-
-class Query(object):
-    """
-  queries used for learning to rank algorithms. It is created from relevance scores,  query-document feature vectors
-
-  Parameters:
-  ----------
-  query_id : int
-    query_id in dataset, mapping from query to relevance documents
-  relevance_score : int
-    relevance score of query and document pair
-  feature_vector : array, dense feature
-    feature in vector format
-  description : string
-    comment section in query doc pair data
-  """
-
-    def __init__(self,
-                 query_id=-1,
-                 relevance_score=-1,
-                 feature_vector=None,
-                 description=""):
-        self.query_id = query_id
-        self.relevance_score = relevance_score
-        if feature_vector is None:
-            self.feature_vector = []
-        else:
-            self.feature_vector = feature_vector
-        self.description = description
-
-    def __str__(self):
-        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
-                               " ".join(str(f) for f in self.feature_vector))
-        return string
-
-    # @classmethod
-    def _parse_(self, text):
-        """
-    parse line into Query
-    """
-        comment_position = text.find('#')
-        line = text[:comment_position].strip()
-        self.description = text[comment_position + 1:].strip()
-        parts = line.split()
-        if len(parts) != 48:
-            sys.stdout.write("expect 48 space split parts, get %d" %
-                             (len(parts)))
-            return None
-        # format : 0 qid:10 1:0.000272 2:0.000000 ....
-        self.relevance_score = int(parts[0])
-        self.query_id = int(parts[1].split(':')[1])
-        for p in parts[2:]:
-            pair = p.split(':')
-            self.feature_vector.append(float(pair[1]))
-        return self
-
-
-class QueryList(object):
-    """
-  group query into list, every item in list is a Query
-  """
-
-    def __init__(self, querylist=None):
-        self.query_id = -1
-        if querylist is None:
-            self.querylist = []
-        else:
-            self.querylist = querylist
-            for query in self.querylist:
-                if self.query_id == -1:
-                    self.query_id = query.query_id
-                else:
-                    if self.query_id != query.query_id:
-                        raise ValueError("query in list must be same query_id")
-
-    def __iter__(self):
-        for query in self.querylist:
-            yield query
-
-    def __len__(self):
-        return len(self.querylist)
-
-    def __getitem__(self, i):
-        return self.querylist[i]
-
-    def _correct_ranking_(self):
-        if self.querylist is None:
-            return
-        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)
-
-    def _add_query(self, query):
-        if self.query_id == -1:
-            self.query_id = query.query_id
-        else:
-            if self.query_id != query.query_id:
-                raise ValueError("query in list must be same query_id")
-        self.querylist.append(query)
-
-
-def gen_plain_txt(querylist):
-    """
-  gen plain text in list for other usage
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-
-  return :
-  ------
-  query_id : np.array, shape=(samples_num, )
-  label : np.array, shape=(samples_num, )
-  querylist : np.array, shape=(samples_num, feature_dimension)
-    """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    for query in querylist:
-        yield querylist.query_id, query.relevance_score, np.array(
-            query.feature_vector)
-
-
-def gen_point(querylist):
-    """
-  gen item in list for point-wise learning to rank algorithm
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-
-  return :
-  ------
-  label : np.array, shape=(samples_num, )
-  querylist : np.array, shape=(samples_num, feature_dimension)
-  """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    for query in querylist:
-        yield query.relevance_score, np.array(query.feature_vector)
-
-
-def gen_pair(querylist, partial_order="full"):
-    """
-  gen pair for pair-wise learning to rank algorithm
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-  pairtial_order : "full" or "neighbour"
-    there is redudant in all possiable pair combinations, which can be simplifed
-  gen pairs for neighbour items or the full partial order pairs
-
-  return :
-  ------
-  label : np.array, shape=(1)
-  query_left : np.array, shape=(1, feature_dimension)
-  query_right : same as left
-  """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    labels = []
-    docpairs = []
-
-    # C(n,2)
-    for i in range(len(querylist)):
-        query_left = querylist[i]
-        for j in range(i + 1, len(querylist)):
-            query_right = querylist[j]
-            if query_left.relevance_score > query_right.relevance_score:
-                labels.append([1])
-                docpairs.append([
-                    np.array(query_left.feature_vector),
-                    np.array(query_right.feature_vector)
-                ])
-            elif query_left.relevance_score < query_right.relevance_score:
-                labels.append([1])
-                docpairs.append([
-                    np.array(query_right.feature_vector),
-                    np.array(query_left.feature_vector)
-                ])
-    for label, pair in zip(labels, docpairs):
-        yield np.array(label), pair[0], pair[1]
-
-
-def gen_list(querylist):
-    """
-  gen item in list for list-wise learning to rank algorithm
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-
-  return :
-  ------
-  label : np.array, shape=(samples_num, )
-  querylist : np.array, shape=(samples_num, feature_dimension)
-  """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    relevance_score_list = [[query.relevance_score] for query in querylist]
-    feature_vector_list = [query.feature_vector for query in querylist]
-    yield np.array(relevance_score_list), np.array(feature_vector_list)
-
-
-def query_filter(querylists):
-    """
-    filter query get only document with label 0.
-    label 0, 1, 2 means the relevance score document with query
-    parameters :
-      querylist : QueyList list
-
-    return :
-      querylist : QueyList list
-    """
-    filter_query = []
-    for querylist in querylists:
-        relevance_score_list = [query.relevance_score for query in querylist]
-        if sum(relevance_score_list) != .0:
-            filter_query.append(querylist)
-    return filter_query
-
-
-def load_from_text(filepath, shuffle=False, fill_missing=-1):
-    """
-  parse data file into querys
-  """
-    prev_query_id = -1
-    querylists = []
-    querylist = None
-    fn = __initialize_meta_info__()
-    with open(os.path.join(fn, filepath)) as f:
-        for line in f:
-            query = Query()
-            query = query._parse_(line)
-            if query == None:
-                continue
-            if query.query_id != prev_query_id:
-                if querylist is not None:
-                    querylists.append(querylist)
-                querylist = QueryList()
-                prev_query_id = query.query_id
-            querylist._add_query(query)
-    if querylist is not None:
-        querylists.append(querylist)
-    return querylists
-
-
-def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
-    """
-  Parameters
-  --------
-  filename : string
-  fill_missing : fill the missing value. default in MQ2007 is -1
-
-  Returns
-  ------
-  yield
-    label query_left, query_right  # format = "pairwise"
-    label querylist # format = "listwise"
-  """
-    querylists = query_filter(
-        load_from_text(
-            filepath, shuffle=shuffle, fill_missing=fill_missing))
-    for querylist in querylists:
-        if format == "plain_txt":
-            yield next(gen_plain_txt(querylist))
-        elif format == "pointwise":
-            yield next(gen_point(querylist))
-        elif format == "pairwise":
-            for pair in gen_pair(querylist):
-                yield pair
-        elif format == "listwise":
-            yield next(gen_list(querylist))
-
-
-train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
-test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
-
-
-def fetch():
-    return download(URL, "MQ2007", MD5)
-
-
-if __name__ == "__main__":
-    fetch()
-    mytest = functools.partial(
-        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
-    for label, query in mytest():
-        print(label, query)
diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py
deleted file mode 100644
index 9a1eae3f82a7b45d482282a8afed8fb476ad6a6a..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/sentiment.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# /usr/bin/env python
-# -*- coding:utf-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-The script fetch and preprocess movie_reviews data set that provided by NLTK
-
-TODO(yuyang18): Complete dataset.
-"""
-
-from __future__ import print_function
-
-import six
-import collections
-from itertools import chain
-
-import nltk
-from nltk.corpus import movie_reviews
-
-import paddle.dataset.common
-
-__all__ = ['train', 'test', 'get_word_dict']
-NUM_TRAINING_INSTANCES = 1600
-NUM_TOTAL_INSTANCES = 2000
-
-
-def download_data_if_not_yet():
-    """
-    Download the data set, if the data set is not download.
-    """
-    try:
-        # make sure that nltk can find the data
-        if paddle.dataset.common.DATA_HOME not in nltk.data.path:
-            nltk.data.path.append(paddle.dataset.common.DATA_HOME)
-        movie_reviews.categories()
-    except LookupError:
-        print("Downloading movie_reviews data set, please wait.....")
-        nltk.download(
-            'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
-        print("Download data set success.....")
-        print("Path is " + nltk.data.find('corpora/movie_reviews').path)
-
-
-def get_word_dict():
-    """
-    Sorted the words by the frequency of words which occur in sample
-    :return:
-        words_freq_sorted
-    """
-    words_freq_sorted = list()
-    word_freq_dict = collections.defaultdict(int)
-    download_data_if_not_yet()
-
-    for category in movie_reviews.categories():
-        for field in movie_reviews.fileids(category):
-            for words in movie_reviews.words(field):
-                word_freq_dict[words] += 1
-    words_sort_list = list(six.iteritems(word_freq_dict))
-    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
-    for index, word in enumerate(words_sort_list):
-        words_freq_sorted.append((word[0], index))
-    return words_freq_sorted
-
-
-def sort_files():
-    """
-    Sorted the sample for cross reading the sample
-    :return:
-        files_list
-    """
-    files_list = list()
-    neg_file_list = movie_reviews.fileids('neg')
-    pos_file_list = movie_reviews.fileids('pos')
-    files_list = list(
-        chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
-    return files_list
-
-
-def load_sentiment_data():
-    """
-    Load the data set
-    :return:
-        data_set
-    """
-    data_set = list()
-    download_data_if_not_yet()
-    words_ids = dict(get_word_dict())
-    for sample_file in sort_files():
-        words_list = list()
-        category = 0 if 'neg' in sample_file else 1
-        for word in movie_reviews.words(sample_file):
-            words_list.append(words_ids[word.lower()])
-        data_set.append((words_list, category))
-    return data_set
-
-
-def reader_creator(data):
-    """
-    Reader creator, generate an iterator for data set
-    :param data:
-        train data set or test data set
-    """
-    for each in data:
-        yield each[0], each[1]
-
-
-def train():
-    """
-    Default training set reader creator
-    """
-    data_set = load_sentiment_data()
-    return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
-
-
-def test():
-    """
-    Default test set reader creator
-    """
-    data_set = load_sentiment_data()
-    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
-
-
-def fetch():
-    nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
diff --git a/python/paddle/dataset/tests/CMakeLists.txt b/python/paddle/dataset/tests/CMakeLists.txt
deleted file mode 100644
index 485c38a13b573664d8033c237272a10ebb7c9701..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-py_test(test_image SRCS test_image.py)
diff --git a/python/paddle/dataset/tests/cat.jpg b/python/paddle/dataset/tests/cat.jpg
deleted file mode 100644
index bc1fbbd371216b9904b522ed302700c79d2e4876..0000000000000000000000000000000000000000
Binary files a/python/paddle/dataset/tests/cat.jpg and /dev/null differ
diff --git a/python/paddle/dataset/tests/cifar_test.py b/python/paddle/dataset/tests/cifar_test.py
deleted file mode 100644
index 8e514f0fd9a18a7d512430111a8a11b942950d20..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/cifar_test.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.dataset.cifar
-import unittest
-
-
-class TestCIFAR(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        for l in reader():
-            self.assertEqual(l[0].size, 3072)
-            if l[1] > label:
-                label = l[1]
-            sum += 1
-        return sum, label
-
-    def test_test10(self):
-        instances, max_label_value = self.check_reader(
-            paddle.dataset.cifar.test10())
-        self.assertEqual(instances, 10000)
-        self.assertEqual(max_label_value, 9)
-
-    def test_train10(self):
-        instances, max_label_value = self.check_reader(
-            paddle.dataset.cifar.train10())
-        self.assertEqual(instances, 50000)
-        self.assertEqual(max_label_value, 9)
-
-    def test_test100(self):
-        instances, max_label_value = self.check_reader(
-            paddle.dataset.cifar.test100())
-        self.assertEqual(instances, 10000)
-        self.assertEqual(max_label_value, 99)
-
-    def test_train100(self):
-        instances, max_label_value = self.check_reader(
-            paddle.dataset.cifar.train100())
-        self.assertEqual(instances, 50000)
-        self.assertEqual(max_label_value, 99)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/dataset/tests/flowers_test.py b/python/paddle/dataset/tests/flowers_test.py
deleted file mode 100644
index 06a0a7761cfa10ca3211297d176e3e909332e271..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/flowers_test.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.dataset.flowers
-import unittest
-
-
-class TestFlowers(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        size = 224 * 224 * 3
-        for l in reader():
-            self.assertEqual(l[0].size, size)
-            if l[1] > label:
-                label = l[1]
-            sum += 1
-        return sum, label
-
-    def test_train(self):
-        instances, max_label_value = self.check_reader(
-            paddle.dataset.flowers.train())
-        self.assertEqual(instances, 6149)
-        self.assertEqual(max_label_value, 102)
-
-    def test_test(self):
-        instances, max_label_value = self.check_reader(
-            paddle.dataset.flowers.test())
-        self.assertEqual(instances, 1020)
-        self.assertEqual(max_label_value, 102)
-
-    def test_valid(self):
-        instances, max_label_value = self.check_reader(
-            paddle.dataset.flowers.valid())
-        self.assertEqual(instances, 1020)
-        self.assertEqual(max_label_value, 102)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/dataset/tests/imdb_test.py b/python/paddle/dataset/tests/imdb_test.py
deleted file mode 100644
index 415947e3477f2e5b9979588528f7cb6f799acf6a..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/imdb_test.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.dataset.imdb
-import unittest
-import re
-
-TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
-TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
-TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
-
-TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
-TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
-TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
-
-
-class TestIMDB(unittest.TestCase):
-    word_idx = None
-
-    def test_build_dict(self):
-        if self.word_idx == None:
-            self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150)
-
-        self.assertEqual(len(self.word_idx), 7036)
-
-    def check_dataset(self, dataset, expected_size):
-        if self.word_idx == None:
-            self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150)
-
-        sum = 0
-        for l in dataset(self.word_idx):
-            self.assertEqual(l[1], sum % 2)
-            sum += 1
-        self.assertEqual(sum, expected_size)
-
-    def test_train(self):
-        self.check_dataset(paddle.dataset.imdb.train, 25000)
-
-    def test_test(self):
-        self.check_dataset(paddle.dataset.imdb.test, 25000)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/dataset/tests/imikolov_test.py b/python/paddle/dataset/tests/imikolov_test.py
deleted file mode 100644
index 1f78a5dd4d1a09c3192bc8c144c5a78c8a214f3a..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/imikolov_test.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.dataset.imikolov
-import unittest
-
-WORD_DICT = paddle.dataset.imikolov.build_dict()
-
-
-class TestMikolov(unittest.TestCase):
-    def check_reader(self, reader, n):
-        for l in reader():
-            self.assertEqual(len(l), n)
-
-    def test_train(self):
-        n = 5
-        self.check_reader(paddle.dataset.imikolov.train(WORD_DICT, n), n)
-
-        first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
-            'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
-            'rake regatta rubens sim snack-food ssangyong swapo wachter'
-        first_line = [
-            WORD_DICT.get(ch, WORD_DICT['<unk>'])
-            for ch in first_line.split(' ')
-        ]
-        for l in paddle.dataset.imikolov.train(
-                WORD_DICT, n=-1,
-                data_type=paddle.dataset.imikolov.DataType.SEQ)():
-            read_line = l[0][1:]
-            break
-        self.assertEqual(first_line, read_line)
-
-    def test_test(self):
-        n = 5
-        self.check_reader(paddle.dataset.imikolov.test(WORD_DICT, n), n)
-
-        first_line = 'consumers may want to move their telephones a little '\
-                'closer to the tv set'
-        first_line = [
-            WORD_DICT.get(ch, WORD_DICT['<unk>'])
-            for ch in first_line.split(' ')
-        ]
-        for l in paddle.dataset.imikolov.test(
-                WORD_DICT, n=-1,
-                data_type=paddle.dataset.imikolov.DataType.SEQ)():
-            read_line = l[0][1:]
-            break
-        self.assertEqual(first_line, read_line)
-
-    def test_total(self):
-        _, idx = list(zip(*list(WORD_DICT.items())))
-        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/dataset/tests/mnist_test.py b/python/paddle/dataset/tests/mnist_test.py
deleted file mode 100644
index fbb5d926494e38283e78ec15381530e50f32915d..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/mnist_test.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.dataset.mnist
-import unittest
-
-
-class TestMNIST(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        for l in reader():
-            self.assertEqual(l[0].size, 784)
-            if l[1] > label:
-                label = l[1]
-            sum += 1
-        return sum, label
-
-    def test_train(self):
-        instances, max_label_value = self.check_reader(
-            paddle.dataset.mnist.train())
-        self.assertEqual(instances, 60000)
-        self.assertEqual(max_label_value, 9)
-
-    def test_test(self):
-        instances, max_label_value = self.check_reader(
-            paddle.dataset.mnist.test())
-        self.assertEqual(instances, 10000)
-        self.assertEqual(max_label_value, 9)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/dataset/tests/mq2007_test.py b/python/paddle/dataset/tests/mq2007_test.py
deleted file mode 100644
index ee0897e88f0d7ad089b7f7b68d31d04d96fa3e9d..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/mq2007_test.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.dataset.mq2007
-import unittest
-
-
-class TestMQ2007(unittest.TestCase):
-    def test_pairwise(self):
-        for label, query_left, query_right in paddle.dataset.mq2007.test(
-                format="pairwise"):
-            self.assertEqual(query_left.shape(), (46, ))
-            self.assertEqual(query_right.shape(), (46, ))
-
-    def test_listwise(self):
-        for label_array, query_array in paddle.dataset.mq2007.test(
-                format="listwise"):
-            self.assertEqual(len(label_array), len(query_array))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/dataset/tests/test_image.py b/python/paddle/dataset/tests/test_image.py
deleted file mode 100644
index 32d2eb17ae673e72bbee2fc3bb5e3b05f1b20074..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/test_image.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.dataset.image as image
-
-
-class Image(unittest.TestCase):
-    def test_resize_flip_chw(self):
-        # resize
-        im = image.load_image('cat.jpg')
-        im = image.resize_short(im, 256)
-        self.assertEqual(256, min(im.shape[:2]))
-        self.assertEqual(3, im.shape[2])
-
-        # flip
-        im = image.left_right_flip(im)
-        im2 = np.flip(im, 1)
-        self.assertEqual(im.all(), im2.all())
-
-        # to_chw
-        h, w, c = im.shape
-        im = image.to_chw(im)
-        self.assertEqual(c, im.shape[0])
-        self.assertEqual(h, im.shape[1])
-        self.assertEqual(w, im.shape[2])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py
deleted file mode 100644
index bb9830132e987370022df3192060de3e908a2e85..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/test_sentiment.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# /usr/bin/env python
-# -*- coding:utf-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import nltk
-import paddle.dataset.sentiment as st
-from nltk.corpus import movie_reviews
-
-
-class TestSentimentMethods(unittest.TestCase):
-    def test_get_word_dict(self):
-        word_dict = st.get_word_dict()[0:10]
-        test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4),
-                          ('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)]
-        for idx, each in enumerate(word_dict):
-            self.assertEqual(each, test_word_list[idx])
-        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
-
-    def test_sort_files(self):
-        last_label = ''
-        for sample_file in st.sort_files():
-            current_label = sample_file.split("/")[0]
-            self.assertNotEqual(current_label, last_label)
-            last_label = current_label
-
-    def test_data_set(self):
-        data_set = st.load_sentiment_data()
-        last_label = -1
-        for each in st.test():
-            self.assertNotEqual(each[1], last_label)
-            last_label = each[1]
-        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
-        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
-        self.assertEqual(
-            len(list(st.test())),
-            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/dataset/tests/voc2012_test.py b/python/paddle/dataset/tests/voc2012_test.py
deleted file mode 100644
index cddeb91cab2c0f90567f28f8258156e2bb654abc..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/voc2012_test.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.dataset.voc2012
-import unittest
-
-
-class TestVOC(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        for l in reader():
-            self.assertEqual(l[0].size, 3 * l[1].size)
-            sum += 1
-        return sum
-
-    def test_train(self):
-        count = self.check_reader(paddle.dataset.voc_seg.train())
-        self.assertEqual(count, 2913)
-
-    def test_test(self):
-        count = self.check_reader(paddle.dataset.voc_seg.test())
-        self.assertEqual(count, 1464)
-
-    def test_val(self):
-        count = self.check_reader(paddle.dataset.voc_seg.val())
-        self.assertEqual(count, 1449)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/dataset/tests/wmt16_test.py b/python/paddle/dataset/tests/wmt16_test.py
deleted file mode 100644
index be121bb10121967590c9e136e9a1964a133e934b..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/tests/wmt16_test.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.dataset.wmt16
-import unittest
-
-
-class TestWMT16(unittest.TestCase):
-    def checkout_one_sample(self, sample):
-        # train data has 3 field: source language word indices,
-        # target language word indices, and target next word indices.
-        self.assertEqual(len(sample), 3)
-
-        # test start mark and end mark in source word indices.
-        self.assertEqual(sample[0][0], 0)
-        self.assertEqual(sample[0][-1], 1)
-
-        # test start mask in target word indices
-        self.assertEqual(sample[1][0], 0)
-
-        # test en mask in target next word indices
-        self.assertEqual(sample[2][-1], 1)
-
-    def test_train(self):
-        for idx, sample in enumerate(
-                paddle.dataset.wmt16.train(
-                    src_dict_size=100000, trg_dict_size=100000)()):
-            if idx >= 10: break
-            self.checkout_one_sample(sample)
-
-    def test_test(self):
-        for idx, sample in enumerate(
-                paddle.dataset.wmt16.test(
-                    src_dict_size=1000, trg_dict_size=1000)()):
-            if idx >= 10: break
-            self.checkout_one_sample(sample)
-
-    def test_val(self):
-        for idx, sample in enumerate(
-                paddle.dataset.wmt16.validation(
-                    src_dict_size=1000, trg_dict_size=1000)()):
-            if idx >= 10: break
-            self.checkout_one_sample(sample)
-
-    def test_get_dict(self):
-        dict_size = 1000
-        word_dict = paddle.dataset.wmt16.get_dict("en", dict_size, True)
-        self.assertEqual(len(word_dict), dict_size)
-        self.assertEqual(word_dict[0], "<s>")
-        self.assertEqual(word_dict[1], "<e>")
-        self.assertEqual(word_dict[2], "<unk>")
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py
deleted file mode 100644
index 5bc9c1444d2b34f057cd92782eb50e5fc23916eb..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/uci_housing.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-UCI Housing dataset.
-
-This module will download dataset from
-https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
-parse training set and test set into paddle reader creators.
-"""
-
-from __future__ import print_function
-
-import numpy as np
-import six
-import tempfile
-import tarfile
-import os
-import paddle.dataset.common
-
-__all__ = ['train', 'test']
-
-URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data'
-MD5 = 'd4accdce7a25600298819f8e28e8d593'
-feature_names = [
-    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
-    'PTRATIO', 'B', 'LSTAT'
-]
-
-UCI_TRAIN_DATA = None
-UCI_TEST_DATA = None
-
-FLUID_URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fluid/fit_a_line.fluid.tar'
-FLUID_MD5_MODEL = '6e6dd637ccd5993961f68bfbde46090b'
-
-
-def feature_range(maximums, minimums):
-    import matplotlib
-    matplotlib.use('Agg')
-    import matplotlib.pyplot as plt
-    fig, ax = plt.subplots()
-    feature_num = len(maximums)
-    ax.bar(list(range(feature_num)),
-           maximums - minimums,
-           color='r',
-           align='center')
-    ax.set_title('feature scale')
-    plt.xticks(list(range(feature_num)), feature_names)
-    plt.xlim([-1, feature_num])
-    fig.set_figheight(6)
-    fig.set_figwidth(10)
-    if not os.path.exists('./image'):
-        os.makedirs('./image')
-    fig.savefig('image/ranges.png', dpi=48)
-    plt.close(fig)
-
-
-def load_data(filename, feature_num=14, ratio=0.8):
-    global UCI_TRAIN_DATA, UCI_TEST_DATA
-    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
-        return
-
-    data = np.fromfile(filename, sep=' ')
-    data = data.reshape(data.shape[0] // feature_num, feature_num)
-    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
-        axis=0) / data.shape[0]
-    feature_range(maximums[:-1], minimums[:-1])
-    for i in six.moves.range(feature_num - 1):
-        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
-    offset = int(data.shape[0] * ratio)
-    UCI_TRAIN_DATA = data[:offset]
-    UCI_TEST_DATA = data[offset:]
-
-
-def train():
-    """
-    UCI_HOUSING training set creator.
-
-    It returns a reader creator, each sample in the reader is features after
-    normalization and price number.
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    global UCI_TRAIN_DATA
-    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
-
-    def reader():
-        for d in UCI_TRAIN_DATA:
-            yield d[:-1], d[-1:]
-
-    return reader
-
-
-def test():
-    """
-    UCI_HOUSING test set creator.
-
-    It returns a reader creator, each sample in the reader is features after
-    normalization and price number.
-
-    :return: Test reader creator
-    :rtype: callable
-    """
-    global UCI_TEST_DATA
-    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
-
-    def reader():
-        for d in UCI_TEST_DATA:
-            yield d[:-1], d[-1:]
-
-    return reader
-
-
-def fluid_model():
-    parameter_tar = paddle.dataset.common.download(
-        FLUID_URL_MODEL, 'uci_housing', FLUID_MD5_MODEL, 'fit_a_line.fluid.tar')
-
-    tar = tarfile.TarFile(parameter_tar, mode='r')
-    dirpath = tempfile.mkdtemp()
-    tar.extractall(path=dirpath)
-
-    return dirpath
-
-
-def predict_reader():
-    """
-    It returns just one tuple data to do inference.
-
-    :return: one tuple data
-    :rtype: tuple
-    """
-    global UCI_TEST_DATA
-    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
-    return (UCI_TEST_DATA[0][:-1], )
-
-
-def fetch():
-    paddle.dataset.common.download(URL, 'uci_housing', MD5)
diff --git a/python/paddle/dataset/voc2012.py b/python/paddle/dataset/voc2012.py
deleted file mode 100644
index 50688937654ae72b77e1439f21a0d7c847d5e135..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/voc2012.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Image dataset for segmentation.
-The 2012 dataset contains images from 2008-2011 for which additional
-segmentations have been prepared. As in previous years the assignment
-to training/test sets has been maintained. The total number of images
-with segmentation has been increased from 7,062 to 9,993.
-"""
-
-from __future__ import print_function
-
-import tarfile
-import io
-import numpy as np
-from paddle.dataset.common import download
-from paddle.dataset.image import *
-from PIL import Image
-
-__all__ = ['train', 'test', 'val']
-
-VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\
-VOCtrainval_11-May-2012.tar'
-
-VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
-SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
-DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
-LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
-
-CACHE_DIR = 'voc2012'
-
-
-def reader_creator(filename, sub_name):
-
-    tarobject = tarfile.open(filename)
-    name2mem = {}
-    for ele in tarobject.getmembers():
-        name2mem[ele.name] = ele
-
-    def reader():
-        set_file = SET_FILE.format(sub_name)
-        sets = tarobject.extractfile(name2mem[set_file])
-        for line in sets:
-            line = line.strip()
-            data_file = DATA_FILE.format(line)
-            label_file = LABEL_FILE.format(line)
-            data = tarobject.extractfile(name2mem[data_file]).read()
-            label = tarobject.extractfile(name2mem[label_file]).read()
-            data = Image.open(io.BytesIO(data))
-            label = Image.open(io.BytesIO(label))
-            data = np.array(data)
-            label = np.array(label)
-            yield data, label
-
-    return reader
-
-
-def train():
-    """
-    Create a train dataset reader containing 2913 images in HWC order.
-    """
-    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval')
-
-
-def test():
-    """
-    Create a test dataset reader containing 1464 images in HWC order.
-    """
-    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train')
-
-
-def val():
-    """
-    Create a val dataset reader containing 1449 images in HWC order.
-    """
-    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val')
diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py
deleted file mode 100644
index 129e1129fb9f637d48772cbbf036d10a6cf241cf..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/wmt14.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-WMT14 dataset.
-The original WMT14 dataset is too large and a small set of data for set is
-provided. This module will download dataset from
-http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
-parse training set and test set into paddle reader creators.
-
-"""
-
-from __future__ import print_function
-
-import six
-import tarfile
-import gzip
-
-import paddle.dataset.common
-import paddle.compat as cpt
-
-__all__ = [
-    'train',
-    'test',
-    'get_dict',
-]
-
-URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
-                'cslm_joint_paper/data/dev+test.tgz')
-MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
-# this is a small set of data for test. The original data is too large and
-# will be add later.
-URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz')
-MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
-# BLEU of this trained model is 26.92
-URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
-MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
-
-START = "<s>"
-END = "<e>"
-UNK = "<unk>"
-UNK_IDX = 2
-
-
-def __read_to_dict(tar_file, dict_size):
-    def __to_dict(fd, size):
-        out_dict = dict()
-        for line_count, line in enumerate(fd):
-            if line_count < size:
-                out_dict[cpt.to_text(line.strip())] = line_count
-            else:
-                break
-        return out_dict
-
-    with tarfile.open(tar_file, mode='r') as f:
-        names = [
-            each_item.name for each_item in f
-            if each_item.name.endswith("src.dict")
-        ]
-        assert len(names) == 1
-        src_dict = __to_dict(f.extractfile(names[0]), dict_size)
-        names = [
-            each_item.name for each_item in f
-            if each_item.name.endswith("trg.dict")
-        ]
-        assert len(names) == 1
-        trg_dict = __to_dict(f.extractfile(names[0]), dict_size)
-        return src_dict, trg_dict
-
-
-def reader_creator(tar_file, file_name, dict_size):
-    def reader():
-        src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
-        with tarfile.open(tar_file, mode='r') as f:
-            names = [
-                each_item.name for each_item in f
-                if each_item.name.endswith(file_name)
-            ]
-            for name in names:
-                for line in f.extractfile(name):
-                    line = cpt.to_text(line)
-                    line_split = line.strip().split('\t')
-                    if len(line_split) != 2:
-                        continue
-                    src_seq = line_split[0]  # one source sequence
-                    src_words = src_seq.split()
-                    src_ids = [
-                        src_dict.get(w, UNK_IDX)
-                        for w in [START] + src_words + [END]
-                    ]
-
-                    trg_seq = line_split[1]  # one target sequence
-                    trg_words = trg_seq.split()
-                    trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
-
-                    # remove sequence whose length > 80 in training mode
-                    if len(src_ids) > 80 or len(trg_ids) > 80:
-                        continue
-                    trg_ids_next = trg_ids + [trg_dict[END]]
-                    trg_ids = [trg_dict[START]] + trg_ids
-
-                    yield src_ids, trg_ids, trg_ids_next
-
-    return reader
-
-
-def train(dict_size):
-    """
-    WMT14 training set creator.
-
-    It returns a reader creator, each sample in the reader is source language
-    word ID sequence, target language word ID sequence and next word ID
-    sequence.
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'train/train', dict_size)
-
-
-def test(dict_size):
-    """
-    WMT14 test set creator.
-
-    It returns a reader creator, each sample in the reader is source language
-    word ID sequence, target language word ID sequence and next word ID
-    sequence.
-
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'test/test', dict_size)
-
-
-def gen(dict_size):
-    return reader_creator(
-        paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'gen/gen', dict_size)
-
-
-def get_dict(dict_size, reverse=True):
-    # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
-    # else reverse = true, return dict = {'001':'a', '002':'b', ...}
-    tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
-    if reverse:
-        src_dict = {v: k for k, v in six.iteritems(src_dict)}
-        trg_dict = {v: k for k, v in six.iteritems(trg_dict)}
-    return src_dict, trg_dict
-
-
-def fetch():
-    paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
deleted file mode 100644
index 3e9007c8aaf6ab74dfd72bba968807bb2c0c9b95..0000000000000000000000000000000000000000
--- a/python/paddle/dataset/wmt16.py
+++ /dev/null
@@ -1,326 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-ACL2016 Multimodal Machine Translation. Please see this website for more
-details: http://www.statmt.org/wmt16/multimodal-task.html#task1
-
-If you use the dataset created for your task, please cite the following paper:
-Multi30K: Multilingual English-German Image Descriptions.
-
-@article{elliott-EtAl:2016:VL16,
- author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
- title     = {Multi30K: Multilingual English-German Image Descriptions},
- booktitle = {Proceedings of the 6th Workshop on Vision and Language},
- year      = {2016},
- pages     = {70--74},
- year      = 2016
-}
-"""
-
-from __future__ import print_function
-
-import os
-import six
-import tarfile
-import gzip
-from collections import defaultdict
-
-import paddle.dataset.common
-import paddle.compat as cpt
-
-__all__ = [
-    "train",
-    "test",
-    "validation",
-    "fetch",
-    "get_dict",
-]
-
-DATA_URL = ("http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gz")
-DATA_MD5 = "0c38be43600334966403524a40dcd81e"
-
-TOTAL_EN_WORDS = 11250
-TOTAL_DE_WORDS = 19220
-
-START_MARK = "<s>"
-END_MARK = "<e>"
-UNK_MARK = "<unk>"
-
-
-def __build_dict(tar_file, dict_size, save_path, lang):
-    word_dict = defaultdict(int)
-    with tarfile.open(tar_file, mode="r") as f:
-        for line in f.extractfile("wmt16/train"):
-            line = cpt.to_text(line)
-            line_split = line.strip().split("\t")
-            if len(line_split) != 2: continue
-            sen = line_split[0] if lang == "en" else line_split[1]
-            for w in sen.split():
-                word_dict[w] += 1
-
-    with open(save_path, "wb") as fout:
-        fout.write(
-            cpt.to_bytes("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)))
-        for idx, word in enumerate(
-                sorted(
-                    six.iteritems(word_dict), key=lambda x: x[1],
-                    reverse=True)):
-            if idx + 3 == dict_size: break
-            fout.write(cpt.to_bytes(word[0]))
-            fout.write(cpt.to_bytes('\n'))
-
-
-def __load_dict(tar_file, dict_size, lang, reverse=False):
-    dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
-                             "wmt16/%s_%d.dict" % (lang, dict_size))
-    if not os.path.exists(dict_path) or (
-            len(open(dict_path, "rb").readlines()) != dict_size):
-        __build_dict(tar_file, dict_size, dict_path, lang)
-
-    word_dict = {}
-    with open(dict_path, "rb") as fdict:
-        for idx, line in enumerate(fdict):
-            if reverse:
-                word_dict[idx] = cpt.to_text(line.strip())
-            else:
-                word_dict[cpt.to_text(line.strip())] = idx
-    return word_dict
-
-
-def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
-    src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
-                                        TOTAL_DE_WORDS))
-    trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
-                                        TOTAL_EN_WORDS))
-    return src_dict_size, trg_dict_size
-
-
-def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
-    def reader():
-        src_dict = __load_dict(tar_file, src_dict_size, src_lang)
-        trg_dict = __load_dict(tar_file, trg_dict_size,
-                               ("de" if src_lang == "en" else "en"))
-
-        # the indice for start mark, end mark, and unk are the same in source
-        # language and target language. Here uses the source language
-        # dictionary to determine their indices.
-        start_id = src_dict[START_MARK]
-        end_id = src_dict[END_MARK]
-        unk_id = src_dict[UNK_MARK]
-
-        src_col = 0 if src_lang == "en" else 1
-        trg_col = 1 - src_col
-
-        with tarfile.open(tar_file, mode="r") as f:
-            for line in f.extractfile(file_name):
-                line = cpt.to_text(line)
-                line_split = line.strip().split("\t")
-                if len(line_split) != 2:
-                    continue
-                src_words = line_split[src_col].split()
-                src_ids = [start_id] + [
-                    src_dict.get(w, unk_id) for w in src_words
-                ] + [end_id]
-
-                trg_words = line_split[trg_col].split()
-                trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
-
-                trg_ids_next = trg_ids + [end_id]
-                trg_ids = [start_id] + trg_ids
-
-                yield src_ids, trg_ids, trg_ids_next
-
-    return reader
-
-
-def train(src_dict_size, trg_dict_size, src_lang="en"):
-    """
-    WMT16 train set reader.
-
-    This function returns the reader for train data. Each sample the reader
-    returns is made up of three fields: the source language word index sequence,
-    target language word index sequence and next word index sequence.
-
-
-    NOTE:
-    The original like for training data is:
-    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz
-
-    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
-    using moses's tokenization script:
-    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
-
-    Args:
-        src_dict_size(int): Size of the source language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        trg_dict_size(int): Size of the target language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        src_lang(string): A string indicating which language is the source
-                          language. Available options are: "en" for English
-                          and "de" for Germany.
-
-    Returns:
-        callable: The train reader.
-    """
-
-    if src_lang not in ["en", "de"]:
-        raise ValueError("An error language type.  Only support: "
-                         "en (for English); de(for Germany).")
-    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
-                                                   src_lang)
-
-    return reader_creator(
-        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                "wmt16.tar.gz"),
-        file_name="wmt16/train",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
-
-
-def test(src_dict_size, trg_dict_size, src_lang="en"):
-    """
-    WMT16 test set reader.
-
-    This function returns the reader for test data. Each sample the reader
-    returns is made up of three fields: the source language word index sequence,
-    target language word index sequence and next word index sequence.
-
-    NOTE:
-    The original like for test data is:
-    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz
-
-    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
-    using moses's tokenization script:
-    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
-
-    Args:
-        src_dict_size(int): Size of the source language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        trg_dict_size(int): Size of the target language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        src_lang(string): A string indicating which language is the source
-                          language. Available options are: "en" for English
-                          and "de" for Germany.
-
-    Returns:
-        callable: The test reader.
-    """
-
-    if src_lang not in ["en", "de"]:
-        raise ValueError("An error language type. "
-                         "Only support: en (for English); de(for Germany).")
-
-    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
-                                                   src_lang)
-
-    return reader_creator(
-        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                "wmt16.tar.gz"),
-        file_name="wmt16/test",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
-
-
-def validation(src_dict_size, trg_dict_size, src_lang="en"):
-    """
-    WMT16 validation set reader.
-
-    This function returns the reader for validation data. Each sample the reader
-    returns is made up of three fields: the source language word index sequence,
-    target language word index sequence and next word index sequence.
-
-    NOTE:
-    The original like for validation data is:
-    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
-
-    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
-    using moses's tokenization script:
-    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
-
-    Args:
-        src_dict_size(int): Size of the source language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        trg_dict_size(int): Size of the target language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        src_lang(string): A string indicating which language is the source
-                          language. Available options are: "en" for English
-                          and "de" for Germany.
-
-    Returns:
-        callable: The validation reader.
-    """
-    if src_lang not in ["en", "de"]:
-        raise ValueError("An error language type. "
-                         "Only support: en (for English); de(for Germany).")
-    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
-                                                   src_lang)
-
-    return reader_creator(
-        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                "wmt16.tar.gz"),
-        file_name="wmt16/val",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
-
-
-def get_dict(lang, dict_size, reverse=False):
-    """
-    return the word dictionary for the specified language.
-
-    Args:
-        lang(string): A string indicating which language is the source
-                      language. Available options are: "en" for English
-                      and "de" for Germany.
-        dict_size(int): Size of the specified language dictionary.
-        reverse(bool): If reverse is set to False, the returned python
-                       dictionary will use word as key and use index as value.
-                       If reverse is set to True, the returned python
-                       dictionary will use index as key and word as value.
-
-    Returns:
-        dict: The word dictionary for the specific language.
-    """
-
-    if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS)
-    else: dict_size = min(dict_size, TOTAL_DE_WORDS)
-
-    dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
-                             "wmt16/%s_%d.dict" % (lang, dict_size))
-    assert os.path.exists(dict_path), "Word dictionary does not exist. "
-    "Please invoke paddle.dataset.wmt16.train/test/validation first "
-    "to build the dictionary."
-    tar_file = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16.tar.gz")
-    return __load_dict(tar_file, dict_size, lang, reverse)
-
-
-def fetch():
-    """download the entire dataset.
-    """
-    paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                      "wmt16.tar.gz")
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
deleted file mode 100644
index d0c32e26092f6ea25771279418582a24ea449ab2..0000000000000000000000000000000000000000
--- a/python/paddle/distributed/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
deleted file mode 100644
index a35ab93eb85860544cbc68bd8f2f9390ef7771fc..0000000000000000000000000000000000000000
--- a/python/paddle/distributed/launch.py
+++ /dev/null
@@ -1,289 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-paddle.distributed.launch is a module that spawns multiple distributed 
-process on each trainning node for gpu trainning.
-Usage:
-    In both of single node training or multiple node training, this module 
-launch a process on each of the given gpu card.
-    1. for single node trainning with all visible gpu cards:
-       python -m paddle.distributed.launch \
-         your_training_py (arg1 arg2 and all others)
-    
-    2. for single node trainning with [0,4) cards
-       python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \
-         your_training_py (arg1 arg2 and all others)
-    3. for mulitple node training such as two node:192.168.0.16, 192.168.0.17
-        on 192.168.0.16:
-            python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
-                --node_ip=192.168.0.16 \
-                your_training_py (arg1 arg2 and all others)
-        on 192.168.0.17:
-            python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
-                --node_ip=192.168.0.17 \
-                your_training_py (arg1 arg2 and all others)
-"""
-
-from __future__ import print_function
-import logging
-import sys
-from sys import version
-import subprocess
-import os
-import time
-import six
-import copy
-from argparse import ArgumentParser, REMAINDER
-import paddle.fluid as fluid
-
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-log_handler = logging.StreamHandler()
-log_format = logging.Formatter(
-    '%(asctime)s - %(filename)s:%(lineno)d - %(levelname)s: %(message)s')
-log_handler.setFormatter(log_format)
-logger.addHandler(log_handler)
-
-
-def _print_arguments(args):
-    print("-----------  Configuration Arguments -----------")
-    for arg, value in sorted(six.iteritems(vars(args))):
-        print("%s: %s" % (arg, value))
-    print("------------------------------------------------")
-
-
-def _parse_args():
-    """
-    Helper function parsing the command line options
-    @retval ArgumentParser
-    """
-    parser = ArgumentParser(
-        description='''start paddle training using multi-process mode.
-NOTE: your train program ***must*** run as distributed nccl2 mode,
-see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
-And your train program must read environment variables below in order to let different
-process init properly:
-FLAGS_selected_gpus
-PADDLE_TRAINER_ID
-PADDLE_CURRENT_ENDPOINT
-PADDLE_TRAINERS_NUM
-PADDLE_TRAINER_ENDPOINTS
-POD_IP (current node ip address, not needed for local training)
-''')
-
-    #Optional arguments for the launch helper
-    parser.add_argument(
-        "--cluster_node_ips",
-        type=str,
-        default="127.0.0.1",
-        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
-    parser.add_argument(
-        "--node_ip",
-        type=str,
-        default="127.0.0.1",
-        help="The current node ip. ")
-    parser.add_argument(
-        "--use_paddlecloud",
-        type=bool,
-        default="False",
-        help="wheter to use paddlecloud platform to run your multi-process job.")
-    parser.add_argument(
-        "--started_port",
-        type=int,
-        default=6170,
-        help="The trainer's started port on a single node")
-
-    parser.add_argument(
-        "--print_config",
-        type=bool,
-        default=True,
-        help="Print the config or not")
-
-    parser.add_argument(
-        "--selected_gpus",
-        type=str,
-        default=None,
-        help="It's for gpu trainning and the trainning process will run on the selected_gpus,"
-        "each process is bound to a single GPU. And if it's not setted, this module will use all the gpu cards for training."
-    )
-
-    parser.add_argument(
-        "--log_dir",
-        type=str,
-        help="The path for each process's log.If it's not setted, the log will printed to default pipe."
-    )
-
-    #positional
-    parser.add_argument(
-        "training_script",
-        type=str,
-        help="The full path to the single GPU training "
-        "program/script to be launched in parallel, "
-        "followed by all the arguments for the "
-        "training script")
-
-    #rest from the training program
-    parser.add_argument('training_script_args', nargs=REMAINDER)
-    return parser.parse_args()
-
-
-def terminate_procs(procs):
-    for p in procs:
-        if p.poll() is None:
-            p.terminate()
-
-
-def start_procs(args):
-    """
-    """
-    procs = []
-    log_fns = []
-
-    default_env = os.environ.copy()
-
-    current_node_ip = args.node_ip
-    node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
-    node_id = node_ips.index(current_node_ip)
-    if args.use_paddlecloud:
-        trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
-        if trainer_nums != 1:
-            #you can automatically get ip info while using paddlecloud multi nodes mode.
-            current_node_ip = os.getenv("POD_IP")
-            assert current_node_ip is not None, "POD_IP should not be None"
-            node_ips = os.getenv("PADDLE_TRAINERS")
-            assert node_ips is not None, "PADDLE_TRAINERS should not be None"
-            node_ips = node_ips.split(",")
-            node_id = os.getenv("PADDLE_TRAINER_ID")
-            assert node_id is not None, "PADDLE_TRAINER_ID should not be None"
-            node_id = int(node_id)
-
-            if args.node_ip != "127.0.0.1" and current_node_ip != args.node_ip:
-                logger.warning(
-                    "Please NOTE: When using paddlecloud, current_node_ip is \
-automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
-current_node_ip: {} from paddlecloud environment."
-                    .format(args.node_ip, current_node_ip))
-            if args.cluster_node_ips != "127.0.0.1" and args.cluster_node_ips != ",".join(
-                    node_ips):
-                logger.warning(
-                    "Please NOTE: When using paddlecloud, cluster_node_ips is \
-automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
-Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
-paddlecloud environment.".format(args.cluster_node_ips, node_ips))
-    num_nodes = len(node_ips)
-
-    if args.selected_gpus is None:
-        gpus_num = fluid.core.get_cuda_device_count()
-        selected_gpus = [str(x) for x in range(0, gpus_num)]
-    else:
-        selected_gpus = [x.strip() for x in args.selected_gpus.split(',')]
-    selected_gpus_num = len(selected_gpus)
-
-    trainers_endpoints = ""
-    for ip in node_ips:
-        for i in range(selected_gpus_num):
-            if trainers_endpoints != "":
-                trainers_endpoints += ","
-            trainers_endpoints += "%s:%d" % (ip, args.started_port + i)
-
-    nranks = num_nodes * selected_gpus_num
-
-    if args.print_config:
-        print("trainers_endpoints:", trainers_endpoints, ", node_id:", node_id,
-              ", current_node_ip:", current_node_ip, ", num_nodes:", num_nodes,
-              ", node_ips:", node_ips, ", nranks:", nranks)
-
-    current_env = copy.copy(default_env)
-    #paddle broadcast ncclUniqueId use socket, and
-    #proxy maybe make trainers unreachable, so delete them.
-    #if we set them to "", grpc will log error message "bad uri"
-    #so just delete them.
-    current_env.pop("http_proxy", None)
-    current_env.pop("https_proxy", None)
-
-    procs = []
-    cmds = []
-    for i in range(0, selected_gpus_num):
-        current_env.update({
-            "FLAGS_selected_gpus": "%s" % selected_gpus[i],
-            "PADDLE_TRAINER_ID": "%d" % (node_id * selected_gpus_num + i),
-            "PADDLE_CURRENT_ENDPOINT":
-            "%s:%d" % (current_node_ip, args.started_port + i),
-            "PADDLE_TRAINERS_NUM": "%d" % nranks,
-            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
-        })
-
-        if num_nodes > 1:
-            current_env.update({"FLAGS_sync_nccl_allreduce": "0"})
-
-        cmd = [sys.executable, "-u", args.training_script
-               ] + args.training_script_args
-
-        cmds.append(cmd)
-
-        if args.log_dir is not None:
-            os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
-            log_fns.append(fn)
-
-            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
-        else:
-            proc = subprocess.Popen(cmd, env=current_env)
-
-        procs.append(proc)
-
-    try:
-        alive = True
-        error = False
-        # wait all process finish or one error
-        while alive and not error:
-            alive = False
-            for p in procs:
-                ret = p.poll()
-                if ret is None:
-                    alive = True
-                elif ret != 0:
-                    error = True
-            time.sleep(1)
-
-        if error:
-            terminate_procs(procs)
-            exit(1)
-
-    except KeyboardInterrupt:
-        logger.warning("KeyboardInterrupt, exit")
-        terminate_procs(procs)
-        raise
-    except SystemExit:
-        logger.error("One trainer process abort, exit")
-        terminate_procs(procs)
-        raise
-    except:
-        logger.error("Trainer process abort, exit")
-        terminate_procs(procs)
-        raise
-    finally:
-        for fn in log_fns:
-            fn.close()
-
-
-def launch():
-    args = _parse_args()
-    if args.print_config:
-        _print_arguments(args)
-    start_procs(args)
-
-
-if __name__ == "__main__":
-    launch()
diff --git a/python/paddle/distributed/launch_ps.py b/python/paddle/distributed/launch_ps.py
deleted file mode 100644
index 4a6885c888897cb55fd1f8b961fa2adc7673c10c..0000000000000000000000000000000000000000
--- a/python/paddle/distributed/launch_ps.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from __future__ import unicode_literals
-import subprocess
-import sys
-import os
-import copy
-from argparse import ArgumentParser, REMAINDER
-
-
-def parse_args():
-    # Optional arguments for the launch helper
-    parser = ArgumentParser(description="Distributed training")
-    parser.add_argument(
-        "--cluster_node_ips",
-        type=str,
-        default="127.0.0.1",
-        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
-
-    parser.add_argument(
-        "--node_ip",
-        type=str,
-        default="127.0.0.1",
-        help="The current node ip. ")
-
-    parser.add_argument(
-        "--start_port",
-        type=int,
-        default=6170,
-        help="The trainer's start port on a single node")
-
-    parser.add_argument(
-        "--print_config",
-        type=bool,
-        default=True,
-        help="Print the config or not")
-
-    parser.add_argument(
-        "--endpoints", type=str, default="", help="User defined endpoints")
-
-    parser.add_argument(
-        "--worker_num", type=int, default=2, help="number of workers")
-
-    parser.add_argument(
-        "--server_num", type=int, default=2, help="number of servers")
-
-    parser.add_argument(
-        "--log_dir",
-        default="logs",
-        type=str,
-        help="The path for each process's log.If it's not setted, the log will printed to default pipe."
-    )
-
-    # positional
-    parser.add_argument(
-        "training_script",
-        type=str,
-        help="The full path to the single GPU training "
-        "program/script to be launched in parallel, "
-        "followed by all the arguments for the "
-        "training script")
-
-    # rest from the training program
-    parser.add_argument('training_script_args', nargs=REMAINDER)
-    return parser.parse_args()
-
-
-def start_procs(args):
-    worker_num = args.worker_num
-    server_num = args.server_num
-    start_port = args.start_port
-    default_env = os.environ.copy()
-    current_env = copy.copy(default_env)
-    current_env.pop("http_proxy", None)
-    current_env.pop("https_proxy", None)
-    procs = []
-    cmds = []
-    log_fns = []
-    ports = range(start_port, start_port + server_num, 1)
-    default_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
-    user_endpoints = ""
-    if args.endpoints == "":
-        user_endpoints = default_endpoints
-    else:
-        user_endpoints = args.endpoints
-    user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")]
-    user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")]
-    for i in range(server_num):
-        current_env.update({
-            "PADDLE_TRAINERS_NUM": str(server_num),
-            "PADDLE_PORT": ",".join(user_endpoints_port),
-            #"POD_IP": user_endpoints_ips[i],
-            "CURRENT_ENDPOINT":
-            user_endpoints_ips[i] + ":" + user_endpoints_port[i],
-            "PADDLE_PSERVERS": ",".join(user_endpoints_ips),
-            "PADDLE_TRAINING_ROLE": "PSERVER"
-        })
-        cmd = [sys.executable, "-u", args.training_script
-               ] + args.training_script_args
-        cmds.append(cmd)
-        print(cmd)
-        if args.log_dir is not None:
-            os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/serverlog.%d" % (args.log_dir, i), "w")
-            log_fns.append(fn)
-            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
-        else:
-            proc = subprocess.Popen(cmd, env=current_env)
-        procs.append(proc)
-
-    for i in range(worker_num):
-        current_env.update({
-            "PADDLE_PSERVERS": ",".join(user_endpoints_ips),
-            "PADDLE_PORT": ",".join(user_endpoints_port),
-            "PADDLE_TRAINERS_NUM": str(worker_num),
-            "PADDLE_TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(i)
-        })
-        cmd = [sys.executable, "-u", args.training_script
-               ] + args.training_script_args
-        print(cmd)
-        cmds.append(cmd)
-        if args.log_dir is not None:
-            os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
-            log_fns.append(fn)
-            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
-        else:
-            proc = subprocess.Popen(cmd, env=current_env)
-        procs.append(proc)
-
-    for i in range(0, len(procs)):
-        proc = procs[i]
-
-        proc.wait()
-        if len(log_fns) > 0:
-            log_fns[i].close()
-
-        if proc.returncode != 0:
-            raise subprocess.CalledProcessError(
-                returncode=procs[i].returncode, cmd=cmds[i])
-
-
-def launch():
-    args = parse_args()
-    if args.print_config:
-        start_procs(args)
-
-
-# server num, worker num        
-if __name__ == "__main__":
-    launch()
diff --git a/python/paddle/fluid/.gitignore b/python/paddle/fluid/.gitignore
deleted file mode 100644
index 80c1cf3fcb86aa50600e02e4765640a91560916e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-proto
-core.so
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
deleted file mode 100644
index 180fae663161a6b540ed7e91e14f8a05953bdec5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/__init__.py
+++ /dev/null
@@ -1,222 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import os
-import sys
-
-# The legacy core need to be removed before "import core",
-# in case of users installing paddlepadde without -U option
-core_suffix = 'so'
-if os.name == 'nt':
-    core_suffix = 'pyd'
-
-legacy_core = os.path.abspath(os.path.dirname(
-    __file__)) + os.sep + 'core.' + core_suffix
-if os.path.exists(legacy_core):
-    sys.stderr.write('Deleting legacy file ' + legacy_core + '\n')
-    try:
-        os.remove(legacy_core)
-    except Exception as e:
-        raise e
-
-# import all class inside framework into fluid module
-from . import framework
-from .framework import *
-# import all class inside executor into fluid module
-from . import executor
-from .executor import *
-
-from . import data_feed_desc
-from .data_feed_desc import *
-
-from . import dataset
-from .dataset import *
-
-from . import trainer_desc
-from . import inferencer
-
-from . import io
-from . import evaluator
-from . import initializer
-from . import layers
-from . import dygraph
-from . import contrib
-from . import nets
-from . import optimizer
-from . import backward
-from .backward import gradients
-from . import regularizer
-from . import average
-from . import metrics
-from . import transpiler
-from . import incubate
-from .input import embedding, one_hot
-from . import distribute_lookup_table
-from .param_attr import ParamAttr, WeightNormParamAttr
-from .data_feeder import DataFeeder
-from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
-from .incubate import fleet
-from .incubate import data_generator
-from .transpiler import DistributeTranspiler, \
-    memory_optimize, release_memory, DistributeTranspilerConfig
-from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
-from . import clip
-from . import dygraph_grad_clip
-from . import profiler
-from . import unique_name
-from . import parallel_executor
-from .parallel_executor import *
-from . import compiler
-from .compiler import *
-from paddle.fluid.layers.math_op_patch import monkey_patch_variable
-from . import install_check
-from .dygraph.nn import *
-from .dygraph.layers import *
-
-Tensor = LoDTensor
-
-__all__ = framework.__all__ + executor.__all__ + \
-    trainer_desc.__all__ + inferencer.__all__ + transpiler.__all__ + \
-    parallel_executor.__all__ + lod_tensor.__all__ + \
-    data_feed_desc.__all__ + compiler.__all__ + backward.__all__ + [
-        'io',
-        'initializer',
-        'embedding',
-        'one_hot',
-        'layers',
-        'contrib',
-        'dygraph',
-        'transpiler',
-        'nets',
-        'optimizer',
-        'learning_rate_decay',
-        'backward',
-        'regularizer',
-        'LoDTensor',
-        'LoDTensorArray',
-        'CPUPlace',
-        'CUDAPlace',
-        'CUDAPinnedPlace',
-        'Tensor',
-        'ParamAttr',
-        'WeightNormParamAttr',
-        'DataFeeder',
-        'clip',
-        'dygraph_grad_clip',
-        'profiler',
-        'unique_name',
-        'Scope',
-        'install_check',
-    ]
-
-
-def __bootstrap__():
-    """
-    Enable reading gflags from environment variables.
-
-    Returns:
-        None
-    """
-    import sys
-    import os
-    import platform
-    from . import core
-
-    in_test = 'unittest' in sys.modules
-
-    try:
-        num_threads = int(os.getenv('OMP_NUM_THREADS', '1'))
-    except ValueError:
-        num_threads = 1
-
-    if num_threads > 1:
-        print(
-            'WARNING: OMP_NUM_THREADS set to {0}, not 1. The computation '
-            'speed will not be optimized if you use data parallel. It will '
-            'fail if this PaddlePaddle binary is compiled with OpenBlas since'
-            ' OpenBlas does not support multi-threads.'.format(num_threads),
-            file=sys.stderr)
-        print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr)
-
-    os.environ['OMP_NUM_THREADS'] = str(num_threads)
-    sysstr = platform.system()
-    read_env_flags = [
-        'check_nan_inf', 'fast_check_nan_inf', 'benchmark',
-        'eager_delete_scope', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
-        'paddle_num_threads', 'dist_threadpool_size', 'eager_delete_tensor_gb',
-        'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
-        'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
-        'enable_parallel_graph', 'fuse_parameter_groups_size',
-        'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
-        'tracer_profile_fname', 'dygraph_debug'
-    ]
-    if 'Darwin' not in sysstr:
-        read_env_flags.append('use_pinned_memory')
-
-    if os.name != 'nt':
-        read_env_flags.append('cpu_deterministic')
-
-    if core.is_compiled_with_mkldnn():
-        read_env_flags.append('use_mkldnn')
-
-    if core.is_compiled_with_ngraph():
-        read_env_flags.append('use_ngraph')
-
-    if core.is_compiled_with_dist():
-        #env for rpc
-        read_env_flags.append('rpc_deadline')
-        read_env_flags.append('rpc_retry_times')
-        read_env_flags.append('rpc_server_profile_path')
-        read_env_flags.append('enable_rpc_profiler')
-        read_env_flags.append('rpc_send_thread_num')
-        read_env_flags.append('rpc_get_thread_num')
-        read_env_flags.append('rpc_prefetch_thread_num')
-        read_env_flags.append('rpc_disable_reuse_port')
-
-        # env for communicator
-        read_env_flags.append('communicator_independent_recv_thread')
-        read_env_flags.append('communicator_send_queue_size')
-        read_env_flags.append('communicator_min_send_grad_num_before_recv')
-        read_env_flags.append('communicator_thread_pool_size')
-        read_env_flags.append('communicator_max_merge_var_num')
-        read_env_flags.append('communicator_fake_rpc')
-        read_env_flags.append('communicator_send_wait_times')
-        read_env_flags.append('communicator_merge_sparse_grad')
-        if core.is_compiled_with_brpc():
-            read_env_flags.append('max_body_size')
-            #set brpc max body size
-            os.environ['FLAGS_max_body_size'] = "2147483647"
-
-    if core.is_compiled_with_cuda():
-        read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
-            'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
-            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search', 'selected_gpus', 'sync_nccl_allreduce',
-            'cudnn_batchnorm_spatial_persistent', 'gpu_allocator_retry_time',
-            'local_exe_sub_scope_limit'
-        ]
-    core.init_gflags([sys.argv[0]] +
-                     ["--tryfromenv=" + ",".join(read_env_flags)])
-    core.init_glog(sys.argv[0])
-    # don't init_p2p when in unittest to save time.
-    core.init_devices(not in_test)
-
-
-# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
-# Consider paddle.init(args) or paddle.main(args)
-monkey_patch_variable()
-__bootstrap__()
diff --git a/python/paddle/fluid/annotations.py b/python/paddle/fluid/annotations.py
deleted file mode 100644
index 15e7976354f2a22065f1723bfa696d056181dac2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/annotations.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import functools
-import sys
-
-__all__ = ['deprecated']
-
-
-def deprecated(since, instead, extra_message=""):
-    def decorator(func):
-        err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format(
-            func.__name__, since, instead)
-        if len(extra_message) != 0:
-            err_msg += "\n"
-            err_msg += extra_message
-
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            print(err_msg, file=sys.stderr)
-            return func(*args, **kwargs)
-
-        wrapper.__doc__ += "\n    "
-        wrapper.__doc__ += err_msg
-        return wrapper
-
-    return decorator
diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py
deleted file mode 100644
index a7d64d37bc7a0aea7a68ec3fdfcce42551de7ae7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/average.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import warnings
-"""
-    Class of all kinds of Average.
-
-    All Averages are accomplished via Python totally. 
-    They do not change Paddle's Program, nor do anything to
-    modify NN model's configuration. They are completely 
-    wrappers of Python functions.
-"""
-
-__all__ = ["WeightedAverage"]
-
-
-def _is_number_(var):
-    return isinstance(var, int) or isinstance(var, float) or (isinstance(
-        var, np.ndarray) and var.shape == (1, ))
-
-
-def _is_number_or_matrix_(var):
-    return _is_number_(var) or isinstance(var, np.ndarray)
-
-
-class WeightedAverage(object):
-    """
-    Calculate weighted average.
-
-    The average calculating is accomplished via Python totally. 
-    They do not change Paddle's Program, nor do anything to
-    modify NN model's configuration. They are completely 
-    wrappers of Python functions.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            avg = fluid.average.WeightedAverage()
-            avg.add(value=2.0, weight=1)
-            avg.add(value=4.0, weight=2)
-            avg.eval()
-
-            # The result is 3.333333333.
-            # For (2.0 * 1 + 4.0 * 2) / (1 + 2) = 3.333333333
-    """
-
-    def __init__(self):
-        warnings.warn(
-            "The %s is deprecated, please use fluid.metrics.Accuracy instead." %
-            (self.__class__.__name__), Warning)
-        self.reset()
-
-    def reset(self):
-        self.numerator = None
-        self.denominator = None
-
-    def add(self, value, weight):
-        if not _is_number_or_matrix_(value):
-            raise ValueError(
-                "The 'value' must be a number(int, float) or a numpy ndarray.")
-        if not _is_number_(weight):
-            raise ValueError("The 'weight' must be a number(int, float).")
-
-        if self.numerator is None or self.denominator is None:
-            self.numerator = value * weight
-            self.denominator = weight
-        else:
-            self.numerator += value * weight
-            self.denominator += weight
-
-    def eval(self):
-        if self.numerator is None or self.denominator is None:
-            raise ValueError(
-                "There is no data to be averaged in WeightedAverage.")
-        return self.numerator / self.denominator
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
deleted file mode 100644
index 07d7c9d19df2331abca1b8721a4f9cd0fd9342e5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/backward.py
+++ /dev/null
@@ -1,1321 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from paddle.fluid import framework as framework
-from . import core
-import collections
-import copy
-import six
-from .. import compat as cpt
-from . import unique_name
-
-__all__ = [
-    'append_backward',
-    'gradients',
-]
-
-
-class ProgramStats(object):
-    def __init__(self, block, ops):
-        self.block = block
-        self.ops = ops
-        self.op_deps = {}  # op-> in_ops, out_ops
-        self.var_op_deps = {}  # var as input op, var as output op
-
-    def get_input_nodes(self):
-        input_names = []
-        for name in self.var_op_deps:
-            if len(self.var_op_deps[name]["var_as_output_ops"]) <= 0 and \
-               len(self.var_op_deps[name]["var_as_input_ops"]) > 0:
-                if self.block.var(name).persistable:
-                    continue
-                input_names.append(name)
-        for op in self.ops:
-            if op.desc.type() == "read":
-                input_names.extend(op.desc.output_arg_names())
-        return input_names
-
-    def get_reserved_vars(self):
-        var_name = []
-        for op in self.ops:
-            if op.desc.type() == "dropout":
-                var_name.extend(op.desc.output_arg_names())
-        return var_name
-
-    def get_out_of_subgraph_vars(self, begin_op_idx, end_op_idx):
-        var_name = []
-        for i in range(begin_op_idx, end_op_idx, 1):
-            for name in self.ops[i].desc.output_arg_names():
-                if name in self.var_op_deps:
-                    for idx in self.var_op_deps[name]["var_as_input_ops"]:
-                        if idx >= end_op_idx:
-                            var_name.append(name)
-        return var_name
-
-    def is_subgraph(self, var_group1, var_group2):
-        # should traverse from var_group1 to var_group2
-        # max op idx in var_group2
-        # min op idx in var_group1
-        min_op_idx = len(self.ops)
-        max_op_idx = -1
-        for name in var_group1:
-            if name not in self.var_op_deps:
-                return False, min_op_idx, max_op_idx
-        for name in var_group2:
-            if name not in self.var_op_deps:
-                return False, min_op_idx, max_op_idx
-        for name in var_group1:
-            op_idx = self.var_op_deps[name]["var_as_input_ops"]
-            for idx in op_idx:
-                min_op_idx = min(min_op_idx, idx)
-        for name in var_group2:
-            op_idx = self.var_op_deps[name]["var_as_output_ops"]
-            for idx in op_idx:
-                max_op_idx = max(max_op_idx, idx)
-        if min_op_idx >= max_op_idx:
-            return False, min_op_idx, max_op_idx
-        return True, min_op_idx, max_op_idx
-
-    def build_stats(self):
-        for i, op in enumerate(self.ops):
-            self.op_deps[i] = {"in_ops": [], "out_ops": []}
-            for j, name in enumerate(op.desc.input_arg_names()):
-                if name in self.var_op_deps:
-                    self.op_deps[i]["in_ops"].extend(self.var_op_deps[name][
-                        "var_as_output_ops"])
-            for j, name in enumerate(op.desc.input_arg_names()):
-                if name in self.var_op_deps:
-                    self.var_op_deps[name]["var_as_input_ops"].extend([i])
-                else:
-                    self.var_op_deps[name] = {}
-                    self.var_op_deps[name]["var_as_input_ops"] = [i]
-                    self.var_op_deps[name]["var_as_output_ops"] = []
-
-            for j, name in enumerate(op.desc.output_arg_names()):
-                if name in self.var_op_deps:
-                    self.var_op_deps[name]["var_as_output_ops"].extend([i])
-                else:
-                    self.var_op_deps[name] = {}
-                    self.var_op_deps[name]["var_as_input_ops"] = []
-                    self.var_op_deps[name]["var_as_output_ops"] = [i]
-
-            for op_idx in self.op_deps[i]["in_ops"]:
-                self.op_deps[op_idx]["out_ops"].extend([i])
-
-
-def _pretty_op_desc_(op_desc, prefix):
-    out_s = "%s\tname:[%s]\n%s    \tinputs:[%s]\n%s    \toutputs:[%s]" % \
-            (prefix + "_op", str(op_desc.type()), prefix + "_input", " ".join(op_desc.input_arg_names()),
-             prefix + "_output", " ".join(op_desc.output_arg_names()))
-    return out_s
-
-
-def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars):
-    if len(descs) == 0:
-        return []
-    result_descs = []
-    op_role_attr_name = \
-            core.op_proto_and_checker_maker.kOpRoleAttrName()
-    backward = core.op_proto_and_checker_maker.OpRole.Backward
-    for desc in descs:
-        if isinstance(desc, framework.Operator):
-            desc = desc.desc
-        if isinstance(desc, tuple):
-            desc = desc[0]
-        is_needed = False
-        for name in desc.output_arg_names():
-            if main_block.has_var(name) and main_block.var(name).persistable:
-                continue
-            if name not in in_memory_vars:
-                is_needed = True
-        if is_needed:
-            new_op_desc = block.desc.append_op()
-            new_op_desc.copy_from(desc)
-            new_op_desc._set_attr(op_role_attr_name, backward)
-            result_descs.append(new_op_desc)
-    return result_descs
-
-
-def _add_descs_to_block(descs, block):
-    if len(descs) == 0:
-        return []
-    result_descs = []
-    op_role_attr_name = \
-        core.op_proto_and_checker_maker.kOpRoleAttrName()
-    backward = core.op_proto_and_checker_maker.OpRole.Backward
-    for desc in descs:
-        if isinstance(desc, framework.Operator):
-            desc = desc.desc
-        if isinstance(desc, tuple):
-            desc = desc[0]
-        new_op_desc = block.desc.append_op()
-        new_op_desc.copy_from(desc)
-        new_op_desc._set_attr(op_role_attr_name, backward)
-        result_descs.append(new_op_desc)
-    return result_descs
-
-
-def _find_loss_op_(loss):
-    for op in reversed(loss.block.ops):
-        assert isinstance(op, framework.Operator)
-        if len(op.output_arg_names) == 1 and op.output_arg_names[
-                0] == loss.name:
-            loss.op = op
-            break
-    if loss.op is None:
-        raise ValueError("loss.op is None. Should not happend")
-
-
-def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
-    """
-    Traverse all ops in op_descs[begin_idx : end_idx],
-    if any op has inputs/outputs named "old_name", rename it as 'new_name'
-    """
-    if begin_idx is None:
-        begin_idx = 0
-    if end_idx is None:
-        end_idx = len(op_descs)
-    for i in range(begin_idx, end_idx):
-        op_desc = op_descs[i]
-        if isinstance(op_desc, tuple):
-            op_desc = op_desc[0]
-        op_desc._rename_input(old_name, new_name)
-        op_desc._rename_output(old_name, new_name)
-
-
-def _create_op_desc_(op_type, inputs, outputs, attrs):
-    """
-    Create a C++ OpDesc object with specified inputs, outputs and attributes.
-    """
-    op_desc = core.OpDesc()
-    op_desc.set_type(op_type)
-    for para, args in six.iteritems(inputs):
-        op_desc.set_input(
-            para,
-            list(
-                map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
-                    args)))
-    for para, args in six.iteritems(outputs):
-        op_desc.set_output(
-            para,
-            list(
-                map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
-                    args)))
-
-    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
-
-    if op_role_attr_name not in attrs:
-        attrs[
-            op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
-    for name, val in six.iteritems(attrs):
-        if isinstance(val, framework.Block):
-            op_desc.set_block_attr(name, val.desc)
-        else:
-            op_desc._set_attr(name, val)
-    return op_desc
-
-
-def _create_loss_op_desc_(loss):
-    op_desc = _create_op_desc_(
-        "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, {
-            "shape": [1],
-            "value": 1.0,
-            "dtype": loss.dtype,
-            "force_cpu": False,
-            core.op_proto_and_checker_maker.kOpRoleAttrName():
-            int(core.op_proto_and_checker_maker.OpRole.Backward) |
-            int(core.op_proto_and_checker_maker.OpRole.Loss),
-        })
-    return op_desc
-
-
-def _infer_var_data_type_(grad_var_name, block):
-    """
-    Infer the data type of given grad variable
-    """
-    grad_var = block.desc.find_var(cpt.to_bytes(grad_var_name))
-    fwd_name = _strip_grad_suffix_(grad_var_name)
-    if block.desc.has_var_recursive(cpt.to_bytes(fwd_name)):
-        fwd_var = block.desc.find_var_recursive(cpt.to_bytes(fwd_name))
-        grad_var.set_dtype(fwd_var.dtype())
-    else:
-        grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-
-def _all_in_set_(cands, s):
-    """
-    Test if all elements of 'cands' are in set 's'
-    """
-    if len(cands) == 0:
-        return False
-    for c in cands:
-        if not c in s:
-            return False
-    return True
-
-
-def _some_in_set_(cands, s):
-    """
-    Test if some elements of 'cands' are in set 's'
-    """
-    if len(cands) == 0:
-        return False
-    literal_set = cpt.to_text(s)
-    literal_cands = cpt.to_text(cands)
-    for c in literal_cands:
-        if c in literal_set:
-            return True
-    return False
-
-
-def _strip_grad_suffix_(name):
-    """
-    Strip the grad suffix from the given variable name
-    e.g. x@GRAD ==> x
-         y@GRAD@RENAME@1 ==> y
-    """
-    name = cpt.to_text(name)
-    pos = name.find(core.grad_var_suffix())
-    return name[:pos] if pos != -1 else name
-
-
-def _append_grad_suffix_(name):
-    """
-    Append grad suffix to the given variable name
-    e.g. x ==> x@GRAD
-    """
-    return cpt.to_text(name) + core.grad_var_suffix()
-
-
-def _addup_repetitive_outputs_(op_descs):
-    """
-    In backward part, an variable may be the output of more than one ops.
-    And one op may yield its multiple outputs to the same variable.
-    In these cases, the variable should be the accumulation of all the outputs.
-    `sum_op`s are added to implement the accumulate.
-    """
-    pending_sum_ops = []
-    var_rename_count = collections.defaultdict(int)
-    renamed_vars = collections.defaultdict(list)
-    renamed_var_start_idx = collections.defaultdict(list)
-    for idx, op_desc in enumerate(op_descs):
-        for var_name in op_desc.input_arg_names():
-            if "@GRAD" not in var_name:
-                continue
-            if len(renamed_vars[var_name]) > 1:
-                pending_sum_ops.append((_create_op_desc_(
-                    "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
-                    {"use_mkldnn": False}), idx))
-                renamed_vars[var_name] = [var_name]
-        for param_idx, param_name in enumerate(op_desc.output_names()):
-            arg_names = op_desc.output(param_name)
-            for arg_idx, var_name in enumerate(arg_names):
-                if "@GRAD" not in var_name:
-                    continue
-                #if "@RENAME@" in var_name:
-                #    continue
-                if var_name == core.empty_var_name(
-                ) or var_name in op_desc.input_arg_names():
-                    # empty variable or inplace op
-                    continue
-                if len(renamed_vars[var_name]) == 0:
-                    # it's the first time we get the variable
-                    renamed_vars[var_name] = [var_name]
-                    renamed_var_start_idx[var_name] = idx
-                else:
-                    if len(renamed_vars[var_name]) == 1:
-                        new_name = var_name + "@RENAME@" + \
-                            str(var_rename_count[var_name])
-                        var_rename_count[var_name] += 1
-                        # rename original var_name
-                        renamed_vars[var_name][0] = new_name
-                        # before change: _rename_arg_(op_descs, var_name,
-                        #                             new_name, 0, idx)
-                        # rename arg from idx of the first appearance
-                        # in backward, not always from 0
-                        _rename_arg_(op_descs, var_name, new_name,
-                                     renamed_var_start_idx[var_name], idx)
-                        _rename_arg_(pending_sum_ops, var_name, new_name)
-
-                        for p in op_desc.output_names()[:param_idx]:
-                            p_arg_names = op_desc.output(p)
-                            if var_name in p_arg_names:
-                                op_desc.set_output(p, [
-                                    new_name if x == var_name else x
-                                    for x in p_arg_names
-                                ])
-
-                        arg_names = [
-                            new_name if x == var_name else x
-                            for x in arg_names[:arg_idx]
-                        ] + arg_names[arg_idx:]
-
-                    new_name = var_name + "@RENAME@" + \
-                        str(var_rename_count[var_name])
-                    var_rename_count[var_name] += 1
-                    arg_names[arg_idx] = new_name
-                    op_desc.set_output(param_name, arg_names)
-                    renamed_vars[var_name].append(new_name)
-
-    for var_name, inputs in six.iteritems(renamed_vars):
-        if len(inputs) > 1:
-            pending_sum_ops.append(
-                (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
-                                  {"use_mkldnn": False}), len(op_descs)))
-    # sum_op descs are sorted according to their insert position
-    for p in reversed(pending_sum_ops):
-        op_descs.insert(p[1], p[0])
-
-    return op_descs
-
-
-def _remove_no_grad_branch_(op_descs, no_grad_set):
-    """
-    Remove unnecessary grad ops
-    A grad op can be removed in two cases:
-        1. all outputs of the grad op are in 'no_grad_set'
-        2. all grad inputs of the grad op are in 'no_grad_set'
-    """
-
-    def _op_can_be_removed_(op_desc, no_grad_set):
-        out_arg_names = op_desc.output_arg_names()
-        if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
-            return True
-        if _all_in_set_([
-                name for name in op_desc.input_arg_names()
-                if name.find(core.grad_var_suffix()) != -1
-        ], no_grad_set):
-            no_grad_set.update(out_arg_names)
-            return True
-        return False
-
-    # Remove ops whose outputs are all in no_grad_dict
-    op_descs = [
-        op_desc for op_desc in op_descs
-        if not _op_can_be_removed_(op_desc, no_grad_set)
-    ]
-    # Insert fill_zeros_like_op
-    to_insert = []
-    for idx, op_desc in enumerate(op_descs):
-        for arg in op_desc.input_arg_names():
-            # arg is a gradient var name and arg should not have gradient
-            if core.grad_var_suffix() in arg and arg in no_grad_set:
-                x_in = _strip_grad_suffix_(arg)
-                # the reason should be: arg can be input of another grad op
-                # and the op is a not-to-remove op
-                to_insert.append((_create_op_desc_(
-                    "fill_zeros_like", {"X": [x_in]}, {"Out": [arg]}, {}), idx))
-
-    list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)])
-
-    return op_descs
-
-
-def _find_not_need_ops(grad_op_descs, forward_ops, input_grad_names_set):
-    """
-    Pruning Program with Structural Analysis Method of Computational Graph.
-    The nodes of the computational graph composed of backward OPS should be
-    interconnected. If there are unconnected sub-graphs in the computational graph,
-    these sub-graphs should be cut off.
-
-    Args:
-        grad_op_descs(list[core.OpDesc]): The candidate backward OpDescs.
-        forward_ops(list[Operator]): The forward ops.
-        input_grad_names_set(set): this set is used to store the gradients' name
-            which is generated by backward ops, and input_grad_names_set can help
-            to prune the unnecessary backward ops.
-
-    Return:
-        (list[core.OpDesc]): A list of OpDescs which should be pruned.
-    """
-
-    class Var(object):
-        def __init__(self, var_name):
-            self.var_name = var_name
-            self.gen_op = None
-            self.pendding_ops = []
-
-        def set_gen_op(self, gen_op):
-            assert isinstance(gen_op, Op)
-            assert self.gen_op is None
-            self.gen_op = gen_op
-
-        def add_pending_op(self, op):
-            assert isinstance(op, Op)
-            self.pendding_ops.append(op)
-
-    class Op(object):
-        def __init__(self, op_desc):
-            self.op_desc = op_desc
-            self.inputs = []
-            self.outputs = []
-
-        def insert_input(self, var):
-            assert isinstance(var, Var)
-            self.inputs.append(var)
-
-        def insert_output(self, var):
-            assert isinstance(var, Var)
-            self.outputs.append(var)
-
-    var_versions = dict()
-
-    def _create_node(name):
-        if name not in var_versions.keys():
-            var_versions[name] = [Var(name)]
-        else:
-            var_versions[name].append(Var(name))
-        return var_versions[name][-1]
-
-    def _create_or_get_last_version_node(name):
-        if name not in var_versions.keys():
-            var_versions[name] = [Var(name)]
-        return var_versions[name][-1]
-
-    def _create_op_node(op_desc):
-        op_node = Op(op_desc)
-        for input in op_desc.input_arg_names():
-            var = _create_or_get_last_version_node(name=input)
-            var.add_pending_op(op_node)
-            op_node.insert_input(var)
-        for output in op_desc.output_arg_names():
-            var = _create_node(name=output)
-            var.set_gen_op(op_node)
-            op_node.insert_output(var)
-        return op_node
-
-    # Record the forward vars
-    forward_vars_set = set() if input_grad_names_set is None else set(
-        input_grad_names_set)
-    for op in forward_ops:
-        forward_vars_set.update(op.desc.input_arg_names())
-        forward_vars_set.update(op.desc.output_arg_names())
-
-    # Record the vars which are created during backward and is not generated by op.
-    backward_vars_set = set()
-    # special_op_nodes is the candidate sub-graph head node.
-    special_op_nodes = set()
-    for op_desc in grad_op_descs:
-        input_set = set(op_desc.input_arg_names())
-        # The new_vars are created during backward and is not generated by op.
-        new_vars = input_set - forward_vars_set - backward_vars_set
-        backward_vars_set.update(op_desc.output_arg_names())
-
-        op_node = _create_op_node(op_desc)
-        if len(new_vars) == len(input_set):
-            special_op_nodes.add(op_node)
-
-    not_need_op_descs = []
-    # Start traversing all candidate sub-graph headers to check whether
-    # they are connected to backward computational graphs, and if they are
-    # not, list them in not_need_op_descs
-    for special_op_node in special_op_nodes:
-        op_list = [special_op_node]
-        ready_vars = set(special_op_node.inputs)
-        remove_ops = True
-        candidate_ops = [special_op_node]
-        while len(candidate_ops) > 0:
-            op_node = candidate_ops.pop(0)
-            if _all_in_set_(op_node.inputs, ready_vars):
-                for out_var in op_node.outputs:
-                    candidate_ops.extend(out_var.pendding_ops)
-                    op_list.extend(out_var.pendding_ops)
-                ready_vars.update(op_node.outputs)
-            else:
-                remove_ops = False
-                break
-        if remove_ops:
-            not_need_op_descs.extend([node.op_desc for node in op_list])
-
-    return set(not_need_op_descs)
-
-
-from .proto import framework_pb2
-
-
-def serialize_op_decs(op_desc):
-    protostr = op_desc.serialize_to_string()
-    proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
-    return proto.__str__()
-
-
-def _append_backward_ops_with_checkpoints_(
-        block, ops, target_block, no_grad_dict, grad_to_var, checkpoints):
-
-    checkpoints_name = [x.name for x in checkpoints]
-    """
-    Create grad ops with forward ops, and insert them into given block
-
-    Args:
-        block(Block): the block where forward ops are
-        ops(Op): the forward operators whose forward recomputation backward ops need to be added
-        target_block(Block): the block which is going to hold new generated grad ops
-        no_grad_dict(dict):
-            key(int) block index
-            val(str): corresponding forward variable name
-        checkpoints: variables that a user defined as checkpoint for forward recomputation
-
-    Algorithms:
-        1) go through all forward ops and induct all checkpoint vars
-            a. input variables can be deduced from forward program
-            b. input variables are checkpoints
-            c. variables that are used across segments will be held in memory
-        2) find ops between checkpoints, i.e. recompute_segments
-        3) go through each recompute_segments, add backward ops with forward recomputation
-            a. add ops in current recompute_segment as forward recomputation ops
-            b. rename all non-checkpoint variables in recomputation ops
-            c. add sum_op to merge gradient if needed
-            d. add backward ops of current recomputation ops
-        4) remove no grad branch as it is in _remove_no_grad_branch_
-        5) Note1: all appended ops' OpRole are Backward
-        6) Note2: variables that are used across segments will be held in memory
-        7) Note3: all variables with new name should be returned so that _append_backward_vars_ can be called
-        8) Note4: current forward recomputation backpropagation does not handle programs with subblock
-    """
-    local_block = block.program._create_block()
-    buffer_block = block.program._create_block()
-
-    program_stat = ProgramStats(block, ops)
-    program_stat.build_stats()
-    segments = []
-
-    if len(checkpoints) == 1:
-        # only one checkpoint
-        max_op_idx = -1
-        var_group = [checkpoints_name[0]]
-        for name in var_group:
-            if name not in program_stat.var_op_deps:
-                break
-            op_idx = program_stat.var_op_deps[name]["var_as_output_ops"]
-            for idx in op_idx:
-                max_op_idx = max(max_op_idx, idx)
-        if max_op_idx > 0:
-            segments.append([0, max_op_idx + 1])
-    else:
-        start_idx = 0
-        while True:
-            if start_idx >= len(checkpoints_name) - 1:
-                break
-            flag, min_idx, max_idx = program_stat.is_subgraph(
-                [checkpoints_name[start_idx]],
-                [checkpoints_name[start_idx + 1]])
-            if flag:
-                segments.append([min_idx, max_idx + 1])
-            start_idx += 1
-
-    checkpoints_name = list(set(checkpoints_name))
-
-    if segments != [] and segments[0][0] != 0:
-        recompute_segments = [[0, segments[0][0]]] + segments
-    else:
-        recompute_segments = segments
-    vars_should_be_hold = []
-    for segment in recompute_segments:
-        vars_should_be_hold.extend(
-            program_stat.get_out_of_subgraph_vars(segment[0], segment[1]))
-    vars_should_be_hold.extend(program_stat.get_reserved_vars())
-    vars_should_be_hold.extend(program_stat.get_input_nodes())
-    vars_should_be_hold = list(set(vars_should_be_hold))
-
-    # find variables that can not be deleted
-    grad_should_be_hold = [x + "@GRAD" for x in vars_should_be_hold]
-    vars_should_be_hold.extend(grad_should_be_hold)
-
-    grad_op_descs = []
-    var_name_dict = {}
-
-    vars_in_memory = vars_should_be_hold + checkpoints_name
-
-    max_calculated_op_position = len(ops)
-    if recompute_segments == []:
-        gap_ops = ops[0:max_calculated_op_position]
-        for op in reversed(gap_ops):
-            if op.has_attr("sub_block"):
-                raise Exception("Recompute don't support ops with sub_block"
-                                "invoke op: %s" %
-                                _pretty_op_desc_(op.desc, "with_sub_block"))
-            grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
-                op.desc, cpt.to_text(no_grad_dict[block.idx]), [])
-            added_descs = _add_descs_to_block(grad_op_desc, local_block)
-            grad_op_descs.extend(added_descs)
-            grad_to_var.update(op_grad_to_var)
-
-    for i, segment in enumerate(recompute_segments[::-1]):
-        # add grad op for ops not in any segments
-        gap_ops = ops[segment[1]:max_calculated_op_position]
-        max_calculated_op_position = segment[0]
-        for op in reversed(gap_ops):
-            if op.has_attr("sub_block"):
-                raise Exception("Recompute don't support ops with sub_block"
-                                "invoke op: %s" %
-                                _pretty_op_desc_(op.desc, "with_sub_block"))
-            grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
-                op.desc, cpt.to_text(no_grad_dict[block.idx]), [])
-            added_descs = _add_descs_to_block(grad_op_desc, local_block)
-            grad_op_descs.extend(added_descs)
-            grad_to_var.update(op_grad_to_var)
-
-        ff_ops = ops[segment[0]:segment[1]]
-        var_suffix = ".subprog_%d" % i
-
-        for op in ff_ops:
-            if op.has_attr("sub_block"):
-                raise Exception("Recompute don't support ops with sub_block"
-                                "invoke op: %s" %
-                                _pretty_op_desc_(op.desc, "with_sub_block"))
-            input_and_output_names = []
-            input_and_output_names.extend(op.desc.input_arg_names())
-            input_and_output_names.extend(op.desc.output_arg_names())
-            for name in input_and_output_names:
-                if block.var(name).persistable or name in checkpoints_name:
-                    continue
-                if name in vars_should_be_hold:
-                    continue
-                if name not in var_name_dict:
-                    var_name_dict[name] = name + var_suffix
-        buffer_descs = _add_needed_descs_to_block(ff_ops, buffer_block, block,
-                                                  vars_in_memory)
-        added_descs = _add_descs_to_block(ff_ops, local_block)
-
-        # rename variable names in added_descs
-        for key in var_name_dict:
-            _rename_arg_(buffer_descs, key, var_name_dict[key])
-
-        # added_descs should be in grad_op_descs because it is backward op desc
-        grad_op_descs.extend(buffer_descs)
-
-        #for op_desc in reversed(buffer_descs):
-        for op_desc in reversed(added_descs):
-
-            grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
-                op_desc, cpt.to_text(no_grad_dict[block.idx]), [])
-
-            for key in var_name_dict:
-                _rename_arg_(grad_op_desc, key, var_name_dict[key])
-
-            grad_op_descs.extend(grad_op_desc)
-            grad_to_var.update(op_grad_to_var)
-
-    grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
-    grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
-                                            no_grad_dict[block.idx])
-    added_descs = _add_descs_to_block(grad_op_descs, target_block)
-    return program_stat, checkpoints_name, vars_should_be_hold, recompute_segments
-
-
-def _append_backward_ops_(block,
-                          ops,
-                          target_block,
-                          no_grad_dict,
-                          grad_to_var,
-                          callbacks=None,
-                          input_grad_names_set=None):
-    """
-    Create all grad ops, and insert them into given block
-
-    Args:
-        block(Block): the block where forward ops are
-        ops(Op): the forward operators whose backward ops need to be added
-        target_block(Block): the block which is going to hold new generated grad ops
-        no_grad_dict(dict):
-            key(int)  block index
-            val(set) a set of varibale names. These varibales have no gradient
-        grad_to_var(dict)(output argument):
-            key(str): grad variable name
-            val(str): corresponding forward variable name
-        callbacks(callable object): a callable object used to decorate new generated grad ops
-        input_grad_names_set(set): this set is used to store the gradients' name which is
-            generated by backward ops, and input_grad_names_set can help to prune the unnecessary
-            backward ops.
-    """
-    if callbacks is not None:
-        assert (isinstance(callbacks, list))
-        for cb in callbacks:
-            if not hasattr(cb, '__call__'):
-                raise ValueError("'callback' must be a callable object.")
-
-    # grad_op_descs holds created grad_op, and will be appended to target_block
-    grad_op_descs = []
-    program = block.program
-    for op in reversed(ops):
-        grad_sub_block_list = []
-        # If the op has its own sub-block, deal with the sub-block first
-        if op.has_attr("sub_block"):
-            sub_block = program.block(op._block_attr_id("sub_block"))
-            grad_sub_block = program._create_block()
-            grad_sub_block._set_forward_block_idx(sub_block.idx)
-            # see follwing comments for why set None here.
-            pre_input_grad_names_set = copy.copy(input_grad_names_set)
-            input_grad_names_set = None
-            _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
-                                  no_grad_dict, grad_to_var, callbacks,
-                                  input_grad_names_set)
-            input_grad_names_set = pre_input_grad_names_set
-
-            program._rollback()
-            grad_sub_block_list.append(grad_sub_block.desc)
-
-        # Getting op's corresponding grad_op
-        grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
-            op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
-
-        # If input_grad_names_set is not None, extend grad_op_descs only when
-        # any input grad in outputs of previous grad ops.
-        # But this strategy is not suited for while op for some control flow,
-        # for example, for while op, the grads maybe generated in next loop.
-        if input_grad_names_set is not None:
-            is_append_grad = False
-            for op_desc in grad_op_desc:
-                input_grad_names = [
-                    name for name in op_desc.input_arg_names()
-                    if name.find(core.grad_var_suffix()) != -1
-                ]
-                # some code of gradient ops, like increment, are not very
-                # standard, there is no @GRAD in these ops' inputs.
-                if len(input_grad_names) == 0:
-                    is_append_grad = True
-                    break
-
-                if _some_in_set_(input_grad_names, input_grad_names_set):
-                    grad_op_descs.append(op_desc)
-                    is_append_grad = True
-                    for name in op_desc.output_arg_names():
-                        input_grad_names_set.add(name)
-            if is_append_grad:
-                grad_to_var.update(op_grad_to_var)
-        else:
-            grad_op_descs.extend(grad_op_desc)
-            grad_to_var.update(op_grad_to_var)
-
-    # add grad_op_desc by reversed ops
-
-    # sum parameter's gradients' var given multiple var gradient
-    grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
-
-    # if all outputs of the grad op are in no_grad_set, then just remove and fill zero
-    # if all inputs of the grad op are in no_grad_set, just remove this op
-    grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
-                                            no_grad_dict[block.idx])
-
-    # remove some backward ops
-    not_need_ops = _find_not_need_ops(grad_op_descs, ops, input_grad_names_set)
-
-    grad_op_descs = [
-        op_desc for op_desc in grad_op_descs if op_desc not in not_need_ops
-    ]
-    # append op_desc in grad_op_descs to target_block
-    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
-    backward = core.op_proto_and_checker_maker.OpRole.Backward
-    for op_desc in grad_op_descs:
-        new_op_desc = target_block.desc.append_op()
-        new_op_desc.copy_from(op_desc)
-        new_op_desc._set_attr(op_role_attr_name, backward)
-        grad_to_var["__current_op_desc__"] = new_op_desc
-        if callbacks is not None:
-            assert (isinstance(callbacks, list))
-            for cb in callbacks:
-                cb(block=target_block, context=grad_to_var)
-
-
-def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
-    """
-    Create new variables required by backward pass.
-
-    Args:
-        block(Block): the block where new variables will be created
-        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
-        grad_to_var(dict):
-            key(str): grad variable name
-            val(str): corresponding forward variable name
-            In most cases, this dict is generated by _append_backward_ops_()
-        grad_info_map(dict)(output argument):
-            key(str): forward variable name
-            val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block containing grad variable
-    """
-    for op_idx in range(start_op_idx, block.desc.op_size()):
-        op_desc = block.desc.op(op_idx)
-        if op_desc.has_attr("sub_block"):
-            sub_block = block.program.block(op_desc._block_attr_id("sub_block"))
-            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
-        new_vars = set()
-        # create new gradient variables
-        for grad_var_name in op_desc.output_arg_names():
-            if block.desc.has_var_recursive(cpt.to_bytes(
-                    grad_var_name)) or grad_var_name == core.empty_var_name():
-                continue
-            block.desc.var(cpt.to_bytes(grad_var_name))
-            new_vars.add(grad_var_name)
-            if grad_var_name not in grad_to_var:
-                continue
-            grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
-        # infer_shape and infer_type
-        op_desc.infer_var_type(block.desc)
-        op_desc.infer_shape(block.desc)
-        for arg in op_desc.output_arg_names():
-            if arg in new_vars:
-                _infer_var_data_type_(arg, block)
-
-
-def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
-    var_map = copy.copy(target_grad_map)
-    for op_idx in range(start_op_idx, block.desc.op_size()):
-        op_desc = block.desc.op(op_idx)
-        for name in op_desc.input_arg_names():
-            if name in var_map:
-                op_desc._rename_input(name, var_map[name])
-
-        for name in op_desc.output_arg_names():
-            if "@GRAD" not in name:
-                continue
-            if block.desc.find_var(name.encode("ascii")):
-                new_name = unique_name.generate(name)
-                op_desc._rename_output(name, new_name)
-                var_map[name] = new_name
-
-    for g, ng in six.iteritems(var_map):
-        if g in grad_to_var:
-            grad_to_var[ng] = grad_to_var[g]
-            grad_to_var.pop(g)
-
-
-def _get_stop_gradients_(program):
-    no_grad_dict = dict()
-    assert isinstance(program, framework.Program)
-    for block in program.blocks:
-        assert isinstance(block, framework.Block)
-        block_no_grad_set = set()
-        for var in list(block.vars.values()):
-            assert isinstance(var, framework.Variable)
-            if var.stop_gradient:
-                block_no_grad_set.add(_append_grad_suffix_(var.name))
-        no_grad_dict[block.idx] = block_no_grad_set
-    return no_grad_dict
-
-
-def append_backward(loss,
-                    parameter_list=None,
-                    no_grad_set=None,
-                    callbacks=None,
-                    checkpoints=None):
-    """
-    Append backward part to main_program.
-
-    A complete neural network training is made up of forward and backward
-    propagation. However, when we configure a network, we only need to
-    specify its forwrd part. The backward part is generated automatically
-    according to the forward part by this function.
-
-    In most cases, users do not need to invoke this function manually. It
-    will be automatically invoked by the optimizer's `minimize` function.
-
-    Args:
-        loss(Variable): The loss variable of the network.
-        parameter_list(list[string]|None): Names of parameters that need
-                                           to be updated by optimizers.
-                                           If it is None, all parameters
-                                           will be updated.
-                                           Default: None
-        no_grad_set(set|None): Variables in the Block 0 whose gradients
-                               should be ignored. All variables with
-                               `stop_gradient=True` from all blocks will
-                               be automatically added into this set.
-                               Default: None
-        callbacks(list[callable object]|None): The callbacks are used for
-                                               doing some custom jobs during
-                                               backward part building. All
-                                               callable objects in it will
-                                               be invoked once each time a
-                                               new gradient operator is added
-                                               into the program. The callable
-                                               object must has two input
-                                               parameters: 'block' and 'context'.
-                                               The 'block' is the block which
-                                               the new gradient operator will
-                                               be added to. The 'context' is a
-                                               map, whose keys are gradient
-                                               variable names and values are
-                                               corresponding original variables.
-                                               In addition to this, the 'context'
-                                               has another special key-value pair:
-                                               the key is string '__current_op_desc__'
-                                               and the value is the op_desc of the
-                                               gradient operator who has just
-                                               triggered the callable object.
-
-    Returns:
-        list[(Variable,Variable)]: Pairs of parameter and its
-        corresponding gradients. The key is the parameter and the
-        value is gradient variable.
-
-    Raises:
-        AssertionError: If `loss` is not an instance of Variable.
-
-    Examples:
-        .. code-block:: python
-
-            # network configuration code
-            # loss from ...
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-            y_predict = fluid.layers.fc(input=x, size=1, act=None)
-            loss = fluid.layers.square_error_cost(input=y_predict, label=y)
-
-            avg_loss = fluid.layers.mean(loss)
-            param_grad_list = fluid.backward.append_backward(loss=avg_loss)
-    """
-    assert isinstance(loss, framework.Variable)
-
-    if loss.op is None:
-        # the loss is from a cloned program. Find loss op manually.
-        _find_loss_op_(loss)
-
-    loss.op._set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
-                      int(core.op_proto_and_checker_maker.OpRole.Forward) |
-                      int(core.op_proto_and_checker_maker.OpRole.Loss))
-
-    if callbacks is not None:
-        isinstance(callbacks, list)
-
-    program = loss.block.program
-    program._appending_grad_times += 1
-
-    if no_grad_set is None:
-        no_grad_set = set()
-    no_grad_set = copy.copy(no_grad_set)
-    no_grad_dict = _get_stop_gradients_(program)
-    no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
-
-    grad_info_map = dict()
-    root_block = program.block(0)
-
-    fwd_op_num = root_block.desc.op_size()
-    current_block_idx = program.current_block_idx
-    grad_to_var = dict()
-
-    op_desc = _create_loss_op_desc_(loss)
-    root_block.desc.append_op().copy_from(op_desc)
-
-    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
-    op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
-    no_grad_vars = _find_no_grad_vars(root_block, op_path, [loss],
-                                      block_no_grad_set)
-    block_no_grad_set.update(no_grad_vars)
-    no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
-
-    input_grad_names_set = None
-    # For double backward, input_grad_names is used for filter
-    # some non-used gradients op.
-    if program._appending_grad_times > 1:
-        input_grad_names_set = set([_append_grad_suffix_(loss.name)])
-
-
-    if checkpoints != None and \
-       isinstance(checkpoints, list) and \
-       len(checkpoints) > 0:
-        program_stat, checkpoint_names, \
-        vars_should_be_hold, \
-        recompute_segments = \
-                        _append_backward_ops_with_checkpoints_(
-                            root_block,
-                            op_path,
-                            root_block,
-                            no_grad_dict,
-                            grad_to_var,
-                            checkpoints)
-    else:
-        _append_backward_ops_(
-            root_block,
-            op_path,
-            root_block,
-            no_grad_dict,
-            grad_to_var,
-            callbacks,
-            input_grad_names_set=input_grad_names_set)
-
-    # Because calc_gradient may be called multiple times,
-    # we need rename the internal gradient variables so that they have
-    # different names.
-    _rename_grad_(root_block, fwd_op_num, grad_to_var, {})
-
-    _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
-
-    program.current_block_idx = current_block_idx
-    program._sync_with_cpp()
-
-    if parameter_list is not None:
-        parameters = parameter_list
-    else:
-        params = program.global_block().all_parameters()
-        parameters = [param.name for param in params if param.trainable]
-
-    params_and_grads = []
-    for param in parameters:
-        if cpt.to_text(param) not in grad_info_map:
-            continue
-        grad_info = grad_info_map[param]
-        grad_block = grad_info[1]
-        if not grad_block.has_var(grad_info[0]):
-            raise ValueError("grad block[{0}] did not have grad var {1}".format(
-                grad_info[1], grad_info[0]))
-        # Get the param var from the global block
-        param_var = program.global_block().var(param)
-        grad_var = grad_block.var(grad_info[0])
-        if loss.block.has_var(grad_info[0]):
-            params_and_grads.append((param_var, grad_var))
-        else:
-            params_and_grads.append((param_var, None))
-
-    op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
-    for p, g in params_and_grads:
-        if g is None:
-            continue
-        for op in reversed(program.global_block().ops):
-            assert isinstance(op, framework.Operator)
-            if g.name in op.output_arg_names:
-                g.op = op
-                break
-
-        if g.op is None:
-            raise ValueError("Unexpected branch")
-        attr_val = [p.name, g.name]
-        if g.op.has_attr(op_role_var_attr_name):
-            attr_val.extend(g.op.attr(op_role_var_attr_name))
-        g.op._set_attr(op_role_var_attr_name, attr_val)
-
-    return params_and_grads
-
-
-def _as_list(x):
-    if x is None:
-        return []
-    return list(x) if isinstance(x, collections.Sequence) else [x]
-
-
-def _find_no_grad_vars(block, op_path, targets, no_grad_set):
-    """
-    Find the vars which is not used in the program, and
-    those var belong to no_grad_var.
-    """
-    output_names = set([out.name for out in targets])
-    no_grad_var = []
-    for i, op in reversed(list(enumerate(op_path))):
-        # If the op has sub_block, it is too complicated to find the correct no_grad_var.
-        if not op.has_attr("sub_block"):
-            for out_var in op.desc.output_arg_names():
-                if out_var not in output_names and out_var not in op.desc.input_arg_names(
-                ) and not block.vars[out_var].stop_gradient:
-                    no_grad_var.append(out_var)
-        for name in op.desc.input_arg_names():
-            if name not in no_grad_set:
-                output_names.add(name)
-    return set(no_grad_var)
-
-
-def _find_op_path_(block, outputs, inputs, no_grad_set):
-    """
-    no_grad_set will also be changed
-    """
-    input_names = set([inp.name for inp in inputs])
-    output_names = set([out.name for out in outputs])
-
-    relevant_op_flags = [True] * len(block.ops)
-
-    # All the inputs of the block are used if inputs is empty,
-    if inputs:
-        for i, op in enumerate(block.ops):
-            if _some_in_set_(op.desc.input_arg_names(), input_names):
-                for name in op.desc.output_arg_names():
-                    if name not in no_grad_set:
-                        input_names.add(name)
-            else:
-                relevant_op_flags[i] = False
-
-    for i, op in reversed(list(enumerate(block.ops))):
-        if _some_in_set_(op.desc.output_arg_names(), output_names):
-            for name in op.desc.input_arg_names():
-                if name not in no_grad_set:
-                    output_names.add(name)
-        else:
-            relevant_op_flags[i] = False
-
-    op_path = [
-        block.ops[i] for i in range(len(block.ops)) if relevant_op_flags[i]
-    ]
-
-    if inputs:
-        for op in op_path:
-            for name in op.desc.input_arg_names():
-                if name not in input_names and block.vars[name].stop_gradient:
-                    no_grad_set.add(name)
-
-    return op_path
-
-
-def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
-    """
-    Backpropagate the gradients of targets to inputs.
-
-    Args:
-        targets(Variable|list[Variable]): The target variables
-        inputs(Variable|list[Variable]): The input variables
-        target_gradients (Variable|list[Variable]|None): The gradient variables
-            of targets which has the same shape with targets, If None, ones will
-            be created for them.
-        no_grad_set(set[string]): The names of variables that have no gradients
-            in Block 0. All variables with `stop_gradient=True` from all blocks
-            will be automatically added.
-
-    Return:
-        (list[Variable]): A list of gradients for inputs
-        If an input does not affect targets, the corresponding gradient variable
-        will be None
-    """
-    targets = _as_list(targets)
-    inputs = _as_list(inputs)
-    target_gradients = _as_list(target_gradients)
-
-    block = targets[0].block
-    prog = block.program
-    # increase appending gradients times
-    prog._appending_grad_times += 1
-    block_idx = block.idx
-
-    if not target_gradients:
-        target_gradients = [None] * len(targets)
-
-    if len(targets) != len(target_gradients):
-        raise ValueError(
-            "Should have the same number of target_gradients as targets")
-
-    if no_grad_set is None:
-        no_grad_set = set()
-    no_grad_set = copy.copy(no_grad_set)
-    no_grad_dict = _get_stop_gradients_(prog)
-    no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
-
-    fwd_op_num = block.desc.op_size()
-
-    input_grad_names_set = set()
-
-    target_grad_map = {}
-    for i, grad in enumerate(target_gradients):
-        target = targets[i]
-        if grad is None:
-            grad_name = _append_grad_suffix_(target.name)
-            op_desc = _create_op_desc_("fill_constant_batch_size_like",
-                                       {"Input": [target.name]},
-                                       {"Out": [grad_name]}, {
-                                           "shape": target.shape,
-                                           "value": 1.0,
-                                           "dtype": target.dtype,
-                                           'input_dim_idx': 0,
-                                           'output_dim_idx': 0
-                                       })
-            block.desc.append_op().copy_from(op_desc)
-            input_grad_names_set.add(grad_name)
-        else:
-            if target.block.idx != block_idx or target.block.program != prog:
-                raise ValueError("all targets must be in the same block")
-            if target.shape != grad.shape:
-                raise ValueError(
-                    "The shapes of target and grad are different: %s %s" % (
-                        target.name, grad.name))
-            target_grad_map[_append_grad_suffix_(target.name)] = grad.name
-            input_grad_names_set.add(grad.name)
-
-    # For double backward, input_grad_names is used for filter
-    # some non-used gradients op.
-    if prog._appending_grad_times == 1:
-        input_grad_names_set = None
-
-    for input in inputs:
-        if input.block.program != prog:
-            raise "input must be in the same program as targets"
-
-    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
-    op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
-    no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
-    grad_to_var = dict()
-    grad_info_map = dict()
-    _append_backward_ops_(
-        block,
-        op_path,
-        block,
-        no_grad_dict,
-        grad_to_var,
-        input_grad_names_set=input_grad_names_set)
-
-    # Because calc_gradient may be called multiple times,
-    # we need rename the internal gradient variables so that they have
-    # different names.
-    _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map)
-
-    _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map)
-    prog._sync_with_cpp()
-
-    grad_vars = []
-    for input_var in inputs:
-        if input_var.name not in grad_info_map:
-            grad_vars.append(None)
-        else:
-            grad_info = grad_info_map[input_var.name]
-            grad_block = grad_info[1]
-            grad_var = grad_block.var(grad_info[0])
-            grad_vars.append(grad_var)
-
-    if len(grad_vars) == 1:
-        return grad_vars[0]
-    else:
-        return grad_vars
-
-
-def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
-    """
-    Backpropagate the gradients of targets to inputs.
-
-    Args:
-        targets (Variable|list[Variable]): The target variables.
-        inputs (Variable|list[Variable]): The input variables.
-        target_gradients (Variable|list[Variable]|None): The gradient variables
-            of targets which has the same shape with targets, If None, ones will
-            be created for them.
-        no_grad_set (set[string]): The names of variables that have no gradients
-            in Block 0. All variables with `stop_gradient=True` from all blocks
-            will be automatically added.
-
-    Return:
-        (list[Variable]): A list of gradients for inputs
-        If an input does not affect targets, the corresponding gradient variable
-        will be None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            x = fluid.layers.data(name='x', shape=[2,8,8], dtype='float32')
-            x.stop_gradient=False
-            y = fluid.layers.conv2d(x, 4, 1, bias_attr=False)
-            y = fluid.layers.relu(y)
-            y = fluid.layers.conv2d(y, 4, 1, bias_attr=False)
-            y = fluid.layers.relu(y)
-            z = fluid.gradients([y], x)
-            print(z)
-    """
-    outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
-    return _as_list(outs)
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
deleted file mode 100644
index 95d547f2f4a2be80affeb3445f2b9cae511cecee..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/clip.py
+++ /dev/null
@@ -1,450 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import copy
-import six
-
-import functools
-from . import layers
-from . import framework
-from . import core
-from .dygraph.base import _not_support
-
-__all__ = [
-    'set_gradient_clip',
-    'ErrorClipByValue',
-    'GradientClipByValue',
-    'GradientClipByNorm',
-    'GradientClipByGlobalNorm',
-]
-
-
-class BaseErrorClipAttr(object):
-    def __str__(self):
-        raise NotImplementedError()
-
-    def _append_clip_op(self, block, grad_name):
-        raise NotImplementedError()
-
-
-class ErrorClipByValue(BaseErrorClipAttr):
-    """
-    Clips tensor values to the range [min, max].
-
-    Given a tensor t, this operation clips its value to min and max inplace.
-
-    - Any values less than min are set to min.
-    - Any values greater than max are set to max.
-
-    Args:
-        max (float): The maximum value to clip by.
-        min (float, optional): The minimum value to clip by. if not set by user, \
-        will be set to -max by framework.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            BATCH_SIZE = 128
-            CLIP_MAX = 2e-6
-            CLIP_MIN = -1e-6
-            prog = fluid.framework.Program()
-            with fluid.program_guard(main_program=prog):
-                image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
-                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
-                predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
-                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-                cost = fluid.layers.cross_entropy(input=predict, label=label)
-                avg_cost = fluid.layers.mean(cost)
-            prog_clip = prog.clone()
-            prog_clip.block(0).var(hidden1.name)._set_error_clip(
-                fluid.clip.ErrorClipByValue(
-                    max=CLIP_MAX, min=CLIP_MIN)
-    """
-
-    def __init__(self, max, min=None):
-        max = float(max)
-        if min is None:
-            min = -max
-        else:
-            min = float(min)
-        self.max = max
-        self.min = min
-
-    def __str__(self):
-        return "ByValue, min=%f, max=%f" % (self.min, self.max)
-
-    def _append_clip_op(self, block, grad_name):
-        clip_op_desc = block.desc.append_op()
-        clip_op_desc.set_type("clip")
-        clip_op_desc.set_input("X", [grad_name])
-        clip_op_desc.set_output("Out", [grad_name])
-        clip_op_desc._set_attr("min", self.min)
-        clip_op_desc._set_attr("max", self.max)
-
-
-def error_clip_callback(block, context):
-    # the context is a grad_to_var map
-    grad_to_var = context
-    op_desc = block.desc.op(block.desc.op_size() - 1)
-    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
-        fwd_var = block._var_recursive(grad_to_var[grad_n])
-        error_clip = getattr(fwd_var, "error_clip", None)
-        if not (error_clip is None or isinstance(error_clip,
-                                                 BaseErrorClipAttr)):
-            raise TypeError(
-                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
-            )
-        if error_clip is not None:
-            error_clip._append_clip_op(block, grad_n)
-
-
-class BaseGradientClipAttr(object):
-    def __str__(self):
-        raise NotImplementedError()
-
-    def _process_context(self, context, param, grad):
-        raise NotImplementedError()
-
-    def _create_operators(self, param, grad):
-        raise NotImplementedError()
-
-
-class NullGradientClipAttr(BaseGradientClipAttr):
-    def __str__(self):
-        return "Null"
-
-    def _process_context(self, context, param, grad):
-        pass
-
-    def _create_operators(self, param, grad):
-        return param, grad
-
-
-class GradientClipByValue(BaseGradientClipAttr):
-    """
-    Clips gradient values to the range [min, max].
-
-    Given a tensor t, this operation clips its value to min and max inplace.
-
-    - Any values less than min are set to min.
-    - Any values greater than max are set to max.
-
-    Args:
-        max (float): The maximum value to clip by.
-        min (float, optional): The minimum value to clip by. if not set by user, \
-        will be set to -max by framework.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            w_param_attrs = fluid.ParamAttr(name=None,
-              initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0),
-              learning_rate=1.0,
-              regularizer=fluid.regularizer.L1Decay(1.0),
-              trainable=True,
-              gradient_clip=fluid.clip.GradientClipByValue(-1.0, 1.0))
-            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
-            y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
-    """
-
-    def __init__(self, max, min=None):
-        max = float(max)
-        if min is None:
-            min = -max
-        else:
-            min = float(min)
-        self.max = max
-        self.min = min
-
-    def __str__(self):
-        return "ByValue, min=%f, max=%f" % (self.min, self.max)
-
-    def _process_context(self, context, param, grad):
-        pass
-
-    def _create_operators(self, param, grad):
-        new_grad = layers.clip(x=grad, min=self.min, max=self.max)
-        return param, new_grad
-
-
-class GradientClipByNorm(BaseGradientClipAttr):
-    """
-    Clips tensor values to a maximum L2-norm.
-
-    This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`.
-    If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out`
-    will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
-    :math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
-    :math:`Out` equal to :math:`max\_norm`, as shown in the following formula:
-
-    .. math::
-
-        Out = \\frac{max\_norm * X}{norm(X)},
-
-    where :math:`norm(X)` represents the L2 norm of :math:`X`.
-
-    Args:
-        clip_norm (float): The maximum norm value
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            w_param_attrs = fluid.ParamAttr(name=None,
-              initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0),
-              learning_rate=1.0,
-              regularizer=fluid.regularizer.L1Decay(1.0),
-              trainable=True,
-              gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0))
-            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
-            y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
-
-    """
-
-    def __init__(self, clip_norm):
-        self.clip_norm = clip_norm
-
-    def __str__(self):
-        return "ByNorm, clip_norm=%f" % self.clip_norm
-
-    def _process_context(self, context, param, grad):
-        pass
-
-    def _create_operators(self, param, grad):
-        new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
-        return param, new_grad
-
-
-class GradientClipByGlobalNorm(BaseGradientClipAttr):
-    """
-    Clips values of multiple tensors by the ratio of the sum of their norms.
-
-    Given a list of tensors t_list, and a clipping ratio clip_norm, this
-    operation returns a list of clipped tensors list_clipped and the global
-    norm (global_norm) of all tensors in t_list.
-
-    To perform the clipping, the values :math:`t\_list[i]` are set to:
-
-    .. math::
-
-        t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}
-
-    where:
-
-    .. math::
-
-        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
-
-    If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are,
-    otherwise they're all shrunk by the global ratio.
-
-    Args:
-        clip_norm (float): The maximum norm value
-        group_name (str, optional): The group name for this clip.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            prog = fluid.framework.Program()
-            startup_program = fluid.framework.Program()
-            with fluid.program_guard(
-                    main_program=prog, startup_program=startup_program):
-                image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
-                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
-                predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
-                cost = fluid.layers.cross_entropy(input=predict, label=label)
-                avg_cost = fluid.layers.mean(cost)
-            prog_clip = prog.clone()
-            avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
-            p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
-
-            with fluid.program_guard(main_program=prog_clip):
-                fluid.clip.set_gradient_clip(
-                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
-                p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
-
-    """
-
-    def __init__(self, clip_norm, group_name="default_group"):
-        if not isinstance(group_name, six.string_types):
-            raise TypeError("'group_name' must be a %s." % (six.string_types))
-
-        self.clip_norm = clip_norm
-        self.group_name = group_name
-
-    def __str__(self):
-        return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
-                                                              self.clip_norm)
-
-    def _process_context(self, context, param, grad):
-        if self.group_name not in context:
-            context[self.group_name] = []
-            context[self.group_name + "_clip_value"] = self.clip_norm
-            context[self.group_name + "_clip"] = layers.fill_constant(
-                shape=[1], dtype="float32", value=self.clip_norm)
-        else:
-            if not self.clip_norm == context[self.group_name + "_clip_value"]:
-                raise ValueError(
-                    "All parameters' 'clip_norm' of a same group should be the same"
-                )
-
-        merge_grad = grad
-        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
-            merge_grad = layers.merge_selected_rows(grad)
-            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
-
-        square = layers.square(merge_grad)
-        local_norm_var = layers.reduce_sum(input=square)
-        context[self.group_name].append(local_norm_var)
-
-        self.context = context
-
-    def _create_operators(self, param, grad):
-        group_scale_name = self.group_name + "_scale"
-        if group_scale_name not in self.context:
-            group_norm_var = layers.sums(input=self.context[self.group_name])
-            group_norm_var = layers.sqrt(x=group_norm_var)
-            clip_var = self.context[self.group_name + "_clip"]
-            group_scale_var = layers.elementwise_div(
-                x=clip_var,
-                y=layers.elementwise_max(
-                    x=clip_var, y=group_norm_var))
-            assert group_scale_var.shape == (1, )
-            self.context[group_scale_name] = group_scale_var
-
-        new_grad = layers.elementwise_mul(
-            x=grad, y=self.context[group_scale_name])
-
-        return param, new_grad
-
-
-@_not_support
-def set_gradient_clip(clip, param_list=None, program=None):
-    """
-    To specify parameters that require gradient clip.
-
-    Args:
-        clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
-                for example :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
-                which describes the type and detailed attributes of required gradient clip.
-        param_list(list(Variable), optional): Parameters that require gradient clip.
-                It can be a list of parameter or a list of parameter's name.
-                Default None, meaning that all parameters in the program will be included.
-        program(Program, optional): The program where parameters are located.
-                Default None, meaning that using :ref:`api_fluid_default_main_program` .
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-
-            def network():
-                image = fluid.layers.data(name='image', shape=[28], dtype='float32')
-                param_attr1 = fluid.ParamAttr("fc1_param")
-                fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1)
-                param_attr2 = fluid.ParamAttr("fc2_param")
-                fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2)
-                loss = fluid.layers.reduce_mean(fc2)
-                return loss
-
-
-            # network 1: clip all parameter gradient
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                loss = network()
-                fluid.clip.set_gradient_clip(
-                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
-                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                sgd.minimize(loss)
-
-            # network 2: clip parameter gradient by name
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                loss = network()
-                fluid.clip.set_gradient_clip(
-                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
-                    param_list=["fc1_param", "fc2_param"])
-                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                sgd.minimize(loss)
-
-            # network 3: clip parameter gradient by var
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                loss = network()
-                param_var1 = fluid.default_main_program().global_block().var("fc1_param")
-                param_var2 = fluid.default_main_program().global_block().var("fc2_param")
-                fluid.clip.set_gradient_clip(
-                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
-                    param_list=[param_var1, param_var2])
-                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                sgd.minimize(loss)
-    """
-    if not isinstance(clip, BaseGradientClipAttr):
-        raise TypeError(
-            "'clip' should be an instance of BaseGradientClipAttr's derived class"
-        )
-    if program is None:
-        program = framework.default_main_program()
-    if param_list is None:
-        param_list = program.block(0).all_parameters()
-    if all(isinstance(elem, six.string_types) for elem in param_list):
-        param_list = [program.block(0).var(elem) for elem in param_list]
-    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
-        raise TypeError(
-            "'param_list' should be a list of Parameter or basestring(parameter's name)."
-        )
-
-    for param in param_list:
-        param.gradient_clip_attr = copy.deepcopy(clip)
-
-
-def append_gradient_clip_ops(param_grads):
-    context = dict()
-    for p, g in param_grads:
-        if g is None:
-            continue
-        with p.block.program._optimized_guard(
-            [p, g]), framework.name_scope('append_clip'):
-            clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
-            if clip_attr is None:
-                clip_attr = NullGradientClipAttr()
-            if not isinstance(clip_attr, BaseGradientClipAttr):
-                raise TypeError(
-                    "clip attribute should be an instance of BaseGradientClipAttr"
-                )
-
-            clip_attr._process_context(context=context, param=p, grad=g)
-
-    res = []
-    for p, g in param_grads:
-        if g is None:
-            continue
-        with p.block.program._optimized_guard(
-            [p, g]), framework.name_scope('append_graident_clip'):
-            res.append(clip_attr._create_operators(param=p, grad=g))
-
-    return res
-
-
-ClipByValue = GradientClipByValue
-ClipByNorm = GradientClipByNorm
-ClipByGlobalNorm = GradientClipByGlobalNorm
diff --git a/python/paddle/fluid/communicator.py b/python/paddle/fluid/communicator.py
deleted file mode 100644
index 2fecdd34c1569145981f56746e218a3ecf6bb9b4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/communicator.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .executor import global_scope
-from . import core
-from .framework import Program
-
-__all__ = ['Communicator']
-
-
-class Communicator(object):
-    def __init__(self, program):
-        """
-        Communicator is used for async distribute training in distribute_transpiler mode.
-        It's a wrapper of a cpp class Communicator and should be used inside fleet API.
-
-        Args:
-            program(Program): the trainers program after transpile of distribute_transpiler.
-            It's used by communicator to extract the information to do communication.
-
-        Returns:
-            None
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.Program()
-                comm = fluid.communicator.Communicator(prog)
-                comm.start()
-                comm.stop()
-        """
-        # set all recv op to not_run mode
-        assert isinstance(program, Program)
-        for op in program.block(0).ops:
-            if op.type == "recv":
-                op._set_attr('do_not_run', True)
-        self.communicator_ = core.DistCommunicator(program.desc, global_scope())
-
-    def start(self):
-        """
-        Start communicator. Should call before training process.
-
-        Returns:
-            None
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.Program()
-                comm = fluid.communicator.Communicator(prog)
-                comm.start()
-                comm.stop()
-        """
-        self.communicator_.start()
-
-    def stop(self):
-        """
-        Stop communicator. Should call after training process.
-
-        Returns:
-            None
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.Program()
-                comm = fluid.communicator.Communicator(prog)
-                comm.start()
-                comm.stop()
-        """
-        self.communicator_.stop()
-
-    def is_running(self):
-        """
-        Get communicator is running or stop.
-
-        Returns:
-            bool
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.Program()
-                comm = fluid.communicator.Communicator(prog)
-                comm.is_running()
-        """
-        self.communicator_.is_running()
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
deleted file mode 100644
index 69658dbfb7327f6dc35cf714c1c694fe62c3638a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/compiler.py
+++ /dev/null
@@ -1,393 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import multiprocessing
-import os
-import six
-import sys
-from .. import compat as cpt
-from . import framework
-from .framework import cuda_places, cpu_places
-
-from . import core
-
-__all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
-
-ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
-BuildStrategy = core.ParallelExecutor.BuildStrategy
-InferNativeConfig = core.NativeConfig
-InferAnalysisConfig = core.AnalysisConfig
-
-
-def _place_obj(place):
-    p = core.Place()
-    p.set_place(place)
-    return p
-
-
-def _is_pserver_mode(main_program):
-    main = main_program if main_program \
-        else framework.default_main_program()
-    for op in main.global_block().ops:
-        if op.type in ["send", "recv"]:
-            return True
-    return False
-
-
-def _has_backward_op(graph):
-    for node in graph.nodes():
-        if node.is_op() and node.op() is not None and \
-                node.op().type().endswith("_grad"):
-            return True
-    return False
-
-
-def _prune_feed_ops(program):
-    # prune the feed ops in the program.
-    pop_idx = []
-    for i, op in enumerate(program.global_block().ops):
-        if op.type == "feed": pop_idx.append(i)
-    for index in pop_idx[::-1]:
-        program.global_block()._remove_op(index)
-
-
-class CompiledProgram(object):
-    """
-    Compiles to Graph for execution.
-
-    1. Users first create the program with layers.
-    2. Optionally, users use CompiledProgram to optimize the program before run.
-    3. The original program or CompiledProgram is run by executor.
-
-    The CompiledProgram is used to transform a program for various
-    optimizations, for example.
-      * Pre-compute some logic once so that each run is faster.
-      * Transform the program so that it can run in multiple devices.
-      * Transform the program for optimized inference or distributed
-        training. **Note that: this part is not finished.**
-
-    Example:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import paddle.fluid.compiler as compiler
-          import numpy
-          import os
-
-          place = fluid.CUDAPlace(0) # fluid.CPUPlace()
-          exe = fluid.Executor(place)
-
-          data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-          hidden = fluid.layers.fc(input=data, size=10)
-          loss = fluid.layers.mean(hidden)
-          fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
-
-          fluid.default_startup_program().random_seed=1
-          exe.run(fluid.default_startup_program())
-          compiled_prog = compiler.CompiledProgram(
-                   fluid.default_main_program())
-
-          x = numpy.random.random(size=(10, 1)).astype('float32')
-          loss_data, = exe.run(compiled_prog,
-                               feed={"X": x},
-                               fetch_list=[loss.name])
-
-    Args:
-        program_or_graph (Graph|Program): If it's Program, it will be first
-            lowered to a graph for further optimizations. If it's a graph
-            (potentially optimized before), it will be directly used for
-            further optimizations. Note: graph is only supported when compiled
-            with with_data_parallel option.
-        build_strategy(BuildStrategy): build_strategy is used to
-            build the graph with the specified options.
-            For more information, please refer to fluid.BuildStrategy.
-            Default None.
-    """
-
-    def __init__(self, program_or_graph, build_strategy=None):
-        if isinstance(program_or_graph, core.Graph):
-            self._graph = program_or_graph
-            # don't not create a new program here.
-            self._program = None
-        elif isinstance(program_or_graph, framework.Program):
-            _prune_feed_ops(program_or_graph)
-            self._graph = core.Graph(program_or_graph.desc)
-            self._program = program_or_graph
-        else:
-            raise ValueError("Wrong program_to_graph type: %s" %
-                             type(program_or_graph))
-
-        self._scope = None
-        self._place = None
-        self._executor = None
-        self._compiled = False
-        self._is_data_parallel = False
-        self._is_inference = False
-        self._loss_name = None
-        self._share_vars_from = None
-        self._places = None
-        self._build_strategy = build_strategy
-        self._exec_strategy = None
-
-    def with_data_parallel(self,
-                           loss_name=None,
-                           build_strategy=None,
-                           exec_strategy=None,
-                           share_vars_from=None,
-                           places=None):
-        """Configs the program to run in data parallel way.
-
-        Example:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              import paddle.fluid.compiler as compiler
-              import numpy
-              import os
-
-              use_cuda = True
-              place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-              # NOTE: If you use CPU to run the program, you need
-              # to specify the CPU_NUM, otherwise, fluid will use
-              # all the number of the logic core as the CPU_NUM,
-              # in that case, the batch size of the input should be
-              # greater than CPU_NUM, if not, the process will be
-              # failed by an exception.
-              if not use_cuda:
-                  os.environ['CPU_NUM'] = str(2)
-
-              exe = fluid.Executor(place)
-
-              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-              hidden = fluid.layers.fc(input=data, size=10)
-              loss = fluid.layers.mean(hidden)
-              fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
-
-              fluid.default_startup_program().random_seed=1
-              exe.run(fluid.default_startup_program())
-              compiled_prog = compiler.CompiledProgram(
-                       fluid.default_main_program()).with_data_parallel(
-                                loss_name=loss.name)
-
-              x = numpy.random.random(size=(10, 1)).astype('float32')
-              loss_data, = exe.run(compiled_prog,
-                                   feed={"X": x},
-                                   fetch_list=[loss.name])
-
-        Args:
-            loss_name (str): The loss name must set in training. Default None.
-            build_strategy(BuildStrategy): build_strategy is used to
-                build the graph with the specified options.
-                For more information, please refer to fluid.BuildStrategy.
-                Note that, if you set build_strategy in the argument list when
-                creating CompiledProgram and calling with_data_parallel,
-                the build_strategy in CompiledProgram will be overwritten by the latter.
-                Default None.
-            exec_strategy(ExecutionStrategy): exec_strategy is used to
-                to select the a way to execute the graph, for example how many
-                threads are used, how many iterations to clean up the temp
-                variables. For more information, please refer
-                to fluid.ExecutionStrategy. Default None.
-            share_vars_from(CompiledProgram): If provided, this CompiledProgram
-                will share variables from `share_vars_from`. `share_vars_from`
-                must be run by the executor before this CompiledProgram so that
-                vars are ready.
-            places(list(CUDAPlace)|list(CPUPlace)|None): If provided, only compile
-                program in the given places. Otherwise, the places used when compiled 
-                is determined by the Executor, and the places used are controlled 
-                by environment variables: FLAGS_selected_gpus or CUDA_VISIBLE_DEVICES
-                if using GPU; or CPU_NUM if using CPU. For example, if you want to 
-                run on GPU 0 and 1, set places=[fluid.CUDAPlace(0), fluid.CUDAPlace(1)].
-                If you want to run on 2 CPU cores, set places=[fluid.CPUPlace()]*2.  
-
-        Returns:
-            self
-        """
-        assert not self._is_data_parallel, "Already compiled with parallel."
-        assert not self._is_inference, "Cannot compile both data parallel and inference"
-        self._is_data_parallel = True
-        # FIXME(zcd): Currently, the build_strategy can be set during creating
-        # CompiledProgram or calling with_data_parallel, and it may be confusing,
-        # but in the long run, we should set up build_strategy only when creating
-        # CompiledProgram, and exec_strategy should be deprecated.
-        if build_strategy is not None: self._build_strategy = build_strategy
-        self._exec_strategy = exec_strategy
-        self._loss_name = loss_name
-        self._share_vars_from = share_vars_from
-        self._places = places
-
-        if _has_backward_op(self._graph):
-            assert self._loss_name is not None, "The loss_name should be set here."
-
-        if self._places is not None:
-            if not isinstance(self._places, (list, tuple)):
-                self._places = [self._places]
-
-        return self
-
-    def _with_inference_optimize(self, config):
-        """ Add inference optimize
-
-        Args:
-            config: instance of `NativeConfig` or `AnalysisConfig` to create predictor
-        Returns:
-            self
-        """
-        assert not self._is_data_parallel, "Cannot compile both data parallel and inference"
-        assert not self._is_inference, "Already compiled with inference"
-
-        assert any([
-            isinstance(config, InferNativeConfig),
-            isinstance(config, InferAnalysisConfig)
-        ])
-        self._is_inference = True
-        self._infer_config = config
-        return self
-
-    def _with_distributed(self):
-        raise NotImplementedError()
-
-    def _compile_data_parallel(self, places, use_cuda=False, scope=None):
-        if self._share_vars_from:
-            if scope:
-                sys.stderr.write("share_vars_from is set, scope is ignored.\n")
-            if not self._is_data_parallel:
-                raise ValueError(
-                    "Currently, only data parallel mode need share_vars_from.")
-            if not self._share_vars_from._is_data_parallel:
-                raise ValueError("share_vars_from is not data parallel. Cannot "
-                                 "share vars from it.")
-            if self._share_vars_from._executor is None:
-                raise ValueError(
-                    "share_vars_from is not compiled and run, so there is no "
-                    "var to share.")
-            self._local_scopes = self._share_vars_from._executor.local_scopes()
-        else:
-            assert scope is not None, ""
-            self._local_scopes = []
-
-        assert isinstance(places, tuple) or isinstance(places, list), \
-            "Currently , The places type only should be list or tuple, \n" \
-            "but the input type is {}.".format(type(places))
-
-        if self._build_strategy is None:
-            self._build_strategy = BuildStrategy()
-        self._build_strategy.is_distribution = _is_pserver_mode(self._program)
-
-        if self._exec_strategy is None:
-            self._exec_strategy = ExecutionStrategy()
-        self._exec_strategy.use_cuda = use_cuda
-
-        if self._exec_strategy.num_threads == 0:
-            if self._exec_strategy.use_cuda:
-                # Experiments on se-resnext shows that too many threads hurt
-                # performance. Worth tunning for other models in the future.
-                self._exec_strategy.num_threads = len(places) * 4
-            else:
-                self._exec_strategy.num_threads = len(places) * 2
-
-        if self._build_strategy.num_trainers > 1:
-            assert self._is_data_parallel, \
-                "If you use multi-trainer to train the model, you should use "\
-                "the data parallel model, i.e. calling with_data_parallel function."
-
-        # TODO(wuyi): trainer endpoings should be passed in through
-        # build_strategy, not program.xxx.
-        # TODO(gongwb): let user to set them once.
-        if self._program and self._build_strategy.num_trainers > 1 and \
-                self._program._trainers_endpoints:
-            tps = self._program._trainers_endpoints
-
-            assert self._build_strategy.num_trainers == len(
-                tps), "num_trainers == len(end_points)"
-            self._build_strategy.trainers_endpoints = tps
-
-        if self._program:
-            self._build_strategy.nccl_comm_num = self._program._nccl_comm_num
-            self._build_strategy.use_hierarchical_allreduce = self._program._use_hierarchical_allreduce
-            self._build_strategy.hierarchical_allreduce_inter_nranks = self._program._hierarchical_allreduce_inter_nranks
-
-        if self._build_strategy.sync_batch_norm:
-            self._build_strategy.enable_sequential_execution = True
-
-        self._persistable_vars = []
-        for node in self._graph.nodes():
-            if node.is_var() and node.var() is not None and node.var().persistable() and \
-                    node.var().type() != core.VarDesc.VarType.RAW:
-                self._persistable_vars.append(cpt.to_text(node.name()))
-
-        places = list(map(_place_obj, places))
-
-        # ParallelExecutor would broadcast all the parameters during initializing.
-        # The parameters of each process should be in the same ordered for the data-parallelism
-        # distributed training to keep the broadcast correct.
-        self._persistable_vars = list(set(self._persistable_vars))
-        self._persistable_vars.sort()
-
-        return core.ParallelExecutor(
-            places, self._persistable_vars,
-            cpt.to_text(self._loss_name)
-            if self._loss_name else six.u(''), self._scope, self._local_scopes,
-            self._exec_strategy, self._build_strategy, self._graph)
-
-    def _compile_inference(self):
-        return core.create_paddle_predictor(self._infer_config)
-
-    def _compile(self, scope, place):
-        """Compile the program based on the configs.
-
-        Args:
-            scope: The variables (resources) that are associated with
-               this compiled program.
-            place: The location that the compiled program will be run on.
-
-        Returns:
-            self
-        """
-        if self._compiled:
-            if scope and self._scope != scope:
-                raise ValueError("Cannot compile with different scope")
-            if place and not self._place._equals(place):
-                raise ValueError("Cannot compile with different place")
-            return self
-        self._compiled = True
-
-        self._scope = scope
-        self._place = place
-
-        if self._is_inference:
-            self._executor = self._compile_inference()
-        else:
-            if self._is_data_parallel:
-                self._places = self._get_places(self._place, self._places)
-            else:
-                self._places = [self._place]
-            self._executor = self._compile_data_parallel(
-                use_cuda=isinstance(self._place, core.CUDAPlace),
-                scope=self._scope,
-                places=self._places)
-        return self
-
-    def _get_places(self, place, place_list):
-        has_set_place = (place_list is not None)
-        if has_set_place:
-            for p in place_list:
-                assert p._type() == place._type(), \
-                    "Place type not match. You may set the wrong type of places"
-        else:
-            place_list = cuda_places() if isinstance(
-                place, core.CUDAPlace) else cpu_places()
-        assert place_list, "no place for execution"
-        return place_list
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
deleted file mode 100644
index ff478200aefa5524b7cfb70996ba9e3ee50db6f2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/__init__.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import decoder
-from .decoder import *
-from . import memory_usage_calc
-from .memory_usage_calc import *
-from . import op_frequence
-from .op_frequence import *
-from . import quantize
-from .quantize import *
-from . import reader
-from .reader import *
-from . import slim
-from .slim import *
-from . import utils
-from .utils import *
-from . import extend_optimizer
-from .extend_optimizer import *
-from . import model_stat
-from .model_stat import *
-from . import mixed_precision
-from .mixed_precision import *
-from . import layers
-from .layers import *
-
-__all__ = []
-__all__ += decoder.__all__
-__all__ += memory_usage_calc.__all__
-__all__ += op_frequence.__all__
-__all__ += quantize.__all__
-__all__ += reader.__all__
-__all__ += slim.__all__
-__all__ += utils.__all__
-__all__ += extend_optimizer.__all__
-__all__ += ['mixed_precision']
-__all__ += layers.__all__
diff --git a/python/paddle/fluid/contrib/decoder/__init__.py b/python/paddle/fluid/contrib/decoder/__init__.py
deleted file mode 100644
index 9f973fd3c9af60a0c9a2ba5225a616671545436b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/decoder/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import beam_search_decoder
-from .beam_search_decoder import *
-
-__all__ = beam_search_decoder.__all__
diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
deleted file mode 100644
index 5854cadb58c76066ba4b48dc6b5dbca06fba8cba..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ /dev/null
@@ -1,842 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module provides a general beam search decoder API for RNN based decoders.
-The purpose of this API is to allow users to highly customize the behavior
-within their RNN decoder(vanilla RNN, LSTM, attention + LSTM, future etc.),
-without using the low level API such as while ops.
-
-This API is still under active development and may change drastically.
-"""
-
-from __future__ import print_function
-
-from ...wrapped_decorator import signature_safe_contextmanager
-import numpy as np
-import six
-
-from ... import layers
-from ...framework import Variable
-from ... import core
-from ... import framework, unique_name
-from ...layer_helper import LayerHelper
-
-__all__ = ['InitState', 'StateCell', 'TrainingDecoder', 'BeamSearchDecoder']
-
-
-class _DecoderType:
-    TRAINING = 1
-    BEAM_SEARCH = 2
-
-
-class InitState(object):
-    """
-    The initial hidden state object. The state objects holds a variable, and may
-    use it to initialize the hidden state cell of RNN. Usually used as input to
-    `StateCell` class.
-
-    Args:
-        init (Variable): The initial variable of the hidden state. If set None,
-            the variable will be created as a tensor with constant value based
-            on `shape` and `value` param.
-        shape (tuple|list): If `init` is None, new Variable's shape. Default
-            None.
-        value (float): If `init` is None, new Variable's value. Default None.
-        init_boot (Variable): If provided, the initial variable will be created
-            with the same shape as this variable.
-        need_reorder (bool): If set true, the init will be sorted by its lod
-            rank within its batches. This should be used if `batch_size > 1`.
-        dtype (np.dtype|core.VarDesc.VarType|str): Data type of the initial
-            variable.
-
-    Returns:
-        An initialized state object.
-
-    Examples:
-        See `StateCell`.
-    """
-
-    def __init__(self,
-                 init=None,
-                 shape=None,
-                 value=0.0,
-                 init_boot=None,
-                 need_reorder=False,
-                 dtype='float32'):
-        if init is not None:
-            self._init = init
-        elif init_boot is None:
-            raise ValueError(
-                'init_boot must be provided to infer the shape of InitState .\n')
-        else:
-            self._init = layers.fill_constant_batch_size_like(
-                input=init_boot, value=value, shape=shape, dtype=dtype)
-
-        self._shape = shape
-        self._value = value
-        self._need_reorder = need_reorder
-        self._dtype = dtype
-
-    @property
-    def value(self):
-        return self._init
-
-    @property
-    def need_reorder(self):
-        return self._need_reorder
-
-
-class _MemoryState(object):
-    def __init__(self, state_name, rnn_obj, init_state):
-        self._state_name = state_name  # each is a rnn.memory
-        self._rnn_obj = rnn_obj
-        self._state_mem = self._rnn_obj.memory(
-            init=init_state.value, need_reorder=init_state.need_reorder)
-
-    def get_state(self):
-        return self._state_mem
-
-    def update_state(self, state):
-        self._rnn_obj.update_memory(self._state_mem, state)
-
-
-class _ArrayState(object):
-    def __init__(self, state_name, block, init_state):
-        self._state_name = state_name
-        self._block = block
-
-        self._state_array = self._block.create_var(
-            name=unique_name.generate('array_state_array'),
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-            dtype=init_state.value.dtype)
-
-        self._counter = self._block.create_var(
-            name=unique_name.generate('array_state_counter'),
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            dtype='int64')
-
-        # initialize counter
-        self._block.append_op(
-            type='fill_constant',
-            inputs={},
-            outputs={'Out': [self._counter]},
-            attrs={
-                'shape': [1],
-                'dtype': self._counter.dtype,
-                'value': float(0.0),
-                'force_cpu': True
-            })
-
-        self._counter.stop_gradient = True
-
-        # write initial state
-        block.append_op(
-            type='write_to_array',
-            inputs={'X': init_state.value,
-                    'I': self._counter},
-            outputs={'Out': self._state_array})
-
-    def get_state(self):
-        state = layers.array_read(array=self._state_array, i=self._counter)
-        return state
-
-    def update_state(self, state):
-        layers.increment(x=self._counter, value=1, in_place=True)
-        layers.array_write(state, array=self._state_array, i=self._counter)
-
-
-class StateCell(object):
-    """
-    The state cell class stores the hidden state of the RNN cell. A typical RNN
-    cell has one or more hidden states, and one or more step inputs. This class
-    allows you to defines the name of hidden states as well as step inputs, and
-    their associated variables.
-
-    Args:
-        inputs (dict): A feeding dict of {name(str) : Variable}. It specifies
-            the names of step inputs for RNN cell, and the associated variables.
-            The variable could initially be None and set manually during each
-            RNN step.
-        states (dict): A feeding dict of {name(str) : InitState object}. It
-            specifies the names of hidden states and their initialized state.
-        out_state (str): A string that specifies the name of hidden state that
-            will be used to compute the score in beam search process.
-        name (str): The name of the RNN cell. Default None.
-
-    Raises:
-        `ValueError`: If the initial state is not an instance of InitState, or
-            the out_state is not in the dict of states.
-
-    Returns:
-        StateCell: The initialized StateCell object.
-
-    Examples:
-        .. code-block:: python
-          hidden_state = InitState(init=encoder_out, need_reorder=True)
-          state_cell = StateCell(
-              inputs={'current_word': None},
-              states={'h': hidden_state},
-              out_state='h')
-    """
-
-    def __init__(self, inputs, states, out_state, name=None):
-        self._helper = LayerHelper('state_cell', name=name)
-        self._cur_states = {}
-        self._state_names = []
-        for state_name, state in six.iteritems(states):
-            if not isinstance(state, InitState):
-                raise ValueError('state must be an InitState object.')
-            self._cur_states[state_name] = state
-            self._state_names.append(state_name)
-        self._inputs = inputs  # inputs is place holder here
-        self._cur_decoder_obj = None
-        self._in_decoder = False
-        self._states_holder = {}
-        self._switched_decoder = False
-        self._state_updater = None
-        self._out_state = out_state
-        if self._out_state not in self._cur_states:
-            raise ValueError('out_state must be one state in states')
-
-    def _enter_decoder(self, decoder_obj):
-        if self._in_decoder == True or self._cur_decoder_obj is not None:
-            raise ValueError('StateCell has already entered a decoder.')
-        self._in_decoder = True
-        self._cur_decoder_obj = decoder_obj
-        self._switched_decoder = False
-
-    def _leave_decoder(self, decoder_obj):
-        if not self._in_decoder:
-            raise ValueError('StateCell not in decoder, '
-                             'invalid leaving operation.')
-
-        if self._cur_decoder_obj != decoder_obj:
-            raise ValueError('Inconsistent decoder object in StateCell.')
-
-        self._in_decoder = False
-        self._cur_decoder_obj = None
-        self._switched_decoder = False
-
-    def _switch_decoder(self):  # lazy switch
-        if not self._in_decoder:
-            raise ValueError('StateCell must be enter a decoder.')
-
-        if self._switched_decoder:
-            raise ValueError('StateCell already done switching.')
-
-        for state_name in self._state_names:
-            if state_name not in self._states_holder:
-                state = self._cur_states[state_name]
-
-                if not isinstance(state, InitState):
-                    raise ValueError('Current type of state is %s, should be '
-                                     'an InitState object.' % type(state))
-
-                self._states_holder[state_name] = {}
-
-                if self._cur_decoder_obj.type == _DecoderType.TRAINING:
-                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
-                        = _MemoryState(state_name,
-                                       self._cur_decoder_obj.dynamic_rnn,
-                                       state)
-                elif self._cur_decoder_obj.type == _DecoderType.BEAM_SEARCH:
-                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
-                        = _ArrayState(state_name,
-                                      self._cur_decoder_obj._parent_block(),
-                                      state)
-                else:
-                    raise ValueError('Unknown decoder type, only support '
-                                     '[TRAINING, BEAM_SEARCH]')
-
-            # Read back, since current state should be LoDTensor
-            self._cur_states[state_name] = \
-                self._states_holder[state_name][
-                    id(self._cur_decoder_obj)].get_state()
-
-        self._switched_decoder = True
-
-    def get_state(self, state_name):
-        """
-        The getter of state object. Find the state variable by its name.
-
-        Args:
-            state_name (str): A string of the state's name.
-
-        Returns:
-            The associated state object.
-        """
-        if self._in_decoder and not self._switched_decoder:
-            self._switch_decoder()
-
-        if state_name not in self._cur_states:
-            raise ValueError(
-                'Unknown state %s. Please make sure _switch_decoder() '
-                'invoked.' % state_name)
-
-        return self._cur_states[state_name]
-
-    def get_input(self, input_name):
-        """
-        The getter of input variable. Find the input variable by its name.
-
-        Args:
-            input_name (str): The string of the input's name.
-
-        Returns:
-            The associated input variable.
-        """
-        if input_name not in self._inputs or self._inputs[input_name] is None:
-            raise ValueError('Invalid input %s.' % input_name)
-        return self._inputs[input_name]
-
-    def set_state(self, state_name, state_value):
-        """
-        The setter of the state variable. Change the variable of the given
-        `state_name`.
-
-        Args:
-            state_name (str): The name of the state to change.
-            state_value (Var): The variable of the new state.
-        """
-        self._cur_states[state_name] = state_value
-
-    def state_updater(self, updater):
-        """
-        Set up the updater to update the hidden state every RNN step. The
-        behavior of updater could be customized by users. The updater should be
-        a function that takes a `StateCell` object as input and update the
-        hidden state within it. The hidden state could be accessed through
-        `get_state` method.
-
-        Args:
-            updater (func): the updater to update the state cell.
-        """
-        self._state_updater = updater
-
-        def _decorator(state_cell):
-            if state_cell == self:
-                raise TypeError('Updater should only accept a StateCell object '
-                                'as argument.')
-            updater(state_cell)
-
-        return _decorator
-
-    def compute_state(self, inputs):
-        """
-        Provide the step input of RNN cell, and compute the new hidden state
-        with updater and give step input.
-
-        Args:
-            inputs (dict): A feed dict, {name(str): Variable}. name should be
-            the names of step inputs for this RNN cell, and Variable should be
-            the associated variables.
-
-        Examples:
-        .. code-block:: python
-          state_cell.compute_state(inputs={'x': current_word})
-        """
-        if self._in_decoder and not self._switched_decoder:
-            self._switch_decoder()
-
-        for input_name, input_value in six.iteritems(inputs):
-            if input_name not in self._inputs:
-                raise ValueError('Unknown input %s. '
-                                 'Please make sure %s in input '
-                                 'place holder.' % (input_name, input_name))
-            self._inputs[input_name] = input_value
-        self._state_updater(self)
-
-    def update_states(self):
-        """
-        Update and record state information after each RNN step.
-        """
-        if self._in_decoder and not self._switched_decoder:
-            self._switched_decoder()
-
-        for state_name, decoder_state in six.iteritems(self._states_holder):
-            if id(self._cur_decoder_obj) not in decoder_state:
-                raise ValueError('Unknown decoder object, please make sure '
-                                 'switch_decoder been invoked.')
-            decoder_state[id(self._cur_decoder_obj)].update_state(
-                self._cur_states[state_name])
-
-    def out_state(self):
-        """
-        Get the output state variable. This must be called after update_states.
-
-        Returns:
-            The output variable of the RNN cell.
-        """
-        return self._cur_states[self._out_state]
-
-
-class TrainingDecoder(object):
-    """
-    A decoder that can only be used for training. The decoder could be
-    initialized with a `StateCell` object. The computation within the RNN cell
-    could be defined with decoder's block.
-
-    Args:
-        state_cell (StateCell): A StateCell object that handles the input and
-            state variables.
-        name (str): The name of this decoder. Default None.
-
-    Returns:
-        TrainingDecoder: The initialized TrainingDecoder object.
-
-    Examples:
-        .. code-block:: python
-          decoder = TrainingDecoder(state_cell)
-          with decoder.block():
-              current_word = decoder.step_input(trg_embedding)
-              decoder.state_cell.compute_state(inputs={'x': current_word})
-              current_score = layers.fc(input=decoder.state_cell.get_state('h'),
-                                        size=32,
-                                        act='softmax')
-              decoder.state_cell.update_states()
-              decoder.output(current_score)
-    """
-    BEFORE_DECODER = 0
-    IN_DECODER = 1
-    AFTER_DECODER = 2
-
-    def __init__(self, state_cell, name=None):
-        self._helper = LayerHelper('training_decoder', name=name)
-        self._status = TrainingDecoder.BEFORE_DECODER
-        self._dynamic_rnn = layers.DynamicRNN()
-        self._type = _DecoderType.TRAINING
-        self._state_cell = state_cell
-        self._state_cell._enter_decoder(self)
-
-    @signature_safe_contextmanager
-    def block(self):
-        """
-        Define the behavior of the decoder for each RNN time step.
-        """
-        if self._status != TrainingDecoder.BEFORE_DECODER:
-            raise ValueError('decoder.block() can only be invoked once')
-        self._status = TrainingDecoder.IN_DECODER
-        with self._dynamic_rnn.block():
-            yield
-        self._status = TrainingDecoder.AFTER_DECODER
-        self._state_cell._leave_decoder(self)
-
-    @property
-    def state_cell(self):
-        self._assert_in_decoder_block('state_cell')
-        return self._state_cell
-
-    @property
-    def dynamic_rnn(self):
-        return self._dynamic_rnn
-
-    @property
-    def type(self):
-        return self._type
-
-    def step_input(self, x):
-        """
-        Set the input variable as a step input to the RNN cell. For example,
-        in machine translation, each time step we read one word from the target
-        sentences, then the target sentence is a step input to the RNN cell.
-
-        Args:
-            x (Variable): the variable to be used as step input.
-
-        Returns:
-            Variable: The variable as input of current step.
-
-        Examples:
-        .. code-block:: python
-          current_word = decoder.step_input(trg_embedding)
-        """
-        self._assert_in_decoder_block('step_input')
-        return self._dynamic_rnn.step_input(x)
-
-    def static_input(self, x):
-        """
-        Set the input variable as a static input of RNN cell. In contrast to
-        step input, this variable will be used as a whole within the RNN decode
-        loop and will not be scattered into time steps.
-
-        Args:
-            x (Variable): the variable to be used as static input.
-
-        Returns:
-            Variable: The variable as input of current step.
-
-        Examples:
-        .. code-block:: python
-          encoder_vec = decoder.static_input(encoded_vector)
-        """
-        self._assert_in_decoder_block('static_input')
-        return self._dynamic_rnn.static_input(x)
-
-    def __call__(self, *args, **kwargs):
-        """
-        Get the output of RNN. This API should only be invoked after RNN.block()
-
-        Returns:
-            Variable: The specified output of the RNN cell.
-        """
-        if self._status != TrainingDecoder.AFTER_DECODER:
-            raise ValueError('Output of training decoder can only be visited '
-                             'outside the block.')
-        return self._dynamic_rnn(*args, **kwargs)
-
-    def output(self, *outputs):
-        """
-        Set the output variable of the RNN cell.
-
-        Args:
-            *outputs (Variables): a series of variables that treated as output
-                of the RNN cell.
-
-        Examples:
-        .. code-block:: python
-          out = fluid.layers.fc(input=h,
-                                size=32,
-                                bias_attr=True,
-                                act='softmax')
-          decoder.output(out)
-        """
-        self._assert_in_decoder_block('output')
-        self._dynamic_rnn.output(*outputs)
-
-    def _assert_in_decoder_block(self, method):
-        if self._status != TrainingDecoder.IN_DECODER:
-            raise ValueError('%s should be invoked inside block of '
-                             'TrainingDecoder object.' % method)
-
-
-class BeamSearchDecoder(object):
-    """
-    A beam search decoder that can be used for inference. The decoder should be
-    initialized with a `StateCell` object. The decode process can be defined
-    within its block.
-
-    Args:
-        state_cell (StateCell): A StateCell object that handles the input and
-            state variables.
-        init_ids (Variable): The init beam search token ids.
-        init_scores (Variable): The associated score of each id.
-        target_dict_dim (int): Size of dictionary.
-        word_dim (int): Word embedding dimension.
-        input_var_dict (dict): A feeding dict to feed the required input
-            variables to the state cell. It will be used by state_cell 's
-            compute method. Default empty.
-        topk_size (int): The topk size used for beam search. Default 50.
-        max_len (int): The maximum allowed length of the generated sentence.
-            Default 100.
-        beam_size (int): The beam width of beam search decode. Default 1.
-        end_id (int): The id of end token within beam search.
-        name (str): The name of this decoder. Default None.
-
-    Returns:
-        BeamSearchDecoder: A initialized BeamSearchDecoder object.
-
-    Examples:
-    .. code-block:: python
-      decoder = BeamSearchDecoder(
-          state_cell=state_cell,
-          init_ids=init_ids,
-          init_scores=init_scores,
-          target_dict_dim=target_dict_dim,
-          word_dim=word_dim,
-          init_var_dict={},
-          topk_size=topk_size,
-          sparse_emb=IS_SPARSE,
-          max_len=max_length,
-          beam_size=beam_size,
-          end_id=1,
-          name=None
-      )
-      decoder.decode()
-      translation_ids, translation_scores = decoder()
-    """
-    BEFORE_BEAM_SEARCH_DECODER = 0
-    IN_BEAM_SEARCH_DECODER = 1
-    AFTER_BEAM_SEARCH_DECODER = 2
-
-    def __init__(self,
-                 state_cell,
-                 init_ids,
-                 init_scores,
-                 target_dict_dim,
-                 word_dim,
-                 input_var_dict={},
-                 topk_size=50,
-                 sparse_emb=True,
-                 max_len=100,
-                 beam_size=1,
-                 end_id=1,
-                 name=None):
-        self._helper = LayerHelper('beam_search_decoder', name=name)
-        self._counter = layers.zeros(shape=[1], dtype='int64')
-        self._counter.stop_gradient = True
-        self._type = _DecoderType.BEAM_SEARCH
-        self._max_len = layers.fill_constant(
-            shape=[1], dtype='int64', value=max_len)
-        self._cond = layers.less_than(
-            x=self._counter,
-            y=layers.fill_constant(
-                shape=[1], dtype='int64', value=max_len))
-        self._while_op = layers.While(self._cond)
-        self._state_cell = state_cell
-        self._state_cell._enter_decoder(self)
-        self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER
-        self._zero_idx = layers.fill_constant(
-            shape=[1], value=0, dtype='int64', force_cpu=True)
-        self._array_dict = {}
-        self._array_link = []
-        self._ids_array = None
-        self._scores_array = None
-        self._beam_size = beam_size
-        self._end_id = end_id
-
-        self._init_ids = init_ids
-        self._init_scores = init_scores
-        self._target_dict_dim = target_dict_dim
-        self._topk_size = topk_size
-        self._sparse_emb = sparse_emb
-        self._word_dim = word_dim
-        self._input_var_dict = input_var_dict
-
-    @signature_safe_contextmanager
-    def block(self):
-        """
-        Define the behavior of the decoder for each RNN time step.
-        """
-        if self._status != BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER:
-            raise ValueError('block() can only be invoke once.')
-
-        self._status = BeamSearchDecoder.IN_BEAM_SEARCH_DECODER
-
-        with self._while_op.block():
-            yield
-            with layers.Switch() as switch:
-                with switch.case(self._cond):
-                    layers.increment(x=self._counter, value=1.0, in_place=True)
-
-                    for value, array in self._array_link:
-                        layers.array_write(
-                            x=value, i=self._counter, array=array)
-
-                    layers.less_than(
-                        x=self._counter, y=self._max_len, cond=self._cond)
-
-        self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER
-        self._state_cell._leave_decoder(self)
-
-    @property
-    def type(self):
-        return self._type
-
-    def early_stop(self):
-        """
-        Stop the generation process in advance. Could be used as "break".
-        """
-        layers.fill_constant(
-            shape=[1], value=0, dtype='bool', force_cpu=True, out=self._cond)
-
-    def decode(self):
-        """
-        Set up the computation within the decoder. Then you could call the
-        decoder to get the result of beam search decode. If you want to define
-        a more specific decoder, you could override this function.
-
-        Examples:
-        .. code-block:: python
-          decoder.decode()
-          translation_ids, translation_scores = decoder()
-        """
-        with self.block():
-            prev_ids = self.read_array(init=self._init_ids, is_ids=True)
-            prev_scores = self.read_array(
-                init=self._init_scores, is_scores=True)
-            prev_ids_embedding = layers.embedding(
-                input=prev_ids,
-                size=[self._target_dict_dim, self._word_dim],
-                dtype='float32',
-                is_sparse=self._sparse_emb)
-
-            feed_dict = {}
-            update_dict = {}
-
-            for init_var_name, init_var in six.iteritems(self._input_var_dict):
-                if init_var_name not in self.state_cell._inputs:
-                    raise ValueError('Variable ' + init_var_name +
-                                     ' not found in StateCell!\n')
-
-                read_var = self.read_array(init=init_var)
-                update_dict[init_var_name] = read_var
-                feed_var_expanded = layers.sequence_expand(read_var,
-                                                           prev_scores)
-                feed_dict[init_var_name] = feed_var_expanded
-
-            for state_str in self._state_cell._state_names:
-                prev_state = self.state_cell.get_state(state_str)
-                prev_state_expanded = layers.sequence_expand(prev_state,
-                                                             prev_scores)
-                self.state_cell.set_state(state_str, prev_state_expanded)
-
-            for i, input_name in enumerate(self._state_cell._inputs):
-                if input_name not in feed_dict:
-                    feed_dict[input_name] = prev_ids_embedding
-
-            self.state_cell.compute_state(inputs=feed_dict)
-            current_state = self.state_cell.out_state()
-            current_state_with_lod = layers.lod_reset(
-                x=current_state, y=prev_scores)
-            scores = layers.fc(input=current_state_with_lod,
-                               size=self._target_dict_dim,
-                               act='softmax')
-            topk_scores, topk_indices = layers.topk(scores, k=self._topk_size)
-            accu_scores = layers.elementwise_add(
-                x=layers.log(x=topk_scores),
-                y=layers.reshape(
-                    prev_scores, shape=[-1]),
-                axis=0)
-            selected_ids, selected_scores = layers.beam_search(
-                prev_ids,
-                prev_scores,
-                topk_indices,
-                accu_scores,
-                self._beam_size,
-                end_id=1,
-                level=0)
-
-            with layers.Switch() as switch:
-                with switch.case(layers.is_empty(selected_ids)):
-                    self.early_stop()
-                with switch.default():
-                    self.state_cell.update_states()
-                    self.update_array(prev_ids, selected_ids)
-                    self.update_array(prev_scores, selected_scores)
-                    for update_name, var_to_update in six.iteritems(
-                            update_dict):
-                        self.update_array(var_to_update, feed_dict[update_name])
-
-    def read_array(self, init, is_ids=False, is_scores=False):
-        """
-        Read an array to get the decoded ids and scores generated by previous
-        RNN step. At the first step of RNN, the init variable mut be used to
-        initialize the array.
-
-        Args:
-            init (Variable): The initial variable for first step usage. init
-                must be provided.
-            is_ids (bool): Specify whether the variable is an id.
-            is_scores (bool): Specify whether the variable is a score.
-
-        Returns:
-            The associated variable generated during previous RNN steps.
-
-        Examples:
-            .. code-block:: python
-              prev_ids = decoder.read_array(init=init_ids, is_ids=True)
-              prev_scores = decoder.read_array(init=init_scores, is_scores=True)
-        """
-        self._assert_in_decoder_block('read_array')
-
-        if is_ids and is_scores:
-            raise ValueError('Shouldn\'t mark current array be ids array and'
-                             'scores array at the same time.')
-
-        if not isinstance(init, Variable):
-            raise TypeError('The input argument `init` must be a Variable.')
-
-        parent_block = self._parent_block()
-        array = parent_block.create_var(
-            name=unique_name.generate('beam_search_decoder_array'),
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-            dtype=init.dtype)
-        parent_block.append_op(
-            type='write_to_array',
-            inputs={'X': init,
-                    'I': self._zero_idx},
-            outputs={'Out': array})
-
-        if is_ids:
-            self._ids_array = array
-        elif is_scores:
-            self._scores_array = array
-
-        read_value = layers.array_read(array=array, i=self._counter)
-        self._array_dict[read_value.name] = array
-        return read_value
-
-    def update_array(self, array, value):
-        """
-        Store the value generated in current step in an array for each RNN step.
-        This array could be accessed by read_array method.
-
-        Args:
-            array (Variable): The array to append the new variable to.
-            value (Variable): The newly generated value to be stored.
-        """
-        self._assert_in_decoder_block('update_array')
-
-        if not isinstance(array, Variable):
-            raise TypeError(
-                'The input argument `array` of  must be a Variable.')
-        if not isinstance(value, Variable):
-            raise TypeError('The input argument `value` of must be a Variable.')
-
-        array = self._array_dict.get(array.name, None)
-        if array is None:
-            raise ValueError('Please invoke read_array before update_array.')
-        self._array_link.append((value, array))
-
-    def __call__(self):
-        """
-        Run the decode process and return the final decode result.
-
-        Returns:
-            A tuple of decoded (id, score) pairs. id is a Variable that holds
-            the generated tokens, and score is a Variable with the same shape
-            as id, holds the score for each generated token.
-        """
-        if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER:
-            raise ValueError('Output of BeamSearchDecoder object can '
-                             'only be visited outside the block.')
-        return layers.beam_search_decode(
-            ids=self._ids_array,
-            scores=self._scores_array,
-            beam_size=self._beam_size,
-            end_id=self._end_id)
-
-    @property
-    def state_cell(self):
-        self._assert_in_decoder_block('state_cell')
-        return self._state_cell
-
-    def _parent_block(self):
-        """
-        Getter of parent block.
-
-        Returns:
-            The parent block of decoder.
-        """
-        program = self._helper.main_program
-        parent_block_idx = program.current_block().parent_idx
-        if parent_block_idx < 0:
-            raise ValueError('Invalid block with index %d.' % parent_block_idx)
-        parent_block = program.block(parent_block_idx)
-        return parent_block
-
-    def _assert_in_decoder_block(self, method):
-        if self._status != BeamSearchDecoder.IN_BEAM_SEARCH_DECODER:
-            raise ValueError('%s should be invoked inside block of '
-                             'BeamSearchDecoder object.' % method)
diff --git a/python/paddle/fluid/contrib/extend_optimizer/__init__.py b/python/paddle/fluid/contrib/extend_optimizer/__init__.py
deleted file mode 100644
index 697ea0f05ae725cbda66e2568cf212bd69cb8787..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/extend_optimizer/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from . import extend_optimizer_with_weight_decay
-from .extend_optimizer_with_weight_decay import *
-
-__all__ = []
-__all__ += extend_optimizer_with_weight_decay.__all__
diff --git a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py b/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
deleted file mode 100644
index fcc99c07346eaa8adc58b0dc7ceca37a1fb72872..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
+++ /dev/null
@@ -1,152 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle.fluid
-from paddle.fluid import framework as framework
-
-__all__ = ["extend_with_decoupled_weight_decay"]
-
-
-class DecoupledWeightDecay(object):
-    def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
-        if not isinstance(coeff, float) and \
-                not isinstance(coeff, framework.Variable):
-            raise TypeError("coeff should be float or Variable.")
-        self._params_name = set()
-        self._apply_decay_param_fun = apply_decay_param_fun
-        self._coeff = coeff
-        super(DecoupledWeightDecay, self).__init__(**kwargs)
-
-    def _scale_parameters(self, params_and_grads):
-        """
-        Adds weight decay ops.
-            scaled_parameter = parameter * coeff
-
-        Args:
-            params_and_grads: A list of (parameters, gradients) pairs,
-                the parameters need to decay.
-        Raises:
-            Exception: The type of coeff and parameter is not consistent.
-        """
-        if isinstance(self._coeff, float) and self._coeff == 0.0:
-            return
-
-        scaled_params = []
-        for param, grad in params_and_grads:
-            # If no gradient then we don't need to do anything
-            if grad is None:
-                continue
-            if self._apply_decay_param_fun is not None \
-                    and not self._apply_decay_param_fun(param.name):
-                continue
-
-            if isinstance(self._coeff, float):
-                assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
-                    "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype)
-            else:
-                assert self._coeff.dtype == param.dtype, \
-                    "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
-
-            with param.block.program._optimized_guard(
-                [param, grad]), framework.name_scope('weight decay'):
-                assert param.name not in self._params_name
-                scaled_params.append((param, grad, param * self._coeff))
-                self._params_name.add(param.name)
-        return scaled_params
-
-    def backward(self, **kargs):
-        return super(DecoupledWeightDecay, self).backward(**kargs)
-
-    def apply_optimize(self, **kargs):
-        return super(DecoupledWeightDecay, self).apply_optimize(**kargs)
-
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        params_grads = self.backward(
-            loss=loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
-        scaled_params = self._scale_parameters(params_grads)
-        for p_grad_sgrad in scaled_params:
-            param, grad, scaled_param = p_grad_sgrad
-            with param.block.program._optimized_guard(
-                [param, grad]), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
-                    x=param, y=scaled_param)
-                paddle.fluid.layers.assign(input=updated_param, output=param)
-
-        optimize_ops = self.apply_optimize(
-            loss=loss,
-            params_grads=params_grads,
-            startup_program=startup_program)
-        return optimize_ops, params_grads
-
-    def __str__(self):
-        return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
-
-
-def extend_with_decoupled_weight_decay(base_optimizer):
-    """
-    extend_with_decoupled_weight_decay is a decorator function, it returns an
-    optimizer class with decoupled weight decay. The returned optimizer will
-    apply weight decay on the optimized parameters with the parameters before
-    optimization, i.e: new_parameter = optimized_parameter - parameter * coeff.
-    The details of decoupled weight decay yplease refer to this
-    `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
-
-    Args:
-        base_optimizer (Optimizer): The base_optimizer should be a derived class of Optimizer.
-
-    Returns:
-        OptimizerWithDecoupledWeightDecay: the optimizer with decouple weight decay.
-
-    Examples:
-
-      .. code-block:: python
-
-        AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
-            fluid.optimizer.Adam)
-        optimizer = AdamW(learning_rate=0.1,
-                          weight_decay=0.01)
-
-        optimizer.minimize(cost)
-    """
-    if not issubclass(base_optimizer, paddle.fluid.optimizer.Optimizer):
-        raise TypeError(
-            "The input(base_optimizer) should be a derived class of Optimizer.")
-
-    class OptimizerWithDecoupledWeightDecay(DecoupledWeightDecay,
-                                            base_optimizer):
-        """
-        OptimizerWithDecoupledWeightDecay is used to update the optimized parameters
-        with the parameters before optimization. For more information, please refer:
-        https://arxiv.org/pdf/1711.05101.pdf.
-
-        Args:
-            weight_decay (float|Variable): The weight decay coefficient, it can be
-                float or Variable.
-            apply_decay_param_fun (function|None): If it is not None,
-                only variables that makes apply_decay_param_fun(variable)==True
-                will be updated. It only works when we want to specify variables.
-                Default: None.
-        """
-
-        def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs):
-            super(OptimizerWithDecoupledWeightDecay, self).__init__(
-                weight_decay, apply_decay_param_fun, **kwargs)
-
-    return OptimizerWithDecoupledWeightDecay
diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py
deleted file mode 100644
index 4f37129234482189436ad71391f55394e2b8a277..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/inferencer.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from ..wrapped_decorator import signature_safe_contextmanager
-
-from .. import core
-
-from .. import executor
-from .. import framework
-from .. import io
-from .. import parallel_executor
-from .. import unique_name
-from .trainer import check_and_get_place
-
-__all__ = ['Inferencer', ]
-
-
-class Inferencer(object):
-    """
-    Inferencer High Level API.
-
-    Args:
-        infer_func (Python func): Infer function that will return predict Variable
-        param_path (str): The path where the inference model is saved by fluid.io.save_params
-        place (Place): place to do the inference
-        parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU.
-
-    Examples:
-        .. code-block:: python
-
-            def inference_program():
-                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-                y_predict = fluid.layers.fc(input=x, size=1, act=None)
-                return y_predict
-
-            place = fluid.CPUPlace()
-            inferencer = fluid.Inferencer(
-                infer_func=inference_program, param_path="/tmp/model", place=place)
-
-    """
-
-    def __init__(self, infer_func, param_path, place=None, parallel=False):
-        self.param_path = param_path
-        self.scope = core.Scope()
-        self.parallel = parallel
-        self.place = check_and_get_place(place)
-
-        self.inference_program = framework.Program()
-        with framework.program_guard(self.inference_program):
-            with unique_name.guard():
-                self.predict_var = infer_func()
-
-        with self._prog_and_scope_guard():
-            # load params from param_path into scope
-            io.load_params(executor.Executor(self.place), param_path)
-
-        if parallel:
-            with self._prog_and_scope_guard():
-                self.exe = parallel_executor.ParallelExecutor(
-                    use_cuda=isinstance(self.place, core.CUDAPlace),
-                    loss_name=self.predict_var.name)
-        else:
-            self.exe = executor.Executor(self.place)
-
-        self.inference_program = self.inference_program.clone(for_test=True)
-
-    def infer(self, inputs, return_numpy=True):
-        """
-        Do Inference for Inputs
-
-        Args:
-            inputs (map): a map of {"input_name": input_var} that will be feed into the inference program
-            return_numpy (bool): transform return value into numpy or not
-
-        Returns:
-            Tensor or Numpy: the predict value of the inference model for the inputs
-
-        Examples:
-            .. code-block:: python
-
-                tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
-                results = inferencer.infer({'x': tensor_x})
-        """
-        if not isinstance(inputs, dict):
-            raise ValueError(
-                "inputs should be a map of {'input_name': input_var}")
-
-        with self._prog_and_scope_guard():
-            results = self.exe.run(feed=inputs,
-                                   fetch_list=[self.predict_var.name],
-                                   return_numpy=return_numpy)
-
-        return results
-
-    @signature_safe_contextmanager
-    def _prog_and_scope_guard(self):
-        with framework.program_guard(main_program=self.inference_program):
-            with executor.scope_guard(self.scope):
-                yield
diff --git a/python/paddle/fluid/contrib/layers/__init__.py b/python/paddle/fluid/contrib/layers/__init__.py
deleted file mode 100644
index 94889a65b3620f730dcd39c911599f50acbfe614..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/layers/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import nn
-from .nn import *
-
-from .rnn_impl import *
-from . import metric_op
-from .metric_op import *
-
-__all__ = []
-__all__ += nn.__all__
-__all__ += rnn_impl.__all__
-__all__ += metric_op.__all__
diff --git a/python/paddle/fluid/contrib/layers/metric_op.py b/python/paddle/fluid/contrib/layers/metric_op.py
deleted file mode 100644
index f76a3283f2f81880fce5cd8b8fa4fc46434fd165..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/layers/metric_op.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Contrib layers just related to metric.
-"""
-
-from __future__ import print_function
-
-import warnings
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.initializer import Normal, Constant
-from paddle.fluid.framework import Variable
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid.layers import nn
-
-__all__ = ['ctr_metric_bundle']
-
-
-def ctr_metric_bundle(input, label):
-    """
-    ctr related metric layer
-
-    This function help compute the ctr related metrics: RMSE, MAE, predicted_ctr, q_value.
-    To compute the final values of these metrics, we should do following computations using
-    total instance number:
-    MAE = local_abserr / instance number
-    RMSE = sqrt(local_sqrerr / instance number)
-    predicted_ctr = local_prob / instance number
-    q = local_q / instance number
-    Note that if you are doing distribute job, you should all reduce these metrics and instance
-    number first
-
-    Args:
-        input(Variable): A floating-point 2D Variable, values are in the range
-                         [0, 1]. Each row is sorted in descending order. This
-                         input should be the output of topk. Typically, this
-                         Variable indicates the probability of each label.
-        label(Variable): A 2D int Variable indicating the label of the training
-                         data. The height is batch size and width is always 1.
-
-    Returns:
-        local_sqrerr(Variable): Local sum of squared error
-        local_abserr(Variable): Local sum of abs error
-        local_prob(Variable): Local sum of predicted ctr
-        local_q(Variable): Local sum of q value
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            label = fluid.layers.data(name="label", shape=[1], dtype="int32")
-            predict = fluid.layers.sigmoid(fluid.layers.fc(input=data, size=1))
-            auc_out = fluid.contrib.layers.ctr_metric_bundle(input=predict, label=label)
-    """
-    assert input.shape == label.shape
-    helper = LayerHelper("ctr_metric_bundle", **locals())
-
-    local_abserr = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_sqrerr = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_prob = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_q = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_pos_num = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_ins_num = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-
-    tmp_res_elesub = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[-1])
-    tmp_res_sigmoid = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[-1])
-    tmp_ones = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[-1])
-
-    batch_prob = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_abserr = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_sqrerr = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_q = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_pos_num = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_ins_num = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    for var in [
-            local_abserr, batch_abserr, local_sqrerr, batch_sqrerr, local_prob,
-            batch_prob, local_q, batch_q, batch_pos_num, batch_ins_num,
-            local_pos_num, local_ins_num
-    ]:
-        helper.set_variable_initializer(
-            var, Constant(
-                value=0.0, force_cpu=True))
-
-    helper.append_op(
-        type="elementwise_sub",
-        inputs={"X": [input],
-                "Y": [label]},
-        outputs={"Out": [tmp_res_elesub]})
-
-    helper.append_op(
-        type="squared_l2_norm",
-        inputs={"X": [tmp_res_elesub]},
-        outputs={"Out": [batch_sqrerr]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_sqrerr],
-                "Y": [local_sqrerr]},
-        outputs={"Out": [local_sqrerr]})
-
-    helper.append_op(
-        type="l1_norm",
-        inputs={"X": [tmp_res_elesub]},
-        outputs={"Out": [batch_abserr]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_abserr],
-                "Y": [local_abserr]},
-        outputs={"Out": [local_abserr]})
-
-    helper.append_op(
-        type="reduce_sum", inputs={"X": [input]},
-        outputs={"Out": [batch_prob]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_prob],
-                "Y": [local_prob]},
-        outputs={"Out": [local_prob]})
-    helper.append_op(
-        type="sigmoid",
-        inputs={"X": [input]},
-        outputs={"Out": [tmp_res_sigmoid]})
-    helper.append_op(
-        type="reduce_sum",
-        inputs={"X": [tmp_res_sigmoid]},
-        outputs={"Out": [batch_q]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_q],
-                "Y": [local_q]},
-        outputs={"Out": [local_q]})
-
-    helper.append_op(
-        type="reduce_sum",
-        inputs={"X": [label]},
-        outputs={"Out": [batch_pos_num]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_pos_num],
-                "Y": [local_pos_num]},
-        outputs={"Out": [local_pos_num]})
-
-    helper.append_op(
-        type='fill_constant_batch_size_like',
-        inputs={"Input": label},
-        outputs={'Out': [tmp_ones]},
-        attrs={
-            'shape': [-1, 1],
-            'dtype': tmp_ones.dtype,
-            'value': float(1.0),
-        })
-    helper.append_op(
-        type="reduce_sum",
-        inputs={"X": [tmp_ones]},
-        outputs={"Out": [batch_ins_num]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_ins_num],
-                "Y": [local_ins_num]},
-        outputs={"Out": [local_ins_num]})
-
-    return local_sqrerr, local_abserr, local_prob, local_q, local_pos_num, local_ins_num
diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py
deleted file mode 100644
index bd8ea98e414a3ed40ab1191a76f435197add528c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ /dev/null
@@ -1,429 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Contrib layers just related to the neural network.
-"""
-
-from __future__ import print_function
-
-import numpy as np
-import six
-import os
-import inspect
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.layers import utils
-
-__all__ = [
-    'fused_elemwise_activation',
-    'sequence_topk_avg_pooling',
-    'var_conv_2d',
-    'match_matrix_tensor',
-    'tree_conv',
-]
-
-
-def fused_elemwise_activation(x,
-                              y,
-                              functor_list,
-                              axis=-1,
-                              scale=0.0,
-                              save_intermediate_out=True):
-    """
-    **Fused elementwise_add/mul and activation layers**
-
-    This function computes an elementwise_add/mul cooperated with an activation.
-
-    .. math::
-
-        out = Unary(Binary(x, y))
-
-    or
-
-    .. math::
-
-        out = Binary(x, Unary(y))
-
-    Unary operators can be: `scale`, `relu`, `tanh`. Binary operators can be:
-    `elementwise_add`, `elementwise_mul`.
-
-    Args:
-        x (Variable): left operation of the binary operator.
-        y (Variable): right operator of the binary operator.
-        functor_list (list of str): types of operator which will be executed
-            by this layer. For example, ['elementwise_add', 'relu']
-            (out = elementwise_add(x, relu(y))),
-            or ['relu', 'elemmentwise_add'] (out = relu(elementwise_add(x, y))).
-        axis (int32, default -1): axis of elementwise op.
-        scale (float32, default 0): parameter of scale op.
-        save_intermediate_out (bool, default True): whether to save the
-            intermediate result, Unary(y) or Binary(x, y).
-
-    Returns:
-        Variable: The computation result.
-    """
-    if isinstance(functor_list, str):
-        functor_list = functor_list.split(',')
-
-    if not isinstance(functor_list, list) or len(functor_list) != 2:
-        raise ValueError(
-            'functor_list should be a list of str, and the length should be 2.')
-
-    helper = LayerHelper('fused_elemwise_activation', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    intermediate_out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='fused_elemwise_activation',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out,
-                 'IntermediateOut': intermediate_out},
-        attrs={
-            'axis': axis,
-            'scale': scale,
-            'save_intermediate_out': save_intermediate_out,
-            'functor_list': functor_list
-        })
-    return out
-
-
-def var_conv_2d(input,
-                row,
-                col,
-                input_channel,
-                output_channel,
-                filter_size,
-                stride=1,
-                param_attr=None,
-                act=None,
-                dtype='float32',
-                name=None):
-    """
-    The var_conv_2d layer calculates the output base on the :attr:`input` with variable length,
-    row, col, input channel, filter size and strides. Both :attr:`input`, :attr:`row`,
-    and :attr:`col` are 1-level LodTensor. The covolution operation is same as conv2d layer with 
-    padding. Besides, input.dims[1] should be 1. 
-
-    .. code-block:: text
-            
-            If input_channel is 2 and given row lodTensor and col lodTensor as follows:
-                row.lod = [[5, 4]]
-                col.lod = [[6, 7]]
-            input is a lodTensor: 
-                input.lod = [[60, 56]]	# where 60 = input_channel * 5 * 6
-                input.dims = [116, 1]	# where 116 = 60 + 56
-            
-            If set output_channel is 3, filter_size is [3, 3], stride is [1, 1]:
-                output.lod = [[90, 84]] # where 90 = output_channel * [(5-1)/stride + 1] * [(6-1)/stride + 1]
-                output.dims = [174, 1]  # where 174 = 90 + 84
-
-    Args:
-        input (Variable): The input shoud be 1-level LodTensor with dims[1] equals 1.
-        row (Variable): The row shoud be 1-level LodTensor to provide height information.
-        col (Variable): The col shoud be 1-level LodTensor to provide width information.
-        input_channel (int): The number of input channel.
-        output_channel (int): The number of output channel.
-        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of var_conv2d. If it is set to None or one attribute of ParamAttr, var_conv2d
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None
-        dtype ('float32'): The data type of parameter and output.
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None
-
-    Returns:
-        Variable: Output variable with LoD specified by this layer.
-
-    Examples:
-        .. code-block:: python
-
-            import numpy as np
-            from paddle.fluid import layers
-            from paddle.fluid import contrib
-
-            x_lod_tensor = layers.data(name='x', shape=[1], lod_level=1)
-            row_lod_tensor = layers.data(name='row', shape=[6], lod_level=1)
-            col_lod_tensor = layers.data(name='col', shape=[6], lod_level=1)
-            out = contrib.var_conv_2d(input=x_lod_tensor, 
-                                     row=row_lod_tensor,
-                                     col=col_lod_tensor,
-                                     input_channel=3,
-                                     output_channel=5,
-                                     filter_size=[3, 3],
-                                     stride=1)
-    """
-    helper = LayerHelper('var_conv_2d', **locals())
-    x_shape = list(input.shape)
-    assert len(x_shape) == 2
-
-    filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
-    stride = utils.convert_to_list(stride, 2, 'stride')
-
-    filter_shape = [
-        int(output_channel),
-        int(input_channel) * filter_size[0] * filter_size[1]
-    ]
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=filter_shape,
-        dtype=dtype, )
-
-    conv_res = helper.create_variable_for_type_inference(dtype)
-    tmp_res = helper.create_variable_for_type_inference(
-        dtype, stop_gradient=True)
-
-    helper.append_op(
-        type='var_conv_2d',
-        inputs={
-            'X': input,
-            'ROW': row,
-            'COLUMN': col,
-            'W': filter_param,
-        },
-        outputs={"Out": conv_res,
-                 "Col": tmp_res},
-        attrs={
-            'InputChannel': input_channel,
-            'OutputChannel': output_channel,
-            'StrideH': stride[0],
-            'StrideW': stride[1],
-            'KernelH': filter_size[0],
-            'KernelW': filter_size[1],
-        })
-
-    return helper.append_activation(conv_res)
-
-
-def match_matrix_tensor(x,
-                        y,
-                        channel_num,
-                        act=None,
-                        param_attr=None,
-                        dtype='float32',
-                        name=None):
-    """
-    Calculate the semantic matching matrix of two word sequences with variable length.
-    Given a query A of length `n` and a title B of length `m`, the input shape are respectively
-    [n, h] and [m, h], which h is hidden_size. If :attr:`channel_num` is set to 3,
-    it will generate a learnable parameter matrix W with shape [h, 3, h].
-    Then the semantic matching matrix of query A and title B is calculated by 
-    A * W * B.T = [n, h]*[h, 3, h]*[h, m] = [n, 3, m]. The learnable parameter matrix `W` 
-    is equivalent to a fully connected layer in the calculation process. If :attr:`act` is provided, 
-    the corresponding activation function will be applied to output matrix.
-    The :attr:`x` and :attr:`y` should be LodTensor and only one level LoD is supported.
-
-    .. code-block:: text
-
-            Given a 1-level LoDTensor x:
-                x.lod =  [[2,                     3,                               ]]
-                x.data = [[0.3, 0.1], [0.2, 0.3], [0.5, 0.6], [0.7, 0.1], [0.3, 0.4]]
-                x.dims = [5, 2]
-            y is a Tensor:
-                y.lod =  [[3,                                 1,       ]]
-                y.data = [[0.1, 0.2], [0.3, 0.7], [0.9, 0.2], [0.4, 0.1]]
-                y.dims = [4, 2]
-            set channel_num 2, then we get a 1-level LoDTensor:
-                out.lod =  [[12, 6]]   # where 12 = channel_num * x.lod[0][0] * y.lod[0][0]
-                out.dims = [18, 1]     # where 18 = 12 + 6
-
-    Args:
-        x (Variable): Input variable x which should be 1-level LodTensor.
-        y (Variable): Input variable y which should be 1-level LodTensor.
-        channel_num (int): The channel number of learnable parameter W.
-        act (str, default None): Activation to be applied to the output of this layer.
-        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
-            parameters/weights of this layer.
-        dtype ('float32'): The data type of w data.
-        name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. Default: None
-
-    Returns:
-        Variable: output with LoD specified by this layer.
-
-    Examples:
-        .. code-block:: python
-
-            import numpy as np
-            from paddle.fluid import layers
-            from paddle.fluid import contrib
-
-            x_lod_tensor = layers.data(name='x', shape=[10], lod_level=1)
-            y_lod_tensor = layers.data(name='y', shape=[10], lod_level=1)
-            out, out_tmp = contrib.match_matrix_tensor(x=x_lod_tensor, y=y_lod_tensor, channel_num=3)
-    """
-    helper = LayerHelper('match_matrix_tensor', **locals())
-
-    x_shape = list(x.shape)
-    y_shape = list(y.shape)
-    assert len(x_shape) == 2 and len(y_shape) == 2 and x_shape[-1] == y_shape[
-        -1]
-
-    weight_shape = [x_shape[-1], channel_num, y_shape[-1]]
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=weight_shape, dtype=dtype, is_bias=False)
-    mm_res = helper.create_variable_for_type_inference(dtype)
-    tmp_res = helper.create_variable_for_type_inference(
-        dtype, stop_gradient=True)
-    helper.append_op(
-        type='match_matrix_tensor',
-        inputs={
-            'X': x,
-            'Y': y,
-            'W': w,
-        },
-        outputs={"Out": mm_res,
-                 "Tmp": tmp_res},
-        attrs={'dim_t': channel_num})
-
-    return helper.append_activation(mm_res), tmp_res
-
-
-def sequence_topk_avg_pooling(input, row, col, topks, channel_num):
-    """
-    The :attr:`topks` is a list with incremental values in this function. For each topk,
-    it will average the topk features as an output feature for each channel of every 
-    input sequence. Both :attr:`row` and :attr:`col` are LodTensor, which provide height 
-    and width information for :attr:`input` tensor. If feature size of input sequence is less 
-    than topk, it will padding 0 at the back.
-
-    .. code-block:: text
-
-            If channel_num is 2 and given row LoDTensor and col LoDTensor as follows:
-                row.lod = [[5, 4]]
-                col.lod = [[6, 7]]
-
-            input is a LoDTensor with input.lod[0][i] = channel_num * row.lod[0][i] * col.lod[0][i] 
-                input.lod = [[60, 56]]  # where 60 = channel_num * 5 * 6
-                input.dims = [116, 1]   # where 116 = 60 + 56
-
-            If topks is [1, 3, 5], then we get a 1-level LoDTensor:
-                out.lod =  [[5, 4]] 	# share Lod info with row LodTensor
-                out.dims = [9, 6]   	# where 6 = len(topks) * channel_num
-
-    Args:
-        input (Variable): The input should be 2D LodTensor with dims[1] equals 1.
-        row (Variable): The row shoud be 1-level LodTensor to provide the height information
-                        of the input tensor data.
-        col (Variable): The col shoud be 1-level LodTensor to provide the width information
-                        of the input tensor data.
-        topks (list): A list of incremental value to average the topk feature.
-        channel_num (int): The number of input channel.
-
-    Returns:
-        Variable: output LodTensor specified by this layer.
-
-    Examples:
-
-        .. code-block:: python
-
-            import numpy as np
-            from paddle.fluid import layers
-            from paddle.fluid import contrib
-
-            x_lod_tensor = layers.data(name='x', shape=[1], lod_level=1)
-            row_lod_tensor = layers.data(name='row', shape=[6], lod_level=1)
-            col_lod_tensor = layers.data(name='col', shape=[6], lod_level=1)
-            out = contrib.sequence_topk_avg_pooling(input=x_lod_tensor,
-                                                   row=row_lod_tensor,
-                                                   col=col_lod_tensor,
-                                                   topks=[1, 3, 5],
-                                                   channel_num=5)
-    """
-    helper = LayerHelper('sequence_topk_avg_pooling', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    pos = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype(), stop_gradient=True)
-    helper.append_op(
-        type='sequence_topk_avg_pooling',
-        inputs={'X': input,
-                'ROW': row,
-                'COLUMN': col},
-        outputs={'Out': out,
-                 'pos': pos},
-        attrs={'topks': topks,
-               'channel_num': channel_num})
-
-    return out
-
-
-def tree_conv(nodes_vector,
-              edge_set,
-              output_size,
-              num_filters=1,
-              max_depth=2,
-              act='tanh',
-              param_attr=None,
-              bias_attr=None,
-              name=None):
-    """ 
-    ${comment}
-    		
-    Args:
-        nodes_vector(${nodes_vector_type}): ${nodes_vector_comment}
-        edge_set(${edge_set_type}): ${edge_set_comment}
-        output_size(int): output feature width
-        num_filters(int): number of filters, Default 1
-        max_depth(int): max depth of filters, Default 2
-        act(str): activation function, Default tanh
-        param_attr(ParamAttr): the parameter attribute for the filters, Default None
-        bias_attr(ParamAttr): the parameter attribute for the bias of this layer, Default None
-        name(str): a name of this layer(optional). If set None, the layer will be named automatically, Default None
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          # 10 for max_node_size of dataset, 5 for vector width
-          nodes_vector = fluid.layers.data(name='vectors', shape=[10, 5], dtype='float32')
-          # 10 for max_node_size of dataset, 2 for every edge has two nodes
-          # edges must be directional
-          edge_set = fluid.layers.data(name='edge_set', shape=[10, 2], dtype='float32')
-          # the shape of output will be [10, 6, 1],
-          # 10 for max_node_size of dataset, 6 for output size, 1 for 1 filter
-          out_vector = fluid.layers.tree_conv(nodes_vector, edge_set, 6, 1, 2)
-          # After reshape, output tensor could be nodes_vector for next tree convolution
-          out_vector = fluid.layers.reshape(out_vector, shape=[-1, 10, 6])
-          out_vector_2 = fluid.layers.tree_conv(out_vector, edge_set, 3, 4, 2)
-          # also output tensor could be pooling(the pooling in paper called global pooling)
-          pooled = fluid.layers.reduce_max(out_vector, dim=2) # global pooling
-    """
-    helper = LayerHelper("tree_conv", **locals())
-    dtype = helper.input_dtype('nodes_vector')
-    feature_size = nodes_vector.shape[2]
-    W_shape = [feature_size, 3, output_size, num_filters]
-    W = helper.create_parameter(
-        attr=param_attr, shape=W_shape, dtype=dtype, is_bias=False)
-    out = helper.create_variable_for_type_inference(dtype=dtype)
-    helper.append_op(
-        type='tree_conv',
-        inputs={'NodesVector': nodes_vector,
-                'EdgeSet': edge_set,
-                'Filter': W},
-        outputs={'Out': out, },
-        attrs={'max_depth': max_depth})
-    if helper.bias_attr:
-        pre_activation = helper.append_bias_op(out)
-    else:
-        pre_activation = out
-    return helper.append_activation(pre_activation)
diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py
deleted file mode 100644
index e6a868ada37ab9fb27f973b4bfe648387bb4279f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/layers/rnn_impl.py
+++ /dev/null
@@ -1,743 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.fluid import layers
-from paddle.fluid.dygraph import Layer
-from paddle.fluid.layers.control_flow import StaticRNN
-
-__all__ = ['BasicGRUUnit', 'basic_gru', 'BasicLSTMUnit', 'basic_lstm']
-
-
-class BasicGRUUnit(Layer):
-    """
-    ****
-    BasicGRUUnit class, using basic operators to build GRU
-    The algorithm can be described as the equations below.
-
-        .. math::
-            u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)
-
-            r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)
-
-            m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
-
-    Args:
-        name_scope(string) : The name scope used to identify parameters and biases
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, gru_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of GRU unit.
-            If it is set to None or one attribute of ParamAttr, gru_unit will 
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cell (actNode).
-                             Default: 'fluid.layers.tanh'
-        dtype(string): data type used in this unit
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid.layers as layers
-            from paddle.fluid.contrib.layers import BasicGRUUnit
-
-            input_size = 128
-            hidden_size = 256
-            input = layers.data( name = "input", shape = [-1, input_size], dtype='float32')
-            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
-
-            gru_unit = BasicGRUUnit( "gru_unit", hidden_size )
-
-            new_hidden = gru_unit( input, pre_hidden )
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 hidden_size,
-                 param_attr=None,
-                 bias_attr=None,
-                 gate_activation=None,
-                 activation=None,
-                 dtype='float32'):
-        super(BasicGRUUnit, self).__init__(name_scope, dtype)
-
-        self._name = name_scope
-        self._hiden_size = hidden_size
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._gate_activation = gate_activation or layers.sigmoid
-        self._activation = activation or layers.tanh
-        self._dtype = dtype
-
-    def _build_once(self, input, pre_hidden):
-        self._input_size = input.shape[-1]
-        assert (self._input_size > 0)
-
-        self._gate_weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._input_size + self._hiden_size, 2 * self._hiden_size],
-            dtype=self._dtype)
-
-        self._candidate_weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._input_size + self._hiden_size, self._hiden_size],
-            dtype=self._dtype)
-
-        self._gate_bias = self.create_parameter(
-            self._bias_attr,
-            shape=[2 * self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
-        self._candidate_bias = self.create_parameter(
-            self._bias_attr,
-            shape=[self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input, pre_hidden):
-        concat_input_hidden = layers.concat([input, pre_hidden], 1)
-
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
-
-        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
-
-        gate_input = self._gate_activation(gate_input)
-        r, u = layers.split(gate_input, num_or_sections=2, dim=1)
-
-        r_hidden = r * pre_hidden
-
-        candidate = layers.matmul(
-            layers.concat([input, pre_hidden], 1), self._candidate_weight)
-        candidate = layers.elementwise_add(candidate, self._candidate_bias)
-
-        c = self._activation(candidate)
-        new_hidden = u * pre_hidden + (1 - u) * c
-
-        return new_hidden
-
-
-def basic_gru(input,
-              init_hidden,
-              hidden_size,
-              num_layers=1,
-              sequence_length=None,
-              dropout_prob=0.0,
-              bidirectional=False,
-              batch_first=True,
-              param_attr=None,
-              bias_attr=None,
-              gate_activation=None,
-              activation=None,
-              dtype='float32',
-              name='basic_gru'):
-    """
-    GRU implementation using basic operator, supports multiple layers and bidirection gru.
-
-    .. math::
-            u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)
-
-            r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)
-
-            m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
-
-    Args:
-        input (Variable): GRU input tensor, 
-                       if batch_first = False, shape should be ( seq_len x batch_size x input_size )  
-                       if batch_first = True, shape should be ( batch_size x seq_len x hidden_size )
-        init_hidden(Variable|None): The initial hidden state of the GRU
-                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
-                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-                       and can be reshaped to tensor with ( num_layers x 2 x batch_size x hidden_size) to use.
-                       If it's None, it will be set to all 0.
-        hidden_size (int): Hidden size of the GRU
-        num_layers (int): The total number of layers of the GRU
-        sequence_length (Variabe|None): A Tensor (shape [batch_size]) stores each real length of each instance,
-                        This tensor will be convert to a mask to mask the padding ids
-                        If it's None means NO padding ids
-        dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of earch layers, 
-                             NOT between time steps
-        bidirectional (bool|False): If it is bidirectional
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, gru_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of GRU unit.
-            If it is set to None or one attribute of ParamAttr, gru_unit will 
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cell (actNode).
-                             Default: 'fluid.layers.tanh'
-        dtype(string): data type used in this unit
-        name(string): name used to identify parameters and biases
-
-    Returns:
-        rnn_out(Tensor),last_hidden(Tensor)
-            - rnn_out is result of GRU hidden, with shape (seq_len x batch_size x hidden_size) \
-              if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
-            - last_hidden is the hidden state of the last step of GRU \
-              shape is ( num_layers x batch_size x hidden_size ) \
-              if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size),
-              can be reshaped to a tensor with shape( num_layers x 2 x batch_size x hidden_size)
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid.layers as layers
-            from paddle.fluid.contrib.layers import basic_gru
-
-            batch_size = 20
-            input_size = 128
-            hidden_size = 256
-            num_layers = 2
-            dropout = 0.5
-            bidirectional = True
-            batch_first = False
-
-            input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32')
-            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
-            sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32')
-
-
-            rnn_out, last_hidden = basic_gru( input, pre_hidden, hidden_size, num_layers = num_layers, \
-                    sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \
-                    batch_first = batch_first)
-
-    """
-
-    fw_unit_list = []
-
-    for i in range(num_layers):
-        new_name = name + "_layers_" + str(i)
-        fw_unit_list.append(
-            BasicGRUUnit(new_name, hidden_size, param_attr, bias_attr,
-                         gate_activation, activation, dtype))
-    if bidirectional:
-        bw_unit_list = []
-
-        for i in range(num_layers):
-            new_name = name + "_reverse_layers_" + str(i)
-            bw_unit_list.append(
-                BasicGRUUnit(new_name, hidden_size, param_attr, bias_attr,
-                             gate_activation, activation, dtype))
-
-    if batch_first:
-        input = layers.transpose(input, [1, 0, 2])
-
-    mask = None
-    if sequence_length:
-        max_seq_len = layers.shape(input)[0]
-        mask = layers.sequence_mask(
-            sequence_length, maxlen=max_seq_len, dtype='float32')
-        mask = layers.transpose(mask, [1, 0])
-
-    direc_num = 1
-    if bidirectional:
-        direc_num = 2
-    if init_hidden:
-        init_hidden = layers.reshape(
-            init_hidden, shape=[num_layers, direc_num, -1, hidden_size])
-
-    def get_single_direction_output(rnn_input,
-                                    unit_list,
-                                    mask=None,
-                                    direc_index=0):
-        rnn = StaticRNN()
-        with rnn.step():
-            step_input = rnn.step_input(rnn_input)
-
-            if mask:
-                step_mask = rnn.step_input(mask)
-
-            for i in range(num_layers):
-                if init_hidden:
-                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
-                else:
-                    pre_hidden = rnn.memory(
-                        batch_ref=rnn_input,
-                        shape=[-1, hidden_size],
-                        ref_batch_dim_idx=1)
-
-                new_hidden = unit_list[i](step_input, pre_hidden)
-
-                if mask:
-                    new_hidden = layers.elementwise_mul(
-                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
-                            pre_hidden, (step_mask - 1), axis=0)
-                rnn.update_memory(pre_hidden, new_hidden)
-
-                rnn.step_output(new_hidden)
-
-                step_input = new_hidden
-                if dropout_prob != None and dropout_prob > 0.0:
-                    step_input = layers.dropout(
-                        step_input,
-                        dropout_prob=dropout_prob, )
-
-            rnn.step_output(step_input)
-
-        rnn_out = rnn()
-
-        last_hidden_array = []
-        rnn_output = rnn_out[-1]
-        for i in range(num_layers):
-            last_hidden = rnn_out[i]
-            last_hidden = last_hidden[-1]
-            last_hidden_array.append(last_hidden)
-
-        last_hidden_output = layers.concat(last_hidden_array, axis=0)
-        last_hidden_output = layers.reshape(
-            last_hidden_output, shape=[num_layers, -1, hidden_size])
-
-        return rnn_output, last_hidden_output
-        # seq_len, batch_size, hidden_size
-
-    fw_rnn_out, fw_last_hidden = get_single_direction_output(
-        input, fw_unit_list, mask, direc_index=0)
-
-    if bidirectional:
-        bw_input = layers.reverse(input, axis=[0])
-        bw_mask = None
-        if mask:
-            bw_mask = layers.reverse(mask, axis=[0])
-        bw_rnn_out, bw_last_hidden = get_single_direction_output(
-            bw_input, bw_unit_list, bw_mask, direc_index=1)
-
-        bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])
-
-        rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2)
-        last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1)
-
-        last_hidden = layers.reshape(
-            last_hidden, shape=[num_layers * direc_num, -1, hidden_size])
-
-        if batch_first:
-            rnn_out = layers.transpose(rnn_out, [1, 0, 2])
-        return rnn_out, last_hidden
-    else:
-
-        rnn_out = fw_rnn_out
-        last_hidden = fw_last_hidden
-
-        if batch_first:
-            rnn_out = fluid.layser.transpose(rnn_out, [1, 0, 2])
-
-        return rnn_out, last_hidden
-
-
-def basic_lstm(input,
-               init_hidden,
-               init_cell,
-               hidden_size,
-               num_layers=1,
-               sequence_length=None,
-               dropout_prob=0.0,
-               bidirectional=False,
-               batch_first=True,
-               param_attr=None,
-               bias_attr=None,
-               gate_activation=None,
-               activation=None,
-               forget_bias=1.0,
-               dtype='float32',
-               name='basic_lstm'):
-    """
-    LSTM implementation using basic operators, supports multiple layers and bidirection LSTM.
-
-    .. math::
-           i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
-
-           f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
-
-           o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
-
-           \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
-
-           c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-
-           h_t &= o_t \odot tanh(c_t)
-
-    Args:
-        input (Variable): lstm input tensor, 
-                       if batch_first = False, shape should be ( seq_len x batch_size x input_size )  
-                       if batch_first = True, shape should be ( batch_size x seq_len x hidden_size )
-        init_hidden(Variable|None): The initial hidden state of the LSTM
-                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
-                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-                       and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use.
-                       If it's None, it will be set to all 0.
-        init_cell(Variable|None): The initial hidden state of the LSTM
-                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
-                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-                       and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use.
-                       If it's None, it will be set to all 0.
-        hidden_size (int): Hidden size of the LSTM
-        num_layers (int): The total number of layers of the LSTM
-        sequence_length (Variabe|None): A tensor (shape [batch_size]) stores each real length of each instance,
-                        This tensor will be convert to a mask to mask the padding ids
-                        If it's None means NO padding ids
-        dropout_prob(float|0.0): Dropout prob, dropout ONLY work after rnn output of earch layers, 
-                             NOT between time steps
-        bidirectional (bool|False): If it is bidirectional
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of LSTM unit.
-            If it is set to None or one attribute of ParamAttr, lstm_unit will 
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cell (actNode).
-                             Default: 'fluid.layers.tanh'
-        forget_bias (float|1.0) : Forget bias used to compute the forget gate
-        dtype(string): Data type used in this unit
-        name(string): Name used to identify parameters and biases
-
-    Returns:
-        rnn_out(Tensor), last_hidden(Tensor), last_cell(Tensor)
-            - rnn_out is the result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
-              if is_bidirec set to True, it's shape will be ( seq_len x batch_sze x hidden_size*2)
-            - last_hidden is the hidden state of the last step of LSTM \
-              with shape ( num_layers x batch_size x hidden_size ) \
-              if is_bidirec set to True, it's shape will be ( num_layers*2 x batch_size x hidden_size),
-              and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size)  to use.
-            - last_cell is the hidden state of the last step of LSTM \
-              with shape ( num_layers x batch_size x hidden_size ) \
-              if is_bidirec set to True, it's shape will be ( num_layers*2 x batch_size x hidden_size),
-              and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size)  to use.
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid.layers as layers
-            from paddle.fluid.contrib.layers import basic_lstm
-
-            batch_size = 20
-            input_size = 128
-            hidden_size = 256
-            num_layers = 2
-            dropout = 0.5
-            bidirectional = True
-            batch_first = False
-
-            input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32')
-            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
-            pre_cell = layers.data( name = "pre_cell", shape=[-1, hidden_size], dtype='float32')
-            sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32')
-
-            rnn_out, last_hidden, last_cell = basic_lstm( input, pre_hidden, pre_cell, \
-                    hidden_size, num_layers = num_layers, \
-                    sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \
-                    batch_first = batch_first)
-
-    """
-    fw_unit_list = []
-
-    for i in range(num_layers):
-        new_name = name + "_layers_" + str(i)
-        fw_unit_list.append(
-            BasicLSTMUnit(
-                new_name,
-                hidden_size,
-                param_attr=param_attr,
-                bias_attr=bias_attr,
-                gate_activation=gate_activation,
-                activation=activation,
-                forget_bias=forget_bias,
-                dtype=dtype))
-    if bidirectional:
-        bw_unit_list = []
-
-        for i in range(num_layers):
-            new_name = name + "_reverse_layers_" + str(i)
-            bw_unit_list.append(
-                BasicLSTMUnit(
-                    new_name,
-                    hidden_size,
-                    param_attr=param_attr,
-                    bias_attr=bias_attr,
-                    gate_activation=gate_activation,
-                    activation=activation,
-                    forget_bias=forget_bias,
-                    dtype=dtype))
-
-    if batch_first:
-        input = layers.transpose(input, [1, 0, 2])
-
-    mask = None
-    if sequence_length:
-        max_seq_len = layers.shape(input)[0]
-        mask = layers.sequence_mask(
-            sequence_length, maxlen=max_seq_len, dtype='float32')
-
-        mask = layers.transpose(mask, [1, 0])
-
-    direc_num = 1
-    if bidirectional:
-        direc_num = 2
-        # convert to [num_layers, 2, batch_size, hidden_size]
-    if init_hidden:
-        init_hidden = layers.reshape(
-            init_hidden, shape=[num_layers, direc_num, -1, hidden_size])
-        init_cell = layers.reshape(
-            init_cell, shape=[num_layers, direc_num, -1, hidden_size])
-
-    # forward direction
-    def get_single_direction_output(rnn_input,
-                                    unit_list,
-                                    mask=None,
-                                    direc_index=0):
-        rnn = StaticRNN()
-        with rnn.step():
-            step_input = rnn.step_input(rnn_input)
-
-            if mask:
-                step_mask = rnn.step_input(mask)
-
-            for i in range(num_layers):
-                if init_hidden:
-                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
-                    pre_cell = rnn.memory(init=init_cell[i, direc_index])
-                else:
-                    pre_hidden = rnn.memory(
-                        batch_ref=rnn_input, shape=[-1, hidden_size])
-                    pre_cell = rnn.memory(
-                        batch_ref=rnn_input, shape=[-1, hidden_size])
-
-                new_hidden, new_cell = unit_list[i](step_input, pre_hidden,
-                                                    pre_cell)
-
-                if mask:
-                    new_hidden = layers.elementwise_mul(
-                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
-                            pre_hidden, (step_mask - 1), axis=0)
-                    new_cell = layers.elementwise_mul(
-                        new_cell, step_mask, axis=0) - layers.elementwise_mul(
-                            pre_cell, (step_mask - 1), axis=0)
-
-                rnn.update_memory(pre_hidden, new_hidden)
-                rnn.update_memory(pre_cell, new_cell)
-
-                rnn.step_output(new_hidden)
-                rnn.step_output(new_cell)
-
-                step_input = new_hidden
-                if dropout_prob != None and dropout_prob > 0.0:
-                    step_input = layers.dropout(
-                        step_input,
-                        dropout_prob=dropout_prob,
-                        dropout_implementation='upscale_in_train')
-
-            rnn.step_output(step_input)
-
-        rnn_out = rnn()
-
-        last_hidden_array = []
-        last_cell_array = []
-        rnn_output = rnn_out[-1]
-        for i in range(num_layers):
-            last_hidden = rnn_out[i * 2]
-            last_hidden = last_hidden[-1]
-            last_hidden_array.append(last_hidden)
-            last_cell = rnn_out[i * 2 + 1]
-            last_cell = last_cell[-1]
-            last_cell_array.append(last_cell)
-
-        last_hidden_output = layers.concat(last_hidden_array, axis=0)
-        last_hidden_output = layers.reshape(
-            last_hidden_output, shape=[num_layers, -1, hidden_size])
-        last_cell_output = layers.concat(last_cell_array, axis=0)
-        last_cell_output = layers.reshape(
-            last_cell_output, shape=[num_layers, -1, hidden_size])
-
-        return rnn_output, last_hidden_output, last_cell_output
-        # seq_len, batch_size, hidden_size
-
-    fw_rnn_out, fw_last_hidden, fw_last_cell = get_single_direction_output(
-        input, fw_unit_list, mask, direc_index=0)
-
-    if bidirectional:
-        bw_input = layers.reverse(input, axis=[0])
-        bw_mask = None
-        if mask:
-            bw_mask = layers.reverse(mask, axis=[0])
-        bw_rnn_out, bw_last_hidden, bw_last_cell = get_single_direction_output(
-            bw_input, bw_unit_list, bw_mask, direc_index=1)
-
-        bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])
-
-        rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2)
-        last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1)
-        last_hidden = layers.reshape(
-            last_hidden, shape=[num_layers * direc_num, -1, hidden_size])
-
-        last_cell = layers.concat([fw_last_cell, bw_last_cell], axis=1)
-        last_cell = layers.reshape(
-            last_cell, shape=[num_layers * direc_num, -1, hidden_size])
-
-        if batch_first:
-            rnn_out = layers.transpose(rnn_out, [1, 0, 2])
-        return rnn_out, last_hidden, last_cell
-    else:
-
-        rnn_out = fw_rnn_out
-        last_hidden = fw_last_hidden
-        last_cell = fw_last_cell
-
-        if batch_first:
-            rnn_out = layers.transpose(rnn_out, [1, 0, 2])
-
-        return rnn_out, last_hidden, last_cell
-
-
-class BasicLSTMUnit(Layer):
-    """
-    ****
-    BasicLSTMUnit class, Using basic operator to build LSTM
-    The algorithm can be described as the code below.
-
-        .. math::
-
-           i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
-
-           f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
-
-           o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
-
-           \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
-
-           c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-
-           h_t &= o_t \odot tanh(c_t)
-
-        - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
-          of weights from the input gate to the input)
-        - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
-        - sigmoid is the logistic sigmoid function.
-        - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-          and cell activation vectors, respectively, all of which have the same size as
-          the cell output activation vector $h$.
-        - The :math:`\odot` is the element-wise product of the vectors.
-        - :math:`tanh` is the activation functions.
-        - :math:`\\tilde{c_t}` is also called candidate hidden state,
-          which is computed based on the current input and the previous hidden state.
-
-    Args:
-        name_scope(string) : The name scope used to identify parameter and bias name
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of LSTM unit.
-            If it is set to None or one attribute of ParamAttr, lstm_unit will 
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized as zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cells (actNode).
-                             Default: 'fluid.layers.tanh'
-        forget_bias(float|1.0): forget bias used when computing forget gate
-        dtype(string): data type used in this unit
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid.layers as layers
-            from paddle.fluid.contrib.layers import BasicLSTMUnit
-
-            input_size = 128
-            hidden_size = 256
-            input = layers.data( name = "input", shape = [-1, input_size], dtype='float32')
-            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
-            pre_cell = layers.data( name = "pre_cell", shape=[-1, hidden_size], dtype='float32')
-
-            lstm_unit = BasicLSTMUnit( "gru_unit", hidden_size)
-
-            new_hidden, new_cell = lstm_unit( input, pre_hidden, pre_cell )
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 hidden_size,
-                 param_attr=None,
-                 bias_attr=None,
-                 gate_activation=None,
-                 activation=None,
-                 forget_bias=1.0,
-                 dtype='float32'):
-        super(BasicLSTMUnit, self).__init__(name_scope, dtype)
-
-        self._name = name_scope
-        self._hiden_size = hidden_size
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._gate_activation = gate_activation or layers.sigmoid
-        self._activation = activation or layers.tanh
-        self._forget_bias = layers.fill_constant(
-            [1], dtype=dtype, value=forget_bias)
-        self._forget_bias.stop_gradient = False
-        self._dtype = dtype
-
-    def _build_once(self, input, pre_hidden, pre_cell):
-        self._input_size = input.shape[-1]
-        assert (self._input_size > 0)
-
-        self._weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._input_size + self._hiden_size, 4 * self._hiden_size],
-            dtype=self._dtype)
-
-        self._bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[4 * self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input, pre_hidden, pre_cell):
-        concat_input_hidden = layers.concat([input, pre_hidden], 1)
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
-
-        gate_input = layers.elementwise_add(gate_input, self._bias)
-        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-        new_cell = layers.elementwise_add(
-            layers.elementwise_mul(
-                pre_cell,
-                layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
-            layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
-        new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)
-
-        return new_hidden, new_cell
diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py
deleted file mode 100644
index 1f7ec69dd7544a2835b9e336491c9d0fa2c76925..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/memory_usage_calc.py
+++ /dev/null
@@ -1,120 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module privides a memory usage calculate function for user.
-The purpose of this API is to allow users to estimate memory usage of
-a program under a special batch size, then user can set appropriate
-batch size to fully utilize a GPU.
-
-This API is still under active development and may change drastically.
-"""
-
-from __future__ import print_function
-
-import six
-
-from .. import core
-from ..framework import Program, Variable
-
-__all__ = ['memory_usage']
-
-dtype_to_size = {
-    core.VarDesc.VarType.FP16: 2,
-    core.VarDesc.VarType.FP32: 4,
-    core.VarDesc.VarType.FP64: 8,
-    core.VarDesc.VarType.INT16: 2,
-    core.VarDesc.VarType.INT32: 4,
-    core.VarDesc.VarType.INT64: 8,
-    core.VarDesc.VarType.BOOL: 1,
-    core.VarDesc.VarType.UINT8: 1,
-}
-
-DEBUG = False
-
-
-def memory_usage(program, batch_size):
-    """
-    Get the estimate memory usage of program with input batch size.
-
-    Args:
-        program(Program): The current Program.
-        batch_size(int): The current input data batch_size.
-
-    Returns:
-        min_total_memory(float): the estimate memory usage lower bound.
-        max_total_memory(float): the estimate memory usage upper bound.
-        unit_str(string): the unit of estimate usage result.
-
-    Examples:
-
-        >>> import paddle.fluid as fluid
-        >>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
-                fluid.default_main_program(), batch_size=10)
-        >>> print "memory usage is about %.3f - %.3f %s" % \
-                (lower_usage, upper_usage, unit)
-
-    """
-
-    # Parameters check
-    if not isinstance(program, Program):
-        raise TypeError(
-            "Calculating Memory Usage requires Program as its Parameter."
-            "But you passed in %s" % (type(program)))
-    if batch_size <= 0:
-        raise ValueError("The batch size need to be positive.")
-
-    # Get the var_name list of first block and calculate
-    total_memory = 0.0
-    processed_var_names = set(["@EMPTY@"])
-    for op in program.global_block().ops:
-        for var_name in op.output_arg_names:
-            if var_name in processed_var_names:
-                continue
-            processed_var_names.add(var_name)
-            var = program.global_block().vars[var_name]
-            if var.desc.type() != core.VarDesc.VarType.LOD_TENSOR:
-                continue
-
-            data_count = 1
-            neg_dim_count = 0
-            for x in var.shape:
-                if x < 0:
-                    if neg_dim_count >= 1:
-                        raise ValueError("Var %s has more than one negtive dim."
-                                         % (var_name))
-                    neg_dim_count += 1
-                    data_count *= batch_size * (-x)
-                else:
-                    data_count *= x
-            var_memory = data_count * dtype_to_size[var.dtype]
-            if DEBUG:
-                print("%s memory usage: %d" % (var.name, var_memory))
-            total_memory += var_memory
-    if DEBUG:
-        print("total memory usage: %.2f" % (total_memory))
-
-    # Convert appropriate unit
-    unit_str = "B"
-    if total_memory > 1024:
-        total_memory /= 1024
-        unit_str = "KB"
-        if total_memory > 1024:
-            total_memory /= 1024
-            unit_str = "MB"
-
-    # Append extra memory consumption (5% - 10%)
-    min_total_memory = total_memory * 1.05
-    max_total_memory = total_memory * 1.1
-
-    return min_total_memory, max_total_memory, unit_str
diff --git a/python/paddle/fluid/contrib/mixed_precision/__init__.py b/python/paddle/fluid/contrib/mixed_precision/__init__.py
deleted file mode 100644
index c6296bcac93015c5f6c55861575a45a3a33b3628..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/mixed_precision/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from . import decorator
-from .decorator import *
-from .fp16_lists import AutoMixedPrecisionLists
-
-__all__ = decorator.__all__
-__all__ += fp16_lists.__all__
diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
deleted file mode 100644
index 4ca4a8972c7e9d88693b1bed005fd59b9345ad4e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ... import default_main_program
-from ... import default_startup_program
-from ... import layers
-from ... import unique_name
-from . import fp16_utils
-from .fp16_utils import update_loss_scaling, rewrite_program
-from .fp16_utils import update_role_var_grad
-from .fp16_lists import AutoMixedPrecisionLists
-
-__all__ = ["decorate"]
-
-
-class OptimizerWithMixedPrecison(object):
-    """
-    Optimizer with mixed-precision (MP) training. This is a wrapper of a common 
-    optimizer, plus the support of mixed-precision pretraining. The object
-    of this class almost has the same behavior as the common optimizer, with the 
-    methods `minimize()`, `backward()`, `apply_gradients()` implemented. 
-    Additionally, it enables the MP training automatically, i.e, the creation 
-    and maintenance of master parameters, scaling of loss, etc.
-
-    Args:
-        optimizer (Optimizer): A common Optimizer object.
-        amp_lists (AutoMixedPrecisionLists): An AutoMixedPrecisionLists object.
-        init_loss_scaling (float): The initial loss scaling factor.
-        use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
-        incr_every_n_steps(int): Increases loss scaling every n consecutive 
-                                 steps with finite gradients.
-        decr_every_n_nan_or_inf(int): Decreases loss scaling every n 
-                                      accumulated steps with nan or 
-                                      inf gradients.
-        incr_ratio(float): The multiplier to use when increasing the loss 
-                           scaling.
-        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
-                           the loss scaling.
-
-    """
-
-    def __init__(self, optimizer, amp_lists, init_loss_scaling,
-                 use_dynamic_loss_scaling, incr_every_n_steps,
-                 decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
-        self._optimizer = optimizer
-        self._amp_lists = amp_lists
-        self._param_grads = None
-        self._train_program = default_main_program()
-        self._startup_prog = default_startup_program()
-        self._loss_scaling = layers.create_global_var(
-            name=unique_name.generate("loss_scaling"),
-            shape=[1],
-            value=init_loss_scaling,
-            dtype='float32',
-            persistable=True)
-        self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
-        if self._use_dynamic_loss_scaling:
-            self._incr_every_n_steps = layers.fill_constant(
-                shape=[1], dtype='int32', value=incr_every_n_steps)
-            self._decr_every_n_nan_or_inf = layers.fill_constant(
-                shape=[1], dtype='int32', value=decr_every_n_nan_or_inf)
-            self._incr_ratio = incr_ratio
-            self._decr_ratio = decr_ratio
-            self._num_good_steps = layers.create_global_var(
-                name=unique_name.generate("num_good_steps"),
-                shape=[1],
-                value=0,
-                dtype='int32',
-                persistable=True)
-            self._num_bad_steps = layers.create_global_var(
-                name=unique_name.generate("num_bad_steps"),
-                shape=[1],
-                value=0,
-                dtype='int32',
-                persistable=True)
-
-        # Ensure the data type of learning rate vars is float32 (same as the 
-        # master parameter dtype)
-        if isinstance(optimizer._learning_rate, float):
-            optimizer._learning_rate_map[default_main_program()] = \
-                        layers.create_global_var(
-                        name=unique_name.generate("learning_rate"),
-                        shape=[1],
-                        value=float(optimizer._learning_rate),
-                        dtype='float32',
-                        persistable=True)
-
-    def get_loss_scaling(self):
-        """Return the real-time loss scaling factor.
-        """
-        return self._loss_scaling
-
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None):
-        """
-        Backward propogation or auto differentiation for gradients' computation.
-
-        Args:
-            loss (Variable): The loss Variable to minimize.
-            startup_program (Program|None): The startup Program for initializing 
-                                       parameters in `parameter_list`.
-            parameter_list (list|None): A list of Variables to update.
-            no_grad_set (set|None): A set of Variables should be ignored.
-            callbacks (list|None): A list of callables to run when appending 
-                                   backward operator for one parameter.
-
-        Returns:
-            A list of (param, grad), which is a tuple of a parameter and its 
-            gradient respectively, and the scaled loss.
-        """
-        rewrite_program(self._train_program, self._amp_lists)
-        scaled_loss = loss * self._loss_scaling
-        self._params_grads = self._optimizer.backward(
-            scaled_loss, startup_program, parameter_list, no_grad_set,
-            callbacks)
-        update_role_var_grad(self._train_program, self._params_grads)
-        scaled_params_grads = []
-        for p, g in self._params_grads:
-            with self._train_program._optimized_guard([p, g]):
-                scaled_g = g / self._loss_scaling
-                scaled_params_grads.append([p, scaled_g])
-
-        return scaled_params_grads, scaled_loss
-
-    def apply_gradients(self, scaled_params_grads):
-        """
-        Check scaled gradients to determine whether to update loss scaling and update 
-        parameters by their scaled gradients, 
-  
-        Args:
-            scaled_params_grads (list): A list of params and scaled grads.
-    
-        Returns:
-            A list of optimize operators.
-        """
-
-        if self._use_dynamic_loss_scaling:
-
-            grads = [layers.reduce_sum(g) for [_, g] in scaled_params_grads]
-            all_grads = layers.concat(grads)
-            all_grads_sum = layers.reduce_sum(all_grads)
-            is_overall_finite = layers.isfinite(all_grads_sum)
-
-            update_loss_scaling(is_overall_finite, self._loss_scaling,
-                                self._num_good_steps, self._num_bad_steps,
-                                self._incr_every_n_steps,
-                                self._decr_every_n_nan_or_inf, self._incr_ratio,
-                                self._decr_ratio)
-
-            # apply_gradient append all ops in global block, thus we shouldn't
-            # apply gradient in the switch branch.
-            with layers.Switch() as switch:
-                with switch.case(is_overall_finite):
-                    pass
-                with switch.default():
-                    for _, g in scaled_params_grads:
-                        layers.assign(layers.zeros_like(g), g)
-
-        optimize_ops = self._optimizer.apply_gradients(scaled_params_grads)
-
-        return optimize_ops
-
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        """
-        Perform optimization by minimizing the given loss.
-
-        Args:
-            loss (Variable): The loss Variable.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-
-        Returns:
-            The scaled loss by scaling factor, the list of optimize ops, and a
-            list of scaled parameters and gradients.
-        """
-        scaled_params_grads, scaled_loss = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
-
-        optimize_ops = self.apply_gradients(scaled_params_grads)
-
-        return optimize_ops, scaled_params_grads
-
-
-def decorate(optimizer,
-             amp_lists=None,
-             init_loss_scaling=1.0,
-             incr_every_n_steps=1000,
-             decr_every_n_nan_or_inf=2,
-             incr_ratio=2.0,
-             decr_ratio=0.8,
-             use_dynamic_loss_scaling=True):
-    """ 
-    Decorate the given optimizer to adapt to the mixed-precision training.
-
-    Args:
-        optimizer(Optimizer): A common Optimizer.
-        amp_lists (AutoMixedPrecisionLists): An AutoMixedPrecisionLists object.
-        init_loss_scaling(float): The initial loss scaling factor.
-        incr_every_n_steps(int): Increases loss scaling every n consecutive 
-                                 steps with finite gradients.
-        decr_every_n_nan_or_inf(int): Decreases loss scaling every n 
-                                      accumulated steps with nan or 
-                                      inf gradients.
-        incr_ratio(float): The multiplier to use when increasing the loss 
-                           scaling.
-        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
-                           the loss scaling.
-        use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling.
-
-    Returns:
-        An optimizer acting like a normal one but with mixed-precision training 
-        enabled.
-
-    Examples:
-	.. code-block:: python
-
-	    loss = network()
-            optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-	
-            mp_optimizer = fluid.contrib.mixed_precision.decorate(
-	              optimizer=optimizer, init_loss_scaling=8.0)
-	
-            ops, param_grads = mp_optimizer.minimize(loss)
-            scaled_loss = mp_optimizer.get_loss_scaling()
-    """
-    if amp_lists is None:
-        amp_lists = AutoMixedPrecisionLists()
-    mp_optimizer = OptimizerWithMixedPrecison(
-        optimizer, amp_lists, init_loss_scaling, use_dynamic_loss_scaling,
-        incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
-
-    return mp_optimizer
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
deleted file mode 100644
index 75f90cabfff434225e1b111746a28251d57b5b92..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ /dev/null
@@ -1,280 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-
-__all__ = ["AutoMixedPrecisionLists"]
-
-
-class AutoMixedPrecisionLists(object):
-    """
-    AutoMixedPrecisionLists is a class for black/white list. It can update
-    pre-defined black list and white list according to users' custom black
-    white lists. The lists are used for an algorithm which determines op's
-    exectuion mode (fp32 or fp16).
-
-    Args:
-        custom_white_list (set): Users' custom white list.
-        custom_black_list (set): Users' custom black list.
-    """
-
-    def __init__(self, custom_white_list=None, custom_black_list=None):
-        self._custom_white_list = custom_white_list
-        self._custom_black_list = custom_black_list
-        self.white_list = copy.copy(white_list)
-        self.black_list = copy.copy(black_list)
-        self.gray_list = copy.copy(gray_list)
-        self._update_list()
-
-    def _update_list(self):
-        """
-        Update black and white list according to users' custom list.
-        """
-        if self._custom_white_list and self._custom_black_list:
-            for op_name in self._custom_white_list:
-                if op_name in self._custom_black_list:
-                    raise ValueError("Custom white list overlap "
-                                     "custom black list")
-        if self._custom_white_list:
-            for op_name in self._custom_white_list:
-                if op_name in self.black_list:
-                    self.black_list.remove(op_name)
-                elif op_name in self.gray_list:
-                    self.gray_list.remove(op_name)
-                self.white_list.add(op_name)
-        if self._custom_black_list:
-            for op_name in self._custom_black_list:
-                if op_name in self.white_list:
-                    self.white_list.remove(op_name)
-                elif op_name in self.gray_list:
-                    self.gray_list.remove(op_name)
-                self.black_list.add(op_name)
-
-
-# The three sets listed below are changed dynamiclly. They don't contain all  
-# paddle ops currently.
-
-# The set of ops that support fp16 calculation and are considered numerically-
-# safe and performance-critical. These ops are always converted to fp16.
-white_list = {
-    'conv2d',
-    'matmul',
-    'mul',
-}
-
-# The set of ops that support fp16 calculation and are considered numerically-
-# dangerous and whose effects may also be observed in downstream ops.
-black_list = {
-    'exp',
-    'square',
-    'log',
-    'mean',
-    'sum',
-    'cos_sim',
-    'softmax',
-    'softmax_with_cross_entropy',
-    'sigmoid_cross_entropy_with_logits',
-    'cross_entropy',
-    'cross_entropy2',
-}
-
-# This set contains two types of ops. All ops supported fp16 calculation. One 
-# of two types is considered numerically-safe, but may be made unsafe by an
-# updtream blacklist op. Another type do not have numerically-significant 
-# effects, like stack, flatten2.
-gray_list = {
-    'elementwise_add',
-    'elementwise_sub',
-    'elementwise_mul',
-    'elementwise_div',
-    'elementwise_max',
-    'elementwise_min',
-    'elementwise_pow',
-    'elementwise_mod',
-    'elementwise_floordiv',
-    'batch_norm',
-    'tanh',
-    'sigmoid',
-    'lookup_table',
-    'top_k',
-    'pool2d',
-    'pool3d',
-    'dropout',
-    'relu',
-    'relu6',
-    'leaky_relu',
-    'soft_relu',
-    'flatten2',
-    'stack',
-    'unstack',
-    'uniform_random_batch_size_like',
-    'gaussian_random',
-    'gaussian_random_batch_size_like',
-    'slice',
-    'rank',
-    'scale',
-    'transpose2',
-    'reshape2',
-    'gather',
-    'fill_constant',
-    'get_tensor_from_selected_rows',
-    'sign',
-    'cast',
-}
-'''
-# The set of ops that don't support fp16 calculation
-unsupported_fp16_list = {
-		# from python/paddle/fluid/layers/io.py
-    'send',
-    'send_barrier',
-    'recv',
-    'fetch_barrier',
-    'create_py_reader',
-    'create_double_buffer_reader',
-    'read',
-    'load',
-    
-   	# from python/paddle/fluid/control_flow.py
-    'increment',
-    'less_than',
-    'less_equal',
-    'greater_than',
-    'greater_equal',
-    'equal',
-    'not_equal',
-    'read_from_array',
-    'shrink_rnn_memory',
-    'lod_array_length',
-    'logical_and',
-    'logical_or',
-    'logical_xor',
-    'logical_not',
-    'print',
-    'conditional_block',
-    'while',
-    'ifelse',
-    'is_empty',
-
-    'lstm',
-    'cudnn_lstm',
-    'lstmp',
-    'gru',
-    'gru_unit',
-    'linear_chain_crf',
-    'crf_decoding',
-    'bpr_loss',
-    'chunk_eval',
-    'sequence_conv',
-    'sequence_softmax',
-    # Depthwise conv2d isn't fast and safe currently.
-    # ref: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h#L79
-    'depthwise_conv2d',
-    # Tensor Core kernels are not available for 3D convolutions currently.
-    'conv3d',
-    'sequence_pool',
-    'sequence_concat',
-    'sequence_slice',
-    'data_norm',
-    'layer_norm',
-    'group_norm',
-    'spectral_norm',
-    'depthwise_conv2d_transpose',
-    'sequence_expand',
-    'conv_transposed2d',
-    'conv_transposed3d',
-    'sequence_expand_as',
-    'sequence_pad',
-    'sequence_unpad',
-    'sequence_erase',
-    'beam_search',
-    'beam_search_decode',
-    'lstm_unit',
-    'reduce_sum',
-    'reduce_mean',
-    'reduce_max',
-    'reduce_min',
-    'reduce_prod',
-    'reduce_all',
-    'reduce_any',
-    'split',
-    'edit_distance',
-    'ctc_align',
-    'warpctc',
-    'sequence_reshape',
-    'nce',
-    'hierarchical_sigmoid',
-    'im2sequence',
-    'row_conv',
-    'multiplex',
-    'sample_logits',
-    'one_hot',
-    'smooth_l1_loss',
-    'squeeze2',
-    'unsqueeze2',
-    'lod_reset',
-    'lrn',
-    'pad',
-    'pad_constant_like',
-    'label_smooth',
-    'scatter',
-    'sequence_scatter',
-    'random_crop',
-    'mean_iou',
-    'selu',
-    'crop',
-    'affine_grid',
-    'rank_loss',
-    'margin_rank_loss',
-    'pad2d',
-    'elu',
-    'pow',
-    'stanh',
-    'hard_sigmoid',
-    'swish',
-    'prelu',
-    'brelu',
-    'sequence_enumerate',
-    'sequence_mask',
-    'expand',
-    'sampling_id',
-    'maxout',
-    'space_to_depth',
-    'sequence_reverse',
-    'similarity_focus',
-    'hash',
-    'grid_sampler',
-    'log_loss',
-    'teacher_student_sigmoid_loss',
-    'add_position_encoding',
-    'bilinear_tensor_product',
-    'shuffle_channel',
-    'temporal_shift',
-    'psroi_pool',
-    'huber_loss',
-    'kldiv_loss',
-    'tree_conv',
-    'pixel_shuffle',
-    'fsp',
-    'cvm',
-
-    'affine_channel',
-    'roi_pool',
-    'roi_align',
-    'anchor_generator',
-    'generate_proposals',
-    'generate_proposal_labels',
-    'generate_mask_labels',
-		
-}
-'''
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
deleted file mode 100644
index 05dfe27303505903533d6404de0e6ffe51a661ad..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ /dev/null
@@ -1,350 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from ... import core
-from ... import layers
-from ... import framework
-
-
-def append_cast_op(i, o, prog):
-    """
-    Append a cast op in a given Program to cast input `i` to data type `o.dtype`.
-
-    Args:
-        i (Variable): The input Variable.
-        o (Variable): The output Variable.
-        prog (Program): The Program to append cast op.
-    """
-    prog.global_block().append_op(
-        type="cast",
-        inputs={"X": i},
-        outputs={"Out": o},
-        attrs={"in_dtype": i.dtype,
-               "out_dtype": o.dtype})
-
-
-def _rename_arg(op, old_name, new_name):
-    """
-    If an op has old_name input and output, rename these input 
-    args new_name.
-
-    Args:
-        op (Operator): Current operator.
-        old_name (str): The old name of input args.
-        new_name (str): The new name of input args.
-    """
-    op_desc = op.desc
-    if isinstance(op_desc, tuple):
-        op_desc = op_desc[0]
-    op_desc._rename_input(old_name, new_name)
-    op_desc._rename_output(old_name, new_name)
-
-
-def _dtype_to_str(dtype):
-    """
-    Convert specific variable type to its corresponding string.
-
-    Args:
-        dtype (VarType): Variable type.
-    """
-    if dtype == core.VarDesc.VarType.FP16:
-        return 'fp16'
-    else:
-        return 'fp32'
-
-
-def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
-    """
-    Insert cast op and rename args of input and output.
-
-    Args:
-        block (Program): The block in which the operator is.
-        op (Operator): The operator to insert cast op.
-        idx (int): The index of current operator.
-        src_dtype (VarType): The input variable dtype of cast op.
-        desr_dtype (VarType): The output variable dtype of cast op.
-
-    Returns:
-        num_cast_op (int): The number of cast ops that have been inserted.
-    """
-    num_cast_ops = 0
-    valid_types = [
-        core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.SELECTED_ROWS,
-        core.VarDesc.VarType.LOD_TENSOR_ARRAY
-    ]
-    for in_name in op.input_names:
-        if src_dtype == core.VarDesc.VarType.FP32 and op.type == 'batch_norm':
-            if in_name != 'X':
-                continue
-        for in_var_name in op.input(in_name):
-            in_var = block.var(in_var_name)
-            if in_var.type not in valid_types:
-                continue
-            if in_var.dtype == src_dtype:
-                out_var = block.create_var(
-                    name=in_var.name + \
-                            '.cast_' + _dtype_to_str(dest_dtype),
-                    dtype=dest_dtype,
-                    persistable=False,
-                    stop_gradient=False)
-                block._insert_op(
-                    idx,
-                    type="cast",
-                    inputs={"X": in_var},
-                    outputs={"Out": out_var},
-                    attrs={
-                        "in_dtype": in_var.dtype,
-                        "out_dtype": out_var.dtype
-                    })
-                num_cast_ops += 1
-                _rename_arg(op, in_var.name, out_var.name)
-            else:
-                if op.has_attr('in_dtype'):
-                    op._set_attr('in_dtype', dest_dtype)
-    if src_dtype == core.VarDesc.VarType.FP32:
-        for out_name in op.output_names:
-            if op.type == 'batch_norm' and out_name != 'Y':
-                continue
-            for out_var_name in op.output(out_name):
-                out_var = block.var(out_var_name)
-                if out_var.type not in valid_types:
-                    continue
-                if out_var.dtype == core.VarDesc.VarType.FP32:
-                    out_var.desc.set_dtype(core.VarDesc.VarType.FP16)
-                    if op.has_attr('out_dtype'):
-                        op._set_attr('out_dtype', core.VarDesc.VarType.FP16)
-    return num_cast_ops
-
-
-def find_true_prev_op(ops, cur_op, var_name):
-    """
-    Find the true prev op that outputs var_name variable.
-
-    Args:
-        ops (list): A list of ops.
-        cur_op (Operator): Current operator which has var_name variable.
-        var_name (string): Variable name.
-    """
-    prev_op = []
-    for op in ops:
-        if op == cur_op:
-            break
-        for out_name in op.output_names:
-            for out_var_name in op.output(out_name):
-                if out_var_name == var_name:
-                    prev_op.append(op)
-    if prev_op:
-        if not len(prev_op) == 1:
-            raise ValueError("There must be only one previous op "
-                             "that outputs {0} variable".format(var_name))
-        else:
-            return prev_op[0]
-    return None
-
-
-def rewrite_program(main_prog, amp_lists):
-    """
-    Traverse all ops in current block and insert cast op according to 
-    which set current op belongs to.
-
-    1. When an op belongs to the black list, add it to black set
-    2. When an op belongs to the white list, add it to white set
-    3. When an op belongs to the gray list. If one 
-       of its inputs is the output of black set op or black list op, 
-       add it to black set. If all of its previous ops are not black 
-       op and one of its inputs is the output of white set op or 
-       white list op, add it to white set.
-    4. When an op isn't in the lists, add it to black op set.
-    5. Add necessary cast ops to make sure that black set op will be 
-       computed in fp32 mode, while white set op will be computed in 
-       fp16 mode.
-
-    Args:
-        main_prog (Program): The main program for training.
-    """
-    block = main_prog.global_block()
-    ops = block.ops
-    white_op_set = set()
-    black_op_set = set()
-    for op in ops:
-        if op.type in amp_lists.black_list:
-            black_op_set.add(op)
-        elif op.type in amp_lists.white_list:
-            white_op_set.add(op)
-        elif op.type in amp_lists.gray_list:
-            is_black_op = False
-            is_white_op = False
-            for in_name in op.input_names:
-                # if this op has inputs
-                if in_name:
-                    for in_var_name in op.input(in_name):
-                        in_var = block.var(in_var_name)
-                        # this in_var isn't the output of other op
-                        if in_var.op is None:
-                            continue
-                        elif in_var.op is op:
-                            prev_op = find_true_prev_op(ops, op, in_var_name)
-                            if prev_op is None:
-                                continue
-                        else:
-                            prev_op = in_var.op
-                        # if it's one of inputs
-                        if prev_op in black_op_set or \
-                                prev_op.type in amp_lists.black_list:
-                            is_black_op = True
-                        elif prev_op in white_op_set or \
-                                prev_op.type in amp_lists.white_list:
-                            is_white_op = True
-            if is_black_op:
-                black_op_set.add(op)
-            elif is_white_op:
-                white_op_set.add(op)
-            else:
-                pass
-        else:
-            # For numerical safe, we apply fp32 computation on ops that
-            # are not determined which list they should stay.
-            black_op_set.add(op)
-
-    idx = 0
-    while idx < len(ops):
-        op = ops[idx]
-        num_cast_ops = 0
-        if op in black_op_set:
-            num_cast_ops = _insert_cast_op(block, op, idx,
-                                           core.VarDesc.VarType.FP16,
-                                           core.VarDesc.VarType.FP32)
-        elif op in white_op_set:
-            num_cast_ops = _insert_cast_op(block, op, idx,
-                                           core.VarDesc.VarType.FP32,
-                                           core.VarDesc.VarType.FP16)
-        else:
-            pass
-
-        idx += num_cast_ops + 1
-
-
-def update_role_var_grad(main_prog, params_grads):
-    """
-    Update op_role_var attr for some ops to make sure the gradients
-    transfered across gpus is FP16.
-    1. Check whether the op that outputs gradient is cast or not.
-    2. If op is cast and gradient is FP32, remove the op_role_var
-       and find the prev op which outputs FP16 gradient
-    3. Update the op_role_var of the prev op.
-
-    Args:
-        main_prog (Program): The main program for training.
-        params_grads (list): A list of params and grads.
-    """
-    block = main_prog.global_block()
-    BACKWARD = core.op_proto_and_checker_maker.OpRole.Backward
-    OPTIMIZE = core.op_proto_and_checker_maker.OpRole.Optimize
-    for p, g in params_grads:
-        op = g.op
-        if g.dtype == core.VarDesc.VarType.FP32 and op.type == 'cast':
-            role = op.attr('op_role')
-            if role & int(BACKWARD) and op.has_attr('op_role_var'):
-                op.desc.remove_attr("op_role_var")
-            else:
-                raise ValueError("The cast op {0} must be in BACKWARD role "
-                                 "and have op_role_var attr.".format(op))
-
-            fp16_grad_name = op.input(op.input_names[0])[0]
-            op_for_fp16_grad = find_true_prev_op(block.ops, op, fp16_grad_name)
-            op_role_var_attr_name = \
-                core.op_proto_and_checker_maker.kOpRoleVarAttrName()
-            attr_val = [p.name, fp16_grad_name]
-            if op_for_fp16_grad.has_attr(op_role_var_attr_name):
-                attr_val.extend(op_for_fp16_grad.attr(op_role_var_attr_name))
-            op_for_fp16_grad._set_attr(op_role_var_attr_name, attr_val)
-
-            # maximize the allreduce overlap
-            op._set_attr('op_role', OPTIMIZE)
-
-
-def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
-                        num_bad_steps, incr_every_n_steps,
-                        decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
-    """
-    Update loss scaling according to overall gradients. If all gradients is 
-    finite after incr_every_n_steps, loss scaling will increase by incr_ratio. 
-    Otherwisw, loss scaling will decrease by decr_ratio after 
-    decr_every_n_nan_or_inf steps and each step some gradients are infinite.
-
-    Args:
-        is_overall_finite (Variable): A boolean variable indicates whether 
-                                     all gradients are finite.
-        prev_loss_scaling (Variable): Previous loss scaling.
-        num_good_steps (Variable): A variable accumulates good steps in which 
-                                   all gradients are finite.
-        num_bad_steps (Variable): A variable accumulates bad steps in which 
-                                  some gradients are infinite.
-        incr_every_n_steps (Variable): A variable represents increasing loss 
-                                       scaling every n consecutive steps with 
-                                       finite gradients.
-        decr_every_n_nan_or_inf (Variable): A variable represents decreasing 
-                                            loss scaling every n accumulated 
-                                            steps with nan or inf gradients.
-        incr_ratio(float): The multiplier to use when increasing the loss 
-                           scaling.
-        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
-                           loss scaling.
-    """
-    zero_steps = layers.fill_constant(shape=[1], dtype='int32', value=0)
-    with layers.Switch() as switch:
-        with switch.case(is_overall_finite):
-            should_incr_loss_scaling = layers.less_than(incr_every_n_steps,
-                                                        num_good_steps + 1)
-            with layers.Switch() as switch1:
-                with switch1.case(should_incr_loss_scaling):
-                    new_loss_scaling = prev_loss_scaling * incr_ratio
-                    loss_scaling_is_finite = layers.isfinite(new_loss_scaling)
-                    with layers.Switch() as switch2:
-                        with switch2.case(loss_scaling_is_finite):
-                            layers.assign(new_loss_scaling, prev_loss_scaling)
-                        with switch2.default():
-                            pass
-                    layers.assign(zero_steps, num_good_steps)
-                    layers.assign(zero_steps, num_bad_steps)
-
-                with switch1.default():
-                    layers.increment(num_good_steps)
-                    layers.assign(zero_steps, num_bad_steps)
-
-        with switch.default():
-            should_decr_loss_scaling = layers.less_than(decr_every_n_nan_or_inf,
-                                                        num_bad_steps + 1)
-            with layers.Switch() as switch3:
-                with switch3.case(should_decr_loss_scaling):
-                    new_loss_scaling = prev_loss_scaling * decr_ratio
-                    static_loss_scaling = \
-                        layers.fill_constant(shape=[1],
-                                             dtype='float32',
-                                             value=1.0)
-                    less_than_one = layers.less_than(new_loss_scaling,
-                                                     static_loss_scaling)
-                    with layers.Switch() as switch4:
-                        with switch4.case(less_than_one):
-                            layers.assign(static_loss_scaling,
-                                          prev_loss_scaling)
-                        with switch4.default():
-                            layers.assign(new_loss_scaling, prev_loss_scaling)
-                    layers.assign(zero_steps, num_good_steps)
-                    layers.assign(zero_steps, num_bad_steps)
-                with switch3.default():
-                    layers.assign(zero_steps, num_good_steps)
-                    layers.increment(num_bad_steps)
diff --git a/python/paddle/fluid/contrib/model_stat.py b/python/paddle/fluid/contrib/model_stat.py
deleted file mode 100644
index 0d974c8d9685840c79de17f297fcba00b01a6c35..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/model_stat.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-Example:
-    >>from paddle.fluid.contrib.model_stat import summary
-    >>main_program = ...
-    >>summary(main_program)
-    +-----+------------+----------------+----------------+---------+------------+
-    | No. |       TYPE |          INPUT |         OUTPUT |  PARAMs |      FLOPs |
-    +-----+------------+----------------+----------------+---------+------------+
-    |   0 |     conv2d |  (3, 200, 200) | (64, 100, 100) |    9408 |  188160000 |
-    |   1 | batch_norm | (64, 100, 100) | (64, 100, 100) |     256 |     640000 |
-    |   2 |       relu | (64, 100, 100) | (64, 100, 100) |       0 |     640000 |
-    |   3 |     pool2d | (64, 100, 100) |   (64, 50, 50) |       0 |    1440000 |
-    ...
-    | 176 |     conv2d |    (512, 7, 7) |    (512, 7, 7) | 2359296 |  231211008 |
-    | 177 |       relu |    (512, 7, 7) |    (512, 7, 7) |       0 |      25088 |
-    | 178 |     conv2d |    (512, 7, 7) |   (2048, 7, 7) | 1048576 |  102760448 |
-    | 179 |       relu |   (2048, 7, 7) |   (2048, 7, 7) |       0 |     100352 |
-    | 180 |     pool2d |   (2048, 7, 7) |   (2048, 1, 1) |       0 |     100352 |
-    +-----+------------+----------------+----------------+---------+------------+
-    Total PARAMs: 48017344(0.0480G)
-    Total FLOPs: 11692747751(11.69G)
-'''
-from collections import OrderedDict
-from prettytable import PrettyTable
-
-
-def summary(main_prog):
-    '''
-    It can summary model's PARAMS, FLOPs until now.
-    It support common operator like conv, fc, pool, relu, sigmoid, bn etc. 
-    Args:
-        main_prog: main program 
-    Returns:
-        print summary on terminal
-    '''
-    collected_ops_list = []
-    for one_b in main_prog.blocks:
-        block_vars = one_b.vars
-        for one_op in one_b.ops:
-            op_info = OrderedDict()
-            spf_res = _summary_model(block_vars, one_op)
-            if spf_res is None:
-                continue
-            # TODO: get the operator name
-            op_info['type'] = one_op.type
-            op_info['input_shape'] = spf_res[0][1:]
-            op_info['out_shape'] = spf_res[1][1:]
-            op_info['PARAMs'] = spf_res[2]
-            op_info['FLOPs'] = spf_res[3]
-            collected_ops_list.append(op_info)
-
-    summary_table, total = _format_summary(collected_ops_list)
-    _print_summary(summary_table, total)
-
-
-def _summary_model(block_vars, one_op):
-    '''
-    Compute operator's params and flops.
-    Args:
-        block_vars: all vars of one block
-        one_op: one operator to count
-    Returns:
-        in_data_shape: one operator's input data shape
-        out_data_shape: one operator's output data shape
-        params: one operator's PARAMs 
-        flops: : one operator's FLOPs
-    '''
-    if one_op.type in ['conv2d', 'depthwise_conv2d']:
-        k_arg_shape = block_vars[one_op.input("Filter")[0]].shape
-        in_data_shape = block_vars[one_op.input("Input")[0]].shape
-        out_data_shape = block_vars[one_op.output("Output")[0]].shape
-        c_out, c_in, k_h, k_w = k_arg_shape
-        _, c_out_, h_out, w_out = out_data_shape
-        assert c_out == c_out_, 'shape error!'
-        k_groups = one_op.attr("groups")
-        kernel_ops = k_h * k_w * (c_in / k_groups)
-        bias_ops = 0 if one_op.input("Bias") == [] else 1
-        params = c_out * (kernel_ops + bias_ops)
-        flops = h_out * w_out * c_out * (kernel_ops + bias_ops)
-        # base nvidia paper, include mul and add
-        flops = 2 * flops
-
-    elif one_op.type == 'pool2d':
-        in_data_shape = block_vars[one_op.input("X")[0]].shape
-        out_data_shape = block_vars[one_op.output("Out")[0]].shape
-        _, c_out, h_out, w_out = out_data_shape
-        k_size = one_op.attr("ksize")
-        params = 0
-        flops = h_out * w_out * c_out * (k_size[0] * k_size[1])
-
-    elif one_op.type == 'mul':
-        k_arg_shape = block_vars[one_op.input("Y")[0]].shape
-        in_data_shape = block_vars[one_op.input("X")[0]].shape
-        out_data_shape = block_vars[one_op.output("Out")[0]].shape
-        # TODO: fc has mul ops
-        # add attr to mul op, tell us whether it belongs to 'fc'
-        # this's not the best way
-        if 'fc' not in one_op.output("Out")[0]:
-            return None
-        k_in, k_out = k_arg_shape
-        # bias in sum op
-        params = k_in * k_out + 1
-        flops = k_in * k_out
-
-    elif one_op.type in ['sigmoid', 'tanh', 'relu', 'leaky_relu', 'prelu']:
-        in_data_shape = block_vars[one_op.input("X")[0]].shape
-        out_data_shape = block_vars[one_op.output("Out")[0]].shape
-        params = 0
-        if one_op.type == 'prelu':
-            params = 1
-        flops = 1
-        for one_dim in in_data_shape:
-            flops *= one_dim
-
-    elif one_op.type == 'batch_norm':
-        in_data_shape = block_vars[one_op.input("X")[0]].shape
-        out_data_shape = block_vars[one_op.output("Y")[0]].shape
-        _, c_in, h_out, w_out = in_data_shape
-        # gamma, beta
-        params = c_in * 2
-        # compute mean and std
-        flops = h_out * w_out * c_in * 2
-
-    else:
-        return None
-
-    return in_data_shape, out_data_shape, params, flops
-
-
-def _format_summary(collected_ops_list):
-    '''
-    Format summary report.
-    Args:
-        collected_ops_list: the collected operator with summary
-    Returns:
-        summary_table: summary report format
-        total: sum param and flops
-    '''
-    summary_table = PrettyTable(
-        ["No.", "TYPE", "INPUT", "OUTPUT", "PARAMs", "FLOPs"])
-    summary_table.align = 'r'
-
-    total = {}
-    total_params = []
-    total_flops = []
-    for i, one_op in enumerate(collected_ops_list):
-        # notice the order
-        table_row = [
-            i,
-            one_op['type'],
-            one_op['input_shape'],
-            one_op['out_shape'],
-            int(one_op['PARAMs']),
-            int(one_op['FLOPs']),
-        ]
-        summary_table.add_row(table_row)
-        total_params.append(int(one_op['PARAMs']))
-        total_flops.append(int(one_op['FLOPs']))
-
-    total['params'] = total_params
-    total['flops'] = total_flops
-
-    return summary_table, total
-
-
-def _print_summary(summary_table, total):
-    '''
-    Print all the summary on terminal.
-    Args:
-        summary_table: summary report format
-        total: sum param and flops
-    '''
-    parmas = total['params']
-    flops = total['flops']
-    print(summary_table)
-    print('Total PARAMs: {}({:.4f}M)'.format(
-        sum(parmas), sum(parmas) / (10**6)))
-    print('Total FLOPs: {}({:.2f}G)'.format(sum(flops), sum(flops) / 10**9))
-    print(
-        "Notice: \n now supported ops include [Conv, DepthwiseConv, FC(mul), BatchNorm, Pool, Activation(sigmoid, tanh, relu, leaky_relu, prelu)]"
-    )
diff --git a/python/paddle/fluid/contrib/op_frequence.py b/python/paddle/fluid/contrib/op_frequence.py
deleted file mode 100644
index 68dd0a946b4b69d47d51dce3de25ce147198f09a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/op_frequence.py
+++ /dev/null
@@ -1,104 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from collections import OrderedDict
-
-from ..framework import Program
-
-__all__ = ['op_freq_statistic']
-
-
-def op_freq_statistic(program):
-    """
-    Statistics of Op frequency.
-
-    Args:
-        program(Program): The current Program.
-
-    Returns:
-        uni_op_freq(dict): the single op frequency.
-        adj_2_op_freq(dict): the two adjacent ops frequency.
-
-    Examples:
-
-        >>> import paddle.fluid as fluid
-        >>> uni_op_freq, adj_2_op_freq = fluid.contrib.op_freq_statistic(
-        >>>        fluid.default_main_program())
-        >>> for op_type, op_num in uni_op_freq:
-        >>>     print("%s  \t  %d" % (op_type, op_num))
-        >>> for op_type, op_num in adj_2_op_freq:
-        >>>     print("%s  \t  %d" % (op_type, op_num))
-
-    """
-
-    if not isinstance(program, Program):
-        raise TypeError("The input type should be Porgram."
-                        "But you passed in %s" % (type(program)))
-
-    uni_op_freq = OrderedDict()
-    adj_2_op_freq = OrderedDict()
-    op_in_ops = OrderedDict()
-
-    parameters = [p.name for p in program.blocks[0].all_parameters()]
-
-    # get uni_op_freq
-    for op in program.global_block().ops:
-        had_recorded = False
-        for var_name in op.output_arg_names:
-            if var_name in parameters:
-                continue
-            if not had_recorded and uni_op_freq.has_key(op.type):
-                uni_op_freq[op.type] += 1
-                had_recorded = True
-            elif not had_recorded:
-                uni_op_freq[op.type] = 1
-                had_recorded = True
-
-    # get adj_2_op_freq
-    var_gen_op = {}
-    for op in program.global_block().ops:
-        for var_name in op.input_arg_names:
-            if var_name in parameters:
-                continue
-            if var_gen_op.has_key(var_name):
-                assert len(var_gen_op[var_name]) > 0
-                if op_in_ops.has_key(op.type):
-                    op_in_ops[op.type].append(var_gen_op[var_name][-1])
-                else:
-                    op_in_ops[op.type] = [var_gen_op[var_name][-1]]
-            else:
-                print("Var's generate op is not found,%s, %s" %
-                      (var_name, op.type))
-
-        for var_name in op.output_arg_names:
-            if var_gen_op.has_key(var_name):
-                var_gen_op[var_name].append(op.type)
-            else:
-                var_gen_op[var_name] = [op.type]
-
-    for op, in_ops in op_in_ops.iteritems():
-        for in_op in in_ops:
-            op_op = in_op + "->" + op
-            if adj_2_op_freq.has_key(op_op):
-                adj_2_op_freq[op_op] += 1
-            else:
-                adj_2_op_freq[op_op] = 1
-
-    uni_op_freq = sorted(
-        uni_op_freq.items(), key=lambda item: item[1], reverse=True)
-    adj_2_op_freq = sorted(
-        adj_2_op_freq.items(), key=lambda item: item[1], reverse=True)
-
-    return uni_op_freq, adj_2_op_freq
diff --git a/python/paddle/fluid/contrib/quantize/__init__.py b/python/paddle/fluid/contrib/quantize/__init__.py
deleted file mode 100644
index 14c208d0e7f35ebfbbe1c36d0b11a8d0f0efb4a6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/quantize/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import quantize_transpiler
-from .quantize_transpiler import *
-
-__all__ = quantize_transpiler.__all__
diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
deleted file mode 100644
index 471a796eb3e0a75a1fa0a9eb28499c9b168a3ee3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+++ /dev/null
@@ -1,561 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import numpy as np
-
-from paddle.fluid.framework import default_main_program, default_startup_program, program_guard
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid import unique_name
-from paddle.fluid import core
-from paddle.fluid.initializer import Constant
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.layers.nn import autoincreased_step_counter
-from paddle.fluid.framework import Variable
-from paddle.fluid.executor import global_scope
-
-__all__ = ['QuantizeTranspiler']
-
-_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul']
-
-
-def _quantized_var_name(var_name):
-    """
-    Return quantized variable name for the input `var_name`.
-    """
-    return "%s.quantized" % (var_name)
-
-
-def _dequantized_var_name(var_name):
-    """
-    Return dequantized variable name for the input `var_name`.
-    """
-    return "%s.dequantized" % (var_name)
-
-
-def _quantized_scale_name(var_name):
-    """
-    Return quantized variable name for the input `var_name`.
-    """
-    return "%s.scale" % (var_name)
-
-
-def _original_var_name(var_name):
-    """
-    Return the original variable name.
-    """
-    if var_name.endswith('.quantized.dequantized'):
-        return var_name[:-len('.quantized.dequantized')]
-    if var_name.endswith('.quantized'):
-        return var_name[:-len('.quantized')]
-    if var_name.endswith('.dequantized'):
-        return var_name[:-len('.dequantized')]
-    if var_name.endswith('.scale'):
-        return var_name[:-len('.scale')]
-    else:
-        return var_name
-
-
-def _is_float(v):
-    return isinstance(v, float) or isinstance(v, np.float32)
-
-
-def quant(x, scale, num_bits):
-    y = np.round(x / scale * ((1 << (num_bits - 1)) - 1))
-    return y
-
-
-class QuantizeTranspiler(object):
-    def __init__(self,
-                 weight_bits=8,
-                 activation_bits=8,
-                 activation_quantize_type='abs_max',
-                 weight_quantize_type='abs_max',
-                 window_size=10000,
-                 moving_rate=0.9):
-        """
-        Convert and rewrite the fluid Program according to weight and
-        activation quantization type.
-
-        Args:
-            weight_bits (int): quantization bit number for weights,
-                the bias is not quantized.
-            activation_bits (int): quantization bit number for activation.
-            activation_quantize_type (str): quantization type for activation,
-                now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode,
-                the quantization scale will be calculated dynamically each step
-                in both training and testing period. If use 'range_abs_max',
-                a static quantization scale will be calculated during training
-                and used in inference.
-            weight_quantize_type (str): quantization type for weights,
-                support 'abs_max'. The 'range_abs_max' usually is not used for
-                weight, since weights are fixed once the model is well trained.
-            window_size (int): the window size for 'range_abs_max' quantization.
-
-        Examples:
-
-        .. code-block:: python
-
-            # the original program will be rewrite, if you don't want to
-            # change it, please clone at first.
-            # quantize_program = program.clone()
-            t = fluid.QuantizeTranspiler()
-            t.transpile(quantize_program)
-
-        """
-        self.weight_bits = weight_bits
-        self.activation_bits = activation_bits
-        quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max']
-        if weight_quantize_type not in quant_type:
-            raise ValueError(
-                "Unknown weight_quantize_type: '%s'. It can only be ",
-                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
-                str(weight_quantize_type))
-        if activation_quantize_type not in quant_type:
-            raise ValueError(
-                "Unknown activation_quantize_type : '%s'. It can only be ",
-                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
-                str(activation_quantize_type))
-
-        self.weight_quantize_type = weight_quantize_type
-        self.activation_quantize_type = activation_quantize_type
-
-        self.window_size = window_size
-        self.moving_rate = moving_rate
-        self.helper = LayerHelper(self.__class__.__name__)
-        self.fake_quant_op_types = [
-            'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
-            'fake_quantize_moving_average_abs_max'
-        ]
-        self.fake_dequant_op_types = ['fake_dequantize_max_abs']
-        self.is_test = None
-        self.global_step = None
-
-    def training_transpile(self, program=None, startup_program=None):
-        """Rewrites a training input program in place for simulated
-        quantization. Insert fake quantization and de-quantization ops into
-        program to simulate the error introduced by quantization. And change
-        the graident ops' input by using the faked quantization weights and
-        activation. Since the program is transformed in place, the graph
-        connection will change.
-
-        Args:
-            program (Program): the input program to be transpile.
-        """
-        self.is_test = False
-        program = default_main_program() if program is None else program
-        startup_program = default_startup_program() if startup_program is \
-            None else startup_program
-
-        # marked the variable which has been quantized and dequantized.
-        dequanted_vars = [
-            collections.OrderedDict() for _ in range(len(program.blocks))
-        ]
-        grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES]
-
-        params = [p.name for p in program.global_block().iter_parameters()]
-
-        def _transpile_forward(block, op):
-            idx = block.ops.index(op)
-            block_id = block.idx
-            # insert quant op and dequant op
-            for name in op.input_arg_names:
-                #if share input between ops
-                if name in dequanted_vars[block_id]:
-                    dequant_var = dequanted_vars[block_id][name]
-                else:
-                    var = block.var(name)
-                    quant_bits = self.weight_bits if var.name in params \
-                                 else self.activation_bits
-                    quant_type = self.weight_quantize_type if var.name \
-                        in params else self.activation_quantize_type
-
-                    quant_var, scale_var = self._insert_quant_op(
-                        block, idx, var, quant_bits, quant_type)
-                    dequant_var = self._insert_dequant_op(
-                        block, idx + 1, quant_var, scale_var, quant_bits)
-                    dequanted_vars[block_id][name] = dequant_var
-                # rename the forward op inputs
-                op._rename_input(name, dequant_var.name)
-
-        def _transpile_backward(block, op):
-            block_id = block.idx
-            no_dequanted_input_vars = True
-            for name in op.input_arg_names:
-                if name in dequanted_vars[block_id]:
-                    dequant_var = dequanted_vars[block_id][name]
-                    op._rename_input(name, dequant_var.name)
-                    no_dequanted_input_vars = False
-            if no_dequanted_input_vars:
-                raise ValueError("There is no dequanted inputs for op %s." %
-                                 (op.type))
-
-        with program_guard(program, startup_program):
-            self._create_global_step()
-            for block in program.blocks:
-                ops = list(block.ops)
-                block_id = block.idx
-                for op in ops:
-                    # rewrite the forward ProgramDes
-                    if op.type in _QUANTIZABLE_OP_TYPES:
-                        _transpile_forward(block, op)
-                    # rename the backward op inputs
-                    if op.type in grad_op_types:
-                        _transpile_backward(block, op)
-
-    def _create_global_step(self):
-        if self.weight_quantize_type == 'range_abs_max' or \
-            self.activation_quantize_type == 'range_abs_max':
-            self.global_step = autoincreased_step_counter()
-
-    def freeze_program(self, program, place, scope=None):
-        """Freeze input training program for inference.
-
-        Args:
-            program (Program): the input program to be transpile.
-        """
-
-        self.is_test = True
-        scope = global_scope() if scope is None else scope
-        program = default_main_program() if program is None else program
-
-        persistable_vars = [
-            v.name
-            for v in filter(lambda var: var.persistable, program.list_vars())
-        ]
-        op_in_rename_map = [
-            collections.OrderedDict() for _ in range(len(program.blocks))
-        ]
-        op_out_rename_map = [
-            collections.OrderedDict() for _ in range(len(program.blocks))
-        ]
-        var_scale_map = [
-            collections.OrderedDict() for _ in range(len(program.blocks))
-        ]
-
-        def _remove_fake_quant_and_dequant_op(block, op):
-            idx = block.ops.index(op)
-            block_id = block.idx
-            k = op.output('Out')[0]
-            v = op.input('X')[0]
-            if v not in op_in_rename_map[block_id]:
-                op_in_rename_map[block_id][k] = v
-            else:
-                op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v]
-            block._remove_op(idx)
-
-        def _insert_post_dequant_op(block, op):
-            idx = block.ops.index(op)
-            block_id = block.idx
-            max_range = None
-            scale_var = None
-            for name in op.input_arg_names:
-                #rename input name of the op to the input name of last op which has be removed
-                if name in op_in_rename_map[block_id]:
-                    op._rename_input(name, op_in_rename_map[block_id][name])
-
-                scale_v = var_scale_map[block_id][_original_var_name(name)]
-                if _original_var_name(name) in persistable_vars:
-                    param_range = (1 << (self.weight_bits - 1)) - 1
-                    act_range = (1 << (self.activation_bits - 1)) - 1
-                    assert _is_float(scale_v)
-                    max_range = param_range * act_range / scale_v
-                else:
-                    assert isinstance(scale_v, Variable)
-                    scale_var = scale_v
-
-            if len(op.output_arg_names) != 1:
-                raise ValueError("Only support one output, but op %s has"
-                                 " more than one output." % (op.type))
-            out_var = block.var(op.output_arg_names[0])
-            dequant_var = block.create_var(
-                name=_dequantized_var_name(out_var.name),
-                type=out_var.type,
-                shape=out_var.shape,
-                dtype=out_var.dtype)
-            # insert fake_dequantize_op
-            dequant_op = block._insert_op(
-                idx + 1,
-                type="fake_dequantize_max_abs",
-                attrs={'max_range': float(max_range)},
-                inputs={"X": out_var,
-                        'Scale': scale_var},
-                outputs={"Out": dequant_var})
-            op_out_rename_map[block_id][out_var.name] = dequant_var.name
-            return dequant_var
-
-        def _load_var(name):
-            return np.array(scope.find_var(name).get_tensor())
-
-        def _restore_var(name, arr):
-            t = scope.find_var(name).get_tensor()
-            t.set(arr, place)
-
-        for block in program.blocks:
-            ops = list(block.ops)
-            block_id = block.idx
-            for op in ops:
-                op_type = op.type
-
-                # insert dequant_op after fc/conv, need to rename
-                # input of the followed ops(of fc/conv) to the dquant_op
-                for name in op.input_arg_names:
-                    if name in op_out_rename_map[block_id]:
-                        op._rename_input(name,
-                                         op_out_rename_map[block_id][name])
-
-                if op_type in self.fake_quant_op_types:
-                    in_arg_name = op.input('X')[0]
-                    if in_arg_name in persistable_vars:
-                        if self.weight_quantize_type == 'abs_max':
-                            param = _load_var(in_arg_name)
-                            scale_v = np.max(np.abs(param))
-                        else:
-                            scale_v = _load_var(op.output('OutScale')[0])
-                        var_scale_map[block_id][in_arg_name] = scale_v
-                    else:
-                        scale_v = block.var(op.output('OutScale')[0])
-                        var_scale_map[block_id][in_arg_name] = scale_v
-
-                    if in_arg_name in persistable_vars:
-                        _remove_fake_quant_and_dequant_op(block, op)
-                        # quantize weight and restore
-                        param_t = _load_var(in_arg_name)
-                        param_q_t = quant(param_t, scale_v, self.weight_bits)
-                        _restore_var(in_arg_name, param_q_t)
-
-                if op_type in self.fake_dequant_op_types:
-                    _remove_fake_quant_and_dequant_op(block, op)
-
-                if op_type in _QUANTIZABLE_OP_TYPES:
-                    dequant_var = _insert_post_dequant_op(block, op)
-
-        # remove the unused var in ProgramDesc
-        self._remove_unused_var(program)
-        #program = program.clone()
-
-    def convert_to_int8(self, program, place, scope=None):
-        scope = global_scope() if scope is None else scope
-        program = default_main_program() if program is None else program
-
-        def _load_var(name):
-            return np.array(scope.find_var(name).get_tensor())
-
-        global_block = program.global_block()
-
-        def convert_to_int8(var):
-            int8_var_name = var.name + ".int8"
-            int8_var = global_block.create_parameter(
-                name=int8_var_name.encode('ascii'),
-                type=var.type,
-                dtype=core.VarDesc.VarType.INT8,
-                shape=var.shape)
-
-            tensor = _load_var(var.name)
-
-            scope.var(int8_var_name)
-            int8_tensor = scope.find_var(int8_var_name).get_tensor()
-            int8_tensor.set(tensor.astype(np.int8), place)
-            return int8_var
-
-        input_map = {}
-        for block in program.blocks:
-            for op in list(block.ops):
-                if op.type in _QUANTIZABLE_OP_TYPES:
-                    for name in op.input_arg_names:
-                        var = block.var(name)
-                        if var.persistable:
-                            if name not in input_map:
-                                int8_var = convert_to_int8(var)
-                                input_map[name] = int8_var.name
-                            op._rename_input(name, input_map[name])
-        self._remove_unused_var(program)
-
-    def _remove_unused_var(self, program):
-        all_remove_vars = []
-        for block in program.blocks:
-            args = []
-            for op in block.ops:
-                args += op.input_arg_names
-                args += op.output_arg_names
-            args = list(set(args))  #vals of all left ops
-            var_names = block.vars.keys()  # all vals
-            sub_block_remove_vars = []
-            for var in var_names:
-                if var not in args:
-                    sub_block_remove_vars.append(var)
-            all_remove_vars.append(sub_block_remove_vars)
-
-        remove_vars = [list(set(v)) for v in all_remove_vars]
-        for i, block in enumerate(program.blocks):
-            for v in remove_vars[i]:
-                block._remove_var(v)
-
-    def _insert_quant_abs_max_op(self, block, idx, var, quant_bits):
-        """Insert fake_quantize_abs_max op.
-        """
-        quant_var = block.create_var(
-            name=_quantized_var_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
-        scale = block.create_var(
-            name=_quantized_scale_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
-        quant_op = block._insert_op(
-            idx,
-            type='fake_quantize_abs_max',
-            attrs={'bit_length': quant_bits},
-            inputs={'X': var},
-            outputs={'Out': quant_var,
-                     'OutScale': scale})
-        return quant_var, scale
-
-    def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
-        """Insert fake_quantize_range_abs_max
-        """
-        quant_var = block.create_var(
-            name=_quantized_var_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
-        scale = self.helper.create_parameter(
-            attr=ParamAttr(
-                name=_quantized_scale_name(var.name),
-                initializer=Constant(0.001),
-                trainable=False),
-            shape=[1],
-            dtype=var.dtype)
-        scale.stop_gradient = True
-
-        ins = {'X': var, 'InScale': scale}
-        outs = {'Out': quant_var, 'OutScale': scale}
-        if not self.is_test:
-            # A global step counter variable with type int64
-            scales = self.helper.create_global_variable(
-                name=unique_name.generate('scales'),
-                persistable=True,
-                dtype=var.dtype,
-                shape=[self.window_size])
-            self.helper.set_variable_initializer(
-                scales, initializer=Constant(value=0))
-
-            ins['Iter'] = self.global_step
-            outs['OutScales'] = scales
-
-        attrs = {
-            'window_size': self.window_size,
-            'bit_length': quant_bits,
-            'is_test': self.is_test
-        }
-
-        quant_op = block._insert_op(
-            idx,
-            type='fake_quantize_range_abs_max',
-            attrs=attrs,
-            inputs=ins,
-            outputs=outs)
-
-        return quant_var, scale
-
-    def _insert_quant_moving_average_abs_max_op(self, block, idx, var,
-                                                quant_bits):
-        """Insert fake_quantize_moving_average_abs_max
-        """
-        quant_var = block.create_var(
-            name=_quantized_var_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
-        state = self.helper.create_global_variable(
-            name=unique_name.generate('state'),
-            persistable=True,
-            dtype=var.dtype,
-            shape=[1])
-        self.helper.set_variable_initializer(
-            state, initializer=Constant(value=1))
-        accum = self.helper.create_global_variable(
-            name=unique_name.generate('accum'),
-            persistable=True,
-            dtype=var.dtype,
-            shape=[1])
-        self.helper.set_variable_initializer(
-            accum, initializer=Constant(value=1))
-        scale = self.helper.create_parameter(
-            attr=ParamAttr(
-                name=_quantized_scale_name(var.name),
-                initializer=Constant(0.001),
-                trainable=False),
-            shape=[1],
-            dtype=var.dtype)
-        scale.stop_gradient = True
-
-        ins = {'X': var, 'InScale': scale}
-        outs = {'Out': quant_var, 'OutScale': scale}
-        if not self.is_test:
-            ins['InState'] = state
-            ins['InAccum'] = accum
-            outs['OutState'] = state
-            outs['OutAccum'] = accum
-
-        attrs = {
-            'bit_length': quant_bits,
-            'moving_rate': self.moving_rate,
-            'is_test': self.is_test
-        }
-
-        quant_op = block._insert_op(
-            idx,
-            type='fake_quantize_moving_average_abs_max',
-            attrs=attrs,
-            inputs=ins,
-            outputs=outs)
-
-        return quant_var, scale
-
-    def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
-        """
-        Insert fake_quantize_op
-        """
-        if quant_type == 'abs_max':
-            return self._insert_quant_abs_max_op(block, idx, var, quant_bits)
-        elif quant_type == 'range_abs_max':
-            return self._insert_quant_range_abs_max_op(block, idx, var,
-                                                       quant_bits)
-        elif quant_type == 'moving_average_abs_max':
-            return self._insert_quant_moving_average_abs_max_op(block, idx, var,
-                                                                quant_bits)
-
-    def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
-        """
-        Insert fake_quantize_op
-        """
-        dequant_var = block.create_var(
-            name=_dequantized_var_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
-        # insert fake_dequantize_op
-        max_range = (1 << (quant_bits - 1)) - 1
-        dequant_op = block._insert_op(
-            idx,
-            type="fake_dequantize_max_abs",
-            attrs={'max_range': float(max_range)},
-            inputs={"X": var,
-                    'Scale': scale},
-            outputs={"Out": dequant_var})
-        return dequant_var
diff --git a/python/paddle/fluid/contrib/reader/README.md b/python/paddle/fluid/contrib/reader/README.md
deleted file mode 100644
index f043a17493ec2b77ae2910ebc02744db42d77dfb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/reader/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-## CTR READER
-
-An multi-thread cpp reader that has the same interface with py_reader. It
-uses cpp multi-thread to read file and is much more faster then the Python read
-thread in py_reader.
-
-Currently, it support two types of file:
- - gzip
- - plain text file
-
-and two types of data format:
- - cvs data format is :
-   * label dense_fea,dense_fea sparse_fea,sparse_fea
- - the svm data format is :
-   * label slot1:fea_sign slot2:fea_sign slot1:fea_sign
-
-## Distributed reader
-
-The distributed reader is mainly used by multi-process tasks, and the input must be a batch reader.
-
-Cons:
-  - It can be operated conveniently so that different processes can read different data.
-
-Pros:
-  - If batch_reader produces training data, and batch_reader loads or preprocesses data for a long time, this data reading method may be slower.
diff --git a/python/paddle/fluid/contrib/reader/__init__.py b/python/paddle/fluid/contrib/reader/__init__.py
deleted file mode 100644
index 32054d1421a27e7e73656e06555eec20e5ed0ea6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/reader/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from .distributed_reader import *
-
-__all__ = []
-__all__ += distributed_reader.__all__
diff --git a/python/paddle/fluid/contrib/reader/distributed_reader.py b/python/paddle/fluid/contrib/reader/distributed_reader.py
deleted file mode 100644
index ecee769218f5474cc5489c51bdc1f443833e66e8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/reader/distributed_reader.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import os
-
-__all__ = ["distributed_batch_reader"]
-
-
-def distributed_batch_reader(batch_reader):
-    """
-    Create a reader for multi-process training. The input must be a batch reader.
-
-    Args:
-        batch_reader (callable): The input reader should be a batch reader.
-
-    Examples:
-
-    .. code-block:: python
-           import paddle
-           import paddle.fluid as fluid
-
-           train_reader = paddle.batch(paddle.dataset.mnist.train(),
-                    batch_size=32,drop_last=True)
-           train_reader = fluid.contrib.reader.distributed_batch_reader(
-                    train_reader)
-
-    """
-    trainers_num = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
-    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
-    assert trainer_id < trainers_num
-
-    def decorate_for_multi_process():
-        if trainers_num > 1:
-            print("start data reader (trainers_num: {}, trainer_id: {})".format(
-                trainers_num, trainer_id))
-
-        train_data, idx = None, 1
-        for batch_id, data in enumerate(batch_reader()):
-            if trainers_num > 1:
-                if idx < trainers_num:
-                    if idx == trainer_id + 1:
-                        train_data = data
-                    idx += 1
-                else:
-                    if idx == trainer_id + 1:
-                        train_data = data
-                    assert train_data is not None, "train data should not be None."
-                    yield train_data
-                    train_data, idx = None, 1
-            else:
-                yield data
-
-    return decorate_for_multi_process
diff --git a/python/paddle/fluid/contrib/slim/__init__.py b/python/paddle/fluid/contrib/slim/__init__.py
deleted file mode 100644
index 4a71fab6d0fc73aa3bbe9c9fe56278e473f354e1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .core import *
-__all__ = ['Compressor', ]
diff --git a/python/paddle/fluid/contrib/slim/core/__init__.py b/python/paddle/fluid/contrib/slim/core/__init__.py
deleted file mode 100644
index 831bd70ecc62f8d576b304c52b0abea994fd2ceb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/core/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import config
-from .config import *
-from . import compressor
-from .compressor import *
-from . import strategy
-from .strategy import *
-
-__all__ = config.__all__ + compressor.__all__ + strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py
deleted file mode 100644
index 6ede756599fde04ef9151ea1c1e10d91ac7ee507..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/core/compressor.py
+++ /dev/null
@@ -1,585 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ....core import CPUPlace, EOFException
-from .... import compiler
-from ....framework import Variable
-from .... import io
-from .... import profiler
-from .... import scope_guard
-from ....data_feeder import DataFeeder
-from ....log_helper import get_logger
-from ..graph import *
-from .config import ConfigFactory
-import numpy as np
-from collections import Iterable
-import time
-import os
-import logging
-import sys
-import pickle
-import functools
-import traceback
-
-__all__ = ['Context', 'Compressor']
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-def cached_reader(reader, sampled_rate, cache_path, cached_id):
-    """
-    Sample partial data from reader and cache them into local file system.
-    Args:
-        reader: Iterative data source.
-        sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
-        cache_path(str): The path to cache the sampled data.
-        cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0.
-    """
-    np.random.seed(cached_id)
-    cache_path = os.path.join(cache_path, str(cached_id))
-    _logger.debug('read data from: {}'.format(cache_path))
-
-    def s_reader():
-        if os.path.isdir(cache_path):
-            for file_name in open(os.path.join(cache_path, "list")):
-                yield np.load(os.path.join(cache_path, file_name.strip()))
-        else:
-            os.makedirs(cache_path)
-            list_file = open(os.path.join(cache_path, "list"), 'w')
-            batch = 0
-            dtype = None
-            for data in reader():
-                if batch == 0 or (np.random.uniform() < sampled_rate):
-                    np.save(
-                        os.path.join(cache_path, 'batch' + str(batch)), data)
-                    list_file.write('batch' + str(batch) + '.npy\n')
-                    batch += 1
-                    yield data
-
-    return s_reader
-
-
-class Context(object):
-    """
-    The context in the process of compression.
-    """
-
-    def __init__(self,
-                 place,
-                 scope,
-                 train_graph=None,
-                 train_reader=None,
-                 eval_graph=None,
-                 eval_reader=None,
-                 teacher_graphs=None,
-                 train_optimizer=None,
-                 distiller_optimizer=None,
-                 search_space=None):
-        """
-        Args:
-            place: The device place where the compression job running.
-            scope: The scope used in compression job.
-            train_graph: The graph with loss as output node.
-            eval_graph: The graph used for evaluation.
-            eval_reader: The data reader used for evaluation.
-            teacher_graphs: The teacher graphs used in distillation strategies.
-            train_optimizer: The optimizer used to append backward ops and
-                             optimization ops into train_graph.
-            distiller_optimizer: The optimizer used by distillation strategies.
-        """
-        # The total number of epoches to be trained.
-        self.epoch = 0
-        # Current epoch
-        self.epoch_id = 0
-        # Current batch
-        self.batch_id = 0
-
-        self.k_v = {}
-
-        self.place = place
-        self.scope = scope
-        self.train_graph = train_graph
-        self.train_reader = train_reader
-        self.eval_graph = eval_graph
-        self.eval_reader = eval_reader
-        self.executor = None
-        self.teacher_graphs = teacher_graphs
-        self.train_optimizer = train_optimizer
-        self.distiller_optimizer = distiller_optimizer
-        self.optimize_graph = None
-        self.cache_path = './eval_cache'
-        self.eval_results = {}
-
-        self.skip_training = False
-        self.search_space = search_space
-
-    def to_file(self, file_name):
-        """
-        Save the context into file.
-        """
-        data = {}
-        data['epoch_id'] = self.epoch_id
-        data['eval_results'] = self.eval_results
-        with open(file_name, 'wb') as context_file:
-            pickle.dump(data, context_file)
-
-    def from_file(self, file_name):
-        """
-        Load the context from file.
-        """
-        with open(file_name, 'rb') as context_file:
-            if sys.version_info < (3, 0):
-                data = pickle.load(context_file)
-            else:
-                data = pickle.load(context_file, encoding='bytes')
-            self.epoch_id = data['epoch_id']
-            self.eval_results = data['eval_results']
-
-    def eval_converged(self, metric_name, delta=0.001):
-        """
-        Check whether the training has been converged.
-        Args:
-            metric_name(str): The metric used to check convergence.
-            delta(float): '(metric[k] - metric[k-1] / metric[k-1]) < delta'
-                          means that the training has been converged.
-        Returns:
-            bool: True means the training has been converged.
-        """
-        # TODO(wanghaoshuang@baidu.com): enhence this method.
-        if (metric_name not in self.eval_results
-            ) or len(self.eval_results[metric_name]) < 2:
-            return False
-        results = self.eval_results[metric_name][-2:]
-        _logger.info('Latest evaluations: {}'.format(results))
-        return abs(results[1] - results[0]) / results[0] < delta
-
-    def run_eval_graph(self, sampled_rate=None, cached_id=0):
-        """
-        Evaluate the current mode in context.
-        Args:
-            sampled_rate(float): The sampled rate used to sample partial data
-            for evaluation. None means using all data in eval_reader. default: None.
-            cached_id(int): The id of dataset sampled. Evaluations with same
-                            cached_id use the same sampled dataset. default: 0.
-        """
-        _logger.info('Running evaluation')
-        assert self.eval_graph is not None
-        assert self.eval_reader is not None
-        eval_graph = self.eval_graph.clone(for_test=True)
-
-        executor = SlimGraphExecutor(self.place)
-        results = []
-        batch_id = 0
-        s_time = time.time()
-        reader = self.eval_reader
-        if sampled_rate:
-            reader = cached_reader(reader, sampled_rate, self.cache_path,
-                                   cached_id)
-
-        if isinstance(reader, Variable):
-            reader.start()
-            try:
-                while True:
-                    result = executor.run(eval_graph, self.scope)
-                    result = [np.mean(r) for r in result]
-                    results.append(result)
-                    if batch_id % 20 == 0:
-                        _logger.info("batch-{}; {}={}".format(
-                            batch_id, eval_graph.out_nodes.keys(), result))
-                    batch_id += 1
-            except EOFException:
-                reader.reset()
-        else:
-            for data in reader():
-                result = executor.run(eval_graph, self.scope, data=data)
-                result = [np.mean(r) for r in result]
-                results.append(result)
-                if batch_id % 20 == 0:
-                    _logger.info("batch-{}; {}={}".format(
-                        batch_id, eval_graph.out_nodes.keys(), result))
-                batch_id += 1
-
-        result = np.mean(np.array(results), axis=0)
-        _logger.info("Final eval result: {}={}".format(
-            eval_graph.out_nodes.keys(), result))
-        if not isinstance(result, Iterable):
-            result = [result]
-        _logger.info('Finish evaluation')
-        return result, eval_graph.out_nodes.keys()
-
-    def put(self, key, value):
-        self.k_v[key] = value
-
-    def get(self, key):
-        return self.k_v.get(key)
-
-
-class Compressor(object):
-    """
-    The pass used to compress model.
-    """
-
-    def __init__(self,
-                 place,
-                 scope,
-                 train_program,
-                 train_reader=None,
-                 train_feed_list=None,
-                 train_fetch_list=None,
-                 eval_program=None,
-                 eval_reader=None,
-                 eval_feed_list=None,
-                 eval_fetch_list=None,
-                 eval_func=None,
-                 save_eval_model=True,
-                 prune_infer_model=None,
-                 teacher_programs=[],
-                 checkpoint_path=None,
-                 train_optimizer=None,
-                 distiller_optimizer=None,
-                 search_space=None):
-        """
-        Args:
-            place(fluid.Place): The device place where the compression job running.
-            scope(fluid.core.Scope): The scope used to run graph.
-            train_program(Program): The main program to be compressed. It must have loss op.
-            train_reader: The data reader used for training.
-            train_feed_list(dict): A dict to indicate the input variable of the training program.
-                                   The key is user-defined and human-readable name.
-                                   The value is the name of Variable.
-            train_fetch_list(dict): A dict to indicate the output variable of the training program.
-                                   The key is user-defined and human-readable name.
-                                   The value is the name of Variable.
-            eval_program(Program): The program used for evaluation.
-            eval_reader: The data reader used for evaluation. It can be None if eval_func is not None.
-            eval_feed_list(dict): A dict to indicate the input variable of the evaluation program.
-                                   The key is user-defined and human-readable name.
-                                   The value is the name of Variable.
-                                   It can be None if eval_func is not None.
-            eval_fetch_list(dict): A dict to indicate the output variable of the evaluation program.
-                                   The key is user-defined and human-readable name.
-                                   The value is the name of Variable.
-            eval_func(dict|function): Callback functions used to evaluate the compressed model.
-                                   The eval_func is a dict, the key is user-defined name and the value is 
-                                   a callback function. And the score returned from callback functions 
-                                   can be referenced in config file by the key of eval_func.
-                                   The args of callback function are compressed eval_program and scope which
-                                   store the compressed parameters.
-                                   Default: None.
-            save_eval_model(bool): Whether to save eval model when saving checkpoints. Default: True.
-            prune_infer_model(tuple|list): If prune_infer_model is not None, compressor will prune
-                                   eval program into inference program according to inputs and outputs
-                                   defined in prune_infer_model. prune_infer_model[0] is a list of input
-                                   variables' names and prune_infer_model[1] is a list of output variables'
-                                   names. If prune_infer_model is None, it will not save inference model.
-                                   Default: None.
-            teacher_programs: The teacher graphs used in distillation strategies.
-            train_optimizer: The optimizer used to append backward ops and
-                             optimization ops into train_graph.
-            distiller_optimizer: The optimizer used by distillation strategies. In distillation strategy,
-                                 this optimizer is used to minimize the combined loss of student-net and
-                                 teacher-net while train_optimizer is used to minimize loss of
-                                 student-net in fine-tune stage. 
-            search_space(slim.nas.SearchSpace): The instance that define the searching space. It must inherite
-                              slim.nas.SearchSpace class and overwrite the abstract methods.
-
-        """
-        assert train_feed_list is None or isinstance(
-            train_feed_list, list
-        ), "train_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
-        assert eval_feed_list is None or isinstance(
-            eval_feed_list, list
-        ), "eval_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
-        self.strategies = []
-        self.epoch = 0
-        self.place = CPUPlace() if place is None else place
-        self.scope = scope
-        self.train_graph = GraphWrapper(
-            train_program, in_nodes=train_feed_list, out_nodes=train_fetch_list)
-        self.eval_graph = GraphWrapper(
-            eval_program, in_nodes=eval_feed_list, out_nodes=eval_fetch_list)
-        self.train_reader = train_reader
-        self.eval_reader = eval_reader
-        self.eval_func = eval_func
-        self.save_eval_model = save_eval_model
-        self.prune_infer_model = prune_infer_model
-
-        self.teacher_graphs = []
-        for teacher in teacher_programs:
-            self.teacher_graphs.append(GraphWrapper(teacher))
-
-        self.checkpoint = None
-        self.checkpoint_path = checkpoint_path
-        self.eval_epoch = 1
-
-        self.train_optimizer = train_optimizer
-        self.distiller_optimizer = distiller_optimizer
-        self.init_model = None
-
-        self.search_space = search_space
-
-    def _add_strategy(self, strategy):
-        """
-        Add a strategy to current compress pass.
-        Args:
-            strategy: The strategy to be added into current compress pass.
-        """
-        self.strategies.append(strategy)
-        self.epoch = max(strategy.end_epoch, self.epoch)
-
-    def config(self, config_file):
-        """
-        Configure the compress pass from file with yaml format.
-        Args:
-            config_file(str): The config file in local file system.
-        """
-        factory = ConfigFactory(config_file)
-        self.epoch = factory.compressor['epoch']
-        for strategy in factory.compressor['strategies']:
-            self._add_strategy(strategy)
-        if 'checkpoint_path' in factory.compressor:
-            self.checkpoint_path = factory.compressor['checkpoint_path']
-
-        if 'init_model' in factory.compressor:
-            self.init_model = factory.compressor['init_model']
-
-        if 'eval_epoch' in factory.compressor:
-            self.eval_epoch = factory.compressor['eval_epoch']
-
-    def _init_model(self, context):
-        """
-        Load model that has been compressed. 
-        """
-        if self.init_model and os.path.exists(self.init_model):
-            exe = SlimGraphExecutor(context.place)
-            with scope_guard(context.scope):
-                context.train_graph.load_persistables(self.init_model, exe)
-            flops = context.eval_graph.flops()
-            conv_flops = context.eval_graph.flops(only_conv=True)
-            context.eval_graph.update_param_shape(context.scope)
-            context.eval_graph.update_groups_of_conv()
-            _logger.info("conv flops: -{}".format(1 - float(
-                context.eval_graph.flops(only_conv=True)) / conv_flops))
-            _logger.info("total flops: -{}".format(1 - float(
-                context.eval_graph.flops()) / flops))
-            context.train_graph.update_param_shape(context.scope)
-            context.train_graph.update_groups_of_conv()
-            context.train_graph.infer_shape()
-            _logger.info("Init model from: {}".format(self.init_model))
-
-    def _load_checkpoint(self, context):
-        """
-        Load checkpoints from file.
-        """
-        _logger.debug('_load_checkpoint')
-        strategies = self.strategies
-        if self.checkpoint_path:
-            if not os.path.exists(self.checkpoint_path):
-                _logger.warning("Checkpints path doesn't exist: [{}]".format(
-                    self.checkpoint_path))
-                return context, strategies
-            checkpoints = [
-                dir for dir in os.listdir(self.checkpoint_path)
-                if os.path.isdir(os.path.join(self.checkpoint_path, dir))
-            ]
-            _logger.debug('self.checkpoint_path: {}'.format(
-                self.checkpoint_path))
-            _logger.info('checkpoints: {}'.format(checkpoints))
-            if len(checkpoints) > 0:
-                latest = max([int(ck) for ck in checkpoints])
-                latest_ck_path = os.path.join(self.checkpoint_path, str(latest))
-
-                model_path = os.path.join(latest_ck_path, 'model')
-                context_path = os.path.join(latest_ck_path, 'context')
-                strategy_path = os.path.join(latest_ck_path, 'strategies')
-                if os.path.exists(context_path):
-                    context.from_file(context_path)
-                    context.epoch_id += 1
-                if os.path.exists(strategy_path):
-                    with open(strategy_path, 'rb') as strategy_file:
-                        if sys.version_info < (3, 0):
-                            strategies = pickle.load(strategy_file)
-                        else:
-                            strategies = pickle.load(
-                                strategy_file, encoding='bytes')
-
-                for s, s1 in zip(self.strategies, strategies):
-                    s1.__dict__.update(s.__dict__)
-
-                for strategy in strategies:
-                    strategy.restore_from_checkpoint(context)
-
-                if os.path.exists(model_path):
-                    exe = SlimGraphExecutor(context.place)
-                    with scope_guard(context.scope):
-                        context.optimize_graph.load_persistables(model_path,
-                                                                 exe)
-                    _logger.info("Loaded params from: {}".format(model_path))
-        return context, strategies
-
-    def _save_checkpoint(self, context):
-        """
-        Save checkpoints to file.
-        """
-        if context.epoch_id % 1 == 0 and self.checkpoint_path:
-            checkpoint_path = os.path.join(self.checkpoint_path,
-                                           str(context.epoch_id))
-            model_path = os.path.join(checkpoint_path, 'model')
-            eval_model_path = os.path.join(checkpoint_path, 'eval_model')
-            context_path = os.path.join(checkpoint_path, 'context')
-            strategy_path = os.path.join(checkpoint_path, 'strategies')
-            if not os.path.isdir(model_path):
-                os.makedirs(model_path)
-            exe = SlimGraphExecutor(context.place)
-            with scope_guard(context.scope):
-                context.optimize_graph.save_persistables(model_path, exe)
-                if self.save_eval_model:
-                    context.eval_graph.save_model(eval_model_path, exe)
-                if self.prune_infer_model:
-                    context.eval_graph.save_infer_model(
-                        eval_model_path,
-                        exe,
-                        self.prune_infer_model,
-                        program_only=self.save_eval_model)
-
-            context.to_file(context_path)
-            with open(strategy_path, 'wb') as strategy_file:
-                pickle.dump(self.strategies, strategy_file)
-            _logger.info('Saved checkpoint to: {}'.format(checkpoint_path))
-
-    def _train_one_epoch(self, context):
-        """
-        Train one epoch.
-        """
-        if context.skip_training:
-            return
-        executor = SlimGraphExecutor(self.place)
-
-        if context.optimize_graph.compiled_graph is None:
-            context.optimize_graph.compiled_graph = compiler.CompiledProgram(
-                context.optimize_graph.program).with_data_parallel(
-                    loss_name=context.optimize_graph.out_nodes['loss'])
-
-        if isinstance(context.train_reader, Variable):
-            context.train_reader.start()
-            try:
-                while True:
-
-                    for strategy in self.strategies:
-                        strategy.on_batch_begin(context)
-                    results = executor.run(context.optimize_graph,
-                                           context.scope)
-                    results = [float(np.mean(result)) for result in results]
-                    if context.batch_id % 20 == 0:
-                        _logger.info("epoch:{}; batch_id:{}; {} = {}".format(
-                            context.epoch_id, context.batch_id,
-                            context.optimize_graph.out_nodes.keys(
-                            ), [round(r, 3) for r in results]))
-                    for strategy in self.strategies:
-                        strategy.on_batch_end(context)
-                    context.batch_id += 1
-
-            except EOFException:
-                context.train_reader.reset()
-
-        else:
-            for data in context.train_reader():
-                for strategy in self.strategies:
-                    strategy.on_batch_begin(context)
-                results = executor.run(context.optimize_graph,
-                                       context.scope,
-                                       data=data)
-                results = [float(np.mean(result)) for result in results]
-                if context.batch_id % 20 == 0:
-                    _logger.info("epoch:{}; batch_id:{}; {} = {}".format(
-                        context.epoch_id, context.batch_id,
-                        context.optimize_graph.out_nodes.keys(
-                        ), [round(r, 3) for r in results]))
-                for strategy in self.strategies:
-                    strategy.on_batch_end(context)
-                context.batch_id += 1
-        context.batch_id = 0
-
-    def _eval(self, context):
-        """
-        Runing evaluation.
-        """
-        if self.eval_func is not None:
-            for key in self.eval_func:
-                func = self.eval_func[key]
-                if key not in context.eval_results:
-                    context.eval_results[key] = []
-                context.eval_results[key].append(
-                    func(self.eval_graph.program, self.scope))
-        else:
-            results, names = context.run_eval_graph()
-            for name, result in zip(names, results):
-                if name not in context.eval_results:
-                    context.eval_results[name] = []
-                context.eval_results[name].append(result)
-
-    def run(self):
-        """
-        Execute compressiong pass.
-        """
-        context = Context(
-            place=self.place,
-            scope=self.scope,
-            train_graph=self.train_graph,
-            train_reader=self.train_reader,
-            eval_graph=self.eval_graph,
-            eval_reader=self.eval_reader,
-            teacher_graphs=self.teacher_graphs,
-            train_optimizer=self.train_optimizer,
-            distiller_optimizer=self.distiller_optimizer,
-            search_space=self.search_space)
-        self.context = context
-        if self.teacher_graphs:
-            context.put('teachers', self.teacher_graphs)
-        self._init_model(context)
-        if not context.optimize_graph:
-            if context.train_optimizer:
-                context.train_optimizer._name = 'train_opt'
-                context.optimize_graph = context.train_graph.get_optimize_graph(
-                    context.train_optimizer, context.place, context.scope)
-            else:
-                context.optimize_graph = context.train_graph
-
-        context, self.strategies = self._load_checkpoint(context)
-
-        for strategy in self.strategies:
-            strategy.on_compression_begin(context)
-        if 'MKLDNNPostTrainingQuantStrategy' in [
-                i.__class__.__name__ for i in self.strategies
-        ]:
-            return None
-        start = context.epoch_id
-        for epoch in range(start, self.epoch):
-            context.epoch_id = epoch
-            try:
-                for strategy in self.strategies:
-                    strategy.on_epoch_begin(context)
-                self._train_one_epoch(context)
-                if self.eval_epoch and epoch % self.eval_epoch == 0:
-                    self._eval(context)
-                self._save_checkpoint(context)
-                for strategy in self.strategies:
-                    strategy.on_epoch_end(context)
-            except Exception:
-                _logger.error(traceback.print_exc())
-                continue
-        for strategy in self.strategies:
-            strategy.on_compression_end(context)
-        return context.eval_graph
diff --git a/python/paddle/fluid/contrib/slim/core/config.py b/python/paddle/fluid/contrib/slim/core/config.py
deleted file mode 100644
index cbe1c736fe8ebc1bf6b9032d49e36067187a0878..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/core/config.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import funcsigs
-import yaml
-from collections import OrderedDict
-from ..prune import *
-from ..quantization import *
-from .strategy import *
-from ..distillation import *
-from ..searcher import *
-from ..nas import *
-
-__all__ = ['ConfigFactory']
-"""This factory is used to create instances by loading and parsing configure file with yaml format.
-"""
-
-PLUGINS = ['pruners', 'quantizers', 'distillers', 'strategies', 'controllers']
-
-
-class ConfigFactory(object):
-    def __init__(self, config):
-        """Init a factory from configure file."""
-        self.instances = {}
-        self.compressor = {}
-        self.version = None
-        self._parse_config(config)
-
-    def instance(self, name):
-        """
-        Get instance from factory.
-        """
-        if name in self.instances:
-            return self.instances[name]
-        else:
-            return None
-
-    def _new_instance(self, name, attrs):
-        if name not in self.instances:
-            class_ = globals()[attrs['class']]
-            sig = funcsigs.signature(class_.__init__)
-            keys = [
-                param.name for param in sig.parameters.values()
-                if (param.kind == param.POSITIONAL_OR_KEYWORD)
-            ][1:]
-            keys = set(attrs.keys()).intersection(set(keys))
-            args = {}
-            for key in keys:
-                value = attrs[key]
-                if isinstance(value, str) and value.lower() == 'none':
-                    value = None
-                if isinstance(value, str) and value in self.instances:
-                    value = self.instances[value]
-                if isinstance(value, list):
-                    for i in range(len(value)):
-                        if isinstance(value[i],
-                                      str) and value[i] in self.instances:
-                            value[i] = self.instances[value[i]]
-
-                args[key] = value
-            self.instances[name] = class_(**args)
-        return self.instances.get(name)
-
-    def _parse_config(self, config):
-        assert config
-        with open(config, 'r') as config_file:
-            key_values = self._ordered_load(config_file)
-            for key in key_values:
-                # parse version
-                if key == 'version' and self.version is None:
-                    self.version = int(key_values['version'])
-                    assert self.version == int(key_values['version'])
-
-                # parse pruners
-                if key in PLUGINS:
-                    instances = key_values[key]
-                    for name in instances:
-                        self._new_instance(name, instances[name])
-
-                if key == 'compressor':
-                    self.compressor['strategies'] = []
-                    self.compressor['epoch'] = key_values[key]['epoch']
-                    if 'init_model' in key_values[key]:
-                        self.compressor['init_model'] = key_values[key][
-                            'init_model']
-                    if 'checkpoint_path' in key_values[key]:
-                        self.compressor['checkpoint_path'] = key_values[key][
-                            'checkpoint_path']
-                    if 'eval_epoch' in key_values[key]:
-                        self.compressor['eval_epoch'] = key_values[key][
-                            'eval_epoch']
-                    if 'strategies' in key_values[key]:
-                        for name in key_values[key]['strategies']:
-                            strategy = self.instance(name)
-                            self.compressor['strategies'].append(strategy)
-
-                if key == 'include':
-                    for config_file in key_values[key]:
-                        self._parse_config(config_file.strip())
-
-    def _ordered_load(self,
-                      stream,
-                      Loader=yaml.Loader,
-                      object_pairs_hook=OrderedDict):
-        """
-        See: https://stackoverflow.com/questions/5121931/in-python-how-can-you-load-yaml-mappings-as-ordereddicts
-        """
-
-        class OrderedLoader(Loader):
-            pass
-
-        def construct_mapping(loader, node):
-            loader.flatten_mapping(node)
-            return object_pairs_hook(loader.construct_pairs(node))
-
-        OrderedLoader.add_constructor(
-            yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping)
-        return yaml.load(stream, OrderedLoader)
diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py
deleted file mode 100644
index c0ddb758f39c9e295f3eadca713a29f1738bb3d4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/core/strategy.py
+++ /dev/null
@@ -1,58 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ['Strategy']
-
-
-class Strategy(object):
-    """
-    Base class for all strategies.
-    """
-
-    def __init__(self, start_epoch=0, end_epoch=0):
-        """
-        Args:
-            start_epoch: The first epoch to apply the strategy.
-            end_epoch: The last epoch to apply the strategy.
-        """
-        self.start_epoch = start_epoch
-        self.end_epoch = end_epoch
-
-    def __getstate__(self):
-        d = {}
-        for key in self.__dict__:
-            if key not in ["start_epoch", "end_epoch"]:
-                d[key] = self.__dict__[key]
-        return d
-
-    def on_compression_begin(self, context):
-        pass
-
-    def on_epoch_begin(self, context):
-        pass
-
-    def on_epoch_end(self, context):
-        pass
-
-    def on_batch_begin(self, context):
-        pass
-
-    def on_batch_end(self, context):
-        pass
-
-    def on_compression_end(self, context):
-        pass
-
-    def restore_from_checkpoint(self, context):
-        pass
diff --git a/python/paddle/fluid/contrib/slim/distillation/__init__.py b/python/paddle/fluid/contrib/slim/distillation/__init__.py
deleted file mode 100644
index 455c7c563318daec42892e71dcf0a48f22f376a1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/distillation/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import distiller
-from .distiller import *
-from . import distillation_strategy
-from .distillation_strategy import *
-
-__all__ = distiller.__all__
-__all__ += distillation_strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
deleted file mode 100644
index c54e5dc5b559b428cca99f6e4cce7b9e342535c8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..core.strategy import Strategy
-from ....framework import Program, Variable, program_guard
-from ....log_helper import get_logger
-from .... import Executor
-import logging
-
-__all__ = ['DistillationStrategy']
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class DistillationStrategy(Strategy):
-    def __init__(self, distillers=None, start_epoch=0, end_epoch=0):
-        """
-        Args:
-            distillers(list): A list of distiller used to combine student graph and teacher graph
-                              by adding some loss.
-            start_epoch(int): The epoch when to merge student graph and teacher graph for
-                              distillation training. default: 0
-            end_epoch(int): The epoch when to finish distillation training. default: 0
-            
-        """
-        super(DistillationStrategy, self).__init__(start_epoch, end_epoch)
-        self.distillers = distillers
-
-    def restore_from_checkpoint(self, context):
-        # load from checkpoint
-        if context.epoch_id > 0:
-            if context.epoch_id > self.start_epoch and context.epoch_id < self.end_epoch:
-                _logger.info('Restore DistillationStrategy')
-                self._create_distillation_graph(context)
-                _logger.info('Restore DistillationStrategy finish.')
-
-    def on_epoch_begin(self, context):
-        if self.start_epoch == context.epoch_id:
-            _logger.info('DistillationStrategy::on_epoch_begin.')
-            self._create_distillation_graph(context)
-            _logger.info('DistillationStrategy set optimize_graph.')
-
-    def _create_distillation_graph(self, context):
-        """
-        step 1: Merge student graph and teacher graph into distillation graph.
-        step 2: Add loss into distillation graph by distillers.
-        step 3: Append backward ops and optimize ops into distillation graph for training.
-        """
-        # step 1
-        teacher = context.teacher_graphs[0]
-        for var in teacher.program.list_vars():
-            var.stop_gradient = True
-        graph = context.train_graph.clone()
-        graph.merge(teacher)
-        if 'loss' in graph.out_nodes:
-            graph.out_nodes['student_loss'] = graph.out_nodes['loss']
-
-        # step 2
-        for distiller in self.distillers:
-            graph = distiller.distiller_loss(graph)
-
-        # step 3
-        startup_program = Program()
-        with program_guard(graph.program, startup_program):
-            context.distiller_optimizer._name = 'distillation_optimizer'
-
-            # The learning rate variable may be created in other program.
-            # Update information in optimizer to make
-            # learning rate variable being accessible in current program.
-            optimizer = context.distiller_optimizer
-            if isinstance(optimizer._learning_rate, Variable):
-                optimizer._learning_rate_map[
-                    graph.program] = optimizer._learning_rate
-
-            optimizer.minimize(graph.var(graph.out_nodes['loss'])._var)
-
-        exe = Executor(context.place)
-        exe.run(startup_program, scope=context.scope)
-
-        # backup graph for fine-tune after distillation
-        context.put('distillation_backup_optimize_graph',
-                    context.optimize_graph)
-        context.optimize_graph = graph
-
-    def on_epoch_end(self, context):
-        if context.epoch_id == (self.end_epoch - 1):
-            _logger.info('DistillationStrategy::on_epoch_end.')
-            # restore optimize_graph for fine-tune or other strategy in next stage.
-            context.optimize_graph = context.get(
-                'distillation_backup_optimize_graph')
-            _logger.info(
-                'DistillationStrategy set context.optimize_graph to None.')
diff --git a/python/paddle/fluid/contrib/slim/distillation/distiller.py b/python/paddle/fluid/contrib/slim/distillation/distiller.py
deleted file mode 100644
index eda7954a2f1d8e3364a14b0d6ccb81fcbf5d489f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/distillation/distiller.py
+++ /dev/null
@@ -1,282 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .... import layers
-from .... import optimizer
-from .... import Executor
-from .... import Program
-from .... import program_guard
-from .... import regularizer
-
-__all__ = ['FSPDistiller', 'L2Distiller', 'SoftLabelDistiller']
-
-
-class L2Distiller(object):
-    """
-    Combine two layers from student net and teacher net by l2-loss.
-    And add the loss into the total loss using for distillation training.
-    """
-
-    def __init__(self,
-                 student_feature_map,
-                 teacher_feature_map,
-                 distillation_loss_weight=1):
-        """
-        Args:
-            student_feature_map(str): The name of feature map from student network.
-            teacher_feature_map(str): The name of feature map from teacher network.
-                                      It's shape should be the same with student network.
-            distillation_loss_weight(float): The weight of the l2-loss.
-        """
-        self.student_feature_map = student_feature_map
-        self.teacher_feature_map = teacher_feature_map
-        self.distillation_loss_weight = distillation_loss_weight
-
-    def distiller_loss(self, graph):
-        """
-        Modify graph inplace to add l2-loss.
-        Args: 
-            graph(GraphWrapper): The graph to be modified.
-        Returns:
-            GraphWrapper: The modified graph.
-        """
-        distiller_pass = L2DistillerPass(self.student_feature_map,
-                                         self.teacher_feature_map,
-                                         self.distillation_loss_weight)
-        dis_graph = distiller_pass.apply(graph)
-        return dis_graph
-
-
-class L2DistillerPass(object):
-    """
-    The pass used to add l2-loss.
-    """
-
-    def __init__(self,
-                 student_feature_map,
-                 teacher_feature_map,
-                 distillation_loss_weight=1):
-        """
-        Args:
-            student_feature_map(str): The name of feature map from student network.
-            teacher_feature_map(str): The name of feature map from teacher network.
-                                      It's shape should be the same with student network.
-            distillation_loss_weight(float): The weight of the l2-loss.
-        """
-        self.student_feature_map = student_feature_map
-        self.teacher_feature_map = teacher_feature_map
-        self.distillation_loss_weight = distillation_loss_weight
-
-    def apply(self, graph):
-        ret_graph = graph
-        with program_guard(ret_graph.program):
-
-            student_feature_map = ret_graph.var(self.student_feature_map)._var
-            teacher_feature_map = ret_graph.var(self.teacher_feature_map)._var
-            l2loss = layers.reduce_mean(
-                layers.square(student_feature_map - teacher_feature_map))
-
-            distillation_loss = l2loss * self.distillation_loss_weight
-            student_loss = 0
-            if 'loss' in ret_graph.out_nodes:
-                student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
-            loss = distillation_loss + student_loss
-
-            ret_graph.out_nodes['loss'] = loss.name
-            ret_graph.out_nodes[
-                'l2loss_' + self.student_feature_map + "_" +
-                self.teacher_feature_map] = distillation_loss.name
-        return ret_graph
-
-
-class FSPDistiller(object):
-    """
-    Combine layers from student net and teacher net by fsp-loss.
-    """
-
-    def __init__(self, student_pairs, teacher_pairs,
-                 distillation_loss_weight=1):
-        """
-        Args:
-            student_pairs(list<tuple>): Each tuple, with two variable names, in student_pairs indicates
-                                        a section in student network. The variables in a tuple should
-                                        have the same feature map size.
-            teacher_pairs(list<tuple>): Each tuple, with two variable names, in teacher_pairs indicates
-                                        a section in teacher network. The variables in a tuple should
-                                        have the same feature map size. Varibale named teacher_pairs[i][j]
-                                        should has the save channel number with that of variable named 
-                                        student_pairs[i][j].
-
-            distillation_loss_weight(float): The weight of the fsp-loss. default: 1.
-        """
-        self.student_pairs = student_pairs
-        self.teacher_pairs = teacher_pairs
-        self.distillation_loss_weight = distillation_loss_weight
-
-    def distiller_loss(self, graph):
-        """
-        Modify graph inplace to add fsp-loss.
-        Args: 
-            graph(GraphWrapper): The graph to be modified.
-        Returns:
-            GraphWrapper: The modified graph.
-        """
-        distiller_pass = FSPDistillerPass(self.student_pairs,
-                                          self.teacher_pairs,
-                                          self.distillation_loss_weight)
-        dis_graph = distiller_pass.apply(graph)
-        return dis_graph
-
-
-class FSPDistillerPass(object):
-    '''
-    Combine layers from student net and teacher net by fsp-loss.
-    '''
-
-    def __init__(self, s_pairs, t_pairs, distillation_loss_weight=1):
-        """
-        Args:
-            s_pairs(list<tuple>): Each tuple, with two variable names, in student_pairs indicates
-                                        a section in student network. The variables in a tuple should
-                                        have the same feature map size.
-            t_pairs(list<tuple>): Each tuple, with two variable names, in teacher_pairs indicates
-                                        a section in teacher network. The variables in a tuple should
-                                        have the same feature map size. Varibale named teacher_pairs[i][j]
-                                        should has the save channel number with that of variable named 
-                                        student_pairs[i][j].
-
-            distillation_loss_weight(float): The weight of the fsp-loss. default: 1.
-        """
-        self.s_pairs = s_pairs
-        self.t_pairs = t_pairs
-        self.distillation_loss_weight = distillation_loss_weight
-
-    def apply(self, graph):
-        ret_graph = graph
-        with program_guard(ret_graph.program):
-            losses = []
-            for s_pair, t_pair in zip(self.s_pairs, self.t_pairs):
-                s_pair_start = ret_graph.var(s_pair[0])._var
-                s_pair_end = ret_graph.var(s_pair[1])._var
-                s_fsp_matrix = self._fsp_matrix(s_pair_start, s_pair_end)
-                t_pair_start = ret_graph.var(t_pair[0])._var
-                t_pair_end = ret_graph.var(t_pair[1])._var
-                t_fsp_matrix = self._fsp_matrix(t_pair_start, t_pair_end)
-                l2_loss = layers.reduce_mean(
-                    layers.square(s_fsp_matrix - t_fsp_matrix))
-                losses.append(l2_loss)
-            distillation_loss = layers.sum(
-                losses) * self.distillation_loss_weight
-            student_loss = 0
-            if 'loss' in ret_graph.out_nodes:
-                student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
-            loss = distillation_loss + student_loss
-
-            ret_graph.out_nodes['loss'] = loss.name
-            ret_graph.out_nodes[
-                'fsp_distillation_loss'] = distillation_loss.name
-        return ret_graph
-
-    def _fsp_matrix(self, fea_map_0, fea_map_1):
-        return layers.fsp_matrix(fea_map_0, fea_map_1)
-
-
-class SoftLabelDistiller(object):
-    """
-    Combine two layers from student net and teacher net by softmax_with_cross_entropy loss.
-    And add the loss into the total loss using for distillation training.
-    """
-
-    def __init__(self,
-                 student_feature_map=None,
-                 teacher_feature_map=None,
-                 student_temperature=1.0,
-                 teacher_temperature=1.0,
-                 distillation_loss_weight=1):
-        """
-        Args:
-            student_feature_map(str): The name of feature map from student network.
-            teacher_feature_map(str): The name of feature map from teacher network.
-                                      It's shape should be the same with student network.
-            student_temperature(float): Temperature used to divide student_feature_map before softmax_with_cross_entropy. default: 1.0
-            teacher_temperature(float): Temperature used to divide teacher_feature_map before softmax_with_cross_entropy. default: 1.0
-            distillation_loss_weight(float): The weight of the l2-loss.
-        """
-
-        self.student_feature_map = student_feature_map
-        self.teacher_feature_map = teacher_feature_map
-        self.distillation_loss_weight = distillation_loss_weight
-        self.student_temperature = student_temperature
-        self.teacher_temperature = teacher_temperature
-
-    def distiller_loss(self, graph):
-        """
-        Modify graph inplace to add softmax_with_cross_entropy loss.
-        Args: 
-            graph(GraphWrapper): The graph to be modified.
-        Returns:
-            GraphWrapper: The modified graph.
-        """
-        distiller_pass = SoftLabelDistillerPass(
-            self.student_feature_map, self.teacher_feature_map,
-            self.student_temperature, self.teacher_temperature,
-            self.distillation_loss_weight)
-        dis_graph = distiller_pass.apply(graph)
-        return dis_graph
-
-
-class SoftLabelDistillerPass(object):
-    def __init__(self,
-                 student_feature_map,
-                 teacher_feature_map,
-                 student_temperature,
-                 teacher_temperature,
-                 distillation_loss_weight=1):
-        """
-        Args:
-            student_feature_map(str): The name of feature map from student network.
-            teacher_feature_map(str): The name of feature map from teacher network.
-                                      It's shape should be the same with student network.
-            student_temperature(float): Temperature used to divide student_feature_map before softmax_with_cross_entropy.
-            teacher_temperature(float): Temperature used to divide teacher_feature_map before softmax_with_cross_entropy.
-            distillation_loss_weight(float): The weight of the l2-loss.
-        """
-        self.student_feature_map = student_feature_map
-        self.teacher_feature_map = teacher_feature_map
-        self.student_temperature = student_temperature
-        self.teacher_temperature = teacher_temperature
-        self.distillation_loss_weight = distillation_loss_weight
-
-    def apply(self, graph):
-        ret_graph = graph
-        with program_guard(ret_graph.program):
-
-            student_feature_map = ret_graph.var(self.student_feature_map)._var
-            teacher_feature_map = ret_graph.var(self.teacher_feature_map)._var
-            s_fea = student_feature_map / self.student_temperature
-            t_fea = teacher_feature_map / self.teacher_temperature
-            t_fea.stop_gradient = True
-            ce_loss = layers.softmax_with_cross_entropy(
-                s_fea, t_fea, soft_label=True)
-            distillation_loss = ce_loss * self.distillation_loss_weight
-            student_loss = 0
-            if 'loss' in ret_graph.out_nodes:
-                student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
-            loss = distillation_loss + student_loss
-
-            ret_graph.out_nodes['loss'] = loss.name
-            ret_graph.out_nodes[
-                'soft_label_loss_' + self.student_feature_map + "_" +
-                self.teacher_feature_map] = distillation_loss.name
-        return ret_graph
diff --git a/python/paddle/fluid/contrib/slim/graph/__init__.py b/python/paddle/fluid/contrib/slim/graph/__init__.py
deleted file mode 100644
index c5d1c4dbdfb208ea66bb3dc315e502309799492e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/graph/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import executor
-from .executor import *
-from . import graph_wrapper
-from .graph_wrapper import *
-__all__ = executor.__all__
-__all__ += graph_wrapper.__all__
diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py
deleted file mode 100644
index 74de141b06b4d64a1001bd0c6815beb1beb7ea54..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/graph/executor.py
+++ /dev/null
@@ -1,58 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ....compiler import CompiledProgram
-from ....data_feeder import DataFeeder
-from .... import executor
-from .graph_wrapper import GraphWrapper
-
-__all__ = ['SlimGraphExecutor']
-
-
-class SlimGraphExecutor(object):
-    """
-    Wrapper of executor used to run GraphWrapper.
-    """
-
-    def __init__(self, place):
-        self.exe = executor.Executor(place)
-        self.place = place
-
-    def run(self, graph, scope, data=None):
-        """
-        Runing a graph with a batch of data.
-        Args:
-            graph(GraphWrapper): The graph to be executed.
-            scope(fluid.core.Scope): The scope to be used.
-            data(list<tuple>): A batch of data. Each tuple in this list is a sample.
-                               It will feed the items of tuple to the in_nodes of graph.
-        Returns:
-            results(list): A list of result with the same order indicated by graph.out_nodes.
-        """
-        assert isinstance(graph, GraphWrapper)
-        feed = None
-        if data is not None:
-            feeder = DataFeeder(
-                feed_list=list(graph.in_nodes.values()),
-                place=self.place,
-                program=graph.program)
-            feed = feeder.feed(data)
-
-        fetch_list = list(graph.out_nodes.values())
-        program = graph.compiled_graph if graph.compiled_graph else graph.program
-        results = self.exe.run(program,
-                               scope=scope,
-                               fetch_list=fetch_list,
-                               feed=feed)
-        return results
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
deleted file mode 100644
index 78697a2023b68bedecc39a713b72e217523a41f5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
+++ /dev/null
@@ -1,582 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from collections import OrderedDict
-from .... import io
-from .... import compiler
-from ....framework import Program
-from ....framework import program_guard
-from ....framework import Parameter
-from ....framework import Variable
-from ....executor import Executor
-import copy
-from collections import Iterable
-from ....io import save_inference_model, load_inference_model, save_persistables
-import numpy as np
-import pickle
-import os
-
-__all__ = ['GraphWrapper', 'VarWrapper', 'OpWrapper']
-
-OPTIMIZER_OPS = [
-    'momentum',
-    'lars_momentum',
-    'adagrad',
-    'adam',
-    'adamax',
-    'dpsgd',
-    'decayed_adagrad',
-    'adadelta',
-    'rmsprop',
-]
-
-
-class VarWrapper(object):
-    def __init__(self, var, graph):
-        assert isinstance(var, Variable)
-        assert isinstance(graph, GraphWrapper)
-        self._var = var
-        self._graph = graph
-
-    def __eq__(self, v):
-        """
-        Overwrite this function for ...in... syntax in python.
-        """
-        return self._var.name == v._var.name
-
-    def name(self):
-        """
-        Get the name of the variable.
-        """
-        return self._var.name
-
-    def shape(self):
-        """
-        Get the shape of the varibale.
-        """
-        return self._var.shape
-
-    def set_shape(self, shape):
-        """
-        Set the shape of the variable.
-        """
-        self._var.desc.set_shape(shape)
-
-    def inputs(self):
-        """
-        Get all the operators that use this variable as output.
-        Returns:
-            list<OpWrapper>: A list of operators.
-        """
-        ops = []
-        for op in self._graph.ops():
-            if self in op.all_inputs():
-                ops.append(op)
-        return ops
-
-    def outputs(self):
-        """
-        Get all the operators that use this variable as input.
-        Returns:
-            list<OpWrapper>: A list of operators.
-        """
-        ops = []
-        for op in self._graph.ops():
-            if self in op.all_outputs():
-                ops.append(op)
-        return ops
-
-
-class OpWrapper(object):
-    def __init__(self, op, graph):
-        assert isinstance(graph, GraphWrapper)
-        self._op = op
-        self._graph = graph
-
-    def __eq__(self, op):
-        """
-        Overwrite this function for ...in... syntax in python.
-        """
-        return self.idx() == op.idx()
-
-    def all_inputs(self):
-        """
-        Get all the input variables of this operator.
-        """
-        return [
-            self._graph.var(var_name) for var_name in self._op.input_arg_names
-        ]
-
-    def all_outputs(self):
-        """
-        Get all the output variables of this operator.
-        """
-        return [
-            self._graph.var(var_name) for var_name in self._op.output_arg_names
-        ]
-
-    def idx(self):
-        """
-        Get the id of this operator.
-        """
-        return self._op.idx
-
-    def type(self):
-        """
-        Get the type of this operator.
-        """
-        return self._op.type
-
-    def is_bwd_op(self):
-        """
-        Whether this operator is backward op.
-        """
-        return self.type().endswith('_grad')
-
-    def is_opt_op(self):
-        """
-        Whether this operator is optimizer op.
-        """
-        return self.type() in OPTIMIZER_OPS
-
-    def inputs(self, name):
-        """
-        Get all the varibales by the input name.
-        """
-        return [self._graph.var(var_name) for var_name in self._op.input(name)]
-
-    def outputs(self, name):
-        """
-        Get all the varibales by the output name.
-        """
-        return [self._graph.var(var_name) for var_name in self._op.output(name)]
-
-    def set_attr(self, key, value):
-        """
-        Set the value of attribute by attribute's name.
-
-        Args:
-            key(str): the attribute name.
-            value(bool|int|str|float|list): the value of the attribute.
-        """
-        self._op._set_attr(key, value)
-
-    def attr(self, name):
-        """
-        Get the attribute by name.
-
-        Args:
-            name(str): the attribute name.
-
-        Returns:
-            bool|int|str|float|list: The attribute value. The return value
-            can be any valid attribute type.
-        """
-        return self._op.attr(name)
-
-
-class GraphWrapper(object):
-    """
-    It is a wrapper of paddle.fluid.framework.IrGraph with some special functions
-    for paddle slim framework.
-    """
-
-    def __init__(self, program=None, in_nodes=[], out_nodes=[]):
-        """
-        Args:
-            program(framework.Program): A program with 
-            in_nodes(dict): A dict to indicate the input nodes of the graph.
-                            The key is user-defined and human-readable name.
-                            The value is the name of Variable.
-            out_nodes(dict): A dict to indicate the input nodes of the graph.
-                            The key is user-defined and human-readable name.
-                            The value is the name of Variable.
-        """
-        super(GraphWrapper, self).__init__()
-        self.program = Program() if program is None else program
-        self.persistables = {}
-        self.teacher_persistables = {}
-        for var in self.program.list_vars():
-            if var.persistable:
-                self.persistables[var.name] = var
-        self.compiled_graph = None
-        in_nodes = [] if in_nodes is None else in_nodes
-        out_nodes = [] if out_nodes is None else out_nodes
-        self.in_nodes = OrderedDict(in_nodes)
-        self.out_nodes = OrderedDict(out_nodes)
-        self._attrs = OrderedDict()
-
-    def all_parameters(self):
-        """
-        Get all the parameters in this graph.
-        Returns:
-            list<VarWrapper>: A list of VarWrapper instances.
-        """
-        params = []
-        for block in self.program.blocks:
-            for param in block.all_parameters():
-                params.append(VarWrapper(param, self))
-        return params
-
-    def is_parameter(self, var):
-        """
-        Whether the given variable is parameter.
-        Args:
-            var(VarWrapper): The given varibale.
-        """
-        return isinstance(var._var, Parameter)
-
-    def is_persistable(self, var):
-        """
-        Whether the given variable is persistable.
-        Args:
-            var(VarWrapper): The given varibale.
-        """
-        return var._var.persistable
-
-    def compile(self, for_parallel=True, for_test=False, mem_opt=False):
-        """
-        Compile the program in this wrapper to framework.CompiledProgram for next running.
-        This function must be called if the program is modified.
-        Args:
-            for_parallel(bool): Whether the program to run in data parallel way. default: True.
-            for_test(bool): Whether the compiled program is used for test.
-        """
-        target = self.program
-        if for_test:
-            loss = None
-        else:
-            loss = self.out_nodes['loss']
-        if for_parallel:
-            # disable memory optimize for stable training
-            build_strategy = compiler.BuildStrategy()
-            build_strategy.enable_inplace = mem_opt
-            build_strategy.memory_optimize = mem_opt
-            #            build_strategy.async_mode = False
-            self.compiled_graph = compiler.CompiledProgram(
-                target).with_data_parallel(
-                    loss_name=loss, build_strategy=build_strategy)
-        else:
-            self.compiled_graph = compiler.CompiledProgram(target)
-
-    def ops(self):
-        """
-        Return all operator nodes included in the graph as a set.
-        """
-        ops = []
-        for block in self.program.blocks:
-            for op in block.ops:
-                ops.append(OpWrapper(op, self))
-        return ops
-
-    def vars(self):
-        """
-        Get all the variables.
-        """
-        return [VarWrapper(var, self) for var in self.program.list_vars()]
-
-    def var(self, name):
-        """
-        Get the variable by variable name.
-        """
-        return VarWrapper(self.program.global_block().var(name), self)
-
-    def clone(self, for_test=False):
-        """
-        Clone a new graph from current graph.
-        Returns:
-            (GraphWrapper): The wrapper of a new graph.
-        """
-        return GraphWrapper(
-            self.program.clone(for_test),
-            copy.deepcopy(self.in_nodes), copy.deepcopy(self.out_nodes))
-
-    def merge(self, graph):
-        """
-        Merge a graph into current graph.
-        Args:
-            graph(GraphWrapper): The graph to be merged by current graph.
-        """
-        for var in graph.program.list_vars():
-            if var.persistable:
-                self.teacher_persistables[var.name] = var
-            new_var = self.program.global_block()._clone_variable(
-                var, force_persistable=False)
-            new_var.stop_gradient = var.stop_gradient
-            # TODO: parameters should be cloned
-        for op in graph.ops():
-            op = op._op
-            inputs = {}
-            outputs = {}
-            attrs = {}
-            for input_name in op.input_names:
-                inputs[input_name] = [
-                    self.var(in_var_name)._var
-                    for in_var_name in op.input(input_name)
-                ]
-            for output_name in op.output_names:
-                outputs[output_name] = [
-                    self.var(out_var_name)._var
-                    for out_var_name in op.output(output_name)
-                ]
-            for attr_name in op.attr_names:
-                attrs[attr_name] = op.attr(attr_name)
-            self.program.global_block().append_op(
-                type=op.type, inputs=inputs, outputs=outputs, attrs=attrs)
-
-    def program(self):
-        """
-        Get the program in current wrapper.
-        """
-        return self.program
-
-    def pre_ops(self, op):
-        """
-        Get all the previous operators of target operator.
-        Args:
-            op(OpWrapper): Target operator..
-        Returns:
-            list<OpWrapper>: A list of operators.
-        """
-        ops = []
-        for p in self.ops():
-            for in_var in op.all_inputs():
-                if in_var in p.all_outputs():
-                    ops.append(p)
-        return ops
-
-    def next_ops(self, op):
-        """
-        Get all the next operators of target operator.
-        Args:
-            op(OpWrapper): Target operator..
-        Returns:
-            list<OpWrapper>: A list of operators.
-        """
-        ops = []
-        for p in self.ops():
-            for out_var in op.all_outputs():
-                if out_var in p.all_inputs():
-                    ops.append(p)
-        return ops
-
-    def get_param_by_op(self, op):
-        """
-        Get the parameters used by target operator.
-        """
-        assert isinstance(op, OpWrapper)
-        params = []
-        for var in op.all_inputs():
-            if isinstance(var._var, Parameter):
-                params.append(var)
-        assert len(params) > 0
-        return params
-
-    def numel_params(self):
-        """
-        Get the number of elements in all parameters.
-        """
-        ret = 0
-        for param in self.all_parameters():
-            ret += np.product(param.shape())
-        return ret
-
-    def get_optimize_graph(self, optimizer, place, scope, no_grad_var_names=[]):
-        """
-        Get a new graph for training by appending some backward operators and optimization operators.
-        Args:
-            optimizer: The optimzier used to generate training graph.
-            place: The place to run the graph.
-            scope: The scope used to run the graph. Some new variable will be added into this scope.
-            no_grad_var_names(list<str>): Names of variables that should be ignored while computing gradients. default: [].
-        Returns:
-            (GraphWrapper): The wrapper of new graph with backward ops and optimization ops. 
-        """
-        graph = self.clone()
-        startup_program = Program()
-        with program_guard(
-                main_program=graph.program, startup_program=startup_program):
-            target_name = None
-            if 'loss' in graph.out_nodes:
-                target_name = graph.out_nodes['loss']
-            elif 'cost' in graph.out_nodes:
-                target_name = graph.out_nodes['cost']
-            else:
-                return None
-            target = graph.var(target_name)._var
-            # The learning rate variable may be created in other program.
-            # Update information in optimizer to make
-            # learning rate variable being accessible in current program.
-            if isinstance(optimizer._learning_rate, Variable):
-                optimizer._learning_rate_map[
-                    graph.program] = optimizer._learning_rate
-            optimizer.minimize(target, no_grad_set=no_grad_var_names)
-
-        exe = Executor(place)
-        exe.run(program=startup_program, scope=scope)
-        return graph
-
-    def flops(self, only_conv=False):
-        """
-        Get the flops of current graph.
-        Args:
-            only_conv: Only calculating the conv layers. default: False.
-        Returns:
-            int: The flops of current graph.
-        """
-        flops = 0
-        for op in self.ops():
-            if op.type() in ['conv2d', 'depthwise_conv2d']:
-                filter_shape = op.inputs("Filter")[0].shape()
-                input_shape = op.inputs("Input")[0].shape()
-                output_shape = op.outputs("Output")[0].shape()
-                c_out, c_in, k_h, k_w = filter_shape
-                _, _, h_out, w_out = output_shape
-                groups = op.attr("groups")
-                kernel_ops = k_h * k_w * (c_in / groups)
-                if len(op.inputs("Bias")) > 0:
-                    with_bias = 1
-                else:
-                    with_bias = 0
-                flops += 2 * h_out * w_out * c_out * (kernel_ops + with_bias)
-            elif op.type() == 'pool2d' and not only_conv:
-                input_shape = op.inputs("X")[0].shape()
-                output_shape = op.outputs("Out")[0].shape()
-                _, c_out, h_out, w_out = output_shape
-                k_size = op.attr("ksize")
-                flops += h_out * w_out * c_out * (k_size[0]**2)
-
-            elif op.type() == 'mul' and not only_conv:
-                x_shape = list(op.inputs("X")[0].shape())
-                y_shape = op.inputs("Y")[0].shape()
-                if x_shape[0] == -1:
-                    x_shape[0] = 1
-                flops += 2 * x_shape[0] * x_shape[1] * y_shape[1]
-
-            elif op.type() in ['relu', 'sigmoid', 'batch_norm'
-                               ] and not only_conv:
-                input_shape = list(op.inputs("X")[0].shape())
-                if input_shape[0] == -1:
-                    input_shape[0] = 1
-                flops += np.product(input_shape)
-
-        return flops
-
-    def save_model(self, path, exe):
-        """
-        Save network and parameters into file which can be load by load_inference_model api.
-        Args:
-            path(str): The path to save the persistables.
-            exe(framework.Executor): The executor used to save the persistables.
-        """
-        out_vars = [
-            self.var(var_name)._var for var_name in self.out_nodes.values()
-        ]
-        in_vars = list(self.in_nodes.values())
-        assert (len(in_vars) > 0)
-        assert (len(out_vars) > 0)
-        io.save_inference_model(
-            path,
-            in_vars,
-            out_vars,
-            exe.exe,
-            model_filename="__model__",
-            params_filename="__params__",
-            main_program=self.program.clone(),
-            export_for_deployment=True)
-
-    def save_infer_model(self, path, exe, in_out, program_only=False):
-        """
-        Save network and parameters into file which can be load by load_inference_model api.
-        Args:
-            path(str): The path to save the persistables.
-            exe(framework.Executor): The executor used to save the persistables.
-            in_out(tuple|list): in_out[0] is a list of input nodes' names
-            and in_out[1] is a list of output nodes' names.
-            program_only(bool): Whether to save program only.
-        """
-        out_vars = [self.var(var_name)._var for var_name in in_out[1]]
-        in_vars = list(in_out[0])
-        assert (len(in_vars) > 0)
-        assert (len(out_vars) > 0)
-        io.save_inference_model(
-            path,
-            in_vars,
-            out_vars,
-            exe.exe,
-            model_filename="__model__.infer",
-            params_filename="__params__",
-            program_only=program_only,
-            main_program=self.program.clone(),
-            export_for_deployment=True)
-
-    def save_persistables(self, path, exe):
-        """
-        Save all the persistable variables into file.
-        Args:
-            path(str): The path to save the persistables.
-            exe(framework.Executor): The executor used to save the persistables.
-        """
-        # update persistables from program
-        for var in self.program.list_vars():
-            if var.persistable and var.name not in self.persistables:
-                self.persistables[var.name] = var
-        persistables = []
-        for var in self.persistables:
-            if 'reader' not in var and 'double_buffer' not in var and var not in self.teacher_persistables:
-                persistables.append(self.persistables[var])
-
-        io.save_vars(exe.exe, path, vars=persistables)
-
-    def load_persistables(self, path, exe):
-        """
-        Load the persistable variables from file.
-        Args:
-            path(str): The path to load the persistables.
-            exe(framework.Executor): The executor used to load the persistables.
-        """
-
-        def if_exist(var):
-            return os.path.exists(os.path.join(path, var.name))
-
-        persistables = []
-        for var in self.persistables:
-            if 'reader' not in var and 'double_buffer' not in var:
-                persistables.append(self.persistables[var])
-        io.load_vars(exe.exe, path, vars=persistables, predicate=if_exist)
-
-    def update_param_shape(self, scope):
-        """
-        Update the shape of parameters in the graph according to tensors in scope.
-        It is used after loading pruned parameters from file.
-        """
-        for param in self.all_parameters():
-            tensor_shape = np.array(scope.find_var(param.name()).get_tensor(
-            )).shape
-            param.set_shape(tensor_shape)
-
-    def infer_shape(self):
-        """
-        Update the groups of convolution layer according to current filters.
-        It is used after loading pruned parameters from file.
-        """
-        for op in self.ops():
-            if op.type() != 'conditional_block':
-                op._op.desc.infer_shape(op._op.block.desc)
-
-    def update_groups_of_conv(self):
-        for op in self.ops():
-            if op.type() == 'depthwise_conv2d' or op.type(
-            ) == 'depthwise_conv2d_grad':
-                op.set_attr('groups', op.inputs('Filter')[0].shape()[0])
diff --git a/python/paddle/fluid/contrib/slim/nas/__init__.py b/python/paddle/fluid/contrib/slim/nas/__init__.py
deleted file mode 100644
index 7330a2075142cbe34680119d974c0876955d408e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/nas/__init__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import light_nas_strategy
-from .light_nas_strategy import *
-from . import controller_server
-from .controller_server import *
-from . import search_agent
-from .search_agent import *
-from . import search_space
-from .search_space import *
-from . import lock
-from .lock import *
-
-__all__ = light_nas_strategy.__all__
-__all__ += controller_server.__all__
-__all__ += search_agent.__all__
-__all__ += search_space.__all__
diff --git a/python/paddle/fluid/contrib/slim/nas/controller_server.py b/python/paddle/fluid/contrib/slim/nas/controller_server.py
deleted file mode 100644
index 65cfbd7d86ff3783e358f73fff83d89fd98dc01a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/nas/controller_server.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import socket
-from threading import Thread
-from ....log_helper import get_logger
-
-__all__ = ['ControllerServer']
-
-_logger = get_logger(
-    __name__,
-    logging.INFO,
-    fmt='ControllerServer-%(asctime)s-%(levelname)s: %(message)s')
-
-
-class ControllerServer(object):
-    """
-    The controller wrapper with a socket server to handle the request of search agentt.
-    """
-
-    def __init__(self,
-                 controller=None,
-                 address=('', 0),
-                 max_client_num=100,
-                 search_steps=None,
-                 key=None):
-        """
-        Args:
-            controller(slim.searcher.Controller): The controller used to generate tokens.
-            address(tuple): The address of current server binding with format (ip, port). Default: ('', 0).
-                            which means setting ip automatically
-            max_client_num(int): The maximum number of clients connecting to current server simultaneously. Default: 100.
-            search_steps(int): The total steps of searching. None means never stopping. Default: None 
-        """
-        self._controller = controller
-        self._address = address
-        self._max_client_num = max_client_num
-        self._search_steps = search_steps
-        self._closed = False
-        self._port = address[1]
-        self._ip = address[0]
-        self._key = key
-
-    def start(self):
-        self._socket_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        self._socket_server.bind(self._address)
-        self._socket_server.listen(self._max_client_num)
-        self._port = self._socket_server.getsockname()[1]
-        self._ip = self._socket_server.getsockname()[0]
-        _logger.info("listen on: [{}:{}]".format(self._ip, self._port))
-        thread = Thread(target=self.run)
-        thread.start()
-        return str(thread)
-
-    def close(self):
-        """Close the server."""
-        self._closed = True
-
-    def port(self):
-        """Get the port."""
-        return self._port
-
-    def ip(self):
-        """Get the ip."""
-        return self._ip
-
-    def run(self):
-        _logger.info("Controller Server run...")
-        while ((self._search_steps is None) or
-               (self._controller._iter <
-                (self._search_steps))) and not self._closed:
-            conn, addr = self._socket_server.accept()
-            message = conn.recv(1024).decode()
-            if message.strip("\n") == "next_tokens":
-                tokens = self._controller.next_tokens()
-                tokens = ",".join([str(token) for token in tokens])
-                conn.send(tokens.encode())
-            else:
-                _logger.info("recv message from {}: [{}]".format(addr, message))
-                messages = message.strip('\n').split("\t")
-                if (len(messages) < 3) or (messages[0] != self._key):
-                    _logger.info("recv noise from {}: [{}]".format(addr,
-                                                                   message))
-                    continue
-                tokens = messages[1]
-                reward = messages[2]
-                tokens = [int(token) for token in tokens.split(",")]
-                self._controller.update(tokens, float(reward))
-                tokens = self._controller.next_tokens()
-                tokens = ",".join([str(token) for token in tokens])
-                conn.send(tokens.encode())
-                _logger.info("send message to {}: [{}]".format(addr, tokens))
-            conn.close()
-        self._socket_server.close()
-        _logger.info("server closed!")
diff --git a/python/paddle/fluid/contrib/slim/nas/light_nas_strategy.py b/python/paddle/fluid/contrib/slim/nas/light_nas_strategy.py
deleted file mode 100644
index 2723ed5f16f90505eea505eb451c7968cb406a4a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/nas/light_nas_strategy.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..core.strategy import Strategy
-from ..graph import GraphWrapper
-from .controller_server import ControllerServer
-from .search_agent import SearchAgent
-from ....executor import Executor
-from ....log_helper import get_logger
-import re
-import logging
-import functools
-import socket
-from .lock import lock, unlock
-
-__all__ = ['LightNASStrategy']
-
-_logger = get_logger(
-    __name__,
-    logging.INFO,
-    fmt='LightNASStrategy-%(asctime)s-%(levelname)s: %(message)s')
-
-
-class LightNASStrategy(Strategy):
-    """
-    Light-NAS search strategy.
-    """
-
-    def __init__(self,
-                 controller=None,
-                 end_epoch=1000,
-                 target_flops=629145600,
-                 target_latency=0,
-                 retrain_epoch=1,
-                 metric_name='top1_acc',
-                 server_ip=None,
-                 server_port=0,
-                 is_server=False,
-                 max_client_num=100,
-                 search_steps=None,
-                 key="light-nas"):
-        """
-        Args:
-            controller(searcher.Controller): The searching controller. Default: None.
-            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. Default: 0
-            target_flops(int): The constraint of FLOPS.
-            target_latency(float): The constraint of latency.
-            retrain_epoch(int): The number of training epochs before evaluating structure generated by controller. Default: 1.
-            metric_name(str): The metric used to evaluate the model.
-                         It should be one of keys in out_nodes of graph wrapper. Default: 'top1_acc'
-            server_ip(str): The ip that controller server listens on. None means getting the ip automatically. Default: None.
-            server_port(int): The port that controller server listens on. 0 means getting usable port automatically. Default: 0.
-            is_server(bool): Whether current host is controller server. Default: False.
-            max_client_num(int): The maximum number of clients that connect to controller server concurrently. Default: 100.
-            search_steps(int): The total steps of searching. Default: None.
-            key(str): The key used to identify legal agent for controller server. Default: "light-nas"
-        """
-        self.start_epoch = 0
-        self.end_epoch = end_epoch
-        self._max_flops = target_flops
-        self._max_latency = target_latency
-        self._metric_name = metric_name
-        self._controller = controller
-        self._retrain_epoch = 0
-        self._server_ip = server_ip
-        self._server_port = server_port
-        self._is_server = is_server
-        self._retrain_epoch = retrain_epoch
-        self._search_steps = search_steps
-        self._max_client_num = max_client_num
-        self._max_try_times = 100
-        self._key = key
-
-        if self._server_ip is None:
-            self._server_ip = self._get_host_ip()
-
-    def _get_host_ip(self):
-        return socket.gethostbyname(socket.gethostname())
-
-    def on_compression_begin(self, context):
-        self._current_tokens = context.search_space.init_tokens()
-        self._controller.reset(context.search_space.range_table(),
-                               self._current_tokens, None)
-
-        # create controller server
-        if self._is_server:
-            open("./slim_LightNASStrategy_controller_server.socket",
-                 'a').close()
-            socket_file = open(
-                "./slim_LightNASStrategy_controller_server.socket", 'r+')
-            lock(socket_file)
-            tid = socket_file.readline()
-            if tid == '':
-                _logger.info("start controller server...")
-                self._server = ControllerServer(
-                    controller=self._controller,
-                    address=(self._server_ip, self._server_port),
-                    max_client_num=self._max_client_num,
-                    search_steps=self._search_steps,
-                    key=self._key)
-                tid = self._server.start()
-                self._server_port = self._server.port()
-                socket_file.write(tid)
-                _logger.info("started controller server...")
-            unlock(socket_file)
-            socket_file.close()
-        _logger.info("self._server_ip: {}; self._server_port: {}".format(
-            self._server_ip, self._server_port))
-        # create client
-        self._search_agent = SearchAgent(
-            self._server_ip, self._server_port, key=self._key)
-
-    def __getstate__(self):
-        """Socket can't be pickled."""
-        d = {}
-        for key in self.__dict__:
-            if key not in ["_search_agent", "_server"]:
-                d[key] = self.__dict__[key]
-        return d
-
-    def on_epoch_begin(self, context):
-        if context.epoch_id >= self.start_epoch and context.epoch_id <= self.end_epoch and (
-                self._retrain_epoch == 0 or
-            (context.epoch_id - self.start_epoch) % self._retrain_epoch == 0):
-            _logger.info("light nas strategy on_epoch_begin")
-            for _ in range(self._max_try_times):
-                startup_p, train_p, test_p, _, _, train_reader, test_reader = context.search_space.create_net(
-                    self._current_tokens)
-                context.eval_graph.program = test_p
-                flops = context.eval_graph.flops()
-                if self._max_latency > 0:
-                    latency = context.search_space.get_model_latency(test_p)
-                    _logger.info("try [{}] with latency {} flops {}".format(
-                        self._current_tokens, latency, flops))
-                else:
-                    _logger.info("try [{}] with flops {}".format(
-                        self._current_tokens, flops))
-                if flops > self._max_flops or (self._max_latency > 0 and
-                                               latency > self._max_latency):
-                    self._current_tokens = self._search_agent.next_tokens()
-                else:
-                    break
-
-            context.train_reader = train_reader
-            context.eval_reader = test_reader
-
-            exe = Executor(context.place)
-            exe.run(startup_p)
-
-            context.optimize_graph.program = train_p
-            context.optimize_graph.compile()
-
-            context.skip_training = (self._retrain_epoch == 0)
-
-    def on_epoch_end(self, context):
-        if context.epoch_id >= self.start_epoch and context.epoch_id < self.end_epoch and (
-                self._retrain_epoch == 0 or
-            (context.epoch_id - self.start_epoch + 1
-             ) % self._retrain_epoch == 0):
-
-            self._current_reward = context.eval_results[self._metric_name][-1]
-            flops = context.eval_graph.flops()
-            if flops > self._max_flops:
-                self._current_reward = 0.0
-            if self._max_latency > 0:
-                test_p = context.search_space.create_net(self._current_tokens)[
-                    2]
-                latency = context.search_space.get_model_latency(test_p)
-                if latency > self._max_latency:
-                    self._current_reward = 0.0
-                _logger.info("reward: {}; latency: {}; flops: {}; tokens: {}".
-                             format(self._current_reward, latency, flops,
-                                    self._current_tokens))
-            else:
-                _logger.info("reward: {}; flops: {}; tokens: {}".format(
-                    self._current_reward, flops, self._current_tokens))
-            self._current_tokens = self._search_agent.update(
-                self._current_tokens, self._current_reward)
diff --git a/python/paddle/fluid/contrib/slim/nas/lock.py b/python/paddle/fluid/contrib/slim/nas/lock.py
deleted file mode 100644
index 5edcd317304f941c2e7c15ad56e95525dea85398..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/nas/lock.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-__All__ = ['lock', 'unlock']
-if os.name == 'nt':
-
-    def lock(file):
-        raise NotImplementedError('Windows is not supported.')
-
-    def unlock(file):
-        raise NotImplementedError('Windows is not supported.')
-
-elif os.name == 'posix':
-    from fcntl import flock, LOCK_EX, LOCK_UN
-
-    def lock(file):
-        """Lock the file in local file system."""
-        flock(file.fileno(), LOCK_EX)
-
-    def unlock(file):
-        """Unlock the file in local file system."""
-        flock(file.fileno(), LOCK_UN)
-else:
-    raise RuntimeError("File Locker only support NT and Posix platforms!")
diff --git a/python/paddle/fluid/contrib/slim/nas/search_agent.py b/python/paddle/fluid/contrib/slim/nas/search_agent.py
deleted file mode 100644
index 4f32c46999eeace82359d388f867c461105f46ea..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/nas/search_agent.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import socket
-from ....log_helper import get_logger
-
-__all__ = ['SearchAgent']
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class SearchAgent(object):
-    """
-    Search agent.
-    """
-
-    def __init__(self, server_ip=None, server_port=None, key=None):
-        """
-        Args:
-            server_ip(str): The ip that controller server listens on. None means getting the ip automatically. Default: None.
-            server_port(int): The port that controller server listens on. 0 means getting usable port automatically. Default: 0.
-            key(str): The key used to identify legal agent for controller server. Default: "light-nas"
-        """
-        self.server_ip = server_ip
-        self.server_port = server_port
-        self.socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        self._key = key
-
-    def update(self, tokens, reward):
-        """
-        Update the controller according to latest tokens and reward.
-        Args:
-            tokens(list<int>): The tokens generated in last step.
-            reward(float): The reward of tokens.
-        """
-        socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        socket_client.connect((self.server_ip, self.server_port))
-        tokens = ",".join([str(token) for token in tokens])
-        socket_client.send("{}\t{}\t{}".format(self._key, tokens, reward)
-                           .encode())
-        tokens = socket_client.recv(1024).decode()
-        tokens = [int(token) for token in tokens.strip("\n").split(",")]
-        return tokens
-
-    def next_tokens(self):
-        """
-        Get next tokens.
-        """
-        socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        socket_client.connect((self.server_ip, self.server_port))
-        socket_client.send("next_tokens".encode())
-        tokens = socket_client.recv(1024).decode()
-        tokens = [int(token) for token in tokens.strip("\n").split(",")]
-        return tokens
diff --git a/python/paddle/fluid/contrib/slim/nas/search_space.py b/python/paddle/fluid/contrib/slim/nas/search_space.py
deleted file mode 100644
index bd8b369f6ec367657153386e136c86353136e8b7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/nas/search_space.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""The search space used to search neural architecture"""
-
-__all__ = ['SearchSpace']
-
-
-class SearchSpace(object):
-    """Controller for Neural Architecture Search.
-    """
-
-    def __init__(self, *args, **kwargs):
-        pass
-
-    def init_tokens(self):
-        """Get init tokens in search space.
-        """
-        raise NotImplementedError('Abstract method.')
-
-    def range_table(self):
-        """Get range table of current search space.
-        """
-        raise NotImplementedError('Abstract method.')
-
-    def create_net(self, tokens):
-        """Create networks for training and evaluation according to tokens.
-        Args:
-            tokens(list<int>): The tokens which represent a network.
-        Return:
-            (tuple): startup_program, train_program, evaluation_program, train_metrics, test_metrics
-        """
-        raise NotImplementedError('Abstract method.')
-
-    def get_model_latency(self, program):
-        """Get model latency according to program.
-        Args:
-            program(Program): The program to get latency.
-        Return:
-            (float): model latency.
-        """
-        raise NotImplementedError('Abstract method.')
diff --git a/python/paddle/fluid/contrib/slim/prune/__init__.py b/python/paddle/fluid/contrib/slim/prune/__init__.py
deleted file mode 100644
index ae487a21e341297dedb82cf275cc41badb9b2621..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/prune/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import pruner
-from .pruner import *
-from . import prune_strategy
-from .prune_strategy import *
-from . import auto_prune_strategy
-from .auto_prune_strategy import *
-
-__all__ = pruner.__all__
-__all__ += prune_strategy.__all__
-__all__ += auto_prune_strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py
deleted file mode 100644
index f9dce622da2a3d40bd7aac6a13071856089d3b9a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py
+++ /dev/null
@@ -1,272 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .prune_strategy import PruneStrategy
-import re
-import logging
-import functools
-import copy
-from ....log_helper import get_logger
-
-__all__ = ['AutoPruneStrategy']
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class AutoPruneStrategy(PruneStrategy):
-    """
-    Automatic pruning strategy.
-    """
-
-    def __init__(self,
-                 pruner=None,
-                 controller=None,
-                 start_epoch=0,
-                 end_epoch=10,
-                 min_ratio=0.5,
-                 max_ratio=0.7,
-                 metric_name='top1_acc',
-                 pruned_params='conv.*_weights',
-                 retrain_epoch=0,
-                 uniform_range=None,
-                 init_tokens=None):
-        """
-        Args:
-            pruner(slim.Pruner): The pruner used to prune the parameters. Default: None.
-            controller(searcher.Controller): The searching controller. Default: None.
-            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. Default: 0
-            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. Default: 0
-            min_ratio(float): The maximum pruned ratio. Default: 0.7
-            max_ratio(float): The minimum pruned ratio. Default: 0.5
-            metric_name(str): The metric used to evaluate the model.
-                         It should be one of keys in out_nodes of graph wrapper. Default: 'top1_acc'
-            pruned_params(str): The pattern str to match the parameter names to be pruned. Default: 'conv.*_weights'
-            retrain_epoch(int): The training epochs in each seaching step. Default: 0
-            uniform_range(int): The token range in each position of tokens generated by controller. None means getting the range automatically. Default: None.
-            init_tokens(list<int>): The initial tokens. None means getting the initial tokens automatically. Default: None.
-        """
-        super(AutoPruneStrategy, self).__init__(pruner, start_epoch, end_epoch,
-                                                0.0, metric_name, pruned_params)
-        self._max_ratio = max_ratio
-        self._min_ratio = min_ratio
-        self._controller = controller
-        self._metric_name = metric_name
-        self._pruned_param_names = []
-        self._retrain_epoch = retrain_epoch
-        self._uniform_range = uniform_range
-        self._init_tokens = init_tokens
-        self._current_tokens = None
-
-    def on_compression_begin(self, context):
-        """
-        Prepare some information for searching strategy.
-        step 1: Find all the parameters to be pruned.
-        step 2: Get initial tokens and setup controller.
-        """
-        pruned_params = []
-        for param in context.eval_graph.all_parameters():
-            if re.match(self.pruned_params, param.name()):
-                self._pruned_param_names.append(param.name())
-
-        if self._init_tokens is not None:
-            self._current_tokens = self._init_tokens
-        else:
-            self._current_tokens = self._get_init_tokens(context)
-
-        if self._uniform_range is not None:
-            self._range_table = [round(self._uniform_range, 2) / 0.01] * len(
-                self._pruned_param_names)
-        else:
-            self._range_table = copy.deepcopy(self._current_tokens)
-        _logger.info('init tokens: {}'.format(self._current_tokens))
-        _logger.info("range_table: {}".format(self._range_table))
-        constrain_func = functools.partial(
-            self._constrain_func, context=context)
-
-        self._controller.reset(self._range_table, self._current_tokens,
-                               constrain_func)
-
-    def _constrain_func(self, tokens, context=None):
-        """Check whether the tokens meet constraint."""
-        ori_flops = context.eval_graph.flops()
-        ratios = self._tokens_to_ratios(tokens)
-        params = self._pruned_param_names
-        param_shape_backup = {}
-        self._prune_parameters(
-            context.eval_graph,
-            context.scope,
-            params,
-            ratios,
-            context.place,
-            only_graph=True,
-            param_shape_backup=param_shape_backup)
-        context.eval_graph.update_groups_of_conv()
-        flops = context.eval_graph.flops()
-        for param in param_shape_backup.keys():
-            context.eval_graph.var(param).set_shape(param_shape_backup[param])
-        flops_ratio = (1 - float(flops) / ori_flops)
-        if flops_ratio >= self._min_ratio and flops_ratio <= self._max_ratio:
-            _logger.info("Success try [{}]; flops: -{}".format(tokens,
-                                                               flops_ratio))
-            return True
-        else:
-            _logger.info("Failed try [{}]; flops: -{}".format(tokens,
-                                                              flops_ratio))
-            return False
-
-    def _get_init_tokens(self, context):
-        """Get initial tokens.
-        """
-        ratios = self._get_uniform_ratios(context)
-        _logger.info('Get init ratios: {}'.format(
-            [round(r, 2) for r in ratios]))
-        return self._ratios_to_tokens(ratios)
-
-    def _ratios_to_tokens(self, ratios):
-        """Convert pruned ratios to tokens.
-        """
-        return [int(ratio / 0.01) for ratio in ratios]
-
-    def _tokens_to_ratios(self, tokens):
-        """Convert tokens to pruned ratios.
-        """
-        return [token * 0.01 for token in tokens]
-
-    def _get_uniform_ratios(self, context):
-        """
-        Search a group of uniform ratios.
-        """
-        min_ratio = 0.
-        max_ratio = 1.
-        target = (self._min_ratio + self._max_ratio) / 2
-        flops = context.eval_graph.flops()
-        model_size = context.eval_graph.numel_params()
-        ratios = None
-        while min_ratio < max_ratio:
-            ratio = (max_ratio + min_ratio) / 2
-            ratios = [ratio] * len(self._pruned_param_names)
-            param_shape_backup = {}
-            self._prune_parameters(
-                context.eval_graph,
-                context.scope,
-                self._pruned_param_names,
-                ratios,
-                context.place,
-                only_graph=True,
-                param_shape_backup=param_shape_backup)
-
-            pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
-            pruned_size = 1 - (float(context.eval_graph.numel_params()) /
-                               model_size)
-            for param in param_shape_backup.keys():
-                context.eval_graph.var(param).set_shape(param_shape_backup[
-                    param])
-
-            if abs(pruned_flops - target) < 1e-2:
-                break
-            if pruned_flops > target:
-                max_ratio = ratio
-            else:
-                min_ratio = ratio
-        _logger.info('Get ratios: {}'.format([round(r, 2) for r in ratios]))
-        return ratios
-
-    def on_epoch_begin(self, context):
-        """
-        step 1: Get a new tokens from controller.
-        step 2: Pruning eval_graph and optimize_program by tokens
-        """
-        if context.epoch_id >= self.start_epoch and context.epoch_id <= self.end_epoch and (
-                self._retrain_epoch == 0 or
-            (context.epoch_id - self.start_epoch) % self._retrain_epoch == 0):
-            _logger.info("on_epoch_begin")
-            params = self._pruned_param_names
-            ratios = self._tokens_to_ratios(self._current_tokens)
-
-            self._param_shape_backup = {}
-            self._param_backup = {}
-            self._prune_parameters(
-                context.optimize_graph,
-                context.scope,
-                params,
-                ratios,
-                context.place,
-                param_backup=self._param_backup,
-                param_shape_backup=self._param_shape_backup)
-            self._prune_graph(context.eval_graph, context.optimize_graph)
-            context.optimize_graph.update_groups_of_conv()
-            context.eval_graph.update_groups_of_conv()
-            context.optimize_graph.compile(
-                mem_opt=False)  # to update the compiled program
-            context.skip_training = (self._retrain_epoch == 0)
-
-    def on_epoch_end(self, context):
-        """
-        step 1: Get reward of current tokens and update controller.
-        step 2: Restore eval_graph and optimize_graph
-        """
-        if context.epoch_id >= self.start_epoch and context.epoch_id < self.end_epoch and (
-                self._retrain_epoch == 0 or
-            (context.epoch_id - self.start_epoch + 1
-             ) % self._retrain_epoch == 0):
-            _logger.info("on_epoch_end")
-            reward = context.eval_results[self._metric_name][-1]
-            self._controller.update(self._current_tokens, reward)
-
-            self._current_tokens = self._controller.next_tokens()
-            # restore pruned parameters
-            for param_name in self._param_backup.keys():
-                param_t = context.scope.find_var(param_name).get_tensor()
-                param_t.set(self._param_backup[param_name], context.place)
-            self._param_backup = {}
-            # restore shape of parameters
-            for param in self._param_shape_backup.keys():
-                context.optimize_graph.var(param).set_shape(
-                    self._param_shape_backup[param])
-            self._param_shape_backup = {}
-            self._prune_graph(context.eval_graph, context.optimize_graph)
-
-            context.optimize_graph.update_groups_of_conv()
-            context.eval_graph.update_groups_of_conv()
-            context.optimize_graph.compile(
-                mem_opt=False)  # to update the compiled program
-
-        elif context.epoch_id == self.end_epoch:  # restore graph for final training
-            # restore pruned parameters
-            for param_name in self._param_backup.keys():
-                param_t = context.scope.find_var(param_name).get_tensor()
-                param_t.set(self.param_backup[param_name], context.place)
-            # restore shape of parameters
-            for param in self._param_shape_backup.keys():
-                context.eval_graph.var(param).set_shape(
-                    self._param_shape_backup[param])
-                context.optimize_graph.var(param).set_shape(
-                    self._param_shape_backup[param])
-
-            context.optimize_graph.update_groups_of_conv()
-            context.eval_graph.update_groups_of_conv()
-
-            params, ratios = self._get_prune_ratios(
-                self._controller._best_tokens)
-            self._prune_parameters(context.optimize_graph, context.scope,
-                                   params, ratios, context.place)
-
-            self._prune_graph(context.eval_graph, context.optimize_graph)
-            context.optimize_graph.update_groups_of_conv()
-            context.eval_graph.update_groups_of_conv()
-            context.optimize_graph.compile(
-                mem_opt=True)  # to update the compiled program
-
-            context.skip_training = False
diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
deleted file mode 100644
index ffce27113705b86bec963a77d67a448ec4cf360e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
+++ /dev/null
@@ -1,962 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..core.strategy import Strategy
-from ..graph import VarWrapper, OpWrapper, GraphWrapper
-from ....framework import Program, program_guard, Parameter
-from ....log_helper import get_logger
-from .... import layers
-import prettytable as pt
-import numpy as np
-from scipy.optimize import leastsq
-import copy
-import re
-import os
-import pickle
-import logging
-import sys
-
-__all__ = ['SensitivePruneStrategy', 'UniformPruneStrategy', 'PruneStrategy']
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class PruneStrategy(Strategy):
-    """
-    The base class of all pruning strategies.
-    """
-
-    def __init__(self,
-                 pruner=None,
-                 start_epoch=0,
-                 end_epoch=0,
-                 target_ratio=0.5,
-                 metric_name=None,
-                 pruned_params='conv.*_weights'):
-        """
-        Args:
-            pruner(slim.Pruner): The pruner used to prune the parameters.
-            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
-            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
-            target_ratio(float): The flops ratio to be pruned from current model.
-            metric_name(str): The metric used to evaluate the model.
-                         It should be one of keys in out_nodes of graph wrapper.
-            pruned_params(str): The pattern str to match the parameter names to be pruned.
-        """
-        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
-        self.pruner = pruner
-        self.target_ratio = target_ratio
-        self.metric_name = metric_name
-        self.pruned_params = pruned_params
-        self.pruned_list = []
-
-    def _eval_graph(self, context, sampled_rate=None, cached_id=0):
-        """
-        Evaluate the current mode in context.
-        Args:
-            context(slim.core.Context): The context storing all information used to evaluate the current model.
-            sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
-            cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0.
-        """
-        results, names = context.run_eval_graph(sampled_rate, cached_id)
-        metric = np.mean(results[list(names).index(self.metric_name)])
-        return metric
-
-    def _prune_filters_by_ratio(self,
-                                scope,
-                                params,
-                                ratio,
-                                place,
-                                lazy=False,
-                                only_graph=False,
-                                param_shape_backup=None,
-                                param_backup=None):
-        """
-        Pruning filters by given ratio.
-        Args:
-            scope(fluid.core.Scope): The scope used to pruning filters.
-            params(list<VarWrapper>): A list of filter parameters.
-            ratio(float): The ratio to be pruned.
-            place(fluid.Place): The device place of filter parameters.
-            lazy(bool): True means setting the pruned elements to zero.
-                        False means cutting down the pruned elements.
-            only_graph(bool): True means only modifying the graph.
-                              False means modifying graph and variables in  scope.
-        """
-        if params[0].name() in self.pruned_list[0]:
-            return
-        param_t = scope.find_var(params[0].name()).get_tensor()
-        pruned_idx = self.pruner.cal_pruned_idx(
-            params[0].name(), np.array(param_t), ratio, axis=0)
-        for param in params:
-            assert isinstance(param, VarWrapper)
-            param_t = scope.find_var(param.name()).get_tensor()
-            if param_backup is not None and (param.name() not in param_backup):
-                param_backup[param.name()] = copy.deepcopy(np.array(param_t))
-            pruned_param = self.pruner.prune_tensor(
-                np.array(param_t), pruned_idx, pruned_axis=0, lazy=lazy)
-            if not only_graph:
-                param_t.set(pruned_param, place)
-            ori_shape = param.shape()
-            if param_shape_backup is not None and (
-                    param.name() not in param_shape_backup):
-                param_shape_backup[param.name()] = copy.deepcopy(param.shape())
-            new_shape = list(param.shape())
-            new_shape[0] = pruned_param.shape[0]
-            param.set_shape(new_shape)
-            _logger.debug(
-                '|----------------------------------------+----+------------------------------+------------------------------|'
-            )
-            _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
-                str(param.name()),
-                str(ratio), str(ori_shape), str(param.shape())))
-            self.pruned_list[0].append(param.name())
-        return pruned_idx
-
-    def _prune_parameter_by_idx(self,
-                                scope,
-                                params,
-                                pruned_idx,
-                                pruned_axis,
-                                place,
-                                lazy=False,
-                                only_graph=False,
-                                param_shape_backup=None,
-                                param_backup=None):
-        """
-        Pruning parameters in given axis.
-        Args:
-            scope(fluid.core.Scope): The scope storing paramaters to be pruned.
-            params(VarWrapper): The parameter to be pruned.
-            pruned_idx(list): The index of elements to be pruned.
-            pruned_axis(int): The pruning axis.
-            place(fluid.Place): The device place of filter parameters.
-            lazy(bool): True means setting the pruned elements to zero.
-                        False means cutting down the pruned elements.
-            only_graph(bool): True means only modifying the graph.
-                              False means modifying graph and variables in  scope.
-        """
-        if params[0].name() in self.pruned_list[pruned_axis]:
-            return
-        for param in params:
-            assert isinstance(param, VarWrapper)
-            param_t = scope.find_var(param.name()).get_tensor()
-            if param_backup is not None and (param.name() not in param_backup):
-                param_backup[param.name()] = copy.deepcopy(np.array(param_t))
-            pruned_param = self.pruner.prune_tensor(
-                np.array(param_t), pruned_idx, pruned_axis, lazy=lazy)
-            if not only_graph:
-                param_t.set(pruned_param, place)
-            ori_shape = param.shape()
-
-            if param_shape_backup is not None and (
-                    param.name() not in param_shape_backup):
-                param_shape_backup[param.name()] = copy.deepcopy(param.shape())
-            new_shape = list(param.shape())
-            new_shape[pruned_axis] = pruned_param.shape[pruned_axis]
-            param.set_shape(new_shape)
-            _logger.debug(
-                '|----------------------------------------+----+------------------------------+------------------------------|'
-            )
-            _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
-                str(param.name()),
-                str(pruned_axis), str(ori_shape), str(param.shape())))
-            self.pruned_list[pruned_axis].append(param.name())
-
-    def _forward_search_related_op(self, graph, param):
-        """
-        Forward search operators that will be affected by pruning of param.
-        Args:
-            graph(GraphWrapper): The graph to be searched.
-            param(VarWrapper): The current pruned parameter.
-        Returns:
-            list<OpWrapper>: A list of operators.
-        """
-        assert isinstance(param, VarWrapper)
-        visited = {}
-        for op in graph.ops():
-            visited[op.idx()] = False
-        stack = []
-        for op in graph.ops():
-            if (not op.is_bwd_op()) and (param in op.all_inputs()):
-                stack.append(op)
-        visit_path = []
-        while len(stack) > 0:
-            top_op = stack[len(stack) - 1]
-            if visited[top_op.idx()] == False:
-                visit_path.append(top_op)
-                visited[top_op.idx()] = True
-            next_ops = None
-            if top_op.type() == "conv2d" and param not in top_op.all_inputs():
-                next_ops = None
-            elif top_op.type() == "mul":
-                next_ops = None
-            else:
-                next_ops = self._get_next_unvisited_op(graph, visited, top_op)
-            if next_ops == None:
-                stack.pop()
-            else:
-                stack += next_ops
-        return visit_path
-
-    def _get_next_unvisited_op(self, graph, visited, top_op):
-        """
-        Get next unvisited adjacent operators of given operators.
-        Args:
-            graph(GraphWrapper): The graph used to search. 
-            visited(list): The ids of operators that has been visited.
-            top_op: The given operator.
-        Returns:
-            list<OpWrapper>: A list of operators. 
-        """
-        assert isinstance(top_op, OpWrapper)
-        next_ops = []
-        for op in graph.next_ops(top_op):
-            if (visited[op.idx()] == False) and (not op.is_bwd_op()):
-                next_ops.append(op)
-        return next_ops if len(next_ops) > 0 else None
-
-    def _get_accumulator(self, graph, param):
-        """
-        Get accumulators of given parameter. The accumulator was created by optimizer.
-        Args:
-            graph(GraphWrapper): The graph used to search.
-            param(VarWrapper): The given parameter.
-        Returns:
-            list<VarWrapper>: A list of accumulators which are variables.
-        """
-        assert isinstance(param, VarWrapper)
-        params = []
-        for op in param.outputs():
-            if op.is_opt_op():
-                for out_var in op.all_outputs():
-                    if graph.is_persistable(out_var) and out_var.name(
-                    ) != param.name():
-                        params.append(out_var)
-        return params
-
-    def _forward_pruning_ralated_params(self,
-                                        graph,
-                                        scope,
-                                        param,
-                                        place,
-                                        ratio=None,
-                                        pruned_idxs=None,
-                                        lazy=False,
-                                        only_graph=False,
-                                        param_backup=None,
-                                        param_shape_backup=None):
-        """
-        Pruning all the parameters affected by the pruning of given parameter.
-        Args:
-            graph(GraphWrapper): The graph to be searched.
-            scope(fluid.core.Scope): The scope storing paramaters to be pruned.
-            param(VarWrapper): The given parameter.
-            place(fluid.Place): The device place of filter parameters.
-            ratio(float): The target ratio to be pruned.
-            pruned_idx(list): The index of elements to be pruned.
-            lazy(bool): True means setting the pruned elements to zero.
-                        False means cutting down the pruned elements.
-            only_graph(bool): True means only modifying the graph.
-                              False means modifying graph and variables in  scope.
-        """
-        assert isinstance(
-            graph,
-            GraphWrapper), "graph must be instance of slim.core.GraphWrapper"
-        assert isinstance(
-            param, VarWrapper), "param must be instance of slim.core.VarWrapper"
-
-        if param.name() in self.pruned_list[0]:
-            return
-        related_ops = self._forward_search_related_op(graph, param)
-
-        if ratio is None:
-            assert pruned_idxs is not None
-            self._prune_parameter_by_idx(
-                scope, [param] + self._get_accumulator(graph, param),
-                pruned_idxs,
-                pruned_axis=0,
-                place=place,
-                lazy=lazy,
-                only_graph=only_graph,
-                param_backup=param_backup,
-                param_shape_backup=param_shape_backup)
-
-        else:
-            pruned_idxs = self._prune_filters_by_ratio(
-                scope, [param] + self._get_accumulator(graph, param),
-                ratio,
-                place,
-                lazy=lazy,
-                only_graph=only_graph,
-                param_backup=param_backup,
-                param_shape_backup=param_shape_backup)
-        corrected_idxs = pruned_idxs[:]
-
-        for idx, op in enumerate(related_ops):
-            if op.type() == "conv2d" and (param not in op.all_inputs()):
-                for in_var in op.all_inputs():
-                    if graph.is_parameter(in_var):
-                        conv_param = in_var
-                        self._prune_parameter_by_idx(
-                            scope, [conv_param] + self._get_accumulator(
-                                graph, conv_param),
-                            corrected_idxs,
-                            pruned_axis=1,
-                            place=place,
-                            lazy=lazy,
-                            only_graph=only_graph,
-                            param_backup=param_backup,
-                            param_shape_backup=param_shape_backup)
-            if op.type() == "depthwise_conv2d":
-                for in_var in op.all_inputs():
-                    if graph.is_parameter(in_var):
-                        conv_param = in_var
-                        self._prune_parameter_by_idx(
-                            scope, [conv_param] + self._get_accumulator(
-                                graph, conv_param),
-                            corrected_idxs,
-                            pruned_axis=0,
-                            place=place,
-                            lazy=lazy,
-                            only_graph=only_graph,
-                            param_backup=param_backup,
-                            param_shape_backup=param_shape_backup)
-            elif op.type() == "elementwise_add":
-                # pruning bias
-                for in_var in op.all_inputs():
-                    if graph.is_parameter(in_var):
-                        bias_param = in_var
-                        self._prune_parameter_by_idx(
-                            scope, [bias_param] + self._get_accumulator(
-                                graph, bias_param),
-                            pruned_idxs,
-                            pruned_axis=0,
-                            place=place,
-                            lazy=lazy,
-                            only_graph=only_graph,
-                            param_backup=param_backup,
-                            param_shape_backup=param_shape_backup)
-            elif op.type() == "mul":  # pruning fc layer
-                fc_input = None
-                fc_param = None
-                for in_var in op.all_inputs():
-                    if graph.is_parameter(in_var):
-                        fc_param = in_var
-                    else:
-                        fc_input = in_var
-
-                idx = []
-                feature_map_size = fc_input.shape()[2] * fc_input.shape()[3]
-                range_idx = np.array(range(feature_map_size))
-                for i in corrected_idxs:
-                    idx += list(range_idx + i * feature_map_size)
-                corrected_idxs = idx
-                self._prune_parameter_by_idx(
-                    scope, [fc_param] + self._get_accumulator(graph, fc_param),
-                    corrected_idxs,
-                    pruned_axis=0,
-                    place=place,
-                    lazy=lazy,
-                    only_graph=only_graph,
-                    param_backup=param_backup,
-                    param_shape_backup=param_shape_backup)
-
-            elif op.type() == "concat":
-                concat_inputs = op.all_inputs()
-                last_op = related_ops[idx - 1]
-                for out_var in last_op.all_outputs():
-                    if out_var in concat_inputs:
-                        concat_idx = concat_inputs.index(out_var)
-                offset = 0
-                for ci in range(concat_idx):
-                    offset += concat_inputs[ci].shape()[1]
-                corrected_idxs = [x + offset for x in pruned_idxs]
-            elif op.type() == "batch_norm":
-                bn_inputs = op.all_inputs()
-                mean = bn_inputs[2]
-                variance = bn_inputs[3]
-                alpha = bn_inputs[0]
-                beta = bn_inputs[1]
-                self._prune_parameter_by_idx(
-                    scope, [mean] + self._get_accumulator(graph, mean),
-                    corrected_idxs,
-                    pruned_axis=0,
-                    place=place,
-                    lazy=lazy,
-                    only_graph=only_graph,
-                    param_backup=param_backup,
-                    param_shape_backup=param_shape_backup)
-                self._prune_parameter_by_idx(
-                    scope, [variance] + self._get_accumulator(graph, variance),
-                    corrected_idxs,
-                    pruned_axis=0,
-                    place=place,
-                    lazy=lazy,
-                    only_graph=only_graph,
-                    param_backup=param_backup,
-                    param_shape_backup=param_shape_backup)
-                self._prune_parameter_by_idx(
-                    scope, [alpha] + self._get_accumulator(graph, alpha),
-                    corrected_idxs,
-                    pruned_axis=0,
-                    place=place,
-                    lazy=lazy,
-                    only_graph=only_graph,
-                    param_backup=param_backup,
-                    param_shape_backup=param_shape_backup)
-                self._prune_parameter_by_idx(
-                    scope, [beta] + self._get_accumulator(graph, beta),
-                    corrected_idxs,
-                    pruned_axis=0,
-                    place=place,
-                    lazy=lazy,
-                    only_graph=only_graph,
-                    param_backup=param_backup,
-                    param_shape_backup=param_shape_backup)
-
-    def _prune_parameters(self,
-                          graph,
-                          scope,
-                          params,
-                          ratios,
-                          place,
-                          lazy=False,
-                          only_graph=False,
-                          param_backup=None,
-                          param_shape_backup=None):
-        """
-        Pruning the given parameters.
-        Args:
-            graph(GraphWrapper): The graph to be searched.
-            scope(fluid.core.Scope): The scope storing paramaters to be pruned.
-            params(list<str>): A list of parameter names to be pruned.
-            ratios(list<float>): A list of ratios to be used to pruning parameters.
-            place(fluid.Place): The device place of filter parameters.
-            pruned_idx(list): The index of elements to be pruned.
-            lazy(bool): True means setting the pruned elements to zero.
-                        False means cutting down the pruned elements.
-            only_graph(bool): True means only modifying the graph.
-                              False means modifying graph and variables in  scope.
-
-        """
-        _logger.debug('\n################################')
-        _logger.debug('#       pruning parameters       #')
-        _logger.debug('################################\n')
-        _logger.debug(
-            '|----------------------------------------+----+------------------------------+------------------------------|'
-        )
-        _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format('parameter', 'axis',
-                                                            'from', 'to'))
-        assert len(params) == len(ratios)
-        self.pruned_list = [[], []]
-        for param, ratio in zip(params, ratios):
-            assert isinstance(param, str) or isinstance(param, unicode)
-            param = graph.var(param)
-            self._forward_pruning_ralated_params(
-                graph,
-                scope,
-                param,
-                place,
-                ratio=ratio,
-                lazy=lazy,
-                only_graph=only_graph,
-                param_backup=param_backup,
-                param_shape_backup=param_shape_backup)
-            ops = param.outputs()
-            for op in ops:
-                if op.type() == 'conv2d':
-                    brother_ops = self._search_brother_ops(graph, op)
-                    for broher in brother_ops:
-                        for p in graph.get_param_by_op(broher):
-                            self._forward_pruning_ralated_params(
-                                graph,
-                                scope,
-                                p,
-                                place,
-                                ratio=ratio,
-                                lazy=lazy,
-                                only_graph=only_graph,
-                                param_backup=param_backup,
-                                param_shape_backup=param_shape_backup)
-        _logger.debug(
-            '|----------------------------------------+----+------------------------------+------------------------------|'
-        )
-
-    def _search_brother_ops(self, graph, op_node):
-        """
-        Search brother operators that was affected by pruning of given operator.
-        Args:
-            graph(GraphWrapper): The graph to be searched.
-            op_node(OpWrapper): The start node for searching.
-        Returns: 
-            list<VarWrapper>: A list of operators.
-        """
-        visited = [op_node.idx()]
-        stack = []
-        brothers = []
-        for op in graph.next_ops(op_node):
-            if (op.type() != 'conv2d') and (op.type() != 'fc') and (
-                    not op._is_bwd_op()):
-                stack.append(op)
-                visited.append(op.idx())
-        while len(stack) > 0:
-            top_op = stack.pop()
-            for parent in graph.pre_ops(top_op):
-                if parent.idx() not in visited and (not parent._is_bwd_op()):
-                    if ((parent.type == 'conv2d') or (parent.type == 'fc')):
-                        brothers.append(parent)
-                    else:
-                        stack.append(parent)
-                    visited.append(parent.idx())
-
-            for child in graph.next_ops(top_op):
-                if (child.type != 'conv2d') and (child.type != 'fc') and (
-                        child.idx() not in visited) and (
-                            not child._is_bwd_op()):
-                    stack.append(child)
-                    visited.append(child.idx())
-        return brothers
-
-    def _prune_graph(self, graph, target_graph):
-        """
-        Pruning parameters of graph according to target graph.
-        Args:
-            graph(GraphWrapper): The graph to be pruned.
-            target_graph(GraphWrapper): The reference graph.
-        Return: None
-        """
-        count = 1
-        _logger.debug(
-            '|----+----------------------------------------+------------------------------+------------------------------|'
-        )
-        _logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format('id', 'parammeter',
-                                                            'from', 'to'))
-        for param in target_graph.all_parameters():
-            var = graph.var(param.name())
-            ori_shape = var.shape()
-            var.set_shape(param.shape())
-            _logger.debug(
-                '|----+----------------------------------------+------------------------------+------------------------------|'
-            )
-            _logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format(
-                str(count),
-                str(param.name()), str(ori_shape), str(param.shape())))
-            count += 1
-        _logger.debug(
-            '|----+----------------------------------------+------------------------------+------------------------------|'
-        )
-
-
-class UniformPruneStrategy(PruneStrategy):
-    """
-    The uniform pruning strategy. The parameters will be pruned by uniform ratio.
-    """
-
-    def __init__(self,
-                 pruner=None,
-                 start_epoch=0,
-                 end_epoch=0,
-                 target_ratio=0.5,
-                 metric_name=None,
-                 pruned_params='conv.*_weights'):
-        """
-        Args:
-            pruner(slim.Pruner): The pruner used to prune the parameters.
-            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
-            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
-            target_ratio(float): The flops ratio to be pruned from current model.
-            metric_name(str): The metric used to evaluate the model.
-                         It should be one of keys in out_nodes of graph wrapper.
-            pruned_params(str): The pattern str to match the parameter names to be pruned.
-        """
-        super(UniformPruneStrategy, self).__init__(pruner, start_epoch,
-                                                   end_epoch, target_ratio,
-                                                   metric_name, pruned_params)
-
-    def _get_best_ratios(self, context):
-        """
-        Search a group of ratios for pruning target flops.
-        """
-        _logger.info('_get_best_ratios')
-        pruned_params = []
-        for param in context.eval_graph.all_parameters():
-            if re.match(self.pruned_params, param.name()):
-                pruned_params.append(param.name())
-
-        min_ratio = 0.
-        max_ratio = 1.
-
-        flops = context.eval_graph.flops()
-        model_size = context.eval_graph.numel_params()
-
-        while min_ratio < max_ratio:
-            ratio = (max_ratio + min_ratio) / 2
-            _logger.debug(
-                '-----------Try pruning ratio: {:.2f}-----------'.format(ratio))
-            ratios = [ratio] * len(pruned_params)
-            param_shape_backup = {}
-            self._prune_parameters(
-                context.eval_graph,
-                context.scope,
-                pruned_params,
-                ratios,
-                context.place,
-                only_graph=True,
-                param_shape_backup=param_shape_backup)
-
-            pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
-            pruned_size = 1 - (float(context.eval_graph.numel_params()) /
-                               model_size)
-            _logger.debug('Pruned flops: {:.2f}'.format(pruned_flops))
-            _logger.debug('Pruned model size: {:.2f}'.format(pruned_size))
-            for param in param_shape_backup.keys():
-                context.eval_graph.var(param).set_shape(param_shape_backup[
-                    param])
-
-            if abs(pruned_flops - self.target_ratio) < 1e-2:
-                break
-            if pruned_flops > self.target_ratio:
-                max_ratio = ratio
-            else:
-                min_ratio = ratio
-        _logger.info('Get ratios: {}'.format([round(r, 2) for r in ratios]))
-        return pruned_params, ratios
-
-    def restore_from_checkpoint(self, context):
-        self._prune(context, self.params, self.ratios)
-
-    def _prune(self, context, params, ratios):
-        self._prune_parameters(context.optimize_graph, context.scope, params,
-                               ratios, context.place)
-
-        model_size = context.eval_graph.numel_params()
-        flops = context.eval_graph.flops()
-        _logger.debug('\n################################')
-        _logger.debug('#          pruning eval graph    #')
-        _logger.debug('################################\n')
-        self._prune_graph(context.eval_graph, context.optimize_graph)
-        context.optimize_graph.update_groups_of_conv()
-        context.eval_graph.update_groups_of_conv()
-
-        _logger.info(
-            '------------------finish pruning--------------------------------')
-        _logger.info('Pruned size: {:.2f}'.format(1 - (float(
-            context.eval_graph.numel_params()) / model_size)))
-        _logger.info('Pruned flops: {:.2f}'.format(1 - (float(
-            context.eval_graph.flops()) / flops)))
-
-    def on_epoch_begin(self, context):
-        if context.epoch_id == self.start_epoch:
-            params, ratios = self._get_best_ratios(context)
-            self.params = params
-            self.ratios = ratios
-            self._prune(context, params, ratios)
-            _logger.info(
-                '------------------UniformPruneStrategy.on_compression_begin finish--------------------------------'
-            )
-
-
-class SensitivePruneStrategy(PruneStrategy):
-    """
-    Sensitive pruning strategy. Different pruned ratio was applied on each layer.
-    """
-
-    def __init__(self,
-                 pruner=None,
-                 start_epoch=0,
-                 end_epoch=0,
-                 delta_rate=0.20,
-                 target_ratio=0.5,
-                 metric_name='top1_acc',
-                 pruned_params='conv.*_weights',
-                 sensitivities_file='./sensitivities.data',
-                 sensitivities={},
-                 num_steps=1,
-                 eval_rate=None):
-        """
-        Args:
-            pruner(slim.Pruner): The pruner used to prune the parameters.
-            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0.
-            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 10.
-            delta_rate(float): The delta used to generate ratios when calculating sensitivities. default: 0.2
-            target_ratio(float): The flops ratio to be pruned from current model. default: 0.5
-            metric_name(str): The metric used to evaluate the model.
-                         It should be one of keys in out_nodes of graph wrapper. default: 'top1_acc'
-            pruned_params(str): The pattern str to match the parameter names to be pruned. default: 'conv.*_weights'.
-            sensitivities_file(str): The sensitivities file. default: './sensitivities.data'
-            sensitivities(dict): The user-defined sensitivities. default: {}.
-            num_steps(int): The number of pruning steps. default: 1.
-            eval_rate(float): The rate of sampled data used to calculate sensitivities.
-                              None means using all the data. default: None.
-        """
-        super(SensitivePruneStrategy, self).__init__(pruner, start_epoch,
-                                                     end_epoch, target_ratio,
-                                                     metric_name, pruned_params)
-        self.delta_rate = delta_rate
-        self.pruned_list = []
-        self.sensitivities = sensitivities
-        self.sensitivities_file = sensitivities_file
-        self.num_steps = num_steps
-        self.eval_rate = eval_rate
-        self.pruning_step = 1 - pow((1 - target_ratio), 1.0 / self.num_steps)
-
-    def _save_sensitivities(self, sensitivities, sensitivities_file):
-        """
-        Save sensitivities into file.
-        """
-        with open(sensitivities_file, 'wb') as f:
-            pickle.dump(sensitivities, f)
-
-    def _load_sensitivities(self, sensitivities_file):
-        """
-        Load sensitivities from file.
-        """
-        sensitivities = {}
-        if sensitivities_file and os.path.exists(sensitivities_file):
-            with open(sensitivities_file, 'rb') as f:
-                if sys.version_info < (3, 0):
-                    sensitivities = pickle.load(f)
-                else:
-                    sensitivities = pickle.load(f, encoding='bytes')
-
-        for param in sensitivities:
-            sensitivities[param]['pruned_percent'] = [
-                round(p, 2) for p in sensitivities[param]['pruned_percent']
-            ]
-        self._format_sensitivities(sensitivities)
-        return sensitivities
-
-    def _format_sensitivities(self, sensitivities):
-        """
-        Print formated sensitivities in debug log level.
-        """
-        tb = pt.PrettyTable()
-        tb.field_names = ["parameter", "size"] + [
-            str(round(i, 2))
-            for i in np.arange(self.delta_rate, 1, self.delta_rate)
-        ]
-        for param in sensitivities:
-            if len(sensitivities[param]['loss']) == (len(tb.field_names) - 2):
-                tb.add_row([param, sensitivities[param]['size']] + [
-                    round(loss, 2) for loss in sensitivities[param]['loss']
-                ])
-        _logger.debug('\n################################')
-        _logger.debug('#      sensitivities table     #')
-        _logger.debug('################################\n')
-        _logger.debug(tb)
-
-    def _compute_sensitivities(self, context):
-        """
-        Computing the sensitivities of all parameters.
-        """
-        _logger.info("calling _compute_sensitivities.")
-        cached_id = np.random.randint(1000)
-        if self.start_epoch == context.epoch_id:
-            sensitivities_file = self.sensitivities_file
-        else:
-            sensitivities_file = self.sensitivities_file + ".epoch" + str(
-                context.epoch_id)
-        sensitivities = self._load_sensitivities(sensitivities_file)
-
-        for param in context.eval_graph.all_parameters():
-            if not re.match(self.pruned_params, param.name()):
-                continue
-            if param.name() not in sensitivities:
-                sensitivities[param.name()] = {
-                    'pruned_percent': [],
-                    'loss': [],
-                    'size': param.shape()[0]
-                }
-
-        metric = None
-
-        for param in sensitivities.keys():
-            ratio = self.delta_rate
-            while ratio < 1:
-                ratio = round(ratio, 2)
-                if ratio in sensitivities[param]['pruned_percent']:
-                    _logger.debug('{}, {} has computed.'.format(param, ratio))
-                    ratio += self.delta_rate
-                    continue
-                if metric is None:
-                    metric = self._eval_graph(context, self.eval_rate,
-                                              cached_id)
-
-                param_backup = {}
-                # prune parameter by ratio
-                self._prune_parameters(
-                    context.eval_graph,
-                    context.scope, [param], [ratio],
-                    context.place,
-                    lazy=True,
-                    param_backup=param_backup)
-                self.pruned_list[0]
-                # get accuracy after pruning and update self.sensitivities
-                pruned_metric = self._eval_graph(context, self.eval_rate,
-                                                 cached_id)
-                loss = metric - pruned_metric
-                _logger.info("pruned param: {}; {}; loss={}".format(
-                    param, ratio, loss))
-                for brother in self.pruned_list[0]:
-                    if re.match(self.pruned_params, brother):
-                        if brother not in sensitivities:
-                            sensitivities[brother] = {
-                                'pruned_percent': [],
-                                'loss': []
-                            }
-                        sensitivities[brother]['pruned_percent'].append(ratio)
-                        sensitivities[brother]['loss'].append(loss)
-
-                self._save_sensitivities(sensitivities, sensitivities_file)
-
-                # restore pruned parameters
-                for param_name in param_backup.keys():
-                    param_t = context.scope.find_var(param_name).get_tensor()
-                    param_t.set(self.param_backup[param_name], context.place)
-
-#                pruned_metric = self._eval_graph(context)
-
-                ratio += self.delta_rate
-        return sensitivities
-
-    def _get_best_ratios(self, context, sensitivities, target_ratio):
-        """
-        Search a group of ratios for pruning target flops.
-        """
-        _logger.info('_get_best_ratios for pruning ratie: {}'.format(
-            target_ratio))
-
-        def func(params, x):
-            a, b, c, d = params
-            return a * x * x * x + b * x * x + c * x + d
-
-        def error(params, x, y):
-            return func(params, x) - y
-
-        def slove_coefficient(x, y):
-            init_coefficient = [10, 10, 10, 10]
-            coefficient, loss = leastsq(error, init_coefficient, args=(x, y))
-            return coefficient
-
-        min_loss = 0.
-        max_loss = 0.
-
-        # step 1: fit curve by sensitivities
-        coefficients = {}
-        for param in sensitivities:
-            losses = np.array([0] * 5 + sensitivities[param]['loss'])
-            precents = np.array([0] * 5 + sensitivities[param][
-                'pruned_percent'])
-            coefficients[param] = slove_coefficient(precents, losses)
-            loss = np.max(losses)
-            max_loss = np.max([max_loss, loss])
-
-        # step 2: Find a group of ratios by binary searching.
-        flops = context.eval_graph.flops()
-        model_size = context.eval_graph.numel_params()
-        ratios = []
-        while min_loss < max_loss:
-            loss = (max_loss + min_loss) / 2
-            _logger.info(
-                '-----------Try pruned ratios while acc loss={:.4f}-----------'.
-                format(loss))
-            ratios = []
-            # step 2.1: Get ratios according to current loss
-            for param in sensitivities:
-                coefficient = copy.deepcopy(coefficients[param])
-                coefficient[-1] = coefficient[-1] - loss
-                roots = np.roots(coefficient)
-                for root in roots:
-                    min_root = 1
-                    if np.isreal(root) and root > 0 and root < 1:
-                        selected_root = min(root.real, min_root)
-                ratios.append(selected_root)
-            _logger.info('Pruned ratios={}'.format(
-                [round(ratio, 3) for ratio in ratios]))
-            # step 2.2: Pruning by current ratios
-            param_shape_backup = {}
-            self._prune_parameters(
-                context.eval_graph,
-                context.scope,
-                sensitivities.keys(),
-                ratios,
-                context.place,
-                only_graph=True,
-                param_shape_backup=param_shape_backup)
-
-            pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
-            pruned_size = 1 - (float(context.eval_graph.numel_params()) /
-                               model_size)
-            _logger.info('Pruned flops: {:.4f}'.format(pruned_flops))
-            _logger.info('Pruned model size: {:.4f}'.format(pruned_size))
-            for param in param_shape_backup.keys():
-                context.eval_graph.var(param).set_shape(param_shape_backup[
-                    param])
-
-            # step 2.3: Check whether current ratios is enough
-            if abs(pruned_flops - target_ratio) < 0.015:
-                break
-            if pruned_flops > target_ratio:
-                max_loss = loss
-            else:
-                min_loss = loss
-        return sensitivities.keys(), ratios
-
-    def _current_pruning_target(self, context):
-        '''
-        Get the target pruning rate in current epoch.
-        '''
-        _logger.info('Left number of pruning steps: {}'.format(self.num_steps))
-        if self.num_steps <= 0:
-            return None
-        if (self.start_epoch == context.epoch_id) or context.eval_converged(
-                self.metric_name, 0.005):
-            self.num_steps -= 1
-            return self.pruning_step
-
-    def on_epoch_begin(self, context):
-        current_ratio = self._current_pruning_target(context)
-        if current_ratio is not None:
-            sensitivities = self._compute_sensitivities(context)
-            params, ratios = self._get_best_ratios(context, sensitivities,
-                                                   current_ratio)
-            self._prune_parameters(context.optimize_graph, context.scope,
-                                   params, ratios, context.place)
-
-            model_size = context.eval_graph.numel_params()
-            flops = context.eval_graph.flops()
-            _logger.debug('################################')
-            _logger.debug('#          pruning eval graph    #')
-            _logger.debug('################################')
-            self._prune_graph(context.eval_graph, context.optimize_graph)
-            context.optimize_graph.update_groups_of_conv()
-            context.eval_graph.update_groups_of_conv()
-            context.optimize_graph.compile()  # to update the compiled program
-            context.eval_graph.compile(
-                for_parallel=False,
-                for_test=True)  # to update the compiled program
-            _logger.info(
-                '------------------finish pruning--------------------------------'
-            )
-            _logger.info('Pruned size: {:.3f}'.format(1 - (float(
-                context.eval_graph.numel_params()) / model_size)))
-            _logger.info('Pruned flops: {:.3f}'.format(1 - (float(
-                context.eval_graph.flops()) / flops)))
-            metric = self._eval_graph(context)
-            _logger.info('Metric after pruning: {:.2f}'.format(metric))
-            _logger.info(
-                '------------------SensitivePruneStrategy.on_epoch_begin finish--------------------------------'
-            )
diff --git a/python/paddle/fluid/contrib/slim/prune/pruner.py b/python/paddle/fluid/contrib/slim/prune/pruner.py
deleted file mode 100644
index 506b8fbe1de2e0f8a036f591bd2baacd5759c9c8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/prune/pruner.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import collections
-from .... import layers
-
-__all__ = ['Pruner', 'StructurePruner']
-
-
-class Pruner(object):
-    """
-    Base class of all pruners.
-    """
-
-    def __init__(self):
-        pass
-
-    def prune(self, param):
-        pass
-
-
-class StructurePruner(Pruner):
-    """
-    Pruner used to pruning parameters by groups.
-    """
-
-    def __init__(self, pruning_axis, criterions):
-        """
-        Args:
-            pruning_axis(dict): The key is the name of parameter to be pruned,
-                                '*' means all the parameters.
-                                The value is the axis to be used. Given a parameter
-                                with shape [3, 4], the result of pruning 50% on aixs 1
-                                is a parameter with shape [3, 2].
-            criterions(dict): The key is the name of parameter to be pruned,
-                              '*' means all the parameters.
-                              The value is the criterion used to sort groups for pruning.
-                              It only supports 'l1_norm' currently.
-        """
-        self.pruning_axis = pruning_axis
-        self.criterions = criterions
-
-    def cal_pruned_idx(self, name, param, ratio, axis=None):
-        """
-        Calculate the index to be pruned on axis by given pruning ratio.
-        Args:
-            name(str): The name of parameter to be pruned.
-            param(np.array): The data of parameter to be pruned.
-            ratio(float): The ratio to be pruned.
-            axis(int): The axis to be used for pruning given parameter.
-                       If it is None, the value in self.pruning_axis will be used.
-                       default: None.
-        Returns:
-            list<int>: The indexes to be pruned on axis.
-        """
-        criterion = self.criterions[
-            name] if name in self.criterions else self.criterions['*']
-        if axis is None:
-            assert self.pruning_axis is not None, "pruning_axis should set if axis is None."
-            axis = self.pruning_axis[
-                name] if name in self.pruning_axis else self.pruning_axis['*']
-        prune_num = int(round(param.shape[axis] * ratio))
-        reduce_dims = [i for i in range(len(param.shape)) if i != axis]
-        if criterion == 'l1_norm':
-            criterions = np.sum(np.abs(param), axis=tuple(reduce_dims))
-        pruned_idx = criterions.argsort()[:prune_num]
-        return pruned_idx
-
-    def prune_tensor(self, tensor, pruned_idx, pruned_axis, lazy=False):
-        """
-        Pruning a array by indexes on given axis.
-        Args:
-            tensor(numpy.array): The target array to be pruned.
-            pruned_idx(list<int>): The indexes to be pruned.
-            pruned_axis(int): The axis of given array to be pruned on. 
-            lazy(bool): True means setting the pruned elements to zero.
-                        False means remove the pruned elements from memory.
-                        default: False.
-        Returns:
-            numpy.array: The pruned array.
-        """
-        mask = np.zeros(tensor.shape[pruned_axis], dtype=bool)
-        mask[pruned_idx] = True
-
-        def func(data):
-            return data[~mask]
-
-        def lazy_func(data):
-            data[mask] = 0
-            return data
-
-        if lazy:
-            return np.apply_along_axis(lazy_func, pruned_axis, tensor)
-        else:
-            return np.apply_along_axis(func, pruned_axis, tensor)
diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py
deleted file mode 100644
index 659265895a594862c3e32c6360f7ddabf53e3b64..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/quantization/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import quantization_pass
-from .quantization_pass import *
-from . import quantization_strategy
-from .quantization_strategy import *
-from . import mkldnn_post_training_strategy
-from .mkldnn_post_training_strategy import *
-from . import quantization_mkldnn_pass
-from .quantization_mkldnn_pass import *
-
-__all__ = quantization_pass.__all__ + quantization_strategy.__all__
-__all__ += mkldnn_post_training_strategy.__all__
-__all__ += quantization_mkldnn_pass.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/mkldnn_post_training_strategy.py b/python/paddle/fluid/contrib/slim/quantization/mkldnn_post_training_strategy.py
deleted file mode 100644
index ad5ef33bf770395efd50fce06021e7ec7c4db4af..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/quantization/mkldnn_post_training_strategy.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import logging
-import six
-import numpy as np
-from .... import core
-from ..core.strategy import Strategy
-from ....log_helper import get_logger
-
-__all__ = ['MKLDNNPostTrainingQuantStrategy']
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class MKLDNNPostTrainingQuantStrategy(Strategy):
-    """
-    The strategy for MKL-DNN Post Training quantization strategy.
-    """
-
-    def __init__(self,
-                 int8_model_save_path=None,
-                 fp32_model_path=None,
-                 cpu_math_library_num_threads=1):
-        """
-        Args:
-            int8_model_save_path(str): int8_model_save_path is used to save an int8 ProgramDesc
-                        with fp32 weights which is used for MKL-DNN int8 inference. For post training quantization,
-                        MKLDNNPostTrainingQuantStrategy only supports converting a fp32 ProgramDesc
-                        with fp32 weights to an int8 ProgramDesc with fp32 weights now. The saved
-                        int8 ProgramDesc with fp32 weights only can be executed with MKL-DNN enabled.
-                        None means it doesn't save int8 ProgramDesc with fp32 weights. default: None.
-            fp32_model_path(str): fp32_model_path is used to load an original fp32 ProgramDesc with fp32 weights.
-                        None means it doesn't have a fp32 ProgramDesc with fp32 weights. default: None.
-            cpu_math_library_num_threads(int): The number of cpu math library threads which is used on
-                        MKLDNNPostTrainingQuantStrategy. 1 means it only uses one cpu math library
-                        thread. default: 1
-        """
-
-        super(MKLDNNPostTrainingQuantStrategy, self).__init__(0, 0)
-        self.int8_model_save_path = int8_model_save_path
-        if fp32_model_path is None:
-            raise Exception("fp32_model_path is None")
-        self.fp32_model_path = fp32_model_path
-        self.cpu_math_library_num_threads = cpu_math_library_num_threads
-
-    def on_compression_begin(self, context):
-        """
-        Prepare the data and quantify the model
-        """
-
-        super(MKLDNNPostTrainingQuantStrategy,
-              self).on_compression_begin(context)
-        _logger.info('InferQuantStrategy::on_compression_begin')
-
-        # Prepare the Analysis Config
-        infer_config = core.AnalysisConfig("AnalysisConfig")
-        infer_config.switch_ir_optim(True)
-        infer_config.disable_gpu()
-        infer_config.set_model(self.fp32_model_path)
-        infer_config.enable_mkldnn()
-        infer_config.set_cpu_math_library_num_threads(
-            self.cpu_math_library_num_threads)
-
-        # Prepare the data for calculating the quantization scales
-        warmup_reader = context.eval_reader()
-        if six.PY2:
-            data = warmup_reader.next()
-
-        if six.PY3:
-            data = warmup_reader.__next__()
-
-        num_images = len(data)
-        image_data = [img.tolist() for (img, _) in data]
-        image_data = np.array(image_data).astype("float32").reshape(
-            [num_images, ] + list(data[0][0].shape))
-        image_data = image_data.ravel()
-        images = core.PaddleTensor(image_data, "x")
-        images.shape = [num_images, ] + list(data[0][0].shape)
-
-        label_data = [label for (_, label) in data]
-        labels = core.PaddleTensor(
-            np.array(label_data).astype("int64").reshape([num_images, 1]), "y")
-
-        warmup_data = [images, labels]
-
-        # Enable the INT8 Quantization
-        infer_config.enable_quantizer()
-        infer_config.quantizer_config().set_quant_data(warmup_data)
-        infer_config.quantizer_config().set_quant_batch_size(num_images)
-
-        # Run INT8 MKL-DNN Quantization
-        predictor = core.create_paddle_predictor(infer_config)
-        if self.int8_model_save_path:
-            if not os.path.exists(self.int8_model_save_path):
-                os.makedirs(self.int8_model_save_path)
-            predictor.SaveOptimModel(self.int8_model_save_path)
-
-        _logger.info(
-            'Finish MKLDNNPostTrainingQuantStrategy::on_compresseion_begin')
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_mkldnn_pass.py
deleted file mode 100644
index 9ea03a49dad943d5e4a5ad315c90df67143fe35c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_mkldnn_pass.py
+++ /dev/null
@@ -1,552 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from .... import core
-from ....framework import IrGraph
-from ....framework import IrNode
-
-__all__ = ['FakeQAT2MkldnnINT8KernelPass', 'FakeQAT2MkldnnINT8PerfPass']
-
-
-class FakeQAT2MkldnnINT8KernelPass(object):
-    """
-    Convert QuantizationFreezePass generated IrGraph to MKL-DNN supported INT8
-    IrGraph. Following transformations did in this pass:
-        1. Convert int8 range weights with float32 data type, which are generated by
-           the QuantizationFreezePass, to float32 range weights with float32 data type
-           by using the corresponding scales. This conversion is because MKL-DNN INT8
-           conv2d kernel and mul kernel now only support float32 weights input, hence
-           weights quantization will happen inside the conv2d and mul INT8 kernel.
-        2. Create the new conv2d or mul op with the converted weights and link its output
-           to fake_dequantize_abs_max op's output and set conv2d's attribute "force_fp32
-           _output" as true
-        3. Transform fake_quantize_xx op to quantize op
-        4. Remove fake_dequantize_abs_max op
-    """
-
-    def __init__(self, _scope=None, _place=None):
-        """
-        Args:
-            scope(fluid.Scope): scope is used to initialize the new parameters.
-            place(fluid.CPUPlace): place is used to initialize the new parameters.
-
-
-        Examples:
-        .. code-block:: python
-            # The original graph will be rewrite.
-            import paddle.fluid as fluid
-            from paddle.fluid.contrib.slim.quantization \
-                import FakeQAT2MkldnnINT8KernelPass
-            from paddle.fluid.framework import IrGraph
-            from paddle.fluid import core
-
-            graph = IrGraph(core.Graph(fluid.Program().desc), for_test=False)
-            place = fluid.CPUPlace()
-            mkldnn_pass = FakeQAT2MkldnnINT8KernelPass(fluid.global_scope(),
-            place)
-            mkldnn_pass.apply(graph)
-        """
-
-        self._scope = _scope
-        self._place = _place
-
-        self._quantize_type = [
-            'fake_quantize_moving_average_abs_max',
-            'fake_quantize_range_abs_max'
-        ]
-        self._dequantize_type = ['fake_dequantize_max_abs']
-        self._quantize_dequantize_type = [
-            'fake_quantize_dequantize_moving_average_abs_max'
-        ]
-
-        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
-        self._conv_ops = ['conv2d', 'depthwise_conv2d']
-        self._pool_ops = ['pool2d']
-
-        self._in_scale = {}
-        self._max_range = {}
-        self._new_output = {}
-        self._s8_max = 127
-
-    def apply(self, graph):
-        """
-        Quantize the graph for running MKL-DNN INT8 inference. According
-        to activation quantization type, the graph will transform fake
-        quantize ops to quantize ops and remove the fake dequantize ops.
-
-        Args:
-            graph(IrGraph): the applied graph.
-        """
-
-        assert isinstance(graph,
-                          IrGraph), 'graph must be the instance of IrGraph.'
-        ops = graph.all_op_nodes()
-
-        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
-        # Collect the _in_scales and _max_range to calculate the new scales for MKL-DNN
-        # INT8 conv2d and mul
-        for op_node in ops:
-            if op_node.name() in self._dequantize_type:
-                input_name = op_node.input("X")[0]
-                scale_name = op_node.input("Scale")[0]
-                self._in_scale[input_name] = self._load_param(self._scope,
-                                                              scale_name)[0]
-                self._max_range[input_name] = op_node.op().attr("max_range")
-                self._new_output[input_name] = op_node.output("Out")[0]
-
-            if op_node.name() in self._quantize_dequantize_type:
-                inputs = op_node.op().input_names()
-                attrs = op_node.op().attr_names()
-                input_name = op_node.input("X")[0]
-                scale_name = op_node.input("InScale")[0]
-                self._in_scale[input_name] = self._load_param(self._scope,
-                                                              scale_name)[0]
-                #  self._max_range[input_name] = op_node.op().attr("max_range")
-                self._new_output[input_name] = op_node.output("Out")[0]
-
-        for op_node in ops:
-            if op_node.name() in self._quantizable_ops:
-                if op_node.name() in self._conv_ops:
-                    self._transform_to_conv_mkldnn(graph, op_node)
-                elif op_node.name() in self._pool_ops:
-                    self._transform_to_pool_mkldnn(graph, op_node)
-                else:
-                    self._transform_to_mul_mkldnn(graph, op_node)
-            elif op_node.name() in self._quantize_type:
-                self._transform_to_quantize_mkldnn(graph, op_node)
-            elif op_node.name() in self._dequantize_type:
-                self._remove_fake_dequantize_op(graph, op_node)
-            self._remove_unused_var_nodes(graph)
-        return graph
-
-    def _transform_to_pool_mkldnn(self, graph, op):
-        output_name = op.output("Out")[0]
-        input_name = op.input("X")[0]
-
-    def _transform_to_conv_mkldnn(self, graph, op_node):
-        weight_name = op_node.input("Filter")[0]
-        output_name = op_node.output("Output")[0]
-        # Convert int8 range weights to fp32 range weights
-        weight = self._load_param(self._scope, weight_name)
-        w_fp32 = np.divide(
-            np.multiply(weight, self._s8_max), self._max_range[output_name])
-        w_fp32 = w_fp32.reshape(weight.shape)
-        self._restore_var(weight_name, w_fp32)
-        input_var_node = graph._find_node_by_name(op_node.inputs,
-                                                  op_node.input("Input")[0])
-        weight_var_node = graph._find_node_by_name(op_node.inputs, weight_name)
-
-        # Set fake_dequantize_abs_max's output as new output of conv2d
-        output_var_node = graph._find_node_by_name(
-            graph.all_var_nodes(), self._new_output[output_name])
-        attrs = {
-            name: op_node.op().attr(name)
-            for name in op_node.op().attr_names()
-        }
-
-        conv_op_node = graph.create_op_node(
-            op_type='conv2d',
-            attrs=attrs,
-            inputs={'Input': input_var_node,
-                    'Filter': weight_var_node},
-            outputs={'Output': output_var_node})
-
-        # Based on the QAT's scales to calculate the scales of MKL-DNN INT8 conv2d
-        scale_in = self._s8_max / self._in_scale[output_name]
-        scale_w = []
-        scale_w = [self._max_range[output_name] / self._s8_max]
-
-        conv_op_node.set_attr("Scale_weights", scale_w)
-        conv_op_node.set_attr("Scale_in", scale_in)
-        conv_op_node.set_attr("Scale_out", 1.0)
-        conv_op_node.set_attr("use_mkldnn", 1)
-        conv_op_node.set_attr("force_fp32_output", 1)
-        graph.link_to(input_var_node, conv_op_node)
-        graph.link_to(weight_var_node, conv_op_node)
-        graph.link_to(conv_op_node, output_var_node)
-        graph.safe_remove_nodes(op_node)
-
-    def _transform_to_mul_mkldnn(self, graph, op_node):
-        # For MKL-DNN INT8 mul, input Y should be the weights
-        weight_name = op_node.input("Y")[0]
-        output_name = op_node.output("Out")[0]
-        # Convert int8 range weights to fp32 range weights
-        weight = self._load_param(self._scope, weight_name)
-        w_fp32 = np.divide(
-            np.multiply(weight, self._s8_max), self._max_range[output_name])
-        w_fp32 = w_fp32.reshape(weight.shape)
-        self._restore_var(weight_name, w_fp32)
-        input_var_node = graph._find_node_by_name(op_node.inputs,
-                                                  op_node.input("X")[0])
-        weight_var_node = graph._find_node_by_name(op_node.inputs, weight_name)
-
-        # Set fake_dequantize_abs_max's output as new output of mul
-        output_var_node = graph._find_node_by_name(
-            graph.all_var_nodes(), self._new_output[output_name])
-        attrs = {
-            name: op_node.op().attr(name)
-            for name in op_node.op().attr_names()
-        }
-
-        mul_op_node = graph.create_op_node(
-            op_type='mul',
-            attrs=attrs,
-            inputs={'X': input_var_node,
-                    'Y': weight_var_node},
-            outputs={'Out': output_var_node})
-
-        # Based on the QAT's scales to calculate MKL-DNN INT8 mul's scales
-        scale_in = self._s8_max / self._in_scale[output_name]
-        scale_w = []
-        scale_w = [self._max_range[output_name] / self._s8_max]
-
-        mul_op_node.set_attr("scale_y", scale_w)
-        mul_op_node.set_attr("scale_x", scale_in)
-        mul_op_node.set_attr("scale_out", 1.0)
-        mul_op_node.set_attr("use_mkldnn", 1)
-        mul_op_node.set_attr("force_fp32_output", 1)
-        graph.link_to(input_var_node, mul_op_node)
-        graph.link_to(weight_var_node, mul_op_node)
-        graph.link_to(mul_op_node, output_var_node)
-        graph.safe_remove_nodes(op_node)
-
-    def _transform_to_quantize_mkldnn(self, graph, op_node):
-        """
-        Transform fake_quantize_xx op to quantize mkldnn op in the graph.
-        """
-        input_var_node = graph._find_node_by_name(op_node.inputs,
-                                                  op_node.input("X")[0])
-        output_var_node = graph._find_node_by_name(op_node.outputs,
-                                                   op_node.output("Out")[0])
-        scale_in = self._s8_max / self._load_param(
-            self._scope, op_node.input("InScale")[0])[0]
-        quant_op_node = graph.create_op_node(
-            op_type='quantize',
-            attrs={
-                'data_format': 'MKLDNNLAYOUT',
-                'use_mkldnn': 1,
-                'Scale': scale_in,
-                'is_negative_input': 1
-            },
-            inputs={'Input': input_var_node},
-            outputs={'Output': output_var_node})
-        graph.link_to(input_var_node, quant_op_node)
-        graph.link_to(quant_op_node, output_var_node)
-        graph.safe_remove_nodes(op_node)
-
-    def _remove_fake_dequantize_op(self, graph, op_node):
-        input_var_node = graph._find_node_by_name(op_node.inputs,
-                                                  op_node.input("X")[0])
-        graph.safe_remove_nodes(op_node)
-
-    def _load_param(self, scope, param_name):
-        return np.array(scope.find_var(param_name).get_tensor())
-
-    def _restore_var(self, name, array):
-        tensor = self._scope.find_var(name).get_tensor()
-        tensor.set(array, self._place)
-
-    def _remove_unused_var_nodes(self, graph):
-        all_used_vars = set()
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            for input_node in op_node.inputs:
-                all_used_vars.add(input_node)
-            for output_node in op_node.outputs:
-                all_used_vars.add(output_node)
-
-        all_used_vars = {n.node for n in all_used_vars}
-        all_unused_vars = {
-            n
-            for n in filter(lambda node: node.node not in all_used_vars,
-                            graph.all_var_nodes())
-        }
-        graph.safe_remove_nodes(all_unused_vars)
-
-
-class FakeQAT2MkldnnINT8PerfPass(object):
-    """
-    Transform a QAT model IrGraph into MKL-DNN supported INT8 IrGraph.
-    The pass consists of the following transformations:
-        1. gather scale values from fake quantize/dequantize operators,
-        2. extract FP32 inference model graph from the QAT graph, i.e.
-            a.  remove fake quantize/dequantize operators,
-            b.  dequantize conv2d and mul's weights,
-        3. optimize the FP32 graph using standard FP32 optimization fuses
-            (e.g. `conv2d`+`bn` -> `conv2d`),
-        4. quantize the optimized FP32 graph using standard INT8v2 quantization
-            passes (`cpu_quantize_pass`, `cpu_quantize_squash_pass`).
-    """
-
-    def __init__(self, _scope=None, _place=None, _core=None, _debug=False):
-        self._scope = _scope
-        self._place = _place
-        self._core = _core
-        self._debug = _debug
-        self._quantize_types = [
-            'fake_quantize_moving_average_abs_max',
-            'fake_quantize_range_abs_max',
-            'fake_quantize_dequantize_moving_average_abs_max'
-        ]
-        self._fake_quantize_types = [
-            'fake_quantize_moving_average_abs_max',
-            'fake_quantize_dequantize_moving_average_abs_max'
-        ]
-        self._fake_dequantize_types = ['fake_dequantize_max_abs']
-        self._conv_ops = ['conv2d', 'depthwise_conv2d']
-        self._pool_ops = ['pool2d']
-        self._mul_ops = ['mul']
-        self._fc_ops = ['fc']
-        self._weight_scales = {}
-        # Collect the Input and Output sclaes from Fake QAT models
-        self._var_quant_scales = {}
-        self._max_range = {}
-        self._s8_max = 127
-
-    def apply(self, graph):
-        assert isinstance(graph,
-                          IrGraph), 'graph must be the instance of IrGraph.'
-
-        graph = self._gather_scales(graph)
-        graph = self._remove_fake_ops(graph)
-        graph = self._update_pooling_scales(graph)
-        graph = self._dequantize_weights(graph)
-        graph = self._optimize_fp32_graph(graph)
-        graph = self._compute_weight_scales(graph)
-        graph = self._quantize_fp32_graph(graph)
-        graph = self._remove_unused_var_nodes(graph)
-        return graph
-
-    def _convert_scale2tensor(self, scale):
-        tensor = core.LoDTensor()
-        tensor.set(scale, core.CPUPlace())
-        return tensor
-
-    def _gather_scales(self, graph):
-        for op in graph.all_op_nodes():
-            if op.name() in self._quantize_types:
-                bit_length = op.op().attr("bit_length")
-                assert bit_length == 8, 'Unsupported number quantization bits ({}). Only 8 is supported now.'.format(
-                    bit_length)
-
-                input_name = op.input("X")[0]
-                scale_name = op.input("InScale")[0]
-                # Gather new weights scale after folding batchnorm in convolution
-                scale = np.array(1.0 / self._load_param(
-                    self._scope, scale_name)[0]).astype(np.float64)
-                lod_tensor = self._convert_scale2tensor(scale)
-                use_unsigned_int = False
-                self._var_quant_scales[input_name] = (use_unsigned_int,
-                                                      lod_tensor)
-
-            if op.name() in self._fake_dequantize_types:
-                input_name = op.input("X")[0]
-                _max_range = op.op().attr("max_range")
-                self._weight_scales[input_name] = _max_range
-        return graph
-
-    def _update_pooling_scales(self, graph):
-        for op in graph.all_op_nodes():
-            if op.name() in self._pool_ops:
-                input_name = op.input("X")[0]
-                output_name = op.output("Out")[0]
-                if input_name in self._var_quant_scales:
-                    self._var_quant_scales[
-                        output_name] = self._var_quant_scales[input_name]
-        return graph
-
-    def _load_param(self, scope, param_name):
-        return np.array(scope.find_var(param_name).get_tensor())
-
-    def _remove_fake_ops(self, graph):
-        for op in graph.all_op_nodes():
-            if op.name() in self._fake_quantize_types:
-                op_out = graph._find_node_by_name(op.outputs,
-                                                  op.output("Out")[0])
-                self._remove_fake_quantize(graph, op)
-
-        for op in graph.all_op_nodes():
-            if op.name() in self._fake_dequantize_types:
-                op_in = graph._find_node_by_name(op.inputs, op.input("X")[0])
-                self._remove_fake_dequantize(graph, op)
-        return graph
-
-    def _remove_fake_quantize(self, graph, op):
-        fake_quant_in = graph._find_node_by_name(op.inputs, op.input("X")[0])
-        fake_quant_in_scale = graph._find_node_by_name(op.inputs,
-                                                       op.input("InScale")[0])
-        fake_quant_out = graph._find_node_by_name(op.outputs,
-                                                  op.output("Out")[0])
-        fake_quant_out_scale = graph._find_node_by_name(
-            op.outputs, op.output("OutScale")[0])
-
-        next_ops = fake_quant_out.outputs
-        for next_op in next_ops:
-            self._swap_inputs(next_op, fake_quant_out, fake_quant_in)
-            graph.link_to(fake_quant_in, next_op)
-        graph.safe_remove_nodes(
-            {op, fake_quant_in_scale, fake_quant_out, fake_quant_out_scale})
-
-        return graph
-
-    def _remove_fake_dequantize(self, graph, op):
-        fake_dequant_in = graph._find_node_by_name(op.inputs, op.input("X")[0])
-        fake_dequant_out = graph._find_node_by_name(op.outputs,
-                                                    op.output("Out")[0])
-
-        next_ops = fake_dequant_out.outputs
-        for next_op in next_ops:
-            self._swap_inputs(next_op, fake_dequant_out, fake_dequant_in)
-            graph.link_to(fake_dequant_in, next_op)
-        graph.safe_remove_nodes({op, fake_dequant_out})
-
-        return graph
-
-    def _swap_inputs(self, op, old_input, new_input):
-        for input_name in op.op().input_names():
-            if old_input.name() in op.input(input_name):
-                op.op().set_input(input_name, [
-                    new_input.name() if x == old_input.name() else x
-                    for x in op.input(input_name)
-                ])
-
-    def _dequantize_weights(self, graph):
-        for op in graph.all_op_nodes():
-            if op.name() in self._conv_ops:
-                self._dequantize_conv_weights(graph, op)
-            elif op.name() in self._mul_ops:
-                self._dequantize_mul_weights(graph, op)
-        return graph
-
-    def _dequantize_conv_weights(self, graph, op_node):
-        weight_name = op_node.input("Filter")[0]
-        output_name = op_node.output("Output")[0]
-        # Convert int8 range weights to fp32 range weights
-        scales = self._weight_scales[output_name]
-        weight = self._load_param(self._scope, weight_name)
-        w_fp32 = np.divide(np.multiply(weight, self._s8_max), scales)
-        w_fp32 = w_fp32.reshape(weight.shape)
-        self._restore_var(weight_name, w_fp32)
-
-    def _dequantize_mul_weights(self, graph, op_node):
-        weight_name = op_node.input("Y")[0]
-        output_name = op_node.output("Out")[0]
-        scales = self._weight_scales[output_name]
-        weight = self._load_param(self._scope, weight_name)
-        w_fp32 = np.divide(np.multiply(weight, self._s8_max), scales)
-        w_fp32 = w_fp32.reshape(weight.shape)
-        self._restore_var(weight_name, w_fp32)
-
-    def _restore_var(self, name, array):
-        tensor = self._scope.find_var(name).get_tensor()
-        tensor.set(array, self._place)
-
-    def _optimize_fp32_graph(self, graph):
-        graph = self._apply_pass(graph, 'mkldnn_placement_pass',
-                                 ['mkldnn_enabled_op_types'], [set()])
-        graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass')
-        graph = self._apply_pass(graph, 'conv_bn_fuse_pass')
-        graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass')
-        graph = self._apply_pass(graph, 'conv_bias_mkldnn_fuse_pass')
-        graph = self._apply_pass(graph, 'conv_elementwise_add_mkldnn_fuse_pass')
-        graph = self._apply_pass(graph, 'conv_relu_mkldnn_fuse_pass')
-        graph = self._apply_pass(graph, 'conv_relu6_mkldnn_fuse_pass')
-        graph = self._apply_pass(graph, 'fc_fuse_pass')
-        return graph
-
-    def _apply_pass(self, graph, pass_name, attrs=None, attr_values=None):
-        ir_pass = core.get_pass(pass_name)
-        inference_program = graph.to_program()
-        ir_graph = core.Graph(inference_program.desc)
-        ir_graph.set_not_owned('__param_scope__', self._scope)
-        if attrs:
-            assert attr_values and len(attrs) == len(
-                attr_values
-            ), "Different number of pass attributes and their values."
-            for attr, value in zip(attrs, attr_values):
-                ir_pass.set(attr, value)
-        ir_pass.apply(ir_graph)
-        graph = IrGraph(ir_graph, for_test=True)
-        if self._debug:
-            graph.draw('.', 'qat_fp32_{}'.format(pass_name),
-                       graph.all_op_nodes())
-        self._remove_unused_var_nodes(graph)
-        return graph
-
-    def _remove_unused_var_nodes(self, graph):
-        all_used_vars = set()
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            for input_node in op_node.inputs:
-                all_used_vars.add(input_node)
-            for output_node in op_node.outputs:
-                all_used_vars.add(output_node)
-
-        all_used_vars = {n.node for n in all_used_vars}
-        all_unused_vars = {
-            n
-            for n in filter(lambda node: node.node not in all_used_vars,
-                            graph.all_var_nodes())
-        }
-        graph.safe_remove_nodes(all_unused_vars)
-        return graph
-
-    def _compute_weight_scales(self, graph):
-        def _compute_var_scales(ops, out_name, w_name, axis):
-            for op in graph.all_op_nodes():
-                if op.op().type() in ops:
-                    weight_var_name = op.input(w_name)[0]
-                    weights = np.array(
-                        self._load_param(self._scope, weight_var_name))
-                    scales = 1.0 / np.amax(
-                        np.abs(weights.reshape(weights.shape[0], -1)),
-                        axis=axis)
-
-                    lod_tensor = self._convert_scale2tensor(
-                        scales.astype(np.float64))
-                    use_unsigned_int = False
-                    self._var_quant_scales[weight_var_name] = (use_unsigned_int,
-                                                               lod_tensor)
-
-        _compute_var_scales(self._conv_ops, "Output", "Filter", axis=1)
-        _compute_var_scales(self._fc_ops, "Out", "W", axis=0)
-        return graph
-
-    def _find_avg_pooling_ids(self, graph):
-        ids = []
-        for op in graph.all_op_nodes():
-            if op.name() in self._pool_ops:
-                if op.op().attr("pooling_type") == "avg":
-                    ids.append(op.id())
-        return set(ids)
-
-    def _quantize_fp32_graph(self, graph):
-        ir_pass = self._core.get_pass('cpu_quantize_placement_pass')
-        inference_program = graph.to_program()
-        ir_graph = self._core.Graph(inference_program.desc)
-        ir_pass.set('quantize_enabled_op_types', {'conv2d', 'pool2d'})
-        ir_pass.set('quantize_excluded_op_ids',
-                    self._find_avg_pooling_ids(graph))
-        ir_pass.apply(ir_graph)
-        graph = IrGraph(ir_graph, for_test=True)
-        if self._debug:
-            graph.draw('.', 'qat_int8_{}'.format(ir_pass.type()),
-                       graph.all_op_nodes())
-
-        graph = self._apply_pass(graph, 'cpu_quantize_pass',
-                                 ['quant_var_scales'],
-                                 [self._var_quant_scales])
-        graph = self._apply_pass(graph, 'cpu_quantize_squash_pass')
-        return graph
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
deleted file mode 100644
index 15a91c063d0f901c55881fdfd3edff368ebe2afc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ /dev/null
@@ -1,1316 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import numpy as np
-from ..... import compat as cpt
-from .... import core
-from ....framework import IrGraph
-from ....framework import IrNode
-from .... import unique_name
-
-__all__ = [
-    'QuantizationTransformPass', 'QuantizationFreezePass', 'ConvertToInt8Pass',
-    'TransformForMobilePass', 'ScaleForTrainingPass', 'ScaleForInferencePass',
-    'AddQuantDequantPass'
-]
-
-_quantizable_op_list = ['conv2d', 'depthwise_conv2d', 'mul', 'pool2d']
-
-_fake_quant_op_list = [
-    'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
-    'fake_quantize_moving_average_abs_max', 'fake_channel_wise_quantize_abs_max'
-]
-
-_fake_dequant_op_list = [
-    'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
-]
-
-_out_scale_op_list = [
-    "mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", "depthwise_conv2d",
-    "batch_norm", "concat", "tanh", "pad", "elementwise_add", "elementwise_mul",
-    "dropout", "split", "prelu", "conv2d_transpose", "leaky_relu"
-]
-
-
-def _init_var_node(var_node, value, scope, place):
-    assert isinstance(value,
-                      np.ndarray), 'The type of value should be numpy array.'
-    assert scope is not None, \
-        'The scope cannot be set None.'
-    assert place is not None, \
-        'The place cannot be set None.'
-    tensor = scope.var(var_node.name()).get_tensor()
-    tensor.set(value, place)
-
-
-class QuantizationTransformPass(object):
-    def __init__(self,
-                 scope=None,
-                 place=None,
-                 weight_bits=8,
-                 activation_bits=8,
-                 activation_quantize_type='abs_max',
-                 weight_quantize_type='abs_max',
-                 window_size=10000,
-                 moving_rate=0.9,
-                 skip_pattern='skip_quant'):
-        """
-        Convert and rewrite the IrGraph according to weight and
-        activation quantization type.
-
-        Args:
-            scope(fluid.Scope): When activation use 'range_abs_max' as the quantize
-            type, this pass will create some new parameters. The scope is used to
-            initialize these new parameters.
-            place(fluid.CPUPlace|fluid.CUDAPlace): place is used to initialize new
-            parameters described above.
-            weight_bits (int): quantization bit number for weights,
-                the bias is not quantized.
-            activation_bits (int): quantization bit number for activation.
-            activation_quantize_type (str): quantization type for activation,
-                now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
-                If use 'abs_max' mode, the quantization scale will be calculated
-                dynamically each step in both training and testing period. If use
-                'range_abs_max', a static quantization scale will be calculated
-                during training and used in inference.
-            weight_quantize_type (str): quantization type for weights,
-                support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max'
-                usually is not used for weight, since weights are fixed once the
-                model is well trained.
-            window_size (int): the window size for 'range_abs_max' quantization.
-            skip_pattern(str): The user-defined quantization skip pattern, which
-                will be presented in the name scope of an op. When the skip pattern is
-                detected in an op's name scope, the corresponding op will not be quantized.
-
-        Examples:
-        .. code-block:: python
-            # The original graph will be rewrite.
-            import paddle.fluid as fluid
-            from paddle.fluid.contrib.slim.quantization \
-                import QuantizationTransformPass
-            from paddle.fluid.contrib.slim.graph import IrGraph
-            from paddle.fluid import core
-
-            graph = IrGraph(core.Graph(program.desc), for_test=False)
-            place = fluid.CPUPlace()
-            transform_pass = QuantizationTransformPass(fluid.global_scope(),
-            place)
-            transform_pass.apply(graph)
-        """
-        self._scope = scope
-        self._place = place
-        self._weight_bits = weight_bits
-        self._activation_bits = activation_bits
-        self._skip_pattern = skip_pattern
-
-        quant_type = [
-            'abs_max', 'channel_wise_abs_max', 'range_abs_max',
-            'moving_average_abs_max'
-        ]
-        assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'."
-        if activation_quantize_type not in quant_type:
-            raise ValueError(
-                "Unknown activation_quantize_type : '%s'. It can only be "
-                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." %
-                (str(activation_quantize_type)))
-        if weight_quantize_type not in quant_type:
-            raise ValueError(
-                "Unknown weight_quantize_type: '%s'. It can only be "
-                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'."
-                % (str(weight_quantize_type)))
-
-        self._activation_quantize_type = activation_quantize_type
-        self._weight_quantize_type = weight_quantize_type
-        self._window_size = window_size
-        self._moving_rate = moving_rate
-
-        self._quantizable_ops = _quantizable_op_list
-        self._conv_ops = ['conv2d', 'depthwise_conv2d']
-        self._quantizable_grad_ops = [
-            '%s_grad' % (op) for op in self._quantizable_ops
-        ]
-        self._is_test = None
-        self._global_step = None
-
-    def apply(self, graph):
-        """
-        Quantize the graph for training process. According to weight and
-        activation quantization type, the graph will be added some fake
-        quantize operators and fake dequantize operators.
-
-        Args:
-            graph(IrGraph): the applied graph.
-        """
-        assert isinstance(graph,
-                          IrGraph), 'graph must be the instance of IrGraph.'
-        self._is_test = graph.is_test()
-        # marked the variable which has been dequantized.
-        dequantized_vars = collections.OrderedDict()
-        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
-
-        def _quant_preprocess(op_node):
-            pool_skipped = op_node.op().has_attr("pooling_type") and \
-                    op_node.op().attr("pooling_type") == 'avg'
-            user_skipped = isinstance(self._skip_pattern, str) and \
-                           op_node.op().has_attr("op_namescope") and \
-                           op_node.op().attr("op_namescope").find(self._skip_pattern) != -1
-
-            if pool_skipped or user_skipped:
-                op_node.op()._set_attr("skip_quant", True)
-
-        def _transform_forward(graph, op):
-            for var_node in op.inputs:
-                if var_node.name() not in op.input_arg_names():
-                    continue
-                if var_node.name() in dequantized_vars:
-                    dequant_var_node = dequantized_vars[var_node.name()]
-                else:
-                    quant_bits = self._weight_bits if var_node.name() in persistable_vars \
-                    else self._activation_bits
-                    quant_type = self._weight_quantize_type if var_node.name() \
-                        in persistable_vars else self._activation_quantize_type
-                    if quant_type == 'channel_wise_abs_max':
-                        assert var_node.name(
-                        ) in persistable_vars, "'channel_wise_abs_max' can only be applied on weights."
-                        if op.name() in self._conv_ops:
-                            quant_var_node, scale_var_node = self._insert_channel_quant_op(
-                                graph, var_node, quant_bits)
-                            dequant_var_node = self._insert_channel_dequant_op(
-                                graph, quant_var_node, [scale_var_node],
-                                [quant_bits])
-                        else:
-                            quant_var_node, scale_var_node = self._insert_quant_op(
-                                graph, var_node, quant_bits, 'abs_max')
-                            dequant_var_node = self._insert_dequant_op(
-                                graph, quant_var_node, scale_var_node,
-                                quant_bits)
-                    else:
-                        quant_var_node, scale_var_node = self._insert_quant_op(
-                            graph, var_node, quant_bits, quant_type)
-                        dequant_var_node = self._insert_dequant_op(
-                            graph, quant_var_node, scale_var_node, quant_bits)
-                    dequantized_vars[var_node.name()] = dequant_var_node
-                graph.update_input_link(var_node, dequant_var_node, op)
-
-        def _transform_backward(graph, op):
-            no_dequanted_input_vars = True
-            for var_node in op.inputs:
-                if var_node.name() not in op.input_arg_names():
-                    continue
-                if var_node.name() in dequantized_vars:
-                    dequant_var_node = dequantized_vars[var_node.name()]
-                    graph.update_input_link(var_node, dequant_var_node, op)
-                    no_dequanted_input_vars = False
-            if no_dequanted_input_vars:
-                raise ValueError("There is no dequanted inputs for op %s." %
-                                 (op.name()))
-
-        if not self._is_test:
-            self._create_global_step(graph)
-        ops = graph.all_op_nodes()
-        # Do the preproccess of quantization, such as skipping some ops
-        # for not being quantized.
-        for op in ops:
-            if op.name() in self._quantizable_ops or \
-                    op.name() in self._quantizable_grad_ops:
-                _quant_preprocess(op)
-        # The process of _transform_forward and _transform_backward is needed in two for loops.
-        # The loop for transforming the forward graph:
-        for op in ops:
-            if op.name() in self._quantizable_ops:
-                skipped = op.op().has_attr("skip_quant") and \
-                         op.op().attr("skip_quant")
-                if skipped:
-                    continue
-                _transform_forward(graph, op)
-        # The loop for renaming the inputs of backward op.
-        for op in ops:
-            if op.name() in self._quantizable_grad_ops:
-                skipped = op.op().has_attr("skip_quant") and \
-                         op.op().attr("skip_quant")
-                if skipped:
-                    continue
-                _transform_backward(graph, op)
-        graph.resolve_hazard()
-        return graph
-
-    def _create_global_step(self, graph):
-        if self._weight_quantize_type == 'range_abs_max' or \
-                self._activation_quantize_type == 'range_abs_max':
-            counter_name = cpt.to_text('@STEP_COUNTER@')
-            for node in graph.all_var_nodes():
-                if node.name() == counter_name:
-                    self._global_step = node
-            if self._global_step is None:
-                global_step_in = graph.create_persistable_node(
-                    name=counter_name,
-                    var_type=core.VarDesc.VarType.LOD_TENSOR,
-                    shape=[1],
-                    var_dtype=core.VarDesc.VarType.INT64)
-                _init_var_node(
-                    global_step_in,
-                    np.zeros(
-                        [1], dtype='int64'),
-                    self._scope,
-                    self._place)
-                global_step_out = graph.create_var_node_from_desc(
-                    global_step_in.var())
-                # The attribute of `op_role` is needed by ParallelExecutor.
-                increment_op = graph.create_op_node(
-                    op_type='increment',
-                    attrs={
-                        'step': 1.0,
-                        'op_role':
-                        core.op_proto_and_checker_maker.OpRole.Forward
-                    },
-                    inputs={'X': global_step_in},
-                    outputs={'Out': global_step_out})
-                graph.link_to(global_step_in, increment_op)
-                graph.link_to(increment_op, global_step_out)
-                self._global_step = global_step_out
-
-    def _insert_quant_op(self, graph, var_node, quant_bits, quant_type):
-        """
-        Insert fake_quantize_op in the graph.
-        """
-        if quant_type == 'abs_max':
-            return self._insert_quant_abs_max_op(graph, var_node, quant_bits)
-        elif quant_type == 'range_abs_max':
-            return self._insert_quant_range_abs_max_op(graph, var_node,
-                                                       quant_bits)
-        elif quant_type == 'moving_average_abs_max':
-            return self._insert_quant_moving_average_abs_max_op(graph, var_node,
-                                                                quant_bits)
-
-    def _insert_quant_abs_max_op(self, graph, var_node, quant_bits):
-        """
-        Insert fake_quantize_abs_max op in the graph.
-        """
-        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
-
-        quant_var_node = graph.create_var_node(
-            name=self._quantized_var_name(var_node.name()),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=var_node.dtype())
-        scale_var_node = graph.create_var_node(
-            name=self._quantized_scale_name(var_node.name()),
-            var_type=var_node.type(),
-            shape=[1],
-            var_dtype=var_node.dtype())
-        quant_op_node = graph.create_op_node(
-            op_type='fake_quantize_abs_max',
-            attrs={
-                'bit_length': quant_bits,
-                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-            },
-            inputs={'X': var_node},
-            outputs={'Out': quant_var_node,
-                     'OutScale': scale_var_node})
-        graph.link_to(var_node, quant_op_node)
-        graph.link_to(quant_op_node, quant_var_node)
-        graph.link_to(quant_op_node, scale_var_node)
-        return quant_var_node, scale_var_node
-
-    def _insert_quant_range_abs_max_op(self, graph, var_node, quant_bits):
-        """
-        Insert fake_quantize_range_abs_max on the graph.
-        """
-        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
-
-        quant_var_node = graph.create_var_node(
-            name=self._quantized_var_name(var_node.name()),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=var_node.dtype())
-
-        scale_in_node = graph.create_persistable_node(
-            name=self._quantized_scale_name(var_node.name()),
-            var_type=core.VarDesc.VarType.LOD_TENSOR,
-            shape=[1],
-            var_dtype=var_node.dtype())
-        data_type = 'float64' if var_node.dtype(
-        ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(
-            scale_in_node,
-            np.array(
-                [0.001], dtype=data_type),
-            self._scope,
-            self._place)
-
-        scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
-        inputs = {'X': var_node, 'InScale': scale_in_node}
-        outputs = {'Out': quant_var_node, 'OutScale': scale_out_node}
-
-        if not self._is_test:
-            # The name of scales_var_node maybe 'scales_0', 'scales_1', etc.
-            scales_node = graph.create_persistable_node(
-                name=unique_name.generate('scales'),
-                var_type=core.VarDesc.VarType.LOD_TENSOR,
-                shape=[self._window_size],
-                var_dtype=var_node.dtype())
-            data_type = 'float64' if var_node.dtype(
-            ) == core.VarDesc.VarType.FP64 else 'float32'
-            _init_var_node(
-                scales_node,
-                np.zeros(
-                    [self._window_size], dtype=data_type),
-                self._scope,
-                self._place)
-
-            inputs['Iter'] = self._global_step
-            outputs['OutScales'] = scales_node
-        attrs = {
-            'window_size': self._window_size,
-            'bit_length': quant_bits,
-            'is_test': self._is_test,
-            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-        }
-        quant_op_node = graph.create_op_node(
-            op_type='fake_quantize_range_abs_max',
-            attrs=attrs,
-            inputs=inputs,
-            outputs=outputs)
-
-        graph.link_to(var_node, quant_op_node)
-        graph.link_to(scale_in_node, quant_op_node)
-        graph.link_to(quant_op_node, quant_var_node)
-        graph.link_to(quant_op_node, scale_out_node)
-
-        if not self._is_test:
-            graph.link_to(self._global_step, quant_op_node)
-            graph.link_to(quant_op_node, scales_node)
-
-        return quant_var_node, scale_out_node
-
-    def _insert_quant_moving_average_abs_max_op(self, graph, var_node,
-                                                quant_bits):
-        """Insert fake_quantize_moving_average_abs_max
-        """
-        quant_var_node = graph.create_var_node(
-            name=self._quantized_var_name(var_node.name()),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=var_node.dtype())
-        scale_in_node = graph.create_persistable_node(
-            name=self._quantized_scale_name(var_node.name()),
-            var_type=core.VarDesc.VarType.LOD_TENSOR,
-            shape=[1],
-            var_dtype=var_node.dtype())
-        data_type = 'float64' if var_node.dtype(
-        ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(
-            scale_in_node,
-            np.array(
-                [0.001], dtype=data_type),
-            self._scope,
-            self._place)
-
-        scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
-        ins = {'X': var_node, 'InScale': scale_in_node}
-        outs = {'Out': quant_var_node, 'OutScale': scale_out_node}
-        if not self._is_test:
-            state_in_node = graph.create_persistable_node(
-                name=unique_name.generate('state'),
-                var_type=core.VarDesc.VarType.LOD_TENSOR,
-                var_dtype=var_node.dtype(),
-                shape=[1])
-            data_type = 'float64' if var_node.dtype(
-            ) == core.VarDesc.VarType.FP64 else 'float32'
-            _init_var_node(
-                state_in_node,
-                np.ones(
-                    [1], dtype=data_type),
-                self._scope,
-                self._place)
-            accum_in_node = graph.create_persistable_node(
-                name=unique_name.generate('accum'),
-                var_type=core.VarDesc.VarType.LOD_TENSOR,
-                var_dtype=var_node.dtype(),
-                shape=[1])
-            _init_var_node(
-                accum_in_node,
-                np.ones(
-                    [1], dtype=data_type),
-                self._scope,
-                self._place)
-            state_out_node = graph.create_var_node_from_desc(state_in_node.var(
-            ))
-            accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
-            ))
-
-            ins['InState'] = state_in_node
-            ins['InAccum'] = accum_in_node
-            outs['OutState'] = state_out_node
-            outs['OutAccum'] = accum_out_node
-
-        attrs = {
-            'bit_length': quant_bits,
-            'moving_rate': self._moving_rate,
-            'is_test': self._is_test,
-            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-        }
-
-        quant_op_node = graph.create_op_node(
-            op_type='fake_quantize_moving_average_abs_max',
-            attrs=attrs,
-            inputs=ins,
-            outputs=outs)
-
-        graph.link_to(var_node, quant_op_node)
-        graph.link_to(scale_in_node, quant_op_node)
-        graph.link_to(quant_op_node, quant_var_node)
-        graph.link_to(quant_op_node, scale_out_node)
-
-        if not self._is_test:
-            graph.link_to(state_in_node, quant_op_node)
-            graph.link_to(accum_in_node, quant_op_node)
-            graph.link_to(quant_op_node, state_out_node)
-            graph.link_to(quant_op_node, accum_out_node)
-
-        return quant_var_node, scale_out_node
-
-    def _insert_channel_quant_op(self, graph, var_node, quant_bits):
-        """
-        Insert fake_channel_wise_quantize_abs_max op in the graph.
-        """
-        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
-
-        quant_var_node = graph.create_var_node(
-            name=self._quantized_var_name(var_node.name()),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=var_node.dtype())
-        scale_var_node = graph.create_var_node(
-            name=self._quantized_scale_name(var_node.name()),
-            var_type=var_node.type(),
-            shape=[var_node.shape()[0]],
-            var_dtype=var_node.dtype())
-        quant_op_node = graph.create_op_node(
-            op_type='fake_channel_wise_quantize_abs_max',
-            attrs={
-                'bit_length': quant_bits,
-                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-            },
-            inputs={'X': var_node},
-            outputs={'Out': quant_var_node,
-                     'OutScale': scale_var_node})
-        graph.link_to(var_node, quant_op_node)
-        graph.link_to(quant_op_node, quant_var_node)
-        graph.link_to(quant_op_node, scale_var_node)
-        return quant_var_node, scale_var_node
-
-    def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits):
-        """
-        Insert fake_dequantize_op in the graph.
-        """
-        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
-
-        dequant_var_node = graph.create_var_node(
-            name=self._dequantized_var_name(var_node.name()),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=var_node.dtype())
-        max_range = (1 << (quant_bits - 1)) - 1
-        dequant_op_node = graph.create_op_node(
-            op_type='fake_dequantize_max_abs',
-            attrs={
-                'max_range': float(max_range),
-                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-            },
-            inputs={'X': var_node,
-                    'Scale': scale_var_node},
-            outputs={'Out': dequant_var_node})
-        graph.link_to(var_node, dequant_op_node)
-        graph.link_to(scale_var_node, dequant_op_node)
-        graph.link_to(dequant_op_node, dequant_var_node)
-        return dequant_var_node
-
-    def _insert_channel_dequant_op(self, graph, var_node, scale_var_nodes,
-                                   quant_bits):
-        """
-        Insert fake_channel_wise_dequantize_max_abs in the graph.
-        """
-        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
-
-        dequant_var_node = graph.create_var_node(
-            name=self._dequantized_var_name(var_node.name()),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=var_node.dtype())
-        dequant_op_node = graph.create_op_node(
-            op_type='fake_channel_wise_dequantize_max_abs',
-            attrs={
-                'quant_bits': quant_bits,
-                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-            },
-            inputs={'X': var_node,
-                    'Scales': scale_var_nodes},
-            outputs={'Out': dequant_var_node})
-        graph.link_to(var_node, dequant_op_node)
-        for scale_n in scale_var_nodes:
-            graph.link_to(scale_n, dequant_op_node)
-        graph.link_to(dequant_op_node, dequant_var_node)
-        return dequant_var_node
-
-    def _quantized_var_name(self, var_name):
-        """
-        Return quantized variable name for the input `var_name`.
-        """
-        return "%s.quantized" % (var_name)
-
-    def _dequantized_var_name(self, var_name):
-        """
-        Return dequantized variable name for the input `var_name`.
-        """
-        return "%s.dequantized" % (var_name)
-
-    def _quantized_scale_name(self, var_name):
-        """
-        Return the scale name of quantized variable for the input `var_name`.
-        """
-        return "%s.scale" % (var_name)
-
-
-class QuantizationFreezePass(object):
-    """
-    The freeze pass is used to adjust the quantize operator order, for example:
-        1) `activation -> quant -> dequant -> conv2d` will be freezed into
-        `activation -> quant -> conv2d -> dequant`
-        2) `weight -> quant -> dequant -> conv2d` will be freezed into `weight -> conv2d`,
-        and weight will be sacled offline.
-
-    Args:
-        scope(fluid.Scope): scope is used to get the weight tensor values.
-        place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the weight tensors.
-        weight_bits (int): quantization bit number for weights.
-        activation_bits (int): quantization bit number for activation.
-        weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'.
-        The 'range_abs_max' usually is not used for weight, since weights are fixed once the
-        model is well trained.
-    """
-
-    def __init__(self,
-                 scope,
-                 place,
-                 weight_bits=8,
-                 activation_bits=8,
-                 weight_quantize_type='abs_max'):
-        assert scope is not None, \
-            'The scope cannot be set None.'
-        assert place is not None, \
-            'The place cannot be set None.'
-        self._scope = scope
-        self._place = place
-        self._weight_bits = weight_bits
-        self._activation_bits = activation_bits
-        self._weight_quantize_type = weight_quantize_type
-        self._quantizable_ops = _quantizable_op_list
-        self._conv_ops = ['conv2d', 'depthwise_conv2d']
-        self._fake_quant_op_names = _fake_quant_op_list
-        self._fake_dequant_op_names = _fake_dequant_op_list
-        self._op_input_rename_map = collections.OrderedDict()
-        self._op_output_rename_map = collections.OrderedDict()
-        self._var_scale_map = collections.OrderedDict()
-
-    def apply(self, graph):
-        """
-        Adjust quantize/dequantize operators order for the inference process.
-
-        Args:
-            graph(IrGraph): the applied graph.
-        """
-        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            op_name = op_node.name()
-            if op_name in self._fake_quant_op_names:
-                input_arg_name = op_node.input('X')[0]
-                if input_arg_name in persistable_vars:
-                    if self._weight_quantize_type == 'abs_max':
-                        param = self._load_var(input_arg_name)
-                        scale_v = np.max(np.abs(param))
-                    elif self._weight_quantize_type == 'channel_wise_abs_max':
-                        param = self._load_var(input_arg_name)
-                        if len(param.shape) == 4:  # conv2d or depthwise_conv2d
-                            scale_v = []
-                            for i in range(param.shape[0]):
-                                scale_v.append(np.max(np.abs(param[i])))
-                        else:
-                            scale_v = np.max(np.abs(param))
-                    else:
-                        scale_v = self._load_var(
-                            op_node.output('OutScale')[0])[0]
-                    self._var_scale_map[input_arg_name] = scale_v
-                    self._remove_fake_quant_and_dequant_op(graph, op_node)
-                    # quantize weight and restore
-                    param_v = self._load_var(input_arg_name)
-                    quantized_param_v = self._quant(param_v, scale_v,
-                                                    self._weight_bits)
-                    self._restore_var(input_arg_name, quantized_param_v)
-                else:
-                    scale_v = graph._find_node_by_name(
-                        op_node.outputs, op_node.output('OutScale')[0])
-                    self._var_scale_map[input_arg_name] = scale_v
-
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            op_name = op_node.name()
-            if op_name in self._fake_dequant_op_names:
-                self._remove_fake_quant_and_dequant_op(graph, op_node)
-
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            op_name = op_node.name()
-            if op_name in self._quantizable_ops:
-                skipped = op_node.op().has_attr("skip_quant") and \
-                         op_node.op().attr("skip_quant")
-                if skipped:
-                    continue
-                if self._weight_quantize_type == 'channel_wise_abs_max' and op_name in self._conv_ops:
-                    self._insert_post_channel_dequant_op(graph, op_node)
-                else:
-                    self._insert_post_dequant_op(graph, op_node)
-
-        for op_node in ops:
-            # insert dequant_op after fc/conv, need to rename inputs of the followed ops
-            for var_node in op_node.inputs:
-                if var_node.node in self._op_output_rename_map:
-                    old_in = var_node
-                    new_in = self._op_output_rename_map[var_node.node]
-                    graph.update_input_link(old_in, new_in, op_node)
-
-        # remove the unused var node in the graph
-        self._remove_unused_var_nodes(graph)
-        graph.resolve_hazard()
-        return graph
-
-    def _remove_fake_quant_and_dequant_op(self, graph, op_node):
-        k = graph._find_node_by_name(op_node.outputs, op_node.output('Out')[0])
-        v = graph._find_node_by_name(op_node.inputs, op_node.input('X')[0])
-        if v.node not in self._op_input_rename_map:
-            self._op_input_rename_map[k.node] = v
-        else:
-            self._op_input_rename_map[k.node] = self._op_input_rename_map[
-                v.node]
-        graph.safe_remove_nodes(op_node)
-
-    def _insert_post_channel_dequant_op(self, graph, op_node):
-        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
-        for var_node in op_node.inputs:
-            name = var_node.name()
-            if name not in op_node.input_arg_names():
-                continue
-            if var_node.node in self._op_input_rename_map:
-                old_in = var_node
-                new_in = self._op_input_rename_map[var_node.node]
-                new_in.clear_outputs()
-                graph.update_input_link(old_in, new_in, op_node)
-            original_var_name = self._original_var_name(name)
-            scale_v = self._var_scale_map[original_var_name]
-            if original_var_name in persistable_vars:
-                assert isinstance(
-                    scale_v,
-                    list), 'The scale of parameter %s is not a list.' % (
-                        original_var_name)
-                channel_scale = np.array(scale_v)
-            else:
-                assert isinstance(scale_v, IrNode)
-                scale_var_node = self._var_scale_map[original_var_name]
-
-        if len(op_node.output_arg_names()) != 1:
-            raise ValueError("Only support one output, but op %s has"
-                             " more than one output." % (op_node.name()))
-
-        output_var_node = graph._find_node_by_name(
-            op_node.outputs, op_node.output_arg_names()[0])
-        weight_scale_node = graph.create_persistable_node(
-            name=unique_name.generate('channel_scale'),
-            var_type=core.VarDesc.VarType.LOD_TENSOR,
-            shape=[channel_scale.shape[0]],
-            var_dtype=output_var_node.dtype())
-        data_type = 'float64' if output_var_node.dtype(
-        ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(weight_scale_node,
-                       channel_scale.astype(data_type), self._scope,
-                       self._place)
-        dequant_var_node = graph.create_var_node(
-            name=self._dequantized_var_name(output_var_node.name()),
-            var_type=output_var_node.type(),
-            shape=output_var_node.shape(),
-            var_dtype=output_var_node.dtype())
-        dequant_op_node = graph.create_op_node(
-            op_type='fake_channel_wise_dequantize_max_abs',
-            attrs={
-                'quant_bits': [self._weight_bits, self._activation_bits],
-                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-            },
-            inputs={
-                'X': output_var_node,
-                'Scales': [weight_scale_node, scale_var_node]
-            },
-            outputs={'Out': dequant_var_node})
-        graph.link_to(output_var_node, dequant_op_node)
-        graph.link_to(scale_var_node, dequant_op_node)
-        graph.link_to(weight_scale_node, dequant_op_node)
-        graph.link_to(dequant_op_node, dequant_var_node)
-        self._op_output_rename_map[output_var_node.node] = dequant_var_node
-        return dequant_var_node
-
-    def _insert_post_dequant_op(self, graph, op_node):
-        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
-        if len(op_node.input_arg_names()) >= 2 and len(persistable_vars) == 0:
-            raise ValueError("The op %s has more than one inputs "
-                             "and all of them are not persistable. "
-                             "Now, it is not supported!" % (op_node.name()))
-        max_range = 1
-        param_range = (1 << (self._weight_bits - 1)) - 1
-        act_range = (1 << (self._activation_bits - 1)) - 1
-        for var_node in op_node.inputs:
-            name = var_node.name()
-            if name not in op_node.input_arg_names():
-                continue
-            if var_node.node in self._op_input_rename_map:
-                old_in = var_node
-                new_in = self._op_input_rename_map[var_node.node]
-                new_in.clear_outputs()
-                graph.update_input_link(old_in, new_in, op_node)
-            original_var_name = self._original_var_name(name)
-            scale_v = self._var_scale_map[original_var_name]
-            if original_var_name in persistable_vars:
-                assert self._is_float(
-                    scale_v), 'The scale of parameter %s is not a float.' % (
-                        original_var_name)
-                max_range *= param_range / scale_v
-            else:
-                max_range *= act_range
-                assert isinstance(scale_v, IrNode)
-                scale_var_node = self._var_scale_map[original_var_name]
-
-        if len(op_node.output_arg_names()) != 1:
-            raise ValueError("Only support one output, but op %s has"
-                             " more than one output." % (op_node.name()))
-
-        output_var_node = graph._find_node_by_name(
-            op_node.outputs, op_node.output_arg_names()[0])
-        dequant_var_node = graph.create_var_node(
-            name=self._dequantized_var_name(output_var_node.name()),
-            var_type=output_var_node.type(),
-            shape=output_var_node.shape(),
-            var_dtype=output_var_node.dtype())
-        dequant_op_node = graph.create_op_node(
-            op_type='fake_dequantize_max_abs',
-            attrs={
-                'max_range': float(max_range),
-                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-            },
-            inputs={'X': output_var_node,
-                    'Scale': scale_var_node},
-            outputs={'Out': dequant_var_node})
-        graph.link_to(output_var_node, dequant_op_node)
-        graph.link_to(scale_var_node, dequant_op_node)
-        graph.link_to(dequant_op_node, dequant_var_node)
-        self._op_output_rename_map[output_var_node.node] = dequant_var_node
-        return dequant_var_node
-
-    def _load_var(self, name):
-        return np.array(self._scope.find_var(name).get_tensor())
-
-    def _restore_var(self, name, array):
-        tensor = self._scope.find_var(name).get_tensor()
-        tensor.set(array, self._place)
-
-    def _remove_unused_var_nodes(self, graph):
-        all_used_vars = set()
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            for input_node in op_node.inputs:
-                all_used_vars.add(input_node)
-            for output_node in op_node.outputs:
-                all_used_vars.add(output_node)
-
-        all_used_vars = {n.node for n in all_used_vars}
-        all_unused_vars = {
-            n
-            for n in filter(lambda node: node.node not in all_used_vars,
-                            graph.all_var_nodes())
-        }
-        graph.safe_remove_nodes(all_unused_vars)
-
-    def _original_var_name(self, var_name):
-        """
-        Return the original variable name.
-        """
-        if var_name.endswith('.quantized.dequantized'):
-            return var_name[:-len('.quantized.dequantized')]
-        if var_name.endswith('.quantized'):
-            return var_name[:-len('.quantized')]
-        if var_name.endswith('.dequantized'):
-            return var_name[:-len('.dequantized')]
-        if var_name.endswith('.scale'):
-            return var_name[:-len('.scale')]
-        else:
-            return var_name
-
-    def _dequantized_var_name(self, var_name):
-        """
-        Return dequantized variable name for the input `var_name`.
-        """
-        return "%s.dequantized" % (var_name)
-
-    def _is_float(self, v):
-        return isinstance(v, float) or isinstance(v, np.float32) \
-            or isinstance(v, np.float64)
-
-    def _quant(self, x, scale, num_bits):
-        if isinstance(scale, list):
-            for i, s in enumerate(scale):
-                x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1))
-            return x
-        else:
-            return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
-
-
-class ConvertToInt8Pass(object):
-    """
-    Convert the weights into int8_t type.
-
-    Args:
-        scope(fluid.Scope): scope is used to get the weight tensor values.
-        place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the
-        8bits weight tensors.
-    """
-
-    def __init__(self, scope, place):
-        assert scope is not None, \
-            'The scope cannot be set None.'
-        assert place is not None, \
-            'The place cannot be set None.'
-        self._scope = scope
-        self._place = place
-        self._quantizable_ops = _quantizable_op_list
-
-    def apply(self, graph):
-        """
-        Convert weights' tpye of the graph. After that, the data type of the
-        graph weigths is int8_t.
-
-        Args:
-            graph(IrGraph): the applied graph.
-        """
-        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
-        ops = graph.all_op_nodes()
-        input_map = {}
-        for op_node in ops:
-            op_name = op_node.name()
-            if op_name in self._quantizable_ops:
-                skipped = op_node.op().has_attr("skip_quant") and \
-                         op_node.op().attr("skip_quant")
-                if skipped:
-                    continue
-                for var_node in op_node.inputs:
-                    name = var_node.name()
-                    if name in persistable_vars:
-                        if name not in input_map:
-                            int8_var_node = self._convert_to_int8(graph,
-                                                                  var_node)
-                            input_map[name] = int8_var_node
-                        graph.update_input_link(var_node, input_map[name],
-                                                op_node)
-
-        # remove the unused var node in the graph
-        self._remove_unused_var_nodes(graph)
-        graph.resolve_hazard()
-        return graph
-
-    def _convert_to_int8(self, graph, var_node):
-        int8_var_node_name = var_node.name() + ".int8"
-        int8_var_node = graph.create_persistable_node(
-            name=cpt.to_text(int8_var_node_name),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=core.VarDesc.VarType.INT8)
-        array = self._load_var(var_node.name())
-        self._scope.var(int8_var_node_name)
-        self._store_var(int8_var_node_name, array, np.int8)
-        return int8_var_node
-
-    def _load_var(self, name):
-        return np.array(self._scope.find_var(name).get_tensor())
-
-    def _store_var(self, name, array, dtype):
-        tensor = self._scope.find_var(name).get_tensor()
-        tensor.set(array.astype(dtype), self._place)
-
-    def _remove_unused_var_nodes(self, graph):
-        all_used_vars = set()
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            for input_node in op_node.inputs:
-                all_used_vars.add(input_node)
-            for output_node in op_node.outputs:
-                all_used_vars.add(output_node)
-
-        all_used_vars = {n.node for n in all_used_vars}
-        all_unused_vars = {
-            n
-            for n in filter(lambda node: node.node not in all_used_vars,
-                            graph.all_var_nodes())
-        }
-        graph.safe_remove_nodes(all_unused_vars)
-
-
-class TransformForMobilePass(object):
-    """
-    This pass is used to convert the freezed graph for paddle-mobile execution.
-    """
-
-    def __init__(self):
-        self._fake_quant_op_names = _fake_quant_op_list
-        self._fake_dequant_op_names = _fake_dequant_op_list
-
-    def apply(self, graph):
-        """
-        Because paddle-mobile use `quantize` an `dequantize` as the names of
-        quantize operator and dequantize operator, the `apply` function just
-        realize this logic.
-
-        Args:
-            graph(IrGraph): the graph will be transformed.
-        """
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            name = op_node.name()
-            if name in self._fake_quant_op_names:
-                op_node.set_type('quantize')
-                quant_node = graph.create_op_node_from_desc(op_node.op())
-                for input_node in op_node.inputs:
-                    graph.link_to(input_node, quant_node)
-                for output_node in op_node.outputs:
-                    graph.link_to(quant_node, output_node)
-                graph.safe_remove_nodes(op_node)
-            if name in self._fake_dequant_op_names:
-                op_node.set_type('dequantize')
-                dequant_node = graph.create_op_node_from_desc(op_node.op())
-                for input_node in op_node.inputs:
-                    graph.link_to(input_node, dequant_node)
-                for output_node in op_node.outputs:
-                    graph.link_to(dequant_node, output_node)
-                graph.safe_remove_nodes(op_node)
-        graph.resolve_hazard()
-        return graph
-
-
-class ScaleForTrainingPass(object):
-    def __init__(self, scope=None, place=None, moving_rate=0.9):
-        """
-        This pass is used for calculating output scales of some operators.
-        These output scales may be used by tensorRT or some other inference engines.
-
-        Args:
-            scope(fluid.Scope): The scope is used to initialize these new parameters.
-            place(fluid.CPUPlace|fluid.CUDAPlace): The place is used to initialize new parameters.
-            moving_rate(float): The decay coefficient of moving average. The default value is 0.9.
-        """
-        self._scope = scope
-        self._place = place
-        self._moving_rate = moving_rate
-        self._is_test = None
-        self._teller_set = _out_scale_op_list
-
-    def apply(self, graph):
-        """
-        Insert the `moving_average_abs_max_scale` op in order to calculate output scales
-        of operators in the teller_set.
-
-        Args:
-            graph(IrGraph): the target graph.
-        """
-        assert isinstance(graph,
-                          IrGraph), 'graph must be the instance of IrGraph.'
-        self._is_test = graph.is_test()
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            name = op_node.name()
-            if name in self._teller_set:
-                if len(op_node.output_arg_names()) != 1:
-                    continue
-                in_node = graph._find_node_by_name(
-                    op_node.outputs, op_node.output_arg_names()[0])
-                out_node = graph.create_var_node_from_desc(in_node.var())
-                scale_node = graph.create_persistable_node(
-                    name=self._scale_name(in_node.name()),
-                    var_type=core.VarDesc.VarType.LOD_TENSOR,
-                    shape=[1],
-                    var_dtype=in_node.dtype())
-                ins = {'X': in_node}
-                outs = {'Out': out_node, 'OutScale': scale_node}
-                if not self._is_test:
-                    state_in_node = graph.create_persistable_node(
-                        name=unique_name.generate('scale_state@'),
-                        var_type=core.VarDesc.VarType.LOD_TENSOR,
-                        var_dtype=in_node.dtype(),
-                        shape=[1])
-                    data_type = 'float64' if in_node.dtype(
-                    ) == core.VarDesc.VarType.FP64 else 'float32'
-                    _init_var_node(
-                        state_in_node,
-                        np.ones(
-                            [1], dtype=data_type),
-                        self._scope,
-                        self._place)
-                    accum_in_node = graph.create_persistable_node(
-                        name=unique_name.generate('scale_accum@'),
-                        var_type=core.VarDesc.VarType.LOD_TENSOR,
-                        var_dtype=in_node.dtype(),
-                        shape=[1])
-                    _init_var_node(
-                        accum_in_node,
-                        np.ones(
-                            [1], dtype=data_type),
-                        self._scope,
-                        self._place)
-                    state_out_node = graph.create_var_node_from_desc(
-                        state_in_node.var())
-                    accum_out_node = graph.create_var_node_from_desc(
-                        accum_in_node.var())
-
-                    ins['InState'] = state_in_node
-                    ins['InAccum'] = accum_in_node
-                    outs['OutState'] = state_out_node
-                    outs['OutAccum'] = accum_out_node
-
-                attrs = {
-                    'moving_rate': self._moving_rate,
-                    'is_test': self._is_test,
-                    'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-                }
-                scale_op_node = graph.create_op_node(
-                    op_type='moving_average_abs_max_scale',
-                    attrs=attrs,
-                    inputs=ins,
-                    outputs=outs)
-                graph.link_to(in_node, scale_op_node)
-                graph.link_to(scale_op_node, out_node)
-                graph.link_to(scale_op_node, scale_node)
-                if not self._is_test:
-                    graph.link_to(state_in_node, scale_op_node)
-                    graph.link_to(accum_in_node, scale_op_node)
-                    graph.link_to(scale_op_node, state_out_node)
-                    graph.link_to(scale_op_node, accum_out_node)
-        graph.resolve_hazard()
-        return graph
-
-    def _scale_name(self, var_name):
-        """
-        Return the scale name for the var named `var_name`.
-        """
-        return "%s@scale" % (var_name)
-
-
-class ScaleForInferencePass(object):
-    def __init__(self, scope=None):
-        """
-        This pass is used for setting output scales of some operators.
-        These output scales may be used by tensorRT or some other inference engines.
-
-        Args:
-            scope(fluid.Scope): The scope is used to initialize these new parameters.
-        """
-        self._scope = scope
-        self._teller_set = _out_scale_op_list
-
-    def apply(self, graph):
-        """
-        Get output scales from the scope and set these scales in op_descs
-        of operators in the teller_set.
-
-        Args:
-            graph(IrGraph): the target graph.
-        """
-        assert isinstance(graph,
-                          IrGraph), 'graph must be the instance of IrGraph.'
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            name = op_node.name()
-            if name in self._teller_set:
-                if len(op_node.output_arg_names()) != 1:
-                    continue
-                scale_name = self._scale_name(op_node.output_arg_names()[0])
-                scale_v = np.array(
-                    self._scope.find_var(scale_name).get_tensor())[0]
-                op_node.op()._set_attr("out_scale", float(scale_v))
-        graph.resolve_hazard()
-        return graph
-
-    def _scale_name(self, var_name):
-        """
-        Return the scale name for the var named `var_name`.
-        """
-        return "%s@scale" % (var_name)
-
-
-class AddQuantDequantPass(object):
-    def __init__(self, scope=None, place=None, moving_rate=0.9, quant_bits=8):
-        """
-        This pass is used to add quant_dequant op for some ops, such as the
-        'elementwise_add' and 'average pool2d' op.
-        """
-        self._scope = scope
-        self._place = place
-        self._moving_rate = moving_rate
-        self._quant_bits = quant_bits
-        self._is_test = None
-        self._target_ops = ["elementwise_add", "pool2d"]
-        self._target_grad_ops = ['%s_grad' % (op) for op in self._target_ops]
-
-    def apply(self, graph):
-        """
-        Add quant_dequant before some ops, such as the 'elementwise_add'
-        and 'average pool2d' op.
-        Args:
-            graph(IrGraph): the target graph.
-        """
-        assert isinstance(graph,
-                          IrGraph), 'graph must be the instance of IrGraph.'
-        self._is_test = graph.is_test()
-        dequantized_vars_map = collections.OrderedDict()
-        ops = graph.all_op_nodes()
-
-        for op_node in ops:
-            if op_node.name() in self._target_ops:
-                in_nodes_all_not_persistable = True
-                for input_name in op_node.input_arg_names():
-                    in_node = graph._find_node_by_name(op_node.inputs,
-                                                       input_name)
-                    in_nodes_all_not_persistable = (
-                        in_nodes_all_not_persistable and
-                        not in_node.persistable())
-                if not in_nodes_all_not_persistable:
-                    continue
-
-                if op_node.op().has_attr("pooling_type") and \
-                    op_node.op().attr("pooling_type") == 'max':
-                    continue
-
-                input_names = op_node.input_arg_names()
-                for input_name in input_names:
-                    in_node = graph._find_node_by_name(op_node.inputs,
-                                                       input_name)
-                    quant_var_node, scale_var_node = \
-                        self._inser_quant_dequant_moving_average_abs_max_op(
-                        graph, in_node, self._quant_bits)
-                    dequantized_vars_map[input_name] = quant_var_node
-                    graph.update_input_link(in_node, quant_var_node, op_node)
-
-        for op_node in ops:
-            if op_node.name() in self._target_grad_ops:
-                for input_name in op_node.input_arg_names():
-                    if input_name in dequantized_vars_map:
-                        in_node = graph._find_node_by_name(op_node.inputs,
-                                                           input_name)
-                        dequant_var_node = dequantized_vars_map[input_name]
-                        graph.update_input_link(in_node, dequant_var_node,
-                                                op_node)
-
-        graph.resolve_hazard()
-        return graph
-
-    def _inser_quant_dequant_moving_average_abs_max_op(self, graph, var_node,
-                                                       quant_bits):
-        """Insert fake_quantize_dequantize_moving_average_abs_max op.
-        """
-        quant_var_node = graph.create_var_node(
-            name="{}.quant_dequant".format(var_node.name()),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=var_node.dtype())
-        scale_in_node = graph.create_persistable_node(
-            name="{}.quant_dequant.scale".format(var_node.name()),
-            var_type=core.VarDesc.VarType.LOD_TENSOR,
-            shape=[1],
-            var_dtype=var_node.dtype())
-        data_type = 'float64' if var_node.dtype(
-        ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(
-            scale_in_node,
-            np.array(
-                [0.001], dtype=data_type),
-            self._scope,
-            self._place)
-
-        scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
-        ins = {'X': var_node, 'InScale': scale_in_node}
-        outs = {'Out': quant_var_node, 'OutScale': scale_out_node}
-        if not self._is_test:
-            state_in_node = graph.create_persistable_node(
-                name=unique_name.generate('quant_dequant.state'),
-                var_type=core.VarDesc.VarType.LOD_TENSOR,
-                var_dtype=var_node.dtype(),
-                shape=[1])
-            data_type = 'float64' if var_node.dtype(
-            ) == core.VarDesc.VarType.FP64 else 'float32'
-            _init_var_node(
-                state_in_node,
-                np.ones(
-                    [1], dtype=data_type),
-                self._scope,
-                self._place)
-            accum_in_node = graph.create_persistable_node(
-                name=unique_name.generate('quant_dequant.accum'),
-                var_type=core.VarDesc.VarType.LOD_TENSOR,
-                var_dtype=var_node.dtype(),
-                shape=[1])
-            _init_var_node(
-                accum_in_node,
-                np.ones(
-                    [1], dtype=data_type),
-                self._scope,
-                self._place)
-            state_out_node = graph.create_var_node_from_desc(state_in_node.var(
-            ))
-            accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
-            ))
-
-            ins['InState'] = state_in_node
-            ins['InAccum'] = accum_in_node
-            outs['OutState'] = state_out_node
-            outs['OutAccum'] = accum_out_node
-
-        attrs = {
-            'bit_length': quant_bits,
-            'moving_rate': self._moving_rate,
-            'is_test': self._is_test,
-            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-        }
-
-        quant_op_node = graph.create_op_node(
-            op_type='fake_quantize_dequantize_moving_average_abs_max',
-            attrs=attrs,
-            inputs=ins,
-            outputs=outs)
-
-        graph.link_to(var_node, quant_op_node)
-        graph.link_to(scale_in_node, quant_op_node)
-        graph.link_to(quant_op_node, quant_var_node)
-        graph.link_to(quant_op_node, scale_out_node)
-
-        if not self._is_test:
-            graph.link_to(state_in_node, quant_op_node)
-            graph.link_to(accum_in_node, quant_op_node)
-            graph.link_to(quant_op_node, state_out_node)
-            graph.link_to(quant_op_node, accum_out_node)
-
-        return quant_var_node, scale_out_node
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
deleted file mode 100644
index 5d2b6ea369dedfd1f1437ae626f7f3b3eb6a21a7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import sys
-import numpy as np
-from .... import Executor
-from .... import io
-from .... import core, scope_guard
-from ....compiler import CompiledProgram
-from ....compiler import BuildStrategy
-from ....framework import IrGraph, Variable, Program
-from ....log_helper import get_logger
-from ..core.strategy import Strategy
-from .quantization_pass import *
-
-__all__ = ['QuantizationStrategy']
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class QuantizationStrategy(Strategy):
-    """
-    The strategy for Quantization.
-    """
-
-    def __init__(self,
-                 start_epoch=0,
-                 end_epoch=0,
-                 float_model_save_path=None,
-                 mobile_model_save_path=None,
-                 int8_model_save_path=None,
-                 activation_bits=8,
-                 weight_bits=8,
-                 activation_quantize_type='abs_max',
-                 weight_quantize_type='abs_max',
-                 save_in_nodes=None,
-                 save_out_nodes=None):
-        """
-        Args:
-            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
-            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
-            float_model_save_path(str): The path to save model with float weights.
-                            None means it doesn't save float model. default: None.
-            mobile_model_save_path(str): The path to save model for paddle-mobile execution.
-                            None means it doesn't save mobile model. default: None.
-            int8_model_save_path(str): The path to save model with int8_t weight.
-                            None means it doesn't save int8 model. default: None.
-            activation_bits(int): quantization bit number for activation. default: 8.
-            weight_bits(int): quantization bit number for weights. The bias is not quantized.
-                              default: 8.
-            activation_quantize_type(str): quantization type for activation,
-                now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
-                If use 'abs_max' mode, the quantization scale will be calculated
-                dynamically each step in both training and testing period. If use
-                'range_abs_max', a static quantization scale will be calculated
-                during training and used in inference.
-            weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'.
-            The 'range_abs_max' usually is not used for weight, since weights are fixed once the model is well trained.
-            save_in_nodes(list<str>): A list of variable names used to prune graph
-                                      for saving inference model.
-            save_out_nodes(list<str>): A list of variable names used to prune graph
-                                      for saving inference model.
-
-        """
-        super(QuantizationStrategy, self).__init__(start_epoch, end_epoch)
-        self.start_epoch = start_epoch
-        self.end_epoch = end_epoch
-        self.float_model_save_path = float_model_save_path
-        self.mobile_model_save_path = mobile_model_save_path
-        self.int8_model_save_path = int8_model_save_path
-        self.activation_bits = activation_bits
-        self.weight_bits = weight_bits
-        self.activation_quantize_type = activation_quantize_type
-        self.weight_quantize_type = weight_quantize_type
-        self.save_out_nodes = save_out_nodes
-        self.save_in_nodes = save_in_nodes
-
-    def restore_from_checkpoint(self, context):
-        """
-        Restore graph when the compression task is inited from checkpoint.
-        """
-        # It is inited from checkpoint and has missed start epoch.
-        if context.epoch_id != 0 and context.epoch_id > self.start_epoch:
-            _logger.info("Restore quantization task from checkpoint")
-            self._modify_graph_for_quantization(context)
-            _logger.info("Finish restoring quantization task from checkpoint")
-
-    def _modify_graph_for_quantization(self, context):
-        """
-        Insert fake_quantize_op and fake_dequantize_op before training and testing.
-        """
-        train_ir_graph = IrGraph(
-            core.Graph(context.optimize_graph.program.clone().desc),
-            for_test=False)
-        test_ir_graph = IrGraph(
-            core.Graph(context.eval_graph.program.clone().desc), for_test=True)
-        transform_pass = QuantizationTransformPass(
-            scope=context.scope,
-            place=context.place,
-            weight_bits=self.weight_bits,
-            activation_bits=self.activation_bits,
-            activation_quantize_type=self.activation_quantize_type,
-            weight_quantize_type=self.weight_quantize_type)
-        transform_pass.apply(train_ir_graph)
-        transform_pass.apply(test_ir_graph)
-        # Put persistables created by transform_pass into context.optimize_graph.persistables
-        # for saving checkpoint.
-        program_persistables = set()
-        for var in context.optimize_graph.program.list_vars():
-            if var.persistable:
-                program_persistables.add(var.name)
-
-        program = Program()
-        for var_node in train_ir_graph.all_persistable_nodes():
-            if var_node.name() not in program_persistables:
-                var_desc = var_node.var()
-                var = program.global_block().create_var(
-                    name=var_node.name(),
-                    shape=var_desc.shape(),
-                    dtype=var_desc.dtype(),
-                    type=var_desc.type(),
-                    lod_level=var_desc.lod_level())
-                context.optimize_graph.persistables[var.name] = var
-
-        build_strategy = BuildStrategy()
-        build_strategy.enable_inplace = False
-        build_strategy.memory_optimize = False
-        # for quantization training
-        context.optimize_graph.compiled_graph = CompiledProgram(
-            train_ir_graph.graph).with_data_parallel(
-                loss_name=context.optimize_graph.out_nodes['loss'],
-                build_strategy=build_strategy)
-
-        context.eval_graph.program = test_ir_graph.to_program()
-
-        # for saving inference model after training
-        context.put('quantization_test_ir_graph_backup', test_ir_graph)
-
-    def on_epoch_begin(self, context):
-        """
-        Insert fake_quantize_op and fake_dequantize_op before training and testing.
-        """
-        super(QuantizationStrategy, self).on_epoch_begin(context)
-        if self.start_epoch == context.epoch_id:
-            _logger.info('QuantizationStrategy::on_epoch_begin')
-            self._modify_graph_for_quantization(context)
-            _logger.info('Finish QuantizationStrategy::on_epoch_begin')
-
-    def on_epoch_end(self, context):
-        """
-        Free and save inference model.
-        """
-        super(QuantizationStrategy, self).on_compression_end(context)
-
-        if context.epoch_id == self.end_epoch:
-            _logger.info('QuantizationStrategy::on_epoch_end')
-            test_ir_graph = context.get('quantization_test_ir_graph_backup')
-            # freeze the graph after training
-            freeze_pass = QuantizationFreezePass(
-                scope=context.scope,
-                place=context.place,
-                weight_bits=self.weight_bits,
-                activation_bits=self.activation_bits,
-                weight_quantize_type=self.weight_quantize_type)
-            freeze_pass.apply(test_ir_graph)
-
-            # for other strategies
-            context.eval_graph.program = test_ir_graph.to_program()
-
-            if self.save_out_nodes == None:
-                out_vars = [
-                    context.eval_graph.var(var_name)._var
-                    for var_name in context.eval_graph.out_nodes.values()
-                ]
-            else:
-                out_vars = [
-                    context.eval_graph.var(var_name)._var
-                    for var_name in self.save_out_nodes
-                ]
-
-            if self.save_in_nodes == None:
-                in_vars = list(context.eval_graph.in_nodes.values())
-            else:
-                in_vars = self.save_in_nodes
-
-            # save float model
-            if self.float_model_save_path:
-                executor = Executor(context.place)
-                with scope_guard(context.scope):
-                    io.save_inference_model(
-                        self.float_model_save_path,
-                        in_vars,
-                        out_vars,
-                        executor,
-                        main_program=test_ir_graph.to_program(),
-                        model_filename='model',
-                        params_filename='weights',
-                        export_for_deployment=True)
-
-            # save int8 model
-            if self.int8_model_save_path:
-                convert_int8_pass = ConvertToInt8Pass(
-                    scope=context.scope, place=context.place)
-                convert_int8_pass.apply(test_ir_graph)
-
-                executor = Executor(context.place)
-
-                with scope_guard(context.scope):
-                    io.save_inference_model(
-                        self.int8_model_save_path,
-                        in_vars,
-                        out_vars,
-                        executor,
-                        main_program=test_ir_graph.to_program(),
-                        model_filename='model',
-                        params_filename='weights',
-                        export_for_deployment=True)
-
-            # save mobile model
-            if self.mobile_model_save_path:
-                if not self.int8_model_save_path:
-                    # convert the weights as int8_t type
-                    convert_int8_pass = ConvertToInt8Pass(
-                        scope=context.scope, place=context.place)
-                    convert_int8_pass.apply(test_ir_graph)
-                # make some changes on the graph for the mobile inference
-                mobile_pass = TransformForMobilePass()
-                mobile_pass.apply(test_ir_graph)
-                executor = Executor(context.place)
-                with scope_guard(context.scope):
-                    io.save_inference_model(
-                        self.mobile_model_save_path,
-                        in_vars,
-                        out_vars,
-                        executor,
-                        main_program=test_ir_graph.to_program(),
-                        model_filename='model',
-                        params_filename='weights',
-                        export_for_deployment=True)
-            _logger.info('Finish QuantizationStrategy::on_epoch_end')
diff --git a/python/paddle/fluid/contrib/slim/searcher/__init__.py b/python/paddle/fluid/contrib/slim/searcher/__init__.py
deleted file mode 100644
index 734811e318b25dbf6063bbe11d23fd30cb9a48d2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/searcher/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import controller
-from .controller import *
-
-__all__ = controller.__all__
diff --git a/python/paddle/fluid/contrib/slim/searcher/controller.py b/python/paddle/fluid/contrib/slim/searcher/controller.py
deleted file mode 100644
index 7072dc73746d1172a9626c60ff50adfe8c9e51b9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/searcher/controller.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""The controller used to search hyperparameters or neural architecture"""
-
-import numpy as np
-import copy
-import math
-import logging
-from ....log_helper import get_logger
-
-__all__ = ['EvolutionaryController', 'SAController']
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class EvolutionaryController(object):
-    """Abstract controller for all evolutionary searching method.
-    """
-
-    def __init__(self, *args, **kwargs):
-        pass
-
-    def update(self, tokens, reward):
-        """Update the status of controller according current tokens and reward.
-        Args:
-            tokens(list<int>): A solution of searching task.
-            reward(list<int>): The reward of tokens.
-        """
-        raise NotImplementedError('Abstract method.')
-
-    def reset(self, range_table, constrain_func=None):
-        """Reset the controller.
-        Args:
-            range_table(list<int>): It is used to define the searching space of controller.
-                                    The tokens[i] generated by controller should be in [0, range_table[i]).
-            constrain_func(function): It is used to check whether tokens meet the constraint.
-                                     None means there is no constraint. Default: None.
-        """
-        raise NotImplementedError('Abstract method.')
-
-    def next_tokens(self):
-        """Generate new tokens.
-        """
-        raise NotImplementedError('Abstract method.')
-
-
-class SAController(EvolutionaryController):
-    """Simulated annealing controller."""
-
-    def __init__(self,
-                 range_table=None,
-                 reduce_rate=0.85,
-                 init_temperature=1024,
-                 max_iter_number=300):
-        """Initialize.
-        Args:
-            range_table(list<int>): Range table.
-            reduce_rate(float): The decay rate of temperature.
-            init_temperature(float): Init temperature.
-            max_iter_number(int): max iteration number.
-        """
-        super(SAController, self).__init__()
-        self._range_table = range_table
-        self._reduce_rate = reduce_rate
-        self._init_temperature = init_temperature
-        self._max_iter_number = max_iter_number
-        self._reward = -1
-        self._tokens = None
-        self._max_reward = -1
-        self._best_tokens = None
-        self._iter = 0
-
-    def __getstate__(self):
-        d = {}
-        for key in self.__dict__:
-            if key != "_constrain_func":
-                d[key] = self.__dict__[key]
-        return d
-
-    def reset(self, range_table, init_tokens, constrain_func=None):
-        """
-        Reset the status of current controller.
-        Args:
-            range_table(list<int>): The range of value in each position of tokens generated by current controller. The range of tokens[i] is [0, range_table[i]).
-            init_tokens(list<int>): The initial tokens.
-            constrain_func(function): The callback function used to check whether the tokens meet constraint. None means there is no constraint. Default: None.
-        """
-        self._range_table = range_table
-        self._constrain_func = constrain_func
-        self._tokens = init_tokens
-        self._iter = 0
-
-    def update(self, tokens, reward):
-        """
-        Update the controller according to latest tokens and reward.
-        Args:
-            tokens(list<int>): The tokens generated in last step.
-            reward(float): The reward of tokens.
-        """
-        self._iter += 1
-        temperature = self._init_temperature * self._reduce_rate**self._iter
-        if (reward > self._reward) or (np.random.random() <= math.exp(
-            (reward - self._reward) / temperature)):
-            self._reward = reward
-            self._tokens = tokens
-        if reward > self._max_reward:
-            self._max_reward = reward
-            self._best_tokens = tokens
-        _logger.info("iter: {}; max_reward: {}; best_tokens: {}".format(
-            self._iter, self._max_reward, self._best_tokens))
-        _logger.info("current_reward: {}; current tokens: {}".format(
-            self._reward, self._tokens))
-
-    def next_tokens(self):
-        """
-        Get next tokens.
-        """
-        tokens = self._tokens
-        new_tokens = tokens[:]
-        index = int(len(self._range_table) * np.random.random())
-        new_tokens[index] = (
-            new_tokens[index] + np.random.randint(self._range_table[index] - 1)
-            + 1) % self._range_table[index]
-        _logger.info("change index[{}] from {} to {}".format(index, tokens[
-            index], new_tokens[index]))
-        if self._constrain_func is None:
-            return new_tokens
-        for _ in range(self._max_iter_number):
-            if not self._constrain_func(new_tokens):
-                index = int(len(self._range_table) * np.random.random())
-                new_tokens = tokens[:]
-                new_tokens[index] = np.random.randint(self._range_table[index])
-            else:
-                break
-        return new_tokens
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
deleted file mode 100644
index 71e6c99fb54decd1962b40fa2d985c85f7321c25..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ /dev/null
@@ -1,180 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-function(_inference_analysis_python_api_int8_test target model_dir data_dir filename use_mkldnn)
-    py_test(${target} SRCS ${filename}
-        ENVS CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-             FLAGS_use_mkldnn=${use_mkldnn}
-        ARGS --infer_model ${model_dir}/model
-             --infer_data ${data_dir}/data.bin
-             --int8_model_save_path int8_models/${target}
-             --warmup_batch_size 100
-             --batch_size 50)
-endfunction()
-
-function(inference_analysis_python_api_int8_test target model_dir data_dir filename)
-    _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_dir} ${filename} False)
-endfunction()
-
-function(inference_analysis_python_api_int8_test_mkldnn target model_dir data_dir filename)
-    _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_dir} ${filename} True)
-endfunction()
-
-function(inference_qat_int8_test target model_dir data_dir test_script use_mkldnn)
-    py_test(${target} SRCS ${test_script}
-            ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 FLAGS_use_mkldnn=${use_mkldnn}
-            ARGS --qat_model ${model_dir}/model
-                 --infer_data ${data_dir}/data.bin
-                 --batch_size 25
-                 --batch_num 2
-                 --acc_diff_threshold 0.1)
-endfunction()
-
-function(inference_qat2_int8_test target model_dir data_dir test_script use_mkldnn)
-    py_test(${target} SRCS ${test_script}
-            ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 FLAGS_use_mkldnn=${use_mkldnn}
-            ARGS --qat_model ${model_dir}/float
-                 --infer_data ${data_dir}/data.bin
-                 --batch_size 25
-                 --batch_num 2
-                 --acc_diff_threshold 0.1
-                 --qat2)
-endfunction()
-
-
-if(WIN32)
-    list(REMOVE_ITEM TEST_OPS test_light_nas)
-endif()
-
-# int8 image classification python api test
-if(LINUX AND WITH_MKLDNN)
-  set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
-  set(MKLDNN_INT8_TEST_FILE "test_mkldnn_int8_quantization_strategy.py")
-  set(MKLDNN_INT8_TEST_FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_INT8_TEST_FILE}")
-
-  # googlenet int8
-  set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet")
-  inference_analysis_python_api_int8_test(test_slim_int8_googlenet ${INT8_GOOGLENET_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE_PATH})
-
-  # mobilenet int8
-  set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv1")
-  inference_analysis_python_api_int8_test(test_slim_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE_PATH})
-  inference_analysis_python_api_int8_test_mkldnn(test_slim_int8_mobilenet_mkldnn ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE_PATH})
-
-  # temporarily adding WITH_SLIM_MKLDNN_FULL_TEST FLAG for QA testing the following UTs locally,
-  # since the following UTs cost too much time on CI test.
-  if (WITH_SLIM_MKLDNN_FULL_TEST)
-    # resnet50 int8
-    set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
-    inference_analysis_python_api_int8_test(test_slim_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE_PATH})
-
-    # mobilenetv2 int8
-    set(INT8_MOBILENETV2_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv2")
-    inference_analysis_python_api_int8_test(test_slim_int8_mobilenetv2 ${INT8_MOBILENETV2_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE_PATH})
-
-    # resnet101 int8
-    set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101")
-    inference_analysis_python_api_int8_test(test_slim_int8_resnet101 ${INT8_RESNET101_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE_PATH})
-
-    # vgg16 int8
-    set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16")
-    inference_analysis_python_api_int8_test(test_slim_int8_vgg16 ${INT8_VGG16_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE_PATH})
-
-    # vgg19 int8
-    set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19")
-    inference_analysis_python_api_int8_test(test_slim_int8_vgg19 ${INT8_VGG19_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE_PATH})
-  endif()
-endif()
-
-# Since test_mkldnn_int8_quantization_strategy only supports testing on Linux
-# with MKL-DNN, we remove it here for not repeating test, or not testing on other systems.
-list(REMOVE_ITEM TEST_OPS test_mkldnn_int8_quantization_strategy)
-
-# QAT FP32 & INT8 comparison python api tests
-if(LINUX AND WITH_MKLDNN)
-	set(DATASET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
-	set(QAT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
-	set(QAT_MODELS_BASE_URL "${INFERENCE_URL}/int8/QAT_models")
-	set(MKLDNN_QAT_TEST_FILE "qat_int8_comparison.py")
-	set(MKLDNN_QAT_TEST_FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_QAT_TEST_FILE}")
-
-	# ImageNet small dataset
-	# May be already downloaded for INT8v2 unit tests
-	if (NOT EXISTS ${DATASET_DIR})
-		inference_download_and_uncompress(${DATASET_DIR} "${INFERENCE_URL}/int8" "imagenet_val_100_tail.tar.gz")
-	endif()
-
-	# QAT ResNet50
-	set(QAT_RESNET50_MODEL_DIR "${QAT_DATA_DIR}/ResNet50_QAT")
-	if (NOT EXISTS ${QAT_RESNET50_MODEL_DIR})
-		inference_download_and_uncompress(${QAT_RESNET50_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "ResNet50_qat_model.tar.gz" )
-	endif()
-	inference_qat_int8_test(test_qat_int8_resnet50_mkldnn ${QAT_RESNET50_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
-
-	# QAT ResNet101
-	set(QAT_RESNET101_MODEL_DIR "${QAT_DATA_DIR}/ResNet101_QAT")
-	if (NOT EXISTS ${QAT_RESNET101_MODEL_DIR})
-		inference_download_and_uncompress(${QAT_RESNET101_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "ResNet101_qat_model.tar.gz" )
-	endif()
-	inference_qat_int8_test(test_qat_int8_resnet101_mkldnn ${QAT_RESNET101_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
-
-	# QAT GoogleNet
-	set(QAT_GOOGLENET_MODEL_DIR "${QAT_DATA_DIR}/GoogleNet_QAT")
-	if (NOT EXISTS ${QAT_GOOGLENET_MODEL_DIR})
-		inference_download_and_uncompress(${QAT_GOOGLENET_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "GoogleNet_qat_model.tar.gz" )
-	endif()
-	inference_qat_int8_test(test_qat_int8_googlenet_mkldnn ${QAT_GOOGLENET_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
-
-	# QAT MobileNetV1
-	set(QAT_MOBILENETV1_MODEL_DIR "${QAT_DATA_DIR}/MobileNetV1_QAT")
-	if (NOT EXISTS ${QAT_MOBILENETV1_MODEL_DIR})
-		inference_download_and_uncompress(${QAT_MOBILENETV1_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "MobileNetV1_qat_model.tar.gz" )
-	endif()
-	inference_qat_int8_test(test_qat_int8_mobilenetv1_mkldnn ${QAT_MOBILENETV1_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
-
-	# QAT MobileNetV2
-	set(QAT_MOBILENETV2_MODEL_DIR "${QAT_DATA_DIR}/MobileNetV2_QAT")
-	if (NOT EXISTS ${QAT_MOBILENETV2_MODEL_DIR})
-		inference_download_and_uncompress(${QAT_MOBILENETV2_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "MobileNetV2_qat_model.tar.gz" )
-	endif()
-	inference_qat_int8_test(test_qat_int8_mobilenetv2_mkldnn ${QAT_MOBILENETV2_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
-
-	# QAT VGG16
-	set(QAT_VGG16_MODEL_DIR "${QAT_DATA_DIR}/VGG16_QAT")
-	if (NOT EXISTS ${QAT_VGG16_MODEL_DIR})
-		inference_download_and_uncompress(${QAT_VGG16_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "VGG16_qat_model.tar.gz" )
-	endif()
-	inference_qat_int8_test(test_qat_int8_vgg16_mkldnn ${QAT_VGG16_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
-
-	# QAT VGG19
-	set(QAT_VGG19_MODEL_DIR "${QAT_DATA_DIR}/VGG19_QAT")
-	if (NOT EXISTS ${QAT_VGG19_MODEL_DIR})
-		inference_download_and_uncompress(${QAT_VGG19_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "VGG19_qat_model.tar.gz" )
-	endif()
-	inference_qat_int8_test(test_qat_int8_vgg19_mkldnn ${QAT_VGG19_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
-  
-        set(QAT2_RESNET50_MODEL_DIR "${QAT_DATA_DIR}/ResNet50_qat_perf")
-        if (NOT EXISTS ${QAT2_RESNET50_MODEL_DIR})
-                inference_download_and_uncompress(${QAT2_RESNET50_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "ResNet50_qat_perf.tar.gz" )
-        endif()
-        inference_qat2_int8_test(test_qat2_int8_resnet50_mkldnn ${QAT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
-
-        set(QAT2_MOBILENETV1_MODEL_DIR "${QAT_DATA_DIR}/MobileNet_qat_perf")
-        if (NOT EXISTS ${QAT2_MOBILENETV1_MODEL_DIR})
-                inference_download_and_uncompress(${QAT2_MOBILENETV1_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "MobileNet_qat_perf.tar.gz" )
-        endif()
-        inference_qat2_int8_test(test_qat2_int8_mobilenetv1_mkldnn ${QAT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
-
-endif()
-
-# Since the test for QAT FP32 & INT8 comparison supports only testing on Linux 
-# with MKL-DNN, we remove it here to not test it on other systems.
-list(REMOVE_ITEM TEST_OPS qat_int8_comparison.py)
-
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
diff --git a/python/paddle/fluid/contrib/slim/tests/QAT_mkldnn_int8_readme.md b/python/paddle/fluid/contrib/slim/tests/QAT_mkldnn_int8_readme.md
deleted file mode 100644
index 15b701a0f2fa9233661408697a678984fa5c0eb1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/QAT_mkldnn_int8_readme.md
+++ /dev/null
@@ -1,76 +0,0 @@
-# SLIM Quantization-aware training (QAT) on INT8 MKL-DNN
-
-This document describes how to use [Paddle Slim](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/paddle_slim/paddle_slim.md) to convert a quantization-aware trained model to an INT8 MKL-DNN runnable model which has almost the same accuracy as QAT on GoogleNet, MobileNet-V1, MobileNet-V2, ResNet-101, ResNet-50, VGG16 and VGG19. We provide the accuracy results compared with fake QAT accuracy by running the QAT trained model with MKL-DNN int8 kernel on above 7 models.
-
-## 0. Prerequisite
-You need to install at least PaddlePaddle-1.5 python package `pip install paddlepaddle==1.5`.
-
-## 1. How to generate INT8 MKL-DNN QAT model
-You can refer to the unit test in [test_quantization_mkldnn_pass.py](test_quantization_mkldnn_pass.py). Users firstly use PaddleSlim quantization strategy to get a saved fake QAT model by [QuantizationFreezePass](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api), then use the `FakeQAT2MkldnnINT8KernelPass` to get the graph which can be run with MKL-DNN INT8 kernel. In Paddle Release 1.5, this pass only supports `conv2d` and `depthwise_conv2d` with channel-wise quantization for weights.
-
-```python
-    import paddle.fluid as fluid
-    from paddle.fluid.contrib.slim.quantization import FakeQAT2MkldnnINT8KernelPass
-    from paddle.fluid.framework import IrGraph
-    from paddle.fluid import core	
-    
-    # Create the IrGraph by Program
-    graph = IrGraph(core.Graph(fluid.Program().desc), for_test=False)
-    place = fluid.CPUPlace()
-    # Convert the IrGraph to MKL-DNN supported INT8 IrGraph by using
-    # FakeQAT2MkldnnINT8KernelPass
-    mkldnn_pass = FakeQAT2MkldnnINT8KernelPass(fluid.global_scope(), place)
-    # Apply FakeQAT2MkldnnINT8KernelPass to IrGraph
-    mkldnn_pass.apply(graph)
-```
-
-## 2. Accuracy benchmark
-
->**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
-
-| Model        | Fake QAT Top1 Accuracy | Fake QAT Top5 Accuracy |MKL-DNN INT8 Top1 Accuracy |  Top1 Diff   | MKL-DNN INT8 Top5 Accuracy | Top5 Diff  |
-| :----------: | :--------------------: | :--------------------: |:-----------------------:  | :----------: | :------------------------: | :--------: |
-| GoogleNet    |         70.40%         |          89.46%        |           70.39%          |     0.010%   |           89.46%           |   0.000%   |
-| MobileNet-V1 |         70.83%         |          89.56%        |           70.84%          |    -0.010%   |           89.56%           |   0.000%   |
-| MobileNet-V2 |         72.17%         |          90.67%        |           72.13%          |     0.040%   |           90.67%           |   0.000%   |
-| ResNet-101   |         77.49%         |          93.65%        |           77.51%          |    -0.020%   |           93.67%           |  -0.020%   |
-| ResNet-50    |         76.62%         |          93.08%        |           76.61%          |     0.010%   |           93.09%           |  -0.010%   |
-| VGG16        |         72.71%         |          91.11%        |           72.69%          |     0.020%   |           91.09%           |   0.020%   |
-| VGG19        |         73.37%         |          91.40%        |           73.37%          |     0.000%   |           91.41%           |  -0.010%   |
-
-Notes:
-
-* MKL-DNN and MKL are required.
-
-## 3. How to reproduce the results
-Three steps to reproduce the above-mentioned accuracy results, and we take ResNet50 benchmark as an example:
- * ### Prepare dataset
-```bash
-cd /PATH/TO/PADDLE
-python paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
-```
-The converted data binary file is saved by default in `~/.cache/paddle/dataset/int8/download/int8_full_val.bin`
- * ### Prepare model
-You can run the following commands to download ResNet50 model.
-
-```bash
-mkdir -p /PATH/TO/DOWNLOAD/MODEL/
-cd /PATH/TO/DOWNLOAD/MODEL/
-export MODEL_NAME=ResNet50
-wget http://paddle-inference-dist.bj.bcebos.com/int8/QAT_models/${MODEL_NAME}_qat_model.tar.gz
-mkdir -p ${MODEL_NAME}
-tar -xvf ${MODEL_NAME}_qat_model.tar.gz -C ${MODEL_NAME}
-```
-
-To download and verify all the 7 models, you need to set `MODEL_NAME` to one of the following values in command line:
-
-```text
-MODEL_NAME=ResNet50, ResNet101, GoogleNet, MobileNetV1, MobileNetV2, VGG16, VGG19
-```
-* ### Commands to reproduce benchmark
-You can run `qat_int8_comparison.py` with the following arguments to reproduce the accuracy result on ResNet50.
-
-```bash
-OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py --qat_model=/PATH/TO/DOWNLOAD/MODEL/${MODEL_NAME}/model --infer_data=~/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=50 --batch_num=1000 --acc_diff_threshold=0.001
-```
-> Notes: The above commands will cost maybe several hours in the prediction stage (include int8 prediction and fp32 prediction) since there have 50000 pictures need to be predicted in `int8_full_val.bin`. User can set `OMP_NUM_THREADS` to the max number of physical cores of the used server to accelerate the process.
diff --git a/python/paddle/fluid/contrib/slim/tests/__init__.py b/python/paddle/fluid/contrib/slim/tests/__init__.py
deleted file mode 100644
index 6d41233e227dc7bab94ee4284cc25e12b45bf469..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/fluid/contrib/slim/tests/auto_pruning/compress.yaml b/python/paddle/fluid/contrib/slim/tests/auto_pruning/compress.yaml
deleted file mode 100644
index 8f0ab5fbddf351ee109dc7cbd3dc6e672857aecf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/auto_pruning/compress.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-version: 1.0
-pruners:
-    pruner_1:
-        class: 'StructurePruner'
-        pruning_axis:
-            '*': 0
-        criterions:
-            '*': 'l1_norm'
-controllers:
-    sa_controller:
-        class: 'SAController'
-        reduce_rate: 0.9
-        init_temperature: 1024
-        max_iter_number: 300
-strategies:
-    auto_pruning_strategy:
-        class: 'AutoPruneStrategy'
-        pruner: 'pruner_1'
-        controller: 'sa_controller'
-        start_epoch: 0
-        end_epoch: 2
-        max_ratio: 0.7
-        min_ratio: 0.5
-        pruned_params: '.*_sep_weights'
-        metric_name: 'acc_top5'
-compressor:
-    epoch: 2
-    checkpoint_path: './checkpoints_auto_pruning/'
-    strategies:
-        - auto_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/compress.yaml b/python/paddle/fluid/contrib/slim/tests/configs/compress.yaml
deleted file mode 100644
index 604cdf3f447ae0ed17700fe53f1daf6ded77399a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/compress.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-version: 1.0
-compressor:
-    epoch: 1 
-    checkpoint_path: './checkpoints/'
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
deleted file mode 100644
index 570c60026d55c242106f7e2dc5c3f47bfbdbe884..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-#start_epoch:         The 'on_epoch_begin' function will be called in start_epoch. default: 0.
-#end_epoch:           The 'on_epoch_end' function will be called in end_epoch. default: 10.
-#delta_rate:          The delta used to generate ratios when calculating sensitivities.
-#target_ratio:        The flops ratio to be pruned from current model.
-#metric_name:         The metric used to evaluate the model.
-#pruned_params:       The pattern str to match the parameter names to be pruned.
-#sensitivities_file:  The sensitivities file.
-#num_steps:           The number of pruning steps.
-#eval_rate:           The rate of sampled data used to calculate sensitivities.
-version: 1.0
-pruners:
-    pruner_1:
-        class: 'StructurePruner'
-        pruning_axis:
-            '*': 0
-        criterions:
-            '*': 'l1_norm'
-strategies:
-    sensitive_pruning_strategy:
-        class: 'SensitivePruneStrategy'
-        pruner: 'pruner_1'
-        start_epoch: 0
-        delta_rate: 0.1
-        target_ratio: 0.3
-        num_steps: 1
-        eval_rate: 0.5
-        pruned_params: '.*_sep_weights'
-        sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
-        metric_name: 'acc_top1'
-compressor:
-    epoch: 120
-    checkpoint_path: './checkpoints/'
-    strategies:
-        - sensitive_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml b/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml
deleted file mode 100644
index 0d3d10b8651eb3767b24a6723311739e013df42a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml
+++ /dev/null
@@ -1,53 +0,0 @@
-#start_epoch(int): The epoch when to merge student graph and teacher graph for
-#                  distillation training. default: 0
-#
-#end_epoch(int): The epoch when to finish distillation training. default: 0
-#
-#student_feature_map(str): The name of feature map from student network.
-#
-#teacher_feature_map(str): The name of feature map from teacher network.
-#                          It's shape should be the same with student network.
-#
-#student_pairs(list<tuple>): Each tuple, with two variable names, in student_pairs indicates
-#                            a section in student network. The variables in a tuple should
-#                            have the same feature map size.
-#
-#teacher_pairs(list<tuple>): Each tuple, with two variable names, in teacher_pairs indicates
-#                            a section in teacher network. The variables in a tuple should
-#                            have the same feature map size. Varibale named teacher_pairs[i][j]
-#                            should has the save channel number with that of variable named 
-#                            student_pairs[i][j].
-#
-#distillation_loss_weight(float): The weight of the loss.
-version: 1.0
-distillers:
-    fsp_distiller:
-        class: 'FSPDistiller'
-#        teacher_pairs: [['teacher_depthwise_conv2d_1.tmp_0', 'teacher_conv2d_3.tmp_0']]
-#        student_pairs: [['student_depthwise_conv2d_1.tmp_0', 'student_conv2d_3.tmp_0']]
-        teacher_pairs: [['teacher_conv2_1_dw.tmp_0', 'teacher_conv1.tmp_0']]
-        student_pairs: [['student_conv2_1_dw.tmp_0', 'student_conv1.tmp_0']]
-        distillation_loss_weight: 1
-    l2_distiller:
-        class: 'L2Distiller'
-        teacher_feature_map: 'teacher.tmp_1'
-        student_feature_map: 'student.tmp_1'
-        distillation_loss_weight: 1
-    soft_label_distiller:
-        class: 'SoftLabelDistiller'
-        student_temperature: 1.0
-        teacher_temperature: 1.0 
-        teacher_feature_map: 'teacher.tmp_2'
-        student_feature_map: 'student.tmp_2'
-        distillation_loss_weight: 0.001
-strategies:
-    distillation_strategy:
-        class: 'DistillationStrategy'
-        distillers: ['fsp_distiller', 'l2_distiller', 'soft_label_distiller']
-        start_epoch: 0
-        end_epoch: 1
-compressor:
-    epoch: 1
-    checkpoint_path: './distillation_checkpoints/'
-    strategies:
-        - distillation_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
deleted file mode 100644
index b21a36263727f39f7eb13778d9b326dd045d9627..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-#start_epoch:         The 'on_epoch_begin' function will be called in start_epoch. default: 0.
-#end_epoch:           The 'on_epoch_end' function will be called in end_epoch. default: 10.
-#delta_rate:          The delta used to generate ratios when calculating sensitivities.
-#target_ratio:        The flops ratio to be pruned from current model.
-#metric_name:         The metric used to evaluate the model.
-#pruned_params:       The pattern str to match the parameter names to be pruned.
-#sensitivities_file:  The sensitivities file.
-#num_steps:           The number of pruning steps.
-#eval_rate:           The rate of sampled data used to calculate sensitivities.
-version: 1.0
-pruners:
-    pruner_1:
-        class: 'StructurePruner'
-        pruning_axis:
-            '*': 0
-        criterions:
-            '*': 'l1_norm'
-strategies:
-    sensitive_pruning_strategy:
-        class: 'SensitivePruneStrategy'
-        pruner: 'pruner_1'
-        start_epoch: 1
-        delta_rate: 0.2
-        target_ratio: 0.08
-        num_steps: 1
-        eval_rate: 0.5
-        pruned_params: 'conv6_sep_weights'
-        sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
-        metric_name: 'acc_top1'
-compressor:
-    epoch: 1
-    checkpoint_path: './checkpoints_pruning/'
-    strategies:
-        - sensitive_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore.yaml b/python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore.yaml
deleted file mode 100644
index 9e437aedc9d2427394fb697ca1898baffb00a109..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-version: 1.0
-pruners:
-    pruner_1:
-        class: 'StructurePruner'
-        pruning_axis:
-            '*': 0
-        criterions:
-            '*': 'l1_norm'
-strategies:
-    uniform_pruning_strategy:
-        class: 'UniformPruneStrategy'
-        pruner: 'pruner_1'
-        start_epoch: 0
-        target_ratio: 0.5
-        pruned_params: 'conv.*'
-        metric_name: 'acc_top1'
-compressor:
-    epoch: 2
-    checkpoint_path: './checkpoints_uniform_restore_tmp/'
-    strategies:
-        - uniform_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore_0.yaml b/python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore_0.yaml
deleted file mode 100644
index 49f104f98f3854ee831ebbea1ff6fa9c7817a15b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore_0.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-version: 1.0
-pruners:
-    pruner_1:
-        class: 'StructurePruner'
-        pruning_axis:
-            '*': 0
-        criterions:
-            '*': 'l1_norm'
-strategies:
-    uniform_pruning_strategy:
-        class: 'UniformPruneStrategy'
-        pruner: 'pruner_1'
-        start_epoch: 0
-        target_ratio: 0.5
-        pruned_params: 'conv.*'
-        metric_name: 'acc_top1'
-compressor:
-    epoch: 1
-    checkpoint_path: './checkpoints_uniform_restore/'
-    strategies:
-        - uniform_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore_1.yaml b/python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore_1.yaml
deleted file mode 100644
index 82e6793aff97d261a83d88dbc077e76e652e1fe1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore_1.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-version: 1.0
-pruners:
-    pruner_1:
-        class: 'StructurePruner'
-        pruning_axis:
-            '*': 0
-        criterions:
-            '*': 'l1_norm'
-strategies:
-    uniform_pruning_strategy:
-        class: 'UniformPruneStrategy'
-        pruner: 'pruner_1'
-        start_epoch: 0
-        target_ratio: 0.5
-        pruned_params: 'conv.*'
-        metric_name: 'acc_top1'
-compressor:
-    epoch: 2
-    checkpoint_path: './checkpoints_uniform_restore/'
-    strategies:
-        - uniform_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/light_nas/compress.yaml b/python/paddle/fluid/contrib/slim/tests/light_nas/compress.yaml
deleted file mode 100644
index d75b4c6f67f8ebb04b30ca96bac7f9a35fb50cc3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/light_nas/compress.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-version: 1.0
-controllers:
-    sa_controller:
-        class: 'SAController'
-        reduce_rate: 0.9
-        init_temperature: 1024
-        max_iter_number: 300
-strategies:
-    light_nas_strategy:
-        class: 'LightNASStrategy'
-        controller: 'sa_controller'
-        target_flops: 629145600
-        target_latency: 1
-        end_epoch: 2
-        retrain_epoch: 1
-        metric_name: 'acc_top1'
-        is_server: 1
-        max_client_num: 100
-        search_steps: 2
-compressor:
-    epoch: 2
-    strategies:
-        - light_nas_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/light_nas/light_nas_space.py b/python/paddle/fluid/contrib/slim/tests/light_nas/light_nas_space.py
deleted file mode 100644
index 082ee7dde4a58e604a9254754d58d63359218e26..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/light_nas/light_nas_space.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.fluid.contrib.slim.nas import SearchSpace
-from light_nasnet import LightNASNet
-import paddle.fluid as fluid
-import paddle
-import json
-import random
-
-total_images = 1281167
-lr = 0.1
-num_epochs = 1
-batch_size = 256
-lr_strategy = "cosine_decay"
-l2_decay = 4e-5
-momentum_rate = 0.9
-image_shape = [1, 28, 28]
-
-__all__ = ['LightNASSpace']
-
-NAS_FILTER_SIZE = [[18, 24, 30], [24, 32, 40], [48, 64, 80], [72, 96, 120],
-                   [120, 160, 192]]
-NAS_LAYERS_NUMBER = [[1, 2, 3], [2, 3, 4], [3, 4, 5], [2, 3, 4], [2, 3, 4]]
-NAS_KERNEL_SIZE = [3, 5]
-NAS_FILTERS_MULTIPLIER = [3, 4, 5, 6]
-NAS_SHORTCUT = [0, 1]
-NAS_SE = [0, 1]
-
-
-def get_bottleneck_params_list(var):
-    """Get bottleneck_params_list from var.
-    Args:
-        var: list, variable list.
-    Returns:
-        list, bottleneck_params_list.
-    """
-    params_list = [
-        1, 16, 1, 1, 3, 1, 0, \
-        6, 24, 2, 2, 3, 1, 0, \
-        6, 32, 3, 2, 3, 1, 0, \
-        6, 64, 4, 2, 3, 1, 0, \
-        6, 96, 3, 1, 3, 1, 0, \
-        6, 160, 3, 2, 3, 1, 0, \
-        6, 320, 1, 1, 3, 1, 0, \
-    ]
-    for i in range(5):
-        params_list[i * 7 + 7] = NAS_FILTERS_MULTIPLIER[var[i * 6]]
-        params_list[i * 7 + 8] = NAS_FILTER_SIZE[i][var[i * 6 + 1]]
-        params_list[i * 7 + 9] = NAS_LAYERS_NUMBER[i][var[i * 6 + 2]]
-        params_list[i * 7 + 11] = NAS_KERNEL_SIZE[var[i * 6 + 3]]
-        params_list[i * 7 + 12] = NAS_SHORTCUT[var[i * 6 + 4]]
-        params_list[i * 7 + 13] = NAS_SE[var[i * 6 + 5]]
-    return params_list
-
-
-class LightNASSpace(SearchSpace):
-    def __init__(self):
-        super(LightNASSpace, self).__init__()
-
-    def init_tokens(self):
-        """Get init tokens in search space.
-        """
-        return [
-            0, 1, 2, 0, 1, 0, 0, 2, 1, 1, 1, 0, 3, 2, 0, 1, 1, 0, 3, 1, 0, 0, 1,
-            0, 3, 2, 2, 1, 1, 0
-        ]
-
-    def range_table(self):
-        """Get range table of current search space.
-        """
-        # [NAS_FILTER_SIZE, NAS_LAYERS_NUMBER, NAS_KERNEL_SIZE, NAS_FILTERS_MULTIPLIER, NAS_SHORTCUT, NAS_SE]
-        return [
-            4, 3, 3, 2, 2, 2, 4, 3, 3, 2, 2, 2, 4, 3, 3, 2, 2, 2, 4, 3, 3, 2, 2,
-            2, 4, 3, 3, 2, 2, 2
-        ]
-
-    def get_model_latency(self, program):
-        """Get model latency according to program.
-        Returns a random number since it's only for testing.
-        Args:
-            program(Program): The program to get latency.
-        Return:
-            (float): model latency.
-        """
-        return random.randint(1, 2)
-
-    def create_net(self, tokens=None):
-        """Create a network for training by tokens.
-        """
-        if tokens is None:
-            tokens = self.init_tokens()
-
-        bottleneck_params_list = get_bottleneck_params_list(tokens)
-
-        startup_prog = fluid.Program()
-        train_prog = fluid.Program()
-        test_prog = fluid.Program()
-        train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
-            is_train=True,
-            main_prog=train_prog,
-            startup_prog=startup_prog,
-            bottleneck_params_list=bottleneck_params_list)
-        test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
-            is_train=False,
-            main_prog=test_prog,
-            startup_prog=startup_prog,
-            bottleneck_params_list=bottleneck_params_list)
-        test_prog = test_prog.clone(for_test=True)
-        train_batch_size = batch_size / 1
-        test_batch_size = batch_size
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(),
-            batch_size=train_batch_size,
-            drop_last=True)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=test_batch_size)
-
-        with fluid.program_guard(train_prog, startup_prog):
-            train_py_reader.decorate_paddle_reader(train_reader)
-
-        with fluid.program_guard(test_prog, startup_prog):
-            test_py_reader.decorate_paddle_reader(test_reader)
-        return startup_prog, train_prog, test_prog, (
-            train_cost, train_acc1, train_acc5,
-            global_lr), (test_cost, test_acc1,
-                         test_acc5), train_py_reader, test_py_reader
-
-
-def build_program(is_train,
-                  main_prog,
-                  startup_prog,
-                  bottleneck_params_list=None):
-    with fluid.program_guard(main_prog, startup_prog):
-        py_reader = fluid.layers.py_reader(
-            capacity=16,
-            shapes=[[-1] + image_shape, [-1, 1]],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            use_double_buffer=False)
-        with fluid.unique_name.guard():
-            image, label = fluid.layers.read_file(py_reader)
-            model = LightNASNet()
-            avg_cost, acc_top1, acc_top5 = net_config(
-                image,
-                label,
-                model,
-                class_dim=10,
-                bottleneck_params_list=bottleneck_params_list,
-                scale_loss=1.0)
-
-            avg_cost.persistable = True
-            acc_top1.persistable = True
-            acc_top5.persistable = True
-            if is_train:
-                params = model.params
-                params["total_images"] = total_images
-                params["lr"] = lr
-                params["num_epochs"] = num_epochs
-                params["learning_strategy"]["batch_size"] = batch_size
-                params["learning_strategy"]["name"] = lr_strategy
-                params["l2_decay"] = l2_decay
-                params["momentum_rate"] = momentum_rate
-                optimizer = optimizer_setting(params)
-                optimizer.minimize(avg_cost)
-                global_lr = optimizer._global_learning_rate()
-
-        if is_train:
-            return py_reader, avg_cost, acc_top1, acc_top5, global_lr
-        else:
-            return py_reader, avg_cost, acc_top1, acc_top5
-
-
-def net_config(image,
-               label,
-               model,
-               class_dim=1000,
-               bottleneck_params_list=None,
-               scale_loss=1.0):
-    bottleneck_params_list = [
-        bottleneck_params_list[i:i + 7]
-        for i in range(0, len(bottleneck_params_list), 7)
-    ]
-    out = model.net(input=image,
-                    bottleneck_params_list=bottleneck_params_list,
-                    class_dim=class_dim)
-    cost, pred = fluid.layers.softmax_with_cross_entropy(
-        out, label, return_softmax=True)
-    if scale_loss > 1:
-        avg_cost = fluid.layers.mean(x=cost) * float(scale_loss)
-    else:
-        avg_cost = fluid.layers.mean(x=cost)
-    acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
-    acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
-    return avg_cost, acc_top1, acc_top5
-
-
-def optimizer_setting(params):
-    """optimizer setting.
-    Args:
-        params: dict, params.
-    """
-    ls = params["learning_strategy"]
-    l2_decay = params["l2_decay"]
-    momentum_rate = params["momentum_rate"]
-    if ls["name"] == "piecewise_decay":
-        if "total_images" not in params:
-            total_images = IMAGENET1000
-        else:
-            total_images = params["total_images"]
-        batch_size = ls["batch_size"]
-        step = int(total_images / batch_size + 1)
-        bd = [step * e for e in ls["epochs"]]
-        base_lr = params["lr"]
-        lr = []
-        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.piecewise_decay(
-                boundaries=bd, values=lr),
-            momentum=momentum_rate,
-            regularization=fluid.regularizer.L2Decay(l2_decay))
-    elif ls["name"] == "cosine_decay":
-        if "total_images" not in params:
-            total_images = IMAGENET1000
-        else:
-            total_images = params["total_images"]
-        batch_size = ls["batch_size"]
-        step = int(total_images / batch_size + 1)
-        lr = params["lr"]
-        num_epochs = params["num_epochs"]
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.cosine_decay(
-                learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
-            momentum=momentum_rate,
-            regularization=fluid.regularizer.L2Decay(l2_decay))
-    elif ls["name"] == "cosine_warmup_decay":
-        if "total_images" not in params:
-            total_images = IMAGENET1000
-        else:
-            total_images = params["total_images"]
-        batch_size = ls["batch_size"]
-        l2_decay = params["l2_decay"]
-        momentum_rate = params["momentum_rate"]
-        step = int(math.ceil(float(total_images) / batch_size))
-        lr = params["lr"]
-        num_epochs = params["num_epochs"]
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=cosine_decay_with_warmup(
-                learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
-            momentum=momentum_rate,
-            regularization=fluid.regularizer.L2Decay(l2_decay))
-    elif ls["name"] == "linear_decay":
-        if "total_images" not in params:
-            total_images = IMAGENET1000
-        else:
-            total_images = params["total_images"]
-        batch_size = ls["batch_size"]
-        num_epochs = params["num_epochs"]
-        start_lr = params["lr"]
-        end_lr = 0
-        total_step = int((total_images / batch_size) * num_epochs)
-        lr = fluid.layers.polynomial_decay(
-            start_lr, total_step, end_lr, power=1)
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=lr,
-            momentum=momentum_rate,
-            regularization=fluid.regularizer.L2Decay(l2_decay))
-    elif ls["name"] == "adam":
-        lr = params["lr"]
-        optimizer = fluid.optimizer.Adam(learning_rate=lr)
-    else:
-        lr = params["lr"]
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=lr,
-            momentum=momentum_rate,
-            regularization=fluid.regularizer.L2Decay(l2_decay))
-    return optimizer
diff --git a/python/paddle/fluid/contrib/slim/tests/light_nas/light_nasnet.py b/python/paddle/fluid/contrib/slim/tests/light_nas/light_nasnet.py
deleted file mode 100644
index 0ac3ac55b587ed1486f228c9dc85d9de96f445ec..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/light_nas/light_nasnet.py
+++ /dev/null
@@ -1,339 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""LightNASNet."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import paddle.fluid as fluid
-from paddle.fluid.param_attr import ParamAttr
-
-__all__ = ['LightNASNet']
-
-train_parameters = {
-    "input_size": [3, 224, 224],
-    "input_mean": [0.485, 0.456, 0.406],
-    "input_std": [0.229, 0.224, 0.225],
-    "learning_strategy": {
-        "name": "piecewise_decay",
-        "batch_size": 256,
-        "epochs": [30, 60, 90],
-        "steps": [0.1, 0.01, 0.001, 0.0001]
-    }
-}
-
-
-class LightNASNet(object):
-    """LightNASNet."""
-
-    def __init__(self):
-        self.params = train_parameters
-
-    def net(self, input, bottleneck_params_list=None, class_dim=1000,
-            scale=1.0):
-        """Build network.
-        Args:
-            input: Variable, input.
-            class_dim: int, class dim.
-            scale: float, scale.
-        Returns:
-            Variable, network output.
-        """
-        if bottleneck_params_list is None:
-            # MobileNetV2
-            # bottleneck_params_list = [
-            #     (1, 16, 1, 1, 3, 1, 0),
-            #     (6, 24, 2, 2, 3, 1, 0),
-            #     (6, 32, 3, 2, 3, 1, 0),
-            #     (6, 64, 4, 2, 3, 1, 0),
-            #     (6, 96, 3, 1, 3, 1, 0),
-            #     (6, 160, 3, 2, 3, 1, 0),
-            #     (6, 320, 1, 1, 3, 1, 0),
-            # ]
-            bottleneck_params_list = [
-                (1, 16, 1, 1, 3, 1, 0),
-                (3, 24, 3, 2, 3, 1, 0),
-                (3, 40, 3, 2, 5, 1, 0),
-                (6, 80, 3, 2, 5, 1, 0),
-                (6, 96, 2, 1, 3, 1, 0),
-                (6, 192, 4, 2, 5, 1, 0),
-                (6, 320, 1, 1, 3, 1, 0),
-            ]
-
-        #conv1
-        input = self.conv_bn_layer(
-            input,
-            num_filters=int(32 * scale),
-            filter_size=3,
-            stride=2,
-            padding=1,
-            if_act=True,
-            name='conv1_1')
-
-        # bottleneck sequences
-        i = 1
-        in_c = int(32 * scale)
-        for layer_setting in bottleneck_params_list:
-            t, c, n, s, k, ifshortcut, ifse = layer_setting
-            i += 1
-            input = self.invresi_blocks(
-                input=input,
-                in_channel=in_c,
-                expansion=t,
-                out_channel=int(c * scale),
-                num_layers=n,
-                stride=s,
-                filter_size=k,
-                shortcut=ifshortcut,
-                squeeze=ifse,
-                name='conv' + str(i))
-            in_c = int(c * scale)
-        #last_conv
-        input = self.conv_bn_layer(
-            input=input,
-            num_filters=int(1280 * scale) if scale > 1.0 else 1280,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            if_act=True,
-            name='conv9')
-
-        input = fluid.layers.pool2d(
-            input=input,
-            pool_size=7,
-            pool_stride=1,
-            pool_type='avg',
-            global_pooling=True)
-
-        output = fluid.layers.fc(input=input,
-                                 size=class_dim,
-                                 param_attr=ParamAttr(name='fc10_weights'),
-                                 bias_attr=ParamAttr(name='fc10_offset'))
-        return output
-
-    def conv_bn_layer(self,
-                      input,
-                      filter_size,
-                      num_filters,
-                      stride,
-                      padding,
-                      num_groups=1,
-                      if_act=True,
-                      name=None,
-                      use_cudnn=True):
-        """Build convolution and batch normalization layers.
-        Args:
-            input: Variable, input.
-            filter_size: int, filter size.
-            num_filters: int, number of filters.
-            stride: int, stride.
-            padding: int, padding.
-            num_groups: int, number of groups.
-            if_act: bool, whether using activation.
-            name: str, name.
-            use_cudnn: bool, whether use cudnn.
-        Returns:
-            Variable, layers output.
-        """
-        conv = fluid.layers.conv2d(
-            input=input,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            groups=num_groups,
-            act=None,
-            use_cudnn=use_cudnn,
-            param_attr=ParamAttr(name=name + '_weights'),
-            bias_attr=False)
-        bn_name = name + '_bn'
-        bn = fluid.layers.batch_norm(
-            input=conv,
-            param_attr=ParamAttr(name=bn_name + "_scale"),
-            bias_attr=ParamAttr(name=bn_name + "_offset"),
-            moving_mean_name=bn_name + '_mean',
-            moving_variance_name=bn_name + '_variance')
-        if if_act:
-            return fluid.layers.relu6(bn)
-        else:
-            return bn
-
-    def shortcut(self, input, data_residual):
-        """Build shortcut layer.
-        Args:
-            input: Variable, input.
-            data_residual: Variable, residual layer.
-        Returns:
-            Variable, layer output.
-        """
-        return fluid.layers.elementwise_add(input, data_residual)
-
-    def squeeze_excitation(self,
-                           input,
-                           num_channels,
-                           reduction_ratio,
-                           name=None):
-        """Build squeeze excitation layers.
-        Args:
-            input: Variable, input.
-            num_channels: int, number of channels.
-            reduction_ratio: float, reduction ratio.
-            name: str, name.
-        Returns:
-            Variable, layers output.
-        """
-        pool = fluid.layers.pool2d(
-            input=input, pool_size=0, pool_type='avg', global_pooling=True)
-        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
-        squeeze = fluid.layers.fc(
-            input=pool,
-            size=num_channels // reduction_ratio,
-            act='relu',
-            param_attr=fluid.param_attr.ParamAttr(
-                initializer=fluid.initializer.Uniform(-stdv, stdv),
-                name=name + '_sqz_weights'),
-            bias_attr=ParamAttr(name=name + '_sqz_offset'))
-        stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
-        excitation = fluid.layers.fc(
-            input=squeeze,
-            size=num_channels,
-            act='sigmoid',
-            param_attr=fluid.param_attr.ParamAttr(
-                initializer=fluid.initializer.Uniform(-stdv, stdv),
-                name=name + '_exc_weights'),
-            bias_attr=ParamAttr(name=name + '_exc_offset'))
-        scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
-        return scale
-
-    def inverted_residual_unit(self,
-                               input,
-                               num_in_filter,
-                               num_filters,
-                               ifshortcut,
-                               ifse,
-                               stride,
-                               filter_size,
-                               expansion_factor,
-                               reduction_ratio=4,
-                               name=None):
-        """Build inverted residual unit.
-        Args:
-            input(Variable): Theinput.
-            num_in_filter(int): The number of input filters.
-            num_filters(int): The number of filters.
-            ifshortcut(bool): Whether to use shortcut.
-            stride(int): The stride.
-            filter_size(int): The filter size.
-            padding(int): The padding.
-            expansion_factor(float): Expansion factor.
-            name(str): The name.
-        Returns:
-            Variable, layers output.
-        """
-        num_expfilter = int(round(num_in_filter * expansion_factor))
-        channel_expand = self.conv_bn_layer(
-            input=input,
-            num_filters=num_expfilter,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            num_groups=1,
-            if_act=True,
-            name=name + '_expand')
-
-        bottleneck_conv = self.conv_bn_layer(
-            input=channel_expand,
-            num_filters=num_expfilter,
-            filter_size=filter_size,
-            stride=stride,
-            padding=int((filter_size - 1) / 2),
-            num_groups=num_expfilter,
-            if_act=True,
-            name=name + '_dwise',
-            use_cudnn=False)
-
-        linear_out = self.conv_bn_layer(
-            input=bottleneck_conv,
-            num_filters=num_filters,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            num_groups=1,
-            if_act=False,
-            name=name + '_linear')
-        out = linear_out
-        if ifshortcut:
-            out = self.shortcut(input=input, data_residual=out)
-        if ifse:
-            scale = self.squeeze_excitation(
-                input=linear_out,
-                num_channels=num_filters,
-                reduction_ratio=reduction_ratio,
-                name=name + '_fc')
-            out = fluid.layers.elementwise_add(x=out, y=scale, act='relu')
-        return out
-
-    def invresi_blocks(self,
-                       input,
-                       in_channel,
-                       expansion,
-                       out_channel,
-                       num_layers,
-                       stride,
-                       filter_size,
-                       shortcut,
-                       squeeze,
-                       name=None):
-        """Build inverted residual blocks.
-        Args:
-            input(Variable): The input feture map.
-            in_channel(int): The number of input channel.
-            expansion(float): Expansion factor.
-            out_channel(int): The number of output channel.
-            num_layers(int): The number of layers.
-            stride(int): The stride.
-            filter_size(int): The size of filter.
-            shortcut(bool): Whether to add shortcut layers.
-            squeeze(bool): Whether to add squeeze excitation layers.
-            name(str): The name.
-        Returns:
-            Variable, layers output.
-        """
-        first_block = self.inverted_residual_unit(
-            input=input,
-            num_in_filter=in_channel,
-            num_filters=out_channel,
-            ifshortcut=False,
-            ifse=squeeze,
-            stride=stride,
-            filter_size=filter_size,
-            expansion_factor=expansion,
-            name=name + '_1')
-
-        last_residual_block = first_block
-        last_c = out_channel
-
-        for i in range(1, num_layers):
-            last_residual_block = self.inverted_residual_unit(
-                input=last_residual_block,
-                num_in_filter=last_c,
-                num_filters=out_channel,
-                ifshortcut=shortcut,
-                ifse=squeeze,
-                stride=1,
-                filter_size=filter_size,
-                expansion_factor=expansion,
-                name=name + '_' + str(i + 1))
-        return last_residual_block
diff --git a/python/paddle/fluid/contrib/slim/tests/mobilenet.py b/python/paddle/fluid/contrib/slim/tests/mobilenet.py
deleted file mode 100644
index f5dbef17e8d4a7c474881d88b6619061a3424177..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/mobilenet.py
+++ /dev/null
@@ -1,215 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import paddle.fluid as fluid
-from paddle.fluid.initializer import MSRA
-from paddle.fluid.param_attr import ParamAttr
-
-__all__ = ['MobileNet']
-
-train_parameters = {
-    "input_size": [3, 224, 224],
-    "input_mean": [0.485, 0.456, 0.406],
-    "input_std": [0.229, 0.224, 0.225],
-    "learning_strategy": {
-        "name": "piecewise_decay",
-        "batch_size": 256,
-        "epochs": [30, 60, 90],
-        "steps": [0.1, 0.01, 0.001, 0.0001]
-    }
-}
-
-
-class MobileNet():
-    def __init__(self, name=""):
-        self.params = train_parameters
-        self.name = name
-
-    def net(self, input, class_dim=1000, scale=1.0):
-        # conv1: 112x112
-        input = self.conv_bn_layer(
-            input,
-            filter_size=3,
-            channels=3,
-            num_filters=int(32 * scale),
-            stride=2,
-            padding=1,
-            name=self.name + "_conv1")
-
-        # 56x56
-        input = self.depthwise_separable(
-            input,
-            num_filters1=32,
-            num_filters2=64,
-            num_groups=32,
-            stride=1,
-            scale=scale,
-            name=self.name + "_conv2_1")
-
-        input = self.depthwise_separable(
-            input,
-            num_filters1=64,
-            num_filters2=128,
-            num_groups=64,
-            stride=2,
-            scale=scale,
-            name=self.name + "_conv2_2")
-
-        # 28x28
-        input = self.depthwise_separable(
-            input,
-            num_filters1=128,
-            num_filters2=128,
-            num_groups=128,
-            stride=1,
-            scale=scale,
-            name=self.name + "_conv3_1")
-
-        input = self.depthwise_separable(
-            input,
-            num_filters1=128,
-            num_filters2=256,
-            num_groups=128,
-            stride=2,
-            scale=scale,
-            name=self.name + "_conv3_2")
-
-        # 14x14
-        input = self.depthwise_separable(
-            input,
-            num_filters1=256,
-            num_filters2=256,
-            num_groups=256,
-            stride=1,
-            scale=scale,
-            name=self.name + "_conv4_1")
-
-        input = self.depthwise_separable(
-            input,
-            num_filters1=256,
-            num_filters2=512,
-            num_groups=256,
-            stride=2,
-            scale=scale,
-            name=self.name + "_conv4_2")
-
-        # 14x14
-        for i in range(5):
-            input = self.depthwise_separable(
-                input,
-                num_filters1=512,
-                num_filters2=512,
-                num_groups=512,
-                stride=1,
-                scale=scale,
-                name=self.name + "_conv5" + "_" + str(i + 1))
-        # 7x7
-        input = self.depthwise_separable(
-            input,
-            num_filters1=512,
-            num_filters2=1024,
-            num_groups=512,
-            stride=2,
-            scale=scale,
-            name=self.name + "_conv5_6")
-
-        input = self.depthwise_separable(
-            input,
-            num_filters1=1024,
-            num_filters2=1024,
-            num_groups=1024,
-            stride=1,
-            scale=scale,
-            name=self.name + "_conv6")
-
-        input = fluid.layers.pool2d(
-            input=input,
-            pool_size=0,
-            pool_stride=1,
-            pool_type='avg',
-            global_pooling=True)
-
-        output = fluid.layers.fc(
-            input=input,
-            size=class_dim,
-            act='softmax',
-            param_attr=ParamAttr(
-                initializer=MSRA(), name=self.name + "_fc7_weights"),
-            bias_attr=ParamAttr(name=self.name + "_fc7_offset"),
-            name=self.name)
-        return output
-
-    def conv_bn_layer(self,
-                      input,
-                      filter_size,
-                      num_filters,
-                      stride,
-                      padding,
-                      channels=None,
-                      num_groups=1,
-                      act='relu',
-                      use_cudnn=True,
-                      name=None):
-        conv = fluid.layers.conv2d(
-            input=input,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            groups=num_groups,
-            act=None,
-            use_cudnn=use_cudnn,
-            param_attr=ParamAttr(
-                initializer=MSRA(), name=name + "_weights"),
-            name=name,
-            bias_attr=False)
-        bn_name = name + "_bn"
-        return fluid.layers.batch_norm(
-            input=conv,
-            act=act,
-            name=name,
-            param_attr=ParamAttr(name=bn_name + "_scale"),
-            bias_attr=ParamAttr(name=bn_name + "_offset"),
-            moving_mean_name=bn_name + '_mean',
-            moving_variance_name=bn_name + '_variance')
-
-    def depthwise_separable(self,
-                            input,
-                            num_filters1,
-                            num_filters2,
-                            num_groups,
-                            stride,
-                            scale,
-                            name=None):
-        depthwise_conv = self.conv_bn_layer(
-            input=input,
-            filter_size=3,
-            num_filters=int(num_filters1 * scale),
-            stride=stride,
-            padding=1,
-            num_groups=int(num_groups * scale),
-            use_cudnn=False,
-            name=name + "_dw")
-
-        pointwise_conv = self.conv_bn_layer(
-            input=depthwise_conv,
-            filter_size=1,
-            num_filters=int(num_filters2 * scale),
-            stride=1,
-            padding=0,
-            name=name + "_sep")
-        return pointwise_conv
diff --git a/python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py b/python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py
deleted file mode 100644
index 9f713450684904dd95c0d18d4018adcc27151a84..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py
+++ /dev/null
@@ -1,338 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import unittest
-import os
-import sys
-import argparse
-import logging
-import struct
-import six
-import numpy as np
-import time
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import FakeQAT2MkldnnINT8KernelPass
-from paddle.fluid.contrib.slim.quantization import FakeQAT2MkldnnINT8PerfPass
-from paddle.fluid import core
-
-logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
-_logger = logging.getLogger(__name__)
-_logger.setLevel(logging.INFO)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--batch_size', type=int, default=1, help='Batch size.')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=0,
-        help='Number of the first minibatches to skip in performance statistics.'
-    )
-    parser.add_argument(
-        '--debug',
-        action='store_true',
-        help='If used, the graph of QAT model is drawn.')
-    parser.add_argument(
-        '--qat_model', type=str, default='', help='A path to a QAT model.')
-    parser.add_argument(
-        '--qat2',
-        action='store_true',
-        help='If used, the QAT model is treated as a second generation model for performance optimization.'
-    )
-    parser.add_argument(
-        '--save_model',
-        action='store_true',
-        help='If used, the QAT model will be saved after all transformations')
-    parser.add_argument('--infer_data', type=str, default='', help='Data file.')
-    parser.add_argument(
-        '--batch_num',
-        type=int,
-        default=1,
-        help='Number of batches to process. 0 or less means all.')
-    parser.add_argument(
-        '--acc_diff_threshold',
-        type=float,
-        default=0.01,
-        help='Accepted accuracy difference threshold.')
-
-    test_args, args = parser.parse_known_args(namespace=unittest)
-
-    return test_args, sys.argv[:1] + args
-
-
-class TestQatInt8Comparison(unittest.TestCase):
-    """
-    Test for accuracy comparison of QAT FP32 and INT8 inference.
-    """
-
-    def _reader_creator(self, data_file='data.bin'):
-        def reader():
-            with open(data_file, 'rb') as fp:
-                num = fp.read(8)
-                num = struct.unpack('q', num)[0]
-                imgs_offset = 8
-                img_ch = 3
-                img_w = 224
-                img_h = 224
-                img_pixel_size = 4
-                img_size = img_ch * img_h * img_w * img_pixel_size
-                label_size = 8
-                labels_offset = imgs_offset + num * img_size
-
-                step = 0
-                while step < num:
-                    fp.seek(imgs_offset + img_size * step)
-                    img = fp.read(img_size)
-                    img = struct.unpack_from(
-                        '{}f'.format(img_ch * img_w * img_h), img)
-                    img = np.array(img)
-                    img.shape = (img_ch, img_w, img_h)
-                    fp.seek(labels_offset + label_size * step)
-                    label = fp.read(label_size)
-                    label = struct.unpack('q', label)[0]
-                    yield img, int(label)
-                    step += 1
-
-        return reader
-
-    def _get_batch_accuracy(self, batch_output=None, labels=None):
-        total = 0
-        correct = 0
-        correct_5 = 0
-        for n, result in enumerate(batch_output):
-            index = result.argsort()
-            top_1_index = index[-1]
-            top_5_index = index[-5:]
-            total += 1
-            if top_1_index == labels[n]:
-                correct += 1
-            if labels[n] in top_5_index:
-                correct_5 += 1
-        acc1 = float(correct) / float(total)
-        acc5 = float(correct_5) / float(total)
-        return acc1, acc5
-
-    def _prepare_for_fp32_mkldnn(self, graph):
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            name = op_node.name()
-            if name in ['depthwise_conv2d']:
-                input_var_node = graph._find_node_by_name(
-                    op_node.inputs, op_node.input("Input")[0])
-                weight_var_node = graph._find_node_by_name(
-                    op_node.inputs, op_node.input("Filter")[0])
-                output_var_node = graph._find_node_by_name(
-                    graph.all_var_nodes(), op_node.output("Output")[0])
-                attrs = {
-                    name: op_node.op().attr(name)
-                    for name in op_node.op().attr_names()
-                }
-
-                conv_op_node = graph.create_op_node(
-                    op_type='conv2d',
-                    attrs=attrs,
-                    inputs={
-                        'Input': input_var_node,
-                        'Filter': weight_var_node
-                    },
-                    outputs={'Output': output_var_node})
-
-                graph.link_to(input_var_node, conv_op_node)
-                graph.link_to(weight_var_node, conv_op_node)
-                graph.link_to(conv_op_node, output_var_node)
-                graph.safe_remove_nodes(op_node)
-
-        return graph
-
-    def _predict(self,
-                 test_reader=None,
-                 model_path=None,
-                 batch_size=1,
-                 batch_num=1,
-                 skip_batch_num=0,
-                 transform_to_int8=False):
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        inference_scope = fluid.executor.global_scope()
-        with fluid.scope_guard(inference_scope):
-            if os.path.exists(os.path.join(model_path, '__model__')):
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(model_path, exe)
-            else:
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(
-                     model_path, exe, 'model', 'params')
-
-            graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
-            if (self._debug):
-                graph.draw('.', 'qat_orig', graph.all_op_nodes())
-            if (transform_to_int8):
-                if (test_case_args.qat2):
-                    transform_to_mkldnn_int8_pass = FakeQAT2MkldnnINT8PerfPass(
-                        _scope=inference_scope,
-                        _place=place,
-                        _core=core,
-                        _debug=self._debug)
-                    graph = transform_to_mkldnn_int8_pass.apply(graph)
-                else:
-                    mkldnn_int8_pass = FakeQAT2MkldnnINT8KernelPass(
-                        _scope=inference_scope, _place=place)
-                    graph = mkldnn_int8_pass.apply(graph)
-
-            else:
-                graph = self._prepare_for_fp32_mkldnn(graph)
-
-            inference_program = graph.to_program()
-
-            dshape = [3, 224, 224]
-            outputs = []
-            infer_accs1 = []
-            infer_accs5 = []
-            fpses = []
-            batch_times = []
-            total_samples = 0
-            top1 = 0.0
-            top5 = 0.0
-            iters = 0
-            infer_start_time = time.time()
-            for data in test_reader():
-                if batch_num > 0 and iters >= batch_num:
-                    break
-                if iters == skip_batch_num:
-                    total_samples = 0
-                    infer_start_time = time.time()
-                if six.PY2:
-                    images = map(lambda x: x[0].reshape(dshape), data)
-                if six.PY3:
-                    images = list(map(lambda x: x[0].reshape(dshape), data))
-                images = np.array(images).astype('float32')
-                labels = np.array([x[1] for x in data]).astype('int64')
-
-                start = time.time()
-                out = exe.run(inference_program,
-                              feed={feed_target_names[0]: images},
-                              fetch_list=fetch_targets)
-                batch_time = (time.time() - start) * 1000  # in miliseconds
-                outputs.append(out[0])
-                batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
-                                                                  labels)
-                infer_accs1.append(batch_acc1)
-                infer_accs5.append(batch_acc5)
-                samples = len(data)
-                total_samples += samples
-                batch_times.append(batch_time)
-                fps = samples / batch_time * 1000
-                fpses.append(fps)
-                iters += 1
-                appx = ' (warm-up)' if iters <= skip_batch_num else ''
-                _logger.info('batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
-                             'latency: {3:.4f} ms, fps: {4:.2f}'.format(
-                                 iters, batch_acc1, batch_acc5, batch_time /
-                                 batch_size, fps, appx))
-
-            # Postprocess benchmark data
-            batch_latencies = batch_times[skip_batch_num:]
-            batch_latency_avg = np.average(batch_latencies)
-            latency_avg = batch_latency_avg / batch_size
-            fpses = fpses[skip_batch_num:]
-            fps_avg = np.average(fpses)
-            infer_total_time = time.time() - infer_start_time
-            acc1_avg = np.mean(infer_accs1)
-            acc5_avg = np.mean(infer_accs5)
-            _logger.info('Total inference run time: {:.2f} s'.format(
-                infer_total_time))
-
-            if test_case_args.save_model:
-                with fluid.scope_guard(inference_scope):
-                    fluid.io.save_inference_model(
-                        'transformed_qat_int8_model', feed_target_names,
-                        fetch_targets, exe, inference_program)
-
-            return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
-
-    def _summarize_performance(self, fp32_fps, fp32_lat, int8_fps, int8_lat):
-        _logger.info('--- Performance summary ---')
-        _logger.info('FP32: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
-            fp32_fps, fp32_lat))
-        _logger.info('INT8: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
-            int8_fps, int8_lat))
-
-    def _compare_accuracy(self, fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
-                          threshold):
-        _logger.info('--- Accuracy summary ---')
-        _logger.info(
-            'Accepted top1 accuracy drop threshold: {0}. (condition: (FP32_top1_acc - IN8_top1_acc) <= threshold)'
-            .format(threshold))
-        _logger.info(
-            'FP32: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
-            format(fp32_acc1, fp32_acc5))
-        _logger.info(
-            'INT8: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
-            format(int8_acc1, int8_acc5))
-        assert fp32_acc1 > 0.0
-        assert int8_acc1 > 0.0
-        assert fp32_acc1 - int8_acc1 <= threshold
-
-    def test_graph_transformation(self):
-        if not fluid.core.is_compiled_with_mkldnn():
-            return
-
-        qat_model_path = test_case_args.qat_model
-        data_path = test_case_args.infer_data
-        batch_size = test_case_args.batch_size
-        batch_num = test_case_args.batch_num
-        skip_batch_num = test_case_args.skip_batch_num
-        acc_diff_threshold = test_case_args.acc_diff_threshold
-        self._debug = test_case_args.debug
-
-        _logger.info('QAT FP32 & INT8 prediction run.')
-        _logger.info('QAT model: {0}'.format(qat_model_path))
-        _logger.info('Dataset: {0}'.format(data_path))
-        _logger.info('Batch size: {0}'.format(batch_size))
-        _logger.info('Batch number: {0}'.format(batch_num))
-        _logger.info('Accuracy drop threshold: {0}.'.format(acc_diff_threshold))
-
-        _logger.info('--- QAT FP32 prediction start ---')
-        val_reader = paddle.batch(
-            self._reader_creator(data_path), batch_size=batch_size)
-        fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
-            val_reader,
-            qat_model_path,
-            batch_size,
-            batch_num,
-            skip_batch_num,
-            transform_to_int8=False)
-        _logger.info('--- QAT INT8 prediction start ---')
-        val_reader = paddle.batch(
-            self._reader_creator(data_path), batch_size=batch_size)
-        int8_output, int8_acc1, int8_acc5, int8_fps, int8_lat = self._predict(
-            val_reader,
-            qat_model_path,
-            batch_size,
-            batch_num,
-            skip_batch_num,
-            transform_to_int8=True)
-
-        self._summarize_performance(fp32_fps, fp32_lat, int8_fps, int8_lat)
-        self._compare_accuracy(fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
-                               acc_diff_threshold)
-
-
-if __name__ == '__main__':
-    global test_case_args
-    test_case_args, remaining_args = parse_args()
-    unittest.main(argv=remaining_args)
diff --git a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
deleted file mode 100644
index 8bdfd5086135c022a648d1a0a08f073ecef83961..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
+++ /dev/null
@@ -1,50 +0,0 @@
-#start_epoch(int): The epoch to insert quantization operators. default: 0
-#
-#end_epoch(int): The epoch to save inference model. default: 0
-#
-#float_model_save_path(str): The path to save model with float weights.
-#                None means it doesn't save float model. default: None.
-#
-#mobile_model_save_path(str): The path to save model for paddle-mobile execution.
-#                None means it doesn't save mobile model. default: None.
-#
-#int8_model_save_path(str): The path to save model with int8_t weight.
-#                None means it doesn't save int8 model. default: None.
-#
-#activation_bits(int): quantization bit number for activation. default: 8.
-#
-#weight_bits(int): quantization bit number for weights. The bias is not quantized.
-#                  default: 8.
-#
-#activation_quantize_type(str): quantization type for activation,
-#    now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
-#    If use 'abs_max' mode, the quantization scale will be calculated
-#    dynamically each step in both training and testing period. If use
-#    'range_abs_max', a static quantization scale will be calculated
-#    during training and used in inference.
-#
-#save_in_nodes(list<str>): A list of variable names used to prune graph
-#                          for saving inference model.
-#
-#save_out_nodes(list<str>): A list of variable names used to prune graph
-#                                      for saving inference model.
-version: 1.0
-strategies:
-    quantization_strategy:
-        class: 'QuantizationStrategy'
-        start_epoch: 0
-        end_epoch: 0
-        float_model_save_path: './output/float'
-        mobile_model_save_path: './output/mobile'
-        int8_model_save_path: './output/int8'
-        weight_bits: 8
-        activation_bits: 8
-        weight_quantize_type: 'abs_max'
-        activation_quantize_type: 'abs_max'
-        save_in_nodes: ['image']
-        save_out_nodes: ['quan.tmp_2']
-compressor:
-    epoch: 1
-    checkpoint_path: './checkpoints_quan/'
-    strategies:
-        - quantization_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/quantization/compress_1.yaml b/python/paddle/fluid/contrib/slim/tests/quantization/compress_1.yaml
deleted file mode 100644
index 44e2dc985aac65306a3b05860a26a1d60fa5cf44..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/quantization/compress_1.yaml
+++ /dev/null
@@ -1,50 +0,0 @@
-#start_epoch(int): The epoch to insert quantization operators. default: 0
-#
-#end_epoch(int): The epoch to save inference model. default: 0
-#
-#float_model_save_path(str): The path to save model with float weights.
-#                None means it doesn't save float model. default: None.
-#
-#mobile_model_save_path(str): The path to save model for paddle-mobile execution.
-#                None means it doesn't save mobile model. default: None.
-#
-#int8_model_save_path(str): The path to save model with int8_t weight.
-#                None means it doesn't save int8 model. default: None.
-#
-#activation_bits(int): quantization bit number for activation. default: 8.
-#
-#weight_bits(int): quantization bit number for weights. The bias is not quantized.
-#                  default: 8.
-#
-#activation_quantize_type(str): quantization type for activation,
-#    now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
-#    If use 'abs_max' mode, the quantization scale will be calculated
-#    dynamically each step in both training and testing period. If use
-#    'range_abs_max', a static quantization scale will be calculated
-#    during training and used in inference.
-#
-#save_in_nodes(list<str>): A list of variable names used to prune graph
-#                          for saving inference model.
-#
-#save_out_nodes(list<str>): A list of variable names used to prune graph
-#                                      for saving inference model.
-version: 1.0
-strategies:
-    quantization_strategy:
-        class: 'QuantizationStrategy'
-        start_epoch: 0
-        end_epoch: 0
-        float_model_save_path: './output/float'
-        mobile_model_save_path: './output/mobile'
-        int8_model_save_path: './output/int8'
-        weight_bits: 8
-        activation_bits: 8
-        weight_quantize_type: 'abs_max'
-        activation_quantize_type: 'abs_max'
-        save_in_nodes: ['image']
-        save_out_nodes: ['quan.tmp_2']
-compressor:
-    epoch: 2
-    checkpoint_path: './checkpoints_quan/'
-    strategies:
-        - quantization_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/quantization/config_mkldnn_int8.yaml b/python/paddle/fluid/contrib/slim/tests/quantization/config_mkldnn_int8.yaml
deleted file mode 100644
index 1e0df9c58a2081275ad090857668c90e2efc8d55..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/quantization/config_mkldnn_int8.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-#int8_model_save_path(str): int8_model_save_path is used to save an int8 ProgramDesc with
-#                fp32 weights which is used for MKL-DNN int8 inference. For post training quantization,
-#                MKLDNNPostTrainingQuantStrategy only supports converting a fp32 ProgramDesc
-#                with fp32 weights to an int8 ProgramDesc with fp32 weights now. The saved
-#                int8 ProgramDesc with fp32 weights only can be executed with MKL-DNN enabled.
-#                None means it doesn't save int8 ProgramDesc with fp32 weights. default: None.
-#
-#fp32_model_path(str): fp32_model_path is used to load an original fp32 ProgramDesc with fp32 weights.
-#                None means it doesn't have a fp32 ProgramDesc with fp32 weights. default: None.
-#
-#cpu_math_library_num_threads(int): The number of cpu math library threads which is used on
-#                MKLDNNPostTrainingQuantStrategy. 1 means it only uses one cpu math library
-#                thread. default: 1
-#                Note: Here we set the cpu_math_library_num_threads to 4 which is the maximum number of
-#                cpu math library threads on CI machine.
-#
-version: 1.0
-strategies:
-    mkldnn_post_training_strategy:
-        class: 'MKLDNNPostTrainingQuantStrategy'
-        int8_model_save_path: 'OUTPUT_PATH'
-        fp32_model_path: 'MODEL_PATH'
-        cpu_math_library_num_threads: 4
-compressor:
-    epoch: 0
-    checkpoint_path: ''
-    strategies:
-        - mkldnn_post_training_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/slim_int8_mkldnn_post_training_quantization.md b/python/paddle/fluid/contrib/slim/tests/slim_int8_mkldnn_post_training_quantization.md
deleted file mode 100644
index 0e9fd33ee3686ad23d981c27c7da46a3fbfd67bb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/slim_int8_mkldnn_post_training_quantization.md
+++ /dev/null
@@ -1,131 +0,0 @@
-# PaddleSlim Post-training quantization (MKL-DNN INT8)
-
-This document describes how to use [PaddleSlim](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md) to convert a FP32 ProgramDesc with FP32 weights to an INT8 ProgramDesc with FP32 weights on GoogleNet, MobileNet-V1, MobileNet-V2, ResNet-101, ResNet-50, VGG16 and VGG19. We provide the instructions on how to enable MKL-DNN INT8 calibration in PaddleSlim and show the results of accuracy on all the 7 models as mentioned.
-
-## 0. Prerequisite
-
-You need to install at least PaddlePaddle-1.5 python package `pip install paddlepaddle==1.5`.
-
-## 1. How to generate INT8 ProgramDesc with FP32 weights
-
-You can refer to the usage doc of [PaddleSlim](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md) in section 1.2 for details that how to use PaddleSlim Compressor. But for PaddleSlim Post-training quantization with MKL-DNN INT8, there are two differences.
-
-* Differences in `paddle.fluid.contrib.slim.Compressor` arguments
-
-Since the only one requirement in PaddleSlim Post-training quantization with MKL-DNN INT8 is the reader of warmup dataset, so you need to set other parameters of `paddle.fluid.contrib.slim.Compressor` to None, [] or ''.
-
-```python
-com_pass = Compressor(
-    place=None, # not required, set to None
-    scope=None, # not required, set to None
-    train_program=None, # not required, set to None
-    train_reader=None, # not required, set to None
-    train_feed_list=[], # not required, set to []
-    train_fetch_list=[], # not required, set to []
-    eval_program=None, # not required, set to None
-    eval_reader=reader, # required, the reader of warmup dataset
-    eval_feed_list=[], # not required, set to []
-    eval_fetch_list=[], # not required, set to []
-    teacher_programs=[], # not required, set to []
-    checkpoint_path='', # not required, set to ''
-    train_optimizer=None, # not required, set to None
-    distiller_optimizer=None # not required, set to None
-    )
-```
-
-* Differences in yaml config
-
-An example yaml config is listed below, for more details, you can refer to [config_mkldnn_int8.yaml](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/contrib/slim/tests/quantization/config_mkldnn_int8.yaml) which is used in unit test.
-
-```yaml
-version: 1.0
-strategies:
-    mkldnn_post_training_strategy:
-        class: 'MKLDNNPostTrainingQuantStrategy' # required, class name of MKL-DNN INT8 Post-training quantization strategy
-        int8_model_save_path: 'OUTPUT_PATH' # required, int8 ProgramDesc with fp32 weights
-        fp32_model_path: 'MODEL_PATH' # required, fp32 ProgramDesc with fp32 weights
-        cpu_math_library_num_threads: 1 # required, The number of cpu math library threads
-compressor:
-    epoch: 0 # not required, set to 0
-    checkpoint_path: '' # not required, set to ''
-    strategies:
-        - mkldnn_post_training_strategy
-```
-
-## 2. How to run INT8 ProgramDesc with fp32 weights
-
-You can load INT8 ProgramDesc with fp32 weights by load_inference_model [API](https://github.com/PaddlePaddle/Paddle/blob/8b50ad80ff6934512d3959947ac1e71ea3fb9ea3/python/paddle/fluid/io.py#L991) and run INT8 inference similar as [FP32](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/object_detection/eval.py "FP32").
-
-```python
-[infer_program, feed_dict, fetch_targets] = fluid.io.load_inference_model(model_path, exe)
-```
-
-## 3. Result
-
-We provide the results of accuracy measured on Intel(R) Xeon(R) Gold 6271.
-
->**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
-
->**Dataset: ILSVRC2012 Validation dataset**
-
-| Model        | FP32 Accuracy   | INT8 Accuracy   | Accuracy Diff(FP32-INT8)   |
-| :----------: | :-------------: | :------------:  | :--------------:           |
-| GoogleNet    |  70.50%         |  69.81%         |   0.69%                    |
-| MobileNet-V1 |  70.78%         |  70.42%         |   0.36%                    |
-| MobileNet-V2 |  71.90%         |  71.35%         |   0.55%                    |
-| ResNet-101   |  77.50%         |  77.42%         |   0.08%                    |
-| ResNet-50    |  76.63%         |  76.52%         |   0.11%                    |
-| VGG16        |  72.08%         |  72.03%         |   0.05%                    |
-| VGG19        |  72.57%         |  72.55%         |   0.02%                    |
-
-Notes:
-
-* MKL-DNN and MKL are required.
-
-## 4. How to reproduce the results
-
-Three steps to reproduce the above-mentioned accuracy results, and we take GoogleNet benchmark as an example:
-
-* ### Prepare dataset
-
-You can run the following commands to download and preprocess the ILSVRC2012 Validation dataset.
-
-```bash
-cd /PATH/TO/PADDLE
-python ./paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
-```
-
-Then the ILSVRC2012 Validation dataset will be preprocessed and saved by default in `~/.cache/paddle/dataset/int8/download/int8_full_val.bin`
-
-* ### Prepare model
-
-You can run the following commands to download GoogleNet model.
-
-```bash
-mkdir -p /PATH/TO/DOWNLOAD/MODEL/
-cd /PATH/TO/DOWNLOAD/MODEL/
-export MODEL_NAME=GoogleNet
-wget http://paddle-inference-dist.bj.bcebos.com/int8/${MODEL_NAME}_int8_model.tar.gz
-mkdir -p ${MODEL_NAME}
-tar -xvf ${MODEL_NAME}_int8_model.tar.gz -C ${MODEL_NAME}
-```
-
-To download and verify all the 7 models, you need to set `MODEL_NAME` to one of the following values in command line:
-
-```text
-MODEL_NAME=GoogleNet, mobilenetv1, mobilenet_v2, Res101, resnet50, VGG16, VGG19
-```
-
-* ### Commands to reproduce benchmark
-
-You can run `test_mkldnn_int8_quantization_strategy.py` with the following arguments to reproduce the accuracy result on GoogleNet.
-
-``` bash
-cd /PATH/TO/PADDLE/python/paddle/fluid/contrib/slim/tests/
-python ./test_mkldnn_int8_quantization_strategy.py --infer_model /PATH/TO/DOWNLOAD/MODEL/${MODEL_NAME}/model --infer_data ~/.cache/paddle/dataset/int8/download/int8_full_val.bin --warmup_batch_size 100 --batch_size 1
-```
-
-Notes:
-
-* The above commands will cost maybe several hours in the prediction stage (include int8 prediction and fp32 prediction) since there have 50000 pictures need to be predicted in `int8_full_val.bin`
-* Running the above command with environment variable `FLAGS_use_mkldnn=true` will make the FP32 part of the test running using MKL-DNN (the INT8 part uses MKL-DNN either way).
diff --git a/python/paddle/fluid/contrib/slim/tests/test_auto_pruning.py b/python/paddle/fluid/contrib/slim/tests/test_auto_pruning.py
deleted file mode 100644
index 006e5adb25c6e7d0c0d576d7e8c8f04954c2e110..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_auto_pruning.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import paddle
-import unittest
-import paddle.fluid as fluid
-from mobilenet import MobileNet
-from paddle.fluid.contrib.slim.core import Compressor
-from paddle.fluid.contrib.slim.graph import GraphWrapper
-
-
-class TestFilterPruning(unittest.TestCase):
-    def test_compression(self):
-        """
-        Model: mobilenet_v1
-        data: mnist
-        step1: Training one epoch
-        step2: pruning flops
-        step3: fine-tune one epoch
-        step4: check top1_acc.
-        """
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        class_dim = 10
-        image_shape = [1, 28, 28]
-        image = fluid.layers.data(
-            name='image', shape=image_shape, dtype='float32')
-        image.stop_gradient = False
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        out = MobileNet("auto_pruning").net(input=image, class_dim=class_dim)
-        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-        val_program = fluid.default_main_program().clone(for_test=False)
-
-        cost = fluid.layers.cross_entropy(input=out, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        optimizer = fluid.optimizer.Momentum(
-            momentum=0.9,
-            learning_rate=0.01,
-            regularization=fluid.regularizer.L2Decay(4e-5))
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
-
-        val_feed_list = [('img', image.name), ('label', label.name)]
-        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
-                                                        acc_top5.name)]
-
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=128)
-        train_feed_list = [('img', image.name), ('label', label.name)]
-        train_fetch_list = [('loss', avg_cost.name)]
-
-        com_pass = Compressor(
-            place,
-            fluid.global_scope(),
-            fluid.default_main_program(),
-            train_reader=train_reader,
-            train_feed_list=train_feed_list,
-            train_fetch_list=train_fetch_list,
-            eval_program=val_program,
-            eval_reader=val_reader,
-            eval_feed_list=val_feed_list,
-            eval_fetch_list=val_fetch_list,
-            train_optimizer=optimizer)
-        com_pass.config('./auto_pruning/compress.yaml')
-        eval_graph = com_pass.run()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_compressor.py b/python/paddle/fluid/contrib/slim/tests/test_compressor.py
deleted file mode 100644
index 330c6e3543ddb44e1016ffdbf14d65116422e54e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_compressor.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import paddle
-import unittest
-import os
-import numpy as np
-import paddle.fluid as fluid
-from paddle.fluid.contrib.slim.core import Compressor
-from paddle.fluid.contrib.slim.graph import GraphWrapper
-
-
-class TestCompressor(unittest.TestCase):
-    def test_eval_func(self):
-        class_dim = 10
-        image_shape = [1, 28, 28]
-        image = fluid.layers.data(
-            name='image', shape=image_shape, dtype='float32')
-        image.stop_gradient = False
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        out = fluid.layers.fc(input=image, size=class_dim)
-        out = fluid.layers.softmax(out)
-        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-        val_program = fluid.default_main_program().clone(for_test=False)
-
-        cost = fluid.layers.cross_entropy(input=out, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        optimizer = fluid.optimizer.Momentum(
-            momentum=0.9,
-            learning_rate=0.01,
-            regularization=fluid.regularizer.L2Decay(4e-5))
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
-
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=128)
-        train_feed_list = [('img', image.name), ('label', label.name)]
-        train_fetch_list = [('loss', avg_cost.name)]
-        eval_feed_list = [('img', image.name), ('label', label.name)]
-        eval_fetch_list = [('acc_top1', acc_top1.name)]
-
-        def eval_func(program, scope):
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            feeder = fluid.DataFeeder(
-                feed_list=[image.name, label.name],
-                place=place,
-                program=program)
-            results = []
-            for data in val_reader():
-                result = exe.run(program=program,
-                                 scope=scope,
-                                 fetch_list=[acc_top1.name],
-                                 feed=feeder.feed(data))
-                results.append(np.array(result))
-            result = np.mean(results)
-            return result
-
-        com_pass = Compressor(
-            place,
-            fluid.global_scope(),
-            fluid.default_main_program(),
-            train_reader=train_reader,
-            train_feed_list=train_feed_list,
-            train_fetch_list=train_fetch_list,
-            eval_program=val_program,
-            eval_feed_list=eval_feed_list,
-            eval_fetch_list=eval_fetch_list,
-            eval_func={"score": eval_func},
-            prune_infer_model=[[image.name], [out.name]],
-            train_optimizer=optimizer)
-        com_pass.config('./configs/compress.yaml')
-        com_pass.run()
-        self.assertTrue('score' in com_pass.context.eval_results)
-        self.assertTrue(float(com_pass.context.eval_results['score'][0]) > 0.9)
-        self.assertTrue(os.path.exists("./checkpoints/0/eval_model/__model__"))
-        self.assertTrue(
-            os.path.exists("./checkpoints/0/eval_model/__model__.infer"))
-        self.assertTrue(os.path.exists("./checkpoints/0/eval_model/__params__"))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_factory.py b/python/paddle/fluid/contrib/slim/tests/test_factory.py
deleted file mode 100644
index 90eb8bd4b3caa44880f6df21c7f9f6d460655a8c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_factory.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.fluid.contrib.slim.core import ConfigFactory
-import unittest
-
-
-class TestFactory(unittest.TestCase):
-    def test_parse_pruning(self):
-        factory = ConfigFactory('./configs/filter_pruning.yaml')
-
-        pruner_1 = factory.instance('pruner_1')
-        self.assertEquals(pruner_1.pruning_axis['*'], 0)
-        self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
-
-        strategy = factory.instance('sensitive_pruning_strategy')
-        pruner_1 = strategy.pruner
-        self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
-
-        self.assertEquals(strategy.start_epoch, 0)
-        self.assertEquals(strategy.sensitivities_file,
-                          'mobilenet_acc_top1_sensitive.data')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
deleted file mode 100644
index cb956ef6bf09e0172d9e0caea1c76d5bf78fcfef..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import paddle
-import unittest
-import paddle.fluid as fluid
-import numpy as np
-from mobilenet import MobileNet
-from paddle.fluid.contrib.slim.core import Compressor
-from paddle.fluid.contrib.slim.graph import GraphWrapper
-
-
-class TestFilterPruning(unittest.TestCase):
-    def test_compression(self):
-        """
-        Model: mobilenet_v1
-        data: mnist
-        step1: Training one epoch
-        step2: pruning flops
-        step3: fine-tune one epoch
-        step4: check top1_acc.
-        """
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        class_dim = 10
-        image_shape = [1, 28, 28]
-        image = fluid.layers.data(
-            name='image', shape=image_shape, dtype='float32')
-        image.stop_gradient = False
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        out = MobileNet().net(input=image, class_dim=class_dim)
-        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-        val_program = fluid.default_main_program().clone(for_test=False)
-
-        cost = fluid.layers.cross_entropy(input=out, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        optimizer = fluid.optimizer.Momentum(
-            momentum=0.9,
-            learning_rate=0.01,
-            regularization=fluid.regularizer.L2Decay(4e-5))
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
-
-        val_feed_list = [('img', image.name), ('label', label.name)]
-        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
-                                                        acc_top5.name)]
-
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=128)
-        train_feed_list = [('img', image.name), ('label', label.name)]
-        train_fetch_list = [('loss', avg_cost.name)]
-
-        com_pass = Compressor(
-            place,
-            fluid.global_scope(),
-            fluid.default_main_program(),
-            train_reader=train_reader,
-            train_feed_list=train_feed_list,
-            train_fetch_list=train_fetch_list,
-            eval_program=val_program,
-            eval_reader=val_reader,
-            eval_feed_list=val_feed_list,
-            eval_fetch_list=val_fetch_list,
-            train_optimizer=optimizer)
-        com_pass.config('./filter_pruning/compress.yaml')
-        eval_graph = com_pass.run()
-        self.assertTrue(
-            abs((com_pass.context.eval_results['acc_top1'][-1] - 0.969) / 0.969)
-            < 0.02)
-
-    def test_uniform_restore_from_checkpoint(self):
-        np.random.seed(0)
-        self.uniform_restore_from_checkpoint(
-            "./filter_pruning/uniform_restore_0.yaml")
-        acc_0 = self.uniform_restore_from_checkpoint(
-            "./filter_pruning/uniform_restore_1.yaml")
-        np.random.seed(0)
-        acc_1 = self.uniform_restore_from_checkpoint(
-            "./filter_pruning/uniform_restore.yaml")
-        self.assertTrue(abs((acc_0 - acc_1) / acc_1) < 0.001)
-
-    def uniform_restore_from_checkpoint(self, config_file):
-
-        class_dim = 10
-        image_shape = [1, 28, 28]
-
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        train_program.random_seed = 10
-        startup_program.random_seed = 10
-
-        with fluid.program_guard(train_program, startup_program):
-            with fluid.unique_name.guard():
-                image = fluid.layers.data(
-                    name='image', shape=image_shape, dtype='float32')
-                image.stop_gradient = False
-                label = fluid.layers.data(
-                    name='label', shape=[1], dtype='int64')
-                out = fluid.layers.conv2d(image, 4, 1)
-                out = fluid.layers.fc(out, size=class_dim)
-                out = fluid.layers.softmax(out)
-                acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-                acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-                cost = fluid.layers.cross_entropy(input=out, label=label)
-                avg_cost = fluid.layers.mean(x=cost)
-        val_program = train_program.clone(for_test=False)
-
-        optimizer = fluid.optimizer.Momentum(
-            momentum=0.9,
-            learning_rate=0.01,
-            regularization=fluid.regularizer.L2Decay(4e-5))
-
-        place = fluid.CPUPlace()
-        scope = fluid.Scope()
-        exe = fluid.Executor(place)
-        exe.run(startup_program, scope=scope)
-
-        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
-
-        val_feed_list = [('img', image.name), ('label', label.name)]
-        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
-                                                        acc_top5.name)]
-
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=128)
-        train_feed_list = [('img', image.name), ('label', label.name)]
-        train_fetch_list = [('loss', avg_cost.name)]
-
-        com_pass = Compressor(
-            place,
-            scope,
-            train_program,
-            train_reader=train_reader,
-            train_feed_list=train_feed_list,
-            train_fetch_list=train_fetch_list,
-            eval_program=val_program,
-            eval_reader=val_reader,
-            eval_feed_list=val_feed_list,
-            eval_fetch_list=val_fetch_list,
-            train_optimizer=optimizer)
-        com_pass.config(config_file)
-        eval_graph = com_pass.run()
-        return com_pass.context.eval_results['acc_top1'][-1]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py
deleted file mode 100644
index 2cf897ec418fa75a70cfa7fa3fe0a4b9e79d3c65..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_graph.py
+++ /dev/null
@@ -1,146 +0,0 @@
-#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-from __future__ import print_function
-import os
-import six
-import numpy as np
-import unittest
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.framework import IrGraph
-from paddle.fluid import core
-
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-os.environ["CPU_NUM"] = "1"
-
-
-def conv_block():
-    img = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_loss = fluid.layers.mean(loss)
-    return [img, label], avg_loss
-
-
-class TestGraph(unittest.TestCase):
-    def graph_apis(self, use_cuda=False, for_ci=True):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.unique_name.guard():
-            with fluid.program_guard(main, startup):
-                feeds, loss = conv_block()
-                opt = fluid.optimizer.Adam(learning_rate=0.001)
-                opt.minimize(loss)
-        graph = IrGraph(core.Graph(main.desc), for_test=False)
-        backup_graph = graph.clone()
-        self.assertEqual(len(graph.all_nodes()), len(backup_graph.all_nodes()))
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.memory_optimize = False
-        build_strategy.enable_inplace = False
-        origin_binary = fluid.CompiledProgram(graph.graph).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy)
-        backup_binary = fluid.CompiledProgram(
-            backup_graph.graph).with_data_parallel(
-                loss_name=loss.name, build_strategy=build_strategy)
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup)
-        iters = 5
-        batch_size = 8
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size)
-        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
-
-        def _train(binary):
-            for _ in range(iters):
-                data = next(train_reader())
-                loss_v = exe.run(binary,
-                                 feed=feeder.feed(data),
-                                 fetch_list=[loss.name])
-                if not for_ci:
-                    print('{}: {}'.format('loss', loss_v))
-
-        _train(origin_binary)
-        _train(backup_binary)
-
-        checkponit_dir = "checkpoint_gpu" if use_cuda else "checkpoint_cpu"
-
-        def _set_zero(var_name, scope, place):
-            var = scope.find_var(var_name).get_tensor()
-            var_array = np.zeros(var._get_dims()).astype("float32")
-            var.set(var_array, place)
-
-        sum_before = np.sum(
-            np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor(
-            )))
-        fluid.io._save_persistable_nodes(exe, checkponit_dir, graph)
-        _set_zero('conv2d_1.w_0', fluid.global_scope(), place)
-        set_after = np.sum(
-            np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor(
-            )))
-        self.assertEqual(set_after, 0)
-        fluid.io._load_persistable_nodes(exe, checkponit_dir, graph)
-        sum_after = np.sum(
-            np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor(
-            )))
-        self.assertEqual(sum_before, sum_after)
-
-        marked_nodes = set()
-        for op in graph.all_op_nodes():
-            if op.name().find('conv2d') > -1:
-                marked_nodes.add(op)
-        if not for_ci:
-            graph.draw('.', 'residual', marked_nodes)
-            backup_marked_nodes = set()
-            for op in backup_graph.all_op_nodes():
-                if op.name().find('conv2d') > -1:
-                    backup_marked_nodes.add(op)
-            backup_graph.draw('./origin', 'backup', backup_marked_nodes)
-        self.assertFalse(graph.has_circle())
-        self.assertEqual(graph.graph_num(), 1)
-        nodes = graph.topology_sort()
-        self.assertEqual(len(nodes), len(graph.all_op_nodes()))
-        nodes_map = graph.build_adjacency_list()
-        self.assertEqual(len(nodes_map), len(graph.all_op_nodes()))
-        nodes_num = len(graph.all_nodes())
-        graph.safe_remove_nodes(marked_nodes)
-        self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
-
-    def test_graph_apis_cpu(self):
-        self.graph_apis(use_cuda=False, for_ci=True)
-
-    def test_graph_apis_cuda(self):
-        if fluid.core.is_compiled_with_cuda():
-            self.graph_apis(use_cuda=True, for_ci=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
deleted file mode 100644
index 5340f36196edc9c011a98faee5fbf5a6fbc7a639..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-from __future__ import print_function
-import unittest
-import paddle.fluid as fluid
-import six
-import numpy as np
-from paddle.fluid.contrib.slim.graph import GraphWrapper
-from paddle.fluid import core
-import os
-os.environ['CPU_NUM'] = str(4)
-
-
-def residual_block(num):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    data = fluid.layers.data(name='image', shape=[1, 8, 8], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    data.stop_gradinet = False
-    hidden = data
-    for _ in six.moves.xrange(num):
-        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
-        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
-        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    fc = fluid.layers.fc(input=hidden, size=10)
-
-    loss = fluid.layers.cross_entropy(input=fc, label=label)
-    loss = fluid.layers.mean(loss)
-    return data, label, loss
-
-
-class TestGraphWrapper(unittest.TestCase):
-    def build_program(self):
-        place = fluid.CPUPlace()
-        if fluid.core.is_compiled_with_cuda():
-            place = fluid.CUDAPlace(0)
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            image, label, self.loss = residual_block(2)
-            eval_program = main.clone()
-            opt = fluid.optimizer.SGD(learning_rate=0.001)
-            opt.minimize(self.loss)
-        self.scope = core.Scope()
-        exe = fluid.Executor(place)
-        exe.run(startup, scope=self.scope)
-        self.eval_graph = GraphWrapper(
-            program=eval_program,
-            in_nodes={'image': image.name,
-                      'label': label.name},
-            out_nodes={'loss': self.loss.name})
-        self.train_graph = GraphWrapper(
-            program=main,
-            in_nodes={'image': image.name,
-                      'label': label.name},
-            out_nodes={'loss': self.loss.name})
-
-    def test_all_parameters(self):
-        self.build_program()
-        self.assertEquals(len(self.train_graph.all_parameters()), 24)
-
-    def test_all_vars(self):
-        self.build_program()
-        # self.assertEquals(len(self.train_graph.vars()), 90)
-        # activation inplace has been disabled in python side
-        # which may produce more variable in program_desc
-        # update 90 => 94
-        # delete three useless RAW variables in Conv2D
-        # update 94 => 91
-        self.assertEquals(len(self.train_graph.vars()), 91)
-
-    def test_numel_params(self):
-        self.build_program()
-        self.assertEquals(self.train_graph.numel_params(), 13258)
-
-    def test_compile(self):
-        self.build_program()
-        place = fluid.CPUPlace()
-        if fluid.core.is_compiled_with_cuda():
-            place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        self.train_graph.compile()
-        exe.run(self.train_graph.compiled_graph,
-                scope=self.scope,
-                feed={
-                    'image':
-                    np.random.randint(0, 40, [16, 1, 8, 8]).astype('float32'),
-                    'label': np.random.randint(0, 10, [16, 1]).astype('int64')
-                })
-
-    def test_pre_and_next_ops(self):
-        self.build_program()
-        for op in self.train_graph.ops():
-            for next_op in self.train_graph.next_ops(op):
-                self.assertTrue(op in self.train_graph.pre_ops(next_op))
-
-    def test_get_optimize_graph(self):
-        self.build_program()
-        place = fluid.CPUPlace()
-        if fluid.core.is_compiled_with_cuda():
-            place = fluid.CUDAPlace(0)
-        opt = fluid.optimizer.SGD(learning_rate=0.001)
-        train_graph = self.eval_graph.get_optimize_graph(
-            opt, place, self.scope, no_grad_var_names=['image'])
-        self.assertEquals(len(self.train_graph.ops()), len(train_graph.ops()))
-        exe = fluid.Executor(place)
-        train_graph.compile()
-        image = np.random.randint(0, 225, [16, 1, 8, 8]).astype('float32')
-        label = np.random.randint(0, 10, [16, 1]).astype('int64')
-        exe.run(train_graph.compiled_graph,
-                scope=self.scope,
-                feed={'image': image,
-                      'label': label})
-
-    def test_get_optimize_graph_without_loss(self):
-        self.build_program()
-        self.eval_graph.out_nodes = {}
-        place = fluid.CPUPlace()
-        if fluid.core.is_compiled_with_cuda():
-            place = fluid.CUDAPlace(0)
-        opt = fluid.optimizer.SGD(learning_rate=0.001)
-        train_graph = self.eval_graph.get_optimize_graph(
-            opt, place, self.scope, no_grad_var_names=['image'])
-        self.assertEquals(train_graph, None)
-
-    def test_flops(self):
-        self.build_program()
-        self.assertEquals(self.train_graph.flops(), 354624)
-
-    def test_merge(self):
-        self.build_program()
-        self.train_graph.merge(self.eval_graph)
-        self.assertEquals(len(self.train_graph.ops()), 72)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_light_nas.py b/python/paddle/fluid/contrib/slim/tests/test_light_nas.py
deleted file mode 100644
index 1a32421d1e19bc49ff6994f8e0ca5419b20cddf2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_light_nas.py
+++ /dev/null
@@ -1,138 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-"""
-Test LightNAS.
-"""
-import sys
-import unittest
-import paddle.fluid as fluid
-from paddle.fluid.contrib.slim.core import Compressor
-sys.path.append("./light_nas")
-from light_nas_space import LightNASSpace
-
-
-class TestLightNAS(unittest.TestCase):
-    """
-    Test LightNAS.
-    """
-
-    def test_compression(self):
-        """
-        Test LightNAS.
-        """
-        # Update compress.yaml
-        lines = list()
-        fid = open('./light_nas/compress.yaml')
-        for line in fid:
-            if 'target_latency' in line:
-                lines.append('        target_latency: 0\n')
-            else:
-                lines.append(line)
-        fid.close()
-        fid = open('./light_nas/compress.yaml', 'w')
-        for line in lines:
-            fid.write(line)
-        fid.close()
-
-        # Begin test
-        if not fluid.core.is_compiled_with_cuda():
-            return
-
-        space = LightNASSpace()
-
-        startup_prog, train_prog, test_prog, train_metrics, test_metrics, train_reader, test_reader = space.create_net(
-        )
-        train_cost, train_acc1, train_acc5, global_lr = train_metrics
-        test_cost, test_acc1, test_acc5 = test_metrics
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-
-        val_fetch_list = [('acc_top1', test_acc1.name),
-                          ('acc_top5', test_acc5.name)]
-        train_fetch_list = [('loss', train_cost.name)]
-
-        com_pass = Compressor(
-            place,
-            fluid.global_scope(),
-            train_prog,
-            train_reader=train_reader,
-            train_feed_list=None,
-            train_fetch_list=train_fetch_list,
-            eval_program=test_prog,
-            eval_reader=test_reader,
-            eval_feed_list=None,
-            eval_fetch_list=val_fetch_list,
-            train_optimizer=None,
-            search_space=space)
-        com_pass.config('./light_nas/compress.yaml')
-        eval_graph = com_pass.run()
-
-    def test_compression_with_target_latency(self):
-        """
-        Test LightNAS with target_latency.
-        """
-        # Update compress.yaml
-        lines = list()
-        fid = open('./light_nas/compress.yaml')
-        for line in fid:
-            if 'target_latency' in line:
-                lines.append('        target_latency: 1\n')
-            else:
-                lines.append(line)
-        fid.close()
-        fid = open('./light_nas/compress.yaml', 'w')
-        for line in lines:
-            fid.write(line)
-        fid.close()
-
-        # Begin test
-        if not fluid.core.is_compiled_with_cuda():
-            return
-
-        space = LightNASSpace()
-
-        startup_prog, train_prog, test_prog, train_metrics, test_metrics, train_reader, test_reader = space.create_net(
-        )
-        train_cost, train_acc1, train_acc5, global_lr = train_metrics
-        test_cost, test_acc1, test_acc5 = test_metrics
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-
-        val_fetch_list = [('acc_top1', test_acc1.name),
-                          ('acc_top5', test_acc5.name)]
-        train_fetch_list = [('loss', train_cost.name)]
-
-        com_pass = Compressor(
-            place,
-            fluid.global_scope(),
-            train_prog,
-            train_reader=train_reader,
-            train_feed_list=None,
-            train_fetch_list=train_fetch_list,
-            eval_program=test_prog,
-            eval_reader=test_reader,
-            eval_feed_list=None,
-            eval_fetch_list=val_fetch_list,
-            train_optimizer=None,
-            search_space=space)
-        com_pass.config('./light_nas/compress.yaml')
-        eval_graph = com_pass.run()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
deleted file mode 100644
index d41ea349071ee8310433f055d2be8c3c763c73e8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
+++ /dev/null
@@ -1,269 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import unittest
-import os
-import sys
-import argparse
-import shutil
-import logging
-import struct
-import six
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.framework import IrGraph
-from paddle.fluid import core
-from paddle.fluid.contrib.slim.core import Compressor
-from paddle.fluid.log_helper import get_logger
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--batch_size', type=int, default=1, help='batch size')
-    parser.add_argument(
-        '--infer_model',
-        type=str,
-        default='',
-        help='infer_model is used to load an original fp32 ProgramDesc with fp32 weights'
-    )
-    parser.add_argument('--infer_data', type=str, default='', help='data file')
-    parser.add_argument(
-        '--int8_model_save_path',
-        type=str,
-        default='./output',
-        help='infer_data is used to save an int8 ProgramDesc with fp32 weights')
-    parser.add_argument(
-        '--warmup_batch_size',
-        type=int,
-        default=100,
-        help='batch size for quantization warmup')
-    parser.add_argument(
-        '--accuracy_diff_threshold',
-        type=float,
-        default=0.01,
-        help='accepted accuracy drop threshold.')
-
-    test_args, args = parser.parse_known_args(namespace=unittest)
-
-    return test_args, sys.argv[:1] + args
-
-
-class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase):
-    """
-    Test API of Post Training quantization strategy for int8 with MKL-DNN.
-    """
-
-    def _reader_creator(self, data_file='data.bin', cycle=False):
-        def reader():
-            with open(data_file, 'rb') as fp:
-                num = fp.read(8)
-                num = struct.unpack('q', num)[0]
-                imgs_offset = 8
-                img_ch = 3
-                img_w = 224
-                img_h = 224
-                img_pixel_size = 4
-                img_size = img_ch * img_h * img_w * img_pixel_size
-                label_size = 8
-                labels_offset = imgs_offset + num * img_size
-                step = 0
-
-                while step < num:
-                    fp.seek(imgs_offset + img_size * step)
-                    img = fp.read(img_size)
-                    img = struct.unpack_from(
-                        '{}f'.format(img_ch * img_w * img_h), img)
-                    img = np.array(img)
-                    img.shape = (img_ch, img_w, img_h)
-                    fp.seek(labels_offset + label_size * step)
-                    label = fp.read(label_size)
-                    label = struct.unpack('q', label)[0]
-                    yield img, int(label)
-                    step += 1
-                    if cycle and step == num:
-                        step = 0
-
-        return reader
-
-    def _update_config_file(self, fp32_model_path, output_path):
-        config_path = './quantization/config_mkldnn_int8.yaml'
-        new_config_path = './quantization/temp.yaml'
-        shutil.copy(config_path, new_config_path)
-
-        with open(new_config_path, 'r+') as fp:
-            data = fp.read()
-        data = data.replace('MODEL_PATH', fp32_model_path)
-        data = data.replace('OUTPUT_PATH', output_path)
-        with open(new_config_path, 'w') as fp:
-            fp.write(data)
-
-        return new_config_path
-
-    def _transform_depthwise_conv(self, graph):
-        '''
-        Transform depthwise_conv2d into conv2d, with MKL-DNN only
-        '''
-        ops = graph.all_op_nodes()
-        for op_node in ops:
-            name = op_node.name()
-            if name in ['depthwise_conv2d']:
-                input_var_node = graph._find_node_by_name(
-                    op_node.inputs, op_node.input("Input")[0])
-                weight_var_node = graph._find_node_by_name(
-                    op_node.inputs, op_node.input("Filter")[0])
-                output_var_node = graph._find_node_by_name(
-                    graph.all_var_nodes(), op_node.output("Output")[0])
-                attrs = {
-                    name: op_node.op().attr(name)
-                    for name in op_node.op().attr_names()
-                }
-
-                conv_op_node = graph.create_op_node(
-                    op_type='conv2d',
-                    attrs=attrs,
-                    inputs={
-                        'Input': input_var_node,
-                        'Filter': weight_var_node
-                    },
-                    outputs={'Output': output_var_node})
-
-                graph.link_to(input_var_node, conv_op_node)
-                graph.link_to(weight_var_node, conv_op_node)
-                graph.link_to(conv_op_node, output_var_node)
-                graph.safe_remove_nodes(op_node)
-
-        return graph
-
-    def _predict(self, test_reader=None, model_path=None):
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        inference_scope = fluid.executor.global_scope()
-        with fluid.scope_guard(inference_scope):
-            if os.path.exists(os.path.join(model_path, '__model__')):
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(model_path, exe)
-            else:
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(
-                     model_path, exe, 'model', 'params')
-
-            use_mkldnn = fluid.core.get_flags_use_mkldnn()
-            if (use_mkldnn):
-                graph = IrGraph(
-                    core.Graph(inference_program.desc), for_test=True)
-                graph = self._transform_depthwise_conv(graph)
-                inference_program = graph.to_program()
-
-            dshape = [3, 224, 224]
-            top1 = 0.0
-            top5 = 0.0
-            total_samples = 0
-            for batch_id, data in enumerate(test_reader()):
-                if six.PY2:
-                    images = map(lambda x: x[0].reshape(dshape), data)
-                if six.PY3:
-                    images = list(map(lambda x: x[0].reshape(dshape), data))
-                images = np.array(images).astype('float32')
-                labels = np.array([x[1] for x in data]).astype("int64")
-                labels = labels.reshape([-1, 1])
-                fluid.core.set_num_threads(int(os.environ['CPU_NUM_THREADS']))
-                out = exe.run(inference_program,
-                              feed={
-                                  feed_target_names[0]: images,
-                                  feed_target_names[1]: labels
-                              },
-                              fetch_list=fetch_targets)
-                fluid.core.set_num_threads(1)
-                top1 += np.sum(out[1]) * len(data)
-                top5 += np.sum(out[2]) * len(data)
-                total_samples += len(data)
-                if (batch_id + 1) % 100 == 0:
-                    _logger.info('{} images have been predicted'.format(
-                        total_samples))
-            return top1 / total_samples, top5 / total_samples
-
-    def _warmup(self, reader=None, config_path=''):
-        com_pass = Compressor(
-            place=None,
-            scope=None,
-            train_program=None,
-            train_reader=None,
-            train_feed_list=[],
-            train_fetch_list=[],
-            eval_program=None,
-            eval_reader=reader,
-            eval_feed_list=[],
-            eval_fetch_list=[],
-            teacher_programs=[],
-            checkpoint_path='',
-            train_optimizer=None,
-            distiller_optimizer=None)
-        com_pass.config(config_path)
-        com_pass.run()
-
-    def _compare_accuracy(self, fp32_acc1, int8_acc1, threshold):
-        _logger.info('--- Accuracy summary ---')
-        _logger.info(
-            'Accepted top1 accuracy drop threshold: {0}. (condition: (FP32_top1_acc - IN8_top1_acc) <= threshold)'
-            .format(threshold))
-        _logger.info('FP32: avg top1 accuracy: {0:.4f}'.format(fp32_acc1))
-        _logger.info('INT8: avg top1 accuracy: {0:.4f}'.format(int8_acc1))
-        assert fp32_acc1 > 0.0
-        assert int8_acc1 > 0.0
-        assert fp32_acc1 - int8_acc1 <= threshold
-
-    def test_compression(self):
-        if not fluid.core.is_compiled_with_mkldnn():
-            return
-
-        int8_model_path = test_case_args.int8_model_save_path
-        data_path = test_case_args.infer_data
-        fp32_model_path = test_case_args.infer_model
-        batch_size = test_case_args.batch_size
-
-        warmup_batch_size = test_case_args.warmup_batch_size
-        accuracy_diff_threshold = test_case_args.accuracy_diff_threshold
-
-        _logger.info(
-            'FP32 & INT8 prediction run: batch_size {0}, warmup batch size {1}.'
-            .format(batch_size, warmup_batch_size))
-
-        #warmup dataset, only use the first batch data
-        warmup_reader = paddle.batch(
-            self._reader_creator(data_path, False),
-            batch_size=warmup_batch_size)
-        config_path = self._update_config_file(fp32_model_path, int8_model_path)
-        self._warmup(warmup_reader, config_path)
-
-        _logger.info('--- INT8 prediction start ---')
-        val_reader = paddle.batch(
-            self._reader_creator(data_path, False), batch_size=batch_size)
-        int8_model_result = self._predict(val_reader, int8_model_path)
-        _logger.info('--- FP32 prediction start ---')
-        val_reader = paddle.batch(
-            self._reader_creator(data_path, False), batch_size=batch_size)
-        fp32_model_result = self._predict(val_reader, fp32_model_path)
-
-        self._compare_accuracy(fp32_model_result[0], int8_model_result[0],
-                               accuracy_diff_threshold)
-
-
-if __name__ == '__main__':
-    global test_case_args
-    test_case_args, remaining_args = parse_args()
-    unittest.main(argv=remaining_args)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
deleted file mode 100644
index 7ccf67d9788251fa6b5589cbd2e56a152976fb76..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
+++ /dev/null
@@ -1,197 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import os
-import unittest
-import random
-import numpy as np
-import paddle.fluid as fluid
-import six
-import paddle
-from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
-from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
-from paddle.fluid.contrib.slim.quantization import FakeQAT2MkldnnINT8KernelPass
-from paddle.fluid import core
-
-os.environ["CPU_NUM"] = "1"
-
-
-def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_loss = fluid.layers.mean(loss)
-    return avg_loss
-
-
-class TestMKLDNNTransformBasedFreezePass(unittest.TestCase):
-    def setUp(self):
-        self.quantizable_op_and_inputs = {
-            'conv2d': ['Input', 'Filter'],
-            'depthwise_conv2d': ['Input', 'Filter'],
-            'mul': ['X', 'Y']
-        }
-
-    def check_program(self, program):
-        for block in program.blocks:
-            for op in block.ops:
-                if op.type in self.quantizable_op_and_inputs:
-                    for arg_name in op.output_arg_names:
-                        # Check quantizable op's output is linked to
-                        # fake_dequantize's output
-                        self.assertTrue(arg_name.endswith('.dequantized'))
-
-    def isinteger(self, x):
-        return np.equal(np.mod(x, 1), 0)
-
-    def build_program(self, main, startup, is_test, seed):
-        main.random_seed = seed
-        startup.random_seed = seed
-        with fluid.unique_name.guard():
-            with fluid.program_guard(main, startup):
-                img = fluid.layers.data(
-                    name='image', shape=[1, 28, 28], dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=[1], dtype='int64')
-                loss = conv_net(img, label)
-                if not is_test:
-                    opt = fluid.optimizer.Adam(learning_rate=0.001)
-                    opt.minimize(loss)
-        return [img, label], loss
-
-    def mkldnn_based_freeze_graph(self,
-                                  use_cuda,
-                                  seed,
-                                  activation_quant_type,
-                                  weight_quant_type='abs_max',
-                                  qat_perf=False,
-                                  for_ci=False):
-        random.seed(0)
-        np.random.seed(0)
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        test_program = fluid.Program()
-        feeds, loss = self.build_program(main, startup, False, seed)
-        self.build_program(test_program, startup, True, seed)
-        test_program = test_program.clone(for_test=True)
-        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
-        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        scope = fluid.Scope()
-        with fluid.scope_guard(scope):
-            exe.run(startup)
-        # Apply the QAT QuantizationTransformPass
-        transform_pass = QuantizationTransformPass(
-            scope=scope,
-            place=place,
-            activation_quantize_type=activation_quant_type,
-            weight_quantize_type=weight_quant_type)
-        transform_pass.apply(main_graph)
-        transform_pass.apply(test_graph)
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.memory_optimize = False
-        build_strategy.enable_inplace = False
-        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy)
-        quantized_test_program = test_graph.to_program()
-        iters = 5
-        batch_size = 8
-
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
-
-        # Training the model to get the weights value
-        with fluid.scope_guard(scope):
-            for _ in range(iters):
-                data = next(train_reader())
-                loss_v = exe.run(binary,
-                                 feed=feeder.feed(data),
-                                 fetch_list=[loss])
-
-        # Freeze graph for inference, but the weight of fc/conv is still float type.
-        freeze_pass = QuantizationFreezePass(
-            scope=scope, place=place, weight_quantize_type=weight_quant_type)
-        freeze_pass.apply(test_graph)
-
-        # Transform quantized graph for MKL-DNN INT8 inference
-        mkldnn_int8_pass = FakeQAT2MkldnnINT8KernelPass(
-            _scope=scope, _place=place)
-        mkldnn_int8_pass.apply(test_graph)
-        dev_name = '_cpu_'
-        if not for_ci:
-            marked_nodes = set()
-            for op in test_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            test_graph.draw('.', 'test_mkldnn' + dev_name +
-                            activation_quant_type + '_' + weight_quant_type,
-                            marked_nodes)
-        mkldnn_program = test_graph.to_program()
-
-        # Check the transformation weights of conv2d and mul
-        conv_w_mkldnn = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
-        mul_w_mkldnn = np.array(scope.find_var('fc_0.w_0').get_tensor())
-        # Check if weights are still integer
-        self.assertFalse(self.isinteger(np.sum(conv_w_mkldnn)))
-        self.assertFalse(self.isinteger(np.sum(mul_w_mkldnn)))
-
-        # Check if the conv2d output and mul output are correctly linked to fake_dequantize's 
-        # output
-        self.check_program(mkldnn_program)
-        if not for_ci:
-            print('{}: {}'.format('w_mkldnn' + dev_name + activation_quant_type
-                                  + '_' + weight_quant_type, np.sum(w_mkldnn)))
-
-    def test_mkldnn_graph_cpu_static(self):
-        with fluid.unique_name.guard():
-            self.mkldnn_based_freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='range_abs_max',
-                weight_quant_type='abs_max',
-                for_ci=True)
-            self.mkldnn_based_freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='moving_average_abs_max',
-                weight_quant_type='abs_max',
-                for_ci=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
deleted file mode 100644
index 162048d74402514aa294ba7d98a59a3be62fa0cf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ /dev/null
@@ -1,553 +0,0 @@
-#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import os
-import unittest
-import random
-import numpy as np
-import paddle.fluid as fluid
-import six
-import paddle
-from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
-from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
-from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
-from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
-from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass
-from paddle.fluid import core
-
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-os.environ["CPU_NUM"] = "1"
-
-
-def linear_fc(num):
-    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = data
-    for _ in six.moves.xrange(num):
-        hidden = fluid.layers.fc(hidden, size=128, act='relu')
-    loss = fluid.layers.cross_entropy(input=hidden, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def residual_block(num):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = data
-    for _ in six.moves.xrange(num):
-        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
-        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
-        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    pool = fluid.layers.pool2d(
-        input=hidden, pool_size=2, pool_type='avg', pool_stride=2)
-    fc = fluid.layers.fc(input=pool, size=10)
-    loss = fluid.layers.cross_entropy(input=fc, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def conv_net(img, label, quant_skip_pattern):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='max',
-        act="relu")
-    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='avg',
-        act="relu")
-    hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu')
-    with fluid.name_scope(quant_skip_pattern):
-        prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_loss = fluid.layers.mean(loss)
-    return avg_loss
-
-
-class TestQuantizationTransformPass(unittest.TestCase):
-    def setUp(self):
-        self.quantizable_op_and_inputs = {
-            'conv2d': ['Input', 'Filter'],
-            'depthwise_conv2d': ['Input', 'Filter'],
-            'mul': ['X', 'Y']
-        }
-        self.quantizable_grad_op_inputs = {
-            'conv2d_grad': ['Input', 'Filter'],
-            'depthwise_conv2d_grad': ['Input', 'Filter'],
-            'mul_grad': ['X', 'Y']
-        }
-
-    def check_program(self, program):
-        quantized_ops = set()
-        for block in program.blocks:
-            for op in block.ops:
-                # check forward
-                if op.type in self.quantizable_op_and_inputs:
-                    for arg_name in op.input_arg_names:
-                        self.assertTrue(
-                            arg_name.endswith('.quantized.dequantized'))
-                        quantized_ops.add(arg_name)
-
-            for op in block.ops:
-                # check backward
-                if op.type in self.quantizable_grad_op_inputs:
-                    for pname in self.quantizable_grad_op_inputs[op.type]:
-                        arg_name = op.input(pname)[0]
-                        self.assertTrue(
-                            arg_name.endswith('.quantized.dequantized'))
-                        self.assertTrue(arg_name in quantized_ops)
-
-    def linear_fc_quant(self, activation_quant_type, for_ci=True):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = linear_fc(3)
-            opt = fluid.optimizer.Adam(learning_rate=0.001)
-            opt.minimize(loss)
-        place = fluid.CPUPlace()
-        graph = IrGraph(core.Graph(main.desc), for_test=False)
-        transform_pass = QuantizationTransformPass(
-            scope=fluid.global_scope(),
-            place=place,
-            activation_quantize_type=activation_quant_type)
-        transform_pass.apply(graph)
-        if not for_ci:
-            marked_nodes = set()
-            for op in graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            graph.draw('.', 'quantize_fc_' + activation_quant_type,
-                       marked_nodes)
-        program = graph.to_program()
-        self.check_program(program)
-        val_graph = IrGraph(core.Graph(program.desc), for_test=False)
-        if not for_ci:
-            val_marked_nodes = set()
-            for op in val_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    val_marked_nodes.add(op)
-            val_graph.draw('.', 'val_fc_' + activation_quant_type,
-                           val_marked_nodes)
-
-    def test_linear_fc_quant_abs_max(self):
-        self.linear_fc_quant('abs_max', for_ci=True)
-
-    def test_linear_fc_quant_range_abs_max(self):
-        self.linear_fc_quant('range_abs_max', for_ci=True)
-
-    def test_linear_fc_quant_moving_average_abs_max(self):
-        self.linear_fc_quant('moving_average_abs_max', for_ci=True)
-
-    def residual_block_quant(self, activation_quant_type, for_ci=True):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = residual_block(2)
-            opt = fluid.optimizer.Adam(learning_rate=0.001)
-            opt.minimize(loss)
-        place = fluid.CPUPlace()
-        graph = IrGraph(core.Graph(main.desc), for_test=False)
-        transform_pass = QuantizationTransformPass(
-            scope=fluid.global_scope(),
-            place=place,
-            activation_quantize_type=activation_quant_type)
-        transform_pass.apply(graph)
-        if not for_ci:
-            marked_nodes = set()
-            for op in graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            graph.draw('.', 'quantize_residual_' + activation_quant_type,
-                       marked_nodes)
-        program = graph.to_program()
-        self.check_program(program)
-        val_graph = IrGraph(core.Graph(program.desc), for_test=False)
-        if not for_ci:
-            val_marked_nodes = set()
-            for op in val_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    val_marked_nodes.add(op)
-            val_graph.draw('.', 'val_residual_' + activation_quant_type,
-                           val_marked_nodes)
-
-    def test_residual_block_abs_max(self):
-        self.residual_block_quant('abs_max', for_ci=True)
-
-    def test_residual_block_range_abs_max(self):
-        self.residual_block_quant('range_abs_max', for_ci=True)
-
-    def test_residual_block_moving_average_abs_max(self):
-        self.residual_block_quant('moving_average_abs_max', for_ci=True)
-
-
-class TestQuantizationFreezePass(unittest.TestCase):
-    def freeze_graph(self,
-                     use_cuda,
-                     seed,
-                     activation_quant_type,
-                     weight_quant_type='abs_max',
-                     for_ci=True,
-                     quant_skip_pattern='skip_quant'):
-        def build_program(main, startup, is_test):
-            main.random_seed = seed
-            startup.random_seed = seed
-            with fluid.unique_name.guard():
-                with fluid.program_guard(main, startup):
-                    img = fluid.layers.data(
-                        name='image', shape=[1, 28, 28], dtype='float32')
-                    label = fluid.layers.data(
-                        name='label', shape=[1], dtype='int64')
-                    loss = conv_net(img, label, quant_skip_pattern)
-                    if not is_test:
-                        opt = fluid.optimizer.Adam(learning_rate=0.001)
-                        opt.minimize(loss)
-            return [img, label], loss
-
-        random.seed(0)
-        np.random.seed(0)
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        test_program = fluid.Program()
-        feeds, loss = build_program(main, startup, False)
-        build_program(test_program, startup, True)
-        test_program = test_program.clone(for_test=True)
-        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
-        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        scope = fluid.Scope()
-        with fluid.scope_guard(scope):
-            exe.run(startup)
-        transform_pass = QuantizationTransformPass(
-            scope=scope,
-            place=place,
-            activation_quantize_type=activation_quant_type,
-            weight_quantize_type=weight_quant_type,
-            skip_pattern=quant_skip_pattern)
-        transform_pass.apply(main_graph)
-        transform_pass.apply(test_graph)
-        dev_name = '_gpu_' if use_cuda else '_cpu_'
-        if not for_ci:
-            marked_nodes = set()
-            for op in main_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            main_graph.draw('.', 'main' + dev_name + activation_quant_type + '_'
-                            + weight_quant_type, marked_nodes)
-            marked_nodes = set()
-            for op in test_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            test_graph.draw('.', 'test' + dev_name + activation_quant_type + '_'
-                            + weight_quant_type, marked_nodes)
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.memory_optimize = False
-        build_strategy.enable_inplace = False
-        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy)
-        quantized_test_program = test_graph.to_program()
-        iters = 5
-        batch_size = 8
-
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
-        with fluid.scope_guard(scope):
-            for _ in range(iters):
-                data = next(train_reader())
-                loss_v = exe.run(binary,
-                                 feed=feeder.feed(data),
-                                 fetch_list=[loss])
-                if not for_ci:
-                    print('{}: {}'.format('loss' + dev_name +
-                                          activation_quant_type + '_' +
-                                          weight_quant_type, loss_v))
-
-        test_data = next(test_reader())
-        with fluid.program_guard(quantized_test_program):
-            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
-                                             quantized_test_program)
-        # Testing
-        with fluid.scope_guard(scope):
-            test_loss1, w_quant = exe.run(program=quantized_test_program,
-                                          feed=feeder.feed(test_data),
-                                          fetch_list=[loss, w_var])
-
-        # Freeze graph for inference, but the weight of fc/conv is still float type.
-        freeze_pass = QuantizationFreezePass(
-            scope=scope, place=place, weight_quantize_type=weight_quant_type)
-        freeze_pass.apply(test_graph)
-        if not for_ci:
-            marked_nodes = set()
-            for op in test_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            test_graph.draw('.', 'test_freeze' + dev_name +
-                            activation_quant_type + '_' + weight_quant_type,
-                            marked_nodes)
-
-        server_program = test_graph.to_program()
-        with fluid.scope_guard(scope):
-            test_loss2, = exe.run(program=server_program,
-                                  feed=feeder.feed(test_data),
-                                  fetch_list=[loss])
-        self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
-        if not for_ci:
-            print(
-                '{}: {}'.format('test_loss1' + dev_name + activation_quant_type
-                                + '_' + weight_quant_type, test_loss1))
-            print(
-                '{}: {}'.format('test_loss2' + dev_name + activation_quant_type
-                                + '_' + weight_quant_type, test_loss2))
-        w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
-        # Maybe failed, this is due to the calculation precision
-        # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
-        if not for_ci:
-            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
-                                  + '_' + weight_quant_type, np.sum(w_freeze)))
-            print('{}: {}'.format('w_quant' + dev_name + activation_quant_type +
-                                  '_' + weight_quant_type, np.sum(w_quant)))
-
-        # Convert parameter to 8-bit.
-        convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
-        convert_int8_pass.apply(test_graph)
-        if not for_ci:
-            marked_nodes = set()
-            for op in test_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            test_graph.draw('.', 'test_int8' + dev_name + activation_quant_type
-                            + '_' + weight_quant_type, marked_nodes)
-        server_program_int8 = test_graph.to_program()
-        # Save the 8-bit parameter and model file.
-        with fluid.scope_guard(scope):
-            fluid.io.save_inference_model(
-                'server_int8' + dev_name + activation_quant_type + '_' +
-                weight_quant_type, ['image', 'label'], [loss], exe,
-                server_program_int8)
-            # Test whether the 8-bit parameter and model file can be loaded successfully.
-            [infer, feed, fetch] = fluid.io.load_inference_model(
-                'server_int8' + dev_name + activation_quant_type + '_' +
-                weight_quant_type, exe)
-        # Check the loaded 8-bit weight.
-        w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
-        self.assertEqual(w_8bit.dtype, np.int8)
-        self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
-        if not for_ci:
-            print('{}: {}'.format('w_8bit' + dev_name + activation_quant_type +
-                                  '_' + weight_quant_type, np.sum(w_8bit)))
-            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
-                                  + '_' + weight_quant_type, np.sum(w_freeze)))
-
-        mobile_pass = TransformForMobilePass()
-        mobile_pass.apply(test_graph)
-        if not for_ci:
-            marked_nodes = set()
-            for op in test_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            test_graph.draw('.', 'test_mobile' + dev_name +
-                            activation_quant_type + '_' + weight_quant_type,
-                            marked_nodes)
-
-        mobile_program = test_graph.to_program()
-        with fluid.scope_guard(scope):
-            fluid.io.save_inference_model(
-                'mobile_int8' + dev_name + activation_quant_type + '_' +
-                weight_quant_type, ['image', 'label'], [loss], exe,
-                mobile_program)
-
-    def test_freeze_graph_cuda_dynamic(self):
-        if fluid.core.is_compiled_with_cuda():
-            with fluid.unique_name.guard():
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='abs_max',
-                    weight_quant_type='abs_max',
-                    for_ci=True)
-            with fluid.unique_name.guard():
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='abs_max',
-                    weight_quant_type='channel_wise_abs_max',
-                    for_ci=True)
-
-    def test_freeze_graph_cpu_dynamic(self):
-        with fluid.unique_name.guard():
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='abs_max',
-                weight_quant_type='abs_max',
-                for_ci=True)
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='abs_max',
-                weight_quant_type='channel_wise_abs_max',
-                for_ci=True)
-
-    def test_freeze_graph_cuda_static(self):
-        if fluid.core.is_compiled_with_cuda():
-            with fluid.unique_name.guard():
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='range_abs_max',
-                    weight_quant_type='abs_max',
-                    for_ci=True)
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='moving_average_abs_max',
-                    weight_quant_type='abs_max',
-                    for_ci=True)
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='range_abs_max',
-                    weight_quant_type='channel_wise_abs_max',
-                    for_ci=True)
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='moving_average_abs_max',
-                    weight_quant_type='channel_wise_abs_max',
-                    for_ci=True)
-
-    def test_freeze_graph_cpu_static(self):
-        with fluid.unique_name.guard():
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='range_abs_max',
-                weight_quant_type='abs_max',
-                for_ci=True)
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='moving_average_abs_max',
-                weight_quant_type='abs_max',
-                for_ci=True)
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='range_abs_max',
-                weight_quant_type='channel_wise_abs_max',
-                for_ci=True)
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='moving_average_abs_max',
-                weight_quant_type='channel_wise_abs_max',
-                for_ci=True)
-
-
-class TestAddQuantDequantPass(unittest.TestCase):
-    def setUp(self):
-        self._target_ops = {'elementwise_add', 'pool2d'}
-        self._target_grad_ops = {'elementwise_add_grad', 'pool2d_grad'}
-
-    def check_graph(self, graph):
-        ops = graph.all_op_nodes()
-
-        for op_node in ops:
-            if op_node.name() in self._target_ops:
-                in_nodes_all_not_persistable = True
-                for input_name in op_node.input_arg_names():
-                    in_node = graph._find_node_by_name(op_node.inputs,
-                                                       input_name)
-                    in_nodes_all_not_persistable = (
-                        in_nodes_all_not_persistable and
-                        not in_node.persistable())
-                if not in_nodes_all_not_persistable:
-                    continue
-
-                if op_node.op().has_attr("pooling_type") and \
-                    op_node.op().attr("pooling_type") == 'max':
-                    continue
-
-                input_names = op_node.input_arg_names()
-                for input_name in input_names:
-                    self.assertTrue(input_name.endswith('.quant_dequant'))
-
-    def residual_block_quant(self, for_ci=True):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = residual_block(1)
-            opt = fluid.optimizer.Adam(learning_rate=0.001)
-            opt.minimize(loss)
-        place = fluid.CPUPlace()
-        graph = IrGraph(core.Graph(main.desc), for_test=False)
-        add_quant_dequant_pass = AddQuantDequantPass(
-            scope=fluid.global_scope(), place=place)
-        add_quant_dequant_pass.apply(graph)
-        if not for_ci:
-            marked_nodes = set()
-            for op in graph.all_op_nodes():
-                if op.name().find('quant') > -1:
-                    marked_nodes.add(op)
-            graph.draw('.', 'add_quant_dequant_graph', marked_nodes)
-        self.check_graph(graph)
-        program = graph.to_program()
-        val_graph = IrGraph(core.Graph(program.desc), for_test=False)
-        if not for_ci:
-            val_marked_nodes = set()
-            for op in val_graph.all_op_nodes():
-                if op.name().find('quant') > -1:
-                    val_marked_nodes.add(op)
-            val_graph.draw('.', 'val_add_quant_dequant_graph', val_marked_nodes)
-
-    def test_residual_block(self):
-        self.residual_block_quant(for_ci=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py
deleted file mode 100644
index 0739c9c1f7b9b7250e9743d496df5d29fb0d6ea9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py
+++ /dev/null
@@ -1,198 +0,0 @@
-#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import os
-import unittest
-import random
-import numpy as np
-import six
-import paddle.fluid as fluid
-import paddle
-from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
-from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
-from paddle.fluid.contrib.slim.quantization import ScaleForTrainingPass
-from paddle.fluid.contrib.slim.quantization import ScaleForInferencePass
-from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass
-from paddle.fluid import core
-
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-os.environ["CPU_NUM"] = "1"
-
-
-def residual_block(img, label, num=1):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    hidden = img
-    for _ in six.moves.xrange(num):
-        conv = conv_bn_layer(hidden, 20, 3, 1, 1, act=None, bias_attr=True)
-        short = conv_bn_layer(hidden, 20, 1, 1, 0, act=None)
-        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    fc = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=fc, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-class TestQuantizationScalePass(unittest.TestCase):
-    def quantization_scale(self,
-                           use_cuda,
-                           seed,
-                           activation_quant_type,
-                           weight_quant_type='abs_max',
-                           for_ci=False):
-        def build_program(main, startup, is_test):
-            main.random_seed = seed
-            startup.random_seed = seed
-            with fluid.unique_name.guard():
-                with fluid.program_guard(main, startup):
-                    img = fluid.layers.data(
-                        name='image', shape=[1, 28, 28], dtype='float32')
-                    label = fluid.layers.data(
-                        name='label', shape=[1], dtype='int64')
-                    loss = residual_block(img, label, 1)
-                    if not is_test:
-                        opt = fluid.optimizer.Adam(learning_rate=0.0001)
-                        opt.minimize(loss)
-            return [img, label], loss
-
-        random.seed(0)
-        np.random.seed(0)
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        test_program = fluid.Program()
-        feeds, loss = build_program(main, startup, False)
-        build_program(test_program, startup, True)
-        test_program = test_program.clone(for_test=True)
-        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
-        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        scope = fluid.Scope()
-        with fluid.scope_guard(scope):
-            exe.run(startup)
-
-        transform_pass = QuantizationTransformPass(
-            scope=scope,
-            place=place,
-            activation_quantize_type=activation_quant_type,
-            weight_quantize_type=weight_quant_type)
-        transform_pass.apply(main_graph)
-        transform_pass.apply(test_graph)
-
-        add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
-        add_quant_dequant_pass.apply(main_graph)
-        add_quant_dequant_pass.apply(test_graph)
-
-        scale_training_pass = ScaleForTrainingPass(scope=scope, place=place)
-        scale_training_pass.apply(main_graph)
-
-        dev_name = '_gpu' if use_cuda else '_cpu'
-        if not for_ci:
-            marked_nodes = set()
-            for op in main_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            main_graph.draw('.', 'main_scale' + dev_name, marked_nodes)
-            marked_nodes = set()
-            for op in test_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            test_graph.draw('.', 'test_scale' + dev_name, marked_nodes)
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.memory_optimize = False
-        build_strategy.enable_inplace = False
-        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy)
-        iters = 5
-        batch_size = 8
-
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
-        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
-        with fluid.scope_guard(scope):
-            for _ in range(iters):
-                data = next(train_reader())
-                loss_v = exe.run(binary,
-                                 feed=feeder.feed(data),
-                                 fetch_list=[loss])
-                if not for_ci:
-                    print('{}: {}'.format('loss' + dev_name, loss_v))
-
-        scale_inference_pass = ScaleForInferencePass(scope=scope)
-        scale_inference_pass.apply(test_graph)
-
-        # Freeze graph for inference, but the weight of fc/conv is still float type.
-        freeze_pass = QuantizationFreezePass(
-            scope=scope, place=place, weight_quantize_type=weight_quant_type)
-        freeze_pass.apply(test_graph)
-        server_program = test_graph.to_program()
-
-        if not for_ci:
-            marked_nodes = set()
-            for op in test_graph.all_op_nodes():
-                if op.name().find('quantize') > -1:
-                    marked_nodes.add(op)
-            test_graph.draw('.', 'quant_scale' + dev_name, marked_nodes)
-
-        with open('quant_scale_model' + dev_name + '.txt', 'w') as f:
-            f.write(str(server_program))
-
-        with fluid.scope_guard(scope):
-            fluid.io.save_inference_model('quant_scale_model' + dev_name,
-                                          ['image', 'label'], [loss], exe,
-                                          server_program)
-
-    def test_quant_scale_cuda(self):
-        if fluid.core.is_compiled_with_cuda():
-            with fluid.unique_name.guard():
-                self.quantization_scale(
-                    True,
-                    seed=1,
-                    activation_quant_type='moving_average_abs_max',
-                    weight_quant_type='channel_wise_abs_max',
-                    for_ci=True)
-
-    def test_quant_scale_cpu(self):
-        with fluid.unique_name.guard():
-            self.quantization_scale(
-                False,
-                seed=2,
-                activation_quant_type='moving_average_abs_max',
-                weight_quant_type='channel_wise_abs_max',
-                for_ci=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py
deleted file mode 100644
index a1ca7108ff08678236d6bbd17de6bd9408d8136c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import paddle
-import unittest
-import paddle.fluid as fluid
-from mobilenet import MobileNet
-from paddle.fluid.contrib.slim.core import Compressor
-from paddle.fluid.contrib.slim.graph import GraphWrapper
-
-
-class TestQuantizationStrategy(unittest.TestCase):
-    """
-    Test API of quantization strategy.
-    """
-
-    def test_compression(self):
-        self.quan("./quantization/compress.yaml")
-        self.quan("./quantization/compress_1.yaml")
-
-    def quan(self, config_file):
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        class_dim = 10
-        image_shape = [1, 28, 28]
-
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-
-        with fluid.program_guard(train_program, startup_program):
-            with fluid.unique_name.guard():
-                image = fluid.layers.data(
-                    name='image', shape=image_shape, dtype='float32')
-                image.stop_gradient = False
-                label = fluid.layers.data(
-                    name='label', shape=[1], dtype='int64')
-                out = MobileNet(name='quan').net(input=image,
-                                                 class_dim=class_dim)
-                print("out: {}".format(out.name))
-                acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-                acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-                cost = fluid.layers.cross_entropy(input=out, label=label)
-                avg_cost = fluid.layers.mean(x=cost)
-
-        val_program = train_program.clone(for_test=False)
-
-        optimizer = fluid.optimizer.Momentum(
-            momentum=0.9,
-            learning_rate=0.01,
-            regularization=fluid.regularizer.L2Decay(4e-5))
-
-        scope = fluid.Scope()
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(startup_program, scope=scope)
-
-        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
-
-        val_feed_list = [('img', image.name), ('label', label.name)]
-        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
-                                                        acc_top5.name)]
-
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=128)
-        train_feed_list = [('img', image.name), ('label', label.name)]
-        train_fetch_list = [('loss', avg_cost.name)]
-
-        com_pass = Compressor(
-            place,
-            scope,
-            train_program,
-            train_reader=train_reader,
-            train_feed_list=train_feed_list,
-            train_fetch_list=train_fetch_list,
-            eval_program=val_program,
-            eval_reader=val_reader,
-            eval_feed_list=val_feed_list,
-            eval_fetch_list=val_fetch_list,
-            train_optimizer=optimizer)
-        com_pass.config(config_file)
-        eval_graph = com_pass.run()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_slim_distillation_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_slim_distillation_strategy.py
deleted file mode 100644
index 094cc4c6ac8be582fc31d0436e4468d2ebbb235a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/test_slim_distillation_strategy.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import paddle
-import unittest
-import paddle.fluid as fluid
-from mobilenet import MobileNet
-from paddle.fluid.contrib.slim.core import Compressor
-from paddle.fluid.contrib.slim.graph import GraphWrapper
-
-
-class TestDistillationStrategy(unittest.TestCase):
-    """
-    Test API of distillation strategy.
-    """
-
-    def test_compression(self):
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        class_dim = 10
-        image_shape = [1, 28, 28]
-        image = fluid.layers.data(
-            name='image', shape=image_shape, dtype='float32')
-        image.stop_gradient = False
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        out = MobileNet(name="student").net(input=image, class_dim=class_dim)
-        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-        val_program = fluid.default_main_program().clone(for_test=False)
-
-        cost = fluid.layers.cross_entropy(input=out, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        optimizer = fluid.optimizer.Momentum(
-            momentum=0.9,
-            learning_rate=fluid.layers.piecewise_decay(
-                boundaries=[5, 10], values=[0.01, 0.001, 0.0001]),
-            regularization=fluid.regularizer.L2Decay(4e-5))
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
-
-        val_feed_list = [('img', image.name), ('label', label.name)]
-        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
-                                                        acc_top5.name)]
-
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=128)
-        train_feed_list = [('img', image.name), ('label', label.name)]
-        train_fetch_list = [('loss', avg_cost.name)]
-
-        # define teacher program
-        teacher_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(teacher_program, startup_program):
-            img = teacher_program.global_block()._clone_variable(
-                image, force_persistable=False)
-            predict = MobileNet(name="teacher").net(input=img,
-                                                    class_dim=class_dim)
-
-        exe.run(startup_program)
-
-        com_pass = Compressor(
-            place,
-            fluid.global_scope(),
-            fluid.default_main_program(),
-            train_reader=train_reader,
-            train_feed_list=train_feed_list,
-            train_fetch_list=train_fetch_list,
-            eval_program=val_program,
-            eval_reader=val_reader,
-            eval_feed_list=val_feed_list,
-            eval_fetch_list=val_fetch_list,
-            teacher_programs=[teacher_program.clone(for_test=True)],
-            train_optimizer=optimizer,
-            distiller_optimizer=optimizer)
-        com_pass.config('./distillation/compress.yaml')
-        eval_graph = com_pass.run()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt
deleted file mode 100644
index 7431b11817894ed002dd3ceb2de661dbe5c76be8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-foreach(src ${TEST_OPS})
-        py_test(${src} SRCS ${src}.py)
-endforeach()
diff --git a/python/paddle/fluid/contrib/tests/test_distributed_reader.py b/python/paddle/fluid/contrib/tests/test_distributed_reader.py
deleted file mode 100644
index 51e1455e71ecfe3f347977bea17a56e556c5ce0d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/tests/test_distributed_reader.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid as fluid
-import os
-
-
-def data_generator():
-    data = [0, 1, 2, 3]
-    for val in data:
-        yield val
-
-
-class TestDistributedReader(unittest.TestCase):
-    def test_distributed_reader(self):
-        trainer_num = 4
-        os.environ['PADDLE_TRAINER_ID'] = str(1)
-        os.environ['PADDLE_TRAINERS_NUM'] = str(trainer_num)
-
-        reader = fluid.contrib.reader.distributed_batch_reader(data_generator)
-        data = next(reader())
-        assert data == 1
-
-        os.unsetenv('PADDLE_TRAINER_ID')
-        os.unsetenv('PADDLE_TRAINERS_NUM')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
deleted file mode 100644
index e556f4d07d1b0f956043a830fcc608d596472763..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
+++ /dev/null
@@ -1,415 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import contextlib
-import math
-import sys
-import numpy
-import unittest
-import os
-import copy
-import numpy as np
-
-
-def resnet_cifar10(input, depth=32):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    def shortcut(input, ch_in, ch_out, stride):
-        if ch_in != ch_out:
-            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-        else:
-            return input
-
-    def basicblock(input, ch_in, ch_out, stride):
-        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
-        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
-        short = shortcut(input, ch_in, ch_out, stride)
-        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
-
-    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
-        tmp = block_func(input, ch_in, ch_out, stride)
-        for i in range(1, count):
-            tmp = block_func(tmp, ch_out, ch_out, 1)
-        return tmp
-
-    assert (depth - 2) % 6 == 0
-    n = (depth - 2) // 6
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    return pool
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
-    return fc2
-
-
-def train(net_type, use_cuda, save_dirname, is_local):
-    classdim = 10
-    data_shape = [3, 32, 32]
-
-    train_program = fluid.Program()
-    startup_prog = fluid.Program()
-    train_program.random_seed = 123
-    startup_prog.random_seed = 456
-    with fluid.program_guard(train_program, startup_prog):
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-        if net_type == "vgg":
-            print("train vgg net")
-            net = vgg16_bn_drop(images)
-        elif net_type == "resnet":
-            print("train resnet")
-            net = resnet_cifar10(images, 32)
-        else:
-            raise ValueError("%s network is not supported" % net_type)
-
-        logits = fluid.layers.fc(input=net, size=classdim, act="softmax")
-        cost, predict = fluid.layers.softmax_with_cross_entropy(
-            logits, label, return_softmax=True)
-        avg_cost = fluid.layers.mean(cost)
-        acc = fluid.layers.accuracy(input=predict, label=label)
-
-        # Test program
-        test_program = train_program.clone(for_test=True)
-
-        optimizer = fluid.optimizer.Lamb(learning_rate=0.001)
-
-        mp_optimizer = fluid.contrib.mixed_precision.decorate(
-            optimizer=optimizer,
-            init_loss_scaling=8.0,
-            use_dynamic_loss_scaling=True)
-
-        mp_optimizer.minimize(avg_cost)
-        scaled_loss = mp_optimizer.get_loss_scaling()
-
-    BATCH_SIZE = 128
-    PASS_NUM = 1
-
-    # no shuffle for unit test
-    train_reader = paddle.batch(
-        paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE)
-
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
-
-    def train_loop(main_program):
-        exe.run(startup_prog)
-        loss = 0.0
-        for pass_id in range(PASS_NUM):
-            for batch_id, data in enumerate(train_reader()):
-                np_scaled_loss, loss = exe.run(
-                    main_program,
-                    feed=feeder.feed(data),
-                    fetch_list=[scaled_loss, avg_cost])
-                print(
-                    'PassID {0:1}, BatchID {1:04}, train loss {2:2.4}, scaled train closs {3:2.4}'.
-                    format(pass_id, batch_id + 1,
-                           float(loss), float(np_scaled_loss)))
-                if (batch_id % 10) == 0:
-                    acc_list = []
-                    avg_loss_list = []
-                    for tid, test_data in enumerate(test_reader()):
-                        loss_t, acc_t = exe.run(program=test_program,
-                                                feed=feeder.feed(test_data),
-                                                fetch_list=[avg_cost, acc])
-                        if math.isnan(float(loss_t)):
-                            sys.exit("got NaN loss, training failed.")
-                        acc_list.append(float(acc_t))
-                        avg_loss_list.append(float(loss_t))
-                        break  # Use 1 segment for speeding up CI
-
-                    acc_value = numpy.array(acc_list).mean()
-                    avg_loss_value = numpy.array(avg_loss_list).mean()
-
-                    print(
-                        'PassID {0:1}, BatchID {1:04}, test loss {2:2.2}, acc {3:2.2}'.
-                        format(pass_id, batch_id + 1,
-                               float(avg_loss_value), float(acc_value)))
-
-                    if acc_value > 0.08:  # Low threshold for speeding up CI
-                        fluid.io.save_inference_model(
-                            save_dirname, ["pixel"], [predict],
-                            exe,
-                            main_program=train_program)
-                        return
-
-    if is_local:
-        train_loop(train_program)
-    else:
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def infer(use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        # The input's dimension of conv should be 4-D or 5-D.
-        # Use normilized image pixels as input data, which should be in the range [0, 1.0].
-        batch_size = 1
-        tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_img},
-                          fetch_list=fetch_targets)
-
-        print("infer results: ", results[0])
-
-        fluid.io.save_inference_model(save_dirname, feed_target_names,
-                                      fetch_targets, exe, inference_program)
-
-
-def main(net_type, use_cuda, is_local=True):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    # Directory for saving the trained model
-    save_dirname = "image_classification_" + net_type + ".inference.model"
-
-    train(net_type, use_cuda, save_dirname, is_local)
-    #infer(use_cuda, save_dirname)
-
-
-class TestImageClassification(unittest.TestCase):
-    def test_amp_lists(self):
-        white_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.white_list)
-        black_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.black_list)
-        gray_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.gray_list)
-
-        amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists()
-        self.assertEqual(amp_lists.white_list, white_list)
-        self.assertEqual(amp_lists.black_list, black_list)
-        self.assertEqual(amp_lists.gray_list, gray_list)
-
-    def test_amp_lists_1(self):
-        white_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.white_list)
-        black_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.black_list)
-        gray_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.gray_list)
-
-        # 1. w={'exp}, b=None
-        white_list.add('exp')
-        black_list.remove('exp')
-
-        amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
-            {'exp'})
-        self.assertEqual(amp_lists.white_list, white_list)
-        self.assertEqual(amp_lists.black_list, black_list)
-        self.assertEqual(amp_lists.gray_list, gray_list)
-
-    def test_amp_lists_2(self):
-        white_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.white_list)
-        black_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.black_list)
-        gray_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.gray_list)
-
-        # 2. w={'tanh'}, b=None
-        white_list.add('tanh')
-        gray_list.remove('tanh')
-
-        amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
-            {'tanh'})
-        self.assertEqual(amp_lists.white_list, white_list)
-        self.assertEqual(amp_lists.black_list, black_list)
-        self.assertEqual(amp_lists.gray_list, gray_list)
-
-    def test_amp_lists_3(self):
-        white_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.white_list)
-        black_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.black_list)
-        gray_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.gray_list)
-
-        # 3. w={'lstm'}, b=None
-        white_list.add('lstm')
-
-        amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
-            {'lstm'})
-        self.assertEqual(amp_lists.white_list, white_list)
-        self.assertEqual(amp_lists.black_list, black_list)
-        self.assertEqual(amp_lists.gray_list, gray_list)
-
-    def test_amp_lists_4(self):
-        white_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.white_list)
-        black_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.black_list)
-        gray_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.gray_list)
-
-        # 4. w=None, b={'conv2d'}
-        white_list.remove('conv2d')
-        black_list.add('conv2d')
-
-        amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
-            custom_black_list={'conv2d'})
-        self.assertEqual(amp_lists.white_list, white_list)
-        self.assertEqual(amp_lists.black_list, black_list)
-        self.assertEqual(amp_lists.gray_list, gray_list)
-
-    def test_amp_lists_5(self):
-        white_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.white_list)
-        black_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.black_list)
-        gray_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.gray_list)
-
-        # 5. w=None, b={'tanh'}
-        black_list.add('tanh')
-        gray_list.remove('tanh')
-
-        amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
-            custom_black_list={'tanh'})
-        self.assertEqual(amp_lists.white_list, white_list)
-        self.assertEqual(amp_lists.black_list, black_list)
-        self.assertEqual(amp_lists.gray_list, gray_list)
-
-    def test_amp_lists_6(self):
-        white_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.white_list)
-        black_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.black_list)
-        gray_list = copy.copy(
-            fluid.contrib.mixed_precision.fp16_lists.gray_list)
-
-        # 6. w=None, b={'lstm'}
-        black_list.add('lstm')
-
-        amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
-            custom_black_list={'lstm'})
-        self.assertEqual(amp_lists.white_list, white_list)
-        self.assertEqual(amp_lists.black_list, black_list)
-        self.assertEqual(amp_lists.gray_list, gray_list)
-
-    def test_amp_lists_7(self):
-        # 7. w={'lstm'} b={'lstm'}
-        # raise ValueError
-        self.assertRaises(ValueError,
-                          fluid.contrib.mixed_precision.AutoMixedPrecisionLists,
-                          {'lstm'}, {'lstm'})
-
-    def test_vgg_cuda(self):
-        with self.scope_prog_guard():
-            main('vgg', use_cuda=True)
-
-    def test_resnet_cuda(self):
-        with self.scope_prog_guard():
-            main('resnet', use_cuda=True)
-
-    @contextlib.contextmanager
-    def scope_prog_guard(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
deleted file mode 100644
index 77fdf0087b93c3ad44a2492de68f8f57ce243ef3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ /dev/null
@@ -1,281 +0,0 @@
-#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import numpy as np
-import six
-
-import unittest
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name
-from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler
-
-
-def linear_fc(num):
-    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = data
-    for _ in six.moves.xrange(num):
-        hidden = fluid.layers.fc(hidden, size=128, act='relu')
-    loss = fluid.layers.cross_entropy(input=hidden, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def residual_block(num):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = data
-    for _ in six.moves.xrange(num):
-        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
-        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
-        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    fc = fluid.layers.fc(input=hidden, size=10)
-    loss = fluid.layers.cross_entropy(input=fc, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_loss = fluid.layers.mean(loss)
-    return avg_loss
-
-
-class TestQuantizeTranspiler(unittest.TestCase):
-    def setUp(self):
-        # since quant_op and dequant_op is not ready, use cos and sin for test
-        self.weight_quant_op_type = 'fake_quantize_abs_max'
-        self.dequant_op_type = 'fake_dequantize_max_abs'
-        self.quantizable_op_and_inputs = {
-            'conv2d': ['Input', 'Filter'],
-            'depthwise_conv2d': ['Input', 'Filter'],
-            'mul': ['X', 'Y']
-        }
-        self.quantizable_op_grad_and_inputs = {
-            'conv2d_grad': ['Input', 'Filter'],
-            'depthwise_conv2d_grad': ['Input', 'Filter'],
-            'mul_grad': ['X', 'Y']
-        }
-
-    def check_program(self, program):
-        quantized_ops = {}
-
-        persistable_vars = [
-            v.name
-            for v in filter(lambda var: var.persistable, program.list_vars())
-        ]
-
-        for block in program.blocks:
-            for idx, op in enumerate(block.ops):
-                # check forward
-                if op.type in self.quantizable_op_and_inputs:
-                    for i, arg_name in enumerate(op.input_arg_names):
-                        quant_op_type = self.weight_quant_op_type if \
-                            _original_var_name(arg_name) \
-                            in persistable_vars else self.act_quant_op_type
-                        self.assertTrue(
-                            arg_name.endswith('.quantized.dequantized'))
-                        if arg_name not in quantized_ops:
-                            self.assertEqual(block.ops[idx - 2 * i - 1].type,
-                                             self.dequant_op_type)
-                            self.assertEqual(block.ops[idx - 2 * i - 2].type,
-                                             quant_op_type)
-                            quantized_ops[arg_name] = block.ops[idx - 2 * i - 2]
-                        else:
-                            op_idx = block.ops.index(quantized_ops[arg_name])
-                            self.assertLess(op_idx, idx)
-
-                # check backward
-                if op.type in self.quantizable_op_grad_and_inputs:
-                    for pname in self.quantizable_op_grad_and_inputs[op.type]:
-                        arg_name = op.input(pname)[0]
-                        self.assertTrue(
-                            arg_name.endswith('.quantized.dequantized'))
-                        self.assertTrue(arg_name in quantized_ops)
-
-    def linear_fc_quant(self, quant_type):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = linear_fc(3)
-            opt = fluid.optimizer.Adam(learning_rate=0.001)
-            opt.minimize(loss)
-            t = QuantizeTranspiler(activation_quantize_type=quant_type)
-            t.training_transpile(main)
-            self.check_program(main)
-
-    def test_linear_fc_quant_abs_max(self):
-        self.act_quant_op_type = 'fake_quantize_abs_max'
-        self.linear_fc_quant('abs_max')
-
-    def test_linear_fc_quant_range_abs_max(self):
-        self.act_quant_op_type = 'fake_quantize_range_abs_max'
-        self.linear_fc_quant('range_abs_max')
-
-    def residual_block_quant(self, quant_type):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = residual_block(2)
-            opt = fluid.optimizer.Adam(learning_rate=0.001)
-            opt.minimize(loss)
-            t = QuantizeTranspiler(activation_quantize_type=quant_type)
-            t.training_transpile(main)
-            self.check_program(main)
-
-    def test_residual_block_abs_max(self):
-        self.act_quant_op_type = 'fake_quantize_abs_max'
-        self.residual_block_quant('abs_max')
-
-    def test_residual_block_range_abs_max(self):
-        self.act_quant_op_type = 'fake_quantize_range_abs_max'
-        self.residual_block_quant('range_abs_max')
-
-    def freeze_program(self, use_cuda, seed):
-        def build_program(main, startup, is_test):
-            main.random_seed = seed
-            startup.random_seed = seed
-            with fluid.unique_name.guard():
-                with fluid.program_guard(main, startup):
-                    img = fluid.layers.data(
-                        name='image', shape=[1, 28, 28], dtype='float32')
-                    label = fluid.layers.data(
-                        name='label', shape=[1], dtype='int64')
-                    loss = conv_net(img, label)
-                    if not is_test:
-                        opt = fluid.optimizer.Adam(learning_rate=0.001)
-                        opt.minimize(loss)
-            return [img, label], loss
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        test_program = fluid.Program()
-
-        import random
-        random.seed(0)
-        np.random.seed(0)
-
-        feeds, loss = build_program(main, startup, False)
-        build_program(test_program, startup, True)
-        test_program = test_program.clone(for_test=True)
-
-        quant_type = 'range_abs_max'  # 'range_abs_max' or 'abs_max'
-        quant_transpiler = QuantizeTranspiler(
-            activation_quantize_type=quant_type)
-        quant_transpiler.training_transpile(main, startup)
-        quant_transpiler.training_transpile(test_program, startup)
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        iters = 5
-        batch_size = 8
-        class_num = 10
-        exe.run(startup)
-
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
-
-        with fluid.program_guard(main):
-            for _ in range(iters):
-                data = next(train_reader())
-                loss_v = exe.run(program=main,
-                                 feed=feeder.feed(data),
-                                 fetch_list=[loss])
-
-        with fluid.program_guard(test_program):
-            test_data = next(test_reader())
-            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
-                                             test_program)
-            # Testing during training
-            test_loss1, w_quant = exe.run(program=test_program,
-                                          feed=feeder.feed(test_data),
-                                          fetch_list=[loss, w_var])
-
-            # Freeze program for inference, but the weight of fc/conv is still float type.
-            quant_transpiler.freeze_program(test_program, place)
-            test_loss2, = exe.run(program=test_program,
-                                  feed=feeder.feed(test_data),
-                                  fetch_list=[loss])
-            self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
-            w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
-                                .get_tensor())
-            # fail: -432.0 != -433.0, this is due to the calculation precision
-            #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
-
-            # Convert parameter to 8-bit.
-            quant_transpiler.convert_to_int8(test_program, place)
-            # Save the 8-bit parameter and model file.
-            fluid.io.save_inference_model('model_8bit', ['image', 'label'],
-                                          [loss], exe, test_program)
-            # Test whether the 8-bit parameter and model file can be loaded successfully.
-            [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
-                                                                 exe)
-            # Check the loaded 8-bit weight.
-            w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
-                              .get_tensor())
-
-            self.assertEqual(w_8bit.dtype, np.int8)
-            self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
-
-    def not_test_freeze_program_cuda(self):
-        if fluid.core.is_compiled_with_cuda():
-            with fluid.unique_name.guard():
-                self.freeze_program(True, seed=1)
-
-    def not_test_freeze_program_cpu(self):
-        with fluid.unique_name.guard():
-            self.freeze_program(False, seed=2)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
deleted file mode 100644
index 2b331308de5ee9a8aa52a9e303bfbcf8d4264d5f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from functools import partial
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import contextlib
-
-
-def get_places():
-    places = [fluid.CPUPlace()]
-    if fluid.core.is_compiled_with_cuda():
-        places.append(fluid.CUDAPlace(0))
-    return places
-
-
-@contextlib.contextmanager
-def prog_scope_guard(main_prog, startup_prog):
-    scope = fluid.core.Scope()
-    with fluid.unique_name.guard():
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(main_prog, startup_prog):
-                yield
-
-
-def bow_net(data,
-            label,
-            dict_dim,
-            is_sparse=False,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2):
-    """
-    BOW net
-    This model is from https://github.com/PaddlePaddle/models:
-    fluid/PaddleNLP/text_classification/nets.py
-    """
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
-    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
-    bow_tanh = fluid.layers.tanh(bow)
-    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
-    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
-    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    return avg_cost
-
-
-class TestWeightDecay(unittest.TestCase):
-    def setUp(self):
-        self.word_dict = paddle.dataset.imdb.word_dict()
-        reader = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict), batch_size=2)()
-        self.train_data = [next(reader) for _ in range(5)]
-        self.learning_rate = .5
-
-    def run_program(self, place, feed_list):
-        exe = fluid.Executor(place)
-        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
-        exe.run(fluid.default_startup_program())
-
-        main_prog = fluid.default_main_program()
-        param_list = [var.name for var in main_prog.block(0).all_parameters()]
-
-        param_sum = []
-        for data in self.train_data:
-            out = exe.run(main_prog,
-                          feed=feeder.feed(data),
-                          fetch_list=param_list)
-            p_sum = 0
-            for v in out:
-                p_sum += np.sum(np.abs(v))
-            param_sum.append(p_sum)
-        return param_sum
-
-    def check_weight_decay(self, place, model):
-        main_prog = fluid.framework.Program()
-        startup_prog = fluid.framework.Program()
-        startup_prog.random_seed = 1
-        with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            avg_cost = model(data, label, len(self.word_dict))
-            AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
-                fluid.optimizer.Adam)
-
-            optimizer = AdamW(
-                learning_rate=self.learning_rate,
-                weight_decay=self.learning_rate)
-
-            optimizer.minimize(avg_cost)
-            param_sum = self.run_program(place, [data, label])
-
-        return param_sum
-
-    def check_weight_decay2(self, place, model):
-        main_prog = fluid.framework.Program()
-        startup_prog = fluid.framework.Program()
-        startup_prog.random_seed = 1
-        with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-            avg_cost = model(data, label, len(self.word_dict))
-
-            param_list = [(var, var * self.learning_rate)
-                          for var in main_prog.block(0).all_parameters()]
-
-            optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate)
-
-            optimizer.minimize(avg_cost)
-            for params in param_list:
-                updated_p = fluid.layers.elementwise_sub(
-                    x=params[0], y=params[1])
-                fluid.layers.assign(input=updated_p, output=params[0])
-
-            param_sum = self.run_program(place, [data, label])
-        return param_sum
-
-    def test_weight_decay(self):
-        for place in get_places():
-            model = partial(bow_net, is_sparse=False)
-            param_sum1 = self.check_weight_decay(place, model)
-            param_sum2 = self.check_weight_decay2(place, model)
-
-            for i in range(len(param_sum1)):
-                assert np.isclose(a=param_sum1[i], b=param_sum2[i], rtol=5e-5)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py
deleted file mode 100644
index d27b808438d53a004db4e85345a68c35d00fff98..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/trainer.py
+++ /dev/null
@@ -1,1258 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from ..wrapped_decorator import signature_safe_contextmanager
-import os
-import errno
-import shutil
-import six
-import time
-
-from .. import core
-from .. import data_feeder
-from .. import executor
-from .. import framework
-from .. import io
-# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
-from .. import optimizer as opt_module
-from .. import parallel_executor
-from ..transpiler import distribute_transpiler
-
-__all__ = [
-    'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent',
-    'EndStepEvent', 'CheckpointConfig'
-]
-
-
-class BeginEpochEvent(object):
-    """
-    The begin of a training epoch.
-
-    Args:
-        epoch_id(int): The current epoch ID.
-    """
-
-    def __init__(self, epoch_id):
-        self.epoch = epoch_id
-
-
-class EndEpochEvent(object):
-    """
-    The end of a training epoch.
-
-    Args:
-        epoch_id(int): The current epoch ID.
-    """
-
-    def __init__(self, epoch_id):
-        self.epoch = epoch_id
-
-
-class BeginStepEvent(object):
-    """
-    The begin of a training epoch.
-
-    Args:
-        epoch_id(int): The current epoch ID.
-        step_id(int): The current step ID.
-    """
-
-    def __init__(self, epoch_id, step_id):
-        self.epoch = epoch_id
-        self.step = step_id
-        self.fetch_metrics = True
-        """
-        If fetch_metrics is true, the metrics will be fetched at the
-        EndStepEvent. Default is True.
-        """
-
-
-class EndStepEvent(object):
-    """
-    The end of a training step.
-
-    Args:
-        epoch_id(int): The current epoch ID.
-        step_id(int): The current step ID.
-        metrics(list): A list of fetched tensor. The order of this list is same
-            as the :code:`train_func` returns.
-    """
-
-    def __init__(self, epoch_id, step_id, metrics):
-        self.epoch = epoch_id
-        self.step = step_id
-        self.metrics = metrics
-
-
-class CheckpointConfig(object):
-    """
-    Parameter object for :code:`save_checkpoint` and
-    :code:`fluid.Trainer`. Used to configuration how to save checkpoint.
-
-    Args:
-        checkpoint_dir(str): Directory path to save check point. Default is the
-            current directory.
-
-        max_num_checkpoints(int): The max number of local check points.
-        epoch_interval(int): Every number of epoch to save check point.
-        step_interval(int): Every number of step to save check point.
-
-    Examples:
-        >>> config = fluid.CheckpointConfig("./checkpoints")
-        >>> trainer = fluid.Trainer(train_func=train_program,
-        >>>                         place=place,
-        >>>                         optimizer_func=optimizer_func,
-        >>>                         checkpoint_config=config)
-        >>> trainer.train(...)
-    """
-
-    def __init__(self,
-                 checkpoint_dir=None,
-                 max_num_checkpoints=3,
-                 epoch_interval=1,
-                 step_interval=10):
-
-        assert epoch_interval >= 1
-        assert step_interval >= 1
-
-        self.checkpoint_dir = checkpoint_dir \
-            if checkpoint_dir is not None else os.getcwd()
-        self.max_num_checkpoints = max_num_checkpoints
-        self.epoch_interval = epoch_interval
-        self.step_interval = step_interval
-        self.epoch_id = 0
-        self.step_id = 0
-        self.load_serial = None
-        self.pserver_id = None
-        self.lookup_table_name = None
-
-
-def check_and_get_place(place):
-    """
-    Check the type of place or get the default place
-    Args:
-        place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on.
-
-    Raises:
-        TypeError if the type mismatched.
-
-    Returns:
-        the original place if it is not None.
-        if fluid is compiled with CUDA, returns CUDAPlace(0) by default.
-        Otherwise returns CPUPlace by default.
-    """
-    if place is None:
-        if core.is_compiled_with_cuda():
-            return core.CUDAPlace(0)
-        else:
-            return core.CPUPlace()
-    else:
-        if not isinstance(place, core.CUDAPlace) and not isinstance(
-                place, core.CPUPlace):
-            raise TypeError("Place should be either CUDAPlace or CPUPlace")
-        return place
-
-
-class Trainer(object):
-    """
-    A trainer wraps MultiGPU/MultiNode training loops and can be used to train a
-    simple neural network easily.
-
-    This API takes a :code:`train_func`. A :code:`train_func` is a function that
-    return loss as it first return value. The reset value can be fetched by
-    EndStepEvent.metrics
-
-    This API also takes a :code:`optimizer_func` that will return an optimizer
-    instance.
-
-    For example, to train a MLP for MNIST dataset, the sample program is
-
-    >>> import paddle.fluid as fluid
-    >>>
-    >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10):
-    >>>     hidden = image
-    >>>     for layer_size in layer_sizes:
-    >>>         hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation)
-    >>>     return fluid.layers.fc(input=hidden, size=num_classes, act="softmax")
-    >>>
-    >>> def train_mnist_mlp():
-    >>>     img = fluid.layers.data(name='image', shape=[784])
-    >>>     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    >>>     prediction = mlp(img)
-    >>>     return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label))
-    >>>
-    >>> def optimizer():
-    >>>     return fluid.optimizer.Adam()
-    >>>
-    >>> trainer = Trainer(train_func=train_mnist_mlp,
-    >>>                   optimizer_func=optimizer,
-    >>>                   place=fluid.CUDAPlace(0),
-    >>>                   parallel=True)
-    >>>
-    >>> def train_callback(event):
-    >>>     if isinstance(event, fluid.EndStepEvent):
-    >>>         print "Epoch ID", event.epoch, "Step ID",\
-    >>>             event.step, "AvgLoss", event.metrics[0]
-    >>>     elif isinstance(event, fluid.EndEpochEvent):
-    >>>         trainer.save_params("./model_{0}".format(event.epoch))
-    >>>
-    >>> trainer.train(num_epochs=100, event_handler=train_callback)
-
-    For more example, please see :ref:`api_guide_high_level_api`.
-
-
-    Args:
-        train_func(callable): A function which will return loss. The loss must be
-            a scalar tensor.
-        optimizer_func(callable): A function that returns an Optimizer object.
-        place(CUDAPlace|CPUPlace): The device place of this trainer. If
-            :code:`parallel=True,` all CUDA Places will be used if :code:`place`
-            is a :code:`CUDAPlace`.
-        parallel(bool): True if use multiple devices.
-        checkpoint_config(CheckpointConfig): Configuration about how to save
-            checkpoints.
-    """
-
-    def __init__(self,
-                 train_func,
-                 optimizer_func,
-                 param_path=None,
-                 place=None,
-                 parallel=False,
-                 checkpoint_config=None):
-        self.__stop = False
-        self.parallel = parallel
-
-        # config for checkpoint
-        # only chief worker will save variables
-        self.trainer_id = 0
-        self.checkpoint_cfg = checkpoint_config
-        if self.checkpoint_cfg:
-            assert isinstance(self.checkpoint_cfg, CheckpointConfig)
-            serial = _get_latest_checkpoint_serial(
-                self.checkpoint_cfg.checkpoint_dir)
-            self.checkpoint_cfg.load_serial = serial if serial >= 0 else None
-
-        self.scope = core.Scope()
-
-        # 1. we need to generate a framework.Program by calling
-        # program_func. Reference: fluid.program_guard in
-        # test_word2vec.py
-
-        self.startup_program = framework.Program()
-        self.train_program = framework.Program()
-
-        with framework.program_guard(self.train_program, self.startup_program):
-            program_func_outs = train_func()
-            self.train_func_outputs = program_func_outs if isinstance(
-                program_func_outs, list) else [program_func_outs]
-            self.test_program = self.train_program.clone(for_test=True)
-
-            # The first element of program_func_outs is loss.
-            loss = self.train_func_outputs[0]
-
-            optimizer = optimizer_func()
-            if not isinstance(optimizer, opt_module.Optimizer):
-                raise TypeError(
-                    "The optimizer should be an instance of Optimizer")
-            optimize_ops, params_grads = optimizer.minimize(loss)
-
-        self.place = check_and_get_place(place)
-
-        self._dist_transpile_if_necessary(optimize_ops, params_grads)
-
-        # 2. move the default_main_program to self.program and run the
-        # default_startup program on an empty core.Scope()
-        # Run startup program
-        with self._prog_and_scope_guard():
-            exe = executor.Executor(place)
-            exe.run(self.startup_program)
-
-        if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None:
-            self._load_checkpoint()
-
-        if param_path and os.path.isdir(param_path):
-            with self._prog_and_scope_guard():
-                # load params from param_path into scope
-                io.load_persistables(
-                    executor=exe,
-                    dirname=param_path,
-                    main_program=self.startup_program)
-
-    def _transpile_nccl2_dist(self):
-        # PADDLE_TRAINER_IPS
-        if "PADDLE_TRAINER_IPS" not in os.environ:
-            self.nccl_id_var = None
-        else:
-            self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-            port = os.getenv("PADDLE_PSERVER_PORT")
-            worker_ips = os.getenv("PADDLE_TRAINER_IPS")
-            worker_endpoints = []
-            for ip in worker_ips.split(","):
-                worker_endpoints.append(':'.join([ip, port]))
-            self.num_trainers = len(worker_endpoints)
-            current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
-            worker_endpoints.remove(current_endpoint)
-            # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id
-            # in ParallelExecutor to start
-            # distributed training using NCCL2
-            self.nccl_id_var = self.startup_program.global_block().create_var(
-                name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
-            self.startup_program.global_block().append_op(
-                type="gen_nccl_id",
-                inputs={},
-                outputs={"NCCLID": self.nccl_id_var},
-                attrs={
-                    "endpoint": current_endpoint,
-                    "endpoint_list": worker_endpoints,
-                    "trainer_id": self.trainer_id
-                })
-
-    def _dist_transpile_if_necessary(self, optimize_ops, params_grads):
-        self._transpile_nccl2_dist()
-        if self.nccl_id_var != None:
-            return
-
-        if "PADDLE_TRAINING_ROLE" not in os.environ:
-            return
-
-        # the port of all pservers, needed by both trainer and pserver
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        # comma separated ips of all pservers, needed by trainer and
-        # pserver
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)
-        # total number of workers/trainers in the job, needed by
-        # trainer and pserver
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        # the IP of the local machine, needed by pserver only
-        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
-        # the unique trainer id, starting from 0, needed by trainer
-        # only
-        self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
-
-        # the role, should be either PSERVER or TRAINER
-        training_role = os.getenv("PADDLE_TRAINING_ROLE")
-        with self._prog_and_scope_guard():
-            t = distribute_transpiler.DistributeTranspiler()
-            t.transpile(
-                self.trainer_id, pservers=pserver_endpoints, trainers=trainers)
-            if training_role == "PSERVER":
-                if self.checkpoint_cfg:
-                    pserver_id = eplist.index(current_endpoint)
-                    self.checkpoint_cfg.pserver_id = pserver_id
-                    if t.has_distributed_lookup_table:
-                        self.checkpoint_cfg.lookup_table_name = t.table_name
-
-                self.train_program = t.get_pserver_program(current_endpoint)
-                self.startup_program = t.get_startup_program(current_endpoint,
-                                                             self.train_program)
-            elif training_role == "TRAINER":
-                self.train_program = t.get_trainer_program()
-            else:
-                raise ValueError(
-                    'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
-                )
-
-    def stop(self):
-        """
-        stop training
-        """
-        self.__stop = True
-
-    def train(self, num_epochs, event_handler, reader=None, feed_order=None):
-        """
-        Start the train loop to train the model.
-
-        Args:
-            num_epochs(int): The number of epoch. An epoch will process all data in reader
-            event_handler(callable): The event handler. A function with type (ev:Event)->void
-            reader(callable): A reader creator object. See also
-                :ref:`api_guide_python_reader` .
-            feed_order(list): Feeding order of reader. None will following the defining
-                order in program
-
-        Returns:
-            None
-        """
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
-        if training_role == "PSERVER":
-            with self._prog_and_scope_guard():
-                exe = executor.Executor(self.place)
-                exe.run()
-                return
-        if self.parallel:
-            self._train_by_parallel_executor(num_epochs, event_handler, reader,
-                                             feed_order)
-        else:
-            self._train_by_executor(num_epochs, event_handler, reader,
-                                    feed_order)
-
-    def test(self, reader, feed_order):
-        """
-        Test the model on given test data
-
-        Args:
-            reader(callable): The reader that yields test data.
-            feed_order(list): Feeding order of reader. None will following the
-                defining order in program
-        """
-
-        return self._test_by_executor(reader, feed_order,
-                                      self.train_func_outputs)
-
-    def save_params(self, param_path):
-        """
-        Save all parameters into :code:`param_path`.
-
-        Args:
-            param_path(str): The path to save parameters.
-
-        Returns:
-            None
-        """
-        with self._prog_and_scope_guard():
-            exe = executor.Executor(self.place)
-            io.save_persistables(exe, dirname=param_path)
-
-    def save_inference_model(self, param_path, feeded_var_names,
-                             target_var_indexes):
-        """
-        Save model for cpp inference into :code:`param_path`.
-
-        Args:
-            param_path(str): The path to save parameters.
-            feeded_var_names(list(str)): The name of the vars that you
-                need to feed in before run program.
-            target_var_indexes(list(int)): the index of target var that
-                you need to return in trainer.train_func.
-        Returns:
-            None
-        """
-        with self._prog_and_scope_guard():
-            exe = executor.Executor(self.place)
-            target_vars = [
-                self.train_func_outputs[index] for index in target_var_indexes
-            ]
-            io.save_inference_model(param_path, feeded_var_names, target_vars,
-                                    exe)
-
-    @signature_safe_contextmanager
-    def _prog_and_scope_guard(self):
-        with framework.program_guard(
-                main_program=self.train_program,
-                startup_program=self.startup_program):
-            with executor.scope_guard(self.scope):
-                yield
-
-    def _train_by_executor(self, num_epochs, event_handler, reader, feed_order):
-        """
-        Train by Executor and single device.
-
-        Args:
-            num_epochs:
-            event_handler:
-            reader:
-            feed_order:
-
-        Returns:
-
-        """
-        with self._prog_and_scope_guard():
-            feed_var_list = build_feed_var_list(self.train_program, feed_order)
-            feeder = data_feeder.DataFeeder(
-                feed_list=feed_var_list, place=self.place)
-            exe = executor.Executor(self.place)
-            reader = feeder.decorate_reader(reader, multi_devices=False)
-            self._train_by_any_executor(event_handler, exe, num_epochs, reader)
-
-    def _train_by_any_executor(self, event_handler, exe, num_epochs, reader):
-        if self.checkpoint_cfg:
-            epochs = [
-                epoch_id for epoch_id in range(num_epochs)
-                if epoch_id >= self.checkpoint_cfg.epoch_id
-            ]
-        else:
-            epochs = [epoch_id for epoch_id in range(num_epochs)]
-
-        for epoch_id in epochs:
-            event_handler(BeginEpochEvent(epoch_id))
-            for step_id, data in enumerate(reader()):
-                if self.__stop:
-                    if self.checkpoint_cfg:
-                        self._clean_checkpoint()
-                    return
-
-                if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \
-                        and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id:
-                    continue
-
-                begin_event = BeginStepEvent(epoch_id, step_id)
-                event_handler(begin_event)
-                if begin_event.fetch_metrics:
-                    metrics = exe.run(feed=data,
-                                      fetch_list=[
-                                          var.name
-                                          for var in self.train_func_outputs
-                                      ])
-                else:
-                    metrics = exe.run(feed=data, fetch_list=[])
-
-                if self.checkpoint_cfg:
-                    self._save_checkpoint(epoch_id, step_id)
-                event_handler(EndStepEvent(epoch_id, step_id, metrics))
-            event_handler(EndEpochEvent(epoch_id))
-        if self.checkpoint_cfg:
-            self._clean_checkpoint()
-
-    def _test_by_executor(self, reader, feed_order, fetch_list):
-        with executor.scope_guard(self.scope):
-            feed_var_list = build_feed_var_list(self.test_program, feed_order)
-            feeder = data_feeder.DataFeeder(
-                feed_list=feed_var_list, place=self.place)
-            exe = executor.Executor(self.place)
-            accumulated = len(fetch_list) * [0]
-            count = 0
-            for data in reader():
-                outs = exe.run(program=self.test_program,
-                               feed=feeder.feed(data),
-                               fetch_list=fetch_list)
-                accumulated = [x[0] + x[1][0] for x in zip(accumulated, outs)]
-                count += 1
-
-            return [x / count for x in accumulated]
-
-    def _train_by_parallel_executor(self, num_epochs, event_handler, reader,
-                                    feed_order):
-        with self._prog_and_scope_guard():
-            pe = self._get_or_create_parallel_executor()
-            feed_var_list = build_feed_var_list(self.train_program, feed_order)
-            feeder = data_feeder.DataFeeder(
-                feed_list=feed_var_list, place=self.place)
-            reader = feeder.decorate_reader(reader, multi_devices=True)
-            self._train_by_any_executor(event_handler, pe, num_epochs, reader)
-
-    def _get_parallel_executor(self):
-        return getattr(self, 'parallel_executor', None)
-
-    def _get_or_create_parallel_executor(self):
-        if self._get_parallel_executor() is None:
-            self.parallel_executor = parallel_executor.ParallelExecutor(
-                use_cuda=isinstance(self.place, core.CUDAPlace),
-                loss_name=self.train_func_outputs[0].name)
-        return self._get_parallel_executor()
-
-    def _clean_checkpoint(self):
-        assert self.checkpoint_cfg
-        clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir)
-
-    def _get_checkpoint_load_args(self):
-        """
-        epoch_id and step_id are runtime arguments, they are not variables, will load them independently.
-        """
-        return ["epoch_id", "step_id"]
-
-    def _get_checkpoint_save_args(self, epoch_id, step_id):
-        """
-        epoch_id and step_id are runtime arguments, they are not variables, will save them independently.
-        """
-        trainer_args = {}
-        trainer_args["epoch_id"] = epoch_id
-        trainer_args["step_id"] = step_id
-        return trainer_args
-
-    def _save_checkpoint(self, epoch_id, step_id):
-        assert self.checkpoint_cfg
-
-        if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \
-                and step_id % self.checkpoint_cfg.step_interval == 0:
-            exe = executor.Executor(self.place)
-            save_checkpoint(
-                executor=exe,
-                checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
-                trainer_id=self.trainer_id,
-                trainer_args=self._get_checkpoint_save_args(epoch_id, step_id),
-                main_program=self.train_program,
-                max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints)
-
-    def _load_checkpoint(self):
-        with self._prog_and_scope_guard():
-            exe = executor.Executor(self.place)
-            load_checkpoint(
-                executor=exe,
-                checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
-                main_program=self.startup_program)
-
-            if not self.checkpoint_cfg.pserver_id:
-                load_trainer_args = self._get_checkpoint_load_args()
-                trainer_args = load_checkpoint(
-                    executor=exe,
-                    checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
-                    main_program=self.startup_program,
-                    role_id=self.trainer_id,
-                    is_trainer=True,
-                    load_trainer_args=load_trainer_args)
-
-                if len(trainer_args) != 2:
-                    raise ValueError(
-                        "the return trainer_args length do not equal _get_checkpoint_load_args"
-                    )
-                self.checkpoint_cfg.epoch_id = int(trainer_args[0])
-                self.checkpoint_cfg.step_id = int(trainer_args[1])
-            else:
-                if self.checkpoint_cfg.lookup_table_name:
-                    load_checkpoint(
-                        executor=exe,
-                        checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
-                        main_program=self.startup_program,
-                        role_id=self.checkpoint_cfg.pserver_id,
-                        is_trainer=False,
-                        load_trainer_args=None,
-                        load_lookup_table=self.checkpoint_cfg.lookup_table_name)
-
-
-def build_feed_var_list(program, feed_order):
-    if not isinstance(program, framework.Program):
-        raise TypeError("The 'program' should be an object of Program")
-
-    if isinstance(feed_order, list):
-        feed_var_list = [
-            program.global_block().var(var_name) for var_name in feed_order
-        ]
-    else:
-        if not isinstance(feed_order, dict):
-            raise TypeError(
-                "The 'feed_order' should be either None, list or dict.")
-        if not sorted(feed_order.values()) == list(range(len(feed_order))):
-            raise ValueError(
-                "The values of 'feed_order' should be a permutation of [0, len(feed_order))"
-            )
-        sorted_pair_list = sorted(
-            six.iteritems(feed_order), key=lambda item: item[1])
-        feed_var_list = [
-            program.global_block().var(pair[0]) for pair in sorted_pair_list
-        ]
-    return feed_var_list
-
-
-# move Checkpoint APIs from io.py to trainer.py, make all of them are private.
-SUCCESS_MARK_FILENAME = "_SUCCESS"
-CHECKPOINT_PREFIX = "checkpoint"
-MODEL_DIR = "__model__"
-LOOKUP_TABLE_DIR = "__lookup_table__"
-TRAINER_PREFIX = "trainer"
-CHECKPOINT_SEPARATOR = "_"
-
-
-def save_checkpoint(executor,
-                    checkpoint_dir,
-                    trainer_id,
-                    main_program,
-                    trainer_args=None,
-                    max_num_checkpoints=3,
-                    lookup_table=None,
-                    pserver_endpoints=None):
-    """
-    This function filters out all checkpoint variables from the give
-    main_program and then saves these variables to the `checkpoint_dir`
-    directory.
-
-    In the training precess, we generally save a checkpoint in each
-    iteration. So there might be a lot of checkpoints in the
-    `checkpoint_dir`. To avoid them taking too much disk space, the
-    `max_num_checkpoints` are introduced to limit the total number of
-    checkpoints. If the number of existing checkpints is greater than
-    the `max_num_checkpoints`, oldest ones will be scroll deleted.
-
-    A variable is a checkpoint variable and will be saved if it meets
-    all following conditions:
-        1. It's persistable.
-        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
-        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
-
-    Args:
-        executor(Executor): The executor to run for save checkpoint.
-        checkpoint_dir(str): The folder where to save checkpoints.
-        trainer_id(int): currect trainer id, if id is equal to 0, the trainer
-            is chief.
-        trainer_args(dict|None): Current training arguments. Such as 'epoch_id'
-            and 'step_id'.
-            Defaut: None
-        main_program(Program): The program whose checkpoint variables will
-            be saved.
-        max_num_checkpoints(int): The max number of total number of existing
-            checkpoints.
-            Default: 3
-        lookup_table(string|None): the lookup table name, when use distribute
-            lookup table, we can get lookup table name by DistributeTranspiler.
-            table_name
-        pserver_endpoints(list|None): the parameter server ip:port list.
-            when use distribute lookup table, we can get pserver_endpoints by
-            distribute arguments.
-
-    Returns:
-        None
-
-    Raises:
-        ValueError: If `checkpoint_dir` is None.
-        AssertionError: If `trainer_args` is not a dict.
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            path = "./checkpoints"
-            prog = fluid.default_main_program()
-            trainer_args = {"epoch_id": 200,
-                            "step_id": 20} # just an example
-            table_name = "share_w"
-            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
-
-            save_checkpoint(executor=exe,
-                                     checkpoint_dir=path,
-                                     trainer_id=0,
-                                     trainer_args=trainer_args,
-                                     main_program=prog,
-                                     max_num_checkpoints=3,
-                                     lookup_table=table_name,
-                                     pserver_endpoints = ps_endpoints)
-    """
-    if checkpoint_dir is None:
-        raise ValueError("'checkpoint_dir' should not be None")
-
-    if main_program is None:
-        raise ValueError('main_program should not be None.')
-
-    if trainer_args:
-        assert isinstance(trainer_args, dict)
-
-    is_chief = trainer_id == 0
-
-    _make_chekcpoint_dirs(checkpoint_dir)
-    serial = _get_latest_checkpoint_serial(checkpoint_dir) + 1
-    cur_dir = _get_serial_dir(checkpoint_dir, serial)
-
-    _save_trainer_args(cur_dir, trainer_id, trainer_args)
-
-    if is_chief:
-        _save_persist_vars_without_grad(executor, cur_dir, main_program)
-
-    if is_chief and lookup_table and pserver_endpoints:
-        _save_pserver_vars_by_notify(executor, cur_dir, lookup_table,
-                                     pserver_endpoints)
-
-    _scroll_delete(checkpoint_dir, max_num_checkpoints)
-
-
-def load_checkpoint(executor,
-                    checkpoint_dir,
-                    main_program,
-                    role_id=0,
-                    is_trainer=True,
-                    load_trainer_args=None,
-                    load_lookup_table=None):
-    """
-    This function filters out all checkpoint variables from the give
-    main_program and then try to load these variables from the
-    `checkpoint_dir` directory.
-
-    In the training precess, we generally save a checkpoint in each
-    iteration. So there are more than one checkpoint in the
-    `checkpoint_dir` (each checkpoint has its own sub folder), use
-    `serial` to specify which serial of checkpoint you would like to
-    load.
-
-    A variable is a checkpoint variable and will be loaded if it meets
-    all following conditions:
-        1. It's persistable.
-        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
-        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
-
-    Args:
-        executor(Executor): The executor to run for loading checkpoint.
-        checkpoint_dir(str): The folder where all checkpoints are.
-        serial(int): The serial of checkpoint you would like to load.
-        main_program(Program): The program whose checkpoint variables will
-                               be loaded.
-        role_id(int):  the trainer id or the parameter server id.
-        is_trainer(bool): trainer is True and parameter server is False.
-        load_trainer_args(list|None): list about load trainer args.
-        load_lookup_table(str|None): the lookup table name
-
-    Returns:
-        None
-
-    Raises:
-        ValueError: If `checkpoint_dir` is None.
-        ValueError: If `main_program` is None.
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            path = "./checkpoints"
-            prog = fluid.default_main_program()
-            load_checkpoint(executor=exe, checkpoint_dir=path,
-                    serial=9, main_program=prog)
-
-            # In this example, `load_checkpoint` function
-            # will first filters out all checkpoint variables in the default
-            # main program, and then try to load these variables form the
-            # folder "./checkpoints/checkpoint_9/__model__".
-    """
-
-    if checkpoint_dir is None:
-        raise ValueError("'checkpoint_dir' should not be None")
-
-    serial = _get_latest_checkpoint_serial(checkpoint_dir)
-
-    # there are nothing  need to be loaded
-    if serial is None or serial < 0:
-        return
-
-    if main_program is None:
-        raise ValueError('main_program should not be None.')
-
-    if is_trainer and load_trainer_args is None:
-        cur_dir = _get_serial_dir(checkpoint_dir, serial)
-        _load_persist_vars_without_grad(executor, cur_dir, main_program, True)
-        return
-
-    if is_trainer and load_trainer_args:
-        return _load_trainer_args(checkpoint_dir, serial, role_id,
-                                  load_trainer_args)
-
-    if not is_trainer and load_lookup_table:
-        _load_lookup_table_vars(executor, checkpoint_dir, main_program, role_id,
-                                load_lookup_table)
-
-
-def clean_checkpoint(checkpoint_dir, delete_dir=False):
-    """
-    clean the checkpoint dir, when the train exits normally,
-    the trainer will call clean_checkpoint to delete checkpoint directory saved before.
-    delete_dir only works when the directory is empty, otherwise, OSError is raised.
-
-    : param checkpoint_dir
-    : param delete_dir
-    """
-
-    if checkpoint_dir is None:
-        raise ValueError("'checkpoint_dir' should not be None")
-    _scroll_delete(checkpoint_dir, max_num_checkpoints=0)
-
-    if delete_dir and not os.listdir(checkpoint_dir):
-        os.rmdir(checkpoint_dir)
-
-
-def _load_persist_vars_without_grad(executor,
-                                    dirname,
-                                    program,
-                                    has_model_dir=False):
-    """
-    This function filters out all checkpoint variables from the give
-    program and then trys to load these variables from the given directory.
-
-    A variable is a checkpoint variable if it meets all following
-    conditions:
-        1. It's persistable.
-        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
-        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
-
-    Args:
-        executor(Executor): The executor to run for loading variables.
-        dirname(str): The directory path.
-        program(Program): The program whose checkpoint variables will
-                          be loaded.
-        has_model_dir(bool): if True, the function loads variables
-                             from a sub directory named '__model__'.
-                             Default: False
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = fluid.default_main_program()
-            _load_persist_vars_without_grad(executor=exe,
-                    dirname=param_path, program=prog, has_model_dir=True)
-
-            # In this example, `_load_persist_vars_without_grad` function
-            # will first filters out all checkpoint variables in the default
-            # main program, and then trys to load these variables form the
-            # folder "./my_paddle_model/__model__".
-    """
-
-    if has_model_dir:
-        dirname = _get_model_dir(dirname)
-
-    io.load_vars(
-        executor,
-        dirname=dirname,
-        main_program=program,
-        predicate=_is_checkpoint_var,
-        filename=None)
-
-
-def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
-    """
-    The parameter server will load lookup table's local file in
-    selectedrows variable.
-
-    Args:
-        executor(Executor): The executor to run for loading persistable variables
-        dirname(str): The directory path
-        main_program(Program): Find the variable named table_name in main_program
-        pserver_id(int): the serial number in pserver_endpoints list
-        table_name(str): lookup table name
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            dirname = "./checkpoints/checkpoint_9/"
-            prog = fluid.default_main_program()
-            pserver_id = 1
-            table_name = "share_w"
-            _load_lookup_table_vars(executor=exe,
-                    dirname=dirname, program=prog, pserver_id=pserver_id,
-                    table_name=table_name)
-    """
-
-    for var in program.list_vars():
-        if var.name == table_name:
-            lookup_table_var = var
-            break
-
-    assert lookup_table_var is not None
-
-    lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
-    table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)
-
-    load_prog = framework.Program()
-    load_block = load_prog.global_block()
-
-    load_block.append_op(
-        type='load',
-        inputs={},
-        outputs={'Out': [lookup_table_var]},
-        attrs={'file_path': os.path.join(lookup_table_dir, table_file)})
-
-    executor.run(load_prog)
-
-
-def _save_persist_vars_without_grad(executor, dirname, program):
-    """
-    This function filters out all checkpoint variables from the give
-    program and then save these variables to a sub-folder '__model__' of
-    the given directory.
-
-    A variable is a checkpoint variable if it meets all following
-    conditions:
-        1. It's persistable.
-        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
-        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
-
-    Args:
-        executor(Executor): The executor to run for saving variables.
-        dirname(str): The directory path.
-        program(Program): The program whose checkpoint variables will
-                          be saved.
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = fluid.default_main_program()
-            _save_persist_vars_without_grad(executor=exe,
-                    dirname=param_path, program=prog)
-
-            # In this example, `_save_persist_vars_without_grad` function
-            # will first filters out all checkpoint variables in the default
-            # main program, and then saves these variables to the folder
-            # "./my_paddle_model/__model__".
-    """
-    cur_dir = _get_model_dir(dirname)
-    io.save_vars(
-        executor,
-        dirname=cur_dir,
-        main_program=program,
-        vars=None,
-        predicate=_is_checkpoint_var,
-        filename=None)
-    _write_success(cur_dir)
-
-
-def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
-                                 ps_endpoint_list):
-    """
-    This function will send checkpoint notify message from Trainer 0
-    to all the pservers.
-    The checkpoint notify message contains lookup table name,
-    the absolute path on pserver to save lookup_table.
-
-    Args:
-        executor(Executor): The executor to run for send checkpoint notify.
-        dirname(str): The folder where to save checkpoints.
-        lookup_table(string): the lookup table name, when use distribute
-            lookup table, we can get lookup table name by DistributeTranspiler.
-            table_name
-        ps_endpoint_list(list): the parameter server ip:port list.
-            when use distribute lookup table, we can get ps_endpoint_list by
-            distribute arguments.
-    Return:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = fluid.default_main_program()
-            table_name = "share_w"
-            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
-
-            _save_pserver_vars_by_notify(executor=exe,
-                    dirname=param_path, lookup_table=table_name,
-                    ps_endpoint_list=ps_endpoints)
-    """
-    cur_dir = _get_lookuptable_dir(dirname)
-
-    checkpoint_notify_program = framework.Program()
-    checkpoint_notify_block = checkpoint_notify_program.global_block()
-
-    attrs = {}
-    attrs['epmap'] = ps_endpoint_list
-    attrs['dir'] = cur_dir
-    attrs['lookup_table'] = lookup_table
-
-    checkpoint_notify_block.append_op(
-        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
-    executor.run(checkpoint_notify_program)
-
-
-def _save_trainer_args(dirname, trainer_id, trainer_args):
-    assert isinstance(trainer_args, dict)
-
-    cur_dir = _get_trainer_dir(dirname, trainer_id)
-
-    for name, value in six.iteritems(trainer_args):
-        args_file = os.path.join(cur_dir, name)
-        with open(args_file, 'w') as f:
-            f.write(str(value))
-    _write_success(cur_dir)
-
-
-def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
-    """
-    trainer will load some args from it's independent directory,
-    such as epoch_id and step_id.
-
-    Args:
-        checkpoint_dir(str): The folder where all checkpoints are.
-        serial(int): The serial of checkpoint you would like to load.
-        trainer_id(int): current trainer id.
-        trainer_args(list): list about load trainer args
-    Return:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            param_path = "./checkpoint/"
-            serial = 7
-            trainer_id = 2
-            trainer_args = ["epoch_id", "step_id"]
-
-            _load_trainer_args(checkpoint_dir=param_path, serial=serial,
-            trainer_id=trainer_id, trainer_args=trainer_args)
-    """
-    assert isinstance(trainer_args, list)
-
-    cur_dir = _get_serial_dir(checkpoint_dir, serial)
-    cur_dir = _get_trainer_dir(cur_dir, trainer_id)
-
-    ret_values = []
-
-    for arg in trainer_args:
-        cur_file = os.path.join(cur_dir, arg)
-        with open(cur_file, 'r') as f:
-            contents = f.read()
-            ret_values.append(contents.strip())
-    return ret_values
-
-
-def _is_checkpoint_var(var):
-    """
-    the checkpoint will not save or load all the variables.
-    var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
-
-    : param var(Variable)
-    """
-    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-            var.desc.type() == core.VarDesc.VarType.RAW:
-        return False
-    # @GRAD are named for gradient variables, checkpoint will not save it.
-    if "@GRAD" in var.name:
-        return False
-    # .trainer_ are named for distribute train variables, checkpoint will not save it.
-    if ".trainer_" in var.name:
-        return False
-
-    # .block is named for distribute train variables, checkpoint will not save it.
-    if ".block" in var.name:
-        return False
-
-    return var.persistable
-
-
-def _make_chekcpoint_dirs(dirs):
-    """
-    _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it.
-    """
-    assert dirs is not None
-
-    if os.path.isfile(dirs):
-        raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs)
-
-    if not os.path.isdir(dirs):
-        try:
-            os.makedirs(dirs)
-        except OSError as err:
-            if err.errno != errno.EEXIST:
-                raise err
-
-
-def _get_dir_serial(dirname):
-    _, serial = dirname.split(CHECKPOINT_SEPARATOR)
-
-    try:
-        serial_num = int(serial)
-    except ValueError:
-        serial_num = -1
-    return serial_num
-
-
-def _get_serial_dir(dirname, serial):
-    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
-    serial_dir = os.path.join(dirname, serial_folder)
-    _make_chekcpoint_dirs(serial_dir)
-
-    return serial_dir
-
-
-def _get_model_dir(dirname):
-    model_dir = os.path.join(dirname, MODEL_DIR)
-    _make_chekcpoint_dirs(model_dir)
-    return model_dir
-
-
-def _get_lookuptable_dir(dirname):
-    lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
-    _make_chekcpoint_dirs(lookuptable_dir)
-    return lookuptable_dir
-
-
-def _get_trainer_dir(dirname, trainer_id):
-    trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
-    trainer_dir = os.path.join(dirname, trainer_folder)
-    _make_chekcpoint_dirs(trainer_dir)
-    return trainer_dir
-
-
-def _scroll_delete(dirname, max_num_checkpoints=3):
-    dirs = os.listdir(dirname)
-    serial_map = {}
-    for serial in dirs:
-        serial_num = _get_dir_serial(serial)
-        serial_map[serial_num] = serial
-
-    if len(list(serial_map.keys())) <= max_num_checkpoints:
-        return
-
-    serials = list(serial_map.keys())
-    serials.sort(reverse=True)
-    serials = serials[max_num_checkpoints:]
-    for serial in serials:
-        cur_dir = _get_serial_dir(dirname, serial)
-        try:
-            shutil.rmtree(cur_dir)
-        except OSError as err:
-            if err.errno != errno.ENOENT:
-                raise err
-
-
-def _write_success(dirname):
-    """
-    write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct.
-
-    : param dirname
-    """
-    success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
-    with open(success_file, 'a') as f:
-        now = time.ctime()
-        f.write(now)
-
-
-def _get_latest_checkpoint_serial(checkpoint_dir):
-    """
-    get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory
-
-    : param checkpoint_dir
-    """
-    if not checkpoint_dir:
-        return -1
-
-    def has_success(checkpoint_dir, cur_dir):
-        """
-        is _SUCCESS in this dir
-        """
-
-        serial = _get_dir_serial(cur_dir)
-        if serial == -1 or not os.path.isdir(
-                os.path.join(checkpoint_dir, cur_dir)):
-            return -1
-
-        success_path = os.path.join(
-            _get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
-            SUCCESS_MARK_FILENAME)
-        if os.path.isfile(success_path):
-            return serial
-
-    if not os.path.isdir(checkpoint_dir):
-        return -1
-
-    current_dir = -1
-    dirs = os.listdir(checkpoint_dir)
-    for cur_dir in dirs:
-        success_num = has_success(checkpoint_dir, cur_dir)
-        if success_num > current_dir:
-            current_dir = success_num
-    return current_dir
diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py
deleted file mode 100644
index 1c1c2fb22709189ca03dc543ca551257c8031c1a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/utils/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from . import lookup_table_utils
-from .lookup_table_utils import *
-from . import hdfs_utils
-from .hdfs_utils import *
-
-__all__ = []
-__all__ += lookup_table_utils.__all__
-__all__ += hdfs_utils.__all__
diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py
deleted file mode 100644
index 962a5653f6135209de4e82d73b39cd3e8f8c9499..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/utils/hdfs_utils.py
+++ /dev/null
@@ -1,603 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""hdfs_utils.py will move to fluid/incubate/fleet/utils/hdfs.py"""
-
-import os
-import sys
-import subprocess
-import multiprocessing
-from datetime import datetime
-
-import re
-import copy
-import errno
-
-import logging
-from paddle.fluid.log_helper import get_logger
-
-__all__ = ["HDFSClient", "multi_download", "multi_upload"]
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class HDFSClient(object):
-    """
-    A tool of HDFS 
-
-    Args:
-        hadoop_home (string): hadoop_home 
-        configs (dict): hadoop config, it is a dict, please contain \
-            key "fs.default.name" and "hadoop.job.ugi"
-        Can be a float value
-    Examples:
-        hadoop_home = "/home/client/hadoop-client/hadoop/"
-
-        configs = {
-            "fs.default.name": "hdfs://xxx.hadoop.com:54310",
-            "hadoop.job.ugi": "hello,hello123"
-        }
-
-        client = HDFSClient(hadoop_home, configs)
-
-        client.ls("/user/com/train-25")
-        files = client.lsr("/user/com/train-25/models")
-    """
-
-    def __init__(self, hadoop_home, configs):
-        self.pre_commands = []
-        hadoop_bin = '%s/bin/hadoop' % hadoop_home
-        self.pre_commands.append(hadoop_bin)
-        dfs = 'fs'
-        self.pre_commands.append(dfs)
-
-        for k, v in configs.items():
-            config_command = '-D%s=%s' % (k, v)
-            self.pre_commands.append(config_command)
-
-    def __run_hdfs_cmd(self, commands, retry_times=5):
-        whole_commands = copy.deepcopy(self.pre_commands)
-        whole_commands.extend(commands)
-
-        print('Running system command: {0}'.format(' '.join(whole_commands)))
-
-        ret_code = 0
-        ret_out = None
-        ret_err = None
-        whole_commands = " ".join(whole_commands)
-        for x in range(retry_times + 1):
-            proc = subprocess.Popen(
-                whole_commands,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                shell=True)
-            (output, errors) = proc.communicate()
-            ret_code, ret_out, ret_err = proc.returncode, output, errors
-            if ret_code:
-                _logger.warn(
-                    'Times: %d, Error running command: %s. Return code: %d, Error: %s'
-                    % (x, ' '.join(whole_commands), proc.returncode, errors))
-            else:
-                break
-        return ret_code, ret_out, ret_err
-
-    def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
-        """
-        upload the local file to hdfs
-
-        Args:
-            hdfs_path(str): the hdfs file path
-            local_path(str): the local file path
-            overwrite(bool|None): will overwrite the file on HDFS or not
-            retry_times(int|5): retry times
-
-        Returns:
-                True or False
-        """
-        assert hdfs_path is not None
-        assert local_path is not None and os.path.exists(local_path)
-
-        if os.path.isdir(local_path):
-            _logger.warn(
-                "The Local path: {} is dir and I will support it later, return".
-                format(local_path))
-            return False
-
-        base = os.path.basename(local_path)
-        if not self.is_exist(hdfs_path):
-            self.makedirs(hdfs_path)
-        else:
-            if self.is_exist(os.path.join(hdfs_path, base)):
-                if overwrite:
-                    _logger.error(
-                        "The HDFS path: {} is exist and overwrite is True, delete it".
-                        format(hdfs_path))
-                    self.delete(hdfs_path)
-                else:
-                    _logger.error(
-                        "The HDFS path: {} is exist and overwrite is False, return".
-                        format(hdfs_path))
-                    return False
-
-        put_commands = ["-put", local_path, hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(put_commands,
-                                                         retry_times)
-        if returncode:
-            _logger.error("Put local path: {} to HDFS path: {} failed".format(
-                local_path, hdfs_path))
-            return False
-        else:
-            _logger.info("Put local path: {} to HDFS path: {} successfully".
-                         format(local_path, hdfs_path))
-            return True
-
-    def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
-        """
-        download file from HDFS
-
-        Args:
-            hdfs_path(str): the hdfs file path
-            local_path(str): the local file path
-            overwrite(bool|None): will overwrite the file on HDFS or not
-            unzip(bool|False): if the download file is compressed by zip, unzip it or not.
-
-        Returns:
-            True or False
-        """
-        _logger.info('Downloading %r to %r.', hdfs_path, local_path)
-        _logger.info('Download of %s to %r complete.', hdfs_path, local_path)
-
-        if not self.is_exist(hdfs_path):
-            print("HDFS path: {} do not exist".format(hdfs_path))
-            return False
-        if self.is_dir(hdfs_path):
-            _logger.error(
-                "The HDFS path: {} is dir and I will support it later, return".
-                format(hdfs_path))
-
-        if os.path.exists(local_path):
-            base = os.path.basename(hdfs_path)
-            local_file = os.path.join(local_path, base)
-            if os.path.exists(local_file):
-                if overwrite:
-                    os.remove(local_file)
-                else:
-                    _logger.error(
-                        "The Local path: {} is exist and overwrite is False, return".
-                        format(local_file))
-                    return False
-
-        self.make_local_dirs(local_path)
-
-        download_commands = ["-get", hdfs_path, local_path]
-        returncode, output, errors = self.__run_hdfs_cmd(download_commands)
-        if returncode:
-            _logger.error("Get local path: {} from HDFS path: {} failed".format(
-                local_path, hdfs_path))
-            return False
-        else:
-            _logger.info("Get local path: {} from HDFS path: {} successfully".
-                         format(local_path, hdfs_path))
-            return True
-
-    def is_exist(self, hdfs_path=None):
-        """
-        whether the remote HDFS path exists
-
-        Args:
-            hdfs_path(str): the hdfs file path
-
-        Returns:
-            True or False
-        """
-        exist_cmd = ['-test', '-e', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            exist_cmd, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS is_exist HDFS path: {} failed".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS is_exist HDFS path: {} successfully".format(
-                hdfs_path))
-            return True
-
-    def is_dir(self, hdfs_path=None):
-        """
-        whether the remote HDFS path is directory
-
-        Args:
-            hdfs_path(str): the hdfs file path
-
-        Returns:
-            True or False
-        """
-
-        if not self.is_exist(hdfs_path):
-            return False
-
-        dir_cmd = ['-test', '-d', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS path: {} failed is not a directory".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS path: {} successfully is a directory".format(
-                hdfs_path))
-            return True
-
-    def delete(self, hdfs_path):
-        """
-        Remove a file or directory from HDFS.
-
-        whether the remote HDFS path exists
-
-        Args:
-        hdfs_path: HDFS path.
-
-        Returns:
-            True or False
-            This function returns `True` if the deletion was successful and `False` if
-            no file or directory previously existed at `hdfs_path`.
-        """
-        _logger.info('Deleting %r.', hdfs_path)
-
-        if not self.is_exist(hdfs_path):
-            _logger.warn("HDFS path: {} do not exist".format(hdfs_path))
-            return True
-
-        if self.is_dir(hdfs_path):
-            del_cmd = ['-rmr', hdfs_path]
-        else:
-            del_cmd = ['-rm', hdfs_path]
-
-        returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0)
-
-        if returncode:
-            _logger.error("HDFS path: {} delete files failure".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS path: {} delete files successfully".format(
-                hdfs_path))
-            return True
-
-    def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
-        """
-        Move a file or folder on HDFS.
-
-        Args:
-        hdfs_path(str): HDFS path.
-        overwrite(bool|False): If the path already exists and overwrite is False, will return False.
-
-        Returns:
-            True or False
-        """
-        assert hdfs_src_path is not None
-        assert hdfs_dst_path is not None
-
-        if not self.is_exist(hdfs_src_path):
-            _logger.info("HDFS path do not exist: {}".format(hdfs_src_path))
-        if self.is_exist(hdfs_dst_path) and not overwrite:
-            _logger.error("HDFS path is exist: {} and overwrite=False".format(
-                hdfs_dst_path))
-
-        rename_command = ['-mv', hdfs_src_path, hdfs_dst_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            rename_command, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS rename path: {} to {} failed".format(
-                hdfs_src_path, hdfs_dst_path))
-            return False
-        else:
-            _logger.info("HDFS rename path: {} to {} successfully".format(
-                hdfs_src_path, hdfs_dst_path))
-            return True
-
-    @staticmethod
-    def make_local_dirs(local_path):
-        """
-        create a directiory local, is same to mkdir
-        Args:
-            local_path: local path that wants to create a directiory.
-        """
-        try:
-            os.makedirs(local_path)
-        except OSError as e:
-            if e.errno != errno.EEXIST:
-                raise
-
-    def makedirs(self, hdfs_path):
-        """
-        Create a remote directory, recursively if necessary.
-
-        Args:
-        hdfs_path(str): Remote path. Intermediate directories will be created appropriately.
-
-        Returns:
-            True or False
-        """
-        _logger.info('Creating directories to %r.', hdfs_path)
-        assert hdfs_path is not None
-
-        if self.is_exist(hdfs_path):
-            _logger.error("HDFS path is exist: {}".format(hdfs_path))
-            return
-
-        mkdirs_commands = ['-mkdir', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            mkdirs_commands, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
-            return False
-        else:
-            _logger.error("HDFS mkdir path: {} successfully".format(hdfs_path))
-            return True
-
-    def ls(self, hdfs_path):
-        """
-        ls directory contents about HDFS hdfs_path
-
-        Args:
-        hdfs_path(str): Remote HDFS path will be ls.
-
-        Returns:
-            List: a contents list about hdfs_path.
-        """
-        assert hdfs_path is not None
-
-        if not self.is_exist(hdfs_path):
-            return []
-
-        ls_commands = ['-ls', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            ls_commands, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS list path: {} failed".format(hdfs_path))
-            return []
-        else:
-            _logger.info("HDFS list path: {} successfully".format(hdfs_path))
-
-            ret_lines = []
-            regex = re.compile('\s+')
-            out_lines = output.strip().split("\n")
-            for line in out_lines:
-                re_line = regex.split(line)
-                if len(re_line) == 8:
-                    ret_lines.append(re_line[7])
-            return ret_lines
-
-    def lsr(self, hdfs_path, only_file=True, sort=True):
-        """
-        list directory contents about HDFS hdfs_path recursively
-
-        Args:
-        hdfs_path(str): Remote HDFS path.
-        only_file(bool|True): will discard folders.
-        sort(bool|True): will be sorted by create time.
-
-        Returns:
-            List: a contents list about hdfs_path.
-        """
-
-        def sort_by_time(v1, v2):
-            v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M')
-            v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M')
-            return v1_time > v2_time
-
-        assert hdfs_path is not None
-
-        if not self.is_exist(hdfs_path):
-            return []
-
-        ls_commands = ['-lsr', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            ls_commands, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS list all files: {} failed".format(hdfs_path))
-            return []
-        else:
-            _logger.info("HDFS list all files: {} successfully".format(
-                hdfs_path))
-            lines = []
-            regex = re.compile('\s+')
-            out_lines = output.strip().split("\n")
-            for line in out_lines:
-                re_line = regex.split(line)
-                if len(re_line) == 8:
-                    if only_file and re_line[0][0] == "d":
-                        continue
-                    else:
-                        lines.append(
-                            (re_line[7], re_line[5] + " " + re_line[6]))
-            if sort:
-                sorted(lines, cmp=sort_by_time)
-            ret_lines = [ret[0] for ret in lines]
-            return ret_lines
-
-
-def multi_download(client,
-                   hdfs_path,
-                   local_path,
-                   trainer_id,
-                   trainers,
-                   multi_processes=5):
-    """
-    Download files from HDFS using multi process.
-
-    Args:
-        client(HDFSClient): instance of HDFSClient
-        hdfs_path(str): path on hdfs
-        local_path(str): path on local
-        trainer_id(int): current trainer id
-        trainers(int): all trainers number
-        multi_processes(int|5): the download data process at the same time, default=5
-
-    Returns:
-        List:
-        Download files in local folder.
-    """
-
-    def __subprocess_download(datas):
-        for data in datas:
-            re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
-            if re_path == os.curdir:
-                sub_local_re_path = local_path
-            else:
-                sub_local_re_path = os.path.join(local_path, re_path)
-            client.download(data, sub_local_re_path)
-
-    assert isinstance(client, HDFSClient)
-
-    client.make_local_dirs(local_path)
-    _logger.info("Make local dir {} successfully".format(local_path))
-
-    all_need_download = client.lsr(hdfs_path, sort=True)
-    need_download = all_need_download[trainer_id::trainers]
-    _logger.info("Get {} files From all {} files need to be download from {}".
-                 format(len(need_download), len(all_need_download), hdfs_path))
-
-    _logger.info("Start {} multi process to download datas".format(
-        multi_processes))
-    procs = []
-    for i in range(multi_processes):
-        process_datas = need_download[i::multi_processes]
-        p = multiprocessing.Process(
-            target=__subprocess_download, args=(process_datas, ))
-        procs.append(p)
-        p.start()
-
-    # complete the processes
-    for proc in procs:
-        proc.join()
-
-    _logger.info("Finish {} multi process to download datas".format(
-        multi_processes))
-
-    local_downloads = []
-    for data in need_download:
-        data_name = os.path.basename(data)
-        re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
-        if re_path == os.curdir:
-            local_re_path = os.path.join(local_path, data_name)
-        else:
-            local_re_path = os.path.join(local_path, re_path, data_name)
-        local_downloads.append(local_re_path)
-
-    return local_downloads
-
-
-def getfilelist(path):
-    rlist = []
-    for dir, folder, file in os.walk(path):
-        for i in file:
-            t = os.path.join(dir, i)
-            rlist.append(t)
-    for r in rlist:
-        print(r)
-
-
-def multi_upload(client,
-                 hdfs_path,
-                 local_path,
-                 multi_processes=5,
-                 overwrite=False,
-                 sync=True):
-    """
-    Upload files to HDFS using multi process.
-
-    Args:
-        client(HDFSClient): instance of HDFSClient
-        hdfs_path(str): path on hdfs
-        local_path(str): path on local
-        multi_processes(int|5): the upload data process at the same time, default=5
-        overwrite(bool|False): will overwrite file on HDFS or not
-        sync(bool|True): upload files sync or not.
-
-    Returns:
-        None
-    """
-
-    def __subprocess_upload(datas):
-        for data in datas:
-            re_path = os.path.relpath(os.path.dirname(data), local_path)
-            hdfs_re_path = os.path.join(hdfs_path, re_path)
-            client.upload(hdfs_re_path, data, overwrite, retry_times=5)
-
-    def get_local_files(path):
-        rlist = []
-
-        if not os.path.isdir(path):
-            return rlist
-
-        for dirname, folder, files in os.walk(path):
-            for i in files:
-                t = os.path.join(dirname, i)
-                rlist.append(t)
-        return rlist
-
-    assert isinstance(client, HDFSClient)
-
-    all_files = get_local_files(local_path)
-    if not all_files:
-        _logger.info("there are nothing need to upload, exit")
-        return
-    _logger.info("Start {} multi process to upload datas".format(
-        multi_processes))
-    procs = []
-    for i in range(multi_processes):
-        process_datas = all_files[i::multi_processes]
-        p = multiprocessing.Process(
-            target=__subprocess_upload, args=(process_datas, ))
-        procs.append(p)
-        p.start()
-
-    # complete the processes
-    for proc in procs:
-        proc.join()
-
-    _logger.info("Finish {} multi process to upload datas".format(
-        multi_processes))
-
-
-if __name__ == "__main__":
-    hadoop_home = "/home/client/hadoop-client/hadoop/"
-
-    configs = {
-        "fs.default.name": "hdfs://xxx.hadoop.com:54310",
-        "hadoop.job.ugi": "hello,hello123"
-    }
-
-    client = HDFSClient(hadoop_home, configs)
-
-    client.ls("/user/com/train-25")
-    files = client.lsr("/user/com/train-25/models")
-
-    downloads = multi_download(
-        client,
-        "/user/com/train-25/model",
-        "/home/xx/data1",
-        1,
-        5,
-        100,
-        multi_processes=5)
-
-    multi_upload(client, "/user/com/train-25/model", "/home/xx/data1")
diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
deleted file mode 100644
index 2d18a9a8620e210d5b0f6fbb90c3b59e31ac8086..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py
+++ /dev/null
@@ -1,496 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""lookup_table_utils.py will move to fluid/incubate/fleet/utils/lookup_table.py"""
-
-from __future__ import print_function
-
-import os
-import time
-import logging
-
-import paddle
-from paddle.fluid import core
-from paddle.fluid import io
-from paddle.fluid import Program
-from paddle.fluid.log_helper import get_logger
-
-__all__ = [
-    "load_persistables_for_increment", "load_persistables_for_inference",
-    "convert_dist_to_sparse_program"
-]
-
-_logger = get_logger(
-    'lookup_table_utils',
-    logging.INFO,
-    fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-model_filename = "__model__"
-lookup_table_dir = "__lookup_table__"
-
-
-def __insert_lookup_sparse_table_op(main_program, idx, ids, w, out):
-    main_program.global_block()._insert_op(
-        index=idx,
-        type="lookup_sparse_table",
-        inputs={"Ids": [ids],
-                "W": [w]},
-        outputs={"Out": [out]},
-        attrs={
-            "is_distributed": False,
-            "is_sparse": True,
-            "grad_inplace": False
-        })
-
-
-def __get_prefetch_op_tuples(main_program):
-    # current lookup tables op is split_ids->prefetch->merge_ids
-    prefetch_op_tuples = None
-    op_types = [op.type for op in main_program.global_block().ops]
-
-    for i in range(len(op_types)):
-        if op_types[i] == "prefetch":
-            if op_types[i - 1] == "split_ids" and op_types[i +
-                                                           1] == "merge_ids":
-                split_ids_op_id = i - 1
-                split_ids_inputs = main_program.global_block().ops[i - 1].input(
-                    "Ids")
-                prefetch_op_inputs = main_program.global_block().ops[i].input(
-                    "X")
-                prefetch_op_outputs = main_program.global_block().ops[i].output(
-                    "Out")
-                merge_ids_outputs = main_program.global_block().ops[
-                    i + 1].output("Out")
-
-                need_delete_vars = []
-                need_delete_vars.extend(prefetch_op_inputs)
-                need_delete_vars.extend(prefetch_op_outputs)
-
-                prefetch_op_tuples = (split_ids_op_id, split_ids_inputs,
-                                      merge_ids_outputs, need_delete_vars)
-                break
-    return prefetch_op_tuples
-
-
-def convert_dist_to_sparse_program(program):
-    """
-    WARNING: this function will only be used for distributed training with distributed lookup table.
-    when we train model with distributed lookup table but want to do the local inference, we can use
-    this function to convert the train program with distributed lookup table to sparse lookup table.
-
-    Args:
-        program(Program): the program must be the trainer program, which will be get by the distribute transpiler.
-    Returns:
-        program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table.
-    """
-    if not program._distributed_lookup_table:
-        _logger.warn(
-            "There are no distributed lookup tables need to be converted")
-        return
-
-    # create table param and grad var in pserver program
-    origin_emb_var = "{}.origin".format(program._distributed_lookup_table)
-    emb_var = program._distributed_lookup_table
-    program.global_block()._rename_var(emb_var, origin_emb_var)
-    origin_param_var = program.global_block().vars[origin_emb_var]
-
-    param_var = program.global_block().create_var(
-        name=emb_var,
-        shape=origin_param_var.shape,
-        dtype=origin_param_var.dtype,
-        type=core.VarDesc.VarType.SELECTED_ROWS,
-        persistable=True)
-    # parameter must be selected rows
-    param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
-    program._sync_with_cpp()
-
-    prefetch_op_tuples = __get_prefetch_op_tuples(program)
-
-    split_ids_id = prefetch_op_tuples[0]
-
-    for idx in range(split_ids_id + 2, split_ids_id - 1, -1):
-        program.global_block()._remove_op(idx)
-    program.desc.flush()
-
-    in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2])
-
-    for in_out_pair in in_out_pairs:
-        idx = split_ids_id
-        ids = program.global_block().vars[in_out_pair[0]]
-        out = program.global_block().vars[in_out_pair[1]]
-        __insert_lookup_sparse_table_op(program, idx, ids, param_var, out)
-        program.desc.flush()
-    return program
-
-
-def load_persistables_for_increment(dirname, executor, program,
-                                    lookup_table_var, lookup_table_var_path):
-    """
-    WARNING: this function will only be used for distributed training with distributed lookup table.
-    for increment trainning, the pserver will not only load dense variables,
-    but also load the suitable lookup table var. Because of sliced lookup table
-    var with HASH, we must load the correct sliced var.
-
-    Args:
-        dirname(str): The directory path
-        executor(Executor): The executor to run for loading inference model.
-        program(Program): The parameter server program, which will run on Pserver.
-        lookup_table_var: the distributed lookup tables var name.
-        lookup_table_var_path: the the distributed lookup tables var location.
-
-    Returns:
-        None
-    """
-
-    def _load_persistable_vars(executor, dirname, need_load_vars):
-        load_prog = Program()
-        load_block = load_prog.global_block()
-        need_delete_vars = []
-
-        for param in need_load_vars:
-            origin_var = param.origin
-            slice_var = param.slice
-            is_slice = param.is_slice
-            offset = param.offset
-
-            if is_slice:
-                origin = load_block.create_var(
-                    name="{}.load".format(origin_var.name),
-                    type=origin_var.type,
-                    shape=origin_var.shape,
-                    dtype=origin_var.dtype,
-                    persistable=True)
-
-                load_block.append_op(
-                    type='load',
-                    inputs={},
-                    outputs={'Out': [origin]},
-                    attrs={
-                        'file_path': os.path.join(dirname, origin_var.name)
-                    })
-
-                slice = load_block.create_var(
-                    name=slice_var.name,
-                    type=slice_var.type,
-                    shape=slice_var.shape,
-                    dtype=slice_var.dtype,
-                    persistable=True)
-
-                dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
-                start = int(offset / dim1_flatten)
-                end = int(offset / dim1_flatten + slice.shape[0])
-
-                load_block.append_op(
-                    type="slice",
-                    inputs={'Input': origin},
-                    outputs={'Out': slice},
-                    attrs={'axes': [0],
-                           'starts': [start],
-                           'ends': [end]})
-
-                need_delete_vars.append(origin)
-            else:
-                origin = load_block.create_var(
-                    name="{}".format(origin_var.name),
-                    type=origin_var.type,
-                    shape=origin_var.shape,
-                    dtype=origin_var.dtype,
-                    persistable=True)
-                load_block.append_op(
-                    type='load',
-                    inputs={},
-                    outputs={'Out': [origin]},
-                    attrs={
-                        'file_path': os.path.join(dirname, origin_var.name)
-                    })
-
-        load_block.append_op(
-            type='delete_var',
-            inputs={'X': need_delete_vars}, )
-
-        executor.run(load_prog)
-
-    def __load_lookup_table_vars(executor, main_program, lookup_table_var,
-                                 lookup_table_var_path):
-        emb_var = main_program.global_block().var(lookup_table_var)
-
-        load_program = Program()
-        load_block = load_program.global_block()
-        load_block.append_op(
-            type='load',
-            inputs={},
-            outputs={'Out': [emb_var]},
-            attrs={'file_path': lookup_table_var_path})
-        executor.run(load_program)
-
-    if not os.path.isdir(dirname):
-        raise ValueError("There is no directory named '%s'", dirname)
-
-    if not os.path.exists(lookup_table_var_path):
-        raise ValueError("There is no file named '%s'", lookup_table_var_path)
-
-    if not isinstance(program, Program):
-        raise ValueError("program must be an instance of fluid.Program")
-
-    _logger.info("Start Load Sparse Program With "
-                 "Distributed Lookup Table Vars from {}, time = {}".format(
-                     dirname, time.ctime()))
-
-    need_load_vars = program._parameters_on_pservers.get_distributed_vars_by_ep(
-        program._ps_endpoint)
-    _load_persistable_vars(executor, dirname, need_load_vars)
-    __load_lookup_table_vars(executor, program, lookup_table_var,
-                             lookup_table_var_path)
-
-    _logger.info("Finish Load Sparse Program With "
-                 "Distributed Lookup Table Vars from {}, time = {}".format(
-                     dirname, time.ctime()))
-
-
-def load_persistables_for_inference(dirname, executor, program,
-                                    lookup_table_var_name):
-    """
-    WARNING: this function will only be used for inference with distributed lookup table.
-    Inference with distributed lookup table is a little funky, this function will load distributed
-    lookup table vars into sparse var, can be used in local inference mode.
-
-    Args:
-        dirname(str): The directory path
-        executor(Executor): The executor to run for loading inference model.
-        program(Program): The parameter server program, which will run on Pserver.
-        lookup_table_var_name: the distributed lookup tables var name.
-    Returns:
-        None
-    """
-
-    def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
-        def _is_checkpoint_var(exclude_fluid_vars=None):
-            """
-            the checkpoint will not save or load all the variables.
-            var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
-
-            : param var(Variable)
-            """
-
-            if exclude_fluid_vars is None:
-                exclude_fluid_vars = []
-
-            def is_valid(var):
-                if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                        var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                        var.desc.type() == core.VarDesc.VarType.RAW:
-                    return False
-                # @GRAD are named for gradient variables, checkpoint will not save it.
-                if "@GRAD" in var.name:
-                    return False
-                # .trainer_ are named for distribute train variables, checkpoint will not save it.
-                if ".trainer_" in var.name:
-                    return False
-
-                # .block is named for distribute train variables, checkpoint will not save it.
-                if ".block" in var.name:
-                    return False
-
-                if "tmp_" in var.name:
-                    return False
-
-                if var.name in exclude_fluid_vars:
-                    return False
-
-                return var.persistable
-
-            return is_valid
-
-        io.load_vars(
-            executor,
-            dirname=dirname,
-            main_program=program,
-            predicate=_is_checkpoint_var(lookup_table_vars),
-            filename=None)
-
-    def _load_lookup_table_vars(executor, dirname, main_program,
-                                lookup_table_vars):
-        if not os.path.isdir(dirname):
-            raise ValueError("There is no directory named '%s'", dirname)
-
-        lookup_table_dirname = os.path.join(dirname, lookup_table_dir)
-
-        emb_var_name = lookup_table_vars[0]
-        emb_var = main_program.global_block().var(emb_var_name)
-
-        emb_files = []
-        for emb_name in os.listdir(lookup_table_dirname):
-            if emb_var_name in emb_name:
-                emb_files.append(emb_name)
-
-        convert_program = Program()
-        global_block = convert_program.global_block()
-
-        emb_var = global_block.create_var(
-            name=emb_var.name,
-            shape=emb_var.shape,
-            dtype=emb_var.dtype,
-            type=core.VarDesc.VarType.SELECTED_ROWS,
-            persistable=True)
-        emb_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
-
-        sums = []
-
-        for i, emb_file in enumerate(emb_files):
-            var_name = "{}_{}".format(emb_var.name, i)
-            param_var = global_block.create_var(
-                name=var_name,
-                shape=emb_var.shape,
-                dtype=emb_var.dtype,
-                type=core.VarDesc.VarType.SELECTED_ROWS,
-                persistable=True)
-            param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
-            global_block.append_op(
-                type='load',
-                inputs={},
-                outputs={'Out': [param_var]},
-                attrs={
-                    'file_path': os.path.join(lookup_table_dirname, var_name)
-                })
-            sums.append(param_var)
-        global_block.append_op(
-            type='merge_sparse_lookup_table',
-            inputs={"X": sums},
-            outputs={'Out': emb_var},
-            attrs={})
-        global_block.append_op(
-            type='save',
-            inputs={"X": [emb_var]},
-            outputs={},
-            attrs={
-                'file_path': os.path.join(lookup_table_dirname, emb_var.name)
-            })
-        global_block.append_op(type='delete_var', inputs={'X': sums})
-        executor.run(convert_program)
-
-    if not os.path.isdir(dirname):
-        raise ValueError("There is no directory named '%s'", dirname)
-
-    if program:
-        if not isinstance(program, Program):
-            raise ValueError("program must be an instance of fluid.Program")
-    else:
-        local_model = os.path.join(dirname, model_filename)
-
-        with open(local_model, "rb") as f:
-            program_desc_str = f.read()
-
-        program = Program.parse_from_string(program_desc_str)
-
-        if not core._is_program_version_supported(program._version()):
-            raise ValueError("Unsupported program version: %d\n" %
-                             program._version())
-
-    _logger.info("Start Load Sparse Program With "
-                 "Distributed Lookup Table Vars from {}, time = {}".format(
-                     dirname, time.ctime()))
-
-    _load_persistable_vars(executor, dirname, program, [lookup_table_var_name])
-    _load_lookup_table_vars(executor, dirname, program, [lookup_table_var_name])
-
-    _logger.info("Finish Load Sparse Program With "
-                 "Distributed Lookup Table Vars from {}, time = {}".format(
-                     dirname, time.ctime()))
-
-    return program
-
-
-def get_inference_model(main_program, feeded_var_names, target_vars):
-    """
-    Prune the given `main_program` to build a new program especially for inference with distributed lookup table ,
-    and then add `feeded_vars` and `target_vars` in this program.
-
-    Args:
-        main_program(Program|None): The original program, which will be pruned to
-                                    build the inference model. If is setted None,
-                                    the default main program will be used.
-                                    Default: None.
-        feeded_var_names(list[str]): Names of variables that need to be feeded data
-                                     during inference.
-        target_vars(list[Variable]): Variables from which we can get inference
-                                     results.
-    Returns:
-        program(Program)
-
-    Raises:
-        ValueError: If `feed_var_names` is not a list of basestring.
-        ValueError: If `target_vars` is not a list of Variable.
-
-    """
-
-    def prepend_feed_ops(inference_program,
-                         feed_target_names,
-                         feed_holder_name='feed'):
-        if len(feed_target_names) == 0:
-            return
-
-        global_block = inference_program.global_block()
-
-        feed_var = global_block.create_var(
-            name=feed_holder_name,
-            type=core.VarDesc.VarType.FEED_MINIBATCH,
-            persistable=True)
-
-        for i, name in enumerate(feed_target_names):
-            out = global_block.var(name)
-            global_block._prepend_op(
-                type='feed',
-                inputs={'X': [feed_var]},
-                outputs={'Out': [out]},
-                attrs={'col': i})
-
-    def append_fetch_ops(inference_program,
-                         fetch_target_names,
-                         fetch_holder_name='fetch'):
-        global_block = inference_program.global_block()
-        fetch_var = global_block.create_var(
-            name=fetch_holder_name,
-            type=core.VarDesc.VarType.FETCH_LIST,
-            persistable=True)
-
-        for i, name in enumerate(fetch_target_names):
-            global_block.append_op(
-                type='fetch',
-                inputs={'X': [name]},
-                outputs={'Out': [fetch_var]},
-                attrs={'col': i})
-
-    origin_program = main_program.clone()
-    main_program = main_program.clone()
-    global_block = main_program.global_block()
-
-    need_to_remove_op_index = []
-    for i, op in enumerate(global_block.ops):
-        op.desc.set_is_target(False)
-        if op.type == "feed" or op.type == "fetch":
-            need_to_remove_op_index.append(i)
-
-    for index in need_to_remove_op_index[::-1]:
-        global_block._remove_op(index)
-
-    main_program.desc.flush()
-
-    main_program = main_program._prune(targets=target_vars)
-    main_program = main_program._inference_optimize(prune_read_op=True)
-
-    fetch_var_names = [v.name for v in target_vars]
-
-    prepend_feed_ops(main_program, feeded_var_names)
-    append_fetch_ops(main_program, fetch_var_names)
-
-    return main_program
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
deleted file mode 100644
index accffc17448a4f6f7c4b630d2149fcfed7c5135f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/core.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import site
-import sys
-import os
-
-core_suffix = 'so'
-if os.name == 'nt':
-    core_suffix = 'pyd'
-
-has_avx_core = False
-has_noavx_core = False
-
-current_path = os.path.abspath(os.path.dirname(__file__))
-if os.path.exists(current_path + os.sep + 'core_avx.' + core_suffix):
-    has_avx_core = True
-
-if os.path.exists(current_path + os.sep + 'core_noavx.' + core_suffix):
-    has_noavx_core = True
-
-try:
-    if os.name == 'nt':
-        third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
-        os.environ['path'] = third_lib_path + ';' + os.environ['path']
-        sys.path.insert(0, third_lib_path)
-
-except ImportError as e:
-    from .. import compat as cpt
-    if os.name == 'nt':
-        executable_path = os.path.abspath(os.path.dirname(sys.executable))
-        raise ImportError(
-            """NOTE: You may need to run \"set PATH=%s;%%PATH%%\"
-        if you encounters \"DLL load failed\" errors. If you have python
-        installed in other directory, replace \"%s\" with your own
-        directory. The original error is: \n %s""" %
-            (executable_path, executable_path, cpt.get_exception_message(e)))
-    else:
-        raise ImportError(
-            """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
-        if you encounters \"libmkldnn.so not found\" errors. If you have python
-        installed in other directory, replace \"/usr/local/lib\" with your own
-        directory. The original error is: \n""" + cpt.get_exception_message(e))
-except Exception as e:
-    raise e
-
-
-def avx_supported():
-    """
-    Whether current system(Linux, MacOS, Windows) is supported with AVX.
-    """
-    import platform
-    from .. import compat as cpt
-    sysstr = platform.system().lower()
-    has_avx = False
-    if sysstr == 'linux':
-        try:
-            has_avx = os.popen('cat /proc/cpuinfo | grep -i avx').read() != ''
-        except Exception as e:
-            sys.stderr.write('Can not get the AVX flag from /proc/cpuinfo.\n'
-                             'The original error is: %s\n' %
-                             cpt.get_exception_message(e))
-        return has_avx
-    elif sysstr == 'darwin':
-        try:
-            has_avx = os.popen(
-                'sysctl machdep.cpu.features | grep -i avx').read() != ''
-        except Exception as e:
-            sys.stderr.write(
-                'Can not get the AVX flag from machdep.cpu.features.\n'
-                'The original error is: %s\n' % cpt.get_exception_message(e))
-        if not has_avx:
-            try:
-                has_avx = os.popen(
-                    'sysctl machdep.cpu.leaf7_features | grep -i avx').read(
-                    ) != ''
-            except Exception as e:
-                sys.stderr.write(
-                    'Can not get the AVX flag from machdep.cpu.leaf7_features.\n'
-                    'The original error is: %s\n' %
-                    cpt.get_exception_message(e))
-        return has_avx
-    elif sysstr == 'windows':
-        import ctypes
-        ONE_PAGE = ctypes.c_size_t(0x1000)
-
-        def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
-            # Call the code_str as a function
-            # Alloc 1 page to ensure the protection
-            pfnVirtualAlloc = ctypes.windll.kernel32.VirtualAlloc
-            pfnVirtualAlloc.restype = ctypes.c_void_p
-            MEM_COMMIT = ctypes.c_ulong(0x1000)
-            PAGE_READWRITE = ctypes.c_ulong(0x4)
-            address = pfnVirtualAlloc(None, ONE_PAGE, MEM_COMMIT,
-                                      PAGE_READWRITE)
-            if not address:
-                raise Exception("Failed to VirtualAlloc")
-
-            # Copy the code into the memory segment
-            memmove = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p,
-                                       ctypes.c_void_p,
-                                       ctypes.c_size_t)(ctypes._memmove_addr)
-            if memmove(address, code_str, len(code_str)) < 0:
-                raise Exception("Failed to memmove")
-
-            # Enable execute permissions
-            PAGE_EXECUTE = ctypes.c_ulong(0x10)
-            pfnVirtualProtect = ctypes.windll.kernel32.VirtualProtect
-            res = pfnVirtualProtect(
-                ctypes.c_void_p(address), ONE_PAGE, PAGE_EXECUTE,
-                ctypes.byref(ctypes.c_ulong(0)))
-            if not res:
-                raise Exception("Failed VirtualProtect")
-
-            # Flush instruction cache
-            pfnGetCurrentProcess = ctypes.windll.kernel32.GetCurrentProcess
-            pfnGetCurrentProcess.restype = ctypes.c_void_p
-            prochandle = ctypes.c_void_p(pfnGetCurrentProcess())
-            res = ctypes.windll.kernel32.FlushInstructionCache(
-                prochandle, ctypes.c_void_p(address), ONE_PAGE)
-            if not res:
-                raise Exception("Failed FlushInstructionCache")
-
-            # Cast the memory to function
-            functype = ctypes.CFUNCTYPE(restype, *argtypes)
-            func = functype(address)
-            return func, address
-
-        # http://en.wikipedia.org/wiki/CPUID#EAX.3D1:_Processor_Info_and_Feature_Bits
-        # mov eax,0x1; cpuid; mov cx, ax; ret
-        code_str = b"\xB8\x01\x00\x00\x00\x0f\xa2\x89\xC8\xC3"
-        avx_bit = 28
-        retval = 0
-        try:
-            # Convert the code_str into a function that returns uint
-            func, address = asm_func(code_str)
-            retval = func()
-            ctypes.windll.kernel32.VirtualFree(
-                ctypes.c_void_p(address), ctypes.c_size_t(0), ONE_PAGE)
-        except Exception as e:
-            sys.stderr.write('Failed getting the AVX flag on Windows.\n'
-                             'The original error is: %s\n' %
-                             cpt.get_exception_message(e))
-        return (retval & (1 << avx_bit)) > 0
-    else:
-        sys.stderr.write('Do not get AVX flag on %s\n' % sysstr)
-        return False
-
-
-load_noavx = False
-
-if avx_supported():
-    try:
-        from .core_avx import *
-        from .core_avx import __doc__, __file__, __name__, __package__
-        from .core_avx import __unittest_throw_exception__
-        from .core_avx import _append_python_callable_object_and_return_id
-        from .core_avx import _cleanup, _Scope
-        from .core_avx import _get_use_default_grad_op_desc_maker_ops
-        from .core_avx import _is_program_version_supported
-        from .core_avx import _set_eager_deletion_mode
-        from .core_avx import _set_fuse_parameter_group_size
-        from .core_avx import _set_fuse_parameter_memory_size
-        from .core_avx import _is_dygraph_debug_enabled
-        from .core_avx import _dygraph_debug_level
-        from .core_avx import _set_paddle_lib_path
-    except Exception as e:
-        if has_avx_core:
-            raise e
-        else:
-            from .. import compat as cpt
-            sys.stderr.write(
-                'WARNING: Do not have avx core. You may not build with AVX, '
-                'but AVX is supported on local machine.\n You could build paddle '
-                'WITH_AVX=ON to get better performance.\n'
-                'The original error is: %s\n' % cpt.get_exception_message(e))
-            load_noavx = True
-else:
-    load_noavx = True
-
-if load_noavx:
-    try:
-        from .core_noavx import *
-        from .core_noavx import __doc__, __file__, __name__, __package__
-        from .core_noavx import __unittest_throw_exception__
-        from .core_noavx import _append_python_callable_object_and_return_id
-        from .core_noavx import _cleanup, _Scope
-        from .core_noavx import _get_use_default_grad_op_desc_maker_ops
-        from .core_noavx import _is_program_version_supported
-        from .core_noavx import _set_eager_deletion_mode
-        from .core_noavx import _set_fuse_parameter_group_size
-        from .core_noavx import _set_fuse_parameter_memory_size
-        from .core_noavx import _is_dygraph_debug_enabled
-        from .core_noavx import _dygraph_debug_level
-        from .core_noavx import _set_paddle_lib_path
-    except Exception as e:
-        if has_noavx_core:
-            sys.stderr.write(
-                'Error: Can not import noavx core while this file exists ' +
-                current_path + os.sep + 'core_noavx.' + core_suffix + '\n')
-        raise e
-
-
-# set paddle lib path
-def set_paddle_lib_path():
-    site_dirs = site.getsitepackages() if hasattr(
-        site,
-        'getsitepackages') else [x for x in sys.path if 'site-packages' in x]
-    for site_dir in site_dirs:
-        lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
-        if os.path.exists(lib_dir):
-            _set_paddle_lib_path(lib_dir)
-            return
-    if hasattr(site, 'USER_SITE'):
-        lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
-        if os.path.exists(lib_dir):
-            _set_paddle_lib_path(lib_dir)
-
-
-set_paddle_lib_path()
diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py
deleted file mode 100644
index fa4e35a44c46c7d01b939b1790e33fd3c829cc11..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/data_feed_desc.py
+++ /dev/null
@@ -1,250 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.fluid.proto import data_feed_pb2
-from google.protobuf import text_format
-
-__all__ = ['DataFeedDesc']
-
-
-class DataFeedDesc(object):
-    """
-    Datafeed descriptor, describing input training data format. This class is
-    currently only used for AsyncExecutor (See comments for class AsyncExecutor
-    for a brief introduction)
-
-    DataFeedDesc shall be initialized from a valid protobuf message from disk.
-
-    See :code:`paddle/fluid/framework/data_feed.proto` for message definition.
-    A typical message might look like:
-
-    .. code-block:: python
-
-      import paddle.fluid as fluid
-      f = open("data.proto", "w")
-      print >> f, 'name: "MultiSlotDataFeed"'
-      print >> f, 'batch_size: 2'
-      print >> f, 'multi_slot_desc {'
-      print >> f, '    slots {'
-      print >> f, '         name: "words"'
-      print >> f, '         type: "uint64"'
-      print >> f, '         is_dense: false'
-      print >> f, '         is_used: true'
-      print >> f, '     }'
-      print >> f, '     slots {'
-      print >> f, '         name: "label"'
-      print >> f, '         type: "uint64"'
-      print >> f, '         is_dense: false'
-      print >> f, '         is_used: true'
-      print >> f, '    }'
-      print >> f, '}'
-      f.close()
-      data_feed = fluid.DataFeedDesc('data.proto')
-
-    However, users usually shouldn't care about the message format; instead,
-    they are encouragd to use :code:`Data Generator` as a tool to generate a
-    valid data description, in the process of converting their raw log files to
-    training files acceptable to AsyncExecutor.
-
-    DataFeedDesc can also be changed during runtime. Once you got familiar with
-    what each field mean, you can modify it to better suit your need. E.g.:
-
-    .. code-block:: python
-
-      import paddle.fluid as fluid
-      data_feed = fluid.DataFeedDesc('data.proto')
-      data_feed.set_batch_size(128)
-      data_feed.set_dense_slots('wd')  # The slot named 'wd' will be dense
-      data_feed.set_use_slots('wd')    # The slot named 'wd' will be used
-
-    Finally, the content can be dumped out for debugging purpose:
-
-    .. code-block:: python
-
-      print(data_feed.desc())
-
-    Args:
-        proto_file(string): Disk file containing a data feed description.
-
-    """
-
-    def __init__(self, proto_file):
-        self.proto_desc = data_feed_pb2.DataFeedDesc()
-        self.proto_desc.pipe_command = "cat"
-        with open(proto_file, 'r') as f:
-            text_format.Parse(f.read(), self.proto_desc)
-        if self.proto_desc.name == "MultiSlotDataFeed":
-            self.__name_to_index = {
-                slot.name: i
-                for i, slot in enumerate(self.proto_desc.multi_slot_desc.slots)
-            }
-
-    def set_batch_size(self, batch_size):
-        """
-        Set batch size. Will be effective during training
-
-        Example:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              f = open("data.proto", "w")
-              print >> f, 'name: "MultiSlotDataFeed"'
-              print >> f, 'batch_size: 2'
-              print >> f, 'multi_slot_desc {'
-              print >> f, '    slots {'
-              print >> f, '         name: "words"'
-              print >> f, '         type: "uint64"'
-              print >> f, '         is_dense: false'
-              print >> f, '         is_used: true'
-              print >> f, '     }'
-              print >> f, '     slots {'
-              print >> f, '         name: "label"'
-              print >> f, '         type: "uint64"'
-              print >> f, '         is_dense: false'
-              print >> f, '         is_used: true'
-              print >> f, '    }'
-              print >> f, '}'
-              f.close()
-              data_feed = fluid.DataFeedDesc('data.proto')
-              data_feed.set_batch_size(128)
-
-        Args:
-            batch_size: batch size
-
-        """
-        self.proto_desc.batch_size = batch_size
-
-    def set_dense_slots(self, dense_slots_name):
-        """
-        Set if a specific slot will be dense. Will be effective during training.
-        features for a dense slot will be fed into a Tensor, while those for a
-        sparse slot will be fed into a LoDTensor
-
-        Example:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              f = open("data.proto", "w")
-              print >> f, 'name: "MultiSlotDataFeed"'
-              print >> f, 'batch_size: 2'
-              print >> f, 'multi_slot_desc {'
-              print >> f, '    slots {'
-              print >> f, '         name: "words"'
-              print >> f, '         type: "uint64"'
-              print >> f, '         is_dense: false'
-              print >> f, '         is_used: true'
-              print >> f, '     }'
-              print >> f, '     slots {'
-              print >> f, '         name: "label"'
-              print >> f, '         type: "uint64"'
-              print >> f, '         is_dense: false'
-              print >> f, '         is_used: true'
-              print >> f, '    }'
-              print >> f, '}'
-              f.close()
-              data_feed = fluid.DataFeedDesc('data.proto')
-              data_feed.set_dense_slots(['words'])
-
-        Args:
-            dense_slots_name: a list of slot names which will be set dense
-
-        Note:
-            Default is sparse for all slots
-        """
-        if self.proto_desc.name != "MultiSlotDataFeed":
-            raise ValueError(
-                "Only MultiSlotDataFeed needs set_dense_slots, please check your datafeed.proto"
-            )
-        for name in dense_slots_name:
-            self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
-                name]].is_dense = True
-
-    def set_use_slots(self, use_slots_name):
-        """
-        Set if a specific slot will be used for training. A dataset shall
-        contain a lot of features, through this function one can select which
-        ones will be used for a specific model.
-
-        Example:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              f = open("data.proto", "w")
-              print >> f, 'name: "MultiSlotDataFeed"'
-              print >> f, 'batch_size: 2'
-              print >> f, 'multi_slot_desc {'
-              print >> f, '    slots {'
-              print >> f, '         name: "words"'
-              print >> f, '         type: "uint64"'
-              print >> f, '         is_dense: false'
-              print >> f, '         is_used: true'
-              print >> f, '     }'
-              print >> f, '     slots {'
-              print >> f, '         name: "label"'
-              print >> f, '         type: "uint64"'
-              print >> f, '         is_dense: false'
-              print >> f, '         is_used: true'
-              print >> f, '    }'
-              print >> f, '}'
-              f.close()
-              data_feed = fluid.DataFeedDesc('data.proto')
-              data_feed.set_use_slots(['words'])
-
-        Args:
-            use_slots_name: a list of slot names which will be used in training
-
-        Note:
-            Default is not used for all slots
-        """
-        if self.proto_desc.name != "MultiSlotDataFeed":
-            raise ValueError(
-                "Only MultiSlotDataFeed needs set_use_slots, please check your datafeed.proto"
-            )
-        for name in use_slots_name:
-            self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
-                name]].is_used = True
-
-    def desc(self):
-        """
-        Returns a protobuf message for this DataFeedDesc
-
-        Example:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              f = open("data.proto", "w")
-              print >> f, 'name: "MultiSlotDataFeed"'
-              print >> f, 'batch_size: 2'
-              print >> f, 'multi_slot_desc {'
-              print >> f, '    slots {'
-              print >> f, '         name: "words"'
-              print >> f, '         type: "uint64"'
-              print >> f, '         is_dense: false'
-              print >> f, '         is_used: true'
-              print >> f, '     }'
-              print >> f, '     slots {'
-              print >> f, '         name: "label"'
-              print >> f, '         type: "uint64"'
-              print >> f, '         is_dense: false'
-              print >> f, '         is_used: true'
-              print >> f, '    }'
-              print >> f, '}'
-              f.close()
-              data_feed = fluid.DataFeedDesc('data.proto')
-              print(data_feed.desc())
-
-        Returns:
-            A string message
-        """
-        return text_format.MessageToString(self.proto_desc)
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
deleted file mode 100644
index 3f9c69f120e4f7cfaf1350d78f5283349d37bc2a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/data_feeder.py
+++ /dev/null
@@ -1,500 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import core
-import numpy
-import os
-import six
-from six.moves import zip, range, xrange
-import multiprocessing
-
-from .framework import Variable, default_main_program, _current_expected_place
-from .framework import _cpu_num, _cuda_ids
-__all__ = ['DataFeeder']
-
-
-def convert_dtype(dtype):
-    if dtype == core.VarDesc.VarType.FP32:
-        return 'float32'
-    elif dtype == core.VarDesc.VarType.INT64:
-        return 'int64'
-    elif dtype == core.VarDesc.VarType.FP64:
-        return 'float64'
-    elif dtype == core.VarDesc.VarType.FP16:
-        return 'float16'
-    elif dtype == core.VarDesc.VarType.INT32:
-        return 'int32'
-    elif dtype == core.VarDesc.VarType.UINT8:
-        return 'uint8'
-    else:
-        raise ValueError("dtype must be any of [int32, float32, int64, "
-                         "float64, uint8]")
-
-
-class DataToLoDTensorConverter(object):
-    def __init__(self, place, lod_level, shape, dtype):
-        self.place = place
-        self.lod_level = lod_level
-        self.shape = shape
-        negtive_count = 0
-        for s in self.shape:
-            if s < 0:
-                negtive_count += 1
-            if negtive_count > 1:
-                self.shape = None
-                break
-        self.dtype = convert_dtype(dtype)
-        self._reset()
-
-    def _reset(self):
-        self.data = []
-        self.lod = [[] for _ in six.moves.range(self.lod_level)]
-
-    def feed(self, data):
-        self._feed_impl_(data, self.lod, self.lod_level)
-
-    def _feed_impl_(self, data, lod, lod_level):
-        if lod_level == 0:
-            self.data.append(data)
-        else:
-            lod[0].append(len(data))
-            for each_data in data:
-                self._feed_impl_(each_data, lod[1:], lod_level - 1)
-
-    def _check_shape(self, shape):
-        for s1, s2 in zip(self.shape, shape):
-            if s1 != s2 and s1 >= 0 and s2 >= 0:
-                raise ValueError(
-                    "Shape not match. What is defined in data layer is {}, but receive {}".
-                    format(self.shape, shape))
-
-    def done(self):
-        arr = numpy.array(self.data, dtype=self.dtype)
-        if self.shape:
-            if len(arr.shape) != len(self.shape):
-                try:
-                    arr = arr.reshape(self.shape)
-                except ValueError:
-                    raise ValueError(
-                        "Reshape error. What is defined in data layer is {}, but receive {}"
-                        .format(self.shape, arr.shape))
-        t = core.LoDTensor()
-        t.set(arr, self.place)
-        if self.lod_level > 0:
-            t.set_recursive_sequence_lengths(self.lod)
-        self._reset()
-        return t
-
-
-class BatchedTensorProvider(object):
-    def __init__(self, feed_list, place, batch_size, generator, drop_last):
-        self.place = place
-        self.batch_size = batch_size
-        self.generator = generator
-        self.converters = []
-        self.drop_last = drop_last
-
-        for var in feed_list:
-            assert var.lod_level == 0, "lod_level must be 0"
-            self.converters.append(
-                DataToLoDTensorConverter(
-                    place=self.place,
-                    lod_level=0,
-                    shape=var.shape,
-                    dtype=var.dtype))
-
-    def _done(self):
-        return [c.done() for c in self.converters]
-
-    def __call__(self):
-        idx = 0
-        for each_sample in self.generator():
-            for each_slot, each_converter in six.moves.zip(each_sample,
-                                                           self.converters):
-                each_converter.data.append(each_slot)
-
-            idx += 1
-            if idx == self.batch_size:
-                idx = 0
-                yield self._done()
-
-        if not self.drop_last and idx > 0:
-            yield self._done()
-        else:
-            [c._reset() for c in self.converters]
-
-
-class DataFeeder(object):
-    """
-    DataFeeder converts the data that returned by a reader into a data
-    structure that can feed into Executor and ParallelExecutor. The reader
-    usually returns a list of mini-batch data entries. Each data entry in
-    the list is one sample. Each sample is a list or a tuple with one
-    feature or multiple features.
-
-    The simple usage shows below:
-
-    ..  code-block:: python
-
-        import paddle.fluid as fluid
-        place = fluid.CPUPlace()
-        img = fluid.layers.data(name='image', shape=[1, 28, 28])
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
-        result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
-
-
-    If you want to feed data into GPU side separately in advance when you
-    use multi-GPU to train a model, you can use `decorate_reader` function.
-
-    ..  code-block:: python
-
-        import paddle
-        import paddle.fluid as fluid
-        
-        place=fluid.CUDAPlace(0)
-        data = fluid.layers.data(name='data', shape=[3, 224, 224], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        
-        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
-        reader = feeder.decorate_reader(
-                paddle.batch(paddle.dataset.flowers.train(), batch_size=16), multi_devices=True)
-
-    Args:
-        feed_list(list): The Variables or Variables'name that will
-            feed into model.
-        place(Place): place indicates feed data into CPU or GPU, if you want to
-            feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents
-            the GPU id), or if you want to feed data into CPU, please using
-            `fluid.CPUPlace()`.
-        program(Program): The Program that will feed data into, if program
-            is None, it will use default_main_program(). Default None.
-
-    Raises:
-        ValueError: If some Variable is not in this Program.
-
-    Examples:
-        ..  code-block:: python
-
-
-            import numpy as np
-            import paddle
-            import paddle.fluid as fluid
-            
-            place = fluid.CPUPlace()
-            
-            def reader():
-                yield [np.random.random([4]).astype('float32'), np.random.random([3]).astype('float32')],
-            
-            main_program = fluid.Program()
-            startup_program = fluid.Program()
-            
-            with fluid.program_guard(main_program, startup_program):
-                data_1 = fluid.layers.data(name='data_1', shape=[1, 2, 2])
-                data_2 = fluid.layers.data(name='data_2', shape=[1, 1, 3])
-                out = fluid.layers.fc(input=[data_1, data_2], size=2)
-                # ...
-            
-            feeder = fluid.DataFeeder([data_1, data_2], place)
-                        
-            exe = fluid.Executor(place)
-            exe.run(startup_program)
-            for data in reader():
-                outs = exe.run(program=main_program,
-                               feed=feeder.feed(data),
-                               fetch_list=[out])
-
-    """
-
-    def __init__(self, feed_list, place, program=None):
-        self.feed_dtypes = []
-        self.feed_names = []
-        self.feed_shapes = []
-        self.feed_lod_level = []
-        if program is None:
-            program = default_main_program()
-        for each_var in feed_list:
-            if isinstance(each_var, six.string_types):
-                each_var = program.block(0).var(each_var)
-            if not isinstance(each_var, Variable):
-                raise TypeError("Feed list should contain a list of variable")
-            self.feed_dtypes.append(each_var.dtype)
-            self.feed_names.append(each_var.name)
-            self.feed_lod_level.append(each_var.lod_level)
-            self.feed_shapes.append(each_var.shape)
-
-        self.place = place
-
-    def feed(self, iterable):
-        """
-        According to feed_list and iterable, converters the input into
-        a data structure that can feed into Executor and ParallelExecutor.
-
-        Args:
-            iterable(list|tuple): the input data.
-
-        Returns:
-            dict: the result of conversion.
-
-        Examples:
-            ..  code-block:: python
-
-                import numpy.random as random
-                import paddle.fluid as fluid
-                
-                def reader(limit=5):
-                    for i in range(limit):
-                        yield random.random([784]).astype('float32'), random.random([1]).astype('int64'), random.random([256]).astype('float32')
-                
-                data_1 = fluid.layers.data(name='data_1', shape=[1, 28, 28])
-                data_2 = fluid.layers.data(name='data_2', shape=[1], dtype='int64')
-                data_3 = fluid.layers.data(name='data_3', shape=[16, 16], dtype='float32')
-                feeder = fluid.DataFeeder(['data_1','data_2', 'data_3'], fluid.CPUPlace())
-                
-                result = feeder.feed(reader()) 
-        """
-        converter = []
-        for lod_level, shape, dtype in six.moves.zip(
-                self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
-            converter.append(
-                DataToLoDTensorConverter(
-                    place=self.place,
-                    lod_level=lod_level,
-                    shape=shape,
-                    dtype=dtype))
-
-        for each_sample in iterable:
-            assert len(each_sample) == len(converter), (
-                "The number of fields in data (%d) does not match " +
-                "len(feed_list) (%d)") % (len(each_sample), len(converter))
-            for each_converter, each_slot in six.moves.zip(converter,
-                                                           each_sample):
-                each_converter.feed(each_slot)
-        ret_dict = {}
-        for each_name, each_converter in six.moves.zip(self.feed_names,
-                                                       converter):
-            ret_dict[each_name] = each_converter.done()
-        return ret_dict
-
-    def feed_parallel(self, iterable, num_places=None):
-        """
-        Takes multiple mini-batches. Each mini-batch will be feed on each
-        device in advance.
-
-        Args:
-            iterable(list|tuple): the input data.
-            num_places(int): the number of devices. Default None.
-
-        Returns:
-            dict: the result of conversion.
-
-        Notes:
-            The number of devices and number of mini-batches must be same.
-
-        Examples:
-            ..  code-block:: python
-
-                import numpy.random as random
-                import paddle.fluid as fluid
-                
-                def reader(limit=10):
-                    for i in range(limit):
-                        yield [random.random([784]).astype('float32'), random.random([1]).astype('float32')],
-                
-                x = fluid.layers.data(name='x', shape=[1, 28, 28])
-                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-                
-                fluid.layers.elementwise_add(x, y)
-                
-                feeder = fluid.DataFeeder(['x','y'], fluid.CPUPlace())
-                place_num = 2 
-                places = [fluid.CPUPlace() for x in range(place_num)]
-                data = []
-                exe = fluid.Executor(fluid.CPUPlace())
-                exe.run(fluid.default_startup_program())
-                program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(places=places)
-                for item in reader():
-                    data.append(item)
-                    if place_num == len(data):
-                        exe.run(program=program, feed=list(feeder.feed_parallel(data, place_num)), fetch_list=[])
-                        data = []
-        """
-        if isinstance(self.place, core.CUDAPlace):
-            places = [
-                core.CUDAPlace(i)
-                for i in six.moves.xrange(
-                    self._get_number_of_places_(num_places))
-            ]
-        else:
-            places = [
-                core.CPUPlace()
-                for _ in six.moves.xrange(
-                    self._get_number_of_places_(num_places))
-            ]
-
-        if len(iterable) != len(places):
-            raise ValueError("feed_parallel takes multiple mini-batches. Each "
-                             "mini-batch will be feed on each device. The "
-                             "number of devices and number of mini-batches "
-                             "must be same.")
-
-        place = self.place
-        for p, batch in six.moves.zip(places, iterable):
-            self.place = p
-            yield self.feed(batch)
-        self.place = place
-
-    def _get_number_of_places_(self, num_places):
-        if num_places is not None:
-            return int(num_places)
-        elif isinstance(self.place, core.CUDAPlace):
-            return len(_cuda_ids())
-        else:
-            return _cpu_num()
-
-    def decorate_reader(self,
-                        reader,
-                        multi_devices,
-                        num_places=None,
-                        drop_last=True):
-        """
-        Converter the input data into a data that returned by reader into
-        multiple mini-batches. Each mini-batch will be feed on each device.
-
-        Args:
-            reader(function): the reader is the function which can generate data.
-            multi_devices(bool): whether to use multiple devices or not.
-            num_places(int): if multi_devices is True, you can specify the number
-                of GPU to use, if multi_devices is None, the function will use all the
-                GPU of the current machine. Default None.
-            drop_last(bool): whether to drop the last batch if the
-                size of the last batch is less than batch_size. Default True.
-
-        Returns:
-            dict: the result of conversion.
-
-        Raises:
-            ValueError: If drop_last is False and the data batch cannot fit for devices.
-
-        Examples:
-            ..  code-block:: python
-
-                import numpy.random as random
-                import paddle
-                import paddle.fluid as fluid
-                import paddle.fluid.compiler as compiler
-                
-                def reader(limit=10):
-                    for i in range(limit):
-                        yield (random.random([784]).astype('float32'), random.random([1]).astype('int64')),
-                
-                place=fluid.CUDAPlace(0)
-                data = fluid.layers.data(name='data', shape=[1, 28, 28], dtype='float32')
-                label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-                
-                hidden = fluid.layers.fc(input=data, size=10)
-                
-                feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
-                reader = feeder.decorate_reader(reader, multi_devices=True)
-                
-                exe = fluid.Executor(place)
-                exe.run(fluid.default_startup_program())
-                compiled_prog = compiler.CompiledProgram(
-                         fluid.default_main_program()).with_data_parallel()
-                for i,data in enumerate(reader()):
-                    print('iteration : ', i + 1)
-                    ret = exe.run(compiled_prog, feed=data, fetch_list=[hidden])
-        """
-
-        def __reader_creator__():
-            if not multi_devices:
-                for item in reader():
-                    yield self.feed(item)
-            else:
-                num = self._get_number_of_places_(num_places)
-                item = []
-                for batch in reader():
-                    item.append(batch)
-                    if len(item) == num:
-                        yield list(self.feed_parallel(item, num))
-                        item = []
-                if not drop_last and len(item) != 0:
-                    raise ValueError(
-                        "The data batch which cannot fit for devices will be "
-                        "dropped is not implementation. Other strategies are "
-                        "not implemented")
-
-        return __reader_creator__
-
-
-class NumpyToLoDTensorConverter(object):
-    def __init__(self, place):
-        self.place = place
-        self.data = []
-        self._reset()
-
-    def _reset(self):
-        self.data = []
-
-    def feed(self, data):
-        self.data.append(data)
-
-    def done(self):
-        arr = numpy.array(self.data)
-        t = core.LoDTensor()
-        t.set(arr, self.place)
-        self._reset()
-        return t
-
-
-class ListTensorProvider(object):
-    def __init__(self, generator, places):
-        self.generator = generator
-        self.converters = []
-        self.places = []
-        if places:
-            if not isinstance(places, (list, tuple)):
-                places = [places]
-            assert len(
-                places) == 1, "dygraph mode CAN NOT specify multiple places."
-            for place in places:
-                if isinstance(place, (core.CUDAPlace, core.CPUPlace)):
-                    self.places.append(place)
-                else:
-                    raise ValueError(
-                        "Please specify a valid place values such as core.CPUPlace or core.CUDAPlace"
-                    )
-        if len(self.places) == 0:
-            self.places.append(_current_expected_place())
-
-    def _readData(self, iterable, places):
-        for place, each_sample in six.moves.zip(places, iterable):
-            for item in each_sample:
-                if len(self.converters) < len(item):
-                    for i in item:
-                        self.converters.append(NumpyToLoDTensorConverter(place))
-                for each_converter, each_slot in six.moves.zip(self.converters,
-                                                               item):
-                    each_converter.feed(each_slot)
-            yield [c.done() for c in self.converters]
-
-    def __call__(self):
-        item = []
-        for batch in self.generator():
-            item.append(batch)
-            if len(item) == len(self.places):
-                yield list(self._readData(item, self.places))
-                item = []
diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py
deleted file mode 100644
index 1ae2d056e85ef7949522b1562e2aa6453a250b4b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dataset.py
+++ /dev/null
@@ -1,833 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""This is defination of dataset class, which is high performance IO."""
-
-from paddle.fluid.proto import data_feed_pb2
-from google.protobuf import text_format
-from . import core
-__all__ = ['DatasetFactory', 'InMemoryDataset', 'QueueDataset']
-
-
-class DatasetFactory(object):
-    """
-    DatasetFactory is a factory which create dataset by its name,
-    you can create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset",
-    the default is "QueueDataset".
-
-    Example:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-
-    """
-
-    def __init__(self):
-        """ Init. """
-        pass
-
-    def create_dataset(self, datafeed_class="QueueDataset"):
-        """
-        Create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset",
-        the default is "QueueDataset".
-
-        Args:
-            datafeed_class(str): datafeed class name, QueueDataset or InMemoryDataset.
-                                 Default is QueueDataset.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-
-        """
-        try:
-            dataset = globals()[datafeed_class]()
-            return dataset
-        except:
-            raise ValueError("datafeed class %s does not exist" %
-                             datafeed_class)
-
-
-class DatasetBase(object):
-    """ Base dataset class. """
-
-    def __init__(self):
-        """ Init. """
-        # define class name here
-        # to decide whether we need create in memory instance
-        self.proto_desc = data_feed_pb2.DataFeedDesc()
-        self.proto_desc.pipe_command = "cat"
-        self.dataset = core.Dataset("MultiSlotDataset")
-        self.thread_num = 1
-        self.filelist = []
-
-    def set_pipe_command(self, pipe_command):
-        """
-        Set pipe command of current dataset
-        A pipe command is a UNIX pipeline command that can be used only
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_pipe_command("python my_script.py")
-
-        Args:
-            pipe_command(str): pipe command
-
-        """
-        self.proto_desc.pipe_command = pipe_command
-
-    def set_fea_eval(self, record_candidate_size, fea_eval=True):
-        """
-        set fea eval mode for slots shuffle to debug the importance level of
-        slots(features), fea_eval need to be set True for slots shuffle.
-        
-        Args:
-            record_candidate_size(int): size of instances candidate to shuffle 
-                                        one slot
-            fea_eval(bool): wheather enable fea eval mode to enable slots shuffle.
-                            default is True.
-            
-        Examples:
-            .. code-block:: python
-
-            import paddle.fluid as fluid
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-            dataset.set_fea_eval(1000000, True)
-
-        """
-        if fea_eval:
-            self.dataset.set_fea_eval(fea_eval, record_candidate_size)
-        self.fea_eval = fea_eval
-
-    def slots_shuffle(self, slots):
-        """
-        Slots Shuffle 
-        Slots Shuffle is a shuffle method in slots level, which is usually used 
-        in sparse feature with large scale of instances. To compare the metric, i.e.
-        auc while doing slots shuffle on one or several slots with baseline to 
-        evaluate the importance level of slots(features).
-        
-        Args:
-            slots(list[string]): the set of slots(string) to do slots shuffle.
-
-        Examples:
-            import paddle.fluid as fluid
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-            dataset.set_merge_by_lineid()
-            #suppose there is a slot 0
-            dataset.slots_shuffle(['0'])
-        """
-        if self.fea_eval:
-            slots_set = set(slots)
-            self.dataset.slots_shuffle(slots_set)
-
-    def set_batch_size(self, batch_size):
-        """
-        Set batch size. Will be effective during training
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_batch_size(128)
-
-        Args:
-            batch_size(int): batch size
-
-        """
-        self.proto_desc.batch_size = batch_size
-
-    def set_thread(self, thread_num):
-        """
-        Set thread num, it is the num of readers.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-               dataset.set_thread(12)
-
-        Args:
-            thread_num(int): thread num
-        """
-        self.dataset.set_thread_num(thread_num)
-        self.thread_num = thread_num
-
-    def set_filelist(self, filelist):
-        """
-        Set file list in current worker.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_filelist(['a.txt', 'b.txt'])
-
-        Args:
-            filelist(list): file list
-        """
-        self.dataset.set_filelist(filelist)
-        self.filelist = filelist
-
-    def set_use_var(self, var_list):
-        """
-        Set Variables which you will use.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_use_var([data, label])
-
-        Args:
-            var_list(list): variable list
-        """
-        multi_slot = self.proto_desc.multi_slot_desc
-        for var in var_list:
-            slot_var = multi_slot.slots.add()
-            slot_var.is_used = True
-            slot_var.name = var.name
-            if var.lod_level == 0:
-                slot_var.is_dense = True
-                slot_var.shape.extend(var.shape)
-            if var.dtype == core.VarDesc.VarType.FP32:
-                slot_var.type = "float"
-            elif var.dtype == core.VarDesc.VarType.INT64:
-                slot_var.type = "uint64"
-            else:
-                raise ValueError(
-                    "Currently, fluid.dataset only supports dtype=float32 and dtype=int64"
-                )
-
-    def set_hdfs_config(self, fs_name, fs_ugi):
-        """
-        Set hdfs config: fs name ad ugi
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
-
-        Args:
-            fs_name(str): fs name
-            fs_ugi(str): fs ugi
-        """
-        self.dataset.set_hdfs_config(fs_name, fs_ugi)
-
-    def _prepare_to_run(self):
-        """
-        Set data_feed_desc before load or shuffle,
-        user no need to call this function.
-        """
-        if self.thread_num > len(self.filelist):
-            self.thread_num = len(self.filelist)
-        self.dataset.set_thread_num(self.thread_num)
-        self.dataset.set_data_feed_desc(self.desc())
-        self.dataset.create_readers()
-
-    def _finish_to_run(self):
-        self.dataset.destroy_readers()
-
-    def desc(self):
-        """
-        Returns a protobuf message for this DataFeedDesc
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              print(dataset.desc())
-
-        Returns:
-            A string message
-        """
-        return text_format.MessageToString(self.proto_desc)
-
-    def _dynamic_adjust_before_train(self, thread_num):
-        pass
-
-    def _dynamic_adjust_after_train(self):
-        pass
-
-
-class InMemoryDataset(DatasetBase):
-    """
-    InMemoryDataset, it will load data into memory
-    and shuffle data before training.
-    This class should be created by DatasetFactory
-
-    Example:
-        dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset")
-    """
-
-    def __init__(self):
-        """ Init. """
-        super(InMemoryDataset, self).__init__()
-        self.proto_desc.name = "MultiSlotInMemoryDataFeed"
-        self.fleet_send_batch_size = None
-        self.is_user_set_queue_num = False
-        self.queue_num = None
-        self.parse_ins_id = False
-        self.parse_content = False
-        self.merge_by_lineid = False
-        self.fleet_send_sleep_seconds = None
-
-    def _prepare_to_run(self):
-        """
-        Set data_feed_desc before load or shuffle,
-        user no need to call this function.
-        """
-        if self.thread_num <= 0:
-            self.thread_num = 1
-        self.dataset.set_thread_num(self.thread_num)
-        if self.queue_num is None:
-            self.queue_num = self.thread_num
-        self.dataset.set_queue_num(self.queue_num)
-        self.dataset.set_parse_ins_id(self.parse_ins_id)
-        self.dataset.set_parse_content(self.parse_content)
-        self.dataset.set_data_feed_desc(self.desc())
-        self.dataset.create_channel()
-        self.dataset.create_readers()
-
-    def _dynamic_adjust_before_train(self, thread_num):
-        if not self.is_user_set_queue_num:
-            self.dataset.dynamic_adjust_channel_num(thread_num)
-        self.dataset.dynamic_adjust_readers_num(thread_num)
-
-    def _dynamic_adjust_after_train(self):
-        if not self.is_user_set_queue_num:
-            self.dataset.dynamic_adjust_channel_num(self.thread_num)
-        self.dataset.dynamic_adjust_readers_num(self.thread_num)
-
-    def set_queue_num(self, queue_num):
-        """
-        Set Dataset output queue num, training threads get data from queues
-
-        Args:
-            queue_num(int): dataset output queue num
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_queue_num(12)
-
-        """
-        self.is_user_set_queue_num = True
-        self.queue_num = queue_num
-
-    def set_parse_ins_id(self, parse_ins_id):
-        """
-        Set id Dataset need to parse insid
-
-        Args:
-            parse_ins_id(bool): if parse ins_id or not
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_parse_ins_id(True)
-
-        """
-        self.parse_ins_id = parse_ins_id
-
-    def set_parse_content(self, parse_content):
-        """
-        Set if Dataset need to parse content
-
-        Args:
-            parse_content(bool): if parse content or not
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_parse_content(True)
-
-        """
-        self.parse_content = parse_content
-
-    def set_fleet_send_batch_size(self, fleet_send_batch_size=1024):
-        """
-        Set fleet send batch size, default is 1024
-
-        Args:
-            fleet_send_batch_size(int): fleet send batch size
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_fleet_send_batch_size(800)
-
-        """
-        self.fleet_send_batch_size = fleet_send_batch_size
-
-    def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0):
-        """
-        Set fleet send sleep time, default is 0
-
-        Args:
-            fleet_send_sleep_seconds(int): fleet send sleep time
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_fleet_send_sleep_seconds(2)
-
-        """
-        self.fleet_send_sleep_seconds = fleet_send_sleep_seconds
-
-    def set_merge_by_lineid(self,
-                            var_list,
-                            erase_duplicate_feas=True,
-                            min_merge_size=2,
-                            keep_unmerged_ins=True):
-        """
-        Set merge by line id, instances of same line id will be merged after
-        shuffle, you should parse line id in data generator.
-
-        Args:
-            var_list(list): slots that can be merge. each element in var_list
-                            is Variable. some slots such as show and click, we
-                            usually don't merge them for same line id, so user
-                            should specify which slot can be merged.
-            erase_duplicate_feas(bool): whether erase duplicate feasigns when
-                                        merge. default is True.
-            min_merge_size(int): minimal size to merge. default is 2.
-            keep_unmerged_ins(bool): whether to keep unmerged ins, such as
-                                     ins with unique id or the num of ins with
-                                     same id is less than min_merge_size.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_merge_by_lineid()
-
-        """
-        var_name_list = [i.name for i in var_list]
-        self.dataset.set_merge_by_lineid(var_name_list, erase_duplicate_feas,
-                                         min_merge_size, keep_unmerged_ins)
-        self.merge_by_lineid = True
-
-    def load_into_memory(self):
-        """
-        Load data into memory
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-        """
-        self._prepare_to_run()
-        self.dataset.load_into_memory()
-
-    def preload_into_memory(self, thread_num=None):
-        """
-        Load data into memory in async mode
-
-        Args:
-            thread_num(int): preload thread num
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.preload_into_memory()
-              dataset.wait_preload_done()
-        """
-        self._prepare_to_run()
-        if thread_num is None:
-            thread_num = self.thread_num
-        self.dataset.set_preload_thread_num(thread_num)
-        self.dataset.create_preload_readers()
-        self.dataset.preload_into_memory()
-
-    def wait_preload_done(self):
-        """
-        Wait preload_into_memory done
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.preload_into_memory()
-              dataset.wait_preload_done()
-        """
-        self.dataset.wait_preload_done()
-        self.dataset.destroy_preload_readers()
-
-    def local_shuffle(self):
-        """
-        Local shuffle
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.local_shuffle()
-        """
-        self.dataset.local_shuffle()
-
-    def global_shuffle(self, fleet=None, thread_num=12):
-        """
-        Global shuffle.
-        Global shuffle can be used only in distributed mode. i.e. multiple
-        processes on single machine or multiple machines training together.
-        If you run in distributed mode, you should pass fleet instead of None.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.global_shuffle(fleet)
-
-        Args:
-            fleet(Fleet): fleet singleton. Default None.
-            thread_num(int): shuffle thread num. Default is 12.
-
-        """
-        trainer_num = 1
-        if fleet is not None:
-            fleet._role_maker._barrier_worker()
-            trainer_num = fleet.worker_num()
-        if self.fleet_send_batch_size is None:
-            self.fleet_send_batch_size = 1024
-        if self.fleet_send_sleep_seconds is None:
-            self.fleet_send_sleep_seconds = 0
-        self.dataset.register_client2client_msg_handler()
-        self.dataset.set_trainer_num(trainer_num)
-        self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size)
-        self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds)
-        if fleet is not None:
-            fleet._role_maker._barrier_worker()
-        self.dataset.global_shuffle(thread_num)
-        if fleet is not None:
-            fleet._role_maker._barrier_worker()
-        if self.merge_by_lineid:
-            self.dataset.merge_by_lineid()
-        if fleet is not None:
-            fleet._role_maker._barrier_worker()
-
-    def release_memory(self):
-        """
-        Release InMemoryDataset memory data, when data will not be used again.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.global_shuffle(fleet)
-              exe = fluid.Executor(fluid.CPUPlace())
-              exe.run(fluid.default_startup_program())
-              exe.train_from_dataset(fluid.default_main_program(), dataset)
-              dataset.release_memory()
-
-        """
-        self.dataset.release_memory()
-
-    def get_memory_data_size(self, fleet=None):
-        """
-        Get memory data size, user can call this function to know the num
-        of ins in all workers after load into memory.
-
-        Note:
-            This function may cause bad performance, because it has barrier
-
-        Args:
-            fleet(Fleet): Fleet Object.
-
-        Returns:
-            The size of memory data.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              print dataset.get_memory_data_size(fleet)
-
-        """
-        import numpy as np
-        local_data_size = self.dataset.get_memory_data_size()
-        local_data_size = np.array([local_data_size])
-        if fleet is not None:
-            global_data_size = local_data_size * 0
-            fleet._role_maker._node_type_comm.Allreduce(local_data_size,
-                                                        global_data_size)
-            return global_data_size[0]
-        return local_data_size[0]
-
-    def get_shuffle_data_size(self, fleet=None):
-        """
-        Get shuffle data size, user can call this function to know the num
-        of ins in all workers after local/global shuffle.
-
-        Note:
-            This function may cause bad performance to local shuffle,
-            because it has barrier. It does not affect global shuffle.
-
-        Args:
-            fleet(Fleet): Fleet Object.
-
-        Returns:
-            The size of shuffle data.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.global_shuffle(fleet)
-              print dataset.get_shuffle_data_size(fleet)
-
-        """
-        import numpy as np
-        local_data_size = self.dataset.get_shuffle_data_size()
-        local_data_size = np.array([local_data_size])
-        if fleet is not None:
-            global_data_size = local_data_size * 0
-            fleet._role_maker._node_type_comm.Allreduce(local_data_size,
-                                                        global_data_size)
-            return global_data_size[0]
-        return local_data_size[0]
-
-
-class QueueDataset(DatasetBase):
-    """
-    QueueDataset, it will process data streamly.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-
-    """
-
-    def __init__(self):
-        """
-        Initialize QueueDataset
-        This class should be created by DatasetFactory
-        """
-        super(QueueDataset, self).__init__()
-        self.proto_desc.name = "MultiSlotDataFeed"
-
-    def _prepare_to_run(self):
-        """
-        Set data_feed_desc/thread num/filelist before run,
-        user no need to call this function.
-        """
-        if self.thread_num > len(self.filelist):
-            self.thread_num = len(self.filelist)
-        if self.thread_num == 0:
-            self.thread_num = 1
-        self.dataset.set_thread_num(self.thread_num)
-        self.dataset.set_filelist(self.filelist)
-        self.dataset.set_data_feed_desc(self.desc())
-        self.dataset.create_readers()
-
-    def local_shuffle(self):
-        """
-        Local shuffle data.
-
-        Local shuffle is not supported in QueueDataset
-        NotImplementedError will be raised
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-              dataset.local_shuffle()
-
-        Raises:
-            NotImplementedError: QueueDataset does not support local shuffle
-
-        """
-        raise NotImplementedError(
-            "QueueDataset does not support local shuffle, "
-            "please use InMemoryDataset for local_shuffle")
-
-    def global_shuffle(self, fleet=None):
-        """
-        Global shuffle data.
-
-        Global shuffle is not supported in QueueDataset
-        NotImplementedError will be raised
-
-        Args:
-            fleet(Fleet): fleet singleton. Default None.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-              dataset.global_shuffle(fleet)
-
-        Raises:
-            NotImplementedError: QueueDataset does not support global shuffle
-
-        """
-        raise NotImplementedError(
-            "QueueDataset does not support global shuffle, "
-            "please use InMemoryDataset for global_shuffle")
-
-
-class FileInstantDataset(DatasetBase):
-    """
-    FileInstantDataset, it will process data streamly.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          dataset = fluid.DatasetFactory.create_dataset("FileInstantDataset")
-    """
-
-    def __init__(self):
-        """
-        Initialize FileInstantDataset
-        This class should be created by DatasetFactory
-        """
-        super(FileInstantDataset, self).__init__()
-        self.proto_desc.name = "MultiSlotFileInstantDataFeed"
-
-    def local_shuffle(self):
-        """
-        Local shuffle
-        FileInstantDataset does not support local shuffle
-        """
-        raise NotImplementedError(
-            "FileInstantDataset does not support local shuffle, "
-            "please use InMemoryDataset for local_shuffle")
-
-    def global_shuffle(self, fleet=None):
-        """
-        Global shuffle
-        FileInstantDataset does not support global shuffle
-        """
-        raise NotImplementedError(
-            "FileInstantDataset does not support global shuffle, "
-            "please use InMemoryDataset for global_shuffle")
-
-
-class BoxPSDataset(InMemoryDataset):
-    """
-    BoxPSDataset: derived from InMemoryDataset.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          dataset = fluid.DatasetFactory.create_dataset("BoxPSDataset")
-    """
-
-    def __init__(self):
-        """
-        Initialize BoxPSDataset
-        This class should be created by DatasetFactory
-        """
-        super(BoxPSDataset, self).__init__()
-        self.boxps = core.BoxPS(self.dataset)
-
-    def begin_pass(self):
-        """
-        Begin Pass
-        Notify BoxPS to begin next pass
-	"""
-        self.boxps.begin_pass()
-
-    def end_pass(self):
-        """
-        End Pass
-        Notify BoxPS to end current pass
-	"""
-        self.boxps.end_pass()
-
-    def wait_preload_done(self):
-        """
-        Wait async proload done
-        Wait Until Feed Pass Done
-	"""
-        self.boxps.wait_feed_pass_done()
-
-    def load_into_memory(self):
-        """
-	Load next pass into memory and notify boxps to fetch its emb from SSD
-	"""
-        self._prepare_to_run()
-        self.boxps.load_into_memory()
-
-    def preload_into_memory(self):
-        """
-	begin async preload next pass while current pass may be training
-	"""
-        self._prepare_to_run()
-        self.boxps.preload_into_memory()
diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py
deleted file mode 100644
index ef07dcebcabfe8aa3c0e3366597e40583a57db7c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/debugger.py
+++ /dev/null
@@ -1,367 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-import six
-import random
-import os
-import re
-from .graphviz import GraphPreviewGenerator
-from .proto import framework_pb2
-from google.protobuf import text_format
-from . import unique_name
-from .framework import Program, default_main_program, Variable
-from . import core
-from . import io
-from .layer_helper import LayerHelper
-
-_vartype2str_ = [
-    "UNK",
-    "LoDTensor",
-    "SelectedRows",
-    "FeedMinibatch",
-    "FetchList",
-    "StepScopes",
-    "LodRankTable",
-    "LoDTensorArray",
-    "PlaceList",
-]
-_dtype2str_ = [
-    "bool",
-    "int16",
-    "int32",
-    "int64",
-    "float16",
-    "float32",
-    "float64",
-]
-
-
-def repr_data_type(type):
-    return _dtype2str_[type]
-
-
-def repr_tensor(proto):
-    return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)],
-                                              str(proto.dims))
-
-
-reprtpl = "{ttype} {name} ({reprs})"
-
-
-def repr_lodtensor(proto):
-    if proto.type.type != framework_pb2.VarType.LOD_TENSOR:
-        return
-
-    level = proto.type.lod_tensor.lod_level
-    reprs = repr_tensor(proto.type.lod_tensor.tensor)
-    return reprtpl.format(
-        ttype="LoDTensor" if level > 0 else "Tensor",
-        name=proto.name,
-        reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)
-
-
-def repr_selected_rows(proto):
-    if proto.type.type != framework_pb2.VarType.SELECTED_ROWS:
-        return
-
-    return reprtpl.format(
-        ttype="SelectedRows",
-        name=proto.name,
-        reprs=repr_tensor(proto.type.selected_rows))
-
-
-def repr_tensor_array(proto):
-    if proto.type.type != framework_pb2.VarType.LOD_TENSOR_ARRAY:
-        return
-
-    return reprtpl.format(
-        ttype="TensorArray",
-        name=proto.name,
-        reprs="level=%d, %s" % (proto.type.tensor_array.lod_level,
-                                repr_tensor(proto.type.lod_tensor.tensor)))
-
-
-type_handlers = [
-    repr_lodtensor,
-    repr_selected_rows,
-    repr_tensor_array,
-]
-
-
-def repr_var(vardesc):
-    for handler in type_handlers:
-        res = handler(vardesc)
-        if res:
-            return res
-
-
-def pprint_program_codes(program_desc):
-    reprs = []
-    for block_idx in range(program_desc.desc.num_blocks()):
-        block_desc = program_desc.block(block_idx)
-        block_repr = pprint_block_codes(block_desc)
-        reprs.append(block_repr)
-    return '\n'.join(reprs)
-
-
-def pprint_block_codes(block_desc, show_backward=False):
-    def is_op_backward(op_desc):
-        if op_desc.type.endswith('_grad'): return True
-
-        def is_var_backward(var):
-            if "@GRAD" in var.parameter: return True
-            for arg in var.arguments:
-                if "@GRAD" in arg: return True
-
-        for var in op_desc.inputs:
-            if is_var_backward(var): return True
-        for var in op_desc.outputs:
-            if is_var_backward(var): return True
-        return False
-
-    def is_var_backward(var_desc):
-        return "@GRAD" in var_desc.name
-
-    if type(block_desc) is not framework_pb2.BlockDesc:
-        block_desc = framework_pb2.BlockDesc.FromString(
-            block_desc.desc.serialize_to_string())
-    var_reprs = []
-    op_reprs = []
-    for var in block_desc.vars:
-        if not show_backward and is_var_backward(var):
-            continue
-        var_reprs.append(repr_var(var))
-
-    for op in block_desc.ops:
-        if not show_backward and is_op_backward(op): continue
-        op_reprs.append(repr_op(op))
-
-    tpl = "// block-{idx}  parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
-    return tpl.format(
-        idx=block_desc.idx,
-        pidx=block_desc.parent_idx,
-        vars='\n'.join(var_reprs),
-        ops='\n'.join(op_reprs), )
-
-
-def repr_attr(desc):
-    tpl = "{key}={value}"
-    valgetter = [
-        lambda attr: attr.i,
-        lambda attr: attr.f,
-        lambda attr: attr.s,
-        lambda attr: attr.ints,
-        lambda attr: attr.floats,
-        lambda attr: attr.strings,
-        lambda attr: attr.b,
-        lambda attr: attr.bools,
-        lambda attr: attr.block_idx,
-        lambda attr: attr.l,
-    ]
-    key = desc.name
-    value = valgetter[desc.type](desc)
-    if key == "dtype":
-        value = repr_data_type(value)
-    return tpl.format(key=key, value=str(value)), (key, value)
-
-
-def _repr_op_fill_constant(optype, inputs, outputs, attrs):
-    if optype == "fill_constant":
-        return "{output} = {data} [shape={shape}]".format(
-            output=','.join(outputs),
-            data=attrs['value'],
-            shape=str(attrs['shape']))
-
-
-op_repr_handlers = [_repr_op_fill_constant, ]
-
-
-def repr_op(opdesc):
-    optype = None
-    attrs = []
-    attr_dict = {}
-    is_target = None
-    inputs = []
-    outputs = []
-
-    tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]"
-    args2value = lambda args: args[0] if len(args) == 1 else str(list(args))
-    for var in opdesc.inputs:
-        key = var.parameter
-        value = args2value(var.arguments)
-        inputs.append("%s=%s" % (key, value))
-    for var in opdesc.outputs:
-        value = args2value(var.arguments)
-        outputs.append(value)
-    for attr in opdesc.attrs:
-        attr_repr, attr_pair = repr_attr(attr)
-        attrs.append(attr_repr)
-        attr_dict[attr_pair[0]] = attr_pair[1]
-
-    is_target = opdesc.is_target
-
-    for handler in op_repr_handlers:
-        res = handler(opdesc.type, inputs, outputs, attr_dict)
-        if res: return res
-
-    return tpl.format(
-        outputs=', '.join(outputs),
-        optype=opdesc.type,
-        inputs=', '.join(inputs),
-        attrs="{%s}" % ','.join(attrs),
-        is_target=", is_target" if is_target else "")
-
-
-def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
-    '''
-    Generate a debug graph for block.
-    Args:
-        block(Block): a block.
-    '''
-    graph = GraphPreviewGenerator("some graph")
-    # collect parameters and args
-    protostr = block.desc.serialize_to_string()
-    desc = framework_pb2.BlockDesc.FromString(six.binary_type(protostr))
-
-    def need_highlight(name):
-        if highlights is None: return False
-        for pattern in highlights:
-            assert type(pattern) is str
-            if re.match(pattern, name):
-                return True
-        return False
-
-    # draw parameters and args
-    vars = {}
-    for var in desc.vars:
-        # TODO(gongwb): format the var.type
-        # create var
-        if var.persistable:
-            varn = graph.add_param(
-                var.name,
-                str(var.type).replace("\n", "<br />", 1),
-                highlight=need_highlight(var.name))
-        else:
-            varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
-        vars[var.name] = varn
-
-    def add_op_link_var(op, var, op2var=False):
-        for arg in var.arguments:
-            if arg not in vars:
-                # add missing variables as argument
-                vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg))
-            varn = vars[arg]
-            highlight = need_highlight(op.description) or need_highlight(
-                varn.description)
-            if op2var:
-                graph.add_edge(op, varn, highlight=highlight)
-            else:
-                graph.add_edge(varn, op, highlight=highlight)
-
-    for op in desc.ops:
-        opn = graph.add_op(op.type, highlight=need_highlight(op.type))
-        for var in op.inputs:
-            add_op_link_var(opn, var, False)
-        for var in op.outputs:
-            add_op_link_var(opn, var, True)
-
-    graph(path, show=False)
-
-
-def prepare_fast_nan_inf_debug(_program):
-    """
-    Given a program to run, insert a (reduce) sum op for every var in that program.
-    Instead of checking all vars originally defined in the program,
-    only those inserted ops will be checked in the c++ end, to detect if it contains NAN or INF.
-    Thereforce, the speed of nan/inf checking could be improved.
-    Please set ``FLAGS_fast_check_nan_inf" to open the fast nan/inf feature.
-    """
-
-    helper = LayerHelper('reduce_sum', **locals())
-
-    if _program is None:
-        _program = default_main_program()
-
-    for _block in _program.blocks:
-        # fetch vars in the current block
-        _vars_in_prog = []
-        for _var_name in _block.vars:
-            _vars_in_prog.append((_var_name, _block.vars[_var_name]))
-
-        # append sum_op in the current block
-        for _var_name, _var in _vars_in_prog:
-
-            try:
-
-                if _var.dtype == -1:
-                    continue
-
-                ## create a var for holding sum output
-                _output_var = _block.create_var(
-                    name=unique_name.generate("debug_var_" + _var_name),
-                    dtype=_var.dtype,
-                    type=core.VarDesc.VarType.LOD_TENSOR,
-                    persistable=False,
-                    stop_gradient=True)
-
-                ## create a sum op, input each existing var in the block
-                _block.append_op(
-                    type='sum',
-                    outputs={'Out': _output_var},
-                    inputs={'X': [_var]})
-            except Exception as e:
-                pass
-
-
-def run_fast_nan_inf_debug(executor,
-                           program=None,
-                           feed=None,
-                           fetch_list=None,
-                           feed_var_name='feed',
-                           fetch_var_name='fetch',
-                           scope=None,
-                           return_numpy=True,
-                           use_program_cache=False,
-                           dump_core=True):
-    """
-    Run a program by the given executor. Catch the exception of NAN and INF, and save persistbales into the dumped core.
-    """
-
-    assert (executor is not None)
-
-    try:
-        output = executor.run(program=program,
-                              feed=feed,
-                              fetch_list=fetch_list,
-                              feed_var_name=feed_var_name,
-                              fetch_var_name=fetch_var_name,
-                              scope=scope,
-                              return_numpy=return_numpy,
-                              use_program_cache=use_program_cache)
-
-        return output
-
-    except Exception as e:
-
-        print("catch an exception:")
-        print(e)
-
-        core_filename = "core" + str(int(random.random() * 10000)) + ".pdckpt"
-        io.save_persistables(
-            executor, "./", main_program=program, filename=core_filename)
-
-        print("dumping a core into ./%s" % core_filename)
diff --git a/python/paddle/fluid/default_scope_funcs.py b/python/paddle/fluid/default_scope_funcs.py
deleted file mode 100644
index a5b2c84dfe6f2650b4a2ee4465f723812e5d4a01..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/default_scope_funcs.py
+++ /dev/null
@@ -1,103 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Default scope function.
-
-`Paddle` manages Scope as programming language's scope.  It just a
-thread-local stack of Scope. Top of that stack is current scope, the bottom
-of that stack is all scopes' parent.
-
-Invoking `var/find_var`  can `new/find` variable in current scope.
-Invoking `enter_local_scope/leave_local_scope` can create or destroy local
-scope.
-
-A `scoped_function` will take a `function` as input. That function will be
-invoked in a new local scope.
-"""
-
-from __future__ import print_function
-
-import paddle.fluid.core
-import threading
-
-__tl_scope__ = threading.local()
-
-__all__ = [
-    'get_cur_scope',
-    'enter_local_scope',
-    'leave_local_scope',
-    'var',
-    'find_var',
-    'scoped_function',
-]
-
-
-def get_cur_scope():
-    """
-    Get current scope.
-    :rtype: paddle.fluid.core.Scope
-    """
-    cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
-    if cur_scope_stack is None:
-        __tl_scope__.cur_scope = list()
-    if len(__tl_scope__.cur_scope) == 0:
-        __tl_scope__.cur_scope.append(paddle.fluid.core.Scope())
-    return __tl_scope__.cur_scope[-1]
-
-
-def enter_local_scope():
-    """
-    Enter a new local scope
-    """
-    cur_scope = get_cur_scope()
-    new_scope = cur_scope.new_scope()
-    __tl_scope__.cur_scope.append(new_scope)
-
-
-def leave_local_scope():
-    """
-    Leave local scope
-    """
-    __tl_scope__.cur_scope.pop()
-    get_cur_scope().drop_kids()
-
-
-def var(name):
-    """
-    create variable in current scope.
-    """
-    return get_cur_scope().var(name)
-
-
-def find_var(name):
-    """
-    get variable in current scope.
-    """
-    return get_cur_scope().find_var(name)
-
-
-def scoped_function(func):
-    """
-    invoke `func` in new scope.
-
-    :param func: a callable function that will be run in new scope.
-    :type func: callable
-    """
-    enter_local_scope()
-    try:
-        func()
-    except:
-        raise
-    finally:
-        leave_local_scope()
diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py
deleted file mode 100755
index 9b5331629df20e567d3001c2969438b4e4c6714d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/device_worker.py
+++ /dev/null
@@ -1,244 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ['DeviceWorker', 'Hogwild', 'DownpourSGD', 'Section']
-
-
-class DeviceWorker(object):
-    """
-    DeviceWorker is an abstract class, which generates worker desc.
-    This class is an inner class that we do computation logics within
-    the implementation. For example, execution of a program or a graph.
-    """
-
-    def __init__(self):
-        """
-        Init.
-        """
-        self._program = None
-        self._infer = None
-
-    def _set_infer(self, infer=False):
-        """
-        set inference flag for current device worker
-        
-        Args:
-            infer(bool): whether to do inference
-        """
-        self._infer = infer
-
-    def _set_fleet_desc(self, fleet_desc):
-        """
-        Set fleet desc.
-
-        Args:
-            fleet_desc(PSParameter): pslib.PSParameter object
-        """
-        self._fleet_desc = fleet_desc
-
-    def _set_program(self, program):
-        """
-        Set program.
-
-        Args:
-            program(Program): a Program object
-        """
-        self._program = program
-
-    def _gen_worker_desc(self, trainer_desc):
-        """
-        Generator worker desc.
-
-        Args:
-            trainer_desc(TrainerDesc): a TrainerDesc object
-        """
-        raise NotImplementedError(
-            "DeviceWorker does not implement gen_worker_desc, "
-            "please use Hogwild or DownpourSGD, etc.")
-
-
-class Hogwild(DeviceWorker):
-    """
-    Hogwild is a kind of SGD algorithm.
-
-    """
-
-    def __init__(self):
-        """
-        Init.
-        """
-        super(Hogwild, self).__init__()
-
-    def _gen_worker_desc(self, trainer_desc):
-        """
-        Generator worker desc, which device worker is HogwildWorker.
-
-        Args:
-            trainer_desc(TrainerDesc): a TrainerDesc object
-        """
-        trainer_desc.device_worker_name = "HogwildWorker"
-        if self._infer:
-            # just ignore feed op for inference model
-            trainer_desc.hogwild_param.skip_ops.extend(["feed"])
-
-
-class DownpourSGD(DeviceWorker):
-    """
-    DownpourSGD is a kind of distributed SGD algorithm.
-    """
-
-    def __init__(self):
-        """
-        Init.
-        initialize downpourSGD device worker
-        """
-        super(DownpourSGD, self).__init__()
-
-    def _gen_worker_desc(self, trainer_desc):
-        """
-        Generator worker desc, which device worker is DownpourWorker.
-
-        Args:
-            trainer_desc(TrainerDesc): a TrainerDesc object
-        """
-        dense_table_set = set()
-        program_id = str(id(self._program))
-        if self._program == None:
-            print("program of current device worker is not configured")
-            exit(-1)
-        opt_info = self._program._fleet_opt
-        program_configs = opt_info["program_configs"]
-        downpour = trainer_desc.downpour_param
-
-        for pid in program_configs:
-            if pid == program_id:
-                pc = downpour.program_config.add()
-                pc.program_id = program_id
-                for i in program_configs[program_id]["push_sparse"]:
-                    pc.push_sparse_table_id.extend([i])
-                for i in program_configs[program_id]["push_dense"]:
-                    pc.push_dense_table_id.extend([i])
-                    dense_table_set.add(i)
-                for i in program_configs[program_id]["pull_sparse"]:
-                    pc.pull_sparse_table_id.extend([i])
-                for i in program_configs[program_id]["pull_dense"]:
-                    pc.pull_dense_table_id.extend([i])
-                    dense_table_set.add(i)
-                break
-
-        trainer_desc.device_worker_name = "DownpourWorker"
-        pull_thread = trainer_desc.pull_dense_param
-        pull_thread.device_num = trainer_desc.thread_num
-        for i in self._fleet_desc.trainer_param[0].dense_table:
-            if i.table_id in dense_table_set:
-                dense_table = pull_thread.dense_table.add()
-                dense_table.dense_value_name.extend(i.dense_variable_name)
-                dense_table.table_id = \
-                    i.table_id
-        sparse_len = len(self._fleet_desc.trainer_param[0].sparse_table)
-        for i in range(sparse_len):
-            sparse_table = downpour.sparse_table.add()
-            sparse_table.table_id = \
-                        self._fleet_desc.trainer_param[0].sparse_table[i].table_id
-            sparse_table.sparse_key_name.extend(
-                self._fleet_desc.trainer_param[0].sparse_table[i].slot_key)
-            sparse_table.sparse_value_name.extend(
-                self._fleet_desc.trainer_param[0].sparse_table[i].slot_value)
-            sparse_table.sparse_grad_name.extend(
-                self._fleet_desc.trainer_param[0].sparse_table[i].slot_gradient)
-            if opt_info["use_cvm"]:
-                sparse_table.emb_dim = \
-                    self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
-                    i].accessor.fea_dim
-                sparse_table.fea_dim = sparse_table.emb_dim
-            else:
-                sparse_table.emb_dim = \
-                    self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
-                    i].accessor.fea_dim - 2
-                sparse_table.fea_dim = sparse_table.emb_dim + 2
-            # TODO(guru4elephant): hard code here, need to improve
-            sparse_table.label_var_name = "click"
-        if opt_info["stat_var_names"]:
-            for i in opt_info["stat_var_names"]:
-                downpour.stat_var_names.extend([i])
-
-        for i in self._fleet_desc.trainer_param[0].dense_table:
-            if i.table_id in dense_table_set:
-                dense_table = downpour.dense_table.add()
-                dense_table.table_id = i.table_id
-                dense_table.dense_value_name.extend(i.dense_variable_name)
-                dense_table.dense_grad_name.extend(
-                    i.dense_gradient_variable_name)
-                downpour.skip_ops.extend(self._fleet_desc.trainer_param[0].skip_op)
-        if self._infer:
-            downpour.push_dense = False
-            downpour.push_sparse = False
-
-
-class Section(DeviceWorker):
-    """
-    SectionWorker
-    """
-
-    def __init__(self):
-        """
-        Init.
-        """
-        super(Section, self).__init__()
-
-    def _gen_worker_desc(self, trainer_desc):
-        """
-        Generator worker desc, which device worker is SectionWorker.
-        Args:
-            trainer_desc(TrainerDesc): a TrainerDesc object
-        """
-        from google.protobuf import text_format
-        from . import core
-        trainer_desc.device_worker_name = "SectionWorker"
-        pipeline_opt = self._program._pipeline_opt
-        section_param = trainer_desc.section_param
-        section_param.queue_size = pipeline_opt["queue_size"]
-        section_param.sync_steps = pipeline_opt["sync_steps"]
-        section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
-        for e in pipeline_opt["param_need_sync"]:
-            section_param.param_need_sync.append(e)
-        for i, program in enumerate(pipeline_opt["section_program_list"]):
-            cfg = section_param.section_config.add()
-            cfg.program_desc.ParseFromString(program["program"]._get_desc()
-                                             .serialize_to_string())
-            # TODO: why does not work
-            #cfg.program_desc.CopyFrom(program.program._get_desc())
-            place = pipeline_opt["place_list"][i]
-            if isinstance(place, core.CPUPlace):
-                cfg.place = cfg.CPUPlace
-            elif isinstance(place, core.CUDAPlace):
-                cfg.place = cfg.CUDAPlace
-            elif isinstance(place, core.CUDAPinnedPlace):
-                cfg.place = cfg.CUDAPinnedPlace
-            else:
-                raise NotImplementedError(
-                    "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
-                )
-
-            cfg.concurrency = pipeline_opt["concurrency_list"][i]
-            for var in program["input_set"]:
-                cfg.section_in_var_names.append(var)
-            for var in program["output_set"]:
-                cfg.section_out_var_names.append(var)
-
-
-class DeviceWorkerFactory(object):
-    def _create_device_worker(self, worker_type):
-        classname = worker_type.capitalize()
-        return globals()[classname]()
diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py
deleted file mode 100644
index 74824f6832442d5090e0cea2962ca2f68b7a0181..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distribute_lookup_table.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-LOOKUP_TABLE_TYPE = "lookup_table"
-
-
-def find_distributed_lookup_table_inputs(program, table_name):
-    """
-    Find input variable of distribute lookup table in program.
-    We only support one distribute table now.
-    Args:
-    program(Program): given program, locate distributed lookup table
-    table_name(str): given table name that is found beforehand
-    Returns:
-    inputs
-    """
-    local_vars = program.current_block().vars
-    inputs = []
-    for op in program.global_block().ops:
-        if op.type == LOOKUP_TABLE_TYPE:
-            if table_name == op.input("W")[0]:
-                inputs.extend([local_vars[name] for name in op.input("Ids")])
-    return inputs
-
-
-def find_distributed_lookup_table_outputs(program, table_name):
-    """
-    Find output variable of distribute lookup table in program.
-    We only support one distribute table now.
-    Args:
-    program(Program): given program, locate distributed lookup table
-    table_name(str): given table name that is found beforehand
-    Returns:
-    outputs
-    """
-    local_vars = program.current_block().vars
-    outputs = []
-    for op in program.global_block().ops:
-        if op.type == LOOKUP_TABLE_TYPE:
-            if table_name == op.input("W")[0]:
-                outputs.extend([local_vars[name] for name in op.output("Out")])
-    return outputs
-
-
-def find_distributed_lookup_table(program):
-    """
-    Find distribute lookup table in program.
-    We only support one distribute table now.
-    Args:
-    program(Program): given program, locate distributed lookup table
-    Returns:
-    table_name or None
-    """
-    table_name = None
-
-    for op in program.global_block().ops:
-        if op.type == LOOKUP_TABLE_TYPE:
-            if op.attr('is_distributed') is True:
-                if table_name is None:
-                    table_name = op.input("W")[0]
-                if table_name != op.input("W")[0]:
-                    raise RuntimeError("all distributed lookup_table_ops"
-                                       " should have only one table")
-            else:
-                if table_name is not None:
-                    assert op.input("W")[0] != table_name
-
-    return table_name
diff --git a/python/paddle/fluid/distributed/__init__.py b/python/paddle/fluid/distributed/__init__.py
deleted file mode 100644
index cd609c504078b907221a689fbb4e910ec8d54270..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distributed/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py
deleted file mode 100755
index 6908220192557fd540353939c3a5aff526214b50..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distributed/downpour.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-from .node import DownpourServer
-from .node import DownpourWorker
-from ..backward import append_backward
-import ps_pb2 as pslib
-from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
-from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs
-from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs
-from google.protobuf import text_format
-
-
-class DownpourSGD(object):
-    """
-    Distributed optimizer of downpour stochastic gradient descent
-    Standard implementation of Google's Downpour SGD
-    in Large Scale Distributed Deep Networks
-
-    Args:
-        learning_rate (float): the learning rate used to update parameters. \
-        Can be a float value
-    Examples:
-        .. code-block:: python
-    
-             opt = fluid.DistributedOptimizer(sgd_opt)
-             opt.minimize()
-
-             downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2)
-             downpour_sgd.minimize(cost)
-    """
-
-    def __init__(self, learning_rate=0.001, window=1):
-        # todo(guru4elephant): add more optimizers here as argument
-        # todo(guru4elephant): make learning_rate as a variable
-        self.learning_rate_ = learning_rate
-        self.window_ = window
-        self.type = "downpour"
-        self.data_norm_name = [
-            ".batch_size", ".batch_square_sum", ".batch_sum",
-            ".batch_size@GRAD", ".batch_square_sum@GRAD", ".batch_sum@GRAD"
-        ]
-
-    def minimize(self,
-                 losses,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        """
-        DownpounSGD is a distributed optimizer so
-        that user can call minimize to generate backward
-        operators and optimization operators within minmize function
-        Args:
-            loss(Variable): loss variable defined by user
-            startup_program(Program): startup program that defined by user
-            parameter_list(str list): parameter names defined by users
-            no_grad_set(set): a set of variables that is defined by users
-            so that these variables do not need gradient computation
-        Returns:
-            [ps_param, worker_skipped_ops]
-            ps_param: parameter server protobuf desc
-            worker_skipped_ops: operator names that need
-            to be skipped during execution
-        """
-        if not isinstance(losses, list):
-            raise ValueError('losses is a list, just lick [model.cost]')
-        table_name = find_distributed_lookup_table(losses[0].block.program)
-        prefetch_slots = find_distributed_lookup_table_inputs(
-            losses[0].block.program, table_name)
-        prefetch_slots_emb = find_distributed_lookup_table_outputs(
-            losses[0].block.program, table_name)
-
-        ps_param = pslib.PSParameter()
-        server = DownpourServer()
-        worker = DownpourWorker(self.window_)
-        sparse_table_index = 0
-        server.add_sparse_table(sparse_table_index, self.learning_rate_,
-                                prefetch_slots, prefetch_slots_emb)
-        worker.add_sparse_table(sparse_table_index, self.learning_rate_,
-                                prefetch_slots, prefetch_slots_emb)
-        dense_table_index = 1
-        program_configs = []
-        param_grads_list = []
-        tp = ps_param.trainer_param.add()
-        for loss_index in range(len(losses)):
-            program_config = tp.program_config.add()
-            program_config.program_id = str(
-                id(losses[loss_index].block.program))
-            program_config.pull_sparse_table_id.extend([sparse_table_index])
-            program_config.push_sparse_table_id.extend([sparse_table_index])
-            params_grads = sorted(
-                append_backward(losses[loss_index], parameter_list,
-                                no_grad_set),
-                key=lambda x: x[0].name)
-            param_grads_list.append(params_grads)
-            params = []
-            grads = []
-            data_norm_params = []
-            data_norm_grads = []
-            for i in params_grads:
-                is_data_norm_data = False
-                for data_norm_name in self.data_norm_name:
-                    if i[0].name.endswith(data_norm_name):
-                        is_data_norm_data = True
-                        data_norm_params.append(i[0])
-                if not is_data_norm_data:
-                    params.append(i[0])
-            for i in params_grads:
-                is_data_norm_data = False
-                for data_norm_grad in self.data_norm_name:
-                    if i[0].name.endswith(data_norm_grad):
-                        is_data_norm_data = True
-                        data_norm_grads.append(i[1])
-                if not is_data_norm_data:
-                    grads.append(i[1])
-            server.add_dense_table(dense_table_index, self.learning_rate_,
-                                   params, grads)
-            worker.add_dense_table(dense_table_index, self.learning_rate_,
-                                   params, grads)
-            program_config.pull_dense_table_id.extend([dense_table_index])
-            program_config.push_dense_table_id.extend([dense_table_index])
-            if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
-                dense_table_index += 1
-                server.add_data_norm_table(dense_table_index,
-                                           self.learning_rate_,
-                                           data_norm_params, data_norm_grads)
-                worker.add_dense_table(dense_table_index, self.learning_rate_,
-                                       data_norm_params, data_norm_grads)
-                program_config.pull_dense_table_id.extend([dense_table_index])
-                program_config.push_dense_table_id.extend([dense_table_index])
-            dense_table_index += 1
-            program_configs.append(program_config)
-        ps_param.server_param.CopyFrom(server.get_desc())
-        ps_param.trainer_param[0].CopyFrom(worker.get_desc())
-        for program_config in program_configs:
-            ps_param.trainer_param[0].program_config.extend([program_config])
-        # Todo(guru4elephant): figure out how to support more sparse parameters
-        # currently only support lookup_table
-        worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
-        ps_param.trainer_param[0].skip_op.extend(worker_skipped_ops)
-
-        # all fleet operations should be defined in operators in the future
-        # we want to return an object here containing:
-        # 1) worker execution strategy
-        # 2) pserver execution strategy
-        # 3) fleet configurations
-        # 4) skipped operators in runtime
-        # 5) distributed optimization
-        opt_info = {}
-        opt_info["trainer"] = "DistMultiTrainer"
-        opt_info["device_worker"] = "DownpourSGD"
-        opt_info["optimizer"] = "DownpourSGD"
-        opt_info["fleet_desc"] = ps_param
-        opt_info["worker_skipped_ops"] = worker_skipped_ops
-
-        for loss in losses:
-            loss.block.program._fleet_opt = opt_info
-
-        return None, param_grads_list
diff --git a/python/paddle/fluid/distributed/fleet.py b/python/paddle/fluid/distributed/fleet.py
deleted file mode 100644
index 8f3d2defb9f0631098de3fb9ee1fa7b1abdeb884..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distributed/fleet.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-import sys
-from .. import core
-from . import ps_instance
-
-__all__ = ['Fleet']
-
-
-class Fleet(object):
-    """
-    
-    """
-
-    def __init__(self):
-        self.instance_ = ps_instance.PaddlePSInstance()
-        self.fleet_ = core.FleetWrapper()
-
-    def stop(self):
-        self.instance_.barrier_worker()
-        if self.instance.is_first_worker():
-            self.fleet_.stop_server()
-        self.instance_.barrier_worker()
-        self.instance_.barrier_all()
-        self.instance.finalize()
-
-    def init_pserver(self, opt_info):
-        if "fleet_desc" in opt_info:
-            self.dist_desc_str_ = text_format.MessageToString(opt_info[
-                "fleet_desc"])
-            self.dist_desc_ = opt_info["fleet_desc"]
-        else:
-            print(
-                "You should run distributed optimization to get opt_info first")
-            sys.exit(-1)
-        self.fleet_.init_server(self.dist_desc_str_)
-        ip = self.fleet_.start_server()
-        self.instance_.set_ip(ip)
-        self.instance.barrier_all()
-        ips = self.instance.gather_ips()
-        self.fleet.gather_servers(ips, self.instance_.get_node_cnt())
-        self.instance_.barrier_all()
-
-    def init_worker(self, opt_info):
-        if "fleet_desc" in opt_info:
-            self.dist_desc_str_ = text_format.MessageToString(opt_info[
-                "fleet_desc"])
-            self.dist_desc_ = opt_info["fleet_desc"]
-        else:
-            print(
-                "You should run distributed optimization to get opt_info first")
-            sys.exit(-1)
-        self.instance_.barrier_all()
-        ips = self.instance.gather_ips()
-        self.fleet_.init_worker(self.dist_desc_str_, ips,
-                                self.instance_.get_node_cnt(),
-                                self.instance._rankid)
-        self.instance.barrier_worker()
-
-    def init_pserver_model(self):
-        if self.instance_.is_first_worker():
-            self.fleet_.init_model()
-        self.instance_.barrier_worker()
-
-    def save_pserver_model(self, save_path):
-        self.fleet_.save_model(save_path)
diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py
deleted file mode 100644
index 20f45b4e7961544d60053306b40325386d36bda3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distributed/helper.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-class FileSystem(object):
-    """
-    A file system that support hadoop client desc. 
-
-    Args:
-        fs_type (string): fs_type, for example is "afs"
-        user (string): hadoop param
-        passwd (string): hadoop param
-        hadoop bin (string): hadoop param
-    Examples:
-        fs = FileSystm()
-    """
-
-    def __init__(self,
-                 fs_type="afs",
-                 uri="afs://xx",
-                 user=None,
-                 passwd=None,
-                 hadoop_bin=""):
-        assert user != None
-        assert passwd != None
-        assert hadoop_bin != None
-        import ps_pb2 as pslib
-        self.fs_client = pslib.FsClientParameter()
-        self.fs_client.uri = uri
-        self.fs_client.user = user
-        self.fs_client.passwd = passwd
-        #self.fs_client.buffer_size = 0
-        self.fs_client.hadoop_bin = hadoop_bin
-        #self.fs_client.afs_conf = afs_conf if not afs_conf else ""
-
-    def get_desc(self):
-        """
-        get hadoop desc.
-        """
-        return self.fs_client
-
-
-class MPIHelper(object):
-    """
-    MPIHelper is a wrapper of mpi4py, support get_rank get_size etc.
-    Args:
-        No params
-    Examples:
-        mh = MPIHelper()
-        mh.get_ip()
-    """
-
-    def __init__(self):
-        from mpi4py import MPI
-        self.comm = MPI.COMM_WORLD
-        self.MPI = MPI
-
-    def get_rank(self):
-        return self.comm.Get_rank()
-
-    def get_size(self):
-        return self.comm.Get_size()
-
-    def get_ip(self):
-        import socket
-        local_ip = socket.gethostbyname(socket.gethostname())
-        return local_ip
-
-    def get_hostname(self):
-        import socket
-        return socket.gethostname()
-
-    def finalize(self):
-        self.MPI.Finalize()
diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py
deleted file mode 100644
index 41e0d64e0b788b0e354f7635c3d3e52d6bba7e23..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distributed/node.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-import ps_pb2 as pslib
-
-
-class Server(object):
-    """
-        A Server basic class.
-    """
-
-    def __init__(self):
-        pass
-
-
-class Worker(object):
-    """
-        A Worker basic class.
-    """
-
-    def __init__(self):
-        pass
-
-
-class DownpourServer(Server):
-    """
-        DownpourServer class is used to generate server program_desc
-        Args:
-            server: it is pslib.ServerParameter() 
-        Examples:
-            server = DownpourServer()
-    """
-
-    def __init__(self):
-        self.server_ = pslib.ServerParameter()
-        self.server_.downpour_server_param.service_param.start_server_port = 0
-        self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
-        self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
-        self.server_.downpour_server_param.service_param.service_class = "DownpourPsService"
-        self.server_.downpour_server_param.service_param.start_server_port = 0
-        self.server_.downpour_server_param.service_param.server_thread_num = 12
-
-    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
-                         slot_value_var):
-        """
-        Args:
-            table_id(int): id of sparse params table
-            learning_rate(float): the learning rate used to update parameters. \
-                Can be a float value
-            slot_key_vars(string): slot key id 
-            slot_value_var(string): slot key value after embedding
-        Returns:
-            return None 
-        """
-        table = self.server_.downpour_server_param.downpour_table_param.add()
-        table.table_id = table_id
-        table.table_class = "DownpourSparseTable"
-        table.type = pslib.PS_SPARSE_TABLE
-        table.accessor.accessor_class = "DownpourFeatureValueAccessor"
-        table.accessor.sparse_sgd_param.learning_rate = learning_rate
-        table.accessor.sparse_sgd_param.initial_g2sum = 3
-        table.accessor.sparse_sgd_param.initial_range = 1e-4
-        table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10])
-
-        table.accessor.embedx_dim = 8
-        table.accessor.embedx_threshold = 5
-        table.accessor.fea_dim = 11
-        table.accessor.downpour_accessor_param.nonclk_coeff = 0.1
-        table.accessor.downpour_accessor_param.click_coeff = 2
-        table.accessor.downpour_accessor_param.base_threshold = 0.2
-        table.accessor.downpour_accessor_param.delta_threshold = 0.15
-        table.accessor.downpour_accessor_param.delta_keep_days = 31
-        table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999
-        table.accessor.downpour_accessor_param.delete_threshold = 0.8
-
-    def add_dense_table(self, table_id, learning_rate, param_var, grad_var):
-        """
-        Args:
-            table_id(int): id of sparse params table
-            learning_rate(float): the learning rate used to update parameters. \
-                Can be a float value
-            param_var(list): all dense param. it is a list.
-            grad_var(list): all dense grad parm it is a list.
-        Returns:
-            return None 
-        """
-        table = self.server_.downpour_server_param.downpour_table_param.add()
-        table.table_id = table_id
-        table.table_class = "DownpourDenseTable"
-        table.type = pslib.PS_DENSE_TABLE
-        table.accessor.accessor_class = "DownpourDenseValueAccessor"
-        table.accessor.dense_sgd_param.name = "adam"
-        table.accessor.dense_sgd_param.adam.learning_rate = learning_rate
-        table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993
-        table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999
-        table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8
-        table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99
-        table.accessor.dense_sgd_param.naive.learning_rate = 0.0002
-        fea_dim = 0
-        for param in filter(lambda x: x.name.find("embedding") == -1,
-                            param_var):
-            fea_dim += reduce(lambda x, y: x * y, param.shape, 1)
-        table.accessor.fea_dim = fea_dim
-
-    def get_desc(self):
-        """
-        Return downpour server program_desc
-        """
-        return self.server_
-
-
-class DownpourWorker(Worker):
-    """
-        DownpourWorker class is used to generate worker program_desc
-        Args:
-            window (int): push params frequency
-            worker: it is pslib.DownpourTrainerParameter 
-        Examples:
-            worker = DownpourWorker(1)
-    """
-
-    def __init__(self, window):
-        self.window = window
-        self.worker_ = pslib.DownpourTrainerParameter()
-
-    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
-                         slot_value_vars):
-        """
-        Args:
-            table_id(int): id of sparse params table
-            learning_rate(float): the learning rate used to update parameters. \
-                Can be a float value
-            slot_key_vars(string): slot key id 
-            slot_value_var(string): slot key value after embedding
-        Returns:
-            return None 
-        """
-        table = self.worker_.sparse_table.add()
-        table.table_id = table_id
-        table.slot_key.extend([var.name for var in slot_key_vars])
-        table.slot_value.extend([var.name for var in slot_value_vars])
-        table.slot_gradient.extend(
-            [var.name + "@GRAD" for var in slot_value_vars])
-
-    def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars):
-        """
-        Args:
-            table_id(int): id of sparse params table
-            learning_rate(float): the learning rate used to update parameters. \
-                Can be a float value
-            param_var(list): all dense param. it is a list.
-            grad_var(list): all dense grad parm it is a list.
-        Returns:
-            return None 
-        """
-        table = self.worker_.dense_table.add()
-        table.table_id = table_id
-        table.dense_variable_name.extend(
-            filter(lambda x: x.find("embedding") == -1,
-                   [p.name for p in param_vars]))
-        table.dense_gradient_variable_name.extend(
-            filter(lambda x: x.find("embedding") == -1,
-                   [g.name for g in grad_vars]))
-
-    def get_desc(self):
-        """
-        Return downpour worker program_desc
-        """
-        return self.worker_
diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py
deleted file mode 100644
index 19d661c660efef8394bd2369f7759645ebbf3c5d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distributed/ps_instance.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-from .helper import MPIHelper
-
-
-class PaddlePSInstance(object):
-    """
-        PaddlePSInstance class is used to generate A instance of server or worker 
-        Args:
-            server_worker_mode: is a value 0 or 1, default is 1
-            proc_per_node: process per node, default is 2 
-        Examples:
-            instance = PaddlePSInstance(1, 2)
-    """
-
-    def __init__(self, server_worker_mode, proc_per_node):
-        self.dh = MPIHelper()
-        self._rankid = self.dh.get_rank()
-        self._server_worker_mode = server_worker_mode
-        self._proc_per_node = proc_per_node
-        self._nodes = self.dh.get_size()
-
-        self._ip = 0
-        self._worker_num = self._nodes * self._proc_per_node / 2
-        self._server_num = self._nodes * self._proc_per_node / 2
-        self._total_server_worker = self._worker_num + self._server_num
-        self._node_type = None  #IDLE=-1, WORKER=1, SERVER=0
-        self._set_nodetype()
-        self._comm = None
-        self._split_comm()
-
-    def _set_nodetype(self):
-        if self._server_worker_mode == 0:
-            if self._rankid < self._server_num:
-                self._node_type = 1
-            elif self._rankid < self._total_server_worker:
-                self._node_type = 0
-            else:
-                self._node_type = -1
-        elif self._server_worker_mode == 1:
-            if self._rankid < self._total_server_worker:
-                if 0 == self._rankid % self._proc_per_node % 2:
-                    self._node_type = 0
-                else:
-                    self._node_type = 1
-            else:
-                self._node_type = -1
-        else:
-            self._node_type = -1
-
-    def _split_comm(self):
-        if self.is_server():
-            self._comm = self.dh.comm.Split(self._node_type)
-        elif self.is_worker():
-            self._comm = self.dh.comm.Split(self._node_type)
-        pass
-
-    def get_worker_index(self):
-        """
-        Return worker index 
-        """
-        if self._server_worker_mode == 0:
-            return self._rankid == self.server_num
-        else:
-            return self._rankid / self._proc_per_node
-
-    def get_server_index(self):
-        """
-        Return server index 
-        """
-        if self._server_worker_mode == 0:
-            return self.rank_id
-        else:
-            return self.rank_id / self._proc_per_node
-
-    def is_worker(self):
-        """
-        Return instance is worker or not
-        """
-        return self._node_type == 1
-
-    def is_server(self):
-        """
-        Return instance is server or not
-        """
-        return self._node_type == 0
-
-    def is_first_worker(self):
-        """
-        Return instance is first worker or not
-        """
-        return self.is_worker() and 0 == self.get_worker_index()
-
-    def set_ip(self, ip):
-        """
-            set server ip
-        """
-        self._ip = ip
-
-    def gather_ips(self):
-        """
-        Return all servers and workers ip throught mpi allgather 
-        """
-        self._ips = self.dh.comm.allgather(self._ip)
-        return self._ips
-
-    def get_node_cnt(self):
-        """
-        Return node cnt
-        """
-        return self._nodes
-
-    def get_worker_num(self):
-        """
-        Return worker num
-        """
-        return self._worker_num
-
-    def get_server_num(self):
-        """
-        Return server num
-        """
-        return self._server_num
-
-    def barrier_all(self):
-        """
-        barrier workers and servers
-        """
-        self.dh.comm.barrier()
-
-    def barrier_worker(self):
-        """
-        barrier workers
-        """
-        if self.is_worker():
-            self._comm.barrier()
-        pass
-
-    def finalize(self):
-        """
-        MPI finalize
-        """
-        self.dh.finalize()
-        pass
-
-
-if __name__ == "__main__":
-    instance = PaddlePSInstance(1, 1, 2, 50)
-    instance.barrier_all()
diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py
deleted file mode 100644
index d515a870fbccd00501a04102378ff4185276680b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distributed/ps_pb2.py
+++ /dev/null
@@ -1,1942 +0,0 @@
-# Generated by the protocol buffer compiler.  DO NOT EDIT!
-# source: ps.proto
-
-import sys
-_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
-from google.protobuf.internal import enum_type_wrapper
-from google.protobuf import descriptor as _descriptor
-from google.protobuf import message as _message
-from google.protobuf import reflection as _reflection
-from google.protobuf import symbol_database as _symbol_database
-from google.protobuf import descriptor_pb2
-# @@protoc_insertion_point(imports)
-
-_sym_db = _symbol_database.Default()
-
-
-
-
-DESCRIPTOR = _descriptor.FileDescriptor(
-  name='ps.proto',
-  package='paddle',
-  syntax='proto2',
-  serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\xb5\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12\x15\n\x0binit_gflags\x18\x04 \x01(\t:\x00\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x03(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xfd\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\x12-\n\x0eprogram_config\x18\x06 \x03(\x0b\x32\x15.paddle.ProgramConfig\"\x99\x01\n\rProgramConfig\x12\x12\n\nprogram_id\x18\x01 \x02(\t\x12\x1c\n\x14push_sparse_table_id\x18\x02 \x03(\x05\x12\x1b\n\x13push_dense_table_id\x18\x03 \x03(\x05\x12\x1c\n\x14pull_sparse_table_id\x18\x04 \x03(\x05\x12\x1b\n\x13pull_dense_table_id\x18\x05 \x03(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 
\x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x02\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x17\n\tshard_num\x18\x03 \x01(\x04:\x04\x31\x30\x30\x30\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\'\n\x19\x65nable_sparse_table_cache\x18\x07 \x01(\x08:\x04true\x12\'\n\x17sparse_table_cache_rate\x18\x08 \x01(\x01:\x06\x30.0025\x12\'\n\x1bsparse_table_cache_file_num\x18\t \x01(\r:\x02\x31\x36\"\xc1\x04\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x13\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r:\x02\x31\x31\x12\x15\n\nembedx_dim\x18\x05 \x01(\r:\x01\x38\x12\x1c\n\x10\x65mbedx_threshold\x18\x06 \x01(\r:\x02\x31\x30\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\x12\x44\n\x16sparse_commonsgd_param\x18\t \x01(\x0b\x32$.paddle.SparseCommonSGDRuleParameter\x12=\n\x0f\x65mbed_sgd_param\x18\n 
\x01(\x0b\x32$.paddle.SparseCommonSGDRuleParameter\x12>\n\x10\x65mbedx_sgd_param\x18\x0b \x01(\x0b\x32$.paddle.SparseCommonSGDRuleParameter\"\xba\x02\n\x1e\x44ownpourTableAccessorParameter\x12\x19\n\x0cnonclk_coeff\x18\x01 \x01(\x02:\x03\x30.1\x12\x16\n\x0b\x63lick_coeff\x18\x02 \x01(\x02:\x01\x31\x12\x1b\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02:\x03\x31.5\x12\x1d\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02:\x04\x30.25\x12\x1b\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02:\x02\x31\x36\x12#\n\x15show_click_decay_rate\x18\x06 \x01(\x02:\x04\x30.98\x12\x1d\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02:\x03\x30.8\x12$\n\x18\x64\x65lete_after_unseen_days\x18\x08 \x01(\x02:\x02\x33\x30\x12\"\n\x17ssd_unseenday_threshold\x18\t \x01(\x05:\x01\x31\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"\x85\x01\n\x16SparseSGDRuleParameter\x12\x1b\n\rlearning_rate\x18\x01 \x01(\x01:\x04\x30.05\x12\x18\n\rinitial_g2sum\x18\x02 \x01(\x01:\x01\x33\x12\x1d\n\rinitial_range\x18\x03 \x01(\x01:\x06\x30.0001\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xc6\x01\n\x1cSparseCommonSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x32\n\x05naive\x18\x02 \x01(\x0b\x32#.paddle.SparseNaiveSGDRuleParameter\x12\x36\n\x07\x61\x64\x61grad\x18\x03 \x01(\x0b\x32%.paddle.SparseAdagradSGDRuleParameter\x12,\n\x04\x61\x64\x61m\x18\x04 \x01(\x0b\x32\x1e.paddle.SparseAdamSGDParameter\"p\n\x1bSparseNaiveSGDRuleParameter\x12\x1b\n\rlearning_rate\x18\x01 \x01(\x01:\x04\x30.05\x12\x1d\n\rinitial_range\x18\x02 \x01(\x01:\x06\x30.0001\x12\x15\n\rweight_bounds\x18\x03 \x03(\x02\"\x8c\x01\n\x1dSparseAdagradSGDRuleParameter\x12\x1b\n\rlearning_rate\x18\x01 
\x01(\x01:\x04\x30.05\x12\x18\n\rinitial_g2sum\x18\x02 \x01(\x01:\x01\x33\x12\x1d\n\rinitial_range\x18\x03 \x01(\x01:\x06\x30.0001\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xc8\x01\n\x16SparseAdamSGDParameter\x12\x1c\n\rlearning_rate\x18\x01 \x01(\x01:\x05\x30.001\x12\x1d\n\rinitial_range\x18\x02 \x01(\x01:\x06\x30.0001\x12\x1d\n\x10\x62\x65ta1_decay_rate\x18\x03 \x01(\x01:\x03\x30.9\x12\x1f\n\x10\x62\x65ta2_decay_rate\x18\x04 \x01(\x01:\x05\x30.999\x12\x1a\n\x0b\x61\x64\x61_epsilon\x18\x05 \x01(\x01:\x05\x31\x65-08\x12\x15\n\rweight_bounds\x18\x06 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\xac\x01\n\x10\x41\x64\x61mSGDParameter\x12\x1c\n\rlearning_rate\x18\x01 \x01(\x01:\x05\x35\x65-06\x12 \n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01:\x08\x30.999993\x12\x1e\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01:\x06\x30.9999\x12\x1a\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01:\x05\x31\x65-08\x12\x1c\n\x0emom_decay_rate\x18\x05 \x01(\x01:\x04\x30.99\"J\n\x11NaiveSGDParameter\x12\x1d\n\rlearning_rate\x18\x01 \x01(\x01:\x06\x30.0002\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 
\x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xba\x04\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x12\x1b\n\x17PS_SAVE_ONE_CACHE_TABLE\x10\r\x12\x1a\n\x16PS_GET_CACHE_THRESHOLD\x10\x0e\x12\x14\n\x10PS_CACHE_SHUFFLE\x10\x0f\x12\x11\n\rPS_COPY_TABLE\x10\x10\x12\x1c\n\x18PS_COPY_TABLE_BY_FEASIGN\x10\x11\x12(\n$PS_PULL_SPARSE_TABLE_WITH_DEPENDENCY\x10\x12\x12(\n$PS_PUSH_SPARSE_TABLE_WITH_DEPENDENCY\x10\x13\x12\x17\n\x13PS_PRINT_TABLE_STAT\x10\x14\x12\x0e\n\nPS_S2S_MSG\x10\x65\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x06\x80\x01\x01\xf8\x01\x01')
-)
-_sym_db.RegisterFileDescriptor(DESCRIPTOR)
-
-_TABLETYPE = _descriptor.EnumDescriptor(
-  name='TableType',
-  full_name='paddle.TableType',
-  filename=None,
-  file=DESCRIPTOR,
-  values=[
-    _descriptor.EnumValueDescriptor(
-      name='PS_SPARSE_TABLE', index=0, number=0,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_DENSE_TABLE', index=1, number=1,
-      options=None,
-      type=None),
-  ],
-  containing_type=None,
-  options=None,
-  serialized_start=4678,
-  serialized_end=4730,
-)
-_sym_db.RegisterEnumDescriptor(_TABLETYPE)
-
-TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE)
-_PSCMDID = _descriptor.EnumDescriptor(
-  name='PsCmdID',
-  full_name='paddle.PsCmdID',
-  filename=None,
-  file=DESCRIPTOR,
-  values=[
-    _descriptor.EnumValueDescriptor(
-      name='PS_PULL_DENSE_TABLE', index=0, number=0,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PUSH_DENSE_TABLE', index=1, number=1,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PULL_SPARSE_TABLE', index=2, number=2,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PUSH_SPARSE_TABLE', index=3, number=3,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_SHRINK_TABLE', index=4, number=4,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_SAVE_ONE_TABLE', index=5, number=5,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_SAVE_ALL_TABLE', index=6, number=6,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_LOAD_ONE_TABLE', index=7, number=7,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_LOAD_ALL_TABLE', index=8, number=8,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_CLEAR_ONE_TABLE', index=9, number=9,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_CLEAR_ALL_TABLE', index=10, number=10,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PUSH_DENSE_PARAM', index=11, number=11,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_STOP_SERVER', index=12, number=12,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_SAVE_ONE_CACHE_TABLE', index=13, number=13,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_GET_CACHE_THRESHOLD', index=14, number=14,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_CACHE_SHUFFLE', index=15, number=15,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_COPY_TABLE', index=16, number=16,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_COPY_TABLE_BY_FEASIGN', index=17, number=17,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PULL_SPARSE_TABLE_WITH_DEPENDENCY', index=18, number=18,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PUSH_SPARSE_TABLE_WITH_DEPENDENCY', index=19, number=19,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PRINT_TABLE_STAT', index=20, number=20,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_S2S_MSG', index=21, number=101,
-      options=None,
-      type=None),
-  ],
-  containing_type=None,
-  options=None,
-  serialized_start=4733,
-  serialized_end=5303,
-)
-_sym_db.RegisterEnumDescriptor(_PSCMDID)
-
-PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID)
-PS_SPARSE_TABLE = 0
-PS_DENSE_TABLE = 1
-PS_PULL_DENSE_TABLE = 0
-PS_PUSH_DENSE_TABLE = 1
-PS_PULL_SPARSE_TABLE = 2
-PS_PUSH_SPARSE_TABLE = 3
-PS_SHRINK_TABLE = 4
-PS_SAVE_ONE_TABLE = 5
-PS_SAVE_ALL_TABLE = 6
-PS_LOAD_ONE_TABLE = 7
-PS_LOAD_ALL_TABLE = 8
-PS_CLEAR_ONE_TABLE = 9
-PS_CLEAR_ALL_TABLE = 10
-PS_PUSH_DENSE_PARAM = 11
-PS_STOP_SERVER = 12
-PS_SAVE_ONE_CACHE_TABLE = 13
-PS_GET_CACHE_THRESHOLD = 14
-PS_CACHE_SHUFFLE = 15
-PS_COPY_TABLE = 16
-PS_COPY_TABLE_BY_FEASIGN = 17
-PS_PULL_SPARSE_TABLE_WITH_DEPENDENCY = 18
-PS_PUSH_SPARSE_TABLE_WITH_DEPENDENCY = 19
-PS_PRINT_TABLE_STAT = 20
-PS_S2S_MSG = 101
-
-
-_FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor(
-  name='FsApiType',
-  full_name='paddle.FsClientParameter.FsApiType',
-  filename=None,
-  file=DESCRIPTOR,
-  values=[
-    _descriptor.EnumValueDescriptor(
-      name='HDFS', index=0, number=0,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='AFS', index=1, number=1,
-      options=None,
-      type=None),
-  ],
-  containing_type=None,
-  options=None,
-  serialized_start=4646,
-  serialized_end=4676,
-)
-_sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE)
-
-
-_PSPARAMETER = _descriptor.Descriptor(
-  name='PSParameter',
-  full_name='paddle.PSParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='worker_class', full_name='paddle.PSParameter.worker_class', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='server_class', full_name='paddle.PSParameter.server_class', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='instance_class', full_name='paddle.PSParameter.instance_class', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='init_gflags', full_name='paddle.PSParameter.init_gflags', index=3,
-      number=4, type=9, cpp_type=9, label=1,
-      has_default_value=True, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='worker_param', full_name='paddle.PSParameter.worker_param', index=4,
-      number=101, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='server_param', full_name='paddle.PSParameter.server_param', index=5,
-      number=102, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='trainer_param', full_name='paddle.PSParameter.trainer_param', index=6,
-      number=301, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=7,
-      number=501, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=21,
-  serialized_end=330,
-)
-
-
-_WORKERPARAMETER = _descriptor.Descriptor(
-  name='WorkerParameter',
-  full_name='paddle.WorkerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='downpour_worker_param', full_name='paddle.WorkerParameter.downpour_worker_param', index=0,
-      number=1, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=332,
-  serialized_end=413,
-)
-
-
-_SERVERPARAMETER = _descriptor.Descriptor(
-  name='ServerParameter',
-  full_name='paddle.ServerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='downpour_server_param', full_name='paddle.ServerParameter.downpour_server_param', index=0,
-      number=1, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=415,
-  serialized_end=496,
-)
-
-
-_DOWNPOURWORKERPARAMETER = _descriptor.Descriptor(
-  name='DownpourWorkerParameter',
-  full_name='paddle.DownpourWorkerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='downpour_table_param', full_name='paddle.DownpourWorkerParameter.downpour_table_param', index=0,
-      number=1, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=498,
-  serialized_end=577,
-)
-
-
-_DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor(
-  name='DownpourTrainerParameter',
-  full_name='paddle.DownpourTrainerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='dense_table', full_name='paddle.DownpourTrainerParameter.dense_table', index=0,
-      number=1, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_table', full_name='paddle.DownpourTrainerParameter.sparse_table', index=1,
-      number=2, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='push_sparse_per_batch', full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', index=2,
-      number=3, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='push_dense_per_batch', full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', index=3,
-      number=4, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='skip_op', full_name='paddle.DownpourTrainerParameter.skip_op', index=4,
-      number=5, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='program_config', full_name='paddle.DownpourTrainerParameter.program_config', index=5,
-      number=6, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=580,
-  serialized_end=833,
-)
-
-
-_PROGRAMCONFIG = _descriptor.Descriptor(
-  name='ProgramConfig',
-  full_name='paddle.ProgramConfig',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='program_id', full_name='paddle.ProgramConfig.program_id', index=0,
-      number=1, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='push_sparse_table_id', full_name='paddle.ProgramConfig.push_sparse_table_id', index=1,
-      number=2, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='push_dense_table_id', full_name='paddle.ProgramConfig.push_dense_table_id', index=2,
-      number=3, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='pull_sparse_table_id', full_name='paddle.ProgramConfig.pull_sparse_table_id', index=3,
-      number=4, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='pull_dense_table_id', full_name='paddle.ProgramConfig.pull_dense_table_id', index=4,
-      number=5, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=836,
-  serialized_end=989,
-)
-
-
-_DENSETABLEPARAMETER = _descriptor.Descriptor(
-  name='DenseTableParameter',
-  full_name='paddle.DenseTableParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='table_id', full_name='paddle.DenseTableParameter.table_id', index=0,
-      number=1, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='dense_variable_name', full_name='paddle.DenseTableParameter.dense_variable_name', index=1,
-      number=2, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='dense_gradient_variable_name', full_name='paddle.DenseTableParameter.dense_gradient_variable_name', index=2,
-      number=3, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='fea_dim', full_name='paddle.DenseTableParameter.fea_dim', index=3,
-      number=4, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=991,
-  serialized_end=1114,
-)
-
-
-_SPARSETABLEPARAMETER = _descriptor.Descriptor(
-  name='SparseTableParameter',
-  full_name='paddle.SparseTableParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='table_id', full_name='paddle.SparseTableParameter.table_id', index=0,
-      number=1, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='feature_dim', full_name='paddle.SparseTableParameter.feature_dim', index=1,
-      number=2, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='slot_key', full_name='paddle.SparseTableParameter.slot_key', index=2,
-      number=3, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='slot_value', full_name='paddle.SparseTableParameter.slot_value', index=3,
-      number=4, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='slot_gradient', full_name='paddle.SparseTableParameter.slot_gradient', index=4,
-      number=5, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1116,
-  serialized_end=1238,
-)
-
-
-_DOWNPOURSERVERPARAMETER = _descriptor.Descriptor(
-  name='DownpourServerParameter',
-  full_name='paddle.DownpourServerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='downpour_table_param', full_name='paddle.DownpourServerParameter.downpour_table_param', index=0,
-      number=1, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='service_param', full_name='paddle.DownpourServerParameter.service_param', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1241,
-  serialized_end=1375,
-)
-
-
-_SERVERSERVICEPARAMETER = _descriptor.Descriptor(
-  name='ServerServiceParameter',
-  full_name='paddle.ServerServiceParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=True, default_value=_b("DownpourBrpcPsServer").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=True, default_value=_b("DownpourBrpcPsClient").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=True, default_value=_b("DownpourPsService").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='start_server_port', full_name='paddle.ServerServiceParameter.start_server_port', index=3,
-      number=4, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='server_thread_num', full_name='paddle.ServerServiceParameter.server_thread_num', index=4,
-      number=5, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=12,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1378,
-  serialized_end=1593,
-)
-
-
-_TABLEPARAMETER = _descriptor.Descriptor(
-  name='TableParameter',
-  full_name='paddle.TableParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='table_id', full_name='paddle.TableParameter.table_id', index=0,
-      number=1, type=4, cpp_type=4, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='table_class', full_name='paddle.TableParameter.table_class', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='shard_num', full_name='paddle.TableParameter.shard_num', index=2,
-      number=3, type=4, cpp_type=4, label=1,
-      has_default_value=True, default_value=1000,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='accessor', full_name='paddle.TableParameter.accessor', index=3,
-      number=4, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='type', full_name='paddle.TableParameter.type', index=4,
-      number=5, type=14, cpp_type=8, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='compress_in_save', full_name='paddle.TableParameter.compress_in_save', index=5,
-      number=6, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='enable_sparse_table_cache', full_name='paddle.TableParameter.enable_sparse_table_cache', index=6,
-      number=7, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=True,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_table_cache_rate', full_name='paddle.TableParameter.sparse_table_cache_rate', index=7,
-      number=8, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0025),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_table_cache_file_num', full_name='paddle.TableParameter.sparse_table_cache_file_num', index=8,
-      number=9, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=16,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1596,
-  serialized_end=1915,
-)
-
-
-_TABLEACCESSORPARAMETER = _descriptor.Descriptor(
-  name='TableAccessorParameter',
-  full_name='paddle.TableAccessorParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='accessor_class', full_name='paddle.TableAccessorParameter.accessor_class', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_sgd_param', full_name='paddle.TableAccessorParameter.sparse_sgd_param', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='dense_sgd_param', full_name='paddle.TableAccessorParameter.dense_sgd_param', index=2,
-      number=3, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='fea_dim', full_name='paddle.TableAccessorParameter.fea_dim', index=3,
-      number=4, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=11,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='embedx_dim', full_name='paddle.TableAccessorParameter.embedx_dim', index=4,
-      number=5, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=8,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='embedx_threshold', full_name='paddle.TableAccessorParameter.embedx_threshold', index=5,
-      number=6, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=10,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='downpour_accessor_param', full_name='paddle.TableAccessorParameter.downpour_accessor_param', index=6,
-      number=7, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='table_accessor_save_param', full_name='paddle.TableAccessorParameter.table_accessor_save_param', index=7,
-      number=8, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_commonsgd_param', full_name='paddle.TableAccessorParameter.sparse_commonsgd_param', index=8,
-      number=9, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='embed_sgd_param', full_name='paddle.TableAccessorParameter.embed_sgd_param', index=9,
-      number=10, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='embedx_sgd_param', full_name='paddle.TableAccessorParameter.embedx_sgd_param', index=10,
-      number=11, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1918,
-  serialized_end=2495,
-)
-
-
-_DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
-  name='DownpourTableAccessorParameter',
-  full_name='paddle.DownpourTableAccessorParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='nonclk_coeff', full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', index=0,
-      number=1, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(0.1),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='click_coeff', full_name='paddle.DownpourTableAccessorParameter.click_coeff', index=1,
-      number=2, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(1),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='base_threshold', full_name='paddle.DownpourTableAccessorParameter.base_threshold', index=2,
-      number=3, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(1.5),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='delta_threshold', full_name='paddle.DownpourTableAccessorParameter.delta_threshold', index=3,
-      number=4, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(0.25),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='delta_keep_days', full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', index=4,
-      number=5, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(16),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='show_click_decay_rate', full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', index=5,
-      number=6, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(0.98),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='delete_threshold', full_name='paddle.DownpourTableAccessorParameter.delete_threshold', index=6,
-      number=7, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(0.8),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='delete_after_unseen_days', full_name='paddle.DownpourTableAccessorParameter.delete_after_unseen_days', index=7,
-      number=8, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(30),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ssd_unseenday_threshold', full_name='paddle.DownpourTableAccessorParameter.ssd_unseenday_threshold', index=8,
-      number=9, type=5, cpp_type=1, label=1,
-      has_default_value=True, default_value=1,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2498,
-  serialized_end=2812,
-)
-
-
-_TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor(
-  name='TableAccessorSaveParameter',
-  full_name='paddle.TableAccessorSaveParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='param', full_name='paddle.TableAccessorSaveParameter.param', index=0,
-      number=1, type=13, cpp_type=3, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='converter', full_name='paddle.TableAccessorSaveParameter.converter', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='deconverter', full_name='paddle.TableAccessorSaveParameter.deconverter', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2814,
-  serialized_end=2897,
-)
-
-
-_PSREQUESTMESSAGE = _descriptor.Descriptor(
-  name='PsRequestMessage',
-  full_name='paddle.PsRequestMessage',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='cmd_id', full_name='paddle.PsRequestMessage.cmd_id', index=0,
-      number=1, type=13, cpp_type=3, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='table_id', full_name='paddle.PsRequestMessage.table_id', index=1,
-      number=2, type=13, cpp_type=3, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='params', full_name='paddle.PsRequestMessage.params', index=2,
-      number=3, type=12, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='client_id', full_name='paddle.PsRequestMessage.client_id', index=3,
-      number=4, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='data', full_name='paddle.PsRequestMessage.data', index=4,
-      number=5, type=12, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b(""),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2899,
-  serialized_end=3000,
-)
-
-
-_SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
-  name='SparseSGDRuleParameter',
-  full_name='paddle.SparseSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.SparseSGDRuleParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.05),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_g2sum', full_name='paddle.SparseSGDRuleParameter.initial_g2sum', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(3),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_range', full_name='paddle.SparseSGDRuleParameter.initial_range', index=2,
-      number=3, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='weight_bounds', full_name='paddle.SparseSGDRuleParameter.weight_bounds', index=3,
-      number=4, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3003,
-  serialized_end=3136,
-)
-
-
-_SPARSECOMMONSGDRULEPARAMETER = _descriptor.Descriptor(
-  name='SparseCommonSGDRuleParameter',
-  full_name='paddle.SparseCommonSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='name', full_name='paddle.SparseCommonSGDRuleParameter.name', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='naive', full_name='paddle.SparseCommonSGDRuleParameter.naive', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='adagrad', full_name='paddle.SparseCommonSGDRuleParameter.adagrad', index=2,
-      number=3, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='adam', full_name='paddle.SparseCommonSGDRuleParameter.adam', index=3,
-      number=4, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3139,
-  serialized_end=3337,
-)
-
-
-_SPARSENAIVESGDRULEPARAMETER = _descriptor.Descriptor(
-  name='SparseNaiveSGDRuleParameter',
-  full_name='paddle.SparseNaiveSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.SparseNaiveSGDRuleParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.05),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_range', full_name='paddle.SparseNaiveSGDRuleParameter.initial_range', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='weight_bounds', full_name='paddle.SparseNaiveSGDRuleParameter.weight_bounds', index=2,
-      number=3, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3339,
-  serialized_end=3451,
-)
-
-
-_SPARSEADAGRADSGDRULEPARAMETER = _descriptor.Descriptor(
-  name='SparseAdagradSGDRuleParameter',
-  full_name='paddle.SparseAdagradSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.SparseAdagradSGDRuleParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.05),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_g2sum', full_name='paddle.SparseAdagradSGDRuleParameter.initial_g2sum', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(3),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_range', full_name='paddle.SparseAdagradSGDRuleParameter.initial_range', index=2,
-      number=3, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='weight_bounds', full_name='paddle.SparseAdagradSGDRuleParameter.weight_bounds', index=3,
-      number=4, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3454,
-  serialized_end=3594,
-)
-
-
-_SPARSEADAMSGDPARAMETER = _descriptor.Descriptor(
-  name='SparseAdamSGDParameter',
-  full_name='paddle.SparseAdamSGDParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.SparseAdamSGDParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_range', full_name='paddle.SparseAdamSGDParameter.initial_range', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='beta1_decay_rate', full_name='paddle.SparseAdamSGDParameter.beta1_decay_rate', index=2,
-      number=3, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.9),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='beta2_decay_rate', full_name='paddle.SparseAdamSGDParameter.beta2_decay_rate', index=3,
-      number=4, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.999),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ada_epsilon', full_name='paddle.SparseAdamSGDParameter.ada_epsilon', index=4,
-      number=5, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(1e-08),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='weight_bounds', full_name='paddle.SparseAdamSGDParameter.weight_bounds', index=5,
-      number=6, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3597,
-  serialized_end=3797,
-)
-
-
-_DENSESGDRULEPARAMETER = _descriptor.Descriptor(
-  name='DenseSGDRuleParameter',
-  full_name='paddle.DenseSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='name', full_name='paddle.DenseSGDRuleParameter.name', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='adam', full_name='paddle.DenseSGDRuleParameter.adam', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='naive', full_name='paddle.DenseSGDRuleParameter.naive', index=2,
-      number=3, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='summary', full_name='paddle.DenseSGDRuleParameter.summary', index=3,
-      number=4, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='moving_average', full_name='paddle.DenseSGDRuleParameter.moving_average', index=4,
-      number=5, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3800,
-  serialized_end=4025,
-)
-
-
-_ADAMSGDPARAMETER = _descriptor.Descriptor(
-  name='AdamSGDParameter',
-  full_name='paddle.AdamSGDParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.AdamSGDParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(5e-06),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='avg_decay_rate', full_name='paddle.AdamSGDParameter.avg_decay_rate', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.999993),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ada_decay_rate', full_name='paddle.AdamSGDParameter.ada_decay_rate', index=2,
-      number=3, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.9999),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ada_epsilon', full_name='paddle.AdamSGDParameter.ada_epsilon', index=3,
-      number=4, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(1e-08),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='mom_decay_rate', full_name='paddle.AdamSGDParameter.mom_decay_rate', index=4,
-      number=5, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.99),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4028,
-  serialized_end=4200,
-)
-
-
-_NAIVESGDPARAMETER = _descriptor.Descriptor(
-  name='NaiveSGDParameter',
-  full_name='paddle.NaiveSGDParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.NaiveSGDParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0002),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='avg_decay_rate', full_name='paddle.NaiveSGDParameter.avg_decay_rate', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4202,
-  serialized_end=4276,
-)
-
-
-_SUMMARYSGDPARAMETER = _descriptor.Descriptor(
-  name='SummarySGDParameter',
-  full_name='paddle.SummarySGDParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='summary_decay_rate', full_name='paddle.SummarySGDParameter.summary_decay_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.999999),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4278,
-  serialized_end=4337,
-)
-
-
-_MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor(
-  name='MovingAverageRuleParameter',
-  full_name='paddle.MovingAverageRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='momentum', full_name='paddle.MovingAverageRuleParameter.momentum', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4339,
-  serialized_end=4385,
-)
-
-
-_PSRESPONSEMESSAGE = _descriptor.Descriptor(
-  name='PsResponseMessage',
-  full_name='paddle.PsResponseMessage',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='err_code', full_name='paddle.PsResponseMessage.err_code', index=0,
-      number=1, type=5, cpp_type=1, label=2,
-      has_default_value=True, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='err_msg', full_name='paddle.PsResponseMessage.err_msg', index=1,
-      number=2, type=9, cpp_type=9, label=2,
-      has_default_value=True, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='data', full_name='paddle.PsResponseMessage.data', index=2,
-      number=3, type=12, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b(""),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4387,
-  serialized_end=4460,
-)
-
-
-_FSCLIENTPARAMETER = _descriptor.Descriptor(
-  name='FsClientParameter',
-  full_name='paddle.FsClientParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='fs_type', full_name='paddle.FsClientParameter.fs_type', index=0,
-      number=1, type=14, cpp_type=8, label=1,
-      has_default_value=True, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='uri', full_name='paddle.FsClientParameter.uri', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='user', full_name='paddle.FsClientParameter.user', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='passwd', full_name='paddle.FsClientParameter.passwd', index=3,
-      number=4, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='buffer_size', full_name='paddle.FsClientParameter.buffer_size', index=4,
-      number=5, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='hadoop_bin', full_name='paddle.FsClientParameter.hadoop_bin', index=5,
-      number=51, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='afs_conf', full_name='paddle.FsClientParameter.afs_conf', index=6,
-      number=101, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-    _FSCLIENTPARAMETER_FSAPITYPE,
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4463,
-  serialized_end=4676,
-)
-
-_PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER
-_PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER
-_PSPARAMETER.fields_by_name['trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER
-_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER
-_WORKERPARAMETER.fields_by_name['downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER
-_SERVERPARAMETER.fields_by_name['downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER
-_DOWNPOURWORKERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER
-_DOWNPOURTRAINERPARAMETER.fields_by_name['dense_table'].message_type = _DENSETABLEPARAMETER
-_DOWNPOURTRAINERPARAMETER.fields_by_name['sparse_table'].message_type = _SPARSETABLEPARAMETER
-_DOWNPOURTRAINERPARAMETER.fields_by_name['program_config'].message_type = _PROGRAMCONFIG
-_DOWNPOURSERVERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER
-_DOWNPOURSERVERPARAMETER.fields_by_name['service_param'].message_type = _SERVERSERVICEPARAMETER
-_TABLEPARAMETER.fields_by_name['accessor'].message_type = _TABLEACCESSORPARAMETER
-_TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE
-_TABLEACCESSORPARAMETER.fields_by_name['sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['sparse_commonsgd_param'].message_type = _SPARSECOMMONSGDRULEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['embed_sgd_param'].message_type = _SPARSECOMMONSGDRULEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['embedx_sgd_param'].message_type = _SPARSECOMMONSGDRULEPARAMETER
-_SPARSECOMMONSGDRULEPARAMETER.fields_by_name['naive'].message_type = _SPARSENAIVESGDRULEPARAMETER
-_SPARSECOMMONSGDRULEPARAMETER.fields_by_name['adagrad'].message_type = _SPARSEADAGRADSGDRULEPARAMETER
-_SPARSECOMMONSGDRULEPARAMETER.fields_by_name['adam'].message_type = _SPARSEADAMSGDPARAMETER
-_DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER
-_DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER
-_DENSESGDRULEPARAMETER.fields_by_name['summary'].message_type = _SUMMARYSGDPARAMETER
-_DENSESGDRULEPARAMETER.fields_by_name['moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER
-_FSCLIENTPARAMETER.fields_by_name['fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE
-_FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER
-DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER
-DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER
-DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER
-DESCRIPTOR.message_types_by_name['DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER
-DESCRIPTOR.message_types_by_name['DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER
-DESCRIPTOR.message_types_by_name['ProgramConfig'] = _PROGRAMCONFIG
-DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER
-DESCRIPTOR.message_types_by_name['DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER
-DESCRIPTOR.message_types_by_name['ServerServiceParameter'] = _SERVERSERVICEPARAMETER
-DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER
-DESCRIPTOR.message_types_by_name['TableAccessorParameter'] = _TABLEACCESSORPARAMETER
-DESCRIPTOR.message_types_by_name['DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER
-DESCRIPTOR.message_types_by_name['TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER
-DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE
-DESCRIPTOR.message_types_by_name['SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseCommonSGDRuleParameter'] = _SPARSECOMMONSGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseNaiveSGDRuleParameter'] = _SPARSENAIVESGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseAdagradSGDRuleParameter'] = _SPARSEADAGRADSGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseAdamSGDParameter'] = _SPARSEADAMSGDPARAMETER
-DESCRIPTOR.message_types_by_name['DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER
-DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER
-DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER
-DESCRIPTOR.message_types_by_name['MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER
-DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE
-DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER
-DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE
-DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID
-
-PSParameter = _reflection.GeneratedProtocolMessageType('PSParameter', (_message.Message,), dict(
-  DESCRIPTOR = _PSPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.PSParameter)
-  ))
-_sym_db.RegisterMessage(PSParameter)
-
-WorkerParameter = _reflection.GeneratedProtocolMessageType('WorkerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _WORKERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.WorkerParameter)
-  ))
-_sym_db.RegisterMessage(WorkerParameter)
-
-ServerParameter = _reflection.GeneratedProtocolMessageType('ServerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SERVERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.ServerParameter)
-  ))
-_sym_db.RegisterMessage(ServerParameter)
-
-DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType('DownpourWorkerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DOWNPOURWORKERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter)
-  ))
-_sym_db.RegisterMessage(DownpourWorkerParameter)
-
-DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType('DownpourTrainerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DOWNPOURTRAINERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter)
-  ))
-_sym_db.RegisterMessage(DownpourTrainerParameter)
-
-ProgramConfig = _reflection.GeneratedProtocolMessageType('ProgramConfig', (_message.Message,), dict(
-  DESCRIPTOR = _PROGRAMCONFIG,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.ProgramConfig)
-  ))
-_sym_db.RegisterMessage(ProgramConfig)
-
-DenseTableParameter = _reflection.GeneratedProtocolMessageType('DenseTableParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DENSETABLEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter)
-  ))
-_sym_db.RegisterMessage(DenseTableParameter)
-
-SparseTableParameter = _reflection.GeneratedProtocolMessageType('SparseTableParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSETABLEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter)
-  ))
-_sym_db.RegisterMessage(SparseTableParameter)
-
-DownpourServerParameter = _reflection.GeneratedProtocolMessageType('DownpourServerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DOWNPOURSERVERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter)
-  ))
-_sym_db.RegisterMessage(DownpourServerParameter)
-
-ServerServiceParameter = _reflection.GeneratedProtocolMessageType('ServerServiceParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SERVERSERVICEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter)
-  ))
-_sym_db.RegisterMessage(ServerServiceParameter)
-
-TableParameter = _reflection.GeneratedProtocolMessageType('TableParameter', (_message.Message,), dict(
-  DESCRIPTOR = _TABLEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.TableParameter)
-  ))
-_sym_db.RegisterMessage(TableParameter)
-
-TableAccessorParameter = _reflection.GeneratedProtocolMessageType('TableAccessorParameter', (_message.Message,), dict(
-  DESCRIPTOR = _TABLEACCESSORPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter)
-  ))
-_sym_db.RegisterMessage(TableAccessorParameter)
-
-DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType('DownpourTableAccessorParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DOWNPOURTABLEACCESSORPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter)
-  ))
-_sym_db.RegisterMessage(DownpourTableAccessorParameter)
-
-TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType('TableAccessorSaveParameter', (_message.Message,), dict(
-  DESCRIPTOR = _TABLEACCESSORSAVEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter)
-  ))
-_sym_db.RegisterMessage(TableAccessorSaveParameter)
-
-PsRequestMessage = _reflection.GeneratedProtocolMessageType('PsRequestMessage', (_message.Message,), dict(
-  DESCRIPTOR = _PSREQUESTMESSAGE,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage)
-  ))
-_sym_db.RegisterMessage(PsRequestMessage)
-
-SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSESGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(SparseSGDRuleParameter)
-
-SparseCommonSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseCommonSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSECOMMONSGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseCommonSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(SparseCommonSGDRuleParameter)
-
-SparseNaiveSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseNaiveSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSENAIVESGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseNaiveSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(SparseNaiveSGDRuleParameter)
-
-SparseAdagradSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseAdagradSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSEADAGRADSGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseAdagradSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(SparseAdagradSGDRuleParameter)
-
-SparseAdamSGDParameter = _reflection.GeneratedProtocolMessageType('SparseAdamSGDParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSEADAMSGDPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseAdamSGDParameter)
-  ))
-_sym_db.RegisterMessage(SparseAdamSGDParameter)
-
-DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('DenseSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DENSESGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(DenseSGDRuleParameter)
-
-AdamSGDParameter = _reflection.GeneratedProtocolMessageType('AdamSGDParameter', (_message.Message,), dict(
-  DESCRIPTOR = _ADAMSGDPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter)
-  ))
-_sym_db.RegisterMessage(AdamSGDParameter)
-
-NaiveSGDParameter = _reflection.GeneratedProtocolMessageType('NaiveSGDParameter', (_message.Message,), dict(
-  DESCRIPTOR = _NAIVESGDPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter)
-  ))
-_sym_db.RegisterMessage(NaiveSGDParameter)
-
-SummarySGDParameter = _reflection.GeneratedProtocolMessageType('SummarySGDParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SUMMARYSGDPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter)
-  ))
-_sym_db.RegisterMessage(SummarySGDParameter)
-
-MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType('MovingAverageRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _MOVINGAVERAGERULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter)
-  ))
-_sym_db.RegisterMessage(MovingAverageRuleParameter)
-
-PsResponseMessage = _reflection.GeneratedProtocolMessageType('PsResponseMessage', (_message.Message,), dict(
-  DESCRIPTOR = _PSRESPONSEMESSAGE,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage)
-  ))
-_sym_db.RegisterMessage(PsResponseMessage)
-
-FsClientParameter = _reflection.GeneratedProtocolMessageType('FsClientParameter', (_message.Message,), dict(
-  DESCRIPTOR = _FSCLIENTPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.FsClientParameter)
-  ))
-_sym_db.RegisterMessage(FsClientParameter)
-
-
-DESCRIPTOR.has_options = True
-DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\200\001\001\370\001\001'))
-# @@protoc_insertion_point(module_scope)
diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py
deleted file mode 100644
index 3d81db53ecdd64084cf2da3f9bb50886dd1814f0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/__init__.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import base
-from .base import *
-
-from . import layers
-from .layers import *
-
-from . import nn
-from .nn import *
-
-from . import tracer
-from .tracer import *
-
-from . import parallel
-from .parallel import *
-
-from . import checkpoint
-from .checkpoint import *
-
-from . import learning_rate_scheduler
-from .learning_rate_scheduler import *
-
-from . import backward_strategy
-from .backward_strategy import *
-
-__all__ = []
-__all__ += layers.__all__
-__all__ += base.__all__
-__all__ += nn.__all__
-__all__ += tracer.__all__
-__all__ += parallel.__all__
-__all__ += checkpoint.__all__
-__all__ += learning_rate_scheduler.__all__
-__all__ += backward_strategy.__all__
diff --git a/python/paddle/fluid/dygraph/backward_strategy.py b/python/paddle/fluid/dygraph/backward_strategy.py
deleted file mode 100644
index bfcf66af31ce13b3394b5b091882b1976f9f003a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/backward_strategy.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.fluid import core
-
-__all__ = ["BackwardStrategy"]
-
-BackwardStrategy = core.BackwardStrategy
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
deleted file mode 100644
index e218544a130f4de54459ffe89cadddecad19a37a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/base.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
-import contextlib
-import numpy as np
-from paddle.fluid import core
-from paddle.fluid import framework
-from .tracer import Tracer
-import logging
-import objgraph
-
-__all__ = [
-    'no_grad',
-    'guard',
-    'to_variable',
-]
-
-
-# This function should be removed in V1.6, because it can easily lead to cyclic dependencies.
-def enabled():
-    # Internal use only
-    return framework.in_dygraph_mode()
-
-
-@contextlib.contextmanager
-def _switch_tracer_mode_guard_(is_train=True):
-    tracer = framework._dygraph_tracer()
-    if tracer:
-        mode = tracer._train_mode
-        tracer._train_mode = is_train
-        yield
-        tracer._train_mode = mode
-    else:
-        yield
-
-
-def _dygraph_not_support_(func):
-    def __impl__(*args, **kwargs):
-        assert not framework.in_dygraph_mode(
-        ), "We don't support %s in Dygraph mode" % func.__name__
-        return func(*args, **kwargs)
-
-    return __impl__
-
-
-def _no_grad_(func):
-    """
-    This Decorator will avoid the func being decorated creating backward network in dygraph mode
-
-    Args:
-        func: the func don't need grad
-
-    Examples:
-
-     .. code-block:: python
-
-        import numpy as np
-        import paddle.fluid as fluid
-
-        @fluid.dygraph.no_grad
-        def test_layer():
-            with fluid.dygraph.guard():
-                inp = np.ones([3, 32, 32], dtype='float32')
-                t = fluid.dygraph.base.to_variable(inp)
-                fc1 = fluid.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
-                fc2 = fluid.FC('fc2', size=4)
-                ret = fc1(t)
-                dy_ret = fc2(ret)
-
-        test_layer()
-
-    """
-
-    def __impl__(*args, **kwargs):
-        with _switch_tracer_mode_guard_(is_train=False):
-            return func(*args, **kwargs)
-
-    return __impl__
-
-
-no_grad = wrap_decorator(_no_grad_)
-# for fluidDoc
-no_grad.__doc__ = _no_grad_.__doc__
-_not_support = wrap_decorator(_dygraph_not_support_)
-
-
-@signature_safe_contextmanager
-def guard(place=None):
-    """
-    This context will create a dygraph context for dygraph to run
-
-    Args:
-        place(fluid.CPUPlace|fluid.CUDAPlace|None): Place to run
-
-    return:
-        None
-
-    Examples:
-
-     .. code-block:: python
-
-        import numpy as np
-        import paddle.fluid as fluid
-
-        with fluid.dygraph.guard():
-            inp = np.ones([3, 32, 32], dtype='float32')
-            t = fluid.dygraph.base.to_variable(inp)
-            fc1 = fluid.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
-            fc2 = fluid.FC('fc2', size=4)
-            ret = fc1(t)
-            dy_ret = fc2(ret)
-
-    """
-    train = framework.Program()
-    startup = framework.Program()
-    tracer = Tracer()
-
-    if place is None:
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-    with framework.program_guard(train, startup):
-        with framework.unique_name.guard():
-            with framework._dygraph_guard(tracer):
-                with framework._dygraph_place_guard(place):
-                    yield
-
-
-def _print_debug_msg(limit=5, is_test=False):
-    if not core._is_dygraph_debug_enabled():
-        logging.warn(
-            'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
-        )
-        return
-    unique_name_size = len(framework.unique_name.generator.ids)
-    tracer_var_size = len(framework._dygraph_tracer()._vars)
-    alive_cpp_var_size = len(core.VarBase._alive_vars())
-    if not is_test:
-        logging.warn(
-            'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
-            .format(unique_name_size, tracer_var_size, alive_cpp_var_size))
-        objgraph.show_growth(limit=limit)
-    else:
-        return unique_name_size, tracer_var_size, alive_cpp_var_size
-
-
-def to_variable(value, block=None, name=None):
-    """
-    This function will create a variable from ndarray
-
-    Args:
-        value(ndarray): the numpy value need to be convert
-        block(fluid.Block|None): which block this variable will be in
-        name(str|None): Name of Variable
-
-    return:
-        Variable: The variable created from given numpy
-
-    Examples:
-
-     .. code-block:: python
-
-        import numpy as np
-        import paddle.fluid as fluid
-
-        with fluid.dygraph.guard():
-            x = np.ones([2, 2], np.float32)
-            y = fluid.dygraph.to_variable(x)
-
-    """
-    if isinstance(value, np.ndarray):
-        assert framework.in_dygraph_mode(
-        ), "to_variable could only be called in dygraph mode"
-
-        if not block:
-            block = framework.default_main_program().current_block()
-        py_var = framework.Variable(
-            block,
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            name=name,
-            shape=value.shape,
-            dtype=value.dtype,
-            stop_gradient=True)
-        var = py_var._ivar.value()
-        tensor = var.get_tensor()
-        if value.dtype == np.float16:
-            value = value.view(np.uint16)
-        tensor.set(value, framework._current_expected_place())
-        return py_var
-    elif isinstance(value, framework.Variable):
-        return value
-    else:
-        raise TypeError(
-            "to_variable only accepts 'ndarray' and 'Variable' as value's input")
diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
deleted file mode 100644
index 631605bd0b1d856c54c055f73af4718444fb0a81..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import collections
-from ..framework import Variable, default_main_program
-import pickle
-from . import learning_rate_scheduler
-import warnings
-
-__all__ = ['save_persistables', 'load_persistables']
-
-
-def save_persistables(model_dict, dirname='save_dir', optimizers=None):
-    """
-    This function filters out all variables in layer.parameters from the give `layer`, and optimizer's learning rate decay.
-    And then trys to save these variables to the folder `dirname`.
-
-    Use the `dirname` to specify the folder where persistable variables were
-    saved.
-
-    Args:
-        model_dict(dict of Parameters): The parameters will
-                                    be saved. If it is None, nothing
-                                    will be deal.
-        dirname(str): The directory path.
-        optimizers(fluid.Optimizer|list(fluid.Optimizer)|None): The optimizers to be saved
-
-    Returns:
-        None
-
-    Examples:
-
-        .. code-block:: python
-
-          ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
-          sgd = fluid.optimizer.SGD(learning_rate=0.01)
-          x_data = np.arange(12).reshape(4, 3).astype('int64')
-          y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
-          x_data = x_data.reshape((-1, num_steps, 1))
-          y_data = y_data.reshape((-1, 1))
-          init_hidden_data = np.zeros(
-                (num_layers, batch_size, hidden_size), dtype='float32')
-          init_cell_data = np.zeros(
-                (num_layers, batch_size, hidden_size), dtype='float32')
-          x = to_variable(x_data)
-          y = to_variable(y_data)
-          init_hidden = to_variable(init_hidden_data)
-          init_cell = to_variable(init_cell_data)
-          dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                        init_cell)
-          dy_loss.backward()
-          sgd.minimize(dy_loss)
-          ptb_model.clear_gradient()
-          param_path = "./my_paddle_model"
-          fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path, sgd)
-    """
-    if isinstance(model_dict, collections.OrderedDict):
-        _save_var_to_file(model_dict, optimizers, dirname, None)
-
-
-def load_persistables(dirname='save_dir'):
-    """
-    This function trys to load persistable variables and optimizer's learning rate decay from the folder `dirname`.
-    And return the restored values in a dictionary way, respectively.
-
-    Use the `dirname` to specify the folder where persistable variables were
-    saved.
-
-    Args:
-        dirname(str): The directory path. default is save_dir
-
-    Returns:
-        layer_dict: The parameter-dict resumed from file
-        optimizer: The optimizer
-
-    Examples:
-
-         .. code-block:: python
-
-           my_layer = layer(fluid.Layer)
-           param_path = "./my_paddle_model"
-           sgd = SGDOptimizer(learning_rate=1e-3)
-           param_dict, optimizer_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path)
-           param_1 = param_dict['PtbModel_0.w_1']
-           sgd.load(optimizer_dict)
-
-        """
-    return _load_var_from_file(dirname)
-
-
-def _save_var_to_file(stat_dict, optimizers, file_dir, file_name):
-    save_block = default_main_program().global_block()
-    save_var_map = {}
-    for var_key, each_var in stat_dict.items():
-        save_var_map[each_var.name] = each_var
-        if file_name is None:
-            save_block.append_op(
-                type='save',
-                inputs={'X': [each_var]},
-                outputs={},
-                attrs={
-                    'file_path': os.path.join(file_dir,
-                                              os.path.normpath(each_var.name))
-                })
-
-    if optimizers is not None:
-        if isinstance(optimizers, (list, tuple)):
-            optimizers = optimizers
-        else:
-            optimizers = [optimizers]
-        if os.path.exists(
-                os.path.join(file_dir, os.path.normpath("optimizers"))):
-            pass
-        else:
-            os.mkdir(os.path.join(file_dir, os.path.normpath("optimizers")))
-        for optimizer in optimizers:
-            if isinstance(optimizer._learning_rate,
-                          learning_rate_scheduler.LearningRateDecay):
-                try:
-                    f = open(
-                        os.path.join(file_dir, "optimizers",
-                                     os.path.normpath(str(optimizer._name))),
-                        "wb")
-                    pickle.dump(optimizer._learning_rate, f, 2)
-                    f.close()
-                except ():
-                    raise IOError("Can't load %s",
-                                  os.path.join(
-                                      file_dir, "optimizers",
-                                      os.path.normpath(str(optimizer._name))))
-            else:
-                warnings.warn(
-                    "Optimizer not saved, Only optimizer with 'LearningRateDecay' under DyGraph mode need to be saved"
-                )
-    else:
-        pass
-
-    if file_name is not None:
-        save_var_list = []
-        for name in sorted(save_var_map.keys()):
-            save_var_list.append(save_var_map[name])
-
-        save_block.append_op(
-            type='save_combine',
-            inputs={'X': save_var_list},
-            outputs={},
-            attrs={
-                'file_path': os.path.join(file_dir, os.path.normpath(file_name))
-            })
-
-
-def _load_var_from_file(file_dir):
-    if not os.path.exists(file_dir):
-        raise IOError("{} not exist".format(file_dir))
-
-    def walk_filename(file_dir):
-        base_path = os.path.join(file_dir)
-        var_name_list = []
-        if os.path.exists(base_path):
-            for dirpath, dirnames, filenames in os.walk(base_path):
-                if "optimizers" in dirpath:
-                    continue
-                pt = dirpath.replace(base_path, "", 1)
-                if pt.startswith("/") or pt.startswith("\\"):
-                    pt = pt[1:]
-                for fth_name in filenames:
-                    if fth_name[0] != '.':
-                        name_path = os.path.join(pt, fth_name)
-                        if "\\" in name_path:
-                            name_path = name_path.replace("\\", "/")
-                        var_name_list.append(name_path)
-
-        return var_name_list
-
-    load_block = default_main_program().global_block()
-    load_var_map = {}
-    load_optimizer_map = {}
-    file_var_list = walk_filename(file_dir)
-    for var_name in file_var_list:
-        new_var = Variable(block=load_block, name=var_name)
-        load_block.append_op(
-            type='load',
-            inputs={},
-            outputs={'Out': [new_var]},
-            attrs={
-                'file_path': os.path.join(file_dir,
-                                          os.path.normpath(new_var.name))
-            })
-
-        load_var_map[new_var.name] = new_var
-    opt_path = os.path.join(file_dir, "optimizers")
-    for _, _, optimizers in os.walk(opt_path):
-        for optimizer in optimizers:
-            try:
-                f = open(os.path.join(opt_path, optimizer), "rb")
-                load_optimizer_map[optimizer] = pickle.load(f)
-                f.close()
-            except IOError:
-                raise IOError("Can't load %s",
-                              os.path.join(
-                                  file_dir, "optimizers",
-                                  os.path.normpath(str(optimizer._name))))
-    if len(load_optimizer_map) == 0:
-        print(
-            "No optimizer loaded. If you didn't save optimizer, please ignore this. The program can still work with new optimizer. "
-        )
-        pass
-
-    return load_var_map, load_optimizer_map
-
-
-def _clone_var_in_block_(block, var):
-    assert isinstance(var, Variable)
-    return block.create_var(
-        name=var.name,
-        shape=var.shape,
-        dtype=var.dtype,
-        type=var.type,
-        lod_level=0,
-        persistable=True)
diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py
deleted file mode 100644
index 9fd1e392791f2bf7a19942749eae87001ec3ede8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/layer_object_helper.py
+++ /dev/null
@@ -1,223 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import copy
-import six
-from ..framework import Parameter, in_dygraph_mode
-from ..param_attr import ParamAttr
-from .. import core
-from six.moves import zip
-from ..layer_helper_base import LayerHelperBase
-
-
-class LayerObjectHelper(LayerHelperBase):
-    def __init__(self, name):
-        super(LayerObjectHelper, self).__init__(name, layer_type=name)
-
-    def append_op(self,
-                  type=None,
-                  inputs=None,
-                  outputs=None,
-                  attrs=None,
-                  stop_gradient=None):
-        """append an operator for this layer object.
-
-           Args:
-               type: operator type
-               inputs: input variable of the operator
-               dtype: data type of this parameter
-               is_bias: if this is a bias parameter
-               default_initializer: set the default initializer for this parameter
-
-        Returns created parameter Variable.
-        """
-        return self.main_program.current_block().append_op(
-            type=type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=stop_gradient)
-
-    def _multiple_input(self, inputs_in):
-        inputs = inputs_in
-        ret = []
-        if isinstance(inputs, (list, tuple)):
-            for inp in inputs:
-                ret.append(self.to_variable(inp))
-        else:
-            ret.append(self.to_variable(inputs))
-        return ret
-
-    # TODO: make it public when we need it
-    def _input(self, inputs_in):
-        inputs = self._multiple_input(inputs_in)
-        if len(inputs) != 1:
-            raise "{0} layer only takes one input in".format(self.layer_type)
-        return inputs[0]
-
-    def _multiple_param_attr(self, length, param_attr_in=None):
-        param_attr = param_attr_in
-        if isinstance(param_attr, ParamAttr):
-            param_attr = [param_attr]
-
-        if len(param_attr) != 1 and len(param_attr) != length:
-            raise ValueError("parameter number mismatch in {}".format(
-                self.name))
-        elif len(param_attr) == 1 and length != 1:
-            tmp = [None] * length
-            for i in six.moves.range(length):
-                tmp[i] = copy.deepcopy(param_attr[0])
-            param_attr = tmp
-        return param_attr
-
-    def iter_inputs_and_params(self, inputs_in, param_attr_in=None):
-        """Access all inputs and params one by one
-
-           Args:
-               inputs_in: inputs to be iter
-               param_attr_in: param_attr to be iter
-
-        Returns input, param_attr
-        """
-        param_attr_in = ParamAttr._to_attr(param_attr_in)
-        if isinstance(param_attr_in, bool):
-            raise ValueError('Param_attr should not be False in {}'.format(
-                self.name))
-        inputs = inputs_in if (inputs_in is not None) else []
-        inputs = self._multiple_input(inputs)
-        param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
-        for ipt, param_attr in zip(inputs, param_attrs):
-            yield ipt, param_attr
-
-    def input_dtype(self, inputs_in):
-        """Get input data type
-
-           Args:
-               inputs_in: inputs wanted know the data type
-
-        Returns dtype of the input
-        """
-        inputs_in = inputs_in if (inputs_in is not None) else []
-        inputs = self._multiple_input(inputs_in)
-        dtype = None
-        for each in inputs:
-            if dtype is None:
-                dtype = each.dtype
-            elif dtype != each.dtype:
-                raise ValueError("Data Type mismatch: %d to %d in %s" %
-                                 (dtype, each.dtype, self.name))
-        return dtype
-
-    def get_parameter(self, name):
-        """Get parameter specifically
-
-           Args:
-               name: parameter's name
-
-        Returns target parameter
-        """
-        param = self.main_program.global_block().var(name)
-        if not isinstance(param, Parameter):
-            raise ValueError("no Parameter name %s found in %s" %
-                             (name, self.name))
-        return param
-
-    def append_bias_op(self,
-                       input_var,
-                       dim_start=1,
-                       dim_end=None,
-                       bias_attr=None):
-        """Append bias operator and return its output. If the user does not set bias_attr, append_bias_op will return input_var
-
-            Args:
-                input_var: the input variable. The len(input_var.shape) is
-                larger or equal than 2.
-                dim_start:
-                dim_end: the shape of the bias will be
-                bias_attr: the bias_attr of it
-
-        Return the Variable of after append bias op
-        """
-        size = list(input_var.shape[dim_start:dim_end])
-        bias_attr = bias_attr
-        if not bias_attr:
-            return input_var
-
-        b = self.create_parameter(
-            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
-        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
-        self.append_op(
-            type='elementwise_add',
-            inputs={'X': [input_var],
-                    'Y': [b]},
-            outputs={'Out': [tmp]},
-            attrs={'axis': dim_start})
-        return tmp
-
-    # TODO: this should not be called anymore after all activation func move to Layers
-    def append_activation(self,
-                          input_var,
-                          act=None,
-                          use_cudnn=None,
-                          use_mkl_dnn=None):
-        """Append activation
-
-            Args:
-                input_var: the input variable. The len(input_var.shape) is
-                larger or equal than 2.
-                act: activation type
-                use_mkl_dnn: if use mkldnn
-                use_cudnn: if use cudnn
-
-        Return the Variable of after append activation
-        """
-        act = act
-        if act is None:
-            return input_var
-        if isinstance(act, six.string_types):
-            act = {'type': act}
-        else:
-            raise TypeError(
-                str(act) + " should be unicode or str in %s ", self.name)
-
-        if (use_cudnn is not None) and use_cudnn:
-            act['use_cudnn'] = use_cudnn
-        if (use_mkl_dnn is not None) and use_mkl_dnn:
-            act['use_mkldnn'] = use_mkl_dnn
-        act_type = act.pop('type')
-
-        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
-        self.append_op(
-            type=act_type,
-            inputs={"X": [input_var]},
-            outputs={"Out": [tmp]},
-            attrs=act)
-        return tmp
-
-    def is_instance(self, param, cls):
-        """Check if the input parameter is instance of input class
-
-            Args:
-                param: parameter to be check
-                cls: class of the parameter
-
-        Return result of the check (True or False)
-        """
-        param = param
-        if not isinstance(param, cls):
-            raise TypeError(
-                "The input {0} parameter of method {1} must be {2}, in layer {3}",
-                param, self.layer_type, cls.__name__, self.name)
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
deleted file mode 100644
index afb18ed505bd6f31c4aa2a0ad4feafdaa1da28f1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/layers.py
+++ /dev/null
@@ -1,276 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import contextlib
-import sys
-import numpy as np
-import collections
-import six
-from . import parallel_helper
-from .. import unique_name
-from paddle.fluid import core
-from .layer_object_helper import LayerObjectHelper
-from paddle.fluid import framework
-from ..param_attr import ParamAttr
-
-__all__ = ['Layer']
-
-
-class Layer(core.Layer):
-    """Layers composed of operators.
-
-    Args:
-        name_scope: prefix name used by the layer to name parameters.
-            If prefix is "my_model/layer_1", parameter name in MyLayer
-            can be "my_model/layer_1/MyLayer/w_n", where w is the parameter
-            base name and n is an unique suffix auto-generated.
-        dtype: data type for the variables in the layer.
-    """
-
-    def __init__(self, name_scope, dtype=core.VarDesc.VarType.FP32):
-        self._full_name = unique_name.generate(name_scope + "/" +
-                                               self.__class__.__name__)
-        self._built = False
-        self._dtype = dtype
-        self._parameters = collections.OrderedDict()
-        self._sub_layers = collections.OrderedDict()
-        self._loaddict_holder = collections.OrderedDict()
-
-        self._helper = LayerObjectHelper(self._full_name)
-
-    def train(self):
-        framework._dygraph_tracer().train_mode()
-
-    def eval(self):
-        framework._dygraph_tracer().eval_mode()
-
-    def full_name(self):
-        """Full name for this layers.
-
-          Full name is composed by name_scope + "/" + MyLayer.__class__.__name__
-
-        Returns full name of this name.
-        """
-        return self._full_name
-
-    def create_parameter(self,
-                         attr,
-                         shape,
-                         dtype,
-                         is_bias=False,
-                         default_initializer=None):
-        """Create parameters for this layers.
-
-           Args:
-               attr: [ParamAttr] should be the parameter attribute for this parameter
-               shape: shape of the paramter
-               dtype: data type of this parameter
-               is_bias: if this is a bias parameter
-               default_initializer: set the default initializer for this parameter
-
-        Returns created parameter Variable.
-        """
-        if isinstance(attr, ParamAttr) and (attr.name is not None):
-            attr.name = ".".join([self._full_name, attr.name])
-        elif isinstance(attr, six.string_types):
-            attr = ".".join([self._full_name, attr])
-        return self._helper.create_parameter(attr, shape, dtype, is_bias,
-                                             default_initializer)
-
-    # TODO: Add more parameter list when we need them
-    def create_variable(self,
-                        name=None,
-                        persistable=None,
-                        dtype=None,
-                        type=core.VarDesc.VarType.LOD_TENSOR):
-        """Create Variable for this layers.
-
-           Args:
-               name: name of the variable
-               persistable: if set this variable persistable
-               dtype: data type of data in the variable
-               type: type of the variable
-
-        Returns created Variable.
-        """
-        if name is not None:
-            var_name = ".".join([self._full_name, name])
-        else:
-            var_name = unique_name.generate(".".join(
-                [self._full_name, "_generated_var"]))
-
-        return self._helper.main_program.current_block().create_var(
-            name=var_name, persistable=persistable, dtype=dtype, type=type)
-
-    def parameters(self, include_sublayers=True):
-        """Returns a list of Parameters from current and sub-layers.
-
-        Args:
-            include_sublayers: If true, also include the parameters from
-            sublayers.
-
-        Returns a list of Parameters.
-        """
-        ret = [p for p in self._parameters.values()]
-        if include_sublayers:
-            for l in self._sub_layers.values():
-                for p in l.parameters(include_sublayers):
-                    ret.append(p)
-        return ret
-
-    def sublayers(self, include_sublayers=True):
-        """Returns a list of sub layers.
-
-        Args:
-            include_sublayers: If true, also include the layers from sublayers.
-
-        Returns a list of sub layers.
-        """
-        ret = [l for l in self._sub_layers.values()]
-        if include_sublayers:
-            for l in self._sub_layers.values():
-                for sub_l in l.sublayers(include_sublayers):
-                    ret.append(sub_l)
-        return ret
-
-    def clear_gradients(self):
-        for p in self.parameters():
-            if p.trainable:
-                p.clear_gradient()
-
-    def _build_once(self, *args):
-        pass
-
-    def __call__(self, *inputs):
-        if not self._built:
-            self._build_once(*inputs)
-            if parallel_helper._is_data_parallel_mode():
-                parallel_helper._broadcast_parameters(self._parameters.values())
-
-        outputs = self.forward(*inputs)
-        self._built = True
-        return outputs
-
-    def forward(self, *inputs):
-        raise NotImplementedError
-
-    def backward(self, *inputs):
-        raise ValueError("Layer shouldn't implement backward")
-
-    def add_sublayer(self, name, sublayer):
-        """Adds a sub Layer instance.
-
-          Added sublayer can be access like self.name.
-
-        Args:
-            name: name of this sublayer.
-            sublayer: an instance of Layer.
-        Returns:
-            the sublayer passed in.
-        """
-        assert isinstance(sublayer, core.Layer)
-
-        self._sub_layers[name] = sublayer
-        return sublayer
-
-    def add_parameter(self, name, parameter):
-        """Adds a Parameter instance.
-
-          Added parameter can be access like self.name.
-
-        Args:
-            name: name of this sublayer.
-            parameter: an instance of Parameter.
-        Returns:
-            the parameter passed in.
-        """
-        assert isinstance(parameter, framework.Parameter)
-
-        if parameter.name in self._loaddict_holder:
-            var = parameter._ivar.value()
-            tensor = var.get_tensor()
-            tensor.set(self._loaddict_holder[parameter.name].numpy(),
-                       framework._current_expected_place())
-
-        self._parameters[name] = parameter
-        return parameter
-
-    def __getattr__(self, name):
-        if name in self._parameters:
-            return self._parameters[name]
-        elif name in self._sub_layers:
-            return self._sub_layers[name]
-        else:
-            return object.__getattribute__(self, name)
-
-    def __setattr__(self, name, value):
-        if isinstance(value, framework.Parameter):
-            params = self.__dict__.get('_parameters', None)
-            if params is None:
-                raise ValueError(
-                    "super(YourLayer, self).__init__() should be called first")
-            if value.name in self._loaddict_holder:
-                var = value._ivar.value()
-                tensor = var.get_tensor()
-                tensor.set(self._loaddict_holder[value.name].numpy(),
-                           framework._current_expected_place())
-            params[name] = value
-        elif isinstance(value, core.Layer):
-            layers = self.__dict__.get('_sub_layers', None)
-            if layers is None:
-                raise ValueError(
-                    "super(YourLayer, self).__init__() should be called first")
-            layers[name] = value
-        else:
-            object.__setattr__(self, name, value)
-
-    def __delattr__(self, name):
-        if name in self._parameters:
-            del self._parameters[name]
-        elif name in self._sub_layers:
-            del self._sub_layers[name]
-        else:
-            object.__delattr__(self, name)
-
-    def state_dict(self, destination=None, include_sublayers=True):
-        if destination is None:
-            destination = collections.OrderedDict()
-        for name, data in self._parameters.items():
-            if data is not None:
-                destination[data.name] = data
-
-        if include_sublayers:
-            for layer_name, layer_item in self._sub_layers.items():
-                if layer_item is not None:
-                    destination_temp = destination.copy()
-                    destination_temp.update(
-                        layer_item.state_dict(destination_temp,
-                                              include_sublayers))
-                    destination = destination_temp
-        return destination
-
-    def load_dict(self, stat_dict, include_sublayers=True):
-        self._loaddict_holder = stat_dict
-        for name, item in self.__dict__.get('_parameters', None).items():
-            if item.name in stat_dict:
-                var = item._ivar.value()
-                tensor = var.get_tensor()
-                tensor.set(stat_dict[item.name].numpy(),
-                           framework._current_expected_place())
-
-        if include_sublayers:
-            for layer_name, layer_item in self._sub_layers.items():
-                if layer_item is not None:
-                    layer_item.load_dict(stat_dict)
diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
deleted file mode 100644
index 57d602afdb3c0075535fe2cad351aadff6b14bea..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ /dev/null
@@ -1,507 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import math
-
-from .. import unique_name
-
-__all__ = [
-    'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
-    'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
-]
-
-
-class LearningRateDecay(object):
-    """
-    Base class of learning rate decay
-    
-    Define the common interface of an LearningRateDecay.
-    User should not use this class directly,
-    but need to use one of it's implementation.
-    """
-
-    def __init__(self, begin=0, step=1, dtype='float32'):
-        self.step_num = begin
-        self.step_size = step
-        self.dtype = dtype
-
-    def __call__(self):
-        lr = self.step()
-        if isinstance(lr, float):
-            lr = self.create_lr_var(lr)
-        self.step_num += self.step_size
-        return lr
-
-    def create_lr_var(self, lr):
-        """
-        convert lr from float to variable
-
-        Args: 
-            lr: learning rate
-        Returns:
-            learning rate variable
-        """
-        from .. import layers
-        lr = layers.create_global_var(
-            name=unique_name.generate("learning_rate"),
-            shape=[1],
-            value=float(lr),
-            dtype=self.dtype,
-            persistable=False)
-        return lr
-
-    def step(self):
-        raise NotImplementedError()
-
-
-class PiecewiseDecay(LearningRateDecay):
-    """
-    piecewise decay scheduler
-
-    The algorithm can be described as the code below.
-
-    .. code-block:: text
-
-      boundaries = [10000, 20000]
-      values = [1.0, 0.5, 0.1]
-      if step < 10000:
-          learning_rate = 1.0
-      elif 10000 <= step < 20000:
-          learning_rate = 0.5
-      else:
-          learning_rate = 0.1
-    Args:
-        boundaries: A list of steps numbers.
-        values: A list of learning rate values that will be picked during
-            different step boundaries.
-        begin: The begin step to initilize the self.step_num
-        step: The step_size using when calculate the new step_num (Defalult is 1)
-        dtype: The dtype used to create the learning rate variable
-
-    Returns:
-        The decayed learning rate.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          boundaries = [10000, 20000]
-          values = [1.0, 0.5, 0.1]
-          with fluid.dygraph.guard():
-              optimizer = fluid.optimizer.SGD(
-                 learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0) )
-    """
-
-    def __init__(self, boundaries, values, begin, step=1, dtype='float32'):
-        super(PiecewiseDecay, self).__init__(begin, step, dtype)
-        self.boundaries = boundaries
-        self.values = values
-
-        self.vars = []
-        for value in values:
-            self.vars.append(value)
-
-    def step(self):
-        for i in range(len(self.boundaries)):
-            if self.step_num < self.boundaries[i]:
-                return self.vars[i]
-        return self.create_lr_var(self.vars[len(self.values) - 1])
-
-
-class NaturalExpDecay(LearningRateDecay):
-    """
-    Applies natural exponential decay to the initial learning rate.
-    
-    .. code-block:: python
-
-        if not staircase:
-            decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
-        else:
-            decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
-
-    Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        decay_steps: A Python `int32` number.
-        decay_rate: A Python `float` number.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
-        begin: A Python 'int32' number, the begin step (Default is 0)
-        step: A Python 'int32' number, the step size (Default is 1)
-        dtype: A Python 'str', the dtype used to create learning rate variable (Default is 'float32')
-
-    Returns:
-        The decayed learning rate.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          base_lr = 0.1
-          with fluid.dygraph.guard():
-              sgd_optimizer = fluid.optimizer.SGD(
-        	      learning_rate=fluid.dygraph.NaturalExpDecay(
-	    	            learning_rate=base_lr,
-        		    decay_steps=10000,
-		            decay_rate=0.5,
-		            staircase=True))
-
-    """
-
-    def __init__(self,
-                 learning_rate,
-                 decay_steps,
-                 decay_rate,
-                 staircase=False,
-                 begin=0,
-                 step=1,
-                 dtype='float32'):
-        super(NaturalExpDecay, self).__init__(begin, step, dtype)
-        self.learning_rate = learning_rate
-        self.decay_steps = decay_steps
-        self.decay_rate = decay_rate
-        self.staircase = staircase
-
-    def step(self):
-        from .. import layers
-        div_res = self.create_lr_var(self.step_num / self.decay_steps)
-        if self.staircase:
-            div_res = layers.floor(div_res)
-        decayed_lr = self.learning_rate * layers.exp(-1 * self.decay_rate *
-                                                     div_res)
-
-        return decayed_lr
-
-
-class ExponentialDecay(LearningRateDecay):
-    """
-    Applies exponential decay to the learning rate.
-
-    When training a model, it is often recommended to lower the learning rate as the
-    training progresses. By using this function, the learning rate will be decayed by
-    'decay_rate' every 'decay_steps' steps.
-    
-    .. code-block:: python
-
-        if staircase == True:
-            decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
-        else:
-            decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
-
-    Args:
-        learning_rate(Variable|float): The initial learning rate.
-        decay_steps(int): See the decay computation above.
-        decay_rate(float): The decay rate. See the decay computation above.
-        staircase(Boolean): If True, decay the learning rate at discrete intervals.
-                            Default: False
-        begin(int): The begin step (default is 0)
-        step(int): The step size (default is 1)
-        dtype(str): The dtype used to create learning rate (default is 'float32')
-
-    Returns:
-        The decayed learning rate.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          base_lr = 0.1
-          with fluid.dygraph.guard():
-              sgd_optimizer = fluid.optimizer.SGD(
-    	            learning_rate=fluid.dygraph.ExponentialDecay(
-		        learning_rate=base_lr,
-    		        decay_steps=10000,
-		        decay_rate=0.5,
-		        staircase=True))
-
-    """
-
-    def __init__(self,
-                 learning_rate,
-                 decay_steps,
-                 decay_rate,
-                 staircase=False,
-                 begin=0,
-                 step=1,
-                 dtype='float32'):
-        super(ExponentialDecay, self).__init__(begin, step, dtype)
-        self.learning_rate = learning_rate
-        self.decay_steps = decay_steps
-        self.decay_rate = decay_rate
-        self.staircase = staircase
-
-    def step(self):
-        from .. import layers
-        div_res = self.create_lr_var(self.step_num / self.decay_steps)
-        if self.staircase:
-            div_res = layers.floor(div_res)
-
-        decayed_lr = self.learning_rate * (self.decay_rate**div_res)
-
-        return decayed_lr
-
-
-class InverseTimeDecay(LearningRateDecay):
-    """
-    Applies inverse time decay to the initial learning rate.
-
-    When training a model, it is often recommended to lower the learning rate as the
-    training progresses. By using this function, an inverse decay function will be
-    applied to the initial learning rate.
-
-    >>> if staircase == True:
-    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
-    >>> else:
-    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
-
-    Args:
-        learning_rate(Variable|float): The initial learning rate.
-        decay_steps(int): See the decay computation above.
-        decay_rate(float): The decay rate. See the decay computation above.
-        staircase(Boolean): If True, decay the learning rate at discrete intervals.
-                            Default: False
-        begin(int): The begin step (default is 0)
-        step(int): The step size (default is 1)
-        dtype(str): The dtype used to create learning rate (default is 'float32')
-
-    Returns:
-        The decayed learning rate.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          base_lr = 0.1
-          with fluid.dygraph.guard():
-              sgd_optimizer = fluid.optimizer.SGD(
-	          learning_rate=fluid.dygraph.InverseTimeDecay(
-		        learning_rate=base_lr,
-		        decay_steps=10000,
-		        decay_rate=0.5,
-		        staircase=True))
-
-    """
-
-    def __init__(self,
-                 learning_rate,
-                 decay_steps,
-                 decay_rate,
-                 staircase=False,
-                 begin=0,
-                 step=1,
-                 dtype='float32'):
-        super(InverseTimeDecay, self).__init__(begin, step, dtype)
-        self.learning_rate = learning_rate
-        self.decay_steps = decay_steps
-        self.decay_rate = decay_rate
-        self.staircase = staircase
-
-    def step(self):
-        from .. import layers
-        div_res = self.create_lr_var(self.step_num / self.decay_steps)
-        if self.staircase:
-            div_res = layers.floor(div_res)
-
-        decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res)
-
-        return decayed_lr
-
-
-class PolynomialDecay(LearningRateDecay):
-    """
-    Applies polynomial decay to the initial learning rate.
-
-    .. code-block:: text
-
-     if cycle:
-       decay_steps = decay_steps * ceil(global_step / decay_steps)
-     else:
-       global_step = min(global_step, decay_steps)
-       decayed_learning_rate = (learning_rate - end_learning_rate) *
-            (1 - global_step / decay_steps) ^ power + end_learning_rate
-
-    Args:
-        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
-          will be the initial learning rate during training.
-        decay_steps(int32): A Python `int32` number.
-        end_learning_rate(float): A Python `float` number.
-        power(float): A Python `float` number.
-        cycle(bool): If set true, decay the learning rate every decay_steps.
-        begin(int): The begin step (default is 0)
-        step(int): The step size (default is 1)
-        dtype(str): The dtype used to create learning rate (default is 'float32')
-
-    Returns:
-        The decayed learning rate.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          start_lr = 0.01
-          total_step = 5000
-          end_lr = 0
-          with fluid.dygraph.guard():
-              optimizer  = fluid.optimizer.SGD(
-                  learning_rate = fluid.dygraph.PolynomialDecay(
-                  start_lr, total_step, end_lr, power=1.0) )
-
-    """
-
-    def __init__(self,
-                 learning_rate,
-                 decay_steps,
-                 end_learning_rate=0.0001,
-                 power=1.0,
-                 cycle=False,
-                 begin=0,
-                 step=1,
-                 dtype='float32'):
-        super(PolynomialDecay, self).__init__(begin, step, dtype)
-        self.learning_rate = learning_rate
-        self.decay_steps = decay_steps
-        self.end_learning_rate = end_learning_rate
-        self.power = power
-        self.cycle = cycle
-
-    def step(self):
-        from .. import layers
-        tmp_step_num = self.step_num
-        tmp_decay_steps = self.decay_steps
-        if self.cycle:
-            div_res = layers.ceil(
-                self.create_lr_var(tmp_step_num / float(self.decay_steps)))
-
-            if tmp_step_num == 0:
-                div_res = self.create_lr_var(1.0)
-            tmp_decay_steps = self.decay_steps * div_res
-        else:
-            tmp_step_num = self.create_lr_var(tmp_step_num
-                                              if tmp_step_num < self.decay_steps
-                                              else self.decay_steps)
-
-        decayed_lr = (self.learning_rate - self.end_learning_rate) * \
-            ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
-        return decayed_lr
-
-
-class CosineDecay(LearningRateDecay):
-    """
-    Applies cosine decay to the learning rate.
-
-    when training a model, it is often recommended to lower the learning rate as the
-    training progresses. By using this function, the learning rate will be decayed by
-    following cosine decay strategy.
-
-    .. math::
-
-	decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1)
-    
-    Args:
-        learning_rate(Variable|float): The initial learning rate.
-        step_each_epoch(int): the number of steps in an epoch.
-        epochs(int): the number of epochs.
-        begin(int): The begin step (default is 0).
-        step(int): The step size (default is 1).
-        dtype(str): The dtype used to create learning rate (default is 'float32').
-
-    Returns:
-        The decayed learning rate.
-
-    Examples:
-	.. code-block:: python
-
-  	    base_lr = 0.1
-            with fluid.dygraph.guard():
-                optimizer  = fluid.optimizer.SGD(
-        	    learning_rate = fluid.dygraph.CosineDecay(
-	                    base_lr, 10000, 120) )
-    """
-
-    def __init__(self,
-                 learning_rate,
-                 step_each_epoch,
-                 epochs,
-                 begin=0,
-                 step=1,
-                 dtype='float32'):
-        super(CosineDecay, self).__init__(begin, step, dtype)
-        self.learning_rate = learning_rate
-        self.step_each_epoch = step_each_epoch
-        self.epochs = epochs
-
-    def step(self):
-        from .. import layers
-        cur_epoch = layers.floor(
-            self.create_lr_var(self.step_num / self.step_each_epoch))
-        decayed_lr = self.learning_rate * 0.5 * (
-            layers.cos(cur_epoch * math.pi / self.epochs) + 1)
-        return decayed_lr
-
-
-class NoamDecay(LearningRateDecay):
-    """
-    Noam decay method. The numpy implementation of noam decay as follows.
-
-    .. code-block:: python
-      
-      import numpy as np
-      # set hyper parameters
-      d_model = 2
-      current_steps = 20
-      warmup_steps = 200
-      # compute
-      lr_value = np.power(d_model, -0.5) * np.min([
-                              np.power(current_steps, -0.5),
-                              np.power(warmup_steps, -1.5) * current_steps])
-
-    Please reference `attention is all you need
-    <https://arxiv.org/pdf/1706.03762.pdf>`_.
-
-    Args:
-        d_model(Variable): The dimensionality of input and output of model.
-
-        warmup_steps(Variable): A super parameter.
-        begin(int): The begin step (default is 0)
-        step(int): The step size (default is 1)
-        dtype(str): The dtype used to create learning rate (default is 'float32')
-
-    Returns:
-        The decayed learning rate.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          warmup_steps = 100
-          learning_rate = 0.01
-          with fluid.dygraph.guard():
-              optimizer  = fluid.optimizer.SGD(
-                  learning_rate = fluid.dygraph.NoamDecay(
-                         1/(warmup_steps *(learning_rate ** 2)),
-                         warmup_steps) )
-    """
-
-    def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
-        super(NoamDecay, self).__init__(begin, step, dtype)
-        self.d_model = d_model
-        self.warmup_steps = warmup_steps
-
-    def step(self):
-        from .. import layers
-        a = self.create_lr_var(self.step_num**-0.5)
-        b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
-        lr_value = (self.d_model**-0.5) * layers.elementwise_min(a, b)
-        return lr_value
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
deleted file mode 100644
index 27aeda45d80657de2a0d43c55bb23083c264f1fc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/nn.py
+++ /dev/null
@@ -1,2672 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from six.moves import reduce
-
-from .. import core
-from ..layers import utils
-from . import layers
-from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter
-from ..param_attr import ParamAttr
-from ..initializer import Normal, Constant, NumpyArrayInitializer
-import numpy as np
-import logging
-
-__all__ = [
-    'Conv2D', 'Conv3D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit',
-    'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose',
-    'Conv3DTranspose', 'GroupNorm', 'SpectralNorm', 'TreeConv'
-]
-
-
-class Conv2D(layers.Layer):
-    """
-    The convolution2D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input and
-    Output are in NCHW format, where N is batch size, C is the number of
-    channels, H is the height of the feature, and W is the width of the feature.
-    Filter is in MCHW format, where M is the number of output image channels,
-    C is the number of input image channels, H is the height of the filter,
-    and W is the width of the filter. If the groups is greater than 1,
-    C will equal the number of input image channels divided by the groups.
-    Please refer to UFLDL's `convolution
-    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`
-    for more detials.
-    If bias attribution and activation type are provided, bias is added to the
-    output of the convolution, and the corresponding activation function is
-    applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    Where:
-
-    * :math:`X`: Input value, a tensor with NCHW format.
-    * :math:`W`: Filter value, a tensor with MCHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
-
-    Args:
-        name_scope(str) : The name for this class.
-        num_filters(int): The number of filter. It is as same as the output
-            image channel.
-        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        padding (int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding. Default: padding = 0.
-        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups (int): The groups number of the Conv2d Layer. According to grouped
-            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-        .. code-block:: python
-
-          from paddle.fluid.dygraph.base import to_variable
-          import paddle.fluid as fluid
-          from paddle.fluid.dygraph import Conv2D
-          import numpy as np
-
-          data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
-          with fluid.dygraph.guard():
-              conv2d = Conv2D( "conv2d", 2, 3)
-              data = to_variable( data )
-              conv = conv2d( data )
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 dtype='float32'):
-        assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(name_scope, dtype)
-        self._groups = groups
-        self._stride = utils.convert_to_list(stride, 2, 'stride')
-        self._padding = utils.convert_to_list(padding, 2, 'padding')
-        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
-        self._act = act
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        self._use_cudnn = use_cudnn
-        self._filter_size = filter_size
-        self._num_filters = num_filters
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._dtype = dtype
-        # if (self._num_channels == self._groups and
-        #         num_filters % self._num_channels == 0 and not self._use_cudnn):
-        #     self._l_type = 'depthwise_conv2d'
-        # else:
-        # TODO(jiabin): recover the usage of depthwise_conv2d when it's
-        #  kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
-        self._l_type = 'conv2d'
-
-    def _build_once(self, input):
-        self._num_channels = input.shape[1]
-        if self._groups is None:
-            num_filter_channels = self._num_channels
-        else:
-            if self._num_channels % self._groups != 0:
-                raise ValueError("num_channels must be divisible by groups.")
-            num_filter_channels = self._num_channels // self._groups
-        filter_size = utils.convert_to_list(self._filter_size, 2, 'filter_size')
-        filter_shape = [self._num_filters, int(num_filter_channels)
-                        ] + filter_size
-
-        def _get_default_param_initializer():
-            filter_elem_num = filter_size[0] * filter_size[
-                1] * self._num_channels
-            std = (2.0 / filter_elem_num)**0.5
-            return Normal(0.0, std, 0)
-
-        self._filter_param = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer())
-
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-
-        self._helper.append_op(
-            type=self._l_type,
-            inputs={
-                'Input': input,
-                'Filter': self._filter_param,
-            },
-            outputs={"Output": pre_bias},
-            attrs={
-                'strides': self._stride,
-                'paddings': self._padding,
-                'dilations': self._dilation,
-                'groups': self._groups if self._groups else 1,
-                'use_cudnn': self._use_cudnn,
-                'use_mkldnn': False,
-            })
-
-        if self._bias_param is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self._bias_param]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
-        else:
-            pre_act = pre_bias
-
-        # Currently, we don't support inplace in dygraph mode
-        return self._helper.append_activation(pre_act, act=self._act)
-
-
-class Conv3D(layers.Layer):
-    """
-    **Convlution3D Layer**
-
-    The convolution3D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input(Input) and
-    Output(Output) are in NCDHW format. Where N is batch size C is the number of
-    channels, D is the depth of the feature, H is the height of the feature,
-    and W is the width of the feature. Convlution3D is similar with Convlution2D
-    but adds one dimension(depth). If bias attribution and activation type are
-    provided, bias is added to the output of the convolution, and the
-    corresponding activation function is applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    In the above equation:
-
-    * :math:`X`: Input value, a tensor with NCDHW format.
-    * :math:`W`: Filter value, a tensor with MCDHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
-
-        - Output:
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
-
-    Args:
-        name_scope(str) : The name for this class.
-        num_filters(int): The number of filter. It is as same as the output image channel.
-        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
-            stride_D = stride_H = stride_W = stride. Default: stride = 1.
-        padding (int|tuple): The padding size. If padding is a tuple, it must
-            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
-            padding_D = padding_H = padding_W = padding. Default: padding = 0.
-        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
-            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups (int): The groups number of the Conv3d Layer. According to grouped
-            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
-            will create ParamAttr as param_attr. If it is set to None, the parameter
-            is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
-            :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv3d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-
-    Returns:
-        Variable: The tensor variable storing the convolution and \
-                  non-linearity activation result.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          with fluid.dygraph.guard():
-              data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32')
-              conv3d = fluid.dygraph.nn.Conv3D(
-                    'Conv3D', num_filters=2, filter_size=3, act="relu")
-              ret = conv3d(fluid.dygraph.base.to_variable(data))
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None):
-        assert param_attr is not False, "param_attr should not be False here."
-        super(Conv3D, self).__init__(name_scope)
-        self._groups = groups
-        self._stride = utils.convert_to_list(stride, 3, 'stride')
-        self._padding = utils.convert_to_list(padding, 3, 'padding')
-        self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
-        self._act = act
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        self._use_cudnn = use_cudnn
-        self._filter_size = filter_size
-        self._num_filters = num_filters
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-
-    def _build_once(self, input):
-        num_channels = input.shape[1]
-        self._dtype = self._helper.input_dtype(input)
-
-        if self._groups is None:
-            num_filter_channels = num_channels
-        else:
-            if num_channels % self._groups != 0:
-                raise ValueError("num_channels must be divisible by groups.")
-            num_filter_channels = num_channels // self._groups
-
-        filter_size = utils.convert_to_list(self._filter_size, 3, 'filter_size')
-
-        filter_shape = [self._num_filters, num_filter_channels] + filter_size
-
-        def _get_default_param_initializer():
-            filter_elem_num = filter_size[0] * filter_size[1] * filter_size[
-                2] * num_channels
-            std = (2.0 / filter_elem_num)**0.5
-            return Normal(0.0, std, 0)
-
-        self._filter_param = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer())
-
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-
-        self._helper.append_op(
-            type='conv3d',
-            inputs={
-                'Input': input,
-                'Filter': self._filter_param,
-            },
-            outputs={"Output": pre_bias},
-            attrs={
-                'strides': self._stride,
-                'paddings': self._padding,
-                'dilations': self._dilation,
-                'groups': self._groups if self._groups else 1,
-                'use_cudnn': self._use_cudnn,
-                'use_mkldnn': False
-            })
-
-        pre_act = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-
-        self._helper.append_op(
-            type='elementwise_add',
-            inputs={'X': [pre_bias],
-                    'Y': [self._bias_param]},
-            outputs={'Out': [pre_act]},
-            attrs={'axis': 1})
-
-        return self._helper.append_activation(pre_act, act=self._act)
-
-
-class Conv3DTranspose(layers.Layer):
-    """
-    **Convlution3D transpose layer**
-
-    The convolution3D transpose layer calculates the output based on the input,
-    filter, and dilations, strides, paddings. Input(Input) and output(Output)
-    are in NCDHW format. Where N is batch size, C is the number of channels,
-    D is the depth of the feature, H is the height of the feature, and W
-    is the width of the feature. Parameters(dilations, strides, paddings) are
-    two elements. These two elements represent height and width, respectively.
-    The details of convolution transpose layer, please refer to the following
-    explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
-    If bias attribution and activation type are provided, bias is added to
-    the output of the convolution, and the corresponding activation function
-    is applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    In the above equation:
-
-    * :math:`X`: Input value, a tensor with NCDHW format.
-    * :math:`W`: Filter value, a tensor with MCDHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-           D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
-           H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
-           W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
-
-    Args:
-        name_scope(str) : The name for this class.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        output_size(int|tuple|None): The output image size. If output size is a
-            tuple, it must contain three integers, (image_D, image_H, image_W). This
-            parameter only works when filter_size is None.
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square. None if use output size to
-            calculate filter_size.
-        padding(int|tuple): The padding size. If padding is a tuple, it must
-            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
-            padding_D = padding_H = padding_W = padding. Default: padding = 0.
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
-            stride_D = stride_H = stride_W = stride. Default: stride = 1.
-        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
-            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups(int): The groups number of the Conv3d transpose layer. Inspired by
-            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-            when group=2, the first half of the filters is only connected to the
-            first half of the input channels, while the second half of the
-            filters is only connected to the second half of the input channels.
-            Default: groups=1
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d_transpose.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv3d_transpose
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-
-    Returns:
-        Variable: The tensor variable storing the convolution transpose result.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-       .. code-block:: python
-
-         import paddle.fluid as fluid
-         import numpy
-
-         with fluid.dygraph.guard():
-             data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32')
-
-             conv3dTranspose = fluid.dygraph.nn.Conv3DTranspose(
-                    'Conv3DTranspose',
-                    num_filters=12,
-                    filter_size=12,
-                    use_cudnn=False)
-             ret = conv3dTranspose(fluid.dygraph.base.to_variable(data))
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 output_size=None,
-                 filter_size=None,
-                 padding=0,
-                 stride=1,
-                 dilation=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 name=None):
-        super(Conv3DTranspose, self).__init__(name_scope)
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        assert param_attr is not False, "param_attr should not be False in conv3d_transpose."
-        self._padding = utils.convert_to_list(padding, 3, 'padding')
-        self._stride = utils.convert_to_list(stride, 3, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
-        self._param_attr = param_attr
-        self._filter_size = filter_size
-        self._output_size = output_size
-        self._groups = 1 if groups is None else groups
-        self._num_filters = num_filters
-        self._use_cudnn = use_cudnn
-        self._bias_attr = bias_attr
-        self._act = act
-
-    def _build_once(self, input):
-        self._dtype = self._helper.input_dtype(input)
-        self._input_channel = input.shape[1]
-
-        if self._filter_size is None:
-            if self._output_size is None:
-                raise ValueError(
-                    "output_size must be set when filter_size is None")
-            if isinstance(self._output_size, int):
-                self._output_size = [self._output_size, self._output_size]
-
-            d_in = input.shape[2]
-            h_in = input.shape[3]
-            w_in = input.shape[4]
-
-            filter_size_d = (self._output_size[0] -
-                             (d_in - 1) * self._stride[0] + 2 * self._padding[0]
-                             - 1) // self._dilation[0] + 1
-            filter_size_h = (self._output_size[1] -
-                             (h_in - 1) * self._stride[1] + 2 * self._padding[1]
-                             - 1) // self._dilation[1] + 1
-            filter_size_w = (self._output_size[2] -
-                             (w_in - 1) * self._stride[2] + 2 * self._padding[2]
-                             - 1) // self._dilation[2] + 1
-            self._filter_size = [filter_size_d, filter_size_h, filter_size_w]
-        else:
-            self._filter_size = utils.convert_to_list(
-                self._filter_size, 3, 'conv3d_transpose.filter_size')
-
-        filter_shape = [
-            self._input_channel, self._num_filters // self._groups
-        ] + self._filter_size
-        self._img_filter = self.create_parameter(
-            dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
-        if self._bias_attr:
-            self._bias_param = self.create_parameter(
-                attr=self._bias_attr,
-                shape=[self._num_filters],
-                dtype=self._dtype,
-                is_bias=True)
-
-    def forward(self, input):
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        self._helper.append_op(
-            type="conv3d_transpose",
-            inputs={'Input': [input],
-                    'Filter': [self._img_filter]},
-            outputs={'Output': pre_bias},
-            attrs={
-                'strides': self._stride,
-                'paddings': self._padding,
-                'dilations': self._dilation,
-                'groups': self._groups if self._groups else 1,
-                'use_cudnn': self._use_cudnn
-            })
-
-        if self._bias_attr:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self._bias_param]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
-        else:
-            pre_act = pre_bias
-
-        # Currently, we don't support inplace in imperative mode
-        return self._helper.append_activation(pre_act, act=self._act)
-
-
-class Pool2D(layers.Layer):
-    """
-    The pooling2d operation calculates the output based on the input, pooling_type and ksize, strides,
-    paddings parameters.Input(X) and output(Out) are in NCHW format, where N is batch size, C is the number of channels,
-    H is the height of the feature, and W is the width of the feature.
-    Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively.
-    The input(X) size and output(Out) size may be different.
-
-    Args:
-        name_scope(str) : The name of this class.
-        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two integers, (pool_size_Height, pool_size_Width).
-            Otherwise, the pool kernel size will be a square of an int. Default: -1
-        pool_type(str) : The pooling type, can be "max" for max-pooling and "avg" for average-pooling. Default: max
-        pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise,
-            the pool stride size will be a square of an int. Default: 1
-        pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
-            it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
-            Otherwise, the pool padding size will be a square of an int. Default: 0
-        global_pooling (bool): Whether to use the global pooling. If global_pooling = true,
-            kernel size and paddings will be ignored. Default: False
-        use_cudnn (bool): Only used in cudnn kernel, need install cudnn. Default: True
-        ceil_mode (bool): Whether to use the ceil function to calculate output height and width.
-            False is the default. If it is set to False, the floor function will be used. Default: False
-        exclusive (bool): Whether to exclude padding points in average pooling mode. Default: True
-
-    Returns:
-        Variable: The pooling result.
-
-    Raises:
-        ValueError: If 'pool_type' is not "max" nor "avg"
-        ValueError: If 'global_pooling' is False and 'pool_size' is -1
-        ValueError: If 'use_cudnn' is not a bool value.
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          with fluid.dygraph.guard():
-             data = numpy.random.random((3, 32, 32)).astype('float32')
-
-             pool2d = fluid.dygraph.Pool2D("pool2d",pool_size=2,
-                            pool_type='max',
-                            pool_stride=1,
-                            global_pooling=False)
-             pool2d_res = pool2d(data)
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 pool_size=-1,
-                 pool_type="max",
-                 pool_stride=1,
-                 pool_padding=0,
-                 global_pooling=False,
-                 use_cudnn=True,
-                 ceil_mode=False,
-                 exclusive=True,
-                 dtype=core.VarDesc.VarType.FP32):
-        if pool_type not in ["max", "avg"]:
-            raise ValueError(
-                "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
-                str(pool_type))
-
-        if global_pooling is False and pool_size == -1:
-            raise ValueError(
-                "When the global_pooling is False, pool_size must be passed "
-                "and be a valid value. Received pool_size: " + str(pool_size))
-
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-
-        super(Pool2D, self).__init__(name_scope, dtype=dtype)
-
-        self._pool_type = pool_type
-        self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
-        self._pool_padding = utils.convert_to_list(pool_padding, 2,
-                                                   'pool_padding')
-        self._pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
-        self._global_pooling = global_pooling
-        self._use_cudnn = use_cudnn
-        self._ceil_mode = ceil_mode
-        self._exclusive = exclusive
-        self._l_type = 'pool2d'
-
-    def forward(self, input):
-        pool_out = self._helper.create_variable_for_type_inference(self._dtype)
-
-        self._helper.append_op(
-            type=self._l_type,
-            inputs={"X": input},
-            outputs={"Out": pool_out},
-            attrs={
-                "pooling_type": self._pool_type,
-                "ksize": self._pool_size,
-                "global_pooling": self._global_pooling,
-                "strides": self._pool_stride,
-                "paddings": self._pool_padding,
-                "use_cudnn": self._use_cudnn,
-                "ceil_mode": self._ceil_mode,
-                "use_mkldnn": False,
-                "exclusive": self._exclusive,
-            })
-        return pool_out
-
-
-class FC(layers.Layer):
-    """
-    **Fully Connected Layer**
-
-    This function creates a fully connected layer in the network. It can take
-    one or multiple tensors as its inputs(input can be a list of Variable, see
-    Args in detail). It creates a variable called weights for each input tensor,
-    which represents a fully connected weight matrix from each input unit to
-    each output unit. The fully connected layer multiplies each input tensor
-    with its corresponding weight to produce an output Tensor with shape [M, `size`],
-    where M is batch size. If multiple input tensors are given, the results of
-    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
-    is not None, a bias variable will be created and added to the output.
-    Finally, if activation is not None, it will be applied to the output as well.
-
-    When the input is single tensor:
-
-    .. math::
-
-        Out = Act({XW + b})
-
-    When the input are multiple tensors:
-
-    .. math::
-
-        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
-
-    In the above equation:
-
-    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
-    * :math:`X_i`: The i-th input tensor.
-    * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor.
-    * :math:`b`: The bias parameter created by this layer (if needed).
-    * :math:`Act`: The activation function.
-    * :math:`Out`: The output tensor.
-
-    See below for an example.
-
-    .. code-block:: text
-
-        Given:
-            data_1.data = [[[0.1, 0.2],
-                           [0.3, 0.4]]]
-            data_1.shape = (1, 2, 2) # 1 is batch_size
-
-            data_2 = [[[0.1, 0.2, 0.3]]]
-            data_2.shape = (1, 1, 3)
-
-            out = fluid.layers.fc(input=[data_1, data_2], size=2)
-
-        Then:
-            out.data = [[0.18669507, 0.1893476]]
-            out.shape = (1, 2)
-
-    Args:
-        name_scope(str): The name of this class.
-        size(int): The number of output units in this layer.
-        num_flatten_dims (int): The fc layer can accept an input tensor with more than
-            two dimensions. If this happens, the multidimensional tensor will first be flattened
-            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
-            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
-            dimensions will be flatten to form the first dimension of the final matrix (height of
-            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
-            form the second dimension of the final matrix (width of the matrix). For example, suppose
-            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
-            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
-        param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
-            parameters/weights of this layer.
-        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-        act (str|None): Activation to be applied to the output of this layer.
-        is_test(bool): A flag indicating whether execution is in test phase. Default: False
-        dtype(str): Dtype used for weight
-
-    Raises:
-        ValueError: If rank of the input tensor is less than 2.
-
-    Examples:
-        .. code-block:: python
-
-          from paddle.fluid.dygraph.base import to_variable
-          import paddle.fluid as fluid
-          from paddle.fluid.dygraph import FC
-          import numpy as np
-
-          data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
-          with fluid.dygraph.guard():
-              fc = FC( "fc", 64, num_flatten_dims=2)
-              data = to_variable( data )
-              conv = fc( data )
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 size,
-                 num_flatten_dims=1,
-                 param_attr=None,
-                 bias_attr=None,
-                 act=None,
-                 is_test=False,
-                 dtype="float32"):
-        super(FC, self).__init__(name_scope, dtype)
-
-        self._size = size
-        self._num_flatten_dims = num_flatten_dims
-        self._dtype = dtype
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-        self.__w = list()
-
-    @property
-    def _w(self, i=0):
-        return self.__w[i]
-
-    @_w.setter
-    def _w(self, value, i=0):
-        assert isinstance(value, Parameter)
-        self.__w[i] = value
-
-    def _build_once(self, input):
-        i = 0
-        for inp, param in self._helper.iter_inputs_and_params(input,
-                                                              self._param_attr):
-            input_shape = inp.shape
-
-            param_shape = [
-                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
-                       1)
-            ] + [self._size]
-            self.__w.append(
-                self.add_parameter(
-                    '_w%d' % i,
-                    self.create_parameter(
-                        attr=param,
-                        shape=param_shape,
-                        dtype=self._dtype,
-                        is_bias=False)))
-            i += 1
-
-        size = list([self._size])
-        self._b = self.create_parameter(
-            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
-
-    def forward(self, input):
-        mul_results = list()
-        i = 0
-        for inp, param in self._helper.iter_inputs_and_params(input,
-                                                              self._param_attr):
-            tmp = self._helper.create_variable_for_type_inference(self._dtype)
-            self._helper.append_op(
-                type="mul",
-                inputs={"X": inp,
-                        "Y": self.__w[i]},
-                outputs={"Out": tmp},
-                attrs={
-                    "x_num_col_dims": self._num_flatten_dims,
-                    "y_num_col_dims": 1
-                })
-            i += 1
-            mul_results.append(tmp)
-
-        if len(mul_results) == 1:
-            pre_bias = mul_results[0]
-        else:
-            pre_bias = self._helper.create_variable_for_type_inference(
-                self._dtype)
-            self._helper.append_op(
-                type="sum",
-                inputs={"X": mul_results},
-                outputs={"Out": pre_bias},
-                attrs={"use_mkldnn": False})
-
-        if self._b:
-            pre_activation = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self._b]},
-                outputs={'Out': [pre_activation]},
-                attrs={'axis': self._num_flatten_dims})
-        else:
-            pre_activation = pre_bias
-        # Currently, we don't support inplace in dygraph mode
-        return self._helper.append_activation(pre_activation, act=self._act)
-
-
-class BatchNorm(layers.Layer):
-    """
-    **Batch Normalization Layer**
-
-    Can be used as a normalizer function for conv2d and fully_connected operations.
-    The required data format for this layer is one of the following:
-
-    1. NHWC `[batch, in_height, in_width, in_channels]`
-
-    2. NCHW `[batch, in_channels, in_height, in_width]`
-
-    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
-    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
-    for more details.
-
-    :math:`input` is the input features over a mini-batch.
-
-    ..  math::
-
-        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-        \ mini-batch\ mean \\\\
-        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-
-
-    When use_global_stats = True, the :math:`\\mu_{\\beta}`
-    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
-    They are global (or running) statistics. (It usually got from the
-    pre-trained model.)
-    The training and testing (or inference) have the same behavior:
-
-    ..  math::
-
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\epsilon}}  \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta
-
-    Args:
-        name_scope(str): The name of this class.
-        act(str|None): Activation type, linear|relu|prelu|...
-        is_test (bool): A flag indicating whether it is in
-            test phrase or not. Default: False
-        momentum(float): The value used for the moving_mean and
-            moving_var computation. The updated formula is:
-            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
-            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
-            Default is 0.9.
-        epsilon(float): A value added to the denominator for
-            numerical stability. Default is 1e-5.
-        param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
-             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
-             will create ParamAttr as param_attr. If the Initializer of the param_attr
-             is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm.
-             If it is set to None or one attribute of ParamAttr, batch_norm
-             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-             is not set, the bias is initialized zero. Default: None.
-        data_layout(string): NCHW|NHWC. Default: NCHW
-        in_place(bool): Make the input and output of batch norm reuse memory. Default: False
-        moving_mean_name(string|None): The name of moving_mean which store the global Mean. Default: None
-        moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
-        do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
-        fuse_with_relu (bool): if True, this OP performs relu after batch norm. Default: False
-        use_global_stats(bool): Whether to use global mean and
-            variance. In inference or test mode, set use_global_stats to true
-            or is_test to true, and the behavior is equivalent.
-            In train mode, when setting use_global_stats True, the global mean
-            and variance are also used during train period. Default: False
-        trainable_statistics(bool): Whether to calculate mean and var in eval mode. In eval mode, when
-            setting trainable_statistics True, mean and variance will be calculated by current batch statistics.Default: False
-
-    Returns:
-        Variable: A tensor variable which is the result after applying batch normalization on the input.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          with fluid.dygraph.guard():
-              fc = fluid.FC('fc', size=200, param_attr='fc1.w')
-              hidden1 = fc(x)
-              batch_norm = fluid.BatchNorm("batch_norm", 10)
-              hidden2 = batch_norm(hidden1)
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_channels,
-                 act=None,
-                 is_test=False,
-                 momentum=0.9,
-                 epsilon=1e-05,
-                 param_attr=None,
-                 bias_attr=None,
-                 dtype='float32',
-                 data_layout='NCHW',
-                 in_place=False,
-                 moving_mean_name=None,
-                 moving_variance_name=None,
-                 do_model_average_for_mean_and_var=False,
-                 fuse_with_relu=False,
-                 use_global_stats=False,
-                 trainable_statistics=False):
-        super(BatchNorm, self).__init__(name_scope, dtype)
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-
-        assert bias_attr is not False, "bias_attr should not be False in batch_norm."
-
-        if dtype == "float16":
-            self._dtype = "float32"
-        else:
-            self._dtype = dtype
-
-        param_shape = [num_channels]
-
-        # create parameter
-        self._scale = self.create_parameter(
-            attr=self._param_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            default_initializer=Constant(1.0))
-        if use_global_stats and self._param_attr.learning_rate == 0.:
-            self._scale.stop_gradient = True
-
-        self._bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=True)
-        if use_global_stats and self._param_attr.learning_rate == 0.:
-            self._bias.stop_gradient = True
-
-        self._mean = self.create_parameter(
-            attr=ParamAttr(
-                name=moving_mean_name,
-                initializer=Constant(0.0),
-                trainable=False,
-                do_model_average=do_model_average_for_mean_and_var),
-            shape=param_shape,
-            dtype=self._dtype)
-        self._mean.stop_gradient = True
-
-        self._variance = self.create_parameter(
-            attr=ParamAttr(
-                name=moving_variance_name,
-                initializer=Constant(1.0),
-                trainable=False,
-                do_model_average=do_model_average_for_mean_and_var),
-            shape=param_shape,
-            dtype=self._dtype)
-        self._variance.stop_gradient = True
-
-        self._in_place = in_place
-        self._data_layout = data_layout
-        self._momentum = momentum
-        self._epsilon = epsilon
-        self._is_test = is_test
-        self._fuse_with_relu = fuse_with_relu
-        self._use_global_stats = use_global_stats
-        self._trainable_statistics = trainable_statistics
-
-    def _build_once(self, input):
-        pass
-
-    def forward(self, input):
-        # create output
-        # mean and mean_out share the same memory
-        mean_out = self._mean
-        # variance and variance out share the same memory
-        variance_out = self._variance
-
-        saved_mean = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        saved_variance = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference(
-            self._dtype)
-
-        self._helper.append_op(
-            type="batch_norm",
-            inputs={
-                "X": input,
-                "Scale": self._scale,
-                "Bias": self._bias,
-                "Mean": self._mean,
-                "Variance": self._variance
-            },
-            outputs={
-                "Y": batch_norm_out,
-                "MeanOut": mean_out,
-                "VarianceOut": variance_out,
-                "SavedMean": saved_mean,
-                "SavedVariance": saved_variance
-            },
-            attrs={
-                "momentum": self._momentum,
-                "epsilon": self._epsilon,
-                "is_test": self._is_test,
-                "data_layout": self._data_layout,
-                "use_mkldnn": False,
-                "fuse_with_relu": self._fuse_with_relu,
-                "use_global_stats": self._use_global_stats,
-                "trainable_statistics": self._trainable_statistics
-            })
-
-        # Currently, we don't support inplace in dygraph mode
-        return self._helper.append_activation(batch_norm_out, self._act)
-
-
-class Embedding(layers.Layer):
-    """
-    **Embedding Layer**
-
-    This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
-    a lookup table. The result of this lookup is the embedding of each ID in the
-    :attr:`input`.
-    All the input variables are passed in as local variables to the LayerHelper constructor
-
-    Args:
-        name_scope(str): The name of this class.
-        size(tuple|list): The shape of the look up table parameter. It should have two elements which indicate the size
-            of the dictionary of embeddings and the size of each embedding vector respectively.
-        is_sparse(bool): The flag indicating whether to use sparse update. Default: False
-        is_distributed(bool): Whether to run lookup table from remote parameter server. Default: False.
-        padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
-            Otherwise the given :attr:`padding_idx` indicates padding the output with zeros whenever lookup encounters
-            it in :attr:`input`. If :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is :math:`size[0] + dim`. Default: None.
-        param_attr(ParamAttr): Parameters for this layer. Default: None.
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc. Default: 'float32'.
-
-    Returns:
-        Variable: The tensor variable storing the embeddings of the \
-                  supplied inputs.
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import paddle.fluid.dygraph.base as base
-          import numpy as np
-
-          inp_word = np.array([[[1]]]).astype('int64')
-          dict_size = 20
-          with fluid.dygraph.guard():
-              emb = fluid.dygraph.Embedding(
-                  name_scope='embedding',
-                  size=[dict_size, 32],
-                  param_attr='emb.w',
-                  is_sparse=False)
-              static_rlt3 = emb(base.to_variable(inp_word))
-    """
-
-    def __init__(self,
-                 name_scope,
-                 size,
-                 is_sparse=False,
-                 is_distributed=False,
-                 padding_idx=None,
-                 param_attr=None,
-                 dtype='float32'):
-        super(Embedding, self).__init__(name_scope, dtype)
-        self._size = size
-        self._is_sparse = is_sparse
-        self._is_distributed = is_distributed
-        self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
-            size[0] + padding_idx)
-
-        self._param_attr = param_attr
-        self._dtype = dtype
-        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
-        if self._remote_prefetch:
-            assert self._is_sparse is True and self._is_distributed is False
-
-        self._w = self.create_parameter(
-            attr=self._param_attr,
-            shape=self._size,
-            dtype=self._dtype,
-            is_bias=False)
-
-    def forward(self, input):
-        out = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type='lookup_table',
-            inputs={'Ids': input,
-                    'W': self._w},
-            outputs={'Out': out},
-            attrs={
-                'is_sparse': self._is_sparse,
-                'is_distributed': self._is_distributed,
-                'remote_prefetch': self._remote_prefetch,
-                'padding_idx': self._padding_idx
-            })
-
-        return out
-
-
-class LayerNorm(layers.Layer):
-    """
-    Assume feature vectors exist on dimensions
-    `begin_norm_axis ... rank(input)` and calculate the moment statistics along these dimensions for each feature
-    vector `a` with size `H`, then normalize each feature vector using the corresponding
-    statistics. After that, apply learnable gain and bias on the normalized
-    tensor to scale and shift if `scale` and `shift` are set.
-
-    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
-
-    The formula is as follows:
-
-    ..  math::
-
-        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
-
-        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
-
-        h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
-
-    * :math:`a`: the vector representation of the summed inputs to the neurons in that layer.
-
-    * :math:`H`: the number of hidden units in a layers
-
-    * :math:`g`: the trainable scale parameter.
-
-    * :math:`b`: the trainable bias parameter.
-
-    Args:
-        name_scope(str): The name of this class.
-        scale(bool): Whether to learn the adaptive gain :math:`g` after
-            normalization. Default: True.
-        shift(bool): Whether to learn the adaptive bias :math:`b` after
-            normalization. Default: True.
-        begin_norm_axis(int): The normalization will be performed along
-            dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
-            Default: 1.
-        epsilon(float): The small value added to the variance to prevent
-            division by zero. Default: 1e-05.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
-            omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
-            a default :code:`ParamAttr` would be added as scale. The
-            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
-        bias_attr(ParamAttr|None): The parameter attribute for the learnable
-            bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
-            omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
-            a default :code:`ParamAttr` would be added as bias. The
-            :attr:`bias_attr` is initialized as 0 if it is added. Default: None.
-        act(str): Activation to be applied to the output of layer normalizaiton.
-                  Default: None.
-    Returns:
-        Result after normalization
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          with fluid.dygraph.guard():
-              x = numpy.random.random((3, 32, 32)).astype('float32')
-              layerNorm = fluid.dygraph.nn.LayerNorm(
-                    'LayerNorm', begin_norm_axis=1)
-             ret = layerNorm(fluid.dygraph.base.to_variable(x))
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 scale=True,
-                 shift=True,
-                 begin_norm_axis=1,
-                 epsilon=1e-05,
-                 param_attr=None,
-                 bias_attr=None,
-                 act=None):
-        super(LayerNorm, self).__init__(name_scope)
-        self._scale = scale
-        self._shift = shift
-        self._begin_norm_axis = begin_norm_axis
-        self._epsilon = epsilon
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-
-    def _build_once(self, input):
-        self._dtype = self._helper.input_dtype(input)
-        input_shape = input.shape
-        param_shape = [
-            reduce(lambda x, y: x * y, input_shape[self._begin_norm_axis:])
-        ]
-        if self._scale:
-            self._scale_w = self.create_parameter(
-                attr=self._param_attr,
-                shape=param_shape,
-                dtype=self._dtype,
-                default_initializer=Constant(1.0))
-        else:
-            if self._param_attr:
-                logging.warn("param_attr are only avaliable with scale is True")
-
-        if self._shift:
-            assert self._bias_attr is not False
-            self._bias_w = self.create_parameter(
-                attr=self._bias_attr,
-                shape=param_shape,
-                dtype=self._dtype,
-                is_bias=True)
-        else:
-            if self._bias_attr:
-                logging.warn("bias_attr are only avaliable with shift is True")
-
-    def forward(self, input):
-        inputs = dict()
-        inputs['X'] = input
-        if self._scale:
-            inputs['Scale'] = self._scale_w
-        if self._shift:
-            inputs['Bias'] = self._bias_w
-        # create output
-        mean_out = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        variance_out = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        layer_norm_out = self._helper.create_variable_for_type_inference(
-            self._dtype)
-
-        self._helper.append_op(
-            type="layer_norm",
-            inputs=inputs,
-            outputs={
-                "Y": layer_norm_out,
-                "Mean": mean_out,
-                "Variance": variance_out,
-            },
-            attrs={
-                "epsilon": self._epsilon,
-                "begin_norm_axis": self._begin_norm_axis
-            })
-
-        return self._helper.append_activation(layer_norm_out, act=self._act)
-
-
-class GRUUnit(layers.Layer):
-    """
-    **GRU unit layer**
-
-    if origin_mode is True, then the equation of a gru step is from paper
-    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
-    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`
-
-        .. math::
-            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
-
-            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
-
-            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
-
-    if origin_mode is False, then the equation of a gru step is from paper
-    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
-    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
-
-        .. math::
-            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
-
-            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
-
-            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
-
-
-    The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
-    of the equation above, the :math:`z_t` is split into 3 parts -
-    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
-    implement a full GRU unit operator for an input, a fully
-    connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
-
-    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
-    of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is
-    an intermediate candidate hidden output, which is denoted by :math:`m_t`.
-    This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
-    and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
-
-    Args:
-        name_scope(str): The name of this class.
-        size (int): The input dimension value.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            hidden-hidden weight matrix. Note:
-
-            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
-              :math:`D` is the hidden size.
-            - All elements in the weight matrix can be divided into two parts.
-              The first part are weights of the update gate and reset gate with
-              shape :math:`(D \\times 2D)`, and the second part are weights for
-              candidate hidden state with shape :math:`(D \\times D)`.
-
-            If it is set to None or one attribute of ParamAttr, gru_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
-            of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
-            the bias in the update gate, reset gate and candidate calculations.
-            If it is set to False, no bias will be applied to the update gate,
-            reset gate and candidate calculations. If it is set to None or one
-            attribute of ParamAttr, gru_unit will create ParamAttr as
-            bias_attr. If the Initializer of the bias_attr is not set, the bias
-            is initialized zero. Default: None.
-        activation (str): The activation type for cell (actNode).
-                             Default: 'tanh'
-        gate_activation (str): The activation type for gates (actGate).
-                                  Default: 'sigmoid'
-        dtype(str): The dtype of the layers. Default: 'float32'
-
-    Returns:
-        tuple: The hidden value, reset-hidden value and gate values.
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import paddle.fluid.dygraph.base as base
-          import numpy
-
-          lod = [[2, 4, 3]]
-          D = 5
-          T = sum(lod[0])
-
-          hidden_input = numpy.random.rand(T, D).astype('float32')
-          with fluid.dygraph.guard():
-              x = numpy.random.random((3, 32, 32)).astype('float32')
-              gru = fluid.dygraph.GRUUnit('gru', size=D * 3)
-              dy_ret = gru(
-                base.to_variable(input), base.to_variable(hidden_input))
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 size,
-                 param_attr=None,
-                 bias_attr=None,
-                 activation='tanh',
-                 gate_activation='sigmoid',
-                 origin_mode=False,
-                 dtype='float32'):
-        super(GRUUnit, self).__init__(name_scope, dtype)
-
-        activation_dict = dict(
-            identity=0,
-            sigmoid=1,
-            tanh=2,
-            relu=3, )
-        self.activation = activation_dict[activation]
-        self.gate_activation = activation_dict[gate_activation]
-
-        self._dtype = dtype
-        size = size // 3
-        # create weight
-        self._weight = self.create_parameter(
-            attr=param_attr, shape=[size, 3 * size], dtype=dtype)
-
-        # create bias
-        bias_size = [1, 3 * size]
-        self._bias = self.create_parameter(
-            attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-
-    def forward(self, input, hidden):
-        inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': self._weight}
-        if self._bias:
-            inputs['Bias'] = self._bias
-
-        gate = self._helper.create_variable_for_type_inference(self._dtype)
-        reset_hidden_pre = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        updated_hidden = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        self._helper.append_op(
-            type='gru_unit',
-            inputs=inputs,
-            outputs={
-                'Gate': gate,
-                'ResetHiddenPrev': reset_hidden_pre,
-                'Hidden': updated_hidden,
-            },
-            attrs={
-                'activation': self.activation,
-                'gate_activation': self.gate_activation,
-            })
-
-        return updated_hidden, reset_hidden_pre, gate
-
-
-class NCE(layers.Layer):
-    """
-    Compute and return the noise-contrastive estimation training loss. See
-    `Noise-contrastive estimation: A new estimation principle for unnormalized statistical models <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_ .
-    By default this operator uses a uniform distribution for sampling.
-
-    Args:
-        name_scope(str): The name of this class.
-        num_total_classes (int): Total number of classes in all samples
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-             of nce. If it is set to None or one attribute of ParamAttr, nce
-             will create ParamAttr as param_attr. If the Initializer of the param_attr
-             is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of nce.
-             If it is set to False, no bias will be added to the output units.
-             If it is set to None or one attribute of ParamAttr, nce
-             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-             is not set, the bias is initialized zero. Default: None.
-        num_neg_samples (int): The number of negative classes. The default value is 10.
-        sampler (str): The sampler used to sample class from negtive classes.
-                       It can be 'uniform', 'log_uniform' or 'custom_dist'.
-                       default: 'uniform'.
-        custom_dist (float[]|None): A float[] with size=num_total_classes.
-                       It is used when sampler is set to 'custom_dist'.
-                       custom_dist[i] is the probability of i-th class to be sampled.
-                       Default: None.
-        seed (int): The seed used in sampler. Default: 0.
-        is_sparse(bool): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default: False.
-
-    Returns:
-        Variable: The output nce loss.
-
-    Examples:
-        .. code-block:: python
-
-            import numpy as np
-            import paddle.fluid as fluid
-
-            window_size = 5
-            dict_size = 20
-            label_word = int(window_size // 2) + 1
-            inp_word = np.array([[[1]], [[2]], [[3]], [[4]], [[5]]]).astype('int64')
-            nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
-
-            with fluid.dygraph.guard():
-                words = []
-                for i in range(window_size):
-                    words.append(fluid.dygraph.base.to_variable(inp_word[i]))
-
-                emb = fluid.Embedding(
-                    'embedding',
-                    size=[dict_size, 32],
-                    param_attr='emb.w',
-                    is_sparse=False)
-
-                embs3 = []
-                for i in range(window_size):
-                    if i == label_word:
-                        continue
-
-                    emb_rlt = emb(words[i])
-                    embs3.append(emb_rlt)
-
-                embs3 = fluid.layers.concat(input=embs3, axis=1)
-                nce = fluid.NCE('nce',
-                             num_total_classes=dict_size,
-                             num_neg_samples=2,
-                             sampler="custom_dist",
-                             custom_dist=nid_freq_arr.tolist(),
-                             seed=1,
-                             param_attr='nce.w',
-                             bias_attr='nce.b')
-
-                nce_loss3 = nce(embs3, words[label_word])
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_total_classes,
-                 sample_weight=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 num_neg_samples=None,
-                 sampler="uniform",
-                 custom_dist=None,
-                 seed=0,
-                 is_sparse=False):
-        super(NCE, self).__init__(name_scope)
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._num_total_classes = num_total_classes
-
-        self._inputs = dict()
-        self._inputs['SampleWeight'] = sample_weight if sample_weight is not None else []
-        if sampler == "uniform":
-            sampler = 0
-        elif sampler == "log_uniform":
-            sampler = 1
-        elif sampler == "custom_dist":
-            assert custom_dist is not None
-            # assert isinstance(custom_dist, Variable)
-
-            custom_dist_len = len(custom_dist)
-            alias_probs_ = [0] * custom_dist_len
-            alias_ = [0] * custom_dist_len
-            bigs = []
-            littles = []
-            for i in range(custom_dist_len):
-                normal_prob = custom_dist[i] * custom_dist_len
-                if normal_prob - 1.0 > 0:
-                    bigs.append((i, normal_prob))
-                elif 1.0 - normal_prob > 0:
-                    littles.append((i, normal_prob))
-                else:
-                    alias_probs_[i] = normal_prob
-                    alias_[i] = -1
-
-            while len(bigs) and len(littles):
-                big = bigs.pop(0)
-                little = littles.pop(0)
-
-                big_idx = big[0]
-                big_prob = big[1]
-
-                alias_probs_[little[0]] = little[1]
-                alias_[little[0]] = big_idx
-                big_left = big[1] + little[1] - 1
-                if big_left - 1.0 > 0:
-                    bigs.append((big_idx, big_left))
-                elif 1.0 - big_left > 0:
-                    littles.append((big_idx, big_left))
-                else:
-                    alias_probs_[big_idx] = big_left
-                    alias_[big_idx] = -1
-
-            if len(bigs):
-                big = bigs.pop(0)
-                alias_probs_[big[0]] = 1.0
-                alias_[big[0]] = -1
-            if len(littles):
-                little = littles.pop(0)
-                alias_probs_[little[0]] = 1.0
-                alias_[little[0]] = -1
-
-            def _init_by_numpy_array(numpy_array):
-                ret = self.create_parameter(
-                    attr=ParamAttr(),
-                    shape=numpy_array.shape,
-                    dtype=numpy_array.dtype,
-                    default_initializer=NumpyArrayInitializer(numpy_array))
-                ret.stop_gradient = True
-                return ret
-
-            self._inputs['CustomDistProbs'] = _init_by_numpy_array(
-                np.array(custom_dist).astype('float32'))
-            self._inputs['CustomDistAlias'] = _init_by_numpy_array(
-                np.array(alias_).astype('int32'))
-            self._inputs['CustomDistAliasProbs'] = _init_by_numpy_array(
-                np.array(alias_probs_).astype('float32'))
-            sampler = 2
-        else:
-            raise Exception("Unsupported sampler type.")
-
-        if num_neg_samples is None:
-            num_neg_samples = 10
-        else:
-            num_neg_samples = int(num_neg_samples)
-        self._num_neg_samples = num_neg_samples
-        remote_prefetch = is_sparse
-        print(
-            "With sparse mode, if your models has only small parameter prefetch may cause speed down"
-        )
-        self._attrs = {
-            'num_total_classes': int(num_total_classes),
-            'num_neg_samples': num_neg_samples,
-            'seed': seed,
-            'sampler': sampler,
-            'is_sparse': is_sparse,
-            'remote_prefetch': remote_prefetch
-        }
-
-    def _build_once(self, input, label, sample_weight=None):
-        assert isinstance(input, Variable)
-        assert isinstance(label, Variable)
-
-        dim = input.shape[1]
-        num_true_class = label.shape[1]
-        self._w = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._num_total_classes, dim],
-            is_bias=False,
-            dtype=input.dtype)
-        if self._bias_attr:
-            self._b = self.create_parameter(
-                attr=self._bias_attr,
-                shape=[self._num_total_classes, 1],
-                is_bias=True,
-                dtype=input.dtype)
-            self._inputs['Bias'] = self._b
-        self._inputs['Weight'] = self._w
-
-    def forward(self, input, label, sample_weight=None):
-        assert isinstance(input, Variable)
-        assert isinstance(label, Variable)
-
-        self._inputs['Input'] = input
-        self._inputs['Label'] = label
-        self._inputs['SampleWeight'] = sample_weight if sample_weight is not None else []
-
-        cost = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype)
-        sample_logits = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype)
-        sample_labels = self._helper.create_variable_for_type_inference(
-            dtype=label.dtype)
-
-        self._helper.append_op(
-            type='nce',
-            inputs=self._inputs,
-            outputs={
-                'Cost': cost,
-                'SampleLogits': sample_logits,
-                'SampleLabels': sample_labels
-            },
-            attrs=self._attrs)
-        return cost / (self._num_neg_samples + 1)
-
-
-class PRelu(layers.Layer):
-    """
-    Equation:
-
-    .. math::
-        y = \max(0, x) + \\alpha * \min(0, x)
-
-    Args:
-        name_scope(str): The name of this class.
-        mode (str): The mode for weight sharing. It supports all, channel
-          and element. all: all elements share same weight
-          channel:elements in a channel share same weight
-          element:each element has a weight
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-          weight (alpha).
-
-    Returns:
-        Variable: The output tensor with the same shape as input.
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy as np
-
-          inp_np = np.ones([5, 200, 100, 100]).astype('float32')
-          with fluid.dygraph.guard():
-              mode = 'channel'
-              prelu = fluid.PRelu(
-                 'prelu',
-                 mode=mode,
-                 param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0)))
-              dy_rlt = prelu(fluid.dygraph.base.to_variable(inp_np))
-
-    """
-
-    def __init__(self, name_scope, mode, param_attr=None):
-
-        super(PRelu, self).__init__(name_scope)
-        self._mode = mode
-        self._param_attr = param_attr
-        if self._mode not in ['all', 'channel', 'element']:
-            raise ValueError('mode should be one of all, channel, element.')
-        self._alpha_shape = [1]
-
-    def _build_once(self, input):
-        if self._mode == 'channel':
-            self._alpha_shape = [1, input.shape[1], 1, 1]
-        elif self._mode == 'element':
-            self._alpha_shape = input.shape
-        self._dtype = self._helper.input_dtype(input)
-        self._alpha = self.create_parameter(
-            attr=self._param_attr,
-            shape=self._alpha_shape,
-            dtype='float32',
-            is_bias=False,
-            default_initializer=Constant(1.0))
-
-    def forward(self, input):
-
-        out = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="prelu",
-            inputs={"X": input,
-                    'Alpha': self._alpha},
-            attrs={"mode": self._mode},
-            outputs={"Out": out})
-        return out
-
-
-class BilinearTensorProduct(layers.Layer):
-    """
-    **Add Bilinear Tensor Product Layer**
-
-    This layer performs bilinear tensor product on two inputs.
-    For example:
-
-    .. math::
-      out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
-
-    In this formula:
-     - :math:`x`: the first input contains M elements, shape is [batch_size, M].
-     - :math:`y`: the second input contains N elements, shape is [batch_size, N].
-     - :math:`W_{i}`: the i-th learned weight, shape is [M, N]
-     - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
-     - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
-
-    Args:
-       name_scope(str): The name of this class.
-       size (int): The dimension of this layer.
-       act (str): Activation to be applied to the output of this layer. Default: None.
-       name (str): The name of this layer. Default: None.
-       param_attr (ParamAttr): The parameter attribute for the learnable w.
-           parameters/weights of this layer. Default: None.
-       bias_attr (ParamAttr): The parameter attribute for the bias
-           of this layer. If it is set to False, no bias will be added to the output units.
-           If it is set to None, the bias is initialized zero. Default: None.
-
-    Returns:
-       Variable: A 2-D Tensor of shape [batch_size, size].
-
-    Examples:
-       .. code-block:: python
-
-         import paddle.fluid as fluid
-         import numpy
-
-         with fluid.dygraph.guard():
-             layer1 = numpy.random.random((5, 5)).astype('float32')
-             layer2 = numpy.random.random((5, 4)).astype('float32')
-             bilinearTensorProduct = fluid.dygraph.nn.BilinearTensorProduct(
-                    'BilinearTensorProduct', size=1000)
-             ret = bilinearTensorProduct(fluid.dygraph.base.to_variable(layer1),
-                                fluid.dygraph.base.to_variable(layer2))
-    """
-
-    def __init__(self,
-                 name_scope,
-                 size,
-                 name=None,
-                 act=None,
-                 param_attr=None,
-                 bias_attr=None):
-        super(BilinearTensorProduct, self).__init__(name_scope)
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-        self._size = size
-        self._name = name
-        self._inputs = dict()
-
-    def _build_once(self, x, y):
-        self._dtype = self._helper.input_dtype(x)
-
-        param_shape = [self._size, x.shape[1], y.shape[1]]
-
-        self._w = self.create_parameter(
-            attr=self._param_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=False)
-
-        bias_size = [1, self._size]
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=bias_size,
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, x, y):
-        self._inputs = {"X": x, "Y": y, "Weight": self._w}
-        if self._bias_param:
-            self._inputs["Bias"] = self._bias_param
-        if self._name is not None:
-            out = self._helper.create_variable(
-                name=".".join([self.full_name(), self._name]),
-                dtype=self._dtype,
-                persistable=False)
-        else:
-            out = self._helper.create_variable(
-                dtype=self._dtype, persistable=False)
-        self._helper.append_op(
-            type="bilinear_tensor_product",
-            inputs=self._inputs,
-            outputs={"Out": out})
-
-        # add activation
-        return self._helper.append_activation(out, act=self._act)
-
-
-class Conv2DTranspose(layers.Layer):
-    """
-    **Convlution2D transpose layer**
-
-    The convolution2D transpose layer calculates the output based on the input,
-    filter, and dilations, strides, paddings. Input(Input) and output(Output)
-    are in NCHW format. Where N is batch size, C is the number of channels,
-    H is the height of the feature, and W is the width of the feature.
-    Parameters(dilations, strides, paddings) are two elements. These two elements
-    represent height and width, respectively. The details of convolution transpose
-    layer, please refer to the following explanation and references
-    `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
-    If bias attribution and activation type are provided, bias is added to
-    the output of the convolution, and the corresponding activation function
-    is applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    Where:
-
-    * :math:`X`: Input value, a tensor with NCHW format.
-    * :math:`W`: Filter value, a tensor with MCHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
-           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
-           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
-
-    Args:
-        name_scope(str): The name of this class.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        output_size(int|tuple|None): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). None if use
-            filter_size, padding, and stride to calculate output_size.
-            if output_size and filter_size are specified at the same time, They
-            should follow the formula above. Default: None.
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square. None if use output size to
-            calculate filter_size. Default: None.
-        padding(int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding. Default: padding = 0.
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups(int): The groups number of the Conv2d transpose layer. Inspired by
-            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-            when group=2, the first half of the filters is only connected to the
-            first half of the input channels, while the second half of the
-            filters is only connected to the second half of the input channels.
-            Default: groups = 1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d_transpose
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-
-    Returns:
-        Variable: The tensor variable storing the convolution transpose result.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-       .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          with fluid.dygraph.guard():
-              data = numpy.random.random((3, 32, 32)).astype('float32')
-              conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose(
-                    'Conv2DTranspose', num_filters=2, filter_size=3)
-              ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 output_size=None,
-                 filter_size=None,
-                 padding=0,
-                 stride=1,
-                 dilation=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None):
-        super(Conv2DTranspose, self).__init__(name_scope)
-        assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-        self._groups = groups
-        self._num_filters = num_filters
-        self._use_cudnn = use_cudnn
-        self._padding = padding
-        self._stride = stride
-        self._dilation = dilation
-        self._filter_size = filter_size
-        self._output_size = output_size
-        self._op_type = 'conv2d_transpose'
-
-    def _build_once(self, input):
-        input_channel = input.shape[1]
-        if (input_channel == self._groups and
-                self._num_filters == input_channel and not self._use_cudnn):
-            self._op_type = 'depthwise_conv2d_transpose'
-
-        if not isinstance(input, Variable):
-            raise TypeError("Input of conv2d_transpose must be Variable")
-
-        self._padding = utils.convert_to_list(self._padding, 2, 'padding')
-        self._stride = utils.convert_to_list(self._stride, 2, 'stride')
-        self._dilation = utils.convert_to_list(self._dilation, 2, 'dilation')
-
-        if not isinstance(self._use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-
-        if self._filter_size is None:
-            if self._output_size is None:
-                raise ValueError(
-                    "output_size must be set when filter_size is None")
-            if isinstance(self._output_size, int):
-                self._output_size = [self._output_size, self._output_size]
-
-            h_in = input.shape[2]
-            w_in = input.shape[3]
-
-            filter_size_h = (self._output_size[0] -
-                             (h_in - 1) * self._stride[0] + 2 * self._padding[0]
-                             - 1) // self._dilation[0] + 1
-            filter_size_w = (self._output_size[1] -
-                             (w_in - 1) * self._stride[1] + 2 * self._padding[1]
-                             - 1) // self._dilation[1] + 1
-            self._filter_size = [filter_size_h, filter_size_w]
-        else:
-            self._filter_size = utils.convert_to_list(
-                self._filter_size, 2, 'conv2d_transpose.filter_size')
-
-        if self._output_size is None:
-            self._output_size = []
-        elif isinstance(self._output_size, list) or isinstance(
-                self._output_size, int):
-            self._output_size = utils.convert_to_list(self._output_size, 2,
-                                                      'output_size')
-        else:
-            raise ValueError("output_size should be list or int")
-        self._padding = utils.convert_to_list(self._padding, 2, 'padding')
-        self._groups = 1 if self._groups is None else self._groups
-        filter_shape = [input_channel, self._num_filters // self._groups
-                        ] + self._filter_size
-
-        self._img_filter = self.create_parameter(
-            dtype=input.dtype, shape=filter_shape, attr=self._param_attr)
-
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype)
-        self._helper.append_op(
-            type=self._op_type,
-            inputs={'Input': [input],
-                    'Filter': [self._img_filter]},
-            outputs={'Output': pre_bias},
-            attrs={
-                'output_size': self._output_size,
-                'strides': self._stride,
-                'paddings': self._padding,
-                'dilations': self._dilation,
-                'groups': self._groups,
-                'use_cudnn': self._use_cudnn
-            })
-
-        if self._bias_param is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self._bias_param]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
-        else:
-            pre_act = pre_bias
-
-        out = self._helper.append_activation(pre_act, act=self._act)
-        return out
-
-
-class SequenceConv(layers.Layer):
-    """
-    This function creates the op for sequence_conv, using the inputs and
-    other convolutional configurations for the filters and stride as given
-    in the input parameters to the function.
-
-    Args:
-        name_scope(str): The name of this class.
-        num_filters (int): number of filters.
-        filter_size (int): the filter size (H and W). Default: 3.
-        filter_stride (int): stride of the filter. Default: 1.
-        padding (bool|None): if True, add paddings. Default: None
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, sequence_conv
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of sequence_conv. If it is set to None or one attribute of ParamAttr, sequence_conv
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-
-    Returns:
-        Variable: output of sequence_conv
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size=3,
-                 filter_stride=1,
-                 padding=None,
-                 bias_attr=None,
-                 param_attr=None,
-                 act=None):
-        assert not in_dygraph_mode(
-        ), "SequenceConv is not supported by dynamic graph mode yet!"
-        super(SequenceConv, self).__init__(name_scope)
-        self._num_filters = num_filters
-        self._filter_size = filter_size
-        self._filter_stride = filter_stride
-        self._padding = padding
-        self._bias_attr = bias_attr
-        self._param_attr = param_attr
-        self._act = act
-
-    def _build_once(self, input):
-        self._dtype = self._helper.input_dtype(input)
-        filter_shape = [self._filter_size * input.shape[1], self._num_filters]
-        self._filter_param = self.create_parameter(
-            attr=self._param_attr, shape=filter_shape, dtype=self._dtype)
-
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type='sequence_conv',
-            inputs={
-                'X': [input],
-                'Filter': [self._filter_param],
-            },
-            outputs={"Out": pre_bias},
-            attrs={
-                'contextStride': self._filter_stride,
-                'contextStart': -int(self._filter_size // 2),
-                'contextLength': self._filter_size
-            })
-
-        if self._bias_param is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self._bias_param]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
-        else:
-            pre_act = pre_bias
-
-        return self._helper.append_activation(pre_act, act=self._act)
-
-
-class RowConv(layers.Layer):
-    """
-    ***Row-convolution operator***
-
-    The row convolution is called lookahead convolution.  This operator was introduced in the following paper for DeepSpeech2:
-    http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf
-
-    The main motivation is that a bidirectional RNN, useful in DeepSpeech like speech models, learns representation for a sequence by performing a
-    forward and a backward pass through the entire sequence. However, unlike
-    unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
-    and low-latency setting. The lookahead convolution incorporates information
-    from future subsequences in a computationally efficient manner to improve
-    unidirectional recurrent neural networks. The row convolution operator is
-    different from the 1D sequence convolution, and is computed as follows:
-
-    Given an input sequence X of length t and input dimension D, and a filter (W) of size context * D.
-
-    More details about row_conv please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
-
-    Args:
-        name_scope(str): The name of this class.
-        future_context_size (int): Future context size. Please note, the shape
-            of convolution kernel is [future_context_size + 1, D].
-        param_attr (ParamAttr): Attributes of parameters, including
-            name, initializer etc. Default: None.
-        act (str): Non-linear activation to be applied to output variable. Default: None.
-
-    Returns:
-        the output(Out) is a LodTensor, which supports variable time-length input sequences.
-        The underlying tensor in this LodTensor is a matrix with shape T x N, i.e., the same shape as X.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          with fluid.dygraph.guard():
-              x = numpy.random.random((16)).astype('float32')
-              rowConv = fluid.dygraph.nn.RowConv(
-                    'RowConv', future_context_size=2)
-              ret = rowConv(fluid.dygraph.base.to_variable(x))
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 future_context_size,
-                 param_attr=None,
-                 act=None):
-        assert not in_dygraph_mode(
-        ), "RowConv is not supported by dynamic graph mode yet!"
-        super(RowConv, self).__init__(name_scope)
-        self._act = act
-        self._param_attr = param_attr
-        self._future_context_size = future_context_size
-
-    def _build_once(self, input):
-        self._dtype = self._helper.input_dtype(input)
-        filter_shape = [self._future_context_size + 1, input.shape[1]]
-        self._filter_param = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            is_bias=False)
-
-    def forward(self, input):
-        out = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type='row_conv',
-            inputs={'X': [input],
-                    'Filter': [self._filter_param]},
-            outputs={'Out': [out]})
-        return self._helper.append_activation(out, act=self._act)
-
-
-class GroupNorm(layers.Layer):
-    """
-        **Group Normalization Layer**
-
-        Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
-
-        Args:
-            name_scope(str): The name of this class.
-            groups(int): The number of groups that divided from channels.
-            epsilon(float): The small value added to the variance to prevent
-                division by zero. Default: 1e-05.
-            param_attr(ParamAttr|None): The parameter attribute for the learnable
-                scale :math:`g`. If it is set to False, no scale will be added to the output units.
-                If it is set to None, the bias is initialized one. Default: None.
-            bias_attr(ParamAttr|None): The parameter attribute for the learnable
-                bias :math:`b`. If it is set to False, no bias will be added to the output units.
-                If it is set to None, the bias is initialized zero. Default: None.
-            act(str): Activation to be applied to the output of group normalizaiton.
-            data_layout(string|NCHW): Only NCHW is supported.
-
-        Returns:
-            Variable: A tensor variable which is the result after applying group normalization on the input.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              import numpy
-
-              with fluid.dygraph.guard():
-                  x = numpy.random.random((8, 32, 32)).astype('float32')
-                  groupNorm = fluid.dygraph.nn.GroupNorm('GroupNorm', groups=4)
-                  ret = groupNorm(fluid.dygraph.base.to_variable(x))
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 groups,
-                 epsilon=1e-05,
-                 param_attr=None,
-                 bias_attr=None,
-                 act=None,
-                 data_layout='NCHW'):
-        super(GroupNorm, self).__init__(name_scope)
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._epsilon = epsilon
-        self._groups = groups
-        self._act = act
-        if data_layout != 'NCHW':
-            raise ValueError("unsupported data layout:" + data_layout)
-
-    def _build_once(self, input):
-        self._dtype = self._helper.input_dtype(input)
-        param_shape = [input.shape[1]]
-        if self._bias_attr:
-            self._bias = self.create_parameter(
-                attr=self._bias_attr,
-                shape=param_shape,
-                dtype=self._dtype,
-                is_bias=True)
-
-        if self._param_attr:
-            self._scale = self.create_parameter(
-                attr=self._param_attr,
-                shape=param_shape,
-                dtype=self._dtype,
-                default_initializer=Constant(1.0))
-
-    def forward(self, input):
-        inputs = {'X': input}
-        if self._bias_attr:
-            inputs['Bias'] = self._bias
-        if self._param_attr:
-            inputs['Scale'] = self._scale
-
-        # create output
-        mean_out = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        variance_out = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        group_norm_out = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-
-        self._helper.append_op(
-            type="group_norm",
-            inputs=inputs,
-            outputs={
-                "Y": group_norm_out,
-                "Mean": mean_out,
-                "Variance": variance_out,
-            },
-            attrs={"epsilon": self._epsilon,
-                   "groups": self._groups})
-
-        return self._helper.append_activation(group_norm_out, self._act)
-
-
-class SpectralNorm(layers.Layer):
-    """
-    **Spectral Normalization Layer**
-
-    This layer calculates the spectral normalization value of weight parameters of
-    fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D
-    Parameters. Calculations are showed as follows.
-
-    Step 1:
-    Generate vector U in shape of [H], and V in shape of [W].
-    While H is the :attr:`dim` th dimension of the input weights,
-    and W is the product result of remaining dimensions.
-
-    Step 2:
-    :attr:`power_iters` shoule be a positive interger, do following
-    calculations with U and V for :attr:`power_iters` rounds.
-
-    .. math::
-
-        \mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2}
-
-        \mathbf{u} := \\frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2}
-
-    Step 3:
-    Calculate :math:`\sigma(\mathbf{W})` and normalize weight values.
-
-    .. math::
-
-        \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v}
-
-        \mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})}
-
-
-    Refer to `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ .
-
-    Args:
-        name_scope(str): The name of this class.
-        dim(int): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer. Default: 0.
-        power_iters(int): The number of power iterations to calculate spectral norm. Default: 1.
-        eps(float): The epsilon for numerical stability in calculating norms. Default: 1e-12.
-        name (str): The name of this layer. It is optional.
-
-    Returns:
-        Variable: A tensor variable of weight parameters after spectral normalization.
-
-    Examples:
-       .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy
-
-            with fluid.dygraph.guard():
-                x = numpy.random.random((2, 8, 32, 32)).astype('float32')
-                spectralNorm = fluid.dygraph.nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2)
-                ret = spectralNorm(fluid.dygraph.base.to_variable(x))
-
-    """
-
-    def __init__(self, name_scope, dim=0, power_iters=1, eps=1e-12, name=None):
-        super(SpectralNorm, self).__init__(name_scope)
-        self._power_iters = power_iters
-        self._eps = eps
-        self._dim = dim
-
-    def _build_once(self, weight):
-        self._dtype = self._helper.input_dtype(weight)
-        input_shape = weight.shape
-        h = input_shape[self._dim]
-        w = np.prod(input_shape) // h
-
-        self.u = self.create_parameter(
-            attr=ParamAttr(),
-            shape=[h],
-            dtype=self._dtype,
-            default_initializer=Normal(0., 1.))
-        self.u.stop_gradient = True
-
-        self.v = self.create_parameter(
-            attr=ParamAttr(),
-            shape=[w],
-            dtype=self._dtype,
-            default_initializer=Normal(0., 1.))
-        self.v.stop_gradient = True
-
-    def forward(self, weight):
-        inputs = {'Weight': weight, 'U': self.u, 'V': self.v}
-        out = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="spectral_norm",
-            inputs=inputs,
-            outputs={"Out": out, },
-            attrs={
-                "dim": self._dim,
-                "power_iters": self._power_iters,
-                "eps": self._eps,
-            })
-
-        return out
-
-
-class TreeConv(layers.Layer):
-    """
-        ***Tree-Based Convolution Operator***
-
-        Tree-Based Convolution is a kind of convolution based on tree structure.
-        Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN),
-        which is used to classify tree structures, such as Abstract Syntax Tree.
-        Tree-Based Convolution proposed a kind of data structure called continuous binary tree,
-        which regards multiway tree as binary tree.
-        The paper of Tree-Based Convolution Operator is here: https://arxiv.org/abs/1409.5718v1
-
-
-        Args:
-            name_scope(str): The name of this class.
-            output_size(int): output feature width
-            num_filters(int): number of filters, Default: 1.
-            max_depth(int): max depth of filters, Default: 2.
-            act(str): activation function, Default: tanh.
-            param_attr(ParamAttr): the parameter attribute for the filters, Default: None.
-            bias_attr(ParamAttr): the parameter attribute for the bias of this layer, Default: None.
-            name(str): a name of this layer(optional). If set None, the layer will be named automatically, Default: None.
-
-        Returns:
-            out(Variable): (Tensor) The feature vector of subtrees. The shape of the output tensor is [max_tree_node_size, output_size, num_filters]. The output tensor could be a new feature vector for next tree convolution layers
-
-        Examples:
-
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              import numpy
-
-              with fluid.dygraph.guard():
-                  nodes_vector = numpy.random.random((1, 10, 5)).astype('float32')
-                  edge_set = numpy.random.random((1, 9, 2)).astype('int32')
-                  treeConv = fluid.dygraph.nn.TreeConv(
-                    'TreeConv', output_size=6, num_filters=1, max_depth=2)
-                  ret = treeConv(fluid.dygraph.base.to_variable(nodes_vector), fluid.dygraph.base.to_variable(edge_set))
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 output_size,
-                 num_filters=1,
-                 max_depth=2,
-                 act='tanh',
-                 param_attr=None,
-                 bias_attr=None,
-                 name=None):
-        super(TreeConv, self).__init__(name_scope)
-        self._name = name
-        self._output_size = output_size
-        self._act = act
-        self._max_depth = max_depth
-        self._num_filters = num_filters
-        self._bias_attr = bias_attr
-        self._param_attr = param_attr
-
-    def _build_once(self, nodes_vector, edge_set):
-        assert isinstance(nodes_vector, Variable)
-        assert isinstance(edge_set, Variable)
-        self._dtype = self._helper.input_dtype(nodes_vector)
-
-        feature_size = nodes_vector.shape[2]
-        w_shape = [feature_size, 3, self._output_size, self._num_filters]
-        if self._bias_attr:
-            self._bias_param = self.create_parameter(
-                attr=self._bias_attr,
-                shape=[self._num_filters],
-                dtype=self._dtype,
-                is_bias=True)
-        self.W = self.create_parameter(
-            attr=self._param_attr,
-            shape=w_shape,
-            dtype=self._dtype,
-            is_bias=False)
-
-    def forward(self, nodes_vector, edge_set):
-
-        if self._name:
-            out = self.create_variable(
-                name=self._name, dtype=self._dtype, persistable=False)
-        else:
-
-            out = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-
-        self._helper.append_op(
-            type='tree_conv',
-            inputs={
-                'NodesVector': nodes_vector,
-                'EdgeSet': edge_set,
-                'Filter': self.W
-            },
-            outputs={'Out': out, },
-            attrs={'max_depth': self._max_depth})
-        if self._bias_attr:
-            pre_activation = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [out],
-                        'Y': [self._bias_param]},
-                outputs={'Out': [pre_activation]},
-                attrs={'axis': 1})
-        else:
-            pre_activation = out
-        return self._helper.append_activation(pre_activation, act=self._act)
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
deleted file mode 100644
index 6f68cc4e1c00e705f1f74a4254499b81160ad0cd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/parallel.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except jin compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import six
-import numpy as np
-from collections import OrderedDict
-from .. import core
-from . import layers
-from . import parallel_helper
-from .. import framework
-from ..layers import collective
-from . import to_variable, no_grad
-
-__all__ = ["prepare_context"]
-
-ParallelStrategy = core.ParallelStrategy
-
-
-def prepare_context(strategy=None):
-    if strategy is None:
-        strategy = ParallelStrategy()
-        strategy.nranks = Env().nranks
-        strategy.local_rank = Env().local_rank
-        strategy.trainer_endpoints = Env().trainer_endpoints
-        strategy.current_endpoint = Env().current_endpoint
-    if strategy.nranks < 2:
-        return
-    assert framework.in_dygraph_mode() is True, \
-        "dygraph.parallel.prepare_context should be used with dygrahp mode."
-    place = framework._current_expected_place()
-    assert place is not None, \
-        "dygraph.parallel.prepare_context should be used in fluid.dygraph.guard(place) guard."
-    if isinstance(place, core.CUDAPlace):
-        parallel_helper._set_parallel_ctx(
-            core.NCCLParallelContext(strategy, place))
-    else:
-        # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
-        assert ("Only support CUDAPlace for now.")
-    parallel_helper._init_parallel_ctx()
-    return strategy
-
-
-class Env(object):
-    def __init__(self):
-        self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
-        self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
-        self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
-                                            "").split(",")
-        self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
-
-    @property
-    def nranks(self):
-        return self._nranks
-
-    @property
-    def local_rank(self):
-        return self._local_rank
-
-    @property
-    def dev_id(self):
-        return self._dev_id
-
-    @property
-    def current_endpoint(self):
-        return self._current_endpoint
-
-    @property
-    def trainer_endpoints(self):
-        return self._trainer_endpoints
-
-
-class DataParallel(layers.Layer):
-    """
-    Runs the module with data parallelism.
-
-    Currently, DataParallel only supports to run the dynamic graph
-    with multi-process. The usage is:
-    `python -m paddle.distributed.launch --gpus 2 dynamic_graph_test.py`.
-    And the content of `dynamic_graph_test.py` is the code of examples.
-
-    Examples:
-        .. code-block:: python
-
-           import numpy as np
-           import paddle.fluid as fluid
-           import paddle.fluid.dygraph as dygraph
-           from paddle.fluid.optimizer import AdamOptimizer
-           from paddle.fluid.dygraph.nn import FC
-           from paddle.fluid.dygraph.base import to_variable
-
-           place = fluid.CUDAPlace(0)
-           with fluid.dygraph.guard(place=place):
-
-               # prepare the data parallel context
-               strategy=dygraph.parallel.prepare_context()
-
-               fc_layer = FC("FC", 10, act="softmax")
-               adam = fluid.optimizer.AdamOptimizer()
-
-               # make the module become the data parallelism module
-               fc_layer = dygraph.parallel.DataParallel(fc_layer, strategy)
-
-               x_data = np.random.random(size=[10, 1]).astype(np.float32)
-               data = to_variable(x_data)
-
-               hidden = fc_layer(data)
-               avg_loss = fluid.layers.mean(hidden)
-
-               # scale the loss according to the number of trainers.
-               avg_loss = fc_layer.scale_loss(avg_loss)
-
-               avg_loss.backward()
-
-               # collect the gradients of trainers.
-               fc_layer.apply_collective_grads()
-
-               adam.minimize(avg_loss)
-               fc_layer.clear_gradients()
-
-    Args:
-        layers(Layer): The module that should be executed by data parallel.
-        strategy(ParallelStrategy): The strategy of data parallelism.
-
-    Returns:
-        Layer: The data paralleled module.
-    """
-
-    def __init__(self, layers, strategy):
-        super(DataParallel,
-              self).__init__(layers.full_name() + "_data_parallel")
-
-        self._layers = layers
-        self._strategy = strategy
-
-    def forward(self, *inputs, **kwargs):
-        return self._layers(*inputs, **kwargs)
-
-    def scale_loss(self, loss):
-        """
-        Scale the loss. In data parallel mode, the loss should be scale with
-        the number of trainers. If not in data parallel mode, return the loss
-        directly.
-
-        Args:
-            loss(Layer): The loss of the current Model.
-
-        Returns:
-            Layer: the scaled loss.
-        """
-        if not self._is_data_parallel_mode():
-            return loss
-
-        loss_scale = to_variable(
-            np.array([self._strategy.nranks]).astype("float32"))
-        loss_scale.stop_gradient = True
-        loss = loss / loss_scale
-        return loss
-
-    def _coalesce_tensors(self, var_groups):
-        from ..layers import nn
-        coalesced_grads_and_grad_vars = []
-        for group_id, grad_vars in var_groups.items():
-            flattened_vars = []
-            g_var_shapes = []
-            for g_var in grad_vars:
-                g_var_shapes.append(g_var.shape)
-                flattened_vars.append(
-                    nn.reshape(
-                        x=g_var, shape=[np.prod(g_var.shape)], inplace=True))
-            coalesced_grad = nn.concat(flattened_vars)
-            coalesced_grads_and_grad_vars.append(
-                [coalesced_grad, grad_vars, g_var_shapes])
-        return coalesced_grads_and_grad_vars
-
-    def _split_tensors(self, coalesced_grads_and_grad_vars):
-        from ..layers import nn
-        for coalesced_grad, origin_grad_vars, grad_shapes in coalesced_grads_and_grad_vars:
-            grad_var_len = [np.prod(g_shape) for g_shape in grad_shapes]
-            self._helper.main_program.current_block().append_op(
-                type='split',
-                inputs={'X': coalesced_grad},
-                outputs={'Out': origin_grad_vars},
-                attrs={'sections': grad_var_len,
-                       'axis': 0})
-            for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
-                nn.reshape(x=g_var, shape=g_shape, inplace=True)
-
-    @no_grad
-    def apply_collective_grads(self):
-        """
-        AllReduce the Parameters' gradient.
-        """
-        if not self._is_data_parallel_mode():
-            return
-
-        grad_var_set = set()
-        grad_vars = []
-        for param in self._layers.parameters():
-            # NOTE(zcd): The grad_ivar maybe no generated.
-            if param.trainable and param._ivar._grad_ivar():
-                g_var = framework.Variable(
-                    block=self._helper.main_program.current_block(),
-                    name=param._ivar._grad_name(),
-                    stop_gradient=True,
-                    ivar=param._ivar._grad_ivar())
-                grad_vars.append(g_var)
-                assert g_var not in grad_var_set
-                grad_var_set.add(g_var)
-
-        # FIXME(zcd): the type of the var should be LoDTensor, i.e
-        # the gradients should be dense, otherwise, the following
-        # logic should be updated.
-        # 128 MB as a group
-        mega_bytes = 128 * 1024 * 1024
-        group_idx = 0
-        memory_counter = 0
-        grad_var_groups = OrderedDict()
-        dtype = grad_vars[0].dtype
-        for g_var in grad_vars:
-            # Note: the dtype of the same group should be the same.
-            bytes = np.prod(g_var.shape) * core.size_of_dtype(g_var.dtype)
-            if memory_counter < mega_bytes and dtype == g_var.dtype:
-                memory_counter += bytes
-            else:
-                memory_counter = bytes
-                group_idx += 1
-            grad_var_groups.setdefault(group_idx, []).append(g_var)
-
-        coalesced_grads_and_vars = self._coalesce_tensors(grad_var_groups)
-
-        for coalesced_grad, g_vars, g_shapes in coalesced_grads_and_vars:
-            collective._allreduce(
-                coalesced_grad, coalesced_grad, sync_mode=False)
-
-        self._split_tensors(coalesced_grads_and_vars)
-
-    def _is_data_parallel_mode(self):
-        return self._strategy.nranks > 1
diff --git a/python/paddle/fluid/dygraph/parallel_helper.py b/python/paddle/fluid/dygraph/parallel_helper.py
deleted file mode 100644
index f378211de2b8a1579ab139318cdc3cb8d5bdc2de..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/parallel_helper.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except jin compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from ..layers import collective
-from ..framework import Parameter
-__parallel_ctx__clz__ = None
-
-
-def _is_data_parallel_mode():
-    global __parallel_ctx__clz__
-    return __parallel_ctx__clz__ is not None and int(
-        os.getenv("PADDLE_TRAINERS_NUM", "1")) > 1
-
-
-def _set_parallel_ctx(nccl_parallel_context):
-    global __parallel_ctx__clz__
-    assert __parallel_ctx__clz__ is None, \
-        "ParallelContext can only be initialized once."
-    __parallel_ctx__clz__ = nccl_parallel_context
-
-
-def _init_parallel_ctx():
-    global __parallel_ctx__clz__
-    assert __parallel_ctx__clz__ is not None, \
-        "ParallelContext should be initialized."
-    __parallel_ctx__clz__.init()
-
-
-def _broadcast_parameters(parameters):
-    for param in parameters:
-        if isinstance(param, Parameter) and param.trainable:
-            collective._broadcast(param, 0, sync_mode=True)
diff --git a/python/paddle/fluid/dygraph/profiler.py b/python/paddle/fluid/dygraph/profiler.py
deleted file mode 100644
index 04c865500bb5e668373844e2940eaf36d1e9e39c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/profiler.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from .. import core
-
-__all__ = [
-    'start_gperf_profiler',
-    'stop_gperf_profiler',
-]
-
-
-def start_gperf_profiler():
-    core.start_imperative_gperf_profiler()
-
-
-def stop_gperf_profiler():
-    core.stop_imperative_gperf_profiler()
diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py
deleted file mode 100644
index 799f9423a1df7bd293aace89aefc1d62e142ae63..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/tracer.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import six
-
-from collections import defaultdict
-from paddle.fluid import core
-from paddle.fluid import framework
-
-__all__ = ['Tracer']
-
-
-class Tracer(core.Tracer):
-    """
-    Python wrapper of dygraph tracer
-    """
-
-    def __init__(self):
-        super(Tracer, self).__init__()
-
-        self._vars = defaultdict()
-        self._train_mode = True
-
-    def trace_var(self, name, var):
-        self._vars[name] = var
-
-    def all_parameters(self):
-        return list((item for name, item in six.iteritems(self._vars)
-                     if isinstance(item, framework.Parameter)))
-
-    def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False):
-        self.trace(type, inputs, outputs, attrs,
-                   framework._current_expected_place(), self._train_mode and
-                   not stop_gradient)
-
-    def train_mode(self):
-        self._train_mode = True
-
-    def eval_mode(self):
-        self._train_mode = False
diff --git a/python/paddle/fluid/dygraph_grad_clip.py b/python/paddle/fluid/dygraph_grad_clip.py
deleted file mode 100644
index 826f918f36ece2eab5ddf17c1c0b3c86ca4e6438..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph_grad_clip.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import copy
-import six
-
-import functools
-
-from . import layers
-from . import framework
-from . import core
-from .dygraph import base as imperative_base
-
-__all__ = [
-    'GradClipByValue',
-    'GradClipByNorm',
-    'GradClipByGlobalNorm',
-]
-
-
-class GradClipBase(object):
-    def __str__(self):
-        raise NotImplementedError()
-
-    def _clip(self, para_and_grad):
-        raise NotImplementedError
-
-    @imperative_base.no_grad
-    def __call__(self, para_and_grad):
-        return self._clip(para_and_grad)
-
-
-class GradClipByValue(GradClipBase):
-    """
-    Clips gradient values to the range [min_value, max_value].
-
-    Given a gradient g, this operation clips its value to min_value and max_value.
-
-    - Any values less than min_value are set to min_value.
-    - Any values greater than max_value are set to max_value.
-
-    Args:
-        max_value (float): The maximum value to clip by. 
-        min (float, optional): The minimum value to clip by. if not set by user, \
-        will be set to -max_value(max_value MUST be postive) by framework. 
-
-    Examples:
-        .. code-block:: python
-        
-            import numpy as np
-            import paddle
-            import paddle.fluid as fluid
-
-            from paddle.fluid.dygraph.base import to_variable
-            from paddle.fluid.dygraph.nn import FC
-
-            from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
-
-            from paddle.fluid.optimizer import SGDOptimizer
-
-            with fluid.dygraph.guard():
-                value_clip = GradClipByValue( -1.0, 1.0 )
-                sgd = SGDOptimizer(learning_rate=1.0)
-                
-                init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
-
-                fc = FC( "fc", 10)
-
-                out = fc( to_variable(init_value) )
-
-                loss = fluid.layers.reduce_mean( out )
-
-                loss.backward()
-                sgd.minimize(loss, grad_clip = value_clip)
-            
-    """
-
-    @imperative_base.no_grad
-    def __init__(self, min_value, max_value=None):
-
-        if min_value is None:
-            assert (max_value > 0.0)
-            min_value = -max_value
-        else:
-            min_value = float(min_value)
-        self.max_value = max_value
-        self.min_value = min_value
-
-    def __str__(self):
-        return "ClipByValue, min = %f, max=%f" % (self.min_value,
-                                                  self.max_value)
-
-    def _clip(self, para_and_grad):
-        out = []
-        for p, g in para_and_grad:
-            if g is None:
-                out.append((p, g))
-                continue
-
-            new_grad = layers.clip(x=g, min=self.min_value, max=self.max_value)
-
-            out.append((p, new_grad))
-
-        return out
-
-
-class GradClipByNorm(GradClipBase):
-    """
-    Clips tensor values to a maximum L2-norm.
-
-    This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`.
-    If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out`
-    will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
-    :math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
-    :math:`Out` equal to :math:`max\_norm`, as shown in the following formula:
-
-    .. math::
-
-        Out = \\frac{max\_norm * X}{norm(X)},
-
-    where :math:`norm(X)` represents the L2 norm of :math:`X`.
-
-    Args:
-        clip_norm (float): The maximum norm value
-
-    Examples:
-        .. code-block:: python
-
-            import numpy as np
-            import paddle
-            import paddle.fluid as fluid
-
-            from paddle.fluid.dygraph.base import to_variable
-            from paddle.fluid.dygraph.nn import FC
-
-            from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
-
-            from paddle.fluid.optimizer import SGDOptimizer
-
-            with fluid.dygraph.guard():
-                norm_clip = GradClipByNorm( 5.0 )
-                sgd = SGDOptimizer(learning_rate=1.0)
-                
-                init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
-
-                fc = FC( "fc", 10)
-
-                out = fc( to_variable(init_value) )
-
-                loss = fluid.layers.reduce_mean( out )
-
-                loss.backward()
-                sgd.minimize(loss, grad_clip = norm_clip)
-
-    """
-
-    @imperative_base.no_grad
-    def __init__(self, clip_norm):
-        self.clip_norm = clip_norm
-
-    def __str__(self):
-        return "ClipByNorm, clip_norm=%f" % self.clip_norm
-
-    def _clip(self, para_and_grad):
-        out = []
-
-        for p, g in para_and_grad:
-            if g is None:
-                out.append((p, g))
-                continue
-            new_g = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
-
-            out.append((p, new_g))
-
-        return out
-
-
-class GradClipByGlobalNorm(GradClipBase):
-    """
-    Clips values of multiple tensors by the ratio of the sum of their norms.
-
-    Given a list of tensors t_list, and a clipping ratio clip_norm, this
-    operation returns a list of clipped tensors list_clipped and the global
-    norm (global_norm) of all tensors in t_list.
-
-    To perform the clipping, the values :math:`t\_list[i]` are set to:
-
-    .. math::
-
-        t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}
-
-    where:
-
-    .. math::
-
-        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
-
-    If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are,
-    otherwise they're all shrunk by the global ratio.
-
-    Args:
-        clip_norm (float): The maximum norm value
-        group_name (str, optional): The group name for this clip.
-
-    Examples:
-        .. code-block:: python
-        
-            import numpy as np
-            import paddle
-            import paddle.fluid as fluid
-
-            from paddle.fluid.dygraph.base import to_variable
-            from paddle.fluid.dygraph.nn import FC
-
-            from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
-
-            from paddle.fluid.optimizer import SGDOptimizer
-
-            with fluid.dygraph.guard():
-                gloabl_norm_clip = GradClipByGlobalNorm( 5.0 )
-                sgd = SGDOptimizer(learning_rate=1.0)
-                
-                init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
-
-                fc = FC( "fc", 10)
-
-                out = fc( to_variable(init_value) )
-
-                loss = fluid.layers.reduce_mean( out )
-
-                loss.backward()
-                sgd.minimize(loss, grad_clip = gloabl_norm_clip)
-   
-
-    """
-
-    @imperative_base.no_grad
-    def __init__(self, max_global_norm):
-        self.max_global_norm = layers.fill_constant(
-            shape=[1], dtype='float32', value=max_global_norm)
-
-    def __str__(self):
-        return "ClipByGlobalNorm, max_global_norm=%f" % (self.max_global_norm)
-
-    def _clip(self, para_and_grad):
-
-        out = []
-
-        norm_arr = []
-        for p, g in para_and_grad:
-            if g is None:
-                continue
-            power = layers.square(g)
-            sum_t = layers.reduce_sum(power)
-            norm_arr.append(sum_t)
-
-        norm_global = layers.concat(norm_arr)
-        norm_global = layers.reduce_sum(norm_global)
-        norm_global = layers.sqrt(norm_global)
-
-        clip_scale = layers.elementwise_div(
-            x=self.max_global_norm,
-            y=layers.elementwise_max(
-                x=norm_global, y=self.max_global_norm))
-
-        for p, g in para_and_grad:
-            if g is None:
-                out.append((p, g))
-                continue
-            new_grad = g * clip_scale
-
-            out.append((p, new_grad))
-
-        return out
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
deleted file mode 100644
index 80ac91575f633ecffbcfff7ad66111b01f86be46..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/evaluator.py
+++ /dev/null
@@ -1,431 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import warnings
-import numpy as np
-
-from . import layers
-from .framework import Program, Variable, program_guard
-from . import unique_name
-from .layer_helper import LayerHelper
-from .initializer import Constant
-from .layers import detection
-
-__all__ = [
-    'ChunkEvaluator',
-    'EditDistance',
-    'DetectionMAP',
-]
-
-
-def _clone_var_(block, var):
-    assert isinstance(var, Variable)
-    return block.create_var(
-        name=var.name,
-        shape=var.shape,
-        dtype=var.dtype,
-        type=var.type,
-        lod_level=var.lod_level,
-        persistable=True)
-
-
-class Evaluator(object):
-    """
-    Warning: better to use the fluid.metrics.* things, more
-    flexible support via pure Python and Operator, and decoupled
-    with executor. Short doc are intended to urge new user
-    start from Metrics.
-
-    Base Class for all evaluators.
-
-    Args:
-        name(str): The name of evaluator. such as, "accuracy". Used for generate
-            temporary variable name.
-        main_program(Program, optional): The evaluator should be added to this
-            main_program. Default default_main_program()
-        startup_program(Program, optional):The parameter should be added to this
-            startup_program. Default default_startup_program()
-
-    Attributes:
-        states(list): The list of state variables. states will be reset to zero
-            when `reset` is invoked.
-        metrics(list): The list of metrics variables. They will be calculate
-            every mini-batch
-    """
-
-    def __init__(self, name, **kwargs):
-        warnings.warn(
-            "The %s is deprecated, because maintain a modified program inside evaluator cause bug easily, please use fluid.metrics.%s instead."
-            % (self.__class__.__name__, self.__class__.__name__), Warning)
-        self.states = []
-        self.metrics = []
-        self.helper = LayerHelper(name, **kwargs)
-
-    def reset(self, executor, reset_program=None):
-        """
-        reset metric states at the begin of each pass/user specified batch
-
-        Args:
-            executor(Executor|ParallelExecutor): a executor for executing the reset_program
-            reset_program(Program): a single Program for reset process
-        """
-        if reset_program is None:
-            reset_program = Program()
-
-        with program_guard(main_program=reset_program):
-            for var in self.states:
-                assert isinstance(var, Variable)
-                g_var = _clone_var_(reset_program.current_block(), var)
-                layers.fill_constant(
-                    shape=g_var.shape, value=0.0, dtype=g_var.dtype, out=g_var)
-
-        executor.run(reset_program)
-
-    def eval(self, executor, eval_program=None):
-        """
-        Evaluate the statistics merged by multiple mini-batches.
-        Args:
-            executor(Executor|ParallelExecutor): a executor for executing the eval_program
-            eval_program(Program): a single Program for eval process
-        """
-        raise NotImplementedError()
-
-    def _create_state(self, suffix, dtype, shape):
-        """
-        Create state variable.
-
-        Args:
-            suffix(str): the state suffix.
-            dtype(str|core.VarDesc.VarType): the state data type
-            shape(tuple|list): the shape of state
-
-        Returns: State variable
-
-        """
-        state = self.helper.create_variable(
-            name="_".join([unique_name.generate(self.helper.name), suffix]),
-            persistable=True,
-            dtype=dtype,
-            shape=shape)
-        self.states.append(state)
-        return state
-
-
-class ChunkEvaluator(Evaluator):
-    """
-    Warning: This would be deprecated in the future. Please use fluid.metrics.ChunkEvaluator 
-    instead.
-
-    Accumulate counter numbers output by chunk_eval from mini-batches and
-    compute the precision recall and F1-score using the accumulated counter
-    numbers.
-    For some basics of chunking, please refer to
-    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
-
-    Args:
-        input (Variable): prediction output of the network.
-        label (Variable): label of the test data set.
-        chunk_scheme (str): can be IOB/IOE/IOBES and IO. See the chunk_eval op for details.
-        num_chunk_types (int): the number of chunk type.
-        excluded_chunk_types (list): A list including chunk type ids, indicating chunk types that are not counted.
-
-    Returns:
-        tuple: tuple containing: precision, recall, f1_score
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.executor(place)
-            evaluator = fluid.Evaluator.ChunkEvaluator(input, label)
-            for epoch in PASS_NUM:
-                evaluator.reset(exe)
-                for data in batches:
-                    loss = exe.run(fetch_list=[cost])
-                distance, instance_error = distance_evaluator.eval(exe)
-    """
-
-    def __init__(
-            self,
-            input,
-            label,
-            chunk_scheme,
-            num_chunk_types,
-            excluded_chunk_types=None, ):
-        super(ChunkEvaluator, self).__init__("chunk_eval")
-        main_program = self.helper.main_program
-        if main_program.current_block().idx != 0:
-            raise ValueError("You can only invoke Evaluator in root block")
-
-        self.num_infer_chunks = self._create_state(
-            dtype='int64', shape=[1], suffix='num_infer_chunks')
-        self.num_label_chunks = self._create_state(
-            dtype='int64', shape=[1], suffix='num_label_chunks')
-        self.num_correct_chunks = self._create_state(
-            dtype='int64', shape=[1], suffix='num_correct_chunks')
-        precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
-            input=input,
-            label=label,
-            chunk_scheme=chunk_scheme,
-            num_chunk_types=num_chunk_types,
-            excluded_chunk_types=excluded_chunk_types, )
-        layers.sums(
-            input=[self.num_infer_chunks, num_infer_chunks],
-            out=self.num_infer_chunks)
-        layers.sums(
-            input=[self.num_label_chunks, num_label_chunks],
-            out=self.num_label_chunks)
-        layers.sums(
-            input=[self.num_correct_chunks, num_correct_chunks],
-            out=self.num_correct_chunks)
-
-        self.metrics.extend([precision, recall, f1_score])
-
-    def eval(self, executor, eval_program=None):
-        if eval_program is None:
-            eval_program = Program()
-        block = eval_program.current_block()
-        num_infer_chunks, num_label_chunks, num_correct_chunks = executor.run(
-            eval_program,
-            fetch_list=[_clone_var_(block, state) for state in self.states])
-        num_infer_chunks = num_infer_chunks[0]
-        num_label_chunks = num_label_chunks[0]
-        num_correct_chunks = num_correct_chunks[0]
-        precision = float(
-            num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
-        recall = float(
-            num_correct_chunks) / num_label_chunks if num_label_chunks else 0
-        f1_score = float(2 * precision * recall) / (
-            precision + recall) if num_correct_chunks else 0
-        return np.array(
-            [precision], dtype='float32'), np.array(
-                [recall], dtype='float32'), np.array(
-                    [f1_score], dtype='float32')
-
-
-class EditDistance(Evaluator):
-    """
-    Warning: This would be deprecated in the future. Please use fluid.metrics.EditDistance
-    instead.
-    Accumulate edit distance sum and sequence number from mini-batches and
-    compute the average edit_distance and instance error of all batches.
-
-    Args:
-        input: the sequences predicted by network.
-        label: the target sequences which must has same sequence count
-        with input.
-        ignored_tokens(list of int): Tokens that should be removed before
-        calculating edit distance.
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.executor(place)
-            distance_evaluator = fluid.Evaluator.EditDistance(input, label)
-            for epoch in PASS_NUM:
-                distance_evaluator.reset(exe)
-                for data in batches:
-                    loss = exe.run(fetch_list=[cost])
-                distance, instance_error = distance_evaluator.eval(exe)
-
-        In the above example:
-        'distance' is the average of the edit distance in a pass.
-        'instance_error' is the instance error rate in a pass.
-
-    """
-
-    def __init__(self, input, label, ignored_tokens=None, **kwargs):
-        super(EditDistance, self).__init__("edit_distance", **kwargs)
-        main_program = self.helper.main_program
-        if main_program.current_block().idx != 0:
-            raise ValueError("You can only invoke Evaluator in root block")
-
-        self.total_distance = self._create_state(
-            dtype='float32', shape=[1], suffix='total_distance')
-        self.seq_num = self._create_state(
-            dtype='int64', shape=[1], suffix='seq_num')
-        self.instance_error = self._create_state(
-            dtype='int64', shape=[1], suffix='instance_error')
-        distances, seq_num = layers.edit_distance(
-            input=input, label=label, ignored_tokens=ignored_tokens)
-
-        zero = layers.fill_constant(shape=[1], value=0.0, dtype='float32')
-        compare_result = layers.equal(distances, zero)
-        compare_result_int = layers.cast(x=compare_result, dtype='int64')
-        seq_right_count = layers.reduce_sum(compare_result_int)
-        instance_error_count = layers.elementwise_sub(
-            x=seq_num, y=seq_right_count)
-        total_distance = layers.reduce_sum(distances)
-        layers.sums(
-            input=[self.total_distance, total_distance],
-            out=self.total_distance)
-        layers.sums(input=[self.seq_num, seq_num], out=self.seq_num)
-        layers.sums(
-            input=[self.instance_error, instance_error_count],
-            out=self.instance_error)
-        self.metrics.append(total_distance)
-        self.metrics.append(instance_error_count)
-
-    def eval(self, executor, eval_program=None):
-        if eval_program is None:
-            eval_program = Program()
-        block = eval_program.current_block()
-        with program_guard(main_program=eval_program):
-            total_distance = _clone_var_(block, self.total_distance)
-            seq_num = _clone_var_(block, self.seq_num)
-            instance_error = _clone_var_(block, self.instance_error)
-            seq_num = layers.cast(x=seq_num, dtype='float32')
-            instance_error = layers.cast(x=instance_error, dtype='float32')
-            avg_distance = layers.elementwise_div(x=total_distance, y=seq_num)
-            avg_instance_error = layers.elementwise_div(
-                x=instance_error, y=seq_num)
-            result = executor.run(
-                eval_program, fetch_list=[avg_distance, avg_instance_error])
-        return np.array(result[0]), np.array(result[1])
-
-
-class DetectionMAP(Evaluator):
-    """
-    Warning: This would be deprecated in the future. Please use fluid.metrics.DetectionMAP
-    instead.
-    Calculate the detection mean average precision (mAP).
-
-    The general steps are as follows:
-    1. calculate the true positive and false positive according to the input
-        of detection and labels.
-    2. calculate mAP value, support two versions: '11 point' and 'integral'.
-
-    Please get more information from the following articles:
-      https://sanchom.wordpress.com/tag/average-precision/
-      https://arxiv.org/abs/1512.02325
-
-    Args:
-        input (Variable): The detection results, which is a LoDTensor with shape
-            [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
-        gt_label (Variable): The ground truth label index, which is a LoDTensor
-            with shape [N, 1].
-        gt_box (Variable): The ground truth bounding box (bbox), which is a
-            LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
-        gt_difficult (Variable|None): Whether this ground truth is a difficult
-            bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
-            it means all the ground truth labels are not difficult bbox.
-        class_num (int): The class number.
-        background_label (int): The index of background label, the background
-            label will be ignored. If set to -1, then all categories will be
-            considered, 0 by default.
-        overlap_threshold (float): The threshold for deciding true/false
-            positive, 0.5 by default.
-        evaluate_difficult (bool): Whether to consider difficult ground truth
-            for evaluation, True by default. This argument does not work when
-            gt_difficult is None.
-        ap_version (string): The average precision calculation ways, it must be
-            'integral' or '11point'. Please check
-            https://sanchom.wordpress.com/tag/average-precision/ for details.
-            - 11point: the 11-point interpolated average precision.
-            - integral: the natural integral of the precision-recall curve.
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.executor(place)
-            map_evaluator = fluid.Evaluator.DetectionMAP(input,
-                gt_label, gt_box, gt_difficult)
-            cur_map, accum_map = map_evaluator.get_map_var()
-            fetch = [cost, cur_map, accum_map]
-            for epoch in PASS_NUM:
-                map_evaluator.reset(exe)
-                for data in batches:
-                    loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
-
-        In the above example:
-
-        'cur_map_v' is the mAP of current mini-batch.
-        'accum_map_v' is the accumulative mAP of one pass.
-    """
-
-    def __init__(self,
-                 input,
-                 gt_label,
-                 gt_box,
-                 gt_difficult=None,
-                 class_num=None,
-                 background_label=0,
-                 overlap_threshold=0.5,
-                 evaluate_difficult=True,
-                 ap_version='integral'):
-        super(DetectionMAP, self).__init__("map_eval")
-
-        gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
-        if gt_difficult:
-            gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
-            label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
-        else:
-            label = layers.concat([gt_label, gt_box], axis=1)
-
-        # calculate mean average precision (mAP) of current mini-batch
-        map = detection.detection_map(
-            input,
-            label,
-            class_num,
-            background_label,
-            overlap_threshold=overlap_threshold,
-            evaluate_difficult=evaluate_difficult,
-            ap_version=ap_version)
-
-        self._create_state(dtype='int32', shape=None, suffix='accum_pos_count')
-        self._create_state(dtype='float32', shape=None, suffix='accum_true_pos')
-        self._create_state(
-            dtype='float32', shape=None, suffix='accum_false_pos')
-
-        self.has_state = None
-        var = self.helper.create_variable(
-            persistable=True, dtype='int32', shape=[1])
-        self.helper.set_variable_initializer(
-            var, initializer=Constant(value=int(0)))
-        self.has_state = var
-
-        # calculate accumulative mAP
-        accum_map = detection.detection_map(
-            input,
-            label,
-            class_num,
-            background_label,
-            overlap_threshold=overlap_threshold,
-            evaluate_difficult=evaluate_difficult,
-            has_state=self.has_state,
-            input_states=self.states,
-            out_states=self.states,
-            ap_version=ap_version)
-
-        layers.fill_constant(
-            shape=self.has_state.shape,
-            value=1,
-            dtype=self.has_state.dtype,
-            out=self.has_state)
-
-        self.cur_map = map
-        self.accum_map = accum_map
-
-    def get_map_var(self):
-        return self.cur_map, self.accum_map
-
-    def reset(self, executor, reset_program=None):
-        if reset_program is None:
-            reset_program = Program()
-        with program_guard(main_program=reset_program):
-            var = _clone_var_(reset_program.current_block(), self.has_state)
-            layers.fill_constant(
-                shape=var.shape, value=0, dtype=var.dtype, out=var)
-        executor.run(reset_program)
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
deleted file mode 100644
index ed0479be840f9703135f34ca99d327a008ff557d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/executor.py
+++ /dev/null
@@ -1,983 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import logging
-import os
-import multiprocessing
-import sys
-import warnings
-import numpy as np
-from .wrapped_decorator import signature_safe_contextmanager
-import six
-from .framework import Program, default_main_program, Variable
-from . import core
-from . import compiler
-from .. import compat as cpt
-from .trainer_factory import TrainerFactory
-
-__all__ = ['Executor', 'global_scope', 'scope_guard']
-
-g_scope = core.Scope()
-InferNativeConfig = core.NativeConfig
-InferAnalysisConfig = core.AnalysisConfig
-
-
-def global_scope():
-    """
-    Get the global/default scope instance. There are a lot of APIs use
-    :code:`global_scope` as its default value, e.g., :code:`Executor.run`
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
-          numpy.array(fluid.global_scope().find_var("data").get_tensor())
-
-    Returns:
-        Scope: The global/default scope instance.
-    """
-    return g_scope
-
-
-def _switch_scope(scope):
-    global g_scope
-    ex = g_scope
-    g_scope = scope
-    return ex
-
-
-@signature_safe_contextmanager
-def scope_guard(scope):
-    """
-    Change the global/default scope instance by Python `with` statement. All
-    variable in runtime will assigned to the new scope.
-
-    Args:
-        scope: The new global/default scope.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy
-
-            new_scope = fluid.Scope()
-            with fluid.scope_guard(new_scope):
-                 fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
-            numpy.array(new_scope.find_var("data").get_tensor())
-    """
-
-    ex = _switch_scope(scope)
-    yield
-    _switch_scope(ex)
-
-
-def as_numpy(tensor):
-    """
-    Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information.
-    For higher dimensional sequence data, please use LoDTensor directly.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          new_scope = fluid.Scope()
-          with fluid.scope_guard(new_scope):
-              fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
-          tensor = new_scope.find_var("data").get_tensor()
-          fluid.executor.as_numpy(tensor) # or numpy.array(new_scope.find_var("data").get_tensor())
-
-    Args:
-       tensor(Variable): a instance of Tensor
-
-    Returns:
-        numpy.ndarray
-    """
-    if isinstance(tensor, core.LoDTensorArray):
-        return [as_numpy(t) for t in tensor]
-    if isinstance(tensor, list):
-        return [as_numpy(t) for t in tensor]
-    assert isinstance(tensor, core.LoDTensor)
-    lod = tensor.lod()
-    if len(lod) > 0:
-        raise RuntimeError("Some of your fetched tensors hold LoD information. \
-            They can not be completely cast to Python ndarray. \
-            Please set the parameter 'return_numpy' as 'False' to \
-            return LoDTensor itself directly.")
-    if tensor._is_initialized():
-        return np.array(tensor)
-    else:
-        return None
-
-
-def has_feed_operators(block, feed_targets, feed_holder_name):
-    """ Check whether the block already has feed operators.
-
-    Return false if the block does not have any feed operators.
-    If some feed operators have been prepended to the block, check that
-    the info contained in these feed operators matches the feed_targets
-    and feed_holder_name. Raise exception when any mismatch is found.
-    Return true when the block has feed operators with matching info.
-
-    Args:
-        block: a block instance (typically global block of a program)
-        feed_targets: a dictionary of {feed_target_name: feed_target_data}
-        feed_holder_name: the name of the variable that holds the data of
-            all feed targets. The type of this feed_holder variable is
-            FEED_MINIBATCH, which is essentially vector<LoDTensor>.
-
-    Returns:
-        A boolean value that indicates whether a block has feed operators
-        that match the info contained in feed_targets and feed_holder_name.
-    """
-
-    feed_count = 0
-    for op in block.ops:
-        if op.desc.type() == 'feed':
-            feed_count += 1
-            assert op.desc.input('X')[0] == feed_holder_name
-            feed_target_name = op.desc.output('Out')[0]
-            if feed_target_name not in feed_targets:
-                raise Exception("'feed_targets' does not have {} variable".
-                                format(feed_target_name))
-        else:
-            break
-    if feed_count > 0 and feed_count != len(feed_targets):
-        raise Exception(
-            "Feed operators in program desc do not match 'feed_targets'")
-    return feed_count > 0
-
-
-def has_fetch_operators(block, fetch_targets, fetch_holder_name):
-    """ Check whether the block already has fetch operators.
-
-    Return false if the block does not have any fetch operators.
-    If some fetch operators have been appended to the block, check that
-    the info contained in these fetch operators matches the fetch_targets
-    and fetch_holder_name. Raise exception when any mismatch is found.
-    Return true when the block has fetch operators with matching info.
-
-    Args:
-        block: a block instance (typically global block of a program)
-        fetch_targets: a dictionary of {fetch_target_name: fetch_target_data}
-        fetch_holder_name: the name of the variable that holds the data of
-            all fetch targets. The type of this fetch_holder variable is
-            FETCH_LIST, which is essentially vector<LoDTensor>.
-
-    Return:
-        A boolean value that indicates whether a block has fetch operators
-        that match the info contained in fetch_targets and fetch_holder_name.
-    """
-
-    fetch_count = 0
-    for op in block.ops:
-        if op.desc.type() == 'fetch':
-            fetch_count += 1
-            assert op.desc.output('Out')[0] == fetch_holder_name
-            fetch_target_name = op.desc.input('X')[0]
-            if fetch_target_name not in [
-                    var.desc.name() for var in fetch_targets
-            ]:
-                raise Exception("'fetch_targets' does not have {} variable".
-                                format(fetch_target_name))
-            idx = op.desc.attr('col')
-            assert fetch_target_name == fetch_targets[idx].desc.name()
-    if fetch_count > 0 and fetch_count != len(fetch_targets):
-        raise Exception(
-            "Fetch operators in program desc do not match 'fetch_targets'")
-    return fetch_count > 0
-
-
-def _fetch_var(name, scope=None, return_numpy=True):
-    """
-    Fetch the value of the variable with the given name from the
-    given scope.
-
-    Args:
-        name(str): name of the variable. Typically, only persistable variables
-            can be found in the scope used for running the program.
-        scope(core.Scope|None): scope object. It should be the scope where
-            you pass to Executor.run() when running your program.
-            If None, global_scope() will be used. Default None.
-        return_numpy(bool): whether convert the tensor to numpy.ndarray.
-            Default True.
-
-    Returns:
-       LodTensor|numpy.ndarray
-    """
-    assert isinstance(name, str)
-    if scope is None:
-        scope = global_scope()
-    assert isinstance(scope, core._Scope)
-
-    var = scope.find_var(name)
-    assert var is not None, (
-        "Cannot find " + name + " in scope. Perhaps you need to make the"
-        " variable persistable by using var.persistable = True in your"
-        " program.")
-    tensor = var.get_tensor()
-    if return_numpy:
-        tensor = as_numpy(tensor)
-    return tensor
-
-
-def _to_name_str(var):
-    if isinstance(var, Variable):
-        return var.desc.name()
-    elif isinstance(var, str):
-        return var
-    elif isinstance(var, six.string_types):
-        return str(var)
-    else:
-        raise TypeError(str(var) + " should be Variable or str")
-
-
-def _get_strong_program_cache_key(program, feed, fetch_list):
-    return str(id(program)) + _get_program_cache_key(feed, fetch_list)
-
-
-def _get_program_cache_key(feed, fetch_list):
-    feed_var_names = list(feed.keys())
-    fetch_var_names = list(map(_to_name_str, fetch_list))
-
-    return str(feed_var_names + fetch_var_names)
-
-
-def _as_lodtensor(data, place):
-    """
-        Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
-        For higher dimensional sequence data, please use LoDTensor directly.
-
-        Examples:
-            >>> import paddle.fluid as fluid
-            >>> place = fluid.CPUPlace()
-            >>> exe = fluid.executor(place)
-            >>> data = np.array(size=(100, 200, 300))
-            >>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data)
-            >>>     ...
-
-        Args:
-            data(numpy.ndarray): a instance of array
-
-        Returns:
-            LoDTensor
-        """
-    if isinstance(data, list):
-        raise RuntimeError("Some of your feed data hold LoD information. \
-                They can not be completely cast from a list of Python \
-                ndarray to LoDTensor. Please convert data to LoDTensor \
-                directly before feeding the data.\
-                ")
-    # single tensor case
-    tensor = core.LoDTensor()
-    tensor.set(data, place)
-    return tensor
-
-
-class Executor(object):
-    """
-    An Executor in Python, supports single/multiple-GPU running,
-    and single/multiple-CPU running. Python executor takes a program,
-    adds feed operators and fetch operators to this program according
-    to feed map and fetch_list. Feed map provides input data for the
-    program. fetch_list provides the variables(or names) that user wants
-    to get after program runs. Note: the executor will run all operators
-    in the program but not only the operators dependent by the fetch_list.
-    It stores the global variables into the global scope, and creates a
-    local scope for the temporary variables. The contents in local scope
-    may be discarded after every minibatch forward/backward finished.
-    But the global scope variables will be persistent through different runs.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import paddle.fluid.compiler as compiler
-          import numpy
-          import os
-
-          use_cuda = True
-          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-          exe = fluid.Executor(place)
-
-          train_program = fluid.Program()
-          startup_program = fluid.Program()
-          with fluid.program_guard(train_program, startup_program):
-              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-              hidden = fluid.layers.fc(input=data, size=10)
-              loss = fluid.layers.mean(hidden)
-              fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
-
-          # Run the startup program once and only once.
-          # Not need to optimize/compile the startup program.
-          startup_program.random_seed=1
-          exe.run(startup_program)
-
-          # Run the main program directly without compile.
-          x = numpy.random.random(size=(10, 1)).astype('float32')
-          loss_data, = exe.run(train_program,
-                               feed={"X": x},
-                               fetch_list=[loss.name])
-
-          # Or, compiled the program and run. See `CompiledProgram`
-          # for more detail.
-          # NOTE: If you use CPU to run the program, you need
-          # to specify the CPU_NUM, otherwise, fluid will use
-          # all the number of the logic core as the CPU_NUM,
-          # in that case, the batch size of the input should be
-          # greater than CPU_NUM, if not, the process will be
-          # failed by an exception.
-          if not use_cuda:
-              os.environ['CPU_NUM'] = str(2)
-
-          compiled_prog = compiler.CompiledProgram(
-              train_program).with_data_parallel(
-              loss_name=loss.name)
-          loss_data, = exe.run(compiled_prog,
-                               feed={"X": x},
-                               fetch_list=[loss.name])
-
-    Args:
-        place(fluid.CPUPlace|fluid.CUDAPlace(n)): indicate the executor run on which device.
-
-    """
-
-    def __init__(self, place):
-        self.place = place
-        self.program_caches = dict()
-        self.ctx_caches = dict()
-        self.scope_caches = dict()
-        self.var_caches = dict()
-        p = core.Place()
-        p.set_place(self.place)
-        self._default_executor = core.Executor(p)
-        self._closed = False
-
-    def _get_var_cache(self, program_cache_key):
-        return self.var_caches.get(program_cache_key, None)
-
-    def _get_scope_cache(self, program_cache_key):
-        return self.scope_caches.get(program_cache_key, None)
-
-    def _get_ctx_cache(self, program_cache_key):
-        return self.ctx_caches.get(program_cache_key, None)
-
-    def _get_program_cache(self, program_cache_key):
-        return self.program_caches.get(program_cache_key, None)
-
-    def _add_program_cache(self, program_cache_key, program):
-        self.program_caches[program_cache_key] = program
-
-    def _add_ctx_cache(self, ctx_cache_key, ctx):
-        self.ctx_caches[ctx_cache_key] = ctx
-
-    def _add_scope_cache(self, scope_cache_key, scope):
-        self.scope_caches[scope_cache_key] = scope
-
-    def _add_var_cache(self, var_cache_key, var):
-        self.var_caches[var_cache_key] = var
-
-    def _add_feed_fetch_ops(self, program, feed, fetch_list, feed_var_name,
-                            fetch_var_name):
-        tmp_program = program.clone()
-
-        global_block = tmp_program.global_block()
-
-        if feed_var_name in global_block.vars:
-            feed_var = global_block.var(feed_var_name)
-        else:
-            feed_var = global_block.create_var(
-                name=feed_var_name,
-                type=core.VarDesc.VarType.FEED_MINIBATCH,
-                persistable=True)
-
-        if fetch_var_name in global_block.vars:
-            fetch_var = global_block.var(fetch_var_name)
-        else:
-            fetch_var = global_block.create_var(
-                name=fetch_var_name,
-                type=core.VarDesc.VarType.FETCH_LIST,
-                persistable=True)
-
-        # prepend feed operators
-        if not has_feed_operators(global_block, feed, feed_var_name):
-            for i, name in enumerate(feed):
-                out = global_block.var(name)
-                global_block._prepend_op(
-                    type='feed',
-                    inputs={'X': [feed_var]},
-                    outputs={'Out': [out]},
-                    attrs={'col': i})
-
-        # append fetch_operators
-        if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
-            for i, var in enumerate(fetch_list):
-                assert isinstance(var, Variable) or isinstance(
-                    var, six.string_types), (
-                        "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
-                global_block.append_op(
-                    type='fetch',
-                    inputs={'X': [var]},
-                    outputs={'Out': [fetch_var]},
-                    attrs={'col': i})
-
-        return tmp_program
-
-    def _feed_data(self, program, feed, feed_var_name, scope):
-        # feed var to framework
-        for op in program.global_block().ops:
-            if op.desc.type() == 'feed':
-                feed_target_name = op.desc.output('Out')[0]
-                cur_feed = feed[feed_target_name]
-                if not isinstance(cur_feed, core.LoDTensor):
-                    cur_feed = _as_lodtensor(cur_feed, self.place)
-                idx = op.desc.attr('col')
-                core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
-            else:
-                break
-
-    def _fetch_data(self, fetch_list, fetch_var_name, scope):
-        outs = [
-            core.get_fetch_variable(scope, fetch_var_name, i)
-            for i in six.moves.range(len(fetch_list))
-        ]
-        return outs
-
-    '''
-    TODO(typhoonzero): Define "no longer use" meaning? Can user create
-    a new Executor for the same program and run?
-    TODO(panyx0718): Why ParallelExecutor doesn't have close?
-    '''
-
-    def close(self):
-        """
-        Close this executor.
-
-        You can no longer use this executor after calling this method.
-        For the distributed training, this method would free the resource
-        on PServers related to the current Trainer.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-
-              cpu = fluid.CPUPlace()
-              exe = fluid.Executor(cpu)
-              # execute training or testing
-              exe.close()
-        """
-        if not self._closed:
-            self._default_executor.close()
-            self._closed = True
-
-    def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
-                      return_numpy):
-        exe = program._executor
-        if isinstance(feed, dict):
-            feed_tensor_dict = dict()
-            for feed_name in feed:
-                feed_tensor = feed[feed_name]
-                if not isinstance(feed_tensor, core.LoDTensor):
-                    feed_tensor = core.LoDTensor()
-                    # always set to CPU place, since the tensor need to be split
-                    # it is fast in CPU
-                    assert isinstance( feed[feed_name], np.ndarray ), \
-                        "The input({}) should be numpy.array, but not {}.".format(
-                        feed_name, type(feed[feed_name]))
-                    feed_tensor.set(feed[feed_name], core.CPUPlace())
-                feed_tensor_dict[feed_name] = feed_tensor
-
-            exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict)
-        elif isinstance(feed, list) or isinstance(feed, tuple):
-            if len(feed) != len(program._places):
-                raise ValueError(
-                    "Feed a list of tensor, the list should be the same size as places"
-                )
-
-            res = list()
-            for i, each in enumerate(feed):
-                if not isinstance(each, dict):
-                    raise TypeError(
-                        "Each element of feed list should be a dict")
-                res_dict = dict()
-                for feed_name in each:
-                    tensor = each[feed_name]
-                    if not isinstance(tensor, core.LoDTensor):
-                        tmp = core.LoDTensor()
-                        assert isinstance(each[feed_name], np.ndarray), \
-                            "The input({}) should be numpy.array, but not {}.".format(
-                            feed_name, type(each[feed_name]))
-                        tmp.set(tensor, program._places[i])
-                        tensor = tmp
-                    res_dict[feed_name] = tensor
-                res.append(res_dict)
-            exe.feed_tensors_into_local_scopes(res)
-
-        fetch_var_names = list(map(_to_name_str, fetch_list))
-        tensors = exe.run(fetch_var_names)._move_to_list()
-        return as_numpy(tensors) if return_numpy else tensors
-
-    def run(self,
-            program=None,
-            feed=None,
-            fetch_list=None,
-            feed_var_name='feed',
-            fetch_var_name='fetch',
-            scope=None,
-            return_numpy=True,
-            use_program_cache=False):
-        """
-        Run program by this Executor. Feed data by feed map, fetch result by
-        fetch_list. Python executor takes a program, add feed operators and
-        fetch operators to this program according to feed map and fetch_list.
-        Feed map provides input data for the program. fetch_list provides
-        the variables(or names) that user want to get after program run.
-
-        Note: the executor will run all operators in the program but not
-        only the operators dependent by the fetch_list.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              import numpy
-
-              # First create the Executor.
-              place = fluid.CPUPlace() # fluid.CUDAPlace(0)
-              exe = fluid.Executor(place)
-
-              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-              hidden = fluid.layers.fc(input=data, size=10)
-              loss = fluid.layers.mean(hidden)
-              adam = fluid.optimizer.Adam()
-              adam.minimize(loss)
-
-              # Run the startup program once and only once.
-              exe.run(fluid.default_startup_program())
-
-              x = numpy.random.random(size=(10, 1)).astype('float32')
-              outs = exe.run(feed={'X': x},
-                             fetch_list=[loss.name])
-
-        Args:
-            program(Program|CompiledProgram): the program that need to run,
-                if not provided, then default_main_program (not compiled) will be used.
-            feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData}
-            fetch_list(list): a list of variable or variable names that user 
-                wants to get, this method will return them according to this list.
-            feed_var_name(str): the name for the input variable of 
-                feed Operator.
-            fetch_var_name(str): the name for the output variable of 
-                fetch Operator.
-            scope(Scope): the scope used to run this program, you can switch 
-                it to different scope. default is global_scope
-            return_numpy(bool): if convert the fetched tensor to numpy
-            use_program_cache(bool): whether to use the cached program 
-                settings across batches. Setting it be true would be faster 
-                only when (1) the program is not compiled with data parallel, 
-                and (2) program, feed variable names and fetch_list variable 
-                names do not changed compared to the last step. 
-                
-        Returns:
-
-            list(numpy.array): fetch result according to fetch_list.
-        """
-        try:
-            return self._run_impl(
-                program=program,
-                feed=feed,
-                fetch_list=fetch_list,
-                feed_var_name=feed_var_name,
-                fetch_var_name=fetch_var_name,
-                scope=scope,
-                return_numpy=return_numpy,
-                use_program_cache=use_program_cache)
-        except Exception as e:
-            if not isinstance(e, core.EOFException):
-                print("!!!A non-EOF exception is thrown.")
-            six.reraise(*sys.exc_info())
-
-    def _run_impl(self, program, feed, fetch_list, feed_var_name,
-                  fetch_var_name, scope, return_numpy, use_program_cache):
-        if self._closed:
-            raise RuntimeError("Attempted to use a closed Executor")
-
-        use_default_main_program = program is None
-        if program is None:
-            program = default_main_program()
-        if isinstance(program, Program) and \
-                        len(program.global_block().ops) == 0:
-            error_info = "The current program is empty."
-            if use_default_main_program:
-                error_info += " Maybe you should pass the Program or the CompiledProgram manually."
-            warnings.warn(error_info)
-
-        if scope is None:
-            scope = global_scope()
-
-        if fetch_list is not None:
-            if isinstance(fetch_list, Variable) or isinstance(fetch_list, str):
-                fetch_list = [fetch_list]
-            assert isinstance(fetch_list, tuple) or isinstance(fetch_list, list), \
-                "Currently , The fetch_list type only should be list or tuple, \n"\
-                "but the input type is {}. For more information please refer to \n"\
-                "the executor.run(...).".format(type(fetch_list))
-        else:
-            fetch_list = []
-
-        compiled = isinstance(program, compiler.CompiledProgram)
-        # For backward compatibility, run directly.
-        if not compiled:
-            return self._run_program(
-                program,
-                feed=feed,
-                fetch_list=fetch_list,
-                feed_var_name=feed_var_name,
-                fetch_var_name=fetch_var_name,
-                scope=scope,
-                return_numpy=return_numpy,
-                use_program_cache=use_program_cache)
-
-        program._compile(scope, self.place)
-        if program._is_inference:
-            return self._run_inference(program._executor, feed)
-        else:
-            return self._run_parallel(
-                program,
-                scope=scope,
-                feed=feed,
-                fetch_list=fetch_list,
-                fetch_var_name=fetch_var_name,
-                return_numpy=return_numpy)
-
-    def _run_program(self, program, feed, fetch_list, feed_var_name,
-                     fetch_var_name, scope, return_numpy, use_program_cache):
-
-        if feed is None:
-            feed = {}
-        elif isinstance(feed, (list, tuple)):
-            assert len(feed) == 1, "Not compiled with data parallel"
-            feed = feed[0]
-
-        if not isinstance(feed, dict):
-            raise TypeError(
-                "feed requires dict as its Parameter. But you passed in %s" %
-                (type(feed)))
-
-        assert program is not None, "The program should not be Empty"
-        if not isinstance(program, Program):
-            raise TypeError(
-                "Executor requires Program as its Parameter. But you passed in %s"
-                % (type(program)))
-
-        if use_program_cache:
-            cache_key = _get_strong_program_cache_key(program, feed, fetch_list)
-            cached_program = self._get_program_cache(cache_key)
-            cached_ctx = self._get_ctx_cache(cache_key)
-            cached_scope = self._get_scope_cache(cache_key)
-            cached_var = self._get_var_cache(cache_key)
-            if cached_program is None:
-                cached_program = self._add_feed_fetch_ops(
-                    program=program,
-                    feed=feed,
-                    fetch_list=fetch_list,
-                    feed_var_name=feed_var_name,
-                    fetch_var_name=fetch_var_name)
-                self._add_program_cache(cache_key, cached_program)
-                fetch_list_str = list(map(_to_name_str, fetch_list))
-                cached_ctx = self._default_executor.prepare_ctx_cache(
-                    cached_program.desc, 0, fetch_list_str, False)
-                cached_var = self._default_executor.create_variables(
-                    cached_program.desc, scope, 0)
-                # currently, we cache program, vars, sub_scope here
-                # we suppose that in a life cycle of training, a user
-                # will not create many programs. So, here the basic
-                # rule of caching is to cache all unseen (program, var, scope)
-                # when a user use use_program_cache.
-                cached_scope = scope.new_scope()
-                self._add_ctx_cache(cache_key, cached_ctx)
-                self._add_var_cache(cache_key, cached_var)
-                self._add_scope_cache(cache_key, cached_scope)
-            program = cached_program
-            ctx = cached_ctx
-            scope = cached_scope
-            var = cached_var
-        else:
-            program = self._add_feed_fetch_ops(
-                program=program,
-                feed=feed,
-                fetch_list=fetch_list,
-                feed_var_name=feed_var_name,
-                fetch_var_name=fetch_var_name)
-
-        self._feed_data(program, feed, feed_var_name, scope)
-        if not use_program_cache:
-            self._default_executor.run(program.desc, scope, 0, True, True,
-                                       fetch_var_name)
-        else:
-            self._default_executor.run_cached_prepared_ctx(ctx, scope, False,
-                                                           False, False)
-        arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
-        tensors = arr._move_to_list()
-        if return_numpy:
-            return as_numpy(tensors)
-        else:
-            return tensors
-
-    def _run_inference(self, exe, feed):
-        return exe.run(feed)
-
-    def _dump_debug_info(self, program=None, trainer=None):
-        with open(str(id(program)) + "_train_desc.prototxt", "w") as fout:
-            fout.write(str(trainer))
-        if program._fleet_opt:
-            with open("fleet_desc.prototxt", "w") as fout:
-                fout.write(str(program._fleet_opt["fleet_desc"]))
-
-    def _adjust_pipeline_resource(self, pipeline_opt, dataset, pipeline_num):
-        filelist_length = len(dataset.dataset.get_filelist())
-        if filelist_length < pipeline_num:
-            pipeline_num = filelist_length
-            print(
-                "Pipeline training: setting the pipeline num to %d is enough because there are only %d files"
-                % (filelist_length, filelist_length))
-        if filelist_length < pipeline_num * pipeline_opt["concurrency_list"][0]:
-            print(
-                "Pipeline training: setting the 1st element in concurrency_list to %d is enough because there are only %d files"
-                % (filelist_length // pipeline_num, filelist_length))
-            pipeline_opt["concurrency_list"][
-                0] = filelist_length // pipeline_num
-        dataset.set_thread(pipeline_opt["concurrency_list"][0] * pipeline_num)
-        return pipeline_num
-
-    def _prepare_trainer(self,
-                         program=None,
-                         dataset=None,
-                         scope=None,
-                         thread=0,
-                         debug=False,
-                         fetch_list=None,
-                         fetch_info=None,
-                         print_period=100):
-        if scope is None:
-            scope = global_scope()
-        if fetch_list is None:
-            fetch_list = []
-        if fetch_info is None:
-            fetch_info = []
-        assert len(fetch_list) == len(fetch_info)
-        compiled = isinstance(program, compiler.CompiledProgram)
-        if not compiled:
-            # TODO: Need a better way to distinguish and specify different execution mode
-            if program._pipeline_opt:
-                trainer = TrainerFactory()._create_trainer(
-                    program._pipeline_opt)
-            else:
-                trainer = TrainerFactory()._create_trainer(program._fleet_opt)
-            trainer._set_program(program)
-        else:
-            if program._pipeline_opt:
-                trainer = TrainerFactory()._create_trainer(
-                    program.program._pipeline_opt)
-            else:
-                trainer = TrainerFactory()._create_trainer(
-                    program.program._fleet_opt)
-            trainer._set_program(program.program)
-
-        if thread <= 0:
-            if dataset.thread_num <= 0:
-                raise RuntimeError(
-                    "You should set thread num first, either in Dataset"
-                    "or in Executor.train_from_dataset")
-            else:
-                trainer._set_thread(dataset.thread_num)
-        else:
-            trainer._set_thread(thread)
-
-        trainer._set_debug(debug)
-        trainer._set_fetch_var_and_info(fetch_list, fetch_info, print_period)
-        return scope, trainer
-
-    def infer_from_dataset(self,
-                           program=None,
-                           dataset=None,
-                           scope=None,
-                           thread=0,
-                           debug=False,
-                           fetch_list=None,
-                           fetch_info=None,
-                           print_period=100):
-        """
-        The document of infer_from_dataset is almost the same as
-        train_from_dataset, except that in distributed training,
-        push gradients will be disabled in infer_from_dataset.
-        infer_from_dataset() can be used for evaluation in multi-thread
-        very easily.
-
-        Args:
-            program(Program|CompiledProgram): the program that needs to be run,
-               if not provided, then default_main_program (not compiled) will be used.
-            dataset(paddle.fluid.Dataset): dataset created outside this function,
-               a user should provide a well-defined dataset before calling this function.
-               Please check the document of Dataset if needed. default is None
-            scope(Scope): the scope used to run this program, you can switch it to different scope
-               for each run. default is global_scope
-            thread(int): number of thread a user wants to run in this function. The actual number
-               of thread will be min(Dataset.thread_num, thread) if thread > 0, default is 0
-            debug(bool): whether a user wants to run infer_from_dataset, default is False
-            fetch_list(Variable List): fetch variable list, each variable
-                                       will be printed during training, default is None
-            fetch_info(String List): print information for each variable, default is None
-            print_period(int): the number of mini-batches for each print, default is 100
-
-        Returns:
-            None
-
-        Examples:
-
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                place = fluid.CPUPlace() # you can set place = fluid.CUDAPlace(0) to use gpu
-                exe = fluid.Executor(place)
-                x = fluid.layers.data(name="x", shape=[10, 10], dtype="int64")
-                y = fluid.layers.data(name="y", shape=[1], dtype="int64", lod_level=1)
-                dataset = fluid.DatasetFactory().create_dataset()
-                dataset.set_use_var([x, y])
-                dataset.set_thread(1)
-                filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"]
-                dataset.set_filelist(filelist)
-                exe.run(fluid.default_startup_program())
-                exe.infer_from_dataset(program=fluid.default_main_program(),
-                                       dataset=dataset)        
-
-        """
-        if dataset == None:
-            raise RuntimeError("dataset is needed and should be initialized")
-
-        dataset._prepare_to_run()
-        scope, trainer = self._prepare_trainer(
-            program=program,
-            dataset=dataset,
-            scope=scope,
-            thread=thread,
-            debug=debug,
-            fetch_list=fetch_list,
-            fetch_info=fetch_info,
-            print_period=print_period)
-        trainer._set_infer(True)
-        trainer._gen_trainer_desc()
-        self._dump_debug_info(program=program, trainer=trainer)
-        dataset._dynamic_adjust_before_train(trainer.proto_desc.thread_num)
-        self._default_executor.run_from_dataset(program.desc, scope,
-                                                dataset.dataset,
-                                                trainer._desc())
-        dataset._dynamic_adjust_after_train()
-        dataset._finish_to_run()
-        return None
-
-    def train_from_dataset(self,
-                           program=None,
-                           dataset=None,
-                           scope=None,
-                           thread=0,
-                           debug=False,
-                           fetch_list=None,
-                           fetch_info=None,
-                           print_period=100):
-        """
-        Train from a pre-defined Dataset. Dataset is defined in paddle.fluid.dataset.
-        Given a program, either a program or compiled program, train_from_dataset will
-        consume all data samples in dataset. Input scope can be given by users. By default,
-        scope is global_scope(). The total number of thread run in training is `thread`.
-        Thread number used in training will be minimum value of threadnum in Dataset and
-        the value of thread in this interface. Debug can be set so that executor will display
-        Run-Time for all operators and the throughputs of current training task.
-        
-        Note: train_from_dataset will destroy all resources created within executor for each run.
-
-        Args:
-            program(Program|CompiledProgram): the program that needs to be run,
-               if not provided, then default_main_program (not compiled) will be used.
-            dataset(paddle.fluid.Dataset): dataset created outside this function,
-               a user should provide a well-defined dataset before calling this function.
-               Please check the document of Dataset if needed.
-            scope(Scope): the scope used to run this program, you can switch it to different scope
-               for each run. default is global_scope
-            thread(int): number of thread a user wants to run in this function. The actual number
-               of thread will be min(Dataset.thread_num, thread)
-            debug(bool): whether a user wants to run train_from_dataset 
-            fetch_list(Variable List): fetch variable list, each variable
-                                       will be printed during training
-            fetch_info(String List): print information for each variable
-            print_period(int): the number of mini-batches for each print
-
-        Returns:
-            None
-        
-        Examples:
-        
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-
-              place = fluid.CPUPlace() # you can set place = fluid.CUDAPlace(0) to use gpu
-              exe = fluid.Executor(place)
-              x = fluid.layers.data(name="x", shape=[10, 10], dtype="int64")
-              y = fluid.layers.data(name="y", shape=[1], dtype="int64", lod_level=1)
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_use_var([x, y])
-              dataset.set_thread(1)
-              filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"]
-              dataset.set_filelist(filelist)
-              exe.run(fluid.default_startup_program())
-              exe.train_from_dataset(program=fluid.default_main_program(),
-                                     dataset=dataset)
-
-        """
-        if dataset == None:
-            raise RuntimeError("dataset is need and should be initialized")
-
-        if program._pipeline_opt:
-            thread = self._adjust_pipeline_resource(program._pipeline_opt,
-                                                    dataset, thread)
-
-        dataset._prepare_to_run()
-        scope, trainer = self._prepare_trainer(
-            program=program,
-            dataset=dataset,
-            scope=scope,
-            thread=thread,
-            debug=debug,
-            fetch_list=fetch_list,
-            fetch_info=fetch_info,
-            print_period=print_period)
-        trainer._gen_trainer_desc()
-        self._dump_debug_info(program=program, trainer=trainer)
-        dataset._dynamic_adjust_before_train(trainer.proto_desc.thread_num)
-        self._default_executor.run_from_dataset(program.desc, scope,
-                                                dataset.dataset,
-                                                trainer._desc())
-        dataset._dynamic_adjust_after_train()
-        dataset._finish_to_run()
-        return None
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
deleted file mode 100644
index c9fb957656c40fc04d7504b42ab7f37192c2a755..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/framework.py
+++ /dev/null
@@ -1,4052 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import collections
-from collections import defaultdict
-from collections import Iterable
-import contextlib
-from .wrapped_decorator import signature_safe_contextmanager
-import os
-import re
-import traceback
-import six
-
-import numpy as np
-import subprocess
-import multiprocessing
-import sys
-from .. import compat as cpt
-from .proto import framework_pb2
-
-from . import core
-from . import unique_name
-
-__all__ = [
-    'Program',
-    'default_startup_program',
-    'default_main_program',
-    'program_guard',
-    'name_scope',
-    'cuda_places',
-    'cpu_places',
-    'cuda_pinned_places',
-    'in_dygraph_mode',
-    'is_compiled_with_cuda',
-]
-
-EMPTY_VAR_NAME = core.kEmptyVarName()
-TEMP_VAR_NAME = core.kTempVarName()
-GRAD_VAR_SUFFIX = core.kGradVarSuffix()
-ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
-CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
-
-_dygraph_tracer_ = None
-_dygraph_current_expected_place_ = None
-
-
-def in_dygraph_mode():
-    """
-    Check program status(tracer), Whether it runs in dygraph mode or not
-
-    Returns:
-        out (boolean): True if the program is running in dynamic graph mode
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            if fluid.in_dygraph_mode():
-                pass
-
-    """
-    return _dygraph_tracer_ is not None
-
-
-def _dygraph_tracer():
-    return _dygraph_tracer_
-
-
-def _current_expected_place():
-    return _dygraph_current_expected_place_
-
-
-def _cpu_num():
-    if "CPU_NUM" not in os.environ.keys():
-        if multiprocessing.cpu_count() > 1:
-            sys.stderr.write(
-                '!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.\n'
-                'CPU_NUM indicates that how many CPUPlace are used in the current task.\n'
-                'And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.\n\n'
-                'export CPU_NUM={} # for example, set CPU_NUM as number of physical CPU core which is {}.\n\n'
-                '!!! The default number of CPU_NUM=1.\n'.format(
-                    multiprocessing.cpu_count(), multiprocessing.cpu_count()))
-        os.environ['CPU_NUM'] = str(1)
-    cpu_num = os.environ.get('CPU_NUM')
-    return int(cpu_num)
-
-
-def _cuda_ids():
-    gpus_env = os.getenv("FLAGS_selected_gpus")
-    if gpus_env:
-        device_ids = [int(s) for s in gpus_env.split(",")]
-    else:
-        device_ids = six.moves.range(core.get_cuda_device_count())
-    return device_ids
-
-
-def is_compiled_with_cuda():
-    """
-    Whether this whl package can be used to run the model on GPU.
-
-    Returns (bool): support gpu or not.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            support_gpu = fluid.is_compiled_with_cuda()
-    """
-    return core.is_compiled_with_cuda()
-
-
-def cuda_places(device_ids=None):
-    """
-    Create a list of :code:`fluid.CUDAPlace` objects.
-
-    If :code:`device_ids` is None, environment variable of
-    :code:`FLAGS_selected_gpus` would be checked first. If
-    :code:`FLAGS_selected_gpus=0,1,2`, the returned list would
-    be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
-    If :code:`FLAGS_selected_gpus` is not set, all visible
-    gpu places would be returned.  
-
-    If :code:`device_ids` is not None, it should be the device
-    ids of gpus. For example, if :code:`device_ids=[0,1,2]`, 
-    the returned list would be 
-    [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
-    
-    Args: 
-        device_ids (None|list(int)|tuple(int)): gpu device id list.
-
-    Returns:
-        out (list(fluid.CUDAPlace)): gpu place list.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            cuda_places = fluid.cuda_places()
-
-    """
-    assert core.is_compiled_with_cuda(), \
-        "Not compiled with CUDA"
-    if device_ids is None:
-        device_ids = _cuda_ids()
-    elif not isinstance(device_ids, (list, tuple)):
-        device_ids = [device_ids]
-    return [core.CUDAPlace(dev_id) for dev_id in device_ids]
-
-
-def cpu_places(device_count=None):
-    """
-    Create a list of :code:`fluid.CPUPlace` objects.
-    
-    If :code:`device_count` is None, the device count would
-    be determined by environment variable :code:`CPU_NUM`. 
-    If :code:`CPU_NUM` is not set, the default value is 1,
-    i.e. CPU_NUM=1.
-
-    Args:
-        device_count (None|int): device number.
-
-    Returns:
-        out (list(fluid.CPUPlace)): cpu place list.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            cpu_places = fluid.cpu_places()
-    """
-
-    if device_count is None:
-        device_count = _cpu_num()
-    return [core.CPUPlace()] * device_count
-
-
-def cuda_pinned_places(device_count=None):
-    """
-    Create a list of :code:`fluid.CUDAPinnedPlace` objects.
-
-    If :code:`device_count` is None, the device count would
-    be determined by environment variable :code:`CPU_NUM`. 
-    If :code:`CPU_NUM` is not set, the device count would
-    be determined by :code:`multiprocessing.cpu_count()`. 
-
-    Args:
-        device_count (None|int): device number.
-
-    Returns:
-        out (list(fluid.CUDAPinnedPlace)): cuda pinned place list.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            cuda_pinned_places_cpu_num = fluid.cuda_pinned_places()
-            # or
-            cuda_pinned_places = fluid.cuda_pinned_places(1)
-
-    """
-    assert core.is_compiled_with_cuda(), \
-        "Not compiled with CUDA"
-    if device_count is None:
-        device_count = _cpu_num()
-    return [core.cuda_pinned_places()] * device_count
-
-
-class NameScope(object):
-    def __init__(self, name="", parent=None):
-        self._children = dict()
-        self._name = name
-        self._parent = parent
-
-    def child(self, prefix):
-        if prefix not in self._children:
-            new_child = NameScope(prefix, self)
-            self._children[prefix] = [new_child]
-        else:
-            new_child = NameScope(prefix + "_%d" % len(self._children[prefix]),
-                                  self)
-            self._children[prefix].append(new_child)
-        return new_child
-
-    def parent(self):
-        return self._parent
-
-    def name(self):
-        return self._name
-
-
-_name_scope = NameScope()
-
-
-@signature_safe_contextmanager
-def name_scope(prefix=None):
-    """
-    Generate hierarchical name prefix for the operators.
-
-    Note: This should only used for debugging and visualization purpose.
-    Don't use it for serious analysis such as graph/program transformations.
-
-    Args:
-        prefix(str): prefix.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          with fluid.name_scope("s1"):
-              a = fluid.layers.data(name='data', shape=[1], dtype='int32')
-              b = a + 1
-              with fluid.name_scope("s2"):
-                  c = b * 1
-              with fluid.name_scope("s3"):
-                  d = c / 1
-          with fluid.name_scope("s1"):
-              f = fluid.layers.pow(d, 2.0)
-          with fluid.name_scope("s4"):
-              g = f - 1
-    """
-    # TODO(panyx0718): Only [0-9a-z].
-    # in dygraph we don't need namescope since it will cause mem leak
-    if not in_dygraph_mode():
-        assert prefix, "namescope prefix cannot be empty."
-        global _name_scope
-        _name_scope = _name_scope.child(prefix)
-        yield
-        _name_scope = _name_scope.parent()
-    else:
-        yield
-
-
-def _full_name_scope():
-    global _name_scope
-    scope = _name_scope
-    name = ""
-    while scope:
-        name = scope.name() + "/" + name
-        scope = scope.parent()
-    return name
-
-
-def generate_control_dev_var_name():
-    import random
-    return CONTROL_DEP_VAR_PREFIX + "@" + str(random.random())
-
-
-def grad_var_name(var_name):
-    """
-    Returns:
-        str: gradient name for a certain var name
-    """
-    return var_name + GRAD_VAR_SUFFIX
-
-
-def convert_np_dtype_to_dtype_(np_dtype):
-    """
-    Convert the data type in numpy to the data type in Paddle
-
-    Args:
-        np_dtype(np.dtype): the data type in numpy.
-
-    Returns:
-        core.VarDesc.VarType: the data type in Paddle.
-
-    """
-    dtype = np.dtype(np_dtype)
-    if dtype == np.float32:
-        return core.VarDesc.VarType.FP32
-    elif dtype == np.float64:
-        return core.VarDesc.VarType.FP64
-    elif dtype == np.float16:
-        return core.VarDesc.VarType.FP16
-    elif dtype == np.int32:
-        return core.VarDesc.VarType.INT32
-    elif dtype == np.int16:
-        return core.VarDesc.VarType.INT16
-    elif dtype == np.int64:
-        return core.VarDesc.VarType.INT64
-    elif dtype == np.bool:
-        return core.VarDesc.VarType.BOOL
-    elif dtype == np.uint16:
-        return core.VarDesc.VarType.INT16
-    elif dtype == np.uint8:
-        return core.VarDesc.VarType.UINT8
-    elif dtype == np.int8:
-        return core.VarDesc.VarType.INT8
-    else:
-        raise ValueError("Not supported numpy dtype %s" % dtype)
-
-
-def dtype_is_floating(dtype):
-    """
-    Check the data type is floating or not.
-    Args:
-        dtype(np.dtype|core.VarDesc.VarType): data type.
-            Could be numpy format or Paddle format
-
-    Returns(bool): True if data type is a float value
-
-    """
-    if not isinstance(dtype, core.VarDesc.VarType):
-        dtype = convert_np_dtype_to_dtype_(dtype)
-
-    return dtype in [
-        core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32,
-        core.VarDesc.VarType.FP64
-    ]
-
-
-def _debug_string_(proto, throw_on_error=True):
-    """
-    Get the debug string of a protobuf message. The message could be not
-    initialized.
-    Args:
-        proto(google.protobuf.message.Message): The protobuf message
-        throw_on_error(bool): True if raise an error when the protobuf message
-            is not initialized.
-
-    Returns(str): The debug string of the protobuf message
-
-    """
-    error_fields = list()
-    if not proto.IsInitialized(error_fields) and throw_on_error:
-        raise ValueError("{0} are not initialized.\nThe message is {1}:\n".
-                         format(error_fields, proto))
-    return proto.__str__()
-
-
-class Variable(object):
-    """
-    In Fluid, every input and output of an operator is a variable. In most
-    cases, variables are used for holding different kinds of data or training
-    labels. A variable belongs to a block. All variable has its own name and
-    two variables in different blocks could have the same name.
-
-    There are many kinds of variables. Each kind of them has its own attributes
-    and usages. Please refer to the framework.proto for details.
-
-    Most of a Variable's member variables can be setted to be None. It mean
-    it is not available or will be specified later.
-
-    Args:
-        block(Block): The block that the variable belongs to.
-        type(core.VarDesc.VarType): Variable type. Please reference the
-            framework.proto for details.
-        name(str|None): The name of the variable. If setted None, it will be
-            generated automatically. Default: None
-        shape(tuple|list|None): The shape of the variable. -1 means the batch size.
-            Some kinds of variable do not contain shape, just set it to None.
-            Default: None
-        dtype(np.dtype|core.VarDesc.VarType|str|None): The data type of variable.
-            Default: None
-        lod_level (int|None): The level of lod tensor. 0 means it is not a time
-            series data.
-            Default: None
-        capacity (int|None): The capacity of Channel variable. Ignored for other
-            types. Default: None
-        persistable (bool|None): True if the variable is persistable. A persistable
-            variable will not be deleted after an iteration ending. Defaults: None.
-        error_clip (BaseErrorClipAttr|None): The error clip attributes of the
-            corresponding gradient variable. Default: None
-        stop_gradient (bool): True if the variable will stop to calculate its
-            gradients when backward. Default: False.
-        is_data (bool): True if the variable is an input data. Default: False
-
-    Notes:
-        The constructor of Variable should not be invoked directly. Please
-        use `Block.create_var` to create a variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            cur_program = fluid.Program()
-            cur_block = cur_program.current_block()
-            new_variable = cur_block.create_var(name="X",
-                                                shape=[-1, 23, 48],
-                                                dtype='float32')
-    """
-
-    def __init__(self,
-                 block,
-                 type=core.VarDesc.VarType.LOD_TENSOR,
-                 name=None,
-                 shape=None,
-                 dtype=None,
-                 lod_level=None,
-                 capacity=None,
-                 persistable=None,
-                 error_clip=None,
-                 stop_gradient=False,
-                 is_data=False,
-                 **kwargs):
-        self.block = block
-        if name is None:
-            name = unique_name.generate('_generated_var')
-
-        if dtype is not None:
-            if not isinstance(dtype, core.VarDesc.VarType):
-                dtype = convert_np_dtype_to_dtype_(dtype)
-
-        if in_dygraph_mode():
-            # record vars in tracer rather than blocks
-            self._ivar = kwargs.get("ivar", None)
-            self.stop_gradient_ = kwargs.get("stop_gradient", True)
-            if not self._ivar:
-                self._ivar = core.VarBase(
-                    name, type
-                    if type else core.VarDesc.VarType.LOD_TENSOR, dtype
-                    if dtype else core.VarDesc.VarType.FP32,
-                    list(shape) if shape else [], True
-                    if persistable else False)
-            if persistable:
-                _dygraph_tracer().trace_var(name, self)
-            self.op = None
-        else:
-            self.error_clip = error_clip
-
-            is_new_var = False
-            name = cpt.to_text(name)
-            self.desc = self.block.desc.find_var(cpt.to_bytes(name))
-
-            if self.desc is None:
-                self.desc = self.block.desc.var(cpt.to_bytes(name))
-                is_new_var = True
-
-            if is_new_var:
-                self.desc.set_type(type)
-            elif self.desc.type() != type:
-                raise ValueError(
-                    "Variable {0} has been created before. The "
-                    "previous type is {1}; the new type is {2}. They"
-                    " are not matched".format(self.name, self.desc.type(),
-                                              type))
-
-            if shape is not None:
-                if is_new_var:
-                    self.desc.set_shape(shape)
-                else:
-                    old_shape = self.shape
-                    shape = tuple(shape)
-                    if shape != old_shape:
-                        raise ValueError(
-                            "Variable {0} has been created before. the previous "
-                            "shape is {1}; the new shape is {2}. They are not "
-                            "matched.".format(self.name, old_shape, shape))
-            if dtype is not None:
-                if is_new_var:
-                    self.desc.set_dtype(dtype)
-                else:
-                    old_dtype = self.dtype
-                    if dtype != old_dtype:
-                        raise ValueError(
-                            "Variable {0} has been created before. "
-                            "The previous data type is {1}; the new "
-                            "data type is {2}. They are not "
-                            "matched.".format(self.name, old_dtype, dtype))
-
-            if lod_level is not None:
-                if is_new_var:
-                    self.desc.set_lod_level(lod_level)
-                else:
-                    if lod_level != self.lod_level:
-                        raise ValueError(
-                            "Variable {0} has been created before. "
-                            "The previous lod_level is {1}; the new "
-                            "lod_level is {2}. They are not "
-                            "matched".format(self.name, self.lod_level,
-                                             lod_level))
-            if persistable is not None:
-                if is_new_var:
-                    self.desc.set_persistable(persistable)
-                else:
-                    if persistable != self.persistable:
-                        raise ValueError(
-                            "Variable {0} has been created before."
-                            "The previous persistable is {1}; the new "
-                            "persistable is {2}. They are not matched".format(
-                                self.name, self.persistable, persistable))
-
-            if capacity is not None:
-                if is_new_var:
-                    self.desc.set_capacity(capacity)
-                else:
-                    # TODO(abhinavarora) : Compare with set capacity once,
-                    # get_capacity is implemented
-                    pass
-
-            self.block.vars[name] = self
-            self.op = None
-            self._stop_gradient = stop_gradient
-            self.is_data = is_data
-
-    def detach(self):
-        """
-        Returns a new Variable, detached from the current graph.
-        
-        Returns:
-            Variable: The detached Variable.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import FC
-                import numpy as np
-
-                data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
-                with fluid.dygraph.guard():
-                    fc = FC("fc", 64, num_flatten_dims=2)
-                    data = to_variable(data)
-                    x = fc(data)
-                    y = x.detach()
-
-        """
-        if in_dygraph_mode():
-            new_var = self._cloneVar()
-            self.block.append_op(
-                type="assign",
-                inputs={'X': [self]},
-                outputs={'Out': [new_var]},
-                stop_gradient=True)
-            return new_var
-        else:
-            raise AttributeError("static graph model DO NOT supprt detach")
-
-    def numpy(self):
-        new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
-        return np.array(new_ivar.value().get_tensor())
-
-    def backward(self, backward_strategy=None):
-        if in_dygraph_mode():
-            from .dygraph import BackwardStrategy
-            if backward_strategy is None:
-                backward_strategy = BackwardStrategy()
-                backward_strategy.sort_sum_gradient = False
-
-            self._ivar._run_backward(backward_strategy, _dygraph_tracer())
-        else:
-            raise ValueError(
-                "Variable.backward() is only avaliable in DyGraph mode")
-
-    def gradient(self):
-        new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
-        return np.array(new_ivar.value().get_tensor())
-
-    def clear_gradient(self):
-        self._ivar._clear_gradient()
-
-    def __str__(self):
-        return self.to_string(True)
-
-    def to_string(self, throw_on_error, with_details=False):
-        """
-        Get debug string.
-
-        Args:
-            throw_on_error(bool): True if raise an exception when self is
-                not initialized.
-            with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when
-                with_details is True. Default False;
-
-        Returns:
-            str: The debug string.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                cur_program = fluid.Program()
-                cur_block = cur_program.current_block()
-                new_variable = cur_block.create_var(name="X",
-                                                    shape=[-1, 23, 48],
-                                                    dtype='float32')
-                new_variable.to_string(True)
-        """
-        if in_dygraph_mode():
-            # TODO(panyx0718): add more dygraph debug info.
-            tensor = self._ivar.value().get_tensor()
-            if tensor._is_initialized():
-                return 'name %s, dtype: %s shape: %s %s' % (
-                    self.name, self.dtype, self.shape, str(tensor))
-            else:
-                return 'name %s, shape: %s, not inited' % (self.name,
-                                                           self.shape)
-
-        assert isinstance(throw_on_error, bool) and isinstance(with_details,
-                                                               bool)
-        protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.VarDesc.FromString(six.binary_type(protostr))
-        res_str = _debug_string_(proto, throw_on_error)
-        if with_details:
-            additional_attr = ("error_clip", "stop_gradient")
-            for attr_name in additional_attr:
-                res_str += "%s: %s\n" % (
-                    attr_name, six.binary_type(getattr(self, attr_name)))
-        return res_str
-
-    __repr__ = __str__
-
-    @property
-    def stop_gradient(self):
-        if in_dygraph_mode():
-            return self._ivar.stop_gradient
-        else:
-            return self._stop_gradient
-
-    @stop_gradient.setter
-    def stop_gradient(self, s):
-        if in_dygraph_mode():
-            self._ivar.stop_gradient = s
-        else:
-            self._stop_gradient = s
-
-    @property
-    def persistable(self):
-        if in_dygraph_mode():
-            return self._ivar.persistable
-        else:
-            return self.desc.persistable()
-
-    @persistable.setter
-    def persistable(self, p):
-        if in_dygraph_mode():
-            return self._ivar.persistable
-        else:
-            self.desc.set_persistable(p)
-
-    @property
-    def name(self):
-        if in_dygraph_mode():
-            return self._ivar.name
-        else:
-            return cpt.to_text(self.desc.name())
-
-    @name.setter
-    def name(self, new_name):
-        if in_dygraph_mode():
-            self._ivar.name = new_name
-        else:
-            self.desc.set_name(new_name)
-
-    @property
-    def shape(self):
-        # convert to tuple, make it as same as numpy API.
-        if in_dygraph_mode():
-            return self._ivar.shape
-        else:
-            return tuple(self.desc.shape())
-
-    @property
-    def dtype(self):
-        if in_dygraph_mode():
-            return self._ivar.dtype
-        else:
-            return self.desc.dtype()
-
-    @property
-    def lod_level(self):
-        # TODO(minqiyang): Support lod_level in dygraph mode
-        if in_dygraph_mode():
-            raise Exception("Dygraph model DO NOT supprt lod")
-        return self.desc.lod_level()
-
-    @property
-    def type(self):
-        if in_dygraph_mode():
-            return self._ivar.type
-        else:
-            return self.desc.type()
-
-    def _set_error_clip(self, error_clip):
-        """
-        Set the error_clip.
-
-        Args:
-            error_clip(BaseErrorClipAttr) : The new error_clip.
-
-        Returns:
-            None
-        """
-        self.error_clip = error_clip
-
-    def _slice_indices(self, slice, length):
-        """
-        Reference implementation for the slice.indices method.
-        """
-        # Compute step and length as integers.
-        step = 1 if slice.step is None else slice.step
-
-        # Raise ValueError for negative length or zero step.
-        if length < 0:
-            raise ValueError("length should not be negative")
-        if step == 0:
-            raise ValueError("slice step cannot be zero")
-
-        # Find lower and upper bounds for start and stop.
-        lower = -1 if step < 0 else 0
-        upper = length - 1 if step < 0 else length
-
-        # Compute start.
-        if slice.start is None:
-            start = upper if step < 0 else lower
-        else:
-            start = slice.start
-            start = max(start + length, lower) if start < 0 else min(start,
-                                                                     upper)
-
-        # Compute stop.
-        if slice.stop is None:
-            stop = lower if step < 0 else upper
-        else:
-            stop = slice.stop
-            stop = max(stop + length, lower) if stop < 0 else min(stop, upper)
-
-        return start, stop, step
-
-    def _detectEllipsis(self, item):
-        has_ellipsis = False
-        start = 0
-        end = len(self.shape)
-        for index, o in enumerate(item):
-            if o is Ellipsis:
-                if has_ellipsis:
-                    raise ValueError("Index can have one ellipsis only.")
-                has_ellipsis = True
-                start = index
-            else:
-                if has_ellipsis:
-                    end = index
-        return has_ellipsis, start, end
-
-    def _reconstructSliceinfo(self, item):
-        has_ellipsis, start, end = self._detectEllipsis(item)
-        if has_ellipsis:
-            newitem = []
-            for i in range(start):
-                newitem.append(item[i])
-            for i in range(start, end):
-                newitem.append(slice(None, None, None))
-            for i in range(end, len(item)):
-                newitem.append(item[i])
-            return newitem
-        else:
-            return None
-
-    def _detectContinuesSlice(self, item):
-        starts = []
-        ends = []
-        for index, o in enumerate(item):
-            if isinstance(o, int):
-                start = int(o)
-                if (index > 0 and index >= self.shape[index]) \
-                        or (index < 0 and (index + self.shape[index]) < 0):
-                    raise IndexError("invalid index")
-                start = max(start + self.shape[index], 0) if start < 0 else min(
-                    start, self.shape[index])
-                starts.append(start)
-                ends.append(start + 1)
-            elif isinstance(o, slice):
-                start, stop, step = self._slice_indices(o, self.shape[index])
-                if step == 1 or step == -1:
-                    starts.append(start)
-                    ends.append(stop)
-                else:
-                    return False, None
-            else:
-                raise IndexError("Valid index accept int or slice or ellipsis")
-        return True, [starts, ends]
-
-    def _cloneVar(self, copy=False):
-        if not copy:
-            return self.block.create_var(
-                name=unique_name.generate_with_ignorable_key(self.name),
-                dtype=self.dtype)
-        else:
-            return self
-
-    def _sliceVar(self, axes, starts, ends):
-        new_var = self._cloneVar()
-        self.block.append_op(
-            type="slice",
-            inputs={'Input': [self]},
-            outputs={'Out': [new_var]},
-            attrs={'axes': axes,
-                   'starts': starts,
-                   'ends': ends})
-        return new_var
-
-    def _concatVar(self, inputs, axis):
-        new_var = self._cloneVar()
-        self.block.append_op(
-            type="concat",
-            inputs={'X': inputs},
-            outputs={'Out': [new_var]},
-            attrs={'axis': axis, })
-        return new_var
-
-    def _sliceAndConcatVar(self, item, axis):
-        if isinstance(item, slice):
-            if self.shape[axis] < 0:
-                return self._cloneVar(True)
-            start, stop, step = self._slice_indices(item, self.shape[axis])
-            if step == 1:
-                return self._sliceVar([axis], [start], [stop])
-            else:
-                vars = []
-                if step > 0:
-                    while start < stop:
-                        vars.append(
-                            self._sliceVar([axis], [start], [start + 1]))
-                        start += step
-                else:
-                    while start > stop:
-                        vars.append(
-                            self._sliceVar([axis], [start], [start + 1]))
-                        start += step
-                return self._concatVar(vars, axis)
-        elif isinstance(item, int):
-            if self.shape[axis] < 0:
-                return self._cloneVar(True)
-            index = int(item)
-            if (index > 0 and index >= self.shape[axis])\
-                    or (index < 0 and (index + self.shape[axis]) < 0):
-                raise IndexError("invalid index")
-            return self._sliceVar([axis], [index], [index + 1])
-        else:
-            raise IndexError("Valid index accept int or slice or tuple")
-
-    def __getitem__(self, item):
-        """
-        Slice the variable.
-
-        Args:
-            item(int/slice/tuple) : the index.
-
-        Returns:
-            Sliced variable
-        """
-
-        if not isinstance(item, tuple):
-            item = [item]
-
-        decrease_axis = []
-        slice_axis = []
-        slice_start = []
-        slice_end = []
-        reverse_axis = []
-
-        def fill_constant(shape, dtype, value, force_cpu=False, out=None):
-            self.block.append_op(
-                type='fill_constant',
-                inputs={},
-                outputs={'Out': [out]},
-                attrs={
-                    'shape': shape,
-                    'dtype': out.dtype,
-                    'value': float(value),
-                    'force_cpu': force_cpu or force_init_on_cpu()
-                },
-                stop_gradient=True)
-            out.stop_gradient = True
-            return out
-
-        for dim, slice_item in enumerate(item):
-            if isinstance(slice_item, slice):
-                start = slice_item.start
-                end = slice_item.stop
-                step = slice_item.step if slice_item.step else 1
-
-                assert (step == 1 or step == -1)
-
-                if step == -1:
-                    reverse_axis.append(dim)
-                    assert (start is None and end is None)
-
-                if start is None and end is None:
-                    continue
-
-                if start is None:
-                    start = 0
-
-                if end is None:
-                    end = 10000000
-
-                slice_axis.append(dim)
-                slice_start.append(start)
-                slice_end.append(end)
-            else:
-                decrease_axis.append(dim)
-                slice_axis.append(dim)
-                slice_start.append(slice_item)
-                if isinstance(slice_item, Variable):
-                    temp_1 = self.block.create_var(dtype='int32')
-                    fill_constant([1], 'int32', 1, force_cpu=True, out=temp_1)
-                    temp_end = self.block.create_var(dtype='int32')
-                    self.block.append_op(
-                        type='elementwise_add',
-                        inputs={'X': slice_item,
-                                'Y': temp_1},
-                        outputs={'Out': temp_end},
-                        attrs={'axis': -1})
-                    slice_end.append(temp_end)
-                else:
-                    slice_end.append(slice_item + 1
-                                     if slice_item != -1 else 10000000)
-
-        def contain_var(one_list):
-            for ele in one_list:
-                if isinstance(ele, Variable):
-                    return True
-            return False
-
-        def get_new_list_tensor(old_list):
-            new_list_tensor = []
-            for dim in old_list:
-                if isinstance(dim, Variable):
-                    dim.stop_gradient = True
-                    new_list_tensor.append(dim)
-                else:
-                    assert (isinstance(dim, int))
-                    temp_out = self.block.create_var(dtype='int32')
-                    fill_constant(
-                        [1], 'int32', dim, force_cpu=True, out=temp_out)
-                    new_list_tensor.append(temp_out)
-            return new_list_tensor
-
-        inputs = {'Input': [self]}
-        attrs = {
-            'axes': slice_axis,
-            'starts': [],
-            'ends': [],
-            'decrease_axis': decrease_axis
-        }
-        infer_flags = list(1 for i in range(len(slice_axis)))
-
-        # starts
-        if not contain_var(slice_start):
-            attrs['starts'] = slice_start
-        else:
-            inputs['StartsTensorList'] = get_new_list_tensor(slice_start)
-            for i, dim in enumerate(slice_start):
-                if isinstance(dim, Variable):
-                    attrs['starts'].append(-1)
-                    infer_flags[i] = -1
-                else:
-                    attrs['starts'].append(dim)
-        # ends
-        if not contain_var(slice_end):
-            attrs['ends'] = slice_end
-        else:
-            inputs['EndsTensorList'] = get_new_list_tensor(slice_end)
-            for i, dim in enumerate(slice_end):
-                if isinstance(dim, Variable):
-                    attrs['ends'].append(-1)
-                    infer_flags[i] = -1
-                else:
-                    attrs['ends'].append(dim)
-        # infer_flags
-        attrs['infer_flags'] = infer_flags
-
-        out = self
-        if len(slice_axis) > 0:
-            # append slice_op here
-            slice_out_var = self.block.create_var(
-                name=unique_name.generate_with_ignorable_key(self.name +
-                                                             "_slice"),
-                dtype=self.dtype)
-
-            self.block.append_op(
-                type="slice",
-                inputs=inputs,
-                outputs={'Out': [slice_out_var]},
-                attrs=attrs)
-
-            out = slice_out_var
-
-        if len(reverse_axis) > 0:
-            reverse_out_var = self.block.create_var(
-                name=unique_name.generate_with_ignorable_key(self.name +
-                                                             "_slice_reverse"),
-                dtype=self.dtype)
-            self.block.append_op(
-                type="reverse",
-                inputs={'X': out},
-                outputs={'Out': [reverse_out_var]},
-                attrs={'axis': reverse_axis})
-
-            out = reverse_out_var
-
-        return out
-
-
-def get_all_op_protos():
-    """
-    Get all registered op proto from PaddlePaddle C++ end.
-
-    Returns:
-       list: list of OpProto.
-    """
-    protostrs = core.get_all_op_protos()
-    ret_values = []
-    for pbstr in protostrs:
-        op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr))
-        ret_values.append(op_proto)
-    return ret_values
-
-
-class OpProtoHolder(object):
-    """
-    A global variable to hold all OpProtos from C++ as a map
-    """
-
-    @classmethod
-    def instance(cls):
-        if not hasattr(cls, '_instance'):
-            cls._instance = cls()
-        return cls._instance
-
-    def __init__(self):
-        assert not hasattr(
-            self.__class__,
-            '_instance'), 'Please use `instance()` to get OpProtoHolder object!'
-        op_protos = get_all_op_protos()
-        self.op_proto_map = {}
-        for proto in op_protos:
-            self.op_proto_map[proto.type] = proto
-
-    def get_op_proto(self, type):
-        """
-        Get OpProto by a type string.
-        Args:
-            type(str): The type that operator registered in C++ side.
-
-        Returns(framework_pb2.OpProto): The OpProto
-
-        """
-        if type not in self.op_proto_map:
-            raise ValueError("Operator \"%s\" has not been registered." % type)
-        return self.op_proto_map[type]
-
-    @staticmethod
-    def generated_op_attr_names():
-        return {
-            core.op_proto_and_checker_maker.kOpRoleAttrName(),
-            core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
-            core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
-            core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
-        }
-
-
-class Operator(object):
-    """
-    In Fluid, all the operation are represented by Operator, and Operator
-    is regarded as a build in an instruction of a Block. Users can use the
-    build in instructions to describe their neural network.
-
-    Args:
-        block(Block): The block has the current operator.
-        desc(core.OpDesc): The protobuf description of Operator.
-        type(str): The type of operator. Default None.
-        inputs(dict): The input of this Operator. it is a dictionary, for every
-            element, key is the input parameter name, and value is a list of
-            variables. Default None.
-        outputs(dict): The output of this Operator. it is a dictionary, for
-            every element, key is the input parameter name, and value is a list
-            of variables. Default None.
-        attrs(dict): The attributes of this Operator. it is a dictionary, for
-            every element, key is attribute name, and value is the attribute value.
-            The attribute type should be as same as the type registered in C++ side.
-            Default None.
-
-    Returns:
-        Operator: The initialized Operator.
-
-    Raises:
-        ValueError: If the passed input, output and attrs doesn't match the
-            initializing Operator's that registered in C++ side.
-
-    Notes:
-        The constructor of operator should not be invoked directly. Use
-        Block.append_op or Block._prepend_op instead.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            cur_program = fluid.Program()
-            cur_block = cur_program.current_block()
-            # var1 += var2 + var3
-            cur_block.append_op(type="sum",
-                                inputs={"X": [var1, var2, var3]},
-                                outputs={"Out": [var1]})
-    """
-    OP_WITHOUT_KERNEL_SET = {
-        'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad',
-        'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
-        'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify',
-        'gen_nccl_id', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_calc_stream',
-        'c_sync_comm_stream'
-    }
-
-    def __init__(self,
-                 block,
-                 desc,
-                 type=None,
-                 inputs=None,
-                 outputs=None,
-                 attrs=None):
-        if in_dygraph_mode():
-            if type is None:
-                raise ValueError(
-                    "`type` to initialized an Operator can not be None.")
-            self._type = type
-            self.attrs = attrs if attrs else {}
-        else:
-            self.block = block
-            self.desc = desc
-            # note: not add self.attrs here:
-            # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173
-            op_attrs = attrs
-            if op_attrs is None:
-                op_attrs = dict()
-            del attrs
-
-            op_maker = core.op_proto_and_checker_maker
-
-            if op_maker.kOpRoleAttrName() not in op_attrs:
-                op_attrs[op_maker.kOpRoleAttrName(
-                )] = self.block.program._op_role
-
-            role_var_name = op_maker.kOpRoleVarAttrName()
-            if len(self.block.program.
-                   _op_role_var) != 0 and role_var_name not in op_attrs:
-                op_attrs[role_var_name] = self.block.program._op_role_var
-
-            if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
-                del op_attrs[role_var_name]
-
-            if len(self.desc.type()) != 0:
-                return
-            if type is None:
-                raise ValueError(
-                    "`type` to initialized an Operator can not be None.")
-            else:
-                callstack_var_name = op_maker.kOpCreationCallstackAttrName()
-                op_attrs[callstack_var_name] = list(
-                    reversed(traceback.format_stack()))[1:]
-
-            self.desc.set_type(type)
-            proto = OpProtoHolder.instance().get_op_proto(type)
-
-            namescope_var_name = op_maker.kOpNameScopeAttrName()
-            op_attrs[namescope_var_name] = _full_name_scope()
-
-            def find_name(var_list, name):
-                for var_name in var_list:
-                    if var_list[var_name] is not None and var_name == name:
-                        return True
-                return False
-
-            if inputs is not None:
-                for in_proto in proto.inputs:
-                    found = find_name(inputs, in_proto.name)
-                    assert found or in_proto.dispensable, "Input {} not found".format(
-                        in_proto.name)
-                    if found:
-                        in_args = inputs[in_proto.name]
-                        if not isinstance(in_args, list):
-                            in_args = [in_args]
-                        if not in_proto.duplicable and len(in_args) > 1:
-                            raise ValueError(
-                                "Input %s expects only one input, but %d are given."
-                                % (in_proto.name, len(in_args)))
-                        in_arg_names = []
-                        for index, arg in enumerate(in_args):
-                            if isinstance(arg, six.string_types):
-                                in_arg_names.append(arg)
-                            elif isinstance(arg, six.binary_type):
-                                in_arg_names.append(arg.decode())
-                            elif isinstance(arg, Variable):
-                                in_arg_names.append(cpt.to_text(arg.name))
-                            else:
-                                raise ValueError(
-                                    "not suprt args type , should be[ string_type, binary_type, Varibale]"
-                                )
-                        self.desc.set_input(in_proto.name, in_arg_names)
-                    else:
-                        self.desc.set_input(in_proto.name, [])
-
-            if outputs is not None:
-                for m in proto.outputs:
-                    if (m.name not in outputs) and m.dispensable:
-                        continue
-                    if not ((m.name in outputs) or m.dispensable):
-                        raise ValueError(("Incorrect setting for output(s) of "
-                                          "operator \"%s\", should set: [%s].")
-                                         % (type, m.name))
-                for out_proto in proto.outputs:
-                    if out_proto.name not in outputs:
-                        continue
-                    out_args = outputs[out_proto.name]
-                    if not isinstance(out_args, list):
-                        out_args = [out_args]
-                    if not out_proto.duplicable and len(out_args) > 1:
-                        raise ValueError(
-                            "Output %s expects only one output, but %d are given."
-                            % (out_proto.name, len(out_args)))
-                    out_arg_names = []
-                    for arg in out_args:
-                        out_arg_names.append(cpt.to_text(arg.name))
-                        # TODO(minqiyang): could we remove variable's op in static mode?
-                        if not in_dygraph_mode():
-                            arg.op = self
-                    self.desc.set_output(out_proto.name, out_arg_names)
-
-            if op_attrs is not None:
-                if not isinstance(op_attrs, dict):
-                    raise TypeError("'attrs' should be a dict.")
-                for attr in proto.attrs:
-                    attr_name = attr.name
-                    if (attr_name not in op_attrs) or (
-                            op_attrs[attr_name] is None):
-                        continue
-                    attr_val = op_attrs[attr_name]
-                    self._update_desc_attr(attr_name, attr_val)
-
-            self.desc.check_attrs()
-            if self._has_kernel(type):
-                self.desc.infer_var_type(self.block.desc)
-                self.desc.infer_shape(self.block.desc)
-
-    def _has_kernel(self, op_type):
-        return op_type not in self.OP_WITHOUT_KERNEL_SET
-
-    def to_string(self, throw_on_error):
-        """
-        Get debug string.
-
-        Args:
-            throw_on_error(bool): Whether to raise exception if self is not
-                initialized.
-
-        Returns:
-            str: The debug string.
-
-        """
-        protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
-        return _debug_string_(proto, throw_on_error)
-
-    def __str__(self):
-        return self.to_string(True)
-
-    __repr__ = __str__
-
-    @property
-    def type(self):
-        if in_dygraph_mode():
-            return self._type
-        else:
-            return self.desc.type()
-
-    def input(self, name):
-        """
-        Get the input arguments according to the input parameter name.
-
-        Args:
-            name(str): The input parameter name.
-
-        Returns:
-            list: return the list of argument names that associated with \
-                the specific parameter name.
-        """
-        return self.desc.input(name)
-
-    def _rename_input(self, old_name, new_name):
-        """
-        Rename the `old_name` to `new_name`.
-
-        Args:
-            old_name(str): The old name of the Operator's input.
-            new_name(str): The new name of the Operator's input.
-
-        Returns:
-            None
-        """
-        self.desc._rename_input(old_name, new_name)
-
-    def _rename_output(self, old_name, new_name):
-        """
-        Rename the `old_name` to `new_name`.
-
-        Args:
-            old_name(str): The old name of the Operator's output.
-            new_name(str): The new name of the Operator's output.
-
-        Returns:
-            None
-        """
-        self.desc._rename_output(old_name, new_name)
-
-    @property
-    def input_names(self):
-        return self.desc.input_names()
-
-    @property
-    def input_arg_names(self):
-        return self.desc.input_arg_names()
-
-    @property
-    def output_arg_names(self):
-        return self.desc.output_arg_names()
-
-    def output(self, name):
-        """
-        Get output arguments by the output parameter name.
-
-        Args:
-            name(str): The output parameter name.
-
-        Returns:
-            list: return the list of argument names associated with \
-                the specific parameter name.
-        """
-        return self.desc.output(name)
-
-    @property
-    def output_names(self):
-        return self.desc.output_names()
-
-    @property
-    def idx(self):
-        for i, op in enumerate(self.block.ops):
-            if op == self:
-                return i
-        raise ValueError(
-            "Can't find op itself in it's block. It could be a bug of Paddle.")
-
-    def has_attr(self, name):
-        """
-        Whether this Operator has the attribute with name or not.
-
-        Args:
-            name(str): the attribute name.
-
-        Returns:
-            bool: True if has this attribute.
-
-        """
-        return self.desc.has_attr(name)
-
-    def attr_type(self, name):
-        """
-        Get the type of attribute by attribute's name.
-
-        Args:
-            name(str): the attribute name.
-
-        Returns:
-            core.AttrType: the attribute type.
-        """
-        return self.desc.attr_type(name)
-
-    def _set_attr(self, name, val):
-        """
-        Set the value of attribute by attribute's name.
-
-        Args:
-            name(str): the attribute name.
-            val(bool|int|str|float|list): the value of the attribute.
-
-        Raises:
-            ValueError: If the type of value doesn't match with desc.attr_type(name).
-        """
-        self._update_desc_attr(name, val)
-
-    def _remove_attr(self, name):
-        self.desc.remove_attr(name)
-
-    def _update_desc_attr(self, name, val):
-        """
-        Update the value of desc's attribute by attribute's name.
-
-        Args:
-            name(str): the attribute name.
-            val(bool|int|str|float|list): the value of the attribute.
-
-        Raises:
-            ValueError: If the type of value doesn't match with desc.attr_type(name).
-        """
-        if isinstance(val, Block):
-            self.desc.set_block_attr(name, val.desc)
-        elif isinstance(val, list) and val and all(
-                isinstance(v, Block) for v in val):
-            self.desc.set_blocks_attr(name, [v.desc for v in val])
-        elif isinstance(val, core.BlockDesc) or \
-                isinstance(val, core.ProgramDesc):
-            self.desc.set_serialized_attr(name, val.serialize_to_string())
-        else:
-            self.desc._set_attr(name, val)
-
-    @property
-    def attr_names(self):
-        return self.desc.attr_names()
-
-    def attr(self, name):
-        """
-        Get the attribute by name.
-
-        Args:
-            name(str): the attribute name.
-
-        Returns:
-            bool|int|str|float|list: The attribute value. The return value
-            can be any valid attribute type.
-        """
-        return self.desc.attr(name)
-
-    def _block_attr_id(self, name):
-        """
-        Get the block attribute's id by name.
-
-        Args:
-            name(str): the attribute name.
-
-        Returns:
-            int: the block index.
-        """
-        return self.desc._block_attr_id(name)
-
-    def _block_attr(self, name):
-        """
-        Get the block attribute  by name.
-
-        Args:
-            name(str): the attribute name.
-
-        Returns:
-            block: the block attribute.
-        """
-
-        id = self._block_attr_id(name)
-        assert (id >= 0 and id < len(self.block.program.blocks))
-        return self.block.program.blocks[id]
-
-    def _blocks_attr(self, name):
-        """
-        Get the blocks attribute  by name.
-
-        Args:
-            name(str): the attribute name.
-
-        Returns:
-            list: list of the blocks attribute.
-        """
-        attrs = []
-        for i in self._blocks_attr_ids(name):
-            assert (i >= 0 and i < len(self.block.program.blocks))
-            attrs.append(self.block.program.blocks[i])
-
-        return attrs
-
-    def _blocks_attr_ids(self, name):
-        """
-        Get the blocks attribute's ids by name.
-
-        Args:
-            name(str): the attribute name.
-
-        Returns:
-            list: list of the blocks ids.
-        """
-
-        return self.desc._blocks_attr_ids(name)
-
-    def all_attrs(self):
-        """
-        Get the attribute dict.
-
-        Returns:
-            dict: The Operator's attribute dict, name->attr.
-        """
-        attr_names = self.attr_names
-        attr_map = {}
-        for n in attr_names:
-            attr_type = self.desc.attr_type(n)
-            if attr_type == core.AttrType.BLOCK:
-                attr_map[n] = self._block_attr(n)
-                continue
-
-            if attr_type == core.AttrType.BLOCKS:
-                attr_map[n] = self._blocks_attr(n)
-                continue
-
-            attr_map[n] = self.attr(n)
-
-        return attr_map
-
-
-class Block(object):
-    """
-    In Fluid, a Program is consistence of multi-Block, and Block stores
-    VarDesc and OpDesc. In a specific Block, a VarDesc have a unique name.
-    One block could have some child blocks, and child block's name scopes
-    should inherit the parent's so that OpDesc in child block can reference
-    a VarDesc that is stored in the parent block.
-    Please reference the framework.proto for details.
-
-    Args:
-        program(Program): The Program that the Block belongs to.
-        idx(int): The block's id in the Program.
-
-    Notes:
-        The constructor of Block should not be invoked directly. Please
-        use `Program._create_block()` to create a block.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            cur_program = fluid.Program()
-            cur_block = cur_program.current_block()
-            var = cur_block.create_var(name="X",
-                                       shape=[-1, 23, 48],
-                                       dtype='float32')
-            cur_block.append_op(type="abs",
-                                inputs={"X": [var]},
-                                outputs={"Out": [var]})
-    """
-
-    def __init__(self, program, idx):
-        self.desc = program.desc.block(idx)
-        self.vars = collections.OrderedDict()  # var_name --> var
-        self.ops = list()  # operator list
-        self.program = program
-        self.removed_vars = collections.OrderedDict()
-
-    def __str__(self):
-        return self.to_string(True)
-
-    def to_string(self, throw_on_error, with_details=False):
-        """
-        Get debug string.
-
-        Args:
-            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True.
-            with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when
-                with_details is True. Default False.
-
-        Returns:
-            str: The debug string.
-        """
-        assert isinstance(throw_on_error, bool) and isinstance(with_details,
-                                                               bool)
-        if with_details:
-            re_add_indent = re.compile(r"\n(.)")
-            res_str = "blocks {\n  idx: %d\n  parent_idx: %d" % (
-                self.idx, self.parent_idx)
-            for var in list(self.vars.values()):
-                res_str += "\n  vars {\n    %s  }" % re_add_indent.sub(
-                    r"\n    \1", var.to_string(throw_on_error, with_details))
-            for op in self.ops:
-                res_str += "\n  ops {\n    %s  }" % re_add_indent.sub(
-                    r"\n    \1", op.to_string(throw_on_error))
-            res_str += "\n}"
-        else:
-            protostr = self.desc.serialize_to_string()
-            proto = framework_pb2.BlockDesc.FromString(
-                six.binary_type(protostr))
-            res_str = _debug_string_(proto, throw_on_error)
-        return res_str
-
-    __repr__ = __str__
-
-    @property
-    def parent_idx(self):
-        return self.desc.parent
-
-    @property
-    def forward_block_idx(self):
-        return self.desc.get_forward_block_idx()
-
-    def _set_forward_block_idx(self, idx):
-        """
-        Set the forward block Idx.
-
-        Args:
-            idx(int): the block index.
-
-        Returns:
-            None
-        """
-        self.desc._set_forward_block_idx(idx)
-
-    @property
-    def idx(self):
-        return self.desc.id
-
-    def var(self, name):
-        """
-        Get a Variable by name from this block.
-
-        Args:
-            name(str): the Variable's name.
-
-        Raises:
-            ValueError: The If input's type is not str, or this block
-                doesn't have a Variable with the giving name.
-
-        Returns:
-            Variable: the Variable with the giving name.
-        """
-        if not isinstance(name, six.string_types):
-            raise TypeError(
-                "var require string as parameter, but get %s instead." %
-                (type(name)))
-        v = self.vars.get(name, None)
-        if v is None:
-            raise ValueError("var %s not in this block" % name)
-        return v
-
-    def _find_var_recursive(self, name):
-        """
-        Get a Variable by name from this block recursively.
-
-        Args:
-            name(str): the Variable's name.
-
-        Returns:
-            Variable: the Variable with the giving name. Or None if not found.
-        """
-        frontier = list()
-        visited = set()
-
-        frontier.append(self)
-
-        prog = self.program
-
-        while len(frontier) != 0:  # BFS
-            cur = frontier[0]
-            frontier = frontier[1:]
-
-            if id(cur) in visited:
-                continue
-
-            if cur.has_var(name):
-                return cur.var(name)
-
-            if cur.parent_idx != -1:
-                frontier.append(prog.block(cur.parent_idx))
-
-            if cur.forward_block_idx != -1:
-                frontier.append(prog.block(cur.forward_block_idx))
-
-            visited.add(id(cur))
-        return None
-
-    def _var_recursive(self, name):
-        """
-        Get a Variable by name from this block recursively.
-
-        Args:
-            name(str): the Variable's name.
-
-        Raises:
-            ValueError: this block and this parent block doesn't
-                have a Variable with the giving name.
-
-        Returns:
-            Variable: the Variable with the giving name.
-        """
-        var = self._find_var_recursive(name)
-        if var:
-            return var
-        else:
-            raise ValueError("Var {0} is not found recursively".format(name))
-
-    def all_parameters(self):
-        return list(self.iter_parameters())
-
-    def iter_parameters(self):
-        return (item[1] for item in six.iteritems(self.vars)
-                if isinstance(item[1], Parameter))
-
-    def create_var(self, *args, **kwargs):
-        var = Variable(block=self, *args, **kwargs)
-        if 'initializer' in kwargs:
-            kwargs['initializer'](var, self)
-        return var
-
-    def has_var(self, name):
-        return name in self.vars
-
-    def _rename_var(self, name, new_name):
-        """
-        Rename variable in vars and ops' inputs and outputs
-
-        Args:
-            name(str): the name that need to be renamed.
-            new_name(str): the name that need to rename to.
-
-        Raises:
-            ValueError: If this block doesn't have this the giving name,
-                or the type of the var with the giving name is not Parameter
-                or Variable.
-
-        Returns:
-            Variable: the Variable with the giving name.
-        """
-        name = cpt.to_text(name)
-        new_name = cpt.to_text(new_name)
-
-        if not self.has_var(name):
-            raise ValueError("var %s is not in current block" % name)
-        v = self.var(name)
-        if type(v) == Parameter:
-            var_type = "Parameter"
-            stop_gradient = v.stop_gradient
-            trainable = v.trainable
-            optimize_attr = v.optimize_attr
-            regularizer = v.regularizer
-            gradient_clip_attr = v.gradient_clip_attr
-            error_clip = v.error_clip
-        elif type(v) == Variable:
-            var_type = "Variable"
-            error_clip = v.error_clip
-            stop_gradient = v.stop_gradient
-        else:
-            raise ValueError("unsupported var type: %s", type(v))
-        orig_var_type = v.type
-        self.desc._rename_var(cpt.to_bytes(name), cpt.to_bytes(new_name))
-        # NOTE: v is destroyed by C++ after calling _rename_var.
-        d = self.desc.find_var(cpt.to_bytes(new_name))
-        if var_type == "Parameter":
-            var = Parameter(
-                self,
-                d.shape(),
-                d.dtype(),
-                type=orig_var_type,
-                name=new_name,
-                stop_gradient=stop_gradient,
-                trainable=trainable,
-                optimize_attr=optimize_attr,
-                regularizer=regularizer,
-                gradient_clip_attr=gradient_clip_attr,
-                error_clip=error_clip)
-        elif var_type == "Variable":
-            var = Variable(
-                self,
-                type=orig_var_type,
-                name=new_name,
-                error_clip=error_clip,
-                stop_gradient=stop_gradient)
-
-        # rename the python side, _sync_with_cpp will only add
-        # new vars/ops to python side.
-        self.vars[new_name] = var
-        del self.vars[name]
-        self._sync_with_cpp()
-        return var
-
-    def _remove_var(self, name):
-        self._sync_with_cpp()
-        self.desc._remove_var(cpt.to_bytes(name))
-        del self.vars[name]
-
-    def create_parameter(self, *args, **kwargs):
-        global_block = self.program.global_block()
-        param = Parameter(global_block, *args, **kwargs)
-        if 'initializer' in kwargs:
-
-            def _is_inited_by(block, var):
-                init_ops = []
-                for op in block.ops:
-                    if var.name in op.output_arg_names:
-                        # In startup_program, "c_broadcast" and "c_sync_comm_stream"
-                        # are treated as initialization ops that cause error. 
-                        # Think of "c_broadcast" and "c_sync_comm_stream" as a special case here.
-                        if op.type in ["c_broadcast", "c_sync_comm_stream"]:
-                            continue
-                        init_ops.append(op)
-                return init_ops
-
-            initializer = kwargs['initializer']
-            init_ops = _is_inited_by(global_block, param)
-            init_ops_len = len(init_ops)
-            if init_ops_len > 1:
-                raise RuntimeError("param " + param.name +
-                                   " is inited by multiple init ops " + str(
-                                       init_ops))
-            elif init_ops_len == 1:
-                #TODO already inited, do nothing, should log a warning
-                pass
-            else:
-                initializer(param, self)
-        param.stop_gradient = False
-        return param
-
-    def append_op(self, *args, **kwargs):
-        """
-        Appends a new Operator according to the giving arguments.
-
-        Returns:
-            Operator: the append Operator.
-        """
-        if in_dygraph_mode():
-            attrs = kwargs.get("attrs", {})
-            if _dygraph_tracer_._train_mode == False:
-                # eval mode
-                if ('trainable_statistics' not in attrs
-                    ) or not attrs['trainable_statistics']:
-                    attrs['is_test'] = True
-                else:
-                    attrs['is_test'] = False
-
-            type = kwargs.get("type", None)
-
-            op = Operator(
-                block=self,
-                desc=None,
-                type=type,
-                inputs=None,
-                outputs=None,
-                attrs=attrs)
-
-            # record ops in tracer rather than blocks
-            #
-            # TODO(minqiyang): add op stop_gradient support in static mode too.
-            # currently, we only support stop_gradient in dygraph mode.
-
-            _dygraph_tracer().trace_op(type,
-                                       kwargs.get("inputs", {}),
-                                       kwargs.get("outputs", {}), attrs
-                                       if attrs else {},
-                                       kwargs.get("stop_gradient", False))
-        else:
-            op_desc = self.desc.append_op()
-            op = Operator(
-                block=self,
-                desc=op_desc,
-                type=kwargs.get("type", None),
-                inputs=kwargs.get("inputs", None),
-                outputs=kwargs.get("outputs", None),
-                attrs=kwargs.get("attrs", None))
-
-            self.ops.append(op)
-
-        return op
-
-    def _insert_op(self, index, *args, **kwargs):
-        """
-        Insert a Operator according to the giving arguments.
-
-        Args:
-            index(int): the place that the operator to insert.
-
-        Returns:
-            Operator: the insert Operator.
-        """
-        self._sync_with_cpp()
-        op_desc = self.desc._insert_op(index)
-        op = Operator(block=self, desc=op_desc, *args, **kwargs)
-        self.ops.insert(index, op)
-        return op
-
-    def _remove_op(self, index):
-        """
-        Remove the specific position operator.
-
-        Args:
-            index(int): the position that the operator to insert.
-
-        Returns:
-            None
-        """
-        self._sync_with_cpp()
-        self.desc._remove_op(index, index + 1)
-        del self.ops[index]
-
-    def _slice_ops(self, start, end):
-        """
-        Return the Operator between start and end.
-
-        Args:
-            start(int): the start position.
-            end(int): the end position.
-
-        Returns:
-            list: the Operators between start and end.
-        """
-        return self.ops[start:end]
-
-    def _prepend_op(self, *args, **kwargs):
-        if in_dygraph_mode():
-            type = kwargs.get("type", None)
-            attrs = kwargs.get("attrs", {})
-            op = Operator(
-                self, None, type=type, inputs=None, outputs=None, attrs=attrs)
-
-            _dygraph_tracer().trace_op(type,
-                                       kwargs.get("inputs", {}),
-                                       kwargs.get("outputs", {}), attrs
-                                       if attrs else {},
-                                       kwargs.get("stop_gradient", False))
-        else:
-            op_desc = self.desc._prepend_op()
-            op = Operator(
-                self,
-                op_desc,
-                type=kwargs.get("type", None),
-                inputs=kwargs.get("inputs", None),
-                outputs=kwargs.get("outputs", None),
-                attrs=kwargs.get("attrs", None))
-            self.ops.insert(0, op)
-
-        return op
-
-    def _sync_with_cpp(self):
-        """
-        Sync from the desc on the c++ end. This method is used to synchronize
-        the c++ desc instance generated by backward.
-        """
-        # sync variables from cpp
-        for var in self.desc.all_vars():
-            if not self.has_var(var.name()):
-                self.create_var(name=var.name(), desc=var, type=var.type())
-
-        # sync variables removed from c++ end
-        for var in list(self.vars.keys()):
-            if not self.desc.find_var(cpt.to_bytes(var)):
-                self.vars.pop(var)
-
-        # sync operators from cpp
-        ops_in_cpp = []
-        for op_idx in range(0, self.desc.op_size()):
-            ops_in_cpp.append(self.desc.op(op_idx))
-
-        if len(self.ops) != 0:
-            first_op_in_python = self.ops[0].desc
-            last_op_in_python = self.ops[len(self.ops) - 1].desc
-            start_index = None
-            end_index = None
-            for index in range(len(ops_in_cpp)):
-                if first_op_in_python == ops_in_cpp[index]:
-                    start_index = index
-                if last_op_in_python == ops_in_cpp[index]:
-                    end_index = index
-            assert start_index is not None
-            assert end_index is not None
-            assert start_index <= end_index
-        else:
-            start_index = 0
-            end_index = -1
-
-        # sync ops append to the head of cpp_ops
-        for index in range((start_index - 1 - 1), -1, -1):
-            op_desc = ops_in_cpp[index]
-            op = Operator(self, op_desc)
-            self.ops.insert(0, op)
-
-        # sync ops append to the end of cpp_ops
-        for index in range((end_index + 1), len(ops_in_cpp)):
-            op_desc = ops_in_cpp[index]
-            op = Operator(self, op_desc)
-            self.ops.append(op)
-
-        # sync ops removed from c++ end
-        if end_index != -1 and end_index < len(self.ops):
-            ops_in_cpp_index = 0
-            ops_in_python_index = 0
-            while ops_in_python_index < len(
-                    self.ops) and ops_in_cpp_index < len(ops_in_cpp):
-                if self.ops[ops_in_python_index].desc != ops_in_cpp[
-                        ops_in_cpp_index]:
-                    del self.ops[ops_in_python_index]
-                else:
-                    ops_in_cpp_index += 1
-                    ops_in_python_index += 1
-
-        assert len(self.ops) == len(ops_in_cpp)
-        for index in range(len(self.ops)):
-            assert self.ops[index].desc == ops_in_cpp[index]
-
-    def _copy_param_info_from(self, other):
-        """
-        Copy the information of parameters from the other block.
-
-        Args:
-            other(Block): the other block.
-
-        Raises:
-            ValueError: If type of input is not Block, or the `other` and this
-                block is not in the same topology.
-
-        Returns:
-            None
-        """
-        if not isinstance(other, Block):
-            raise TypeError(
-                "_copy_param_info_from should be invoked with Block")
-        for p in other.iter_parameters():
-            assert isinstance(p, Parameter)
-            v = self.vars.get(p.name, None)
-            if v is None:
-                raise ValueError("_copy_param_info_from should be invoked with "
-                                 "same topology")
-            assert isinstance(v, Variable)
-            new_p = Parameter(
-                block=self,
-                shape=v.shape,
-                dtype=v.dtype,
-                type=v.type,
-                lod_level=v.lod_level,
-                stop_gradient=p.stop_gradient,
-                trainable=p.trainable,
-                optimize_attr=p.optimize_attr,
-                regularizer=p.regularizer,
-                gradient_clip_attr=p.gradient_clip_attr,
-                error_clip=p.error_clip,
-                name=v.name)
-            self.vars[new_p.name] = new_p
-
-    def _clone_variable(self, var, force_persistable=True):
-        """
-        Clone a variable into current block.
-
-        Args:
-            var: the variable to be cloned.
-            force_persistable(bool): True means setting the result variable to being persistable.
-                                     False means setting the persistable the same with that of input var.
-                                     default: True.
-
-        Returns:
-            Variable: the new  variable cloned from 'var' in current block.
-        """
-        assert isinstance(var, Variable)
-        ret_var = None
-        # make STEP_SCOPES var can be safely cloned.
-        if var.type == core.VarDesc.VarType.STEP_SCOPES:
-            ret_var = self.create_var(
-                name=var.name, persistable=var.persistable, type=var.type)
-        elif var.type == core.VarDesc.VarType.RAW:
-            ret_var = self.create_var(
-                name=var.name, persistable=var.persistable, type=var.type)
-        elif var.type == core.VarDesc.VarType.SELECTED_ROWS:
-            ret_var = self.create_var(
-                name=var.name,
-                shape=var.shape,
-                dtype=var.dtype,
-                type=var.type,
-                persistable=True if force_persistable else var.persistable,
-                is_data=var.is_data)
-        else:
-            ret_var = self.create_var(
-                name=var.name,
-                shape=var.shape,
-                dtype=var.dtype,
-                type=var.type,
-                lod_level=var.lod_level,
-                persistable=True if force_persistable else var.persistable,
-                is_data=var.is_data)
-        return ret_var
-
-
-class IrNode(object):
-    """
-    Python IrNode. Beneath it is a core.Node, which is used for Ir Pass.
-    """
-
-    def __init__(self, node):
-        """
-        Construct an IrNode using core.Node.
-
-        Args:
-            node(core.Node): C++ Node.
-        """
-        assert isinstance(node,
-                          core.Node), 'node must be the instance of core.Node.'
-        self.node = node
-
-    def name(self):
-        """
-        Return the node name.
-
-        Returns:
-            str: node name.
-        """
-        return self.node.name()
-
-    def node_type(self):
-        """
-        Return the node type.
-
-        Returns:
-            core.Node.Type: node type(core.Node.Type.Operation or core.Node.Type.Variable).
-        """
-        return self.node.node_type()
-
-    def var(self):
-        """
-        Return the node variable description.
-
-        Returns:
-            core.VarDesc: node variable description.
-        """
-        return self.node.var()
-
-    def op(self):
-        """
-        Return the node operator description.
-
-        Returns:
-            core.OpDesc: node operator description.
-        """
-        return self.node.op()
-
-    def id(self):
-        """
-        Return the node id.
-
-        Returns:
-            int: node id.
-        """
-        return self.node.id()
-
-    def is_op(self):
-        """
-        If the node is an operator, then return true.
-
-        Returns:
-            bool: indicate whether the node is an operator.
-        """
-        return self.node.is_op()
-
-    def is_var(self):
-        """
-        If the node is a variable, then return true.
-
-        Returns:
-            bool: indicate whether the node is a variable.
-        """
-        return self.node.is_var()
-
-    def is_ctrl_var(self):
-        """
-        If the node is a control dependence variable, then return true.
-
-        Returns:
-            bool: indicate whether the node is a control dependence variable.
-        """
-        return self.node.is_ctrl_var()
-
-    def clear_inputs(self):
-        """
-        Clear the node inputs. After executing the `clear_inputs` function,
-        the node inputs will be empty.
-        """
-        self.node.clear_inputs()
-
-    def remove_input_by_id(self, node_id):
-        """
-        Remove a node from inputs by the given node id.
-
-        Args:
-            node_id(int): the given node id.
-        """
-        self.node.remove_input(node_id)
-
-    def remove_input(self, node):
-        """
-        Remove a node from inputs.
-
-        Args:
-            node(IrNode): the node being removed.
-        """
-        self.node.remove_input(node.node)
-
-    def append_input(self, node):
-        """
-        Append a node in inputs.
-
-        Args:
-            node(IrNode): the node being appended.
-        """
-        self.node.append_input(node.node)
-
-    def clear_outputs(self):
-        """
-        Clear the node outputs. After executing the `clear_outputs` function,
-        the node outputs will be empty.
-        """
-        self.node.clear_outputs()
-
-    def remove_output_by_id(self, node_id):
-        """
-        Remove a node from outputs by the given node id.
-
-        Args:
-            node_id(int): the given node id.
-        """
-        self.node.remove_output(node_id)
-
-    def remove_output(self, node):
-        """
-        Remove a node from outputs.
-
-        Args:
-            node(IrNode): the node being removed.
-        """
-        self.node.remove_output(node.node)
-
-    def append_output(self, node):
-        """
-        Append a node in outputs.
-
-        Args:
-            node(IrNode): the node being appended.
-        """
-        self.node.append_output(node.node)
-
-    @property
-    def inputs(self):
-        """
-        Return the node inputs.
-
-        Returns:
-            list(IrNode): node inputs wrapped by IrNode.
-        """
-        return [IrNode(n) for n in self.node.inputs]
-
-    @property
-    def outputs(self):
-        """
-        Return the node outputs.
-
-        Returns:
-            list(IrNode): node outputs wrapped by IrNode.
-        """
-        return [IrNode(n) for n in self.node.outputs]
-
-
-class IrVarNode(IrNode):
-    """
-    Python IrVarNode. Beneath it is a core.Node, it inherits from IrNode.
-    """
-
-    def __init__(self, node):
-        """
-        Construct an IrVarNode using core.Node.
-
-        Args:
-            node(core.Node): C++ Node.
-        """
-        assert isinstance(node, core.Node) and node.is_var(), \
-            'node must be the instance of core.Node and it must be a variable node.'
-        super(IrVarNode, self).__init__(node)
-        self.node = node
-
-    def set_shape(self, shape):
-        """
-        Set the node variable shape.
-
-        Args:
-            shape(list): shape to be set.
-        """
-        assert self.node.var() is not None, \
-            "The node variable description cannot be None."
-        self.node.var().set_shape(shape)
-
-    def persistable(self):
-        """
-        If the variable node is a persistable variable, then return true.
-
-        Returns:
-            bool: indicate whether the variable is persistable.
-        """
-        assert self.node.var() is not None, \
-            "The node variable description cannot be None."
-        return self.node.var().persistable()
-
-    def type(self):
-        """
-        Return the variable type.
-
-        Returns:
-            core.VarDesc.VarType: the variable type.
-        """
-        assert self.node.var() is not None, \
-            "The node variable description cannot be None."
-        return self.node.var().type()
-
-    def dtype(self):
-        """
-        Return the variable data type.
-
-        Returns:
-            core.VarDesc.VarType: the variable data type.
-        """
-        assert self.node.var() is not None, \
-            "The node variable description cannot be None."
-        return self.node.var().dtype()
-
-    def shape(self):
-        """
-        Return the variable shape.
-
-        Returns:
-            list: the variable shape.
-        """
-        assert self.node.var() is not None, \
-            "The node variable description cannot be None."
-        return self.node.var().shape()
-
-    @property
-    def inputs(self):
-        """
-        Return the node inputs.
-
-        Returns:
-            list(IrOpNode): node inputs wrapped by IrOpNode.
-        """
-        return [IrOpNode(n) for n in self.node.inputs]
-
-    @property
-    def outputs(self):
-        """
-        Return the node outputs.
-
-        Returns:
-            list(IrOpNode): node outputs wrapped by IrOpNode.
-        """
-        return [IrOpNode(n) for n in self.node.outputs]
-
-
-class IrOpNode(IrNode):
-    """
-    Python IrOpNode. Beneath it is a core.Node, it inherits from IrNode.
-    """
-
-    def __init__(self, node):
-        """
-        Construct an IrOpNode using core.Node.
-
-        Args:
-            node(core.Node): C++ Node.
-        """
-        assert isinstance(node, core.Node) and node.is_op(), \
-            'node must be the instance of core.Node and it must be a operator node.'
-        super(IrOpNode, self).__init__(node)
-        self.node = node
-
-    def rename_input(self, old_input_name, new_input_name):
-        """
-        Rename the input of this node.
-
-        Args:
-            old_input_name(str): the old input name.
-            new_input_name(str): the new input name.
-        """
-        assert self.node.op() is not None, \
-            "The node operator description cannot be None."
-        self.node.op()._rename_input(old_input_name, new_input_name)
-
-    def rename_output(self, old_output_name, new_output_name):
-        """
-        Rename the output of this node.
-
-        Args:
-            old_output_name(str): the old output name.
-            new_output_name(str): the new output name.
-        """
-        assert self.node.op() is not None, \
-            "The node operator description cannot be None."
-        print("op: {}, old: {}, new: {}\n".format(self.node.op().type(
-        ), old_output_name, new_output_name))
-        self.node.op()._rename_output(old_output_name, new_output_name)
-
-    def input(self, name):
-        """
-        Get the argument name list by the parameter name for input.
-
-        Args:
-            name(str): the parameter name.
-
-        Returns:
-            list(str): the argument name list.
-        """
-        assert self.node.op() is not None, \
-            "The node operator description cannot be None."
-        return self.node.op().input(name)
-
-    def output(self, name):
-        """
-        Get the argument name list by the parameter name for output.
-
-        Args:
-            name(str): the parameter name.
-
-        Returns:
-            list(str): the argument name list.
-        """
-        assert self.node.op() is not None, \
-            "The node operator description cannot be None."
-        return self.node.op().output(name)
-
-    def set_type(self, new_type):
-        """
-        Change the operator type into new type.
-
-        Args:
-            new_type(str): new operator type to be set.
-        """
-        assert self.node.op() is not None, \
-            "The node operator description cannot be None."
-        return self.node.op().set_type(new_type)
-
-    def set_attr(self, name, val):
-        """
-        Set the value of attribute by attribute's name.
-
-        Args:
-            name(str): the attribute name.
-            val(bool|int|str|float|list): the value of the attribute.
-        """
-        self._update_desc_attr(name, val)
-
-    def _update_desc_attr(self, name, val):
-        """
-        Update the value of the op desc's attribute by attribute's name.
-        """
-        assert self.node.op() is not None, \
-            "The node operator description cannot be None."
-        desc = self.node.op()
-        if isinstance(val, Block):
-            desc.set_block_attr(name, val.desc)
-        elif isinstance(val, list) and val and \
-            all(isinstance(v, Block) for v in val):
-            desc.set_blocks_attr(name, [v.desc for v in val])
-        elif isinstance(val, core.BlockDesc) or \
-            isinstance(val, core.ProgramDesc):
-            desc.set_serialized_attr(name, val.serialize_to_string())
-        else:
-            desc._set_attr(name, val)
-
-    def input_arg_names(self):
-        """
-        Return input arguments' names of this op node.
-
-        Returns:
-            list(str): input arguments' names of this op node.
-        """
-        assert self.node.op() is not None, \
-            "The node operator description cannot be None."
-        return self.node.op().input_arg_names()
-
-    def output_arg_names(self):
-        """
-        Return output arguments' names of this op node.
-
-        Returns:
-            list(str): output arguments' names of this op node.
-        """
-        assert self.node.op() is not None, \
-            "The node operator description cannot be None."
-        return self.node.op().output_arg_names()
-
-    @property
-    def inputs(self):
-        """
-        Return the node inputs.
-
-        Returns:
-            list(IrVarNode): node inputs wrapped by IrVarNode.
-        """
-        return [IrVarNode(n) for n in self.node.inputs]
-
-    @property
-    def outputs(self):
-        """
-        Return the node outputs.
-
-        Returns:
-            list(IrVarNode): node outputs wrapped by IrVarNode.
-        """
-        return [IrVarNode(n) for n in self.node.outputs]
-
-
-class IrGraph(object):
-    """
-    Python IrGraph. Beneath it is a core.Graph, which is used for
-    creating a c++ Ir Pass Graph. An IrGraph is just a graph view of
-    a Program. In an IrGraph, both Variables and Operators are graph
-    nodes.
-    """
-
-    def __init__(self, graph, for_test=False):
-        """
-        Construct an IrGraph using core.Graph.
-
-        Args:
-            graph(core.Graph): C++ Graph.
-            for_test(bool): True for the test graph and false for the train graph.
-        """
-        assert isinstance(
-            graph, core.Graph), 'graph must be the instance of core.Graph.'
-        self.graph = graph
-        self._for_test = for_test
-
-    def clone(self):
-        """
-        Create a new and duplicated IrGraph.
-
-        Warns:
-            The method only clones the graph structure, not its attributes.
-
-        Returns:
-            IrGraph: A new and duplicated graph.
-        """
-        g = self.graph.clone()
-        return IrGraph(g, self._for_test)
-
-    def is_test(self):
-        """
-        If the graph is used for testing, the function returns true. Otherwise, returns false.
-        """
-        return self._for_test
-
-    def all_nodes(self):
-        """
-        Return all nodes included in the graph as a set.
-        """
-        return {IrNode(node) for node in self.graph.nodes()}
-
-    def all_var_nodes(self):
-        """
-        Return all variable nodes included in the graph as a set.
-        """
-        return {IrVarNode(node) for node in self.graph.nodes() if node.is_var()}
-
-    def all_persistable_nodes(self):
-        """
-        Return all persistable variable nodes included in the graph as a set.
-        """
-        persistable_nodes = set()
-        for node in self.graph.nodes():
-            if node.is_var() and node.var() is not None and node.var(
-            ).persistable():
-                persistable_nodes.add(node)
-        return {IrVarNode(p) for p in persistable_nodes}
-
-    def all_op_nodes(self):
-        """
-        Return all operator nodes included in the graph as a set.
-        """
-        return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()}
-
-    def create_persistable_node(self, name, var_type, shape, var_dtype):
-        """
-        Create a persistable variable node in the graph. In IrGraph,
-        it can not distinguish between persistable variables and parameters.
-
-        Args:
-            name(str): the name of the persistable variable node.
-            vart_type(core.VarDesc.VarType): the type of the persistable variable node.
-            shape(list): the shape of the persistable variable node.
-            var_dtype(core.VarDesc.VarType): the data type of the persistable variable node.
-
-        Returns:
-            IrVarNode: the created persistable variable node.
-        """
-        var_desc = core.VarDesc(name)
-        var_desc.set_type(var_type)
-        var_desc.set_shape(shape)
-        var_desc.set_dtype(var_dtype)
-        var_desc.set_persistable(True)
-        return IrVarNode(self.graph.create_var_node(var_desc))
-
-    def create_var_node(self, name, var_type, shape, var_dtype):
-        """
-        Create a variable node in the graph. The created variable node is
-        not persistable.
-
-        Args:
-            name(str): the name of the variable node.
-            vart_type(core.VarDesc.VarType): the type of the variable node.
-            shape(list): the shape of the variable node.
-            var_dtype(core.VarDesc.VarType): the data type of the variable node.
-
-        Returns:
-            IrVarNode: the created variable node.
-        """
-
-        var_desc = core.VarDesc(name)
-        var_desc.set_type(var_type)
-        var_desc.set_shape(shape)
-        var_desc.set_dtype(var_dtype)
-        return IrVarNode(self.graph.create_var_node(var_desc))
-
-    def create_var_node_from_desc(self, var_desc):
-        """
-        Create a variable node by using an existing VarDesc in the graph.
-        Depend on the giving VarDesc, the created variable node may be persistable.
-
-        Args:
-            var_desc(core.VarDesc): the giving variable description.
-
-        Returns:
-            IrVarNode: the created variable node.
-        """
-        return IrVarNode(self.graph.create_var_node(var_desc))
-
-    def create_op_node(self, op_type, attrs, inputs, outputs):
-        """
-        Create a operator node in the graph.
-
-        Args:
-            op_type(str): the type of the operator node.
-            attrs(dict): the attributes of the operator node.
-            inputs(dict): the inputs of the operator node.
-            outputs(dict): the outpus of the operator node.
-
-        Returns:
-            IrOpNode: the created operator node.
-        """
-        op_desc = core.OpDesc()
-        op_desc.set_type(op_type)
-        for attr, value in six.iteritems(attrs):
-            self._update_desc_attr(op_desc, attr, value)
-        for input_name, var_nodes in six.iteritems(inputs):
-            if not isinstance(var_nodes, list):
-                var_nodes = [var_nodes]
-            op_desc.set_input(input_name,
-                              [var_node.name() for var_node in var_nodes])
-        for output_name, var_nodes in six.iteritems(outputs):
-            if not isinstance(var_nodes, list):
-                var_nodes = [var_nodes]
-            op_desc.set_output(output_name,
-                               [var_node.name() for var_node in var_nodes])
-        return IrOpNode(self.graph.create_op_node(op_desc))
-
-    def create_op_node_from_desc(self, op_desc):
-        """
-        Create a operator node by using an existing OpDesc in the graph.
-
-        Args:
-            op_desc(core.VarDesc): the giving operator description.
-
-        Returns:
-            IrOpNode: the created operator node.
-        """
-        return IrOpNode(self.graph.create_op_node(op_desc))
-
-    def update_input_link(self, old_input_node, new_input_node, op_node):
-        """
-        Update the input's link of a operator node.
-
-        Args:
-            old_input_node(IrNode): the old input node of the giving op_node.
-            new_input_node(IrNode): the new input node of the giving op_node.
-            op_node(IrOpNode): the operator node that is needed to update input's link.
-        """
-        assert old_input_node.node in self.graph.nodes() and new_input_node.node in \
-        self.graph.nodes() and op_node.node in self.graph.nodes(), \
-        'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
-        old_input_node.remove_output(op_node)
-        op_node.remove_input(old_input_node)
-        new_input_node.append_output(op_node)
-        op_node.append_input(new_input_node)
-        op_node.rename_input(old_input_node.name(), new_input_node.name())
-
-    def update_output_link(self, old_output_node, new_output_node, op_node):
-        """
-        Update the output's link of an operator node.
-
-        Args:
-            old_output_node(IrNode): the old output node of the giving op_node.
-            new_output_node(IrNode): the new output node of the giving op_node.
-            op_node(IrOpNode): the operator node that is needed to update input's link.
-        """
-        assert old_output_node.node in self.graph.nodes() and new_output_node.node in \
-        self.graph.nodes() and op_node.node in self.graph.nodes(), \
-        'The three arguments(old_output_node &new_output_node &op_node) must be in the graph nodes.'
-        old_output_node.remove_input(op_node)
-        op_node.remove_output(old_output_node)
-        new_output_node.append_input(op_node)
-        op_node.append_output(new_output_node)
-        op_node.rename_output(old_output_node.name(), new_output_node.name())
-
-    def link_to(self, node_in, node_out):
-        """
-        Connect two nodes.
-
-        Args:
-            node_in(IrNode): the input node.
-            node_out(IrNode): the output node.
-        """
-        assert node_in.node in self.graph.nodes() and node_out.node in self.graph.nodes(), \
-            'The two arguments(node_in&node_out) must be in the graph nodes.'
-        node_in.append_output(node_out)
-        node_out.append_input(node_in)
-
-    def safe_remove_nodes(self, remove_nodes):
-        """
-        Remove nodes safely since links connected to these removed nodes are
-        also removed.
-
-        Args:
-            remove_nodes(set): the nodes prepared to be removed.
-        """
-        if not isinstance(remove_nodes, set):
-            if isinstance(remove_nodes, Iterable):
-                remove_nodes = set(remove_nodes)
-            else:
-                remove_nodes = {remove_nodes}
-        original_nodes = {n.node for n in remove_nodes}
-        core.graph_safe_remove_nodes(self.graph, original_nodes)
-
-    def resolve_hazard(self):
-        ordered_nodes = core.topology_sort(self.graph)
-        var_nodes = dict()
-        for node in ordered_nodes:
-            if node.is_op() and node.op() is not None:
-                for each_var_name in node.op().input_arg_names():
-                    if each_var_name not in var_nodes:
-                        var_nodes[each_var_name] = [
-                            self._find_node_by_name(node.inputs, each_var_name)
-                        ]
-                for each_var_name in node.op().output_arg_names():
-                    if each_var_name not in var_nodes:
-                        var_nodes[each_var_name] = [
-                            self._find_node_by_name(node.outputs, each_var_name)
-                        ]
-                    else:
-                        var_nodes[each_var_name].append(
-                            self._find_node_by_name(node.outputs,
-                                                    each_var_name))
-        self.graph.resolve_hazard(var_nodes)
-
-    def has_circle(self):
-        """
-        Check if the graph has a circle.
-
-        Returns:
-            bool: True if the graph has a circle else False.
-        """
-        return core.has_circle(self.graph)
-
-    def graph_num(self):
-        """
-        Count the number of unconnected graphs in this graph.
-
-        Returns:
-            int: the number of unconnected graphs.
-        """
-        return core.graph_num(self.graph)
-
-    def topology_sort(self):
-        """
-        Perform the topology sort operation on the graph.
-
-        Notes: the `graph` cannot contain a circle.
-
-        Returns:
-            list(IrNode): nodes in topology order.
-        """
-        ordered_nodes = core.topology_sort(self.graph)
-        return [IrNode(n) for n in ordered_nodes]
-
-    def build_adjacency_list(self):
-        """
-        Build an adjacency list of operations for the `graph`.
-
-        Returns:
-            dict{IrNode: set(IrNode)}: the adjacency list.
-        """
-        adj_list = core.build_adjacency_list(self.graph)
-        wrapped_adj_list = dict()
-        for k, v in six.iteritems(adj_list):
-            wrapped_adj_list[IrNode(k)] = {IrNode(n) for n in v}
-        return wrapped_adj_list
-
-    def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True):
-        """
-        Draw the graph. If `dot` command is installed, the drawn graph
-        will be saved as pdf file type, otherwise dot file type is used.
-
-        Args:
-            save_path(str): the save path of drawn graph.
-            name(str): the name of drawn graph.
-            marked_nodes(set(IrNode)): nodes that are needed to be marked.
-            Default value is None.
-            remove_ctr_var(bool): If it is set True, all control variable nodes
-            in the graph will be removed. Default value is True.
-        """
-
-        def _convert_to_pdf(dot_file_path):
-            pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
-            exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
-                            + ' -o ' + pdf_save_path, shell=True)
-            if exited_code != 0:
-                print('The dot command is needed for creating pdf files.')
-                print('The {} is saved as the dot filetype.'.format(
-                    dot_file_path))
-
-        remove_ctr_vars = set()
-        if remove_ctr_var:
-            for node in self.all_var_nodes():
-                if node.is_ctrl_var():
-                    remove_ctr_vars.add(node)
-            self.safe_remove_nodes(remove_ctr_vars)
-        print('Total ops num = {}.'.format(len(self.all_op_nodes())))
-
-        if marked_nodes is not None:
-            if not isinstance(marked_nodes, set):
-                if isinstance(marked_nodes, Iterable):
-                    marked_nodes = set(marked_nodes)
-                else:
-                    marked_nodes = {marked_nodes}
-            marked_nodes = {n.node for n in marked_nodes}
-            remove_ctr_vars = {n.node for n in remove_ctr_vars}
-            marked_nodes = marked_nodes - remove_ctr_vars
-            if self.graph.has('__graphviz__marked_node__'):
-                self.graph.erase('__graphviz__marked_node__')
-            self.graph.set('__graphviz__marked_node__', marked_nodes)
-        if not os.path.exists(save_path):
-            os.makedirs(save_path)
-        viz_dot_path = os.path.join(save_path, name) + '.dot'
-        viz_pass = core.get_pass('graph_viz_pass')
-        viz_pass.set('graph_viz_path', viz_dot_path)
-        viz_pass.apply(self.graph)
-        _convert_to_pdf(viz_dot_path)
-
-    def to_program(self):
-        """
-        Convert the graph into a Program.
-
-        WARN: When the graph includes backward operator nodes, the
-        conversion process may be failed. Usually, this function is
-        only used to convert a test graph.
-
-        Returns:
-            Program: a program converted from the graph.
-        """
-        convert_pass = core.get_pass('graph_to_program_pass')
-        desc = core.ProgramDesc()
-        convert_pass.set_not_owned('program', desc)
-        convert_pass.apply(self.graph)
-        program = Program._construct_from_desc(desc)
-        return program
-
-    def _find_node_by_name(self, nodes, node_name):
-        """
-        Find a node in the giving nodes set by the name.
-        """
-        target_node = None
-        for n in nodes:
-            if n.name() == node_name:
-                target_node = n
-        assert target_node is not None, "Cannot find the target node in the giving set."
-        return target_node
-
-    def _update_desc_attr(self, desc, name, val):
-        """
-        Update the value of desc's attribute by attribute's name.
-        """
-        if isinstance(val, Block):
-            desc.set_block_attr(name, val.desc)
-        elif isinstance(val, list) and val and all(
-                isinstance(v, Block) for v in val):
-            desc.set_blocks_attr(name, [v.desc for v in val])
-        elif isinstance(val, core.BlockDesc) or \
-                isinstance(val, core.ProgramDesc):
-            desc.set_serialized_attr(name, val.serialize_to_string())
-        else:
-            desc._set_attr(name, val)
-
-
-class Program(object):
-    """
-    Python Program. Beneath it is a ProgramDesc, which is used for
-    create c++ Program. A program is a self-contained programing
-    language like container. It has at least one Block, when the
-    control flow op like conditional_block, while_op is included,
-    it will contain nested block.
-    Please reference the framework.proto for details.
-
-    A set of Program usually contains startup program and main program.
-    A startup program is set to contain some initial work , and the main
-    program will contain the network structure and vars for train.
-
-    A set of Program can be used for test or train, in train program ,
-    Paddle will contain all content to build a train network,  in test
-    program Paddle will prune some content which is irrelevant to test, eg.
-    backward ops and vars.
-
-    Notes: we have default_startup_program and default_main_program
-    by default, a pair of them will shared the parameters.
-    The default_startup_program only run once to initialize parameters,
-    default_main_program run in every mini batch and adjust the weights.
-
-    Returns:
-        A empty program.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            main_program = fluid.Program()
-            startup_program = fluid.Program()
-            with fluid.program_guard(main_program=main_program, startup_program=startup_program):
-                x = fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
-                y = fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
-                z = fluid.layers.fc(name="fc", input=x, size=10, act="relu")
-
-            print("main program is: {}".format(main_program))
-            print("start up program is: {}".format(startup_program))
-
-    """
-
-    def __init__(self):
-        self.desc = core.ProgramDesc()
-        self.blocks = [Block(self, 0)]
-        self.current_block_idx = 0
-        self._seed = 0
-        self._current_role = core.op_proto_and_checker_maker.OpRole.Forward
-        self.__op_role_var = []
-
-        # for distribute training
-        # _is_distributed = True if under distributed training
-        self._is_distributed = False
-        # _is_chief = True if the trainer is the first one, usually No.0
-        self._is_chief = False
-        # _parameters_on_pservers records all the parameters distributed on parameter servers.
-        self._parameters_on_pservers = None
-        # _endpoints is a list about parameter servers ip:port, such as ["ip:port","ip:port"]
-        self._endpoints = []
-        # if current role is parameter server, the _ps_endpoint is its "ip:port"
-        self._ps_endpoint = None
-        # trainers_endpoints, it is used for distribution.
-        self._trainers_endpoints = []
-        # the distributed lookup table names
-        self._distributed_lookup_table = None
-
-        # use Deep gradient comrepssion or not
-        self._enable_dgc = False
-        self._use_lamb = False
-
-        self._nccl_comm_num = 1
-        self._use_hierarchical_allreduce = False
-        self._hierarchical_allreduce_inter_nranks = 0
-
-        # if this program has been optimized by distributed optimizer
-        # fleet_opt will be given a value
-        self._fleet_opt = None
-        self._program_config = None
-
-        # assigned if this program has been parsed by a pipeline optimizer
-        self._pipeline_opt = None
-
-        # appending gradients times
-        self._appending_grad_times = 0
-
-    @property
-    def _op_role(self):
-        """
-        The operator role. In a enum {Forward, Backward, Optimize}.
-
-        Notes: this is a low level API. It is used only for ParallelExecutor to
-        duplicate or schedule operator to devices.
-
-        For example, the forward operator should be executed on every device.
-        The backward operator should be executed on every device and the
-        parameter gradient of backward (use :code:`_op_role_var` to get this
-        variable) operator should be merged to one device. The optimization
-        operators should be executed on only one device and broadcast the
-        optimization result, i.e., the new parameter, to every other device.
-        """
-        return self._current_role
-
-    @_op_role.setter
-    def _op_role(self, role):
-        self._current_role = role
-
-    @property
-    def _op_role_var(self):
-        """
-        The auxiliary variables for :code:`_op_role` property.
-
-        See Also: :code:`Program._op_role`'s documentation for details.
-
-        Notes: This is a very low-level API. Users should not use it directly.
-        """
-        return self.__op_role_var
-
-    @contextlib.contextmanager
-    def _backward_role_guard(self):
-        tmp_role = self._current_role
-
-        OpRole = core.op_proto_and_checker_maker.OpRole
-        self._current_role = OpRole.Backward
-        yield
-        self._current_role = tmp_role
-
-    @signature_safe_contextmanager
-    def _optimized_guard(self, param_and_grads):
-        """
-        A with guard to set :code:`Optimization` :code:`OpRole` and
-        :code:`OpRoleVar` automatically.
-
-        Notes: This is a very low level API. Users should not use it directly.
-
-        Args:
-            param_and_grads(list): The variables (names) to be optimized.
-
-        Examples:
-
-            >>> import paddle.fluid as fluid
-            >>> p, g = backward(...)
-            >>> with program._optimized_guard([p,g]):
-            >>>     p = p - 0.001 * g
-        """
-        tmp_role = self._current_role
-        tmp_var = self.__op_role_var
-
-        OpRole = core.op_proto_and_checker_maker.OpRole
-        self._current_role = OpRole.Optimize
-        self.__op_role_var = [
-            var.name if isinstance(var, Variable) else var
-            for var in param_and_grads
-        ]
-        yield
-        self.__op_role_var = tmp_var
-        self._current_role = tmp_role
-
-    @signature_safe_contextmanager
-    def _lr_schedule_guard(self, is_with_opt=False):
-        """
-        A with guard to set :code:`LRSched` :code:`OpRole` and
-        :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
-        set to the target learning rate.
-
-        Notes: This is a very low level API. Users should not use it directly.
-
-        Args:
-            is_with_opt: Only set to true if these ops a in the middle
-                 of a bunch of optimize ops so that it can be treated
-                 correctly. For example, sgd->lr_op->sgd->lr_op->sgd.
-
-        Examples:
-
-            >>> import paddle.fluid as fluid
-            >>> p, g = backward(...)
-            >>> with program.lr_schedule_guard():
-            >>>     lr = lr * decay
-        """
-
-        tmp_role = self._current_role
-        tmp_var = self.__op_role_var
-
-        OpRole = core.op_proto_and_checker_maker.OpRole
-        self._current_role = OpRole.LRSched
-        if is_with_opt:
-            self._current_role = int(OpRole.LRSched) | int(OpRole.Optimize)
-        # TODO(typhoonzero): how to set target learning rate var
-        self.__op_role_var = []
-        yield
-        self.__op_role_var = tmp_var
-        self._current_role = tmp_role
-
-    def __str__(self):
-        """
-        Get the protobuf debug string of this Program.
-
-        Returns:
-            (str): The protobuf debug string.
-
-        Raises:
-            ValueError: If any of required fields is not set.
-        """
-        return self.to_string(True)
-
-    def to_string(self, throw_on_error, with_details=False):
-        """
-        To debug string.
-
-        Args:
-            throw_on_error(bool): raise Value error when any of required fields
-                is not set.
-
-            with_details(bool): True if more details about variables and
-                parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
-                to print.
-
-        Returns:
-            str : The debug string.
-
-        Raises:
-            ValueError: If any of required fields is not set and throw_on_error is
-                True.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.default_main_program()
-                prog_string = prog.to_string(throw_on_error=True, with_details=False)
-                print(prog_string)
-
-        """
-        assert isinstance(throw_on_error, bool) and isinstance(with_details,
-                                                               bool)
-        if with_details:
-            res_str = ""
-            for block in self.blocks:
-                res_str += block.to_string(throw_on_error, with_details)
-        else:
-            protostr = self.desc.serialize_to_string()
-            proto = framework_pb2.ProgramDesc.FromString(
-                six.binary_type(protostr))
-            res_str = _debug_string_(proto, throw_on_error)
-        return res_str
-
-    def _get_desc(self):
-        """
-        Get the C++ side of `ProgramDesc` object pointer. The C++ object is
-        exposed by :code:`pybind`.
-
-        Notes: This is a very low level API. Users should not use this API
-        directly.
-        """
-        return self.desc
-
-    def _version(self):
-        return self.desc._version()
-
-    def clone(self, for_test=False):
-        """
-        Create a new, duplicated program.
-
-
-        Some operators, e.g., :code:`batch_norm`, behave differently between
-        training and testing. They have an attribute, :code:`is_test`, to
-        control this behaviour. This method will change the :code:`is_test`
-        attribute of them to :code:`True` when :code:`for_test=True`.
-
-        * Set for_test to False when we want to clone the program for training.
-        * Set for_test to True when we want to clone the program for testing.
-          We will prune the backward and optimize part of the program when you
-          use :code:`clone` after :code:`Opimizer.minimize`, but we still
-          recommend you to use :code:`clone` before using :code:`Opimizer.minimize`.
-
-        Notes: 
-        1. :code:`Program.clone()` method DOES NOT clone :code:`py_reader`.
-        2. We recommend you to use :code:`clone(for_test=True)` before backward
-           and optimization. E.g.
-
-        .. code-block:: python
-
-            test_program = fluid.default_main_program().clone(for_test=True)
-            optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-            optimizer.minimize()
-
-        Args:
-            for_test(bool): True if change the :code:`is_test` attribute of
-                operators to :code:`True`.
-
-        Returns:
-            Program: The new, duplicated Program object.
-
-        Examples:
-
-        Notes: The Program Descs' order maybe different after :code:`clone` and
-        this will not affect your training or testing progress. In the following
-        example we give you an simple method :code:`print_prog(program)` to
-        print Program Descs inorder to make sure you have same print result
-        after :code:`clone`:
-
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                import six
-
-
-                def print_prog(prog):
-                    for name, value in sorted(six.iteritems(prog.block(0).vars)):
-                        print(value)
-                    for op in prog.block(0).ops:
-                        print("op type is {}".format(op.type))
-                        print("op inputs are {}".format(op.input_arg_names))
-                        print("op outputs are {}".format(op.output_arg_names))
-                        for key, value in sorted(six.iteritems(op.all_attrs())):
-                            if key not in ['op_callstack', 'op_role_var']:
-                                print(" [ attrs: {}:   {} ]".format(key, value))
-
-
-        1. To clone a test program, the sample code is:
-                .. code-block:: python
-
-                    import paddle.fluid as fluid
-                    import six
-
-                    def print_prog(prog):
-                        for name, value in sorted(six.iteritems(prog.block(0).vars)):
-                            print(value)
-                        for op in prog.block(0).ops:
-                            print("op type is {}".format(op.type))
-                            print("op inputs are {}".format(op.input_arg_names))
-                            print("op outputs are {}".format(op.output_arg_names))
-                            for key, value in sorted(six.iteritems(op.all_attrs())):
-                                if key not in ['op_callstack', 'op_role_var']:
-                                    print(" [ attrs: {}:   {} ]".format(key, value))
-
-                    train_program = fluid.Program()
-                    startup_program = fluid.Program()
-
-                    # startup_program is used to do some parameter init work,
-                    # and main program is used to hold the network
-                    with fluid.program_guard(train_program, startup_program):
-                        with fluid.unique_name.guard():
-                            img = fluid.layers.data(name='image', shape=[784])
-                            hidden = fluid.layers.fc(input=img, size=200, act='relu')
-                            hidden = fluid.layers.dropout(hidden, dropout_prob=0.5)
-                            loss = fluid.layers.cross_entropy(
-                                                      input=fluid.layers.fc(hidden, size=10, act='softmax'),
-                                        label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
-                            avg_loss = fluid.layers.mean(loss)
-                            test_program = train_program.clone(for_test=False)
-                    print_prog(test_program)
-
-                    # Due to parameter sharing usage for train and test, so we need to use startup program of train
-                    # instead of using test startup program, while nothing is in test's startup program
-
-                    # In Paddle Fluid we will share weights by using the same Variable name. In train and test program
-                    # all parameters will have the same name and this can make train and test program sharing parameters,
-                    # that's why we need to use startup program of train. And for startup program of test, it has nothing,
-                    # since it is a new program.
-
-                    with fluid.program_guard(train_program, startup_program):
-                        with fluid.unique_name.guard():
-                            sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                            sgd.minimize(avg_loss)
-
-
-        2. The clone method can be avoid if you create program for training and program for testing individually.
-                .. code-block:: python
-
-                    import paddle.fluid as fluid
-                    import six
-
-                    def print_prog(prog):
-                        for name, value in sorted(six.iteritems(prog.block(0).vars)):
-                            print(value)
-                        for op in prog.block(0).ops:
-                            print("op type is {}".format(op.type))
-                            print("op inputs are {}".format(op.input_arg_names))
-                            print("op outputs are {}".format(op.output_arg_names))
-                            for key, value in sorted(six.iteritems(op.all_attrs())):
-                                if key not in ['op_callstack', 'op_role_var']:
-                                    print(" [ attrs: {}:   {} ]".format(key, value))
-                    def network(is_test):
-                        img = fluid.layers.data(name='image', shape=[784])
-                        hidden = fluid.layers.fc(input=img, size=200, act='relu')
-                        hidden = fluid.layers.dropout(hidden, dropout_prob=0.5)
-                        loss = fluid.layers.cross_entropy(
-                            input=fluid.layers.fc(hidden, size=10, act='softmax'),
-                            label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
-                        avg_loss = fluid.layers.mean(loss)
-                        return avg_loss
-
-
-                    train_program_2 = fluid.Program()
-                    startup_program_2 = fluid.Program()
-                    test_program_2 = fluid.Program()
-                    with fluid.program_guard(train_program_2, startup_program_2):
-                        with fluid.unique_name.guard():
-                             sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                             sgd.minimize(avg_loss)
-                    # the test startup program is not used.
-                    with fluid.program_guard(test_program_2, fluid.Program()):
-                        with fluid.unique_name.guard():
-                            loss = network(is_test=True)
-                    print(test_program_2)
-
-        The two code snippets above will generate and print same programs.
-        """
-        if for_test:
-            if self._appending_grad_times > 0:
-                forward_prog = Program()
-                forward_prog.desc = core.prune_backward(self.desc)
-                forward_prog.blocks = [
-                    Block(forward_prog, i)
-                    for i in six.moves.range(forward_prog.desc.num_blocks())
-                ]
-                forward_prog._sync_with_cpp()
-                p = forward_prog._inference_optimize(prune_read_op=False)
-            else:
-                p = self._inference_optimize(prune_read_op=False)
-        else:
-            p = Program()
-            p.current_block_idx = self.current_block_idx
-            p._seed = self._seed
-            p.desc = core.ProgramDesc(self.desc)
-            p.blocks = [
-                Block(p, i) for i in six.moves.range(self.desc.num_blocks())
-            ]
-
-            p._current_role = self._current_role
-            p.__op_role_var = self.__op_role_var
-            p._appending_grad_times = self._appending_grad_times
-
-            p._sync_with_cpp()
-
-        p._copy_param_info_from(self)
-        p._copy_data_info_from(self)
-        p._copy_dist_param_info_from(self)
-        return p
-
-    def _prune(self, feeded_var_names, targets):
-        """
-        Prune operators and variables which are not needed to generate
-        :code:`targets`.
-
-        Notes: This is a very low level API. Users should not use this API
-        directly. This API is in flux and not stable.
-
-        Args:
-            targets(list|Variable|Operator): A list of variables or operators
-                need to be pruned
-
-        Returns:
-            Program:  A new, pruned program.
-
-        """
-        if not isinstance(feeded_var_names, list):
-            feeded_var_names = [feeded_var_names]
-        if not isinstance(targets, list):
-            targets = [targets]
-
-        for var in feeded_var_names:
-            if not isinstance(var, six.string_types):
-                raise ValueError("All feeded_var_names of prune() can only be "
-                                 "str.")
-
-        targets_idx = []
-        for t in targets:
-            if not isinstance(t, Operator):
-                if isinstance(t, Variable):
-                    # After transpiler processing, the op that output this
-                    # variable maybe has been changed, so t.op is not reliable
-                    # and we need to find the current op that generate this
-                    # variable here.
-                    t.op = None
-                    global_block = self.global_block()
-                    for idx, op in enumerate(global_block.ops):
-                        if t.name in op.output_arg_names:
-                            t.op = op
-                            break
-
-                    t = t.op
-                    if t is None:
-                        raise ValueError(
-                            "The target variable must have an "
-                            "associated operator that generates it.")
-                else:
-                    raise ValueError("All targets of prune() can only be "
-                                     "Variable or Operator.")
-
-            targets_idx.append([t.block.idx, t.idx])
-        res = Program()
-        res.desc = core.prune(self.desc, set(feeded_var_names), targets_idx)
-        res.blocks = [
-            Block(res, i) for i in six.moves.range(res.desc.num_blocks())
-        ]
-        res._sync_with_cpp()
-        return res
-
-    def _inference_optimize(self, prune_read_op=True):
-        """
-        This method will create a new program and do following adjustments on it:
-        1. Remove all reader variables and their creator ops if exist.
-
-        2. Remove the :code:`read_op` if exists.
-
-        3. change the :code:`is_test`
-        attribute of operators to :code:`True`. All the :code:`Parameter`
-        information will be lost.
-
-        Args:
-            prune_read_op(bool): remove the read ops that are added by py_reader
-                                 for cpp inference library
-
-        Notes: This API is a very low level API. Use
-        :code:`Program.clone(for_test=True)` instead.
-
-        Returns:
-            Program: The new program.
-        """
-        res = Program()
-        res.desc = core.ProgramDesc(self.desc)
-
-        # remove all readers and the read_op if exist
-        read_op_idx = 0
-        root_block = res.desc.block(0)
-        if prune_read_op:
-            while True:
-                if read_op_idx >= root_block.op_size() or root_block.op(
-                        read_op_idx).type() == 'read':
-                    break
-                read_op_idx += 1
-            if read_op_idx < root_block.op_size():
-                root_block._remove_op(0, read_op_idx + 1)
-            for var in root_block.all_vars():
-                if var.type() == core.VarDesc.VarType.READER:
-                    root_block._remove_var(cpt.to_bytes(var.name()))
-
-        # change all `is_test` attributes to True
-        for i in six.moves.range(res.desc.num_blocks()):
-            block = res.desc.block(i)
-            for j in six.moves.range(block.op_size()):
-                op = block.op(j)
-                if op.has_attr('is_test'):
-                    op._set_attr('is_test', True)
-        res.blocks = [
-            Block(res, i) for i in six.moves.range(res.desc.num_blocks())
-        ]
-        res._sync_with_cpp()
-        return res
-
-    @staticmethod
-    def parse_from_string(binary_str):
-        """
-        Deserialize a program desc from protobuf binary string.
-
-        Notes: All information about parameters will be lost after serialization
-        and deserialization.
-
-        Args:
-            binary_str_type(str): The binary prootbuf string.
-
-        Returns:
-            Program: A deserialized program desc.
-        """
-        p = Program()
-        p.desc = core.ProgramDesc(binary_str)
-        p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())]
-        p._sync_with_cpp()
-        return p
-
-    @staticmethod
-    def _construct_from_desc(desc):
-        """
-        Construct a program from program desc.
-
-        Args:
-            desc(core.ProgramDesc): The program desc for constructing.
-
-        Returns:
-            Program: A program.
-        """
-        p = Program()
-        p.desc = desc
-        p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())]
-        p._sync_with_cpp()
-        return p
-
-    @property
-    def random_seed(self):
-        """
-        The default random seed for random operators in Program. Zero means get
-        the random seed from random device.
-
-        Notes: It must be set before the operators have been added.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.default_main_program()
-                random_seed = prog.random_seed
-                print(random_seed)
-                prog.random_seed = 1
-                print(prog.random_seed)
-        """
-        return self._seed
-
-    @property
-    def num_blocks(self):
-        """
-        The number of blocks in this program.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.default_main_program()
-                num_blocks = prog.num_blocks
-                print(num_blocks)
-        """
-        return self.desc.num_blocks()
-
-    @random_seed.setter
-    def random_seed(self, seed):
-        if not isinstance(seed, int):
-            raise ValueError("Seed must be a integer.")
-        self._seed = seed
-
-    def __repr__(self):
-        return self.__str__()
-
-    def global_block(self):
-        """
-        Get the first block of this program.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.default_main_program()
-                gb_block = prog.global_block()
-                print(gb_block)
-        """
-        return self.blocks[0]
-
-    def block(self, index):
-        """
-        Get the :code:`index` block of this program
-        Args:
-            index(int): The index of block to get
-
-        Returns:
-            Block: The :code:`index` block
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.default_main_program()
-                block_0 = prog.block(0)
-                print(block_0)
-        """
-        return self.blocks[index]
-
-    def current_block(self):
-        """
-        Get the current block. The :code:`current` block is the block to append
-        operators.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.default_main_program()
-                current_blk = prog.current_block()
-                print(current_blk)
-        """
-        return self.blocks[self.current_block_idx]
-
-    def _create_block(self, parent_idx=None):
-        """
-        Create a new block with the :code:`parent_idx` and change the current block
-        to new block.
-
-        Args:
-            parent_idx(int): The parent block index.
-
-        Returns:
-            Block: The new block.
-        """
-        new_block_idx = len(self.blocks)
-        parent = self.current_block() if parent_idx is None else self.block(
-            parent_idx)
-        self.desc.append_block(parent.desc)
-        self.current_block_idx = new_block_idx
-        self.blocks.append(Block(self, self.current_block_idx))
-        return self.current_block()
-
-    def _rollback(self):
-        """
-        Exit a code block, i.e., roll back to the parent block.
-        Returns:
-            None
-        """
-        self.current_block_idx = self.current_block().parent_idx
-
-    def _sync_with_cpp(self):
-        """
-        Synchronize Python instance to its binding C++ object instance.
-        If the program is modified in C++ space, this method should be invoked.
-
-        Notes: This is a very low level API. Users should not invoke it
-        directly.
-
-        Returns:
-            None
-        """
-        for block_idx in range(len(self.blocks), self.desc.num_blocks()):
-            self.blocks.append(Block(self, block_idx))
-        for block in self.blocks:
-            block._sync_with_cpp()
-
-    def _copy_param_info_from(self, other):
-        """
-        Copy the information of parameters from other program.
-
-        Notes: This is a very low level API. Users should not invoke it
-        directly.
-
-        Args:
-            other(Program): Other program
-
-        Returns:
-            None
-        """
-        if not isinstance(other, Program):
-            raise TypeError("_copy_param_info_from should be invoked with "
-                            "Program")
-
-        if len(self.blocks) != len(other.blocks):
-            raise ValueError("_copy_param_info_from should be invoked with two "
-                             "program, with represent the same topology")
-        self.global_block()._copy_param_info_from(other.global_block())
-
-    def _copy_dist_param_info_from(self, other):
-        """
-        Copy the information of distributed information from other program.
-
-        Args:
-            other(Program): Other program
-
-        Returns:
-            None
-        """
-        if not isinstance(other, Program):
-            raise TypeError("_copy_dist_param_info_from should be invoked with "
-                            "Program")
-        self._is_distributed = other._is_distributed
-        self._is_chief = other._is_chief
-        self._parameters_on_pservers = other._parameters_on_pservers
-        self._endpoints = other._endpoints
-        self._ps_endpoint = other._ps_endpoint
-        self._distributed_lookup_table = other._distributed_lookup_table
-
-    def _copy_data_info_from(self, other):
-        """
-        Copy the information of data variables from other program.
-
-        Notes: This is a very low level API. Users should not invoke it
-        directly.
-
-        Args:
-            other(Program): Other program
-
-        Returns:
-            None
-        """
-        if not isinstance(other, Program):
-            raise TypeError("_copy_param_info_from should be invoked with "
-                            "Program")
-
-        if len(self.blocks) != len(other.blocks):
-            raise ValueError("_copy_param_info_from should be invoked with two "
-                             "program, with represent the same topology")
-        for var in list(other.global_block().vars.values()):
-            if var.is_data:
-                self.global_block().var(var.name).is_data = True
-
-    def list_vars(self):
-        """
-        Get all variables from this Program. A iterable object is returned.
-
-        Returns:
-            iterable: The generator will yield every variable in this program.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.default_main_program()
-                img = fluid.layers.data(name='img', shape=[1,28,28], dtype='float32')
-                label = fluid.layers.data(name='label', shape=[128,1], dtype='int64')
-                for var in prog.list_vars():
-                    print(var)
-        """
-        for each_block in self.blocks:
-            for each_var in list(each_block.vars.values()):
-                yield each_var
-
-
-class Parameter(Variable):
-    """
-    Parameter is derived from Variable. A parameter is a persistable
-    Variable, and will be updated by optimizers after each iteration.
-    The training of a neural network is essentially the updating of
-    its parameters.
-
-    Relative to a general Variable, a Parameter has several its own
-    member variables:
-
-    Args:
-        trainable(bool): True if the parameter need to be updated after
-            iterations.
-        optimize_attr(map): Parameter attributes related with optimizing.
-            Currently, it only contains 'learning_rate'.
-            Default: {'learning_rate': 1.0}
-        regularizer(WeightDecayRegularizer): The Regularizer which will
-            be applied on the parameter. Default: None
-        gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy
-            which will be applied on the parameter. Default: None
-        do_model_average(bool): True if the model average strategy will
-            be applied on this parameter.
-    """
-
-    def __init__(self, block, shape, dtype, **kwargs):
-        if shape is None or dtype is None:
-            raise ValueError("Parameter must set shape and dtype")
-        if len(shape) == 0:
-            raise ValueError("Parameter shape cannot be empty")
-
-        for each in shape:
-            if each < 0:
-                raise ValueError("Parameter shape should not be related with "
-                                 "batch-size")
-
-        Variable.__init__(
-            self, block, persistable=True, shape=shape, dtype=dtype, **kwargs)
-        self.trainable = kwargs.get('trainable', True)
-
-        self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0})
-
-        self.regularizer = kwargs.get('regularizer', None)
-
-        self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
-
-        self.do_model_average = kwargs.get('do_model_average', None)
-
-        self.is_distributed = False
-
-    def __str__(self):
-        return self.to_string(True)
-
-    def to_string(self, throw_on_error, with_details=False):
-        """
-        To debug string.
-
-        Args:
-            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
-            with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
-
-        Returns(str): The debug string.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                prog = fluid.default_main_program()
-                rlt = fluid.layers.data("fake_data", shape=[1,1], dtype='float32')
-                debug_str = prog.to_string(throw_on_error=True, with_details=False)
-                print(debug_str)
-        """
-        assert isinstance(throw_on_error, bool) and isinstance(with_details,
-                                                               bool)
-        if with_details:
-            res_str = Variable.to_string(self, throw_on_error, True)
-            additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr", "do_model_average")
-            for attr_name in additional_attr:
-                res_str += "%s: %s\n" % (
-                    attr_name, six.binary_type(getattr(self, attr_name)))
-        else:
-            res_str = Variable.to_string(self, throw_on_error, False)
-        return res_str
-
-    __repr__ = __str__
-
-
-# program is a global instance.
-_main_program_ = Program()
-_startup_program_ = Program()
-
-
-def default_startup_program():
-    """
-    Get default/global startup program.
-
-    The layer function in :code:`fluid.layers` will create parameters, readers,
-    NCCL handles as global variables. The :code:`startup_program` will
-    initialize them by the operators in startup program. The layer function will
-    append these initialization operators into startup program.
-
-    This method will return the :code:`default` or the :code:`current` startup
-    program. Users can use :code:`fluid.program_guard` to switch program.
-
-    Returns:
-        Program: startup program
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            main_program = fluid.Program()
-            startup_program = fluid.Program()
-            with fluid.program_guard(main_program=main_program, startup_program=startup_program):
-                x = fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
-                y = fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
-                z = fluid.layers.fc(name="fc", input=x, size=10, act="relu")
-
-                print("main program is: {}".format(fluid.default_main_program()))
-                print("start up program is: {}".format(fluid.default_startup_program()))
-    """
-    return _startup_program_
-
-
-def default_main_program():
-    """
-    Get default/global main program. The main program is used for training or
-    testing.
-
-    All layer function in :code:`fluid.layers` will append operators and
-    variables to the :code:`default_main_program`.
-
-    The :code:`default_main_program` is the default program in a lot of APIs.
-    For example, the :code:`Executor.run()` will execute the
-    :code:`default_main_program` when the program is not specified.
-
-    Returns:
-        Program: main program
-
-    Examples:
-        ..  code-block:: python
-
-            import paddle.fluid as fluid
-            
-            # Sample Network:
-            data = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            
-            conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None)
-            bn1 = fluid.layers.batch_norm(conv1, act='relu')
-            pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
-            conv2 = fluid.layers.conv2d(pool1, 16, 5, 1, act=None)
-            bn2 = fluid.layers.batch_norm(conv2, act='relu')
-            pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)
-            
-            fc1 = fluid.layers.fc(pool2, size=50, act='relu')
-            fc2 = fluid.layers.fc(fc1, size=102, act='softmax')
-            
-            loss = fluid.layers.cross_entropy(input=fc2, label=label)
-            loss = fluid.layers.mean(loss)
-            opt = fluid.optimizer.Momentum(
-                learning_rate=0.1,
-                momentum=0.9,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            opt.minimize(loss)
-            
-            print(fluid.default_main_program().num_blocks)
-            print(fluid.default_main_program().blocks[0].var('image'))
-    """
-    return _main_program_
-
-
-def switch_main_program(program):
-    """
-    Switch the main program to a new program.
-
-    Args:
-        program(Program): The new main program
-
-    Returns:
-        Program: The previous main program
-    """
-    global _main_program_
-    prev_program = _main_program_
-    _main_program_ = program
-    return prev_program
-
-
-def switch_startup_program(program):
-    """
-    Switch the startup program to a new program
-    Args:
-        program(Program): The new startup program
-
-    Returns:
-        Program: The previous startup program
-    """
-    global _startup_program_
-    prev_program = _startup_program_
-    _startup_program_ = program
-    return prev_program
-
-
-@signature_safe_contextmanager
-def program_guard(main_program, startup_program=None):
-    """
-    Change the global main program and startup program with `"with"` statement.
-    Layer functions in the Python `"with"` block will append operators and
-    variables to the new main programs.
-
-    Examples:
-       .. code-block:: python
-       
-         import paddle.fluid as fluid
-
-         main_program = fluid.Program()
-         startup_program = fluid.Program()
-         with fluid.program_guard(main_program, startup_program):
-             data = fluid.layers.data(name='image', shape=[784, 784], dtype='float32')
-             hidden = fluid.layers.fc(input=data, size=10, act='relu')
-
-    Notes: The temporary :code:`Program` can be used if the user does not need
-    to construct either of startup program or main program.
-
-    Examples:
-       .. code-block:: python
-
-         import paddle.fluid as fluid
-
-         main_program = fluid.Program()
-         # does not care about startup program. Just pass a temporary value.
-         with fluid.program_guard(main_program, fluid.Program()):
-             data = fluid.layers.data(name='image', shape=[784, 784], dtype='float32')
-
-    Args:
-        main_program(Program): New main program inside `"with"` statement.
-        startup_program(Program): New startup program inside `"with"` statement.
-            None means not changing startup program.
-    """
-    if not isinstance(main_program, Program):
-        raise TypeError("main_program should be Program")
-    main_program = switch_main_program(main_program)
-    if startup_program is not None:
-        if not isinstance(startup_program, Program):
-            raise TypeError("startup_program should be Program")
-        startup_program = switch_startup_program(startup_program)
-    yield
-    switch_main_program(main_program)
-    if startup_program is not None:
-        switch_startup_program(startup_program)
-
-
-def _get_var(name, program=None):
-    """
-    Get a variable by name from the global block of a program.
-
-    Args:
-        name(str): name of the variable
-        program(Program|None): program object.
-        If None, default_global_program() will be used.
-
-    Returns:
-        Variable
-    """
-    if program is None:
-        program = default_main_program()
-    assert isinstance(name, str)
-    assert isinstance(program, Program)
-
-    return program.global_block().var(name)
-
-
-@signature_safe_contextmanager
-def _dygraph_guard(tracer):
-    global _dygraph_tracer_
-    tmp_trace = _dygraph_tracer_
-    _dygraph_tracer_ = tracer
-
-    yield
-
-    _dygraph_tracer_ = tmp_trace
-
-
-@signature_safe_contextmanager
-def _dygraph_place_guard(place):
-    global _dygraph_current_expected_place_
-    tmp_place = _dygraph_current_expected_place_
-    _dygraph_current_expected_place_ = place
-
-    yield
-
-    _dygraph_current_expected_place_ = tmp_place
diff --git a/python/paddle/fluid/graphviz.py b/python/paddle/fluid/graphviz.py
deleted file mode 100644
index 2b18d854d18bcbebce2a0eb30b8690db49d9d246..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/graphviz.py
+++ /dev/null
@@ -1,272 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import random
-import six
-import functools
-import subprocess
-import logging
-
-
-def crepr(v):
-    if isinstance(v, six.string_types):
-        return '"%s"' % v
-    return str(v)
-
-
-class Rank(object):
-    def __init__(self, kind, name, priority):
-        '''
-        kind: str
-        name: str
-        priority: int
-        '''
-        self.kind = kind
-        self.name = name
-        self.priority = priority
-        self.nodes = []
-
-    def __str__(self):
-        if not self.nodes:
-            return ''
-
-        return '{' + 'rank={};'.format(self.kind) + \
-               ','.join([node.name for node in self.nodes]) + '}'
-
-
-class Graph(object):
-    rank_counter = 0
-
-    def __init__(self, title, **attrs):
-        self.title = title
-        self.attrs = attrs
-        self.nodes = []
-        self.edges = []
-        self.rank_groups = {}
-
-    def code(self):
-        return self.__str__()
-
-    def rank_group(self, kind, priority):
-        name = "rankgroup-%d" % Graph.rank_counter
-        Graph.rank_counter += 1
-        rank = Rank(kind, name, priority)
-        self.rank_groups[name] = rank
-        return name
-
-    def node(self, label, prefix, description="", **attrs):
-        node = Node(label, prefix, description, **attrs)
-
-        if 'rank' in attrs:
-            rank = self.rank_groups[attrs['rank']]
-            del attrs['rank']
-            rank.nodes.append(node)
-        self.nodes.append(node)
-        return node
-
-    def edge(self, source, target, **attrs):
-        edge = Edge(source, target, **attrs)
-        self.edges.append(edge)
-        return edge
-
-    def compile(self, dot_path):
-        file = open(dot_path, 'w')
-        file.write(self.__str__())
-        image_path = os.path.join(
-            os.path.dirname(dot_path), dot_path[:-3] + "pdf")
-        cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
-        subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
-        logging.warning("write block debug graph to {}".format(image_path))
-        return image_path
-
-    def show(self, dot_path):
-        image = self.compile(dot_path)
-        cmd = ["open", image]
-        subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
-
-    def _rank_repr(self):
-        ranks = sorted(
-            six.iteritems(self.rank_groups),
-            key=functools.cmp_to_key(
-                lambda a, b: a[1].priority > b[1].priority))
-        repr = []
-        for x in ranks:
-            repr.append(str(x[1]))
-        return '\n'.join(repr) + '\n'
-
-    def __str__(self):
-        reprs = [
-            'digraph G {',
-            'title = {}'.format(crepr(self.title)),
-        ]
-
-        for attr in self.attrs:
-            reprs.append("{key}={value};".format(
-                key=attr, value=crepr(self.attrs[attr])))
-
-        reprs.append(self._rank_repr())
-
-        random.shuffle(self.nodes)
-        reprs += [str(node) for node in self.nodes]
-
-        for x in self.edges:
-            reprs.append(str(x))
-
-        reprs.append('}')
-        return '\n'.join(reprs)
-
-
-class Node(object):
-    counter = 1
-
-    def __init__(self, label, prefix, description="", **attrs):
-        self.label = label
-        self.name = "%s_%d" % (prefix, Node.counter)
-        self.description = description
-        self.attrs = attrs
-        Node.counter += 1
-
-    def __str__(self):
-        reprs = '{name} [label={label} {extra} ];'.format(
-            name=self.name,
-            label=self.label,
-            extra=',' + ','.join("%s=%s" % (key, crepr(value))
-                                 for key, value in six.iteritems(self.attrs))
-            if self.attrs else "")
-        return reprs
-
-
-class Edge(object):
-    def __init__(self, source, target, **attrs):
-        '''
-        Link source to target.
-        :param source: Node
-        :param target: Node
-        :param graph: Graph
-        :param attrs: dic
-        '''
-        self.source = source
-        self.target = target
-        self.attrs = attrs
-
-    def __str__(self):
-        repr = "{source} -> {target} {extra}".format(
-            source=self.source.name,
-            target=self.target.name,
-            extra="" if not self.attrs else
-            "[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
-                           for attr in six.iteritems(self.attrs)) + "]")
-        return repr
-
-
-class GraphPreviewGenerator(object):
-    '''
-    Generate a graph image for ONNX proto.
-    '''
-
-    def __init__(self, title):
-        # init graphviz graph
-        self.graph = Graph(
-            title,
-            layout="dot",
-            concentrate="true",
-            rankdir="TB", )
-
-        self.op_rank = self.graph.rank_group('same', 2)
-        self.param_rank = self.graph.rank_group('same', 1)
-        self.arg_rank = self.graph.rank_group('same', 0)
-
-    def __call__(self, path='temp.dot', show=False):
-        if not show:
-            self.graph.compile(path)
-        else:
-            self.graph.show(path)
-
-    def add_param(self, name, data_type, highlight=False):
-        label = '\n'.join([
-            '<<table cellpadding="5">',
-            '  <tr>',
-            '    <td bgcolor="#2b787e">',
-            '    <b>',
-            name,
-            '    </b>',
-            '    </td>',
-            '  </tr>',
-            '  <tr>',
-            '    <td>',
-            str(data_type),
-            '    </td>'
-            '  </tr>',
-            '</table>>',
-        ])
-        return self.graph.node(
-            label,
-            prefix="param",
-            description=name,
-            shape="none",
-            style="rounded,filled,bold",
-            width="1.3",
-            color="#148b97" if not highlight else "orange",
-            fontcolor="#ffffff",
-            fontname="Arial")
-
-    def add_op(self, opType, **kwargs):
-        highlight = False
-        if 'highlight' in kwargs:
-            highlight = kwargs['highlight']
-            del kwargs['highlight']
-        return self.graph.node(
-            "<<B>%s</B>>" % opType,
-            prefix="op",
-            description=opType,
-            shape="box",
-            style="rounded, filled, bold",
-            color="#303A3A" if not highlight else "orange",
-            fontname="Arial",
-            fontcolor="#ffffff",
-            width="1.3",
-            height="0.84", )
-
-    def add_arg(self, name, highlight=False):
-        return self.graph.node(
-            crepr(name),
-            prefix="arg",
-            description=name,
-            shape="box",
-            style="rounded,filled,bold",
-            fontname="Arial",
-            fontcolor="#999999",
-            color="#dddddd" if not highlight else "orange")
-
-    def add_edge(self, source, target, **kwargs):
-        highlight = False
-        if 'highlight' in kwargs:
-            highlight = kwargs['highlight']
-            del kwargs['highlight']
-        return self.graph.edge(
-            source,
-            target,
-            color="#00000" if not highlight else "orange",
-            **kwargs)
diff --git a/python/paddle/fluid/incubate/__init__.py b/python/paddle/fluid/incubate/__init__.py
deleted file mode 100644
index 76c5c6391fde3cafbd9a94e1d11e0ef4401420ed..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-# incubate directory is mainly for internal use
-# after we have tested incubate APIs in industrial application for a period
-# we will move stable functions into fluid
-__version__ = '0.1.0'
diff --git a/python/paddle/fluid/incubate/data_generator/test_data_generator.py b/python/paddle/fluid/incubate/data_generator/test_data_generator.py
deleted file mode 100644
index dcacd67e92a8886c17cfb09b1aa18e0e48f6b605..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/data_generator/test_data_generator.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-from __init__ import *
-
-
-class SyntheticData(MultiSlotDataGenerator):
-    def generate_sample(self, line):
-        def data_iter():
-            for i in range(10000):
-                yield ("words", [1, 2, 3, 4]), ("label", [0])
-
-        return data_iter
-
-
-class SyntheticStringData(MultiSlotStringDataGenerator):
-    def generate_sample(self, line):
-        def data_iter():
-            for i in range(10000):
-                yield ("words", ["1", "2", "3", "4"], ("label", ["0"]))
-
-
-sd = SyntheticData()
-sd.run_from_memory()
-
-sd2 = SyntheticStringData()
-sd.run_from_memory()
diff --git a/python/paddle/fluid/incubate/fleet/__init__.py b/python/paddle/fluid/incubate/fleet/__init__.py
deleted file mode 100644
index a05baabca392b14a4cb09a3f395ae7687d8a5e62..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-__version__ = '0.1.0'
diff --git a/python/paddle/fluid/incubate/fleet/base/__init__.py b/python/paddle/fluid/incubate/fleet/base/__init__.py
deleted file mode 100644
index 8647330f3290f3142cabca9a7e3fe162a9838dda..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/base/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
deleted file mode 100644
index 8e7cee1fb692810dab37ed9375b6cadae41f97e5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py
+++ /dev/null
@@ -1,345 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import abc
-
-import paddle.fluid as fluid
-from paddle.fluid.executor import Executor
-from paddle.fluid.optimizer import SGD
-
-from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
-from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
-from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker
-from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecison
-
-
-class Mode:
-    """
-    There are various mode for fleet, each of them is designed for different model.
-    """
-    TRANSPILER = 1
-    PSLIB = 2
-    COLLECTIVE = 3
-
-
-class Fleet(object):
-    """
-    Fleet is the base class, transpiler and pslib are implementation of Fleet.
-
-    Args:
-        mode(Mode): the implementation of Fleet's mode.
-
-    Returns:
-        None
-    """
-    __metaclass__ = abc.ABCMeta
-
-    def __init__(self, mode):
-        self._is_initialized = False
-        self._mode = mode
-        self._optimizer = None
-        self._role_maker = None
-        self._executor = None
-
-    def is_first_worker(self):
-        """
-        Check whether the node is the first instance of worker.
-
-        Returns:
-            bool: True if this is the first node of worker,
-                  False if not.
-        """
-        return self._role_maker.is_first_worker()
-
-    def worker_index(self):
-        """
-        Get current worker index.
-
-        Returns:
-            int: node id
-        """
-        return self._role_maker.worker_index()
-
-    def worker_num(self):
-        """
-        Get current total worker number.
-
-        Returns:
-            int: worker numbers
-        """
-        return self._role_maker.worker_num()
-
-    def is_worker(self):
-        """
-        Check whether the node is an instance of worker.
-
-        Returns:
-            bool: True if this is a node of worker,
-                  False if not.
-        """
-        return self._role_maker.is_worker()
-
-    def worker_endpoints(self, to_string=False):
-        """
-        Get current server endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"].
-
-        Returns:
-            list/string: server endpoints
-        """
-
-        if to_string:
-            return ",".join(self._role_maker.get_trainer_endpoints())
-        else:
-            return self._role_maker.get_trainer_endpoints()
-
-    def server_num(self):
-        """
-        Get current total worker number.
-
-        Returns:
-            int: server number
-        """
-        return len(self._role_maker.get_pserver_endpoints())
-
-    def server_index(self):
-        """
-        Get current server index.
-
-        Returns:
-            int: node id
-        """
-        return self._role_maker.server_index()
-
-    def server_endpoints(self, to_string=False):
-        """
-        Get current server endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"].
-
-        Returns:
-            list/string: server endpoints
-        """
-
-        if to_string:
-            return ",".join(self._role_maker.get_pserver_endpoints())
-        else:
-            return self._role_maker.get_pserver_endpoints()
-
-    def is_server(self):
-        """
-        Check whether the node is an instance of server.
-
-        Returns:
-            bool: True if this is a node of server,
-                  False if not.
-        """
-        return self._role_maker.is_server()
-
-    def split_files(self, files):
-        """
-        split files before distributed training,
-        example 1: files is [a, b, c ,d, e]  and trainer_num = 2, then trainer
-                   0 gets [a, b, c] and trainer 1 gets [d, e].
-        example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
-                   [a], trainer 1 gets [b],  trainer 2 gets []
-
-        Args:
-            files(list): file list need to be read.
-
-        Returns:
-            list: files belongs to this worker.
-        """
-        if not isinstance(files, list):
-            raise TypeError("files should be a list of file need to be read.")
-
-        trainer_id = self.worker_index()
-        trainers = self.worker_num()
-
-        remainder = len(files) % trainers
-        blocksize = len(files) / trainers
-
-        blocks = [blocksize] * trainers
-        for i in range(remainder):
-            blocks[i] += 1
-
-        trainer_files = [[]] * trainers
-        begin = 0
-        for i in range(trainers):
-            trainer_files[i] = files[begin:begin + blocks[i]]
-            begin += blocks[i]
-
-        return trainer_files[trainer_id]
-
-    def init(self, role_maker=None):
-        """
-        should be called only once in user's python scripts,
-        init() will initialize RoleMaker which is used for identifying
-            current node's role, e.g. worker, server, etc.
-
-        Args:
-            role_maker(RoleMakerBase): subclass of RoleMakerBase.
-
-        Returns:
-            None
-        """
-        self._executor = Executor(fluid.CPUPlace())
-
-        if role_maker and not isinstance(role_maker, RoleMakerBase):
-            raise TypeError("role_maker must be an instance of RoleMakerBase")
-
-        self._role_maker = role_maker
-        self._role_maker.generate_role()
-        self._is_initialized = True
-
-    @abc.abstractmethod
-    def init_worker(self):
-        pass
-
-    @abc.abstractmethod
-    def init_server(self, model_dir=None):
-        pass
-
-    @abc.abstractmethod
-    def run_server(self):
-        pass
-
-    @abc.abstractmethod
-    def stop_worker(self):
-        pass
-
-    @abc.abstractmethod
-    def distributed_optimizer(self, optimizer, strategy=None):
-        pass
-
-    @abc.abstractmethod
-    def save_inference_model(self,
-                             executor,
-                             dirname,
-                             feeded_var_names,
-                             target_vars,
-                             main_program=None,
-                             export_for_deployment=True):
-        pass
-
-    @abc.abstractmethod
-    def save_persistables(self, executor, dirname, main_program=None):
-        pass
-
-
-class DistributedOptimizer(object):
-    """
-    DistributedOptimizer is a wrapper for paddle.fluid.optimizer
-    A user should pass a paddle.fluid.optimizer to DistributedOptimizer
-    minimize() function is implemented.
-    DistributedOptimizer is the starting point for a user who wants to
-    run distributed training. The optimized information will be stored in
-    Fleet() instance who holds the global information about current distributed
-    training.
-
-    Args:
-        optimizer(Optimizer): subclass of Optimizer.
-        strategy(any): the user define config for Optimizer.
-
-    Returns:
-        None
-
-    """
-    __metaclass__ = abc.ABCMeta
-
-    def __init__(self, optimizer, strategy=None):
-        if not isinstance(optimizer, SGD.__bases__) \
-                 and not isinstance(optimizer, OptimizerWithMixedPrecison):
-            raise TypeError("optimizer must be an instance of Optimizer")
-
-        self._optimizer = optimizer
-        self._strategy = strategy
-
-    @abc.abstractmethod
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None):
-        """
-        First part of `minimize`, do auto-diff to append backward ops for
-        the current program.
-
-        Args:
-            loss (Variable): loss variable to run optimizations.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-            callbacks (list|None): list of callables to run when appending backward
-                operator for one parameter.
-
-        Return:
-            list: list of (param, grad) pair, grad is the output of backward.
-
-        Examples:
-            See examples in `apply_gradients`.
-        """
-        pass
-
-    @abc.abstractmethod
-    def apply_gradients(self, params_grads):
-        """
-        Second part of `minimize`, appending optimization operators for
-        given `params_grads` pairs.
-
-        Args:
-            params_grads (list): list of (param, grad) pair to do optimization.
-
-        Returns:
-            list: A list of operators appended to the current program.
-
-        Examples:
-            .. code-block:: python
-
-                loss = network()
-                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-                params_grads = optimizer.backward(loss)
-                # you may append operations for params_grads here
-                # ...
-                optimizer.apply_gradients(params_grads)
-        """
-        pass
-
-    @abc.abstractmethod
-    def minimize(self,
-                 losses,
-                 scopes=None,
-                 startup_programs=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        """
-        Add operations to minimize `loss` by updating `parameter_list`.
-
-        This method combines interface `backward()` and
-        `apply_gradients()` into one.
-
-        Args:
-            losses (Variable|Variable List): loss variable to run optimizations.
-            scopes (Scope| Scope List): scope instance.
-            startup_programs (Program|Program List): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-
-        Returns:
-            tuple: (optimize_ops, params_grads) which are, list of operators appended;
-            and list of (param, grad) Variables pair for optimization.
-        """
-        pass
diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py
deleted file mode 100644
index ab68a5248cf7f4b87b822fc5574228636e531768..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/base/role_maker.py
+++ /dev/null
@@ -1,553 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-__all__ = [
-    'Role', 'RoleMakerBase', 'MPISymetricRoleMaker', 'UserDefinedRoleMaker',
-    'UserDefinedCollectiveRoleMaker', 'PaddleCloudRoleMaker'
-]
-
-import os
-
-
-class Role:
-    WORKER = 1
-    SERVER = 2
-
-
-class RoleMakerBase(object):
-    """
-    RoleMakerBase is a base class for assigning a role to current process
-    in distributed training.
-    A paddle developer can implement RoleMakerBase to design a role maker
-    for worker or pserver assignment.
-    """
-
-    def __init__(self):
-        self._worker_endpoints = []
-        self._server_endpoints = []
-        self._role_is_generated = False
-        self._role = None
-        self._current_id = -1
-
-    def is_worker(self):
-        """
-        return is_worker() of current process
-        """
-        raise NotImplementedError("Please implement this method in child class")
-
-    def is_server(self):
-        """
-        return is_server() of current process
-        """
-        raise NotImplementedError("Please implement this method in child class")
-
-    def is_first_worker(self):
-        """
-        Check whether the node is the first instance of worker.
-        Returns:
-            bool: True if this is the first node of worker,
-                  False if not.
-        """
-        raise NotImplementedError("Please implement this method in child class")
-
-    def worker_num(self):
-        """
-        Get current total worker number.
-
-        Returns:
-            int: worker number
-        """
-        raise NotImplementedError("Please implement this method in child class")
-
-    def worker_index(self):
-        """
-        Get current worker id.
-
-        Returns:
-            int: node id
-        """
-        raise NotImplementedError("Please implement this method in child class")
-
-    def server_index(self):
-        """
-        Get current server id.
-
-        Returns:
-            int: node id
-        """
-        raise NotImplementedError("Please implement this method in child class")
-
-    def get_trainer_endpoints(self):
-        """
-        return trainer endpoints
-        """
-        return self._worker_endpoints
-
-    def get_pserver_endpoints(self):
-        """
-        return pserver endpoints
-        """
-        return self._server_endpoints
-
-    def to_string(self):
-        return "role: {}, current_id: {}, worker_endpoints: {}, server_endpoints: {}".format(
-            self._role, self._current_id, self._worker_endpoints,
-            self._server_endpoints)
-
-
-class MPIRoleMaker(RoleMakerBase):
-    """
-    MPIRoleMaker is a MPI-API based role maker which is a counter-part of K8SRoleMaker
-    mpi4py will be used if a developer inherits MPIRoleMaker
-    """
-
-    def __init__(self):
-        super(MPIRoleMaker, self).__init__()
-        from mpi4py import MPI
-        self.MPI = MPI
-        self._comm = MPI.COMM_WORLD
-        self._node_type_comm = None
-        self._ips = None
-        self._ip = None
-
-    def _get_rank(self):
-        """
-        return rank
-        """
-        self._rank = self._comm.Get_rank()
-        return self._rank
-
-    def _get_size(self):
-        """
-        return size
-        """
-        self._size = self._comm.Get_size()
-        return self._size
-
-    def _all_gather(self, obj):
-        """
-        all_gather(obj) will call MPI's allgather function
-        """
-        self._barrier_all()
-        return self._comm.allgather(obj)
-
-    def _worker_gather(self, obj):
-        """
-        worker_gather(obj) will call MPI's allgather function
-        """
-        if self.is_worker():
-            self._node_type_comm.barrier()
-            return self._node_type_comm.allgather(obj)
-        return None
-
-    def _barrier_all(self):
-        """
-        barrier_all() will call MPI's barrier_all function
-        """
-        self._comm.barrier()
-
-    def _finalize(self):
-        """
-        finalize the current MPI instance.
-        """
-        self.MPI.Finalize()
-
-    def _get_ips(self):
-        """
-        collect current distributed job's ip list
-        """
-        if not self._ips:
-            self._ips = self._comm.allgather(self.get_local_ip())
-        return self._ips
-
-    def get_local_ip(self):
-        """
-        return get local ip
-        """
-        import socket
-        self._ip = socket.gethostbyname(socket.gethostname())
-        return self._ip
-
-    def generate_role(self):
-        """
-        generate_role() should be called to identify current process's role
-        """
-        raise NotImplementedError("Please implement this method in child class")
-
-
-class MPISymetricRoleMaker(MPIRoleMaker):
-    """
-    MPISymetricRoleMaker is designed for worker and server assignment
-    under MPI. Typically, a worker and a server node will be appointed
-    on each physical node. This role maker can be only used under MPI.
-    """
-
-    def __init__(self):
-        super(MPISymetricRoleMaker, self).__init__()
-        self._node_type = None
-        self._proc_per_node = 2
-        self._pserver_rand_port = 0
-
-    def _check_role_generation(self):
-        if not self._role_is_generated:
-            raise NameError("generate_role() should be called first")
-        return True
-
-    def is_first_worker(self):
-        """
-        return whether current process is the first worker assigned by role maker
-        """
-        if self._check_role_generation():
-            return self.is_worker() and 0 == self.worker_index()
-        return False
-
-    def get_pserver_endpoints(self):
-        if self._pserver_rand_port <= 0:
-            import random
-            random.seed(self._server_num())
-            # port will be randomly generated from 60001 to 63999
-            # random seed is server num so that all nodes will get
-            # the same port
-            self._pserver_rand_port = random.randint(60001, 64000)
-        endpoints = [
-            x + ":" + str(self._pserver_rand_port)
-            for x in self._server_endpoints
-        ]
-        return endpoints
-
-    def worker_num(self):
-        return self._worker_num()
-
-    def is_worker(self):
-        """
-        return whether current process is worker assigned by role maker
-        """
-        if self._check_role_generation():
-            return self._node_type == 1
-        return False
-
-    def is_server(self):
-        """
-        return whether current process is server assigned by role maker
-        """
-        if self._check_role_generation():
-            return self._node_type == 0
-        return False
-
-    def _worker_num(self):
-        """
-        return the current number of worker
-        """
-        if self._check_role_generation():
-            if self.is_worker():
-                return self._get_size() / self._proc_per_node
-        return 0
-
-    def _server_num(self):
-        """
-        return the current number of server
-        """
-        if self._check_role_generation():
-            return self._get_size() / self._proc_per_node
-        else:
-            self.generate_role()
-            return self._get_size() / self._proc_per_node
-
-    def worker_index(self):
-        """
-        return the index of worker
-        """
-        if self._check_role_generation():
-            return self._rank / self._proc_per_node
-        else:
-            self.generate_role()
-            return self._get_size() / 2
-
-    def server_index(self):
-        """
-        return the index of server
-        """
-        if self._check_role_generation():
-            return self._rank / self._proc_per_node
-        else:
-            self.generate_role()
-            return self._get_size() / self._proc_per_node
-
-    def _barrier_worker(self):
-        """
-        barrier all workers in current distributed job
-        """
-        if self._check_role_generation():
-            if self.is_worker():
-                self._node_type_comm.barrier()
-        else:
-            raise Exception("You should check role generation first")
-
-    def _barrier_server(self):
-        """
-        barrier all servers in current distributed job
-        """
-        if self._check_role_generation():
-            if self.is_server():
-                self._node_type_comm.barrier()
-        else:
-            raise Exception("You should check role generation first")
-
-    def generate_role(self):
-        """
-        generate currently process's role
-        """
-        if not self._role_is_generated:
-            # TODO(guru4elephant): only allow to be called once
-            self._worker_endpoints = self._get_ips()[1::2]
-            self._server_endpoints = self._get_ips()[::2]
-
-            if 0 == self._get_rank() % self._proc_per_node % 2:
-                self._node_type = 0
-            else:
-                self._node_type = 1
-            self._node_type_comm = self._comm.Split(self._node_type)
-            self._role_is_generated = True
-        else:
-            raise Exception("You should check role generation first")
-
-
-class PaddleCloudRoleMaker(RoleMakerBase):
-    def __init__(self, is_collective=False):
-        super(PaddleCloudRoleMaker, self).__init__()
-        self._role_is_generated = False
-        self._is_collective = is_collective
-
-    def generate_role(self):
-        if not self._role_is_generated:
-            if not self._is_collective:
-                try:
-                    port = os.environ["PADDLE_PORT"]
-                    pserver_ips = os.environ["PADDLE_PSERVERS"].split(",")
-                    if "," in port:
-                        ports = port.split(",")
-                    else:
-                        ports = [port] * len(pserver_ips)
-                    eplist = []
-                    # note that, we usually assign the same port to different ips
-                    # if we run parameter server training in local mode
-                    # port should be different in environment variables
-                    for i, ip in enumerate(pserver_ips):
-                        eplist.append(':'.join([ip, ports[i]]))
-
-                    trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
-                    training_role = os.environ["TRAINING_ROLE"]
-
-                    if training_role not in ["TRAINER", "PSERVER"]:
-                        raise ValueError(
-                            "TRAINING_ROLE must be PSERVER or TRAINER")
-
-                    if training_role == "TRAINER":
-                        role = Role.WORKER
-                        current_id = int(os.environ["PADDLE_TRAINER_ID"])
-                    elif training_role == "PSERVER":
-                        role = Role.SERVER
-                        cur_ip = os.environ["POD_IP"]
-                        cur_idx = pserver_ips.index(cur_ip)
-                        current_id = eplist.index(":".join(
-                            [cur_ip, ports[cur_idx]]))
-                    else:
-                        raise ValueError(
-                            "TRAINING_ROLE must be PSERVER or TRAINER")
-                except ValueError as ve:
-                    raise ValueError(
-                        "something wrong with PaddleCloud, please check environment"
-                    )
-
-                self._trainers_num = trainers_num
-                self._server_endpoints = eplist
-                self._role = role
-                self._current_id = current_id
-            else:
-                self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
-                self._training_role = os.getenv("PADDLE_TRAINING_ROLE",
-                                                "TRAINER")
-                assert (self._training_role == "TRAINER")
-                self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
-                self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-                assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS"
-                self._worker_endpoints = self._worker_endpoints.split(",")
-                self._trainers_num = len(self._worker_endpoints)
-
-            self._role_is_generated = True
-
-    def get_pserver_endpoints(self):
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._server_endpoints
-
-    def is_worker(self):
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._role == Role.WORKER
-
-    def is_server(self):
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._role == Role.SERVER
-
-    def is_first_worker(self):
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._role == Role.WORKER and self._current_id == 0
-
-    def worker_index(self):
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._current_id
-
-    def server_index(self):
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._current_id
-
-    def worker_num(self):
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._trainers_num
-
-
-class UserDefinedRoleMaker(RoleMakerBase):
-    def __init__(self,
-                 current_id=0,
-                 role=Role.WORKER,
-                 worker_num=0,
-                 server_endpoints=None):
-        """
-        UserDefinedRoleMaker is designed for worker and server assignment
-        under manual. Typically, a worker and a server node will be appointed
-        on each physical node, It can be assign by user.
-        """
-        super(UserDefinedRoleMaker, self).__init__()
-
-        if not isinstance(server_endpoints, list):
-            raise TypeError("server_endpoints must be as string list")
-        elif len(server_endpoints) <= 0:
-            raise ValueError(
-                "the length of server_endpoints list must be greater than 0")
-        elif len(server_endpoints) != len(set(server_endpoints)):
-            raise ValueError("server_endpoints can't have duplicate elements")
-        else:
-            for server_endpoint in server_endpoints:
-                if not isinstance(server_endpoint, str):
-                    raise TypeError(
-                        "every element in server_endpoints list must be as string"
-                    )
-            self._server_endpoints = server_endpoints
-
-        if role != Role.WORKER and role != Role.SERVER:
-            raise TypeError("role must be as Role")
-        else:
-            self._role = role
-
-        if not isinstance(current_id, int):
-            raise TypeError("current_id must be as int")
-        else:
-            if current_id < 0:
-                raise ValueError(
-                    "current_id must be greater than or equal to 0")
-            elif self._role == Role.SERVER and current_id >= len(
-                    server_endpoints):
-                raise ValueError(
-                    "if role is Role.SERVER, current_id must be less than or equal to len(server_endpoints) - 1"
-                )
-            self._current_id = current_id
-
-        if not isinstance(worker_num, int):
-            raise TypeError("worker_num must be as int")
-        else:
-            if worker_num <= 0:
-                raise ValueError("worker_num must be greater than 0")
-            self._worker_num = worker_num
-
-    def generate_role(self):
-        self._role_is_generated = True
-
-    def is_worker(self):
-        return self._role == Role.WORKER
-
-    def is_server(self):
-        return self._role == Role.SERVER
-
-    def is_first_worker(self):
-        return self._role == Role.WORKER and self._current_id == 0
-
-    def worker_index(self):
-        return self._current_id
-
-    def server_index(self):
-        return self._current_id
-
-    def worker_num(self):
-        return self._worker_num
-
-
-class UserDefinedCollectiveRoleMaker(RoleMakerBase):
-    def __init__(self, current_id=0, worker_endpoints=None):
-        """
-        UserDefinedCollectiveRoleMaker is designed for worker assignment
-        under manual for collective mode.
-        """
-        super(UserDefinedCollectiveRoleMaker, self).__init__()
-
-        if not isinstance(worker_endpoints, list):
-            raise TypeError("worker_endpoints must be as string list")
-        elif len(worker_endpoints) <= 0:
-            raise ValueError(
-                "the length of worker_endpoints list must be greater than 0")
-        elif len(worker_endpoints) != len(set(worker_endpoints)):
-            raise ValueError("worker_endpoints can't have duplicate elements")
-        else:
-            for worker_endpoint in worker_endpoints:
-                if not isinstance(worker_endpoint, str):
-                    raise TypeError(
-                        "every element in worker_endpoints list must be as string"
-                    )
-            self._worker_endpoints = worker_endpoints
-
-        if not isinstance(current_id, int):
-            raise TypeError("current_id must be as int")
-        else:
-            if current_id < 0:
-                raise ValueError(
-                    "current_id must be greater than or equal to 0")
-            elif current_id >= len(worker_endpoints):
-                raise ValueError(
-                    "current_id must be less than or equal to len(worker_endpoints) - 1"
-                )
-            self._current_id = current_id
-
-        self._worker_num = len(self._worker_endpoints)
-
-    def generate_role(self):
-        self._role_is_generated = True
-
-    def is_worker(self):
-        return True
-
-    def is_first_worker(self):
-        return self._current_id == 0
-
-    def worker_index(self):
-        return self._current_id
-
-    def worker_num(self):
-        return self._worker_num
diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py
deleted file mode 100644
index 4f939deac66e88f9c0618e2f05918b138d2c574a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/collective/__init__.py
+++ /dev/null
@@ -1,374 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-import logging
-
-import paddle.fluid as fluid
-import paddle.fluid.io as io
-import paddle.fluid.transpiler.distribute_transpiler as dist_transpiler
-
-from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
-from paddle.fluid.incubate.fleet.base.fleet_base import Mode
-from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
-
-from paddle.fluid import compiler
-
-import os
-import sys
-import six
-
-
-class LambConfig(object):
-    def __init__(self):
-        pass
-
-
-class DistFCConfig(object):
-    def __init__(self):
-        pass
-
-
-class Collective(Fleet):
-    def __init__(self):
-        super(Collective, self).__init__(Mode.COLLECTIVE)
-        self._local_ip = 0
-
-        self.startup_program = None
-        self._origin_program = None
-        self._transpiled_program = None
-        self.main_program = None
-
-    def init_worker(self):
-        logging.warn(
-            "You should not call 'init_worker' method for collective mode.")
-
-    def run_worker(self, main_programs=None, scopes=None):
-        logging.warn(
-            "You should not call 'run_worker' method for collective mode.")
-
-    def init_server(self, model_dir=None):
-        logging.warn(
-            "You should not call 'init_server' method for collective mode.")
-
-    def run_server(self):
-        logging.warn(
-            "You should not call 'run_server' method for collective mode.")
-
-    def stop_worker(self):
-        logging.warn(
-            "You should not call 'stop_worker' method for collective mode.")
-
-    def distributed_optimizer(self, optimizer, strategy=None):
-        self._optimizer = \
-            CollectiveOptimizer(optimizer, strategy)
-        return self._optimizer
-
-    def save_inference_model(self,
-                             executor,
-                             dirname,
-                             feeded_var_names=None,
-                             target_vars=None,
-                             main_program=None,
-                             export_for_deployment=True):
-        io.save_inference_model(dirname, feeded_var_names, target_vars,
-                                executor, main_program, None, None,
-                                export_for_deployment)
-
-    def save_persistables(self, executor, dirname, main_program=None):
-        io.save_persistables(executor, dirname, main_program, None)
-
-
-fleet = Collective()
-
-
-class DistributedStrategy(fluid.BuildStrategy):
-    """
-    Init function of DistributedStrategy
-    """
-
-    def __init__(self):
-        super(DistributedStrategy, self).__init__()
-        self.use_local_sgd = False
-        self.use_dist_fc = False
-
-        self.dist_fc_config = None  # DistFCConfig
-        self.mode = "nccl2"  # or collective
-        self.collective_mode = None  # local_sgd or grad_allreduce
-        self.nccl_comm_num = 1
-        self.forward_recompute = False
-        self.recompute_checkpoints = []
-
-        self.exec_strategy = fluid.ExecutionStrategy()
-
-        # configurations below are used for unit test
-        self._ut4grad_allreduce = False
-
-
-class CollectiveOpBasedOptimizer(DistributedOptimizer):
-    """
-    Collective Operator Base Class For Distributed Optimizer
-    The class is invisible to a user
-    """
-
-    def __init__(self, optimizer, strategy=None):
-        assert isinstance(
-            strategy,
-            DistributedStrategy), "strategy must be DistributedStrategy"
-        super(CollectiveOpBasedOptimizer, self).__init__(optimizer, strategy)
-
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None):
-        return self._optimizer.backward(loss, startup_program, parameter_list,
-                                        no_grad_set, callbacks)
-
-    def apply_gradients(self, params_grads):
-        return self._optimizer.apply_gradients(params_grads)
-
-
-class CollectiveOptimizer(DistributedOptimizer):
-    """
-    DistributedOptimizer is a wrapper for paddle.fluid.optimizer
-    A user should pass a paddle.fluid.optimizer to DistributedOptimizer
-    minimize() function is implemented.
-    DistributedOptimizer is the starting point for a user who wants to
-    run distributed training. The optimized information will be stored in
-    Fleet() instance who holds the global information about current distributed
-    training.
-    """
-
-    def __init__(self, optimizer, strategy=DistributedStrategy()):
-        super(CollectiveOptimizer, self).__init__(optimizer, strategy)
-        if strategy.forward_recompute:
-            self.forward_recompute = True
-            self.recompute_checkpoints = strategy.recompute_checkpoints
-        else:
-            self.forward_recompute = False
-        self.print_config = False
-
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None):
-        return self._optimizer.backward(loss, startup_program, parameter_list,
-                                        no_grad_set, callbacks)
-
-    def apply_gradients(self, params_grads):
-        return self._optimizer.apply_gradients(params_grads)
-
-    def _check_condition(self, name, **kwargs):
-        for k, v in six.iteritems(kwargs):
-            if v is True:
-                assert False, "you can't use %s and %s together" % (name, k)
-
-    def _check_collective_mode(self, main_program, optimizer, strategy):
-        """
-        Check the conflict condtions.
-        """
-        if strategy.use_local_sgd:
-            strategy.mode = "collective"
-            strategy.collective_mode = "local_sgd"
-            self._check_condition(
-                "use_local_sgd",
-                use_dgc=main_program._enable_dgc,
-                use_dist_fc=strategy.use_dist_fc,
-                use_lamb=main_program._use_lamb)
-
-        if strategy.use_dist_fc:
-            self._check_condition(
-                "use_dist_fc",
-                use_dgc=main_program._enable_dgc,
-                use_local_sgd=strategy.use_local_sgd,
-                use_lamb=main_program._use_lamb)
-            assert strategy.dist_fc_config is not None, "DistributedStrategy.dist_fc_config should be set"
-
-        if strategy._ut4grad_allreduce:
-            strategy.mode = "collective"
-            strategy.collective_mode = "grad_allreduce"
-            self._check_condition(
-                "_ut4grad_allreduce",
-                use_dgc=main_program._enable_dgc,
-                use_lamb=main_program._use_lamb)
-
-        if self._strategy.collective_mode=="local_sgd" \
-                or self._strategy.collective_mode == "grad_allreduce":
-            assert self._strategy.mode == "collective", \
-                "local_sgd and grad_allreduce can be used under collective mode"
-
-    def _transpile(self, startup_program, main_program):
-        """
-        Transpile the programs to distributed programs. And add the variables.
-        """
-        worker_endpoints = fleet.worker_endpoints()
-        trainer_id = fleet.worker_index()
-        current_endpoint = fleet.worker_endpoints()[trainer_id]
-        worker_endpoints_env = ','.join(worker_endpoints)
-        trainers_num = fleet.worker_num()
-
-        if self.print_config:
-            print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
-                  trainer_id:{}".format(worker_endpoints, trainers_num,
-                                        current_endpoint, trainer_id))
-
-        # call transpiler
-        config = dist_transpiler.DistributeTranspilerConfig()
-        config.mode = self._strategy.mode
-        config.collective_mode = self._strategy.collective_mode
-
-        config.nccl_comm_num = self._strategy.nccl_comm_num
-        config.use_hierarchical_allreduce = self._strategy.use_hierarchical_allreduce
-        config.hierarchical_allreduce_inter_nranks = self._strategy.hierarchical_allreduce_inter_nranks
-
-        t = dist_transpiler.DistributeTranspiler(config=config)
-        t.transpile(
-            trainer_id=trainer_id,
-            trainers=worker_endpoints_env,
-            startup_program=startup_program,
-            program=main_program,
-            current_endpoint=current_endpoint)
-
-    def _get_node_ips_from_endpoints(self, endpoints):
-        ss = set()
-        ips = []
-        for ep in endpoints:
-            ip = ep.split(":")[0].strip()
-            if ip not in ss:
-                ss.add(ip)
-                ips.append(ip)
-            else:
-                continue
-
-        return ips
-
-    def _node_num(self):
-        worker_endpoints = fleet.worker_endpoints()
-        current_endpoint = fleet.worker_endpoints()[fleet.worker_index()]
-        worker_endpoints_env = ','.join(worker_endpoints)
-
-        node_ips = self._get_node_ips_from_endpoints(worker_endpoints)
-        node_ip = current_endpoint.split(":")[0].strip()
-
-        node_num = len(node_ips)
-
-        return node_num
-
-    def _try_to_compile(self, startup_program, main_program):
-        node_num = self._node_num()
-        assert node_num >= 1, "nccl2 node_num must >= 1, now:{}" % node_num
-
-        self._strategy.fuse_all_reduce_ops = True
-        exec_strategy = self._strategy.exec_strategy
-
-        if node_num <= 1:
-            if self._strategy.nccl_comm_num > 1:
-                logging.warn("set nccl_comm_num=1 since you only have 1 node.")
-            self._strategy.nccl_comm_num = 1
-
-            if self._strategy.use_hierarchical_allreduce:
-                logging.warn(
-                    "set use_hierarchical_allreduce=False since you only have 1 node."
-                )
-            self._strategy.use_hierarchical_allreduce = False
-
-        sync_allreduce = os.getenv("FLAGS_sync_nccl_allreduce")
-        if sync_allreduce is None or sync_allreduce == "1":
-            exec_strategy.num_threads = self._strategy.nccl_comm_num + 1
-            if self._strategy.use_hierarchical_allreduce:
-                exec_strategy.num_threads = 2 * self._strategy.nccl_comm_num + 1
-            if exec_strategy.num_threads > 4:
-                logging.warn(
-                    "if you use use_hierarchical_allreduce or "
-                    "with multi nccl comm, please export FLAGS_sync_nccl_allreduce = 0"
-                )
-
-        if self.print_config:
-            print("node_num:", node_num, "num_threads:",
-                  exec_strategy.num_threads, "use_hierarchical_allreduce:",
-                  self._strategy.use_hierarchical_allreduce, "nccl_comm_num:",
-                  self._strategy.nccl_comm_num, "FLAGS_sync_nccl_allreduce:",
-                  sync_allreduce)
-
-        self._transpile(startup_program, main_program)
-
-        if self._strategy.mode == "collective":
-            return main_program
-
-        self._strategy.num_trainers = fleet.worker_num()
-        self._strategy.trainer_id = fleet.worker_index()
-        self._strategy.trainers_endpoints = fleet.worker_endpoints()
-        self._strategy.enable_backward_optimizer_op_deps = True
-
-        self._compiled_program = compiler.CompiledProgram(main_program)
-
-        self._compiled_program.with_data_parallel(
-            loss_name=self._loss.name,
-            build_strategy=self._strategy,
-            exec_strategy=self._strategy.exec_strategy,
-            share_vars_from=None)
-
-        return self._compiled_program
-
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        """
-        minimize a program through loss
-        Args:
-            loss (Variable|Variable List): loss variable or loss variable list to run optimization.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-        Returns:
-            tuple: (optimize_ops, params_grads) which are, list of operators appended;
-            and list of (param, grad) Variables pair for optimization.
-        Note that in parameter server mode, a worker will not get anything about optimize_os
-        Because optmizer algorithms run on pserver side. We will make this usable in pserver
-        process, but currently the optimization part is written into Fleet(). A user does not
-        need to care about how to startup a pserver node.
-        """
-        main_program = loss.block.program
-        if startup_program is None:
-            startup_program = fluid.default_startup_program()
-        fleet.startup_program = startup_program
-
-        self._loss = loss
-
-        self._check_collective_mode(main_program, self._optimizer,
-                                    self._strategy)
-
-        if self.forward_recompute:
-            assert (isinstance(self.recompute_checkpoints, list) and
-                    len(self.recompute_checkpoints) > 0)
-            self._optimizer = \
-                fluid.optimizer.RecomputeOptimizer(self._optimizer)
-            self._optimizer._set_checkpoints(self.recompute_checkpoints)
-
-        optimize_ops, param_grads = self._optimizer.minimize(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
-
-        fleet._origin_program = main_program.clone(for_test=False)
-        fleet._transpiled_program = main_program
-        fleet.main_program = self._try_to_compile(startup_program, main_program)
-
-        return optimize_ops, param_grads
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py
deleted file mode 100644
index 33ed0ecf10ec4cad807ebb6df1590de65eeeab1e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
deleted file mode 100644
index 2cdbf2c280f2ee1ec04567260be4fbadbb23a048..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ /dev/null
@@ -1,389 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import warnings
-
-import paddle.fluid.io as io
-from paddle.fluid.communicator import Communicator
-from paddle.fluid.framework import default_main_program
-from paddle.fluid.framework import default_startup_program
-from paddle.fluid.framework import Program
-from paddle.fluid.compiler import CompiledProgram
-from paddle.fluid.executor import Executor
-from paddle.fluid.parallel_executor import ParallelExecutor
-from paddle.fluid.optimizer import Optimizer
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspiler as OriginTranspiler
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
-
-from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
-from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
-from paddle.fluid.incubate.fleet.base.fleet_base import Mode
-from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
-
-
-class DistributedTranspiler(Fleet):
-    """
-    A subclass for compatibility with fluid.transpiler.DistributeTranspiler.
-    """
-
-    def __init__(self):
-        super(DistributedTranspiler, self).__init__(Mode.TRANSPILER)
-        self._transpile_config = None
-        self._transpiler = None
-        self._origin_program = None
-        self.startup_program = None
-        self.main_program = None
-        self._communicator = None
-
-    def init_worker(self):
-        """
-        `init_worker` has many many functions to do before training,
-        first, wait for all parameter servers launch completely.
-        second, run executor to initialize startup program
-        third, wait for all worker initialize completely.
-
-        Returns:
-            None
-        """
-        # if MPISymetricRoleMaker is defined
-        # we suppose a user wants to submit job on mpi cluster
-        if isinstance(self._role_maker, MPISymetricRoleMaker):
-            # check whether server has been initialized
-            from paddle.fluid.transpiler.details.checkport import wait_server_ready
-            wait_server_ready(fleet.server_endpoints(to_string=False))
-
-        if not self._transpile_config.sync_mode:
-            self._communicator = Communicator(self.main_program)
-
-            if not self._communicator.is_running():
-                self._communicator.start()
-            else:
-                warnings.warn("communicator has been initialized, skip")
-
-    def init_server(self, model_dir=None):
-        """
-        `init_server` has many many functions to do before start pserver,
-        first, run executor to initialize startup program,
-        second, if the `model_dir` is not empty, it will load parameters from it for increment training.
-
-        Args:
-            model_dir(str): The directory path.
-
-        Returns:
-            None
-        """
-        if not self.startup_program:
-            raise ValueError(
-                "startup_program is None, need invoke DistributedOptimizer.minimize first"
-            )
-
-        self._executor.run(self.startup_program)
-
-        if model_dir:
-            if not os.path.isdir(model_dir):
-                raise ValueError("There is no directory named '%s'", model_dir)
-
-            io.load_persistables(self._executor, model_dir, self.main_program)
-
-    def run_server(self):
-        """
-        `run_server` execute executor to start pserver main program.
-
-        Returns:
-            None
-        """
-        if not self.main_program:
-            raise ValueError(
-                "main_program is None, need invoke DistributedOptimizer.minimize first"
-            )
-
-        self._executor.run(self.main_program)
-
-    def stop_worker(self):
-        """
-        Close this executor.
-
-        For the distributed training, this method would free the resource on PServers related to
-        the current Trainer.
-
-        Returns:
-            None
-        """
-        if not self._transpile_config.sync_mode and self._communicator.is_running(
-        ):
-            self._communicator.stop()
-        self._executor.close()
-
-        if isinstance(self._role_maker, MPISymetricRoleMaker):
-            self._role_maker._finalize()
-
-    def distributed_optimizer(self, optimizer, strategy=None):
-        """
-        Optimizer for distributed training.
-
-        For the distributed training, this method would rebuild a new instance of DistributedOptimizer.
-        Which has basic Optimizer function and special features for distributed training.
-
-        Args:
-            optimizer(Optimizer): The executor to run for init server.
-            strategy(DistributeTranspilerConfig): Extra properties for distributed optimizer.
-
-        Returns:
-            TranspilerOptimizer: subclass of DistributedOptimizer.
-        """
-
-        if not isinstance(optimizer, Optimizer):
-            raise ValueError("optimizer must be an instance of Optimizer")
-        self._optimizer = TranspilerOptimizer(optimizer, strategy)
-        return self._optimizer
-
-    def save_inference_model(self,
-                             executor,
-                             dirname,
-                             feeded_var_names,
-                             target_vars,
-                             main_program=None,
-                             export_for_deployment=True):
-        """
-        Prune the given `main_program` to build a new program especially for inference,
-        and then save it and all related parameters to given `dirname` by the `executor`.
-        """
-        if isinstance(executor, ParallelExecutor):
-            raise TypeError(
-                "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed"
-            )
-
-        if not isinstance(executor, Executor):
-            raise TypeError(
-                "in fleet.save_inference_model() function, executor must be as Executor type"
-            )
-
-        if main_program is not None:
-            if isinstance(main_program, CompiledProgram):
-                raise TypeError(
-                    "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed"
-                )
-            io.save_inference_model(dirname, feeded_var_names, target_vars,
-                                    executor, main_program, None, None,
-                                    export_for_deployment)
-        else:
-            io.save_inference_model(dirname, feeded_var_names, target_vars,
-                                    executor, self._origin_program, None, None,
-                                    export_for_deployment, True)
-
-            model_basename = "__model__"
-            model_filename = os.path.join(dirname, model_basename)
-
-            with open(model_filename, "rb") as f:
-                program_desc_str = f.read()
-
-            program = Program.parse_from_string(program_desc_str)
-            program._copy_dist_param_info_from(self.main_program)
-            self.save_persistables(executor, dirname, program)
-
-    def save_persistables(self, executor, dirname, main_program=None):
-        """
-        This function filters out all variables with `persistable==True` from the
-        give `main_program` and then saves these variables to the folder `dirname`
-        or file `filename`.
-
-        The `dirname` is used to specify the folder where persistable variables
-        are going to be saved. If you would like to save variables in separate
-        files, set `filename` None; if you would like to save all variables in a
-        single file, use `filename` to specify the file name.
-        """
-        if isinstance(executor, ParallelExecutor):
-            raise TypeError(
-                "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed"
-            )
-
-        if not isinstance(executor, Executor):
-            raise TypeError(
-                "in fleet.save_persistables() function, executor must be as Executor type"
-            )
-
-        if main_program is None:
-            main_program = self.main_program
-
-        if isinstance(main_program, CompiledProgram):
-            raise TypeError(
-                "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
-            )
-
-        if not main_program._is_distributed:
-            raise ValueError(
-                "main_program is for local, may not use fleet.save_persistables")
-
-        io.save_persistables(executor, dirname, main_program, None)
-
-    def _transpile(self, config):
-        if not isinstance(config, DistributeTranspilerConfig):
-            raise TypeError(
-                "config must be an instance of DistributeTranspilerConfig")
-
-        if not config.sync_mode:
-            config.runtime_split_send_recv = True
-
-        # _origin_program is a deep copy for default_main_program, for inference
-        self._origin_program = default_main_program().clone(for_test=False)
-
-        self._transpile_config = config
-        self._transpiler = OriginTranspiler(config)
-
-        if self.is_worker():
-            self._transpiler.transpile(
-                trainer_id=fleet.worker_index(),
-                pservers=fleet.server_endpoints(to_string=True),
-                trainers=fleet.worker_num(),
-                sync_mode=config.sync_mode)
-
-            if isinstance(self._role_maker, MPISymetricRoleMaker):
-                config.wait_port = False
-
-            self.main_program = self._transpiler.get_trainer_program(
-                wait_port=config.wait_port)
-            self.startup_program = default_startup_program()
-        else:
-            self._transpiler.transpile(
-                trainer_id=fleet.worker_index(),
-                pservers=fleet.server_endpoints(to_string=True),
-                trainers=fleet.worker_num(),
-                sync_mode=config.sync_mode,
-                current_endpoint=self.server_endpoints()[self.server_index()])
-            self.main_program, self.startup_program = \
-                self._transpiler.get_pserver_programs(self.server_endpoints()[self.server_index()])
-
-
-fleet = DistributedTranspiler()
-
-
-class TranspilerOptimizer(DistributedOptimizer):
-    """
-    DistributedOptimizer is a wrapper for paddle.fluid.optimizer
-    A user should pass a paddle.fluid.optimizer to DistributedOptimizer
-    minimize() function is implemented.
-    DistributedOptimizer is the starting point for a user who wants to
-    run distributed training. The optimized information will be stored in
-    Fleet() instance who holds the global information about current distributed
-    training.
-
-    Args:
-        optimizer(Optimizer): subclass of Optimizer.
-        strategy(DistributeTranspilerConfig): instance of DistributeTranspilerConfig.
-
-    Returns:
-        None
-    """
-
-    def __init__(self, optimizer, strategy=None):
-        super(TranspilerOptimizer, self).__init__(optimizer, strategy)
-
-        if strategy:
-            if not isinstance(strategy, DistributeTranspilerConfig):
-                raise TypeError(
-                    "In {} mode, strategy must be an instance of DistributeTranspilerConfig".
-                    format(fleet._mode))
-            else:
-                self._strategy = strategy
-        else:
-            self._strategy = DistributeTranspilerConfig()
-
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None):
-        """
-        First part of `minimize`, do auto-diff to append backward ops for
-        the current program.
-
-        Args:
-            loss (Variable): loss variable to run optimizations.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-            callbacks (list|None): list of callables to run when appending backward
-                operator for one parameter.
-
-        Return:
-            list: list of (param, grad) pair, grad is the output of backward.
-
-        Examples:
-            See examples in `apply_gradients`.
-        """
-        return self._optimizer.backward(loss, startup_program, parameter_list,
-                                        no_grad_set, callbacks)
-
-    def apply_gradients(self, params_grads):
-        """
-        Second part of `minimize`, appending optimization operators for
-        given `params_grads` pairs.
-
-        Args:
-            params_grads (list): list of (param, grad) pair to do optimization.
-
-        Returns:
-            list: A list of operators appended to the current program.
-
-        Examples:
-            .. code-block:: python
-
-                loss = network()
-                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-                params_grads = optimizer.backward(loss)
-                # you may append operations for params_grads here
-                # ...
-                optimizer.apply_gradients(params_grads)
-        """
-        return self._optimizer.apply_gradients(params_grads)
-
-    def minimize(self,
-                 loss,
-                 scopes=None,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        """
-        Add operations to minimize `loss` by updating `parameter_list`.
-
-        This method combines interface `backward()` and
-        `apply_gradients()` into one.
-
-        Args:
-            loss (Variable): loss variable to run optimizations.
-            scopes (None): TranspilerOptimizer doesn't need scope parameter.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-
-        Returns:
-            tuple: (optimize_ops, params_grads) which are, list of operators appended;
-            and list of (param, grad) Variables pair for optimization.
-        """
-        if isinstance(loss, list):
-            raise TypeError(
-                "DistributedTranspiler's minimize can not accept loss with list")
-
-        if isinstance(startup_program, list):
-            raise TypeError(
-                "DistributedTranspiler's minimize can not accept program with list"
-            )
-
-        optimize_ops, params_grads = self._optimizer.minimize(
-            loss, startup_program, parameter_list, no_grad_set)
-        fleet._transpile(config=self._strategy)
-        return optimize_ops, params_grads
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
deleted file mode 100755
index 17a761bf0523ab39bf4846169214ad1bfad5d186..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
+++ /dev/null
@@ -1,606 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-import os
-import sys
-from .optimizer_factory import *
-from google.protobuf import text_format
-import paddle.fluid as fluid
-from paddle.fluid.framework import Program
-
-from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
-from paddle.fluid.incubate.fleet.base.fleet_base import Mode
-from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
-from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
-
-
-class PSLib(Fleet):
-    def __init__(self):
-        super(PSLib, self).__init__(Mode.PSLIB)
-        self._opt_info = None
-        self._local_ip = 0
-        self._fleet_ptr = None
-        self._main_programs = []
-        self._scopes = []
-        self._client2client_request_timeout_ms = 500000
-        self._client2client_connect_timeout_ms = 10000
-        self._client2client_max_retry = 3
-
-    def init(self, role_maker=None):
-        super(PSLib, self).init(MPISymetricRoleMaker())
-        self._fleet_ptr = fluid.core.Fleet()
-
-    def _set_client_communication_config(self, request_timeout_ms,
-                                         connect_timeout_ms, max_retry):
-        self._client2client_request_timeout_ms = request_timeout_ms
-        self._client2client_connect_timeout_ms = connect_timeout_ms
-        self._client2client_max_retry = max_retry
-
-    def init_worker(self):
-        """
-        init_worker(): will be called by user. When a user knows current process is_server(), he/she
-                    should call init_worker() to initialize global information about worker and connect
-                    worker with pserver. You should run startup program before init_worker.
-
-        Args:
-            executor(Executor): The executor to run for init server.
-            programs(Program|None): The program that need to run.
-        """
-
-        if len(self._main_programs) == 0:
-            raise ValueError(
-                "You should run DistributedOptimizer.minimize() first")
-
-        if self._opt_info:
-            if "fleet_desc" in self._opt_info:
-                self._dist_desc_str = text_format.MessageToString(
-                    self._opt_info["fleet_desc"])
-                self._dist_desc = self._opt_info["fleet_desc"]
-            else:
-                raise Exception(
-                    "You should run DistributedOptimizer.minimize() first")
-            # barrier_all for init_server, wait for server starts
-            self._role_maker._barrier_all()
-            self.all_ips_ = self._role_maker._all_gather(self._local_ip)
-            self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
-                                        self._role_maker._get_size(),
-                                        self._role_maker._get_rank())
-            # barrier_all for init_worker
-            self._role_maker._barrier_all()
-            # prepare for client to client communication
-            info = self._fleet_ptr.get_clients_info()
-            all_info = self._role_maker._worker_gather(info[0])
-            self._fleet_ptr.gather_clients(all_info)
-            self._fleet_ptr.set_client2client_config(
-                self._client2client_request_timeout_ms,
-                self._client2client_connect_timeout_ms,
-                self._client2client_max_retry)
-            self._fleet_ptr.create_client2client_connection()
-            # barrier for init model
-            self._role_maker._barrier_worker()
-            if self._role_maker.is_first_worker():
-                tables = self._dist_desc.trainer_param[0].dense_table
-                for prog, scope in zip(self._main_programs, self._scopes):
-                    prog_id = str(id(prog))
-                    prog_conf = self._opt_info['program_configs'][prog_id]
-                    prog_tables = {}
-                    for key in prog_conf:
-                        if "dense" not in key:
-                            continue
-                        for table_id in prog_conf[key]:
-                            prog_tables[int(table_id)] = 0
-                    for table in tables:
-                        if int(table.table_id) not in prog_tables:
-                            continue
-                        var_name_list = []
-                        for i in range(0, len(table.dense_variable_name)):
-                            var_name = table.dense_variable_name[i]
-                            if scope.find_var(var_name) is None:
-                                raise ValueError(
-                                    "var " + var_name + " not found in scope, "
-                                    + "you should run startup program first")
-                            var_name_list.append(var_name)
-                        self._fleet_ptr.init_model(scope,
-                                                   int(table.table_id),
-                                                   var_name_list)
-            # barrier for init model done
-            self._role_maker._barrier_worker()
-        else:
-            raise NameError(
-                "You should run DistributedOptimizer.minimize() first")
-
-    def init_server(self, model_dir=None, **kwargs):
-        """
-        init_server() will be called by user. It will load model from model_dir.
-
-        Args:
-            model_dir(str): load model path, can be local or hdfs/afs path.
-            kwargs: user-defined attributes, currently support following:
-                model(int): load model mode.
-                            0 is for load whole model,
-                            1 is for load delta model (load diff),
-                            default is 0.
-
-        Example:
-            >>> fleet.init_server("/you/path/to/model", mode = 0)
-
-        """
-        mode = kwargs.get("mode", 0)
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.load_model(model_dir, mode)
-        self._role_maker._barrier_worker()
-
-    def run_server(self):
-        """
-         init_pserver(): will be called by user. When a user knows current process is_worker(), he/she
-             should call init_pserver() to initialize global information about parameter server
-        """
-        if self._opt_info:
-            if "fleet_desc" in self._opt_info:
-                self._dist_desc_str = text_format.MessageToString(
-                    self._opt_info["fleet_desc"])
-                self._dist_desc = self._opt_info["fleet_desc"]
-            else:
-                raise Exception(
-                    "You should run DistributedOptimizer.minimize() first")
-            self._fleet_ptr.init_server(self._dist_desc_str,
-                                        self._role_maker._get_rank())
-            self._local_ip = self._fleet_ptr.run_server()
-
-            # barrier_all for init_server
-            self._role_maker._barrier_all()
-            self.all_ips_ = self._role_maker._all_gather(self._local_ip)
-
-            self._fleet_ptr.gather_servers(self.all_ips_,
-                                           self._role_maker._get_size())
-            # barrier_all for init_worker, wait all workers start
-            self._role_maker._barrier_all()
-        else:
-            raise Exception(
-                "You should run DistributedOptimizer.minimize() first")
-
-    def stop_worker(self):
-        """
-        stop(): will be called after a user finishes his/her training task. Fleet instance will be
-            destroyed when stop() is called.
-        """
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.stop_server()
-        self._role_maker._barrier_worker()
-        self._role_maker._barrier_all()
-        self._role_maker._finalize()
-
-    def distributed_optimizer(self, optimizer, strategy={}):
-        """
-        distributed_optimizer
-
-        Args:
-            optimizer(Optimizer): optimizer
-            strategy(dict): strategy
-
-        Examples:
-            .. code-block:: python
-
-              fleet.distributed_optimizer(optimizer)
-
-        Returns:
-            optimizer(DownpourOptimizer): downpour optimizer
-
-        """
-        self._optimizer = DownpourOptimizer(optimizer, strategy)
-        return self._optimizer
-
-    def save_inference_model(self,
-                             executor,
-                             dirname,
-                             feeded_var_names=None,
-                             target_vars=None,
-                             main_program=None,
-                             export_for_deployment=True):
-        """
-        save pserver model called from a worker
-
-        Args:
-            executor(Executor): fluid executor
-            dirname(str): save model path
-            feeded_var_names(list): default None
-            target_vars(list): default None
-            main_program(Program): default None
-            export_for_deployment(bool): default None
-
-        Examples:
-            .. code-block:: python
-
-              fleet.save_inference_model(dirname="hdfs:/my/path")
-
-        """
-        self._fleet_ptr.save_model(dirname)
-
-    def save_persistables(self, executor, dirname, main_program=None, **kwargs):
-        """
-        save presistable parameters,
-        when using fleet, it will save sparse and dense feature
-
-        Args:
-            executor(Executor): fluid executor
-            dirname(str): save path. It can be hdfs/afs path or local path
-            main_program(Program): fluid program, default None
-            kwargs: use define property, current support following
-                mode(int): 0 means save all pserver model,
-                           1 means save delta pserver model (save diff),
-                           2 means save xbox base,
-                           3 means save batch model.
-
-        Example:
-            >>> fleet.save_persistables(dirname="/you/path/to/model", mode = 0)
-
-        """
-        mode = kwargs.get("mode", 0)
-        self._fleet_ptr.client_flush()
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.save_model(dirname, mode)
-        self._role_maker._barrier_worker()
-
-    def save_cache_model(self, executor, dirname, main_program=None, **kwargs):
-        """
-        save sparse cache table,
-        when using fleet, it will save sparse cache table
-
-        Args:
-            dirname(str): save path. It can be hdfs/afs path or local path
-            main_program(Program): fluid program, default None
-            kwargs: use define property, current support following
-                mode(int): define for feature extension in the future,
-                           currently no use, will pass a default value 0 
-
-        Example:
-            .. code-block:: python
-            >>> fleet.save_cache_model(None, dirname="/you/path/to/model", mode = 0)
-
-        """
-        mode = kwargs.get("mode", 0)
-        self._fleet_ptr.client_flush()
-        self._role_maker._barrier_worker()
-        cache_threshold = 0.0
-
-        if self._role_maker.is_first_worker():
-            cache_threshold = self._fleet_ptr.get_cache_threshold()
-        #check cache threshold right or not
-        self._role_maker._barrier_worker()
-
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.cache_shuffle(0, dirname, mode, cache_threshold)
-
-        self._role_maker._barrier_worker()
-
-        feasign_num = -1
-        if self._role_maker.is_first_worker():
-            feasign_num = self._fleet_ptr.save_cache(0, dirname, mode)
-
-        self._role_maker._barrier_worker()
-        return feasign_num
-
-    def shrink_sparse_table(self):
-        """
-        shrink cvm of all sparse embedding in pserver, the decay rate
-        is defined as "show_click_decay_rate" in fleet_desc.prototxt
-
-        Example:
-            >>> fleet.shrink_sparse_table()
-
-        """
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            for i in self._opt_info["fleet_desc"].trainer_param[0].sparse_table:
-                self._fleet_ptr.shrink_sparse_table(i.table_id)
-        self._role_maker._barrier_worker()
-
-    def shrink_dense_table(self, decay, emb_dim=11, scope=None, table_id=None):
-        """
-        shrink batch_sum in pserver by multiplying by decay
-
-        Args:
-            decay(float): the decay rate, usually range in (0, 1)
-            emb_dim(int): one element's length in datanorm layer
-            scope(Scope): Scope object, default is fluid.global_scope()
-            table_id(int): table id of shrinking dense table. None means shrink all,
-                           you should specify it when using multiple scopes,
-                           default is None.
-
-        Example:
-            >>> fleet.shrink_dense_table(0.98, 11, myscope1, 1)
-            >>> fleet.shrink_dense_table(0.98, 11, myscope1, 2)
-            >>> fleet.shrink_dense_table(0.98, 11, myscope2, 3)
-
-        """
-        if scope is None:
-            scope = fluid.global_scope()
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            for i in self._opt_info["fleet_desc"].trainer_param[0].dense_table:
-                if table_id is not None and table_id != i.table_id:
-                    continue
-                var_list = [var for var in i.dense_variable_name]
-                skip = False
-                for var in var_list:
-                    if scope.find_var(var) is None:
-                        skip = True
-                        break
-                if skip:
-                    continue
-                self._fleet_ptr.shrink_dense_table(i.table_id, scope, var_list,
-                                                   decay, emb_dim)
-        self._role_maker._barrier_worker()
-
-    def clear_model(self):
-        """
-        clear_model() will be called by user. It will clear sparse model.
-
-        Examples:
-            .. code-block:: python
-
-              fleet.clear_model()
-
-        """
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.clear_model()
-        self._role_maker._barrier_worker()
-
-    def clear_model(self):
-        """
-        clear_model() will be called by user. It will clear sparse model.
-
-        Examples:
-            .. code-block:: python
-
-              fleet.clear_model()
-
-        """
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.clear_model()
-        self._role_maker._barrier_worker()
-
-    def load_one_table(self, table_id, model_path, **kwargs):
-        """
-        load pslib model for one table or load params from paddle model
-
-        Args:
-            table_id(int): load table id
-            model_path(str): load model path, can be local or hdfs/afs path
-            kwargs(dict): user defined params, currently support following:
-                only for load pslib model for one table:
-                    mode(int): load model mode. 0 is for load whole model, 1 is
-                               for load delta model (load diff), default is 0.
-                only for load params from paddle model:
-                    scope(Scope): Scope object
-                    model_proto_file(str): path of program desc proto binary
-                                           file, can be local or hdfs/afs file
-                    var_names(list): var name list
-                    load_combine(bool): load from a file or splited param files
-                                        default False.
-
-        Examples:
-            .. code-block:: python
-
-              # load pslib model for one table
-              fleet.load_one_table(0, "hdfs:/my_fleet_model/20190714/0/")
-              fleet.load_one_table(1, "hdfs:/xx/xxx", mode = 0)
-
-              # load params from paddle model
-              fleet.load_one_table(2, "hdfs:/my_paddle_model/",
-                                   scope = my_scope,
-                                   model_proto_file = "./my_program.bin",
-                                   load_combine = False)
-
-              # below is how to save proto binary file
-              with open("my_program.bin", "wb") as fout:
-                  my_program = fluid.default_main_program()
-                  fout.write(my_program.desc.serialize_to_string())
-
-        """
-        self._role_maker._barrier_worker()
-        mode = kwargs.get("mode", 0)
-        scope = kwargs.get("scope", None)
-        model_proto_file = kwargs.get("model_proto_file", None)
-        var_names = kwargs.get("var_names", None)
-        load_combine = kwargs.get("load_combine", False)
-        self._role_maker._barrier_worker()
-        if scope is not None and model_proto_file is not None:
-            self._load_one_table_from_paddle_model(scope, table_id, model_path,
-                                                   model_proto_file, var_names,
-                                                   load_combine)
-        elif self._role_maker.is_first_worker():
-            self._fleet_ptr.load_model_one_table(table_id, model_path, mode)
-        self._role_maker._barrier_worker()
-
-    def _load_one_table_from_paddle_model(self,
-                                          scope,
-                                          table_id,
-                                          model_path,
-                                          model_proto_file,
-                                          var_names=None,
-                                          load_combine=False):
-        """
-        load params from paddle model, and push params to pserver
-
-        Args:
-            scope(Scope): Scope object
-            table_id(int): the id of table to load
-            model_path(str): path of paddle model, can be local or hdfs/afs file
-            model_proto_file(str): path of program desc proto binary file,
-                                   can be local or hdfs/afs file
-            var_names(list): load var names
-            load_combine(bool): load from a file or splited param files
-
-        """
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            # get fs config from fleet_desc
-            fs_name = self._opt_info["fleet_desc"].fs_client_param.uri
-            fs_ugi = self._opt_info["fleet_desc"].fs_client_param.user + "," + \
-                     self._opt_info["fleet_desc"].fs_client_param.passwd
-            hadoop_bin = self._opt_info["fleet_desc"].fs_client_param.hadoop_bin
-            # download model_path if it's hdfs/afs
-            if model_path.startswith("hdfs:") or model_path.startswith("afs:"):
-                dest = "./model_for_load_table_%s" % table_id
-                cmd = hadoop_bin + " fs -D fs.default.name=" + fs_name + \
-                      " -D hadoop.job.ugi=" + fs_ugi + " -get " + model_path + \
-                      " " + dest
-                ret = os.system(cmd)
-                if ret != 0:
-                    raise RuntimeError("download model failed")
-                model_path = dest
-            # download model_proto_file if it's hdfs/afs
-            if model_proto_file.startswith("hdfs:") or \
-                    model_proto_file.startswith("afs:"):
-                dest = "./model_proto_file_for_load_table_%s" % table_id
-                cmd = hadoop_bin + " fs -D fs.default.name=" + fs_name + \
-                      " -D hadoop.job.ugi=" + fs_ugi + " -get " + \
-                      model_proto_file + " " + dest
-                ret = os.system(cmd)
-                if ret != 0:
-                    raise RuntimeError("download model proto file failed")
-                model_proto_file = dest
-            for i in self._opt_info["fleet_desc"].trainer_param[0].dense_table:
-                if table_id is not None and table_id != i.table_id:
-                    continue
-                table_var_names = [var for var in i.dense_variable_name]
-                skip = False
-                for var in table_var_names:
-                    if scope.find_var(var) is None:
-                        skip = True
-                        break
-                if skip:
-                    continue
-                self._fleet_ptr.load_from_paddle_model(
-                    scope, table_id, var_names, model_path, model_proto_file,
-                    table_var_names, load_combine)
-        self._role_maker._barrier_worker()
-
-    def _set_opt_info(self, opt_info):
-        """
-        this function saves the result from DistributedOptimizer.minimize()
-        """
-        self._opt_info = opt_info
-
-
-fleet = PSLib()
-
-
-class DownpourOptimizer(DistributedOptimizer):
-    """
-    DistributedOptimizer is a wrapper for paddle.fluid.optimizer
-    A user should pass a paddle.fluid.optimizer to DistributedOptimizer
-    minimize() function is implemented.
-    DistributedOptimizer is the starting point for a user who wants to
-    run distributed training. The optimized information will be stored in
-    Fleet() instance who holds the global information about current distributed
-    training.
-
-    Args:
-        optimizer(Optimizer): subclass of Optimizer.
-        strategy(any): config for DownpourOptimizer.
-
-    Returns:
-        None
-    """
-
-    def __init__(self, optimizer, strategy=None):
-        super(DownpourOptimizer, self).__init__(optimizer, strategy)
-
-        self._optimizer = optimizer
-        self._optimizer_name = "Distributed%s" % optimizer.type.capitalize()
-        if optimizer.type != "adam":
-            print("Currently, distributed optimizer only support Adam"
-                  "Will config built-in adam for you."
-                  "We will support more functions in DistributedOptimizer",
-                  sys.stderr)
-            self._optimizer_name = "DistributedAdam"
-
-        self._distributed_optimizer = globals()[self._optimizer_name](optimizer)
-
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None):
-        """
-        Currently, backward function can not be called through DistributedOptimizer
-        """
-        raise NotImplementedError()
-
-    def apply_gradients(self, params_grads):
-        """
-        Currently, apply_gradients function can not be called through DistributedOptimizer
-        """
-        raise NotImplementedError()
-
-    def minimize(self,
-                 losses,
-                 scopes=None,
-                 startup_programs=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        """
-        minimize a program through loss, loss can be a list in DistributedOptimizer.
-        Note that in parameter server mode, a worker will not get anything about optimize_os
-        Because optmizer algorithms run on pserver side. We will make this usable in pserver
-        process, but currently the optimization part is written into Fleet(). A user does not
-        need to care about how to startup a pserver node.
-
-        Args:
-            losses (Variable|Variable List): loss variable or loss variable list to run optimization.
-            scopes (Scope| Scope List): scope instance.
-            startup_programs (Program|Program List): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-
-        Returns:
-            tuple: (optimize_ops, params_grads) which are, list of operators appended;
-            and list of (param, grad) Variables pair for optimization.
-        """
-
-        if not isinstance(losses, list):
-            losses = [losses]
-
-        optimize_ops, param_grads, opt_info = \
-                      self._distributed_optimizer._minimize(
-                          losses,
-                          startup_programs,
-                          parameter_list,
-                          no_grad_set,
-                          self._strategy)
-        opt_info["mpi_rank"] = fleet._role_maker._get_rank()
-        fleet._set_opt_info(opt_info)
-
-        programs = [loss.block.program for loss in losses]
-
-        if scopes is None:
-            scopes = [fluid.global_scope()] * len(programs)
-
-        if len(scopes) != len(programs):
-            raise ValueError(
-                "You should make sure len(scopes) == len(programs) or set scopes None"
-            )
-
-        fleet._main_programs = programs
-        fleet._scopes = scopes
-
-        return [optimize_ops, param_grads]
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py
deleted file mode 100755
index 136b337d2cd6516005d1ccac11e5d578233e74e9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py
+++ /dev/null
@@ -1,409 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-from . import ps_pb2 as pslib
-
-
-class Server(object):
-    """
-        A Server basic class.
-    """
-
-    def __init__(self):
-        pass
-
-
-class Worker(object):
-    """
-        A Worker basic class.
-    """
-
-    def __init__(self):
-        pass
-
-
-class DownpourServer(Server):
-    """
-        DownpourServer class is used to generate server program_desc
-        Args:
-            server: it is pslib.ServerParameter() 
-        Examples:
-            server = DownpourServer()
-    """
-
-    def __init__(self):
-        self._server = pslib.ServerParameter()
-        self._server.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
-        self._server.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
-        self._server.downpour_server_param.service_param.service_class = "DownpourPsService"
-        self._server.downpour_server_param.service_param.start_server_port = 0
-        self._server.downpour_server_param.service_param.server_thread_num = 12
-
-    def add_sparse_table(self, table_id, strategy):
-        """
-        Args:
-            table_id(int): id of sparse params table
-            strategy(dict): the config dict.
-        Returns:
-            return None 
-        """
-
-        for table in self._server.downpour_server_param.downpour_table_param:
-            if table.table_id == table_id:
-                if table.type == pslib.PS_SPARSE_TABLE:
-                    return
-                else:
-                    raise ValueError("expect table %s type=%s, but actual type=%s" \
-                        %(table_id, pslib.PS_SPARSE_TABLE, table.type))
-        if strategy is None:
-            strategy = dict()
-        table = self._server.downpour_server_param.downpour_table_param.add()
-        table.table_id = table_id
-        table.type = pslib.PS_SPARSE_TABLE
-
-        support_sparse_key_list = ['sparse_table_class', 'sparse_compress_in_save', 'sparse_shard_num', \
-                    'sparse_accessor_class', 'sparse_learning_rate', 'sparse_initial_g2sum', 'sparse_initial_range', \
-                    'sparse_weight_bounds', 'sparse_embedx_dim', 'sparse_embedx_threshold', 'sparse_nonclk_coeff', \
-                    'sparse_click_coeff', 'sparse_base_threshold', 'sparse_delta_threshold', 'sparse_delta_keep_days', \
-                    'sparse_show_click_decay_rate', 'sparse_delete_threshold']
-
-        for key in strategy:
-            if key not in support_sparse_key_list:
-                raise ValueError("strategy key '%s' not support" % (key))
-
-        support_table_calss = ['DownpourSparseTable', 'DownpourSparseSSDTable']
-        if strategy.get('sparse_table_class') is not None:
-            table_class = strategy.get('sparse_table_class')
-            if table_class not in support_table_calss:
-                raise ValueError(
-                    "support sparse_table_class: [ 'DownpourSparseTable' , 'DownpourSparseSSDTable'], \
-                        but actual %s" % (table_class))
-        else:
-            table_class = 'DownpourSparseTable'
-
-        table.table_class = table_class
-
-        if table_class in support_table_calss:
-            table.compress_in_save = strategy.get('sparse_compress_in_save',
-                                                  True)
-            table.shard_num = strategy.get('sparse_shard_num', 1000)
-
-            support_accessor_class = [
-                'DownpourFeatureValueAccessor', 'DownpourCtrAccessor'
-            ]
-            if strategy.get('sparse_accessor_class') is not None:
-                accessor_class = strategy.get('sparse_accessor_class')
-                if accessor_class not in support_accessor_class:
-                    raise ValueError(
-                        "support sparse_accessor_class: ['DownpourFeatureValueAccessor', 'DownpourCtrAccessor'], \
-                            but actual %s" % (accessor_class))
-            else:
-                accessor_class = 'DownpourCtrAccessor'
-
-            table.accessor.accessor_class = accessor_class
-
-            if accessor_class == 'DownpourFeatureValueAccessor' or accessor_class == 'DownpourCtrAccessor':
-                table.accessor.sparse_sgd_param.learning_rate = strategy.get(
-                    'sparse_learning_rate', 0.05)
-                table.accessor.sparse_sgd_param.initial_g2sum = strategy.get(
-                    'sparse_initial_g2sum', 3)
-                table.accessor.sparse_sgd_param.initial_range = strategy.get(
-                    'sparse_initial_range', 1e-4)
-                if strategy.get('sparse_weight_bounds') is None:
-                    table.accessor.sparse_sgd_param.weight_bounds.extend(
-                        [-10, 10])
-                else:
-                    table.accessor.sparse_sgd_param.weight_bounds.extend(
-                        strategy.get('sparse_weight_bounds'))
-                table.accessor.embedx_dim = strategy.get('sparse_embedx_dim', 8)
-                table.accessor.embedx_threshold = strategy.get(
-                    'sparse_embedx_threshold', 10)
-                table.accessor.fea_dim = int(table.accessor.embedx_dim) + 3
-                table.accessor.downpour_accessor_param.nonclk_coeff = strategy.get(
-                    'sparse_nonclk_coeff', 0.1)
-                table.accessor.downpour_accessor_param.click_coeff = strategy.get(
-                    'sparse_click_coeff', 1)
-                table.accessor.downpour_accessor_param.base_threshold = strategy.get(
-                    'sparse_base_threshold', 1.5)
-                table.accessor.downpour_accessor_param.delta_threshold = strategy.get(
-                    'sparse_delta_threshold', 0.25)
-                table.accessor.downpour_accessor_param.delta_keep_days = strategy.get(
-                    'sparse_delta_keep_days', 16)
-                table.accessor.downpour_accessor_param.delete_after_unseen_days = strategy.get(
-                    'sparse_delete_after_unseen_days', 30)
-                table.accessor.downpour_accessor_param.show_click_decay_rate = strategy.get(
-                    'sparse_show_click_decay_rate', 0.98)
-                table.accessor.downpour_accessor_param.delete_threshold = strategy.get(
-                    'sparse_delete_threshold', 0.8)
-                table1 = table.accessor.table_accessor_save_param.add()
-                table1.param = 1
-                table1.converter = "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
-                table1.deconverter = "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
-                table2 = table.accessor.table_accessor_save_param.add()
-                table2.param = 2
-                table2.converter = "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
-                table2.deconverter = "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
-
-    def add_dense_table(self, table_id, param_var, grad_var, strategy,
-                        sparse_table_names):
-        """
-        Args:
-            table_id(int): id of sparse params table
-            strategy(dict): the dense config dict.
-        Returns:
-            return None 
-        """
-        fea_dim = 0
-        dense_param_vars = []
-        for p in param_var:
-            if p.name not in sparse_table_names:
-                dense_param_vars.append(p)
-
-        for param in dense_param_vars:
-            fea_dim += reduce(lambda x, y: x * y, param.shape, 1)
-
-        for table in self._server.downpour_server_param.downpour_table_param:
-            if table.table_id == table_id:
-                if table.type == pslib.PS_DENSE_TABLE:
-                    table.accessor.fea_dim = fea_dim
-                    return
-                else:
-                    raise ValueError("expect table %s type=%s, but actual type=%s" \
-                        %(table_id, pslib.PS_DENSE_TABLE, table.type))
-
-        if strategy is None:
-            strategy = dict()
-        table = self._server.downpour_server_param.downpour_table_param.add()
-        table.table_id = table_id
-        support_dense_key_list = ['dense_table_class', 'dense_compress_in_save', 'dense_accessor_class', \
-                'dense_optimizer', 'dense_learning_rate', 'dense_avg_decay', 'dense_ada_decay', \
-                'dense_ada_epsilon', 'dense_mom_decay', 'dense_naive_lr']
-
-        for key in strategy:
-            if key not in support_dense_key_list:
-                raise ValueError("strategy key '%s' not support" % (key))
-
-        table.table_class = strategy.get('dense_table_class',
-                                         "DownpourDenseTable")
-        table.type = pslib.PS_DENSE_TABLE
-        table.compress_in_save = strategy.get('dense_compress_in_save', True)
-        table.accessor.accessor_class = strategy.get(
-            'dense_accessor_class', "DownpourDenseValueAccessor")
-        table.accessor.dense_sgd_param.name = strategy.get('dense_optimizer',
-                                                           "adam")
-        table.accessor.dense_sgd_param.adam.learning_rate = strategy.get(
-            'dense_learning_rate', 5e-06)
-        table.accessor.dense_sgd_param.adam.avg_decay_rate = strategy.get(
-            'dense_avg_decay', 0.999993)
-        table.accessor.dense_sgd_param.adam.ada_decay_rate = strategy.get(
-            'dense_ada_decay', 0.9999)
-        table.accessor.dense_sgd_param.adam.ada_epsilon = strategy.get(
-            'dense_ada_epsilon', 1e-8)
-        table.accessor.dense_sgd_param.adam.mom_decay_rate = strategy.get(
-            'dense_mom_decay', 0.99)
-        table.accessor.dense_sgd_param.naive.learning_rate = strategy.get(
-            'dense_naive_lr', 0.0002)
-        table.accessor.fea_dim = fea_dim
-
-    def add_data_norm_table(self, table_id, learning_rate, param_var, grad_var,
-                            strategy, sparse_table_names):
-        """
-        Args:
-            table_id(int): id of datanorm table
-            strategy(dict): the datanorm config dict.
-        Returns:
-            return None 
-        """
-        fea_dim = 0
-        dense_param_vars = []
-        for p in param_var:
-            if p.name not in sparse_table_names:
-                dense_param_vars.append(p)
-
-        for param in dense_param_vars:
-            fea_dim += reduce(lambda x, y: x * y, param.shape, 1)
-
-        for table in self._server.downpour_server_param.downpour_table_param:
-            if table.table_id == table_id:
-                if table.type == pslib.PS_DENSE_TABLE:
-                    table.accessor.fea_dim = fea_dim
-                    return
-                else:
-                    raise ValueError("expect table %s type=%s, but actual type=%s" \
-                        %(table_id, pslib.PS_DENSE_TABLE, table.type))
-        if strategy is None:
-            strategy = dict()
-
-        support_datanorm_key_list = ['datanorm_table_class', 'datanorm_compress_in_save',\
-                'datanorm_accessor_class', 'datanorm_operation', 'datanorm_decay_rate']
-
-        for key in strategy:
-            if key not in support_datanorm_key_list:
-                raise ValueError("strategy key '%s' not support" % (key))
-
-        table = self._server.downpour_server_param.downpour_table_param.add()
-        table.table_id = table_id
-        table.table_class = strategy.get('datanorm_table_class',
-                                         "DownpourDenseDoubleTable")
-        table.type = pslib.PS_DENSE_TABLE
-        table.compress_in_save = strategy.get('datanorm_compress_in_save', True)
-        table.accessor.accessor_class = strategy.get(
-            'datanorm_accessor_class', "DownpourDenseValueDoubleAccessor")
-        table.accessor.dense_sgd_param.name = strategy.get('datanorm_operation',
-                                                           "summarydouble")
-        table.accessor.dense_sgd_param.summary.summary_decay_rate = strategy.get(
-            'datanorm_decay_rate', 0.999999)
-        table.accessor.fea_dim = fea_dim
-
-    def get_desc(self):
-        """
-        Return downpour server program_desc
-        """
-        return self._server
-
-
-class DownpourWorker(Worker):
-    """
-        DownpourWorker class is used to generate worker program_desc
-        Args:
-            window (int): push params frequency
-            worker: it is pslib.DownpourTrainerParameter 
-        Examples:
-            worker = DownpourWorker(1)
-    """
-
-    def __init__(self, window):
-        self.window = window
-        self._worker = pslib.DownpourTrainerParameter()
-
-    def add_sparse_table(self, table_id, slot_key_vars, slot_value_vars):
-        """
-        Args:
-            table_id(int): id of sparse params table
-            slot_key_vars(string): slot key id 
-            slot_value_var(string): slot key value after embedding
-        Returns:
-            return None 
-        """
-        for table in self._worker.sparse_table:
-            if table.table_id == table_id:
-                if [var.name for var in slot_key_vars
-                    ] == self._worker.sparse_table[table_id].slot_key:
-                    if [var.name for var in slot_value_vars
-                        ] == self._worker.sparse_table[table_id].slot_value:
-                        if [
-                                var.name + "@GRAD" for var in slot_value_vars
-                        ] == self._worker.sparse_table[table_id].slot_gradient:
-                            return
-                        else:
-                            raise ValueError(
-                                "sparse table %s slot_gradient error" %
-                                table_id)
-
-                    else:
-                        raise ValueError("sparse table %s slot_value error" %
-                                         table_id)
-                else:
-                    raise ValueError("sparse table %s slot_key error" %
-                                     table_id)
-
-        table = self._worker.sparse_table.add()
-        table.table_id = table_id
-        table.slot_key.extend([var.name for var in slot_key_vars])
-        table.slot_value.extend([var.name for var in slot_value_vars])
-        table.slot_gradient.extend(
-            [var.name + "@GRAD" for var in slot_value_vars])
-
-    def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars,
-                        dense_start_table_id, sparse_table_names):
-        """
-        Args:
-            table_id(int): id of sparse params table
-            learning_rate(float): the learning rate used to update parameters. \
-                Can be a float value
-            param_var(list): all dense param. it is a list.
-            grad_var(list): all dense grad parm it is a list.
-        Returns:
-            return None 
-        """
-        sparse_table_name_grad = []
-        for name in sparse_table_names:
-            sparse_table_name_grad.append(name + "@GRAD")
-
-        dense_param_name = []
-        for p in param_vars:
-            if p.name not in sparse_table_names:
-                dense_param_name.append(p.name)
-
-        dense_grad_name = []
-        for g in grad_vars:
-            if g.name not in sparse_table_name_grad:
-                dense_grad_name.append(g.name)
-
-        dense_param_name.sort()
-        dense_grad_name.sort()
-
-        for table in self._worker.dense_table:
-            if table.table_id == table_id:
-                desc_dense_param_name = list(self._worker.dense_table[
-                    table_id - dense_start_table_id].dense_variable_name)
-                desc_dense_param_name.sort()
-
-                if dense_param_name == desc_dense_param_name:
-                    desc_dense_grad_name = list(self._worker.dense_table[
-                        table_id - dense_start_table_id]
-                                                .dense_gradient_variable_name)
-                    desc_dense_grad_name.sort()
-                    if dense_grad_name == desc_dense_grad_name:
-                        return
-                    else:
-                        raise ValueError(
-                            "dense table %s dense_gradient_variable_name error"
-                            % table_id)
-                else:
-                    raise ValueError(
-                        "dense table %s dense_variable_name error" % table_id)
-
-        table = self._worker.dense_table.add()
-        table.table_id = table_id
-
-        def cmp_fc(x, y):
-            if x.startswith("fc_") and y.startswith("fc_"):
-                index_x = x.find('.')
-                index_y = y.find('.')
-                if index_x > 0 and index_y > 0:
-                    num_x = x[3:index_x]
-                    num_y = y[3:index_y]
-                    if num_x.isdigit() and num_y.isdigit():
-                        if int(num_x) < int(num_y):
-                            return -1
-                        if int(num_x) > int(num_y):
-                            return 1
-                        if x[index_x + 1] == 'w' and y[index_y + 1] == 'b':
-                            return -1
-                        if x[index_x + 1] == 'b' and y[index_y + 1] == 'w':
-                            return 1
-            if x < y:
-                return -1
-            else:
-                return 1
-
-        table.dense_variable_name.extend(sorted(dense_param_name, cmp_fc))
-        table.dense_gradient_variable_name.extend(
-            sorted(dense_grad_name, cmp_fc))
-
-    def get_desc(self):
-        """
-        Return downpour worker program_desc
-        """
-        return self._worker
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
deleted file mode 100755
index 259b713903e3f98206e9c32d622d206eeb98bec6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ /dev/null
@@ -1,268 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ["DistributedAdam"]
-import paddle.fluid as fluid
-from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
-from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs
-from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs
-from google.protobuf import text_format
-from .node import DownpourWorker, DownpourServer
-from . import ps_pb2 as pslib
-
-
-class DistributedOptimizerImplBase(object):
-    def __init__(self, optimizer):
-        self._optimizer = optimizer
-        self._learning_rate = optimizer._learning_rate
-        self._regularization = optimizer.regularization
-
-    def minimize(self,
-                 losses,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        pass
-
-
-class DistributedAdam(DistributedOptimizerImplBase):
-    def __init__(self, optimizer):
-        # todo(guru4elephant): add more optimizers here as argument
-        # todo(guru4elephant): make learning_rate as a variable
-        super(DistributedAdam, self).__init__(optimizer)
-        self._window = 1
-        self.type = "downpour"
-        self.data_norm_name = [
-            ".batch_size", ".batch_square_sum", ".batch_sum",
-            ".batch_size@GRAD", ".batch_square_sum@GRAD", ".batch_sum@GRAD"
-        ]
-
-    def _find_distributed_lookup_table_inputs(self, program, table_names):
-        """
-        Find input variable of distribute lookup table in program.
-        We could support multi-distribute table now.
-        Args:
-        program(Program): given program, locate distributed lookup table
-        table_name(str): given table names that is found beforehand
-        Returns:
-        inputs
-        """
-        local_vars = program.current_block().vars
-        inputs_dict = dict()
-        for table_name in table_names:
-            inputs_dict[table_name] = []
-
-        for op in program.global_block().ops:
-            if op.type == "lookup_table":
-                if op.input("W")[0] in table_names:
-                    inputs_dict[op.input("W")[0]].extend(
-                        [local_vars[name] for name in op.input("Ids")])
-        return inputs_dict
-
-    def _find_distributed_lookup_table_outputs(self, program, table_names):
-        """
-        Find output variable of distribute lookup table in program.
-        We could support multi-distribute table now.
-        Args:
-        program(Program): given program, locate distributed lookup table
-        table_name(str): given table name that is found beforehand
-        Returns:
-        outputs
-        """
-        local_vars = program.current_block().vars
-        outputs_dict = dict()
-        for table_name in table_names:
-            outputs_dict[table_name] = []
-
-        for op in program.global_block().ops:
-            if op.type == "lookup_table":
-                if op.input("W")[0] in table_names:
-                    outputs_dict[op.input("W")[0]].extend(
-                        [local_vars[name] for name in op.output("Out")])
-        return outputs_dict
-
-    def _find_multi_distributed_lookup_table(self, losses):
-        """
-        find multi-sparse-table
-        """
-        table_names = set()
-        for loss in losses:
-            for op in loss.block.program.global_block().ops:
-                if op.type == "lookup_table":
-                    if op.attr('is_distributed') is True:
-                        table_name = op.input("W")[0]
-                        table_names.add(table_name)
-        return list(table_names)
-
-    def _minimize(self,
-                  losses,
-                  startup_program=None,
-                  parameter_list=None,
-                  no_grad_set=None,
-                  strategy={}):
-        """
-        DownpounSGD is a distributed optimizer so
-        that user can call minimize to generate backward
-        operators and optimization operators within minmize function
-        Args:
-            loss(Variable): loss variable defined by user
-            startup_program(Program): startup program that defined by user
-            parameter_list(str list): parameter names defined by users
-            no_grad_set(set): a set of variables that is defined by users
-            so that these variables do not need gradient computation
-            strategy(dict): user-defined properties
-        Returns:
-            [optimize_ops, grads_and_weights]
-        """
-
-        sparse_table_names = self._find_multi_distributed_lookup_table(losses)
-        inputs_dict = self._find_distributed_lookup_table_inputs(
-            losses[0].block.program, sparse_table_names)
-
-        outputs_dict = self._find_distributed_lookup_table_outputs(
-            losses[0].block.program, sparse_table_names)
-
-        ps_param = pslib.PSParameter()
-        server = DownpourServer()
-        worker = DownpourWorker(self._window)
-        # if user specify a fleet_desc.prototxt file, then load the file
-        # instead of creating default fleet_desc.prototxt.
-        # user can specify server_param or trainer_param or fs_client_param.
-        if strategy.get("fleet_desc_file") is not None:
-            fleet_desc_file = strategy["fleet_desc_file"]
-            with open(fleet_desc_file) as f:
-                text_format.Merge(f.read(), ps_param)
-            server.get_desc().CopyFrom(ps_param.server_param)
-            worker.get_desc().CopyFrom(ps_param.trainer_param[0])
-
-        sparse_table_index = 0
-        for tn in sparse_table_names:
-            if strategy.get(tn) is not None:
-                server.add_sparse_table(sparse_table_index, strategy[tn])
-            else:
-                server.add_sparse_table(sparse_table_index, None)
-            worker.add_sparse_table(sparse_table_index, inputs_dict[tn],
-                                    outputs_dict[tn])
-            sparse_table_index += 1
-
-        dense_start_table_id = sparse_table_index
-        dense_table_index = sparse_table_index
-        program_configs = {}
-        param_grads_list = []
-
-        for loss_index in range(len(losses)):
-            program_id = str(id(losses[loss_index].block.program))
-            program_configs[program_id] = {
-                "pull_sparse":
-                [t_index for t_index in range(sparse_table_index)],
-                "push_sparse":
-                [t_index for t_index in range(sparse_table_index)]
-            }
-
-            params_grads = sorted(
-                fluid.backward.append_backward(losses[loss_index],
-                                               parameter_list, no_grad_set),
-                key=lambda x: x[0].name)
-            param_grads_list.append(params_grads)
-            params = []
-            grads = []
-            data_norm_params = []
-            data_norm_grads = []
-            for i in params_grads:
-                is_data_norm_data = False
-                for data_norm_name in self.data_norm_name:
-                    if i[0].name.endswith(data_norm_name):
-                        is_data_norm_data = True
-                        data_norm_params.append(i[0])
-                if not is_data_norm_data:
-                    params.append(i[0])
-
-            for i in params_grads:
-                is_data_norm_data = False
-                for data_norm_grad in self.data_norm_name:
-                    if i[0].name.endswith(data_norm_grad):
-                        is_data_norm_data = True
-                        data_norm_grads.append(i[1])
-                if not is_data_norm_data:
-                    grads.append(i[1])
-
-            if strategy.get('dense_table') is not None:
-                server.add_dense_table(dense_table_index, params, grads,
-                                       strategy['dense_table'],
-                                       sparse_table_names)
-            else:
-                server.add_dense_table(dense_table_index, params, grads, None,
-                                       sparse_table_names)
-            worker.add_dense_table(dense_table_index, self._learning_rate,
-                                   params, grads, dense_start_table_id,
-                                   sparse_table_names)
-            program_configs[program_id]["pull_dense"] = [dense_table_index]
-            program_configs[program_id]["push_dense"] = [dense_table_index]
-            if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
-                dense_table_index += 1
-                if strategy.get('datanorm_table') is not None:
-                    server.add_data_norm_table(
-                        dense_table_index, self._learning_rate,
-                        data_norm_params, data_norm_grads,
-                        strategy['datanorm_table'], sparse_table_names)
-                else:
-                    server.add_data_norm_table(
-                        dense_table_index, self._learning_rate,
-                        data_norm_params, data_norm_grads, None,
-                        sparse_table_names)
-
-                worker.add_dense_table(dense_table_index, self._learning_rate,
-                                       data_norm_params, data_norm_grads,
-                                       dense_start_table_id, sparse_table_names)
-                program_configs[program_id]["pull_dense"].extend(
-                    [dense_table_index])
-                program_configs[program_id]["push_dense"].extend(
-                    [dense_table_index])
-            dense_table_index += 1
-        ps_param.server_param.CopyFrom(server.get_desc())
-        if len(ps_param.trainer_param) == 0:
-            tp = ps_param.trainer_param.add()
-            tp.CopyFrom(worker.get_desc())
-        else:
-            ps_param.trainer_param[0].CopyFrom(worker.get_desc())
-        # Todo(guru4elephant): figure out how to support more sparse parameters
-        # currently only support lookup_table
-        worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
-        if len(ps_param.trainer_param[0].skip_op) == 0:
-            ps_param.trainer_param[0].skip_op.extend(worker_skipped_ops)
-
-        opt_info = {}
-        opt_info["program_configs"] = program_configs
-        opt_info["trainer"] = "DistMultiTrainer"
-        opt_info["device_worker"] = "DownpourSGD"
-        opt_info["optimizer"] = "DownpourSGD"
-        opt_info["fleet_desc"] = ps_param
-        opt_info["worker_skipped_ops"] = worker_skipped_ops
-        opt_info["use_cvm"] = strategy.get("use_cvm", False)
-        opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
-        opt_info["scale_datanorm"] = strategy.get("scale_datanorm", -1)
-        opt_info["dump_slot"] = False
-        opt_info["dump_converter"] = ""
-        opt_info["dump_fields"] = strategy.get("dump_fields", [])
-        opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "")
-        opt_info["user_define_dump_filename"] = strategy.get("user_define_dump_filename", "")
-        if server._server.downpour_server_param.downpour_table_param[
-                0].accessor.accessor_class == "DownpourCtrAccessor":
-            opt_info["dump_slot"] = True
-        opt_info["adjust_ins_weight"] = strategy.get("adjust_ins_weight", {})
-
-        for loss in losses:
-            loss.block.program._fleet_opt = opt_info
-
-        return None, param_grads_list[0], opt_info
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
deleted file mode 100644
index d515a870fbccd00501a04102378ff4185276680b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
+++ /dev/null
@@ -1,1942 +0,0 @@
-# Generated by the protocol buffer compiler.  DO NOT EDIT!
-# source: ps.proto
-
-import sys
-_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
-from google.protobuf.internal import enum_type_wrapper
-from google.protobuf import descriptor as _descriptor
-from google.protobuf import message as _message
-from google.protobuf import reflection as _reflection
-from google.protobuf import symbol_database as _symbol_database
-from google.protobuf import descriptor_pb2
-# @@protoc_insertion_point(imports)
-
-_sym_db = _symbol_database.Default()
-
-
-
-
-DESCRIPTOR = _descriptor.FileDescriptor(
-  name='ps.proto',
-  package='paddle',
-  syntax='proto2',
-  serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\xb5\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12\x15\n\x0binit_gflags\x18\x04 \x01(\t:\x00\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x03(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xfd\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\x12-\n\x0eprogram_config\x18\x06 \x03(\x0b\x32\x15.paddle.ProgramConfig\"\x99\x01\n\rProgramConfig\x12\x12\n\nprogram_id\x18\x01 \x02(\t\x12\x1c\n\x14push_sparse_table_id\x18\x02 \x03(\x05\x12\x1b\n\x13push_dense_table_id\x18\x03 \x03(\x05\x12\x1c\n\x14pull_sparse_table_id\x18\x04 \x03(\x05\x12\x1b\n\x13pull_dense_table_id\x18\x05 \x03(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 
\x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x02\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x17\n\tshard_num\x18\x03 \x01(\x04:\x04\x31\x30\x30\x30\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\'\n\x19\x65nable_sparse_table_cache\x18\x07 \x01(\x08:\x04true\x12\'\n\x17sparse_table_cache_rate\x18\x08 \x01(\x01:\x06\x30.0025\x12\'\n\x1bsparse_table_cache_file_num\x18\t \x01(\r:\x02\x31\x36\"\xc1\x04\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x13\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r:\x02\x31\x31\x12\x15\n\nembedx_dim\x18\x05 \x01(\r:\x01\x38\x12\x1c\n\x10\x65mbedx_threshold\x18\x06 \x01(\r:\x02\x31\x30\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\x12\x44\n\x16sparse_commonsgd_param\x18\t \x01(\x0b\x32$.paddle.SparseCommonSGDRuleParameter\x12=\n\x0f\x65mbed_sgd_param\x18\n 
\x01(\x0b\x32$.paddle.SparseCommonSGDRuleParameter\x12>\n\x10\x65mbedx_sgd_param\x18\x0b \x01(\x0b\x32$.paddle.SparseCommonSGDRuleParameter\"\xba\x02\n\x1e\x44ownpourTableAccessorParameter\x12\x19\n\x0cnonclk_coeff\x18\x01 \x01(\x02:\x03\x30.1\x12\x16\n\x0b\x63lick_coeff\x18\x02 \x01(\x02:\x01\x31\x12\x1b\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02:\x03\x31.5\x12\x1d\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02:\x04\x30.25\x12\x1b\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02:\x02\x31\x36\x12#\n\x15show_click_decay_rate\x18\x06 \x01(\x02:\x04\x30.98\x12\x1d\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02:\x03\x30.8\x12$\n\x18\x64\x65lete_after_unseen_days\x18\x08 \x01(\x02:\x02\x33\x30\x12\"\n\x17ssd_unseenday_threshold\x18\t \x01(\x05:\x01\x31\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"\x85\x01\n\x16SparseSGDRuleParameter\x12\x1b\n\rlearning_rate\x18\x01 \x01(\x01:\x04\x30.05\x12\x18\n\rinitial_g2sum\x18\x02 \x01(\x01:\x01\x33\x12\x1d\n\rinitial_range\x18\x03 \x01(\x01:\x06\x30.0001\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xc6\x01\n\x1cSparseCommonSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x32\n\x05naive\x18\x02 \x01(\x0b\x32#.paddle.SparseNaiveSGDRuleParameter\x12\x36\n\x07\x61\x64\x61grad\x18\x03 \x01(\x0b\x32%.paddle.SparseAdagradSGDRuleParameter\x12,\n\x04\x61\x64\x61m\x18\x04 \x01(\x0b\x32\x1e.paddle.SparseAdamSGDParameter\"p\n\x1bSparseNaiveSGDRuleParameter\x12\x1b\n\rlearning_rate\x18\x01 \x01(\x01:\x04\x30.05\x12\x1d\n\rinitial_range\x18\x02 \x01(\x01:\x06\x30.0001\x12\x15\n\rweight_bounds\x18\x03 \x03(\x02\"\x8c\x01\n\x1dSparseAdagradSGDRuleParameter\x12\x1b\n\rlearning_rate\x18\x01 
\x01(\x01:\x04\x30.05\x12\x18\n\rinitial_g2sum\x18\x02 \x01(\x01:\x01\x33\x12\x1d\n\rinitial_range\x18\x03 \x01(\x01:\x06\x30.0001\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xc8\x01\n\x16SparseAdamSGDParameter\x12\x1c\n\rlearning_rate\x18\x01 \x01(\x01:\x05\x30.001\x12\x1d\n\rinitial_range\x18\x02 \x01(\x01:\x06\x30.0001\x12\x1d\n\x10\x62\x65ta1_decay_rate\x18\x03 \x01(\x01:\x03\x30.9\x12\x1f\n\x10\x62\x65ta2_decay_rate\x18\x04 \x01(\x01:\x05\x30.999\x12\x1a\n\x0b\x61\x64\x61_epsilon\x18\x05 \x01(\x01:\x05\x31\x65-08\x12\x15\n\rweight_bounds\x18\x06 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\xac\x01\n\x10\x41\x64\x61mSGDParameter\x12\x1c\n\rlearning_rate\x18\x01 \x01(\x01:\x05\x35\x65-06\x12 \n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01:\x08\x30.999993\x12\x1e\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01:\x06\x30.9999\x12\x1a\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01:\x05\x31\x65-08\x12\x1c\n\x0emom_decay_rate\x18\x05 \x01(\x01:\x04\x30.99\"J\n\x11NaiveSGDParameter\x12\x1d\n\rlearning_rate\x18\x01 \x01(\x01:\x06\x30.0002\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 
\x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xba\x04\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x12\x1b\n\x17PS_SAVE_ONE_CACHE_TABLE\x10\r\x12\x1a\n\x16PS_GET_CACHE_THRESHOLD\x10\x0e\x12\x14\n\x10PS_CACHE_SHUFFLE\x10\x0f\x12\x11\n\rPS_COPY_TABLE\x10\x10\x12\x1c\n\x18PS_COPY_TABLE_BY_FEASIGN\x10\x11\x12(\n$PS_PULL_SPARSE_TABLE_WITH_DEPENDENCY\x10\x12\x12(\n$PS_PUSH_SPARSE_TABLE_WITH_DEPENDENCY\x10\x13\x12\x17\n\x13PS_PRINT_TABLE_STAT\x10\x14\x12\x0e\n\nPS_S2S_MSG\x10\x65\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x06\x80\x01\x01\xf8\x01\x01')
-)
-_sym_db.RegisterFileDescriptor(DESCRIPTOR)
-
-_TABLETYPE = _descriptor.EnumDescriptor(
-  name='TableType',
-  full_name='paddle.TableType',
-  filename=None,
-  file=DESCRIPTOR,
-  values=[
-    _descriptor.EnumValueDescriptor(
-      name='PS_SPARSE_TABLE', index=0, number=0,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_DENSE_TABLE', index=1, number=1,
-      options=None,
-      type=None),
-  ],
-  containing_type=None,
-  options=None,
-  serialized_start=4678,
-  serialized_end=4730,
-)
-_sym_db.RegisterEnumDescriptor(_TABLETYPE)
-
-TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE)
-_PSCMDID = _descriptor.EnumDescriptor(
-  name='PsCmdID',
-  full_name='paddle.PsCmdID',
-  filename=None,
-  file=DESCRIPTOR,
-  values=[
-    _descriptor.EnumValueDescriptor(
-      name='PS_PULL_DENSE_TABLE', index=0, number=0,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PUSH_DENSE_TABLE', index=1, number=1,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PULL_SPARSE_TABLE', index=2, number=2,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PUSH_SPARSE_TABLE', index=3, number=3,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_SHRINK_TABLE', index=4, number=4,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_SAVE_ONE_TABLE', index=5, number=5,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_SAVE_ALL_TABLE', index=6, number=6,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_LOAD_ONE_TABLE', index=7, number=7,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_LOAD_ALL_TABLE', index=8, number=8,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_CLEAR_ONE_TABLE', index=9, number=9,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_CLEAR_ALL_TABLE', index=10, number=10,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PUSH_DENSE_PARAM', index=11, number=11,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_STOP_SERVER', index=12, number=12,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_SAVE_ONE_CACHE_TABLE', index=13, number=13,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_GET_CACHE_THRESHOLD', index=14, number=14,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_CACHE_SHUFFLE', index=15, number=15,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_COPY_TABLE', index=16, number=16,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_COPY_TABLE_BY_FEASIGN', index=17, number=17,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PULL_SPARSE_TABLE_WITH_DEPENDENCY', index=18, number=18,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PUSH_SPARSE_TABLE_WITH_DEPENDENCY', index=19, number=19,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_PRINT_TABLE_STAT', index=20, number=20,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PS_S2S_MSG', index=21, number=101,
-      options=None,
-      type=None),
-  ],
-  containing_type=None,
-  options=None,
-  serialized_start=4733,
-  serialized_end=5303,
-)
-_sym_db.RegisterEnumDescriptor(_PSCMDID)
-
-PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID)
-PS_SPARSE_TABLE = 0
-PS_DENSE_TABLE = 1
-PS_PULL_DENSE_TABLE = 0
-PS_PUSH_DENSE_TABLE = 1
-PS_PULL_SPARSE_TABLE = 2
-PS_PUSH_SPARSE_TABLE = 3
-PS_SHRINK_TABLE = 4
-PS_SAVE_ONE_TABLE = 5
-PS_SAVE_ALL_TABLE = 6
-PS_LOAD_ONE_TABLE = 7
-PS_LOAD_ALL_TABLE = 8
-PS_CLEAR_ONE_TABLE = 9
-PS_CLEAR_ALL_TABLE = 10
-PS_PUSH_DENSE_PARAM = 11
-PS_STOP_SERVER = 12
-PS_SAVE_ONE_CACHE_TABLE = 13
-PS_GET_CACHE_THRESHOLD = 14
-PS_CACHE_SHUFFLE = 15
-PS_COPY_TABLE = 16
-PS_COPY_TABLE_BY_FEASIGN = 17
-PS_PULL_SPARSE_TABLE_WITH_DEPENDENCY = 18
-PS_PUSH_SPARSE_TABLE_WITH_DEPENDENCY = 19
-PS_PRINT_TABLE_STAT = 20
-PS_S2S_MSG = 101
-
-
-_FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor(
-  name='FsApiType',
-  full_name='paddle.FsClientParameter.FsApiType',
-  filename=None,
-  file=DESCRIPTOR,
-  values=[
-    _descriptor.EnumValueDescriptor(
-      name='HDFS', index=0, number=0,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='AFS', index=1, number=1,
-      options=None,
-      type=None),
-  ],
-  containing_type=None,
-  options=None,
-  serialized_start=4646,
-  serialized_end=4676,
-)
-_sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE)
-
-
-_PSPARAMETER = _descriptor.Descriptor(
-  name='PSParameter',
-  full_name='paddle.PSParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='worker_class', full_name='paddle.PSParameter.worker_class', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='server_class', full_name='paddle.PSParameter.server_class', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='instance_class', full_name='paddle.PSParameter.instance_class', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='init_gflags', full_name='paddle.PSParameter.init_gflags', index=3,
-      number=4, type=9, cpp_type=9, label=1,
-      has_default_value=True, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='worker_param', full_name='paddle.PSParameter.worker_param', index=4,
-      number=101, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='server_param', full_name='paddle.PSParameter.server_param', index=5,
-      number=102, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='trainer_param', full_name='paddle.PSParameter.trainer_param', index=6,
-      number=301, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=7,
-      number=501, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=21,
-  serialized_end=330,
-)
-
-
-_WORKERPARAMETER = _descriptor.Descriptor(
-  name='WorkerParameter',
-  full_name='paddle.WorkerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='downpour_worker_param', full_name='paddle.WorkerParameter.downpour_worker_param', index=0,
-      number=1, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=332,
-  serialized_end=413,
-)
-
-
-_SERVERPARAMETER = _descriptor.Descriptor(
-  name='ServerParameter',
-  full_name='paddle.ServerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='downpour_server_param', full_name='paddle.ServerParameter.downpour_server_param', index=0,
-      number=1, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=415,
-  serialized_end=496,
-)
-
-
-_DOWNPOURWORKERPARAMETER = _descriptor.Descriptor(
-  name='DownpourWorkerParameter',
-  full_name='paddle.DownpourWorkerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='downpour_table_param', full_name='paddle.DownpourWorkerParameter.downpour_table_param', index=0,
-      number=1, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=498,
-  serialized_end=577,
-)
-
-
-_DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor(
-  name='DownpourTrainerParameter',
-  full_name='paddle.DownpourTrainerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='dense_table', full_name='paddle.DownpourTrainerParameter.dense_table', index=0,
-      number=1, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_table', full_name='paddle.DownpourTrainerParameter.sparse_table', index=1,
-      number=2, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='push_sparse_per_batch', full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', index=2,
-      number=3, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='push_dense_per_batch', full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', index=3,
-      number=4, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='skip_op', full_name='paddle.DownpourTrainerParameter.skip_op', index=4,
-      number=5, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='program_config', full_name='paddle.DownpourTrainerParameter.program_config', index=5,
-      number=6, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=580,
-  serialized_end=833,
-)
-
-
-_PROGRAMCONFIG = _descriptor.Descriptor(
-  name='ProgramConfig',
-  full_name='paddle.ProgramConfig',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='program_id', full_name='paddle.ProgramConfig.program_id', index=0,
-      number=1, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='push_sparse_table_id', full_name='paddle.ProgramConfig.push_sparse_table_id', index=1,
-      number=2, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='push_dense_table_id', full_name='paddle.ProgramConfig.push_dense_table_id', index=2,
-      number=3, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='pull_sparse_table_id', full_name='paddle.ProgramConfig.pull_sparse_table_id', index=3,
-      number=4, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='pull_dense_table_id', full_name='paddle.ProgramConfig.pull_dense_table_id', index=4,
-      number=5, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=836,
-  serialized_end=989,
-)
-
-
-_DENSETABLEPARAMETER = _descriptor.Descriptor(
-  name='DenseTableParameter',
-  full_name='paddle.DenseTableParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='table_id', full_name='paddle.DenseTableParameter.table_id', index=0,
-      number=1, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='dense_variable_name', full_name='paddle.DenseTableParameter.dense_variable_name', index=1,
-      number=2, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='dense_gradient_variable_name', full_name='paddle.DenseTableParameter.dense_gradient_variable_name', index=2,
-      number=3, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='fea_dim', full_name='paddle.DenseTableParameter.fea_dim', index=3,
-      number=4, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=991,
-  serialized_end=1114,
-)
-
-
-_SPARSETABLEPARAMETER = _descriptor.Descriptor(
-  name='SparseTableParameter',
-  full_name='paddle.SparseTableParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='table_id', full_name='paddle.SparseTableParameter.table_id', index=0,
-      number=1, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='feature_dim', full_name='paddle.SparseTableParameter.feature_dim', index=1,
-      number=2, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='slot_key', full_name='paddle.SparseTableParameter.slot_key', index=2,
-      number=3, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='slot_value', full_name='paddle.SparseTableParameter.slot_value', index=3,
-      number=4, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='slot_gradient', full_name='paddle.SparseTableParameter.slot_gradient', index=4,
-      number=5, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1116,
-  serialized_end=1238,
-)
-
-
-_DOWNPOURSERVERPARAMETER = _descriptor.Descriptor(
-  name='DownpourServerParameter',
-  full_name='paddle.DownpourServerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='downpour_table_param', full_name='paddle.DownpourServerParameter.downpour_table_param', index=0,
-      number=1, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='service_param', full_name='paddle.DownpourServerParameter.service_param', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1241,
-  serialized_end=1375,
-)
-
-
-_SERVERSERVICEPARAMETER = _descriptor.Descriptor(
-  name='ServerServiceParameter',
-  full_name='paddle.ServerServiceParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=True, default_value=_b("DownpourBrpcPsServer").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=True, default_value=_b("DownpourBrpcPsClient").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=True, default_value=_b("DownpourPsService").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='start_server_port', full_name='paddle.ServerServiceParameter.start_server_port', index=3,
-      number=4, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='server_thread_num', full_name='paddle.ServerServiceParameter.server_thread_num', index=4,
-      number=5, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=12,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1378,
-  serialized_end=1593,
-)
-
-
-_TABLEPARAMETER = _descriptor.Descriptor(
-  name='TableParameter',
-  full_name='paddle.TableParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='table_id', full_name='paddle.TableParameter.table_id', index=0,
-      number=1, type=4, cpp_type=4, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='table_class', full_name='paddle.TableParameter.table_class', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='shard_num', full_name='paddle.TableParameter.shard_num', index=2,
-      number=3, type=4, cpp_type=4, label=1,
-      has_default_value=True, default_value=1000,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='accessor', full_name='paddle.TableParameter.accessor', index=3,
-      number=4, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='type', full_name='paddle.TableParameter.type', index=4,
-      number=5, type=14, cpp_type=8, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='compress_in_save', full_name='paddle.TableParameter.compress_in_save', index=5,
-      number=6, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='enable_sparse_table_cache', full_name='paddle.TableParameter.enable_sparse_table_cache', index=6,
-      number=7, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=True,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_table_cache_rate', full_name='paddle.TableParameter.sparse_table_cache_rate', index=7,
-      number=8, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0025),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_table_cache_file_num', full_name='paddle.TableParameter.sparse_table_cache_file_num', index=8,
-      number=9, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=16,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1596,
-  serialized_end=1915,
-)
-
-
-_TABLEACCESSORPARAMETER = _descriptor.Descriptor(
-  name='TableAccessorParameter',
-  full_name='paddle.TableAccessorParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='accessor_class', full_name='paddle.TableAccessorParameter.accessor_class', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_sgd_param', full_name='paddle.TableAccessorParameter.sparse_sgd_param', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='dense_sgd_param', full_name='paddle.TableAccessorParameter.dense_sgd_param', index=2,
-      number=3, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='fea_dim', full_name='paddle.TableAccessorParameter.fea_dim', index=3,
-      number=4, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=11,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='embedx_dim', full_name='paddle.TableAccessorParameter.embedx_dim', index=4,
-      number=5, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=8,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='embedx_threshold', full_name='paddle.TableAccessorParameter.embedx_threshold', index=5,
-      number=6, type=13, cpp_type=3, label=1,
-      has_default_value=True, default_value=10,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='downpour_accessor_param', full_name='paddle.TableAccessorParameter.downpour_accessor_param', index=6,
-      number=7, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='table_accessor_save_param', full_name='paddle.TableAccessorParameter.table_accessor_save_param', index=7,
-      number=8, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_commonsgd_param', full_name='paddle.TableAccessorParameter.sparse_commonsgd_param', index=8,
-      number=9, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='embed_sgd_param', full_name='paddle.TableAccessorParameter.embed_sgd_param', index=9,
-      number=10, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='embedx_sgd_param', full_name='paddle.TableAccessorParameter.embedx_sgd_param', index=10,
-      number=11, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1918,
-  serialized_end=2495,
-)
-
-
-_DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
-  name='DownpourTableAccessorParameter',
-  full_name='paddle.DownpourTableAccessorParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='nonclk_coeff', full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', index=0,
-      number=1, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(0.1),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='click_coeff', full_name='paddle.DownpourTableAccessorParameter.click_coeff', index=1,
-      number=2, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(1),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='base_threshold', full_name='paddle.DownpourTableAccessorParameter.base_threshold', index=2,
-      number=3, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(1.5),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='delta_threshold', full_name='paddle.DownpourTableAccessorParameter.delta_threshold', index=3,
-      number=4, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(0.25),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='delta_keep_days', full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', index=4,
-      number=5, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(16),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='show_click_decay_rate', full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', index=5,
-      number=6, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(0.98),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='delete_threshold', full_name='paddle.DownpourTableAccessorParameter.delete_threshold', index=6,
-      number=7, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(0.8),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='delete_after_unseen_days', full_name='paddle.DownpourTableAccessorParameter.delete_after_unseen_days', index=7,
-      number=8, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=float(30),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ssd_unseenday_threshold', full_name='paddle.DownpourTableAccessorParameter.ssd_unseenday_threshold', index=8,
-      number=9, type=5, cpp_type=1, label=1,
-      has_default_value=True, default_value=1,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2498,
-  serialized_end=2812,
-)
-
-
-_TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor(
-  name='TableAccessorSaveParameter',
-  full_name='paddle.TableAccessorSaveParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='param', full_name='paddle.TableAccessorSaveParameter.param', index=0,
-      number=1, type=13, cpp_type=3, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='converter', full_name='paddle.TableAccessorSaveParameter.converter', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='deconverter', full_name='paddle.TableAccessorSaveParameter.deconverter', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2814,
-  serialized_end=2897,
-)
-
-
-_PSREQUESTMESSAGE = _descriptor.Descriptor(
-  name='PsRequestMessage',
-  full_name='paddle.PsRequestMessage',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='cmd_id', full_name='paddle.PsRequestMessage.cmd_id', index=0,
-      number=1, type=13, cpp_type=3, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='table_id', full_name='paddle.PsRequestMessage.table_id', index=1,
-      number=2, type=13, cpp_type=3, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='params', full_name='paddle.PsRequestMessage.params', index=2,
-      number=3, type=12, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='client_id', full_name='paddle.PsRequestMessage.client_id', index=3,
-      number=4, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='data', full_name='paddle.PsRequestMessage.data', index=4,
-      number=5, type=12, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b(""),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2899,
-  serialized_end=3000,
-)
-
-
-_SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
-  name='SparseSGDRuleParameter',
-  full_name='paddle.SparseSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.SparseSGDRuleParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.05),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_g2sum', full_name='paddle.SparseSGDRuleParameter.initial_g2sum', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(3),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_range', full_name='paddle.SparseSGDRuleParameter.initial_range', index=2,
-      number=3, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='weight_bounds', full_name='paddle.SparseSGDRuleParameter.weight_bounds', index=3,
-      number=4, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3003,
-  serialized_end=3136,
-)
-
-
-_SPARSECOMMONSGDRULEPARAMETER = _descriptor.Descriptor(
-  name='SparseCommonSGDRuleParameter',
-  full_name='paddle.SparseCommonSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='name', full_name='paddle.SparseCommonSGDRuleParameter.name', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='naive', full_name='paddle.SparseCommonSGDRuleParameter.naive', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='adagrad', full_name='paddle.SparseCommonSGDRuleParameter.adagrad', index=2,
-      number=3, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='adam', full_name='paddle.SparseCommonSGDRuleParameter.adam', index=3,
-      number=4, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3139,
-  serialized_end=3337,
-)
-
-
-_SPARSENAIVESGDRULEPARAMETER = _descriptor.Descriptor(
-  name='SparseNaiveSGDRuleParameter',
-  full_name='paddle.SparseNaiveSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.SparseNaiveSGDRuleParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.05),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_range', full_name='paddle.SparseNaiveSGDRuleParameter.initial_range', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='weight_bounds', full_name='paddle.SparseNaiveSGDRuleParameter.weight_bounds', index=2,
-      number=3, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3339,
-  serialized_end=3451,
-)
-
-
-_SPARSEADAGRADSGDRULEPARAMETER = _descriptor.Descriptor(
-  name='SparseAdagradSGDRuleParameter',
-  full_name='paddle.SparseAdagradSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.SparseAdagradSGDRuleParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.05),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_g2sum', full_name='paddle.SparseAdagradSGDRuleParameter.initial_g2sum', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(3),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_range', full_name='paddle.SparseAdagradSGDRuleParameter.initial_range', index=2,
-      number=3, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='weight_bounds', full_name='paddle.SparseAdagradSGDRuleParameter.weight_bounds', index=3,
-      number=4, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3454,
-  serialized_end=3594,
-)
-
-
-_SPARSEADAMSGDPARAMETER = _descriptor.Descriptor(
-  name='SparseAdamSGDParameter',
-  full_name='paddle.SparseAdamSGDParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.SparseAdamSGDParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_range', full_name='paddle.SparseAdamSGDParameter.initial_range', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0001),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='beta1_decay_rate', full_name='paddle.SparseAdamSGDParameter.beta1_decay_rate', index=2,
-      number=3, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.9),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='beta2_decay_rate', full_name='paddle.SparseAdamSGDParameter.beta2_decay_rate', index=3,
-      number=4, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.999),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ada_epsilon', full_name='paddle.SparseAdamSGDParameter.ada_epsilon', index=4,
-      number=5, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(1e-08),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='weight_bounds', full_name='paddle.SparseAdamSGDParameter.weight_bounds', index=5,
-      number=6, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3597,
-  serialized_end=3797,
-)
-
-
-_DENSESGDRULEPARAMETER = _descriptor.Descriptor(
-  name='DenseSGDRuleParameter',
-  full_name='paddle.DenseSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='name', full_name='paddle.DenseSGDRuleParameter.name', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='adam', full_name='paddle.DenseSGDRuleParameter.adam', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='naive', full_name='paddle.DenseSGDRuleParameter.naive', index=2,
-      number=3, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='summary', full_name='paddle.DenseSGDRuleParameter.summary', index=3,
-      number=4, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='moving_average', full_name='paddle.DenseSGDRuleParameter.moving_average', index=4,
-      number=5, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=3800,
-  serialized_end=4025,
-)
-
-
-_ADAMSGDPARAMETER = _descriptor.Descriptor(
-  name='AdamSGDParameter',
-  full_name='paddle.AdamSGDParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.AdamSGDParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(5e-06),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='avg_decay_rate', full_name='paddle.AdamSGDParameter.avg_decay_rate', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.999993),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ada_decay_rate', full_name='paddle.AdamSGDParameter.ada_decay_rate', index=2,
-      number=3, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.9999),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ada_epsilon', full_name='paddle.AdamSGDParameter.ada_epsilon', index=3,
-      number=4, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(1e-08),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='mom_decay_rate', full_name='paddle.AdamSGDParameter.mom_decay_rate', index=4,
-      number=5, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.99),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4028,
-  serialized_end=4200,
-)
-
-
-_NAIVESGDPARAMETER = _descriptor.Descriptor(
-  name='NaiveSGDParameter',
-  full_name='paddle.NaiveSGDParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.NaiveSGDParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.0002),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='avg_decay_rate', full_name='paddle.NaiveSGDParameter.avg_decay_rate', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4202,
-  serialized_end=4276,
-)
-
-
-_SUMMARYSGDPARAMETER = _descriptor.Descriptor(
-  name='SummarySGDParameter',
-  full_name='paddle.SummarySGDParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='summary_decay_rate', full_name='paddle.SummarySGDParameter.summary_decay_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0.999999),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4278,
-  serialized_end=4337,
-)
-
-
-_MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor(
-  name='MovingAverageRuleParameter',
-  full_name='paddle.MovingAverageRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='momentum', full_name='paddle.MovingAverageRuleParameter.momentum', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4339,
-  serialized_end=4385,
-)
-
-
-_PSRESPONSEMESSAGE = _descriptor.Descriptor(
-  name='PsResponseMessage',
-  full_name='paddle.PsResponseMessage',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='err_code', full_name='paddle.PsResponseMessage.err_code', index=0,
-      number=1, type=5, cpp_type=1, label=2,
-      has_default_value=True, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='err_msg', full_name='paddle.PsResponseMessage.err_msg', index=1,
-      number=2, type=9, cpp_type=9, label=2,
-      has_default_value=True, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='data', full_name='paddle.PsResponseMessage.data', index=2,
-      number=3, type=12, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b(""),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4387,
-  serialized_end=4460,
-)
-
-
-_FSCLIENTPARAMETER = _descriptor.Descriptor(
-  name='FsClientParameter',
-  full_name='paddle.FsClientParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='fs_type', full_name='paddle.FsClientParameter.fs_type', index=0,
-      number=1, type=14, cpp_type=8, label=1,
-      has_default_value=True, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='uri', full_name='paddle.FsClientParameter.uri', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='user', full_name='paddle.FsClientParameter.user', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='passwd', full_name='paddle.FsClientParameter.passwd', index=3,
-      number=4, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='buffer_size', full_name='paddle.FsClientParameter.buffer_size', index=4,
-      number=5, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='hadoop_bin', full_name='paddle.FsClientParameter.hadoop_bin', index=5,
-      number=51, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='afs_conf', full_name='paddle.FsClientParameter.afs_conf', index=6,
-      number=101, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-    _FSCLIENTPARAMETER_FSAPITYPE,
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=4463,
-  serialized_end=4676,
-)
-
-_PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER
-_PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER
-_PSPARAMETER.fields_by_name['trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER
-_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER
-_WORKERPARAMETER.fields_by_name['downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER
-_SERVERPARAMETER.fields_by_name['downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER
-_DOWNPOURWORKERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER
-_DOWNPOURTRAINERPARAMETER.fields_by_name['dense_table'].message_type = _DENSETABLEPARAMETER
-_DOWNPOURTRAINERPARAMETER.fields_by_name['sparse_table'].message_type = _SPARSETABLEPARAMETER
-_DOWNPOURTRAINERPARAMETER.fields_by_name['program_config'].message_type = _PROGRAMCONFIG
-_DOWNPOURSERVERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER
-_DOWNPOURSERVERPARAMETER.fields_by_name['service_param'].message_type = _SERVERSERVICEPARAMETER
-_TABLEPARAMETER.fields_by_name['accessor'].message_type = _TABLEACCESSORPARAMETER
-_TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE
-_TABLEACCESSORPARAMETER.fields_by_name['sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['sparse_commonsgd_param'].message_type = _SPARSECOMMONSGDRULEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['embed_sgd_param'].message_type = _SPARSECOMMONSGDRULEPARAMETER
-_TABLEACCESSORPARAMETER.fields_by_name['embedx_sgd_param'].message_type = _SPARSECOMMONSGDRULEPARAMETER
-_SPARSECOMMONSGDRULEPARAMETER.fields_by_name['naive'].message_type = _SPARSENAIVESGDRULEPARAMETER
-_SPARSECOMMONSGDRULEPARAMETER.fields_by_name['adagrad'].message_type = _SPARSEADAGRADSGDRULEPARAMETER
-_SPARSECOMMONSGDRULEPARAMETER.fields_by_name['adam'].message_type = _SPARSEADAMSGDPARAMETER
-_DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER
-_DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER
-_DENSESGDRULEPARAMETER.fields_by_name['summary'].message_type = _SUMMARYSGDPARAMETER
-_DENSESGDRULEPARAMETER.fields_by_name['moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER
-_FSCLIENTPARAMETER.fields_by_name['fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE
-_FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER
-DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER
-DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER
-DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER
-DESCRIPTOR.message_types_by_name['DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER
-DESCRIPTOR.message_types_by_name['DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER
-DESCRIPTOR.message_types_by_name['ProgramConfig'] = _PROGRAMCONFIG
-DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER
-DESCRIPTOR.message_types_by_name['DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER
-DESCRIPTOR.message_types_by_name['ServerServiceParameter'] = _SERVERSERVICEPARAMETER
-DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER
-DESCRIPTOR.message_types_by_name['TableAccessorParameter'] = _TABLEACCESSORPARAMETER
-DESCRIPTOR.message_types_by_name['DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER
-DESCRIPTOR.message_types_by_name['TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER
-DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE
-DESCRIPTOR.message_types_by_name['SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseCommonSGDRuleParameter'] = _SPARSECOMMONSGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseNaiveSGDRuleParameter'] = _SPARSENAIVESGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseAdagradSGDRuleParameter'] = _SPARSEADAGRADSGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['SparseAdamSGDParameter'] = _SPARSEADAMSGDPARAMETER
-DESCRIPTOR.message_types_by_name['DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER
-DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER
-DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER
-DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER
-DESCRIPTOR.message_types_by_name['MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER
-DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE
-DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER
-DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE
-DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID
-
-PSParameter = _reflection.GeneratedProtocolMessageType('PSParameter', (_message.Message,), dict(
-  DESCRIPTOR = _PSPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.PSParameter)
-  ))
-_sym_db.RegisterMessage(PSParameter)
-
-WorkerParameter = _reflection.GeneratedProtocolMessageType('WorkerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _WORKERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.WorkerParameter)
-  ))
-_sym_db.RegisterMessage(WorkerParameter)
-
-ServerParameter = _reflection.GeneratedProtocolMessageType('ServerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SERVERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.ServerParameter)
-  ))
-_sym_db.RegisterMessage(ServerParameter)
-
-DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType('DownpourWorkerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DOWNPOURWORKERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter)
-  ))
-_sym_db.RegisterMessage(DownpourWorkerParameter)
-
-DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType('DownpourTrainerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DOWNPOURTRAINERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter)
-  ))
-_sym_db.RegisterMessage(DownpourTrainerParameter)
-
-ProgramConfig = _reflection.GeneratedProtocolMessageType('ProgramConfig', (_message.Message,), dict(
-  DESCRIPTOR = _PROGRAMCONFIG,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.ProgramConfig)
-  ))
-_sym_db.RegisterMessage(ProgramConfig)
-
-DenseTableParameter = _reflection.GeneratedProtocolMessageType('DenseTableParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DENSETABLEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter)
-  ))
-_sym_db.RegisterMessage(DenseTableParameter)
-
-SparseTableParameter = _reflection.GeneratedProtocolMessageType('SparseTableParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSETABLEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter)
-  ))
-_sym_db.RegisterMessage(SparseTableParameter)
-
-DownpourServerParameter = _reflection.GeneratedProtocolMessageType('DownpourServerParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DOWNPOURSERVERPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter)
-  ))
-_sym_db.RegisterMessage(DownpourServerParameter)
-
-ServerServiceParameter = _reflection.GeneratedProtocolMessageType('ServerServiceParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SERVERSERVICEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter)
-  ))
-_sym_db.RegisterMessage(ServerServiceParameter)
-
-TableParameter = _reflection.GeneratedProtocolMessageType('TableParameter', (_message.Message,), dict(
-  DESCRIPTOR = _TABLEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.TableParameter)
-  ))
-_sym_db.RegisterMessage(TableParameter)
-
-TableAccessorParameter = _reflection.GeneratedProtocolMessageType('TableAccessorParameter', (_message.Message,), dict(
-  DESCRIPTOR = _TABLEACCESSORPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter)
-  ))
-_sym_db.RegisterMessage(TableAccessorParameter)
-
-DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType('DownpourTableAccessorParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DOWNPOURTABLEACCESSORPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter)
-  ))
-_sym_db.RegisterMessage(DownpourTableAccessorParameter)
-
-TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType('TableAccessorSaveParameter', (_message.Message,), dict(
-  DESCRIPTOR = _TABLEACCESSORSAVEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter)
-  ))
-_sym_db.RegisterMessage(TableAccessorSaveParameter)
-
-PsRequestMessage = _reflection.GeneratedProtocolMessageType('PsRequestMessage', (_message.Message,), dict(
-  DESCRIPTOR = _PSREQUESTMESSAGE,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage)
-  ))
-_sym_db.RegisterMessage(PsRequestMessage)
-
-SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSESGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(SparseSGDRuleParameter)
-
-SparseCommonSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseCommonSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSECOMMONSGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseCommonSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(SparseCommonSGDRuleParameter)
-
-SparseNaiveSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseNaiveSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSENAIVESGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseNaiveSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(SparseNaiveSGDRuleParameter)
-
-SparseAdagradSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseAdagradSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSEADAGRADSGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseAdagradSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(SparseAdagradSGDRuleParameter)
-
-SparseAdamSGDParameter = _reflection.GeneratedProtocolMessageType('SparseAdamSGDParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SPARSEADAMSGDPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SparseAdamSGDParameter)
-  ))
-_sym_db.RegisterMessage(SparseAdamSGDParameter)
-
-DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('DenseSGDRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _DENSESGDRULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter)
-  ))
-_sym_db.RegisterMessage(DenseSGDRuleParameter)
-
-AdamSGDParameter = _reflection.GeneratedProtocolMessageType('AdamSGDParameter', (_message.Message,), dict(
-  DESCRIPTOR = _ADAMSGDPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter)
-  ))
-_sym_db.RegisterMessage(AdamSGDParameter)
-
-NaiveSGDParameter = _reflection.GeneratedProtocolMessageType('NaiveSGDParameter', (_message.Message,), dict(
-  DESCRIPTOR = _NAIVESGDPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter)
-  ))
-_sym_db.RegisterMessage(NaiveSGDParameter)
-
-SummarySGDParameter = _reflection.GeneratedProtocolMessageType('SummarySGDParameter', (_message.Message,), dict(
-  DESCRIPTOR = _SUMMARYSGDPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter)
-  ))
-_sym_db.RegisterMessage(SummarySGDParameter)
-
-MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType('MovingAverageRuleParameter', (_message.Message,), dict(
-  DESCRIPTOR = _MOVINGAVERAGERULEPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter)
-  ))
-_sym_db.RegisterMessage(MovingAverageRuleParameter)
-
-PsResponseMessage = _reflection.GeneratedProtocolMessageType('PsResponseMessage', (_message.Message,), dict(
-  DESCRIPTOR = _PSRESPONSEMESSAGE,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage)
-  ))
-_sym_db.RegisterMessage(PsResponseMessage)
-
-FsClientParameter = _reflection.GeneratedProtocolMessageType('FsClientParameter', (_message.Message,), dict(
-  DESCRIPTOR = _FSCLIENTPARAMETER,
-  __module__ = 'ps_pb2'
-  # @@protoc_insertion_point(class_scope:paddle.FsClientParameter)
-  ))
-_sym_db.RegisterMessage(FsClientParameter)
-
-
-DESCRIPTOR.has_options = True
-DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\200\001\001\370\001\001'))
-# @@protoc_insertion_point(module_scope)
diff --git a/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh b/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh
deleted file mode 100644
index 1df6b0618de8d7ab4a3ae1fee9490b56e990c5ce..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-# start pserver0
-python fleet_deep_ctr.py \
-    --role pserver \
-    --endpoints 127.0.0.1:7000,127.0.0.1:7001 \
-    --current_endpoint 127.0.0.1:7000 \
-    --trainers 2 \
-    > pserver0.log 2>&1 &
-
-# start pserver1
-python fleet_deep_ctr.py \
-    --role pserver \
-    --endpoints 127.0.0.1:7000,127.0.0.1:7001 \
-    --current_endpoint 127.0.0.1:7001 \
-    --trainers 2 \
-    > pserver1.log 2>&1 &
-
-# start trainer0
-python fleet_deep_ctr.py \
-    --role trainer \
-    --endpoints 127.0.0.1:7000,127.0.0.1:7001 \
-    --trainers 2 \
-    --trainer_id 0 \
-    > trainer0.log 2>&1 &
-
-# start trainer1
-python fleet_deep_ctr.py \
-    --role trainer \
-    --endpoints 127.0.0.1:7000,127.0.0.1:7001 \
-    --trainers 2 \
-    --trainer_id 1 \
-    > trainer1.log 2>&1 &
diff --git a/python/paddle/fluid/incubate/fleet/tests/ctr_dataset_reader.py b/python/paddle/fluid/incubate/fleet/tests/ctr_dataset_reader.py
deleted file mode 100644
index 32ba0e512f54ee142cc02cd6e6a36589daffdd4e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/tests/ctr_dataset_reader.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import logging
-import tarfile
-import os
-
-import paddle
-import paddle.fluid.incubate.data_generator as data_generator
-from paddle.fluid.log_helper import get_logger
-
-logger = get_logger(
-    "paddle", logging.INFO, fmt='%(asctime)s - %(levelname)s - %(message)s')
-
-DATA_URL = "http://paddle-ctr-data.bj.bcebos.com/avazu_ctr_data.tgz"
-DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
-"""
-avazu_ctr_data/train.txt
-avazu_ctr_data/infer.txt
-avazu_ctr_data/test.txt
-avazu_ctr_data/data.meta.txt
-"""
-
-
-def download_file():
-    file_name = "avazu_ctr_data"
-    path = paddle.dataset.common.download(DATA_URL, file_name, DATA_MD5)
-
-    dir_name = os.path.dirname(path)
-    text_file_dir_name = os.path.join(dir_name, file_name)
-
-    if not os.path.exists(text_file_dir_name):
-        tar = tarfile.open(path, "r:gz")
-        tar.extractall(dir_name)
-    return text_file_dir_name
-
-
-def load_dnn_input_record(sent):
-    return list(map(int, sent.split()))
-
-
-def load_lr_input_record(sent):
-    res = []
-    for _ in [x.split(':') for x in sent.split()]:
-        res.append(int(_[0]))
-    return res
-
-
-class DatasetCtrReader(data_generator.MultiSlotDataGenerator):
-    def generate_sample(self, line):
-        def iter():
-            fs = line.strip().split('\t')
-            dnn_input = load_dnn_input_record(fs[0])
-            lr_input = load_lr_input_record(fs[1])
-            click = [int(fs[2])]
-            yield ("dnn_data", dnn_input), \
-                  ("lr_data", lr_input), \
-                  ("click", click)
-
-        return iter
-
-
-def prepare_data():
-    """
-    load data meta info from path, return (dnn_input_dim, lr_input_dim)
-    """
-    file_dir_name = download_file()
-    meta_file_path = os.path.join(file_dir_name, 'data.meta.txt')
-    train_file_path = os.path.join(file_dir_name, 'train.txt')
-    with open(meta_file_path, "r") as f:
-        lines = f.readlines()
-    err_info = "wrong meta format"
-    assert len(lines) == 2, err_info
-    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
-        1], err_info
-    res = map(int, [_.split(':')[1] for _ in lines])
-    res = list(res)
-    dnn_input_dim = res[0]
-    lr_input_dim = res[1]
-    logger.info('dnn input dim: %d' % dnn_input_dim)
-    logger.info('lr input dim: %d' % lr_input_dim)
-    return dnn_input_dim, lr_input_dim, train_file_path
-
-
-if __name__ == "__main__":
-    pairwise_reader = DatasetCtrReader()
-    pairwise_reader.run_from_stdin()
diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
deleted file mode 100644
index f22a13bde55cb2261a3b7ff5fd4342b91f392d65..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import time
-
-import paddle.fluid as fluid
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
-from paddle.fluid.log_helper import get_logger
-
-import ctr_dataset_reader
-
-logger = get_logger(
-    "fluid", logging.INFO, fmt='%(asctime)s - %(levelname)s - %(message)s')
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="PaddlePaddle Fleet ctr")
-
-    # the following arguments is used for distributed train, if is_local == false, then you should set them
-    parser.add_argument(
-        '--role',
-        type=str,
-        default='pserver',  # trainer or pserver
-        help='The path for model to store (default: models)')
-    parser.add_argument(
-        '--endpoints',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The pserver endpoints, like: 127.0.0.1:6000,127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The path for model to store (default: 127.0.0.1:6000)')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='The path for model to store (default: models)')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trainers, (default: 1)')
-
-    return parser.parse_args()
-
-
-def model():
-    dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data(
-    )
-    """ network definition """
-    dnn_data = fluid.layers.data(
-        name="dnn_data",
-        shape=[-1, 1],
-        dtype="int64",
-        lod_level=1,
-        append_batch_size=False)
-    lr_data = fluid.layers.data(
-        name="lr_data",
-        shape=[-1, 1],
-        dtype="int64",
-        lod_level=1,
-        append_batch_size=False)
-    label = fluid.layers.data(
-        name="click",
-        shape=[-1, 1],
-        dtype="int64",
-        lod_level=0,
-        append_batch_size=False)
-
-    datas = [dnn_data, lr_data, label]
-
-    # build dnn model
-    dnn_layer_dims = [128, 64, 32, 1]
-    dnn_embedding = fluid.layers.embedding(
-        is_distributed=False,
-        input=dnn_data,
-        size=[dnn_input_dim, dnn_layer_dims[0]],
-        param_attr=fluid.ParamAttr(
-            name="deep_embedding",
-            initializer=fluid.initializer.Constant(value=0.01)),
-        is_sparse=True)
-    dnn_pool = fluid.layers.sequence_pool(input=dnn_embedding, pool_type="sum")
-    dnn_out = dnn_pool
-    for i, dim in enumerate(dnn_layer_dims[1:]):
-        fc = fluid.layers.fc(
-            input=dnn_out,
-            size=dim,
-            act="relu",
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.01)),
-            name='dnn-fc-%d' % i)
-        dnn_out = fc
-
-    # build lr model
-    lr_embbding = fluid.layers.embedding(
-        is_distributed=False,
-        input=lr_data,
-        size=[lr_input_dim, 1],
-        param_attr=fluid.ParamAttr(
-            name="wide_embedding",
-            initializer=fluid.initializer.Constant(value=0.01)),
-        is_sparse=True)
-    lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
-
-    merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
-
-    predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
-    acc = fluid.layers.accuracy(input=predict, label=label)
-    auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
-                                                          label=label)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    return datas, avg_cost, predict, train_file_path
-
-
-def train(args):
-    datas, avg_cost, predict, train_file_path = model()
-
-    endpoints = args.endpoints.split(",")
-    if args.role.upper() == "PSERVER":
-        current_id = endpoints.index(args.current_endpoint)
-    else:
-        current_id = 0
-    role = role_maker.UserDefinedRoleMaker(
-        current_id=current_id,
-        role=role_maker.Role.WORKER
-        if args.role.upper() == "TRAINER" else role_maker.Role.SERVER,
-        worker_num=args.trainers,
-        server_endpoints=endpoints)
-
-    exe = fluid.Executor(fluid.CPUPlace())
-    fleet.init(role)
-
-    strategy = DistributeTranspilerConfig()
-    strategy.sync_mode = False
-
-    optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
-    optimizer = fleet.distributed_optimizer(optimizer, strategy)
-    optimizer.minimize(avg_cost)
-
-    if fleet.is_server():
-        logger.info("run pserver")
-
-        fleet.init_server()
-        fleet.run_server()
-    elif fleet.is_worker():
-        logger.info("run trainer")
-
-        fleet.init_worker()
-        exe.run(fleet.startup_program)
-
-        thread_num = 2
-        filelist = []
-        for _ in range(thread_num):
-            filelist.append(train_file_path)
-
-        # config dataset
-        dataset = fluid.DatasetFactory().create_dataset()
-        dataset.set_batch_size(128)
-        dataset.set_use_var(datas)
-        pipe_command = 'python ctr_dataset_reader.py'
-        dataset.set_pipe_command(pipe_command)
-
-        dataset.set_filelist(filelist)
-        dataset.set_thread(thread_num)
-
-        for epoch_id in range(10):
-            logger.info("epoch {} start".format(epoch_id))
-            pass_start = time.time()
-            dataset.set_filelist(filelist)
-            exe.train_from_dataset(
-                program=fleet.main_program,
-                dataset=dataset,
-                fetch_list=[avg_cost],
-                fetch_info=["cost"],
-                print_period=100,
-                debug=False)
-            pass_time = time.time() - pass_start
-            logger.info("epoch {} finished, pass_time {}".format(epoch_id,
-                                                                 pass_time))
-        fleet.stop_worker()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    train(args)
diff --git a/python/paddle/fluid/incubate/fleet/utils/__init__.py b/python/paddle/fluid/incubate/fleet/utils/__init__.py
deleted file mode 100644
index d0c32e26092f6ea25771279418582a24ea449ab2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/utils/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_barrier_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_barrier_util.py
deleted file mode 100644
index bce8da641c20c343826f725df9ac0f564accbab0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/utils/fleet_barrier_util.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.contrib.utils import HDFSClient
-import os
-
-
-def check_all_trainers_ready(ready_path, epoch):
-    trainer_num = fleet.worker_num()
-    trainer_id = fleet.worker_index()
-
-    hadoop_home = os.getenv("HADOOP_HOME")
-    configs = {
-        "fs.default.name": os.getenv("FS_NAME"),
-        "hadoop.job.ugi": os.getenv("FS_UGI")
-    }
-
-    node_ready = "ready.{}.{}.done".format(epoch, trainer_id)
-
-    with open(node_ready, "w") as node:
-        node.write("")
-
-    client = HDFSClient(hadoop_home, configs)
-    if not client.is_dir(ready_path):
-        client.makedirs(ready_path)
-    client.upload(
-        hdfs_path=ready_path,
-        local_path=node_ready,
-        overwrite=True,
-        retry_times=0)
-
-    print("PUT {} ON HDFS {} OK".format(node_ready, ready_path))
-
-    while True:
-        ready_num = len(client.ls(ready_path))
-        print("have {} trainers need to be ready".format(trainer_num - ready_num
-                                                         % trainer_num))
-        if ready_num % trainer_num == 0:
-            break
-        time.sleep(10)
-        ready_num = len(client.ls(ready_path))
-
-    print("All trainers are ready, continue training")
diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
deleted file mode 100755
index c1270bd6d6517723e5de13bc977e01ec63f63fa9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
+++ /dev/null
@@ -1,1445 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fleet Utils."""
-
-import collections
-import json
-import logging
-import math
-import numpy as np
-import os
-import sys
-import time
-import paddle.fluid as fluid
-from paddle.fluid.log_helper import get_logger
-from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-from . import hdfs
-from .hdfs import *
-
-__all__ = ["FleetUtil"]
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class FleetUtil(object):
-    """
-    FleetUtil provides some common functions for users' convenience.
-
-    Examples:
-        .. code-block:: python
-
-          from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-          fleet_util = FleetUtil()
-          fleet_util.rank0_print("my log")
-
-    """
-
-    def rank0_print(self, s):
-        """
-        Worker of rank 0 print some log.
-
-        Args:
-            s(str): string to print
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.rank0_print("my log")
-
-        """
-        if fleet.worker_index() != 0:
-            return
-        print(s)
-        sys.stdout.flush()
-
-    def rank0_info(self, s):
-        """
-        Worker of rank 0 print some log info.
-
-        Args:
-            s(str): string to log
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.rank0_info("my log info")
-
-        """
-        if fleet.worker_index() != 0:
-            return
-        _logger.info(s)
-
-    def rank0_error(self, s):
-        """
-        Worker of rank 0 print some log error.
-
-        Args:
-            s(str): string to log
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.rank0_error("my log error")
-
-        """
-        if fleet.worker_index() != 0:
-            return
-        _logger.error(s)
-
-    def set_zero(self,
-                 var_name,
-                 scope=fluid.global_scope(),
-                 place=fluid.CPUPlace(),
-                 param_type="int64"):
-        """
-        Set tensor of a Variable to zero.
-
-        Args:
-            var_name(str): name of Variable
-            scope(Scope): Scope object, default is fluid.global_scope()
-            place(Place): Place object, default is fluid.CPUPlace()
-            param_type(str): param data type, default is int64
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.set_zero(myvar.name, myscope)
-
-        """
-        param = scope.var(var_name).get_tensor()
-        param_array = np.zeros(param._get_dims()).astype(param_type)
-        param.set(param_array, place)
-
-    def print_global_auc(self,
-                         scope=fluid.global_scope(),
-                         stat_pos="_generated_var_2",
-                         stat_neg="_generated_var_3",
-                         print_prefix=""):
-        """
-        Print global auc of all distributed workers.
-
-        Args:
-            scope(Scope): Scope object, default is fluid.global_scope()
-            stat_pos(str): name of auc pos bucket Variable
-            stat_neg(str): name of auc neg bucket Variable
-            print_prefix(str): prefix of print auc
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.print_global_auc(myscope, stat_pos=stat_pos.name,
-                                          stat_neg=stat_neg.name)
-
-              # below is part of model
-              emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
-                  emb, min=-15.0, max=15.0), name="similarity_norm")\
-              binary_predict = fluid.layers.concat(input=[\
-                  fluid.layers.elementwise_sub(\
-                      fluid.layers.ceil(similarity_norm), similarity_norm),\
-                  similarity_norm], axis=1)
-              auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, \
-                  stat_neg] = fluid.layers.auc(input=binary_predict,\
-                                               label=label, curve='ROC',\
-                                               num_thresholds=4096)
-
-        """
-        auc_value = self.get_global_auc(scope, stat_pos, stat_neg)
-        self.rank0_print(print_prefix + " global auc = %s" % auc_value)
-
-    def get_global_auc(self,
-                       scope=fluid.global_scope(),
-                       stat_pos="_generated_var_2",
-                       stat_neg="_generated_var_3"):
-        """
-        Get global auc of all distributed workers.
-
-        Args:
-            scope(Scope): Scope object, default is fluid.global_scope()
-            stat_pos(str): name of auc pos bucket Variable
-            stat_neg(str): name of auc neg bucket Variable
-
-        Returns:
-            auc_value(float), total_ins_num(int)
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              auc_value, _ = fleet_util.get_global_auc(myscope,
-                                                       stat_pos=stat_pos,
-                                                       stat_neg=stat_neg)
-
-        """
-        if scope.find_var(stat_pos) is None or scope.find_var(stat_neg) is None:
-            self.rank0_print("not found auc bucket")
-            return None
-        fleet._role_maker._barrier_worker()
-        # auc pos bucket
-        pos = np.array(scope.find_var(stat_pos).get_tensor())
-        # auc pos bucket shape
-        old_pos_shape = np.array(pos.shape)
-        # reshape to one dim
-        pos = pos.reshape(-1)
-        global_pos = np.copy(pos) * 0
-        # mpi allreduce
-        fleet._role_maker._node_type_comm.Allreduce(pos, global_pos)
-        # reshape to its original shape
-        global_pos = global_pos.reshape(old_pos_shape)
-
-        # auc neg bucket
-        neg = np.array(scope.find_var(stat_neg).get_tensor())
-        old_neg_shape = np.array(neg.shape)
-        neg = neg.reshape(-1)
-        global_neg = np.copy(neg) * 0
-        fleet._role_maker._node_type_comm.Allreduce(neg, global_neg)
-        global_neg = global_neg.reshape(old_neg_shape)
-
-        # calculate auc
-        num_bucket = len(global_pos[0])
-        area = 0.0
-        pos = 0.0
-        neg = 0.0
-        new_pos = 0.0
-        new_neg = 0.0
-        total_ins_num = 0
-        for i in xrange(num_bucket):
-            index = num_bucket - 1 - i
-            new_pos = pos + global_pos[0][index]
-            total_ins_num += global_pos[0][index]
-            new_neg = neg + global_neg[0][index]
-            total_ins_num += global_neg[0][index]
-            area += (new_neg - neg) * (pos + new_pos) / 2
-            pos = new_pos
-            neg = new_neg
-
-        auc_value = None
-        if pos * neg == 0 or total_ins_num == 0:
-            auc_value = 0.5
-        else:
-            auc_value = area / (pos * neg)
-
-        fleet._role_maker._barrier_worker()
-        return auc_value
-
-    def load_fleet_model_one_table(self, table_id, path):
-        """
-        load pslib model to one table
-
-        Args:
-            table_id(int): load model to one table, default is None, which mean
-                           load all table.
-            path(str): model path
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.load_fleet_model("hdfs:/my/model/path", table_id=1)
-        """
-        fleet.load_one_table(table_id, path)
-
-    def load_fleet_model(self, path, mode=0):
-        """
-        load pslib model
-
-        Args:
-            path(str): model path
-            mode(str): 0 or 1, which means load checkpoint or delta model,
-                       default is 0
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-
-              fleet_util.load_fleet_model("hdfs:/my/model/path")
-
-              fleet_util.load_fleet_model("hdfs:/my/model/path", mode=0)
-
-        """
-        fleet.init_server(path, mode=mode)
-
-    def save_fleet_model(self, path, mode=0):
-        """
-        save pslib model
-
-        Args:
-            path(str): model path
-            mode(str): 0 or 1, which means save checkpoint or delta model,
-                       default is 0
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_fleet_model("hdfs:/my/model/path")
-
-        """
-        fleet.save_persistables(None, path, mode=mode)
-
-    def _get_xbox_str(self,
-                      output_path,
-                      day,
-                      model_path,
-                      xbox_base_key,
-                      data_path,
-                      hadoop_fs_name,
-                      monitor_data={},
-                      mode="patch",
-                      dir_name="000",
-                      base_only=False):
-        xbox_dict = collections.OrderedDict()
-        if mode == "base":
-            xbox_dict["id"] = str(xbox_base_key)
-        elif mode == "patch":
-            xbox_dict["id"] = str(int(time.time()))
-        else:
-            print("warning: unknown mode %s, set it to patch" % mode)
-            mode = "patch"
-            xbox_dict["id"] = str(int(time.time()))
-        if base_only:
-            xbox_dict["key"] = str(int(time.time()))
-        else:
-            xbox_dict["key"] = str(xbox_base_key)
-        if model_path.startswith("hdfs:") or model_path.startswith("afs:"):
-            model_path = model_path[model_path.find(":") + 1:]
-        xbox_dict["input"] = hadoop_fs_name + model_path.rstrip("/") + "/%s" % dir_name
-        xbox_dict["record_count"] = "111111"
-        xbox_dict["partition_type"] = "2"
-        xbox_dict["job_name"] = "default_job_name"
-        xbox_dict["ins_tag"] = "feasign"
-        xbox_dict["ins_path"] = data_path
-        job_id_with_host = os.popen("echo -n ${JOB_ID}").read().strip()
-        instance_id = os.popen("echo -n ${INSTANCE_ID}").read().strip()
-        start_pos = instance_id.find(job_id_with_host)
-        end_pos = instance_id.find("--")
-        if start_pos != -1 and end_pos != -1:
-            job_id_with_host = instance_id[start_pos:end_pos]
-        xbox_dict["job_id"] = job_id_with_host
-        # currently hard code here, set monitor_data empty string
-        xbox_dict["monitor_data"] = ""
-        xbox_dict["monitor_path"] = output_path.rstrip("/") + "/monitor/" \
-                                    + day + ".txt"
-        xbox_dict["mpi_size"] = str(fleet.worker_num())
-        return json.dumps(xbox_dict)
-
-    def write_model_donefile(self,
-                             output_path,
-                             day,
-                             pass_id,
-                             xbox_base_key,
-                             hadoop_fs_name,
-                             hadoop_fs_ugi,
-                             hadoop_home="$HADOOP_HOME",
-                             donefile_name="donefile.txt"):
-        """
-        write donefile when save model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-            xbox_base_key(str|int): xbox base key
-            hadoop_fs_name(str): hdfs/afs fs name
-            hadoop_fs_ugi(str): hdfs/afs fs ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-            donefile_name(str): donefile name, default is "donefile.txt"
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.write_model_donefile(output_path="hdfs:/my/output",
-                                              model_path="hdfs:/my/model",
-                                              day=20190723,
-                                              pass_id=66,
-                                              xbox_base_key=int(time.time()),
-                                              hadoop_fs_name="hdfs://xxx",
-                                              hadoop_fs_ugi="user,passwd")
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        xbox_base_key = int(xbox_base_key)
-
-        if pass_id != "-1":
-            suffix_name = "/%s/%s/" % (day, pass_id)
-            model_path = output_path.rstrip("/") + suffix_name
-        else:
-            suffix_name = "/%s/0/" % day
-            model_path = output_path.rstrip("/") + suffix_name
-
-        if fleet.worker_index() == 0:
-            donefile_path = output_path + "/" + donefile_name
-            content  = "%s\t%lu\t%s\t%s\t%d" % (day, xbox_base_key,\
-                                                model_path, pass_id, 0)
-            configs = {
-                "fs.default.name": hadoop_fs_name,
-                "hadoop.job.ugi": hadoop_fs_ugi
-            }
-            client = HDFSClient(hadoop_home, configs)
-            if client.is_file(donefile_path):
-                pre_content = client.cat(donefile_path)
-                pre_content_list = pre_content.split("\n")
-                day_list = [i.split("\t")[0] for i in pre_content_list]
-                pass_list = [i.split("\t")[3] for i in pre_content_list]
-                exist = False
-                for i in range(len(day_list)):
-                    if int(day) == int(day_list[i]) and \
-                            int(pass_id) == int(pass_list[i]):
-                        exist = True
-                        break
-                if not exist:
-                    with open(donefile_name, "w") as f:
-                        f.write(pre_content + "\n")
-                        f.write(content + "\n")
-                    client.delete(donefile_path)
-                    client.upload(
-                        output_path,
-                        donefile_name,
-                        multi_processes=1,
-                        overwrite=False)
-                    self.rank0_error("write %s/%s %s succeed" % \
-                                      (day, pass_id, donefile_name))
-                else:
-                    self.rank0_error("not write %s because %s/%s already "
-                                     "exists" % (donefile_name, day, pass_id))
-            else:
-                with open(donefile_name, "w") as f:
-                    f.write(content + "\n")
-                client.upload(
-                    output_path,
-                    donefile_name,
-                    multi_processes=1,
-                    overwrite=False)
-                self.rank0_error("write %s/%s %s succeed" % \
-                               (day, pass_id, donefile_name))
-        fleet._role_maker._barrier_worker()
-
-    def write_xbox_donefile(self,
-                            output_path,
-                            day,
-                            pass_id,
-                            xbox_base_key,
-                            data_path,
-                            hadoop_fs_name,
-                            hadoop_fs_ugi,
-                            monitor_data={},
-                            hadoop_home="$HADOOP_HOME",
-                            donefile_name=None,
-                            dir_name=None,
-                            base_only=False):
-        """
-        write delta donefile or xbox base donefile
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day of model
-            pass_id(str|int): training pass id of model
-            xbox_base_key(str|int): xbox base key
-            data_path(str|list): training data path
-            hadoop_fs_name(str): hdfs/afs fs name
-            hadoop_fs_ugi(str): hdfs/afs fs ugi
-            monitor_data(dict): metrics
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-            donefile_name(str): donefile name, default is None"
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.write_xbox_donefile(
-                  output_path="hdfs:/my/output/",
-                  model_path="hdfs:/my/output/20190722/01",
-                  day=20190722,
-                  pass_id=1,
-                  xbox_base_key=int(time.time()),
-                  data_path="hdfs:/my/data/",
-                  hadoop_fs_name="hdfs://xxx",
-                  hadoop_fs_ugi="user,passwd",
-                  monitor_data={}
-                  )
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        xbox_base_key = int(xbox_base_key)
-        mode = None
-
-        if pass_id != "-1":
-            mode = "patch"
-            suffix_name = "/%s/delta-%s/" % (day, pass_id)
-            model_path = output_path.rstrip("/") + suffix_name
-            if donefile_name is None:
-                donefile_name = "xbox_patch_done.txt"
-        else:
-            mode = "base"
-            suffix_name = "/%s/base/" % day
-            model_path = output_path.rstrip("/") + suffix_name
-            if donefile_name is None:
-                donefile_name = "xbox_base_done.txt"
-
-        if dir_name is None:
-            dir_name = "000"
-
-        if isinstance(data_path, list):
-            data_path = ",".join(data_path)
-
-        if fleet.worker_index() == 0:
-            donefile_path = output_path + "/" + donefile_name
-            xbox_str = self._get_xbox_str(output_path, day, model_path, \
-                    xbox_base_key, data_path, hadoop_fs_name, {}, \
-                    mode, dir_name, base_only)
-            configs = {
-                "fs.default.name": hadoop_fs_name,
-                "hadoop.job.ugi": hadoop_fs_ugi
-            }
-            client = HDFSClient(hadoop_home, configs)
-            if client.is_file(donefile_path):
-                pre_content = client.cat(donefile_path)
-                last_dict = json.loads(pre_content.split("\n")[-1])
-                last_day = last_dict["input"].split("/")[-3]
-                last_pass = last_dict["input"].split("/")[-2].split("-")[-1]
-                if last_pass == "base":
-                    last_pass = "-1"
-                exist = False
-                if int(day) < int(last_day) or \
-                        int(day) == int(last_day) and \
-                        int(pass_id) <= int(last_pass):
-                    exist = True
-                if not exist:
-                    with open(donefile_name, "w") as f:
-                        f.write(pre_content + "\n")
-                        f.write(xbox_str + "\n")
-                    client.delete(donefile_path)
-                    client.upload(
-                        output_path,
-                        donefile_name,
-                        multi_processes=1,
-                        overwrite=False)
-                    self.rank0_error("write %s/%s %s succeed" % \
-                                      (day, pass_id, donefile_name))
-                else:
-                    self.rank0_error("not write %s because %s/%s already "
-                                     "exists" % (donefile_name, day, pass_id))
-            else:
-                with open(donefile_name, "w") as f:
-                    f.write(xbox_str + "\n")
-                client.upload(
-                    output_path,
-                    donefile_name,
-                    multi_processes=1,
-                    overwrite=False)
-                self.rank0_error("write %s/%s %s succeed" % \
-                               (day, pass_id, donefile_name))
-        fleet._role_maker._barrier_worker()
-
-    def write_cache_donefile(self,
-                             output_path,
-                             day,
-                             pass_id,
-                             key_num,
-                             hadoop_fs_name,
-                             hadoop_fs_ugi,
-                             hadoop_home="$HADOOP_HOME",
-                             donefile_name="sparse_cache.meta"):
-        """
-        write cache donefile
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day of model
-            pass_id(str|int): training pass id of model
-            key_num(str|int): save cache return value
-            hadoop_fs_name(str): hdfs/afs fs name
-            hadoop_fs_ugi(str): hdfs/afs fs ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-            donefile_name(str): donefile name, default is "sparse_cache.meta"
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.write_cache_donefile(
-                  output_path="hdfs:/my/output/",
-                  day=20190722,
-                  pass_id=1,
-                  key_num=123456,
-                  hadoop_fs_name="hdfs://xxx",
-                  hadoop_fs_ugi="user,passwd",
-                  )
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        key_num = int(key_num)
-
-        if pass_id != "-1":
-            suffix_name = "/%s/delta-%s/000_cache" % (day, pass_id)
-            model_path = output_path.rstrip("/") + suffix_name
-        else:
-            suffix_name = "/%s/base/000_cache" % day
-            model_path = output_path.rstrip("/") + suffix_name
-
-        if fleet.worker_index() == 0:
-            donefile_path = model_path + "/" + donefile_name
-            configs = {
-                "fs.default.name": hadoop_fs_name,
-                "hadoop.job.ugi": hadoop_fs_ugi
-            }
-            client = HDFSClient(hadoop_home, configs)
-            if client.is_file(donefile_path):
-                self.rank0_error( \
-                    "not write because %s already exists" % donefile_path)
-            else:
-                meta_str = \
-                    "file_prefix:part\npart_num:16\nkey_num:%d\n" % key_num
-                with open(donefile_name, "w") as f:
-                    f.write(meta_str)
-                client.upload(
-                    model_path,
-                    donefile_name,
-                    multi_processes=1,
-                    overwrite=False)
-                self.rank0_error("write %s succeed" % donefile_path)
-        fleet._role_maker._barrier_worker()
-
-    def load_model(self, output_path, day, pass_id):
-        """
-        load pslib model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.load_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        suffix_name = "/%s/%s/" % (day, pass_id)
-        load_path = output_path + suffix_name
-        self.rank0_error("going to load_model %s" % load_path)
-        self.load_fleet_model(load_path)
-        self.rank0_error("load_model done")
-
-    def save_model(self, output_path, day, pass_id):
-        """
-        save pslib model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        suffix_name = "/%s/%s/" % (day, pass_id)
-        model_path = output_path + suffix_name
-        self.rank0_print("going to save_model %s" % model_path)
-        self.save_fleet_model(model_path)
-        self.rank0_print("save_model done")
-
-    def save_batch_model(self, output_path, day):
-        """
-        save batch model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_batch_model("hdfs:/my/path", 20190722)
-
-        """
-        day = str(day)
-        suffix_name = "/%s/0/" % day
-        model_path = output_path + suffix_name
-        self.rank0_print("going to save_model %s" % model_path)
-        fleet.save_persistables(None, model_path, mode=3)
-        self.rank0_print("save_batch_model done")
-
-    def save_delta_model(self, output_path, day, pass_id):
-        """
-        save delta model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_batch_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        suffix_name = "/%s/delta-%s/" % (day, pass_id)
-        model_path = output_path + suffix_name
-        self.rank0_print("going to save_delta_model %s" % model_path)
-        fleet.save_persistables(None, model_path, mode=1)
-        self.rank0_print("save_delta_model done")
-
-    def save_xbox_base_model(self, output_path, day):
-        """
-        save xbox base model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_xbox_base_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        suffix_name = "/%s/base/" % day
-        model_path = output_path + suffix_name
-        self.rank0_print("going to save_xbox_base_model " + model_path)
-        fleet.save_persistables(None, model_path, mode=2)
-        self.rank0_print("save_xbox_base_model done")
-
-    def save_cache_model(self, output_path, day, pass_id, mode=1):
-        """
-        save cache model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-            mode(str|int): save mode
-
-        Returns:
-            key_num(int): cache key num
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_cache_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        mode = int(mode)
-        suffix_name = "/%s/delta-%s" % (day, pass_id)
-        model_path = output_path.rstrip("/") + suffix_name
-        self.rank0_print("going to save_cache_model %s" % model_path)
-        key_num = fleet.save_cache_model(None, model_path, mode=mode)
-        self.rank0_print("save_cache_model done")
-        return key_num
-
-    def save_cache_base_model(self, output_path, day):
-        """
-        save cache model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-
-        Returns:
-            key_num(int): cache key num
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_cache_base_model("hdfs:/my/path", 20190722)
-
-        """
-        day = str(day)
-        suffix_name = "/%s/base" % day
-        model_path = output_path.rstrip("/") + suffix_name
-        self.rank0_print("going to save_cache_base_model %s" % model_path)
-        key_num = fleet.save_cache_model(None, model_path, mode=2)
-        self.rank0_print("save_cache_base_model done")
-        return key_num
-
-    def pull_all_dense_params(self, scope, program):
-        """
-        pull all dense params in trainer of rank 0
-
-        Args:
-            scope(Scope): fluid Scope
-            program(Program): fluid Program
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.pull_all_dense_params(my_scope, my_program)
-
-        """
-        fleet._role_maker._barrier_worker()
-        if fleet._role_maker.is_first_worker():
-            tables = fleet._dist_desc.trainer_param[0].dense_table
-            prog_id = str(id(program))
-            prog_conf = fleet._opt_info['program_configs'][prog_id]
-            prog_tables = {}
-            for key in prog_conf:
-                if "dense" not in key:
-                    continue
-                for table_id in prog_conf[key]:
-                    prog_tables[int(table_id)] = 0
-            for table in tables:
-                if int(table.table_id) not in prog_tables:
-                    continue
-                var_name_list = []
-                for i in range(0, len(table.dense_variable_name)):
-                    var_name = table.dense_variable_name[i]
-                    if scope.find_var(var_name) is None:
-                        raise ValueError("var " + var_name +
-                                         " not found in scope " +
-                                         "when pull dense")
-                    var_name_list.append(var_name)
-                fleet._fleet_ptr.pull_dense(scope,
-                                            int(table.table_id), var_name_list)
-        fleet._role_maker._barrier_worker()
-
-    def save_paddle_params(self,
-                           executor,
-                           scope,
-                           program,
-                           model_name,
-                           output_path,
-                           day,
-                           pass_id,
-                           hadoop_fs_name,
-                           hadoop_fs_ugi,
-                           hadoop_home="$HADOOP_HOME",
-                           var_names=None,
-                           save_combine=True):
-        """
-        save paddle model, and upload to hdfs dnn_plugin path
-
-        Args:
-            executor(Executor): fluid Executor
-            scope(Scope): fluid Scope
-            program(Program): fluid Program
-            model_name(str): save model local dir or filename
-            output_path(str): hdfs/afs output path
-            day(str|int): training day
-            pass_id(str|int): training pass
-            hadoop_fs_name(str): hadoop fs name
-            hadoop_fs_ugi(str): hadoop fs ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-            var_names(list): save persistable var names, default is None
-            save_combine(bool): whether to save in a file or seperate files,
-                                default is True
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_paddle_params(exe,
-                                            join_scope,
-                                            join_program,
-                                            "paddle_dense.model.0",
-                                            "hdfs:/my/output/path/",
-                                            day=20190727,
-                                            pass_id=6,
-                                            hadoop_fs_name="xxx",
-                                            hadoop_fs_ugi="xxx,xxx",
-                                            var_names=join_all_var_names)
-              fleet_util.save_paddle_params(exe,
-                                            join_scope,
-                                            join_program,
-                                            "paddle_dense.model.usr.0",
-                                            "hdfs:/my/output/path/",
-                                            day=20190727,
-                                            pass_id=6,
-                                            hadoop_fs_name="xxx",
-                                            hadoop_fs_ugi="xxx,xxx",
-                                            var_names=join_user_var_names)
-              fleet_util.save_paddle_params(exe,
-                                            join_scope,
-                                            join_program,
-                                            "paddle_dense.model.item.0",
-                                            "hdfs:/my/output/path/",
-                                            day=20190727,
-                                            pass_id=6,
-                                            hadoop_fs_name="xxx",
-                                            hadoop_fs_ugi="xxx,xxx",
-                                            var_names=join_user_item_names)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        # pull dense before save
-        self.pull_all_dense_params(scope, program)
-        if fleet.worker_index() == 0:
-            vars = [program.global_block().var(i) for i in var_names]
-            with fluid.scope_guard(scope):
-                if save_combine:
-                    fluid.io.save_vars(
-                        executor, "./", program, vars=vars, filename=model_name)
-                else:
-                    fluid.io.save_vars(executor, model_name, program, vars=vars)
-
-            configs = {
-                "fs.default.name": hadoop_fs_name,
-                "hadoop.job.ugi": hadoop_fs_ugi
-            }
-            client = HDFSClient(hadoop_home, configs)
-
-            if pass_id == "-1":
-                dest = "%s/%s/base/dnn_plugin/" % (output_path, day)
-            else:
-                dest = "%s/%s/delta-%s/dnn_plugin/" % (output_path, day,
-                                                       pass_id)
-            if not client.is_exist(dest):
-                client.makedirs(dest)
-
-            if os.path.isdir(model_name):
-                client.upload_dir(dest, model_name)
-            else:
-                client.upload(dest, model_name)
-
-        fleet._role_maker._barrier_worker()
-
-    def get_last_save_xbox_base(self,
-                                output_path,
-                                hadoop_fs_name,
-                                hadoop_fs_ugi,
-                                hadoop_home="$HADOOP_HOME"):
-        """
-        get last saved base xbox info from xbox_base_done.txt
-
-        Args:
-            output_path(str): output path
-            hadoop_fs_name(str): hdfs/afs fs_name
-            hadoop_fs_ugi(str): hdfs/afs fs_ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-
-        Returns:
-            [last_save_day, last_path, xbox_base_key]
-            last_save_day(int): day of saved model
-            last_path(str): model path
-            xbox_base_key(int): xbox key
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              last_save_day, last_path, xbox_base_key = \
-                  fleet_util.get_last_save_xbox_base("hdfs:/my/path", 20190722,
-                                                     88)
-
-        """
-        donefile_path = output_path + "/xbox_base_done.txt"
-        configs = {
-            "fs.default.name": hadoop_fs_name,
-            "hadoop.job.ugi": hadoop_fs_ugi
-        }
-        client = HDFSClient(hadoop_home, configs)
-        if not client.is_file(donefile_path):
-            return [-1, -1, int(time.time())]
-        pre_content = client.cat(donefile_path)
-        last_dict = json.loads(pre_content.split("\n")[-1])
-        last_day = int(last_dict["input"].split("/")[-3])
-        last_path = "/".join(last_dict["input"].split("/")[:-1])
-        xbox_base_key = int(last_dict["key"])
-        return [last_day, last_path, xbox_base_key]
-
-    def get_last_save_xbox(self,
-                           output_path,
-                           hadoop_fs_name,
-                           hadoop_fs_ugi,
-                           hadoop_home="$HADOOP_HOME"):
-        """
-        get last saved xbox info from xbox_patch_done.txt
-
-        Args:
-            output_path(str): output path
-            hadoop_fs_name(str): hdfs/afs fs_name
-            hadoop_fs_ugi(str): hdfs/afs fs_ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-
-        Returns:
-            [last_save_day, last_save_pass, last_path, xbox_base_key]
-            last_save_day(int): day of saved model
-            last_save_pass(int): pass id of saved
-            last_path(str): model path
-            xbox_base_key(int): xbox key
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              last_save_day, last_save_pass, last_path, xbox_base_key = \
-                  fleet_util.get_last_save_xbox("hdfs:/my/path", 20190722, 88)
-
-        """
-        donefile_path = output_path + "/xbox_patch_done.txt"
-        configs = {
-            "fs.default.name": hadoop_fs_name,
-            "hadoop.job.ugi": hadoop_fs_ugi
-        }
-        client = HDFSClient(hadoop_home, configs)
-        if not client.is_file(donefile_path):
-            return [-1, -1, "", int(time.time())]
-        pre_content = client.cat(donefile_path)
-        last_dict = json.loads(pre_content.split("\n")[-1])
-        last_day = int(last_dict["input"].split("/")[-3])
-        last_pass = int(last_dict["input"].split("/")[-2].split("-")[-1])
-        last_path = "/".join(last_dict["input"].split("/")[:-1])
-        xbox_base_key = int(last_dict["key"])
-        return [last_day, last_pass, last_path, xbox_base_key]
-
-    def get_last_save_model(self,
-                            output_path,
-                            hadoop_fs_name,
-                            hadoop_fs_ugi,
-                            hadoop_home="$HADOOP_HOME"):
-        """
-        get last saved model info from donefile.txt
-
-        Args:
-            output_path(str): output path
-            hadoop_fs_name(str): hdfs/afs fs_name
-            hadoop_fs_ugi(str): hdfs/afs fs_ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-
-        Returns:
-            [last_save_day, last_save_pass, last_path, xbox_base_key]
-            last_save_day(int): day of saved model
-            last_save_pass(int): pass id of saved
-            last_path(str): model path
-            xbox_base_key(int): xbox key
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              last_save_day, last_save_pass, last_path, xbox_base_key = \
-                  fleet_util.get_last_save_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        last_save_day = -1
-        last_save_pass = -1
-        last_path = ""
-        donefile_path = output_path + "/donefile.txt"
-        configs = {
-            "fs.default.name": hadoop_fs_name,
-            "hadoop.job.ugi": hadoop_fs_ugi
-        }
-        client = HDFSClient(hadoop_home, configs)
-        if not client.is_file(donefile_path):
-            return [-1, -1, "", int(time.time())]
-        content = client.cat(donefile_path)
-        content = content.split("\n")[-1].split("\t")
-        last_save_day = int(content[0])
-        last_save_pass = int(content[3])
-        last_path = content[2]
-        xbox_base_key = int(content[1])
-        return [last_save_day, last_save_pass, last_path, xbox_base_key]
-
-    def get_online_pass_interval(self, days, hours, split_interval,
-                                 split_per_pass, is_data_hourly_placed):
-        """
-        get online pass interval
-
-        Args:
-            days(str): days to train
-            hours(str): hours to train
-            split_interval(int|str): split interval
-            split_per_pass(int}str): split per pass
-            is_data_hourly_placed(bool): is data hourly placed
-
-        Returns:
-            online_pass_interval(list)
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              online_pass_interval = fleet_util.get_online_pass_interval(
-                  days="{20190720..20190729}",
-                  hours="{0..23}",
-                  split_interval=5,
-                  split_per_pass=2,
-                  is_data_hourly_placed=False)
-
-        """
-        days = os.popen("echo -n " + days).read().split(" ")
-        hours = os.popen("echo -n " + hours).read().split(" ")
-        split_interval = int(split_interval)
-        split_per_pass = int(split_per_pass)
-        splits_per_day = 24 * 60 / split_interval
-        pass_per_day = splits_per_day / split_per_pass
-        left_train_hour = int(hours[0])
-        right_train_hour = int(hours[-1])
-
-        start = 0
-        split_path = []
-        for i in range(splits_per_day):
-            h = start / 60
-            m = start % 60
-            if h < left_train_hour or h > right_train_hour:
-                start += split_interval
-                continue
-            if is_data_hourly_placed:
-                split_path.append("%02d" % h)
-            else:
-                split_path.append("%02d%02d" % (h, m))
-            start += split_interval
-
-        start = 0
-        online_pass_interval = []
-        for i in range(pass_per_day):
-            online_pass_interval.append([])
-            for j in range(start, start + split_per_pass):
-                online_pass_interval[i].append(split_path[j])
-            start += split_per_pass
-
-        return online_pass_interval
-
-    def get_global_metrics(self,
-                           scope=fluid.global_scope(),
-                           stat_pos_name="_generated_var_2",
-                           stat_neg_name="_generated_var_3",
-                           sqrerr_name="sqrerr",
-                           abserr_name="abserr",
-                           prob_name="prob",
-                           q_name="q",
-                           pos_ins_num_name="pos",
-                           total_ins_num_name="total"):
-        """
-        get global metrics, including auc, bucket_error, mae, rmse,
-        actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num.
-
-        Args:
-            scope(Scope): Scope object, default is fluid.global_scope()
-            stat_pos_name(str): name of auc pos bucket Variable
-            stat_neg_name(str): name of auc neg bucket Variable
-            sqrerr_name(str): name of sqrerr Variable
-            abserr_name(str): name of abserr Variable
-            prob_name(str): name of prob Variable
-            q_name(str): name of q Variable
-            pos_ins_num_name(str): name of pos ins num Variable
-            total_ins_num_name(str): name of total ins num Variable
-
-        Returns:
-            [auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc,
-             mean_predict_qvalue, total_ins_num]
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              metric_list = fleet_util.get_global_metrics(myscope,
-                                                          stat_pos.nane,
-                                                          stat_neg.name,
-                                                          local_sqrerr.name,
-                                                          local_abserr.name,
-                                                          local_prob.name,
-                                                          local_q.name,
-                                                          local_pos_ins.name,
-                                                          local_total_ins.name)
-
-              # below is part of example model
-              label = fluid.layers.data(name="click", shape=[-1, 1],\
-                  dtype="int64", lod_level=0, append_batch_size=False)
-              emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
-                  emb, min=-15.0, max=15.0), name="similarity_norm")\
-              binary_predict = fluid.layers.concat(input=[\
-                  fluid.layers.elementwise_sub(\
-                      fluid.layers.ceil(similarity_norm), similarity_norm),\
-                  similarity_norm], axis=1)
-              auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, \
-                  stat_neg] = fluid.layers.auc(input=binary_predict,\
-                                               label=label, curve='ROC',\
-                                               num_thresholds=4096)
-              local_sqrerr, local_abserr, local_prob, local_q, local_pos_ins,\
-                  local_total_ins = fluid.contrib.layers.ctr_metric_bundle(\
-                      similarity_norm, label)
-
-        """
-        if scope.find_var(stat_pos_name) is None or \
-                scope.find_var(stat_neg_name) is None:
-            self.rank0_print("not found auc bucket")
-            return [None] * 9
-        elif scope.find_var(sqrerr_name) is None:
-            self.rank0_print("not found sqrerr_name=%s" % sqrerr_name)
-            return [None] * 9
-        elif scope.find_var(abserr_name) is None:
-            self.rank0_print("not found abserr_name=%s" % abserr_name)
-            return [None] * 9
-        elif scope.find_var(prob_name) is None:
-            self.rank0_print("not found prob_name=%s" % prob_name)
-            return [None] * 9
-        elif scope.find_var(q_name) is None:
-            self.rank0_print("not found q_name=%s" % q_name)
-            return [None] * 9
-        elif scope.find_var(pos_ins_num_name) is None:
-            self.rank0_print("not found pos_ins_num_name=%s" % pos_ins_num_name)
-            return [None] * 9
-        elif scope.find_var(total_ins_num_name) is None:
-            self.rank0_print("not found total_ins_num_name=%s" % \
-                             total_ins_num_name)
-            return [None] * 9
-
-        # barrier worker to ensure all workers finished training
-        fleet._role_maker._barrier_worker()
-
-        # get auc
-        auc = self.get_global_auc(scope, stat_pos_name, stat_neg_name)
-        pos = np.array(scope.find_var(stat_pos_name).get_tensor())
-        # auc pos bucket shape
-        old_pos_shape = np.array(pos.shape)
-        # reshape to one dim
-        pos = pos.reshape(-1)
-        global_pos = np.copy(pos) * 0
-        # mpi allreduce
-        fleet._role_maker._node_type_comm.Allreduce(pos, global_pos)
-        # reshape to its original shape
-        global_pos = global_pos.reshape(old_pos_shape)
-        # auc neg bucket
-        neg = np.array(scope.find_var(stat_neg_name).get_tensor())
-        old_neg_shape = np.array(neg.shape)
-        neg = neg.reshape(-1)
-        global_neg = np.copy(neg) * 0
-        fleet._role_maker._node_type_comm.Allreduce(neg, global_neg)
-        global_neg = global_neg.reshape(old_neg_shape)
-
-        num_bucket = len(global_pos[0])
-
-        def get_metric(name):
-            metric = np.array(scope.find_var(name).get_tensor())
-            old_metric_shape = np.array(metric.shape)
-            metric = metric.reshape(-1)
-            global_metric = np.copy(metric) * 0
-            fleet._role_maker._node_type_comm.Allreduce(metric, global_metric)
-            global_metric = global_metric.reshape(old_metric_shape)
-            return global_metric[0]
-
-        global_sqrerr = get_metric(sqrerr_name)
-        global_abserr = get_metric(abserr_name)
-        global_prob = get_metric(prob_name)
-        global_q_value = get_metric(q_name)
-        # note: get ins_num from auc bucket is not actual value,
-        # so get it from metric op
-        pos_ins_num = get_metric(pos_ins_num_name)
-        total_ins_num = get_metric(total_ins_num_name)
-        neg_ins_num = total_ins_num - pos_ins_num
-
-        mae = global_abserr / total_ins_num
-        rmse = math.sqrt(global_sqrerr / total_ins_num)
-        return_actual_ctr = pos_ins_num / total_ins_num
-        predicted_ctr = global_prob / total_ins_num
-        mean_predict_qvalue = global_q_value / total_ins_num
-        copc = 0.0
-        if abs(predicted_ctr > 1e-6):
-            copc = return_actual_ctr / predicted_ctr
-
-        # calculate bucket error
-        last_ctr = -1.0
-        impression_sum = 0.0
-        ctr_sum = 0.0
-        click_sum = 0.0
-        error_sum = 0.0
-        error_count = 0.0
-        click = 0.0
-        show = 0.0
-        ctr = 0.0
-        adjust_ctr = 0.0
-        relative_error = 0.0
-        actual_ctr = 0.0
-        relative_ctr_error = 0.0
-        k_max_span = 0.01
-        k_relative_error_bound = 0.05
-        for i in xrange(num_bucket):
-            click = global_pos[0][i]
-            show = global_pos[0][i] + global_neg[0][i]
-            ctr = float(i) / num_bucket
-            if abs(ctr - last_ctr) > k_max_span:
-                last_ctr = ctr
-                impression_sum = 0.0
-                ctr_sum = 0.0
-                click_sum = 0.0
-            impression_sum += show
-            ctr_sum += ctr * show
-            click_sum += click
-            if impression_sum == 0:
-                continue
-            adjust_ctr = ctr_sum / impression_sum
-            if adjust_ctr == 0:
-                continue
-            relative_error = \
-                           math.sqrt((1 - adjust_ctr) / (adjust_ctr * impression_sum))
-            if relative_error < k_relative_error_bound:
-                actual_ctr = click_sum / impression_sum
-                relative_ctr_error = abs(actual_ctr / adjust_ctr - 1)
-                error_sum += relative_ctr_error * impression_sum
-                error_count += impression_sum
-                last_ctr = -1
-
-        bucket_error = error_sum / error_count if error_count > 0 else 0.0
-
-        return [
-            auc, bucket_error, mae, rmse, return_actual_ctr, predicted_ctr,
-            copc, mean_predict_qvalue, int(total_ins_num)
-        ]
-
-    def print_global_metrics(self,
-                             scope=fluid.global_scope(),
-                             stat_pos_name="_generated_var_2",
-                             stat_neg_name="_generated_var_3",
-                             sqrerr_name="sqrerr",
-                             abserr_name="abserr",
-                             prob_name="prob",
-                             q_name="q",
-                             pos_ins_num_name="pos",
-                             total_ins_num_name="total",
-                             print_prefix=""):
-        """
-        print global metrics, including auc, bucket_error, mae, rmse,
-        actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num.
-
-        Args:
-            scope(Scope): Scope object, default is fluid.global_scope()
-            stat_pos_name(str): name of auc pos bucket Variable
-            stat_neg_name(str): name of auc neg bucket Variable
-            sqrerr_name(str): name of sqrerr Variable
-            abserr_name(str): name of abserr Variable
-            prob_name(str): name of prob Variable
-            q_name(str): name of q Variable
-            pos_ins_num_name(str): name of pos ins num Variable
-            total_ins_num_name(str): name of total ins num Variable
-            print_prefix(str): print prefix
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.print_global_metrics(myscope,
-                                              stat_pos.nane,
-                                              stat_neg.name,
-                                              local_sqrerr.name,
-                                              local_abserr.name,
-                                              local_prob.name,
-                                              local_q.name,
-                                              local_pos_ins.name,
-                                              local_total_ins.name)
-
-              # below is part of model
-              label = fluid.layers.data(name="click", shape=[-1, 1],\
-                  dtype="int64", lod_level=0, append_batch_size=False)
-              emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
-                  emb, min=-15.0, max=15.0), name="similarity_norm")\
-              binary_predict = fluid.layers.concat(input=[\
-                  fluid.layers.elementwise_sub(\
-                      fluid.layers.ceil(similarity_norm), similarity_norm),\
-                  similarity_norm], axis=1)
-              auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, \
-                  stat_neg] = fluid.layers.auc(input=binary_predict,\
-                                               label=label, curve='ROC',\
-                                               num_thresholds=4096)
-              local_sqrerr, local_abserr, local_prob, local_q, local_pos_ins, \
-                  local_total_ins = fluid.contrib.layers.ctr_metric_bundle(\
-                      similarity_norm, label)
-
-        """
-        if scope.find_var(stat_pos_name) is None or \
-                scope.find_var(stat_neg_name) is None:
-            self.rank0_print("not found auc bucket")
-            return
-        elif scope.find_var(sqrerr_name) is None:
-            self.rank0_print("not found sqrerr_name=%s" % sqrerr_name)
-            return
-        elif scope.find_var(abserr_name) is None:
-            self.rank0_print("not found abserr_name=%s" % abserr_name)
-            return
-        elif scope.find_var(prob_name) is None:
-            self.rank0_print("not found prob_name=%s" % prob_name)
-            return
-        elif scope.find_var(q_name) is None:
-            self.rank0_print("not found q_name=%s" % q_name)
-            return
-        elif scope.find_var(pos_ins_num_name) is None:
-            self.rank0_print("not found pos_ins_num_name=%s" % pos_ins_num_name)
-            return
-        elif scope.find_var(total_ins_num_name) is None:
-            self.rank0_print("not found total_ins_num_name=%s" % \
-                             total_ins_num_name)
-            return
-
-        auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc,\
-            mean_predict_qvalue, total_ins_num = self.get_global_metrics(\
-            scope, stat_pos_name, stat_neg_name, sqrerr_name, abserr_name,\
-            prob_name, q_name, pos_ins_num_name, total_ins_num_name)
-        self.rank0_print("%s global AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f "
-                         "RMSE=%.6f Actural_CTR=%.6f Predicted_CTR=%.6f "
-                         "COPC=%.6f MEAN Q_VALUE=%.6f Ins number=%s" %
-                         (print_prefix, auc, bucket_error, mae, rmse,
-                          actual_ctr, predicted_ctr, copc, mean_predict_qvalue,
-                          total_ins_num))
diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
deleted file mode 100644
index 1d1714bf72d306680d0b92d2b618f66f0f62a2be..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py
+++ /dev/null
@@ -1,618 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""HDFS Utils."""
-
-import os
-import sys
-import subprocess
-import multiprocessing
-from datetime import datetime
-
-import re
-import copy
-import errno
-
-import logging
-
-__all__ = ["HDFSClient"]
-
-
-def get_logger(name, level, fmt):
-    logger = logging.getLogger(name)
-    logger.setLevel(level)
-    handler = logging.FileHandler('hdfs.log', mode='w')
-    formatter = logging.Formatter(fmt=fmt)
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    return logger
-
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class HDFSClient(object):
-    """
-    A tool of HDFS 
-
-    Args:
-        hadoop_home (string): hadoop_home 
-        configs (dict): hadoop config, it is a dict, please contain \
-            key "fs.default.name" and "hadoop.job.ugi"
-        Can be a float value
-    Examples:
-        hadoop_home = "/home/client/hadoop-client/hadoop/"
-
-        configs = {
-            "fs.default.name": "hdfs://xxx.hadoop.com:54310",
-            "hadoop.job.ugi": "hello,hello123"
-        }
-
-        client = HDFSClient(hadoop_home, configs)
-
-        client.ls("/user/com/train-25")
-        files = client.lsr("/user/com/train-25/models")
-    """
-
-    def __init__(self, hadoop_home, configs):
-        self.pre_commands = []
-        hadoop_bin = '%s/bin/hadoop' % hadoop_home
-        self.pre_commands.append(hadoop_bin)
-        dfs = 'fs'
-        self.pre_commands.append(dfs)
-
-        for k, v in configs.iteritems():
-            config_command = '-D%s=%s' % (k, v)
-            self.pre_commands.append(config_command)
-
-    def __run_hdfs_cmd(self, commands, retry_times=5):
-        whole_commands = copy.deepcopy(self.pre_commands)
-        whole_commands.extend(commands)
-
-        ret_code = 0
-        ret_out = None
-        ret_err = None
-        whole_commands = " ".join(whole_commands)
-        for x in range(retry_times + 1):
-            proc = subprocess.Popen(
-                whole_commands,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                shell=True)
-            (output, errors) = proc.communicate()
-            ret_code, ret_out, ret_err = proc.returncode, output, errors
-
-            _logger.info(
-                'Times: %d, Running command: %s. Return code: %d, Msg: %s' %
-                (x, whole_commands, proc.returncode, errors))
-
-            if ret_code == 0:
-                break
-
-        return ret_code, ret_out, ret_err
-
-    def cat(self, hdfs_path=None):
-        """
-        cat hdfs file
-        Args:
-            hdfs_path(str): the hdfs file path
-        Returns:
-            file content
-        """
-        if self.is_file(hdfs_path):
-            exist_cmd = ['-cat', hdfs_path]
-            returncode, output, errors = self.__run_hdfs_cmd(
-                exist_cmd, retry_times=1)
-            if returncode != 0:
-                _logger.error("HDFS cat HDFS path: {} failed".format(hdfs_path))
-                return ""
-            else:
-                _logger.info("HDFS cat HDFS path: {} succeed".format(hdfs_path))
-                return output.strip()
-
-        else:
-            return ""
-
-    def is_exist(self, hdfs_path=None):
-        """
-        whether the remote HDFS path exists
-
-        Args:
-            hdfs_path(str): the hdfs file path
-
-        Returns:
-            True or False
-        """
-        exist_cmd = ['-test', '-e', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            exist_cmd, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS is_exist HDFS path: {} failed".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS is_exist HDFS path: {} successfully".format(
-                hdfs_path))
-            return True
-
-    def is_dir(self, hdfs_path=None):
-        """
-        whether the remote HDFS path is directory
-
-        Args:
-            hdfs_path(str): the hdfs file path
-
-        Returns:
-            True or False
-        """
-
-        if not self.is_exist(hdfs_path):
-            return False
-
-        dir_cmd = ['-test', '-d', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS path: {} failed is not a directory".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS path: {} successfully is a directory".format(
-                hdfs_path))
-            return True
-
-    def is_file(self, hdfs_path=None):
-        """
-        whether the remote HDFS path is file
-
-        Args:
-            hdfs_path(str): the hdfs file path
-
-        Returns:
-            True or False
-        """
-
-        if not self.is_exist(hdfs_path):
-            return False
-
-        dir_cmd = ['-test', '-d', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
-
-        if returncode == 0:
-            _logger.error("HDFS path: {} failed is not a file".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS path: {} successfully is a file".format(
-                hdfs_path))
-            return True
-
-    def delete(self, hdfs_path):
-        """
-        Remove a file or directory from HDFS.
-
-        whether the remote HDFS path exists
-
-        Args:
-            hdfs_path(str): HDFS path.
-
-        Returns:
-            True or False
-            This function returns `True` if the deletion was successful and `False` if
-            no file or directory previously existed at `hdfs_path`.
-        """
-        _logger.info('Deleting %r.', hdfs_path)
-
-        if not self.is_exist(hdfs_path):
-            _logger.warn("HDFS path: {} do not exist".format(hdfs_path))
-            return True
-
-        if self.is_dir(hdfs_path):
-            del_cmd = ['-rmr', hdfs_path]
-        else:
-            del_cmd = ['-rm', hdfs_path]
-
-        returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0)
-
-        if returncode:
-            _logger.error("HDFS path: {} delete files failure".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS path: {} delete files successfully".format(
-                hdfs_path))
-            return True
-
-    def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
-        """
-        Move a file or folder on HDFS.
-
-        Args:
-            hdfs_src_path(str): HDFS path
-            hdfs_dst_path(str): HDFS path
-            overwrite(bool|False): If the path already exists and overwrite is
-                                   False, will return False.
-        Returns:
-            True or False
-        """
-        assert hdfs_src_path is not None
-        assert hdfs_dst_path is not None
-
-        if not self.is_exist(hdfs_src_path):
-            _logger.info("HDFS path do not exist: {}".format(hdfs_src_path))
-        if self.is_exist(hdfs_dst_path) and not overwrite:
-            _logger.error("HDFS path is exist: {} and overwrite=False".format(
-                hdfs_dst_path))
-
-        rename_command = ['-mv', hdfs_src_path, hdfs_dst_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            rename_command, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS rename path: {} to {} failed".format(
-                hdfs_src_path, hdfs_dst_path))
-            return False
-        else:
-            _logger.info("HDFS rename path: {} to {} successfully".format(
-                hdfs_src_path, hdfs_dst_path))
-            return True
-
-    @staticmethod
-    def make_local_dirs(local_path):
-        """
-        create a directiory local, is same to mkdir
-
-        Args:
-            local_path(str): local path that wants to create a directiory.
-        """
-        try:
-            os.makedirs(local_path)
-        except OSError as e:
-            if e.errno != errno.EEXIST:
-                raise
-
-    def makedirs(self, hdfs_path):
-        """
-        Create a remote directory, recursively if necessary.
-
-        Args:
-            hdfs_path(str): Remote path. Intermediate directories will be
-                            created appropriately.
-
-        Returns:
-            True or False
-        """
-        _logger.info('Creating directories to %r.', hdfs_path)
-        assert hdfs_path is not None
-
-        if self.is_exist(hdfs_path):
-            _logger.error("HDFS path is exist: {}".format(hdfs_path))
-            return
-
-        mkdirs_commands = ['-mkdir', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            mkdirs_commands, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS mkdir path: {} successfully".format(hdfs_path))
-            return True
-
-    def ls(self, hdfs_path):
-        """
-        ls directory contents about HDFS hdfs_path
-
-        Args:
-            hdfs_path(str): Remote HDFS path will be ls.
-
-        Returns:
-            List: a contents list about hdfs_path.
-        """
-        assert hdfs_path is not None
-
-        if not self.is_exist(hdfs_path):
-            return []
-
-        ls_commands = ['-ls', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            ls_commands, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS list path: {} failed".format(hdfs_path))
-            return []
-        else:
-            _logger.info("HDFS list path: {} successfully".format(hdfs_path))
-
-            ret_lines = []
-            regex = re.compile('\s+')
-            out_lines = output.strip().split("\n")
-            for line in out_lines:
-                re_line = regex.split(line)
-                if len(re_line) == 8:
-                    ret_lines.append(re_line[7])
-            return ret_lines
-
-    def lsr(self, hdfs_path, excludes=[]):
-        """
-        list directory contents about HDFS hdfs_path recursively
-
-        Args:
-            hdfs_path(str): Remote HDFS path.
-            excludes(list): excludes
-
-        Returns:
-            List: a contents list about hdfs_path.
-        """
-
-        assert hdfs_path is not None
-
-        if not self.is_exist(hdfs_path):
-            return []
-
-        ls_commands = ['-lsr', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            ls_commands, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS list all files: {} failed".format(hdfs_path))
-            return []
-        else:
-            _logger.info("HDFS list all files: {} successfully".format(
-                hdfs_path))
-            lines = []
-            regex = re.compile('\s+')
-            out_lines = output.strip().split("\n")
-            for line_id, line in enumerate(out_lines):
-                re_line = regex.split(line)
-                if len(re_line) == 8:
-                    if re_line[0][0] == "d":
-                        continue
-                    if re_line[7] in excludes:
-                        continue
-                    else:
-                        lines.append((re_line[7], re_line[5] + " " + re_line[6],
-                                      line_id))
-            lines = sorted(lines, key=lambda line: line[2])
-            ret_lines = [ret[0] for ret in lines]
-            return ret_lines
-
-    @staticmethod
-    def split_files(files, trainer_id, trainers):
-        """
-        split file list
-
-        Args:
-            files(list): file list
-            trainer_id(int): trainer mpi rank id
-            trainers(int): all trainers num
-
-        Returns:
-            fileist(list): file list of current trainer
-        """
-        remainder = len(files) % trainers
-        blocksize = len(files) / trainers
-
-        blocks = [blocksize] * trainers
-        for i in range(remainder):
-            blocks[i] += 1
-
-        trainer_files = [[]] * trainers
-        begin = 0
-        for i in range(trainers):
-            trainer_files[i] = files[begin:begin + blocks[i]]
-            begin += blocks[i]
-
-        return trainer_files[trainer_id]
-
-    def download(self,
-                 hdfs_path,
-                 local_path,
-                 multi_processes=5,
-                 overwrite=False,
-                 retry_times=5):
-        """
-        Download files from HDFS using multi process.
-
-        Args:
-            hdfs_path(str): path on hdfs
-            local_path(str): path on local
-            multi_processes(int|5): the download data process at the same time, default=5
-            overwrite(bool): is overwrite
-            retry_times(int): retry times
-
-        Returns:
-            List:
-            Download files in local folder.
-        """
-
-        def __subprocess_download(local_path, datas):
-            """
-            download file from HDFS
-
-            Args:
-                hdfs_path(str): the hdfs file path
-                local_path(str): the local file path
-                overwrite(bool|None): will overwrite the file on HDFS or not
-                retry_times(int|5): retry times
-
-            Returns:
-                True or False
-            """
-            for data in datas:
-                download_commands = ["-get", data, local_path]
-
-                returncode, output, errors = self.__run_hdfs_cmd(
-                    download_commands, retry_times=retry_times)
-
-                if returncode:
-                    _logger.error(
-                        "Get local path: {} from HDFS path: {} failed".format(
-                            local_path, hdfs_path))
-                    return False
-            return True
-
-        self.make_local_dirs(local_path)
-
-        all_files = client.ls(hdfs_path)
-
-        procs = []
-        for i in range(multi_processes):
-            process_datas = HDFSClient.split_files(all_files, i,
-                                                   multi_processes)
-            p = multiprocessing.Process(
-                target=__subprocess_download,
-                args=(
-                    local_path,
-                    process_datas, ))
-            procs.append(p)
-            p.start()
-
-        # complete the processes
-        for proc in procs:
-            proc.join()
-
-        _logger.info("Finish {} multi process to download datas".format(
-            multi_processes))
-
-        local_downloads = []
-        for dirname, folder, files in os.walk(local_path):
-            for i in files:
-                t = os.path.join(dirname, i)
-                local_downloads.append(t)
-        return local_downloads
-
-    def upload(self,
-               hdfs_path,
-               local_path,
-               multi_processes=5,
-               overwrite=False,
-               retry_times=5):
-        """
-        Upload files to HDFS using multi process.
-
-        Args:
-            hdfs_path(str): path on hdfs
-            local_path(str): path on local
-            multi_processes(int|5): the upload data process at the same time, default=5
-            overwrite(bool|False): will overwrite file on HDFS or not
-            retry_times(int): upload file max retry time.
-
-        Returns:
-            None
-        """
-
-        def __subprocess_upload(hdfs_path_single, datas):
-            for data in datas:
-                put_commands = ["-put", data, hdfs_path_single]
-                returncode, output, errors = self.__run_hdfs_cmd(put_commands,
-                                                                 retry_times)
-
-                if returncode:
-                    _logger.error("Put local path: {} to HDFS path: {} failed".
-                                  format(data, hdfs_path_single))
-                    return False
-            return True
-
-        def get_local_files(path):
-            """
-            get local files
-
-            Args:
-                path(str): local path
-
-            Returns:
-                list of local files
-            """
-            rlist = []
-
-            if not os.path.exists(path):
-                return rlist
-
-            if os.path.isdir(path):
-                for file in os.listdir(path):
-                    t = os.path.join(path, file)
-                    rlist.append(t)
-            else:
-                rlist.append(path)
-            return rlist
-
-        all_files = get_local_files(local_path)
-        if not all_files:
-            _logger.info("there are nothing need to upload, exit")
-            return
-
-        if self.is_exist(hdfs_path) and overwrite:
-            self.delete(hdfs_path)
-            self.makedirs(hdfs_path)
-
-        procs = []
-        for i in range(multi_processes):
-            process_datas = HDFSClient.split_files(all_files, i,
-                                                   multi_processes)
-            p = multiprocessing.Process(
-                target=__subprocess_upload, args=(
-                    hdfs_path,
-                    process_datas, ))
-            procs.append(p)
-            p.start()
-
-        # complete the processes
-        for proc in procs:
-            proc.join()
-
-        _logger.info("Finish upload datas from {} to {}".format(local_path,
-                                                                hdfs_path))
-
-    def upload_dir(self, dest_dir, local_dir, overwrite=False):
-        """
-        upload dir to hdfs
-        Args:
-            dest_dir(str): hdfs dest dir
-            local_dir(str): hdfs local dir
-            overwrite(bool): is overwrite
-        Returns:
-            return code
-        """
-        local_dir = local_dir.rstrip("/")
-        dest_dir = dest_dir.rstrip("/")
-        local_basename = os.path.basename(local_dir)
-        if self.is_exist(dest_dir + "/" + local_basename) and overwrite:
-            self.delete(dest_dir + "/" + local_basename)
-        if not self.is_exist(dest_dir):
-            self.makedirs(dest_dir)
-        put_command = ["-put", local_dir, dest_dir]
-        returncode, output, errors = self.__run_hdfs_cmd(put_command,
-                                                         retry_times)
-        if returncode != 0:
-            _logger.error("Put local dir: {} to HDFS dir: {} failed".format(
-                local_dir, dest_dir))
-            return False
-        return True
-
-
-if __name__ == "__main__":
-    hadoop_home = "/home/client/hadoop-client/hadoop/"
-
-    configs = {
-        "fs.default.name": "hdfs://xxx.hadoop.com:54310",
-        "hadoop.job.ugi": "hello,hello123"
-    }
-
-    client = HDFSClient(hadoop_home, configs)
-
-    client.ls("/user/com/train-25")
-    files = client.lsr("/user/com/train-25/models")
diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
deleted file mode 100644
index 7bdd430f985bd0b3818f6ef305ce2d7d8976106b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/inferencer.py
+++ /dev/null
@@ -1,16 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: inferencer is moved into fluid.contrib.inferencer.
-__all__ = []
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
deleted file mode 100644
index 76da8b850c9929fad468a0ae273765cfe8816d79..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/initializer.py
+++ /dev/null
@@ -1,960 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import framework
-import numpy as np
-from .wrapped_decorator import signature_safe_contextmanager
-from .core import VarDesc
-from . import unique_name
-
-__all__ = [
-    'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
-    'MSRA', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer',
-    'UniformInitializer', 'NormalInitializer', 'TruncatedNormalInitializer',
-    'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer',
-    'NumpyArrayInitializer'
-]
-
-_force_init_on_cpu_ = False
-
-
-def force_init_on_cpu():
-    """
-    The flag of whether force to init variables on CPU.
-
-    Returns:
-        bool: the state if we should force init on CPU.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            if fluid.initializer.force_init_on_cpu():
-                step = fluid.layers.create_global_var(
-                    shape=[2,3], value=1.0, dtype='float32')
-
-    """
-    return _force_init_on_cpu_
-
-
-@signature_safe_contextmanager
-def init_on_cpu():
-    """
-    Force the variable to be inited on CPU.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            with fluid.initializer.init_on_cpu():
-                step = fluid.layers.create_global_var(
-                    shape=[2,3], value=1.0, dtype='float32')
-
-    """
-    global _force_init_on_cpu_
-
-    pre_state = force_init_on_cpu()
-    _force_init_on_cpu_ = True
-    yield
-    _force_init_on_cpu_ = pre_state
-
-
-class Initializer(object):
-    """Base class for variable initializers
-
-    Defines the common interface of variable initializers.
-    They add operations to the init program that are used
-    to initialize variables. Users should not use this class
-    directly, but need to use one of its implementations.
-    """
-
-    def __init__(self):
-        pass
-
-    def __call__(self, param, block):
-        """Add corresponding initialization operations to the network
-        """
-        raise NotImplementedError()
-
-    def _compute_fans(self, var):
-        """Compute the fan_in and the fan_out for layers
-
-        This method computes the fan_in and the fan_out
-        for neural network layers, if not specified. It is
-        not possible to perfectly estimate fan_in and fan_out.
-        This method will estimate it correctly for matrix multiply and
-        convolutions.
-
-        Args:
-            var: variable for which fan_in and fan_out have to be computed
-
-        Returns:
-            tuple of two integers (fan_in, fan_out)
-        """
-        shape = var.shape
-        if not shape or len(shape) == 0:
-            fan_in = fan_out = 1
-        elif len(shape) == 1:
-            fan_in = fan_out = shape[0]
-        elif len(shape) == 2:
-            # This is the case for simple matrix multiply
-            fan_in = shape[0]
-            fan_out = shape[1]
-        else:
-            # Assume this to be a convolutional kernel
-            # In PaddlePaddle, the shape of the kernel is like:
-            # [num_filters, num_filter_channels, ...] where the remaining
-            # dimensions are the filter_size
-            receptive_field_size = np.prod(shape[2:])
-            fan_in = shape[1] * receptive_field_size
-            fan_out = shape[0] * receptive_field_size
-
-        return (fan_in, fan_out)
-
-
-class ConstantInitializer(Initializer):
-    """Implements the constant initializer
-
-    Args:
-        value (float): constant value to initialize the variable
-
-    Examples:
-        .. code-block:: python
-
-    	    import paddle.fluid as fluid
-            x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-	    fc = fluid.layers.fc(input=x, size=10,
-    		param_attr=fluid.initializer.Constant(value=2.0))
-
-    """
-
-    def __init__(self, value=0.0, force_cpu=False):
-        assert value is not None
-        super(ConstantInitializer, self).__init__()
-        self._value = value
-        self._force_cpu = force_cpu
-
-    def __call__(self, var, block):
-        """Add constant initialization ops for a variable
-
-        Args:
-            var: Variable that needs to be initialized
-            block: The block in which initialization ops
-                   should be added
-
-        Returns:
-            the initialization op
-        """
-        assert isinstance(var, framework.Variable)
-        assert isinstance(block, framework.Block)
-
-        # to be compatible of fp16 initializers
-        if var.dtype == VarDesc.VarType.FP16:
-            out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['constant_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
-        else:
-            out_dtype = var.dtype
-            out_var = var
-
-        # Initialization Ops should be prepended and not appended
-        op = block._prepend_op(
-            type="fill_constant",
-            outputs={"Out": out_var},
-            attrs={
-                "shape": var.shape,
-                "dtype": int(out_dtype),
-                "value": float(self._value),
-                'force_cpu': self._force_cpu or force_init_on_cpu()
-            },
-            stop_gradient=True)
-
-        if var.dtype == VarDesc.VarType.FP16:
-            block.append_op(
-                type="cast",
-                inputs={"X": out_var},
-                outputs={"Out": var},
-                attrs={"in_dtype": out_var.dtype,
-                       "out_dtype": var.dtype})
-
-        if not framework.in_dygraph_mode():
-            var.op = op
-        return op
-
-
-class UniformInitializer(Initializer):
-    """Implements the random uniform distribution initializer
-
-    Args:
-        low (float): lower boundary of the uniform distribution
-        high (float): upper boundary of the uniform distribution
-        seed (int): random seed
-        diag_num (int): the number of diagonal elements to initialize.
-            If set to 0, diagonal initialization will be not performed.
-        diag_step (int): Step size between two diagonal elements,
-            which is generally the width of the square matrix.
-        diag_val (float): the value of the diagonal element to be initialized,
-            default 1.0. It takes effect only if the diag_num is greater than 0.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-            fc = fluid.layers.fc(input=x, size=10,
-    		param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5))
-    """
-
-    def __init__(self,
-                 low=-1.0,
-                 high=1.0,
-                 seed=0,
-                 diag_num=0,
-                 diag_step=0,
-                 diag_val=1.0):
-        assert low is not None
-        assert high is not None
-        assert high >= low
-        assert seed is not None
-        assert diag_num is not None
-        assert diag_step is not None
-        assert diag_val is not None
-        if diag_num > 0 or diag_step > 0:
-            assert (diag_num > 0 and diag_step > 0)
-        super(UniformInitializer, self).__init__()
-        self._low = low
-        self._high = high
-        self._seed = seed
-        self._diag_num = diag_num
-        self._diag_step = diag_step
-        self._diag_val = diag_val
-
-    def __call__(self, var, block):
-        """Add uniform distribution initialization ops for a variable
-
-        Args:
-            var: Variable that needs to be initialized
-            block: The block in which initialization ops
-                   should be added
-
-        Returns:
-            the initialization op
-        """
-        assert isinstance(var, framework.Variable)
-        assert isinstance(block, framework.Block)
-        # Initialization Ops should be prepended and not appended
-        if self._seed == 0:
-            self._seed = block.program.random_seed
-
-        # to be compatible of fp16 initializers
-        if var.dtype == VarDesc.VarType.FP16:
-            out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['uniform_random', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
-        else:
-            out_dtype = var.dtype
-            out_var = var
-
-        op = block._prepend_op(
-            type="uniform_random",
-            outputs={"Out": out_var},
-            attrs={
-                "shape": var.shape,
-                "dtype": out_dtype,
-                "min": self._low,
-                "max": self._high,
-                "seed": self._seed,
-                "diag_num": self._diag_num,
-                "diag_step": self._diag_step,
-                "diag_val": self._diag_val
-            },
-            stop_gradient=True)
-
-        if var.dtype == VarDesc.VarType.FP16:
-            block.append_op(
-                type="cast",
-                inputs={"X": out_var},
-                outputs={"Out": var},
-                attrs={"in_dtype": out_var.dtype,
-                       "out_dtype": var.dtype})
-
-        if not framework.in_dygraph_mode():
-            var.op = op
-        return op
-
-
-class NormalInitializer(Initializer):
-    """Implements the Random Normal(Gaussian) distribution initializer
-
-    Args:
-        loc (float): mean of the normal distribution
-        scale (float): standard deviation of the normal distribution
-        seed (int): random seed
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            fc = fluid.layers.fc(input=x, size=10,
-                param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0))
-
-    """
-
-    def __init__(self, loc=0.0, scale=1.0, seed=0):
-        assert loc is not None
-        assert scale is not None
-        assert seed is not None
-        super(NormalInitializer, self).__init__()
-        self._mean = loc
-        self._std_dev = scale
-        self._seed = seed
-
-    def __call__(self, var, block):
-        """Add normal distribution initialization ops for a variable
-
-        Args:
-            var: Variable that needs to be initialized
-            block: The block in which initialization ops
-                   should be added
-
-        Returns:
-            the initialization op
-        """
-        assert isinstance(var, framework.Variable)
-        assert isinstance(block, framework.Block)
-        # Initialization Ops should be prepended and not appended
-        if self._seed == 0:
-            self._seed = block.program.random_seed
-
-        # to be compatible of fp16 initalizers
-        if var.dtype == VarDesc.VarType.FP16:
-            out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['gaussian_random', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
-        else:
-            out_dtype = var.dtype
-            out_var = var
-
-        op = block._prepend_op(
-            type="gaussian_random",
-            outputs={"Out": out_var},
-            attrs={
-                "shape": var.shape,
-                "dtype": out_dtype,
-                "mean": self._mean,
-                "std": self._std_dev,
-                "seed": self._seed,
-                "use_mkldnn": False
-            },
-            stop_gradient=True)
-
-        if var.dtype == VarDesc.VarType.FP16:
-            block.append_op(
-                type="cast",
-                inputs={"X": out_var},
-                outputs={"Out": var},
-                attrs={"in_dtype": out_var.dtype,
-                       "out_dtype": var.dtype})
-        if not framework.in_dygraph_mode():
-            var.op = op
-        return op
-
-
-class TruncatedNormalInitializer(Initializer):
-    """Implements the Random TruncatedNormal(Gaussian) distribution initializer
-
-    Args:
-        loc (float): mean of the normal distribution
-        scale (float): standard deviation of the normal distribution
-        seed (int): random seed
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-            fc = fluid.layers.fc(input=x, size=10,
-                param_attr=fluid.initializer.TruncatedNormal(loc=0.0, scale=2.0))
-    """
-
-    def __init__(self, loc=0.0, scale=1.0, seed=0):
-        assert loc is not None
-        assert scale is not None
-        assert seed is not None
-        super(TruncatedNormalInitializer, self).__init__()
-        self._mean = loc
-        self._std_dev = scale
-        self._seed = seed
-
-    def __call__(self, var, block):
-        """Add truncated normal distribution initialization ops for a variable
-
-        Args:
-            var: Variable that needs to be initialized
-            block: The block in which initialization ops
-                   should be added
-
-        Returns:
-            the initialization op
-        """
-        assert isinstance(var, framework.Variable)
-        assert isinstance(block, framework.Block)
-        # Initialization Ops should be prepended and not appended
-        if self._seed == 0:
-            self._seed = block.program.random_seed
-
-        # to be compatible of fp16 initalizers
-        if var.dtype == VarDesc.VarType.FP16:
-            out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['truncated_gaussian_random', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
-        else:
-            out_dtype = var.dtype
-            out_var = var
-
-        op = block._prepend_op(
-            type="truncated_gaussian_random",
-            outputs={"Out": out_var},
-            attrs={
-                "shape": var.shape,
-                "dtype": out_dtype,
-                "mean": self._mean,
-                "std": self._std_dev,
-                "seed": self._seed
-            },
-            stop_gradient=True)
-
-        if var.dtype == VarDesc.VarType.FP16:
-            block.append_op(
-                type="cast",
-                inputs={"X": out_var},
-                outputs={"Out": var},
-                attrs={"in_dtype": out_var.dtype,
-                       "out_dtype": var.dtype})
-        if not framework.in_dygraph_mode():
-            var.op = op
-        return op
-
-
-class XavierInitializer(Initializer):
-    """
-    This class implements the Xavier weight initializer from the paper
-    `Understanding the difficulty of training deep feedforward neural
-    networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
-    by Xavier Glorot and Yoshua Bengio.
-
-    This initializer is designed to keep the scale of the gradients
-    approximately same in all the layers. In case of Uniform distribution,
-    the range is [-x, x], where
-
-    .. math::
-
-        x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}}
-
-    In case of Normal distribution, the mean is 0 and the standard deviation
-    is
-
-    .. math::
-
-        \sqrt{\\frac{2.0}{fan\_in + fan\_out}}
-
-
-    Args:
-        uniform (bool): whether to use uniform or normal distribution
-        fan_in (float): fan_in for Xavier initialization. If None, it is
-                inferred from the variable.
-        fan_out (float): fan_out for Xavier initialization. If None, it is
-                 inferred from the variable.
-        seed (int): random seed
-
-    Note:
-        It is recommended to set fan_in and fan_out to None for most cases.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            queries = fluid.layers.data(name='x', shape=[1], dtype='float32')
-            fc = fluid.layers.fc(
-                input=queries, size=10,
-                param_attr=fluid.initializer.Xavier(uniform=False))
-
-    """
-
-    def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0):
-        assert uniform is not None
-        assert seed is not None
-        super(XavierInitializer, self).__init__()
-        self._uniform = uniform
-        self._fan_in = fan_in
-        self._fan_out = fan_out
-        self._seed = seed
-
-    def __call__(self, var, block):
-        """Add xavier initialization ops for a variable
-
-        Args:
-            var: Variable that needs to be initialized
-            block: The block in which initialization ops
-                   should be added
-
-        Returns:
-            the initialization op
-        """
-        assert isinstance(var, framework.Variable)
-        assert isinstance(block, framework.Block)
-        f_in, f_out = self._compute_fans(var)
-
-        # If fan_in and fan_out are passed, use them
-        fan_in = f_in if self._fan_in is None else self._fan_in
-        fan_out = f_out if self._fan_out is None else self._fan_out
-
-        if self._seed == 0:
-            self._seed = block.program.random_seed
-
-        # to be compatible of fp16 initalizers
-        if var.dtype == VarDesc.VarType.FP16:
-            out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['xavier_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
-        else:
-            out_dtype = var.dtype
-            out_var = var
-
-        if self._uniform:
-            limit = np.sqrt(6.0 / float(fan_in + fan_out))
-            op = block._prepend_op(
-                type="uniform_random",
-                outputs={"Out": out_var},
-                attrs={
-                    "shape": out_var.shape,
-                    "dtype": out_dtype,
-                    "min": -limit,
-                    "max": limit,
-                    "seed": self._seed
-                },
-                stop_gradient=True)
-
-        else:
-            std = np.sqrt(2.0 / float(fan_in + fan_out))
-            op = block._prepend_op(
-                type="gaussian_random",
-                outputs={"Out": out_var},
-                attrs={
-                    "shape": out_var.shape,
-                    "dtype": out_dtype,
-                    "mean": 0.0,
-                    "std": std,
-                    "seed": self._seed
-                },
-                stop_gradient=True)
-
-        if var.dtype == VarDesc.VarType.FP16:
-            block.append_op(
-                type="cast",
-                inputs={"X": out_var},
-                outputs={"Out": var},
-                attrs={"in_dtype": out_var.dtype,
-                       "out_dtype": var.dtype})
-
-        if not framework.in_dygraph_mode():
-            var.op = op
-        return op
-
-
-class MSRAInitializer(Initializer):
-    """Implements the MSRA initializer a.k.a. Kaiming Initializer
-
-    This class implements the weight initialization from the paper
-    `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-    ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
-    by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
-    robust initialization method that particularly considers the rectifier
-    nonlinearities. In case of Uniform distribution, the range is [-x, x], where
-
-    .. math::
-
-        x = \sqrt{\\frac{6.0}{fan\_in}}
-
-    In case of Normal distribution, the mean is 0 and the standard deviation
-    is
-
-    .. math::
-
-        \sqrt{\\frac{2.0}{fan\_in}}
-
-    Args:
-        uniform (bool): whether to use uniform or normal distribution
-        fan_in (float): fan_in for MSRAInitializer. If None, it is\
-        inferred from the variable.
-        seed (int): random seed
-
-    Note:
-        It is recommended to set fan_in to None for most cases.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            fc = fluid.layers.fc(input=x, size=10,
-                param_attr=fluid.initializer.MSRA(uniform=False))
-
-    """
-
-    def __init__(self, uniform=True, fan_in=None, seed=0):
-        """Constructor for MSRAInitializer
-        """
-        assert uniform is not None
-        assert seed is not None
-        super(MSRAInitializer, self).__init__()
-        self._uniform = uniform
-        self._fan_in = fan_in
-        self._seed = seed
-
-    def __call__(self, var, block):
-        """Add MSRA initialization ops for a variable
-
-        Args:
-            var: Variable that needs to be initialized
-            block: The block in which initialization ops
-                   should be added
-
-        Returns:
-            the initialization op
-        """
-        assert isinstance(var, framework.Variable)
-        assert isinstance(block, framework.Block)
-        f_in, f_out = self._compute_fans(var)
-
-        # If fan_in is passed, use it
-        fan_in = f_in if self._fan_in is None else self._fan_in
-
-        if self._seed == 0:
-            self._seed = block.program.random_seed
-
-        # to be compatible of fp16 initalizers
-        if var.dtype == VarDesc.VarType.FP16:
-            out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['masra_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
-        else:
-            out_dtype = var.dtype
-            out_var = var
-
-        if self._uniform:
-            limit = np.sqrt(6.0 / float(fan_in))
-            op = block._prepend_op(
-                type="uniform_random",
-                outputs={"Out": out_var},
-                attrs={
-                    "shape": out_var.shape,
-                    "dtype": int(out_dtype),
-                    "min": -limit,
-                    "max": limit,
-                    "seed": self._seed
-                },
-                stop_gradient=True)
-
-        else:
-            std = np.sqrt(2.0 / float(fan_in))
-            op = block._prepend_op(
-                type="gaussian_random",
-                outputs={"Out": out_var},
-                attrs={
-                    "shape": out_var.shape,
-                    "dtype": int(out_dtype),
-                    "mean": 0.0,
-                    "std": std,
-                    "seed": self._seed
-                },
-                stop_gradient=True)
-
-        if var.dtype == VarDesc.VarType.FP16:
-            block.append_op(
-                type="cast",
-                inputs={"X": out_var},
-                outputs={"Out": var},
-                attrs={"in_dtype": out_var.dtype,
-                       "out_dtype": var.dtype})
-
-        if not framework.in_dygraph_mode():
-            var.op = op
-        return op
-
-
-class BilinearInitializer(Initializer):
-    """
-    This initializer can be used in transposed convolution operator to
-    act as upsampling. Users can upsample a feature map with shape of
-    (B, C, H, W) by any integer factor. The usage is:
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            factor = 2
-            C = 2
-            w_attr = fluid.param_attr.ParamAttr(
-                learning_rate=0., 
-                regularizer=fluid.regularizer.L2Decay(0.),
-                initializer=fluid.initializer.Bilinear())
-            x = fluid.layers.data(name="data", shape=[3, 32, 32], 
-                                  dtype="float32")
-            conv_up = fluid.layers.conv2d_transpose(
-                input=x,
-                num_filters=C,
-                output_size=None,
-                filter_size=2 * factor - factor % 2,
-                padding=int(math.ceil((factor - 1) / 2.)),
-                stride=factor,
-                groups=C,
-                param_attr=w_attr,
-                bias_attr=False)
-
-    Where, `num_filters=C` and `groups=C` means this is channel-wise transposed
-    convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`,
-    This initializer will set a (K, K) interpolation kernel for every channel
-    of the filter identically. The resulting shape of the output feature map
-    will be (B, C, factor * H, factor * W). Note that the learning rate and the
-    weight decay are set to 0 in order to keep coefficient values of bilinear
-    interpolation unchanged during training.
-
-    """
-
-    def __init__(self):
-        """Constructor for BilinearInitializer.
-        """
-        super(BilinearInitializer, self).__init__()
-
-    def __call__(self, var, block):
-        """Add biliear initialization ops for a variable
-
-        Args:
-            var (Variable): Variable that needs to be initialized.
-            block (Block): The block in which initialization ops should
-                           be added.
-
-        Returns:
-            Operator: the initialization op
-
-        Raises:
-            ValueError: If type of `var` and `block` is not right.
-                        If the shape of `var` size is not 4 and
-                        var.shape[2] != var.shape[3].
-        """
-        if not isinstance(var, framework.Variable):
-            raise ValueError("var must be framework.Variable.")
-
-        if not isinstance(block, framework.Block):
-            raise ValueError("block must be framework.Block.")
-
-        shape = var.shape
-        if len(shape) != 4:
-            raise ValueError("the length of shape must be 4.")
-        if shape[2] != shape[3]:
-            raise ValueError("shape[2] must be equal to shape[3].")
-
-        weight = np.zeros(np.prod(var.shape), dtype='float32')
-        size = shape[3]
-        # factor
-        f = np.ceil(size / 2.)
-        # center
-        c = (2 * f - 1 - f % 2) / (2. * f)
-        for i in range(np.prod(shape)):
-            x = i % size
-            y = (i / size) % size
-            weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c))
-        weight = np.reshape(weight, shape)
-
-        # to be compatible of fp16 initalizers
-        if var.dtype == VarDesc.VarType.FP16:
-            out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['bilinear_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
-        else:
-            out_dtype = var.dtype
-            out_var = var
-
-        if out_dtype == VarDesc.VarType.FP32:
-            value_name = "fp32_values"
-            values = [float(v) for v in weight.flat]
-        else:
-            raise ValueError("Unsupported dtype %s", input.dtype)
-        if np.prod(shape) > 1024 * 1024:
-            raise ValueError("The size of input is too big. ")
-        op = block.append_op(
-            type='assign_value',
-            outputs={'Out': [out_var]},
-            attrs={
-                'dtype': out_dtype,
-                'shape': list(shape),
-                value_name: values
-            })
-
-        if var.dtype == VarDesc.VarType.FP16:
-            block.append_op(
-                type="cast",
-                inputs={"X": out_var},
-                outputs={"Out": var},
-                attrs={"in_dtype": out_var.dtype,
-                       "out_dtype": var.dtype})
-
-        if not framework.in_dygraph_mode():
-            var.op = op
-        return op
-
-
-class NumpyArrayInitializer(Initializer):
-    """Init an parameter with an numpy array
-
-    Args:
-        value (numpy): numpy array to initialize the variable
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[5], dtype='float32')
-            fc = fluid.layers.fc(input=x, size=10,
-                param_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2])))
-    """
-
-    def __init__(self, value):
-        import numpy
-        assert isinstance(value, numpy.ndarray)
-        super(NumpyArrayInitializer, self).__init__()
-        self._value = value
-
-    def __call__(self, var, block):
-        """Add constant initialization ops for a variable
-
-        Args:
-            var: Variable that needs to be initialized
-            block: The block in which initialization ops
-                   should be added
-
-        Returns:
-            the initialization op
-        """
-        assert isinstance(var, framework.Variable)
-        assert isinstance(block, framework.Block)
-
-        # to be compatible of fp16 initalizers
-        if var.dtype == VarDesc.VarType.FP16:
-            out_dtype = VarDesc.VarType.FP32
-            np_value = self._value.astype("float32")
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['numpy_array_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
-        else:
-            out_var = var
-            out_dtype = var.dtype
-            np_value = self._value
-
-        # Initialization Ops should be prepended and not appended
-        if out_dtype == VarDesc.VarType.FP32:
-            value_name = "fp32_values"
-            values = [float(v) for v in np_value.flat]
-        elif out_dtype == VarDesc.VarType.INT32:
-            value_name = "int32_values"
-            values = [int(v) for v in np_value.flat]
-        else:
-            raise ValueError("Unsupported dtype %s", self._value.dtype)
-        if self._value.size > 1024 * 1024 * 1024:
-            raise ValueError("The size of input is too big. Please consider "
-                             "saving it to file and 'load_op' to load it")
-        op = block._prepend_op(
-            type='assign_value',
-            outputs={'Out': out_var},
-            attrs={
-                'dtype': out_dtype,
-                'shape': list(self._value.shape),
-                value_name: values
-            },
-            stop_gradient=True)
-
-        if var.dtype == VarDesc.VarType.FP16:
-            block.append_op(
-                type="cast",
-                inputs={"X": out_var},
-                outputs={"Out": var},
-                attrs={"in_dtype": out_var.dtype,
-                       "out_dtype": var.dtype})
-
-        if not framework.in_dygraph_mode():
-            var.op = op
-        return op
-
-
-# We short the class name, since users will use the initializer with the package
-# name. The sample code:
-#
-# import paddle.fluid as fluid
-#
-# hidden = fluid.layers.fc(...,
-#                          param_attr=ParamAttr(fluid.initializer.Xavier()))
-#
-# It is no need to add an `Initializer` as the class suffix
-Constant = ConstantInitializer
-Uniform = UniformInitializer
-Normal = NormalInitializer
-TruncatedNormal = TruncatedNormalInitializer
-Xavier = XavierInitializer
-MSRA = MSRAInitializer
-Bilinear = BilinearInitializer
diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py
deleted file mode 100644
index 8afbd662ad7b87157872c61d2cafad1c84aa3b77..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/input.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from .framework import Variable, in_dygraph_mode
-from .layer_helper import LayerHelper
-
-__all__ = ['one_hot', 'embedding']
-
-
-def one_hot(input, depth, allow_out_of_range=False):
-    """
-    This layer creates the one-hot representations for input indices.
-
-    Args:
-        input(Variable): Input indices represent locations, which takes value 1.0
-            in indices, while all other locations take value 0.
-        depth(scalar): An interger defining the depth of the one-hot dimension.
-        allow_out_of_range(bool): A bool value indicating whether the input
-            indices could be out of range [0, depth). When input indices are
-            out of range, exceptions is raised if allow_out_of_range is False,
-            or zero-filling representations is created if it is set True
-
-    Returns:
-        Variable: The one-hot representations of input.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            one_hot_label = fluid.one_hot(input=label, depth=10)
-    """
-    helper = LayerHelper("one_hot_v2", **locals())
-
-    one_hot_out = helper.create_variable_for_type_inference(dtype='float32')
-
-    if in_dygraph_mode():
-        inputs = {'X': input}
-        attrs = {'depth': depth}
-    else:
-        if not isinstance(depth, Variable):
-            # user attribute 
-            inputs = {'X': input}
-            attrs = {'depth': depth}
-        else:
-            depth.stop_gradient = True
-            inputs = {'X': input, 'depth_tensor': depth}
-            attrs = {}
-    helper.append_op(
-        type="one_hot_v2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={'Out': one_hot_out},
-        stop_gradient=True)
-    return one_hot_out
-
-
-def embedding(input,
-              size,
-              is_sparse=False,
-              is_distributed=False,
-              padding_idx=None,
-              param_attr=None,
-              dtype='float32'):
-    """
-    **Embedding Layer**
-
-    This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
-    a lookup table. The result of this lookup is the embedding of each ID in the
-    :attr:`input`.
-
-    All the input variables are passed in as local variables to the LayerHelper
-    constructor.
-
-    Args:
-        input(Variable): Input is a Tensor<int64> Variable, which contains the IDs information.
-            The value of the input IDs should satisfy :math:`0<= id < size[0]`.
-        size(tuple|list): The shape of the look up table parameter. It should
-            have two elements which indicate the size of the dictionary of
-            embeddings and the size of each embedding vector respectively.
-        is_sparse(bool): The flag indicating whether to use sparse update.
-        is_distributed(bool): Whether to run lookup table from remote parameter server.
-        padding_idx(int|long|None): It will output all-zero padding data whenever
-            lookup encounters :math:`padding\_idx` in Ids. If set :attr:`None`, it makes
-            no effect to output. If :math:`padding\_idx < 0`, the :math:`padding\_idx`
-            will automatically be converted to :math:`size[0] + padding\_idx` to use.
-            Default: None.
-        param_attr(ParamAttr): Parameters for this layer.
-        dtype(np.dtype|core.VarDesc.VarType|str): The dtype refers to the data type of output
-            tensor. It can be float32, float_16, int etc.
-
-    Returns:
-        Variable: The tensor variable storing the embeddings of the \
-                  supplied inputs.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          # [batch_size, 20]  ->  [batch_size, 20, 64]
-          data = fluid.layers.data(name='sequence', shape=[20], dtype='int64')
-          emb = fluid.embedding(input=data, size=[128, 64])    
-    """
-
-    helper = LayerHelper('embedding', **locals())
-    remote_prefetch = is_sparse and (not is_distributed)
-    if remote_prefetch:
-        assert is_sparse is True and is_distributed is False
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
-    tmp = helper.create_variable_for_type_inference(dtype)
-    padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
-        size[0] + padding_idx)
-    helper.append_op(
-        type='lookup_table_v2',
-        inputs={'Ids': input,
-                'W': w},
-        outputs={'Out': tmp},
-        attrs={
-            'is_sparse': is_sparse,
-            'is_distributed': is_distributed,
-            'remote_prefetch': remote_prefetch,
-            'padding_idx': padding_idx
-        })
-    return tmp
diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py
deleted file mode 100644
index 05907562e5e9553fbbfe1c7147ba6019a7df34fd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/install_check.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from .framework import Program, program_guard, unique_name, cuda_places, cpu_places
-from .param_attr import ParamAttr
-from .initializer import Constant
-from . import layers
-from . import backward
-from .dygraph import Layer, nn
-from . import executor
-from . import optimizer
-from . import core
-from . import compiler
-import logging
-import numpy as np
-
-__all__ = ['run_check']
-
-
-class SimpleLayer(Layer):
-    def __init__(self, name_scope):
-        super(SimpleLayer, self).__init__(name_scope)
-        self._fc1 = nn.FC(self.full_name(),
-                          3,
-                          param_attr=ParamAttr(initializer=Constant(value=0.1)))
-
-    def forward(self, inputs):
-        x = self._fc1(inputs)
-        x = layers.reduce_sum(x)
-        return x
-
-
-def run_check():
-    ''' intall check to verify if install is success
-
-    This func should not be called only if you need to verify installation
-    '''
-    print("Running Verify Fluid Program ... ")
-
-    device_list = []
-    if core.is_compiled_with_cuda():
-        try:
-            core.get_cuda_device_count()
-        except Exception as e:
-            logging.warning(
-                "You are using GPU version Paddle Fluid, But Your CUDA Device is not set properly"
-                "\n Original Error is {}".format(e))
-            return 0
-        device_list = cuda_places()
-    else:
-        device_list = [core.CPUPlace(), core.CPUPlace()]
-
-    np_inp_single = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-    inp = []
-    for i in range(len(device_list)):
-        inp.append(np_inp_single)
-    np_inp_muti = np.array(inp)
-    np_inp_muti = np_inp_muti.reshape(len(device_list), 2, 2)
-
-    def test_parallerl_exe():
-        train_prog = Program()
-        startup_prog = Program()
-        scope = core.Scope()
-        with executor.scope_guard(scope):
-            with program_guard(train_prog, startup_prog):
-                with unique_name.guard():
-                    build_strategy = compiler.BuildStrategy()
-                    build_strategy.enable_inplace = True
-                    build_strategy.memory_optimize = True
-                    inp = layers.data(name="inp", shape=[2, 2])
-                    simple_layer = SimpleLayer("simple_layer")
-                    out = simple_layer(inp)
-                    exe = executor.Executor(
-                        core.CUDAPlace(0) if core.is_compiled_with_cuda() and
-                        (core.get_cuda_device_count() > 0) else core.CPUPlace())
-                    loss = layers.mean(out)
-                    loss.persistable = True
-                    optimizer.SGD(learning_rate=0.01).minimize(loss)
-                    startup_prog.random_seed = 1
-                    compiled_prog = compiler.CompiledProgram(
-                        train_prog).with_data_parallel(
-                            build_strategy=build_strategy,
-                            loss_name=loss.name,
-                            places=device_list)
-                    exe.run(startup_prog)
-
-                    exe.run(compiled_prog,
-                            feed={inp.name: np_inp_muti},
-                            fetch_list=[loss.name])
-
-    def test_simple_exe():
-        train_prog = Program()
-        startup_prog = Program()
-        scope = core.Scope()
-        with executor.scope_guard(scope):
-            with program_guard(train_prog, startup_prog):
-                with unique_name.guard():
-                    inp0 = layers.data(
-                        name="inp", shape=[2, 2], append_batch_size=False)
-                    simple_layer0 = SimpleLayer("simple_layer")
-                    out0 = simple_layer0(inp0)
-                    param_grads = backward.append_backward(
-                        out0, parameter_list=[simple_layer0._fc1._w.name])[0]
-                    exe0 = executor.Executor(
-                        core.CUDAPlace(0) if core.is_compiled_with_cuda() and
-                        (core.get_cuda_device_count() > 0) else core.CPUPlace())
-                    exe0.run(startup_prog)
-                    exe0.run(feed={inp0.name: np_inp_single},
-                             fetch_list=[out0.name, param_grads[1].name])
-
-    test_simple_exe()
-
-    print("Your Paddle Fluid works well on SINGLE GPU or CPU.")
-    try:
-        test_parallerl_exe()
-        print("Your Paddle Fluid works well on MUTIPLE GPU or CPU.")
-        print(
-            "Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now"
-        )
-    except Exception as e:
-        logging.warning(
-            "Your Paddle Fluid has some problem with multiple GPU. This may be caused by:"
-            "\n 1. There is only 1 or 0 GPU visible on your Device;"
-            "\n 2. No.1 or No.2 GPU or both of them are occupied now"
-            "\n 3. Wrong installation of NVIDIA-NCCL2, please follow instruction on https://github.com/NVIDIA/nccl-tests "
-            "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
-        )
-
-        print("\n Original Error is: {}".format(e))
-        print(
-            "Your Paddle Fluid is installed successfully ONLY for SINGLE GPU or CPU! "
-            "\n Let's start deep Learning with Paddle Fluid now")
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
deleted file mode 100644
index 7d864cfbb433db79f508c9760d224fefb853f912..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/io.py
+++ /dev/null
@@ -1,1393 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import errno
-import warnings
-import six
-import logging
-from functools import reduce
-
-import paddle
-import paddle.reader
-from paddle.reader import *
-from paddle.fluid import layers
-from paddle.fluid.executor import Executor
-from paddle.fluid.evaluator import Evaluator
-from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard
-from paddle.fluid.compiler import CompiledProgram
-from paddle.fluid.log_helper import get_logger
-from . import reader
-from .reader import *
-from . import core
-from .. import compat as cpt
-
-batch = paddle.batch
-
-__all__ = [
-    'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
-    'load_persistables', 'save_inference_model', 'load_inference_model', 'batch'
-] + reader.__all__ + paddle.reader.__all__
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-def is_parameter(var):
-    """
-    Check whether the given variable is an instance of Parameter.
-
-    Args:
-        var(Variable): The variable to be checked.
-
-    Returns:
-        bool: True if the given `var` is an instance of Parameter,
-        False if not.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            param = fluid.default_main_program().global_block().var('fc.w')
-            res = fluid.io.is_parameter(param)
-    """
-    return isinstance(var, Parameter)
-
-
-def is_persistable(var):
-    """
-    Check whether the given variable is persistable.
-
-    Args:
-        var(Variable): The variable to be checked.
-
-    Returns:
-        bool: True if the given `var` is persistable
-        False if not.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            param = fluid.default_main_program().global_block().var('fc.b')
-            res = fluid.io.is_persistable(param)
-    """
-    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-            var.desc.type() == core.VarDesc.VarType.READER:
-        return False
-    return var.persistable
-
-
-def _clone_var_in_block_(block, var):
-    assert isinstance(var, Variable)
-    if var.desc.type() == core.VarDesc.VarType.LOD_TENSOR:
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            lod_level=var.lod_level,
-            persistable=True)
-    else:
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            persistable=True)
-
-
-def _get_valid_program(main_program):
-    if main_program is None:
-        main_program = default_main_program()
-    elif isinstance(main_program, CompiledProgram):
-        main_program = main_program._program
-        if main_program is None:
-            raise TypeError("program should be as Program type or None")
-        warnings.warn(
-            "The input is a CompiledProgram, this is not recommended.")
-    if not isinstance(main_program, Program):
-        raise TypeError("program should be as Program type or None")
-    return main_program
-
-
-def save_vars(executor,
-              dirname,
-              main_program=None,
-              vars=None,
-              predicate=None,
-              filename=None):
-    """
-    Save variables to the given directory by executor.
-
-    There are two ways to specify variables to be saved: The first way, list
-    variables in a list and assign it to the `vars`. The second way, assign the
-    `main_program` with an existing program, then all variables in the program
-    will be saved. The first way has a higher priority. In other words, if `vars`
-    are assigned, the `main_program` and the `predicate` will be ignored.
-
-    The `dirname` are used to specify the folder where to save variables.
-    If you prefer to save variables in separate files in the folder `dirname`,
-    set `filename` None; if you prefer to save all variables in a single file,
-    use `filename` to specify it.
-
-    Args:
-        executor(Executor): The executor to run for saving variables.
-        dirname(str): The directory path.
-        main_program(Program|None): The program whose variables will be saved.
-                                    If it is None, the default main program will
-                                    be used automatically.
-                                    Default: None
-        vars(list[Variable]|None): The list that contains all variables to save.
-                                   It has a higher priority than the `main_program`.
-                                   Default: None
-        predicate(function|None): If it is not None, only variables in the
-                                  `main_program` that makes predicate(variable)==True
-                                  will be saved. It only works when we are using the
-                                  `main_program` to specify variables (In other words
-                                  `vars` is None).
-                                  Default: None
-        filename(str|None): The file which to save all variables. If you prefer to save
-                            variables separately, set it to None.
-                            Default: None
-
-    Returns:
-        None
-
-    Raises:
-        TypeError: If `main_program` is not an instance of Program nor None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            main_prog = fluid.Program()
-            startup_prog = fluid.Program()
-            with fluid.program_guard(main_prog, startup_prog):
-                data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False)
-                w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32', name='fc_w')
-                b = fluid.layers.create_parameter(shape=[200], dtype='float32', name='fc_b')
-                hidden_w = fluid.layers.matmul(x=data, y=w)
-                hidden_b = fluid.layers.elementwise_add(hidden_w, b)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_prog)
-
-            param_path = "./my_paddle_model"
-            # The first usage: using `main_program` to specify variables
-            def name_has_fc(var):
-                res = "fc" in var.name
-                return res
-            fluid.io.save_vars(executor=exe, dirname=param_path, main_program=main_prog,
-                               vars=None, predicate = name_has_fc)
-            # All variables in `main_program` whose name includes "fc" will be saved.
-            # And variables are going to be saved separately.
-
-
-            # The second usage: using `vars` to specify variables
-            var_list = [w, b]
-            path = "./my_paddle_vars"
-            fluid.io.save_vars(executor=exe, dirname=path, vars=var_list,
-                               filename="vars_file")
-            # var_a, var_b and var_c will be saved. And they are going to be
-            # saved in the same file named 'var_file' in the path "./my_paddle_vars".
-    """
-    save_dirname = os.path.normpath(dirname)
-    main_program = _get_valid_program(main_program)
-
-    if vars is None:
-        save_vars(
-            executor,
-            main_program=main_program,
-            dirname=save_dirname,
-            vars=list(filter(predicate, main_program.list_vars())),
-            filename=filename)
-    else:
-        # give warning when there is no var in model
-        if len(list(vars)) == 0:
-            warnings.warn(
-                "no variable in your model, please ensure there are any variables in your model to save"
-            )
-            return None
-
-        save_program = Program()
-        save_block = save_program.global_block()
-
-        save_var_map = {}
-        for each_var in vars:
-            # NOTE: don't save the variable which type is RAW
-            if each_var.type == core.VarDesc.VarType.RAW:
-                continue
-            new_var = _clone_var_in_block_(save_block, each_var)
-            if filename is None:
-                save_file_path = os.path.join(save_dirname, new_var.name)
-                save_file_path = os.path.normpath(save_file_path)
-                save_block.append_op(
-                    type='save',
-                    inputs={'X': [new_var]},
-                    outputs={},
-                    attrs={'file_path': save_file_path})
-            else:
-                save_var_map[new_var.name] = new_var
-
-        if filename is not None:
-            save_var_list = []
-            for name in sorted(save_var_map.keys()):
-                save_var_list.append(save_var_map[name])
-
-            save_block.append_op(
-                type='save_combine',
-                inputs={'X': save_var_list},
-                outputs={},
-                attrs={'file_path': os.path.join(save_dirname, filename)})
-
-        executor.run(save_program)
-
-
-def save_params(executor, dirname, main_program=None, filename=None):
-    """
-    This function filters out all parameters from the give `main_program`
-    and then save them to the folder `dirname` or the file `filename`.
-
-    Use the `dirname` to specify the saving folder. If you would like to
-    save parameters in separate files, set `filename` None; if you would
-    like to save all parameters in a single file, use `filename` to specify
-    the file name.
-
-    NOTICE: Some variables are not Parameter while they are necessary for
-    training. So you can NOT save and continue your training just by
-    `save_params()` and `load_params()`. Please use `save_persistables()`
-    and `load_persistables()` instead. If you want to save your model for
-    the inference, please use the `save_inference_model` API. You can refer
-    to :ref:`api_guide_model_save_reader_en` for more details.
-
-    Args:
-        executor(Executor): The executor to run for saving parameters.
-        dirname(str): The saving directory path.
-        main_program(Program|None): The program whose parameters will be
-                                    saved. If it is None, the default
-                                    main program will be used automatically.
-                                    Default: None
-        filename(str|None): The file to save all parameters. If you prefer
-                            to save parameters in differnet files, set it
-                            to None.
-                            Default: None
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = fluid.default_main_program()
-            fluid.io.save_params(executor=exe, dirname=param_path,
-                                 main_program=None)
-    """
-    save_vars(
-        executor,
-        dirname=dirname,
-        main_program=main_program,
-        vars=None,
-        predicate=is_parameter,
-        filename=filename)
-
-
-def _save_distributed_persistables(executor, dirname, main_program):
-    """
-    save_persistables for distributed training.
-    the method will do things listed below:
-    1.save part of persistable variables on trainer.
-    2.receive "remote prefetch variables" from parameter servers and merge them.
-    3.save "distributed lookup table" on parameter servers.
-    4.receive "optimizer variables" from parameter servers and merge them.
-
-    Args:
-        executor(Executor): The executor to run for saving parameters.
-        dirname(str): The saving directory path.
-        main_program(Program): The program whose parameters will be
-                            saved. the main_program must be the trainer_program
-                            get after transpiler.
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            t = distribute_transpiler.DistributeTranspiler()
-            t.transpile(...)
-            train_program = t.get_trainer_program()
-            _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program)
-    """
-
-    def __save_remote_params(executor, dirname, remote_params_map):
-        """
-        recive params on pserver through rpc.
-        if the params are be sliced, will concat them to one, then save it.
-        """
-        if not remote_params_map:
-            return
-
-        prog = Program()
-        block = prog.global_block()
-
-        # recv optimize vars from pserver
-        for name, remote_params in remote_params_map.items():
-            origin_var = None
-            is_slice = False
-            slice_vars = [0] * len(remote_params)
-            slice_var_names = [""] * len(remote_params)
-            endpoints = [""] * len(remote_params)
-
-            for idx, optimizer in enumerate(remote_params):
-                origin = optimizer.origin
-                slice = optimizer.slice
-                is_slice = optimizer.is_slice
-                block_id = optimizer.block_id
-                endpoint = optimizer.endpoint
-
-                if idx == 0:
-                    origin_var = block.create_var(
-                        name=origin.name,
-                        type=origin.type,
-                        shape=origin.shape,
-                        dtype=origin.dtype,
-                        persistable=True)
-
-                slice_var = block.create_var(
-                    name="{}.slice.{}".format(slice.name, idx),
-                    type=slice.type,
-                    shape=slice.shape,
-                    dtype=slice.dtype,
-                    persistable=True)
-
-                index = block_id if is_slice else idx
-                slice_vars[index] = slice_var
-                slice_var_names[index] = slice.name
-                endpoints[index] = endpoint
-
-            if is_slice:
-                block.append_op(
-                    type='recv',
-                    inputs={"X": []},
-                    outputs={"Out": slice_vars},
-                    attrs={
-                        "epmap": endpoints,
-                        "with_barrier": False,
-                        "varnames": slice_var_names,
-                        "sync_mode": True
-                    })
-                block.append_op(
-                    type='concat',
-                    inputs={'X': slice_vars},
-                    outputs={'Out': origin_var},
-                    attrs={})
-            else:
-                block.append_op(
-                    type='recv',
-                    inputs={"X": []},
-                    outputs={"Out": [origin_var]},
-                    attrs={
-                        "epmap": endpoints[:1],
-                        "with_barrier": False,
-                        "varnames": slice_var_names,
-                        "sync_mode": True
-                    })
-            block.append_op(
-                type='save',
-                inputs={'X': [origin_var]},
-                outputs={},
-                attrs={'file_path': os.path.join(dirname, origin_var.name)})
-            block.append_op(type='delete_var', inputs={'X': slice_vars})
-        executor.run(prog)
-
-    def __save_distributed_lookup_tables(executor, dirname,
-                                         distributed_lookup_table, endpoints):
-        """
-        because the distributed lookup table may too huge to merge and save at one place,
-        it will be saved at parameter server independent respectively.
-
-        the save directory is dirname/"__lookup_table__".
-
-        """
-        prog = Program()
-        block = prog.global_block()
-
-        # if there is lookup table, the trainer 0 will notify all pserver to save.
-        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-        attrs = {}
-        attrs['epmap'] = endpoints
-        attrs['dir'] = lookup_table_filename
-        attrs['lookup_table'] = distributed_lookup_table
-        block.append_op(
-            type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
-        executor.run(prog)
-
-    def __exclude_vars(exclude_var_names=[]):
-        def is_valid(var):
-            if var.name in exclude_var_names:
-                return False
-            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                        var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                        var.desc.type() == core.VarDesc.VarType.READER:
-                return False
-            return var.persistable
-
-        return is_valid
-
-    if not isinstance(main_program, Program):
-        raise TypeError("'main_program' should be an instance of Program.")
-
-    if not main_program._is_distributed:
-        raise ValueError(
-            "'_save_distributed_persistables' just be designed for distributed training."
-        )
-
-    remote_params_map = main_program._parameters_on_pservers.get_distributed_vars_by_vtypes(
-        ["Optimizer", "RemotePrefetch"], groupby=True)
-
-    exclude_var_names = []
-    if remote_params_map:
-        exclude_var_names.extend(remote_params_map.keys())
-
-    if main_program._distributed_lookup_table:
-        if isinstance(main_program._distributed_lookup_table, list):
-            exclude_var_names.extend(main_program._distributed_lookup_table)
-        else:
-            exclude_var_names.append(main_program._distributed_lookup_table)
-
-    local_vars = list(
-        filter(__exclude_vars(exclude_var_names), main_program.list_vars()))
-    save_vars(
-        executor, main_program=main_program, dirname=dirname, vars=local_vars)
-
-    if main_program._is_chief:
-        if remote_params_map:
-            __save_remote_params(executor, dirname, remote_params_map)
-        if main_program._distributed_lookup_table:
-            __save_distributed_lookup_tables(
-                executor, dirname, main_program._distributed_lookup_table,
-                main_program._endpoints)
-
-
-def save_persistables(executor, dirname, main_program=None, filename=None):
-    """
-    This function filters out all variables with `persistable==True` from the
-    give `main_program` and then saves these variables to the folder `dirname`
-    or file `filename`.
-
-    The `dirname` is used to specify the folder where persistable variables
-    are going to be saved. If you would like to save variables in separate
-    files, set `filename` None; if you would like to save all variables in a
-    single file, use `filename` to specify the file name.
-
-    Args:
-        executor(Executor): The executor to run for saving persistable variables.
-        dirname(str): The directory path.
-        main_program(Program|None): The program whose persistbale variables will
-                                    be saved. If it is None, the default main
-                                    program will be used automatically.
-                                    Default: None
-        filename(str|None): The file to saved all variables. If you prefer to
-                            save variables in differnet files, set it to None.
-                            Default: None
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            # `prog` can be a program defined by the user
-            prog = fluid.default_main_program()
-            fluid.io.save_persistables(executor=exe, dirname=param_path,
-                                       main_program=prog)
-    """
-    if main_program and main_program._is_distributed:
-        _save_distributed_persistables(
-            executor, dirname=dirname, main_program=main_program)
-    else:
-        save_vars(
-            executor,
-            dirname=dirname,
-            main_program=main_program,
-            vars=None,
-            predicate=is_persistable,
-            filename=filename)
-
-
-def load_vars(executor,
-              dirname,
-              main_program=None,
-              vars=None,
-              predicate=None,
-              filename=None):
-    """
-    Load variables from the given directory by executor.
-
-    There are two ways to specify variables to be loaded: The first way, list
-    variables in a list and assign it to the `vars`. The second way, assign the
-    `main_program` with an existing program, then all variables in the program
-    will be loaded. The first way has a higher priority. In other words if `vars`
-    are assigned, the `main_program` and the `predicate` will be ignored.
-
-    The `dirname` are used to specify the folder where to load variables.
-    If variables were saved in separate files in the folder `dirname`,
-    set `filename` None; if all variables were saved in a single file,
-    use `filename` to specify it.
-
-    Args:
-        executor(Executor): The executor to run for loading variables.
-        dirname(str): The directory path.
-        main_program(Program|None): The program whose variables will be loaded.
-                                    If it is None, the default main program will
-                                    be used automatically.
-                                    Default: None
-        vars(list[Variable]|None): The list that contains all variables to load.
-                                   It has a higher priority than the `main_program`.
-                                   Default: None
-        predicate(function|None): If it is not None, only variables in the
-                                  `main_program` that makes predicate(variable)==True
-                                  will be loaded. It only works when we are using the
-                                  `main_program` to specify variables (In other words
-                                  `vars` is None).
-                                  Default: None
-        filename(str|None): The file which saved all required variables. If variables
-                            were saved in differnet files, set it to None.
-                            Default: None
-
-    Returns:
-        None
-
-    Raises:
-        TypeError: If `main_program` is not an instance of Program nor None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            main_prog = fluid.Program()
-            startup_prog = fluid.Program()
-            with fluid.program_guard(main_prog, startup_prog):
-                data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False)
-                w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32', name='fc_w')
-                b = fluid.layers.create_parameter(shape=[200], dtype='float32', name='fc_b')
-                hidden_w = fluid.layers.matmul(x=data, y=w)
-                hidden_b = fluid.layers.elementwise_add(hidden_w, b)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_prog)
-
-            param_path = "./my_paddle_model"
-            # The first usage: using `main_program` to specify variables
-            def name_has_fc(var):
-                res = "fc" in var.name
-                return res
-            fluid.io.save_vars(executor=exe, dirname=param_path, main_program=main_prog,
-                              vars=None, predicate=name_has_fc)
-            fluid.io.load_vars(executor=exe, dirname=param_path, main_program=main_prog,
-                               vars=None, predicate=name_has_fc)
-            # All variables in `main_program` whose name includes "fc" will be loaded.
-            # And all the variables are supposed to have been saved in differnet files.
-
-            # The second usage: using `vars` to specify variables
-            path = "./my_paddle_vars"
-            var_list = [w, b]
-            fluid.io.save_vars(executor=exe, dirname=path, vars=var_list,
-                               filename="vars_file")
-            fluid.io.load_vars(executor=exe, dirname=path, vars=var_list,
-                               filename="vars_file")
-            # w and b will be loaded. And they are supposed to haven
-            # been saved in the same file named 'var_file' in the path "./my_paddle_vars".
-    """
-    load_dirname = os.path.normpath(dirname)
-
-    if vars is None:
-        if main_program is None:
-            main_program = default_main_program()
-        if not isinstance(main_program, Program):
-            raise TypeError("program's type should be Program")
-
-        load_vars(
-            executor,
-            dirname=load_dirname,
-            main_program=main_program,
-            vars=list(filter(predicate, main_program.list_vars())),
-            filename=filename)
-    else:
-        load_prog = Program()
-        load_block = load_prog.global_block()
-
-        if main_program is None:
-            main_program = default_main_program()
-
-        if not isinstance(main_program, Program):
-            raise TypeError("program should be as Program type or None")
-
-        load_var_map = {}
-        for each_var in vars:
-            assert isinstance(each_var, Variable)
-            if each_var.type == core.VarDesc.VarType.RAW:
-                continue
-            new_var = _clone_var_in_block_(load_block, each_var)
-            if filename is None:
-                load_block.append_op(
-                    type='load',
-                    inputs={},
-                    outputs={'Out': [new_var]},
-                    attrs={
-                        'file_path': os.path.join(load_dirname, new_var.name)
-                    })
-            else:
-                load_var_map[new_var.name] = new_var
-
-        if filename is not None:
-            load_var_list = []
-            for name in sorted(load_var_map.keys()):
-                load_var_list.append(load_var_map[name])
-
-            load_block.append_op(
-                type='load_combine',
-                inputs={},
-                outputs={"Out": load_var_list},
-                attrs={'file_path': os.path.join(load_dirname, filename)})
-        executor.run(load_prog)
-
-
-def load_params(executor, dirname, main_program=None, filename=None):
-    """
-    This function filters out all parameters from the give `main_program`
-    and then trys to load these parameters from the folder `dirname` or
-    the file `filename`.
-
-    Use the `dirname` to specify the folder where parameters were saved. If
-    parameters were saved in separate files in the folder `dirname`, set
-    `filename` None; if all parameters were saved in a single file, use
-    `filename` to specify the file name.
-
-    NOTICE: Some variables are not Parameter while they are necessary for
-    training. So you can NOT save and continue your training just by
-    `save_params()` and `load_params()`. Please use `save_persistables()`
-    and `load_persistables()` instead.
-    If you want to load the pre-trained model structure and parameters
-    for the inference, please use the `load_inference_model` API. You can
-    refer to :ref:`api_guide_model_save_reader_en` for more details.
-
-    Args:
-        executor(Executor): The executor to run for loading parameters.
-        dirname(str): The directory path.
-        main_program(Program|None): The program whose parameters will be
-                                    loaded. If it is None, the default
-                                    main program will be used automatically.
-                                    Default: None
-        filename(str|None): The file which saved all parameters. If parameters
-                            were saved in differnet files, set it to None.
-                            Default: None
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = fluid.default_main_program()
-            fluid.io.load_params(executor=exe, dirname=param_path,
-                                main_program=None)
-    """
-    load_vars(
-        executor,
-        dirname=dirname,
-        main_program=main_program,
-        predicate=is_parameter,
-        filename=filename)
-
-
-def load_persistables(executor, dirname, main_program=None, filename=None):
-    """
-    This function filters out all variables with `persistable==True` from the
-    give `main_program` and then trys to load these variables from the folder
-    `dirname` or the file `filename`.
-
-    Use the `dirname` to specify the folder where persistable variables were
-    saved. If variables were saved in separate files, set `filename` None;
-    if all variables were saved in a single file, use `filename` to specify
-    the file name.
-
-    Args:
-        executor(Executor): The executor to run for loading persistable variables.
-        dirname(str): The directory path.
-        main_program(Program|None): The program whose persistbale variables will
-                                    be loaded. If it is None, the default main
-                                    program will be used automatically.
-                                    Default: None
-        filename(str|None): The file which saved all variables. If variables were
-                            saved in differnet files, set it to None.
-                            Default: None
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = fluid.default_main_program()
-            fluid.io.load_persistables(executor=exe, dirname=param_path,
-                                       main_program=None)
-    """
-
-    if main_program and main_program._is_distributed:
-        _load_distributed_persistables(
-            executor, dirname=dirname, main_program=main_program)
-    else:
-        load_vars(
-            executor,
-            dirname=dirname,
-            main_program=main_program,
-            predicate=is_persistable,
-            filename=filename)
-
-
-def _load_distributed_persistables(executor, dirname, main_program=None):
-    """
-    customized load_persistables for distributed training.
-    it should be used on parameter server,
-
-    Args:
-        executor(Executor): The executor to run for saving parameters.
-        dirname(str): The load directory path.
-        main_program(Program): The program whose parameters will be
-                            loaded. the main_program must be the pserver_program
-                            get after transpiler.
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            t = distribute_transpiler.DistributeTranspiler()
-            t.transpile(...)
-            pserver_prog = t.get_pserver_program(...)
-            _load_distributed_persistables(executor=exe, dirname=param_path, main_program=pserver_prog)
-    """
-
-    def __is_distributed_part_var(varname):
-        trainer_idx = varname.find(".trainer_")
-        block_idx = varname.find(".block")
-        return trainer_idx or block_idx
-
-    def __load_persistable_vars(executor, dirname, need_load_vars):
-        load_prog = Program()
-        load_block = load_prog.global_block()
-        need_delete_vars = []
-
-        for param in need_load_vars:
-            origin_var = param.origin
-            slice_var = param.slice
-            is_slice = param.is_slice
-            offset = param.offset
-
-            if is_slice:
-                origin = load_block.create_var(
-                    name="{}.load".format(origin_var.name),
-                    type=origin_var.type,
-                    shape=origin_var.shape,
-                    dtype=origin_var.dtype,
-                    persistable=True)
-
-                load_block.append_op(
-                    type='load',
-                    inputs={},
-                    outputs={'Out': [origin]},
-                    attrs={
-                        'file_path': os.path.join(dirname, origin_var.name)
-                    })
-
-                slice = load_block.create_var(
-                    name=slice_var.name,
-                    type=slice_var.type,
-                    shape=slice_var.shape,
-                    dtype=slice_var.dtype,
-                    persistable=True)
-
-                dim1_flatten = 1
-                if len(slice.shape) >= 2:
-                    dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
-
-                start = int(offset / dim1_flatten)
-                end = int(offset / dim1_flatten + slice.shape[0])
-
-                load_block.append_op(
-                    type="slice",
-                    inputs={'Input': origin},
-                    outputs={'Out': slice},
-                    attrs={'axes': [0],
-                           'starts': [start],
-                           'ends': [end]})
-
-                need_delete_vars.append(origin)
-            else:
-                origin = load_block.create_var(
-                    name="{}".format(origin_var.name),
-                    type=origin_var.type,
-                    shape=origin_var.shape,
-                    dtype=origin_var.dtype,
-                    persistable=True)
-                load_block.append_op(
-                    type='load',
-                    inputs={},
-                    outputs={'Out': [origin]},
-                    attrs={
-                        'file_path': os.path.join(dirname, origin_var.name)
-                    })
-
-        load_block.append_op(
-            type='delete_var',
-            inputs={'X': need_delete_vars}, )
-
-        executor.run(load_prog)
-
-    if not isinstance(main_program, Program):
-        raise TypeError("'main_program' should be an instance of Program.")
-
-    if not main_program._is_distributed:
-        raise ValueError(
-            "'_load_distributed_persistables' just be designed for distributed training."
-        )
-
-    if not main_program._ps_endpoint:
-        raise ValueError(
-            "'_load_distributed_persistables' need current_endpoint set in DistributeTranspiler.transpile"
-        )
-
-    need_load_vars = main_program._parameters_on_pservers.get_distributed_vars_by_ep(
-        main_program._ps_endpoint)
-    __load_persistable_vars(executor, dirname, need_load_vars)
-
-
-def prepend_feed_ops(inference_program,
-                     feed_target_names,
-                     feed_holder_name='feed'):
-    if len(feed_target_names) == 0:
-        return
-
-    global_block = inference_program.global_block()
-    feed_var = global_block.create_var(
-        name=feed_holder_name,
-        type=core.VarDesc.VarType.FEED_MINIBATCH,
-        persistable=True)
-
-    for i, name in enumerate(feed_target_names):
-        out = global_block.var(name)
-        global_block._prepend_op(
-            type='feed',
-            inputs={'X': [feed_var]},
-            outputs={'Out': [out]},
-            attrs={'col': i})
-
-
-def append_fetch_ops(inference_program,
-                     fetch_target_names,
-                     fetch_holder_name='fetch'):
-    global_block = inference_program.global_block()
-    fetch_var = global_block.create_var(
-        name=fetch_holder_name,
-        type=core.VarDesc.VarType.FETCH_LIST,
-        persistable=True)
-
-    for i, name in enumerate(fetch_target_names):
-        global_block.append_op(
-            type='fetch',
-            inputs={'X': [name]},
-            outputs={'Out': [fetch_var]},
-            attrs={'col': i})
-
-
-def save_inference_model(dirname,
-                         feeded_var_names,
-                         target_vars,
-                         executor,
-                         main_program=None,
-                         model_filename=None,
-                         params_filename=None,
-                         export_for_deployment=True,
-                         program_only=False):
-    """
-    Prune the given `main_program` to build a new program especially for inference,
-    and then save it and all related parameters to given `dirname` by the `executor`.
-    If you just want to save parameters of your trained model, please use the
-    `save_params` API. You can refer to :ref:`api_guide_model_save_reader_en` for
-    more details.
-
-
-    Args:
-        dirname(str): The directory path to save the inference model.
-        feeded_var_names(list[str]): Names of variables that need to be feeded data
-                                     during inference.
-        target_vars(list[Variable]): Variables from which we can get inference
-                                     results.
-        executor(Executor): The executor that saves the inference model.
-        main_program(Program|None): The original program, which will be pruned to
-                                    build the inference model. If is setted None,
-                                    the default main program will be used.
-                                    Default: None.
-        model_filename(str|None): The name of file to save the inference program
-                                  itself. If is setted None, a default filename
-                                  `__model__` will be used.
-        params_filename(str|None): The name of file to save all related parameters.
-                                   If it is setted None, parameters will be saved
-                                   in separate files .
-        export_for_deployment(bool): If True, programs are modified to only support
-                                     direct inference deployment. Otherwise,
-                                     more information will be stored for flexible
-                                     optimization and re-training. Currently, only
-                                     True is supported.
-        program_only(bool): If True, It will save inference program only, and do not save params of Program.
-
-    Returns:
-        target_var_name_list(list): The fetch variables' name list
-
-    Raises:
-        ValueError: If `feed_var_names` is not a list of basestring.
-        ValueError: If `target_vars` is not a list of Variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            path = "./infer_model"
-
-            # User defined network, here a softmax regresssion example
-            image = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace())
-            predict = fluid.layers.fc(input=image, size=10, act='softmax')
-
-            loss = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_loss = fluid.layers.mean(loss)
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(fluid.default_startup_program())
-
-            # Feed data and train process
-
-            # Save inference model. Note we don't save label and loss in this example
-            fluid.io.save_inference_model(dirname=path,
-                                          feeded_var_names=['img'],
-                                          target_vars=[predict],
-                                          executor=exe)
-
-            # In this example, the function will prune the default main program
-            # to make it suitable for infering the `predict` var. The pruned
-            # inference program is going to be saved in the "./infer_model/__model__"
-            # and parameters are going to be saved in separate files under folder
-            # "./infer_model".
-
-    """
-    if isinstance(feeded_var_names, six.string_types):
-        feeded_var_names = [feeded_var_names]
-    elif export_for_deployment:
-        if len(feeded_var_names) > 0:
-            # TODO(paddle-dev): polish these code blocks
-            if not (bool(feeded_var_names) and all(
-                    isinstance(name, six.string_types)
-                    for name in feeded_var_names)):
-                raise ValueError("'feed_var_names' should be a list of str.")
-
-    if isinstance(target_vars, Variable):
-        target_vars = [target_vars]
-    elif export_for_deployment:
-        if not (bool(target_vars) and
-                all(isinstance(var, Variable) for var in target_vars)):
-            raise ValueError("'target_vars' should be a list of Variable.")
-
-    main_program = _get_valid_program(main_program)
-
-    # remind user to set auc_states to zeros if the program contains auc op 
-    all_ops = main_program.global_block().ops
-    for op in all_ops:
-        if op.type == 'auc':
-            warnings.warn(
-                "please ensure that you have set the auc states to zeros before saving inference model"
-            )
-            break
-
-    # fix the bug that the activation op's output as target will be pruned.
-    # will affect the inference performance.
-    # TODO(Superjomn) add an IR pass to remove 1-scale op.
-    with program_guard(main_program):
-        uniq_target_vars = []
-        for i, var in enumerate(target_vars):
-            if isinstance(var, Variable):
-                var = layers.scale(
-                    var, 1., name="save_infer_model/scale_{}".format(i))
-            uniq_target_vars.append(var)
-        target_vars = uniq_target_vars
-    target_var_name_list = [var.name for var in target_vars]
-
-    # when a pserver and a trainer running on the same machine, mkdir may conflict
-    save_dirname = dirname
-    try:
-        save_dirname = os.path.normpath(dirname)
-        os.makedirs(save_dirname)
-    except OSError as e:
-        if e.errno != errno.EEXIST:
-            raise
-
-    if model_filename is not None:
-        model_basename = os.path.basename(model_filename)
-    else:
-        model_basename = "__model__"
-    model_basename = os.path.join(save_dirname, model_basename)
-
-    # When export_for_deployment is true, we modify the program online so that
-    # it can only be loaded for inference directly. If it's false, the whole
-    # original program and related meta are saved so that future usage can be
-    # more flexible.
-
-    origin_program = main_program.clone()
-
-    if export_for_deployment:
-        main_program = main_program.clone()
-        global_block = main_program.global_block()
-        need_to_remove_op_index = []
-        for i, op in enumerate(global_block.ops):
-            op.desc.set_is_target(False)
-            if op.type == "feed" or op.type == "fetch":
-                need_to_remove_op_index.append(i)
-
-        for index in need_to_remove_op_index[::-1]:
-            global_block._remove_op(index)
-
-        main_program.desc.flush()
-
-        main_program = main_program._prune(feeded_var_names, target_vars)
-        main_program = main_program._inference_optimize(prune_read_op=True)
-        fetch_var_names = [v.name for v in target_vars]
-
-        prepend_feed_ops(main_program, feeded_var_names)
-        append_fetch_ops(main_program, fetch_var_names)
-
-        with open(model_basename, "wb") as f:
-            f.write(main_program.desc.serialize_to_string())
-    else:
-        # TODO(panyx0718): Save more information so that it can also be used
-        # for training and more flexible post-processing.
-        with open(model_basename + ".main_program", "wb") as f:
-            f.write(main_program.desc.serialize_to_string())
-
-    if program_only:
-        warnings.warn(
-            "save_inference_model specified the param `program_only` to True, It will not save params of Program."
-        )
-        return target_var_name_list
-
-    main_program._copy_dist_param_info_from(origin_program)
-
-    if params_filename is not None:
-        params_filename = os.path.basename(params_filename)
-
-    save_persistables(executor, save_dirname, main_program, params_filename)
-    return target_var_name_list
-
-
-def load_inference_model(dirname,
-                         executor,
-                         model_filename=None,
-                         params_filename=None,
-                         pserver_endpoints=None):
-    """
-    Load inference model from a directory. By this API, you can get the model
-    structure(inference program) and model parameters. If you just want to load
-    parameters of the pre-trained model, please use the `load_params` API.
-    You can refer to :ref:`api_guide_model_save_reader_en` for more details.
-
-    Args:
-        dirname(str): The directory path
-        executor(Executor): The executor to run for loading inference model.
-        model_filename(str|None): The name of file to load inference program.
-                                  If it is None, the default filename
-                                  '__model__' will be used.
-                                  Default: None
-        params_filename(str|None): The name of file to load all parameters.
-                                   It is only used for the case that all
-                                   parameters were saved in a single binary
-                                   file. If parameters were saved in separate
-                                   files, set it as 'None'.
-        pserver_endpoints(list|None): This only need by distributed inference.
-                                    When use distributed look up table in training,
-                                    We also need it in inference.The parameter is
-                                    a list of pserver endpoints.
-
-    Returns:
-        tuple: The return of this function is a tuple with three elements:
-        (program, feed_target_names, fetch_targets). The `program` is a
-        Program, it's the program for inference. The `feed_target_names` is
-        a list of str, it contains Names of variables that need to feed
-        data in the inference program. The `fetch_targets` is a list of
-        Variable. It contains variables from which we can get inference
-        results.
-
-    Raises:
-        ValueError: If `dirname` is not a existing directory.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            main_prog = fluid.Program()
-            startup_prog = fluid.Program()
-            with fluid.program_guard(main_prog, startup_prog):
-                data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False)
-                w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32')
-                b = fluid.layers.create_parameter(shape=[200], dtype='float32')
-                hidden_w = fluid.layers.matmul(x=data, y=w)
-                hidden_b = fluid.layers.elementwise_add(hidden_w, b)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_prog)
-            path = "./infer_model"
-            fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'],
-                         target_vars=[hidden_b], executor=exe, main_program=main_prog)
-            tensor_img = np.array(np.random.random((1, 64, 784)), dtype=np.float32)
-            [inference_program, feed_target_names, fetch_targets] = (
-                fluid.io.load_inference_model(dirname=path, executor=exe))
-            results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_img},
-                          fetch_list=fetch_targets)
-
-            # endpoints is your pserver endpoints list, the above is just an example
-            endpoints = ["127.0.0.1:2023","127.0.0.1:2024"]
-            # if we need lookup table, we will use:
-            [dist_inference_program, dist_feed_target_names, dist_fetch_targets] = (
-                fluid.io.load_inference_model(dirname=path,
-                                              executor=exe,
-                                              pserver_endpoints=endpoints))
-
-            # In this example, the inference program was saved in the
-            # "./infer_model/__model__" and parameters were saved in
-            # separate files in "./infer_model".
-            # After getting inference program, feed target names and
-            # fetch targets, we can use an Executor to run the inference
-            # program to get the inference result.
-    """
-    load_dirname = os.path.normpath(dirname)
-    if not os.path.isdir(load_dirname):
-        raise ValueError("There is no directory named '%s'", dirname)
-
-    if model_filename is not None:
-        model_filename = os.path.basename(model_filename)
-    else:
-        model_filename = "__model__"
-    model_filename = os.path.join(load_dirname, model_filename)
-
-    if params_filename is not None:
-        params_filename = os.path.basename(params_filename)
-
-    with open(model_filename, "rb") as f:
-        program_desc_str = f.read()
-
-    program = Program.parse_from_string(program_desc_str)
-    if not core._is_program_version_supported(program._version()):
-        raise ValueError("Unsupported program version: %d\n" %
-                         program._version())
-    # Binary data also need versioning.
-    load_persistables(executor, load_dirname, program, params_filename)
-
-    if pserver_endpoints:
-        program = _endpoints_replacement(program, pserver_endpoints)
-
-    feed_target_names = program.desc.get_feed_target_names()
-    fetch_target_names = program.desc.get_fetch_target_names()
-    fetch_targets = [
-        program.global_block().var(name) for name in fetch_target_names
-    ]
-
-    return [program, feed_target_names, fetch_targets]
-
-
-def _endpoints_replacement(program, endpoints):
-    ENDPOINT_MAP = "epmap"
-    for op in program.global_block().ops:
-        if op.has_attr(ENDPOINT_MAP):
-            op.set_attr(ENDPOINT_MAP, endpoints)
-    program._sync_with_cpp()
-    return program
-
-
-def get_parameter_value(para, executor):
-    """
-    Get the LoDTensor value of the given parameter.
-
-    Args:
-        para(Parameter): The parameter to get value from.
-        executor(Executor): The executor to run for retrieving the value.
-
-    Returns:
-        numpy.array: The given parameter's values.
-
-    Raises:
-        AssertionError: If the `para` is not an instance of Parameter.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            exe = fluid.Executor(fluid.CPUPlace())
-            param = fluid.default_main_program().global_block().var('fc.w')
-            p = fluid.io.get_parameter_value(param, exe)
-
-    """
-    assert is_parameter(para)
-
-    get_program = Program()
-    block = get_program.global_block()
-    new_var = _clone_var_in_block_(block, para)
-    return executor.run(get_program, feed={}, fetch_list=[new_var])[0]
-
-
-def get_parameter_value_by_name(name, executor, program=None):
-    """
-    Get the LoDTensor value of a certain parameter by its name.
-
-    Args:
-        name(str): The parameter's name.
-        executor(Executor): The executor to run for retrieving the value.
-        program(Program | None): The program where to find the parameter.
-                               If it's set to be None, the function will
-                               try to find the parameter in the default
-                               main program.
-
-    Returns:
-        numpy.array: The parameter's values.
-
-    Raises:
-        TypeError: If given `name` is not an instance of basestring.
-        TypeError: If the parameter with the given name doesn't exist.
-        AssertionError: If there is a varibale named `name` in the
-                        given program but it is not a Parameter.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            exe = fluid.Executor(fluid.CPUPlace())
-            p = fluid.io.get_parameter_value('fc.w', exe)
-    """
-    if program is None:
-        program = default_main_program()
-    var = program.global_block().var(name)
-    return get_parameter_value(var, executor)
-
-
-def _save_persistable_nodes(executor, dirname, graph):
-    """
-    Save persistable nodes to the given directory by the executor.
-
-    Args:
-        executor(Executor): The executor to run for saving node values.
-        dirname(str): The directory path.
-        graph(IrGraph): All the required persistable nodes in the graph will be saved.
-    """
-    persistable_node_names = set()
-    persistable_nodes = []
-    all_persistable_nodes = graph.all_persistable_nodes()
-    for node in all_persistable_nodes:
-        name = cpt.to_text(node.name())
-        if name not in persistable_node_names:
-            persistable_node_names.add(name)
-            persistable_nodes.append(node)
-    program = Program()
-    var_list = []
-    for node in persistable_nodes:
-        var_desc = node.var()
-        if var_desc.type() == core.VarDesc.VarType.RAW or \
-                var_desc.type() == core.VarDesc.VarType.READER:
-            continue
-        var = program.global_block().create_var(
-            name=var_desc.name(),
-            shape=var_desc.shape(),
-            dtype=var_desc.dtype(),
-            type=var_desc.type(),
-            lod_level=var_desc.lod_level(),
-            persistable=var_desc.persistable())
-        var_list.append(var)
-    save_vars(executor=executor, dirname=dirname, vars=var_list)
-
-
-def _load_persistable_nodes(executor, dirname, graph):
-    """
-    Load persistable node values from the given directory by the executor.
-
-    Args:
-        executor(Executor): The executor to run for loading node values.
-        dirname(str): The directory path.
-        graph(IrGraph): All the required persistable nodes in the graph will be loaded.
-    """
-    persistable_node_names = set()
-    persistable_nodes = []
-    all_persistable_nodes = graph.all_persistable_nodes()
-    for node in all_persistable_nodes:
-        name = cpt.to_text(node.name())
-        if name not in persistable_node_names:
-            persistable_node_names.add(name)
-            persistable_nodes.append(node)
-    program = Program()
-    var_list = []
-
-    def _exist(var):
-        return os.path.exists(os.path.join(dirname, var.name))
-
-    for node in persistable_nodes:
-        var_desc = node.var()
-        if var_desc.type() == core.VarDesc.VarType.RAW or \
-                var_desc.type() == core.VarDesc.VarType.READER:
-            continue
-        var = program.global_block().create_var(
-            name=var_desc.name(),
-            shape=var_desc.shape(),
-            dtype=var_desc.dtype(),
-            type=var_desc.type(),
-            lod_level=var_desc.lod_level(),
-            persistable=var_desc.persistable())
-        if _exist(var):
-            var_list.append(var)
-        else:
-            _logger.warn("Cannot find the var %s!!!" % (node.name()))
-    load_vars(executor=executor, dirname=dirname, vars=var_list)
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
deleted file mode 100644
index 11e3c4938bef4a3c97a724798e2f7273c25f06ed..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layer_helper.py
+++ /dev/null
@@ -1,175 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import copy
-import six
-
-from .framework import Parameter, dtype_is_floating, in_dygraph_mode
-from . import unique_name
-from paddle.fluid.initializer import Constant, Xavier
-from .param_attr import ParamAttr
-from . import core
-from six.moves import zip
-from .layer_helper_base import LayerHelperBase
-
-
-class LayerHelper(LayerHelperBase):
-    def __init__(self, layer_type, **kwargs):
-        self.kwargs = kwargs
-        name = self.kwargs.get('name', None)
-        # TODO(panyx0718, minqiyang): dygraph mode
-        # can not use both `layer_type` and `name`. Deprecate LayerHelper
-        # and write a Helper for dygraph mode.
-        if name is None:
-            self.kwargs['name'] = unique_name.generate(layer_type)
-
-        super(LayerHelper, self).__init__(
-            self.kwargs['name'], layer_type=layer_type)
-
-    def append_op(self, *args, **kwargs):
-        return self.main_program.current_block().append_op(*args, **kwargs)
-
-    def multiple_input(self, input_param_name='input'):
-        inputs = self.kwargs.get(input_param_name, [])
-        ret = []
-        if isinstance(inputs, list) or isinstance(inputs, tuple):
-            for inp in inputs:
-                ret.append(self.to_variable(inp))
-        else:
-            ret.append(self.to_variable(inputs))
-        return ret
-
-    def input(self, input_param_name='input'):
-        inputs = self.multiple_input(input_param_name)
-        if len(inputs) != 1:
-            raise "{0} layer only takes one input".format(self.layer_type)
-        return inputs[0]
-
-    @property
-    def param_attr(self):
-        return ParamAttr._to_attr(self.kwargs.get('param_attr', None))
-
-    @property
-    def bias_attr(self):
-        return ParamAttr._to_attr(self.kwargs.get('bias_attr', None))
-
-    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of param_attr
-    def multiple_param_attr(self, length):
-        param_attr = self.param_attr
-        if isinstance(param_attr, ParamAttr):
-            param_attr = [param_attr]
-
-        if len(param_attr) != 1 and len(param_attr) != length:
-            raise ValueError("parameter number mismatch")
-        elif len(param_attr) == 1 and length != 1:
-            tmp = [None] * length
-            for i in six.moves.range(length):
-                tmp[i] = copy.deepcopy(param_attr[0])
-            param_attr = tmp
-        return param_attr
-
-    def iter_inputs_and_params(self, input_param_name='input'):
-        inputs = self.multiple_input(input_param_name)
-        param_attrs = self.multiple_param_attr(len(inputs))
-        for ipt, param_attr in zip(inputs, param_attrs):
-            yield ipt, param_attr
-
-    def input_dtype(self, input_param_name='input'):
-        inputs = self.multiple_input(input_param_name)
-        dtype = None
-        for each in inputs:
-            if dtype is None:
-                dtype = each.dtype
-            elif dtype != each.dtype:
-                raise ValueError("Data Type mismatch: %d to %d" %
-                                 (dtype, each.dtype))
-        return dtype
-
-    def get_parameter(self, name):
-        param = self.main_program.global_block().var(name)
-        if not isinstance(param, Parameter):
-            raise ValueError("no Parameter name %s found" % name)
-        return param
-
-    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of bias_attr
-    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
-        """
-        Append bias operator and return its output. If the user does not set
-        bias_attr, append_bias_op will return input_var
-
-        :param input_var: the input variable. The len(input_var.shape) is
-        larger or equal than 2.
-        :bias_initializer: an instance of a subclass of Initializer used to
-        initialize the bias
-        :param dim_start:
-        :param dim_end: the shape of the bias will be
-        input_var.shape[dim_start:dim_end]. The bias is broadcasted to other
-        dimensions and added to input_var to get the output
-        """
-        size = list(input_var.shape[dim_start:dim_end])
-        bias_attr = self.bias_attr
-        if not bias_attr:
-            return input_var
-
-        b = self.create_parameter(
-            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
-        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
-        self.append_op(
-            type='elementwise_add',
-            inputs={'X': [input_var],
-                    'Y': [b]},
-            outputs={'Out': [tmp]},
-            attrs={'axis': dim_start})
-        return tmp
-
-    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of act
-    def append_activation(self, input_var):
-        act = self.kwargs.get('act', None)
-        if act is None:
-            return input_var
-        if isinstance(act, six.string_types):
-            act = {'type': act}
-        else:
-            raise TypeError(str(act) + " should be unicode or str")
-
-        if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'):
-            act['use_cudnn'] = self.kwargs.get('use_cudnn')
-        if 'use_mkldnn' in self.kwargs:
-            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
-        act_type = act.pop('type')
-
-        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
-        self.append_op(
-            type=act_type,
-            inputs={"X": [input_var]},
-            outputs={"Out": [tmp]},
-            attrs=act)
-        return tmp
-
-    #TODO (jiabin): should we remove this since it has never be used
-    def _get_default_initializer(self, dtype):
-        if dtype is None or dtype_is_floating(dtype) is True:
-            return Xavier()
-        else:
-            # For integer and boolean types, initialize with all zeros
-            return Constant()
-
-    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of kwargs
-    def is_instance(self, param_name, cls):
-        param = self.kwargs.get(param_name, None)
-        if not isinstance(param, cls):
-            raise TypeError("The input {0} parameter of method {1} must be {2}",
-                            param_name, self.layer_type, cls.__name__)
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
deleted file mode 100644
index 5fa7fef381061a5480d133acbc5729acf9e63d5f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layer_helper_base.py
+++ /dev/null
@@ -1,397 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import copy
-import numpy as np
-
-from .framework import Variable, default_main_program, default_startup_program, in_dygraph_mode, _current_expected_place
-from . import unique_name
-from .param_attr import ParamAttr, WeightNormParamAttr
-from . import core
-
-
-class LayerHelperBase(object):
-    def __init__(self, name, layer_type):
-        self._layer_type = layer_type
-        self._name = name
-
-    @property
-    def name(self):
-        return self._name
-
-    @property
-    def layer_type(self):
-        return self._layer_type
-
-    @property
-    def main_program(self):
-        return default_main_program()
-
-    @property
-    def startup_program(self):
-        return default_startup_program()
-
-    def to_variable(self, value, block=None):
-        """convert value to variable
-
-            Args:
-                value: value to be convert
-                block: the block of the variable
-
-        Return Variable construct from value
-        """
-        if isinstance(value, np.ndarray):
-            assert in_dygraph_mode(
-            ), "to_variable could only be called in dygraph mode"
-
-            if not block:
-                block = default_main_program().current_block()
-            py_var = Variable(
-                block,
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                name=None,
-                shape=value.shape,
-                dtype=value.dtype)
-            var = py_var._ivar.value()
-            tensor = var.get_tensor()
-            tensor.set(value, _current_expected_place())
-            return py_var
-        elif isinstance(value, Variable):
-            return value
-
-    def _create_weight_normalize(self, attr, shape, dtype):
-        from .layers import elementwise_mul, elementwise_div, reshape
-
-        # Remove these ops when LayerHelper and layers support indicating
-        # program and block.
-        def __norm_op(x,
-                      out=None,
-                      p=2,
-                      dim=None,
-                      keep_dim=False,
-                      block=self.startup_program.global_block()):
-            if out is None:
-                out = block.create_var(
-                    name=unique_name.generate_with_ignorable_key(".".join(
-                        [self.name, 'weight_norm_norm'])),
-                    dtype=dtype,
-                    persistable=False)
-            abs_out = block.create_var(
-                name=unique_name.generate_with_ignorable_key(".".join(
-                    [self.name, 'weight_norm_abs'])),
-                dtype=dtype,
-                persistable=False)
-            block.append_op(
-                type='abs', inputs={'X': x}, outputs={'Out': abs_out})
-            pow_out = block.create_var(
-                name=unique_name.generate_with_ignorable_key(".".join(
-                    [self.name, 'weight_norm_pow'])),
-                dtype=dtype,
-                persistable=False)
-            block.append_op(
-                type='pow',
-                inputs={'X': abs_out},
-                outputs={'Out': pow_out},
-                attrs={'factor': float(p)})
-            sum_out = block.create_var(
-                name=unique_name.generate_with_ignorable_key(".".join(
-                    [self.name, 'weight_norm_sum'])),
-                dtype=dtype,
-                persistable=False)
-            block.append_op(
-                type='reduce_sum',
-                inputs={'X': pow_out},
-                outputs={'Out': sum_out},
-                attrs={
-                    'dim': dim,
-                    'keep_dim': keep_dim,
-                    'reduce_all': True if dim is None else False
-                })
-            block.append_op(
-                type='pow',
-                inputs={'X': sum_out},
-                outputs={'Out': out},
-                attrs={'factor': 1. / p})
-            return out
-
-        def __reshape_op(x,
-                         shape,
-                         out=None,
-                         block=self.startup_program.global_block()):
-            if out is None:
-                out = block.create_var(
-                    name=unique_name.generate_with_ignorable_key(".".join(
-                        [self.name, 'weight_norm_reshape'])),
-                    dtype=dtype,
-                    persistable=False)
-            block.append_op(
-                type='reshape',
-                inputs={'X': x},
-                outputs={'Out': out},
-                attrs={'shape': shape})
-            return out
-
-        def __transpose_op(x,
-                           axis,
-                           out=None,
-                           block=self.startup_program.global_block()):
-            if out is None:
-                out = block.create_var(
-                    name=unique_name.generate_with_ignorable_key(".".join(
-                        [self.name, 'weight_norm_transpose'])),
-                    dtype=dtype,
-                    persistable=False)
-            block.append_op(
-                type='transpose',
-                inputs={'X': x},
-                outputs={'Out': out},
-                attrs={'axis': axis})
-            return out
-
-        def __norm_except_dim(x,
-                              out=None,
-                              dim=None,
-                              block=self.startup_program.global_block()):
-            """Computes the norm over all dimensions except dim"""
-            if out is None:
-                out = block.create_var(
-                    name=unique_name.generate_with_ignorable_key(".".join(
-                        [self.name, 'weight_norm_norm'])),
-                    dtype=dtype,
-                    persistable=False)
-            if dim is None:
-                __norm_op(x, out, dim=dim, block=block)
-            elif dim == 0:
-                out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
-                reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
-                norm = __norm_op(reshape, dim=[1], block=block)
-                __reshape_op(norm, out=out, shape=out_shape, block=block)
-            elif dim == len(x.shape) - 1:
-                out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
-                reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
-                norm = __norm_op(reshape, dim=[0], block=block)
-                __reshape_op(norm, out=out, shape=out_shape, block=block)
-            else:
-                perm = list(range(len(x.shape)))
-                perm[0], perm[dim] = dim, 0
-                transpose = __transpose_op(x, perm, block=block)
-                out_shape = [transpose.shape[0]] + [1] * (len(transpose.shape) -
-                                                          1)
-                reshape = __reshape_op(
-                    transpose, shape=[transpose.shape[0], -1], block=block)
-                norm = __norm_op(reshape, dim=[1], block=block)
-                reshape2 = __reshape_op(norm, shape=out_shape, block=block)
-                __transpose_op(reshape2, perm, out=out, block=block)
-            return out
-
-        def __weight_normalize(g, v, dim):
-            """Calculations for weight normalization"""
-            norm = __norm_except_dim(
-                v, dim=dim, block=self.main_program.current_block())
-            scale = elementwise_div(
-                x=g, y=norm)  # The shapes of g and norm are the same.
-            # Currently, elementwise_mul only support broadcast when the shape
-            # of y is a subset of the shape of x. Thus, we reshape y to squeeze
-            # to achive the subset.
-            w = elementwise_mul(
-                x=v,
-                y=scale if dim is None else reshape(
-                    x=scale, shape=[v.shape[dim]]),
-                axis=-1 if dim is None else dim)
-            # To serialize the original parameter for inference, maybe a
-            # parameter rather than a variable should be returned.
-            return w
-
-        g_param_attr = copy.deepcopy(attr)
-        g_param_attr.name = attr.name + '_g'
-        g_param_shape = [1] * len(shape)
-        if attr.dim is not None:
-            g_param_shape[attr.dim] = shape[attr.dim]
-        v_param_attr = copy.deepcopy(attr)
-        v_param_attr.name = attr.name + '_v'
-        v_param_shape = shape
-
-        # Add to startup_program to initialize g and v.
-        # Try to reconstruct the initializer of w by initializing g and v.
-        # Set the initializers of g and v as below, then the distribution
-        # of w is the same as initializing w with the given initializer.
-        # For Data-Dependent Initialization, please compute the init-values
-        # of g and v in external and then feed the values to g and v by
-        # executing an extra program.
-        g_param = self.startup_program.global_block().create_parameter(
-            dtype=dtype,
-            shape=g_param_shape,
-            **g_param_attr._to_kwargs(with_initializer=False))
-        v_param = self.startup_program.global_block().create_parameter(
-            dtype=dtype,
-            shape=v_param_shape,
-            **v_param_attr._to_kwargs(with_initializer=True))
-        __norm_except_dim(
-            x=v_param,
-            out=g_param,
-            dim=attr.dim,
-            block=self.startup_program.global_block())
-
-        # keep g_param shape to be consistent with that in main_program
-        __reshape_op(
-            g_param,
-            g_param_shape,
-            out=g_param,
-            block=self.startup_program.global_block())
-
-        # Add weight normalization to main_program
-        g_param = self.main_program.global_block().create_parameter(
-            dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs())
-        v_param = self.main_program.global_block().create_parameter(
-            dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs())
-        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
-        return w_param
-
-    # TODO: hide the func after we move the layers to Layers
-    def create_parameter(self,
-                         attr,
-                         shape,
-                         dtype,
-                         is_bias=False,
-                         default_initializer=None,
-                         stop_gradient=False):
-        """Create parameters for this layers.
-
-           Args:
-               attr: [ParamAttr] should be the parameter attribute for this parameter
-               shape: shape of the paramter
-               dtype: data type of this parameter
-               is_bias: if this is a bias parameter
-               default_initializer: set the default initializer for this parameter
-
-        Returns created parameter Variable.
-        """
-        # Deepcopy the attr so that parameters can be shared in program
-        attr = copy.deepcopy(attr)
-        attr = ParamAttr._to_attr(attr)
-        if not attr:
-            return None
-        assert isinstance(attr, ParamAttr)
-        suffix = 'b' if is_bias else 'w'
-        if attr.name is None:
-            attr.name = unique_name.generate(".".join([self.name, suffix]))
-
-        if default_initializer is None and attr.initializer is None:
-            if isinstance(dtype, core.VarDesc.VarType):
-                if dtype != core.VarDesc.VarType.FP32 and \
-                        dtype != core.VarDesc.VarType.FP64 and \
-                        dtype != core.VarDesc.VarType.FP16:
-                    raise TypeError(
-                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
-                    )
-            else:
-                if not (dtype.startswith("float") or dtype == "double"):
-                    raise TypeError(
-                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
-                    )
-            if is_bias:
-                attr._set_default_bias_initializer()
-            else:
-                attr._set_default_param_initializer()
-        else:
-            attr._set_default_initializer(default_initializer)
-
-        # If weight normalization is set, insert extra parameters and ops.
-        # Refer to https://arxiv.org/pdf/1602.07868.pdf
-        if isinstance(attr, WeightNormParamAttr):
-            param = self._create_weight_normalize(attr, shape, dtype)
-            WeightNormParamAttr.params_with_weight_norm.append(param)
-            return param
-        if in_dygraph_mode():
-            # In dygraph mode, we want the returned parameter to be
-            # initialized so that it can be used imperatively.
-            return self.main_program.global_block().create_parameter(
-                dtype=dtype,
-                shape=shape,
-                stop_gradient=stop_gradient,
-                **attr._to_kwargs(with_initializer=True))
-        else:
-            self.startup_program.global_block().create_parameter(
-                dtype=dtype,
-                shape=shape,
-                **attr._to_kwargs(with_initializer=True))
-            return self.main_program.global_block().create_parameter(
-                dtype=dtype, shape=shape, **attr._to_kwargs())
-
-    def create_variable_for_type_inference(self, dtype, stop_gradient=False):
-        """Create a temporary variable that should be type inferred layer.
-
-        Note:
-            The default type will be set to LOD_TENSOR. However, when
-            the var is used as operator output, its type will be updated
-            based on operator's `VarTypeInference` implementation in
-            infer_var_type.
-        """
-        return self.main_program.current_block().create_var(
-            name=unique_name.generate_with_ignorable_key(".".join(
-                [self.name, 'tmp'])),
-            dtype=dtype,
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            persistable=False,
-            stop_gradient=stop_gradient)
-
-    def create_variable(self, *args, **kwargs):
-        """Create Variable for this layers.
-        Returns created Variable.
-        """
-        return self.main_program.current_block().create_var(*args, **kwargs)
-
-    def create_global_variable(self, persistable=False, *args, **kwargs):
-        """
-        create global variable, note that there is no initializer for this global variable.
-        Args:
-            persistable(bool): True if it is a checkpoint value.
-            *args: See create_var's documentation
-            **kwargs: See create_var's documentation
-
-        Returns(Variable): the created variable.
-        """
-        return self.main_program.global_block().create_var(
-            *args, persistable=persistable, **kwargs)
-
-    def create_or_get_global_variable(self, name, *args, **kwargs):
-        """
-        Creates a global variable if not exists and returns the variable and
-        a boolean flag which is true when it is a new variable.
-        """
-        if self.main_program.global_block().has_var(name):
-            return self.main_program.global_block().var(name), False
-        else:
-            return self.create_global_variable(name=name, *args, **kwargs), True
-
-    def set_variable_initializer(self, var, initializer):
-        """Set target Variable's initializer
-
-           Args:
-               var: target Variable
-               initializer: initializer to use
-        """
-        assert isinstance(var, Variable)
-        if in_dygraph_mode():
-            initializer(var, var.block)
-        else:
-            self.startup_program.global_block().create_var(
-                name=var.name,
-                type=var.type,
-                dtype=var.dtype,
-                shape=var.shape,
-                persistable=True,
-                initializer=initializer)
diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py
deleted file mode 100644
index d17636d6d54f9c8b00f289b9af961553651b775e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/__init__.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import ops
-from .ops import *
-from . import nn
-from .nn import *
-from . import io
-from .io import *
-from . import tensor
-from .tensor import *
-from . import control_flow
-from .control_flow import *
-from . import device
-from .device import *
-from . import math_op_patch
-from .math_op_patch import *
-from . import detection
-from .detection import *
-from . import metric_op
-from .metric_op import *
-from .learning_rate_scheduler import *
-from .collective import *
-from .distributions import *
-
-__all__ = []
-__all__ += nn.__all__
-__all__ += io.__all__
-__all__ += tensor.__all__
-__all__ += control_flow.__all__
-__all__ += ops.__all__
-__all__ += device.__all__
-__all__ += detection.__all__
-__all__ += metric_op.__all__
-__all__ += learning_rate_scheduler.__all__
-__all__ += distributions.__all__
diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py
deleted file mode 100644
index 9e96624cf7c70f24a7f65a91c7ee41af45ddeb6c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/collective.py
+++ /dev/null
@@ -1,180 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from ..layer_helper import LayerHelper, unique_name
-from ..framework import Variable
-
-
-def _allreduce(x, out=None, reduce_type="sum", sync_mode=False):
-    helper = LayerHelper("allreduce", **locals())
-    # Convert string reduce type to op int type
-    red_typ_int = 0
-    if reduce_type == "sum":
-        red_typ_int = 0
-    elif reduce_type == "prod":
-        red_typ_int = 1
-    elif reduce_type == "max":
-        red_typ_int = 2
-    elif reduce_type == "min":
-        red_typ_int = 3
-    else:
-        raise TypeError("reduce type can only be [sum|prod|max|min]")
-
-    if out is None:
-        out = helper.create_variable(
-            name=unique_name.generate_with_ignorable_key(".".join(
-                [x.name, 'tmp'])),
-            shape=x.shape,
-            dtype=x.dtype,
-            type=x.type,
-            persistable=x.persistable,
-            stop_gradient=True)
-    helper.append_op(
-        type='allreduce',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={"reduce_type": red_typ_int,
-               "sync_mode": sync_mode})
-    return out
-
-
-def _broadcast(x, root, sync_mode=False):
-    helper = LayerHelper("broadcast", **locals())
-    helper.append_op(
-        type='broadcast',
-        inputs={'X': [x]},
-        outputs={'Out': [x]},
-        attrs={"sync_mode": sync_mode,
-               "root": root})
-    return x
-
-
-def _c_allreduce(x,
-                 out=None,
-                 reduce_type='sum',
-                 ring_id=0,
-                 use_calc_stream=False):
-    helper = LayerHelper('c_allreduce', **locals())
-
-    if reduce_type not in ['sum', 'prob', 'max', 'min']:
-        raise TypeError('reduce type can only be "sum|prod|max|min]"')
-
-    op_type = 'c_allreduce_' + reduce_type
-    if out is None:
-        out = helper.create_variable(
-            name=unique_name.generate_with_ignorable_key('.'.join(
-                [x.name, op_type])),
-            shape=x.shape,
-            dtype=x.dtype,
-            type=x.type,
-            persistable=x.persistable)
-
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'ring_id': ring_id,
-               'use_calc_stream': use_calc_stream})
-    return out
-
-
-def _c_broadcast(x, root=0, ring_id=0, use_calc_stream=False):
-    op_type = 'c_broadcast'
-    helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [x]},
-        attrs={
-            'root': root,
-            'ring_id': ring_id,
-            'use_calc_stream': use_calc_stream
-        })
-    return x
-
-
-def _c_allgather(x, nranks, ring_id=0, use_calc_stream=False):
-    op_type = 'c_allgather'
-    helper = LayerHelper(op_type, **locals())
-    out_shape = list(x.shape[:])
-    if out_shape[0] > 0:
-        out_shape[0] *= nranks
-    out = helper.create_variable(
-        name=unique_name.generate_with_ignorable_key('.'.join(
-            [x.name, op_type])),
-        shape=out_shape,
-        dtype=x.dtype,
-        type=x.type,
-        persistable=x.persistable)
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={
-            'nranks': nranks,
-            'ring_id': ring_id,
-            'use_calc_stream': use_calc_stream
-        })
-    return out
-
-
-def _c_reducescatter(x, nranks, ring_id=0, use_calc_stream=False):
-    if not isinstance(x, Variable):
-        raise TypeError('x must be a Variable')
-
-    if x.shape[0] % nranks != 0:
-        raise ValueError('x.shape[0](%d) cannot be evenly divided by nranks(%d)'
-                         % (x.shape[0], nranks))
-
-    op_type = 'c_reducescatter'
-    helper = LayerHelper(op_type, **locals())
-    out_shape = list(x.shape[:])
-    if out_shape[0] > 0:
-        out_shape[0] //= nranks
-    out = helper.create_variable(
-        name=unique_name.generate_with_ignorable_key('.'.join(
-            [x.name, op_type])),
-        shape=out_shape,
-        dtype=x.dtype,
-        type=x.type,
-        persistable=x.persistable)
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={
-            'nranks': nranks,
-            'ring_id': ring_id,
-            'use_calc_stream': use_calc_stream
-        })
-    return out
-
-
-def _c_sync_calc_stream(x):
-    op_type = 'c_sync_calc_stream'
-    helper = LayerHelper(op_type, **locals())
-    helper.append_op(type=op_type, inputs={'X': [x]}, outputs={'Out': [x]})
-    return x
-
-
-def _c_sync_comm_stream(x, ring_id):
-    op_type = 'c_sync_comm_stream'
-    helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [x]},
-        attrs={'ring_id': ring_id})
-    return x
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
deleted file mode 100644
index 69a7c019710db31ecb84a8ceb35a437f81f3b6f6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/control_flow.py
+++ /dev/null
@@ -1,2212 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from ..wrapped_decorator import signature_safe_contextmanager
-
-from .layer_function_generator import autodoc, templatedoc
-from .tensor import assign, fill_constant
-from .. import core
-from ..framework import Program, Variable, Operator
-from ..layer_helper import LayerHelper, unique_name
-from ..initializer import force_init_on_cpu
-from .nn import logical_and, logical_not, logical_or
-import numpy
-import warnings
-import six
-from functools import reduce
-
-__all__ = [
-    'While', 'Switch', 'increment', 'array_write', 'create_array', 'less_than',
-    'less_equal', 'greater_than', 'greater_equal', 'equal', 'not_equal',
-    'array_read', 'array_length', 'IfElse', 'DynamicRNN', 'StaticRNN',
-    'reorder_lod_tensor_by_rank', 'Print', 'is_empty'
-]
-
-
-def split_lod_tensor(input, mask, level=0):
-    """
-    This function takes in an input that contains the complete lod information,
-    and takes in a mask which is used to mask certain parts of the input.
-    The output is the true branch and the false branch with the mask applied to
-    the input at a certain level in the tensor. Mainly used in IfElse to split
-    data into two parts.
-
-    Args:
-        input(tuple|list|None): The input tensor that contains complete
-                                lod information needed to construct the output.
-        mask(list): A bool column vector which masks the input.
-        level(int): The specific lod level to split.
-
-    Returns:
-        tuple(Variable, Variable):
-        The true branch of tensor as per the mask applied to input.
-
-        The false branch of tensor as per the mask applied to input.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          x = fluid.layers.data(name='x', shape=[1])
-          x.persistable = True
-
-          y = fluid.layers.data(name='y', shape=[1])
-          y.persistable = True
-
-          out_true, out_false = fluid.layers.split_lod_tensor(
-                input=x, mask=y, level=level)
-
-    """
-    helper = LayerHelper('split_lod_tensor', **locals())
-    out_true = helper.create_variable_for_type_inference(dtype=input.dtype)
-    out_false = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='split_lod_tensor',
-        inputs={
-            'X': input,
-            'Mask': mask,
-        },
-        outputs={'OutTrue': out_true,
-                 'OutFalse': out_false},
-        attrs={'level': level})
-    return out_true, out_false
-
-
-def merge_lod_tensor(in_true, in_false, x, mask, level=0):
-    """
-    **merge_lod_tensor**
-
-    This function takes in an input :math:`x`, the True branch, the False
-    branch and a binary :math:`mask`. Using this information, this function
-    merges the True and False branches of the tensor into a single tensor as
-    output at a certain lod level indicated by :math:`level`. Used in IfElse
-    to merge the output if True block and False Block.
-
-    Args:
-        in_true(tuple|list|None): The True branch to be merged.
-        in_false(tuple|list|None): The False branch to be merged.
-        x(tuple|list|None): The input tensor that contains complete
-                            lod information needed to construct the output.
-        mask(list): A bool column vector which masks the input.
-        level(int): The specific lod level to merge.
-
-    Returns:
-        Variable: The merged output tensor.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          x = layers.data(
-                      name='x', shape=[1], dtype='float32', stop_gradient=False)
-          y = layers.data(
-                name='y', shape=[1], dtype='bool', stop_gradient=False)
-
-          level = 0
-
-          out_true, out_false = layers.split_lod_tensor(
-                input=x, mask=y, level=level)
-          out = layers.merge_lod_tensor(
-                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
-    """
-    helper = LayerHelper('merge_lod_tensor', **locals())
-    out = helper.create_variable_for_type_inference(dtype=in_true.dtype)
-    helper.append_op(
-        type='merge_lod_tensor',
-        inputs={'X': x,
-                'Mask': mask,
-                'InTrue': in_true,
-                'InFalse': in_false},
-        outputs={'Out': out},
-        attrs={'level': level})
-    return out
-
-
-def Print(input,
-          first_n=-1,
-          message=None,
-          summarize=20,
-          print_tensor_name=True,
-          print_tensor_type=True,
-          print_tensor_shape=True,
-          print_tensor_lod=True,
-          print_phase='both'):
-    '''
-    **Print operator**
-
-    This creates a print op that will print when a tensor is accessed.
-
-    Wraps the tensor passed in so that whenever that a tensor is accessed,
-    the message `message` is printed, along with the current value of the
-    tensor `t`.
-
-    Args:
-        input (Variable): A Tensor to print.
-        summarize (int): Print this number of elements in the tensor, will print
-                all if left is negative.
-        message (str): A string message to print as a prefix.
-        first_n (int): Only log `first_n` number of times.
-        print_tensor_name (bool): Print the tensor name.
-        print_tensor_type (bool): Print the tensor type.
-        print_tensor_shape (bool): Print the tensor shape.
-        print_tensor_lod (bool): Print the tensor lod.
-        print_phase (str): Which phase to displace, including 'forward',
-                'backward' and 'both'. If set to 'backward' or 'both', will
-                print the gradients of input tensor.
-
-    Returns:
-        Variable: Output tensor.
-
-    NOTES:
-        The input and output are two different variables, and in the
-        following process, you should use the output variable but not the input,
-        otherwise, the print layer doesn't have backward.
-
-    Examples:
-        .. code-block:: python
-           
-           import paddle.fluid as fluid
-           
-           input = fluid.layers.fill_constant(shape=[10,2], value=3, dtype='int64')
-           input = fluid.layers.Print(input, message="The content of input layer:")
-           
-           main_program = fluid.default_main_program()
-           exe = fluid.Executor(fluid.CPUPlace())
-           exe.run(main_program)
-
-    Output at runtime:
-        .. code-block:: bash 
-           
-           1564546375   The content of input layer:     The place is:CPUPlace
-           Tensor[fill_constant_0.tmp_0]
-               shape: [10,2,]
-               dtype: x
-               data: 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 
-               
-           # The information of dtype at runtime may vary in different environments.
-           # Eg: 
-           #    If the dtype='int64' of Tensor y, the corresponding c++ type is int64_t.
-           #    The dtype of output is "x" ("x" is typeid(int64_t).name()) with MacOS and gcc4.8.2
-    '''
-    helper = LayerHelper('print' + "_" + input.name, **locals())
-    output = helper.create_variable_for_type_inference(input.dtype)
-    helper.append_op(
-        type='print',
-        inputs={'In': input},
-        outputs={'Out': output},
-        attrs={
-            'first_n': first_n,
-            'summarize': summarize,
-            'message': message or "",
-            'print_tensor_name': print_tensor_name,
-            'print_tensor_type': print_tensor_type,
-            'print_tensor_shape': print_tensor_shape,
-            'print_tensor_lod': print_tensor_lod,
-            'print_phase': print_phase.upper()
-        })
-    return output
-
-
-class BlockGuard(object):
-    """
-    BlockGuard class.
-
-    BlockGuard class is used to create a sub-block in a program by
-    using the Python `with` keyword.
-    """
-
-    def __init__(self, main_program):
-        if not isinstance(main_program, Program):
-            raise TypeError("BlockGuard takes a program")
-        self.main_program = main_program
-
-    def __enter__(self):
-        self.main_program._create_block()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.main_program._rollback()
-        if exc_type is not None:
-            return False  # re-raise exception
-        return True
-
-
-class BlockGuardWithCompletion(BlockGuard):
-    """
-    BlockGuardWithCompletion class.
-
-    BlockGuardWithCompletion class is used to create an op with a block in a program.
-    """
-
-    def __init__(self, rnn):
-        if not isinstance(rnn, StaticRNN):
-            raise TypeError("BlockGuardWithCompletion takes a StaticRNN")
-        super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program)
-        self.rnn = rnn
-
-    def __enter__(self):
-        self.rnn.status = StaticRNN.IN_RNN_BLOCK
-        return super(BlockGuardWithCompletion, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
-        self.rnn._complete_op()
-        return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val,
-                                                              exc_tb)
-
-
-class StaticRNNMemoryLink(object):
-    """
-    StaticRNNMemoryLink class.
-
-    StaticRNNMemoryLink class is used to create a link between two
-    memory cells of a StaticRNN.
-
-
-    NOTE: This is a internal data structure of a very low-level API.
-    Please use StaticRNN instead.
-
-    Args:
-        init(Variable): the initial variable for Memory.
-        pre_mem(Variable): the memory variable in previous time step.
-        mem(Variable): the memory variable in current time step.
-    """
-
-    def __init__(self, init, pre_mem, mem=None):
-        self.init = init
-        self.pre_mem = pre_mem
-        self.mem = mem
-
-
-class StaticRNN(object):
-    """
-    StaticRNN class.
-
-    The StaticRNN can process a batch of sequence data. The length of each
-    sample sequence must be equal. The StaticRNN will have its own parameters
-    like inputs, outputs, memories. **Note that the first dimension of inputs
-    represents sequence length, and all the sequence length of inputs must be
-    the same. And the meaning of each axis of input and output are the same.**
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-
-            vocab_size, hidden_size=10000, 200
-            x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64')
-            x_emb = layers.embedding(
-                input=x,
-                size=[vocab_size, hidden_size],
-                dtype='float32',
-                is_sparse=False)
-            x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
-
-            rnn = fluid.layers.StaticRNN()
-            with rnn.step():
-                word = rnn.step_input(x_emb)
-                prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word)
-                hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu')
-                rnn.update_memory(prev, hidden)  # set prev to hidden
-                rnn.step_output(hidden)
-                rnn.output(word)
-
-            result = rnn()
-
-    The StaticRNN will unfold sequence into time steps. Users need to define
-    how to process each time step during the :code:`with` step.
-
-    The :code:`memory` is used as a staging data cross time step. The initial
-    value of memory can be a variable that is filled with a constant value or
-    a specified variable.
-
-    The StaticRNN can mark multiple variables as its output. Use `rnn()` to
-    get the output sequence.
-    """
-    BEFORE_RNN_BLOCK = 0
-    IN_RNN_BLOCK = 1
-    AFTER_RNN_BLOCK = 2
-
-    def __init__(self, name=None):
-        self.helper = LayerHelper("static_rnn", name=name)
-        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
-        self.inputs = []  # input variable list in current block
-        self.outputs = []  # output variable list in parent block
-        self.status = StaticRNN.BEFORE_RNN_BLOCK  # status flag.
-        # sequence length, since it is a static RNN, sequence length are fixed.
-        self.seq_len = None
-
-    def step(self):
-        """
-        The block for user to define operators in RNN.
-        """
-        return BlockGuardWithCompletion(self)
-
-    def _assert_in_rnn_block_(self, method):
-        if self.status != StaticRNN.IN_RNN_BLOCK:
-            raise ValueError("You must invoke {0} in rnn block".format(method))
-
-    def memory(self,
-               init=None,
-               shape=None,
-               batch_ref=None,
-               init_value=0.0,
-               init_batch_dim_idx=0,
-               ref_batch_dim_idx=1):
-        """
-        Create a memory variable for static rnn.
-
-        If the :code:`init` is not None, :code:`memory` will be initialized by
-        this Variable. If the :code:`init` is None, :code:`shape` and :code:`batch_ref`
-        must be set, and this function will initialize a :code:`init` Variable.
-
-        Args:
-            init(Variable|None): The initialized variable. If it is not set,
-                :code:`shape` and :code:`batch_ref` must be provided.
-                Default: None.
-            shape(list|tuple): The shape of the boot memory. NOTE the shape
-                does not contain batch_size. Default: None.
-            batch_ref(Variable|None): The batch size reference Variable.
-                Default: None.
-            init_value(float): the init value of boot memory. Default: 0.0.
-            init_batch_dim_idx(int): the batch_size axis of the
-                :code:`init` Variable. Default: 0.
-            ref_batch_dim_idx(int): the batch_size axis of the
-                :code:`batch_ref` Variable. Default: 1.
-
-        Returns:
-            The memory variable.
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                import paddle.fluid.layers as layers
-
-                vocab_size, hidden_size=10000, 200
-                x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64')
-                x_emb = layers.embedding(
-                    input=x,
-                    size=[vocab_size, hidden_size],
-                    dtype='float32',
-                    is_sparse=False)
-                x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
-
-                rnn = fluid.layers.StaticRNN()
-                with rnn.step():
-                    word = rnn.step_input(x_emb)
-                    prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word)
-                    hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu')
-                    rnn.update_memory(prev, hidden)
-        """
-        self._assert_in_rnn_block_('memory')
-        if init is None:
-            if shape is None or batch_ref is None:
-                raise ValueError(
-                    "if init is None, memory at least need shape and batch_ref")
-            parent_block = self._parent_block()
-            var_name = unique_name.generate_with_ignorable_key("@".join(
-                [self.helper.name, "memory_boot"]))
-            boot_var = parent_block.create_var(
-                name=var_name,
-                shape=shape,
-                dtype=batch_ref.dtype,
-                persistable=False)
-
-            parent_block.append_op(
-                type="fill_constant_batch_size_like",
-                inputs={'Input': [batch_ref]},
-                outputs={'Out': [boot_var]},
-                attrs={
-                    'value': init_value,
-                    'shape': boot_var.shape,
-                    'dtype': boot_var.dtype,
-                    'input_dim_idx': ref_batch_dim_idx,
-                    'output_dim_idx': init_batch_dim_idx
-                })
-
-            return self.memory(init=boot_var)
-        else:
-            pre_mem = self.helper.create_variable(
-                name=unique_name.generate_with_ignorable_key("@".join(
-                    [self.helper.name, "mem"])),
-                dtype=init.dtype,
-                shape=init.shape)
-            self.memories[pre_mem.name] = StaticRNNMemoryLink(
-                init=init, pre_mem=pre_mem)
-            return pre_mem
-
-    def step_input(self, x):
-        """
-        Mark a sequence as a StaticRNN input.
-
-        Args:
-            x(Variable): The input sequence, the shape of x
-                should be [seq_len, ...].
-
-        Returns:
-            The current time step in the input sequence.
-        """
-        self._assert_in_rnn_block_('step_input')
-        if not isinstance(x, Variable):
-            raise TypeError("step input takes a Variable")
-        if self.seq_len is None:
-            self.seq_len = x.shape[0]
-        elif x.shape[0] != -1 and self.seq_len != x.shape[0]:
-            raise ValueError("Static RNN only take fix seq_len input")
-
-        ipt = self.helper.create_variable(
-            name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type)
-        self.inputs.append(ipt)
-        return ipt
-
-    def step_output(self, o):
-        """
-        Mark a sequence as a StaticRNN output.
-
-        Args:
-            o(Variable): The output sequence.
-
-        Returns:
-            None.
-        """
-        self._assert_in_rnn_block_('step_output')
-        if not isinstance(o, Variable):
-            raise TypeError("step output takes a Variable")
-
-        tmp_o = self.helper.create_variable_for_type_inference(dtype=o.dtype)
-        self.helper.append_op(
-            type='rnn_memory_helper',
-            inputs={'X': [o]},
-            outputs={'Out': tmp_o},
-            attrs={'dtype': o.dtype})
-
-        out_var = self._parent_block().create_var(
-            name=tmp_o.name,
-            shape=[self.seq_len] + list(tmp_o.shape),
-            dtype=tmp_o.dtype)
-
-        self.outputs.append(out_var)
-
-    def output(self, *outputs):
-        """
-        Mark the StaticRNN output variables.
-
-        Args:
-            outputs: The output Variables.
-
-        Returns:
-            None
-        """
-        for each in outputs:
-            self.step_output(each)
-
-    def update_memory(self, mem, var):
-        """
-        Update the memory from ex_mem to new_mem. NOTE that the shape and data
-        type of :code:`ex_mem` and :code:`new_mem` must be same.
-
-        Args:
-            mem(Variable): the memory variable.
-            var(Variable): the plain variable generated in RNN block.
-
-        Returns:
-            None
-        """
-        if not isinstance(mem, Variable) or not isinstance(var, Variable):
-            raise TypeError("update memory should take variables")
-        self.memories[mem.name].mem = var
-
-    def _parent_block(self):
-        prog = self.helper.main_program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-        return parent_block
-
-    def __call__(self, *args, **kwargs):
-        if self.status != StaticRNN.AFTER_RNN_BLOCK:
-            raise ValueError("RNN output can only be retrieved after rnn block")
-        if len(self.outputs) == 0:
-            raise ValueError("RNN has no output")
-        elif len(self.outputs) == 1:
-            return self.outputs[0]
-        else:
-            return self.outputs
-
-    def _complete_op(self):
-        main_program = self.helper.main_program
-        rnn_block = main_program.current_block()
-        parent_block = self._parent_block()
-
-        local_inputs = set()
-
-        for op in rnn_block.ops:
-            assert isinstance(op, Operator)
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-
-        for var in self.inputs:
-            local_inputs.add(var.name)
-        for m in self.memories:
-            local_inputs.add(m)
-
-        # NOTE(zcd): the params have two categories of variables.
-        #   - the variables that are the out of StaticRnn.
-        #   - the variables that are the parameters of some layers, for example, conv2d.
-        params = list()
-        for op in rnn_block.ops:
-            assert isinstance(op, Operator)
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in local_inputs:
-                        params.append(in_var_name)
-
-        parameters = [parent_block.var(name) for name in set(params)]
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        inlinks = [parent_block.var(i.name) for i in self.inputs]
-        outlinks = self.outputs
-
-        # NOTE(zcd): the states maybe empty in some case.
-        boot_memories = []
-        pre_memories = []
-        memories = []
-        for _, mem in six.iteritems(self.memories):
-            boot_memories.append(mem.init)
-            pre_memories.append(mem.pre_mem.name)
-            assert mem.mem is not None, "%s should be updated in every step." % (
-                mem.init.name)
-            mem_var = rnn_block.var(mem.mem.name)
-            assert isinstance(mem_var, Variable)
-            new_mem = self.helper.create_variable_for_type_inference(
-                dtype=mem_var.dtype)
-            rnn_block.append_op(
-                type='rnn_memory_helper',
-                inputs={'X': [mem_var]},
-                outputs={'Out': [new_mem]},
-                attrs={'dtype': mem_var.dtype})
-
-            memories.append(new_mem.name)
-
-        parent_block.append_op(
-            type='recurrent',
-            inputs={
-                'inputs': inlinks,
-                'initial_states': boot_memories,
-                'parameters': parameters
-            },
-            outputs={'outputs': outlinks,
-                     'step_scopes': [step_scope]},
-            attrs={
-                'has_states': len(pre_memories) > 0,
-                'ex_states': pre_memories,
-                'states': memories,
-                'sub_block': rnn_block
-            })
-
-
-class WhileGuard(BlockGuard):
-    def __init__(self, while_op):
-        if not isinstance(while_op, While):
-            raise TypeError("WhileGuard takes a while op")
-        super(WhileGuard, self).__init__(while_op.helper.main_program)
-        self.while_op = while_op
-
-    def __enter__(self):
-        self.while_op.status = While.IN_WHILE_BLOCK
-        return super(WhileGuard, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        self.while_op.status = While.AFTER_WHILE_BLOCK
-        self.while_op._complete()
-        return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
-
-
-class While(object):
-    """
-    while loop control flow.
-
-    Args:
-        cond(Variable): condition used to compare.
-        is_test(bool): A flag indicating whether execution is in test phase.
-        name(str): The name of this layer.
-
-    Examples:
-          .. code-block:: python
-            
-            import paddle.fluid as fluid
-            
-            i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
-            d0 = fluid.layers.data("d0", shape=[10], dtype='float32')
-            data_array = fluid.layers.array_write(x=d0, i=i)
-            array_len = fluid.layers.fill_constant(shape=[1],dtype='int64', value=3)
-
-            cond = fluid.layers.less_than(x=i, y=array_len)
-            while_op = fluid.layers.While(cond=cond)
-            with while_op.block():
-                d = fluid.layers.array_read(array=data_array, i=i)
-                i = fluid.layers.increment(x=i, value=1, in_place=True)
-                fluid.layers.less_than(x=i, y=array_len, cond=cond)            
-    """
-
-    BEFORE_WHILE_BLOCK = 0
-    IN_WHILE_BLOCK = 1
-    AFTER_WHILE_BLOCK = 2
-
-    def __init__(self, cond, is_test=False, name=None):
-        self.helper = LayerHelper("while", name=name)
-        self.status = While.BEFORE_WHILE_BLOCK
-        if not isinstance(cond, Variable):
-            raise TypeError("condition should be a variable")
-        assert isinstance(cond, Variable)
-        if cond.dtype != core.VarDesc.VarType.BOOL:
-            raise TypeError("condition should be a boolean variable")
-        if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
-            raise TypeError(
-                "condition expected shape as [], but given shape as {0}.".
-                format(list(cond.shape)))
-        self.cond_var = cond
-        self.is_test = is_test
-
-    def block(self):
-        return WhileGuard(self)
-
-    def _complete(self):
-        main_program = self.helper.main_program
-        while_block = main_program.current_block()
-        parent_block = main_program.block(main_program.current_block()
-                                          .parent_idx)
-
-        inner_outputs = {self.cond_var.name}
-        x_name_list = set()
-        for op in while_block.ops:
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in inner_outputs:
-                        x_name_list.add(in_var_name)
-
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    inner_outputs.add(out_var_name)
-
-        out_vars = []
-        for inner_out_name in inner_outputs:
-            inner_var = parent_block._find_var_recursive(inner_out_name)
-            if inner_var:
-                out_vars.append(inner_var)
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        parent_block.append_op(
-            type='while',
-            inputs={
-                'X': [
-                    parent_block._var_recursive(x_name)
-                    for x_name in x_name_list
-                ],
-                'Condition': [self.cond_var]
-            },
-            outputs={'Out': out_vars,
-                     'StepScopes': [step_scope]},
-            attrs={'sub_block': while_block,
-                   "is_test": self.is_test})
-
-
-def lod_rank_table(x, level=0):
-    """
-    LoD Rank Table Operator. Given an input variable **x** and a level number
-    of LoD, this layer creates a LodRankTable object. A LoDRankTable object
-    contains a list of bi-element tuples. Each tuple consists of an index and
-    a length, both of which are int type. Refering to specified level of LoD,
-    the index is the sequence index number and the length representes the
-    sequence length. Please note that the list is ranked in descending order by
-    the length. The following is an example:
-
-        .. code-block:: text
-
-            x is a LoDTensor:
-                x.lod = [[2,                1],
-                         [5,             1, 1]]
-                x.data = [a, b, c, d, e, f, g]
-
-            1. set level to 0:
-                Create lod rank table:
-                    lod_rank_table_obj = lod_rank_table(x, level=0)
-
-                Get:
-                    lod_rank_table_obj.items() = [(0, 2), (1, 1)]
-
-            2. set level to 1:
-                Create lod rank table:
-                    lod_rank_table_obj = lod_rank_table(x, level=1)
-
-                Get:
-                    lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]
-
-    Args:
-        x (Variable): Input variable, a LoDTensor based which to create the lod
-            rank table.
-        level (int): Specify the LoD level, on which to create the lod rank
-            table.
-
-    Returns:
-        Variable: The created LoDRankTable object.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[10],
-                                  dtype='float32', lod_level=1)
-            out = layers.lod_rank_table(x=x, level=0)
-    """
-    helper = LayerHelper("lod_rank_table", **locals())
-    table = helper.create_variable(
-        type=core.VarDesc.VarType.LOD_RANK_TABLE,
-        name=unique_name.generate("lod_rank_table"))
-    helper.append_op(
-        type='lod_rank_table',
-        inputs={'X': x},
-        outputs={'Out': table},
-        attrs={'level': level})
-    return table
-
-
-@templatedoc()
-def max_sequence_len(rank_table):
-    """
-    ${comment}
-
-    >>> import paddle.fluid as fluid
-    >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32',
-    >>>                       lod_level=1)
-    >>> rank_table = layers.lod_rank_table(x=x, level=0)
-    >>> max_seq_len = layers.max_sequence_len(rank_table)
-
-    Args:
-        rank_table(${rank_table_type}): ${rank_table_comment}.
-
-    Returns:
-        ${out_comment}.
-    """
-    helper = LayerHelper("max_seqence_len", **locals())
-    res = helper.create_variable_for_type_inference(dtype="int64")
-    helper.append_op(
-        type="max_sequence_len",
-        inputs={"RankTable": rank_table},
-        outputs={"Out": res})
-    return res
-
-
-def lod_tensor_to_array(x, table):
-    """
-    Convert a LoDTensor to a LoDTensorArray.
-
-    This function split a LoDTesnor to a LoDTensorArray according to its LoD
-    information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in
-    PaddlePaddle. The generated LoDTensorArray of this function can be further read
-    or written by `read_from_array()` and `write_to_array()` operators. However,
-    this function is generally an internal component of PaddlePaddle `DynamicRNN`.
-    Users should not use it directly.
-
-    Args:
-        x (Variable|list): The LoDTensor to be converted to a LoDTensorArray.
-        table (ParamAttr|list): The variable that stores the level of lod
-                                which is ordered by sequence length in
-                                descending order. It is generally generated
-                                by `layers.lod_rank_table()` API.
-
-    Returns:
-        Variable: The LoDTensorArray that has been converted from the input tensor.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          x = fluid.layers.data(name='x', shape=[10])
-          table = fluid.layers.lod_rank_table(x, level=0)
-          array = fluid.layers.lod_tensor_to_array(x, table)
-    """
-    helper = LayerHelper("lod_tensor_to_array", **locals())
-    array = helper.create_variable(
-        name=unique_name.generate("lod_tensor_to_array"),
-        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-        dtype=x.dtype)
-    helper.append_op(
-        type='lod_tensor_to_array',
-        inputs={'X': x,
-                'RankTable': table},
-        outputs={'Out': array})
-    return array
-
-
-def array_to_lod_tensor(x, table):
-    """Convert a LoD_Tensor_Aarry to an LoDTensor.
-
-    Args:
-        x (Variable|list): The lod tensor array to be converted to a tensor.
-        table (ParamAttr|list): The variable that stores the level of lod
-                                which is ordered by sequence length in
-                                descending order.
-
-    Returns:
-        Variable: The variable of type tensor that has been converted
-                  from an array.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          x = fluid.layers.data(name='x', shape=[10])
-          table = fluid.layers.lod_rank_table(x, level=0)
-          array = fluid.layers.lod_tensor_to_array(x, table)
-          lod_tensor = fluid.layers.array_to_lod_tensor(array, table)
-    """
-    helper = LayerHelper("array_to_lod_tensor", **locals())
-    tmp = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="array_to_lod_tensor",
-        inputs={'X': x,
-                'RankTable': table},
-        outputs={'Out': tmp})
-    return tmp
-
-
-def increment(x, value=1.0, in_place=True):
-    """
-    This function performs an operation that increments the value in the
-    input :math:`x` by an amount: :math:`value` as mentioned in the input
-    parameter. This operation is performed in-place by default. Notice that
-    the number of elements in :math:`x` must be equal to 1.
-
-    Args:
-        x (Variable|list): The tensor that has the input values.
-        value (float): The amount by which the values should be incremented.
-        in_place (bool): If the increment should be performed in-place.
-
-    Returns:
-        Variable: The elementwise-incremented object.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='data', shape=[1], dtype='float32',
-                                   append_batch_size=False)
-          data = fluid.layers.increment(x=data, value=3.0, in_place=True)
-    """
-    helper = LayerHelper("increment", **locals())
-    if not in_place:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = x
-    helper.append_op(
-        type='increment',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'step': float(value)})
-    return out
-
-
-def array_write(x, i, array=None):
-    """
-    This function writes the given input variable to the specified position
-    indicating by the arrary index to an output LOD_TENSOR_ARRAY. If the
-    output LOD_TENSOR_ARRAY is not given(None), a new one will be created and
-    returned.
-
-    Args:
-        x (Variable|list): The input tensor from which the data will be read.
-        i (Variable|list): The index of the output LOD_TENSOR_ARRAY, pointing to
-                           the position to which the input tensor will be
-                           written.
-        array (Variable|list): The output LOD_TENSOR_ARRAY to which the input
-                               tensor will be written. If this parameter is
-                               NONE, a new LOD_TENSOR_ARRAY will be created and
-                               returned.
-
-    Returns:
-        Variable: The output LOD_TENSOR_ARRAY where the input tensor is written.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
-          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
-          arr = fluid.layers.array_write(tmp, i=i)
-    """
-    helper = LayerHelper('array_write', **locals())
-    if array is None:
-        array = helper.create_variable(
-            name="{0}.out".format(helper.name),
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-            dtype=x.dtype)
-    helper.append_op(
-        type='write_to_array',
-        inputs={'X': [x],
-                'I': [i]},
-        outputs={'Out': [array]})
-    return array
-
-
-def create_array(dtype):
-    """
-    **Create LoDTensorArray**
-
-    This function creates an array of LOD_TENSOR_ARRAY . It is mainly used to
-    implement RNN with array_write, array_read and While.
-
-    Args:
-        dtype (int|float): The data type of the elements in the lod_tensor_array.
-
-    Returns:
-        Variable: The lod_tensor_array variable storing the elements of data type.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.create_array(dtype='float32')
-
-    """
-    helper = LayerHelper("array", **locals())
-    return helper.create_variable(
-        name="{0}.out".format(helper.name),
-        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-        dtype=dtype)
-
-
-@templatedoc()
-def less_than(x, y, force_cpu=None, cond=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}.
-        y(${y_type}): ${y_comment}.
-        force_cpu(${force_cpu_type}): ${force_cpu_comment}.
-        cond(Variable|None): Optional output variable to store the result of *less_than*
-
-    Returns:
-        ${out_comment}.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-          limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=5)
-          cond = fluid.layers.less_than(x=label, y=limit)
-    """
-    helper = LayerHelper("less_than", **locals())
-    if cond is None:
-        cond = helper.create_variable_for_type_inference(dtype='bool')
-        cond.stop_gradient = True
-
-    attrs = dict()
-    if force_cpu is not None:
-        attrs['force_cpu'] = force_cpu
-    elif force_init_on_cpu():
-        attrs['force_cpu'] = force_init_on_cpu()
-
-    helper.append_op(
-        type='less_than',
-        inputs={'X': [x],
-                'Y': [y]},
-        outputs={'Out': [cond]},
-        attrs=attrs)
-    return cond
-
-
-@templatedoc()
-def less_equal(x, y, cond=None):
-    """
-    This layer returns the truth value of :math:`x <= y` elementwise, which is equivalent to the overloaded operator `<=`.
-
-    Args:
-        x(Variable): First operand of *less_equal*
-        y(Variable): Second operand of *less_equal*
-        cond(Variable|None): Optional output variable to store the result of *less_equal*
-
-    Returns:
-        Variable: The tensor variable storing the output of *less_equal*.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          
-          label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-          limit = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64')
-          out = fluid.layers.less_equal(x=label, y=limit)
-    """
-    helper = LayerHelper("less_equal", **locals())
-    if cond is None:
-        cond = helper.create_variable_for_type_inference(dtype='bool')
-        cond.stop_gradient = True
-
-    attrs = dict()
-    if force_init_on_cpu():
-        attrs['force_cpu'] = force_init_on_cpu()
-
-    helper.append_op(
-        type='less_equal',
-        inputs={'X': [x],
-                'Y': [y]},
-        outputs={'Out': [cond]},
-        attrs=attrs)
-    return cond
-
-
-@templatedoc()
-def greater_than(x, y, cond=None):
-    """
-    This layer returns the truth value of :math:`x > y` elementwise, which is equivalent to the overloaded operator `>`.
-
-    Args:
-        x(Variable): First operand of *greater_than*
-        y(Variable): Second operand of *greater_than*
-        cond(Variable|None): Optional output variable to store the result of *greater_than*
-
-    Returns:
-        Variable: The tensor variable storing the output of *greater_than*.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          
-          label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-          limit = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64')
-          out = fluid.layers.greater_than(x=label, y=limit)
-    """
-    helper = LayerHelper("greater_than", **locals())
-    if cond is None:
-        cond = helper.create_variable_for_type_inference(dtype='bool')
-        cond.stop_gradient = True
-
-    attrs = dict()
-    if force_init_on_cpu():
-        attrs['force_cpu'] = force_init_on_cpu()
-
-    helper.append_op(
-        type='greater_than',
-        inputs={'X': [x],
-                'Y': [y]},
-        outputs={'Out': [cond]},
-        attrs=attrs)
-    return cond
-
-
-@templatedoc()
-def greater_equal(x, y, cond=None):
-    """
-    This layer returns the truth value of :math:`x >= y` elementwise, which is equivalent to the overloaded operator `>=`.
-
-    Args:
-        x(Variable): First operand of *greater_equal*
-        y(Variable): Second operand of *greater_equal*
-        cond(Variable|None): Optional output variable to store the result of *greater_equal*
-
-    Returns:
-        Variable: The tensor variable storing the output of *greater_equal*.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          
-          label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-          limit = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64')
-          out = fluid.layers.greater_equal(x=label, y=limit)
-
-    """
-    helper = LayerHelper("greater_equal", **locals())
-    if cond is None:
-        cond = helper.create_variable_for_type_inference(dtype='bool')
-        cond.stop_gradient = True
-
-    attrs = dict()
-    if force_init_on_cpu():
-        attrs['force_cpu'] = force_init_on_cpu()
-
-    helper.append_op(
-        type='greater_equal',
-        inputs={'X': [x],
-                'Y': [y]},
-        outputs={'Out': [cond]},
-        attrs=attrs)
-    return cond
-
-
-def equal(x, y, cond=None):
-    """
-    This layer returns the truth value of :math:`x == y` elementwise.
-
-    Args:
-        x(Variable): First operand of *equal*
-        y(Variable): Second operand of *equal*
-        cond(Variable|None): Optional output variable to store the result of *equal*
-
-    Returns:
-        Variable: The tensor variable storing the output of *equal*.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          label = fluid.layers.data(name="label", shape=[3,10,32,32], dtype="float32")
-          limit = fluid.layers.data(name="limit", shape=[3,10,32,32], dtype="float32")
-          less = fluid.layers.equal(x=label, y=limit)
-    """
-    helper = LayerHelper("equal", **locals())
-    if cond is None:
-        cond = helper.create_variable_for_type_inference(dtype='bool')
-        cond.stop_gradient = True
-
-    helper.append_op(
-        type='equal', inputs={'X': [x],
-                              'Y': [y]}, outputs={'Out': [cond]})
-    return cond
-
-
-def not_equal(x, y, cond=None):
-    """
-    This layer returns the truth value of :math:`x != y` elementwise, which is equivalent to the overloader operator `!=`.
-
-    Args:
-        x(Variable): First operand of *not_equal*
-        y(Variable): Second operand of *not_equal*
-        cond(Variable|None): Optional output variable to store the result of *not_equal*
-
-    Returns:
-        Variable: The tensor variable storing the output of *not_equal*.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          
-          label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-          limit = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64')
-          out = fluid.layers.not_equal(x=label, y=limit)
-    """
-    helper = LayerHelper("not_equal", **locals())
-    if cond is None:
-        cond = helper.create_variable_for_type_inference(dtype='bool')
-        cond.stop_gradient = True
-
-    helper.append_op(
-        type='not_equal', inputs={'X': [x],
-                                  'Y': [y]}, outputs={'Out': [cond]})
-    return cond
-
-
-def array_read(array, i):
-    """
-    This function performs the operation to read the data in as an
-    LOD_TENSOR_ARRAY.
-
-    .. code-block:: text
-
-        Given:
-
-        array = [0.6, 0.1, 0.3, 0.1]
-
-        And:
-
-        i = 2
-
-        Then:
-
-        output = 0.3
-
-    Args:
-        array (Variable|list): The input tensor that store data to be read.
-        i (Variable|list): The index of the data to be read from input array.
-
-    Returns:
-        Variable: The tensor type variable that has the data written to it.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          array = fluid.layers.create_array(dtype='float32')
-          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
-          item = fluid.layers.array_read(array, i)
-    """
-    helper = LayerHelper('array_read', **locals())
-    if not isinstance(
-            array,
-            Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
-        raise TypeError("array should be tensor array vairable")
-    out = helper.create_variable_for_type_inference(dtype=array.dtype)
-    helper.append_op(
-        type='read_from_array',
-        inputs={'X': [array],
-                'I': [i]},
-        outputs={'Out': [out]})
-    return out
-
-
-def shrink_memory(x, i, table):
-    """
-    This function creates an operator to shrink rnn memory using the RankTable
-    as mentioned in the input parameter.
-
-    NOTE: This API is very low-level API. It is used by DynamicRNN only.
-
-    Since the Dynamic RNN uses no-padding way to implement RNN. The sequence
-    will be sorted by order, and the length of valid memory will be shrink after
-    each time step.
-
-    Args:
-        x(Variable): The memory object in the previous time step.
-        i(Variable): The step count variable. A int scalar as LoDTensor.
-        table(Variable): The RNNRankTable object.
-
-    Returns:
-        the memory variable after shrink.
-
-    Examples:
-
-        Since this API is very low level API. The example is not provided.
-        Please reference the implementation of class DynamicRNN for detail
-        usage.
-    """
-    helper = LayerHelper('shrink_memory', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='shrink_rnn_memory',
-        inputs={'X': [x],
-                'I': [i],
-                'RankTable': [table]},
-        outputs={'Out': [out]},
-        attrs={})
-    return out
-
-
-def array_length(array):
-    """
-    **Get the Length of Input LoDTensorArray**
-
-    This function performs the operation to find the length of the input
-    LOD_TENSOR_ARRAY.
-
-    Related API: array_read, array_write, While.
-
-    Args:
-        array (LOD_TENSOR_ARRAY): The input array that will be used
-                                  to compute the length.
-
-    Returns:
-        Variable: The length of the input LoDTensorArray.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
-          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
-          arr = fluid.layers.array_write(tmp, i=i)
-          arr_len = fluid.layers.array_length(arr)
-
-    """
-    helper = LayerHelper('array_length', **locals())
-    tmp = helper.create_variable_for_type_inference(dtype='int64')
-    tmp.stop_gradient = True
-    helper.append_op(
-        type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
-    return tmp
-
-
-class ConditionalBlockGuard(BlockGuard):
-    """
-    ConditionalBlockGuard is derived from BlockGuard. It is dedicated for
-    holding a ConditionalBlock, and helping users entering and exiting the
-    ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard
-    is generally an internal component of IfElse, users should not use it directly.
-    """
-
-    def __init__(self, block):
-        if not isinstance(block, ConditionalBlock):
-            raise TypeError("block should be conditional block")
-        super(ConditionalBlockGuard, self).__init__(block.helper.main_program)
-        self.block = block
-
-    def __enter__(self):
-        return super(ConditionalBlockGuard, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.block.complete()
-        return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val,
-                                                           exc_tb)
-
-
-class ConditionalBlock(object):
-    '''
-    **ConditionalBlock**
-
-    ConditionalBlock is an operator that bind a block to a specific condition,
-    if the condition matches, the corresponding block will be executed.
-
-    Args:
-        inputs (Variable): bool conditions.
-        is_scalar_condition (bool): whether the branch is controled by a scalar.
-        name(str): name of this ConditionalBlock.
-
-    Examples:
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             cond = layers.less_than(x=label, y=limit)
-             true_image, false_image = layers.split_lod_tensor(
-                 input=image, mask=cond)
-             true_cond = layers.ConditionalBlock([true_image])
-
-             with true_cond.block():
-                 ...
-             with false_cond.block():
-                 ...
-    '''
-
-    def __init__(self, inputs, is_scalar_condition=False, name=None):
-        for each_input in inputs:
-            if not isinstance(each_input, Variable):
-                raise TypeError("Each input should be variable")
-        self.inputs = inputs
-        self.is_scalar_condition = is_scalar_condition
-        self.helper = LayerHelper('conditional_block', name=name)
-
-    def block(self):
-        return ConditionalBlockGuard(self)
-
-    def complete(self):
-        inside_block = self.helper.main_program.current_block()
-        parent_block = self.helper.main_program.block(inside_block.parent_idx)
-
-        intermediate = set()
-        params = set()
-
-        for each_op in inside_block.ops:
-            assert isinstance(each_op, Operator)
-            for iname in each_op.input_names:
-                for in_var_name in each_op.input(iname):
-                    if in_var_name not in intermediate:
-                        params.add(in_var_name)
-
-            for oname in each_op.output_names:
-                for out_var_name in each_op.output(oname):
-                    intermediate.add(out_var_name)
-        input_set = set([ipt.name for ipt in self.inputs])
-
-        param_list = [
-            parent_block._var_recursive(each_name) for each_name in params
-            if each_name not in input_set
-        ]
-
-        out_list = []
-        for inner_out_name in intermediate:
-            inner_var = parent_block._find_var_recursive(inner_out_name)
-            if inner_var:
-                out_list.append(inner_var)
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-        parent_block.append_op(
-            type='conditional_block',
-            inputs={
-                'Cond': self.inputs,
-                'Input': param_list,
-            },
-            outputs={'Out': out_list,
-                     'Scope': [step_scope]},
-            attrs={
-                'sub_block': inside_block,
-                'is_scalar_condition': self.is_scalar_condition
-            })
-
-
-class Switch(object):
-    """
-    Switch class works just like a `if-elif-else`. Can be used in learning rate scheduler
-    to modify learning rate
-
-    The Semantics:
-
-    1. A `switch` control-flow checks cases one-by-one.
-
-    2. The condition of each case is a boolean value, which is a scalar Variable.
-
-    3. It runs the first matched case, or the default case if there is one.
-
-    4. Once it matches a case, it runs the corresponding branch and only that branch.
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-
-            lr = fluid.layers.create_global_var(
-                shape=[1],
-                value=0.0,
-                dtype='float32',
-                persistable=True,
-                name="learning_rate")
-            zero_var = fluid.layers.fill_constant(
-                 shape=[1], dtype='float32', value=0.0)
-            one_var = fluid.layers.fill_constant(
-                shape=[1], dtype='float32', value=1.0)
-            two_var = fluid.layers.fill_constant(
-                shape=[1], dtype='float32', value=2.0) 
-
-            global_step = fluid.layers.autoincreased_step_counter(
-                   counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
-
-            with fluid.layers.control_flow.Switch() as switch:
-                with switch.case(global_step == zero_var):
-                    fluid.layers.assign(input=one_var, output=lr)
-                with switch.default():
-                    fluid.layers.assign(input=two_var, output=lr)
-
-    """
-
-    def __init__(self, name=None):
-        self.helper = LayerHelper('switch', name=name)
-        self.inside_scope = False
-        self.pre_not_conditions = []
-
-    def case(self, condition):
-        if not self.inside_scope:
-            raise ValueError("case should be called inside with")
-
-        if len(self.pre_not_conditions) == 0:
-            cond_block = ConditionalBlock([condition], is_scalar_condition=True)
-            not_cond = logical_not(x=condition)
-            self.pre_not_conditions.append(not_cond)
-        else:
-            pre_cond_num = len(self.pre_not_conditions)
-            pre_not_cond = self.pre_not_conditions[pre_cond_num - 1]
-            new_not_cond = logical_and(
-                x=pre_not_cond, y=logical_not(x=condition))
-            self.pre_not_conditions.append(new_not_cond)
-            cond_block = ConditionalBlock(
-                [logical_and(
-                    x=pre_not_cond, y=condition)],
-                is_scalar_condition=True)
-
-        return ConditionalBlockGuard(cond_block)
-
-    def default(self):
-        pre_cond_num = len(self.pre_not_conditions)
-        if pre_cond_num == 0:
-            raise ValueError("there should be at least one condition")
-        cond_block = ConditionalBlock(
-            [self.pre_not_conditions[pre_cond_num - 1]],
-            is_scalar_condition=True)
-        return ConditionalBlockGuard(cond_block)
-
-    def __enter__(self):
-        """
-        set flag that now is inside switch.block {}
-        :return:
-        """
-        self.inside_scope = True
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.inside_scope = False
-        if exc_type is not None:
-            return False  # re-raise exception
-
-        return True
-
-
-class IfElseBlockGuard(object):
-    def __init__(self, is_true, ifelse):
-        if not isinstance(ifelse, IfElse):
-            raise TypeError("ifelse must be an instance of IfElse class")
-
-        if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("You cannot invoke IfElse.block() inside a block")
-
-        self.is_true = is_true
-        self.ie = ifelse
-        if is_true:
-            self.cond_block = ifelse.conditional_true_block
-        else:
-            self.cond_block = ifelse.conditional_false_block
-
-        if not isinstance(self.cond_block, ConditionalBlock):
-            raise TypeError("Unexpected situation")
-
-        self.cond_block = self.cond_block.block()
-
-    def __enter__(self):
-        self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS
-        self.cond_block.__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if not self.cond_block.__exit__(exc_type, exc_val, exc_tb):
-            # re-raise inside exception
-            return False
-        if len(self.ie.output_table[1 if self.is_true else 0]) == 0:
-            raise ValueError("Must set output inside block")
-        self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS
-
-
-class IfElse(object):
-    """
-    if-else control flow.
-
-    Args:
-        cond (Variable): condition used to compare.
-        name (str, default None): The name of this layer.
-
-    Examples:
-          .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            image = fluid.layers.data(name="X", shape=[2, 5, 5], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            limit = fluid.layers.fill_constant_batch_size_like(
-                 input=label, dtype='int64', shape=[1], value=5.0)
-            cond = fluid.layers.less_than(x=label, y=limit)
-            ie = fluid.layers.IfElse(cond)
-            with ie.true_block():
-                true_image = ie.input(image)
-                hidden = fluid.layers.fc(input=true_image, size=100, act='tanh')
-                prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
-                ie.output(prob)
-
-            with ie.false_block():
-                false_image = ie.input(image)
-                hidden = fluid.layers.fc(
-                    input=false_image, size=200, act='tanh')
-                prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
-                ie.output(prob)
-            prob = ie()
-    """
-    OUT_IF_ELSE_BLOCKS = 0
-    IN_IF_ELSE_TRUE_BLOCKS = 1
-    IN_IF_ELSE_FALSE_BLOCKS = 2
-
-    def __init__(self, cond, name=None):
-        if not isinstance(cond, Variable):
-            raise TypeError("cond must be a Variable")
-        self.helper = LayerHelper('ifelse', name=name)
-        self.cond = cond
-        self.input_table = {}
-        self.status = IfElse.OUT_IF_ELSE_BLOCKS
-        self.conditional_true_block = ConditionalBlock(inputs=[self.cond])
-        self.conditional_false_block = ConditionalBlock(inputs=[self.cond])
-        self.output_table = ([], [])  # (true_outs, false_outs)
-
-    def input(self, x):
-        if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("input must in true/false blocks")
-        if id(x) not in self.input_table:
-            parent_block = self._parent_block()
-            out_true = parent_block.create_var(
-                name=unique_name.generate_with_ignorable_key('ifelse_input' +
-                                                             self.helper.name),
-                dtype=x.dtype)
-
-            out_false = parent_block.create_var(
-                name=unique_name.generate_with_ignorable_key('ifelse_input' +
-                                                             self.helper.name),
-                dtype=x.dtype)
-            parent_block.append_op(
-                type='split_lod_tensor',
-                inputs={
-                    'X': x,
-                    'Mask': self.cond,
-                },
-                outputs={'OutTrue': out_true,
-                         'OutFalse': out_false},
-                attrs={'level': 0})
-            self.input_table[id(x)] = (out_true, out_false)
-        else:
-            out_true, out_false = self.input_table[id(x)]
-
-        if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS:
-            return out_true
-        else:
-            return out_false
-
-    def _parent_block(self):
-        current_block = self.helper.main_program.current_block()
-        return self.helper.main_program.block(current_block.parent_idx)
-
-    def true_block(self):
-        return IfElseBlockGuard(True, self)
-
-    def false_block(self):
-        return IfElseBlockGuard(False, self)
-
-    def output(self, *outs):
-        if self.status == self.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("output can only be invoked in the sub-block")
-
-        out_table = self.output_table[1 if self.status ==
-                                      self.IN_IF_ELSE_TRUE_BLOCKS else 0]
-        parent_block = self._parent_block()
-        for each_out in outs:
-            if not isinstance(each_out, Variable):
-                raise TypeError("Each output should be a variable")
-            # create outside tensor
-            outside_out = parent_block.create_var(
-                name=unique_name.generate_with_ignorable_key("_".join(
-                    [self.helper.name, 'output'])),
-                dtype=each_out.dtype)
-            out_table.append(outside_out)
-
-            # assign local var to outside
-            assign(input=each_out, output=outside_out)
-
-    def __call__(self):
-        if self.status != self.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("IfElse::__call__ must be out of sub-block")
-        false_len, true_len = list(map(len, self.output_table))
-        if false_len == 0 and true_len == 0:
-            raise ValueError("Must invoke true_block/false_block before "
-                             "__call__")
-        elif false_len != true_len and false_len != 0 and true_len != 0:
-            raise ValueError("The output side must be same")
-        elif false_len == 0 or true_len == 0:
-            return self.output_table[0 if false_len != 0 else 1]
-
-        # else none of false_len/true_len is zero
-        # merge together
-        rlist = []
-        for false_var, true_var in zip(*self.output_table):
-            rlist.append(
-                merge_lod_tensor(
-                    in_true=true_var,
-                    in_false=false_var,
-                    mask=self.cond,
-                    x=self.cond,
-                    level=0))
-        return rlist
-
-
-class DynamicRNN(object):
-    """
-    The dynamic RNN can process a batch of sequence data. The length of each
-    sample sequence can be different. This API automatically process them in
-    batch.
-
-    The input lod must be set. Please reference to `lod_tensor`.
-
-    The dynamic RNN will unfold sequence into timesteps. Users need to define
-    how to process each time step during the :code:`with` block.
-
-    The `memory` is used staging data cross time step. The initial value of
-    memory can be zero or another variable.
-
-    The dynamic RNN can mark multiple variables as its output. Use `drnn()` to
-    get the output sequence.
-
-    NOTES:
-        Currently it is not supported that setting is_sparse to True of any 
-        layers within DynamicRNN.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          sentence = fluid.layers.data(name='sentence', shape=[1], dtype='int64', lod_level=1)
-          embedding = fluid.layers.embedding(input=sentence, size=[65536, 32], is_sparse=True)
-    
-          drnn = fluid.layers.DynamicRNN()
-          with drnn.block():
-              word = drnn.step_input(embedding)
-              prev = drnn.memory(shape=[200])
-              hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu')
-              drnn.update_memory(prev, hidden)  # set prev to hidden
-              drnn.output(hidden)
-
-          # Get the last time step of rnn. It is the encoding result.
-          rnn_output = drnn()
-          last = fluid.layers.sequence_last_step(rnn_output)
-    """
-    BEFORE_RNN = 0
-    IN_RNN = 1
-    AFTER_RNN = 2
-
-    def __init__(self, name=None):
-        self.helper = LayerHelper('dynamic_rnn', name=name)
-        self.status = DynamicRNN.BEFORE_RNN
-        self.lod_rank_table = None
-        self.max_seq_len = None
-        self.step_idx = None
-        self.zero_idx = None
-        self.mem_dict = dict()
-        self.output_array = []
-        self.outputs = []
-        self.cond = self.helper.create_variable_for_type_inference(dtype='bool')
-        self.cond.stop_gradient = False
-        self.while_op = While(self.cond)
-        self.input_array = []
-        self.mem_link = []
-
-    def step_input(self, x, level=0):
-        """
-        Mark a sequence as a dynamic RNN input.
-
-        Args:
-            x (Variable): The input sequence which should have lod information.
-            level (int): The level of lod used to split steps. Default: 0.
-
-        Returns:
-            The current timestep in the input sequence.
-        """
-        self._assert_in_rnn_block_("step_input")
-        if not isinstance(x, Variable):
-            raise TypeError(
-                "step_input() can only take a Variable as its input.")
-        parent_block = self._parent_block_()
-        if self.lod_rank_table is None:
-            self.lod_rank_table = parent_block.create_var(
-                name=unique_name.generate('lod_rank_table'),
-                type=core.VarDesc.VarType.LOD_RANK_TABLE)
-            self.lod_rank_table.stop_gradient = True
-            parent_block.append_op(
-                type='lod_rank_table',
-                inputs={"X": x},
-                outputs={"Out": self.lod_rank_table},
-                attrs={"level": level})
-            self.max_seq_len = parent_block.create_var(
-                name=unique_name.generate('dynamic_rnn_max_seq_len'),
-                dtype='int64')
-            self.max_seq_len.stop_gradient = False
-            parent_block.append_op(
-                type='max_sequence_len',
-                inputs={'RankTable': self.lod_rank_table},
-                outputs={"Out": self.max_seq_len})
-            self.cond.stop_gradient = True
-            parent_block.append_op(
-                type='less_than',
-                inputs={'X': self.step_idx,
-                        'Y': self.max_seq_len},
-                outputs={'Out': self.cond},
-                attrs={'force_cpu': True})
-
-        input_array = parent_block.create_var(
-            name=unique_name.generate('dynamic_rnn_input_array'),
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-            dtype=x.dtype)
-        self.input_array.append((input_array, x.dtype))
-        parent_block.append_op(
-            type='lod_tensor_to_array',
-            inputs={'X': x,
-                    'RankTable': self.lod_rank_table},
-            outputs={'Out': input_array})
-        return array_read(array=input_array, i=self.step_idx)
-
-    def static_input(self, x):
-        """
-        Mark a variable as a RNN input. The input will not be scattered into
-        time steps. It is optional.
-
-        Args:
-            x (Variable): The input variable.
-
-        Returns:
-            The input variable that can access in RNN.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-
-              sentence = fluid.layers.data(name='sentence', dtype='float32', shape=[32], lod_level=1)
-              encoder_proj = fluid.layers.data(name='encoder_proj', dtype='float32', shape=[32], lod_level=1)
-              decoder_boot = fluid.layers.data(name='boot', dtype='float32', shape=[10], lod_level=1)
-
-              drnn = fluid.layers.DynamicRNN()
-              with drnn.block():
-                  current_word = drnn.step_input(sentence)
-                  encoder_word = drnn.static_input(encoder_proj)
-                  hidden_mem = drnn.memory(init=decoder_boot, need_reorder=True)
-                  fc_1 = fluid.layers.fc(input=encoder_word, size=30, bias_attr=False)
-                  fc_2 = fluid.layers.fc(input=current_word, size=30, bias_attr=False)
-                  decoder_inputs = fc_1 + fc_2
-                  h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=30)
-                  drnn.update_memory(hidden_mem, h)
-                  out = fluid.layers.fc(input=h, size=10, bias_attr=True, act='softmax') 
-                  drnn.output(out)
-
-              rnn_output = drnn()
-        """
-        self._assert_in_rnn_block_("static_input")
-        if not isinstance(x, Variable):
-            raise TypeError(
-                "static_input() can only take a Variable as its input")
-        if self.lod_rank_table is None:
-            raise RuntimeError(
-                "static_input() must be called after step_input().")
-        parent_block = self._parent_block_()
-        x_reordered = parent_block.create_var(
-            name=unique_name.generate("dynamic_rnn_static_input_reordered"),
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            dtype=x.dtype)
-        parent_block.append_op(
-            type='reorder_lod_tensor_by_rank',
-            inputs={'X': [x],
-                    'RankTable': [self.lod_rank_table]},
-            outputs={'Out': [x_reordered]})
-        return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table)
-
-    @signature_safe_contextmanager
-    def block(self):
-        """
-        The block for user to define operators in RNN.
-        """
-        if self.status != DynamicRNN.BEFORE_RNN:
-            raise ValueError("rnn.block() can only be invoke once")
-        self.step_idx = fill_constant(
-            shape=[1], dtype='int64', value=0, force_cpu=True)
-        self.step_idx.stop_gradient = False
-        self.status = DynamicRNN.IN_RNN
-        with self.while_op.block():
-            yield
-            increment(x=self.step_idx, value=1.0, in_place=True)
-
-            for new_mem, mem_array in self.mem_link:
-                array_write(x=new_mem, i=self.step_idx, array=mem_array)
-
-            less_than(
-                x=self.step_idx,
-                y=self.max_seq_len,
-                force_cpu=True,
-                cond=self.cond)
-
-        self.status = DynamicRNN.AFTER_RNN
-        for each_array in self.output_array:
-            self.outputs.append(
-                array_to_lod_tensor(
-                    x=each_array, table=self.lod_rank_table))
-
-    def __call__(self, *args, **kwargs):
-        """
-        Get the output of RNN. This API should only be invoked after RNN.block()
-        """
-        if self.status != DynamicRNN.AFTER_RNN:
-            raise ValueError(("Output of the dynamic RNN can only be visited "
-                              "outside the rnn block."))
-        if len(self.outputs) == 1:
-            return self.outputs[0]
-        else:
-            return self.outputs
-
-    def memory(self,
-               init=None,
-               shape=None,
-               value=0.0,
-               need_reorder=False,
-               dtype='float32'):
-        """
-        Create a memory variable for dynamic rnn.
-
-        If the :code:`init` is not None, :code:`memory` will be initialized by
-        this variable. The :code:`need_reorder` is used to reorder the memory as
-        the input variable. It should be set to true when the initialized memory
-        depends on the input sample.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-
-              sentence = fluid.layers.data(name='sentence', shape=[32], dtype='float32', lod_level=1)
-              boot_memory = fluid.layers.data(name='boot', shape=[10], dtype='float32', lod_level=1)
-              
-              drnn = fluid.layers.DynamicRNN()
-              with drnn.block():
-                  word = drnn.step_input(sentence)
-                  memory = drnn.memory(init=boot_memory, need_reorder=True)
-                  hidden = fluid.layers.fc(input=[word, memory], size=10, act='tanh')
-                  drnn.update_memory(ex_mem=memory, new_mem=hidden)
-                  drnn.output(hidden)
-
-              rnn_output = drnn()
-
-
-        Otherwise, if :code:`shape`, :code:`value`, :code:`dtype` are set, the
-        :code:`memory` will be initialized by this :code:`value`.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-
-              sentence = fluid.layers.data(name='sentence', dtype='float32', shape=[32], lod_level=1)
-              
-              drnn = fluid.layers.DynamicRNN()
-              with drnn.block():
-                  word = drnn.step_input(sentence)
-                  memory = drnn.memory(shape=[10], dtype='float32', value=0)
-                  hidden = fluid.layers.fc(input=[word, memory], size=10, act='tanh')
-                  drnn.update_memory(ex_mem=memory, new_mem=hidden)
-                  drnn.output(hidden)
-
-              rnn_output = drnn()
-
-
-        Args:
-            init(Variable|None): The initialized variable.
-            shape(list|tuple): The memory shape. The shape does not contain batch_size.
-            value(float): the initalized value.
-            need_reorder(bool): True if the initialized memory depends on the input sample.
-            dtype(str|numpy.dtype): The data type of the initialized memory.
-
-        Returns:
-            The memory variable.
-        """
-        self._assert_in_rnn_block_('memory')
-        self._init_zero_idx_()
-        if init is not None:
-            if not isinstance(init, Variable):
-                raise TypeError(
-                    "The input arg `init` of memory() must be a Variable")
-            parent_block = self._parent_block_()
-            init_tensor = init
-            if need_reorder == True:
-                if self.lod_rank_table is None:
-                    raise ValueError(
-                        'If set need_reorder to True, make sure step_input be '
-                        'invoked before '
-                        'memory(init=init, need_reordered=True, ...).')
-                init_reordered = parent_block.create_var(
-                    name=unique_name.generate('dynamic_rnn_mem_init_reordered'),
-                    type=core.VarDesc.VarType.LOD_TENSOR,
-                    dtype=init.dtype)
-                parent_block.append_op(
-                    type='reorder_lod_tensor_by_rank',
-                    inputs={
-                        'X': [init_tensor],
-                        'RankTable': [self.lod_rank_table]
-                    },
-                    outputs={'Out': [init_reordered]})
-                init_tensor = init_reordered
-            mem_array = parent_block.create_var(
-                name=unique_name.generate('dynamic_rnn_mem_array'),
-                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                dtype=init.dtype)
-            parent_block.append_op(
-                type='write_to_array',
-                inputs={'X': init_tensor,
-                        'I': self.zero_idx},
-                outputs={'Out': mem_array})
-            retv = array_read(array=mem_array, i=self.step_idx)
-            retv = shrink_memory(
-                x=retv, i=self.step_idx, table=self.lod_rank_table)
-            self.mem_dict[retv.name] = mem_array
-            return retv
-        else:
-            if len(self.input_array) == 0:
-                raise ValueError(
-                    "step_input should be invoked before memory(shape=..., value=...)"
-                )
-            parent_block = self._parent_block_()
-            init = parent_block.create_var(
-                name=unique_name.generate('mem_init'), dtype=dtype)
-            arr, dtype = self.input_array[0]
-            in0 = parent_block.create_var(
-                name=unique_name.generate('in0'), dtype=dtype)
-            parent_block.append_op(
-                type='read_from_array',
-                inputs={'X': [arr],
-                        'I': [self.zero_idx]},
-                outputs={'Out': [in0]})
-            parent_block.append_op(
-                type='fill_constant_batch_size_like',
-                inputs={'Input': [in0]},
-                outputs={'Out': [init]},
-                attrs={
-                    'shape': [-1] + shape,
-                    'value': float(value),
-                    'dtype': init.dtype
-                })
-            return self.memory(init=init)
-
-    def update_memory(self, ex_mem, new_mem):
-        """
-        Update the memory from ex_mem to new_mem. NOTE that the shape and data
-        type of :code:`ex_mem` and :code:`new_mem` must be same.
-        
-        Args:
-            ex_mem(Variable): the memory variable.
-            new_mem(Variable): the plain variable generated in RNN block.
-
-        Returns:
-            None
-        """
-        self._assert_in_rnn_block_('update_memory')
-        if not isinstance(ex_mem, Variable):
-            raise TypeError("The input arg `ex_mem` of update_memory() must "
-                            "be a Variable")
-        if not isinstance(new_mem, Variable):
-            raise TypeError("The input arg `new_mem` of update_memory() must "
-                            "be a Variable")
-
-        mem_array = self.mem_dict.get(ex_mem.name, None)
-        if mem_array is None:
-            raise ValueError("Please invoke memory before update_memory")
-        if self.lod_rank_table is None:
-            raise ValueError("Please invoke step_input before update_memory")
-
-        self.mem_link.append((new_mem, mem_array))
-
-    def output(self, *outputs):
-        """
-        Mark the RNN output variables.
-
-        Args:
-            outputs: The output variables.
-
-        Returns:
-            None
-        """
-        self._assert_in_rnn_block_('output')
-        parent_block = self._parent_block_()
-        for each in outputs:
-            outside_array = parent_block.create_var(
-                name=unique_name.generate_with_ignorable_key("_".join(
-                    [self.helper.name, "output_array", each.name])),
-                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                dtype=each.dtype)
-            array_write(x=each, i=self.step_idx, array=outside_array)
-            self.output_array.append(outside_array)
-
-    def _init_zero_idx_(self):
-        if self.zero_idx is None:
-            parent_block = self._parent_block_()
-            self.zero_idx = parent_block.create_var(
-                name=unique_name.generate('zero_idx'), dtype='int64')
-            parent_block.append_op(
-                type='fill_constant',
-                inputs={},
-                outputs={'Out': [self.zero_idx]},
-                attrs={
-                    'shape': [1],
-                    'dtype': self.zero_idx.dtype,
-                    'value': float(0),
-                    'force_cpu': True
-                })
-
-    def _parent_block_(self):
-        prog = self.helper.main_program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-
-        return parent_block
-
-    def _assert_in_rnn_block_(self, method):
-        if self.status != DynamicRNN.IN_RNN:
-            raise ValueError("{0} can only be invoked inside rnn block.".format(
-                method))
-
-
-@templatedoc()
-def reorder_lod_tensor_by_rank(x, rank_table):
-    """
-    ${comment}
-
-    Args:
-    
-        x(${x_type}): ${x_comment}
-        rank_table(${rank_table_type}): ${rank_table_type}
-    
-    Returns:
-        out(${out_type}): ${out_comment} 
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data_desc = (['input', [9], 0], ['ref', [5], 1])
-          data = fluid.layers.data(name=data_desc[0][0], shape=data_desc[0][1])
-          rank_data = fluid.layers.data(name=data_desc[1][0], shape=data_desc[1][1])
-          table = fluid.layers.control_flow.lod_rank_table(rank_data)
-          new_data = fluid.layers.reorder_lod_tensor_by_rank(
-                           x=data, rank_table=table)
-
-    """
-    helper = LayerHelper('reorder_lod_tensor_by_rank', **locals())
-    helper.is_instance('x', Variable)
-    helper.is_instance('rank_table', Variable)
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='reorder_lod_tensor_by_rank',
-        inputs={'X': [x],
-                'RankTable': [rank_table]},
-        outputs={'Out': [out]})
-    return out
-
-
-def is_empty(x, cond=None):
-    """
-    Test whether a Variable is empty.
-
-    Args:
-        x (Variable): The Variable to be tested.
-        cond (Variable|None): Output parameter. Returns the test result
-                              of given 'x'. Default: None
-
-    Returns:
-        Variable: A bool scalar. True if 'x' is an empty Variable.
-
-    Raises:
-        TypeError: If input cond is not a variable, or cond's dtype is
-                   not bool.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          input = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32")
-          res = fluid.layers.is_empty(x=input)
-          # or:
-          # fluid.layers.is_empty(x=input, cond=res)
-
-    """
-    helper = LayerHelper("is_empty", **locals())
-    if cond is None:
-        cond = helper.create_variable_for_type_inference(dtype='bool')
-        cond.stop_gradient = True
-    elif not isinstance(cond, Variable):
-        raise TypeError("cond takes a variable")
-    elif cond.dtype != 'bool':
-        raise TypeError("The data type of cond must be bool")
-
-    helper.append_op(
-        type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]})
-    return cond
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
deleted file mode 100644
index aa6c7ed0b8c742e5e36453eb0dcdd6399fc8e841..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/detection.py
+++ /dev/null
@@ -1,3181 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-All layers just related to the detection neural network.
-"""
-
-from __future__ import print_function
-
-from .layer_function_generator import generate_layer_fn
-from .layer_function_generator import autodoc, templatedoc
-from ..layer_helper import LayerHelper
-from ..framework import Variable
-from . import tensor
-from . import nn
-from . import ops
-from ... import compat as cpt
-import math
-import six
-import numpy
-from functools import reduce
-
-__all__ = [
-    'prior_box',
-    'density_prior_box',
-    'multi_box_head',
-    'bipartite_match',
-    'target_assign',
-    'detection_output',
-    'ssd_loss',
-    'rpn_target_assign',
-    'retinanet_target_assign',
-    'sigmoid_focal_loss',
-    'anchor_generator',
-    'roi_perspective_transform',
-    'generate_proposal_labels',
-    'generate_proposals',
-    'generate_mask_labels',
-    'iou_similarity',
-    'box_coder',
-    'polygon_box_transform',
-    'yolov3_loss',
-    'yolo_box',
-    'box_clip',
-    'multiclass_nms',
-    'multiclass_nms2',
-    'retinanet_detection_output',
-    'distribute_fpn_proposals',
-    'box_decoder_and_assign',
-    'collect_fpn_proposals',
-]
-
-
-def retinanet_target_assign(bbox_pred,
-                            cls_logits,
-                            anchor_box,
-                            anchor_var,
-                            gt_boxes,
-                            gt_labels,
-                            is_crowd,
-                            im_info,
-                            num_classes=1,
-                            positive_overlap=0.5,
-                            negative_overlap=0.4):
-    """
-    **Target Assign Layer for Retinanet .**
-
-    This layer can be, for given the Intersection-over-Union (IoU) overlap
-    between anchors and ground truth boxes, to assign classification and
-    regression targets to each anchor, these target labels are used for training
-    retinanet. Every anchor is assigned with a length :attr:`num_classes`
-    one-hot vector of classification targets, and a 4-vector of box regression
-    targets. The assignment rules are as followed:
-    
-    1. Anchors are assigned to ground-truth boxes when: (i) it has the highest
-    IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher
-    than positive_overlap(0.5) with any ground-truth box.
-    
-    2. Anchors are assigned to background when its IoU ratio is lower than
-    negative_overlap (0.4) for all ground-truth boxes.
-    
-    When an anchor is assigned with a ground-truth box which is the i-th category,
-    the i-th entry in its C vector of targets is set to 1 and all other entries
-    are set to 0. When an anchor is assigned with background, all entries are set
-    to 0. Anchors that are not assigned do not contribute to the training
-    objective. The regression targets are the encoded ground-truth boxes
-    associated with the assigned anchors.
- 
-    Args:
-        bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the
-            predicted locations of M bounding bboxes. N is the batch size,
-            and each bounding box has four coordinate values and the layout
-            is [xmin, ymin, xmax, ymax].
-        cls_logits(Variable): A 3-D Tensor with shape [N, M, C] represents the
-            predicted confidence predictions. N is the batch size, C is the
-            number of classes (excluding background), M is number of bounding boxes.
-        anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
-            each box is represented as [xmin, ymin, xmax, ymax],
-            [xmin, ymin] is the left top coordinate of the anchor box,
-            if the input is image feature map, they are close to the origin
-            of the coordinate system. [xmax, ymax] is the right bottom
-            coordinate of the anchor box.
-        anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded 
-            variances of anchors.
-        gt_boxes(Variable): The ground-truth bounding boxes (bboxes) are a 2D
-            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
-            bboxes of mini-batch input.
-        gt_labels(variable): The ground-truth labels are a 2D LoDTensor with
-            shape [Ng, 1], Ng is the total number of ground-truth labels of
-            mini-batch input.
-        is_crowd(Variable): A 1-D LoDTensor which indicates ground-truth is crowd.
-        im_info(Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size,
-            3 is the height, width and scale.
-        num_classes(int32): The number of classes.
-        positive_overlap(float): Minimum overlap required between an anchor
-            and ground-truth box for the (anchor, gt box) pair to be a positive
-            example.
-        negative_overlap(float): Maximum overlap allowed between an anchor
-            and ground-truth box for the (anchor, gt box) pair to be a negative
-            examples.
-
-    Returns:
-        tuple:
-               A tuple(predicted_scores, predicted_location, target_label,
-               target_bbox, bbox_inside_weight, fg_num) is returned. The
-               predicted_scores and predicted_location are the predicted result
-               of the retinanet.The target_label and target_bbox are the ground
-               truth, respectively. The predicted_location is a 2D Tensor with
-               shape [F, 4], and the shape of target_bbox is same as the shape of
-               the predicted_location, F is the number of the foreground
-               anchors. The predicted_scores is a 2D Tensor with shape
-               [F + B, C], and the shape of target_label is [F + B, 1], B is the
-               number of the background anchors, the F and B is depends on the
-               input of this operator. Bbox_inside_weight represents whether the
-               predicted location is fake foreground or not and the shape is [F, 4].
-               Fg_num is the foreground number (including fake foreground) which
-               is needed by focal loss.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          bbox_pred = layers.data(name='bbox_pred', shape=[1, 100, 4],
-                            append_batch_size=False, dtype='float32')
-          cls_logits = layers.data(name='cls_logits', shape=[1, 100, 10],
-                            append_batch_size=False, dtype='float32')
-          anchor_box = layers.data(name='anchor_box', shape=[100, 4],
-                            append_batch_size=False, dtype='float32')
-          anchor_var = layers.data(name='anchor_var', shape=[100, 4],
-                            append_batch_size=False, dtype='float32')
-          gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
-                            append_batch_size=False, dtype='float32')
-          gt_labels = layers.data(name='gt_labels', shape=[10, 1],
-                            append_batch_size=False, dtype='float32')
-          is_crowd = fluid.layers.data(name='is_crowd', shape=[1],
-                            append_batch_size=False, dtype='float32')
-          im_info = fluid.layers.data(name='im_infoss', shape=[1, 3],
-                            append_batch_size=False, dtype='float32')
-          loc_pred, score_pred, loc_target, score_target, bbox_inside_weight, fg_num =
-                fluid.layers.retinanet_target_assign(bbox_pred, cls_logits, anchor_box,
-                anchor_var, gt_boxes, gt_labels, is_crowd, im_info, 10)
-
-    """
-
-    helper = LayerHelper('retinanet_target_assign', **locals())
-    # Assign target label to anchors
-    loc_index = helper.create_variable_for_type_inference(dtype='int32')
-    score_index = helper.create_variable_for_type_inference(dtype='int32')
-    target_label = helper.create_variable_for_type_inference(dtype='int32')
-    target_bbox = helper.create_variable_for_type_inference(
-        dtype=anchor_box.dtype)
-    bbox_inside_weight = helper.create_variable_for_type_inference(
-        dtype=anchor_box.dtype)
-    fg_num = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type="retinanet_target_assign",
-        inputs={
-            'Anchor': anchor_box,
-            'GtBoxes': gt_boxes,
-            'GtLabels': gt_labels,
-            'IsCrowd': is_crowd,
-            'ImInfo': im_info
-        },
-        outputs={
-            'LocationIndex': loc_index,
-            'ScoreIndex': score_index,
-            'TargetLabel': target_label,
-            'TargetBBox': target_bbox,
-            'BBoxInsideWeight': bbox_inside_weight,
-            'ForegroundNumber': fg_num
-        },
-        attrs={
-            'positive_overlap': positive_overlap,
-            'negative_overlap': negative_overlap
-        })
-
-    loc_index.stop_gradient = True
-    score_index.stop_gradient = True
-    target_label.stop_gradient = True
-    target_bbox.stop_gradient = True
-    bbox_inside_weight.stop_gradient = True
-    fg_num.stop_gradient = True
-
-    cls_logits = nn.reshape(x=cls_logits, shape=(-1, num_classes))
-    bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4))
-    predicted_cls_logits = nn.gather(cls_logits, score_index)
-    predicted_bbox_pred = nn.gather(bbox_pred, loc_index)
-
-    return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox, bbox_inside_weight, fg_num
-
-
-def rpn_target_assign(bbox_pred,
-                      cls_logits,
-                      anchor_box,
-                      anchor_var,
-                      gt_boxes,
-                      is_crowd,
-                      im_info,
-                      rpn_batch_size_per_im=256,
-                      rpn_straddle_thresh=0.0,
-                      rpn_fg_fraction=0.5,
-                      rpn_positive_overlap=0.7,
-                      rpn_negative_overlap=0.3,
-                      use_random=True):
-    """
-    **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.**
-
-    Given the Intersection-over-Union (IoU) overlap between anchors and
-    ground-truth boxes, this layer assigns a classification target and a
-    regression target to each anchor; these targets are used to train the
-    RPN. The classification target is a binary class label (being an object
-    or not). Following the Faster-RCNN paper, an anchor is positive when
-    (i) it has the highest IoU overlap with a ground-truth box, or (ii) its
-    IoU overlap with any ground-truth box is higher than
-    rpn_positive_overlap (0.7). A single ground-truth box may therefore
-    assign positive labels to multiple anchors. An anchor is negative when
-    its IoU ratio is lower than rpn_negative_overlap (0.3) for all
-    ground-truth boxes. Anchors that are neither positive nor negative do
-    not contribute to the training objective. The regression targets are
-    the encoded ground-truth boxes associated with the positive anchors.
-
-    Args:
-        bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] that holds the
-            predicted locations of M bounding boxes. N is the batch size,
-            and each bounding box has four coordinate values laid out as
-            [xmin, ymin, xmax, ymax].
-        cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] that holds
-            the predicted confidences. N is the batch size, 1 is the
-            foreground/background sigmoid, M is the number of bounding boxes.
-        anchor_box(Variable): A 2-D Tensor with shape [M, 4] that holds M
-            boxes, each represented as [xmin, ymin, xmax, ymax].
-            [xmin, ymin] is the left top coordinate of the anchor box; if
-            the input is an image feature map, they are close to the origin
-            of the coordinate system. [xmax, ymax] is the right bottom
-            coordinate of the anchor box.
-        anchor_var(Variable): A 2-D Tensor with shape [M, 4] that holds the
-            expanded variances of the anchors.
-        gt_boxes (Variable): The ground-truth bounding boxes (bboxes), a 2-D
-            LoDTensor with shape [Ng, 4]; Ng is the total number of
-            ground-truth bboxes in the mini-batch.
-        is_crowd (Variable): A 1-D LoDTensor which indicates whether each
-            ground-truth box is crowd.
-        im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch
-            size, 3 is the height, width and scale.
-        rpn_batch_size_per_im(int): Total number of RPN examples per image.
-        rpn_straddle_thresh(float): Remove RPN anchors that go outside the
-            image by straddle_thresh pixels.
-        rpn_fg_fraction(float): Target fraction of RoI minibatch that is
-            labeled foreground (i.e. class > 0); the 0-th class is background.
-        rpn_positive_overlap(float): Minimum overlap required between an
-            anchor and a ground-truth box for the (anchor, gt box) pair to be
-            a positive example.
-        rpn_negative_overlap(float): Maximum overlap allowed between an
-            anchor and a ground-truth box for the (anchor, gt box) pair to be
-            a negative example.
-        use_random(bool): Forwarded unchanged as the ``use_random`` attribute
-            of the rpn_target_assign operator. Default: True.
-
-    Returns:
-        tuple:
-               A tuple(predicted_scores, predicted_location, target_label,
-               target_bbox, bbox_inside_weight) is returned. The
-               predicted_scores and predicted_location are the predicted
-               results of the RPN. The target_label and target_bbox are the
-               corresponding ground truth. The predicted_location is a 2-D
-               Tensor with shape [F, 4], and target_bbox has the same shape;
-               F is the number of foreground anchors. The predicted_scores
-               is a 2-D Tensor with shape [F + B, 1], and target_label has
-               the same shape; B is the number of background anchors. F and
-               B depend on the input of this operator. Bbox_inside_weight
-               represents whether the predicted loc is fake_fg or not and
-               its shape is [F, 4].
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            bbox_pred = fluid.layers.data(name='bbox_pred', shape=[100, 4],
-                            append_batch_size=False, dtype='float32')
-            cls_logits = fluid.layers.data(name='cls_logits', shape=[100, 1],
-                            append_batch_size=False, dtype='float32')
-            anchor_box = fluid.layers.data(name='anchor_box', shape=[20, 4],
-                            append_batch_size=False, dtype='float32')
-            anchor_var = fluid.layers.data(name='anchor_var', shape=[20, 4],
-                            append_batch_size=False, dtype='float32')
-            gt_boxes = fluid.layers.data(name='gt_boxes', shape=[10, 4],
-                            append_batch_size=False, dtype='float32')
-            is_crowd = fluid.layers.data(name='is_crowd', shape=[1],
-                            append_batch_size=False, dtype='float32')
-            im_info = fluid.layers.data(name='im_infoss', shape=[1, 3],
-                            append_batch_size=False, dtype='float32')
-            loc, score, loc_target, score_target, inside_weight = fluid.layers.rpn_target_assign(
-                bbox_pred, cls_logits, anchor_box, anchor_var, gt_boxes, is_crowd, im_info)
-
-    """
-
-    # Must run before any other local is bound: **locals() forwards exactly
-    # the function arguments to the helper.
-    helper = LayerHelper('rpn_target_assign', **locals())
-
-    # Placeholders for the operator outputs: the three index/label outputs
-    # are int32, the two box outputs follow the anchor dtype.
-    loc_index = helper.create_variable_for_type_inference(dtype='int32')
-    score_index = helper.create_variable_for_type_inference(dtype='int32')
-    target_label = helper.create_variable_for_type_inference(dtype='int32')
-    target_bbox = helper.create_variable_for_type_inference(
-        dtype=anchor_box.dtype)
-    bbox_inside_weight = helper.create_variable_for_type_inference(
-        dtype=anchor_box.dtype)
-
-    op_inputs = {
-        'Anchor': anchor_box,
-        'GtBoxes': gt_boxes,
-        'IsCrowd': is_crowd,
-        'ImInfo': im_info
-    }
-    op_outputs = {
-        'LocationIndex': loc_index,
-        'ScoreIndex': score_index,
-        'TargetLabel': target_label,
-        'TargetBBox': target_bbox,
-        'BBoxInsideWeight': bbox_inside_weight
-    }
-    op_attrs = {
-        'rpn_batch_size_per_im': rpn_batch_size_per_im,
-        'rpn_straddle_thresh': rpn_straddle_thresh,
-        'rpn_positive_overlap': rpn_positive_overlap,
-        'rpn_negative_overlap': rpn_negative_overlap,
-        'rpn_fg_fraction': rpn_fg_fraction,
-        'use_random': use_random
-    }
-    helper.append_op(
-        type="rpn_target_assign",
-        inputs=op_inputs,
-        outputs=op_outputs,
-        attrs=op_attrs)
-
-    # None of the assigned targets/indices take part in back-propagation.
-    for assigned in (loc_index, score_index, target_label, target_bbox,
-                     bbox_inside_weight):
-        assigned.stop_gradient = True
-
-    # Flatten the predictions, then keep only the rows selected by the
-    # operator's score/location indices.
-    flat_logits = nn.reshape(x=cls_logits, shape=(-1, 1))
-    flat_bbox = nn.reshape(x=bbox_pred, shape=(-1, 4))
-    predicted_cls_logits = nn.gather(flat_logits, score_index)
-    predicted_bbox_pred = nn.gather(flat_bbox, loc_index)
-
-    return (predicted_cls_logits, predicted_bbox_pred, target_label,
-            target_bbox, bbox_inside_weight)
-
-
-def sigmoid_focal_loss(x, label, fg_num, gamma=2, alpha=0.25):
-    """
-    **Sigmoid Focal Loss Operator.**
-
-    Focal loss is used to address the foreground-background class imbalance
-    that exists in the training phase of one-stage detectors. This operator
-    computes the sigmoid value for each element in the input tensor, after
-    which the focal loss is measured.
-
-    The focal loss is given as follows:
-
-    .. math::
-        loss_j = (-label_j * alpha * {(1 - \\sigma(x_j))}^{gamma} * \\log(\\sigma(x_j)) -
-        (1 - label_j) * (1 - alpha) * {\\sigma(x_j)}^{gamma} * \\log(1 - \\sigma(x_j)))
-        / fg\\_num, j = 1,...,K
-
-    We know that
-
-    .. math::
-        \\sigma(x_j) = \\frac{1}{1 + \\exp(-x_j)}
-
-    Args:
-        x(Variable): A 2-D tensor with shape [N, D], where N is the batch size
-            and D is the number of classes (excluding background). This input
-            is a tensor of logits computed by the previous operator.
-        label(Variable): A 2-D tensor with shape [N, 1], which is the
-            probabilistic labels.
-        fg_num(Variable): A 1-D tensor with shape [1], which is the number of
-            foreground.
-        gamma(float): Hyper-parameter to balance the easy and hard examples.
-            Default value is set to 2.0.
-        alpha(float): Hyper-parameter to balance the positive and negative
-            example. Default value is set to 0.25.
-
-    Returns:
-        out(Variable): A 2-D tensor with shape [N, D], which is the focal loss.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            input = fluid.layers.data(
-                name='data', shape=[10,80], append_batch_size=False, dtype='float32')
-            label = fluid.layers.data(
-                name='label', shape=[10,1], append_batch_size=False, dtype='int32')
-            fg_num = fluid.layers.data(
-                name='fg_num', shape=[1], append_batch_size=False, dtype='int32')
-            loss = fluid.layers.sigmoid_focal_loss(x=input,
-                                                   label=label,
-                                                   fg_num=fg_num,
-                                                   gamma=2.,
-                                                   alpha=0.25)
-    """
-
-    # Must run before any other local is bound: **locals() forwards exactly
-    # the function arguments to the helper.
-    helper = LayerHelper("sigmoid_focal_loss", **locals())
-
-    # The loss output shares the dtype of the logits.
-    focal_loss = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    op_inputs = {"X": x, "Label": label, "FgNum": fg_num}
-    op_attrs = {"gamma": gamma, 'alpha': alpha}
-    helper.append_op(
-        type="sigmoid_focal_loss",
-        inputs=op_inputs,
-        attrs=op_attrs,
-        outputs={"Out": focal_loss})
-    return focal_loss
-
-
-def detection_output(loc,
-                     scores,
-                     prior_box,
-                     prior_box_var,
-                     background_label=0,
-                     nms_threshold=0.3,
-                     nms_top_k=400,
-                     keep_top_k=200,
-                     score_threshold=0.01,
-                     nms_eta=1.0,
-                     return_index=False):
-    """
-    **Detection Output Layer for Single Shot Multibox Detector (SSD).**
-
-    This operation gets the detection results by performing the following
-    two steps:
-
-    1. Decode input bounding box predictions according to the prior boxes.
-    2. Get the final detection results by applying multi-class non maximum
-       suppression (NMS).
-
-    Please note, this operation doesn't clip the final output bounding boxes
-    to the image window.
-
-    Args:
-        loc(Variable): A 3-D Tensor with shape [N, M, 4] that holds the
-            predicted locations of M bounding boxes. N is the batch size,
-            and each bounding box has four coordinate values laid out as
-            [xmin, ymin, xmax, ymax].
-        scores(Variable): A 3-D Tensor with shape [N, M, C] that holds the
-            predicted confidences. N is the batch size, C is the class
-            number, M is the number of bounding boxes. For each category
-            there are M scores in total, corresponding to the M bounding
-            boxes.
-        prior_box(Variable): A 2-D Tensor with shape [M, 4] that holds M
-            boxes, each represented as [xmin, ymin, xmax, ymax].
-            [xmin, ymin] is the left top coordinate of the anchor box; if
-            the input is an image feature map, they are close to the origin
-            of the coordinate system. [xmax, ymax] is the right bottom
-            coordinate of the anchor box.
-        prior_box_var(Variable): A 2-D Tensor with shape [M, 4] that holds M
-            groups of variance.
-        background_label(int): The index of the background label; the
-            background label will be ignored. If set to -1, then all
-            categories will be considered. Default: 0.
-        nms_threshold(float): The threshold to be used in NMS.
-        nms_top_k(int): Maximum number of detections to be kept according
-            to the confidences after filtering detections based on
-            score_threshold.
-        keep_top_k(int): Number of total bboxes to be kept per image after
-            the NMS step. -1 means keeping all bboxes after the NMS step.
-        score_threshold(float): Threshold to filter out bounding boxes with
-            a low confidence score. If not provided, consider all boxes.
-        nms_eta(float): The parameter for adaptive NMS. Default: 1.0.
-        return_index(bool): Whether to return the selected index.
-            Default: False.
-
-    Returns:
-
-        A tuple with two Variables: (Out, Index) if return_index is True,
-        otherwise the single Variable Out.
-
-        Out: The detection outputs, a LoDTensor with shape [No, 6]. Each row
-        has six values: [label, confidence, xmin, ymin, xmax, ymax]. `No` is
-        the total number of detections in this mini-batch. For each instance,
-        the offsets in the first dimension are called LoD, the offset number
-        is N + 1, N is the batch size. The i-th image has
-        `LoD[i + 1] - LoD[i]` detected results; if it is 0, the i-th image
-        has no detected results.
-
-        If no image has detected results, LoD will be set to {1}, and the
-        output tensor only contains one value, which is -1.
-        (After version 1.3, when no boxes are detected, the lod is changed
-        from {0} to {1}.)
-
-        Index: Only returned when return_index is True. A 2-D LoDTensor with
-        shape [No, 1] holding the selected index, whose type is Integer.
-        The index is the absolute value across batches. No is the same
-        number as in Out. If the index is used to gather another attribute
-        such as age, one needs to reshape the input (N, M, 1) to (N * M, 1)
-        first, where N is the batch size and M is the number of boxes.
-
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            pb = fluid.layers.data(name='prior_box', shape=[10, 4],
-                         append_batch_size=False, dtype='float32')
-            pbv = fluid.layers.data(name='prior_box_var', shape=[10, 4],
-                          append_batch_size=False, dtype='float32')
-            loc = fluid.layers.data(name='target_box', shape=[2, 21, 4],
-                          append_batch_size=False, dtype='float32')
-            scores = fluid.layers.data(name='scores', shape=[2, 21, 10],
-                          append_batch_size=False, dtype='float32')
-            nmsed_outs, index = fluid.layers.detection_output(scores=scores,
-                                       loc=loc,
-                                       prior_box=pb,
-                                       prior_box_var=pbv,
-                                       return_index=True)
-    """
-    # Must run before any other local is bound: **locals() forwards exactly
-    # the function arguments to the helper.
-    helper = LayerHelper("detection_output", **locals())
-
-    # Step 1: decode the predicted offsets against the prior boxes.
-    decoded_box = box_coder(
-        prior_box=prior_box,
-        prior_box_var=prior_box_var,
-        target_box=loc,
-        code_type='decode_center_size')
-
-    # Step 2: turn the scores into per-class probabilities and move the
-    # class axis in front of the box axis before feeding the NMS operator.
-    scores = nn.softmax(input=scores)
-    scores = nn.transpose(scores, perm=[0, 2, 1])
-    scores.stop_gradient = True
-
-    nmsed_outs = helper.create_variable_for_type_inference(
-        dtype=decoded_box.dtype)
-
-    # Forward the caller's background_label and nms_eta instead of fixed
-    # constants, so the documented parameters actually take effect. With the
-    # default arguments (0 and 1.0) the attributes are the same as before.
-    nms_attrs = {
-        'background_label': background_label,
-        'nms_threshold': nms_threshold,
-        'nms_top_k': nms_top_k,
-        'keep_top_k': keep_top_k,
-        'score_threshold': score_threshold,
-        'nms_eta': nms_eta,
-    }
-    nms_inputs = {'Scores': scores, 'BBoxes': decoded_box}
-    nms_outputs = {'Out': nmsed_outs}
-
-    # Only the "multiclass_nms2" variant is given an 'Index' output.
-    if return_index:
-        index = helper.create_variable_for_type_inference(dtype='int')
-        nms_outputs['Index'] = index
-        nms_type = "multiclass_nms2"
-    else:
-        nms_type = "multiclass_nms"
-
-    helper.append_op(
-        type=nms_type,
-        inputs=nms_inputs,
-        outputs=nms_outputs,
-        attrs=nms_attrs)
-
-    nmsed_outs.stop_gradient = True
-    if return_index:
-        index.stop_gradient = True
-        return nmsed_outs, index
-    return nmsed_outs
-
-
-@templatedoc()
-def iou_similarity(x, y, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        y(${y_type}): ${y_comment}
-        name(str|None): Optional name for the output variable. When None,
-            an output variable is created by type inference instead.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            x = fluid.layers.data(name='x', shape=[4], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[4], dtype='float32')
-            iou = fluid.layers.iou_similarity(x=x, y=y)
-    """
-    # Must run before any other local is bound: **locals() forwards exactly
-    # the function arguments to the helper.
-    helper = LayerHelper("iou_similarity", **locals())
-
-    # A caller-supplied name yields an explicitly named, non-persistable
-    # output; otherwise the helper creates the output variable itself.
-    if name is not None:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-    else:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    op_inputs = {"X": x, "Y": y}
-    helper.append_op(
-        type="iou_similarity",
-        inputs=op_inputs,
-        attrs={},
-        outputs={"Out": out})
-    return out
-
-
-@templatedoc()
-def box_coder(prior_box,
-              prior_box_var,
-              target_box,
-              code_type="encode_center_size",
-              box_normalized=True,
-              name=None,
-              axis=0):
-    """
-    **Box Coder Layer**
-
-    Encode/Decode the target bounding box with the priorbox information.
-
-    The Encoding schema described below:
-
-    .. math::
-
-        ox = (tx - px) / pw / pxv
-
-        oy = (ty - py) / ph / pyv
-
-        ow = \\log(\\abs(tw / pw)) / pwv
-
-        oh = \\log(\\abs(th / ph)) / phv
-
-    The Decoding schema described below:
-
-    .. math::
-
-        ox = (pw * pxv * tx + px) - tw / 2
-
-        oy = (ph * pyv * ty + py) - th / 2
-
-        ow = \\exp(pwv * tw) * pw + tw / 2
-
-        oh = \\exp(phv * th) * ph + th / 2
-
-    where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates,
-    width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote
-    the priorbox's (anchor) center coordinates, width and height. `pxv`,
-    `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`,
-    `ow`, `oh` denote the encoded/decoded coordinates, width and height.
-
-    During Box Decoding, two modes for broadcast are supported. Say target
-    box has shape [N, M, 4], and the shape of prior box can be [N, 4] or
-    [M, 4]. Then prior box will broadcast to target box along the
-    assigned axis.
-
-    Args:
-        prior_box(Variable): Box list prior_box is a 2-D Tensor with shape
-                             [M, 4] that holds M boxes, each box is
-                             represented as [xmin, ymin, xmax, ymax].
-                             [xmin, ymin] is the left top coordinate of the
-                             anchor box; if the input is an image feature
-                             map, they are close to the origin of the
-                             coordinate system. [xmax, ymax] is the right
-                             bottom coordinate of the anchor box.
-        prior_box_var(Variable|list): prior_box_var supports two types of
-                              input. One is a Variable with shape [M, 4]
-                              that holds M groups, passed to the operator
-                              as the 'PriorBoxVar' input. The other is a
-                              list of 4 elements shared by all boxes,
-                              passed as the 'variance' attribute. Any other
-                              type (including None) raises TypeError.
-        target_box(Variable): This input can be a 2-D LoDTensor with shape
-                              [N, 4] when code_type is 'encode_center_size'.
-                              This input also can be a 3-D Tensor with shape
-                              [N, M, 4] when code_type is
-                              'decode_center_size'. Each box is represented
-                              as [xmin, ymin, xmax, ymax]. This tensor can
-                              contain LoD information to represent a batch
-                              of inputs.
-        code_type(string): The code type used with the target box. It can be
-                           encode_center_size or decode_center_size.
-        box_normalized(bool): Whether to treat the priorbox as a normalized
-                             box. Set true by default.
-        name(string): The name of box coder.
-        axis(int): Which axis in PriorBox to broadcast for box decode,
-                   for example, if axis is 0 and TargetBox has shape
-                   [N, M, 4] and PriorBox has shape [M, 4], then PriorBox
-                   will broadcast to [N, M, 4] for decoding. It is only valid
-                   when code type is decode_center_size. Set 0 by default.
-
-    Returns:
-        output_box(Variable): When code_type is 'encode_center_size', the
-                              output tensor of box_coder_op with shape
-                              [N, M, 4] representing the result of N target
-                              boxes encoded with M Prior boxes and variances.
-                              When code_type is 'decode_center_size',
-                              N represents the batch size and M represents
-                              the number of decoded boxes.
-
-    Raises:
-        TypeError: prior_box_var is neither a Variable nor a list.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            prior_box = fluid.layers.data(name='prior_box',
-                                          shape=[512, 4],
-                                          dtype='float32',
-                                          append_batch_size=False)
-            target_box = fluid.layers.data(name='target_box',
-                                           shape=[512,81,4],
-                                           dtype='float32',
-                                           append_batch_size=False)
-            output = fluid.layers.box_coder(prior_box=prior_box,
-                                            prior_box_var=[0.1,0.1,0.2,0.2],
-                                            target_box=target_box,
-                                            code_type="decode_center_size",
-                                            box_normalized=False,
-                                            axis=1)
-
-    """
-    # Must run before any other local is bound: **locals() forwards exactly
-    # the function arguments to the helper.
-    helper = LayerHelper("box_coder", **locals())
-
-    # A caller-supplied name yields an explicitly named, non-persistable
-    # output; otherwise the helper creates the output variable itself.
-    if name is None:
-        output_box = helper.create_variable_for_type_inference(
-            dtype=prior_box.dtype)
-    else:
-        output_box = helper.create_variable(
-            name=name, dtype=prior_box.dtype, persistable=False)
-
-    inputs = {"PriorBox": prior_box, "TargetBox": target_box}
-    attrs = {
-        "code_type": code_type,
-        "box_normalized": box_normalized,
-        "axis": axis
-    }
-    # The variance travels either as a tensor input (per-box values) or as
-    # an attribute (one 4-element list shared by all boxes).
-    if isinstance(prior_box_var, Variable):
-        inputs['PriorBoxVar'] = prior_box_var
-    elif isinstance(prior_box_var, list):
-        attrs['variance'] = prior_box_var
-    else:
-        raise TypeError("Input variance of box_coder must be Variable or list")
-    helper.append_op(
-        type="box_coder",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"OutputBox": output_box})
-    return output_box
-
-
-@templatedoc()
-def polygon_box_transform(input, name=None):
-    """
-    ${comment}
-
-    Args:
-        input(${input_type}): ${input_comment}
-        name(str|None): Optional name for the output variable. When None,
-            an output variable is created by type inference instead.
-
-    Returns:
-        output(${output_type}): ${output_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name='input', shape=[4, 10, 5, 5],
-                                      append_batch_size=False, dtype='float32')
-            out = fluid.layers.polygon_box_transform(input)
-    """
-    # Must run before any other local is bound: **locals() forwards exactly
-    # the function arguments to the helper.
-    helper = LayerHelper("polygon_box_transform", **locals())
-    if name is None:
-        output = helper.create_variable_for_type_inference(dtype=input.dtype)
-    else:
-        # The named output takes the dtype of `input`, the same dtype the
-        # unnamed branch uses.
-        output = helper.create_variable(
-            name=name, dtype=input.dtype, persistable=False)
-
-    helper.append_op(
-        type="polygon_box_transform",
-        inputs={"Input": input},
-        attrs={},
-        outputs={"Output": output})
-    return output
-
-
-@templatedoc(op_type="yolov3_loss")
-def yolov3_loss(x,
-                gt_box,
-                gt_label,
-                anchors,
-                anchor_mask,
-                class_num,
-                ignore_thresh,
-                downsample_ratio,
-                gt_score=None,
-                use_label_smooth=True,
-                name=None):
-    """
-    ${comment}
-
-    Args:
-        x (Variable): ${x_comment}
-        gt_box (Variable): ground truth boxes, should be in shape of
-                          [N, B, 4]; in the third dimension, x, y, w, h
-                          should be stored. x, y is the center coordinate of
-                          boxes, w, h are the width and height; x, y, w, h
-                          should be divided by input image height to scale
-                          to [0, 1]. N is the batch number and B is the max
-                          box number in an image.
-        gt_label (Variable): class id of ground truth boxes, should be in
-                            shape of [N, B].
-        anchors (list|tuple): ${anchors_comment}
-        anchor_mask (list|tuple): ${anchor_mask_comment}
-        class_num (int): ${class_num_comment}
-        ignore_thresh (float): ${ignore_thresh_comment}
-        downsample_ratio (int): ${downsample_ratio_comment}
-        name (string): the name of yolov3 loss. Default None.
-        gt_score (Variable): mixup score of ground truth boxes, should be in
-                            shape of [N, B]. Default None.
-        use_label_smooth (bool): ${use_label_smooth_comment}
-
-    Returns:
-        Variable: A 1-D tensor with shape [N], the value of yolov3 loss
-
-    Raises:
-        TypeError: Input x of yolov3_loss must be Variable
-        TypeError: Input gtbox of yolov3_loss must be Variable
-        TypeError: Input gtlabel of yolov3_loss must be Variable
-        TypeError: Input gtscore of yolov3_loss must be None or Variable
-        TypeError: Attr anchors of yolov3_loss must be list or tuple
-        TypeError: Attr anchor_mask of yolov3_loss must be list or tuple
-        TypeError: Attr class_num of yolov3_loss must be an integer
-        TypeError: Attr ignore_thresh of yolov3_loss must be a float number
-        TypeError: Attr use_label_smooth of yolov3_loss must be a bool value
-
-    Examples:
-      .. code-block:: python
-
-          import paddle.fluid as fluid
-          x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
-          gt_box = fluid.layers.data(name='gt_box', shape=[6, 4], dtype='float32')
-          gt_label = fluid.layers.data(name='gt_label', shape=[6], dtype='int32')
-          gt_score = fluid.layers.data(name='gt_score', shape=[6], dtype='float32')
-          anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
-          anchor_mask = [0, 1, 2]
-          loss = fluid.layers.yolov3_loss(x=x, gt_box=gt_box, gt_label=gt_label,
-                                          gt_score=gt_score, anchors=anchors,
-                                          anchor_mask=anchor_mask, class_num=80,
-                                          ignore_thresh=0.7, downsample_ratio=32)
-    """
-    # Must run before any other local is bound: **locals() forwards exactly
-    # the function arguments to the helper.
-    helper = LayerHelper('yolov3_loss', **locals())
-
-    # Validate argument types up front so mistakes surface here rather than
-    # when the operator is built.
-    if not isinstance(x, Variable):
-        raise TypeError("Input x of yolov3_loss must be Variable")
-    if not isinstance(gt_box, Variable):
-        raise TypeError("Input gtbox of yolov3_loss must be Variable")
-    if not isinstance(gt_label, Variable):
-        raise TypeError("Input gtlabel of yolov3_loss must be Variable")
-    if gt_score is not None and not isinstance(gt_score, Variable):
-        raise TypeError(
-            "Input gtscore of yolov3_loss must be None or Variable")
-    if not isinstance(anchors, (list, tuple)):
-        raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
-    if not isinstance(anchor_mask, (list, tuple)):
-        raise TypeError("Attr anchor_mask of yolov3_loss must be list or tuple")
-    if not isinstance(class_num, int):
-        raise TypeError("Attr class_num of yolov3_loss must be an integer")
-    if not isinstance(ignore_thresh, float):
-        raise TypeError(
-            "Attr ignore_thresh of yolov3_loss must be a float number")
-    if not isinstance(use_label_smooth, bool):
-        raise TypeError(
-            "Attr use_label_smooth of yolov3_loss must be a bool value")
-
-    # A caller-supplied name yields an explicitly named, non-persistable
-    # output; otherwise the helper creates the output variable itself.
-    if name is None:
-        loss = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        loss = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    # Auxiliary operator outputs; they are wired to the op but not returned.
-    objectness_mask = helper.create_variable_for_type_inference(dtype='int32')
-    gt_match_mask = helper.create_variable_for_type_inference(dtype='int32')
-
-    inputs = {
-        "X": x,
-        "GTBox": gt_box,
-        "GTLabel": gt_label,
-    }
-    # Test against None explicitly: whether GTScore is passed must depend on
-    # the argument being supplied, not on the truth value of a Variable.
-    if gt_score is not None:
-        inputs["GTScore"] = gt_score
-
-    attrs = {
-        "anchors": anchors,
-        "anchor_mask": anchor_mask,
-        "class_num": class_num,
-        "ignore_thresh": ignore_thresh,
-        "downsample_ratio": downsample_ratio,
-        "use_label_smooth": use_label_smooth,
-    }
-
-    helper.append_op(
-        type='yolov3_loss',
-        inputs=inputs,
-        outputs={
-            'Loss': loss,
-            'ObjectnessMask': objectness_mask,
-            'GTMatchMask': gt_match_mask
-        },
-        attrs=attrs)
-    return loss
-
-
-@templatedoc(op_type="yolo_box")
-def yolo_box(x,
-             img_size,
-             anchors,
-             class_num,
-             conf_thresh,
-             downsample_ratio,
-             name=None):
-    """
-    ${comment}
-
-    Args:
-        x (Variable): ${x_comment}
-        img_size (Variable): ${img_size_comment}
-        anchors (list|tuple): ${anchors_comment}
-        class_num (int): ${class_num_comment}
-        conf_thresh (float): ${conf_thresh_comment}
-        downsample_ratio (int): ${downsample_ratio_comment}
-        name (string): the name of yolo box layer. Default None.
-
-    Returns:
-        Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes,
-        and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification 
-        scores of boxes.
-
-    Raises:
-        TypeError: Input x of yolov_box must be Variable
-        TypeError: Attr anchors of yolo box must be list or tuple
-        TypeError: Attr class_num of yolo box must be an integer
-        TypeError: Attr conf_thresh of yolo box must be a float number
-
-    Examples:
-
-    .. code-block:: python
-
-        import paddle.fluid as fluid
-        x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
-        img_size = fluid.layers.data(name='img_size',shape=[2],dtype='int64')
-        anchors = [10, 13, 16, 30, 33, 23]
-        boxes,scores = fluid.layers.yolo_box(x=x, img_size=img_size, class_num=80, anchors=anchors, 
-                                        conf_thresh=0.01, downsample_ratio=32)
-    """
-    helper = LayerHelper('yolo_box', **locals())
-
-    if not isinstance(x, Variable):
-        raise TypeError("Input x of yolo_box must be Variable")
-    if not isinstance(img_size, Variable):
-        raise TypeError("Input img_size of yolo_box must be Variable")
-    if not isinstance(anchors, list) and not isinstance(anchors, tuple):
-        raise TypeError("Attr anchors of yolo_box must be list or tuple")
-    if not isinstance(class_num, int):
-        raise TypeError("Attr class_num of yolo_box must be an integer")
-    if not isinstance(conf_thresh, float):
-        raise TypeError("Attr ignore_thresh of yolo_box must be a float number")
-
-    boxes = helper.create_variable_for_type_inference(dtype=x.dtype)
-    scores = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    attrs = {
-        "anchors": anchors,
-        "class_num": class_num,
-        "conf_thresh": conf_thresh,
-        "downsample_ratio": downsample_ratio,
-    }
-
-    helper.append_op(
-        type='yolo_box',
-        inputs={
-            "X": x,
-            "ImgSize": img_size,
-        },
-        outputs={
-            'Boxes': boxes,
-            'Scores': scores,
-        },
-        attrs=attrs)
-    return boxes, scores
-
-
-@templatedoc()
-def detection_map(detect_res,
-                  label,
-                  class_num,
-                  background_label=0,
-                  overlap_threshold=0.3,
-                  evaluate_difficult=True,
-                  has_state=None,
-                  input_states=None,
-                  out_states=None,
-                  ap_version='integral'):
-    """
-    ${comment}
-
-    Args:
-        detect_res: ${detect_res_comment}
-        label:  ${label_comment}
-        class_num: ${class_num_comment}
-        background_label: ${background_label_comment}
-        overlap_threshold: ${overlap_threshold_comment}
-        evaluate_difficult: ${evaluate_difficult_comment}
-        has_state: ${has_state_comment}
-        input_states: If not None, It contains 3 elements:
-            1. pos_count ${pos_count_comment}.
-            2. true_pos ${true_pos_comment}.
-            3. false_pos ${false_pos_comment}.
-        out_states: If not None, it contains 3 elements.
-            1. accum_pos_count ${accum_pos_count_comment}.
-            2. accum_true_pos ${accum_true_pos_comment}.
-            3. accum_false_pos ${accum_false_pos_comment}.
-        ap_version: ${ap_type_comment}
-
-    Returns:
-        ${map_comment}
-
-
-    Examples:
-          .. code-block:: python
-
-            import paddle.fluid as fluid
-            from fluid.layers import detection
-            detect_res = fluid.layers.data(
-                name='detect_res',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
-            label = fluid.layers.data(
-                name='label',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
-
-            map_out = detection.detection_map(detect_res, label, 21)
-    """
-    helper = LayerHelper("detection_map", **locals())
-
-    def __create_var(type):
-        return helper.create_variable_for_type_inference(dtype=type)
-
-    map_out = __create_var('float32')
-    accum_pos_count_out = out_states[0] if out_states else __create_var('int32')
-    accum_true_pos_out = out_states[1] if out_states else __create_var(
-        'float32')
-    accum_false_pos_out = out_states[2] if out_states else __create_var(
-        'float32')
-
-    pos_count = input_states[0] if input_states else None
-    true_pos = input_states[1] if input_states else None
-    false_pos = input_states[2] if input_states else None
-
-    helper.append_op(
-        type="detection_map",
-        inputs={
-            'Label': label,
-            'DetectRes': detect_res,
-            'HasState': has_state,
-            'PosCount': pos_count,
-            'TruePos': true_pos,
-            'FalsePos': false_pos
-        },
-        outputs={
-            'MAP': map_out,
-            'AccumPosCount': accum_pos_count_out,
-            'AccumTruePos': accum_true_pos_out,
-            'AccumFalsePos': accum_false_pos_out
-        },
-        attrs={
-            'overlap_threshold': overlap_threshold,
-            'evaluate_difficult': evaluate_difficult,
-            'ap_type': ap_version,
-            'class_num': class_num,
-        })
-    return map_out
-
-
-def bipartite_match(dist_matrix,
-                    match_type=None,
-                    dist_threshold=None,
-                    name=None):
-    """
-    This operator implements a greedy bipartite matching algorithm, which is
-    used to obtain the matching with the maximum distance based on the input
-    distance matrix. For input 2D matrix, the bipartite matching algorithm can
-    find the matched column for each row (matched means the largest distance),
-    also can find the matched row for each column. And this operator only
-    calculate matched indices from column to row. For each instance,
-    the number of matched indices is the column number of the input distance
-    matrix.
-
-    There are two outputs, matched indices and distance.
-    A simple description, this algorithm matched the best (maximum distance)
-    row entity to the column entity and the matched indices are not duplicated
-    in each row of ColToRowMatchIndices. If the column entity is not matched
-    any row entity, set -1 in ColToRowMatchIndices.
-
-    NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor.
-    If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
-    If Tensor, the height of ColToRowMatchIndices is 1.
-
-    NOTE: This API is a very low level API. It is used by :code:`ssd_loss`
-    layer. Please consider to use :code:`ssd_loss` instead.
-
-    Args:
-        dist_matrix(Variable): This input is a 2-D LoDTensor with shape
-            [K, M]. It is pair-wise distance matrix between the entities
-            represented by each row and each column. For example, assumed one
-            entity is A with shape [K], another entity is B with shape [M]. The
-            dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger
-            the distance is, the better matching the pairs are.
-
-            NOTE: This tensor can contain LoD information to represent a batch
-            of inputs. One instance of this batch can contain different numbers
-            of entities.
-        match_type(string|None): The type of matching method, should be
-           'bipartite' or 'per_prediction'. [default 'bipartite'].
-        dist_threshold(float|None): If `match_type` is 'per_prediction',
-            this threshold is to determine the extra matching bboxes based
-            on the maximum distance, 0.5 by default.
-    Returns:
-        tuple: a tuple with two elements is returned. The first is
-        matched_indices, the second is matched_distance.
-
-        The matched_indices is a 2-D Tensor with shape [N, M] in int type.
-        N is the batch size. If match_indices[i][j] is -1, it
-        means B[j] does not match any entity in i-th instance.
-        Otherwise, it means B[j] is matched to row
-        match_indices[i][j] in i-th instance. The row number of
-        i-th instance is saved in match_indices[i][j].
-
-        The matched_distance is a 2-D Tensor with shape [N, M] in float type
-        . N is batch size. If match_indices[i][j] is -1,
-        match_distance[i][j] is also -1.0. Otherwise, assumed
-        match_distance[i][j] = d, and the row offsets of each instance
-        are called LoD. Then match_distance[i][j] =
-        dist_matrix[d+LoD[i]][j].
-
-    Examples:
-
-        >>> import paddle.fluid as fluid
-        >>> x = fluid.layers.data(name='x', shape=[4], dtype='float32')
-        >>> y = fluid.layers.data(name='y', shape=[4], dtype='float32')
-        >>> iou = fluid.layers.iou_similarity(x=x, y=y)
-        >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
-    """
-    helper = LayerHelper('bipartite_match', **locals())
-    match_indices = helper.create_variable_for_type_inference(dtype='int32')
-    match_distance = helper.create_variable_for_type_inference(
-        dtype=dist_matrix.dtype)
-    helper.append_op(
-        type='bipartite_match',
-        inputs={'DistMat': dist_matrix},
-        attrs={
-            'match_type': match_type,
-            'dist_threshold': dist_threshold,
-        },
-        outputs={
-            'ColToRowMatchIndices': match_indices,
-            'ColToRowMatchDist': match_distance
-        })
-    return match_indices, match_distance
-
-
-def target_assign(input,
-                  matched_indices,
-                  negative_indices=None,
-                  mismatch_value=None,
-                  name=None):
-    """
-    This operator can be, for given the target bounding boxes or labels,
-    to assign classification and regression targets to each prediction as well as
-    weights to prediction. The weights is used to specify which prediction would
-    not contribute to training loss.
-
-    For each instance, the output `out` and`out_weight` are assigned based on
-    `match_indices` and `negative_indices`.
-    Assumed that the row offset for each instance in `input` is called lod,
-    this operator assigns classification/regression targets by performing the
-    following steps:
-
-    1. Assigning all outputs based on `match_indices`:
-
-    .. code-block:: text
-
-        If id = match_indices[i][j] > 0,
-
-            out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
-            out_weight[i][j] = 1.
-
-        Otherwise,
-
-            out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
-            out_weight[i][j] = 0.
-
-    2. Assigning out_weight based on `neg_indices` if `neg_indices` is provided:
-
-    Assumed that the row offset for each instance in `neg_indices` is called neg_lod,
-    for i-th instance and each `id` of neg_indices in this instance:
-
-    .. code-block:: text
-
-        out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
-        out_weight[i][id] = 1.0
-
-    Args:
-       inputs (Variable): This input is a 3D LoDTensor with shape [M, P, K].
-       matched_indices (Variable): Tensor<int>), The input matched indices
-           is 2D Tenosr<int32> with shape [N, P], If MatchIndices[i][j] is -1,
-           the j-th entity of column is not matched to any entity of row in
-           i-th instance.
-       negative_indices (Variable): The input negative example indices are
-           an optional input with shape [Neg, 1] and int32 type, where Neg is
-           the total number of negative example indices.
-       mismatch_value (float32): Fill this value to the mismatched location.
-
-    Returns:
-        tuple:
-               A tuple(out, out_weight) is returned. out is a 3D Tensor with
-               shape [N, P, K], N and P is the same as they are in
-               `neg_indices`, K is the same as it in input of X. If
-               `match_indices[i][j]`. out_weight is the weight for output with
-               the shape of [N, P, 1].
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(
-                name='x',
-                shape=[4, 20, 4],
-                dtype='float',
-                lod_level=1,
-                append_batch_size=False)
-            matched_id = fluid.layers.data(
-                name='indices',
-                shape=[8, 20],
-                dtype='int32',
-                append_batch_size=False)
-            trg, trg_weight = fluid.layers.target_assign(
-                x,
-                matched_id,
-                mismatch_value=0)
-    """
-    helper = LayerHelper('target_assign', **locals())
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    out_weight = helper.create_variable_for_type_inference(dtype='float32')
-    helper.append_op(
-        type='target_assign',
-        inputs={
-            'X': input,
-            'MatchIndices': matched_indices,
-            'NegIndices': negative_indices
-        },
-        outputs={'Out': out,
-                 'OutWeight': out_weight},
-        attrs={'mismatch_value': mismatch_value})
-    return out, out_weight
-
-
-def ssd_loss(location,
-             confidence,
-             gt_box,
-             gt_label,
-             prior_box,
-             prior_box_var=None,
-             background_label=0,
-             overlap_threshold=0.5,
-             neg_pos_ratio=3.0,
-             neg_overlap=0.5,
-             loc_loss_weight=1.0,
-             conf_loss_weight=1.0,
-             match_type='per_prediction',
-             mining_type='max_negative',
-             normalize=True,
-             sample_size=None):
-    """
-    **Multi-box loss layer for object detection algorithm of SSD**
-
-    This layer is to compute detection loss for SSD given the location offset
-    predictions, confidence predictions, prior boxes and ground-truth bounding
-    boxes and labels, and the type of hard example mining. The returned loss
-    is a weighted sum of the localization loss (or regression loss) and
-    confidence loss (or classification loss) by performing the following steps:
-
-    1. Find matched bounding box by bipartite matching algorithm.
-
-      1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
-
-      1.2 Compute matched boundding box by bipartite matching algorithm.
-
-    2. Compute confidence for mining hard examples
-
-      2.1. Get the target label based on matched indices.
-
-      2.2. Compute confidence loss.
-
-    3. Apply hard example mining to get the negative example indices and update
-       the matched indices.
-
-    4. Assign classification and regression targets
-
-      4.1. Encoded bbox according to the prior boxes.
-
-      4.2. Assign regression targets.
-
-      4.3. Assign classification targets.
-
-    5. Compute the overall objective loss.
-
-      5.1 Compute confidence loss.
-
-      5.2 Compute localization loss.
-
-      5.3 Compute the overall weighted loss.
-
-    Args:
-        location (Variable): The location predictions are a 3D Tensor with
-            shape [N, Np, 4], N is the batch size, Np is total number of
-            predictions for each instance. 4 is the number of coordinate values,
-            the layout is [xmin, ymin, xmax, ymax].
-        confidence (Variable): The confidence predictions are a 3D Tensor
-            with shape [N, Np, C], N and Np are the same as they are in
-            `location`, C is the class number.
-        gt_box (Variable): The ground-truth bounding boxes (bboxes) are a 2D
-            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
-            bboxes of mini-batch input.
-        gt_label (Variable): The ground-truth labels are a 2D LoDTensor
-            with shape [Ng, 1].
-        prior_box (Variable): The prior boxes are a 2D Tensor with shape [Np, 4].
-        prior_box_var (Variable): The variance of prior boxes are a 2D Tensor
-            with shape [Np, 4].
-        background_label (int): The index of background label, 0 by default.
-        overlap_threshold (float): If match_type is 'per_prediction', use
-            `overlap_threshold` to determine the extra matching bboxes when
-             finding matched boxes. 0.5 by default.
-        neg_pos_ratio (float): The ratio of the negative boxes to the positive
-            boxes, used only when mining_type is 'max_negative', 3.0 by default.
-        neg_overlap (float): The negative overlap upper bound for the unmatched
-            predictions. Use only when mining_type is 'max_negative',
-            0.5 by default.
-        loc_loss_weight (float): Weight for localization loss, 1.0 by default.
-        conf_loss_weight (float): Weight for confidence loss, 1.0 by default.
-        match_type (str): The type of matching method during training, should
-            be 'bipartite' or 'per_prediction', 'per_prediction' by default.
-        mining_type (str): The hard example mining type, should be 'hard_example'
-            or 'max_negative', now only support `max_negative`.
-        normalize (bool): Whether to normalize the SSD loss by the total number
-            of output locations, True by default.
-        sample_size (int): The max sample size of negative box, used only when
-            mining_type is 'hard_example'.
-
-    Returns:
-        The weighted sum of the localization loss and confidence loss, with \
-        shape [N * Np, 1], N and Np are the same as they are in `location`.
-
-    Raises:
-        ValueError: If mining_type is 'hard_example', now only support mining \
-        type of `max_negative`.
-
-    Examples:
-        >>> import paddle.fluid as fluid
-        >>> pb = fluid.layers.data(
-        >>>                   name='prior_box',
-        >>>                   shape=[10, 4],
-        >>>                   append_batch_size=False,
-        >>>                   dtype='float32')
-        >>> pbv = fluid.layers.data(
-        >>>                   name='prior_box_var',
-        >>>                   shape=[10, 4],
-        >>>                   append_batch_size=False,
-        >>>                   dtype='float32')
-        >>> loc = fluid.layers.data(name='target_box', shape=[10, 4], dtype='float32')
-        >>> scores = fluid.layers.data(name='scores', shape=[10, 21], dtype='float32')
-        >>> gt_box = fluid.layers.data(
-        >>>         name='gt_box', shape=[4], lod_level=1, dtype='float32')
-        >>> gt_label = fluid.layers.data(
-        >>>         name='gt_label', shape=[1], lod_level=1, dtype='float32')
-        >>> loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
-    """
-
-    helper = LayerHelper('ssd_loss', **locals())
-    if mining_type != 'max_negative':
-        raise ValueError("Only support mining_type == max_negative now.")
-
-    num, num_prior, num_class = confidence.shape
-    conf_shape = nn.shape(confidence)
-
-    def __reshape_to_2d(var):
-        return nn.flatten(x=var, axis=2)
-
-    # 1. Find matched boundding box by prior box.
-    #   1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
-    iou = iou_similarity(x=gt_box, y=prior_box)
-    #   1.2 Compute matched boundding box by bipartite matching algorithm.
-    matched_indices, matched_dist = bipartite_match(iou, match_type,
-                                                    overlap_threshold)
-
-    # 2. Compute confidence for mining hard examples
-    # 2.1. Get the target label based on matched indices
-    gt_label = nn.reshape(
-        x=gt_label, shape=(len(gt_label.shape) - 1) * (0, ) + (-1, 1))
-    gt_label.stop_gradient = True
-    target_label, _ = target_assign(
-        gt_label, matched_indices, mismatch_value=background_label)
-    # 2.2. Compute confidence loss.
-    # Reshape confidence to 2D tensor.
-    confidence = __reshape_to_2d(confidence)
-    target_label = tensor.cast(x=target_label, dtype='int64')
-    target_label = __reshape_to_2d(target_label)
-    target_label.stop_gradient = True
-    conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
-    # 3. Mining hard examples
-    actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2])
-    actual_shape.stop_gradient = True
-    # shape=(-1, 0) is set for compile-time, the correct shape is set by
-    # actual_shape in runtime.
-    conf_loss = nn.reshape(
-        x=conf_loss, shape=(-1, 0), actual_shape=actual_shape)
-    conf_loss.stop_gradient = True
-    neg_indices = helper.create_variable_for_type_inference(dtype='int32')
-    dtype = matched_indices.dtype
-    updated_matched_indices = helper.create_variable_for_type_inference(
-        dtype=dtype)
-    helper.append_op(
-        type='mine_hard_examples',
-        inputs={
-            'ClsLoss': conf_loss,
-            'LocLoss': None,
-            'MatchIndices': matched_indices,
-            'MatchDist': matched_dist,
-        },
-        outputs={
-            'NegIndices': neg_indices,
-            'UpdatedMatchIndices': updated_matched_indices
-        },
-        attrs={
-            'neg_pos_ratio': neg_pos_ratio,
-            'neg_dist_threshold': neg_overlap,
-            'mining_type': mining_type,
-            'sample_size': sample_size,
-        })
-
-    # 4. Assign classification and regression targets
-    # 4.1. Encoded bbox according to the prior boxes.
-    encoded_bbox = box_coder(
-        prior_box=prior_box,
-        prior_box_var=prior_box_var,
-        target_box=gt_box,
-        code_type='encode_center_size')
-    # 4.2. Assign regression targets
-    target_bbox, target_loc_weight = target_assign(
-        encoded_bbox, updated_matched_indices, mismatch_value=background_label)
-    # 4.3. Assign classification targets
-    target_label, target_conf_weight = target_assign(
-        gt_label,
-        updated_matched_indices,
-        negative_indices=neg_indices,
-        mismatch_value=background_label)
-
-    # 5. Compute loss.
-    # 5.1 Compute confidence loss.
-    target_label = __reshape_to_2d(target_label)
-    target_label = tensor.cast(x=target_label, dtype='int64')
-
-    conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
-    target_conf_weight = __reshape_to_2d(target_conf_weight)
-    conf_loss = conf_loss * target_conf_weight
-
-    # the target_label and target_conf_weight do not have gradient.
-    target_label.stop_gradient = True
-    target_conf_weight.stop_gradient = True
-
-    # 5.2 Compute regression loss.
-    location = __reshape_to_2d(location)
-    target_bbox = __reshape_to_2d(target_bbox)
-
-    loc_loss = nn.smooth_l1(location, target_bbox)
-    target_loc_weight = __reshape_to_2d(target_loc_weight)
-    loc_loss = loc_loss * target_loc_weight
-
-    # the target_bbox and target_loc_weight do not have gradient.
-    target_bbox.stop_gradient = True
-    target_loc_weight.stop_gradient = True
-
-    # 5.3 Compute overall weighted loss.
-    loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
-    # reshape to [N, Np], N is the batch size and Np is the prior box number.
-    # shape=(-1, 0) is set for compile-time, the correct shape is set by
-    # actual_shape in runtime.
-    loss = nn.reshape(x=loss, shape=(-1, 0), actual_shape=actual_shape)
-    loss = nn.reduce_sum(loss, dim=1, keep_dim=True)
-    if normalize:
-        normalizer = nn.reduce_sum(target_loc_weight)
-        loss = loss / normalizer
-
-    return loss
-
-
-def prior_box(input,
-              image,
-              min_sizes,
-              max_sizes=None,
-              aspect_ratios=[1.],
-              variance=[0.1, 0.1, 0.2, 0.2],
-              flip=False,
-              clip=False,
-              steps=[0.0, 0.0],
-              offset=0.5,
-              name=None,
-              min_max_aspect_ratios_order=False):
-    """
-    **Prior Box Operator**
-
-    Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
-    Each position of the input produce N prior boxes, N is determined by
-    the count of min_sizes, max_sizes and aspect_ratios, The size of the
-    box is in range(min_size, max_size) interval, which is generated in
-    sequence according to the aspect_ratios.
-
-    Args:
-       input(Variable): The Input Variables, the format is NCHW.
-       image(Variable): The input image data of PriorBoxOp,
-            the layout is NCHW.
-       min_sizes(list|tuple|float value): min sizes of generated prior boxes.
-       max_sizes(list|tuple|None): max sizes of generated prior boxes.
-            Default: None.
-       aspect_ratios(list|tuple|float value): the aspect ratios of generated
-            prior boxes. Default: [1.].
-       variance(list|tuple): the variances to be encoded in prior boxes.
-            Default:[0.1, 0.1, 0.2, 0.2].
-       flip(bool): Whether to flip aspect ratios. Default:False.
-       clip(bool): Whether to clip out-of-boundary boxes. Default: False.
-       step(list|tuple): Prior boxes step across width and height, If
-            step[0] == 0.0/step[1] == 0.0, the prior boxes step across
-            height/weight of the input will be automatically calculated.
-            Default: [0., 0.]
-       offset(float): Prior boxes center offset. Default: 0.5
-       name(str): Name of the prior box op. Default: None.
-       min_max_aspect_ratios_order(bool): If set True, the output prior box is
-            in order of [min, max, aspect_ratios], which is consistent with
-            Caffe. Please note, this order affects the weights order of
-            convolution layer followed by and does not affect the final
-            detection results. Default: False.
-
-    Returns:
-        tuple: A tuple with two Variable (boxes, variances)
-
-        boxes: the output prior boxes of PriorBox.
-        The layout is [H, W, num_priors, 4].
-        H is the height of input, W is the width of input,
-        num_priors is the total
-        box count of each position of input.
-
-        variances: the expanded variances of PriorBox.
-        The layout is [H, W, num_priors, 4].
-        H is the height of input, W is the width of input
-        num_priors is the total
-        box count of each position of input
-
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[3,6,9])
-            images = fluid.layers.data(name="images", shape=[3,9,12])
-            box, var = fluid.layers.prior_box(
-                input=input,
-                image=images,
-                min_sizes=[100.],
-                flip=True,
-                clip=True)
-    """
-    helper = LayerHelper("prior_box", **locals())
-    dtype = helper.input_dtype()
-
-    def _is_list_or_tuple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
-
-    if not _is_list_or_tuple_(min_sizes):
-        min_sizes = [min_sizes]
-    if not _is_list_or_tuple_(aspect_ratios):
-        aspect_ratios = [aspect_ratios]
-    if not (_is_list_or_tuple_(steps) and len(steps) == 2):
-        raise ValueError('steps should be a list or tuple ',
-                         'with length 2, (step_width, step_height).')
-
-    min_sizes = list(map(float, min_sizes))
-    aspect_ratios = list(map(float, aspect_ratios))
-    steps = list(map(float, steps))
-
-    attrs = {
-        'min_sizes': min_sizes,
-        'aspect_ratios': aspect_ratios,
-        'variances': variance,
-        'flip': flip,
-        'clip': clip,
-        'step_w': steps[0],
-        'step_h': steps[1],
-        'offset': offset,
-        'min_max_aspect_ratios_order': min_max_aspect_ratios_order
-    }
-    if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
-        if not _is_list_or_tuple_(max_sizes):
-            max_sizes = [max_sizes]
-        attrs['max_sizes'] = max_sizes
-
-    box = helper.create_variable_for_type_inference(dtype)
-    var = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="prior_box",
-        inputs={"Input": input,
-                "Image": image},
-        outputs={"Boxes": box,
-                 "Variances": var},
-        attrs=attrs, )
-    box.stop_gradient = True
-    var.stop_gradient = True
-    return box, var
-
-
-def density_prior_box(input,
-                      image,
-                      densities=None,
-                      fixed_sizes=None,
-                      fixed_ratios=None,
-                      variance=[0.1, 0.1, 0.2, 0.2],
-                      clip=False,
-                      steps=[0.0, 0.0],
-                      offset=0.5,
-                      flatten_to_2d=False,
-                      name=None):
-    """
-    **Density Prior Box Operator**
-
-    Generate density prior boxes for SSD(Single Shot MultiBox Detector) 
-    algorithm. Each position of the input produce N prior boxes, N is 
-    determined by the count of densities, fixed_sizes and fixed_ratios. 
-    Boxes center at grid points around each input position is generated by 
-    this operator, and the grid points is determined by densities and 
-    the count of density prior box is determined by fixed_sizes and fixed_ratios. 
-    Obviously, the number of fixed_sizes is equal to the number of densities.
-    For densities_i in densities:
-    N_density_prior_box =sum(N_fixed_ratios * densities_i^2),
-
-    Args:
-       input(Variable): The Input Variables, the format is NCHW.
-       image(Variable): The input image data of PriorBoxOp,
-            the layout is NCHW.
-       densities(list|tuple|None): the densities of generated density prior 
-            boxes, this attribute should be a list or tuple of integers. 
-            Default: None.
-       fixed_sizes(list|tuple|None): the fixed sizes of generated density
-            prior boxes, this attribute should a list or tuple of same 
-            length with :attr:`densities`. Default: None.
-       fixed_ratios(list|tuple|None): the fixed ratios of generated density
-            prior boxes, if this attribute is not set and :attr:`densities`
-            and :attr:`fix_sizes` is set, :attr:`aspect_ratios` will be used
-            to generate density prior boxes.
-       variance(list|tuple): the variances to be encoded in density prior boxes.
-            Default:[0.1, 0.1, 0.2, 0.2].
-       clip(bool): Whether to clip out-of-boundary boxes. Default: False.
-       step(list|tuple): Prior boxes step across width and height, If
-            step[0] == 0.0/step[1] == 0.0, the density prior boxes step across
-            height/weight of the input will be automatically calculated.
-            Default: [0., 0.]
-       offset(float): Prior boxes center offset. Default: 0.5
-       flatten_to_2d(bool): Whether to flatten output prior boxes and variance
-           to 2D shape, the second dim is 4. Default: False.
-       name(str): Name of the density prior box op. Default: None.
-
-    Returns:
-        tuple: A tuple with two Variable (boxes, variances)
-
-        boxes: the output density prior boxes of PriorBox.
-            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
-            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
-            H is the height of input, W is the width of input,
-            num_priors is the total box count of each position of input.
-
-        variances: the expanded variances of PriorBox.
-            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
-            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
-            H is the height of input, W is the width of input
-            num_priors is the total box count of each position of input.
-
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[3,6,9])
-            images = fluid.layers.data(name="images", shape=[3,9,12])
-            box, var = fluid.layers.density_prior_box(
-                input=input,
-                image=images,
-                densities=[4, 2, 1],
-                fixed_sizes=[32.0, 64.0, 128.0],
-                fixed_ratios=[1.],
-                clip=True,
-                flatten_to_2d=True)
-    """
-    helper = LayerHelper("density_prior_box", **locals())
-    dtype = helper.input_dtype()
-
-    def _is_list_or_tuple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
-
-    if not _is_list_or_tuple_(densities):
-        raise TypeError('densities should be a list or a tuple or None.')
-    if not _is_list_or_tuple_(fixed_sizes):
-        raise TypeError('fixed_sizes should be a list or a tuple or None.')
-    if not _is_list_or_tuple_(fixed_ratios):
-        raise TypeError('fixed_ratios should be a list or a tuple or None.')
-    if len(densities) != len(fixed_sizes):
-        raise ValueError('densities and fixed_sizes length should be euqal.')
-    if not (_is_list_or_tuple_(steps) and len(steps) == 2):
-        raise ValueError('steps should be a list or tuple ',
-                         'with length 2, (step_width, step_height).')
-
-    densities = list(map(int, densities))
-    fixed_sizes = list(map(float, fixed_sizes))
-    fixed_ratios = list(map(float, fixed_ratios))
-    steps = list(map(float, steps))
-
-    attrs = {
-        'variances': variance,
-        'clip': clip,
-        'step_w': steps[0],
-        'step_h': steps[1],
-        'offset': offset,
-        'densities': densities,
-        'fixed_sizes': fixed_sizes,
-        'fixed_ratios': fixed_ratios,
-        'flatten_to_2d': flatten_to_2d,
-    }
-    box = helper.create_variable_for_type_inference(dtype)
-    var = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="density_prior_box",
-        inputs={"Input": input,
-                "Image": image},
-        outputs={"Boxes": box,
-                 "Variances": var},
-        attrs=attrs, )
-    box.stop_gradient = True
-    var.stop_gradient = True
-    return box, var
-
-
-def multi_box_head(inputs,
-                   image,
-                   base_size,
-                   num_classes,
-                   aspect_ratios,
-                   min_ratio=None,
-                   max_ratio=None,
-                   min_sizes=None,
-                   max_sizes=None,
-                   steps=None,
-                   step_w=None,
-                   step_h=None,
-                   offset=0.5,
-                   variance=[0.1, 0.1, 0.2, 0.2],
-                   flip=True,
-                   clip=False,
-                   kernel_size=1,
-                   pad=0,
-                   stride=1,
-                   name=None,
-                   min_max_aspect_ratios_order=False):
-    """
-    Generate prior boxes for SSD(Single Shot MultiBox Detector)
-    algorithm. The details of this algorithm, please refer the
-    section 2.2 of SSD paper `SSD: Single Shot MultiBox Detector
-    <https://arxiv.org/abs/1512.02325>`_ .
-
-    Args:
-       inputs(list|tuple): The list of input Variables, the format
-            of all Variables is NCHW.
-       image(Variable): The input image data of PriorBoxOp,
-            the layout is NCHW.
-       base_size(int): the base_size is used to get min_size
-            and max_size according to min_ratio and max_ratio.
-       num_classes(int): The number of classes.
-       aspect_ratios(list|tuple): the aspect ratios of generated prior
-            boxes. The length of input and aspect_ratios must be equal.
-       min_ratio(int): the min ratio of generated prior boxes.
-       max_ratio(int): the max ratio of generated prior boxes.
-       min_sizes(list|tuple|None): If `len(inputs) <=2`,
-            min_sizes must be set up, and the length of min_sizes
-            should equal to the length of inputs. Default: None.
-       max_sizes(list|tuple|None): If `len(inputs) <=2`,
-            max_sizes must be set up, and the length of min_sizes
-            should equal to the length of inputs. Default: None.
-       steps(list|tuple): If step_w and step_h are the same,
-            step_w and step_h can be replaced by steps.
-       step_w(list|tuple): Prior boxes step
-            across width. If step_w[i] == 0.0, the prior boxes step
-            across width of the inputs[i] will be automatically
-            calculated. Default: None.
-       step_h(list|tuple): Prior boxes step across height, If
-            step_h[i] == 0.0, the prior boxes step across height of
-            the inputs[i] will be automatically calculated. Default: None.
-       offset(float): Prior boxes center offset. Default: 0.5
-       variance(list|tuple): the variances to be encoded in prior boxes.
-            Default:[0.1, 0.1, 0.2, 0.2].
-       flip(bool): Whether to flip aspect ratios. Default:False.
-       clip(bool): Whether to clip out-of-boundary boxes. Default: False.
-       kernel_size(int): The kernel size of conv2d. Default: 1.
-       pad(int|list|tuple): The padding of conv2d. Default:0.
-       stride(int|list|tuple): The stride of conv2d. Default:1,
-       name(str): Name of the prior box layer. Default: None.
-       min_max_aspect_ratios_order(bool): If set True, the output prior box is
-            in order of [min, max, aspect_ratios], which is consistent with
-            Caffe. Please note, this order affects the weights order of
-            convolution layer followed by and does not affect the fininal
-            detection results. Default: False.
-
-    Returns:
-        tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances)
-
-        mbox_loc: The predicted boxes' location of the inputs. The layout
-        is [N, H*W*Priors, 4]. where Priors is the number of predicted
-        boxes each position of each input.
-
-        mbox_conf: The predicted boxes' confidence of the inputs. The layout
-        is [N, H*W*Priors, C]. where Priors is the number of predicted boxes
-        each position of each input and C is the number of Classes.
-
-        boxes: the output prior boxes of PriorBox. The layout is [num_priors, 4].
-        num_priors is the total box count of each position of inputs.
-
-        variances: the expanded variances of PriorBox. The layout is
-        [num_priors, 4]. num_priors is the total box count of each position of inputs
-
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          images = fluid.layers.data(name='data', shape=[3, 300, 300], dtype='float32')
-          conv1 = fluid.layers.data(name='conv1', shape=[512, 19, 19], dtype='float32')
-          conv2 = fluid.layers.data(name='conv2', shape=[1024, 10, 10], dtype='float32')
-          conv3 = fluid.layers.data(name='conv3', shape=[512, 5, 5], dtype='float32')
-          conv4 = fluid.layers.data(name='conv4', shape=[256, 3, 3], dtype='float32')
-          conv5 = fluid.layers.data(name='conv5', shape=[256, 2, 2], dtype='float32')
-          conv6 = fluid.layers.data(name='conv6', shape=[128, 1, 1], dtype='float32')
-
-          mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head(
-            inputs=[conv1, conv2, conv3, conv4, conv5, conv6],
-            image=images,
-            num_classes=21,
-            min_ratio=20,
-            max_ratio=90,
-            aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]],
-            base_size=300,
-            offset=0.5,
-            flip=True,
-            clip=True)
-    """
-
-    def _reshape_with_axis_(input, axis=1):
-        out = nn.flatten(x=input, axis=axis)
-        return out
-
-    def _is_list_or_tuple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
-
-    def _is_list_or_tuple_and_equal(data, length, err_info):
-        if not (_is_list_or_tuple_(data) and len(data) == length):
-            raise ValueError(err_info)
-
-    if not _is_list_or_tuple_(inputs):
-        raise ValueError('inputs should be a list or tuple.')
-
-    num_layer = len(inputs)
-
-    if num_layer <= 2:
-        assert min_sizes is not None and max_sizes is not None
-        assert len(min_sizes) == num_layer and len(max_sizes) == num_layer
-    elif min_sizes is None and max_sizes is None:
-        min_sizes = []
-        max_sizes = []
-        step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
-        for ratio in six.moves.range(min_ratio, max_ratio + 1, step):
-            min_sizes.append(base_size * ratio / 100.)
-            max_sizes.append(base_size * (ratio + step) / 100.)
-        min_sizes = [base_size * .10] + min_sizes
-        max_sizes = [base_size * .20] + max_sizes
-
-    if aspect_ratios:
-        _is_list_or_tuple_and_equal(
-            aspect_ratios, num_layer,
-            'aspect_ratios should be list or tuple, and the length of inputs '
-            'and aspect_ratios should be the same.')
-    if step_h:
-        _is_list_or_tuple_and_equal(
-            step_h, num_layer,
-            'step_h should be list or tuple, and the length of inputs and '
-            'step_h should be the same.')
-    if step_w:
-        _is_list_or_tuple_and_equal(
-            step_w, num_layer,
-            'step_w should be list or tuple, and the length of inputs and '
-            'step_w should be the same.')
-    if steps:
-        _is_list_or_tuple_and_equal(
-            steps, num_layer,
-            'steps should be list or tuple, and the length of inputs and '
-            'step_w should be the same.')
-        step_w = steps
-        step_h = steps
-
-    mbox_locs = []
-    mbox_confs = []
-    box_results = []
-    var_results = []
-    for i, input in enumerate(inputs):
-        min_size = min_sizes[i]
-        max_size = max_sizes[i]
-
-        if not _is_list_or_tuple_(min_size):
-            min_size = [min_size]
-        if not _is_list_or_tuple_(max_size):
-            max_size = [max_size]
-
-        aspect_ratio = []
-        if aspect_ratios is not None:
-            aspect_ratio = aspect_ratios[i]
-            if not _is_list_or_tuple_(aspect_ratio):
-                aspect_ratio = [aspect_ratio]
-        step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0]
-
-        box, var = prior_box(input, image, min_size, max_size, aspect_ratio,
-                             variance, flip, clip, step, offset, None,
-                             min_max_aspect_ratios_order)
-
-        box_results.append(box)
-        var_results.append(var)
-
-        num_boxes = box.shape[2]
-
-        # get loc
-        num_loc_output = num_boxes * 4
-        mbox_loc = nn.conv2d(
-            input=input,
-            num_filters=num_loc_output,
-            filter_size=kernel_size,
-            padding=pad,
-            stride=stride)
-
-        mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
-        mbox_loc_flatten = nn.flatten(mbox_loc, axis=1)
-        mbox_locs.append(mbox_loc_flatten)
-
-        # get conf
-        num_conf_output = num_boxes * num_classes
-        conf_loc = nn.conv2d(
-            input=input,
-            num_filters=num_conf_output,
-            filter_size=kernel_size,
-            padding=pad,
-            stride=stride)
-        conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
-        conf_loc_flatten = nn.flatten(conf_loc, axis=1)
-        mbox_confs.append(conf_loc_flatten)
-
-    if len(box_results) == 1:
-        box = box_results[0]
-        var = var_results[0]
-        mbox_locs_concat = mbox_locs[0]
-        mbox_confs_concat = mbox_confs[0]
-    else:
-        reshaped_boxes = []
-        reshaped_vars = []
-        for i in range(len(box_results)):
-            reshaped_boxes.append(_reshape_with_axis_(box_results[i], axis=3))
-            reshaped_vars.append(_reshape_with_axis_(var_results[i], axis=3))
-
-        box = tensor.concat(reshaped_boxes)
-        var = tensor.concat(reshaped_vars)
-        mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
-        mbox_locs_concat = nn.reshape(mbox_locs_concat, shape=[0, -1, 4])
-        mbox_confs_concat = tensor.concat(mbox_confs, axis=1)
-        mbox_confs_concat = nn.reshape(
-            mbox_confs_concat, shape=[0, -1, num_classes])
-
-    box.stop_gradient = True
-    var.stop_gradient = True
-    return mbox_locs_concat, mbox_confs_concat, box, var
-
-
-def anchor_generator(input,
-                     anchor_sizes=None,
-                     aspect_ratios=None,
-                     variance=[0.1, 0.1, 0.2, 0.2],
-                     stride=None,
-                     offset=0.5,
-                     name=None):
-    """
-    **Anchor generator operator**
-
-    Generate anchors for Faster RCNN algorithm.
-    Each position of the input produce N anchors, N =
-    size(anchor_sizes) * size(aspect_ratios). The order of generated anchors
-    is firstly aspect_ratios loop then anchor_sizes loop.
-
-    Args:
-       input(Variable): The input feature map, the format is NCHW.
-       anchor_sizes(list|tuple|float): The anchor sizes of generated anchors,
-                                       given in absolute pixels e.g. [64., 128., 256., 512.].
-                                       For instance, the anchor size of 64 means the area of this anchor equals to 64**2.
-       aspect_ratios(list|tuple|float): The height / width ratios of generated
-                                        anchors, e.g. [0.5, 1.0, 2.0].
-       variance(list|tuple): The variances to be used in box regression deltas.
-                             Default:[0.1, 0.1, 0.2, 0.2].
-       stride(list|tuple): The anchors stride across width and height,e.g. [16.0, 16.0]
-       offset(float): Prior boxes center offset. Default: 0.5
-       name(str): Name of the prior box op. Default: None.
-
-    Returns:
-        Anchors(Variable),Variances(Variable):  
-        
-              two variables:
-        
-              - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \
-                H is the height of input, W is the width of input, \
-                num_anchors is the box count of each position.  \
-                Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. 
-              - Variances(Variable): The expanded variances of anchors \
-                with a layout of [H, W, num_priors, 4]. \
-                H is the height of input, W is the width of input \
-                num_anchors is the box count of each position. \
-                Each variance is in (xcenter, ycenter, w, h) format.
-
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            conv1 = fluid.layers.data(name='conv1', shape=[48, 16, 16], dtype='float32')
-            anchor, var = fluid.layers.anchor_generator(
-                input=conv1,
-                anchor_sizes=[64, 128, 256, 512],
-                aspect_ratios=[0.5, 1.0, 2.0],
-                variance=[0.1, 0.1, 0.2, 0.2],
-                stride=[16.0, 16.0],
-                offset=0.5)
-    """
-    helper = LayerHelper("anchor_generator", **locals())
-    dtype = helper.input_dtype()
-
-    def _is_list_or_tuple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
-
-    if not _is_list_or_tuple_(anchor_sizes):
-        anchor_sizes = [anchor_sizes]
-    if not _is_list_or_tuple_(aspect_ratios):
-        aspect_ratios = [aspect_ratios]
-    if not (_is_list_or_tuple_(stride) and len(stride) == 2):
-        raise ValueError('stride should be a list or tuple ',
-                         'with length 2, (stride_width, stride_height).')
-
-    anchor_sizes = list(map(float, anchor_sizes))
-    aspect_ratios = list(map(float, aspect_ratios))
-    stride = list(map(float, stride))
-
-    attrs = {
-        'anchor_sizes': anchor_sizes,
-        'aspect_ratios': aspect_ratios,
-        'variances': variance,
-        'stride': stride,
-        'offset': offset
-    }
-
-    anchor = helper.create_variable_for_type_inference(dtype)
-    var = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="anchor_generator",
-        inputs={"Input": input},
-        outputs={"Anchors": anchor,
-                 "Variances": var},
-        attrs=attrs, )
-    anchor.stop_gradient = True
-    var.stop_gradient = True
-    return anchor, var
-
-
-def roi_perspective_transform(input,
-                              rois,
-                              transformed_height,
-                              transformed_width,
-                              spatial_scale=1.0):
-    """
-    ROI perspective transform op.
-
-    Args:
-        input (Variable): The input of ROIPerspectiveTransformOp. The format of 
-                          input tensor is NCHW. Where N is batch size, C is the
-                          number of input channels, H is the height of the feature,
-                          and W is the width of the feature.
-        rois (Variable):  ROIs (Regions of Interest) to be transformed. It should be
-                          a 2-D LoDTensor of shape (num_rois, 8). Given as 
-                          [[x1, y1, x2, y2, x3, y3, x4, y4], ...], (x1, y1) is the 
-                          top left coordinates, and (x2, y2) is the top right 
-                          coordinates, and (x3, y3) is the bottom right coordinates, 
-                          and (x4, y4) is the bottom left coordinates.
-        transformed_height (integer): The height of transformed output.
-        transformed_width (integer): The width of transformed output.
-        spatial_scale (float): Spatial scale factor to scale ROI coords. Default: 1.0
-
-    Returns:
-            tuple: A tuple with three Variables. (out, mask, transform_matrix)
-
-            out: The output of ROIPerspectiveTransformOp which is a 4-D tensor with shape
-            (num_rois, channels, transformed_h, transformed_w).
-
-            mask: The mask of ROIPerspectiveTransformOp which is a 4-D tensor with shape
-            (num_rois, 1, transformed_h, transformed_w).
-
-            transform_matrix: The transform matrix of ROIPerspectiveTransformOp which is
-            a 2-D tensor with shape (num_rois, 9).
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            x = fluid.layers.data(name='x', shape=[256, 28, 28], dtype='float32')
-            rois = fluid.layers.data(name='rois', shape=[8], lod_level=1, dtype='float32')
-            out, mask, transform_matrix = fluid.layers.roi_perspective_transform(x, rois, 7, 7, 1.0)
-    """
-    helper = LayerHelper('roi_perspective_transform', **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    mask = helper.create_variable_for_type_inference(dtype="int32")
-    transform_matrix = helper.create_variable_for_type_inference(dtype)
-    out2in_idx = helper.create_variable_for_type_inference(dtype="int32")
-    out2in_w = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="roi_perspective_transform",
-        inputs={"X": input,
-                "ROIs": rois},
-        outputs={
-            "Out": out,
-            "Out2InIdx": out2in_idx,
-            "Out2InWeights": out2in_w,
-            "Mask": mask,
-            "TransformMatrix": transform_matrix
-        },
-        attrs={
-            "transformed_height": transformed_height,
-            "transformed_width": transformed_width,
-            "spatial_scale": spatial_scale
-        })
-    return out, mask, transform_matrix
-
-
-def generate_proposal_labels(rpn_rois,
-                             gt_classes,
-                             is_crowd,
-                             gt_boxes,
-                             im_info,
-                             batch_size_per_im=256,
-                             fg_fraction=0.25,
-                             fg_thresh=0.25,
-                             bg_thresh_hi=0.5,
-                             bg_thresh_lo=0.0,
-                             bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
-                             class_nums=None,
-                             use_random=True,
-                             is_cls_agnostic=False,
-                             is_cascade_rcnn=False):
-    """
-
-    ** Generate Proposal Labels of Faster-RCNN **
-
-    This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth,
-    to sample foreground boxes and background boxes, and compute loss target.
-
-    RpnRois is the output boxes of RPN and was processed by generate_proposal_op, these boxes
-    were combined with groundtruth boxes and sampled according to batch_size_per_im and fg_fraction,
-    If an instance with a groundtruth overlap greater than fg_thresh, then it was considered as a foreground sample.
-    If an instance with a groundtruth overlap greater than bg_thresh_lo and lower than bg_thresh_hi,
-    then it was considered as a background sample.
-    After all foreground and background boxes are chosen (so called Rois),
-    then we apply random sampling to make sure
-    the number of foreground boxes is no more than batch_size_per_im * fg_fraction.
-
-    For each box in Rois, we assign the classification (class label) and regression targets (box label) to it.
-    Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss.
-
-    Args:
-        rpn_rois(Variable): A 2-D LoDTensor with shape [N, 4]. N is the number of the GenerateProposalOp's output, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
-        gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a class label of groundtruth.
-        is_crowd(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a flag indicates whether a groundtruth is crowd.
-        gt_boxes(Variable): A 2-D LoDTensor with shape [M, 4]. M is the number of groundtruth, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
-        im_info(Variable): A 2-D LoDTensor with shape [B, 3]. B is the number of input images, each element consists of im_height, im_width, im_scale.
-
-        batch_size_per_im(int): Batch size of rois per images.
-        fg_fraction(float): Foreground fraction in total batch_size_per_im.
-        fg_thresh(float): Overlap threshold which is used to chose foreground sample.
-        bg_thresh_hi(float): Overlap threshold upper bound which is used to chose background sample.
-        bg_thresh_lo(float): Overlap threshold lower bound which is used to chose background sample.
-        bbox_reg_weights(list|tuple): Box regression weights.
-        class_nums(int): Class number.
-        use_random(bool): Use random sampling to choose foreground and background boxes.
-        is_cls_agnostic(bool): bbox regression use class agnostic simply which only represent fg and bg boxes.
-        is_cascade_rcnn(bool): it will filter some bbox crossing the image's boundary when setting True.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            rpn_rois = fluid.layers.data(name='rpn_rois', shape=[2, 4],
-                           append_batch_size=False, dtype='float32')
-            gt_classes = fluid.layers.data(name='gt_classes', shape=[8, 1],
-                           append_batch_size=False, dtype='float32')
-            is_crowd = fluid.layers.data(name='is_crowd', shape=[8, 1],
-                           append_batch_size=False, dtype='float32')
-            gt_boxes = fluid.layers.data(name='gt_boxes', shape=[8, 4],
-                           append_batch_size=False, dtype='float32')
-            im_info = fluid.layers.data(name='im_info', shape=[10, 3],
-                           append_batch_size=False, dtype='float32')
-            rois, labels, bbox, inside_weights, outside_weights = fluid.layers.generate_proposal_labels(
-                           rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
-                           class_nums=10)
-
-    """
-
-    helper = LayerHelper('generate_proposal_labels', **locals())
-
-    rois = helper.create_variable_for_type_inference(dtype=rpn_rois.dtype)
-    labels_int32 = helper.create_variable_for_type_inference(
-        dtype=gt_classes.dtype)
-    bbox_targets = helper.create_variable_for_type_inference(
-        dtype=rpn_rois.dtype)
-    bbox_inside_weights = helper.create_variable_for_type_inference(
-        dtype=rpn_rois.dtype)
-    bbox_outside_weights = helper.create_variable_for_type_inference(
-        dtype=rpn_rois.dtype)
-
-    helper.append_op(
-        type="generate_proposal_labels",
-        inputs={
-            'RpnRois': rpn_rois,
-            'GtClasses': gt_classes,
-            'IsCrowd': is_crowd,
-            'GtBoxes': gt_boxes,
-            'ImInfo': im_info
-        },
-        outputs={
-            'Rois': rois,
-            'LabelsInt32': labels_int32,
-            'BboxTargets': bbox_targets,
-            'BboxInsideWeights': bbox_inside_weights,
-            'BboxOutsideWeights': bbox_outside_weights
-        },
-        attrs={
-            'batch_size_per_im': batch_size_per_im,
-            'fg_fraction': fg_fraction,
-            'fg_thresh': fg_thresh,
-            'bg_thresh_hi': bg_thresh_hi,
-            'bg_thresh_lo': bg_thresh_lo,
-            'bbox_reg_weights': bbox_reg_weights,
-            'class_nums': class_nums,
-            'use_random': use_random,
-            'is_cls_agnostic': is_cls_agnostic,
-            'is_cascade_rcnn': is_cascade_rcnn
-        })
-
-    rois.stop_gradient = True
-    labels_int32.stop_gradient = True
-    bbox_targets.stop_gradient = True
-    bbox_inside_weights.stop_gradient = True
-    bbox_outside_weights.stop_gradient = True
-
-    return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights
-
-
-def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois,
-                         labels_int32, num_classes, resolution):
-    """
-    ** Generate Mask Labels for Mask-RCNN **
-
-    This operator can be, for given the RoIs and corresponding labels,
-    to sample foreground RoIs. This mask branch also has
-    a :math: `K \\times M^{2}` dimensional output targets for each foreground
-    RoI, which encodes K binary masks of resolution M x M, one for each of the
-    K classes. This mask targets are used to compute loss of mask branch.
-
-    Please note, the data format of groud-truth segmentation, assumed the
-    segmentations are as follows. The first instance has two gt objects.
-    The second instance has one gt object, this object has two gt segmentations.
-
-        .. code-block:: python
-
-            #[
-            #  [[[229.14, 370.9, 229.14, 370.9, ...]],
-            #   [[343.7, 139.85, 349.01, 138.46, ...]]], # 0-th instance
-            #  [[[500.0, 390.62, ...],[115.48, 187.86, ...]]] # 1-th instance
-            #]
-
-            batch_masks = []
-            for semgs in batch_semgs:
-                gt_masks = []
-                for semg in semgs:
-                    gt_segm = []
-                    for polys in semg:
-                        gt_segm.append(np.array(polys).reshape(-1, 2))
-                    gt_masks.append(gt_segm)
-                batch_masks.append(gt_masks)
-            
-            
-            place = fluid.CPUPlace()
-            feeder = fluid.DataFeeder(place=place, feed_list=feeds)
-            feeder.feed(batch_masks)
-
-    Args:
-        im_info(Variable): A 2-D Tensor with shape [N, 3]. N is the batch size,
-            each element is [height, width, scale] of image. Image scale is
-            target_size) / original_size.
-        gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the total
-            number of ground-truth, each element is a class label.
-        is_crowd(Variable): A 2-D LoDTensor with shape as gt_classes,
-            each element is a flag indicating whether a groundtruth is crowd.
-        gt_segms(Variable): This input is a 2D LoDTensor with shape [S, 2],
-            it's LoD level is 3. Usually users do not needs to understand LoD,
-            The users should return correct data format in reader.
-
-
-
-            The LoD[0] represents the gt objects number of
-            each instance. LoD[1] represents the segmentation counts of each
-            objects. LoD[2] represents the polygons number of each segmentation.
-            S the total number of polygons coordinate points. Each element is
-            (x, y) coordinate points.
-        rois(Variable): A 2-D LoDTensor with shape [R, 4]. R is the total
-            number of RoIs, each element is a bounding box with
-            (xmin, ymin, xmax, ymax) format in the range of original image.
-        labels_int32(Variable): A 2-D LoDTensor in shape of [R, 1] with type
-            of int32. R is the same as it in `rois`. Each element repersents
-            a class label of a RoI.
-        num_classes(int): Class number.
-        resolution(int): Resolution of mask predictions.
-
-    Returns:
-        mask_rois (Variable):  A 2D LoDTensor with shape [P, 4]. P is the total
-            number of sampled RoIs. Each element is a bounding box with
-            [xmin, ymin, xmax, ymax] format in range of orignal image size.
-        mask_rois_has_mask_int32 (Variable): A 2D LoDTensor with shape [P, 1],
-            each element repersents the output mask RoI index with regard to
-            to input RoIs.
-        mask_int32 (Variable): A 2D LoDTensor with shape [P, K * M * M],
-            K is the classes number and M is the resolution of mask predictions.
-            Each element repersents the binary mask targets.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          im_info = fluid.layers.data(name="im_info", shape=[3],
-              dtype="float32")
-          gt_classes = fluid.layers.data(name="gt_classes", shape=[1],
-              dtype="float32", lod_level=1)
-          is_crowd = fluid.layers.data(name="is_crowd", shape=[1],
-              dtype="float32", lod_level=1)
-          gt_masks = fluid.layers.data(name="gt_masks", shape=[2],
-              dtype="float32", lod_level=3)
-          # rois, roi_labels can be the output of
-          # fluid.layers.generate_proposal_labels.
-          rois = fluid.layers.data(name="rois", shape=[4],
-              dtype="float32", lod_level=1)
-          roi_labels = fluid.layers.data(name="roi_labels", shape=[1],
-              dtype="int32", lod_level=1)
-          mask_rois, mask_index, mask_int32 = fluid.layers.generate_mask_labels(
-              im_info=im_info,
-              gt_classes=gt_classes,
-              is_crowd=is_crowd,
-              gt_segms=gt_masks,
-              rois=rois,
-              labels_int32=roi_labels,
-              num_classes=81,
-              resolution=14)
-    """
-
-    helper = LayerHelper('generate_mask_labels', **locals())
-
-    mask_rois = helper.create_variable_for_type_inference(dtype=rois.dtype)
-    roi_has_mask_int32 = helper.create_variable_for_type_inference(
-        dtype=gt_classes.dtype)
-    mask_int32 = helper.create_variable_for_type_inference(
-        dtype=gt_classes.dtype)
-
-    helper.append_op(
-        type="generate_mask_labels",
-        inputs={
-            'ImInfo': im_info,
-            'GtClasses': gt_classes,
-            'IsCrowd': is_crowd,
-            'GtSegms': gt_segms,
-            'Rois': rois,
-            'LabelsInt32': labels_int32
-        },
-        outputs={
-            'MaskRois': mask_rois,
-            'RoiHasMaskInt32': roi_has_mask_int32,
-            'MaskInt32': mask_int32
-        },
-        attrs={'num_classes': num_classes,
-               'resolution': resolution})
-
-    mask_rois.stop_gradient = True
-    roi_has_mask_int32.stop_gradient = True
-    mask_int32.stop_gradient = True
-
-    return mask_rois, roi_has_mask_int32, mask_int32
-
-
-def generate_proposals(scores,
-                       bbox_deltas,
-                       im_info,
-                       anchors,
-                       variances,
-                       pre_nms_top_n=6000,
-                       post_nms_top_n=1000,
-                       nms_thresh=0.5,
-                       min_size=0.1,
-                       eta=1.0,
-                       name=None):
-    """
-    **Generate proposal Faster-RCNN**
-
-    This operation proposes RoIs according to each box with their
-    probability to be a foreground object and 
-    the box can be calculated by anchors. Bbox_deltais and scores
-    to be an object are the output of RPN. Final proposals
-    could be used to train detection net.
-
-    For generating proposals, this operation performs following steps:
-
-    1. Transposes and resizes scores and bbox_deltas in size of
-       (H*W*A, 1) and (H*W*A, 4)
-    2. Calculate box locations as proposals candidates. 
-    3. Clip boxes to image
-    4. Remove predicted boxes with small area. 
-    5. Apply NMS to get final proposals as output.
-
-    Args:
-        scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents
-            the probability for each box to be an object.
-            N is batch size, A is number of anchors, H and W are height and
-            width of the feature map.
-        bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W]
-            represents the differece between predicted box locatoin and
-            anchor location.
-        im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin
-            image information for N batch. Info contains height, width and scale
-            between origin image size and the size of feature map.
-        anchors(Variable):   A 4-D Tensor represents the anchors with a layout
-            of [H, W, A, 4]. H and W are height and width of the feature map,
-            num_anchors is the box count of each position. Each anchor is
-            in (xmin, ymin, xmax, ymax) format an unnormalized.
-        variances(Variable): The expanded variances of anchors with a layout of
-            [H, W, num_priors, 4]. Each variance is in
-            (xcenter, ycenter, w, h) format.
-        pre_nms_top_n(float): Number of total bboxes to be kept per
-            image before NMS. 6000 by default.
-        post_nms_top_n(float): Number of total bboxes to be kept per
-            image after NMS. 1000 by default.
-        nms_thresh(float): Threshold in NMS, 0.5 by default.
-        min_size(float): Remove predicted boxes with either height or
-            width < min_size. 0.1 by default.
-        eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5,
-            adaptive_threshold = adaptive_threshold * eta in each iteration.
-
-    Examples:
-        .. code-block:: python
-        
-            import paddle.fluid as fluid
-            scores = fluid.layers.data(name='scores', shape=[2, 4, 5, 5],
-                         append_batch_size=False, dtype='float32')
-            bbox_deltas = fluid.layers.data(name='bbox_deltas', shape=[2, 16, 5, 5],
-                         append_batch_size=False, dtype='float32')
-            im_info = fluid.layers.data(name='im_info', shape=[2, 3],
-                         append_batch_size=False, dtype='float32')
-            anchors = fluid.layers.data(name='anchors', shape=[5, 5, 4, 4],
-                         append_batch_size=False, dtype='float32')
-            variances = fluid.layers.data(name='variances', shape=[5, 5, 10, 4],
-                         append_batch_size=False, dtype='float32')
-            rois, roi_probs = fluid.layers.generate_proposals(scores, bbox_deltas,
-                         im_info, anchors, variances)
-
-    """
-    helper = LayerHelper('generate_proposals', **locals())
-
-    rpn_rois = helper.create_variable_for_type_inference(
-        dtype=bbox_deltas.dtype)
-    rpn_roi_probs = helper.create_variable_for_type_inference(
-        dtype=scores.dtype)
-    helper.append_op(
-        type="generate_proposals",
-        inputs={
-            'Scores': scores,
-            'BboxDeltas': bbox_deltas,
-            'ImInfo': im_info,
-            'Anchors': anchors,
-            'Variances': variances
-        },
-        attrs={
-            'pre_nms_topN': pre_nms_top_n,
-            'post_nms_topN': post_nms_top_n,
-            'nms_thresh': nms_thresh,
-            'min_size': min_size,
-            'eta': eta
-        },
-        outputs={'RpnRois': rpn_rois,
-                 'RpnRoiProbs': rpn_roi_probs})
-    rpn_rois.stop_gradient = True
-    rpn_roi_probs.stop_gradient = True
-
-    return rpn_rois, rpn_roi_probs
-
-
-def box_clip(input, im_info, name=None):
-    """
-    Clip the box into the size given by im_info
-    For each input box, The formula is given as follows:
-        
-    .. code-block:: text
-
-        xmin = max(min(xmin, im_w - 1), 0)
-        ymin = max(min(ymin, im_h - 1), 0) 
-        xmax = max(min(xmax, im_w - 1), 0)
-        ymax = max(min(ymax, im_h - 1), 0)
-    
-    where im_w and im_h are computed from im_info:
- 
-    .. code-block:: text
-
-        im_h = round(height / scale)
-        im_w = round(weight / scale)
-
-    Args:
-        input(variable): The input box, the last dimension is 4.
-        im_info(variable): The information of image with shape [N, 3] with 
-                            layout (height, width, scale). height and width
-                            is the input size and scale is the ratio of input
-                            size and original size.
-        name (str): The name of this layer. It is optional.
-    
-    Returns:
-        Variable: The cliped tensor variable.
-        
-    Examples:
-        .. code-block:: python
-        
-            import paddle.fluid as fluid
-            boxes = fluid.layers.data(
-                name='boxes', shape=[8, 4], dtype='float32', lod_level=1)
-            im_info = fluid.layers.data(name='im_info', shape=[3])
-            out = fluid.layers.box_clip(
-                input=boxes, im_info=im_info)
-    """
-
-    helper = LayerHelper("box_clip", **locals())
-    output = helper.create_variable_for_type_inference(dtype=input.dtype)
-    inputs = {"Input": input, "ImInfo": im_info}
-    helper.append_op(type="box_clip", inputs=inputs, outputs={"Output": output})
-
-    return output
-
-
-def retinanet_detection_output(bboxes,
-                               scores,
-                               anchors,
-                               im_info,
-                               score_threshold=0.05,
-                               nms_top_k=1000,
-                               keep_top_k=100,
-                               nms_threshold=0.3,
-                               nms_eta=1.):
-    """
-    **Detection Output Layer for Retinanet.**
-
-    This operation is to get the detection results by performing following
-    steps:
-
-    1. Decode top-scoring bounding box predictions per FPN level according 
-       to the anchor boxes.
-    2. Merge top predictions from all levels and apply multi-class non 
-       maximum suppression (NMS) on them to get the final detections.
-
-    Args:
-        bboxes(List): A list of tensors from multiple FPN levels. Each
-            element is a 3-D Tensor with shape [N, Mi, 4] representing the
-            predicted locations of Mi bounding boxes. N is the batch size,
-            Mi is the number of bounding boxes from i-th FPN level and each 
-            bounding box has four coordinate values and the layout is
-            [xmin, ymin, xmax, ymax].
-        scores(List): A list of tensors from multiple FPN levels. Each
-            element is a 3-D Tensor with shape [N, Mi, C] representing the
-            predicted confidence predictions. N is the batch size, C is the
-            class number (excluding background), Mi is the number of bounding
-            boxes from i-th FPN level. For each bounding box, there are total
-            C scores.
-        anchors(List): A 2-D Tensor with shape [Mi, 4] represents the locations
-            of Mi anchor boxes from all FPN level. Each bounding box has four
-            coordinate values and the layout is [xmin, ymin, xmax, ymax].
-        im_info(Variable): A 2-D LoDTensor with shape [N, 3] represents the
-            image information. N is the batch size, each image information
-            includes height, width and scale.
-        score_threshold(float): Threshold to filter out bounding boxes
-            with a confidence score.
-        nms_top_k(int): Maximum number of detections per FPN layer to be
-            kept according to the confidences before NMS.
-        keep_top_k(int): Number of total bounding boxes to be kept per image after
-            NMS step. -1 means keeping all bounding boxes after NMS step.
-        nms_threshold(float): The threshold to be used in NMS.
-        nms_eta(float): The parameter for adaptive NMS.
-
-    Returns:
-        Variable:
-            The detection output is a LoDTensor with shape [No, 6].
-            Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
-            `No` is the total number of detections in this mini-batch. For each
-            instance, the offsets in first dimension are called LoD, the offset
-            number is N + 1, N is the batch size. The i-th image has
-            `LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image
-            has no detected results. If all images have no detected results,
-            LoD will be set to 0, and the output tensor is empty (None).
-
-    Examples:
-        .. code-block:: python
-        
-            import paddle.fluid as fluid
-
-            bboxes = layers.data(name='bboxes', shape=[1, 21, 4],
-                append_batch_size=False, dtype='float32')
-            scores = layers.data(name='scores', shape=[1, 21, 10],
-                append_batch_size=False, dtype='float32')
-            anchors = layers.data(name='anchors', shape=[21, 4],
-                append_batch_size=False, dtype='float32')
-            im_info = layers.data(name="im_info", shape=[1, 3],
-                append_batch_size=False, dtype='float32')
-            nmsed_outs = fluid.layers.retinanet_detection_output(
-                                                    bboxes=[bboxes, bboxes],
-                                                    scores=[scores, scores],
-                                                    anchors=[anchors, anchors],
-                                                    im_info=im_info,
-                                                    score_threshold=0.05,
-                                                    nms_top_k=1000,
-                                                    keep_top_k=100,
-                                                    nms_threshold=0.3,
-                                                    nms_eta=1.)
-    """
-
-    helper = LayerHelper('retinanet_detection_output', **locals())
-    output = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype('scores'))
-    helper.append_op(
-        type="retinanet_detection_output",
-        inputs={
-            'BBoxes': bboxes,
-            'Scores': scores,
-            'Anchors': anchors,
-            'ImInfo': im_info
-        },
-        attrs={
-            'score_threshold': score_threshold,
-            'nms_top_k': nms_top_k,
-            'nms_threshold': nms_threshold,
-            'keep_top_k': keep_top_k,
-            'nms_eta': 1.,
-        },
-        outputs={'Out': output})
-    output.stop_gradient = True
-    return output
-
-
-def multiclass_nms(bboxes,
-                   scores,
-                   score_threshold,
-                   nms_top_k,
-                   keep_top_k,
-                   nms_threshold=0.3,
-                   normalized=True,
-                   nms_eta=1.,
-                   background_label=0,
-                   name=None):
-    """
-    **Multiclass NMS**
-    
-    This operator is to do multi-class non maximum suppression (NMS) on
-    boxes and scores.
-
-    In the NMS step, this operator greedily selects a subset of detection bounding
-    boxes that have high scores larger than score_threshold, if providing this
-    threshold, then selects the largest nms_top_k confidences scores if nms_top_k
-    is larger than -1. Then this operator pruns away boxes that have high IOU
-    (intersection over union) overlap with already selected boxes by adaptive
-    threshold NMS based on parameters of nms_threshold and nms_eta.
-    Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
-    per image if keep_top_k is larger than -1.
-
-    See below for an example:
-
-    .. code-block:: text
-
-        if:
-            box1.data = (2.0, 3.0, 7.0, 5.0) format is (xmin, ymin, xmax, ymax)
-            box1.scores = (0.7, 0.2, 0.4)  which is (label0.score=0.7, label1.score=0.2, label2.cores=0.4)
-
-            box2.data = (3.0, 4.0, 8.0, 5.0)
-            box2.score = (0.3, 0.3, 0.1)
-
-            nms_threshold = 0.3
-            background_label = 0
-            score_threshold = 0
-
-
-        Then:
-            iou = 4/11 > 0.3
-            out.data = [[1, 0.3, 3.0, 4.0, 8.0, 5.0],    
-                         [2, 0.4, 2.0, 3.0, 7.0, 5.0]]
-                         
-            Out format is (label, confidence, xmin, ymin, xmax, ymax)
-    Args:
-        bboxes (Variable): Two types of bboxes are supported:
-                           1. (Tensor) A 3-D Tensor with shape
-                           [N, M, 4 or 8 16 24 32] represents the
-                           predicted locations of M bounding bboxes,
-                           N is the batch size. Each bounding box has four
-                           coordinate values and the layout is 
-                           [xmin, ymin, xmax, ymax], when box size equals to 4.
-                           2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]
-                           M is the number of bounding boxes, C is the 
-                           class number   
-        scores (Variable): Two types of scores are supported:
-                           1. (Tensor) A 3-D Tensor with shape [N, C, M]
-                           represents the predicted confidence predictions.
-                           N is the batch size, C is the class number, M is 
-                           number of bounding boxes. For each category there 
-                           are total M scores which corresponding M bounding
-                           boxes. Please note, M is equal to the 2nd dimension
-                           of BBoxes.
-                           2. (LoDTensor) A 2-D LoDTensor with shape [M, C].
-                           M is the number of bbox, C is the class number.
-                           In this case, input BBoxes should be the second
-                           case with shape [M, C, 4].
-        background_label (int): The index of background label, the background 
-                                label will be ignored. If set to -1, then all
-                                categories will be considered. Default: 0
-        score_threshold (float): Threshold to filter out bounding boxes with
-                                 low confidence score. If not provided, 
-                                 consider all boxes.
-        nms_top_k (int): Maximum number of detections to be kept according to
-                         the confidences aftern the filtering detections based
-                         on score_threshold.
-        nms_threshold (float): The threshold to be used in NMS. Default: 0.3
-        nms_eta (float): The threshold to be used in NMS. Default: 1.0
-        keep_top_k (int): Number of total bboxes to be kept per image after NMS
-                          step. -1 means keeping all bboxes after NMS step.
-        normalized (bool): Whether detections are normalized. Default: True
-        name(str): Name of the multiclass nms op. Default: None.
-
-    Returns:
-        Out(Variable): A 2-D LoDTensor with shape [No, 6] represents the detections.
-             Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
-             or A 2-D LoDTensor with shape [No, 10] represents the detections.
-             Each row has 10 values: 
-             [label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the 
-             total number of detections. If there is no detected boxes for all
-             images, lod will be set to {1} and Out only contains one value
-             which is -1.
-             (After version 1.3, when no boxes detected, the lod is changed 
-             from {0} to {1}) 
-
-
-    Examples:
-        .. code-block:: python
-
-
-            import paddle.fluid as fluid
-            boxes = fluid.layers.data(name='bboxes', shape=[81, 4],
-                                      dtype='float32', lod_level=1)
-            scores = fluid.layers.data(name='scores', shape=[81],
-                                      dtype='float32', lod_level=1)
-            out = fluid.layers.multiclass_nms(bboxes=boxes,
-                                              scores=scores,
-                                              background_label=0,
-                                              score_threshold=0.5,
-                                              nms_top_k=400,
-                                              nms_threshold=0.3,
-                                              keep_top_k=200,
-                                              normalized=False)
-    """
-    helper = LayerHelper('multiclass_nms', **locals())
-
-    output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
-    helper.append_op(
-        type="multiclass_nms",
-        inputs={'BBoxes': bboxes,
-                'Scores': scores},
-        attrs={
-            'background_label': background_label,
-            'score_threshold': score_threshold,
-            'nms_top_k': nms_top_k,
-            'nms_threshold': nms_threshold,
-            'nms_eta': nms_eta,
-            'keep_top_k': keep_top_k,
-            'nms_eta': nms_eta,
-            'normalized': normalized
-        },
-        outputs={'Out': output})
-    output.stop_gradient = True
-
-    return output
-
-
-def multiclass_nms2(bboxes,
-                    scores,
-                    score_threshold,
-                    nms_top_k,
-                    keep_top_k,
-                    nms_threshold=0.3,
-                    normalized=True,
-                    nms_eta=1.,
-                    background_label=0,
-                    return_index=False,
-                    name=None):
-    """
-    **Multiclass NMS2**
-    
-    This operator is to do multi-class non maximum suppression (NMS) on
-    boxes and scores.
-
-    In the NMS step, this operator greedily selects a subset of detection bounding
-    boxes that have high scores larger than score_threshold, if providing this
-    threshold, then selects the largest nms_top_k confidences scores if nms_top_k
-    is larger than -1. Then this operator pruns away boxes that have high IOU
-    (intersection over union) overlap with already selected boxes by adaptive
-    threshold NMS based on parameters of nms_threshold and nms_eta.
-
-    Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
-    per image if keep_top_k is larger than -1.
-
-    Args:
-        bboxes (Variable): Two types of bboxes are supported:
-                           1. (Tensor) A 3-D Tensor with shape
-                           [N, M, 4 or 8 16 24 32] represents the
-                           predicted locations of M bounding bboxes,
-                           N is the batch size. Each bounding box has four
-                           coordinate values and the layout is 
-                           [xmin, ymin, xmax, ymax], when box size equals to 4.
-                           2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]
-                           M is the number of bounding boxes, C is the 
-                           class number   
-        scores (Variable): Two types of scores are supported:
-                           1. (Tensor) A 3-D Tensor with shape [N, C, M]
-                           represents the predicted confidence predictions.
-                           N is the batch size, C is the class number, M is 
-                           number of bounding boxes. For each category there 
-                           are total M scores which corresponding M bounding
-                           boxes. Please note, M is equal to the 2nd dimension
-                           of BBoxes.
-                           2. (LoDTensor) A 2-D LoDTensor with shape [M, C].
-                           M is the number of bbox, C is the class number.
-                           In this case, input BBoxes should be the second
-                           case with shape [M, C, 4].
-        background_label (int): The index of background label, the background 
-                                label will be ignored. If set to -1, then all
-                                categories will be considered. Default: 0
-        score_threshold (float): Threshold to filter out bounding boxes with
-                                 low confidence score. If not provided, 
-                                 consider all boxes.
-        nms_top_k (int): Maximum number of detections to be kept according to
-                         the confidences aftern the filtering detections based
-                         on score_threshold.
-        nms_threshold (float): The threshold to be used in NMS. Default: 0.3
-        nms_eta (float): The threshold to be used in NMS. Default: 1.0
-        keep_top_k (int): Number of total bboxes to be kept per image after NMS
-                          step. -1 means keeping all bboxes after NMS step.
-        normalized (bool): Whether detections are normalized. Default: True
-        return_index(bool): Whether return selected index. Default: False
-        name(str): Name of the multiclass nms op. Default: None.
-
-    Returns:
-        A tuple with two Variables: (Out, Index) if return_index is True,
-        otherwise, a tuple with one Variable(Out) is returned. 
-
-        Out: A 2-D LoDTensor with shape [No, 6] represents the detections. 
-        Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] 
-        or A 2-D LoDTensor with shape [No, 10] represents the detections. 
-        Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3, 
-        x4, y4]. No is the total number of detections. 
-
-        If all images have not detected results, all elements in LoD will be
-        0, and output tensor is empty (None).
-
-        Index: Only return when return_index is True. A 2-D LoDTensor with 
-        shape [No, 1] represents the selected index which type is Integer. 
-        The index is the absolute value cross batches. No is the same number 
-        as Out. If the index is used to gather other attribute such as age, 
-        one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where 
-        N is the batch size and M is the number of boxes.
-
-
-    Examples:
-        .. code-block:: python
-
-
-            import paddle.fluid as fluid
-            boxes = fluid.layers.data(name='bboxes', shape=[81, 4],
-                                      dtype='float32', lod_level=1)
-            scores = fluid.layers.data(name='scores', shape=[81],
-                                      dtype='float32', lod_level=1)
-            out, index = fluid.layers.multiclass_nms2(bboxes=boxes,
-                                              scores=scores,
-                                              background_label=0,
-                                              score_threshold=0.5,
-                                              nms_top_k=400,
-                                              nms_threshold=0.3,
-                                              keep_top_k=200,
-                                              normalized=False,
-                                              return_index=True)
-    """
-    helper = LayerHelper('multiclass_nms2', **locals())
-
-    output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
-    index = helper.create_variable_for_type_inference(dtype='int')
-    helper.append_op(
-        type="multiclass_nms2",
-        inputs={'BBoxes': bboxes,
-                'Scores': scores},
-        attrs={
-            'background_label': background_label,
-            'score_threshold': score_threshold,
-            'nms_top_k': nms_top_k,
-            'nms_threshold': nms_threshold,
-            'nms_eta': nms_eta,
-            'keep_top_k': keep_top_k,
-            'nms_eta': nms_eta,
-            'normalized': normalized
-        },
-        outputs={'Out': output,
-                 'Index': index})
-    output.stop_gradient = True
-    index.stop_gradient = True
-
-    if return_index:
-        return output, index
-    return output
-
-
-def distribute_fpn_proposals(fpn_rois,
-                             min_level,
-                             max_level,
-                             refer_level,
-                             refer_scale,
-                             name=None):
-    """
-    In Feature Pyramid Networks (FPN) models, it is needed to distribute all 
-    proposals into different FPN level, with respect to scale of the proposals,
-    the referring scale and the referring level. Besides, to restore the order
-    of proposals, we return an array which indicates the original index of rois
-    in current proposals. To compute FPN level for each roi, the formula is 
-    given as follows:
-    
-    .. math::
-
-        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}
-
-        level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level)
-
-    where BBoxArea is a function to compute the area of each roi.
-
-    Args:
-        fpn_rois(variable): The input fpn_rois, the second dimension is 4.
-        min_level(int): The lowest level of FPN layer where the proposals come 
-                        from.
-        max_level(int): The highest level of FPN layer where the proposals
-                        come from.
-        refer_level(int): The referring level of FPN layer with specified scale.
-        refer_scale(int): The referring scale of FPN layer with specified level.
-        name(str|None): The name of this operator.        
-
-    Returns:
-        tuple: 
-               A tuple(multi_rois, restore_ind) is returned. The multi_rois is 
-               a list of segmented tensor variables. The restore_ind is a 2D 
-               Tensor with shape [N, 1], N is the number of total rois. It is
-               used to restore the order of fpn_rois.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            fpn_rois = fluid.layers.data(
-                name='data', shape=[4], dtype='float32', lod_level=1)
-            multi_rois, restore_ind = fluid.layers.distribute_fpn_proposals(
-                fpn_rois=fpn_rois,
-                min_level=2,
-                max_level=5,
-                refer_level=4,
-                refer_scale=224)
-    """
-
-    helper = LayerHelper('distribute_fpn_proposals', **locals())
-    dtype = helper.input_dtype('fpn_rois')
-    num_lvl = max_level - min_level + 1
-    multi_rois = [
-        helper.create_variable_for_type_inference(dtype) for i in range(num_lvl)
-    ]
-    restore_ind = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type='distribute_fpn_proposals',
-        inputs={'FpnRois': fpn_rois},
-        outputs={'MultiFpnRois': multi_rois,
-                 'RestoreIndex': restore_ind},
-        attrs={
-            'min_level': min_level,
-            'max_level': max_level,
-            'refer_level': refer_level,
-            'refer_scale': refer_scale
-        })
-    return multi_rois, restore_ind
-
-
-@templatedoc()
-def box_decoder_and_assign(prior_box,
-                           prior_box_var,
-                           target_box,
-                           box_score,
-                           box_clip,
-                           name=None):
-    """
-    ${comment}
-    Args:
-        prior_box(${prior_box_type}): ${prior_box_comment}
-        prior_box_var(${prior_box_var_type}): ${prior_box_var_comment}
-        target_box(${target_box_type}): ${target_box_comment}
-        box_score(${box_score_type}): ${box_score_comment}
-        box_clip(${box_clip_type}): ${box_clip_comment}
-        name(str|None): The name of this operator
-    Returns:
-        decode_box(Variable), output_assign_box(Variable):
-
-            two variables:
-
-            - decode_box(${decode_box_type}): ${decode_box_comment}
-            - output_assign_box(${output_assign_box_type}): ${output_assign_box_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            pb = fluid.layers.data(
-                name='prior_box', shape=[4], dtype='float32')
-            pbv = fluid.layers.data(
-                name='prior_box_var', shape=[4], 
-                dtype='float32', append_batch_size=False)
-            loc = fluid.layers.data(
-                name='target_box', shape=[4*81], dtype='float32')
-            scores = fluid.layers.data(
-                name='scores', shape=[81], dtype='float32')
-            decoded_box, output_assign_box = fluid.layers.box_decoder_and_assign(
-                pb, pbv, loc, scores, 4.135)
-
-    """
-    helper = LayerHelper("box_decoder_and_assign", **locals())
-
-    decoded_box = helper.create_variable_for_type_inference(
-        dtype=prior_box.dtype)
-    output_assign_box = helper.create_variable_for_type_inference(
-        dtype=prior_box.dtype)
-
-    helper.append_op(
-        type="box_decoder_and_assign",
-        inputs={
-            "PriorBox": prior_box,
-            "PriorBoxVar": prior_box_var,
-            "TargetBox": target_box,
-            "BoxScore": box_score
-        },
-        attrs={"box_clip": box_clip},
-        outputs={
-            "DecodeBox": decoded_box,
-            "OutputAssignBox": output_assign_box
-        })
-    return decoded_box, output_assign_box
-
-
-def collect_fpn_proposals(multi_rois,
-                          multi_scores,
-                          min_level,
-                          max_level,
-                          post_nms_top_n,
-                          name=None):
-    """
-    Concat multi-level RoIs (Region of Interest) and select N RoIs 
-    with respect to multi_scores. This operation performs the following steps:
-
-    1. Choose num_level RoIs and scores as input: num_level = max_level - min_level
-    2. Concat multi-level RoIs and scores
-    3. Sort scores and select post_nms_top_n scores
-    4. Gather RoIs by selected indices from scores
-    5. Re-sort RoIs by corresponding batch_id
-
-    Args:
-        multi_ros(list): List of RoIs to collect
-        multi_scores(list): List of scores
-        min_level(int): The lowest level of FPN layer to collect
-        max_level(int): The highest level of FPN layer to collect
-        post_nms_top_n(int): The number of selected RoIs
-        name(str|None): A name for this layer(optional)
-        
-    Returns:
-        Variable: Output variable of selected RoIs. 
-
-    Examples:
-        .. code-block:: python
-           
-            import paddle.fluid as fluid
-            multi_rois = []
-            multi_scores = []
-            for i in range(4):
-                multi_rois.append(fluid.layers.data(
-                    name='roi_'+str(i), shape=[4], dtype='float32', lod_level=1))
-            for i in range(4):
-                multi_scores.append(fluid.layers.data(
-                    name='score_'+str(i), shape=[1], dtype='float32', lod_level=1))
-
-            fpn_rois = fluid.layers.collect_fpn_proposals(
-                multi_rois=multi_rois, 
-                multi_scores=multi_scores,
-                min_level=2, 
-                max_level=5, 
-                post_nms_top_n=2000)
-    """
-
-    helper = LayerHelper('collect_fpn_proposals', **locals())
-    dtype = helper.input_dtype('multi_rois')
-    num_lvl = max_level - min_level + 1
-    input_rois = multi_rois[:num_lvl]
-    input_scores = multi_scores[:num_lvl]
-    output_rois = helper.create_variable_for_type_inference(dtype)
-    output_rois.stop_gradient = True
-    helper.append_op(
-        type='collect_fpn_proposals',
-        inputs={
-            'MultiLevelRois': input_rois,
-            'MultiLevelScores': input_scores
-        },
-        outputs={'FpnRois': output_rois},
-        attrs={'post_nms_topN': post_nms_top_n})
-    return output_rois
diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py
deleted file mode 100644
index 78226a5201707a192b6fa38e11bfc243f5815a55..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/device.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-All util layers.
-"""
-
-from __future__ import print_function
-
-from .layer_function_generator import autodoc
-from ..framework import unique_name
-from ..layer_helper import LayerHelper
-from ..annotations import deprecated
-
-__all__ = []
-
-
-@deprecated(since='0.15.0', instead="ParallelExecutor")
-@autodoc()
-def get_places(device_count=None, device_type=None):
-    helper = LayerHelper('get_places', **locals())
-    out_places = helper.create_variable(
-        name=unique_name.generate_with_ignorable_key(helper.name + ".out"))
-    attrs = dict()
-    if device_count is not None:
-        attrs['device_count'] = int(device_count)
-    if device_type is not None:
-        attrs['device_type'] = str(device_type)
-
-    helper.append_op(
-        type='get_places', outputs={"Out": [out_places]}, attrs=attrs)
-
-    return out_places
diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py
deleted file mode 100644
index fd67706b995f9a23b8864df2b2d3357b8da18cfc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/distributions.py
+++ /dev/null
@@ -1,603 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import control_flow
-from . import tensor
-from . import ops
-from . import nn
-import math
-import numpy as np
-import warnings
-
-__all__ = ['Uniform', 'Normal', 'Categorical', 'MultivariateNormalDiag']
-
-
-class Distribution(object):
-    """
-    Distribution is the abstract base class for probability distributions.
-    """
-
-    def sample(self):
-        """Sampling from the distribution."""
-        raise NotImplementedError
-
-    def entropy(self):
-        """The entropy of the distribution."""
-        raise NotImplementedError
-
-    def kl_divergence(self, other):
-        """The KL-divergence between self distributions and other."""
-        raise NotImplementedError
-
-    def log_prob(self, value):
-        """Log probability density/mass function."""
-        raise NotImplementedError
-
-    def _validate_args(self, *args):
-        """
-        Argument validation for distribution args
-        Args:
-            value (float, list, numpy.ndarray, Variable)
-        Raises
-            ValueError: if one argument is Variable, all arguments should be Variable
-        """
-        is_variable = False
-        is_number = False
-        for arg in args:
-            if isinstance(arg, tensor.Variable):
-                is_variable = True
-            else:
-                is_number = True
-
-        if is_variable and is_number:
-            raise ValueError(
-                'if one argument is Variable, all arguments should be Variable')
-
-        return is_variable
-
-    def _to_variable(self, *args):
-        """
-        Argument convert args to Variable
-
-        Args:
-            value (float, list, numpy.ndarray, Variable)
-        Returns:
-            Variable of args.
-        """
-        numpy_args = []
-        variable_args = []
-        tmp = 0.
-
-        for arg in args:
-            valid_arg = False
-            for cls in [float, list, np.ndarray, tensor.Variable]:
-                if isinstance(arg, cls):
-                    valid_arg = True
-                    break
-            assert valid_arg, "type of input args must be float, list, numpy.ndarray or Variable."
-            if isinstance(arg, float):
-                arg = np.zeros(1) + arg
-            arg_np = np.array(arg)
-            arg_dtype = arg_np.dtype
-            if str(arg_dtype) not in ['float32']:
-                warnings.warn(
-                    "data type of argument only support float32, your argument will be convert to float32."
-                )
-                arg_np = arg_np.astype('float32')
-            tmp = tmp + arg_np
-            numpy_args.append(arg_np)
-
-        dtype = tmp.dtype
-        for arg in numpy_args:
-            arg_broadcasted, _ = np.broadcast_arrays(arg, tmp)
-            arg_variable = tensor.create_tensor(dtype=dtype)
-            tensor.assign(arg_broadcasted, arg_variable)
-            variable_args.append(arg_variable)
-
-        return tuple(variable_args)
-
-
-class Uniform(Distribution):
-    """Uniform distribution with `low` and `high` parameters.
-
-    Mathematical Details
-
-    The probability density function (pdf) is,
-
-    .. math::
-
-        pdf(x; a, b) = \\frac{1}{Z}, \ a <=x <b
-
-    .. math::
-
-        Z = b - a
-
-    In the above equation:
-
-    * :math:`low = a`,
-    * :math:`high = b`,
-    * :math:`Z`: is the normalizing constant.
-
-    The parameters `low` and `high` must be shaped in a way that supports
-    broadcasting (e.g., `high - low` is a valid operation).
-
-    Args:
-        low(float|list|numpy.ndarray|Variable): The lower boundary of uniform distribution.
-        high(float|list|numpy.ndarray|Variable): The higher boundary of uniform distribution.
-
-    Examples:
-        .. code-block:: python
-
-          from paddle.fluid import layers
-          from paddle.fluid.layers import Uniform
-
-          # Without broadcasting, a single uniform distribution [3, 4]:
-          u1 = Uniform(low=3.0, high=4.0)
-          # 2 distributions [1, 3], [2, 4]
-          u2 = Uniform(low=[1.0, 2.0],
-                        high=[3.0, 4.0])
-          # 4 distributions
-          u3 = Uniform(low=[[1.0, 2.0],
-                    [3.0, 4.0]],
-               high=[[1.5, 2.5],
-                     [3.5, 4.5]])
-
-          # With broadcasting:
-          u4 = Uniform(low=3.0, high=[5.0, 6.0, 7.0])
-
-          # Variable as input
-          dims = 3
-
-          low = layers.data(name='low', shape=[dims], dtype='float32')
-          high = layers.data(name='high', shape=[dims], dtype='float32')
-          values = layers.data(name='values', shape=[dims], dtype='float32')
-
-          uniform = Uniform(low, high)
-
-          sample = uniform.sample([2, 3])
-          entropy = uniform.entropy()
-          lp = uniform.log_prob(values)
-
-    """
-
-    def __init__(self, low, high):
-        self.all_arg_is_float = False
-        self.batch_size_unknown = False
-        if self._validate_args(low, high):
-            self.batch_size_unknown = True
-            self.low = low
-            self.high = high
-        else:
-            if isinstance(low, float) and isinstance(high, float):
-                self.all_arg_is_float = True
-            self.low, self.high = self._to_variable(low, high)
-
-    def sample(self, shape, seed=0):
-        """Generate samples of the specified shape.
-
-        Args:
-          shape (list): 1D `int32`. Shape of the generated samples.
-          seed (int): Python integer number.
-
-        Returns:
-          Variable: A tensor with prepended dimensions shape.
-
-        """
-        batch_shape = list((self.low + self.high).shape)
-        if self.batch_size_unknown:
-            output_shape = shape + batch_shape
-            zero_tmp = tensor.fill_constant_batch_size_like(
-                self.low + self.high, batch_shape + shape, self.low.dtype, 0.)
-            uniform_random_tmp = nn.uniform_random_batch_size_like(
-                zero_tmp, zero_tmp.shape, min=0., max=1., seed=seed)
-            output = uniform_random_tmp * (zero_tmp + self.high - self.low
-                                           ) + self.low
-            return nn.reshape(output, output_shape)
-        else:
-            output_shape = shape + batch_shape
-            output = ops.uniform_random(
-                output_shape, seed=seed) * (tensor.zeros(
-                    output_shape, dtype=self.low.dtype) +
-                                            (self.high - self.low)) + self.low
-            if self.all_arg_is_float:
-                return nn.reshape(output, shape)
-            else:
-                return output
-
-    def log_prob(self, value):
-        """Log probability density/mass function.
-
-        Args:
-          value (Variable): The input tensor.
-
-        Returns:
-          Variable: log probability.
-
-        """
-        lb_bool = control_flow.less_than(self.low, value)
-        ub_bool = control_flow.less_than(value, self.high)
-        lb = tensor.cast(lb_bool, dtype=value.dtype)
-        ub = tensor.cast(ub_bool, dtype=value.dtype)
-        return nn.log(lb * ub) - nn.log(self.high - self.low)
-
-    def entropy(self):
-        """Shannon entropy in nats.
-
-        Returns:
-          Variable: Shannon entropy of uniform distribution.
-
-        """
-        return nn.log(self.high - self.low)
-
-
-class Normal(Distribution):
-    """The Normal distribution with location `loc` and `scale` parameters.
-
-    Mathematical details
-
-    The probability density function (pdf) is,
-
-    .. math::
-
-        pdf(x; \mu, \sigma) = \\frac{1}{Z}e^{\\frac {-0.5 (x - \mu)^2}  {\sigma^2} }
-
-    .. math::
-
-        Z = (2 \pi \sigma^2)^{0.5}
-
-    In the above equation:
-
-    * :math:`loc = \mu`: is the mean.
-    * :math:`scale = \sigma`: is the std.
-    * :math:`Z`: is the normalization constant.
-
-    Args:
-        loc(float|list|numpy.ndarray|Variable): The mean of normal distribution.
-        scale(float|list|numpy.ndarray|Variable): The std of normal distribution.
-
-    Examples:
-        .. code-block:: python
-
-          from paddle.fluid import layers
-          from paddle.fluid.layers import Normal
-
-          # Define a single scalar Normal distribution.
-          dist = Normal(loc=0., scale=3.)
-          # Define a batch of two scalar valued Normals.
-          # The first has mean 1 and standard deviation 11, the second 2 and 22.
-          dist = Normal(loc=[1, 2.], scale=[11, 22.])
-          # Get 3 samples, returning a 3 x 2 tensor.
-          dist.sample([3])
-
-          # Define a batch of two scalar valued Normals.
-          # Both have mean 1, but different standard deviations.
-          dist = Normal(loc=1., scale=[11, 22.])
-
-          # Define a batch of two scalar valued Normals.
-          # Both have mean 1, but different standard deviations.
-          dist = Normal(loc=1., scale=[11, 22.])
-
-          # Variable as input
-          dims = 3
-
-          loc = layers.data(name='loc', shape=[dims], dtype='float32')
-          scale = layers.data(name='scale', shape=[dims], dtype='float32')
-          other_loc = layers.data(
-              name='other_loc', shape=[dims], dtype='float32')
-          other_scale = layers.data(
-              name='other_scale', shape=[dims], dtype='float32')
-          values = layers.data(name='values', shape=[dims], dtype='float32')
-
-          normal = Normal(loc, scale)
-          other_normal = Normal(other_loc, other_scale)
-
-          sample = normal.sample([2, 3])
-          entropy = normal.entropy()
-          lp = normal.log_prob(values)
-          kl = normal.kl_divergence(other_normal)
-    """
-
-    def __init__(self, loc, scale):
-        self.batch_size_unknown = False
-        self.all_arg_is_float = False
-        if self._validate_args(loc, scale):
-            self.batch_size_unknown = True
-            self.loc = loc
-            self.scale = scale
-        else:
-            if isinstance(loc, float) and isinstance(scale, float):
-                self.all_arg_is_float = True
-            self.loc, self.scale = self._to_variable(loc, scale)
-
-    def sample(self, shape, seed=0):
-        """Generate samples of the specified shape.
-
-        Args:
-          shape (list): 1D `int32`. Shape of the generated samples.
-          seed (int): Python integer number.
-
-        Returns:
-          Variable: A tensor with prepended dimensions shape.
-
-        """
-        batch_shape = list((self.loc + self.scale).shape)
-
-        if self.batch_size_unknown:
-            output_shape = shape + batch_shape
-            zero_tmp = tensor.fill_constant_batch_size_like(
-                self.loc + self.scale, batch_shape + shape, self.loc.dtype, 0.)
-            normal_random_tmp = nn.gaussian_random_batch_size_like(
-                zero_tmp, zero_tmp.shape, mean=0., std=1., seed=seed)
-            output = normal_random_tmp * (zero_tmp + self.scale) + self.loc
-            return nn.reshape(output, output_shape)
-        else:
-            output_shape = shape + batch_shape
-            output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed) * \
-                     (tensor.zeros(output_shape, dtype=self.loc.dtype) + self.scale) + self.loc
-            if self.all_arg_is_float:
-                return nn.reshape(output, shape)
-            else:
-                return output
-
-    def entropy(self):
-        """Shannon entropy in nats.
-
-        Returns:
-          Variable: Shannon entropy of normal distribution.
-
-        """
-        batch_shape = list((self.loc + self.scale).shape)
-        zero_tmp = tensor.fill_constant_batch_size_like(
-            self.loc + self.scale, batch_shape, self.loc.dtype, 0.)
-        return 0.5 + 0.5 * math.log(2 * math.pi) + nn.log(
-            (self.scale + zero_tmp))
-
-    def log_prob(self, value):
-        """Log probability density/mass function.
-
-        Args:
-          value (Variable): The input tensor.
-
-        Returns:
-          Variable: log probability.
-
-        """
-        var = self.scale * self.scale
-        log_scale = nn.log(self.scale)
-        return -1. * ((value - self.loc) * (value - self.loc)) / (
-            2. * var) - log_scale - math.log(math.sqrt(2. * math.pi))
-
-    def kl_divergence(self, other):
-        """The KL-divergence between two normal distributions.
-
-        Args:
-            other (Normal): instance of Normal.
-
-        Returns:
-            Variable: kl-divergence between two normal distributions.
-
-        """
-        assert isinstance(other, Normal), "another distribution must be Normal"
-        var_ratio = self.scale / other.scale
-        var_ratio = (var_ratio * var_ratio)
-        t1 = (self.loc - other.loc) / other.scale
-        t1 = (t1 * t1)
-        return 0.5 * (var_ratio + t1 - 1. - nn.log(var_ratio))
-
-
-class Categorical(Distribution):
-    """
-    Categorical distribution is a discrete probability distribution that 
-    describes the possible results of a random variable that can take on 
-    one of K possible categories, with the probability of each category 
-    separately specified.
-
-    Args:
-        logits(list|numpy.ndarray|Variable): The logits input of categorical distribution.
-
-    Examples:
-        .. code-block:: python
-
-          import numpy as np
-          from paddle.fluid import layers
-          from paddle.fluid.layers import Categorical
-
-          a_logits_npdata = np.array([-0.602,-0.602], dtype="float32")
-          a_logits_tensor = layers.create_tensor(dtype="float32")
-          layers.assign(a_logits_npdata, a_logits_tensor)
-
-          b_logits_npdata = np.array([-0.102,-0.112], dtype="float32")
-          b_logits_tensor = layers.create_tensor(dtype="float32")
-          layers.assign(b_logits_npdata, b_logits_tensor)
-          
-          a = Categorical(a_logits_tensor)
-          b = Categorical(b_logits_tensor)
-
-          a.entropy()
-          # [0.6931472] with shape: [1]
-
-          b.entropy()
-          # [0.6931347] with shape: [1]
-
-          a.kl_divergence(b)
-          # [1.2516975e-05] with shape: [1]
-
-    """
-
-    def __init__(self, logits):
-        """
-        Args:
-            logits: A float32 tensor
-        """
-        if self._validate_args(logits):
-            self.logits = logits
-        else:
-            self.logits = self._to_variable(logits)[0]
-
-    def kl_divergence(self, other):
-        """The KL-divergence between two Categorical distributions.
-
-        Args:
-            other (Categorical): instance of Categorical.
-
-        Returns:
-            Variable: kl-divergence between two Categorical distributions.
-
-        """
-        assert isinstance(other, Categorical)
-
-        logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True)
-        other_logits = other.logits - nn.reduce_max(
-            other.logits, dim=-1, keep_dim=True)
-        e_logits = ops.exp(logits)
-        other_e_logits = ops.exp(other_logits)
-        z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True)
-        other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True)
-        prob = e_logits / z
-        kl = nn.reduce_sum(
-            prob * (logits - nn.log(z) - other_logits + nn.log(other_z)),
-            dim=-1,
-            keep_dim=True)
-
-        return kl
-
-    def entropy(self):
-        """Shannon entropy in nats.
-
-        Returns:
-          Variable: Shannon entropy of Categorical distribution.
-
-        """
-        logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True)
-        e_logits = ops.exp(logits)
-        z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True)
-        prob = e_logits / z
-        entropy = -1.0 * nn.reduce_sum(
-            prob * (logits - nn.log(z)), dim=-1, keep_dim=True)
-
-        return entropy
-
-
-class MultivariateNormalDiag(Distribution):
-    """
-    A multivariate normal (also called Gaussian) distribution parameterized by a mean vector
-    and a covariance matrix.
-
-    Args:
-        loc(list|numpy.ndarray|Variable): The mean of multivariateNormal distribution.
-        scale(list|numpy.ndarray|Variable): The positive definite diagonal covariance matrix of
-        multivariateNormal distribution.
-
-    Examples:
-        .. code-block:: python
-    
-            import numpy as np
-            from paddle.fluid import layers
-            from paddle.fluid.layers import MultivariateNormalDiag
-
-            a_loc_npdata = np.array([0.3,0.5],dtype="float32")
-            a_loc_tensor = layers.create_tensor(dtype="float32")
-            layers.assign(a_loc_npdata, a_loc_tensor)
-
-
-            a_scale_npdata = np.array([[0.4,0],[0,0.5]],dtype="float32")
-            a_scale_tensor = layers.create_tensor(dtype="float32")
-            layers.assign(a_scale_npdata, a_scale_tensor)
-
-            b_loc_npdata = np.array([0.2,0.4],dtype="float32")
-            b_loc_tensor = layers.create_tensor(dtype="float32")
-            layers.assign(b_loc_npdata, b_loc_tensor)
-
-            b_scale_npdata = np.array([[0.3,0],[0,0.4]],dtype="float32")
-            b_scale_tensor = layers.create_tensor(dtype="float32")
-            layers.assign(b_scale_npdata, b_scale_tensor)
-
-            a = MultivariateNormalDiag(a_loc_tensor, a_scale_tensor)
-            b = MultivariateNormalDiag(b_loc_tensor, b_scale_tensor)
-            
-            a.entropy()
-            # [2.033158] with shape: [1]
-            b.entropy()
-            # [1.7777451] with shaoe: [1]
-
-            a.kl_divergence(b)
-            # [0.06542051] with shape: [1]
-       
-    """
-
-    def __init__(self, loc, scale):
-        if self._validate_args(loc, scale):
-            self.loc = loc
-            self.scale = scale
-        else:
-            self.loc, self.scale = self._to_variable(loc, scale)
-
-    def _det(self, value):
-
-        batch_shape = list(value.shape)
-        one_all = tensor.ones(shape=batch_shape, dtype=self.loc.dtype)
-        one_diag = tensor.diag(
-            tensor.ones(
-                shape=[batch_shape[0]], dtype=self.loc.dtype))
-        det_diag = nn.reduce_prod(value + one_all - one_diag)
-
-        return det_diag
-
-    def _inv(self, value):
-
-        batch_shape = list(value.shape)
-        one_all = tensor.ones(shape=batch_shape, dtype=self.loc.dtype)
-        one_diag = tensor.diag(
-            tensor.ones(
-                shape=[batch_shape[0]], dtype=self.loc.dtype))
-        inv_diag = nn.elementwise_pow(value, (one_all - 2 * one_diag))
-
-        return inv_diag
-
-    def entropy(self):
-        """Shannon entropy in nats.
-
-        Returns:
-          Variable: Shannon entropy of Multivariate Normal distribution.
-
-        """
-        entropy = 0.5 * (
-            self.scale.shape[0] *
-            (1.0 + math.log(2 * math.pi)) + nn.log(self._det(self.scale)))
-
-        return entropy
-
-    def kl_divergence(self, other):
-        """The KL-divergence between two Multivariate Normal distributions.
-
-        Args:
-            other (MultivariateNormalDiag): instance of Multivariate Normal.
-
-        Returns:
-            Variable: kl-divergence between two Multivariate Normal distributions.
-
-        """
-        assert isinstance(other, MultivariateNormalDiag)
-
-        tr_cov_matmul = nn.reduce_sum(self._inv(other.scale) * self.scale)
-        loc_matmul_cov = nn.matmul((other.loc - self.loc),
-                                   self._inv(other.scale))
-        tri_matmul = nn.matmul(loc_matmul_cov, (other.loc - self.loc))
-        k = list(self.scale.shape)[0]
-        ln_cov = nn.log(self._det(other.scale)) - nn.log(self._det(self.scale))
-        kl = 0.5 * (tr_cov_matmul + tri_matmul - k + ln_cov)
-
-        return kl
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
deleted file mode 100644
index 39ce712081bc1ed275a0287b9bded90d2217b213..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/io.py
+++ /dev/null
@@ -1,883 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from ..wrapped_decorator import signature_safe_contextmanager
-import multiprocessing
-import os
-import six
-import sys
-import threading
-
-from ..data_feeder import DataFeeder
-from .control_flow import BlockGuard
-from .layer_function_generator import templatedoc
-from .. import core
-from ..executor import global_scope
-from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
-    default_startup_program, program_guard, Program, Variable
-from ..layer_helper import LayerHelper
-from ..unique_name import generate as unique_name
-import logging
-
-__all__ = [
-    'data', 'read_file', 'double_buffer', 'py_reader',
-    'create_py_reader_by_data', 'load'
-]
-
-
-def data(name,
-         shape,
-         append_batch_size=True,
-         dtype='float32',
-         lod_level=0,
-         type=core.VarDesc.VarType.LOD_TENSOR,
-         stop_gradient=True):
-    """
-    **Data Layer**
-
-    This function takes in the input and based on whether data has
-    to be returned back as a minibatch, it creates the global variable by using
-    the helper functions. The global variables can be accessed by all the
-    following operators in the graph.
-
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
-
-    Notice that paddle would only use :code:`shape` to infer the shapes of 
-    following variables in the network during compile-time. During run-time, 
-    paddle would not check whether the shape of the feeded data matches the 
-    :code:`shape` settings in this function. 
-
-    Args:
-       name(str): The name/alias of the function
-       shape(list): Tuple declaring the shape. If :code:`append_batch_size` is 
-                    True and there is no -1 inside :code:`shape`, it should be 
-                    considered as the shape of the each sample. Otherwise, it
-                    should be considered as the shape of the batched data.  
-       append_batch_size(bool):
-          1. If true, it prepends -1 to the shape.
-            For example if shape=[1], the resulting shape is [-1, 1]. This will 
-            be useful to set different batch size at run time.
-          2. If shape contains -1, such as shape=[1, -1].
-            append_batch_size will be enforced to be be False (ineffective)
-            because PaddlePaddle cannot set more than 1 unknown number on the
-            shape.
-       dtype(np.dtype|VarType|str): The type of data : float32, float16, int etc
-       type(VarType): The output type. By default it is LOD_TENSOR.
-       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
-       stop_gradient(bool): A boolean that mentions whether gradient should flow.
-
-    Returns:
-        Variable: The global variable that gives access to the data.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='x', shape=[784], dtype='float32')
-    """
-    helper = LayerHelper('data', **locals())
-    shape = list(shape)
-    for i in six.moves.range(len(shape)):
-        if shape[i] is None:
-            shape[i] = -1
-            append_batch_size = False
-        elif shape[i] < 0:
-            append_batch_size = False
-
-    if append_batch_size:
-        shape = [-1] + shape  # append batch size as -1
-
-    data_var = helper.create_global_variable(
-        name=name,
-        shape=shape,
-        dtype=dtype,
-        type=type,
-        stop_gradient=stop_gradient,
-        lod_level=lod_level,
-        is_data=True)
-    return data_var
-
-
-class BlockGuardServ(BlockGuard):
-    """
-    BlockGuardServ class.
-
-    BlockGuardServ class is used to create an op with a block in a program.
-    """
-
-    def __init__(self, server):
-        if not (isinstance(server, ListenAndServ)):
-            raise TypeError("BlockGuardServ takes a ListenAndServ")
-        super(BlockGuardServ, self).__init__(server.helper.main_program)
-        self.server = server
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-
-        self.server.complete_op()
-        return super(BlockGuardServ, self).__exit__(exc_type, exc_val, exc_tb)
-
-
-class ListenAndServ(object):
-    """
-    **ListenAndServ Layer**
-
-    ListenAndServ is used to create a rpc server bind and listen
-    on specific TCP port, this server will run the sub-block when
-    received variables from clients.
-
-    Args:
-        endpoint(string): IP:port string which the server will listen on.
-        inputs(list): a list of variables that the server will get from clients.
-        fan_in(int): how many client are expected to report to this server, default: 1.
-        optimizer_mode(bool): whether to run the server as a parameter server, default: True.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            with fluid.program_guard(main):
-                serv = layers.ListenAndServ(
-                    "127.0.0.1:6170", ["X"], optimizer_mode=False)
-                with serv.do():
-                    x = layers.data(
-                        shape=[32, 32],
-                        dtype='float32',
-                        name="X",
-                        append_batch_size=False)
-                    fluid.initializer.Constant(value=1.0)(x, main.global_block())
-                    layers.scale(x=x, scale=10.0, out=out_var)
-
-            exe = fluid.Executor(place)
-            exe.run(main)
-    """
-
-    def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True):
-        self.helper = LayerHelper("listen_and_serv")
-        self.inputs = inputs
-        self.outputs = []
-        self.endpoint = endpoint
-        self.fan_in = fan_in
-        # FIXME(typhoonzero): add optimizer_mode is stupid, should make it more
-        # general.
-        self.optimizer_mode = optimizer_mode
-
-    def do(self):
-        return BlockGuardServ(self)
-
-    def get_params_and_grads(self):
-        main_program = self.helper.main_program
-        current_block = main_program.current_block()
-        parent_block = self.parent_block()
-        # params and grads in the same order.
-        params = list()
-        grads = list()
-        for op in current_block.ops:
-            # FIXME(typhoonzero): op.inputs is None if it's cloned.
-            if self.optimizer_mode:
-                if "Grad" in op.inputs and "Param" in op.inputs:
-                    params.append(op.inputs["Param"].name)
-                    grads.append(op.inputs["Grad"].name)
-            else:
-                # simple recv mode, recv operators inputs.
-                for iname in op.input_names:
-                    for in_var_name in op.input(iname):
-                        params.append(parent_block.var(in_var_name))
-                        grads.append(parent_block.var(in_var_name))
-
-        return params, grads
-
-    def parent_block(self):
-        prog = self.helper.main_program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-        return parent_block
-
-    def complete_op(self):
-        main_program = self.helper.main_program
-        current_block = main_program.current_block()
-        parent_block = self.parent_block()
-
-        parent_block.append_op(
-            type='listen_and_serv',
-            inputs={"X": self.inputs},
-            outputs={},
-            attrs={
-                'endpoint': self.endpoint,
-                'Fanin': self.fan_in,
-                'optimize_blocks': [
-                    current_block
-                ],  # did not support multiple optimize blocks in layers
-                'sync_mode': True,  # did not support async now in layers
-                'grad_to_block_id': [""]
-            })
-
-
-def Send(endpoints, send_vars, dummy_output=None, sync=True):
-    """
-    Send variables to the server side, and get vars from server
-    side when server have finished running server side program.
-
-    Args:
-        endpoints (str): comma seperated IP:PORT pairs in the order
-                   of send_vars to send
-        send_vars (list): variables to send to server
-        sync (bool): whether to wait the request finish
-
-    """
-    assert (type(send_vars) == list)
-
-    if dummy_output is None:
-        dummy_output = []
-    elif isinstance(dummy_output, Variable):
-        dummy_output = [dummy_output]
-
-    assert (type(dummy_output) == list)
-
-    epmap = endpoints.split(",")
-    endpoints = list(set(epmap))
-
-    helper = LayerHelper("Send", **locals())
-    rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
-
-    helper.append_op(
-        type="send",
-        inputs={"X": send_vars},
-        outputs={"Out": dummy_output},
-        attrs={
-            "endpoints": endpoints,
-            "epmap": epmap,
-            rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
-        })
-    if sync:
-        helper.append_op(
-            type="send_barrier",
-            inputs={"X": dummy_output},
-            outputs={"Out": []},
-            attrs={"endpoints": endpoints})
-
-
-def Recv(endpoints, get_vars, dummy_input=None, sync=True):
-    """
-    Receive variables from server side
-
-    Args:
-        endpoints (str): comma seperated IP:PORT pairs in the order
-                   of send_vars to send
-        get_vars (list): vars to get from server after send completes.
-        sync (bool): whether to wait the request finish
-
-    Returns:
-        list: list of received variables
-    """
-    assert (type(get_vars) == list)
-
-    if dummy_input is None:
-        dummy_input = []
-    elif isinstance(dummy_input, Variable):
-        dummy_input = [dummy_input]
-
-    assert (type(dummy_input) == list)
-
-    epmap = endpoints.split(",")
-    endpoints = list(set(epmap))
-
-    helper = LayerHelper("Recv", **locals())
-    helper.append_op(
-        type="recv",
-        inputs={"X": dummy_input},
-        outputs={"Out": get_vars},
-        attrs={"endpoints": endpoints,
-               "epmap": epmap})
-    if sync:
-        helper.append_op(
-            type="fetch_barrier",
-            outputs={"Out": get_vars},
-            attrs={"endpoints": endpoints})
-    return get_vars
-
-
-def monkey_patch_reader_methods(reader):
-    def __get_reader__():
-        scope = global_scope()
-        var = scope.find_var(reader.name)
-        return var.get_reader()
-
-    def reset():
-        return __get_reader__().reset()
-
-    reader.reset = reset
-    reader.stop_gradient = True
-    reader.persistable = True
-    return reader
-
-
-def _copy_reader_var_(block, var):
-    new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
-    new_var.desc.set_shapes(var.desc.shapes())
-    new_var.desc.set_dtypes(var.desc.dtypes())
-    new_var.desc.set_lod_levels(var.desc.lod_levels())
-    new_var.persistable = True
-    return new_var
-
-
-def _copy_reader_create_op_(block, op):
-    input_param_names = op.input_names
-    new_input_map = {}
-    for param_name in input_param_names:
-        new_input_map[param_name] = []
-        arg_names = op.input(param_name)
-        for arg_name in arg_names:
-            new_input_map[param_name].append(block.var(arg_name))
-
-    output_param_names = op.output_names
-    new_output_map = {}
-    for param_name in output_param_names:
-        new_output_map[param_name] = []
-        arg_names = op.output(param_name)
-        for arg_name in arg_names:
-            new_output_map[param_name].append(block.var(arg_name))
-
-    new_op = block.append_op(
-        type=op.type,
-        inputs=new_input_map,
-        outputs=new_output_map,
-        attrs=op.all_attrs())
-    return new_op
-
-
-def _py_reader(capacity,
-               shapes,
-               dtypes,
-               lod_levels=None,
-               name=None,
-               use_double_buffer=True,
-               feed_list=None):
-
-    if feed_list is not None:
-        if not isinstance(feed_list, list):
-            raise TypeError("feed_list should be a list of Variable"
-                            " instead of " + str(type(feed_list)))
-        lod_levels = []
-        dtypes = []
-        shape_concat = []
-        ranks = []
-        shapes = []
-
-        for feed_data in feed_list:
-            dtypes.append(feed_data.dtype)
-            shape_concat.extend(feed_data.shape)
-            ranks.append(len(feed_data.shape))
-            shapes.append(feed_data.shape)
-            lod_levels.append(feed_data.lod_level)
-    else:
-        dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
-        shape_concat = []
-        ranks = []
-
-        for shape in shapes:
-            shape_concat.extend(shape)
-            ranks.append(len(shape))
-
-        if lod_levels is None:
-            lod_levels = [0] * len(shapes)
-
-    if name is None:
-        queue_name = unique_name('lod_tensor_blocking_queue')
-        reader_name = unique_name('create_py_reader')
-        double_buffer_name = unique_name('double_buffer')
-    else:
-        queue_name = "_".join([name, "queue"])
-        reader_name = "_".join([name, "reader"])
-        double_buffer_name = "_".join([name, "double_buffer"])
-
-    var = global_scope().var(queue_name)
-    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)
-
-    startup_blk = default_startup_program().current_block()
-    startup_var = startup_blk.create_var(name=reader_name)
-    startup_blk.append_op(
-        type='create_py_reader',
-        inputs={'blocking_queue': [queue_name]},
-        outputs={'Out': [startup_var]},
-        attrs={
-            'shape_concat': shape_concat,
-            'lod_levels': lod_levels,
-            'ranks': ranks
-        })
-
-    startup_var.desc.set_dtypes(dtypes)
-    startup_var.persistable = True
-
-    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
-                                      startup_var)
-
-    reader = monkey_patch_reader_methods(main_prog_var)
-    if use_double_buffer:
-        double_buffer_reader = double_buffer(reader, name=double_buffer_name)
-        # we return a double buffer reader. However, the reset method comes from
-        # py_reader.
-        double_buffer_reader.reset = reader.reset
-        reader = double_buffer_reader
-
-    # monkey patch py_reader special methods
-    reader.queue = feed_queue
-    current_reset_method = reader.reset
-    reader.thread = None
-    reader.tensor_provider = None
-    reader.exited = False
-
-    def start_provide_thread(func):
-        def __provider_thread__():
-            try:
-                for tensors in func():
-                    array = core.LoDTensorArray()
-                    for item in tensors:
-                        if not isinstance(item, core.LoDTensor):
-                            tmp = core.LoDTensor()
-                            tmp.set(item, core.CPUPlace())
-                            item = tmp
-
-                        array.append(item)
-
-                    if reader.exited:
-                        break
-                    feed_queue.push(array)
-                    if reader.exited:
-                        break
-                feed_queue.close()
-            except Exception as ex:
-                feed_queue.close()
-                logging.warn('Your decorated reader has raised an exception!')
-                six.reraise(*sys.exc_info())
-
-        reader.thread = threading.Thread(target=__provider_thread__)
-        reader.thread.daemon = True
-        reader.thread.start()
-
-    def __set_tensor_provider__(func):
-        reader.tensor_provider = func
-
-    def __set_paddle_reader__(paddle_reader):
-        with program_guard(Program(), Program()):
-            actual_feed_list = feed_list
-            if actual_feed_list is None:
-                actual_feed_list = []
-                counter = 0
-                for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels):
-                    name = str(counter)
-                    actual_feed_list.append(
-                        data(
-                            name=name,
-                            dtype=dtype,
-                            shape=shape,
-                            lod_level=lod_level))
-                    counter += 1
-
-            data_names = [feed_data.name for feed_data in actual_feed_list]
-            feeder = DataFeeder(
-                feed_list=actual_feed_list, place=core.CPUPlace())
-            paddle_reader = feeder.decorate_reader(
-                paddle_reader, multi_devices=False)
-
-        def __tensor_provider__():
-            for slots in paddle_reader():
-                yield [slots[data_name] for data_name in data_names]
-
-        __set_tensor_provider__(__tensor_provider__)
-
-    def __reset__():
-        current_reset_method()
-        if reader.thread is not None and reader.tensor_provider is not None:
-            reader.exited = True
-            reader.thread.join()
-            reader.exited = False
-
-    def __start__():
-        start_provide_thread(reader.tensor_provider)
-
-    reader.reset = __reset__
-    reader.decorate_tensor_provider = __set_tensor_provider__
-    reader.decorate_paddle_reader = __set_paddle_reader__
-
-    reader.decorate_batch_generator = __set_tensor_provider__
-    reader.decorate_sample_list_generator = __set_paddle_reader__
-    reader.start = __start__
-
-    return reader
-
-
-def py_reader(capacity,
-              shapes,
-              dtypes,
-              lod_levels=None,
-              name=None,
-              use_double_buffer=True):
-    """
-    Create a Python reader for data feeding in Python
-
-    This layer returns a Reader Variable.
-    The Reader provides :code:`decorate_paddle_reader()` and
-    :code:`decorate_tensor_provider()` to set a Python generator as the data
-    source. More details :ref:`user_guide_use_py_reader_en` .  When
-    :code:`Executor::Run()` is invoked in C++ side, the data from the generator
-    would be read automatically. Unlike :code:`DataFeeder.feed()`, the data
-    reading process and :code:`Executor::Run()` process can run in parallel
-    using :code:`py_reader`. The :code:`start()` method of the Reader should be
-    called when each pass begins, while the :code:`reset()` method should be
-    called when the pass ends and :code:`fluid.core.EOFException` raises.
-    Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.
-
-    Args:
-       capacity(int): The buffer capacity maintained by :code:`py_reader`.
-       shapes(list|tuple): List of tuples which declaring data shapes.
-       dtypes(list|tuple): List of strs which declaring data type.
-       lod_levels(list|tuple): List of ints which declaring data lod_level.
-       name(basestring): The prefix Python queue name and Reader name. None will
-            be generated automatically.
-       use_double_buffer(bool): Whether use double buffer or not.
-
-    Returns:
-       Variable: A Reader from which we can get feeding data.
-
-    Examples:
-       1. The basic usage of :code:`py_reader` is as follows:
-       
-       .. code-block:: python
-    
-         import paddle
-         import paddle.fluid as fluid
-         import paddle.dataset.mnist as mnist
-
-         def network(image, label):
-             # user defined network, here a softmax regresssion example
-             predict = fluid.layers.fc(input=image, size=10, act='softmax')
-             return fluid.layers.cross_entropy(input=predict, label=label)
-
-         reader = fluid.layers.py_reader(capacity=64,
-                                         shapes=[(-1, 1, 28, 28), (-1, 1)],
-                                         dtypes=['float32', 'int64'])
-         reader.decorate_paddle_reader(
-             paddle.reader.shuffle(paddle.batch(mnist.train(), batch_size=5),
-                                   buf_size=1000))
-
-         img, label = fluid.layers.read_file(reader)
-         loss = network(img, label)
-
-         fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())
-         exe = fluid.ParallelExecutor(use_cuda=True)
-         for epoch_id in range(10):
-             reader.start()
-             try:
-                 while True:
-                     exe.run(fetch_list=[loss.name])
-             except fluid.core.EOFException:
-                 reader.reset()
-
-         fluid.io.save_inference_model(dirname='./model',
-                                       feeded_var_names=[img.name, label.name],
-                                       target_vars=[loss],
-                                       executor=fluid.Executor(fluid.CUDAPlace(0)))
-
-       2. When training and testing are both performed, two different
-       :code:`py_reader` should be created with different names, e.g.:
-
-       .. code-block:: python
-    
-         import paddle
-         import paddle.fluid as fluid
-         import paddle.dataset.mnist as mnist
-
-         def network(reader):
-             img, label = fluid.layers.read_file(reader)
-             # User defined network. Here a simple regression as example
-             predict = fluid.layers.fc(input=img, size=10, act='softmax')
-             loss = fluid.layers.cross_entropy(input=predict, label=label)
-             return fluid.layers.mean(loss)
-
-         # Create train_main_prog and train_startup_prog
-         train_main_prog = fluid.Program()
-         train_startup_prog = fluid.Program()
-         with fluid.program_guard(train_main_prog, train_startup_prog):
-             # Use fluid.unique_name.guard() to share parameters with test program
-             with fluid.unique_name.guard():
-                 train_reader = fluid.layers.py_reader(capacity=64,
-                                                       shapes=[(-1, 1, 28, 28),
-                                                               (-1, 1)],
-                                                       dtypes=['float32', 'int64'],
-                                                       name='train_reader')
-                 train_reader.decorate_paddle_reader(
-                     paddle.reader.shuffle(paddle.batch(mnist.train(), batch_size=5),
-                                           buf_size=500))
-                 train_loss = network(train_reader)  # some network definition
-                 adam = fluid.optimizer.Adam(learning_rate=0.01)
-                 adam.minimize(train_loss)
-
-         # Create test_main_prog and test_startup_prog
-         test_main_prog = fluid.Program()
-         test_startup_prog = fluid.Program()
-         with fluid.program_guard(test_main_prog, test_startup_prog):
-             # Use fluid.unique_name.guard() to share parameters with train program
-             with fluid.unique_name.guard():
-                 test_reader = fluid.layers.py_reader(capacity=32,
-                                                      shapes=[(-1, 1, 28, 28), (-1, 1)],
-                                                      dtypes=['float32', 'int64'],
-                                                      name='test_reader')
-                 test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
-                 test_loss = network(test_reader)
-
-         fluid.Executor(fluid.CUDAPlace(0)).run(train_startup_prog)
-         fluid.Executor(fluid.CUDAPlace(0)).run(test_startup_prog)
-
-         train_exe = fluid.ParallelExecutor(use_cuda=True,
-                                            loss_name=train_loss.name,
-                                            main_program=train_main_prog)
-         test_exe = fluid.ParallelExecutor(use_cuda=True,
-                                           loss_name=test_loss.name,
-                                           main_program=test_main_prog)
-         for epoch_id in range(10):
-             train_reader.start()
-             try:
-                 while True:
-                    train_exe.run(fetch_list=[train_loss.name])
-             except fluid.core.EOFException:
-                 train_reader.reset()
-
-         test_reader.start()
-         try:
-             while True:
-                 test_exe.run(fetch_list=[test_loss.name])
-         except fluid.core.EOFException:
-             test_reader.reset()
-    """
-    logging.warn(
-        'paddle.fluid.layers.py_reader() may be deprecated in the near future. '
-        'Please use paddle.fluid.io.PyReader() instead.')
-    return _py_reader(
-        capacity=capacity,
-        shapes=shapes,
-        dtypes=dtypes,
-        lod_levels=lod_levels,
-        name=name,
-        use_double_buffer=use_double_buffer)
-
-
-def create_py_reader_by_data(capacity,
-                             feed_list,
-                             name=None,
-                             use_double_buffer=True):
-    """
-    Create a Python reader for data feeding in Python
-
-    This layer returns a Reader Variable.
-
-    Works much like py_reader except that it's input is feed_list
-    instead of shapes, dtypes and lod_levels
-
-    Args:
-       capacity(int): The buffer capacity maintained by :code:`py_reader`.
-       feed_list(list(Variable)): The data feed list.
-       name(basestring): The prefix Python queue name and Reader name. None will
-            be generated automatically.
-       use_double_buffer(bool): Whether use double buffer or not.
-
-    Returns:
-       Variable: A Reader from which we can get feeding data.
-
-    Examples:
-       .. code-block:: python
-
-         import paddle
-         import paddle.fluid as fluid
-         import paddle.dataset.mnist as mnist
-         import paddle.fluid.compiler as compiler
-
-         def network(img, label):
-             # User defined network. Here a simple regression as example
-             predict = fluid.layers.fc(input=img, size=10, act='softmax')
-             loss = fluid.layers.cross_entropy(input=predict, label=label)
-             return fluid.layers.mean(loss)
-
-         MEMORY_OPT = False
-         USE_CUDA = False
-
-         image = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
-         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-         reader = fluid.layers.create_py_reader_by_data(capacity=64,
-                                                        feed_list=[image, label])
-         reader.decorate_paddle_reader(
-             paddle.reader.shuffle(paddle.batch(mnist.train(), batch_size=5),
-                                   buf_size=500))
-
-         img, label = fluid.layers.read_file(reader)
-         loss = network(img, label)  # some network definition
-
-         place = fluid.CUDAPlace(0) if USE_CUDA else fluid.CPUPlace()
-         exe = fluid.Executor(place)
-         exe.run(fluid.default_startup_program())
-
-         build_strategy = fluid.BuildStrategy()
-         build_strategy.memory_optimize = True if MEMORY_OPT else False
-         compiled_prog = compiler.CompiledProgram(
-             fluid.default_main_program()).with_data_parallel(
-                 loss_name=loss.name,
-                 build_strategy=build_strategy,
-                 exec_strategy=exec_strategy)
-
-         for epoch_id in range(2):
-             reader.start()
-             try:
-                 while True:
-                     exe.run(compiled_prog, fetch_list=[loss.name])
-             except fluid.core.EOFException:
-                 reader.reset()
-    """
-    return _py_reader(
-        capacity=capacity,
-        shapes=None,
-        dtypes=None,
-        lod_levels=None,
-        name=name,
-        use_double_buffer=use_double_buffer,
-        feed_list=feed_list)
-
-
-def __create_shared_decorated_reader__(op_type, reader, attrs):
-    var_name = unique_name(op_type)
-    startup_blk = default_startup_program().current_block()
-    startup_var = startup_blk.create_var(name=var_name)
-    startop_op = startup_blk.append_op(
-        type=op_type,
-        inputs={'UnderlyingReader': reader},
-        outputs={'Out': [startup_var]},
-        attrs=attrs)
-    startup_var.persistable = True
-    main_prog_block = default_main_program().current_block()
-    main_prog_var = _copy_reader_var_(main_prog_block, startup_var)
-    _copy_reader_create_op_(main_prog_block, startop_op)
-    return monkey_patch_reader_methods(main_prog_var)
-
-
-def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
-    new_reader_name = name if name is not None else unique_name(op_type)
-    main_blk = default_main_program().current_block()
-    new_reader = main_blk.create_var(name=new_reader_name)
-    main_blk.append_op(
-        type=op_type,
-        inputs={'UnderlyingReader': reader},
-        outputs={'Out': [new_reader]},
-        attrs=attrs)
-    return monkey_patch_reader_methods(new_reader)
-
-
-def double_buffer(reader, place=None, name=None):
-    """
-    Wrap a double buffer reader. The data will copy to target place with a
-    double buffer queue. If the target place is None, the place that executor
-    perform on will be used.
-
-    Args:
-        reader(Variable): the reader variable need to be wrapped.
-        place(Place): the place of target data. Default is the sample place of
-            executor perform.
-
-        name(str): Variable name. None if the user does not care.
-
-    Returns:
-        wrapped reader with double buffer.
-
-    Examples:
-        .. code-block:: python
-          
-           import paddle.fluid as fluid
-           reader = fluid.layers.py_reader(capacity=64,
-                                           shapes=[(-1, 1, 28, 28), (-1, 1)],
-                                           dtypes=['float32', 'int64'],
-                                           use_double_buffer=False)
-           reader = fluid.layers.double_buffer(reader)
-           image, label = fluid.layers.read_file(reader)
-    """
-    attrs = dict()
-    if place is not None:
-        attrs['place'] = str(place).upper()
-    return __create_unshared_decorated_reader__(
-        'create_double_buffer_reader', reader, attrs, name=name)
-
-
-def read_file(reader):
-    """
-    Execute the given reader and get data via it.
-
-    A reader is also a Variable. It can be a raw reader generated by
-    `fluid.layers.open_files()` or a decorated one generated by
-    `fluid.layers.double_buffer()` and so on.
-
-    Args:
-
-        reader(Variable): The reader to execute.
-
-    Returns:
-        Tuple[Variable]: Data read via the given reader.
-
-    Examples:
-        .. code-block:: python
-          
-           import paddle.fluid as fluid
-           reader = fluid.layers.py_reader(capacity=64,
-                                           shapes=[(-1, 1, 28, 28), (-1, 1)],
-                                           dtypes=['float32', 'int64'])
-           image, label = fluid.layers.read_file(reader)
-    """
-    helper = LayerHelper('read_file')
-    out = [
-        helper.create_variable_for_type_inference(
-            stop_gradient=True, dtype='float32')
-        for _ in range(len(reader.desc.shapes()))
-    ]
-    helper.append_op(
-        type='read', inputs={'Reader': [reader]}, outputs={'Out': out})
-    if len(out) == 1:
-        return out[0]
-    else:
-        return out
-
-
-@templatedoc()
-def load(out, file_path, load_as_fp16=None):
-    """
-    ${comment}
-
-    >>> import paddle.fluid as fluid
-    >>> tmp_tensor = fluid.layers.create_tensor(dtype='float32')
-    >>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin")
-
-    Args:
-        out(${out_type}): ${out_comment}.
-
-        file_path(${file_path_type}): ${file_path_comment}.
-
-        load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}.
-
-    Returns:
-        None
-    """
-    helper = LayerHelper("load", **locals())
-    attrs = {"file_path": file_path}
-    if load_as_fp16 is not None:
-        attrs['load_as_fp16'] = load_as_fp16
-    helper.append_op(type="load", inputs={}, output={"Out": out}, attrs=attrs)
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
deleted file mode 100755
index 2c982dc26d66d4df9bdc28e7435c37ccfe7b14d7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ /dev/null
@@ -1,349 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import re
-import functools
-import warnings
-import string
-
-from six.moves import cStringIO
-from ..proto import framework_pb2
-from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_
-from ..layer_helper import LayerHelper
-
-__all__ = [
-    'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
-    'templatedoc'
-]
-
-
-def _convert_(name):
-    """
-    Formatting.
-
-    Args:
-       name: The name/alias
-
-    This function takes in a name and converts it to a standard format of
-    group1_group2. Where as per the regular expression, group1 can have
-    alphabets and numbers and group2 has capital alphabets.
-
-    """
-    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
-    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
-
-
-def _type_to_str_(tp):
-    return framework_pb2.AttrType.Name(tp)
-
-
-_two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$")
-_single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$")
-_two_bang_pattern_ = re.compile(r"!!([^!]+)!!")
-
-
-def escape_math(text):
-    return _two_bang_pattern_.sub(
-        r'$$\1$$',
-        _single_dollar_pattern_.sub(r':math:`\1`',
-                                    _two_dollar_pattern_.sub(r"!!\1!!", text)))
-
-
-def _generate_doc_string_(op_proto, additional_args_lines=None):
-    """
-    Generate docstring by OpProto
-
-    Args:
-        op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
-
-    Returns:
-        str: the document string
-    """
-
-    if not isinstance(op_proto, framework_pb2.OpProto):
-        raise TypeError("OpProto should be `framework_pb2.OpProto`")
-
-    buf = cStringIO()
-    buf.write(escape_math(op_proto.comment))
-    buf.write('\nArgs:\n')
-    for each_input in op_proto.inputs:
-        line_begin = '    {0}: '.format(_convert_(each_input.name))
-        buf.write(line_begin)
-        buf.write(escape_math(each_input.comment))
-        if each_input.duplicable:
-            buf.write("  Duplicatable.")
-        if each_input.dispensable:
-            buf.write("  Optional.")
-        buf.write('\n')
-
-    skip_attrs = OpProtoHolder.generated_op_attr_names()
-    # attr use_mkldnn and is_test also should not be visible to users.
-    skip_attrs.add("use_mkldnn")
-    skip_attrs.add("is_test")
-    skip_attrs.add("use_cudnn")
-    for each_attr in op_proto.attrs:
-        if each_attr.name in skip_attrs:
-            continue
-        buf.write('    ')
-        buf.write(each_attr.name)
-        buf.write(' (')
-        buf.write(_type_to_str_(each_attr.type))
-        buf.write('): ')
-        buf.write(escape_math(each_attr.comment))
-        buf.write('\n')
-
-    if additional_args_lines is not None:
-        for line in additional_args_lines:
-            line = line.strip()
-            buf.write('    ')
-            buf.write(line)
-            buf.write('\n')
-
-    if len(op_proto.outputs) != 0:
-        buf.write('\nReturns:\n')
-        buf.write('    ')
-        for each_opt in op_proto.outputs:
-            if not each_opt.intermediate:
-                break
-        buf.write(escape_math(each_opt.comment))
-
-    return buf.getvalue()
-
-
-def generate_layer_fn(op_type):
-    """Register the Python layer for an Operator.
-
-    Args:
-       op_type: The name of the operator to be created.
-
-    This function takes in the operator type (sigmoid, mean , average etc) and
-    creates the operator functionality.
-
-    """
-    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
-    not_intermediate_outputs = \
-        [output for output in op_proto.outputs if not output.intermediate]
-    intermediate_outputs = \
-        [output for output in op_proto.outputs if output.intermediate]
-
-    if len(not_intermediate_outputs) != 1:
-        raise ValueError("Only one non intermediate output operator can be",
-                         "automatically generated. {0}".format(op_type))
-
-    if not_intermediate_outputs[0].duplicable:
-        raise ValueError(
-            "Only non duplicable op can be automatically generated.")
-
-    for output in intermediate_outputs:
-        if output.duplicable:
-            raise ValueError("The op can be automatically generated only when ",
-                             "all intermediate ops are not duplicable.")
-
-    o_name = not_intermediate_outputs[0].name
-    intermediate_output_names = [output.name for output in intermediate_outputs]
-
-    def infer_and_check_dtype(op_proto, *args, **kwargs):
-        """
-        This function performs the sanity check for dtype and
-        instance type.
-        """
-        dtype = None
-        for ipt in op_proto.inputs:
-            name = _convert_(ipt.name)
-            val = kwargs.pop(name, [])
-            if not isinstance(val, list) and not isinstance(val, tuple):
-                val = [val]
-            if len(val) == 0:
-                val = [args[0]]
-                args = args[1:]
-
-            for each in val:
-                if not isinstance(each, Variable):
-                    raise ValueError("input of {0} must be variable".format(
-                        op_type))
-
-                if dtype is None:
-                    dtype = each.dtype
-                elif dtype != each.dtype:
-                    raise ValueError(
-                        "operator {0} must input same dtype. {1} vs {2}".format(
-                            op_type, dtype, each.dtype))
-
-        if dtype is None:
-            arg_dtype = kwargs.get("dtype")
-            if arg_dtype:
-                if not isinstance(arg_dtype, core.VarDesc.VarType):
-                    dtype = convert_np_dtype_to_dtype_(arg_dtype)
-                else:
-                    dtype = arg_dtype
-            else:
-                dtype = core.VarDesc.VarType.FP32
-        return dtype
-
-    def func(*args, **kwargs):
-        helper = LayerHelper(op_type, **kwargs)
-
-        dtype = infer_and_check_dtype(op_proto, *args, **kwargs)
-
-        inputs = dict()
-        for ipt in op_proto.inputs:
-            name = _convert_(ipt.name)
-            val = kwargs.pop(name, [])
-            if not isinstance(val, list) and not isinstance(val, tuple):
-                val = [val]
-            if len(val) == 0 and len(args) != 0:
-                val = args[0]
-                args = args[1:]
-            inputs[ipt.name] = val
-
-        outputs = dict()
-        out = kwargs.pop(_convert_(o_name), [])
-        if out:
-            out_var = out[0] if (isinstance(out, list) or
-                                 isinstance(out, tuple)) else out
-        else:
-            out_var = helper.create_variable_for_type_inference(dtype=dtype)
-        outputs[o_name] = [out_var]
-        for name in intermediate_output_names:
-            outputs[name] = [
-                helper.create_variable_for_type_inference(dtype=dtype)
-            ]
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
-        return helper.append_activation(out_var)
-
-    func.__name__ = op_type
-    func.__doc__ = _generate_doc_string_(op_proto)
-    return func
-
-
-def generate_activation_fn(op_type):
-    """Register the Python layer for an Operator without Attribute.
-
-    Args:
-       op_type: The name of the operator to be created.
-
-    This function takes in the operator type (sigmoid, exp , tanh etc) and
-    creates the operator functionality.
-
-    """
-    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
-
-    def func(x, name=None):
-        helper = LayerHelper(op_type, **locals())
-        output = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output})
-        return output
-
-    func.__name__ = op_type
-    func.__doc__ = _generate_doc_string_(op_proto)
-    func.__doc__ = func.__doc__ + """
-Examples:
-    .. code-block:: python
-
-        import paddle.fluid as fluid
-        data = fluid.layers.data(name="input", shape=[32, 784])
-        result = fluid.layers.%s(data)
-""" % op_type
-    return func
-
-
-def deprecated(func_or_class):
-    """
-    Deprecated warning decorator. It will result a warning message.
-    Should be used before class or function, member function
-    """
-
-    @functools.wraps(func)
-    def func_wrapper(*args, **kwargs):
-        """
-        Wrap func with deprecated warning
-        """
-        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
-        warnings.warn(
-            "Call to deprecated function {}.".format(func.__name__),
-            category=DeprecationWarning,
-            stacklevel=2)
-        warnings.simplefilter('default', DeprecationWarning)  # reset filter
-        return func(*args, **kwargs)
-
-    return func_wrapper
-
-
-def autodoc(comment=""):
-    def __impl__(func):
-        func.__doc__ = _generate_doc_string_(OpProtoHolder.instance(
-        ).get_op_proto(func.__name__)) + comment
-        return func
-
-    return __impl__
-
-
-def templatedoc(op_type=None):
-    """
-    Decorator of layer function. It will use the docstring from the layer
-    function as the template. The template arguments are:
-
-    * ${comment}: The operator comment written in CPP.
-    * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput,
-        and AddInput. The ${name} is Python snake style. i.e., xxx_xxx.
-    * ${{name}_type}: The type of ${name}.
-
-    Returns:
-        Decorated function.
-    """
-
-    def trim_ending_dot(msg):
-        return msg.rstrip('.')
-
-    def __impl__(func):
-        if op_type is None:
-            op_type_name = func.__name__
-        else:
-            op_type_name = op_type
-        op_proto = OpProtoHolder.instance().get_op_proto(op_type_name)
-        tmpl = string.Template(func.__doc__)
-
-        comment_lines = op_proto.comment.split("\n")
-        comment = ""
-        for line in comment_lines:
-            line = line.strip()
-            if len(line) != 0:
-                comment += escape_math(line)
-                comment += " "
-            elif len(comment) != 0:
-                comment += "\n    \n    "
-
-        args = {"comment": trim_ending_dot(comment)}
-        for each_input in op_proto.inputs:
-            input_name = _convert_(each_input.name)
-            args["{0}_comment".format(input_name)] = trim_ending_dot(
-                each_input.comment)
-            args["{0}_type".format(input_name)] = "Variable"
-        for each_attr in op_proto.attrs:
-            input_name = _convert_(each_attr.name)
-            args["{0}_comment".format(input_name)] = trim_ending_dot(
-                each_attr.comment)
-            args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type)
-
-        for each_opt in op_proto.outputs:
-            output_name = _convert_(each_opt.name)
-            args["{0}_comment".format(output_name)] = trim_ending_dot(
-                each_opt.comment)
-            args["{0}_type".format(output_name)] = "Variable"
-        func.__doc__ = tmpl.substitute(args)
-        return func
-
-    return __impl__
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
deleted file mode 100644
index 5be4ea756062c0ab7ed263b3bcd32ebce649d236..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ /dev/null
@@ -1,500 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-When training a model, it's often useful to decay the
-learning rate during training process, this is called
-learning_rate_decay. There are many strategies to do
-this, this module will provide some classical method.
-User can also implement their own learning_rate_decay
-strategy according to this module.
-"""
-
-from __future__ import print_function
-
-import math
-import numbers
-
-from . import control_flow
-from . import nn
-from . import ops
-from . import tensor
-from ..initializer import init_on_cpu
-from ..framework import default_main_program, Parameter, unique_name, name_scope
-from ..framework import Variable
-from ..dygraph import base as imperative_base
-from ..dygraph import learning_rate_scheduler as imperate_lr
-
-__all__ = [
-    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'cosine_decay',
-    'linear_lr_warmup'
-]
-
-
-def _decay_step_counter(begin=0):
-    # the first global step is zero in learning rate decay
-    global_step = nn.autoincreased_step_counter(
-        counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
-    global_step = tensor.cast(global_step, 'float32')
-    return global_step
-
-
-def noam_decay(d_model, warmup_steps):
-    """
-    Noam decay method. The numpy implementation of noam decay as follows.
-
-    .. code-block:: python
-      
-      import padde.fluid as fluid
-      import numpy as np
-      # set hyper parameters
-      d_model = 2
-      current_steps = 20
-      warmup_steps = 200
-      # compute
-      lr_value = np.power(d_model, -0.5) * np.min([
-                              np.power(current_steps, -0.5),
-                              np.power(warmup_steps, -1.5) * current_steps])
-
-    Please reference `attention is all you need
-    <https://arxiv.org/pdf/1706.03762.pdf>`_.
-
-    Args:
-        d_model(Variable): The dimensionality of input and output of model.
-
-        warmup_steps(Variable): A super parameter.
-
-    Returns:
-        The decayed learning rate.
-    Examples:
-        .. code-block:: python
-
-          import padde.fluid as fluid
-          warmup_steps = 100
-          learning_rate = 0.01
-          lr = fluid.layers.learning_rate_scheduler.noam_decay(
-                         1/(warmup_steps *(learning_rate ** 2)),
-                         warmup_steps)
-    """
-    with default_main_program()._lr_schedule_guard():
-        if imperative_base.enabled():
-            decay = imperate_lr.NoamDecay(d_model, warmup_steps)
-            return decay
-        else:
-            global_step = _decay_step_counter(1)
-
-            a = global_step**-0.5
-            b = (warmup_steps**-1.5) * global_step
-            lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
-
-            return lr_value
-
-
-def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """
-    Applies exponential decay to the learning rate.
-
-    When training a model, it is often recommended to lower the learning rate as the
-    training progresses. By using this function, the learning rate will be decayed by
-    'decay_rate' every 'decay_steps' steps.
-
-    >>> if staircase == True:
-    >>>     decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
-    >>> else:
-    >>>     decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
-
-    Args:
-        learning_rate(Variable|float): The initial learning rate.
-        decay_steps(int): See the decay computation above.
-        decay_rate(float): The decay rate. See the decay computation above.
-        staircase(Boolean): If True, decay the learning rate at discrete intervals.
-                            Default: False
-
-    Returns:
-        Variable: The decayed learning rate
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          base_lr = 0.1
-          sgd_optimizer = fluid.optimizer.SGD(
-	      learning_rate=fluid.layers.exponential_decay(
-		    learning_rate=base_lr,
-		    decay_steps=10000,
-		    decay_rate=0.5,
-		    staircase=True))
-
-    """
-    with default_main_program()._lr_schedule_guard():
-        if imperative_base.enabled():
-            decay = imperate_lr.ExponentialDecay(learning_rate, decay_steps,
-                                                 decay_rate, staircase)
-            return decay
-        else:
-            global_step = _decay_step_counter()
-
-            div_res = global_step / decay_steps
-            if staircase:
-                div_res = ops.floor(div_res)
-            decayed_lr = learning_rate * (decay_rate**div_res)
-
-            return decayed_lr
-
-
-def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """Applies natural exponential decay to the initial learning rate.
-
-    >>> if not staircase:
-    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
-    >>> else:
-    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))
-
-    Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        decay_steps: A Python `int32` number.
-        decay_rate: A Python `float` number.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
-
-    Returns:
-        The decayed learning rate
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          base_lr = 0.1
-          sgd_optimizer = fluid.optimizer.SGD(
-	      learning_rate=fluid.layers.natural_exp_decay(
-		    learning_rate=base_lr,
-		    decay_steps=10000,
-		    decay_rate=0.5,
-		    staircase=True))
-
-    """
-    with default_main_program()._lr_schedule_guard():
-        if imperative_base.enabled():
-            decay = imperate_lr.NaturalExpDecay(learning_rate, decay_steps,
-                                                decay_rate, staircase)
-            return decay
-        else:
-            global_step = _decay_step_counter()
-
-            div_res = global_step / decay_steps
-            if staircase:
-                div_res = ops.floor(div_res)
-            decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
-
-            return decayed_lr
-
-
-def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """
-    Applies inverse time decay to the initial learning rate.
-
-    When training a model, it is often recommended to lower the learning rate as the
-    training progresses. By using this function, an inverse decay function will be
-    applied to the initial learning rate.
-
-    >>> if staircase == True:
-    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
-    >>> else:
-    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
-
-    Args:
-        learning_rate(Variable|float): The initial learning rate.
-        decay_steps(int): See the decay computation above.
-        decay_rate(float): The decay rate. See the decay computation above.
-        staircase(Boolean): If True, decay the learning rate at discrete intervals.
-                            Default: False
-
-    Returns:
-        Variable: The decayed learning rate
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          base_lr = 0.1
-          sgd_optimizer = fluid.optimizer.SGD(
-	      learning_rate=fluid.layers.natural_exp_decay(
-		    learning_rate=base_lr,
-		    decay_steps=10000,
-		    decay_rate=0.5,
-		    staircase=True))
-    """
-    with default_main_program()._lr_schedule_guard():
-        if imperative_base.enabled():
-            decay = imperate_lr.InverseTimeDecay(learning_rate, decay_steps,
-                                                 decay_rate, staircase)
-            return decay
-        else:
-            global_step = _decay_step_counter()
-
-            div_res = global_step / decay_steps
-            if staircase:
-                div_res = ops.floor(div_res)
-
-            decayed_lr = learning_rate / (1 + decay_rate * div_res)
-
-            return decayed_lr
-
-
-def polynomial_decay(learning_rate,
-                     decay_steps,
-                     end_learning_rate=0.0001,
-                     power=1.0,
-                     cycle=False):
-    """
-    Applies polynomial decay to the initial learning rate.
-
-    .. code-block:: text
-
-     if cycle:
-       decay_steps = decay_steps * ceil(global_step / decay_steps)
-     else:
-       global_step = min(global_step, decay_steps)
-       decayed_learning_rate = (learning_rate - end_learning_rate) *
-            (1 - global_step / decay_steps) ^ power + end_learning_rate
-
-    Args:
-        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
-          will be the initial learning rate during training.
-        decay_steps(int32): A Python `int32` number.
-        end_learning_rate(float): A Python `float` number.
-        power(float): A Python `float` number.
-        cycle(bool): If set true, decay the learning rate every decay_steps.
-
-    Returns:
-        Variable: The decayed learning rate
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          start_lr = 0.01
-          total_step = 5000
-          end_lr = 0
-          lr = fluid.layers.polynomial_decay(
-              start_lr, total_step, end_lr, power=1)
-
-    """
-    with default_main_program()._lr_schedule_guard():
-        if imperative_base.enabled():
-            decay = imperate_lr.PolynomialDecay(learning_rate, decay_steps,
-                                                end_learning_rate, power, cycle)
-            return decay
-        else:
-            global_step = _decay_step_counter()
-
-            if cycle:
-                div_res = ops.ceil(global_step / decay_steps)
-                zero_var = tensor.fill_constant(
-                    shape=[1], dtype='float32', value=0.0)
-                one_var = tensor.fill_constant(
-                    shape=[1], dtype='float32', value=1.0)
-
-                with control_flow.Switch() as switch:
-                    with switch.case(global_step == zero_var):
-                        tensor.assign(input=one_var, output=div_res)
-                decay_steps = decay_steps * div_res
-            else:
-                decay_steps_var = tensor.fill_constant(
-                    shape=[1], dtype='float32', value=float(decay_steps))
-                global_step = nn.elementwise_min(
-                    x=global_step, y=decay_steps_var)
-
-            decayed_lr = (learning_rate - end_learning_rate) * \
-                ((1 - global_step / decay_steps) ** power) + end_learning_rate
-            return decayed_lr
-
-
-def piecewise_decay(boundaries, values):
-    """Applies piecewise decay to the initial learning rate.
-
-    The algorithm can be described as the code below.
-
-    .. code-block:: text
-
-      boundaries = [10000, 20000]
-      values = [1.0, 0.5, 0.1]
-      if step < 10000:
-          learning_rate = 1.0
-      elif 10000 <= step < 20000:
-          learning_rate = 0.5
-      else:
-          learning_rate = 0.1
-    Args:
-        boundaries: A list of steps numbers.
-        values: A list of learning rate values that will be picked during
-            different step boundaries.
-
-    Returns:
-        The decayed learning rate.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          boundaries = [10000, 20000]
-          values = [1.0, 0.5, 0.1]
-          optimizer = fluid.optimizer.Momentum(
-              momentum=0.9,
-              learning_rate=fluid.layers.piecewise_decay(boundaries=boundaries, values=values),
-              regularization=fluid.regularizer.L2Decay(1e-4))
-
-
-    """
-    with default_main_program()._lr_schedule_guard():
-        if len(values) - len(boundaries) != 1:
-            raise ValueError("len(values) - len(boundaries) should be 1")
-
-        if imperative_base.enabled():
-            decay = imperate_lr.PiecewiseDecay(boundaries, values, 0)
-            return decay
-        else:
-            global_step = _decay_step_counter()
-
-            lr = tensor.create_global_var(
-                shape=[1],
-                value=0.0,
-                dtype='float32',
-                persistable=True,
-                name="learning_rate")
-
-            with control_flow.Switch() as switch:
-                for i in range(len(boundaries)):
-                    boundary_val = tensor.fill_constant(
-                        shape=[1],
-                        dtype='float32',
-                        value=float(boundaries[i]),
-                        force_cpu=True)
-                    value_var = tensor.fill_constant(
-                        shape=[1], dtype='float32', value=float(values[i]))
-                    with switch.case(global_step < boundary_val):
-                        tensor.assign(value_var, lr)
-                last_value_var = tensor.fill_constant(
-                    shape=[1],
-                    dtype='float32',
-                    value=float(values[len(values) - 1]))
-                with switch.default():
-                    tensor.assign(last_value_var, lr)
-
-            return lr
-
-
-def cosine_decay(learning_rate, step_each_epoch, epochs):
-    """
-    Applies cosine decay to the learning rate.
-
-    when training a model, it is often recommended to lower the learning rate as the
-    training progresses. By using this function, the learning rate will be decayed by
-    following cosine decay strategy.
-
-    .. math::
-
-        decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1)
-
-    Args:
-        learning_rate(Variable|float): The initial learning rate.
-        step_each_epoch(int): the number of steps in an epoch.
-        epochs(int): the number of epochs.
-
-    Returns:
-        Variable: The decayed learning rate.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            base_lr = 0.1
-            lr = fluid.layers.cosine_decay(
-            learning_rate = base_lr, step_each_epoch=10000, epochs=120)
-    """
-
-    with default_main_program()._lr_schedule_guard():
-        if imperative_base.enabled():
-            decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch,
-                                            epochs)
-            return decay
-        else:
-            global_step = _decay_step_counter()
-
-            cur_epoch = ops.floor(global_step / step_each_epoch)
-            decayed_lr = learning_rate * 0.5 * (
-                ops.cos(cur_epoch * math.pi / epochs) + 1)
-            return decayed_lr
-
-
-def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
-    """
-    Applies linear learning rate warmup before the normal learning rate
-    scheduling.
-
-    .. code-block:: python
-
-     if global_step < warmup_steps:
-         linear_step = end_lr - start_lr
-         lr = start_lr + linear_step * (global_step / warmup_steps)
-
-    Args:
-        learning_rate (float | Variable): A float value or Variable.
-        warmup_steps (int): The warmup steps.
-        start_lr (float): The start learning rate of warmup.
-        end_lr (float): The end learning rate of warmup.
-
-    Returns:
-        The decayed learning rate in warmup period.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            boundaries = [100, 200]
-            lr_steps = [0.1, 0.01, 0.001]
-            warmup_steps = 50 
-            start_lr = 1. / 3. 
-            end_lr = 0.1
-            decayed_lr = fluid.layers.linear_lr_warmup(
-                fluid.layers.piecewise_decay(boundaries, lr_steps),
-                warmup_steps, start_lr, end_lr)
-
-    """
-    dtype = 'float32'
-    if isinstance(learning_rate, Variable):
-        dtype = learning_rate.dtype
-
-    linear_step = float(end_lr) - float(start_lr)
-    with default_main_program()._lr_schedule_guard():
-        lr = tensor.create_global_var(
-            shape=[1],
-            value=0.0,
-            dtype=dtype,
-            persistable=True,
-            name="learning_rate_warmup")
-
-        global_step = _decay_step_counter()
-
-        with control_flow.Switch() as switch:
-            with switch.case(global_step < warmup_steps):
-                decayed_lr = start_lr + linear_step * (global_step /
-                                                       float(warmup_steps))
-                tensor.assign(decayed_lr, lr)
-            with switch.default():
-                if not isinstance(learning_rate, Variable):
-                    learning_rate = tensor.fill_constant(
-                        shape=[1], dtype=dtype, value=float(learning_rate))
-                tensor.assign(learning_rate, lr)
-    return lr
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
deleted file mode 100644
index 9564eff73f2af24bbb2d6492f73eaa3fb2102356..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ /dev/null
@@ -1,247 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from .. import core
-from ..framework import Variable, unique_name
-from .layer_function_generator import OpProtoHolder
-from ..initializer import force_init_on_cpu
-
-_supported_int_dtype_ = [
-    core.VarDesc.VarType.UINT8,
-    core.VarDesc.VarType.INT8,
-    core.VarDesc.VarType.INT16,
-    core.VarDesc.VarType.INT32,
-    core.VarDesc.VarType.INT64,
-]
-
-
-def monkey_patch_variable():
-    def unique_tmp_name():
-        return unique_name.generate("tmp")
-
-    def safe_get_dtype(var):
-        try:
-            dtype = var.dtype
-        except:
-            raise ValueError("Cannot get data type from %s", var.name)
-        return dtype
-
-    def current_block(var):
-        return var.block
-
-    def create_new_tmp_var(block, dtype):
-        tmp_name = unique_tmp_name()
-        return block.create_var(name=tmp_name, dtype=dtype)
-
-    def create_tensor(block, value, dtype, shape):
-        value = float(value)
-        var = create_new_tmp_var(block, dtype)
-        block.append_op(
-            type="fill_constant",
-            outputs={'Out': [var]},
-            attrs={
-                'dtype': var.dtype,
-                'shape': shape,
-                'value': value,
-                'force_cpu': force_init_on_cpu()
-            },
-            stop_gradient=True)
-        var.stop_gradient = True
-        return var
-
-    def create_scalar(block, value, dtype):
-        return create_tensor(block, value, dtype, shape=[1])
-
-    def create_tensor_with_batchsize(ref_var, value, dtype):
-        assert isinstance(ref_var, Variable)
-        value = float(value)
-        block = current_block(ref_var)
-        var = create_new_tmp_var(block, dtype)
-        batch_dim = -1
-        for i, d in enumerate(ref_var.shape):
-            if d < 0:
-                batch_dim = i
-                break
-        assert batch_dim != -1
-        block.append_op(
-            type='fill_constant_batch_size_like',
-            outputs={'Out': [var]},
-            inputs={'Input': [ref_var]},
-            attrs={
-                'shape': ref_var.shape,
-                'value': value,
-                'input_dim_idx': batch_dim,
-                'output_dim_idx': batch_dim
-            },
-            stop_gradient=True)
-
-        var.stop_gradient = True
-        return var
-
-    def astype(self, dtype):
-        """
-        Cast a variable to a specified data type.
-        NOTE: The variable must be a Tensor
-        Args:
-            self(Variable): The source variable
-            dtype: The target dtype
-
-        Returns:
-            Variable with new dtype
-        """
-        block = current_block(self)
-        out = create_new_tmp_var(block, dtype)
-        block.append_op(
-            type="cast",
-            inputs={"X": [self]},
-            outputs={"Out": [out]},
-            attrs={"in_dtype": self.dtype,
-                   "out_dtype": out.dtype})
-        return out
-
-    def _scalar_elementwise_op_(var, scale, bias):
-        block = current_block(var)
-        out = create_new_tmp_var(block, var.dtype)
-        block.append_op(
-            type="scale",
-            inputs={"X": [var]},
-            outputs={"Out": [out]},
-            attrs={"scale": scale,
-                   "bias": bias})
-        return out
-
-    def _scalar_elementwise_add_(var, value):
-        return _scalar_elementwise_op_(var, 1.0, value)
-
-    def _scalar_elementwise_sub_(var, value):
-        return _scalar_elementwise_op_(var, 1.0, -value)
-
-    def _scalar_elementwise_rsub_(var, value):
-        return _scalar_elementwise_op_(var, -1.0, value)
-
-    def _scalar_elementwise_mul_(var, value):
-        return _scalar_elementwise_op_(var, value, 0.0)
-
-    def _scalar_elementwise_div_(var, value):
-        return _scalar_elementwise_op_(var, 1.0 / value, 0.0)
-
-    def _elemwise_method_creator_(method_name,
-                                  op_type,
-                                  reverse=False,
-                                  scalar_method=None):
-        def __impl__(self, other_var):
-            if scalar_method is not None:
-                if isinstance(other_var, float):
-                    if self.dtype in _supported_int_dtype_:
-                        assert other_var == int(other_var), \
-                            "float value {} cannot convert to integer".format(other_var)
-                    return scalar_method(self, other_var)
-                elif isinstance(other_var, int):
-                    return scalar_method(self, float(other_var))
-
-            lhs_dtype = safe_get_dtype(self)
-
-            if not isinstance(other_var, Variable):
-                if reverse:
-                    has_batch_size = False
-                    for elem in self.shape:
-                        if elem < 0:
-                            has_batch_size = True
-                            break
-                    if not has_batch_size:
-                        other_var = create_tensor(
-                            current_block(self),
-                            other_var,
-                            dtype=lhs_dtype,
-                            shape=self.shape)
-                    else:
-                        other_var = create_tensor_with_batchsize(
-                            self, other_var, lhs_dtype)
-                else:
-                    # add fill_op to current_block
-                    other_var = create_scalar(
-                        current_block(self), value=other_var, dtype=lhs_dtype)
-
-            rhs_dtype = safe_get_dtype(other_var)
-            if lhs_dtype != rhs_dtype:
-                other_var = astype(other_var, lhs_dtype)
-            if reverse:
-                tmp = self
-                self = other_var
-                other_var = tmp
-
-            out = create_new_tmp_var(current_block(self), dtype=lhs_dtype)
-
-            axis = -1
-            if other_var.shape[0] == -1:
-                axis = 0
-            assert len(self.shape) >= len(other_var.shape), (
-                "The rank of the first argument of an binary operator cannot "
-                "be smaller than the rank of its second argument: %s vs %s" %
-                (len(self.shape), len(other_var.shape)))
-
-            current_block(self).append_op(
-                type=op_type,
-                inputs={'X': [self],
-                        'Y': [other_var]},
-                outputs={'Out': out},
-                attrs={'axis': axis})
-            return out
-
-        comment = OpProtoHolder.instance().get_op_proto(op_type).comment
-
-        __impl__.__doc__ = """
-        {0}
-        Args:
-            self(Variable): left hand variable
-            other_var(Variable|float|int): right hand variable
-
-        Returns:
-            Variable
-        """.format(comment)
-        __impl__.__name__ = method_name
-        return __impl__
-
-    # inject methods
-    for method_name, op_type, reverse, scalar_method in (
-        ("__add__", "elementwise_add", False, _scalar_elementwise_add_),
-            # a+b == b+a. Do not need to reverse explicitly
-        ("__radd__", "elementwise_add", False, _scalar_elementwise_add_),
-        ("__sub__", "elementwise_sub", False, _scalar_elementwise_sub_),
-        ("__rsub__", "elementwise_sub", True, _scalar_elementwise_rsub_),
-        ("__mul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-            # a*b == b*a. Do not need to reverse explicitly
-        ("__rmul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-        ("__div__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__truediv__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__rdiv__", "elementwise_div", True, None),
-        ("__rtruediv__", "elementwise_div", True, None),
-        ("__pow__", "elementwise_pow", False, None),
-        ("__rpow__", "elementwise_pow", True, None),
-        ("__floordiv__", "elementwise_floordiv", False, None),
-        ("__mod__", "elementwise_mod", False, None),
-            # for logical compare
-        ("__eq__", "equal", False, None),
-        ("__ne__", "not_equal", False, None),
-        ("__lt__", "less_than", False, None),
-        ("__le__", "less_equal", False, None),
-        ("__gt__", "greater_than", False, None),
-        ("__ge__", "greater_equal", False, None)):
-        setattr(Variable, method_name,
-                _elemwise_method_creator_(method_name, op_type, reverse,
-                                          scalar_method))
-
-    Variable.astype = astype
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
deleted file mode 100755
index e85d5ae4c7e386cf5bad243cfd1277da9fb292e1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/metric_op.py
+++ /dev/null
@@ -1,197 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-All layers just related to metric.
-"""
-
-from __future__ import print_function
-
-import warnings
-from ..layer_helper import LayerHelper
-from ..initializer import Normal, Constant
-from ..framework import Variable
-from ..param_attr import ParamAttr
-from . import nn
-
-__all__ = ['accuracy', 'auc']
-
-
-def accuracy(input, label, k=1, correct=None, total=None):
-    """
-    accuracy layer.
-    Refer to the https://en.wikipedia.org/wiki/Precision_and_recall
-
-    This function computes the accuracy using the input and label.
-    If the correct label occurs in top k predictions, then correct will increment by one.
-    Note: the dtype of accuracy is determined by input. the input and label dtype can be different.
-
-    Args:
-        input(Variable): The input of accuracy layer, which is the predictions of network.
-          Carry LoD information is supported.
-        label(Variable): The label of dataset.
-        k(int): The top k predictions for each class will be checked.
-        correct(Variable): The correct predictions count.
-        total(Variable): The total entries count.
-
-    Returns:
-        Variable: The correct rate.
-
-    Examples:
-        .. code-block:: python
-
-           import paddle.fluid as fluid
-           data = fluid.layers.data(name="data", shape=[-1, 32, 32], dtype="float32")
-           label = fluid.layers.data(name="label", shape=[-1,1], dtype="int32")
-           predict = fluid.layers.fc(input=data, size=10)
-           accuracy_out = fluid.layers.accuracy(input=predict, label=label, k=5)
-
-    """
-    helper = LayerHelper("accuracy", **locals())
-    topk_out, topk_indices = nn.topk(input, k=k)
-    acc_out = helper.create_variable_for_type_inference(dtype="float32")
-    if correct is None:
-        correct = helper.create_variable_for_type_inference(dtype="int64")
-    if total is None:
-        total = helper.create_variable_for_type_inference(dtype="int64")
-    helper.append_op(
-        type="accuracy",
-        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
-        },
-        outputs={
-            "Accuracy": [acc_out],
-            "Correct": [correct],
-            "Total": [total],
-        })
-    return acc_out
-
-
-def auc(input,
-        label,
-        curve='ROC',
-        num_thresholds=2**12 - 1,
-        topk=1,
-        slide_steps=1):
-    """
-    **Area Under the Curve (AUC) Layer**
-
-    This implementation computes the AUC according to forward output and label.
-    It is used very widely in binary classification evaluation.
-
-    Note: If input label contains values other than 0 and 1, it will be cast
-    to `bool`. Find the relevant definitions `here <https://en.wikipedia.org\
-    /wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.
-
-    There are two types of possible curves:
-
-        1. ROC: Receiver operating characteristic;
-        2. PR: Precision Recall
-
-    Args:
-        input(Variable): A floating-point 2D Variable, values are in the range
-                         [0, 1]. Each row is sorted in descending order. This
-                         input should be the output of topk. Typically, this
-                         Variable indicates the probability of each label.
-        label(Variable): A 2D int Variable indicating the label of the training
-                         data. The height is batch size and width is always 1.
-        curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'.
-        num_thresholds(int): The number of thresholds to use when discretizing
-                             the roc curve. Default 200.
-        topk(int): only topk number of prediction output will be used for auc.
-        slide_steps: when calc batch auc, we can not only use step currently but the previous steps can be used. slide_steps=1 means use the current step, slide_steps=3 means use current step and the previous second steps, slide_steps=0 use all of the steps.
-
-
-    Returns:
-        Variable: A tuple representing the current AUC.
-        The return tuple is auc_out, batch_auc_out, [
-        batch_stat_pos, batch_stat_neg, stat_pos, stat_neg ]
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            label = fluid.layers.data(name="label", shape=[1], dtype="int32")
-            predict = fluid.layers.fc(input=data, size=2)
-            auc_out = fluid.layers.auc(input=predict, label=label)
-    """
-    helper = LayerHelper("auc", **locals())
-    auc_out = helper.create_variable_for_type_inference(dtype="float64")
-    batch_auc_out = helper.create_variable_for_type_inference(dtype="float64")
-    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
-
-    # for batch auc
-    batch_stat_pos = helper.create_global_variable(
-        persistable=True,
-        dtype='int64',
-        shape=[slide_steps, num_thresholds + 1])
-    batch_stat_neg = helper.create_global_variable(
-        persistable=True,
-        dtype='int64',
-        shape=[slide_steps, num_thresholds + 1])
-
-    # for global auc
-    stat_pos = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
-    stat_neg = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
-
-    for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
-        helper.set_variable_initializer(
-            var, Constant(
-                value=0.0, force_cpu=True))
-
-    # Batch AUC
-    helper.append_op(
-        type="auc",
-        inputs={
-            "Predict": [input],
-            "Label": [label],
-            "StatPos": [batch_stat_pos],
-            "StatNeg": [batch_stat_neg]
-        },
-        attrs={
-            "curve": curve,
-            "num_thresholds": num_thresholds,
-            "slide_steps": slide_steps
-        },
-        outputs={
-            "AUC": [batch_auc_out],
-            "StatPosOut": [batch_stat_pos],
-            "StatNegOut": [batch_stat_neg]
-        })
-    # Global AUC
-    helper.append_op(
-        type="auc",
-        inputs={
-            "Predict": [input],
-            "Label": [label],
-            "StatPos": [stat_pos],
-            "StatNeg": [stat_neg]
-        },
-        attrs={
-            "curve": curve,
-            "num_thresholds": num_thresholds,
-            "slide_steps": 0
-        },
-        outputs={
-            "AUC": [auc_out],
-            "StatPosOut": [stat_pos],
-            "StatNegOut": [stat_neg]
-        })
-    return auc_out, batch_auc_out, [
-        batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
-    ]
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
deleted file mode 100755
index f4f40cf4a82a4b36d41740d6dfbd0a263e941a0d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/nn.py
+++ /dev/null
@@ -1,14490 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-All layers just related to the neural network.
-"""
-
-from __future__ import print_function
-
-import numpy as np
-import warnings
-import six
-import os
-import inspect
-from ..layer_helper import LayerHelper
-from ..initializer import Normal, Constant, NumpyArrayInitializer
-from ..framework import Variable, OpProtoHolder, in_dygraph_mode
-from ..dygraph import base
-from ..param_attr import ParamAttr
-from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
-from .tensor import concat, assign, fill_constant, zeros
-from . import utils
-from .. import unique_name
-from functools import reduce
-from .. import core
-from ..dygraph import layers
-from ..data_feeder import convert_dtype
-
-__all__ = [
-    'fc',
-    'center_loss',
-    'embedding',
-    'dynamic_lstm',
-    'dynamic_lstmp',
-    'dynamic_gru',
-    'gru_unit',
-    'linear_chain_crf',
-    'crf_decoding',
-    'cos_sim',
-    'cross_entropy',
-    'bpr_loss',
-    'square_error_cost',
-    'chunk_eval',
-    'sequence_conv',
-    'conv2d',
-    'conv3d',
-    'sequence_pool',
-    'sequence_softmax',
-    'softmax',
-    'pool2d',
-    'pool3d',
-    'adaptive_pool2d',
-    'adaptive_pool3d',
-    'batch_norm',
-    'instance_norm',
-    'data_norm',
-    'beam_search_decode',
-    'conv2d_transpose',
-    'conv3d_transpose',
-    'sequence_expand',
-    'sequence_expand_as',
-    'sequence_pad',
-    'sequence_unpad',
-    'lstm_unit',
-    'reduce_sum',
-    'reduce_mean',
-    'reduce_max',
-    'reduce_min',
-    'reduce_prod',
-    'reduce_all',
-    'reduce_any',
-    'sequence_first_step',
-    'sequence_last_step',
-    'sequence_slice',
-    'dropout',
-    'split',
-    'ctc_greedy_decoder',
-    'edit_distance',
-    'l2_normalize',
-    'matmul',
-    'topk',
-    'warpctc',
-    'sequence_reshape',
-    'transpose',
-    'im2sequence',
-    'nce',
-    'sampled_softmax_with_cross_entropy',
-    'hsigmoid',
-    'beam_search',
-    'row_conv',
-    'multiplex',
-    'layer_norm',
-    'group_norm',
-    'spectral_norm',
-    'softmax_with_cross_entropy',
-    'smooth_l1',
-    'one_hot',
-    'autoincreased_step_counter',
-    'reshape',
-    'squeeze',
-    'unsqueeze',
-    'lod_reset',
-    'lod_append',
-    'lrn',
-    'pad',
-    'pad_constant_like',
-    'label_smooth',
-    'roi_pool',
-    'roi_align',
-    'dice_loss',
-    'image_resize',
-    'image_resize_short',
-    'resize_bilinear',
-    'resize_trilinear',
-    'resize_nearest',
-    'gather',
-    'gather_nd',
-    'scatter',
-    'scatter_nd_add',
-    'scatter_nd',
-    'sequence_scatter',
-    'random_crop',
-    'mean_iou',
-    'relu',
-    'selu',
-    'log',
-    'crop',
-    'crop_tensor',
-    'rank_loss',
-    'margin_rank_loss',
-    'elu',
-    'relu6',
-    'pow',
-    'stanh',
-    'hard_sigmoid',
-    'swish',
-    'prelu',
-    'brelu',
-    'leaky_relu',
-    'soft_relu',
-    'flatten',
-    'sequence_mask',
-    'stack',
-    'pad2d',
-    'unstack',
-    'sequence_enumerate',
-    'unique',
-    'unique_with_counts',
-    'expand',
-    'sequence_concat',
-    'scale',
-    'elementwise_add',
-    'elementwise_div',
-    'elementwise_sub',
-    'elementwise_mul',
-    'elementwise_max',
-    'elementwise_min',
-    'elementwise_pow',
-    'elementwise_mod',
-    'elementwise_floordiv',
-    'uniform_random_batch_size_like',
-    'gaussian_random',
-    'sampling_id',
-    'gaussian_random_batch_size_like',
-    'sum',
-    'slice',
-    'strided_slice',
-    'shape',
-    'rank',
-    'size',
-    'logical_and',
-    'logical_or',
-    'logical_xor',
-    'logical_not',
-    'clip',
-    'clip_by_norm',
-    'mean',
-    'mul',
-    'sigmoid_cross_entropy_with_logits',
-    'maxout',
-    'space_to_depth',
-    'affine_grid',
-    'sequence_reverse',
-    'affine_channel',
-    'similarity_focus',
-    'hash',
-    'grid_sampler',
-    'log_loss',
-    'add_position_encoding',
-    'bilinear_tensor_product',
-    'merge_selected_rows',
-    'get_tensor_from_selected_rows',
-    'lstm',
-    'shuffle_channel',
-    'temporal_shift',
-    'py_func',
-    'psroi_pool',
-    'prroi_pool',
-    'teacher_student_sigmoid_loss',
-    'huber_loss',
-    'kldiv_loss',
-    'npair_loss',
-    'pixel_shuffle',
-    'fsp_matrix',
-    'continuous_value_model',
-    'where',
-    'sign',
-    'deformable_conv',
-    'unfold',
-    'deformable_roi_pooling',
-    'filter_by_instag',
-    'shard_index',
-    'hard_swish',
-    'mse_loss',
-]
-
-kIgnoreIndex = -100
-
-
-def fc(input,
-       size,
-       num_flatten_dims=1,
-       param_attr=None,
-       bias_attr=None,
-       act=None,
-       name=None):
-    """
-    **Fully Connected Layer**
-
-    This function creates a fully connected layer in the network. It can take
-    one or multiple tensors as its inputs(input can be a list of Variable, see
-    Args in detail). It creates a variable called weights for each input tensor,
-    which represents a fully connected weight matrix from each input unit to
-    each output unit. The fully connected layer multiplies each input tensor
-    with its corresponding weight to produce an output Tensor with shape [M, `size`],
-    where M is batch size. If multiple input tensors are given, the results of
-    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
-    is not None, a bias variable will be created and added to the output.
-    Finally, if activation is not None, it will be applied to the output as well.
-
-    When the input is single tensor:
-
-    .. math::
-
-        Out = Act({XW + b})
-
-    When the input are multiple tensors:
-
-    .. math::
-
-        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
-
-    In the above equation:
-
-    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
-    * :math:`X_i`: The i-th input tensor.
-    * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor.
-    * :math:`b`: The bias parameter created by this layer (if needed).
-    * :math:`Act`: The activation function.
-    * :math:`Out`: The output tensor.
-
-    See below for an example.
-
-    .. code-block:: text
-
-        Given:
-            data_1.data = [[[0.1, 0.2],
-                           [0.3, 0.4]]]
-            data_1.shape = (1, 2, 2) # 1 is batch_size
-
-            data_2 = [[[0.1, 0.2, 0.3]]]
-            data_2.shape = (1, 1, 3)
-
-            out = fluid.layers.fc(input=[data_1, data_2], size=2)
-
-        Then:
-            out.data = [[0.18669507, 0.1893476]]
-            out.shape = (1, 2)
-
-    Args:
-        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
-            the input tensor(s) is at least 2.
-        size(int): The number of output units in this layer.
-        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
-            two dimensions. If this happens, the multidimensional tensor will first be flattened
-            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
-            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
-            dimensions will be flatten to form the first dimension of the final matrix (height of
-            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
-            form the second dimension of the final matrix (width of the matrix). For example, suppose
-            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
-            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
-        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
-            parameters/weights of this layer.
-        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-        act (str, default None): Activation to be applied to the output of this layer.
-        name (str, default None): The name of this layer.
-
-    Returns:
-        Variable: The transformation result.
-
-    Raises:
-        ValueError: If rank of the input tensor is less than 2.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          # when input is single tensor
-          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-          fc = fluid.layers.fc(input=data, size=1000, act="tanh")
-
-          # when input are multiple tensors
-          data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32")
-          data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
-          fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh")
-    """
-    helper = LayerHelper("fc", **locals())
-
-    dtype = helper.input_dtype()
-
-    mul_results = []
-    for input_var, param_attr in helper.iter_inputs_and_params():
-        input_shape = input_var.shape
-        param_shape = [
-            reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
-        ] + [size]
-
-        w = helper.create_parameter(
-            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
-        tmp = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(
-            type="mul",
-            inputs={"X": input_var,
-                    "Y": w},
-            outputs={"Out": tmp},
-            attrs={"x_num_col_dims": num_flatten_dims,
-                   "y_num_col_dims": 1})
-        mul_results.append(tmp)
-
-    if len(mul_results) == 1:
-        pre_bias = mul_results[0]
-    else:
-        pre_bias = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(
-            type="sum",
-            inputs={"X": mul_results},
-            outputs={"Out": pre_bias},
-            attrs={"use_mkldnn": False})
-    # add bias
-    pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
-    # add activation
-    return helper.append_activation(pre_activation)
-
-
-def center_loss(input,
-                label,
-                num_classes,
-                alpha,
-                param_attr,
-                update_center=True):
-    """
-    **Center loss Cost layer**
-    
-    This layer accepts input (deep features,the output of the last hidden layer)
-    and target label and return the center loss cost
-    
-    For deep features, :math:`X`, and target labels, :math:`Y`, the equation is:
-    
-    .. math::
-
-        Out = \\frac{1}{2}(X - Y)^2
-
-    Args:
-        input (Variable): a 2-D tensor with shape[N x M].
-        label (Variable): the groud truth which is a 2-D tensor
-                         with shape[N x 1],where N is the batch size.
-        num_classes (int): the number of classification categories.
-        alpha (float|Variable): learning rate of centers.
-        param_attr (ParamAttr): Attribute initializer of centers. 
-        update_center (bool): whether to update value of center.
-
-    Returns:
-        Variable: 2-D tensor with shape [N * 1] 
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid 
-
-          input = fluid.layers.data(name='x',shape=[20,30],dtype='float32')
-          label = fluid.layers.data(name='y',shape=[20,1],dtype='int64')
-          num_classes = 1000
-          alpha = 0.01
-          param_attr = fluid.initializer.Xavier(uniform=False)
-          center_loss=fluid.layers.center_loss(input=input,
-                 label=label,
-                 num_classes=1000,
-                 alpha=alpha,
-                 param_attr=fluid.initializer.Xavier(uniform=False),
-                 update_center=True)
-    """
-    helper = LayerHelper('center_loss', **locals())
-    dtype = helper.input_dtype()
-    centers_shape = [num_classes, input.shape[1]]
-    centers_param = helper.create_parameter(
-        attr=param_attr, shape=centers_shape, dtype=dtype)
-    centers_param.stop_gradient = True
-    if isinstance(alpha, Variable):
-        alpha_param = alpha
-    else:
-        assert isinstance(alpha, float)
-        alpha_param = helper.create_variable(
-            name="centerloss_alpha",
-            shape=[1],
-            dtype="float32",
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            persistable=True,
-            stop_gradient=True,
-            initializer=Constant(alpha))
-
-    centersdiff = helper.create_variable_for_type_inference(dtype=input.dtype)
-    loss = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='center_loss',
-        inputs={
-            'X': [input],
-            'Label': [label],
-            'Centers': [centers_param],
-            'CenterUpdateRate': [alpha_param]
-        },
-        outputs={
-            'SampleCenterDiff': [centersdiff],
-            'Loss': [loss],
-            'CentersOut': [centers_param]
-        },
-        attrs={'cluster_num': num_classes,
-               'need_update': update_center})
-    return loss
-
-
-def embedding(input,
-              size,
-              is_sparse=False,
-              is_distributed=False,
-              padding_idx=None,
-              param_attr=None,
-              dtype='float32'):
-    """
-    **Embedding Layer**
-
-    This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
-    a lookup table. The result of this lookup is the embedding of each ID in the
-    :attr:`input`.
-
-    All the input variables are passed in as local variables to the LayerHelper
-    constructor.
-
-    Args:
-        input(Variable): Input is a Tensor<int64> Variable, which contains the IDs information.
-            The value of the input IDs should satisfy :math:`0<= id < size[0]`.
-        size(tuple|list): The shape of the look up table parameter. It should
-            have two elements which indicate the size of the dictionary of
-            embeddings and the size of each embedding vector respectively.
-        is_sparse(bool): The flag indicating whether to use sparse update.
-        is_distributed(bool): Whether to run lookup table from remote parameter server.
-        padding_idx(int|long|None): It will output all-zero padding data whenever
-            lookup encounters :math:`padding\_idx` in Ids. If set :attr:`None`, it makes
-            no effect to output. If :math:`padding\_idx < 0`, the :math:`padding\_idx`
-            will automatically be converted to :math:`size[0] + padding\_idx` to use.
-            Default: None.
-        param_attr(ParamAttr): Parameters for this layer.
-        dtype(np.dtype|core.VarDesc.VarType|str): The dtype refers to the data type of output
-            tensor. It can be float32, float_16, int etc.
-
-    Returns:
-        Variable: The tensor variable storing the embeddings of the \
-                  supplied inputs.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1)
-          emb = fluid.layers.embedding(input=data, size=[128, 64])    
-    """
-
-    helper = LayerHelper('embedding', **locals())
-    remote_prefetch = is_sparse and (not is_distributed)
-    if remote_prefetch:
-        assert is_sparse is True and is_distributed is False
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
-    tmp = helper.create_variable_for_type_inference(dtype)
-    padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
-        size[0] + padding_idx)
-    helper.append_op(
-        type='lookup_table',
-        inputs={'Ids': input,
-                'W': w},
-        outputs={'Out': tmp},
-        attrs={
-            'is_sparse': is_sparse,
-            'is_distributed': is_distributed,
-            'remote_prefetch': remote_prefetch,
-            'padding_idx': padding_idx
-        })
-    return tmp
-
-
-def _pull_box_sparse(input, size, dtype='float32'):
-    """
-    **Pull Box Sparse Layer**
-
-    This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
-    BoxPS lookup table. The result of this lookup is the embedding of each ID in the
-    :attr:`input`.
-
-    Args:
-        input(Variable|list of Variable): Input is a Tensor<int64> Variable, which 
-            contains the IDs information.
-        size(int): The embedding size parameter, which indicates the size of 
-            each embedding vector respectively.
-        dtype(str): The dtype refers to the data type of output tensor. Only supports 
-	    float32 now.
-
-    Returns:
-        Variable|list of Variable: The tensor variable storing the embeddings of the \
-                  supplied inputs.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1)
-          emb = fluid.layers.pull_box_sparse(input=data, size=[11])    
-    """
-    helper = LayerHelper('pull_box_sparse', **locals())
-    if dtype != 'float32':
-        raise ValueError(
-            "BoxPS only support float type embedding now, and your type is: " +
-            dtype)
-    helper.input_dtype()
-    inputs = helper.multiple_input()
-    outs = [
-        helper.create_variable_for_type_inference(dtype)
-        for i in range(len(inputs))
-    ]
-    helper.append_op(
-        type='pull_box_sparse',
-        inputs={'Ids': inputs},
-        outputs={'Out': outs},
-        attrs={'size': size})
-    if len(outs) == 1:
-        return outs[0]
-    return outs
-
-
-@templatedoc(op_type="lstm")
-def dynamic_lstm(input,
-                 size,
-                 h_0=None,
-                 c_0=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_peepholes=True,
-                 is_reverse=False,
-                 gate_activation='sigmoid',
-                 cell_activation='tanh',
-                 candidate_activation='tanh',
-                 dtype='float32',
-                 name=None):
-    """
-    ${comment}
-
-    Args:
-        input (Variable): ${input_comment}
-        size (int): 4 * hidden size.
-        h_0(Variable): The initial hidden state is an optional input, default is zero.
-                       This is a tensor with shape (N x D), where N is the
-                       batch size and D is the hidden size.
-        c_0(Variable): The initial cell state is an optional input, default is zero.
-                       This is a tensor with shape (N x D), where N is the
-                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-                               hidden-hidden weights.
-
-                               - Weights = {:math:`W_{ch}, W_{ih}, \
-                                                W_{fh}, W_{oh}`}
-                               - The shape is (D x 4D), where D is the hidden
-                                 size.
-
-                               If it is set to None or one attribute of ParamAttr,
-                               dynamic_lstm will create ParamAttr as param_attr.
-                               If the Initializer of the param_attr is not set, the
-                               parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The bias attribute for the learnable bias
-                              weights, which contains two parts, input-hidden
-                              bias weights and peephole connections weights if
-                              setting `use_peepholes` to `True`.
-
-                              1. `use_peepholes = False`
-                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
-                                 - The shape is (1 x 4D).
-                              2. `use_peepholes = True`
-                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
-                                                 W_{fc}, W_{oc}`}.
-                                 - The shape is (1 x 7D).
-
-                              If it is set to None or one attribute of ParamAttr,
-                              dynamic_lstm will create ParamAttr as bias_attr.
-                              If the Initializer of the bias_attr is not set,
-                              the bias is initialized zero. Default: None.
-        use_peepholes (bool): ${use_peepholes_comment}
-        is_reverse (bool): ${is_reverse_comment}
-        gate_activation (str): ${gate_activation_comment}
-        cell_activation (str): ${cell_activation_comment}
-        candidate_activation (str): ${candidate_activation_comment}
-        dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
-        name (str|None): A name for this layer(optional). If set None, the layer
-                         will be named automatically.
-
-    Returns:
-        tuple: The hidden state, and cell state of LSTM. The shape of both \
-        is (T x D), and lod is the same with the `input`.
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-            emb_dim = 256
-            vocab_size = 10000
-            hidden_dim = 512
-            
-            data = fluid.layers.data(name='x', shape=[1],
-                         dtype='int32', lod_level=1)
-            emb = fluid.layers.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
-
-            forward_proj = fluid.layers.fc(input=emb, size=hidden_dim * 4,
-                                           bias_attr=False)
-
-            forward, _ = fluid.layers.dynamic_lstm(
-                input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
-    """
-    assert in_dygraph_mode(
-    ) is not True, "please use lstm instead of dynamic_lstm in dygraph mode!"
-    assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
-    helper = LayerHelper('lstm', **locals())
-    size = size // 4
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
-    bias_size = [1, 7 * size]
-    if not use_peepholes:
-        bias_size[1] = 4 * size
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-
-    hidden = helper.create_variable_for_type_inference(dtype)
-    cell = helper.create_variable_for_type_inference(dtype)
-    batch_gate = helper.create_variable_for_type_inference(dtype)
-    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
-    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
-    batch_size = input.shape[0]
-    if h_0:
-        assert h_0.shape == (batch_size, size), \
-            'The shape of h0 should be (batch_size, %d)' % size
-        inputs['H0'] = h_0
-    if c_0:
-        assert c_0.shape == (batch_size, size), \
-            'The shape of c0 should be (batch_size, %d)' % size
-        inputs['C0'] = c_0
-
-    helper.append_op(
-        type='lstm',
-        inputs=inputs,
-        outputs={
-            'Hidden': hidden,
-            'Cell': cell,
-            'BatchGate': batch_gate,
-            'BatchCellPreAct': batch_cell_pre_act
-        },
-        attrs={
-            'use_peepholes': use_peepholes,
-            'is_reverse': is_reverse,
-            'gate_activation': gate_activation,
-            'cell_activation': cell_activation,
-            'candidate_activation': candidate_activation
-        })
-    return hidden, cell
-
-
-def lstm(input,
-         init_h,
-         init_c,
-         max_len,
-         hidden_size,
-         num_layers,
-         dropout_prob=0.0,
-         is_bidirec=False,
-         is_test=False,
-         name=None,
-         default_initializer=None,
-         seed=-1):
-    """
-    If Device is GPU, This op will use cudnn LSTM implementation
-
-    A four-gate Long Short-Term Memory network with no peephole connections.
-    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
-    the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
-
-    .. math::
-
-       i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
-
-       f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
-
-       o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
-
-       \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
-
-       c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-
-       h_t &= o_t \odot tanh(c_t)
-
-    - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
-      of weights from the input gate to the input)
-    - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
-    - sigmoid is the logistic sigmoid function.
-    - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-      and cell activation vectors, respectively, all of which have the same size as
-      the cell output activation vector $h$.
-    - The :math:`\odot` is the element-wise product of the vectors.
-    - :math:`tanh` is the activation functions.
-    - :math:`\\tilde{c_t}` is also called candidate hidden state,
-      which is computed based on the current input and the previous hidden state.
-
-    Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication,
-    X represensts a matrix multiplication
-
-
-    Args:
-        input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
-        init_h(Variable): The initial hidden state of the LSTM
-                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
-                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        init_c(Variable): The initial cell state of the LSTM.
-                       This is a tensor with shape ( num_layers x batch_size x hidden_size )
-                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
-        hidden_size (int): hidden size of the LSTM
-        num_layers (int): total layers number of the LSTM
-        dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
-                             There is NO dropout work on rnn output of the last RNN layers
-        is_bidirec (bool): If it is bidirectional
-        is_test (bool): If it is in test phrase
-        name (str|None): A name for this layer(optional). If set None, the layer
-                         will be named automatically.
-        default_initializer(Initialize|None): Where use initializer to initialize the Weight
-                         If set None, defaule initializer will be used
-        seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed
-
-
-    Returns:
-        rnn_out(Tensor),last_h(Tensor),last_c(Tensor):
-
-                        Three tensors, rnn_out, last_h, last_c:
-
-                        - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
-                          if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
-                        - last_h is the hidden state of the last step of LSTM \
-                          shape is ( num_layers x batch_size x hidden_size ) \
-                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
-                        - last_c(Tensor): the cell state of the last step of LSTM \
-                          shape is ( num_layers x batch_size x hidden_size ) \
-                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
-
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-
-            emb_dim = 256
-            vocab_size = 10000
-            data = fluid.layers.data(name='x', shape=[-1, 100, 1],
-                         dtype='int32')
-            emb = fluid.layers.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
-            batch_size = 20
-            max_len = 100
-            dropout_prob = 0.2
-            input_size = 100
-            hidden_size = 150
-            num_layers = 1
-            init_h = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 )
-            init_c = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 )
-            rnn_out, last_h, last_c = layers.lstm( emb, init_h, init_c, \
-                    max_len, hidden_size, num_layers, \
-                    dropout_prob=dropout_prob)
-    """
-
-    helper = LayerHelper('cudnn_lstm', **locals())
-
-    dtype = input.dtype
-    input_shape = list(input.shape)
-    input_size = input_shape[-1]
-    weight_size = 0
-    for i in range(num_layers):
-        if i == 0:
-            input_weight_size = (input_size * hidden_size) * 4
-        else:
-            if is_bidirec:
-                input_weight_size = (hidden_size * 2 * hidden_size) * 4
-            else:
-                input_weight_size = (hidden_size * hidden_size) * 4
-
-        hidden_weight_size = (hidden_size * hidden_size) * 4
-
-        if is_bidirec:
-            weight_size += (input_weight_size + hidden_weight_size) * 2
-            weight_size += hidden_size * 8 * 2
-        else:
-            weight_size += input_weight_size + hidden_weight_size
-            weight_size += hidden_size * 8
-
-    weight = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[weight_size],
-        dtype=dtype,
-        default_initializer=default_initializer)
-
-    out = helper.create_variable_for_type_inference(dtype)
-    last_h = helper.create_variable_for_type_inference(dtype)
-    last_c = helper.create_variable_for_type_inference(dtype)
-
-    cache = helper.create_variable(
-        persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True)
-
-    helper.append_op(
-        type='cudnn_lstm',
-        inputs={
-            'Input': input,
-            'InitH': init_h,
-            'InitC': init_c,
-            'W': weight,
-            'Cache': cache,
-        },
-        outputs={
-            'Out': out,
-            'last_h': last_h,
-            'last_c': last_c,
-        },
-        attrs={
-            'max_len': max_len,
-            'is_bidirec': is_bidirec,
-            'input_size': input_size,
-            'hidden_size': hidden_size,
-            'num_layers': num_layers,
-            'is_test': is_test,
-            'dropout_prob': dropout_prob,
-            'seed': seed,
-        })
-    return out, last_h, last_c
-
-
-def dynamic_lstmp(input,
-                  size,
-                  proj_size,
-                  param_attr=None,
-                  bias_attr=None,
-                  use_peepholes=True,
-                  is_reverse=False,
-                  gate_activation='sigmoid',
-                  cell_activation='tanh',
-                  candidate_activation='tanh',
-                  proj_activation='tanh',
-                  dtype='float32',
-                  name=None,
-                  h_0=None,
-                  c_0=None,
-                  cell_clip=None,
-                  proj_clip=None):
-    """
-    **Dynamic LSTMP Layer**
-
-    LSTMP (LSTM with recurrent projection) layer has a separate projection
-    layer after the LSTM layer, projecting the original hidden state to a
-    lower-dimensional one, which is proposed to reduce the number of total
-    parameters and furthermore computational complexity for the LSTM,
-    espeacially for the case that the size of output units is relative
-    large (https://research.google.com/pubs/archive/43905.pdf).
-
-    The formula is as follows:
-
-    .. math::
-
-        i_t & = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
-
-        f_t & = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
-
-        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
-
-        o_t & = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o)
-
-        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-
-        h_t & = o_t \odot act_h(c_t)
-
-        r_t & = \overline{act_h}(W_{rh}h_t)
-
-    In the above formula:
-
-    * :math:`W`: Denotes weight matrices (e.g. :math:`W_{xi}` is \
-          the matrix of weights from the input gate to the input).
-    * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
-          matrices for peephole connections. In our implementation, \
-          we use vectors to represent these diagonal weight matrices.
-    * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
-          bias vector).
-    * :math:`\sigma`: The activation, such as logistic sigmoid function.
-    * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
-          gate, and cell activation vectors, respectively, all of which have \
-          the same size as the cell output activation vector :math:`h`.
-    * :math:`h`: The hidden state.
-    * :math:`r`: The recurrent projection of the hidden state.
-    * :math:`\\tilde{c_t}`: The candidate hidden state, whose \
-          computation is based on the current input and previous hidden state.
-    * :math:`\odot`: The element-wise product of the vectors.
-    * :math:`act_g` and :math:`act_h`: The cell input and cell output \
-          activation functions and `tanh` is usually used for them.
-    * :math:`\overline{act_h}`: The activation function for the projection \
-          output, usually using `identity` or same as :math:`act_h`.
-
-    Set `use_peepholes` to `False` to disable peephole connection. The formula
-    is omitted here, please refer to the paper
-    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-
-    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
-    operations on the input :math:`x_{t}` are NOT included in this operator.
-    Users can choose to use fully-connected layer before LSTMP layer.
-
-    Args:
-        input(Variable): The input of dynamic_lstmp layer, which supports
-                         variable-time length input sequence. The underlying
-                         tensor in this Variable is a matrix with shape
-                         (T X 4D), where T is the total time steps in this
-                         mini-batch, D is the hidden size.
-        size(int): 4 * hidden size.
-        proj_size(int): The size of projection output.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-                               hidden-hidden weight and projection weight.
-
-                               - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
-                                                W_{fh}, W_{oh}`}.
-                               - The shape of hidden-hidden weight is (P x 4D),
-                                 where P is the projection size and D the hidden
-                                 size.
-                               - Projection weight = {:math:`W_{rh}`}.
-                               - The shape of projection weight is (D x P).
-
-                               If it is set to None or one attribute of ParamAttr,
-                               dynamic_lstm will create ParamAttr as param_attr.
-                               If the Initializer of the param_attr is not set, the
-                               parameter is initialized with Xavier. Default: None.
-        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
-                              weights, which contains two parts, input-hidden
-                              bias weights and peephole connections weights if
-                              setting `use_peepholes` to `True`.
-
-                              1. `use_peepholes = False`
-                                - Biases = {:math:`b_c, b_i, b_f, b_o`}.
-                                - The shape is (1 x 4D).
-                              2. `use_peepholes = True`
-                                - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
-                                                 W_{fc}, W_{oc}`}.
-                                - The shape is (1 x 7D).
-
-                              If it is set to None or one attribute of ParamAttr,
-                              dynamic_lstm will create ParamAttr as bias_attr.
-                              If the Initializer of the bias_attr is not set,
-                              the bias is initialized zero. Default: None.
-        use_peepholes(bool): Whether to enable diagonal/peephole connections,
-                             default `True`.
-        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
-        gate_activation(str): The activation for input gate, forget gate and
-                              output gate. Choices = ["sigmoid", "tanh", "relu",
-                              "identity"], default "sigmoid".
-        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
-                              "tanh", "relu", "identity"], default "tanh".
-        candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
-                              default "tanh".
-        proj_activation(str): The activation for projection output.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
-                              default "tanh".
-        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-        h_0(Variable): The initial hidden state is an optional input, default is zero.
-                       This is a tensor with shape (N x D), where N is the
-                       batch size and D is the projection size.
-        c_0(Variable): The initial cell state is an optional input, default is zero.
-                       This is a tensor with shape (N x D), where N is the
-                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
-        cell_clip(float): If provided the cell state is clipped
-                             by this value prior to the cell output activation.
-        proj_clip(float): If `num_proj > 0` and `proj_clip` is
-                            provided, then the projected values are clipped elementwise to within
-                            `[-proj_clip, proj_clip]`.
-
-    Returns:
-        tuple: A tuple of two output variable: the projection of hidden state, \
-               and cell state of LSTMP. The shape of projection is (T x P), \
-               for the cell state which is (T x D), and both LoD is the same \
-               with the `input`.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            dict_dim, emb_dim = 128, 64
-            data = fluid.layers.data(name='sequence', shape=[1],
-                                     dtype='int32', lod_level=1)
-            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
-            hidden_dim, proj_dim = 512, 256
-            fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4,
-                                     act=None, bias_attr=None)
-            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
-                                                     size=hidden_dim * 4,
-                                                     proj_size=proj_dim,
-                                                     use_peepholes=False,
-                                                     is_reverse=True,
-                                                     cell_activation="tanh",
-                                                     proj_activation="tanh")
-    """
-
-    assert in_dygraph_mode(
-    ) is not True, "please use lstm instead of dynamic_lstmp in dygraph mode!"
-
-    assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
-    helper = LayerHelper('lstmp', **locals())
-    size = size // 4
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
-    proj_weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)
-    bias_size = [1, 7 * size]
-    if not use_peepholes:
-        bias_size[1] = 4 * size
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-
-    projection = helper.create_variable_for_type_inference(dtype)
-    cell = helper.create_variable_for_type_inference(dtype)
-    ordered_proj0 = helper.create_variable_for_type_inference(dtype)
-    batch_hidden = helper.create_variable_for_type_inference(dtype)
-    batch_gate = helper.create_variable_for_type_inference(dtype)
-    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
-    inputs = {
-        'Input': input,
-        'Weight': weight,
-        'ProjWeight': proj_weight,
-        'Bias': bias
-    }
-    batch_size = input.shape[0]
-    if h_0:
-        assert h_0.shape == (batch_size, proj_size), \
-            'The shape of h0 should be (batch_size, %d)' % proj_size
-        inputs['H0'] = h_0
-    if c_0:
-        assert c_0.shape == (batch_size, size), \
-            'The shape of c0 should be (batch_size, %d)' % size
-        inputs['C0'] = c_0
-
-    if cell_clip:
-        assert cell_clip >= 0, "cell_clip should not be negtive."
-    if proj_clip:
-        assert proj_clip >= 0, "proj_clip should not be negtive."
-
-    helper.append_op(
-        type='lstmp',
-        inputs=inputs,
-        outputs={
-            'Projection': projection,
-            'Cell': cell,
-            'BatchHidden': batch_hidden,
-            'BatchGate': batch_gate,
-            'BatchCellPreAct': batch_cell_pre_act
-        },
-        attrs={
-            'use_peepholes': use_peepholes,
-            'cell_clip': cell_clip,
-            'proj_clip': proj_clip,
-            'is_reverse': is_reverse,
-            'gate_activation': gate_activation,
-            'cell_activation': cell_activation,
-            'candidate_activation': candidate_activation,
-            'proj_activation': proj_activation
-        })
-    return projection, cell
-
-
-def dynamic_gru(input,
-                size,
-                param_attr=None,
-                bias_attr=None,
-                is_reverse=False,
-                gate_activation='sigmoid',
-                candidate_activation='tanh',
-                h_0=None,
-                origin_mode=False):
-    """
-    **Gated Recurrent Unit (GRU) Layer**
-
-    if origin_mode is False, then the equation of a gru step is from paper
-    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
-    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_ .
-
-    The formula is as follows:
-
-    .. math::
-
-        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
-
-        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
-
-        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
-
-        h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
-
-
-    if origin_mode is True then the equation is from paper
-    Learning Phrase Representations using RNN Encoder-Decoder for Statistical
-    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
-
-    .. math::
-
-        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
-
-        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
-
-        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
-
-        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
-
-    The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
-    is the update gate and reset gate activation function and :math:`sigmoid`
-    is usually used for it. :math:`act_c` is the activation function for
-    candidate hidden state and :math:`tanh` is usually used for it.
-
-    Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on
-    the input :math:`x_{t}` are NOT included in this operator. Users can choose
-    to use fully-connect layer before GRU layer.
-
-    Args:
-        input(Variable): The input of dynamic_gru layer, which supports
-            variable-time length input sequence. The underlying tensor in this
-            Variable is a matrix with shape :math:`(T \\times 3D)`, where
-            :math:`T` is the total time steps in this mini-batch, :math:`D`
-            is the hidden size.
-        size(int): The dimension of the gru cell.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            hidden-hidden weight matrix. Note:
-
-            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
-              :math:`D` is the hidden size.
-            - All elements in the weight matrix can be divided into two parts.
-              The first part are weights of the update gate and reset gate with
-              shape :math:`(D \\times 2D)`, and the second part are weights for
-              candidate hidden state with shape :math:`(D \\times D)`.
-
-            If it is set to None or one attribute of ParamAttr, dynamic_gru will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
-            of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
-            the bias in the update gate, reset gate and candidate calculations.
-            If it is set to False, no bias will be applied to the update gate,
-            reset gate and candidate calculations. If it is set to None or one
-            attribute of ParamAttr, dynamic_gru will create ParamAttr as
-            bias_attr. If the Initializer of the bias_attr is not set, the bias
-            is initialized zero. Default: None.
-        is_reverse(bool): Whether to compute reversed GRU, default
-            :attr:`False`.
-        gate_activation(str): The activation for update gate and reset gate.
-            Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
-        candidate_activation(str): The activation for candidate hidden state.
-            Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
-        h_0 (Variable): This is initial hidden state. If not set, default is
-            zero. This is a tensor with shape (N x D), where N is the number of
-            total time steps of input mini-batch feature and D is the hidden
-            size.
-
-    Returns:
-        Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
-            and sequence length is the same with the input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            dict_dim, emb_dim = 128, 64
-            data = fluid.layers.data(name='sequence', shape=[1],
-                                     dtype='int32', lod_level=1)
-            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
-            hidden_dim = 512
-            x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
-            hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
-    """
-
-    assert in_dygraph_mode(
-    ) is not True, "please use gru instead of dynamic_gru in dygraph mode!"
-
-    helper = LayerHelper('gru', **locals())
-    dtype = helper.input_dtype()
-
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
-    batch_size = input.shape[0]
-    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
-    if h_0:
-        assert h_0.shape == (
-            batch_size, size
-        ), 'The shape of h0 should be(batch_size, %d)' % size
-        inputs['H0'] = h_0
-
-    hidden = helper.create_variable_for_type_inference(dtype)
-    batch_gate = helper.create_variable_for_type_inference(dtype)
-    batch_reset_hidden_prev = helper.create_variable_for_type_inference(dtype)
-    batch_hidden = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type='gru',
-        inputs=inputs,
-        outputs={
-            'Hidden': hidden,
-            'BatchGate': batch_gate,
-            'BatchResetHiddenPrev': batch_reset_hidden_prev,
-            'BatchHidden': batch_hidden
-        },
-        attrs={
-            'is_reverse': is_reverse,
-            'gate_activation': gate_activation,
-            'activation': candidate_activation,
-            'origin_mode': origin_mode
-        })
-    return hidden
-
-
-def gru_unit(input,
-             hidden,
-             size,
-             param_attr=None,
-             bias_attr=None,
-             activation='tanh',
-             gate_activation='sigmoid',
-             origin_mode=False):
-    """
-    **GRU unit layer**
-
-    if origin_mode is True, then the equation of a gru step is from paper
-    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
-    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
-
-        .. math::
-            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
-
-            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
-
-            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
-
-    if origin_mode is False, then the equation of a gru step is from paper
-    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
-    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
-
-        .. math::
-            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
-
-            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
-
-            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
-
-
-    The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
-    of the equation above, the :math:`z_t` is split into 3 parts -
-    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
-    implement a full GRU unit operator for an input, a fully
-    connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
-
-    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
-    of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is
-    an intermediate candidate hidden output, which is denoted by :math:`m_t`.
-    This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
-    and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
-
-    Args:
-        input (Variable): The fc transformed input value of current step.
-        hidden (Variable): The hidden value of gru unit from previous step.
-        size (integer): The input dimension value.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            hidden-hidden weight matrix. Note:
-
-            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
-              :math:`D` is the hidden size.
-            - All elements in the weight matrix can be divided into two parts.
-              The first part are weights of the update gate and reset gate with
-              shape :math:`(D \\times 2D)`, and the second part are weights for
-              candidate hidden state with shape :math:`(D \\times D)`.
-
-            If it is set to None or one attribute of ParamAttr, gru_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
-            of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
-            the bias in the update gate, reset gate and candidate calculations.
-            If it is set to False, no bias will be applied to the update gate,
-            reset gate and candidate calculations. If it is set to None or one
-            attribute of ParamAttr, gru_unit will create ParamAttr as
-            bias_attr. If the Initializer of the bias_attr is not set, the bias
-            is initialized zero. Default: None.
-        activation (string): The activation type for cell (actNode).
-                             Default: 'tanh'
-        gate_activation (string): The activation type for gates (actGate).
-                                  Default: 'sigmoid'
-
-    Returns:
-        tuple: The hidden value, reset-hidden value and gate values.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            dict_dim, emb_dim = 128, 64
-            data = fluid.layers.data(name='step_data', shape=[1], dtype='int32')
-            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
-            hidden_dim = 512
-            x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
-            pre_hidden = fluid.layers.data(
-                name='pre_hidden', shape=[hidden_dim], dtype='float32')
-            hidden = fluid.layers.gru_unit(
-                input=x, hidden=pre_hidden, size=hidden_dim * 3)
-
-    """
-    activation_dict = dict(
-        identity=0,
-        sigmoid=1,
-        tanh=2,
-        relu=3, )
-    activation = activation_dict[activation]
-    gate_activation = activation_dict[gate_activation]
-
-    helper = LayerHelper('gru_unit', **locals())
-    dtype = helper.input_dtype()
-    size = size // 3
-
-    # create weight
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
-
-    gate = helper.create_variable_for_type_inference(dtype)
-    reset_hidden_pre = helper.create_variable_for_type_inference(dtype)
-    updated_hidden = helper.create_variable_for_type_inference(dtype)
-    inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
-    # create bias
-    if helper.bias_attr:
-        bias_size = [1, 3 * size]
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-        inputs['Bias'] = bias
-
-    helper.append_op(
-        type='gru_unit',
-        inputs=inputs,
-        outputs={
-            'Gate': gate,
-            'ResetHiddenPrev': reset_hidden_pre,
-            'Hidden': updated_hidden,
-        },
-        attrs={
-            'activation': 2,  # tanh
-            'gate_activation': 1,  # sigmoid
-        })
-
-    return updated_hidden, reset_hidden_pre, gate
-
-
-@templatedoc()
-def linear_chain_crf(input, label, param_attr=None, length=None):
-    """
-    Linear Chain CRF.
-
-    ${comment}
-
-    Args:
-        input(${emission_type}): ${emission_comment}
-        label(${label_type}): ${label_comment}
-        Length(${length_type}): ${length_comment}
-        param_attr(ParamAttr): The attribute of the learnable parameter for transition parameter.
-
-    Returns:
-        output(${emission_exps_type}): ${emission_exps_comment} \n
-        output(${transition_exps_type}): ${transition_exps_comment} \n
-        output(${log_likelihood_type}): ${log_likelihood_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-
-            #define net structure, using LodTensor
-            train_program = fluid.Program()
-            startup_program = fluid.Program()
-            with fluid.program_guard(train_program, startup_program):
-                input_data = fluid.layers.data(name='input_data', shape=[10], dtype='float32', lod_level=1)
-                label = fluid.layers.data(name='label', shape=[1], dtype='int', lod_level=1)
-                emission= fluid.layers.fc(input=input_data, size=10, act="tanh")
-                crf_cost = fluid.layers.linear_chain_crf(
-                    input=emission,
-                    label=label,
-                    param_attr=fluid.ParamAttr(
-                    name='crfw',
-                    learning_rate=0.01)) 
-            use_cuda = False
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_program)    
-            #define data, using LoDTensor
-            a = fluid.create_lod_tensor(np.random.rand(12,10).astype('float32'), [[3,3,4,2]], place)
-            b = fluid.create_lod_tensor(np.array([[1],[1],[2],[3],[1],[1],[1],[3],[1],[1],[1],[1]]),[[3,3,4,2]] , place)
-            feed1 = {'input_data':a,'label':b}
-            loss= exe.run(train_program,feed=feed1, fetch_list=[crf_cost])
-            print(loss) 
-
-            #define net structure, using padding
-            train_program = fluid.Program()
-            startup_program = fluid.Program()
-            with fluid.program_guard(train_program, startup_program):
-                input_data2 = fluid.layers.data(name='input_data2', shape=[10,10], dtype='float32')
-                label2 = fluid.layers.data(name='label2', shape=[10,1], dtype='int')
-                label_length = fluid.layers.data(name='length', shape=[1], dtype='int')
-                emission2= fluid.layers.fc(input=input_data2, size=10, act="tanh", num_flatten_dims=2)
-                crf_cost2 = fluid.layers.linear_chain_crf(
-                    input=emission2,
-                    label=label2,
-                    length=label_length,
-                    param_attr=fluid.ParamAttr(
-                     name='crfw',
-                     learning_rate=0.01))
-
-            use_cuda = False
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_program)
-
-            #define data, using padding
-            cc=np.random.rand(4,10,10).astype('float32')
-            dd=np.random.rand(4,10,1).astype('int64')
-            ll=np.array([[3,3,4,2]])
-            feed2 = {'input_data2':cc,'label2':dd,'length':ll}
-
-            loss2= exe.run(train_program,feed=feed2, fetch_list=[crf_cost2])
-            print(loss2) 
-            
-            #you can use find_var to get transition parameter.
-            transition=np.array(fluid.global_scope().find_var('crfw').get_tensor())
-            print(transition)
-    """
-    helper = LayerHelper('linear_chain_crf', **locals())
-    size = input.shape[1]
-    transition = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[size + 2, size],
-        dtype=helper.input_dtype())
-    alpha = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype())
-    emission_exps = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype())
-    transition_exps = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype())
-    log_likelihood = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype())
-    this_inputs = {
-        "Emission": [input],
-        "Transition": transition,
-        "Label": [label]
-    }
-    if length:
-        this_inputs['length'] = [length]
-    helper.append_op(
-        type='linear_chain_crf',
-        inputs=this_inputs,
-        outputs={
-            "Alpha": [alpha],
-            "EmissionExps": [emission_exps],
-            "TransitionExps": transition_exps,
-            "LogLikelihood": log_likelihood
-        })
-
-    return log_likelihood
-
-
-@templatedoc()
-def crf_decoding(input, param_attr, label=None):
-    """
-    ${comment}
-
-    Args:
-        input(${emission_type}): ${emission_comment}
-
-        param_attr(ParamAttr): The parameter attribute for training.
-
-        label(${label_type}): ${label_comment}
-
-    Returns:
-        Variable: ${viterbi_path_comment}
-
-    Examples:
-        .. code-block:: python
-
-           import paddle.fluid as fluid
-           images = fluid.layers.data(name='pixel', shape=[784], dtype='float32')
-           label = fluid.layers.data(name='label', shape=[1], dtype='int32')
-           hidden = fluid.layers.fc(input=images, size=2)
-           crf = fluid.layers.linear_chain_crf(input=hidden, label=label, 
-                     param_attr=fluid.ParamAttr(name="crfw"))
-           crf_decode = fluid.layers.crf_decoding(input=hidden, 
-                     param_attr=fluid.ParamAttr(name="crfw"))
-    """
-    helper = LayerHelper('crf_decoding', **locals())
-    transition = helper.get_parameter(param_attr.name)
-    viterbi_path = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype())
-    helper.append_op(
-        type='crf_decoding',
-        inputs={"Emission": [input],
-                "Transition": transition,
-                "Label": label},
-        outputs={"ViterbiPath": [viterbi_path]})
-
-    return viterbi_path
-
-
-@templatedoc()
-def cos_sim(X, Y):
-    """
-    ${comment}
-
-    Args:
-        X (Variable): ${x_comment}.
-        Y (Variable): ${y_comment}.
-
-    Returns:
-        Variable: the output of cosine(X, Y).
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[3, 7], dtype='float32', append_batch_size=False)
-            y = fluid.layers.data(name='y', shape=[1, 7], dtype='float32', append_batch_size=False)
-            out = fluid.layers.cos_sim(x, y)
-    """
-    helper = LayerHelper('cos_sim', **locals())
-    out = helper.create_variable_for_type_inference(dtype=X.dtype)
-    xnorm = helper.create_variable_for_type_inference(dtype=X.dtype)
-    ynorm = helper.create_variable_for_type_inference(dtype=X.dtype)
-    helper.append_op(
-        type='cos_sim',
-        inputs={'X': [X],
-                'Y': [Y]},
-        outputs={'Out': [out],
-                 'XNorm': [xnorm],
-                 'YNorm': [ynorm]})
-    return out
-
-
-def dropout(x,
-            dropout_prob,
-            is_test=False,
-            seed=None,
-            name=None,
-            dropout_implementation="downgrade_in_infer"):
-    """
-    Computes dropout.
-
-    Drop or keep each element of `x` independently. Dropout is a regularization
-    technique for reducing overfitting by preventing neuron co-adaption during
-    training. The dropout operator randomly sets (according to the given dropout
-    probability) the outputs of some units to zero, while others are remain
-    unchanged.
-
-    dropout op can be removed from the program to make the program more efficient.
-
-    Args:
-        x (Variable): The input tensor variable.
-        dropout_prob (float): Probability of setting units to zero.
-        is_test (bool): A flag indicating whether it is in test phrase or not.
-        seed (int): A Python integer used to create random seeds. If this
-                    parameter is set to None, a random seed is used.
-                    NOTE: If an integer seed is given, always the same output
-                    units will be dropped. DO NOT use a fixed seed in training.
-        name (str|None): A name for this layer(optional). If set None, the layer
-                         will be named automatically.
-        dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train']
-
-                                        1. downgrade_in_infer(default), downgrade the outcome at inference
-
-                                           - train: out = input * mask
-                                           - inference: out = input * (1.0 - dropout_prob)
-
-                                           (mask is a tensor same shape with input, value is 0 or 1
-                                           ratio of 0 is dropout_prob)
-                                        2. upscale_in_train, upscale the outcome at training time
-
-                                           - train: out = input * mask / ( 1.0 - dropout_prob )
-                                           - inference: out = input
-
-                                           (mask is a tensor same shape with input, value is 0 or 1
-                                           ratio of 0 is dropout_prob)
-
-
-    Returns:
-        Variable: A tensor variable is the shape with `x`.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            droped = fluid.layers.dropout(x, dropout_prob=0.5)
-    """
-
-    helper = LayerHelper('dropout', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    mask = helper.create_variable_for_type_inference(
-        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
-
-    if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
-        seed = helper.main_program.random_seed
-
-    helper.append_op(
-        type='dropout',
-        inputs={'X': [x]},
-        outputs={'Out': [out],
-                 'Mask': [mask]},
-        attrs={
-            'dropout_prob': dropout_prob,
-            'is_test': is_test,
-            'fix_seed': seed is not None,
-            'seed': seed if seed is not None else 0,
-            'dropout_implementation': dropout_implementation,
-        })
-    return out
-
-
-def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
-    """
-    **Cross Entropy Layer**
-
-    This layer computes the cross entropy between `input` and `label`. It
-    supports both standard cross-entropy and soft-label cross-entropy loss
-    computation.
-
-    1) One-hot cross-entropy:
-        `soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
-
-        .. math::
-
-            Y[i] = -\log(X[i, Label[i]])
-
-    2) Soft-label cross-entropy:
-        `soft_label = True`, `Label[i, j]` indicates the soft label of class j
-        for sample i:
-
-        .. math::
-
-            Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
-
-       Please make sure that in this case the summation of each row of `label`
-       equals one.
-
-    3) One-hot cross-entropy with vecterized `label`:
-         As a special case of 2), when each row of 'label' has only one
-         non-zero element which is equal to 1, soft-label cross-entropy degenerates
-         to a one-hot cross-entropy with one-hot label representation.
-
-    Args:
-        input (Variable|list):  a 2-D tensor with shape [N x D], where N is the
-                                batch size and D is the number of classes. This
-                                input is a probability computed by the previous
-                                operator, which is almost always the result of
-                                a softmax operator.
-        label (Variable|list): the ground truth which is a 2-D tensor. When
-                               `soft_label` is set to `False`, `label` is a
-                               tensor<int64> with shape [N x 1]. When
-                               `soft_label` is set to `True`, `label` is a
-                               tensor<float/double> with shape [N x D].
-        soft_label (bool): a flag indicating whether to
-                                           interpretate the given labels as soft
-                                           labels. Default: `False`.
-        ignore_index (int): Specifies a target value that is ignored and does
-                            not contribute to the input gradient. Only valid
-                            if soft_label is set to False. Default: kIgnoreIndex
-
-    Returns:
-         A 2-D tensor with shape [N x 1], the cross entropy loss.
-
-    Raises:
-         ValueError:
-
-                      1. the 1st dimension of ``input`` and ``label`` are not equal.
-
-                      2. when ``soft_label == True``, and the 2nd dimension of
-                         ``input`` and ``label`` are not equal.
-
-                      3. when ``soft_label == False``, and the 2nd dimension of
-                         ``label`` is not 1.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          classdim = 7
-          x = fluid.layers.data(name='x', shape=[3, 7], dtype='float32', append_batch_size=False)
-          label = fluid.layers.data(name='label', shape=[3, 1], dtype='float32', append_batch_size=False)
-          predict = fluid.layers.fc(input=x, size=classdim, act='softmax')
-          cost = fluid.layers.cross_entropy(input=predict, label=label)
-    """
-    if not soft_label:
-        return cross_entropy2(input, label, ignore_index)
-    helper = LayerHelper('cross_entropy', **locals())
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='cross_entropy',
-        inputs={'X': [input],
-                'Label': [label]},
-        outputs={'Y': [out]},
-        attrs={"soft_label": soft_label,
-               "ignore_index": ignore_index})
-    return out
-
-
-def cross_entropy2(input, label, ignore_index=kIgnoreIndex):
-    helper = LayerHelper('cross_entropy2', **locals())
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    xshape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    match_x = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='cross_entropy2',
-        inputs={'X': [input],
-                'Label': [label]},
-        outputs={'Y': [out],
-                 'MatchX': [match_x],
-                 'XShape': [xshape]},
-        attrs={'ignore_index': ignore_index})
-    return out
-
-
-def bpr_loss(input, label, name=None):
-    """
-    **Bayesian Personalized Ranking Loss Operator**
-
-    This operator belongs to pairwise ranking loss. Label is the desired item.
-    The loss at a given point in one session is defined as:
-
-    .. math::
-        Y[i] = 1/(N[i] - 1) * \sum_j{\log(\sigma(X[i, Label[i]]-X[i, j]))}
-
-    Learn more details by reading paper <session-based recommendations with recurrent
-    neural networks>.
-
-    Args:
-        input (Variable|list):  a 2-D tensor with shape [N x D], where N is the
-                                batch size and D is the number of classes.
-                                This input is not probability but logits.
-        label (Variable|list):  the ground truth which is a 2-D tensor.  `label`
-                                is a tensor<int64> with shape [N x 1].
-        name (str|None):        A name for this layer(optional). If set None, the
-                                layer will be named automatically. Default: None.
-    Returns:
-        A 2-D tensor with shape [N x 1], the bpr loss.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          neg_size = 10
-          label = fluid.layers.data(
-                    name="label", shape=[1], dtype="int64")
-          predict = fluid.layers.data(
-                    name="predict", shape=[neg_size + 1], dtype="float32")
-          cost = fluid.layers.bpr_loss(input=predict, label=label)
-    """
-    helper = LayerHelper('bpr_loss', **locals())
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='bpr_loss',
-        inputs={'X': [input],
-                'Label': [label]},
-        outputs={'Y': [out]})
-    return out
-
-
-def square_error_cost(input, label):
-    """
-    **Square error cost layer**
-
-    This layer accepts input predictions and target label and returns the
-    squared error cost.
-
-    For predictions, :math:`X`, and target labels, :math:`Y`, the equation is:
-
-    .. math::
-
-        Out = (X - Y)^2
-
-    In the above equation:
-
-        * :math:`X`: Input predictions, a tensor.
-        * :math:`Y`: Input labels, a tensor.
-        * :math:`Out`: Output value, same shape with :math:`X`.
-
-    Args:
-        input (Variable): Input tensor, has predictions.
-        label (Variable): Label tensor, has target labels.
-
-    Returns:
-        Variable: The tensor variable storing the element-wise squared error \
-                  difference of input and label.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-          y_predict = fluid.layers.data(name='y_predict', shape=[1], dtype='float32')
-          cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-
-    """
-    helper = LayerHelper('square_error_cost', **locals())
-    minus_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='elementwise_sub',
-        inputs={'X': [input],
-                'Y': [label]},
-        outputs={'Out': [minus_out]})
-
-    square_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='square', inputs={'X': [minus_out]},
-        outputs={'Out': [square_out]})
-    return square_out
-
-
-@templatedoc()
-def chunk_eval(input,
-               label,
-               chunk_scheme,
-               num_chunk_types,
-               excluded_chunk_types=None,
-               seq_length=None):
-    """
-    **Chunk Evaluator**
-
-    This function computes and outputs the precision, recall and
-    F1-score of chunk detection.
-
-    For some basics of chunking, please refer to
-    `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
-
-    ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
-    and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
-    Here is a NER example of labeling for these tagging schemes:
-
-    .. code-block:: python
-
-       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
-              Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
-       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
-       IO     I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
-       IOB    B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
-       IOE    I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
-       IOBES  B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
-       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
-
-    There are three chunk types(named entity types) including PER(person), ORG(organization)
-    and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
-
-    Since the calculations actually use label ids rather than labels, extra attention
-    should be paid when mapping labels to ids to make CheckEvalOp work. The key point
-    is that the listed equations are satisfied by ids.
-
-    .. code-block:: python
-
-       tag_type = label % num_tag_type
-       chunk_type = label / num_tag_type
-
-    where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
-    is the num of chunk types, and `tag_type` get its value from the following table.
-
-    .. code-block:: python
-
-       Scheme Begin Inside End   Single
-        plain   0     -      -     -
-        IOB     0     1      -     -
-        IOE     -     0      1     -
-        IOBES   0     1      2     3
-
-    Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
-    PER and LOC. To satisfy the above equations, the label map can be like this:
-
-    .. code-block:: python
-
-       B-ORG  0
-       I-ORG  1
-       B-PER  2
-       I-PER  3
-       B-LOC  4
-       I-LOC  5
-       O      6
-
-    It's not hard to verify the equations noting that the num of chunk types
-    is 3 and the num of tag types in IOB scheme is 2. For example, the label
-    id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
-    I-LOC is 2, which consistent with the results from the equations.
-
-    Args:
-        input (Variable): prediction output of the network.
-        label (Variable): label of the test data set.
-        chunk_scheme (str): ${chunk_scheme_comment}
-        num_chunk_types (int): ${num_chunk_types_comment}
-        excluded_chunk_types (list): ${excluded_chunk_types_comment}
-        seq_length(Variable): 1-D Tensor specifying sequence length when input and label are Tensor type.
-
-    Returns:
-        tuple: tuple containing: precision, recall, f1_score,
-        num_infer_chunks, num_label_chunks,
-        num_correct_chunks
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            dict_size = 10000
-            label_dict_len = 7
-            sequence = fluid.layers.data(
-                name='id', shape=[1], lod_level=1, dtype='int64')
-            embedding = fluid.layers.embedding(
-                input=sequence, size=[dict_size, 512])
-            hidden = fluid.layers.fc(input=embedding, size=512)
-            label = fluid.layers.data(
-                name='label', shape=[1], lod_level=1, dtype='int32')
-            crf = fluid.layers.linear_chain_crf(
-                input=hidden, label=label, param_attr=fluid.ParamAttr(name="crfw"))
-            crf_decode = fluid.layers.crf_decoding(
-                input=hidden, param_attr=fluid.ParamAttr(name="crfw"))
-            fluid.layers.chunk_eval(
-                input=crf_decode,
-                label=label,
-                chunk_scheme="IOB",
-                num_chunk_types=(label_dict_len - 1) / 2)
-    """
-    helper = LayerHelper("chunk_eval", **locals())
-
-    # prepare output
-    precision = helper.create_variable_for_type_inference(dtype="float32")
-    recall = helper.create_variable_for_type_inference(dtype="float32")
-    f1_score = helper.create_variable_for_type_inference(dtype="float32")
-    num_infer_chunks = helper.create_variable_for_type_inference(dtype="int64")
-    num_label_chunks = helper.create_variable_for_type_inference(dtype="int64")
-    num_correct_chunks = helper.create_variable_for_type_inference(
-        dtype="int64")
-
-    this_input = {"Inference": [input], "Label": [label]}
-
-    if seq_length:
-        this_input["SeqLength"] = [seq_length]
-
-    helper.append_op(
-        type="chunk_eval",
-        inputs=this_input,
-        outputs={
-            "Precision": [precision],
-            "Recall": [recall],
-            "F1-Score": [f1_score],
-            "NumInferChunks": [num_infer_chunks],
-            "NumLabelChunks": [num_label_chunks],
-            "NumCorrectChunks": [num_correct_chunks]
-        },
-        attrs={
-            "num_chunk_types": num_chunk_types,
-            "chunk_scheme": chunk_scheme,
-            "excluded_chunk_types": excluded_chunk_types or []
-        })
-    return (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
-            num_correct_chunks)
-
-
-@templatedoc()
-def sequence_conv(input,
-                  num_filters,
-                  filter_size=3,
-                  filter_stride=1,
-                  padding=True,
-                  padding_start=None,
-                  bias_attr=None,
-                  param_attr=None,
-                  act=None,
-                  name=None):
-    """
-    The sequence_conv receives input sequences with variable length and other convolutional
-    configuration parameters for the filter and stride to apply the convolution operation.
-    It fills all-zero padding data on both sides of the sequence by default to ensure that
-    the output is the same length as the input. You can customize the padding behavior by
-    configuring the parameter :attr:`padding\_start`.
-    
-    **Warning:** the parameter :attr:`padding` take no effect and will be deprecated in the future.
-
-    .. code-block:: text
-
-            Here we'll illustrate the details of the padding operation:
-            For a mini-batch of 2 variable lengths sentences, containing 3, and 1 time-steps:
-            Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3, 4].
-            Besides, for the sake of simplicity, we assume M=1 and N=2.
-            X = [[a1, a2;
-                  b1, b2;
-                  c1, c2]
-                 [d1, d2]]
-
-            This is to say that input (X) has 4 words and the dimension of each word
-            representation is 2.
-
-            * Case1:
-
-                If padding_start is -1 and filter_size is 3.
-                The length of padding data is calculated as follows:
-                up_pad_len = max(0, -padding_start) = 1
-                down_pad_len = max(0, filter_size + padding_start - 1) = 1
-
-                The output of the input sequence after padding is:
-                data_aftet_padding = [[0,  0,  a1, a2, b1, b2;
-                                       a1, a2, b1, b2, c1, c2;
-                                       b1, b2, c1, c2, 0,  0 ]
-                                      [0,  0,  d1, d2, 0,  0 ]]
-
-                It will be multiplied by the filter weight to get the final output.
-
-    Args:
-        input (Variable): ${x_comment}
-        num_filters (int): the number of filters.
-        filter_size (int): the height of filter, the width is hidden size by default.
-        filter_stride (int): stride of the filter. Currently only supports :attr:`stride` = 1.
-        padding (bool): the parameter :attr:`padding` take no effect and will be discarded in the
-            future. Currently, it will always pad input to make sure the length of the output is
-            the same as input whether :attr:`padding` is set true or false. Because the length of
-            input sequence may be shorter than :attr:`filter\_size`, which will cause the convolution
-            result to not be computed correctly. These padding data will not be trainable or updated
-            while trainnig. 
-        padding_start (int|None): It is used to indicate the start index for padding the input
-            sequence, which can be negative. The negative number means to pad
-            :attr:`|padding_start|` time-steps of all-zero data at the beginning of each instance.
-            The positive number means to skip :attr:`padding_start` time-steps of each instance,
-            and it will pad :math:`filter\_size + padding\_start - 1` time-steps of all-zero data
-            at the end of the sequence to ensure that the output is the same length as the input.
-            If set None, the same length :math:`\\frac{filter\_size}{2}` of data will be filled
-            on both sides of the sequence. If set 0, the length of :math:`filter\_size - 1` data
-            is padded at the end of each input sequence.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, sequence_conv
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of sequence_conv. If it is set to None or one attribute of ParamAttr, sequence_conv
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None.
-
-    Returns:
-        Variable: output of sequence_conv
-
-    Examples:
-
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-
-             x = fluid.layers.data(name='x', shape=[10,10], append_batch_size=False, dtype='float32')
-             x_conved = fluid.layers.sequence_conv(input=x, num_filters=2, filter_size=3, padding_start=-1)
-    """
-
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_conv', **locals())
-    dtype = helper.input_dtype()
-    filter_shape = [filter_size * input.shape[1], num_filters]
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-    pre_bias = helper.create_variable_for_type_inference(dtype)
-    if padding_start is None:
-        padding_start = -int(filter_size // 2)
-
-    helper.append_op(
-        type='sequence_conv',
-        inputs={
-            'X': [input],
-            'Filter': [filter_param],
-        },
-        outputs={"Out": pre_bias},
-        attrs={
-            'contextStride': filter_stride,
-            'contextStart': padding_start,
-            'contextLength': filter_size,
-        })
-    pre_act = helper.append_bias_op(pre_bias)
-    return helper.append_activation(pre_act)
-
-
-def sequence_softmax(input, use_cudnn=False, name=None):
-    """
-    This function computes the softmax activation among all time-steps for each
-    sequence. The dimension of each time-step should be 1. Thus, the shape of
-    input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N`
-    is the sum of the length of all sequences.
-
-    For i-th sequence in a mini-batch:
-
-    .. math::
-
-        Out(X[lod[i]:lod[i+1]], :) = \\frac{\exp(X[lod[i]:lod[i+1], :])}{\sum(\exp(X[lod[i]:lod[i+1], :]))}
-
-    For example, for a mini-batch of 3 sequences with variable-length,
-    each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
-    then softmax will be computed among :math:`X[0:2, :]`, :math:`X[2:5, :]`,
-    :math:`X[5:7, :]`, and :math:`N` turns out to be 7.
-
-    Args:
-        input (Variable): The input variable which is a LoDTensor.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
-            library is installed. Default: False.
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None.
-
-    Returns:
-        Variable: output of sequence_softmax
-
-    Examples:
-
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             x = fluid.layers.data(name='x', shape=[7, 1],
-                              dtype='float32', lod_level=1)
-             x_sequence_softmax = fluid.layers.sequence_softmax(input=x)
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_softmax', **locals())
-    dtype = helper.input_dtype()
-    softmax_out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="sequence_softmax",
-        inputs={"X": input},
-        outputs={"Out": softmax_out},
-        attrs={"use_cudnn": use_cudnn})
-    return softmax_out
-
-
-def softmax(input, use_cudnn=False, name=None, axis=-1):
-    """
-    The input of the softmax operator is a tensor of any rank. The output tensor
-    has the same shape as the input.
-
-    The dimension :attr:`axis` of the input tensor will be permuted to the last.
-    Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
-    second dimension(row length) is the same as the dimension :attr:`axis` of the input
-    tensor, and the first dimension(column length) is the product of all other
-    dimensions of the input tensor. For each row of the matrix, the softmax operator
-    squashes the K-dimensional(K is the width of the matrix, which is also the size
-    of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
-    K-dimensional vector of real values in the range [0, 1] that add up to 1.
-
-    It computes the exponential of the given dimension and the sum of exponential
-    values of all the other dimensions in the K-dimensional vector input.
-    Then the ratio of the exponential of the given dimension and the sum of
-    exponential values of all the other dimensions is the output of the softmax
-    operator.
-
-    For each row :math:`i` and each column :math:`j` in the matrix, we have:
-
-    .. math::
-
-        Out[i, j] = \\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}
-
-    Args:
-        input (Variable): The input variable.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
-            library is installed. To improve numerical stablity, set use_cudnn to \
-            False by default. Default: False
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None.
-        axis (int): The index of dimension to perform softmax calculations, it should
-            be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of
-            input variable. Default: -1.
-
-    Returns:
-        Variable: output of softmax
-
-    Examples:
-
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             x = fluid.layers.data(name='x', shape=[2], dtype='float32')
-             fc = fluid.layers.fc(input=x, size=10)
-             # perform softmax in the second dimension
-             softmax = fluid.layers.softmax(input=fc, axis=1)
-             # perform softmax in the last dimension
-             softmax = fluid.layers.softmax(input=fc, axis=-1)
-
-    """
-    helper = LayerHelper('softmax', **locals())
-    if not isinstance(input, Variable):
-        raise TypeError(
-            "The type of 'input' in softmax must be Variable, but received %s" %
-            (type(input)))
-    if convert_dtype(input.dtype) not in ['float32', 'float64']:
-        raise TypeError(
-            "The data type of 'input' in softmax must be float32 or float64, but received %s."
-            % (convert_dtype(input.dtype)))
-
-    dtype = helper.input_dtype()
-    softmax_out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="softmax",
-        inputs={"X": input},
-        outputs={"Out": softmax_out},
-        attrs={"axis": axis,
-               "use_cudnn": use_cudnn})
-    return softmax_out
-
-
-def conv2d(input,
-           num_filters,
-           filter_size,
-           stride=1,
-           padding=0,
-           dilation=1,
-           groups=None,
-           param_attr=None,
-           bias_attr=None,
-           use_cudnn=True,
-           act=None,
-           name=None):
-    """
-    The convolution2D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input and
-    Output are in NCHW format, where N is batch size, C is the number of
-    channels, H is the height of the feature, and W is the width of the feature.
-    Filter is in MCHW format, where M is the number of output image channels,
-    C is the number of input image channels, H is the height of the filter,
-    and W is the width of the filter. If the groups is greater than 1,
-    C will equal the number of input image channels divided by the groups.
-    Please refer to UFLDL's `convolution
-    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
-    for more details.
-    If bias attribution and activation type are provided, bias is added to the
-    output of the convolution, and the corresponding activation function is
-    applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    Where:
-
-    * :math:`X`: Input value, a tensor with NCHW format.
-    * :math:`W`: Filter value, a tensor with MCHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
-
-    Note:
-        padding mode is 'SAME' and 'VALID' can reference this link<https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/PaddleGAN/network/base_network.py#L181>`_
-
-    Args:
-        input (Variable): The input image with [N, C, H, W] format.
-        num_filters(int): The number of filter. It is as same as the output
-            image channel.
-        filter_size (int|tuple): The filter size. If filter_size 
-            is a tuple, it must contain two integers, (filter_size_height, 
-            filter_size_width). Otherwise, filter_size_height = filter_\
-            size_width = filter_size.
-        stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_height, stride_width). Otherwise,
-            stride_height = stride_width = stride. Default: stride = 1.
-        padding (int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_height, padding_width). Otherwise,
-            padding_height = padding_width =  padding. Default: padding = 0.
-        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_height, dilation_width). Otherwise,
-            dilation_height = dilation_width = dilation. Default: dilation = 1.
-        groups (int): The groups number of the Conv2d Layer. According to grouped
-            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None
-
-    Returns:
-        Variable: The tensor variable storing the convolution and \
-                  non-linearity activation result.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
-    """
-
-    num_channels = input.shape[1]
-    assert param_attr is not False, "param_attr should not be False here."
-    l_type = 'conv2d'
-    if (num_channels == groups and num_filters % num_channels == 0 and
-            not use_cudnn):
-        l_type = 'depthwise_conv2d'
-
-    helper = LayerHelper(l_type, **locals())
-    dtype = helper.input_dtype()
-
-    if groups is None:
-        num_filter_channels = num_channels
-    else:
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels // groups
-
-    filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
-    stride = utils.convert_to_list(stride, 2, 'stride')
-    padding = utils.convert_to_list(padding, 2, 'padding')
-    dilation = utils.convert_to_list(dilation, 2, 'dilation')
-
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("use_cudnn should be True or False")
-
-    input_shape = input.shape
-    filter_shape = [num_filters, int(num_filter_channels)] + filter_size
-
-    def _get_default_param_initializer():
-        filter_elem_num = filter_size[0] * filter_size[1] * num_channels
-        std = (2.0 / filter_elem_num)**0.5
-        return Normal(0.0, std, 0)
-
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=filter_shape,
-        dtype=dtype,
-        default_initializer=_get_default_param_initializer())
-
-    pre_bias = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type=l_type,
-        inputs={
-            'Input': input,
-            'Filter': filter_param,
-        },
-        outputs={"Output": pre_bias},
-        attrs={
-            'strides': stride,
-            'paddings': padding,
-            'dilations': dilation,
-            'groups': groups,
-            'use_cudnn': use_cudnn,
-            'use_mkldnn': False,
-            'fuse_relu_before_depthwise_conv': False
-        })
-
-    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
-
-    return helper.append_activation(pre_act)
-
-
-def conv3d(input,
-           num_filters,
-           filter_size,
-           stride=1,
-           padding=0,
-           dilation=1,
-           groups=None,
-           param_attr=None,
-           bias_attr=None,
-           use_cudnn=True,
-           act=None,
-           name=None):
-    """
-    **Convlution3D Layer**
-
-    The convolution3D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input(Input) and
-    Output(Output) are in NCDHW format. Where N is batch size C is the number of
-    channels, D is the depth of the feature, H is the height of the feature,
-    and W is the width of the feature. Convlution3D is similar with Convlution2D
-    but adds one dimension(depth). If bias attribution and activation type are
-    provided, bias is added to the output of the convolution, and the
-    corresponding activation function is applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    In the above equation:
-
-    * :math:`X`: Input value, a tensor with NCDHW format.
-    * :math:`W`: Filter value, a tensor with MCDHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
-
-        - Output:
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
-
-    Args:
-        input (Variable): The input image with [N, C, D, H, W] format.
-        num_filters(int): The number of filter. It is as same as the output
-            image channel.
-        filter_size (int|tuple): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_depth, filter_size_height, 
-            filter_size_width). Otherwise, filter_size_depth = filter_size_height = \
-            filter_size_width = filter_size.
-        stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain three integers, (stride_depth, stride_height, stride_width). Otherwise,
-            stride_depth = stride_height = stride_width = stride. Default: stride = 1.
-        padding (int|tuple): The padding size. If padding is a tuple, it must
-            contain three integers, (padding_depth, padding_height, padding_width). Otherwise,
-            padding_depth = padding_height = padding_width = padding. Default: padding = 0.
-        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-            contain three integers, (dilation_depth, dilation_height, dilation_width). Otherwise,
-            dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1.
-        groups (int): The groups number of the Conv3d Layer. According to grouped
-            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
-            will create ParamAttr as param_attr. If it is set to None, the parameter
-            is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
-            :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv3d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None.
-
-    Returns:
-        Variable: The tensor variable storing the convolution and \
-                  non-linearity activation result.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
-          conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu")
-    """
-
-    l_type = 'conv3d'
-    assert param_attr is not False, "param_attr should not be False here."
-    helper = LayerHelper(l_type, **locals())
-    dtype = helper.input_dtype()
-
-    num_channels = input.shape[1]
-
-    if groups is None:
-        num_filter_channels = num_channels
-    else:
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels // groups
-
-    filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
-    stride = utils.convert_to_list(stride, 3, 'stride')
-    padding = utils.convert_to_list(padding, 3, 'padding')
-    dilation = utils.convert_to_list(dilation, 3, 'dilation')
-
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("use_cudnn should be True or False")
-
-    input_shape = input.shape
-    filter_shape = [num_filters, num_filter_channels] + filter_size
-
-    def _get_default_param_initializer():
-        filter_elem_num = filter_size[0] * filter_size[1] * filter_size[
-            2] * num_channels
-        std = (2.0 / filter_elem_num)**0.5
-        return Normal(0.0, std, 0)
-
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=filter_shape,
-        dtype=dtype,
-        default_initializer=_get_default_param_initializer())
-
-    pre_bias = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type=l_type,
-        inputs={
-            'Input': input,
-            'Filter': filter_param,
-        },
-        outputs={"Output": pre_bias},
-        attrs={
-            'strides': stride,
-            'paddings': padding,
-            'dilations': dilation,
-            'groups': groups,
-            'use_cudnn': use_cudnn,
-            'use_mkldnn': False
-        })
-
-    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
-
-    return helper.append_activation(pre_act)
-
-
-def sequence_pool(input, pool_type, is_test=False, pad_value=0.0):
-    """
-    This function add the operator for sequence pooling.
-    It pools features of all time-steps of each instance, and is applied
-    on top of the input using pool_type mentioned in the parameters.
-
-    It supports four pool_type:
-
-    - average: :math:`Out[i] = \\frac{\sum_i X_i}{N}`
-    - sum:     :math:`Out[i] = \sum_jX_{ij}`
-    - sqrt:    :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}`
-    - max:     :math:`Out[i] = max(X_i)`
-
-    .. code-block:: text
-
-       x is a 1-level LoDTensor and **pad_value** = 0.0:
-         x.lod = [[2, 3, 2, 0]]
-         x.data = [1, 3, 2, 4, 6, 5, 1]
-         x.dims = [7, 1]
-
-       then output is a Tensor:
-         out.dim = [4, 1]
-         with condition len(x.lod[-1]) == out.dims[0]
-
-       for different pool_type:
-         average: out.data = [2, 4, 3, 0.0], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
-         sum    : out.data = [4, 12, 6, 0.0], where 4=1+3, 12=2+4+6, 6=5+1
-         sqrt   : out.data = [2.82, 6.93, 4.24, 0.0], where 2.82=(1+3)/sqrt(2),
-                    6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
-         max    : out.data = [3, 6, 5, 0.0], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
-         last   : out.data = [3, 6, 1, 0.0], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
-         first  : out.data = [1, 2, 5, 0.0], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
-
-         and all above 0.0 = **pad_value**.
-
-    Args:
-        input (variable): The input variable which is a LoDTensor.
-        pool_type (string): The pooling type of sequence_pool.
-            It supports average, sum, sqrt and max.
-        is_test (bool): Used to distinguish training from scoring mode. Default False.
-        pad_value (float): Used to pad the pooling result for empty input sequence.
-
-    Returns:
-        The sequence pooling variable which is a Tensor.
-
-    Examples:
-
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-
-             x = fluid.layers.data(name='x', shape=[7, 1],
-                              dtype='float32', lod_level=1)
-             avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
-             sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
-             sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
-             max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
-             last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
-             first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_pool', **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_variable_for_type_inference(dtype)
-    max_index = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type="sequence_pool",
-        inputs={"X": input},
-        outputs={"Out": pool_out,
-                 "MaxIndex": max_index},
-        attrs={
-            "pooltype": pool_type.upper(),
-            "is_test": is_test,
-            "pad_value": pad_value
-        })
-
-    # when pool_type is max, variable max_index is initialized,
-    # so we stop the gradient explicitly here
-    if pool_type == 'max':
-        max_index.stop_gradient = True
-
-    return pool_out
-
-
-@templatedoc()
-def sequence_concat(input, name=None):
-    """
-    ${comment}
-
-    Args:
-        input(list): List of Variables to be concatenated.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        Variable: Output variable of the concatenation.
-
-    Examples:
-        .. code-block:: python
-
-           import paddle.fluid as fluid
-           x = fluid.layers.data(name='x', shape=[10], dtype='float32')
-           y = fluid.layers.data(name='y', shape=[10], dtype='float32')
-           out = fluid.layers.sequence_concat(input=[x, y])
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_concat', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]})
-    return out
-
-
-def sequence_first_step(input):
-    """
-    This function gets the first step of sequence.
-
-    .. code-block:: text
-
-       x is a 1-level LoDTensor:
-         x.lod = [[2, 3, 2]]
-         x.data = [1, 3, 2, 4, 6, 5, 1]
-         x.dims = [7, 1]
-
-       then output is a Tensor:
-         out.dim = [3, 1]
-         with condition len(x.lod[-1]) == out.dims[0]
-         out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
-
-    Args:
-        input(variable): The input variable which is a LoDTensor.
-
-    Returns:
-        The sequence's first step variable which is a Tensor.
-
-    Examples:
-
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             x = fluid.layers.data(name='x', shape=[7, 1],
-                              dtype='float32', lod_level=1)
-             x_first_step = fluid.layers.sequence_first_step(input=x)
-    """
-    return sequence_pool(input=input, pool_type="first")
-
-
-def sequence_last_step(input):
-    """
-    This function gets the last step of sequence.
-
-    .. code-block:: text
-
-       x is a 1-level LoDTensor:
-         x.lod = [[2, 3, 2]]
-         x.data = [1, 3, 2, 4, 6, 5, 1]
-         x.dims = [7, 1]
-
-       then output is a Tensor:
-         out.dim = [3, 1]
-         with condition len(x.lod[-1]) == out.dims[0]
-         out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
-
-    Args:
-        input(variable): The input variable which is a LoDTensor.
-
-    Returns:
-        The sequence's last step variable which is a Tensor.
-
-    Examples:
-
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             x = fluid.layers.data(name='x', shape=[7, 1],
-                              dtype='float32', lod_level=1)
-             x_last_step = fluid.layers.sequence_last_step(input=x)
-    """
-    return sequence_pool(input=input, pool_type="last")
-
-
-def sequence_slice(input, offset, length, name=None):
-    """
-    **Sequence Slice Layer**
-
-    The layer crops a subsequence from given sequence with given start
-    offset and subsequence length.
-
-    It only supports sequence data (LoDTensor with lod_level equal to 1).
-
-    .. code-block:: text
-
-              - Case:
-
-            Given the input Variable **input**:
-
-                input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
-                input.lod = [[3, 2]],
-                input.dims = (5, 2),
-
-            with offset.data = [[0], [1]] and length.data = [[2], [1]],
-
-            the output Variable will be
-
-                out.data = [[a1, a2], [b1, b2], [e1, e2]],
-                out.lod = [[2, 1]],
-                out.dims = (3, 2).
-
-    Note:
-          The first dimension size of **input**, **offset** and **length**
-          should be equal. The **offset** should start from 0.
-
-    Args:
-        input(Variable): The input Variable which consists of the complete
-                         sequences.
-        offset(Variable): The offset to slice each sequence.
-        length(Variable): The length of each subsequence.
-        name(str|None): A name for this layer(optional). If set None, the
-                        layer will be named automatically.
-
-    Returns:
-        Variable: The output subsequences.
-
-    Examples:
-
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             import numpy as np
-             seqs = fluid.layers.data(name='x', shape=[10, 5],
-                              dtype='float32', lod_level=1)
-             offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
-             length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
-             subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
-                                                   length=length)
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper("sequence_slice", **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-
-    offset.stop_gradient = True
-    length.stop_gradient = True
-
-    helper.append_op(
-        type="sequence_slice",
-        inputs={"X": input,
-                "Offset": offset,
-                "Length": length},
-        outputs={"Out": out})
-
-    return out
-
-
-@templatedoc()
-def pool2d(input,
-           pool_size=-1,
-           pool_type="max",
-           pool_stride=1,
-           pool_padding=0,
-           global_pooling=False,
-           use_cudnn=True,
-           ceil_mode=False,
-           name=None,
-           exclusive=True):
-    """
-    ${comment}
-
-    Args:
-        input (Variable): The input tensor of pooling operator. The format of
-                          input tensor is NCHW, where N is batch size, C is
-                          the number of channels, H is the height of the
-                          feature, and W is the width of the feature.
-        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two integers, (pool_size_Height, pool_size_Width).
-            Otherwise, the pool kernel size will be a square of an int.
-        pool_type: ${pooling_type_comment}
-        pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain two integers, (pool_stride_Height, pool_stride_Width).
-            Otherwise, the pool stride size will be a square of an int.
-        pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
-            it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
-            Otherwise, the pool padding size will be a square of an int.
-        global_pooling (bool): ${global_pooling_comment}
-        use_cudnn (bool): ${use_cudnn_comment}
-        ceil_mode (bool): ${ceil_mode_comment}
-        name (str|None): A name for this layer(optional). If set None, the
-                        layer will be named automatically.
-        exclusive (bool): Whether to exclude padding points in average pooling
-                          mode, default is true
-
-    Returns:
-        Variable: The pooling result.
-
-    Raises:
-        ValueError: If 'pool_type' is not "max" nor "avg"
-        ValueError: If 'global_pooling' is False and 'pool_size' is -1
-        ValueError: If 'use_cudnn' is not a bool value.
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-          pool2d = fluid.layers.pool2d(
-                            input=data,
-                            pool_size=2,
-                            pool_type='max',
-                            pool_stride=1,
-                            global_pooling=False)
-    """
-    if pool_type not in ["max", "avg"]:
-        raise ValueError(
-            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
-            str(pool_type))
-
-    if global_pooling is False and pool_size == -1:
-        raise ValueError(
-            "When the global_pooling is False, pool_size must be passed "
-            "and be a valid value. Received pool_size: " + str(pool_size))
-
-    pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
-    pool_padding = utils.convert_to_list(pool_padding, 2, 'pool_padding')
-    pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
-
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("use_cudnn should be True or False")
-
-    l_type = 'pool2d'
-
-    helper = LayerHelper(l_type, **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type=l_type,
-        inputs={"X": input},
-        outputs={"Out": pool_out},
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "global_pooling": global_pooling,
-            "strides": pool_stride,
-            "paddings": pool_padding,
-            "use_cudnn": use_cudnn,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": exclusive,
-        })
-
-    return pool_out
-
-
-@templatedoc()
-def pool3d(input,
-           pool_size=-1,
-           pool_type="max",
-           pool_stride=1,
-           pool_padding=0,
-           global_pooling=False,
-           use_cudnn=True,
-           ceil_mode=False,
-           name=None,
-           exclusive=True):
-    """
-    ${comment}
-
-    Args:
-        input (Variable): The input tensor of pooling operator. The format of
-                          input tensor is NCDHW, where N is batch size, C is
-                          the number of channels, D is the depth of the feature,
-                          H is the height of the feature, and W is the width
-                          of the feature.
-        pool_size (int|list|tuple): The pool kernel size. If pool kernel size 
-            is a tuple or list, it must contain three integers, 
-            (pool_size_Depth, pool_size_Height, pool_size_Width).
-            Otherwise, the pool kernel size will be the cube of an int.
-        pool_type (string): ${pooling_type_comment}
-        pool_stride (int): stride of the pooling layer.
-        pool_padding (int): padding size.
-        global_pooling (bool): ${global_pooling_comment}
-        use_cudnn (bool): ${use_cudnn_comment}
-        ceil_mode (bool): ${ceil_mode_comment}
-        name (str): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-        exclusive (bool): Whether to exclude padding points in average pooling
-                          mode, default is true
-
-    Returns:
-        Variable: output of pool3d layer.
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32, 32], dtype='float32')
-          pool3d = fluid.layers.pool3d(
-                            input=data,
-                            pool_size=2,
-                            pool_type='max',
-                            pool_stride=1,
-                            global_pooling=False)
-    """
-    if pool_type not in ["max", "avg"]:
-        raise ValueError(
-            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
-            str(pool_type))
-
-    if global_pooling is False and pool_size == -1:
-        raise ValueError(
-            "When the global_pooling is False, pool_size must be passed "
-            "and be a valid value. Received pool_size: " + str(pool_size))
-
-    pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
-    pool_padding = utils.convert_to_list(pool_padding, 3, 'pool_padding')
-    pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride')
-
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("use_cudnn should be True or False")
-
-    l_type = "pool3d"
-    helper = LayerHelper(l_type, **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type=l_type,
-        inputs={"X": input},
-        outputs={"Out": pool_out},
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "global_pooling": global_pooling,
-            "strides": pool_stride,
-            "paddings": pool_padding,
-            "use_cudnn": use_cudnn,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": exclusive,
-        })
-
-    return pool_out
-
-
-@templatedoc(op_type="pool2d")
-def adaptive_pool2d(input,
-                    pool_size,
-                    pool_type="max",
-                    require_index=False,
-                    name=None):
-    """
-    **Adaptive Pool2d Operator**
-    The adaptive_pool2d operation calculates the output based on the input, pool_size,
-    pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
-    size, C is the number of channels, H is the height of the feature, and W is
-    the width of the feature. Parameters(pool_size) should contain two elements which
-    represent height and width, respectively. Also the H and W dimensions of output(Out)
-    is same as Parameter(pool_size).
-
-    For average adaptive pool2d:
-
-    ..  math::
-
-       hstart &= floor(i * H_{in} / H_{out})
-
-       hend &= ceil((i + 1) * H_{in} / H_{out})
-
-       wstart &= floor(j * W_{in} / W_{out})
-
-       wend &= ceil((j + 1) * W_{in} / W_{out})
-
-       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
-
-    Args:
-        input (Variable): The input tensor of pooling operator. The format of
-                          input tensor is NCHW, where N is batch size, C is
-                          the number of channels, H is the height of the
-                          feature, and W is the width of the feature.
-        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two integers, (pool_size_Height, pool_size_Width).
-        pool_type: ${pooling_type_comment}
-        require_index (bool): If true, the index of max pooling point will be returned along
-            with outputs. It cannot be set in average pooling type.
-        name (str|None): A name for this layer(optional). If set None, the
-                        layer will be named automatically.
-
-    Returns:
-        Variable: The pooling result.
-
-    Raises:
-        ValueError: 'pool_type' is not 'max' nor 'avg'.
-        ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
-        ValueError: 'pool_size' should be a list or tuple with length as 2.
-
-    Examples:
-        .. code-block:: python
-
-          # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n],
-          # output shape is [N, C, m, n], adaptive pool divide H and W dimentions
-          # of input data into m * n grids averagely and performs poolings in each
-          # grid to get output.
-          # adaptive average pool performs calculations as follow:
-          #
-          #     for i in range(m):
-          #         for j in range(n):
-          #             hstart = floor(i * H / m)
-          #             hend = ceil((i + 1) * H / m)
-          #             wstart = floor(i * W / n)
-          #             wend = ceil((i + 1) * W / n)
-          #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
-          #
-          import paddle.fluid as fluid
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-          pool_out = fluid.layers.adaptive_pool2d(
-                            input=data,
-                            pool_size=[3, 3],
-                            pool_type='avg')
-    """
-    if pool_type not in ["max", "avg"]:
-        raise ValueError(
-            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
-            str(pool_type))
-
-    if pool_type == "avg" and require_index:
-        raise ValueError(
-            "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
-
-    pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
-
-    if pool_type == "max":
-        l_type = 'max_pool2d_with_index'
-    else:
-        l_type = "pool2d"
-
-    helper = LayerHelper(l_type, **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_variable_for_type_inference(dtype)
-
-    outputs = {"Out": pool_out}
-    if pool_type == "max":
-        mask = helper.create_variable_for_type_inference(dtype)
-        outputs["Mask"] = mask
-
-    helper.append_op(
-        type=l_type,
-        inputs={"X": input},
-        outputs=outputs,
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "adaptive": True,
-        })
-
-    return (pool_out, mask) if require_index else pool_out
-
-
-@templatedoc(op_type="pool3d")
-def adaptive_pool3d(input,
-                    pool_size,
-                    pool_type="max",
-                    require_index=False,
-                    name=None):
-    """
-    **Adaptive Pool3d Operator**
-    The adaptive_pool3d operation calculates the output based on the input, pool_size,
-    pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
-    size, C is the number of channels, D is the depth of the feature, H is the height of
-    the feature, and W is the width of the feature. Parameters(pool_size) should contain
-    three elements which represent height and width, respectively. Also the D, H and W
-    dimensions of output(Out) is same as Parameter(pool_size).
-
-    For average adaptive pool3d:
-
-    ..  math::
-
-      dstart &= floor(i * D_{in} / D_{out})
-
-      dend &= ceil((i + 1) * D_{in} / D_{out})
-
-      hstart &= floor(j * H_{in} / H_{out})
-
-      hend &= ceil((j + 1) * H_{in} / H_{out})
-
-      wstart &= floor(k * W_{in} / W_{out})
-
-      wend &= ceil((k + 1) * W_{in} / W_{out})
-
-      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
-
-    Args:
-        input (Variable): The input tensor of pooling operator. The format of
-                          input tensor is NCDHW, where N is batch size, C is
-                          the number of channels, D is the depth of the feature,
-                          H is the height of the feature, and W is the width of the feature.
-        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain three integers, (Depth, Height, Width).
-        pool_type: ${pooling_type_comment}
-        require_index (bool): If true, the index of max pooling point will be returned along
-            with outputs. It cannot be set in average pooling type.
-        name (str|None): A name for this layer(optional). If set None, the
-                        layer will be named automatically.
-
-    Returns:
-        Variable: The pooling result.
-
-    Raises:
-        ValueError: 'pool_type' is not 'max' nor 'avg'.
-        ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
-        ValueError: 'pool_size' should be a list or tuple with length as 2.
-
-    Examples:
-        .. code-block:: python
-
-          # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n],
-          # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions
-          # of input data into l * m * n grids averagely and performs poolings in each
-          # grid to get output.
-          # adaptive average pool performs calculations as follow:
-          #
-          #     for i in range(l):
-          #         for j in range(m):
-          #             for k in range(n):
-          #                 dstart = floor(i * D / l)
-          #                 dend = ceil((i + 1) * D / l)
-          #                 hstart = floor(j * H / m)
-          #                 hend = ceil((j + 1) * H / m)
-          #                 wstart = floor(k * W / n)
-          #                 wend = ceil((k + 1) * W / n)
-          #                 output[:, :, i, j, k] =
-          #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
-          #
-
-          import paddle.fluid as fluid
-
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32, 32], dtype='float32')
-          pool_out = fluid.layers.adaptive_pool3d(
-                            input=data,
-                            pool_size=[3, 3, 3],
-                            pool_type='avg')
-    """
-    if pool_type not in ["max", "avg"]:
-        raise ValueError(
-            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
-            str(pool_type))
-
-    if pool_type == "avg" and require_index:
-        raise ValueError(
-            "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
-
-    pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
-
-    if pool_type == "max":
-        l_type = 'max_pool3d_with_index'
-    else:
-        l_type = "pool3d"
-
-    helper = LayerHelper(l_type, **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_variable_for_type_inference(dtype)
-
-    outputs = {"Out": pool_out}
-    if pool_type == "max":
-        mask = helper.create_variable_for_type_inference(dtype)
-        outputs["Mask"] = mask
-
-    helper.append_op(
-        type=l_type,
-        inputs={"X": input},
-        outputs=outputs,
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "adaptive": True,
-        })
-
-    return (pool_out, mask) if require_index else pool_out
-
-
-def batch_norm(input,
-               act=None,
-               is_test=False,
-               momentum=0.9,
-               epsilon=1e-05,
-               param_attr=None,
-               bias_attr=None,
-               data_layout='NCHW',
-               in_place=False,
-               name=None,
-               moving_mean_name=None,
-               moving_variance_name=None,
-               do_model_average_for_mean_and_var=False,
-               fuse_with_relu=False,
-               use_global_stats=False):
-    """
-    **Batch Normalization Layer**
-
-    Can be used as a normalizer function for conv2d and fully_connected operations.
-    The required data format for this layer is one of the following:
-
-    1. NHWC `[batch, in_height, in_width, in_channels]`
-
-    2. NCHW `[batch, in_channels, in_height, in_width]`
-
-    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
-    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
-    for more details.
-
-    :math:`input` is the input features over a mini-batch.
-
-    ..  math::
-
-        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-        \ mini-batch\ mean \\\\
-        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-
-        moving\_mean = moving\_mean * momentum + mini-batch\_mean * (1. - momentum)
-        moving\_var = moving\_var * momentum + mini-batch\_var * (1. - momentum)
-        moving_mean and moving_var is global mean and global variance.
-
-
-    When use_global_stats = True, the :math:`\\mu_{\\beta}`
-    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
-    They are global (or running) statistics. (It usually got from the
-    pre-trained model.)
-    The training and testing (or inference) have the same behavior:
-
-    ..  math::
-
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\epsilon}}  \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta
-
-    Note:
-        if build_strategy.sync_batch_norm=True, the batch_norm in network will use 
-        sync_batch_norm automatically.
-
-    Args:
-        input(variable): The rank of input variable can be 2, 3, 4, 5.
-        act(string, Default None): Activation type, linear|relu|prelu|...
-        is_test (bool, Default False): A flag indicating whether it is in
-            test phrase or not.
-        momentum(float, Default 0.9): The value used for the moving_mean and
-            moving_var computation. The updated formula is:
-            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
-            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
-            Default is 0.9.
-        epsilon(float, Default 1e-05): A value added to the denominator for
-            numerical stability. Default is 1e-5.
-        param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
-             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
-	     will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
-	     If the Initializer of the param_attr is not set, the parameter is initialized 
-	     with Xavier. Default: None.
-        bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm.
-             If it is set to None or one attribute of ParamAttr, batch_norm
-	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
-	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
-	     Default: None.
-        data_layout(string, default NCHW): NCHW|NHWC
-        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
-        name(string, Default None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-        moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. If it 
-            is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm 
-            will save global mean with the string.
-        moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
-            If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm 
-            will save global variance with the string.
-        do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
-        fuse_with_relu (bool): if True, this OP performs relu after batch norm.
-        use_global_stats(bool, Default False): Whether to use global mean and
-            variance. In inference or test mode, set use_global_stats to true
-            or is_test to true, and the behavior is equivalent.
-            In train mode, when setting use_global_stats True, the global mean
-            and variance are also used during train period.
-
-    Returns:
-        Variable: A tensor variable which is the result after applying batch normalization on the input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[3, 7, 3, 7], dtype='float32', append_batch_size=False)
-            hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
-            hidden2 = fluid.layers.batch_norm(input=hidden1)
-    """
-    assert bias_attr is not False, "bias_attr should not be False in batch_norm."
-    helper = LayerHelper('batch_norm', **locals())
-    dtype = helper.input_dtype()
-
-    # use fp32 for bn parameter
-    if dtype == core.VarDesc.VarType.FP16:
-        dtype = core.VarDesc.VarType.FP32
-
-    input_shape = input.shape
-    if data_layout == 'NCHW':
-        channel_num = input_shape[1]
-    else:
-        if data_layout == 'NHWC':
-            channel_num = input_shape[-1]
-        else:
-            raise ValueError("unsupported data layout:" + data_layout)
-
-    param_shape = [channel_num]
-
-    # create parameter
-    scale = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        default_initializer=Constant(1.0))
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
-
-    mean = helper.create_parameter(
-        attr=ParamAttr(
-            name=moving_mean_name,
-            initializer=Constant(0.0),
-            trainable=False,
-            do_model_average=do_model_average_for_mean_and_var),
-        shape=param_shape,
-        dtype=dtype)
-    mean.stop_gradient = True
-
-    variance = helper.create_parameter(
-        attr=ParamAttr(
-            name=moving_variance_name,
-            initializer=Constant(1.0),
-            trainable=False,
-            do_model_average=do_model_average_for_mean_and_var),
-        shape=param_shape,
-        dtype=dtype)
-    variance.stop_gradient = True
-
-    # create output
-    # mean and mean_out share the same memory
-    mean_out = mean
-    # variance and variance out share the same memory
-    variance_out = variance
-    saved_mean = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-    saved_variance = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-
-    batch_norm_out = input if in_place else helper.create_variable_for_type_inference(
-        dtype)
-
-    helper.append_op(
-        type="batch_norm",
-        inputs={
-            "X": input,
-            "Scale": scale,
-            "Bias": bias,
-            "Mean": mean,
-            "Variance": variance
-        },
-        outputs={
-            "Y": batch_norm_out,
-            "MeanOut": mean_out,
-            "VarianceOut": variance_out,
-            "SavedMean": saved_mean,
-            "SavedVariance": saved_variance
-        },
-        attrs={
-            "momentum": momentum,
-            "epsilon": epsilon,
-            "is_test": is_test,
-            "data_layout": data_layout,
-            "use_mkldnn": False,
-            "fuse_with_relu": fuse_with_relu,
-            "use_global_stats": use_global_stats
-        })
-
-    return helper.append_activation(batch_norm_out)
-
-
-def instance_norm(input,
-                  epsilon=1e-05,
-                  param_attr=None,
-                  bias_attr=None,
-                  name=None):
-    """
-    **Instance Normalization Layer**
-
-    Can be used as a normalizer function for conv2d and fully_connected operations.
-    The required data format for this layer is one of the following:
-
-    DataLayout: NCHW `[batch, in_channels, in_height, in_width]`
-
-    Refer to `Instance Normalization: The Missing Ingredient for 
-    Fast Stylization <https://arxiv.org/pdf/1607.08022.pdf>`_
-    for more details.
-
-    :math:`input` is the input features over a mini-batch.
-
-    ..  math::
-
-        \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
-        \\ mean of one  feature map in mini-batch \\\\
-        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
-        \\mu_{\\beta})^2 \\qquad &//\ variance of one feature map in mini-batch \\\\
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\epsilon}}  \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta
-
-    Args:
-        input(variable): The rank of input variable can be 2, 3, 4, 5.
-        epsilon(float, Default 1e-05): A value added to the denominator for
-            numerical stability. Default is 1e-5.
-        param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
-             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
-	     will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
-	     If the Initializer of the param_attr is not set, the parameter is initialized 
-	     with Xavier. Default: None.
-        bias_attr(ParamAttr|None): The parameter attribute for the bias of instance_norm.
-             If it is set to None or one attribute of ParamAttr, instance_norm
-	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
-	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
-	     Default: None.
-        name(string, Default None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-
-    Returns:
-        Variable: A tensor variable which is the result after applying instance normalization on the input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[3, 7, 3, 7], dtype='float32', append_batch_size=False)
-            hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
-            hidden2 = fluid.layers.instance_norm(input=hidden1)
-    """
-    assert bias_attr is not False, "bias_attr should not be False in instance_norm."
-    helper = LayerHelper('instance_norm', **locals())
-    dtype = helper.input_dtype()
-
-    # use fp32 for in parameter
-    if dtype == core.VarDesc.VarType.FP16:
-        dtype = core.VarDesc.VarType.FP32
-
-    input_shape = input.shape
-    channel_num = input_shape[1]
-
-    param_shape = [channel_num]
-
-    # create parameter
-    scale = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        default_initializer=Constant(1.0))
-    bias = helper.create_parameter(
-        attr=helper.bias_attr,
-        shape=param_shape,
-        dtype=dtype,
-        is_bias=True,
-        default_initializer=Constant(0.0))
-
-    # create output
-    saved_mean = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-    saved_variance = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-
-    instance_norm_out = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type="instance_norm",
-        inputs={
-            "X": input,
-            "Scale": scale,
-            "Bias": bias,
-        },
-        outputs={
-            "Y": instance_norm_out,
-            "SavedMean": saved_mean,
-            "SavedVariance": saved_variance
-        },
-        attrs={"epsilon": epsilon, })
-
-    return instance_norm_out
-
-
-def data_norm(input,
-              act=None,
-              epsilon=1e-05,
-              param_attr=None,
-              data_layout='NCHW',
-              in_place=False,
-              name=None,
-              moving_mean_name=None,
-              moving_variance_name=None,
-              do_model_average_for_mean_and_var=False):
-    """
-    **Data Normalization Layer**
-
-    Can be used as a normalizer function for conv2d and fully_connected operations.
-    The required data format for this layer is one of the following:
-
-    1. NHWC `[batch, in_height, in_width, in_channels]`
-
-    2. NCHW `[batch, in_channels, in_height, in_width]`
-
-    :math:`input` is the input features over a mini-batch.
-
-    ..  math::
-
-        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-        \ mini-batch\ mean \\\\
-        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-
-    Args:
-        input(variable): The input variable which is a LoDTensor.
-        act(string, Default None): Activation type, linear|relu|prelu|...
-        epsilon(float, Default 1e-05):
-        param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
-        data_layout(string, default NCHW): NCHW|NHWC
-        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
-        name(string, Default None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-        moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
-        moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
-        do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
-
-    Returns:
-        Variable: A tensor variable which is the result after applying data normalization on the input.
-
-    Examples:
-
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-
-            hidden1 = fluid.layers.data(name="hidden1", shape=[200])
-            hidden2 = fluid.layers.data_norm(name="hidden2", input=hidden1)
-    """
-    helper = LayerHelper('data_norm', **locals())
-    dtype = helper.input_dtype()
-
-    input_shape = input.shape
-    if data_layout == 'NCHW':
-        channel_num = input_shape[1]
-    else:
-        if data_layout == 'NHWC':
-            channel_num = input_shape[-1]
-        else:
-            raise ValueError("unsupported data layout:" + data_layout)
-
-    param_shape = [channel_num]
-
-    batch_size_default = 1e4
-    batch_sum_default = 0.0
-    batch_square_sum_default = 1e4
-
-    if param_attr and isinstance(param_attr, dict):
-        batch_size_default = param_attr.get("batch_size", 1e4)
-        batch_sum_default = param_attr.get("batch_sum", 0.0)
-        batch_square_sum_default = param_attr.get("batch_square", 1e4)
-
-    # create parameter
-    batch_size = helper.create_parameter(
-        attr=ParamAttr(
-            name=name + '.batch_size',
-            initializer=Constant(value=float(batch_size_default)),
-            trainable=True),
-        shape=param_shape,
-        dtype=input.dtype)
-
-    batch_sum = helper.create_parameter(
-        attr=ParamAttr(
-            name=name + '.batch_sum',
-            initializer=Constant(value=float(batch_sum_default)),
-            trainable=True),
-        shape=param_shape,
-        dtype=input.dtype)
-
-    batch_square_sum = helper.create_parameter(
-        attr=ParamAttr(
-            name=name + '.batch_square_sum',
-            initializer=Constant(value=float(batch_square_sum_default)),
-            trainable=True),
-        shape=param_shape,
-        dtype=input.dtype)
-
-    means = helper.create_variable(dtype=dtype, stop_gradient=True)
-    scales = helper.create_variable(dtype=dtype, stop_gradient=True)
-
-    data_norm_out = input if in_place else helper.create_variable(dtype=dtype)
-
-    helper.append_op(
-        type="data_norm",
-        inputs={
-            "X": input,
-            "BatchSize": batch_size,
-            "BatchSum": batch_sum,
-            "BatchSquareSum": batch_square_sum
-        },
-        outputs={"Y": data_norm_out,
-                 "Means": means,
-                 "Scales": scales},
-        attrs={"epsilon": epsilon})
-
-    return helper.append_activation(data_norm_out)
-
-
-@templatedoc()
-def layer_norm(input,
-               scale=True,
-               shift=True,
-               begin_norm_axis=1,
-               epsilon=1e-05,
-               param_attr=None,
-               bias_attr=None,
-               act=None,
-               name=None):
-    """
-    ${comment}
-
-    The formula is as follows:
-
-    ..  math::
-
-        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
-
-        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
-
-        h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
-
-    * :math:`a`: the vector representation of the summed inputs to the neurons
-    in that layer.
-
-    * :math:`H`: the number of hidden units in a layers
-
-    * :math:`g`: the trainable scale parameter.
-
-    * :math:`b`: the trainable bias parameter.
-
-    Args:
-        input(Variable): The input tensor variable.
-        scale(bool): Whether to learn the adaptive gain :math:`g` after
-            normalization. Default True.
-        shift(bool): Whether to learn the adaptive bias :math:`b` after
-            normalization. Default True.
-        begin_norm_axis(int): The normalization will be performed along
-            dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
-            Default 1.
-        epsilon(float): The small value added to the variance to prevent
-            division by zero. Default 1e-05.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
-            omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
-            a default :code:`ParamAttr` would be added as scale. The
-            :attr:`param_attr` is initialized as 1 if it is added. Default None.
-        bias_attr(ParamAttr|None): The parameter attribute for the learnable
-            bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
-            omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
-            a default :code:`ParamAttr` would be added as bias. The
-            :attr:`bias_attr` is initialized as 0 if it is added. Default None.
-        act(str): Activation to be applied to the output of layer normalizaiton.
-                  Default None.
-        name(str): The name of this layer. It is optional. Default None, and a
-                   unique name would be generated automatically.
-
-    Returns:
-        ${y_comment}
-
-    Examples:
-
-        >>> import paddle.fluid as fluid
-        >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
-        >>>                          dtype='float32')
-        >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
-    """
-    assert in_dygraph_mode(
-    ) is not True, "please use FC instead of fc in dygraph mode!"
-    helper = LayerHelper('layer_norm', **locals())
-    dtype = helper.input_dtype()
-
-    # create intput and parameters
-    inputs = {'X': input}
-    input_shape = input.shape
-    param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])]
-    if scale:
-        scale = helper.create_parameter(
-            attr=helper.param_attr,
-            shape=param_shape,
-            dtype=dtype,
-            default_initializer=Constant(1.0))
-        inputs['Scale'] = scale
-    if shift:
-        assert bias_attr is not False
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
-        inputs['Bias'] = bias
-
-    # create output
-    mean_out = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-    variance_out = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-    layer_norm_out = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type="layer_norm",
-        inputs=inputs,
-        outputs={
-            "Y": layer_norm_out,
-            "Mean": mean_out,
-            "Variance": variance_out,
-        },
-        attrs={"epsilon": epsilon,
-               "begin_norm_axis": begin_norm_axis})
-
-    return helper.append_activation(layer_norm_out)
-
-
-@templatedoc()
-def group_norm(input,
-               groups,
-               epsilon=1e-05,
-               param_attr=None,
-               bias_attr=None,
-               act=None,
-               data_layout='NCHW',
-               name=None):
-    """
-    **Group Normalization Layer**
-
-    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
-
-    Args:
-        input(Variable): The input tensor variable.
-        groups(int): The number of groups that divided from channels.
-        epsilon(float): The small value added to the variance to prevent
-            division by zero.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            scale :math:`g`. If it is set to False, no scale will be added to the output units.
-            If it is set to None, the bias is initialized one. Default: None.
-        bias_attr(ParamAttr|None): The parameter attribute for the learnable
-            bias :math:`b`. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-        act(str): Activation to be applied to the output of group normalizaiton.
-        data_layout(string, default NCHW): NCHW(num_batch, channels, h, w) or NHWC(num_batch, h, w, channels).
-        name (str): The name of this layer. It is optional.
-
-    Returns:
-        Variable: A tensor variable which is the result after applying group normalization on the input.
-
-    Examples:
-
-        >>> import paddle.fluid as fluid
-        >>> data = fluid.layers.data(name='data', shape=[8, 32, 32],
-        >>>                          dtype='float32')
-        >>> x = fluid.layers.group_norm(input=data, groups=4)
-    """
-    helper = LayerHelper('group_norm', **locals())
-    dtype = helper.input_dtype()
-
-    # create intput and parameters
-    inputs = {'X': input}
-    input_shape = input.shape
-    if data_layout != 'NCHW' and data_layout != 'NHWC':
-        raise ValueError(
-            "Param(data_layout) of Op(fluid.layers.group_norm) got wrong value: received "
-            + data_layout + " but only NCHW or NHWC supported.")
-    channel_num = input_shape[1] if data_layout == 'NCHW' else input_shape[-1]
-    param_shape = [channel_num]
-    if param_attr:
-        scale = helper.create_parameter(
-            attr=helper.param_attr,
-            shape=param_shape,
-            dtype=dtype,
-            default_initializer=Constant(1.0))
-        inputs['Scale'] = scale
-    if bias_attr:
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
-        inputs['Bias'] = bias
-
-    # create output
-    mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
-    variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
-    group_norm_out = helper.create_variable(dtype=dtype)
-
-    helper.append_op(
-        type="group_norm",
-        inputs=inputs,
-        outputs={
-            "Y": group_norm_out,
-            "Mean": mean_out,
-            "Variance": variance_out,
-        },
-        attrs={
-            "epsilon": epsilon,
-            "groups": groups,
-            "data_layout": data_layout
-        })
-
-    return helper.append_activation(group_norm_out)
-
-
-@templatedoc()
-def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None):
-    """
-    **Spectral Normalization Layer**
-
-    This layer calculates the spectral normalization value of weight parameters of
-    fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D
-    Parameters. Calculations are showed as follows.
-
-    Step 1:
-    Generate vector U in shape of [H], and V in shape of [W].
-    While H is the :attr:`dim` th dimension of the input weights,
-    and W is the product result of remaining dimensions.
-
-    Step 2:
-    :attr:`power_iters` shoule be a positive interger, do following
-    calculations with U and V for :attr:`power_iters` rounds.
-
-    .. math:: 
-
-        \mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2}
-
-        \mathbf{u} := \\frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2}
-
-    Step 3:
-    Calculate :math:`\sigma(\mathbf{W})` and normalize weight values.
-
-    .. math::
-
-        \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v}
-
-        \mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})}
-                
-
-    Refer to `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ .
-
-    Args:
-        weight(${weight_type}): ${weight_comment}
-        dim(int): ${dim_comment}
-        power_iters(int): ${power_iters_comment}
-        eps(float): ${eps_comment}
-        name (str): The name of this layer. It is optional.
-
-    Returns:
-        Variable: A tensor variable of weight parameters after spectral normalization.
-
-    Examples:
-       .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            weight = fluid.layers.data(name='weight', shape=[2, 8, 32, 32], 
-                                       append_batch_size=False, dtype='float32')
-            x = fluid.layers.spectral_norm(weight=weight, dim=1, power_iters=2)
-    """
-    helper = LayerHelper('spectral_norm', **locals())
-    dtype = weight.dtype
-
-    # create intput and parameters
-    inputs = {'Weight': weight}
-    input_shape = weight.shape
-    h = input_shape[dim]
-    w = np.prod(input_shape) // h
-
-    u = helper.create_parameter(
-        attr=ParamAttr(),
-        shape=[h],
-        dtype=dtype,
-        default_initializer=Normal(0., 1.))
-    u.stop_gradient = True
-    inputs['U'] = u
-    v = helper.create_parameter(
-        attr=ParamAttr(),
-        shape=[w],
-        dtype=dtype,
-        default_initializer=Normal(0., 1.))
-    inputs['V'] = v
-    v.stop_gradient = True
-
-    # create output
-    out = helper.create_variable(dtype=dtype)
-
-    helper.append_op(
-        type="spectral_norm",
-        inputs=inputs,
-        outputs={"Out": out, },
-        attrs={
-            "dim": dim,
-            "power_iters": power_iters,
-            "eps": eps,
-        })
-
-    return out
-
-
-def conv2d_transpose(input,
-                     num_filters,
-                     output_size=None,
-                     filter_size=None,
-                     padding=0,
-                     stride=1,
-                     dilation=1,
-                     groups=None,
-                     param_attr=None,
-                     bias_attr=None,
-                     use_cudnn=True,
-                     act=None,
-                     name=None):
-    """
-    **Convlution2D transpose layer**
-
-    The convolution2D transpose layer calculates the output based on the input,
-    filter, and dilations, strides, paddings. Input(Input) and output(Output)
-    are in NCHW format. Where N is batch size, C is the number of channels,
-    H is the height of the feature, and W is the width of the feature.
-    Parameters(dilations, strides, paddings) are two elements. These two elements
-    represent height and width, respectively. The details of convolution transpose
-    layer, please refer to the following explanation and references
-    `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
-    If bias attribution and activation type are provided, bias is added to
-    the output of the convolution, and the corresponding activation function
-    is applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    Where:
-
-    * :math:`X`: Input value, a tensor with NCHW format.
-    * :math:`W`: Filter value, a tensor with MCHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
-           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ] \\\\
-           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ] 
-
-    Note:
-          if output_size is None, :math:`H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`; 
-          else, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` 
-          and :math:`H^\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must 
-          between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[1]`, 
-          conv2d_transpose can compute the kernel size automatically.
-
-    Args:
-        input(Variable): The input image with [N, C, H, W] format.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        output_size(int|tuple|None): The output image size. If output size is a
-            tuple, it must contain two integers, (image_height, image_width). None if use
-            filter_size, padding, and stride to calculate output_size.
-            if output_size and filter_size are specified at the same time, They
-            should follow the formula above.
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_height, filter_size_width).
-            Otherwise, filter_size_height = filter_size_width = filter_size. None if 
-            use output size to calculate filter_size.
-        padding(int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_height, padding_width). Otherwise, 
-            padding_height = padding_width = padding. Default: padding = 0.
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_height, stride_width). Otherwise,
-            stride_height = stride_width = stride. Default: stride = 1.
-        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_height, dilation_width). Otherwise, 
-            dilation_height = dilation_width = dilation. Default: dilation = 1.
-        groups(int): The groups number of the Conv2d transpose layer. Inspired by
-            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-            when group=2, the first half of the filters is only connected to the
-            first half of the input channels, while the second half of the
-            filters is only connected to the second half of the input channels.
-            Default: groups = 1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d_transpose
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: True.
-
-    Returns:
-        Variable: The tensor variable storing the convolution transpose result.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-       .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
-    """
-    assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
-    input_channel = input.shape[1]
-
-    op_type = 'conv2d_transpose'
-    if (input_channel == groups and num_filters == input_channel and
-            not use_cudnn):
-        op_type = 'depthwise_conv2d_transpose'
-
-    helper = LayerHelper(op_type, **locals())
-    if not isinstance(input, Variable):
-        raise TypeError("Input of conv2d_transpose must be Variable")
-
-    padding = utils.convert_to_list(padding, 2, 'padding')
-    stride = utils.convert_to_list(stride, 2, 'stride')
-    dilation = utils.convert_to_list(dilation, 2, 'dilation')
-
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("use_cudnn should be True or False")
-
-    if filter_size is None:
-        if output_size is None:
-            raise ValueError("output_size must be set when filter_size is None")
-        if isinstance(output_size, int):
-            output_size = [output_size, output_size]
-
-        h_in = input.shape[2]
-        w_in = input.shape[3]
-
-        filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 *
-                         padding[0] - 1) // dilation[0] + 1
-        filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 *
-                         padding[1] - 1) // dilation[1] + 1
-        filter_size = [filter_size_h, filter_size_w]
-    else:
-        filter_size = utils.convert_to_list(filter_size, 2,
-                                            'conv2d_transpose.filter_size')
-
-    if output_size is None:
-        output_size = []
-    elif isinstance(output_size, list) or isinstance(output_size, int):
-        output_size = utils.convert_to_list(output_size, 2, 'output_size')
-    else:
-        raise ValueError("output_size should be list or int")
-    padding = utils.convert_to_list(padding, 2, 'padding')
-    groups = 1 if groups is None else groups
-    filter_shape = [input_channel, num_filters // groups] + filter_size
-
-    img_filter = helper.create_parameter(
-        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
-
-    pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type=op_type,
-        inputs={'Input': [input],
-                'Filter': [img_filter]},
-        outputs={'Output': pre_bias},
-        attrs={
-            'output_size': output_size,
-            'strides': stride,
-            'paddings': padding,
-            'dilations': dilation,
-            'groups': groups,
-            'use_cudnn': use_cudnn
-        })
-
-    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
-    out = helper.append_activation(pre_act)
-    return out
-
-
-def conv3d_transpose(input,
-                     num_filters,
-                     output_size=None,
-                     filter_size=None,
-                     padding=0,
-                     stride=1,
-                     dilation=1,
-                     groups=None,
-                     param_attr=None,
-                     bias_attr=None,
-                     use_cudnn=True,
-                     act=None,
-                     name=None):
-    """
-    **Convlution3D transpose layer**
-
-    The convolution3D transpose layer calculates the output based on the input,
-    filter, and dilations, strides, paddings. Input(Input) and output(Output)
-    are in NCDHW format. Where N is batch size, C is the number of channels,
-    D is the depth of the feature, H is the height of the feature, and W
-    is the width of the feature. Parameters(dilations, strides, paddings) are
-    two elements. These two elements represent height and width, respectively.
-    The details of convolution transpose layer, please refer to the following
-    explanation and references `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
-    If bias attribution and activation type are provided, bias is added to
-    the output of the convolution, and the corresponding activation function
-    is applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma (W \\ast X + b)
-
-    In the above equation:
-
-    * :math:`X`: Input value, a tensor with NCDHW format.
-    * :math:`W`: Filter value, a tensor with MCDHW format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-           D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
-           H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
-           W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
-
-    Args:
-        input(Variable): The input image with [N, C, D, H, W] format.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        output_size(int|tuple|None): The output image size. If output size is a
-            tuple, it must contain three integers, (image_D, image_H, image_W). This
-            parameter only works when filter_size is None.
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_depth, filter_size_height, \
-            filter_size_width). Otherwise, filter_size_depth = filter_size_height = \
-            filter_size_width = filter_size. None if use output size to
-            calculate filter_size.
-        padding(int|tuple): The padding size. If padding is a tuple, it must
-            contain three integers, (padding_depth, padding_height, padding_width). Otherwise,
-            padding_depth = padding_height = padding_width = padding. Default: padding = 0.
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain three integers, (stride_depth, stride_height, stride_width). Otherwise,
-            stride_depth = stride_height = stride_width = stride. Default: stride = 1.
-        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-            contain three integers, (dilation_depth, dilation_height, dilation_width). Otherwise,
-            dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1.
-        groups(int): The groups number of the Conv3d transpose layer. Inspired by
-            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-            when group=2, the first half of the filters is only connected to the
-            first half of the input channels, while the second half of the
-            filters is only connected to the second half of the input channels.
-            Default: groups=1
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d_transpose.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv3d_transpose
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-
-    Returns:
-        Variable: The tensor variable storing the convolution transpose result.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-       .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
-          conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3)
-    """
-    assert param_attr is not False, "param_attr should not be False in conv3d_transpose."
-    l_type = "conv3d_transpose"
-    helper = LayerHelper(l_type, **locals())
-    if not isinstance(input, Variable):
-        raise TypeError("Input of conv3d_transpose must be Variable")
-    input_channel = input.shape[1]
-
-    padding = utils.convert_to_list(padding, 3, 'padding')
-    stride = utils.convert_to_list(stride, 3, 'stride')
-    dilation = utils.convert_to_list(dilation, 3, 'dilation')
-
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("use_cudnn should be True or False")
-
-    if filter_size is None:
-        if output_size is None:
-            raise ValueError("output_size must be set when filter_size is None")
-        if isinstance(output_size, int):
-            output_size = [output_size, output_size]
-
-        d_in = input.shape[2]
-        h_in = input.shape[3]
-        w_in = input.shape[4]
-
-        filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + 2 *
-                         padding[0] - 1) // dilation[0] + 1
-        filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + 2 *
-                         padding[1] - 1) // dilation[1] + 1
-        filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + 2 *
-                         padding[2] - 1) // dilation[2] + 1
-        filter_size = [filter_size_d, filter_size_h, filter_size_w]
-    else:
-        filter_size = utils.convert_to_list(filter_size, 3,
-                                            'conv3d_transpose.filter_size')
-
-    groups = 1 if groups is None else groups
-    filter_shape = [input_channel, num_filters // groups] + filter_size
-    img_filter = helper.create_parameter(
-        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
-
-    pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type=l_type,
-        inputs={'Input': [input],
-                'Filter': [img_filter]},
-        outputs={'Output': pre_bias},
-        attrs={
-            'strides': stride,
-            'paddings': padding,
-            'dilations': dilation,
-            'groups': groups,
-            'use_cudnn': use_cudnn
-        })
-
-    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
-    out = helper.append_activation(pre_act)
-    return out
-
-
-def sequence_expand(x, y, ref_level=-1, name=None):
-    """Sequence Expand Layer. This layer will expand the input variable **x**
-    according to specified level lod of **y**. Please note that lod level of
-    **x** is at most 1 and rank of **x** is at least 2. When rank of **x**
-    is greater than 2, then it would be viewed as a 2-D tensor.
-    Following examples will explain how sequence_expand works:
-
-    .. code-block:: text
-
-        * Case 1
-            x is a LoDTensor:
-                x.lod  = [[2,        2]]
-                x.data = [[a], [b], [c], [d]]
-                x.dims = [4, 1]
-
-            y is a LoDTensor:
-                y.lod = [[2,    2],
-                         [3, 3, 1, 1]]
-
-            ref_level: 0
-
-            then output is a 1-level LoDTensor:
-                out.lod =  [[2,        2,        2,        2]]
-                out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
-                out.dims = [8, 1]
-
-        * Case 2
-            x is a Tensor:
-                x.data = [[a], [b], [c]]
-                x.dims = [3, 1]
-
-            y is a LoDTensor:
-                y.lod = [[2, 0, 3]]
-
-            ref_level: -1
-
-            then output is a Tensor:
-                out.data = [[a], [a], [c], [c], [c]]
-                out.dims = [5, 1]
-    Args:
-        x (Variable): The input variable which is a Tensor or LoDTensor.
-        y (Variable): The input variable which is a LoDTensor.
-        ref_level (int): Lod level of `y` to be referred by `x`. If set to -1,
-                         refer the last level of lod.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The expanded variable which is a LoDTensor.
-
-    Examples:
-        .. code-block:: python
-	
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[10, 20],
-                             dtype='float32', lod_level=1)
-            out = layers.sequence_expand(x=x, y=y, ref_level=0)
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_expand', input=x, **locals())
-    dtype = helper.input_dtype()
-    tmp = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='sequence_expand',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': tmp},
-        attrs={'ref_level': ref_level})
-    return tmp
-
-
-def sequence_expand_as(x, y, name=None):
-    """Sequence Expand As Layer. This layer will expand the input variable **x**
-    according to the zeroth level lod of **y**. Current implementation requires
-    the level number of Input(Y)'s lod must be 1, and the first dimension of
-    Input(X) should be equal to the size of Input(Y)'s zeroth level lod, and
-    lod of Input(X) is not considered.
-
-    Following examples will explain how sequence_expand_as works:
-
-    .. code-block:: text
-
-        * Case 1:
-
-            Given a 1-level LoDTensor input(X)
-                X.data = [[a], [b], [c], [d]]
-                X.dims = [4, 1]
-            and input(Y)
-                Y.lod = [[0, 3, 6, 7, 8]]
-            ref_level: 0
-            then we get 1-level LoDTensor
-                Out.lod =  [[0,            3,              6,  7,  8]]
-                Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
-                Out.dims = [8, 1]
-
-        * Case 2:
-
-            Given a common Tensor input(X)
-                X.data = [[a, b], [c, d], [e, f]]
-                X.dims = [3, 2]
-            and input(Y)
-                Y.lod = [[0, 2, 3, 6]]
-            ref_level: 0
-            then we get a common LoDTensor
-                Out.lod =  [[0,             2,     3,                    6]]
-                Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]]
-                Out.dims = [6, 2]
-
-    Args:
-        x (Variable): The input variable which is a Tensor or LoDTensor.
-        y (Variable): The input variable which is a LoDTensor.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The expanded variable which is a LoDTensor.
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-
-            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[10, 20],
-                             dtype='float32', lod_level=1)
-            out = layers.sequence_expand_as(x=x, y=y)
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_expand_as', input=x, **locals())
-    dtype = helper.input_dtype()
-    tmp = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='sequence_expand_as',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': tmp})
-    return tmp
-
-
-@templatedoc()
-def sequence_pad(x, pad_value, maxlen=None, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(Variable): Input variable which should contain lod information.
-        pad_value(Variable): The Variable that holds values that will be fill
-            into padded steps. It can be a scalar or a tensor whose shape
-            equals to time steps in sequences. If it's a scalar, it will be
-            automatically broadcasted to the shape of time step.
-        maxlen(int, default None): The length of padded sequences. It can be
-            None or any positive int. When it is None, all sequences will be
-            padded up to the length of the longest one among them; when it a
-            certain positive value, it must be greater than the length of the
-            longest original sequence.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-
-    Returns:
-        Variable: The padded sequence batch and the original lengths before
-                  padding. All sequences has the same length.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy
-
-            x = fluid.layers.data(name='x', shape=[10, 5],
-                             dtype='float32', lod_level=1)
-            pad_value = fluid.layers.assign(
-                input=numpy.array([0.0], dtype=numpy.float32))
-            out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
-    """
-
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_pad', input=x, **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    length = helper.create_variable_for_type_inference(dtype)
-
-    pad_value.stop_gradient = True
-    length.stop_gradient = True
-
-    if maxlen is None:
-        maxlen = -1
-    helper.append_op(
-        type='sequence_pad',
-        inputs={'X': x,
-                'PadValue': pad_value},
-        outputs={'Out': out,
-                 'Length': length},
-        attrs={'padded_length': maxlen})
-    return out, length
-
-
-def sequence_unpad(x, length, name=None):
-    """
-    **Sequence Unpad Layer**
-
-    This layer removes the padding data in the input sequences and convert
-    them into sequences with actual length as output, identitied by lod
-    information.
-
-    .. code-block:: text
-
-	Example:
-
-	Given input Variable **x**:
-	    x.data = [[ 1.0,  2.0,  3.0,  4.0,  5.0],
-		      [ 6.0,  7.0,  8.0,  9.0, 10.0],
-		      [11.0, 12.0, 13.0, 14.0, 15.0]],
-
-	in which there are 3 sequences padded to length 5, and the acutal length
-	specified by input Variable **length**:
-
-	    length.data = [2, 3, 4],
-
-	after unpadding, the output Variable will be:
-
-	    out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
-	    out.lod = [[2, 3, 4]]
-
-    Args:
-        x(Variable): Input Variable which contains the padded sequences with
-            equal length.
-        length(Variable): The Variable that specifies the actual ength of
-            sequences after unpadding.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-
-    Returns:
-        Variable: The Variable contains the unpadded sequences.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy
-
-            # pad data
-            x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32', lod_level=1)
-            pad_value = fluid.layers.assign(input=numpy.array([0.0], dtype=numpy.float32))
-            pad_data, len = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
-            
-            # upad data
-            unpad_data = fluid.layers.sequence_unpad(x=pad_data, length=len)
-    """
-
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_unpad', input=x, **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-
-    length.stop_gradient = True
-
-    helper.append_op(
-        type='sequence_unpad',
-        inputs={'X': x,
-                'Length': length},
-        outputs={'Out': out})
-    return out
-
-
-def beam_search(pre_ids,
-                pre_scores,
-                ids,
-                scores,
-                beam_size,
-                end_id,
-                level=0,
-                is_accumulated=True,
-                name=None,
-                return_parent_idx=False):
-    """
-    Beam search is a classical algorithm for selecting candidate words in a
-    machine translation task.
-
-    Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
-    for more details.
-
-    This layer does the search in beams for one time step. Specifically, it
-    selects the top-K candidate word ids of current step from :attr:`ids`
-    according to their :attr:`scores` for all source sentences, where K is
-    :attr:`beam_size` and :attr:`ids, scores` are predicted results from the
-    computation cell. If :attr:`ids` is not set, it will be calculated out
-    according to :attr:`scores`. Additionally, :attr:`pre_ids` and
-    :attr:`pre_scores` are the output of beam_search at previous step, they
-    are needed for special use to handle ended candidate translations.
-
-    Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores`
-    passed in should be accumulated scores. Else, the :attr:`scores` are
-    considered as the straightforward scores and will be transformed to the
-    log field and accumulated the :attr:`pre_scores` in this operator.
-    Length penalty should be done with extra operators before calculating the
-    accumulated scores if needed.
-
-    Please see the following demo for a fully beam search usage example:
-
-        fluid/tests/book/test_machine_translation.py
-
-    Args:
-        pre_ids(Variable): The LodTensor variable which is the output of
-            beam_search at previous step. It should be a LodTensor with shape
-            :math:`(batch_size, 1)` and lod
-            :math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the
-            first step.
-        pre_scores(Variable): The LodTensor variable which is the output of
-            beam_search at previous step.
-        ids(Variable): The LodTensor variable containing the candidates ids.
-            Its shape should be :math:`(batch_size \\times beam_size, K)`,
-            where :math:`K` supposed to be :attr:`beam_size`.
-        scores(Variable): The LodTensor variable containing the accumulated
-            scores corresponding to :attr:`ids` and its shape is the same as
-            the shape of :attr:`ids`.
-        beam_size(int): The beam width used in beam search.
-        end_id(int): The id of end token.
-        level(int, default 0): It can be ignored and mustn't change currently.
-            It means the source level of lod, which is explained as following.
-            The lod level of :attr:`ids` should be 2. The first level is source
-            level which describes how many prefixes (branchs) for each source
-            sentece (beam), and the second level is sentence level which
-            describes how these candidates belong to the prefix. The paths
-            linking prefixes and selected candidates are organized and reserved
-            in lod.
-        is_accumulated(bool, default True): Whether the input :attr:`score` is
-             accumulated scores.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-        return_parent_idx(bool): Whether to return an extra Tensor variable 
-                        preserving the selected_ids' parent indice in pre_ids
-                        in output, which can be used to gather cell states at
-                        the next time step.
-
-    Returns:
-        Variable: The LodTensor tuple containing the selected ids and the \
-            corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \
-            an extra Tensor variable preserving the selected_ids' parent indice \
-            is included.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            # Suppose `probs` contains predicted results from the computation
-            # cell and `pre_ids` and `pre_scores` is the output of beam_search
-            # at previous step.
-            beam_size = 4
-            end_id = 1
-            pre_ids = fluid.layers.data(
-                name='pre_id', shape=[1], lod_level=2, dtype='int64')
-            pre_scores = fluid.layers.data(
-                name='pre_scores', shape=[1], lod_level=2, dtype='float32')
-            probs = fluid.layers.data(
-                name='probs', shape=[10000], dtype='float32')
-            topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size)
-            accu_scores = fluid.layers.elementwise_add(
-                x=fluid.layers.log(x=topk_scores),
-                y=fluid.layers.reshape(pre_scores, shape=[-1]),
-                axis=0)
-            selected_ids, selected_scores = fluid.layers.beam_search(
-                pre_ids=pre_ids,
-                pre_scores=pre_scores,
-                ids=topk_indices,
-                scores=accu_scores,
-                beam_size=beam_size,
-                end_id=end_id)
-    """
-    helper = LayerHelper('beam_search', **locals())
-    score_type = pre_scores.dtype
-    id_type = pre_ids.dtype
-
-    inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores}
-    if ids is not None:
-        inputs["ids"] = ids
-
-    selected_scores = helper.create_variable_for_type_inference(
-        dtype=score_type)
-    selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
-    # parent_idx is a tensor used to gather cell states at the next time
-    # step. Though lod in selected_ids can also be used to gather by
-    # sequence_expand, it is not efficient.
-    # gather_op's index input only supports int32 dtype currently
-    parent_idx = helper.create_variable_for_type_inference(dtype="int32")
-
-    helper.append_op(
-        type='beam_search',
-        inputs=inputs,
-        outputs={
-            'selected_ids': selected_ids,
-            'selected_scores': selected_scores,
-            'parent_idx': parent_idx
-        },
-        attrs={
-            # TODO(ChunweiYan) to assure other value support
-            'level': level,
-            'beam_size': beam_size,
-            'end_id': end_id,
-            'is_accumulated': is_accumulated,
-        })
-    if return_parent_idx:
-        return selected_ids, selected_scores, parent_idx
-    else:
-        return selected_ids, selected_scores
-
-
-def beam_search_decode(ids, scores, beam_size, end_id, name=None):
-    """
-    Beam Search Decode Layer. This layer constructs the full hypotheses for
-    each source sentence by walking back along the LoDTensorArray :attr:`ids`
-    whose lods can be used to restore the path in the beam search tree.
-    Please see the following demo for a fully beam search usage example:
-        fluid/tests/book/test_machine_translation.py
-
-    Args:
-        ids(Variable): The LodTensorArray variable containing the selected ids
-            of all steps.
-        scores(Variable): The LodTensorArray variable containing the selected
-            scores of all steps.
-        beam_size(int): The beam width used in beam search.
-        end_id(int): The id of end token.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The LodTensor pair containing the generated id sequences \
-            and the corresponding scores. The shapes and lods of the two \
-            LodTensor are same. The lod level is 2 and the two levels \
-            separately indicate how many hypotheses each source sentence has \
-            and how many ids each hypothesis has.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            # Suppose `ids` and `scores` are LodTensorArray variables reserving
-            # the selected ids and scores of all steps
-            ids = fluid.layers.create_array(dtype='int64')
-            scores = fluid.layers.create_array(dtype='float32')
-            finished_ids, finished_scores = fluid.layers.beam_search_decode(
-                ids, scores, beam_size=5, end_id=0)
-    """
-    helper = LayerHelper('beam_search_decode', **locals())
-    sentence_ids = helper.create_variable_for_type_inference(dtype=ids.dtype)
-    sentence_scores = helper.create_variable_for_type_inference(dtype=ids.dtype)
-
-    helper.append_op(
-        type="beam_search_decode",
-        inputs={"Ids": ids,
-                "Scores": scores},
-        outputs={
-            "SentenceIds": sentence_ids,
-            "SentenceScores": sentence_scores
-        },
-        attrs={"beam_size": beam_size,
-               "end_id": end_id})
-
-    return sentence_ids, sentence_scores
-
-
-def lstm_unit(x_t,
-              hidden_t_prev,
-              cell_t_prev,
-              forget_bias=0.0,
-              param_attr=None,
-              bias_attr=None,
-              name=None):
-    """Lstm unit layer. The equation of a lstm step is:
-
-        .. math::
-
-            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i)
-
-            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f)
-
-            c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)
-
-            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o)
-
-            h_t & = o_t tanh(c_t)
-
-    The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and
-    :math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}`
-    should be same. The implementation separates the linear transformation and
-    non-linear transformation apart. Here, we take :math:`i_t` as an example.
-    The linear transformation is applied by calling a `fc` layer and the
-    equation is:
-
-        .. math::
-
-            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i
-
-    The non-linear transformation is applied by calling `lstm_unit_op` and the
-    equation is:
-
-        .. math::
-
-            i_t = \sigma(L_{i_t})
-
-    This layer has two outputs including :math:`h_t` and :math:`c_t`.
-
-    Args:
-        x_t (Variable): The input value of current step, a 2-D tensor with shape
-            M x N, M for batch size and N for input size.
-        hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor
-            with shape M x S, M for batch size and S for size of lstm unit.
-        cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with
-            shape M x S, M for batch size and S for size of lstm unit.
-        forget_bias (float): The forget bias of lstm unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-                               hidden-hidden weights.
-                               If it is set to None or one attribute of ParamAttr,
-                               lstm_unit will create ParamAttr as param_attr.
-                               If the Initializer of the param_attr is not set, the
-                               parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The bias attribute for the learnable bias
-                              weights. If it is set to False, no bias will be added
-                              to the output units. If it is set to None or one attribute of ParamAttr,
-                              lstm_unit will create ParamAttr as bias_attr.
-                              If the Initializer of the bias_attr is not set,
-                              the bias is initialized zero. Default: None.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        tuple: The hidden value and cell value of lstm unit.
-
-    Raises:
-        ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**
-                    not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev**
-                    and **cell_t_prev** not be the same or the 2nd dimensions of
-                    **hidden_t_prev** and **cell_t_prev** not be the same.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            dict_dim, emb_dim, hidden_dim = 128, 64, 512
-            data = fluid.layers.data(name='step_data', shape=[1], dtype='int32')
-            x = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
-            pre_hidden = fluid.layers.data(
-                name='pre_hidden', shape=[hidden_dim], dtype='float32')
-            pre_cell = fluid.layers.data(
-                name='pre_cell', shape=[hidden_dim], dtype='float32')
-            hidden = fluid.layers.lstm_unit(
-                x_t=x,
-                hidden_t_prev=pre_hidden,
-                cell_t_prev=pre_cell)
-    """
-    helper = LayerHelper('lstm_unit', **locals())
-
-    if len(x_t.shape) != 2:
-        raise ValueError("Rank of x_t must be 2.")
-
-    if len(hidden_t_prev.shape) != 2:
-        raise ValueError("Rank of hidden_t_prev must be 2.")
-
-    if len(cell_t_prev.shape) != 2:
-        raise ValueError("Rank of cell_t_prev must be 2.")
-
-    if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
-            0] != cell_t_prev.shape[0]:
-        raise ValueError("The 1st dimensions of x_t, hidden_t_prev and "
-                         "cell_t_prev must be the same.")
-
-    if hidden_t_prev.shape[1] != cell_t_prev.shape[1]:
-        raise ValueError("The 2nd dimensions of hidden_t_prev and "
-                         "cell_t_prev must be the same.")
-
-    if bias_attr is None:
-        bias_attr = ParamAttr()
-
-    size = cell_t_prev.shape[1]
-    concat_out = concat(input=[x_t, hidden_t_prev], axis=1)
-    fc_out = fc(input=concat_out,
-                size=4 * size,
-                param_attr=param_attr,
-                bias_attr=bias_attr)
-    dtype = x_t.dtype
-    c = helper.create_variable_for_type_inference(dtype)
-    h = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type='lstm_unit',
-        inputs={"X": fc_out,
-                "C_prev": cell_t_prev},
-        outputs={"C": c,
-                 "H": h},
-        attrs={"forget_bias": forget_bias})
-
-    return h, c
-
-
-def reduce_sum(input, dim=None, keep_dim=False, name=None):
-    """
-    Computes the sum of tensor elements over the given dimension.
-
-    Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (list|int|None): The dimensions along which the sum is performed. If
-            :attr:`None`, sum all elements of :attr:`input` and return a
-            Tensor variable with a single element, otherwise must be in the
-            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
-            the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool|False): Whether to reserve the reduced dimension in the
-            output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        Variable: The reduced Tensor variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the corresponding output tensor.
-            x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32')
-            fluid.layers.reduce_sum(x)  # [3.5]
-            fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
-            fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
-            fluid.layers.reduce_sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
-
-            # y is a Tensor variable with shape [2, 2, 2] and elements as below:
-            #      [[[1, 2], [3, 4]],
-            #      [[5, 6], [7, 8]]]
-            # Each example is followed by the corresponding output tensor.
-            y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32')
-            fluid.layers.reduce_sum(y, dim=[1, 2]) # [10, 26]
-            fluid.layers.reduce_sum(y, dim=[0, 1]) # [16, 20]
-
-    """
-    helper = LayerHelper('reduce_sum', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-    helper.append_op(
-        type='reduce_sum',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None else False
-        })
-    return out
-
-
-def reduce_mean(input, dim=None, keep_dim=False, name=None):
-    """
-    Computes the mean of the input tensor's elements along the given dimension.
-
-    Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (list|int|None): The dimension along which the mean is computed. If
-            `None`, compute the mean over all elements of :attr:`input`
-            and return a variable with a single element, otherwise it
-            must be in the range :math:`[-rank(input), rank(input))`. If
-            :math:`dim[i] < 0`, the dimension to reduce is
-            :math:`rank(input) + dim[i]`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the
-            output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set `None`, the layer
-                       will be named automatically.
-
-    Returns:
-        Variable: The reduced mean Variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the correspending output tensor.
-            x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32')
-            fluid.layers.reduce_mean(x)  # [0.4375]
-            fluid.layers.reduce_mean(x, dim=0)  # [0.15, 0.25, 0.55, 0.8]
-            fluid.layers.reduce_mean(x, dim=-1)  # [0.475, 0.4]
-            fluid.layers.reduce_mean(x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
-
-            # y is a Tensor variable with shape [2, 2, 2] and elements as below:
-            #      [[[1.0, 2.0], [3.0, 4.0]],
-            #      [[5.0, 6.0], [7.0, 8.0]]]
-            # Each example is followed by the correspending output tensor.
-            y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32')
-            fluid.layers.reduce_mean(y, dim=[1, 2]) # [2.5, 6.5]
-            fluid.layers.reduce_mean(y, dim=[0, 1]) # [4.0, 5.0]
-    """
-    helper = LayerHelper('reduce_mean', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-    helper.append_op(
-        type='reduce_mean',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None else False
-        })
-    return out
-
-
-def reduce_max(input, dim=None, keep_dim=False, name=None):
-    """
-    Computes the maximum of tensor elements over the given dimension.
-
-    Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (list|int|None): The dimension along which the maximum is computed.
-            If :attr:`None`, compute the maximum over all elements of
-            :attr:`input` and return a Tensor variable with a single element,
-            otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the
-            output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        Variable: The reduced Tensor variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the correspending output tensor.
-            x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32')
-            fluid.layers.reduce_max(x)  # [0.9]
-            fluid.layers.reduce_max(x, dim=0)  # [0.2, 0.3, 0.6, 0.9]
-            fluid.layers.reduce_max(x, dim=-1)  # [0.9, 0.7]
-            fluid.layers.reduce_max(x, dim=1, keep_dim=True)  # [[0.9], [0.7]]
-
-            # y is a Tensor variable with shape [2, 2, 2] and elements as below:
-            #      [[[1.0, 2.0], [3.0, 4.0]],
-            #      [[5.0, 6.0], [7.0, 8.0]]]
-            # Each example is followed by the correspending output tensor.
-            y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32')
-            fluid.layers.reduce_max(y, dim=[1, 2]) # [4.0, 8.0]
-            fluid.layers.reduce_max(y, dim=[0, 1]) # [7.0, 8.0]
-    """
-    helper = LayerHelper('reduce_max', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-    helper.append_op(
-        type='reduce_max',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None else False
-        })
-    return out
-
-
-def reduce_min(input, dim=None, keep_dim=False, name=None):
-    """
-    Computes the minimum of tensor elements over the given dimension.
-
-    Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (list|int|None): The dimensions along which the minimum is computed.
-            If :attr:`None`, compute the minimum over all elements of
-            :attr:`input` and return a Tensor variable with a single element,
-            otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the
-            output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        Variable: The reduced Tensor variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the correspending output tensor.
-            x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32')
-            fluid.layers.reduce_min(x)  # [0.1]
-            fluid.layers.reduce_min(x, dim=0)  # [0.1, 0.2, 0.5, 0.7]
-            fluid.layers.reduce_min(x, dim=-1)  # [0.2, 0.1]
-            fluid.layers.reduce_min(x, dim=1, keep_dim=True)  # [[0.2], [0.1]]
-
-            # y is a Tensor variable with shape [2, 2, 2] and elements as below:
-            #      [[[1.0, 2.0], [3.0, 4.0]],
-            #      [[5.0, 6.0], [7.0, 8.0]]]
-            # Each example is followed by the correspending output tensor.
-            y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32')
-            fluid.layers.reduce_min(y, dim=[1, 2]) # [1.0, 5.0]
-            fluid.layers.reduce_min(y, dim=[0, 1]) # [1.0, 2.0]
-    """
-    helper = LayerHelper('reduce_min', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-    helper.append_op(
-        type='reduce_min',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None else False
-        })
-    return out
-
-
-def reduce_prod(input, dim=None, keep_dim=False, name=None):
-    """
-    Computes the product of tensor elements over the given dimension.
-
-    Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (list|int|None): The dimensions along which the product is performed. If
-            :attr:`None`, multipy all elements of :attr:`input` and return a
-            Tensor variable with a single element, otherwise must be in the
-            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
-            the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool|False): Whether to reserve the reduced dimension in the
-            output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the
-            layer will be named automatically.
-
-    Returns:
-        Variable: The reduced Tensor variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the correspending output tensor.
-            x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32')
-            fluid.layers.reduce_prod(x)  # [0.0002268]
-            fluid.layers.reduce_prod(x, dim=0)  # [0.02, 0.06, 0.3, 0.63]
-            fluid.layers.reduce_prod(x, dim=-1)  # [0.027, 0.0084]
-            fluid.layers.reduce_prod(x, dim=1,
-                                     keep_dim=True)  # [[0.027], [0.0084]]
-
-            # y is a Tensor variable with shape [2, 2, 2] and elements as below:
-            #      [[[1.0, 2.0], [3.0, 4.0]],
-            #      [[5.0, 6.0], [7.0, 8.0]]]
-            # Each example is followed by the correspending output tensor.
-            y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32')
-            fluid.layers.reduce_prod(y, dim=[1, 2]) # [24.0, 1680.0]
-            fluid.layers.reduce_prod(y, dim=[0, 1]) # [105.0, 384.0]
-    """
-    helper = LayerHelper('reduce_prod', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-    helper.append_op(
-        type='reduce_prod',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None else False
-        })
-    return out
-
-
-def reduce_all(input, dim=None, keep_dim=False, name=None):
-    """
-    Computes the ``logical and`` of tensor elements over the given dimension.
-
-    Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (list|int|None): The dimension along which the logical and is computed.
-            If :attr:`None`, compute the logical and over all elements of
-            :attr:`input` and return a Tensor variable with a single element,
-            otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the
-            output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        Variable: The reduced Tensor variable.
-
-    Examples:
-        .. code-block:: python
-        
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            import numpy as np
-
-            # x is a bool Tensor variable with following elements:
-            #    [[True, False]
-            #     [True, True]]
-            x = layers.assign(np.array([[1, 0], [1, 1]], dtype='int32'))
-            x = layers.cast(x, 'bool')
-
-            out = layers.reduce_all(x)  # False 
-            out = layers.reduce_all(x, dim=0)  # [True, False]
-            out = layers.reduce_all(x, dim=-1)  # [False, True]
-            out = layers.reduce_all(x, dim=1, keep_dim=True)  # [[False], [True]]
-
-    """
-    helper = LayerHelper('reduce_all', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-    helper.append_op(
-        type='reduce_all',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None else False
-        })
-    return out
-
-
-def reduce_any(input, dim=None, keep_dim=False, name=None):
-    """
-    Computes the ``logical or`` of tensor elements over the given dimension.
-
-    Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (list|int|None): The dimension along which the logical or is computed.
-            If :attr:`None`, compute the logical or over all elements of
-            :attr:`input` and return a Tensor variable with a single element,
-            otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the
-            output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        Variable: The reduced Tensor variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            import numpy as np
-
-            # x is a bool Tensor variable with following elements:
-            #    [[True, False]
-            #     [False, False]]
-            x = layers.assign(np.array([[1, 0], [0, 0]], dtype='int32'))
-            x = layers.cast(x, 'bool')
-
-            out = layers.reduce_any(x)  # True
-            out = layers.reduce_any(x, dim=0)  # [True, False]
-            out = layers.reduce_any(x, dim=-1)  # [True, False]
-            out = layers.reduce_any(x, dim=1,
-                                     keep_dim=True)  # [[True], [False]]
-
-    """
-    helper = LayerHelper('reduce_any', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-    helper.append_op(
-        type='reduce_any',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None else False
-        })
-    return out
-
-
-def split(input, num_or_sections, dim=-1, name=None):
-    """
-    Split the input tensor into multiple sub-tensors.
-
-    Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        num_or_sections (int|list): If :attr:`num_or_sections` is an integer,
-            then the integer indicates the number of equal sized sub-tensors
-            that the tensor will be divided into. If :attr:`num_or_sections`
-            is a list of integers, the length of list indicates the number of
-            sub-tensors and the integers indicate the sizes of sub-tensors'
-            :attr:`dim` dimension orderly.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the
-            dimension to split along is :math:`rank(input) + dim`.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        list(Variable): The list of segmented tensor variables.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            # input is a variable which shape is [-1, 3, 9, 5]
-            input = fluid.layers.data(
-                 name="input", shape=[3, 9, 5], dtype="float32")
-
-            x0, x1, x2 = fluid.layers.split(input, num_or_sections=3, dim=2)
-            # x0.shape [-1, 3, 3, 5]
-            # x1.shape [-1, 3, 3, 5]
-            # x2.shape [-1, 3, 3, 5]
-
-            x0, x1, x2 = fluid.layers.split(input, num_or_sections=3, dim=2)
-            # x0.shape [-1, 3, 2, 5]
-            # x1.shape [-1, 3, 3, 5]
-            # x2.shape [-1, 3, 4, 5]
-    """
-    helper = LayerHelper('split', **locals())
-    input_shape = input.shape
-    dim = (len(input_shape) + dim) if dim < 0 else dim
-    if isinstance(num_or_sections, int):
-        assert num_or_sections > 1, 'num_or_sections must be more than 1.'
-        num = num_or_sections
-    else:
-        assert len(num_or_sections) <= input_shape[
-            dim], 'len(num_or_sections) must not be more than input.shape[dim].'
-        num = len(num_or_sections)
-    outs = [
-        helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-        for i in range(num)
-    ]
-    helper.append_op(
-        type='split',
-        inputs={'X': input},
-        outputs={'Out': outs},
-        attrs={
-            'num': num_or_sections if isinstance(num_or_sections, int) else 0,
-            'sections': num_or_sections
-            if isinstance(num_or_sections, list) else [],
-            'axis': dim
-        })
-    return outs
-
-
-def l2_normalize(x, axis, epsilon=1e-12, name=None):
-    """
-    **L2 normalize Layer**
-
-    The l2 normalize layer normalizes `x` along dimension `axis` using an L2
-    norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes
-
-    .. math::
-
-        y = \\frac{x}{ \sqrt{\sum {x^2} + epsion }}
-
-    For `x` with more dimensions, this layer independently normalizes each 1-D
-    slice along dimension `axis`.
-
-    Args:
-        x(Variable|list): The input tensor to l2_normalize layer.
-        axis(int): The axis on which to apply normalization. If `axis < 0`, \
-            the dimension to normalization is rank(X) + axis. -1 is the
-            last dimension.
-        epsilon(float): The epsilon value is used to avoid division by zero, \
-            the default value is 1e-12.
-        name(str|None): A name for this layer(optional). If set None, the layer \
-            will be named automatically.
-
-    Returns:
-        Variable: The output tensor variable is the same shape with `x`.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name="data",
-                                     shape=(3, 17, 13),
-                                     dtype="float32")
-            normed = fluid.layers.l2_normalize(x=data, axis=1)
-    """
-
-    if len(x.shape) == 1:
-        axis = 0
-    helper = LayerHelper("l2_normalize", **locals())
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    norm = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="norm",
-        inputs={"X": x},
-        outputs={"Out": out,
-                 "Norm": norm},
-        attrs={
-            "axis": 1 if axis is None else axis,
-            "epsilon": epsilon,
-        })
-    return out
-
-
-def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
-    """
-    Applies matrix multiplication to two tensors.
-
-    Currently, the input tensors' rank can be any, but when the rank of any
-    inputs is bigger than 3, this two inputs' rank should be equal.
-
-    The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
-    flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
-
-    - If a transpose flag is specified, the last two dimensions of the tensor
-      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
-      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
-      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
-      opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
-      :math:`[1, D]` in transposed form.
-
-    - After transpose, the two tensors are 2-D or n-D and matrix multiplication
-      performs in the following way.
-
-      - If both are 2-D, they are multiplied like conventional matrices.
-      - If either is n-D, it is treated as a stack of matrices residing in the
-        last two dimensions and a batched matrix multiply supporting broadcast
-        applies on the two tensors.
-
-    Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
-    nontransposed, the prepended or appended dimension :math:`1` will be
-    removed after matrix multiplication.
-
-    Args:
-        x (Variable): The input variable which is a Tensor or LoDTensor.
-        y (Variable): The input variable which is a Tensor or LoDTensor.
-        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
-        transpose_y (bool): Whether to transpose :math:`y` before multiplication.
-        alpha (float): The scale of output. Default 1.0.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-
-    Returns:
-        Variable: The product Tensor (or LoDTensor) variable.
-
-    Examples:
-        .. code-block:: python
-
-            # Examples to clarify shapes of the inputs and output
-            # x: [B, ..., M, K], y: [B, ..., K, N]
-            # fluid.layers.matmul(x, y)  # out: [B, ..., M, N]
-
-            # x: [B, M, K], y: [B, K, N]
-            # fluid.layers.matmul(x, y)  # out: [B, M, N]
-
-            # x: [B, M, K], y: [K, N]
-            # fluid.layers.matmul(x, y)  # out: [B, M, N]
-
-            # x: [M, K], y: [K, N]
-            # fluid.layers.matmul(x, y)  # out: [M, N]
-
-            # x: [B, M, K], y: [K]
-            # fluid.layers.matmul(x, y)  # out: [B, M]
-
-            # x: [K], y: [K]
-            # fluid.layers.matmul(x, y)  # out: [1]
-
-            # x: [M], y: [N]
-            # fluid.layers.matmul(x, y, True, True)  # out: [M, N]
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32')
-            out = fluid.layers.matmul(x, y, True, True)
-    """
-
-    def __check_input(x, y):
-        x_shape = list(x.shape)
-        y_shape = list(y.shape)
-        if len(x_shape) == 1:
-            x_shape = [1] + x_shape
-        if len(y_shape) == 1:
-            y_shape = y_shape + [1]
-
-        # check the inner 2 dimensions
-        if transpose_x:
-            x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2]
-        if transpose_y:
-            y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
-        if x_shape[-1] != y_shape[-2]:
-            raise ValueError("Invalid inputs for matmul. x: %s, y: %s\n" %
-                             (x_shape, y_shape))
-
-        if len(y_shape) > 2 and len(x_shape) > 2:
-            for i, dim_x in enumerate(x_shape[:-2]):
-                # don't check neg shape
-                if dim_x < 0 or y_shape[i] < 0:
-                    continue
-                if dim_x != y_shape[i]:
-                    raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" %
-                                     (x.shape, y.shape))
-
-    __check_input(x, y)
-
-    helper = LayerHelper('matmul', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='matmul',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out},
-        attrs={
-            'transpose_X': transpose_x,
-            'transpose_Y': transpose_y,
-            'alpha': float(alpha),
-        })
-    return out
-
-
-def topk(input, k, name=None):
-    """
-    This operator is used to find values and indices of the k largest entries
-    for the last dimension.
-
-    If the input is a vector (1-D Tensor), finds the k largest entries in the vector
-    and outputs their values and indices as vectors. Thus values[j] is the j-th
-    largest entry in input, and its index is indices[j].
-
-    If the input is a Tensor with higher rank, this operator computes the top k
-    entries along the last dimension.
-
-    For example:
-
-    .. code-block:: text
-
-        If:
-            input = [[5, 4, 2, 3],
-                     [9, 7, 10, 25],
-                     [6, 2, 10, 1]]
-            k = 2
-
-        Then:
-            The first output:
-            values = [[5, 4],
-                      [10, 25],
-                      [6, 10]]
-
-            The second output:
-            indices = [[0, 1],
-                       [2, 3],
-                       [0, 2]]
-
-    Args:
-        input(Variable): The input variable which can be a vector or Tensor with
-            higher rank.
-        k(int | Variable):  The number of top elements to look for along the last dimension
-                 of input.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-                       Default: None
-
-    Returns:
-        Tuple[Variable]: A tuple with two elements. Each element is a Variable.
-        The first one is k largest elements along each last
-        dimensional slice. The second one is indices of values
-        within the last dimension of input.
-
-    Raises:
-        ValueError: If k < 1 or k is not less than the last dimension of input
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            input = layers.data(name="input", shape=[13, 11], dtype='float32')
-            top5_values, top5_indices = layers.topk(input, k=5)
-    """
-    helper = LayerHelper("top_k", **locals())
-    values = helper.create_variable_for_type_inference(dtype=input.dtype)
-    indices = helper.create_variable_for_type_inference(dtype="int64")
-    inputs = {"X": [input]}
-    attrs = None
-    if isinstance(k, Variable):
-        inputs['K'] = k
-    else:
-        attrs = {'k': k}
-    helper.append_op(
-        type="top_k",
-        inputs=inputs,
-        outputs={"Out": [values],
-                 "Indices": [indices]},
-        attrs=attrs)
-    values.stop_gradient = True
-    indices.stop_gradient = True
-    return values, indices
-
-
-def edit_distance(input,
-                  label,
-                  normalized=True,
-                  ignored_tokens=None,
-                  input_length=None,
-                  label_length=None):
-    """
-    Edit distance operator computes the edit distances between a batch of
-    hypothesis strings and their references. Edit distance, also called
-    Levenshtein distance, measures how dissimilar two strings are by counting
-    the minimum number of operations to transform one string into anthor.
-    Here the operations include insertion, deletion, and substitution.
-
-    For example, given hypothesis string A = "kitten" and reference
-    B = "sitting", the edit distance is 3 for A will be transformed into B
-    at least after two substitutions and one insertion:
-
-    "kitten" -> "sitten" -> "sittin" -> "sitting"
-
-    The input is a LoDTensor/Tensor consisting of all the hypothesis strings with
-    the total number denoted by `batch_size`, and the separation is specified
-    by the LoD information or input_length. And the `batch_size` reference strings are arranged
-    in order in the same way as `input`.
-
-    The output contains the `batch_size` results and each stands for the edit
-    distance for a pair of strings respectively. If Attr(normalized) is true,
-    the edit distance will be divided by the length of reference string.
-
-    Args:
-        input(Variable): The indices for hypothesis strings, it should have rank 2 and dtype int64.
-        label(Variable): The indices for reference strings, it should have rank 2 and dtype int64.
-        normalized(bool, default True): Indicated whether to normalize the edit distance by
-                          the length of reference string.
-        ignored_tokens(list<int>, default None): Tokens that should be removed before
-                                     calculating edit distance.
-        input_length(Variable): The length for each sequence in `input` if it's of Tensor type, it should have shape `[batch_size]` and dtype int64.
-        label_length(Variable): The length for each sequence in `label` if it's of Tensor type, it should have shape `[batch_size]` and dtype int64.
-
-    Returns:
-        edit_distance_out(Variable): edit distance result in shape [batch_size, 1]. \n
-        sequence_num(Variable): sequence number in shape [].
-        
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-
-            # using LoDTensor
-            x_lod = fluid.layers.data(name='x_lod', shape=[1], dtype='int64', lod_level=1)
-            y_lod = fluid.layers.data(name='y_lod', shape=[1], dtype='int64', lod_level=1)
-            distance_lod, seq_num_lod = fluid.layers.edit_distance(input=x_lod, label=y_lod)
-
-            # using Tensor
-            x_seq_len = 5
-            y_seq_len = 6
-            x_pad = fluid.layers.data(name='x_pad', shape=[x_seq_len], dtype='int64')
-            y_pad = fluid.layers.data(name='y_pad', shape=[y_seq_len], dtype='int64')
-            x_len = fluid.layers.data(name='x_len', shape=[], dtype='int64')
-            y_len = fluid.layers.data(name='y_len', shape=[], dtype='int64')
-            distance_pad, seq_num_pad = fluid.layers.edit_distance(input=x_pad, label=y_pad, input_length=x_len, label_length=y_len)
-
-    """
-    helper = LayerHelper("edit_distance", **locals())
-
-    # remove some tokens from input and labels
-    if ignored_tokens is not None and len(ignored_tokens) > 0:
-        erased_input = helper.create_variable_for_type_inference(dtype="int64")
-        erased_label = helper.create_variable_for_type_inference(dtype="int64")
-
-        helper.append_op(
-            type="sequence_erase",
-            inputs={"X": [input]},
-            outputs={"Out": [erased_input]},
-            attrs={"tokens": ignored_tokens})
-        input = erased_input
-
-        helper.append_op(
-            type="sequence_erase",
-            inputs={"X": [label]},
-            outputs={"Out": [erased_label]},
-            attrs={"tokens": ignored_tokens})
-        label = erased_label
-
-    this_inputs = {"Hyps": [input], "Refs": [label]}
-    if input_length and label_length:
-        this_inputs['HypsLength'] = [input_length]
-        this_inputs['RefsLength'] = [label_length]
-
-    # edit distance op
-    edit_distance_out = helper.create_variable_for_type_inference(dtype="int64")
-    sequence_num = helper.create_variable_for_type_inference(dtype="int64")
-    helper.append_op(
-        type="edit_distance",
-        inputs=this_inputs,
-        outputs={"Out": [edit_distance_out],
-                 "SequenceNum": [sequence_num]},
-        attrs={"normalized": normalized})
-
-    return edit_distance_out, sequence_num
-
-
-def ctc_greedy_decoder(input,
-                       blank,
-                       input_length=None,
-                       padding_value=0,
-                       name=None):
-    """
-    This op is used to decode sequences by greedy policy by below steps:
-
-    1. Get the indexes of max value for each row in input. a.k.a.
-       numpy.argmax(input, axis=0).
-    2. For each sequence in result of step1, merge repeated tokens between two
-       blanks and delete all blanks.
-
-    A simple example as below:
-
-    .. code-block:: text
-
-        Given:
-        for lod mode:
-
-        input.data = [[0.6, 0.1, 0.3, 0.1],
-                      [0.3, 0.2, 0.4, 0.1],
-                      [0.1, 0.5, 0.1, 0.3],
-                      [0.5, 0.1, 0.3, 0.1],
-
-                      [0.5, 0.1, 0.3, 0.1],
-                      [0.2, 0.2, 0.2, 0.4],
-                      [0.2, 0.2, 0.1, 0.5],
-                      [0.5, 0.1, 0.3, 0.1]]
-
-        input.lod = [[4, 4]]
-
-        Computation:
-
-        step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
-               [[0], [2], [1], [0]]
-        step2: merge repeated tokens and remove blank which is 0. Then we get first output sequence:
-               [[2], [1]]
-
-        Finally:
-
-        output.data = [[2],
-                       [1],
-                       [3]]
-
-        output.lod = [[2, 1]]
-
-        for padding mode:
-
-         input.data = [[[0.6, 0.1, 0.3, 0.1],
-                        [0.3, 0.2, 0.4, 0.1],
-                        [0.1, 0.5, 0.1, 0.3],
-                        [0.5, 0.1, 0.3, 0.1]],
-
-                       [[0.5, 0.1, 0.3, 0.1],
-                        [0.2, 0.2, 0.2, 0.4],
-                        [0.2, 0.2, 0.1, 0.5],
-                        [0.5, 0.1, 0.3, 0.1]]]
-
-        input_length.data = [[4], [4]]
-        input.shape = [2, 4, 4]
-
-        step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
-               [[0], [2], [1], [0]], for input.data[4:8] is [[0], [3], [3], [0]], shape is [2,4,1]
-        step2: Change the argmax result to use padding mode, then argmax result is 
-                [[0, 2, 1, 0], [0, 3, 3, 0]], shape is [2, 4], lod is [], input_length is [[4], [4]]
-        step3: Apply ctc_align to padding argmax result, padding_value is 0
-
-        Finally:
-        output.data = [[2, 1, 0, 0],
-                       [3, 0, 0, 0]]
-        output_length.data = [[2], [1]]
-
-
-
-
-    Args:
-
-        input(Variable): (LoDTensor<float>), the probabilities of
-                         variable-length sequences. When in lod mode, it is a 2-D Tensor with
-                         LoD information. It's shape is [Lp, num_classes + 1] 
-                         where Lp is the sum of all input sequences' length and
-                         num_classes is the true number of classes. When in padding mode,
-                         it is a 3-D Tensor with padding, It's shape is [batch_size, N, num_classes + 1].
-                         (not including the blank label).
-        blank(int): the blank label index of Connectionist Temporal
-                    Classification (CTC) loss, which is in thehalf-opened
-                    interval [0, num_classes + 1).
-        input_length(Variable, optional): (LoDTensor<int>), shape is [batch_size, 1], when in lod mode, input_length
-                                 is None.
-        padding_value(int): padding value.
-        name (str, optional): The name of this layer. It is optional.
-
-    Returns:
-        output(Variable): For lod mode, CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
-                  'Lp' is the sum if all output sequences' length. If all the sequences \
-                  in result were empty, the result LoDTensor will be [-1] with  \
-                  LoD [[]] and dims [1, 1]. For padding mode, CTC greedy decode result is a 2-D tensor \
-                  with shape [batch_size, N], output length's shape is [batch_size, 1] which is length \
-                  of every sequence in output.
-        output_length(Variable, optional): length of each sequence of output for padding mode.
-
-    Examples:
-        .. code-block:: python
-
-            # for lod mode
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[8], dtype='float32')
-            cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
-
-            # for padding mode
-            x_pad = fluid.layers.data(name='x_pad', shape=[4,8], dtype='float32')
-            x_pad_len = fluid.layers.data(name='x_pad_len', shape=[1], dtype='int64')
-            out, out_len = fluid.layers.ctc_greedy_decoder(input=x_pad, blank=0,
-                            input_length=x_pad_len)
-
-    """
-    helper = LayerHelper("ctc_greedy_decoder", **locals())
-    _, topk_indices = topk(input, k=1)
-
-    # ctc align op
-    ctc_out = helper.create_variable_for_type_inference(dtype="int64")
-
-    if input_length is None:
-        helper.append_op(
-            type="ctc_align",
-            inputs={"Input": [topk_indices]},
-            outputs={"Output": [ctc_out]},
-            attrs={"merge_repeated": True,
-                   "blank": blank})
-        return ctc_out
-    else:
-        ctc_out_len = helper.create_variable_for_type_inference(dtype="int64")
-        ctc_input = squeeze(topk_indices, [2])
-
-        helper.append_op(
-            type="ctc_align",
-            inputs={"Input": [ctc_input],
-                    "InputLength": [input_length]},
-            outputs={"Output": [ctc_out],
-                     "OutputLength": [ctc_out_len]},
-            attrs={
-                "merge_repeated": True,
-                "blank": blank,
-                "padding_value": padding_value
-            })
-        return ctc_out, ctc_out_len
-
-
-def warpctc(input,
-            label,
-            blank=0,
-            norm_by_times=False,
-            input_length=None,
-            label_length=None):
-    """
-    An operator integrating the open source Warp-CTC library
-    (https://github.com/baidu-research/warp-ctc)
-    to compute Connectionist Temporal Classification (CTC) loss.
-    It can be aliased as softmax with CTC, since a native softmax activation is
-    interated to the Warp-CTC library, to to normlize values for each row of the
-    input tensor.
-
-    Args:
-       input (Variable): The unscaled probabilities of variable-length sequences,
-         which is a 2-D Tensor with LoD information, or a 3-D Tensor without Lod
-         information. When it is a 2-D LodTensor, it's shape is 
-         [Lp, num_classes + 1], where Lp is the sum of all input
-         sequences' length and num_classes is the true number of classes.
-         (not including the blank label). When it is a 3-D Tensor, it's shape 
-         is [max_logit_length, batch_size, num_classes + 1],
-         where max_logit_length is the length of the longest
-         input logit sequence.
-       label (Variable): The ground truth of variable-length sequence,
-         which is a 2-D Tensor with LoD information or a 2-D Tensor without
-         LoD information. When it is a 2-D LoDTensor or 2-D Tensor, 
-         it is of the shape [Lg, 1], where Lg is th sum of all labels' length.
-       blank (int, default 0): The blank label index of Connectionist
-         Temporal Classification (CTC) loss, which is in the
-         half-opened interval [0, num_classes + 1).
-       norm_by_times(bool, default false): Whether to normalize the gradients
-         by the number of time-step, which is also the sequence's length.
-         There is no need to normalize the gradients if warpctc layer was
-         follewed by a mean_op.
-       input_length(Variable): The length for each input sequence if it is 
-         of Tensor type, it should have shape `[batch_size]` and dtype int64.
-       label_length(Variable): The length for each label sequence if it is
-         of Tensor type, it should have shape `[batch_size]` and dtype int64.
-
-    Returns:
-        Variable: The Connectionist Temporal Classification (CTC) loss,
-        which is a 2-D Tensor of the shape [batch_size, 1].
-
-    Examples:
-        .. code-block:: python
-
-            # using LoDTensor
-            import paddle.fluid as fluid
-            import numpy as np
-            
-            label = fluid.layers.data(name='label', shape=[12, 1],
-                                      dtype='float32', lod_level=1)
-            predict = fluid.layers.data(name='predict', 
-                                        shape=[11, 8],
-                                        dtype='float32',lod_level=1)
-            cost = fluid.layers.warpctc(input=predict, label=label)
-
-            # using Tensor
-            input_length = fluid.layers.data(name='logits_length', shape=[11],
-                                         dtype='int64')
-            label_length = fluid.layers.data(name='labels_length', shape=[12],
-                                         dtype='int64')
-            target = fluid.layers.data(name='target', shape=[12, 1],
-                                       dtype='int32')
-            # length of the longest logit sequence
-            max_seq_length = 4
-            # number of logit sequences
-            batch_size = 4
-            output = fluid.layers.data(name='output', 
-                                       shape=[max_seq_length, batch_size, 8],
-                                       dtype='float32')
-            loss = fluid.layers.warpctc(input=output,label=target,
-                                        input_length=input_length,
-                                        label_length=label_length)
-
-    """
-    helper = LayerHelper('warpctc', **locals())
-    this_inputs = {'Logits': [input], 'Label': [label]}
-    if input_length and label_length:
-        this_inputs['LogitsLength'] = [input_length]
-        this_inputs['LabelLength'] = [label_length]
-
-    loss_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    grad_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-
-    helper.append_op(
-        type='warpctc',
-        inputs=this_inputs,
-        outputs={'WarpCTCGrad': [grad_out],
-                 'Loss': [loss_out]},
-        attrs={
-            'blank': blank,
-            'norm_by_times': norm_by_times,
-        })
-    return loss_out
-
-
-def sequence_reshape(input, new_dim):
-    """
-    **Sequence Reshape Layer**
-
-    This layer will rearrange the input sequences. The new dimension is set by
-    user. Length of each sequence is computed according to original length,
-    original dimension and new dimension. The following example will help to
-    illustrate the function of this layer:
-
-    .. code-block:: text
-
-        x is a LoDTensor:
-            x.lod  = [[0, 2, 6]]
-            x.data = [[1,  2], [3,  4],
-                      [5,  6], [7,  8],
-                      [9, 10], [11, 12]]
-            x.dims = [6, 2]
-
-        set new_dim = 4
-
-        then out is a LoDTensor:
-
-            out.lod  = [[0, 1, 3]]
-
-            out.data = [[1,  2,  3,  4],
-                        [5,  6,  7,  8],
-                        [9, 10, 11, 12]]
-            out.dims = [3, 4]
-
-    Currently, only 1-level LoDTensor is supported and please make sure
-    (original length * original dimension) can be divided by new dimension with
-    no remainder for each sequence.
-
-    Args:
-
-       input (Variable): A 2-D LoDTensor with shape being [N, M] where M for dimension.
-       new_dim (int): New dimension that the input LoDTensor is reshaped to.
-
-    Returns:
-
-        Variable: Reshaped LoDTensor according to new dimension.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[2, 6], append_batch_size=False, dtype='float32', lod_level=1)
-            x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=4)
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_reshape', **locals())
-    out = helper.create_variable_for_type_inference(helper.input_dtype())
-    helper.append_op(
-        type='sequence_reshape',
-        inputs={'X': [input]},
-        outputs={'Out': [out]},
-        attrs={'new_dim': new_dim})
-    return out
-
-
-# FIXME(wuyi): let docstring_checker.py understand @autodoc.
-# For now, the comments in c++ use types like Tensor, but in python side
-# the type is often "Variable", and arguments may vary.
-@templatedoc(op_type="nce")
-def nce(input,
-        label,
-        num_total_classes,
-        sample_weight=None,
-        param_attr=None,
-        bias_attr=None,
-        num_neg_samples=None,
-        name=None,
-        sampler="uniform",
-        custom_dist=None,
-        seed=0,
-        is_sparse=False):
-    """
-    ${comment}
-
-    Args:
-        input (Variable): input variable.
-        label (Variable): label.
-        num_total_classes (int):${num_total_classes_comment}
-        sample_weight (Variable|None): A Variable of shape [batch_size, 1]
-            storing a weight for each sample. The default weight for each
-            sample is 1.0.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-             of nce. If it is set to None or one attribute of ParamAttr, nce
-             will create ParamAttr as param_attr. If the Initializer of the param_attr
-             is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of nce.
-             If it is set to False, no bias will be added to the output units.
-             If it is set to None or one attribute of ParamAttr, nce
-             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-             is not set, the bias is initialized zero. Default: None.
-        num_neg_samples (int): ${num_neg_samples_comment}
-        name (str|None): A name for this layer(optional). If set None, the layer
-             will be named automatically. Default: None.
-        sampler (str): The sampler used to sample class from negtive classes.
-                       It can be 'uniform', 'log_uniform' or 'custom_dist'.
-                       default: 'uniform'.
-        custom_dist (float[]): A float[] with size=num_total_classes.
-                       It is used when sampler is set to 'custom_dist'.
-                       custom_dist[i] is the probsbility of i-th class to be sampled.
-                       default: None.
-        seed (int): The seed used in sampler. default: 0.
-        is_sparse(bool): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows.
-
-    Returns:
-        Variable: The output nce loss.
-
-    Examples:
-        .. code-block:: python
-
-
-            import paddle.fluid as fluid
-            import numpy as np
-
-            window_size = 5
-            words = []
-            for i in xrange(window_size):
-                words.append(fluid.layers.data(
-                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
-
-            dict_size = 10000
-            label_word = int(window_size / 2) + 1
-
-            embs = []
-            for i in xrange(window_size):
-                if i == label_word:
-                    continue
-
-                emb = fluid.layers.embedding(input=words[i], size=[dict_size, 32],
-                                   param_attr='embed', is_sparse=True)
-                embs.append(emb)
-
-            embs = fluid.layers.concat(input=embs, axis=1)
-            loss = fluid.layers.nce(input=embs, label=words[label_word],
-                      num_total_classes=dict_size, param_attr='nce.w_0',
-                      bias_attr='nce.b_0')
-
-             #or use custom distribution
-             dist = np.array([0.05,0.5,0.1,0.3,0.05])
-             loss = fluid.layers.nce(input=embs, label=words[label_word],
-                       num_total_classes=5, param_attr='nce.w_1',
-                       bias_attr='nce.b_1',
-                       num_neg_samples=3,
-                       sampler="custom_dist",
-                       custom_dist=dist)
-    """
-    helper = LayerHelper('nce', **locals())
-    assert isinstance(input, Variable)
-    assert isinstance(label, Variable)
-
-    dim = input.shape[1]
-    num_true_class = label.shape[1]
-    w = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[num_total_classes, dim],
-        is_bias=False,
-        dtype=input.dtype)
-    inputs = {}
-    if helper.bias_attr:
-        b = helper.create_parameter(
-            attr=helper.bias_attr,
-            shape=[num_total_classes, 1],
-            is_bias=True,
-            dtype=input.dtype)
-        inputs['Bias'] = b
-    cost = helper.create_variable_for_type_inference(dtype=input.dtype)
-    sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype)
-    sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype)
-
-    inputs['Input'] = input
-    inputs['Label'] = label
-    inputs['Weight'] = w
-    inputs['SampleWeight'] = sample_weight if sample_weight is not None else []
-
-    if sampler == "uniform":
-        sampler = 0
-    elif sampler == "log_uniform":
-        sampler = 1
-    elif sampler == "custom_dist":
-        assert custom_dist is not None
-        # assert isinstance(custom_dist, Variable)
-
-        custom_dist_len = num_total_classes
-        alias_probs_ = [0] * custom_dist_len
-        alias_ = [0] * custom_dist_len
-        bigs = []
-        littles = []
-        for i in range(custom_dist_len):
-            normal_prob = custom_dist[i] * custom_dist_len
-            if normal_prob - 1.0 > 0:
-                bigs.append((i, normal_prob))
-            elif 1.0 - normal_prob > 0:
-                littles.append((i, normal_prob))
-            else:
-                alias_probs_[i] = normal_prob
-                alias_[i] = -1
-
-        while len(bigs) and len(littles):
-            big = bigs.pop(0)
-            little = littles.pop(0)
-
-            big_idx = big[0]
-            big_prob = big[1]
-
-            alias_probs_[little[0]] = little[1]
-            alias_[little[0]] = big_idx
-            big_left = big[1] + little[1] - 1
-            if big_left - 1.0 > 0:
-                bigs.append((big_idx, big_left))
-            elif 1.0 - big_left > 0:
-                littles.append((big_idx, big_left))
-            else:
-                alias_probs_[big_idx] = big_left
-                alias_[big_idx] = -1
-
-        if len(bigs):
-            big = bigs.pop(0)
-            alias_probs_[big[0]] = 1.0
-            alias_[big[0]] = -1
-        if len(littles):
-            little = littles.pop(0)
-            alias_probs_[little[0]] = 1.0
-            alias_[little[0]] = -1
-
-        def _init_by_numpy_array(numpy_array):
-            ret = helper.create_parameter(
-                attr=ParamAttr(),
-                shape=numpy_array.shape,
-                dtype=numpy_array.dtype,
-                default_initializer=NumpyArrayInitializer(numpy_array))
-            ret.stop_gradient = True
-            return ret
-
-        inputs['CustomDistProbs'] = _init_by_numpy_array(
-            np.array(custom_dist).astype('float32'))
-        inputs['CustomDistAlias'] = _init_by_numpy_array(
-            np.array(alias_).astype('int32'))
-        inputs['CustomDistAliasProbs'] = _init_by_numpy_array(
-            np.array(alias_probs_).astype('float32'))
-        sampler = 2
-    else:
-        raise Exception("Unsupported sampler type.")
-
-    if num_neg_samples is None:
-        num_neg_samples = 10
-    else:
-        num_neg_samples = int(num_neg_samples)
-
-    remote_prefetch = is_sparse
-    print(
-        "With sparse mode, if your models has only small parameter prefetch may cause speed down"
-    )
-
-    attrs = {
-        'num_total_classes': int(num_total_classes),
-        'num_neg_samples': num_neg_samples,
-        'seed': seed,
-        'sampler': sampler,
-        'is_sparse': is_sparse,
-        'remote_prefetch': remote_prefetch
-    }
-
-    helper.append_op(
-        type='nce',
-        inputs=inputs,
-        outputs={
-            'Cost': cost,
-            'SampleLogits': sample_logits,
-            'SampleLabels': sample_labels
-        },
-        attrs=attrs)
-    return cost / (num_neg_samples + 1)
-
-
-def hsigmoid(input,
-             label,
-             num_classes,
-             param_attr=None,
-             bias_attr=None,
-             name=None,
-             path_table=None,
-             path_code=None,
-             is_custom=False,
-             is_sparse=False):
-    """
-    The hierarchical sigmoid operator is used to accelerate the training
-    process of language model. This operator organizes the classes into a
-    complete binary tree, or you can use is_custom to pass your own tree to
-    implement hierarchical. Each leaf node represents a class(a word) and each
-    internal node acts as a binary classifier. For each word there's a unique
-    path from root to it's leaf node, hsigmoid calculate the cost for each
-    internal node on the path, and sum them to get a total cost. hsigmoid can
-    achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
-    represents the size of word dict.
-
-    Using default tree you can Refer to `Hierarchical Probabilistic Neural Network Language Model
-    <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
-
-    And if you want to use the costumed tree by set 'is_custom' as true you may need to do following things first:
-
-    1. using your word dict to build a binary tree, each leaf node should be an word of your word dict
-    2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
-    3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
-       means label of each binary classification, using 1 indicate true, 0 indicate false.
-    4. now, each word should has its path and code along the path, you can pass a batch of path and code
-       related to the same batch of inputs.
-
-    Args:
-        input (Variable): The input tensor variable with shape
-            :math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
-            and :math:`D` is the feature size.
-        label (Variable): The tensor variable contains labels of training data.
-            It's a tensor with shape is :math:`[N \\times 1]`.
-        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
-            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
-            which indicates the num of classes using by binary classify.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-             of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
-             will create ParamAttr as param_attr. If the Initializer of the param_attr
-             is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of hsigmoid.
-             If it is set to False, no bias will be added to the output units.
-             If it is set to None or one attribute of ParamAttr, hsigmoid
-             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-             is not set, the bias is initialized zero. Default: None.
-        name (str|None): A name for this layer(optional). If set None, the layer
-             will be named automatically. Default: None.
-        path_table: (Variable|None) this variable can store each batch of samples' path to root,
-            it should be in leaf -> root order
-            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
-            structure and each element in this array is indexes in parent nodes' Weight Matrix.
-        path_code:  (Variable|None) this variable can store each batch of samples' code,
-            each code consist with every code of parent nodes. it should be in leaf -> root order
-        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
-             set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
-        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
-             of W and input will be sparse.
-
-    Returns:
-        Out: (LodTensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[2], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[1], dtype='int64')
-            out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)
-    """
-
-    helper = LayerHelper('hierarchical_sigmoid', **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    pre_out = helper.create_variable_for_type_inference(dtype)
-    dim = input.shape[1]
-    if ((num_classes is None) or (num_classes < 2)) and (not is_custom):
-        raise ValueError(
-            "num_classes must not be less than 2 with default tree")
-
-    if (not is_custom) and (is_sparse):
-        print("Sparse mode should not be used without custom tree")
-        is_sparse = False
-
-    if (not is_custom) and ((path_table is not None) or
-                            (path_code is not None)):
-        raise ValueError(
-            "only num_classes should be passed without custom tree")
-
-    if (is_custom) and (path_code is None):
-        raise ValueError("path_code should not be None with custom tree")
-    elif (is_custom) and (path_table is None):
-        raise ValueError("path_table should not be None with custom tree")
-    elif (is_custom) and (num_classes is None):
-        raise ValueError("num_classes should not be None with custom tree")
-    else:
-        pass
-
-    weights = None
-    remote_prefetch = is_sparse
-    print(
-        "With sparse mode, if your models has only small parameter prefetch may cause speed down"
-    )
-    if not is_custom:
-        weights = helper.create_parameter(
-            attr=helper.param_attr,
-            shape=[num_classes - 1, dim],
-            is_bias=False,
-            dtype=input.dtype)
-    else:
-        weights = helper.create_parameter(
-            attr=helper.param_attr,
-            shape=[num_classes, dim],
-            is_bias=False,
-            dtype=input.dtype)
-    inputs = {
-        "X": input,
-        "W": weights,
-        "PathTable": path_table,
-        "PathCode": path_code,
-        "Label": label
-    }
-    if helper.bias_attr:
-        if not is_custom:
-            bias = helper.create_parameter(
-                attr=helper.bias_attr,
-                shape=[num_classes - 1, 1],
-                is_bias=True,
-                dtype=input.dtype)
-            inputs['Bias'] = bias
-        else:
-            bias = helper.create_parameter(
-                attr=helper.bias_attr,
-                shape=[num_classes, 1],
-                is_bias=True,
-                dtype=input.dtype)
-            inputs['Bias'] = bias
-    helper.append_op(
-        type="hierarchical_sigmoid",
-        inputs=inputs,
-        outputs={"Out": out,
-                 "PreOut": pre_out,
-                 "W_Out": weights},
-        attrs={
-            "num_classes": num_classes,
-            "is_sparse": is_sparse,
-            "remote_prefetch": remote_prefetch
-        })
-    return out
-
-
-def transpose(x, perm, name=None):
-    """
-    Permute the dimensions of `input` according to `perm`.
-
-    The `i`-th dimension  of the returned tensor will correspond to the
-    perm[i]-th dimension of `input`.
-
-    Args:
-        x (Variable): The input Tensor.
-        perm (list): A permutation of the dimensions of `input`.
-        name (str): The name of this layer. It is optional.
-
-    Returns:
-        Variable: A transposed Tensor.
-
-    Examples:
-        .. code-block:: python
-
-            # use append_batch_size=False to avoid prepending extra
-            # batch size in shape
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[5, 10, 15],
-                            dtype='float32', append_batch_size=False)
-            x_transposed = fluid.layers.transpose(x, perm=[1, 0, 2])
-    """
-
-    if len(perm) != len(x.shape):
-        raise ValueError(
-            "Input(perm) is the permutation of dimensions of Input(input). "
-            "Its length should be equal to Input(input)'s rank.")
-    for idx, dim in enumerate(perm):
-        if dim >= len(x.shape):
-            raise ValueError(
-                "Each element in perm should be less than x's rank. "
-                "%d-th element in perm is %d which accesses x's rank %d." %
-                (idx, perm[idx], len(x.shape)))
-
-    helper = LayerHelper('transpose', **locals())
-    out = helper.create_variable_for_type_inference(x.dtype)
-    x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='transpose2',
-        inputs={'X': [x]},
-        outputs={'Out': [out],
-                 'XShape': [x_shape]},
-        attrs={'axis': perm})
-    return out
-
-
-def im2sequence(input,
-                filter_size=1,
-                stride=1,
-                padding=0,
-                input_image_size=None,
-                out_stride=1,
-                name=None):
-    """
-    Extracts image patches from the input tensor to form a tensor of shape
-    {input.batch_size * output_height * output_width, filter_size_H *
-    filter_size_W * input.channels} which is similar with im2col.
-    This op use filter / kernel to scan images and convert these images to
-    sequences. After expanding, the number of time step are
-    output_height * output_width for an image, in which output_height and
-    output_width are calculated by below equation:
-
-    .. math::
-
-        output\_size = 1 + \
-            (2 * padding + img\_size - block\_size + stride - 1) / stride
-
-    And the dimension of each time step is block_y * block_x * input.channels.
-
-    Args:
-        input (Variable): The input should be a tensor in NCHW format.
-
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-
-        padding(int|tuple): The padding size. If padding is a tuple, it can
-            contain two integers like (padding_H, padding_W) which means
-            padding_up = padding_down = padding_H and
-            padding_left = padding_right = padding_W. Or it can use
-            (padding_up, padding_left, padding_down, padding_right) to indicate
-            paddings of four direction. Otherwise, a scalar padding means
-            padding_up = padding_down = padding_left = padding_right = padding
-            Default: padding = 0.
-
-        input_image_size(Variable): the input contains image real size.It's dim
-            is [batchsize, 2]. It is dispensable.It is just for batch inference.
-
-        out_stride(int|tuple): The scaling of image through CNN. It is
-            dispensable. It is valid only when input_image_size is not null.
-            If out_stride is tuple,  it must contain two intergers,
-            (out_stride_H, out_stride_W). Otherwise,
-            the out_stride_H = out_stride_W = out_stride.
-
-        name (int): The name of this layer. It is optional.
-
-    Returns:
-        output: The output is a LoDTensor with shape
-        {input.batch_size * output_height * output_width,
-        filter_size_H * filter_size_W * input.channels}.
-        If we regard output as a matrix, each row of this matrix is
-        a step of a sequence.
-
-    Examples:
-
-        .. code-block:: text
-
-            Given:
-
-            x = [[[[ 6.  2.  1.]
-                   [ 8.  3.  5.]
-                   [ 0.  2.  6.]]
-
-                  [[ 2.  4.  4.]
-                   [ 6.  3.  0.]
-                   [ 6.  4.  7.]]]
-
-                 [[[ 6.  7.  1.]
-                   [ 5.  7.  9.]
-                   [ 2.  4.  8.]]
-
-                  [[ 1.  2.  1.]
-                   [ 1.  3.  5.]
-                   [ 9.  0.  8.]]]]
-
-            x.dims = {2, 2, 3, 3}
-
-            And:
-
-            filter = [2, 2]
-            stride = [1, 1]
-            padding = [0, 0]
-
-            Then:
-
-            output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
-                           [ 2.  1.  3.  5.  4.  4.  3.  0.]
-                           [ 8.  3.  0.  2.  6.  3.  6.  4.]
-                           [ 3.  5.  2.  6.  3.  0.  4.  7.]
-                           [ 6.  7.  5.  7.  1.  2.  1.  3.]
-                           [ 7.  1.  7.  9.  2.  1.  3.  5.]
-                           [ 5.  7.  2.  4.  1.  3.  9.  0.]
-                           [ 7.  9.  4.  8.  3.  5.  0.  8.]]
-
-            output.dims = {8, 8}
-
-            output.lod = [[4, 4]]
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name='data', shape=[3, 32, 32],
-                                     dtype='float32')
-            output = fluid.layers.im2sequence(
-                input=data, stride=[1, 1], filter_size=[2, 2])
-
-
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-
-    if isinstance(filter_size, int):
-        filter_size = [filter_size, filter_size]
-    if isinstance(stride, int):
-        stride = [stride, stride]
-    if isinstance(padding, int):
-        padding = [padding, padding]
-    if len(padding) == 2:
-        padding.append(padding[0])
-        padding.append(padding[1])
-    inputs = {"X": input}
-    attrs = {"kernels": filter_size, "strides": stride, "paddings": padding}
-    if input_image_size:
-        if isinstance(out_stride, int):
-            out_stride = [out_stride, out_stride]
-        inputs["Y"] = input_image_size
-        attrs["out_stride"] = out_stride
-    helper = LayerHelper('im2sequence', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-    return out
-
-
-@templatedoc()
-def row_conv(input, future_context_size, param_attr=None, act=None):
-    """
-    ${comment}
-
-    Args:
-        input (${x_type}): ${x_comment}.
-        future_context_size (int): Future context size. Please note, the shape
-            of convolution kernel is [future_context_size + 1, D].
-        param_attr (ParamAttr): Attributes of parameters, including
-            name, initializer etc.
-        act (str): Non-linear activation to be applied to output variable.
-
-    Returns:
-        ${out_comment}.
-
-    Examples:
-        >>> import paddle.fluid as fluid
-        >>> x = fluid.layers.data(name='x', shape=[16],
-        >>>                        dtype='float32', lod_level=1)
-        >>> out = fluid.layers.row_conv(input=x, future_context_size=2)
-    """
-    helper = LayerHelper('row_conv', **locals())
-    dtype = helper.input_dtype()
-    filter_shape = [future_context_size + 1, input.shape[1]]
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='row_conv',
-        inputs={'X': [input],
-                'Filter': [filter_param]},
-        outputs={'Out': [out]})
-    return helper.append_activation(out)
-
-
-@templatedoc()
-def multiplex(inputs, index):
-    """
-    ${comment}
-
-    For Example:
-
-    .. code-block:: text
-
-        case 1:
-
-        Given:
-
-        X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]],
-             [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]],
-             [[2,0,3,4], [2,1,7,8], [2,2,4,2], [2,3,3,4]],
-             [[3,0,3,4], [3,1,7,8], [3,2,4,2], [3,3,3,4]]]
-
-        index = [3,0,1,2]
-
-        out:[[3 0 3 4]    // X[3,0] (3 = index[i], 0 = i); i=0
-             [0 1 3 4]    // X[0,1] (0 = index[i], 1 = i); i=1
-             [1 2 4 2]    // X[1,2] (0 = index[i], 2 = i); i=2
-             [2 3 3 4]]   // X[2,3] (0 = index[i], 3 = i); i=3
-
-        case 2:
-
-        Given:
-
-        X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]],
-             [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]]]
-
-        index = [1,0]
-
-        out:[[1 0 3 4]    // X[1,0] (3 = index[0], 0 = i); i=1
-             [0 1 3 4]    // X[0,1] (0 = index[1], 1 = i); i=2
-             [0 2 4 4]    // X[0,2] (0 = 0, 2 = i); i=3
-             [0 3 3 4]]   // X[0,3] (0 = 0, 3 = i); i=4
-
-    Examples:
-
-    .. code-block:: python
-
-        import paddle.fluid as fluid
-        x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
-        x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
-        index = fluid.layers.data(name='index', shape=[1], dtype='int32')
-        out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
-
-    Args:
-       inputs (list): ${x_comment}.
-       index (${ids_type}): ${ids_comment}.
-
-    Returns:
-        ${out_comment}.
-    """
-    helper = LayerHelper('multiplex', **locals())
-
-    if not isinstance(inputs, list) and len(inputs) < 2:
-        raise ValueError("inputs should be a list object and contains at least "
-                         "2 elements.")
-
-    out = helper.create_variable_for_type_inference(inputs[0].dtype)
-    helper.append_op(
-        type='multiplex',
-        inputs={'X': inputs,
-                'Ids': index},
-        outputs={'Out': [out]})
-    return out
-
-
-def softmax_with_cross_entropy(logits,
-                               label,
-                               soft_label=False,
-                               ignore_index=kIgnoreIndex,
-                               numeric_stable_mode=True,
-                               return_softmax=False,
-                               axis=-1):
-    """
-    **Softmax With Cross Entropy Operator.**
-
-    Cross entropy loss with softmax is used as the output layer extensively. This
-    operator computes the softmax normalized values for dimension :attr:`axis` of 
-    the input tensor, after which cross-entropy loss is computed. This provides 
-    a more numerically stable gradient.
-
-    Because this operator performs a softmax on logits internally, it expects
-    unscaled logits. This operator should not be used with the output of
-    softmax operator since that would produce incorrect results.
-
-    When the attribute :attr:`soft_label` is set :attr:`False`, this operators 
-    expects mutually exclusive hard labels, each sample in a batch is in exactly 
-    one class with a probability of 1.0. Each sample in the batch will have a 
-    single label.
-
-    The equation is as follows:
-
-    1) Hard label (one-hot label, so every sample has exactly one class)
-
-    .. math::
-
-        loss_j =  -\\text{logit}_{label_j} +
-        \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logit}_i)\\right), j = 1,..., K
-
-    2) Soft label (each sample can have a distribution over all classes)
-
-    .. math::
-
-        loss_j =  -\\sum_{i=0}^{K}\\text{label}_i
-        \\left(\\text{logit}_i - \\log\\left(\\sum_{i=0}^{K}
-        \\exp(\\text{logit}_i)\\right)\\right), j = 1,...,K
-
-    3) If :attr:`numeric_stable_mode` is :attr:`True`, softmax is calculated 
-    first by:
-
-    .. math::
-
-        max_j &= \\max_{i=0}^{K}{\\text{logit}_i}
-
-        log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
-
-        softmax_j &= \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
-
-    and then cross entropy loss is calculated by softmax and label.
-
-    Args:
-        logits (Variable): The input tensor of unscaled log probabilities.
-        label (Variable): The ground truth  tensor. If :attr:`soft_label`
-            is set to :attr:`True`, Label is a Tensor<float/double> in the 
-            same shape with :attr:`logits`. If :attr:`soft_label` is set to 
-            :attr:`True`, Label is a Tensor<int64> in the same shape with 
-            :attr:`logits` expect shape in dimension :attr:`axis` as 1.
-        soft_label (bool): A flag to indicate whether to interpretate the given
-            labels as soft labels. Default False.
-        ignore_index (int): Specifies a target value that is ignored and does
-                            not contribute to the input gradient. Only valid
-                            if :attr:`soft_label` is set to :attr:`False`. 
-                            Default: kIgnoreIndex
-        numeric_stable_mode (bool): A flag to indicate whether to use a more
-                                    numerically stable algorithm. Only valid
-                                    when :attr:`soft_label` is :attr:`False` 
-                                    and GPU is used. When :attr:`soft_label` 
-                                    is :attr:`True` or CPU is used, the 
-                                    algorithm is always numerically stable.
-                                    Note that the speed may be slower when use
-                                    stable algorithm. Default: True
-        return_softmax (bool): A flag indicating whether to return the softmax
-                               along with the cross entropy loss. Default: False
-        axis (int): The index of dimension to perform softmax calculations. It 
-                    should be in range :math:`[-1, rank - 1]`, while :math:`rank`
-                    is the rank of input :attr:`logits`. Default: -1.
-
-    Returns:
-        Variable or Tuple of two Variables: Return the cross entropy loss if \
-                                            `return_softmax` is False, otherwise the tuple \
-                                            (loss, softmax), softmax is in the same shape \
-                                            with input logits and cross entropy loss is in \
-                                            the same shape with input logits except shape \
-                                            in dimension :attr:`axis` as 1.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            data = fluid.layers.data(name='data', shape=[128], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            fc = fluid.layers.fc(input=data, size=100)
-            out = fluid.layers.softmax_with_cross_entropy(
-                logits=fc, label=label)
-    """
-    helper = LayerHelper('softmax_with_cross_entropy', **locals())
-    softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
-    loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
-    helper.append_op(
-        type='softmax_with_cross_entropy',
-        inputs={'Logits': logits,
-                'Label': label},
-        outputs={'Softmax': softmax,
-                 'Loss': loss},
-        attrs={
-            'soft_label': soft_label,
-            'ignore_index': ignore_index,
-            'numeric_stable_mode': numeric_stable_mode,
-            'axis': axis
-        })
-
-    if return_softmax:
-        return loss, softmax
-
-    return loss
-
-
-def sampled_softmax_with_cross_entropy(logits,
-                                       label,
-                                       num_samples,
-                                       num_true=1,
-                                       remove_accidental_hits=True,
-                                       use_customized_samples=False,
-                                       customized_samples=None,
-                                       customized_probabilities=None,
-                                       seed=0):
-    """
-    **Sampled Softmax With Cross Entropy Operator.**
-
-    Cross entropy loss with sampled softmax is used as the output layer for 
-    larger output classes extensively. This operator samples a number of samples
-    for all examples, and computes the softmax normalized values for each 
-    row of the sampled tensor, after which cross-entropy loss is computed. 
-
-    Because this operator performs a softmax on logits internally, it expects
-    unscaled logits. This operator should not be used with the output of
-    softmax operator since that would produce incorrect results.
-    
-    For examples with T true labels (T >= 1), we assume that each true label has
-    a probability of 1/T. For each sample, S samples are generated using a
-    log uniform distribution. True labels are concatenated with these samples to
-    form T + S samples for each example. So, assume the shape of logits is
-    [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a 
-    probability is calculated, which corresponds to the Q(y|x) in 
-    [Jean et al., 2014](http://arxiv.org/abs/1412.2007).
-    
-    Logits are sampled according to the sampled labels. Then if 
-    remove_accidental_hits is True, if a sample[i, j] accidentally hits true 
-    labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to 
-    make its softmax result close to zero. Then sampled logits are subtracted by
-    logQ(y|x), these sampled logits and re-indexed labels are used to compute 
-    a softmax with cross entropy.
-
-    Args:
-        logits (Variable): The unscaled log probabilities, which is a 2-D tensor
-            with shape [N x K]. N is the batch_size, and K is the class number.
-        label (Variable): The ground truth which is a 2-D tensor. Label is a 
-            Tensor<int64> with shape [N x T], where T is the number of true 
-            labels per example. 
-        num_samples (int): The number for each example, num_samples should be 
-            less than the number of class.
-        num_true(int): The number of target classes per training example.
-        remove_accidental_hits (bool): A flag indicating whether to remove 
-            accidental hits when sampling. If True and if a sample[i, j] 
-            accidentally hits true labels, then the corresponding 
-            sampled_logits[i, j] is minus by 1e20 to make its softmax result 
-            close to zero. Default is True.
-        use_customized_samples (bool): Whether to use custom samples and probabities to sample
-            logits.
-        customized_samples (Variable): User defined samples, which is a 2-D tensor
-            with shape [N, T + S]. S is the num_samples, and T is the number of true 
-            labels per example. 
-        customized_probabilities (Variable): User defined probabilities of samples, 
-            a 2-D tensor which has the same shape with customized_samples.
-        seed (int): The random seed for generating random number, which is used
-            in the process of sampling. Default is 0.
-
-    Returns:
-        Variable: Return the cross entropy loss which is a 2-D tensor with shape
-                  [N x 1].
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            input = fluid.layers.data(name='data', shape=[256], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            fc = fluid.layers.fc(input=input, size=100)
-            out = fluid.layers.sampled_softmax_with_cross_entropy(
-                      logits=fc, label=label, num_samples=25)
-    """
-    helper = LayerHelper('sample_logits', **locals())
-    samples = helper.create_variable_for_type_inference(dtype='int64')
-    probabilities = helper.create_variable_for_type_inference(
-        dtype=logits.dtype)
-    sampled_logits \
-        = helper.create_variable_for_type_inference(dtype=logits.dtype)
-    sampled_label = helper.create_variable_for_type_inference(dtype='int64')
-    sampled_softlabel = helper.create_variable_for_type_inference(
-        dtype=logits.dtype)
-    logits_dim = helper.create_variable_for_type_inference(dtype=logits.dtype)
-    labels_dim = helper.create_variable_for_type_inference(dtype=label.type)
-
-    helper.append_op(
-        type='sample_logits',
-        inputs={
-            'Logits': logits,
-            'Labels': label,
-            'CustomizedSamples': customized_samples,
-            'CustomizedProbabilities': customized_probabilities
-        },
-        outputs={
-            'Samples': samples,
-            'Probabilities': probabilities,
-            'SampledLabels': sampled_label,
-            'SampledLogits': sampled_logits,
-            'LogitsDim': logits_dim,
-            'LabelsDim': labels_dim
-        },
-        attrs={
-            'use_customized_samples': use_customized_samples,
-            'uniq': True,
-            'remove_accidental_hits': remove_accidental_hits,
-            'num_samples': num_samples,
-            'seed': seed
-        })
-    loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
-    softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
-    helper.append_op(
-        type='one_hot',
-        inputs={'X': sampled_label},
-        attrs={'depth': num_samples + 1},
-        outputs={'Out': sampled_softlabel})
-
-    helper.append_op(
-        type='softmax_with_cross_entropy',
-        inputs={'Logits': sampled_logits,
-                'Label': sampled_softlabel},
-        outputs={'Softmax': softmax,
-                 'Loss': loss},
-        attrs={
-            'soft_label': True,
-            'ignore_index': False,
-            'numeric_stable_mode': False
-        })
-    return loss / num_true
-
-
-def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
-    """
-    This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
-    It takes the first dimension of :attr:`x` and :attr:`y` as batch size.
-    For each instance, it computes the smooth L1 loss element by element first
-    and then sums all the losses. So the shape of ouput Variable is
-    [batch_size, 1].
-
-    Args:
-        x (Variable): A tensor with rank at least 2. The input value of smooth
-            L1 loss op with shape [batch_size, dim1, ..., dimN].
-        y (Variable): A tensor with rank at least 2. The target value of smooth
-            L1 loss op with same shape as :attr:`x`.
-        inside_weight (Variable|None):  A tensor with rank at least 2. This
-            input is optional and should have same shape with :attr:`x`. If
-            provided, the result of (:attr:`x` - :attr:`y`) will be multiplied
-            by this tensor element by element.
-        outside_weight (Variable|None): A tensor with rank at least 2. This
-            input is optional and should have same shape with :attr:`x`. If
-            provided, the out smooth L1 loss will be multiplied by this tensor
-            element by element.
-        sigma (float|None): Hyper parameter of smooth L1 loss layer. A float
-           scalar with default value 1.0.
-
-    Returns:
-        Variable: The output smooth L1 loss with shape [batch_size, 1].
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name='data', shape=[128], dtype='float32')
-            label = fluid.layers.data(
-                name='label', shape=[100], dtype='float32')
-            fc = fluid.layers.fc(input=data, size=100)
-            out = fluid.layers.smooth_l1(x=fc, y=label)
-    """
-
-    helper = LayerHelper('smooth_l1_loss', **locals())
-    diff = helper.create_variable_for_type_inference(dtype=x.dtype)
-    loss = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='smooth_l1_loss',
-        inputs={
-            'X': x,
-            'Y': y,
-            'InsideWeight': inside_weight,
-            'OutsideWeight': outside_weight
-        },
-        outputs={'Diff': diff,
-                 'Out': loss},
-        attrs={'sigma': sigma if sigma is not None else 1.0})
-    return loss
-
-
-def one_hot(input, depth, allow_out_of_range=False):
-    """
-    This layer creates the one-hot representations for input indices.
-
-    Args:
-        input(Variable): Input indices, last dimension must be 1.
-        depth(scalar): An interger defining the depth of the one-hot dimension.
-        allow_out_of_range(bool): A bool value indicating whether the input
-            indices could be out of range [0, depth). When input indices are
-            out of range, exceptions is raised if allow_out_of_range is False,
-            or zero-filling representations is created if it is set True
-
-    Returns:
-        Variable: The one-hot representations of input.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            one_hot_label = fluid.layers.one_hot(input=label, depth=10)
-    """
-    helper = LayerHelper("one_hot", **locals())
-
-    one_hot_out = helper.create_variable_for_type_inference(dtype='float32')
-
-    if in_dygraph_mode():
-        inputs = {'X': input}
-        attrs = {'depth': depth}
-    else:
-        if not isinstance(depth, Variable):
-            # user attribute 
-            inputs = {'X': input}
-            attrs = {'depth': depth}
-        else:
-            depth.stop_gradient = True
-            inputs = {'X': input, 'depth_tensor': depth}
-            attrs = {}
-    helper.append_op(
-        type="one_hot",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={'Out': one_hot_out})
-    one_hot_out.stop_gradient = True
-    return one_hot_out
-
-
-def autoincreased_step_counter(counter_name=None, begin=1, step=1):
-    """
-    Create an auto-increase variable
-    which will be automatically increased by 1 every mini-batch
-    Return the run counter of the main program, default is started from 1.
-
-    Args:
-        counter_name(str): The counter name, default is '@STEP_COUNTER@'.
-        begin(int): The first value of this counter.
-        step(int): The increment step between each execution.
-
-    Returns:
-        Variable: The global run counter.
-
-    Examples:
-        .. code-block:: python
-
-           import paddle.fluid as fluid
-           global_step = fluid.layers.autoincreased_step_counter(
-               counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
-    """
-    helper = LayerHelper('global_step_counter')
-    if counter_name is None:
-        counter_name = '@STEP_COUNTER@'
-    counter, is_new_var = helper.create_or_get_global_variable(
-        name=counter_name, dtype='int64', shape=[1], persistable=True)
-    if is_new_var:
-        helper.set_variable_initializer(
-            counter, initializer=Constant(
-                value=begin - 1, force_cpu=True))
-        helper.main_program.global_block()._prepend_op(
-            type='increment',
-            inputs={'X': [counter]},
-            outputs={'Out': [counter]},
-            attrs={'step': float(step)})
-        counter.stop_gradient = True
-
-    return counter
-
-
-def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
-    """
-    Gives a new shape to the input Tensor without changing its data.
-
-    The target shape can be given by :attr:`shape` or :attr:`actual_shape`.
-    :attr:`shape` is a list of integer or tensor variable while :attr:`actual_shape` is a tensor
-    variable. :attr:`actual_shape` has a higher priority than :attr:`shape`
-    if it is provided and it only contains integer, while :attr:`shape` still should be set correctly to
-    gurantee shape inference in compile-time.
-
-    Some tricks exist when specifying the target shape.
-
-    1. -1 means the value of this dimension is inferred from the total element
-    number of x and remaining dimensions. Thus one and only one dimension can
-    be set -1.
-
-    2. 0 means the actual dimension value is going to be copied from the
-    corresponding dimension of x. The indice of 0s in shape can not exceed
-    Rank(X).
-
-    Here are some examples to explain it.
-
-    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    is [6, 8], the reshape operator will transform x into a 2-D tensor with
-    shape [6, 8] and leaving x's data unchanged.
-
-    2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    specified is [2, 3, -1, 2], the reshape operator will transform x into a
-    4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
-    case, one dimension of the target shape is set to -1, the value of this
-    dimension is inferred from the total element number of x and remaining
-    dimensions.
-
-    3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor
-    with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case,
-    besides -1, 0 means the actual dimension value is going to be copied from
-    the corresponding dimension of x.
-
-    **Warning:** the parameter :attr:`actual_shape` will be deprecated in the future and only use :attr:`shape` instead.
-
-    Args:
-        x(variable): The input tensor.
-        shape(list|tuple|Variable): The new shape. At most one dimension of the new shape can
-                     be -1. If :attr:`shape` is a list or tuple, it can contain Variable or not and
-                     the shape of Variable must be [1].
-
-        actual_shape(variable): An optional input. If provided, reshape
-                                according to this given shape rather than
-                                :attr:`shape` specifying shape. That is to
-                                say :attr:`actual_shape` has a higher priority
-                                than :attr:`shape(list|tuple)` but not :attr:`shape(Variable)`. \
-                                This argument :attr:`actual_shape` will be removed in a future version. \
-                                Instructions for updating: :attr:`actual_shape` is deprecated,
-                                only use :attr:`shape` instead.
-        act (str): The non-linear activation to be applied to the reshaped tensor
-                   variable.
-        inplace(bool): If ``inplace`` is `True`, the input and output of ``layers.reshape``
-                       are the same variable, otherwise, the input and output of
-                       ``layers.reshape`` are different variables. Note that if :attr:`x`
-                       is more than one layer's input, ``inplace`` must be :attr:`False`.
-        name (str): The name of this layer. It is optional.
-
-    Returns:
-        Variable: The reshaped tensor variable if :attr:`act` is None. It is a \
-                  new tensor variable if :attr:`inplace` is :attr:`False`, \
-                  otherwise it is :attr:`x`. If :attr:`act` is not None, return \
-                  the activated tensor variable.
-
-    Raises:
-        TypeError: if actual_shape is neither Variable nor None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            # example 1:
-            # attr shape is a list which doesn't contain tensor Variable.
-            data_1 = fluid.layers.data(
-                name='data_1', shape=[2, 4, 6], dtype='float32')
-            reshaped_1 = fluid.layers.reshape(
-                x=data_1, shape=[-1, 0, 3, 2], inplace=True)
-
-            # example 2:
-            # attr shape is a list which contains tensor Variable.
-            data_2 = fluid.layers.fill_constant([2,25], "int32", 3)
-            dim = fluid.layers.fill_constant([1], "int32", 5)
-            reshaped_2 = fluid.layers.reshape(data_2, shape=[dim, 10])
-    """
-
-    if not isinstance(shape, (list, tuple, Variable)):
-        raise TypeError(
-            "Input shape must be an Variable or python list or tuple.")
-
-    if not isinstance(actual_shape, Variable) and (actual_shape is not None):
-        raise TypeError("actual_shape should either be Variable or None.")
-
-    helper = LayerHelper("reshape2", **locals())
-    inputs = {"X": x}
-    attrs = {}
-
-    def contain_var(one_list):
-        for ele in one_list:
-            if isinstance(ele, Variable):
-                return True
-        return False
-
-    def get_new_shape_tensor(list_shape):
-        new_shape_tensor = []
-        for dim in list_shape:
-            if isinstance(dim, Variable):
-                dim.stop_gradient = True
-                new_shape_tensor.append(dim)
-            else:
-                assert (isinstance(dim, int))
-                temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out)
-                new_shape_tensor.append(temp_out)
-        return new_shape_tensor
-
-    def get_attr_shape(list_shape):
-        unk_dim_idx = -1
-        attrs_shape = []
-        for dim_idx, dim_size in enumerate(list_shape):
-            if isinstance(dim_size, Variable):
-                attrs_shape.append(-1)
-            else:
-                attrs_shape.append(dim_size)
-                if dim_size == -1:
-                    assert unk_dim_idx == -1, (
-                        "Only one dimension in shape can be unknown.")
-                    unk_dim_idx = dim_idx
-                elif dim_size == 0:
-                    assert dim_idx < len(x.shape), (
-                        "The indice of 0s in shape can not exceed Rank(X).")
-                else:
-                    assert dim_size > 0, (
-                        "Each dimension size given in shape must not be negtive "
-                        "except one unknown dimension.")
-        return attrs_shape
-
-    if in_dygraph_mode():
-        inputs = {'X': x}
-        attrs = {'shape': shape}
-    else:
-        if isinstance(shape, Variable):
-            shape.stop_gradient = True
-            inputs["Shape"] = shape
-        elif isinstance(shape, (list, tuple)):
-            assert len(shape) > 0, (
-                "The size of argument(shape) can't be zero.")
-            attrs["shape"] = get_attr_shape(shape)
-            if contain_var(shape):
-                inputs['ShapeTensor'] = get_new_shape_tensor(shape)
-            elif isinstance(actual_shape, Variable):
-                actual_shape.stop_gradient = True
-                inputs["Shape"] = actual_shape
-
-    out = x if inplace else helper.create_variable_for_type_inference(
-        dtype=x.dtype)
-    x_shape = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="reshape2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"Out": out,
-                 "XShape": x_shape})
-
-    return helper.append_activation(out)
-
-
-def squeeze(input, axes, name=None):
-    """
-    Remove single-dimensional entries from the shape of a tensor. Takes a
-    parameter axes with a list of axes to squeeze. If axes is not provided, all
-    the single dimensions will be removed from the shape. If an axis is
-    selected with shape entry not equal to one, an error is raised.
-
-    For example:
-
-    .. code-block:: text
-
-        Case 1:
-
-          Given
-            X.shape = (1, 3, 1, 5)
-          and
-            axes = [0]
-          we get:
-            Out.shape = (3, 1, 5)
-
-        Case 2:
-
-          Given
-            X.shape = (1, 3, 1, 5)
-          and
-            axes = []
-          we get:
-            Out.shape = (3, 5)
-
-    Args:
-        input (Variable): The input variable to be squeezed.
-        axes (list): List of integers, indicating the dimensions to be squeezed.
-        name (str|None): Name for this layer.
-
-    Returns:
-        Variable: Output squeezed variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            x = layers.data(name='x', shape=[5, 1, 10])
-            y = layers.squeeze(input=x, axes=[1])
-    """
-    assert not in_dygraph_mode(), (
-        "squeeze layer is not supported in dygraph mode yet.")
-    helper = LayerHelper("squeeze", **locals())
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type="squeeze2",
-        inputs={"X": input},
-        attrs={"axes": axes},
-        outputs={"Out": out,
-                 "XShape": x_shape})
-
-    return out
-
-
-def unsqueeze(input, axes, name=None):
-    """
-    Insert single-dimensional entries to the shape of a tensor. Takes one
-    required argument axes, a list of dimensions that will be inserted.
-    Dimension indices in axes are as seen in the output tensor.
-
-    For example:
-
-    .. code-block:: text
-
-      Given a tensor such that tensor with shape [3, 4, 5],
-      then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
-
-    Args:
-        input (Variable): The input variable to be unsqueezed.
-        axes (list): List of integers, indicating the dimensions to be inserted.
-        name (str|None): Name for this layer.
-
-    Returns:
-        Variable: Output unsqueezed variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[5, 10])
-            y = fluid.layers.unsqueeze(input=x, axes=[1])
-    """
-    helper = LayerHelper("unsqueeze", **locals())
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type="unsqueeze2",
-        inputs={"X": input},
-        attrs={"axes": axes},
-        outputs={"Out": out,
-                 "XShape": x_shape})
-
-    return out
-
-
-def lod_reset(x, y=None, target_lod=None):
-    """
-    Set LoD of :attr:`x` to a new one specified by :attr:`y` or
-    :attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be
-    considered as target LoD first, otherwise :attr:`y.data` would be
-    considered as target LoD. If :attr:`y` is not provided, target LoD should
-    be specified by :attr:`target_lod`. If target LoD is specified by
-    :attr:`y.data` or :attr:`target_lod`, only one level LoD is supported.
-
-    .. code-block:: text
-
-        * Example 1:
-
-            Given a 1-level LoDTensor x:
-                x.lod =  [[ 2,           3,                   1 ]]
-                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-                x.dims = [6, 1]
-
-            target_lod: [4, 2]
-
-            then we get a 1-level LoDTensor:
-                out.lod =  [[4,                          2]]
-                out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-                out.dims = [6, 1]
-
-        * Example 2:
-
-            Given a 1-level LoDTensor x:
-                x.lod =  [[2,            3,                   1]]
-                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-                x.dims = [6, 1]
-
-            y is a Tensor:
-                y.data = [[2, 4]]
-                y.dims = [1, 3]
-
-            then we get a 1-level LoDTensor:
-                out.lod =  [[2,            4]]
-                out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-                out.dims = [6, 1]
-
-        * Example 3:
-
-            Given a 1-level LoDTensor x:
-                x.lod =  [[2,            3,                   1]]
-                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-                x.dims = [6, 1]
-
-            y is a 2-level LoDTensor:
-                y.lod =  [[2, 2], [2, 2, 1, 1]]
-                y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]]
-                y.dims = [6, 1]
-
-            then we get a 2-level LoDTensor:
-                out.lod =  [[2, 2], [2, 2, 1, 1]]
-                out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-                out.dims = [6, 1]
-
-    Args:
-        x (Variable): Input variable which could be a Tensor or LoDTensor.
-        y (Variable|None): If provided, output's LoD would be derived
-                           from :attr:`y`.
-        target_lod (list|tuple|None): One level LoD which should be considered
-                                      as target LoD when :attr:`y` not provided.
-
-    Returns:
-        Variable: Output variable with LoD specified by this layer.
-
-    Raises:
-        ValueError: If :attr:`y` and :attr:`target_lod` are both None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[10])
-            y = fluid.layers.data(name='y', shape=[10, 20], lod_level=2)
-            out = fluid.layers.lod_reset(x=x, y=y)
-    """
-    helper = LayerHelper("lod_reset", **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    if y is not None:
-        helper.append_op(
-            type="lod_reset", inputs={'X': x,
-                                      'Y': y}, outputs={'Out': out})
-    elif target_lod is not None:
-        helper.append_op(
-            type="lod_reset",
-            inputs={'X': x},
-            attrs={'target_lod': target_lod},
-            outputs={'Out': out})
-    else:
-        raise ValueError("y and target_lod should not be both none.")
-    return out
-
-
-def lod_append(x, level):
-    """
-    Append level to LoD of :attr:`x`.
-
-    .. code-block:: text
-
-        * Example 1:
-
-            given a 1-level LoDTensor x:
-                x.lod =  [[ 2,           3,                   1 ]]
-                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-                x.dims = [6, 1]
-
-            level: [1, 1, 1, 1, 1, 1, 1]
-
-            then we get a 2-level LoDTensor:
-                x.lod =  [[ 2, 3, 1 ], [1, 1, 1, 1, 1, 1]]
-                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
-                x.dims = [6, 1]
-
-    Args:
-        x (Variable): Input variable which could be a tensor or LoDTensor.
-        level (list|tuple|Variable): The LoD level to be appended into LoD of x.
-
-    Returns:
-        Variable: Output variable with new LoD level.
-
-    Raises:
-        ValueError: If :attr:`y` is None or and :attr:`level` is not Iterator.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[6, 10], lod_level=1)
-            out = fluid.layers.lod_append(x, [1,1,1,1,1,1])
-    """
-    from collections import Iterable
-    if x is None:
-        raise ValueError("Input(x) can't be None.")
-    if (not isinstance(level, Iterable)) and (not isinstance(level, Variable)):
-        raise ValueError("Input(level) must be list, tuple or Variable.")
-
-    helper = LayerHelper("lod_append", **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    inputs = {'X': x}
-    attrs = {'append': True}
-
-    if isinstance(level, Variable):
-        inputs['Y'] = level
-    else:
-        attrs['target_lod'] = level
-    helper.append_op(
-        type="lod_reset", inputs=inputs, attrs=attrs, outputs={'Out': out})
-    return out
-
-
-def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
-    """
-    Local Response Normalization Layer. This layer performs a type of
-    "lateral inhibition" by normalizing over local input regions.
-
-    The formula is as follows:
-
-    .. math::
-
-      Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C-1, i + n/2)}_{j = \\max(0, i - n/2)}(Input(j, x, y))^2\\right)^{\\beta}
-
-    In the above equation:
-
-    * :math:`n`: The number of channels to sum over.
-    * :math:`k`: The offset (avoid being divided by 0).
-    * :math:`alpha`: The scaling parameter.
-    * :math:`beta`: The exponent parameter.
-
-    Refer to `ImageNet Classification with Deep Convolutional Neural Networks
-    <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
-
-    Args:
-        input (Variable): The input tensor of this layer, and the dimension of input tensor must be 4.
-        n (int, default 5): The number of channels to sum over.
-        k (float, default 1.0): An offset (usually positive to avoid dividing by 0).
-        alpha (float, default 1e-4): The scaling parameter.
-        beta (float, default 0.75): The exponent.
-        name (str, default None): A name for this operation.
-
-    Raises:
-        ValueError: If rank of the input tensor is not 4.
-
-    Returns:
-        A tensor variable storing the transformation result.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(
-              name="data", shape=[3, 112, 112], dtype="float32")
-          lrn = fluid.layers.lrn(input=data)
-    """
-    helper = LayerHelper('lrn', **locals())
-    dtype = helper.input_dtype()
-    input_shape = input.shape
-    dims = len(input_shape)
-
-    if dims != 4:
-        raise ValueError(
-            "dims of input must be 4(not %d), and it's order must be NCHW" %
-            (dims))
-
-    mid_out = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-    lrn_out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="lrn",
-        inputs={"X": input},
-        outputs={
-            "Out": lrn_out,
-            "MidOut": mid_out,
-        },
-        attrs={"n": n,
-               "k": k,
-               "alpha": alpha,
-               "beta": beta})
-
-    return lrn_out
-
-
-def pad(x, paddings, pad_value=0., name=None):
-    """
-    Pads a tensor with a constant value given by :attr:`pad_value`, and the
-    padded width is specified by :attr:`paddings`.
-
-    Specifically, the number of values padded before the contents of :attr:`x`
-    in dimension :attr:`i` is indicated by :attr:`paddings[2i]`, and the number
-    of values padded after the contents of :attr:`x` in dimension :attr:`i` is
-    indicated by :attr:`paddings[2i+1]`.
-
-    See below for an example.
-
-    .. code-block:: text
-
-        Given:
-            x = [[1, 2], [3, 4]]
-
-            paddings = [0, 1, 1, 2]
-
-            pad_value = 0
-
-        Return:
-
-            out = [[0, 1, 2, 0, 0]
-                   [0, 3, 4, 0, 0]
-                   [0, 0, 0, 0, 0]]
-
-    Args:
-        x (Variable): The input tensor variable.
-        paddings (list): A list of integers. Its elements specify the padded
-                         width before and after for each dimension in turn.
-                         The length of :attr:paddings must be
-                         :math:`rank(x) \\times 2`.
-        pad_value (float): The constant value used to pad.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The padded tensor variable.
-
-    Examples:
-        .. code-block:: python
-
-            # x is a rank 2 tensor variable.
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='data', shape=[224], dtype='float32')
-            out = fluid.layers.pad(
-                x=x, paddings=[0, 1, 1, 2], pad_value=0.)
-    """
-    helper = LayerHelper('pad', input=x, **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='pad',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'paddings': paddings,
-               'pad_value': float(pad_value)})
-    return out
-
-
-def pad_constant_like(x, y, pad_value=0., name=None):
-    """
-    Pad input(Y) with :attr:`pad_value`, the number of values padded to
-    the edges of each axis is specified by the difference of the shape
-    of X and Y. ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n))
-    unique pad widths for each axis. The input should be a k-D
-    tensor(k > 0 and k < 7).
-
-    See below for an example.
-
-    .. code-block:: text
-
-        Given:
-            X = [[[[ 0,  1,  2],
-                   [ 3,  4,  5]],
-                  [[ 6,  7,  8],
-                   [ 9, 10, 11]],
-                  [[12, 13, 14],
-                   [15, 16, 17]]],
-                 [[[18, 19, 20],
-                   [21, 22, 23]],
-                  [[24, 25, 26],
-                   [27, 28, 29]],
-                  [[30, 31, 32],
-                   [33, 34, 35]]]]
-            X.shape = (2, 3, 2, 3)
-
-            Y = [[[[35, 36, 37]],
-                  [[38, 39, 40]],
-                  [[41, 42, 43]]]]
-            Y.shape = (1, 3, 1, 3)
-		And
-            pad_value = -1,
-
-        Return:
-            Out = [[[[35, 36, 37],
-                     [-1, -1, -1]],
-                    [[38, 39, 40],
-                     [-1, -1, -1]],
-                    [[41, 42, 43],
-                     [-1, -1, -1]]],
-                  [[[-1, -1, -1],
-                    [-1, -1, -1]],
-                   [[-1, -1, -1],
-                    [-1, -1, -1]],
-                   [[-1, -1, -1],
-                    [-1, -1, -1]]]]
-            Out.shape = (2, 3, 2, 3)
-
-    Args:
-        x (Variable): The input tensor variable.
-        y (Variable): The input tensor variable.
-        pad_value (float): The constant value used to pad.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The padded tensor variable.
-
-    Examples:
-        .. code-block:: python
-
-            # x is a rank 4 tensor variable, x.shape = (2, 3, 2, 3)
-            # y is a rank 4 tensor variable, y.shape = (1, 3, 1, 3)
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[2,3,2,3], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[1,3,1,3], dtype='float32')
-            out = fluid.layers.pad_constant_like(x=x, y=y, pad_value=0.)
-            # out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3]
-    """
-    helper = LayerHelper('pad_constant_like', input=x, **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='pad_constant_like',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out},
-        attrs={'pad_value': float(pad_value)})
-    return out
-
-
-def label_smooth(label,
-                 prior_dist=None,
-                 epsilon=0.1,
-                 dtype="float32",
-                 name=None):
-    """
-    Label smoothing is a mechanism to regularize the classifier layer and is
-    called label-smoothing regularization (LSR).
-
-    Label smoothing is proposed to encourage the model to be less confident,
-    since optimizing the log-likelihood of the correct label directly may
-    cause overfitting and reduce the ability of the model to adapt. Label
-    smoothing replaces the ground-truth label :math:`y` with the weighted sum
-    of itself and some fixed distribution :math:`\mu`. For class :math:`k`,
-    i.e.
-
-    .. math::
-
-        \\tilde{y_k} = (1 - \epsilon) * y_k + \epsilon * \mu_k,
-
-    where :math:`1 - \epsilon` and :math:`\epsilon` are the weights
-    respectively, and :math:`\\tilde{y}_k` is the smoothed label. Usually
-    uniform distribution is used for :math:`\mu`.
-
-    See more details about label smoothing in https://arxiv.org/abs/1512.00567.
-
-    Args:
-        label(Variable): The input variable containing the label data. The
-                          label data should use one-hot representation.
-        prior_dist(Variable): The prior distribution to be used to smooth
-                              labels. If not provided, an uniform distribution
-                              is used. The shape of :attr:`prior_dist` should
-                              be :math:`(1, class\_num)`.
-        epsilon(float): The weight used to mix up the original ground-truth
-                        distribution and the fixed distribution.
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32,
-                                                  float_64, int etc.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The tensor variable containing the smoothed labels.
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-
-            label = layers.data(name="label", shape=[1], dtype="float32")
-            one_hot_label = layers.one_hot(input=label, depth=10)
-            smooth_label = layers.label_smooth(
-                label=one_hot_label, epsilon=0.1, dtype="float32")
-    """
-    if epsilon > 1. or epsilon < 0.:
-        raise ValueError("The value of epsilon must be between 0 and 1.")
-    helper = LayerHelper("label_smooth", **locals())
-    label.stop_gradient = True
-    smooth_label = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="label_smooth",
-        inputs={"X": label,
-                "PriorDist": prior_dist} if prior_dist else {"X": label},
-        outputs={"Out": smooth_label},
-        attrs={"epsilon": float(epsilon)})
-    return smooth_label
-
-
-@templatedoc()
-def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
-    """
-    ${comment}
-
-    Args:
-        input (Variable): ${x_comment}
-        rois (Variable): ROIs (Regions of Interest) to pool over.It should be
-                         a 2-D LoDTensor of shape (num_rois, 4), the lod level
-                         is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
-                         the top left coordinates, and (x2, y2) is the bottom
-                         right coordinates.
-        pooled_height (integer): ${pooled_height_comment} Default: 1
-        pooled_width (integer): ${pooled_width_comment} Default: 1
-        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
-
-    Returns:
-        Variable: ${out_comment}.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            x = fluid.layers.data(
-                name='x', shape=[8, 112, 112], dtype='float32')
-            rois = fluid.layers.data(
-                name='roi', shape=[4], lod_level=1, dtype='float32')
-            pool_out = fluid.layers.roi_pool(
-                input=x,
-                rois=rois,
-                pooled_height=7,
-                pooled_width=7,
-                spatial_scale=1.0)
-
-    """
-    helper = LayerHelper('roi_pool', **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_variable_for_type_inference(dtype)
-    argmaxes = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type="roi_pool",
-        inputs={"X": input,
-                "ROIs": rois},
-        outputs={"Out": pool_out,
-                 "Argmax": argmaxes},
-        attrs={
-            "pooled_height": pooled_height,
-            "pooled_width": pooled_width,
-            "spatial_scale": spatial_scale
-        })
-    return pool_out
-
-
-@templatedoc()
-def roi_align(input,
-              rois,
-              pooled_height=1,
-              pooled_width=1,
-              spatial_scale=1.0,
-              sampling_ratio=-1,
-              name=None):
-    """
-    ${comment}
-
-    Args:
-        input (Variable): ${x_comment}
-        rois (Variable): ROIs (Regions of Interest) to pool over.It should be
-                         a 2-D LoDTensor of shape (num_rois, 4), the lod level
-                         is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
-                         the top left coordinates, and (x2, y2) is the bottom
-                         right coordinates. 
-        pooled_height (integer): ${pooled_height_comment} Default: 1
-        pooled_width (integer): ${pooled_width_comment} Default: 1
-        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
-        sampling_ratio(intger): ${sampling_ratio_comment} Default: -1
-
-    Returns:
-        Variable: ${out_comment}.
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(
-                name='data', shape=[256, 32, 32], dtype='float32')
-            rois = fluid.layers.data(
-                name='rois', shape=[4], dtype='float32')
-            align_out = fluid.layers.roi_align(input=x,
-                                               rois=rois,
-                                               pooled_height=7,
-                                               pooled_width=7,
-                                               spatial_scale=0.5,
-                                               sampling_ratio=-1)
-    """
-    helper = LayerHelper('roi_align', **locals())
-    dtype = helper.input_dtype()
-    align_out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="roi_align",
-        inputs={"X": input,
-                "ROIs": rois},
-        outputs={"Out": align_out},
-        attrs={
-            "pooled_height": pooled_height,
-            "pooled_width": pooled_width,
-            "spatial_scale": spatial_scale,
-            "sampling_ratio": sampling_ratio
-        })
-    return align_out
-
-
-def dice_loss(input, label, epsilon=0.00001):
-    """
-    Dice loss for comparing the similarity of two batch of data,
-    usually is used for binary image segmentation i.e. labels are binary.
-    The dice loss can be defined as below equation:
-
-    .. math::
-
-        dice\_loss &= 1 - \\frac{2 * intersection\_area}{total\_area} \\\\
-                  &= \\frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\\\
-                  &= \\frac{(union\_area - intersection\_area)}{total\_area}
-
-
-    Args:
-        input (Variable): The predictions with rank>=2. The first dimension is batch size,
-                          and the last dimension is class number.
-        label (Variable): The groud truth with the same rank with input. The first dimension
-                          is batch size, and the last dimension is 1.
-        epsilon (float): The epsilon will be added to the numerator and denominator.
-                         If both input and label are empty, it makes sure dice is 1.
-                         Default: 0.00001
-
-    Returns:
-        dice_loss (Variable): The dice loss with shape [1].
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='data', shape = [3, 224, 224, 2], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[3, 224, 224, 1], dtype='float32')
-            predictions = fluid.layers.softmax(x)
-            loss = fluid.layers.dice_loss(input=predictions, label=label)
-    """
-    label = one_hot(label, depth=input.shape[-1])
-    reduce_dim = list(range(1, len(input.shape)))
-    inse = reduce_sum(input * label, dim=reduce_dim)
-    dice_denominator = reduce_sum(
-        input, dim=reduce_dim) + reduce_sum(
-            label, dim=reduce_dim)
-    dice_score = 1 - inse * 2 / (dice_denominator + epsilon)
-    return reduce_mean(dice_score)
-
-
-def image_resize(input,
-                 out_shape=None,
-                 scale=None,
-                 name=None,
-                 resample='BILINEAR',
-                 actual_shape=None,
-                 align_corners=True,
-                 align_mode=1,
-                 data_format='NCHW'):
-    """
-    **Resize a Batch of Images**
-
-    The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w) 
-    or (num_batches, in_h, in_w, channels), or a 5-D Tensor of the shape 
-    (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), 
-    and the resizing only applies on the three dimensions(depth, hight and width).
-
-    **Warning:** the parameter :attr:`actual_shape` will be deprecated in the
-    future and only use :attr:`out_shape` instead.
-
-    Supporting resample methods:
-
-        'BILINEAR' : Bilinear interpolation
-
-        'TRILINEAR' : Trilinear interpolation
-
-        'NEAREST' : Nearest neighbor interpolation
-
-    Nearest neighbor interpolation is to perform nearest neighbor interpolation
-    in both the 3rd dimention(in height direction) and the 4th dimention(in width 
-    direction) on input tensor.
-            
-    Bilinear interpolation is an extension of linear interpolation for 
-    interpolating functions of two variables (e.g. H-direction and 
-    W-direction in this op) on a rectilinear 2D grid. The key idea is 
-    to perform linear interpolation first in one direction, and then 
-    again in the other direction.
-
-    Trilinear interpolation is an extension of linear interpolation for 
-    interpolating functions of three variables (e.g. D-direction, 
-    H-direction and W-direction in this op) on a rectilinear 3D grid. 
-    The linear interpolation is performed on three directions.
-
-    Align_corners and align_mode are optinal parameters,the calculation method 
-    of interpolation can be selected by them.
-
-    Example:
-
-    .. code-block:: text
-
-        For scale:
-          
-            if align_corners = True && out_size > 1 :
-
-              scale_factor = (in_size-1.0)/(out_size-1.0)
-            
-            else:
-              
-              scale_factor = float(in_size/out_size)
-            
-          
-        Nearest neighbor interpolation:
-          
-          if:
-              align_corners = False
-
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-
-              H_out = floor (H_{in} * scale_{factor})
-              W_out = floor (W_{in} * scale_{factor})
-
-          else:
-              align_corners = True
-
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-
-              H_out = round(H_{in} * scale_{factor})
-              W_out = round(W_{in} * scale_{factor})
-
-        Bilinear interpolation:
-
-          if:
-              align_corners = False , align_mode = 0
-              
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-              
-              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
-
-          else:
-           
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-
-              H_out = H_{in} * scale_{factor}
-              W_out = W_{in} * scale_{factor}
-
-        Trilinear interpolation:
-
-          if:
-              align_corners = False , align_mode = 0
-              
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-              
-              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
-              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
-
-
-          else:
-           
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-
-              D_out = D_{in} * scale_{factor}
-              H_out = H_{in} * scale_{factor}
-              W_out = W_{in} * scale_{factor}
-          
-    For details of nearest neighbor interpolation, please refer to Wikipedia: 
-    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
-
-    For details of bilinear interpolation, please refer to Wikipedia: 
-    https://en.wikipedia.org/wiki/Bilinear_interpolation.
-
-    For details of trilinear interpolation, please refer to Wikipedia: 
-    https://en.wikipedia.org/wiki/Trilinear_interpolation.
-
-
-
-    Args:
-        input (Variable): 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
-                          its data format is specified by :attr:`data_format`.
-        out_shape(list|tuple|Variable|None): Output shape of image resize
-             layer, the shape is (out_h, out_w) when input is a 4-D Tensor and is
-             (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If 
-             a list, each element can be an integer or a Tensor Variable of shape: [1].
-             If a Tensor Variable, its dimensions size should be a 1.
-        scale(float|Variable|None): The multiplier for the input height or width. At
-             least one of :attr:`out_shape` or :attr:`scale` must be set.
-             And :attr:`out_shape` has a higher priority than :attr:`scale`.
-             Default: None.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-        resample(str): The resample method. It supports 'BILINEAR', 'TRILINEAR'
-                       and 'NEAREST' currently. Default: 'BILINEAR'
-        actual_shape(Variable): An optional input to specify output shape
-                                dynamically. If provided, image resize
-                                according to this given shape rather than
-                                :attr:`out_shape` and :attr:`scale` specifying
-                                shape. That is to say actual_shape has the
-                                highest priority. It is recommended to use
-                                :attr:`out_shape` if you want to specify output 
-                                shape dynamically, because :attr:`actual_shape` 
-                                will be deprecated. When using actual_shape to 
-                                specify output shape, one of :attr:`out_shape` 
-                                and :attr:`scale` should also be set, otherwise 
-                                errors would be occured in graph constructing stage.
-                                Default: None
-        align_corners(bool) :  An optional bool, If True, the centers of the 4 corner pixels of the 
-                               input and output tensors are aligned, preserving the values at the 
-                               corner pixels.
-                               Default: True
-        align_mode(int)  :  An optional for bilinear interpolation. can be \'0\' 
-                            for src_idx = scale*(dst_indx+0.5)-0.5 , can be \'1\' for 
-                            src_idx = scale*dst_index.
-        data_format(str, optional): NCHW(num_batches, channels, height, width) or 
-                                    NHWC(num_batches, height, width, channels) for 4-D Tensor,
-                                    NCDHW(num_batches, channels, depth, height, width) or 
-                                    NDHWC(num_batches, depth, height, width, channels) for 5-D Tensor.
-                                    Default: 'NCHW'.
-
-    Returns:
-        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
-        or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
-
-    Raises:
-        TypeError: out_shape should be a list or tuple or Variable.
-        TypeError: actual_shape should either be Variable or None.
-        ValueError: The 'resample' of image_resize can only be 'BILINEAR',
-                    'TRILINEAR' or 'NEAREST' currently.
-        ValueError: 'BILINEAR' and 'NEAREST' only support 4-D tensor.
-        ValueError: 'TRILINEAR' only support 5-D tensor.
-        ValueError: One of out_shape and scale must not be None.
-        ValueError: out_shape length should be 2 for input 4-D tensor.
-        ValueError: out_shape length should be 3 for input 5-D tensor.
-        ValueError: scale should be greater than zero.
-        TypeError: align_corners shoule be a bool value
-        ValueError: align_mode can only be '0' or '1'
-        ValueError: data_format can only be 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[3, 6, 9], dtype="float32")
-            # input.shape = [-1, 3, 6, 9], where -1 indicates batch size, and it will get the exact value in runtime.
-
-            out0 = fluid.layers.image_resize(input, out_shape=[12, 12], resample="NEAREST")
-            # out0.shape = [-1, 3, 12, 12], it means out0.shape[0] = input.shape[0] in runtime.
-
-            # out_shape is a list in which each element is a integer or a tensor Variable
-            dim1 = fluid.layers.data(name="dim1", shape=[1], dtype="int32", append_batch_size=False)
-            out1 = fluid.layers.image_resize(input, out_shape=[12, dim1], resample="NEAREST")
-            # out1.shape = [-1, 3, 12, -1]
-
-            # out_shape is a 1-D tensor Variable
-            shape_tensor = fluid.layers.data(name="shape_tensor", shape=[2], dtype="int32", append_batch_size=False)
-            out2 = fluid.layers.image_resize(input, out_shape=shape_tensor, resample="NEAREST")
-            # out2.shape = [-1, 3, -1, -1]
-
-            # when use actual_shape
-            actual_shape_tensor = fluid.layers.data(name="actual_shape_tensor", shape=[2], dtype="int32", append_batch_size=False)
-            out3 = fluid.layers.image_resize(input, out_shape=[4, 4], resample="NEAREST", actual_shape=actual_shape_tensor)
-            # out3.shape = [-1, 3, 4, 4]
-
-            # scale is a Variable
-            scale_tensor = fluid.layers.data(name="scale", shape=[1], dtype="float32", append_batch_size=False)
-            out4 = fluid.layers.image_resize(input, scale=scale_tensor)
-            # out4.shape = [-1, 3, -1, -1]
-
-    """
-    resample_methods = {
-        'BILINEAR': 'bilinear',
-        'TRILINEAR': 'trilinear',
-        'NEAREST': 'nearest',
-    }
-    if resample not in resample_methods:
-        raise ValueError(
-            "The 'resample' of image_resize can only be 'BILINEAR', 'TRILINEAR' "
-            "or 'NEAREST' currently.")
-    resample_type = resample_methods[resample]
-
-    if resample in ['BILINEAR', 'NEAREST'] and len(input.shape) != 4:
-        raise ValueError("'BILINEAR' and 'NEAREST' only support 4-D tensor.")
-    if resample == 'TRILINEAR' and len(input.shape) != 5:
-        raise ValueError("'TRILINEAR'only support 5-D tensor.")
-
-    if not isinstance(align_corners, bool):
-        raise TypeError("Attr align_corners should be a bool value")
-    if align_mode != 0 and align_mode != 1:
-        raise ValueError("align_mode can only be 0 or 1")
-
-    if out_shape is None and scale is None:
-        raise ValueError("One of out_shape and scale must not be None.")
-    helper = LayerHelper('{}_interp'.format(resample_type), **locals())
-    dtype = helper.input_dtype()
-
-    if len(input.shape) == 4 and data_format not in ['NCHW', 'NHWC']:
-        raise ValueError(
-            "Got wrong value for param `data_format`: " + data_format +
-            " received but only `NCHW` or `NHWC` supported for 4-D input.")
-    elif len(input.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']:
-        raise ValueError(
-            "Got wrong value for param `data_format`: " + data_format +
-            " received but only `NCDHW` or `NDHWC` supported for 5-D input.")
-
-    def _is_list_or_turple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
-
-    if data_format == 'NCHW' or data_format == 'NCDHW':
-        data_layout = 'NCHW'
-    if data_format == 'NHWC' or data_format == 'NDHWC':
-        data_layout = 'NHWC'
-
-    inputs = {"X": input}
-    attrs = {
-        "out_d": -1,
-        "out_h": -1,
-        "out_w": -1,
-        "interp_method": resample_type,
-        "align_corners": align_corners,
-        "align_mode": align_mode,
-        "data_layout": data_layout
-    }
-
-    if out_shape is not None:
-        if isinstance(out_shape, Variable):
-            out_shape.stop_gradient = True
-            inputs['OutSize'] = out_shape
-        else:
-            if not (_is_list_or_turple_(out_shape)):
-                raise TypeError(
-                    "out_shape should be a list or tuple or Variable.")
-            # Validate the shape
-            contain_var = False
-            for dim_idx, dim_size in enumerate(out_shape):
-                if isinstance(dim_size, Variable):
-                    contain_var = True
-                    continue
-                assert dim_size > 0, (
-                    "Each dimension size given in out_shape must be greater than 0."
-                )
-
-            if contain_var:
-                new_size_tensor = []
-                size_list = []
-                for dim in out_shape:
-                    if isinstance(dim, Variable):
-                        dim.stop_gradient = True
-                        new_size_tensor.append(dim)
-                        size_list.append(-1)
-                    else:
-                        assert (isinstance(dim, int))
-                        temp_out = helper.create_variable_for_type_inference(
-                            'int32')
-                        fill_constant(
-                            [1], 'int32', dim, force_cpu=True, out=temp_out)
-                        new_size_tensor.append(temp_out)
-                        size_list.append(dim)
-                inputs['SizeTensor'] = new_size_tensor
-
-            if len(input.shape) == 4:
-                if len(out_shape) != 2:
-                    raise ValueError("out_shape length should be 2 for "
-                                     "input 4-D tensor.")
-                if contain_var:
-                    attrs['out_h'] = size_list[0]
-                    attrs['out_w'] = size_list[1]
-                else:
-                    out_shape = list(map(int, out_shape))
-                    attrs['out_h'] = out_shape[0]
-                    attrs['out_w'] = out_shape[1]
-            if len(input.shape) == 5:
-                if len(out_shape) != 3:
-                    raise ValueError("out_shape length should be 3 for "
-                                     "input 5-D tensor.")
-                if contain_var:
-                    attrs['out_d'] = size_list[0]
-                    attrs['out_h'] = size_list[1]
-                    attrs['out_w'] = size_list[2]
-                else:
-                    out_shape = list(map(int, out_shape))
-                    attrs['out_d'] = out_shape[0]
-                    attrs['out_h'] = out_shape[1]
-                    attrs['out_w'] = out_shape[2]
-
-    else:
-        if isinstance(scale, Variable):
-            scale.stop_gradient = True
-            inputs["Scale"] = scale
-        if isinstance(scale, float):
-            if scale <= 0:
-                raise ValueError("scale should be greater than zero.")
-            attrs['scale'] = float(scale)
-
-    if isinstance(actual_shape, Variable):
-        warnings.warn(
-            "actual_shape will be deprecated, it is recommended to use "
-            "out_shape instead of actual_shape to specify output shape dynamically."
-        )
-        actual_shape.stop_gradient = True
-        inputs["OutSize"] = actual_shape
-    elif actual_shape is not None:
-        raise TypeError("actual_shape should either be Variable or None.")
-
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='{}_interp'.format(resample_type),
-        inputs=inputs,
-        outputs={"Out": out},
-        attrs=attrs)
-    return out
-
-
-@templatedoc(op_type="bilinear_interp")
-def resize_bilinear(input,
-                    out_shape=None,
-                    scale=None,
-                    name=None,
-                    actual_shape=None,
-                    align_corners=True,
-                    align_mode=1,
-                    data_format='NCHW'):
-    """
-    Resize input by performing bilinear interpolation based on given
-    output shape which specified by actual_shape, out_shape and scale
-    in priority order.
-
-    **Warning:** the parameter :attr:`actual_shape` will be deprecated in 
-    the future and only use :attr:`out_shape` instead.
-
-    Bilinear interpolation is an extension of linear interpolation for
-    interpolating functions of two variables (e.g. H-direction and
-    W-direction in this op) on a rectilinear 2D grid. The key idea is
-    to perform linear interpolation first in one direction, and then
-    again in the other direction.
-
-    For details of bilinear interpolation, please refer to Wikipedia:
-    https://en.wikipedia.org/wiki/Bilinear_interpolation
-
-    Align_corners and align_mode are optinal parameters,the calculation 
-    method of interpolation can be selected by them.
-
-    Example:
-
-    .. code-block:: text
-
-        For scale:
-          
-            if align_corners = True && out_size > 1 :
-
-              scale_factor = (in_size-1.0)/(out_size-1.0)
-            
-            else:
-              
-              scale_factor = float(in_size/out_size)
-
-        Bilinear interpolation:
-
-          if:
-              align_corners = False , align_mode = 0
-              
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-              
-              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
-
-          else:
-
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-              H_out = H_{in} * scale_{factor}
-              W_out = W_{in} * scale_{factor}
-
-    Args:
-        input(${x_type}): 4-D Tensor, its data type is float32, float64, or uint8,
-                          its data format is specified by :attr:`data_format`.
-        out_shape(list|tuple|Variable|None): Output shape of resize bilinear
-            layer, the shape is (out_h, out_w).Default: None. If a list, each 
-            element can be an integer or a Tensor Variable with shape: [1]. If a 
-            Tensor Variable, its dimension size should be 1.
-        scale(float|Variable|None): The multiplier for the input height or width. At
-             least one of :attr:`out_shape` or :attr:`scale` must be set. 
-             And :attr:`out_shape` has a higher priority than :attr:`scale`. 
-             Default: None.
-        name(str|None): The output variable name.
-        actual_shape(Variable): An optional input to specify output shape
-                                dynamically. If provided, image resize
-                                according to this given shape rather than
-                                :attr:`out_shape` and :attr:`scale` specifying
-                                shape. That is to say actual_shape has the
-                                highest priority. It is recommended to use
-                                :attr:`out_shape` if you want to specify output 
-                                shape dynamically, because :attr:`actual_shape` 
-                                will be deprecated. When using actual_shape to 
-                                specify output shape, one of :attr:`out_shape` 
-                                and :attr:`scale` should also be set, otherwise 
-                                errors would be occured in graph constructing stage.
-                                Default: None
-        align_corners(bool): ${align_corners_comment}
-        align_mode(bool): ${align_mode_comment}
-        data_format(str, optional): NCHW(num_batches, channels, height, width) or 
-                                    NHWC(num_batches, height, width, channels). Default: 'NCHW'.
-
-    Returns:
-        A 4-D Tensor in shape of (num_batches, channels, out_h, out_w) or
-        (num_batches, out_h, out_w, channels).
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[3, 6, 9], dtype="float32")
-            # input.shape = [-1, 3, 6, 9], where -1 indicates batch size, and it will get the exact value in runtime.
-
-            out0 = fluid.layers.resize_bilinear(input, out_shape=[12, 12])
-            # out0.shape = [-1, 3, 12, 12], it means out0.shape[0] = input.shape[0] in runtime.
-
-            # out_shape is a list in which each element is a integer or a tensor Variable
-            dim1 = fluid.layers.data(name="dim1", shape=[1], dtype="int32", append_batch_size=False)
-            out1 = fluid.layers.resize_bilinear(input, out_shape=[12, dim1])
-            # out1.shape = [-1, 3, 12, -1]
-
-            # out_shape is a 1-D tensor Variable
-            shape_tensor = fluid.layers.data(name="shape_tensor", shape=[2], dtype="int32", append_batch_size=False)
-            out2 = fluid.layers.resize_bilinear(input, out_shape=shape_tensor)
-            # out2.shape = [-1, 3, -1, -1]
-
-            # when use actual_shape
-            actual_shape_tensor = fluid.layers.data(name="actual_shape_tensor", shape=[2], dtype="int32", append_batch_size=False)
-            out3 = fluid.layers.resize_bilinear(input, out_shape=[4, 4], actual_shape=actual_shape_tensor)
-            # out3.shape = [-1, 3, 4, 4]
-
-            # scale is a Variable
-            scale_tensor = fluid.layers.data(name="scale", shape=[1], dtype="float32", append_batch_size=False)
-            out4 = fluid.layers.resize_bilinear(input, scale=scale_tensor)
-            # out4.shape = [-1, 3, -1, -1]
-    """
-
-    return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape,
-                        align_corners, align_mode, data_format)
-
-
-@templatedoc(op_type="trilinear_interp")
-def resize_trilinear(input,
-                     out_shape=None,
-                     scale=None,
-                     name=None,
-                     actual_shape=None,
-                     align_corners=True,
-                     align_mode=1,
-                     data_format='NCDHW'):
-    """
-    Resize input by performing trilinear interpolation based on given
-    output shape which specified by actual_shape, out_shape and scale
-    in priority order.
-
-    **Warning:** the parameter :attr:`actual_shape` will be deprecated 
-    in the future and only use :attr:`out_shape` instead.
-
-    Trilinear interpolation is an extension of linear interpolation for 
-    interpolating functions of three variables (e.g. D-direction, 
-    H-direction and W-direction in this op) on a rectilinear 3D grid. 
-    The linear interpolation is performed on three directions.
-
-    For details of trilinear interpolation, please refer to Wikipedia:
-    https://en.wikipedia.org/wiki/Trilinear_interpolation
-
-    Align_corners and align_mode are optinal parameters,the calculation 
-    method of interpolation can be selected by them.
-
-    Example:
-
-    .. code-block:: text
-
-        For scale:
-          
-            if align_corners = True && out_size > 1 :
-
-              scale_factor = (in_size-1.0)/(out_size-1.0)
-            
-            else:
-              
-              scale_factor = float(in_size/out_size)     
-
-        Bilinear interpolation:
-
-          if:
-
-              align_corners = False , align_mode = 0
-              
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-              
-              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
-              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
-
-          else:
-
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-
-              D_out = D_{in} * scale_{factor}
-              H_out = H_{in} * scale_{factor}
-              W_out = W_{in} * scale_{factor}
-
-    Args:
-        input(${x_type}): 5-D Tensor, its data type is float32, float64, or uint8,
-                          its data format is specified by :attr:`data_format`.
-        out_shape(list|tuple|Variable|None): Output shape of resize bilinear
-            layer, the shape is (out_d, out_h, out_w). Default: None. If a list, 
-            each element can be  an integer or a Tensor Variable with shape: [1]. If 
-            a Tensor Variable, its dimension size should be 1.
-        scale(float|Variable|None): The multiplier for the input depth, height or width.
-             At least one of :attr:`out_shape` or :attr:`scale` must be set. 
-             And :attr:`out_shape` has a higher priority than :attr:`scale`. 
-             Default: None.
-        name(str|None): The output variable name.
-        actual_shape(Variable): An optional input to specify output shape
-                                dynamically. If provided, image resize
-                                according to this given shape rather than
-                                :attr:`out_shape` and :attr:`scale` specifying
-                                shape. That is to say actual_shape has the
-                                highest priority. It is recommended to use
-                                :attr:`out_shape` if you want to specify output 
-                                shape dynamically, because :attr:`actual_shape` 
-                                will be deprecated. When using actual_shape to 
-                                specify output shape, one of :attr:`out_shape` 
-                                and :attr:`scale` should also be set, otherwise 
-                                errors would be occured in graph constructing stage.
-                                Default: None
-        align_corners(bool): ${align_corners_comment}
-        align_mode(bool): ${align_mode_comment}
-        data_format(str, optional): NCDHW(num_batches, channels, depth, height, width) or 
-                                    NDHWC(num_batches, depth, height, width, channels).
-                                    Default: 'NCDHW'.
-
-    Returns:
-        A 5-D Tensor in shape of (num_batches, channels, out_d, out_h, out_w) or 
-        (num_batches, out_d, out_h, out_w, channels).
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[3, 6, 9, 11], dtype="float32")
-            # input.shape = [-1, 3, 6, 9, 11], where -1 indicates batch size, and it will get the exact value in runtime.
-
-            out0 = fluid.layers.resize_trilinear(input, out_shape=[12, 12, 12])
-            # out0.shape = [-1, 3, 12, 12, 12], it means out0.shape[0] = input.shape[0] in runtime.
-
-            # out_shape is a list in which each element is a integer or a tensor Variable
-            dim1 = fluid.layers.data(name="dim1", shape=[1], dtype="int32", append_batch_size=False)
-            out1 = fluid.layers.resize_trilinear(input, out_shape=[12, dim1, 4])
-            # out1.shape = [-1, 3, 12, -1, 4]
-
-            # out_shape is a 1-D tensor Variable
-            shape_tensor = fluid.layers.data(name="shape_tensor", shape=[3], dtype="int32", append_batch_size=False)
-            out2 = fluid.layers.resize_trilinear(input, out_shape=shape_tensor)
-            # out2.shape = [-1, 3, -1, -1, -1]
-
-            # when use actual_shape
-            actual_shape_tensor = fluid.layers.data(name="actual_shape_tensor", shape=[3], dtype="int32", append_batch_size=False)
-            out3 = fluid.layers.resize_trilinear(input, out_shape=[4, 4, 8], actual_shape=actual_shape_tensor)
-            # out3.shape = [-1, 3, 4, 4, 8]
-
-            # scale is a Variable
-            scale_tensor = fluid.layers.data(name="scale", shape=[1], dtype="float32", append_batch_size=False)
-            out4 = fluid.layers.resize_trilinear(input, scale=scale_tensor)
-            # out4.shape = [-1, 3, -1, -1, -1]
-    """
-
-    return image_resize(input, out_shape, scale, name, 'TRILINEAR',
-                        actual_shape, align_corners, align_mode, data_format)
-
-
-@templatedoc(op_type="nearest_interp")
-def resize_nearest(input,
-                   out_shape=None,
-                   scale=None,
-                   name=None,
-                   actual_shape=None,
-                   align_corners=True,
-                   data_format='NCHW'):
-    """
-    Resize input by performing nearest neighbor interpolation in both the
-    height direction and the width direction based on given output shape 
-    which is specified by actual_shape, out_shape and scale in priority order.
-
-    **Warning:** the parameter :attr:`actual_shape` will be deprecated in the 
-    future and only use :attr:`out_shape` instead.
-
-    Example:
-
-    .. code-block:: text
-
-        For scale:
-          
-            if align_corners = True && out_size > 1 :
-              scale_factor = (in_size-1.0)/(out_size-1.0)
-            
-            else:
-              
-              scale_factor = float(in_size/out_size)
-          
-        Nearest neighbor interpolation:
-          
-          if:
-              align_corners = False
-
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-
-              H_out = floor(H_{in} * scale_{factor})
-              W_out = floor(W_{in} * scale_{factor})
-
-          else:
-              align_corners = True
-
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-
-              H_out = round(H_{in} * scale_{factor})
-              W_out = round(W_{in} * scale_{factor})
-
-
-    For details of nearest neighbor interpolation, please refer to Wikipedia:
-    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
-
-    Args:
-        input(${x_type}): 4-D Tensor, its data type is float32, float64, or uint8,
-                          its data format is specified by :attr:`data_format`.
-        out_shape(list|tuple|Variable|None): Output shape of resize nearest
-            layer, the shape is (out_h, out_w). Default: None. If a list, each 
-            element can be integer or a tensor Variable with shape: [1]. If a 
-            tensor Variable, its dimension size should be 1.
-        scale(float|Variable|None): The multiplier for the input height or width. At
-             least one of :attr:`out_shape` or :attr:`scale` must be set. 
-             And :attr:`out_shape` has a higher priority than :attr:`scale`. 
-             Default: None.
-        name(str|None): The output variable name.
-        actual_shape(Variable): An optional input to specify output shape
-                                dynamically. If provided, image resize
-                                according to this given shape rather than
-                                :attr:`out_shape` and :attr:`scale` specifying
-                                shape. That is to say actual_shape has the
-                                highest priority. It is recommended to use
-                                :attr:`out_shape` if you want to specify output 
-                                shape dynamically, because :attr:`actual_shape` 
-                                will be deprecated. When using actual_shape to 
-                                specify output shape, one of :attr:`out_shape` 
-                                and :attr:`scale` should also be set, otherwise 
-                                errors would be occured in graph constructing stage.
-                                Default: None
-        align_corners(bool): ${align_corners_comment}
-        data_format(str, optional): NCHW(num_batches, channels, height, width) or 
-                                    NHWC(num_batches, height, width, channels).
-                                    Default: 'NCHW'.
-
-    Returns:
-        A 4-D Tensor in shape of (num_batches, channels, out_h, out_w) or 
-        (num_batches, out_h, out_w, channels).
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[3, 6, 9], dtype="float32")
-            # input.shape = [-1, 3, 6, 9], where -1 indicates batch size, and it will get the exact value in runtime.
-
-            out0 = fluid.layers.resize_nearest(input, out_shape=[12, 12])
-            # out0.shape = [-1, 3, 12, 12], it means out0.shape[0] = input.shape[0] in runtime.
-
-            # out_shape is a list in which each element is a integer or a tensor Variable
-            dim1 = fluid.layers.data(name="dim1", shape=[1], dtype="int32", append_batch_size=False)
-            out1 = fluid.layers.resize_nearest(input, out_shape=[12, dim1])
-            # out1.shape = [-1, 3, 12, -1]
-
-            # out_shape is a 1-D tensor Variable
-            shape_tensor = fluid.layers.data(name="resize_shape", shape=[2], dtype="int32", append_batch_size=False)
-            out2 = fluid.layers.resize_nearest(input, out_shape=shape_tensor)
-            # out2.shape = [-1, 3, -1, -1]
-
-            # when use actual_shape
-            actual_shape_tensor = fluid.layers.data(name="actual_shape_tensor", shape=[2], dtype="int32", append_batch_size=False)
-            out3 = fluid.layers.resize_nearest(input, out_shape=[4, 4], actual_shape=actual_shape_tensor)
-            # out3.shape = [-1, 3, 4, 4]
-
-            # scale is a Variable
-            scale_tensor = fluid.layers.data(name="scale", shape=[1], dtype="float32", append_batch_size=False)
-            out4 = fluid.layers.resize_nearest(input, scale=scale_tensor)
-            # out4.shape = [-1, 3, -1, -1]
-    """
-
-    return image_resize(
-        input,
-        out_shape,
-        scale,
-        name,
-        'NEAREST',
-        actual_shape,
-        align_corners,
-        align_mode=1,
-        data_format=data_format)
-
-
-def image_resize_short(input, out_short_len, resample='BILINEAR'):
-    """
-    Resize a batch of images. The short edge of input images will be
-    resized to the given 'out_short_len'. The long edge of input images
-    will be resized proportionately to make images' length-width ratio
-    constant.
-
-    Args:
-        input (Variable): The input tensor of image resize layer,
-                          This is a 4-D tensor of the shape
-                          (num_batches, channels, in_h, in_w).
-        out_short_len(int): The length of output images' short edge.
-        resample (str): resample method, default: BILINEAR.
-
-    Returns:
-        Variable: The output is a 4-D tensor of the shape
-        (num_batches, channls, out_h, out_w).
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[3,6,9], dtype="float32")
-            out = fluid.layers.image_resize_short(input, out_short_len=3)
-    """
-    in_shape = input.shape
-    if len(in_shape) != 4:
-        raise ValueError(
-            "The rank of input must be 4 (num_batches, channels, in_h, in_w).")
-    hw = in_shape[2:4]
-    short_idx = hw.index(min(hw))
-    long_idx = 1 - short_idx
-    out_shape = list(hw)
-    out_shape[short_idx] = out_short_len
-    out_shape[long_idx] = int(
-        float(out_shape[long_idx]) * (float(out_short_len) / float(hw[
-            short_idx])) + 0.5)
-    return image_resize(input=input, out_shape=out_shape, resample=resample)
-
-
-def gather(input, index, overwrite=True):
-    """
-    **Gather Layer**
-
-    Output is obtained by gathering entries of the outer-most dimension
-    of X indexed by `index` and concatenate them together.
-
-    .. math::
-
-        Out = X[Index]
-
-
-    .. code-block:: text
-
-
-                Given:
-
-                X = [[1, 2],
-                     [3, 4],
-                     [5, 6]]
-
-                Index = [1, 2]
-
-                Then:
-
-                Out = [[3, 4],
-                       [5, 6]]
-
-    Args:
-        input (Variable): The source input with rank>=1.
-        index (Variable): The index input with rank=1.
-        overwrite (bool): The mode that updating the grad when has same index.
-            If True, use the overwrite mode to update the grad of the same index,
-	    if False, use the accumulate mode to update the grad of the same index. 
-	    Default value is True.
-	    
-
-
-    Returns:
-        output (Variable): The output is a tensor with the same rank as input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[-1, 5], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[-1, 1], dtype='int32')
-            output = fluid.layers.gather(x, index)
-    """
-    helper = LayerHelper('gather', **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="gather",
-        inputs={"X": input,
-                "Index": index},
-        outputs={"Out": out},
-        attrs={'overwrite': overwrite})
-    return out
-
-
-def gather_nd(input, index, name=None):
-    """
-    **Gather Nd Layer**
-
-    This function is actually a high-dimensional extension of :code:`gather` 
-    and supports for simultaneous indexing by multiple axes. :attr:`index` is a 
-    K-dimensional integer tensor, which is regarded as a (K-1)-dimensional 
-    tensor of :attr:`index` into :attr:`input`, where each element defines 
-    a slice of params:
-
-    .. math::
-
-        output[(i_0, ..., i_{K-2})] = input[index[(i_0, ..., i_{K-2})]]
-
-    Obviously, :code:`index.shape[-1] <= input.rank` . And, the output tensor has
-    shape :code:`index.shape[:-1] + input.shape[index.shape[-1]:]` .
-
-    .. code-block:: text
-
-            Given:
-                input = [[[ 0,  1,  2,  3],
-                          [ 4,  5,  6,  7],
-                          [ 8,  9, 10, 11]],
-                         [[12, 13, 14, 15],
-                          [16, 17, 18, 19],
-                          [20, 21, 22, 23]]]
-                input.shape = (2, 3, 4)
-
-            * Case 1:
-                index = [[1]]
-                
-                gather_nd(input, index)  
-                         = [input[1, :, :]] 
-                         = [[12, 13, 14, 15],
-                            [16, 17, 18, 19],
-                            [20, 21, 22, 23]]
-
-            * Case 2:
-                index = [[0,2]]
-
-                gather_nd(input, index)
-                         = [input[0, 2, :]]
-                         = [8, 9, 10, 11]
-
-            * Case 3:
-                index = [[1, 2, 3]]
-
-                gather_nd(input, index)
-                         = [input[1, 2, 3]]
-                         = [23]
-
-    Args:
-        input (Variable): The source input
-        index (Variable): The index input with rank > 1, index.shape[-1] <= input.rank
-        name (str|None): A name for this layer(optional). If set None, the
-                         layer will be named automatically
-
-    Returns:
-        output (Variable): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:]
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[3, 4, 5], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[2, 2], dtype='int32')
-            output = fluid.layers.gather_nd(x, index)
-
-    """
-    helper = LayerHelper('gather_nd', **locals())
-    dtype = helper.input_dtype()
-    if name is None:
-        output = helper.create_variable_for_type_inference(dtype)
-    else:
-        output = helper.create_variable(
-            name=name, dtype=dtype, persistable=False)
-    helper.append_op(
-        type="gather_nd",
-        inputs={"X": input,
-                "Index": index},
-        outputs={"Out": output})
-    return output
-
-
-def scatter(input, index, updates, name=None, overwrite=True):
-    """
-    **Scatter Layer**
-
-    Output is obtained by updating the input on selected indices on the first
-    axis.
-
-    .. math::
-
-        Out = X
-        Out[Ids] = Updates
-
-    Args:
-        input (Variable): The source input with rank>=1.
-        index (Variable): The index input with rank=1. Its dtype should be
-                          int32 or int64 as it is used as indexes.
-        updates (Variable): The updated value of scatter op.
-        name (str|None): The output variable name. Default None.
-        overwrite (bool): The mode that updating the output when has same index.
-            If True, use the overwrite mode to update the output of the same index,
-	    if False, use the accumulate mode to update the output of the same index. 
-	    Default value is True.You can set overwrite=False to implement scatter_add.
-
-    Returns:
-        output (Variable): The output is a tensor with the same shape as input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            input = fluid.layers.data(name='data', shape=[3, 5, 9], dtype='float32', append_batch_size=False)
-            index = fluid.layers.data(name='index', shape=[3], dtype='int64', append_batch_size=False)
-            updates = fluid.layers.data(name='update', shape=[3, 5, 9], dtype='float32', append_batch_size=False)
-
-            output = fluid.layers.scatter(input, index, updates)
-    """
-    helper = LayerHelper('scatter', **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="scatter",
-        inputs={"X": input,
-                "Ids": index,
-                "Updates": updates},
-        attrs={'overwrite': overwrite},
-        outputs={"Out": out})
-    return out
-
-
-def scatter_nd_add(ref, index, updates, name=None):
-    """
-    **Scatter_nd_add Layer**
-
-    Output is obtained by applying sparse addition to a single value
-    or slice in a Variable. :attr:`ref` is a Tensor with rank :math:`R` 
-    and :attr:`index` is a Tensor with rank :math:`K` . Thus, :attr:`index` 
-    has shape :math:`[i_0, i_1, ..., i_{K-2}, Q]` where :math:`Q \leq R` . :attr:`updates` 
-    is a Tensor with rank :math:`K - 1 + R - Q` and its
-    shape is :math:`index.shape[:-1] + ref.shape[index.shape[-1]:]` .
-    According to the :math:`[i_0, i_1, ..., i_{K-2}]` of :attr:`index` ,
-    add the corresponding :attr:`updates` slice to the :attr:`ref` slice
-    which is obtained by the last one dimension of :attr:`index` .
-
-    .. code-block:: text
-        
-        Given:
-
-        * Case 1:
-            ref = [0, 1, 2, 3, 4, 5]
-            index = [[1], [2], [3], [1]]
-            updates = [9, 10, 11, 12]
-
-          we get:
-             
-            output = [0, 22, 12, 14, 4, 5]
-
-        * Case 2:
-            ref = [[65, 17], [-14, -25]]
-            index = [[], []]
-            updates = [[[-1, -2], [1, 2]],
-                       [[3, 4], [-3, -4]]]
-            ref.shape = (2, 2)
-            index.shape = (2, 0)
-            updates.shape = (2, 2, 2)
-
-          we get:
-             
-            output = [[67, 19], [-16, -27]]
-
-    Args:
-        ref (Variable): The ref input.
-        index (Variable): The index input with rank > 1 and index.shape[-1] <= ref.rank.
-                          Its dtype should be int32 or int64 as it is used as indexes.
-        updates (Variable): The updated value of scatter_nd_add op, and it must have the same type
-                            as ref. It must have the shape index.shape[:-1] + ref.shape[index.shape[-1]:]
-        name (str|None): The output variable name. Default None.
-
-    Returns:
-        output (Variable): The output is a tensor with the same shape and type as ref.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            ref = fluid.layers.data(name='ref', shape=[3, 5, 9, 10], dtype='float32', append_batch_size=False)
-            index = fluid.layers.data(name='index', shape=[3, 2], dtype='int32', append_batch_size=False)
-            updates = fluid.layers.data(name='update', shape=[3, 9, 10], dtype='float32', append_batch_size=False)
-
-            output = fluid.layers.scatter_nd_add(ref, index, updates)
-    """
-    if ref.dtype != updates.dtype:
-        raise ValueError("ref and updates must have same data type.")
-
-    helper = LayerHelper('scatter_nd_add', **locals())
-    dtype = helper.input_dtype()
-    if name is None:
-        output = helper.create_variable_for_type_inference(dtype)
-    else:
-        output = helper.create_variable(
-            name=name, dtype=dtype, persistable=False)
-    helper.append_op(
-        type="scatter_nd_add",
-        inputs={"X": ref,
-                "Index": index,
-                "Updates": updates},
-        outputs={"Out": output})
-    return output
-
-
-def scatter_nd(index, updates, shape, name=None):
-    """
-    **Scatter_nd Layer**
-
-    Output is obtained by scattering the :attr:`updates` in a new tensor according 
-    to :attr:`index` . This op is similar to :code:`scatter_nd_add`, except the 
-    tensor of :attr:`shape` is zero-initialized. Correspondingly, :code:`scatter_nd(index, updates, shape)` 
-    is equal to :code:`scatter_nd_add(fluid.layers.zeros(shape, updates.dtype), index, updates)` . 
-    If :attr:`index` has repeated elements, then the corresponding updates are accumulated. 
-    Because of the numerical approximation issues, the different order of repeated elements 
-    in :attr:`index` may cause different results. The specific calculation method can be 
-    seen :code:`scatter_nd_add` . This op is the inverse of the :code:`gather_nd` op.
-
-    Args:
-        index (Variable): The index input with rank > 1 and index.shape[-1] <= len(shape).
-                          Its dtype should be int32 or int64 as it is used as indexes.
-        updates (Variable): The updated value of scatter_nd op. 
-                            It must have the shape index.shape[:-1] + shape[index.shape[-1]:]
-        shape(tuple|list): Shape of output tensor.
-        name (str|None): The output variable name. Default None.
-
-    Returns:
-        output (Variable): The output is a tensor with the same type as :attr:`updates` .
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            index = fluid.layers.data(name='index', shape=[3, 2], dtype='int64', append_batch_size=False)
-            updates = fluid.layers.data(name='update', shape=[3, 9, 10], dtype='float32', append_batch_size=False)
-            shape = [3, 5, 9, 10]
-
-            output = fluid.layers.scatter_nd(index, updates, shape)
-    """
-    return scatter_nd_add(zeros(shape, updates.dtype), index, updates, name)
-
-
-def sequence_scatter(input, index, updates, name=None):
-    """
-    **Sequence Scatter Layer**
-
-    This operator scatters the Updates tensor to the input X. It uses the LoD
-    information of Ids to select the rows to update, and use the values in Ids as
-    the columns to update in each row of X.
-
-    Here is an example:
-
-    Given the following input:
-
-    .. code-block:: text
-
-        input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
-                      [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
-                      [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
-        input.dims = [3, 6]
-
-        index.data = [[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], [5], [4]]
-        index.lod =  [[0,        3,                       8,                 12]]
-
-        updates.data = [[0.3], [0.3], [0.4], [0.1], [0.2], [0.3], [0.4], [0.0], [0.2], [0.3], [0.1], [0.4]]
-        updates.lod =  [[  0,            3,                                 8,                         12]]
-
-    Then we have the output:
-
-    .. code-block:: text
-
-        out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0],
-                    [1.0, 1.0, 1.4, 1.3, 1.2, 1.1],
-                    [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]]
-        out.dims = X.dims = [3, 6]
-
-    Args:
-        input (Variable): The source input with rank>=1.
-        index (Variable): A LoD Tensor. The index input of sequence scatter op
-            where input will be  updated. The index input with rank=1. Its dtype
-            should be int32 or int64 as it is used as indexes.
-        updates (Variable): A LoD Tensor. The values to scatter to the input
-            tensor X, must be a LoDTensor with the same LoD information as index.
-        name (str|None): The output variable name. Default None.
-
-    Returns:
-        Variable: The output is a tensor with the same shape as input.
-
-    Examples:
-
-        .. code-block:: python
-	
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-
-            input = layers.data( name="x", shape=[3, 6], append_batch_size=False, dtype='float32' )
-            index = layers.data( name='index', shape=[1], dtype='int32')
-            updates = layers.data( name='updates', shape=[1], dtype='float32')
-            output = fluid.layers.sequence_scatter(input, index, updates)
-
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_scatter', **locals())
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="sequence_scatter",
-        inputs={"X": input,
-                "Ids": index,
-                "Updates": updates},
-        outputs={"Out": out})
-    return out
-
-
-@templatedoc()
-def random_crop(x, shape, seed=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        shape(${shape_type}): ${shape_comment}
-        seed(int|${seed_type}|None): ${seed_comment} By default, the seed will
-            get from `random.randint(-65536, 65535)`.
-
-    Returns:
-        ${out_comment}
-
-    Examples:
-        >>> import paddle.fluid as fluid
-        >>> img = fluid.layers.data("img", [3, 256, 256])
-        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
-    """
-    helper = LayerHelper("random_crop", **locals())
-    dtype = x.dtype
-    out = helper.create_variable_for_type_inference(dtype)
-    if seed is None:
-        seed = np.random.randint(-65536, 65536)
-    op_attrs = {"shape": shape}
-    if isinstance(seed, int):
-        op_attrs["startup_seed"] = seed
-        seed = helper.create_variable(
-            name=unique_name.generate("random_crop_seed"),
-            dtype="int64",
-            persistable=True)
-    elif not isinstance(seed, Variable):
-        raise ValueError("'seed' must be a Variable or an int.")
-    helper.append_op(
-        type="random_crop",
-        inputs={"X": x,
-                "Seed": seed},
-        outputs={"Out": out,
-                 "SeedOut": seed},
-        attrs=op_attrs)
-    return out
-
-
-def log(x, name=None):
-    """
-    Calculates the natural log of the given input tensor, element-wise.
-
-    .. math::
-
-        Out = \\ln(x)
-
-    Args:
-        x (Variable): Input tensor.
-        name (str|None, default None): A name for this layer If set None,
-            the layer will be named automatically.
-
-    Returns:
-        Variable: The natural log of the input tensor computed element-wise.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3, 4], dtype="float32")
-            output = fluid.layers.log(x)
-    """
-    helper = LayerHelper('log', **locals())
-    dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out})
-    return out
-
-
-def relu(x, name=None):
-    """
-    Relu takes one input data (Tensor) and produces one output data (Tensor)
-    where the rectified linear function, y = max(0, x), is applied to
-    the tensor elementwise.
-
-    .. math::
-
-        Out = \\max(0, x)
-
-    Args:
-        x (Variable): The input tensor.
-        name (str|None, default None): A name for this layer If set None,
-            the layer will be named automatically.
-
-    Returns:
-        Variable: The output tensor with the same shape as input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3, 4], dtype="float32")
-            output = fluid.layers.relu(x)
-    """
-    helper = LayerHelper('relu', **locals())
-    dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out})
-    return out
-
-
-@templatedoc()
-def selu(x, scale=None, alpha=None, name=None):
-    """
-    ${comment}
-
-    Args:
-        x (Variable): The input tensor.
-        scale(float, None): If the scale is not set,
-            the default value is 1.0507009873554804934193349852946.
-            For more information about this value, please refer
-            to: https://arxiv.org/abs/1706.02515.
-        alpha(float, None): If the alpha is not set,
-            the default value is 1.6732632423543772848170429916717.
-            For more information about this value, please refer
-            to: https://arxiv.org/abs/1706.02515.
-        name (str|None, default None): A name for this layer If set None,
-            the layer will be named automatically.
-
-    Returns:
-        Variable: The output tensor with the same shape as input.
-
-    Examples:
-
-        .. code-block:: python
-             
-            import paddle.fluid as fluid
-          
-            input = fluid.layers.data(
-                 name="input", shape=[3, 9, 5], dtype="float32")
-            output = fluid.layers.selu(input)
-    """
-    helper = LayerHelper('selu', **locals())
-    dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_variable_for_type_inference(dtype)
-    attrs = {}
-    if scale is not None:
-        attrs["scale"] = scale
-    if alpha is not None:
-        attrs["alpha"] = alpha
-
-    helper.append_op(
-        type="selu", inputs={"X": x}, outputs={"Out": out}, attrs=attrs)
-    return out
-
-
-def mean_iou(input, label, num_classes):
-    """
-    Mean Intersection-Over-Union is a common evaluation metric for
-    semantic image segmentation, which first computes the IOU for each
-    semantic class and then computes the average over classes.
-    IOU is defined as follows:
-
-    .. math::
-
-        IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}.
-
-    The predictions are accumulated in a confusion matrix and mean-IOU
-    is then calculated from it.
-
-
-    Args:
-        input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64.
-        label (Variable): A Tensor of ground truth labels with type int32 or int64.
-                           Its shape should be the same as input.
-        num_classes (int): The possible number of labels.
-
-    Returns:
-        mean_iou (Variable),out_wrong(Variable),out_correct(Variable):
-
-                     Three variables:
-
-                     - mean_iou : A Tensor representing the mean intersection-over-union with shape [1].
-                     - out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class.
-                     - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            iou_shape = [32, 32]
-            num_classes = 5
-            predict = fluid.layers.data(name='predict', shape=iou_shape)
-            label = fluid.layers.data(name='label', shape=iou_shape)
-            iou, wrongs, corrects = fluid.layers.mean_iou(predict, label,
-                                                          num_classes)
-    """
-    helper = LayerHelper('mean_iou', **locals())
-    dtype = helper.input_dtype()
-    out_mean_iou = helper.create_variable_for_type_inference(dtype='float32')
-    out_wrong = helper.create_variable_for_type_inference(dtype='int32')
-    out_correct = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type="mean_iou",
-        inputs={"Predictions": input,
-                "Labels": label},
-        outputs={
-            "OutMeanIou": out_mean_iou,
-            "OutWrong": out_wrong,
-            "OutCorrect": out_correct
-        },
-        attrs={"num_classes": num_classes})
-    return out_mean_iou, out_wrong, out_correct
-
-
-def crop(x, shape=None, offsets=None, name=None):
-    """
-    Crop input into output, as specified by offsets and shape.
-
-    **Warning:** THIS FUNCTION IS DEPRECATED. It will be removed in a future version.
-    Instructions for updating: Use `fluid.layers.crop_tensor
-    <https://www.paddlepaddle.org.cn/documentation/docs/en/api/layers/nn.html#crop_tensor>`_
-    instead.
-
-    .. code-block:: text
-
-        * Case 1:
-            Given
-                X = [[0, 1, 2, 0, 0]
-                     [0, 3, 4, 0, 0]
-                     [0, 0, 0, 0, 0]],
-            and
-                shape = [2, 2],
-                offsets = [0, 1],
-            output is:
-                Out = [[1, 2],
-                       [3, 4]].
-        * Case 2:
-            Given
-                X = [[0, 1, 2, 5, 0]
-                     [0, 3, 4, 6, 0]
-                     [0, 0, 0, 0, 0]],
-            and shape is tensor
-                shape = [[0, 0, 0]
-                         [0, 0, 0]]
-            and
-                offsets = [0, 1],
-
-            output is:
-                Out = [[1, 2, 5],
-                       [3, 4, 6]].
-
-    Args:
-        x (Variable): The input tensor variable.
-        shape (Variable|list/tuple of integer): The output shape is specified
-            by `shape`, which can be a Variable or a list/tuple of integer.
-            If a tensor Variable, it's rank must be the same as `x`. This way
-            is suitable for the case that the output shape may be changed each
-            iteration. If a list/tuple of integer, it's length must be the same
-            as the rank of `x`
-        offsets (Variable|list/tuple of integer|None): Specifies the cropping
-            offsets at each dimension. It can be a Variable or a list/tuple
-            of integers. If a tensor Variable, it's rank must be the same as `x`.
-            This way is suitable for the case that the offsets may be changed
-            each iteration. If a list/tuple of integer, it's length must be the
-            same as the rank of `x`. If None, the offsets are 0 at each
-            dimension.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The cropped tensor variable.
-
-    Raises:
-        ValueError: If shape is not a list, tuple or Variable.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32")
-            y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32")
-            crop = fluid.layers.crop(x, shape=y)
-
-            # or
-            z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
-            crop = fluid.layers.crop(z, shape=[-1, 2, 3])
-
-    """
-    helper = LayerHelper('crop', **locals())
-
-    if not (isinstance(shape, list) or isinstance(shape, tuple) or \
-            isinstance(shape, Variable)):
-        raise ValueError("The shape should be a list, tuple or Variable.")
-
-    if offsets is None:
-        offsets = [0] * len(x.shape)
-
-    out = helper.create_variable_for_type_inference(x.dtype)
-    ipts = {'X': x}
-    attrs = {}
-    if isinstance(shape, Variable):
-        ipts['Y'] = shape
-    else:
-        attrs['shape'] = shape
-    if isinstance(offsets, Variable):
-        ipts['Offsets'] = offsets
-    else:
-        attrs['offsets'] = offsets
-
-    helper.append_op(
-        type='crop',
-        inputs=ipts,
-        outputs={'Out': out},
-        attrs=None if len(attrs) == 0 else attrs)
-    return out
-
-
-def crop_tensor(x, shape=None, offsets=None, name=None):
-    """
-    Crop input into output, as specified by offsets and shape.
-
-    .. code-block:: text
-
-        * Case 1:
-            Given
-                X = [[0, 1, 2, 0, 0]
-                     [0, 3, 4, 0, 0]
-                     [0, 0, 0, 0, 0]],
-            and
-                shape = [2, 2],
-                offsets = [0, 1],
-            output is:
-                Out = [[1, 2],
-                       [3, 4]].
-        * Case 2:
-            Given
-                X =  [[[0, 1, 2, 3]
-                       [0, 5, 6, 7]
-                       [0, 0, 0, 0]],
-
-                      [[0, 3, 4, 5]
-                       [0, 6, 7, 8]
-                       [0, 0, 0, 0]]].
-            and
-                shape = [2, 2, 3],
-                offsets = [0, 0, 1],
-            output is:
-                Out = [[[1, 2, 3]
-                        [5, 6, 7]],
-
-                        [[3, 4, 5]
-                         [6, 7, 8]]].
-
-    Args:
-        x (Variable): The input tensor variable.
-        shape (Variable|list|tuple of integer): The output shape is specified
-            by `shape`. It can be a 1-D tensor Variable or a list/tuple. If a 
-            1-D tensor Variable, it's rank must be the same as `x`. If a 
-            list/tuple, it's length must be the same as the rank of `x`. Each 
-            element of list can be an integer or a tensor Variable of shape: [1].
-            If Variable contained, it is suitable for the case that the shape may 
-            be changed each iteration. Only the first element of list/tuple can be 
-            set to -1, it means that the first dimension of the output is the same 
-            as the input.
-        offsets (Variable|list|tuple of integer|None): Specifies the cropping
-            offsets at each dimension. It can be a 1-D tensor Variable or a list/tuple.
-            If a 1-D tensor Variable, it's rank must be the same as `x`. If a list/tuple, 
-            it's length must be the same as the rank of `x`. Each element of list can be
-            an integer or a tensor Variable of shape: [1]. If Variable contained, it is 
-            suitable for the case that the offsets may be changed each iteration. If None, 
-            the offsets are 0 at each dimension.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The cropped tensor variable.
-
-    Raises:
-        ValueError: If shape is not a list, tuple or Variable.
-        ValueError: If offsets is not None and not a list, tuple or Variable.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32")
-            # x.shape = [-1, 3, 5], where -1 indicates batch size, and it will get the exact value in runtime.
-
-            # shape is a 1-D tensor variable
-            crop_shape = fluid.layers.data(name="crop_shape", shape=[3], dtype="int32", append_batch_size=False)
-            crop0 = fluid.layers.crop_tensor(x, shape=crop_shape)
-            # crop0.shape = [-1, -1, -1], it means crop0.shape[0] = x.shape[0] in runtime.
-
-            # or shape is a list in which each element is a constant
-            crop1 = fluid.layers.crop_tensor(x, shape=[-1, 2, 3])
-            # crop1.shape = [-1, 2, 3]
-
-            # or shape is a list in which each element is a constant or variable
-            y = fluid.layers.data(name="y", shape=[3, 8, 8], dtype="float32")
-            dim1 = fluid.layers.data(name="dim1", shape=[1], dtype="int32", append_batch_size=False)
-            crop2 = fluid.layers.crop_tensor(y, shape=[-1, 3, dim1, 4])
-            # crop2.shape = [-1, 3, -1, 4]
-
-            # offsets is a 1-D tensor variable
-            crop_offsets = fluid.layers.data(name="crop_offsets", shape=[3], dtype="int32", append_batch_size=False)
-            crop3 = fluid.layers.crop_tensor(x, shape=[-1, 2, 3], offsets=crop_offsets)
-            # crop3.shape = [-1, 2, 3]
-
-            # offsets is a list in which each element is a constant or variable
-            offsets_var =  fluid.layers.data(name="dim1", shape=[1], dtype="int32", append_batch_size=False)
-            crop4 = fluid.layers.crop_tensor(x, shape=[-1, 2, 3], offsets=[0, 1, offsets_var])
-            # crop4.shape = [-1, 2, 3]
-
-    """
-    helper = LayerHelper('crop_tensor', **locals())
-
-    if not (isinstance(shape, list) or isinstance(shape, tuple) or \
-            isinstance(shape, Variable)):
-        raise ValueError("The shape should be a list, tuple or Variable.")
-
-    if offsets is None:
-        offsets = [0] * len(x.shape)
-
-    if not (isinstance(offsets, list) or isinstance(offsets, tuple) or \
-            isinstance(offsets, Variable)):
-        raise ValueError("The offsets should be a list, tuple or Variable.")
-
-    out = helper.create_variable_for_type_inference(x.dtype)
-    ipts = {'X': x}
-    attrs = {}
-
-    def contain_var(input_list):
-        for ele in input_list:
-            if isinstance(ele, Variable):
-                return True
-        return False
-
-    if isinstance(offsets, Variable):
-        offsets.stop_gradient = True
-        ipts['Offsets'] = offsets
-    elif contain_var(offsets):
-        new_offsets_tensor = []
-        for dim in offsets:
-            if isinstance(dim, Variable):
-                dim.stop_gradient = True
-                new_offsets_tensor.append(dim)
-            else:
-                assert (isinstance(dim, int))
-                assert dim >= 0, ("offsets should be greater or equal to zero.")
-                temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out)
-                new_offsets_tensor.append(temp_out)
-        ipts['OffsetsTensor'] = new_offsets_tensor
-    else:
-        attrs['offsets'] = offsets
-
-    unk_dim_idx = -1
-    if isinstance(shape, Variable):
-        shape.stop_gradient = True
-        ipts['Shape'] = shape
-    elif contain_var(shape):
-        new_shape_tensor = []
-        shape_attr = []
-        for dim_idx, dim_size in enumerate(shape):
-            if isinstance(dim_size, Variable):
-                dim_size.stop_gradient = True
-                new_shape_tensor.append(dim_size)
-                shape_attr.append(-1)
-            else:
-                assert (isinstance(dim_size, int))
-                if dim_size == -1:
-                    assert unk_dim_idx == -1, (
-                        "Only one element in shape can be unknown.")
-                    assert dim_idx == 0, (
-                        "Only the first element in shape can be -1.")
-                    unk_dim_idx = dim_idx
-                else:
-                    assert dim_size > 0, (
-                        "Each dimension size given in shape must be greater than zero."
-                    )
-                temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant(
-                    [1], 'int32', dim_size, force_cpu=True, out=temp_out)
-                new_shape_tensor.append(temp_out)
-                shape_attr.append(dim_size)
-        ipts['ShapeTensor'] = new_shape_tensor
-        attrs['shape'] = shape_attr
-    else:
-        attrs['shape'] = shape
-
-    helper.append_op(
-        type='crop_tensor',
-        inputs=ipts,
-        outputs={'Out': out},
-        attrs=None if len(attrs) == 0 else attrs)
-    return out
-
-
-def affine_grid(theta, out_shape, name=None):
-    """
-    It generates a grid of (x,y) coordinates using the parameters of
-    the affine transformation that correspond to a set of points where
-    the input feature map should be sampled to produce the transformed
-    output feature map.
-
-    .. code-block:: text
-
-        * Case 1:
-
-          Given:
-
-              theta = [[[x_11, x_12, x_13]
-                        [x_14, x_15, x_16]]
-                       [[x_21, x_22, x_23]
-                        [x_24, x_25, x_26]]]
-
-              out_shape = [2, 3, 5, 5]
-
-          Step 1:
-
-              Generate normalized coordinates according to out_shape.
-              The values of the normalized coordinates are in the interval between -1 and 1.
-              The shape of the normalized coordinates is [2, H, W] as below:
-
-              C = [[[-1.  -1.  -1.  -1.  -1. ]
-                    [-0.5 -0.5 -0.5 -0.5 -0.5]
-                    [ 0.   0.   0.   0.   0. ]
-                    [ 0.5  0.5  0.5  0.5  0.5]
-                    [ 1.   1.   1.   1.   1. ]]
-                   [[-1.  -0.5  0.   0.5  1. ]
-                    [-1.  -0.5  0.   0.5  1. ]
-                    [-1.  -0.5  0.   0.5  1. ]
-                    [-1.  -0.5  0.   0.5  1. ]
-                    [-1.  -0.5  0.   0.5  1. ]]]
-              C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
-
-          Step2:
-
-              Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
-              C_ = [[-1.  -1.   1. ]
-                    [-0.5 -1.   1. ]
-                    [ 0.  -1.   1. ]
-                    [ 0.5 -1.   1. ]
-                    [ 1.  -1.   1. ]
-                    [-1.  -0.5  1. ]
-                    [-0.5 -0.5  1. ]
-                    [ 0.  -0.5  1. ]
-                    [ 0.5 -0.5  1. ]
-                    [ 1.  -0.5  1. ]
-                    [-1.   0.   1. ]
-                    [-0.5  0.   1. ]
-                    [ 0.   0.   1. ]
-                    [ 0.5  0.   1. ]
-                    [ 1.   0.   1. ]
-                    [-1.   0.5  1. ]
-                    [-0.5  0.5  1. ]
-                    [ 0.   0.5  1. ]
-                    [ 0.5  0.5  1. ]
-                    [ 1.   0.5  1. ]
-                    [-1.   1.   1. ]
-                    [-0.5  1.   1. ]
-                    [ 0.   1.   1. ]
-                    [ 0.5  1.   1. ]
-                    [ 1.   1.   1. ]]
-          Step3:
-              Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
-
-    Args:
-        theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
-        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
-                                             ``out_shape`` can be a Variable or a list or tuple.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The output with shape [N, H, W, 2].
-
-    Raises:
-        ValueError: If the type of arguments is not supported.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
-            out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
-            data = fluid.layers.affine_grid(theta, out_shape)
-
-            # or
-            data = fluid.layers.affine_grid(theta, [5, 3, 28, 28])
-
-    """
-    helper = LayerHelper('affine_grid')
-
-    if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \
-            isinstance(out_shape, Variable)):
-        raise ValueError("The out_shape should be a list, tuple or Variable.")
-
-    if not isinstance(theta, Variable):
-        raise ValueError("The theta should be a Variable.")
-
-    out = helper.create_variable_for_type_inference(theta.dtype)
-    ipts = {'Theta': theta}
-    attrs = {}
-    if isinstance(out_shape, Variable):
-        ipts['OutputShape'] = out_shape
-    else:
-        attrs['output_shape'] = out_shape
-
-    helper.append_op(
-        type='affine_grid',
-        inputs=ipts,
-        outputs={'Output': out},
-        attrs=None if len(attrs) == 0 else attrs)
-    return out
-
-
-def rank_loss(label, left, right, name=None):
-    """
-
-    **Rank loss layer for RankNet**
-
-    `RankNet <http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf>`_
-    is a pairwise ranking model with a training sample consisting of a pair
-    of documents, A and B. Label P indicates whether A is ranked higher than B
-    or not:
-
-    P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information
-    about the rank of the input pair.
-
-    Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and
-    label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores
-    for documents A and B and the value of label P. The following equation
-    computes rank loss C_{i,j} from the inputs:
-
-    .. math::
-
-      C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\
-
-      o_{i,j} &=  o_i - o_j  \\\\
-
-      \\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \}
-
-
-    Rank loss layer takes batch inputs with size batch_size (batch_size >= 1).
-
-    Args:
-        label (Variable): Indicats whether A ranked higher than B or not.
-        left (Variable): RankNet's output score for doc A.
-        right (Variable): RankNet's output score for doc B.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        list: The value of rank loss.
-
-    Raises:
-        ValueError: Any of label, left, and right is not a variable.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            label = fluid.layers.data(name="label", shape=[-1, 1], dtype="float32")
-            left = fluid.layers.data(name="left", shape=[-1, 1], dtype="float32")
-            right = fluid.layers.data(name="right", shape=[-1, 1], dtype="float32")
-            out = fluid.layers.rank_loss(label, left, right)
-
-    """
-    helper = LayerHelper('rank_loss', **locals())
-
-    if not (isinstance(label, Variable)):
-        raise ValueError("The label should be a Variable")
-
-    if not (isinstance(left, Variable)):
-        raise ValueError("The left should be a Variable")
-
-    if not (isinstance(right, Variable)):
-        raise ValueError("The right should be a Variable")
-
-    out = helper.create_variable_for_type_inference("float32")
-
-    helper.append_op(
-        type='rank_loss',
-        inputs={"Label": label,
-                "Left": left,
-                "Right": right},
-        outputs={'Out': out})
-    return out
-
-
-def margin_rank_loss(label, left, right, margin=0.1, name=None):
-    """
-    Margin Ranking Loss Layer for ranking problem,
-    which compares left score and right score passed in.
-    The ranking loss can be defined as following equation:
-
-    .. math::
-
-        rank\_loss = max(0, -label * (left - right) + margin)
-
-    Args:
-       label (Variable): Indicates whether the left is ranked higher than the right or not.
-       left (Variable): Ranking score for left.
-       right (Variable): Ranking score for right.
-       margin (float): Indicates the given margin.
-       name (str|None): A name for this layer (optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-       Variable: The ranking loss.
-
-    Raises:
-       ValueError: Any of label, left, and right is not a Variable.
-
-    Examples:
-
-        .. code-block:: python
-
-           import paddle.fluid as fluid
-           label = fluid.layers.data(name="label", shape=[-1, 1], dtype="float32")
-           left = fluid.layers.data(name="left", shape=[-1, 1], dtype="float32")
-           right = fluid.layers.data(name="right", shape=[-1, 1], dtype="float32")
-           out = fluid.layers.margin_rank_loss(label, left, right)
-    """
-    helper = LayerHelper('margin_rank_loss', **locals())
-    if not isinstance(label, Variable):
-        raise ValueError("The label should be a Variable.")
-    if not isinstance(left, Variable):
-        raise ValueError("The left should be a Variable.")
-    if not isinstance(right, Variable):
-        raise ValueError("The right should be a Variable.")
-    out = helper.create_variable_for_type_inference(left.dtype)
-    act = helper.create_variable_for_type_inference(left.dtype)
-    helper.append_op(
-        type='margin_rank_loss',
-        inputs={"Label": label,
-                "X1": left,
-                "X2": right},
-        outputs={'Out': out,
-                 'Activated': act},
-        attrs={'margin': margin})
-    return out
-
-
-def pad2d(input,
-          paddings=[0, 0, 0, 0],
-          mode='constant',
-          pad_value=0.0,
-          data_format="NCHW",
-          name=None):
-    """
-    Pad 2-d images accordding to 'paddings' and 'mode'.
-    If mode is 'reflect', paddings[0] and paddings[1] must be no greater
-    than height-1. And the width dimension has the same condition.
-
-    Example:
-        .. code-block:: text
-
-	      Given that X is a channel of image from input:
-
-	      X = [[1, 2, 3],
-		   [4, 5, 6]]
-
-	      Case 0:
-
-		paddings = [0, 1, 2, 3],
-		mode = 'constant'
-		pad_value = 0
-
-		Out = [[0, 0, 1, 2, 3, 0, 0, 0]
-		       [0, 0, 4, 5, 6, 0, 0, 0]
-		       [0, 0, 0, 0, 0, 0, 0, 0]]
-
-	      Case 1:
-
-		paddings = [0, 1, 2, 1],
-		mode = 'reflect'
-
-		Out = [[3, 2, 1, 2, 3, 2]
-		       [6, 5, 4, 5, 6, 5]
-		       [3, 2, 1, 2, 3, 2]]
-
-	      Case 2:
-
-		paddings = [0, 1, 2, 1],
-		mode = 'edge'
-
-		Out = [[1, 1, 1, 2, 3, 3]
-		       [4, 4, 4, 5, 6, 6]
-		       [4, 4, 4, 5, 6, 6]]
-
-
-    Args:
-        input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
-        paddings (tuple|list|Variable): The padding size. If padding is a tuple, it must
-            contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
-            Default: padding = [0, 0, 0, 0].
-        mode (str): Three modes: constant(default), reflect, edge. Default: constant
-        pad_value (float32): The value to fill the padded areas in constant mode. Default: 0
-        data_format (str): An optional string from: "NHWC", "NCHW". Specify the data format of
-                           the input data.
-                           Default: "NCHW"
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-
-    Returns:
-        Variable: The tensor variable padded accordding to paddings and mode.
-
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='data', shape=[3, 32, 32],
-                                   dtype='float32')
-          result = fluid.layers.pad2d(input=data, paddings=[1, 2, 3, 4],
-                                      mode='reflect')
-    """
-
-    helper = LayerHelper('pad2d', **locals())
-
-    assert mode in ['reflect', 'edge', 'constant'
-                    ], "mode should be one of constant, reflect, edge."
-
-    dtype = helper.input_dtype(input_param_name='input')
-    out = helper.create_variable_for_type_inference(dtype)
-    inputs = {'X': input}
-    attrs = {'mode': mode, 'pad_value': pad_value, 'data_format': data_format}
-
-    if isinstance(paddings, Variable):
-        inputs['Paddings'] = paddings
-        attrs['paddings'] = []
-    else:
-        attrs['paddings'] = paddings
-
-    helper.append_op(
-        type='pad2d', inputs=inputs, outputs={"Out": out}, attrs=attrs)
-
-    return out
-
-
-@templatedoc()
-def elu(x, alpha=1.0, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        alpha(${alpha_type}|1.0): ${alpha_comment}
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        output(${out_type}): ${out_comment}
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
-            y = fluid.layers.elu(x, alpha=0.2)
-    """
-    helper = LayerHelper('elu', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='elu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'alpha': alpha})
-    return out
-
-
-@templatedoc()
-def relu6(x, threshold=6.0, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        threshold(${threshold_type}|6.0): ${threshold_comment}
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        output(${out_type}): ${out_comment}
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
-            y = fluid.layers.relu6(x, threshold=6.0)
-    """
-    helper = LayerHelper('relu6', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='relu6',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold})
-    return out
-
-
-@templatedoc()
-def pow(x, factor=1.0, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        factor(float|Variable|1.0): The exponential factor of Pow.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        output(${out_type}): ${out_comment}
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
-
-            # example 1: argument factor is float
-            y_1 = fluid.layers.pow(x, factor=2.0)
-
-            # example 2: argument factor is Variable
-            factor_tensor = fluid.layers.fill_constant([1], "float32", 3.0)
-            y_2 = fluid.layers.pow(x, factor=factor_tensor)
-    """
-    helper = LayerHelper('pow', **locals())
-    inputs = {'X': x}
-    attrs = {}
-    if isinstance(factor, Variable):
-        factor.stop_gradient = True
-        inputs['FactorTensor'] = factor
-    else:
-        attrs['factor'] = factor
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-    return out
-
-
-@templatedoc()
-def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        scale_a(${scale_a_type}|2.0 / 3.0): ${scale_a_comment}
-        scale_b(${scale_b_type}|1.7159): ${scale_b_comment}
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        output(${out_type}): ${out_comment}
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
-            y = fluid.layers.stanh(x, scale_a=0.67, scale_b=1.72)
-    """
-    helper = LayerHelper('stanh', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='stanh',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'scale_a': scale_a,
-               'scale_b': scale_b})
-    return out
-
-
-@templatedoc()
-def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        slope(${slope_type}|0.2): ${slope_comment}
-        offset(${offset_type}|0.5): ${offset_comment}
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        output(${out_type}): ${out_comment}
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
-            y = fluid.layers.hard_sigmoid(x, slope=0.3, offset=0.8)
-    """
-    helper = LayerHelper('hard_sigmoid', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='hard_sigmoid',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'slope': slope,
-               'offset': offset})
-    return out
-
-
-@templatedoc()
-def swish(x, beta=1.0, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        beta(${beta_type}|1.0): ${beta_comment}
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        output(${out_type}): ${out_comment}
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
-            y = fluid.layers.swish(x, beta=2.0)
-    """
-    helper = LayerHelper('swish', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='swish',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'slope': beta})
-    return out
-
-
-def prelu(x, mode, param_attr=None, name=None):
-    """
-    Equation:
-
-    .. math::
-        y = \max(0, x) + \\alpha * \min(0, x)
-
-    There are three modes for the activation:
-
-    .. code-block:: text
-
-        all: All elements share same alpha.
-        channel: Elements in same channel share same alpha.
-        element: All elements do not share alpha. Each element has its own alpha.
-
-    Args:
-        x (Variable): The input tensor.
-        mode (string): The mode for weight sharing. 
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-          weight (alpha), it can be create by ParamAttr.
-        name(str|None): A name for this layer(optional). If set None, the layer
-          will be named automatically.
-
-    Returns:
-        Variable: The output tensor with the same shape as input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            from paddle.fluid.param_attr import ParamAttr
-            x = fluid.layers.data(name="x", shape=[5,10,10], dtype="float32")
-            mode = 'channel'
-            output = fluid.layers.prelu(
-                     x,mode,param_attr=ParamAttr(name='alpha'))
-
-    """
-    helper = LayerHelper('prelu', **locals())
-    if mode not in ['all', 'channel', 'element']:
-        raise ValueError('mode should be one of all, channel, element.')
-    alpha_shape = [1]
-    if mode == 'channel':
-        alpha_shape = [1, x.shape[1], 1, 1]
-    elif mode == 'element':
-        alpha_shape = x.shape
-    dtype = helper.input_dtype(input_param_name='x')
-    alpha = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=alpha_shape,
-        dtype='float32',
-        is_bias=False,
-        default_initializer=Constant(1.0))
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="prelu",
-        inputs={"X": x,
-                'Alpha': alpha},
-        attrs={"mode": mode},
-        outputs={"Out": out})
-    return out
-
-
-@templatedoc()
-def brelu(x, t_min=0.0, t_max=24.0, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        t_min(${t_min_type}|0.0): ${t_min_comment}
-        t_max(${t_max_type}|24.0): ${t_max_comment}
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-    Returns:
-        output(${out_type}): ${out_comment}
-
-    Examples:
-
-    .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
-            y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0)
-    """
-    helper = LayerHelper('brelu', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='brelu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'t_min': t_min,
-               't_max': t_max})
-    return out
-
-
-@templatedoc()
-def leaky_relu(x, alpha=0.02, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        alpha(${alpha_type}|0.02): ${alpha_comment}
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-    Returns:
-        output(${out_type}): ${out_comment}
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
-            y = fluid.layers.leaky_relu(x, alpha=0.01)
-    """
-    helper = LayerHelper('leaky_relu', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='leaky_relu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'alpha': alpha})
-    return out
-
-
-@templatedoc()
-def soft_relu(x, threshold=40.0, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        threshold(${threshold_type}|40.0): ${threshold_comment}
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-    Returns:
-        output(${out_type}): ${out_comment}
-
-    Examples:
-
-        .. code-block:: python 
- 
-            import paddle.fluid as fluid
-   
-            x = fluid.layers.data(name="x", shape=[3,16,16], dtype="float32")
-            y = fluid.layers.soft_relu(x, threshold=20.0)
-    """
-    helper = LayerHelper('soft_relu', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='soft_relu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold})
-    return out
-
-
-def flatten(x, axis=1, name=None):
-    """
-    **Flatten layer**
-    Flattens the input tensor into a 2D matrix.
-
-    For Example:
-
-    .. code-block:: text
-
-        Case 1:
-
-          Given
-            X.shape = (3, 100, 100, 4)
-
-          and
-            axis = 2
-
-          We get:
-            Out.shape = (3 * 100, 4 * 100)
-
-        Case 2:
-
-          Given
-            X.shape = (3, 100, 100, 4)
-
-          and
-            axis = 0
-
-          We get:
-            Out.shape = (1, 3 * 100 * 100 * 4)
-
-    Args:
-        x (Variable): A tensor of rank >= axis.
-        axis (int): Indicate up to which input dimensions (exclusive) should
-                    be flattened to the outer dimension of the output.
-                    The value for axis must be in the range [0, R], where R
-                    is the rank of the input tensor. When axis = 0, the shape
-                    of the output tensor is (1, (d_0 X d_1 ... d_n), where the
-                    shape of the input tensor is (d_0, d_1, ... d_n).
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: A 2D tensor with the contents of the input tensor, with input \
-                  dimensions up to axis flattened to the outer dimension of \
-                  the output and remaining input dimensions flattened into the \
-                  inner dimension of the output.
-
-    Raises:
-        ValueError: If x is not a variable.
-        ValueError: If axis is not in range [0, rank(x)].
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[4, 4, 3], dtype="float32")
-            out = fluid.layers.flatten(x=x, axis=2)
-    """
-    helper = LayerHelper('flatten', **locals())
-
-    if not (isinstance(x, Variable)):
-        raise ValueError("The input x should be a Variable")
-
-    if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0:
-        raise ValueError("The axis should be a int, and in range [0, rank(x)]")
-
-    out = helper.create_variable_for_type_inference(x.dtype)
-    x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='flatten2',
-        inputs={"X": x},
-        outputs={'Out': out,
-                 'XShape': x_shape},
-        attrs={"axis": axis})
-    return out
-
-
-def sequence_enumerate(input, win_size, pad_value=0, name=None):
-    """
-    Generate a new sequence for the input index sequence, which enumerates all the
-    sub-sequences with length `win_size` of the input.
-    The enumerated sequence has the same 1st dimension with variable `input`, and
-    the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
-
-    .. code-block:: text
-
-        Case 1:
-
-          Input:
-            X.lod = [[0, 3, 5]]
-            X.data = [[1], [2], [3], [4], [5]]
-            X.dims = [5, 1]
-
-          Attrs:
-            win_size = 2
-            pad_value = 0
-
-          Output:
-            Out.lod = [[0, 3, 5]]
-            Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
-            Out.dims = [5, 2]
-
-    Args:
-        input (Variable): The input variable which is a index sequence.
-        win_size (int): The window size for enumerating all sub-sequences.
-        pad_value (int): The padding value, default 0.
-
-    Returns:
-        Variable: The enumerate sequence variable which is a LoDTensor.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            x = fluid.layers.data(name='x', shape=[-1, 1], dtype='int32', lod_level=1)
-            out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper('sequence_enumerate', **locals())
-    out = helper.create_variable_for_type_inference(
-        helper.input_dtype(), stop_gradient=True)
-    helper.append_op(
-        type='sequence_enumerate',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={'win_size': win_size,
-               'pad_value': pad_value})
-    return out
-
-
-def sequence_mask(x, maxlen=None, dtype='int64', name=None):
-    """
-    **SequenceMask Layer**
-
-    This layer outputs a mask according to the input :code:`x` and
-    :code:`maxlen` with data type of :code:`dtype`.
-
-    Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the
-    :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
-
-    .. math::
-
-        y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n))
-
-    Args:
-        x (Variable): Input tensor of sequence_mask layer,
-                      whose elements are integers less than :code:`maxlen`.
-        maxlen (int|None): Maximum length of the sequence. If :code:`maxlen`
-                           is None, it would be replace with :math:`max(x)`.
-        dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output.
-        name (str|None): A name for this layer(optional). If set None, the
-                         layer will be named automatically.
-
-    Returns:
-        Variable: The output sequence mask.
-
-    Examples:
-        .. code-block:: python
-	
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-
-            x = fluid.layers.data(name='x', shape=[10], dtype='float32', lod_level=1)
-            mask = layers.sequence_mask(x=x)
-
-    """
-    helper = LayerHelper('sequence_mask', **locals())
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=dtype)
-    else:
-        out = helper.create_variable_for_type_inference(dtype=dtype, name=name)
-
-    inputs = {'X': [x]}
-    attrs = {'out_dtype': out.dtype}
-    if maxlen is not None:
-        if isinstance(maxlen, Variable):
-            inputs['MaxLenTensor'] = maxlen
-        else:
-            attrs['maxlen'] = maxlen
-
-    helper.append_op(
-        type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs)
-
-    out.stop_gradient = True
-    return out
-
-
-def stack(x, axis=0):
-    """
-    **Stack Layer**
-
-    This layer stacks all of the input :code:`x` along axis.
-
-    Input :code:`x` can be a single variable, a :code:`list` of variables,
-    or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or
-    :code:`tuple`, the shapes of all these variables must be the same.
-    Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`,
-    the shape of the output variable would be
-    :math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`.
-    If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
-    If :code:`axis` is None, it would be replaced with 0.
-
-    For Example:
-
-    .. code-block:: text
-
-        Case 1:
-          Input:
-            x[0].data = [ [1.0 , 2.0 ] ]
-            x[0].dims = [1, 2]
-            x[1].data = [ [3.0 , 4.0 ] ]
-            x[1].dims = [1, 2]
-            x[2].data = [ [5.0 , 6.0 ] ]
-            x[2].dims = [1, 2]
-
-          Attrs:
-            axis = 0
-
-          Output:
-            Out.data =[ [ [1.0, 2.0] ],
-                        [ [3.0, 4.0] ],
-                        [ [5.0, 6.0] ] ]
-            Out.dims = [3, 1, 2]
-
-        Case 2:
-          Given
-            x[0].data = [ [1.0 , 2.0 ] ]
-            x[0].dims = [1, 2]
-            x[1].data = [ [3.0 , 4.0 ] ]
-            x[1].dims = [1, 2]
-            x[2].data = [ [5.0 , 6.0 ] ]
-            x[2].dims = [1, 2]
-
-          Attrs:
-            axis = 1 or axis = -2
-
-          Output:
-            Out.data =[ [ [1.0, 2.0]
-                          [3.0, 4.0]
-                          [5.0, 6.0] ] ]
-            Out.dims = [1, 3, 2]
-
-    Args:
-        x (Variable|list(Variable)|tuple(Variable)): Input variables.
-        axis (int|None): The axis along which all inputs are stacked.
-
-    Returns:
-        Variable: The stacked variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            x1 = layers.data(name='x1', shape=[1, 2], dtype='int32')
-            x2 = layers.data(name='x2', shape=[1, 2], dtype='int32')
-            data = layers.stack([x1,x2])
-
-    """
-
-    helper = LayerHelper('stack', **locals())
-    axis = 0 if axis is None else axis
-
-    if not isinstance(x, list) and not isinstance(x, tuple):
-        x = [x]
-
-    out = helper.create_variable_for_type_inference(x[0].dtype)
-    helper.append_op(
-        type='stack', inputs={'X': x}, outputs={'Y': out},
-        attrs={'axis': axis})
-
-    return out
-
-
-@templatedoc(op_type="filter_by_instag")
-def filter_by_instag(ins, ins_tag, filter_tag, is_lod):
-    """
-    **Filter By Instag Layer**
-   
-    This function filter a batch of ins by instag, 
-    There are multiple ins, and every ins belongs to some tags. 
-    We can specify some tags we want. So the ins which belongs to that tags
-    remains in the output, and others removed.
- 
-    For example, one batch has 4 ins. Every ins has its tag list. 
-     
-       | Ins   |   Ins_Tag |
-       |:-----:|:------:|
-       |  0    |   0, 1 |
-       |  1    |   1, 3 |
-       |  2    |   0, 3 |
-       |  3    |   2, 6 |
-
-    And Lod is [1,1,1,1]
-
-    And the filter tags [1]
-
-    From the definition above, ins which has tag 1 can pass the filter
-    So Ins 0 and Ins 1 can pass and be seen in the output,
-    Ins 2 and 3 cannot pass because they do not has tag 1.
-
-    Actually, if is_lod is false, it is normal tensor that equals to 
-    lod_tensor with all 1, similar to the example above.
-
-    Args:
-        ins (Variable): Input Variable (LoDTensor), usually it is 2D tensor
-                        And first dimension can have lod info or not.
-        ins_tag (Variable): Input Variable (LoDTensor), usually it is 1D list
-                        And split them by lod info
-        filter_tag (Variable): Input Variable (1D Tensor/List), usually it is 
-                        list that holds the tags.
-        is_lod (Bool): Boolean value to indicate ins is lod tensor or not.
-
-    Returns:
-        Variable: filtered ins (LoDTensor) and loss weight (Tensor)
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid.layers as layers
-          ins = layers.data(name='Ins', shape=[-1,32], lod_level=0, dtype='float64')
-          ins_tag = layers.data(name='Ins_tag', shape=[-1,16], lod_level=0, dtype='int64')
-          filter_tag = layers.data(name='Filter_tag', shape=[-1,16], dtype='int64')
-          out, loss_weight = layers.filter_by_instag(ins,  ins_tag,  filter_tag, True)
-        		
-    """
-    helper = LayerHelper('filter_by_instag', **locals())
-
-    out = helper.create_variable_for_type_inference(dtype=ins.dtype)
-    loss_weight = helper.create_variable_for_type_inference(dtype=np.float64)
-    mmap = helper.create_variable_for_type_inference(dtype=ins_tag.dtype)
-    helper.append_op(
-        type='filter_by_instag',
-        inputs={'Ins': ins,
-                'Ins_tag': ins_tag,
-                'Filter_tag': filter_tag},
-        outputs={'Out': out,
-                 'LossWeight': loss_weight,
-                 'IndexMap': mmap},
-        attrs={'is_lod': is_lod})
-
-    return [out, loss_weight]
-
-
-def unstack(x, axis=0, num=None):
-    """
-    **UnStack Layer**
-
-    This layer unstacks input :code:`x` into several tensors along axis.
-
-    If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`.
-    If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`,
-    and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is
-    raised.
-
-    Args:
-        x (Variable): Input variable.
-        axis (int): The axis along which the input is unstacked.
-        num (int|None): The number of output variables.
-
-    Returns:
-        list(Variable): The unstacked variables.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[5, 10], dtype='float32')
-            y = fluid.layers.unstack(x, axis=1)
-    """
-
-    helper = LayerHelper('unstack', **locals())
-    if num is None:
-        if axis is None or x.shape[axis] <= 0:
-            raise ValueError('unknown unstack number')
-        else:
-            num = x.shape[axis]
-
-    outs = []
-    for _ in range(num):
-        outs.append(helper.create_variable_for_type_inference(x.dtype))
-
-    helper.append_op(
-        type='unstack',
-        inputs={'X': [x]},
-        outputs={'Y': outs},
-        attrs={'axis': axis,
-               'num': num})
-    return outs
-
-
-def expand(x, expand_times, name=None):
-    """Expand operator tiles the input by given times number. You should set times
-    number for each dimension by providing attribute 'expand_times'. The rank of X
-    should be in [1, 6]. Please note that size of 'expand_times' must be the same
-    with X's rank. Following is a using case:
-
-
-    .. code-block:: text
-
-        Input(X) is a 3-D tensor with shape [2, 3, 1]:
-
-                [
-                   [[1], [2], [3]],
-                   [[4], [5], [6]]
-                ]
-
-        Attr(expand_times):  [1, 2, 2]
-
-        Output(Out) is a 3-D tensor with shape [2, 6, 2]:
-
-                [
-                    [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
-                    [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
-                ]
-
-    Args:
-        x (Variable): A tensor with rank in [1, 6].
-        expand_times (list|tuple|Variable): Expand times number for each dimension.
-
-    Returns:
-        Variable: The expanded variable which is a LoDTensor. After expanding, size of each dimension of Output(Out) is equal to ithe size of the corresponding dimension of Input(X) multiplying the corresponding value given by expand_times.
-
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            # example 1:
-            data_1 = fluid.layers.fill_constant(shape=[2, 3, 1], dtype='int32', value=0)
-            expanded_1 = fluid.layers.expand(data_1, expand_times=[1, 2, 2])
-
-            # example 2:
-            data_2 = fluid.layers.fill_constant(shape=[12, 14], dtype="int32", value=3)
-            expand_times = fluid.layers.fill_constant(shape=[2], dtype="int32", value=4)
-            expanded_2 = fluid.layers.expand(data_2, expand_times=expand_times)
-    """
-
-    if not isinstance(expand_times, (list, tuple, Variable)):
-        raise ValueError(
-            "Input expand_times must be an Variable, python list or tuple.")
-
-    helper = LayerHelper('expand', input=x, **locals())
-    inputs = {"X": x}
-    attrs = {}
-
-    def contain_var(expand_times):
-        for ele in expand_times:
-            if isinstance(ele, Variable):
-                return True
-        return False
-
-    def get_attr_expand_times(list_expand_times):
-        attrs_expand_times = []
-        for idx, times in enumerate(list_expand_times):
-            if isinstance(times, Variable):
-                attrs_expand_times.append(-1)
-            else:
-                attrs_expand_times.append(times)
-                assert times > 0, (
-                    "Each element given in expand_times must not be negtive.")
-        return attrs_expand_times
-
-    def get_new_expand_times_tensor(list_expand_times):
-        new_expand_times_tensor = []
-        for ele in list_expand_times:
-            if isinstance(ele, Variable):
-                ele.stop_gradient = True
-                new_expand_times_tensor.append(ele)
-            else:
-                assert (isinstance(ele, int))
-                temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant([1], 'int32', ele, force_cpu=True, out=temp_out)
-                new_expand_times_tensor.append(temp_out)
-        return new_expand_times_tensor
-
-    if in_dygraph_mode():
-        inputs = {'X': x}
-        attrs = {'expand_times': expand_times}
-    else:
-        if isinstance(expand_times, Variable):
-            expand_times.stop_gradient = True
-            inputs['ExpandTimes'] = expand_times
-        elif isinstance(expand_times, (list, tuple)):
-            attrs['expand_times'] = get_attr_expand_times(expand_times)
-            if contain_var(expand_times):
-                inputs['expand_times_tensor'] = get_new_expand_times_tensor(
-                    expand_times)
-
-    dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='expand', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-    return out
-
-
-from paddle.fluid.framework import convert_np_dtype_to_dtype_
-
-
-@templatedoc()
-def uniform_random_batch_size_like(input,
-                                   shape,
-                                   dtype='float32',
-                                   input_dim_idx=0,
-                                   output_dim_idx=0,
-                                   min=-1.0,
-                                   max=1.0,
-                                   seed=0):
-    """
-    ${comment}
-
-    Args:
-        input (Variable): ${input_comment}
-        shape (tuple|list): ${shape_comment}
-        input_dim_idx (Int): ${input_dim_idx_comment}
-        output_dim_idx (Int): ${output_dim_idx_comment}
-        min (Float): ${min_comment}
-        max (Float): ${max_comment}
-        seed (Int): ${seed_comment}
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
-    Returns:
-        out (Variable): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers 
-
-            input = layers.data(name="input", shape=[13, 11], dtype='float32')
-            out = layers.uniform_random_batch_size_like(input, [-1, 11])
-    """
-
-    helper = LayerHelper('uniform_random_batch_size_like', **locals())
-    out = helper.create_variable_for_type_inference(dtype)
-    c_dtype = convert_np_dtype_to_dtype_(dtype)
-    helper.append_op(
-        type='uniform_random_batch_size_like',
-        inputs={'Input': input},
-        outputs={'Out': out},
-        attrs={
-            'shape': shape,
-            'input_dim_idx': input_dim_idx,
-            'output_dim_idx': output_dim_idx,
-            'min': min,
-            'max': max,
-            'seed': seed,
-            'dtype': c_dtype
-        })
-
-    return out
-
-
-@templatedoc()
-def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
-    """
-    ${comment}
-
-    Args:
-        shape (tuple|list): ${shape_comment}
-        mean (Float): ${mean_comment}
-        std (Float): ${std_comment}
-        seed (Int): ${seed_comment}
-        dtype(np.dtype|core.VarDesc.VarType|str): Output data type.
-
-    Returns:
-        out (Variable): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            out = layers.gaussian_random(shape=[20, 30])
-    """
-
-    helper = LayerHelper('gaussian_random', **locals())
-    out = helper.create_variable_for_type_inference(dtype)
-    c_dtype = convert_np_dtype_to_dtype_(dtype)
-    helper.append_op(
-        type='gaussian_random',
-        outputs={'Out': out},
-        attrs={
-            'shape': shape,
-            'mean': mean,
-            'std': std,
-            'seed': seed,
-            'dtype': c_dtype,
-            'use_mkldnn': False
-        })
-
-    return out
-
-
-@templatedoc()
-def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
-    """
-    ${comment}
-
-    Args:
-        x (Variable): ${x_comment}
-        min (Float): ${min_comment}
-        max (Float): ${max_comment}
-        seed (Float): ${seed_comment}
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
-
-    Returns:
-        out (Variable): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(
-                name="X",
-                shape=[13, 11],
-                dtype='float32',
-                append_batch_size=False)
-
-            out = fluid.layers.sampling_id(x)
-    """
-
-    helper = LayerHelper('sampling_id', **locals())
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='sampling_id',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'min': min,
-               'max': max,
-               'seed': seed})
-
-    return out
-
-
-@templatedoc()
-def gaussian_random_batch_size_like(input,
-                                    shape,
-                                    input_dim_idx=0,
-                                    output_dim_idx=0,
-                                    mean=0.0,
-                                    std=1.0,
-                                    seed=0,
-                                    dtype='float32'):
-    """
-    ${comment}
-
-    Args:
-        input (Variable): ${input_comment}
-        shape (tuple|list): ${shape_comment}
-        input_dim_idx (Int): ${input_dim_idx_comment}
-        output_dim_idx (Int): ${output_dim_idx_comment}
-        mean (Float): ${mean_comment}
-        std (Float): ${std_comment}
-        seed (Int): ${seed_comment}
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
-
-    Returns:
-        out (Variable): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[13, 11], dtype='float32')
-
-            out = fluid.layers.gaussian_random_batch_size_like(
-                input, shape=[-1, 11], mean=1.0, std=2.0)
-    """
-
-    helper = LayerHelper('gaussian_random_batch_size_like', **locals())
-    out = helper.create_variable_for_type_inference(dtype)
-    c_dtype = convert_np_dtype_to_dtype_(dtype)
-    helper.append_op(
-        type='gaussian_random_batch_size_like',
-        inputs={'Input': input},
-        outputs={'Out': out},
-        attrs={
-            'shape': shape,
-            'input_dim_idx': input_dim_idx,
-            'output_dim_idx': output_dim_idx,
-            'mean': mean,
-            'std': std,
-            'seed': seed,
-            'dtype': c_dtype
-        })
-
-    return out
-
-
-@templatedoc()
-def sum(x):
-    """
-    ${comment}
-
-    Args:
-        x (Variable): ${x_comment}
-
-    Returns:
-        out (Variable): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            input0 = layers.data(name="input0", shape=[13, 11], dtype='float32')
-            input1 = layers.data(name="input1", shape=[13, 11], dtype='float32')
-            out = layers.sum([input0,input1])
-    """
-
-    helper = LayerHelper('sum', **locals())
-    out = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype('x'))
-    helper.append_op(
-        type='sum',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'use_mkldnn': False})
-
-    return out
-
-
-@templatedoc()
-def slice(input, axes, starts, ends):
-    """
-    Slice Operator.
-
-    Produces a slice of the input tensor along multiple axes. Similar to numpy:
-    https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
-    Slice uses `axes`, `starts` and `ends` attributes to specify the start and
-    end dimension for each axis in the list of axes, it uses this information
-    to slice the input data tensor. If a negative value is passed for any of
-    the start or end indices, it represents number of elements before the end
-    of that dimension. If the value passed to start or end is larger than
-    the n (the number of elements in this dimension), it represents n.
-    For slicing to the end of a dimension with unknown size, it is recommended
-    to pass in INT_MAX. The size of axes must be equal to starts\' and ends\'.
-    Following examples will explain how slice works:
-
-    .. code-block:: text
-
-        Case1:
-            Given:
-                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-                axes = [0, 1]
-                starts = [1, 0]
-                ends = [2, 3]
-            Then:
-                result = [ [5, 6, 7], ]
-        
-        Case2:
-            Given:
-                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-                axes = [0, 1]
-                starts = [0, 1]
-                ends = [-1, 1000]
-            Then:
-                result = [ [2, 3, 4], ]
-    Args:
-        input (Variable): ${input_comment}.
-        axes (List): ${axes_comment}
-        starts (List|Variable): ${starts_comment}
-        ends (List|Variable): ${ends_comment}
-
-    Returns:
-        out (Variable): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            input = fluid.layers.data(
-                name="input", shape=[3, 4, 5, 6], dtype='float32')
-
-            # example 1:
-            # attr starts is a list which doesn't contain tensor Variable.
-            axes = [0, 1, 2]
-            starts = [-3, 0, 2]
-            ends = [3, 2, 4]
-            sliced_1 = fluid.layers.slice(input, axes=axes, starts=starts, ends=ends)
-
-            # example 2:
-            # attr starts is a list which contain tensor Variable.
-            minus_3 = fluid.layers.fill_constant([1], "int32", -3)
-            sliced_2 = fluid.layers.slice(input, axes=axes, starts=[minus_3, 0, 2], ends=ends)
-    """
-
-    if not isinstance(starts, (list, tuple, Variable)):
-        raise ValueError(
-            "Input starts must be an Variable, python list or tuple.")
-    if not isinstance(ends, (list, tuple, Variable)):
-        raise ValueError(
-            "Input ends must be an Variable, python list or tuple.")
-
-    helper = LayerHelper('slice', **locals())
-
-    def contain_var(one_list):
-        for ele in one_list:
-            if isinstance(ele, Variable):
-                return True
-        return False
-
-    def get_new_list_tensor(old_list):
-        new_list_tensor = []
-        for dim in old_list:
-            if isinstance(dim, Variable):
-                dim.stop_gradient = True
-                new_list_tensor.append(dim)
-            else:
-                assert (isinstance(dim, int))
-                temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out)
-                new_list_tensor.append(temp_out)
-        return new_list_tensor
-
-    inputs = {'Input': input}
-    attrs = {'axes': axes}
-    infer_flags = list(1 for i in range(len(axes)))
-
-    if in_dygraph_mode():
-        inputs = {'Input': input}
-        attrs = {
-            'axes': axes,
-            'starts': starts,
-            'ends': ends,
-            'infer_flags': infer_flags
-        }
-    else:
-        # starts
-        if isinstance(starts, Variable):
-            starts.stop_gradient = True
-            inputs['StartsTensor'] = starts
-            infer_flags = list(-1 for i in range(len(axes)))
-        elif isinstance(starts, (list, tuple)):
-            attrs['starts'] = []
-            if not contain_var(starts):
-                attrs['starts'] = starts
-            else:
-                inputs['StartsTensorList'] = get_new_list_tensor(starts)
-                for i, dim in enumerate(starts):
-                    if isinstance(dim, Variable):
-                        attrs['starts'].append(-1)
-                        infer_flags[i] = -1
-                    else:
-                        attrs['starts'].append(dim)
-
-        # ends
-        if isinstance(ends, Variable):
-            ends.stop_gradient = True
-            inputs['EndsTensor'] = ends
-            infer_flags = list(-1 for i in range(len(axes)))
-        elif isinstance(ends, (list, tuple)):
-            attrs['ends'] = []
-            if not contain_var(ends):
-                attrs['ends'] = ends
-            else:
-                inputs['EndsTensorList'] = get_new_list_tensor(ends)
-                for i, dim in enumerate(ends):
-                    if isinstance(dim, Variable):
-                        attrs['ends'].append(-1)
-                        infer_flags[i] = -1
-                    else:
-                        attrs['ends'].append(dim)
-        # infer_flags
-        attrs['infer_flags'] = infer_flags
-    out = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype('input'))
-    helper.append_op(
-        type='slice', inputs=inputs, attrs=attrs, outputs={'Out': out})
-
-    return out
-
-
-@templatedoc()
-def strided_slice(input, axes, starts, ends, strides):
-    """
-    Strided Slice OP
-
-    The conceptualization that really helped me understand this was 
-    that this function emulates the indexing behavior of numpy arrays.
-    If you're familiar with numpy arrays, you'll know that you can make 
-    slices via input[start1:end1:step1, start2:end2:step2, ... startN:endN:stepN]. 
-    Basically, a very succinct way of writing for loops to get certain elements of the array.
-    strided_slice just allows you to do this fancy indexing without the syntactic sugar. 
-    The numpy (#input[start1:end1:step1, start2:end2:step2, ... startN:endN:stepN])
-    example from above just becomes fluid.strided_slice(input,[0, 1, ..., N], 
-    [start1, start2, ..., startN], [end1, end2, ..., endN], [strides1, strides2, ..., stridesN]),
-    the axes which controls the dimension you want to slice makes it more flexible.
-
-    .. code-block:: text
-
-        Case1:
-            Given:
-                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-                axes = [0, 1]
-                starts = [1, 0]
-                ends = [2, 3]
-                strides=[1, 1]
-            Then:
-                result = [ [5, 6, 7], ]
-        
-        Case2:
-            Given:
-                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-                axes = [0, 1]
-                starts = [0, 1]
-                ends = [-1, 1000]
-                strides = [1, 3]
-            Then:
-                result = [ [2], ]
-    Args:
-        input (Variable): ${input_comment}.
-        axes (List): ${axes_comment}
-        starts (List|Variable): ${starts_comment}
-        ends (List|Variable): ${ends_comment}
-
-    Returns:
-        out (Variable): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            input = fluid.layers.data(
-                name="input", shape=[3, 4, 5, 6], dtype='float32')
-
-            # example 1:
-            # attr starts is a list which doesn't contain tensor Variable.
-            axes = [0, 1, 2]
-            starts = [-3, 0, 2]
-            ends = [3, 2, 4]
-            strides=[1, 1, 1]
-            sliced_1 = fluid.layers.strided_slice(input, axes=axes, starts=starts, ends=ends, strides=strides)
-
-            # example 2:
-            # attr starts is a list which contain tensor Variable.
-            minus_3 = fluid.layers.fill_constant([1], "int32", -3)
-            sliced_2 = fluid.layers.strided_slice(input, axes=axes, starts=[minus_3, 0, 2], ends=ends, strides=strides)
-    """
-    if not isinstance(starts, (list, tuple, Variable)):
-        raise ValueError(
-            "Input starts must be an Variable, python list or tuple.")
-    if not isinstance(ends, (list, tuple, Variable)):
-        raise ValueError(
-            "Input ends must be an Variable, python list or tuple.")
-    if not isinstance(strides, (list, tuple, Variable)):
-        raise ValueError(
-            "Input strides must be an Variable, python list or tuple.")
-
-    helper = LayerHelper('strided_slice', **locals())
-
-    def contain_var(one_list):
-        for ele in one_list:
-            if isinstance(ele, Variable):
-                return True
-        return False
-
-    def get_new_list_tensor(old_list):
-        new_list_tensor = []
-        for dim in old_list:
-            if isinstance(dim, Variable):
-                dim.stop_gradient = True
-                new_list_tensor.append(dim)
-            else:
-                assert (isinstance(dim, int))
-                temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out)
-                new_list_tensor.append(temp_out)
-        return new_list_tensor
-
-    inputs = {'Input': input}
-    attrs = {'axes': axes}
-    infer_flags = list(1 for i in range(len(axes)))
-
-    if in_dygraph_mode():
-        inputs = {'Input': input}
-        attrs = {
-            'axes': axes,
-            'starts': starts,
-            'ends': ends,
-            'strides': strides,
-            'infer_flags': infer_flags
-        }
-    else:
-        # starts
-        if isinstance(starts, Variable):
-            starts.stop_gradient = True
-            inputs['StartsTensor'] = starts
-        elif isinstance(starts, (list, tuple)):
-            attrs['starts'] = []
-            if not contain_var(starts):
-                attrs['starts'] = starts
-            else:
-                inputs['StartsTensorList'] = get_new_list_tensor(starts)
-                for i, dim in enumerate(starts):
-                    if isinstance(dim, Variable):
-                        attrs['starts'].append(-1)
-                        infer_flags[i] = -1
-                    else:
-                        attrs['starts'].append(dim)
-
-        # ends
-        if isinstance(ends, Variable):
-            ends.stop_gradient = True
-            inputs['EndsTensor'] = ends
-        elif isinstance(ends, (list, tuple)):
-            attrs['ends'] = []
-            if not contain_var(ends):
-                attrs['ends'] = ends
-            else:
-                inputs['EndsTensorList'] = get_new_list_tensor(ends)
-                for i, dim in enumerate(ends):
-                    if isinstance(dim, Variable):
-                        attrs['ends'].append(-1)
-                        infer_flags[i] = -1
-                    else:
-                        attrs['ends'].append(dim)
-        # strides
-        if isinstance(strides, Variable):
-            strides.stop_gradient = True
-            inputs['StridesTensor'] = strides
-        elif isinstance(strides, (list, tuple)):
-            attrs['strides'] = []
-            if not contain_var(strides):
-                attrs['strides'] = strides
-            else:
-                inputs['StridesTensorList'] = get_new_list_tensor(strides)
-                for i, dim in enumerate(strides):
-                    if isinstance(dim, Variable):
-                        attrs['strides'].append(-1)
-                        infer_flags[i] = -1
-                    else:
-                        attrs['strides'].append(dim)
-        attrs['infer_flags'] = infer_flags
-    out = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype('input'))
-    helper.append_op(
-        type='strided_slice', inputs=inputs, attrs=attrs, outputs={'Out': out})
-
-    return out
-
-
-def shape(input):
-    """
-    **Shape Layer**
-
-    Get the shape of the input.
-
-    Args:
-        input (Variable): The input variable.
-
-    Returns:
-        Variable: The shape of the input variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            input = fluid.layers.data(
-                name="input", shape=[3, 100, 100], dtype="float32")
-            out = fluid.layers.shape(input)
-    """
-
-    helper = LayerHelper('shape', **locals())
-    out = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type='shape', inputs={'Input': input}, outputs={'Out': out})
-
-    return out
-
-
-def rank(input):
-    """
-    **Rank Layer**
-
-    Returns the number of dimensions for a tensor, which is a 0-D int32 Tensor.
-
-    Args:
-        input (Variable): The input variable.
-
-    Returns:
-        Variable: The rank of the input variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            input = fluid.layers.data(name="input", shape=[3, 100, 100], dtype="float32")
-            rank = fluid.layers.rank(input) # 4
-    """
-
-    ndims = len(input.shape)
-    out = assign(np.array(ndims, 'int32'))
-
-    return out
-
-
-def size(input):
-    """
-    **Size Layer**
-
-    Returns the number of elements for a tensor, which is a int64 Tensor with shape [1].
-
-    Args:
-        input (Variable): The input variable.
-
-    Returns:
-        Variable: The number of elements for the input variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid.layers as layers
-
-            input = layers.data(
-                name="input", shape=[3, 100], dtype="float32", append_batch_size=False)
-            rank = layers.size(input) # 300
-    """
-
-    helper = LayerHelper('size', **locals())
-    out = helper.create_variable_for_type_inference(dtype='int64')
-    helper.append_op(type='size', inputs={'Input': input}, outputs={'Out': out})
-
-    return out
-
-
-def _elementwise_op(helper):
-    op_type = helper.layer_type
-    x = helper.kwargs.get('x', None)
-    y = helper.kwargs.get('y', None)
-    if in_dygraph_mode():
-        x = base.to_variable(x)
-        y = base.to_variable(y)
-
-    assert x is not None, 'x cannot be None in {}'.format(op_type)
-    assert y is not None, 'y cannot be None in {}'.format(op_type)
-    axis = helper.kwargs.get('axis', -1)
-    use_mkldnn = helper.kwargs.get('use_mkldnn', False)
-    name = helper.kwargs.get('name', None)
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type=op_type,
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out},
-        attrs={'axis': axis,
-               'use_mkldnn': use_mkldnn})
-    return helper.append_activation(out)
-
-
-@templatedoc()
-def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        scale(${scale_type}): ${scale_comment}
-        bias(${bias_type}): ${bias_comment}
-        bias_after_scale(${bias_after_scale_type}): ${bias_after_scale_comment}
-        act(basestring|None): Activation applied to the output.
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            x = fluid.layers.data(name="X", shape=[1, 2, 5, 5], dtype='float32')
-            y = fluid.layers.scale(x, scale = 2.0, bias = 1.0)
-    """
-
-    helper = LayerHelper('scale', **locals())
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type='scale',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={
-            'scale': float(scale),
-            'bias': float(bias),
-            'bias_after_scale': bias_after_scale
-        })
-    return helper.append_activation(out)
-
-
-def elementwise_add(x, y, axis=-1, act=None, name=None):
-    return _elementwise_op(LayerHelper('elementwise_add', **locals()))
-
-
-def elementwise_div(x, y, axis=-1, act=None, name=None):
-    return _elementwise_op(LayerHelper('elementwise_div', **locals()))
-
-
-def elementwise_sub(x, y, axis=-1, act=None, name=None):
-    return _elementwise_op(LayerHelper('elementwise_sub', **locals()))
-
-
-def elementwise_mul(x, y, axis=-1, act=None, name=None):
-    return _elementwise_op(LayerHelper('elementwise_mul', **locals()))
-
-
-def elementwise_max(x, y, axis=-1, act=None, name=None):
-    return _elementwise_op(LayerHelper('elementwise_max', **locals()))
-
-
-def elementwise_min(x, y, axis=-1, act=None, name=None):
-    return _elementwise_op(LayerHelper('elementwise_min', **locals()))
-
-
-def elementwise_pow(x, y, axis=-1, act=None, name=None):
-    return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
-
-
-def elementwise_mod(x, y, axis=-1, act=None, name=None):
-    return _elementwise_op(LayerHelper('elementwise_mod', **locals()))
-
-
-def elementwise_floordiv(x, y, axis=-1, act=None, name=None):
-    return _elementwise_op(LayerHelper('elementwise_floordiv', **locals()))
-
-
-for func in [
-        elementwise_add,
-        elementwise_div,
-        elementwise_sub,
-        elementwise_mul,
-        elementwise_max,
-        elementwise_min,
-        elementwise_pow,
-        elementwise_mod,
-        elementwise_floordiv,
-]:
-    op_proto = OpProtoHolder.instance().get_op_proto(func.__name__)
-    func.__doc__ = _generate_doc_string_(
-        op_proto,
-        additional_args_lines=[
-            "act (basestring|None): Activation applied to the output.",
-            "name (basestring|None): Name of the output."
-        ])
-    func.__doc__ = func.__doc__ + """
-
-Examples:
-  .. code-block:: python
-    
-    import paddle.fluid as fluid
-    # example 1: shape(x) = (2, 3, 4, 5), shape(y) = (2, 3, 4, 5)
-    x0 = fluid.layers.data(name="x0", shape=[2, 3, 4, 5], dtype='float32')
-    y0 = fluid.layers.data(name="y0", shape=[2, 3, 4, 5], dtype='float32')
-    z0 = fluid.layers.%s(x0, y0)
-
-    # example 2: shape(X) = (2, 3, 4, 5), shape(Y) = (5)
-    x1 = fluid.layers.data(name="x1", shape=[2, 3, 4, 5], dtype='float32')
-    y1 = fluid.layers.data(name="y1", shape=[5], dtype='float32')
-    z1 = fluid.layers.%s(x1, y1)
-
-    # example 3: shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2
-    x2 = fluid.layers.data(name="x2", shape=[2, 3, 4, 5], dtype='float32')
-    y2 = fluid.layers.data(name="y2", shape=[4, 5], dtype='float32')
-    z2 = fluid.layers.%s(x2, y2, axis=2)
-
-    # example 4: shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
-    x3 = fluid.layers.data(name="x3", shape=[2, 3, 4, 5], dtype='float32')
-    y3 = fluid.layers.data(name="y3", shape=[3, 4], dtype='float32')
-    z3 = fluid.layers.%s(x3, y3, axis=1)
-
-    # example 5: shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
-    x4 = fluid.layers.data(name="x4", shape=[2, 3, 4, 5], dtype='float32')
-    y4 = fluid.layers.data(name="y4", shape=[2], dtype='float32')
-    z4 = fluid.layers.%s(x4, y4, axis=0)
-
-    # example 6: shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-    x5 = fluid.layers.data(name="x5", shape=[2, 3, 4, 5], dtype='float32')
-    y5 = fluid.layers.data(name="y5", shape=[2], dtype='float32')
-    z5 = fluid.layers.%s(x5, y5, axis=0)
-    """ % (func.__name__, func.__name__, func.__name__, func.__name__,
-           func.__name__, func.__name__)
-
-
-def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
-    helper = LayerHelper(op_name, **locals())
-
-    if binary_op:
-        assert x.dtype == y.dtype
-
-    if out is None:
-        if name is None:
-            out = helper.create_variable_for_type_inference(dtype=x.dtype)
-        else:
-            out = helper.create_variable(
-                name=name, dtype=x.dtype, persistable=False)
-
-    if binary_op:
-        helper.append_op(
-            type=op_name, inputs={"X": x,
-                                  "Y": y}, outputs={"Out": out})
-    else:
-        helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out})
-
-    return out
-
-
-@templatedoc()
-def logical_and(x, y, out=None, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        y(${y_type}): ${y_comment}
-        out(Tensor): Output tensor of logical operation.
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            left = fluid.layers.data(
-                name='left', shape=[1], dtype='bool')
-            right = fluid.layers.data(
-                name='right', shape=[1], dtype='bool')
-            result = fluid.layers.logical_and(x=left, y=right)
-    """
-
-    return _logical_op(
-        op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True)
-
-
-@templatedoc()
-def logical_or(x, y, out=None, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        y(${y_type}): ${y_comment}
-        out(Tensor): Output tensor of logical operation.
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            left = fluid.layers.data(
-                name='left', shape=[1], dtype='bool')
-            right = fluid.layers.data(
-                name='right', shape=[1], dtype='bool')
-            result = fluid.layers.logical_or(x=left, y=right)
-    """
-
-    return _logical_op(
-        op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True)
-
-
-@templatedoc()
-def logical_xor(x, y, out=None, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        y(${y_type}): ${y_comment}
-        out(Tensor): Output tensor of logical operation.
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            left = fluid.layers.data(
-                name='left', shape=[1], dtype='bool')
-            right = fluid.layers.data(
-                name='right', shape=[1], dtype='bool')
-            result = fluid.layers.logical_xor(x=left, y=right)
-    """
-
-    return _logical_op(
-        op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True)
-
-
-@templatedoc()
-def logical_not(x, out=None, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        out(Tensor): Output tensor of logical operation.
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            left = fluid.layers.data(
-                name='left', shape=[1], dtype='bool')
-            result = fluid.layers.logical_not(x=left)
-    """
-
-    return _logical_op(
-        op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False)
-
-
-@templatedoc()
-def clip(x, min, max, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        min(${min_type}): ${min_comment}
-        max(${max_type}): ${max_comment}
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(
-                name='data', shape=[1], dtype='float32')
-            reward = fluid.layers.clip(x=input, min=-1.0, max=1.0)
-    """
-
-    helper = LayerHelper("clip", **locals())
-
-    if name is None:
-        name = unique_name.generate_with_ignorable_key(".".join(
-            [helper.name, 'tmp']))
-
-    out = helper.create_variable(
-        type=x.type, name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type="clip",
-        inputs={"X": x},
-        attrs={"min": min,
-               "max": max},
-        outputs={"Out": out})
-
-    return out
-
-
-@templatedoc()
-def clip_by_norm(x, max_norm, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        max_norm(${max_norm_type}): ${max_norm_comment}
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(
-                name='data', shape=[1], dtype='float32')
-            reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0)
-    """
-
-    helper = LayerHelper("clip_by_norm", **locals())
-
-    if name is None:
-        name = unique_name.generate_with_ignorable_key(".".join(
-            [helper.name, 'tmp']))
-
-    out = helper.create_variable(
-        type=x.type, name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type="clip_by_norm",
-        inputs={"X": x},
-        attrs={"max_norm": max_norm},
-        outputs={"Out": out})
-
-    return out
-
-
-@templatedoc()
-def mean(x, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(
-                name='data', shape=[2, 3], dtype='float32')
-            mean = fluid.layers.mean(input)
-    """
-
-    helper = LayerHelper("mean", **locals())
-
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type="mean", inputs={"X": x}, attrs={}, outputs={"Out": out})
-
-    return out
-
-
-@templatedoc()
-def merge_selected_rows(x, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            b = fluid.default_main_program().global_block()
-            var = b.create_var(
-                name="X", dtype="float32", persistable=True,
-                type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
-            y = fluid.layers.merge_selected_rows(var)
-    """
-
-    helper = LayerHelper("merge_selected_rows", **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="merge_selected_rows",
-        inputs={"X": x},
-        attrs={},
-        outputs={"Out": out})
-    return out
-
-
-@templatedoc()
-def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        y(${y_type}): ${y_comment}
-        x_num_col_dims(${x_num_col_dims_type}): ${x_num_col_dims_comment}
-        y_num_col_dims(${y_num_col_dims_type}): ${y_num_col_dims_comment}
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-            dataX = fluid.layers.data(name="dataX", append_batch_size = False, shape=[2, 5], dtype="float32")
-            dataY = fluid.layers.data(name="dataY", append_batch_size = False, shape=[5, 3], dtype="float32")
-            output = fluid.layers.mul(dataX, dataY,
-                                      x_num_col_dims = 1,
-                                      y_num_col_dims = 1)
-            
-
-    """
-
-    helper = LayerHelper("mul", **locals())
-
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type="mul",
-        inputs={"X": x,
-                "Y": y},
-        attrs={
-            "x_num_col_dims": x_num_col_dims,
-            "y_num_col_dims": y_num_col_dims
-        },
-        outputs={"Out": out})
-    return out
-
-
-@templatedoc()
-def sigmoid_cross_entropy_with_logits(x,
-                                      label,
-                                      ignore_index=kIgnoreIndex,
-                                      name=None,
-                                      normalize=False):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        label(${label_type}): ${label_comment}
-        ignore_index(&{ignore_index}): ${ignore_index_comment}
-        name(basestring|None): Name of the output.
-        normalize(bool): If true, divide the output by the number of
-            targets != ignore_index.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(
-                name='data', shape=[10], dtype='float32')
-            label = fluid.layers.data(
-                name='data', shape=[10], dtype='float32')
-            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=input,
-                label=label,
-                ignore_index=-1,
-                normalize=True) # or False
-            # loss = fluid.layers.reduce_sum(loss) # summation of loss
-    """
-
-    helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
-
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type="sigmoid_cross_entropy_with_logits",
-        inputs={"X": x,
-                "Label": label},
-        attrs={"ignore_index": ignore_index,
-               'normalize': normalize},
-        outputs={"Out": out})
-    return out
-
-
-@templatedoc()
-def maxout(x, groups, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        groups(${groups_type}): ${groups_comment}
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(
-                name='data', 
-                shape=[256, 32, 32], 
-                dtype='float32')
-            out = fluid.layers.maxout(input, groups=2)
-    """
-    helper = LayerHelper("maxout", **locals())
-
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type="maxout",
-        inputs={"X": x},
-        attrs={"groups": groups},
-        outputs={"Out": out})
-    return out
-
-
-def space_to_depth(x, blocksize, name=None):
-    """
-    Gives a blocksize to space_to_depth the input LoDtensor with Layout: [batch, channel, height, width]
-
-    This op rearranges blocks of spatial data, into depth. More specifically, this op outputs a copy of the
-    input LoDtensor where values from the height and width dimensions are moved to the channel dimension.
-    The attr blocksize indicates the input block size.
-
-    space_to_depth will reorgnize the elements of input with shape[batch, channel, height, width] according
-    to blocksize to construct output with shape [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize]:
-
-    space_to_depth is used to This operation is useful for resizing the activations between convolutions
-    (but keeping all data)
-
-    - Non-overlapping blocks of size block_size x block size are rearranged into depth at each location.
-    - The depth of the output tensor is block_size * block_size * input channel
-    - The Y, X coordinates within each block of the input become the high order component of the output channel index
-    - channel should be divisible by square of blocksize
-    - height, width should be divsible by blocksize
-
-
-    Args:
-        x(variable): The input LoDtensor.
-        blocksize(variable): The blocksize to select the element on each feature map should be > 2
-
-    Returns:
-        Variable: The output LoDtensor.
-
-    Raises:
-        TypeError: blocksize type must be a long.
-
-    Examples:
-        .. code-block:: python
-	
-            import paddle.fluid as fluid
-            import numpy as np
-
-            data = fluid.layers.data(
-                name='data', shape=[1, 4, 2, 2], dtype='float32', append_batch_size=False)
-            space_to_depthed = fluid.layers.space_to_depth(
-                x=data, blocksize=2)
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            data_np = np.arange(0,16).reshape((1,4,2,2)).astype('float32')
-            out_main = exe.run(fluid.default_main_program(),
-                          feed={'data': data_np},
-                          fetch_list=[space_to_depthed])
-
-    """
-
-    helper = LayerHelper("space_to_depth", **locals())
-
-    if not (isinstance(blocksize, int)):
-        raise ValueError("blocksize must be a python Int")
-
-    if name is None:
-        out = helper.create_variable_for_type_inference(
-            dtype=x.dtype)  #fix create
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type="space_to_depth",
-        inputs={"X": x},
-        attrs={"blocksize": blocksize},
-        outputs={"Out": out})
-    return out
-
-
-@templatedoc()
-def sequence_reverse(x, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${y_type}): ${y_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[2, 6], dtype='float32')
-            x_reversed = fluid.layers.sequence_reverse(x)
-    """
-    assert not in_dygraph_mode(), (
-        "sequence layer is not supported in dygraph mode yet.")
-    helper = LayerHelper("sequence_reverse", **locals())
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type="sequence_reverse",
-        inputs={"X": x},
-        outputs={"Y": out},
-        attrs=dict())
-    return out
-
-
-def affine_channel(x,
-                   scale=None,
-                   bias=None,
-                   data_layout='NCHW',
-                   name=None,
-                   act=None):
-    """
-    Applies a separate affine transformation to each channel of the input.
-    Useful for replacing spatial batch norm with its equivalent fixed
-    transformation. The input also can be 2D tensor and applies a affine
-    transformation in second dimension.
-
-    Args:
-        x (Variable): Feature map input can be a 4D tensor with order NCHW
-            or NHWC. It also can be a 2D tensor and the affine transformation
-            is applied in the second dimension.
-        scale (Variable): 1D input of shape (C), the c-th element is the scale
-            factor of the affine transformation for the c-th channel of
-            the input.
-        bias (Variable): 1D input of shape (C), the c-th element is the bias
-            of the affine transformation for the c-th channel of the input.
-        data_layout (string, default NCHW): NCHW or NHWC. If input is 2D
-            tensor, you can ignore data_layout.
-        name (str, default None): The name of this layer.
-        act (str, default None): Activation to be applied to the output of this layer.
-
-    Returns:
-        out (Variable): A tensor of the same shape and data layout with x.
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name='data', shape=[3, 32, 32],
-                                     dtype='float32')
-            input_scale = fluid.layers.create_parameter(shape=[3],
-                                     dtype="float32")
-            input_bias = fluid.layers.create_parameter(shape=[3],
-                                     dtype="float32")
-            out = fluid.layers.affine_channel(data,scale=input_scale,
-                                     bias=input_bias)
-
-    """
-    helper = LayerHelper("affine_channel", **locals())
-
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-
-    helper.append_op(
-        type="affine_channel",
-        inputs={"X": x,
-                'Scale': scale,
-                'Bias': bias},
-        attrs={"data_layout": data_layout},
-        outputs={"Out": out})
-    return helper.append_activation(out)
-
-
-def similarity_focus(input, axis, indexes, name=None):
-    """
-    SimilarityFocus Operator
-
-    Generate a similarity focus mask with the same shape of input using the following method:
-
-    1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
-       to the axis according to the indexes. For example, if axis=1 and indexes=[a],
-       it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
-       is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
-    2. For each index, find the largest numbers in the tensor T, so that the same
-       row and same column has at most one number(what it means is that if the
-       largest number has been found in the i-th row and the j-th column, then
-       the numbers in the i-th row or j-th column will be skipped. And then the
-       next largest number will be selected from the remaining numbers. Obviously
-       there will be min(B, C) numbers), and mark the corresponding position of the
-       3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for
-       each index.
-    3. Broadcast the 3-D similarity focus mask to the same shape of input X.
-
-    Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
-
-    .. code-block:: text
-
-        * Example :
-
-            Given a 4-D tensor x with the shape (BatchSize, C, A, B), where C is
-            the number of channels and the shape of feature map is (A, B):
-                x.shape = (2, 3, 2, 2)
-                x.data = [[[[0.8, 0.1],
-                            [0.4, 0.5]],
-
-                           [[0.9, 0.7],
-                            [0.9, 0.9]],
-
-                           [[0.8, 0.9],
-                            [0.1, 0.2]]],
-
-
-                          [[[0.2, 0.5],
-                            [0.3, 0.4]],
-
-                           [[0.9, 0.7],
-                            [0.8, 0.4]],
-
-                           [[0.0, 0.2],
-                            [0.4, 0.7]]]]
-
-            Given axis: 1 (the axis of the channel)
-            Given indexes: [0]
-
-            then we get a 4-D tensor out with the same shape of input x:
-                out.shape = (2, 3, 2, 2)
-                out.data = [[[[1.0, 0.0],
-                              [0.0, 1.0]],
-
-                             [[1.0, 0.0],
-                              [0.0, 1.0]],
-
-                             [[1.0, 0.0],
-                              [0.0, 1.0]]],
-
-                            [[[0.0, 1.0],
-                              [1.0, 0.0]],
-
-                             [[0.0, 1.0],
-                              [1.0, 0.0]],
-
-                             [[0.0, 1.0],
-                              [1.0, 0.0]]]]
-
-    Args:
-        input(Variable): The input tensor variable(default float). It should
-            be a 4-D tensor with shape [BatchSize, A, B, C].
-        axis(int): Indicating the dimension to be selected. It can only be
-            1, 2 or 3.
-        indexes(list): Indicating the indexes of the selected dimension.
-
-    Returns:
-        Variable: A tensor variable with the same shape and same type \
-                  as the input.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(
-                name='data', shape=[-1, 3, 2, 2], dtype='float32')
-            fluid.layers.similarity_focus(input=data, axis=1, indexes=[0])
-    """
-    helper = LayerHelper('similarity_focus', **locals())
-    # check attrs
-    if isinstance(axis, int) is False:
-        raise TypeError("axis must be int type.")
-    if isinstance(indexes, list) is False:
-        raise TypeError("indexes must be list type.")
-    if axis != 1 and axis != 2 and axis != 3:
-        raise ValueError("axis must be 1, 2 or 3.")
-    if len(indexes) == 0:
-        raise ValueError("indexes can not be empty.")
-
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=input.dtype, persistable=False)
-    helper.append_op(
-        type='similarity_focus',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={"axis": axis,
-               "indexes": indexes})
-    return out
-
-
-def hash(input, hash_size, num_hash=1, name=None):
-    """
-    Hash the input to an integer whose value is less than the given hash size.
-
-    The hash algorithm we used was xxHash - Extremely fast hash algorithm
-    (https://github.com/Cyan4973/xxHash/tree/v0.6.5)
-
-    A simple example as below:
-
-    .. code-block:: text
-
-        Given:
-
-        # shape [2, 2]
-        input.data = 
-            [[1, 2],
-             [3, 4]]
-
-        hash_size = 10000
-
-        num_hash = 4
-
-        Then:
-
-        Hash op will take all number in input's 2nd dimension as hash algorithm's
-        input for each time. Each input will be hashed for 4 times, and get an
-        array whose length is 4. Each value in the array ranges from 0 to 9999.
-
-        # shape [2, 4]
-        output.data = [
-            [[9662, 9217, 1129, 8487],
-             [8310, 1327, 1654, 4567]],
-        ]
-
-    Args:
-        input (Variable): The input variable which is a one-hot word. The
-            dimensions of the input variable must be 2. Both Tensor and LoDTensor are supported.
-        hash_size (int): The space size for hash algorithm. The output value
-            will keep in the range:math:`[0, hash_size - 1]`.
-        num_hash (int): The times of hash, default 1.
-        name (str, default None): The name of this layer.
-
-    Returns:
-       Variable: The hash result variable, which the same variable type as `input`.
-
-    Examples:
-       .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            # titles has shape [batch, 1]
-            titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=0)
-            # hash_r has shape [batch, 2]
-            hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=2, hash_size=1000)
-
-
-            # titles has shape [batch, 1] and lod information
-            titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=1)
-            # hash_r has shape [batch, 2] and inherits lod information from titles
-            hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=2, hash_size=1000)
-    """
-    helper = LayerHelper('hash', **locals())
-    out = helper.create_variable_for_type_inference(
-        helper.input_dtype(), stop_gradient=True)
-    helper.append_op(
-        type='hash',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={'num_hash': num_hash,
-               'mod_by': hash_size})
-    return out
-
-
-@templatedoc()
-def grid_sampler(x, grid, name=None):
-    """
-    This operation samples input X by using bilinear interpolation based on
-    flow field grid, which is usually gennerated by :code:`affine_grid` . The grid of
-    shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
-    with shape [N, H, W] each, where grid_x is indexing the 4th dimension
-    (in width dimension) of input data x and grid_y is indexng the 3rd
-    dimention (in height dimension), finally results is the bilinear
-    interpolation value of 4 nearest corner points.
-
-    .. code-block:: text
-
-        Step 1:
-        Get (x, y) grid coordinates and scale to [0, H-1/W-1].
-
-        grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
-        grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
-
-        Step 2:
-        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
-        interpolate point value by 4 nearest points.
-
-          wn ------- y_n ------- en
-          |           |           |
-          |          d_n          |
-          |           |           |
-         x_w --d_w-- grid--d_e-- x_e
-          |           |           |
-          |          d_s          |
-          |           |           |
-          ws ------- y_s ------- wn
-
-        x_w = floor(x)              // west side x coord
-        x_e = x_w + 1               // east side x coord
-        y_n = floor(y)              // north side y coord
-        y_s = y_s + 1               // south side y coord
-
-        d_w = grid_x - x_w          // distance to west side
-        d_e = x_e - grid_x          // distance to east side
-        d_n = grid_y - y_n          // distance to north side
-        d_s = y_s - grid_y          // distance to south side
-
-        wn = X[:, :, y_n, x_w]      // north-west point value
-        en = X[:, :, y_n, x_e]      // north-east point value
-        ws = X[:, :, y_s, x_w]      // south-east point value
-        es = X[:, :, y_s, x_w]      // north-east point value
-
-        output = wn * d_e * d_s + en * d_w * d_s
-               + ws * d_e * d_n + es * d_w * d_n
-
-    Args:
-        x(Variable): Input data of shape [N, C, H, W].
-        grid(Variable): Input grid tensor of shape [N, H, W, 2].
-        name (str, default None): The name of this layer.
-
-    Returns:
-        Variable: Output of shape [N, C, H, W] data samples input X
-        using bilnear interpolation based on input grid.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            x = fluid.layers.data(name='x', shape=[10, 32, 32], dtype='float32')
-            theta = fluid.layers.data(name='theta', shape=[2, 3], dtype='float32')
-            grid = fluid.layers.affine_grid(theta=theta, out_shape=[3, 10, 32, 32])
-            out = fluid.layers.grid_sampler(x=x, grid=grid)
-
-    """
-    helper = LayerHelper("grid_sampler", **locals())
-
-    if not isinstance(x, Variable):
-        return ValueError("The x should be a Variable")
-
-    if not isinstance(grid, Variable):
-        return ValueError("The grid should be a Variable")
-
-    out = helper.create_variable_for_type_inference(x.dtype)
-    ipts = {'X': x, 'Grid': grid}
-
-    helper.append_op(type='grid_sampler', inputs=ipts, outputs={'Output': out})
-    return out
-
-
-def log_loss(input, label, epsilon=1e-4, name=None):
-    """
-    **Negative Log Loss Layer**
-
-    This layer accepts input predictions and target label and returns the
-    negative log loss.
-
-    .. math::
-
-        Out = -label * \\log{(input + \\epsilon)}
-              - (1 - label) * \\log{(1 - input + \\epsilon)}
-
-    Args:
-        input (Variable|list):  a 2-D tensor with shape [N x 1], where N is the
-                                batch size. This input is a probability computed
-                                by the previous operator.
-        label (Variable|list):  the ground truth which is a 2-D tensor with
-                                shape [N x 1], where N is the batch size.
-        epsilon (float): epsilon
-        name (string): the name of log_loss
-
-    Returns:
-        Variable: A 2-D tensor with shape [N x 1], the negative log loss.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-          prob = fluid.layers.data(name='prob', shape=[10], dtype='float32')
-          cost = fluid.layers.log_loss(input=prob, label=label)
-    """
-    helper = LayerHelper('log_loss', **locals())
-
-    if name is None:
-        loss = helper.create_variable_for_type_inference(dtype=input.dtype)
-    else:
-        loss = helper.create_variable(
-            name=name, dtype=input.dtype, persistable=False)
-
-    helper.append_op(
-        type='log_loss',
-        inputs={'Predicted': [input],
-                'Labels': [label]},
-        outputs={'Loss': [loss]},
-        attrs={'epsilon': epsilon})
-    return loss
-
-
-def teacher_student_sigmoid_loss(input,
-                                 label,
-                                 soft_max_up_bound=15.0,
-                                 soft_max_lower_bound=-15.0):
-    """
-    **Teacher Student Log Loss Layer**
-
-    This layer accepts input predictions and target label and returns the
-    teacher_student loss.
-
-    .. math::
-        loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
-
-    Args:
-        input (Variable|list):  a 2-D tensor with shape [N x 1], where N is the
-                                batch size. This input is a probability computed
-                                by the previous operator.
-        label (Variable|list):  the ground truth which is a 2-D tensor with
-                                shape [N x 1], where N is the batch size.
-        soft_max_up_bound  (float):  if input > soft_max_up_bound, will be bound
-        soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound
-
-    Returns:
-        Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss.
-
-    Examples:
-        .. code-block:: python
-          
-          import paddle.fluid as fluid
-
-          batch_size = 64
-          label = fluid.layers.data(
-                    name="label", shape=[batch_size, 1], dtype="int64", append_batch_size=False)
-          similarity = fluid.layers.data(
-                    name="similarity", shape=[batch_size, 1], dtype="float32", append_batch_size=False)
-          cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label)
-
-    """
-    helper = LayerHelper('teacher_student_sigmoid_loss', **locals())
-    out = helper.create_variable(dtype=input.dtype)
-    helper.append_op(
-        type='teacher_student_sigmoid_loss',
-        inputs={'X': [input],
-                'Label': [label]},
-        outputs={'Y': [out]},
-        attrs={"soft_max_lower_bound": float(soft_max_lower_bound), \
-                "soft_max_up_bound": float(soft_max_up_bound)})
-    return out
-
-
-def add_position_encoding(input, alpha, beta, name=None):
-    """
-    **Add Position Encoding Layer**
-
-    This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an
-    output Tensor of shape [N x M x P] with positional encoding value.
-
-    Refer to `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
-
-    .. math::
-        PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})}   \\\\
-        PE(pos, 2i + 1) &= \\cos{(pos / 10000^{2i / P})}  \\\\
-        Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
-
-    Where:
-      - :math:`PE(pos, 2i)` : the increment for the number at even position
-      - :math:`PE(pos, 2i + 1)` : the increment for the number at odd position
-
-    Args:
-        input (Variable): 3-D input tensor with shape [N x M x P]
-        alpha (float): multiple of Input Tensor
-        beta (float): multiple of Positional Encoding Tensor
-        name (string): the name of position encoding layer
-
-    Returns:
-        Variable: A 3-D Tensor of shape [N x M x P] with positional encoding.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          tensor = fluid.layers.data(
-              name='tensor',
-              shape=[32, 64, 512],
-              dtype='float32',
-              append_batch_size=False)
-          position_tensor = fluid.layers.add_position_encoding(
-              input=tensor, alpha=1.0, beta=1.0)
-
-    """
-    helper = LayerHelper('add_position_encoding', **locals())
-    dtype = helper.input_dtype()
-
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=dtype)
-    else:
-        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
-
-    helper.append_op(
-        type="add_position_encoding",
-        inputs={"X": input},
-        outputs={"Out": out},
-        attrs={"alpha": alpha,
-               "beta": beta})
-    return out
-
-
-def bilinear_tensor_product(x,
-                            y,
-                            size,
-                            act=None,
-                            name=None,
-                            param_attr=None,
-                            bias_attr=None):
-    """
-    **Add Bilinear Tensor Product Layer**
-
-    This layer performs bilinear tensor product on two inputs.
-    For example:
-
-    .. math::
-       out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
-
-    In this formula:
-      - :math:`x`: the first input contains M elements, shape is [batch_size, M].
-      - :math:`y`: the second input contains N elements, shape is [batch_size, N].
-      - :math:`W_{i}`: the i-th learned weight, shape is [M, N]
-      - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
-      - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
-
-    Args:
-        x (Variable): 2-D input tensor with shape [batch_size, M]
-        y (Variable): 2-D input tensor with shape [batch_size, N]
-        size (int): The dimension of this layer.
-        act (str, default None): Activation to be applied to the output of this layer.
-        name (str, default None): The name of this layer.
-        param_attr (ParamAttr, default None): The parameter attribute for the learnable w.
-            parameters/weights of this layer.
-        bias_attr (ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-
-    Returns:
-        Variable: A 2-D Tensor of shape [batch_size, size].
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          layer1 = fluid.layers.data("t1", shape=[-1, 5], dtype="float32")
-          layer2 = fluid.layers.data("t2", shape=[-1, 4], dtype="float32")
-          tensor = fluid.layers.bilinear_tensor_product(x=layer1, y=layer2, size=1000)
-    """
-    helper = LayerHelper('bilinear_tensor_product', **locals())
-    dtype = helper.input_dtype('x')
-
-    param_shape = [size, x.shape[1], y.shape[1]]
-
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False)
-
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=dtype)
-    else:
-        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
-
-    inputs = {"X": x, "Y": y, "Weight": w}
-    if helper.bias_attr:
-        bias_size = [1, size]
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-        inputs["Bias"] = bias
-    helper.append_op(
-        type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out})
-
-    # add activation
-    return helper.append_activation(out)
-
-
-@templatedoc()
-def get_tensor_from_selected_rows(x, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-	    
-            import paddle.fluid as fluid
-            b = fluid.default_main_program().global_block()
-            input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
-            out = fluid.layers.get_tensor_from_selected_rows(input)
-    """
-
-    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='get_tensor_from_selected_rows',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={})
-    return out
-
-
-def shuffle_channel(x, group, name=None):
-    """
-    **Shuffle Channel Operator**
-
-    This operator shuffles the channels of input x.
-    It divide the input channels in each group into :attr:`group` subgroups,
-    and obtain a new order by selecting element from every subgroup one by one.
-
-    Please refer to the paper
-    https://arxiv.org/pdf/1707.01083.pdf
-    
-    .. code-block:: text
-
-        Given a 4-D tensor input with the shape (N, C, H, W):
-            input.shape = (1, 4, 2, 2)
-            input.data =[[[[0.1, 0.2],
-                           [0.2, 0.3]],
-
-                          [[0.3, 0.4],
-                           [0.4, 0.5]],
-
-                          [[0.5, 0.6],
-                           [0.6, 0.7]],
-
-                          [[0.7, 0.8],
-                           [0.8, 0.9]]]]
-            Given group: 2
-            then we get a 4-D tensor out whth the same shape of input:
-            out.shape = (1, 4, 2, 2)
-            out.data = [[[[0.1, 0.2],
-                          [0.2, 0.3]],
-                          
-                         [[0.5, 0.6],
-                          [0.6, 0.7]],
-                          
-                         [[0.3, 0.4],
-                          [0.4, 0.5]],
-                          
-                         [[0.7, 0.8],
-                          [0.8, 0.9]]]]
-                        
-    Args: 
-        x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W]
-        group(int): Indicating the conuts of subgroups, It should divide the number of channels.
-
-    Returns:
-        out(Variable): the channels shuffling result is a tensor variable with the 
-        same shape and same type as the input.
-
-    Raises:
-        ValueError: If group is not an int type variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32')
-            out = fluid.layers.shuffle_channel(x=input, group=2)
-    """
-    helper = LayerHelper("shuffle_channel", **locals())
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    if not isinstance(group, int):
-        raise TypeError("group must be int type")
-
-    helper.append_op(
-        type="shuffle_channel",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={"group": group})
-    return out
-
-
-@templatedoc()
-def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
-    """
-    **Temporal Shift Operator**
-    
-    ${comment}
-                        
-    Args: 
-        x(Variable): ${x_comment}
-        seg_num(int): ${seg_num_comment}
-        shift_ratio(float): ${shift_ratio_comment}
-        name (str, default None): The name of this layer.
-
-    Returns:
-        out(Variable): The temporal shifting result is a tensor variable with the 
-        same shape and same type as the input.
-
-    Raises:
-        TypeError: seg_num must be int type.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32')
-            out = fluid.layers.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
-    """
-    helper = LayerHelper("temporal_shift", **locals())
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    if not isinstance(seg_num, int):
-        raise TypeError("seg_num must be int type.")
-
-    helper.append_op(
-        type="temporal_shift",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={"seg_num": seg_num,
-               "shift_ratio": shift_ratio})
-    return out
-
-
-class PyFuncRegistry(object):
-    _register_funcs = []
-
-    def __init__(self, func):
-        if func is None or not callable(func):
-            raise TypeError('func must be a Python function')
-
-        self._func = func
-        # find named args using reflection
-        args = inspect.getargspec(self._func)
-        if len(args[0]) == 0 and args[1] is None and args[2] is None:
-            # Function with no inputs
-            self._named_args = None
-        else:
-            self._named_args = args[0]
-        self._id = core._append_python_callable_object_and_return_id(self)
-        '''
-        Why record self here?
-
-        1. For debug usage. Users can call
-           :code:`py_func.registered_func(idx)` method
-           to find the registered function corresponding
-           to :code:`idx`.
-
-        2. For increasing reference count of self.
-           It seems that to release Python object
-           whose reference count is 1 would cause
-           segmentation fault error in C++ side.
-           May be lack of Python GC in C++ side?
-        '''
-        PyFuncRegistry._register_funcs.append(self)
-
-    @classmethod
-    def registered_func(cls, idx):
-        return cls._register_funcs[idx]._func
-
-    @classmethod
-    def registered_func_num(cls):
-        return len(cls._register_funcs)
-
-    @property
-    def id(self):
-        return self._id
-
-    def __call__(self, *args):
-        if self._named_args is None:
-            func_ret = self._func()
-        else:
-            kwargs = dict()
-            idx = 0
-            for arg in self._named_args:
-                kwargs[arg] = args[idx]
-                idx += 1
-            func_ret = self._func(*args[idx:], **kwargs)
-
-        if not isinstance(func_ret, (list, tuple)):
-            func_ret = (func_ret, )
-
-        ret = []
-        for each_ret in func_ret:
-            if each_ret is None or isinstance(each_ret, core.LoDTensor):
-                ret.append(each_ret)
-                continue
-
-            if not isinstance(each_ret, np.ndarray):
-                each_ret = np.array(each_ret)
-
-            tensor = core.LoDTensor()
-            tensor.set(each_ret, core.CPUPlace())
-            ret.append(tensor)
-
-        return tuple(ret)
-
-
-@templatedoc()
-def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
-    """
-    PyFunc Operator.
-
-    User can use :code:`py_func` to register operators in Python side.
-    The inputs of :code:`func` is :code:`LoDTensor` and outputs can be
-    numpy array or :code:`LoDTensor`. Paddle would call the registered
-    :code:`func` in forward part, and call :code:`backward_func` in
-    backward part (if :code:`backward_func` is not None).
-
-    User should set the right data type and shape of :code:`out` before
-    calling this function. However, data types and shapes of gradients of
-    :code:`out` and :code:`x` would be inferred automatically.
-
-    Input orders of :code:`backward_func` would be: forward inputs
-    :code:`x`, forward outputs :code:`out` and backward input gradients of
-    :code:`out`. If some variables of :code:`out` have no gradient, the input
-    tensor would be None in Python side. If some variables of :code:`in` have
-    no gradient, users should return None.
-
-    This function can also be used to debug the running network. User can
-    add a :code:`py_func` operator without output, and print input
-    :code:`x` inside :code:`func`.
-
-    Args:
-        func (callable): forward Python function.
-        x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`.
-        out (Variable|list(Variable)|tuple(Variable)): outputs of :code:`func`.
-            Paddle cannot infer shapes and data types of :code:`out`. Users
-            should create :code:`out` beforehand.
-        backward_func (callable|None): backward Python function.
-                                       None means no backward. Default None.
-        skip_vars_in_backward_input (Variable|list(Variable)|tuple(Variable)):
-            Variables that are not needed in :code:`backward_func` inputs.
-            These variables must be any of :code:`x` and :code:`out`.
-            If set, these vars would not be inputs of :code:`backward_func`,
-            Only useful when :code:`backward_func` is not None. Default None.
-
-    Returns:
-        out (Variable|list(Variable)|tuple(Variable)): input :code:`out`
-
-    Examples:
-
-        >>> import paddle.fluid as fluid
-        >>> import six
-        >>>
-        >>> def create_tmp_var(name, dtype, shape):
-        >>>     return fluid.default_main_program().current_block().create_var(
-        >>>         name=name, dtype=dtype, shape=shape)
-        >>>
-        >>> # tanh activation has been provided by Paddle C++ op
-        >>> # Here, we only use tanh to be an example to show the usage
-        >>> # of py_func
-        >>> def tanh(x):
-        >>>     return np.tanh(x)
-        >>>
-        >>> # forward input x is skipped
-        >>> def tanh_grad(y, dy):
-        >>>     return np.array(dy) * (1 - np.square(np.array(y)))
-        >>>
-        >>> def debug_func(x):
-        >>>     print(x)
-        >>>
-        >>> def simple_net(img, label):
-        >>>     hidden = img
-        >>>     for idx in six.moves.range(4):
-        >>>         hidden = fluid.layers.fc(hidden, size=200)
-        >>>         new_hidden = create_tmp_var(name='hidden_{}'.format(idx),
-        >>>             dtype=hidden.dtype, shape=hidden.shape)
-        >>>
-        >>>         # user-defined layers with forward and backward
-        >>>         hidden = fluid.layers.py_func(func=tanh, x=hidden,
-        >>>             out=new_hidden, backward_func=tanh_grad,
-        >>>             skip_vars_in_backward_input=hidden)
-        >>>
-        >>>         # user-defined debug layers to print variables
-        >>>         fluid.layers.py_func(func=debug_func, x=hidden, out=None)
-        >>>
-        >>>     prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-        >>>     loss = fluid.layers.cross_entropy(input=prediction, label=label)
-        >>>     return fluid.layers.mean(loss)
-    """
-    helper = LayerHelper('py_func', **locals())
-    if x is None:
-        x = []
-    elif isinstance(x, Variable):
-        x = [x]
-    elif not isinstance(x, (list, tuple)):
-        raise TypeError('Input must be Variable/list(Variable)/tuple(Variable)')
-
-    if out is None:
-        out_list = []
-    elif isinstance(out, Variable):
-        out_list = [out]
-    elif isinstance(out, (list, tuple)):
-        out_list = out
-    else:
-        raise TypeError(
-            'Output must be Variable/list(Variable)/tuple(Variable)')
-
-    fwd_func_id = PyFuncRegistry(func).id
-    bwd_func_id = PyFuncRegistry(
-        backward_func).id if backward_func is not None else -1
-
-    for each_out in out_list:
-        if len(each_out.shape) == 0:
-            raise ValueError(
-                'Output shapes of py_func op should be provided by users manually'
-            )
-
-    backward_skip_vars = set()
-    if backward_func is not None and skip_vars_in_backward_input is not None:
-        if isinstance(skip_vars_in_backward_input, Variable):
-            skip_vars_in_backward_input = [skip_vars_in_backward_input]
-
-        fwd_in_out = [v.name for v in x]
-        fwd_in_out.extend([v.name for v in out_list])
-        fwd_in_out = set(fwd_in_out)
-        backward_skip_vars = set()
-        for v in skip_vars_in_backward_input:
-            if not v.name in fwd_in_out:
-                raise ValueError(
-                    'Variable {} is not found in forward inputs and outputs'
-                    .format(v.name))
-            backward_skip_vars.add(v.name)
-
-    helper.append_op(
-        type='py_func',
-        inputs={'X': x},
-        outputs={'Out': out_list},
-        attrs={
-            'forward_callable_id': fwd_func_id,
-            'backward_callable_id': bwd_func_id,
-            'backward_skip_vars': list(backward_skip_vars)
-        })
-    return out
-
-
-# For debug usage
-py_func.registered_func = PyFuncRegistry.registered_func
-py_func.registered_func_num = PyFuncRegistry.registered_func_num
-
-
-@templatedoc()
-def psroi_pool(input,
-               rois,
-               output_channels,
-               spatial_scale,
-               pooled_height,
-               pooled_width,
-               name=None):
-    """
-    ${comment}
-
-    Args:
-        input (Variable): ${x_comment}
-        rois (Variable): ROIs (Regions of Interest) to pool over.It should be
-                         a 2-D LoDTensor of shape (num_rois, 4), the lod level
-                         is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
-                         the top left coordinates, and (x2, y2) is the bottom
-                         right coordinates.
-        output_channels (integer): ${output_channels_comment}
-        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
-        pooled_height (integer): ${pooled_height_comment} Default: 1
-        pooled_width (integer): ${pooled_width_comment} Default: 1
-        name (str, default None): The name of this layer.
-
-    Returns:
-        Variable: ${out_comment}.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[490, 28, 28], dtype='float32')
-            rois = fluid.layers.data(name='rois', shape=[4], lod_level=1, dtype='float32')
-            pool_out = fluid.layers.psroi_pool(x, rois, 10, 1.0, 7, 7)
-    """
-    helper = LayerHelper('psroi_pool', **locals())
-    # check attrs
-    if not isinstance(output_channels, int):
-        raise TypeError("output_channels must be int type")
-    if not isinstance(spatial_scale, float):
-        raise TypeError("spatial_scale must be float type")
-    if not isinstance(pooled_height, int):
-        raise TypeError("pooled_height must be int type")
-    if not isinstance(pooled_width, int):
-        raise TypeError("pooled_width must be int type")
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='psroi_pool',
-        inputs={'X': input,
-                'ROIs': rois},
-        outputs={'Out': out},
-        attrs={
-            'output_channels': output_channels,
-            'spatial_scale': spatial_scale,
-            'pooled_height': pooled_height,
-            'pooled_width': pooled_width
-        })
-    return out
-
-
-@templatedoc()
-def prroi_pool(input,
-               rois,
-               output_channels,
-               spatial_scale=1.0,
-               pooled_height=1,
-               pooled_width=1,
-               name=None):
-    """
-    The precise roi pooling implementation for paddle?https://arxiv.org/pdf/1807.11590.pdf
-
-    Args:
-        input (Variable):The input of Deformable PSROIPooling.The shape of input tensor is
-                        [N,C,H,W]. Where N is batch size,C is number of input channels,H
-                        is height of the feature, and W is the width of the feature.
-        rois (Variable): ROIs (Regions of Interest) to pool over.It should be
-                        a 2-D LoDTensor of shape (num_rois, 4), the lod level
-                        is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
-                        the top left coordinates, and (x2, y2) is the bottom
-                        right coordinates.
-        output_channels (integer): The output's channel.
-        spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width).
-                             Equals the reciprocal of total stride in convolutional layers, Default: 1.0.
-        pooled_height (integer): The pooled output height. Default: 1.
-        pooled_width (integer): The pooled output width. Default: 1.
-        name (str, default None): The name of this operation.
-
-    Returns:
-        Variable(Tensor): The shape of the returned Tensor is (num_rois, output_channels, pooled_h, pooled_w), with value type float32,float16..
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[490, 28, 28], dtype='float32')
-            rois = fluid.layers.data(name='rois', shape=[4], lod_level=1, dtype='float32')
-            pool_out = fluid.layers.prroi_pool(x, rois, 10, 1.0, 7, 7)
-    """
-    helper = LayerHelper('prroi_pool', **locals())
-    # check attrs
-    if not isinstance(output_channels, int):
-        raise TypeError("output_channels must be int type")
-    if not isinstance(spatial_scale, float):
-        raise TypeError("spatial_scale must be float type")
-    if not isinstance(pooled_height, int):
-        raise TypeError("pooled_height must be int type")
-    if not isinstance(pooled_width, int):
-        raise TypeError("pooled_width must be int type")
-    dtype = helper.input_dtype()
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='prroi_pool',
-        inputs={'X': input,
-                'ROIs': rois},
-        outputs={'Out': out},
-        attrs={
-            'output_channels': output_channels,
-            'spatial_scale': spatial_scale,
-            'pooled_height': pooled_height,
-            'pooled_width': pooled_width
-        })
-    return out
-
-
-def huber_loss(input, label, delta):
-    """
-    Huber loss is a loss function used in robust.
-    Huber loss can evaluate the fitness of input to label.
-    Different from MSE loss, Huber loss is more robust for outliers.
-
-    When the difference between input and label is large than delta
-    .. math::
-
-        huber\_loss = delta * (label - input) - 0.5 * delta * delta
-
-    When the difference between input and label is less than delta
-    .. math::
-
-        huber\_loss = 0.5 * (label - input) * (label - input)
-
-
-    Args:
-        input (Variable): This input is a probability computed by the previous operator.
-                          The first dimension is batch size, and the last dimension is 1.
-        label (Variable): The groud truth whose first dimension is batch size
-                          and last dimension is 1.
-        delta (float): The parameter of huber loss, which controls
-                       the range of outliers
-
-    Returns:
-        huber\_loss (Variable): The huber loss with shape [batch_size, 1].
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-            predict = fluid.layers.fc(input=x, size=1)
-            label = fluid.layers.data(
-                name='label', shape=[1], dtype='float32')
-            loss = fluid.layers.huber_loss(
-                input=predict, label=label, delta=1.0)
-
-    """
-    helper = LayerHelper('huber_loss', **locals())
-    residual = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='huber_loss',
-        inputs={'X': input,
-                'Y': label},
-        outputs={'Out': out,
-                 'Residual': residual},
-        attrs={'delta': delta})
-    return out
-
-
-@templatedoc()
-def kldiv_loss(x, target, reduction='mean', name=None):
-    """
-    ${comment}
-
-    Args:
-        x (Variable): ${x_comment}
-        target (Variable): ${target_comment}
-        reduction (Variable): ${reduction_comment}
-        name (str, default None): The name of this layer.
-
-    Returns:
-        kldiv\_loss (Variable): The KL divergence loss.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[4,2,2], dtype='float32')
-            target = fluid.layers.data(name='target', shape=[4,2,2], dtype='float32')
-            loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='batchmean')
-    """
-    helper = LayerHelper('kldiv_loss', **locals())
-    loss = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='kldiv_loss',
-        inputs={'X': x,
-                'Target': target},
-        outputs={'Loss': loss},
-        attrs={'reduction': reduction})
-    return loss
-
-
-from .ops import square
-from .control_flow import equal
-
-
-def npair_loss(anchor, positive, labels, l2_reg=0.002):
-    '''
-  **Npair Loss Layer**
-
-  Read `Improved Deep Metric Learning with Multi class N pair Loss Objective <http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf>`_ .
-
-  Npair loss requires paired data. Npair loss has two parts: the first part is L2
-  regularizer on the embedding vector; the second part is cross entropy loss which
-  takes the similarity matrix of anchor and positive as logits.
-
-  Args:
-    anchor(Variable): embedding vector for the anchor image. shape=[batch_size, embedding_dims]
-    positive(Variable): embedding vector for the positive image. shape=[batch_size, embedding_dims]
-    labels(Variable): 1-D tensor. shape=[batch_size]
-    l2_reg(float32): L2 regularization term on embedding vector, default: 0.002
-
-  Returns:
-    npair loss(Variable): return npair loss, shape=[1]
-
-  Examples:
-    .. code-block:: python
-
-       import paddle.fluid as fluid
-       anchor = fluid.layers.data(
-                     name = 'anchor', shape = [18, 6], dtype = 'float32', append_batch_size=False)
-       positive = fluid.layers.data(
-                     name = 'positive', shape = [18, 6], dtype = 'float32', append_batch_size=False)
-       labels = fluid.layers.data(
-                     name = 'labels', shape = [18], dtype = 'float32', append_batch_size=False)
-
-       npair_loss = fluid.layers.npair_loss(anchor, positive, labels, l2_reg = 0.002)
-  '''
-    Beta = 0.25
-    batch_size = labels.shape[0]
-
-    labels = reshape(labels, shape=[batch_size, 1], inplace=True)
-    labels = expand(labels, expand_times=[1, batch_size])
-
-    labels = equal(labels, transpose(labels, perm=[1, 0])).astype('float32')
-    labels = labels / reduce_sum(labels, dim=1, keep_dim=True)
-
-    l2loss = reduce_mean(reduce_sum(square(anchor), 1)) \
-             + reduce_mean(reduce_sum(square(positive), 1))
-    l2loss = l2loss * Beta * l2_reg
-
-    similarity_matrix = matmul(
-        anchor, positive, transpose_x=False, transpose_y=True)
-    softmax_ce = softmax_with_cross_entropy(
-        logits=similarity_matrix, label=labels, soft_label=True)
-    cross_entropy = reduce_sum(labels * softmax_ce, 0)
-    celoss = reduce_mean(cross_entropy)
-
-    return l2loss + celoss
-
-
-def pixel_shuffle(x, upscale_factor):
-    """
-
-    **Pixel Shuffle Layer**
-
-    This layer rearranges elements in a tensor of shape [N, C, H, W]
-    to a tensor of shape [N, C/r**2, H*r, W*r].
-    This is useful for implementing efficient sub-pixel convolution
-    with a stride of 1/r.
-    Please refer to the paper: `Real-Time Single Image and Video Super-Resolution 
-    Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_ .
-    by Shi et. al (2016) for more details.
-
-        .. code-block:: text
-        
-            Given a 4-D tensor with the shape:
-                x.shape = [1, 9, 4, 4]
-            Given upscale_factor:
-                upscale_factor= 3
-            output shape is:
-                [1, 1, 12, 12]
-    
-    Args:
-
-        x(Variable): The input tensor variable.
-        upscale_factor(int): factor to increase spatial resolution
-
-    Returns:
-
-        Out(Variable): Reshaped tensor according to the new dimension.
-
-    Raises:
-
-        ValueError: If the square of upscale_factor cannot divide the channels of input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[9,4,4])
-            output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3)
-
-    """
-
-    helper = LayerHelper("pixel_shuffle", **locals())
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    if not isinstance(upscale_factor, int):
-        raise TypeError("upscale factor must be int type")
-
-    helper.append_op(
-        type="pixel_shuffle",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={"upscale_factor": upscale_factor})
-    return out
-
-
-def fsp_matrix(x, y):
-    """
-
-    **FSP matrix op**
-
-    This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
-    Given feature map x with shape [x_channel, h, w] and feature map y with shape
-    [y_channel, h, w], we can get the fsp matrix of x and y in two steps:
-
-    1. reshape x into matrix with shape [x_channel, h * w] and reshape and
-       transpose y into matrix with shape [h * w, y_channel].
-    2. multiply x and y to get fsp matrix with shape [x_channel, y_channel].
-
-    The output is a batch of fsp matrices.
-
-    Args:
-
-        x (Variable): A feature map with shape [batch_size, x_channel, height, width].
-        y (Variable): A feature map with shape [batch_size, y_channel, height, width].
-                      The y_channel can be different with the x_channel of Input(X)
-                      while the other dimensions must be the same with Input(X)'s.
-
-    Returns:
-
-        fsp matrix (Variable): The output of FSP op with shape [batch_size, x_channel, y_channel].
-        The x_channel is the channel of x and the y_channel is the channel of y.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name='data', shape=[3, 32, 32])
-            feature_map_0 = fluid.layers.conv2d(data, num_filters=2,
-                                                filter_size=3)
-            feature_map_1 = fluid.layers.conv2d(feature_map_0, num_filters=2,
-                                                filter_size=1)
-            loss = fluid.layers.fsp_matrix(feature_map_0, feature_map_1)
-
-    """
-    helper = LayerHelper('fsp_matrix', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype(
-        input_param_name='x'))
-    helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out})
-    return out
-
-
-def continuous_value_model(input, cvm, use_cvm=True):
-    """
-
-    **continuous_value_model layers**
-
-    continuous value model(cvm). Now, it only considers show and click value in CTR project.
-    We assume that input is an embedding vector with cvm_feature, whose shape is [N * D] (D is 2 + embedding dim).
-    If use_cvm is True, it will log(cvm_feature), and output shape is [N * D].
-    If use_cvm is False, it will remove cvm_feature from input, and output shape is [N * (D - 2)].
-    
-    This layer accepts a tensor named input which is ID after embedded(lod level is 1), cvm is a show_click info.
-
-    Args:
-
-        input (Variable): a 2-D LodTensor with shape [N x D], where N is the batch size, D is 2 + the embedding dim. lod level = 1.
-        cvm (Variable):   a 2-D Tensor with shape [N x 2], where N is the batch size, 2 is show and click.
-        use_cvm  (bool):  use cvm or not. if use cvm, the output dim is the same as input
-                          if don't use cvm, the output dim is input dim - 2(remove show and click)
-                          (cvm op is a customized op, which input is a sequence has embed_with_cvm default, so we need an op named cvm to decided whever use it or not.)
-
-    Returns:
-
-        Variable: A 2-D LodTensor with shape [N x D], if use cvm, D is equal to input dim, if don't use cvm, D is equal to input dim - 2. 
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          input = fluid.layers.data(name="input", shape=[-1, 1], lod_level=1, append_batch_size=False, dtype="int64")#, stop_gradient=False)
-          label = fluid.layers.data(name="label", shape=[-1, 1], append_batch_size=False, dtype="int64")
-          embed = fluid.layers.embedding(
-                            input=input,
-                            size=[100, 11],
-                            dtype='float32')
-          ones = fluid.layers.fill_constant_batch_size_like(input=label, shape=[-1, 1], dtype="int64", value=1)
-          show_clk = fluid.layers.cast(fluid.layers.concat([ones, label], axis=1), dtype='float32')
-          show_clk.stop_gradient = True
-          input_with_cvm = fluid.layers.continuous_value_model(embed, show_clk, True)
-
-    """
-    helper = LayerHelper('cvm', **locals())
-    out = helper.create_variable(dtype=input.dtype)
-    helper.append_op(
-        type='cvm',
-        inputs={'X': [input],
-                'CVM': [cvm]},
-        outputs={'Y': [out]},
-        attrs={"use_cvm": use_cvm})
-    return out
-
-
-def where(condition):
-    """
-    Return an int64 tensor with rank 2, specifying the coordinate of true element in `condition`.
-
-    Output's first dimension is the number of true element, second dimension is rank(number of dimension) of `condition`.
-    If there is zero true element, then an empty tensor will be generated.  
-
-    Args:
-        condition(Variable): A bool tensor with rank at least 1.
-
-    Returns:
-        Variable: The tensor variable storing a 2-D tensor. 
-
-    Examples:
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             import paddle.fluid.layers as layers
-             import numpy as np
-
-             # condition is a tensor [True, False, True]
-             condition = layers.assign(np.array([1, 0, 1], dtype='int32'))
-             condition = layers.cast(condition, 'bool')
-             out = layers.where(condition) # [[0], [2]]
-
-             # condition is a tensor [[True, False], [False, True]]
-             condition = layers.assign(np.array([[1, 0], [0, 1]], dtype='int32'))
-             condition = layers.cast(condition, 'bool')
-             out = layers.where(condition) # [[0, 0], [1, 1]]
-
-             # condition is a tensor [False, False, False]
-             condition = layers.assign(np.array([0, 0, 0], dtype='int32'))
-             condition = layers.cast(condition, 'bool')
-             out = layers.where(condition) # [[]]
-
-    """
-    helper = LayerHelper("where", **locals())
-
-    out = helper.create_variable_for_type_inference(
-        dtype=core.VarDesc.VarType.INT64)
-
-    helper.append_op(
-        type='where', inputs={'Condition': condition}, outputs={'Out': [out]})
-    return out
-
-
-def sign(x):
-    """
-    **sign**
-
-    This function returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero.
-
-    Args:
-        x(Variable|numpy.ndarray): The input tensor.
-
-    Returns:
-        Variable: The output sign tensor with identical shape and dtype to `x`.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy as np
-
-          # [1, 0, -1]
-          data = fluid.layers.sign(np.array([3, 0, -2], dtype='int32')) 
-
-    """
-
-    helper = LayerHelper("sign", **locals())
-
-    if not isinstance(x, Variable):
-        x = assign(x)
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    helper.append_op(type='sign', inputs={'X': [x]}, outputs={'Out': [out]})
-
-    return out
-
-
-def unique(x, dtype='int32'):
-    """
-    **unique** 
-
-    Return a unique tensor for `x` and an index tensor pointing to this unique tensor.
-
-    Args:
-        x(Variable): A 1-D input tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of index tensor: int32, int64.
-
-    Returns:
-        tuple: (out, index). `out` is the unique tensor for `x`, with identical dtype to `x`, and \
-            `index` is an index tensor pointing to `out`, by which user can recover the original `x` tensor.
-
-    Examples:
-        .. code-block:: python
-
-             import numpy as np
-             import paddle.fluid as fluid
-             x = fluid.assign(np.array([2, 3, 3, 1, 5, 3], dtype='int32'))
-             out, index = fluid.layers.unique(x) # out is [2, 3, 1, 5]; index is [0, 1, 1, 2, 3, 1]
-    """
-
-    helper = LayerHelper("unique", **locals())
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    index = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type='unique',
-        inputs={'X': x},
-        attrs={'dtype': convert_np_dtype_to_dtype_(dtype)},
-        outputs={'Out': [out],
-                 'Index': [index]})
-
-    return out, index
-
-
-def unique_with_counts(x, dtype='int32'):
-    """
-    **unique** 
-
-    Return a unique tensor for `x` and an index tensor pointing to this unique tensor.
-
-    Args:
-        x(Variable): A 1-D input tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of index tensor: int32, int64.
-
-    Returns:
-        tuple: (out, index, count). `out` is the unique tensor for `x`, with identical dtype to `x`, and \
-            `index` is an index tensor pointing to `out`, by which user can recover the original `x` tensor, \
-            `count` is count of unqiue element in the `x`.
-
-    Examples:
-        .. code-block:: python
-
-             import numpy as np
-             import paddle.fluid as fluid
-             x = fluid.layers.assign(np.array([2, 3, 3, 1, 5, 3], dtype='int32'))
-             out, index, count = fluid.layers.unique_with_counts(x) # out is [2, 3, 1, 5]; index is [0, 1, 1, 2, 3, 1]
-                                                        # count is [1, 3, 1, 1]
-    """
-    if not (dtype == 'int32' or dtype == 'int64'):
-        raise TypeError(
-            "Op unique_with_counts, index dtype must be int32 or int64")
-
-    if x is None or len(x.shape) != 1:
-        raise ValueError(
-            "Op unique_with_counts, x must not be null and size of dim must be 1"
-        )
-
-    helper = LayerHelper("unique_with_counts", **locals())
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    index = helper.create_variable_for_type_inference(dtype)
-
-    count = helper.create_variable_for_type_inference(dtype)
-
-    helper.append_op(
-        type='unique_with_counts',
-        inputs={'X': x},
-        attrs={'dtype': convert_np_dtype_to_dtype_(dtype)},
-        outputs={'Out': [out],
-                 'Index': [index],
-                 'Count': [count]})
-
-    return out, index, count
-
-
-def deformable_conv(input,
-                    offset,
-                    mask,
-                    num_filters,
-                    filter_size,
-                    stride=1,
-                    padding=0,
-                    dilation=1,
-                    groups=None,
-                    deformable_groups=None,
-                    im2col_step=None,
-                    param_attr=None,
-                    bias_attr=None,
-                    modulated=True,
-                    name=None):
-    """
-    **Deformable Convolution Layer**
-
-    Compute 2-D deformable convolution on 4-D input.
-    Given input image x, output feature map y, the deformable convolution operation can be expressed as follow:
-   
-    
-    Deformable Convolution v2: 
-    
-    .. math::
-
-        y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k}
-
-    Deformable Convolution v1:
-    
-    .. math::
-
-        y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)}
-    
-    Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location, 
-    which :math:`\Delta m_k` is one in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results
-    <https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.
-    
-    Example:
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
-
-          Offset shape: :math:`(N, 2 * deformable\_groups * H_f * H_w, H_{in}, W_{in})`
-
-          Mask shape: :math:`(N, deformable\_groups * H_f * H_w, H_{in}, W_{in})`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
-
-    Args:
-        input (Variable): The input image with [N, C, H, W] format.
-        offset (Variable): The input coordinate offset of deformable convolution layer.
-        Mask (Variable): The input mask of deformable covolution layer.
-        num_filters(int): The number of filter. It is as same as the output
-            image channel.
-        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        padding (int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding. Default: padding = 0.
-        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups (int): The groups number of the deformable conv layer. According to
-            grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1.
-        deformable_groups (int): The number of deformable group partitions.
-            Default: deformable_groups = 1.
-        im2col_step (int): Maximum number of images per im2col computation; 
-            The total batch size should be divisable by this value or smaller
-            than this value; if you face out of memory problem, you can try
-            to use a smaller value here.
-            Default: im2col_step = 64.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of deformable conv. If it is set to None or one attribute of ParamAttr,
-            deformable conv will create ParamAttr as param_attr.
-            If the Initializer of the param_attr is not set, the parameter is
-            initialized with :math:`Normal(0.0, std)`, and the 
-            :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of
-            deformable conv layer. If it is set to False, no bias will be added
-            to the output units. If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        modulated (bool): Make sure which version should be used between v1 and v2, where v2 is \
-            used while True. Default: True.
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None
-    Returns:
-        Variable: The tensor variable storing the deformable convolution \
-                  result.
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-    Examples:
-        .. code-block:: python
-
-          #deformable conv v2:
-         
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
-          offset = fluid.layers.data(name='offset', shape=[18, 32, 32], dtype='float32')
-          mask = fluid.layers.data(name='mask', shape=[9, 32, 32], dtype='float32')
-          out = fluid.layers.deformable_conv(input=data, offset=offset, mask=mask,
-                                             num_filters=2, filter_size=3, padding=1, modulated=True)
-
-          #deformable conv v1:
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
-          offset = fluid.layers.data(name='offset', shape=[18, 32, 32], dtype='float32')
-          out = fluid.layers.deformable_conv(input=data, offset=offset, mask=None,
-                                             num_filters=2, filter_size=3, padding=1, modulated=False)
-    """
-
-    num_channels = input.shape[1]
-    assert param_attr is not False, "param_attr should not be False here."
-
-    helper = LayerHelper('deformable_conv', **locals())
-    dtype = helper.input_dtype()
-
-    if not isinstance(input, Variable):
-        raise TypeError("Input of deformable_conv must be Variable")
-    if not isinstance(offset, Variable):
-        raise TypeError("Input Offset of deformable_conv must be Variable")
-
-    if groups is None:
-        num_filter_channels = num_channels
-    else:
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels // groups
-
-    filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
-    stride = utils.convert_to_list(stride, 2, 'stride')
-    padding = utils.convert_to_list(padding, 2, 'padding')
-    dilation = utils.convert_to_list(dilation, 2, 'dilation')
-
-    input_shape = input.shape
-    filter_shape = [num_filters, int(num_filter_channels)] + filter_size
-
-    def _get_default_param_initializer():
-        filter_elem_num = filter_size[0] * filter_size[1] * num_channels
-        std = (2.0 / filter_elem_num)**0.5
-        return Normal(0.0, std, 0)
-
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=filter_shape,
-        dtype=dtype,
-        default_initializer=_get_default_param_initializer())
-
-    pre_bias = helper.create_variable_for_type_inference(dtype)
-
-    if modulated:
-        helper.append_op(
-            type='deformable_conv',
-            inputs={
-                'Input': input,
-                'Filter': filter_param,
-                'Offset': offset,
-                'Mask': mask,
-            },
-            outputs={"Output": pre_bias},
-            attrs={
-                'strides': stride,
-                'paddings': padding,
-                'dilations': dilation,
-                'groups': groups,
-                'deformable_groups': deformable_groups,
-                'im2col_step': im2col_step,
-            })
-
-    else:
-        helper.append_op(
-            type='deformable_conv_v1',
-            inputs={
-                'Input': input,
-                'Filter': filter_param,
-                'Offset': offset,
-            },
-            outputs={"Output": pre_bias},
-            attrs={
-                'strides': stride,
-                'paddings': padding,
-                'dilations': dilation,
-                'groups': groups,
-                'deformable_groups': deformable_groups,
-                'im2col_step': im2col_step,
-            })
-
-    output = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
-    return output
-
-
-def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
-    """
-
-    This function returns a col buffer of sliding local blocks of input x, also known
-    as im2col for batched 2D image tensors. For each block under the convolution filter,
-    all element will be rearranged as a column. While the convolution filter silding over
-    the input feature map, a series of such columns will be formed.
-
-    For each input :math:`X` with shape [N, C, H, W], the output shape [N, Cout, Lout]
-    can be calculated as following.
-
-    .. math::
-
-        dkernel[0] &= dilations[0] \\times (kernel\_sizes[0] - 1) + 1
-
-        dkernel[1] &= dilations[1] \\times (kernel\_sizes[1] - 1) + 1
-
-        hout &= \\frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1
-
-        wout &= \\frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1
-
-        Cout &= C \\times kernel\_sizes[0] \\times kernel\_sizes[1]
-
-        Lout &= hout \\times wout
-
-
-    Args:
-        x(Varaible):              The input tensor of format [N, C, H, W].
-        kernel_sizes(int|list):   The size of convolution kernel, should be [k_h, k_w]
-                                  or an integer k treated as [k, k].
-        strides(int|list):        The strides, should be [stride_h, stride_w]
-                                  or an integer stride treated as [sride, stride].
-                                  For default, strides will be [1, 1].
-        paddings(int|list):       The paddings of each dimension, should be
-                                  [padding_top, padding_left, padding_bottom, padding_right]
-                                  or [padding_h, padding_w] or an integer padding.
-                                  If [padding_h, padding_w] was given, it will expanded to
-                                  [padding_h, padding_w, padding_h, padding_w]. If an integer
-                                  padding was given, [padding, padding, padding, padding] will
-                                  be used. For default, paddings will be [0, 0, 0, 0]
-        dilations(int|list):      the dilations of convolution kernel, shold be
-                                  [dilation_h, dilation_w], or an integer dialtion treated as
-                                  [dilation, dilation]. For default, it will be [1, 1].
-
-    
-    Returns:
-        Variable: The tensor variable corresponding to the sliding local blocks. The output shape is [N, Cout, Lout] as decribled above. Cout is the  total number of values within each block, and Lout is the total number of such blocks.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name = 'data', shape = [3, 224, 224], dtype = 'float32')
-            y = fluid.layers.unfold(x, [3, 3], 1, 1, 1)
-    """
-
-    helper = LayerHelper("unfold", **locals())
-
-    assert len(x.shape) == 4, \
-            "input should be the format of [N, C, H, W]"
-
-    if isinstance(kernel_sizes, int):
-        kernel_sizes = [kernel_sizes, kernel_sizes]
-    else:
-        assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \
-            "kernel_sizes should either be an integer or a list of two integers"
-
-    if isinstance(strides, int):
-        strides = [strides, strides]
-    else:
-        assert isinstance(strides, list) and (len(strides) == 2), \
-            "strides should either be an integer or a list of two integers"
-
-    if isinstance(dilations, int):
-        dilations = [dilations, dilations]
-    else:
-        assert isinstance(dilations, list) and (len(dilations) == 2), \
-            "dilations should either be an integer or a list of two integers"
-
-    if isinstance(paddings, int):
-        paddings = [paddings] * 4
-    elif isinstance(paddings, list):
-        if len(paddings) == 2:
-            paddings = paddings * 2
-        elif len(paddings) == 4:
-            pass
-        else:
-            raise ValueError(
-                "paddings should either be an integer or a list of 2 or 4 integers"
-            )
-    else:
-        raise ValueError(
-            "Unexpected type of paddings, it should be either an integer or a list"
-            "of 2 or 4 integers")
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="unfold",
-        inputs={"X": x},
-        outputs={"Y": out},
-        attrs={
-            "kernel_sizes": kernel_sizes,
-            "strides": strides,
-            "paddings": paddings,
-            "dilations": dilations
-        })
-    return out
-
-
-def deformable_roi_pooling(input,
-                           rois,
-                           trans,
-                           no_trans=False,
-                           spatial_scale=1.0,
-                           group_size=[1, 1],
-                           pooled_height=1,
-                           pooled_width=1,
-                           part_size=None,
-                           sample_per_part=1,
-                           trans_std=0.1,
-                           position_sensitive=False,
-                           name=None):
-    """
-    Deformable ROI Pooling Layer
-  
-    Performs deformable region-of-interest pooling on inputs. As described
-    in `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_, it will get offset for each bin after 
-    roi pooling so that pooling at correct region. Batch_size will change to the number of region bounding boxes after deformable_roi_pooling.
-  
-    The operation has three steps:
-    
-    1. Dividing each region proposal into equal-sized sections with the pooled_width and pooled_height.
-  
-    2. Add offset to pixel in ROI to get new location and the new value which are computed directly through
-       bilinear interpolation with four nearest pixel.
-     
-    3. Sample several points in each bin to get average values as output.
-  
-  
-    Args:
-        input (Variable):The input of deformable roi pooling and it is tensor which value type is float32. The shape of input is
-                         [N, C, H, W]. Where N is batch size, C is number of input channels,
-                         H is height of the feature, and W is the width of the feature.
-        rois (Variable): ROIs (Regions of Interest) with type float32 to pool over. It should be
-                         a 2-D LoDTensor of shape (num_rois, 4), and the lod level
-                         is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
-                         the top left coordinates, and (x2, y2) is the bottom
-                         right coordinates, which value type is float32.
-        trans (Variable): Offset of features on ROIs while pooling which value type is float32. The format is [N, C, H, W], where 
-                          N is number of ROIs, C is number of channels, which indicate the offset distance 
-                          in the x and y directions, H is pooled height, and W is pooled width. 
-        no_trans (bool): Whether to add offset to get new value or not while roi pooling, which value with type bool is True or False.
-                         If value is True, no offset will be added in operation. Default: False.
-        spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width), which value type is float32.
-                         Equals the reciprocal of total stride in convolutional layers, Default: 1.0.
-        group_size (list|tuple): The number of groups which input channels are divided and the input is list or tuple, which value type is int32. (eg.number of input channels 
-                          is k1 * k2 * (C + 1), which k1 and k2 are group width and height and C+1 is number of output
-                          chanels.) eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1].
-        pooled_height (int): The pooled output height which value type is int32. Default: 1.
-        pooled_width (int): The pooled output width which value type is int32. Default: 1.
-        part_size (list|tuple): The height and width of offset which values in list or tuple is int32, eg.(4, 6), which height is 4 and width is 6, and values always equal to pooled_height \
-                         and pooled_width. Default: if None, default value is [pooled_height, pooled_width].
-        sample_per_part (int): The number of samples in each bin which value type is int32. If value is bigger, it will consume more performance. Default: 1.
-        trans_std (float): Coefficient of offset which value type is float32. It controls weight of offset. Default: 0.1.
-        position_sensitive (bool): Whether to choose deformable psroi pooling mode or not, and value type is bool(True or False). If value is False, input dimension equals to output dimension. \
-                                   If value is True, input dimension shoule be output dimension * pooled_height * pooled_width. Default: False.
-        name (str|None): Name of layer. Default: None.
-    Returns:
-        Variable: Output of deformable roi pooling is that, if position sensitive is False, input dimension equals to output dimension. If position sensitive is True,\
-                  input dimension should be the result of output dimension divided by pooled height and pooled width.
-
-    Examples:
-      .. code-block:: python
-
-        # position_sensitive=True
-        import paddle.fluid as fluid
-        input = fluid.layers.data(name="input",
-                                  shape=[2, 192, 64, 64], 
-                                  dtype='float32', 
-                                  append_batch_size=False)                   
-        rois = fluid.layers.data(name="rois",
-                                 shape=[4],
-                                 dtype='float32', 
-                                 lod_level=1)
-        trans = fluid.layers.data(name="trans",
-                                  shape=[2, 384, 64, 64], 
-                                  dtype='float32', 
-                                  append_batch_size=False) 
-        x = fluid.layers.nn.deformable_roi_pooling(input=input, 
-                                                     rois=rois, 
-                                                     trans=trans, 
-                                                     no_trans=False,
-                                                     spatial_scale=1.0, 
-                                                     group_size=(1, 1),
-                                                     pooled_height=8,
-                                                     pooled_width=8,
-                                                     part_size=(8, 8),
-                                                     sample_per_part=4, 
-                                                     trans_std=0.1,
-                                                     position_sensitive=True)
-  
-        # position_sensitive=False
-        import paddle.fluid as fluid
-        input = fluid.layers.data(name="input",
-                                  shape=[2, 192, 64, 64], 
-                                  dtype='float32', 
-                                  append_batch_size=False)                   
-        rois = fluid.layers.data(name="rois",
-                                 shape=[4],
-                                 dtype='float32', 
-                                 lod_level=1)
-        trans = fluid.layers.data(name="trans",
-                                  shape=[2, 384, 64, 64], 
-                                  dtype='float32', 
-                                  append_batch_size=False) 
-        x = fluid.layers.nn.deformable_roi_pooling(input=input, 
-                                                     rois=rois, 
-                                                     trans=trans, 
-                                                     no_trans=False,
-                                                     spatial_scale=1.0, 
-                                                     group_size=(1, 1),
-                                                     pooled_height=8,
-                                                     pooled_width=8,
-                                                     part_size=(8, 8),
-                                                     sample_per_part=4, 
-                                                     trans_std=0.1,
-                                                     position_sensitive=False)
-    """
-
-    input_channels = input.shape[1]
-    if position_sensitive == False:
-        output_channels = input_channels
-    else:
-        output_channels = input_channels / pooled_height / pooled_width
-
-    if part_size is None:
-        part_height = pooled_height
-        part_width = pooled_width
-        part_size = [part_height, part_width]
-    part_size = utils.convert_to_list(part_size, 2, 'part_size')
-    group_size = utils.convert_to_list(group_size, 2, 'group_size')
-    helper = LayerHelper('deformable_psroi_pooling', **locals())
-    dtype = helper.input_dtype()
-    output = helper.create_variable_for_type_inference(dtype)
-    top_count = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type="deformable_psroi_pooling",
-        inputs={"Input": input,
-                "ROIs": rois,
-                "Trans": trans},
-        outputs={"Output": output,
-                 "TopCount": top_count},
-        attrs={
-            "no_trans": no_trans,
-            "spatial_scale": spatial_scale,
-            "output_dim": output_channels,
-            "group_size": group_size,
-            "pooled_height": pooled_height,
-            "pooled_width": pooled_width,
-            "part_size": part_size,
-            "sample_per_part": sample_per_part,
-            "trans_std": trans_std
-        })
-    return output
-
-
-def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
-    """
-    This function recomputes the `input` indices according to the offset of the
-    shard. The length of the indices is evenly divided into N shards, and if
-    the `shard_id` matches the shard with the input index inside, the index is
-    recomputed on the basis of the shard offset, elsewise it is set to
-    `ignore_value`. The detail is as follows:
-    :: 
-        
-        shard_size = (index_num + nshards - 1) // nshards
-        y = x % shard_size if x // shard_size == shard_id else ignore_value
-
-    NOTE: If the length of indices cannot be evely divided by the shard number,
-    the size of the last shard will be less than the calculated `shard_size`
-
-    Examples:
-    ::
-    
-        Input:
-          X.shape = [4, 1]
-          X.data = [[1], [6], [12], [19]]
-          index_num = 20
-          nshards = 2
-          ignore_value = -1
-        
-        if shard_id == 0, we get:
-          Out.shape = [4, 1]
-          Out.data = [[1], [6], [-1], [-1]]
-        
-        if shard_id == 1, we get:
-          Out.shape = [4, 1]
-          Out.data = [[-1], [-1], [2], [9]]
-    
-    Args:
-        - **input** (Variable): Input indices, last dimension must be 1.
-        - **index_num** (scalar): An interger defining the range of the index.
-        - **nshards** (scalar): The number of shards
-        - **shard_id** (scalar): The index of the current shard
-        - **ignore_value** (scalar): An ingeter value out of sharded index range
-
-    Returns:
-        Variable: The sharded index of input.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            shard_label = fluid.layers.shard_index(input=label,
-                                                   index_num=20,
-                                                   nshards=2,
-                                                   shard_id=0)
-    """
-    op_type = 'shard_index'
-    helper = LayerHelper(op_type, **locals())
-    if index_num % nshards != 0:
-        raise ValueError(
-            'The index_num(%d) cannot be evenly divided by nshards(%d)' %
-            (index_num, nshards))
-    if shard_id < 0 or shard_id >= nshards:
-        raise ValueError('The shard_id(%d) should be in [0, %d)' %
-                         (shard_id, nshards))
-
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [input]},
-        outputs={'Out': out},
-        attrs={
-            'index_num': index_num,
-            'nshards': nshards,
-            'shard_id': shard_id,
-            'ignore_value': ignore_value
-        },
-        stop_gradient=True)
-    return out
-
-
-@templatedoc()
-def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
-    """
-    ${comment}
-    Args:
-        x(Varaible): Input of HardSwish operator.
-        threshold(float): The threshold parameter of HardSwish operator. Default:threshold=6.0
-        scale(float): The scale parameter of HardSwish operator. Default:scale=6.0
-        offset(float): The offset parameter of HardSwish operator. Default:offset=3.0
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The output tensor with the same shape as input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
-            y = fluid.layers.hard_swish(x)
-    """
-    helper = LayerHelper('hard_swish', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='hard_swish',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold,
-               'scale': scale,
-               'offset': offset})
-    return out
-
-
-def mse_loss(input, label):
-    """
-    **Mean square error layer**
-
-    This layer accepts input predications and target label and returns the mean square error.
-
-    The loss can be described as:
-
-    .. math::
-        
-        Out = mean((X - Y)^2)
-
-    In the above equation:
-
-        * :math:`X`: Input predications, a tensor.
-        * :math:`Y`: Input labels, a tensor.
-        * :math:`Out`: Output value, same shape with :math:`X`.
-
-    Args:
-        input (Variable): Input tensor, has predictions.
-        label (Variable): Label tensor, has target labels.
-
-    Returns:
-        Variable: The tensor variable storing the mean square error difference of input and label.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            y_predict = fluid.layers.data(name='y_predict', shape=[1], dtype='float32')
-            mse = fluid.layers.mse_loss(input=y_predict, label=y)
-
-    """
-    return reduce_mean(square_error_cost(input, label))
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
deleted file mode 100644
index 23052da7800b8645bd64200b062791aeabf1a778..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/ops.py
+++ /dev/null
@@ -1,207 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import os
-from .layer_function_generator import generate_layer_fn, generate_activation_fn
-from .. import core
-from ..framework import convert_np_dtype_to_dtype_
-
-__activations_noattr__ = [
-    'sigmoid',
-    'logsigmoid',
-    'exp',
-    'tanh',
-    'atan',
-    'tanh_shrink',
-    'sqrt',
-    'rsqrt',
-    'abs',
-    'ceil',
-    'floor',
-    'cos',
-    'acos',
-    'asin',
-    'sin',
-    'round',
-    'reciprocal',
-    'square',
-    'softplus',
-    'softsign',
-]
-
-__all__ = []
-
-for _OP in set(__all__):
-    globals()[_OP] = generate_layer_fn(_OP)
-
-# It is a hot fix in some unittest using:
-#   fluid.layers.scale(x=x, scale=10.0, out=out_var)
-# e.g.: test_program_code.py, test_dist_train.py
-globals()['_scale'] = generate_layer_fn('scale')
-
-globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
-
-__all__ += __activations_noattr__
-
-for _OP in set(__activations_noattr__):
-    globals()[_OP] = generate_activation_fn(_OP)
-
-__all__ += ["uniform_random"]
-
-_uniform_random_ = generate_layer_fn('uniform_random')
-
-
-def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
-    """
-    This operator initializes a variable with random values sampled from a
-    uniform distribution. The random result is in set [min, max].
-
-    Args:
-        shape (list): The shape of output variable.
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of data, such as
-            float32, float64 etc. Default: float32.
-        min (float): Minimum value of uniform random. Default -1.0.
-        max (float): Maximun value of uniform random. Default 1.0.
-        seed (int): Random seed used for generating samples. 0 means use a
-            seed generated by the system. Note that if seed is not 0, this
-            operator will always generate the same random numbers every time.
-            Default 0.
-
-    Examples:
-        .. code-block:: python
-     
-            import paddle.fluid as fluid
-            result = fluid.layers.uniform_random(shape=[32, 784])
-    """
-
-    if not isinstance(dtype, core.VarDesc.VarType):
-        dtype = convert_np_dtype_to_dtype_(dtype)
-    locals_var = locals().copy()
-    kwargs = dict()
-    for name, val in locals_var.items():
-        if val is not None:
-            kwargs[name] = val
-    return _uniform_random_(**kwargs)
-
-
-__all__ += ['softshrink']
-
-_softshrink_ = generate_layer_fn('softshrink')
-
-
-def softshrink(x, alpha=None):
-    locals_var = locals().copy()
-    kwargs = dict()
-    for name, val in locals_var.items():
-        if val is not None:
-            if name == 'alpha':
-                kwargs['lambda'] = val
-            else:
-                kwargs[name] = val
-    return _softshrink_(**kwargs)
-
-
-softshrink.__doc__ = """
-:strong:`Softshrink Activation Operator`
-
-..  math::
-    out = \begin{cases}
-            x - \alpha, \text{if } x > \alpha \\
-            x + \alpha, \text{if } x < -\alpha \\
-            0,  \text{otherwise}
-            \end{cases}
-
-
-Args:
-    x: Input of Softshrink operator
-    alpha (FLOAT): non-negative offset
-    
-Returns:
-    Output of Softshrink operator
-
-Examples:
-    .. code-block:: python
-    
-        import paddle.fluid as fluid
-        data = fluid.layers.data(name="input", shape=[784])
-        result = fluid.layers.softshrink(x=data, alpha=0.3)
-"""
-
-__all__ += ['hard_shrink']
-
-_hard_shrink_ = generate_layer_fn('hard_shrink')
-
-
-def hard_shrink(x, threshold=None):
-    locals_var = locals().copy()
-    kwargs = dict()
-    for name, val in locals_var.items():
-        if val is not None:
-            kwargs[name] = val
-    return _hard_shrink_(**kwargs)
-
-
-hard_shrink.__doc__ = _hard_shrink_.__doc__ + """
-Examples:
-
-    >>> import paddle.fluid as fluid
-    >>> data = fluid.layers.data(name="input", shape=[784])
-    >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
-"""
-
-__all__ += ['cumsum']
-
-_cum_sum_ = generate_layer_fn('cumsum')
-
-
-def cumsum(x, axis=None, exclusive=None, reverse=None):
-    locals_var = locals().copy()
-    kwargs = dict()
-    for name, val in locals_var.items():
-        if val is not None:
-            kwargs[name] = val
-    return _cum_sum_(**kwargs)
-
-
-cumsum.__doc__ = _cum_sum_.__doc__ + """
-Examples:
-
-    >>> import paddle.fluid as fluid
-    >>> data = fluid.layers.data(name="input", shape=[32, 784])
-    >>> result = fluid.layers.cumsum(data, axis=0)
-"""
-
-__all__ += ['thresholded_relu']
-
-_thresholded_relu_ = generate_layer_fn('thresholded_relu')
-
-
-def thresholded_relu(x, threshold=None):
-    locals_var = locals().copy()
-    kwargs = dict()
-    for name, val in locals_var.items():
-        if val is not None:
-            kwargs[name] = val
-
-    return _thresholded_relu_(**kwargs)
-
-
-thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """
-Examples:
-
-    >>> import paddle.fluid as fluid
-    >>> data = fluid.layers.data(name="input", shape=[1])
-    >>> result = fluid.layers.thresholded_relu(data, threshold=0.4)
-"""
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
deleted file mode 100644
index b0838227f0d7340a304cf7b443eec27f26216e22..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/tensor.py
+++ /dev/null
@@ -1,1098 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unlessf required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from six.moves import reduce
-from ..layer_helper import LayerHelper
-from ..param_attr import ParamAttr
-from ..framework import convert_np_dtype_to_dtype_
-from ..framework import Variable
-from ..initializer import Constant, force_init_on_cpu
-from ..core import VarDesc
-from .layer_function_generator import templatedoc
-import numpy
-
-__all__ = [
-    'create_tensor', 'create_parameter', 'create_global_var', 'cast',
-    'tensor_array_to_tensor', 'concat', 'sums', 'assign',
-    'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
-    'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite',
-    'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye'
-]
-
-
-def create_tensor(dtype, name=None, persistable=False):
-    """
-    Create an variable, which will hold a LoDTensor with data type dtype.
-
-    Args:
-        dtype(string): 'float32'|'int32'|..., the data type of the
-            created tensor.
-        name(string): The name of the created tensor, if not set,
-            the name will be a random unique one.
-        persistable(bool): Set the persistable flag of the create tensor.
-
-    Returns:
-        Variable: The tensor variable storing the created tensor.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          tensor = fluid.layers.create_tensor(dtype='float32')
-    """
-    helper = LayerHelper("create_tensor", **locals())
-    return helper.create_variable(
-        name=helper.name, dtype=dtype, persistable=persistable)
-
-
-def create_parameter(shape,
-                     dtype,
-                     name=None,
-                     attr=None,
-                     is_bias=False,
-                     default_initializer=None):
-    """
-    Create a parameter. The parameter is a learnable variable, which can have
-    gradient, and can be optimized.
-
-    NOTE: this is a very low-level API. This API is useful when you create
-    operator by your self. instead of using layers.
-
-    Args:
-        shape(list[int]): shape of the parameter
-        dtype(string): element type of the parameter
-        attr(ParamAttr): attributes of the parameter
-        is_bias(bool): This can affect which default initializer is chosen
-                       when default_initializer is None. If is_bias,
-                       initializer.Constant(0.0) will be used. Otherwise,
-                       Xavier() will be used.
-        default_initializer(Initializer): initializer for the parameter
-
-    Returns:
-        the created parameter.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            W = layers.create_parameter(shape=[784, 200], dtype='float32')
-    """
-    helper = LayerHelper("create_parameter", **locals())
-    if attr is None:
-        attr = ParamAttr(name=name)
-    return helper.create_parameter(attr, shape, dtype, is_bias,
-                                   default_initializer)
-
-
-def create_global_var(shape,
-                      value,
-                      dtype,
-                      persistable=False,
-                      force_cpu=False,
-                      name=None):
-    """
-    Create a new tensor variable with value in the global block(block 0).
-
-    Args:
-        shape(list[int]): shape of the variable
-        value(float): the value of the variable. The new created
-                      variable will be filled with it.
-        dtype(string): data type of the variable
-        persistable(bool): if this variable is persistable.
-                           Default: False
-        force_cpu(bool): force this variable to be on CPU.
-                         Default: False
-        name(str|None): The name of the variable. If set to None the variable
-                        name will be generated automatically.
-                        Default: None
-
-    Returns:
-        Variable: the created Variable
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            var = layers.create_global_var(shape=[2,3], value=1.0, dtype='float32',
-                                          persistable=True, force_cpu=True, name='new_var')
-    """
-    helper = LayerHelper("global_var", **locals())
-    var = helper.create_global_variable(
-        dtype=dtype,
-        shape=shape,
-        persistable=persistable,
-        name=name,
-        stop_gradient=True)
-    helper.set_variable_initializer(
-        var, initializer=Constant(
-            value=float(value), force_cpu=force_cpu))
-
-    return var
-
-
-def cast(x, dtype):
-    """
-    This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts
-    it to the output with :attr:`dtype`. It's meaningless if the output
-    dtype equals the input dtype, but it's fine if you do so.
-
-    Args:
-        x (Variable): The input Variable for casting.
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output Variable.
-
-    Returns:
-        Variable: The output Variable after casting.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name='x', shape=[13], dtype='float32')
-            result = fluid.layers.cast(x=data, dtype='float64')
-    """
-    helper = LayerHelper('cast', **locals())
-    out = helper.create_variable_for_type_inference(dtype=dtype)
-    helper.append_op(
-        type='cast',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'in_dtype': x.dtype,
-               'out_dtype': out.dtype})
-    return out
-
-
-def concat(input, axis=0, name=None):
-    """
-    **Concat**
-
-    This function concatenates the input along the axis mentioned
-    and returns that as the output.
-
-    Args:
-        input(list): List of tensors to be concatenated
-        axis(int): Integer axis along which the tensors will be concatenated
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        Variable: Output variable of the concatenation
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            a = fluid.layers.data(name='a', shape=[2, 13], dtype='float32')
-            b = fluid.layers.data(name='b', shape=[2, 3], dtype='float32')
-            c = fluid.layers.data(name='c', shape=[2, 2], dtype='float32')
-            d = fluid.layers.data(name='d', shape=[2, 5], dtype='float32')
-            out = fluid.layers.concat(input=[a, b, c, d], axis=2)
-    """
-    helper = LayerHelper('concat', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='concat',
-        inputs={'X': input},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
-    return out
-
-
-def tensor_array_to_tensor(input, axis=1, name=None):
-    """
-    This function concatenates the input LodTensorArray along the axis mentioned
-    and returns that as the output.
-
-    A simple example as below:
-
-    .. code-block:: text
-
-        Given:
-
-        input.data = {[[0.6, 0.1, 0.3],
-                       [0.5, 0.3, 0.2]],
-                      [[1.3],
-                       [1.8]],
-                      [[2.3, 2.1],
-                       [2.5, 2.4]]}
-
-        axis = 1
-
-        Then:
-
-        output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1],
-                       [0.5, 0.3, 0.2, 1.8, 2.5, 2.4]]
-
-        output_index.data = [3, 1, 2]
-
-    Args:
-        input(list): Input LodTensorArray
-        axis(int): Integer axis along which the tensors will be concatenated
-        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
-    Returns:
-        Variable: Output variable of the concatenation
-        Variable: The input LodTensorArray items' dims along the axis
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            tensor_array = fluid.layers.create_parameter(shape=[784, 200], dtype='float32')
-            output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array)
-    """
-    helper = LayerHelper('tensor_array_to_tensor', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    out_index = helper.create_variable_for_type_inference(dtype="int32")
-    helper.append_op(
-        type='tensor_array_to_tensor',
-        inputs={'X': input},
-        outputs={'Out': [out],
-                 'OutIndex': [out_index]},
-        attrs={'axis': axis})
-    return out, out_index
-
-
-def sums(input, out=None):
-    """
-    This function performs the sum operation on the input and returns the
-    result as the output.
-
-    Args:
-        input (Variable|list): The input tensor that has the elements
-                               that need to be summed up.
-        out (Variable|None): Output parameter. The sum result.
-                             Default: None
-
-    Returns:
-        Variable: the sum of input. The same as the argument 'out'
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          # sum of several tensors
-          a0 = fluid.layers.fill_constant(shape=[1], dtype='int64', value=1)
-          a1 = fluid.layers.fill_constant(shape=[1], dtype='int64', value=2)
-          a2 = fluid.layers.fill_constant(shape=[1], dtype='int64', value=3)
-          sums = fluid.layers.sums(input=[a0, a1, a2])
-
-          # sum of a tensor array
-          array = fluid.layers.create_array('int64')
-          i = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)
-          fluid.layers.array_write(a0, array=array, i=i)
-          i = fluid.layers.increment(x=i)
-          fluid.layers.array_write(a1, array=array, i=i)
-          i = fluid.layers.increment(x=i)
-          fluid.layers.array_write(a2, array=array, i=i)
-          sums = fluid.layers.sums(input=array)
-    """
-    helper = LayerHelper('sum', **locals())
-    if out is None:
-        out = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
-    helper.append_op(
-        type='sum',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={'use_mkldnn': False})
-    return out
-
-
-def assign(input, output=None):
-    """
-    **Assign**
-
-    This function copies the *input* Variable to the *output* Variable.
-
-    Args:
-        input(Variable|numpy.ndarray): The source variable
-        output(Variable|None): The destination variable
-
-    Returns:
-        Variable: The destination variable that was supplied as the *output*.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32")
-          out = fluid.layers.create_tensor(dtype='float32')
-          hidden = fluid.layers.fc(input=data, size=10)
-          fluid.layers.assign(hidden, out)
-    """
-    helper = LayerHelper('assign', **locals())
-    if output is None:
-        output = helper.create_variable_for_type_inference(dtype=input.dtype)
-    if isinstance(input, Variable):
-        helper.append_op(
-            type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
-    elif isinstance(input, numpy.ndarray):
-        dtype = convert_np_dtype_to_dtype_(input.dtype)
-        if dtype == VarDesc.VarType.FP32:
-            value_name = "fp32_values"
-            values = [float(v) for v in input.flat]
-        elif dtype == VarDesc.VarType.INT32:
-            value_name = "int32_values"
-            values = [int(v) for v in input.flat]
-        else:
-            raise ValueError("Unsupported dtype %s", input.dtype)
-        if input.size > 1024 * 1024:
-            raise ValueError("The size of input is too big. Please consider "
-                             "saving it to file and 'load_op' to load it")
-
-        helper.append_op(
-            type='assign_value',
-            outputs={'Out': [output]},
-            attrs={
-                'dtype': dtype,
-                'shape': list(input.shape),
-                value_name: values
-            })
-    else:
-        raise ValueError("Wrong type for assign input: %s" % type(input))
-
-    return output
-
-
-def fill_constant(shape, dtype, value, force_cpu=False, out=None):
-    """
-    **fill_constant**
-
-    This function creates a tensor with specified `shape` and `dtype`, and
-    initializes it with a constant specifed by `value`.
-
-    The attribute `stop_gradient` of the created tensor is set to True.
-
-    Args:
-        shape(tuple|list|None): Shape of the output tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output tensor.
-        value(float): The constant value used to initialize the output tensor.
-        out(Variable): The output tensor.
-        force_cpu(True|False): data should be on CPU if set true.
-
-    Returns:
-        Variable: The tensor variable storing the output.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
-    """
-
-    helper = LayerHelper("fill_constant", **locals())
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=dtype)
-    helper.append_op(
-        type='fill_constant',
-        inputs={},
-        outputs={'Out': [out]},
-        attrs={
-            'shape': shape,
-            'dtype': out.dtype,
-            'value': float(value),
-            'force_cpu': force_cpu or force_init_on_cpu()
-        },
-        stop_gradient=True)
-    out.stop_gradient = True
-    return out
-
-
-@templatedoc()
-def fill_constant_batch_size_like(input,
-                                  shape,
-                                  dtype,
-                                  value,
-                                  input_dim_idx=0,
-                                  output_dim_idx=0):
-    """
-    ${comment}
-
-    It also sets *stop_gradient* to True.
-
-    Args:
-        input(${input_type}): ${input_comment}.
-
-        shape(${shape_type}): ${shape_comment}.
-
-        dtype(${dtype_type}): ${dtype_comment}.
-
-        value(${value_type}): ${value_comment}.
-
-        input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}.
-
-        output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}.
-
-    Returns:
-        ${out_comment}.
-
-    Examples:
-
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             like = fluid.layers.data(name='like', shape=[1], dtype='float32')
-             data = fluid.layers.fill_constant_batch_size_like(
-                         input=like, shape=[1], value=0, dtype='int64')
-
-    """
-    helper = LayerHelper("fill_constant_batch_size_like", **locals())
-    out = helper.create_variable_for_type_inference(dtype=dtype)
-    helper.append_op(
-        type='fill_constant_batch_size_like',
-        inputs={'Input': input},
-        outputs={'Out': [out]},
-        attrs={
-            'shape': shape,
-            'dtype': out.dtype,
-            'value': float(value),
-            'input_dim_idx': input_dim_idx,
-            'output_dim_idx': output_dim_idx
-        })
-    out.stop_gradient = True
-    return out
-
-
-def argmin(x, axis=0):
-    """
-    **argmin**
-
-    This function computes the indices of the min elements
-    of the input tensor's element along the provided axis.
-
-    Args:
-        x(Variable): The input to compute the indices of
-                     the min elements.
-        axis(int): Axis to compute indices along.
-
-    Returns:
-        Variable: The tensor variable storing the output
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3, 4], dtype="float32")
-            out = fluid.layers.argmin(x, axis=0)
-            out = fluid.layers.argmin(x, axis=-1)
-    """
-    helper = LayerHelper("arg_min", **locals())
-    out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
-    helper.append_op(
-        type='arg_min',
-        inputs={'X': x},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
-    return out
-
-
-def argmax(x, axis=0):
-    """
-    **argmax**
-
-    This function computes the indices of the max elements
-    of the input tensor's element along the provided axis.
-
-    Args:
-        x(Variable): The input to compute the indices of
-                     the max elements.
-        axis(int): Axis to compute indices along.
-
-    Returns:
-        Variable: The tensor variable storing the output
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3, 4], dtype="float32")
-            out = fluid.layers.argmax(x, axis=0)
-            out = fluid.layers.argmax(x, axis=-1)
-    """
-    helper = LayerHelper("arg_max", **locals())
-    out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
-    helper.append_op(
-        type='arg_max',
-        inputs={'X': x},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
-    return out
-
-
-def argsort(input, axis=-1, name=None):
-    """
-    Performs sorting on the input Variable along the given axis, and outputs
-    sorted data Varibale and its corresponding index Variable with the same
-    shape as :attr:`input`.
-
-    .. code-block:: text
-
-        For example, the given axis is -1 and the input Variable
-
-            input = [[0.15849551, 0.45865775, 0.8563702 ],
-                     [0.12070083, 0.28766365, 0.18776911]],
-
-        after argsort, the sorted Vairable becomes
-
-            out = [[0.15849551, 0.45865775, 0.8563702 ],
-                   [0.12070083, 0.18776911, 0.28766365]],
-
-        and the sorted indices along the given axis turn outs to be
-
-            indices = [[0, 1, 2],
-                       [0, 2, 1]]
-
-    Args:
-        input(Variable): The input Variable for sorting.
-        axis(int): The axis along which to sort the input Variable. When
-                   :attr:`axis` < 0, the actual axis will be :attr:`axis` +
-                   rank(:attr:`input`). Default -1, the last dimension.
-        name(str|None): (optional) A name for this layer. If set None, the
-                   layer will be named automatically.
-
-    Returns:
-        tuple: A tuple of sorted data Variable and the sorted indices.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3, 4], dtype="float32")
-            out, indices = fluid.layers.argsort(input=x, axis=0)
-    """
-    helper = LayerHelper("argsort", **locals())
-    out = helper.create_variable_for_type_inference(
-        dtype=input.dtype, stop_gradient=True)
-    ids = helper.create_variable_for_type_inference(
-        VarDesc.VarType.INT64, stop_gradient=True)
-    helper.append_op(
-        type='argsort',
-        inputs={'X': input},
-        outputs={'Out': out,
-                 'Indices': ids},
-        attrs={'axis': axis})
-    return out, ids
-
-
-def ones(shape, dtype, force_cpu=False):
-    """
-    **ones**
-
-    This function creates a tensor of specified *shape* and
-    *dtype*, and initializes this with 1.
-
-    It also sets *stop_gradient* to True.
-
-    Args:
-        shape(tuple|list): Shape of output tensor
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
-
-    Returns:
-        Variable: The tensor variable storing the output
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.ones(shape=[1], dtype='int64')
-    """
-    assert isinstance(shape, list) or isinstance(
-        shape, tuple), "The shape's type should be list or tuple."
-    assert reduce(lambda x, y: x * y,
-                  shape) > 0, "The shape is invalid: %s." % (str(shape))
-    return fill_constant(value=1.0, **locals())
-
-
-def zeros(shape, dtype, force_cpu=False):
-    """
-    **zeros**
-
-    This function creates a tensor of specified *shape* and
-    *dtype*, and initializes this with 0.
-
-    It also sets *stop_gradient* to True.
-
-    Args:
-        shape(tuple|list|None): Shape of output tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor.
-        force_cpu(bool, default False): Whether to make output stay on CPU.
-
-    Returns:
-        Variable: The tensor variable storing the output.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.zeros(shape=[1], dtype='int64')
-    """
-    return fill_constant(value=0.0, **locals())
-
-
-def reverse(x, axis):
-    """
-    **reverse**
-
-    This function reverse the input 'x' along given axises.
-
-    Args:
-        x(Vairbale): the input to be reversed.
-        axis(int|tuple|list): Axis that along which order of elements
-                    is reversed. If it is a tuple or a list, reversing
-                    will be apply on each axis in the tuple or list.
-
-    Returns:
-        Variable: The reversed tensor.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name="data", shape=[4, 8], dtype="float32")
-          out = fluid.layers.reverse(x=data, axis=0)
-          # or:
-          out = fluid.layers.reverse(x=data, axis=[0,1])
-    """
-    if isinstance(axis, int):
-        axis = [axis]
-    helper = LayerHelper("reverse", **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='reverse',
-        inputs={'X': x},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
-    return out
-
-
-def save(x, file_path, overwrite=True):
-    """
-    Saves a variable as a file.
-
-    Args:
-        x(variable): The Tensor/LoDTensor to be saved.
-        file_path(str): The file path where the variable will be saved.
-        overwrite(bool): Whether or not cover the given file when it has already
-            existed. If it's set 'False' and the file is existed, a runtime
-            error will be thrown.
-    """
-    helper = LayerHelper("save", **locals())
-    helper.append_op(
-        type="save",
-        inputs={"input": x},
-        outputs={},
-        args={"file_path": file_path,
-              "overwrite": overwrite})
-
-
-def save_combine(x, file_path, overwrite=True):
-    """
-    Saves a list of variables into a single file.
-
-    Args:
-        x(list): A list of Tensor/LoDTensor variables to be saved together in
-                 a single file.
-        file_path(str): The file path where variables will be saved.
-        overwrite(bool): Whether or not cover the given file when it has already
-            existed. If it's set 'False' and the file is existed, a runtime
-            error will be thrown.
-
-    Returns:
-        There is no return value.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            v1 = fluid.layers.data(name="data",
-                                   shape=(4, 6),
-                                   dtype="float32")
-            v2 = fluid.layers.data(name="data",
-                                   shape=(6, 8, 4),
-                                   dtype="float32")
-            normed = fluid.layers.save_combine([v1, v2], file_path="output")
-    """
-    helper = LayerHelper("save_combine", **locals())
-    helper.append_op(
-        type="save_combine",
-        inputs={"input": x},
-        outputs={},
-        args={"file_path": file_path,
-              "overwrite": overwrite})
-
-
-def load_combine(out, file_path):
-    """
-    Loads a list of vairables from a single file.
-
-    Args:
-        out(list): The list of variables to be read from the disk file.
-        file_path(str): The path of the disk file.
-    """
-    helper = LayerHelper("load_combine", **locals())
-    helper.append_op(
-        type="load_combine",
-        inputs={},
-        output={"Out": out},
-        args={"file_path": file_path})
-
-
-def has_inf(x):
-    """
-    Test if any of x contains an infinity number
-
-    Args:
-       x(variable): The Tensor/LoDTensor to be checked.
-
-    Returns:
-        Variable: The tensor variable storing the output, only a bool value.
-    
-    Examples:
-        .. code-block:: python
-          
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32")
-          res = fluid.layers.has_inf(data)
-
-    """
-    helper = LayerHelper("isinf", **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type="isinf", inputs={"X": x}, outputs={"Out": out})
-    return out
-
-
-def has_nan(x):
-    """
-    Test if any of x contains a NAN
-
-    Args:
-       x(variable): The Tensor/LoDTensor to be checked.
-
-    Returns:
-        Variable: The tensor variable storing the output, only a bool value.
-    
-    Examples:
-        .. code-block:: python
-    
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32")
-          res = fluid.layers.has_nan(data)
-
-    """
-    helper = LayerHelper("isnan", **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type="isnan", inputs={"X": x}, outputs={"Out": out})
-    return out
-
-
-def isfinite(x):
-    """
-    Test if any of x contains an infinity/NAN number. If all the elements are finite,
-    returns true, else false.
-
-    Args:
-       x(variable): The Tensor/LoDTensor to be checked.
-
-    Returns:
-        Variable: The tensor variable storing the output, contains a bool value.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            var = fluid.layers.data(name="data",
-                                    shape=(4, 6),
-                                    dtype="float32")
-            out = fluid.layers.isfinite(var)
-    """
-    helper = LayerHelper("isfinite", **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
-    return out
-
-
-def range(start, end, step, dtype):
-    """
-    Return evenly spaced values within a given interval.
-
-    Values are generated within the half-open interval [start, stop) (in other words,
-    the interval including start but excluding stop).
-
-    args:
-        start(int|float|Variable): Start of interval. The interval includes this value.
-        end(int|float|Variable): End of interval. The interval does not include this
-                                 value, except in some cases where step is not an integer
-                                 and floating point round-off affects the length of out. 
-        step(int|float|Variable): Spacing between values. For any output out, this is the
-                                  distance between two adjacent values, out[i+1] - out[i].
-                                  The default step size is 1.
-        dtype(string): 'float32'|'int32'|..., the data type of the output tensor.
-
-    returns:
-        Evenly spaced values within a given interval.
-
-    examples:
-
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             data = fluid.layers.range(0, 10, 2, 'int32')
-
-    """
-    helper = LayerHelper("range", **locals())
-
-    if not isinstance(start, Variable):
-        start = fill_constant([1], dtype, start)
-    if not isinstance(end, Variable):
-        end = fill_constant([1], dtype, end)
-    if not isinstance(step, Variable):
-        step = fill_constant([1], dtype, step)
-
-    out = helper.create_variable_for_type_inference(dtype=start.dtype)
-
-    helper.append_op(
-        type='range',
-        inputs={'Start': start,
-                'End': end,
-                'Step': step},
-        outputs={'Out': [out]})
-    out.stop_gradient = True
-    return out
-
-
-def linspace(start, stop, num, dtype):
-    """
-    Return fixed number of evenly spaced values within a given interval.
-
-    First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy.
-
-    Args:
-        start(float|Variable): First entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'.
-        stop(float|Variable): Last entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'.
-        num(int|Variable): Number of entry in the sequence. It is an int scalar, or a tensor of shape [1] with type int32.
-        dtype(string): 'float32'|'float64', the data type of the output tensor.
-
-    Returns:
-        Variable: The tensor variable storing a 1-D tensor. 
-
-    Examples:
-        .. code-block:: python
-
-             import paddle.fluid as fluid
-             data = fluid.layers.linspace(0, 10, 5, 'float32') # [0.0,  2.5,  5.0,  7.5, 10.0]
-             data = fluid.layers.linspace(0, 10, 1, 'float32') # [0.0]
-
-    """
-    helper = LayerHelper("linspace", **locals())
-
-    if not isinstance(start, Variable):
-        start = fill_constant([1], dtype, start)
-    if not isinstance(stop, Variable):
-        stop = fill_constant([1], dtype, stop)
-    if not isinstance(num, Variable):
-        num = fill_constant([1], 'int32', num)
-
-    out = helper.create_variable_for_type_inference(dtype=start.dtype)
-
-    helper.append_op(
-        type='linspace',
-        inputs={'Start': start,
-                'Stop': stop,
-                'Num': num},
-        outputs={'Out': [out]})
-    return out
-
-
-def zeros_like(x, out=None):
-    """
-    **zeros_like**
-
-    This function creates a zeros tensor which has identical shape and dtype 
-    with `x`.
-
-    Args:
-        x(Variable): The input tensor which specifies shape and dtype.
-        out(Variable): The output tensor.
-
-    Returns:
-        Variable: The tensor variable storing the output.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          x = fluid.layers.data(name='x', dtype='float32', shape=[3], append_batch_size=False)
-          data = fluid.layers.zeros_like(x) # [0.0, 0.0, 0.0]
-
-    """
-
-    helper = LayerHelper("zeros_like", **locals())
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]})
-    out.stop_gradient = True
-    return out
-
-
-def diag(diagonal):
-    """
-    **diag**
-
-    This function creates a square matrix which has diagonal values specified by `diagonal`.
-
-    Args:
-        diagonal(Variable|numpy.ndarray): The input tensor specifying diagonal values, should be of rank 1.
-
-    Returns:
-        Variable: The tensor variable storing the square matrix.
-
-    Examples:
-        .. code-block:: python
-
-          # [[3, 0, 0]
-          #  [0, 4, 0]
-          #  [0, 0, 5] 
-
-          import paddle.fluid as fluid
-          import numpy as np
-          data = fluid.layers.diag(np.arange(3, 6, dtype='int32')) 
-
-    """
-
-    helper = LayerHelper("diag", **locals())
-
-    if not isinstance(diagonal, Variable):
-        diagonal = assign(diagonal)
-
-    out = helper.create_variable_for_type_inference(dtype=diagonal.dtype)
-
-    helper.append_op(
-        type='diag', inputs={'Diagonal': [diagonal]}, outputs={'Out': [out]})
-
-    out.stop_gradient = True
-    return out
-
-
-def eye(num_rows, num_columns=None, batch_shape=None, dtype='float32'):
-    """
-    **eye**
-
-    This function constructs an identity tensor, or a batch of tensor.
-
-    Args:
-        num_rows(int): the number of rows in each batch tensor.
-        num_columns(int): the number of columns in each batch tensor.
-                          If None, default: num_rows.
-        batch_shape(list(int)): If provided, the returned tensor will have a leading
-                                batch size of this shape.
-        dtype(string): 'float32'|'int32'|..., the data type of the returned tensor.
-
-    Returns:
-        Variable: An identity tensor of shape batch_shape + [num_rows, num_columns].
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
- 	  data = fluid.layers.eye(3, dtype='int32')
-	  # [[1, 0, 0]
-          #  [0, 1, 0]
-	  #  [0, 0, 1]]
-    
-          data = fluid.layers.eye(2, 3, dtype='int32')
-	  # [[1, 0, 0]
-          #  [0, 1, 0]]
-    
-	  data = fluid.layers.eye(2, batch_shape=[3])
-          # Construct a batch of 3 identity tensors, each 2 x 2.
-          # data[i, :, :] is a 2 x 2 identity tensor, i = 0, 1, 2.
-
-    """
-
-    helper = LayerHelper("eye", **locals())
-    if not isinstance(num_rows, int) or num_rows < 0:
-        raise TypeError("num_rows should be a non-negative int")
-    if num_columns is not None:
-        if not isinstance(num_columns, int) or num_columns < 0:
-            raise TypeError("num_columns should be a non-negative int")
-    else:
-        num_columns = num_rows
-    out = helper.create_variable_for_type_inference(dtype=dtype)
-    c_dtype = convert_np_dtype_to_dtype_(dtype)
-    helper.append_op(
-        type='eye',
-        inputs={},
-        outputs={'Out': [out]},
-        attrs={
-            'num_rows': num_rows,
-            'num_columns': num_columns,
-            'dtype': c_dtype
-        },
-        stop_gradient=True)
-    out.stop_gradient = True
-
-    if batch_shape is not None:
-        if not isinstance(batch_shape, list):
-            raise TypeError("batch_shape should be a list")
-        from .nn import stack
-        for batch_val in reversed(batch_shape):
-            if batch_val <= 0:
-                raise TypeError("batch_shape should be a positive int list")
-            else:
-                stack_vars = [out for _ in numpy.arange(batch_val)]
-                out = stack(stack_vars, axis=0)
-    return out
-
-
-def ones_like(x, out=None):
-    """
-    **ones_like**
-
-    This function creates a ones tensor which has identical shape and dtype 
-    with `x`.
-
-    Args:
-        x(Variable): The input tensor which specifies shape and dtype.
-        out(Variable): The output tensor.
-
-    Returns:
-        out(Variable): The tensor variable storing the output.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          x = fluid.layers.data(name='x', dtype='float32', shape=[3], append_batch_size=False)
-          data = fluid.layers.ones_like(x) # [1.0, 1.0, 1.0]
-
-    """
-
-    helper = LayerHelper("ones_like", **locals())
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='fill_any_like',
-        inputs={'X': [x]},
-        attrs={'value': 1.0},
-        outputs={'Out': [out]})
-    return out
diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py
deleted file mode 100644
index 5688f04ab2382f5731e69c60225765a2094bba8c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/utils.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-
-
-def convert_to_list(value, n, name, dtype=np.int):
-    """
-    Converts a single numerical type or iterable of numerical
-    types into an numerical type list.
-
-    Arguments:
-      value: The value to validate and convert. Could an int, or any iterable
-        of ints.
-      n: The size of the list to be returned.
-      name: The name of the argument being validated, e.g. "stride" or
-        "filter_size". This is only used to format error messages.
-      dtype: the numerical type of the element of the list to be returned.
-
-    Returns:
-      A list of n dtypes.
-
-    Raises:
-      ValueError: If something else than an int/long or iterable thereof was
-        passed.
-    """
-    if isinstance(value, dtype):
-        return [value, ] * n
-    else:
-        try:
-            value_list = list(value)
-        except TypeError:
-            raise ValueError("The " + name +
-                             "'s type must be list or tuple. Received: " + str(
-                                 value))
-        if len(value_list) != n:
-            raise ValueError("The " + name + "'s length must be " + str(n) +
-                             ". Received: " + str(value))
-        for single_value in value_list:
-            try:
-                dtype(single_value)
-            except (ValueError, TypeError):
-                raise ValueError(
-                    "The " + name + "'s type must be a list or tuple of " + str(
-                        n) + " " + str(dtype) + " . Received: " + str(
-                            value) + " "
-                    "including element " + str(single_value) + " of type" + " "
-                    + str(type(single_value)))
-        return value_list
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
deleted file mode 100644
index 9d5ed2e6d99773f8ebff4cfe0dd3b12d4a070e4e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/lod_tensor.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import core
-from .data_feeder import DataToLoDTensorConverter
-import numpy as np
-
-__all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
-
-
-def create_lod_tensor(data, recursive_seq_lens, place):
-    """
-    Create a lod tensor from a numpy array, a list, or an existing lod tensor.
-
-    Create a lod tensor by doing the following:
-
-    1. Check that the length-based level of detail (LoD) also known as
-       recursive_sequence_lengths of the input is valid.
-
-    2. Convert recursive_sequence_lengths to a offset-based LoD.
-
-    3. Copy the data from a numpy array, a list or a existing lod tensor to
-       CPU or GPU device (based on input place).
-
-    4. Set the level of detail (LoD) using the offset-based LoD.
-
-    Examples:
-
-        Suppose we want LoDTensor to hold data for sequences of word, where each
-        word is represented by an integer. If we want to create a LoDTensor to
-        represent two sentences, one of 2 words, and one of 3 words.
-
-        Then :code:`data` can be a numpy array of integers with shape (5, 1).
-        :code:`recursive_seq_lens` will be [[2, 3]], indicating the length(# of words) in each
-        sentence. This length-based :code:`recursive_seq_lens` [[2, 3]] will be converted to
-        offset-based LoD [[0, 2, 5]] inside the function call.
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy as np
-
-          t = fluid.create_lod_tensor(np.ndarray([5, 30]), [[2, 3]], fluid.CPUPlace())
-
-    Please reference :ref:`api_guide_low_level_lod_tensor` for more details
-    regarding LoD.
-
-    Args:
-        data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
-            list holding the data to be copied.
-        recursive_seq_lens(list): a list of lists indicating the length-based level of detail
-            info specified by the user.
-        place(Place): CPU or GPU place indicating where the data in the new
-            LoDTensor will be stored.
-
-    Returns:
-        A fluid LoDTensor object with tensor data and recursive_seq_lens info.
-    """
-    if isinstance(data, core.LoDTensor):
-        return create_lod_tensor(np.array(data), recursive_seq_lens, place)
-    elif isinstance(data, list):
-        # dtype and shape are not important here,
-        # we only want to reuse code of DataToLoDTensorConverter
-        converter = DataToLoDTensorConverter(
-            place=place,
-            lod_level=len(recursive_seq_lens),
-            shape=[],
-            dtype=core.VarDesc.VarType.FP32)
-
-        new_recursive_seq_lens = []
-        for seq in data:
-            new_recursive_seq_lens.append(len(seq))
-            converter.feed(seq)
-
-        assert [
-            new_recursive_seq_lens
-        ] == recursive_seq_lens, "data and recursive_seq_lens do not match"
-
-        arr = np.array(converter.data)
-
-        # FIXME(zjl): the original logic of create_lod_tensor would append
-        # 1 to the shape. Maybe it is not a right way? Currently, we only
-        # follow the previous logic
-        arr = arr.reshape(arr.shape + (1, ))
-        tensor = core.LoDTensor()
-        tensor.set(arr, place)
-        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
-        return tensor
-    elif isinstance(data, np.ndarray):
-        tensor = core.LoDTensor()
-        tensor.set(data, place)
-        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
-        assert tensor.has_valid_recursive_sequence_lengths(
-        ), "the provided lod info is invalid"
-        return tensor
-    else:
-        raise TypeError(
-            "data should be either a LoDTensor, a Numpy array or a list")
-
-
-def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
-                                high):
-    """
-    Create a LoDTensor containing random integers.
-
-    This function is frequently used in the book examples. So we revised it
-    based on the new create_lod_tensor API and put it here in the lod_tensor
-    module to simplify the code.
-
-    The function does the following:
-
-    1. Calculate the overall shape of the LoDTensor based on the length-based
-       :code:`recursive_seq_lens` input and the shape of the basic element in
-       :code:`base_shape`.
-
-    2. Create a numpy array of this shape.
-
-    3. Create the LoDTensor using create_lod_tensor API.
-
-    Suppose we want LoDTensor to hold data for sequences of word, where each
-    word is represented by an integer. If we want to create a LoDTensor to
-    represent two sentences, one of 2 words, and one of 3 words. Then
-    'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]].
-    Then the overall shape of the LoDTensor would be [5, 1], holding 5 words
-    for two sentences.
-
-    Args:
-        recursive_seq_lens(list): a list of lists indicating the length-based
-            level of detail info specified by the user.
-        base_shape(list): the shape of the basic element to be held by the
-            LoDTensor.
-        place(Place): CPU or GPU place indicating where the data in the new
-            LoDTensor will be stored.
-        low(int): the lower bound of the random integers.
-        high(int): the upper bound of the random integers.
-
-    Returns:
-        A fluid LoDTensor object with tensor data and recursive_seq_lens info.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-
-          t = fluid.create_random_int_lodtensor(recursive_seq_lens=[[2, 3]], 
-                base_shape=[30], place=fluid.CPUPlace(), low=0, high=10)
-    """
-    assert isinstance(base_shape, list), "base_shape should be a list"
-    # append the total number of basic elements to the front of its shape
-    overall_shape = [sum(recursive_seq_lens[-1])] + base_shape
-    # the range of integer data elements is [low, high]
-    data = np.random.random_integers(low, high, overall_shape).astype("int64")
-    return create_lod_tensor(data, recursive_seq_lens, place)
diff --git a/python/paddle/fluid/log_helper.py b/python/paddle/fluid/log_helper.py
deleted file mode 100644
index 0933d7b904808a1d5deae1bb5add831cceb0f50e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/log_helper.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import logging
-
-__all__ = ['get_logger']
-
-
-def get_logger(name, level, fmt=None):
-    """
-    Get logger from logging with given name, level and format without
-    setting logging basicConfig. For setting basicConfig in paddle
-    will disable basicConfig setting after import paddle.
-
-    Args:
-        name (str): The logger name.
-        level (logging.LEVEL): The base level of the logger
-        fmt (str): Format of logger output
-
-    Returns:
-        logging.Logger: logging logger with given setttings
-
-    Examples:
-        .. code-block:: python
-
-            logger = log_helper.get_logger(__name__, logging.INFO,
-                            fmt='%(asctime)s-%(levelname)s: %(message)s')
-    """
-
-    logger = logging.getLogger(name)
-    logger.setLevel(level)
-    handler = logging.StreamHandler()
-
-    if fmt:
-        formatter = logging.Formatter(fmt=fmt)
-        handler.setFormatter(formatter)
-
-    logger.addHandler(handler)
-    return logger
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
deleted file mode 100644
index 6748f32fe5ba0e3b04a24549cf818269495dd0e3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/metrics.py
+++ /dev/null
@@ -1,889 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fluid Metrics
-"""
-
-from __future__ import print_function
-
-import numpy as np
-import copy
-import warnings
-import six
-
-from .layer_helper import LayerHelper
-from .initializer import Constant
-from . import unique_name
-from .framework import Program, Variable, program_guard
-from . import layers
-from .layers import detection
-
-__all__ = [
-    'MetricBase',
-    'CompositeMetric',
-    'Precision',
-    'Recall',
-    'Accuracy',
-    'ChunkEvaluator',
-    'EditDistance',
-    'DetectionMAP',
-    'Auc',
-]
-
-
-def _is_numpy_(var):
-    return isinstance(var, (np.ndarray, np.generic))
-
-
-def _is_number_(var):
-    return isinstance(var, int) or isinstance(var, np.int64) or isinstance(
-        var, float) or (isinstance(var, np.ndarray) and var.shape == (1, ))
-
-
-def _is_number_or_matrix_(var):
-    return _is_number_(var) or isinstance(var, np.ndarray)
-
-
-class MetricBase(object):
-    """
-    Base Class for all Metrics.
-    MetricBase define a group of interfaces for the
-    model evaluation methods. Metrics accumulate metric states between
-    consecutive minibatches, at every minibatch, use update
-    interface to add current minibatch value to global states.
-    Use eval to compute accumative metric value from last reset()
-    or from scratch on.
-    If you need to custom a new metric, please inherit from MetricBase and
-    custom implementation.
-
-    Args:
-        name(str): The name of metric instance. such as, "accuracy".
-                  It needed if you want to distinct different metrics in a model.
-
-    """
-
-    def __init__(self, name):
-        self._name = str(name) if name != None else self.__class__.__name__
-
-    def __str__(self):
-        return self._name
-
-    def reset(self):
-        """
-        reset clear the states of metrics. By default, the states
-        are the members who do not has _ prefix, reset set them to inital states.
-        If you violate the implicit name rule, please also custom the reset
-        interface.
-        """
-        states = {
-            attr: value
-            for attr, value in six.iteritems(self.__dict__)
-            if not attr.startswith("_")
-        }
-        for attr, value in six.iteritems(states):
-            if isinstance(value, int):
-                setattr(self, attr, 0)
-            elif isinstance(value, float):
-                setattr(self, attr, .0)
-            elif isinstance(value, (np.ndarray, np.generic)):
-                setattr(self, attr, np.zeros_like(value))
-            else:
-                setattr(self, attr, None)
-
-    def get_config(self):
-        """
-        Get the metric and current states.
-        The states are the members who do not has "_" prefix.
-
-        Args:
-            None
-
-        Returns:
-            dict: a dict of metric and states
-        """
-        states = {
-            attr: value
-            for attr, value in six.iteritems(self.__dict__)
-            if not attr.startswith("_")
-        }
-        config = {}
-        config.update({"name": self._name, "states": copy.deepcopy(states)})
-        return config
-
-    def update(self, preds, labels):
-        """
-        Updates the metric states at every minibatch.
-        One user can compute the minibatch metric via pure Python, or
-        via a c++ operator.
-
-        Args:
-            preds(numpy.array): the predictions of current minibatch
-            labels(numpy.array): the labels of current minibatch, if the label is one-hot
-                               or soft-label, should custom the corresponding update rule.
-        """
-        raise NotImplementedError(
-            "Should not use it directly, please extend it.")
-
-    def eval(self):
-        """
-        Evalute the current metrics based the accumulated states.
-
-        Returns:
-            float|list(float)|numpy.array: the metrics via Python.
-        """
-        raise NotImplementedError(
-            "Should not use it directly, please extend it.")
-
-
-class CompositeMetric(MetricBase):
-    """
-    Composite multiple metrics in one instance.
-    for example, merge F1, accuracy, recall into one Metric.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            preds = [[0.1], [0.7], [0.8], [0.9], [0.2],
-                     [0.2], [0.3], [0.5], [0.8], [0.6]]
-            labels = [[0], [1], [1], [1], [1],
-                      [0], [0], [0], [0], [0]]
-            preds = np.array(preds)
-            labels = np.array(labels)
-
-            comp = fluid.metrics.CompositeMetric()
-            precision = fluid.metrics.Precision()
-            recall = fluid.metrics.Recall()
-            comp.add_metric(precision)
-            comp.add_metric(recall)
-
-            comp.update(preds=preds, labels=labels)
-            numpy_precision, numpy_recall = comp.eval()
-
-            print("expect precision: %.2f, got %.2f" % ( 3. / 5, numpy_precision ) )
-            print("expect recall: %.2f, got %.2f" % (3. / 4, numpy_recall ) )
-    """
-
-    def __init__(self, name=None):
-        super(CompositeMetric, self).__init__(name)
-        self._metrics = []
-
-    def add_metric(self, metric):
-        """
-        add one metric instance to CompositeMetric.
-
-        Args:
-            metric: a instance of MetricBase.
-        """
-        if not isinstance(metric, MetricBase):
-            raise ValueError("SubMetric should be inherit from MetricBase.")
-        self._metrics.append(metric)
-
-    def update(self, preds, labels):
-        """
-        Update every metrics in sequence.
-
-        Args:
-            preds(numpy.array): the predictions of current minibatch
-            labels(numpy.array): the labels of current minibatch, if the label is one-hot
-                               or soft-label, should custom the corresponding update rule.
-        """
-        for m in self._metrics:
-            m.update(preds, labels)
-
-    def eval(self):
-        """
-        Evaluate every metrics in sequence.
-
-        Returns:
-            list(float|numpy.array): a list of metrics value in Python.
-        """
-        ans = []
-        for m in self._metrics:
-            ans.append(m.eval())
-        return ans
-
-
-class Precision(MetricBase):
-    """
-    Precision (also called positive predictive value) is the fraction of
-    relevant instances among the retrieved instances.
-    https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers
-
-    This class mangages the precision score for binary classification task.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-
-            metric = fluid.metrics.Precision()
-
-            # generate the preds and labels
-
-            preds = [[0.1], [0.7], [0.8], [0.9], [0.2],
-                     [0.2], [0.3], [0.5], [0.8], [0.6]]
-
-            labels = [[0], [1], [1], [1], [1],
-                      [0], [0], [0], [0], [0]]
-
-            preds = np.array(preds)
-            labels = np.array(labels)
-
-            metric.update(preds=preds, labels=labels)
-            numpy_precision = metric.eval()
-
-            print("expct precision: %.2f and got %.2f" % ( 3.0 / 5.0, numpy_precision))
-    """
-
-    def __init__(self, name=None):
-        super(Precision, self).__init__(name)
-        self.tp = 0  # true positive
-        self.fp = 0  # false positive
-
-    def update(self, preds, labels):
-        if not _is_numpy_(preds):
-            raise ValueError("The 'preds' must be a numpy ndarray.")
-        if not _is_numpy_(labels):
-            raise ValueError("The 'labels' must be a numpy ndarray.")
-        sample_num = labels.shape[0]
-        preds = np.rint(preds).astype("int32")
-
-        for i in range(sample_num):
-            pred = preds[i]
-            label = labels[i]
-            if pred == 1:
-                if pred == label:
-                    self.tp += 1
-                else:
-                    self.fp += 1
-
-    def eval(self):
-        ap = self.tp + self.fp
-        return float(self.tp) / ap if ap != 0 else .0
-
-
-class Recall(MetricBase):
-    """
-    Recall (also known as sensitivity) is the fraction of
-    relevant instances that have been retrieved over the
-    total amount of relevant instances
-
-    https://en.wikipedia.org/wiki/Precision_and_recall
-
-    This class mangages the recall score for binary classification task.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-
-            metric = fluid.metrics.Recall()
-
-            # generate the preds and labels
-
-            preds = [[0.1], [0.7], [0.8], [0.9], [0.2],
-                     [0.2], [0.3], [0.5], [0.8], [0.6]]
-
-            labels = [[0], [1], [1], [1], [1],
-                      [0], [0], [0], [0], [0]]
-
-            preds = np.array(preds)
-            labels = np.array(labels)
-
-            metric.update(preds=preds, labels=labels)
-            numpy_precision = metric.eval()
-
-            print("expct precision: %.2f and got %.2f" % ( 3.0 / 4.0, numpy_precision))
-    """
-
-    def __init__(self, name=None):
-        super(Recall, self).__init__(name)
-        self.tp = 0  # true positive
-        self.fn = 0  # false negtive
-
-    def update(self, preds, labels):
-        if not _is_numpy_(preds):
-            raise ValueError("The 'preds' must be a numpy ndarray.")
-        if not _is_numpy_(labels):
-            raise ValueError("The 'labels' must be a numpy ndarray.")
-        sample_num = labels.shape[0]
-        preds = np.rint(preds).astype("int32")
-
-        for i in range(sample_num):
-            pred = preds[i]
-            label = labels[i]
-            if label == 1:
-                if pred == label:
-                    self.tp += 1
-                else:
-                    self.fn += 1
-
-    def eval(self):
-        recall = self.tp + self.fn
-        return float(self.tp) / recall if recall != 0 else .0
-
-
-class Accuracy(MetricBase):
-    """
-    Calculate the mean accuracy over multiple batches.
-    https://en.wikipedia.org/wiki/Accuracy_and_precision
-
-    Args:
-       name: the metrics name
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            #suppose we have batch_size = 128
-            batch_size=128
-            accuracy_manager = fluid.metrics.Accuracy()
-
-            #suppose the accuracy is 0.9 for the 1st batch
-            batch1_acc = 0.9
-            accuracy_manager.update(value = batch1_acc, weight = batch_size)
-            print("expect accuracy: %.2f, get accuracy: %.2f" % (batch1_acc, accuracy_manager.eval()))
-
-            #suppose the accuracy is 0.8 for the 2nd batch
-            batch2_acc = 0.8
-
-            accuracy_manager.update(value = batch2_acc, weight = batch_size)
-            #the joint acc for batch1 and batch2 is (batch1_acc * batch_size + batch2_acc * batch_size) / batch_size / 2
-            print("expect accuracy: %.2f, get accuracy: %.2f" % ((batch1_acc * batch_size + batch2_acc * batch_size) / batch_size / 2, accuracy_manager.eval()))
-
-            #reset the accuracy_manager
-            accuracy_manager.reset()
-            #suppose the accuracy is 0.8 for the 3rd batch
-            batch3_acc = 0.8
-            accuracy_manager.update(value = batch3_acc, weight = batch_size)
-            print("expect accuracy: %.2f, get accuracy: %.2f" % (batch3_acc, accuracy_manager.eval()))
-    """
-
-    def __init__(self, name=None):
-        super(Accuracy, self).__init__(name)
-        self.value = .0
-        self.weight = .0
-
-    def update(self, value, weight):
-        """
-        Update minibatch states.
-
-        Args:
-            value(float|numpy.array): accuracy of one minibatch.
-            weight(int|float): batch size.
-        """
-        if not _is_number_or_matrix_(value):
-            raise ValueError(
-                "The 'value' must be a number(int, float) or a numpy ndarray.")
-        if not _is_number_(weight):
-            raise ValueError("The 'weight' must be a number(int, float).")
-        if _is_number_(weight) and weight < 0:
-            raise ValueError("The 'weight' can not be negative")
-        self.value += value * weight
-        self.weight += weight
-
-    def eval(self):
-        """
-        Return the mean accuracy (float or numpy.array) for all accumulated batches.
-        """
-        if self.weight == 0:
-            raise ValueError("There is no data in Accuracy Metrics. \
-                Please check layers.accuracy output has added to Accuracy.")
-        return self.value / self.weight
-
-
-class ChunkEvaluator(MetricBase):
-    """
-    Accumulate counter numbers output by chunk_eval from mini-batches and
-    compute the precision recall and F1-score using the accumulated counter
-    numbers.
-    For some basics of chunking, please refer to 
-    `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
-    ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection,
-    and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            # init the chunck-level evaluation manager
-            metric = fluid.metrics.ChunkEvaluator()
-
-            # suppose the model predict 10 chuncks, while 8 ones are correct and the ground truth has 9 chuncks.
-            num_infer_chunks = 10
-            num_label_chunks = 9 
-            num_correct_chunks = 8
-
-            metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks)
-            numpy_precision, numpy_recall, numpy_f1 = metric.eval()
-
-            print("precision: %.2f, recall: %.2f, f1: %.2f" % (numpy_precision, numpy_recall, numpy_f1))
-
-            # the next batch, predicting 3 prefectly correct chuncks.
-            num_infer_chunks = 3
-            num_label_chunks = 3
-            num_correct_chunks = 3
-
-            metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks)
-            numpy_precision, numpy_recall, numpy_f1 = metric.eval()
-
-            print("precision: %.2f, recall: %.2f, f1: %.2f" % (numpy_precision, numpy_recall, numpy_f1))
-
-    """
-
-    def __init__(self, name=None):
-        super(ChunkEvaluator, self).__init__(name)
-        self.num_infer_chunks = 0
-        self.num_label_chunks = 0
-        self.num_correct_chunks = 0
-
-    def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
-        """
-        Update the states based on the layers.chunk_eval() ouputs.
-
-        Args:
-            num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch.
-            num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch.
-            num_correct_chunks(int|float|numpy.array): The number of chunks both in Inference and Label on the
-                                                  given mini-batch.
-        """
-        if not _is_number_or_matrix_(num_infer_chunks):
-            raise ValueError(
-                "The 'num_infer_chunks' must be a number(int) or a numpy ndarray."
-            )
-        if not _is_number_or_matrix_(num_label_chunks):
-            raise ValueError(
-                "The 'num_label_chunks' must be a number(int, float) or a numpy ndarray."
-            )
-        if not _is_number_or_matrix_(num_correct_chunks):
-            raise ValueError(
-                "The 'num_correct_chunks' must be a number(int, float) or a numpy ndarray."
-            )
-        self.num_infer_chunks += num_infer_chunks
-        self.num_label_chunks += num_label_chunks
-        self.num_correct_chunks += num_correct_chunks
-
-    def eval(self):
-        precision = float(
-            self.num_correct_chunks
-        ) / self.num_infer_chunks if self.num_infer_chunks else 0
-        recall = float(self.num_correct_chunks
-                       ) / self.num_label_chunks if self.num_label_chunks else 0
-        f1_score = float(2 * precision * recall) / (
-            precision + recall) if self.num_correct_chunks else 0
-        return precision, recall, f1_score
-
-
-class EditDistance(MetricBase):
-    """
-    Edit distance is a way of quantifying how dissimilar two strings
-    (e.g., words) are to each another by counting the minimum number
-    of edit operations (add, remove or replace) required to transform
-    one string into the other.
-    Refer to https://en.wikipedia.org/wiki/Edit_distance
-
-    This EditDistance class takes two inputs by using update function:
-    1. distances: a (batch_size, 1) numpy.array, each element represents the
-    edit distance between two sequences.
-    2. seq_num: a int|float value, standing for the number of sequence pairs.
-
-    and returns the overall edit distance of multiple sequence-pairs.
-
-    Args:
-        name: the metrics name
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-
-            # suppose that batch_size is 128
-            batch_size = 128
-
-            # init the edit distance manager
-            distance_evaluator = fluid.metrics.EditDistance("EditDistance")
-
-            # generate the edit distance across 128 sequence pairs, the max distance is 10 here
-            edit_distances_batch0 = np.random.randint(low = 0, high = 10, size = (batch_size, 1))
-            seq_num_batch0 = batch_size
-
-            distance_evaluator.update(edit_distances_batch0, seq_num_batch0)
-            avg_distance, wrong_instance_ratio = distance_evaluator.eval()
-            print("the average edit distance for batch0 is %.2f and the wrong instance ratio is %.2f " % (avg_distance, wrong_instance_ratio))
-
-            edit_distances_batch1 = np.random.randint(low = 0, high = 10, size = (batch_size, 1))
-            seq_num_batch1 = batch_size
-
-            distance_evaluator.update(edit_distances_batch1, seq_num_batch1)
-            avg_distance, wrong_instance_ratio = distance_evaluator.eval()
-            print("the average edit distance for batch0 and batch1 is %.2f and the wrong instance ratio is %.2f " % (avg_distance, wrong_instance_ratio))
-
-            distance_evaluator.reset()
-
-            edit_distances_batch2 = np.random.randint(low = 0, high = 10, size = (batch_size, 1))
-            seq_num_batch2 = batch_size
-
-            distance_evaluator.update(edit_distances_batch2, seq_num_batch2)
-            avg_distance, wrong_instance_ratio = distance_evaluator.eval()
-            print("the average edit distance for batch2 is %.2f and the wrong instance ratio is %.2f " % (avg_distance, wrong_instance_ratio))
-
-    """
-
-    def __init__(self, name):
-        super(EditDistance, self).__init__(name)
-        self.total_distance = .0
-        self.seq_num = 0
-        self.instance_error = 0
-
-    def update(self, distances, seq_num):
-        """
-        Update the overall edit distance
-
-        Args:
-            distances: a (batch_size, 1) numpy.array, each element represents the 
-            edit distance between two sequences.
-            seq_num: a int|float value, standing for the number of sequence pairs.
-
-        """
-        if not _is_numpy_(distances):
-            raise ValueError("The 'distances' must be a numpy ndarray.")
-        if not _is_number_(seq_num):
-            raise ValueError("The 'seq_num' must be a number(int, float).")
-        seq_right_count = np.sum(distances == 0)
-        total_distance = np.sum(distances)
-        self.seq_num += seq_num
-        self.instance_error += seq_num - seq_right_count
-        self.total_distance += total_distance
-
-    def eval(self):
-        """
-        Return two floats:
-        avg_distance: the average distance for all sequence pairs updated using the update function.
-        avg_instance_error: the ratio of sequence pairs whose edit distance is not zero.
-        """
-        if self.seq_num == 0:
-            raise ValueError(
-                "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
-            )
-        avg_distance = self.total_distance / self.seq_num
-        avg_instance_error = self.instance_error / float(self.seq_num)
-        return avg_distance, avg_instance_error
-
-
-class Auc(MetricBase):
-    """
-    The auc metric is for binary classification.
-    Refer to https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
-    Please notice that the auc metric is implemented with python, which may be a little bit slow.
-    If you concern the speed, please use the fluid.layers.auc instead.
-
-    The `auc` function creates four local variables, `true_positives`,
-    `true_negatives`, `false_positives` and `false_negatives` that are used to
-    compute the AUC. To discretize the AUC curve, a linearly spaced set of
-    thresholds is used to compute pairs of recall and precision values. The area
-    under the ROC-curve is therefore computed using the height of the recall
-    values by the false positive rate, while the area under the PR-curve is the
-    computed using the height of the precision values by the recall.
-
-    Args:
-        name: metric name
-        curve: Specifies the name of the curve to be computed, 'ROC' [default] or
-          'PR' for the Precision-Recall-curve.
-
-    "NOTE: only implement the ROC curve type via Python now."
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            # init the auc metric
-            auc_metric = fluid.metrics.Auc("ROC")
-
-            # suppose that batch_size is 128
-            batch_num = 100
-            batch_size = 128
-
-            for batch_id in range(batch_num):
-
-                class0_preds = np.random.random(size = (batch_size, 1))
-                class1_preds = 1 - class0_preds
-
-                preds = np.concatenate((class0_preds, class1_preds), axis=1)
-
-                labels = np.random.randint(2, size = (batch_size, 1))
-                auc_metric.update(preds = preds, labels = labels)
-
-                # shall be some score closing to 0.5 as the preds are randomly assigned
-                print("auc for iteration %d is %.2f" % (batch_id, auc_metric.eval()))
-    """
-
-    def __init__(self, name, curve='ROC', num_thresholds=4095):
-        super(Auc, self).__init__(name=name)
-        self._curve = curve
-        self._num_thresholds = num_thresholds
-
-        _num_pred_buckets = num_thresholds + 1
-        self._stat_pos = [0] * _num_pred_buckets
-        self._stat_neg = [0] * _num_pred_buckets
-
-    def update(self, preds, labels):
-        """
-        Update the auc curve with the given predictions and labels
-
-        Args:
-             preds: an numpy array in the shape of (batch_size, 2), preds[i][j] denotes the probability
-             of classifying the instance i into the class j.
-             labels: an numpy array in the shape of (batch_size, 1), labels[i] is either o or 1, representing
-             the label of the instance i.
-        """
-        if not _is_numpy_(labels):
-            raise ValueError("The 'labels' must be a numpy ndarray.")
-        if not _is_numpy_(preds):
-            raise ValueError("The 'predictions' must be a numpy ndarray.")
-
-        for i, lbl in enumerate(labels):
-            value = preds[i, 1]
-            bin_idx = int(value * self._num_thresholds)
-            assert bin_idx <= self._num_thresholds
-            if lbl:
-                self._stat_pos[bin_idx] += 1.0
-            else:
-                self._stat_neg[bin_idx] += 1.0
-
-    @staticmethod
-    def trapezoid_area(x1, x2, y1, y2):
-        return abs(x1 - x2) * (y1 + y2) / 2.0
-
-    def eval(self):
-        """
-        Return the area (a float score) under auc curve
-        """
-        tot_pos = 0.0
-        tot_neg = 0.0
-        auc = 0.0
-
-        idx = self._num_thresholds
-        while idx >= 0:
-            tot_pos_prev = tot_pos
-            tot_neg_prev = tot_neg
-            tot_pos += self._stat_pos[idx]
-            tot_neg += self._stat_neg[idx]
-            auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos,
-                                       tot_pos_prev)
-            idx -= 1
-
-        return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
-
-
-class DetectionMAP(object):
-    """
-    Calculate the detection mean average precision (mAP).
-
-    The general steps are as follows:
-
-    1. calculate the true positive and false positive according to the input
-       of detection and labels.
-    2. calculate mAP value, support two versions: '11 point' and 'integral'.
-
-    Please get more information from the following articles:
-
-      https://sanchom.wordpress.com/tag/average-precision/
-
-      https://arxiv.org/abs/1512.02325
-
-    Args:
-        input (Variable): The detection results, which is a LoDTensor with shape
-            [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
-        gt_label (Variable): The ground truth label index, which is a LoDTensor
-            with shape [N, 1].
-        gt_box (Variable): The ground truth bounding box (bbox), which is a
-            LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
-        gt_difficult (Variable|None): Whether this ground truth is a difficult
-            bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
-            it means all the ground truth labels are not difficult bbox.
-        class_num (int): The class number.
-        background_label (int): The index of background label, the background
-            label will be ignored. If set to -1, then all categories will be
-            considered, 0 by default.
-        overlap_threshold (float): The threshold for deciding true/false
-            positive, 0.5 by default.
-        evaluate_difficult (bool): Whether to consider difficult ground truth
-            for evaluation, True by default. This argument does not work when
-            gt_difficult is None.
-        ap_version (string): The average precision calculation ways, it must be
-            'integral' or '11point'. Please check
-            https://sanchom.wordpress.com/tag/average-precision/ for details.
-            - 11point: the 11-point interpolated average precision.
-            - integral: the natural integral of the precision-recall curve.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-
-            batch_size = -1 # can be any size
-            image_boxs_num = 10
-            bounding_bboxes_num = 21
-
-            pb = layers.data(name='prior_box', shape=[image_boxs_num, 4],
-                append_batch_size=False, dtype='float32')
-
-            pbv = layers.data(name='prior_box_var', shape=[image_boxs_num, 4],
-                append_batch_size=False, dtype='float32')
-
-            loc = layers.data(name='target_box', shape=[batch_size, bounding_bboxes_num, 4],
-                append_batch_size=False, dtype='float32')
-
-            scores = layers.data(name='scores', shape=[batch_size, bounding_bboxes_num, image_boxs_num],
-                append_batch_size=False, dtype='float32')
-
-            nmsed_outs = fluid.layers.detection_output(scores=scores,
-                loc=loc, prior_box=pb, prior_box_var=pbv)
-
-            gt_box = fluid.layers.data(name="gt_box", shape=[batch_size, 4], dtype="float32")
-            gt_label = fluid.layers.data(name="gt_label", shape=[batch_size, 1], dtype="float32")
-            difficult = fluid.layers.data(name="difficult", shape=[batch_size, 1], dtype="float32")
-
-            exe = fluid.Executor(fluid.CUDAPlace(0))
-            map_evaluator = fluid.metrics.DetectionMAP(nmsed_outs, gt_label, gt_box, difficult, class_num = 3)
-
-            cur_map, accum_map = map_evaluator.get_map_var()
-
- 
-    """
-
-    def __init__(self,
-                 input,
-                 gt_label,
-                 gt_box,
-                 gt_difficult=None,
-                 class_num=None,
-                 background_label=0,
-                 overlap_threshold=0.5,
-                 evaluate_difficult=True,
-                 ap_version='integral'):
-
-        self.helper = LayerHelper('map_eval')
-        gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
-        if gt_difficult:
-            gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
-            label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
-        else:
-            label = layers.concat([gt_label, gt_box], axis=1)
-
-        # calculate mean average precision (mAP) of current mini-batch
-        map = detection.detection_map(
-            input,
-            label,
-            class_num,
-            background_label,
-            overlap_threshold=overlap_threshold,
-            evaluate_difficult=evaluate_difficult,
-            ap_version=ap_version)
-
-        states = []
-        states.append(
-            self._create_state(
-                dtype='int32', shape=None, suffix='accum_pos_count'))
-        states.append(
-            self._create_state(
-                dtype='float32', shape=None, suffix='accum_true_pos'))
-        states.append(
-            self._create_state(
-                dtype='float32', shape=None, suffix='accum_false_pos'))
-        var = self._create_state(dtype='int32', shape=[1], suffix='has_state')
-        self.helper.set_variable_initializer(
-            var, initializer=Constant(value=int(0)))
-        self.has_state = var
-
-        # calculate accumulative mAP
-        accum_map = detection.detection_map(
-            input,
-            label,
-            class_num,
-            background_label,
-            overlap_threshold=overlap_threshold,
-            evaluate_difficult=evaluate_difficult,
-            has_state=self.has_state,
-            input_states=states,
-            out_states=states,
-            ap_version=ap_version)
-
-        layers.fill_constant(
-            shape=self.has_state.shape,
-            value=1,
-            dtype=self.has_state.dtype,
-            out=self.has_state)
-
-        self.cur_map = map
-        self.accum_map = accum_map
-
-    def _create_state(self, suffix, dtype, shape):
-        """
-        Create state variable.
-        Args:
-            suffix(str): the state suffix.
-            dtype(str|core.VarDesc.VarType): the state data type
-            shape(tuple|list): the shape of state
-        Returns: State variable
-        """
-        state = self.helper.create_variable(
-            name="_".join([unique_name.generate(self.helper.name), suffix]),
-            persistable=True,
-            dtype=dtype,
-            shape=shape)
-        return state
-
-    def get_map_var(self):
-        """
-        Returns: mAP variable of current mini-batch and
-            accumulative mAP variable cross mini-batches.
-        """
-        return self.cur_map, self.accum_map
-
-    def reset(self, executor, reset_program=None):
-        """
-        Reset metric states at the begin of each pass/user specified batch.
-
-        Args:
-            executor(Executor): a executor for executing
-                the reset_program.
-            reset_program(Program|None): a single Program for reset process.
-                If None, will create a Program.
-        """
-
-        def _clone_var_(block, var):
-            assert isinstance(var, Variable)
-            return block.create_var(
-                name=var.name,
-                shape=var.shape,
-                dtype=var.dtype,
-                type=var.type,
-                lod_level=var.lod_level,
-                persistable=var.persistable)
-
-        if reset_program is None:
-            reset_program = Program()
-        with program_guard(main_program=reset_program):
-            var = _clone_var_(reset_program.current_block(), self.has_state)
-            layers.fill_constant(
-                shape=var.shape, value=0, dtype=var.dtype, out=var)
-        executor.run(reset_program)
diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py
deleted file mode 100644
index f991310384f769ce091197b16db953e7af94a3c3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/net_drawer.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import argparse
-import json
-import logging
-from collections import defaultdict
-
-import paddle.fluid.core as core
-import paddle.fluid.proto.framework_pb2 as framework_pb2
-from paddle.fluid.log_helper import get_logger
-
-logger = get_logger(__name__, logging.INFO)
-
-try:
-    from .graphviz import Graph
-except ImportError:
-    logger.info(
-        'Cannot import graphviz, which is required for drawing a network. This '
-        'can usually be installed in python with "pip install graphviz". Also, '
-        'pydot requires graphviz to convert dot files to pdf: in ubuntu, this '
-        'can usually be installed with "sudo apt-get install graphviz".')
-    print('net_drawer will not run correctly. Please install the correct '
-          'dependencies.')
-    exit(0)
-
-OP_STYLE = {
-    'shape': 'oval',
-    'color': '#0F9D58',
-    'style': 'filled',
-    'fontcolor': '#FFFFFF'
-}
-
-VAR_STYLE = {}
-
-GRAPH_STYLE = {"rankdir": "TB", }
-
-GRAPH_ID = 0
-
-
-def unique_id():
-    def generator():
-        GRAPH_ID += 1
-        return GRAPH_ID
-
-    return generator
-
-
-def draw_node(op):
-    node = OP_STYLE
-    node["name"] = op.type
-    node["label"] = op.type
-    return node
-
-
-def draw_edge(var_parent, op, var, arg):
-    edge = VAR_STYLE
-    edge["label"] = "%s(%s)" % (var.parameter, arg)
-    edge["head_name"] = op.type
-    edge["tail_name"] = var_parent[arg]
-    return edge
-
-
-def parse_graph(program, graph, var_dict, **kwargs):
-
-    # fill the known variables
-    for block in program.blocks:
-        for var in block.vars:
-            if var not in var_dict:
-                var_dict[var] = "Feed"
-
-    temp_id = 0
-    proto = framework_pb2.ProgramDesc.FromString(
-        program.desc.serialize_to_string())
-    for block in proto.blocks:
-        for op in block.ops:
-            op.type = op.type + "_" + str(temp_id)
-            temp_id += 1
-            graph.node(**draw_node(op))
-            for o in op.outputs:
-                for arg in o.arguments:
-                    var_dict[arg] = op.type
-            for e in op.inputs:
-                for arg in e.arguments:
-                    if arg in var_dict:
-                        graph.edge(**draw_edge(var_dict, op, e, arg))
-        break  # only plot the first block
-
-
-def draw_graph(startup_program, main_program, **kwargs):
-    if "graph_attr" in kwargs:
-        GRAPH_STYLE.update(kwargs[graph_attr])
-    if "node_attr" in kwargs:
-        OP_STYLE.update(kwargs[node_attr])
-    if "edge_attr" in kwargs:
-        VAR_STYLE.update(kwargs[edge_attr])
-
-    graph_id = unique_id()
-    filename = kwargs.get("filename")
-    if filename == None:
-        filename = str(graph_id) + ".gv"
-    g = Graph(
-        name=str(graph_id),
-        filename=filename,
-        graph_attr=GRAPH_STYLE,
-        node_attr=OP_STYLE,
-        edge_attr=VAR_STYLE,
-        **kwargs)
-
-    var_dict = {}
-    parse_graph(startup_program, g, var_dict)
-    parse_graph(main_program, g, var_dict)
-
-    if filename != None:
-        g.save()
-    return g
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
deleted file mode 100644
index e340f03161ef6d90cdce6549e56269d591c8a9b7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/nets.py
+++ /dev/null
@@ -1,533 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import six
-from . import layers
-
-__all__ = [
-    "simple_img_conv_pool",
-    "sequence_conv_pool",
-    "glu",
-    "scaled_dot_product_attention",
-    "img_conv_group",
-]
-
-
-def simple_img_conv_pool(input,
-                         num_filters,
-                         filter_size,
-                         pool_size,
-                         pool_stride,
-                         pool_padding=0,
-                         pool_type='max',
-                         global_pooling=False,
-                         conv_stride=1,
-                         conv_padding=0,
-                         conv_dilation=1,
-                         conv_groups=1,
-                         param_attr=None,
-                         bias_attr=None,
-                         act=None,
-                         use_cudnn=True):
-    """
-    The simple_img_conv_pool is composed with one Convolution2d and one Pool2d.
-
-    Args:
-        input (Variable): The input image with [N, C, H, W] format.
-        num_filters(int): The number of filter. It is as same as the output
-            feature channel.
-        filter_size (int|list|tuple): The filter size. If filter_size is a list or
-            tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise,
-            the filter_size_H = filter_size_W = filter_size.
-        pool_size (int|list|tuple): The pooling size of Pool2d layer. If pool_size
-            is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
-            Otherwise, the pool_size_H = pool_size_W = pool_size.
-        pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
-            is a list or tuple, it must contain two integers, (pooling_stride_H, pooling_stride_W).
-            Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
-        pool_padding (int|list|tuple): The padding of Pool2d layer. If pool_padding is a list or
-            tuple, it must contain two integers, (pool_padding_H, pool_padding_W).
-            Otherwise, the pool_padding_H = pool_padding_W = pool_padding. Default 0.
-        pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
-            average-pooling. Default :math:`max`.
-        global_pooling (bool): Whether to use the global pooling. If global_pooling = true,
-            pool_size and pool_padding while be ignored. Default False
-        conv_stride (int|list|tuple): The stride size of the conv2d Layer. If stride is a
-            list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise,
-            the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1.
-        conv_padding (int|list|tuple): The padding size of the conv2d Layer. If padding is
-            a list or  tuple, it must contain two integers, (conv_padding_H, conv_padding_W).
-            Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0.
-        conv_dilation (int|list|tuple): The dilation size of the conv2d Layer. If dilation is
-            a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W).
-            Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1.
-        conv_groups (int): The groups number of the conv2d Layer. According to grouped
-            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`.
-            Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        act (str): Activation type for conv2d, if it is set to None, activation is not
-            appended. Default: None.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-
-    Return:
-        Variable: The result of input after Convolution2d and Pool2d.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-            conv_pool = fluid.nets.simple_img_conv_pool(input=img,
-                                                        filter_size=5,
-                                                        num_filters=20,
-                                                        pool_size=2,
-                                                        pool_stride=2,
-                                                        act="relu")
-    """
-    conv_out = layers.conv2d(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        stride=conv_stride,
-        padding=conv_padding,
-        dilation=conv_dilation,
-        groups=conv_groups,
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        act=act,
-        use_cudnn=use_cudnn)
-
-    pool_out = layers.pool2d(
-        input=conv_out,
-        pool_size=pool_size,
-        pool_type=pool_type,
-        pool_stride=pool_stride,
-        pool_padding=pool_padding,
-        global_pooling=global_pooling,
-        use_cudnn=use_cudnn)
-    return pool_out
-
-
-def img_conv_group(input,
-                   conv_num_filter,
-                   pool_size,
-                   conv_padding=1,
-                   conv_filter_size=3,
-                   conv_act=None,
-                   param_attr=None,
-                   conv_with_batchnorm=False,
-                   conv_batchnorm_drop_rate=0.0,
-                   pool_stride=1,
-                   pool_type="max",
-                   use_cudnn=True):
-    """
-    The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut,
-    and Pool2d. According to the input arguments, img_conv_group will do serials of
-    computation for Input using Convolution2d, BatchNorm, DropOut, and pass the last
-    result to Pool2d.
-
-    Args:
-        input (Variable): The input image with [N, C, H, W] format.
-        conv_num_filter(list|tuple): Indicates the numbers of filter of this group.
-        pool_size (int|list|tuple): The pooling size of Pool2d Layer. If pool_size
-            is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
-            Otherwise, the pool_size_H = pool_size_W = pool_size.
-        conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
-            a list or tuple, its length must be equal to the length of conv_num_filter.
-            Otherwise the conv_padding of all Conv2d Layers are the same. Default 1.
-        conv_filter_size (int|list|tuple): The filter size. If filter_size is a list or
-            tuple, its length must be equal to the length of conv_num_filter.
-            Otherwise the conv_filter_size of all Conv2d Layers are the same. Default 3.
-        conv_act (str): Activation type for Conv2d Layer that is not followed by BatchNorm.
-            Default: None.
-        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
-        conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2d Layer.
-            If conv_with_batchnorm is a list, its length must be equal to the length of
-            conv_num_filter. Otherwise, conv_with_batchnorm indicates whether all the
-            Conv2d Layer follows a BatchNorm. Default False.
-        conv_batchnorm_drop_rate (float|list): Indicates the drop_rate of Dropout Layer
-            after BatchNorm. If conv_batchnorm_drop_rate is a list, its length must be
-            equal to the length of conv_num_filter. Otherwise, drop_rate of all Dropout
-            Layers is conv_batchnorm_drop_rate. Default 0.0.
-        pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
-            is a list or tuple, it must contain two integers, (pooling_stride_H,
-            pooling_stride_W). Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
-            Default 1.
-        pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
-            average-pooling. Default :math:`max`.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-
-    Return:
-        Variable: The final result after serial computation using Convolution2d,
-            BatchNorm, DropOut, and Pool2d.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-            conv_pool = fluid.nets.img_conv_group(input=img,
-                                                  conv_padding=1,
-                                                  conv_num_filter=[3, 3],
-                                                  conv_filter_size=3,
-                                                  conv_act="relu",
-                                                  pool_size=2,
-                                                  pool_stride=2)
-    """
-    tmp = input
-    assert isinstance(conv_num_filter, list) or \
-        isinstance(conv_num_filter, tuple)
-
-    def __extend_list__(obj):
-        if not hasattr(obj, '__len__'):
-            return [obj] * len(conv_num_filter)
-        else:
-            assert len(obj) == len(conv_num_filter)
-            return obj
-
-    conv_padding = __extend_list__(conv_padding)
-    conv_filter_size = __extend_list__(conv_filter_size)
-    param_attr = __extend_list__(param_attr)
-    conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
-    conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
-
-    for i in six.moves.range(len(conv_num_filter)):
-        local_conv_act = conv_act
-        if conv_with_batchnorm[i]:
-            local_conv_act = None
-
-        tmp = layers.conv2d(
-            input=tmp,
-            num_filters=conv_num_filter[i],
-            filter_size=conv_filter_size[i],
-            padding=conv_padding[i],
-            param_attr=param_attr[i],
-            act=local_conv_act,
-            use_cudnn=use_cudnn)
-
-        if conv_with_batchnorm[i]:
-            tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
-            drop_rate = conv_batchnorm_drop_rate[i]
-            if abs(drop_rate) > 1e-5:
-                tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
-
-    pool_out = layers.pool2d(
-        input=tmp,
-        pool_size=pool_size,
-        pool_type=pool_type,
-        pool_stride=pool_stride,
-        use_cudnn=use_cudnn)
-    return pool_out
-
-
-def sequence_conv_pool(input,
-                       num_filters,
-                       filter_size,
-                       param_attr=None,
-                       act="sigmoid",
-                       pool_type="max",
-                       bias_attr=None):
-    """
-    The sequence_conv_pool is composed with Sequence Convolution and Pooling.
-
-    Args:
-        input (Variable): The input of sequence_conv, which supports variable-time
-            length input sequence. The underlying of input is a matrix with shape
-            (T, N), where T is the total time steps in this mini-batch and N is
-            the input_hidden_size
-        num_filters(int): The number of filter.
-        filter_size (int): The filter size.
-        param_attr (ParamAttr): The parameters to the Sequence_conv Layer. Default: None.
-        act (str): Activation type for Sequence_conv Layer. Default: "sigmoid".
-        pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
-            average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
-            Default :math:`max`.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, sequence_conv
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-
-    Return:
-        Variable: The final result after Sequence Convolution and Pooling.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input_dim = 100 #len(word_dict)
-            emb_dim = 128
-            hid_dim = 512
-            data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
-            emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True)
-            seq_conv = fluid.nets.sequence_conv_pool(input=emb,
-                                                     num_filters=hid_dim,
-                                                     filter_size=3,
-                                                     act="tanh",
-                                                     pool_type="sqrt")
-    """
-    conv_out = layers.sequence_conv(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        act=act)
-
-    pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)
-    return pool_out
-
-
-def glu(input, dim=-1):
-    """
-    The Gated Linear Units(GLU) composed by split, sigmoid activation and element-wise
-    multiplication. Specifically, Split the input into two equal sized parts,
-    :math:`a` and :math:`b`, along the given dimension and then compute as
-    following:
-
-        .. math::
-
-            {GLU}(a, b)= a \otimes \sigma(b)
-
-    Refer to `Language Modeling with Gated Convolutional Networks
-    <https://arxiv.org/pdf/1612.08083.pdf>`_.
-
-    Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the
-            dimension to split along is :math:`rank(input) + dim`. Default -1.
-
-    Returns:
-        Variable: Variable with half the size of input.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(
-                name="words", shape=[-1, 6, 3, 9], dtype="float32")
-            # shape of output: [-1, 3, 3, 9]
-            output = fluid.nets.glu(input=data, dim=1)
-    """
-
-    a, b = layers.split(input, num_or_sections=2, dim=dim)
-    act_b = layers.sigmoid(x=b)
-    out = layers.elementwise_mul(x=a, y=act_b)
-    return out
-
-
-def scaled_dot_product_attention(queries,
-                                 keys,
-                                 values,
-                                 num_heads=1,
-                                 dropout_rate=0.):
-    """
-    The dot-product attention.
-
-    Attention mechanism can be seen as mapping a query and a set of key-value
-    pairs to an output. The output is computed as a weighted sum of the values,
-    where the weight assigned to each value is computed by a compatibility
-    function (dot-product here) of the query with the corresponding key.
-
-    The dot-product attention can be implemented through (batch) matrix
-    multipication as follows:
-
-        .. math::
-
-            Attention(Q, K, V)= softmax(QK^\mathrm{T})V
-
-    Refer to `Attention Is All You Need
-    <https://arxiv.org/pdf/1706.03762.pdf>`_.
-
-    Args:
-        queries (Variable): The input variable which should be a 3-D Tensor.
-        keys (Variable): The input variable which should be a 3-D Tensor.
-        values (Variable): The input variable which should be a 3-D Tensor.
-        num_heads (int): Head number to compute the scaled dot product
-            attention. Default: 1.
-        dropout_rate (float): The dropout rate to drop the attention weight.
-            Default: 0.0.
-
-    Returns:
-        Variable: A 3-D Tensor computed by multi-head scaled dot product\
-            attention.
-
-    Raises:
-        ValueError: If input queries, keys, values are not 3-D Tensors.
-
-    NOTES:
-        1. When num_heads > 1, three linear projections are learned respectively
-           to map input queries, keys and values into queries', keys' and values'.
-           queries', keys' and values' have the same shapes with queries, keys
-           and values.
-        2. When num_heads == 1, scaled_dot_product_attention has no learnable
-           parameters.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            queries = fluid.layers.data(name="queries",
-                                        shape=[3, 5, 9],
-                                        dtype="float32",
-                                        append_batch_size=False)
-            queries.stop_gradient = False
-            keys = fluid.layers.data(name="keys",
-                                     shape=[3, 6, 9],
-                                     dtype="float32",
-                                     append_batch_size=False)
-            keys.stop_gradient = False
-            values = fluid.layers.data(name="values",
-                                       shape=[3, 6, 10],
-                                       dtype="float32",
-                                       append_batch_size=False)
-            values.stop_gradient = False
-            contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values)
-            contexts.shape  # [3, 5, 10]
-    """
-    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
-        raise ValueError(
-            "Inputs quries, keys and values should all be 3-D tensors.")
-
-    if queries.shape[-1] != keys.shape[-1]:
-        raise ValueError(
-            "The hidden size of queries and keys should be the same.")
-    if keys.shape[-2] != values.shape[-2]:
-        raise ValueError(
-            "The max sequence length in query batch and in key batch "
-            "should be the same.")
-    if keys.shape[-1] % num_heads != 0:
-        raise ValueError("The hidden size of keys (%d) must be divisible "
-                         "by the number of attention heads (%d)." %
-                         (keys.shape[-1], num_heads))
-    if values.shape[-1] % num_heads != 0:
-        raise ValueError("The hidden size of values (%d) must be divisible "
-                         "by the number of attention heads (%d)." %
-                         (values.shape[-1], num_heads))
-
-    def __compute_qkv(queries, keys, values, num_heads):
-        """
-        Add linear projection to queries, keys, and values.
-
-        Args:
-            queries(Tensor): a 3-D input Tensor.
-            keys(Tensor): a 3-D input Tensor.
-            values(Tensor): a 3-D input Tensor.
-            num_heads(int): The number of heads. Linearly project the inputs
-                            ONLY when num_heads > 1.
-
-        Returns:
-            Tensor: linearly projected output Tensors: queries', keys' and
-                    values'. They have the same shapes with queries, keys and
-                    values.
-        """
-
-        if num_heads == 1:
-            return queries, keys, values
-
-        q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
-        k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
-        v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
-        return q, k, v
-
-    def __split_heads(x, num_heads):
-        """
-        Reshape the last dimension of inpunt tensor x so that it becomes two
-        dimensions.
-
-        Args:
-            x(Tensor): a 3-D input Tensor.
-            num_heads(int): The number of heads.
-
-        Returns:
-            Tensor: a Tensor with shape [..., n, m/num_heads], where m is size
-                    of the last dimension of x.
-        """
-        if num_heads == 1:
-            return x
-
-        hidden_size = x.shape[-1]
-        # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
-        # into a 4-D output:
-        # [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
-        reshaped = layers.reshape(
-            x=x,
-            shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])
-
-        # permuate the dimensions into:
-        # [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
-        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
-
-    def __combine_heads(x):
-        """
-        Reshape the last two dimensions of inpunt tensor x so that it becomes
-        one dimension.
-
-        Args:
-            x(Tensor): a 4-D input Tensor with shape
-                       [bs, num_heads, max_sequence_length, hidden_dim].
-
-        Returns:
-            Tensor: a Tensor with shape
-                    [bs, max_sequence_length, num_heads * hidden_dim].
-        """
-
-        if len(x.shape) == 3: return x
-        if len(x.shape) != 4:
-            raise ValueError("Input(x) should be a 4-D Tensor.")
-
-        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
-        return layers.reshape(
-            x=trans_x,
-            shape=list(
-                map(int, [
-                    trans_x.shape[0], trans_x.shape[1], trans_x.shape[2] *
-                    trans_x.shape[3]
-                ])))
-
-    q, k, v = __compute_qkv(queries, keys, values, num_heads)
-
-    q = __split_heads(q, num_heads)
-    k = __split_heads(k, num_heads)
-    v = __split_heads(v, num_heads)
-
-    key_dim_per_head = keys.shape[-1] // num_heads
-    scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
-    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
-
-    weights = layers.reshape(
-        x=layers.reshape(
-            x=product, shape=[-1, product.shape[-1]], act="softmax"),
-        shape=product.shape)
-    if dropout_rate:
-        weights = layers.dropout(
-            weights, dropout_prob=dropout_rate, is_test=False)
-    ctx_multiheads = layers.matmul(weights, v)
-    return __combine_heads(ctx_multiheads)
diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py
deleted file mode 100644
index b8bb3db1eedcf25c9b6a02ad3b4f261e8be8efce..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/op.py
+++ /dev/null
@@ -1,292 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import six
-
-import paddle.fluid.core as core
-import paddle.fluid.proto.framework_pb2 as framework_pb2
-
-
-def get_all_op_protos():
-    """
-    Get all registered op proto from PaddlePaddle C++ end.
-    :return: A list of registered OpProto.
-    """
-    protostrs = core.get_all_op_protos()
-    ret_values = []
-    for pbstr in protostrs:
-        op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr))
-        ret_values.append(op_proto)
-    return ret_values
-
-
-def is_str(s):
-    return isinstance(s, six.string_types)
-
-
-class OpDescCreationMethod(object):
-    """
-    Convert the user's input(only keyword arguments are supported) to OpDesc
-    based on the OpProto.
-
-    :param op_proto: The OpProto object.
-    :type op_proto: op_proto_pb2.OpProto
-    """
-
-    def __init__(self, op_proto):
-        if not isinstance(op_proto, framework_pb2.OpProto):
-            raise TypeError(
-                "Type of op_proto should be OpProto in PaddlePaddle.")
-        self.__op_proto__ = op_proto
-
-    def __call__(self, *args, **kwargs):
-        """
-        Convert user's input to OpDesc. Only keyword arguments are supported.
-        :return: The OpDesc based on user input.
-        :rtype: op_desc_pb2.OpDesc
-        """
-        if len(args) != 0:
-            raise ValueError("Only keyword arguments are supported.")
-        op_desc = framework_pb2.OpDesc()
-        for input_parameter in self.__op_proto__.inputs:
-            input_arguments = kwargs.get(input_parameter.name, [])
-            if is_str(input_arguments):
-                input_arguments = [input_arguments]
-
-            if not input_parameter.duplicable and len(input_arguments) > 1:
-                raise ValueError(
-                    "Input %s expects only one input, but %d are given." %
-                    (input_parameter.name, len(input_arguments)))
-
-            ipt = op_desc.inputs.add()
-            ipt.parameter = input_parameter.name
-            ipt.arguments.extend(input_arguments)
-
-        for output_parameter in self.__op_proto__.outputs:
-            output_arguments = kwargs.get(output_parameter.name, [])
-            if is_str(output_arguments):
-                output_arguments = [output_arguments]
-
-            if not output_parameter.duplicable and len(output_arguments) > 1:
-                raise ValueError(
-                    "Output %s expects only one output, but %d are given." %
-                    (output_parameter.name, len(output_arguments)))
-
-            out = op_desc.outputs.add()
-            out.parameter = output_parameter.name
-            out.arguments.extend(output_arguments)
-
-        # Types
-        op_desc.type = self.__op_proto__.type
-
-        # Attrs
-        for attr in self.__op_proto__.attrs:
-            if attr.generated:
-                continue
-            user_defined_attr = kwargs.get(attr.name, None)
-            if user_defined_attr is not None:
-                new_attr = op_desc.attrs.add()
-                new_attr.name = attr.name
-                new_attr.type = attr.type
-                if isinstance(user_defined_attr, np.ndarray):
-                    user_defined_attr = user_defined_attr.tolist()
-                if attr.type == framework_pb2.INT:
-                    new_attr.i = user_defined_attr
-                elif attr.type == framework_pb2.FLOAT:
-                    new_attr.f = user_defined_attr
-                elif attr.type == framework_pb2.LONG:
-                    new_attr.l = user_defined_attr
-                elif attr.type == framework_pb2.STRING:
-                    new_attr.s = user_defined_attr
-                elif attr.type == framework_pb2.BOOLEAN:
-                    new_attr.b = user_defined_attr
-                elif attr.type == framework_pb2.INTS:
-                    new_attr.ints.extend(user_defined_attr)
-                elif attr.type == framework_pb2.FLOATS:
-                    new_attr.floats.extend(user_defined_attr)
-                elif attr.type == framework_pb2.STRINGS:
-                    new_attr.strings.extend(user_defined_attr)
-                elif attr.type == framework_pb2.BOOLEANS:
-                    new_attr.bools.extend(user_defined_attr)
-                elif attr.type == framework_pb2.LONGS:
-                    new_attr.longs.extend(user_defined_attr)
-                elif attr.type == framework_pb2.INT_PAIRS:
-                    for p in user_defined_attr:
-                        pair = new_attr.int_pairs.add()
-                        pair.first = p[0]
-                        pair.second = p[1]
-                else:
-                    raise NotImplementedError(
-                        "A not supported attribute type: %s." % (
-                            str(attr.type)))
-
-        return op_desc
-
-    @staticmethod
-    def any_is_true(generator):
-        """
-        Reduce a boolean array to a single boolean parameter. If any element in
-        the array is True, this function will return True, otherwise False.
-        """
-        for flag in generator:
-            if flag:
-                return True
-        return False
-
-
-class OpInfo(object):
-    def __init__(self, name, method, inputs, outputs, attrs):
-        self.name = name
-        self.method = method
-        self.inputs = inputs
-        self.outputs = outputs
-        self.attrs = attrs
-
-
-def create_op_creation_method(op_proto):
-    """
-    Generate op creation method for an OpProto.
-    """
-    method = OpDescCreationMethod(op_proto)
-
-    def __impl__(*args, **kwargs):
-        opdesc = method(*args, **kwargs)
-        return core.Operator.create(opdesc.SerializeToString())
-
-    return OpInfo(
-        method=__impl__,
-        name=op_proto.type,
-        inputs=[(var.name, var.duplicable) for var in op_proto.inputs],
-        outputs=[(var.name, var.duplicable) for var in op_proto.outputs],
-        attrs=[attr.name for attr in op_proto.attrs])
-
-
-class OperatorFactory(object):
-    def __init__(self):
-        self.op_methods = dict()
-
-        for op_proto in get_all_op_protos():
-            method = create_op_creation_method(op_proto)
-            self.op_methods[method.name] = method
-
-    def __call__(self, *args, **kwargs):
-        if "type" in kwargs:
-            if len(args) != 0:
-                raise ValueError(
-                    "Except the argument \"type\","
-                    "all of the other arguments should be keyword arguments.")
-            t = kwargs.pop("type")
-        else:
-            if len(args) != 1:
-                raise ValueError(
-                    "Except the argument \"type\","
-                    "all of the other arguments should be keyword arguments.")
-            t = args[0]
-
-        return self.get_op_info(t).method(**kwargs)
-
-    def types(self):
-        return list(self.op_methods.keys())
-
-    def get_op_info(self, t):
-        if t not in self.op_methods:
-            raise ValueError("The operator: %s is not registered." % t)
-        return self.op_methods.get(t)
-
-    def get_op_input_names(self, type):
-        return [x[0] for x in self.get_op_info(type).inputs]
-
-    def get_op_inputs(self, type):
-        return self.get_op_info(type).inputs
-
-    def get_op_output_names(self, type):
-        return [x[0] for x in self.get_op_info(type).outputs]
-
-    def get_op_outputs(self, type):
-        return self.get_op_info(type).outputs
-
-    def get_op_attr_names(self, type):
-        return self.get_op_info(type).attrs
-
-
-class __RecurrentOp__(object):
-    __proto__ = None
-    type = "recurrent"
-
-    def __init__(self):
-        # cache recurrent_op's proto
-        if self.__proto__ is None:
-            for op_proto in get_all_op_protos():
-                if op_proto.type == self.type:
-                    self.__proto__ = op_proto
-
-    def __call__(self, *args, **kwargs):
-        if self.type not in args and "type" not in kwargs:
-            kwargs["type"] = self.type
-        # create proto
-        create_method = OpDescCreationMethod(self.__proto__)
-        proto = create_method(*args, **kwargs)
-        # create rnnop
-        return core.RecurrentOp.create(proto.SerializeToString())
-
-
-class __DynamicRecurrentOp__(object):
-    __proto__ = None
-    type = "dynamic_recurrent"
-
-    def __init__(self):
-        # cache recurrent_op's proto
-        if self.__proto__ is None:
-            for op_proto in get_all_op_protos():
-                if op_proto.type == self.type:
-                    self.__proto__ = op_proto
-
-    def __call__(self, *args, **kwargs):
-        if self.type not in args and "type" not in kwargs:
-            kwargs["type"] = self.type
-        # create proto
-        create_method = OpDescCreationMethod(self.__proto__)
-        proto = create_method(*args, **kwargs)
-        # create rnnop
-        return core.DynamicRecurrentOp.create(proto.SerializeToString())
-
-
-class __CondOp__(object):
-    __proto__ = None
-    type = "cond"
-
-    def __init__(self):
-        # cache recurrent_op's proto
-        if self.__proto__ is None:
-            for op_proto in get_all_op_protos():
-                if op_proto.type == self.type:
-                    self.__proto__ = op_proto
-
-    def __call__(self, *args, **kwargs):
-        if self.type not in args and "type" not in kwargs:
-            kwargs["type"] = self.type
-        # create proto
-        create_method = OpDescCreationMethod(self.__proto__)
-        proto = create_method(*args, **kwargs)
-        # create condop
-        return core.CondOp.create(proto.SerializeToString())
-
-
-Operator = OperatorFactory()  # The default global factory
-RecurrentOp = __RecurrentOp__()
-DynamicRecurrentOp = __DynamicRecurrentOp__()
-CondOp = __CondOp__()
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
deleted file mode 100644
index a3690de36e2dbc6aa8b7d7f77d98ce5f90f0ee81..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/optimizer.py
+++ /dev/null
@@ -1,3508 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-from collections import defaultdict
-
-from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
-from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program
-
-from . import framework
-from . import layers
-from . import unique_name
-from .backward import append_backward, _some_in_set_, _append_grad_suffix_
-from .clip import append_gradient_clip_ops, error_clip_callback
-from .framework import program_guard
-from .initializer import Constant
-from .layer_helper import LayerHelper
-from .layers import ops
-from .regularizer import append_regularization_ops
-from .dygraph import base as imperative_base
-from .dygraph.learning_rate_scheduler import LearningRateDecay
-from paddle.fluid import core
-from paddle.fluid.layers import tensor
-from functools import reduce
-from .wrapped_decorator import signature_safe_contextmanager
-from .. import compat as cpt
-
-__all__ = [
-    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
-    'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer',
-    'AdamOptimizer', 'AdamaxOptimizer', 'DpsgdOptimizer',
-    'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta',
-    'AdadeltaOptimizer', 'ModelAverage', 'LarsMomentum',
-    'LarsMomentumOptimizer', 'DGCMomentumOptimizer', 'LambOptimizer',
-    'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer',
-    'RecomputeOptimizer'
-]
-
-
-class Optimizer(object):
-    """Optimizer Base class.
-
-    Define the common interface of an optimizer.
-    User should not use this class directly,
-    but need to use one of it's implementation.
-    """
-
-    @imperative_base.no_grad
-    def __init__(self, learning_rate, regularization=None, name=None):
-        if framework.in_dygraph_mode():
-            if not isinstance(learning_rate, float) and \
-                    not isinstance(learning_rate, LearningRateDecay):
-                raise TypeError(
-                    "learning rate should be float or LearningRateDecay, got %s here"
-                    % type(learning_rate))
-            if name is not None:
-                self._name = unique_name.generate(name)
-            else:
-                self._name = unique_name.generate(self.__class__.__name__)
-        else:
-            if not isinstance(learning_rate, float) and \
-                    not isinstance(learning_rate, framework.Variable):
-                raise TypeError(
-                    "learning rate should be float or Variable, got %s here" %
-                    type(learning_rate))
-            self._name = name
-
-        self.regularization = regularization
-        self._learning_rate = learning_rate
-        # the learning rate type should be inferenced from loss
-        self._dtype = None
-        # each program should have a independent learning rate
-        # program -> Variable(learning_rate)
-        self._learning_rate_map = dict()
-        if isinstance(self._learning_rate, framework.Variable):
-            self._learning_rate_map[framework.default_main_program(
-            )] = self._learning_rate
-        # Dictionary of accumulators. Some optimizer subclasses need to
-        # allocate and manage extra variables associated with the parameters
-        # to train. These variables are called accumulators.
-        # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
-        self._accumulators = defaultdict(lambda: dict())
-        self.helper = None
-        self._opti_name_list = []
-
-    def load(self, stat_dict):
-        """
-        load optimizer with learning rate decay in dygraph mode
-        :return: None
-
-        Args:
-            stat_dict: the dict load by load_persistable method
-
-        Examples:
-
-        .. code-block:: python
-
-            from __future__ import print_function
-            import numpy as np
-            import paddle
-            import paddle.fluid as fluid
-            from paddle.fluid.optimizer import SGDOptimizer
-            from paddle.fluid.dygraph.nn import FC
-            from paddle.fluid.dygraph.base import to_variable
-
-            class MLP(fluid.Layer):
-                def __init__(self, name_scope):
-                    super(MLP, self).__init__(name_scope)
-
-                    self._fc1 = FC(self.full_name(), 10)
-                    self._fc2 = FC(self.full_name(), 10)
-
-                def forward(self, inputs):
-                    y = self._fc1(inputs)
-                    y = self._fc2(y)
-                    return y
-
-            with fluid.dygraph.guard():
-                mlp = MLP('mlp')
-                optimizer2 = SGDOptimizer(
-                    learning_rate=fluid.layers.natural_exp_decay(
-                    learning_rate=0.1,
-                    decay_steps=10000,
-                    decay_rate=0.5,
-                    staircase=True))
-
-                train_reader = paddle.batch(
-                        paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
-
-                for batch_id, data in enumerate(train_reader()):
-                    dy_x_data = np.array(
-                            [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-
-                    y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                            128, 1)
-
-                    img = to_variable(dy_x_data)
-                    label = to_variable(y_data)
-                    label._stop_gradient = True
-                    cost = mlp(img)
-                    avg_loss = fluid.layers.reduce_mean(cost)
-                    avg_loss.backward()
-                    optimizer.minimize(avg_loss)
-                    mlp.clear_gradients()
-                    fluid.dygraph.save_persistables(
-                            mlp.state_dict(), [optimizer, optimizer2], "save_dir_2")
-                    if batch_id == 2:
-                            break
-
-            with fluid.dygraph.guard():
-                mlp_load = MLP('mlp')
-                optimizer_load2 = SGDOptimizer(
-                        learning_rate=fluid.layers.natural_exp_decay(
-                        learning_rate=0.1,
-                        decay_steps=10000,
-                        decay_rate=0.5,
-                        staircase=True))
-                parameters, optimizers = fluid.dygraph.load_persistables(
-                    "save_dir_2")
-                mlp_load.load_dict(parameters)
-                optimizer_load2.load(optimizers)
-            self.assertTrue(optimizer2._learning_rate.__dict__ == optimizer_load2._learning_rate.__dict__)
-
-        """
-        if framework.in_dygraph_mode():
-            self._learning_rate = stat_dict[self._name]
-        else:
-            raise TypeError("load can only be used under DyGraph mode")
-
-    def get_opti_var_name_list(self):
-        return self._opti_name_list
-
-    def _create_global_learning_rate(self):
-        if imperative_base.enabled():
-            # create learning rate Variable
-            if isinstance(self._learning_rate, float):
-                lr = self._global_learning_rate()
-
-                if isinstance(lr, framework.Variable):
-                    return
-                else:
-                    self._learning_rate_map[framework.default_main_program(
-                    )] = layers.create_global_var(
-                        name=unique_name.generate("learning_rate"),
-                        shape=[1],
-                        value=float(self._learning_rate),
-                        dtype='float32' if self._dtype is None else self._dtype,
-                        persistable=True)
-            # get learning rate Variable from LearningRateDecay
-            elif isinstance(self._learning_rate, LearningRateDecay):
-                self._learning_rate_map[framework.default_main_program(
-                )] = self._learning_rate()
-            else:
-                raise TypeError(
-                    "optimizer's learning rate must be float or LearningRateDecay"
-                )
-        else:
-            lr = self._global_learning_rate()
-
-            if isinstance(lr, framework.Variable):
-                return
-            else:
-                if not isinstance(self._learning_rate, float):
-                    raise TypeError(
-                        "learning rate variable is create outside optimizer,"
-                        "can not create new learning rate variable for new program"
-                    )
-
-            # create learning rate in the current main program
-            self._learning_rate_map[framework.default_main_program(
-            )] = layers.create_global_var(
-                name=unique_name.generate("learning_rate"),
-                shape=[1],
-                value=float(self._learning_rate),
-                dtype='float32' if self._dtype is None else self._dtype,
-                persistable=True)
-
-    def _global_learning_rate(self, program=None):
-        """
-        get global decayed learning rate
-        :return:
-        """
-        if program is None:
-            program = framework.default_main_program()
-        return self._learning_rate_map.get(program, None)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        """ append optimize operator to block and return all the added optimize_op
-        """
-        raise NotImplementedError()
-
-    def _create_param_lr(self, param_and_grad):
-        # create learning rate variable for every parameter
-        param = param_and_grad[0]
-        param_lr = param.optimize_attr['learning_rate']
-        if type(param_lr) == Variable:
-            return param_lr
-        else:
-            if param_lr == 1.0:
-                return self._global_learning_rate()
-            else:
-                with default_main_program()._lr_schedule_guard(
-                        is_with_opt=True), framework.name_scope(
-                            'scale_with_param_lr'):
-                    return self._global_learning_rate() * param_lr
-
-    def _create_accumulators(self, block, parameters):
-        """Create all accumulators needed by the parameters
-
-        Args:
-            block: the block in which the loss variable is present
-            parameters: list of parameter variables for the optimizer
-        """
-        pass
-
-    def _finish_update(self, block, parameters_and_grads):
-        """Finish any custom updates needed
-           before completing an optimization step
-
-        Args:
-            block: the block in which the loss variable is present
-            parameters: list of parameter variables for the optimizer
-
-        Returns:
-            None
-        """
-        pass
-
-    def _add_accumulator(self,
-                         name,
-                         param,
-                         dtype=None,
-                         fill_value=0.0,
-                         shape=None):
-        """Utility function to add an accumulator for a parameter
-
-        Args:
-            block: the block in which the loss variable is present
-            name: name of the accumulator
-            param: parameter variable for which accumulator is to be added
-            dtype: data type of the accumulator variable
-            fill_value: value to initialize the accumulator variable
-        """
-        if self._name is not None:
-            name = self._name + "_" + name
-        if (name in self._accumulators and
-                param.name in self._accumulators[name]):
-            if framework.in_dygraph_mode():
-                return self._accumulators[name][param.name]
-            raise Exception("Accumulator {} already exists for parameter {}".
-                            format(name, param.name))
-        if shape == None:
-            shape = param.shape
-        assert isinstance(self.helper, LayerHelper)
-
-        var_name = param.name + "_" + name
-        var_name = unique_name.generate(var_name)
-        self._opti_name_list.append(var_name)
-
-        var = self.helper.create_global_variable(
-            name=var_name,
-            persistable=True,
-            dtype=dtype or param.dtype,
-            type=param.type,
-            shape=shape)
-        self.helper.set_variable_initializer(
-            var, initializer=Constant(value=float(fill_value)))
-        self._accumulators[name][param.name] = var
-        return var
-
-    def _get_accumulator(self, name, param):
-        """Utility function to fetch an accumulator for a parameter
-
-        Args:
-            name: name of the accumulator
-            param: parameter variable for which accumulator is to be fetched
-
-        Returns:
-            accumulator variable for the parameter
-        """
-        if self._name is not None:
-            name = self._name + "_" + name
-        if (name not in self._accumulators or
-                param.name not in self._accumulators[name]):
-            raise Exception("Accumulator {} does not exist for parameter {}".
-                            format(name, param.name))
-        return self._accumulators[name][param.name]
-
-    def _create_optimization_pass(self, parameters_and_grads):
-        """Add optimization operators to update gradients to variables.
-
-        Args:
-          parameters_and_grads(list(tuple(Variable, Variable))):
-            a list of (variable, gradient) pair to update.
-
-        Returns:
-          return_op_list: a list of operators that will complete one step of
-            optimization. This will include parameter update ops, global step
-            update ops and any other custom ops required by subclasses to manage
-            their internal state.
-        """
-        # This is a default implementation of create_optimization_pass that
-        # can be shared by most optimizers. This implementation assumes that
-        # the subclass will implement the _append_optimize_op method and the
-        #  _initialize_tensors method. The subclass can extend the
-        # _create_accumulators method if it needs to create accumulators
-        # for parameters and extend _finish_update method to add custom ops.
-
-        # Allways called under program_guard use global block as loss block
-        global_block = framework.default_main_program().global_block()
-        start = len(global_block.ops)
-        self.helper = LayerHelper(self.__class__.__name__)
-        self._create_accumulators(
-            global_block,
-            [p[0] for p in parameters_and_grads if p[0].trainable])
-        self._create_global_learning_rate()
-
-        optimize_ops = []
-        if framework.in_dygraph_mode():
-            for param_and_grad in parameters_and_grads:
-                if param_and_grad[1] is None:
-                    continue
-                with param_and_grad[0].block.program._optimized_guard(
-                        param_and_grad):
-                    if param_and_grad[0].trainable is True:
-                        optimize_op = self._append_optimize_op(global_block,
-                                                               param_and_grad)
-                        optimize_ops.append(optimize_op)
-        else:
-            for param_and_grad in parameters_and_grads:
-                if param_and_grad[1] is None:
-                    continue
-                with param_and_grad[0].block.program._optimized_guard(
-                        param_and_grad), name_scope("optimizer"):
-                    if param_and_grad[0].trainable is True:
-                        optimize_op = self._append_optimize_op(global_block,
-                                                               param_and_grad)
-                        optimize_ops.append(optimize_op)
-
-        # Get custom finish ops for subclasses
-        # FIXME: Need to fix this once we figure out how to handle dependencies
-        self._finish_update(global_block, parameters_and_grads)
-
-        end = len(global_block.ops)
-        return global_block._slice_ops(start, end)
-
-    def _process_distribute_lookuptable(self, param_grads):
-        """
-        Because distribute lookup table only support SGD optimizer for now, not support
-        other optimizer and regularization, so we should find the table parameter out,
-        and avoid to add regularization and other op for it, and add sgd optimize op
-        for it independently.
-        :param param_grads(list((Var, Var))): list of (param, grad) pair.
-        :param loss: the loss variable.
-        :param startup_program: the startup program
-        """
-        program = framework.default_main_program()
-        global_block = framework.default_main_program().global_block()
-        table_name = find_distributed_lookup_table(program)
-        table_param = None
-        table_grad = None
-        new_param_grads = []
-        for p, g in param_grads:
-            if p.name == table_name:
-                if table_param is not None:
-                    raise RuntimeError(
-                        "multi dist table var found, only support one now!")
-                table_param = p
-                table_grad = g
-            else:
-                new_param_grads.append((p, g))
-        sgd_op = None
-        if table_param is not None:
-            param_and_grad = [table_param, table_grad]
-            with table_param.block.program._optimized_guard(param_and_grad), \
-                    framework.name_scope("optimizer"):
-                self._create_global_learning_rate()
-                # create the optimize op
-                sgd_op = global_block.append_op(
-                    type='sgd',
-                    inputs={
-                        "Param": table_param,
-                        "Grad": table_grad,
-                        "LearningRate": self._create_param_lr(param_and_grad)
-                    },
-                    outputs={"ParamOut": param_and_grad[0]})
-        return new_param_grads, (table_param, table_grad), sgd_op
-
-    def _append_dgc_ops(self, param_and_grad):
-        pass
-
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None):
-        """
-        First part of `minimize`, do auto-diff to append backward ops for
-        the current program.
-
-        Args:
-            loss (Variable): loss variable to run optimizations.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-            callbacks (list|None): list of callables to run when appending backward
-                operator for one parameter.
-
-        Return:
-            list: list of (param, grad) pair, grad is the output of backward.
-
-        Examples:
-            See examples in `apply_gradients`.
-        """
-        no_grad_set = self._get_no_grad_set(loss, no_grad_set)
-
-        self._dtype = loss.dtype
-        if framework.in_dygraph_mode():
-            if parameter_list is not None:
-                parameters = parameter_list
-            else:
-                parameters = framework._dygraph_tracer().all_parameters()
-
-            params_grads = []
-            for param in parameters:
-                if not param.trainable:
-                    continue
-                if param._ivar._grad_ivar() is not None:
-                    # create gradient variable
-                    grad_var = Variable(
-                        block=loss.block,
-                        name=param._ivar._grad_name(),
-                        stop_gradient=True,
-                        ivar=param._ivar._grad_ivar())
-                    params_grads.append((param, grad_var))
-        else:
-            if callbacks is None:
-                callbacks = [error_clip_callback]
-            else:
-                assert (isinstance(callbacks, list))
-            program = loss.block.program
-            assert len(loss.shape) == 1 and loss.shape[0] == 1, \
-                "The loss.shape should be (1L,), but the current loss.shape is {}. " \
-                "Maybe that you should call fluid.layers.mean to process the current loss.".format(
-                    loss.shape)
-            with program_guard(program, startup_program):
-                params_grads = append_backward(loss, parameter_list,
-                                               no_grad_set, callbacks)
-                # Note: since we can't use all_reduce_op now,
-                #  dgc_op should be the last op of one grad.
-                self._append_dgc_ops(params_grads)
-        return params_grads
-
-    def apply_gradients(self, params_grads):
-        """
-        Second part of `minimize`, appending optimization operators for
-        given `params_grads` pairs.
-
-        Args:
-            params_grads (list): list of (param, grad) pair to do optimization.
-
-        Returns:
-            list: A list of operators appended to the current program.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                loss = network()
-                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-                params_grads = optimizer.backward(loss)
-                # you may append operations for params_grads here
-                # ...
-                optimizer.apply_gradients(params_grads)
-        """
-        params_grads = sorted(params_grads, key=lambda x: x[0].name)
-
-        params_grads, table_param_and_grad, table_optimize_op = \
-            self._process_distribute_lookuptable(params_grads)
-
-        params_grads = append_gradient_clip_ops(params_grads)
-
-        # Add regularization if any
-        params_grads = append_regularization_ops(params_grads,
-                                                 self.regularization)
-
-        optimize_ops = self._create_optimization_pass(params_grads)
-        if table_optimize_op is not None:
-            optimize_ops.append(table_optimize_op)
-            params_grads.append(table_param_and_grad)
-
-        return optimize_ops
-
-    def apply_optimize(self, loss, startup_program, params_grads):
-        """
-        Second part of `minimize`, appending optimization operators for
-        given `params_grads` pairs.
-
-        Args:
-            loss (Variable): loss variable to run optimizations.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            params_grads (list): list of (param, grad) pair to do optimization.
-
-        Returns:
-            list: A list of operators appended to the current program.
-        """
-        if framework.in_dygraph_mode():
-            with program_guard(framework.default_main_program(),
-                               framework.default_startup_program()):
-                params_grads = append_regularization_ops(params_grads,
-                                                         self.regularization)
-                optimize_ops = self._create_optimization_pass(params_grads)
-        else:
-            program = loss.block.program
-            with program_guard(program, startup_program):
-                optimize_ops = self.apply_gradients(params_grads)
-        return optimize_ops
-
-    def _get_no_grad_set(self, loss, no_grad_set=None):
-        if no_grad_set is None:
-            no_grad_set = set()
-        elif isinstance(no_grad_set, set) or isinstance(
-                no_grad_set, list) or isinstance(no_grad_set, tuple):
-            no_grad_set = set(no_grad_set)
-        else:
-            assert "no_grad_set should be a set, but the passed type is {}".format(
-                type(no_grad_set))
-        parameters = loss.block.program.global_block().all_parameters()
-        param_no_trainable = set(
-            [param.name for param in parameters if param.trainable is False])
-        # If the parameter is no trainable, it should not have a gradient.
-        no_grad_set.update(param_no_trainable)
-
-        return no_grad_set
-
-    @imperative_base.no_grad
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 grad_clip=None):
-        """
-        Add operations to minimize `loss` by updating `parameter_list`.
-
-        This method combines interface `backward()` and
-        `apply_gradients()` into one.
-
-        Args:
-            loss (Variable): loss variable to run optimizations.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-            grad_clip (GradClipBase|None) : Gradient clip strategy
-
-        Returns:
-            tuple: (optimize_ops, params_grads) which are, list of operators appended;
-            and list of (param, grad) Variables pair for optimization.
-        """
-        assert isinstance(loss, Variable), "The loss should be an Variable."
-        params_grads = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
-
-        if grad_clip is not None and framework.in_dygraph_mode():
-            # TODO(hongyu): FIX later, this is only for dygraph, should be work for static mode
-            params_grads = grad_clip(params_grads)
-
-        optimize_ops = self.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
-
-        return optimize_ops, params_grads
-
-
-class SGDOptimizer(Optimizer):
-    """
-    Optimizer of the stochastic gradient descent algorithm.
-
-    .. math::
-
-        param\_out = param - learning\_rate * grad
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-        Can be a float value or a Variable with one float value as data element.
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-            import numpy as np
-
-            place = fluid.CPUPlace()
-            main = fluid.Program()
-            with fluid.program_guard(main):
-                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-                y_predict = fluid.layers.fc(input=x, size=1, act=None)
-                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-                avg_cost = fluid.layers.mean(cost)
-
-                sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-                sgd_optimizer.minimize(avg_cost)
-
-                fetch_list = [avg_cost]
-                train_reader = paddle.batch(
-                    paddle.dataset.uci_housing.train(), batch_size=1)
-                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-                exe = fluid.Executor(place)
-                exe.run(fluid.default_startup_program())
-                for data in train_reader():
-                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
-    """
-
-    def __init__(self, learning_rate, regularization=None, name=None):
-        assert learning_rate is not None
-        super(SGDOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        self.type = "sgd"
-
-    def _append_optimize_op(self, block, param_and_grad):
-        assert isinstance(block, framework.Block)
-
-        # create the optimize op
-        sgd_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={"ParamOut": param_and_grad[0]},
-            stop_gradient=True)
-
-        return sgd_op
-
-
-class MomentumOptimizer(Optimizer):
-    """
-
-    Simple Momentum optimizer with velocity state
-
-    This optimizer has a flag for Nestrov Momentum.
-
-    The update equations are as follows:
-
-    .. math::
-
-        & velocity = mu * velocity + gradient
-
-        & if (use\_nesterov):
-
-        &\quad   param = param - (gradient + mu * velocity) * learning\_rate
-
-        & else:
-
-        &\quad   param = param - learning\_rate * velocity
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-        Can be a float value or a Variable with one float value as data element.
-        momentum (float): momentum factor
-        use_nesterov (bool): enables Nesterov momentum
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-            import numpy as np
-
-            place = fluid.CPUPlace()
-            main = fluid.Program()
-            with fluid.program_guard(main):
-                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-                y_predict = fluid.layers.fc(input=x, size=1, act=None)
-                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-                avg_cost = fluid.layers.mean(cost)
-
-                moment_optimizer = fluid.optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-                moment_optimizer.minimize(avg_cost)
-
-                fetch_list = [avg_cost]
-                train_reader = paddle.batch(
-                    paddle.dataset.uci_housing.train(), batch_size=1)
-                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-                exe = fluid.Executor(place)
-                exe.run(fluid.default_startup_program())
-                for data in train_reader():
-                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
-    """
-    _velocity_acc_str = "velocity"
-
-    def __init__(self,
-                 learning_rate,
-                 momentum,
-                 use_nesterov=False,
-                 regularization=None,
-                 name=None):
-        assert learning_rate is not None
-        assert momentum is not None
-        super(MomentumOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        self.type = "momentum"
-        self._momentum = momentum
-        self._use_nesterov = bool(use_nesterov)
-
-    def _create_accumulators(self, block, parameters):
-        assert isinstance(block, framework.Block)
-
-        for p in parameters:
-            self._add_accumulator(self._velocity_acc_str, p)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        assert isinstance(block, framework.Block)
-
-        velocity_acc = self._get_accumulator(self._velocity_acc_str,
-                                             param_and_grad[0])
-        # create the momentum optimize op
-        momentum_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Velocity": velocity_acc,
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "VelocityOut": velocity_acc
-            },
-            attrs={"mu": self._momentum,
-                   "use_nesterov": self._use_nesterov},
-            stop_gradient=True)
-
-        return momentum_op
-
-
-class DGCMomentumOptimizer(MomentumOptimizer):
-    """
-
-    Original paper is https://arxiv.org/abs/1712.01887
-
-    DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\
-        only gradients larger than a threshold are transmitted.
-
-    To avoid losing information, DGC accumulates the rest of the gradients locally.
-
-    Eventually, these gradients become large enough to be transmitted.
-
-    Thus, DGC sends the large gradients immediately but eventually send all of the gradients over time.
-
-    To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance.
-
-    DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication.
-
-    This optimizer will do two things:
-
-        1. Compress the gradient by get TopK import value from tensor \
-            and use it for allreduce to reduce network bandwidth.
-
-        2. Call momentum to optimize on the cost.
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-            Can be a float value or a Variable with one float value as data element.
-        momentum (float): Momentum factor.
-        rampup_begin_step (int): The beginning step from which gradient compression is implemented.
-        rampup_step (int): How long it use the sparsity periods. Default is 1.
-            for example: If the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 5, \
-                it will use 0.75 at 0 step, and 0.9375 at 1 step, and so on. And when reach sparsity array ends, \
-                it will use 0.999 then and after.
-        sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity).
-        use_nesterov (bool): Enables Nesterov momentum. True means use nesterov.
-        local_grad_clip_norm (float): Clip norm value if needed.
-        num_trainers: The number of training nodes.
-        regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
-        name: An optional name prefix.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            optimizer = fluid.optimizer.DGCMomentumOptimizer(
-                        learning_rate=0.0001,
-                        momentum=0.9,
-                        rampup_step=1000,
-                        rampup_begin_step=1252,
-                        sparsity=[0.999, 0.999])
-
-    """
-
-    def __init__(self,
-                 learning_rate,
-                 momentum,
-                 rampup_begin_step,
-                 rampup_step=1,
-                 sparsity=[0.999],
-                 use_nesterov=False,
-                 local_grad_clip_norm=None,
-                 num_trainers=None,
-                 regularization=None,
-                 name=None):
-        self._sparsity = sparsity
-        self._rampup_step = rampup_step
-        self._rampup_step_var = None
-
-        self._rampup_begin_step = rampup_begin_step
-        self._rampup_begin_step_var = None
-
-        self._global_step_var = None
-        self._local_grad_clip_norm = None
-        self._clip_norm = None
-
-        if local_grad_clip_norm is not None:
-            assert isinstance(num_trainers, int)
-            assert isinstance(local_grad_clip_norm, float)
-            assert num_trainers > 0
-
-            self._local_grad_clip_norm = local_grad_clip_norm
-            self._num_trainers = num_trainers
-            self._clip_norm = local_grad_clip_norm / (num_trainers *
-                                                      num_trainers)
-
-        super(DGCMomentumOptimizer, self).__init__(
-            learning_rate, momentum, use_nesterov, regularization, name)
-
-        core.init_dgc()
-
-    def _add_auto_increment_var(self, counter_name, begin, step=1):
-        helper = LayerHelper('global_step_counter')
-        counter, is_new_var = helper.create_or_get_global_variable(
-            name=counter_name, dtype='float32', shape=[1], persistable=True)
-        if is_new_var:
-            helper.set_variable_initializer(
-                counter,
-                initializer=Constant(
-                    value=float(begin - 1), force_cpu=True))
-            helper.main_program.global_block()._prepend_op(
-                type='increment',
-                inputs={'X': [counter]},
-                outputs={'Out': [counter]},
-                attrs={'step': float(step)},
-                stop_gradient=True)
-            counter.stop_gradient = True
-
-        return counter
-
-    def _append_dgc_ops(self, param_and_grads):
-        start_program = default_startup_program()
-        main_program = default_main_program()
-        main_program._enable_dgc = True
-
-        # step counter
-        self._global_step_var = self._add_auto_increment_var(
-            counter_name=core.dgc.kDGCCounterName(), begin=0)
-
-        # rampup begin step var for all_reduce_op_handle
-        self._rampup_begin_step_var = tensor.create_global_var(
-            shape=[1],
-            dtype=core.VarDesc.VarType.FP32,
-            persistable=True,
-            name=core.dgc.kDGCRampUpBeginStepName(),
-            value=self._rampup_begin_step * 1.0,
-            force_cpu=True)
-
-        for param_var, grad_var in param_and_grads:
-            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
-            if var_numel < 16384 or \
-                param_var.type == core.VarDesc.VarType.SELECTED_ROWS  or \
-                grad_var.type == core.VarDesc.VarType.SELECTED_ROWS  or  \
-                    param_var.dtype != core.VarDesc.VarType.FP32 :
-                continue
-
-            u_var = tensor.create_global_var(
-                shape=param_var.shape,
-                dtype=param_var.dtype,
-                persistable=True,
-                name=param_var.name + core.dgc.kDGCUName(),
-                value=0.0)
-            v_var = tensor.create_global_var(
-                shape=param_var.shape,
-                dtype=param_var.dtype,
-                persistable=True,
-                name=param_var.name + core.dgc.kDGCVName(),
-                value=0.0)
-
-            k_var = tensor.create_global_var(
-                shape=[1],
-                dtype=param_var.dtype,
-                persistable=True,
-                name=param_var.name + core.dgc.kDGCKName(),
-                value=0.0,
-                force_cpu=True)
-
-            encoded_var = tensor.create_global_var(
-                shape=[1],
-                dtype=param_var.dtype,
-                persistable=True,
-                name=param_var.name + core.dgc.kDGCEncodedName(),
-                value=0.0,
-                force_cpu=False)
-
-            # del back oprolevarname
-            op_maker = core.op_proto_and_checker_maker
-            backward = core.op_proto_and_checker_maker.OpRole.Backward
-            for op in main_program.global_block().ops:
-                if not self._is_the_backward_op(op):
-                    continue
-
-                var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
-                if param_var.name not in var_attr:
-                    continue
-
-                var_attr.remove(param_var.name)
-                var_attr.remove(grad_var.name)
-                if len(var_attr) > 1:
-                    op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
-                else:
-                    op._remove_attr(op_maker.kOpRoleVarAttrName())
-
-            clip_var = grad_var
-            if self._local_grad_clip_norm is not None:
-                clip_var = self._append_clip_norm(grad_var, self._clip_norm)
-            self._dgc_op(param_var, clip_var, grad_var, u_var, v_var, k_var,
-                         encoded_var)
-
-    def _is_the_backward_op(self, op):
-        op_maker = core.op_proto_and_checker_maker
-        backward = core.op_proto_and_checker_maker.OpRole.Backward
-        if op_maker.kOpRoleVarAttrName() in op.attr_names and \
-                int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward):
-            return True
-        return False
-
-    def _clip_by_norm(self, x, max_norm, name=None):
-        args = {'x': x, 'max_norm': max_norm, 'name': name}
-
-        helper = LayerHelper("dgc_clip_by_norm_op", **args)
-
-        if name is None:
-            name = unique_name.generate_with_ignorable_key(".".join(
-                [helper.name, 'tmp']))
-
-        out = helper.create_variable(
-            type=x.type, name=name, dtype=x.dtype, persistable=False)
-
-        helper.append_op(
-            type="dgc_clip_by_norm",
-            inputs={"X": x,
-                    "current_step": self._global_step_var},
-            attrs={
-                "max_norm": max_norm,
-                "rampup_begin_step": float(self._rampup_begin_step)
-            },
-            outputs={"Out": out})
-        return out
-
-    def _append_clip_norm(self, grad_var, clip_norm):
-        with grad_var.block.program._backward_role_guard():
-            return self._clip_by_norm(
-                x=grad_var, max_norm=clip_norm, name=grad_var.name)
-
-    def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
-                encoded_var):
-        block = framework.default_main_program().global_block()
-        op_maker = core.op_proto_and_checker_maker
-        dgc_op = block.append_op(
-            type="dgc",
-            inputs={
-                "U": u_var,
-                "V": v_var,
-                "Grad": clip_var,
-                "current_step": self._global_step_var
-            },
-            outputs={
-                "U_out": u_var,
-                "V_out": v_var,
-                "EncodeGrad": encoded_var,
-                "k": k_var,
-                "Grad_out": grad_var
-            },
-            attrs={
-                "m": self._momentum,
-                "sparsity": self._sparsity,
-                "use_nesterov": self._use_nesterov,
-                "rampup_begin_step": float(self._rampup_begin_step),
-                "rampup_step": float(self._rampup_step)
-            },
-            stop_gradient=True)
-
-        backward = op_maker.OpRole.Backward
-        dgc_op._set_attr(op_maker.kOpRoleAttrName(), backward)
-        dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
-                         [param_var.name, grad_var.name])
-
-
-class LarsMomentumOptimizer(Optimizer):
-    """
-    Momentum optimizer with LARS support
-
-    The update equations are as follows:
-
-    .. math::
-
-        & local\_learning\_rate = learning\_rate * lars\_coeff * \\
-          \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}
-
-        & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param)
-
-        & param = param - velocity
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-        Can be a float value or a Variable with one float value as data element.
-        momentum (float): momentum factor
-        lars_coeff (float): defines how much we trust the layer to change its weights.
-        lars_weight_decay (float): weight decay coefficient for decaying using LARS.
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-
-            np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-            inp = fluid.layers.data(
-                name="inp", shape=[2, 2], append_batch_size=False)
-            out = fluid.layers.fc(inp, size=3)
-            out = fluid.layers.reduce_sum(out)
-            optimizer = fluid.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
-            optimizer.minimize(out)
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(fluid.default_startup_program())
-            exe.run(
-                feed={"inp": np_inp},
-                fetch_list=[out.name])
-    """
-    _velocity_acc_str = "velocity"
-
-    def __init__(self,
-                 learning_rate,
-                 momentum,
-                 lars_coeff=0.001,
-                 lars_weight_decay=0.0005,
-                 regularization=None,
-                 name=None):
-        assert learning_rate is not None
-        assert momentum is not None
-        super(LarsMomentumOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        self.type = "lars_momentum"
-        self._momentum = momentum
-        self._lars_coeff = float(lars_coeff)
-        self._lars_weight_decay = float(lars_weight_decay)
-
-    def _create_accumulators(self, block, parameters):
-        assert isinstance(block, framework.Block)
-
-        for p in parameters:
-            self._add_accumulator(self._velocity_acc_str, p)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        assert isinstance(block, framework.Block)
-
-        velocity_acc = self._get_accumulator(self._velocity_acc_str,
-                                             param_and_grad[0])
-        # create the momentum optimize op
-        momentum_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Velocity": velocity_acc,
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "VelocityOut": velocity_acc
-            },
-            attrs={
-                "mu": self._momentum,
-                "lars_coeff": self._lars_coeff,
-                "lars_weight_decay": self._lars_weight_decay
-            },
-            stop_gradient=True)
-
-        return momentum_op
-
-
-class AdagradOptimizer(Optimizer):
-    """
-    **Adaptive Gradient Algorithm (Adagrad)**
-
-    The update is done as follows:
-
-    .. math::
-
-        moment\_out &= moment + grad * grad
-
-        param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
-
-    The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-    does not have the epsilon attribute. It is added here in our implementation
-    as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
-    for numerical stability to avoid the division by zero error.
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-        Can be a float value or a Variable with one float value as data element.
-        epsilon (float): a small float value for numerical stability.
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-        initial_accumulator_value (float): Initial value for moment accumulator.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-
-            np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-            inp = fluid.layers.data(
-                name="inp", shape=[2, 2], append_batch_size=False)
-            out = fluid.layers.fc(inp, size=3)
-            out = fluid.layers.reduce_sum(out)
-            optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
-            optimizer.minimize(out)
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(fluid.default_startup_program())
-            exe.run(
-                feed={"inp": np_inp},
-                fetch_list=[out.name])
-    """
-    _moment_acc_str = "moment"
-
-    def __init__(self,
-                 learning_rate,
-                 epsilon=1.0e-6,
-                 regularization=None,
-                 name=None,
-                 initial_accumulator_value=0.0):
-        assert learning_rate is not None
-        assert epsilon is not None
-        super(AdagradOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        self.type = "adagrad"
-        self._epsilon = epsilon
-        self.initial_accumulator_value = initial_accumulator_value
-
-    def _create_accumulators(self, block, parameters):
-        assert isinstance(block, framework.Block)
-
-        for p in parameters:
-            self._add_accumulator(self._moment_acc_str, p)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        assert isinstance(block, framework.Block)
-
-        moment_acc = self._get_accumulator(self._moment_acc_str,
-                                           param_and_grad[0])
-        startup_block = framework.default_startup_program().global_block()
-        startup_block.append_op(
-            type='fill_constant',
-            inputs={},
-            outputs={'Out': [moment_acc]},
-            attrs={
-                'dtype': moment_acc.dtype,
-                'value': self.initial_accumulator_value,
-                'shape': moment_acc.shape,
-            })
-
-        # Create the adagrad optimizer op
-        adagrad_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Moment": moment_acc,
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={"ParamOut": param_and_grad[0],
-                     "MomentOut": moment_acc},
-            attrs={"epsilon": self._epsilon},
-            stop_gradient=True)
-
-        return adagrad_op
-
-
-class AdamOptimizer(Optimizer):
-    """
-    This implements the Adam optimizer from Section 2 of the Adam
-    paper : https://arxiv.org/abs/1412.6980.
-    Adam is a first-order gradient-based optimization method based on
-    adaptive estimates of lower-order moments.
-
-    Adam updates:
-
-    .. math::
-
-        t & = t + 1
-
-        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
-
-        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
-
-        learning\_rate & = learning\_rate * \\
-                          \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
-
-        param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-        Can be a float value or a Variable with one float value as data element.
-        beta1 (float): The exponential decay rate for the 1st moment estimates.
-        beta2 (float): The exponential decay rate for the 2nd moment estimates.
-        epsilon (float): a small float value for numerical stability.
-        regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-        lazy_mode(bool: false): The official Adam algorithm has two moving-average accumulators
-        the accumulators are updated at every step. Every element of the two moving-average is updated
-        in both dense mode and sparse mode. If the size of parameter is very large, then the update
-        may be very slow. The lazy mode only update the element that has gradient is the current
-        mini-batch, so it will be much more faster. But this mode has different semantics with the
-        original Adam algorithm and may lead to different result.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-
-            place = fluid.CPUPlace()
-            main = fluid.Program()
-            with fluid.program_guard(main):
-                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-                y_predict = fluid.layers.fc(input=x, size=1, act=None)
-                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-                avg_cost = fluid.layers.mean(cost)
-
-                adam_optimizer = fluid.optimizer.AdamOptimizer(0.01)
-                adam_optimizer.minimize(avg_cost)
-
-                fetch_list = [avg_cost]
-                train_reader = paddle.batch(
-                    paddle.dataset.uci_housing.train(), batch_size=1)
-                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-                exe = fluid.Executor(place)
-                exe.run(fluid.default_startup_program())
-                for data in train_reader():
-                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
-    """
-    _moment1_acc_str = "moment1"
-    _moment2_acc_str = "moment2"
-    _beta1_pow_acc_str = "beta1_pow_acc"
-    _beta2_pow_acc_str = "beta2_pow_acc"
-
-    def __init__(self,
-                 learning_rate=0.001,
-                 beta1=0.9,
-                 beta2=0.999,
-                 epsilon=1e-8,
-                 regularization=None,
-                 name=None,
-                 lazy_mode=False):
-        assert learning_rate is not None
-        assert beta1 is not None
-        assert beta2 is not None
-        assert epsilon is not None
-        super(AdamOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        self.type = "adam"
-        self._beta1 = beta1
-        self._beta2 = beta2
-        self._epsilon = epsilon
-        self._lazy_mode = lazy_mode
-
-    def _create_accumulators(self, block, parameters):
-        assert isinstance(block, framework.Block)
-
-        # Create accumulator tensors for first and second moments
-        for p in parameters:
-            self._add_accumulator(self._moment1_acc_str, p)
-            self._add_accumulator(self._moment2_acc_str, p)
-            self._add_accumulator(
-                name=self._beta1_pow_acc_str,
-                param=p,
-                dtype='float32',
-                fill_value=self._beta1,
-                shape=[1])
-            self._add_accumulator(
-                name=self._beta2_pow_acc_str,
-                param=p,
-                dtype='float32',
-                fill_value=self._beta2,
-                shape=[1])
-
-    def _append_optimize_op(self, block, param_and_grad):
-        assert isinstance(block, framework.Block)
-
-        moment1 = self._get_accumulator(self._moment1_acc_str,
-                                        param_and_grad[0])
-        moment2 = self._get_accumulator(self._moment2_acc_str,
-                                        param_and_grad[0])
-        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                              param_and_grad[0])
-        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
-                                              param_and_grad[0])
-
-        # create the adam optimize op
-        adam_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "LearningRate": self._create_param_lr(param_and_grad),
-                "Moment1": moment1,
-                "Moment2": moment2,
-                "Beta1Pow": beta1_pow_acc,
-                "Beta2Pow": beta2_pow_acc
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "Moment1Out": moment1,
-                "Moment2Out": moment2
-            },
-            attrs={
-                "beta1": self._beta1,
-                "beta2": self._beta2,
-                "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode,
-                "min_row_size_to_use_multithread": 1000
-            },
-            stop_gradient=True)
-
-        return adam_op
-
-    def _finish_update(self, block, param_and_grads):
-        """Update Beta1 and Beta2 Power accumulators
-        """
-        assert isinstance(block, framework.Block)
-        main_block = block.program.global_block()
-        for param, grad in param_and_grads:
-            if grad is None or param.trainable is False:
-                continue
-            with param.block.program._optimized_guard(
-                [param, grad]), name_scope("optimizer"):
-                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                                      param)
-                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
-                                                      param)
-                main_block.append_op(
-                    type="scale",
-                    inputs={"X": beta1_pow_acc},
-                    outputs={"Out": beta1_pow_acc},
-                    attrs={"scale": self._beta1},
-                    stop_gradient=True)
-
-                main_block.append_op(
-                    type="scale",
-                    inputs={"X": beta2_pow_acc},
-                    outputs={"Out": beta2_pow_acc},
-                    attrs={"scale": self._beta2},
-                    stop_gradient=True)
-
-
-class AdamaxOptimizer(Optimizer):
-    """
-    We implement the Adamax optimizer from Section 7 of the Adam
-    paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
-    Adam algorithm based on the infinity norm.
-
-    Adamax updates:
-
-    .. math::
-
-        t & = t + 1
-
-        moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
-
-        inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
-
-        learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
-
-        param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
-
-
-    The original paper does not have an epsilon attribute.
-    However, it is added here for numerical stability to prevent the
-    division by 0 error.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          # First create the Executor.
-          place = fluid.CPUPlace() # fluid.CUDAPlace(0)
-          exe = fluid.Executor(place)
-
-          train_program = fluid.Program()
-          startup_program = fluid.Program()
-          with fluid.program_guard(train_program, startup_program):
-              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-              hidden = fluid.layers.fc(input=data, size=10)
-              loss = fluid.layers.mean(hidden)
-              adam = fluid.optimizer.Adamax(learning_rate=0.2)
-              adam.minimize(loss)
-
-          # Run the startup program once and only once.
-          exe.run(startup_program)
-
-          x = numpy.random.random(size=(10, 1)).astype('float32')
-          outs = exe.run(program=train_program,
-                        feed={'X': x},
-                         fetch_list=[loss.name])
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-        Can be a float value or a Variable with one float value as data element.
-        beta1 (float): The exponential decay rate for the 1st moment estimates.
-        beta2 (float): The exponential decay rate for the 2nd moment estimates.
-        epsilon (float): a small float value for numerical stability.
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-
-    Notes:
-       Currently, AdamaxOptimizer doesn't support sparse parameter optimization.
-    """
-    _moment_acc_str = "moment"
-    _inf_norm_acc_str = "inf_norm"
-    _beta1_pow_acc_str = "beta1_pow_acc"
-
-    def __init__(self,
-                 learning_rate=0.001,
-                 beta1=0.9,
-                 beta2=0.999,
-                 epsilon=1e-8,
-                 regularization=None,
-                 name=None):
-        assert learning_rate is not None
-        assert beta1 is not None
-        assert beta2 is not None
-        assert epsilon is not None
-        super(AdamaxOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        self.type = "adamax"
-        self._beta1 = beta1
-        self._beta2 = beta2
-        self._epsilon = epsilon
-
-    def _create_accumulators(self, block, parameters):
-        # Create accumulator tensors for first moment and infinity norm
-        for p in parameters:
-            self._add_accumulator(self._moment_acc_str, p)
-            self._add_accumulator(self._inf_norm_acc_str, p)
-            self._add_accumulator(
-                name=self._beta1_pow_acc_str,
-                param=p,
-                dtype='float32',
-                fill_value=self._beta1,
-                shape=[1])
-
-    def _append_optimize_op(self, block, param_and_grad):
-        assert isinstance(block, framework.Block)
-
-        moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
-        inf_norm = self._get_accumulator(self._inf_norm_acc_str,
-                                         param_and_grad[0])
-        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                              param_and_grad[0])
-        # create the adamax optimize op
-        adamax_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "LearningRate": self._create_param_lr(param_and_grad),
-                "Moment": moment,
-                "InfNorm": inf_norm,
-                "Beta1Pow": beta1_pow_acc
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "MomentOut": moment,
-                "InfNormOut": inf_norm
-            },
-            attrs={
-                "beta1": self._beta1,
-                "beta2": self._beta2,
-                "epsilon": self._epsilon
-            },
-            stop_gradient=True)
-
-        return adamax_op
-
-    def _finish_update(self, block, parameters_and_grads):
-        """Update Beta1 Power accumulator
-        """
-        assert isinstance(block, framework.Block)
-        main_block = block.program.global_block()
-        for param, grad in parameters_and_grads:
-            if grad is None or param.trainable is False:
-                continue
-            with param.block.program._optimized_guard(
-                [param, grad]), name_scope('adamx'):
-                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                                      param)
-                main_block.append_op(
-                    type="scale",
-                    inputs={"X": beta1_pow_acc},
-                    outputs={"Out": beta1_pow_acc},
-                    attrs={"scale": self._beta1},
-                    stop_gradient=True)
-
-
-class DpsgdOptimizer(Optimizer):
-    """
-    We implement the Dpsgd optimizer according to CCS16 paper -
-    Deep Learning with Differential Privacy.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          # First create the Executor.
-          place = fluid.CPUPlace() # fluid.CUDAPlace(0)
-          exe = fluid.Executor(place)
-
-          train_program = fluid.Program()
-          startup_program = fluid.Program()
-          with fluid.program_guard(train_program, startup_program):
-              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-              hidden = fluid.layers.fc(input=data, size=10)
-              loss = fluid.layers.mean(hidden)
-              optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
-              optimizer.minimize(loss)
-
-          # Run the startup program once and only once.
-          exe.run(startup_program)
-
-          x = numpy.random.random(size=(10, 1)).astype('float32')
-          outs = exe.run(program=train_program,
-                        feed={'X': x},
-                         fetch_list=[loss.name])
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-        Can be a float value or a Variable with one float value as data element.
-        clip (float): clipping threshold
-        batch_size (float): batch size.
-        sigma (float): for gaussian noise.
-    Notes:
-       Currently, DpsgdOptimizer doesn't support sparse parameter optimization.
-    """
-
-    def __init__(self,
-                 learning_rate=0.001,
-                 clip=0.9,
-                 batch_size=0.999,
-                 sigma=1e-8):
-        assert learning_rate is not None
-        assert clip is not None
-        assert batch_size is not None
-        assert sigma is not None
-        super(DpsgdOptimizer, self).__init__(learning_rate=learning_rate)
-        self.type = "dpsgd"
-        self._clip = clip
-        self._batch_size = batch_size
-        self._sigma = sigma
-
-    def _append_optimize_op(self, block, param_and_grad):
-        assert isinstance(block, framework.Block)
-
-        # create the dpsgd optimize op
-        dpsgd_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={"ParamOut": param_and_grad[0]},
-            attrs={
-                "clip": self._clip,
-                "batch_size": self._batch_size,
-                "sigma": self._sigma
-            },
-            stop_gradient=True)
-
-        return dpsgd_op
-
-
-class DecayedAdagradOptimizer(Optimizer):
-    """
-    **Decayed Adagrad Optimizer**
-
-    The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-
-    The update is done as follows:
-
-    .. math::
-
-        moment\_out & = decay * moment + (1 - decay) * grad * grad
-
-        param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
-
-    The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-    does not have an epsilon attribute. It is added here for numerical
-    stability to avoid the division by zero error.
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-        Can be a float value or a Variable with one float value as data element.
-        decay (float): decay rate.
-        epsilon (float): a small float value for numerical stability.
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            from paddle.fluid.optimizer import DecayedAdagrad
-
-            x = layers.data( name='x', shape=[-1, 10], dtype='float32' )
-            trans = layers.fc( x, 100 )
-            cost = layers.reduce_mean( trans )
-            optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
-            optimizer.minimize(cost)
-
-    Notes:
-       Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.
-    """
-    _moment_acc_str = "moment"
-
-    def __init__(self,
-                 learning_rate,
-                 decay=0.95,
-                 epsilon=1.0e-6,
-                 regularization=None,
-                 name=None):
-        assert learning_rate is not None
-        assert decay is not None
-        assert epsilon is not None
-
-        super(DecayedAdagradOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        self.type = "decayed_adagrad"
-        self._decay = decay
-        self._epsilon = epsilon
-
-    def _create_accumulators(self, block, parameters):
-        assert isinstance(block, framework.Block)
-
-        for p in parameters:
-            self._add_accumulator(self._moment_acc_str, p)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        assert isinstance(block, framework.Block)
-
-        moment_acc = self._get_accumulator(self._moment_acc_str,
-                                           param_and_grad[0])
-
-        # Create the decayed adagrad optimizer op
-        decayed_adagrad_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Moment": moment_acc,
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={"ParamOut": param_and_grad[0],
-                     "MomentOut": moment_acc},
-            attrs={"epsilon": self._epsilon},
-            stop_gradient=True)
-
-        return decayed_adagrad_op
-
-
-class AdadeltaOptimizer(Optimizer):
-    """
-    **NOTES: This API does not support sparse parameter optimization.**
-
-    Adadelta Optimizer. Please refer to this for details:
-    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
-    <https://arxiv.org/abs/1212.5701>`_.
-
-    .. math::
-
-        E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2\\
-
-        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }\\
-
-        E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2
-
-    Args:
-        learning_rate(float|Variable): global learning rate.
-        epsilon(float): a small float number for numeric stability. Default 1.0e-6.
-        rho(float): a floating point value indicating the decay rate.
-        regularization(WeightDecayRegularizer, optional): A Regularizer, such as fluid.regularizer.L2DecayRegularizer. Default None, meaning that there is no regularization.
-        name(str, optional): A optional name prefix for debugging. Default None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            image = fluid.layers.data(name='image', shape=[28], dtype='float32')
-            fc = fluid.layers.fc(image, size=10)
-            cost = fluid.layers.reduce_mean(fc)
-            optimizer = fluid.optimizer.Adadelta(
-                learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
-
-            # optimizer_ops is a list of optimizer operators to update parameters
-            # params_grads is a list of (param, param_grad), where param is each
-            # parameter and param_grad is the gradient variable of param.
-            optimizer_ops, params_grads = optimizer.minimize(cost)
-    """
-
-    _avg_squared_grad_acc_str = "_avg_squared_grad"
-    _avg_squared_update_acc_str = "_avg_squared_update"
-
-    def __init__(self,
-                 learning_rate,
-                 epsilon=1.0e-6,
-                 rho=0.95,
-                 regularization=None,
-                 name=None):
-        if learning_rate is None:
-            raise ValueError("learning_rate is not set.")
-        if epsilon is None:
-            raise ValueError("epsilon is not set.")
-        if rho is None:
-            raise ValueError("rho is not set.")
-        super(AdadeltaOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        self.type = "adadelta"
-        self._epsilon = epsilon
-        self._rho = rho
-
-    def _create_accumulators(self, block, parameters):
-        if not isinstance(block, framework.Block):
-            raise TypeError("block is not instance of framework.Block.")
-
-        for p in parameters:
-            self._add_accumulator(self._avg_squared_grad_acc_str, p)
-            self._add_accumulator(self._avg_squared_update_acc_str, p)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        if not isinstance(block, framework.Block):
-            raise TypeError("block is not instance of framework.Block.")
-
-        avg_squared_grad_acc = self._get_accumulator(
-            self._avg_squared_grad_acc_str, param_and_grad[0])
-        avg_squared_update_acc = self._get_accumulator(
-            self._avg_squared_update_acc_str, param_and_grad[0])
-
-        # Create the adadelta optimizer op
-        adadelta_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "AvgSquaredGrad": avg_squared_grad_acc,
-                "AvgSquaredUpdate": avg_squared_update_acc
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "AvgSquaredGradOut": avg_squared_grad_acc,
-                "AvgSquaredUpdateOut": avg_squared_update_acc
-            },
-            attrs={"epsilon": self._epsilon,
-                   "rho": self._rho},
-            stop_gradient=True)
-
-        return adadelta_op
-
-
-class RMSPropOptimizer(Optimizer):
-    """
-    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
-    rate method. The original slides proposed RMSProp: Slide 29 of
-    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
-
-    The original equation is as follows:
-
-    ..  math::
-
-        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
-
-        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
-
-    The first equation calculates moving average of the squared gradient for
-    each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
-
-    In some cases, adding a momentum term :math: `\\beta` is beneficial.
-    In our implementation, Nesterov momentum is used:
-
-    ..  math::
-
-        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
-
-        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
-            \\epsilon}} \\nabla Q_{i}(w)
-
-        w & = w - v(w, t)
-
-    if centered is True:
-
-    ..  math::
-
-        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
-
-        g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
-
-        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
-            \\epsilon}} \\nabla Q_{i}(w)
-
-        w & = w - v(w, t)
-
-    where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
-    and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a
-    smoothing term to avoid division by zero, usually set somewhere in range
-    from 1e-4 to 1e-8.
-
-
-    Args:
-        learning_rate(float): global learning rate.
-        rho(float): rho is :math: `\\rho` in equation, set 0.95 by default.
-        epsilon(float): :math: `\\epsilon` in equation is smoothing term to
-            avoid division by zero, set 1e-6 by default.
-        momentum(float): :math:`\\beta` in equation is the momentum term,
-            set 0.0 by default.
-        centered(bool): If True, gradients are normalized by the estimated variance of
-            the gradient; if False, by the uncentered second moment. Setting this to
-            True may help with training, but is slightly more expensive in terms of
-            computation and memory. Defaults to False.
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-
-    Raises:
-        ValueError: If learning_rate, rho, epsilon, momentum are None.
-
-    Examples:
-          .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-            import numpy as np
-
-            place = fluid.CPUPlace()
-            main = fluid.Program()
-            with fluid.program_guard(main):
-                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-                y_predict = fluid.layers.fc(input=x, size=1, act=None)
-                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-                avg_cost = fluid.layers.mean(cost)
-
-                rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
-                rms_optimizer.minimize(avg_cost)
-
-                fetch_list = [avg_cost]
-                train_reader = paddle.batch(
-                    paddle.dataset.uci_housing.train(), batch_size=1)
-                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-                exe = fluid.Executor(place)
-                exe.run(fluid.default_startup_program())
-                for data in train_reader():
-                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
-    """
-
-    _momentum_acc_str = "momentum"
-    _mean_square_acc_str = "mean_square"
-    _mean_grad_acc_str = "mean_grad"
-
-    def __init__(self,
-                 learning_rate,
-                 rho=0.95,
-                 epsilon=1.0e-6,
-                 momentum=0.0,
-                 centered=False,
-                 regularization=None,
-                 name=None):
-        super(RMSPropOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        if learning_rate is None:
-            raise ValueError("learning_rate is not set.")
-        if rho is None:
-            raise ValueError("rho is not set.")
-        if epsilon is None:
-            raise ValueError("epsilon is not set.")
-        if momentum is None:
-            raise ValueError("momentum is not set.")
-
-        self.type = "rmsprop"
-        self._rho = rho
-        self._epsilon = epsilon
-        self._momentum = momentum
-        self._centered = centered
-
-    def _create_accumulators(self, block, parameters):
-        if not isinstance(block, framework.Block):
-            raise TypeError("block is not instance of framework.Block.")
-
-        for p in parameters:
-            self._add_accumulator(self._momentum_acc_str, p)
-            self._add_accumulator(self._mean_square_acc_str, p)
-            self._add_accumulator(self._mean_grad_acc_str, p)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        if not isinstance(block, framework.Block):
-            raise TypeError("block is not instance of framework.Block.")
-
-        momentum_acc = self._get_accumulator(self._momentum_acc_str,
-                                             param_and_grad[0])
-        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
-                                                param_and_grad[0])
-        mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
-                                              param_and_grad[0])
-        rmsprop_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Moment": momentum_acc,
-                "MeanSquare": mean_square_acc,
-                "MeanGrad": mean_grad_acc,
-                "LearningRate": self._create_param_lr(param_and_grad),
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "MomentOut": momentum_acc,
-                "MeanSquareOut": mean_square_acc,
-                "MeanGradOut": mean_grad_acc
-            },
-            attrs={
-                "epsilon": self._epsilon,
-                "decay": self._rho,
-                "momentum": self._momentum,
-                "centered": self._centered
-            },
-            stop_gradient=True)
-
-        return rmsprop_op
-
-
-class FtrlOptimizer(Optimizer):
-    """
-    FTRL (Follow The Regularized Leader) Optimizer.
-
-    The paper that proposed Follow The Regularized Leader (FTRL):
-    (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
-
-    ..  math::
-
-        &new\_accum = squared\_accum + grad^2
-
-        &if (lr\_power == -0.5):
-
-        &\quad  linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}
-
-        &else:
-
-        &\quad   linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param}
-
-
-        &x = l1 * sign(linear\_accum) - linear\_accum
-
-        &if (lr\_power == -0.5):
-
-        &\quad   y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)
-
-        &\quad   pre\_shrink = \\frac{x}{y}
-
-        &\quad   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
-
-        &else:
-
-        &\quad   y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)
-
-        &\quad   pre\_shrink = \\frac{x}{y}
-
-        &\quad   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
-
-        &squared\_accum += grad^2
-
-    Args:
-        learning_rate (float|Variable): global learning rate.
-        l1 (float): L1 regularization strength.
-        l2 (float): L2 regularization strength.
-        lr_power (float): Learning Rate Power.
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-
-    Raises:
-        ValueError: If learning_rate, rho, epsilon, momentum are None.
-
-    Examples:
-          .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-            import numpy as np
-
-            place = fluid.CPUPlace()
-            main = fluid.Program()
-            with fluid.program_guard(main):
-                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-                y_predict = fluid.layers.fc(input=x, size=1, act=None)
-                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-                avg_cost = fluid.layers.mean(cost)
-
-                ftrl_optimizer = fluid.optimizer.Ftrl(learning_rate=0.1)
-                ftrl_optimizer.minimize(avg_cost)
-
-                fetch_list = [avg_cost]
-                train_reader = paddle.batch(
-                    paddle.dataset.uci_housing.train(), batch_size=1)
-                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-                exe = fluid.Executor(place)
-                exe.run(fluid.default_startup_program())
-                for data in train_reader():
-                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
-    Notes:
-       Currently, FtrlOptimizer doesn't support sparse parameter optimization.
-    """
-
-    _squared_acc_str = "squared"
-    _linear_acc_str = "linear"
-
-    def __init__(self,
-                 learning_rate,
-                 l1=0.0,
-                 l2=0.0,
-                 lr_power=-0.5,
-                 regularization=None,
-                 name=None):
-        super(FtrlOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            name=name)
-        if learning_rate is None:
-            raise ValueError("learning_rate is not set.")
-
-        self.type = "ftrl"
-        self._l1 = l1
-        self._l2 = l2
-        self._lr_power = lr_power
-
-    def _create_accumulators(self, block, parameters):
-        if not isinstance(block, framework.Block):
-            raise TypeError("block is not instance of framework.Block.")
-
-        for p in parameters:
-            self._add_accumulator(self._squared_acc_str, p)
-            self._add_accumulator(self._linear_acc_str, p)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        if not isinstance(block, framework.Block):
-            raise TypeError("block is not instance of framework.Block.")
-
-        squared_acc = self._get_accumulator(self._squared_acc_str,
-                                            param_and_grad[0])
-        linear_acc = self._get_accumulator(self._linear_acc_str,
-                                           param_and_grad[0])
-        ftrl_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "SquaredAccumulator": squared_acc,
-                "LinearAccumulator": linear_acc,
-                "LearningRate": self._create_param_lr(param_and_grad),
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "SquaredAccumOut": squared_acc,
-                "LinearAccumOut": linear_acc
-            },
-            attrs={"l1": self._l1,
-                   "l2": self._l1,
-                   "lr_power": self._lr_power},
-            stop_gradient=True)
-
-        return ftrl_op
-
-
-class LambOptimizer(AdamOptimizer):
-    """
-    LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.
-
-    LAMB Optimizer is designed to scale up the batch size of training without losing 
-    accuracy, which supports adaptive element-wise updating and accurate layer-wise 
-    correction. For more information, please refer to `Large Batch Optimization for 
-    Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .
-
-    The updating of parameters follows:
-
-    ..  math::
-
-        m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t \\
-
-        v_t &= \\beta_2 v_{t - 1}  + (1 - \\beta_2)g_t^2 \\
-
-        r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon} \\
-
-        w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1})
-
-
-    where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the 
-    learning rate, :math:`\\lambda` the LAMB weight decay rate.
-
-    Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-                                        Can be a float value or a Variable with one \
-                                        float value as data element.
-        lamb_weight_decay (float): The LAMB weight decay rate.
-        beta1 (float): The exponential decay rate for the 1st moment estimates.
-        beta2 (float): The exponential decay rate for the 2nd moment estimates.
-        epsilon (float): A small float value for numerical stability.
-        regularization (Regularizer): A Regularizer, such as
-                        fluid.regularizer.L1DecayRegularizer.
-        exclude_from_weight_decay_fn (function): Exclude a parameter from weight 
-            decay when **exclude_from_weight_decay_fn(parameter)** returns true.
-        name (str|None): An optional name prefix.
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid 
-
-            data = fluid.layers.data(name='x', shape=[5], dtype='float32')
-            hidden = fluid.layers.fc(input=data, size=10)
-            cost = fluid.layers.mean(hidden)
-
-            def exclude_fn(param):
-                return param.name.endswith('.b_0')
-
-            optimizer = fluid.optimizer.Lamb(learning_rate=0.002,
-                                             exclude_from_weight_decay_fn=exclude_fn)
-            optimizer.minimize(cost)
-    """
-    _moment1_acc_str = "moment1"
-    _moment2_acc_str = "moment2"
-    # these two not used in op temporarily
-    _beta1_pow_acc_str = "beta1_pow_acc"
-    _beta2_pow_acc_str = "beta2_pow_acc"
-
-    def __init__(self,
-                 learning_rate=0.001,
-                 lamb_weight_decay=0.01,
-                 beta1=0.9,
-                 beta2=0.999,
-                 epsilon=1e-6,
-                 regularization=None,
-                 exclude_from_weight_decay_fn=None,
-                 name=None):
-        assert learning_rate is not None
-        assert lamb_weight_decay is not None
-        assert beta1 is not None
-        assert beta2 is not None
-        assert epsilon is not None
-        super(LambOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            regularization=regularization,
-            beta1=beta1,
-            beta2=beta2,
-            epsilon=epsilon,
-            name=name)
-        self.type = "lamb"
-        self._weight_decay = lamb_weight_decay
-        self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn
-
-    def _append_optimize_op(self, block, param_and_grad):
-        assert isinstance(block, framework.Block)
-        block.program._use_lamb = True
-
-        moment1 = self._get_accumulator(self._moment1_acc_str,
-                                        param_and_grad[0])
-        moment2 = self._get_accumulator(self._moment2_acc_str,
-                                        param_and_grad[0])
-        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                              param_and_grad[0])
-        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
-                                              param_and_grad[0])
-
-        if self._exclude_from_weight_decay_fn is not None \
-            and self._exclude_from_weight_decay_fn(param_and_grad[0]):
-            weight_decay = 0.0
-        else:
-            weight_decay = self._weight_decay
-
-        # create the lamb optimize op
-        lamb_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "LearningRate": self._create_param_lr(param_and_grad),
-                "Moment1": moment1,
-                "Moment2": moment2,
-                "Beta1Pow": beta1_pow_acc,
-                "Beta2Pow": beta2_pow_acc
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "Moment1Out": moment1,
-                "Moment2Out": moment2
-            },
-            attrs={
-                "beta1": self._beta1,
-                "beta2": self._beta2,
-                "epsilon": self._epsilon,
-                "weight_decay": weight_decay
-            },
-            stop_gradient=True)
-
-        return lamb_op
-
-
-# We short the class name, since users will use the optimizer with the package
-# name. The sample code:
-#
-# import paddle.fluid as fluid
-#
-# sgd = fluid.optimizer.SGD(...)
-#
-# It is no need to add an `Optimizer` as the class suffix
-SGD = SGDOptimizer
-Momentum = MomentumOptimizer
-Adagrad = AdagradOptimizer
-Adam = AdamOptimizer
-Adamax = AdamaxOptimizer
-Dpsgd = DpsgdOptimizer
-DecayedAdagrad = DecayedAdagradOptimizer
-Adadelta = AdadeltaOptimizer
-RMSProp = RMSPropOptimizer
-Ftrl = FtrlOptimizer
-LarsMomentum = LarsMomentumOptimizer
-Lamb = LambOptimizer
-
-
-class ModelAverage(Optimizer):
-    """Accumulate the average of parameters within sliding window. The average
-    result will be saved in temporary variables which can be applied to
-    parameter variables of current model by calling 'apply()' method. And the
-    'restore()' method is used to restore the parameter values of current model.
-
-    The size of average window is determined by average_window_rate,
-    min_average_window, max_average_window and current update times.
-
-    Args:
-        average_window_rate: The rate of average window.
-        min_average_window: The minimum size of average window.
-        max_average_window: The maximum size of average window.
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
-
-    Examples:
-
-      .. code-block:: python
-
-        import paddle.fluid as fluid
-        import numpy
-
-        # First create the Executor.
-        place = fluid.CPUPlace()  # fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_program):
-            # build net
-            data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-            hidden = fluid.layers.fc(input=data, size=10)
-            loss = fluid.layers.mean(hidden)
-            optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
-            optimizer.minimize(loss)
-
-            # build ModelAverage optimizer
-            model_average = fluid.optimizer.ModelAverage(0.15,
-                                                         min_average_window=10000,
-                                                         max_average_window=20000)
-
-            exe.run(startup_program)
-            x = numpy.random.random(size=(10, 1)).astype('float32')
-            outs = exe.run(program=train_program,
-                           feed={'X': x},
-                           fetch_list=[loss.name])
-
-            # apply ModelAverage
-            with model_average.apply(exe):
-                x = numpy.random.random(size=(10, 1)).astype('float32')
-                exe.run(program=train_program,
-                        feed={'X': x},
-                        fetch_list=[loss.name])
-    """
-
-    def __init__(self,
-                 average_window_rate,
-                 min_average_window=10000,
-                 max_average_window=10000,
-                 regularization=None,
-                 name=None):
-        super(ModelAverage, self).__init__(
-            0.0, regularization=regularization, name=name)
-        self.average_window = average_window_rate
-        self.min_average_window = min_average_window
-        self.max_average_window = max_average_window
-
-        self.params_grads = []
-        for param in framework.default_main_program().global_block(
-        ).all_parameters():
-            if param.do_model_average != False:
-                grad = param.block.create_var(
-                    name=unique_name.generate_with_ignorable_key(".".join(
-                        [param.name, 'tmp'])),
-                    dtype=param.dtype,
-                    persistable=False,
-                    stop_gradient=True)
-                self.params_grads.append((param, grad))
-
-        for param, grad in self.params_grads:
-            if grad is None:
-                continue
-            with param.block.program._optimized_guard(
-                [param, grad]), name_scope('move_average'):
-                self._append_average_accumulate_op(param)
-
-        self.apply_program = Program()
-        block = self.apply_program.global_block()
-        with program_guard(main_program=self.apply_program):
-            for param_grad in self.params_grads:
-                self._add_average_apply_op(block, param_grad)
-
-        self.restore_program = Program()
-        block = self.restore_program.global_block()
-        with program_guard(main_program=self.restore_program):
-            for param_grad in self.params_grads:
-                self._add_average_restore_op(block, param_grad)
-
-    def _add_average_apply_op(self, block, param_grad):
-        param = block._clone_variable(param_grad[0])
-        grad = block._clone_variable(param_grad[1])
-        sum_1 = block._clone_variable(self._get_accumulator('sum_1', param))
-        sum_2 = block._clone_variable(self._get_accumulator('sum_2', param))
-        sum_3 = block._clone_variable(self._get_accumulator('sum_3', param))
-        num_accumulates = block._clone_variable(
-            self._get_accumulator('num_accumulates', param))
-        old_num_accumulates = block._clone_variable(
-            self._get_accumulator('old_num_accumulates', param))
-        num_updates = block._clone_variable(
-            self._get_accumulator('num_updates', param))
-        # backup param value to grad
-        layers.assign(input=param, output=grad)
-        # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
-        tmp = layers.sum(x=[num_accumulates, old_num_accumulates])
-        sum = layers.sum(x=[sum_1, sum_2, sum_3])
-        tmp = layers.cast(
-            x=tmp, dtype='float32' if self._dtype == None else self._dtype)
-        sum = layers.cast(
-            x=sum, dtype='float32' if self._dtype == None else self._dtype)
-        ops._elementwise_div(x=sum, y=tmp, out=param)
-
-    def _add_average_restore_op(self, block, param_grad):
-        param = block._clone_variable(param_grad[0])
-        grad = block._clone_variable(param_grad[1])
-        layers.assign(input=grad, output=param)
-
-    def _append_average_accumulate_op(self, param):
-        self.helper = LayerHelper("average_accumulate")
-        sum_1 = self._add_accumulator('sum_1', param)
-        sum_2 = self._add_accumulator('sum_2', param)
-        sum_3 = self._add_accumulator('sum_3', param)
-        num_accumulates = self._add_accumulator(
-            'num_accumulates', param, dtype='int64', shape=[1])
-        old_num_accumulates = self._add_accumulator(
-            'old_num_accumulates', param, dtype='int64', shape=[1])
-        num_updates = self._add_accumulator(
-            'num_updates', param, dtype='int64', shape=[1])
-
-        self.helper.append_op(
-            type='average_accumulates',
-            inputs={
-                "param": param,
-                "in_sum_1": sum_1,
-                "in_sum_2": sum_2,
-                "in_sum_3": sum_3,
-                "in_num_accumulates": num_accumulates,
-                "in_old_num_accumulates": old_num_accumulates,
-                "in_num_updates": num_updates
-            },
-            outputs={
-                "out_sum_1": sum_1,
-                "out_sum_2": sum_2,
-                "out_sum_3": sum_3,
-                "out_num_accumulates": num_accumulates,
-                "out_old_num_accumulates": old_num_accumulates,
-                "out_num_updates": num_updates,
-            },
-            attrs={
-                "average_window": self.average_window,
-                "min_average_window": self.min_average_window,
-                "max_average_window": self.max_average_window,
-            },
-            stop_gradient=True)
-
-    @signature_safe_contextmanager
-    def apply(self, executor, need_restore=True):
-        """Apply average values to parameters of current model.
-
-        Args:
-            executor(fluid.Executor): current executor.
-            need_restore(bool): If you finally need to do restore, set it to True. Default is True.
-        """
-        executor.run(self.apply_program)
-        try:
-            yield
-        finally:
-            if need_restore:
-                self.restore(executor)
-
-    def restore(self, executor):
-        """Restore parameter values of current model.
-        
-        Args:
-            executor(fluid.Executor): current executor.
-        """
-        executor.run(self.restore_program)
-
-
-class ExponentialMovingAverage(object):
-    """
-    Compute the moving average of parameters with exponential decay.
-    Given a parameter :math:`\\theta`, its exponential moving average (EMA)
-    will be
-
-    ..  math::
-
-        \\text{EMA}_0 & = 0
-
-	\\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t
-
-    The average results calculated by **update()** method will be saved in 
-    temporary variables which are created and maintained by the object, and can 
-    be applied to parameters of current model by calling **apply()** method. And 
-    the **restore()** method is used to restore the parameters.
-
-    **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be 
-    zero biased, which can be corrected by divided by a factor 
-    :math:`(1 - \\text{decay}^t)` , i.e., the actual EMAs applied to parameters 
-    when calling **apply()** method would be 
-
-    ..  math::
-    
-        \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t}
-
-    **Decay rate scheduling**. A large decay rate very close to 1 would result 
-    in that the averages move very slowly. And a better strategy is to set a 
-    relative smaller decay rate in the very beginning. The argument **thres_steps**
-    allows users to pass a Variable to schedule the decay rate, in this case, 
-    the actual decay rate becomes
-     
-    ..  math::
-    
-        \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}})
-
-    Usually **thres_steps** can be the global training steps.
-
-
-    Args:
-	decay (float): The exponential decay rate, usually close to 1, such as 
-                       0.999, 0.9999, ... .
-        thres_steps (Variable|None): If not `None`, schedule the decay rate.
-	name (str|None): An optional name prefix.
-
-
-    Examples:
-
-	.. code-block:: python
-
-	    import numpy
-	    import paddle
-	    import paddle.fluid as fluid
-
-	    data = fluid.layers.data(name='x', shape=[5], dtype='float32')
-	    hidden = fluid.layers.fc(input=data, size=10)
-	    cost = fluid.layers.mean(hidden)
-
-	    test_program = fluid.default_main_program().clone(for_test=True)
-
-	    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-	    optimizer.minimize(cost)
-
-	    global_steps = fluid.layers.learning_rate_scheduler._decay_step_counter()
-	    ema = fluid.optimizer.ExponentialMovingAverage(0.999, thres_steps=global_steps)
-	    ema.update()
-
-	    place = fluid.CPUPlace()
-	    exe = fluid.Executor(place)
-	    exe.run(fluid.default_startup_program())
-
-	    for pass_id in range(3):
-		for batch_id in range(6):
-		    data = numpy.random.random(size=(10, 5)).astype('float32')
-		    exe.run(program=fluid.default_main_program(),
-			feed={'x': data}, 
-			fetch_list=[cost.name])
-
-		# usage 1
-		with ema.apply(exe):
-		    data = numpy.random.random(size=(10, 5)).astype('float32')
-		    exe.run(program=test_program,
-			    feed={'x': data}, 
-			    fetch_list=[hidden.name])
-			    
-
-		 # usage 2
-		with ema.apply(exe, need_restore=False):
-		    data = numpy.random.random(size=(10, 5)).astype('float32')
-		    exe.run(program=test_program,
-			    feed={'x': data}, 
-			    fetch_list=[hidden.name])
-		ema.restore(exe)
-    """
-
-    def __init__(self, decay=0.999, thres_steps=None, name=None):
-        self._decay = decay
-        self._thres_steps = thres_steps
-        self._name = name if name is not None else ''
-        self._decay_var = self._get_ema_decay()
-
-        self._params_tmps = []
-        for param in default_main_program().global_block().all_parameters():
-            if param.do_model_average != False:
-                tmp = param.block.create_var(
-                    name=unique_name.generate(".".join(
-                        [self._name + param.name, 'ema_tmp'])),
-                    dtype=param.dtype,
-                    persistable=False,
-                    stop_gradient=True)
-                self._params_tmps.append((param, tmp))
-
-        self._ema_vars = {}
-        for param, tmp in self._params_tmps:
-            with param.block.program._optimized_guard(
-                [param, tmp]), name_scope('moving_average'):
-                self._ema_vars[param.name] = self._create_ema_vars(param)
-
-        self.apply_program = Program()
-        block = self.apply_program.global_block()
-        with program_guard(main_program=self.apply_program):
-            decay_pow = self._get_decay_pow(block)
-            for param, tmp in self._params_tmps:
-                param = block._clone_variable(param)
-                tmp = block._clone_variable(tmp)
-                ema = block._clone_variable(self._ema_vars[param.name])
-                layers.assign(input=param, output=tmp)
-                # bias correction
-                ema = ema / (1.0 - decay_pow)
-                layers.assign(input=ema, output=param)
-
-        self.restore_program = Program()
-        block = self.restore_program.global_block()
-        with program_guard(main_program=self.restore_program):
-            for param, tmp in self._params_tmps:
-                tmp = block._clone_variable(tmp)
-                param = block._clone_variable(param)
-                layers.assign(input=tmp, output=param)
-
-    def _get_ema_decay(self):
-        with default_main_program()._lr_schedule_guard():
-            decay_var = layers.tensor.create_global_var(
-                shape=[1],
-                value=self._decay,
-                dtype='float32',
-                persistable=True,
-                name="scheduled_ema_decay_rate")
-
-            if self._thres_steps is not None:
-                decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0)
-                with layers.control_flow.Switch() as switch:
-                    with switch.case(decay_t < self._decay):
-                        layers.tensor.assign(decay_t, decay_var)
-                    with switch.default():
-                        layers.tensor.assign(
-                            np.array(
-                                [self._decay], dtype=np.float32),
-                            decay_var)
-        return decay_var
-
-    def _get_decay_pow(self, block):
-        global_steps = layers.learning_rate_scheduler._decay_step_counter()
-        decay_var = block._clone_variable(self._decay_var)
-        decay_pow_acc = layers.elementwise_pow(decay_var, global_steps + 1)
-        return decay_pow_acc
-
-    def _create_ema_vars(self, param):
-        param_ema = layers.create_global_var(
-            name=unique_name.generate(self._name + param.name + '_ema'),
-            shape=param.shape,
-            value=0.0,
-            dtype=param.dtype,
-            persistable=True)
-
-        return param_ema
-
-    def update(self):
-        """ 
-        Update Exponential Moving Average. Should only call this method in 
-        train program.
-        """
-        param_master_emas = []
-        for param, tmp in self._params_tmps:
-            with param.block.program._optimized_guard(
-                [param, tmp]), name_scope('moving_average'):
-                param_ema = self._ema_vars[param.name]
-                if param.name + '.master' in self._ema_vars:
-                    master_ema = self._ema_vars[param.name + '.master']
-                    param_master_emas.append([param_ema, master_ema])
-                else:
-                    ema_t = param_ema * self._decay_var + param * (
-                        1 - self._decay_var)
-                    layers.assign(input=ema_t, output=param_ema)
-
-        # for fp16 params
-        for param_ema, master_ema in param_master_emas:
-            default_main_program().global_block().append_op(
-                type="cast",
-                inputs={"X": master_ema},
-                outputs={"Out": param_ema},
-                attrs={
-                    "in_dtype": master_ema.dtype,
-                    "out_dtype": param_ema.dtype
-                })
-
-    @signature_safe_contextmanager
-    def apply(self, executor, need_restore=True):
-        """
-        Apply moving average to parameters for evaluation.
-        
-        Args:
-            executor (Executor): The Executor to execute applying.
-            need_restore (bool): Whether to restore parameters after applying.
-        """
-        executor.run(self.apply_program)
-        try:
-            yield
-        finally:
-            if need_restore:
-                self.restore(executor)
-
-    def restore(self, executor):
-        """Restore parameters.
-        
-        Args:
-            executor (Executor): The Executor to execute restoring.
-        """
-        executor.run(self.restore_program)
-
-
-class PipelineOptimizer(object):
-    """
-    Pipeline Optimizer
-
-    Train with pipeline mode. The program will be splited by cut_list. 
-
-    If the len of cut_list is k, then the whole program (including \
-    backward part) will be splited to 2*k-1 sections. 
-    
-    So the length of place_list and concurrency_list must be also 2*k-1.
-
-    Note: Though the asynchronous mode is applied in pipeline training to speed up, \
-    the final performance depends on the training progress of each pipeline heavily.
-
-    And we will try the synchronous mode in the future.
-
-    Args:
-        optimizer (Optimizer): The based optimizer, such as SGD.
-        cut_list (list of Variable list): The cut variable of the main_program.
-        place_list (list of Place): The place where the section will run on.
-        concurrency_list (list of int): The concurrency degree.
-        queue_size (int): Each section will consume scopes from its in-scope queue 
-                        and produce scopes to out-scope queue. And this parameter 
-                        specify the scope queue size. [Optional. Default: 30].
-        sync_steps (int): The synchronization steps between different cards. [Optional. Default: 1].
-        start_cpu_core_id (int): specify the first cpu core id. [Optional. Default:0].
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-
-            x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
-            y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
-            emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
-            emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
-            concat = layers.concat([emb_x, emb_y], axis=1)
-            fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
-            loss = layers.reduce_mean(fc)
-            optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-            optimizer = fluid.optimizer.PipelineOptimizer(optimizer,
-                    cut_list=[[emb_x, emb_y], [loss]],
-                    place_list=[fluid.CPUPlace(), fluid.CUDAPlace(0), fluid.CPUPlace()],
-                    concurrency_list=[1, 1, 4],
-                    queue_size=2,
-                    sync_steps=1,
-                    )
-            optimizer.minimize(loss)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"]
-            dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
-            dataset.set_use_var([x,y])
-            dataset.set_batch_size(batch_size)
-            dataset.set_filelist(filelist)
-            exe.train_from_dataset(
-                        fluid.default_main_program(),
-                        dataset,
-                        thread=2,
-                        debug=False,
-                        fetch_list=[],
-                        fetch_info=[],
-                        print_period=1)
-    """
-
-    def __init__(self,
-                 optimizer,
-                 cut_list=None,
-                 place_list=None,
-                 concurrency_list=None,
-                 queue_size=30,
-                 sync_steps=1,
-                 start_cpu_core_id=0):
-        # TODO: check properties
-        self._optimizer = optimizer
-        self._cut_list = cut_list
-        self._place_list = place_list
-        self._concurrency_list = concurrency_list
-        self._queue_size = queue_size
-        self._sync_steps = sync_steps
-        self._start_cpu_core_id = start_cpu_core_id
-
-    def _create_vars(self, block, main_program):
-        used_var_set = set()
-        for op_idx in range(block.desc.op_size()):
-            op_desc = block.desc.op(op_idx)
-            vars = op_desc.input_arg_names() + op_desc.output_arg_names()
-            for var in vars:
-                if var in used_var_set:
-                    continue
-                used_var_set.add(var)
-                source_var = main_program.block(0).var(str(var))
-                block._clone_variable(source_var, False)
-
-    def _extract_section_opt_ops(self, ops, cut_point_name):
-        """
-        Extract opt ops in the given section
-        """
-        output_names = set(cut_point_name)
-        relevant_op_flags = [True] * len(ops)
-        for i, op in reversed(list(enumerate(ops))):
-            if _some_in_set_(op.desc.output_arg_names(), output_names):
-                for name in op.desc.input_arg_names():
-                    output_names.add(name)
-            else:
-                relevant_op_flags[i] = False
-
-        op_path = [ops[i] for i in range(len(ops)) if relevant_op_flags[i]]
-        return op_path
-
-    def _find_input_output(self, ops, name, is_forward=True):
-        """
-        Find the inputs or outputs of a section
-        """
-        all_set = set()
-        part_set = set()
-        for op in ops:
-            if is_forward:
-                part_set.update(op.desc.output_arg_names())
-            else:
-                part_set.update(op.desc.input_arg_names())
-            all_set.update(op.desc.output_arg_names())
-            all_set.update(op.desc.input_arg_names())
-        return all_set - part_set
-
-    def _find_persistable_vars(self, ops, whole_parameters):
-        """
-        find the persistable input vars in current section
-        """
-        res = set()
-        for op in ops:
-            vars = op.desc.input_arg_names()
-            for var in vars:
-                if var in whole_parameters:
-                    res.add(var)
-        return res
-
-    def _is_opt_role_op(self, op):
-        op_maker = core.op_proto_and_checker_maker
-        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
-        if op_maker.kOpRoleAttrName() in op.attr_names and \
-                int(op.all_attrs()[op_maker.kOpRoleAttrName()]) & int(optimize_role) != 0:
-            return True
-        return False
-
-    def _is_lr_role_op(self, op):
-        op_maker = core.op_proto_and_checker_maker
-        optimize_role = core.op_proto_and_checker_maker.OpRole.LRSched
-        if op_maker.kOpRoleAttrName() in op.attr_names and \
-                int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
-            return True
-        return False
-
-    def _extract_section_ops(self, ops, cut_point_name):
-        """
-        Extract ops in the given section 
-        """
-        output_names = set(cut_point_name)
-        relevant_op_flags = [True] * len(ops)
-        for i, op in reversed(list(enumerate(ops))):
-            if not self._is_opt_role_op(op) and _some_in_set_(
-                    op.desc.output_arg_names(), output_names):
-                for name in op.desc.input_arg_names():
-                    output_names.add(name)
-            elif op.desc.type() == "print" and op.desc.input_arg_names()[
-                    0] in output_names:
-                continue
-            else:
-                relevant_op_flags[i] = False
-
-        op_path = [ops[i] for i in range(len(ops)) if relevant_op_flags[i]]
-        return op_path
-
-    def _find_section_opt(self, ops, params):
-        res = self._extract_section_opt_ops(ops, params)
-        return res
-
-    def _split_program(self, main_program, cut_list):
-        programs = []
-        block = main_program.block(0)
-        whole_parameters = [e.name for e in block.all_parameters()]
-        cut_var_names = []
-        cut_len = len(cut_list)
-        sec_params = []
-        for i, cut_vars in enumerate(cut_list[:-1]):
-            cut_var_names.append([cut_var.name for cut_var in cut_vars])
-        for i, cut_vars in reversed(list(enumerate(cut_list[:-1]))):
-            cut_var_names.append(
-                [_append_grad_suffix_(cut_var.name) for cut_var in cut_vars])
-            if i == 0:
-                cut_var_names[-1] += [var.name for var in cut_list[-1]]
-        ops = block.ops[:]
-        for i, cut_vars in enumerate(cut_var_names):
-            program = {
-                "program": Program(),
-                "input_set": set(),
-                "output_set": set()
-            }
-            cur_ops = self._extract_section_ops(ops, cut_vars)
-            if i == 0:
-                for op in ops:
-                    if self._is_lr_role_op(op):
-                        cur_ops.append(op)
-            #prevent inplace in/out
-            program["input_set"].update(
-                self._find_input_output(
-                    cur_ops, [], is_forward=True))
-            for e in cur_ops:
-                ops.remove(e)
-
-            if i < cut_len:
-                sec_params.append(
-                    self._find_persistable_vars(cur_ops, whole_parameters))
-            if i >= cut_len - 1:
-                opt_ops = self._find_section_opt(
-                    ops, sec_params[2 * cut_len - 2 - i])
-
-                for e in opt_ops:
-                    ops.remove(e)
-                cur_ops += opt_ops
-
-            op_descs = [op.desc for op in cur_ops]
-            for op_desc in op_descs:
-                ap_op = program["program"].block(0).desc.append_op()
-                ap_op.copy_from(op_desc)
-            program["input_set"].update(
-                self._find_input_output(
-                    cur_ops, cut_vars, is_forward=True))
-            program["input_set"].update(sec_params[min(i, 2 * cut_len - 2 - i)])
-            program["output_set"].update(
-                self._find_input_output(
-                    cur_ops, cut_vars, is_forward=False))
-            programs.append(program)
-        program = {
-            "program": Program(),
-            "input_set": set(),
-            "output_set": set()
-        }
-        op_descs = [op.desc for op in ops]
-        for op_desc in op_descs:
-            ap_op = program["program"].block(0).desc.append_op()
-            ap_op.copy_from(op_desc)
-        program["input_set"].update(
-            [cut_var.name + "@GRAD" for cut_var in cut_list[0]])
-        program["input_set"].update(
-            self._find_input_output(
-                ops, [], is_forward=True))
-        program["input_set"].update(sec_params[0])
-        programs.append(program)
-        inputs = set()
-        for program in reversed(list(programs)):
-            output_list = list(program["output_set"])
-            for output in output_list:
-                if output not in inputs:
-                    program["output_set"].remove(output)
-            inputs.update(program["input_set"])
-        return programs
-
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
-        self._optimizer.minimize(loss, startup_program, parameter_list,
-                                 no_grad_set)
-        program = loss.block.program
-        program_list = self._split_program(program, self._cut_list)
-        for p in program_list:
-            self._create_vars(p["program"].block(0), program)
-        whole_parameters = [e.name for e in program.block(0).all_parameters()]
-        param_need_sync = []
-        for i, section_p in enumerate(program_list):
-            if not isinstance(self._place_list[i], core.CUDAPlace):
-                continue
-            section_var = [e for e in section_p["program"].block(0).vars]
-            for p in section_var:
-                if p in whole_parameters:
-                    param_need_sync.append(p)
-        program._pipeline_opt = {
-            "trainer": "PipelineTrainer",
-            "device_worker": "Section",
-            "section_program_list": program_list,
-            "place_list": self._place_list,
-            "concurrency_list": self._concurrency_list,
-            "queue_size": self._queue_size,
-            "start_cpu_core_id": self._start_cpu_core_id,
-            "sync_steps": self._sync_steps,
-            "param_need_sync": param_need_sync
-        }
-
-
-class RecomputeOptimizer(Optimizer):
-    """
-    Recompute Optimizer Wrapper
-
-    Normally, a training step contains three sub-steps: first, run forward
-    Operators to calculate the loss; second, run backward Operators to 
-    calculate gradient of the parameters; third, apply optimization method
-    to update the value of the parameters.
-
-    In the forward computation process, all variables that are needed by 
-    backward computation process will be kept in memory, which occupy a great
-    amount of memory when the network becomes very deep.
-
-    Recompute split the network to k segments. In each segment, It will 
-    recompute the forward Operators, before running backward operators. It is
-    very helpful for saving memory.
- 
-    The Variables that separate a network to segments are called as checkpoints,
-    and users should set it manually. The usage is very simple:
-
-    Args:
-        optimizer (Optimizer): The optimizer that is applied to parameters.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            def gen_data():
-                return {"x": np.random.random(size=(32, 32)).astype('float32'),
-                "y": np.random.randint(2, size=(32, 1)).astype('int64')}
-            def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                print(input_x)
-                fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
-                prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
-                cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
-                sum_cost = fluid.layers.reduce_mean(cost)
-                return sum_cost, fc_1, prediction
-            input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
-            input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
-            cost, fc_1, pred = mlp(input_x, input_y)
-
-            sgd = fluid.optimizer.Adam(learning_rate=0.01)
-            sgd = fluid.optimizer.RecomputeOptimizer(sgd)
-            sgd._set_checkpoints([fc_1, pred])
-            sgd.minimize(cost)
-
-            print("Finished optimize")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            step = 10
-
-            for i in range(step):
-                cost_val = exe.run(feed=gen_data(),
-                       program=fluid.default_main_program(),
-                       fetch_list=[cost.name])
-                print("step=%d cost=%f" % (i, cost_val[0]))
-
-    """
-
-    def __init__(self, optimizer):
-        self._optimizer = optimizer
-        self._checkpoints = None
-
-    def _set_checkpoints(self, checkpoints):
-        self._checkpoints = checkpoints
-
-    def load(self, stat_dict):
-        """
-        load function is not supported by Recompute Optimizer for now.
-        :return: None
-
-        Args:
-            stat_dict: the dict load by load_persistable method
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                import paddle.compat as cpt
-                
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
-                    prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
-                    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
-                    sum_cost = fluid.layers.reduce_mean(cost)
-                    return sum_cost, fc_1, prediction
-                
-                input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
-                input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-                
-                sgd = fluid.optimizer.Adam(learning_rate=0.01)
-                sgd = fluid.optimizer.RecomputeOptimizer(sgd)
-                sgd._set_checkpoints([fc_1, pred])
-                try:
-                    stat_dict = {}
-                    sgd.load(stat_dict)
-                except NotImplementedError as e:
-                    print(cpt.get_exception_message(e))
-        """
-        raise NotImplementedError(
-            "load function is not supported by Recompute Optimizer for now")
-
-    def apply_gradients(self, params_grads):
-        """
-        call apply_gradients function of self._optimizer.
-
-        Args:
-            params_grads (list): list of (param, grad) pair to do optimization.
-
-        Returns:
-            list: A list of operators appended to the current program.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                import paddle.fluid.framework as framework
-
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
-                    prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
-                    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
-                    sum_cost = fluid.layers.reduce_mean(cost)
-                    return sum_cost, fc_1, prediction
-
-
-                input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
-                input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-
-                sgd = fluid.optimizer.Adam(learning_rate=0.01)
-                sgd = fluid.optimizer.RecomputeOptimizer(sgd)
-                params_grads = sgd.backward(
-                    cost,
-                    startup_program=None,
-                    parameter_list=None,
-                    no_grad_set=None,
-                    checkpoints=[fc_1, pred])
-
-                program = cost.block.program
-                with framework.program_guard(program, None):
-                    optimize_ops = sgd.apply_gradients(params_grads)
-
-                print("Finished apply gradients")
-        """
-
-        return self._optimizer.apply_gradients(params_grads=params_grads)
-
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None,
-                 checkpoints=None):
-        """
-        call append_backward with checkpoints.
-
-        Args:
-            loss (Variable): loss variable to run optimizations.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-            callbacks (list|None): list of callables to run when appending backward
-                operator for one parameter.
-            checkpoints (list): list of Variables as checkpoints
-
-        Examples:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-    
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
-                    prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
-                    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
-                    sum_cost = fluid.layers.reduce_mean(cost)
-                    return sum_cost, fc_1, prediction
-    
-    
-                input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
-                input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-    
-                sgd = fluid.optimizer.Adam(learning_rate=0.01)
-                sgd = fluid.optimizer.RecomputeOptimizer(sgd)
-                params_grads = sgd.backward(
-                    cost,
-                    startup_program=None,
-                    parameter_list=None,
-                    no_grad_set=None,
-                    checkpoints=[fc_1, pred])
-                print("Finished backward")
-        """
-
-        if framework.in_dygraph_mode():
-            raise NotImplementedError(
-                "DyGraph current does not support recompute")
-
-        self._dtype = loss.dtype
-        program = loss.block.program
-        with program_guard(program, startup_program):
-            params_grads = append_backward(
-                loss,
-                parameter_list,
-                no_grad_set,
-                checkpoints=self._checkpoints)
-        return params_grads
-
-    def apply_optimize(self, loss, startup_program, params_grads):
-        """
-        call the apply_optimize function of self._optimizer
-
-        Args:
-            loss (Variable): loss variable to run optimizations.
-            startup_program (Program): startup_program for initializing parameters
-                in `parameter_list`.
-            params_grads (list): list of (param, grad) pair to do optimization.
-
-        Examples:
-            .. code-block:: python
-                import paddle.fluid as fluid
-                
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
-                    prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
-                    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
-                    sum_cost = fluid.layers.reduce_mean(cost)
-                    return sum_cost, fc_1, prediction
-                
-                
-                input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
-                input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-                
-                sgd = fluid.optimizer.Adam(learning_rate=0.01)
-                sgd = fluid.optimizer.RecomputeOptimizer(sgd)
-                params_grads = sgd.backward(
-                    cost,
-                    startup_program=None,
-                    parameter_list=None,
-                    no_grad_set=None,
-                    checkpoints=[fc_1, pred])
-                
-                optimize_ops = sgd.apply_optimize(
-                    cost, startup_program=None, params_grads=params_grads)
-                
-                print("Finished apply_optimize")
-        """
-
-        return self._optimizer.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
-
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 grad_clip=None):
-
-        assert (isinstance(loss, Variable)), "The loss should be an Variable."
-        assert (self._checkpoints is not None
-                ), "You should call _set_checkpoints first"
-        if framework.in_dygraph_mode():
-            raise NotImplementedError(
-                "DyGraph current does not support recompute")
-
-        params_grads = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set,
-            checkpoints=self._checkpoints)
-
-        if grad_clip:
-            # TODO(guru4elephant): should add grad_clip for static graph
-            pass
-
-        optimize_ops = self.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
-
-        return optimize_ops, params_grads
-
-
-class LookaheadOptimizer(object):
-    """
-    This implements the Lookahead optimizer of the
-    paper : https://arxiv.org/abs/1907.08610.
-
-    Lookahead keeps two sets of params: the fast_params and
-    the slow_params. inner_optimizer update fast_params every 
-    training step. Lookahead updates the slow_params and fast_params 
-    every k training steps as follows:
-
-    .. math::
-        
-        slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1})
-	
-	fast\_param_t &=  slow\_param_t
-
-    Args:
-        inner_optimizer (Optimizer): The optimizer that update fast params step by step. 
-        alpha (float): The learning rate of Lookahead.
-        k (int): The slow params is updated every k steps.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-            import numpy as np
-
-	    x = fluid.layers.data(name='x', shape=[2], dtype='float32')
-	    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-	    y = fluid.layers.fc(input=[x], size=2, act="softmax")
-	    loss = fluid.layers.cross_entropy(input=y, label=label)
-	    loss = fluid.layers.mean(x=loss)
-	    sgd = fluid.optimizer.SGD(learning_rate=0.01)
-	    optimizer = fluid.optimizer.LookaheadOptimizer(sgd,
-                                            alpha=0.5,
-                                            k=5)
-	    optimizer.minimize(loss)
-	    main_program = fluid.default_main_program()
-	    place = fluid.CPUPlace()
-	    exe = fluid.Executor(place)
-	    exe.run(fluid.default_startup_program())
-
-	    feeder = fluid.DataFeeder(feed_list=[x, label], place=place)
-
-	    step = 0
-            while(step < 10):
-                step += 1
-		exe.run(fluid.default_main_program(),
-            	feed=feeder.feed(batch_data))
-
-    """
-
-    def __init__(self, inner_optimizer, alpha=0.5, k=5):
-
-        assert (inner_optimizer is not None), "inner optimizer can not be None"
-        assert (
-            0.0 <= alpha <= 1.0
-        ), "alpha should be larger or equal to 0.0, and less or equal than 1.0"
-        assert (isinstance(k, int) and k > 0), "k should be a positive integer"
-
-        self.inner_optimizer = inner_optimizer
-        self.alpha = alpha
-        self.k = k
-        self.type = "lookahead"
-
-    def minimize(self, loss, startup_program=None):
-
-        # Apply inner optimizer to the main_program
-        mini_out = self.inner_optimizer.minimize(
-            loss, startup_program=startup_program)
-
-        # Get startup_program and main_program
-        if startup_program is None:
-            startup_program = default_startup_program()
-        main_block = loss.block
-
-        # add some vars to the main_program
-        params = [param.name for param in main_block.all_parameters()]
-        param_to_slow = {}
-        for param in params:
-            fast_var = main_block.var(param)
-            assert (fast_var is not None)
-            slow_var = main_block.create_var(
-                name=param + "@SLOW",
-                shape=fast_var.shape,
-                dtype=fast_var.dtype,
-                persistable=True)
-            param_to_slow[param] = slow_var
-
-        # add some vars to the startup_program
-        startup_block = startup_program.global_block()
-        for param in params:
-            fast_var = startup_block.var(param)
-            assert (fast_var is not None)
-            slow_var = startup_block.create_var(
-                name=param + "@SLOW",
-                shape=fast_var.shape,
-                dtype=fast_var.dtype,
-                persistable=True)
-
-            startup_block.append_op(
-                type="assign",
-                inputs={"X": fast_var},
-                outputs={"Out": slow_var})
-
-        # Add Var k to main prog and startup prog
-        k = layers.create_global_var(
-            name="lookahead_k",
-            shape=[1],
-            value=int(self.k),
-            dtype='int32',
-            persistable=True)
-
-        # Add Var alpha to main prog and startup prog
-        alpha = layers.create_global_var(
-            name="lookahead_alpha",
-            shape=[1],
-            value=float(self.alpha),
-            dtype='float32',
-            persistable=True)
-
-        # Add Var step
-        step = layers.create_global_var(
-            name="lookahead_step",
-            shape=[1],
-            value=int(0),
-            dtype='int32',
-            persistable=True)
-        layers.increment(x=step, value=1.0, in_place=True)
-
-        # lookahead
-        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
-
-        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
-
-        mod = layers.elementwise_mod(step, k)
-        with layers.control_flow.Switch() as switch:
-            with switch.case(mod == zero_var):
-                for param_name in params:
-                    fast_var = main_block.var(param_name)
-                    slow_var = param_to_slow[param_name]
-                    tmp_var = layers.elementwise_add(
-                        layers.elementwise_mul(fast_var, alpha),
-                        layers.elementwise_mul(
-                            slow_var, layers.elementwise_sub(one_var, alpha)))
-                    layers.assign(input=tmp_var, output=slow_var)
-                    layers.assign(input=tmp_var, output=fast_var)
-            with switch.default():
-                pass
-        return mini_out
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
deleted file mode 100644
index b1594ee202874403c1aa4af06768151486a65946..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/parallel_executor.py
+++ /dev/null
@@ -1,349 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from . import core
-from . import framework
-from . import executor
-from . import compiler
-import sys
-
-__all__ = ['ParallelExecutor']
-
-ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
-BuildStrategy = core.ParallelExecutor.BuildStrategy
-
-
-class ParallelExecutor(object):
-    """
-    ParallelExecutor is designed for data parallelism, which focuses on distributing
-    the data across different nodes and every node operates on the data in parallel.
-    If you use ParallelExecutor to run the current program on GPU, the node means GPU
-    device, and ParallelExecutor will get the available GPU device automatically on
-    the current machine. If you use ParallelExecutor to run the current program on CPU,
-    the node means the CPU device, and you can specify the CPU device number by adding
-    'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
-    is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
-    of CPUs in the system.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-          import os
-
-          use_cuda = True
-          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-          # NOTE: If you use CPU to run the program, you need
-          # to specify the CPU_NUM, otherwise, fluid will use
-          # all the number of the logic core as the CPU_NUM,
-          # in that case, the batch size of the input should be
-          # greater than CPU_NUM, if not, the process will be
-          # failed by an exception.
-          if not use_cuda:
-              os.environ['CPU_NUM'] = str(2)
-
-          exe = fluid.Executor(place)
-
-          train_program = fluid.Program()
-          startup_program = fluid.Program()
-          with fluid.program_guard(train_program, startup_program):
-              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-              hidden = fluid.layers.fc(input=data, size=10)
-              loss = fluid.layers.mean(hidden)
-              test_program = fluid.default_main_program().clone(for_test=True)
-              fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
-
-          startup_program.random_seed=1
-          exe.run(startup_program)
-
-          train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                             main_program=train_program,
-                                             loss_name=loss.name)
-          test_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                            main_program=test_program,
-                                            share_vars_from=train_exe)
-
-          x = numpy.random.random(size=(10, 1)).astype('float32')
-          loss_data, = train_exe.run(feed={"X": x},
-                                     fetch_list=[loss.name])
-
-          loss_data, = test_exe.run(feed={"X": x},
-                                    fetch_list=[loss.name])
-
-    Args:
-        use_cuda (bool): Whether to use CUDA or not.
-        loss_name (str): The loss name must set in training. Default None.
-        main_program (Program): The program that need to run, if not provided,
-            then default_main_program will be used. Default None.
-        share_vars_from(ParallelExecutor): If provide, it will share variables
-            from the specified ParallelExecutor. Default None.
-        exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
-            the program in ParallelExecutor, for example how many threads are used to
-            execute the program, how many iterations to clean up the temp variables
-            which is generated during execution. For more information, please refer
-            to fluid.ExecutionStrategy. Default None.
-        build_strategy(BuildStrategy): build_strategy is used to control how to
-            build the SSA Graph in ParallelExecutor by setting the property,
-            for example reduce_strategy, gradient_scale_strategy. For more information,
-            please refer to fluid.BuildStrategy. Default None.
-        num_trainers(int): If greater than 1, NCCL will be initialized with
-            multiple rank of nodes, each node should have same number of GPUs.
-            Distributed training will be enabled then. Default 1.
-        trainer_id(int): Must use together with num_trainers. trainer_id is the
-            "rank" of current node starts from 0. Default 0.
-        scope(Scope): scope to run with, default use fluid.global_scope().
-
-    Returns:
-        ParallelExecutor: The initialized ParallelExecutor object.
-
-    Raises:
-        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
-
-    """
-
-    def __init__(self,
-                 use_cuda,
-                 loss_name=None,
-                 main_program=None,
-                 share_vars_from=None,
-                 exec_strategy=None,
-                 build_strategy=None,
-                 num_trainers=1,
-                 trainer_id=0,
-                 scope=None):
-        if build_strategy is None:
-            build_strategy = BuildStrategy()
-
-        # TODO(paddle-dev): trainer_id and num_trainers should be removed from parameter list.
-        if num_trainers != 1 and build_strategy.num_trainers != num_trainers:
-            sys.stderr.write(
-                'The value of build_strategy.num_trainers[%d] is overwritten '
-                'by the passed num_trainers[%d].\n' %
-                (build_strategy.num_trainers, num_trainers))
-            build_strategy.num_trainers = num_trainers
-        if trainer_id != 0 and build_strategy.trainer_id != trainer_id:
-            sys.stderr.write(
-                'The value of build_strategy.trainer_id[%d] is overwritten '
-                'by the passed trainer_id[%d].\n' %
-                (build_strategy.trainer_id, trainer_id))
-            build_strategy.trainer_id = trainer_id
-
-        self._places = framework.cuda_places(
-        ) if use_cuda else framework.cpu_places()
-        self._scope = scope if scope is not None else executor.global_scope()
-
-        if main_program is not None and main_program._enable_dgc:
-            assert build_strategy.num_trainers > 1, "dgc is not useful when num_trainers <= 1"
-            assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce, "dgc \
-                only used for allreduce"
-
-            assert build_strategy.num_trainers * len(
-                self._places) > 1, "dgc is not useful for single card training"
-            assert use_cuda, "dgc only used under cuda"
-
-        main_program = main_program if main_program is not None \
-            else framework.default_main_program()
-
-        self._compiled_program = compiler.CompiledProgram(main_program)
-        if share_vars_from:
-            assert isinstance(
-                share_vars_from, ParallelExecutor
-            ), "The share_vars_from should be ParallelExecutor."
-
-        self._compiled_program.with_data_parallel(
-            loss_name=loss_name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy,
-            share_vars_from=share_vars_from._compiled_program
-            if share_vars_from else None)
-
-        self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
-        self._exe = executor.Executor(self._place)
-
-    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
-        """
-        Run a parallel executor with fetch_list.
-
-        The feed parameter can be a dict or a list. If feed is a dict, the
-        feed data will be split into multiple devices. If feed is a list, we
-        assume the data has been split into multiple devices, the each
-        element in the list will be copied to each device directly.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              import numpy
-              import os
-
-              use_cuda = True
-              place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-              # NOTE: If you use CPU to run the program, you need
-              # to specify the CPU_NUM, otherwise, fluid will use
-              # all the number of the logic core as the CPU_NUM,
-              # in that case, the batch size of the input should be
-              # greater than CPU_NUM, if not, the process will be
-              # failed by an exception.
-              if not use_cuda:
-                  os.environ['CPU_NUM'] = str(2)
-
-              exe = fluid.Executor(place)
-
-              train_program = fluid.Program()
-              startup_program = fluid.Program()
-              with fluid.program_guard(train_program, startup_program):
-                  data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-                  hidden = fluid.layers.fc(input=data, size=10)
-                  loss = fluid.layers.mean(hidden)
-                  fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
-
-              exe.run(startup_program)
-
-              train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                                 main_program=train_program,
-                                                 loss_name=loss.name)
-
-              # If the feed is a dict:
-              # the image will be splitted into devices. If there is two devices
-              # each device will process an image with shape (5, 1)
-              x = numpy.random.random(size=(10, 1)).astype('float32')
-              loss_data, = train_exe.run(feed={"X": x},
-                                         fetch_list=[loss.name])
-
-              # If the feed is a list:
-              # each device will process each element in the list.
-              # the 1st device will process an image with shape (10, 1)
-              # the 2nd device will process an image with shape (9, 1)
-              #
-              # you can use exe.device_count to get the device number.
-              x2 = numpy.random.random(size=(9, 1)).astype('float32')
-              loss_data, = train_exe.run(feed=[{"X": x}, {"X": x2}],
-                                         fetch_list=[loss.name])
-
-        Args:
-            fetch_list(list): The fetched variable names
-            feed(list|dict|None): The feed variables. If the feed is a dict,
-                tensors in that dict will be split into each devices. If
-                the feed is a list, each element of the list will be copied
-                to each device. Default None.
-            feed_dict: Alias for feed parameter, for backward compatibility.
-                This parameter has been deprecated. Default None.
-            return_numpy(bool): Whether converts the fetched tensor to numpy.
-                Default: True.
-
-        Returns:
-            List: The fetched result list.
-
-        Raises:
-            ValueError: If the feed is a list, but its length is not equal the
-                length of active places, or its element's is not dict.
-
-        NOTES:
-            1. If the feed's type is dict, the number of data that feeds to
-               ParallelExecutor must be bigger than active places. Otherwise,
-               it will throw exception from C++ side. Special attention should be
-               paid to check whether the last batch of the dataset is bigger
-               than active places.
-            2. If active places are more than one, the fetch results for each
-               variable is a list, and each element of this list is the variable of
-               respective active place.
-
-        Examples:
-            .. code-block:: python
-
-                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                            loss_name=avg_cost.name,
-                                            main_program=fluid.default_main_program())
-                loss = pe.run(feed=feeder.feed(cur_batch),
-                              fetch_list=[avg_cost.name]))
-        """
-        return self._exe.run(program=self._compiled_program,
-                             scope=self._scope,
-                             feed=feed,
-                             fetch_list=fetch_list,
-                             return_numpy=return_numpy)
-
-    @property
-    def device_count(self):
-        return len(self._places)
-
-    def drop_local_exe_scopes(self):
-        """
-        Drop the local execution scope immediately.
-
-        During the execution of the Program, the generate intermediate
-        results are placed in local execution scope, in some model the
-        creation and deletion of those intermediate results are time-consuming.
-        To resolve that problem, ParallelExecutor provides an option in
-        ExecutionStrategy, i.g. num_iteration_per_drop_scope, this option
-        indicates how many iterations to run before dropping the local execution
-        scope. But in some situation, each iteration generates different
-        intermediate results, it will lead to the result that the memory which
-        is needed by local execution scope gradually increase. And if you want
-        to run another program at this time, there may be insufficient storage,
-        At this point you should drop the local execution scope of other Programs.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              import numpy
-              import os
-
-              use_cuda = True
-              # NOTE: If you use CPU to run the program, you need
-              # to specify the CPU_NUM, otherwise, fluid will use
-              # all the number of the logic core as the CPU_NUM,
-              # in that case, the batch size of the input should be
-              # greater than CPU_NUM, if not, the process will be
-              # failed by an exception.
-              if not use_cuda:
-                  os.environ['CPU_NUM'] = str(2)
-
-              train_program = fluid.Program()
-              startup_program = fluid.Program()
-              with fluid.program_guard(train_program, startup_program):
-                  data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-                  hidden = fluid.layers.fc(input=data, size=10)
-                  loss = fluid.layers.mean(hidden)
-
-              place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-              exe = fluid.Executor(place)
-              exe.run(startup_program)
-
-              parallel_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                                 main_program=train_program,
-                                                 loss_name=loss.name)
-
-              x = numpy.random.random(size=(10, 1)).astype('float32')
-              loss_data, = parallel_exe.run(feed={"X": x},
-                                         fetch_list=[loss.name])
-
-              parallel_exe.drop_local_exe_scopes()
-        """
-        assert isinstance(
-            self._compiled_program._executor,
-            core.ParallelExecutor), "The Executor should be ParallelExecutor."
-        self._compiled_program._executor.drop_local_exe_scopes()
-
-    # This API is used to check whether DropLocalExeScopes can work.
-    def _need_create_local_exe_scopes(self):
-        assert isinstance(
-            self._compiled_program._executor,
-            core.ParallelExecutor), "The Executor should be ParallelExecutor."
-        return self._compiled_program._executor._need_create_local_exe_scopes()
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
deleted file mode 100644
index 028aada68cd8cee32fd144739d766f359d84c22b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/param_attr.py
+++ /dev/null
@@ -1,238 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import six
-
-from .initializer import Initializer, Xavier, Constant
-from .regularizer import WeightDecayRegularizer
-
-__all__ = [
-    'ParamAttr',
-    'WeightNormParamAttr',
-]
-
-
-class ParamAttr(object):
-    """
-    Parameter attributes object. To fine-tuning network training process, user
-    can set parameter's attributes to control training details. Such as learning rate,
-    regularization, trainable, do_model_average and the method to initialize param.
-
-
-    Args:
-        name(str): The parameter's name. Default None.
-        initializer(Initializer): The method to initial this parameter. Default None.
-        learning_rate(float): The parameter's learning rate. The learning rate when
-            optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
-            Default 1.0.
-        regularizer(WeightDecayRegularizer): Regularization factor. Default None.
-        trainable(bool): Whether this parameter is trainable. Default True.
-        gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
-            gradient. Default None.
-        do_model_average(bool): Whether this parameter should do model average 
-            when model average is enabled. Default True.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            w_param_attrs = fluid.ParamAttr(name="fc_weight",
-                                            learning_rate=0.5,
-                                            regularizer=fluid.regularizer.L2Decay(1.0),
-                                            trainable=True)
-            x = fluid.layers.data(name='X', shape=[1], dtype='float32')
-            y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs)
-    """
-
-    def __init__(self,
-                 name=None,
-                 initializer=None,
-                 learning_rate=1.0,
-                 regularizer=None,
-                 trainable=True,
-                 gradient_clip=None,
-                 do_model_average=True):
-        self.name = name
-        self.initializer = initializer
-        self.learning_rate = learning_rate
-        self.regularizer = regularizer
-        self.trainable = trainable
-        self.gradient_clip = gradient_clip
-        self.do_model_average = do_model_average
-
-    def _set_default_initializer(self, initializer):
-        """
-        Set the default initializer, the initializer should be Constant,
-        Uniform, Normal, Xavier, MSRA.
-
-        Args:
-            initializer(Initializer): the initializer to set.
-
-        Returns:
-            None
-        """
-        if initializer is None:
-            if self.initializer is None:
-                raise ValueError("ParamAttr.initializer is not set")
-            return
-
-        if self.initializer is not None:
-            return
-
-        self.initializer = initializer
-
-    def _set_default_param_initializer(self):
-        """
-        Set the default initializer for the parameter with Xavier.
-
-        Args:
-            None.
-
-        Returns:
-            None.
-        """
-        self._set_default_initializer(Xavier())
-
-    def _set_default_bias_initializer(self):
-        """
-        Set the default initializer for the bias with Constant(0.0).
-
-        Args:
-            None.
-
-        Returns:
-            None.
-        """
-        self._set_default_initializer(Constant(0.0))
-
-    @staticmethod
-    def _to_attr(arg):
-        """
-        Create ParamAttr[s].
-
-        Args:
-            arg: Arguments to initialize ParamAttr[s]. arg's type can be
-                str, Initializer, float, WeightDecayRegularizer, BaseGradientClipAttr,
-                bool, ParamAttr, or a list of above type.
-
-        Returns:
-            ParamAttr[s]: ParamAttr[s] initialized with arg.
-
-        Raises:
-            arg can not initialize a ParamAttr.
-        """
-        if arg is None:
-            return ParamAttr()
-        elif isinstance(arg, list) or isinstance(arg, tuple):
-            return [ParamAttr._to_attr(a) for a in arg]
-        elif isinstance(arg, ParamAttr):
-            return arg
-        elif isinstance(arg, six.string_types):
-            return ParamAttr(name=arg)
-        elif isinstance(arg, Initializer):
-            return ParamAttr(initializer=arg)
-        elif isinstance(arg, WeightDecayRegularizer):
-            return ParamAttr(regularizer=arg)
-        elif isinstance(arg, bool):
-            return ParamAttr._to_attr(None) if arg else False
-        else:
-            raise TypeError("{0} cast to ParamAttr".format(type(arg)))
-
-    def _to_kwargs(self, with_initializer=False):
-        """
-        Returns the attributes of this parameter.
-
-        Args:
-            with_initializer(bool): Whether to add initializer attr.
-
-        Returns:
-            Parameter attributes(map): The attributes of this parameter.
-        """
-        kwargs = {
-            'name': self.name,
-            'optimize_attr': {
-                'learning_rate': self.learning_rate
-            },
-            'regularizer': self.regularizer,
-            'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip,
-            'do_model_average': self.do_model_average
-        }
-        if with_initializer:
-            kwargs['initializer'] = self.initializer
-        return kwargs
-
-
-class WeightNormParamAttr(ParamAttr):
-    """
-    Used for weight Norm. Weight Norm is a reparameterization of the weight vectors
-    in a neural network that decouples the magnitude of those weight vectors from
-    their direction. Weight Norm has been implemented as discussed in this
-    paper: `Weight Normalization: A Simple Reparameterization to Accelerate
-    Training of Deep Neural Networks
-    <https://arxiv.org/pdf/1602.07868.pdf>`_.
-
-    Args:
-        dim(int): Dimension over which to compute the norm. Default None.
-        name(str): The parameter's name. Default None.
-        initializer(Initializer): The method to initial this parameter. Default None.
-        learning_rate(float): The parameter's learning rate. The learning rate when
-            optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
-            Default 1.0.
-        regularizer(WeightDecayRegularizer): Regularization factor. Default None.
-        trainable(bool): Whether this parameter is trainable. Default True.
-        gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
-            gradient. Default None.
-        do_model_average(bool): Whether this parameter should do model average.
-            Default False.
-
-    Examples:
-        .. code-block:: python
-            
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32")
-            fc = fluid.layers.fc(input=data,
-                                 size=1000,
-                                 param_attr=fluid.WeightNormParamAttr(
-                                      dim=None,
-                                      name='weight_norm_param'))
-
-    """
-    # List to record the parameters reparameterized by weight normalization.
-    # If these parameters are treated as Variable rather than Parameter,
-    # it can be used to discriminate these parameters and help to serialize
-    # these paramters for inference.
-    params_with_weight_norm = []
-
-    def __init__(self,
-                 dim=None,
-                 name=None,
-                 initializer=None,
-                 learning_rate=1.0,
-                 regularizer=None,
-                 trainable=True,
-                 gradient_clip=None,
-                 do_model_average=False):
-        super(WeightNormParamAttr, self).__init__(
-            name=name,
-            initializer=initializer,
-            learning_rate=learning_rate,
-            regularizer=regularizer,
-            trainable=trainable,
-            gradient_clip=gradient_clip,
-            do_model_average=do_model_average)
-        self.dim = dim
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
deleted file mode 100644
index b0e168929b46a1dd1410d126e093883d79b99895..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/profiler.py
+++ /dev/null
@@ -1,283 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import core
-from .wrapped_decorator import signature_safe_contextmanager
-import os
-import six
-
-__all__ = [
-    'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
-    'stop_profiler'
-]
-
-NVPROF_CONFIG = [
-    "gpustarttimestamp",
-    "gpuendtimestamp",
-    "gridsize3d",
-    "threadblocksize",
-    "streamid",
-    "enableonstart 0",
-    "conckerneltrace",
-]
-
-
-@signature_safe_contextmanager
-def cuda_profiler(output_file, output_mode=None, config=None):
-    """The CUDA profiler.
-    This fuctions is used to profile CUDA program by CUDA runtime application
-    programming interface. The profiling result will be written into
-    `output_file` with Key-Value pair format or Comma separated values format.
-    The user can set the output mode by `output_mode` argument and set the
-    counters/options for profiling by `config` argument. The default config
-    is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d',
-    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
-    Then users can use NVIDIA Visual Profiler
-    (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this
-    this output file to visualize results.
-
-    Args:
-        output_file (string) : The output file name, the result will be
-            written into this file.
-        output_mode (string) : The output mode has Key-Value pair format and
-            Comma separated values format. It should be 'kvp' or 'csv'.
-        config (list of string) : The profiler options and counters can refer
-            to "Compute Command Line Profiler User Guide".
-
-    Raises:
-        ValueError: If `output_mode` is not in ['kvp', 'csv'].
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.profiler as profiler
-            import numpy as np
-
-            epoc = 8
-            dshape = [4, 3, 28, 28]
-            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
-            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
-
-            place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-
-            output_file = 'cuda_profiler.txt'
-            with profiler.cuda_profiler(output_file, 'csv') as nvprof:
-                for i in range(epoc):
-                    input = np.random.random(dshape).astype('float32')
-                    exe.run(fluid.default_main_program(), feed={'data': input})
-            # then use  NVIDIA Visual Profiler (nvvp) to load this output file
-            # to visualize results.
-    """
-    if output_mode is None:
-        output_mode = 'csv'
-    if output_mode not in ['kvp', 'csv']:
-        raise ValueError("The output mode must be 'kvp' or 'csv'.")
-    config = NVPROF_CONFIG if config is None else config
-    config_file = 'nvprof_config_file'
-    with open(config_file, 'wb') as fp:
-        fp.writelines([six.b("%s\n" % item) for item in config])
-    core.nvprof_init(output_file, output_mode, config_file)
-    # Enables profiler collection by the active CUDA profiling tool.
-    core.nvprof_start()
-    yield
-    # Disables profiler collection.
-    core.nvprof_stop()
-    os.remove(config_file)
-
-
-def reset_profiler():
-    """
-    Clear the previous time record. This interface does not work for
-    `fluid.profiler.cuda_profiler`, it only works for
-    `fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`,
-    and `fluid.profiler.profiler`.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.profiler as profiler
-            with profiler.profiler('CPU', 'total', '/tmp/profile'):
-                for iter in range(10):
-                    if iter == 2:
-                        profiler.reset_profiler()
-                    # ...
-    """
-    core.reset_profiler()
-
-
-def start_profiler(state):
-    """
-    Enable the profiler. Uers can use `fluid.profiler.start_profiler` and
-    `fluid.profiler.stop_profiler` to insert the code, except the usage of
-    `fluid.profiler.profiler` interface.
-
-    Args:
-        state (string) : The profiling state, which should be 'CPU', 'GPU'
-            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
-            GPU as well. 'All' also generates timeline.
-
-    Raises:
-        ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.profiler as profiler
-
-            profiler.start_profiler('GPU')
-            for iter in range(10):
-                if iter == 2:
-                    profiler.reset_profiler()
-                # except each iteration
-            profiler.stop_profiler('total', '/tmp/profile')
-    """
-    if core.is_profiler_enabled():
-        return
-    if state not in ['CPU', 'GPU', "All"]:
-        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
-    if state == "GPU":
-        prof_state = core.ProfilerState.kCUDA
-    elif state == "CPU":
-        prof_state = core.ProfilerState.kCPU
-    else:
-        prof_state = core.ProfilerState.kAll
-    core.enable_profiler(prof_state)
-
-
-def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
-    """
-    Stop the profiler. Uers can use `fluid.profiler.start_profiler` and
-    `fluid.profiler.stop_profiler` to insert the code, except the usage of
-    `fluid.profiler.profiler` interface.
-
-    Args:
-        sorted_key (string) : If None, the profiling results will be printed
-            in the order of first end time of events. Otherwise, the profiling
-            results will be sorted by the this flag. This flag should be one
-            of 'calls', 'total', 'max', 'min' or 'ave'.
-            The `calls` means sorting by the number of calls.
-            The `total` means sorting by the total execution time.
-            The `max` means sorting by the maximum execution time.
-            The `min` means sorting by the minimum execution time.
-            The `ave` means sorting by the average execution time.
-        profile_path (string) : If state == 'All', it will write a profile
-            proto output file.
-
-    Raises:
-        ValueError: If `sorted_key` is not in
-            ['calls', 'total', 'max', 'min', 'ave'].
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.profiler as profiler
-
-            profiler.start_profiler('GPU')
-            for iter in range(10):
-                if iter == 2:
-                    profiler.reset_profiler()
-                # except each iteration
-            profiler.stop_profiler('total', '/tmp/profile')
-    """
-    if not core.is_profiler_enabled():
-        return
-    sorted_key = 'default' if sorted_key is None else sorted_key
-    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
-        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
-                         "'max', 'min' and 'ave'")
-    key_map = {
-        'default': core.EventSortingKey.kDefault,
-        'calls': core.EventSortingKey.kCalls,
-        'total': core.EventSortingKey.kTotal,
-        'max': core.EventSortingKey.kMax,
-        'min': core.EventSortingKey.kMin,
-        'ave': core.EventSortingKey.kAve,
-    }
-    # TODO(qingqing) : redirect C++ ostream to Python stream.
-    # with core.ostream_redirect(stdout=True, stderr=True):
-    core.disable_profiler(key_map[sorted_key], profile_path)
-
-
-@signature_safe_contextmanager
-def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
-    """The profiler interface.
-    Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By default, it records the CPU and GPU operator kernels,
-    if you want to profile other program, you can refer the profiling tutorial
-    to add more records in C++ code.
-
-    If the state == 'All', a profile proto file will be written to
-    `profile_path`. This file records timeline information during the execution.
-    Then users can visualize this file to see the timeline, please refer
-    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md
-
-    Args:
-        state (string) : The profiling state, which should be 'CPU' or 'GPU',
-            telling the profiler to use CPU timer or GPU timer for profiling.
-            Although users may have already specified the execution place
-            (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
-            would not inherit this place.
-        sorted_key (string) : If None, the profiling results will be printed
-            in the order of first end time of events. Otherwise, the profiling
-            results will be sorted by the this flag. This flag should be one
-            of 'calls', 'total', 'max', 'min' or 'ave'.
-            The `calls` means sorting by the number of calls.
-            The `total` means sorting by the total execution time.
-            The `max` means sorting by the maximum execution time.
-            The `min` means sorting by the minimum execution time.
-            The `ave` means sorting by the average execution time.
-        profile_path (string) : If state == 'All', it will write a profile
-            proto output file.
-
-    Raises:
-        ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
-            not in ['calls', 'total', 'max', 'min', 'ave'].
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.profiler as profiler
-            import numpy as np
-
-            epoc = 8
-            dshape = [4, 3, 28, 28]
-            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
-            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-
-            with profiler.profiler('CPU', 'total', '/tmp/profile') as prof:
-                for i in range(epoc):
-                    input = np.random.random(dshape).astype('float32')
-                    exe.run(fluid.default_main_program(), feed={'data': input})
-    """
-    start_profiler(state)
-    yield
-    stop_profiler(sorted_key, profile_path)
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
deleted file mode 100644
index f10a0ed5548cc26ba67f116d2fd887f452742856..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/reader.py
+++ /dev/null
@@ -1,1019 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import core, dygraph
-import sys
-import six
-import warnings
-import numpy as np
-import threading
-import paddle
-from .framework import Program, Variable, program_guard, default_main_program, default_startup_program, in_dygraph_mode, cpu_places
-from .executor import global_scope
-from .data_feeder import DataFeeder, BatchedTensorProvider, ListTensorProvider
-from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer
-from .unique_name import UniqueNameGenerator
-import logging
-from .dataset import DatasetBase, InMemoryDataset
-
-__all__ = ['PyReader', 'DataLoader']
-
-data_loader_unique_name_generator = UniqueNameGenerator()
-
-
-def _convert_places(places):
-    if not isinstance(places, (list, tuple)):
-        places = [places]
-
-    ret = []
-    for p in places:
-        if not isinstance(p, core.Place):
-            tmp = core.Place()
-            tmp.set_place(p)
-            p = tmp
-
-        ret.append(p)
-    return ret
-
-
-class DataLoaderBase(object):
-    def __init__(self):
-        self._places = None
-
-    def __call__(self):
-        return self
-
-    def next(self):
-        '''
-        Get the next item in the DataLoader object. This method    
-        should not be called by users directly. It is used for
-        implementing iterator protocol of Python 2.x inside
-        PaddlePaddle framework.
-        '''
-        return self.__next__()
-
-    def __iter__(self):
-        raise NotImplementedError()
-
-    def __next__(self):
-        raise NotImplementedError()
-
-
-class DataLoader(object):
-    @staticmethod
-    def from_generator(feed_list=None,
-                       capacity=None,
-                       use_double_buffer=True,
-                       iterable=True,
-                       return_list=False):
-        """
-        Create a DataLoader object for loading data from Python generator. 
-        Data would be prefetched using Python thread and be pushed
-        into a queue asynchronously.
-
-        The created DataLoader object provides 3 methods to set the data source
-        :code:`set_sample_generator` , :code:`set_sample_list_generator` and 
-        :code:`set_batch_generator` . Please see the following example codes
-        to know their usages.
-
-        If iterable = True, the created DataLoader object is a Python generator
-        object, which is iterable using for-range loop.
-
-        If iterable = False, the created DataLoader object provides 
-        :code:`start()` and :code:`reset()` method to control the data reading
-        process. This mode is designed to be compatible with the 
-        :code:`fluid.layers.py_reader` interface. Users can migrate the codes   
-        from :code:`fluid.layers.py_reader` to :code:`fluid.io.DataLoader` 
-        easily when using iterable=False. 
-
-        Args:  
-            feed_list (list(Variable)|tuple(Variable)): feed variable list.
-                The variables should be created by :code:`fluid.layers.data()`.
-            capacity (int): capacity of the queue maintained in DataLoader.
-                The unit is batch number. Set larger capacity if your reader 
-                is fast. 
-            use_double_buffer (bool): whether to use double_buffer_reader. 
-                If use_double_buffer=True, the DataLoader would prefetch next 
-                batch data asynchronously, so it would speed up data feeding 
-                and occupies a little more CPU or GPU memory, i.e., the memory
-                of one batch input data. 
-            iterable (bool): whether the created DataLoader is iterable. 
-            return_list (bool): whether the return value on each device is 
-                presented as a list. It is only valid when iterable=True. 
-                If return_list=False, the return value on each device would 
-                be a dict of str -> LoDTensor, where the key of the dict is 
-                the name of each feeded variables. If return_list=True, the 
-                return value on each device would be a list(LoDTensor). It is
-                recommended to use return_list=False in static graph mode and
-                use return_list=True in dygraph mode.   
-
-        Returns:
-            loader (DataLoader): the created DataLoader object.
-
-        Examples:
-            
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                import numpy as np
-
-                BATCH_NUM = 10 
-                BATCH_SIZE = 16
-                EPOCH_NUM = 4
-
-                CLASS_NUM = 10
-
-                ITERABLE = True # whether the created DataLoader object is iterable
-                USE_GPU = False # whether to use GPU
-
-                DATA_FORMAT = 'batch_generator' # data format of data source user provides 
-
-                def simple_net(image, label):
-                    fc_tmp = fluid.layers.fc(image, size=CLASS_NUM)
-                    cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label)
-                    loss = fluid.layers.reduce_mean(cross_entropy)
-                    sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                    sgd.minimize(loss)
-                    return loss
-
-                def get_random_images_and_labels(image_shape, label_shape):
-                    image = np.random.random(size=image_shape).astype('float32')
-                    label = np.random.random(size=label_shape).astype('int64')
-                    return image, label
-
-                # If the data generator yields one sample each time,
-                # use DataLoader.set_sample_generator to set the data source.
-                def sample_generator_creator(): 
-                    def __reader__():
-                        for _ in range(BATCH_NUM * BATCH_SIZE):
-                            image, label = get_random_images_and_labels([784], [1])
-                            yield image, label
-
-                    return __reader__
-
-                # If the data generator yield list of samples each time,
-                # use DataLoader.set_sample_list_generator to set the data source.
-                def sample_list_generator_creator():
-                    def __reader__():
-                        for _ in range(BATCH_NUM): 
-                            sample_list = []
-                            for _ in range(BATCH_SIZE):
-                                image, label = get_random_images_and_labels([784], [1])
-                                sample_list.append([image, label])
-
-                            yield sample_list
-
-                    return __reader__ 
-
-                # If the data generator yields a batch each time, 
-                # use DataLoader.set_batch_generator to set the data source.
-                def batch_generator_creator():
-                    def __reader__():
-                        for _ in range(BATCH_NUM):
-                            batch_image, batch_label = get_random_images_and_labels([BATCH_SIZE, 784], [BATCH_SIZE, 1]) 
-                            yield batch_image, batch_label
-
-                    return __reader__
-
-                # If DataLoader is iterable, use for loop to train the network 
-                def train_iterable(exe, prog, loss, loader):
-                    for _ in range(EPOCH_NUM):
-                        for data in loader():
-                            exe.run(prog, feed=data, fetch_list=[loss])
-
-                # If DataLoader is not iterable, use start() and reset() method to control the process 
-                def train_non_iterable(exe, prog, loss, loader):
-                    for _ in range(EPOCH_NUM):
-                        loader.start() # call DataLoader.start() before each epoch starts
-                        try:
-                            while True:
-                                exe.run(prog, fetch_list=[loss])
-                        except fluid.core.EOFException:
-                            loader.reset() # call DataLoader.reset() after catching EOFException 
-
-                def set_data_source(loader, places):
-                    if DATA_FORMAT == 'sample_generator':
-                        loader.set_sample_generator(sample_generator_creator(), batch_size=BATCH_SIZE, drop_last=True, places=places)
-                    elif DATA_FORMAT == 'sample_list_generator':
-                        loader.set_sample_list_generator(sample_list_generator_creator(), places=places)
-                    elif DATA_FORMAT == 'batch_generator':
-                        loader.set_batch_generator(batch_generator_creator(), places=places)
-                    else:
-                        raise ValueError('Unsupported data format')
-
-                image = fluid.layers.data(name='image', shape=[784], dtype='float32')
-                label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-                # Define DataLoader 
-                loader = fluid.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE)
-
-                # Define network
-                loss = simple_net(image, label)
-
-                # Set data source of DataLoader
-                #
-                # If DataLoader is iterable, places must be given and the number of places must be the same with device number.  
-                #  - If you are using GPU, call `fluid.cuda_places()` to get all GPU places. 
-                #  - If you are using CPU, call `fluid.cpu_places()` to get all CPU places. 
-                # 
-                # If DataLoader is not iterable, places can be None.
-                places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
-                set_data_source(loader, places)
-
-                exe = fluid.Executor(places[0])
-                exe.run(fluid.default_startup_program())
-
-                prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
-
-                if loader.iterable:
-                    train_iterable(exe, prog, loss, loader)
-                else:
-                    train_non_iterable(exe, prog, loss, loader)
-
-
-                '''
-                Users can use return_list = True in dygraph mode. 
-                '''
-                with fluid.dygraph.guard(places[0]):
-                    loader = fluid.io.DataLoader.from_generator(capacity=2, return_list=True)
-                    set_data_source(loader, places[0]) 
-                    for image, label in loader():
-                        relu = fluid.layers.relu(image)
-                        assert image.shape == [BATCH_SIZE, 784] 
-                        assert label.shape == [BATCH_SIZE, 1]
-                        assert relu.shape == [BATCH_SIZE, 784]
-        """
-        return GeneratorLoader(feed_list, capacity, use_double_buffer, iterable,
-                               return_list)
-
-    @staticmethod
-    def from_dataset(dataset, places, drop_last=True):
-        """
-        Create an iterable DataLoader object for loading data from Dataset.    
-        Dataset is only supported in Linux system currently.
-
-        Args:
-            dataset (InMemoryDataset|QueueDataset): the dataset object.
-            places (list(CUDAPlace)|list(CPUPlace)): places where the result 
-                data should be converted.   
-            drop_last (bool): whether to drop the last batch whose sample 
-                number is less than batch size. If drop_last = True, they
-                would be dropped. If drop_last = False, they would be kept. 
-
-        Returns:
-            loader (DataLoader): the created DataLoader object, which can be 
-                treated as a Python generator.   
-
-        Examples:
-
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                image = fluid.layers.data(name='image', shape=[784], dtype='float32')
-                label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-                dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-                dataset.set_batch_size(32)
-                dataset.set_filelist(['a.txt', 'b.txt', 'c.txt'])
-                dataset.set_use_var([image, label])
-                dataset.set_pipe_command('cat') 
-
-                loader = fluid.io.DataLoader.from_dataset(dataset, fluid.cpu_places())
-        """
-        return DatasetLoader(dataset, places, drop_last)
-
-
-class GeneratorLoader(DataLoaderBase):
-    def __init__(self,
-                 feed_list=None,
-                 capacity=None,
-                 use_double_buffer=True,
-                 iterable=True,
-                 return_list=False):
-        self._tensor_reader = None
-        self._places = None
-        self._thread = None
-        self._feed_list = feed_list
-        if not capacity:
-            raise ValueError("Please give value to capacity.")
-        # force to use iterable mode under dygraph mode
-        if in_dygraph_mode():
-            if not iterable:
-                warnings.warn(
-                    "Please NOTE: dygraph can support iterable mode only. Change to iterable mode."
-                )
-            self._iterable = True
-            if not return_list:
-                warnings.warn(
-                    "Please NOTE: dygraph can support return as list only. Change to return as list."
-                )
-            self._return_list = True
-        else:
-            self._iterable = iterable
-            self._return_list = return_list
-            if not self._feed_list:
-                raise Exception("Feed list must be given under static mode.")
-        self._use_double_buffer = use_double_buffer
-        self._capacity = capacity
-        if not self._iterable:
-            self._init_non_iterable()
-
-    def _wait_thread_ends(self):
-        # Get self._thread first to prevent data race, because __thread_main__ 
-        # would set self._thread be None at the end
-        thread = self._thread
-        if thread is not None and self._iterable:
-            self._queue.close()
-            thread.join()
-
-    def _init_iterable(self):
-        self._wait_thread_ends()
-        if in_dygraph_mode():
-            self._var_names = []
-        else:
-            self._var_names = [v.name for v in self._feed_list]
-        self._queue = core.init_lod_tensor_blocking_queue(core.Variable(),
-                                                          self._capacity)
-        self._reader = core.create_py_reader(
-            self.queue, self._var_names, self._places, self._use_double_buffer)
-
-    def _init_non_iterable(self):
-        lod_levels = []
-        dtypes = []
-        shape_concat = []
-        ranks = []
-        shapes = []
-
-        for feed_data in self._feed_list:
-            dtypes.append(feed_data.dtype)
-            shape_concat.extend(feed_data.shape)
-            ranks.append(len(feed_data.shape))
-            shapes.append(feed_data.shape)
-            lod_levels.append(feed_data.lod_level)
-
-        queue_name = data_loader_unique_name_generator(
-            'lod_tensor_blocking_queue')
-        reader_name = data_loader_unique_name_generator('create_py_reader')
-        double_buffer_name = data_loader_unique_name_generator('double_buffer')
-
-        var = global_scope().var(queue_name)
-        self._queue = core.init_lod_tensor_blocking_queue(var, self._capacity)
-
-        startup_blk = default_startup_program().current_block()
-        startup_var = startup_blk.create_var(name=reader_name)
-
-        startup_blk.append_op(
-            type='create_py_reader',
-            inputs={'blocking_queue': [queue_name]},
-            outputs={'Out': [startup_var]},
-            attrs={
-                'shape_concat': shape_concat,
-                'lod_levels': lod_levels,
-                'ranks': ranks
-            })
-
-        startup_var.desc.set_dtypes(dtypes)
-        startup_var.persistable = True
-
-        main_prog_var = _copy_reader_var_(
-            default_main_program().current_block(), startup_var)
-
-        main_prog_var.stop_gradient = True
-        main_prog_var.persistable = True
-
-        reader = monkey_patch_reader_methods(main_prog_var)
-        if self._use_double_buffer:
-            double_buffer_reader = double_buffer(
-                reader, name=double_buffer_name)
-            # we return a double buffer reader. However, the reset method comes from
-            # py_reader.
-            double_buffer_reader.reset = reader.reset
-            reader = double_buffer_reader
-
-        self._reader = reader
-
-        default_main_program().current_block().append_op(
-            type='read',
-            inputs={'Reader': [self._reader]},
-            outputs={'Out': self._feed_list})
-
-    @property
-    def queue(self):
-        return self._queue
-
-    @property
-    def iterable(self):
-        return self._iterable
-
-    def __iter__(self):
-        assert self.iterable, "DataLoader is not iterable"
-        assert self._tensor_reader is not None, \
-            "Data source of DataLoader has not set yet"
-
-        self._init_iterable()
-        self._start()
-        return self
-
-    def __next__(self):
-        try:
-            if not in_dygraph_mode():
-                if self._return_list:
-                    return self._reader.read_next_list()
-                else:
-                    return self._reader.read_next()
-            else:
-                ret = self._reader.read_next_list()[0]
-                return [dygraph.base.to_variable(np.array(v)) for v in ret]
-        except StopIteration:
-            self._queue.close()
-            self._reset()
-            six.reraise(*sys.exc_info())
-
-    def start(self):
-        if not in_dygraph_mode():
-            assert not self._iterable, "start() cannot be called when DataLoader is iterable"
-            self._start()
-
-    def reset(self):
-        if not in_dygraph_mode():
-            assert not self._iterable, "reset() cannot be called when DataLoader is iterable"
-            self._reset()
-
-    def _start(self):
-        def __thread_main__():
-            try:
-                for tensors in self._tensor_reader():
-                    array = core.LoDTensorArray()
-                    for item in tensors:
-                        if not isinstance(item, core.LoDTensor):
-                            tmp = core.LoDTensor()
-                            tmp.set(item, core.CPUPlace())
-                            item = tmp
-
-                        array.append(item)
-
-                    if not self._queue.push(array):
-                        break
-
-                self._queue.close()
-                self._thread = None
-            except Exception as ex:
-                self._queue.close()
-                self._thread = None
-                logging.warn('Your reader has raised an exception!')
-                six.reraise(*sys.exc_info())
-
-        self._thread = threading.Thread(target=__thread_main__)
-        self._thread.daemon = True
-        self._thread.start()
-
-    def _reset(self):
-        self._reader.reset()
-        thread = self._thread
-        if thread is not None:
-            thread.join()
-
-    def set_sample_generator(self,
-                             reader,
-                             batch_size,
-                             drop_last=True,
-                             places=None):
-        assert batch_size > 0, "batch_size must be larger than 0"
-        if not in_dygraph_mode():
-            has_lod = False
-            for f in self._feed_list:
-                if f.lod_level != 0:
-                    has_lod = True
-                    break
-
-            if has_lod:
-                self.set_sample_list_generator(
-                    paddle.batch(
-                        reader, batch_size=batch_size, drop_last=drop_last),
-                    places=places)
-            else:
-                reader = BatchedTensorProvider(
-                    feed_list=self._feed_list,
-                    place=core.CPUPlace(),
-                    batch_size=batch_size,
-                    generator=reader,
-                    drop_last=drop_last)
-                self.set_batch_generator(reader, places=places)
-        else:
-            self.set_sample_list_generator(
-                paddle.batch(
-                    reader, batch_size=batch_size, drop_last=drop_last),
-                places=places)
-        return self
-
-    def set_sample_list_generator(self, reader, places=None):
-        if not in_dygraph_mode():
-            with program_guard(Program(), Program()):
-                feeder = DataFeeder(
-                    feed_list=self._feed_list, place=core.CPUPlace())
-                paddle_reader = feeder.decorate_reader(
-                    reader, multi_devices=False)
-
-            def __tensor_reader_impl__():
-                for slots in paddle_reader():
-                    yield [slots[var.name] for var in self._feed_list]
-        else:
-            provider = ListTensorProvider(reader, places)
-
-            def __tensor_reader_impl__():
-                for slots in provider():
-                    yield slots[0]
-
-        self.set_batch_generator(__tensor_reader_impl__, places)
-        return self
-
-    def set_batch_generator(self, reader, places=None):
-        self._tensor_reader = reader
-        if self._iterable:
-            assert places is not None, "Places cannot be None when DataLoader is iterable"
-            self._places = _convert_places(places)
-            if in_dygraph_mode():
-                assert len(self._places
-                           ) == 1, "Number of places must be 1 in dygraph mode"
-        else:
-            if places is not None:
-                logging.info(
-                    'places would be ommited when DataLoader is not iterable')
-        return self
-
-
-class PyReader(DataLoaderBase):
-    """
-    Create a reader object for data feeding in Python. 
-    Data would be prefetched using Python thread and be pushed
-    into a queue asynchronously. Data in the queue would be extracted 
-    automatically when `Executor.run(...)` is called.
-
-    Args:  
-        feed_list (list(Variable)|tuple(Variable)): feed variable list.
-            The variables should be created by :code:`fluid.layers.data()`.
-        capacity (int): capacity of the queue maintained in PyReader.
-            The unit is batch number. Set larger capacity if your reader 
-            is fast. 
-        use_double_buffer (bool): whether to use double_buffer_reader. 
-            If use_double_buffer=True, PyReader would prefetch next 
-            batch data asynchronously, so it would speed up data feeding 
-            and occupies a little more CPU or GPU memory, i.e., the memory
-            of one batch input data. 
-        iterable (bool): whether the created PyReader is iterable. 
-        return_list (bool): whether the return value on each device is 
-            presented as a list. It is only valid when iterable=True. 
-            If return_list=False, the return value on each device would 
-            be a dict of str -> LoDTensor, where the key of the dict is 
-            the name of each feeded variables. If return_list=True, the 
-            return value on each device would be a list(LoDTensor). It is
-            recommended to use return_list=False in static graph mode and
-            use return_list=True in dygraph mode. 
-
-    Returns:
-        reader (Reader): the created reader object.
-
-    Examples:
-        1. If iterable = False, the created PyReader object is almost the
-           same as :code:`fluid.layers.py_reader()`. Operators would be 
-           inserted into the program. User should call :code:`start()` 
-           before each epoch and catch :code:`fluid.core.EOFException`
-           thrown by :code:`Executor.run()` when epoch ends. Once the 
-           exception is caught, user should call :code:`reset()` to reset 
-           the reader manually.
-
-        .. code-block:: python
-
-           import paddle
-           import paddle.fluid as fluid
-           import numpy as np
-
-           EPOCH_NUM = 3
-           ITER_NUM = 5
-           BATCH_SIZE = 3
-
-           def reader_creator_random_image_and_label(height, width):
-               def reader():
-                   for i in range(ITER_NUM):
-                       fake_image = np.random.uniform(low=0,
-                                                      high=255,
-                                                      size=[height, width])
-                       fake_label = np.ones([1])
-                       yield fake_image, fake_label
-               return reader
-
-           image = fluid.layers.data(name='image', shape=[784, 784], dtype='float32')
-           label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-           reader = fluid.io.PyReader(feed_list=[image, label],
-                                      capacity=4,
-                                      iterable=False)
-
-           user_defined_reader = reader_creator_random_image_and_label(784, 784)
-           reader.decorate_sample_list_generator(
-               paddle.batch(user_defined_reader, batch_size=BATCH_SIZE))
-           # definition of network is omitted
-           executor = fluid.Executor(fluid.CUDAPlace(0))
-           executor.run(fluid.default_startup_program())
-           for i in range(EPOCH_NUM):
-               reader.start()
-               while True:
-                   try:
-                       executor.run(feed=None)
-                   except fluid.core.EOFException:
-                       reader.reset()
-                       break
-
- 
-        2. If iterable=True, the created PyReader object is decoupled with
-           the program. No operator would be inserted into the program. 
-           In this case, the created reader is a Python generator, which 
-           is iterable. User should feed the data yielded from PyReader 
-           object into :code:`Executor.run(feed=...)`.  
-
-        .. code-block:: python
-
-           import paddle
-           import paddle.fluid as fluid
-           import numpy as np
-
-           EPOCH_NUM = 3
-           ITER_NUM = 5
-           BATCH_SIZE = 10
-
-           def reader_creator_random_image(height, width):
-               def reader():
-                   for i in range(ITER_NUM):
-                       yield np.random.uniform(low=0, high=255, size=[height, width]),
-               return reader
-
-           image = fluid.layers.data(name='image', shape=[784, 784], dtype='float32')
-           reader = fluid.io.PyReader(feed_list=[image], capacity=4, iterable=True, return_list=False)
-
-           user_defined_reader = reader_creator_random_image(784, 784)
-           reader.decorate_sample_list_generator(
-               paddle.batch(user_defined_reader, batch_size=BATCH_SIZE),
-               fluid.core.CUDAPlace(0))
-           # definition of network is omitted
-           executor = fluid.Executor(fluid.CUDAPlace(0))
-           executor.run(fluid.default_main_program())
-
-           for _ in range(EPOCH_NUM):
-               for data in reader():
-                   executor.run(feed=data)
-
-
-        3. If return_list=True, the return values would be presented as list instead of dict. 
-           This is usually used in dygraph mode.
-
-        .. code-block:: python
-
-           import paddle
-           import paddle.fluid as fluid
-           import numpy as np
-
-           ITER_NUM = 5
-           BATCH_SIZE = 10
-
-           def reader_creator_random_image(height, width):
-               def reader():
-                   for i in range(ITER_NUM):
-                       yield np.random.uniform(low=0, high=255, size=[height, width]), \
-                           np.random.random_integers(low=0, high=9, size=[1])
-               return reader
-
-           place = fluid.CPUPlace()
-           with fluid.dygraph.guard(place):
-               py_reader = fluid.io.PyReader(capacity=2, return_list=True)
-               user_defined_reader = reader_creator_random_image(784, 784)
-               py_reader.decorate_sample_list_generator(
-                   paddle.batch(user_defined_reader, batch_size=BATCH_SIZE),
-                   place)
-               for image, label in py_reader():
-                   relu = fluid.layers.relu(image)
-    """
-
-    def __init__(self,
-                 feed_list=None,
-                 capacity=None,
-                 use_double_buffer=True,
-                 iterable=True,
-                 return_list=False):
-        self._loader = DataLoader.from_generator(
-            feed_list, capacity, use_double_buffer, iterable, return_list)
-
-    @property
-    def queue(self):
-        return self._loader.queue
-
-    @property
-    def iterable(self):
-        return self._loader.iterable
-
-    def __iter__(self):
-        return self._loader.__iter__()
-
-    def __next__(self):
-        return self._loader.__next__()
-
-    def start(self):
-        '''
-        Start the data feeding thread. 
-        Can only call when the reader object is not iterable.  
-        
-	    Example:
-	        .. code-block:: python
-    
-                import paddle
-                import paddle.fluid as fluid
-                import numpy as np
-
-                BATCH_SIZE = 10
-
-                def generator():
-                    for i in range(5):
-                        yield np.random.uniform(low=0, high=255, size=[784, 784]),
-
-                image = fluid.layers.data(name='image', shape=[784, 784], dtype='float32')
-                reader = fluid.io.PyReader(feed_list=[image], capacity=4, iterable=False)
-                reader.decorate_sample_list_generator(
-                    paddle.batch(generator, batch_size=BATCH_SIZE))
-
-                executor = fluid.Executor(fluid.CUDAPlace(0))
-                executor.run(fluid.default_startup_program())
-                for i in range(3):
-                    reader.start()
-                    while True:
-                        try:
-                            executor.run(feed=None)
-                        except fluid.core.EOFException:
-                            reader.reset()
-                            break
-
-	    '''
-        self._loader.start()
-
-    def reset(self):
-        '''
-        Reset the reader object when :code:`fluid.core.EOFException` raises. 
-        Can only call when the reader object is not iterable.
-        
-        Example:
-            .. code-block:: python
-
-                import paddle
-                import paddle.fluid as fluid
-                import numpy as np
-
-                BATCH_SIZE = 10
-
-                def generator():
-                    for i in range(5):
-                        yield np.random.uniform(low=0, high=255, size=[784, 784]),
-
-                image = fluid.layers.data(name='image', shape=[784, 784], dtype='float32')
-                reader = fluid.io.PyReader(feed_list=[image], capacity=4, iterable=False)
-                reader.decorate_sample_list_generator(
-                    paddle.batch(generator, batch_size=BATCH_SIZE))
-
-                executor = fluid.Executor(fluid.CUDAPlace(0))
-                executor.run(fluid.default_startup_program())
-                for i in range(3):
-                    reader.start()
-                    while True:
-                        try:
-                            executor.run(feed=None)
-                        except fluid.core.EOFException:
-                            reader.reset()
-                            break        
-
-        '''
-        self._loader.reset()
-
-    def decorate_sample_generator(self,
-                                  sample_generator,
-                                  batch_size,
-                                  drop_last=True,
-                                  places=None):
-        '''
-        Set the data source of the PyReader object.
-        
-        The provided :code:`sample_generator` should be a Python generator,
-        which yields list(numpy.ndarray)-typed data of each sample.
-
-        :code:`places` must be set when the PyReader object is iterable.
-
-        If all inputs have no lods, this method is faster than 
-        :code:`decorate_sample_list_generator(paddle.batch(sample_generator, ...))` .
-
-        Args:
-            sample_generator (generator): Python generator that yields
-                list(numpy.ndarray)-typed sample data.
-            batch_size (int): batch size. Must be larger than 0.
-            drop_last (bool): Whether to drop the last batch when sample number
-                is less than batch_size. 
-            places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
-                be provided when PyReader is iterable.
-
-        Example:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                import numpy as np
-
-                EPOCH_NUM = 3
-                ITER_NUM = 15
-                BATCH_SIZE = 3
-
-                def random_image_and_label_generator(height, width):
-                    def generator():
-                        for i in range(ITER_NUM):
-                            fake_image = np.random.uniform(low=0,
-                                                           high=255,
-                                                           size=[height, width])
-                            fake_label = np.array([1])
-                            yield fake_image, fake_label
-                    return generator
-
-                image = fluid.layers.data(name='image', shape=[784, 784], dtype='float32')
-                label = fluid.layers.data(name='label', shape=[1], dtype='int32')
-                reader = fluid.io.PyReader(feed_list=[image, label], capacity=4, iterable=True)
-
-                user_defined_generator = random_image_and_label_generator(784, 784)
-                reader.decorate_sample_generator(user_defined_generator,
-                                                 batch_size=BATCH_SIZE,
-                                                 places=[fluid.CUDAPlace(0)])
-                # definition of network is omitted
-                executor = fluid.Executor(fluid.CUDAPlace(0))
-                executor.run(fluid.default_main_program())
-
-                for _ in range(EPOCH_NUM):
-                    for data in reader():
-                        executor.run(feed=data)
-    
-        '''
-        self._loader.set_sample_generator(sample_generator, batch_size,
-                                          drop_last, places)
-
-    def decorate_sample_list_generator(self, reader, places=None):
-        '''
-        Set the data source of the PyReader object. 
-
-        The provided :code:`reader` should be a Python generator,
-        which yields list(numpy.ndarray) typed batched data. 
-        
-        :code:`places` must be set when the PyReader object is iterable.
-
-        Args:
-            reader (generator): Python generator that yields 
-                list(numpy.ndarray)-typed batched data. 
-            places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
-                be provided when PyReader is iterable.
-        
-        Example:
-            .. code-block:: python
-
-                import paddle
-                import paddle.fluid as fluid
-                import numpy as np
-
-                EPOCH_NUM = 3
-                ITER_NUM = 15
-                BATCH_SIZE = 3
-
-                def random_image_and_label_generator(height, width):
-                    def generator():
-                        for i in range(ITER_NUM):
-                            fake_image = np.random.uniform(low=0,
-                                                           high=255,
-                                                           size=[height, width])
-                            fake_label = np.ones([1])
-                            yield fake_image, fake_label
-                    return generator
-
-                image = fluid.layers.data(name='image', shape=[784, 784], dtype='float32')
-                label = fluid.layers.data(name='label', shape=[1], dtype='int32')
-                reader = fluid.io.PyReader(feed_list=[image, label], capacity=4, iterable=True)
-
-                user_defined_generator = random_image_and_label_generator(784, 784)
-                reader.decorate_sample_list_generator(
-                    paddle.batch(user_defined_generator, batch_size=BATCH_SIZE),
-                    fluid.core.CUDAPlace(0))
-                # definition of network is omitted
-                executor = fluid.Executor(fluid.core.CUDAPlace(0))
-                executor.run(fluid.default_main_program())
-
-                for _ in range(EPOCH_NUM):
-                    for data in reader():
-                        executor.run(feed=data)
-                 
-        '''
-        self._loader.set_sample_list_generator(reader, places)
-
-    def decorate_batch_generator(self, reader, places=None):
-        '''
-        Set the data source of the PyReader object.
-
-        The provided :code:`reader` should be a Python generator,
-        which yields numpy.ndarray-typed or LoDTensor-typed batched data.
-
-        :code:`places` must be set when the PyReader object is iterable.
-
-        Args:
-            reader (generator): Python generator that yields LoDTensor-typed
-                batched data.
-            places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
-                be provided when PyReader is iterable.
-
-        Example:
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-                import numpy as np
-
-                EPOCH_NUM = 3
-                ITER_NUM = 15
-                BATCH_SIZE = 3
-
-                def random_image_and_label_generator(height, width):
-                    def generator():
-                        for i in range(ITER_NUM):
-                            batch_image = np.random.uniform(low=0,
-                                                            high=255,
-                                                            size=[BATCH_SIZE, height, width])
-                            batch_label = np.ones([BATCH_SIZE, 1])
-                            yield batch_image, batch_label
-                    return generator
-
-                image = fluid.layers.data(name='image', shape=[784, 784], dtype='float32')
-                label = fluid.layers.data(name='label', shape=[1], dtype='int32')
-                reader = fluid.io.PyReader(feed_list=[image, label], capacity=4, iterable=True)
-
-                user_defined_generator = random_image_and_label_generator(784, 784)
-                reader.decorate_batch_generator(user_defined_generator, fluid.CUDAPlace(0))
-                # definition of network is omitted
-                executor = fluid.Executor(fluid.CUDAPlace(0))
-                executor.run(fluid.default_main_program())
-
-                for _ in range(EPOCH_NUM):
-                    for data in reader():
-                        executor.run(feed=data)
-
-        '''
-        self._loader.set_batch_generator(reader, places)
-
-
-class DatasetLoader(DataLoaderBase):
-    def __init__(self, dataset, places, drop_last):
-        assert isinstance(dataset,
-                          DatasetBase), "dataset must be type of DatasetBase"
-        assert not in_dygraph_mode(
-        ), "DatasetLoader is not supported in dygraph mode yet"
-
-        thread_num = len(places)
-
-        assert len(dataset.filelist) >= thread_num, \
-            "Filelist number of dataset {} must be not less than place number {}".format(len(dataset.filelist), thread_num)
-
-        if dataset.thread_num != 0 and dataset.thread_num != thread_num:
-            logging.warn('thread_num {} which is set in Dataset is ignored'.
-                         format(dataset.thread_num))
-
-        dataset.set_thread(thread_num)
-
-        if isinstance(dataset,
-                      InMemoryDataset) and dataset.queue_num > thread_num:
-            logging.warn("queue_num {} which is set in Dataset is ignored".
-                         format(dataset.queue_num))
-            dataset.set_queue_num(thread_num)
-
-        self._dataset = dataset
-        use_slots = [
-            slot.name for slot in dataset.proto_desc.multi_slot_desc.slots
-            if slot.is_used
-        ]
-
-        self._iterable_dataset = core.IterableDatasetWrapper(
-            dataset.dataset, use_slots,
-            _convert_places(places), dataset.proto_desc.batch_size, drop_last)
-
-    def __iter__(self):
-        self._dataset._finish_to_run()
-        self._dataset._prepare_to_run()
-        self._iterable_dataset._start()
-        return self
-
-    def __next__(self):
-        return self._iterable_dataset._next()
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
deleted file mode 100644
index 822029a372b31fd86fb8b4568b2346aa98c003db..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/regularizer.py
+++ /dev/null
@@ -1,270 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from . import framework
-from . import core
-
-__all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer']
-
-
-def append_regularization_ops(parameters_and_grads, regularization=None):
-    """Create and add backward regularization Operators
-
-    Creates and adds backward regularization operators in the BlockDesc.
-    This will add gradients of the regularizer function to the gradients
-    of the parameters and return these modified gradients. This is the
-    same as implementing weight decay in optimizers for regularization.
-
-    Args:
-        parameters_and_grads: A list of (parameters, gradients) pairs
-                              that need to be regularized.
-        regularization: A global regularizer. If the parameter is not
-                        set. It will be applied with regularizer.
-
-    Returns:
-        list[(Variable, Variable)]: list of (parameters, gradients) \
-        pair with the regularized gradient
-
-    Raises:
-        Exception: Unknown regularization type
-    """
-    params_and_grads = []
-    for param, grad in parameters_and_grads:
-        # If no gradient then we don't need to do anything
-        if grad is None:
-            params_and_grads.append((param, grad))
-            continue
-        with param.block.program._optimized_guard(
-            [param, grad]), framework.name_scope('regularization'):
-            regularization_term = None
-            if param.regularizer is not None:
-                # Add variable for regularization term in grad block
-                regularization_term = param.regularizer(param, grad, grad.block)
-            elif regularization is not None:
-                regularization_term = regularization(param, grad, grad.block)
-
-            # If no regularization specified, then we don't need to do anything
-            if regularization_term is None:
-                params_and_grads.append((param, grad))
-                continue
-
-            new_grad = grad
-            if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
-                # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
-                # the grad's type and name will be changed. But the gradient's name
-                # is used in ParallelExecutor Reduce mode, so I add a flag for
-                # the new_grad here.
-                new_grad = grad.block.create_var(
-                    name=grad.name + core.kNewGradSuffix(),
-                    dtype=param.dtype,
-                    shape=param.shape,
-                    lod_level=param.lod_level,
-                    type=core.VarDesc.VarType.LOD_TENSOR)
-
-            grad.block.append_op(
-                type='sum',
-                inputs={"X": [grad, regularization_term]},
-                outputs={"Out": new_grad})
-
-            params_and_grads.append((param, new_grad))
-
-    return params_and_grads
-
-
-class WeightDecayRegularizer(object):
-    """Base class for weight decay regularizers
-
-    Defines the common interface of weight-decay regularizers.
-    Weight-decay regularizers are added only during the backward
-    pass for faster regularization. They add operations to the network
-    that correspond to gradient of the regularization function.
-    Users should not use this class directly, but need to use one
-    of its implementations
-    """
-
-    def __init__(self):
-        pass
-
-    def __call__(self, param, grad, block):
-        """Add corresponding weight decay operations to the network
-        """
-        raise NotImplementedError()
-
-    def __str__(self):
-        """Debug string
-        """
-        raise NotImplementedError()
-
-
-class L2DecayRegularizer(WeightDecayRegularizer):
-    """Implements the L2 Weight Decay Regularization
-
-    Small values of L2 can help prevent over fitting the training data.
-
-    .. math::
-
-        L2WeightDecay = reg\_coeff * parameter
-
-    Args:
-        regularization_coeff(float): regularization coeff
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            main_prog = fluid.Program()
-            startup_prog = fluid.Program()
-            with fluid.program_guard(main_prog, startup_prog):
-                data = fluid.layers.data(name='image', shape=[3, 28, 28], dtype='float32')
-                label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-                hidden = fluid.layers.fc(input=data, size=128, act='relu')
-                prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
-                loss = fluid.layers.cross_entropy(input=prediction, label=label)
-                avg_loss = fluid.layers.mean(loss)
-            optimizer = fluid.optimizer.Adagrad(
-                learning_rate=1e-4,
-                regularization=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=0.1))
-            optimizer.minimize(avg_loss)
-    """
-
-    def __init__(self, regularization_coeff=0.0):
-        assert regularization_coeff is not None
-        super(L2DecayRegularizer, self).__init__()
-        self._regularization_coeff = regularization_coeff
-
-    def __call__(self, param, grad, block):
-        """Add L2 weight decay ops to network
-
-        Adds L2 weight decay ops.
-        L2WeightDecay = reg_coeff * parameter
-
-        Args:
-            param: parameter variable for which regularization is applied
-            block: block in which variable is to be created
-
-        Returns:
-            new variable for weight decay
-        """
-        assert isinstance(param, framework.Parameter)
-        assert isinstance(block, framework.Block)
-
-        if framework.in_dygraph_mode():
-            decay = block.create_var(dtype=param.dtype, shape=param.shape)
-        else:
-            decay = block.create_var(
-                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level)
-
-        # Append Op to calculate decay
-        block.append_op(
-            type='scale',
-            inputs={"X": param},
-            outputs={"Out": decay},
-            attrs={"scale": self._regularization_coeff})
-
-        return decay
-
-    def __str__(self):
-        return "L2Decay, regularization_coeff=%f" % self._regularization_coeff
-
-
-class L1DecayRegularizer(WeightDecayRegularizer):
-    """Implements the L1 Weight Decay Regularization
-
-    L1 regularization encourages sparsity.
-
-    .. math::
-
-        L1WeightDecay = reg\_coeff * sign(parameter)
-
-    Args:
-        regularization_coeff(float): regularization coeff
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            main_prog = fluid.Program()
-            startup_prog = fluid.Program()
-            with fluid.program_guard(main_prog, startup_prog):
-                data = fluid.layers.data(name='image', shape=[3, 28, 28], dtype='float32')
-                label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-                hidden = fluid.layers.fc(input=data, size=128, act='relu')
-                prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
-                loss = fluid.layers.cross_entropy(input=prediction, label=label)
-                avg_loss = fluid.layers.mean(loss)
-            optimizer = fluid.optimizer.Adagrad(
-                learning_rate=1e-4,
-                regularization=fluid.regularizer.L1DecayRegularizer(
-                    regularization_coeff=0.1))
-            optimizer.minimize(avg_loss)
-    """
-
-    def __init__(self, regularization_coeff=0.0):
-        assert regularization_coeff is not None
-        super(L1DecayRegularizer, self).__init__()
-        self._regularization_coeff = regularization_coeff
-
-    def __call__(self, param, grad, block):
-        """Add L1 weight decay ops to network
-
-        Adds L1 weight decay ops.
-        L1WeightDecay = reg_coeff * sign(parameter)
-
-        Args:
-            param: parameter variable for which regularization is applied
-            block: block in which variable is to be created
-
-        Returns:
-            new variable for weight decay
-        """
-        assert isinstance(param, framework.Parameter)
-        assert isinstance(block, framework.Block)
-
-        if framework.in_dygraph_mode():
-            decay = block.create_var(dtype=param.dtype, shape=param.shape)
-        else:
-            decay = block.create_var(
-                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level)
-
-        # Append sign op
-        block.append_op(
-            type='sign', inputs={"X": param}, outputs={"Out": decay})
-
-        # Append scale op to the output of sign op
-        block.append_op(
-            type='scale',
-            inputs={"X": decay},
-            outputs={"Out": decay},
-            attrs={"scale": self._regularization_coeff})
-
-        return decay
-
-    def __str__(self):
-        return "L1Decay, regularization_coeff=%f" % self._regularization_coeff
-
-
-# We short the class name, since users will use the regulaizer with the package
-# name. The sample code:
-#
-# import paddle.fluid as fluid
-#
-# hidden = fluid.layers.fc(...,
-#                          param_attr=fluid.regularizer.Xavier())
-#
-# It is no need to add a `Regularizer` as the class suffix
-L1Decay = L1DecayRegularizer
-L2Decay = L2DecayRegularizer
diff --git a/python/paddle/fluid/sampcd_processor.py b/python/paddle/fluid/sampcd_processor.py
deleted file mode 100644
index 6597b9378236f1b56ba7df32b0d7ba360fdb6a1b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/sampcd_processor.py
+++ /dev/null
@@ -1,1006 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import subprocess
-
-
-def find_all(srcstr, substr):
-    '''
-    to find all desired substring in the source string
-     and return their starting indices as a list
-
-    Args:
-        srcstr(str): the parent string
-        substr(str): substr
-    
-    Returns:
-        list: a list of the indices of the substrings 
-              found
-    '''
-
-    indices = []
-
-    gotone = srcstr.find(substr)
-
-    while (gotone != -1):
-
-        indices.append(gotone)
-
-        gotone = srcstr.find(substr, gotone + 1)
-
-    return indices
-
-
-def check_indent(cdline):
-    '''
-    to check the indent of a given code line
-    
-    to get the number of starting blank chars,
-    e.t. blankspaces and \t
-    
-    \t will be interpreted as 4 single blankspaces, 
-    e.t. '\t'='    '
-    
-    Args:
-        cdline(str) : a single line of code from the source file
-
-    Returns:
-        int : the indent of the number of interpreted 
-             blankspaces
-    '''
-
-    indent = 0
-    for c in cdline:
-        if c == '\t':
-            indent += 4
-        elif c == ' ':
-            indent += 1
-        if c != ' ' and c != '\t':
-            break
-
-    return indent
-
-
-#srccom: raw comments in the source,including ''' and original indent
-
-
-def sampcd_extract_and_run(srccom,
-                           name,
-                           logf,
-                           htype="def",
-                           hname="",
-                           show_details=False):
-    '''
-    Extract and run sample codes from source comment and
-    the result will be returned.
-
-    As an ultimate result, this function returns a list of 
-    status codes for each sample code (in top-down order)
-    found in srccom.
-
-    status code deciphering:
-
-        3:error sample code
-        2:have sample code but format is wrong
-        1:no sample code
-        0:successful
-        -1:no comments found 
-        -2:in white list
-    
-    there may be several examples in a source comment
-    so status deserves a list to contain the states.
-    For instance, some API has three example codes, 
-    code 1 is successful, code 2 is error, code 3 is successful
-    so the list to return is [0,3,0]
-
-    Args:
-        srccom(str): the source comment of some API whose
-                     example codes will be extracted and run.
-        name(str): the name of the API.
-        logf(file): for logging the output in case they are
-                    flushed.
-        htype(str): the type of hint banners, def/class/method.
-        hname(str): the name of the hint  banners , e.t. def hname.
-        show_details(bool):  Set it to False to print wrong sample 
-                             codes only.
-    
-    Returns:
-        list: the status code of all the sample codes found in srccom.
-                  
-
-    
-
-    '''
-
-    def sampcd_header_print(name, sampcd, htype, hname, logf):
-        '''
-        print hint banner headers.
-        
-        Args:
-            name(str): the name of the API.
-            sampcd(str): sample code string
-            htype(str): the type of hint banners, def/class/method.
-            hname(str): the name of the hint  banners , e.t. def hname.
-            logf(file): for logging the output in case they are
-            flushed.
-        '''
-        print_header(logf, htype, hname)
-
-        print "Sample code " + str(y) + " extracted for " + name + "   :"
-        print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
-        print(sampcd)
-        print "----example code check----\n"
-        print "executing sample code ....."
-        print "execution result:"
-        logf.write("\nSample code extracted for " + name + "   :\n")
-        logf.write("\n" + sampcd + "\n")
-        logf.write("\n----example code check----\n")
-        logf.write("\nexecuting sample code .....\n")
-        logf.write("\nexecution result:\n")
-
-    sampcd_begins = find_all(srccom, " code-block:: python")
-
-    status = []
-
-    if (len(sampcd_begins) == 0):
-
-        print_header(logf, htype, hname)
-        '''
-        detect sample codes using >>> to format
-        and consider this situation as wrong
-        '''
-        if (srccom.find("Examples:") != -1):
-            print "----example code check----\n"
-            logf.write("\n----example code check----\n")
-
-            if (srccom.find(">>>") != -1):
-                logf.write(
-                    "Deprecated sample code style:\n\n    Examples:\n\n        >>>codeline\n        >>>codeline\n\n\n "
-                    + "Please use '.. code-block:: python' to " +
-                    "format sample code.\n")
-                print(
-                    "Deprecated sample code style:\n\n    Examples:\n\n        >>>codeline\n        >>>codeline\n\n\n "
-                    + "Please use '.. code-block:: python' to " +
-                    "format sample code.\n")
-                status.append(2)
-                print "status code for all sample codes in " + name + " : " + str(
-                    status)
-
-        else:
-            print "No sample code!\n"
-            logf.write("\nNo sample code!\n")
-            status.append(1)
-            print "status code for all sample codes in " + name + " : " + str(
-                status)
-
-    for y in range(1, len(sampcd_begins) + 1):
-
-        sampcd_begin = sampcd_begins[y - 1]
-        sampcd = srccom[sampcd_begin + len(" code-block:: python") + 1:]
-
-        sampcd = sampcd.split("\n")
-
-        #remove starting empty lines
-        while sampcd[0].replace(' ', '').replace('\t', '') == '':
-            sampcd.pop(0)
-
-        #the mininmum indent, which is the indent of the first 
-        #non-empty line
-        min_indent = check_indent(sampcd[0])
-
-        sampcd_to_write = []
-
-        for i in range(0, len(sampcd)):
-
-            cdline = sampcd[i]
-
-            #handle empty lines or those only with spaces/tabs
-            if cdline.strip() == '':
-                continue
-
-            this_indent = check_indent(cdline)
-            if (this_indent < min_indent):
-                break
-
-            else:
-                cdline = cdline.replace('\t', '    ')
-                sampcd_to_write.append(cdline[min_indent:])
-
-        sampcd = '\n'.join(sampcd_to_write)
-        if sys.argv[1] == "cpu":
-            sampcd = '\nimport os\n' + 'os.environ["CUDA_VISIBLE_DEVICES"] = ""\n' + sampcd
-        if sys.argv[1] == "gpu":
-            sampcd = '\nimport os\n' + 'os.environ["CUDA_VISIBLE_DEVICES"] = "0"\n' + sampcd
-        sampcd += '\nprint ' + '\"' + name + ' sample code is executed successfully!\"\n'
-
-        if (len(sampcd_begins) > 1):
-            tfname = name + "_example_" + str(y) + ".py"
-        else:
-            tfname = name + "_example" + ".py"
-
-        tempf = open("samplecode_temp/" + tfname, 'w')
-        tempf.write(sampcd)
-        tempf.close()
-
-        cmd = ["python", "samplecode_temp/" + tfname]
-
-        subprc = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        output, error = subprc.communicate()
-
-        msg = "".join(output)
-        err = "".join(error)
-
-        if (subprc.returncode != 0):
-
-            print("\nSample code error found in " + name + ":\n")
-            sampcd_header_print(name, sampcd, htype, hname, logf)
-            print "subprocess return code: " + str(subprc.returncode)
-            print("Error Raised from Sample Code " + name + " :\n")
-            print err
-            print msg
-            logf.write("\nError Raised from Sample Code " + name + " :\n")
-            logf.write("\n" + msg + "\n")
-
-            status.append(3)
-            print "status code for all sample codes in " + name + str(status)
-        #It works!
-        else:
-            status.append(0)
-            if show_details:
-                sampcd_header_print(name, sampcd, htype, hname, logf)
-                print "subprocess return code: " + str(subprc.returncode)
-                print msg
-                logf.write("\n" + msg + "\n")
-                print "status code for all sample codes in " + name + " : " + str(
-                    status)
-
-        #msg is the returned code execution report
-
-        os.remove("samplecode_temp/" + tfname)
-
-    return status
-
-
-def single_defcom_extract(start_from, srcls, is_class_begin=False):
-    '''
-    to extract a def function/class/method comments body
-
-    Args: 
-        start_from(int): the line num of "def" header
-        srcls(list): the source file in lines
-        is_class_begin(bool): whether the start_from is a beginning a class. \
-        For a sole class body itself may end up with its method if it has no
-        docstring. But the body of \
-        a common def function can only be ended up by a none-indented def/class
-    
-    Returns:
-        string : the extracted comment body, inclusive of its quote marks.
-
-    '''
-    i = start_from
-
-    fcombody = ""  #def comment body
-
-    comstart = -1  # the starting line index of comment mark "'''" or """""" 
-    #if it is not -1, it indicates the loop is in the comment body
-    comstyle = 0  # comment mark style ,comments quoted with ''' is coded as 1
-    # comments quoted with """ is coded as 2
-
-    for x in range(i + 1, len(srcls)):
-
-        if is_class_begin:
-
-            if (srcls[x].replace('\t', '    ').startswith('    def ')):
-                break
-
-        if ((srcls[x].startswith('def ') or srcls[x].startswith('class '))):
-            break
-
-        else:
-
-            if (comstart == -1 and srcls[x].replace(" ", '').replace(
-                    "\t", '').replace("\n", '').startswith("\"\"\"")):
-                comstart = x
-                comstyle = 2
-                continue
-            if (comstyle == 2 and comstart != -1 and
-                    srcls[x].replace(" ", '').replace("\t", '').replace(
-                        "\n", '').startswith("\"\"\"")):
-                break
-
-            if (comstart == -1 and srcls[x].replace(" ", '').replace(
-                    "\t", '').replace("\n", '').startswith("\'\'\'")):
-                comstart = x
-                comstyle = 1
-                continue
-            if (comstyle == 1 and comstart != -1 and
-                    srcls[x].replace(" ", '').replace("\t", '').replace(
-                        "\n", '').startswith("\'\'\'")):
-                break
-            if (comstart !=
-                    -1):  #when the comments start, begin to add line to fcombody
-                fcombody += srcls[x]
-
-    return fcombody
-
-
-def print_header(logf, htype, name):
-
-    print htype + " name:" + name
-    print "-----------------------"
-
-    logf.write("\n\n" + htype + " name:" + name + "\n")
-    logf.write("-----------------------\n")
-
-
-def srcf_print(srcfile):
-
-    print "source file name:" + srcfile.name
-    print "---------------------------------------------------"
-
-    logf.write("source file name:" + srcfile.name + "\n")
-    logf.write("---------------------------------------------------\n\n")
-
-
-def show_alllist(alllist):
-
-    print "__all__:" + str(alllist) + "\n"
-    logf.write("__all__:" + str(alllist) + "\n\n")
-
-
-def srccoms_extract(srcfile, logf, status_all, wlist, show_details):
-    '''
-    Given a source file ``srcfile``, this function will
-    extract its API(doc comments) and run sample codes in the
-    API.
-
-    Args:
-        srcfile(file): the source file
-        logf(file): log recording file
-        status_all(dict): record all the sample code execution states.
-        wlist(list): white list
-        show_details(bool): if show_details is True, the whole process will be printed for you
-        to debug it locally
-
-    Returns:
-
-        string: the length of __all__ list in srcfile versus the exact number of
-                analysed API to make sure no API is missed in this srcfile and it
-                is useful for statistic practices.
-    '''
-
-    srcc = srcfile.read()
-
-    #2. get defs and classes header line number
-    #set file pointer to its beginning
-    srcfile.seek(0, 0)
-    srcls = srcfile.readlines()  #source lines
-
-    if show_details:
-        srcf_print(srcfile)
-
-    #1. fetch__all__ list
-    allidx = srcc.find("__all__")
-
-    if (allidx != -1):
-
-        alllist = []
-
-        #get all list for layers/ops.py
-        if (srcfile.name.find("ops.py") != -1):
-
-            for ai in range(0, len(srcls)):
-
-                if (srcls[ai].startswith("__all__")):
-
-                    lb = srcls[ai].find('[')
-                    rb = srcls[ai].find(']')
-                    if (lb == -1):
-                        continue
-                    allele = srcls[ai][lb + 1:rb].replace("'", '').replace(
-                        " ", '').replace("\"", '')
-
-                    alllist.append(allele)
-
-            if '' in alllist:
-                alllist.remove('')
-
-            if show_details:
-                show_alllist(alllist)
-
-        else:
-            alllist_b = allidx + len("__all__")
-
-            allstr = srcc[alllist_b + srcc[alllist_b:].find("[") + 1:alllist_b +
-                          srcc[alllist_b:].find("]")]
-            allstr = allstr.replace("\n", '').replace(" ", '').replace(
-                "'", '').replace("\"", '')
-            alllist = allstr.split(',')
-            if '' in alllist:
-                alllist.remove('')
-
-            if show_details:
-                show_alllist(alllist)
-
-        api_alllist_count = len(alllist)
-        api_count = 0
-
-        handled = []
-
-        #get src contents in layers/ops.py
-        if (srcfile.name.find("ops.py") != -1):
-
-            for i in range(0, len(srcls)):
-
-                if srcls[i].find("__doc__") != -1:
-
-                    opname = srcls[i][:srcls[i].find("__doc__") - 1]
-
-                    if opname in wlist:
-
-                        status_all[srcfile.name + '/' + opname] = [-2]
-
-                        if show_details:
-                            print_header(logf, "def", opname)
-                            print opname + " is in white list, thus skipped"
-                            logf.write("\n" + opname +
-                                       " is in white list, thus skipped\n")
-                            print status_all[srcfile.name + '/' + opname]
-                            logf.write("\n" + "execution status" + str(
-                                status_all[srcfile.name + '/' + opname]) + "\n")
-
-                        continue
-
-                    comstart = i
-                    for j in range(i, len(srcls)):
-                        if (srcls[j].find("\"\"\"") != -1):
-                            comstart = i
-
-                    opcom = ""
-                    for j in range(comstart + 1, len(srcls)):
-                        opcom += srcls[j]
-                        if (srcls[j].find("\"\"\"") != -1):
-                            break
-
-                    status = sampcd_extract_and_run(opcom, opname, logf, "def",
-                                                    opname, show_details)
-                    api_count += 1
-                    status_all[srcfile.name + '/' + opname] = status
-
-                    handled.append(
-                        opname)  #ops.py also has normal formatted functions
-                    #use list 'handled'  to mark the functions have been handled here
-                    #which will be ignored in the following step
-
-        for i in range(0, len(srcls)):
-
-            if srcls[i].startswith(
-                    'def '):  #a function header is detected in line i
-
-                f_header = srcls[i].replace(" ", '')
-                fn = f_header[len('def'):f_header.find('(')]  #function name
-
-                if fn in handled:
-                    continue
-
-                if fn in alllist:
-
-                    api_count += 1
-
-                    if fn in wlist or fn + "@" + srcfile.name in wlist:
-
-                        status_all[srcfile.name + '/' + fn] = [-2]
-
-                        if show_details:
-                            print_header(logf, "def", fn)
-                            print fn + " is in white list, thus skipped"
-                            logf.write("\n" + fn +
-                                       " is in white list, thus skipped\n")
-                            print status_all[srcfile.name + '/' + fn]
-                            logf.write("\n" + "execution status" + str(
-                                status_all[srcfile.name + '/' + fn]) + "\n")
-
-                        continue
-
-                    fcombody = single_defcom_extract(i, srcls)
-                    if (fcombody == ""):  #if no comment 
-                        print_header(logf, "def", fn)
-                        print "WARNING: no comments in function " + fn + ", but it deserves."
-                        logf.write("no comments in function " + fn + "\n\n")
-                        status_all[srcfile.name + '/' + fn] = [-1]
-                        print status_all[srcfile.name + '/' + fn]
-                        logf.write("\n" + "execution status" + str(status_all[
-                            srcfile.name + '/' + fn]) + "\n")
-
-                        continue
-                    else:
-                        status = sampcd_extract_and_run(fcombody, fn, logf,
-                                                        "def", fn, show_details)
-                        status_all[srcfile.name + '/' + fn] = status
-
-                else:
-                    if show_details:
-                        print_header(logf, "def", fn)
-                        print fn + " not in __all__ list"
-                        logf.write(fn + " not in __all__ list\n\n")
-
-            if srcls[i].startswith('class '):
-
-                c_header = srcls[i].replace(" ", '')
-                cn = c_header[len('class'):c_header.find('(')]  #class name
-
-                if cn in handled:
-                    continue
-
-                if cn in alllist:
-
-                    api_count += 1
-
-                    if cn in wlist or cn + "@" + srcfile.name in wlist:
-
-                        status_all[srcfile.name + '/' + cn] = [-2]
-
-                        if show_details:
-
-                            print cn + " is in white list, thus skipped"
-                            logf.write("\n" + cn +
-                                       " is in white list, thus skipped\n")
-
-                            print status_all[srcfile.name + '/' + cn]
-                            logf.write("\n" + "execution status" + str(
-                                status_all[srcfile.name + '/' + cn]) + "\n")
-
-                        continue
-
-                    #class comment
-                    classcom = single_defcom_extract(i, srcls, True)
-
-                    if (classcom != ""):
-
-                        status = sampcd_extract_and_run(
-                            classcom, cn, logf, "class", cn, show_details)
-                        status_all[srcfile.name + '/' + cn] = status
-
-                    else:
-                        print "WARNING: no comments in class itself " + cn + ", but it deserves.\n"
-                        logf.write("no comments in class itself " + cn +
-                                   "\n\n\n")
-                        status_all[srcfile.name + '/' + cn] = [-1]
-                        print status_all[srcfile.name + '/' + cn]
-                        logf.write("\n" + "execution status" + str(status_all[
-                            srcfile.name + '/' + cn]) + "\n")
-
-                    #handling methods in class bodies
-                    for x in range(
-                            i + 1,
-                            len(srcls)):  #from the next line of class header 
-
-                        if (srcls[x].startswith('def ') or
-                                srcls[x].startswith('class ')):
-                            break
-                        else:
-                            #member method def header
-                            srcls[x] = srcls[x].replace('\t', '    ')
-                            if (srcls[x].startswith(
-                                    '    def ')):  #detect a mehtod header..
-
-                                thisl = srcls[x]
-                                indent = len(thisl) - len(thisl.lstrip())
-                                mn = thisl[indent + len('def '):thisl.find(
-                                    '(')]  #method name
-
-                                name = cn + "." + mn  #full name
-
-                                if mn.startswith('_'):
-
-                                    if show_details:
-
-                                        print mn + " is hidden, not visible to users\n"
-                                        logf.write(
-                                            "\n" + mn +
-                                            " is hidden, not visible to users\n")
-
-                                    continue
-
-                                if name in wlist or name + "@" + srcfile.name in wlist:
-
-                                    status_all[srcfile.name + '/' + name] = [-2]
-
-                                    if show_details:
-
-                                        print name + " is in white list, thus skipped"
-                                        logf.write(
-                                            "\n" + name +
-                                            " is in white list, thus skipped\n")
-                                        print status_all[srcfile.name + '/' +
-                                                         name]
-                                        logf.write(
-                                            "\n" + "execution status" + str(
-                                                status_all[srcfile.name + '/' +
-                                                           name]) + "\n")
-
-                                    continue
-
-                                thismethod = []  #method body lines
-                                thismethod.append(thisl[indent:])
-
-                                #get all the lines of a single method body 
-                                #into thismethod(list)
-                                #and send it to single_defcom_extract
-                                for y in range(x + 1, len(srcls)):
-                                    srcls[y] = srcls[y].replace('\t', '    ')
-                                    if (srcls[y].startswith('def ') or
-                                            srcls[y].startswith('class ')):
-                                        #end of method
-                                        break
-                                    elif (srcls[y].startswith('    def ')):
-                                        #end of method
-                                        break
-                                    else:
-                                        thismethod.append(srcls[y][indent:])
-
-                                thismtdcom = single_defcom_extract(0,
-                                                                   thismethod)
-
-                                if (thismtdcom != ""):
-                                    status = sampcd_extract_and_run(
-                                        thismtdcom, name, logf, "method", name,
-                                        show_details)
-                                    status_all[srcfile.name + '/' +
-                                               name] = status
-
-                                else:
-
-                                    if show_details:
-                                        print "no comments in method " + name + "\n"
-                                        logf.write("no comments in method " +
-                                                   name + "\n\n\n")
-                                        status_all[srcfile.name + '/' +
-                                                   name] = [-1]
-                                        print status_all[srcfile.name + '/' +
-                                                         name]
-                                        logf.write(
-                                            "\n" + "execution status" + str(
-                                                status_all[srcfile.name + '/' +
-                                                           name]) + "\n")
-
-                else:
-                    if show_details:
-                        print cn + " is not in __all__ list"
-                        logf.write(cn + " is not in __all__ list\n\n")
-
-    return [
-        srcfile.name + " all list length: " + str(api_alllist_count),
-        "analysed api count: " + str(api_count)
-    ]
-
-
-'''
-Important constant lists:
-
-    filenames : the modules pending for check .
-    wlist : a list of API that should not trigger the example check .
-            It is composed of wlist_temp + wlist_inneed + wlist_ignore.
-    show_details: a boolean value to indicate whether it should be run
-                  in debugging mode.
-    status_all: a status list containing all the execution status of all
-                APIs
-    srcfile: the source .py code file
-'''
-
-filenames = [
-    "layers/control_flow.py", "layers/io.py", "layers/nn.py", "layers/ops.py",
-    "layers/tensor.py", "layers/learning_rate_scheduler.py",
-    "layers/detection.py", "layers/metric_op.py"
-]
-filenames += [
-    "dygraph/layers.py", "dygraph/base.py", "dygraph/nn.py",
-    "dygraph/tracer.py", "dygraph/profiler.py", "dygraph/parallel.py",
-    "dygraph/checkpoint.py", "dygraph/learning_rate_scheduler.py",
-    "dygraph/backward_strategy.py"
-]
-
-filenames += [
-    "data_feeder.py", "dataset.py", "clip.py", "metrics.py", "executor.py",
-    "initializer.py", "io.py", "nets.py", "optimizer.py", "profiler.py",
-    "regularizer.py", "backward.py", "average.py", "unique_name.py",
-    "framework.py", "evaluator.py", "param_attr.py"
-]
-
-wlist_inneed = [
-    "append_LARS", "BuildStrategy.debug_graphviz_path",
-    "BuildStrategy.enable_sequential_execution",
-    "BuildStrategy.fuse_elewise_add_act_ops",
-    "BuildStrategy.fuse_relu_depthwise_conv",
-    "BuildStrategy.gradient_scale_strategy", "BuildStrategy.reduce_strategy",
-    "BuildStrategy.remove_unnecessary_lock", "BuildStrategy.sync_batch_norm",
-    "DynamicRNN.step_input", "DynamicRNN.static_input", "DynamicRNN.block",
-    "DynamicRNN.update_memory", "DynamicRNN.output",
-    "transpiler.DistributeTranspilerConfig",
-    "transpiler.DistributeTranspilerConfig.slice_var_up",
-    "transpiler.DistributeTranspilerConfig.split_method",
-    "transpiler.DistributeTranspilerConfig.min_block_size",
-    "DistributeTranspilerConfig.slice_var_up",
-    "DistributeTranspilerConfig.split_method", "ModelAverage.apply",
-    "ModelAverage.restore", "DistributeTranspilerConfig",
-    "DistributeTranspilerConfig.min_block_size",
-    "ExecutionStrategy.allow_op_delay", "load", "Accuracy.update",
-    "ChunkEvaluator.update", "ExecutionStrategy.num_iteration_per_drop_scope",
-    "ExecutionStrategy.num_threads", "CompiledProgram._with_inference_optimize",
-    "CompositeMetric.add_metric", "CompositeMetric.update",
-    "CompositeMetric.eval", "DetectionMAP.get_map_var", "MetricBase",
-    "MetricBase.reset", "MetricBase.get_config", "MetricBase.update",
-    "MetricBase.eval", "Accuracy.eval", "Auc.update", "Auc.eval",
-    "EditDistance.update", "EditDistance.eval",
-    "ExponentialMovingAverage.apply", "ExponentialMovingAverage.restore",
-    "ExponentialMovingAverage.update", "StaticRNN.step", "StaticRNN.step_input",
-    "StaticRNN.step_output", "StaticRNN.update_memory", "DetectionMAP.reset",
-    'StaticRNN.output', "cuda_places", "CUDAPinnedPlace", "CUDAPlace",
-    "Program.parse_from_string"
-]
-
-wlist_temp = [
-    'ChunkEvaluator',
-    'EditDistance',
-    'ErrorClipByValue',
-    'Program.clone',
-    'cuda_pinned_places',
-    'DataFeeder',
-    'elementwise_floordiv',
-    'Layer',
-    'Layer.create_parameter',
-    'Layer.create_variable',
-    'Layer.sublayers',
-    'Layer.add_parameter',
-    'Layer.add_sublayer',
-    'Layer.parameters',
-    'Tracer',
-    'Layer.full_name',
-    'InMemoryDataset',
-    'layer_norm',
-    'bipartite_match',
-    'double_buffer',
-    'cumsum',
-    'thresholded_relu',
-    'group_norm',
-    'random_crop',
-    'py_func',
-    'row_conv',
-    'hard_shrink',
-    'ssd_loss',
-    'retinanet_target_assign',
-    'InMemoryDataset.global_shuffle',
-    'InMemoryDataset.get_memory_data_size',
-    'DetectionMAP',
-    'hash',
-    'InMemoryDataset.set_queue_num',
-    'LayerNorm',
-    'Preprocessor',
-    'chunk_eval',
-    'GRUUnit',
-    'ExponentialMovingAverage',
-    'QueueDataset.global_shuffle',
-    'NumpyArrayInitializer',
-    'create_py_reader_by_data',
-    'InMemoryDataset.local_shuffle',
-    'InMemoryDataset.get_shuffle_data_size',
-    'size',
-    'edit_distance',
-    'nce',
-    'BilinearInitializer',
-    'NaturalExpDecay',
-    'noam_decay',
-    'retinanet_detection_output',
-    'Pool2D',
-    'PipelineOptimizer',
-    'generate_mask_labels',
-    'isfinite',
-    'InMemoryDataset.set_fleet_send_batch_size',
-    'cuda_profiler',
-    'unfold',
-    'Executor',
-    'InMemoryDataset.load_into_memory',
-    'ExponentialDecay',
-    'BatchNorm',
-    'deformable_conv',
-    'InMemoryDataset.preload_into_memory',
-    'py_reader',
-    'linear_lr_warmup',
-    'InMemoryDataset.wait_preload_done',
-    'CosineDecay',
-    'roi_perspective_transform',
-    'unique',
-    'ones_like',
-    'LambOptimizer',
-    'InMemoryDataset.release_memory',
-    'Conv2DTranspose',
-    'QueueDataset.local_shuffle',
-    # wrong in dygraph/checkpoint.py  ok in io.py [duplicated name]
-    'save_persistables@dygraph/checkpoint.py',
-    'load_persistables@dygraph/checkpoint.py'
-]
-'''
-white list of private API/ redundant API
-'''
-wlist_ignore = [
-    'elementwise_pow', 'WeightedAverage.reset', 'ChunkEvaluator.eval',
-    'NCE.forward', 'elementwise_div', 'BilinearTensorProduct.forward',
-    'NoamDecay.step', 'elementwise_min', 'PiecewiseDecay.step',
-    'Conv3DTranspose.forward', 'elementwise_add', 'IfElse.output',
-    'IfElse.true_block', 'InverseTimeDecay.step', 'PolynomialDecay.step',
-    'Precision.eval', 'enabled', 'elementwise_max', 'stop_gperf_profiler',
-    'IfElse.false_block', 'WeightedAverage.add', 'Auc.trapezoid_area',
-    'elementwise_mul', 'GroupNorm.forward', 'SpectralNorm.forward',
-    'elementwise_sub', 'Switch.case', 'IfElse.input', 'prepare_context',
-    'PRelu.forward', 'Recall.update', 'start_gperf_profiler',
-    'TreeConv.forward', 'Conv2D.forward', 'Switch.default', 'elementwise_mod',
-    'Precision.update', 'WeightedAverage.eval', 'Conv3D.forward',
-    'Embedding.forward', 'Recall.eval', 'FC.forward', 'While.block'
-]
-
-# only white on CPU
-gpu_not_white = [
-    "deformable_conv", "cuda_places", "CUDAPinnedPlace", "CUDAPlace",
-    "cuda_profiler"
-]
-
-wlist = wlist_temp + wlist_inneed + wlist_ignore
-
-if len(sys.argv) < 2:
-    print "Error: inadequate number of arguments"
-    print('''If you are going to run it on 
-        "CPU: >>> python sampcd_processor.py cpu
-        "GPU: >>> python sampcd_processor.py gpu
-        ''')
-    sys.exit("lack arguments")
-
-else:
-
-    show_details = False
-
-    if sys.argv[1] == "gpu":
-        for _gnw in gpu_not_white:
-            wlist.remove(_gnw)
-    elif sys.argv[1] != "cpu":
-        print("Unrecognized argument:'" + sys.argv[1] + "' , 'cpu' or 'gpu' is "
-              + "desired\n")
-        sys.exit("Invalid arguments")
-
-    if len(sys.argv) == 3:
-        if sys.argv[2] == "sd":
-            show_details = True
-        else:
-            print("Unrecognized argument:'" + sys.argv[2] + "' , 'sd' is " +
-                  "desired\n")
-            sys.exit("Invalid arguments")
-
-    print("* * * * * * * * * * * * * * * * * * * * * * * *\n" +
-          "*                                             *\n" +
-          "*   API check -- Example Code Cheker          *\n" +
-          "*                                             *\n" +
-          "*                                             *\n" +
-          "*   This process is meant to check            *\n" +
-          "*   all example codes per CI to ensure        *\n" +
-          "*   the example codes can be run successfully *\n" +
-          "*                                             *\n" +
-          "*                                             *\n" +
-          "*   Refer to the comments for detailed        *\n" +
-          "*   introduction                              *\n" +
-          "*                                             *\n" +
-          "*                                             *\n" +
-          "* * * * * * * * * * * * * * * * * * * * * * * *\n")
-
-    status_all = {}
-
-    #a file to record the terminal output
-    logf = open("example-code-check-log.txt", 'w')
-
-    # a temp directory to store temporary sample code file
-    # subprocess needs a single file to run the code 
-
-    if not os.path.isdir("./samplecode_temp"):
-        os.mkdir("./samplecode_temp")
-
-    to_check = filenames
-    for filename in to_check:
-
-        srcfile = open(filename, 'r')
-
-        counts = srccoms_extract(srcfile, logf, status_all, wlist, show_details)
-
-        if show_details:
-            logf.write("\n\n" + str(counts) + "\n\n")
-
-        srcfile.close()
-
-    # clear temp files
-    for root, dirs, files in os.walk("./samplecode_temp"):
-        for fntemp in files:
-            os.remove("./samplecode_temp/" + fntemp)
-
-    os.rmdir("./samplecode_temp")
-
-    status_groups = {-2: [], -1: [], 0: [], 1: [], 2: [], 3: []}
-
-    ci_pass = True
-
-    for key in status_all:
-        statusl = status_all[key]
-        for ele in statusl:
-            if (ele != 0 and ele != -2 and ele != -1):
-                ci_pass = False
-                break
-
-        if len(statusl) == 1:
-            status_groups[statusl[0]].append(key)
-        else:
-            for u in range(0, len(statusl)):
-                status_groups[statusl[u]].append(key + '_' + str(u + 1))
-
-    logf.close()
-
-    print(
-        "\n\n------------------End of the Check-------------------------------------------\n\n"
-    )
-
-    errorapisl = status_groups[1] + status_groups[2] + status_groups[3]
-    if len(errorapisl) > 0:
-        print "Error raised from: " + str(errorapisl)
-
-    if not ci_pass:
-
-        print(
-            "\nOh no.. Mistakes found in sample codes, refer to the log for details\n\n"
-        )
-        print('''
-- How to run it locally?
-
-    Simply put this script under directory:
-    
-        Paddle/python/paddle/fluid/
-    
-    and run in python 2.7 (as some interfaces of subprocess may
-    not work in python 3)
-    
-    You must specify the device type to run the sample code on:
-    
-        CPU: >>> python sampcd_processor.py cpu
-        GPU: >>> python sampcd_processor.py gpu
-    
-- How to debug?
-        
-    This script has an option for showing the details of 
-    the execution status:
-
-    >>> python sampcd_processor.py cpu sd
-    
-- NOTE:
-
-    Please ensure your are using 
-    
-        .. code-block:: python 
-            
-            [sample code starts here]
-    
-    ONLY 1 BLANKSPACE between '::' and 'python'
-      
-              ''')
-
-        exit(1)
-    else:
-
-        print "Sample code check is successful!"
diff --git a/python/paddle/fluid/tests/.gitignore b/python/paddle/fluid/tests/.gitignore
deleted file mode 100644
index 62f82151eb42342cd90657b1e4dfc93410950e62..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-image/
-fit_a_line.model/
-tmp
-cuda_profiler.txt
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
deleted file mode 100644
index 8cfd026f8ff8e044ffbd2cc76c34843072261ab1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-if(NOT WITH_DISTRIBUTE)
-  list(REMOVE_ITEM TEST_OPS test_communicator)
-endif(NOT WITH_DISTRIBUTE)
-
-foreach(src ${TEST_OPS})
-  py_test(${src} SRCS ${src}.py)
-endforeach()
-
-add_subdirectory(unittests)
-add_subdirectory(book)
diff --git a/python/paddle/fluid/tests/__init__.py b/python/paddle/fluid/tests/__init__.py
deleted file mode 100644
index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/fluid/tests/book/.gitignore b/python/paddle/fluid/tests/book/.gitignore
deleted file mode 100644
index dd28d354f4160b4be68b46a7bebcdf2097d5811a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*.inference.model
diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt
deleted file mode 100644
index 999a765b6dc32323a24f9069f11134360dbadcb8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
-
-if(WITH_HIGH_LEVEL_API_TEST)
-  add_subdirectory(high-level-api)
-endif()
diff --git a/python/paddle/fluid/tests/book/__init__.py b/python/paddle/fluid/tests/book/__init__.py
deleted file mode 100644
index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
deleted file mode 100644
index 4712a7676948515bfc9e5f1dc8ce71a457caf24c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*_new_api.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-# This test is buggy
-# py_test(test_understand_sentiment_dynamic_rnn SRCS
-# 	test_understand_sentiment_dynamic_rnn.py SERIAL)
-LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn_new_api)
-
-if(NOT APPLE)
-    # default test
-    foreach(src ${TEST_OPS})
-        py_test(${src} SRCS ${src}.py)
-    endforeach()
-else()
-    foreach(src ${TEST_OPS})
-        if(${src} STREQUAL "test_image_classification_vgg_new_api")
-            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
-        elseif(${src} STREQUAL "test_image_classification_resnet_new_api")
-            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
-        elseif(${src} STREQUAL "test_recognize_digits_conv_new_api")
-            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
-        elseif(${src} STREQUAL "test_recognize_digits_mlp_new_api")
-            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
-        elseif()
-            py_test(${src} SRCS ${src}.py)
-            set_tests_properties(${src} PROPERTIES LABELS "RUN_TYPE=DIST")
-        endif()
-    endforeach()
-endif()
diff --git a/python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py
deleted file mode 100644
index 6f24ec45aa6f27814e489b8dce49fe69f62d4f10..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-CIFAR dataset.
-
-This module will download dataset from
-https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
-paddle reader creators.
-
-The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
-with 6000 images per class. There are 50000 training images and 10000 test
-images.
-
-The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
-containing 600 images each. There are 500 training images and 100 testing
-images per class.
-
-"""
-
-from __future__ import print_function
-
-import itertools
-import numpy
-import paddle.dataset.common
-import tarfile
-import six
-from six.moves import cPickle as pickle
-
-__all__ = ['train10']
-
-URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
-CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
-CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
-
-
-def reader_creator(filename, sub_name, batch_size=None):
-    def read_batch(batch):
-        data = batch[six.b('data')]
-        labels = batch.get(
-            six.b('labels'), batch.get(six.b('fine_labels'), None))
-        assert labels is not None
-        for sample, label in six.moves.zip(data, labels):
-            yield (sample / 255.0).astype(numpy.float32), int(label)
-
-    def reader():
-        with tarfile.open(filename, mode='r') as f:
-            names = [
-                each_item.name for each_item in f if sub_name in each_item.name
-            ]
-
-            batch_count = 0
-            for name in names:
-                if six.PY2:
-                    batch = pickle.load(f.extractfile(name))
-                else:
-                    batch = pickle.load(f.extractfile(name), encoding='bytes')
-                for item in read_batch(batch):
-                    if isinstance(batch_size, int) and batch_count > batch_size:
-                        break
-                    batch_count += 1
-                    yield item
-
-    return reader
-
-
-def train10(batch_size=None):
-    """
-    CIFAR-10 training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch',
-        batch_size=batch_size)
-
-
-def test10(batch_size=None):
-    """
-    CIFAR-10 test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch',
-        batch_size=batch_size)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py
deleted file mode 100644
index 87f3b7502e26d3e6a437985f99d7897b060e101e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py
+++ /dev/null
@@ -1,187 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import sys
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-import contextlib
-import numpy
-import unittest
-
-# train reader
-BATCH_SIZE = 20
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.uci_housing.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
-
-test_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.uci_housing.test(), buf_size=500),
-    batch_size=BATCH_SIZE)
-
-
-def inference_program():
-    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-    return y_predict
-
-
-def train_program():
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-    y_predict = inference_program()
-
-    loss = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_loss = fluid.layers.mean(loss)
-
-    return [avg_loss, y_predict]
-
-
-def optimizer_func():
-    return fluid.optimizer.SGD(learning_rate=0.001)
-
-
-def train(use_cuda, train_program, params_dirname, inference_model_dirname):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    trainer = Trainer(
-        train_func=train_program, place=place, optimizer_func=optimizer_func)
-
-    def event_handler(event):
-        if isinstance(event, EndStepEvent):
-            if event.step == 10:
-                test_metrics = trainer.test(
-                    reader=test_reader, feed_order=['x', 'y'])
-                print(test_metrics)
-                '''
-                ...
-                ['25.768919467926025']
-                ['15.343549569447836']
-                ...
-                '''
-                if params_dirname is not None:
-                    trainer.save_params(params_dirname)
-                    trainer.save_inference_model(inference_model_dirname,
-                                                 ['x'], [1])
-                trainer.stop()
-
-    trainer.train(
-        reader=train_reader,
-        num_epochs=100,
-        event_handler=event_handler,
-        feed_order=['x', 'y'])
-
-
-# infer
-def infer(use_cuda, inference_program, params_dirname=None):
-    if params_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
-
-    batch_size = 10
-    tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
-
-    results = inferencer.infer({'x': tensor_x})
-    print("infer results: ", results[0])
-
-
-def infer_by_saved_model(use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        # The input's dimension should be 2-D and the second dim is 13
-        # The input data should be >= 0
-        batch_size = 10
-
-        test_reader = paddle.batch(
-            paddle.dataset.uci_housing.test(), batch_size=batch_size)
-
-        test_data = next(test_reader())
-        test_feat = numpy.array(
-            [data[0] for data in test_data]).astype("float32")
-        test_label = numpy.array(
-            [data[1] for data in test_data]).astype("float32")
-
-        assert feed_target_names[0] == 'x'
-        results = exe.run(inference_program,
-                          feed={feed_target_names[0]: numpy.array(test_feat)},
-                          fetch_list=fetch_targets)
-        print("infer shape: ", results[0].shape)
-        print("infer results: ", results[0])
-        print("ground truth: ", test_label)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    # Directory for saving the trained model
-    params_dirname = "fit_a_line.model"
-    inference_model_dirname = "fit_a_line.inference_model"
-
-    train(use_cuda, train_program, params_dirname, inference_model_dirname)
-    infer(use_cuda, inference_program, params_dirname)
-    infer_by_saved_model(use_cuda, inference_model_dirname)
-
-
-class TestFitALine(unittest.TestCase):
-    def test_cpu(self):
-        with self.program_scope_guard():
-            with fluid.unique_name.guard():
-                main(use_cuda=False)
-
-    def test_cuda(self):
-        with self.program_scope_guard():
-            with fluid.unique_name.guard():
-                main(use_cuda=True)
-
-    @contextlib.contextmanager
-    def program_scope_guard(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py
deleted file mode 100644
index e87c1d58c812f3186658a78c4280f16d34c466ee..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py
+++ /dev/null
@@ -1,194 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-
-import paddle
-import paddle.fluid as fluid
-
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-import paddle.fluid.core as core
-import numpy
-import os
-import cifar10_small_test_set
-
-
-def resnet_cifar10(input, depth=32):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    def shortcut(input, ch_in, ch_out, stride):
-        if ch_in != ch_out:
-            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-        else:
-            return input
-
-    def basicblock(input, ch_in, ch_out, stride):
-        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
-        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
-        short = shortcut(input, ch_in, ch_out, stride)
-        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
-
-    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
-        tmp = block_func(input, ch_in, ch_out, stride)
-        for i in range(1, count):
-            tmp = block_func(tmp, ch_out, ch_out, 1)
-        return tmp
-
-    assert (depth - 2) % 6 == 0
-    n = (depth - 2) // 6
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    predict = fluid.layers.fc(input=pool, size=10, act='softmax')
-    return predict
-
-
-def inference_network():
-    data_shape = [3, 32, 32]
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    predict = resnet_cifar10(images, 32)
-    return predict
-
-
-def train_network():
-    predict = inference_network()
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=predict, label=label)
-    return [avg_cost, accuracy]
-
-
-def optimizer_func():
-    return fluid.optimizer.Adam(learning_rate=0.001)
-
-
-def train(use_cuda, train_program, parallel, params_dirname):
-    BATCH_SIZE = 128
-    EPOCH_NUM = 1
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE,
-        drop_last=False)
-
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
-
-    def event_handler(event):
-        if isinstance(event, EndStepEvent):
-            avg_cost, accuracy = trainer.test(
-                reader=test_reader, feed_order=['pixel', 'label'])
-
-            print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
-
-            if accuracy > 0.01:  # Low threshold for speeding up CI
-                if params_dirname is not None:
-                    trainer.save_params(params_dirname)
-                return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    trainer = Trainer(
-        train_func=train_program,
-        optimizer_func=optimizer_func,
-        place=place,
-        parallel=parallel)
-
-    trainer.train(
-        reader=train_reader,
-        num_epochs=EPOCH_NUM,
-        event_handler=event_handler,
-        feed_order=['pixel', 'label'])
-
-
-def infer(use_cuda, inference_program, parallel, params_dirname=None):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = Inferencer(
-        infer_func=inference_program,
-        param_path=params_dirname,
-        place=place,
-        parallel=parallel)
-
-    # The input's dimension of conv should be 4-D or 5-D.
-    # Use normilized image pixels as input data, which should be in the range
-    # [0, 1.0].
-    tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
-    results = inferencer.infer({'pixel': tensor_img})
-
-    print("infer results: ", results)
-
-
-def main(use_cuda, parallel):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    save_path = "image_classification_resnet.inference.model"
-
-    os.environ['CPU_NUM'] = str(4)
-    train(
-        use_cuda=use_cuda,
-        train_program=train_network,
-        params_dirname=save_path,
-        parallel=parallel)
-
-    # FIXME(zcd): in the inference stage, the number of
-    # input data is one, it is not appropriate to use parallel.
-    if parallel and use_cuda:
-        return
-
-    os.environ['CPU_NUM'] = str(1)
-    infer(
-        use_cuda=use_cuda,
-        inference_program=inference_network,
-        params_dirname=save_path,
-        parallel=parallel)
-
-
-if __name__ == '__main__':
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-    if not on_ci:
-        for use_cuda in (False, True):
-            for parallel in (False, True):
-                if use_cuda and not core.is_compiled_with_cuda():
-                    continue
-                main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py
deleted file mode 100644
index 0a27aa0fcfece36f1a8ae5ad0477d75a15fd88da..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py
+++ /dev/null
@@ -1,168 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-
-import paddle
-import paddle.fluid as fluid
-
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-import paddle.fluid.core as core
-import numpy
-import os
-import cifar10_small_test_set
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
-    predict = fluid.layers.fc(input=fc2, size=10, act='softmax')
-    return predict
-
-
-def inference_network():
-    data_shape = [3, 32, 32]
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    predict = vgg16_bn_drop(images)
-    return predict
-
-
-def train_network():
-    predict = inference_network()
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=predict, label=label)
-    return [avg_cost, accuracy]
-
-
-def optimizer_func():
-    return fluid.optimizer.Adam(learning_rate=0.001)
-
-
-def train(use_cuda, train_program, parallel, params_dirname):
-    BATCH_SIZE = 128
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE,
-        drop_last=False)
-    # Use only part of the test set data validation program
-    test_reader = paddle.batch(
-        cifar10_small_test_set.test10(BATCH_SIZE),
-        batch_size=BATCH_SIZE,
-        drop_last=False)
-
-    def event_handler(event):
-        if isinstance(event, EndStepEvent):
-            avg_cost, accuracy = trainer.test(
-                reader=test_reader, feed_order=['pixel', 'label'])
-
-            print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
-
-            if accuracy > 0.01:  # Low threshold for speeding up CI
-                if params_dirname is not None:
-                    trainer.save_params(params_dirname)
-                return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    trainer = Trainer(
-        train_func=train_program,
-        place=place,
-        optimizer_func=optimizer_func,
-        parallel=parallel)
-
-    trainer.train(
-        reader=train_reader,
-        num_epochs=1,
-        event_handler=event_handler,
-        feed_order=['pixel', 'label'])
-
-
-def infer(use_cuda, inference_program, parallel, params_dirname=None):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = Inferencer(
-        infer_func=inference_program,
-        param_path=params_dirname,
-        place=place,
-        parallel=parallel)
-
-    # The input's dimension of conv should be 4-D or 5-D.
-    # Use normilized image pixels as input data, which should be in the range
-    # [0, 1.0].
-    tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
-    results = inferencer.infer({'pixel': tensor_img})
-
-    print("infer results: ", results)
-
-
-def main(use_cuda, parallel):
-    save_path = "image_classification_vgg.inference.model"
-
-    os.environ['CPU_NUM'] = str(4)
-    train(
-        use_cuda=use_cuda,
-        train_program=train_network,
-        params_dirname=save_path,
-        parallel=parallel)
-
-    # FIXME(zcd): in the inference stage, the number of
-    # input data is one, it is not appropriate to use parallel.
-    if parallel and use_cuda:
-        return
-    os.environ['CPU_NUM'] = str(1)
-    infer(
-        use_cuda=use_cuda,
-        inference_program=inference_network,
-        params_dirname=save_path,
-        parallel=parallel)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        for parallel in (False, True):
-            if use_cuda and not core.is_compiled_with_cuda():
-                continue
-            main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py
deleted file mode 100755
index 9e155a59145db88dab27576a4a67a5d450bcfc9d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py
+++ /dev/null
@@ -1,276 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import sys
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-import numpy as np
-
-WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict()
-WORD_DICT_LEN = len(WORD_DICT)
-LABEL_DICT_LEN = len(LABEL_DICT)
-PRED_DICT_LEN = len(VERB_DICT)
-MARK_DICT_LEN = 2
-IS_SPARSE = True
-BATCH_SIZE = 10
-EMBEDDING_NAME = 'emb'
-
-
-def lstm_net():
-    WORD_DIM = 32
-    MARK_DIM = 5
-    HIDDEN_DIM = 512
-    DEPTH = 8
-
-    # Data definitions
-    word = fluid.layers.data(
-        name='word_data', shape=[1], dtype='int64', lod_level=1)
-    predicate = fluid.layers.data(
-        name='verb_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_n2 = fluid.layers.data(
-        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_n1 = fluid.layers.data(
-        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_0 = fluid.layers.data(
-        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_p1 = fluid.layers.data(
-        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_p2 = fluid.layers.data(
-        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-    mark = fluid.layers.data(
-        name='mark_data', shape=[1], dtype='int64', lod_level=1)
-
-    # 8 features
-    predicate_embedding = fluid.layers.embedding(
-        input=predicate,
-        size=[PRED_DICT_LEN, WORD_DIM],
-        dtype='float32',
-        is_sparse=IS_SPARSE,
-        param_attr='vemb')
-
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        size=[MARK_DICT_LEN, MARK_DIM],
-        dtype='float32',
-        is_sparse=IS_SPARSE)
-
-    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-    emb_layers = [
-        fluid.layers.embedding(
-            size=[WORD_DICT_LEN, WORD_DIM],
-            input=x,
-            param_attr=fluid.ParamAttr(name=EMBEDDING_NAME))
-        for x in word_input
-        #name=EMBEDDING_NAME, trainable=False)) for x in word_input
-    ]
-    emb_layers.append(predicate_embedding)
-    emb_layers.append(mark_embedding)
-
-    hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=HIDDEN_DIM, act='tanh')
-        for emb in emb_layers
-    ]
-
-    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
-
-    lstm_0 = fluid.layers.dynamic_lstm(
-        input=hidden_0,
-        size=HIDDEN_DIM,
-        candidate_activation='relu',
-        gate_activation='sigmoid',
-        cell_activation='sigmoid')
-
-    # stack L-LSTM and R-LSTM with direct edges
-    input_tmp = [hidden_0, lstm_0]
-
-    for i in range(1, DEPTH):
-        mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=HIDDEN_DIM, act='tanh'),
-            fluid.layers.fc(input=input_tmp[1], size=HIDDEN_DIM, act='tanh')
-        ])
-
-        lstm = fluid.layers.dynamic_lstm(
-            input=mix_hidden,
-            size=HIDDEN_DIM,
-            candidate_activation='relu',
-            gate_activation='sigmoid',
-            cell_activation='sigmoid',
-            is_reverse=((i % 2) == 1))
-
-        input_tmp = [mix_hidden, lstm]
-
-    feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=LABEL_DICT_LEN, act='tanh'),
-        fluid.layers.fc(input=input_tmp[1], size=LABEL_DICT_LEN, act='tanh')
-    ])
-
-    return feature_out
-
-
-def inference_program():
-    predict = lstm_net()
-
-    return predict
-
-
-def train_program():
-    MIX_HIDDEN_LR = 1e-3
-
-    predict = lstm_net()
-    target = fluid.layers.data(
-        name='target', shape=[1], dtype='int64', lod_level=1)
-    crf_cost = fluid.layers.linear_chain_crf(
-        input=predict,
-        label=target,
-        param_attr=fluid.ParamAttr(
-            name='crfw', learning_rate=MIX_HIDDEN_LR))
-    avg_cost = fluid.layers.mean(crf_cost)
-
-    return [avg_cost]
-
-
-def optimize_func():
-    return fluid.optimizer.SGD(learning_rate=fluid.layers.exponential_decay(
-        learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True))
-
-
-def train(use_cuda, train_program, params_dirname):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    trainer = Trainer(
-        train_func=train_program, place=place, optimizer_func=optimize_func)
-
-    feed_order = [
-        'word_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
-        'ctx_p2_data', 'verb_data', 'mark_data', 'target'
-    ]
-
-    #embedding_param = fluid.global_scope().find_var(
-    #        EMBEDDING_NAME).get_tensor()
-    #embedding_param.set(
-    #        load_parameter(conll05.get_embedding(), WORD_DICT_LEN, WORD_DIM),
-    #        place)
-
-    def event_handler(event):
-        if isinstance(event, EndEpochEvent):
-            test_reader = paddle.batch(
-                paddle.dataset.conll05.test(), batch_size=BATCH_SIZE)
-            avg_cost_set = trainer.test(
-                reader=test_reader, feed_order=feed_order)
-
-            # get avg cost
-            avg_cost = np.array(avg_cost_set).mean()
-
-            print("avg_cost: %s" % avg_cost)
-
-            if float(avg_cost) < 100.0:  # Large value to increase CI speed
-                trainer.save_params(params_dirname)
-            else:
-                print(
-                    ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
-                                                             float(avg_cost))))
-                if math.isnan(float(avg_cost)):
-                    sys.exit("got NaN loss, training failed.")
-
-        elif isinstance(event, EndStepEvent):
-            print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, list(map(np.array, event.metrics))))
-            if event.step == 1:  # Run 2 iterations to speed CI
-                trainer.save_params(params_dirname)
-                trainer.stop()
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.conll05.test(), buf_size=8192),
-        batch_size=BATCH_SIZE)
-    trainer.train(
-        num_epochs=1,
-        event_handler=event_handler,
-        reader=train_reader,
-        feed_order=feed_order)
-
-
-def infer(use_cuda, inference_program, params_dirname):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = Inferencer(
-        inference_program, param_path=params_dirname, place=place)
-
-    # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of
-    # each word (base_shape) should be [1] since it is simply an index to
-    # look up for the corresponding word vector.
-    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only
-    # one higher level structure (sequence of words, or sentence) than the basic
-    # element (word). Hence the LoDTensor will hold data for three sentences of
-    # length 3, 4 and 2, respectively.
-    # Note that recursive_sequence_lengths should be a list of lists.
-    recursive_seq_lens = [[3, 4, 2]]
-    base_shape = [1]
-    # The range of random integers is [low, high]
-    word = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
-    ctx_n2 = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
-    ctx_n1 = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
-    ctx_0 = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
-    ctx_p1 = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
-    ctx_p2 = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
-    pred = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
-    mark = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=MARK_DICT_LEN - 1)
-
-    results = inferencer.infer(
-        {
-            'word_data': word,
-            'ctx_n2_data': ctx_n2,
-            'ctx_n1_data': ctx_n1,
-            'ctx_0_data': ctx_0,
-            'ctx_p1_data': ctx_p1,
-            'ctx_p2_data': ctx_p2,
-            'verb_data': pred,
-            'mark_data': mark
-        },
-        return_numpy=False)
-
-    print("infer results: ", np.array(results[0]).shape)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    params_dirname = "label_semantic_roles.inference.model"
-    train(use_cuda, train_program, params_dirname)
-    infer(use_cuda, inference_program, params_dirname)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py
deleted file mode 100644
index b597dcf801dc5ad4b5957875634018cfdcd0b83b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py
+++ /dev/null
@@ -1,335 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import contextlib
-import sys
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-import paddle.fluid.framework as framework
-import paddle.fluid.layers as pd
-from paddle.fluid.executor import Executor
-from functools import partial
-import unittest
-
-dict_size = 30000
-source_dict_dim = target_dict_dim = dict_size
-hidden_dim = 32
-word_dim = 16
-batch_size = 2
-max_length = 8
-topk_size = 50
-trg_dic_size = 10000
-beam_size = 2
-
-decoder_size = hidden_dim
-
-
-def encoder(is_sparse):
-    # encoder
-    src_word_id = pd.data(
-        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
-    src_embedding = pd.embedding(
-        input=src_word_id,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr=fluid.ParamAttr(name='vemb'))
-
-    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
-    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
-    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
-    return encoder_out
-
-
-def train_decoder(context, is_sparse):
-    # decoder
-    trg_language_word = pd.data(
-        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
-    trg_embedding = pd.embedding(
-        input=trg_language_word,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr=fluid.ParamAttr(name='vemb'))
-
-    rnn = pd.DynamicRNN()
-    with rnn.block():
-        current_word = rnn.step_input(trg_embedding)
-        pre_state = rnn.memory(init=context)
-        current_state = pd.fc(input=[current_word, pre_state],
-                              size=decoder_size,
-                              act='tanh')
-
-        current_score = pd.fc(input=current_state,
-                              size=target_dict_dim,
-                              act='softmax')
-        rnn.update_memory(pre_state, current_state)
-        rnn.output(current_score)
-
-    return rnn()
-
-
-def decode(context, is_sparse):
-    init_state = context
-    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
-    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
-
-    # fill the first element with init_state
-    state_array = pd.create_array('float32')
-    pd.array_write(init_state, array=state_array, i=counter)
-
-    # ids, scores as memory
-    ids_array = pd.create_array('int64')
-    scores_array = pd.create_array('float32')
-
-    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
-    init_scores = pd.data(
-        name="init_scores", shape=[1], dtype="float32", lod_level=2)
-
-    pd.array_write(init_ids, array=ids_array, i=counter)
-    pd.array_write(init_scores, array=scores_array, i=counter)
-
-    cond = pd.less_than(x=counter, y=array_len)
-
-    while_op = pd.While(cond=cond)
-    with while_op.block():
-        pre_ids = pd.array_read(array=ids_array, i=counter)
-        pre_state = pd.array_read(array=state_array, i=counter)
-        pre_score = pd.array_read(array=scores_array, i=counter)
-
-        # expand the lod of pre_state to be the same with pre_score
-        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
-
-        pre_ids_emb = pd.embedding(
-            input=pre_ids,
-            size=[dict_size, word_dim],
-            dtype='float32',
-            is_sparse=is_sparse)
-
-        # use rnn unit to update rnn
-        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
-                              size=decoder_size,
-                              act='tanh')
-        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
-        # use score to do beam search
-        current_score = pd.fc(input=current_state_with_lod,
-                              size=target_dict_dim,
-                              act='softmax')
-        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
-        # calculate accumulated scores after topk to reduce computation cost
-        accu_scores = pd.elementwise_add(
-            x=pd.log(topk_scores), y=pd.reshape(
-                pre_score, shape=[-1]), axis=0)
-        selected_ids, selected_scores = pd.beam_search(
-            pre_ids,
-            pre_score,
-            topk_indices,
-            accu_scores,
-            beam_size,
-            end_id=10,
-            level=0)
-
-        pd.increment(x=counter, value=1, in_place=True)
-
-        # update the memories
-        pd.array_write(current_state, array=state_array, i=counter)
-        pd.array_write(selected_ids, array=ids_array, i=counter)
-        pd.array_write(selected_scores, array=scores_array, i=counter)
-
-        # update the break condition: up to the max length or all candidates of
-        # source sentences have ended.
-        length_cond = pd.less_than(x=counter, y=array_len)
-        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
-        pd.logical_and(x=length_cond, y=finish_cond, out=cond)
-
-    translation_ids, translation_scores = pd.beam_search_decode(
-        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)
-
-    # return init_ids, init_scores
-
-    return translation_ids, translation_scores
-
-
-def train_program(is_sparse):
-    context = encoder(is_sparse)
-    rnn_out = train_decoder(context, is_sparse)
-    label = pd.data(
-        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
-    cost = pd.cross_entropy(input=rnn_out, label=label)
-    avg_cost = pd.mean(cost)
-    return avg_cost
-
-
-def optimizer_func():
-    return fluid.optimizer.Adagrad(
-        learning_rate=1e-4,
-        regularization=fluid.regularizer.L2DecayRegularizer(
-            regularization_coeff=0.1))
-
-
-def train(use_cuda, is_sparse, is_local=True):
-    EPOCH_NUM = 1
-
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
-
-    feed_order = [
-        'src_word_id', 'target_language_word', 'target_language_next_word'
-    ]
-
-    def event_handler(event):
-        if isinstance(event, EndStepEvent):
-            print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step))
-            if event.step == 10:
-                trainer.stop()
-
-    trainer = Trainer(
-        train_func=partial(train_program, is_sparse),
-        place=place,
-        optimizer_func=optimizer_func)
-
-    trainer.train(
-        reader=train_reader,
-        num_epochs=EPOCH_NUM,
-        event_handler=event_handler,
-        feed_order=feed_order)
-
-
-def decode_main(use_cuda, is_sparse):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    context = encoder(is_sparse)
-    translation_ids, translation_scores = decode(context, is_sparse)
-
-    exe = Executor(place)
-    exe.run(framework.default_startup_program())
-
-    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
-    init_scores_data = np.array(
-        [1. for _ in range(batch_size)], dtype='float32')
-    init_ids_data = init_ids_data.reshape((batch_size, 1))
-    init_scores_data = init_scores_data.reshape((batch_size, 1))
-    init_recursive_seq_lens = [1] * batch_size
-    init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
-
-    init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
-                                       place)
-    init_scores = fluid.create_lod_tensor(init_scores_data,
-                                          init_recursive_seq_lens, place)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
-
-    feed_order = ['src_word_id']
-    feed_list = [
-        framework.default_main_program().global_block().var(var_name)
-        for var_name in feed_order
-    ]
-    feeder = fluid.DataFeeder(feed_list, place)
-
-    for data in train_data():
-        feed_dict = feeder.feed([[x[0]] for x in data])
-        feed_dict['init_ids'] = init_ids
-        feed_dict['init_scores'] = init_scores
-
-        result_ids, result_scores = exe.run(
-            framework.default_main_program(),
-            feed=feed_dict,
-            fetch_list=[translation_ids, translation_scores],
-            return_numpy=False)
-        print(result_ids.recursive_sequence_lengths())
-        break
-
-
-class TestMachineTranslation(unittest.TestCase):
-    pass
-
-
-@contextlib.contextmanager
-def scope_prog_guard():
-    prog = fluid.Program()
-    startup_prog = fluid.Program()
-    scope = fluid.core.Scope()
-    with fluid.scope_guard(scope):
-        with fluid.program_guard(prog, startup_prog):
-            yield
-
-
-def inject_test_train(use_cuda, is_sparse):
-    f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
-                                         if is_sparse else 'dense')
-
-    def f(*args):
-        with scope_prog_guard():
-            train(use_cuda, is_sparse)
-
-    setattr(TestMachineTranslation, f_name, f)
-
-
-def inject_test_decode(use_cuda, is_sparse, decorator=None):
-    f_name = 'test_{0}_{1}_decode'.format('cuda'
-                                          if use_cuda else 'cpu', 'sparse'
-                                          if is_sparse else 'dense')
-
-    def f(*args):
-        with scope_prog_guard():
-            decode_main(use_cuda, is_sparse)
-
-    if decorator is not None:
-        f = decorator(f)
-
-    setattr(TestMachineTranslation, f_name, f)
-
-
-for _use_cuda_ in (False, True):
-    for _is_sparse_ in (False, True):
-        inject_test_train(_use_cuda_, _is_sparse_)
-
-for _use_cuda_ in (False, True):
-    for _is_sparse_ in (False, True):
-
-        _decorator_ = None
-        if _use_cuda_:
-            _decorator_ = unittest.skip(
-                reason='Beam Search does not support CUDA!')
-
-        inject_test_decode(
-            is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py
deleted file mode 100644
index ce183883e3bddd8633dd9c393ee358ba6210ea61..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-
-import paddle.fluid as fluid
-
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-import paddle.fluid.core as core
-import paddle
-import numpy
-import math
-import sys
-import os
-
-BATCH_SIZE = 64
-
-
-def inference_program():
-    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
-    return prediction
-
-
-def train_program():
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    predict = inference_program()
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    acc = fluid.layers.accuracy(input=predict, label=label)
-    return [avg_cost, acc]
-
-
-def optimizer_func():
-    return fluid.optimizer.Adam(learning_rate=0.001)
-
-
-def train(use_cuda, train_program, parallel, params_dirname):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    trainer = Trainer(
-        train_func=train_program,
-        place=place,
-        optimizer_func=optimizer_func,
-        parallel=parallel)
-
-    def event_handler(event):
-        if isinstance(event, EndEpochEvent):
-            test_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
-            avg_cost, acc = trainer.test(
-                reader=test_reader, feed_order=['img', 'label'])
-
-            print("avg_cost: %s" % avg_cost)
-            print("acc     : %s" % acc)
-
-            if acc > 0.2:  # Smaller value to increase CI speed
-                trainer.save_params(params_dirname)
-            else:
-                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
-                    event.epoch + 1, avg_cost, acc))
-                if math.isnan(avg_cost):
-                    sys.exit("got NaN loss, training failed.")
-        elif isinstance(event, EndStepEvent):
-            print(
-                ("Step {0}, Epoch {1} Metrics {2}".format(
-                    event.step, event.epoch,
-                    list(map(numpy.array, event.metrics)))))
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.mnist.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
-
-    trainer.train(
-        num_epochs=1,
-        event_handler=event_handler,
-        reader=train_reader,
-        feed_order=['img', 'label'])
-
-
-def infer(use_cuda, inference_program, parallel, params_dirname=None):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    inferencer = Inferencer(
-        infer_func=inference_program,
-        param_path=params_dirname,
-        place=place,
-        parallel=parallel)
-
-    batch_size = 1
-    tensor_img = numpy.random.uniform(-1.0, 1.0,
-                                      [batch_size, 1, 28, 28]).astype("float32")
-
-    results = inferencer.infer({'img': tensor_img})
-
-    print("infer results: ", results[0])
-
-
-def main(use_cuda, parallel):
-    params_dirname = "recognize_digits_conv.inference.model"
-
-    # call train() with is_local argument to run distributed train
-    os.environ['CPU_NUM'] = str(4)
-    train(
-        use_cuda=use_cuda,
-        train_program=train_program,
-        params_dirname=params_dirname,
-        parallel=parallel)
-
-    # FIXME(zcd): in the inference stage, the number of
-    # input data is one, it is not appropriate to use parallel.
-    if parallel and use_cuda:
-        return
-    os.environ['CPU_NUM'] = str(1)
-    infer(
-        use_cuda=use_cuda,
-        inference_program=inference_program,
-        params_dirname=params_dirname,
-        parallel=parallel)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        for parallel in (False, True):
-            if use_cuda and not core.is_compiled_with_cuda():
-                continue
-            main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py
deleted file mode 100644
index 45a5ff34af00f2dbe69bd4f08a50626d6ca814f8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-
-import paddle.fluid as fluid
-
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-import paddle.fluid.core as core
-import paddle
-import numpy
-import math
-import sys
-import os
-
-BATCH_SIZE = 64
-
-
-def inference_program():
-    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-
-    hidden = fluid.layers.fc(input=img, size=200, act='tanh')
-    hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
-    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    return prediction
-
-
-def train_program():
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    predict = inference_program()
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    acc = fluid.layers.accuracy(input=predict, label=label)
-    return [avg_cost, acc]
-
-
-def optimizer_func():
-    return fluid.optimizer.Adam(learning_rate=0.001)
-
-
-def train(use_cuda, train_program, params_dirname, parallel):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    trainer = Trainer(
-        train_func=train_program,
-        place=place,
-        optimizer_func=optimizer_func,
-        parallel=parallel)
-
-    def event_handler(event):
-        if isinstance(event, EndEpochEvent):
-            test_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
-            avg_cost, acc = trainer.test(
-                reader=test_reader, feed_order=['img', 'label'])
-
-            print("avg_cost: %s" % avg_cost)
-            print("acc     : %s" % acc)
-
-            if acc > 0.2:  # Smaller value to increase CI speed
-                trainer.save_params(params_dirname)
-            else:
-                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
-                    event.epoch + 1, avg_cost, acc))
-                if math.isnan(avg_cost):
-                    sys.exit("got NaN loss, training failed.")
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.mnist.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
-
-    trainer.train(
-        num_epochs=1,
-        event_handler=event_handler,
-        reader=train_reader,
-        feed_order=['img', 'label'])
-
-
-def infer(use_cuda, inference_program, parallel, params_dirname=None):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    inferencer = Inferencer(
-        infer_func=inference_program,
-        param_path=params_dirname,
-        place=place,
-        parallel=parallel)
-
-    batch_size = 1
-    tensor_img = numpy.random.uniform(-1.0, 1.0,
-                                      [batch_size, 1, 28, 28]).astype("float32")
-
-    results = inferencer.infer({'img': tensor_img})
-
-    print("infer results: ", results[0])
-
-
-def main(use_cuda, parallel):
-    params_dirname = "recognize_digits_mlp.inference.model"
-
-    # call train() with is_local argument to run distributed train
-    os.environ['CPU_NUM'] = str(4)
-    train(
-        use_cuda=use_cuda,
-        train_program=train_program,
-        params_dirname=params_dirname,
-        parallel=parallel)
-
-    # FIXME(zcd): in the inference stage, the number of
-    # input data is one, it is not appropriate to use parallel.
-    if parallel and use_cuda:
-        return
-    os.environ['CPU_NUM'] = str(1)
-    infer(
-        use_cuda=use_cuda,
-        inference_program=inference_program,
-        params_dirname=params_dirname,
-        parallel=parallel)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        for parallel in (False, True):
-            if use_cuda and not core.is_compiled_with_cuda():
-                continue
-            main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py
deleted file mode 100644
index 07afa742c6b7d28b129192e4b9ffc41a405d3367..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py
+++ /dev/null
@@ -1,276 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import math
-import sys
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import sys
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-import paddle.fluid.layers as layers
-import paddle.fluid.nets as nets
-
-IS_SPARSE = True
-USE_GPU = False
-BATCH_SIZE = 256
-
-
-def get_usr_combined_features():
-    # FIXME(dzh) : old API integer_value(10) may have range check.
-    # currently we don't have user configurated check.
-
-    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
-
-    uid = layers.data(name='user_id', shape=[1], dtype='int64')
-
-    usr_emb = layers.embedding(
-        input=uid,
-        dtype='float32',
-        size=[USR_DICT_SIZE, 32],
-        param_attr='user_table',
-        is_sparse=IS_SPARSE)
-
-    usr_fc = layers.fc(input=usr_emb, size=32)
-
-    USR_GENDER_DICT_SIZE = 2
-
-    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
-
-    usr_gender_emb = layers.embedding(
-        input=usr_gender_id,
-        size=[USR_GENDER_DICT_SIZE, 16],
-        param_attr='gender_table',
-        is_sparse=IS_SPARSE)
-
-    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
-
-    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
-    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
-
-    usr_age_emb = layers.embedding(
-        input=usr_age_id,
-        size=[USR_AGE_DICT_SIZE, 16],
-        is_sparse=IS_SPARSE,
-        param_attr='age_table')
-
-    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
-
-    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
-    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
-
-    usr_job_emb = layers.embedding(
-        input=usr_job_id,
-        size=[USR_JOB_DICT_SIZE, 16],
-        param_attr='job_table',
-        is_sparse=IS_SPARSE)
-
-    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
-
-    concat_embed = layers.concat(
-        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
-
-    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
-
-    return usr_combined_features
-
-
-def get_mov_combined_features():
-
-    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
-
-    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
-
-    mov_emb = layers.embedding(
-        input=mov_id,
-        dtype='float32',
-        size=[MOV_DICT_SIZE, 32],
-        param_attr='movie_table',
-        is_sparse=IS_SPARSE)
-
-    mov_fc = layers.fc(input=mov_emb, size=32)
-
-    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
-
-    category_id = layers.data(
-        name='category_id', shape=[1], dtype='int64', lod_level=1)
-
-    mov_categories_emb = layers.embedding(
-        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
-
-    mov_categories_hidden = layers.sequence_pool(
-        input=mov_categories_emb, pool_type="sum")
-
-    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
-
-    mov_title_id = layers.data(
-        name='movie_title', shape=[1], dtype='int64', lod_level=1)
-
-    mov_title_emb = layers.embedding(
-        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
-
-    mov_title_conv = nets.sequence_conv_pool(
-        input=mov_title_emb,
-        num_filters=32,
-        filter_size=3,
-        act="tanh",
-        pool_type="sum")
-
-    concat_embed = layers.concat(
-        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
-
-    # FIXME(dzh) : need tanh operator
-    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
-
-    return mov_combined_features
-
-
-def inference_program():
-    usr_combined_features = get_usr_combined_features()
-    mov_combined_features = get_mov_combined_features()
-
-    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
-    scale_infer = layers.scale(x=inference, scale=5.0)
-
-    return scale_infer
-
-
-def train_program():
-
-    scale_infer = inference_program()
-
-    label = layers.data(name='score', shape=[1], dtype='float32')
-    square_cost = layers.square_error_cost(input=scale_infer, label=label)
-    avg_cost = layers.mean(square_cost)
-
-    return [avg_cost, scale_infer]
-
-
-def optimizer_func():
-    return fluid.optimizer.SGD(learning_rate=0.2)
-
-
-def train(use_cuda, train_program, params_dirname):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    trainer = Trainer(
-        train_func=train_program, place=place, optimizer_func=optimizer_func)
-
-    feed_order = [
-        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
-        'movie_title', 'score'
-    ]
-
-    def event_handler(event):
-        if isinstance(event, EndStepEvent):
-            test_reader = paddle.batch(
-                paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
-            avg_cost_set = trainer.test(
-                reader=test_reader, feed_order=feed_order)
-
-            # get avg cost
-            avg_cost = np.array(avg_cost_set).mean()
-
-            print("avg_cost: %s" % avg_cost)
-
-            if float(avg_cost) < 4:  # Smaller value to increase CI speed
-                trainer.save_params(params_dirname)
-                trainer.stop()
-            else:
-                print(
-                    ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
-                                                             float(avg_cost))))
-                if math.isnan(float(avg_cost)):
-                    sys.exit("got NaN loss, training failed.")
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.movielens.train(), buf_size=8192),
-        batch_size=BATCH_SIZE)
-
-    trainer.train(
-        num_epochs=1,
-        event_handler=event_handler,
-        reader=train_reader,
-        feed_order=feed_order)
-
-
-def infer(use_cuda, inference_program, params_dirname):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = Inferencer(
-        inference_program, param_path=params_dirname, place=place)
-
-    # Use the first data from paddle.dataset.movielens.test() as input.
-    # Use create_lod_tensor(data, recursive_sequence_lengths, place) API 
-    # to generate LoD Tensor where `data` is a list of sequences of index 
-    # numbers, `recursive_sequence_lengths` is the length-based level of detail 
-    # (lod) info associated with `data`.
-    # For example, data = [[10, 2, 3], [2, 3]] means that it contains
-    # two sequences of indexes, of length 3 and 2, respectively.
-    # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one 
-    # level of detail info, indicating that `data` consists of two sequences 
-    # of length 3 and 2, respectively. 
-    user_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
-    gender_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
-    age_id = fluid.create_lod_tensor([[np.int64(0)]], [[1]], place)
-    job_id = fluid.create_lod_tensor([[np.int64(10)]], [[1]], place)
-    movie_id = fluid.create_lod_tensor([[np.int64(783)]], [[1]], place)
-    category_id = fluid.create_lod_tensor(
-        [np.array(
-            [10, 8, 9], dtype='int64')], [[3]], place)
-    movie_title = fluid.create_lod_tensor(
-        [np.array(
-            [1069, 4140, 2923, 710, 988], dtype='int64')], [[5]], place)
-
-    results = inferencer.infer(
-        {
-            'user_id': user_id,
-            'gender_id': gender_id,
-            'age_id': age_id,
-            'job_id': job_id,
-            'movie_id': movie_id,
-            'category_id': category_id,
-            'movie_title': movie_title
-        },
-        return_numpy=False)
-
-    print("infer results: ", np.array(results[0]))
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    params_dirname = "recommender_system.inference.model"
-    train(
-        use_cuda=use_cuda,
-        train_program=train_program,
-        params_dirname=params_dirname)
-    infer(
-        use_cuda=use_cuda,
-        inference_program=inference_program,
-        params_dirname=params_dirname)
-
-
-if __name__ == '__main__':
-    main(USE_GPU)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py
deleted file mode 100644
index 14719774b9d90c2e96d8f6134469502241a5f1f2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import sys
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-from functools import partial
-import numpy as np
-
-CLASS_DIM = 2
-EMB_DIM = 128
-HID_DIM = 512
-BATCH_SIZE = 128
-
-
-def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=3,
-        act="tanh",
-        pool_type="sqrt")
-    conv_4 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=4,
-        act="tanh",
-        pool_type="sqrt")
-    prediction = fluid.layers.fc(input=[conv_3, conv_4],
-                                 size=class_dim,
-                                 act="softmax")
-    return prediction
-
-
-def inference_program(word_dict):
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-
-    dict_dim = len(word_dict)
-    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
-    return net
-
-
-def train_program(word_dict):
-    prediction = inference_program(word_dict)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return [avg_cost, accuracy]
-
-
-def optimizer_func():
-    return fluid.optimizer.Adagrad(learning_rate=0.002)
-
-
-def train(use_cuda, train_program, params_dirname):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    trainer = Trainer(
-        train_func=partial(train_program, word_dict),
-        place=place,
-        optimizer_func=optimizer_func)
-
-    def event_handler(event):
-        if isinstance(event, EndEpochEvent):
-            test_reader = paddle.batch(
-                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
-            avg_cost, acc = trainer.test(
-                reader=test_reader, feed_order=['words', 'label'])
-
-            print("avg_cost: %s" % avg_cost)
-            print("acc     : %s" % acc)
-
-            if acc > 0.2:  # Smaller value to increase CI speed
-                trainer.save_params(params_dirname)
-                trainer.stop()
-
-            else:
-                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
-                    event.epoch + 1, avg_cost, acc))
-                if math.isnan(avg_cost):
-                    sys.exit("got NaN loss, training failed.")
-        elif isinstance(event, EndStepEvent):
-            print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, list(map(np.array, event.metrics))))
-            if event.step == 1:  # Run 2 iterations to speed CI
-                trainer.save_params(params_dirname)
-                trainer.stop()
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=25000),
-        batch_size=BATCH_SIZE)
-
-    trainer.train(
-        num_epochs=1,
-        event_handler=event_handler,
-        reader=train_reader,
-        feed_order=['words', 'label'])
-
-
-def infer(use_cuda, inference_program, params_dirname=None):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    word_dict = paddle.dataset.imdb.word_dict()
-
-    inferencer = Inferencer(
-        infer_func=partial(inference_program, word_dict),
-        param_path=params_dirname,
-        place=place)
-
-    # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of
-    # each word (base_shape) should be [1] since it is simply an index to
-    # look up for the corresponding word vector.
-    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only
-    # one higher level structure (sequence of words, or sentence) than the basic
-    # element (word). Hence the LoDTensor will hold data for three sentences of
-    # length 3, 4 and 2, respectively.
-    # Note that recursive_sequence_lengths should be a list of lists.
-    recursive_seq_lens = [[3, 4, 2]]
-    base_shape = [1]
-    # The range of random integers is [low, high]
-    tensor_words = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
-    results = inferencer.infer({'words': tensor_words})
-    print("infer results: ", results)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    params_dirname = "understand_sentiment_conv.inference.model"
-    train(use_cuda, train_program, params_dirname)
-    infer(use_cuda, inference_program, params_dirname)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py
deleted file mode 100644
index 62fbba6fe1a62da6a93d50abc074bf5d794cf458..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import sys
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-from functools import partial
-import numpy as np
-
-CLASS_DIM = 2
-EMB_DIM = 128
-BATCH_SIZE = 128
-LSTM_SIZE = 128
-
-
-def dynamic_rnn_lstm(data, input_dim, class_dim, emb_dim, lstm_size):
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-    sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh')
-
-    rnn = fluid.layers.DynamicRNN()
-    with rnn.block():
-        word = rnn.step_input(sentence)
-        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
-        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
-
-        def gate_common(ipt, hidden, size):
-            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
-            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
-            return gate0 + gate1
-
-        forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                         lstm_size))
-        input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                        lstm_size))
-        output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                         lstm_size))
-        cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                       lstm_size))
-
-        cell = forget_gate * prev_cell + input_gate * cell_gate
-        hidden = output_gate * fluid.layers.tanh(x=cell)
-        rnn.update_memory(prev_cell, cell)
-        rnn.update_memory(prev_hidden, hidden)
-        rnn.output(hidden)
-
-    last = fluid.layers.sequence_last_step(rnn())
-    prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax")
-    return prediction
-
-
-def inference_program(word_dict):
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-
-    dict_dim = len(word_dict)
-    pred = dynamic_rnn_lstm(data, dict_dim, CLASS_DIM, EMB_DIM, LSTM_SIZE)
-    return pred
-
-
-def train_program(word_dict):
-    prediction = inference_program(word_dict)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return [avg_cost, accuracy]
-
-
-def optimizer_func():
-    return fluid.optimizer.Adagrad(learning_rate=0.002)
-
-
-def train(use_cuda, train_program, params_dirname):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    trainer = Trainer(
-        train_func=partial(train_program, word_dict),
-        place=place,
-        optimizer_func=optimizer_func)
-
-    def event_handler(event):
-        if isinstance(event, EndEpochEvent):
-            test_reader = paddle.batch(
-                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
-            avg_cost, acc = trainer.test(
-                reader=test_reader, feed_order=['words', 'label'])
-
-            print("avg_cost: %s" % avg_cost)
-            print("acc     : %s" % acc)
-
-            if acc > 0.2:  # Smaller value to increase CI speed
-                trainer.save_params(params_dirname)
-                trainer.stop()
-
-            else:
-                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
-                    event.epoch + 1, avg_cost, acc))
-                if math.isnan(avg_cost):
-                    sys.exit("got NaN loss, training failed.")
-        elif isinstance(event, EndStepEvent):
-            print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, list(map(np.array, event.metrics))))
-            if event.step == 1:  # Run 2 iterations to speed CI
-                trainer.save_params(params_dirname)
-                trainer.stop()
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=25000),
-        batch_size=BATCH_SIZE)
-
-    trainer.train(
-        num_epochs=1,
-        event_handler=event_handler,
-        reader=train_reader,
-        feed_order=['words', 'label'])
-
-
-def infer(use_cuda, inference_program, params_dirname=None):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    word_dict = paddle.dataset.imdb.word_dict()
-
-    inferencer = Inferencer(
-        infer_func=partial(inference_program, word_dict),
-        param_path=params_dirname,
-        place=place)
-
-    # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of
-    # each word (base_shape) should be [1] since it is simply an index to
-    # look up for the corresponding word vector.
-    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only
-    # one higher level structure (sequence of words, or sentence) than the basic
-    # element (word). Hence the LoDTensor will hold data for three sentences of
-    # length 3, 4 and 2, respectively.
-    # Note that recursive_sequence_lengths should be a list of lists.
-    recursive_seq_lens = [[3, 4, 2]]
-    base_shape = [1]
-    # The range of random integers is [low, high]
-    tensor_words = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
-    results = inferencer.infer({'words': tensor_words})
-    print("infer results: ", results)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    params_dirname = "understand_sentiment_conv.inference.model"
-    train(use_cuda, train_program, params_dirname)
-    infer(use_cuda, inference_program, params_dirname)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py
deleted file mode 100644
index 7523ad3fef17f61b1bde1fc687761cc6b86c3d9e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import sys
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-from functools import partial
-import numpy as np
-
-CLASS_DIM = 2
-EMB_DIM = 128
-HID_DIM = 512
-STACKED_NUM = 3
-BATCH_SIZE = 128
-
-
-def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
-    assert stacked_num % 2 == 1
-
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-
-    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
-    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
-
-    inputs = [fc1, lstm1]
-
-    for i in range(2, stacked_num + 1):
-        fc = fluid.layers.fc(input=inputs, size=hid_dim)
-        lstm, cell = fluid.layers.dynamic_lstm(
-            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
-        inputs = [fc, lstm]
-
-    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
-    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
-
-    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
-                                 size=class_dim,
-                                 act='softmax')
-    return prediction
-
-
-def inference_program(word_dict):
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-
-    dict_dim = len(word_dict)
-    net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM,
-                           STACKED_NUM)
-    return net
-
-
-def train_program(word_dict):
-    prediction = inference_program(word_dict)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return [avg_cost, accuracy]
-
-
-def optimizer_func():
-    return fluid.optimizer.Adagrad(learning_rate=0.002)
-
-
-def train(use_cuda, train_program, params_dirname):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    trainer = Trainer(
-        train_func=partial(train_program, word_dict),
-        place=place,
-        optimizer_func=optimizer_func)
-
-    def event_handler(event):
-        if isinstance(event, EndEpochEvent):
-            test_reader = paddle.batch(
-                paddle.dataset.imdb.test(word_dict),
-                batch_size=BATCH_SIZE,
-                drop_last=False)
-            avg_cost, acc = trainer.test(
-                reader=test_reader, feed_order=['words', 'label'])
-
-            print("avg_cost: %s" % avg_cost)
-            print("acc     : %s" % acc)
-
-            if acc > 0.2:  # Smaller value to increase CI speed
-                trainer.save_params(params_dirname)
-                trainer.stop()
-
-            else:
-                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
-                    event.epoch + 1, avg_cost, acc))
-                if math.isnan(avg_cost):
-                    sys.exit("got NaN loss, training failed.")
-        elif isinstance(event, EndStepEvent):
-            print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, list(map(np.array, event.metrics))))
-            if event.step == 1:  # Run 2 iterations to speed CI
-                trainer.save_params(params_dirname)
-                trainer.stop()
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=25000),
-        batch_size=BATCH_SIZE,
-        drop_last=False)
-
-    trainer.train(
-        num_epochs=1,
-        event_handler=event_handler,
-        reader=train_reader,
-        feed_order=['words', 'label'])
-
-
-def infer(use_cuda, inference_program, params_dirname=None):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    word_dict = paddle.dataset.imdb.word_dict()
-
-    inferencer = Inferencer(
-        infer_func=partial(inference_program, word_dict),
-        param_path=params_dirname,
-        place=place)
-
-    # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of
-    # each word (base_shape) should be [1] since it is simply an index to
-    # look up for the corresponding word vector.
-    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only
-    # one higher level structure (sequence of words, or sentence) than the basic
-    # element (word). Hence the LoDTensor will hold data for three sentences of
-    # length 3, 4 and 2, respectively.
-    # Note that recursive_sequence_lengths should be a list of lists.
-    recursive_seq_lens = [[3, 4, 2]]
-    base_shape = [1]
-    # The range of random integers is [low, high]
-    tensor_words = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
-    results = inferencer.infer({'words': tensor_words})
-    print("infer results: ", results)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    params_dirname = "understand_sentiment_stacked_lstm.inference.model"
-    train(use_cuda, train_program, params_dirname)
-    infer(use_cuda, inference_program, params_dirname)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py
deleted file mode 100644
index e4c0cc5429d3fe891034161d90fadfa9dd078b0b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py
+++ /dev/null
@@ -1,187 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import sys
-try:
-    from paddle.fluid.contrib.trainer import *
-    from paddle.fluid.contrib.inferencer import *
-except ImportError:
-    print(
-        "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib",
-        file=sys.stderr)
-    from paddle.fluid.trainer import *
-    from paddle.fluid.inferencer import *
-import numpy as np
-import math
-import sys
-from functools import partial
-
-PASS_NUM = 100
-EMBED_SIZE = 32
-HIDDEN_SIZE = 256
-N = 5
-BATCH_SIZE = 32
-
-word_dict = paddle.dataset.imikolov.build_dict()
-dict_size = len(word_dict)
-
-
-def inference_program(is_sparse):
-    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
-    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
-    forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
-
-    embed_first = fluid.layers.embedding(
-        input=first_word,
-        size=[dict_size, EMBED_SIZE],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr='shared_w')
-    embed_second = fluid.layers.embedding(
-        input=second_word,
-        size=[dict_size, EMBED_SIZE],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr='shared_w')
-    embed_third = fluid.layers.embedding(
-        input=third_word,
-        size=[dict_size, EMBED_SIZE],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr='shared_w')
-    embed_forth = fluid.layers.embedding(
-        input=forth_word,
-        size=[dict_size, EMBED_SIZE],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr='shared_w')
-
-    concat_embed = fluid.layers.concat(
-        input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
-    hidden1 = fluid.layers.fc(input=concat_embed,
-                              size=HIDDEN_SIZE,
-                              act='sigmoid')
-    predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
-    return predict_word
-
-
-def train_program(is_sparse):
-    # The declaration of 'next_word' must be after the invoking of inference_program,
-    # or the data input order of train program would be [next_word, firstw, secondw,
-    # thirdw, forthw], which is not correct.
-    predict_word = inference_program(is_sparse)
-    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
-    cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
-    avg_cost = fluid.layers.mean(cost)
-    return avg_cost
-
-
-def optimizer_func():
-    return fluid.optimizer.SGD(learning_rate=0.001)
-
-
-def train(use_cuda, train_program, params_dirname):
-    train_reader = paddle.batch(
-        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
-    test_reader = paddle.batch(
-        paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    def event_handler(event):
-        if isinstance(event, EndStepEvent):
-            outs = trainer.test(
-                reader=test_reader,
-                feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
-            avg_cost = outs[0]
-            print("loss= ", avg_cost)
-
-            if avg_cost < 10.0:
-                trainer.save_params(params_dirname)
-                trainer.stop()
-
-            if math.isnan(avg_cost):
-                sys.exit("got NaN loss, training failed.")
-
-    trainer = Trainer(
-        train_func=train_program, optimizer_func=optimizer_func, place=place)
-
-    trainer.train(
-        reader=train_reader,
-        num_epochs=1,
-        event_handler=event_handler,
-        feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
-
-
-def infer(use_cuda, inference_program, params_dirname=None):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
-
-    # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
-    # is simply an index to look up for the corresponding word vector and hence 
-    # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, 
-    # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] 
-    # meaning there is only one level of detail and there is only one sequence of 
-    # one word on this level.
-    # Note that recursive_sequence_lengths should be a list of lists.
-    recursive_seq_lens = [[1]]
-    base_shape = [1]
-    # The range of random integers is [low, high]
-    first_word = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-    second_word = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-    third_word = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-    fourth_word = fluid.create_random_int_lodtensor(
-        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-
-    result = inferencer.infer(
-        {
-            'firstw': first_word,
-            'secondw': second_word,
-            'thirdw': third_word,
-            'forthw': fourth_word
-        },
-        return_numpy=False)
-    print(np.array(result[0]))
-
-
-def main(use_cuda, is_sparse):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    params_dirname = "word2vec.inference.model"
-
-    train(
-        use_cuda=use_cuda,
-        train_program=partial(train_program, is_sparse),
-        params_dirname=params_dirname)
-
-    infer(
-        use_cuda=use_cuda,
-        inference_program=partial(inference_program, is_sparse),
-        params_dirname=params_dirname)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        for is_sparse in (False, True):
-            main(use_cuda=use_cuda, is_sparse=is_sparse)
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
deleted file mode 100644
index 5658bb4ec44e7319c384daed2d8b2d4c420c4160..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from paddle.fluid.layers.device import get_places
-import unittest
-import paddle.fluid as fluid
-import paddle
-import contextlib
-import math
-import numpy as np
-import sys
-import os
-
-
-def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
-                    hid_dim=32):
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=3,
-        act="tanh",
-        pool_type="sqrt")
-    conv_4 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=4,
-        act="tanh",
-        pool_type="sqrt")
-    prediction = fluid.layers.fc(input=[conv_3, conv_4],
-                                 size=class_dim,
-                                 act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, prediction
-
-
-def dyn_rnn_lstm(data, label, input_dim, class_dim=2, emb_dim=32,
-                 lstm_size=128):
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-    sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh')
-
-    rnn = fluid.layers.DynamicRNN()
-    with rnn.block():
-        word = rnn.step_input(sentence)
-        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
-        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
-
-        def gate_common(ipt, hidden, size):
-            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
-            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
-            return gate0 + gate1
-
-        forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                         lstm_size))
-        input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                        lstm_size))
-        output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                         lstm_size))
-        cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                       lstm_size))
-
-        cell = forget_gate * prev_cell + input_gate * cell_gate
-        hidden = output_gate * fluid.layers.tanh(x=cell)
-        rnn.update_memory(prev_cell, cell)
-        rnn.update_memory(prev_hidden, hidden)
-        rnn.output(hidden)
-
-    last = fluid.layers.sequence_last_step(rnn())
-    prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, prediction
-
-
-def stacked_lstm_net(data,
-                     label,
-                     input_dim,
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3):
-    assert stacked_num % 2 == 1
-
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-    # add bias attr
-
-    # TODO(qijun) linear act
-    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
-    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
-
-    inputs = [fc1, lstm1]
-
-    for i in range(2, stacked_num + 1):
-        fc = fluid.layers.fc(input=inputs, size=hid_dim)
-        lstm, cell = fluid.layers.dynamic_lstm(
-            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
-        inputs = [fc, lstm]
-
-    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
-    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
-
-    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
-                                 size=class_dim,
-                                 act='softmax')
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, prediction
-
-
-def train(word_dict,
-          net_method,
-          use_cuda,
-          parallel=False,
-          save_dirname=None,
-          is_local=True):
-    BATCH_SIZE = 128
-    PASS_NUM = 5
-    dict_dim = len(word_dict)
-    class_dim = 2
-
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-    if not parallel:
-        cost, acc_out, prediction = net_method(
-            data, label, input_dim=dict_dim, class_dim=class_dim)
-    else:
-        raise NotImplementedError()
-
-    adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
-    adagrad.minimize(cost)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-
-    def train_loop(main_program):
-        exe.run(fluid.default_startup_program())
-
-        for pass_id in range(PASS_NUM):
-            for data in train_data():
-                cost_val, acc_val = exe.run(main_program,
-                                            feed=feeder.feed(data),
-                                            fetch_list=[cost, acc_out])
-                print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-                if cost_val < 0.4 and acc_val > 0.8:
-                    if save_dirname is not None:
-                        fluid.io.save_inference_model(save_dirname, ["words"],
-                                                      prediction, exe)
-                    return
-                if math.isnan(float(cost_val)):
-                    sys.exit("got NaN loss, training failed.")
-        raise AssertionError("Cost is too large for {0}".format(
-            net_method.__name__))
-
-    if is_local:
-        train_loop(fluid.default_main_program())
-    else:
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def infer(word_dict, use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        word_dict_len = len(word_dict)
-
-        # Setup input by creating LoDTensor to represent sequence of words.
-        # Here each word is the basic element of the LoDTensor and the shape of
-        # each word (base_shape) should be [1] since it is simply an index to
-        # look up for the corresponding word vector.
-        # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-        # which has only one level of detail. Then the created LoDTensor will have only
-        # one higher level structure (sequence of words, or sentence) than the basic
-        # element (word). Hence the LoDTensor will hold data for three sentences of
-        # length 3, 4 and 2, respectively.
-        # Note that recursive_sequence_lengths should be a list of lists.
-        recursive_seq_lens = [[3, 4, 2]]
-        base_shape = [1]
-        # The range of random integers is [low, high]
-        tensor_words = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        assert feed_target_names[0] == "words"
-        results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_words},
-                          fetch_list=fetch_targets,
-                          return_numpy=False)
-        print(results[0].recursive_sequence_lengths())
-        np_data = np.array(results[0])
-        print("Inference Shape: ", np_data.shape)
-        print("Inference results: ", np_data)
-
-
-def main(word_dict, net_method, use_cuda, parallel=False, save_dirname=None):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    train(
-        word_dict,
-        net_method,
-        use_cuda,
-        parallel=parallel,
-        save_dirname=save_dirname)
-    infer(word_dict, use_cuda, save_dirname)
-
-
-class TestUnderstandSentiment(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.word_dict = paddle.dataset.imdb.word_dict()
-
-    @contextlib.contextmanager
-    def new_program_scope(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-    def test_conv_cpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=False,
-                save_dirname="understand_sentiment_conv.inference.model")
-
-    def test_conv_cpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=False,
-                parallel=True)
-
-    @unittest.skip(reason="make CI faster")
-    def test_stacked_lstm_cpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=False,
-                save_dirname="understand_sentiment_stacked_lstm.inference.model")
-
-    def test_stacked_lstm_cpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=False,
-                parallel=True)
-
-    def test_conv_gpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=True,
-                save_dirname="understand_sentiment_conv.inference.model")
-
-    def test_conv_gpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=True,
-                parallel=True)
-
-    @unittest.skip(reason="make CI faster")
-    def test_stacked_lstm_gpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=True,
-                save_dirname="understand_sentiment_stacked_lstm.inference.model")
-
-    def test_stacked_lstm_gpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=True,
-                parallel=True)
-
-    @unittest.skip(reason='make CI faster')
-    def test_dynrnn_lstm_gpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=dyn_rnn_lstm,
-                use_cuda=True,
-                parallel=False)
-
-    def test_dynrnn_lstm_gpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=dyn_rnn_lstm,
-                use_cuda=True,
-                parallel=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
deleted file mode 100644
index 334294ab485cf203aa0ccf680a53010322d3af3b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ /dev/null
@@ -1,165 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import contextlib
-import numpy
-import unittest
-import math
-import sys
-import os
-
-
-def train(use_cuda, save_dirname, is_local):
-    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-
-    BATCH_SIZE = 20
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.uci_housing.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    def train_loop(main_program):
-        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-        exe.run(fluid.default_startup_program())
-
-        PASS_NUM = 100
-        for pass_id in range(PASS_NUM):
-            for data in train_reader():
-                avg_loss_value, = exe.run(main_program,
-                                          feed=feeder.feed(data),
-                                          fetch_list=[avg_cost])
-                print(avg_loss_value)
-                if avg_loss_value[0] < 10.0:
-                    if save_dirname is not None:
-                        fluid.io.save_inference_model(save_dirname, ['x'],
-                                                      [y_predict], exe)
-                    return
-                if math.isnan(float(avg_loss_value)):
-                    sys.exit("got NaN loss, training failed.")
-        raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
-            avg_loss_value[0]))
-
-    if is_local:
-        train_loop(fluid.default_main_program())
-    else:
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def infer(use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        # The input's dimension should be 2-D and the second dim is 13
-        # The input data should be >= 0
-        batch_size = 10
-
-        test_reader = paddle.batch(
-            paddle.dataset.uci_housing.test(), batch_size=batch_size)
-
-        test_data = next(test_reader())
-        test_feat = numpy.array(
-            [data[0] for data in test_data]).astype("float32")
-        test_label = numpy.array(
-            [data[1] for data in test_data]).astype("float32")
-
-        assert feed_target_names[0] == 'x'
-        results = exe.run(inference_program,
-                          feed={feed_target_names[0]: numpy.array(test_feat)},
-                          fetch_list=fetch_targets)
-        print("infer shape: ", results[0].shape)
-        print("infer results: ", results[0])
-        print("ground truth: ", test_label)
-
-
-def main(use_cuda, is_local=True):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    # Directory for saving the trained model
-    save_dirname = "fit_a_line.inference.model"
-
-    train(use_cuda, save_dirname, is_local)
-    infer(use_cuda, save_dirname)
-
-
-class TestFitALine(unittest.TestCase):
-    def test_cpu(self):
-        with self.program_scope_guard():
-            main(use_cuda=False)
-
-    def test_cuda(self):
-        with self.program_scope_guard():
-            main(use_cuda=True)
-
-    @contextlib.contextmanager
-    def program_scope_guard(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
deleted file mode 100644
index 95d71d72c156484eddc4eaf26aaa61bb5a93b1b1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ /dev/null
@@ -1,275 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import contextlib
-import math
-import sys
-import numpy
-import unittest
-import os
-import numpy as np
-
-
-def resnet_cifar10(input, depth=32):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    def shortcut(input, ch_in, ch_out, stride):
-        if ch_in != ch_out:
-            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-        else:
-            return input
-
-    def basicblock(input, ch_in, ch_out, stride):
-        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
-        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
-        short = shortcut(input, ch_in, ch_out, stride)
-        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
-
-    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
-        tmp = block_func(input, ch_in, ch_out, stride)
-        for i in range(1, count):
-            tmp = block_func(tmp, ch_out, ch_out, 1)
-        return tmp
-
-    assert (depth - 2) % 6 == 0
-    n = (depth - 2) // 6
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    return pool
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
-    return fc2
-
-
-def train(net_type, use_cuda, save_dirname, is_local):
-    classdim = 10
-    data_shape = [3, 32, 32]
-
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if net_type == "vgg":
-        print("train vgg net")
-        net = vgg16_bn_drop(images)
-    elif net_type == "resnet":
-        print("train resnet")
-        net = resnet_cifar10(images, 32)
-    else:
-        raise ValueError("%s network is not supported" % net_type)
-
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    acc = fluid.layers.accuracy(input=predict, label=label)
-
-    # Test program
-    test_program = fluid.default_main_program().clone(for_test=True)
-
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-    optimizer.minimize(avg_cost)
-
-    BATCH_SIZE = 128
-    PASS_NUM = 1
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10(), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
-
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
-
-    def train_loop(main_program):
-        exe.run(fluid.default_startup_program())
-        loss = 0.0
-        for pass_id in range(PASS_NUM):
-            for batch_id, data in enumerate(train_reader()):
-                exe.run(main_program, feed=feeder.feed(data))
-
-                if (batch_id % 10) == 0:
-                    acc_list = []
-                    avg_loss_list = []
-                    for tid, test_data in enumerate(test_reader()):
-                        loss_t, acc_t = exe.run(program=test_program,
-                                                feed=feeder.feed(test_data),
-                                                fetch_list=[avg_cost, acc])
-                        if math.isnan(float(loss_t)):
-                            sys.exit("got NaN loss, training failed.")
-                        acc_list.append(float(acc_t))
-                        avg_loss_list.append(float(loss_t))
-                        break  # Use 1 segment for speeding up CI
-
-                    acc_value = numpy.array(acc_list).mean()
-                    avg_loss_value = numpy.array(avg_loss_list).mean()
-
-                    print(
-                        'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
-                        format(pass_id, batch_id + 1,
-                               float(avg_loss_value), float(acc_value)))
-
-                    if acc_value > 0.01:  # Low threshold for speeding up CI
-                        fluid.io.save_inference_model(save_dirname, ["pixel"],
-                                                      [predict], exe)
-                        return
-
-    if is_local:
-        train_loop(fluid.default_main_program())
-    else:
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def infer(use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        # The input's dimension of conv should be 4-D or 5-D.
-        # Use normilized image pixels as input data, which should be in the range [0, 1.0].
-        batch_size = 1
-        tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_img},
-                          fetch_list=fetch_targets)
-
-        print("infer results: ", results[0])
-
-        fluid.io.save_inference_model(save_dirname, feed_target_names,
-                                      fetch_targets, exe, inference_program)
-
-
-def main(net_type, use_cuda, is_local=True):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    # Directory for saving the trained model
-    save_dirname = "image_classification_" + net_type + ".inference.model"
-
-    train(net_type, use_cuda, save_dirname, is_local)
-    infer(use_cuda, save_dirname)
-
-
-class TestImageClassification(unittest.TestCase):
-    def test_vgg_cuda(self):
-        with self.scope_prog_guard():
-            main('vgg', use_cuda=True)
-
-    def test_resnet_cuda(self):
-        with self.scope_prog_guard():
-            main('resnet', use_cuda=True)
-
-    def test_vgg_cpu(self):
-        with self.scope_prog_guard():
-            main('vgg', use_cuda=False)
-
-    def test_resnet_cpu(self):
-        with self.scope_prog_guard():
-            main('resnet', use_cuda=False)
-
-    @contextlib.contextmanager
-    def scope_prog_guard(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
deleted file mode 100644
index 3d40b762281ae09d3214f2d2bc496c4966984866..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ /dev/null
@@ -1,378 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import contextlib
-import math
-import numpy as np
-import os
-import time
-import unittest
-
-import paddle
-import paddle.dataset.conll05 as conll05
-import paddle.fluid as fluid
-
-word_dict, verb_dict, label_dict = conll05.get_dict()
-word_dict_len = len(word_dict)
-label_dict_len = len(label_dict)
-pred_dict_len = len(verb_dict)
-
-mark_dict_len = 2
-word_dim = 32
-mark_dim = 5
-hidden_dim = 512
-depth = 8
-mix_hidden_lr = 1e-3
-
-IS_SPARSE = True
-PASS_NUM = 2
-BATCH_SIZE = 10
-
-embedding_name = 'emb'
-
-
-def load_parameter(file_name, h, w):
-    with open(file_name, 'rb') as f:
-        f.read(16)  # skip header.
-        return np.fromfile(f, dtype=np.float32).reshape(h, w)
-
-
-def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
-            **ignored):
-    # 8 features
-    predicate_embedding = fluid.layers.embedding(
-        input=predicate,
-        size=[pred_dict_len, word_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE,
-        param_attr='vemb')
-
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        size=[mark_dict_len, mark_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE)
-
-    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-    emb_layers = [
-        fluid.layers.embedding(
-            size=[word_dict_len, word_dim],
-            input=x,
-            param_attr=fluid.ParamAttr(
-                name=embedding_name, trainable=False)) for x in word_input
-    ]
-    emb_layers.append(predicate_embedding)
-    emb_layers.append(mark_embedding)
-
-    hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
-    ]
-
-    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
-
-    lstm_0 = fluid.layers.dynamic_lstm(
-        input=hidden_0,
-        size=hidden_dim,
-        candidate_activation='relu',
-        gate_activation='sigmoid',
-        cell_activation='sigmoid')
-
-    # stack L-LSTM and R-LSTM with direct edges
-    input_tmp = [hidden_0, lstm_0]
-
-    for i in range(1, depth):
-        mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
-            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
-        ])
-
-        lstm = fluid.layers.dynamic_lstm(
-            input=mix_hidden,
-            size=hidden_dim,
-            candidate_activation='relu',
-            gate_activation='sigmoid',
-            cell_activation='sigmoid',
-            is_reverse=((i % 2) == 1))
-
-        input_tmp = [mix_hidden, lstm]
-
-    feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
-        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
-    ])
-
-    return feature_out
-
-
-def train(use_cuda, save_dirname=None, is_local=True):
-    # define network topology
-    word = fluid.layers.data(
-        name='word_data', shape=[1], dtype='int64', lod_level=1)
-    predicate = fluid.layers.data(
-        name='verb_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_n2 = fluid.layers.data(
-        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_n1 = fluid.layers.data(
-        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_0 = fluid.layers.data(
-        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_p1 = fluid.layers.data(
-        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_p2 = fluid.layers.data(
-        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-    mark = fluid.layers.data(
-        name='mark_data', shape=[1], dtype='int64', lod_level=1)
-    feature_out = db_lstm(**locals())
-    target = fluid.layers.data(
-        name='target', shape=[1], dtype='int64', lod_level=1)
-    crf_cost = fluid.layers.linear_chain_crf(
-        input=feature_out,
-        label=target,
-        param_attr=fluid.ParamAttr(
-            name='crfw', learning_rate=mix_hidden_lr))
-    avg_cost = fluid.layers.mean(crf_cost)
-
-    # TODO(qiao)
-    # check other optimizers and check why out will be NAN
-    sgd_optimizer = fluid.optimizer.SGD(
-        learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.01,
-            decay_steps=100000,
-            decay_rate=0.5,
-            staircase=True))
-    sgd_optimizer.minimize(avg_cost)
-
-    # TODO(qiao)
-    # add dependency track and move this config before optimizer
-    crf_decode = fluid.layers.crf_decoding(
-        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.conll05.test(), buf_size=8192),
-        batch_size=BATCH_SIZE)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    feeder = fluid.DataFeeder(
-        feed_list=[
-            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
-        ],
-        place=place)
-    exe = fluid.Executor(place)
-
-    def train_loop(main_program):
-        exe.run(fluid.default_startup_program())
-        embedding_param = fluid.global_scope().find_var(
-            embedding_name).get_tensor()
-        embedding_param.set(
-            load_parameter(conll05.get_embedding(), word_dict_len, word_dim),
-            place)
-
-        start_time = time.time()
-        batch_id = 0
-        for pass_id in range(PASS_NUM):
-            for data in train_data():
-                cost = exe.run(main_program,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_cost])
-                cost = cost[0]
-
-                if batch_id % 10 == 0:
-                    print("avg_cost:" + str(cost))
-                    if batch_id != 0:
-                        print("second per batch: " + str((time.time(
-                        ) - start_time) / batch_id))
-                    # Set the threshold low to speed up the CI test
-                    if float(cost) < 80.0:
-                        if save_dirname is not None:
-                            # TODO(liuyiqun): Change the target to crf_decode
-                            fluid.io.save_inference_model(save_dirname, [
-                                'word_data', 'verb_data', 'ctx_n2_data',
-                                'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
-                                'ctx_p2_data', 'mark_data'
-                            ], [feature_out], exe)
-                        return
-
-                batch_id = batch_id + 1
-
-        raise RuntimeError(
-            "This model should save_inference_model and return, but not reach here, please check!"
-        )
-
-    if is_local:
-        train_loop(fluid.default_main_program())
-    else:
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def infer(use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        # Setup input by creating LoDTensor to represent sequence of words.
-        # Here each word is the basic element of the LoDTensor and the shape of
-        # each word (base_shape) should be [1] since it is simply an index to
-        # look up for the corresponding word vector.
-        # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-        # which has only one level of detail. Then the created LoDTensor will have only
-        # one higher level structure (sequence of words, or sentence) than the basic
-        # element (word). Hence the LoDTensor will hold data for three sentences of
-        # length 3, 4 and 2, respectively.
-        # Note that recursive_sequence_lengths should be a list of lists.
-        recursive_seq_lens = [[3, 4, 2]]
-        base_shape = [1]
-        # The range of random integers is [low, high]
-        word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        pred = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=pred_dict_len - 1)
-        ctx_n2 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        ctx_n1 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        ctx_0 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        ctx_p1 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        ctx_p2 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        mark = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=mark_dict_len - 1)
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        assert feed_target_names[0] == 'word_data'
-        assert feed_target_names[1] == 'verb_data'
-        assert feed_target_names[2] == 'ctx_n2_data'
-        assert feed_target_names[3] == 'ctx_n1_data'
-        assert feed_target_names[4] == 'ctx_0_data'
-        assert feed_target_names[5] == 'ctx_p1_data'
-        assert feed_target_names[6] == 'ctx_p2_data'
-        assert feed_target_names[7] == 'mark_data'
-
-        results = exe.run(inference_program,
-                          feed={
-                              feed_target_names[0]: word,
-                              feed_target_names[1]: pred,
-                              feed_target_names[2]: ctx_n2,
-                              feed_target_names[3]: ctx_n1,
-                              feed_target_names[4]: ctx_0,
-                              feed_target_names[5]: ctx_p1,
-                              feed_target_names[6]: ctx_p2,
-                              feed_target_names[7]: mark
-                          },
-                          fetch_list=fetch_targets,
-                          return_numpy=False)
-        print(results[0].recursive_sequence_lengths())
-        np_data = np.array(results[0])
-        print("Inference Shape: ", np_data.shape)
-
-
-def main(use_cuda, is_local=True):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    # Directory for saving the trained model
-    save_dirname = "label_semantic_roles.inference.model"
-
-    train(use_cuda, save_dirname, is_local)
-    infer(use_cuda, save_dirname)
-
-
-class TestLabelSemanticRoles(unittest.TestCase):
-    def test_cuda(self):
-        with self.scope_prog_guard():
-            main(use_cuda=True)
-
-    def test_cpu(self):
-        with self.scope_prog_guard():
-            main(use_cuda=False)
-
-    @contextlib.contextmanager
-    def scope_prog_guard(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
deleted file mode 100644
index 5e241aaa32727686b84a0354a11d5a92f9576a90..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ /dev/null
@@ -1,347 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import contextlib
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.framework as framework
-import paddle.fluid.layers as pd
-from paddle.fluid.executor import Executor
-import unittest
-import os
-
-dict_size = 30000
-source_dict_dim = target_dict_dim = dict_size
-hidden_dim = 32
-word_dim = 16
-batch_size = 2
-max_length = 8
-topk_size = 50
-trg_dic_size = 10000
-beam_size = 2
-
-decoder_size = hidden_dim
-
-
-def encoder(is_sparse):
-    # encoder
-    src_word_id = pd.data(
-        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
-    src_embedding = pd.embedding(
-        input=src_word_id,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr=fluid.ParamAttr(name='vemb'))
-
-    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
-    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
-    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
-    return encoder_out
-
-
-def decoder_train(context, is_sparse):
-    # decoder
-    trg_language_word = pd.data(
-        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
-    trg_embedding = pd.embedding(
-        input=trg_language_word,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr=fluid.ParamAttr(name='vemb'))
-
-    rnn = pd.DynamicRNN()
-    with rnn.block():
-        current_word = rnn.step_input(trg_embedding)
-        pre_state = rnn.memory(init=context)
-        current_state = pd.fc(input=[current_word, pre_state],
-                              size=decoder_size,
-                              act='tanh')
-
-        current_score = pd.fc(input=current_state,
-                              size=target_dict_dim,
-                              act='softmax')
-        rnn.update_memory(pre_state, current_state)
-        rnn.output(current_score)
-
-    return rnn()
-
-
-def decoder_decode(context, is_sparse):
-    init_state = context
-    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
-    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
-
-    # fill the first element with init_state
-    state_array = pd.create_array('float32')
-    pd.array_write(init_state, array=state_array, i=counter)
-
-    # ids, scores as memory
-    ids_array = pd.create_array('int64')
-    scores_array = pd.create_array('float32')
-
-    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
-    init_scores = pd.data(
-        name="init_scores", shape=[1], dtype="float32", lod_level=2)
-
-    pd.array_write(init_ids, array=ids_array, i=counter)
-    pd.array_write(init_scores, array=scores_array, i=counter)
-
-    cond = pd.less_than(x=counter, y=array_len)
-
-    while_op = pd.While(cond=cond)
-    with while_op.block():
-        pre_ids = pd.array_read(array=ids_array, i=counter)
-        pre_state = pd.array_read(array=state_array, i=counter)
-        pre_score = pd.array_read(array=scores_array, i=counter)
-
-        # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
-        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
-
-        pre_ids_emb = pd.embedding(
-            input=pre_ids,
-            size=[dict_size, word_dim],
-            dtype='float32',
-            is_sparse=is_sparse)
-
-        # use rnn unit to update rnn
-        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
-                              size=decoder_size,
-                              act='tanh')
-        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
-        # use score to do beam search
-        current_score = pd.fc(input=current_state_with_lod,
-                              size=target_dict_dim,
-                              act='softmax')
-        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
-        # calculate accumulated scores after topk to reduce computation cost
-        accu_scores = pd.elementwise_add(
-            x=pd.log(topk_scores), y=pd.reshape(
-                pre_score, shape=[-1]), axis=0)
-        selected_ids, selected_scores = pd.beam_search(
-            pre_ids,
-            pre_score,
-            topk_indices,
-            accu_scores,
-            beam_size,
-            end_id=10,
-            level=0)
-
-        pd.increment(x=counter, value=1, in_place=True)
-
-        # update the memories
-        pd.array_write(current_state, array=state_array, i=counter)
-        pd.array_write(selected_ids, array=ids_array, i=counter)
-        pd.array_write(selected_scores, array=scores_array, i=counter)
-
-        # update the break condition: up to the max length or all candidates of
-        # source sentences have ended.
-        length_cond = pd.less_than(x=counter, y=array_len)
-        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
-        pd.logical_and(x=length_cond, y=finish_cond, out=cond)
-
-    translation_ids, translation_scores = pd.beam_search_decode(
-        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)
-
-    # return init_ids, init_scores
-
-    return translation_ids, translation_scores
-
-
-def train_main(use_cuda, is_sparse, is_local=True):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    context = encoder(is_sparse)
-    rnn_out = decoder_train(context, is_sparse)
-    label = pd.data(
-        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
-    cost = pd.cross_entropy(input=rnn_out, label=label)
-    avg_cost = pd.mean(cost)
-
-    optimizer = fluid.optimizer.Adagrad(
-        learning_rate=1e-4,
-        regularization=fluid.regularizer.L2DecayRegularizer(
-            regularization_coeff=0.1))
-    optimizer.minimize(avg_cost)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
-
-    feed_order = [
-        'src_word_id', 'target_language_word', 'target_language_next_word'
-    ]
-
-    exe = Executor(place)
-
-    def train_loop(main_program):
-        exe.run(framework.default_startup_program())
-
-        feed_list = [
-            main_program.global_block().var(var_name) for var_name in feed_order
-        ]
-        feeder = fluid.DataFeeder(feed_list, place)
-
-        batch_id = 0
-        for pass_id in range(1):
-            for data in train_data():
-                outs = exe.run(main_program,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_cost])
-                avg_cost_val = np.array(outs[0])
-                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
-                      " avg_cost=" + str(avg_cost_val))
-                if batch_id > 3:
-                    break
-                batch_id += 1
-
-    if is_local:
-        train_loop(framework.default_main_program())
-    else:
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def decode_main(use_cuda, is_sparse):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    context = encoder(is_sparse)
-    translation_ids, translation_scores = decoder_decode(context, is_sparse)
-
-    exe = Executor(place)
-    exe.run(framework.default_startup_program())
-
-    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
-    init_scores_data = np.array(
-        [1. for _ in range(batch_size)], dtype='float32')
-    init_ids_data = init_ids_data.reshape((batch_size, 1))
-    init_scores_data = init_scores_data.reshape((batch_size, 1))
-    init_recursive_seq_lens = [1] * batch_size
-    init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
-
-    init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
-                                       place)
-    init_scores = fluid.create_lod_tensor(init_scores_data,
-                                          init_recursive_seq_lens, place)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
-
-    feed_order = ['src_word_id']
-    feed_list = [
-        framework.default_main_program().global_block().var(var_name)
-        for var_name in feed_order
-    ]
-    feeder = fluid.DataFeeder(feed_list, place)
-
-    for data in train_data():
-        feed_dict = feeder.feed([[x[0]] for x in data])
-        feed_dict['init_ids'] = init_ids
-        feed_dict['init_scores'] = init_scores
-
-        result_ids, result_scores = exe.run(
-            framework.default_main_program(),
-            feed=feed_dict,
-            fetch_list=[translation_ids, translation_scores],
-            return_numpy=False)
-        print(result_ids.recursive_sequence_lengths())
-        break
-
-
-class TestMachineTranslation(unittest.TestCase):
-    pass
-
-
-@contextlib.contextmanager
-def scope_prog_guard():
-    prog = fluid.Program()
-    startup_prog = fluid.Program()
-    scope = fluid.core.Scope()
-    with fluid.scope_guard(scope):
-        with fluid.program_guard(prog, startup_prog):
-            yield
-
-
-def inject_test_train(use_cuda, is_sparse):
-    f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
-                                         if is_sparse else 'dense')
-
-    def f(*args):
-        with scope_prog_guard():
-            train_main(use_cuda, is_sparse)
-
-    setattr(TestMachineTranslation, f_name, f)
-
-
-def inject_test_decode(use_cuda, is_sparse, decorator=None):
-    f_name = 'test_{0}_{1}_decode'.format('cuda'
-                                          if use_cuda else 'cpu', 'sparse'
-                                          if is_sparse else 'dense')
-
-    def f(*args):
-        with scope_prog_guard():
-            decode_main(use_cuda, is_sparse)
-
-    if decorator is not None:
-        f = decorator(f)
-
-    setattr(TestMachineTranslation, f_name, f)
-
-
-for _use_cuda_ in (False, True):
-    for _is_sparse_ in (False, True):
-        inject_test_train(_use_cuda_, _is_sparse_)
-
-for _use_cuda_ in (False, True):
-    for _is_sparse_ in (False, True):
-
-        _decorator_ = None
-        if _use_cuda_:
-            _decorator_ = unittest.skip(
-                reason='Beam Search does not support CUDA!')
-
-        inject_test_decode(
-            is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
deleted file mode 100644
index 3b2c4af8ae510492052cd825077ef9dfd355b417..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ /dev/null
@@ -1,275 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid.core as core
-import math
-import os
-import sys
-import unittest
-
-import numpy
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.layers.device import get_places
-
-BATCH_SIZE = 64
-
-
-def loss_net(hidden, label):
-    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_loss = fluid.layers.mean(loss)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-    return prediction, avg_loss, acc
-
-
-def mlp(img, label):
-    hidden = fluid.layers.fc(input=img, size=200, act='tanh')
-    hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
-    return loss_net(hidden, label)
-
-
-def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    return loss_net(conv_pool_2, label)
-
-
-def train(nn_type,
-          use_cuda,
-          parallel,
-          save_dirname=None,
-          save_full_dirname=None,
-          model_filename=None,
-          params_filename=None,
-          is_local=True):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if nn_type == 'mlp':
-        net_conf = mlp
-    else:
-        net_conf = conv_net
-
-    if parallel:
-        raise NotImplementedError()
-    else:
-        prediction, avg_loss, acc = net_conf(img, label)
-
-    test_program = fluid.default_main_program().clone(for_test=True)
-
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-    optimizer.minimize(avg_loss)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    exe = fluid.Executor(place)
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.mnist.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
-    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
-
-    def train_loop(main_program):
-        exe.run(fluid.default_startup_program())
-
-        PASS_NUM = 100
-        for pass_id in range(PASS_NUM):
-            for batch_id, data in enumerate(train_reader()):
-                # train a mini-batch, fetch nothing
-                exe.run(main_program, feed=feeder.feed(data))
-                if (batch_id + 1) % 10 == 0:
-                    acc_set = []
-                    avg_loss_set = []
-                    for test_data in test_reader():
-                        acc_np, avg_loss_np = exe.run(
-                            program=test_program,
-                            feed=feeder.feed(test_data),
-                            fetch_list=[acc, avg_loss])
-                        acc_set.append(float(acc_np))
-                        avg_loss_set.append(float(avg_loss_np))
-                    # get test acc and loss
-                    acc_val = numpy.array(acc_set).mean()
-                    avg_loss_val = numpy.array(avg_loss_set).mean()
-                    if float(acc_val
-                             ) > 0.2:  # Smaller value to increase CI speed
-                        if save_dirname is not None:
-                            fluid.io.save_inference_model(
-                                save_dirname, ["img"], [prediction],
-                                exe,
-                                model_filename=model_filename,
-                                params_filename=params_filename)
-                        if save_full_dirname is not None:
-                            fluid.io.save_inference_model(
-                                save_full_dirname, [], [],
-                                exe,
-                                model_filename=model_filename,
-                                params_filename=params_filename,
-                                export_for_deployment=False)
-                        return
-                    else:
-                        print(
-                            'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
-                            format(pass_id, batch_id + 1,
-                                   float(avg_loss_val), float(acc_val)))
-                        if math.isnan(float(avg_loss_val)):
-                            sys.exit("got NaN loss, training failed.")
-        raise AssertionError("Loss of recognize digits is too large")
-
-    if is_local:
-        train_loop(fluid.default_main_program())
-    else:
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def infer(use_cuda,
-          save_dirname=None,
-          model_filename=None,
-          params_filename=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             save_dirname, exe, model_filename, params_filename)
-
-        # The input's dimension of conv should be 4-D or 5-D.
-        # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0].
-        batch_size = 1
-        tensor_img = numpy.random.uniform(
-            -1.0, 1.0, [batch_size, 1, 28, 28]).astype("float32")
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_img},
-                          fetch_list=fetch_targets)
-        print("infer results: ", results[0])
-
-
-def main(use_cuda, parallel, nn_type, combine):
-    save_dirname = None
-    save_full_dirname = None
-    model_filename = None
-    params_filename = None
-    if not use_cuda and not parallel:
-        save_dirname = "recognize_digits_" + nn_type + ".inference.model"
-        save_full_dirname = "recognize_digits_" + nn_type + ".train.model"
-        if combine == True:
-            model_filename = "__model_combined__"
-            params_filename = "__params_combined__"
-
-    # call train() with is_local argument to run distributed train
-    train(
-        nn_type=nn_type,
-        use_cuda=use_cuda,
-        parallel=parallel,
-        save_dirname=save_dirname,
-        save_full_dirname=save_full_dirname,
-        model_filename=model_filename,
-        params_filename=params_filename)
-    infer(
-        use_cuda=use_cuda,
-        save_dirname=save_dirname,
-        model_filename=model_filename,
-        params_filename=params_filename)
-
-
-class TestRecognizeDigits(unittest.TestCase):
-    pass
-
-
-def inject_test_method(use_cuda, parallel, nn_type, combine):
-    def __impl__(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                main(use_cuda, parallel, nn_type, combine)
-
-    fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda'
-                                       if use_cuda else 'cpu', 'parallel'
-                                       if parallel else 'normal', 'combine'
-                                       if combine else 'separate')
-
-    setattr(TestRecognizeDigits, fn, __impl__)
-
-
-def inject_all_tests():
-    for use_cuda in (False, True):
-        if use_cuda and not core.is_compiled_with_cuda():
-            continue
-        for parallel in (False, ):
-            for nn_type in ('mlp', 'conv'):
-                inject_test_method(use_cuda, parallel, nn_type, True)
-
-    # Two unit-test for saving parameters as separate files
-    inject_test_method(False, False, 'mlp', False)
-    inject_test_method(False, False, 'conv', False)
-
-
-inject_all_tests()
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
deleted file mode 100644
index 0e1efc8212ec2913ca3653c47bd2d9e298a772ee..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ /dev/null
@@ -1,328 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import math
-import sys
-import os
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.framework as framework
-import paddle.fluid.layers as layers
-import paddle.fluid.nets as nets
-from paddle.fluid.executor import Executor
-from paddle.fluid.optimizer import SGDOptimizer
-
-IS_SPARSE = True
-USE_GPU = False
-BATCH_SIZE = 256
-
-
-def get_usr_combined_features():
-    # FIXME(dzh) : old API integer_value(10) may has range check.
-    # currently we don't have user configurated check.
-
-    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
-
-    uid = layers.data(name='user_id', shape=[1], dtype='int64')
-
-    usr_emb = layers.embedding(
-        input=uid,
-        dtype='float32',
-        size=[USR_DICT_SIZE, 32],
-        param_attr='user_table',
-        is_sparse=IS_SPARSE)
-
-    usr_fc = layers.fc(input=usr_emb, size=32)
-
-    USR_GENDER_DICT_SIZE = 2
-
-    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
-
-    usr_gender_emb = layers.embedding(
-        input=usr_gender_id,
-        size=[USR_GENDER_DICT_SIZE, 16],
-        param_attr='gender_table',
-        is_sparse=IS_SPARSE)
-
-    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
-
-    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
-    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
-
-    usr_age_emb = layers.embedding(
-        input=usr_age_id,
-        size=[USR_AGE_DICT_SIZE, 16],
-        is_sparse=IS_SPARSE,
-        param_attr='age_table')
-
-    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
-
-    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
-    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
-
-    usr_job_emb = layers.embedding(
-        input=usr_job_id,
-        size=[USR_JOB_DICT_SIZE, 16],
-        param_attr='job_table',
-        is_sparse=IS_SPARSE)
-
-    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
-
-    concat_embed = layers.concat(
-        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
-
-    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
-
-    return usr_combined_features
-
-
-def get_mov_combined_features():
-
-    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
-
-    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
-
-    mov_emb = layers.embedding(
-        input=mov_id,
-        dtype='float32',
-        size=[MOV_DICT_SIZE, 32],
-        param_attr='movie_table',
-        is_sparse=IS_SPARSE)
-
-    mov_fc = layers.fc(input=mov_emb, size=32)
-
-    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
-
-    category_id = layers.data(
-        name='category_id', shape=[1], dtype='int64', lod_level=1)
-
-    mov_categories_emb = layers.embedding(
-        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
-
-    mov_categories_hidden = layers.sequence_pool(
-        input=mov_categories_emb, pool_type="sum")
-
-    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
-
-    mov_title_id = layers.data(
-        name='movie_title', shape=[1], dtype='int64', lod_level=1)
-
-    mov_title_emb = layers.embedding(
-        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
-
-    mov_title_conv = nets.sequence_conv_pool(
-        input=mov_title_emb,
-        num_filters=32,
-        filter_size=3,
-        act="tanh",
-        pool_type="sum")
-
-    concat_embed = layers.concat(
-        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
-
-    # FIXME(dzh) : need tanh operator
-    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
-
-    return mov_combined_features
-
-
-def model():
-    usr_combined_features = get_usr_combined_features()
-    mov_combined_features = get_mov_combined_features()
-
-    # need cos sim
-    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
-    scale_infer = layers.scale(x=inference, scale=5.0)
-
-    label = layers.data(name='score', shape=[1], dtype='float32')
-    square_cost = layers.square_error_cost(input=scale_infer, label=label)
-    avg_cost = layers.mean(square_cost)
-
-    return scale_infer, avg_cost
-
-
-def train(use_cuda, save_dirname, is_local=True):
-    scale_infer, avg_cost = model()
-
-    # test program
-    test_program = fluid.default_main_program().clone(for_test=True)
-
-    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
-    sgd_optimizer.minimize(avg_cost)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    exe = Executor(place)
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.movielens.train(), buf_size=8192),
-        batch_size=BATCH_SIZE)
-    test_reader = paddle.batch(
-        paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
-
-    feed_order = [
-        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
-        'movie_title', 'score'
-    ]
-
-    def train_loop(main_program):
-        exe.run(framework.default_startup_program())
-
-        feed_list = [
-            main_program.global_block().var(var_name) for var_name in feed_order
-        ]
-        feeder = fluid.DataFeeder(feed_list, place)
-
-        PASS_NUM = 100
-        for pass_id in range(PASS_NUM):
-            for batch_id, data in enumerate(train_reader()):
-                # train a mini-batch
-                outs = exe.run(program=main_program,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_cost])
-                out = np.array(outs[0])
-                if (batch_id + 1) % 10 == 0:
-                    avg_cost_set = []
-                    for test_data in test_reader():
-                        avg_cost_np = exe.run(program=test_program,
-                                              feed=feeder.feed(test_data),
-                                              fetch_list=[avg_cost])
-                        avg_cost_set.append(avg_cost_np[0])
-                        break  # test only 1 segment for speeding up CI
-
-                    # get test avg_cost
-                    test_avg_cost = np.array(avg_cost_set).mean()
-                    if test_avg_cost < 6.0:
-                        # if avg_cost less than 6.0, we think our code is good.
-                        if save_dirname is not None:
-                            fluid.io.save_inference_model(save_dirname, [
-                                "user_id", "gender_id", "age_id", "job_id",
-                                "movie_id", "category_id", "movie_title"
-                            ], [scale_infer], exe)
-                        return
-
-                if math.isnan(float(out[0])):
-                    sys.exit("got NaN loss, training failed.")
-
-    if is_local:
-        train_loop(fluid.default_main_program())
-    else:
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def infer(use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        # Use the first data from paddle.dataset.movielens.test() as input
-        assert feed_target_names[0] == "user_id"
-        # Use create_lod_tensor(data, recursive_sequence_lengths, place) API
-        # to generate LoD Tensor where `data` is a list of sequences of index
-        # numbers, `recursive_sequence_lengths` is the length-based level of detail
-        # (lod) info associated with `data`.
-        # For example, data = [[10, 2, 3], [2, 3]] means that it contains
-        # two sequences of indexes, of length 3 and 2, respectively.
-        # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
-        # level of detail info, indicating that `data` consists of two sequences
-        # of length 3 and 2, respectively.
-        user_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
-
-        assert feed_target_names[1] == "gender_id"
-        gender_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
-
-        assert feed_target_names[2] == "age_id"
-        age_id = fluid.create_lod_tensor([[np.int64(0)]], [[1]], place)
-
-        assert feed_target_names[3] == "job_id"
-        job_id = fluid.create_lod_tensor([[np.int64(10)]], [[1]], place)
-
-        assert feed_target_names[4] == "movie_id"
-        movie_id = fluid.create_lod_tensor([[np.int64(783)]], [[1]], place)
-
-        assert feed_target_names[5] == "category_id"
-        category_id = fluid.create_lod_tensor(
-            [np.array(
-                [10, 8, 9], dtype='int64')], [[3]], place)
-
-        assert feed_target_names[6] == "movie_title"
-        movie_title = fluid.create_lod_tensor(
-            [np.array(
-                [1069, 4140, 2923, 710, 988], dtype='int64')], [[5]],
-            place)
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        results = exe.run(inference_program,
-                          feed={
-                              feed_target_names[0]: user_id,
-                              feed_target_names[1]: gender_id,
-                              feed_target_names[2]: age_id,
-                              feed_target_names[3]: job_id,
-                              feed_target_names[4]: movie_id,
-                              feed_target_names[5]: category_id,
-                              feed_target_names[6]: movie_title
-                          },
-                          fetch_list=fetch_targets,
-                          return_numpy=False)
-        print("inferred score: ", np.array(results[0]))
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    # Directory for saving the inference model
-    save_dirname = "recommender_system.inference.model"
-
-    train(use_cuda, save_dirname)
-    infer(use_cuda, save_dirname)
-
-
-if __name__ == '__main__':
-    main(USE_GPU)
diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
deleted file mode 100644
index 91c8705aa4c88dbfeea45e15c368459ba5b5ac1f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
+++ /dev/null
@@ -1,283 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-import paddle.fluid.layers as layers
-import contextlib
-import math
-import sys
-import unittest
-from paddle.fluid.executor import Executor
-
-dict_size = 30000
-source_dict_dim = target_dict_dim = dict_size
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-hidden_dim = 32
-embedding_dim = 16
-batch_size = 10
-max_length = 50
-topk_size = 50
-encoder_size = decoder_size = hidden_dim
-IS_SPARSE = True
-USE_PEEPHOLES = False
-
-
-def bi_lstm_encoder(input_seq, hidden_size):
-    input_forward_proj = fluid.layers.fc(input=input_seq,
-                                         size=hidden_size * 4,
-                                         bias_attr=True)
-    forward, _ = fluid.layers.dynamic_lstm(
-        input=input_forward_proj,
-        size=hidden_size * 4,
-        use_peepholes=USE_PEEPHOLES)
-    input_backward_proj = fluid.layers.fc(input=input_seq,
-                                          size=hidden_size * 4,
-                                          bias_attr=True)
-    backward, _ = fluid.layers.dynamic_lstm(
-        input=input_backward_proj,
-        size=hidden_size * 4,
-        is_reverse=True,
-        use_peepholes=USE_PEEPHOLES)
-
-    forward_last = fluid.layers.sequence_last_step(input=forward)
-    backward_first = fluid.layers.sequence_first_step(input=backward)
-
-    return forward_last, backward_first
-
-
-# FIXME(peterzhang2029): Replace this function with the lstm_unit_op.
-def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
-    def linear(inputs):
-        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
-
-    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
-    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
-    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
-    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
-
-    cell_t = fluid.layers.sums(input=[
-        fluid.layers.elementwise_mul(
-            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
-                x=input_gate, y=cell_tilde)
-    ])
-
-    hidden_t = fluid.layers.elementwise_mul(
-        x=output_gate, y=fluid.layers.tanh(x=cell_t))
-
-    return hidden_t, cell_t
-
-
-def lstm_decoder_without_attention(target_embedding, decoder_boot, context,
-                                   decoder_size):
-    rnn = fluid.layers.DynamicRNN()
-
-    cell_init = fluid.layers.fill_constant_batch_size_like(
-        input=decoder_boot,
-        value=0.0,
-        shape=[-1, decoder_size],
-        dtype='float32')
-    cell_init.stop_gradient = False
-
-    with rnn.block():
-        current_word = rnn.step_input(target_embedding)
-        context = rnn.static_input(context)
-
-        hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
-        cell_mem = rnn.memory(init=cell_init)
-        decoder_inputs = fluid.layers.concat(
-            input=[context, current_word], axis=1)
-        h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
-        rnn.update_memory(hidden_mem, h)
-        rnn.update_memory(cell_mem, c)
-        out = fluid.layers.fc(input=h,
-                              size=target_dict_dim,
-                              bias_attr=True,
-                              act='softmax')
-        rnn.output(out)
-    return rnn()
-
-
-def seq_to_seq_net():
-    """Construct a seq2seq network."""
-
-    src_word_idx = fluid.layers.data(
-        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
-
-    src_embedding = fluid.layers.embedding(
-        input=src_word_idx,
-        size=[source_dict_dim, embedding_dim],
-        dtype='float32')
-
-    src_forward_last, src_backward_first = bi_lstm_encoder(
-        input_seq=src_embedding, hidden_size=encoder_size)
-
-    encoded_vector = fluid.layers.concat(
-        input=[src_forward_last, src_backward_first], axis=1)
-
-    decoder_boot = fluid.layers.fc(input=src_backward_first,
-                                   size=decoder_size,
-                                   bias_attr=False,
-                                   act='tanh')
-
-    trg_word_idx = fluid.layers.data(
-        name='target_sequence', shape=[1], dtype='int64', lod_level=1)
-
-    trg_embedding = fluid.layers.embedding(
-        input=trg_word_idx,
-        size=[target_dict_dim, embedding_dim],
-        dtype='float32')
-
-    prediction = lstm_decoder_without_attention(trg_embedding, decoder_boot,
-                                                encoded_vector, decoder_size)
-    label = fluid.layers.data(
-        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-
-    return avg_cost, prediction
-
-
-def train(use_cuda, save_dirname=None):
-    [avg_cost, prediction] = seq_to_seq_net()
-
-    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
-    optimizer.minimize(avg_cost)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = Executor(place)
-    exe.run(framework.default_startup_program())
-
-    feed_order = ['source_sequence', 'target_sequence', 'label_sequence']
-    feed_list = [
-        framework.default_main_program().global_block().var(var_name)
-        for var_name in feed_order
-    ]
-    feeder = fluid.DataFeeder(feed_list, place)
-
-    batch_id = 0
-    for pass_id in range(2):
-        for data in train_data():
-            outs = exe.run(framework.default_main_program(),
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_cost])
-
-            avg_cost_val = np.array(outs[0])
-            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
-                  " avg_cost=" + str(avg_cost_val))
-            if math.isnan(float(avg_cost_val[0])):
-                sys.exit("got NaN loss, training failed.")
-            if batch_id > 3:
-                if save_dirname is not None:
-                    fluid.io.save_inference_model(
-                        save_dirname, ['source_sequence',
-                                       'target_sequence'], [prediction], exe)
-                return
-
-            batch_id += 1
-
-
-def infer(use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        # Setup input by creating LoDTensor to represent sequence of words.
-        # Here each word is the basic element of the LoDTensor and the shape of
-        # each word (base_shape) should be [1] since it is simply an index to
-        # look up for the corresponding word vector.
-        # Suppose the recursive_sequence_lengths info is set to [[4, 6]],
-        # which has only one level of detail. Then the created LoDTensor will have only
-        # one higher level structure (sequence of words, or sentence) than the basic
-        # element (word). Hence the LoDTensor will hold data for two sentences of
-        # length 4 and 6, respectively.
-        # Note that recursive_sequence_lengths should be a list of lists.
-        recursive_seq_lens = [[4, 6]]
-        base_shape = [1]
-        # The range of random integers is [low, high]
-        word_data = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=1)
-        trg_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=1)
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        assert feed_target_names[0] == 'source_sequence'
-        assert feed_target_names[1] == 'target_sequence'
-        results = exe.run(inference_program,
-                          feed={
-                              feed_target_names[0]: word_data,
-                              feed_target_names[1]: trg_word,
-                          },
-                          fetch_list=fetch_targets,
-                          return_numpy=False)
-        print(results[0].recursive_sequence_lengths())
-        np_data = np.array(results[0])
-        print("Inference shape: ", np_data.shape)
-        print("Inference results: ", np_data)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    # Directory for saving the trained model
-    save_dirname = "rnn_encoder_decoder.inference.model"
-
-    train(use_cuda, save_dirname)
-    infer(use_cuda, save_dirname)
-
-
-class TestRnnEncoderDecoder(unittest.TestCase):
-    def test_cuda(self):
-        with self.scope_prog_guard():
-            main(use_cuda=True)
-
-    def test_cpu(self):
-        with self.scope_prog_guard():
-            main(use_cuda=False)
-
-    @contextlib.contextmanager
-    def scope_prog_guard(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
deleted file mode 100644
index cfa6b1a74006c8b0f9792eaa302f1d11a0dab4ee..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ /dev/null
@@ -1,280 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.layers.device import get_places
-import unittest
-import os
-import numpy as np
-import math
-import sys
-
-
-def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
-    PASS_NUM = 100
-    EMBED_SIZE = 32
-    HIDDEN_SIZE = 256
-    N = 5
-    BATCH_SIZE = 32
-    IS_SPARSE = is_sparse
-
-    def __network__(words):
-        embed_first = fluid.layers.embedding(
-            input=words[0],
-            size=[dict_size, EMBED_SIZE],
-            dtype='float32',
-            is_sparse=IS_SPARSE,
-            param_attr='shared_w')
-        embed_second = fluid.layers.embedding(
-            input=words[1],
-            size=[dict_size, EMBED_SIZE],
-            dtype='float32',
-            is_sparse=IS_SPARSE,
-            param_attr='shared_w')
-        embed_third = fluid.layers.embedding(
-            input=words[2],
-            size=[dict_size, EMBED_SIZE],
-            dtype='float32',
-            is_sparse=IS_SPARSE,
-            param_attr='shared_w')
-        embed_forth = fluid.layers.embedding(
-            input=words[3],
-            size=[dict_size, EMBED_SIZE],
-            dtype='float32',
-            is_sparse=IS_SPARSE,
-            param_attr='shared_w')
-
-        concat_embed = fluid.layers.concat(
-            input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
-        hidden1 = fluid.layers.fc(input=concat_embed,
-                                  size=HIDDEN_SIZE,
-                                  act='sigmoid')
-        predict_word = fluid.layers.fc(input=hidden1,
-                                       size=dict_size,
-                                       act='softmax')
-        cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
-        avg_cost = fluid.layers.mean(cost)
-        return avg_cost, predict_word
-
-    word_dict = paddle.dataset.imikolov.build_dict()
-    dict_size = len(word_dict)
-
-    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
-    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
-    forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
-    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
-
-    if not is_parallel:
-        avg_cost, predict_word = __network__(
-            [first_word, second_word, third_word, forth_word, next_word])
-    else:
-        raise NotImplementedError()
-
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-
-    train_reader = paddle.batch(
-        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(
-        feed_list=[first_word, second_word, third_word, forth_word, next_word],
-        place=place)
-
-    def train_loop(main_program):
-        exe.run(fluid.default_startup_program())
-
-        for pass_id in range(PASS_NUM):
-            for data in train_reader():
-                avg_cost_np = exe.run(main_program,
-                                      feed=feeder.feed(data),
-                                      fetch_list=[avg_cost])
-                if avg_cost_np[0] < 5.0:
-                    if save_dirname is not None:
-                        fluid.io.save_inference_model(save_dirname, [
-                            'firstw', 'secondw', 'thirdw', 'forthw'
-                        ], [predict_word], exe)
-                    return
-                if math.isnan(float(avg_cost_np[0])):
-                    sys.exit("got NaN loss, training failed.")
-
-        raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
-
-    if is_local:
-        train_loop(fluid.default_main_program())
-    else:
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def infer(use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        word_dict = paddle.dataset.imikolov.build_dict()
-        dict_size = len(word_dict)
-
-        # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
-        # is simply an index to look up for the corresponding word vector and hence
-        # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths,
-        # which is length-based level of detail (lod) of each LoDTensor, should be [[1]]
-        # meaning there is only one level of detail and there is only one sequence of
-        # one word on this level.
-        # Note that recursive_sequence_lengths should be a list of lists.
-        recursive_seq_lens = [[1]]
-        base_shape = [1]
-        # The range of random integers is [low, high]
-        first_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-        second_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-        third_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-        fourth_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-
-        assert feed_target_names[0] == 'firstw'
-        assert feed_target_names[1] == 'secondw'
-        assert feed_target_names[2] == 'thirdw'
-        assert feed_target_names[3] == 'forthw'
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        results = exe.run(inference_program,
-                          feed={
-                              feed_target_names[0]: first_word,
-                              feed_target_names[1]: second_word,
-                              feed_target_names[2]: third_word,
-                              feed_target_names[3]: fourth_word
-                          },
-                          fetch_list=fetch_targets,
-                          return_numpy=False)
-
-        def to_infer_tensor(lod_tensor):
-            infer_tensor = fluid.core.PaddleTensor()
-            infer_tensor.lod = lod_tensor.lod()
-            infer_tensor.data = fluid.core.PaddleBuf(np.array(lod_tensor))
-            infer_tensor.shape = lod_tensor.shape()
-            infer_tensor.dtype = fluid.core.PaddleDType.INT64
-            return infer_tensor
-
-        infer_inputs = [first_word, second_word, third_word, fourth_word]
-        infer_inputs = [to_infer_tensor(t) for t in infer_inputs]
-
-        infer_config = fluid.core.NativeConfig()
-        infer_config.model_dir = 'word2vec.inference.model'
-        infer_config.use_gpu = use_cuda
-        if use_cuda:
-            infer_config.device = 0
-            infer_config.fraction_of_gpu_memory = 0.15
-        compiled_program = fluid.compiler.CompiledProgram(inference_program)
-        compiled_program._with_inference_optimize(infer_config)
-        assert compiled_program._is_inference is True
-        infer_outputs = exe.run(compiled_program, feed=infer_inputs)
-        np_data = np.array(results[0])
-        infer_out = infer_outputs[0].data.float_data()
-        for a, b in zip(np_data[0], infer_out):
-            assert np.isclose(a, b), "a: {}, b: {}".format(a, b)
-
-
-def main(use_cuda, is_sparse, is_parallel):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    if not is_parallel:
-        save_dirname = "word2vec.inference.model"
-    else:
-        save_dirname = None
-
-    train(use_cuda, is_sparse, is_parallel, save_dirname)
-    infer(use_cuda, save_dirname)
-
-
-FULL_TEST = os.getenv('FULL_TEST',
-                      '0').lower() in ['true', '1', 't', 'y', 'yes', 'on']
-SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster"
-
-
-class W2VTest(unittest.TestCase):
-    pass
-
-
-def inject_test_method(use_cuda, is_sparse, is_parallel):
-    fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse"
-                                        if is_sparse else "dense", "parallel"
-                                        if is_parallel else "normal")
-
-    def __impl__(*args, **kwargs):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                main(
-                    use_cuda=use_cuda,
-                    is_sparse=is_sparse,
-                    is_parallel=is_parallel)
-
-    if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse:
-        fn = __impl__
-    else:
-        # skip the other test when on CI server
-        fn = unittest.skipUnless(
-            condition=FULL_TEST, reason=SKIP_REASON)(__impl__)
-
-    setattr(W2VTest, fn_name, fn)
-
-
-for use_cuda in (False, True):
-    for is_sparse in (False, True):
-        for is_parallel in (False, ):
-            inject_test_method(use_cuda, is_sparse, is_parallel)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/demo/executor_train_dataset.py b/python/paddle/fluid/tests/demo/executor_train_dataset.py
deleted file mode 100644
index 6938982de725c296aae29e70d0640749d0876353..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/demo/executor_train_dataset.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tarfile
-import paddle.fluid as fluid
-import paddle
-from paddle.fluid import core
-
-URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
-MD5 = '2a405a31508969b3ab823f42c0f522ca'
-
-
-def bow_net(data,
-            label,
-            dict_dim=89528,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2):
-    """
-    BOW net
-    This model is from https://github.com/PaddlePaddle/models:
-    models/fluid/PaddleNLP/text_classification/nets.py
-    """
-    # embedding
-    emb = fluid.layers.embedding(
-        input=data, size=[dict_dim, emb_dim], is_sparse=True)
-    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
-    bowh = fluid.layers.tanh(bow)
-    # fc layer after conv
-    fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh")
-    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
-    # probability of each class
-    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
-    # cross entropy loss
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    # mean loss
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, acc, prediction
-
-
-def train():
-    # Download data
-    with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf:
-        tarf.extractall(path='./')
-        tarf.close()
-
-    # Initialize dataset description
-    dataset = fluid.DatasetFactory().create_dataset()
-    dataset.set_batch_size(128)  # See API doc for how to change other fields
-
-    # define network
-    # input text data
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    # label data
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    dataset.set_use_var([data, label])
-    avg_cost, acc, prediction = bow_net(data, label)
-    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
-    opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)
-
-    # Run startup program
-    startup_program = fluid.default_startup_program()
-    place = fluid.CPUPlace()
-    executor = fluid.Executor(place)
-    executor.run(startup_program)
-
-    main_program = fluid.default_main_program()
-    epochs = 10
-    filelist = ["train_data/part-%d" % i for i in range(12)]
-    dataset.set_filelist(filelist)
-    for i in range(epochs):
-        dataset.set_thread(4)
-        executor.train_from_dataset(
-            main_program,  # This can be changed during iteration
-            dataset,  # This can be changed during iteration
-            debug=False)
-        fluid.io.save_inference_model('imdb/epoch%d.model' % i,
-                                      [data.name, label.name], [acc], executor)
-
-
-if __name__ == "__main__":
-    train()
diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py
deleted file mode 100644
index bd77779ce6ab5cf19e3e5ace3e51e39734b27c10..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/demo/fc_gan.py
+++ /dev/null
@@ -1,173 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import errno
-import math
-import os
-
-import matplotlib
-import numpy
-
-import paddle
-import paddle.fluid as fluid
-
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-
-NOISE_SIZE = 100
-NUM_PASS = 1000
-NUM_REAL_IMGS_IN_BATCH = 121
-NUM_TRAIN_TIMES_OF_DG = 3
-LEARNING_RATE = 2e-5
-
-
-def D(x):
-    hidden = fluid.layers.fc(input=x,
-                             size=200,
-                             act='relu',
-                             param_attr='D.w1',
-                             bias_attr='D.b1')
-    logits = fluid.layers.fc(input=hidden,
-                             size=1,
-                             act=None,
-                             param_attr='D.w2',
-                             bias_attr='D.b2')
-    return logits
-
-
-def G(x):
-    hidden = fluid.layers.fc(input=x,
-                             size=200,
-                             act='relu',
-                             param_attr='G.w1',
-                             bias_attr='G.b1')
-    img = fluid.layers.fc(input=hidden,
-                          size=28 * 28,
-                          act='tanh',
-                          param_attr='G.w2',
-                          bias_attr='G.b2')
-    return img
-
-
-def plot(gen_data):
-    gen_data.resize(gen_data.shape[0], 28, 28)
-    n = int(math.ceil(math.sqrt(gen_data.shape[0])))
-    fig = plt.figure(figsize=(n, n))
-    gs = gridspec.GridSpec(n, n)
-    gs.update(wspace=0.05, hspace=0.05)
-
-    for i, sample in enumerate(gen_data):
-        ax = plt.subplot(gs[i])
-        plt.axis('off')
-        ax.set_xticklabels([])
-        ax.set_yticklabels([])
-        ax.set_aspect('equal')
-        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
-
-    return fig
-
-
-def main():
-    try:
-        os.makedirs("./out")
-    except OSError as e:
-        if e.errno != errno.EEXIST:
-            raise
-
-    startup_program = fluid.Program()
-    d_program = fluid.Program()
-    dg_program = fluid.Program()
-
-    with fluid.program_guard(d_program, startup_program):
-        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
-        d_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-            x=D(img),
-            label=fluid.layers.data(
-                name='label', shape=[1], dtype='float32'))
-        d_loss = fluid.layers.mean(d_loss)
-
-    with fluid.program_guard(dg_program, startup_program):
-        noise = fluid.layers.data(
-            name='noise', shape=[NOISE_SIZE], dtype='float32')
-        g_img = G(x=noise)
-        g_program = dg_program.clone()
-        dg_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-            x=D(g_img),
-            label=fluid.layers.fill_constant_batch_size_like(
-                input=noise, dtype='float32', shape=[-1, 1], value=1.0))
-        dg_loss = fluid.layers.mean(dg_loss)
-
-    opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE)
-
-    opt.minimize(loss=d_loss, startup_program=startup_program)
-    opt.minimize(
-        loss=dg_loss,
-        startup_program=startup_program,
-        parameter_list=[
-            p.name for p in g_program.global_block().all_parameters()
-        ])
-    exe = fluid.Executor(fluid.CPUPlace())
-    exe.run(startup_program)
-
-    num_true = NUM_REAL_IMGS_IN_BATCH
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.mnist.train(), buf_size=60000),
-        batch_size=num_true)
-
-    for pass_id in range(NUM_PASS):
-        for batch_id, data in enumerate(train_reader()):
-            num_true = len(data)
-            n = numpy.random.uniform(
-                low=-1.0, high=1.0,
-                size=[num_true * NOISE_SIZE]).astype('float32').reshape(
-                    [num_true, NOISE_SIZE])
-            generated_img = exe.run(g_program,
-                                    feed={'noise': n},
-                                    fetch_list={g_img})[0]
-            real_data = numpy.array([x[0] for x in data]).astype('float32')
-            real_data = real_data.reshape(num_true, 784)
-            total_data = numpy.concatenate([real_data, generated_img])
-            total_label = numpy.concatenate([
-                numpy.ones(
-                    shape=[real_data.shape[0], 1], dtype='float32'),
-                numpy.zeros(
-                    shape=[real_data.shape[0], 1], dtype='float32')
-            ])
-            d_loss_np = exe.run(d_program,
-                                feed={'img': total_data,
-                                      'label': total_label},
-                                fetch_list={d_loss})[0]
-            for _ in range(NUM_TRAIN_TIMES_OF_DG):
-                n = numpy.random.uniform(
-                    low=-1.0, high=1.0,
-                    size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape(
-                        [2 * num_true, NOISE_SIZE, 1, 1])
-                dg_loss_np = exe.run(dg_program,
-                                     feed={'noise': n},
-                                     fetch_list={dg_loss})[0]
-            print("Pass ID={0}, Batch ID={1}, D-Loss={2}, DG-Loss={3}".format(
-                pass_id, batch_id, d_loss_np, dg_loss_np))
-        # generate image each batch
-        fig = plot(generated_img)
-        plt.savefig(
-            'out/{0}.png'.format(str(pass_id).zfill(3)), bbox_inches='tight')
-        plt.close(fig)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/paddle/fluid/tests/demo/pipeline_train.py b/python/paddle/fluid/tests/demo/pipeline_train.py
deleted file mode 100644
index 54fa719e29d3ed0cd6ec6fefd388f9ce1f3604c2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/demo/pipeline_train.py
+++ /dev/null
@@ -1,508 +0,0 @@
-#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import numpy as np
-import copy
-import pickle
-import os
-from functools import partial
-import logging
-import time
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import argparse
-import random
-import sys
-import math
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("fluid")
-logger.setLevel(logging.INFO)
-
-batch_size = 100
-ncards = 4
-nreaders = 4
-nscopes = 30
-learning_rate = 0.1
-is_profile = False
-sync_steps = 1
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("gnn")
-    parser.add_argument(
-        '--train_path',
-        type=str,
-        default='./data/diginetica/train.txt',
-        help='dir of training data')
-    parser.add_argument(
-        '--config_path',
-        type=str,
-        default='./data/diginetica/config.txt',
-        help='dir of config')
-    parser.add_argument(
-        '--model_path',
-        type=str,
-        default='./saved_model',
-        help="path of model parameters")
-    parser.add_argument(
-        '--epoch_num',
-        type=int,
-        default=30,
-        help='number of epochs to train for')
-    parser.add_argument(
-        '--batch_size', type=int, default=100, help='input batch size')
-    parser.add_argument(
-        '--hidden_size', type=int, default=100, help='hidden state size')
-    parser.add_argument('--l2', type=float, default=1e-5, help='l2 penalty')
-    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
-    parser.add_argument(
-        '--emb_lr_rate', type=float, default=0.5, help='learning rate')
-    parser.add_argument(
-        '--step', type=int, default=1, help='gnn propogation steps')
-    parser.add_argument(
-        '--lr_dc', type=float, default=0.1, help='learning rate decay rate')
-    parser.add_argument(
-        '--lr_dc_step',
-        type=int,
-        default=3,
-        help='the number of steps after which the learning rate decay')
-    parser.add_argument(
-        '--use_cuda', type=int, default=0, help='whether to use gpu')
-    parser.add_argument(
-        '--use_parallel',
-        type=int,
-        default=1,
-        help='whether to use parallel executor')
-    return parser.parse_args()
-
-
-def network(batch_size, items_num, hidden_size, step, rate):
-    stdv = 1.0 / math.sqrt(hidden_size)
-
-    items = layers.data(
-        name="items",
-        shape=[batch_size, -1, 1],
-        dtype="int64",
-        append_batch_size=False)  #[bs, uniq_max, 1]
-    seq_index = layers.data(
-        name="seq_index",
-        shape=[batch_size, -1],
-        dtype="int64",
-        append_batch_size=False)  #[-1(seq_max)*batch_size, 1]
-    last_index = layers.data(
-        name="last_index",
-        shape=[batch_size],
-        dtype="int64",
-        append_batch_size=False)  #[batch_size, 1]
-    adj_in = layers.data(
-        name="adj_in",
-        shape=[batch_size, -1, -1],
-        dtype="float32",
-        append_batch_size=False)
-    adj_out = layers.data(
-        name="adj_out",
-        shape=[batch_size, -1, -1],
-        dtype="float32",
-        append_batch_size=False)
-    mask = layers.data(
-        name="mask",
-        shape=[batch_size, -1, 1],
-        dtype="float32",
-        append_batch_size=False)
-    label = layers.data(
-        name="label",
-        shape=[batch_size, 1],
-        dtype="int64",
-        append_batch_size=False)
-
-    items_emb = layers.embedding(
-        input=items,
-        is_sparse=True,
-        param_attr=fluid.ParamAttr(
-            name="emb",
-            learning_rate=rate,
-            initializer=fluid.initializer.Uniform(
-                low=-stdv, high=stdv)),
-        size=[items_num, hidden_size])  #[batch_size, uniq_max, h]
-    data_feed = [items, seq_index, last_index, adj_in, adj_out, mask, label]
-
-    pre_state = items_emb
-    for i in range(step):
-        pre_state = layers.reshape(
-            x=pre_state, shape=[batch_size, -1, hidden_size])
-        state_in = layers.fc(
-            input=pre_state,
-            name="state_in",
-            size=hidden_size,
-            act=None,
-            num_flatten_dims=2,
-            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-stdv, high=stdv)),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]
-        state_out = layers.fc(
-            input=pre_state,
-            name="state_out",
-            size=hidden_size,
-            act=None,
-            num_flatten_dims=2,
-            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-stdv, high=stdv)),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]
-
-        state_adj_in = layers.matmul(adj_in,
-                                     state_in)  #[batch_size, uniq_max, h]
-        state_adj_out = layers.matmul(adj_out,
-                                      state_out)  #[batch_size, uniq_max, h]
-
-        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
-
-        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
-        gru_fc = layers.fc(input=gru_input,
-                           name="gru_fc",
-                           size=3 * hidden_size,
-                           bias_attr=False)
-        pre_state, _, _ = fluid.layers.gru_unit(
-            input=gru_fc,
-            hidden=layers.reshape(
-                x=pre_state, shape=[-1, hidden_size]),
-            size=3 * hidden_size)
-
-    final_state = pre_state
-    seq_index = layers.reshape(seq_index, shape=[-1])
-    seq = layers.gather(final_state, seq_index)  #[batch_size*-1(seq_max), h]
-    last = layers.gather(final_state, last_index)  #[batch_size, h]
-
-    seq = layers.reshape(
-        seq, shape=[batch_size, -1, hidden_size])  #[batch_size, -1(seq_max), h]
-    last = layers.reshape(
-        last, shape=[batch_size, hidden_size])  #[batch_size, h]
-
-    seq_fc = layers.fc(
-        input=seq,
-        name="seq_fc",
-        size=hidden_size,
-        bias_attr=False,
-        act=None,
-        num_flatten_dims=2,
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-            low=-stdv, high=stdv)))  #[batch_size, -1(seq_max), h]
-    last_fc = layers.fc(input=last,
-                        name="last_fc",
-                        size=hidden_size,
-                        bias_attr=False,
-                        act=None,
-                        num_flatten_dims=1,
-                        param_attr=fluid.ParamAttr(
-                            initializer=fluid.initializer.Uniform(
-                                low=-stdv, high=stdv)))  #[bathc_size, h]
-
-    seq_fc_t = layers.transpose(
-        seq_fc, perm=[1, 0, 2])  #[-1(seq_max), batch_size, h]
-    add = layers.elementwise_add(seq_fc_t,
-                                 last_fc)  #[-1(seq_max), batch_size, h]
-    b = layers.create_parameter(
-        shape=[hidden_size],
-        dtype='float32',
-        default_initializer=fluid.initializer.Constant(value=0.0))  #[h]
-    add = layers.elementwise_add(add, b)  #[-1(seq_max), batch_size, h]
-
-    add_sigmoid = layers.sigmoid(add)  #[-1(seq_max), batch_size, h] 
-    add_sigmoid = layers.transpose(
-        add_sigmoid, perm=[1, 0, 2])  #[batch_size, -1(seq_max), h]
-
-    weight = layers.fc(input=add_sigmoid,
-                       name="weight_fc",
-                       size=1,
-                       act=None,
-                       num_flatten_dims=2,
-                       bias_attr=False,
-                       param_attr=fluid.ParamAttr(
-                           initializer=fluid.initializer.Uniform(
-                               low=-stdv, high=stdv)))  #[batch_size, -1, 1]
-    weight *= mask
-    weight_mask = layers.elementwise_mul(seq, weight, axis=0)
-    global_attention = layers.reduce_sum(weight_mask, dim=1)
-
-    final_attention = layers.concat(
-        [global_attention, last], axis=1)  #[batch_size, 2*h]
-    final_attention_fc = layers.fc(
-        input=final_attention,
-        name="fina_attention_fc",
-        size=hidden_size,
-        bias_attr=False,
-        act=None,
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-            low=-stdv, high=stdv)))  #[batch_size, h]
-
-    all_vocab = layers.create_global_var(
-        shape=[items_num - 1, 1],
-        value=0,
-        dtype="int64",
-        persistable=True,
-        name="all_vocab")
-
-    all_emb = layers.embedding(
-        input=all_vocab,
-        is_sparse=True,
-        param_attr=fluid.ParamAttr(
-            name="emb",
-            learning_rate=rate,
-            initializer=fluid.initializer.Uniform(
-                low=-stdv, high=stdv)),
-        size=[items_num, hidden_size])  #[all_vocab, h]
-
-    logits = layers.matmul(
-        x=final_attention_fc, y=all_emb,
-        transpose_y=True)  #[batch_size, all_vocab]
-    softmax = layers.softmax_with_cross_entropy(
-        logits=logits, label=label)  #[batch_size, 1]
-    loss = layers.reduce_mean(softmax)  # [1]
-    #fluid.layers.Print(loss)
-    acc = layers.accuracy(input=logits, label=label, k=20)
-    return loss, acc, data_feed, [items_emb, all_emb]
-
-
-def train():
-    args = parse_args()
-    lr = args.lr
-    rate = args.emb_lr_rate
-    train_data_dir = "./gnn_data_new_8"
-    filelist = [
-        os.path.join(train_data_dir, f) for f in os.listdir(train_data_dir)
-        if os.path.isfile(os.path.join(train_data_dir, f))
-    ][:]
-
-    items_num = read_config(args.config_path)
-    loss, acc, data_vars, cut_list = network(batch_size, items_num,
-                                             args.hidden_size, args.step, rate)
-
-    print("card: %d, thread: %d, lr: %f, lr_rate: %f, scope: %d, sync_step: %d"
-          % (ncards, nreaders, lr, rate, nscopes, sync_steps))
-
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    step_per_epoch = 750000 // batch_size
-    """
-    opt = fluid.optimizer.SGD(
-	learning_rate=fluid.layers.exponential_decay(
-	    learning_rate=args.lr,
-	    decay_steps=step_per_epoch * 10,
-	    decay_rate=args.lr_dc),
-	regularization=fluid.regularizer.L2DecayRegularizer(regularization_coeff=args.l2))
-    """
-    opt = fluid.optimizer.SGD(lr)
-    opt = fluid.optimizer.PipelineOptimizer(
-        opt,
-        cut_list=[cut_list, [loss, acc]],
-        place_list=[fluid.CPUPlace(), fluid.CUDAPlace(0), fluid.CPUPlace()],
-        concurrency_list=[1, 1, nreaders],
-        queue_size=nscopes,
-        sync_steps=sync_steps)
-    opt.minimize(loss)
-
-    exe.run(fluid.default_startup_program())
-
-    all_vocab = fluid.global_scope().var("all_vocab").get_tensor()
-    all_vocab.set(
-        np.arange(1, items_num).astype("int64").reshape((-1, 1)), place)
-
-    logger.info("begin train")
-
-    dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
-    dataset.set_use_var(data_vars)
-    dataset.set_batch_size(batch_size)
-    dataset.set_filelist(filelist)
-
-    total_time = []
-    start_time = time.time()
-    loss_sum = 0.0
-    acc_sum = 0.0
-    global_step = 0
-
-    for i in range(25):
-        logger.info("begin epoch %d" % (i))
-        epoch_sum = []
-        random.shuffle(filelist)
-        dataset.set_filelist(filelist)
-        exe.train_from_dataset(
-            fluid.default_main_program(),
-            dataset,
-            thread=ncards,
-            debug=is_profile,
-            fetch_list=[loss, acc],
-            fetch_info=["loss", "acc"],
-            print_period=1)
-        model_path = args.model_path
-        model_path += "_" + str(lr) + "_" + str(rate)
-        save_dir = model_path + "/epoch_" + str(i)
-        fetch_vars = [loss, acc]
-        feed_list = [
-            "items", "seq_index", "last_index", "adj_in", "adj_out", "mask",
-            "label"
-        ]
-        fluid.io.save_inference_model(save_dir, feed_list, fetch_vars, exe)
-
-
-class Data():
-    def __init__(self, path, shuffle=False):
-        data = pickle.load(open(path, 'rb'))
-        self.shuffle = shuffle
-        self.length = len(data[0])
-        self.input = list(zip(data[0], data[1]))
-
-    def make_data(self, cur_batch, batch_size):
-        cur_batch = [list(e) for e in cur_batch]
-        max_seq_len = 0
-        for e in cur_batch:
-            max_seq_len = max(max_seq_len, len(e[0]))
-        last_id = []
-        for e in cur_batch:
-            last_id.append(len(e[0]) - 1)
-            e[0] += [0] * (max_seq_len - len(e[0]))
-
-        max_uniq_len = 0
-        for e in cur_batch:
-            max_uniq_len = max(max_uniq_len, len(np.unique(e[0])))
-
-        items, adj_in, adj_out, seq_index, last_index = [], [], [], [], []
-        mask, label = [], []
-
-        id = 0
-        for e in cur_batch:
-            node = np.unique(e[0])
-            items.append(node.tolist() + (max_uniq_len - len(node)) * [0])
-            adj = np.zeros((max_uniq_len, max_uniq_len))
-
-            for i in np.arange(len(e[0]) - 1):
-                if e[0][i + 1] == 0:
-                    break
-                u = np.where(node == e[0][i])[0][0]
-                v = np.where(node == e[0][i + 1])[0][0]
-                adj[u][v] = 1
-
-            u_deg_in = np.sum(adj, 0)
-            u_deg_in[np.where(u_deg_in == 0)] = 1
-            adj_in.append(np.divide(adj, u_deg_in).transpose())
-
-            u_deg_out = np.sum(adj, 1)
-            u_deg_out[np.where(u_deg_out == 0)] = 1
-            adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose())
-
-            seq_index.append(
-                [np.where(node == i)[0][0] + id * max_uniq_len for i in e[0]])
-            last_index.append(
-                np.where(node == e[0][last_id[id]])[0][0] + id * max_uniq_len)
-            label.append(e[1] - 1)
-            mask.append([[1] * (last_id[id] + 1) + [0] *
-                         (max_seq_len - last_id[id] - 1)])
-            id += 1
-
-        items = np.array(items).astype("uint64").reshape((batch_size, -1, 1))
-        seq_index = np.array(seq_index).astype("uint64").reshape(
-            (batch_size, -1))
-        last_index = np.array(last_index).astype("uint64").reshape(
-            (batch_size, 1))
-        adj_in = np.array(adj_in).astype("float32").reshape(
-            (batch_size, max_uniq_len, max_uniq_len))
-        adj_out = np.array(adj_out).astype("float32").reshape(
-            (batch_size, max_uniq_len, max_uniq_len))
-        mask = np.array(mask).astype("float32").reshape((batch_size, -1, 1))
-        label = np.array(label).astype("uint64").reshape((batch_size, 1))
-        return list(
-            zip(items, seq_index, last_index, adj_in, adj_out, mask, label))
-
-    def reader(self, batch_size, batch_group_size, train=True):
-        if self.shuffle:
-            random.shuffle(self.input)
-        group_remain = self.length % batch_group_size
-        for bg_id in range(0, self.length - group_remain, batch_group_size):
-            cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size])
-            if train:
-                cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True)
-            for i in range(0, batch_group_size, batch_size):
-                cur_batch = cur_bg[i:i + batch_size]
-                yield self.make_data(cur_batch, batch_size)
-
-        #deal with the remaining, discard at most batch_size data
-        if group_remain < batch_size:
-            return
-        remain_data = copy.deepcopy(self.input[-group_remain:])
-        if train:
-            remain_data = sorted(
-                remain_data, key=lambda x: len(x[0]), reverse=True)
-        for i in range(0, batch_group_size, batch_size):
-            if i + batch_size <= len(remain_data):
-                cur_batch = remain_data[i:i + batch_size]
-                yield self.make_data(cur_batch, batch_size)
-
-
-def read_config(path):
-    with open(path, "r") as fin:
-        item_num = int(fin.readline())
-    return item_num
-
-
-induce_map = {0: [0], 1: [0], 2: [], 3: [0, 1], 4: [0, 1], 5: [0], 6: []}
-
-
-def binary_print(slot, fout, index):
-    shape_array = slot.shape
-    num = 1
-    for e in shape_array:
-        num *= e
-    num += len(induce_map[index])
-    num = np.uint16(num)
-    num.tofile(fout)
-    for e in induce_map[index]:
-        tmp_shape = np.uint64(shape_array[e])
-        tmp_shape.tofile(fout)
-    slot.tofile(fout)
-
-
-def make_binary_data():
-    data_reader = Data('./data/diginetica/train.txt', True)
-    index = 0
-    id = -1
-    filename = None
-    fout = None
-    binary = True
-    for data in data_reader.reader(batch_size, 20 * batch_size, True):
-        if index % (batch_size * 900) == 0:
-            id += 1
-            if not binary:
-                filename = "./gnn_data_text/" + str(id)
-            else:
-                filename = "./gnn_data_new_8/" + str(id)
-            print("filename: " + filename)
-            if fout:
-                fout.close()
-            fout = open(filename, "wb" if binary else "w")
-
-        for ins in data:
-            for i, slot in enumerate(ins):
-                if binary:
-                    binary_print(slot, fout, i)
-                else:
-                    text_print(slot, fout, i)
-        index += batch_size
-
-
-if __name__ == "__main__":
-    make_binary_data()
-    train()
diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py
deleted file mode 100644
index 6995346ffa61ea65119930296be2fba5a10c5451..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/demo/pyreader.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy
-import six
-
-import paddle
-import paddle.dataset.mnist as mnist
-import paddle.fluid as fluid
-
-
-def network(is_train):
-    reader = fluid.layers.py_reader(
-        capacity=10,
-        shapes=((-1, 784), (-1, 1)),
-        dtypes=('float32', 'int64'),
-        name="train_reader" if is_train else "test_reader",
-        use_double_buffer=True)
-    img, label = fluid.layers.read_file(reader)
-
-    hidden = img
-
-    for i in six.moves.xrange(2):
-        hidden = fluid.layers.fc(input=hidden, size=100, act='tanh')
-        hidden = fluid.layers.dropout(
-            hidden, dropout_prob=0.5, is_test=not is_train)
-
-    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    return fluid.layers.mean(loss), reader
-
-
-def main():
-    train_prog = fluid.Program()
-    startup_prog = fluid.Program()
-
-    with fluid.program_guard(train_prog, startup_prog):
-        with fluid.unique_name.guard():
-            loss, train_reader = network(True)
-            adam = fluid.optimizer.Adam(learning_rate=0.01)
-            adam.minimize(loss)
-
-    test_prog = fluid.Program()
-    test_startup = fluid.Program()
-    with fluid.program_guard(test_prog, test_startup):
-        with fluid.unique_name.guard():
-            test_loss, test_reader = network(False)
-
-    use_cuda = fluid.core.is_compiled_with_cuda()
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    fluid.Executor(place).run(startup_prog)
-    fluid.Executor(place).run(test_startup)
-
-    trainer = fluid.ParallelExecutor(
-        use_cuda=use_cuda, loss_name=loss.name, main_program=train_prog)
-
-    tester = fluid.ParallelExecutor(
-        use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog)
-
-    train_reader.decorate_paddle_reader(
-        paddle.reader.shuffle(
-            paddle.batch(mnist.train(), 512), buf_size=8192))
-
-    test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
-
-    for epoch_id in six.moves.xrange(10):
-        train_reader.start()
-        try:
-            while True:
-                print(
-                    'train_loss',
-                    numpy.array(trainer.run(fetch_list=[loss.name])))
-        except fluid.core.EOFException:
-            print('End of epoch', epoch_id)
-            train_reader.reset()
-
-        test_reader.start()
-        try:
-            while True:
-                print(
-                    'test loss',
-                    numpy.array(tester.run(fetch_list=[test_loss.name])))
-        except fluid.core.EOFException:
-            print('End of testing')
-            test_reader.reset()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py
deleted file mode 100644
index fe8a9daa3bea4b99bb42edc78538685c5ce11fe3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_beam_search_decoder.py
+++ /dev/null
@@ -1,267 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-A simple machine translation demo using beam search decoder.
-"""
-
-from __future__ import print_function
-
-import contextlib
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-import paddle.fluid.layers as layers
-from paddle.fluid.executor import Executor
-from paddle.fluid.contrib.decoder.beam_search_decoder import *
-import unittest
-import os
-
-dict_size = 30000
-source_dict_dim = target_dict_dim = dict_size
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-hidden_dim = 32
-word_dim = 32
-decoder_size = hidden_dim
-IS_SPARSE = True
-batch_size = 2
-max_length = 8
-topk_size = 50
-trg_dic_size = 10000
-beam_size = 2
-
-
-def encoder():
-    # encoder
-    src_word = layers.data(
-        name="src_word", shape=[1], dtype='int64', lod_level=1)
-    src_embedding = layers.embedding(
-        input=src_word,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE)
-
-    fc1 = layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
-    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
-    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
-    return encoder_out
-
-
-def decoder_state_cell(context):
-    h = InitState(init=context, need_reorder=True)
-    state_cell = StateCell(inputs={'x': None}, states={'h': h}, out_state='h')
-
-    @state_cell.state_updater
-    def updater(state_cell):
-        current_word = state_cell.get_input('x')
-        prev_h = state_cell.get_state('h')
-        # make sure lod of h heritted from prev_h
-        h = layers.fc(input=[prev_h, current_word],
-                      size=decoder_size,
-                      act='tanh')
-        state_cell.set_state('h', h)
-
-    return state_cell
-
-
-def decoder_train(state_cell):
-    # decoder
-    trg_language_word = layers.data(
-        name="target_word", shape=[1], dtype='int64', lod_level=1)
-    trg_embedding = layers.embedding(
-        input=trg_language_word,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE)
-
-    decoder = TrainingDecoder(state_cell)
-
-    with decoder.block():
-        current_word = decoder.step_input(trg_embedding)
-        decoder.state_cell.compute_state(inputs={'x': current_word})
-        current_score = layers.fc(input=decoder.state_cell.get_state('h'),
-                                  size=target_dict_dim,
-                                  act='softmax')
-        decoder.state_cell.update_states()
-        decoder.output(current_score)
-
-    return decoder()
-
-
-def decoder_decode(state_cell):
-    init_ids = layers.data(
-        name="init_ids", shape=[1], dtype="int64", lod_level=2)
-    init_scores = layers.data(
-        name="init_scores", shape=[1], dtype="float32", lod_level=2)
-
-    decoder = BeamSearchDecoder(
-        state_cell=state_cell,
-        init_ids=init_ids,
-        init_scores=init_scores,
-        target_dict_dim=target_dict_dim,
-        word_dim=word_dim,
-        input_var_dict={},
-        topk_size=topk_size,
-        sparse_emb=IS_SPARSE,
-        max_len=max_length,
-        beam_size=beam_size,
-        end_id=1,
-        name=None)
-    decoder.decode()
-    translation_ids, translation_scores = decoder()
-
-    return translation_ids, translation_scores
-
-
-def train_main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    context = encoder()
-    state_cell = decoder_state_cell(context)
-    rnn_out = decoder_train(state_cell)
-    label = layers.data(
-        name="target_next_word", shape=[1], dtype='int64', lod_level=1)
-    cost = layers.cross_entropy(input=rnn_out, label=label)
-    avg_cost = layers.mean(x=cost)
-
-    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3)
-    optimizer.minimize(avg_cost)
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
-    feed_order = ['src_word', 'target_word', 'target_next_word']
-
-    exe = Executor(place)
-
-    def train_loop(main_program):
-        exe.run(framework.default_startup_program())
-
-        feed_list = [
-            main_program.global_block().var(var_name) for var_name in feed_order
-        ]
-        feeder = fluid.DataFeeder(feed_list, place)
-
-        for pass_id in range(1):
-            for batch_id, data in enumerate(train_reader()):
-                outs = exe.run(main_program,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_cost])
-                avg_cost_val = np.array(outs[0])
-                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
-                      " avg_cost=" + str(avg_cost_val))
-                if batch_id > 3:
-                    break
-
-    train_loop(framework.default_main_program())
-
-
-def decode_main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-    context = encoder()
-    state_cell = decoder_state_cell(context)
-    translation_ids, translation_scores = decoder_decode(state_cell)
-
-    exe = Executor(place)
-    exe.run(framework.default_startup_program())
-
-    init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
-    init_scores_data = np.array(
-        [1. for _ in range(batch_size)], dtype='float32')
-    init_ids_data = init_ids_data.reshape((batch_size, 1))
-    init_scores_data = init_scores_data.reshape((batch_size, 1))
-    init_lod = [1] * batch_size
-    init_lod = [init_lod, init_lod]
-
-    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
-    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
-
-    feed_order = ['src_word']
-    feed_list = [
-        framework.default_main_program().global_block().var(var_name)
-        for var_name in feed_order
-    ]
-    feeder = fluid.DataFeeder(feed_list, place)
-
-    data = next(train_reader())
-    feed_dict = feeder.feed([[x[0]] for x in data])
-    feed_dict['init_ids'] = init_ids
-    feed_dict['init_scores'] = init_scores
-
-    result_ids, result_scores = exe.run(
-        framework.default_main_program(),
-        feed=feed_dict,
-        fetch_list=[translation_ids, translation_scores],
-        return_numpy=False)
-    print(result_ids.lod())
-
-
-class TestBeamSearchDecoder(unittest.TestCase):
-    pass
-
-
-@contextlib.contextmanager
-def scope_prog_guard():
-    prog = fluid.Program()
-    startup_prog = fluid.Program()
-    scope = fluid.core.Scope()
-    with fluid.scope_guard(scope):
-        with fluid.program_guard(prog, startup_prog):
-            yield
-
-
-def inject_test_train(use_cuda):
-    f_name = 'test_{0}_train'.format('cuda' if use_cuda else 'cpu')
-
-    def f(*args):
-        with scope_prog_guard():
-            train_main(use_cuda)
-
-    setattr(TestBeamSearchDecoder, f_name, f)
-
-
-def inject_test_decode(use_cuda, decorator=None):
-    f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu', 'sparse')
-
-    def f(*args):
-        with scope_prog_guard():
-            decode_main(use_cuda)
-
-    if decorator is not None:
-        f = decorator(f)
-
-    setattr(TestBeamSearchDecoder, f_name, f)
-
-
-for _use_cuda_ in (False, True):
-    inject_test_train(_use_cuda_)
-
-for _use_cuda_ in (False, True):
-    _decorator_ = None
-    inject_test_decode(use_cuda=_use_cuda_, decorator=_decorator_)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/test_communicator.py b/python/paddle/fluid/tests/test_communicator.py
deleted file mode 100644
index 42448758bcfa28d4c0b3a192d23e9685495f74c9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_communicator.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import time
-
-import paddle.fluid as fluid
-from paddle.fluid.communicator import Communicator
-
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
-
-
-class TestCommunicator(unittest.TestCase):
-    def net(self):
-        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = fluid.layers.fc(input=x, size=1, act=None)
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        return avg_cost
-
-    def test_communicator_init_and_start(self):
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_num=2,
-            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
-
-        fleet.init(role)
-        avg_cost = self.net()
-
-        optimizer = fluid.optimizer.SGD(0.01)
-
-        strategy = DistributeTranspilerConfig()
-        strategy.sync_mode = True
-        strategy.wait_port = False
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(avg_cost)
-
-        comm = Communicator(fleet.main_program)
-        comm.start()
-        time.sleep(10)
-        comm.stop()
-
-
-class TestCommunicator2(unittest.TestCase):
-    def test_communicator_init_and_start(self):
-        prog = fluid.Program()
-        comm = Communicator(prog)
-        comm.start()
-        comm.stop()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py
deleted file mode 100644
index 16a33fd3ab3c794494687ba39278e327560686ec..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import unittest
-
-
-class TestDataFeeder(unittest.TestCase):
-    def test_lod_level_0_converter(self):
-        img = fluid.layers.data(name='image', shape=[1, 28, 28])
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
-        result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
-
-        self.assertEqual(result['image'].shape(), [2, 1, 28, 28])
-        self.assertEqual(result['label'].shape(), [2, 1])
-        self.assertEqual(result['image'].recursive_sequence_lengths(), [])
-        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
-
-        try:
-            result = feeder.feed([([0] * 783, [9]), ([1] * 783, [1])])
-            self.assertTrue(False)
-        except ValueError:
-            self.assertTrue(True)
-
-    def test_lod_level_1_converter(self):
-        # lod_level = 1
-        # each sentence has a different number of words
-        sentences = fluid.layers.data(
-            name='sentences', shape=[1], dtype='int64', lod_level=1)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        feeder = fluid.DataFeeder([sentences, label], fluid.CPUPlace())
-
-        # lod = [[0, 3, 5, 9]]
-        # data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
-        # label = [1] * len(data)
-        result = feeder.feed(
-            [([1, 2, 3], [1]), ([4, 5], [1]), ([6, 7, 8, 9], [1])])
-
-        self.assertEqual(result['sentences'].shape(), [9, 1])
-        self.assertEqual(result['label'].shape(), [3, 1])
-        self.assertEqual(result['sentences'].recursive_sequence_lengths(),
-                         [[3, 2, 4]])
-        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
-
-    def test_lod_level_2_converter(self):
-        # lod_level = 2
-        # paragraphs -> sentences -> words
-        paragraphs = fluid.layers.data(
-            name='paragraphs', shape=[1], dtype='int64', lod_level=2)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        feeder = fluid.DataFeeder([paragraphs, label], fluid.CPUPlace())
-
-        # lod = [[0, 2, 3], [0, 3, 5, 9]]
-        # data = [[[1, 2, 3], [4, 5]], [[6, 7, 8, 9]]]
-        # label = [1] * len(data)
-        result = feeder.feed(
-            [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])])
-
-        self.assertEqual(result['paragraphs'].shape(), [9, 1])
-        self.assertEqual(result['label'].shape(), [2, 1])
-        self.assertEqual(result['paragraphs'].recursive_sequence_lengths(),
-                         [[2, 1], [3, 2, 4]])
-        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
deleted file mode 100644
index d4151428c8a61e976fb593c8cf9363d6cfccdebf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_detection.py
+++ /dev/null
@@ -1,592 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid.layers import detection
-from paddle.fluid.framework import Program, program_guard
-import unittest
-
-
-class TestDetection(unittest.TestCase):
-    def test_detection_output(self):
-        program = Program()
-        with program_guard(program):
-            pb = layers.data(
-                name='prior_box',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            pbv = layers.data(
-                name='prior_box_var',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            loc = layers.data(
-                name='target_box',
-                shape=[2, 10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            scores = layers.data(
-                name='scores',
-                shape=[2, 10, 20],
-                append_batch_size=False,
-                dtype='float32')
-            out = layers.detection_output(
-                scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv)
-            out2, index = layers.detection_output(
-                scores=scores,
-                loc=loc,
-                prior_box=pb,
-                prior_box_var=pbv,
-                return_index=True)
-            self.assertIsNotNone(out)
-            self.assertIsNotNone(out2)
-            self.assertIsNotNone(index)
-            self.assertEqual(out.shape[-1], 6)
-        print(str(program))
-
-    def test_box_coder_api(self):
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name='x', shape=[4], dtype='float32')
-            y = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
-            bcoder = layers.box_coder(
-                prior_box=x,
-                prior_box_var=[0.1, 0.2, 0.1, 0.2],
-                target_box=y,
-                code_type='encode_center_size')
-            self.assertIsNotNone(bcoder)
-        print(str(program))
-
-    def test_detection_api(self):
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name='x', shape=[4], dtype='float32')
-            y = layers.data(name='y', shape=[4], dtype='float32')
-            z = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
-            iou = layers.iou_similarity(x=x, y=y)
-            bcoder = layers.box_coder(
-                prior_box=x,
-                prior_box_var=y,
-                target_box=z,
-                code_type='encode_center_size')
-            self.assertIsNotNone(iou)
-            self.assertIsNotNone(bcoder)
-
-            matched_indices, matched_dist = layers.bipartite_match(iou)
-            self.assertIsNotNone(matched_indices)
-            self.assertIsNotNone(matched_dist)
-
-            gt = layers.data(
-                name='gt', shape=[1, 1], dtype='int32', lod_level=1)
-            trg, trg_weight = layers.target_assign(
-                gt, matched_indices, mismatch_value=0)
-            self.assertIsNotNone(trg)
-            self.assertIsNotNone(trg_weight)
-
-            gt2 = layers.data(
-                name='gt2', shape=[10, 4], dtype='float32', lod_level=1)
-            trg, trg_weight = layers.target_assign(
-                gt2, matched_indices, mismatch_value=0)
-            self.assertIsNotNone(trg)
-            self.assertIsNotNone(trg_weight)
-
-        print(str(program))
-
-    def test_ssd_loss(self):
-        program = Program()
-        with program_guard(program):
-            pb = layers.data(
-                name='prior_box',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            pbv = layers.data(
-                name='prior_box_var',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
-            scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
-            gt_box = layers.data(
-                name='gt_box', shape=[4], lod_level=1, dtype='float32')
-            gt_label = layers.data(
-                name='gt_label', shape=[1], lod_level=1, dtype='int32')
-            loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
-            self.assertIsNotNone(loss)
-            self.assertEqual(loss.shape[-1], 1)
-        print(str(program))
-
-
-class TestPriorBox(unittest.TestCase):
-    def test_prior_box(self):
-        program = Program()
-        with program_guard(program):
-            data_shape = [3, 224, 224]
-            images = fluid.layers.data(
-                name='pixel', shape=data_shape, dtype='float32')
-            conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-            box, var = layers.prior_box(
-                input=conv1,
-                image=images,
-                min_sizes=[100.0],
-                aspect_ratios=[1.],
-                flip=True,
-                clip=True)
-            assert len(box.shape) == 4
-            assert box.shape == var.shape
-            assert box.shape[3] == 4
-
-
-class TestDensityPriorBox(unittest.TestCase):
-    def test_density_prior_box(self):
-        program = Program()
-        with program_guard(program):
-            data_shape = [3, 224, 224]
-            images = fluid.layers.data(
-                name='pixel', shape=data_shape, dtype='float32')
-            conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-            box, var = layers.density_prior_box(
-                input=conv1,
-                image=images,
-                densities=[3, 4],
-                fixed_sizes=[50., 60.],
-                fixed_ratios=[1.0],
-                clip=True)
-            assert len(box.shape) == 4
-            assert box.shape == var.shape
-            assert box.shape[-1] == 4
-
-
-class TestAnchorGenerator(unittest.TestCase):
-    def test_anchor_generator(self):
-        data_shape = [3, 224, 224]
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-        anchor, var = fluid.layers.anchor_generator(
-            input=conv1,
-            anchor_sizes=[64, 128, 256, 512],
-            aspect_ratios=[0.5, 1.0, 2.0],
-            variance=[0.1, 0.1, 0.2, 0.2],
-            stride=[16.0, 16.0],
-            offset=0.5)
-        assert len(anchor.shape) == 4
-        assert anchor.shape == var.shape
-        assert anchor.shape[3] == 4
-
-
-class TestGenerateProposalLabels(unittest.TestCase):
-    def test_generate_proposal_labels(self):
-        program = Program()
-        with program_guard(program):
-            rpn_rois = layers.data(
-                name='rpn_rois',
-                shape=[4, 4],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            gt_classes = layers.data(
-                name='gt_classes',
-                shape=[6],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            is_crowd = layers.data(
-                name='is_crowd',
-                shape=[6],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            gt_boxes = layers.data(
-                name='gt_boxes',
-                shape=[6, 4],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            im_info = layers.data(
-                name='im_info',
-                shape=[1, 3],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            class_nums = 5
-            outs = fluid.layers.generate_proposal_labels(
-                rpn_rois=rpn_rois,
-                gt_classes=gt_classes,
-                is_crowd=is_crowd,
-                gt_boxes=gt_boxes,
-                im_info=im_info,
-                batch_size_per_im=2,
-                fg_fraction=0.5,
-                fg_thresh=0.5,
-                bg_thresh_hi=0.5,
-                bg_thresh_lo=0.0,
-                bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
-                class_nums=class_nums)
-            rois = outs[0]
-            labels_int32 = outs[1]
-            bbox_targets = outs[2]
-            bbox_inside_weights = outs[3]
-            bbox_outside_weights = outs[4]
-            assert rois.shape[1] == 4
-            assert rois.shape[0] == labels_int32.shape[0]
-            assert rois.shape[0] == bbox_targets.shape[0]
-            assert rois.shape[0] == bbox_inside_weights.shape[0]
-            assert rois.shape[0] == bbox_outside_weights.shape[0]
-            assert bbox_targets.shape[1] == 4 * class_nums
-            assert bbox_inside_weights.shape[1] == 4 * class_nums
-            assert bbox_outside_weights.shape[1] == 4 * class_nums
-
-
-class TestGenerateMaskLabels(unittest.TestCase):
-    def test_generate_mask_labels(self):
-        program = Program()
-        with program_guard(program):
-            im_info = layers.data(
-                name='im_info',
-                shape=[1, 3],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            gt_classes = layers.data(
-                name='gt_classes',
-                shape=[2, 1],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            is_crowd = layers.data(
-                name='is_crowd',
-                shape=[2, 1],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            gt_segms = layers.data(
-                name='gt_segms',
-                shape=[20, 2],
-                dtype='float32',
-                lod_level=3,
-                append_batch_size=False)
-            rois = layers.data(
-                name='rois',
-                shape=[4, 4],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            labels_int32 = layers.data(
-                name='labels_int32',
-                shape=[4, 1],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            num_classes = 5
-            resolution = 14
-            outs = fluid.layers.generate_mask_labels(
-                im_info=im_info,
-                gt_classes=gt_classes,
-                is_crowd=is_crowd,
-                gt_segms=gt_segms,
-                rois=rois,
-                labels_int32=labels_int32,
-                num_classes=num_classes,
-                resolution=resolution)
-            mask_rois, roi_has_mask_int32, mask_int32 = outs
-            assert mask_rois.shape[1] == 4
-            assert mask_int32.shape[1] == num_classes * resolution * resolution
-
-
-class TestMultiBoxHead(unittest.TestCase):
-    def test_multi_box_head(self):
-        data_shape = [3, 224, 224]
-        mbox_locs, mbox_confs, box, var = self.multi_box_head_output(data_shape)
-
-        assert len(box.shape) == 2
-        assert box.shape == var.shape
-        assert box.shape[1] == 4
-        assert mbox_locs.shape[1] == mbox_confs.shape[1]
-
-    def multi_box_head_output(self, data_shape):
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-        conv2 = fluid.layers.conv2d(conv1, 3, 3, 2)
-        conv3 = fluid.layers.conv2d(conv2, 3, 3, 2)
-        conv4 = fluid.layers.conv2d(conv3, 3, 3, 2)
-        conv5 = fluid.layers.conv2d(conv4, 3, 3, 2)
-
-        mbox_locs, mbox_confs, box, var = layers.multi_box_head(
-            inputs=[conv1, conv2, conv3, conv4, conv5, conv5],
-            image=images,
-            num_classes=21,
-            min_ratio=20,
-            max_ratio=90,
-            aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]],
-            base_size=300,
-            offset=0.5,
-            flip=True,
-            clip=True)
-
-        return mbox_locs, mbox_confs, box, var
-
-
-class TestDetectionMAP(unittest.TestCase):
-    def test_detection_map(self):
-        program = Program()
-        with program_guard(program):
-            detect_res = layers.data(
-                name='detect_res',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
-            label = layers.data(
-                name='label',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
-
-            map_out = detection.detection_map(detect_res, label, 21)
-            self.assertIsNotNone(map_out)
-            self.assertEqual(map_out.shape, (1, ))
-        print(str(program))
-
-
-class TestRpnTargetAssign(unittest.TestCase):
-    def test_rpn_target_assign(self):
-        program = Program()
-        with program_guard(program):
-            bbox_pred_shape = [10, 50, 4]
-            cls_logits_shape = [10, 50, 2]
-            anchor_shape = [50, 4]
-
-            bbox_pred = layers.data(
-                name='bbox_pred',
-                shape=bbox_pred_shape,
-                append_batch_size=False,
-                dtype='float32')
-            cls_logits = layers.data(
-                name='cls_logits',
-                shape=cls_logits_shape,
-                append_batch_size=False,
-                dtype='float32')
-            anchor_box = layers.data(
-                name='anchor_box',
-                shape=anchor_shape,
-                append_batch_size=False,
-                dtype='float32')
-            anchor_var = layers.data(
-                name='anchor_var',
-                shape=anchor_shape,
-                append_batch_size=False,
-                dtype='float32')
-            gt_boxes = layers.data(
-                name='gt_boxes', shape=[4], lod_level=1, dtype='float32')
-            is_crowd = layers.data(
-                name='is_crowd',
-                shape=[1, 10],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            im_info = layers.data(
-                name='im_info',
-                shape=[1, 3],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            outs = layers.rpn_target_assign(
-                bbox_pred=bbox_pred,
-                cls_logits=cls_logits,
-                anchor_box=anchor_box,
-                anchor_var=anchor_var,
-                gt_boxes=gt_boxes,
-                is_crowd=is_crowd,
-                im_info=im_info,
-                rpn_batch_size_per_im=256,
-                rpn_straddle_thresh=0.0,
-                rpn_fg_fraction=0.5,
-                rpn_positive_overlap=0.7,
-                rpn_negative_overlap=0.3,
-                use_random=False)
-            pred_scores = outs[0]
-            pred_loc = outs[1]
-            tgt_lbl = outs[2]
-            tgt_bbox = outs[3]
-            bbox_inside_weight = outs[4]
-
-            self.assertIsNotNone(pred_scores)
-            self.assertIsNotNone(pred_loc)
-            self.assertIsNotNone(tgt_lbl)
-            self.assertIsNotNone(tgt_bbox)
-            self.assertIsNotNone(bbox_inside_weight)
-            assert pred_scores.shape[1] == 1
-            assert pred_loc.shape[1] == 4
-            assert pred_loc.shape[1] == tgt_bbox.shape[1]
-            print(str(program))
-
-
-class TestGenerateProposals(unittest.TestCase):
-    def test_generate_proposals(self):
-        program = Program()
-        with program_guard(program):
-            data_shape = [20, 64, 64]
-            images = fluid.layers.data(
-                name='images', shape=data_shape, dtype='float32')
-            im_info = fluid.layers.data(
-                name='im_info', shape=[3], dtype='float32')
-            anchors, variances = fluid.layers.anchor_generator(
-                name='anchor_generator',
-                input=images,
-                anchor_sizes=[32, 64],
-                aspect_ratios=[1.0],
-                variance=[0.1, 0.1, 0.2, 0.2],
-                stride=[16.0, 16.0],
-                offset=0.5)
-            num_anchors = anchors.shape[2]
-            scores = fluid.layers.data(
-                name='scores', shape=[num_anchors, 8, 8], dtype='float32')
-            bbox_deltas = fluid.layers.data(
-                name='bbox_deltas',
-                shape=[num_anchors * 4, 8, 8],
-                dtype='float32')
-            rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals(
-                name='generate_proposals',
-                scores=scores,
-                bbox_deltas=bbox_deltas,
-                im_info=im_info,
-                anchors=anchors,
-                variances=variances,
-                pre_nms_top_n=6000,
-                post_nms_top_n=1000,
-                nms_thresh=0.5,
-                min_size=0.1,
-                eta=1.0)
-            self.assertIsNotNone(rpn_rois)
-            self.assertIsNotNone(rpn_roi_probs)
-            print(rpn_rois.shape)
-
-
-class TestYoloDetection(unittest.TestCase):
-    def test_yolov3_loss(self):
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
-            gt_box = layers.data(name='gt_box', shape=[10, 4], dtype='float32')
-            gt_label = layers.data(name='gt_label', shape=[10], dtype='int32')
-            gt_score = layers.data(name='gt_score', shape=[10], dtype='float32')
-            loss = layers.yolov3_loss(
-                x,
-                gt_box,
-                gt_label, [10, 13, 30, 13], [0, 1],
-                10,
-                0.7,
-                32,
-                gt_score=gt_score,
-                use_label_smooth=False)
-
-            self.assertIsNotNone(loss)
-
-    def test_yolo_box(self):
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
-            img_size = layers.data(name='img_size', shape=[2], dtype='int32')
-            boxes, scores = layers.yolo_box(x, img_size, [10, 13, 30, 13], 10,
-                                            0.01, 32)
-            self.assertIsNotNone(boxes)
-            self.assertIsNotNone(scores)
-
-
-class TestBoxClip(unittest.TestCase):
-    def test_box_clip(self):
-        program = Program()
-        with program_guard(program):
-            input_box = layers.data(
-                name='input_box', shape=[7, 4], dtype='float32', lod_level=1)
-            im_info = layers.data(name='im_info', shape=[3], dtype='float32')
-            out = layers.box_clip(input_box, im_info)
-            self.assertIsNotNone(out)
-
-
-class TestMulticlassNMS(unittest.TestCase):
-    def test_multiclass_nms(self):
-        program = Program()
-        with program_guard(program):
-            bboxes = layers.data(
-                name='bboxes', shape=[-1, 10, 4], dtype='float32')
-            scores = layers.data(name='scores', shape=[-1, 10], dtype='float32')
-            output = layers.multiclass_nms(bboxes, scores, 0.3, 400, 200, 0.7)
-            self.assertIsNotNone(output)
-
-
-class TestMulticlassNMS2(unittest.TestCase):
-    def test_multiclass_nms2(self):
-        program = Program()
-        with program_guard(program):
-            bboxes = layers.data(
-                name='bboxes', shape=[-1, 10, 4], dtype='float32')
-            scores = layers.data(name='scores', shape=[-1, 10], dtype='float32')
-            output = layers.multiclass_nms2(bboxes, scores, 0.3, 400, 200, 0.7)
-            output2, index = layers.multiclass_nms2(
-                bboxes, scores, 0.3, 400, 200, 0.7, return_index=True)
-            self.assertIsNotNone(output)
-            self.assertIsNotNone(output2)
-            self.assertIsNotNone(index)
-
-
-class TestCollectFpnPropsals(unittest.TestCase):
-    def test_collect_fpn_proposals(self):
-        program = Program()
-        with program_guard(program):
-            multi_bboxes = []
-            multi_scores = []
-            for i in range(4):
-                bboxes = layers.data(
-                    name='rois' + str(i),
-                    shape=[10, 4],
-                    dtype='float32',
-                    lod_level=1,
-                    append_batch_size=False)
-                scores = layers.data(
-                    name='scores' + str(i),
-                    shape=[10, 1],
-                    dtype='float32',
-                    lod_level=1,
-                    append_batch_size=False)
-                multi_bboxes.append(bboxes)
-                multi_scores.append(scores)
-            fpn_rois = layers.collect_fpn_proposals(multi_bboxes, multi_scores,
-                                                    2, 5, 10)
-            self.assertIsNotNone(fpn_rois)
-
-
-class TestDistributeFpnProposals(unittest.TestCase):
-    def test_distribute_fpn_proposals(self):
-        program = Program()
-        with program_guard(program):
-            fpn_rois = fluid.layers.data(
-                name='data', shape=[4], dtype='float32', lod_level=1)
-            multi_rois, restore_ind = layers.distribute_fpn_proposals(
-                fpn_rois=fpn_rois,
-                min_level=2,
-                max_level=5,
-                refer_level=4,
-                refer_scale=224)
-            self.assertIsNotNone(multi_rois)
-            self.assertIsNotNone(restore_ind)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py
deleted file mode 100644
index 3c977afc7c813908fbe2dfb7445d9ca183cf2231..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-
-BATCH_SIZE = 128
-CLIP_MAX = 2e-6
-CLIP_MIN = -1e-6
-
-prog = fluid.framework.Program()
-
-with fluid.program_guard(main_program=prog):
-    image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-
-    hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
-    hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
-    predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
-
-    label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(cost)
-
-prog_clip = prog.clone()
-prog_clip.block(0).var(hidden1.name)._set_error_clip(
-    fluid.clip.ErrorClipByValue(
-        max=CLIP_MAX, min=CLIP_MIN))
-
-avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
-fluid.backward.append_backward(loss=avg_cost)
-fluid.backward.append_backward(
-    loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback])
-
-hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD")
-hidden1_grad_clip = prog_clip.block(0).var(hidden1.name + "@GRAD")
-
-hidden2_grad = prog.block(0).var(hidden2.name + "@GRAD")
-hidden2_grad_clip = prog_clip.block(0).var(hidden2.name + "@GRAD")
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=8192),
-    batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
-exe.run(fluid.default_startup_program())
-
-count = 0
-for data in train_reader():
-    count += 1
-    if count > 5:
-        break
-    out1, out2 = exe.run(prog,
-                         feed=feeder.feed(data),
-                         fetch_list=[hidden1_grad, hidden2_grad])
-    out1_clip, out2_clip = exe.run(
-        prog_clip,
-        feed=feeder.feed(data),
-        fetch_list=[hidden1_grad_clip, hidden2_grad_clip])
-    if not ((out1.clip(
-            min=CLIP_MIN, max=CLIP_MAX) == out1_clip).all() and
-            (out2 == out2_clip).all()):
-        exit(1)
-
-exit(0)
diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
deleted file mode 100644
index 61d81f483636a99ea9e0282de89f12e47f3b824c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ /dev/null
@@ -1,225 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid.layers as layers
-from paddle.fluid.framework import Program, program_guard
-from paddle.fluid.executor import Executor
-from paddle.fluid.optimizer import MomentumOptimizer
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid.layers.control_flow import split_lod_tensor
-from paddle.fluid.layers.control_flow import merge_lod_tensor
-from paddle.fluid.layers.control_flow import ConditionalBlock
-
-import unittest
-import numpy as np
-
-
-class TestMNISTIfElseOp(unittest.TestCase):
-    # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
-    def not_test_raw_api(self):
-        prog = Program()
-        startup_prog = Program()
-        with program_guard(prog, startup_prog):
-            image = layers.data(name='x', shape=[784], dtype='float32')
-
-            label = layers.data(name='y', shape=[1], dtype='int64')
-
-            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
-            cond = layers.less_than(x=label, y=limit)
-            true_image, false_image = split_lod_tensor(input=image, mask=cond)
-
-            true_out = layers.create_tensor(dtype='float32')
-            true_cond = ConditionalBlock([cond])
-
-            with true_cond.block():
-                hidden = layers.fc(input=true_image, size=100, act='tanh')
-                prob = layers.fc(input=hidden, size=10, act='softmax')
-                layers.assign(input=prob, output=true_out)
-
-            false_out = layers.create_tensor(dtype='float32')
-            false_cond = ConditionalBlock([cond])
-
-            with false_cond.block():
-                hidden = layers.fc(input=false_image, size=200, act='tanh')
-                prob = layers.fc(input=hidden, size=10, act='softmax')
-                layers.assign(input=prob, output=false_out)
-
-            prob = merge_lod_tensor(
-                in_true=true_out, in_false=false_out, mask=cond, x=image)
-            loss = layers.cross_entropy(input=prob, label=label)
-            avg_loss = layers.mean(loss)
-
-            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-            optimizer.minimize(avg_loss, startup_prog)
-
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=10)
-
-        place = core.CPUPlace()
-        exe = Executor(place)
-
-        exe.run(startup_prog)
-        PASS_NUM = 100
-        for pass_id in range(PASS_NUM):
-            for data in train_reader():
-                x_data = np.array([x[0] for x in data]).astype("float32")
-                y_data = np.array([x[1] for x in data]).astype("int64")
-                y_data = np.expand_dims(y_data, axis=1)
-
-                outs = exe.run(prog,
-                               feed={'x': x_data,
-                                     'y': y_data},
-                               fetch_list=[avg_loss])
-                print(outs[0])
-                if outs[0] < 1.0:
-                    return
-        self.assertFalse(True)
-
-    # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
-    def not_test_ifelse(self):
-        prog = Program()
-        startup_prog = Program()
-        with program_guard(prog, startup_prog):
-            image = layers.data(name='x', shape=[784], dtype='float32')
-
-            label = layers.data(name='y', shape=[1], dtype='int64')
-
-            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
-            cond = layers.less_than(x=label, y=limit)
-            ie = layers.IfElse(cond)
-
-            with ie.true_block():
-                true_image = ie.input(image)
-                hidden = layers.fc(input=true_image, size=100, act='tanh')
-                prob = layers.fc(input=hidden, size=10, act='softmax')
-                ie.output(prob)
-
-            with ie.false_block():
-                false_image = ie.input(image)
-                hidden = layers.fc(input=false_image, size=200, act='tanh')
-                prob = layers.fc(input=hidden, size=10, act='softmax')
-                ie.output(prob)
-
-            prob = ie()
-            loss = layers.cross_entropy(input=prob[0], label=label)
-            avg_loss = layers.mean(loss)
-
-            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-            optimizer.minimize(avg_loss, startup_prog)
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=200)
-
-        place = core.CPUPlace()
-        exe = Executor(place)
-
-        exe.run(startup_prog)
-        PASS_NUM = 100
-        for pass_id in range(PASS_NUM):
-            for data in train_reader():
-                x_data = np.array([x[0] for x in data]).astype("float32")
-                y_data = np.array([x[1] for x in data]).astype("int64")
-                y_data = y_data.reshape((y_data.shape[0], 1))
-
-                outs = exe.run(prog,
-                               feed={'x': x_data,
-                                     'y': y_data},
-                               fetch_list=[avg_loss])
-                print(outs[0])
-                if outs[0] < 1.0:
-                    return
-        self.assertFalse(True)
-
-
-class TestIfElse(unittest.TestCase):
-    def set_test_case(self):
-        # condiction is: self.data < self.cond_value
-        self.cond_value = 0.5
-        self.data = np.random.rand(25, 1).astype(np.float32)
-
-    def numpy_cal(self):
-        s1 = self.data[np.where(self.data < self.cond_value)]
-        res = np.sum(np.exp(s1))
-        s2 = self.data[np.where(self.data >= self.cond_value)]
-        res += np.sum(np.tanh(s2))
-        return res
-
-    def compare_ifelse_op_and_numpy(self, place):
-        self.set_test_case()
-
-        prog = Program()
-        startup_prog = Program()
-        with program_guard(prog, startup_prog):
-            src = layers.data(name='data', shape=[1], dtype='float32')
-            cond = layers.fill_constant(
-                [1], dtype='float32', value=self.cond_value)
-            ifcond = layers.less_than(x=src, y=cond)
-            ie = layers.IfElse(ifcond)
-            with ie.true_block():
-                true_target = ie.input(src)
-                true_target = fluid.layers.exp(true_target)
-                ie.output(true_target)
-
-            with ie.false_block():
-                false_target = ie.input(src)
-                false_target = fluid.layers.tanh(false_target)
-                ie.output(false_target)
-            if_out = ie()
-            out = layers.reduce_sum(if_out)
-
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            fetch_list = [out]
-            o1, = exe.run(fluid.default_main_program(),
-                          feed={'data': self.data},
-                          fetch_list=[out])
-            o2 = self.numpy_cal()
-
-            self.assertTrue(
-                np.allclose(
-                    o1, o2, atol=1e-8),
-                "IfElse result : " + str(o1) + "\n Numpy result :" + str(o2))
-
-    def test_cpu(self):
-        self.compare_ifelse_op_and_numpy(fluid.CPUPlace())
-
-    def test_cuda(self):
-        if not core.is_compiled_with_cuda():
-            return
-        self.compare_ifelse_op_and_numpy(fluid.CUDAPlace(0))
-
-
-class TestIfElseTrueBranch(TestIfElse):
-    def set_test_case(self):
-        # condiction is: self.data < self.cond_value
-        self.cond_value = 10.
-        self.data = np.random.rand(25, 1).astype(np.float32)
-
-
-class TestIfElseFalseBranch(TestIfElse):
-    def set_test_case(self):
-        # condiction is: self.data < self.cond_value
-        self.cond_value = -10.
-        self.data = np.random.rand(25, 1).astype(np.float32)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py
deleted file mode 100644
index a3eae5a3c83d3155cb057e0c23b7c10fd2cd6a47..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor
-import numpy as np
-import unittest
-
-
-class TestLoDTensor(unittest.TestCase):
-    def test_pybind_recursive_seq_lens(self):
-        tensor = fluid.LoDTensor()
-        recursive_seq_lens = []
-        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
-        recursive_seq_lens = [[], [1], [3]]
-        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths,
-                          recursive_seq_lens)
-        recursive_seq_lens = [[0], [2], [3]]
-        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths,
-                          recursive_seq_lens)
-
-        recursive_seq_lens = [[1, 2, 3]]
-        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
-        self.assertEqual(tensor.recursive_sequence_lengths(),
-                         recursive_seq_lens)
-        tensor.set(np.random.random([6, 1]), fluid.CPUPlace())
-        self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
-        tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
-        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
-
-        # Each level's sum should be equal to the number of items in the next level
-        # Moreover, last level's sum should be equal to the tensor height
-        recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 2]]
-        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
-        self.assertEqual(tensor.recursive_sequence_lengths(),
-                         recursive_seq_lens)
-        tensor.set(np.random.random([8, 1]), fluid.CPUPlace())
-        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
-        recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 1]]
-        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
-        self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
-        tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
-        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
-
-    def test_create_lod_tensor(self):
-        # Create LoDTensor from a list
-        data = [[np.int64(1), np.int64(2), np.int64(3)],
-                [np.int64(3), np.int64(4)]]
-        wrong_recursive_seq_lens = [[2, 2]]
-        correct_recursive_seq_lens = [[3, 2]]
-        self.assertRaises(AssertionError, create_lod_tensor, data,
-                          wrong_recursive_seq_lens, fluid.CPUPlace())
-        tensor = create_lod_tensor(data, correct_recursive_seq_lens,
-                                   fluid.CPUPlace())
-        self.assertEqual(tensor.recursive_sequence_lengths(),
-                         correct_recursive_seq_lens)
-        self.assertEqual(tensor._dtype(), core.VarDesc.VarType.INT64)
-        self.assertEqual(tensor.shape(), [5, 1])
-        self.assertTrue(
-            np.array_equal(
-                np.array(tensor),
-                np.array([1, 2, 3, 3, 4]).reshape(tensor.shape()).astype(
-                    'int64')))
-
-        # Create LoDTensor from numpy array
-        data = np.random.random([10, 1]).astype('float64')
-        recursive_seq_lens = [[2, 1], [3, 3, 4]]
-        tensor = create_lod_tensor(data, recursive_seq_lens, fluid.CPUPlace())
-        self.assertEqual(tensor.recursive_sequence_lengths(),
-                         recursive_seq_lens)
-        self.assertEqual(tensor._dtype(), core.VarDesc.VarType.FP64)
-        self.assertEqual(tensor.shape(), [10, 1])
-        self.assertTrue(np.array_equal(np.array(tensor), data))
-
-        # Create LoDTensor from another LoDTensor, they are differnt instances
-        new_recursive_seq_lens = [[2, 2, 1], [1, 2, 2, 3, 2]]
-        new_tensor = create_lod_tensor(tensor, new_recursive_seq_lens,
-                                       fluid.CPUPlace())
-        self.assertEqual(tensor.recursive_sequence_lengths(),
-                         recursive_seq_lens)
-        self.assertEqual(new_tensor.recursive_sequence_lengths(),
-                         new_recursive_seq_lens)
-
-    def test_create_random_int_lodtensor(self):
-        # The shape of a word, commonly used in speech and NLP problem, is [1]
-        shape = [1]
-        recursive_seq_lens = [[2, 3, 5]]
-        dict_size = 10000
-        low = 0
-        high = dict_size - 1
-        tensor = create_random_int_lodtensor(recursive_seq_lens, shape,
-                                             fluid.CPUPlace(), low, high)
-        self.assertEqual(tensor.recursive_sequence_lengths(),
-                         recursive_seq_lens)
-        self.assertEqual(tensor.shape(), [10, 1])
-
-    def test_print_lodtensor(self):
-        shape = [1]
-        recursive_seq_lens = [[2, 3, 5]]
-        dict_size = 100
-        low = 0
-        high = dict_size - 1
-        tensor = create_random_int_lodtensor(recursive_seq_lens, shape,
-                                             fluid.CPUPlace(), low, high)
-        print(tensor)
-        self.assertTrue(isinstance(str(tensor), str))
-
-        if core.is_compiled_with_cuda():
-            gtensor = create_random_int_lodtensor(recursive_seq_lens, shape,
-                                                  fluid.CUDAPlace(0), low, high)
-            print(gtensor)
-            self.assertTrue(isinstance(str(gtensor), str))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/test_python_operator_overriding.py b/python/paddle/fluid/tests/test_python_operator_overriding.py
deleted file mode 100644
index 5f92c437ec726f510d9194d23f1a01a5478827d6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_python_operator_overriding.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import numpy as np
-
-import paddle.fluid.layers as layers
-import paddle.fluid.framework as framework
-import paddle.fluid as fluid
-
-
-class TestPythonOperatorOverride(unittest.TestCase):
-    def check_result(self, fn, place, dtype):
-        shape = [9, 10]
-
-        x_data = np.random.random(size=shape).astype(dtype)
-        y_data = np.random.random(size=shape).astype(dtype)
-        python_out = fn(x_data, y_data)
-
-        x_var = layers.create_global_var(
-            name='x', shape=shape, value=0.0, dtype=dtype, persistable=True)
-        y_var = layers.create_global_var(
-            name='y', shape=shape, value=0.0, dtype=dtype, persistable=True)
-        out = fn(x_var, y_var)
-
-        exe = fluid.Executor(place)
-
-        exe.run(fluid.default_startup_program())
-        fluid_out = exe.run(fluid.default_main_program(),
-                            feed={'x': x_data,
-                                  'y': y_data},
-                            fetch_list=[out])
-
-        np.testing.assert_array_equal(python_out, fluid_out[0])
-
-    def test_override(self):
-        # compare func to check
-        compare_fns = [
-            lambda _a, _b: _a == _b,
-            lambda _a, _b: _a != _b,
-            lambda _a, _b: _a < _b,
-            lambda _a, _b: _a <= _b,
-            lambda _a, _b: _a > _b,
-            lambda _a, _b: _a >= _b,
-        ]
-
-        # places to check
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-
-        # dtypes to check
-        dtypes = ['int32', 'float32']
-
-        for place in places:
-            for dtype in dtypes:
-                for compare_fn in compare_fns:
-                    with framework.program_guard(framework.Program(),
-                                                 framework.Program()):
-                        self.check_result(compare_fn, place, dtype)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
deleted file mode 100644
index 6809f88da03f426cee785384ee8e8524f8efa880..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ /dev/null
@@ -1,339 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FLAGS_memory_fraction_of_eager_deletion=1.0)
-set(dist_ENVS http_proxy="" https_proxy="")
-
-if(NOT WITH_DISTRIBUTE)
-    list(REMOVE_ITEM TEST_OPS test_recv_op)
-    list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
-    list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler)
-    list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_fleetapi)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_dgc_nccl)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_hallreduce)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_multi_comm)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_ring_allreduce)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_backward_deps)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_lars)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
-    LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
-    LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_ctr)
-endif(NOT WITH_DISTRIBUTE)
-
-
-if(NOT WITH_GPU OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op)
-    LIST(REMOVE_ITEM TEST_OPS test_allgather)
-    LIST(REMOVE_ITEM TEST_OPS test_allreduce)
-    LIST(REMOVE_ITEM TEST_OPS test_broadcast)
-    LIST(REMOVE_ITEM TEST_OPS test_reducescatter)
-endif()
-
-if(WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_boxps)
-    LIST(REMOVE_ITEM TEST_OPS test_trainer_desc)
-    LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception)
-    LIST(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization)
-endif()
-
-LIST(REMOVE_ITEM TEST_OPS test_launch)
-
-if (NOT ${WITH_GPU})
-    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist) # TODO(Yancey1989): parallel dygraph support CPU device in future
-elseif(${CUDNN_VERSION} VERSION_LESS 7100)
-    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
-endif()
-
-if(NOT WITH_GPU OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_pipeline)
-endif()
-list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
-list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
-list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
-
-
-list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
-list(REMOVE_ITEM TEST_OPS decorator_helper) # decorator_helper is a helper python file, not a test
-if(APPLE)
-    if(NOT WITH_DISTRIBUTE)
-        list(REMOVE_ITEM TEST_OPS test_desc_clone)
-        list(REMOVE_ITEM TEST_OPS test_program_code)
-    endif(NOT WITH_DISTRIBUTE)
-    message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext_*")
-    # this op is not support on mac
-    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
-    # TODO: add the unitest back when it fixed
-    list(REMOVE_ITEM TEST_OPS test_detection_map_op)
-    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
-    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
-    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
-    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
-    # TODO(tangwei12): add the unitest back when it fixed
-    list(REMOVE_ITEM TEST_OPS test_dist_word2vec)
-    list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
-endif()
-if(NOT WITH_MKLML)
-    # this op is not support on openblas
-    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
-endif()
-
-if(NOT WITH_MKL)
-  list(REMOVE_ITEM TEST_OPS test_match_matrix_tensor_op)
-  list(REMOVE_ITEM TEST_OPS test_var_conv_2d)
-endif(NOT WITH_MKL)
-
-if(WITH_GPU OR NOT WITH_MKLML)
-    # matmul with multiple heads need MKL support
-    LIST(REMOVE_ITEM TEST_OPS test_matmul_op_with_head)
-endif()
-
-function(py_test_modules TARGET_NAME)
-  if(WITH_TESTING)
-    set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ENVS)
-    cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    if(WITH_COVERAGE)
-      add_test(NAME ${TARGET_NAME}
-          COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(NAME ${TARGET_NAME}
-          COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
-          ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
-
-    if (py_test_modules_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
-    endif()
-
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 350)
-  endif()
-endfunction()
-
-function(bash_test_modules TARGET_NAME)
-    if(NOT WITH_TESTING)
-        return()
-    endif()
-
-    set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ENVS)
-    cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    message(STATUS "CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR})
-
-    add_test(NAME ${TARGET_NAME}
-        COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${bash_test_modules_ENVS}
-        bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    if (bash_test_modules_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
-    endif()
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
-endfunction()
-
-list(REMOVE_ITEM TEST_OPS test_warpctc_op)
-
-LIST(REMOVE_ITEM TEST_OPS test_lookup_remote_table_op)
-LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
-LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
-LIST(REMOVE_ITEM TEST_OPS test_dist_train)
-LIST(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
-list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf_auto_growth)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
-list(REMOVE_ITEM TEST_OPS test_dgc_op)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl)
-list(REMOVE_ITEM TEST_OPS test_dist_transformer)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer_auto_growth)
-list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
-list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
-list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
-list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient)
-list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient)
-list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext)
-list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
-list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
-list(REMOVE_ITEM TEST_OPS test_layers)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_base_cpu)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_with_reduce_cpu)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
-list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model)
-list(REMOVE_ITEM TEST_OPS test_async_ssa_graph_executor_mnist)
-list(REMOVE_ITEM TEST_OPS test_install_check)
-list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
-list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
-list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
-list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
-list(REMOVE_ITEM TEST_OPS test_imperative_debug_string)
-
-if (APPLE OR WIN32)
-  list(REMOVE_ITEM TEST_OPS test_dataset)
-  list(REMOVE_ITEM TEST_OPS test_dataset_dataloader)
-endif()
-
-# Some ops need to check results when gc is enabled
-# Currently, only ops that register NoNeedBufferVarsInference need to do this test
-set(TEST_OPS_WITH_GC
-  test_affine_channel_op
-  test_concat_op
-  test_elementwise_add_op
-  test_elementwise_sub_op
-  test_fill_constant_batch_size_like_op
-  test_fill_zeros_like2_op
-  test_gather_op
-  test_gather_nd_op
-  test_gaussian_random_batch_size_like_op
-  test_linear_chain_crf_op
-  test_lod_reset_op
-  test_lookup_table_op
-  test_mean_op
-  test_pad2d_op
-  test_scatter_op
-  test_sequence_concat
-  test_seq_conv
-  test_seq_pool
-  test_sequence_expand_as
-  test_sequence_expand
-  test_sequence_pad_op
-  test_sequence_unpad_op
-  test_sequence_scatter_op
-  test_sequence_slice_op
-  test_slice_op
-  test_space_to_depth_op
-  test_squared_l2_distance_op
-  test_uniform_random_batch_size_like_op)
-
-foreach(TEST_OP ${TEST_OPS_WITH_GC})
-  list(REMOVE_ITEM TEST_OPS ${TEST_OP})
-  py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS})
-endforeach()
-
-foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-endforeach(TEST_OP)
-py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
-py_test_modules(test_warpctc_op MODULES test_warpctc_op)
-py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS})
-py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS})
-py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
-    FLAGS_cudnn_deterministic=1 SERIAL)
-set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-py_test_modules(test_imperative_resnet_sorted_gradient MODULES test_imperative_resnet_sorted_gradient ENVS
-        FLAGS_cudnn_deterministic=1 SERIAL)
-set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS
-    FLAGS_cudnn_deterministic=1)
-py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mnist_sorted_gradient ENVS
-        FLAGS_cudnn_deterministic=1)
-py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS
-    FLAGS_cudnn_deterministic=1 SERIAL)
-set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_attention_model ENVS
-        FLAGS_cudnn_deterministic=1 SERIAL)
-py_test_modules(test_install_check MODULES test_install_check ENVS
-        FLAGS_cudnn_deterministic=1 SERIAL)
-set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
-py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1)
-if(WITH_DISTRIBUTE)
-    py_test_modules(test_dist_train MODULES test_dist_train ENVS ${dist_ENVS})
-    py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS})
-    py_test_modules(test_hsigmoid_remote_table_op MODULES test_hsigmoid_remote_table_op ENVS ${dist_ENVS})
-    py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS})
-    #py_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv_op ENVS ${dist_ENVS})
-    if(WITH_DGC)
-        py_test_modules(test_dgc_op MODULES test_dgc_op)
-    endif()
-    if(NOT APPLE)
-        bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
-        set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 100 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_listen_and_serv_op test_nce_remote_table_op test_hsigmoid_remote_table_op PROPERTIES LABELS "RUN_TYPE=DIST")
-
-        set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_dgc_nccl PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_hallreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_multi_comm PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_ring_allreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_backward_deps PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_fleetapi  PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_simnet_bow PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_text_classification PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-
-        list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
-        list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
-	    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
-	    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
-
-        py_test_modules(test_dist_se_resnext_dgc MODULES test_dist_se_resnext_dgc)
-	    py_test_modules(test_dist_se_resnext_sync MODULES test_dist_se_resnext_sync)
-        py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
-        bash_test_modules(test_launch MODULES test_launch.sh)
-
-        # FIXME(typhoonzero): add these tests back
-        # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
-        # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
-        set_tests_properties(test_dist_se_resnext_dgc PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_se_resnext_sync PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_se_resnext_nccl PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    endif(NOT APPLE)
-    # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
-endif()
-
-py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf)
-py_test_modules(test_parallel_executor_crf_auto_growth MODULES test_parallel_executor_crf_auto_growth ENVS FLAGS_allocator_strategy=auto_growth)
-py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed)
-set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
-py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer)
-py_test_modules(test_parallel_executor_transformer_auto_growth MODULES test_parallel_executor_transformer_auto_growth ENVS FLAGS_allocator_strategy=auto_growth)
-py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1)
-py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_executor_seresnext_base_cpu)
-py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES test_parallel_executor_seresnext_with_reduce_cpu)
-py_test_modules(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
-
-if(NOT WIN32)
-    py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer)
-endif()
-
-set_tests_properties(test_parallel_executor_seresnext_base_cpu PROPERTIES TIMEOUT 900)
-set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu PROPERTIES TIMEOUT 750)
-set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu PROPERTIES TIMEOUT 750)
-
-if (WITH_NGRAPH)
-    add_subdirectory(ngraph)
-endif()
-
-if (WITH_MKLDNN)
-    add_subdirectory(mkldnn)
-endif()
-
-if (WITH_TESTING)
-    set_property(TEST test_parallel_executor_mnist PROPERTY ENVIRONMENT GLOG_vmodule=scope_buffered_ssa_graph_executor=5)
-endif()
-
-set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist
-        test_parallel_executor_seresnext_base_gpu test_parallel_executor_seresnext_with_reduce_gpu
-        test_parallel_executor_seresnext_with_fuse_all_reduce_gpu
-        test_parallel_executor_crf test_sync_batch_norm_op
-        test_parallel_executor_feed_persistable_var
-        test_parallel_executor_crf_auto_growth test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
-        test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST")
diff --git a/python/paddle/fluid/tests/unittests/__init__.py b/python/paddle/fluid/tests/unittests/__init__.py
deleted file mode 100644
index b94a21a7e406b833797f8f521c62a2351c2bc30a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py
deleted file mode 100644
index 9ea95f3e8700274977eda4ca113a6468c631584c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/benchmark.py
+++ /dev/null
@@ -1,116 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-import time
-import itertools
-import six
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from op_test import OpTest
-
-
-class BenchmarkSuite(OpTest):
-    def timeit_function(self, callback, iters, *args, **kwargs):
-        assert iters != 0, "Iters should >= 1"
-        start = time.time()
-        for i in range(iters):
-            callback(*args, **kwargs)
-        elapse = time.time() - start
-        return elapse / iters
-
-    def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol):
-        for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs,
-                                                        fetch_list):
-            # the cpu version is baseline, expect gpu version keep same with cpu version.
-            expect = item_cpu_out
-            expect_t = np.array(item_cpu_out)
-            actual = item_gpu_out
-            actual_t = np.array(item_gpu_out)
-            var_name = variable if isinstance(
-                variable, six.string_types) else variable.name
-            self.assertTrue(
-                np.allclose(
-                    actual_t, expect_t, atol=atol),
-                "Output (" + var_name + ") has diff" + str(actual_t) + "\n" +
-                str(expect_t))
-            self.assertListEqual(actual.lod(),
-                                 expect.lod(),
-                                 "Output (" + var_name + ") has different lod")
-
-    def _get_input_names(self):
-        inputs = []
-        for name, value in six.iteritems(self.inputs):
-            if isinstance(value, list):
-                inputs.extend([sub_name for sub_name, _ in value])
-            inputs.append(name)
-        return inputs
-
-    def _get_output_names(self):
-        outputs = []
-        for var_name, var in six.iteritems(self.outputs):
-            if isinstance(var, list):
-                for sub_var_name, sub_var in var:
-                    outputs.append(sub_var_name)
-            else:
-                outputs.append(var_name)
-        if len(outputs) == 0:
-            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
-                outputs.append(str(out_name))
-        return outputs
-
-    def check_output_stability(self, atol=1e-8):
-        places = self._get_places()
-        if len(places) < 2:
-            return
-        cpu_outs, fetch_list = self._calc_output(places[0])
-        gpu_outs, _ = self._calc_output(places[1])
-        self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol)
-
-    def timeit_output_with_place(self, place, iters):
-        return self.timeit_function(self.calc_output, iters, place)
-
-    def timeit_output(self, iters=100):
-        places = self._get_places()
-        elapses = []
-        for place in places:
-            elapses.append(self.timeit_output_with_place(place, iters))
-        for place, elapse in zip(places, elapses):
-            print("One pass of ({2}_op) at {0} cost {1}".format(
-                str(place), elapse, self.op_type))
-
-    def timeit_grad_with_place(self, place, iters=100):
-        inputs_to_check = self._get_input_names()
-        output_names = self._get_output_names()
-        return self.timeit_function(
-            self._get_gradient,
-            iters,
-            inputs_to_check,
-            place,
-            output_names,
-            no_grad_set=None)
-
-    def timeit_grad(self, iters=100):
-        places = self._get_places()
-        elapses = []
-        for place in places:
-            elapses.append(self.timeit_grad_with_place(place, iters))
-        for place, elapse in zip(places, elapses):
-            print("One pass of ({2}_grad_op) at {0} cost {1}".format(
-                str(place), elapse, self.op_type))
diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
deleted file mode 100644
index 0e7338b839e2a7f5808e7a752e9ca6389622c2cb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-from benchmark import BenchmarkSuite
-from op_test import OpTest
-
-# This is a demo op test case for operator benchmarking and high resolution number stability alignment.
-
-
-class TestSumOp(BenchmarkSuite):
-    def setUp(self):
-        self.op_type = "sum"
-        self.customize_testcase()
-        self.customize_fetch_list()
-
-    def customize_fetch_list(self):
-        """
-        customize fetch list, configure the wanted variables.
-        >>> self.fetch_list = ["Out"]
-        """
-        self.fetch_list = ["Out"]
-        # pass
-
-    def customize_testcase(self):
-        # a test case
-        x0 = np.random.random((300, 400)).astype('float32')
-        x1 = np.random.random((300, 400)).astype('float32')
-        x2 = np.random.random((300, 400)).astype('float32')
-
-        # NOTE: if the output is empty, then it will autofilled by benchmarkSuite.
-        # only the output dtype is used, the shape, lod and data is computed from input.
-        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
-        self.outputs = {"Out": x0 + x1 + x2}
-
-    def test_check_output(self):
-        """
-        compare the output with customized output. In this case,
-        you should set the correct output by hands.
-        >>> self.outputs = {"Out": x0 + x1 + x2}
-        """
-        self.check_output(atol=1e-8)
-
-    def test_output_stability(self):
-        # compare the cpu gpu output in high resolution.
-        self.check_output_stability()
-
-    def test_timeit_output(self):
-        """
-        perf the op, time cost will be averged in iters.
-        output example
-        >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
-        >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
-        """
-        self.timeit_output(iters=100)
-
-    def test_timeit_grad(self):
-        """
-        perf the op gradient, time cost will be averged in iters.
-        output example
-        >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
-        >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
-        """
-        self.timeit_grad(iters=100)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_op.py b/python/paddle/fluid/tests/unittests/collective_allgather_op.py
deleted file mode 100644
index 349996547687657453497956007d2431b11ea45f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/collective_allgather_op.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import os
-import sys
-import signal
-import time
-from contextlib import closing
-from six import string_types
-import math
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-import paddle.fluid.unique_name as nameGen
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import paddle.fluid.layers as layers
-from functools import reduce
-from test_collective_base import TestCollectiveRunnerBase, runtime_main
-
-
-class TestCollectiveAllGather(TestCollectiveRunnerBase):
-    def __init__(self):
-        self.global_ring_id = 0
-
-    def get_model(self, main_prog, startup_program):
-        ring_id = 0
-        nranks = 2
-        with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
-            toutdata = main_prog.current_block().create_var(
-                name="outofgather",
-                dtype='float32',
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                persistable=False,
-                stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_allgather",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'nranks': nranks},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
-            return toutdata
-
-
-if __name__ == "__main__":
-    runtime_main(TestCollectiveAllGather, "allgather", 0)
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_op.py b/python/paddle/fluid/tests/unittests/collective_allreduce_op.py
deleted file mode 100644
index 9aef8879cab15ade735195ab173d9386764fb690..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/collective_allreduce_op.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import os
-import sys
-import signal
-import time
-import socket
-from contextlib import closing
-from six import string_types
-import math
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-import paddle.fluid.unique_name as nameGen
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import paddle.fluid.layers as layers
-from functools import reduce
-from test_collective_base import TestCollectiveRunnerBase, runtime_main
-
-
-class TestCollectiveAllreduce(TestCollectiveRunnerBase):
-    def __init__(self):
-        self.global_ring_id = 0
-
-    def get_model(self, main_prog, startup_program):
-        ring_id = 0
-        with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
-            toutdata = main_prog.current_block().create_var(
-                name="outofallreduce",
-                dtype='float32',
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                persistable=False,
-                stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_allreduce_sum",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
-            return toutdata
-
-
-if __name__ == "__main__":
-    runtime_main(TestCollectiveAllreduce, "allreduce", 0)
diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_op.py b/python/paddle/fluid/tests/unittests/collective_broadcast_op.py
deleted file mode 100644
index 18f0485f923e4f72f76be3b0b34ebeb1d89c926c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/collective_broadcast_op.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import os
-import sys
-import signal
-import time
-import socket
-from contextlib import closing
-from six import string_types
-import math
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-import paddle.fluid.unique_name as nameGen
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import paddle.fluid.layers as layers
-from functools import reduce
-from test_collective_base import TestCollectiveRunnerBase, runtime_main
-
-
-class TestCollectiveBroadcast(TestCollectiveRunnerBase):
-    def __init__(self):
-        self.global_ring_id = 0
-
-    def get_model(self, main_prog, startup_program):
-        ring_id = 0
-        rootid = 1
-        with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
-            toutdata = main_prog.current_block().create_var(
-                name="outofbroadcast",
-                dtype='float32',
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                persistable=False,
-                stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_broadcast",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'root': rootid},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
-            return toutdata
-
-
-if __name__ == "__main__":
-    runtime_main(TestCollectiveBroadcast, "broadcast", 0)
diff --git a/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py b/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py
deleted file mode 100644
index 3e286d7f43db6e9cd290b88a0be5a4ae1215737a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import os
-import sys
-import signal
-import time
-import socket
-from contextlib import closing
-from six import string_types
-import math
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-import paddle.fluid.unique_name as nameGen
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import paddle.fluid.layers as layers
-from functools import reduce
-from test_collective_base import TestCollectiveRunnerBase, runtime_main
-
-
-class TestCollectiveReduceScatter(TestCollectiveRunnerBase):
-    def __init__(self):
-        self.global_ring_id = 0
-
-    def get_model(self, main_prog, startup_program):
-        ring_id = 0
-        nranks = 2
-        with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
-            toutdata = main_prog.current_block().create_var(
-                name="outofrs",
-                dtype='float32',
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                persistable=False,
-                stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_reducescatter",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'nranks': nranks},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
-            return toutdata
-
-
-if __name__ == "__main__":
-    runtime_main(TestCollectiveReduceScatter, "reduce_scatter", 0)
diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
deleted file mode 100644
index ace4b01144b41d9ac404d086838e759cf279ac28..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import logging
-import tarfile
-import os
-
-import paddle
-import paddle.fluid.incubate.data_generator as data_generator
-
-logging.basicConfig()
-logger = logging.getLogger("paddle")
-logger.setLevel(logging.INFO)
-
-DATA_URL = "http://paddle-ctr-data.bj.bcebos.com/avazu_ctr_data.tgz"
-DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
-"""
-avazu_ctr_data/train.txt
-avazu_ctr_data/infer.txt
-avazu_ctr_data/test.txt
-avazu_ctr_data/data.meta.txt
-"""
-
-
-def download_file():
-    file_name = "avazu_ctr_data"
-    path = paddle.dataset.common.download(DATA_URL, file_name, DATA_MD5)
-
-    dir_name = os.path.dirname(path)
-    text_file_dir_name = os.path.join(dir_name, file_name)
-
-    if not os.path.exists(text_file_dir_name):
-        tar = tarfile.open(path, "r:gz")
-        tar.extractall(dir_name)
-    return text_file_dir_name
-
-
-def load_dnn_input_record(sent):
-    return list(map(int, sent.split()))
-
-
-def load_lr_input_record(sent):
-    res = []
-    for _ in [x.split(':') for x in sent.split()]:
-        res.append(int(_[0]))
-    return res
-
-
-class DatasetCtrReader(data_generator.MultiSlotDataGenerator):
-    def generate_sample(self, line):
-        def iter():
-            fs = line.strip().split('\t')
-            dnn_input = load_dnn_input_record(fs[0])
-            lr_input = load_lr_input_record(fs[1])
-            click = [int(fs[2])]
-            yield ("dnn_data", dnn_input), \
-                  ("lr_data", lr_input), \
-                  ("click", click)
-
-        return iter
-
-
-def prepare_data():
-    """
-    load data meta info from path, return (dnn_input_dim, lr_input_dim)
-    """
-    file_dir_name = download_file()
-    meta_file_path = os.path.join(file_dir_name, 'data.meta.txt')
-    train_file_path = os.path.join(file_dir_name, 'train.txt')
-    with open(meta_file_path, "r") as f:
-        lines = f.readlines()
-    err_info = "wrong meta format"
-    assert len(lines) == 2, err_info
-    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
-        1], err_info
-    res = map(int, [_.split(':')[1] for _ in lines])
-    res = list(res)
-    dnn_input_dim = res[0]
-    lr_input_dim = res[1]
-    logger.info('dnn input dim: %d' % dnn_input_dim)
-    logger.info('lr input dim: %d' % lr_input_dim)
-    return dnn_input_dim, lr_input_dim, train_file_path
-
-
-if __name__ == "__main__":
-    pairwise_reader = DatasetCtrReader()
-    pairwise_reader.run_from_stdin()
diff --git a/python/paddle/fluid/tests/unittests/decorator_helper.py b/python/paddle/fluid/tests/unittests/decorator_helper.py
deleted file mode 100644
index 1a5f4540cf033b4d3244537cc5016ee06f341464..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/decorator_helper.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-
-__all__ = ['many_times', 'prog_scope']
-
-
-def many_times(times):
-    def __impl__(fn):
-        def __fn__(*args, **kwargs):
-            for _ in range(times):
-                fn(*args, **kwargs)
-
-        return __fn__
-
-    return __impl__
-
-
-def prog_scope():
-    def __impl__(fn):
-        def __fn__(*args, **kwargs):
-            prog = fluid.Program()
-            startup_prog = fluid.Program()
-            scope = fluid.core.Scope()
-            with fluid.scope_guard(scope):
-                with fluid.program_guard(prog, startup_prog):
-                    fn(*args, **kwargs)
-
-        return __fn__
-
-    return __impl__
diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
deleted file mode 100644
index 88a3cd14c43334f2abed9c8b435b64d47a65dc85..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import os
-import signal
-from functools import reduce
-from test_dist_base import TestDistRunnerBase, runtime_main
-
-DTYPE = "float32"
-paddle.dataset.mnist.fetch()
-
-# Fix seed for test
-fluid.default_startup_program().random_seed = 1
-fluid.default_main_program().random_seed = 1
-
-
-def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=data,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.01)))
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.01)))
-
-    SIZE = 10
-    input_shape = conv_pool_2.shape
-    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
-    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
-
-    predict = fluid.layers.fc(
-        input=conv_pool_2,
-        size=SIZE,
-        act="softmax",
-        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01)))
-    return predict
-
-
-class TestDistMnist2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2, single_device=False):
-        # Input data
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
-
-        inference_program = fluid.default_main_program().clone()
-
-        # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-
-        # Optimization
-        # TODO(typhoonzero): fix distributed adam optimizer
-        # opt = fluid.optimizer.AdamOptimizer(
-        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
-        opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
-        if single_device:
-            opt.minimize(avg_cost)
-        else:
-            # multi device or distributed multi device
-            params_grads = opt.backward(avg_cost)
-            data_parallel_param_grads = []
-            for p, g in params_grads:
-                # NOTE: scale will be done on loss scale in multi_devices_graph_pass using nranks.
-                grad_reduce = fluid.layers.collective._allreduce(g)
-                data_parallel_param_grads.append([p, grad_reduce])
-            opt.apply_gradients(data_parallel_param_grads)
-
-        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
-
-
-if __name__ == "__main__":
-    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py
deleted file mode 100644
index fd09d47258fdfbf6d4a285df7d53c81f7489f39e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ /dev/null
@@ -1,119 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import os
-
-import dist_ctr_reader
-from test_dist_base import TestDistRunnerBase, runtime_main
-
-IS_SPARSE = True
-os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
-
-# Fix seed for test
-fluid.default_startup_program().random_seed = 1
-fluid.default_main_program().random_seed = 1
-
-
-class TestDistCTR2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2):
-
-        dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
-        """ network definition """
-        dnn_data = fluid.layers.data(
-            name="dnn_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        lr_data = fluid.layers.data(
-            name="lr_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        label = fluid.layers.data(
-            name="click",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=0,
-            append_batch_size=False)
-
-        # build dnn model
-        dnn_layer_dims = [128, 64, 32, 1]
-        dnn_embedding = fluid.layers.embedding(
-            is_distributed=False,
-            input=dnn_data,
-            size=[dnn_input_dim, dnn_layer_dims[0]],
-            param_attr=fluid.ParamAttr(
-                name="deep_embedding",
-                initializer=fluid.initializer.Constant(value=0.01)),
-            is_sparse=IS_SPARSE)
-        dnn_pool = fluid.layers.sequence_pool(
-            input=dnn_embedding, pool_type="sum")
-        dnn_out = dnn_pool
-        for i, dim in enumerate(dnn_layer_dims[1:]):
-            fc = fluid.layers.fc(
-                input=dnn_out,
-                size=dim,
-                act="relu",
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=0.01)),
-                name='dnn-fc-%d' % i)
-            dnn_out = fc
-
-        # build lr model
-        lr_embbding = fluid.layers.embedding(
-            is_distributed=False,
-            input=lr_data,
-            size=[lr_input_dim, 1],
-            param_attr=fluid.ParamAttr(
-                name="wide_embedding",
-                initializer=fluid.initializer.Constant(value=0.01)),
-            is_sparse=IS_SPARSE)
-        lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
-
-        merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
-
-        predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
-        acc = fluid.layers.accuracy(input=predict, label=label)
-        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
-                                                              label=label)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        inference_program = paddle.fluid.default_main_program().clone()
-
-        regularization = None
-        use_l2_decay = bool(os.getenv('USE_L2_DECAY', 0))
-        if use_l2_decay:
-            regularization = fluid.regularizer.L2DecayRegularizer(
-                regularization_coeff=1e-1)
-
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001,
-                                            regularization=regularization)
-        sgd_optimizer.minimize(avg_cost)
-
-        dataset = dist_ctr_reader.Dataset()
-        train_reader = paddle.batch(dataset.train(), batch_size=batch_size)
-        test_reader = paddle.batch(dataset.test(), batch_size=batch_size)
-
-        return inference_program, avg_cost, train_reader, test_reader, None, predict
-
-
-if __name__ == "__main__":
-    runtime_main(TestDistCTR2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
deleted file mode 100644
index c030afdd4ff9be323ccbc19ebb5e119a8c9f040b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import paddle
-import tarfile
-
-from paddle.fluid.log_helper import get_logger
-
-logger = get_logger("paddle", logging.INFO)
-
-DATA_URL = "http://paddle-ctr-data.bj.bcebos.com/avazu_ctr_data.tgz"
-DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
-"""
-avazu_ctr_data/train.txt
-avazu_ctr_data/infer.txt
-avazu_ctr_data/test.txt
-avazu_ctr_data/data.meta.txt
-"""
-
-
-def read_data(file_name):
-    path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5)
-    tar = tarfile.open(path, "r:gz")
-    tar_info = None
-    for member in tar.getmembers():
-        if member.name.endswith(file_name):
-            tar_info = member
-    f = tar.extractfile(tar_info)
-    ret_lines = [_.decode('utf-8') for _ in f.readlines()]
-    return ret_lines
-
-
-class TaskMode:
-    TRAIN_MODE = 0
-    TEST_MODE = 1
-    INFER_MODE = 2
-
-    def __init__(self, mode):
-        self.mode = mode
-
-    def is_train(self):
-        return self.mode == self.TRAIN_MODE
-
-    def is_test(self):
-        return self.mode == self.TEST_MODE
-
-    def is_infer(self):
-        return self.mode == self.INFER_MODE
-
-    @staticmethod
-    def create_train():
-        return TaskMode(TaskMode.TRAIN_MODE)
-
-    @staticmethod
-    def create_test():
-        return TaskMode(TaskMode.TEST_MODE)
-
-    @staticmethod
-    def create_infer():
-        return TaskMode(TaskMode.INFER_MODE)
-
-
-class ModelType:
-    CLASSIFICATION = 0
-    REGRESSION = 1
-
-    def __init__(self, mode):
-        self.mode = mode
-
-    def is_classification(self):
-        return self.mode == self.CLASSIFICATION
-
-    def is_regression(self):
-        return self.mode == self.REGRESSION
-
-    @staticmethod
-    def create_classification():
-        return ModelType(ModelType.CLASSIFICATION)
-
-    @staticmethod
-    def create_regression():
-        return ModelType(ModelType.REGRESSION)
-
-
-def load_dnn_input_record(sent):
-    return list(map(int, sent.split()))
-
-
-def load_lr_input_record(sent):
-    res = []
-    for _ in [x.split(':') for x in sent.split()]:
-        res.append(int(_[0]))
-    return res
-
-
-feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2}
-
-
-class Dataset(object):
-    def train(self):
-        '''
-        Load trainset.
-        '''
-        file_name = "train.txt"
-        logger.info("load trainset from %s" % file_name)
-        mode = TaskMode.create_train()
-        return self._parse_creator(file_name, mode)
-
-    def test(self):
-        '''
-        Load testset.
-        '''
-        file_name = "test.txt"
-        logger.info("load testset from %s" % file_name)
-        mode = TaskMode.create_test()
-        return self._parse_creator(file_name, mode)
-
-    def infer(self):
-        '''
-        Load infer set.
-        '''
-        file_name = "infer.txt"
-        logger.info("load inferset from %s" % file_name)
-        mode = TaskMode.create_infer()
-        return self._parse_creator(file_name, mode)
-
-    def _parse_creator(self, file_name, mode):
-        '''
-        Parse dataset.
-        '''
-
-        def _parse():
-            data = read_data(file_name)
-            for line_id, line in enumerate(data):
-                fs = line.strip().split('\t')
-                dnn_input = load_dnn_input_record(fs[0])
-                lr_input = load_lr_input_record(fs[1])
-                if not mode.is_infer():
-                    click = int(fs[2])
-                    yield [dnn_input, lr_input, click]
-                else:
-                    yield [dnn_input, lr_input]
-
-        return _parse
-
-
-def load_data_meta():
-    '''
-    load data meta info from path, return (dnn_input_dim, lr_input_dim)
-    '''
-    lines = read_data('data.meta.txt')
-    err_info = "wrong meta format"
-    assert len(lines) == 2, err_info
-    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
-        1], err_info
-    res = map(int, [_.split(':')[1] for _ in lines])
-    res = list(res)
-    logger.info('dnn input dim: %d' % res[0])
-    logger.info('lr input dim: %d' % res[1])
-    return res
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
deleted file mode 100644
index a477e38d6edbddd89fc22ee50c8abe7e737c591f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import shutil
-import tempfile
-import time
-
-import paddle.fluid as fluid
-import os
-
-import ctr_dataset_reader
-from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
-
-# Fix seed for test
-fluid.default_startup_program().random_seed = 1
-fluid.default_main_program().random_seed = 1
-
-
-class TestDistCTR2x2(FleetDistRunnerBase):
-    def net(self, batch_size=4, lr=0.01):
-        dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data(
-        )
-        """ network definition """
-        dnn_data = fluid.layers.data(
-            name="dnn_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        lr_data = fluid.layers.data(
-            name="lr_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        label = fluid.layers.data(
-            name="click",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=0,
-            append_batch_size=False)
-
-        datas = [dnn_data, lr_data, label]
-
-        # build dnn model
-        dnn_layer_dims = [128, 64, 32, 1]
-        dnn_embedding = fluid.layers.embedding(
-            is_distributed=False,
-            input=dnn_data,
-            size=[dnn_input_dim, dnn_layer_dims[0]],
-            param_attr=fluid.ParamAttr(
-                name="deep_embedding",
-                initializer=fluid.initializer.Constant(value=0.01)),
-            is_sparse=True)
-        dnn_pool = fluid.layers.sequence_pool(
-            input=dnn_embedding, pool_type="sum")
-        dnn_out = dnn_pool
-        for i, dim in enumerate(dnn_layer_dims[1:]):
-            fc = fluid.layers.fc(
-                input=dnn_out,
-                size=dim,
-                act="relu",
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=0.01)),
-                name='dnn-fc-%d' % i)
-            dnn_out = fc
-
-        # build lr model
-        lr_embbding = fluid.layers.embedding(
-            is_distributed=False,
-            input=lr_data,
-            size=[lr_input_dim, 1],
-            param_attr=fluid.ParamAttr(
-                name="wide_embedding",
-                initializer=fluid.initializer.Constant(value=0.01)),
-            is_sparse=True)
-        lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
-
-        merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
-
-        predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
-        acc = fluid.layers.accuracy(input=predict, label=label)
-        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
-                                                              label=label)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        self.feeds = datas
-        self.train_file_path = train_file_path
-        self.avg_cost = avg_cost
-        self.predict = predict
-
-        return avg_cost
-
-    def check_model_right(self, dirname):
-        model_filename = os.path.join(dirname, "__model__")
-
-        with open(model_filename, "rb") as f:
-            program_desc_str = f.read()
-
-        program = fluid.Program.parse_from_string(program_desc_str)
-        with open(os.path.join(dirname, "__model__.proto"), "w") as wn:
-            wn.write(str(program))
-
-    def do_training(self, fleet):
-        dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data(
-        )
-
-        exe = fluid.Executor(fluid.CPUPlace())
-
-        fleet.init_worker()
-        exe.run(fleet.startup_program)
-
-        thread_num = 2
-        filelist = []
-        for _ in range(thread_num):
-            filelist.append(train_file_path)
-
-        # config dataset
-        dataset = fluid.DatasetFactory().create_dataset()
-        dataset.set_batch_size(128)
-        dataset.set_use_var(self.feeds)
-        pipe_command = 'python ctr_dataset_reader.py'
-        dataset.set_pipe_command(pipe_command)
-
-        dataset.set_filelist(filelist)
-        dataset.set_thread(thread_num)
-
-        for epoch_id in range(2):
-            pass_start = time.time()
-            dataset.set_filelist(filelist)
-            exe.train_from_dataset(
-                program=fleet.main_program,
-                dataset=dataset,
-                fetch_list=[self.avg_cost],
-                fetch_info=["cost"],
-                print_period=100,
-                debug=False)
-            pass_time = time.time() - pass_start
-
-        model_dir = tempfile.mkdtemp()
-        fleet.save_inference_model(
-            exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost)
-        self.check_model_right(model_dir)
-        shutil.rmtree(model_dir)
-        fleet.stop_worker()
-
-
-if __name__ == "__main__":
-    runtime_main(TestDistCTR2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
deleted file mode 100644
index 25616155b10dd89238fe9140cae04f65c9d4fe58..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ /dev/null
@@ -1,120 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import os
-import signal
-from functools import reduce
-from test_dist_base import TestDistRunnerBase, runtime_main
-from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
-
-DTYPE = "float32"
-paddle.dataset.mnist.fetch()
-
-# Fix seed for test
-fluid.default_startup_program().random_seed = 1
-fluid.default_main_program().random_seed = 1
-
-
-def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=data,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.01)))
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.01)))
-
-    SIZE = 10
-    input_shape = conv_pool_2.shape
-    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
-    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
-
-    predict = fluid.layers.fc(
-        input=conv_pool_2,
-        size=SIZE,
-        act="softmax",
-        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01)))
-    return predict
-
-
-class TestDistMnist2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
-        # Input data
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
-
-        inference_program = fluid.default_main_program().clone()
-        # Optimization
-        # TODO(typhoonzero): fix distributed adam optimizer
-        # opt = fluid.optimizer.AdamOptimizer(
-        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
-        if not use_dgc:
-            opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
-        else:
-            opt = fluid.optimizer.DGCMomentumOptimizer(
-                learning_rate=self.lr, momentum=0.9, rampup_begin_step=0)
-
-        # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-
-        if dist_strategy:
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=dist_strategy)
-            _, param_grads = dist_opt.minimize(avg_cost)
-        else:
-            opt.minimize(avg_cost)
-
-        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
-
-
-if __name__ == "__main__":
-    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
deleted file mode 100644
index d386e75fd887a898f5a13e48e378e08ff6c99ea0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import os
-import signal
-from functools import reduce
-from test_dist_base import TestDistRunnerBase, runtime_main
-from dist_mnist import cnn_model
-
-DTYPE = "float32"
-
-
-def test_merge_reader(repeat_batch_size=8):
-    orig_reader = paddle.dataset.mnist.test()
-    record_batch = []
-    b = 0
-    for d in orig_reader():
-        if b >= repeat_batch_size:
-            break
-        record_batch.append(d)
-        b += 1
-    while True:
-        for d in record_batch:
-            yield d
-
-
-class TestDistMnist2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2):
-        # Input data
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
-
-        inference_program = fluid.default_main_program().clone()
-        # Optimization
-        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
-
-        # Reader
-        train_reader = paddle.batch(test_merge_reader, batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        opt.minimize(avg_cost)
-        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
-
-
-if __name__ == "__main__":
-    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
deleted file mode 100644
index 977e17c37f7676ae81d9ab29b6b36089ccbeeacf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import os
-import signal
-from functools import reduce
-from test_dist_base import TestDistRunnerBase, runtime_main
-from dist_mnist import cnn_model
-
-DTYPE = "float32"
-paddle.dataset.mnist.fetch()
-
-# Fix seed for test
-fluid.default_startup_program().random_seed = 1
-fluid.default_main_program().random_seed = 1
-
-
-class TestDistMnist2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2):
-        # Input data
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
-
-        inference_program = fluid.default_main_program().clone()
-        # Optimization
-        opt = fluid.optimizer.LarsMomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
-
-        # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        opt.minimize(avg_cost)
-        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
-
-
-if __name__ == "__main__":
-    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py
deleted file mode 100644
index f3a6b19d819644aef24cc65dbc4bdea6bfd3b692..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_save_load.py
+++ /dev/null
@@ -1,204 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import sys
-import signal
-import subprocess
-import argparse
-import time
-import math
-import random
-from multiprocessing import Process
-from functools import reduce
-
-import numpy as np
-import pickle
-import unittest
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid import io
-
-from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
-from dist_simnet_bow import TestDistSimnetBow2x2, DATA_URL, DATA_MD5
-
-
-class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
-    def _load_persistable_vars(self, executor, dirname, program):
-        def _is_checkpoint_var(var):
-            """
-            the checkpoint will not save or load all the variables.
-            var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
-
-            : param var(Variable)
-            """
-            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                    var.desc.type() == core.VarDesc.VarType.RAW:
-                return False
-            # @GRAD are named for gradient variables, checkpoint will not save it.
-            if "@GRAD" in var.name:
-                return False
-            # .trainer_ are named for distribute train variables, checkpoint will not save it.
-            if ".trainer_" in var.name:
-                return False
-
-            # .block is named for distribute train variables, checkpoint will not save it.
-            if ".block" in var.name:
-                return False
-
-            if "tmp_" in var.name:
-                return False
-
-            return var.persistable
-
-        io.load_vars(
-            executor,
-            dirname=dirname,
-            main_program=program,
-            predicate=_is_checkpoint_var,
-            filename=None)
-
-    def run_pserver(self, args):
-        self.get_model(batch_size=2)
-        # NOTE: pserver should not call memory optimize
-        t = self.get_transpiler(args.trainer_id,
-                                fluid.default_main_program(), args.endpoints,
-                                args.trainers, args.sync_mode, False,
-                                args.current_endpoint)
-        pserver_prog = t.get_pserver_program(args.current_endpoint)
-        startup_prog = t.get_startup_program(args.current_endpoint,
-                                             pserver_prog)
-
-        need_load = bool(int(os.getenv("LOAD", "0")))
-        model_dir = os.getenv("MODEL_DIR", "")
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-
-        if need_load and model_dir:
-            fluid.io.load_persistables(exe, model_dir, pserver_prog)
-
-        exe.run(pserver_prog)
-
-    def run_trainer(self, args):
-        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-            self.get_model(batch_size=2)
-
-        if args.update_method == "pserver":
-            t = self.get_transpiler(args.trainer_id,
-                                    fluid.default_main_program(),
-                                    args.endpoints, args.trainers,
-                                    args.sync_mode)
-
-            trainer_prog = t.get_trainer_program()
-        else:
-            trainer_prog = fluid.default_main_program()
-
-        if args.use_cuda:
-            place = fluid.CUDAPlace(0)
-        else:
-            place = fluid.CPUPlace()
-
-        startup_exe = fluid.Executor(place)
-        startup_exe.run(fluid.default_startup_program())
-
-        strategy = fluid.ExecutionStrategy()
-        strategy.num_threads = 1
-
-        build_stra = fluid.BuildStrategy()
-
-        if args.use_reduce:
-            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        else:
-            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-
-        exe = fluid.ParallelExecutor(
-            args.use_cuda,
-            loss_name=avg_cost.name,
-            exec_strategy=strategy,
-            build_strategy=build_stra)
-
-        feed_var_list = [
-            var for var in trainer_prog.global_block().vars.values()
-            if var.is_data
-        ]
-
-        feeder = fluid.DataFeeder(feed_var_list, place)
-        reader_generator = train_reader()
-
-        def get_data():
-            origin_batch = next(reader_generator)
-            if args.update_method == "pserver" and args.use_reader_alloc:
-                new_batch = []
-                for offset, item in enumerate(origin_batch):
-                    if offset % 2 == args.trainer_id:
-                        new_batch.append(item)
-                return new_batch
-            else:
-                return origin_batch
-
-        need_save = bool(int(os.getenv("SAVE", "0")))
-        model_dir = os.getenv("MODEL_DIR", "")
-        save_mode = os.getenv("SAVE_MODE", "")
-
-        if save_mode == "LOCAL":
-            if need_save:
-                for _ in six.moves.xrange(RUN_STEP):
-                    loss, = exe.run(fetch_list=[avg_cost.name],
-                                    feed=feeder.feed(get_data()))
-                if need_save and model_dir:
-                    io.save_persistables(startup_exe, model_dir, trainer_prog)
-
-            var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor(
-            ))
-            if six.PY2:
-                print(pickle.dumps(np.ravel(var).tolist()))
-            else:
-                sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist()))
-
-        elif save_mode == "DIST":
-            skip_steps = int(os.getenv("SKIP_STEPS"))
-            loss = None
-            if need_save:
-                for idx in six.moves.xrange(8):
-                    loss, = exe.run(fetch_list=[avg_cost.name],
-                                    feed=feeder.feed(get_data()))
-                    if need_save and model_dir and idx == skip_steps and args.trainer_id == 0:
-                        io.save_persistables(startup_exe, model_dir,
-                                             trainer_prog)
-            else:
-                for idx in six.moves.xrange(8):
-                    data = get_data()
-                    if idx <= skip_steps:
-                        continue
-                    loss, = exe.run(fetch_list=[avg_cost.name],
-                                    feed=feeder.feed(data))
-            if six.PY2:
-                print(pickle.dumps(loss.tolist()))
-            else:
-                sys.stdout.buffer.write(pickle.dumps(loss.tolist()))
-        else:
-            raise Exception("save_mode must be LOCAL or DIST")
-
-
-if __name__ == "__main__":
-    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
-    runtime_main(TestDistSaveLoad2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
deleted file mode 100644
index a2fd61e2387ee362946c15788d76cba4dec46055..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ /dev/null
@@ -1,265 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import os
-import sys
-import signal
-from test_dist_base import TestDistRunnerBase, runtime_main
-
-# Fix seed for test
-fluid.default_startup_program().random_seed = 1
-fluid.default_main_program().random_seed = 1
-
-train_parameters = {
-    "input_size": [3, 224, 224],
-    "input_mean": [0.485, 0.456, 0.406],
-    "input_std": [0.229, 0.224, 0.225],
-    "learning_strategy": {
-        "name": "piecewise_decay",
-        "epochs": [30, 60, 90],
-        "steps": [0.1, 0.01, 0.001, 0.0001]
-    }
-}
-
-
-class SE_ResNeXt():
-    def __init__(self, layers=50):
-        self.params = train_parameters
-        self.layers = layers
-
-    def net(self, input, class_dim=1000):
-        layers = self.layers
-        supported_layers = [50, 101, 152]
-        assert layers in supported_layers, \
-            "supported layers are {} but input layer is {}".format(supported_layers, layers)
-        if layers == 50:
-            cardinality = 32
-            reduction_ratio = 16
-            depth = [3, 4, 6, 3]
-            num_filters = [128, 256, 512, 1024]
-
-            conv = self.conv_bn_layer(
-                input=input,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            conv = fluid.layers.pool2d(
-                input=conv,
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
-        elif layers == 101:
-            cardinality = 32
-            reduction_ratio = 16
-            depth = [3, 4, 23, 3]
-            num_filters = [128, 256, 512, 1024]
-
-            conv = self.conv_bn_layer(
-                input=input,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            conv = fluid.layers.pool2d(
-                input=conv,
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
-        elif layers == 152:
-            cardinality = 64
-            reduction_ratio = 16
-            depth = [3, 8, 36, 3]
-            num_filters = [128, 256, 512, 1024]
-
-            conv = self.conv_bn_layer(
-                input=input,
-                num_filters=64,
-                filter_size=3,
-                stride=2,
-                act='relu')
-            conv = self.conv_bn_layer(
-                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
-            conv = self.conv_bn_layer(
-                input=conv,
-                num_filters=128,
-                filter_size=3,
-                stride=1,
-                act='relu')
-            conv = fluid.layers.pool2d(
-                input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
-                pool_type='max')
-
-        for block in range(len(depth)):
-            for i in range(depth[block]):
-                conv = self.bottleneck_block(
-                    input=conv,
-                    num_filters=num_filters[block],
-                    stride=2 if i == 0 and block != 0 else 1,
-                    cardinality=cardinality,
-                    reduction_ratio=reduction_ratio)
-
-        pool = fluid.layers.pool2d(
-            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
-        drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
-        stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
-        out = fluid.layers.fc(
-            input=drop,
-            size=class_dim,
-            act='softmax',
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)))
-        return out
-
-    def shortcut(self, input, ch_out, stride):
-        ch_in = input.shape[1]
-        if ch_in != ch_out or stride != 1:
-            filter_size = 1
-            return self.conv_bn_layer(input, ch_out, filter_size, stride)
-        else:
-            return input
-
-    def bottleneck_block(self, input, num_filters, stride, cardinality,
-                         reduction_ratio):
-        conv0 = self.conv_bn_layer(
-            input=input, num_filters=num_filters, filter_size=1, act='relu')
-        conv1 = self.conv_bn_layer(
-            input=conv0,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            groups=cardinality,
-            act='relu')
-        conv2 = self.conv_bn_layer(
-            input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
-        scale = self.squeeze_excitation(
-            input=conv2,
-            num_channels=num_filters * 2,
-            reduction_ratio=reduction_ratio)
-
-        short = self.shortcut(input, num_filters * 2, stride)
-
-        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
-
-    def conv_bn_layer(self,
-                      input,
-                      num_filters,
-                      filter_size,
-                      stride=1,
-                      groups=1,
-                      act=None):
-        conv = fluid.layers.conv2d(
-            input=input,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            # avoid pserver CPU init differs from GPU
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
-            bias_attr=False)
-        return fluid.layers.batch_norm(input=conv, act=act)
-
-    def squeeze_excitation(self, input, num_channels, reduction_ratio):
-        pool = fluid.layers.pool2d(
-            input=input, pool_size=0, pool_type='avg', global_pooling=True)
-        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
-        squeeze = fluid.layers.fc(
-            input=pool,
-            size=num_channels // reduction_ratio,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
-            act='relu')
-        stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
-        excitation = fluid.layers.fc(
-            input=squeeze,
-            size=num_channels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
-            act='sigmoid')
-        scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
-        return scale
-
-
-class DistSeResneXt2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2, use_dgc=False):
-        # Input data
-        image = fluid.layers.data(
-            name="data", shape=[3, 224, 224], dtype='float32')
-        label = fluid.layers.data(name="int64", shape=[1], dtype='int64')
-
-        # Train program
-        model = SE_ResNeXt(layers=50)
-        out = model.net(input=image, class_dim=102)
-        cost = fluid.layers.cross_entropy(input=out, label=label)
-
-        avg_cost = fluid.layers.mean(x=cost)
-        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-
-        # Evaluator
-        test_program = fluid.default_main_program().clone(for_test=True)
-
-        # Optimization
-        total_images = 6149  # flowers
-        epochs = [30, 60, 90]
-        step = int(total_images / batch_size + 1)
-
-        bd = [step * e for e in epochs]
-        base_lr = 0.1
-        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-
-        if not use_dgc:
-            optimizer = fluid.optimizer.Momentum(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr),
-                momentum=0.9,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-        else:
-            optimizer = fluid.optimizer.DGCMomentumOptimizer(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr),
-                momentum=0.9,
-                rampup_begin_step=0,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-        optimizer.minimize(avg_cost)
-
-        # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
-
-        return test_program, avg_cost, train_reader, test_reader, acc_top1, out
-
-
-if __name__ == "__main__":
-    runtime_main(DistSeResneXt2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
deleted file mode 100644
index 09afae6114e2b6cc8bce9b2be3b221ba9825db8c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ /dev/null
@@ -1,255 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-import random
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import os
-import signal
-from functools import reduce
-from test_dist_base import TestDistRunnerBase, runtime_main
-
-DTYPE = "int64"
-DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000'
-DATA_MD5 = '24e49366eb0611c552667989de2f57d5'
-
-# For Net
-base_lr = 0.2
-emb_lr = base_lr * 3
-dict_dim = 1500
-emb_dim = 128
-hid_dim = 128
-margin = 0.1
-sample_rate = 1
-
-# Fix seed for test
-fluid.default_startup_program().random_seed = 1
-fluid.default_main_program().random_seed = 1
-
-
-def get_acc(cos_q_nt, cos_q_pt, batch_size):
-    cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
-    cond = fluid.layers.cast(cond, dtype='float64')
-    cond_3 = fluid.layers.reduce_sum(cond)
-    acc = fluid.layers.elementwise_div(
-        cond_3,
-        fluid.layers.fill_constant(
-            shape=[1], value=batch_size * 1.0, dtype='float64'),
-        name="simnet_acc")
-    return acc
-
-
-def get_loss(cos_q_pt, cos_q_nt):
-    loss_op1 = fluid.layers.elementwise_sub(
-        fluid.layers.fill_constant_batch_size_like(
-            input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'),
-        cos_q_pt)
-    loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-    loss_op3 = fluid.layers.elementwise_max(
-        fluid.layers.fill_constant_batch_size_like(
-            input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
-        loss_op2)
-    avg_cost = fluid.layers.mean(loss_op3)
-    return avg_cost
-
-
-def get_optimizer(op="sgd"):
-    if op.upper() == "sgd".upper():
-        optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
-    elif op.upper() == "adam".upper():
-        optimizer = fluid.optimizer.Adam(learning_rate=base_lr)
-    else:
-        optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
-    return optimizer
-
-
-def train_network(batch_size,
-                  is_distributed=False,
-                  is_sparse=False,
-                  is_self_contained_lr=False):
-    # query
-    q = fluid.layers.data(
-        name="query_ids", shape=[1], dtype="int64", lod_level=1)
-    ## embedding
-    q_emb = fluid.layers.embedding(
-        input=q,
-        is_distributed=is_distributed,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01),
-            name="__emb__",
-            learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.01),
-                name="__emb__"),
-        is_sparse=is_sparse)
-    ## vsum
-    q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
-    q_ss = fluid.layers.softsign(q_sum)
-    ## fc layer after conv
-    q_fc = fluid.layers.fc(
-        input=q_ss,
-        size=hid_dim,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01),
-            name="__q_fc__",
-            learning_rate=base_lr))
-    # label data
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    # pt
-    pt = fluid.layers.data(
-        name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
-    ## embedding
-    pt_emb = fluid.layers.embedding(
-        input=pt,
-        is_distributed=is_distributed,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01),
-            name="__emb__",
-            learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.01),
-                name="__emb__"),
-        is_sparse=is_sparse)
-    ## vsum
-    pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
-    pt_ss = fluid.layers.softsign(pt_sum)
-    ## fc layer
-    pt_fc = fluid.layers.fc(
-        input=pt_ss,
-        size=hid_dim,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01),
-            name="__fc__",
-            learning_rate=base_lr),
-        bias_attr=fluid.ParamAttr(name="__fc_b__"))
-    # nt
-    nt = fluid.layers.data(
-        name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
-    ## embedding
-    nt_emb = fluid.layers.embedding(
-        input=nt,
-        is_distributed=is_distributed,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01),
-            name="__emb__",
-            learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.01),
-                name="__emb__"),
-        is_sparse=is_sparse)
-    ## vsum
-    nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
-    nt_ss = fluid.layers.softsign(nt_sum)
-    ## fc layer
-    nt_fc = fluid.layers.fc(
-        input=nt_ss,
-        size=hid_dim,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01),
-            name="__fc__",
-            learning_rate=base_lr),
-        bias_attr=fluid.ParamAttr(name="__fc_b__"))
-    cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
-    cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
-    # loss
-    avg_cost = get_loss(cos_q_pt, cos_q_nt)
-    # acc
-    acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
-    return [avg_cost, acc, cos_q_pt]
-
-
-def combination(x, y):
-    res = [[[xi, yi] for yi in y] for xi in x]
-    return res[0]
-
-
-def get_one_data(file_list):
-    for file in file_list:
-        contents = []
-        with open(file, "r") as fin:
-            for i in fin:
-                contents.append(i.strip())
-            for index, q in enumerate(contents):
-                try:
-                    one_data = [[int(j) for j in i.split(" ")]
-                                for i in q.split(";")[:-1]]
-                    if one_data[1][0] + one_data[1][1] != len(one_data) - 3:
-                        q = fin.readline()
-                        continue
-                    tmp = combination(one_data[3:3 + one_data[1][0]],
-                                      one_data[3 + one_data[1][0]:])
-                except Exception as e:
-                    continue
-
-                for each in tmp:
-                    yield [one_data[2], 0, each[0], each[1]]
-
-
-def get_batch_reader(file_list, batch_size):
-    def batch_reader():
-        res = []
-        for i in get_one_data(file_list):
-            if random.random() <= sample_rate:
-                res.append(i)
-            if len(res) >= batch_size:
-                yield res
-                res = []
-
-    return batch_reader
-
-
-def get_train_reader(batch_size):
-    # The training data set.
-    train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet",
-                              "train")
-    train_reader = get_batch_reader([train_file], batch_size)
-    train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"]
-    return train_reader, train_feed
-
-
-class TestDistSimnetBow2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2):
-        # Train program
-        avg_cost, acc, predict = \
-            train_network(batch_size,
-                          bool(int(os.environ["IS_DISTRIBUTED"])),
-                          bool(int(os.environ["IS_SPARSE"])),
-                          bool(int(os.environ["IS_SELF_CONTAINED_LR"])))
-
-        inference_program = fluid.default_main_program().clone()
-
-        # Optimization
-        opt = os.getenv('OPTIMIZER', 'sgd')
-        opt = get_optimizer(opt)
-        opt.minimize(avg_cost)
-
-        # Reader
-        train_reader, _ = get_train_reader(batch_size)
-        return inference_program, avg_cost, train_reader, train_reader, acc, predict
-
-
-if __name__ == "__main__":
-    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
-    runtime_main(TestDistSimnetBow2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_test_utils.py b/python/paddle/fluid/tests/unittests/dist_test_utils.py
deleted file mode 100644
index 7725a07aa5a99e5c98d2a73f05cbcdb2b1555a57..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_test_utils.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, errno
-
-
-def silentremove(filename):
-    try:
-        os.remove(filename)
-    except OSError as e:  # this would be "except OSError, e:" before Python 2.6
-        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
-            raise  # re-raise exception if a different error occurred
-
-
-def remove_ps_flag(pid):
-    silentremove("/tmp/paddle.%d.port" % pid)
diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py
deleted file mode 100644
index 095a474fd3ac056c678f9051ed80ef363ae968c9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_text_classification.py
+++ /dev/null
@@ -1,231 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import os
-import signal
-import six
-import tarfile
-import string
-import re
-from functools import reduce
-from test_dist_base import TestDistRunnerBase, runtime_main
-
-DTYPE = "float32"
-VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab'
-VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2'
-DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz'
-DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a'
-
-
-# Load dictionary.
-def load_vocab(filename):
-    vocab = {}
-    if six.PY2:
-        with open(filename, 'r') as f:
-            for idx, line in enumerate(f):
-                vocab[line.strip()] = idx
-    else:
-        with open(filename, 'r', encoding="utf-8") as f:
-            for idx, line in enumerate(f):
-                vocab[line.strip()] = idx
-    return vocab
-
-
-def get_worddict(dict_path):
-    word_dict = load_vocab(dict_path)
-    word_dict["<unk>"] = len(word_dict)
-    dict_dim = len(word_dict)
-    return word_dict, dict_dim
-
-
-def conv_net(input,
-             dict_dim,
-             emb_dim=128,
-             window_size=3,
-             num_filters=128,
-             fc0_dim=96,
-             class_dim=2):
-    emb = fluid.layers.embedding(
-        input=input,
-        size=[dict_dim, emb_dim],
-        is_sparse=False,
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.01)))
-
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=num_filters,
-        filter_size=window_size,
-        act="tanh",
-        pool_type="max",
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01)))
-
-    fc_0 = fluid.layers.fc(
-        input=[conv_3],
-        size=fc0_dim,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01)))
-
-    prediction = fluid.layers.fc(
-        input=[fc_0],
-        size=class_dim,
-        act="softmax",
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01)))
-
-    return prediction
-
-
-def inference_network(dict_dim):
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    out = conv_net(data, dict_dim)
-    return out
-
-
-def get_reader(word_dict, batch_size):
-    # The training data set.
-    train_reader = paddle.batch(train(word_dict), batch_size=batch_size)
-
-    # The testing data set.
-    test_reader = paddle.batch(test(word_dict), batch_size=batch_size)
-
-    return train_reader, test_reader
-
-
-def get_optimizer(learning_rate):
-    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
-    return optimizer
-
-
-class TestDistTextClassification2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2):
-        vocab = os.path.join(paddle.dataset.common.DATA_HOME,
-                             "text_classification", "imdb.vocab")
-        word_dict, dict_dim = get_worddict(vocab)
-
-        # Input data
-        data = fluid.layers.data(
-            name="words", shape=[1], dtype="int64", lod_level=1)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-        # Train program
-        predict = conv_net(data, dict_dim)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        acc = fluid.layers.accuracy(input=predict, label=label)
-        inference_program = fluid.default_main_program().clone()
-
-        # Optimization
-        opt = get_optimizer(learning_rate=0.001)
-        opt.minimize(avg_cost)
-
-        # Reader
-        train_reader, test_reader = get_reader(word_dict, batch_size)
-
-        return inference_program, avg_cost, train_reader, test_reader, acc, predict
-
-
-def tokenize(pattern):
-    """
-    Read files that match the given pattern.  Tokenize and yield each file.
-    """
-
-    with tarfile.open(
-            paddle.dataset.common.download(DATA_URL, 'text_classification',
-                                           DATA_MD5)) as tarf:
-        # Note that we should use tarfile.next(), which does
-        # sequential access of member files, other than
-        # tarfile.extractfile, which does random access and might
-        # destroy hard disks.
-        tf = tarf.next()
-        while tf != None:
-            if bool(pattern.match(tf.name)):
-                # newline and punctuations removal and ad-hoc tokenization.
-                yield tarf.extractfile(tf).read().rstrip(six.b(
-                    "\n\r")).translate(
-                        None, six.b(string.punctuation)).lower().split()
-            tf = tarf.next()
-
-
-def reader_creator(pos_pattern, neg_pattern, word_idx):
-    UNK = word_idx['<unk>']
-    INS = []
-
-    def load(pattern, out, label):
-        for doc in tokenize(pattern):
-            out.append(([word_idx.get(w, UNK) for w in doc], label))
-
-    load(pos_pattern, INS, 0)
-    load(neg_pattern, INS, 1)
-
-    def reader():
-        for doc, label in INS:
-            yield doc, label
-
-    return reader
-
-
-def train(word_idx):
-    """
-    IMDB training set creator.
-
-    It returns a reader creator, each sample in the reader is an zero-based ID
-    sequence and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        re.compile("train/pos/.*\.txt$"),
-        re.compile("train/neg/.*\.txt$"), word_idx)
-
-
-def test(word_idx):
-    """
-    IMDB test set creator.
-
-    It returns a reader creator, each sample in the reader is an zero-based ID
-    sequence and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        re.compile("test/pos/.*\.txt$"),
-        re.compile("test/neg/.*\.txt$"), word_idx)
-
-
-if __name__ == "__main__":
-    paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5)
-    paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5)
-    runtime_main(TestDistTextClassification2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
deleted file mode 100644
index b8d83323600a7d9ca437ceeafd95fef74bf4f056..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ /dev/null
@@ -1,1729 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-import os
-import sys
-import six
-import argparse
-import ast
-import multiprocessing
-import time
-from functools import partial
-from os.path import expanduser
-import glob
-import random
-import tarfile
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid import core
-from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
-import paddle.compat as cpt
-from paddle.compat import long_type
-
-import hashlib
-
-from paddle.fluid.transpiler.details import program_to_code
-
-const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001))
-const_bias_attr = const_para_attr
-
-# Fix seed for test
-fluid.default_startup_program().random_seed = 1
-fluid.default_main_program().random_seed = 1
-
-
-#from transformer_config import ModelHyperParams, TrainTaskConfig, merge_cfg_from_list
-class TrainTaskConfig(object):
-    # only support GPU currently
-    use_gpu = True
-    # the epoch number to train.
-    pass_num = 1
-    # the number of sequences contained in a mini-batch.
-    # deprecated, set batch_size in args.
-    batch_size = 20
-    # the hyper parameters for Adam optimizer.
-    # This static learning_rate will be multiplied to the LearningRateScheduler
-    # derived learning rate the to get the final learning rate.
-    learning_rate = 1
-    beta1 = 0.9
-    beta2 = 0.98
-    eps = 1e-9
-    # the parameters for learning rate scheduling.
-    warmup_steps = 4000
-    # the weight used to mix up the ground-truth distribution and the fixed
-    # uniform distribution in label smoothing when training.
-    # Set this as zero if label smoothing is not wanted.
-    label_smooth_eps = 0.1
-    # the directory for saving trained models.
-    model_dir = "trained_models"
-    # the directory for saving checkpoints.
-    ckpt_dir = "trained_ckpts"
-    # the directory for loading checkpoint.
-    # If provided, continue training from the checkpoint.
-    ckpt_path = None
-    # the parameter to initialize the learning rate scheduler.
-    # It should be provided if use checkpoints, since the checkpoint doesn't
-    # include the training step counter currently.
-    start_step = 0
-
-    check_acc = True
-
-    data_path = expanduser("~") + (
-        "/.cache/paddle/dataset/test_dist_transformer/")
-    src_vocab_fpath = data_path + "vocab.bpe.32000"
-    trg_vocab_fpath = data_path + "vocab.bpe.32000"
-    train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de"
-    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de.cut"
-    pool_size = 2000
-    sort_type = None
-    local = True
-    shuffle = False
-    shuffle_batch = False
-    special_token = ['<s>', '<e>', '<unk>']
-    token_delimiter = ' '
-    use_token_batch = False
-
-
-class InferTaskConfig(object):
-    use_gpu = True
-    # the number of examples in one run for sequence generation.
-    batch_size = 10
-    # the parameters for beam search.
-    beam_size = 5
-    max_out_len = 256
-    # the number of decoded sentences to output.
-    n_best = 1
-    # the flags indicating whether to output the special tokens.
-    output_bos = False
-    output_eos = False
-    output_unk = True
-    # the directory for loading the trained model.
-    model_path = "trained_models/pass_1.infer.model"
-
-
-class ModelHyperParams(object):
-    # These following five vocabularies related configurations will be set
-    # automatically according to the passed vocabulary path and special tokens.
-    # size of source word dictionary.
-    src_vocab_size = 10000
-    # size of target word dictionay
-    trg_vocab_size = 10000
-    # index for <bos> token
-    bos_idx = 0
-    # index for <eos> token
-    eos_idx = 1
-    # index for <unk> token
-    unk_idx = 2
-    # max length of sequences deciding the size of position encoding table.
-    # Start from 1 and count start and end tokens in.
-    max_length = 256
-    # the dimension for word embeddings, which is also the last dimension of
-    # the input and output of multi-head attention, position-wise feed-forward
-    # networks, encoder and decoder.
-    d_model = 512
-    # size of the hidden layer in position-wise feed-forward networks.
-    d_inner_hid = 2048
-    # the dimension that keys are projected to for dot-product attention.
-    d_key = 64
-    # the dimension that values are projected to for dot-product attention.
-    d_value = 64
-    # number of head used in multi-head attention.
-    n_head = 8
-    # number of sub-layers to be stacked in the encoder and decoder.
-    n_layer = 6
-    # dropout rate used by all dropout layers.
-    dropout = 0.0  # no random
-    # random seed used in dropout for CE.
-    dropout_seed = None
-    # the flag indicating whether to share embedding and softmax weights.
-    # vocabularies in source and target should be same for weight sharing.
-    weight_sharing = True
-
-
-def merge_cfg_from_list(cfg_list, g_cfgs):
-    """
-    Set the above global configurations using the cfg_list.
-    """
-    assert len(cfg_list) % 2 == 0
-    for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
-        for g_cfg in g_cfgs:
-            if hasattr(g_cfg, key):
-                try:
-                    value = eval(value)
-                except Exception:  # for file path
-                    pass
-                setattr(g_cfg, key, value)
-                break
-
-
-# The placeholder for batch_size in compile time. Must be -1 currently to be
-# consistent with some ops' infer-shape output in compile time, such as the
-# sequence_expand op used in beamsearch decoder.
-batch_size = -1
-# The placeholder for squence length in compile time.
-seq_len = ModelHyperParams.max_length
-# Here list the data shapes and data types of all inputs.
-# The shapes here act as placeholder and are set to pass the infer-shape in
-# compile time.
-input_descs = {
-    # The actual data shape of src_word is:
-    # [batch_size * max_src_len_in_batch, 1]
-    "src_word": [(batch_size, seq_len, long_type(1)), "int64", 2],
-    # The actual data shape of src_pos is:
-    # [batch_size * max_src_len_in_batch, 1]
-    "src_pos": [(batch_size, seq_len, long_type(1)), "int64"],
-    # This input is used to remove attention weights on paddings in the
-    # encoder.
-    # The actual data shape of src_slf_attn_bias is:
-    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
-    "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
-    # The actual data shape of trg_word is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "trg_word": [(batch_size, seq_len, long_type(1)), "int64",
-                 2],  # lod_level is only used in fast decoder.
-    # The actual data shape of trg_pos is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "trg_pos": [(batch_size, seq_len, long_type(1)), "int64"],
-    # This input is used to remove attention weights on paddings and
-    # subsequent words in the decoder.
-    # The actual data shape of trg_slf_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
-    "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
-    # This input is used to remove attention weights on paddings of the source
-    # input in the encoder-decoder attention.
-    # The actual data shape of trg_src_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
-    "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
-    # This input is used in independent decoder program for inference.
-    # The actual data shape of enc_output is:
-    # [batch_size, max_src_len_in_batch, d_model]
-    "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
-    # The actual data shape of label_word is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_word": [(batch_size * seq_len, long_type(1)), "int64"],
-    # This input is used to mask out the loss of paddding tokens.
-    # The actual data shape of label_weight is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_weight": [(batch_size * seq_len, long_type(1)), "float32"],
-    # These inputs are used to change the shape tensor in beam-search decoder.
-    "trg_slf_attn_pre_softmax_shape_delta": [(long_type(2), ), "int32"],
-    "trg_slf_attn_post_softmax_shape_delta": [(long_type(4), ), "int32"],
-    "init_score": [(batch_size, long_type(1)), "float32"],
-}
-
-# Names of word embedding table which might be reused for weight sharing.
-word_emb_param_names = (
-    "src_word_emb_table",
-    "trg_word_emb_table", )
-# Names of position encoding table which will be initialized externally.
-pos_enc_param_names = (
-    "src_pos_enc_table",
-    "trg_pos_enc_table", )
-# separated inputs for different usages.
-encoder_data_input_fields = (
-    "src_word",
-    "src_pos",
-    "src_slf_attn_bias", )
-decoder_data_input_fields = (
-    "trg_word",
-    "trg_pos",
-    "trg_slf_attn_bias",
-    "trg_src_attn_bias",
-    "enc_output", )
-label_data_input_fields = (
-    "lbl_word",
-    "lbl_weight", )
-# In fast decoder, trg_pos (only containing the current time step) is generated
-# by ops and trg_slf_attn_bias is not needed.
-fast_decoder_data_input_fields = (
-    "trg_word",
-    "init_score",
-    "trg_src_attn_bias", )
-
-# fast_decoder_util_input_fields = (
-#     "trg_slf_attn_pre_softmax_shape_delta",
-#     "trg_slf_attn_post_softmax_shape_delta", )
-
-
-#from optim import LearningRateScheduler
-class LearningRateScheduler(object):
-    """
-    Wrapper for learning rate scheduling as described in the Transformer paper.
-    LearningRateScheduler adapts the learning rate externally and the adapted
-    learning rate will be feeded into the main_program as input data.
-    """
-
-    def __init__(self,
-                 d_model,
-                 warmup_steps,
-                 learning_rate=0.001,
-                 current_steps=0,
-                 name="learning_rate"):
-        self.current_steps = current_steps
-        self.warmup_steps = warmup_steps
-        self.d_model = d_model
-        self.static_lr = learning_rate
-        self.learning_rate = layers.create_global_var(
-            name=name,
-            shape=[1],
-            value=float(learning_rate),
-            dtype="float32",
-            persistable=True)
-
-    def update_learning_rate(self):
-        self.current_steps += 1
-        lr_value = np.power(self.d_model, -0.5) * np.min([
-            np.power(self.current_steps, -0.5),
-            np.power(self.warmup_steps, -1.5) * self.current_steps
-        ]) * self.static_lr
-        return np.array([lr_value], dtype="float32")
-
-
-#from transformer_train import train_loop
-def pad_batch_data(insts,
-                   pad_idx,
-                   n_head,
-                   is_target=False,
-                   is_label=False,
-                   return_attn_bias=True,
-                   return_max_len=True,
-                   return_num_token=False):
-    """
-    Pad the instances to the max sequence length in batch, and generate the
-    corresponding position data and attention bias.
-    """
-    return_list = []
-    max_len = max(len(inst) for inst in insts)
-    num_token = six.moves.reduce(
-        lambda x, y: x + y,
-        [len(inst) for inst in insts]) if return_num_token else 0
-    # Any token included in dict can be used to pad, since the paddings' loss
-    # will be masked out by weights and make no effect on parameter gradients.
-    inst_data = np.array(
-        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
-    return_list += [inst_data.astype("int64").reshape([-1, 1])]
-    if is_label:  # label weight
-        inst_weight = np.array(
-            [[1.] * len(inst) + [0.] * (max_len - len(inst)) for inst in insts])
-        return_list += [inst_weight.astype("float32").reshape([-1, 1])]
-    else:  # position data
-        inst_pos = np.array([
-            list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
-            for inst in insts
-        ])
-        return_list += [inst_pos.astype("int64").reshape([-1, 1])]
-    if return_attn_bias:
-        if is_target:
-            # This is used to avoid attention on paddings and subsequent
-            # words.
-            slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len))
-            slf_attn_bias_data = np.triu(slf_attn_bias_data,
-                                         1).reshape([-1, 1, max_len, max_len])
-            slf_attn_bias_data = np.tile(slf_attn_bias_data,
-                                         [1, n_head, 1, 1]) * [-1e9]
-        else:
-            # This is used to avoid attention on paddings.
-            slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
-                                           (max_len - len(inst))
-                                           for inst in insts])
-            slf_attn_bias_data = np.tile(
-                slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
-                [1, n_head, max_len, 1])
-        return_list += [slf_attn_bias_data.astype("float32")]
-    if return_max_len:
-        return_list += [max_len]
-    if return_num_token:
-        return_list += [num_token]
-    return return_list if len(return_list) > 1 else return_list[0]
-
-
-def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx,
-                        n_head, d_model):
-    """
-    Put all padded data needed by training into a dict.
-    """
-    src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
-        [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False)
-    src_word = src_word.reshape(-1, src_max_len, 1)
-    src_pos = src_pos.reshape(-1, src_max_len, 1)
-    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data(
-        [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True)
-    trg_word = trg_word.reshape(-1, trg_max_len, 1)
-    trg_pos = trg_pos.reshape(-1, trg_max_len, 1)
-
-    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
-                                [1, 1, trg_max_len, 1]).astype("float32")
-
-    lbl_word, lbl_weight, num_token = pad_batch_data(
-        [inst[2] for inst in insts],
-        trg_pad_idx,
-        n_head,
-        is_target=False,
-        is_label=True,
-        return_attn_bias=False,
-        return_max_len=False,
-        return_num_token=True)
-
-    data_input_dict = dict(
-        list(
-            zip(data_input_names, [
-                src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
-                trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
-            ])))
-    return data_input_dict, np.asarray([num_token], dtype="float32")
-
-
-def read_multiple(reader, count, clip_last=True):
-    """
-    Stack data from reader for multi-devices.
-    """
-
-    def __impl__():
-        res = []
-        for item in reader():
-            res.append(item)
-            if len(res) == count:
-                yield res
-                res = []
-        if len(res) == count:
-            yield res
-        elif not clip_last:
-            data = []
-            for item in res:
-                data += item
-            if len(data) > count:
-                inst_num_per_part = len(data) // count
-                yield [
-                    data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
-                    for i in range(count)
-                ]
-
-    return __impl__
-
-
-def split_data(data, num_part):
-    """
-    Split data for each device.
-    """
-    if len(data) == num_part:
-        return data
-    data = data[0]
-    inst_num_per_part = len(data) // num_part
-    return [
-        data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
-        for i in range(num_part)
-    ]
-
-
-def test_context(test_program, avg_cost, train_exe, dev_count, data_input_names,
-                 sum_cost, token_num):
-    val_data = DataReader(
-        src_vocab_fpath=TrainTaskConfig.src_vocab_fpath,
-        trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath,
-        fpattern=TrainTaskConfig.val_file_pattern,
-        token_delimiter=TrainTaskConfig.token_delimiter,
-        use_token_batch=TrainTaskConfig.use_token_batch,
-        batch_size=TrainTaskConfig.batch_size *
-        (1 if TrainTaskConfig.use_token_batch else dev_count),
-        pool_size=TrainTaskConfig.pool_size,
-        sort_type=TrainTaskConfig.sort_type,
-        start_mark=TrainTaskConfig.special_token[0],
-        end_mark=TrainTaskConfig.special_token[1],
-        unk_mark=TrainTaskConfig.special_token[2],
-        # count start and end tokens out
-        max_length=ModelHyperParams.max_length - 2,
-        clip_last_batch=False,
-        shuffle=False,
-        shuffle_batch=False)
-
-    build_strategy = fluid.BuildStrategy()
-
-    strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = 1
-
-    test_exe = fluid.ParallelExecutor(
-        use_cuda=TrainTaskConfig.use_gpu,
-        main_program=test_program,
-        share_vars_from=train_exe,
-        build_strategy=build_strategy,
-        exec_strategy=strategy)
-
-    def test(exe=test_exe):
-        test_total_cost = 0
-        test_total_token = 0
-        test_data = read_multiple(
-            reader=val_data.batch_generator,
-            count=dev_count if TrainTaskConfig.use_token_batch else 1)
-        for batch_id, data in enumerate(test_data()):
-            feed_list = []
-            for place_id, data_buffer in enumerate(
-                    split_data(
-                        data, num_part=dev_count)):
-                data_input_dict, _ = prepare_batch_input(
-                    data_buffer, data_input_names, ModelHyperParams.eos_idx,
-                    ModelHyperParams.eos_idx, ModelHyperParams.n_head,
-                    ModelHyperParams.d_model)
-                feed_list.append(data_input_dict)
-
-            outs = exe.run(feed=feed_list,
-                           fetch_list=[sum_cost.name, token_num.name])
-            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
-            test_total_cost += sum_cost_val.sum()
-            test_total_token += token_num_val.sum()
-        test_avg_cost = test_total_cost / test_total_token
-        test_ppl = np.exp([min(test_avg_cost, 100)])
-        return test_avg_cost, test_ppl
-
-    return test
-
-
-def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
-               token_num, predict, test_program):
-    # Initialize the parameters.
-    if TrainTaskConfig.ckpt_path:
-        lr_scheduler.current_steps = TrainTaskConfig.start_step
-    else:
-        exe.run(fluid.framework.default_startup_program())
-
-    train_data = DataReader(
-        src_vocab_fpath=TrainTaskConfig.src_vocab_fpath,
-        trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath,
-        fpattern=TrainTaskConfig.train_file_pattern,
-        token_delimiter=TrainTaskConfig.token_delimiter,
-        use_token_batch=TrainTaskConfig.use_token_batch,
-        batch_size=TrainTaskConfig.batch_size *
-        (1 if TrainTaskConfig.use_token_batch else dev_count),
-        pool_size=TrainTaskConfig.pool_size,
-        sort_type=TrainTaskConfig.sort_type,
-        shuffle=TrainTaskConfig.shuffle,
-        shuffle_batch=TrainTaskConfig.shuffle_batch,
-        start_mark=TrainTaskConfig.special_token[0],
-        end_mark=TrainTaskConfig.special_token[1],
-        unk_mark=TrainTaskConfig.special_token[2],
-        # count start and end tokens out
-        max_length=ModelHyperParams.max_length - 2,
-        clip_last_batch=False)
-    train_data = read_multiple(
-        reader=train_data.batch_generator,
-        count=dev_count if TrainTaskConfig.use_token_batch else 1)
-
-    build_strategy = fluid.BuildStrategy()
-    # Since the token number differs among devices, customize gradient scale to
-    # use token average cost among multi-devices. and the gradient scale is
-    # `1 / token_number` for average cost.
-    build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
-
-    strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = 1
-
-    train_exe = fluid.ParallelExecutor(
-        use_cuda=TrainTaskConfig.use_gpu,
-        loss_name=sum_cost.name,
-        main_program=train_progm,
-        build_strategy=build_strategy,
-        exec_strategy=strategy)
-
-    data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-                                                                             -1] + label_data_input_fields
-
-    if TrainTaskConfig.val_file_pattern is not None:
-        test = test_context(test_program, avg_cost, train_exe, dev_count,
-                            data_input_names, sum_cost, token_num)
-
-    # the best cross-entropy value with label smoothing
-    loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
-        (1. - TrainTaskConfig.label_smooth_eps
-         )) + TrainTaskConfig.label_smooth_eps *
-                        np.log(TrainTaskConfig.label_smooth_eps / (
-                            ModelHyperParams.trg_vocab_size - 1) + 1e-20))
-    init = False
-    for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
-        pass_start_time = time.time()
-        for batch_id, data in enumerate(train_data()):
-            if batch_id >= RUN_STEP:
-                break
-
-            feed_list = []
-            total_num_token = 0
-
-            if TrainTaskConfig.local:
-                lr_rate = lr_scheduler.update_learning_rate()
-
-            for place_id, data_buffer in enumerate(
-                    split_data(
-                        data, num_part=dev_count)):
-                data_input_dict, num_token = prepare_batch_input(
-                    data_buffer, data_input_names, ModelHyperParams.eos_idx,
-                    ModelHyperParams.eos_idx, ModelHyperParams.n_head,
-                    ModelHyperParams.d_model)
-                total_num_token += num_token
-                feed_kv_pairs = list(data_input_dict.items())
-                if TrainTaskConfig.local:
-                    feed_kv_pairs += list({
-                        lr_scheduler.learning_rate.name: lr_rate
-                    }.items())
-                feed_list.append(dict(feed_kv_pairs))
-
-                if not init:
-                    for pos_enc_param_name in pos_enc_param_names:
-                        pos_enc = position_encoding_init(
-                            ModelHyperParams.max_length + 1,
-                            ModelHyperParams.d_model)
-                        feed_list[place_id][pos_enc_param_name] = pos_enc
-
-            if not TrainTaskConfig.check_acc:
-                for feed_dict in feed_list:
-                    feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token
-            else:
-                b = 100 * TrainTaskConfig.batch_size
-                a = np.asarray([b], dtype="float32")
-                for feed_dict in feed_list:
-                    feed_dict[sum_cost.name + "@GRAD"] = 1. / a
-
-            outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
-                                 feed=feed_list)
-
-            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
-            total_sum_cost = sum_cost_val.sum()
-            total_token_num = token_num_val.sum()
-            total_avg_cost = total_sum_cost / total_token_num
-
-            init = True
-
-            # Validate and save the model for inference.
-            if TrainTaskConfig.val_file_pattern is not None:
-                val_avg_cost, val_ppl = test()
-                print("[%f]" % val_avg_cost)
-            else:
-                assert (False)
-
-
-#import transformer_reader as reader
-class SortType(object):
-    GLOBAL = 'global'
-    POOL = 'pool'
-    NONE = "none"
-
-
-class Converter(object):
-    def __init__(self, vocab, beg, end, unk, delimiter):
-        self._vocab = vocab
-        self._beg = beg
-        self._end = end
-        self._unk = unk
-        self._delimiter = delimiter
-
-    def __call__(self, sentence):
-        return [self._beg] + [
-            self._vocab.get(w, self._unk)
-            for w in sentence.split(self._delimiter)
-        ] + [self._end]
-
-
-class ComposedConverter(object):
-    def __init__(self, converters):
-        self._converters = converters
-
-    def __call__(self, parallel_sentence):
-        return [
-            self._converters[i](parallel_sentence[i])
-            for i in range(len(self._converters))
-        ]
-
-
-class SentenceBatchCreator(object):
-    def __init__(self, batch_size):
-        self.batch = []
-        self._batch_size = batch_size
-
-    def append(self, info):
-        self.batch.append(info)
-        if len(self.batch) == self._batch_size:
-            tmp = self.batch
-            self.batch = []
-            return tmp
-
-
-class TokenBatchCreator(object):
-    def __init__(self, batch_size):
-        self.batch = []
-        self.max_len = -1
-        self._batch_size = batch_size
-
-    def append(self, info):
-        cur_len = info.max_len
-        max_len = max(self.max_len, cur_len)
-        if max_len * (len(self.batch) + 1) > self._batch_size:
-            result = self.batch
-            self.batch = [info]
-            self.max_len = cur_len
-            return result
-        else:
-            self.max_len = max_len
-            self.batch.append(info)
-
-
-class SampleInfo(object):
-    def __init__(self, i, max_len, min_len):
-        self.i = i
-        self.min_len = min_len
-        self.max_len = max_len
-
-
-class MinMaxFilter(object):
-    def __init__(self, max_len, min_len, underlying_creator):
-        self._min_len = min_len
-        self._max_len = max_len
-        self._creator = underlying_creator
-
-    def append(self, info):
-        if info.max_len > self._max_len or info.min_len < self._min_len:
-            return
-        else:
-            return self._creator.append(info)
-
-    @property
-    def batch(self):
-        return self._creator.batch
-
-
-class DataReader(object):
-    """
-    The data reader loads all data from files and produces batches of data
-    in the way corresponding to settings.
-
-    An example of returning a generator producing data batches whose data
-    is shuffled in each pass and sorted in each pool:
-
-    ```
-    train_data = DataReader(
-        src_vocab_fpath='data/src_vocab_file',
-        trg_vocab_fpath='data/trg_vocab_file',
-        fpattern='data/part-*',
-        use_token_batch=True,
-        batch_size=2000,
-        pool_size=10000,
-        sort_type=SortType.POOL,
-        shuffle=True,
-        shuffle_batch=True,
-        start_mark='<s>',
-        end_mark='<e>',
-        unk_mark='<unk>',
-        clip_last_batch=False).batch_generator
-    ```
-
-    :param src_vocab_fpath: The path of vocabulary file of source language.
-    :type src_vocab_fpath: basestring
-    :param trg_vocab_fpath: The path of vocabulary file of target language.
-    :type trg_vocab_fpath: basestring
-    :param fpattern: The pattern to match data files.
-    :type fpattern: basestring
-    :param batch_size: The number of sequences contained in a mini-batch.
-        or the maximum number of tokens (include paddings) contained in a
-        mini-batch.
-    :type batch_size: int
-    :param pool_size: The size of pool buffer.
-    :type pool_size: int
-    :param sort_type: The grain to sort by length: 'global' for all
-        instances; 'pool' for instances in pool; 'none' for no sort.
-    :type sort_type: basestring
-    :param clip_last_batch: Whether to clip the last uncompleted batch.
-    :type clip_last_batch: bool
-    :param tar_fname: The data file in tar if fpattern matches a tar file.
-    :type tar_fname: basestring
-    :param min_length: The minimum length used to filt sequences.
-    :type min_length: int
-    :param max_length: The maximum length used to filt sequences.
-    :type max_length: int
-    :param shuffle: Whether to shuffle all instances.
-    :type shuffle: bool
-    :param shuffle_batch: Whether to shuffle the generated batches.
-    :type shuffle_batch: bool
-    :param use_token_batch: Whether to produce batch data according to
-        token number.
-    :type use_token_batch: bool
-    :param field_delimiter: The delimiter used to split source and target in
-        each line of data file.
-    :type field_delimiter: basestring
-    :param token_delimiter: The delimiter used to split tokens in source or
-        target sentences.
-    :type token_delimiter: basestring
-    :param start_mark: The token representing for the beginning of
-        sentences in dictionary.
-    :type start_mark: basestring
-    :param end_mark: The token representing for the end of sentences
-        in dictionary.
-    :type end_mark: basestring
-    :param unk_mark: The token representing for unknown word in dictionary.
-    :type unk_mark: basestring
-    :param seed: The seed for random.
-    :type seed: int
-    """
-
-    def __init__(self,
-                 src_vocab_fpath,
-                 trg_vocab_fpath,
-                 fpattern,
-                 batch_size,
-                 pool_size,
-                 sort_type=SortType.GLOBAL,
-                 clip_last_batch=True,
-                 tar_fname=None,
-                 min_length=0,
-                 max_length=100,
-                 shuffle=True,
-                 shuffle_batch=False,
-                 use_token_batch=False,
-                 field_delimiter="\t",
-                 token_delimiter=" ",
-                 start_mark="<s>",
-                 end_mark="<e>",
-                 unk_mark="<unk>",
-                 seed=0):
-        self._src_vocab = self.load_dict(src_vocab_fpath)
-        self._only_src = True
-        if trg_vocab_fpath is not None:
-            self._trg_vocab = self.load_dict(trg_vocab_fpath)
-            self._only_src = False
-        self._pool_size = pool_size
-        self._batch_size = batch_size
-        self._use_token_batch = use_token_batch
-        self._sort_type = sort_type
-        self._clip_last_batch = clip_last_batch
-        self._shuffle = shuffle
-        self._shuffle_batch = shuffle_batch
-        self._min_length = min_length
-        self._max_length = max_length
-        self._field_delimiter = field_delimiter
-        self._token_delimiter = token_delimiter
-        self.load_src_trg_ids(end_mark, fpattern, start_mark, tar_fname,
-                              unk_mark)
-        self._random = random.Random(x=seed)
-
-    def load_src_trg_ids(self, end_mark, fpattern, start_mark, tar_fname,
-                         unk_mark):
-        converters = [
-            Converter(
-                vocab=self._src_vocab,
-                beg=self._src_vocab[start_mark],
-                end=self._src_vocab[end_mark],
-                unk=self._src_vocab[unk_mark],
-                delimiter=self._token_delimiter)
-        ]
-        if not self._only_src:
-            converters.append(
-                Converter(
-                    vocab=self._trg_vocab,
-                    beg=self._trg_vocab[start_mark],
-                    end=self._trg_vocab[end_mark],
-                    unk=self._trg_vocab[unk_mark],
-                    delimiter=self._token_delimiter))
-
-        converters = ComposedConverter(converters)
-
-        self._src_seq_ids = []
-        self._trg_seq_ids = None if self._only_src else []
-        self._sample_infos = []
-
-        for i, line in enumerate(self._load_lines(fpattern, tar_fname)):
-            src_trg_ids = converters(line)
-            self._src_seq_ids.append(src_trg_ids[0])
-            lens = [len(src_trg_ids[0])]
-            if not self._only_src:
-                self._trg_seq_ids.append(src_trg_ids[1])
-                lens.append(len(src_trg_ids[1]))
-            self._sample_infos.append(SampleInfo(i, max(lens), min(lens)))
-
-    def _load_lines(self, fpattern, tar_fname):
-        fpaths = glob.glob(fpattern)
-
-        if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]):
-            if tar_fname is None:
-                raise Exception("If tar file provided, please set tar_fname.")
-
-            f = tarfile.open(fpaths[0], "r")
-            for line in f.extractfile(tar_fname):
-                line = cpt.to_text(line)
-                fields = line.strip("\n").split(self._field_delimiter)
-                if (not self._only_src and len(fields) == 2) or (
-                        self._only_src and len(fields) == 1):
-                    yield fields
-        else:
-            for fpath in fpaths:
-                if not os.path.isfile(fpath):
-                    raise IOError("Invalid file: %s" % fpath)
-
-                with open(fpath, "rb") as f:
-                    for line in f:
-                        line = cpt.to_text(line)
-                        fields = line.strip("\n").split(self._field_delimiter)
-                        if (not self._only_src and len(fields) == 2) or (
-                                self._only_src and len(fields) == 1):
-                            yield fields
-
-    @staticmethod
-    def load_dict(dict_path, reverse=False):
-        word_dict = {}
-        with open(dict_path, "rb") as fdict:
-            for idx, line in enumerate(fdict):
-                line = cpt.to_text(line)
-                if reverse:
-                    word_dict[idx] = line.strip("\n")
-                else:
-                    word_dict[line.strip("\n")] = idx
-        return word_dict
-
-    def batch_generator(self):
-        # global sort or global shuffle
-        if self._sort_type == SortType.GLOBAL:
-            infos = sorted(
-                self._sample_infos, key=lambda x: x.max_len, reverse=True)
-        else:
-            if self._shuffle:
-                infos = self._sample_infos
-                self._random.shuffle(infos)
-            else:
-                infos = self._sample_infos
-
-            if self._sort_type == SortType.POOL:
-                for i in range(0, len(infos), self._pool_size):
-                    infos[i:i + self._pool_size] = sorted(
-                        infos[i:i + self._pool_size], key=lambda x: x.max_len)
-
-        # concat batch
-        batches = []
-        batch_creator = TokenBatchCreator(
-            self._batch_size
-        ) if self._use_token_batch else SentenceBatchCreator(self._batch_size)
-        batch_creator = MinMaxFilter(self._max_length, self._min_length,
-                                     batch_creator)
-
-        for info in infos:
-            batch = batch_creator.append(info)
-            if batch is not None:
-                batches.append(batch)
-
-        if not self._clip_last_batch and len(batch_creator.batch) != 0:
-            batches.append(batch_creator.batch)
-
-        if self._shuffle_batch:
-            self._random.shuffle(batches)
-
-        for batch in batches:
-            batch_ids = [info.i for info in batch]
-
-            if self._only_src:
-                yield [[self._src_seq_ids[idx]] for idx in batch_ids]
-            else:
-                yield [(self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1],
-                        self._trg_seq_ids[idx][1:]) for idx in batch_ids]
-
-
-#from transformer_model import transformer
-def position_encoding_init(n_position, d_pos_vec):
-    """
-    Generate the initial values for the sinusoid position encoding table.
-    """
-    position_enc = np.array([[
-        pos / np.power(10000, 2 * (j // 2) / d_pos_vec)
-        for j in range(d_pos_vec)
-    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
-    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
-    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
-    return position_enc.astype("float32")
-
-
-def multi_head_attention(queries,
-                         keys,
-                         values,
-                         attn_bias,
-                         d_key,
-                         d_value,
-                         d_model,
-                         n_head=1,
-                         dropout_rate=0.,
-                         cache=None):
-    """
-    Multi-Head Attention. Note that attn_bias is added to the logit before
-    computing softmax activiation to mask certain selected positions so that
-    they will not considered in attention weights.
-    """
-    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
-        raise ValueError(
-            "Inputs: quries, keys and values should all be 3-D tensors.")
-
-    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
-        """
-        Add linear projection to queries, keys, and values.
-        """
-        q = layers.fc(input=queries,
-                      size=d_key * n_head,
-                      num_flatten_dims=2,
-                      param_attr=const_para_attr,
-                      bias_attr=const_bias_attr)
-        k = layers.fc(input=keys,
-                      size=d_key * n_head,
-                      num_flatten_dims=2,
-                      param_attr=const_para_attr,
-                      bias_attr=const_bias_attr)
-        v = layers.fc(input=values,
-                      size=d_value * n_head,
-                      num_flatten_dims=2,
-                      param_attr=const_para_attr,
-                      bias_attr=const_bias_attr)
-        return q, k, v
-
-    def __split_heads(x, n_head):
-        """
-        Reshape the last dimension of inpunt tensor x so that it becomes two
-        dimensions and then transpose. Specifically, input a tensor with shape
-        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
-        with shape [bs, n_head, max_sequence_length, hidden_dim].
-        """
-        if n_head == 1:
-            return x
-
-        hidden_size = x.shape[-1]
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
-        reshaped = layers.reshape(
-            x=x, shape=[0, 0, n_head, hidden_size // n_head])
-
-        # permuate the dimensions into:
-        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
-        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
-
-    def __combine_heads(x):
-        """
-        Transpose and then reshape the last two dimensions of inpunt tensor x
-        so that it becomes one dimension, which is reverse to __split_heads.
-        """
-        if len(x.shape) == 3: return x
-        if len(x.shape) != 4:
-            raise ValueError("Input(x) should be a 4-D Tensor.")
-
-        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
-        return layers.reshape(
-            x=trans_x,
-            shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])))
-
-    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
-        """
-        Scaled Dot-Product Attention
-        """
-        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
-        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
-        if attn_bias:
-            product += attn_bias
-        weights = layers.softmax(product)
-        if dropout_rate:
-            weights = layers.dropout(
-                weights,
-                dropout_prob=dropout_rate,
-                seed=ModelHyperParams.dropout_seed,
-                is_test=False)
-        out = layers.matmul(weights, v)
-        return out
-
-    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
-
-    if cache is not None:  # use cache and concat time steps
-        k = cache["k"] = layers.concat([cache["k"], k], axis=1)
-        v = cache["v"] = layers.concat([cache["v"], v], axis=1)
-
-    q = __split_heads(q, n_head)
-    k = __split_heads(k, n_head)
-    v = __split_heads(v, n_head)
-
-    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
-                                                  dropout_rate)
-
-    out = __combine_heads(ctx_multiheads)
-
-    # Project back to the model size.
-    proj_out = layers.fc(input=out,
-                         size=d_model,
-                         num_flatten_dims=2,
-                         param_attr=const_para_attr,
-                         bias_attr=const_bias_attr)
-    return proj_out
-
-
-def positionwise_feed_forward(x, d_inner_hid, d_hid):
-    """
-    Position-wise Feed-Forward Networks.
-    This module consists of two linear transformations with a ReLU activation
-    in between, which is applied to each position separately and identically.
-    """
-    hidden = layers.fc(input=x,
-                       size=d_inner_hid,
-                       num_flatten_dims=2,
-                       act="relu",
-                       param_attr=const_para_attr,
-                       bias_attr=const_bias_attr)
-    out = layers.fc(input=hidden,
-                    size=d_hid,
-                    num_flatten_dims=2,
-                    param_attr=const_para_attr,
-                    bias_attr=const_bias_attr)
-    return out
-
-
-def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
-    """
-    Add residual connection, layer normalization and droput to the out tensor
-    optionally according to the value of process_cmd.
-    This will be used before or after multi-head attention and position-wise
-    feed-forward networks.
-    """
-    for cmd in process_cmd:
-        if cmd == "a":  # add residual connection
-            out = out + prev_out if prev_out else out
-        elif cmd == "n":  # add layer normalization
-            out = layers.layer_norm(
-                out,
-                begin_norm_axis=len(out.shape) - 1,
-                param_attr=fluid.initializer.Constant(1.),
-                bias_attr=fluid.initializer.Constant(0.))
-        elif cmd == "d":  # add dropout
-            if dropout_rate:
-                out = layers.dropout(
-                    out,
-                    dropout_prob=dropout_rate,
-                    seed=ModelHyperParams.dropout_seed,
-                    is_test=False)
-    return out
-
-
-pre_process_layer = partial(pre_post_process_layer, None)
-post_process_layer = pre_post_process_layer
-
-
-def prepare_encoder(src_word,
-                    src_pos,
-                    src_vocab_size,
-                    src_emb_dim,
-                    src_max_len,
-                    dropout_rate=0.,
-                    word_emb_param_name=None,
-                    pos_enc_param_name=None):
-    """Add word embeddings and position encodings.
-    The output tensor has a shape of:
-    [batch_size, max_src_length_in_batch, d_model].
-    This module is used at the bottom of the encoder stacks.
-    """
-    if TrainTaskConfig.check_acc:
-        src_word_emb = layers.embedding(
-            src_word,
-            size=[src_vocab_size, src_emb_dim],
-            param_attr=fluid.ParamAttr(
-                name=word_emb_param_name,
-                initializer=fluid.initializer.ConstantInitializer(0.001)))
-    else:
-        src_word_emb = layers.embedding(
-            src_word,
-            size=[src_vocab_size, src_emb_dim],
-            param_attr=fluid.ParamAttr(
-                name=word_emb_param_name,
-                initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
-
-    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
-    src_pos_enc = layers.embedding(
-        src_pos,
-        size=[src_max_len, src_emb_dim],
-        param_attr=fluid.ParamAttr(
-            name=pos_enc_param_name,
-            trainable=False,
-            initializer=fluid.initializer.ConstantInitializer(0.001)))
-    src_pos_enc.stop_gradient = True
-    enc_input = src_word_emb + src_pos_enc
-    return layers.dropout(
-        enc_input,
-        dropout_prob=dropout_rate,
-        seed=ModelHyperParams.dropout_seed,
-        is_test=False) if dropout_rate else enc_input
-
-
-prepare_encoder = partial(
-    prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
-prepare_decoder = partial(
-    prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
-
-
-def encoder_layer(enc_input,
-                  attn_bias,
-                  n_head,
-                  d_key,
-                  d_value,
-                  d_model,
-                  d_inner_hid,
-                  dropout_rate=0.):
-    """The encoder layers that can be stacked to form a deep encoder.
-    This module consits of a multi-head (self) attention followed by
-    position-wise feed-forward networks and both the two components companied
-    with the post_process_layer to add residual connection, layer normalization
-    and droput.
-    """
-    attn_output = multi_head_attention(enc_input, enc_input, enc_input,
-                                       attn_bias, d_key, d_value, d_model,
-                                       n_head, dropout_rate)
-    attn_output = post_process_layer(enc_input, attn_output, "dan",
-                                     dropout_rate)
-    ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model)
-    return post_process_layer(attn_output, ffd_output, "dan", dropout_rate)
-
-
-def encoder(enc_input,
-            attn_bias,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            dropout_rate=0.):
-    """
-    The encoder is composed of a stack of identical layers returned by calling
-    encoder_layer.
-    """
-    for i in range(n_layer):
-        enc_output = encoder_layer(enc_input, attn_bias, n_head, d_key, d_value,
-                                   d_model, d_inner_hid, dropout_rate)
-        enc_input = enc_output
-    return enc_output
-
-
-def decoder_layer(dec_input,
-                  enc_output,
-                  slf_attn_bias,
-                  dec_enc_attn_bias,
-                  n_head,
-                  d_key,
-                  d_value,
-                  d_model,
-                  d_inner_hid,
-                  dropout_rate=0.,
-                  cache=None):
-    """ The layer to be stacked in decoder part.
-    The structure of this module is similar to that in the encoder part except
-    a multi-head attention is added to implement encoder-decoder attention.
-    """
-    slf_attn_output = multi_head_attention(
-        dec_input,
-        dec_input,
-        dec_input,
-        slf_attn_bias,
-        d_key,
-        d_value,
-        d_model,
-        n_head,
-        dropout_rate,
-        cache, )
-    slf_attn_output = post_process_layer(
-        dec_input,
-        slf_attn_output,
-        "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
-    enc_attn_output = multi_head_attention(
-        slf_attn_output,
-        enc_output,
-        enc_output,
-        dec_enc_attn_bias,
-        d_key,
-        d_value,
-        d_model,
-        n_head,
-        dropout_rate, )
-    enc_attn_output = post_process_layer(
-        slf_attn_output,
-        enc_attn_output,
-        "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
-    ffd_output = positionwise_feed_forward(
-        enc_attn_output,
-        d_inner_hid,
-        d_model, )
-    dec_output = post_process_layer(
-        enc_attn_output,
-        ffd_output,
-        "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
-    return dec_output
-
-
-def decoder(dec_input,
-            enc_output,
-            dec_slf_attn_bias,
-            dec_enc_attn_bias,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            dropout_rate=0.,
-            caches=None):
-    """
-    The decoder is composed of a stack of identical decoder_layer layers.
-    """
-    for i in range(n_layer):
-        cache = None
-        if caches is not None:
-            cache = caches[i]
-
-        dec_output = decoder_layer(
-            dec_input,
-            enc_output,
-            dec_slf_attn_bias,
-            dec_enc_attn_bias,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            dropout_rate,
-            cache=cache)
-        dec_input = dec_output
-    return dec_output
-
-
-def make_all_inputs(input_fields):
-    """
-    Define the input data layers for the transformer model.
-    """
-    inputs = []
-    for input_field in input_fields:
-        input_var = layers.data(
-            name=input_field,
-            shape=input_descs[input_field][0],
-            dtype=input_descs[input_field][1],
-            lod_level=input_descs[input_field][2]
-            if len(input_descs[input_field]) == 3 else 0,
-            append_batch_size=False)
-        inputs.append(input_var)
-    return inputs
-
-
-def transformer(
-        src_vocab_size,
-        trg_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        weight_sharing,
-        label_smooth_eps, ):
-    if weight_sharing:
-        assert src_vocab_size == src_vocab_size, (
-            "Vocabularies in source and target should be same for weight sharing."
-        )
-    enc_inputs = make_all_inputs(encoder_data_input_fields)
-
-    enc_output = wrap_encoder(
-        src_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        weight_sharing,
-        enc_inputs, )
-
-    dec_inputs = make_all_inputs(decoder_data_input_fields[:-1])
-
-    predict = wrap_decoder(
-        trg_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        weight_sharing,
-        dec_inputs,
-        enc_output, )
-
-    # Padding index do not contribute to the total loss. The weights is used to
-    # cancel padding index in calculating the loss.
-    label, weights = make_all_inputs(label_data_input_fields)
-    if label_smooth_eps:
-        label = layers.label_smooth(
-            label=layers.one_hot(
-                input=label, depth=trg_vocab_size),
-            epsilon=label_smooth_eps)
-
-    cost = layers.softmax_with_cross_entropy(
-        logits=layers.reshape(
-            predict, shape=[-1, trg_vocab_size]),
-        label=label,
-        soft_label=True if label_smooth_eps else False)
-    weighted_cost = cost * weights
-    sum_cost = layers.reduce_sum(weighted_cost)
-    token_num = layers.reduce_sum(weights)
-    avg_cost = sum_cost / token_num
-    avg_cost.stop_gradient = True
-    return sum_cost, avg_cost, predict, token_num
-
-
-def wrap_encoder(src_vocab_size,
-                 max_length,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 dropout_rate,
-                 weight_sharing,
-                 enc_inputs=None):
-    """
-    The wrapper assembles together all needed layers for the encoder.
-    """
-    if enc_inputs is None:
-        # This is used to implement independent encoder program in inference.
-        src_word, src_pos, src_slf_attn_bias = \
-            make_all_inputs(encoder_data_input_fields)
-    else:
-        src_word, src_pos, src_slf_attn_bias = \
-            enc_inputs
-    enc_input = prepare_encoder(
-        src_word,
-        src_pos,
-        src_vocab_size,
-        d_model,
-        max_length,
-        dropout_rate,
-        word_emb_param_name=word_emb_param_names[0])
-    enc_output = encoder(enc_input, src_slf_attn_bias, n_layer, n_head, d_key,
-                         d_value, d_model, d_inner_hid, dropout_rate)
-    return enc_output
-
-
-def wrap_decoder(trg_vocab_size,
-                 max_length,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 dropout_rate,
-                 weight_sharing,
-                 dec_inputs=None,
-                 enc_output=None,
-                 caches=None):
-    """
-    The wrapper assembles together all needed layers for the decoder.
-    """
-    if dec_inputs is None:
-        # This is used to implement independent decoder program in inference.
-        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
-        enc_output = make_all_inputs(
-            decoder_data_input_fields + decoder_util_input_fields)
-    else:
-        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
-
-    dec_input = prepare_decoder(
-        trg_word,
-        trg_pos,
-        trg_vocab_size,
-        d_model,
-        max_length,
-        dropout_rate,
-        word_emb_param_name=word_emb_param_names[0]
-        if weight_sharing else word_emb_param_names[1])
-    dec_output = decoder(
-        dec_input,
-        enc_output,
-        trg_slf_attn_bias,
-        trg_src_attn_bias,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        caches=caches)
-    # Return logits for training and probs for inference.
-    if weight_sharing:
-        predict = layers.matmul(
-            x=dec_output,
-            y=fluid.framework._get_var(word_emb_param_names[0]),
-            transpose_y=True)
-    else:
-        predict = layers.fc(input=dec_output,
-                            size=trg_vocab_size,
-                            num_flatten_dims=2,
-                            param_attr=const_para_attr,
-                            bias_attr=const_bias_attr)
-    if dec_inputs is None:
-        predict = layers.softmax(predict)
-    return predict
-
-
-def fast_decode(
-        src_vocab_size,
-        trg_vocab_size,
-        max_in_len,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        weight_sharing,
-        beam_size,
-        max_out_len,
-        eos_idx, ):
-    """
-    Use beam search to decode. Caches will be used to store states of history
-    steps which can make the decoding faster.
-    """
-    enc_output = wrap_encoder(src_vocab_size, max_in_len, n_layer, n_head,
-                              d_key, d_value, d_model, d_inner_hid,
-                              dropout_rate, weight_sharing)
-    start_tokens, init_scores, trg_src_attn_bias = \
-        make_all_inputs(fast_decoder_data_input_fields )
-
-    def beam_search():
-        max_len = layers.fill_constant(
-            shape=[1], dtype=start_tokens.dtype, value=max_out_len)
-        step_idx = layers.fill_constant(
-            shape=[1], dtype=start_tokens.dtype, value=0)
-        cond = layers.less_than(x=step_idx, y=max_len)
-        while_op = layers.While(cond)
-        # array states will be stored for each step.
-        ids = layers.array_write(
-            layers.reshape(start_tokens, (-1, 1)), step_idx)
-        scores = layers.array_write(init_scores, step_idx)
-        # cell states will be overwrited at each step.
-        # caches contains states of history steps to reduce redundant
-        # computation in decoder.
-        caches = [{
-            "k": layers.fill_constant_batch_size_like(
-                input=start_tokens,
-                shape=[-1, 0, d_model],
-                dtype=enc_output.dtype,
-                value=0),
-            "v": layers.fill_constant_batch_size_like(
-                input=start_tokens,
-                shape=[-1, 0, d_model],
-                dtype=enc_output.dtype,
-                value=0)
-        } for i in range(n_layer)]
-        with while_op.block():
-            pre_ids = layers.array_read(array=ids, i=step_idx)
-            pre_ids = layers.reshape(pre_ids, (-1, 1, 1))
-            pre_scores = layers.array_read(array=scores, i=step_idx)
-            # sequence_expand can gather sequences according to lod thus can be
-            # used in beam search to sift states corresponding to selected ids.
-            pre_src_attn_bias = layers.sequence_expand(
-                x=trg_src_attn_bias, y=pre_scores)
-            pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
-            pre_caches = [{
-                "k": layers.sequence_expand(
-                    x=cache["k"], y=pre_scores),
-                "v": layers.sequence_expand(
-                    x=cache["v"], y=pre_scores),
-            } for cache in caches]
-            pre_pos = layers.elementwise_mul(
-                x=layers.fill_constant_batch_size_like(
-                    input=pre_enc_output,  # can't use pre_ids here since it has lod
-                    value=1,
-                    shape=[-1, 1, 1],
-                    dtype=pre_ids.dtype),
-                y=layers.increment(
-                    x=step_idx, value=1.0, in_place=False),
-                axis=0)
-            logits = wrap_decoder(
-                trg_vocab_size,
-                max_in_len,
-                n_layer,
-                n_head,
-                d_key,
-                d_value,
-                d_model,
-                d_inner_hid,
-                dropout_rate,
-                weight_sharing,
-                dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias),
-                enc_output=pre_enc_output,
-                caches=pre_caches)
-            logits = layers.reshape(logits, (-1, trg_vocab_size))
-
-            topk_scores, topk_indices = layers.topk(
-                input=layers.softmax(logits), k=beam_size)
-            accu_scores = layers.elementwise_add(
-                x=layers.log(topk_scores),
-                y=layers.reshape(
-                    pre_scores, shape=[-1]),
-                axis=0)
-            # beam_search op uses lod to distinguish branches.
-            topk_indices = layers.lod_reset(topk_indices, pre_ids)
-            selected_ids, selected_scores = layers.beam_search(
-                pre_ids=pre_ids,
-                pre_scores=pre_scores,
-                ids=topk_indices,
-                scores=accu_scores,
-                beam_size=beam_size,
-                end_id=eos_idx)
-
-            layers.increment(x=step_idx, value=1.0, in_place=True)
-            # update states
-            layers.array_write(selected_ids, i=step_idx, array=ids)
-            layers.array_write(selected_scores, i=step_idx, array=scores)
-            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
-            layers.assign(pre_enc_output, enc_output)
-            for i in range(n_layer):
-                layers.assign(pre_caches[i]["k"], caches[i]["k"])
-                layers.assign(pre_caches[i]["v"], caches[i]["v"])
-            length_cond = layers.less_than(x=step_idx, y=max_len)
-            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
-            layers.logical_and(x=length_cond, y=finish_cond, out=cond)
-
-        finished_ids, finished_scores = layers.beam_search_decode(
-            ids, scores, beam_size=beam_size, end_id=eos_idx)
-        return finished_ids, finished_scores
-
-    finished_ids, finished_scores = beam_search()
-    return finished_ids, finished_scores
-
-
-def get_model(is_dist, is_async):
-    sum_cost, avg_cost, predict, token_num = transformer(
-        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
-        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
-        ModelHyperParams.n_head, ModelHyperParams.d_key,
-        ModelHyperParams.d_value, ModelHyperParams.d_model,
-        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
-        ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
-
-    local_lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
-                                               TrainTaskConfig.warmup_steps,
-                                               TrainTaskConfig.learning_rate)
-    # Context to do validation.
-    test_program = fluid.default_main_program().clone(for_test=True)
-
-    if not is_dist:
-        optimizer = fluid.optimizer.Adam(
-            learning_rate=local_lr_scheduler.learning_rate,
-            beta1=TrainTaskConfig.beta1,
-            beta2=TrainTaskConfig.beta2,
-            epsilon=TrainTaskConfig.eps)
-        optimizer.minimize(sum_cost)
-    elif is_async:
-        optimizer = fluid.optimizer.SGD(0.003)
-        optimizer.minimize(sum_cost)
-    else:
-        lr_decay = fluid.layers\
-         .learning_rate_scheduler\
-         .noam_decay(ModelHyperParams.d_model,
-            TrainTaskConfig.warmup_steps)
-
-        optimizer = fluid.optimizer.Adam(
-            learning_rate=lr_decay,
-            beta1=TrainTaskConfig.beta1,
-            beta2=TrainTaskConfig.beta2,
-            epsilon=TrainTaskConfig.eps)
-        optimizer.minimize(sum_cost)
-
-    return sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program
-
-
-def update_args():
-    src_dict = DataReader.load_dict(TrainTaskConfig.src_vocab_fpath)
-    trg_dict = DataReader.load_dict(TrainTaskConfig.trg_vocab_fpath)
-    dict_args = [
-        "src_vocab_size", str(len(src_dict)), "trg_vocab_size",
-        str(len(trg_dict)), "bos_idx",
-        str(src_dict[TrainTaskConfig.special_token[0]]), "eos_idx",
-        str(src_dict[TrainTaskConfig.special_token[1]]), "unk_idx",
-        str(src_dict[TrainTaskConfig.special_token[2]])
-    ]
-    merge_cfg_from_list(dict_args, [TrainTaskConfig, ModelHyperParams])
-
-
-class DistTransformer2x2(TestDistRunnerBase):
-    def run_pserver(self, args):
-        get_model(True, not args.sync_mode)
-        t = self.get_transpiler(args.trainer_id,
-                                fluid.default_main_program(), args.endpoints,
-                                args.trainers, args.sync_mode)
-        pserver_prog = t.get_pserver_program(args.current_endpoint)
-        startup_prog = t.get_startup_program(args.current_endpoint,
-                                             pserver_prog)
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-        exe.run(pserver_prog)
-
-    def run_trainer(self, args):
-        TrainTaskConfig.use_gpu = args.use_cuda
-        sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
-            args.is_dist, not args.sync_mode)
-
-        if args.is_dist:
-            t = self.get_transpiler(args.trainer_id,
-                                    fluid.default_main_program(),
-                                    args.endpoints, args.trainers,
-                                    args.sync_mode)
-            trainer_prog = t.get_trainer_program()
-            TrainTaskConfig.batch_size = 10
-            TrainTaskConfig.train_file_pattern = TrainTaskConfig.data_path + "train.tok.clean.bpe.32000.en-de.train_{}".format(
-                args.trainer_id)
-        else:
-            TrainTaskConfig.batch_size = 20
-            trainer_prog = fluid.default_main_program()
-
-        if args.use_cuda:
-            place = fluid.CUDAPlace(0)
-        else:
-            place = fluid.CPUPlace()
-
-        startup_exe = fluid.Executor(place)
-
-        TrainTaskConfig.local = not args.is_dist
-
-        train_loop(startup_exe, trainer_prog, 1, sum_cost, avg_cost,
-                   local_lr_scheduler, token_num, predict, test_program)
-
-
-if __name__ == "__main__":
-    update_args()
-    runtime_main(DistTransformer2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py
deleted file mode 100644
index 835306edd0f17490dd10110db40f42dce30b25bb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ /dev/null
@@ -1,128 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import os
-import signal
-from test_dist_base import TestDistRunnerBase, runtime_main
-
-IS_SPARSE = True
-EMBED_SIZE = 32
-HIDDEN_SIZE = 256
-N = 5
-
-# Fix seed for test
-fluid.default_startup_program().random_seed = 1
-fluid.default_main_program().random_seed = 1
-
-
-class TestDistWord2vec2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2):
-        BATCH_SIZE = batch_size
-
-        def __network__(words):
-            embed_first = fluid.layers.embedding(
-                input=words[0],
-                size=[dict_size, EMBED_SIZE],
-                dtype='float32',
-                is_sparse=IS_SPARSE,
-                param_attr=fluid.ParamAttr(
-                    name='shared_w',
-                    initializer=fluid.initializer.Constant(value=0.1)))
-            embed_second = fluid.layers.embedding(
-                input=words[1],
-                size=[dict_size, EMBED_SIZE],
-                dtype='float32',
-                is_sparse=IS_SPARSE,
-                param_attr=fluid.ParamAttr(
-                    name='shared_w',
-                    initializer=fluid.initializer.Constant(value=0.1)))
-            embed_third = fluid.layers.embedding(
-                input=words[2],
-                size=[dict_size, EMBED_SIZE],
-                dtype='float32',
-                is_sparse=IS_SPARSE,
-                param_attr=fluid.ParamAttr(
-                    name='shared_w',
-                    initializer=fluid.initializer.Constant(value=0.1)))
-            embed_forth = fluid.layers.embedding(
-                input=words[3],
-                size=[dict_size, EMBED_SIZE],
-                dtype='float32',
-                is_sparse=IS_SPARSE,
-                param_attr=fluid.ParamAttr(
-                    name='shared_w',
-                    initializer=fluid.initializer.Constant(value=0.1)))
-
-            concat_embed = fluid.layers.concat(
-                input=[embed_first, embed_second, embed_third, embed_forth],
-                axis=1)
-            hidden1 = fluid.layers.fc(
-                input=concat_embed,
-                size=HIDDEN_SIZE,
-                act='sigmoid',
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=0.1)))
-            predict_word = fluid.layers.fc(
-                input=hidden1,
-                size=dict_size,
-                act='softmax',
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=0.1)))
-            cost = fluid.layers.cross_entropy(
-                input=predict_word, label=words[4])
-            avg_cost = fluid.layers.mean(cost)
-            return avg_cost, predict_word
-
-        word_dict = paddle.dataset.imikolov.build_dict()
-        dict_size = len(word_dict)
-
-        first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-        second_word = fluid.layers.data(
-            name='secondw', shape=[1], dtype='int64')
-        third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
-        forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
-        next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
-        avg_cost, predict_word = __network__(
-            [first_word, second_word, third_word, forth_word, next_word])
-
-        inference_program = paddle.fluid.default_main_program().clone()
-
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-        sgd_optimizer.minimize(avg_cost)
-
-        train_reader = paddle.batch(
-            paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
-        test_reader = paddle.batch(
-            paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
-
-        return inference_program, avg_cost, train_reader, test_reader, None, predict_word
-
-
-if __name__ == "__main__":
-    import os
-    os.environ['CPU_NUM'] = '1'
-    os.environ['USE_CUDA'] = "FALSE"
-    runtime_main(TestDistWord2vec2x2)
diff --git a/python/paddle/fluid/tests/unittests/fake_reader.py b/python/paddle/fluid/tests/unittests/fake_reader.py
deleted file mode 100644
index 34a256e15dd2f3a8a83aaba4e178efe52c8d8547..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/fake_reader.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import six
-
-
-def fake_imdb_reader(word_dict_size,
-                     sample_num,
-                     lower_seq_len=100,
-                     upper_seq_len=200,
-                     class_dim=2):
-    def __reader__():
-        for _ in six.moves.range(sample_num):
-            length = np.random.random_integers(
-                low=lower_seq_len, high=upper_seq_len, size=[1])[0]
-            ids = np.random.random_integers(
-                low=0, high=word_dict_size - 1, size=[length]).astype('int64')
-            label = np.random.random_integers(
-                low=0, high=class_dim - 1, size=[1]).astype('int64')[0]
-            yield ids, label
-
-    return __reader__
diff --git a/python/paddle/fluid/tests/unittests/feed_data_reader.py b/python/paddle/fluid/tests/unittests/feed_data_reader.py
deleted file mode 100644
index 1e6016d57bd776ecc1f3ee0db63808e5bcb97eea..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/feed_data_reader.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import six
-import paddle.fluid as fluid
-from paddle.fluid.framework import Variable
-
-
-def cyclic_reader(reader):
-    def __reader__():
-        while True:
-            for data in reader():
-                yield data
-
-    return __reader__
-
-
-class FeedDataReader(object):
-    def __init__(self, feed_list, reader):
-        self._feed_list = []
-        for var in feed_list:
-            if isinstance(var, Variable):
-                self._feed_list.append(var.name)
-            else:
-                self._feed_list.append(var)
-
-        self._reader = cyclic_reader(reader)
-        self._iter = self._reader()
-
-    def _feed_executor(self):
-        next_data = next(self._iter)
-        feed_data = dict()
-        assert len(self._feed_list) == len(next_data)
-        for key, value in six.moves.zip(self._feed_list, next_data):
-            feed_data[key] = value
-        return feed_data
-
-    def _feed_parallel_executor(self, device_num):
-        feed_data = []
-        for _ in six.moves.range(device_num):
-            feed_data.append(self._feed_executor())
-
-        return feed_data
-
-    def get_next(self, exe, program):
-        result = []
-        assert isinstance(exe, fluid.Executor), "exe must be Executor"
-        use_cuda = isinstance(exe.place, fluid.CUDAPlace)
-        if isinstance(program, fluid.CompiledProgram):
-            if program._is_data_parallel:
-                use_executor = False
-                if program._places is None:
-                    device_num = len(fluid.cuda_places()) if use_cuda else len(
-                        fluid.cpu_places())
-                else:
-                    device_num = len(program._places)
-            else:
-                use_executor = True
-                device_num = 1
-        else:
-            use_executor = True
-            device_num = 1
-
-        if use_executor:
-            return self._feed_executor()
-        else:
-            return self._feed_parallel_executor(device_num)
diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py
deleted file mode 100644
index 644a9a92ab9ea806c55e2bdfceb1b246e80cd691..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/gradient_checker.py
+++ /dev/null
@@ -1,393 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""This is the lib for gradient checker unittest."""
-
-from __future__ import print_function
-
-import unittest
-import six
-import collections
-import numpy as np
-from itertools import product
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.executor import Executor
-from paddle.fluid.backward import _append_grad_suffix_, _as_list
-
-
-def _product(t):
-    if isinstance(t, int):
-        return t
-    else:
-        return np.product(t)
-
-
-def dtype_to_np_dtype(dtype):
-    if dtype == core.VarDesc.VarType.FP32:
-        return np.float32
-    elif dtype == core.VarDesc.VarType.FP64:
-        return np.float64
-    elif dtype == core.VarDesc.VarType.FP16:
-        return np.float16
-    else:
-        raise ValueError("Not supported data type " + str(dtype))
-
-
-def _get_item(t, i, np_dtype):
-    if np_dtype == np.float16:
-        np_t = np.array(t).astype(np.float16)
-        np_t = np_t.flatten()
-        return np_t[i]
-    elif np_dtype == np.float32:
-        return t._get_float_element(i)
-    elif np_dtype == np.float64:
-        return t._get_double_element(i)
-    else:
-        raise ValueError("Not supported data type " + str(np_dtype))
-
-
-def _set_item(t, i, e, np_dtype):
-    if np_dtype == np.float16:
-        np_t = np.array(t).astype(np.float16)
-        shape = np_t.shape
-        np_t = np_t.flatten()
-        np_t[i] = e
-        np_t = np_t.reshape(shape).view(np.uint16)
-        t.set(np_t, place)
-    elif np_dtype == np.float32:
-        t._set_float_element(i, e)
-    elif np_dtype == np.float64:
-        t._set_double_element(i, e)
-    else:
-        raise ValueError("Not supported data type " + str(np_dtype))
-
-
-def set_var_in_scope(scope, place, name, value, recursive_seq_len=None):
-    t = scope.var(name).get_tensor()
-    t.set(value, place)
-    if recursive_seq_len:
-        t.set_recursive_sequence_lengths(recursive_seq_len)
-    return t
-
-
-def var_to_np_array_in_scope(scope, place, name):
-    return np.array(scope.var(name).get_tensor())
-
-
-def make_jacobian(x, y_size, np_dtype):
-    if isinstance(x, fluid.framework.Variable):
-        return np.zeros((_product(x.shape), y_size), dtype=np_dtype)
-    elif isinstance(x, collections.Sequence):
-        jacobians = list(
-            filter(lambda t: t is not None, (make_jacobian(
-                item, y_size, np_dtype) for item in x)))
-        return jacobians
-    else:
-        None
-
-
-def _compute_numerical_jacobian(program, x, y, place, scope, delta):
-    """Computes the numeric Jacobian for dy/dx.
-
-    Computes the numeric Jacobian by slightly perturbing the inputs and
-    measuring the differences on the output.
-
-    Args:
-        program (Program): the network program.
-        x (Variable): the input variables.
-        y (list[Variable]): the output variables.
-        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
-        scope (Scope): the scope used to run program.
-        delta: the amount of perturbation we give to the input
-
-    Returns:
-        A list of 2-D numpy array, the list length is len(y).
-        Each 2-D numpy array represents the Jacobian for dy_i/dx.
-        It has "x_size" rows and "y_size" columns
-        where "x_size" is the number of elements in x and
-        "y_size" is the number of elements in each y_i.
-    """
-    if not isinstance(x, fluid.framework.Variable):
-        raise TypeError('x is not Variable')
-
-    # To compute the jacobian, treat x and y as one-dimensional vectors.
-    y = _as_list(y)
-    exe = fluid.Executor(place)
-
-    def run():
-        y_res = exe.run(program, scope=scope, fetch_list=y)
-        return [yi.flatten() for yi in y_res]
-
-    x_name = x.name
-    x_shape = x.shape
-    x_size = _product(x_shape)
-    x_t = scope.find_var(x_name).get_tensor()
-
-    np_type = dtype_to_np_dtype(x.dtype)
-    jacobian = [make_jacobian(x, _product(yi.shape), np_type) for yi in y]
-
-    for i in six.moves.xrange(x_size):
-        orig = _get_item(x_t, i, np_type)
-        x_pos = orig + delta
-        _set_item(x_t, i, x_pos, np_type)
-        y_pos = run()
-
-        x_neg = orig - delta
-        _set_item(x_t, i, x_neg, np_type)
-        y_neg = run()
-
-        _set_item(x_t, i, orig, np_type)
-
-        for j in six.moves.xrange(len(y)):
-            jacobian[j][i, :] = (y_pos[j] - y_neg[j]) / delta / 2.
-
-    return jacobian
-
-
-def _compute_analytical_jacobian(program, x, y, place, scope):
-    """Computes the analytical Jacobian for dy/dx.
-
-    Args:
-        program (Program): a Program with forward pass.
-        x (Variable|list[Variable]): a variable or list of variable
-        y (Variable): the target variable.
-        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
-        scope (Scope): the scope used to run program.
-
-    Returns:
-        A list of 2-D numpy array. The list length is len(x).
-        Each 2-D numpy array represents the Jacobian for dy/dx_i.
-        It has "xi_size" rows and "dy_size" columns
-        where "x_size" is the number of elements in x_i and
-        "dy_size" is the number of elements in y.
-    """
-    if not isinstance(y, fluid.framework.Variable):
-        raise TypeError('y is not Variable')
-
-    dy_name = _append_grad_suffix_(y.name)
-
-    np_type = dtype_to_np_dtype(y.dtype)
-    # create dy Variable in Program
-    dy = program.global_block().create_var(
-        name=dy_name, shape=y.shape, dtype=np_type, persistable=True)
-    # append backward
-    dx = fluid.gradients(y, x, dy)
-
-    # init dy tensor in scope
-    value = np.zeros(y.shape, dtype=np_type)
-    dy_t = set_var_in_scope(scope, place, dy_name, value)
-
-    exe = fluid.Executor(place)
-
-    y_size = _product(y.shape)
-
-    x = _as_list(x)
-    jacobian = make_jacobian(x, y_size, np_type)
-
-    # filter None in dx for DX/DY may be None in kernel
-    # only fetch not None dx in exe.run
-    filted = [(i, dxi) for i, dxi in enumerate(dx) if dxi is not None]
-    filted_idx, filted_dx = zip(*filted)
-
-    for i in six.moves.xrange(y_size):
-        _set_item(dy_t, i, 1, np_type)
-
-        dx_res = exe.run(program, scope=scope, fetch_list=filted_dx)
-
-        for j in six.moves.xrange(len(filted_dx)):
-            dx_idx = filted_idx[j]
-            if dx_res[j] is not None:
-                jacobian[dx_idx][:, i] = dx_res[j].flatten()
-            else:
-                jacobian[dx_idx][:, i] = np.zeros(
-                    dx[dx_idx].shape, dtype=np_type).flatten()
-
-        _set_item(dy_t, i, 0, np_type)
-
-    return jacobian
-
-
-def grad_check(x,
-               y,
-               x_init=None,
-               place=None,
-               program=None,
-               eps=1e-6,
-               atol=1e-5,
-               rtol=1e-3,
-               raise_exception=True):
-    """
-    Check numerical and analytical gradients for dy/dx.
-    Each Jacobian gradients is a 2-D array with shape [xi_size, yi_size].
-
-    Args:
-        x (Variable|list[Variable]): input variables to the program.
-        y (Variable|list[Variable]): output variables to the program.
-        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
-        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
-        program (Program|None): a Program with forward pass.
-            If None, use fluid.default_main_program().
-        eps (float): perturbation for finite differences.
-        atol (float): absolute tolerance.
-        rtol (float): relative tolerance.
-        raise_exception (bool): whether to raise an exception if
-            the check fails. Default is True.
-    Returns:
-        True if all differences satisfy numpy.allclose condition.
-    """
-
-    def fail_test(msg):
-        if raise_exception:
-            raise RuntimeError(msg)
-        return False
-
-    # check input arguments
-    x = _as_list(x)
-    y = _as_list(y)
-
-    for v in x:
-        v.stop_gradient = False
-        v.persistable = True
-    if place is None:
-        place = fluid.CPUPlace()
-    if program is None:
-        program = fluid.default_main_program()
-
-    # init variable in strtup program
-    scope = fluid.executor.global_scope()
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-
-    x_init = _as_list(x_init)
-    # init inputs if x_init is not None
-    if x_init:
-        if len(x_init) != len(x):
-            raise ValueError('len(x_init) (=%d) is not the same'
-                             ' as len(x) (= %d)' % (len(x_init), len(x)))
-        # init variable in main program
-        for var, arr in zip(x, x_init):
-            assert var.shape == arr.shape
-        feeds = {k.name: v for k, v in zip(x, x_init)}
-        exe.run(program, feed=feeds, scope=scope)
-
-    # [x_idx, y_idx]
-    numerical = [
-        _compute_numerical_jacobian(program, xi, y, place, scope, eps)
-        for xi in x
-    ]
-
-    # [y_idx, x_idx]
-    analytical = []
-    for yi in y:
-        prog = program.clone()
-
-        clone_x = []
-        clone_y = None
-        for b in prog.blocks:
-            if b.has_var(yi.name):
-                clone_y = b.var(yi.name)
-                break
-        for xi in x:
-            for b in prog.blocks:
-                if b.has_var(xi.name):
-                    clone_x.append(b.var(xi.name))
-                    break
-
-        analytical.append(
-            _compute_analytical_jacobian(prog, clone_x, clone_y, place, scope))
-
-    for i, (x_idx,
-            y_idx) in enumerate(product(*[range(len(x)), range(len(y))])):
-        a = analytical[y_idx][x_idx]
-        n = numerical[x_idx][y_idx]
-        if not np.allclose(a, n, rtol, atol):
-            msg = 'Jacobian mismatch for output %s ' \
-                  'with respect to input %s on %s,\n' \
-                  'numerical:%s\nanalytical:%s\n' \
-                  % (y[y_idx].name, x[x_idx].name, str(place), n, a)
-            return fail_test(msg)
-    return True
-
-
-def double_grad_check(x,
-                      y,
-                      x_init=None,
-                      y_grads=None,
-                      place=None,
-                      program=None,
-                      eps=1e-6,
-                      atol=1e-5,
-                      rtol=1e-3,
-                      raise_exception=True):
-    """
-    Check gradients of gradients. This function will append backward to the
-    program before second order gradient check.
-
-    Args:
-        x (Variable|list[Variable]): input variables to the program.
-        y (Variable|list[Variable]): output variables to the program.
-        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
-        y_grads (numpy.array|list[numpy.array]|None): the gradients with respect to y.
-        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
-        program (Program|None): a Program with forward pass.
-            If None, use fluid.default_main_program().
-        eps (float): perturbation for finite differences.
-        atol (float): absolute tolerance.
-        rtol (float): relative tolerance.
-        raise_exception (bool): whether to raise an exception if
-            the check fails. Default is True.
-    Returns:
-        True if all differences satisfy numpy.allclose condition.
-    """
-    # check input arguments
-    x = _as_list(x)
-    for v in x:
-        v.stop_gradient = False
-        v.persistable = True
-    y = _as_list(y)
-
-    if program is None:
-        program = fluid.default_main_program()
-
-    if y_grads is None:
-        scope = fluid.executor.global_scope()
-        y_grads = []
-        y_grads_init = []
-        for yi in y:
-            dyi_name = _append_grad_suffix_(yi.name)
-            np_type = dtype_to_np_dtype(yi.dtype)
-            dy = program.global_block().create_var(
-                name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
-            dy.stop_gradient = False
-            v = np.random.random(size=yi.shape).astype(np_type)
-            set_var_in_scope(scope, place, dyi_name, v)
-            y_grads.append(dy)
-            y_grads_init.append(v)
-    else:
-        y_grads = _as_list(y_grads)
-        y_grads_init = [
-            var_to_np_array_in_scope(scope, place, v.name) for v in y_grads
-        ]
-
-    # append first order grads
-    target_grads = fluid.gradients(y, x, y_grads)
-
-    # y_grads are the input of first-order backward,
-    # so, they are also the input of second-order backward.
-    x += y_grads
-    x_init = _as_list(x_init)
-    x_init += y_grads_init
-
-    grad_check(x, target_grads, x_init, place, program, eps, atol, rtol)
diff --git a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
deleted file mode 100644
index 0e4fd8f69dcd3fb5ecca5635c8b04df86d1e6bab..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-import six
-import unittest
-import time
-import math
-import multiprocessing
-import numpy as np
-
-import paddle
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-
-# open eager delete mode
-os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
-os.environ['FLAGS_fast_eager_deletion_mode'] = 'true'
-os.environ['CPU_NUM'] = '2'
-
-
-class BuildIrMemOptBase(unittest.TestCase):
-    def setup_reader(self):
-        self.batch_size = 32
-        self.word_dict = paddle.dataset.imdb.word_dict()
-        self.train_reader = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict),
-            batch_size=self.batch_size)
-
-    def check_network_convergence(self,
-                                  network,
-                                  use_cuda=True,
-                                  use_ir_memory_optimize=True,
-                                  enable_inplace=True,
-                                  iter=5):
-        if use_cuda and not core.is_compiled_with_cuda():
-            print('Skip use_cuda=True because Paddle is not compiled with cuda')
-            return
-
-        if os.name == 'nt':
-            print(
-                'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
-            )
-            return
-        fluid.default_startup_program().random_seed = 100
-        fluid.default_main_program().random_seed = 100
-
-        data = fluid.layers.data(
-            name="words", shape=[1], dtype="int64", lod_level=1)
-
-        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-        cost = network(data, label, len(self.word_dict))
-        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-        optimizer.minimize(cost)
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.enable_inplace = enable_inplace
-        build_strategy.memory_optimize = use_ir_memory_optimize
-
-        # execution
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-        reader = feeder.decorate_reader(self.train_reader, multi_devices=True)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        train_cp = compiler.CompiledProgram(fluid.default_main_program())
-        train_cp = train_cp.with_data_parallel(
-            loss_name=cost.name, build_strategy=build_strategy)
-        fetch_list = [cost.name]
-
-        begin = time.time()
-        first_loss, last_loss = None, None
-        step_id = 0
-        custom_iter = getattr(self, "iter", None)
-        if not custom_iter == None:
-            iter = custom_iter
-        for data in reader():
-            ret = exe.run(train_cp, feed=data, fetch_list=fetch_list)
-            print(ret)
-            step_id += 1
-            if step_id == 1:
-                first_loss = ret[0]
-            if step_id == iter:
-                last_loss = ret[0]
-                break
-        end = time.time()
-
-        print("%.4f Instance per second" % (
-            (self.batch_size * iter) / (end - begin)))
-
-        print(first_loss, last_loss)
-        avg_last_loss_val = np.array(last_loss).mean()
-        avg_first_loss_val = np.array(first_loss).mean()
-        if math.isnan(float(avg_last_loss_val)) or math.isnan(
-                float(avg_first_loss_val)):
-            sys.exit("got NaN loss, training failed.")
-
-        return first_loss, last_loss
-
-
-class TestIrMemOptBase(BuildIrMemOptBase):
-    def setUp(self):
-        self.network = None
-
-    def test_network(self):
-        if self.network is None or not core.is_compiled_with_cuda():
-            return
-
-        self.setup_reader()
-
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            with fluid.scope_guard(core.Scope()):
-                baseline_first_loss, baseline_last_loss = self.check_network_convergence(
-                    self.network)
-
-                cur_first_loss, cur_last_loss = self.check_network_convergence(
-                    self.network)
-
-                self.assertAlmostEquals(
-                    np.mean(baseline_last_loss),
-                    np.mean(cur_last_loss),
-                    delta=1e-6)
-                self.assertAlmostEquals(
-                    np.mean(baseline_first_loss),
-                    np.mean(cur_first_loss),
-                    delta=1e-6)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt
deleted file mode 100644
index f71e04c09aa38b8cf7b3a167b84d4dc0e6cc3ec7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/__init__.py b/python/paddle/fluid/tests/unittests/mkldnn/__init__.py
deleted file mode 100644
index b94a21a7e406b833797f8f521c62a2351c2bc30a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py b/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
deleted file mode 100644
index c47115c466fc97548f5152cbca14d29aec9f675a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
+++ /dev/null
@@ -1,161 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-
-
-def __assert_close(test_case, tensor, np_array, msg, atol=1e-4):
-    test_case.assertTrue(
-        np.allclose(
-            np.array(tensor), np_array, atol=atol), msg)
-
-
-def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
-                                            out_grad, x_grad):
-    place = core.CPUPlace()
-
-    var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad}
-    var_names = list(var_dict.keys())
-    ground_truth = {name: var_dict[name] for name in var_names}
-
-    program = fluid.Program()
-    with fluid.program_guard(program):
-        block = program.global_block()
-        for name in ground_truth:
-            block.create_var(
-                name=name, dtype=np.float32, shape=ground_truth[name].shape)
-
-        op = block.append_op(
-            type=op_type,
-            inputs={'X': block.var('x'), },
-            outputs={'Out': block.var('out')},
-            attrs={'use_mkldnn': True})
-
-        # Generate backward op_desc
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
-                                                                  set(), [])
-        grad_op_desc = grad_op_desc_list[0]
-        new_op_desc = block.desc.append_op()
-        new_op_desc.copy_from(grad_op_desc)
-        for var_name in grad_op_desc.output_arg_names():
-            block.desc.var(var_name.encode('ascii'))
-        grad_op_desc.infer_var_type(block.desc)
-        grad_op_desc.infer_shape(block.desc)
-        for arg in grad_op_desc.output_arg_names():
-            grad_var = block.desc.find_var(arg.encode('ascii'))
-            grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-        exe = fluid.Executor(place)
-
-        # Do at least 2 iterations
-        for i in range(2):
-            out = exe.run(
-                program,
-                feed={name: var_dict[name]
-                      for name in ['x', 'out@GRAD']},
-                fetch_list=['x@GRAD', 'out'])
-
-        __assert_close(test_case, x_grad, out[0], 'x@GRAD')
-
-
-def check_if_mkldnn_batchnorm_primitives_exist_in_bwd(
-        test_case, var_dict, place, shape, data_layout):
-
-    var_names = [
-        'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
-        'saved_variance'
-    ]
-    ground_truth = {name: var_dict[name] for name in var_names}
-    program = fluid.Program()
-    with fluid.program_guard(program):
-        block = program.global_block()
-        for name in ground_truth:
-            block.create_var(
-                name=name, dtype='float32', shape=ground_truth[name].shape)
-        bn_op = block.append_op(
-            type="batch_norm",
-            inputs={
-                "X": block.var('x'),
-                "Scale": block.var('scale'),
-                "Bias": block.var('bias'),
-                "Mean": block.var('mean'),
-                "Variance": block.var('variance')
-            },
-            outputs={
-                "Y": block.var('y'),
-                "MeanOut": block.var('mean'),  # share memory
-                "VarianceOut": block.var('variance'),  # share memory
-                "SavedMean": block.var('saved_mean'),
-                "SavedVariance": block.var('saved_variance')
-            },
-            attrs={
-                "momentum": test_case.momentum,
-                "epsilon": test_case.epsilon,
-                "is_test": False,
-                "data_layout": data_layout,
-                "use_mkldnn": test_case.use_mkldnn,
-                "fuse_with_relu": test_case.fuse_with_relu,
-                "use_global_stats": test_case.use_global_stats
-            })
-        block.create_var(
-            name='y@GRAD', dtype='float32', shape=var_dict['y'].shape)
-
-        # generate backward op_desc
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-            bn_op.desc, test_case.no_grad_set, [])
-        grad_op_desc = grad_op_desc_list[0]
-        new_op_desc = block.desc.append_op()
-        new_op_desc.copy_from(grad_op_desc)
-        for var_name in grad_op_desc.output_arg_names():
-            block.desc.var(var_name.encode("ascii"))
-        grad_op_desc.infer_var_type(block.desc)
-        grad_op_desc.infer_shape(block.desc)
-        for arg in grad_op_desc.output_arg_names():
-            grad_var = block.desc.find_var(arg.encode("ascii"))
-            grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-        exe = fluid.Executor(place)
-
-        # Do at least 2 iterations
-        for i in range(2):
-            out = exe.run(
-                program,
-                feed={
-                    name: var_dict[name]
-                    for name in
-                    ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
-                },
-                fetch_list=test_case.fetch_list)
-            for id, name in enumerate(test_case.fetch_list):
-                __assert_close(test_case, var_dict[name], out[id], name)
-
-        print("MKLDNN op test forward passed: ", str(place), data_layout)
-
-
-def format_reorder(out, size):
-    in_n = size[0]
-    out_h = size[2]
-    out_w = size[3]
-    out_c = size[1]
-    out_tmp = np.zeros((in_n, out_h, out_w, out_c))
-    for n in range(in_n):
-        for i in range(out_h):
-            for j in range(out_w):
-                for m in range(out_c):
-                    out_tmp[n, i, j, m] = out[n, m, i, j]
-    return out_tmp.reshape(in_n, out_c, out_h, out_w)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
deleted file mode 100644
index fb9cc6b3a17a4381e71d825c47e100486f6739d7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs, TestLeakyRelu
-from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
-
-
-class TestMKLDNNReluDim2(TestRelu):
-    def setUp(self):
-        super(TestMKLDNNReluDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNLeakyReluDim2(TestLeakyRelu):
-    def setUp(self):
-        super(TestMKLDNNLeakyReluDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNTanhDim2(TestTanh):
-    def setUp(self):
-        super(TestMKLDNNTanhDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNSqrtDim2(TestSqrt):
-    def setUp(self):
-        super(TestMKLDNNSqrtDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNAbsDim2(TestAbs):
-    def setUp(self):
-        super(TestMKLDNNAbsDim2, self).setUp()
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNReluDim4(TestRelu):
-    def setUp(self):
-        super(TestMKLDNNReluDim4, self).setUp()
-
-        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNLeakyReluDim4(TestLeakyRelu):
-    def setUp(self):
-        super(TestMKLDNNLeakyReluDim4, self).setUp()
-
-        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0.02 * x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNTanhDim4(TestTanh):
-    def setUp(self):
-        super(TestMKLDNNTanhDim4, self).setUp()
-
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.tanh(self.inputs['X'])}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNSqrtDim4(TestSqrt):
-    def setUp(self):
-        super(TestMKLDNNSqrtDim4, self).setUp()
-
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNAbsDim4(TestAbs):
-    def setUp(self):
-        super(TestMKLDNNAbsDim4, self).setUp()
-
-        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.abs(self.inputs['X'])}
-        self.attrs = {"use_mkldnn": True}
-
-
-# Check if primitives already exist in backward
-class TestMKLDNNAbsPrimitivesAlreadyExist(unittest.TestCase):
-    def setUp(self):
-        super(TestMKLDNNAbsPrimitivesAlreadyExist, self).setUp()
-
-        np.random.seed(123)
-        self.op_type = 'abs'
-        self.x = np.random.uniform(-1, 1, [2, 2]).astype(np.float32)
-        self.out = np.abs(self.x)
-        self.out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
-        self.x_grad = self.__abs_bwd(self.x, self.out_grad)
-
-    # Abs grad calculation
-    def __abs_bwd(self, x, out_grad):
-        return out_grad * np.sign(x)
-
-    def test_check(self):
-        check_if_mkldnn_primitives_exist_in_bwd(
-            self, self.op_type, self.x, self.out, self.out_grad, self.x_grad)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
deleted file mode 100644
index eb12470789ab9a6e416e829832986a11cd576474..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import paddle.fluid as fluid
-from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.framework import grad_var_name
-from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad
-from mkldnn_op_test import check_if_mkldnn_batchnorm_primitives_exist_in_bwd
-
-
-class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_formats = ["NCHW"]
-
-    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
-                             epsilon, momentum, shape, data_layout):
-        # run forward
-        y, saved_mean, saved_variance = _reference_training(
-            x, scale, bias, epsilon, data_layout)
-        mean_out = saved_mean * (1. - momentum) + momentum * mean
-        variance_out = saved_variance * (1. - momentum) + momentum * variance
-        # run backward
-        x_grad, scale_grad, bias_grad = _reference_grad(
-            x, y_grad, scale, saved_mean, saved_variance, epsilon, data_layout)
-
-        return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
-
-
-class TestMKLDNNBatchNormOpExistedPrimitives(TestMKLDNNBatchNormOpTraining):
-    def init_test_case(self):
-        TestMKLDNNBatchNormOpTraining.init_test_case(self)
-        self.fetch_list = ['y', 'x@GRAD']
-
-    def test_forward_backward(self):
-        place = core.CPUPlace()
-        shape = [2, 3, 4, 5]
-        scale_shape = [3]
-        data_layout = "NCHW"
-        # initialize the ground-truth
-        np.random.seed(123)
-        x = np.random.random_sample(shape).astype(np.float32)
-        scale = np.random.random_sample(scale_shape).astype(np.float32)
-        bias = np.random.random_sample(scale_shape).astype(np.float32)
-        mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
-        y_grad = np.random.random_sample(shape).astype(np.float32)
-
-        y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
-            x, y_grad, scale, bias, mean, variance, self.epsilon, self.momentum,
-            shape, data_layout)
-        var_dict = locals()
-        var_dict['y@GRAD'] = y_grad
-        var_dict['x@GRAD'] = x_grad
-        var_dict['scale@GRAD'] = scale_grad
-        var_dict['bias@GRAD'] = bias_grad
-        check_if_mkldnn_batchnorm_primitives_exist_in_bwd(self, var_dict, place,
-                                                          shape, data_layout)
-
-
-class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-    def test_check_output(self):
-        place = core.CPUPlace()
-        data_format = "NCHW"
-        self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
-
-
-class TestMKLDNNBatchNormOpWithReluInference(TestBatchNormOpInference):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.fuse_with_relu = True
-
-    def test_check_output(self):
-        place = core.CPUPlace()
-        data_format = "NCHW"
-        self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
deleted file mode 100644
index 0b6556746cd91676d153d862126dd48661fa281d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
+++ /dev/null
@@ -1,124 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-
-class TestConcatOp(OpTest):
-    def setUp(self):
-        self.op_type = "concat"
-        self.use_mkldnn = True
-        self._cpu_only = True
-        self.init_axis()
-        self.init_shape()
-        self.init_test_data()
-        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
-        self.attrs = {'axis': self.axis, 'use_mkldnn': True}
-
-        self.output = np.concatenate(
-            (self.x0, self.x1, self.x2), axis=self.axis).astype('int')
-
-        self.outputs = {'Out': self.output}
-
-    def test_check_output(self):
-        self.check_output()
-
-#--------------------test concat s8 in with axis 0--------------------
-
-    def init_test_data(self):
-        self.x0 = (np.random.randint(0, 100, self.x0_shape) - 50).astype('int8')
-        self.x1 = (np.random.randint(0, 80, self.x1_shape) - 30).astype('int8')
-        self.x2 = (np.random.randint(0, 110, self.x2_shape) - 80).astype('int8')
-
-    def init_axis(self):
-        self.axis = 0
-
-    def init_shape(self):
-        self.x0_shape = [2, 2, 1, 2]
-        self.x1_shape = [1, 2, 1, 2]
-        self.x2_shape = [3, 2, 1, 2]
-
-
-#--------------------test concat u8 in with axis 0--------------------
-
-
-class TestConcatOp2(TestConcatOp):
-    def init_test_data(self):
-        self.x0 = (np.random.randint(0, 100, self.x0_shape)).astype('uint8')
-        self.x1 = (np.random.randint(0, 50, self.x1_shape)).astype('uint8')
-        self.x2 = (np.random.randint(0, 80, self.x2_shape)).astype('uint8')
-
-    def init_axis(self):
-        self.axis = 0
-
-    def init_shape(self):
-        self.x0_shape = [2, 1, 5, 5]
-        self.x1_shape = [1, 1, 5, 5]
-        self.x2_shape = [3, 1, 5, 5]
-
-
-def create_test_int8_class(parent):
-
-    #--------------------test concat s8/u8 in with axis 1--------------------
-
-    class TestAxis1Case(parent):
-        def init_axis(self):
-            self.axis = 1
-
-        def init_shape(self):
-            self.x0_shape = [1, 1, 5, 5]
-            self.x1_shape = [1, 2, 5, 5]
-            self.x2_shape = [1, 3, 5, 5]
-
-#--------------------test concat s8/u8 in with axis 2--------------------
-
-    class TestAxis2Case(parent):
-        def init_axis(self):
-            self.axis = 2
-
-        def init_shape(self):
-            self.x0_shape = [2, 3, 4, 5]
-            self.x1_shape = [2, 3, 5, 5]
-            self.x2_shape = [2, 3, 6, 5]
-
-#--------------------test concat s8/u8 in with axis 3--------------------
-
-    class TestAxis3Case(parent):
-        def init_axis(self):
-            self.axis = 3
-
-        def init_shape(self):
-            self.x0_shape = [2, 3, 5, 5]
-            self.x1_shape = [2, 3, 5, 6]
-            self.x2_shape = [2, 3, 5, 7]
-
-    cls_name_1 = "{0}_axis_{1}".format(parent.__name__, "1")
-    cls_name_2 = "{0}_axis_{1}".format(parent.__name__, "2")
-    cls_name_3 = "{0}_axis_{1}".format(parent.__name__, "3")
-    TestAxis1Case.__name__ = cls_name_1
-    TestAxis2Case.__name__ = cls_name_2
-    TestAxis3Case.__name__ = cls_name_3
-    globals()[cls_name_1] = TestAxis1Case
-    globals()[cls_name_2] = TestAxis2Case
-    globals()[cls_name_3] = TestAxis3Case
-
-create_test_int8_class(TestConcatOp)
-create_test_int8_class(TestConcatOp2)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
deleted file mode 100644
index 1a399740692eab8ccea0c984a1a4f2ac984eb045..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3
-
-
-class TestMKLDNNConcatOp(TestConcatOp):
-    def setUp(self):
-        super(TestMKLDNNConcatOp, self).setUp()
-        self.attrs["use_mkldnn"] = True
-        self._cpu_only = True
-
-    def test_check_grad(self):
-        pass
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNConcatOp2(TestConcatOp2):
-    def setUp(self):
-        super(TestMKLDNNConcatOp2, self).setUp()
-        self.attrs["use_mkldnn"] = True
-        self._cpu_only = True
-
-    def test_check_grad(self):
-        pass
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNConcatOp3(TestConcatOp3):
-    def setUp(self):
-        super(TestMKLDNNConcatOp3, self).setUp()
-        self.attrs["use_mkldnn"] = True
-        self._cpu_only = True
-
-    def test_check_grad(self):
-        pass
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
deleted file mode 100644
index 9413554db93e38f4c8d54318bfc0bbc14bb73fdd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
+++ /dev/null
@@ -1,347 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp
-
-
-def conv2d_forward_refer(input, filter, group, conv_param):
-    out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
-                                                          conv_param)
-    return out
-
-
-class TestConv2dInt8Op(TestConv2dOp):
-    def setUp(self):
-        self.op_type = "conv2d"
-        self.use_cudnn = False
-        self.exhaustive_search = False
-        self.use_cuda = False
-        self.use_mkldnn = False
-        self.data_format = "AnyLayout"
-        self.weighttype = np.float32
-        self.use_mkldnn = True
-        self.init_group()
-        self.init_dilation()
-        self.init_test_case()
-        self.init_fuse_relu()
-        self.init_fuse_residual()
-        self.init_data_type()
-
-        conv2d_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilation': self.dilations
-        }
-
-        filter = np.random.random(self.filter_size).astype(self.weighttype)
-        if self.srctype == np.uint8:
-            input = np.random.randint(0, 10,
-                                      self.input_size).astype(self.srctype)
-        else:
-            input = np.random.randint(-5, 5,
-                                      self.input_size).astype(self.srctype)
-            input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)
-
-        if self.srctype == np.int8:
-            filter_int = np.round(filter * self.scale_weights[0] *
-                                  0.5).astype(np.int32)
-            scale_output_shift = self.scale_out / (self.scale_in *
-                                                   self.scale_weights[0] * 0.5)
-            output1 = conv2d_forward_refer(
-                np.round((input.astype(np.int32) + input_shift) *
-                         self.scale_in).astype(np.int32), filter_int,
-                self.groups,
-                conv2d_param).astype(np.float32) * scale_output_shift
-            output2 = conv2d_forward_refer(
-                np.round((input_shift) * self.scale_in).astype(np.int32),
-                filter_int, self.groups,
-                conv2d_param).astype(np.float32) * scale_output_shift
-            if self.fuse_residual:
-                input_residual = np.random.randint(
-                    -5, 5, self.input_residual_size).astype(self.srctype)
-                output_tmp = np.round(output1 - output2 + input_residual.astype(
-                    self.srctype) * (self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
-                else:
-                    output = output_tmp.astype(self.dsttype)
-            else:
-                if self.fuse_activation == "relu":
-                    output = np.maximum(np.round(output1 - output2),
-                                        0).astype(self.dsttype)
-                else:
-                    output = np.round(output1 - output2).astype(self.dsttype)
-
-        else:
-            filter_int = np.round(filter *
-                                  self.scale_weights[0]).astype(np.int32)
-            scale_output_shift = self.scale_out / (self.scale_in *
-                                                   self.scale_weights[0])
-            output1 = conv2d_forward_refer(
-                input.astype(np.int32), filter_int, self.groups,
-                conv2d_param).astype(np.float32)
-            output1_tmp = np.round(output1 * (
-                self.scale_out / (self.scale_in * self.scale_weights[0])))
-
-            if self.fuse_residual:
-                input_residual = np.random.randint(
-                    0, 10, self.input_residual_size).astype(self.srctype)
-                output_tmp_res = np.round(output1 * (self.scale_out / (
-                    self.scale_in * self.scale_weights[
-                        0])) + input_residual.astype(np.int32) * (
-                            self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output_tmp_res, 0).astype(self.dsttype)
-                else:
-                    output = output_tmp_res.astype(self.dsttype)
-            else:
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output1_tmp, 0).astype(self.dsttype)
-                else:
-                    output = output1_tmp.astype(self.dsttype)
-
-        self.inputs = {
-            'Input':
-            OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)),
-            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
-        }
-        if self.fuse_residual:
-            self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
-                input_residual)
-
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'groups': self.groups,
-            'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn,
-            'use_mkldnn': self.use_mkldnn,
-            'data_format': self.data_format,
-            'exhaustive_search': self.exhaustive_search,
-            'Scale_in': self.scale_in,
-            'Scale_out': self.scale_out,
-            'Scale_weights': self.scale_weights,
-            'Scale_in_eltwise': self.scale_in_eltwise,
-            'fuse_activation': self.fuse_activation,
-            'fuse_residual_connection': self.fuse_residual
-        }
-        self.outputs = {'Output': output}
-
-    def test_check_output(self):
-        self.check_output_with_place(core.CPUPlace(), atol=0)
-
-    def test_check_grad(self):
-        pass
-
-    def test_check_grad_no_filter(self):
-        pass
-
-    def test_check_grad_no_input(self):
-        pass
-
-    def init_test_case(self):
-        TestConv2dOp.init_test_case(self)
-        self.input_size = [1, 1, 5, 5]  # NCHW
-        f_c = self.input_size[1] // self.groups
-        self.input_residual_size = [1, 2, 3, 3]
-        self.filter_size = [2, f_c, 3, 3]
-        self.scale_in = 1.0
-        self.scale_out = 0.5
-        self.scale_weights = [10.0]
-        self.scale_in_eltwise = 0.6
-
-    def init_data_type(self):
-        self.srctype = np.uint8
-        self.dsttype = np.int8
-
-    def init_fuse_relu(self):
-        self.fuse_activation = "relu"
-
-    def init_fuse_residual(self):
-        self.fuse_residual = True
-
-
-#--------------------test conv2d u8 in and u8 out with residual fuse--------------------
-
-
-class TestConv2d(TestConv2dInt8Op):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.input_residual_size = [2, 6, 3, 3]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
-        self.scale_out = 0.5
-        self.scale_weights = [10.0]
-        self.scale_in_eltwise = 0.6
-
-
-class TestWithPad(TestConv2d):
-    def init_test_case(self):
-        TestConv2d.init_test_case(self)
-        self.pad = [1, 1]
-        self.input_residual_size = [2, 6, 5, 5]
-
-
-class TestWithGroup(TestConv2d):
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWithStride(TestConv2dInt8Op):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 6, 6]
-        self.input_residual_size = [2, 6, 3, 3]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
-        self.scale_out = 0.8
-        self.scale_weights = [10.0]
-        self.scale_in_eltwise = 0.5
-
-
-class TestWith1x1(TestConv2dInt8Op):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [1, 3, 5, 5]
-        self.input_residual_size = [1, 6, 5, 5]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 1, 1]
-        self.scale_in = 1.0
-        self.scale_out = 0.5
-        self.scale_weights = [12.0]
-        self.scale_in_eltwise = 0.5
-
-
-class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 1, 1]
-        self.input_residual_size = [2, 6, 1, 1]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 1, 1]
-        self.scale_in = 1.0
-        self.scale_out = 0.5
-        self.scale_weights = [10.0]
-        self.scale_in_eltwise = 0.8
-
-    def init_group(self):
-        self.groups = 3
-
-
-def init_data_type_with_fusion(self, input_dt, fuse_activation, fuse_residual):
-    self.srctype = input_dt
-    self.dsttype = np.uint8 if fuse_activation == "relu" else np.int8
-
-    self.fuse_activation = fuse_activation
-
-    self.fuse_residual = fuse_residual
-
-
-def create_test_int8_class(parent):
-
-    #--------------------test conv2d s8 in and u8 out--------------------
-
-    class TestS8U8Case(parent):
-        def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, "relu", False)
-
-    #--------------------test conv2d s8 in and s8 out--------------------
-
-    class TestS8S8Case(parent):
-        def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, "", False)
-
-    #--------------------test conv2d u8 in and s8 out--------------------
-
-    class TestU8S8Case(parent):
-        def init_data_type(self):
-            init_data_type_with_fusion(self, np.uint8, "", False)
-
-    #--------------------test conv2d u8 in and u8 out without residual fuse--------------------
-
-    class TestU8U8Case(parent):
-        def init_data_type(self):
-            init_data_type_with_fusion(self, np.uint8, "relu", False)
-
-    #--------------------test conv2d s8 in and u8 out with residual fuse--------------------
-
-    class TestS8U8ResCase(parent):
-        def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, "relu", True)
-
-    #--------------------test conv2d s8 in and s8 out with residual fuse--------------------
-
-    class TestS8S8ResCase(parent):
-        def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, "", True)
-
-    #--------------------test conv2d u8 in and s8 out with residual fuse--------------------
-
-    class TestU8S8ResCase(parent):
-        def init_data_type(self):
-            init_data_type_with_fusion(self, np.uint8, "", True)
-
-    cls_name_s8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
-    cls_name_s8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
-    cls_name_u8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
-    cls_name_u8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
-    cls_name_s8u8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
-                                                            "1", "1")
-    cls_name_s8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
-                                                            "0", "1")
-    cls_name_u8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
-                                                            "0", "1")
-    TestS8U8Case.__name__ = cls_name_s8u8
-    TestS8S8Case.__name__ = cls_name_s8s8
-    TestU8S8Case.__name__ = cls_name_u8s8
-    TestU8U8Case.__name__ = cls_name_u8u8
-    TestS8U8ResCase.__name__ = cls_name_s8u8_re_1
-    TestS8S8ResCase.__name__ = cls_name_s8s8_re_1
-    TestU8S8ResCase.__name__ = cls_name_u8s8_re_1
-    globals()[cls_name_s8u8] = TestS8U8Case
-    globals()[cls_name_s8s8] = TestS8S8Case
-    globals()[cls_name_u8s8] = TestU8S8Case
-    globals()[cls_name_u8u8] = TestU8U8Case
-    globals()[cls_name_s8u8_re_1] = TestS8U8ResCase
-    globals()[cls_name_s8s8_re_1] = TestS8S8ResCase
-    globals()[cls_name_u8s8_re_1] = TestU8S8ResCase
-
-
-create_test_int8_class(TestConv2dInt8Op)
-create_test_int8_class(TestWithPad)
-create_test_int8_class(TestWithStride)
-create_test_int8_class(TestWithGroup)
-create_test_int8_class(TestWith1x1)
-create_test_int8_class(TestWithInput1x1Filter1x1)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
deleted file mode 100644
index 756d10a9c7d2b917b547d2a007e0aa5917642674..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp
-
-
-def conv2d_bias_naive(out, bias):
-    _, out_c, _, _ = out.shape
-
-    for l in range(out_c):
-        out[:, l, :, :] = out[:, l, :, :] + bias[l]
-    return out
-
-
-def conv2d_residual_naive(out, residual):
-    assert out.shape == residual.shape
-    out = np.add(out, residual)
-    return out
-
-
-class TestConv2dMKLDNNOp(TestConv2dOp):
-    def init_group(self):
-        self.groups = 1
-
-    def init_kernel_type(self):
-        self.data_format = "NCHW"
-        self.use_mkldnn = True
-        self._cpu_only = True
-
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-    def setUp(self):
-        self.fuse_bias = False
-        self.bias_size = None
-        self.fuse_activation = ""
-        self.fuse_alpha = 0
-        self.fuse_beta = 0
-        self.fuse_brelu_threshold = 6.0
-        self.fuse_residual_connection = False
-        self.input_residual_size = None
-        TestConv2dOp.setUp(self)
-
-        output = self.outputs['Output']
-
-        #mkldnn only support either conv-sum-relu, or conv-relu.
-        if self.fuse_bias and self.bias_size is not None:
-            bias = np.random.random(self.bias_size).astype(self.dtype)
-            output = conv2d_bias_naive(output, bias)
-            output = output.astype(self.dtype)
-            self.attrs['fuse_bias'] = self.fuse_bias
-            self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
-
-        if self.fuse_residual_connection and self.input_residual_size is not None:
-            input_residual = np.random.random(self.input_residual_size).astype(
-                self.dtype)
-            output = conv2d_residual_naive(output, input_residual)
-
-            self.attrs[
-                'fuse_residual_connection'] = self.fuse_residual_connection
-            self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
-                input_residual)
-
-        if self.fuse_activation == "relu":
-            output = np.maximum(output, 0).astype(self.dsttype)
-
-        if self.fuse_activation == "relu6":
-            output = np.minimum(np.maximum(output, 0),
-                                self.fuse_alpha).astype(self.dsttype)
-        output = output.astype(self.dtype)
-
-        self.attrs['fuse_bias'] = self.fuse_bias
-        self.attrs['fuse_activation'] = self.fuse_activation
-        self.attrs['fuse_alpha'] = self.fuse_alpha
-        self.attrs['fuse_beta'] = self.fuse_beta
-        self.attrs['fuse_brelu_threshold'] = self.fuse_brelu_threshold
-        self.attrs['fuse_residual_connection'] = self.fuse_residual_connection
-
-        self.outputs['Output'] = output
-
-
-class TestWithbreluFusion(TestConv2dMKLDNNOp):
-    def init_test_case(self):
-        TestConv2dMKLDNNOp.init_test_case(self)
-        self.fuse_activation = "relu6"
-        self.fuse_alpha = 6.0
-        self.dsttype = np.float32
-
-    def test_check_grad(self):
-        pass
-
-    def test_check_grad_no_filter(self):
-        pass
-
-    def test_check_grad_no_input(self):
-        pass
-
-
-class TestWithFuse(TestConv2dMKLDNNOp):
-    def init_test_case(self):
-        TestConv2dMKLDNNOp.init_test_case(self)
-        self.pad = [1, 1]
-        self.fuse_bias = True
-        self.bias_size = [6]
-        self.fuse_residual_connection = True
-        self.input_residual_size = [2, 6, 5, 5]
-
-    def test_check_grad(self):
-        pass
-
-    def test_check_grad_no_filter(self):
-        pass
-
-    def test_check_grad_no_input(self):
-        pass
-
-
-class TestWithPadWithBias(TestConv2dMKLDNNOp):
-    def init_test_case(self):
-        TestConv2dMKLDNNOp.init_test_case(self)
-        self.pad = [1, 1]
-        self.input_size = [2, 3, 6, 6]
-
-
-class TestWithStride(TestConv2dMKLDNNOp):
-    def init_test_case(self):
-        TestConv2dMKLDNNOp.init_test_case(self)
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 6, 6]
-
-
-class TestWithGroup(TestConv2dMKLDNNOp):
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWith1x1(TestConv2dMKLDNNOp):
-    def init_test_case(self):
-        TestConv2dMKLDNNOp.init_test_case(self)
-        self.filter_size = [6, 3, 1, 1]
-
-
-class TestWithInput1x1Filter1x1(TestConv2dMKLDNNOp):
-    def init_test_case(self):
-        TestConv2dMKLDNNOp.init_test_case(self)
-        self.input_size = [2, 3, 1, 1]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 1, 1]
-
-    def init_group(self):
-        self.groups = 3
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
deleted file mode 100644
index 33f5ea7ad6f2f1b78e17d49e5ea05b6c450d531e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-from paddle.fluid.tests.unittests.test_conv2d_transpose_op import conv2dtranspose_forward_naive, TestConv2dTransposeOp
-
-
-def conv2d_bias_naive(out, bias):
-    _, out_c, _, _ = out.shape
-
-    for l in range(out_c):
-        out[:, l, :, :] = out[:, l, :, :] + bias[l]
-    return out
-
-
-class TestConv2dTransposeMKLDNNOp(TestConv2dTransposeOp):
-    def test_check_grad(self):
-        return
-
-    def test_check_grad_no_input(self):
-        return
-
-    def test_check_grad_no_filter(self):
-        return
-
-    def init_op_type(self):
-        self.data_format = "NCHW"
-        self.op_type = "conv2d_transpose"
-        self._cpu_only = True
-
-    def init_test_case(self):
-        self.use_mkldnn = True
-        self.is_test = True
-        self.pad = [0, 0]
-        self.fuse_bias = False
-        self.bias_size = None
-        self.fuse_activation = ""
-        self.fuse_alpha = 0.0
-        self.fuse_beta = 0.0
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-        self.groups = 1
-
-    def setUp(self):
-        TestConv2dTransposeOp.setUp(self)
-
-        output = self.outputs['Output']
-
-        if self.fuse_bias and self.bias_size is not None:
-            bias = np.random.random(self.bias_size).astype(self.dtype)
-            output = conv2d_bias_naive(output, bias)
-            output = output.astype(self.dtype)
-            self.attrs['fuse_bias'] = self.fuse_bias
-            self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
-
-        if self.fuse_activation == "relu":
-            output = np.maximum(output, 0).astype(self.dtype)
-        output = output.astype(self.dtype)
-
-        self.attrs['fuse_activation'] = self.fuse_activation
-        self.attrs['fuse_alpha'] = self.fuse_alpha
-        self.attrs['fuse_beta'] = self.fuse_beta
-
-        self.outputs['Output'] = output
-
-
-class TestMKLDNNFuseBias(TestConv2dTransposeMKLDNNOp):
-    def init_test_case(self):
-        TestConv2dTransposeMKLDNNOp.init_test_case(self)
-        self.pad = [1, 1]
-        self.fuse_bias = True
-        self.bias_size = [6]
-
-
-class TestMKLDNNWithPad(TestConv2dTransposeMKLDNNOp):
-    def init_test_case(self):
-        TestConv2dTransposeMKLDNNOp.init_test_case(self)
-        self.pad = [1, 1]
-        self.input_size = [2, 3, 10, 10]
-
-
-class TestMKLDNNWithStride(TestConv2dTransposeMKLDNNOp):
-    def init_test_case(self):
-        TestConv2dTransposeMKLDNNOp.init_test_case(self)
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 6, 6]  # NCHW
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
deleted file mode 100644
index 080b74502fbe83e97e88a65866e0d9b66b37033e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-from paddle.fluid.tests.unittests.test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
-
-
-class TestMKLDNN(TestConv3dOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-
-
-class TestMKLDNNCase1(TestCase1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-
-
-class TestMKLDNNGroup1(TestWithGroup1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-
-
-class TestMKLDNNGroup2(TestWithGroup2):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-
-
-class TestMKLDNNWith1x1(TestWith1x1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-
-
-class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
deleted file mode 100644
index 9a54f927cbde648bbbb06d043bbc1391ee43c314..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-
-class TestDeQuantizeOp(OpTest):
-    def setUp(self):
-        self.op_type = 'dequantize'
-        self.scale = 2.0
-        self.input_size = [1, 1, 5, 5]  #Naive nChw16c
-        self.data_type = 'int8'
-        self.set_scale()
-        self.set_data_type()
-
-        if self.data_type == 'int8':
-            input = (np.random.randint(0, 100, self.input_size) - 50
-                     ).astype(self.data_type)
-            output = (input * (1 / self.scale)).astype('float')
-        else:
-            input = (np.random.randint(0, 100,
-                                       self.input_size)).astype(self.data_type)
-            output = (input * (1 / self.scale)).astype('float')
-
-        self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)}
-
-        self.outputs = {'Output': output}
-
-        self.attrs = {'Scale': self.scale, }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def set_scale(self):
-        pass
-
-    def set_data_type(OpTest):
-        pass
-
-
-class TestDeQuantizeOp1(TestDeQuantizeOp):
-    def set_scale(self):
-        self.scale = 1.5
-
-    def set_data_type(self):
-        self.data_type = 'int8'
-
-
-class TestDeQuantizeOp2(TestDeQuantizeOp):
-    def set_scale(self):
-        self.scale = 0.8
-
-    def set_data_type(self):
-        self.data_type = 'uint8'
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
deleted file mode 100644
index c3a42656b71d09dbc22abf8ce2ddc243b43b422f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_elementwise_add_op import *
-'''
-Some tests differ from the tests defined in test_elementwise_add_op.py
-because MKLDNN does not support tensors of number of dimensions 3.
-Such dimensions cause exceptions in MKLDNN reorder primitive.
-'''
-
-
-class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
-        self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
-        self.out = np.add(self.x, self.y)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(1).astype(self.dtype)
-        self.out = self.x + self.y
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(1, 1).astype(self.dtype)
-        self.out = self.x + self.y
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TesMKLDNNtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(2).astype(self.dtype)
-        self.out = self.x + self.y.reshape(2, 1, 1, 1)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(3).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 3, 1, 1)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(4).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 1, 1, 4)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_rowwise_add_0(
-        TestElementwiseAddOp_rowwise_add_0):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(3, 4).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 3, 4, 1)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_rowwise_add_1(
-        TestElementwiseAddOp_rowwise_add_1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_channelwise_add(
-        TestElementwiseAddOp_channelwise_add):
-    def init_input_output(self):
-        self.x = np.random.rand(3, 5, 20, 20).astype(self.dtype)
-        self.y = np.random.rand(3, 1, 1, 1).astype(self.dtype)
-        self.out = self.x + self.y
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
deleted file mode 100644
index 043c544f26aed1e632e828d7d7bfec5627fac3f7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
+++ /dev/null
@@ -1,330 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.tests.unittests.test_elementwise_mul_op import *
-from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive
-from paddle.fluid.tests.unittests.mkldnn.mkldnn_op_test import __assert_close
-import paddle.fluid as fluid
-
-
-# For UT coverage, integrate conv2d + elementwise-mul so that nchw16C could be automatically chosen when mkldnn-kernel is enabled
-class TestElementwiseMulMKLDNNOp_Integrated_With_Convs(ElementwiseMulOp):
-    def setUp(self):
-        self.dtype = np.float32
-        self.init_dtype()
-        self.init_kernel_type()
-        self.init_axis()
-        self._cpu_only = True
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.groups = 1
-        self.input_size = [1, 3, 5, 5]  # NCHW
-        self.filter_size = [16, 3, 3, 3]
-        self.filter_size2 = [1, 16, 2, 2]
-        self.dilations = [1, 1]
-        self.use_cudnn = False
-        self.data_format = "NCHW"
-        self.input = np.random.random(self.input_size).astype(self.dtype)
-        self.filter = np.random.random(self.filter_size).astype(self.dtype)
-        self.filter2 = np.random.random(self.filter_size2).astype(self.dtype)
-        self.elt_mul_y_size = [1, 16]
-        self.elt_mul_y = np.random.random(self.elt_mul_y_size).astype(
-            self.dtype)
-        conv2d_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilation': self.dilations
-        }
-        conv_out, _, _, _, _ = conv2d_forward_naive(
-            self.input, self.filter, self.groups, conv2d_param)  #[1, 16, 2, 2]
-        self.conv_output = conv_out
-        self.elt_mul_output = self.conv_output * self.elt_mul_y.reshape(
-            1, 16, 1, 1)  # the result shape is [1, 16, 2, 2]
-        conv_output2, _, _, _, _ = conv2d_forward_naive(
-            self.elt_mul_output, self.filter2, self.groups, conv2d_param)
-        self.conv_output2 = conv_output2
-        self.fetch_list = ["conv_output2"]
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-    def init_axis(self):
-        self.axis = 0
-
-    def test_check_output(self):
-        ground_truth = {
-            "input": self.input,
-            "filter": self.filter,
-            "filter2": self.filter2,
-            "conv_output": self.conv_output,
-            "elt_mul_y": self.elt_mul_y,
-            "elt_mul_output": self.elt_mul_output,
-            "conv_output2": self.conv_output2,
-        }
-        program = fluid.Program()
-        with fluid.program_guard(program):
-            block = program.global_block()
-            for name in ground_truth:
-                block.create_var(
-                    name=name, dtype="float32", shape=ground_truth[name].shape)
-            conv2d_op = block.append_op(
-                type="conv2d",
-                inputs={
-                    "Input": block.var('input'),
-                    'Filter': block.var('filter')
-                },
-                outputs={"Output": block.var('conv_output')},
-                attrs={
-                    'strides': self.stride,
-                    'paddings': self.pad,
-                    'groups': self.groups,
-                    'dilations': self.dilations,
-                    'use_cudnn': self.use_cudnn,
-                    'use_mkldnn': self.use_mkldnn
-                })
-            elementwise_mul_op = block.append_op(
-                type="elementwise_mul",
-                inputs={
-                    'X': block.var('conv_output'),
-                    'Y': block.var('elt_mul_y'),
-                },
-                outputs={"Out": block.var('elt_mul_output')},
-                attrs={
-                    'use_cudnn': self.use_cudnn,
-                    'use_mkldnn': self.use_mkldnn,
-                    'axis': self.axis
-                })
-            conv2d_op2 = block.append_op(
-                type="conv2d",
-                inputs={
-                    "Input": block.var('elt_mul_output'),
-                    'Filter': block.var('filter2')
-                },
-                outputs={"Output": block.var('conv_output2')},
-                attrs={
-                    'strides': self.stride,
-                    'paddings': self.pad,
-                    'groups': self.groups,
-                    'dilations': self.dilations,
-                    'use_cudnn': self.use_cudnn,
-                    'use_mkldnn': self.use_mkldnn,
-                    'data_format': self.data_format
-                })
-            place = core.CPUPlace()
-            exe = fluid.Executor(place)
-            out = exe.run(
-                program,
-                feed={
-                    name: ground_truth[name]
-                    for name in ["input", "filter", "filter2", "elt_mul_y"]
-                },
-                fetch_list=self.fetch_list)
-
-            for id, name in enumerate(self.fetch_list):
-                self.assertTrue(
-                    np.allclose(
-                        ground_truth[name], out[id], atol=1e-4), name)
-
-    def test_check_grad_normal(self):
-        pass
-
-    def test_check_grad_ingore_x(self):
-        pass
-
-    def test_check_grad_ingore_y(self):
-        pass
-
-
-class TestElementwiseMulMKLDNNOp_FallbackNCHW(ElementwiseMulOp):
-    def init_input_output(self):
-        self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
-        self.y = np.random.rand(1, 16).astype(self.dtype)
-
-        self.out = self.x * self.y.reshape(1, 16, 1, 1)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-    def init_axis(self):
-        self.axis = 0
-
-    def test_check_grad_normal(self):
-        pass
-
-    def test_check_grad_ingore_x(self):
-        pass
-
-    def test_check_grad_ingore_y(self):
-        pass
-
-
-class TestElementwiseMulMKLDNNOp_FallbackNCHW16C(ElementwiseMulOp):
-    def init_input_output(self):
-        x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
-        self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
-        y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
-        self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
-
-        self.out = x * y
-
-    def setUp(self):
-        super(TestElementwiseMulMKLDNNOp_FallbackNCHW16C, self).setUp()
-        self.attrs["x_data_format"] = "nchw16c"
-        self.attrs["y_data_format"] = "nchw16c"
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-    def init_axis(self):
-        self.axis = 0
-
-    def test_check_grad_normal(self):
-        pass
-
-    def test_check_grad_ingore_x(self):
-        pass
-
-    def test_check_grad_ingore_y(self):
-        pass
-
-
-class TestElementwiseMulMKLDNNOp_FallbackNoReorders(ElementwiseMulOp):
-    def init_input_output(self):
-        x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
-        self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
-        y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
-        self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
-
-        self.out = x * y
-
-    def setUp(self):
-        super(TestElementwiseMulMKLDNNOp_FallbackNoReorders, self).setUp()
-        self.attrs["x_data_format"] = "nchw16c"
-        self.attrs["y_data_format"] = "nchw16c"
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-    def init_axis(self):
-        self.axis = 0
-
-    def test_check_grad_normal(self):
-        pass
-
-    def test_check_grad_ingore_x(self):
-        pass
-
-    def test_check_grad_ingore_y(self):
-        pass
-
-
-class TestElementwiseMulMKLDNNOp_FallbackWithReorder1(ElementwiseMulOp):
-    def init_input_output(self):
-        self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
-        y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
-        self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
-
-        self.out = self.x * y
-
-    def setUp(self):
-        super(TestElementwiseMulMKLDNNOp_FallbackWithReorder1, self).setUp()
-        self.attrs["x_data_format"] = "nchw"
-        self.attrs["y_data_format"] = "nchw16c"
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-    def init_axis(self):
-        self.axis = 0
-
-    def test_check_grad_normal(self):
-        pass
-
-    def test_check_grad_ingore_x(self):
-        pass
-
-    def test_check_grad_ingore_y(self):
-        pass
-
-
-class TestElementwiseMulMKLDNNOp_FallbackWithReorder2(ElementwiseMulOp):
-    def init_input_output(self):
-        self.y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
-        x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
-        self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
-
-        self.out = x * self.y
-
-    def setUp(self):
-        super(TestElementwiseMulMKLDNNOp_FallbackWithReorder2, self).setUp()
-        self.attrs["x_data_format"] = "nchw16c"
-        self.attrs["y_data_format"] = "nchw"
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-    def init_axis(self):
-        self.axis = 0
-
-    def test_check_grad_normal(self):
-        pass
-
-    def test_check_grad_ingore_x(self):
-        pass
-
-    def test_check_grad_ingore_y(self):
-        pass
-
-
-class TestElementwiseMulMKLDNNOp_FallbackNoReorders2(ElementwiseMulOp):
-    def init_input_output(self):
-        self.x = np.random.rand(1, 16).astype(self.dtype)
-        self.y = np.random.rand(1, 16).astype(self.dtype)
-
-        self.out = self.x * self.y
-
-    def setUp(self):
-        super(TestElementwiseMulMKLDNNOp_FallbackNoReorders2, self).setUp()
-        self.attrs["x_data_format"] = "nc"
-        self.attrs["y_data_format"] = "nc"
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-    def init_axis(self):
-        self.axis = 0
-
-    def test_check_grad_normal(self):
-        pass
-
-    def test_check_grad_ingore_x(self):
-        pass
-
-    def test_check_grad_ingore_y(self):
-        pass
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
deleted file mode 100644
index 8f0a9898dce32b7162a710ed23bde4c1f7c7a1ff..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-
-def fully_connected_naive(input, weights, bias_data=None):
-    result = None
-
-    if not bias_data:
-        result = np.dot(input, weights)
-    else:
-        result = np.dot(input, weights) + bias_data
-
-    return result
-
-
-class MatrixGenerate:
-    def __init__(self, mb, ic, oc, h, w):
-        self.input = np.random.random((mb, ic * h * w)).astype("float32")
-        self.weights = np.random.random((ic * h * w, oc)).astype("float32")
-
-
-class TestFCMKLDNNOp(OpTest):
-    def create_data(self):
-        self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
-
-    def setUp(self):
-        self.op_type = "fc"
-        self.use_mkldnn = True
-        self.create_data()
-        self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
-
-        self.attrs = {'use_mkldnn': self.use_mkldnn, }
-
-        self.outputs = {
-            'Out': fully_connected_naive(self.matrix.input, self.matrix.weights)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        pass
-
-    def test_check_grad_no_weight(self):
-        pass
-
-
-class TestFCMKLDNNOp1(TestFCMKLDNNOp):
-    def create_data(self):
-        self.matrix = MatrixGenerate(2, 15, 48, 2, 2)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
deleted file mode 100644
index 5ecf8cc80f7eb12b7ecd3a2238d92b2e71ceaa6d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-from paddle.fluid.tests.unittests.test_gaussian_random_op import TestGaussianRandomOp
-
-
-class TestMKLDNNGaussianRandomOpSeed10(TestGaussianRandomOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNGaussianRandomOpSeed0(TestGaussianRandomOp):
-    def setUp(self):
-        TestGaussianRandomOp.setUp(self)
-        self.attrs = {
-            "shape": [1000, 784],
-            "mean": .0,
-            "std": 1.,
-            "seed": 0,
-            "use_mkldnn": self.use_mkldnn
-        }
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
deleted file mode 100644
index a5e6e116a5f1bc1e051ce3cfdac8cd1e5f3ed90e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_lrn_op import TestLRNOp
-
-
-class TestLRNMKLDNNOp(TestLRNOp):
-    def get_attrs(self):
-        attrs = TestLRNOp.get_attrs(self)
-        attrs['use_mkldnn'] = True
-        return attrs
-
-    def test_check_output(self):
-        self.check_output(atol=0.002)
-
-
-class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
-    def get_attrs(self):
-        attrs = TestLRNMKLDNNOp.get_attrs(self)
-        attrs['is_test'] = True
-        return attrs
-
-    def test_check_grad_normal(self):
-        def check_raise_is_test():
-            try:
-                self.check_grad(['X'], 'Out', max_relative_error=0.01)
-            except Exception as e:
-                t = \
-                "is_test attribute should be set to False in training phase."
-                if t in str(e):
-                    raise AttributeError
-
-        self.assertRaises(AttributeError, check_raise_is_test)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py
deleted file mode 100644
index 51ab00e1915274a768487970e907eaaf0b9a903f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest
-'''
- test case for s8 * s8
-'''
-
-
-class TestMKLDNNMulOpS8S8(OpTest):
-    def setUp(self):
-        self.op_type = "mul"
-        self.init_kernel_type()
-        self.init_data_type()
-        self.init_data()
-        self.attrs = {
-            "use_mkldnn": self.use_mkldnn,
-            "scale_x": self.scale_x,
-            "scale_y": self.scale_y,
-            "scale_out": self.scale_out,
-            "force_fp32_output": self.force_fp32,
-        }
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.force_fp32 = True
-
-    def init_data_type(self):
-        self.srctype = np.uint8
-        self.dsttype = np.float32 if self.force_fp32 else np.int8
-
-    def init_data(self):
-        self.scale_x = 0.6
-        self.scale_y = [0.8]
-        self.scale_out = 1.0
-
-        # limit random range inside |-127, 127| to avoid overflow on SKL
-        if self.srctype == np.int8:
-            A_data = np.random.randint(-127, 127, (2, 5)).astype(np.int8)
-        else:
-            A_data = np.random.randint(0, 127, (2, 5)).astype(np.uint8)
-
-        B_data = np.random.uniform(-127, 127, (5, 3)).astype(np.float32)
-
-        quant_B = np.round(B_data * self.scale_y[0]).astype(np.int)
-        output = np.dot(A_data, quant_B)
-
-        scale_output_shift = (self.scale_out) / \
-            (self.scale_x * self.scale_y[0])
-
-        if (self.force_fp32):
-            output = (output * scale_output_shift).astype(self.dsttype)
-        else:
-            output = np.round(output * scale_output_shift).astype(self.dsttype)
-
-        self.inputs = {'X': A_data, 'Y': B_data}
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output_with_place(core.CPUPlace(), atol=0)
-
-    def test_check_grad_normal(self):
-        pass
-
-    def test_check_grad_ingore_x(self):
-        pass
-
-    def test_check_grad_ingore_y(self):
-        pass
-
-
-'''
- test case for  s8 * u8 
-'''
-
-
-class TestMKLDNNMulOpS8U8(TestMKLDNNMulOpS8S8):
-    def init_data_type(self):
-        self.srctype = np.uint8
-        self.dsttype = np.float32 if self.force_fp32 else np.int8
-
-
-'''
- test case for  s8 * s8 
-'''
-
-
-class TestMKLDNNMulOpS8S8WithFlatten(TestMKLDNNMulOpS8S8):
-    def setUp(self):
-        self.op_type = "mul"
-        self.init_kernel_type()
-        self.init_data_type()
-        self.init_data()
-        self.attrs = {
-            "use_mkldnn": self.use_mkldnn,
-            "scale_x": self.scale_x,
-            "scale_y": self.scale_y,
-            "scale_out": self.scale_out,
-            "force_fp32_output": self.force_fp32,
-            "x_num_col_dims": 2,
-            "y_num_col_dims": 2,
-        }
-
-    def init_data(self):
-        self.scale_x = 0.6
-        self.scale_y = [0.8]
-        self.scale_out = 1.0
-
-        # limit random range inside |-127, 127| to avoid overflow on SKL
-        if self.srctype == np.int8:
-            A_data = np.random.randint(-127, 127, (3, 4, 4, 3)).astype(np.int8)
-        else:
-            A_data = np.random.randint(0, 127, (3, 4, 4, 3)).astype(np.uint8)
-
-        B_data = np.random.uniform(-127, 127,
-                                   (2, 6, 1, 2, 3)).astype(np.float32)
-
-        A_data_reshape = A_data.reshape(3 * 4, 4 * 3)
-        B_data_reshape = B_data.reshape(2 * 6, 1 * 2 * 3)
-
-        quant_B = np.round(B_data_reshape * self.scale_y[0]).astype(np.int)
-        output = np.dot(A_data_reshape, quant_B)
-
-        scale_output_shift = (self.scale_out) / \
-            (self.scale_x * self.scale_y[0])
-
-        if (self.force_fp32):
-            output = (output * scale_output_shift).astype(self.dsttype)
-        else:
-            output = np.round(output * scale_output_shift).astype(self.dsttype)
-
-        output = output.reshape(3, 4, 1, 2, 3)
-
-        self.inputs = {'X': A_data, 'Y': B_data}
-        self.outputs = {'Out': output}
-
-
-'''
- test case for  s8 * u8 
-'''
-
-
-class TestMKLDNNMulOpS8U8WithFlatten(TestMKLDNNMulOpS8S8WithFlatten):
-    def init_data_type(self):
-        self.srctype = np.uint8
-        self.dsttype = np.float32 if self.force_fp32 else np.int8
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
deleted file mode 100644
index fca906fecc5fe8d25b9251c886398f8df778043f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from __future__ import division
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive
-
-
-class TestPool2dMKLDNNInt8_Op(TestPool2D_Op):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-    def init_data_type(self):
-        self.dtype = np.int8
-
-    def setUp(self):
-        TestPool2D_Op.setUp(self)
-        assert self.dtype in [np.int8, np.uint8
-                              ], 'Dtype should be int8 or uint8'
-
-    def test_check_output(self):
-        self.check_output_with_place(core.CPUPlace(), atol=1e-5)
-
-    def test_check_grad(self):
-        pass
-
-
-class TestCase1Avg(TestPool2dMKLDNNInt8_Op):
-    def init_test_case(self):
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
-
-    def init_global_pool(self):
-        self.global_pool = False
-
-
-class TestCase2Avg(TestPool2dMKLDNNInt8_Op):
-    def init_test_case(self):
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
-
-    def init_global_pool(self):
-        self.global_pool = False
-
-
-class TestCase0Max(TestPool2dMKLDNNInt8_Op):
-    def init_pool_type(self):
-        self.pool_type = "max"
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-
-
-class TestCase1Max(TestCase1Avg):
-    def init_pool_type(self):
-        self.pool_type = "max"
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-
-
-class TestCase2Max(TestCase2Avg):
-    def init_pool_type(self):
-        self.pool_type = "max"
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-
-
-def create_test_s8_u8_class(parent):
-    class TestS8Case(parent):
-        def init_data_type(self):
-            self.dtype = np.int8
-
-    class TestU8Case(parent):
-        def init_data_type(self):
-            self.dtype = np.uint8
-
-    cls_name_s8 = "{0}_{1}".format(parent.__name__, "mkldnn_s8")
-    cls_name_u8 = "{0}_{1}".format(parent.__name__, "mkldnn_u8")
-    TestS8Case.__name__ = cls_name_s8
-    TestU8Case.__name__ = cls_name_u8
-    globals()[cls_name_s8] = TestS8Case
-    globals()[cls_name_u8] = TestU8Case
-
-
-create_test_s8_u8_class(TestPool2dMKLDNNInt8_Op)
-create_test_s8_u8_class(TestCase1Avg)
-create_test_s8_u8_class(TestCase2Avg)
-create_test_s8_u8_class(TestCase0Max)
-create_test_s8_u8_class(TestCase1Max)
-create_test_s8_u8_class(TestCase2Max)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
deleted file mode 100644
index feb2a563eeaed7a83a82ec56ec08a0ed8664d126..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
-
-
-def create_test_mkldnn_use_ceil_class(parent):
-    class TestMKLDNNPool2DUseCeilCase(parent):
-        def init_kernel_type(self):
-            self.use_mkldnn = True
-
-        def init_ceil_mode(self):
-            self.ceil_mode = True
-
-    cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNCeilModeCast")
-    TestMKLDNNPool2DUseCeilCase.__name__ = cls_name
-    globals()[cls_name] = TestMKLDNNPool2DUseCeilCase
-
-
-create_test_mkldnn_use_ceil_class(TestPool2D_Op)
-create_test_mkldnn_use_ceil_class(TestCase1)
-create_test_mkldnn_use_ceil_class(TestCase2)
-
-
-def create_test_mkldnn_class(parent):
-    class TestMKLDNNCase(parent):
-        def init_kernel_type(self):
-            self.use_mkldnn = True
-
-    cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNOp")
-    TestMKLDNNCase.__name__ = cls_name
-    globals()[cls_name] = TestMKLDNNCase
-
-
-create_test_mkldnn_class(TestPool2D_Op)
-create_test_mkldnn_class(TestCase1)
-create_test_mkldnn_class(TestCase2)
-create_test_mkldnn_class(TestCase3)
-create_test_mkldnn_class(TestCase4)
-create_test_mkldnn_class(TestCase5)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py
deleted file mode 100644
index 132f7bd039f7797fb0fc332d6f7b8c242af46535..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-
-class TestQuantizeOp(OpTest):
-    def setUp(self):
-        self.op_type = 'quantize'
-        self.scale = 2.0
-        self.input_size = [1, 1, 5, 5]  #Naive nChw16c
-        self.is_negative = False
-        self.set_scale()
-        self.set_is_negative()
-
-        if self.is_negative:
-            input = (100 * np.random.random_sample(self.input_size) - 50
-                     ).astype('float32')
-            output = np.round(input * self.scale).astype('int8')
-        else:
-            input = (100 *
-                     np.random.random_sample(self.input_size)).astype('float32')
-            output = np.round(input * self.scale).astype('uint8')
-
-        self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)}
-
-        self.outputs = {'Output': output}
-
-        self.attrs = {
-            'Scale': self.scale,
-            'is_negative_input': self.is_negative
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def set_scale(self):
-        pass
-
-    def set_is_negative(self):
-        pass
-
-
-class TestQuantizeOp1(TestQuantizeOp):
-    def set_scale(self):
-        self.scale = 1.5
-
-    def set_is_negative(self):
-        self.is_nagative = True
-
-
-class TestQuantizeOp2(TestQuantizeOp):
-    def set_scale(self):
-        self.scale = 0.1
-
-    def set_is_negative(self):
-        self.is_nagative = False
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py
deleted file mode 100644
index b7a4683558539d3f9daa6a1146355acc3ff2bab7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-from mkldnn_op_test import format_reorder
-
-
-class TestReQuantizeOp(OpTest):
-    def setUp(self):
-        self.op_type = 'requantize'
-        self.scale_in = 2.0
-        self.scale_out = 1.5
-        self.input_size = [1, 1, 5, 5]
-        self.data_type = 'int8'
-        self.set_scale()
-        self.set_data_type()
-
-        scale_shift = self.scale_out / self.scale_in
-
-        if self.data_type == 'int8':
-            input = (np.random.randint(0, 100, self.input_size) - 50
-                     ).astype(self.data_type)
-            output_tmp = np.round(input.astype('float32') *
-                                  scale_shift).astype('int8')
-        else:
-            input = (np.random.randint(0, 100,
-                                       self.input_size)).astype(self.data_type)
-            output_tmp = np.round(input.astype('float32') *
-                                  scale_shift).astype('uint8')
-
-        output = format_reorder(output_tmp, self.input_size)
-
-        self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)}
-
-        self.outputs = {'Output': output}
-
-        self.attrs = {'Scale_in': self.scale_in, 'Scale_out': self.scale_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def set_scale(self):
-        pass
-
-    def set_data_type(OpTest):
-        pass
-
-
-#--------------------test requantize with s8 input--------------------
-
-
-class TestReQuantizeOp1(TestReQuantizeOp):
-    def set_scale(self):
-        self.scale_in = 1.5
-        self.scale_out = 1.5
-
-
-class TestReQuantizeOp2(TestReQuantizeOp):
-    def set_scale(self):
-        self.scale_in = 0.1
-        self.scale_out = 0.2
-
-
-#--------------------test requantize with u8 input--------------------
-
-
-class TestReQuantizeOp3(TestReQuantizeOp1):
-    def set_data_type(self):
-        self.data_type = 'uint8'
-
-
-class TestReQuantizeOp4(TestReQuantizeOp2):
-    def set_data_type(self):
-        self.data_type = 'uint8'
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
deleted file mode 100644
index 11d79d35944005009aadeddd19dc35bf561de6cd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.test_softmax_op import *
-from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
-
-
-class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestSoftmaxMKLDNNOp2(TestSoftmaxOp2):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestSoftmaxMKLDNNOp3(TestSoftmaxOp3):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestSoftmaxMKLDNNOp4(TestSoftmaxOp4):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestSoftmaxMKLDNNOp5(TestSoftmaxOp5):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestSoftmaxMKLDNNOp6(TestSoftmaxOp6):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-# Check if primitives already exist in backward
-class TestSoftmaxMKLDNNPrimitivesAlreadyExist(unittest.TestCase):
-    def setUp(self):
-        super(TestSoftmaxMKLDNNPrimitivesAlreadyExist, self).setUp()
-
-        np.random.seed(123)
-        self.op_type = 'softmax'
-        self.x = np.random.uniform(-1, 1, 2).astype(np.float32)
-        self.out = stable_softmax(self.x)
-        self.out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
-        self.x_grad = self.__softmax_bwd(self.out, self.out_grad)
-
-    # Softmax grad calculation
-    def __softmax_bwd(self, out, out_grad):
-        return out * (out_grad - np.dot(out, out_grad))
-
-    def test_check(self):
-        check_if_mkldnn_primitives_exist_in_bwd(
-            self, self.op_type, self.x, self.out, self.out_grad, self.x_grad)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py
deleted file mode 100644
index 5928047b5171bcf33b024040ce79577b8aa0b53a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-from paddle.fluid.tests.unittests.test_sum_op import TestSumOp
-
-
-class TestMKLDNN(TestSumOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
deleted file mode 100644
index a8127bcc781378fa5ef4a189a0b14d079a793946..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-from mkldnn_op_test import format_reorder
-
-
-class TestTransposeOp(OpTest):
-    def setUp(self):
-        self.init_op_type()
-        self.initTestCase()
-        self.initInputData()
-        self.use_mkldnn = True
-        self.axis = (0, 2, 3, 1)
-
-        self.inputs = {
-            'X': format_reorder(self.input_data, self.shape)
-        }  #transform data format to 'NHWC' for INT8 transpose specially.
-
-        self.attrs = {
-            'axis': list(self.axis),
-            'use_mkldnn': self.use_mkldnn,
-        }
-
-        self.outputs = {
-            'XShape': np.random.random(self.shape).astype('int8'),
-            'Out': self.inputs['X'].transpose(self.axis)
-        }
-
-    def init_op_type(self):
-        self.op_type = "transpose2"
-
-    def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
-
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 5)
-
-    def initInputData(self):
-        self.input_data = (
-            np.random.randint(0, 100, self.shape) - 50).astype('int8')
-
-
-class TestINT8Case(TestTransposeOp):
-    def initTestCase(self):
-        self.shape = (2, 4, 6, 8)
-
-    def initInputData(self):
-        self.input_data = (
-            np.random.randint(0, 100, self.shape) - 50).astype('int8')
-
-
-class TestUINT8Case(TestTransposeOp):
-    def initTestCase(self):
-        self.shape = (1, 3, 5, 7)
-
-    def initDataType(self):
-        self.input_data = (np.random.randint(0, 100,
-                                             self.shape)).astype('uint8')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py
deleted file mode 100644
index 4845eefe367f1ad6a2eb6ffd1f9b0598b1b4fbbd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-from paddle.fluid.tests.unittests.test_transpose_op import TestTransposeOp
-
-
-class TestTransposeMKLDNN(TestTransposeOp):
-    def init_op_type(self):
-        self.op_type = "transpose2"
-        self.use_mkldnn = True
-        return
-
-
-class TestCase0MKLDNN(TestTransposeMKLDNN):
-    def initTestCase(self):
-        self.shape = (3, )
-        self.axis = (0, )
-
-
-class TestCase1a(TestTransposeMKLDNN):
-    def initTestCase(self):
-        self.shape = (3, 4, 5)
-        self.axis = (0, 2, 1)
-
-
-class TestCase1b(TestTransposeMKLDNN):
-    def initTestCase(self):
-        self.shape = (3, 4, 5)
-        self.axis = (2, 1, 0)
-
-
-class TestCase2(TestTransposeMKLDNN):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 5)
-        self.axis = (0, 2, 3, 1)
-
-
-class TestCase3(TestTransposeMKLDNN):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 5, 6)
-        self.axis = (4, 2, 3, 1, 0)
-
-
-class TestCase4(TestTransposeMKLDNN):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 5, 6, 1)
-        self.axis = (4, 2, 3, 1, 0, 5)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/multi_process.py b/python/paddle/fluid/tests/unittests/multi_process.py
deleted file mode 100644
index a67634adfcc0c27d5c9b470c81b880af9130462f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/multi_process.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import time
-
-
-def train():
-    selected_gpus = os.getenv("FLAGS_selected_gpus")
-    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-    worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
-    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-    worker_endpoints = worker_endpoints_env
-    trainers_num = len(worker_endpoints.split(','))
-
-    name = "selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
-        .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
-
-    print(name)
-    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
-        f.write(name)
-
-
-def train_abort():
-    selected_gpus = os.getenv("FLAGS_selected_gpus")
-    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-    worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
-    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-    worker_endpoints = worker_endpoints_env
-    trainers_num = len(worker_endpoints.split(','))
-
-    if trainer_id == 0:
-        try:
-            # train abort 
-            exit(1)
-        except SystemExit:
-            name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
-                .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
-            print(name)
-            with open("multi_process.check_{}.log".format(trainer_id),
-                      "w") as f:
-                f.write(name)
-            raise
-    else:
-        # sleep 30s to make sure paddle.distributed.launch will terminate this process
-        time.sleep(30)
-        name = "selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
-            .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
-
-        print(name)
-        with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
-            f.write(name)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) == 2 and sys.argv[1] == "abort":
-        train_abort()
-    else:
-        train()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt
deleted file mode 100644
index 5ed2d0aa80cd0462d3ac1902a2ec13fc2c1bd844..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP}  ENVS FLAGS_use_ngraph=true)
-endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/ngraph/__init__.py b/python/paddle/fluid/tests/unittests/ngraph/__init__.py
deleted file mode 100644
index b94a21a7e406b833797f8f521c62a2351c2bc30a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
deleted file mode 100644
index 5298c3c2f6f0113977342ab3e09830027585ada1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
deleted file mode 100644
index a7f167cbd415c9000311aa45bef0432f61e668ea..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_activation_op import TestAbs, TestGelu, TestSigmoid, TestSquare, TestRelu, TestTanh
-
-
-class TestNGRAPHReluDim4(TestRelu):
-    def setUp(self):
-        super(TestNGRAPHReluDim4, self).setUp()
-
-        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-
-class TestNGRAPHTanhDim4(TestTanh):
-    def setUp(self):
-        super(TestNGRAPHTanhDim4, self).setUp()
-
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.tanh(self.inputs['X'])}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_adam_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_adam_ngraph_op.py
deleted file mode 100644
index ef2aedf65f4c0cc182738c7a7a538095f8f628d5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_adam_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_adam_op import TestAdamOp1, TestAdamOp2, TestAdamOpMultipleSteps, TestSparseAdamOp
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_assign_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_assign_ngraph_op.py
deleted file mode 100644
index 2c3e7ee6cce75a4176feed14ff4a20d2961e0e53..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_assign_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_assign_op import TestAssignOp
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py
deleted file mode 100644
index 34fb73f3cf7e8b3d906ed4e04d151923aa219ab1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpTraining, TestBatchNormOpInference
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_cast_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_cast_ngraph_op.py
deleted file mode 100644
index 7732637d2299b0e9ea2092e4d244e09cc8a21c0e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_cast_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_cast_op import TestCastOp1
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_compare_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_compare_ngraph_op.py
deleted file mode 100644
index 2f731a7693bbcee9540a40b3996e9aaef59d13ea..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_compare_ngraph_op.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import sys
-sys.path.append("../")
-import test_compare_op
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py
deleted file mode 100644
index 8517f7cc87ba5d72ccdadf6e1b201a1d7d7989b3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3, TestConcatOp4, TestConcatOp5
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
deleted file mode 100644
index 764d136ec8d1e4f6772d4d1cdd03b6494aa735d1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1, TestDepthwiseConv, TestDepthwiseConv2, TestDepthwiseConv3, TestDepthwiseConvWithDilation, TestDepthwiseConvWithDilation2
-
-
-class TestNGRAPHDepthwiseConv(TestDepthwiseConv):
-    def init_test_case(self):
-        super(TestNGRAPHDepthwiseConv, self).init_test_case()
-        self.use_cuda = False
-
-
-class TestNGRAPHDepthwiseConv2(TestDepthwiseConv2):
-    def init_test_case(self):
-        super(TestNGRAPHDepthwiseConv2, self).init_test_case()
-        self.use_cuda = False
-
-
-class TestNGRAPHDepthwiseConv3(TestDepthwiseConv3):
-    def init_test_case(self):
-        super(TestNGRAPHDepthwiseConv3, self).init_test_case()
-        self.use_cuda = False
-
-
-class TestNGRAPHDepthwiseConvWithDilation(TestDepthwiseConvWithDilation):
-    def init_test_case(self):
-        super(TestNGRAPHDepthwiseConvWithDilation, self).init_test_case()
-        self.use_cuda = False
-
-
-class TestNGRAPHDepthwiseConvWithDilation2(TestDepthwiseConvWithDilation2):
-    def init_test_case(self):
-        super(TestNGRAPHDepthwiseConvWithDilation2, self).init_test_case()
-        self.use_cuda = False
-
-
-del TestDepthwiseConv, TestDepthwiseConv2, TestDepthwiseConv3, TestDepthwiseConvWithDilation, TestDepthwiseConvWithDilation2
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py
deleted file mode 100644
index 3057218a1d80deffe7eb3164c2350143fc38007d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_cross_entropy_op import TestCrossEntropyOp, TestCrossEntropyOp2, TestCrossEntropyOp3, TestCrossEntropyOp4, TestCrossEntropyOp5, TestCrossEntropyOp6, TestCrossEntropyOp7
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_dropout_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_dropout_ngraph_op.py
deleted file mode 100644
index 0448bed10204fb8ddba8546608750313191c4cc9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_dropout_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_dropout_op import TestDropoutOp, TestDropoutOp2, TestDropoutOp3, TestDropoutOp4, TestDropoutOp5, TestDropoutOp6, TestDropoutOp7, TestDropoutOp8, TestDropoutOp9
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
deleted file mode 100644
index 3fb9af3a542d5e6b0de7d8d839408759abdaedcb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#	Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp, TestElementwiseAddOp_broadcast_0
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_div_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_div_ngraph_op.py
deleted file mode 100644
index 55a2a05e23f56b6bc33c979ab46027212e506882..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_div_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_elementwise_div_op import ElementwiseDivOp, TestElementwiseDivOp_scalar, TestElementwiseDivOp_Vector, TestElementwiseDivOp_broadcast_0, TestElementwiseDivOp_broadcast_1, TestElementwiseDivOp_broadcast_2, TestElementwiseDivOp_broadcast_3
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_max_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_max_ngraph_op.py
deleted file mode 100644
index c6802417205d634f3fdfc4bdd4d9d2d4aa676d33..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_max_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#	Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_elementwise_max_op import TestElementwiseMaxOp_scalar, TestElementwiseMaxOp_Vector, TestElementwiseMaxOp_broadcast_0
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_min_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_min_ngraph_op.py
deleted file mode 100644
index 443445288ad2cdfc2872b7f41ca002fc06a54961..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_min_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#	Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_elementwise_min_op import TestElementwiseMinOp_scalar, TestElementwiseMinOp_Vector, TestElementwiseMinOp_broadcast_0
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_mul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_mul_ngraph_op.py
deleted file mode 100644
index 4f3d913d386fb1b196d708092b5f860ba562d6c6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_mul_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#	Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_elementwise_mul_op import TestElementwiseMulOp_scalar, TestElementwiseMulOp_Vector, TestElementwiseMulOp_broadcast_0, TestElementwiseMulOp_broadcast_1, TestElementwiseMulOp_broadcast_2, TestElementwiseMulOp_broadcast_3
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_pow_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_pow_ngraph_op.py
deleted file mode 100644
index 1601de23136e1ea9c8f5abd149df51140b2134d3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_pow_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#	Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_elementwise_pow_op import TestElementwisePowOp_scalar, TestElementwisePowOp_tensor, TestElementwisePowOp_broadcast_0
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_sub_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_sub_ngraph_op.py
deleted file mode 100644
index fe29008f5eae71ce9fcf69285ec311a814ad5fa8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_sub_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#	Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_elementwise_sub_op import TestElementwiseSubOp_scalar, TestElementwiseSubOp_Vector, TestElementwiseSubOp_broadcast_0
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
deleted file mode 100644
index 2b10b8f7a3ac0f978c13bd86824b939e69c5336a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.test_fill_constant_op import TestFillConstantOp1, TestFillConstantOp2, TestFillConstantOpWithSelectedRows
-
-
-class TestNGRAPHFillConstantFP64(TestFillConstantOp1):
-    def setUp(self):
-        super(TestNGRAPHFillConstantFP64, self).setUp()
-
-        self.attrs = {'shape': [123, 92], 'value': 3.8, 'dtype': 6}
-        self.outputs = {'Out': np.full((123, 92), 3.8)}
-
-
-class TestNGRAPHFillConstantINT32(TestFillConstantOp2):
-    def setUp(self):
-        super(TestNGRAPHFillConstantINT32, self).setUp()
-
-        self.attrs = {'shape': [123, 92], 'dtype': 2}
-        self.outputs = {'Out': np.full((123, 92), 0)}
-
-
-class TestNGRAPHFillConstantINT64(TestFillConstantOp2):
-    def setUp(self):
-        super(TestNGRAPHFillConstantINT64, self).setUp()
-
-        self.attrs = {'shape': [123, 92], 'dtype': 3}
-        self.outputs = {'Out': np.full((123, 92), 0)}
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_fill_zeros_like_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_fill_zeros_like_ngraph_op.py
deleted file mode 100644
index 45e6af7f961cbebb3ee04c3181068c807830dedc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_fill_zeros_like_ngraph_op.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_fill_zeros_like_op import TestFillZerosLikeOp
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_gather_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_gather_ngraph_op.py
deleted file mode 100644
index 403145dd7347453b5dac4d6175cc626de7d5a3f0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_gather_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_gather_op import TestGatherOp, TestCase1
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_increment_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_increment_ngraph_op.py
deleted file mode 100644
index 557cdc935600bcac47e811adba6ff7f7eba71382..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_increment_ngraph_op.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-
-class TestNGRAPHIncrementOp(OpTest):
-    def setUp(self):
-        self.op_type = "increment"
-        self.dtype = np.float32
-        self.init_dtype_type()
-        self.inputs = {'X': np.random.random(1).astype(self.dtype)}
-        self.attrs = {'step': 2.0}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.dtype(self.attrs['step'])
-        }
-        self._cpu_only = True
-
-    def init_dtype_type(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_layer_norm_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_layer_norm_ngraph_op.py
deleted file mode 100644
index ffdc64a23018521086219e3690a81e9b77aca3a7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_layer_norm_ngraph_op.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_layer_norm_op import TestLayerNormOp
-
-
-class TestLayerNormNGRAPHOp(TestLayerNormOp):
-    def setUp(self):
-        super(TestLayerNormNGRAPHOp, self).setUp()
-        self.use_cudnn = False
-
-
-del TestLayerNormOp
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_logical_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_logical_ngraph_op.py
deleted file mode 100644
index 2f227ce87ca483ccd9af78ce02262d8f9effd39c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_logical_ngraph_op.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-import numpy as np
-
-from test_logical_op import create_test_class
-
-create_test_class('logical_and', lambda _a, _b: np.logical_and(_a, _b))
-create_test_class('logical_or', lambda _a, _b: np.logical_or(_a, _b))
-create_test_class('logical_not', lambda _a: np.logical_not(_a), False)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_lookup_table_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_lookup_table_ngraph_op.py
deleted file mode 100644
index d6ec4b22324bd3201f0c77ff8159bbc4183bf58b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_lookup_table_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest, sys
-sys.path.append("../")
-from test_lookup_table_op import TestLookupTableOp, TestLookupTableOpWithTensorIds, TestLookupTableOpWithPadding, TestLookupTableOpWithTensorIdsAndPadding, TestLookupTableWIsSelectedRows, TestLookupTableWithTensorIdsWIsSelectedRows
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_lrn_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_lrn_ngraph_op.py
deleted file mode 100644
index 4c998c6ca2ec3bba8d7c3257ed3fe4fddc70a46a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_lrn_ngraph_op.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_lrn_op import TestLRNOp
-
-
-class TestLRNNGRAPHOp(TestLRNOp):
-    def test_check_output(self):
-        self.check_output(atol=0.002)
-
-
-del TestLRNOp
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_matmul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_matmul_ngraph_op.py
deleted file mode 100644
index 4bb28529d8772a36f5146a91c84a1219843d7792..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_matmul_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#	Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_matmul_op import *
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
deleted file mode 100644
index b4894734cbcc11cf5eec7401297dc35545aa7268..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py
deleted file mode 100644
index 2c3549d907f5f67abc0cbd448a492d95b8ae6c32..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_momentum_op import TestMomentumOp1, TestMomentumOp2, TestLarsMomentumOp, TestSparseMomentumOp, TestSparseMomentumOp2
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
deleted file mode 100644
index 549d03f6e92dc7e88ec8618e5f97287bb68ed0d9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_parallel_executor_ngraph.py b/python/paddle/fluid/tests/unittests/ngraph/test_parallel_executor_ngraph.py
deleted file mode 100644
index 3afe2b99083df320834c39f2cfef58b63bc22c9e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_parallel_executor_ngraph.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from paddle.fluid.tests.unittests.simple_nets import simple_fc_net
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid import compiler
-import numpy as np
-import unittest
-import os
-import sys
-import math
-
-
-class TestPallelExecutorNgraph(unittest.TestCase):
-    def check_network_convergence(self, build_strategy=None):
-        os.environ['CPU_NUM'] = str(2)
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net()
-            test_program = main.clone(for_test=True)
-
-            opt = fluid.optimizer.Adam(learning_rate=0.001)
-            opt.minimize(loss)
-
-            batch_size = 32
-            image = np.random.normal(size=(batch_size, 784)).astype('float32')
-            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup)
-            feed_dict = {'image': image, 'label': label}
-
-            train_cp = compiler.CompiledProgram(main).with_data_parallel(
-                loss_name=loss.name, build_strategy=build_strategy)
-            test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
-                loss_name=loss.name,
-                build_strategy=build_strategy,
-                share_vars_from=train_cp)
-
-            for i in range(5):
-                _ = exe.run(train_cp, fetch_list=[loss.name], feed=feed_dict)
-                test_loss, = exe.run(test_cp,
-                                     fetch_list=[loss.name],
-                                     feed=feed_dict)
-                train_loss = exe.run(train_cp,
-                                     fetch_list=[loss.name],
-                                     feed=feed_dict)
-
-                avg_test_loss_val = np.array(test_loss).mean()
-                if math.isnan(float(avg_test_loss_val)):
-                    sys.exit("got NaN loss, testing failed.")
-
-                avg_train_loss_val = np.array(train_loss).mean()
-                if math.isnan(float(avg_train_loss_val)):
-                    sys.exit("got NaN loss, training failed.")
-
-                self.assertTrue(
-                    np.allclose(
-                        train_loss, test_loss, atol=1e-8),
-                    "Train loss: " + str(train_loss) + "\n Test loss:" +
-                    str(test_loss))
-
-    def test_parallel_testing(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.enable_inplace = False
-        build_strategy.memory_optimize = False
-        self.check_network_convergence(build_strategy=build_strategy)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
deleted file mode 100644
index ff82e9fa1d3d343aa7faf56a0bd27d2c9edc1ea4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
-
-
-class TestNGRAPHCeilMode(TestCase1):
-    def setUp(self):
-        super(TestNGRAPHCeilMode, self).setUp()
-
-    def init_ceil_mode(self):
-        self.ceil_mode = True
-
-
-class TestNGRAPHAdaptive(TestCase1):
-    def setUp(self):
-        super(TestNGRAPHAdaptive, self).setUp()
-
-    def init_adaptive(self):
-        self.adaptive = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_reduce_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_reduce_ngraph_op.py
deleted file mode 100644
index 458f65338da923cdfff2e553b63b53f7bf4abf94..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_reduce_ngraph_op.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-import numpy as np
-from test_reduce_op import TestSumOp, Test1DReduce, \
-     Test2DReduce0, Test2DReduce1, Test3DReduce0, Test3DReduce1, Test3DReduce2, \
-     Test3DReduce3, TestKeepDimReduce, TestKeepDimReduceSumMultiAxises, \
-     TestReduceSumWithDimOne, TestReduceSumWithNumelOne
-
-
-class Test3DReduce21(Test1DReduce):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.attrs = {'dim': [1, 2]}
-        self.inputs = {'X': np.random.random((20, 1, 5)).astype("float64")}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
-        }
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_reshape_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_reshape_ngraph_op.py
deleted file mode 100644
index 928e1cb4de993f33e394e495f0eab12b952a49ea..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_reshape_ngraph_op.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-
-from test_reshape_op import TestReshapeOp, TestReshapeOpDimInfer1, TestReshapeOpDimInfer2
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
deleted file mode 100644
index 8beb44f55e487eef5f1957e9284d4a711c9770aa..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import unittest
-from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_slice_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_slice_ngraph_op.py
deleted file mode 100644
index b6f1f4e0dc80742c0b8ca34c4a398ab989dff62d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_slice_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_slice_op import TestSliceOp, TestSliceOp_decs_dim, TestSliceOp_decs_dim_2, TestSliceOp_decs_dim_3, TestSliceOp_decs_dim_5, TestSliceOp_decs_dim_6, TestCase1, TestCase2
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py
deleted file mode 100644
index 09c52e2b1084fc5f716a6d1abfb4968d2c5460da..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from paddle.fluid.tests.unittests.test_softmax_op import TestSoftmaxOp, TestSoftmaxOp2, TestSoftmaxOp3, TestSoftmaxOp4, TestSoftmaxOp5
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py
deleted file mode 100644
index 86961b8c366c69a210e47ab5d1ece6ba85d1d262..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from paddle.fluid.tests.unittests.test_softmax_with_cross_entropy_op import TestSoftmaxWithCrossEntropyOp, TestSoftmaxWithCrossEntropyOp2, TestSoftmaxWithCrossEntropyOp3
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_stack_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_stack_ngraph_op.py
deleted file mode 100644
index 23ef261331ede0b1324da116ade9d7ba0f5a5832..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_stack_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_stack_op import TestStackOpBase, TestStackOp1, TestStackOp2, TestStackOp3, TestStackOp4, TestStackOp5, TestStackOp6
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py
deleted file mode 100644
index ed9fb618024301818a12fd0d02b09c6f3a5f2c53..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import unittest
-from paddle.fluid.tests.unittests.test_sum_op import TestSumOp, TestSelectedRowsSumOp, TestLoDTensorAndSelectedRowsOp
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
deleted file mode 100644
index d2319c4d921fccb950b1a3059fdecd3b3b044182..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d, TestTopkOp2, TestTopkOp3, TestTopkOp4
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_transpose_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_transpose_ngraph_op.py
deleted file mode 100644
index 27bf82fc5983b25569962ae2347391a3cf0e3b7b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ngraph/test_transpose_ngraph_op.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest, sys
-sys.path.append("../")
-from test_transpose_op import TestTransposeOp, TestCase0, TestCase1, TestCase2, TestCase3, TestCase4
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
deleted file mode 100644
index aed0008350be7ce4e93e75ee1a5aeb5f75e71175..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ /dev/null
@@ -1,1043 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import unittest
-import warnings
-import numpy as np
-import random
-import six
-import time
-import itertools
-import collections
-from collections import defaultdict
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.backward import append_backward
-from paddle.fluid.op import Operator
-from paddle.fluid.executor import Executor
-from paddle.fluid.framework import Program, OpProtoHolder, Variable
-from testsuite import create_op, set_input, append_input_output, append_loss_ops
-
-
-def randomize_probability(batch_size, class_num, dtype='float32'):
-    prob = np.random.uniform(
-        0.1, 1.0, size=(batch_size, class_num)).astype(dtype)
-    prob_sum = prob.sum(axis=1)
-    for i in six.moves.xrange(len(prob)):
-        prob[i] /= prob_sum[i]
-    return prob
-
-
-def get_numeric_gradient(place,
-                         scope,
-                         op,
-                         inputs,
-                         input_to_check,
-                         output_names,
-                         delta=0.005,
-                         in_place=False):
-    # FIXME: change this method by compile time concepts
-    set_input(scope, op, inputs, place)
-
-    def product(dim):
-        return six.moves.reduce(lambda a, b: a * b, dim, 1)
-
-    tensor_to_check = scope.find_var(input_to_check).get_tensor()
-    tensor_size = product(tensor_to_check.shape())
-    tensor_to_check_dtype = tensor_to_check._dtype()
-    if tensor_to_check_dtype == core.VarDesc.VarType.FP32:
-        tensor_to_check_dtype = np.float32
-    elif tensor_to_check_dtype == core.VarDesc.VarType.FP64:
-        tensor_to_check_dtype = np.float64
-    elif tensor_to_check_dtype == core.VarDesc.VarType.FP16:
-        tensor_to_check_dtype = np.float16
-        # set delta as np.float16, will automatic convert to float32, float64
-        delta = np.array(delta).astype(np.float16)
-    else:
-        raise ValueError("Not supported data type " + str(
-            tensor_to_check_dtype))
-
-    def get_output():
-        sum = []
-        op.run(scope, place)
-        for output_name in output_names:
-            sum.append(
-                np.array(scope.find_var(output_name).get_tensor()).astype(
-                    tensor_to_check_dtype).mean())
-        return tensor_to_check_dtype(np.array(sum).sum() / len(output_names))
-
-    gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
-
-    def __get_elem__(tensor, i):
-        if tensor_to_check_dtype == np.float16:
-            numpy_tensor = np.array(tensor).astype(np.float16)
-            numpy_tensor = numpy_tensor.flatten()
-            return numpy_tensor[i]
-        elif tensor_to_check_dtype == np.float32:
-            return tensor._get_float_element(i)
-        else:
-            return tensor._get_double_element(i)
-
-    def __set_elem__(tensor, i, e):
-        if tensor_to_check_dtype == np.float16:
-            numpy_tensor = np.array(tensor).astype(np.float16)
-            shape = numpy_tensor.shape
-            numpy_tensor = numpy_tensor.flatten()
-            numpy_tensor[i] = e
-            numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16)
-            tensor.set(numpy_tensor, place)
-        elif tensor_to_check_dtype == np.float32:
-            tensor._set_float_element(i, e)
-        else:
-            tensor._set_double_element(i, e)
-
-    # we only compute gradient of one element each time.
-    # we use a for loop to compute the gradient of every element.
-    for i in six.moves.xrange(tensor_size):
-        if in_place:
-            set_input(scope, op, inputs, place)
-
-        # get one input element throw it's index i.
-        origin = __get_elem__(tensor_to_check, i)
-        # add delta to it, run op and then get the sum of the result tensor.
-        x_pos = origin + delta
-        __set_elem__(tensor_to_check, i, x_pos)
-        y_pos = get_output()
-
-        if in_place:
-            set_input(scope, op, inputs, place)
-
-        x_neg = origin - delta
-        __set_elem__(tensor_to_check, i, x_neg)
-        y_neg = get_output()
-
-        __set_elem__(tensor_to_check, i, origin)
-        gradient_flat[i] = (y_pos - y_neg) / delta / 2
-
-    return gradient_flat.reshape(tensor_to_check.shape())
-
-
-class OpTest(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        '''Fix random seeds to remove randomness from tests'''
-        cls._np_rand_state = np.random.get_state()
-        cls._py_rand_state = random.getstate()
-        cls.call_once = False
-        cls.dtype = "float32"
-        cls.outputs = {}
-
-        np.random.seed(123)
-        random.seed(124)
-
-    @classmethod
-    def tearDownClass(cls):
-        """Restore random seeds"""
-        np.random.set_state(cls._np_rand_state)
-        random.setstate(cls._py_rand_state)
-
-    def try_call_once(self, data_type):
-        if not self.call_once:
-            self.call_once = True
-            self.dtype = data_type
-            # See the comment of np_dtype_to_fluid_dtype
-            # If the input type is uint16, we assume use float16
-            # for lodtensor dtype.
-            if self.dtype == np.uint16:
-                self.dtype == np.float16
-
-    def infer_dtype_from_inputs_outputs(self, inputs, outputs):
-        def infer_dtype(numpy_dict):
-            assert isinstance(
-                numpy_dict,
-                dict), "self.inputs, self.outputs must be numpy_dict"
-            for var_name, var_value in six.iteritems(numpy_dict):
-                if isinstance(var_value, (np.ndarray, np.generic)):
-                    self.try_call_once(var_value.dtype)
-                elif isinstance(var_value, (list, tuple)):
-                    # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
-                    if len(var_value) > 1 and isinstance(var_value[1], (
-                            np.ndarray, np.generic)):
-                        instance = var_value[1]
-                        self.try_call_once(instance[1].dtype)
-                else:
-                    self.try_call_once("float32")
-
-        infer_dtype(inputs)
-        infer_dtype(outputs)
-
-    def feed_var(self, input_vars, place):
-        feed_map = {}
-        for var_name in input_vars:
-            if isinstance(input_vars[var_name], list):
-                for name, np_value in self.inputs[var_name]:
-                    tensor = core.LoDTensor()
-                    if isinstance(np_value, tuple):
-                        tensor.set(
-                            OpTest.np_value_to_fluid_value(np_value[0]), place)
-                        tensor.set_recursive_sequence_lengths(np_value[1])
-                    else:
-                        tensor.set(
-                            OpTest.np_value_to_fluid_value(np_value), place)
-                    feed_map[name] = tensor
-            else:
-                tensor = core.LoDTensor()
-                if isinstance(self.inputs[var_name], tuple):
-                    tensor.set(
-                        OpTest.np_value_to_fluid_value(self.inputs[var_name][
-                            0]), place)
-                    tensor.set_recursive_sequence_lengths(self.inputs[var_name][
-                        1])
-                else:
-                    tensor.set(
-                        OpTest.np_value_to_fluid_value(self.inputs[var_name]),
-                        place)
-                feed_map[var_name] = tensor
-
-        return feed_map
-
-    def _append_ops(self, block):
-        op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
-        "infer datatype from inputs and outputs for this test case"
-        self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
-        inputs = append_input_output(block, op_proto, self.inputs, True,
-                                     self.dtype)
-        outputs = append_input_output(block, op_proto, self.outputs, False,
-                                      self.dtype)
-
-        if hasattr(self, "cache_name_list"):
-            for name in self.cache_name_list:
-                inputs[name] = block.create_var(
-                    name=name,
-                    persistable=True,
-                    type=core.VarDesc.VarType.RAW,
-                    stop_gradient=True)
-
-        op = block.append_op(
-            type=self.op_type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=self.attrs if hasattr(self, "attrs") else dict())
-        # infer variable type and infer shape in compile-time 
-        op.desc.infer_var_type(block.desc)
-        op.desc.infer_shape(block.desc)
-
-        return op
-
-    def _get_io_vars(self, block, numpy_inputs):
-        inputs = {}
-        for name, value in six.iteritems(numpy_inputs):
-            if isinstance(value, list):
-                var_list = [
-                    block.var(sub_name) for sub_name, sub_value in value
-                ]
-                inputs[name] = var_list
-            else:
-                inputs[name] = block.var(name)
-        return inputs
-
-    def _get_inputs(self, block):
-        return self._get_io_vars(block, self.inputs)
-
-    def _get_outputs(self, block):
-        return self._get_io_vars(block, self.outputs)
-
-    def calc_output(self, place):
-        outs, _ = self._calc_output(place)
-        return outs
-
-    def _create_var_from_numpy(self, value):
-        if isinstance(value, tuple):
-            data = value[0]
-            lod = value[1]
-            v = fluid.dygraph.base.to_variable(value=data)
-            v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod)
-            return v
-        else:
-            return fluid.dygraph.base.to_variable(value)
-
-    def _calc_dygraph_output(self, place, parallel=False, no_check_set=None):
-        with fluid.dygraph.base.guard(place=place):
-            block = fluid.default_main_program().global_block()
-
-            # prepare input variable
-            inputs = defaultdict(list)
-            for name, np_value in six.iteritems(self.inputs):
-                if not isinstance(np_value, list):
-                    np_value = [np_value]
-
-                for i in range(len(np_value)):
-                    inputs[name].append(
-                        self._create_var_from_numpy(np_value[i]))
-
-            # prepare output variable
-            outputs = defaultdict(list)
-            for name, np_value in six.iteritems(self.outputs):
-                if not isinstance(np_value, list):
-                    np_value = [np_value]
-
-                for i in range(len(np_value)):
-                    value = np_value[i]
-                    if isinstance(value, tuple):
-                        v = block.create_var(
-                            name="%s_out%d" % (name, i),
-                            dtype=value[0].dtype,
-                            type=core.VarDesc.VarType.LOD_TENSOR,
-                            persistable=False,
-                            stop_gradient=False)
-                        v._ivar.value().get_tensor(
-                        ).set_recursive_sequence_lengths(value[1])
-                    else:
-                        v = block.create_var(
-                            name="%s_out%d" % (name, i),
-                            dtype=value.dtype,
-                            type=core.VarDesc.VarType.LOD_TENSOR,
-                            persistable=False,
-                            stop_gradient=False)
-                    outputs[name].append(v)
-
-            block.append_op(
-                type=self.op_type,
-                inputs=inputs,
-                outputs=outputs,
-                attrs=self.attrs)
-            return outputs
-
-    def _calc_output(self,
-                     place,
-                     parallel=False,
-                     no_check_set=None,
-                     loss=None,
-                     enable_inplace=None,
-                     for_inplace_test=None):
-        program = Program()
-        block = program.global_block()
-        op = self._append_ops(block)
-
-        inputs = self._get_inputs(block)
-        outputs = self._get_outputs(block)
-        feed_map = self.feed_var(inputs, place)
-
-        if for_inplace_test:
-            # Some variables' tensors hold no buffer (tensor's _holder is NULL), like XShape in reshape2 op, 
-            # and the shapes of those variables contain 0 (eg. Xshape.shape = [0, 2, 5]). 
-            # Set persistable for those variables in order to get them from global_scope for inplace grad test directly other than feed them,
-            # since feed op calls check_memory_size() which fails when tensor's holder_ is NULL.
-            for out_name in op.output_arg_names:
-                var = block.var(out_name)
-                if 0 in var.shape:
-                    var.persistable = True
-        original_program = program
-        if parallel:
-            use_cuda = False
-            if isinstance(place, fluid.CUDAPlace):
-                use_cuda = True
-            compiled_prog = fluid.CompiledProgram(program).with_data_parallel(
-                loss_name=loss.name if loss else None, places=place)
-            program = compiled_prog
-        fetch_list = getattr(self, "fetch_list", [])
-        # if the fetch_list is customized by user, we use it directly.
-        # if not, fill the fetch_list by the user configured outputs in test.
-        if len(fetch_list) == 0:
-            for var_name, var in six.iteritems(outputs):
-                if no_check_set is not None and var_name in no_check_set:
-                    continue
-                if isinstance(var, list):
-                    for v in var:
-                        fetch_list.append(v.name)
-                else:
-                    fetch_list.append(var.name)
-        # if the fetch_list still empty, fill the fetch_list by the operator output.
-        if len(fetch_list) == 0:
-            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
-                fetch_list.append(str(out_name))
-
-        if enable_inplace is not None:
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.enable_inplace = enable_inplace
-
-            compiled_prog = fluid.CompiledProgram(program).with_data_parallel(
-                build_strategy=build_strategy, places=place)
-            program = compiled_prog
-
-        executor = Executor(place)
-        outs = executor.run(program,
-                            feed=feed_map,
-                            fetch_list=fetch_list,
-                            return_numpy=False)
-        if for_inplace_test:
-            return outs, fetch_list, feed_map, original_program, op.desc
-        else:
-            return outs, fetch_list
-
-    def _compare_expect_and_actual_outputs(self,
-                                           place,
-                                           fetch_list,
-                                           expect_outs,
-                                           actual_outs,
-                                           inplace_atol=None):
-        """Compare expect outs and actual outs of an tested op.
-
-        Args:
-            place (CPUPlace | CUDAPlace): The place where the op runs. 
-            fetch_list (list): The outputs of tested op.
-            expect_outs (list): The expect outs of tested op.
-            actual_outs (list): The actual outs of tested op.
-            inplace_atol (float): The tolerable error, only set when tested op doesn't ensure computational consistency, like group_norm op.
-
-        Returns:
-            None.
-        """
-        # compare expect_outs and actual_outs
-        for i, name in enumerate(fetch_list):
-            if inplace_atol is not None:
-                self.assertTrue(
-                    np.allclose(
-                        np.array(expect_outs[i]),
-                        np.array(actual_outs[i]),
-                        atol=inplace_atol),
-                    "Output (" + name + ") has diff at " + str(place) +
-                    " when using and not using inplace" + "\nExpect " +
-                    str(expect_outs[i]) + "\n" + "But Got" + str(actual_outs[i])
-                    + " in class " + self.__class__.__name__)
-            else:
-                self.assertTrue(
-                    np.array_equal(
-                        np.array(expect_outs[i]), np.array(actual_outs[i])),
-                    "Output (" + name + ") has diff at " + str(place) +
-                    " when using and not using inplace" + "\nExpect " +
-                    str(expect_outs[i]) + "\n" + "But Got" + str(actual_outs[i])
-                    + " in class " + self.__class__.__name__ + '\n')
-
-    def _construct_grad_program_from_forward(self, fwd_program, grad_op_desc,
-                                             op_grad_to_var):
-        """Generate grad_program which contains the grad_op.
-
-        Args:
-            fwd_program (tuple): The program that contains grad_op_desc's corresponding forward op.
-            grad_op_desc (OpDesc): The OpDesc of grad op.
-            op_grad_to_var (dict): The relation of variables in grad op and its forward op. 
-
-        Returns:
-            grad_program (program): The program which contains the grad_op.
-        """
-        grad_program = Program()
-        grad_block = grad_program.global_block()
-        new_op_desc = grad_block.desc.append_op()
-        new_op_desc.copy_from(grad_op_desc)
-        grad_program._sync_with_cpp()
-
-        # Create grad vars based on fwd vars (shape and dtype)
-        for arg in grad_op_desc.input_arg_names(
-        ) + grad_op_desc.output_arg_names():
-            fwd_var_name = op_grad_to_var.get(arg, None)
-            if fwd_var_name is None:
-                fwd_var_name = arg
-            fwd_var = fwd_program.global_block().vars.get(fwd_var_name)
-            assert fwd_var is not None, "{} cannot be found".format(
-                fwd_var_name)
-            grad_var = grad_block.create_var(
-                name=arg,
-                dtype=fwd_var.dtype,
-                shape=fwd_var.shape,
-                type=fwd_var.type,
-                persistable=False)
-
-            # Some variables' tensors hold no buffer (tensor's _holder is NULL), like XShape in reshape2 op, 
-            # and the shapes of those variables contain 0 (eg. Xshape.shape = [0, 2, 5]). 
-            # Set persistable for those variables in order to get them from global_scope for inplace grad test directly other than feed them,
-            # since feed op calls check_memory_size() which fails when tensor's holder_ is NULL.
-            if 0 in grad_var.shape:
-                grad_var.persistable = True
-        grad_program._sync_with_cpp()
-        return grad_program
-
-    def _construct_grad_feed_map_from_forward(self, place, fwd_res,
-                                              grad_op_desc, op_grad_to_var):
-        """Generate grad_feed_map for grad_program.
-
-        since we don`t really check gradient accuracy, but check the consistency when using and not using inplace,
-        we use fwd outs (also inputs sometimes) to construct grad inputs.
-
-        Args:
-            place (CPUPlace | CUDAPlace): The place where the op runs. 
-            fwd_res (tuple): The outputs of its forward op, in the same form as returns of _calc_outputs() when for_inplace_test is True.
-                i.e., tuple(fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc)
-            grad_op_desc (OpDesc): The OpDesc of grad op.
-            op_grad_to_var (dict): The relation of variables in grad op and its fwd_op. 
-
-        Returns:
-            grad_feed_map (dict): The feed_map of grad_op.
-        """
-        fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc = fwd_res
-        p = core.Place()
-        p.set_place(place)
-        grad_feed_map = {}
-        for arg in grad_op_desc.input_arg_names():
-            if arg in fwd_feed_map.keys():
-                grad_feed_map[arg] = fwd_feed_map[arg]._copy(p)
-            else:
-                fwd_var_name = op_grad_to_var.get(arg, None)
-                if fwd_var_name is None:
-                    fwd_var_name = arg
-
-                for i, out_name in enumerate(fwd_fetch_list):
-                    if out_name == fwd_var_name:
-                        # don't feed variables whose tensors hold no buffer (shape contains 0 like shape = [0,2,5] and holder_ is NULL), like XShape in reshape2 op.
-                        # get them from global_scope directly since we have set them persistable in fwd execution
-                        if 0 in fwd_program.global_block().var(out_name).shape:
-                            continue
-                        else:
-                            grad_feed_map[arg] = fwd_outs[i]._copy(p)
-        return grad_feed_map
-
-    def _get_need_run_ops(self, op_desc, fwd_op_desc=None):
-        """Postorder traversal of the 'grad' tree to get all ops that need to run during inplace test.
-        An op needs to run druing inplace check if,
-        (1) it has infer_inplace,
-        (2) it has infer_inplace in its grad descendants. (since we need its outputs as to construct its grad's inputs)
-        
-        Args:
-            op_desc (OpDesc): The op_desc of current op. 
-            fwd_op_desc (OpDesc): The op_desc of current op's forward op, None if current op has no forward op. 
-                Eg. relu's fwd_op is None, relu_grad's fwd_op is relu, relu_grad_grad's fwd_op is relu_grad, etc.
-            
-        Returns:
-            need_run_ops (list[(op_desc, fwd_op_desc)]): The ops that need to run during inplace test.
-        """
-        need_run_ops = []
-        visited_ops = []
-
-        def _dfs_grad_op(op_desc, fwd_op_desc=None):
-            visited_ops.append(op_desc.type())
-            has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
-            has_grad_op_maker = fluid.core.has_grad_op_maker(op_desc.type())
-            has_infer_inplace_in_grad_descendants = False
-            if not has_grad_op_maker:
-                has_infer_inplace_in_descendants = False
-            else:
-                # get grad_op_desc 
-                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    op_desc, set(), [])
-                if not grad_op_desc_list:
-                    has_infer_inplace_in_grad_descendants = False
-                else:
-                    for i, grad_op_desc in enumerate(grad_op_desc_list):
-                        if grad_op_desc.type(
-                        ) not in visited_ops and _dfs_grad_op(
-                                grad_op_desc, fwd_op_desc=op_desc):
-                            has_infer_inplace_in_grad_descendants = True
-            if has_infer_inplace or has_infer_inplace_in_grad_descendants:
-                need_run_ops.append((op_desc, fwd_op_desc))
-                return True
-            else:
-                return False
-
-        _dfs_grad_op(op_desc, fwd_op_desc=fwd_op_desc)
-        return need_run_ops
-
-    def _check_forward_inplace(self,
-                               place,
-                               no_check_set=None,
-                               inplace_atol=None):
-        """Chech the inplace correctness of given op (self.op_type).
-        Run the op twice with same inputs, one enable inplace and another disable, compare their outputs.
-        
-        Args:
-            place (CPUPlace | CUDAPlace): The place where the op runs. 
-            no_check_set (list): The names of outputs that needn't check, like XShape of reshape op.
-            inplace_atol (float): The tolerable error, only set when op doesn't ensure computational consistency, like group_norm op.
-
-        Returns:
-            expect_res (tuple(outs, fetch_list, feed_map, program, op_desc)): The results of given op. 
-                We return this to construct grad_program and grad_feed_map for grad inplace check. 
-        """
-        # _calc_output() returns in the form tuple(outs, fetch_list, feed_map, program, op_desc) when for_inplace_test=True.
-        expect_res = self._calc_output(
-            place,
-            no_check_set=no_check_set,
-            enable_inplace=False,
-            for_inplace_test=True)
-        actual_res = self._calc_output(
-            place,
-            no_check_set=no_check_set,
-            enable_inplace=True,
-            for_inplace_test=True)
-        # compare expect_outs and actual_outs
-        self._compare_expect_and_actual_outputs(
-            place,
-            expect_res[1],
-            expect_res[0],
-            actual_res[0],
-            inplace_atol=inplace_atol)
-        return expect_res
-
-    def _calc_grad_output(self,
-                          place,
-                          fwd_res,
-                          grad_op_desc,
-                          enable_inplace=None):
-        """Calculate grad_output for given grad_op_desc.
-
-        since we don`t really check gradient accuracy, but check the consistency when using and not using inplace,
-        we use fwd outs (also inputs sometimes) to construct grad inputs.
-
-        Args:
-            place (CPUPlace | CUDAPlace): The place where the op runs. 
-            fwd_res (tuple): The outputs of its forward op, in the same form as returns of _calc_outputs() when for_inplace_test is True.
-                i.e., tuple(fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc).
-            grad_op_desc (OpDesc): The OpDesc of grad op.
-            enable_inplace (bool): Enable inplace or not.
-
-        Returns:
-            res (tuple(outs, fetch_list, feed_map, program, op_desc)): The results of given grad_op_desc.
-        """
-        fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc = fwd_res
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(fwd_op_desc,
-                                                                  set(), [])
-        grad_program = self._construct_grad_program_from_forward(
-            fwd_program, grad_op_desc, op_grad_to_var)
-        grad_feed_map = self._construct_grad_feed_map_from_forward(
-            place, fwd_res, grad_op_desc, op_grad_to_var)
-        grad_fetch_list = grad_op_desc.output_arg_names()
-        exe = Executor(place)
-        program = grad_program
-        if enable_inplace is not None:
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.enable_inplace = enable_inplace
-            compiled_program = fluid.CompiledProgram(
-                grad_program).with_data_parallel(
-                    loss_name="", build_strategy=build_strategy, places=place)
-            program = compiled_program
-        outs = exe.run(program,
-                       feed=grad_feed_map,
-                       fetch_list=grad_fetch_list,
-                       return_numpy=False)
-        return outs, grad_fetch_list, grad_feed_map, grad_program, grad_op_desc
-
-    def _check_grad_inplace(self,
-                            place,
-                            fwd_res,
-                            grad_op_desc,
-                            inplace_atol=None):
-        """Chech the inplace correctness of given grad_op_desc.
-
-        Run the grad op twice with same inputs, one enable inplace and another disable, compare their outputs.
-        It works like _check_forward_inplace, but the way to construct program and feed_map differs.
-        So we define a new function for grad, grad_grad, etc.
-
-        Args:
-            place (CPUPlace | CUDAPlace): The place where the op runs. 
-            fwd_res (tuple): The outputs of its forward op, in the same form as returns of _calc_outputs() when for_inplace_test is True.
-                i.e., tuple(fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc).
-            grad_op_desc (OpDesc): The OpDesc of grad op.
-            inplace_atol (float): The tolerable error, only set when op doesn't ensure computational consistency, like group_norm op.
-
-        Returns:
-            expect_res (tuple(outs, fetch_list, feed_map, program, op_desc)): The results of given op. 
-                We return this to construct grad_program and grad_feed_map for grad inplace check. 
-        """
-        expect_res = self._calc_grad_output(
-            place, fwd_res, grad_op_desc, enable_inplace=False)
-        actual_res = self._calc_grad_output(
-            place, fwd_res, grad_op_desc, enable_inplace=True)
-        self._compare_expect_and_actual_outputs(
-            place,
-            expect_res[1],
-            expect_res[0],
-            actual_res[0],
-            inplace_atol=inplace_atol)
-        return expect_res
-
-    def check_inplace_output_with_place(self,
-                                        place,
-                                        no_check_set=None,
-                                        inplace_atol=None):
-        """Chech the inplace correctness of given op, its grad op, its grad_grad op, etc.
-
-        (1) Get all ops need to run. (see conditions in _get_need_run_ops())
-        (2) Run op in need_run_ops, and do inplace check if it has infer_inplace.
-
-        Args:
-            place (CPUPlace | CUDAPlace): The place where the op runs. 
-            no_check_set (list): The names of outputs that needn't check, like XShape of reshape op.
-            inplace_atol (float): The tolerable error, only set when op doesn't ensure computational consistency, like group_norm op.
-
-        Returns:
-            None
-        """
-        has_infer_inplace = fluid.core.has_infer_inplace(self.op_type)
-        has_grad_op_maker = fluid.core.has_grad_op_maker(self.op_type)
-
-        fwd_res = self._calc_output(
-            place, no_check_set=no_check_set, for_inplace_test=True)
-        op_desc = fwd_res[4]
-        need_run_ops = self._get_need_run_ops(op_desc)
-
-        res = {}
-        for op_desc, father_op_desc in reversed(need_run_ops):
-            # The first one is the forward op
-            has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
-            if op_desc.type() == self.op_type:
-                if has_infer_inplace:
-                    res[op_desc] = self._check_forward_inplace(
-                        place,
-                        no_check_set=no_check_set,
-                        inplace_atol=inplace_atol)
-                else:
-                    res[op_desc] = self._calc_output(
-                        place, no_check_set=no_check_set, for_inplace_test=True)
-            else:
-                # TODO(zhiqiu): enhance inplace_grad test for ops (sum and activation) using mkldnn/ngraph
-                # skip op that use_mkldnn and use_ngraph currently
-                flags_use_mkldnn = fluid.core.get_flags_use_mkldnn()
-                attrs_use_mkldnn = hasattr(
-                    self,
-                    'attrs') and bool(self.attrs.get('use_mkldnn', False))
-                if flags_use_mkldnn or attrs_use_mkldnn:
-                    warnings.warn(
-                        "check inplace_grad for ops using mkldnn is not supported"
-                    )
-                    continue
-                use_ngraph = fluid.core.is_compiled_with_ngraph(
-                ) and fluid.core.get_flags_use_ngraph()
-                if use_ngraph:
-                    warnings.warn(
-                        "check inplace_grad for ops using ngraph is not supported"
-                    )
-                    continue
-                if has_infer_inplace:
-                    fwd_res = res[father_op_desc]
-                    res[op_desc] = self._check_grad_inplace(
-                        place, fwd_res, op_desc, inplace_atol=inplace_atol)
-                else:
-                    res[op_desc] = self._calc_grad_output(place, fwd_res,
-                                                          op_desc)
-
-    def check_output_with_place(self,
-                                place,
-                                atol,
-                                no_check_set=None,
-                                equal_nan=False,
-                                check_dygraph=False,
-                                inplace_atol=None):
-        if check_dygraph:
-            dygraph_outs = self._calc_dygraph_output(
-                place, no_check_set=no_check_set)
-        outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
-        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
-            if out_name not in self.outputs:
-                continue
-            if no_check_set is not None and out_name in no_check_set:
-                continue
-
-            def find_actual(target_name, fetch_list):
-                found = [
-                    i for i, var_name in enumerate(fetch_list)
-                    if var_name == target_name
-                ]
-                self.assertTrue(
-                    len(found) == 1, "Found {} {}".format(
-                        len(found), target_name))
-                return found[0]
-
-            if out_dup:
-                sub_out = self.outputs[out_name]
-                if not isinstance(sub_out, list):
-                    raise AssertionError("sub_out type %s is not list",
-                                         type(sub_out))
-                for item in sub_out:
-                    sub_out_name, expect = item[0], item[1]
-                    if check_dygraph:
-                        imperative_actual = dygraph_outs[sub_out_name][0]
-                        imperative_actual_t = np.array(
-                            imperative_actual._ivar.value().get_tensor())
-                    idx = find_actual(sub_out_name, fetch_list)
-                    actual = outs[idx]
-                    actual_t = np.array(actual)
-                    expect_t = expect[0] \
-                        if isinstance(expect, tuple) else expect
-                    self.assertTrue(
-                        np.allclose(
-                            actual_t, expect_t, atol=atol, equal_nan=equal_nan),
-                        "Output (" + sub_out_name + ") has diff at " +
-                        str(place))
-                    if check_dygraph:
-                        self.assertTrue(
-                            np.allclose(
-                                imperative_actual_t,
-                                expect_t,
-                                atol=atol,
-                                equal_nan=equal_nan),
-                            "Output (" + sub_out_name + ") has diff at " +
-                            str(place) + " in dygraph mode")
-                    if isinstance(expect, tuple):
-                        self.assertListEqual(
-                            actual.recursive_sequence_lengths(), expect[1],
-                            "Output (" + sub_out_name +
-                            ") has different lod at " + str(place))
-                    if check_dygraph:
-                        self.assertListEqual(
-                            imperative_actual._ivar.value().get_tensor()
-                            .recursive_sequence_lengths(), expect[1],
-                            "Output (" + out_name + ") has different lod at " +
-                            str(place) + " in dygraph mode")
-            else:
-                if check_dygraph:
-                    imperative_actual = dygraph_outs[out_name][0]
-                    imperative_actual_t = np.array(
-                        imperative_actual._ivar.value().get_tensor())
-                idx = find_actual(out_name, fetch_list)
-                actual = outs[idx]
-                actual_t = np.array(actual)
-                expect = self.outputs[out_name]
-                expect_t = expect[0] if isinstance(expect, tuple) else expect
-                self.assertTrue(
-                    np.allclose(
-                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
-                    "Output (" + out_name + ") has diff at " + str(place) +
-                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
-                    str(actual_t) + " in class " + self.__class__.__name__)
-                if check_dygraph:
-                    self.assertTrue(
-                        np.allclose(
-                            imperative_actual_t,
-                            expect_t,
-                            atol=atol,
-                            equal_nan=equal_nan),
-                        "Output (" + out_name + ") has diff at " + str(place) +
-                        "\nExpect " + str(expect_t) + "\n" + "But Got" +
-                        str(imperative_actual_t) + " in class " +
-                        self.__class__.__name__)
-                if isinstance(expect, tuple):
-                    self.assertListEqual(actual.recursive_sequence_lengths(),
-                                         expect[1], "Output (" + out_name +
-                                         ") has different lod at " + str(place))
-                    if check_dygraph:
-                        self.assertListEqual(
-                            imperative_actual._ivar.value().get_tensor()
-                            .recursive_sequence_lengths(), expect[1],
-                            "Output (" + out_name + ") has different lod at " +
-                            str(place) + " in dygraph mode")
-
-        # inplace_atol only used when op doesn't ensure computational consistency
-        if inplace_atol is not None:
-            warnings.warn(
-                "By default, inplace_atol should not be set, please check it")
-        # Check inplace for given op, its grad op, its grad_grad op, etc.
-        # No effect on original OpTest 
-        self.check_inplace_output_with_place(
-            place, no_check_set=no_check_set, inplace_atol=inplace_atol)
-
-    def _get_places(self):
-        if self.dtype == np.float16:
-            if core.is_compiled_with_cuda() and core.op_support_gpu(
-                    self.op_type):
-                place = core.CUDAPlace(0)
-                if core.is_float16_supported(place):
-                    return [place]
-                else:
-                    return []
-            else:
-                return []
-        places = [fluid.CPUPlace()]
-        cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False
-        use_ngraph = fluid.core.is_compiled_with_ngraph(
-        ) and fluid.core.get_flags_use_ngraph()
-        if use_ngraph:
-            cpu_only = True
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\
-           and not cpu_only:
-            places.append(core.CUDAPlace(0))
-        return places
-
-    def check_output(self,
-                     atol=1e-5,
-                     no_check_set=None,
-                     equal_nan=False,
-                     check_dygraph=False,
-                     inplace_atol=None):
-        places = self._get_places()
-        for place in places:
-            self.check_output_with_place(place, atol, no_check_set, equal_nan,
-                                         check_dygraph)
-
-    def check_output_customized(self, checker):
-        places = self._get_places()
-        for place in places:
-            outs = self.calc_output(place)
-            outs = [np.array(out) for out in outs]
-            outs.sort(key=len)
-            checker(outs)
-
-    def _assert_is_close(self, numeric_grads, analytic_grads, names,
-                         max_relative_error, msg_prefix):
-
-        for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
-            abs_a = np.abs(a)
-            abs_a[abs_a < 1e-3] = 1
-
-            diff_mat = np.abs(a - b) / abs_a
-            max_diff = np.max(diff_mat)
-
-            def err_msg():
-                offset = np.argmax(diff_mat > max_relative_error)
-                return ("%s Variable %s max gradient diff %f over limit %f, "
-                        "the first error element is %d, expected %f, but got %f"
-                        ) % (msg_prefix, name, max_diff, max_relative_error,
-                             offset, a.flatten()[offset], b.flatten()[offset])
-
-            self.assertLessEqual(max_diff, max_relative_error, err_msg())
-
-    def check_grad(self,
-                   inputs_to_check,
-                   output_names,
-                   no_grad_set=None,
-                   numeric_grad_delta=0.005,
-                   in_place=False,
-                   max_relative_error=0.005,
-                   user_defined_grads=None):
-        places = self._get_places()
-        for place in places:
-            self.check_grad_with_place(place, inputs_to_check, output_names,
-                                       no_grad_set, numeric_grad_delta,
-                                       in_place, max_relative_error,
-                                       user_defined_grads)
-
-    def check_grad_with_place(self,
-                              place,
-                              inputs_to_check,
-                              output_names,
-                              no_grad_set=None,
-                              numeric_grad_delta=0.005,
-                              in_place=False,
-                              max_relative_error=0.005,
-                              user_defined_grads=None):
-        self.scope = core.Scope()
-        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
-        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
-        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
-
-        cache_list = None
-        if hasattr(self, "cache_name_list"):
-            cache_list = self.cache_name_list
-        self.op = create_op(
-            self.scope,
-            self.op_type,
-            op_inputs,
-            op_outputs,
-            op_attrs,
-            cache_list=cache_list)
-
-        if no_grad_set is None:
-            no_grad_set = set()
-
-        if not type(output_names) is list:
-            output_names = [output_names]
-
-        numeric_grads = user_defined_grads or [
-            get_numeric_gradient(
-                place,
-                self.scope,
-                self.op,
-                self.inputs,
-                input_to_check,
-                output_names,
-                delta=numeric_grad_delta,
-                in_place=in_place) for input_to_check in inputs_to_check
-        ]
-        analytic_grads = self._get_gradient(inputs_to_check, place,
-                                            output_names, no_grad_set)
-
-        self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
-                              max_relative_error,
-                              "Gradient Check On %s" % str(place))
-
-    @staticmethod
-    def _numpy_to_lod_tensor(np_value, lod, place):
-        tensor = core.LoDTensor()
-        tensor.set(np_value, place)
-        if lod is not None:
-            tensor.set_recursive_sequence_lengths(lod)
-        return tensor
-
-    @staticmethod
-    def np_dtype_to_fluid_dtype(input):
-        """Change the dtype of float16 numpy array
-
-        numpy float16 is binded to paddle::platform::float16
-        in tensor_py.h via the help of uint16 data type since
-        the internal memory representation of float16 is
-        uint16_t in paddle and np.uint16 in numpy, which are
-        themselves binded together by pybind.
-
-        Args:
-            input: input numpy array
-
-        Returns:
-            input: The dtype of input will be changed to np.uint16 if
-                it is originally np.float16, such that the internal memory
-                of input will be reinterpreted as of dtype np.uint16.
-        """
-        if input.dtype == np.float16:
-            input.dtype = np.uint16
-        return input
-
-    @staticmethod
-    def fluid_dtype_to_np_dtype(self, dtype):
-        """
-        See above, convert the dtype to normal type.
-        """
-        if dtype == np.uint16:
-            dtype = np.float16
-        return dtype
-
-    @staticmethod
-    def np_value_to_fluid_value(input):
-        if input.dtype == np.float16:
-            input = input.view(np.uint16)
-        return input
-
-    def _get_gradient(self,
-                      input_to_check,
-                      place,
-                      output_names,
-                      no_grad_set,
-                      parallel=False):
-        prog = Program()
-        block = prog.global_block()
-        self._append_ops(block)
-        loss = append_loss_ops(block, output_names)
-        param_grad_list = append_backward(
-            loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
-
-        inputs = self._get_inputs(block)
-        feed_dict = self.feed_var(inputs, place)
-
-        fetch_list = [g for p, g in param_grad_list]
-        if parallel:
-            use_cuda = False
-            if isinstance(place, fluid.CUDAPlace):
-                use_cuda = True
-            compiled_prog = fluid.CompiledProgram(prog).with_data_parallel(
-                loss_name=loss.name, places=place)
-            prog = compiled_prog
-        executor = fluid.Executor(place)
-        return list(
-            map(np.array,
-                executor.run(prog, feed_dict, fetch_list, return_numpy=False)))
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
deleted file mode 100644
index 3890236013c8a29288acde08198dd05abaeb6620..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import contextlib
-import unittest
-import numpy as np
-import six
-import pickle
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.dygraph as dygraph
-from paddle.fluid import core
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
-from paddle.fluid.dygraph.base import to_variable
-
-from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
-
-
-class SimpleImgConvPool(fluid.dygraph.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 pool_size,
-                 pool_stride,
-                 pool_padding=0,
-                 pool_type='max',
-                 global_pooling=False,
-                 conv_stride=1,
-                 conv_padding=0,
-                 conv_dilation=1,
-                 conv_groups=1,
-                 act=None,
-                 use_cudnn=False,
-                 param_attr=None,
-                 bias_attr=None):
-        super(SimpleImgConvPool, self).__init__(name_scope)
-
-        self._conv2d = Conv2D(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=None,
-            bias_attr=None,
-            use_cudnn=use_cudnn)
-
-        self._pool2d = Pool2D(
-            self.full_name(),
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
-
-    def forward(self, inputs):
-        x = self._conv2d(inputs)
-        x = self._pool2d(x)
-        return x
-
-
-class MNIST(fluid.dygraph.Layer):
-    def __init__(self, name_scope):
-        super(MNIST, self).__init__(name_scope)
-
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 1, 20, 5, 2, 2, act="relu")
-
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 20, 50, 5, 2, 2, act="relu")
-
-        pool_2_shape = 50 * 4 * 4
-        SIZE = 10
-        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(self.full_name(),
-                      10,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.NormalInitializer(
-                              loc=0.0, scale=scale)),
-                      act="softmax")
-
-    def forward(self, inputs, label):
-        x = self._simple_img_conv_pool_1(inputs)
-        x = self._simple_img_conv_pool_2(x)
-        cost = self._fc(x)
-        loss = fluid.layers.cross_entropy(cost, label)
-        avg_loss = fluid.layers.mean(loss)
-        return avg_loss
-
-
-class TestMnist(TestParallelDyGraphRunnerBase):
-    def get_model(self):
-        model = MNIST("mnist")
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
-        opt = fluid.optimizer.SGD(learning_rate=1e-3)
-        return model, train_reader, opt
-
-    def run_one_loop(self, model, opt, data):
-        batch_size = len(data)
-        dy_x_data = np.array([x[0].reshape(1, 28, 28)
-                              for x in data]).astype('float32')
-        y_data = np.array(
-            [x[1] for x in data]).astype('int64').reshape(batch_size, 1)
-        img = to_variable(dy_x_data)
-        label = to_variable(y_data)
-        label.stop_gradient = True
-
-        avg_loss = model(img, label)
-
-        return avg_loss
-
-
-if __name__ == "__main__":
-    runtime_main(TestMnist)
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
deleted file mode 100644
index bdf5b483812fb72c47794be72cfcbb57f3dea0c3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
+++ /dev/null
@@ -1,326 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import contextlib
-import unittest
-import numpy as np
-import six
-import pickle
-import sys
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.dygraph as dygraph
-from paddle.fluid import core
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, LayerNorm
-from paddle.fluid.dygraph.base import to_variable
-from paddle.fluid.layer_helper import LayerHelper
-import math
-from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
-
-momentum_rate = 0.9
-l2_decay = 1.2e-4
-
-
-def optimizer_setting(params):
-    ls = params["learning_strategy"]
-    if "total_images" not in params:
-        total_images = 6149
-    else:
-        total_images = params["total_images"]
-
-    batch_size = ls["batch_size"]
-    step = int(math.ceil(float(total_images) / batch_size))
-    bd = [step * e for e in ls["epochs"]]
-    lr = params["lr"]
-    num_epochs = params["num_epochs"]
-    optimizer = fluid.optimizer.Momentum(
-        learning_rate=fluid.layers.cosine_decay(
-            learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
-        momentum=momentum_rate,
-        regularization=fluid.regularizer.L2Decay(l2_decay))
-
-    return optimizer
-
-
-class ConvBNLayer(fluid.dygraph.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 stride=1,
-                 groups=1,
-                 act=None):
-        super(ConvBNLayer, self).__init__(name_scope)
-
-        self._conv = Conv2D(
-            "conv2d",
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=False,
-            param_attr=fluid.ParamAttr(name="weights"))
-
-        self._layer_norm = LayerNorm(self.full_name(), begin_norm_axis=1)
-
-    def forward(self, inputs):
-        y = self._conv(inputs)
-        y = self._layer_norm(y)
-
-        return y
-
-
-class SqueezeExcitation(fluid.dygraph.Layer):
-    def __init__(self, name_scope, num_channels, reduction_ratio):
-
-        super(SqueezeExcitation, self).__init__(name_scope)
-        self._pool = Pool2D(
-            self.full_name(), pool_size=0, pool_type='avg', global_pooling=True)
-        stdv = 1.0 / math.sqrt(num_channels * 1.0)
-        self._squeeze = FC(
-            self.full_name(),
-            size=num_channels // reduction_ratio,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(-stdv, stdv)),
-            act='relu')
-        stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0)
-        self._excitation = FC(
-            self.full_name(),
-            size=num_channels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(-stdv, stdv)),
-            act='sigmoid')
-
-    def forward(self, input):
-        y = self._pool(input)
-        y = self._squeeze(y)
-        y = self._excitation(y)
-        y = fluid.layers.elementwise_mul(x=input, y=y, axis=0)
-        return y
-
-
-class BottleneckBlock(fluid.dygraph.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_channels,
-                 num_filters,
-                 stride,
-                 cardinality,
-                 reduction_ratio,
-                 shortcut=True):
-        super(BottleneckBlock, self).__init__(name_scope)
-
-        self.conv0 = ConvBNLayer(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=1,
-            act="relu")
-        self.conv1 = ConvBNLayer(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            groups=cardinality,
-            act="relu")
-        self.conv2 = ConvBNLayer(
-            self.full_name(),
-            num_filters=num_filters * 2,
-            filter_size=1,
-            act=None)
-
-        self.scale = SqueezeExcitation(
-            self.full_name(),
-            num_channels=num_filters * 2,
-            reduction_ratio=reduction_ratio)
-
-        if not shortcut:
-            self.short = ConvBNLayer(
-                self.full_name(),
-                num_filters=num_filters * 2,
-                filter_size=1,
-                stride=stride)
-
-        self.shortcut = shortcut
-
-        self._num_channels_out = num_filters * 2
-
-    def forward(self, inputs):
-        y = self.conv0(inputs)
-        conv1 = self.conv1(y)
-        conv2 = self.conv2(conv1)
-        scale = self.scale(conv2)
-
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
-
-        y = fluid.layers.elementwise_add(x=short, y=scale, act='relu')
-        return y
-
-
-class SeResNeXt(fluid.dygraph.Layer):
-    def __init__(self, name_scope, layers=50, class_dim=102):
-        super(SeResNeXt, self).__init__(name_scope)
-
-        self.layers = layers
-        supported_layers = [50, 101, 152]
-        assert layers in supported_layers, \
-            "supported layers are {} but input layer is {}".format(supported_layers, layers)
-
-        if layers == 50:
-            cardinality = 32
-            reduction_ratio = 16
-            depth = [3, 4, 6, 3]
-            num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                self.full_name(),
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                self.full_name(),
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
-        elif layers == 101:
-            cardinality = 32
-            reduction_ratio = 16
-            depth = [3, 4, 23, 3]
-            num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                self.full_name(),
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                self.full_name(),
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
-        elif layers == 152:
-            cardinality = 64
-            reduction_ratio = 16
-            depth = [3, 8, 36, 3]
-            num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                self.full_name(),
-                num_filters=64,
-                filter_size=3,
-                stride=2,
-                act='relu')
-            self.conv1 = ConvBNLayer(
-                self.full_name(),
-                num_filters=64,
-                filter_size=3,
-                stride=1,
-                act='relu')
-            self.conv2 = ConvBNLayer(
-                self.full_name(),
-                num_filters=128,
-                filter_size=3,
-                stride=1,
-                act='relu')
-            self.pool = Pool2D(
-                self.full_name(),
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
-
-        self.bottleneck_block_list = []
-        num_channels = 64
-        for block in range(len(depth)):
-            shortcut = False
-            for i in range(depth[block]):
-                bottleneck_block = self.add_sublayer(
-                    'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        self.full_name(),
-                        num_channels=num_channels,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        cardinality=cardinality,
-                        reduction_ratio=reduction_ratio,
-                        shortcut=shortcut))
-                num_channels = bottleneck_block._num_channels_out
-                self.bottleneck_block_list.append(bottleneck_block)
-                shortcut = True
-
-        self.pool2d_avg = Pool2D(
-            self.full_name(), pool_size=7, pool_type='avg', global_pooling=True)
-        stdv = 1.0 / math.sqrt(2048 * 1.0)
-
-        self.out = FC(self.full_name(),
-                      size=class_dim,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.Uniform(-stdv, stdv)))
-
-    def forward(self, inputs):
-        if self.layers == 50 or self.layers == 101:
-            y = self.conv0(inputs)
-            y = self.pool(y)
-        elif self.layers == 152:
-            y = self.conv0(inputs)
-            y = self.conv1(inputs)
-            y = self.conv2(inputs)
-            y = self.pool(y)
-
-        for bottleneck_block in self.bottleneck_block_list:
-            y = bottleneck_block(y)
-        y = self.pool2d_avg(y)
-        y = self.out(y)
-        return y
-
-
-class TestSeResNeXt(TestParallelDyGraphRunnerBase):
-    def get_model(self):
-        model = SeResNeXt("se-resnext")
-        train_reader = paddle.batch(
-            paddle.dataset.flowers.test(use_xmap=False),
-            batch_size=4,
-            drop_last=True)
-
-        opt = fluid.optimizer.SGD(learning_rate=1e-3)
-        return model, train_reader, opt
-
-    def run_one_loop(self, model, opt, data):
-        bs = len(data)
-        dy_x_data = np.array([x[0].reshape(3, 224, 224)
-                              for x in data]).astype('float32')
-        y_data = np.array([x[1] for x in data]).astype('int64').reshape(bs, 1)
-        img = to_variable(dy_x_data)
-        label = to_variable(y_data)
-        label.stop_gradient = True
-
-        out = model(img)
-        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
-        loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
-        avg_loss = fluid.layers.mean(x=loss)
-        return avg_loss
-
-
-if __name__ == "__main__":
-    runtime_main(TestSeResNeXt)
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
deleted file mode 100644
index ef4779f0e6f2df2f0b79f776d1e7b6c5cbf31a22..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import multiprocessing
-import os
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid import compiler
-import time
-import numpy as np
-import math
-import sys
-from feed_data_reader import FeedDataReader
-
-__all__ = ['TestParallelExecutorBase']
-
-
-class TestParallelExecutorBase(unittest.TestCase):
-    @classmethod
-    def check_network_convergence(cls,
-                                  method,
-                                  use_cuda=True,
-                                  iter=50,
-                                  batch_size=None,
-                                  feed_dict=None,
-                                  feed_data_reader=None,
-                                  get_data_from_feeder=None,
-                                  use_parallel_executor=True,
-                                  use_reduce=False,
-                                  use_ir_memory_optimize=True,
-                                  enable_inplace=True,
-                                  fuse_elewise_add_act_ops=False,
-                                  fuse_all_optimizer_ops=False,
-                                  fuse_all_reduce_ops=False,
-                                  fuse_relu_depthwise_conv=False,
-                                  optimizer=fluid.optimizer.Adam,
-                                  use_fast_executor=False,
-                                  enable_sequential_execution=False):
-        def run_executor(exe, binary, feed, fetch_list):
-            if feed_data_reader is None:
-                res = exe.run(binary, feed=feed, fetch_list=fetch_list)
-            else:
-                res = exe.run(binary,
-                              feed=feed_data_reader.get_next(exe, binary),
-                              fetch_list=fetch_list)
-            return res
-
-        if feed_data_reader is not None:
-            assert isinstance(
-                feed_data_reader, FeedDataReader
-            ), "feed_data_reader must be type of FeedDataReader"
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        startup.random_seed = 1
-        main.random_seed = 1
-        with fluid.program_guard(main, startup):
-            feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
-                                              main, method, optimizer)
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup)
-
-        build_strategy, exec_strategy = cls.set_strategy(
-            enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops,
-            fuse_all_reduce_ops, fuse_elewise_add_act_ops,
-            fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize,
-            use_reduce, use_cuda)
-
-        if use_parallel_executor:
-            binary = compiler.CompiledProgram(main).with_data_parallel(
-                loss_name=loss.name,
-                build_strategy=build_strategy,
-                exec_strategy=exec_strategy)
-        else:
-            binary = main
-
-        if batch_size is not None:
-            batch_size *= fluid.core.get_cuda_device_count(
-            ) if use_cuda else int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-
-        begin = time.time()
-        first_loss, = run_executor(
-            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
-        for _ in range(iter):
-            run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
-        last_loss, = run_executor(
-            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
-        end = time.time()
-
-        if batch_size is not None:
-            print("%.4f Instance per second" % (
-                (batch_size * iter + 2) / (end - begin)))
-
-        avg_last_loss_val = np.array(last_loss).mean()
-        avg_first_loss_val = np.array(first_loss).mean()
-        if math.isnan(float(avg_last_loss_val)) or math.isnan(
-                float(avg_first_loss_val)):
-            sys.exit("got NaN loss, training failed.")
-
-        print(first_loss, last_loss)
-        # self.assertGreater(first_loss[0], last_loss[0])
-        return first_loss, last_loss
-
-    @classmethod
-    def check_pass_conflict(cls,
-                            method,
-                            use_cuda=True,
-                            feed_dict=None,
-                            get_data_from_feeder=None,
-                            use_reduce=False,
-                            use_ir_memory_optimize=True,
-                            enable_inplace=True,
-                            fuse_elewise_add_act_ops=False,
-                            fuse_all_optimizer_ops=False,
-                            fuse_all_reduce_ops=False,
-                            fuse_relu_depthwise_conv=False,
-                            optimizer=fluid.optimizer.Adam,
-                            use_fast_executor=True,
-                            enable_sequential_execution=False):
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
-                                              main, method, optimizer)
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup)
-
-        build_strategy, exec_strategy = cls.set_strategy(
-            enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops,
-            fuse_all_reduce_ops, fuse_elewise_add_act_ops,
-            fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize,
-            use_reduce, use_cuda)
-
-        binary = compiler.CompiledProgram(main).with_data_parallel(
-            loss_name=loss.name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
-
-        exe.run(binary, feed=feed_dict, fetch_list=[loss.name])
-
-    @classmethod
-    def set_strategy(cls, enable_inplace, enable_sequential_execution,
-                     fuse_all_optimizer_ops, fuse_all_reduce_ops,
-                     fuse_elewise_add_act_ops, fuse_relu_depthwise_conv,
-                     use_fast_executor, use_ir_memory_optimize, use_reduce,
-                     use_cuda):
-        exec_strategy = fluid.ExecutionStrategy()
-        if use_fast_executor:
-            exec_strategy.use_experimental_executor = True
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
-            if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
-        build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
-        build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
-        build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops
-        build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops
-        build_strategy.memory_optimize = use_ir_memory_optimize
-        build_strategy.enable_inplace = enable_inplace
-        build_strategy.enable_sequential_execution = enable_sequential_execution
-
-        if use_cuda and core.is_compiled_with_cuda():
-            build_strategy.remove_unnecessary_lock = True
-        return build_strategy, exec_strategy
-
-    @classmethod
-    def build_model(cls, feed_dict, get_data_from_feeder, main, method,
-                    optimizer):
-        loss = method(use_feed=feed_dict is not None)
-        # NOTE(zjl): memory_optimize/inplace pass would not require
-        # that loss.persistable = True.
-        # We set loss.persistable = False here to verify our memory
-        # optimization strategies intentionally.
-        loss.persistable = False
-        if optimizer:
-            optimizer().minimize(loss)
-
-        if get_data_from_feeder is not None:
-            assert feed_dict is None
-            feed_dict = get_data_from_feeder()
-        return feed_dict, loss
diff --git a/python/paddle/fluid/tests/unittests/py_precise_roi_pool.py b/python/paddle/fluid/tests/unittests/py_precise_roi_pool.py
deleted file mode 100644
index 618ffbdf9fc690b08ade81443a6515b1a74ebc12..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/py_precise_roi_pool.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import numpy as np
-
-
-class PyPrRoIPool(object):
-    def __init__(self):
-        pass
-
-    def _PrRoIPoolingGetData(self, data, h, w, height, width):
-        overflow = (h < 0) or (w < 0) or (h >= height) or (w >= width)
-        if overflow:
-            return 0.0
-        else:
-            return data[h][w]
-
-    def _PrRoIPoolingMatCalculation(self, this_data, s_h, s_w, e_h, e_w, y0, x0,
-                                    y1, x1, h0, w0):
-        sum_out = 0.0
-        alpha = x0 - float(s_w)
-        beta = y0 - float(s_h)
-        lim_alpha = x1 - float(s_w)
-        lim_beta = y1 - float(s_h)
-        tmp = (
-            lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha + 0.5 * alpha *
-            alpha) * (
-                lim_beta - 0.5 * lim_beta * lim_beta - beta + 0.5 * beta * beta)
-        sum_out += self._PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp
-
-        alpha = float(e_w) - x1
-        lim_alpha = float(e_w) - x0
-        tmp = (
-            lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha + 0.5 * alpha *
-            alpha) * (
-                lim_beta - 0.5 * lim_beta * lim_beta - beta + 0.5 * beta * beta)
-        sum_out += self._PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp
-
-        alpha = x0 - float(s_w)
-        beta = float(e_h) - y1
-        lim_alpha = x1 - float(s_w)
-        lim_beta = float(e_h) - y0
-        tmp = (
-            lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha + 0.5 * alpha *
-            alpha) * (
-                lim_beta - 0.5 * lim_beta * lim_beta - beta + 0.5 * beta * beta)
-        sum_out += self._PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp
-
-        alpha = float(e_w) - x1
-        lim_alpha = float(e_w) - x0
-        tmp = (
-            lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha + 0.5 * alpha *
-            alpha) * (
-                lim_beta - 0.5 * lim_beta * lim_beta - beta + 0.5 * beta * beta)
-        sum_out += self._PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp
-
-        return sum_out
-
-    def compute(self,
-                x,
-                rois,
-                output_channels,
-                spatial_scale=0.1,
-                pooled_height=1,
-                pooled_width=1):
-        '''
-        calculate the precise roi pooling values
-        Note: This function is implements as pure python without any paddle concept involved
-        :param x (array): array[N, C, H, W]
-        :param rois (array): ROIs[id, x1, y1, x2, y2] (Regions of Interest) to pool over.
-        :param output_channels (Integer): Expected output channels
-        :param spatial_scale (float): spatial scale, default = 0.1
-        :param pooled_height (Integer): Expected output height, default = 1
-        :param pooled_width (Integer): Expected output width, default = 1
-        :return: array[len(rois), output_channels, pooled_height, pooled_width]
-        '''
-        if not isinstance(output_channels, int):
-            raise TypeError("output_channels must be int type")
-        if not isinstance(spatial_scale, float):
-            raise TypeError("spatial_scale must be float type")
-        if not isinstance(pooled_height, int):
-            raise TypeError("pooled_height must be int type")
-        if not isinstance(pooled_width, int):
-            raise TypeError("pooled_width must be int type")
-
-        (batch_size, channels, height, width) = np.array(x).shape
-        rois_num = len(rois)
-        output_shape = (rois_num, output_channels, pooled_height, pooled_width)
-        out_data = np.zeros(output_shape)
-        for i in range(rois_num):
-            roi = rois[i]
-            roi_batch_id = int(roi[0])
-            roi_start_w = roi[1] * spatial_scale
-            roi_start_h = roi[2] * spatial_scale
-            roi_end_w = roi[3] * spatial_scale
-            roi_end_h = roi[4] * spatial_scale
-
-            roi_width = max(roi_end_w - roi_start_w, 0.0)
-            roi_height = max(roi_end_h - roi_start_h, 0.0)
-            bin_size_h = roi_height / float(pooled_height)
-            bin_size_w = roi_width / float(pooled_width)
-
-            x_i = x[roi_batch_id]
-
-            for c in range(output_channels):
-                for ph in range(pooled_height):
-                    for pw in range(pooled_width):
-                        win_start_w = roi_start_w + bin_size_w * pw
-                        win_start_h = roi_start_h + bin_size_h * ph
-                        win_end_w = win_start_w + bin_size_w
-                        win_end_h = win_start_h + bin_size_h
-
-                        win_size = max(0.0, bin_size_w * bin_size_h)
-                        if win_size == 0.0:
-                            out_data[i, c, ph, pw] = 0.0
-                        else:
-                            sum_out = 0
-
-                            s_w = math.floor(win_start_w)
-                            e_w = math.ceil(win_end_w)
-                            s_h = math.floor(win_start_h)
-                            e_h = math.ceil(win_end_h)
-
-                            c_in = (c * pooled_height + ph) * pooled_width + pw
-
-                            for w_iter in range(int(s_w), int(e_w)):
-                                for h_iter in range(int(s_h), int(e_h)):
-                                    sum_out += self._PrRoIPoolingMatCalculation(
-                                        x_i[c_in], h_iter, w_iter, h_iter + 1,
-                                        w_iter + 1,
-                                        max(win_start_h, float(h_iter)),
-                                        max(win_start_w, float(w_iter)),
-                                        min(win_end_h, float(h_iter) + 1.0),
-                                        min(win_end_w, float(w_iter + 1.0)),
-                                        height, width)
-
-                            out_data[i, c, ph, pw] = sum_out / win_size
-
-        return out_data
diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py
deleted file mode 100644
index 4d63c208de34dcaf81c36170bba4aaba143d1668..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/seresnext_net.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.fluid as fluid
-fluid.core._set_eager_deletion_mode(-1, -1, False)
-
-import paddle.fluid.layers.ops as ops
-from paddle.fluid.initializer import init_on_cpu
-from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
-from simple_nets import init_data
-import math
-import os
-os.environ['CPU_NUM'] = str(4)
-
-# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
-# and Executor is different. Because, for ParallelExecutor, the dropout_op of
-# the neural net will be copied N copies(N is the number of device). This will
-# lead to the random numbers generated by ParallelExecutor and Executor are different.
-# So, if we compare the loss of ParallelExecutor and Executor, we should remove the
-# dropout_op.
-remove_dropout = False
-
-# FIXME(zcd): If the neural net has batch_norm, the output of ParallelExecutor
-# and Executor is different.
-remove_bn = False
-
-remove_dropout = True
-remove_bn = True
-
-
-def squeeze_excitation(input, num_channels, reduction_ratio):
-    # pool = fluid.layers.pool2d(
-    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
-    conv = input
-    shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
-    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-
-    squeeze = fluid.layers.fc(input=pool,
-                              size=num_channels // reduction_ratio,
-                              act='relu')
-    excitation = fluid.layers.fc(input=squeeze,
-                                 size=num_channels,
-                                 act='sigmoid')
-    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
-    return scale
-
-
-def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
-                  act=None):
-    conv = fluid.layers.conv2d(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        stride=stride,
-        padding=(filter_size - 1) // 2,
-        groups=groups,
-        act=None,
-        bias_attr=False)
-    return conv if remove_bn else fluid.layers.batch_norm(
-        input=conv, act=act, momentum=0.1)
-
-
-def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1]
-    if ch_in != ch_out:
-        if stride == 1:
-            filter_size = 1
-        else:
-            filter_size = 3
-        return conv_bn_layer(input, ch_out, filter_size, stride)
-    else:
-        return input
-
-
-def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
-    # The number of first 1x1 convolutional channels for each bottleneck build block
-    # was halved to reduce the compution cost.
-    conv0 = conv_bn_layer(
-        input=input, num_filters=num_filters, filter_size=1, act='relu')
-    conv1 = conv_bn_layer(
-        input=conv0,
-        num_filters=num_filters * 2,
-        filter_size=3,
-        stride=stride,
-        groups=cardinality,
-        act='relu')
-    conv2 = conv_bn_layer(
-        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
-    scale = squeeze_excitation(
-        input=conv2,
-        num_channels=num_filters * 2,
-        reduction_ratio=reduction_ratio)
-
-    short = shortcut(input, num_filters * 2, stride)
-
-    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
-
-
-img_shape = [3, 224, 224]
-
-
-def SE_ResNeXt50Small(use_feed):
-
-    img = fluid.layers.data(name='image', shape=img_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    conv = conv_bn_layer(
-        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = fluid.layers.pool2d(
-        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
-
-    cardinality = 32
-    reduction_ratio = 16
-    depth = [3, 4, 6, 3]
-    num_filters = [128, 256, 512, 1024]
-
-    for block in range(len(depth)):
-        for i in range(depth[block]):
-            conv = bottleneck_block(
-                input=conv,
-                num_filters=num_filters[block],
-                stride=2 if i == 0 and block != 0 else 1,
-                cardinality=cardinality,
-                reduction_ratio=reduction_ratio)
-
-    shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
-    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-    dropout = pool if remove_dropout else fluid.layers.dropout(
-        x=pool, dropout_prob=0.2, seed=1)
-    # Classifier layer:
-    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def cosine_decay(learning_rate, step_each_epoch, epochs=120):
-    """
-    Applies cosine decay to the learning rate.
-    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
-    """
-    global_step = _decay_step_counter()
-
-    with init_on_cpu():
-        epoch = ops.floor(global_step / step_each_epoch)
-        decayed_lr = learning_rate * \
-                     (ops.cos(epoch * (math.pi / epochs)) + 1)/2
-    return decayed_lr
-
-
-def optimizer(learning_rate=0.01):
-    optimizer = fluid.optimizer.Momentum(
-        learning_rate=cosine_decay(
-            learning_rate=learning_rate, step_each_epoch=2, epochs=1),
-        momentum=0.9,
-        regularization=fluid.regularizer.L2Decay(1e-4))
-    return optimizer
-
-
-model = SE_ResNeXt50Small
-
-
-def batch_size():
-    return 12
-
-
-def iter(use_cuda):
-    if use_cuda:
-        return 10
-    return 2
-
-
-gpu_img, gpu_label = init_data(
-    batch_size=batch_size(), img_shape=img_shape, label_range=999)
-cpu_img, cpu_label = init_data(
-    batch_size=batch_size(), img_shape=img_shape, label_range=999)
-feed_dict_gpu = {"image": gpu_img, "label": gpu_label}
-feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
-
-
-def feed_dict(use_cuda):
-    if use_cuda:
-        return feed_dict_gpu
-    return feed_dict_cpu
diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py
deleted file mode 100644
index 65879d39d91145b2403ac1b0c29e51df1960c8d1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import seresnext_net
-import paddle.fluid.core as core
-from parallel_executor_test_base import TestParallelExecutorBase
-import numpy as np
-
-
-class TestResnetBase(TestParallelExecutorBase):
-    def _compare_result_with_origin_model(self,
-                                          check_func,
-                                          use_cuda,
-                                          delta2=1e-5,
-                                          compare_seperately=True):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        func_1_first_loss, func_1_last_loss = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=False,
-            optimizer=seresnext_net.optimizer)
-
-        func_2_first_loss, func_2_last_loss = check_func(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda)
-
-        if compare_seperately:
-            for loss in zip(func_1_first_loss, func_2_first_loss):
-                self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-            for loss in zip(func_1_last_loss, func_2_last_loss):
-                self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-        else:
-            self.assertAlmostEquals(
-                np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5)
-            self.assertAlmostEquals(
-                np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py
deleted file mode 100644
index f65bc7c3f430af6350519af937c7ac92dc4b919a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/simple_nets.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import numpy as np
-
-
-def simple_fc_net_with_inputs(img, label, class_num=10):
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=class_num, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def simple_fc_net(use_feed=None):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    return simple_fc_net_with_inputs(img, label, class_num=10)
-
-
-def fc_with_batchnorm(use_feed=None):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(2):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def bow_net(use_feed,
-            dict_dim,
-            is_sparse=False,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2):
-    """
-    BOW net
-    This model is from https://github.com/PaddlePaddle/models:
-    fluid/PaddleNLP/text_classification/nets.py
-    """
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
-    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
-    bow_tanh = fluid.layers.tanh(bow)
-    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
-    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
-    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    return avg_cost
-
-
-def init_data(batch_size=32, img_shape=[784], label_range=9):
-    np.random.seed(5)
-    assert isinstance(img_shape, list)
-    input_shape = [batch_size] + img_shape
-    img = np.random.random(size=input_shape).astype(np.float32)
-    label = np.array(
-        [np.random.randint(0, label_range) for _ in range(batch_size)]).reshape(
-            (-1, 1)).astype("int64")
-    return img, label
diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
deleted file mode 100644
index b57aaeb52a053babb2102aae10e8ed96eec634ae..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestAccuracyOp(OpTest):
-    def setUp(self):
-        self.op_type = "accuracy"
-        self.dtype = np.float32
-        self.init_dtype()
-        n = 8192
-        infer = np.random.random((n, 1)).astype(self.dtype)
-        indices = np.random.randint(0, 2, (n, 1)).astype('int64')
-        label = np.random.randint(0, 2, (n, 1)).astype('int64')
-        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
-        num_correct = 0
-        for rowid in range(n):
-            for ele in indices[rowid]:
-                if ele == label[rowid]:
-                    num_correct += 1
-                    break
-        self.outputs = {
-            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
-            'Correct': np.array([num_correct]).astype("int32"),
-            'Total': np.array([n]).astype("int32")
-        }
-
-    def init_dtype(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAccuracyOpFp16(TestAccuracyOp):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        self.check_output(atol=1e-3)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
deleted file mode 100644
index 733643287cea27edf7c39be6179d7c259d287034..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ /dev/null
@@ -1,127 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-import gradient_checker
-
-from decorator_helper import prog_scope
-
-
-class TestReluDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 3, 7, 9]
-        eps = 0.005
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        x.persistable = True
-        y = layers.relu(x)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        x_arr[np.abs(x_arr) < 0.005] = 0.02
-
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestLeakyReluDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 3, 7, 9]
-        eps = 0.005
-        alpha = 0.2
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        x.persistable = True
-
-        y = layers.leaky_relu(x, alpha=alpha)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        x_arr[np.abs(x_arr) < 0.005] = 0.02
-
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places = [fluid.CUDAPlace(0)]
-        for p in places:
-            self.func(p)
-
-
-class TestSqrtDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 3, 7, 9]
-        eps = 0.0001
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        x.persistable = True
-
-        y = layers.sqrt(x)
-        x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places = [fluid.CUDAPlace(0)]
-        for p in places:
-            self.func(p)
-
-
-class TestSquareDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
-        eps = 0.005
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        x.persistable = True
-        y = layers.square(x)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
deleted file mode 100644
index 012d0401b05b56154c90dd6bec56f4fdbe0a3a49..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ /dev/null
@@ -1,849 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-from scipy.special import expit, erf
-
-
-class TestActivation(OpTest):
-    def setUp(self):
-        self.op_type = "exp"
-        self.dtype = np.float32
-        self.init_dtype()
-        self.init_kernel_type()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        out = np.exp(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-    def init_dtype(self):
-        self.dtype = np.float32
-
-    def init_kernel_type(self):
-        pass
-
-
-class TestSigmoid(TestActivation):
-    def setUp(self):
-        self.op_type = "sigmoid"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        out = 1 / (1 + np.exp(-x))
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.01)
-
-
-class TestLogSigmoid(TestActivation):
-    def setUp(self):
-        self.op_type = "logsigmoid"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        out = np.log(1 / (1 + np.exp(-x)))
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.008)
-
-
-class TestTanh(TestActivation):
-    def setUp(self):
-        self.op_type = "tanh"
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        out = np.tanh(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestAtan(TestActivation):
-    def setUp(self):
-        self.op_type = "atan"
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        out = np.arctan(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestTanhShrink(TestActivation):
-    def setUp(self):
-        self.op_type = "tanh_shrink"
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [10, 17]).astype(self.dtype)
-        out = x - np.tanh(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.008)
-
-
-class TestHardShrink(TestActivation):
-    def setUp(self):
-        self.op_type = "hard_shrink"
-        self.init_dtype()
-
-        threshold = 0.5
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        out = np.copy(x)
-        out[(out >= -threshold) & (out <= threshold)] = 0
-
-        self.attrs = {'lambda': threshold}
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.005)
-
-
-class TestSoftShrink(TestActivation):
-    def setUp(self):
-        self.op_type = "softshrink"
-        self.init_dtype()
-
-        lambda_val = 0.1
-        x = np.random.uniform(0.25, 10, [4, 4]).astype(self.dtype)
-        out = np.copy(x)
-        out = (out < -lambda_val) * (out + lambda_val) + (out > lambda_val) * (
-            out - lambda_val)
-
-        self.attrs = {'lambda': lambda_val}
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestSqrt(TestActivation):
-    def setUp(self):
-        self.op_type = "sqrt"
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        out = np.sqrt(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestRsqrt(TestActivation):
-    def setUp(self):
-        self.op_type = "rsqrt"
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [2, 3]).astype(self.dtype)
-        out = 1.0 / np.sqrt(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.0005)
-
-
-class TestAbs(TestActivation):
-    def setUp(self):
-        self.op_type = "abs"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        # Because we set delta = 0.005 in calculating numeric gradient,
-        # if x is too small, such as 0.002, x_neg will be -0.003
-        # x_pos will be 0.007, so the numeric gradient is inaccurate.
-        # we should avoid this
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.abs(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestCeil(TestActivation):
-    def setUp(self):
-        self.op_type = "ceil"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        out = np.ceil(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    # The same reason with TestFloor
-    def test_check_grad(self):
-        pass
-
-
-class TestFloor(TestActivation):
-    def setUp(self):
-        self.op_type = "floor"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        out = np.floor(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    # the gradient on floor, ceil, round is undefined.
-    # we return zero as gradient, but the numpy return nan
-    # The same reason with TestFloor
-    def test_check_grad(self):
-        pass
-
-
-class TestCos(TestActivation):
-    def setUp(self):
-        self.op_type = "cos"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        out = np.cos(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestAcos(TestActivation):
-    def setUp(self):
-        self.op_type = "acos"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        out = np.arccos(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestSin(TestActivation):
-    def setUp(self):
-        self.op_type = "sin"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        out = np.sin(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestAsin(TestActivation):
-    def setUp(self):
-        self.op_type = "asin"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        out = np.arcsin(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestRound(TestActivation):
-    def setUp(self):
-        self.op_type = "round"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        out = np.round(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        pass
-
-
-class TestRelu(TestActivation):
-    def setUp(self):
-        self.op_type = "relu"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestLeakyRelu(TestActivation):
-    def setUp(self):
-        self.op_type = "leaky_relu"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0.02 * x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestGelu(TestActivation):
-    def setUp(self):
-        self.op_type = "gelu"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        out = 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestBRelu(TestActivation):
-    def setUp(self):
-        self.op_type = "brelu"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        t_min = 1.0
-        t_max = 4.0
-        # The same with TestAbs
-        x[np.abs(x - t_min) < 0.005] = t_min + 0.02
-        x[np.abs(x - t_max) < 0.005] = t_max + 0.02
-        t = np.copy(x)
-        t[t < t_min] = t_min
-        t[t > t_max] = t_max
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'t_min': t_min, 't_max': t_max}
-        self.outputs = {'Out': t}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.02)
-
-
-class TestRelu6(TestActivation):
-    def setUp(self):
-        self.op_type = "relu6"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [4, 10]).astype(self.dtype)
-        threshold = 6.0
-        # The same with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        x[np.abs(x - threshold) < 0.005] = threshold + 0.02
-        out = np.minimum(np.maximum(x, 0), threshold)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'threshold': threshold}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.02)
-
-
-class TestHardSwish(TestActivation):
-    def setUp(self):
-        self.op_type = 'hard_swish'
-        self.init_dtype()
-
-        x = np.random.uniform(-6, 6, [4, 4]).astype(self.dtype)
-        threshold = 6.0
-        scale = 6.0
-        offset = 3.0
-        #the same with TestAbs
-        x[np.abs(x + offset) < 0.005] = 0.02
-        x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02
-        out = x * np.minimum(np.maximum(x + offset, 0), threshold) / scale
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.02)
-
-
-class TestSoftRelu(TestActivation):
-    def setUp(self):
-        self.op_type = "soft_relu"
-        self.init_dtype()
-
-        x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
-        threshold = 2.0
-        # The same reason with TestAbs
-        x[np.abs(x - threshold) < 0.005] = threshold + 0.02
-        x[np.abs(x + threshold) < 0.005] = -threshold + 0.02
-        t = np.copy(x)
-        t[t < -threshold] = -threshold
-        t[t > threshold] = threshold
-        out = np.log((np.exp(t) + 1))
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'threshold': threshold}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.02)
-
-
-class TestELU(TestActivation):
-    def setUp(self):
-        self.op_type = "elu"
-        self.init_dtype()
-
-        x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
-        alpha = 1.
-        out = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
-        # Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1)
-        # is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here
-        self.inputs = {'X': x}
-        self.attrs = {'alpha': alpha}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.02)
-
-
-class TestReciprocal(TestActivation):
-    def setUp(self):
-        self.op_type = "reciprocal"
-        self.init_dtype()
-
-        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
-        out = np.reciprocal(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.01)
-
-
-class TestLog(TestActivation):
-    def setUp(self):
-        self.op_type = "log"
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        out = np.log(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestSquare(TestActivation):
-    def setUp(self):
-        self.op_type = "square"
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        out = np.square(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestPow(TestActivation):
-    def setUp(self):
-        self.op_type = "pow"
-        self.init_dtype()
-
-        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
-        out = np.power(x, 3)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'factor': 3.0}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.02)
-
-
-class TestPow_factor_tensor(TestActivation):
-    def setUp(self):
-        self.op_type = "pow"
-        self.init_dtype()
-
-        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
-        out = np.power(x, 3)
-
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(x),
-            'FactorTensor': np.array([3.0]).astype("float32")
-        }
-
-        self.attrs = {}
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.02)
-
-    def test_api(self):
-        import paddle.fluid as fluid
-
-        input = np.random.uniform(1, 2, [11, 17]).astype("float32")
-        x = fluid.layers.data(
-            name="x", shape=[11, 17], append_batch_size=False, dtype="float32")
-
-        factor_1 = 2.0
-        factor_2 = fluid.layers.fill_constant([1], "float32", 3.0)
-        out_1 = fluid.layers.pow(x, factor=factor_1)
-        out_2 = fluid.layers.pow(x, factor=factor_2)
-
-        exe = fluid.Executor(place=fluid.CPUPlace())
-        res_1, res_2 = exe.run(fluid.default_main_program(),
-                               feed={"x": input},
-                               fetch_list=[out_1, out_2])
-
-        assert np.array_equal(res_1, np.power(input, 2))
-        assert np.array_equal(res_2, np.power(input, 3))
-
-
-class TestSTanh(TestActivation):
-    def setUp(self):
-        self.op_type = "stanh"
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        scale_a = 2.0 / 3.0
-        scale_b = 1.7159
-        out = scale_b * np.tanh(x * scale_a)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'scale_a': scale_a, 'scale_b': scale_b}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestSoftplus(TestActivation):
-    def setUp(self):
-        self.op_type = "softplus"
-        self.init_dtype()
-        self.dtype = np.float64
-
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        out = np.log(1 + np.exp(x))
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestSoftsign(TestActivation):
-    def setUp(self):
-        self.op_type = "softsign"
-        self.init_dtype()
-
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        out = np.divide(x, 1 + np.abs(x))
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
-
-class TestThresholdedRelu(TestActivation):
-    def setUp(self):
-        self.op_type = "thresholded_relu"
-        self.init_dtype()
-
-        threshold = 0.25
-        self.relative_error = 0.005
-        X = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-
-        # Same reason as TestAbs
-        X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2
-        out = (X > threshold) * X
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
-        self.attrs = {'threshold': threshold}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=self.relative_error)
-
-
-class TestHardSigmoid(TestActivation):
-    def setUp(self):
-        self.op_type = "hard_sigmoid"
-        self.init_dtype()
-
-        self.relative_error = 0.002
-
-        X = np.random.uniform(-5, 5, [2, 2]).astype("float32")
-        slope = 0.2
-        offset = 0.5
-        lower_threshold = -offset / slope
-        upper_threshold = (1 - offset) / slope
-
-        # Same reason as TestAbs
-        X[np.abs(X - lower_threshold) < self.relative_error] = \
-            lower_threshold + 0.2
-        X[np.abs(X - upper_threshold) < self.relative_error] = \
-            upper_threshold - 0.2
-
-        temp = X * slope + offset
-        out = np.maximum(0.0, np.minimum(1.0, temp))
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.002)
-
-
-class TestSwish(TestActivation):
-    def setUp(self):
-        self.op_type = "swish"
-        self.init_dtype()
-
-        X = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        beta = 2.3
-        out = X * expit(beta * X)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
-        self.attrs = {'beta': beta}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.008)
-
-
-#------------------ Test Cudnn Activation----------------------
-def create_test_act_cudnn_class(parent, atol=1e-3, grad_atol=1e-3):
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "core is not compiled with CUDA")
-    class TestActCudnn(parent):
-        def init_kernel_type(self):
-            self.attrs = {"use_cudnn": True}
-
-    cls_name = "{0}_{1}".format(parent.__name__, "cudnn")
-    TestActCudnn.__name__ = cls_name
-    globals()[cls_name] = TestActCudnn
-
-
-create_test_act_cudnn_class(TestRelu)
-create_test_act_cudnn_class(TestRelu6)
-create_test_act_cudnn_class(TestSigmoid)
-create_test_act_cudnn_class(TestTanh)
-
-
-#------------------ Test Fp16 ----------------------
-def create_test_act_fp16_class(parent,
-                               atol=1e-3,
-                               grad_check=True,
-                               grad_atol=0.80):
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "core is not compiled with CUDA")
-    class TestActFp16(parent):
-        def init_dtype(self):
-            self.dtype = np.float16
-
-        def test_check_output(self):
-            place = core.CUDAPlace(0)
-            support_fp16 = core.is_float16_supported(place)
-            if support_fp16:
-                self.check_output_with_place(place, atol=atol)
-
-        def test_check_grad(self):
-            place = core.CUDAPlace(0)
-            support_fp16 = core.is_float16_supported(place)
-            if support_fp16 and grad_check:
-                self.check_grad_with_place(
-                    place, ['X'], 'Out', max_relative_error=grad_atol)
-
-    cls_name = "{0}_{1}".format(parent.__name__, "fp16")
-    TestActFp16.__name__ = cls_name
-    globals()[cls_name] = TestActFp16
-
-
-create_test_act_fp16_class(TestActivation)
-create_test_act_fp16_class(TestSigmoid)
-create_test_act_fp16_class(TestLogSigmoid)
-create_test_act_fp16_class(TestTanh)
-create_test_act_fp16_class(TestTanhShrink)
-create_test_act_fp16_class(TestHardShrink)
-create_test_act_fp16_class(TestSoftShrink)
-create_test_act_fp16_class(TestSqrt)
-create_test_act_fp16_class(TestAbs)
-create_test_act_fp16_class(TestCeil, grad_check=False)
-create_test_act_fp16_class(TestFloor, grad_check=False)
-create_test_act_fp16_class(TestCos, grad_atol=0.85)
-create_test_act_fp16_class(TestAcos, grad_atol=0.85)
-create_test_act_fp16_class(TestSin)
-create_test_act_fp16_class(TestAsin)
-create_test_act_fp16_class(TestAtan)
-create_test_act_fp16_class(TestRound, grad_check=False)
-create_test_act_fp16_class(TestRelu)
-create_test_act_fp16_class(TestGelu)
-create_test_act_fp16_class(TestBRelu)
-create_test_act_fp16_class(TestRelu6)
-create_test_act_fp16_class(TestSoftRelu)
-create_test_act_fp16_class(TestELU)
-create_test_act_fp16_class(TestReciprocal)
-create_test_act_fp16_class(TestLog)
-create_test_act_fp16_class(TestSquare)
-create_test_act_fp16_class(TestPow, atol=5e-2)
-create_test_act_fp16_class(TestPow_factor_tensor, atol=5e-2)
-create_test_act_fp16_class(TestSTanh, grad_atol=0.9)
-create_test_act_fp16_class(TestSoftplus)
-create_test_act_fp16_class(TestSoftsign)
-create_test_act_fp16_class(TestThresholdedRelu)
-create_test_act_fp16_class(TestHardSigmoid)
-create_test_act_fp16_class(TestSwish)
-create_test_act_fp16_class(TestHardSwish)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
deleted file mode 100644
index 969a7da3b71b69296f3313342adbf989c60edb50..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestAdadeltaOp1(OpTest):
-    def setUp(self):
-        self.op_type = "adadelta"
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        # The squared gradient is positive
-        avg_squared_grad = np.random.random((102, 105)).astype("float32")
-        # The squared update is positive
-        avg_squared_update = np.random.random((102, 105)).astype("float32")
-
-        rho = 0.95
-        epsilon = 1e-6
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'AvgSquaredGrad': avg_squared_grad,
-            'AvgSquaredUpdate': avg_squared_update
-        }
-
-        self.attrs = {'rho': rho, 'epsilon': epsilon}
-
-        avg_squared_grad_out = rho * avg_squared_grad + \
-            (1 - rho) * np.square(grad)
-        update = -np.multiply(
-            np.sqrt(
-                np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
-                          epsilon)), grad)
-
-        avg_squared_update_out = rho * avg_squared_update + \
-            (1 - rho) * np.square(update)
-
-        param_out = param + update
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'AvgSquaredGradOut': avg_squared_grad_out,
-            'AvgSquaredUpdateOut': avg_squared_update_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAdadeltaOp2(OpTest):
-    '''Test Adadelta op with default attribute values
-    '''
-
-    def setUp(self):
-        self.op_type = "adadelta"
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        # The squared gradient is positive
-        avg_squared_grad = np.random.random((102, 105)).astype("float32")
-        # The squared update is positive
-        avg_squared_update = np.random.random((102, 105)).astype("float32")
-
-        rho = 0.95
-        epsilon = 1e-6
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'AvgSquaredGrad': avg_squared_grad,
-            'AvgSquaredUpdate': avg_squared_update
-        }
-
-        avg_squared_grad_out = rho * avg_squared_grad + \
-            (1 - rho) * np.square(grad)
-        update = -np.multiply(
-            np.sqrt(
-                np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
-                          epsilon)), grad)
-
-        avg_squared_update_out = rho * avg_squared_update + \
-            (1 - rho) * np.square(update)
-
-        param_out = param + update
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'AvgSquaredGradOut': avg_squared_grad_out,
-            'AvgSquaredUpdateOut': avg_squared_update_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_adagrad_op.py
deleted file mode 100644
index fc3b7ce2fd87afc22030bcca55236fb949c1f129..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_adagrad_op.py
+++ /dev/null
@@ -1,192 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from op_test import OpTest
-import math
-
-
-class TestAdagradOp1(OpTest):
-    ''' Test Adagrad operator with explicit attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "adagrad"
-
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-        lr = 0.01
-        epsilon = 1e-8
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment': moment,
-            'LearningRate': np.array([lr]).astype("float32")
-        }
-
-        self.attrs = {'epsilon': epsilon}
-
-        moment_out = moment + grad * grad
-        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
-
-        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAdagradOp2(OpTest):
-    ''' Test Adagrad operator with default attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "adagrad"
-
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-        lr = 0.01
-        epsilon = 1e-6
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment': moment,
-            'LearningRate': np.array([lr]).astype("float32")
-        }
-
-        self.attrs = {'epsilon': epsilon}
-
-        moment_out = moment + grad * grad
-        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
-
-        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSparseAdagradOp(unittest.TestCase):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Grad Variable   
-        height = 10
-        rows = [0, 4, 7, 4]
-        row_numel = 12
-
-        grad_selected_rows = scope.var('Grad').get_selected_rows()
-        grad_selected_rows.set_height(height)
-        grad_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
-        np_array[2, 8] = 4.0
-
-        grad_tensor = grad_selected_rows.get_tensor()
-        grad_tensor.set(np_array, place)
-
-        # create and initialize Param Variable
-        param = scope.var('Param').get_tensor()
-        param_array = np.full((height, row_numel), 5.0).astype("float32")
-        param.set(param_array, place)
-
-        # create and initialize LeraningRate Variable
-        lr = scope.var('LearningRate').get_tensor()
-        lr_array = np.full((1), 2.0).astype("float32")
-        lr.set(lr_array, place)
-
-        # create and initialize moment Variable
-        moment = scope.var('Moment').get_tensor()
-        moment_np_array = np.full((height, row_numel), 2.0).astype("float32")
-        moment.set(moment_np_array, place)
-
-        # create and run sgd operator
-        adagrad_op = Operator(
-            "adagrad",
-            Param='Param',
-            Grad='Grad',
-            ParamOut='Param',
-            Moment='Moment',
-            MomentOut='Moment',
-            LearningRate='LearningRate',
-            epsilon=2.0)
-
-        adagrad_op.run(scope, place)
-
-        # get and compare moment result
-        moment_result_array = np.array(moment)
-
-        self.assertAlmostEqual(6.0, moment_result_array[rows[0], 0])
-        self.assertAlmostEqual(3.0, moment_result_array[rows[0], 2])
-        self.assertAlmostEqual(2.0, moment_result_array[1, 0])
-        # 2.0 + (1.0 + 1.0)^2
-        self.assertAlmostEqual(6.0, moment_result_array[rows[1], 10])
-        self.assertAlmostEqual(6.0, moment_result_array[rows[3], 4])
-
-        self.assertAlmostEqual(2.0, moment_result_array[5, 8])
-        self.assertAlmostEqual(3.0, moment_result_array[rows[2], 1])
-        self.assertAlmostEqual(18.0, moment_result_array[rows[2], 8])
-
-        # get and compare param result
-        result_array = np.array(param)
-
-        def get_out(param, lr, grad, m, epsilon):
-            return param - lr * grad / (math.sqrt(m) + epsilon)
-
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 2.0, 6.0, 2.0),
-            result_array[rows[0], 0],
-            places=5)
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 1.0, 3.0, 2.0),
-            result_array[rows[0], 2],
-            places=5)
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[1, 0], places=5)
-
-        # grad_merge = 1.0 + 1.0
-        # m = 6.0
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 2.0, 6.0, 2.0),
-            result_array[rows[1], 10],
-            places=5)
-
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[5, 8], places=5)
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 1.0, 3.0, 2.0),
-            result_array[rows[2], 1],
-            places=5)
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 4.0, 18.0, 2.0),
-            result_array[rows[2], 8],
-            places=5)
-
-    def test_sparse_adagrad(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
deleted file mode 100644
index 15f277cdc0aca30b8c768b6a6ee20e44880b2304..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ /dev/null
@@ -1,334 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from paddle.fluid import core
-from paddle.fluid.op import Operator
-
-
-class TestAdamOp1(OpTest):
-    def setUp(self):
-        '''Test Adam Op with supplied attributes
-        '''
-        self.op_type = "adam"
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        # The second moment is positive
-        moment2 = np.random.random((102, 105)).astype("float32")
-
-        learning_rate = 0.004
-        beta1 = 0.78
-        beta2 = 0.836
-        epsilon = 1e-4
-        beta1_pow = beta1**10
-        beta2_pow = beta2**10
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment1': moment1,
-            'Moment2': moment2,
-            'LearningRate': np.array([learning_rate]).astype("float32"),
-            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
-            'Beta2Pow': np.array([beta2_pow]).astype("float32")
-        }
-
-        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
-
-        param_out, moment1_out, \
-            moment2_out = adam_step(self.inputs, self.attrs)
-
-        self.outputs = {
-            'Moment1Out': moment1_out,
-            'Moment2Out': moment2_out,
-            'ParamOut': param_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAdamOp2(OpTest):
-    def setUp(self):
-        '''Test Adam Op with supplied attributes
-        '''
-        self.op_type = "adam"
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        # The second moment is positive
-        moment2 = np.random.random((102, 105)).astype("float32")
-
-        learning_rate = 0.001
-        beta1 = 0.9
-        beta2 = 0.999
-        epsilon = 1e-8
-        beta1_pow = beta1**10
-        beta2_pow = beta2**10
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment1': moment1,
-            'Moment2': moment2,
-            'LearningRate': np.array([learning_rate]).astype("float32"),
-            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
-            'Beta2Pow': np.array([beta2_pow]).astype("float32")
-        }
-
-        attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
-
-        param_out, moment1_out, \
-            moment2_out = adam_step(self.inputs, attributes)
-
-        self.outputs = {
-            'Moment1Out': moment1_out,
-            'Moment2Out': moment2_out,
-            'ParamOut': param_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAdamOpMultipleSteps(OpTest):
-    def setUp(self):
-        '''Test Adam Operator with supplied attributes
-        '''
-        self.op_type = "adam"
-        self.num_steps = 10
-
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        # The second moment is positive
-        moment2 = np.random.random((102, 105)).astype("float32")
-
-        learning_rate = 0.001
-        beta1 = 0.9
-        beta2 = 0.999
-        epsilon = 1e-8
-        beta1_pow = beta1**10
-        beta2_pow = beta2**10
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment1': moment1,
-            'Moment2': moment2,
-            'LearningRate': np.array([learning_rate]).astype("float32"),
-            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
-            'Beta2Pow': np.array([beta2_pow]).astype("float32")
-        }
-
-        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
-
-    def test_check_output(self):
-        for _ in range(self.num_steps):
-            param_out, moment1_out, \
-                moment2_out = adam_step(self.inputs, self.attrs)
-
-            self.outputs = {
-                'Moment1Out': moment1_out,
-                'Moment2Out': moment2_out,
-                'ParamOut': param_out
-            }
-
-            # Verify output for this step
-            self.check_output()
-
-            # Output of this step becomes input for next step
-            self.inputs['Param'] = param_out
-            self.inputs['Moment1'] = moment1_out
-            self.inputs['Moment2'] = moment2_out
-
-            # Update powers of Beta1 and Beta2 for next time step
-            self.inputs['Beta1Pow'] *= self.attrs['beta1']
-            self.inputs['Beta2Pow'] *= self.attrs['beta1']
-
-            # Randomize gradient for next step
-            self.inputs['Grad'] = np.random.uniform(
-                -1, 1, (102, 105)).astype("float32")
-
-
-def adam_step(inputs, attributes):
-    '''
-    Simulate one step of the adam optimizer
-    :param inputs: dict of inputs
-    :param attributes: dict of attributes
-    :return tuple: tuple of output param, moment1, moment2,
-    beta1 power accumulator and beta2 power accumulator
-    '''
-    param = inputs['Param']
-    grad = inputs['Grad']
-    moment1 = inputs['Moment1']
-    moment2 = inputs['Moment2']
-    lr = inputs['LearningRate']
-    beta1_pow = inputs['Beta1Pow']
-    beta2_pow = inputs['Beta2Pow']
-
-    beta1 = attributes['beta1']
-    beta2 = attributes['beta2']
-    epsilon = attributes['epsilon']
-
-    moment1_out = beta1 * moment1 + (1 - beta1) * grad
-    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
-    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
-    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
-    return param_out, moment1_out, moment2_out
-
-
-def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
-                     lazy_mode):
-    '''
-    Simulate one step of the adam optimizer
-    :param inputs: dict of inputs
-    :param attributes: dict of attributes
-    :return tuple: tuple of output param, moment1, moment2,
-    beta1 power accumulator and beta2 power accumulator
-    '''
-    param = inputs['Param']
-    # grad = inputs['Grad']
-    moment1 = inputs['Moment1']
-    moment2 = inputs['Moment2']
-    lr = inputs['LearningRate']
-    beta1_pow = inputs['Beta1Pow']
-    beta2_pow = inputs['Beta2Pow']
-
-    beta1 = attributes['beta1']
-    beta2 = attributes['beta2']
-    epsilon = attributes['epsilon']
-
-    moment1_out = np.zeros(shape=[height, row_numel])
-    moment2_out = np.zeros(shape=[height, row_numel])
-    param_out = np.zeros(shape=[height, row_numel])
-
-    def update_row(row_id, update_value):
-        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * update_value
-        moment2_out[row_id] = beta2 * moment2[row_id] + (
-            1 - beta2) * np.square(update_value)
-        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
-        param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
-            np.sqrt(moment2_out[row_id]) + epsilon))
-
-    if lazy_mode:
-        for idx, row_id in enumerate(rows):
-            update_row(row_id, np_grad[idx])
-    else:
-        for row_id in range(param_out.shape[0]):
-            update_value = np.zeros(np_grad[0].shape).astype("float32")
-            if row_id in rows:
-                update_value = np_grad[rows.index(row_id)]
-            update_row(row_id, update_value)
-
-    return param_out, moment1_out, moment2_out
-
-
-class TestSparseAdamOp(unittest.TestCase):
-    def setup(self, scope, place, lazy_mode):
-        beta1 = 0.78
-        beta2 = 0.836
-        epsilon = 1e-4
-
-        height = 10
-        rows = [0, 4, 7]
-        self.rows = rows
-        row_numel = 12
-        self.row_numel = row_numel
-        self.dense_inputs = {
-            "Param": np.full((height, row_numel), 5.0).astype("float32"),
-            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
-            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
-            'Beta1Pow': np.array([beta1**10]).astype("float32"),
-            'Beta2Pow': np.array([beta2**10]).astype("float32"),
-            "LearningRate": np.full((1), 2.0).astype("float32")
-        }
-        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
-        self.attrs = {
-            'epsilon': epsilon,
-            'beta1': beta1,
-            'beta2': beta2,
-            'min_row_size_to_use_multithread': 2
-        }
-
-        grad_selected_rows = scope.var('Grad').get_selected_rows()
-        grad_selected_rows.set_height(height)
-        grad_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
-        np_array[2, 8] = 4.0
-
-        grad_tensor = grad_selected_rows.get_tensor()
-        grad_tensor.set(np_array, place)
-
-        self.sparse_inputs = ["Grad"]
-
-        param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
-                                                 height, rows, row_numel,
-                                                 np_array, lazy_mode)
-        self.outputs = {
-            "ParamOut": param_out,
-            "Moment1Out": mom1,
-            "Moment2Out": mom2
-        }
-
-    def check_with_place(self, place, lazy_mode):
-        scope = core.Scope()
-        self.setup(scope, place, lazy_mode)
-
-        op_args = dict()
-        op_args['lazy_mode'] = lazy_mode
-        for key, np_array in self.dense_inputs.items():
-            var = scope.var(key).get_tensor()
-            var.set(np_array, place)
-            op_args[key] = key
-        for s in self.sparse_inputs:
-            op_args[s] = s
-        for s in self.outputs:
-            var = scope.var(s).get_tensor()
-            var.set(self.init_output, place)
-            op_args[s] = s
-        for k in self.attrs:
-            op_args[k] = self.attrs[k]
-
-        # create and run sgd operator
-        adam_op = Operator("adam", **op_args)
-        adam_op.run(scope, place)
-
-        for key, np_array in self.outputs.items():
-            out_var = scope.var(key).get_tensor()
-            actual = np.array(out_var)
-            actual = actual.reshape([actual.size])
-            np_array = np_array.reshape([np_array.size])
-
-            for i in range(np_array.size):
-                self.assertLess((actual[i] - np_array[i]), 0.00001)
-
-    def test_sparse_adam(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            for lazy_mode in (True, False):
-                self.check_with_place(place, lazy_mode)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_op.py b/python/paddle/fluid/tests/unittests/test_adamax_op.py
deleted file mode 100644
index a6d1be7616c73019cd8f66dcf0c108cd58ec600b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_adamax_op.py
+++ /dev/null
@@ -1,188 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestAdamaxOp1(OpTest):
-    def setUp(self):
-        '''Test Adamax Operator with supplied attributes
-        '''
-        self.op_type = "adamax"
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        # The infinity norm is positive
-        inf_norm = np.random.random((102, 105)).astype("float32")
-
-        learning_rate = 0.002
-        beta1 = 0.78
-        beta2 = 0.899
-        epsilon = 1e-5
-        beta1_pow = beta1**10
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment': moment,
-            'InfNorm': inf_norm,
-            'LearningRate': np.array([learning_rate]).astype("float32"),
-            'Beta1Pow': np.array([beta1_pow]).astype("float32")
-        }
-
-        self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-
-        param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
-                                                          self.attrs)
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAdamaxOp2(OpTest):
-    '''Test Adamax Operator with default attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "adamax"
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        # The infinity norm is positive
-        inf_norm = np.random.random((102, 105)).astype("float32")
-
-        learning_rate = 0.002
-        beta1 = 0.9
-        beta2 = 0.999
-        epsilon = 1e-8
-        beta1_pow = beta1**8
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment': moment,
-            'InfNorm': inf_norm,
-            'LearningRate': np.array([learning_rate]).astype("float32"),
-            'Beta1Pow': np.array([beta1_pow]).astype("float32")
-        }
-
-        attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-        param_out, moment_out, inf_norm_out = adamax_step(self.inputs, attrs)
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAdamaxOpMultipleSteps(OpTest):
-    def setUp(self):
-        '''Test Adamax Operator with supplied attributes
-        '''
-        self.op_type = "adamax"
-        self.num_steps = 10
-
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        # The infinity norm is positive
-        inf_norm = np.random.random((102, 105)).astype("float32")
-
-        learning_rate = 0.002
-        beta1 = 0.8
-        beta2 = 0.99
-        epsilon = 1e-5
-        beta1_pow = 1
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment': moment,
-            'InfNorm': inf_norm,
-            'LearningRate': np.array([learning_rate]).astype("float32"),
-            'Beta1Pow': np.array([beta1_pow]).astype("float32")
-        }
-
-        self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-
-    def test_check_output(self):
-        for _ in range(self.num_steps):
-            param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
-                                                              self.attrs)
-
-            self.outputs = {
-                'ParamOut': param_out,
-                'MomentOut': moment_out,
-                'InfNormOut': inf_norm_out
-            }
-
-            # Verify output for this step
-            self.check_output()
-
-            # Output of this step becomes input for next step
-            self.inputs['Param'] = param_out
-            self.inputs['Moment'] = moment_out
-            self.inputs['InfNorm'] = inf_norm_out
-
-            # Update Beta1 Power accumulator for next step
-            self.inputs['Beta1Pow'] *= self.attrs['beta1']
-
-            # Randomize gradient for next step
-            self.inputs['Grad'] = np.random.uniform(
-                -1, 1, (102, 105)).astype("float32")
-
-
-def adamax_step(inputs, attributes):
-    '''
-    Simulate one step of the adamax optimizer
-    :param inputs: dict of inputs
-    :param attributes: dict of attributes
-    :return tuple: tuple of output param, moment, inf_norm and
-    beta1 power accumulator
-    '''
-    param = inputs['Param']
-    grad = inputs['Grad']
-    moment = inputs['Moment']
-    inf_norm = inputs['InfNorm']
-    lr = inputs['LearningRate']
-    beta1_pow = inputs['Beta1Pow']
-
-    beta1 = attributes['beta1']
-    beta2 = attributes['beta2']
-    epsilon = attributes['epsilon']
-
-    moment_out = beta1 * moment + (1 - beta1) * grad
-    inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
-    lr_t = (lr / (1 - beta1_pow))
-    param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
-
-    return param_out, moment_out, inf_norm_out
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
deleted file mode 100644
index 3f2a33793028f0883ffe94dd8a32626ad5c0351c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
+++ /dev/null
@@ -1,134 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import numpy as np
-import math
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-class TestAddPositionEncodingTensorOp(OpTest):
-    """
-    This class is to test the AddPositionEncodingOp
-    """
-
-    def setUp(self):
-        """
-        the prepared section for add position encoding op
-        """
-        self.op_type = "add_position_encoding"
-        self.dtype = np.float32
-        self.init_input_output()
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x), }
-        self.outputs = {'Out': self.out}
-        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
-
-    def test_check_output(self):
-        """
-        check the correctness of output
-        """
-        self.check_output()
-
-    def test_check_grad(self):
-        """
-        check the correctness of grad
-        """
-        self.check_grad(['X'], 'Out', max_relative_error=0.005)
-
-    def init_input_output(self):
-        """
-        init the input and output for test cases
-        """
-        self.alpha = 0.6
-        self.beta = 0.5
-        self.x = np.random.uniform(0.1, 1, [2, 4, 4]).astype(self.dtype)
-        self.out = np.copy(self.x)
-
-        batch_size = self.x.shape[0]
-        max_length = self.x.shape[1]
-        enc_size = self.x.shape[2]
-
-        half_shape = int(enc_size / 2)
-        for i in range(batch_size):
-            for j in range(max_length):
-                for k in range(half_shape):
-                    val = j / pow(10000.0, k / (
-                        half_shape - 1)) if half_shape > 1 else j / 10000.0
-                    self.out[i, j, k] = \
-                        self.x[i, j, k] * self.alpha + math.sin(val) * self.beta
-                    self.out[i, j, half_shape + k] = \
-                        self.x[i, j, half_shape + k] * self.alpha + math.cos(val) * self.beta
-
-
-class TestAddPositionEncodingLoDTensorOp(OpTest):
-    """
-    This class is to test the AddPositionEncodingLoDTensorOp
-    """
-
-    def setUp(self):
-        """
-        the prepared section for add position encoding LoDTensor op
-        """
-        self.op_type = "add_position_encoding"
-        self.dtype = np.float32
-        self.init_input_output()
-
-        self.inputs = {'X': (self.x, self.lod), }
-        self.outputs = {'Out': (self.out, self.lod)}
-        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
-
-    def test_check_output(self):
-        """
-        check the correctness of output
-        """
-        self.check_output()
-
-    def test_check_grad(self):
-        """
-        check the correctness of grad
-        """
-        self.check_grad(['X'], 'Out', max_relative_error=0.005)
-
-    def init_input_output(self):
-        """
-        init the input and output for test cases
-        """
-        self.alpha = 0.6
-        self.beta = 0.5
-        self.x = np.random.uniform(0.1, 1, [10, 4]).astype(self.dtype)
-        self.lod = [[3, 7]]
-        self.out = np.copy(self.x)
-
-        batch_size = len(self.lod[0])
-        enc_size = self.x.shape[1]
-
-        start = 0
-        half_shape = int(enc_size / 2)
-        for i in range(batch_size):
-            max_length = self.lod[0][i]
-            for j in range(max_length):
-                for k in range(half_shape):
-                    val = j / pow(10000.0, k / (
-                        half_shape - 1)) if half_shape > 1 else j / 10000.0
-                    pos = start + j
-                    self.out[pos, k] = \
-                        self.x[pos, k] * self.alpha + math.sin(val) * self.beta
-                    self.out[pos, half_shape + k] = \
-                        self.x[pos, half_shape + k] * self.alpha + math.cos(val) * self.beta
-            start += max_length
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
deleted file mode 100644
index 90aa69bca6cec8f2892f7cc2f51be6f244650384..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
+++ /dev/null
@@ -1,118 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-
-
-def affine_channel(x, scale, bias, layout):
-    C = x.shape[1] if layout == 'NCHW' else x.shape[-1]
-    if len(x.shape) == 4:
-        new_shape = (1, C, 1, 1) if layout == 'NCHW' else (1, 1, 1, C)
-    else:
-        new_shape = (1, C)
-    scale = scale.reshape(new_shape)
-    bias = bias.reshape(new_shape)
-    return x * scale + bias
-
-
-class TestAffineChannelOp(OpTest):
-    def setUp(self):
-        self.op_type = "affine_channel"
-        self.init_test_case()
-
-        x = np.random.random(self.shape).astype("float32")
-        scale = np.random.random(self.C).astype("float32")
-        bias = np.random.random(self.C).astype("float32")
-
-        y = affine_channel(x, scale, bias, self.layout)
-
-        self.inputs = {'X': x, 'Scale': scale, 'Bias': bias}
-        self.attrs = {'data_layout': self.layout}
-        self.outputs = {'Out': y}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X', 'Scale', 'Bias'], 'Out')
-
-    def test_check_grad_stopgrad_dx(self):
-        self.check_grad(['Scale', 'Bias'], 'Out', no_grad_set=set('X'))
-
-    def test_check_grad_stopgrad_dscale_dbias(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set(['Scale', 'Bias']))
-
-    def init_test_case(self):
-        self.shape = [2, 32, 14, 14]
-        self.C = 32
-        self.layout = 'NCHW'
-
-
-class TestAffineChannelNHWC(TestAffineChannelOp):
-    def init_test_case(self):
-        self.shape = [2, 14, 14, 32]
-        self.C = 32
-        self.layout = 'NHWC'
-
-    def test_check_grad_stopgrad_dx(self):
-        return
-
-    def test_check_grad_stopgrad_dscale_dbias(self):
-        return
-
-
-class TestAffineChannel2D(TestAffineChannelOp):
-    def init_test_case(self):
-        self.shape = [16, 64]
-        self.C = 64
-        self.layout = 'NCHW'
-
-    def test_check_grad_stopgrad_dx(self):
-        return
-
-    def test_check_grad_stopgrad_dscale_dbias(self):
-        return
-
-
-class TestAffineChannelNCHWLargeShape(TestAffineChannelOp):
-    def init_test_case(self):
-        self.shape = [4, 128, 112, 112]
-        self.C = 128
-        self.layout = 'NCHW'
-
-    # since the gradient check is very slow in large shape, so skip check_grad
-    def test_check_grad(self):
-        pass
-
-    def test_check_grad_stopgrad_dx(self):
-        pass
-
-    def test_check_grad_stopgrad_dscale_dbias(self):
-        pass
-
-
-class TestAffineChannelNHWCLargeShape(TestAffineChannelNCHWLargeShape):
-    def init_test_case(self):
-        self.shape = [64, 32, 32, 128]
-        self.C = 128
-        self.layout = 'NHWC'
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
deleted file mode 100644
index 576d00940c4c7a5e30af5550e14b674a73e7df11..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def AffineGrid(theta, size):
-    n = size[0]
-    w = size[3]
-    h = size[2]
-    h_idx = np.repeat(
-        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
-    w_idx = np.repeat(
-        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
-    grid = np.concatenate(
-        [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
-    grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
-
-    ret = np.zeros([n, h * w, 2])
-    theta = theta.transpose([0, 2, 1])
-    for i in range(len(theta)):
-        ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
-
-#    print ret.reshape([h * w, 2]).astype("float32")    
-    return ret.reshape([n, h, w, 2]).astype("float32")
-
-
-class TestAffineGridOp(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = "affine_grid"
-        theta = np.random.randint(1, 3, self.theta_shape).astype("float32")
-        theta = np.ones(self.theta_shape).astype("float32")
-        self.inputs = {'Theta': theta}
-        self.attrs = {"use_cudnn": True}
-        if self.dynamic_shape:
-            self.inputs['OutputShape'] = self.output_shape
-        else:
-            self.attrs['output_shape'] = self.output_shape
-        self.outputs = {'Output': AffineGrid(theta, self.output_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(
-            ['Theta'],
-            'Output',
-            no_grad_set=['OutputShape'],
-            max_relative_error=0.006)
-
-    def initTestCase(self):
-        self.theta_shape = (3, 2, 3)
-        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
-        self.dynamic_shape = False
-
-
-class TestAffineGridOpCase1(TestAffineGridOp):
-    def initTestCase(self):
-        self.theta_shape = (3, 2, 3)
-        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
-        self.dynamic_shape = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_allgather.py b/python/paddle/fluid/tests/unittests/test_allgather.py
deleted file mode 100644
index 877ae6f6e16c2269d7674c38b1ec30ad02f453c0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_allgather.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-
-from test_collective_base import TestDistBase
-
-
-class TestAllGatherOp(TestDistBase):
-    def _setup_config(self):
-        pass
-
-    def test_allgather(self, col_type="allgather"):
-        self.check_with_place("collective_allgather_op.py", col_type)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_allreduce.py b/python/paddle/fluid/tests/unittests/test_allreduce.py
deleted file mode 100644
index e0b6422a67b408840be9b96210b6003165dcb3a8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_allreduce.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-
-from test_collective_base import TestDistBase
-
-
-class TestAllReduceOp(TestDistBase):
-    def _setup_config(self):
-        pass
-
-    def test_allreduce(self, col_type="allreduce"):
-        self.check_with_place("collective_allreduce_op.py", col_type)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
deleted file mode 100644
index d31eaa0114c3b035add3e6ca792696b5cafb9690..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://w_idxw.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-from op_test import OpTest
-
-
-def anchor_generator_in_python(input_feat, anchor_sizes, aspect_ratios,
-                               variances, stride, offset):
-    num_anchors = len(aspect_ratios) * len(anchor_sizes)
-    layer_h = input_feat.shape[2]
-    layer_w = input_feat.shape[3]
-    out_dim = (layer_h, layer_w, num_anchors, 4)
-    out_anchors = np.zeros(out_dim).astype('float32')
-
-    for h_idx in range(layer_h):
-        for w_idx in range(layer_w):
-            x_ctr = (w_idx * stride[0]) + offset * (stride[0] - 1)
-            y_ctr = (h_idx * stride[1]) + offset * (stride[1] - 1)
-            idx = 0
-            for r in range(len(aspect_ratios)):
-                ar = aspect_ratios[r]
-                for s in range(len(anchor_sizes)):
-                    anchor_size = anchor_sizes[s]
-                    area = stride[0] * stride[1]
-                    area_ratios = area / ar
-                    base_w = np.round(np.sqrt(area_ratios))
-                    base_h = np.round(base_w * ar)
-                    scale_w = anchor_size / stride[0]
-                    scale_h = anchor_size / stride[1]
-                    w = scale_w * base_w
-                    h = scale_h * base_h
-                    out_anchors[h_idx, w_idx, idx, :] = [
-                        (x_ctr - 0.5 * (w - 1)), (y_ctr - 0.5 * (h - 1)),
-                        (x_ctr + 0.5 * (w - 1)), (y_ctr + 0.5 * (h - 1))
-                    ]
-                    idx += 1
-
-    # set the variance.
-    out_var = np.tile(variances, (layer_h, layer_w, num_anchors, 1))
-    out_anchors = out_anchors.astype('float32')
-    out_var = out_var.astype('float32')
-    return out_anchors, out_var
-
-
-class TestAnchorGeneratorOp(OpTest):
-    def set_data(self):
-        self.init_test_params()
-        self.init_test_input()
-        self.init_test_output()
-        self.inputs = {'Input': self.input}
-
-        self.attrs = {
-            'anchor_sizes': self.anchor_sizes,
-            'aspect_ratios': self.aspect_ratios,
-            'stride': self.stride,
-            'offset': self.offset,
-            'variances': self.variances,
-        }
-
-        self.outputs = {'Anchors': self.out_anchors, 'Variances': self.out_var}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "anchor_generator"
-        self.set_data()
-
-    def init_test_params(self):
-        self.batch_size = 1
-        self.input_channels = 2
-        self.layer_h = 2
-        self.layer_w = 2
-
-        self.anchor_sizes = [64., 128., 256., 512.]
-        self.aspect_ratios = [0.5, 1., 2.]
-        self.stride = [16., 16.]
-
-        self.offset = 0.5
-
-        self.variances = [0.1, 0.1, 0.2, 0.2]
-
-    def init_test_input(self):
-        self.input = np.random.random(
-            (self.batch_size, self.input_channels, self.layer_h,
-             self.layer_w)).astype('float32')
-
-    def init_test_output(self):
-        self.out_anchors, self.out_var = anchor_generator_in_python(
-            self.input, self.anchor_sizes, self.aspect_ratios, self.variances,
-            self.stride, self.offset)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
deleted file mode 100644
index 4f9f1ec2253ca01eb4b07a06a248f91d4676c9c4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class BaseTestCase(OpTest):
-    def initTestCase(self):
-        self.op_type = 'arg_min'
-        self.dims = (3, 4, 5)
-        self.dtype = 'float32'
-        self.axis = 0
-
-    def setUp(self):
-        self.initTestCase()
-        self.x = (1000 * np.random.random(self.dims)).astype(self.dtype)
-        self.inputs = {'X': self.x}
-        self.attrs = {'axis': self.axis}
-        if self.op_type == "arg_min":
-            self.outputs = {'Out': np.argmin(self.x, axis=self.axis)}
-        else:
-            self.outputs = {'Out': np.argmax(self.x, axis=self.axis)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestCase0(BaseTestCase):
-    def initTestCase(self):
-        self.op_type = 'arg_max'
-        self.dims = (3, 4, 5)
-        self.dtype = 'float32'
-        self.axis = 0
-
-
-class TestCase1(BaseTestCase):
-    def initTestCase(self):
-        self.op_type = 'arg_min'
-        self.dims = (3, 4)
-        self.dtype = 'float64'
-        self.axis = 1
-
-
-class TestCase2(BaseTestCase):
-    def initTestCase(self):
-        self.op_type = 'arg_max'
-        self.dims = (3, 4)
-        self.dtype = 'int64'
-        self.axis = 0
-
-
-class TestCase2_1(BaseTestCase):
-    def initTestCase(self):
-        self.op_type = 'arg_max'
-        self.dims = (3, 4)
-        self.dtype = 'int64'
-        self.axis = -1
-
-
-class TestCase3(BaseTestCase):
-    def initTestCase(self):
-        self.op_type = 'arg_max'
-        self.dims = (3, )
-        self.dtype = 'int64'
-        self.axis = 0
-
-
-class TestCase4(BaseTestCase):
-    def initTestCase(self):
-        self.op_type = 'arg_min'
-        self.dims = (1, )
-        self.dtype = 'int32'
-        self.axis = 0
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py
deleted file mode 100644
index 7bc6f2599d617b192908da9b57d0cd715019bd71..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_argsort_op.py
+++ /dev/null
@@ -1,58 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestArgsortOp(OpTest):
-    def setUp(self):
-        self.init_axis()
-        x = np.random.random((2, 3, 4, 5, 10)).astype("float32")
-        if self.axis < 0:
-            self.axis = self.axis + len(x.shape)
-        self.indices = np.argsort(x, kind='quicksort', axis=self.axis)
-        self.out = np.sort(x, kind='quicksort', axis=self.axis)
-        self.op_type = "argsort"
-        self.inputs = {'X': x}
-        self.attrs = {'axis': self.axis}
-        self.outputs = {'Indices': self.indices, 'Out': self.out}
-
-    def init_axis(self):
-        self.axis = -1
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestArgsortOpAxis0(TestArgsortOp):
-    def init_axis(self):
-        self.axis = 0
-
-
-class TestArgsortOpAxis1(TestArgsortOp):
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestArgsortOpAxisNeg2(TestArgsortOp):
-    def init_axis(self):
-        self.axis = -2
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
deleted file mode 100644
index b86d0bc43a9f84988f2b1b27f7aeffce46a46bd9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import paddle.fluid.layers as layers
-from paddle.fluid.executor import Executor
-from paddle.fluid.backward import append_backward
-from paddle.fluid.framework import default_main_program
-import numpy
-
-
-class TestArrayReadWrite(unittest.TestCase):
-    def test_read_write(self):
-        x = [
-            layers.data(
-                name='x0', shape=[100]), layers.data(
-                    name='x1', shape=[100]), layers.data(
-                        name='x2', shape=[100])
-        ]
-
-        for each_x in x:
-            each_x.stop_gradient = False
-
-        i = layers.zeros(shape=[1], dtype='int64')
-        i.stop_gradient = False
-        arr = layers.array_write(x=x[0], i=i)
-        i = layers.increment(x=i)
-        arr = layers.array_write(x=x[1], i=i, array=arr)
-        i = layers.increment(x=i)
-        arr = layers.array_write(x=x[2], i=i, array=arr)
-
-        i = layers.zeros(shape=[1], dtype='int64')
-        i.stop_gradient = False
-        a0 = layers.array_read(array=arr, i=i)
-        i = layers.increment(x=i)
-        a1 = layers.array_read(array=arr, i=i)
-        i = layers.increment(x=i)
-        a2 = layers.array_read(array=arr, i=i)
-
-        mean_a0 = layers.mean(a0)
-        mean_a1 = layers.mean(a1)
-        mean_a2 = layers.mean(a2)
-
-        a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])
-
-        mean_x0 = layers.mean(x[0])
-        mean_x1 = layers.mean(x[1])
-        mean_x2 = layers.mean(x[2])
-
-        x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])
-
-        scope = core.Scope()
-        cpu = core.CPUPlace()
-
-        exe = Executor(cpu)
-
-        tensor = numpy.random.random(size=(100, 100)).astype('float32')
-
-        outs = exe.run(feed={'x0': tensor,
-                             'x1': tensor,
-                             'x2': tensor},
-                       fetch_list=[a_sum, x_sum],
-                       scope=scope)
-        self.assertEqual(outs[0], outs[1])
-
-        total_sum = layers.sums(input=[a_sum, x_sum])
-        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
-
-        append_backward(total_sum_scaled)
-
-        g_vars = list(
-            map(default_main_program().global_block().var,
-                [each_x.name + "@GRAD" for each_x in x]))
-        g_out = [
-            item.sum()
-            for item in exe.run(
-                feed={'x0': tensor,
-                      'x1': tensor,
-                      'x2': tensor},
-                fetch_list=g_vars)
-        ]
-        g_out_sum = numpy.array(g_out).sum()
-
-        # since our final gradient is 1 and the neural network are all linear
-        # with mean_op.
-        # the input gradient should also be 1
-        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py
deleted file mode 100644
index ba2eecfaf197ea63c187e77ae7ae8cf34873d66b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_assign_op.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import op_test
-import numpy
-import unittest
-
-
-class TestAssignOp(op_test.OpTest):
-    def setUp(self):
-        self.op_type = "assign"
-        x = numpy.random.random(size=(100, 10))
-        self.inputs = {'X': x}
-        self.outputs = {'Out': x}
-
-    def test_forward(self):
-        self.check_output()
-
-    def test_backward(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
deleted file mode 100644
index 5a9d8efef1f3e5a9e116720c2ffe32c2ef0a082f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import op_test
-import numpy
-import unittest
-import paddle.fluid.framework as framework
-
-
-class TestAssignValueOp(op_test.OpTest):
-    def setUp(self):
-        self.op_type = "assign_value"
-        x = numpy.random.random(size=(2, 5)).astype(numpy.float32)
-        self.inputs = {}
-        self.outputs = {'Out': x}
-        self.attrs = {
-            'shape': x.shape,
-            'dtype': framework.convert_np_dtype_to_dtype_(x.dtype),
-            'fp32_values': [float(v) for v in x.flat]
-        }
-
-    def test_forward(self):
-        self.check_output()
-
-    def test_assign(self):
-        val = (
-            -100 + 200 * numpy.random.random(size=(2, 5))).astype(numpy.int32)
-        x = layers.create_tensor(dtype="float32")
-        layers.assign(input=val, output=x)
-        exe = fluid.Executor(fluid.CPUPlace())
-        fetched_x = exe.run(fluid.default_main_program(),
-                            feed={},
-                            fetch_list=[x])[0]
-        self.assertTrue(
-            numpy.array_equal(fetched_x, val),
-            "fetch_x=%s val=%s" % (fetched_x, val))
-        self.assertEqual(fetched_x.dtype, val.dtype)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
deleted file mode 100644
index abc463a0fb0f8b521f0d833a1f9cd507718d3c9d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
+++ /dev/null
@@ -1,192 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import unittest
-
-import numpy
-import time
-import paddle
-import paddle.fluid as fluid
-
-BATCH_SIZE = 64
-
-
-def convolutional_neural_network(use_py_reader):
-    with fluid.unique_name.guard():
-        img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-        py_reader = None
-        if use_py_reader:
-            py_reader = fluid.layers.create_py_reader_by_data(
-                capacity=64,
-                feed_list=[img, label],
-                name='py_reader',
-                use_double_buffer=False)
-            img, label = fluid.layers.read_file(py_reader)
-
-        conv_pool_1 = fluid.nets.simple_img_conv_pool(
-            input=img,
-            filter_size=5,
-            num_filters=20,
-            pool_size=2,
-            pool_stride=2,
-            act="relu")
-        conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-        conv_pool_2 = fluid.nets.simple_img_conv_pool(
-            input=conv_pool_1,
-            filter_size=5,
-            num_filters=50,
-            pool_size=2,
-            pool_stride=2,
-            act="relu")
-
-        prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
-        loss = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_loss = fluid.layers.mean(loss)
-        acc = fluid.layers.accuracy(input=prediction, label=label)
-
-        return img, label, prediction, avg_loss, acc, py_reader
-
-
-def test():
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
-
-    img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
-        use_py_reader=False)
-    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
-
-    def train_test(train_test_program, train_test_feed, train_test_reader):
-        acc_set = []
-        avg_loss_set = []
-        for test_data in train_test_reader():
-            acc_np, avg_loss_np = exe.run(program=train_test_program,
-                                          feed=train_test_feed.feed(test_data),
-                                          fetch_list=[acc, avg_loss])
-            acc_set.append(float(acc_np))
-            avg_loss_set.append(float(avg_loss_np))
-        # get test acc and loss
-        acc_val_mean = numpy.array(acc_set).mean()
-        avg_loss_val_mean = numpy.array(avg_loss_set).mean()
-        return avg_loss_val_mean, acc_val_mean
-
-    # test for epoch
-    avg_loss_val, acc_val = train_test(
-        train_test_program=fluid.default_main_program(),
-        train_test_reader=test_reader,
-        train_test_feed=feeder)
-
-    print("Test: avg_cost: %s, acc: %s" % (avg_loss_val, acc_val))
-    assert acc_val > 0.96
-
-
-def train(use_cuda, thread_num, cpu_num):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        print("paddle is not compiled with cuda, exit!")
-        return
-
-    img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
-        use_py_reader=True)
-    print("build convolutional neural network done.")
-
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-    optimizer.minimize(avg_loss)
-    print("Adam optimizer minimize done.")
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.mnist.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
-    print("declared train reader done.")
-
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    print("going to run startup program")
-    exe.run(fluid.default_startup_program())
-    print("run startup program done.")
-
-    os.environ['CPU_NUM'] = str(cpu_num)
-
-    print("cpu_num:" + str(cpu_num))
-    print("thread_num:" + str(thread_num))
-
-    build_strategy = fluid.BuildStrategy()
-    build_strategy.async_mode = True
-
-    exec_strategy = fluid.ExecutionStrategy()
-    exec_strategy.num_threads = thread_num
-    exec_strategy.num_iteration_per_run = 10
-
-    main_program = fluid.default_main_program()
-    pe = fluid.ParallelExecutor(
-        use_cuda=False,
-        loss_name=avg_loss.name,
-        main_program=main_program,
-        build_strategy=build_strategy,
-        exec_strategy=exec_strategy)
-    print("declare parallel executor done.")
-
-    py_reader.decorate_paddle_reader(train_reader)
-
-    for pass_id in range(2):
-        step = 0
-        py_reader.start()
-        try:
-            while True:
-                loss_val = pe.run(fetch_list=[avg_loss.name])
-                loss_val = numpy.mean(loss_val)
-                if step % 10 == 0:
-                    print("Pass %d, Batch %d, Cost %f, queue size %d" %
-                          (pass_id, step, loss_val, py_reader.queue.size()))
-                step += 1
-        except fluid.core.EOFException:
-            print("train end pass = " + str(pass_id))
-            py_reader.reset()
-
-    return step
-
-
-class TestAsyncSSAGraphExecutor(unittest.TestCase):
-    def test_check_async_ssa_exe_train(self):
-        step_list = []
-        for cpu_num in [1, 2, 4]:
-            print("run cpu_num -> " + str(cpu_num))
-            with fluid.scope_guard(fluid.core.Scope()):
-                with fluid.program_guard(
-                        main_program=fluid.Program(),
-                        startup_program=fluid.Program()):
-                    start_time = time.time()
-                    step = train(
-                        use_cuda=False, thread_num=cpu_num, cpu_num=cpu_num)
-                    end_time = time.time()
-                    step_list.append(step)
-                print("cpu_num -> " + str(cpu_num) + " step -> " + str(step) +
-                      " time -> " + str(end_time - start_time))
-                with fluid.program_guard(
-                        main_program=fluid.Program(),
-                        startup_program=fluid.Program()):
-                    test()
-        assert abs(int(step_list[0] / 2) - int(step_list[1])) < 5
-        assert abs(int(step_list[1] / 2) - int(step_list[2])) < 5
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
deleted file mode 100644
index 1b9c3efe0fa9e9f1b8ad09029079898622e7d489..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
+++ /dev/null
@@ -1,208 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_fusion_lstm_op import fc, ACTIVATION
-from test_softmax_op import stable_softmax
-
-
-def attention_lstm(
-        x,  # T x M
-        lod,  # 1 x N
-        h0,  # N x D
-        c0,  # N x D
-        fcws,  # (M+D) x 1, 1x1
-        fcbs,  # 1 x 1, 1x1
-        w,  # (M+D) x 4D
-        b,  # 1 x 4D
-        act_gate,
-        act_cell,
-        act_cand):
-
-    T = sum(lod[0])
-    N = len(lod[0])
-    M = x.shape[1]
-    D = b.shape[1] // 4
-    assert T == x.shape[0]
-    assert len(fcws) == len(fcbs)
-    hidden = []
-    cell = []
-
-    start_offset = 0
-    for bid in range(N):
-        seq_len = lod[0][bid]
-        xi = np.copy(x[start_offset:start_offset + seq_len, :]).reshape(seq_len,
-                                                                        M)
-        prev_cell = np.copy(c0[bid]).reshape([1, D])
-        prev_hidden = np.copy(h0[bid]).reshape([1, D])
-        for step in range(seq_len):
-            expanded_cell = np.repeat(prev_cell, seq_len, axis=0)
-            tmp = np.concatenate((xi, expanded_cell), axis=1)
-            assert tmp.shape[0] == seq_len
-            assert tmp.shape[1] == M + D
-            for fcid in range(len(fcbs)):
-                tmp = fc(tmp, fcws[fcid], fcbs[fcid])
-                tmp = ACTIVATION['relu'](tmp)
-            tmp = np.reshape(tmp, (1, seq_len))
-            tmp = stable_softmax(tmp).reshape(seq_len, 1)
-            lstmx = xi * tmp  # seq * M
-            lstmx = np.sum(lstmx.reshape(seq_len, M), axis=0).reshape([1, M])
-            lstmin = np.concatenate((prev_hidden, lstmx), axis=1)
-            lstmout = fc(lstmin, w, b).reshape([1, 4 * D])
-
-            g_f, g_i, g_o, cand = np.split(lstmout, 4, axis=1)
-            g_f = act_gate(g_f).reshape([1, D])
-            g_i = act_gate(g_i).reshape([1, D])
-            g_o = act_gate(g_o).reshape([1, D])
-            cand = act_cand(cand).reshape([1, D])
-
-            cell_t = (prev_cell * g_f) + (g_i * cand)
-            hidden_t = g_o * act_cell(cell_t)
-
-            hidden.append(hidden_t.flatten())
-            cell.append(cell_t.flatten())
-
-            prev_cell = cell_t.reshape([1, D])
-            prev_hidden = hidden_t.reshape([1, D])
-
-        start_offset += seq_len
-
-    hidden = np.array(hidden).astype('float32').reshape([T, D])
-    cell = np.array(cell).astype('float32').reshape([T, D])
-    return hidden, cell
-
-
-class TestAttentionLSTMOp(OpTest):
-    def set_conf(self):
-        pass
-
-    def setUp(self):
-        self.op_type = 'attention_lstm'
-        self.lod = [[3]]
-        self.M = 30
-        self.D = 15
-        self.has_initial_hidden = True
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-        self.set_conf()
-
-        T = sum(self.lod[0])
-        bs = len(self.lod[0])
-
-        x = np.random.normal(size=(T, self.M)).astype('float32')
-        c0 = np.random.normal(size=(bs, self.D)).astype('float32')
-        if self.has_initial_hidden:
-            h0 = np.random.normal(size=(bs, self.D)).astype('float32')
-        else:
-            h0 = np.zeros((bs, self.D)).astype('float32')
-
-        fcw1 = np.random.normal(size=(self.M + self.D, 1)).astype('float32')
-        fcb1 = np.random.normal(size=(1, 1)).astype('float32')
-        fcw2 = np.random.normal(size=(1, 1)).astype('float32')
-        fcb2 = np.random.normal(size=(1, 1)).astype('float32')
-
-        # lstm weight and bias
-        w = np.random.normal(size=(self.M + self.D,
-                                   self.D * 4)).astype('float32')
-        b = np.random.normal(size=(1, self.D * 4)).astype('float32')
-
-        h, c = attention_lstm(x, self.lod, h0, c0, [fcw1, fcw2], [fcb1, fcb2],
-                              w, b, ACTIVATION[self.act_gate],
-                              ACTIVATION[self.act_cell],
-                              ACTIVATION[self.act_cand])
-
-        self.inputs = {
-            'X': (x, self.lod),
-            'C0': c0,
-            'AttentionWeight': fcw1,
-            'AttentionBias': fcb1,
-            'AttentionScalar': fcw2,
-            'AttentionScalarBias': fcb2,
-            'LSTMWeight': w,
-            'LSTMBias': b
-        }
-
-        if self.has_initial_hidden:
-            self.inputs['H0'] = h0
-
-        self.outputs = {
-            'Hidden': (h, self.lod),
-            'Cell': (c, self.lod),
-        }
-        self.attrs = {
-            'gate_activation': self.act_gate,
-            'cell_activation': self.act_cell,
-            'candidate_activation': self.act_cand
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAttentionOpNonInit(TestAttentionLSTMOp):
-    def set_conf(self):
-        self.has_initial_hidden = False
-
-
-class TestAttentionOpAct(TestAttentionLSTMOp):
-    def set_conf(self):
-        self.M = 3
-        self.D = 2
-        self.act_gate = 'relu'
-        self.act_cell = 'tanh'
-        self.act_cand = 'sigmoid'
-
-
-class TestAttentionOpMD1(TestAttentionLSTMOp):
-    def set_conf(self):
-        self.M = 36
-        self.D = 8
-
-
-class TestAttentionOpMD2(TestAttentionLSTMOp):
-    def set_conf(self):
-        self.M = 8
-        self.D = 8
-
-
-class TestAttentionOpMD3(TestAttentionLSTMOp):
-    def set_conf(self):
-        self.M = 15
-        self.D = 30
-
-
-class TestAttentionOpBS1(TestAttentionLSTMOp):
-    def set_conf(self):
-        self.lod = [[5]]
-        self.M = 16
-        self.D = 32
-
-
-class TestAttentionOpBS2(TestAttentionLSTMOp):
-    def set_conf(self):
-        self.lod = [[3, 6]]
-
-
-class TestAttentionOpBS5(TestAttentionLSTMOp):
-    def set_conf(self):
-        self.lod = [[3, 2, 4, 7, 5]]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py
deleted file mode 100644
index b75abd424a46976e3426c16b91fb977b3be2e94f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from paddle.fluid import metrics
-
-
-class TestAucOp(OpTest):
-    def setUp(self):
-        self.op_type = "auc"
-        pred = np.random.random((128, 2)).astype("float32")
-        labels = np.random.randint(0, 2, (128, 1)).astype("int64")
-        num_thresholds = 200
-
-        stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
-        stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")
-
-        self.inputs = {
-            'Predict': pred,
-            'Label': labels,
-            "StatPos": stat_pos,
-            "StatNeg": stat_neg
-        }
-        self.attrs = {
-            'curve': 'ROC',
-            'num_thresholds': num_thresholds,
-            "slide_steps": 1
-        }
-
-        python_auc = metrics.Auc(name="auc",
-                                 curve='ROC',
-                                 num_thresholds=num_thresholds)
-        python_auc.update(pred, labels)
-
-        self.outputs = {
-            'AUC': np.array(python_auc.eval()),
-            'StatPosOut': np.array(python_auc._stat_pos),
-            'StatNegOut': np.array(python_auc._stat_neg)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py b/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py
deleted file mode 100644
index 6d3e93fa57b081fa1ce0ec6309ee166335b05ec9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from paddle.fluid import metrics
-
-
-class TestAucSinglePredOp(OpTest):
-    def setUp(self):
-        self.op_type = "auc"
-        pred = np.random.random((128, 2)).astype("float32")
-        pred0 = pred[:, 0].reshape(128, 1)
-        labels = np.random.randint(0, 2, (128, 1)).astype("int64")
-        num_thresholds = 200
-
-        stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
-        stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")
-
-        self.inputs = {
-            'Predict': pred0,
-            'Label': labels,
-            "StatPos": stat_pos,
-            "StatNeg": stat_neg
-        }
-        self.attrs = {
-            'curve': 'ROC',
-            'num_thresholds': num_thresholds,
-            "slide_steps": 1
-        }
-
-        python_auc = metrics.Auc(name="auc",
-                                 curve='ROC',
-                                 num_thresholds=num_thresholds)
-        for i in range(128):
-            pred[i][1] = pred[i][0]
-        python_auc.update(pred, labels)
-
-        self.outputs = {
-            'AUC': np.array(python_auc.eval()),
-            'StatPosOut': np.array(python_auc._stat_pos),
-            'StatNegOut': np.array(python_auc._stat_neg)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py b/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py
deleted file mode 100644
index 8572572f14636bc3bc019264ab6b0f0476323806..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-
-
-class TestAvoidTwiceInitialization(unittest.TestCase):
-    def test_avoid_twice_initialization(self):
-        cur_program = fluid.Program()
-        cur_block = cur_program.current_block()
-        var = cur_block.create_parameter(
-            initializer=fluid.initializer.Constant(value=0.01),
-            shape=[2, 2],
-            dtype='float32',
-            name='var_a')
-        cur_block.append_op(
-            type="c_broadcast",
-            inputs={"X": [var]},
-            outputs={"Out": [var]},
-            attrs={'root': 0,
-                   'ring_id': 0,
-                   'use_calc_stream': False})
-        cur_block.append_op(
-            type="c_sync_comm_stream",
-            inputs={'X': [var]},
-            outputs={'Out': [var]},
-            attrs={'ring_id': 0})
-        var2 = cur_block.create_parameter(
-            initializer=fluid.initializer.Constant(value=0.01),
-            shape=[2, 2],
-            dtype='float32',
-            name='var_a')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_backward.py b/python/paddle/fluid/tests/unittests/test_backward.py
deleted file mode 100644
index dc98e04775f3762b931a4ec54ca21468fb3081fb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_backward.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-from simple_nets import init_data
-
-
-def case1_fill_grad_vars():
-    x = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    feature = fluid.layers.fc(input=x, size=20, act=None)
-    part1, part2 = fluid.layers.split(feature, num_or_sections=[10, 10], dim=1)
-    # Note that: part2 is not used.
-    loss = fluid.layers.cross_entropy(input=part1, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def case2_prune_no_grad_branch():
-    x = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    feature = fluid.layers.fc(input=x, size=10, act=None)
-    label = fluid.layers.cast(label, dtype="float32")
-    label = fluid.layers.cast(label, dtype='int64')
-    # Note that the label is not persistable in fluid.layers.cross_entropy.
-    loss = fluid.layers.cross_entropy(input=feature, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def case3_prune_no_grad_branch2():
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    label = fluid.layers.cast(label, dtype="float32")
-    label = fluid.layers.cast(label, dtype='int64')
-    out = fluid.layers.one_hot(input=label, depth=100)
-    loss = fluid.layers.mean(out)
-    return loss
-
-
-def case4_with_no_grad_op_maker():
-    out = fluid.layers.gaussian_random(shape=[20, 30])
-    loss = fluid.layers.mean(out)
-    return loss
-
-
-class TestBackward(unittest.TestCase):
-    def check_backward(self, model, feed_dict):
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        main = fluid.Program()
-        startup = fluid.Program()
-
-        with fluid.program_guard(main, startup):
-            loss = model()
-
-            optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-            optimizer.minimize(loss)
-
-            exe.run(fluid.default_startup_program())
-            exe.run(feed=feed_dict)
-
-    def test_backward(self):
-        batch_size = 2
-        img, label = init_data(batch_size, img_shape=[784], label_range=9)
-        feed_dict = {'image': img, 'label': label}
-        self.check_backward(case1_fill_grad_vars, feed_dict)
-        self.check_backward(case2_prune_no_grad_branch, feed_dict)
-        self.check_backward(case3_prune_no_grad_branch2, {'label': label})
-        self.check_backward(case4_with_no_grad_op_maker, {})
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
deleted file mode 100644
index 04a36f7cafe7b4445125c4e9bd58f6d30d6c71aa..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-
-
-class L1(fluid.Layer):
-    def __init__(self, prefix):
-        super(L1, self).__init__(prefix)
-        self._param_attr = fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.1))
-        self.w1 = self.create_parameter(
-            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)
-        self.w2 = self.create_parameter(
-            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)
-
-    def forward(self):
-        return self.w1 + self.w2
-
-
-class L2(fluid.Layer):
-    def __init__(self, prefix):
-        super(L2, self).__init__(prefix)
-        self.layer1 = L1(self.full_name())
-        self.layer2 = L1(self.full_name())
-
-    def forward(self):
-        return self.layer1() + self.layer2()
-
-
-class L3(fluid.Layer):
-    def __init__(self, prefix):
-        super(L3, self).__init__(prefix)
-        self.layer1 = L2(self.full_name())
-        self.layer2 = L2(self.full_name())
-
-    def forward(self):
-        return self.layer1() + self.layer2()
-
-
-class TestBaseLayer(unittest.TestCase):
-    def test_one_level(self):
-        with fluid.dygraph.guard():
-            l = L1('test_one_level')
-            ret = l()
-            self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0")
-            self.assertEqual(l.w2.name, "test_one_level/L1_0.w_1")
-            self.assertTrue(np.allclose(ret.numpy(), 0.2 * np.ones([2, 2])))
-
-    def test_three_level(self):
-        with fluid.dygraph.guard():
-            l = L3('test_three_level')
-            names = [p.name for p in l.parameters()]
-            ret = l()
-            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0.w_0")
-            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0.w_1")
-            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1.w_0")
-            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1.w_1")
-            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0.w_0")
-            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0.w_1")
-            self.assertTrue(np.allclose(ret.numpy(), 0.8 * np.ones([2, 2])))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_basic_gru_api.py b/python/paddle/fluid/tests/unittests/test_basic_gru_api.py
deleted file mode 100644
index 37cf56bf19ea2c446dd39765f8378a7fce13d04f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_basic_gru_api.py
+++ /dev/null
@@ -1,334 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-from paddle.fluid.contrib.layers import basic_gru
-from paddle.fluid.executor import Executor
-from paddle.fluid import framework
-
-import numpy as np
-
-SIGMOID_THRESHOLD_MIN = -40.0
-SIGMOID_THRESHOLD_MAX = 13.0
-EXP_MAX_INPUT = 40.0
-
-
-def sigmoid(x):
-    y = np.copy(x)
-    y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
-    y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
-    return 1. / (1. + np.exp(-y))
-
-
-def tanh(x):
-    y = -2. * x
-    y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
-    return (2. / (1. + np.exp(y))) - 1.
-
-
-def gru_np(input,
-           init_h,
-           hidden_size,
-           gate_weight,
-           gate_bias,
-           candidate_weight,
-           candidate_bias,
-           num_layers=1,
-           batch_first=False,
-           is_bidirect=False,
-           sequence_length=None):
-    def step(step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b):
-        concat_1 = np.concatenate([step_in, pre_hidden], 1)
-
-        gate_input = np.matmul(concat_1, gate_w)
-        gate_input += gate_b
-        gate_input = sigmoid(gate_input)
-        r, u = np.split(gate_input, indices_or_sections=2, axis=1)
-
-        r_hidden = r * pre_hidden
-
-        candidate = np.matmul(
-            np.concatenate([step_in, pre_hidden], 1), candidate_w)
-
-        candidate += candidate_b
-        c = tanh(candidate)
-
-        new_hidden = u * pre_hidden + (1 - u) * c
-
-        return new_hidden
-
-    if batch_first:
-        input = np.tranpose(input, [1, 0, 2])
-
-    batch_size = input.shape[1]
-    mask = None
-    if sequence_length is not None:
-        max_seq_len = input.shape[0]
-
-        mask = np.zeros([batch_size, max_seq_len])
-
-        for i, len in enumerate(sequence_length):
-            mask[i, :len] = 1.0
-
-        mask = np.transpose(mask, [1, 0])
-
-    direc_num = 1
-    if is_bidirect:
-        direc_num = 2
-    if init_h:
-        init_h = np.reshape(
-            init_h, shape=[num_layers, direc_num, -1, hidden_size])
-    else:
-        init_h = np.zeros([num_layers, direc_num, batch_size, hidden_size])
-
-    def get_single_direction_output(rnn_input, mask=None, direc_index=0):
-        seq_len = rnn_input.shape[0]
-
-        output = []
-        # init pre hidden
-        pre_hidden_array = []
-        for i in range(num_layers):
-            pre_hidden_array.append(init_h[i, direc_index])
-
-        for i in range(seq_len):
-            step_input = rnn_input[i]
-
-            if mask is not None:
-                step_mask = mask[i]
-                step_mask = np.reshape(step_mask, [-1, 1])
-
-            for i in range(num_layers):
-                new_hidden = step(
-                    step_input, pre_hidden_array[i],
-                    gate_weight[direc_index * num_layers + i],
-                    gate_bias[direc_index * num_layers + i],
-                    candidate_weight[direc_index * num_layers + i],
-                    candidate_bias[direc_index * num_layers + i])
-
-                if mask is not None:
-                    new_hidden = new_hidden * step_mask + (
-                        1 - step_mask) * pre_hidden_array[i]
-
-                pre_hidden_array[i] = new_hidden
-
-                step_input = new_hidden
-            output.append(step_input)
-        rnn_out = np.concatenate(output, 0)
-        rnn_out = np.reshape(rnn_out, [seq_len, -1, hidden_size])
-
-        last_hidden_out = np.concatenate(pre_hidden_array, 0)
-        last_hidden_out = np.reshape(last_hidden_out,
-                                     [num_layers, -1, hidden_size])
-
-        return rnn_out, last_hidden_out
-
-    fw_rnn_out, fw_last_hidden = get_single_direction_output(
-        input, mask, direc_index=0)
-
-    if is_bidirect:
-        bw_input = input[::-1]
-        bw_mask = None
-        if mask is not None:
-            bw_mask = mask[::-1]
-
-        bw_rnn_out, bw_last_hidden = get_single_direction_output(
-            bw_input, bw_mask, direc_index=1)
-
-        bw_rnn_out = bw_rnn_out[::-1]
-
-        rnn_out = np.concatenate([fw_rnn_out, bw_rnn_out], 2)
-        last_hidden = np.concatenate([fw_last_hidden, bw_last_hidden], 1)
-        last_hidden = np.reshape(last_hidden,
-                                 [num_layers * direc_num, -1, hidden_size])
-
-        if batch_first:
-            rnn_out = np.transpose(rnn_out, [1, 0, 2])
-
-        return rnn_out, last_hidden
-    else:
-        rnn_out = fw_rnn_out
-        last_hidden = fw_last_hidden
-
-        if batch_first:
-            rnn_out = np.transpose(rnn_out, [1, 0, 2])
-
-        return rnn_out, last_hidden
-
-
-class TestBasicGRUApi(unittest.TestCase):
-    def setUp(self):
-        self.hidden_size = 10
-        self.batch_size = 5
-        self.seq_len = 6
-        self.num_layers = 2
-        self.is_bidirect = True
-        self.batch_first = False
-
-    def test_run(self):
-        x = layers.data(
-            name='x',
-            shape=[-1, self.batch_size, self.hidden_size],
-            dtype='float32')
-        sequence_length = layers.data(
-            name="sequence_length", shape=[-1], dtype='float32')
-
-        rnn_out, last_hidden = basic_gru( x, None, self.hidden_size, num_layers=self.num_layers, \
-                batch_first = self.batch_first, bidirectional=self.is_bidirect, sequence_length=sequence_length )
-
-        last_hidden.persisbale = True
-        rnn_out.persisbale = True
-
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        exe = Executor(place)
-        exe.run(framework.default_startup_program())
-
-        param_list = fluid.default_main_program().block(0).all_parameters()
-
-        # process weight and bias
-        gate_weight = []
-        gate_bias = []
-        candidate_weight = []
-        candidate_bias = []
-
-        for i in range(self.num_layers):
-            gate_w_name = "basic_gru_layers_" + str(i) + "/BasicGRUUnit_0.w_0"
-            gate_b_name = "basic_gru_layers_" + str(i) + "/BasicGRUUnit_0.b_0"
-            candidate_w_name = "basic_gru_layers_" + str(
-                i) + "/BasicGRUUnit_0.w_1"
-            candidate_b_name = "basic_gru_layers_" + str(
-                i) + "/BasicGRUUnit_0.b_1"
-
-            gate_w = np.array(fluid.global_scope().find_var(gate_w_name)
-                              .get_tensor())
-            gate_w = np.random.uniform(
-                -0.1, 0.1, size=gate_w.shape).astype('float32')
-            fluid.global_scope().find_var(gate_w_name).get_tensor().set(gate_w,
-                                                                        place)
-
-            gate_b = np.array(fluid.global_scope().find_var(gate_b_name)
-                              .get_tensor())
-            gate_b = np.random.uniform(
-                -0.1, 0.1, size=gate_b.shape).astype('float32')
-            fluid.global_scope().find_var(gate_b_name).get_tensor().set(gate_b,
-                                                                        place)
-
-            candidate_w = np.array(fluid.global_scope().find_var(
-                candidate_w_name).get_tensor())
-            candidate_w = np.random.uniform(
-                -0.1, 0.1, size=candidate_w.shape).astype('float32')
-            fluid.global_scope().find_var(candidate_w_name).get_tensor().set(
-                candidate_w, place)
-
-            candidate_b = np.array(fluid.global_scope().find_var(
-                candidate_b_name).get_tensor())
-            candidate_b = np.random.uniform(
-                -0.1, 0.1, size=candidate_b.shape).astype('float32')
-            fluid.global_scope().find_var(candidate_b_name).get_tensor().set(
-                candidate_b, place)
-
-            gate_weight.append(gate_w)
-            gate_bias.append(gate_b)
-            candidate_weight.append(candidate_w)
-            candidate_bias.append(candidate_b)
-
-        if self.is_bidirect:
-            for i in range(self.num_layers):
-                gate_w_name = "basic_gru_reverse_layers_" + str(
-                    i) + "/BasicGRUUnit_0.w_0"
-                gate_b_name = "basic_gru_reverse_layers_" + str(
-                    i) + "/BasicGRUUnit_0.b_0"
-                candidate_w_name = "basic_gru_reverse_layers_" + str(
-                    i) + "/BasicGRUUnit_0.w_1"
-                candidate_b_name = "basic_gru_reverse_layers_" + str(
-                    i) + "/BasicGRUUnit_0.b_1"
-
-                gate_w = np.array(fluid.global_scope().find_var(gate_w_name)
-                                  .get_tensor())
-                gate_w = np.random.uniform(
-                    -0.1, 0.1, size=gate_w.shape).astype('float32')
-                fluid.global_scope().find_var(gate_w_name).get_tensor().set(
-                    gate_w, place)
-
-                gate_b = np.array(fluid.global_scope().find_var(gate_b_name)
-                                  .get_tensor())
-                gate_b = np.random.uniform(
-                    -0.1, 0.1, size=gate_b.shape).astype('float32')
-                fluid.global_scope().find_var(gate_b_name).get_tensor().set(
-                    gate_b, place)
-
-                candidate_w = np.array(fluid.global_scope().find_var(
-                    candidate_w_name).get_tensor())
-                candidate_w = np.random.uniform(
-                    -0.1, 0.1, size=candidate_w.shape).astype('float32')
-                fluid.global_scope().find_var(candidate_w_name).get_tensor(
-                ).set(candidate_w, place)
-
-                candidate_b = np.array(fluid.global_scope().find_var(
-                    candidate_b_name).get_tensor())
-                candidate_b = np.random.uniform(
-                    -0.1, 0.1, size=candidate_b.shape).astype('float32')
-                fluid.global_scope().find_var(candidate_b_name).get_tensor(
-                ).set(candidate_b, place)
-
-                gate_weight.append(gate_w)
-                gate_bias.append(gate_b)
-                candidate_weight.append(candidate_w)
-                candidate_bias.append(candidate_b)
-
-        step_input_np = np.random.uniform(-0.1, 0.1, (
-            self.seq_len, self.batch_size, self.hidden_size)).astype('float32')
-        sequence_length_np = np.random.randint(
-            self.seq_len // 2, self.seq_len,
-            size=(self.batch_size)).astype('int64')
-
-        out = exe.run(
-            feed={'x': step_input_np,
-                  'sequence_length': sequence_length_np},
-            fetch_list=[rnn_out, last_hidden])
-
-        api_rnn_out = out[0]
-        api_last_hidden = out[1]
-
-        np_out = gru_np(
-            step_input_np,
-            None,
-            self.hidden_size,
-            gate_weight,
-            gate_bias,
-            candidate_weight,
-            candidate_bias,
-            num_layers=self.num_layers,
-            batch_first=self.batch_first,
-            is_bidirect=self.is_bidirect,
-            sequence_length=sequence_length_np)
-
-        self.assertTrue(np.allclose(api_rnn_out, np_out[0], rtol=1e-4, atol=0))
-
-        self.assertTrue(
-            np.allclose(
-                api_last_hidden, np_out[1], rtol=1e-4, atol=0))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_basic_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_basic_gru_unit_op.py
deleted file mode 100644
index 6c137f3cce83ca1454a9e82c2e302dd8de3169da..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_basic_gru_unit_op.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-from paddle.fluid.contrib.layers import BasicGRUUnit
-from paddle.fluid.executor import Executor
-from paddle.fluid import framework
-
-import numpy as np
-
-SIGMOID_THRESHOLD_MIN = -40.0
-SIGMOID_THRESHOLD_MAX = 13.0
-EXP_MAX_INPUT = 40.0
-
-
-def sigmoid(x):
-    y = np.copy(x)
-    y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
-    y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
-    return 1. / (1. + np.exp(-y))
-
-
-def tanh(x):
-    y = -2. * x
-    y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
-    return (2. / (1. + np.exp(y))) - 1.
-
-
-def step(step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b):
-    concat_1 = np.concatenate([step_in, pre_hidden], 1)
-
-    gate_input = np.matmul(concat_1, gate_w)
-    gate_input += gate_b
-    gate_input = sigmoid(gate_input)
-    r, u = np.split(gate_input, indices_or_sections=2, axis=1)
-
-    r_hidden = r * pre_hidden
-
-    candidate = np.matmul(np.concatenate([step_in, pre_hidden], 1), candidate_w)
-
-    candidate += candidate_b
-    c = tanh(candidate)
-
-    new_hidden = u * pre_hidden + (1 - u) * c
-
-    return new_hidden
-
-
-class TestBasicGRUUnit(unittest.TestCase):
-    def setUp(self):
-        self.hidden_size = 5
-        self.batch_size = 5
-
-    def test_run(self):
-        x = layers.data(name='x', shape=[-1, self.hidden_size], dtype='float32')
-        pre_hidden = layers.data(
-            name="pre_hidden", shape=[-1, self.hidden_size], dtype='float32')
-        gru_unit = BasicGRUUnit("gru_unit", self.hidden_size)
-
-        new_hidden = gru_unit(x, pre_hidden)
-
-        new_hidden.persisbale = True
-
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        exe = Executor(place)
-        exe.run(framework.default_startup_program())
-
-        param_list = fluid.default_main_program().block(0).all_parameters()
-
-        # process weight and bias
-
-        gate_w_name = "gru_unit/BasicGRUUnit_0.w_0"
-        gate_b_name = "gru_unit/BasicGRUUnit_0.b_0"
-        candidate_w_name = "gru_unit/BasicGRUUnit_0.w_1"
-        candidate_b_name = "gru_unit/BasicGRUUnit_0.b_1"
-
-        gate_w = np.array(fluid.global_scope().find_var(gate_w_name).get_tensor(
-        ))
-        gate_w = np.random.uniform(
-            -0.1, 0.1, size=gate_w.shape).astype('float32')
-        fluid.global_scope().find_var(gate_w_name).get_tensor().set(gate_w,
-                                                                    place)
-
-        gate_b = np.array(fluid.global_scope().find_var(gate_b_name).get_tensor(
-        ))
-        gate_b = np.random.uniform(
-            -0.1, 0.1, size=gate_b.shape).astype('float32')
-        fluid.global_scope().find_var(gate_b_name).get_tensor().set(gate_b,
-                                                                    place)
-
-        candidate_w = np.array(fluid.global_scope().find_var(candidate_w_name)
-                               .get_tensor())
-        candidate_w = np.random.uniform(
-            -0.1, 0.1, size=candidate_w.shape).astype('float32')
-        fluid.global_scope().find_var(candidate_w_name).get_tensor().set(
-            candidate_w, place)
-
-        candidate_b = np.array(fluid.global_scope().find_var(candidate_b_name)
-                               .get_tensor())
-        candidate_b = np.random.uniform(
-            -0.1, 0.1, size=candidate_b.shape).astype('float32')
-        fluid.global_scope().find_var(candidate_b_name).get_tensor().set(
-            candidate_b, place)
-
-        step_input_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
-        pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
-
-        out = exe.run(feed={'x': step_input_np,
-                            'pre_hidden': pre_hidden_np},
-                      fetch_list=[new_hidden])
-
-        api_out = out[0]
-
-        np_out = step(step_input_np, pre_hidden_np, gate_w, gate_b, candidate_w,
-                      candidate_b)
-
-        self.assertTrue(np.allclose(api_out, np_out, rtol=1e-4, atol=0))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_basic_lstm_api.py b/python/paddle/fluid/tests/unittests/test_basic_lstm_api.py
deleted file mode 100644
index a09d6e79dad6015696e6b43124a4b1ce89234959..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_basic_lstm_api.py
+++ /dev/null
@@ -1,305 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-from paddle.fluid.contrib.layers import basic_lstm
-from paddle.fluid.executor import Executor
-from paddle.fluid import framework
-
-import numpy as np
-
-SIGMOID_THRESHOLD_MIN = -40.0
-SIGMOID_THRESHOLD_MAX = 13.0
-EXP_MAX_INPUT = 40.0
-
-
-def sigmoid(x):
-    y = np.copy(x)
-    y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
-    y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
-    return 1. / (1. + np.exp(-y))
-
-
-def tanh(x):
-    y = -2. * x
-    y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
-    return (2. / (1. + np.exp(y))) - 1.
-
-
-def lstm_np(input,
-            init_h,
-            init_c,
-            hidden_size,
-            gate_weight,
-            gate_bias,
-            num_layers=1,
-            batch_first=False,
-            is_bidirect=False,
-            sequence_length=None,
-            forget_bias=1.0):
-    def step(step_in, pre_hidden, pre_cell, gate_w, gate_b):
-        concat_1 = np.concatenate([step_in, pre_hidden], 1)
-
-        gate_input = np.matmul(concat_1, gate_w)
-        gate_input += gate_b
-        i, j, f, o = np.split(gate_input, indices_or_sections=4, axis=1)
-
-        new_cell = pre_cell * sigmoid(f + forget_bias) + sigmoid(i) * tanh(j)
-        new_hidden = tanh(new_cell) * sigmoid(o)
-
-        return new_hidden, new_cell
-
-    if batch_first:
-        input = np.tranpose(input, [1, 0, 2])
-        if mask is not None:
-            mask = np.transpose(mask, [1, 0])
-
-    batch_size = input.shape[1]
-    mask = None
-    if sequence_length is not None:
-        max_seq_len = input.shape[0]
-
-        mask = np.zeros([batch_size, max_seq_len])
-
-        for i, len in enumerate(sequence_length):
-            mask[i, :len] = 1.0
-
-        mask = np.transpose(mask, [1, 0])
-
-    direc_num = 1
-    if is_bidirect:
-        direc_num = 2
-    if init_h:
-        init_h = np.reshape(init_h, [num_layers, direc_num, -1, hidden_size])
-        init_c = np.reshape(init_c, [num_layers, direc_num, -1, hidden_size])
-    else:
-        init_h = np.zeros([num_layers, direc_num, batch_size, hidden_size])
-        init_c = np.zeros([num_layers, direc_num, batch_size, hidden_size])
-
-    def get_single_direction_output(rnn_input, mask=None, direc_index=0):
-        seq_len = rnn_input.shape[0]
-
-        output = []
-        # init pre hidden
-        pre_hidden_array = []
-        pre_cell_array = []
-        for i in range(num_layers):
-            pre_hidden_array.append(init_h[i, direc_index])
-            pre_cell_array.append(init_c[i, direc_index])
-
-        for i in range(seq_len):
-            step_input = rnn_input[i]
-
-            if mask is not None:
-                step_mask = mask[i]
-                step_mask = np.reshape(step_mask, [-1, 1])
-                #print("np mask", step_mask.shape  )
-
-            for i in range(num_layers):
-                new_hidden, new_cell = step(
-                    step_input, pre_hidden_array[i], pre_cell_array[i],
-                    gate_weight[direc_index * num_layers + i],
-                    gate_bias[direc_index * num_layers + i])
-
-                if mask is not None:
-
-                    new_hidden = np.multiply(
-                        new_hidden, step_mask) - np.multiply(
-                            pre_hidden_array[i], (step_mask - 1.0))
-                    #new_hidden = new_hidden * step_mask - pre_hidden_array[i] * ( step_mask -1 )
-                    #new_cell = new_cell * step_mask - pre_cell_array[i] * (step_mask -1)
-                    new_cell = np.multiply(new_cell, step_mask) - np.multiply(
-                        pre_cell_array[i], (step_mask - 1.0))
-
-                pre_hidden_array[i] = new_hidden
-                pre_cell_array[i] = new_cell
-
-                step_input = new_hidden
-            output.append(step_input)
-        rnn_out = np.concatenate(output, 0)
-        rnn_out = np.reshape(rnn_out, [seq_len, -1, hidden_size])
-
-        last_hidden_out = np.concatenate(pre_hidden_array, 0)
-        last_hidden_out = np.reshape(last_hidden_out,
-                                     [num_layers, -1, hidden_size])
-
-        last_cell_out = np.concatenate(pre_cell_array, 0)
-        last_cell_out = np.reshape(last_cell_out, [num_layers, -1, hidden_size])
-
-        return rnn_out, last_hidden_out, last_cell_out
-
-    fw_rnn_out, fw_last_hidden, fw_last_cell = get_single_direction_output(
-        input, mask, direc_index=0)
-
-    if is_bidirect:
-        bw_input = input[::-1]
-        bw_mask = None
-        if mask is not None:
-            bw_mask = mask[::-1]
-
-        bw_rnn_out, bw_last_hidden, bw_last_cell = get_single_direction_output(
-            bw_input, bw_mask, direc_index=1)
-
-        bw_rnn_out = bw_rnn_out[::-1]
-
-        rnn_out = np.concatenate([fw_rnn_out, bw_rnn_out], 2)
-        last_hidden = np.concatenate([fw_last_hidden, bw_last_hidden], 1)
-        last_hidden = np.reshape(last_hidden,
-                                 [num_layers * direc_num, -1, hidden_size])
-
-        last_cell = np.concatenate([fw_last_cell, bw_last_cell], 1)
-        last_cell = np.reshape(last_cell,
-                               [num_layers * direc_num, -1, hidden_size])
-
-        if batch_first:
-            rnn_out = np.transpose(rnn_out, [1, 0, 2])
-
-        return rnn_out, last_hidden, last_cell
-    else:
-        rnn_out = fw_rnn_out
-        last_hidden = fw_last_hidden
-        last_cell = fw_last_cell
-
-        if batch_first:
-            rnn_out = np.transpose(rnn_out, [1, 0, 2])
-
-        return rnn_out, last_hidden, last_cell
-
-
-class TestBasicLSTMApi(unittest.TestCase):
-    def setUp(self):
-        self.hidden_size = 10
-        self.batch_size = 5
-        self.seq_len = 6
-        self.num_layers = 2
-        self.is_bidirect = True
-        self.batch_first = False
-        self.forget_bias = 1.0
-
-    def test_run(self):
-        x = layers.data(
-            name='x',
-            shape=[-1, self.batch_size, self.hidden_size],
-            dtype='float32')
-        sequence_length = layers.data(
-            name="sequence_length", shape=[-1], dtype='float32')
-
-        rnn_out, last_hidden, last_cell = basic_lstm( x, None, None, self.hidden_size, num_layers=self.num_layers, \
-                batch_first = self.batch_first, bidirectional=self.is_bidirect, sequence_length=sequence_length, forget_bias = self.forget_bias )
-
-        last_hidden.persisbale = True
-        rnn_out.persisbale = True
-
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-        exe = Executor(place)
-        exe.run(framework.default_startup_program())
-
-        param_list = fluid.default_main_program().block(0).all_parameters()
-
-        # process weight and bias
-        gate_weight = []
-        gate_bias = []
-
-        for i in range(self.num_layers):
-            gate_w_name = "basic_lstm_layers_" + str(i) + "/BasicLSTMUnit_0.w_0"
-            gate_b_name = "basic_lstm_layers_" + str(i) + "/BasicLSTMUnit_0.b_0"
-
-            gate_w = np.array(fluid.global_scope().find_var(gate_w_name)
-                              .get_tensor())
-            gate_w = np.random.uniform(
-                -0.1, 0.1, size=gate_w.shape).astype('float32')
-            fluid.global_scope().find_var(gate_w_name).get_tensor().set(gate_w,
-                                                                        place)
-
-            gate_b = np.array(fluid.global_scope().find_var(gate_b_name)
-                              .get_tensor())
-            gate_b = np.random.uniform(
-                -0.1, 0.1, size=gate_b.shape).astype('float32')
-            fluid.global_scope().find_var(gate_b_name).get_tensor().set(gate_b,
-                                                                        place)
-
-            gate_weight.append(gate_w)
-            gate_bias.append(gate_b)
-
-        if self.is_bidirect:
-            for i in range(self.num_layers):
-                gate_w_name = "basic_lstm_reverse_layers_" + str(
-                    i) + "/BasicLSTMUnit_0.w_0"
-                gate_b_name = "basic_lstm_reverse_layers_" + str(
-                    i) + "/BasicLSTMUnit_0.b_0"
-
-                gate_w = np.array(fluid.global_scope().find_var(gate_w_name)
-                                  .get_tensor())
-                gate_w = np.random.uniform(
-                    -0.1, 0.1, size=gate_w.shape).astype('float32')
-                fluid.global_scope().find_var(gate_w_name).get_tensor().set(
-                    gate_w, place)
-
-                gate_b = np.array(fluid.global_scope().find_var(gate_b_name)
-                                  .get_tensor())
-                gate_b = np.random.uniform(
-                    -0.1, 0.1, size=gate_b.shape).astype('float32')
-                fluid.global_scope().find_var(gate_b_name).get_tensor().set(
-                    gate_b, place)
-
-                gate_weight.append(gate_w)
-                gate_bias.append(gate_b)
-
-        step_input_np = np.random.uniform(-0.1, 0.1, (
-            self.seq_len, self.batch_size, self.hidden_size)).astype('float32')
-        sequence_length_np = np.random.randint(
-            self.seq_len // 2, self.seq_len,
-            size=(self.batch_size)).astype('int64')
-
-        out = exe.run(
-            feed={'x': step_input_np,
-                  'sequence_length': sequence_length_np},
-            fetch_list=[rnn_out, last_hidden, last_cell])
-
-        api_rnn_out = out[0]
-        api_last_hidden = out[1]
-        api_last_cell = out[2]
-
-        np_out = lstm_np(
-            step_input_np,
-            None,
-            None,
-            self.hidden_size,
-            gate_weight,
-            gate_bias,
-            num_layers=self.num_layers,
-            batch_first=self.batch_first,
-            is_bidirect=self.is_bidirect,
-            sequence_length=sequence_length_np)
-
-        self.assertTrue(np.allclose(api_rnn_out, np_out[0], rtol=1e-4, atol=0))
-        self.assertTrue(
-            np.allclose(
-                api_last_hidden, np_out[1], rtol=1e-4, atol=0))
-        self.assertTrue(
-            np.allclose(
-                api_last_cell, np_out[2], rtol=1e-4, atol=0))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_basic_lstm_unit_op.py b/python/paddle/fluid/tests/unittests/test_basic_lstm_unit_op.py
deleted file mode 100644
index b79219c26ac7e9e6beb5ad9c9d0d435c9895a2db..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_basic_lstm_unit_op.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-from paddle.fluid.contrib.layers import BasicLSTMUnit
-from paddle.fluid.executor import Executor
-from paddle.fluid import framework
-
-import numpy as np
-
-SIGMOID_THRESHOLD_MIN = -40.0
-SIGMOID_THRESHOLD_MAX = 13.0
-EXP_MAX_INPUT = 40.0
-
-
-def sigmoid(x):
-    y = np.copy(x)
-    y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
-    y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
-    return 1. / (1. + np.exp(-y))
-
-
-def tanh(x):
-    y = -2. * x
-    y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
-    return (2. / (1. + np.exp(y))) - 1.
-
-
-def step(step_in, pre_hidden, pre_cell, gate_w, gate_b, forget_bias=1.0):
-    concat_1 = np.concatenate([step_in, pre_hidden], 1)
-
-    gate_input = np.matmul(concat_1, gate_w)
-    gate_input += gate_b
-    i, j, f, o = np.split(gate_input, indices_or_sections=4, axis=1)
-
-    new_cell = pre_cell * sigmoid(f + forget_bias) + sigmoid(i) * tanh(j)
-    new_hidden = tanh(new_cell) * sigmoid(o)
-
-    return new_hidden, new_cell
-
-
-class TestBasicGRUUnit(unittest.TestCase):
-    def setUp(self):
-        self.hidden_size = 5
-        self.batch_size = 5
-
-    def test_run(self):
-        x = layers.data(name='x', shape=[-1, self.hidden_size], dtype='float32')
-        pre_hidden = layers.data(
-            name="pre_hidden", shape=[-1, self.hidden_size], dtype='float32')
-        pre_cell = layers.data(
-            name="pre_cell", shape=[-1, self.hidden_size], dtype='float32')
-
-        lstm_unit = BasicLSTMUnit("lstm_unit", self.hidden_size)
-
-        new_hidden, new_cell = lstm_unit(x, pre_hidden, pre_cell)
-
-        new_hidden.persisbale = True
-        new_cell.persisbale = True
-
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        exe = Executor(place)
-        exe.run(framework.default_startup_program())
-
-        param_list = fluid.default_main_program().block(0).all_parameters()
-
-        # process weight and bias
-
-        gate_w_name = "lstm_unit/BasicLSTMUnit_0.w_0"
-        gate_b_name = "lstm_unit/BasicLSTMUnit_0.b_0"
-
-        gate_w = np.array(fluid.global_scope().find_var(gate_w_name).get_tensor(
-        ))
-        gate_w = np.random.uniform(
-            -0.1, 0.1, size=gate_w.shape).astype('float32')
-        fluid.global_scope().find_var(gate_w_name).get_tensor().set(gate_w,
-                                                                    place)
-
-        gate_b = np.array(fluid.global_scope().find_var(gate_b_name).get_tensor(
-        ))
-        gate_b = np.random.uniform(
-            -0.1, 0.1, size=gate_b.shape).astype('float32')
-        fluid.global_scope().find_var(gate_b_name).get_tensor().set(gate_b,
-                                                                    place)
-
-        step_input_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
-        pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
-        pre_cell_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
-
-        out = exe.run( feed={ 'x' : step_input_np, 'pre_hidden' : pre_hidden_np, \
-                              'pre_cell' : pre_cell_np },
-                fetch_list=[ new_hidden, new_cell])
-
-        api_hidden_out = out[0]
-        api_cell_out = out[1]
-
-        np_hidden_out, np_cell_out = step(step_input_np, pre_hidden_np,
-                                          pre_cell_np, gate_w, gate_b)
-
-        self.assertTrue(
-            np.allclose(
-                api_hidden_out, np_hidden_out, rtol=1e-4, atol=0))
-        self.assertTrue(
-            np.allclose(
-                api_cell_out, np_cell_out, rtol=1e-4, atol=0))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
deleted file mode 100644
index ec96e5f79ca39998dad8d2222cecd573d477ce5b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ /dev/null
@@ -1,534 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import paddle.fluid as fluid
-from op_test import OpTest
-from paddle.fluid.framework import grad_var_name
-
-
-def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
-    x_shape = x.shape
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
-
-    if data_format == "NCHW":
-        n, c, h, w = x.shape
-        mean_tile = np.reshape(mean, (1, c, 1, 1))
-        mean_tile = np.tile(mean_tile, (n, 1, h, w))
-        var_tile = np.reshape(var, (1, c, 1, 1))
-        var_tile = np.tile(var_tile, (n, 1, h, w))
-        normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon)
-        scale_tile = np.reshape(scale, (1, c, 1, 1))
-        scale_tile = np.tile(scale_tile, (n, 1, h, w))
-        offset_tile = np.reshape(offset, (1, c, 1, 1))
-        offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
-        y = normalized * scale_tile + offset_tile
-    elif data_format == "NHWC":
-        normalized = (x - mean) / np.sqrt(var + epsilon)
-        y = normalized * scale + offset
-    else:
-        raise ValueError("Unknown data order.")
-
-    if len(x_shape) == 2:
-        y = np.reshape(y, x_shape)
-    return y
-
-
-def _cal_mean_variance(x, epsilon, data_format):
-    assert data_format in ['NCHW', 'NHWC']
-    x_square = x * x
-    axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2)
-    C = x.shape[1] if data_format == 'NCHW' else x.shape[-1]
-    x_square_sum = np.sum(x_square, axis)
-    x_sum = np.sum(x, axis=axis)
-    element_count = np.size(x) / C
-    mean = x_sum / element_count
-    var = x_square_sum / element_count - mean * mean
-    return mean, var
-
-
-def _reference_training(x, scale, offset, epsilon, data_format):
-    x_shape = x.shape
-
-    if data_format == "NCHW":
-        n, c, h, w = x.shape
-        x_square = x * x
-        x_square_sum = np.sum(x_square, (0, 2, 3))
-        x_sum = np.sum(x, axis=(0, 2, 3))
-        element_count = np.size(x) / int(np.shape(x)[1])
-        mean = x_sum / element_count
-        var = x_square_sum / element_count - mean * mean
-        mean_tile = np.reshape(mean, (1, c, 1, 1))
-        mean_tile = np.tile(mean_tile, (n, 1, h, w))
-        var_tile = np.reshape(var, (1, c, 1, 1))
-        var_tile = np.tile(var_tile, (n, 1, h, w))
-        normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon)
-        scale_tile = np.reshape(scale, (1, c, 1, 1))
-        scale_tile = np.tile(scale_tile, (n, 1, h, w))
-        offset_tile = np.reshape(offset, (1, c, 1, 1))
-        offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
-        y = normalized * scale_tile + offset_tile
-        return y, mean, var
-    elif data_format == "NHWC":
-        x_square = x * x
-        x_square_sum = np.sum(x_square, (0, 1, 2))
-        x_sum = np.sum(x, axis=(0, 1, 2))
-        element_count = np.size(x) / int(np.shape(x)[-1])
-        mean = x_sum / element_count
-        var = x_square_sum / element_count - mean * mean
-        normalized = (x - mean) / np.sqrt(var + epsilon)
-        y = normalized * scale + offset
-        return y, mean, var
-    else:
-        raise ValueError("Unknown data order.")
-
-
-def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
-    # Use the following formulas to calculate gradients:
-    # grad_scale =
-    #   sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
-    #
-    # grad_offset = sum(output_y)
-    #
-    # x_grad =
-    #   1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
-    #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
-
-    # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
-    if data_format != "NCHW" and data_format != "NHWC":
-        raise ValueError("Unknown data order.")
-
-    if data_format == "NCHW":
-        x = np.transpose(x, (0, 2, 3, 1))
-        y_grad = np.transpose(y_grad, (0, 2, 3, 1))
-
-    x_grad = scale * (y_grad - np.mean(
-        y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean(
-            y_grad * (x - mean), axis=(0, 1, 2)) /
-                      (var + epsilon)) / np.sqrt(var + epsilon)
-    grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
-                        axis=(0, 1, 2))
-    grad_offset = np.sum(y_grad, axis=(0, 1, 2))
-
-    # transfer back to N, C, H, W
-    if data_format == "NCHW":
-        x_grad = np.transpose(x_grad, (0, 3, 1, 2))
-        x = np.transpose(x, (0, 3, 1, 2))
-        y_grad = np.transpose(y_grad, (0, 3, 1, 2))
-
-    return x_grad, grad_scale, grad_offset
-
-
-def create_or_get_tensor(scope, var_name, var, place):
-    tensor = scope.var(var_name).get_tensor()
-    if var is not None:
-        assert isinstance(var, np.ndarray)
-        tensor.set_recursive_sequence_lengths([])
-        tensor.set(var, place)
-    return tensor
-
-
-def set_output_grad(scope, outputs, place, feed_dict=None):
-    def __set_tensor__(name, data=None):
-        out_tensor = scope.find_var(name).get_tensor()
-        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
-        out_dtype = out_tensor.dtype()
-        if data is None:
-            if out_dtype == core.VarDesc.VarType.FP64:
-                data = np.ones(out_tensor.shape(), dtype=np.float64)
-            elif out_dtype == core.VarDesc.VarType.FP32:
-                data = np.ones(out_tensor.shape(), dtype=np.float32)
-            else:
-                raise ValueError("Not supported data type " + str(out_dtype))
-        grad_tensor.set(data, place)
-
-    for output in outputs:
-        data = None
-        if output in feed_dict:
-            data = feed_dict[output]
-        __set_tensor__(output, data)
-
-
-class TestBatchNormOpInference(unittest.TestCase):
-    def setUp(self):
-        self.dtype = np.float32
-        self.use_mkldnn = False
-        self.fuse_with_relu = False
-        self.init_kernel_type()
-
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
-
-    def check_with_place(self, place, data_layout, dtype, shape):
-        epsilon = 0.00001
-        if len(shape) == 2:
-            x_shape = shape
-            c = x_shape[1]
-        else:
-            n, h, w, c = shape[0], shape[1], shape[2], shape[3]
-            if data_layout == "NHWC":
-                x_shape = [n, h, w, c]
-            elif data_layout == "NCHW":
-                x_shape = [n, c, h, w]
-            else:
-                raise ValueError("Unknown data layout.")
-        scale_shape = [c]
-
-        x_val = np.random.random_sample(x_shape).astype(dtype)
-        # generate some negative values to test case with relu fused
-        x_val = x_val - 0.5
-        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-
-        mean = np.zeros(scale_shape).astype(np.float32)
-        variance = np.ones(scale_shape).astype(np.float32)
-
-        y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
-                                   epsilon, data_layout).astype(dtype)
-        if self.fuse_with_relu:
-            y_out = np.maximum(y_out, 0)
-
-        scope = core.Scope()
-
-        # create input
-        x_tensor = create_or_get_tensor(scope, "x_val",
-                                        OpTest.np_dtype_to_fluid_dtype(x_val),
-                                        place)
-        scale_tensor = create_or_get_tensor(
-            scope, "scale_val",
-            OpTest.np_dtype_to_fluid_dtype(scale_val), place)
-        bias_tensor = create_or_get_tensor(
-            scope, "bias_val", OpTest.np_dtype_to_fluid_dtype(bias_val), place)
-        mean_tensor = create_or_get_tensor(scope, "mean",
-                                           OpTest.np_dtype_to_fluid_dtype(mean),
-                                           place)
-        variance_tensor = create_or_get_tensor(
-            scope, "variance", OpTest.np_dtype_to_fluid_dtype(variance), place)
-
-        # create output
-        y_tensor = create_or_get_tensor(scope, "y_out", None, place)
-        saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
-                                                 place)
-        saved_variance_tensor = create_or_get_tensor(scope, "saved_variance",
-                                                     None, place)
-        mean_out_tensor = mean_tensor
-        variance_out_tensor = variance_tensor
-
-        batch_norm_op = Operator(
-            "batch_norm",
-            # inputs
-            X="x_val",
-            Scale="scale_val",
-            Bias="bias_val",
-            Mean="mean",
-            Variance="variance",
-            # outputs
-            Y="y_out",
-            MeanOut="mean",
-            VarianceOut="variance",
-            SavedMean="saved_mean",
-            SavedVariance="saved_variance",
-            # attrs
-            is_test=True,
-            data_layout=data_layout,
-            use_mkldnn=self.use_mkldnn,
-            fuse_with_relu=self.fuse_with_relu,
-            epsilon=epsilon)
-
-        batch_norm_op.run(scope, place)
-
-        # check inference result
-        self.__assert_close(
-            y_tensor,
-            y_out,
-            "inference output are different at " + str(place) + ", " +
-            data_layout + ", " + str(np.dtype(dtype)) +
-            str(np.array(y_tensor)) + str(y_out),
-            atol=1e-3)
-
-    def test_check_output(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            for data_format in ["NCHW", "NHWC"]:
-                self.check_with_place(place, data_format, self.dtype,
-                                      [2, 3, 4, 5])
-                self.check_with_place(place, data_format, self.dtype, [2, 3])
-
-    def init_kernel_type(self):
-        pass
-
-
-class TestFP16BatchNormOpInference(TestBatchNormOpInference):
-    def setUp(self):
-        self.dtype = np.float16
-        self.use_mkldnn = False
-        self.fuse_with_relu = False
-        self.init_kernel_type()
-
-    def test_check_output(self):
-        places = []
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                places.append(place)
-
-        for place in places:
-            for data_format in ["NCHW", "NHWC"]:
-                self.check_with_place(place, data_format, self.dtype,
-                                      [2, 3, 4, 5])
-                self.check_with_place(place, data_format, self.dtype, [2, 3])
-
-
-class TestBatchNormOpTraining(unittest.TestCase):
-    def setUp(self):
-        self.use_mkldnn = False
-        self.fuse_with_relu = False
-        self.data_formats = ["NCHW", "NHWC"]
-        self.momentum = 0.9
-        self.epsilon = 0.00001
-        self.init_kernel_type()
-        self.init_test_case()
-
-    def init_test_case(self):
-        self.use_global_stats = False
-        self.no_grad_set = set()
-        self.fetch_list = [
-            'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD',
-            'scale@GRAD', 'bias@GRAD'
-        ]
-
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        np.allclose(np.array(tensor), np_array, atol=atol)
-
-    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
-                             epsilon, momentum, shape, data_layout):
-        # run forward
-        y, saved_mean, var_ref = _reference_training(x, scale, bias, epsilon,
-                                                     data_layout)
-        mean_out = saved_mean * (1. - momentum) + momentum * mean
-        variance_out = var_ref * (1. - momentum) + momentum * variance
-        saved_variance = 1. / np.sqrt(var_ref + epsilon)
-        # run backward
-        x_grad, scale_grad, bias_grad = _reference_grad(
-            x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
-
-        return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
-
-    def set_mean_variance(self, scale_shape, x, data_layout):
-        mean, variance = _cal_mean_variance(x, self.epsilon, data_layout)
-        mean_pre = np.zeros(scale_shape).astype(np.float32)
-        variance_pre = np.ones(scale_shape).astype(np.float32)
-        # computing global mean/variance for one step
-        if self.use_global_stats:
-            mom = self.momentum
-            mean = mean * (1. - mom) + mom * mean_pre
-            variance = variance * (1. - mom) + mom * variance_pre
-        return mean, variance
-
-    def test_forward_backward(self):
-        def test_with_place(place, data_layout, shape):
-            # attr
-            epsilon = self.epsilon
-            momentum = self.momentum
-            if data_layout == "NCHW":
-                n, c, h, w = shape[0], shape[1], shape[2], shape[3]
-            else:
-                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
-            scale_shape = [c]
-
-            np.random.seed(123)
-            x = np.random.random_sample(shape).astype(np.float32)
-            scale = np.random.random_sample(scale_shape).astype(np.float32)
-            bias = np.random.random_sample(scale_shape).astype(np.float32)
-            mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
-            y_grad = np.random.random_sample(shape).astype(np.float32)
-
-            y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
-                x, y_grad, scale, bias, mean, variance, epsilon, momentum,
-                shape, data_layout)
-
-            var_dict = locals()
-            var_dict['y@GRAD'] = y_grad
-            var_dict['x@GRAD'] = x_grad
-            var_dict['scale@GRAD'] = scale_grad
-            var_dict['bias@GRAD'] = bias_grad
-
-            var_names = [
-                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
-                'saved_variance'
-            ]
-            ground_truth = {name: var_dict[name] for name in var_names}
-
-            program = fluid.Program()
-            with fluid.program_guard(program):
-                block = program.global_block()
-                for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
-                bn_op = block.append_op(
-                    type="batch_norm",
-                    inputs={
-                        "X": block.var('x'),
-                        "Scale": block.var('scale'),
-                        "Bias": block.var('bias'),
-                        "Mean": block.var('mean'),
-                        "Variance": block.var('variance')
-                    },
-                    outputs={
-                        "Y": block.var('y'),
-                        "MeanOut": block.var('mean'),  # share memory
-                        "VarianceOut": block.var('variance'),  # share memory
-                        "SavedMean": block.var('saved_mean'),
-                        "SavedVariance": block.var('saved_variance')
-                    },
-                    attrs={
-                        "momentum": momentum,
-                        "epsilon": epsilon,
-                        "is_test": False,
-                        "data_layout": data_layout,
-                        "use_mkldnn": self.use_mkldnn,
-                        "fuse_with_relu": self.fuse_with_relu,
-                        "use_global_stats": self.use_global_stats
-                    })
-                block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
-
-                # generate backward op_desc
-                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    bn_op.desc, self.no_grad_set, [])
-                grad_op_desc = grad_op_desc_list[0]
-                new_op_desc = block.desc.append_op()
-                new_op_desc.copy_from(grad_op_desc)
-                for var_name in grad_op_desc.output_arg_names():
-                    block.desc.var(var_name.encode("ascii"))
-                grad_op_desc.infer_var_type(block.desc)
-                grad_op_desc.infer_shape(block.desc)
-                for arg in grad_op_desc.output_arg_names():
-                    grad_var = block.desc.find_var(arg.encode("ascii"))
-                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-                exe = fluid.Executor(place)
-                out = exe.run(
-                    program,
-                    feed={
-                        name: var_dict[name]
-                        for name in
-                        ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
-                    },
-                    fetch_list=self.fetch_list)
-
-            for id, name in enumerate(self.fetch_list):
-                if name == 'variance':
-                    self.__assert_close(
-                        var_dict[name], out[id], name, atol=1e-3)
-                    continue
-                self.__assert_close(var_dict[name], out[id], name)
-            print("op test forward passed: ", str(place), data_layout)
-
-        places = [core.CPUPlace()]
-
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            for data_format in self.data_formats:
-                test_with_place(place, data_format, [2, 3, 4, 5])
-
-    def init_kernel_type(self):
-        pass
-
-
-class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining):
-    def init_test_case(self):
-        self.use_global_stats = False
-        self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
-        self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
-
-
-class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
-    def init_test_case(self):
-        self.use_global_stats = True
-        self.no_grad_set = set()
-        self.fetch_list = [
-            'y', 'mean', 'variance', 'x@GRAD', 'scale@GRAD', 'bias@GRAD'
-        ]
-
-    def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format):
-        if data_format == "NCHW":
-            x = np.transpose(x, (0, 2, 3, 1))
-            y_grad = np.transpose(y_grad, (0, 2, 3, 1))
-
-        x_grad = scale * y_grad / np.sqrt(var + epsilon)
-        grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
-                            axis=(0, 1, 2))
-        grad_offset = np.sum(y_grad, axis=(0, 1, 2))
-
-        # transfer back to N, C, H, W
-        if data_format == "NCHW":
-            x_grad = np.transpose(x_grad, (0, 3, 1, 2))
-            x = np.transpose(x, (0, 3, 1, 2))
-            y_grad = np.transpose(y_grad, (0, 3, 1, 2))
-
-        return x_grad, grad_scale, grad_offset
-
-    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
-                             epsilon, momentum, shape, data_layout):
-        if data_layout != "NCHW" and data_layout != "NHWC":
-            raise ValueError("Unknown data order.")
-
-        if data_layout == "NCHW":
-            x = np.transpose(x, (0, 2, 3, 1))
-
-        # run normalizaton
-        normalized = (x - mean) / np.sqrt(variance + epsilon)
-        y = normalized * scale + bias
-
-        # transfer back to N, C, H, W
-        if data_layout == "NCHW":
-            x = np.transpose(x, (0, 3, 1, 2))
-            y = np.transpose(y, (0, 3, 1, 2))
-
-        mean_out = mean
-        variance_out = variance
-        saved_variance = 1. / np.sqrt(variance + epsilon)
-        # run backward
-        x_grad, scale_grad, bias_grad = self.reference_grad(
-            x, y_grad, scale, mean, variance, epsilon, data_layout)
-
-        return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad
-
-
-class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
-        TestBatchNormOpFreezeStatsTraining):
-    def init_test_case(self):
-        self.use_global_stats = True
-        self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
-        self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
deleted file mode 100644
index 51eee41ab2d4d1113426991c63bee949cca15ad4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestBeamSearchDecodeOp(unittest.TestCase):
-    """unittest of beam_search_decode_op"""
-
-    def setUp(self):
-        self.scope = core.Scope()
-        self.place = core.CPUPlace()
-
-    def append_lod_tensor(self, tensor_array, lod, data):
-        lod_tensor = core.LoDTensor()
-        lod_tensor.set_lod(lod)
-        lod_tensor.set(data, self.place)
-        tensor_array.append(lod_tensor)
-
-    def test_get_set(self):
-        ids = self.scope.var("ids").get_lod_tensor_array()
-        scores = self.scope.var("scores").get_lod_tensor_array()
-        # Construct sample data with 5 steps and 2 source sentences
-        # beam_size = 2, end_id = 1
-        # start with start_id
-        [
-            self.append_lod_tensor(
-                array, [[0, 1, 2], [0, 1, 2]], np.array(
-                    [0, 0], dtype=dtype))
-            for array, dtype in ((ids, "int64"), (scores, "float32"))
-        ]
-        [
-            self.append_lod_tensor(
-                array, [[0, 1, 2], [0, 2, 4]],
-                np.array(
-                    [2, 3, 4, 5], dtype=dtype))
-            for array, dtype in ((ids, "int64"), (scores, "float32"))
-        ]
-        [
-            self.append_lod_tensor(
-                array, [[0, 2, 4], [0, 2, 2, 4, 4]],
-                np.array(
-                    [3, 1, 5, 4], dtype=dtype))
-            for array, dtype in ((ids, "int64"), (scores, "float32"))
-        ]
-        [
-            self.append_lod_tensor(
-                array, [[0, 2, 4], [0, 1, 2, 3, 4]],
-                np.array(
-                    [1, 1, 3, 5], dtype=dtype))
-            for array, dtype in ((ids, "int64"), (scores, "float32"))
-        ]
-        [
-            self.append_lod_tensor(
-                array, [[0, 2, 4], [0, 0, 0, 2, 2]],
-                np.array(
-                    [5, 1], dtype=dtype))
-            for array, dtype in ((ids, "int64"), (scores, "float32"))
-        ]
-
-        sentence_ids = self.scope.var("sentence_ids").get_tensor()
-        sentence_scores = self.scope.var("sentence_scores").get_tensor()
-
-        beam_search_decode_op = Operator(
-            "beam_search_decode",
-            # inputs
-            Ids="ids",
-            Scores="scores",
-            # outputs
-            SentenceIds="sentence_ids",
-            SentenceScores="sentence_scores",
-            beam_size=2,
-            end_id=1, )
-
-        beam_search_decode_op.run(self.scope, self.place)
-
-        expected_lod = [[0, 2, 4], [0, 4, 7, 12, 17]]
-        self.assertEqual(sentence_ids.lod(), expected_lod)
-        self.assertEqual(sentence_scores.lod(), expected_lod)
-
-        expected_data = np.array(
-            [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64")
-        self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
-        self.assertTrue(
-            np.array_equal(np.array(sentence_scores), expected_data))
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp):
-    def setUp(self):
-        self.scope = core.Scope()
-        self.place = core.CUDAPlace(0)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
deleted file mode 100644
index 1d9f4b78f30fefa21c189036c3731e0afe39ea9e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import logging
-from paddle.fluid.op import Operator, DynamicRecurrentOp
-import paddle.fluid.core as core
-import unittest
-import numpy as np
-
-
-def create_tensor(scope, name, np_data):
-    tensor = scope.var(name).get_tensor()
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-
-
-class BeamSearchOpTester(unittest.TestCase):
-    """unittest of beam_search_op"""
-
-    def setUp(self):
-        self.scope = core.Scope()
-        self._create_ids()
-        self._create_pre_scores()
-        self._create_scores()
-        self._create_pre_ids()
-        self.scope.var('selected_ids')
-        self.scope.var('selected_scores')
-        self.scope.var('parent_idx')
-
-    def test_run(self):
-        op = Operator(
-            'beam_search',
-            pre_ids='pre_ids',
-            pre_scores='pre_scores',
-            ids='ids',
-            scores='scores',
-            selected_ids='selected_ids',
-            selected_scores='selected_scores',
-            parent_idx='parent_idx',
-            level=0,
-            beam_size=2,
-            end_id=0, )
-        op.run(self.scope, core.CPUPlace())
-        selected_ids = self.scope.find_var("selected_ids").get_tensor()
-        selected_scores = self.scope.find_var("selected_scores").get_tensor()
-        parent_idx = self.scope.find_var("parent_idx").get_tensor()
-        self.assertTrue(
-            np.allclose(
-                np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis]))
-        self.assertTrue(
-            np.allclose(
-                np.array(selected_scores),
-                np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
-        self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]])
-        self.assertTrue(
-            np.allclose(np.array(parent_idx), np.array([0, 1, 2, 3])))
-
-    def _create_pre_ids(self):
-        np_data = np.array([[1, 2, 3, 4]], dtype='int64')
-        tensor = create_tensor(self.scope, 'pre_ids', np_data)
-
-    def _create_pre_scores(self):
-        np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32')
-        tensor = create_tensor(self.scope, 'pre_scores', np_data)
-
-    def _create_ids(self):
-        self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        np_data = np.array(
-            [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64')
-        tensor = create_tensor(self.scope, "ids", np_data)
-        tensor.set_lod(self.lod)
-
-    def _create_scores(self):
-        np_data = np.array(
-            [
-                [0.5, 0.3, 0.2],
-                [0.6, 0.3, 0.1],
-                [0.9, 0.5, 0.1],
-                [0.7, 0.5, 0.1],
-            ],
-            dtype='float32')
-        tensor = create_tensor(self.scope, "scores", np_data)
-        tensor.set_lod(self.lod)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
deleted file mode 100644
index f77fe90c4cee5f4e7ffd9159de73b08f74010529..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ /dev/null
@@ -1,542 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-
-
-def bilinear_interp_np(input,
-                       out_h,
-                       out_w,
-                       out_size=None,
-                       actual_shape=None,
-                       align_corners=True,
-                       align_mode=0,
-                       data_layout='NCHW'):
-    """bilinear interpolation implement in shape [N, C, H, W]"""
-    if data_layout == "NHWC":
-        input = np.transpose(input, (0, 3, 1, 2))  # NHWC => NCHW
-    if out_size is not None:
-        out_h = out_size[0]
-        out_w = out_size[1]
-    if actual_shape is not None:
-        out_h = actual_shape[0]
-        out_w = actual_shape[1]
-    batch_size, channel, in_h, in_w = input.shape
-
-    ratio_h = ratio_w = 0.0
-    if out_h > 1:
-        if (align_corners):
-            ratio_h = (in_h - 1.0) / (out_h - 1.0)
-        else:
-            ratio_h = 1.0 * in_h / out_h
-    if out_w > 1:
-        if (align_corners):
-            ratio_w = (in_w - 1.0) / (out_w - 1.0)
-        else:
-            ratio_w = 1.0 * in_w / out_w
-
-    out = np.zeros((batch_size, channel, out_h, out_w))
-
-    for i in range(out_h):
-        if (align_mode == 0 and not align_corners):
-            h = int(ratio_h * (i + 0.5) - 0.5)
-        else:
-            h = int(ratio_h * i)
-
-        h = max(0, h)
-        hid = 1 if h < in_h - 1 else 0
-        if (align_mode == 0 and not align_corners):
-            idx_src_h = max(ratio_h * (i + 0.5) - 0.5, 0)
-            h1lambda = idx_src_h - h
-        else:
-            h1lambda = ratio_h * i - h
-        h2lambda = 1.0 - h1lambda
-        for j in range(out_w):
-            if (align_mode == 0 and not align_corners):
-                w = int(ratio_w * (j + 0.5) - 0.5)
-            else:
-                w = int(ratio_w * j)
-            w = max(0, w)
-            wid = 1 if w < in_w - 1 else 0
-            if (align_mode == 0 and not align_corners):
-                idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0)
-                w1lambda = idx_src_w - w
-            else:
-                w1lambda = ratio_w * j - w
-            w2lambda = 1.0 - w1lambda
-
-            out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
-                                        w1lambda*input[:, :, h, w+wid]) + \
-                h1lambda*(w2lambda*input[:, :, h+hid, w] +
-                          w1lambda*input[:, :, h+hid, w+wid])
-
-    if data_layout == "NHWC":
-        out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
-
-    return out.astype(input.dtype)
-
-
-class TestBilinearInterpOp(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.data_layout = 'NCHW'
-        self.init_test_case()
-        self.op_type = "bilinear_interp"
-        input_np = np.random.random(self.input_shape).astype("float32")
-
-        if self.data_layout == "NCHW":
-            in_h = self.input_shape[2]
-            in_w = self.input_shape[3]
-        else:
-            in_h = self.input_shape[1]
-            in_w = self.input_shape[2]
-
-        if self.scale > 0:
-            out_h = int(in_h * self.scale)
-            out_w = int(in_w * self.scale)
-        else:
-            out_h = self.out_h
-            out_w = self.out_w
-
-        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
-                                       self.actual_shape, self.align_corners,
-                                       self.align_mode, self.data_layout)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-        if self.actual_shape is not None:
-            self.inputs['OutSize'] = self.actual_shape
-
-        self.attrs = {
-            'out_h': self.out_h,
-            'out_w': self.out_w,
-            'scale': self.scale,
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-            'align_mode': self.align_mode,
-            'data_layout': self.data_layout
-        }
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 3, 4, 4]
-        self.out_h = 2
-        self.out_w = 2
-        self.scale = 0.
-        self.out_size = np.array([3, 3]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpCase1(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpCase2(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpCase3(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpCase4(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.out_size = np.array([2, 2]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpCase5(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.out_size = np.array([11, 11]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpCase6(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-        self.scale = 0.
-        self.out_size = np.array([65, 129]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpSame(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 3, 128, 64]
-        self.out_h = 128
-        self.out_w = 64
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpActualShape(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 0.
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpDataLayout(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 4, 4, 3]
-        self.out_h = 2
-        self.out_w = 2
-        self.scale = 0.
-        self.out_size = np.array([3, 3]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-        self.data_layout = "NHWC"
-
-
-class TestBilinearInterpOpUint8(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.init_test_case()
-        self.op_type = "bilinear_interp"
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
-
-        if self.scale > 0:
-            out_h = int(self.input_shape[2] * self.scale)
-            out_w = int(self.input_shape[3] * self.scale)
-        else:
-            out_h = self.out_h
-            out_w = self.out_w
-
-        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
-                                       self.actual_shape, self.align_corners,
-                                       self.align_mode)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-
-        self.attrs = {
-            'out_h': self.out_h,
-            'out_w': self.out_w,
-            'scale': self.scale,
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-            'align_mode': self.align_mode
-        }
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output_with_place(place=core.CPUPlace(), atol=1)
-
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [1, 3, 9, 6]
-        self.out_h = 10
-        self.out_w = 9
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 3, 128, 64]
-        self.out_h = 120
-        self.out_w = 50
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 5
-        self.out_w = 13
-        self.scale = 0.
-        self.out_size = np.array([6, 15]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
-    def set_align_mode(self):
-        self.align_corners = False
-        self.align_mode = 1
-
-
-class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
-    def set_align_mode(self):
-        self.align_corners = False
-        self.align_mode = 0
-
-
-class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
-    def set_align_mode(self):
-        self.align_corners = True
-        self.align_mode = 0
-
-
-class TestBilinearInterpScale1(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 3, 5, 7]
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 2.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpScale2(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 3, 5, 7]
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 1.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpScale3(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 3, 5, 7]
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 1.5
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestBilinearInterpZero(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 3, 5, 7]
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 0.2
-        self.align_corners = False
-        self.align_mode = 0
-
-
-class TestBilinearInterpOp_attr_tensor(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.init_test_case()
-        self.op_type = "bilinear_interp"
-        self.shape_by_1Dtensor = False
-        self.scale_by_1Dtensor = False
-        self.attrs = {
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-        }
-
-        input_np = np.random.random(self.input_shape).astype("float32")
-        self.inputs = {'X': input_np}
-
-        if self.scale_by_1Dtensor:
-            self.inputs['Scale'] = np.array([self.scale]).astype("float32")
-        elif self.scale > 0:
-            out_h = int(self.input_shape[2] * self.scale)
-            out_w = int(self.input_shape[3] * self.scale)
-            self.attrs['scale'] = self.scale
-        else:
-            out_h = self.out_h
-            out_w = self.out_w
-
-        if self.shape_by_1Dtensor:
-            self.inputs['OutSize'] = self.out_size
-        elif self.out_size is not None:
-            size_tensor = []
-            for index, ele in enumerate(self.out_size):
-                size_tensor.append(("x" + str(index), np.ones(
-                    (1)).astype('int32') * ele))
-            self.inputs['SizeTensor'] = size_tensor
-
-        self.attrs['out_h'] = self.out_h
-        self.attrs['out_w'] = self.out_w
-        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
-                                       self.actual_shape, self.align_corners)
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 3, 4, 4]
-        self.out_h = 3
-        self.out_w = 3
-        self.scale = 0.
-        self.out_size = [3, 3]
-        self.align_corners = True
-
-
-# out_size is a 1-D tensor
-class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.out_size = [8, 12]
-        self.align_corners = True
-
-
-# scale is a 1-D tensor
-class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 0.
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-        self.shape_by_1Dtensor = True
-
-
-# scale is a 1-D tensor
-class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 2.0
-        self.out_size = None
-        self.align_corners = True
-        self.scale_by_1Dtensor = True
-
-
-class TestBilinearInterpOpAPI(OpTest):
-    def test_case(self):
-        x = fluid.layers.data(name="x", shape=[3, 6, 6], dtype="float32")
-
-        dim = fluid.layers.data(
-            name="dim", shape=[1], dtype="int32", append_batch_size=False)
-        shape_tensor = fluid.layers.data(
-            name="shape_tensor",
-            shape=[2],
-            dtype="int32",
-            append_batch_size=False)
-        actual_size = fluid.layers.data(
-            name="actual_size",
-            shape=[2],
-            dtype="int32",
-            append_batch_size=False)
-        scale_tensor = fluid.layers.data(
-            name="scale_tensor",
-            shape=[1],
-            dtype="float32",
-            append_batch_size=False)
-
-        out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12])
-        out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim])
-        out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor)
-        out4 = fluid.layers.resize_bilinear(
-            x, out_shape=[4, 4], actual_shape=actual_size)
-        out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor)
-
-        x_data = np.random.random((1, 3, 6, 6)).astype("float32")
-        dim_data = np.array([12]).astype("int32")
-        shape_data = np.array([12, 12]).astype("int32")
-        actual_size_data = np.array([12, 12]).astype("int32")
-        scale_data = np.array([2.0]).astype("float32")
-
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        results = exe.run(fluid.default_main_program(),
-                          feed={
-                              "x": x_data,
-                              "dim": dim_data,
-                              "shape_tensor": shape_data,
-                              "actual_size": actual_size_data,
-                              "scale_tensor": scale_data
-                          },
-                          fetch_list=[out1, out2, out3, out4, out5],
-                          return_numpy=True)
-
-        expect_res = bilinear_interp_np(
-            x_data, out_h=12, out_w=12, align_corners=True)
-        for res in results:
-            self.assertTrue(np.allclose(res, expect_res))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
deleted file mode 100644
index 46831119c5fee938780ec8fdb9d0cdb3b63a473d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestBilinearTensorProductOp(OpTest):
-    def setUp(self):
-        self.op_type = "bilinear_tensor_product"
-        batch_size = 6
-        size0 = 3
-        size1 = 4
-        size2 = 5
-        a = np.random.random((batch_size, size0)).astype("float32")
-        b = np.random.random((batch_size, size1)).astype("float32")
-        w = np.random.random((size2, size0, size1)).astype("float32")
-        bias = np.random.random((1, size2)).astype("float32")
-        output = np.zeros((batch_size, size2)).astype("float32")
-        for i in range(size2):
-            w_i = w[i, :, :]
-            output[:, i] = np.sum(np.matmul(a, w_i) * b, axis=1)
-        self.inputs = {
-            'X': a,
-            'Y': b,
-            'Weight': w,
-            'Bias': bias,
-        }
-        self.outputs = {'Out': output + bias}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y', 'Weight', 'Bias'], 'Out')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
deleted file mode 100644
index 5cc8e2ba15d260b988ee66a5711aed42ca04c10b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def bipartite_match(distance, match_indices, match_dist):
-    """Bipartite Matching algorithm.
-    Arg:
-        distance (numpy.array) : The distance of two entries with shape [M, N].
-        match_indices (numpy.array): the matched indices from column to row
-            with shape [1, N], it must be initialized to -1.
-        match_dist (numpy.array): The matched distance from column to row
-            with shape [1, N], it must be initialized to 0.
-    """
-    match_pair = []
-    row, col = distance.shape
-    for i in range(row):
-        for j in range(col):
-            match_pair.append((i, j, distance[i][j]))
-
-    match_sorted = sorted(match_pair, key=lambda tup: tup[2], reverse=True)
-
-    row_indices = -1 * np.ones((row, ), dtype=np.int)
-
-    idx = 0
-    for i, j, dist in match_sorted:
-        if idx >= row:
-            break
-        if match_indices[j] == -1 and row_indices[i] == -1 and dist > 0:
-            match_indices[j] = i
-            row_indices[i] = j
-            match_dist[j] = dist
-            idx += 1
-
-
-def argmax_match(distance, match_indices, match_dist, threshold):
-    r, c = distance.shape
-    for j in range(c):
-        if match_indices[j] != -1:
-            continue
-        col_dist = distance[:, j]
-        indices = np.argwhere(col_dist >= threshold).flatten()
-        if len(indices) < 1:
-            continue
-        match_indices[j] = indices[np.argmax(col_dist[indices])]
-        match_dist[j] = col_dist[match_indices[j]]
-
-
-def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
-    """Bipartite Matching algorithm for batch input.
-    Arg:
-        distance (numpy.array) : The distance of two entries with shape [M, N].
-        lod (list of int): The offsets of each input in this batch.
-    """
-    n = len(lod)
-    m = distance.shape[1]
-    match_indices = -1 * np.ones((n, m), dtype=np.int)
-    match_dist = np.zeros((n, m), dtype=np.float32)
-    cur_offset = 0
-    for i in range(n):
-        bipartite_match(distance[cur_offset:(cur_offset + lod[i]), :],
-                        match_indices[i, :], match_dist[i, :])
-        if match_type == 'per_prediction':
-            argmax_match(distance[cur_offset:(cur_offset + lod[i]), :],
-                         match_indices[i, :], match_dist[i, :], dist_threshold)
-        cur_offset += lod[i]
-    return match_indices, match_dist
-
-
-class TestBipartiteMatchOpWithLoD(OpTest):
-    def setUp(self):
-        self.op_type = 'bipartite_match'
-        lod = [[5, 6, 12]]
-        dist = np.random.random((23, 217)).astype('float32')
-        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
-
-        self.inputs = {'DistMat': (dist, lod)}
-        self.outputs = {
-            'ColToRowMatchIndices': match_indices,
-            'ColToRowMatchDist': match_dist,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestBipartiteMatchOpWithoutLoD(OpTest):
-    def setUp(self):
-        self.op_type = 'bipartite_match'
-        lod = [[8]]
-        dist = np.random.random((8, 17)).astype('float32')
-        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
-
-        self.inputs = {'DistMat': dist}
-        self.outputs = {
-            'ColToRowMatchIndices': match_indices,
-            'ColToRowMatchDist': match_dist,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestBipartiteMatchOpWithoutLoDLargeScaleInput(OpTest):
-    def setUp(self):
-        self.op_type = 'bipartite_match'
-        lod = [[300]]
-        dist = np.random.random((300, 17)).astype('float32')
-        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
-
-        self.inputs = {'DistMat': dist}
-        self.outputs = {
-            'ColToRowMatchIndices': match_indices,
-            'ColToRowMatchDist': match_dist,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestBipartiteMatchOpWithPerPredictionType(OpTest):
-    def setUp(self):
-        self.op_type = 'bipartite_match'
-        lod = [[5, 6, 12]]
-        dist = np.random.random((23, 237)).astype('float32')
-        match_indices, match_dist = batch_bipartite_match(dist, lod[0],
-                                                          'per_prediction', 0.5)
-
-        self.inputs = {'DistMat': (dist, lod)}
-        self.outputs = {
-            'ColToRowMatchIndices': match_indices,
-            'ColToRowMatchDist': match_dist,
-        }
-        self.attrs = {
-            'match_type': 'per_prediction',
-            'dist_threshold': 0.5,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_box_clip_op.py b/python/paddle/fluid/tests/unittests/test_box_clip_op.py
deleted file mode 100644
index b2b0598f31dd27e12e5ce329129129b5e0f1caf0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_box_clip_op.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-from op_test import OpTest
-import copy
-
-
-def box_clip(input_box, im_info, output_box):
-    im_w = round(im_info[1] / im_info[2])
-    im_h = round(im_info[0] / im_info[2])
-    output_box[:, :, 0] = np.maximum(
-        np.minimum(input_box[:, :, 0], im_w - 1), 0)
-    output_box[:, :, 1] = np.maximum(
-        np.minimum(input_box[:, :, 1], im_h - 1), 0)
-    output_box[:, :, 2] = np.maximum(
-        np.minimum(input_box[:, :, 2], im_w - 1), 0)
-    output_box[:, :, 3] = np.maximum(
-        np.minimum(input_box[:, :, 3], im_h - 1), 0)
-
-
-def batch_box_clip(input_boxes, im_info, lod):
-    n = input_boxes.shape[0]
-    m = input_boxes.shape[1]
-    output_boxes = np.zeros((n, m, 4), dtype=np.float32)
-    cur_offset = 0
-    for i in range(len(lod)):
-        box_clip(input_boxes[cur_offset:(cur_offset + lod[i]), :, :],
-                 im_info[i, :],
-                 output_boxes[cur_offset:(cur_offset + lod[i]), :, :])
-        cur_offset += lod[i]
-    return output_boxes
-
-
-class TestBoxClipOp(OpTest):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "box_clip"
-        lod = [[1, 2, 3]]
-        input_boxes = np.random.random((6, 10, 4)) * 5
-        im_info = np.array([[5, 8, 1.], [6, 6, 1.], [7, 5, 1.]])
-        output_boxes = batch_box_clip(input_boxes, im_info, lod[0])
-
-        self.inputs = {
-            'Input': (input_boxes.astype('float32'), lod),
-            'ImInfo': im_info.astype('float32'),
-        }
-        self.outputs = {'Output': output_boxes}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
deleted file mode 100644
index 220bffebe83925c60af65aa9594ddd8a29c38145..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ /dev/null
@@ -1,239 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-from op_test import OpTest
-
-
-def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
-    pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False)
-    pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False)
-    pb_x = pb_w * 0.5 + p_box[:, 0]
-    pb_y = pb_h * 0.5 + p_box[:, 1]
-    shape = (1, p_box.shape[0]) if axis == 0 else (p_box.shape[0], 1)
-
-    pb_w = pb_w.reshape(shape)
-    pb_h = pb_h.reshape(shape)
-    pb_x = pb_x.reshape(shape)
-    pb_y = pb_y.reshape(shape)
-
-    if pb_v.ndim == 2:
-        var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else (
-            pb_v.shape[0], 1, pb_v.shape[1])
-        pb_v = pb_v.reshape(var_shape)
-    if pb_v.ndim == 1:
-        tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x
-        tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y
-        tb_w = np.exp(pb_v[2] * t_box[:, :, 2]) * pb_w
-        tb_h = np.exp(pb_v[3] * t_box[:, :, 3]) * pb_h
-    else:
-        tb_x = pb_v[:, :, 0] * t_box[:, :, 0] * pb_w + pb_x
-        tb_y = pb_v[:, :, 1] * t_box[:, :, 1] * pb_h + pb_y
-        tb_w = np.exp(pb_v[:, :, 2] * t_box[:, :, 2]) * pb_w
-        tb_h = np.exp(pb_v[:, :, 3] * t_box[:, :, 3]) * pb_h
-    output_box[:, :, 0] = tb_x - tb_w / 2
-    output_box[:, :, 1] = tb_y - tb_h / 2
-    output_box[:, :, 2] = tb_x + tb_w / 2 - (not norm)
-    output_box[:, :, 3] = tb_y + tb_h / 2 - (not norm)
-
-
-def box_encoder(t_box, p_box, pb_v, output_box, norm):
-    pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False)
-    pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False)
-    pb_x = pb_w * 0.5 + p_box[:, 0]
-    pb_y = pb_h * 0.5 + p_box[:, 1]
-    shape = (1, p_box.shape[0])
-
-    pb_w = pb_w.reshape(shape)
-    pb_h = pb_h.reshape(shape)
-    pb_x = pb_x.reshape(shape)
-    pb_y = pb_y.reshape(shape)
-
-    if pb_v.ndim == 2:
-        pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1])
-    tb_x = ((t_box[:, 2] + t_box[:, 0]) / 2).reshape(t_box.shape[0], 1)
-    tb_y = ((t_box[:, 3] + t_box[:, 1]) / 2).reshape(t_box.shape[0], 1)
-    tb_w = (t_box[:, 2] - t_box[:, 0]).reshape(t_box.shape[0], 1) + (not norm)
-    tb_h = (t_box[:, 3] - t_box[:, 1]).reshape(t_box.shape[0], 1) + (not norm)
-    if pb_v.ndim == 1:
-        output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[0]
-        output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[1]
-        output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[2]
-        output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[3]
-    else:
-        output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[:, :, 0]
-        output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[:, :, 1]
-        output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[:, :, 2]
-        output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[:, :, 3]
-
-
-def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0):
-    n = t_box.shape[0]
-    m = p_box.shape[0]
-    if code_type == "DecodeCenterSize":
-        m = t_box.shape[1]
-    output_box = np.zeros((n, m, 4), dtype=np.float32)
-    cur_offset = 0
-    for i in range(len(lod)):
-        if (code_type == "EncodeCenterSize"):
-            box_encoder(t_box[cur_offset:(cur_offset + lod[i]), :], p_box, pb_v,
-                        output_box[cur_offset:(cur_offset + lod[i]), :, :],
-                        norm)
-        elif (code_type == "DecodeCenterSize"):
-            box_decoder(t_box, p_box, pb_v, output_box, norm, axis)
-        cur_offset += lod[i]
-    return output_box
-
-
-class TestBoxCoderOp(OpTest):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "box_coder"
-        lod = [[1, 1, 1, 1, 1]]
-        prior_box = np.random.random((81, 4)).astype('float32')
-        prior_box_var = np.random.random((81, 4)).astype('float32')
-        target_box = np.random.random((20, 81, 4)).astype('float32')
-        code_type = "DecodeCenterSize"
-        box_normalized = False
-        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type, box_normalized)
-        self.inputs = {
-            'PriorBox': prior_box,
-            'PriorBoxVar': prior_box_var,
-            'TargetBox': target_box,
-        }
-        self.attrs = {
-            'code_type': 'decode_center_size',
-            'box_normalized': False
-        }
-        self.outputs = {'OutputBox': output_box}
-
-
-class TestBoxCoderOpWithoutBoxVar(OpTest):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "box_coder"
-        lod = [[0, 1, 2, 3, 4, 5]]
-        prior_box = np.random.random((81, 4)).astype('float32')
-        prior_box_var = np.ones((81, 4)).astype('float32')
-        target_box = np.random.random((20, 81, 4)).astype('float32')
-        code_type = "DecodeCenterSize"
-        box_normalized = False
-        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type, box_normalized)
-
-        self.inputs = {
-            'PriorBox': prior_box,
-            'TargetBox': target_box,
-        }
-        self.attrs = {
-            'code_type': 'decode_center_size',
-            'box_normalized': False
-        }
-        self.outputs = {'OutputBox': output_box}
-
-
-class TestBoxCoderOpWithLoD(OpTest):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "box_coder"
-        lod = [[10, 20, 20]]
-        prior_box = np.random.random((20, 4)).astype('float32')
-        prior_box_var = np.random.random((20, 4)).astype('float32')
-        target_box = np.random.random((50, 4)).astype('float32')
-        code_type = "EncodeCenterSize"
-        box_normalized = True
-        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type, box_normalized)
-
-        self.inputs = {
-            'PriorBox': prior_box,
-            'PriorBoxVar': prior_box_var,
-            'TargetBox': (target_box, lod),
-        }
-        self.attrs = {'code_type': 'encode_center_size', 'box_normalized': True}
-        self.outputs = {'OutputBox': output_box}
-
-
-class TestBoxCoderOpWithAxis(OpTest):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "box_coder"
-        lod = [[1, 1, 1, 1, 1]]
-        prior_box = np.random.random((30, 4)).astype('float32')
-        prior_box_var = np.random.random((30, 4)).astype('float32')
-        target_box = np.random.random((30, 81, 4)).astype('float32')
-        code_type = "DecodeCenterSize"
-        box_normalized = False
-        axis = 1
-        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type, box_normalized, axis)
-
-        self.inputs = {
-            'PriorBox': prior_box,
-            'PriorBoxVar': prior_box_var,
-            'TargetBox': target_box,
-        }
-        self.attrs = {
-            'code_type': 'decode_center_size',
-            'box_normalized': False,
-            'axis': axis
-        }
-        self.outputs = {'OutputBox': output_box}
-
-
-class TestBoxCoderOpWithVariance(OpTest):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "box_coder"
-        lod = [[1, 1, 1, 1, 1]]
-        prior_box = np.random.random((30, 4)).astype('float32')
-        prior_box_var = np.random.random((4)).astype('float32')
-        target_box = np.random.random((30, 81, 4)).astype('float32')
-        code_type = "DecodeCenterSize"
-        box_normalized = False
-        axis = 1
-        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type, box_normalized, axis)
-
-        self.inputs = {
-            'PriorBox': prior_box,
-            'TargetBox': target_box,
-        }
-        self.attrs = {
-            'code_type': 'decode_center_size',
-            'box_normalized': False,
-            'variance': prior_box_var.astype(np.float).flatten(),
-            'axis': axis
-        }
-        self.outputs = {'OutputBox': output_box}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py b/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py
deleted file mode 100644
index b0afc2a2e4ad7b72b341536babfc595c2b6c3455..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-from op_test import OpTest
-
-
-def box_decoder_and_assign(deltas, weights, boxes, box_score, box_clip):
-    boxes = boxes.astype(deltas.dtype, copy=False)
-    widths = boxes[:, 2] - boxes[:, 0] + 1.0
-    heights = boxes[:, 3] - boxes[:, 1] + 1.0
-    ctr_x = boxes[:, 0] + 0.5 * widths
-    ctr_y = boxes[:, 1] + 0.5 * heights
-    wx, wy, ww, wh = weights
-    dx = deltas[:, 0::4] * wx
-    dy = deltas[:, 1::4] * wy
-    dw = deltas[:, 2::4] * ww
-    dh = deltas[:, 3::4] * wh
-    # Prevent sending too large values into np.exp()
-    dw = np.minimum(dw, box_clip)
-    dh = np.minimum(dh, box_clip)
-    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
-    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
-    pred_w = np.exp(dw) * widths[:, np.newaxis]
-    pred_h = np.exp(dh) * heights[:, np.newaxis]
-    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
-    # x1
-    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
-    # y1
-    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
-    # x2 (note: "- 1" is correct; don't be fooled by the asymmetry)
-    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1
-    # y2 (note: "- 1" is correct; don't be fooled by the asymmetry)
-    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1
-
-    output_assign_box = []
-    for ino in range(len(pred_boxes)):
-        rank = np.argsort(-box_score[ino])
-        maxidx = rank[0]
-        if maxidx == 0:
-            maxidx = rank[1]
-        beg_pos = maxidx * 4
-        end_pos = maxidx * 4 + 4
-        output_assign_box.append(pred_boxes[ino, beg_pos:end_pos])
-    output_assign_box = np.array(output_assign_box)
-
-    return pred_boxes, output_assign_box
-
-
-class TestBoxDecoderAndAssignOpWithLoD(OpTest):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "box_decoder_and_assign"
-        lod = [[4, 8, 8]]
-        num_classes = 10
-        prior_box = np.random.random((20, 4)).astype('float32')
-        prior_box_var = np.array([0.1, 0.1, 0.2, 0.2], dtype=np.float32)
-        target_box = np.random.random((20, 4 * num_classes)).astype('float32')
-        box_score = np.random.random((20, num_classes)).astype('float32')
-        box_clip = 4.135
-        output_box, output_assign_box = box_decoder_and_assign(
-            target_box, prior_box_var, prior_box, box_score, box_clip)
-
-        self.inputs = {
-            'PriorBox': (prior_box, lod),
-            'PriorBoxVar': prior_box_var,
-            'TargetBox': (target_box, lod),
-            'BoxScore': (box_score, lod),
-        }
-        self.attrs = {'box_clip': box_clip}
-        self.outputs = {
-            'DecodeBox': output_box,
-            'OutputAssignBox': output_assign_box
-        }
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_boxps.py b/python/paddle/fluid/tests/unittests/test_boxps.py
deleted file mode 100644
index 0b07f965dd8b2ac32c32716bb34cd4a712a5be93..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_boxps.py
+++ /dev/null
@@ -1,104 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import numpy as np
-import os
-import paddle.fluid.core as core
-import unittest
-from paddle.fluid.layers.nn import _pull_box_sparse
-
-
-class TestBoxPSPreload(unittest.TestCase):
-    """  TestCases for BoxPS Preload """
-
-    def test_boxps_cpu(self):
-        self.run_boxps_preload(True)
-
-    def test_boxps_gpu(self):
-        self.run_boxps_preload(False)
-
-    def run_boxps_preload(self, is_cpu=True):
-        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
-        y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
-        emb_x, emb_y = _pull_box_sparse([x, y], size=2)
-        emb_xp = _pull_box_sparse(x, size=2)
-        layers.Print(emb_xp)
-        concat = layers.concat([emb_x, emb_y], axis=1)
-        fc = layers.fc(input=concat,
-                       name="fc",
-                       size=1,
-                       num_flatten_dims=1,
-                       bias_attr=False)
-        loss = layers.reduce_mean(fc)
-        layers.Print(loss)
-        place = fluid.CPUPlace() if is_cpu or not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-        batch_size = 2
-
-        def binary_print(slot, fout):
-            fout.write(str(len(slot)) + " ")
-            for e in slot:
-                fout.write(str(e) + " ")
-
-        batch1 = np.ones(
-            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
-        filelist = []
-        place_str = "cpu" if is_cpu else "gpu"
-        for i in range(2):
-            filelist.append("test_hdfs_" + place_str + "_" + str(i))
-        for f in filelist:
-            with open(f, "w") as fout:
-                for ins in batch1:
-                    for slot in ins:
-                        binary_print(slot, fout)
-                fout.write("\n")
-
-        def create_dataset():
-            dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
-            dataset.set_use_var([x, y])
-            dataset.set_batch_size(2)
-            dataset.set_thread(1)
-            dataset.set_filelist(filelist)
-            return dataset
-
-        datasets = []
-        datasets.append(create_dataset())
-        datasets.append(create_dataset())
-        optimizer.minimize(loss)
-        exe.run(fluid.default_startup_program())
-        datasets[0].load_into_memory()
-        datasets[0].begin_pass()
-        datasets[1].preload_into_memory()
-        exe.train_from_dataset(
-            program=fluid.default_main_program(),
-            dataset=datasets[0],
-            print_period=1)
-        datasets[0].end_pass()
-        datasets[1].wait_preload_done()
-        datasets[1].begin_pass()
-        exe.train_from_dataset(
-            program=fluid.default_main_program(),
-            dataset=datasets[1],
-            print_period=1)
-        datasets[1].end_pass()
-        for f in filelist:
-            os.remove(f)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
deleted file mode 100644
index c8dc5fbd237d17f2d4e45b06e5806fff5cbf58fe..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest, randomize_probability
-
-
-class TestBprLossOp1(OpTest):
-    """Test BprLoss with discrete one-hot labels.
-    """
-
-    def setUp(self):
-        self.op_type = "bpr_loss"
-        batch_size = 40
-        class_num = 5
-        X = randomize_probability(batch_size, class_num, dtype='float64')
-        label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
-        bpr_loss_result = []
-        for i in range(batch_size):
-            sum = 0.0
-            for j in range(class_num):
-                if j == label[i][0]:
-                    continue
-                sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label[i][0]])))
-            bpr_loss_result.append(-sum / (class_num - 1))
-        bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64")
-        self.inputs = {"X": X, "Label": label}
-        self.outputs = {"Y": bpr_loss}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_broadcast.py b/python/paddle/fluid/tests/unittests/test_broadcast.py
deleted file mode 100644
index 029e881d6f69ec0781c1d8ad8e66a9b6fd48cec1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_broadcast.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-
-from test_collective_base import TestDistBase
-
-
-class TestCBroadcastOp(TestDistBase):
-    def _setup_config(self):
-        pass
-
-    def test_broadcast(self):
-        self.check_with_place("collective_broadcast_op.py", "broadcast")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
deleted file mode 100644
index 671efd8c721550256c181059528bead43deb0718..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from paddle.fluid.framework import Parameter
-import numpy as np
-from simple_nets import simple_fc_net
-import random
-import unittest
-import os
-
-batch_size = 32
-
-feed_dict = {
-    'image': np.random.random([batch_size, 784]).astype('float32'),
-    'label': np.random.random_integers(
-        low=0, high=9, size=[batch_size, 1]).astype('int64')
-}
-
-
-class InplaceTestBase(unittest.TestCase):
-    def initParameter(self):
-        self.use_cuda = True
-        self.fuse_all_optimizer_ops = False
-
-    def setUp(self):
-        self.initParameter()
-        if self.use_cuda and fluid.core.is_compiled_with_cuda():
-            self.device_count = fluid.core.get_cuda_device_count()
-        else:
-            self.device_count = 4
-        assert batch_size % self.device_count == 0
-
-    def build_program_and_scope(self):
-        self.place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
-        startup_program = fluid.Program()
-        main_program = fluid.Program()
-        startup_program.random_seed = 1
-        main_program.random_seed = 1
-
-        scope = fluid.Scope()
-        with fluid.program_guard(main_program, startup_program):
-            with fluid.unique_name.guard():
-                loss = simple_fc_net()
-                adam = fluid.optimizer.Adam(learning_rate=1e-3)
-                adam.minimize(loss)
-
-                with fluid.scope_guard(scope):
-                    exe = fluid.Executor(
-                        fluid.CUDAPlace(0)
-                        if self.use_cuda else fluid.CPUPlace())
-                    exe.run(startup_program)
-
-        return main_program, scope, exe, loss
-
-    def is_invalid_test(self):
-        return self.use_cuda and not fluid.core.is_compiled_with_cuda()
-
-    def get_all_vars(self, program):
-        all_vars = program.global_block().vars
-        all_vars_name = []
-        for name, var in all_vars.items():
-            if 0 not in var.shape and not var.persistable:
-                all_vars_name.append(name)
-
-        return all_vars_name
-
-    def check_single_card_fetch_var(self):
-        if self.is_invalid_test():
-            return
-
-        prog1, scope1, exe, loss1 = self.build_program_and_scope()
-        scopes = []
-        compiled_programs = []
-        for memory_optimize in [False, True]:
-            for enable_inplace in [False, True]:
-                prog, scope, _, loss = self.build_program_and_scope()
-                scopes.append(scope)
-                build_strategy = fluid.BuildStrategy()
-                build_strategy.memory_optimize = memory_optimize
-                build_strategy.enable_inplace = enable_inplace
-                build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops
-                compiled_prog = fluid.CompiledProgram(prog).with_data_parallel(
-                    loss_name=loss.name,
-                    build_strategy=build_strategy,
-                    places=self.place)
-                compiled_programs.append(compiled_prog)
-
-        all_vars_name = self.get_all_vars(prog1)
-        repeated_var_names = all_vars_name * 2
-        random.shuffle(repeated_var_names)  # add some random 
-
-        for fetch_var in repeated_var_names:
-            for _ in range(4):
-                with fluid.scope_guard(scope1):
-                    fetch_val1, = exe.run(prog1,
-                                          feed=feed_dict,
-                                          fetch_list=[fetch_var])
-
-                for scope, compiled_prog in zip(scopes, compiled_programs):
-                    with fluid.scope_guard(scope):
-                        fetch_val2, = exe.run(compiled_prog,
-                                              feed=feed_dict,
-                                              fetch_list=[fetch_var])
-
-                        self.assertTrue(np.array_equal(fetch_val1, fetch_val2))
-
-    def check_multi_card_fetch_var(self):
-        if self.is_invalid_test():
-            return
-
-        prog1, scope1, exe, loss1 = self.build_program_and_scope()
-        scopes = []
-        compiled_programs = []
-
-        if self.use_cuda:
-            places = fluid.cuda_places()
-        else:
-            places = fluid.cpu_places(self.device_count)
-
-        for memory_optimize in [False, True]:
-            for enable_inplace in [False, True]:
-                prog, scope, _, loss = self.build_program_and_scope()
-                scopes.append(scope)
-                build_strategy = fluid.BuildStrategy()
-                build_strategy.memory_optimize = memory_optimize
-                build_strategy.enable_inplace = enable_inplace
-                build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops
-                compiled_program = fluid.CompiledProgram(
-                    prog).with_data_parallel(
-                        loss_name=loss.name,
-                        build_strategy=build_strategy,
-                        places=places)
-                compiled_programs.append(compiled_program)
-
-        repeated_var_names = self.get_all_vars(prog1) * 2
-        random.shuffle(repeated_var_names)  # add some random 
-
-        for fetch_var in repeated_var_names:
-            for _ in range(4):
-                fetch_vals = []
-                for scope, compiled_prog in zip(scopes, compiled_programs):
-                    with fluid.scope_guard(scope):
-                        fetch_val, = exe.run(compiled_prog,
-                                             feed=feed_dict,
-                                             fetch_list=[fetch_var])
-                        fetch_vals.append(fetch_val)
-
-                for item in fetch_vals:
-                    self.assertTrue(np.array_equal(fetch_vals[0], item))
-
-
-class CUDAInplaceTest(InplaceTestBase):
-    def initParameter(self):
-        self.use_cuda = True
-        self.fuse_all_optimizer_ops = False
-
-    def test_multi_card_fetch_var(self):
-        self.check_multi_card_fetch_var()
-
-    def test_single_card_fetch_var(self):
-        self.check_single_card_fetch_var()
-
-
-class CPUInplaceTest(InplaceTestBase):
-    def initParameter(self):
-        self.use_cuda = False
-        self.fuse_all_optimizer_ops = False
-
-    def test_multi_card_fetch_var(self):
-        self.check_multi_card_fetch_var()
-
-    def test_single_card_fetch_var(self):
-        self.check_single_card_fetch_var()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py
deleted file mode 100644
index fe0f67635f7f9a95e3ca3efe810782f00b9fb451..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from test_buffer_shared_memory_reuse_pass import InplaceTestBase
-import unittest
-
-
-class CUDAInplaceTestWithFuseOptimizationOps(InplaceTestBase):
-    def initParameter(self):
-        self.use_cuda = True
-        self.fuse_all_optimizer_ops = True
-
-    def test_multi_card_fetch_var(self):
-        self.check_multi_card_fetch_var()
-
-    def test_single_card_fetch_var(self):
-        self.check_single_card_fetch_var()
-
-
-class CPUAInplaceTestWithFuseOptimizationOps(InplaceTestBase):
-    def initParameter(self):
-        self.use_cuda = False
-        self.fuse_all_optimizer_ops = True
-
-    def test_multi_card_fetch_var(self):
-        self.check_multi_card_fetch_var()
-
-    # TODO(zcd): should check why this test failed.
-    @unittest.skip("should fix this later.")
-    def test_single_card_fetch_var(self):
-        self.check_single_card_fetch_var()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py b/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py
deleted file mode 100644
index 042f03e19ab18547ed993771831bf3aac9a1fc2e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-
-
-class TestCCommInitAllOp(unittest.TestCase):
-    def setUp(self):
-        self.place = fluid.CUDAPlace(0)
-        self.exe = fluid.Executor(self.place)
-
-    def test_default_attrs(self):
-        program = fluid.Program()
-        block = program.global_block()
-        block.append_op(type='c_comm_init_all', attrs={'ring_id': 0})
-        self.exe.run(program)
-
-    def test_init_with_same_ring_id(self):
-        program = fluid.Program()
-        block = program.global_block()
-        block.append_op(type='c_comm_init_all', attrs={'ring_id': 0})
-        with self.assertRaises(core.EnforceNotMet):
-            self.exe.run(program)
-
-    def test_specifying_devices(self):
-        program = fluid.Program()
-        block = program.global_block()
-        block.append_op(
-            type='c_comm_init_all', attrs={'devices': [0],
-                                           'ring_id': 1})
-        self.exe.run(program)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
deleted file mode 100644
index 4120a18b72f87c7e750a0fb68780292b58e3a7f4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid.backward import calc_gradient
-
-
-class TestCalcGradient(unittest.TestCase):
-    def test_calc_gradient(self):
-        x = layers.create_parameter(dtype="float32", shape=[5, 10])
-        y = layers.create_parameter(dtype="float32", shape=[10, 8])
-        mul_out = layers.mul(x=x, y=y)
-        mean_out = layers.mean(mul_out)
-        a = calc_gradient(mean_out, mul_out)
-        b = calc_gradient(mean_out, x)
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py
deleted file mode 100644
index 71a2ccb6da47588d84c263105560626435ac461a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import op_test
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-
-
-class TestCastOp1(op_test.OpTest):
-    def setUp(self):
-        ipt = np.random.random(size=[10, 10])
-        self.inputs = {'X': ipt.astype('float32')}
-        self.outputs = {'Out': ipt.astype('float64')}
-        self.attrs = {
-            'in_dtype': int(core.VarDesc.VarType.FP32),
-            'out_dtype': int(core.VarDesc.VarType.FP64)
-        }
-        self.op_type = 'cast'
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_grad(self):
-        self.check_grad(['X'], ['Out'])
-
-
-class TestCastOp2(op_test.OpTest):
-    def setUp(self):
-        ipt = np.random.random(size=[10, 10])
-        # numpy float16 is binded to fluid float16 via uint16
-        self.inputs = {'X': ipt.astype('float16').view(np.uint16)}
-        self.outputs = {'Out': ipt.astype('float32')}
-        self.attrs = {
-            'in_dtype': int(core.VarDesc.VarType.FP16),
-            'out_dtype': int(core.VarDesc.VarType.FP32)
-        }
-        self.op_type = 'cast'
-
-    def test_check_output(self):
-        self.check_output(atol=1e-3)
-
-
-class TestCastOp3(op_test.OpTest):
-    def setUp(self):
-        ipt = np.random.random(size=[10, 10])
-        self.inputs = {'X': ipt.astype('float32')}
-        self.outputs = {'Out': ipt.astype('float16')}
-        self.attrs = {
-            'in_dtype': int(core.VarDesc.VarType.FP32),
-            'out_dtype': int(core.VarDesc.VarType.FP16)
-        }
-        self.op_type = 'cast'
-
-    def test_check_output(self):
-        self.check_output(atol=1e-3)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_center_loss.py b/python/paddle/fluid/tests/unittests/test_center_loss.py
deleted file mode 100644
index 50dd6b5e940d25fa95b16d53858396fd6fa476f4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_center_loss.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-
-
-class TestCenterLossOp(OpTest):
-    def setUp(self):
-        self.op_type = "center_loss"
-        self.dtype = np.float32
-        self.init_dtype_type()
-        batch_size = 6
-        feet_dim = 10
-        cluster_num = 8
-        self.attrs = {}
-        self.attrs['cluster_num'] = cluster_num
-        self.attrs['lambda'] = 0.1
-        self.config()
-        self.attrs['need_update'] = self.need_update
-        labels = np.random.randint(cluster_num, size=batch_size, dtype='int64')
-        feat = np.random.random((batch_size, feet_dim)).astype(np.float32)
-        centers = np.random.random((cluster_num, feet_dim)).astype(np.float32)
-        var_sum = np.zeros((cluster_num, feet_dim), dtype=np.float32)
-        centers_select = centers[labels]
-        output = feat - centers_select
-        diff_square = np.square(output).reshape(batch_size, feet_dim)
-        loss = 0.5 * np.sum(diff_square, axis=1).reshape(batch_size, 1)
-        cout = []
-        for i in range(cluster_num):
-            cout.append(0)
-        for i in range(batch_size):
-            cout[labels[i]] += 1
-            var_sum[labels[i]] += output[i]
-        for i in range(cluster_num):
-            var_sum[i] /= (1 + cout[i])
-        var_sum *= 0.1
-        result = centers + var_sum
-        rate = np.array([0.1]).astype(np.float32)
-
-        self.inputs = {
-            'X': feat,
-            'Label': labels,
-            'Centers': centers,
-            'CenterUpdateRate': rate
-        }
-
-        if self.need_update == True:
-            self.outputs = {
-                'SampleCenterDiff': output,
-                'Loss': loss,
-                'CentersOut': result
-            }
-        else:
-            self.outputs = {
-                'SampleCenterDiff': output,
-                'Loss': loss,
-                'CentersOut': centers
-            }
-
-    def config(self):
-        self.need_update = True
-
-    def init_dtype_type(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Loss')
-
-
-class TestCenterLossOpNoUpdate(TestCenterLossOp):
-    def config(self):
-        self.need_update = False
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_check_import_scipy.py b/python/paddle/fluid/tests/unittests/test_check_import_scipy.py
deleted file mode 100644
index 55c26f0a1aa545e82e64f726967138b2fc3e9db4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_check_import_scipy.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#import paddle
-#from paddle.importScipy import funcImportScipy
-import six.moves.builtins as builtins
-from paddle.check_import_scipy import check_import_scipy
-import unittest
-
-
-def my_import(name, globals=None, locals=None, fromlist=(), level=0):
-    raise ImportError('DLL load failed,unittest: import scipy  failed')
-
-
-class importTest(unittest.TestCase):
-    def test_import(self):
-        testOsName = 'nt'
-        builtins.__import__ = my_import
-        self.assertRaises(ImportError, check_import_scipy, testOsName)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
deleted file mode 100644
index 2b7f92656db566c866a191d4aac49a35a2cdc528..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ /dev/null
@@ -1,233 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class Segment(object):
-    def __init__(self, chunk_type, start_idx, end_idx):
-        self.chunk_type = chunk_type
-        self.start_idx = start_idx
-        self.end_idx = end_idx
-
-    def __str__(self):
-        return '(Segment: %s, %s, %s)' % (self.chunk_type, self.start_idx,
-                                          self.end_idx)
-
-    __repr__ = __str__
-
-
-class TestChunkEvalOp(OpTest):
-    num_sequences = 5
-    batch_size = 50
-
-    def parse_scheme(self):
-        if self.scheme == 'IOB':
-            self.num_tag_types = 2
-        elif self.scheme == 'IOE':
-            self.num_tag_types = 2
-
-    def fill_with_chunks(self, data, chunks):
-        for chunk in chunks:
-            if self.scheme == 'IOB':
-                data[chunk.start_idx] = chunk.chunk_type * self.num_tag_types
-                data[chunk.start_idx + 1:
-                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
-                         self.num_tag_types - 1)
-                data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
-                    self.num_tag_types - 1
-                ) if chunk.start_idx < chunk.end_idx else data[chunk.start_idx]
-            elif self.scheme == 'IOE':
-                data[chunk.start_idx:
-                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types
-                data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
-                    self.num_tag_types - 1)
-
-    def rand_chunks(self, starts, num_chunks):
-        if num_chunks < 0:
-            num_chunks = np.random.randint(starts[-1])
-        chunks = []
-        # generate chunk beginnings
-        chunk_begins = sorted(
-            np.random.choice(
-                list(range(starts[-1])), num_chunks, replace=False))
-        seq_chunk_begins = []
-        begin_idx = 0
-        # divide chunks into sequences
-        for i in range(len(starts) - 1):
-            tmp_chunk_begins = []
-            while begin_idx < len(chunk_begins) and chunk_begins[
-                    begin_idx] < starts[i + 1]:
-                tmp_chunk_begins.append(chunk_begins[begin_idx])
-                begin_idx += 1
-            seq_chunk_begins.append(tmp_chunk_begins)
-        # generate chunk ends
-        chunk_ends = []
-        for i in range(len(seq_chunk_begins)):
-            for j in range(len(seq_chunk_begins[i])):
-                low = seq_chunk_begins[i][j]
-                high = seq_chunk_begins[i][j + 1] if j < len(seq_chunk_begins[
-                    i]) - 1 else starts[i + 1]
-                chunk_ends.append(np.random.randint(low, high))
-        # generate chunks
-        for chunk_pos in zip(chunk_begins, chunk_ends):
-            chunk_type = np.random.randint(self.num_chunk_types)
-            chunks.append(Segment(chunk_type, *chunk_pos))
-        return chunks
-
-    def gen_chunks(self, infer, label, starts):
-        chunks = self.rand_chunks(starts,
-                                  self.num_infer_chunks + self.num_label_chunks
-                                  - self.num_correct_chunks)
-        correct_chunks = np.random.choice(
-            list(range(len(chunks))), self.num_correct_chunks, replace=False)
-        infer_chunks = np.random.choice(
-            [x for x in range(len(chunks)) if x not in correct_chunks],
-            self.num_infer_chunks - self.num_correct_chunks,
-            replace=False)
-        infer_chunks = sorted(correct_chunks.tolist() + infer_chunks.tolist())
-        label_chunks = np.random.choice(
-            [x for x in range(len(chunks)) if x not in infer_chunks],
-            self.num_label_chunks - self.num_correct_chunks,
-            replace=False)
-        label_chunks = sorted(correct_chunks.tolist() + label_chunks.tolist())
-        self.fill_with_chunks(infer, [chunks[idx] for idx in infer_chunks])
-        self.fill_with_chunks(label, [chunks[idx] for idx in label_chunks])
-        # exclude types in excluded_chunk_types
-        if len(self.excluded_chunk_types) > 0:
-            for idx in correct_chunks:
-                if chunks[idx].chunk_type in self.excluded_chunk_types:
-                    self.num_correct_chunks -= 1
-            for idx in infer_chunks:
-                if chunks[idx].chunk_type in self.excluded_chunk_types:
-                    self.num_infer_chunks -= 1
-            for idx in label_chunks:
-                if chunks[idx].chunk_type in self.excluded_chunk_types:
-                    self.num_label_chunks -= 1
-        return self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks
-
-    def set_confs(self):
-        # Use the IOB scheme and labels with 2 chunk types
-        self.scheme = 'IOB'
-        self.num_chunk_types = 2
-        self.excluded_chunk_types = []
-        self.other_chunk_type = self.num_chunk_types
-        self.attrs = {
-            'num_chunk_types': self.num_chunk_types,
-            'chunk_scheme': self.scheme,
-            'excluded_chunk_types': self.excluded_chunk_types
-        }
-        self.parse_scheme()
-        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9
-
-    def set_data(self):
-        infer = np.zeros((self.batch_size, )).astype('int64')
-        infer.fill(self.num_chunk_types * self.num_tag_types)
-        label = np.copy(infer)
-        starts = np.random.choice(
-            list(range(1, self.batch_size)),
-            self.num_sequences - 1,
-            replace=False).tolist()
-        starts.extend([0, self.batch_size])
-        starts = sorted(starts)
-        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
-            infer, label, starts)
-        lod = []
-        for i in range(len(starts) - 1):
-            lod.append(starts[i + 1] - starts[i])
-        self.set_input(infer, label, lod)
-        precision = float(
-            self.num_correct_chunks
-        ) / self.num_infer_chunks if self.num_infer_chunks else 0
-        recall = float(self.num_correct_chunks
-                       ) / self.num_label_chunks if self.num_label_chunks else 0
-        f1 = float(2 * precision * recall) / (
-            precision + recall) if self.num_correct_chunks else 0
-        self.outputs = {
-            'Precision': np.asarray(
-                [precision], dtype='float32'),
-            'Recall': np.asarray(
-                [recall], dtype='float32'),
-            'F1-Score': np.asarray(
-                [f1], dtype='float32'),
-            'NumInferChunks': np.asarray(
-                [self.num_infer_chunks], dtype='int64'),
-            'NumLabelChunks': np.asarray(
-                [self.num_label_chunks], dtype='int64'),
-            'NumCorrectChunks': np.asarray(
-                [self.num_correct_chunks], dtype='int64')
-        }
-
-    def set_input(self, infer, label, lod):
-        self.inputs = {'Inference': (infer, [lod]), 'Label': (label, [lod])}
-
-    def setUp(self):
-        self.op_type = 'chunk_eval'
-        self.set_confs()
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestChunkEvalOpWithExclude(TestChunkEvalOp):
-    def set_confs(self):
-        # Use the IOE scheme and labels with 3 chunk types
-        self.scheme = 'IOE'
-        self.num_chunk_types = 3
-        self.excluded_chunk_types = [1]
-        self.other_chunk_type = self.num_chunk_types
-        self.attrs = {
-            'num_chunk_types': self.num_chunk_types,
-            'chunk_scheme': self.scheme,
-            'excluded_chunk_types': self.excluded_chunk_types
-        }
-        self.parse_scheme()
-        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 15, 18, 20
-
-
-class TestChunkEvalOpWithTensorInput(TestChunkEvalOp):
-    def set_input(self, infer, label, lod):
-        max_len = np.max(lod)
-        pad_infer = []
-        pad_label = []
-        start = 0
-        for i in range(len(lod)):
-            end = lod[i] + start
-            pad_infer.append(
-                np.pad(infer[start:end], (0, max_len - lod[i]),
-                       'constant',
-                       constant_values=(-1, )))
-            pad_label.append(
-                np.pad(label[start:end], (0, max_len - lod[i]),
-                       'constant',
-                       constant_values=(-1, )))
-            start = end
-
-        pad_infer = np.expand_dims(np.array(pad_infer, dtype='int64'), 2)
-        pad_label = np.expand_dims(np.array(pad_label, dtype='int64'), 2)
-        lod = np.array(lod, dtype='int64')
-        self.inputs = {
-            'Inference': pad_infer,
-            'Label': pad_label,
-            'SeqLength': lod
-        }
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
deleted file mode 100644
index 46433d78252219fe02c3c4b5ddfc525bda177f18..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-
-
-class TestClipByNormOp(OpTest):
-    def setUp(self):
-        self.max_relative_error = 0.006
-        self.initTestCase()
-        input = np.random.random(self.shape).astype("float32")
-        input[np.abs(input) < self.max_relative_error] = 0.5
-        self.op_type = "clip_by_norm"
-        self.inputs = {'X': input, }
-        self.attrs = {}
-        self.attrs['max_norm'] = self.max_norm
-        norm = np.sqrt(np.sum(np.square(input)))
-        if norm > self.max_norm:
-            output = self.max_norm * input / norm
-        else:
-            output = input
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def initTestCase(self):
-        self.shape = (100, )
-        self.max_norm = 1.0
-
-
-class TestCase1(TestClipByNormOp):
-    def initTestCase(self):
-        self.shape = (100, )
-        self.max_norm = 1e20
-
-
-class TestCase2(TestClipByNormOp):
-    def initTestCase(self):
-        self.shape = (16, 16)
-        self.max_norm = 0.1
-
-
-class TestCase3(TestClipByNormOp):
-    def initTestCase(self):
-        self.shape = (4, 8, 16)
-        self.max_norm = 1.0
-
-
-class TestClipByNormOpWithSelectedRows(OpTest):
-    def check_with_place(self, place):
-        self.config_test_case()
-        scope = core.Scope()
-
-        # set input
-        x_selected_rows = scope.var('X').get_selected_rows()
-        x_selected_rows.set_rows(self.grad_rows)
-        x_tensor = x_selected_rows.get_tensor()
-        x_np = np.random.random(self.grad_shape).astype("float32")
-        x_np[np.abs(x_np) < self.max_relative_error] = 0.5
-        x_tensor.set(x_np, place)
-
-        # set output
-        out_selected_rows = scope.var('Out').get_selected_rows()
-
-        # run clip_by_norm_op
-        clip_by_norm_op = fluid.op.Operator(
-            "clip_by_norm", max_norm=self.max_norm, X='X', Out='Out')
-        clip_by_norm_op.run(scope, place)
-
-        # check output
-        self.assertEqual(out_selected_rows.rows(), self.grad_clipped_rows)
-        out_tensor = out_selected_rows.get_tensor()
-        y_np = np.zeros(self.grad_clipped_shape)
-        y_np[0] = np.sum(x_np[0:2])
-        y_np[1] = x_np[2]
-        y_np[2] = x_np[3]
-        norm = np.sqrt(np.sum(np.square(y_np)))
-        if norm > self.max_norm:
-            output = self.max_norm * y_np / norm
-        else:
-            output = y_np
-        self.assertTrue(
-            np.allclose(
-                np.array(out_tensor), output, atol=1e-5, equal_nan=False))
-
-    def test_clip_by_norm_with_selected_ros(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            self.check_with_place(place)
-
-    def config_test_case(self):
-        self.max_norm = 1.0
-        self.max_relative_error = 0.006
-        self.grad_shape = (4, 1)
-        self.grad_clipped_shape = (3, 1)
-        self.grad_rows = [0, 0, 1, 2]
-        self.grad_clipped_rows = [0, 1, 2]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py
deleted file mode 100644
index 32677bdb4c897b4e20f8fb166b080ac6e6a221b7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestClipOp(OpTest):
-    def setUp(self):
-        self.max_relative_error = 0.006
-        self.initTestCase()
-        input = np.random.random(self.shape).astype("float32")
-        input[np.abs(input - self.min) < self.max_relative_error] = 0.5
-        input[np.abs(input - self.max) < self.max_relative_error] = 0.5
-        self.op_type = "clip"
-        self.inputs = {'X': input, }
-        self.attrs = {}
-        self.attrs['min'] = self.min
-        self.attrs['max'] = self.max
-        self.outputs = {
-            'Out': np.clip(self.inputs['X'], self.attrs['min'],
-                           self.attrs['max'])
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=self.max_relative_error)
-
-    def initTestCase(self):
-        self.shape = (4, 4)
-        self.max = 0.7
-        self.min = 0.1
-
-
-class TestCase1(TestClipOp):
-    def initTestCase(self):
-        self.shape = (8, 16, 8)
-        self.max = 0.7
-        self.min = 0.0
-
-
-class TestCase2(TestClipOp):
-    def initTestCase(self):
-        self.shape = (8, 16)
-        self.max = 1.0
-        self.min = 0.0
-
-
-class TestCase3(TestClipOp):
-    def initTestCase(self):
-        self.shape = (4, 8, 16)
-        self.max = 0.7
-        self.min = 0.2
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py b/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py
deleted file mode 100644
index c04abe658e555feec5cf8c63411c875e6d4e9324..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from paddle.fluid import core
-
-alignment = 256
-
-
-class TestAllocContinuousSpace(OpTest):
-    def setUp(self):
-        self.op_type = "coalesce_tensor"
-        self.dtype = np.float32
-        attrs = self.init_attr()
-        self.copy_data = attrs["copy_data"]
-        self.constant = attrs["constant"]
-        self.set_constant = attrs["set_constant"]
-        self.Inputs = self.init_input()
-        self.Outputs, self.FusedOutput = self.init_output(
-            self.Inputs, self.set_constant, self.constant)
-        self.inputs = {'Input': self.Inputs}
-        self.attrs = attrs
-        self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput}
-
-    def init_dtype(self):
-        self.dtype = np.float32
-
-    def init_input(self):
-        inputs = []
-        inputs.append(("x1", np.random.random([20, 3]).astype(self.dtype)))
-        inputs.append(("x2", np.random.random([20]).astype(self.dtype)))
-        inputs.append(("x3", np.random.random([1]).astype(self.dtype)))
-        inputs.append(("x4", np.random.random([200, 30]).astype(self.dtype)))
-        inputs.append(("x5", np.random.random([30]).astype(self.dtype)))
-        inputs.append(("x6", np.random.random([1]).astype(self.dtype)))
-        return inputs
-
-    def init_attr(self):
-        return {"copy_data": True, "set_constant": False, "constant": 0.0}
-
-    def init_output(self, input_list, set_constant, constant):
-        inputs = []
-        outputs = input_list
-
-        for input in input_list:
-            length = len(input[1].flatten())
-            aligned_len = (length + alignment) / alignment * alignment
-            out = np.zeros(int(aligned_len))
-            out[0:length] = input[1].flatten()
-            inputs.append(out)
-
-        coalesce_tensor_var = np.concatenate([input for input in inputs])
-        if set_constant:
-            coalesce_tensor_var = np.ones((len(coalesce_tensor_var))) * constant
-            outputs = [(out[0],
-                        np.ones(out[1].shape).astype(self.dtype) * constant)
-                       for out in outputs]
-        return outputs, coalesce_tensor_var
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            self.check_output_with_place(
-                place=core.CUDAPlace(0),
-                no_check_set=["FusedOutput"],
-                atol=1e-5)
-
-
-class TestAllocContinuousSpace2(TestAllocContinuousSpace):
-    def init_attr(self):
-        return {"copy_data": False, "set_constant": True, "constant": 0.5}
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            self.check_output_with_place(
-                place=core.CUDAPlace(0),
-                no_check_set=["FusedOutput"],
-                atol=1e-5)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
deleted file mode 100644
index 5b8600f004bb2d4b057bd93415ba29b989d858ce..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#    Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-import sys
-from op_test import OpTest
-
-
-class TestCollectFPNProposalstOp(OpTest):
-    def set_data(self):
-        self.init_test_case()
-        self.make_rois()
-        self.scores_input = [('y%d' % i,
-                              (self.scores[i].reshape(-1, 1), self.rois_lod[i]))
-                             for i in range(self.num_level)]
-        self.rois, self.lod = self.calc_rois_collect()
-        inputs_x = [('x%d' % i, (self.roi_inputs[i][:, 1:], self.rois_lod[i]))
-                    for i in range(self.num_level)]
-        self.inputs = {
-            'MultiLevelRois': inputs_x,
-            "MultiLevelScores": self.scores_input
-        }
-        self.attrs = {'post_nms_topN': self.post_nms_top_n, }
-        self.outputs = {'FpnRois': (self.rois, [self.lod])}
-
-    def init_test_case(self):
-        self.post_nms_top_n = 20
-        self.images_shape = [100, 100]
-
-    def resort_roi_by_batch_id(self, rois):
-        batch_id_list = rois[:, 0]
-        batch_size = int(batch_id_list.max())
-        sorted_rois = []
-        new_lod = []
-        for batch_id in range(batch_size + 1):
-            sub_ind = np.where(batch_id_list == batch_id)[0]
-            sub_rois = rois[sub_ind, 1:]
-            sorted_rois.append(sub_rois)
-            new_lod.append(len(sub_rois))
-        new_rois = np.concatenate(sorted_rois)
-        return new_rois, new_lod
-
-    def calc_rois_collect(self):
-        roi_inputs = np.concatenate(self.roi_inputs)
-        scores = np.concatenate(self.scores)
-        inds = np.argsort(-scores)[:self.post_nms_top_n]
-        rois = roi_inputs[inds, :]
-        new_rois, new_lod = self.resort_roi_by_batch_id(rois)
-        return new_rois, new_lod
-
-    def make_rois(self):
-        self.num_level = 4
-        self.roi_inputs = []
-        self.scores = []
-        self.rois_lod = [[[20, 10]], [[30, 20]], [[20, 30]], [[10, 10]]]
-        for lvl in range(self.num_level):
-            rois = []
-            scores_pb = []
-            lod = self.rois_lod[lvl][0]
-            bno = 0
-            for roi_num in lod:
-                for i in range(roi_num):
-                    xywh = np.random.rand(4)
-                    xy1 = xywh[0:2] * 20
-                    wh = xywh[2:4] * (self.images_shape - xy1)
-                    xy2 = xy1 + wh
-                    roi = [bno, xy1[0], xy1[1], xy2[0], xy2[1]]
-                    rois.append(roi)
-                bno += 1
-                scores_pb.extend(list(np.random.uniform(0.0, 1.0, roi_num)))
-            rois = np.array(rois).astype("float32")
-            self.roi_inputs.append(rois)
-            scores_pb = np.array(scores_pb).astype("float32")
-            self.scores.append(scores_pb)
-
-    def setUp(self):
-        self.op_type = "collect_fpn_proposals"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_base.py b/python/paddle/fluid/tests/unittests/test_collective_base.py
deleted file mode 100644
index e0789178b302866b622d4b6bfa390b7ec3a66f43..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_collective_base.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-import unittest
-import time
-import argparse
-import os
-import six
-import sys
-import subprocess
-import traceback
-import functools
-import pickle
-from contextlib import closing
-from six import string_types
-import paddle.fluid as fluid
-import paddle.fluid.unique_name as nameGen
-from paddle.fluid import core
-
-
-class TestCollectiveRunnerBase(object):
-    def get_model(self, train_prog, startup_prog):
-        raise NotImplementedError(
-            "get model should be implemented by child class.")
-
-    def wait_server_ready(self, endpoints):
-        assert not isinstance(endpoints, string_types)
-        while True:
-            all_ok = True
-            not_ready_endpoints = []
-            for ep in endpoints:
-                ip_port = ep.split(":")
-                with closing(
-                        socket.socket(socket.AF_INET,
-                                      socket.SOCK_STREAM)) as sock:
-                    sock.settimeout(2)
-                    result = sock.connect_ex((ip_port[0], int(ip_port[1])))
-                    if result != 0:
-                        all_ok = False
-                        not_ready_endpoints.append(ep)
-            if not all_ok:
-                sys.stderr.write("server not ready, wait 3 sec to retry...\n")
-                sys.stderr.write("not ready endpoints:" + str(
-                    not_ready_endpoints) + "\n")
-                sys.stderr.flush()
-                time.sleep(3)
-            else:
-                break
-
-#endpoints should be ["ip1:port1","ip2:port2"]
-
-    def initCommunicator(self, program, rank, nranks, wait_port,
-                         current_endpoint, endpoints):
-        other_endpoints = endpoints[:]
-        other_endpoints.remove(current_endpoint)
-        if rank == 0 and wait_port:
-            self.wait_server_ready(other_endpoints)
-        block = program.global_block()
-        nccl_id_var = block.create_var(
-            name=nameGen.generate('nccl_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-
-        block.append_op(
-            type='c_gen_nccl_id',
-            inputs={},
-            outputs={'Out': nccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints
-            })
-
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': nccl_id_var},
-            outputs={},
-            attrs={
-                'nranks': nranks,
-                'rank': rank,
-                'ring_id': self.global_ring_id
-            })
-
-    def run_trainer(self, args):
-        train_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        endpoints = args["endpoints"].split(",")
-        rank = args["trainerid"]
-        current_endpoint = args["currentendpoint"]
-        nranks = 2
-        self.initCommunicator(startup_prog, rank, nranks, True,
-                              current_endpoint, endpoints)
-        result = self.get_model(train_prog, startup_prog)
-        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        place = fluid.CUDAPlace(
-            device_id)  #if args.use_gpu else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-        np.random.seed(os.getpid())
-        indata = np.random.random((10, 1000))
-        out = exe.run(train_prog,
-                      feed={'tindata': indata},
-                      fetch_list=[result.name])
-        if six.PY2:
-            print(pickle.dumps(out))
-        else:
-            sys.stdout.buffer.write(pickle.dumps(out))
-
-
-def runtime_main(test_class, col_type, sub_type):
-    args = {}
-    model = test_class()
-    args["deviceid"] = os.getenv("FLAGS_selected_gpus")
-    args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID"))
-    args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM"))
-    args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS')
-    args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT")
-    args["col_type"] = col_type
-    model.run_trainer(args)
-
-
-import paddle.compat as cpt
-import socket
-from contextlib import closing
-
-
-class TestDistBase(unittest.TestCase):
-    def setUp(self):
-        self._port_set = set()
-        self._trainers = 2
-        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
-            self._find_free_port(), self._find_free_port())
-        self._python_interp = sys.executable
-
-    def _find_free_port(self):
-        def __free_port():
-            with closing(socket.socket(socket.AF_INET,
-                                       socket.SOCK_STREAM)) as s:
-                s.bind(('', 0))
-                return s.getsockname()[1]
-
-        while True:
-            port = __free_port()
-            if port not in self._port_set:
-                self._port_set.add(port)
-                return port
-
-    def _run_cluster(self, model_file, envs):
-        worker_endpoints = self._ps_endpoints.split(",")
-        w0_ep, w1_ep = worker_endpoints
-        #print("w0_ep:",w0_ep," w1_ep:",w1_ep)
-        env0 = {
-            "FLAGS_selected_gpus": "2",
-            "PADDLE_TRAINER_ID": "0",
-            "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
-            "PADDLE_CURRENT_ENDPOINT": w0_ep
-        }
-
-        env1 = {
-            "FLAGS_selected_gpus": "3",
-            "PADDLE_TRAINER_ID": "1",
-            "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
-            "PADDLE_CURRENT_ENDPOINT": w1_ep
-        }
-        #update environment
-        env0.update(envs)
-        env1.update(envs)
-        tr_cmd = "%s %s"
-        tr0_cmd = tr_cmd % (self._python_interp, model_file)
-        tr1_cmd = tr_cmd % (self._python_interp, model_file)
-        tr0_pipe = open("/tmp/tr0_err.log", "wb")
-        tr1_pipe = open("/tmp/tr1_err.log", "wb")
-        #print(tr0_cmd) 
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=env0)
-
-        tr1_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=env1)
-
-        tr0_out, tr0_err = tr0_proc.communicate()
-        tr1_out, tr1_err = tr1_proc.communicate()
-        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
-        # close trainer file
-        tr0_pipe.close()
-        tr1_pipe.close()
-        return pickle.loads(tr0_out), pickle.loads(
-            tr1_out), tr0_proc.pid, tr1_proc.pid
-
-    def check_with_place(self,
-                         model_file,
-                         col_type,
-                         check_error_log=False,
-                         need_envs={}):
-        required_envs = {
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_eager_delete_tensor_gb": "0.0",
-            "PATH": os.getenv("PATH"),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
-            "GLOG_v": "0",
-            "NCCL_P2P_DISABLE": "1"
-        }
-        required_envs.update(need_envs)
-        if check_error_log:
-            required_envs["GLOG_v"] = "3"
-            required_envs["GLOG_logtostderr"] = "1"
-        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file,
-                                                         required_envs)
-        np.random.seed(pid0)
-        input1 = np.random.random((10, 1000))
-        np.random.seed(pid1)
-        input2 = np.random.random((10, 1000))
-        if col_type == "allgather":
-            need_result = np.vstack((input1, input2))
-            self.assertTrue(np.allclose(tr0_out, need_result))
-            self.assertTrue(np.allclose(tr1_out, need_result))
-        elif col_type == "broadcast":
-            need_result = input2
-            self.assertTrue(np.allclose(tr0_out, need_result))
-            self.assertTrue(np.allclose(tr1_out, need_result))
-        elif col_type == "allreduce":
-            need_result = input1 + input2
-            self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
-            self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
-        elif col_type == "reduce_scatter":
-            tmp = input1 + input2
-            need_result1 = tmp[0:tmp.shape[0] // 2]
-            need_result2 = tmp[tmp.shape[0] // 2:]
-            self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result1, rtol=1e-05, atol=1e-05))
-            self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result2, rtol=1e-05, atol=1e-05))
-        elif col_type == "reduce_slicegather":
-            slicesize = input1.shape[0] // 2
-            tmp10 = input1[0:slicesize]
-            tmp11 = input2[0:slicesize]
-            need_result1 = np.concatenate((tmp10, tmp11), axis=1)
-            tmp20 = input1[slicesize:]
-            tmp21 = input2[slicesize:]
-            need_result2 = np.concatenate((tmp20, tmp21), axis=1)
-            self.assertTrue(np.allclose(tr0_out, need_result1))
-            self.assertTrue(np.allclose(tr1_out, need_result2))
-        else:
-            pass
diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py
deleted file mode 100644
index 437ad35538a5fa380f950fd3b71e334276214ec7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import op_test
-import unittest
-import numpy
-
-
-def create_test_class(op_type, typename, callback):
-    class Cls(op_test.OpTest):
-        def setUp(self):
-            a = numpy.random.random(size=(10, 7)).astype(typename)
-            b = numpy.random.random(size=(10, 7)).astype(typename)
-            c = callback(a, b)
-            self.inputs = {'X': a, 'Y': b}
-            self.outputs = {'Out': c}
-            self.op_type = op_type
-
-        def test_output(self):
-            self.check_output()
-
-    cls_name = "{0}_{1}".format(op_type, typename)
-    Cls.__name__ = cls_name
-    globals()[cls_name] = Cls
-
-
-for _type_name in {'float32', 'float64', 'int32', 'int64'}:
-    create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
-    create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b)
-    create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b)
-    create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b)
-    create_test_class('equal', _type_name, lambda _a, _b: _a == _b)
-    create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_compat.py b/python/paddle/fluid/tests/unittests/test_compat.py
deleted file mode 100644
index 1c2c46f99a82875b917a330d6ec76062222420de..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_compat.py
+++ /dev/null
@@ -1,505 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.compat as cpt
-import six
-
-
-class TestCompatible(unittest.TestCase):
-    def test_type(self):
-        if six.PY2:
-            self.assertEqual(cpt.int_type, int)
-            self.assertEqual(cpt.long_type, long)
-        else:
-            self.assertEqual(cpt.int_type, int)
-            self.assertEqual(cpt.long_type, int)
-
-    def test_to_text(self):
-        # Only support python2.x and python3.x now
-        self.assertTrue(six.PY2 | six.PY3)
-
-        if six.PY2:
-            # check None
-            self.assertIsNone(cpt.to_text(None))
-
-            # check all string related types
-            self.assertTrue(isinstance(cpt.to_text(str("")), unicode))
-            self.assertTrue(isinstance(cpt.to_text(str("123")), unicode))
-            self.assertTrue(isinstance(cpt.to_text(b""), unicode))
-            self.assertTrue(isinstance(cpt.to_text(b""), unicode))
-            self.assertTrue(isinstance(cpt.to_text(u""), unicode))
-            self.assertTrue(isinstance(cpt.to_text(u""), unicode))
-
-            self.assertEqual(u"", cpt.to_text(str("")))
-            self.assertEqual(u"123", cpt.to_text(str("123")))
-            self.assertEqual(u"", cpt.to_text(b""))
-            self.assertEqual(u"123", cpt.to_text(b"123"))
-            self.assertEqual(u"", cpt.to_text(u""))
-            self.assertEqual(u"123", cpt.to_text(u"123"))
-
-            # check list types, not inplace
-            l = [""]
-            l2 = cpt.to_text(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([u""], l2)
-            l = ["", "123"]
-            l2 = cpt.to_text(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([u"", u"123"], l2)
-            l = ["", b'123', u"321"]
-            l2 = cpt.to_text(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([u"", u"123", u"321"], l2)
-            for i in l2:
-                self.assertTrue(isinstance(i, unicode))
-
-            # check list types, inplace
-            l = [""]
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([u""], l2)
-            l = ["", "123"]
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([u"", u"123"], l2)
-            l = ["", b"123", u"321"]
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([u"", u"123", u"321"], l2)
-
-            # check set types, not inplace
-            l = set("")
-            l2 = cpt.to_text(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set(u""), l2)
-            l = set([b"", b"123"])
-            l2 = cpt.to_text(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([u"", u"123"]), l2)
-            l = set(["", b"123", u"321"])
-            l2 = cpt.to_text(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([u"", u"123", u"321"]), l2)
-            for i in l2:
-                self.assertTrue(isinstance(i, unicode))
-
-            # check set types, inplace
-            l = set("")
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set(u""), l2)
-            l = set([b"", b"123"])
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([u"", u"123"]), l2)
-            l = set(["", b"123", u"321"])
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([u"", u"123", u"321"]), l2)
-
-        elif six.PY3:
-            self.assertIsNone(cpt.to_text(None))
-
-            self.assertTrue(isinstance(cpt.to_text(str("")), str))
-            self.assertTrue(isinstance(cpt.to_text(str("123")), str))
-            self.assertTrue(isinstance(cpt.to_text(b""), str))
-            self.assertTrue(isinstance(cpt.to_text(b""), str))
-            self.assertTrue(isinstance(cpt.to_text(u""), str))
-            self.assertTrue(isinstance(cpt.to_text(u""), str))
-
-            self.assertEqual("", cpt.to_text(str("")))
-            self.assertEqual("123", cpt.to_text(str("123")))
-            self.assertEqual("", cpt.to_text(b""))
-            self.assertEqual("123", cpt.to_text(b"123"))
-            self.assertEqual("", cpt.to_text(u""))
-            self.assertEqual("123", cpt.to_text(u"123"))
-
-            # check list types, not inplace
-            l = [""]
-            l2 = cpt.to_text(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([""], l2)
-            l = ["", "123"]
-            l2 = cpt.to_text(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(["", "123"], l2)
-            l = ["", b"123", u"321"]
-            l2 = cpt.to_text(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertNotEqual(l, l2)
-            self.assertEqual(["", "123", "321"], l2)
-
-            # check list types, inplace
-            l = [""]
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([""], l2)
-            l = ["", b"123"]
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(["", "123"], l2)
-            l = ["", b"123", u"321"]
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(["", "123", "321"], l2)
-            for i in l2:
-                self.assertTrue(isinstance(i, str))
-
-            # check set types, not inplace
-            l = set("")
-            l2 = cpt.to_text(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set(""), l2)
-            l = set([b"", b"123"])
-            l2 = cpt.to_text(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertNotEqual(l, l2)
-            self.assertEqual(set(["", "123"]), l2)
-            l = set(["", b"123", u"321"])
-            l2 = cpt.to_text(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertNotEqual(l, l2)
-            self.assertEqual(set(["", "123", "321"]), l2)
-
-            # check set types, inplace
-            l = set("")
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set(""), l2)
-            l = set([b"", b"123"])
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set(["", "123"]), l2)
-            l = set(["", b"123", u"321"])
-            l2 = cpt.to_text(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set(["", "123", "321"]), l2)
-            for i in l2:
-                self.assertTrue(isinstance(i, str))
-
-    def test_to_bytes(self):
-        # Only support python2.x and python3.x now
-        self.assertTrue(six.PY2 | six.PY3)
-
-        if six.PY2:
-            # check None
-            self.assertIsNone(cpt.to_bytes(None))
-
-            # check all string related types
-            self.assertTrue(isinstance(cpt.to_bytes(str("")), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(str("123")), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(b""), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(b""), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(u""), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(u""), bytes))
-
-            self.assertEqual(b"", cpt.to_bytes(str("")))
-            self.assertEqual(b"123", cpt.to_bytes(str("123")))
-            self.assertEqual(b"", cpt.to_bytes(b""))
-            self.assertEqual(b"123", cpt.to_bytes(b"123"))
-            self.assertEqual(b"", cpt.to_bytes(u""))
-            self.assertEqual(b"123", cpt.to_bytes(u"123"))
-
-            # check list types, not inplace
-            l = [""]
-            l2 = cpt.to_bytes(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([b""], l2)
-            l = ["", "123"]
-            l2 = cpt.to_bytes(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([b"", b"123"], l2)
-            l = ["", b'123', u"321"]
-            l2 = cpt.to_bytes(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([b"", b"123", b"321"], l2)
-            for i in l2:
-                self.assertTrue(isinstance(i, bytes))
-
-            # check list types, inplace
-            l = [""]
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([b""], l2)
-            l = ["", "123"]
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([b"", b"123"], l2)
-            l = ["", b"123", u"321"]
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([b"", b"123", b"321"], l2)
-
-            # check set types, not inplace
-            l = set("")
-            l2 = cpt.to_bytes(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set(b""), l2)
-            l = set([b"", b"123"])
-            l2 = cpt.to_bytes(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([b"", b"123"]), l2)
-            l = set(["", b"123", u"321"])
-            l2 = cpt.to_bytes(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([b"", b"123", b"321"]), l2)
-            for i in l2:
-                self.assertTrue(isinstance(i, bytes))
-
-            # check set types, inplace
-            l = set("")
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set(b""), l2)
-            l = set([b"", b"123"])
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([b"", b"123"]), l2)
-            l = set(["", b"123", u"321"])
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([b"", b"123", b"321"]), l2)
-
-        elif six.PY3:
-            self.assertIsNone(cpt.to_bytes(None))
-
-            self.assertTrue(isinstance(cpt.to_bytes(str("")), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(str("123")), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(b""), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(b""), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(u""), bytes))
-            self.assertTrue(isinstance(cpt.to_bytes(u""), bytes))
-
-            self.assertEqual(b"", cpt.to_bytes(str("")))
-            self.assertEqual(b"123", cpt.to_bytes(str("123")))
-            self.assertEqual(b"", cpt.to_bytes(b""))
-            self.assertEqual(b"123", cpt.to_bytes(b"123"))
-            self.assertEqual(b"", cpt.to_bytes(u""))
-            self.assertEqual(b"123", cpt.to_bytes(u"123"))
-
-            # check list types, not inplace
-            l = [""]
-            l2 = cpt.to_bytes(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertNotEqual(l, l2)
-            self.assertEqual([b""], l2)
-            l = ["", "123"]
-            l2 = cpt.to_bytes(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertNotEqual(l, l2)
-            self.assertEqual([b"", b"123"], l2)
-            l = ["", b"123", u"321"]
-            l2 = cpt.to_bytes(l)
-            self.assertTrue(isinstance(l2, list))
-            self.assertFalse(l is l2)
-            self.assertNotEqual(l, l2)
-            self.assertEqual([b"", b"123", b"321"], l2)
-
-            # check list types, inplace
-            l = [""]
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([b""], l2)
-            l = ["", b"123"]
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([b"", b"123"], l2)
-            l = ["", b"123", u"321"]
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, list))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual([b"", b"123", b"321"], l2)
-            for i in l2:
-                self.assertTrue(isinstance(i, bytes))
-
-            # check set types, not inplace
-            l = set([""])
-            l2 = cpt.to_bytes(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertNotEqual(l, l2)
-            self.assertEqual(set([b""]), l2)
-            l = set([u"", u"123"])
-            l2 = cpt.to_bytes(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertNotEqual(l, l2)
-            self.assertEqual(set([b"", b"123"]), l2)
-            l = set(["", b"123", u"321"])
-            l2 = cpt.to_bytes(l, inplace=False)
-            self.assertTrue(isinstance(l2, set))
-            self.assertFalse(l is l2)
-            self.assertNotEqual(l, l2)
-            self.assertEqual(set([b"", b"123", b"321"]), l2)
-
-            # check set types, inplace
-            l = set("")
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set(b""), l2)
-            l = set([u"", u"123"])
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([b"", b"123"]), l2)
-            l = set(["", b"123", u"321"])
-            l2 = cpt.to_bytes(l, inplace=True)
-            self.assertTrue(isinstance(l2, set))
-            self.assertTrue(l is l2)
-            self.assertEqual(l, l2)
-            self.assertEqual(set([b"", b"123", b"321"]), l2)
-            for i in l2:
-                self.assertTrue(isinstance(i, bytes))
-
-    def test_round(self):
-        self.assertEqual(3.0, cpt.round(3.4))
-        self.assertEqual(4.0, cpt.round(3.5))
-        self.assertEqual(0.0, cpt.round(0.1))
-        self.assertEqual(0.0, cpt.round(0.0))
-        self.assertEqual(-0.0, cpt.round(-0.0))
-        self.assertEqual(-0.0, cpt.round(-0.1))
-        self.assertEqual(-3.0, cpt.round(-3.4))
-        self.assertEqual(-4.0, cpt.round(-3.5))
-        self.assertEqual(5.0, cpt.round(5))
-        self.assertRaises(TypeError, cpt.round, None)
-
-    def test_floor_division(self):
-        self.assertEqual(0.0, cpt.floor_division(3, 4))
-        self.assertEqual(1.0, cpt.floor_division(4, 3))
-        self.assertEqual(2.0, cpt.floor_division(6, 3))
-        self.assertEqual(-2.0, cpt.floor_division(-4, 3))
-        self.assertEqual(-2.0, cpt.floor_division(-6, 3))
-        self.assertRaises(ZeroDivisionError, cpt.floor_division, 3, 0)
-        self.assertRaises(TypeError, cpt.floor_division, None, None)
-
-    def test_get_exception_message(self):
-        exception_message = "test_message"
-        self.assertRaises(AssertionError, cpt.get_exception_message, None)
-        if six.PY2:
-            self.assertRaises(AttributeError, cpt.get_exception_message,
-                              exception_message)
-            try:
-                raise RuntimeError(exception_message)
-            except Exception as e:
-                self.assertEqual(exception_message,
-                                 cpt.get_exception_message(e))
-                self.assertIsNotNone(e)
-
-            try:
-                raise Exception(exception_message)
-            except Exception as e:
-                self.assertEqual(exception_message,
-                                 cpt.get_exception_message(e))
-                self.assertIsNotNone(e)
-
-        if six.PY3:
-            try:
-                raise RuntimeError(exception_message)
-            except Exception as e:
-                self.assertEqual(exception_message,
-                                 cpt.get_exception_message(e))
-                self.assertIsNotNone(e)
-
-            try:
-                raise Exception(exception_message)
-            except Exception as e:
-                self.assertEqual(exception_message,
-                                 cpt.get_exception_message(e))
-                self.assertIsNotNone(e)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
deleted file mode 100644
index b5d1115723e350f56e0d3e04d191886e43a15667..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestConcatOp(OpTest):
-    def setUp(self):
-        self.op_type = "concat"
-        self.init_test_data()
-        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
-        self.attrs = {'axis': self.axis}
-        if self.axis < 0:
-            self.actual_axis = self.axis + len(self.x0.shape)
-            self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0
-        else:
-            self.actual_axis = self.axis
-
-        self.outputs = {
-            'Out': np.concatenate(
-                (self.x0, self.x1, self.x2), axis=self.actual_axis)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['x0'], 'Out')
-        self.check_grad(['x1'], 'Out')
-        self.check_grad(['x2'], 'Out')
-
-    def init_test_data(self):
-        self.x0 = np.random.random((2, 1, 4, 5)).astype('float32')
-        self.x1 = np.random.random((2, 2, 4, 5)).astype('float32')
-        self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
-        self.axis = 1
-
-
-class TestConcatOp2(TestConcatOp):
-    def init_test_data(self):
-        self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
-        self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
-        self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
-        self.axis = 1
-
-
-class TestConcatOp3(TestConcatOp):
-    def init_test_data(self):
-        self.x0 = np.random.random((1, 256, 170, 256)).astype('float32')
-        self.x1 = np.random.random((1, 128, 170, 256)).astype('float32')
-        self.x2 = np.random.random((1, 128, 170, 256)).astype('float32')
-        self.axis = 1
-
-    def test_check_grad(self):
-        pass
-
-
-class TestConcatOp4(TestConcatOp):
-    def init_test_data(self):
-        self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
-        self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
-        self.x2 = np.random.random((0, 3, 4, 5)).astype('float32')
-        self.axis = 0
-
-    def test_check_grad(self):
-        pass
-
-
-class TestConcatOp5(TestConcatOp):
-    def init_test_data(self):
-        self.x0 = np.random.random((2, 1, 4, 5)).astype('float32')
-        self.x1 = np.random.random((2, 2, 4, 5)).astype('float32')
-        self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
-        self.axis = -3
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py
deleted file mode 100644
index 5b2b71d050c42b4fea84bab89824d3f5c164b36e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-from paddle.fluid.framework import default_startup_program, default_main_program
-from paddle.fluid.executor import Executor
-from paddle.fluid.backward import append_backward
-from paddle.fluid.layers.control_flow import ConditionalBlock
-import numpy
-
-
-class ConditionalBlockTest(unittest.TestCase):
-    def test_forward(self):
-        data = layers.data(name='X', shape=[1], dtype='float32')
-        data.stop_gradient = False
-        cond = ConditionalBlock(inputs=[data])
-        out = layers.create_tensor(dtype='float32')
-        with cond.block():
-            hidden = layers.fc(input=data, size=10)
-            layers.assign(hidden, out)
-
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-        exe.run(default_startup_program())
-
-        x = numpy.random.random(size=(10, 1)).astype('float32')
-
-        outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
-        print(outs)
-        loss = layers.mean(out)
-        append_backward(loss=loss)
-        outs = exe.run(
-            feed={'X': x},
-            fetch_list=[
-                default_main_program().block(0).var(data.name + "@GRAD")
-            ])[0]
-        print(outs)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_const_value.py b/python/paddle/fluid/tests/unittests/test_const_value.py
deleted file mode 100644
index 0b2431d7726e845da33f6bcf9c74058788dd9654..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_const_value.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.framework as framework
-
-
-class ConstantTest(unittest.TestCase):
-    def test_const_value(self):
-        self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
-        self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
-        self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
-        self.assertEqual(framework.ZERO_VAR_SUFFIX, "@ZERO")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
deleted file mode 100644
index 3a302f2c41579d5e3bc6ac0a58b8f9ca1c7fb861..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ /dev/null
@@ -1,190 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-
-from test_conv2d_op import conv2d_forward_naive
-
-
-class TestConv2dFusionOp(OpTest):
-    def setUp(self):
-        self.op_type = "conv2d_fusion"
-        self.exhaustive_search = False
-        self.data_format = "AnyLayout"
-        self.dtype = np.float32
-        self.activation = 'relu'
-        self.add_bias = True
-        self.add_residual_data = True
-        self.channels = None
-        self.outputs = None
-
-        self.init_group()
-        self.init_dilation()
-        self.init_test_case()
-        self.init_bias_residual()
-        self.init_activation()
-        self.set_search_method()
-
-        conv2d_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilation': self.dilations
-        }
-
-        input = np.random.random(self.input_size).astype(self.dtype)
-        filter = np.random.random(self.filter_size).astype(self.dtype)
-
-        self.output, _, _, _, _ = conv2d_forward_naive(
-            input, filter, self.groups, conv2d_param)
-        self.output = self.output.astype(self.dtype)
-
-        self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
-        }
-
-        if self.add_residual_data:
-            residual_data = np.random.random(self.output.shape).astype(
-                self.dtype)
-            self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
-                residual_data)
-            self.output += residual_data
-
-        if self.add_bias:
-            bias = np.random.random(self.filter_size[0]).astype(self.dtype)
-            self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
-            self.output = self.output + bias.reshape((1, bias.size, 1, 1))
-
-        assert self.activation in ['relu', 'identity']
-        if self.activation == 'relu':
-            self.output = np.maximum(self.output, 0)
-
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'groups': self.groups,
-            'dilations': self.dilations,
-            'data_format': self.data_format,
-            'exhaustive_search': self.exhaustive_search,
-            'activation': self.activation,
-            'split_channels': self.channels
-        }
-        self.outputs = {'Output': self.output}
-
-        self.set_outputs()
-
-    def has_cuda(self):
-        return core.is_compiled_with_cuda()
-
-    def test_check_output(self):
-        if self.has_cuda():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            pass
-
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-    def init_dilation(self):
-        self.dilations = [1, 1]
-
-    def init_group(self):
-        self.groups = 1
-
-    def init_bias_residual(self):
-        self.add_bias = True
-        self.add_residual_data = True
-
-    def init_activation(self):
-        self.activation = 'relu'
-
-    def set_search_method(self):
-        self.exhaustive_search = False
-
-    def set_outputs(self):
-        pass
-
-
-class TestWithoutResidual(TestConv2dFusionOp):
-    def init_bias_residual(self):
-        self.add_residual_data = False
-
-
-class TestIdentityActivation(TestConv2dFusionOp):
-    def init_activation(self):
-        self.activation = 'identity'
-
-
-class TestIdentityActivation(TestConv2dFusionOp):
-    def init_activation(self):
-        self.activation = 'identity'
-        self.add_residual_data = False
-
-
-class TestWithGroup(TestConv2dFusionOp):
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWithDilation(TestConv2dFusionOp):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 10, 10]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-    def init_dilation(self):
-        self.dilations = [2, 2]
-
-    def init_group(self):
-        self.groups = 3
-
-
-class TestCUDNNExhaustiveSearch(TestConv2dFusionOp):
-    def set_search_method(self):
-        self.exhaustive_search = True
-
-
-class TestMultipleOutputs(TestConv2dFusionOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [1, 32, 17, 17]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [126, f_c, 3, 3]
-        self.channels = [84, 42]
-
-    def set_outputs(self):
-        out1 = self.output[:, 0:84, :, :]
-        out2 = self.output[:, 84:126, :, :]
-        self.outputs['Outputs'] = [('out1', out1), ('out2', out2)]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
deleted file mode 100644
index 725953b67df3c48f79e292fef9a465d1781fc692..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ /dev/null
@@ -1,506 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-def conv2d_forward_naive(input, filter, group, conv_param):
-    in_n, in_c, in_h, in_w = input.shape
-    out_c, f_c, f_h, f_w = filter.shape
-    assert f_c * group == in_c
-    assert np.mod(out_c, group) == 0
-    sub_out_c = out_c // group
-
-    stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
-        'dilation']
-    out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0]
-    out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1]
-    out = np.zeros((in_n, out_c, out_h, out_w))
-
-    d_bolck_h = (dilation[0] * (f_h - 1) + 1)
-    d_bolck_w = (dilation[1] * (f_w - 1) + 1)
-
-    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
-                       mode='constant',
-                       constant_values=0)
-
-    filter_dilation = np.zeros((out_c, f_c, d_bolck_h, d_bolck_w))
-    filter_dilation[:, :, 0:d_bolck_h:dilation[0], 0:d_bolck_w:dilation[
-        1]] = filter
-
-    for i in range(out_h):
-        for j in range(out_w):
-            for g in range(group):
-                input_pad_masked = \
-                    input_pad[:, g * f_c:(g + 1) * f_c,
-                    i * stride[0]:i * stride[0] + d_bolck_h,
-                    j * stride[1]:j * stride[1] + d_bolck_w]
-
-                f_sub = filter_dilation[g * sub_out_c:(g + 1) *
-                                        sub_out_c, :, :, :]
-                for k in range(sub_out_c):
-                    out[:, g * sub_out_c + k, i, j] = \
-                        np.sum(input_pad_masked * f_sub[k, :, :, :],
-                               axis=(1, 2, 3))
-
-    return out, in_n, out_h, out_w, out_c
-
-
-class TestConv2dOp(OpTest):
-    def setUp(self):
-        self.op_type = "conv2d"
-        self.use_cudnn = False
-        self.exhaustive_search = False
-        self.use_cuda = False
-        self.use_mkldnn = False
-        self.fuse_relu_before_depthwise_conv = False
-        self.data_format = "AnyLayout"
-        self.dtype = np.float32
-        self.init_kernel_type()
-        self.init_group()
-        self.init_dilation()
-        self.init_test_case()
-
-        conv2d_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilation': self.dilations
-        }
-
-        input = np.random.random(self.input_size).astype(self.dtype)
-        if not self.has_cuda():
-            self.fuse_relu_before_depthwise_conv = False
-        if self.fuse_relu_before_depthwise_conv:
-            input = input - 0.5
-            input -= (input < 0) * 0.1
-            input += (input >= 0) * 0.1
-            input2 = np.maximum(input, 0.0)
-        else:
-            input2 = input
-        filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
-        output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups,
-                                                  conv2d_param)
-        output = output.astype(self.dtype)
-
-        self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
-        }
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'groups': self.groups,
-            'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn,
-            'use_mkldnn': self.use_mkldnn,
-            'data_format': self.data_format,
-            'fuse_relu_before_depthwise_conv':
-            self.fuse_relu_before_depthwise_conv,
-            'exhaustive_search': self.exhaustive_search
-        }
-        self.outputs = {'Output': output}
-
-    def has_cuda(self):
-        return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                 self.use_cuda)
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
-        self.check_output_with_place(place, atol=1e-5)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, {'Input', 'Filter'}, 'Output', max_relative_error=0.02)
-
-    def test_check_grad_no_filter(self):
-        if self.dtype == np.float16:
-            return
-        place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, ['Input'],
-            'Output',
-            max_relative_error=0.02,
-            no_grad_set=set(['Filter']))
-
-    def test_check_grad_no_input(self):
-        if self.dtype == np.float16:
-            return
-        place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, ['Filter'],
-            'Output',
-            max_relative_error=0.02,
-            no_grad_set=set(['Input']))
-
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-    def init_dilation(self):
-        self.dilations = [1, 1]
-
-    def init_group(self):
-        self.groups = 1
-
-    def init_kernel_type(self):
-        pass
-
-
-class TestWithPad(TestConv2dOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-
-class TestWithStride(TestConv2dOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 6, 6]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-
-class TestWithGroup(TestConv2dOp):
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWith1x1(TestConv2dOp):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 1, 1]
-
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWithDepthWise3x3(TestConv2dOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [3, 4, 10, 10]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [8, f_c, 3, 3]
-
-    def init_dilation(self):
-        self.dilations = [2, 2]
-
-    def init_group(self):
-        self.groups = 4
-
-
-class TestWithDepthWise5x5(TestConv2dOp):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 4, 10, 10]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [8, f_c, 5, 5]
-
-    def init_group(self):
-        self.groups = 4
-
-
-class TestWithDepthWise7x7(TestConv2dOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 8, 10, 10]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [16, f_c, 7, 7]
-
-    def init_group(self):
-        self.groups = 8
-
-
-class TestWithDilation(TestConv2dOp):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 10, 10]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-    def init_dilation(self):
-        self.dilations = [2, 2]
-
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWithInput1x1Filter1x1(TestConv2dOp):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 1, 1]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 1, 1]
-
-    def init_group(self):
-        self.groups = 3
-
-
-#----------------Conv2dCUDNN----------------
-
-
-def create_test_cudnn_class(parent):
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "core is not compiled with CUDA")
-    class TestCUDNNCase(parent):
-        def init_kernel_type(self):
-            self.use_cudnn = True
-
-    cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
-    TestCUDNNCase.__name__ = cls_name
-    globals()[cls_name] = TestCUDNNCase
-
-
-create_test_cudnn_class(TestConv2dOp)
-create_test_cudnn_class(TestWithPad)
-create_test_cudnn_class(TestWithStride)
-create_test_cudnn_class(TestWithGroup)
-create_test_cudnn_class(TestWith1x1)
-create_test_cudnn_class(TestWithInput1x1Filter1x1)
-
-#----------------Conv2dCUDNN----------------
-
-
-def create_test_cudnn_fp16_class(parent, grad_check=True):
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "core is not compiled with CUDA")
-    class TestConv2DCUDNNFp16(parent):
-        def init_kernel_type(self):
-            self.use_cudnn = True
-            self.dtype = np.float16
-
-        def test_check_output(self):
-            if core.is_compiled_with_cuda():
-                place = core.CUDAPlace(0)
-                if core.is_float16_supported(place):
-                    self.check_output_with_place(place, atol=2e-2)
-
-        def test_check_grad_no_filter(self):
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place) and grad_check:
-                self.check_grad_with_place(
-                    place, ['Input'],
-                    'Output',
-                    max_relative_error=0.02,
-                    no_grad_set=set(['Filter']))
-
-        def test_check_grad_no_input(self):
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place) and grad_check:
-                self.check_grad_with_place(
-                    place, ['Filter'],
-                    'Output',
-                    max_relative_error=0.02,
-                    no_grad_set=set(['Input']))
-
-    cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16")
-    TestConv2DCUDNNFp16.__name__ = cls_name
-    globals()[cls_name] = TestConv2DCUDNNFp16
-
-
-create_test_cudnn_fp16_class(TestConv2dOp, grad_check=False)
-create_test_cudnn_fp16_class(TestWithPad, grad_check=False)
-create_test_cudnn_fp16_class(TestWithStride, grad_check=False)
-create_test_cudnn_fp16_class(TestWithGroup, grad_check=False)
-create_test_cudnn_fp16_class(TestWith1x1, grad_check=False)
-create_test_cudnn_fp16_class(TestWithInput1x1Filter1x1, grad_check=False)
-
-# -------TestDepthwiseConv
-
-
-class TestDepthwiseConv(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [3, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConv2(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [3, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConv3(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvWithDilation(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        self.dilations = [2, 2]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvWithDilation2(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        self.dilations = [2, 2]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvandFuse(TestConv2dOp):
-    def init_test_case(self):
-        self.fuse_relu_before_depthwise_conv = True
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [3, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConv2andFuse(TestConv2dOp):
-    def init_test_case(self):
-        self.fuse_relu_before_depthwise_conv = True
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [3, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConv3andFuse(TestConv2dOp):
-    def init_test_case(self):
-        self.fuse_relu_before_depthwise_conv = True
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvWithDilationandFuse(TestConv2dOp):
-    def init_test_case(self):
-        self.fuse_relu_before_depthwise_conv = True
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        self.dilations = [2, 2]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvWithDilation2andFuse(TestConv2dOp):
-    def init_test_case(self):
-        self.fuse_relu_before_depthwise_conv = True
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        self.dilations = [2, 2]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestCUDNNExhaustiveSearch(TestConv2dOp):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.exhaustive_search = True
-
-
-# Please Don't remove the following code.
-# Currently, CI use cudnn V5.0 which not support dilation conv.
-# class TestCUDNNWithDilation(TestWithDilation):
-#     def init_op_type(self):
-#         self.op_type = "conv_cudnn"
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
deleted file mode 100644
index 3b820f6ad716e5717e45d0c6341fb89010406d59..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ /dev/null
@@ -1,316 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-def conv2dtranspose_forward_naive(input_, filter_, attrs):
-    in_n, in_c, in_h, in_w = input_.shape
-    f_c, f_out_c, f_h, f_w = filter_.shape
-    groups = attrs['groups']
-    assert in_c == f_c
-    out_c = f_out_c * groups
-    sub_in_c = in_c // groups
-
-    stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
-        'dilations']
-    d_bolck_h = dilations[0] * (f_h - 1) + 1
-    d_bolck_w = dilations[1] * (f_w - 1) + 1
-    out_h = (in_h - 1) * stride[0] + d_bolck_h
-    out_w = (in_w - 1) * stride[1] + d_bolck_w
-    if 'output_size' in attrs:
-        output_size = attrs['output_size']
-        out_h = output_size[0] + 2 * pad[0]
-        out_w = output_size[1] + 2 * pad[1]
-
-    out = np.zeros((in_n, out_c, out_h, out_w))
-
-    for n in range(in_n):
-        for i in range(in_h):
-            for j in range(in_w):
-                for g in range(groups):
-                    input_masked = input_[n, g * sub_in_c:(g + 1) * sub_in_c, i,
-                                          j]  # (c)
-                    input_masked = np.reshape(input_masked, (sub_in_c, 1, 1))
-                    input_masked = np.tile(input_masked, (1, f_h, f_w))
-
-                    for k in range(f_out_c):
-                        tmp_out = np.sum(
-                            input_masked *
-                            filter_[g * sub_in_c:(g + 1) * sub_in_c, k, :, :],
-                            axis=0)
-                        i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
-                        j1, j2 = j * stride[0], j * stride[0] + d_bolck_h
-                        out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2:
-                            dilations[1]] += tmp_out
-
-    out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
-    return out
-
-
-class TestConv2dTransposeOp(OpTest):
-    def setUp(self):
-        # init as conv transpose
-        self.is_test = False
-        self.use_cudnn = False
-        self.use_mkldnn = False
-        self.output_size = None
-        self.data_format = "AnyLayout"
-        self.init_op_type()
-        self.init_test_case()
-
-        input_ = np.random.random(self.input_size).astype("float32")
-        filter_ = np.random.random(self.filter_size).astype("float32")
-
-        self.inputs = {'Input': input_, 'Filter': filter_}
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'groups': self.groups,
-            'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn,
-            'is_test': self.is_test,
-            'use_mkldnn': self.use_mkldnn,
-            'data_format': self.data_format
-        }
-        if self.output_size is not None:
-            self.attrs['output_size'] = self.output_size
-
-        output = conv2dtranspose_forward_naive(input_, filter_,
-                                               self.attrs).astype('float32')
-
-        self.outputs = {'Output': output}
-
-    def test_check_output(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            self.check_output()
-
-    def test_check_grad_no_input(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Filter'],
-                'Output',
-                max_relative_error=0.02,
-                no_grad_set=set(['Input']))
-        else:
-            self.check_grad(
-                ['Filter'],
-                'Output',
-                max_relative_error=0.02,
-                no_grad_set=set(['Input']))
-
-    def test_check_grad_no_filter(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Input'],
-                'Output',
-                max_relative_error=0.02,
-                no_grad_set=set(['Filter']))
-        else:
-            self.check_grad(
-                ['Input'],
-                'Output',
-                max_relative_error=0.02,
-                no_grad_set=set(['Filter']))
-
-    def test_check_grad(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place,
-                set(['Input', 'Filter']),
-                'Output',
-                max_relative_error=0.02)
-        else:
-            self.check_grad(
-                set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
-
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-
-    def init_op_type(self):
-        self.op_type = "conv2d_transpose"
-
-
-class TestWithPad(TestConv2dTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-
-
-class TestWithGroups(TestConv2dTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.groups = 2
-        self.input_size = [2, 4, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 3, 3, 3]
-
-
-class TestWithStride(TestConv2dTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.dilations = [1, 1]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-
-
-class TestWithDilation(TestConv2dTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.groups = 1
-        self.dilations = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-
-
-class TestWithEvenUpsample(TestConv2dTransposeOp):
-    def init_test_case(self):
-        self.pad = [2, 2]
-        self.stride = [2, 2]
-        self.groups = 1
-        self.dilations = [1, 1]
-        self.output_size = [14, 14]
-        self.input_size = [2, 3, 7, 7]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 5, 5]
-
-
-# ------------ test_cudnn ------------
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNN(TestConv2dTransposeOp):
-    def init_op_type(self):
-        self.use_cudnn = True
-        self.op_type = "conv2d_transpose"
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNNWithPad(TestWithPad):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.groups = 1
-        self.dilations = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-
-    def init_op_type(self):
-        self.use_cudnn = True
-        self.op_type = "conv2d_transpose"
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNNWithStride(TestWithStride):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.groups = 1
-        self.dilations = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-
-    def init_op_type(self):
-        self.use_cudnn = True
-        self.op_type = "conv2d_transpose"
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNNWithGroups(TestWithGroups):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.groups = 2
-        self.input_size = [2, 4, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 3, 3, 3]
-
-    def init_op_type(self):
-        self.use_cudnn = True
-        self.op_type = "conv2d_transpose"
-
-
-class TestDepthwiseConvTranspose(TestConv2dTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.dilations = [1, 1]
-        self.input_size = [2, 8, 16, 16]  # NCHW
-        self.groups = 8
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [self.input_size[1], f_c, 4, 4]
-        self.op_type = "depthwise_conv2d_transpose"
-
-
-# ------------ test_cudnn ------------
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNNWithEvenUpsample(TestWithEvenUpsample):
-    def init_op_type(self):
-        self.use_cudnn = True
-        self.op_type = "conv2d_transpose"
-
-
-# Please Don't remove the following code.
-# Currently, CI use cudnn V5.0 which not support dilation conv.
-# class TestCUDNNWithDilation(TestWithDilation):
-#     def init_test_case(self):
-#         self.pad = [1, 1]
-#         self.stride = [2, 2]
-#         self.dilations = [2, 2]
-#         self.input_size = [2, 3, 5, 5]  # NCHW
-#         f_c = self.input_size[1]
-#         self.filter_size = [f_c, 6, 3, 3]
-#
-#     def init_op_type(self):
-#         self.op_type = "conv2d_transpose"
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
deleted file mode 100644
index aedd85ad9a7e81cb0a82b2521e70ea8e46a26814..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ /dev/null
@@ -1,330 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-def conv3d_forward_naive(input, filter, group, conv_param):
-    in_n, in_c, in_d, in_h, in_w = input.shape
-    out_c, f_c, f_d, f_h, f_w = filter.shape
-    assert f_c * group == in_c
-    assert np.mod(out_c, group) == 0
-    sub_out_c = out_c // group
-
-    stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
-        'dilations']
-
-    out_d = 1 + (in_d + 2 * pad[0] - (dilation[0] * (f_d - 1) + 1)) // stride[0]
-    out_h = 1 + (in_h + 2 * pad[1] - (dilation[1] * (f_h - 1) + 1)) // stride[1]
-    out_w = 1 + (in_w + 2 * pad[2] - (dilation[2] * (f_w - 1) + 1)) // stride[2]
-
-    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
-
-    d_bolck_d = (dilation[0] * (f_d - 1) + 1)
-    d_bolck_h = (dilation[1] * (f_h - 1) + 1)
-    d_bolck_w = (dilation[2] * (f_w - 1) + 1)
-
-    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], ),
-                               (pad[2], )),
-                       mode='constant',
-                       constant_values=0)
-
-    filter_dilation = np.zeros((out_c, f_c, d_bolck_d, d_bolck_h, d_bolck_w))
-    filter_dilation[:, :, 0:d_bolck_d:dilation[0], 0:d_bolck_h:dilation[1], 0:
-                    d_bolck_w:dilation[2]] = filter
-
-    for d in range(out_d):
-        for i in range(out_h):
-            for j in range(out_w):
-                for g in range(group):
-                    input_pad_masked = \
-                        input_pad[:, g * f_c:(g + 1) * f_c,
-                        d * stride[0]:d * stride[0] + d_bolck_d,
-                        i * stride[1]:i * stride[1] + d_bolck_h,
-                        j * stride[2]:j * stride[2] + d_bolck_w]
-
-                    f_sub = filter_dilation[g * sub_out_c:(g + 1) *
-                                            sub_out_c, :, :, :, :]
-                    for k in range(sub_out_c):
-                        out[:, g * sub_out_c + k, d, i, j] = \
-                            np.sum(input_pad_masked * f_sub[k, :, :, :, :],
-                                   axis=(1, 2, 3, 4))
-
-    return out
-
-
-class TestConv3dOp(OpTest):
-    def setUp(self):
-        self.op_type = "conv3d"
-        self.use_cudnn = False
-        self.use_mkldnn = False
-        self.data_format = "AnyLayout"
-        self.dtype = np.float32
-        self.init_kernel_type()
-        self.init_group()
-        self.init_dilation()
-        self.init_test_case()
-
-        conv3d_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilations': self.dilations
-        }
-
-        input = np.random.random(self.input_size).astype(self.dtype)
-        filter = np.random.random(self.filter_size).astype(self.dtype)
-        output = conv3d_forward_naive(input, filter, self.groups,
-                                      conv3d_param).astype(self.dtype)
-
-        self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
-        }
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'groups': self.groups,
-            'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn,
-            'use_mkldnn': self.use_mkldnn,
-            'data_format': self.data_format
-        }
-        self.outputs = {'Output': output}
-
-    def has_cudnn(self):
-        return core.is_compiled_with_cuda() and self.use_cudnn
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
-        self.check_output_with_place(place, atol=1e-5)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, {'Input', 'Filter'}, 'Output', max_relative_error=0.03)
-
-    def test_check_grad_no_filter(self):
-        if self.dtype == np.float16:
-            return
-        place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, ['Input'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Filter']))
-
-    def test_check_grad_no_input(self):
-        if self.dtype == np.float16:
-            return
-        place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, ['Input'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Input']))
-
-    def init_test_case(self):
-        self.pad = [0, 0, 0]
-        self.stride = [1, 1, 1]
-        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3, 3]
-
-    def init_dilation(self):
-        self.dilations = [1, 1, 1]
-
-    def init_group(self):
-        self.groups = 1
-
-    def init_kernel_type(self):
-        pass
-
-
-class TestCase1(TestConv3dOp):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [1, 1, 1]
-        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3, 3]
-
-
-class TestWithGroup1(TestConv3dOp):
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWithGroup2(TestCase1):
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWith1x1(TestConv3dOp):
-    def init_test_case(self):
-        self.pad = [0, 0, 0]
-        self.stride = [1, 1, 1]
-        self.input_size = [2, 3, 4, 4, 4]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 1, 1, 1]
-
-    def init_dilation(self):
-        self.dilations = [1, 1, 1]
-
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWithInput1x1Filter1x1(TestConv3dOp):
-    def init_test_case(self):
-        self.pad = [0, 0, 0]
-        self.stride = [1, 1, 1]
-        self.input_size = [2, 3, 1, 1, 1]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 1, 1, 1]
-
-    def init_dilation(self):
-        self.dilations = [1, 1, 1]
-
-    def init_group(self):
-        self.groups = 3
-
-
-class TestWithDilation(TestConv3dOp):
-    def init_test_case(self):
-        self.pad = [0, 0, 0]
-        self.stride = [1, 1, 1]
-        self.input_size = [2, 3, 6, 6, 6]  # NCDHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 2, 2, 2]
-
-    def init_dilation(self):
-        self.dilations = [2, 2, 2]
-
-    def init_group(self):
-        self.groups = 3
-
-
-#----------------Conv3dCUDNN----------------
-class TestCUDNN(TestConv3dOp):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16CUDNN(TestConv3dOp):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=2e-2)
-
-
-class TestWithGroup1CUDNN(TestWithGroup1):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16WithGroup1CUDNN(TestWithGroup1):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=2e-2)
-
-
-class TestWithGroup2CUDNN(TestWithGroup2):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16WithGroup2CUDNN(TestWithGroup2):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=2e-2)
-
-
-class TestWith1x1CUDNN(TestWith1x1):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16With1x1CUDNN(TestWith1x1):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=2e-2)
-
-
-class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=2e-2)
-
-
-class TestCUDNNExhaustiveSearch(TestCUDNN):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.exhaustive_search = True
-
-
-# FIXME(typhoonzero): find a way to determine if
-# using cudnn > 6 in python
-# class TestWithDilationCUDNN(TestWithDilation):
-#     def init_op_type(self):
-#         self.op_type = "conv3d"
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
deleted file mode 100644
index 8d9075961cbec32bc34fcf0c92cfbb7e6c00d886..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ /dev/null
@@ -1,276 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-def conv3dtranspose_forward_naive(input_, filter_, attrs):
-    in_n, in_c, in_d, in_h, in_w = input_.shape
-    f_c, f_out_c, f_d, f_h, f_w = filter_.shape
-    groups = attrs['groups']
-    assert in_c == f_c
-    out_c = f_out_c * groups
-    sub_in_c = in_c // groups
-
-    stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
-        'dilations']
-
-    d_bolck_d = dilations[0] * (f_d - 1) + 1
-    d_bolck_h = dilations[1] * (f_h - 1) + 1
-    d_bolck_w = dilations[2] * (f_w - 1) + 1
-    out_d = (in_d - 1) * stride[0] + d_bolck_d
-    out_h = (in_h - 1) * stride[1] + d_bolck_h
-    out_w = (in_w - 1) * stride[2] + d_bolck_w
-    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
-
-    for n in range(in_n):
-        for d in range(in_d):
-            for i in range(in_h):
-                for j in range(in_w):
-                    for g in range(groups):
-                        input_masked = input_[n, g * sub_in_c:(g + 1
-                                                               ) * sub_in_c, d,
-                                              i, j]  # (c)
-                        input_masked = np.reshape(input_masked,
-                                                  (sub_in_c, 1, 1, 1))
-                        input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
-
-                        for k in range(f_out_c):
-                            tmp_out = np.sum(input_masked * filter_[
-                                g * sub_in_c:(g + 1) * sub_in_c, k, :, :, :],
-                                             axis=0)
-                            d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
-                            i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
-                            j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
-                            out[n, g * f_out_c + k, d1:d2:dilations[0], i1:i2:
-                                dilations[1], j1:j2:dilations[2]] += tmp_out
-
-    out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
-              pad[2]]
-    return out
-
-
-class TestConv3dTransposeOp(OpTest):
-    def setUp(self):
-        # init as conv transpose
-        self.use_cudnn = False
-        self.init_op_type()
-        self.init_test_case()
-
-        input_ = np.random.random(self.input_size).astype("float32")
-        filter_ = np.random.random(self.filter_size).astype("float32")
-
-        self.inputs = {'Input': input_, 'Filter': filter_}
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'dilations': self.dilations,
-            'groups': self.groups,
-            'use_cudnn': self.use_cudnn,
-            'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
-        }
-
-        output = conv3dtranspose_forward_naive(input_, filter_,
-                                               self.attrs).astype("float32")
-
-        self.outputs = {'Output': output}
-
-    def test_check_output(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            self.check_output()
-
-    def test_check_grad(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place,
-                set(['Input', 'Filter']),
-                'Output',
-                max_relative_error=0.03)
-        else:
-            self.check_grad(
-                set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
-
-    def test_check_grad_no_filter(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Input'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Filter']))
-        else:
-            self.check_grad(
-                ['Input'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Filter']))
-
-    def test_check_grad_no_input(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Filter'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Input']))
-        else:
-            self.check_grad(
-                ['Filter'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Input']))
-
-    def init_test_case(self):
-        self.pad = [0, 0, 0]
-        self.stride = [1, 1, 1]
-        self.dilations = [1, 1, 1]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-
-    def init_op_type(self):
-        self.op_type = "conv3d_transpose"
-
-
-class TestWithPad(TestConv3dTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [1, 1, 1]
-        self.dilations = [1, 1, 1]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-
-
-class TestWithGroups(TestConv3dTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [1, 1, 1]
-        self.dilations = [1, 1, 1]
-        self.groups = 2
-        self.input_size = [2, 4, 5, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 3, 3, 3, 3]
-
-
-class TestWithStride(TestConv3dTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [2, 2, 2]
-        self.dilations = [1, 1, 1]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-
-
-class TestWithDilation(TestConv3dTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [1, 1, 1]
-        self.dilations = [2, 2, 2]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-
-
-# ------------ test_cudnn ------------
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNN(TestConv3dTransposeOp):
-    def init_op_type(self):
-        self.use_cudnn = True
-        self.op_type = "conv3d_transpose"
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNNWithPad(TestWithPad):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [1, 1, 1]
-        self.dilations = [1, 1, 1]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-
-    def init_op_type(self):
-        self.use_cudnn = True
-        self.op_type = "conv3d_transpose"
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNNWithStride(TestWithStride):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [2, 2, 2]
-        self.dilations = [1, 1, 1]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-
-    def init_op_type(self):
-        self.use_cudnn = True
-        self.op_type = "conv3d_transpose"
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNNWithGroups(TestWithGroups):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [1, 1, 1]
-        self.dilations = [1, 1, 1]
-        self.groups = 2
-        self.input_size = [2, 4, 5, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 3, 3, 3, 3]
-
-    def init_op_type(self):
-        self.use_cudnn = True
-        self.op_type = "conv3d_transpose"
-
-
-# Please Don't remove the following code.
-# Currently, CI use cudnn V5.0 which not support dilation conv.
-# class TestCUDNNWithDilation(TestWithDilation):
-#     def init_test_case(self):
-#         self.pad = [1, 1, 1]
-#         self.stride = [2, 2, 2]
-#         self.dilations = [2, 2, 2]
-#         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
-#         f_c = self.input_size[1]
-#         self.filter_size = [f_c, 6, 3, 3, 3]
-#
-#     def init_op_type(self):
-#         self.op_type = "conv3d_transpose"
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py
deleted file mode 100644
index 81f902d529e73c5dcc5065c24328bea592236a93..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-import gradient_checker
-
-from decorator_helper import prog_scope
-
-
-class TestConvDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 4, 7, 8]
-        eps = 0.005
-        dtype = np.float64
-        x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d(x, 4, 1, bias_attr=False)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = fluid.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestConvDoubleGradCheckTest1(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 3, 4, 5]
-        eps = 0.005
-        dtype = np.float64
-        x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d(x, 4, 1, padding=1, bias_attr=False)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = fluid.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestConv3DDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 4, 3, 4, 2]
-        eps = 0.005
-        dtype = np.float64
-        x = layers.data('x', shape, False, dtype)
-        y = layers.conv3d(x, 4, 1, bias_attr=False)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = fluid.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestConv3DDoubleGradCheckTest1(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 4, 5, 3, 2]
-        eps = 0.005
-        dtype = np.float64
-        x = layers.data('x', shape, False, dtype)
-        y = layers.conv3d(x, 4, 1, padding=1, bias_attr=False)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = fluid.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
deleted file mode 100644
index b7364e869e7420e610363eafcc4964b825e57326..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
+++ /dev/null
@@ -1,63 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def conv_shift_forward(x, y):
-    out = np.zeros_like(x)
-    M = x.shape[1]
-    N = y.shape[1]
-    y_half_width = (N - 1) // 2
-    for i in range(M):
-        for j in range(N):
-            out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j]
-    return out
-
-
-class TestConvShiftOp(OpTest):
-    def setUp(self):
-        self.op_type = "conv_shift"
-
-        batch_size = 4
-        x_dim = 17
-        y_dim = 3  # must be odd and <= x_dim
-        x = np.random.random((batch_size, x_dim)).astype("float32")
-        y = np.random.random((batch_size, y_dim)).astype("float32")
-        self.inputs = {'X': x, 'Y': y}
-
-        out = conv_shift_forward(x, y)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
-
-    def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
-
-    def test_check_grad_ignore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cos_sim_op.py b/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
deleted file mode 100644
index 3c3fd6d4d71503ccc3678ca69d55bcc8536c8c6a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestCosSimOp(OpTest):
-    def setUp(self):
-        self.op_type = "cos_sim"
-        self.inputs = {
-            'X': np.random.random((6, 5)).astype("float32"),
-            'Y': np.random.random((6, 5)).astype("float32")
-        }
-        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
-        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
-        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \
-            expect_x_norm / expect_y_norm
-        self.outputs = {
-            'XNorm': np.expand_dims(expect_x_norm, 1),
-            'YNorm': np.expand_dims(expect_y_norm, 1),
-            'Out': np.expand_dims(expect_out, 1)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.06)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.06, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Y'))
-
-
-class TestCosSimOp2(TestCosSimOp):
-    def setUp(self):
-        self.op_type = "cos_sim"
-        self.inputs = {
-            'X': np.random.random((6, 5)).astype("float32"),
-            'Y': np.random.random((1, 5)).astype("float32")
-        }
-        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
-        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
-        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \
-            expect_x_norm / expect_y_norm
-        self.outputs = {
-            'XNorm': np.expand_dims(expect_x_norm, 1),
-            'YNorm': np.expand_dims(expect_y_norm, 1),
-            'Out': np.expand_dims(expect_out, 1)
-        }
-
-
-class TestCosSimOp3(TestCosSimOp):
-    def setUp(self):
-        self.op_type = "cos_sim"
-        self.inputs = {
-            'X': np.random.random((6, 5, 2)).astype("float32"),
-            'Y': np.random.random((6, 5, 2)).astype("float32")
-        }
-        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2))
-        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2))
-        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) / \
-            expect_x_norm / expect_y_norm
-        self.outputs = {
-            'XNorm': np.expand_dims(expect_x_norm, 1),
-            'YNorm': np.expand_dims(expect_y_norm, 1),
-            'Out': np.expand_dims(expect_out, 1)
-        }
-
-
-class TestCosSimOp4(TestCosSimOp):
-    def setUp(self):
-        self.op_type = "cos_sim"
-        self.inputs = {
-            'X': np.random.random((6, 5, 2)).astype("float32"),
-            'Y': np.random.random((1, 5, 2)).astype("float32")
-        }
-        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2))
-        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2))
-        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) / \
-            expect_x_norm / expect_y_norm
-        self.outputs = {
-            'XNorm': np.expand_dims(expect_x_norm, 1),
-            'YNorm': np.expand_dims(expect_y_norm, 1),
-            'Out': np.expand_dims(expect_out, 1)
-        }
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
deleted file mode 100644
index fd34c8fc9390b69afd93229b56aa9189da2a8b28..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.layers as layers
-
-
-class TestDocString(unittest.TestCase):
-    def test_layer_doc_string(self):
-        print(layers.dropout.__doc__)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
deleted file mode 100644
index 5c8682a0756910897b0a708d20cc41690d870db3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ /dev/null
@@ -1,230 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import random
-import numpy as np
-
-from op_test import OpTest
-
-
-class CRFDecoding(object):
-    def __init__(self, emission_weights, transition_weights,
-                 seq_start_positions):
-        assert (emission_weights.shape[0] == sum(seq_start_positions))
-        self.tag_num = emission_weights.shape[1]
-        self.seq_num = len(seq_start_positions)
-
-        self.seq_start_positions = seq_start_positions
-        self.x = emission_weights
-
-        self.a = transition_weights[0, :]
-        self.b = transition_weights[1, :]
-        self.w = transition_weights[2:, :]
-
-        self.track = np.zeros(
-            (sum(seq_start_positions), self.tag_num), dtype="int64")
-        self.decoded_path = np.zeros(
-            (sum(seq_start_positions), 1), dtype="int64")
-
-    def _decode_one_sequence(self, decoded_path, x):
-        seq_len, tag_num = x.shape
-        alpha = np.zeros((seq_len, tag_num), dtype="float64")
-        track = np.zeros((seq_len, tag_num), dtype="int64")
-
-        for i in range(tag_num):
-            alpha[0, i] = self.a[i] + x[0, i]
-
-        for k in range(1, seq_len):
-            for i in range(tag_num):
-                max_score = -np.finfo("float64").max
-                max_idx = 0
-                for j in range(tag_num):
-                    score = alpha[k - 1, j] + self.w[j, i]
-                    if score > max_score:
-                        max_score = score
-                        max_idx = j
-                alpha[k, i] = max_score + x[k, i]
-                track[k, i] = max_idx
-
-        max_score = -np.finfo("float64").max
-        max_idx = 0
-        for i in range(tag_num):
-            score = alpha[seq_len - 1, i] + self.b[i]
-            if score > max_score:
-                max_score = score
-                max_idx = i
-
-        decoded_path[-1] = max_idx
-        for i in range(seq_len - 1, 0, -1):
-            decoded_path[i - 1] = max_idx = track[i, max_idx]
-
-    def decode(self):
-        cur_pos = 0
-        for i in range(self.seq_num):
-            start = cur_pos
-            cur_pos += self.seq_start_positions[i]
-            end = cur_pos
-            self._decode_one_sequence(self.decoded_path[start:end, :],
-                                      self.x[start:end, :])
-        return self.decoded_path
-
-
-class TestCRFDecodingOp1(OpTest):
-    """
-    Compare the dynamic program with random generated parameters and inputs
-    with grouth truth not being given.
-    """
-
-    def set_test_data(self):
-        SEQ_NUM = 3
-        TAG_NUM = 17
-        MAX_SEQ_LEN = 10
-
-        lod = [[]]
-        total_len = 0
-        for i in range(SEQ_NUM):
-            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
-            total_len += lod[-1][-1]
-        emission = np.random.uniform(-1, 1,
-                                     [total_len, TAG_NUM]).astype("float64")
-        transition = np.random.uniform(-0.5, 0.5,
-                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
-
-        self.inputs = {
-            "Emission": (emission, lod),
-            "Transition": transition,
-        }
-
-        decoder = CRFDecoding(emission, transition, lod[0])
-        decoded_path = decoder.decode()
-
-        self.outputs = {"ViterbiPath": decoded_path}
-
-    def setUp(self):
-        self.op_type = "crf_decoding"
-        self.set_test_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestCRFDecodingOp2(OpTest):
-    """
-    Compare the dynamic program with brute force computation with
-    ground truth being given.
-    """
-
-    def init_lod(self):
-        self.lod = [[1, 2, 3, 4]]
-
-    def setUp(self):
-        self.op_type = "crf_decoding"
-        TAG_NUM = 5
-
-        self.init_lod()
-        total_len = sum(self.lod[-1])
-        transition = np.repeat(
-            np.arange(
-                TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
-            TAG_NUM + 2,
-            axis=0)
-        emission = np.repeat(
-            np.arange(
-                TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
-            total_len,
-            axis=0)
-
-        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(total_len, 1), dtype="int64")
-        predicted_labels = np.ones(
-            (total_len, 1), dtype="int64") * (TAG_NUM - 1)
-        expected_output = (labels == predicted_labels).astype("int64")
-
-        self.inputs = {
-            "Emission": (emission, self.lod),
-            "Transition": transition,
-            "Label": (labels, self.lod)
-        }
-
-        self.outputs = {"ViterbiPath": expected_output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestCRFDecodingOp3(TestCRFDecodingOp2):
-    def init_lod(self):
-        self.lod = [[1, 0, 0, 4]]
-
-
-class TestCRFDecodingOp4(TestCRFDecodingOp2):
-    def init_lod(self):
-        self.lod = [[0, 2, 3, 0]]
-
-
-class TestCRFDecodingOp5(OpTest):
-    """
-    Compare the dynamic program with random generated parameters and inputs
-    with grouth truth not being given.
-    """
-
-    def seq_pad(self, data, length):
-        max_len = np.max(length)
-        shape = [len(length), max_len] + list(data.shape[1:])
-        padded = np.zeros(shape).astype(data.dtype)
-        offset = 0
-        for i, l in enumerate(length):
-            padded[i, 0:l] = data[offset:offset + l]
-            offset += l
-        return np.squeeze(padded)
-
-    def set_test_data(self):
-        SEQ_NUM = 3
-        TAG_NUM = 17
-        MAX_SEQ_LEN = 10
-
-        lod = [[]]
-        total_len = 0
-        for i in range(SEQ_NUM):
-            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
-            total_len += lod[-1][-1]
-        emission = np.random.uniform(-1, 1,
-                                     [total_len, TAG_NUM]).astype("float64")
-        transition = np.random.uniform(-0.5, 0.5,
-                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
-
-        self.inputs = {
-            "Emission": self.seq_pad(emission, lod[0]),
-            "Transition": transition,
-            "Length": np.array(lod).astype('int64'),
-        }
-
-        decoder = CRFDecoding(emission, transition, lod[0])
-        decoded_path = decoder.decode()
-
-        self.outputs = {"ViterbiPath": self.seq_pad(decoded_path, lod[0])}
-
-    def setUp(self):
-        self.op_type = "crf_decoding"
-        self.set_test_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py
deleted file mode 100644
index d7bcfba8deab1b73e4cbab8a27f9eeef9a37d29b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_crop_op.py
+++ /dev/null
@@ -1,128 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def crop(data, offsets, crop_shape):
-    def indexOf(shape, index):
-        result = []
-        for dim in reversed(shape):
-            result.append(index % dim)
-            index = index / dim
-        return result[::-1]
-
-    result = []
-    for i, value in enumerate(data.flatten()):
-        index = indexOf(data.shape, i)
-        selected = True
-        if len(index) == len(offsets):
-            for j, offset in enumerate(offsets):
-                selected = selected and index[j] >= offset and index[
-                    j] < crop_shape[j] + offset
-            if selected:
-                result.append(value)
-    return np.array(result).reshape(crop_shape)
-
-
-class TestCropOp(OpTest):
-    def setUp(self):
-        self.op_type = "crop"
-        self.crop_by_input = False
-        self.offset_by_input = False
-        self.attrs = {}
-        self.initTestCase()
-        if self.crop_by_input:
-            self.inputs = {
-                'X': np.random.random(self.x_shape).astype("float32"),
-                'Y': np.random.random(self.crop_shape).astype("float32")
-            }
-        else:
-            self.attrs['shape'] = self.crop_shape
-            self.inputs = {
-                'X': np.random.random(self.x_shape).astype("float32"),
-            }
-        if self.offset_by_input:
-            self.inputs['Offsets'] = np.array(self.offsets).astype('int32')
-        else:
-            self.attrs['offsets'] = self.offsets
-        self.outputs = {
-            'Out': crop(self.inputs['X'], self.offsets, self.crop_shape)
-        }
-
-    def initTestCase(self):
-        self.x_shape = (8, 8)
-        self.crop_shape = (2, 2)
-        self.offsets = [1, 2]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.006)
-
-
-class TestCase1(TestCropOp):
-    def initTestCase(self):
-        self.x_shape = (16, 8, 32)
-        self.crop_shape = [2, 2, 3]
-        self.offsets = [1, 5, 3]
-
-
-class TestCase2(TestCropOp):
-    def initTestCase(self):
-        self.x_shape = (4, 8)
-        self.crop_shape = [4, 8]
-        self.offsets = [0, 0]
-
-
-class TestCase3(TestCropOp):
-    def initTestCase(self):
-        self.x_shape = (4, 8, 16)
-        self.crop_shape = [2, 2, 3]
-        self.offsets = [1, 5, 3]
-        self.crop_by_input = True
-
-
-class TestCase4(TestCropOp):
-    def initTestCase(self):
-        self.x_shape = (4, 4)
-        self.crop_shape = [4, 4]
-        self.offsets = [0, 0]
-        self.crop_by_input = True
-
-
-class TestCase5(TestCropOp):
-    def initTestCase(self):
-        self.x_shape = (3, 4, 5)
-        self.crop_shape = [2, 2, 3]
-        self.offsets = [1, 0, 2]
-        self.offset_by_input = True
-
-
-class TestCase6(TestCropOp):
-    def initTestCase(self):
-        self.x_shape = (10, 9, 14)
-        self.crop_shape = [3, 3, 5]
-        self.offsets = [3, 5, 4]
-        self.crop_by_input = True
-        self.offset_by_input = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py b/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py
deleted file mode 100644
index ed04a85004215831690c72605199dce9258cac0e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py
+++ /dev/null
@@ -1,218 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid as fluid
-
-
-def crop(data, offsets, crop_shape):
-    def indexOf(shape, index):
-        result = []
-        for dim in reversed(shape):
-            result.append(index % dim)
-            index = index / dim
-        return result[::-1]
-
-    result = []
-    for i, value in enumerate(data.flatten()):
-        index = indexOf(data.shape, i)
-        selected = True
-        if len(index) == len(offsets):
-            for j, offset in enumerate(offsets):
-                selected = selected and index[j] >= offset and index[
-                    j] < crop_shape[j] + offset
-            if selected:
-                result.append(value)
-    return np.array(result).reshape(crop_shape)
-
-
-class TestCropTensorOp(OpTest):
-    def setUp(self):
-        self.op_type = "crop_tensor"
-        self.crop_by_1D_shape = False
-        self.offset_by_input = False
-        self.unk_dim_idx = -1
-        self.attrs = {}
-        self.initTestCase()
-
-        if self.crop_by_1D_shape:
-            self.inputs = {
-                'X': np.random.random(self.x_shape).astype("float32"),
-                'Shape': np.array(self.crop_shape).astype("int32")
-            }
-        else:
-            self.attrs['shape'] = self.crop_shape
-            self.inputs = {
-                'X': np.random.random(self.x_shape).astype("float32"),
-            }
-        if self.offset_by_input:
-            self.inputs['Offsets'] = np.array(self.offsets).astype('int32')
-        else:
-            self.attrs['offsets'] = self.offsets
-
-        if self.unk_dim_idx != -1:
-            self.crop_shape[self.unk_dim_idx] = self.x_shape[self.unk_dim_idx]
-        self.outputs = {
-            'Out': crop(self.inputs['X'], self.offsets, self.crop_shape)
-        }
-
-    def initTestCase(self):
-        self.x_shape = (8, 8)
-        self.crop_shape = [2, 2]
-        self.offsets = [1, 2]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.006)
-
-
-class TestCase1(TestCropTensorOp):
-    def initTestCase(self):
-        self.x_shape = (100)
-        self.crop_shape = [64]
-        self.offsets = [13]
-
-
-class TestCase2(TestCropTensorOp):
-    def initTestCase(self):
-        self.x_shape = (12, 24)
-        self.crop_shape = [-1, 8]  #only the first dimension (batch) can be -1
-        self.offsets = [0, 0]
-        self.unk_dim_idx = 0
-
-
-class TestCase3(TestCropTensorOp):
-    def initTestCase(self):
-        self.x_shape = (4, 8, 16)
-        self.crop_shape = [2, 2, 3]
-        self.offsets = [1, 5, 3]
-        self.crop_by_1D_shape = True
-
-
-class TestCase4(TestCropTensorOp):
-    def initTestCase(self):
-        self.x_shape = (8, 3, 6, 6)
-        self.crop_shape = [-1, 3, 4, 4]
-        self.offsets = [0, 0, 0, 0]
-        self.crop_by_1D_shape = True
-        self.unk_dim_idx = 0
-
-
-class TestCase5(TestCropTensorOp):
-    def initTestCase(self):
-        self.x_shape = (2, 4, 5, 8, 8)
-        self.crop_shape = [1, 1, 2, 4, 4]
-        self.offsets = [1, 0, 0, 2, 2]
-        self.offset_by_input = True
-
-
-class TestCase6(TestCropTensorOp):
-    def initTestCase(self):
-        self.x_shape = (2, 2, 4, 4, 4, 2)
-        self.crop_shape = [1, 1, 4, 2, 2, 2]
-        self.offsets = [0, 0, 0, 0, 0, 0]
-        self.crop_by_1D_shape = True
-        self.offset_by_input = True
-
-
-class TestCropTensorOp_attr_tensor(OpTest):
-    def setUp(self):
-        self.op_type = "crop_tensor"
-        self.mixed_type = False
-        self.OffsetsTensor = False
-        self.ShapeTensor = True
-        self.attrs = {}
-        self.initTestCase()
-
-        if self.ShapeTensor:
-            shape_tensor = []
-            for index, ele in enumerate(self.crop_shape):
-                shape_tensor.append(("x" + str(index), np.ones(
-                    (1)).astype('int32') * ele))
-            self.inputs = {
-                'X': np.random.random(self.x_shape).astype("float32"),
-                'ShapeTensor': shape_tensor
-            }
-            if self.mixed_type:
-                self.attrs['shape'] = self.shape_attr
-
-        if self.OffsetsTensor:
-            offsets_tensor = []
-            for index, ele in enumerate(self.offsets):
-                offsets_tensor.append(("x" + str(index), np.ones(
-                    (1)).astype('int32') * ele))
-            self.inputs = {
-                'X': np.random.random(self.x_shape).astype("float32"),
-                'OffsetsTensor': offsets_tensor
-            }
-        else:
-            self.attrs['offsets'] = self.offsets
-
-        self.outputs = {
-            'Out': crop(self.inputs['X'], self.offsets, self.crop_shape)
-        }
-
-    def initTestCase(self):
-        self.x_shape = (8, 8)
-        self.crop_shape = (2, 2)
-        self.offsets = [1, 2]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(["X"], "Out", max_relative_error=0.006)
-
-
-class TestCropTensorOp_attr_tensor_case1(TestCropTensorOp_attr_tensor):
-    def init_data(self):
-        self.x_shape = (16, 8, 32)
-        self.crop_shape = [2, 2, 3]
-        self.offsets = [1, 5, 3]
-
-
-class TestCropTensorOp_attr_tensor_case2(TestCropTensorOp_attr_tensor):
-    def init_data(self):
-        self.x_shape = (4, 8, 16, 8)
-        self.crop_shape = [2, 2, 3, 4]
-        self.offsets = [1, 5, 3, 0]
-        self.shape_attr = [-1, -1, 3, 4]
-        self.mixed_type = True
-
-
-class TestCropTensorOp_attr_tensor_case3(TestCropTensorOp_attr_tensor):
-    def init_data(self):
-        self.x_shape = (16, 8, 32)
-        self.crop_shape = [2, 2, 3]
-        self.offsets = [1, 5, 3]
-        self.ShapeTensor = False
-        self.OffsetsTensor = True
-
-
-class TestCropTensorOp_attr_tensor_case4(TestCropTensorOp_attr_tensor):
-    def init_data(self):
-        self.x_shape = (16, 8, 32)
-        self.crop_shape = [2, 2, 3]
-        self.offsets = [1, 5, 3]
-        self.OffsetsTensor = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
deleted file mode 100644
index 813d90b426e6c9bc7850bcab92b1c7d590cff945..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from op_test import OpTest
-import unittest
-import numpy as np
-import six
-
-
-class CrossEntropy2OpTestBase(OpTest):
-    def initParameters(self):
-        return [32, 64], 'float32', -100, False
-
-    def calc_output(self, logits, label, ignore_index):
-        ret = np.zeros(shape=label.shape, dtype=logits.dtype)
-        match_x = np.zeros(shape=label.shape, dtype=logits.dtype)
-        for idx in six.moves.range(label.shape[0]):
-            if label[idx] == ignore_index:
-                continue
-            match_x[idx] = logits[idx][label[idx]]
-            ret[idx] = -np.log(match_x[idx])
-        return ret, match_x
-
-    def setUp(self):
-        self.shape, self.dtype, self.ignore_index, self.drop_last_dim = self.initParameters(
-        )
-        self.op_type = 'cross_entropy2'
-        feature_size = int(self.shape[-1])
-        batch_size = int(np.prod(self.shape) / feature_size)
-        logits = (np.random.random(size=self.shape) + 1).astype(self.dtype)
-        label_shape = self.shape[0:-1] if self.drop_last_dim else self.shape[
-            0:-1] + [1]
-        label = np.random.random_integers(
-            low=0, high=feature_size - 1, size=label_shape).astype('int64')
-        outputs, match_x = self.calc_output(
-            np.reshape(logits, [batch_size, feature_size]),
-            np.reshape(label, [batch_size, 1]), self.ignore_index)
-        self.inputs = {'X': logits, 'Label': label}
-        out_shape = label_shape
-        self.outputs = {
-            'Y': np.reshape(outputs, out_shape),
-            'MatchX': np.reshape(match_x, self.shape[:-1] + [1]),
-            'XShape': np.zeros(
-                shape=logits.shape, dtype=logits.dtype)
-        }
-        self.attrs = {'ignore_index': self.ignore_index}
-
-    def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
-
-    def test_check_grad(self):
-        self.check_grad(
-            inputs_to_check=['X'],
-            output_names=['Y'],
-            no_grad_set=['XShape', 'MatchX', 'Label'])
-
-
-class CrossEntropy2OpTest2(CrossEntropy2OpTestBase):
-    def initParameters(self):
-        return [32, 64], 'float64', 3, False
-
-
-class CrossEntropy2OpTest2RemoveLastDim(CrossEntropy2OpTestBase):
-    def initParameters(self):
-        return [32, 64], 'float64', 3, True
-
-
-class CrossEntropy2OpTest3(CrossEntropy2OpTestBase):
-    def initParameters(self):
-        return [4, 8, 16, 32], 'float32', -100, False
-
-
-class CrossEntropy2OpTest3RemoveLastDim(CrossEntropy2OpTestBase):
-    def initParameters(self):
-        return [4, 8, 16, 32], 'float32', -100, True
-
-
-class CrossEntropy2OpTest4(CrossEntropy2OpTestBase):
-    def initParameters(self):
-        return [4, 8, 16, 32], 'float32', 3, False
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
deleted file mode 100644
index fc8484df2d5e219a6ecc335cd00c735119de7f32..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
+++ /dev/null
@@ -1,360 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest, randomize_probability
-
-
-class TestCrossEntropyOp(OpTest):
-    """Test cross-entropy with discrete one-hot labels.
-    """
-
-    def setUp(self):
-        self.op_type = "cross_entropy"
-        self.soft_label = False
-        self.ignore_index = -100
-        self.dtype = np.float64
-        self.batch_size = 30
-        self.class_num = 10
-
-        self.init_dtype_type()
-        self.init_attr_type()
-        self.init_bs_class_num()
-        self.init_x()
-        self.init_label()
-        self.get_cross_entropy()
-
-        self.inputs = {"X": self.x, "Label": self.label}
-        self.outputs = {"Y": self.cross_entropy}
-        self.attrs = {
-            "soft_label": self.soft_label,
-            "ignore_index": self.ignore_index
-        }
-
-    def init_x(self):
-        self.x = randomize_probability(
-            self.batch_size, self.class_num, dtype=self.dtype)
-
-    def init_label(self):
-        self.label = np.random.randint(
-            0, self.class_num, (self.batch_size, 1), dtype="int64")
-
-    def get_cross_entropy(self):
-        self.cross_entropy = np.asmatrix(
-            [[-np.log(self.x[i][self.label[i][0]])]
-             for i in range(self.x.shape[0])],
-            dtype="float64")
-
-    def init_attr_type(self):
-        pass
-
-    def init_dtype_type(self):
-        pass
-
-    def init_bs_class_num(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
-
-
-class TestCrossEntropyOpRemoveLastDim(TestCrossEntropyOp):
-    """Test cross-entropy with discrete one-hot labels with shape [batch_size]
-    """
-
-    def init_label(self):
-        self.label = np.random.randint(
-            0, self.class_num, (self.batch_size), dtype="int64")
-
-    def get_cross_entropy(self):
-        self.cross_entropy = np.asmatrix(
-            [
-                -np.log(self.x[i][self.label[i]])
-                for i in range(self.x.shape[0])
-            ],
-            dtype="float64")
-
-
-class TestCrossEntropyOp2(TestCrossEntropyOp):
-    """Test cross-entropy with vectorized soft labels.
-    """
-
-    def init_label(self):
-        self.label = np.random.uniform(
-            0.1, 1.0, [self.batch_size, self.class_num]).astype(self.dtype)
-        self.label /= self.label.sum(axis=1, keepdims=True)
-
-    def get_cross_entropy(self):
-        self.cross_entropy = (-self.label * np.log(self.x)).sum(
-            axis=1, keepdims=True).astype(self.dtype)
-
-    def init_attr_type(self):
-        self.soft_label = True
-
-    def init_dtype_type(self):
-        self.dtype = np.float32
-
-    def init_bs_class_num(self):
-        self.batch_size = 5
-        self.class_num = 37
-
-    def test_check_grad(self):
-        self.check_grad(
-            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
-
-
-class TestCrossEntropyOp3(TestCrossEntropyOp):
-    """Test cross-entropy with vectorized one-hot representation of labels.
-    """
-
-    def init_label(self):
-        self.label_index = np.random.randint(0, self.class_num,
-                                             (self.batch_size))
-        self.label = np.zeros(self.x.shape).astype(self.dtype)
-        self.label[np.arange(self.batch_size), self.label_index] = 1
-
-    def get_cross_entropy(self):
-        self.cross_entropy = np.asmatrix(
-            [[-np.log(self.x[i][self.label_index[i]])]
-             for i in range(self.x.shape[0])]).astype(self.dtype)
-
-    def init_attr_type(self):
-        self.soft_label = True
-
-    def init_dtype_type(self):
-        self.dtype = np.float32
-
-    def init_bs_class_num(self):
-        self.batch_size = 5
-        self.class_num = 17
-
-    def test_check_grad(self):
-        self.check_grad(
-            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
-
-
-class TestCrossEntropyOp4(TestCrossEntropyOp):
-    """Test high rank tensor cross-entropy with discrete one-hot labels.
-    """
-
-    def init_x(self):
-        self.shape = [10, 2, 4]
-        self.ins_num = np.prod(np.array(self.shape))
-        self.X_2d = randomize_probability(self.ins_num,
-                                          self.class_num).astype(self.dtype)
-        self.x = self.X_2d.reshape(self.shape + [self.class_num])
-
-    def init_label(self):
-        self.label_2d = np.random.randint(
-            0, self.class_num, (self.ins_num, 1), dtype="int64")
-        self.label = self.label_2d.reshape(self.shape + [1])
-
-    def get_cross_entropy(self):
-        cross_entropy_2d = np.asmatrix(
-            [[-np.log(self.X_2d[i][self.label_2d[i][0]])]
-             for i in range(self.X_2d.shape[0])]).astype(self.dtype)
-        self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
-                                                                [1])
-
-    def init_attr_type(self):
-        self.soft_label = False
-
-    def init_dtype_type(self):
-        self.dtype = np.float64
-
-    def init_bs_class_num(self):
-        self.class_num = 10
-
-
-class TestCrossEntropyOp4RemoveLastDim(TestCrossEntropyOp4):
-    """Test high rank tensor cross-entropy with discrete one-hot labels with shape [batch_size]
-    """
-
-    def init_label(self):
-        self.label_2d = np.random.randint(
-            0, self.class_num, (self.ins_num, 1), dtype="int64")
-        self.label = self.label_2d.reshape(self.shape)
-
-    def get_cross_entropy(self):
-        cross_entropy_2d = np.asmatrix(
-            [[-np.log(self.X_2d[i][self.label_2d[i][0]])]
-             for i in range(self.X_2d.shape[0])]).astype(self.dtype)
-        self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape)
-
-
-class TestCrossEntropyOp5(TestCrossEntropyOp):
-    """Test high rank tensor cross-entropy with vectorized soft labels.
-    """
-
-    def init_x(self):
-        self.shape = [4, 3]
-        self.ins_num = np.prod(np.array(self.shape))
-        self.X_2d = randomize_probability(self.ins_num,
-                                          self.class_num).astype(self.dtype)
-        self.x = self.X_2d.reshape(self.shape + [self.class_num])
-
-    def init_label(self):
-        self.label_2d = np.random.uniform(
-            0.1, 1.0, [self.ins_num, self.class_num]).astype(self.dtype)
-        self.label_2d /= self.label_2d.sum(axis=1, keepdims=True)
-        self.label = self.label_2d.reshape(self.shape + [self.class_num])
-
-    def get_cross_entropy(self):
-        cross_entropy_2d = (-self.label_2d * np.log(self.X_2d)).sum(
-            axis=1, keepdims=True).astype(self.dtype)
-        self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
-                                                                [1])
-
-    def init_attr_type(self):
-        self.soft_label = True
-
-    def init_dtype_type(self):
-        self.dtype = np.float32
-
-    def init_bs_class_num(self):
-        self.class_num = 37
-
-    def test_check_grad(self):
-        self.check_grad(
-            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
-
-
-class TestCrossEntropyOp6(TestCrossEntropyOp):
-    """Test high rank tensor cross-entropy with vectorized one-hot representation of labels.
-    """
-
-    def init_x(self):
-        self.shape = [4, 3, 2]
-        self.ins_num = np.prod(np.array(self.shape))
-        self.X_2d = randomize_probability(self.ins_num,
-                                          self.class_num).astype(self.dtype)
-        self.x = self.X_2d.reshape(self.shape + [self.class_num])
-
-    def init_label(self):
-        self.label_index_2d = np.random.randint(
-            0, self.class_num, (self.ins_num), dtype="int64")
-        label_2d = np.zeros(self.X_2d.shape)
-        label_2d[np.arange(self.ins_num), self.label_index_2d] = 1
-        self.label = label_2d.reshape(self.shape + [self.class_num]).astype(
-            self.dtype)
-
-    def get_cross_entropy(self):
-        cross_entropy_2d = np.asmatrix(
-            [[-np.log(self.X_2d[i][self.label_index_2d[i]])]
-             for i in range(self.X_2d.shape[0])])
-        self.cross_entropy = np.array(cross_entropy_2d).reshape(
-            self.shape + [1]).astype(self.dtype)
-
-    def init_attr_type(self):
-        self.soft_label = True
-
-    def init_dtype_type(self):
-        self.dtype = np.float32
-
-    def init_bs_class_num(self):
-        self.class_num = 17
-
-    def test_check_grad(self):
-        self.check_grad(
-            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
-
-
-class TestCrossEntropyOp7(TestCrossEntropyOp):
-    """Test cross-entropy with ignore index.
-    """
-
-    def init_label(self):
-        self.label = np.random.randint(
-            0, self.class_num, (self.batch_size, 1), dtype="int64")
-
-    def get_cross_entropy(self):
-        self.cross_entropy = np.asmatrix(
-            [[-np.log(self.x[i][self.label[i][0]])]
-             if self.label[i][0] != self.ignore_index else [0]
-             for i in range(self.x.shape[0])]).astype(self.dtype)
-
-    def init_attr_type(self):
-        self.soft_label = False
-        self.ignore_index = 3
-
-    def init_dtype_type(self):
-        self.dtype = np.float64
-
-    def init_bs_class_num(self):
-        self.batch_size = 30
-        self.class_num = 10
-
-
-class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7):
-    """Test cross-entropy with ignore index with shape [batch_size].
-    """
-
-    def init_label(self):
-        self.label = np.random.randint(
-            0, self.class_num, (self.batch_size), dtype="int64")
-
-    def get_cross_entropy(self):
-        self.cross_entropy = np.asmatrix(
-            [[-np.log(self.x[i][self.label[i]])]
-             if self.label[i] != self.ignore_index else [0]
-             for i in range(self.x.shape[0])]).astype(self.dtype)
-        self.cross_entropy = np.array(self.cross_entropy).reshape(
-            [self.batch_size]).astype(self.dtype)
-
-
-# Add Fp16 test
-def create_test_class(parent, cls_name):
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "core is not compiled with CUDA")
-    class TestCrossEntropyFP16Op(parent):
-        def init_dtype_type(self):
-            return np.float16
-
-        def test_check_output(self):
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=2e-1)
-
-        def test_check_grad(self):
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_grad_with_place(
-                    place, ['X'], 'Y', max_relative_error=0.9)
-
-    cls_name = "{0}".format(cls_name)
-    TestCrossEntropyFP16Op.__name__ = cls_name
-    globals()[cls_name] = TestCrossEntropyFP16Op
-
-
-create_test_class(TestCrossEntropyOp, "TestCrossEntropyF16Op")
-#create_test_class(TestCrossEntropyOp2, "TestCrossEntropyF16Op2")
-create_test_class(TestCrossEntropyOp3, "TestCrossEntropyF16Op3")
-create_test_class(TestCrossEntropyOp4, "TestCrossEntropyF16Op4")
-create_test_class(TestCrossEntropyOp4RemoveLastDim,
-                  "TestCrossEntropyF16Op4RemoveLastDim")
-#create_test_class(TestCrossEntropyOp5, "TestCrossEntropyF16Op5")
-create_test_class(TestCrossEntropyOp6, "TestCrossEntropyF16Op6")
-create_test_class(TestCrossEntropyOp7, "TestCrossEntropyF16Op7")
-create_test_class(TestCrossEntropyOp7RemoveLastDim,
-                  "TestCrossEntropyF16Op7RemoveLastDim")
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py
deleted file mode 100644
index 2078ff8ef1c32fab7698b2599d7e7c9ab0863082..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ /dev/null
@@ -1,221 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_softmax_op import stable_softmax
-import paddle.fluid as fluid
-
-
-def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None):
-    if input_length is None:
-        lod0 = lod[0]
-        result = []
-        cur_offset = 0
-        for i in range(len(lod0)):
-            prev_token = -1
-            for j in range(cur_offset, cur_offset + lod0[i]):
-                token = input[j][0]
-                if (token != blank) and not (merge_repeated and
-                                             token == prev_token):
-                    result.append(token)
-                prev_token = token
-            cur_offset += lod0[i]
-        result = np.array(result).reshape([len(result), 1]).astype("int32")
-        if len(result) == 0:
-            result = np.array([-1])
-        return result
-    else:
-        result = [[] for i in range(len(input))]
-        output_length = []
-        for i in range(len(input)):
-            prev_token = -1
-            for j in range(input_length[i][0]):
-                token = input[i][j]
-                if (token != blank) and not (merge_repeated and
-                                             token == prev_token):
-                    result[i].append(token)
-                prev_token = token
-            start = len(result[i])
-            output_length.append([start])
-            for j in range(start, len(input[i])):
-                result[i].append(padding)
-        result = np.array(result).reshape(
-            [len(input), len(input[0])]).astype("int32")
-        output_length = np.array(output_length).reshape(
-            [len(input), 1]).astype("int32")
-
-    return result, output_length
-
-
-class TestCTCAlignOp(OpTest):
-    def config(self):
-        self.op_type = "ctc_align"
-        self.input_lod = [[11, 7]]
-        self.blank = 0
-        self.merge_repeated = False
-        self.input = np.array(
-            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]).reshape(
-                [18, 1]).astype("int32")
-
-    def setUp(self):
-        self.config()
-        output = CTCAlign(self.input, self.input_lod, self.blank,
-                          self.merge_repeated)
-
-        self.inputs = {"Input": (self.input, self.input_lod), }
-        self.outputs = {"Output": output}
-        self.attrs = {
-            "blank": self.blank,
-            "merge_repeated": self.merge_repeated
-        }
-
-    def test_check_output(self):
-        self.check_output()
-        pass
-
-
-class TestCTCAlignOpCase1(TestCTCAlignOp):
-    def config(self):
-        self.op_type = "ctc_align"
-        self.input_lod = [[11, 8]]
-        self.blank = 0
-        self.merge_repeated = True
-        self.input = np.array(
-            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0, 0]).reshape(
-                [19, 1]).astype("int32")
-
-
-class TestCTCAlignOpCase2(TestCTCAlignOp):
-    def config(self):
-        self.op_type = "ctc_align"
-        self.input_lod = [[4]]
-        self.blank = 0
-        self.merge_repeated = True
-        self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32")
-
-
-class TestCTCAlignPaddingOp(OpTest):
-    def config(self):
-        self.op_type = "ctc_align"
-        self.input_lod = []
-        self.blank = 0
-        self.padding_value = 0
-        self.merge_repeated = True
-        self.input = np.array([[0, 2, 4, 4, 0, 6, 3, 6, 6, 0, 0],
-                               [1, 1, 3, 0, 0, 4, 5, 6, 0, 0, 0]]).reshape(
-                                   [2, 11]).astype("int32")
-        self.input_length = np.array([[9], [8]]).reshape([2, 1]).astype("int32")
-
-    def setUp(self):
-        self.config()
-        output, output_length = CTCAlign(self.input, self.input_lod, self.blank,
-                                         self.merge_repeated,
-                                         self.padding_value, self.input_length)
-        self.inputs = {
-            "Input": (self.input, self.input_lod),
-            "InputLength": self.input_length
-        }
-        self.outputs = {"Output": output, "OutputLength": output_length}
-        self.attrs = {
-            "blank": self.blank,
-            "merge_repeated": self.merge_repeated,
-            "padding_value": self.padding_value
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestCTCAlignOpCase3(TestCTCAlignPaddingOp):
-    def config(self):
-        self.op_type = "ctc_align"
-        self.blank = 0
-        self.input_lod = []
-        self.merge_repeated = True
-        self.padding_value = 0
-        self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
-                               [0, 7, 7, 7, 0, 0]]).reshape(
-                                   [3, 6]).astype("int32")
-        self.input_length = np.array([[6], [5],
-                                      [4]]).reshape([3, 1]).astype("int32")
-
-
-class TestCTCAlignOpCase4(TestCTCAlignPaddingOp):
-    '''
-    # test tensor input which has attr input padding_value
-    '''
-
-    def config(self):
-        self.op_type = "ctc_align"
-        self.blank = 0
-        self.input_lod = []
-        self.merge_repeated = False
-        self.padding_value = 0
-        self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
-                               [0, 7, 7, 7, 0, 0]]).reshape(
-                                   [3, 6]).astype("int32")
-        self.input_length = np.array([[6], [5],
-                                      [4]]).reshape([3, 1]).astype("int32")
-
-
-class TestCTCAlignOpCase5(TestCTCAlignPaddingOp):
-    def config(self):
-        self.op_type = "ctc_align"
-        self.blank = 0
-        self.input_lod = []
-        self.merge_repeated = False
-        self.padding_value = 1
-        self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
-                               [0, 7, 1, 7, 0, 0]]).reshape(
-                                   [3, 6]).astype("int32")
-        self.input_length = np.array([[6], [5],
-                                      [4]]).reshape([3, 1]).astype("int32")
-
-
-class TestCTCAlignOpApi(unittest.TestCase):
-    def test_api(self):
-        x = fluid.layers.data('x', shape=[4], dtype='float32')
-        y = fluid.layers.ctc_greedy_decoder(x, blank=0)
-
-        x_pad = fluid.layers.data('x_pad', shape=[4, 4], dtype='float32')
-        x_pad_len = fluid.layers.data('x_pad_len', shape=[1], dtype='int64')
-        y_pad, y_pad_len = fluid.layers.ctc_greedy_decoder(
-            x_pad, blank=0, input_length=x_pad_len)
-
-        place = fluid.CPUPlace()
-        x_tensor = fluid.create_lod_tensor(
-            np.random.rand(8, 4).astype("float32"), [[4, 4]], place)
-
-        x_pad_tensor = np.random.rand(2, 4, 4).astype("float32")
-        x_pad_len_tensor = np.array([[4], [4]]).reshape([2, 1]).astype("int64")
-
-        exe = fluid.Executor(place)
-
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={
-            'x': x_tensor,
-            'x_pad': x_pad_tensor,
-            'x_pad_len': x_pad_len_tensor
-        },
-                      fetch_list=[y, y_pad, y_pad_len],
-                      return_numpy=False)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
deleted file mode 100644
index 13a4eacece8a211513d6537db0d09b80c238178e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSumOp1(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.attrs = {'axis': 2}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=2)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSumOp2(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.attrs = {'axis': -1, 'reverse': True}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {
-            'Out': np.flip(
-                np.flip(
-                    self.inputs['X'], axis=2).cumsum(axis=2), axis=2)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSumOp3(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.attrs = {'axis': 1}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSumOp4(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.attrs = {'axis': 0}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSumOp5(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.inputs = {'X': np.random.random((5, 6)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSumOp7(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.inputs = {'X': np.random.random((6)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSumOp8(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.attrs = {'axis': 2, "exclusive": True}
-        a = np.random.random((5, 6, 3)).astype("float64")
-        self.inputs = {'X': a}
-        self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (5, 6, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cvm_op.py b/python/paddle/fluid/tests/unittests/test_cvm_op.py
deleted file mode 100644
index 69bc0b66510fefb2f7ae0d34a206bac2d47a1a84..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cvm_op.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from math import log
-from math import exp
-from op_test import OpTest
-import unittest
-
-
-def cvm_compute(X, item_width, use_cvm):
-    cvm_offset = 0 if use_cvm else 2
-    batch_size = X.shape[0]
-
-    Y = np.ones([batch_size, item_width - cvm_offset], np.float32)
-
-    for idx in range(batch_size):
-        if use_cvm:
-            Y[idx] = X[idx]
-            Y[idx][0] = log(Y[idx][0] + 1)
-            Y[idx][1] = log(Y[idx][1] + 1) - Y[idx][0]
-        else:
-            Y[idx] = X[idx][2:]
-
-    return Y
-
-
-def cvm_grad_compute(DY, CVM, item_width, use_cvm):
-    batch_size = DY.shape[0]
-    DX = np.ones([batch_size, item_width], np.float32)
-
-    for idx in range(batch_size):
-        DX[idx][0] = CVM[idx][0]
-        DX[idx][1] = CVM[idx][1]
-
-        if use_cvm:
-            DX[idx][2:] = DY[idx][2:]
-        else:
-            DX[idx][2:] = DY[idx]
-    return DX
-
-
-class TestCVMOpWithLodTensor(OpTest):
-    """
-        Test cvm op with discrete one-hot labels.
-    """
-
-    def setUp(self):
-        self.op_type = "cvm"
-        self.use_cvm = True
-
-        batch_size = 8
-        dims = 11
-
-        lod = [[1]]
-        self.inputs = {
-            'X': (np.random.uniform(0, 1, [1, dims]).astype("float32"), lod),
-            'CVM': np.array([[0.6, 0.4]]).astype("float32"),
-        }
-        self.attrs = {'use_cvm': False}
-        out = []
-        for index, emb in enumerate(self.inputs["X"][0]):
-            out.append(emb[2:])
-        self.outputs = {'Y': (np.array(out), lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestCVMOpWithOutLodTensor1(OpTest):
-    """
-    Test cvm op with discrete one-hot labels.
-    """
-
-    def setUp(self):
-        self.op_type = "cvm"
-        self.use_cvm = True
-
-        batch_size = 2
-        item_width = 11
-
-        input = np.random.uniform(0, 1,
-                                  (batch_size, item_width)).astype('float32')
-        output = cvm_compute(input, item_width, self.use_cvm)
-        cvm = np.array([[0.6, 0.4]]).astype("float32")
-
-        self.inputs = {'X': input, 'CVM': cvm}
-        self.attrs = {'use_cvm': self.use_cvm}
-        self.outputs = {'Y': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestCVMOpWithOutLodTensor2(OpTest):
-    """
-    Test cvm op with discrete one-hot labels.
-    """
-
-    def setUp(self):
-        self.op_type = "cvm"
-        self.use_cvm = False
-
-        batch_size = 2
-        item_width = 11
-
-        input = np.random.uniform(0, 1,
-                                  (batch_size, item_width)).astype('float32')
-        output = cvm_compute(input, item_width, self.use_cvm)
-        cvm = np.array([[0.6, 0.4]]).astype("float32")
-
-        self.inputs = {'X': input, 'CVM': cvm}
-        self.attrs = {'use_cvm': self.use_cvm}
-        self.outputs = {'Y': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cyclic_cifar_dataset.py b/python/paddle/fluid/tests/unittests/test_cyclic_cifar_dataset.py
deleted file mode 100644
index 01a588c4058a4cc17bfc1d3e593086809ac893c9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cyclic_cifar_dataset.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import unittest
-
-
-class TestCifar10(unittest.TestCase):
-    def test_main(self):
-        reader = paddle.dataset.cifar.train10(cycle=False)
-        sample_num = 0
-        for _ in reader():
-            sample_num += 1
-
-        cyclic_reader = paddle.dataset.cifar.train10(cycle=True)
-        read_num = 0
-        for data in cyclic_reader():
-            read_num += 1
-            self.assertEquals(len(data), 2)
-            if read_num == sample_num * 2:
-                break
-
-        self.assertEquals(read_num, sample_num * 2)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_data_norm_op.py b/python/paddle/fluid/tests/unittests/test_data_norm_op.py
deleted file mode 100644
index 0273664d5d793c67ffee709ed3ab5d265879c997..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_data_norm_op.py
+++ /dev/null
@@ -1,203 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""This is unit test of Test data_norm Op."""
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import paddle.fluid as fluid
-from op_test import OpTest
-from paddle.fluid.framework import grad_var_name
-
-
-def _reference_testing(x, batch_size, batch_sum, batch_square_sum):
-    x_shape = x.shape
-    means_arr = batch_sum / batch_size
-    scales_arr = np.sqrt(batch_size / batch_square_sum)
-    for i in range(x_shape[0]):
-        x[i] -= means_arr
-        x[i] *= scales_arr
-    y = np.array(x)
-    return y
-
-
-def create_or_get_tensor(scope, var_name, var, place):
-    tensor = scope.var(var_name).get_tensor()
-    if var is not None:
-        assert isinstance(var, np.ndarray)
-        tensor.set_recursive_sequence_lengths([])
-        tensor.set(var, place)
-    return tensor
-
-
-class TestDataNormOpInference(unittest.TestCase):
-    """
-    test class for data norm op
-    test forward
-    """
-
-    def setUp(self):
-        """
-        init members of this class
-        """
-        self.dtype = np.float32
-        self.use_mkldnn = False
-
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
-
-    def check_with_place(self, place, data_layout, dtype, shape):
-        """
-        do forward and check
-
-        Args:
-            place(Place): CPUPlace
-            data_layout(str): NCHW or NWHC
-            dtype(dtype): np.float32
-            shape(list): input shape
-
-        """
-        epsilon = 0.00001
-        if len(shape) == 2:
-            x_shape = shape
-            c = x_shape[1]
-        else:
-            ValueError("len(shape) should be equal to 2")
-        scale_shape = [c]
-
-        x_val = np.random.random_sample(x_shape).astype(dtype)
-        x_val = x_val - 0.5
-        batch_size = np.ones(scale_shape).astype(np.float32)
-        batch_size *= 1e4
-        batch_sum = np.zeros(scale_shape).astype(np.float32)
-        batch_square_sum = np.ones(scale_shape).astype(np.float32)
-        batch_square_sum *= 1e4
-
-        y_out = _reference_testing(x_val, batch_size, batch_sum,
-                                   batch_square_sum).astype(dtype)
-
-        scope = core.Scope()
-
-        # create input
-        x_tensor = create_or_get_tensor(scope, "x_val",
-                                        OpTest.np_dtype_to_fluid_dtype(x_val),
-                                        place)
-        batch_size_tensor = create_or_get_tensor(
-            scope, "batch_size",
-            OpTest.np_dtype_to_fluid_dtype(batch_size), place)
-        batch_sum_tensor = create_or_get_tensor(
-            scope, "batch_sum",
-            OpTest.np_dtype_to_fluid_dtype(batch_sum), place)
-        batch_square_sum_tensor = create_or_get_tensor(
-            scope, "batch_square_sum",
-            OpTest.np_dtype_to_fluid_dtype(batch_square_sum), place)
-
-        # create output
-        y_tensor = create_or_get_tensor(scope, "y_out", None, place)
-        mean_tensor = create_or_get_tensor(scope, "mean", None, place)
-        scales_tensor = create_or_get_tensor(scope, "scales", None, place)
-
-        data_norm_op = Operator(
-            "data_norm",
-            # inputs
-            X="x_val",
-            BatchSize="batch_size",
-            BatchSum="batch_sum",
-            BatchSquareSum="batch_square_sum",
-            # outputs
-            Y="y_out",
-            Means="mean",
-            Scales="scales",
-            # attrs
-            epsilon=epsilon,
-            use_mkldnn=self.use_mkldnn)
-
-        data_norm_op.run(scope, place)
-
-        # check inference result
-        self.__assert_close(
-            y_tensor,
-            y_out,
-            "inference output are different at " + str(place) + ", " +
-            data_layout + ", " + str(np.dtype(dtype)) +
-            str(np.array(y_tensor)) + str(y_out),
-            atol=1e-3)
-
-    def test_check_output(self):
-        """
-        test check forward, check output
-        """
-        places = [core.CPUPlace()]
-        for place in places:
-            for data_format in ["NCHW", "NHWC"]:
-                self.check_with_place(place, data_format, self.dtype, [2, 3])
-
-
-class TestDataNormOp(OpTest):
-    """
-    test class for data norm op
-    test forward and backward
-    """
-
-    def setUp(self):
-        """
-        init data norm op test env
-        """
-        self.op_type = 'data_norm'
-        self.use_mkldnn = False
-        epsilon = 0.00001
-        x_shape = [2, 3]
-        scale_shape = [3]
-        tp = np.float32
-
-        x_val = np.array([[-0.35702616, -0.42756206, -0.08306625],
-                          [0.41199666, -0.21719968, -0.10180971]]).astype(tp)
-        batch_size = np.ones(scale_shape).astype(tp)
-        batch_size *= 1e4
-        batch_sum = np.zeros(scale_shape).astype(tp)
-        batch_square_sum = np.ones(scale_shape).astype(tp)
-        batch_square_sum *= 1e4
-
-        y = np.array(x_val)
-
-        mean = np.array([[0, 0, 0], [0, 0, 0]]).astype(tp)
-        scale = np.array([[1, 1, 1], [1, 1, 1]]).astype(tp)
-
-        self.inputs = {
-            "X": x_val,
-            "BatchSize": batch_size,
-            "BatchSum": batch_sum,
-            "BatchSquareSum": batch_square_sum
-        }
-        self.outputs = {"Y": y, "Means": mean, "Scales": scale}
-        self.attrs = {"epsilon": epsilon, "use_mkldnn": self.use_mkldnn}
-
-    def test_check_output(self):
-        """
-        test check forward, check output
-        """
-        self.check_output()
-
-    def test_check_grad(self):
-        """
-        test check backward, check grad
-        """
-        self.check_grad(['X'], 'Y', no_grad_set=set([]))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
deleted file mode 100644
index 8bfa88dc2c75b74060e0640664e6e73fff6b5144..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ /dev/null
@@ -1,414 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-TestCases for Dataset,
-including create, config, run, etc.
-"""
-
-from __future__ import print_function
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import numpy as np
-import os
-import shutil
-import unittest
-
-
-class TestDataset(unittest.TestCase):
-    """  TestCases for Dataset. """
-
-    def setUp(self):
-        self.use_data_loader = False
-        self.epoch_num = 10
-        self.drop_last = False
-
-    def test_dataset_create(self):
-        """ Testcase for dataset create. """
-        try:
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-        except:
-            self.assertTrue(False)
-
-        try:
-            dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-        except:
-            self.assertTrue(False)
-
-        try:
-            dataset = fluid.DatasetFactory().create_dataset(
-                "FileInstantDataset")
-        except:
-            self.assertTrue(False)
-
-        try:
-            dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset")
-            self.assertTrue(False)
-        except:
-            self.assertTrue(True)
-
-    def test_config(self):
-        """
-        Testcase for python config.
-        """
-        dataset = fluid.InMemoryDataset()
-        dataset.set_parse_ins_id(True)
-        dataset.set_parse_content(True)
-        self.assertTrue(dataset.parse_ins_id)
-        self.assertTrue(dataset.parse_content)
-
-    def test_run_with_dump(self):
-        """
-        Testcase for InMemoryDataset from create to run.
-        """
-        with open("test_run_with_dump_a.txt", "w") as f:
-            data = "1 a 1 a 1 1 2 3 3 4 5 5 5 5 1 1\n"
-            data += "1 b 1 b 1 2 2 3 4 4 6 6 6 6 1 2\n"
-            data += "1 c 1 c 1 3 2 3 5 4 7 7 7 7 1 3\n"
-            f.write(data)
-        with open("test_run_with_dump_b.txt", "w") as f:
-            data = "1 d 1 d 1 4 2 3 3 4 5 5 5 5 1 4\n"
-            data += "1 e 1 e 1 5 2 3 4 4 6 6 6 6 1 5\n"
-            data += "1 f 1 f 1 6 2 3 5 4 7 7 7 7 1 6\n"
-            data += "1 g 1 g 1 7 2 3 6 4 8 8 8 8 1 7\n"
-            f.write(data)
-
-        slots = ["slot1", "slot2", "slot3", "slot4"]
-        slots_vars = []
-        for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
-            slots_vars.append(var)
-
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
-        dataset.set_filelist(
-            ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"])
-        dataset.set_parse_ins_id(True)
-        dataset.set_parse_content(True)
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
-        dataset.load_into_memory()
-        dataset.set_fea_eval(10000, True)
-        dataset.local_shuffle()
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(fluid.default_startup_program())
-        for i in range(2):
-            try:
-                exe.train_from_dataset(fluid.default_main_program(), dataset)
-            except ImportError as e:
-                pass
-            except Exception as e:
-                self.assertTrue(False)
-
-        os.remove("./test_run_with_dump_a.txt")
-        os.remove("./test_run_with_dump_b.txt")
-
-    def test_dataset_config(self):
-        """ Testcase for dataset configuration. """
-        dataset = fluid.core.Dataset("MultiSlotDataset")
-        dataset.set_thread_num(12)
-        dataset.set_filelist(["a.txt", "b.txt", "c.txt"])
-        dataset.set_trainer_num(4)
-        dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
-
-        thread_num = dataset.get_thread_num()
-        self.assertEqual(thread_num, 12)
-
-        filelist = dataset.get_filelist()
-        self.assertEqual(len(filelist), 3)
-        self.assertEqual(filelist[0], "a.txt")
-        self.assertEqual(filelist[1], "b.txt")
-        self.assertEqual(filelist[2], "c.txt")
-
-        trainer_num = dataset.get_trainer_num()
-        self.assertEqual(trainer_num, 4)
-
-        name, ugi = dataset.get_hdfs_config()
-        self.assertEqual(name, "my_fs_name")
-        self.assertEqual(ugi, "my_fs_ugi")
-
-    def test_in_memory_dataset_run(self):
-        """
-        Testcase for InMemoryDataset from create to run.
-        """
-        with open("test_in_memory_dataset_run_a.txt", "w") as f:
-            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
-            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
-            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
-            f.write(data)
-        with open("test_in_memory_dataset_run_b.txt", "w") as f:
-            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
-            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
-            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
-            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
-            f.write(data)
-
-        slots = ["slot1", "slot2", "slot3", "slot4"]
-        slots_vars = []
-        for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
-            slots_vars.append(var)
-
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
-        dataset.set_filelist([
-            "test_in_memory_dataset_run_a.txt",
-            "test_in_memory_dataset_run_b.txt"
-        ])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
-        dataset.load_into_memory()
-        dataset.set_fea_eval(10000, True)
-        dataset.slots_shuffle(["slot1"])
-        dataset.local_shuffle()
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(fluid.default_startup_program())
-        if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
-            for i in range(self.epoch_num):
-                for data in data_loader():
-                    exe.run(fluid.default_main_program(), feed=data)
-        else:
-            for i in range(self.epoch_num):
-                try:
-                    exe.train_from_dataset(fluid.default_main_program(),
-                                           dataset)
-                except Exception as e:
-                    self.assertTrue(False)
-
-        os.remove("./test_in_memory_dataset_run_a.txt")
-        os.remove("./test_in_memory_dataset_run_b.txt")
-
-    def test_in_memory_dataset_run_2(self):
-        """
-        Testcase for InMemoryDataset from create to run.
-        Use CUDAPlace
-        Use float type id
-        """
-        with open("test_in_memory_dataset_run_a.txt", "w") as f:
-            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
-            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
-            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
-            f.write(data)
-        with open("test_in_memory_dataset_run_b.txt", "w") as f:
-            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
-            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
-            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
-            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
-            f.write(data)
-
-        slots = ["slot1_f", "slot2_f", "slot3_f", "slot4_f"]
-        slots_vars = []
-        for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="float32", lod_level=1)
-            slots_vars.append(var)
-
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
-        dataset.set_filelist([
-            "test_in_memory_dataset_run_a.txt",
-            "test_in_memory_dataset_run_b.txt"
-        ])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
-        dataset.load_into_memory()
-        dataset.local_shuffle()
-
-        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0))
-        exe.run(fluid.default_startup_program())
-
-        for i in range(2):
-            try:
-                exe.train_from_dataset(fluid.default_main_program(), dataset)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=1)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=2)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=2)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=3)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=4)
-            except ImportError as e:
-                pass
-            except Exception as e:
-                self.assertTrue(False)
-
-        if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
-            for i in range(self.epoch_num):
-                for data in data_loader():
-                    exe.run(fluid.default_main_program(), feed=data)
-        else:
-            for i in range(self.epoch_num):
-                try:
-                    exe.train_from_dataset(fluid.default_main_program(),
-                                           dataset)
-                except Exception as e:
-                    self.assertTrue(False)
-
-        dataset.set_merge_by_lineid(slots_vars)
-        dataset.set_fleet_send_sleep_seconds(2)
-        dataset.preload_into_memory()
-        dataset.wait_preload_done()
-        dataset.release_memory()
-        dataset.preload_into_memory(1)
-        dataset.wait_preload_done()
-        fleet_ptr = fluid.core.Fleet()
-        fleet_ptr.set_client2client_config(1, 1, 1)
-
-        os.remove("./test_in_memory_dataset_run_a.txt")
-        os.remove("./test_in_memory_dataset_run_b.txt")
-
-    def test_queue_dataset_run(self):
-        """
-        Testcase for QueueDataset from create to run.
-        """
-        with open("test_queue_dataset_run_a.txt", "w") as f:
-            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
-            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
-            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
-            f.write(data)
-        with open("test_queue_dataset_run_b.txt", "w") as f:
-            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
-            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
-            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
-            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
-            f.write(data)
-
-        slots = ["slot1", "slot2", "slot3", "slot4"]
-        slots_vars = []
-        for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
-            slots_vars.append(var)
-
-        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
-        dataset.set_filelist(
-            ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(fluid.default_startup_program())
-        if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
-            for i in range(self.epoch_num):
-                for data in data_loader():
-                    exe.run(fluid.default_main_program(), feed=data)
-        else:
-            for i in range(self.epoch_num):
-                try:
-                    exe.train_from_dataset(fluid.default_main_program(),
-                                           dataset)
-                except Exception as e:
-                    self.assertTrue(False)
-
-        dataset2 = fluid.DatasetFactory().create_dataset("QueueDataset")
-        dataset2.set_use_var(slots_vars)
-        dataset2.set_batch_size(32)
-        dataset2.set_thread(3)
-        dataset2.set_pipe_command("cat")
-        dataset.set_filelist([])
-        try:
-            exe.train_from_dataset(fluid.default_main_program(), dataset2)
-        except ImportError as e:
-            print("warning: we skip trainer_desc_pb2 import problem in windows")
-        except Exception as e:
-            self.assertTrue(False)
-
-        os.remove("./test_queue_dataset_run_a.txt")
-        os.remove("./test_queue_dataset_run_b.txt")
-
-    def test_queue_dataset_run_2(self):
-        """
-        Testcase for QueueDataset from create to run.
-        Use CUDAPlace
-        Use float type id
-        """
-        with open("test_queue_dataset_run_a.txt", "w") as f:
-            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
-            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
-            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
-            f.write(data)
-        with open("test_queue_dataset_run_b.txt", "w") as f:
-            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
-            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
-            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
-            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
-            f.write(data)
-
-        slots = ["slot1_f", "slot2_f", "slot3_f", "slot4_f"]
-        slots_vars = []
-        for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="float32", lod_level=1)
-            slots_vars.append(var)
-
-        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
-        dataset.set_filelist(
-            ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
-
-        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0))
-        exe.run(fluid.default_startup_program())
-        if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
-            for i in range(self.epoch_num):
-                for data in data_loader():
-                    exe.run(fluid.default_main_program(), feed=data)
-        else:
-            for i in range(self.epoch_num):
-                try:
-                    exe.train_from_dataset(fluid.default_main_program(),
-                                           dataset)
-                except Exception as e:
-                    self.assertTrue(False)
-
-        os.remove("./test_queue_dataset_run_a.txt")
-        os.remove("./test_queue_dataset_run_b.txt")
-
-
-class TestDatasetWithDataLoader(TestDataset):
-    def setUp(self):
-        self.use_data_loader = True
-        self.epoch_num = 10
-        self.drop_last = False
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
deleted file mode 100644
index 10aefbb222bb029c48648ce27ead4666b15dfc4d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import numpy as np
-import six
-import os
-import unittest
-from simple_nets import simple_fc_net_with_inputs
-
-BATCH_SIZE = 32
-BATCH_NUM = 10
-EPOCH_NUM = 4
-
-IMAGE_SHAPE = [2, 3]
-LABEL_SHAPE = [1]
-
-ALL_WRITTEN_FILES = set()
-
-
-def get_place_string(p):
-    if isinstance(p, (fluid.CPUPlace or fluid.CUDAPlace)):
-        tmp = fluid.core.Place()
-        tmp.set_place(p)
-        p = tmp
-
-    if p._type() == fluid.CPUPlace()._type():
-        return 'CPUPlace()'
-    else:
-        return 'CUDAPlace()'
-
-
-def remove_all_written_files():
-    for filename in ALL_WRITTEN_FILES:
-        os.remove(filename)
-
-
-def write_reader_data_to_file(filename, reader):
-    ALL_WRITTEN_FILES.add(filename)
-    with open(filename, 'w') as fid:
-        for instance_list in reader():
-            for i, instance in enumerate(instance_list):
-                instance = np.reshape(instance, [instance.size, ])
-                fid.write(str(instance.size) + ' ')
-                fid.write(' '.join(map(str, instance)))
-                fid.write(' ')
-
-            fid.write('\n')
-
-
-def fake_reader(batch_size=BATCH_SIZE, batch_num=BATCH_NUM):
-    def __reader__():
-        iteration = BATCH_SIZE * BATCH_NUM
-        iteration = int(iteration + BATCH_SIZE / 2)
-        for _ in six.moves.range(iteration):
-            image = np.random.random(size=IMAGE_SHAPE).astype('float32')
-            label = np.random.random_integers(
-                size=LABEL_SHAPE, low=0, high=9).astype('int64')
-            yield image, label
-
-    return __reader__
-
-
-class DatasetLoaderTestBase(unittest.TestCase):
-    def setUp(self):
-        self.dataset_name = "QueueDataset"
-        self.drop_last = False
-
-    def tearDown(self):
-        return
-        remove_all_written_files()
-
-    def build_network(self):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.layers.data(
-                name='image', shape=IMAGE_SHAPE, dtype='float32')
-            label = fluid.layers.data(
-                name='label', shape=LABEL_SHAPE, dtype='int64')
-
-            simple_fc_net_with_inputs(image, label)
-
-        return main_prog, startup_prog, [image, label]
-
-    def check_batch_number(self, place, randomize_batch_num=False):
-        main_prog, startup_prog, feeds = self.build_network()
-        dataset = fluid.DatasetFactory().create_dataset(self.dataset_name)
-        dataset.set_batch_size(BATCH_SIZE)
-
-        if isinstance(place, fluid.CPUPlace):
-            file_num = 10
-            os.environ['CPU_NUM'] = str(file_num)
-            places = fluid.cpu_places()
-            use_cuda = False
-        else:
-            file_num = fluid.core.get_cuda_device_count()
-            places = fluid.cuda_places()
-            use_cuda = True
-
-        filelist = []
-        if file_num > 1 and randomize_batch_num:
-            random_delta_batch_size = np.random.random_integers(
-                low=-BATCH_NUM / 2, high=BATCH_NUM / 2, size=[file_num])
-            random_delta_batch_size[-1] = -int(
-                np.sum(random_delta_batch_size[0:-1]))
-        else:
-            random_delta_batch_size = np.zeros(shape=[file_num])
-
-        for i in six.moves.range(file_num):
-            filename = 'dataset_test_{}.txt'.format(i)
-            filelist.append(filename)
-            write_reader_data_to_file(
-                filename,
-                fake_reader(batch_num=BATCH_NUM + random_delta_batch_size[i]))
-
-        dataset.set_filelist(filelist)
-        dataset.set_use_var(feeds)
-        dataset.set_pipe_command("cat")
-        if self.dataset_name == 'InMemoryDataset':
-            dataset.load_into_memory()
-
-        dataloader = fluid.io.DataLoader.from_dataset(
-            dataset=dataset, places=places, drop_last=self.drop_last)
-        prog = fluid.CompiledProgram(main_prog).with_data_parallel()
-        exe = fluid.Executor(place)
-
-        exe.run(startup_prog)
-
-        for _ in six.moves.range(EPOCH_NUM):
-            has_complete_batch = False
-            for batch_id, data in enumerate(dataloader):
-                self.assertEquals(len(places), len(data))
-                for idx, data_on_each_device in enumerate(data):
-                    image = data_on_each_device["image"]
-                    label = data_on_each_device["label"]
-
-                    if self.drop_last:
-                        batch_size = BATCH_SIZE
-                    else:
-                        if batch_id == BATCH_NUM:
-                            batch_size = BATCH_SIZE / 2
-                        else:
-                            batch_size = BATCH_SIZE
-
-                    self.assertEquals(image.shape()[1:], IMAGE_SHAPE)
-                    self.assertTrue(
-                        image._place()._equals(places[idx]),
-                        msg=get_place_string(image._place()) + ' vs ' +
-                        get_place_string(places[idx]))
-                    if self.drop_last:
-                        self.assertEquals(image.shape()[0], BATCH_SIZE)
-                    else:
-                        self.assertTrue(image.shape()[0] == BATCH_SIZE or
-                                        image.shape()[0] == BATCH_SIZE / 2)
-
-                    self.assertEquals(label.shape()[1:], LABEL_SHAPE)
-                    self.assertTrue(label._place()._equals(places[idx]))
-                    if self.drop_last:
-                        self.assertEquals(label.shape()[0], BATCH_SIZE)
-                    else:
-                        self.assertTrue(label.shape()[0] == BATCH_SIZE or
-                                        label.shape()[0] == BATCH_SIZE / 2)
-
-                    self.assertEquals(image.shape()[0], label.shape()[0])
-
-                    if image.shape()[0] == BATCH_SIZE:
-                        has_complete_batch = True
-
-                exe.run(prog, feed=data)
-
-            self.assertTrue(has_complete_batch)
-
-    def get_all_places(self):
-        p = [fluid.CPUPlace()]
-        if fluid.is_compiled_with_cuda():
-            p.append(fluid.CUDAPlace(0))
-        return p
-
-    def test_batch_number_with_same_length_files(self):
-        for p in self.get_all_places():
-            with fluid.scope_guard(fluid.Scope()):
-                self.check_batch_number(place=p, randomize_batch_num=False)
-
-    def test_batch_number_with_different_length_files(self):
-        for p in self.get_all_places():
-            with fluid.scope_guard(fluid.Scope()):
-                self.check_batch_number(place=p, randomize_batch_num=True)
-
-
-class QueueDatasetTestWithoutDropLast(DatasetLoaderTestBase):
-    def setUp(self):
-        self.dataset_name = "QueueDataset"
-        self.drop_last = True
-
-
-class InMemoryDatasetTestWithoutDropLast(DatasetLoaderTestBase):
-    def setUp(self):
-        self.dataset_name = "InMemoryDataset"
-        self.drop_last = False
-
-
-class InMemoryDatasetTestWithDropLast(DatasetLoaderTestBase):
-    def setUp(self):
-        self.dataset_name = "InMemoryDataset"
-        self.drop_last = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py
deleted file mode 100644
index f4c9466d63a201ba9a5e77515ae64a33bedc5b23..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ /dev/null
@@ -1,62 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid import debugger
-from paddle.fluid.framework import Program
-
-
-class TestDebugger(unittest.TestCase):
-    def test_debug_str(self):
-        p = Program()
-        b = p.current_block()
-
-        #selected_rows
-        b.create_var(
-            name='selected_rows',
-            dtype="float32",
-            shape=[5, 10],
-            type=core.VarDesc.VarType.SELECTED_ROWS)
-
-        #tensor array
-        b.create_var(
-            name='tensor_array',
-            shape=[5, 10],
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
-
-        #operator
-        mul_x = b.create_parameter(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = b.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = b.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        b.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-
-        print(debugger.pprint_program_codes(p))
-
-        debugger.draw_block_graphviz(p.block(0), path="./test.dot")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py
deleted file mode 100644
index a664a1529f4de1f372241319b57fad6b0ba8b8a2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestDecayedAdagradOp1(OpTest):
-    ''' Test DecayedAdagrad operator with explicit attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "decayed_adagrad"
-
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-        lr = 0.01
-        decay = 0.80
-        epsilon = 1e-8
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment': moment,
-            'LearningRate': np.array([lr]).astype("float32")
-        }
-
-        self.attrs = {'decay': decay, 'epsilon': epsilon}
-
-        moment_out = decay * moment + (1 - decay) * grad * grad
-        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
-
-        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestDecayedAdagradOp2(OpTest):
-    ''' Test DecayedAdagrad operator with default attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "decayed_adagrad"
-
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-        lr = 0.01
-        decay = 0.95
-        epsilon = 1e-6
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment': moment,
-            'LearningRate': np.array([lr]).astype("float32")
-        }
-
-        self.attrs = {'decay': decay, 'epsilon': epsilon}
-
-        moment_out = decay * moment + (1 - decay) * grad * grad
-        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
-
-        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
deleted file mode 100644
index a16f21c0f97c0902dd6c26561ed3f707b28ff947..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.fluid as fluid
-import numpy as np
-import time
-import six
-import unittest
-
-EPOCH_NUM = 20
-BATCH_SIZE = 32
-BATCH_NUM = 20
-CLASS_NUM = 10
-
-
-def random_reader():
-    np.random.seed(1)
-    for i in range(BATCH_SIZE * BATCH_NUM):
-        image = np.random.random([784])
-        label = np.random.random_integers(low=0, high=CLASS_NUM - 1)
-        yield image, label
-
-
-def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
-    startup_prog = fluid.Program()
-    main_prog = fluid.Program()
-    startup_prog.random_seed = 1
-    main_prog.random_seed = 1
-
-    with fluid.unique_name.guard():
-        with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.layers.data(
-                name='image', shape=[784], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            py_reader = fluid.io.PyReader(
-                feed_list=[image, label],
-                capacity=4,
-                iterable=not use_legacy_py_reader,
-                use_double_buffer=use_double_buffer)
-            hidden = image
-            for hidden_size in [10, 20, 30]:
-                hidden = fluid.layers.fc(
-                    hidden,
-                    size=hidden_size,
-                    act='tanh',
-                    bias_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.Constant(value=1.0)))
-
-            predict_label = fluid.layers.fc(hidden,
-                                            size=CLASS_NUM,
-                                            act='softmax')
-            loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=predict_label, label=label))
-
-            optimizer = fluid.optimizer.Adam()
-            optimizer.minimize(loss)
-    return startup_prog, main_prog, py_reader, loss
-
-
-class TestBase(unittest.TestCase):
-    def run_main(self, use_legacy_py_reader, with_data_parallel, places,
-                 use_double_buffer):
-        scope = fluid.Scope()
-        with fluid.scope_guard(scope):
-            startup_prog, main_prog, py_reader, loss = simple_fc_net(
-                places, use_legacy_py_reader, use_double_buffer)
-
-            reader = paddle.batch(random_reader, batch_size=BATCH_SIZE)
-
-            ps = places if use_double_buffer else fluid.cpu_places(len(places))
-
-            py_reader.decorate_sample_list_generator(
-                reader, places=ps if py_reader.iterable else None)
-
-            exe = fluid.Executor(place=places[0])
-            exe.run(startup_prog)
-
-            prog = fluid.CompiledProgram(main_prog)
-            if with_data_parallel:
-                prog = prog.with_data_parallel(
-                    loss_name=loss.name, places=places)
-
-            step = 0
-            step_list = []
-            loss_list = []
-            start_t = time.time()
-            if not py_reader.iterable:
-                for _ in six.moves.range(EPOCH_NUM):
-                    step = 0
-                    py_reader.start()
-                    while True:
-                        try:
-                            L, = exe.run(program=prog,
-                                         fetch_list=[loss],
-                                         use_program_cache=True)
-                            loss_list.append(np.mean(L))
-                            step += 1
-                        except fluid.core.EOFException:
-                            py_reader.reset()
-                            break
-                    step_list.append(step)
-            else:
-                for _ in six.moves.range(EPOCH_NUM):
-                    step = 0
-                    for d in py_reader():
-                        assert len(d) == len(places)
-                        for i, item in enumerate(d):
-                            image = item['image']
-                            label = item['label']
-                            assert image.shape() == [BATCH_SIZE, 784]
-                            assert label.shape() == [BATCH_SIZE, 1]
-                            assert image._place()._equals(ps[i])
-                            assert label._place()._equals(ps[i])
-                        L, = exe.run(program=prog,
-                                     feed=d,
-                                     fetch_list=[loss],
-                                     use_program_cache=True)
-                        loss_list.append(np.mean(L))
-                        step += 1
-                    step_list.append(step)
-            end_t = time.time()
-            ret = {
-                "time": end_t - start_t,
-                "step": step_list,
-                "loss": np.array(loss_list)
-            }
-            return ret
-
-    def prepare_places(self, with_data_parallel, with_cpu=True, with_gpu=True):
-        places = []
-        if with_cpu:
-            places.append([fluid.CPUPlace()])
-            if with_data_parallel:
-                places.append([fluid.CPUPlace()] * 2)
-
-        if with_gpu and fluid.core.is_compiled_with_cuda():
-            tmp = fluid.cuda_places()
-            assert len(tmp) > 0, "no gpu detected"
-            if with_data_parallel:
-                places.append(tmp)
-            places.append([tmp[0]])
-        return places
-
-    def test_main(self):
-        for with_data_parallel in [True, False]:
-            for p in self.prepare_places(with_data_parallel):
-                for use_double_buffer in [False, True]:
-                    results = []
-                    for use_legacy_py_reader in [False, True]:
-                        ret = self.run_main(
-                            use_legacy_py_reader=use_legacy_py_reader,
-                            with_data_parallel=with_data_parallel,
-                            places=p,
-                            use_double_buffer=use_double_buffer)
-                        results.append(ret)
-                    if not use_double_buffer:
-                        diff = np.max(
-                            np.abs(results[0]['loss'] - results[1]['loss']))
-                        self.assertLess(diff, 1e-3)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py
deleted file mode 100644
index 4d767709ef56f11d6790c85206b544d63883841e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import paddle
-import numpy as np
-import unittest
-import six
-
-
-class TestClass(unittest.TestCase):
-    def setUp(self):
-        self.use_double_buffer = True
-        self.use_py_reader = True
-
-    def test_reader_data(self):
-        img_shape = [28, 31]
-        label_shape = [1]
-        batch_size = 32
-        batch_num = 10
-
-        def fake_reader():
-            for _ in six.moves.range(batch_size * batch_num):
-                img = np.random.random(size=img_shape).astype('float32')
-                label = np.random.random_integers(
-                    low=0, high=9, size=label_shape).astype('int64')
-                yield img, label
-
-        reader = fluid.io.cache(fake_reader)
-        batch_reader = fluid.io.batch(reader, batch_size=batch_size)
-
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-
-        for p in places:
-            main_prog = fluid.Program()
-            startup_prog = fluid.Program()
-            with fluid.program_guard(main_prog, startup_prog):
-                img = fluid.layers.data(
-                    shape=img_shape, dtype='float32', name='image')
-                label = fluid.layers.data(
-                    shape=label_shape, dtype='int64', name='label')
-
-                feeder = fluid.DataFeeder(feed_list=[img, label], place=p)
-
-                use_double_buffer = self.use_double_buffer
-                if p._type() != fluid.CPUPlace()._type(
-                ) and not use_double_buffer:
-                    use_double_buffer = True
-
-                if self.use_py_reader:
-                    py_reader = fluid.io.PyReader(
-                        feed_list=[img, label],
-                        capacity=4,
-                        iterable=True,
-                        use_double_buffer=use_double_buffer)
-                    py_reader.decorate_sample_list_generator(
-                        batch_reader, places=p)
-                else:
-                    py_reader = fluid.io.DataLoader.from_generator(
-                        feed_list=[img, label],
-                        capacity=4,
-                        iterable=True,
-                        use_double_buffer=use_double_buffer
-                    ).set_sample_list_generator(
-                        batch_reader, places=p)
-
-                for break_beforehand in [True, False]:
-                    for epoch_id in six.moves.range(10):
-                        gen = batch_reader()
-                        batch_id = 0
-                        for d in py_reader():
-                            feed = feeder.feed(next(gen))
-                            I1, L1 = feed['image'], feed['label']
-                            I2, L2 = d[0]['image'], d[0]['label']
-
-                            I1 = np.array(I1)
-                            I2 = np.array(I2)
-                            L1 = np.array(L1)
-                            L2 = np.array(L2)
-
-                            self.assertTrue(np.array_equal(I1, I2))
-                            self.assertTrue(np.array_equal(L1, L2))
-
-                            batch_id += 1
-                            if break_beforehand and batch_id >= int(batch_num /
-                                                                    2):
-                                break
-
-                        if break_beforehand:
-                            self.assertTrue(next(gen, None) is not None)
-                        else:
-                            self.assertTrue(next(gen, None) is None)
-
-
-class TestClass2(TestClass):
-    def setUp(self):
-        self.use_double_buffer = False
-        self.use_py_reader = True
-
-
-class TestClass3(TestClass):
-    def setUp(self):
-        self.use_double_buffer = True
-        self.use_py_reader = False
-
-
-class TestClass4(TestClass):
-    def setUp(self):
-        self.use_double_buffer = False
-        self.use_py_reader = False
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
deleted file mode 100644
index 01a7b6824885b32e922a8eb34f5d8117ee3e584f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from paddle.fluid.default_scope_funcs import *
-import unittest
-
-
-class TestDefaultScopeFuncs(unittest.TestCase):
-    def test_cur_scope(self):
-        self.assertIsNotNone(get_cur_scope())
-
-    def test_none_variable(self):
-        self.assertIsNone(find_var("test"))
-
-    def test_create_var_get_var(self):
-        var_a = var("var_a")
-        self.assertIsNotNone(var_a)
-        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
-        enter_local_scope()
-        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
-        leave_local_scope()
-
-    def test_var_get_int(self):
-        def __new_scope__():
-            i = var("var_i")
-            self.assertFalse(i.is_int())
-            i.set_int(10)
-            self.assertTrue(i.is_int())
-            self.assertEqual(10, i.get_int())
-
-        for _ in range(10):
-            scoped_function(__new_scope__)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py
deleted file mode 100644
index db3b2a8f96380fd9f469dcf4cd265e1cc4b41436..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py
+++ /dev/null
@@ -1,281 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-def dmc_bilinear(data_im, height, width, h, w):
-    h_low = int(np.floor(h))
-    w_low = int(np.floor(w))
-    h_high = h_low + 1
-    w_high = w_low + 1
-
-    lh = h - h_low
-    lw = w - w_low
-    hh = 1 - lh
-    hw = 1 - lw
-
-    v1 = 0
-    if h_low >= 0 and w_low >= 0:
-        v1 = data_im[h_low, w_low]
-    v2 = 0
-    if h_low >= 0 and w_high <= width - 1:
-        v2 = data_im[h_low, w_high]
-    v3 = 0
-    if h_high <= height - 1 and w_low >= 0:
-        v3 = data_im[h_high, w_low]
-    v4 = 0
-    if h_high <= height - 1 and w_high <= width - 1:
-        v4 = data_im[h_high, w_high]
-
-    w1, w2, w3, w4 = hh * hw, hh * lw, lh * hw, lh * lw
-    val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4
-
-    return val
-
-
-def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param):
-    in_n, in_c, in_h, in_w = input.shape
-    out_c, f_c, f_h, f_w = filter.shape
-
-    assert offset.shape == (in_n, 2 * f_h * f_w, in_h, in_w)
-    assert mask.shape == (in_n, f_h * f_w, in_h, in_w)
-    assert f_c * group == in_c
-    assert np.mod(out_c, group) == 0
-
-    stride, pad, dilation = conv_param['stride'], conv_param['pad'],\
-        conv_param['dilation']
-    out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0]
-    out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1]
-    assert out_h == in_h
-    assert out_w == in_w
-
-    col_buffer = np.zeros((in_n, in_c * f_h * f_w, in_h * in_w))
-    for n in range(in_n):
-        for c in range(in_c):
-            for h in range(out_h):
-                for w in range(out_w):
-                    for kh in range(f_h):
-                        for kw in range(f_w):
-                            offset_h_table = \
-                                    offset[n, ::2, h, w].reshape(f_h, f_w)
-                            offset_w_table = \
-                                    offset[n, 1::2, h, w].reshape(f_h, f_w)
-                            mask_table = \
-                                mask[n, :, h, w].reshape(f_h, f_w)
-                            offset_h = offset_h_table[kh, kw]
-                            offset_w = offset_w_table[kh, kw]
-                            val = 0
-                            im_h = h * stride[0] + kh * dilation[0] \
-                                + offset_h - pad[0]
-                            im_w = w * stride[0] + kw * dilation[0] \
-                                + offset_w - pad[1]
-                            if im_h > -1 and im_w > -1 and \
-                                im_h < in_h and im_w < in_h:
-                                val = dmc_bilinear(input[n, c], in_h, in_w,
-                                                   im_h, im_w)
-                            val_out = val * mask_table[kh, kw]
-                            col_buffer[n, c * f_h * f_w + kh * f_w + kw, h *
-                                       in_w + w] = val_out
-
-    out = np.zeros((in_n, group, int(out_c // group), out_h * out_w))
-    weight = filter.reshape(group, int(out_c // group), f_c * f_h * f_w)
-    col_buffer = col_buffer.reshape(
-        (in_n, group, int(in_c // group * f_h * f_w), in_h * in_w))
-    for n in range(in_n):
-        for g in range(group):
-            out[n, g] = np.matmul(weight[g], col_buffer[n, g])
-    out = out.reshape(in_n, out_c, out_h, out_w)
-    return out
-
-
-class TestModulatedDeformableConvOp(OpTest):
-    def setUp(self):
-        self.op_type = "deformable_conv"
-        self.dtype = np.float32
-        self.init_group()
-        self.init_dilation()
-        self.init_test_case()
-
-        conv_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilation': self.dilations
-        }
-
-        input = np.random.random(self.input_size).astype(self.dtype)
-        offset = 10 * np.random.random(self.offset_size).astype(self.dtype)
-        mask = 10 * np.random.random(self.mask_size).astype(self.dtype)
-        filter = np.random.random(self.filter_size).astype(self.dtype)
-
-        output = dconv_im2col_gemm(input, offset, mask, filter, self.groups,
-                                   conv_param)
-        output = output.astype(self.dtype)
-
-        self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'Offset': OpTest.np_dtype_to_fluid_dtype(offset),
-            'Mask': OpTest.np_dtype_to_fluid_dtype(mask),
-            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
-        }
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'groups': self.groups,
-            'deformable_groups': self.deformable_groups,
-            'im2col_step': self.im2col_step,
-            'dilations': self.dilations,
-        }
-        self.outputs = {'Output': output}
-
-    def test_check_output(self):
-        self.check_output(atol=1e-5)
-
-    def test_check_grad(self):
-        self.check_grad(
-            {'Input', 'Offset', 'Mask', 'Filter'},
-            'Output',
-            max_relative_error=0.05)
-
-    def test_check_grad_no_filter(self):
-        self.check_grad(
-            ['Input', 'Offset', 'Mask'],
-            'Output',
-            max_relative_error=0.1,
-            no_grad_set=set(['Filter']))
-
-    def test_check_grad_no_input(self):
-        self.check_grad(
-            ['Filter', 'Offset', 'Mask'],
-            'Output',
-            max_relative_error=0.1,
-            no_grad_set=set(['Input']))
-
-    def test_check_grad_no_offset_no_mask(self):
-        self.check_grad(
-            ['Input', 'Filter'],
-            'Output',
-            max_relative_error=0.1,
-            no_grad_set=set(['Offset', 'Mask']))
-
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.input_size = [2, 4, 4, 4]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [4, f_c, 3, 3]
-        self.im2col_step = 1
-        self.deformable_groups = 1
-        offset_c = 2 * self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        mask_c = self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        self.offset_size = [
-            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
-        ]
-        self.mask_size = [
-            self.input_size[0], mask_c, self.input_size[2], self.input_size[3]
-        ]
-
-    def init_dilation(self):
-        self.dilations = [1, 1]
-
-    def init_group(self):
-        self.groups = 1
-
-
-class TestWithStride(TestModulatedDeformableConvOp):
-    def init_test_case(self):
-        self.pad = [3, 3]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.im2col_step = 1
-        self.deformable_groups = 1
-        offset_c = 2 * self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        mask_c = self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        self.offset_size = [
-            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
-        ]
-        self.mask_size = [
-            self.input_size[0], mask_c, self.input_size[2], self.input_size[3]
-        ]
-
-
-class TestWithDilation(TestModulatedDeformableConvOp):
-    def init_test_case(self):
-        self.pad = [2, 2]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 4, 4]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.im2col_step = 1
-        self.deformable_groups = 1
-        offset_c = 2 * self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        mask_c = self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        self.offset_size = [
-            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
-        ]
-        self.mask_size = [
-            self.input_size[0], mask_c, self.input_size[2], self.input_size[3]
-        ]
-
-    def init_dilation(self):
-        self.dilations = [2, 2]
-
-
-class TestWith1x1(TestModulatedDeformableConvOp):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 1, 1]
-        self.im2col_step = 1
-        self.deformable_groups = 1
-        offset_c = 2 * self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        mask_c = self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        self.offset_size = [
-            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
-        ]
-        self.mask_size = [
-            self.input_size[0], mask_c, self.input_size[2], self.input_size[3]
-        ]
-
-
-class TestWithGroup(TestModulatedDeformableConvOp):
-    def init_group(self):
-        self.groups = 2
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
deleted file mode 100644
index 5646f72f7297485855b8d65d98a94f6709712563..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ /dev/null
@@ -1,240 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-def dmc_bilinear(data_im, height, width, h, w):
-    h_low = int(np.floor(h))
-    w_low = int(np.floor(w))
-    h_high = h_low + 1
-    w_high = w_low + 1
-
-    lh = h - h_low
-    lw = w - w_low
-    hh = 1 - lh
-    hw = 1 - lw
-
-    v1 = 0
-    if h_low >= 0 and w_low >= 0:
-        v1 = data_im[h_low, w_low]
-    v2 = 0
-    if h_low >= 0 and w_high <= width - 1:
-        v2 = data_im[h_low, w_high]
-    v3 = 0
-    if h_high <= height - 1 and w_low >= 0:
-        v3 = data_im[h_high, w_low]
-    v4 = 0
-    if h_high <= height - 1 and w_high <= width - 1:
-        v4 = data_im[h_high, w_high]
-
-    w1, w2, w3, w4 = hh * hw, hh * lw, lh * hw, lh * lw
-    val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4
-
-    return val
-
-
-def dconv_im2col_gemm(input, offset, filter, group, conv_param):
-    in_n, in_c, in_h, in_w = input.shape
-    out_c, f_c, f_h, f_w = filter.shape
-
-    assert offset.shape == (in_n, 2 * f_h * f_w, in_h, in_w)
-    assert f_c * group == in_c
-    assert np.mod(out_c, group) == 0
-
-    stride, pad, dilation = conv_param['stride'], conv_param['pad'],\
-        conv_param['dilation']
-    out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0]
-    out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1]
-    assert out_h == in_h
-    assert out_w == in_w
-
-    col_buffer = np.zeros((in_n, in_c * f_h * f_w, in_h * in_w))
-    for n in range(in_n):
-        for c in range(in_c):
-            for h in range(out_h):
-                for w in range(out_w):
-                    for kh in range(f_h):
-                        for kw in range(f_w):
-                            offset_h_table = \
-                                    offset[n, ::2, h, w].reshape(f_h, f_w)
-                            offset_w_table = \
-                                    offset[n, 1::2, h, w].reshape(f_h, f_w)
-                            offset_h = offset_h_table[kh, kw]
-                            offset_w = offset_w_table[kh, kw]
-                            val = 0
-                            im_h = h * stride[0] + kh * dilation[0] \
-                                + offset_h - pad[0]
-                            im_w = w * stride[0] + kw * dilation[0] \
-                                + offset_w - pad[1]
-                            if im_h > -1 and im_w > -1 and \
-                                im_h < in_h and im_w < in_h:
-                                val = dmc_bilinear(input[n, c], in_h, in_w,
-                                                   im_h, im_w)
-                            val_out = val
-
-                            col_buffer[n, c * f_h * f_w + kh * f_w + kw, h *
-                                       in_w + w] = val_out
-
-    out = np.zeros((in_n, group, int(out_c // group), out_h * out_w))
-    weight = filter.reshape(group, int(out_c // group), f_c * f_h * f_w)
-    col_buffer = col_buffer.reshape(
-        (in_n, group, int(in_c // group * f_h * f_w), in_h * in_w))
-    for n in range(in_n):
-        for g in range(group):
-            out[n, g] = np.matmul(weight[g], col_buffer[n, g])
-    out = out.reshape(in_n, out_c, out_h, out_w)
-    return out
-
-
-class TestModulatedDeformableConvOp(OpTest):
-    def setUp(self):
-        self.op_type = "deformable_conv_v1"
-        self.dtype = np.float32
-        self.init_group()
-        self.init_dilation()
-        self.init_test_case()
-
-        conv_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilation': self.dilations
-        }
-
-        input = np.random.random(self.input_size).astype(self.dtype)
-        offset = 10 * np.random.random(self.offset_size).astype(self.dtype)
-        filter = np.random.random(self.filter_size).astype(self.dtype)
-
-        output = dconv_im2col_gemm(input, offset, filter, self.groups,
-                                   conv_param)
-        output = output.astype(self.dtype)
-        self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'Offset': OpTest.np_dtype_to_fluid_dtype(offset),
-            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
-        }
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'groups': self.groups,
-            'deformable_groups': self.deformable_groups,
-            'im2col_step': self.im2col_step,
-            'dilations': self.dilations,
-        }
-        self.outputs = {'Output': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['Input', 'Offset', 'Filter'], 'Output', max_relative_error=0.05)
-
-    def test_check_grad_no_filter(self):
-        self.check_grad(
-            ['Input', 'Offset'],
-            'Output',
-            max_relative_error=0.1,
-            no_grad_set=set(['Filter']))
-
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.input_size = [2, 4, 4, 4]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [4, f_c, 3, 3]
-        self.im2col_step = 1
-        self.deformable_groups = 1
-        offset_c = 2 * self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        self.offset_size = [
-            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
-        ]
-
-    def init_dilation(self):
-        self.dilations = [1, 1]
-
-    def init_group(self):
-        self.groups = 1
-
-
-class TestWithStride(TestModulatedDeformableConvOp):
-    def init_test_case(self):
-        self.pad = [3, 3]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.im2col_step = 1
-        self.deformable_groups = 1
-        offset_c = 2 * self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        self.offset_size = [
-            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
-        ]
-
-
-class TestWithDilation(TestModulatedDeformableConvOp):
-    def init_test_case(self):
-        self.pad = [2, 2]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 4, 4]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.im2col_step = 1
-        self.deformable_groups = 1
-        offset_c = 2 * self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        self.offset_size = [
-            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
-        ]
-
-    def init_dilation(self):
-        self.dilations = [2, 2]
-
-
-class TestWith1x1(TestModulatedDeformableConvOp):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 1, 1]
-        self.im2col_step = 1
-        self.deformable_groups = 1
-        offset_c = 2 * self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        self.offset_size = [
-            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
-        ]
-
-
-class TestWithGroup(TestModulatedDeformableConvOp):
-    def init_group(self):
-        self.groups = 2
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py b/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py
deleted file mode 100644
index 6aa408e5d72c63bb97d165698a086fb0d437b185..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def set_input(input, rois, trans):
-    inputs = {'Input': input, "ROIs": rois, "Trans": trans}
-    return inputs
-
-
-def set_attrs(no_trans, spatial_scale, output_channels, group_size,
-              pooled_height, pooled_width, part_size, sample_per_part,
-              trans_std):
-    attrs = {
-        'no_trans': no_trans,
-        'spatial_scale': spatial_scale,
-        'output_dim': output_channels,
-        'group_size': group_size,
-        'pooled_height': pooled_height,
-        'pooled_width': pooled_width,
-        'part_size': part_size,
-        'sample_per_part': sample_per_part,
-        'trans_std': trans_std
-    }
-    return attrs
-
-
-def set_outputs(output, top_count):
-    outputs = {
-        'Output': output.astype('float32'),
-        'TopCount': top_count.astype('float32')
-    }
-    return outputs
-
-
-class TestDeformablePSROIPoolOp(OpTest):
-    def set_data(self):
-        self.start_test1()
-        self.start_test2()
-        self.start_test3()
-        self.start_test4()
-
-    def start_test1(self):
-        self.init_test_case1()
-        self.make_rois()
-        self.calc_deformable_psroi_pooling()
-
-        inputs = self.input
-        rois = (self.rois[:, 1:5], self.rois_lod)
-        trans = self.trans
-        self.inputs = set_input(inputs, rois, trans)
-
-        no_trans = self.no_trans
-        spatial_scale = self.spatial_scale
-        output_channels = self.output_channels
-        group_size = self.group_size
-        pooled_height = self.pooled_height
-        pooled_width = self.pooled_width
-        part_size = self.part_size
-        sample_per_part = self.sample_per_part
-        trans_std = self.trans_std
-
-        self.attrs = set_attrs(no_trans, spatial_scale, output_channels,
-                               group_size, pooled_height, pooled_width,
-                               part_size, sample_per_part, trans_std)
-
-        output = self.out.astype('float32')
-        top_count = self.top_count.astype('float32')
-        self.outputs = set_outputs(output, top_count)
-
-    def start_test2(self):
-        self.init_test_case2()
-        self.make_rois()
-        self.calc_deformable_psroi_pooling()
-
-        inputs = self.input
-        rois = (self.rois[:, 1:5], self.rois_lod)
-        trans = self.trans
-        self.inputs = set_input(inputs, rois, trans)
-
-        no_trans = self.no_trans
-        spatial_scale = self.spatial_scale
-        output_channels = self.output_channels
-        group_size = self.group_size
-        pooled_height = self.pooled_height
-        pooled_width = self.pooled_width
-        part_size = self.part_size
-        sample_per_part = self.sample_per_part
-        trans_std = self.trans_std
-
-        self.attrs = set_attrs(no_trans, spatial_scale, output_channels,
-                               group_size, pooled_height, pooled_width,
-                               part_size, sample_per_part, trans_std)
-
-        output = self.out.astype('float32')
-        top_count = self.top_count.astype('float32')
-        self.outputs = set_outputs(output, top_count)
-
-    def start_test3(self):
-        self.init_test_case3()
-        self.make_rois()
-        self.calc_deformable_psroi_pooling()
-
-        inputs = self.input
-        rois = (self.rois[:, 1:5], self.rois_lod)
-        trans = self.trans
-        self.inputs = set_input(inputs, rois, trans)
-
-        no_trans = self.no_trans
-        spatial_scale = self.spatial_scale
-        output_channels = self.output_channels
-        group_size = self.group_size
-        pooled_height = self.pooled_height
-        pooled_width = self.pooled_width
-        part_size = self.part_size
-        sample_per_part = self.sample_per_part
-        trans_std = self.trans_std
-
-        self.attrs = set_attrs(no_trans, spatial_scale, output_channels,
-                               group_size, pooled_height, pooled_width,
-                               part_size, sample_per_part, trans_std)
-
-        output = self.out.astype('float32')
-        top_count = self.top_count.astype('float32')
-        self.outputs = set_outputs(output, top_count)
-
-    def start_test4(self):
-        self.init_test_case4()
-        self.make_rois()
-        self.calc_deformable_psroi_pooling()
-
-        inputs = self.input
-        rois = (self.rois[:, 1:5], self.rois_lod)
-        trans = self.trans
-        self.inputs = set_input(inputs, rois, trans)
-
-        no_trans = self.no_trans
-        spatial_scale = self.spatial_scale
-        output_channels = self.output_channels
-        group_size = self.group_size
-        pooled_height = self.pooled_height
-        pooled_width = self.pooled_width
-        part_size = self.part_size
-        sample_per_part = self.sample_per_part
-        trans_std = self.trans_std
-
-        self.attrs = set_attrs(no_trans, spatial_scale, output_channels,
-                               group_size, pooled_height, pooled_width,
-                               part_size, sample_per_part, trans_std)
-
-        output = self.out.astype('float32')
-        top_count = self.top_count.astype('float32')
-        self.outputs = set_outputs(output, top_count)
-
-    def init_test_case1(self):
-        self.batch_size = 3
-        self.channels = 3 * 2 * 2
-        self.height = 12
-        self.width = 12
-        self.input_dim = [
-            self.batch_size, self.channels, self.height, self.width
-        ]
-        self.no_trans = False
-        self.spatial_scale = 1.0 / 4.0
-        self.output_channels = 12
-        self.group_size = [1, 1]
-        self.pooled_height = 4
-        self.pooled_width = 4
-        self.part_size = [4, 4]
-        self.sample_per_part = 2
-        self.trans_std = 0.1
-        self.input = np.random.random(self.input_dim).astype('float32')
-
-    def init_test_case2(self):
-        self.batch_size = 2
-        self.channels = 3 * 2 * 2
-        self.height = 12
-        self.width = 12
-        self.input_dim = [
-            self.batch_size, self.channels, self.height, self.width
-        ]
-        self.no_trans = True
-        self.spatial_scale = 1.0 / 2.0
-        self.output_channels = 12
-        self.group_size = [1, 1]
-        self.pooled_height = 7
-        self.pooled_width = 7
-        self.part_size = [7, 7]
-        self.sample_per_part = 4
-        self.trans_std = 0.1
-        self.input = np.random.random(self.input_dim).astype('float32')
-
-    def init_test_case3(self):
-        self.batch_size = 2
-        self.channels = 3 * 2 * 2
-        self.height = 12
-        self.width = 12
-        self.input_dim = [
-            self.batch_size, self.channels, self.height, self.width
-        ]
-        self.no_trans = False
-        self.spatial_scale = 1.0 / 4.0
-        self.output_channels = 12
-        self.group_size = [1, 1]
-        self.pooled_height = 3
-        self.pooled_width = 3
-        self.part_size = [3, 3]
-        self.sample_per_part = 3
-        self.trans_std = 0.2
-        self.input = np.random.random(self.input_dim).astype('float32')
-
-    def init_test_case4(self):
-        self.batch_size = 2
-        self.channels = 3 * 2 * 2
-        self.height = 12
-        self.width = 12
-        self.input_dim = [
-            self.batch_size, self.channels, self.height, self.width
-        ]
-        self.no_trans = True
-        self.spatial_scale = 1.0 / 2.0
-        self.output_channels = 12
-        self.group_size = [1, 1]
-        self.pooled_height = 6
-        self.pooled_width = 2
-        self.part_size = [6, 6]
-        self.sample_per_part = 6
-        self.trans_std = 0.4
-        self.input = np.random.random(self.input_dim).astype('float32')
-
-    def make_rois(self):
-        rois = []
-        self.rois_lod = [[]]
-        for bno in range(self.batch_size):
-            self.rois_lod[0].append(bno + 1)
-            for i in range(bno + 1):
-                x_1 = np.random.random_integers(
-                    0, self.width // self.spatial_scale - self.pooled_width)
-                y_1 = np.random.random_integers(
-                    0, self.height // self.spatial_scale - self.pooled_height)
-                x_2 = np.random.random_integers(
-                    x_1 + self.pooled_width, self.width // self.spatial_scale)
-                y_2 = np.random.random_integers(
-                    y_1 + self.pooled_height, self.height // self.spatial_scale)
-                roi = [bno, x_1, y_1, x_2, y_2]
-                rois.append(roi)
-        self.rois_num = len(rois)
-        self.rois = np.array(rois).astype("float32")
-
-    def dmc_bilinear(self, data_im, p_h, p_w):
-        h_low = int(np.floor(p_h))
-        w_low = int(np.floor(p_w))
-        h_high = h_low + 1
-        w_high = w_low + 1
-        l_h = p_h - h_low
-        l_w = p_w - w_low
-        h_h = 1 - l_h
-        h_w = 1 - l_w
-        v_1 = 0
-        if h_low >= 0 and w_low >= 0:
-            v_1 = data_im[h_low, w_low]
-        v_2 = 0
-        if h_low >= 0 and w_high <= self.width - 1:
-            v_2 = data_im[h_low, w_high]
-        v_3 = 0
-        if h_high <= self.height - 1 and w_low >= 0:
-            v_3 = data_im[h_high, w_low]
-        v_4 = 0
-        if h_high <= self.height - 1 and w_high <= self.width - 1:
-            v_4 = data_im[h_high, w_high]
-        w_1, w_2, w_3, w_4 = h_h * h_w, h_h * l_w, l_h * h_w, l_h * l_w
-        val = w_1 * v_1 + w_2 * v_2 + w_3 * v_3 + w_4 * v_4
-        return val
-
-    def calc_deformable_psroi_pooling(self):
-        output_shape = (self.rois_num, self.output_channels, self.pooled_height,
-                        self.pooled_width)
-        self.out = np.zeros(output_shape)
-        self.trans = np.random.rand(self.rois_num, 2, self.part_size[0],
-                                    self.part_size[1]).astype('float32')
-        self.top_count = np.random.random((output_shape)).astype('float32')
-        count = self.rois_num * self.output_channels * self.pooled_height * self.pooled_width
-        for index in range(count):
-            p_w = int(index % self.pooled_width)
-            p_h = int(index / self.pooled_width % self.pooled_height)
-            ctop = int(index / self.pooled_width / self.pooled_height %
-                       self.output_channels)
-            n_out = int(index / self.pooled_width / self.pooled_height /
-                        self.output_channels)
-            roi = self.rois[n_out]
-            roi_batch_id = int(roi[0])
-            roi_start_w = int(np.round(roi[1])) * self.spatial_scale - 0.5
-            roi_start_h = int(np.round(roi[2])) * self.spatial_scale - 0.5
-            roi_end_w = int(np.round(roi[3] + 1)) * self.spatial_scale - 0.5
-            roi_end_h = int(np.round(roi[4] + 1)) * self.spatial_scale - 0.5
-            roi_width = max(roi_end_w - roi_start_w, 0.1)
-            roi_height = max(roi_end_h - roi_start_h, 0.1)
-            bin_size_h = float(roi_height) / float(self.pooled_height)
-            bin_size_w = float(roi_width) / float(self.pooled_width)
-            sub_bin_size_h = bin_size_h / self.sample_per_part
-            sub_bin_size_w = bin_size_w / self.sample_per_part
-            part_h = int(np.floor(p_h) / self.pooled_height * self.part_size[0])
-            part_w = int(np.floor(p_w) / self.pooled_width * self.part_size[1])
-            if self.no_trans:
-                trans_x = 0
-                trans_y = 0
-            else:
-                trans_x = self.trans[n_out][0][part_h][part_w] * self.trans_std
-                trans_y = self.trans[n_out][1][part_h][part_w] * self.trans_std
-            wstart = p_w * bin_size_w + roi_start_w
-            wstart = wstart + trans_x * roi_width
-            hstart = p_h * bin_size_h + roi_start_h
-            hstart = hstart + trans_y * roi_height
-            sum = 0
-            num_sample = 0
-            g_w = np.floor(p_w * self.group_size[0] / self.pooled_height)
-            g_h = np.floor(p_h * self.group_size[1] / self.pooled_width)
-            g_w = min(max(g_w, 0), self.group_size[0] - 1)
-            g_h = min(max(g_h, 0), self.group_size[1] - 1)
-            input_i = self.input[roi_batch_id]
-            for i_w in range(self.sample_per_part):
-                for i_h in range(self.sample_per_part):
-                    w_sample = wstart + i_w * sub_bin_size_w
-                    h_sample = hstart + i_h * sub_bin_size_h
-                    if w_sample < -0.5 or w_sample > self.width - 0.5 or \
-                    h_sample < -0.5 or h_sample > self.height - 0.5:
-                        continue
-                    w_sample = min(max(w_sample, 0.), self.width - 1.)
-                    h_sample = min(max(h_sample, 0.), self.height - 1.)
-                    c_sample = int((ctop * self.group_size[0] + g_h) *
-                                   self.group_size[1] + g_w)
-                    val = self.dmc_bilinear(input_i[c_sample], h_sample,
-                                            w_sample)
-                    sum = sum + val
-                    num_sample = num_sample + 1
-            if num_sample == 0:
-                self.out[n_out][ctop][p_h][p_w] = 0
-            else:
-                self.out[n_out][ctop][p_h][p_w] = sum / num_sample
-            self.top_count[n_out][ctop][p_h][p_w] = num_sample
-
-    def setUp(self):
-        self.op_type = "deformable_psroi_pooling"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Input'], 'Output')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
deleted file mode 100644
index 4b0bc1dcf85fbb384eea09ee286d35ec248aae70..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-from op_test import OpTest
-
-
-class TestDensityPriorBoxOp(OpTest):
-    def set_data(self):
-        self.init_test_params()
-        self.init_test_input()
-        self.init_test_output()
-        self.inputs = {'Input': self.input, 'Image': self.image}
-
-        self.attrs = {
-            'variances': self.variances,
-            'clip': self.clip,
-            'step_w': self.step_w,
-            'step_h': self.step_h,
-            'offset': self.offset,
-            'densities': self.densities,
-            'fixed_sizes': self.fixed_sizes,
-            'fixed_ratios': self.fixed_ratios,
-            'flatten_to_2d': self.flatten_to_2d
-        }
-        self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "density_prior_box"
-        self.set_data()
-
-    def set_density(self):
-        self.densities = [4, 2, 1]
-        self.fixed_sizes = [32.0, 64.0, 128.0]
-        self.fixed_ratios = [1.0]
-        self.layer_w = 17
-        self.layer_h = 17
-        self.image_w = 533
-        self.image_h = 533
-        self.flatten_to_2d = False
-
-    def init_test_params(self):
-        self.set_density()
-
-        self.step_w = float(self.image_w) / float(self.layer_w)
-        self.step_h = float(self.image_h) / float(self.layer_h)
-
-        self.input_channels = 2
-        self.image_channels = 3
-        self.batch_size = 10
-
-        self.variances = [0.1, 0.1, 0.2, 0.2]
-        self.variances = np.array(self.variances, dtype=np.float).flatten()
-
-        self.clip = True
-        self.num_priors = 0
-        if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
-            for density in self.densities:
-                if len(self.fixed_ratios) > 0:
-                    self.num_priors += len(self.fixed_ratios) * (pow(density,
-                                                                     2))
-        self.offset = 0.5
-
-    def init_test_input(self):
-        self.image = np.random.random(
-            (self.batch_size, self.image_channels, self.image_w,
-             self.image_h)).astype('float32')
-
-        self.input = np.random.random(
-            (self.batch_size, self.input_channels, self.layer_w,
-             self.layer_h)).astype('float32')
-
-    def init_test_output(self):
-        out_dim = (self.layer_h, self.layer_w, self.num_priors, 4)
-        out_boxes = np.zeros(out_dim).astype('float32')
-        out_var = np.zeros(out_dim).astype('float32')
-
-        step_average = int((self.step_w + self.step_h) * 0.5)
-        for h in range(self.layer_h):
-            for w in range(self.layer_w):
-                idx = 0
-                c_x = (w + self.offset) * self.step_w
-                c_y = (h + self.offset) * self.step_h
-                # Generate density prior boxes with fixed size
-                for density, fixed_size in zip(self.densities,
-                                               self.fixed_sizes):
-                    if (len(self.fixed_ratios) > 0):
-                        for ar in self.fixed_ratios:
-                            shift = int(step_average / density)
-                            box_width_ratio = fixed_size * math.sqrt(ar)
-                            box_height_ratio = fixed_size / math.sqrt(ar)
-                            for di in range(density):
-                                for dj in range(density):
-                                    c_x_temp = c_x - step_average / 2.0 + shift / 2.0 + dj * shift
-                                    c_y_temp = c_y - step_average / 2.0 + shift / 2.0 + di * shift
-                                    out_boxes[h, w, idx, :] = [
-                                        max((c_x_temp - box_width_ratio / 2.0) /
-                                            self.image_w, 0),
-                                        max((c_y_temp - box_height_ratio / 2.0)
-                                            / self.image_h, 0),
-                                        min((c_x_temp + box_width_ratio / 2.0) /
-                                            self.image_w, 1),
-                                        min((c_y_temp + box_height_ratio / 2.0)
-                                            / self.image_h, 1)
-                                    ]
-                                    idx += 1
-        if self.clip:
-            out_boxes = np.clip(out_boxes, 0.0, 1.0)
-        out_var = np.tile(self.variances,
-                          (self.layer_h, self.layer_w, self.num_priors, 1))
-        self.out_boxes = out_boxes.astype('float32')
-        self.out_var = out_var.astype('float32')
-        if self.flatten_to_2d:
-            self.out_boxes = self.out_boxes.reshape((-1, 4))
-            self.out_var = self.out_var.reshape((-1, 4))
-
-
-class TestDensityPriorBox(TestDensityPriorBoxOp):
-    def set_density(self):
-        self.densities = [3, 4]
-        self.fixed_sizes = [1.0, 2.0]
-        self.fixed_ratios = [1.0]
-        self.layer_w = 32
-        self.layer_h = 32
-        self.image_w = 40
-        self.image_h = 40
-        self.flatten_to_2d = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py b/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py
deleted file mode 100644
index c3a21ba0bcbb656ccbf6945e778b0f80f18045c6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import unittest
-from simple_nets import simple_fc_net
-
-
-class DeprecatedMemoryOptimizationInterfaceTest(unittest.TestCase):
-    def setUp(self):
-        self.method = fluid.memory_optimize
-
-    def build_network(self, call_interface):
-        startup_prog = fluid.Program()
-        main_prog = fluid.Program()
-        with fluid.program_guard(main_prog, startup_prog):
-            with fluid.unique_name.guard():
-                loss = simple_fc_net()
-                opt = fluid.optimizer.Adam(learning_rate=1e-3)
-                opt.minimize(loss)
-
-                if call_interface:
-                    self.method(main_prog)
-
-        return main_prog
-
-    def assert_program_equal(self, prog1, prog2):
-        block_num = prog1.num_blocks
-        self.assertEquals(block_num, prog2.num_blocks)
-
-        for block_id in range(block_num):
-            block1 = prog1.block(block_id)
-            block2 = prog2.block(block_id)
-            self.assertEquals(len(block1.ops), len(block2.ops))
-            for op1, op2 in zip(block1.ops, block2.ops):
-                self.assertEquals(op1.input_arg_names, op2.input_arg_names)
-                self.assertEquals(op1.output_arg_names, op2.output_arg_names)
-
-            self.assertEquals(len(block1.vars), len(block2.vars))
-            for var1 in block1.vars.values():
-                self.assertTrue(var1.name in block2.vars)
-                var2 = block2.vars.get(var1.name)
-                self.assertEquals(var1.name, var2.name)
-
-    def test_main(self):
-        prog1 = self.build_network(False)
-        prog2 = self.build_network(True)
-        self.assert_program_equal(prog1, prog2)
-
-
-class ReleaseMemoryTest(DeprecatedMemoryOptimizationInterfaceTest):
-    def setUp(self):
-        self.method = fluid.release_memory
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py
deleted file mode 100644
index 82e704169e4b828549dab4c47b0fa46d9afd8f7e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ /dev/null
@@ -1,203 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import math
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-from paddle.fluid import core
-import unittest
-from multiprocessing import Process
-import os
-import signal
-import six
-import collections
-
-SEED = 1
-DTYPE = "float32"
-paddle.dataset.mnist.fetch()
-
-
-# random seed must set before configuring the network.
-# fluid.default_startup_program().random_seed = SEED
-def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=data,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-
-    # TODO(dzhwinter) : refine the initializer and random seed settting
-    SIZE = 10
-    input_shape = conv_pool_2.shape
-    param_shape = [six.moves.reduce(lambda a, b: a * b, input_shape[1:], 1)
-                   ] + [SIZE]
-    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
-
-    predict = fluid.layers.fc(
-        input=conv_pool_2,
-        size=SIZE,
-        act="softmax",
-        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.NormalInitializer(
-                loc=0.0, scale=scale)))
-    return predict
-
-
-def get_model(batch_size):
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    predict = cnn_model(images)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    inference_program = fluid.default_main_program().clone()
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
-
-    # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=batch_size)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=batch_size)
-    opt.minimize(avg_cost)
-    return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
-
-
-def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
-    t = fluid.DistributeTranspiler()
-    t.transpile(
-        trainer_id=trainer_id,
-        program=main_program,
-        pservers=pserver_endpoints,
-        trainers=trainers)
-    return t
-
-
-from paddle.fluid.transpiler.details import op_to_code
-
-
-def operator_equal(a, b):
-    if op_to_code(a) != op_to_code(b):
-        raise ValueError("In operator_equal not equal\n")
-
-    for k, v in six.iteritems(a.__dict__):
-        if isinstance(v, fluid.framework.Program) or \
-                isinstance(v, fluid.framework.Block):
-            continue
-
-        elif isinstance(v, core.OpDesc):
-            continue
-
-        elif isinstance(v, collections.OrderedDict):
-            v0 = sorted(list(six.iteritems(v)), key=lambda x: x[0])
-            v1 = sorted(list(six.iteritems(b.__dict__[k])), key=lambda x: x[0])
-
-            if v0 != v1:
-                raise ValueError("In operator_equal not equal:{0}\n".format(k))
-
-        elif (v != b.__dict__[k]):
-            raise ValueError("In operator_equal not equal:{0}\n".format(k))
-
-    return True
-
-
-def block_equal(a, b):
-    for k, v in six.iteritems(a.__dict__):
-        if isinstance(v, core.ProgramDesc) or isinstance(
-                v, fluid.framework.Program) or isinstance(v, core.BlockDesc):
-            continue
-
-        elif k == "ops":
-            assert (len(a.ops) == len(b.ops))
-            for i in range(0, len(a.ops)):
-                if not operator_equal(a.ops[i], b.ops[i]):
-                    raise ValueError("In block_equal not equal:{0}\n".format(k))
-
-        elif isinstance(v, collections.OrderedDict):
-            for key, value in six.iteritems(v):
-                if str(value) != str(b.__dict__[k][key]):
-                    raise ValueError("In block_equal not equal:{0}\n".format(k))
-
-        elif (v != b.__dict__[k]):
-            raise ValueError("In block_equal not equal:{0}\n".format(k))
-
-    return True
-
-
-def program_equal(a, b):
-    for k, v in six.iteritems(a.__dict__):
-        if isinstance(v, core.ProgramDesc):
-            continue
-
-        elif k == 'blocks':
-            for i in range(0, len(a.blocks)):
-                if not block_equal(a.blocks[i], b.blocks[i]):
-                    raise ValueError("In operator_equal not equal:{0}\n".format(
-                        k))
-                    return False
-            assert (len(a.blocks) == len(b.blocks))
-
-        elif (v != b.__dict__[k]):
-            raise ValueError("In program_equal not equal:{0}\n".format(k))
-
-    return True
-
-
-class TestDistMnist(unittest.TestCase):
-    def test_desc_clone(self):
-        get_model(batch_size=20)
-
-        pserver_endpoints = "127.0.0.1:9123"
-        trainers = 1
-        current_endpoint = "127.0.0.1:9123"
-        t = get_transpiler(0,
-                           fluid.default_main_program(), pserver_endpoints,
-                           trainers)
-
-        pserver_prog = t.get_pserver_program(current_endpoint)
-        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
-        main = pserver_prog.clone()
-        startup = startup_prog.clone()
-
-        self.assertTrue(program_equal(main, pserver_prog))
-        self.assertTrue(program_equal(startup, startup_prog))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_detach.py b/python/paddle/fluid/tests/unittests/test_detach.py
deleted file mode 100644
index 684fe3298e2bf3cbe530da80f48843ef56d6e30e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_detach.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import paddle.fluid as fluid
-
-from paddle.fluid import FC
-from paddle.fluid.dygraph import FC
-from paddle.fluid.dygraph.base import to_variable
-
-import unittest
-
-
-class Test_Detach(unittest.TestCase):
-    def generate_Data(self):
-        data = np.array(
-            [[1, 8, 3, 9], [7, 20, 9, 6], [4, 6, 8, 10]]).astype('float32')
-        return data
-
-    def no_detach_multi(self):
-        data = self.generate_Data()
-        with fluid.dygraph.guard():
-            fc_w_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(5.0))
-            fc_b_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(6.0))
-            fc = FC("fc",
-                    10,
-                    num_flatten_dims=1,
-                    param_attr=fc_w_param_attrs,
-                    bias_attr=fc_b_param_attrs)
-            fc1_w_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(7.0))
-            fc1_b_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(8.0))
-            fc1 = FC("fc",
-                     1,
-                     num_flatten_dims=1,
-                     param_attr=fc1_w_param_attrs,
-                     bias_attr=fc1_b_param_attrs)
-            fc2_w_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(9.0))
-            fc2_b_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(10.0))
-            fc2 = FC("fc",
-                     1,
-                     num_flatten_dims=1,
-                     param_attr=fc2_w_param_attrs,
-                     bias_attr=fc2_b_param_attrs)
-            data = to_variable(data)
-            x = fc(data)
-            x1 = fc1(x)
-            x2 = fc2(x)
-            loss = x1 + x2
-            # print(loss, loss.shape)
-            loss.backward()
-            return x.gradient()
-
-    def no_detach_single(self):
-        data = self.generate_Data()
-        with fluid.dygraph.guard():
-            fc_w_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(5.0))
-            fc_b_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(6.0))
-            fc = FC("fc",
-                    10,
-                    num_flatten_dims=1,
-                    param_attr=fc_w_param_attrs,
-                    bias_attr=fc_b_param_attrs)
-            fc1_w_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(7.0))
-            fc1_b_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(8.0))
-            fc1 = FC("fc",
-                     1,
-                     num_flatten_dims=1,
-                     param_attr=fc1_w_param_attrs,
-                     bias_attr=fc1_b_param_attrs)
-            data = to_variable(data)
-            x = fc(data)
-            x1 = fc1(x)
-            loss = x1
-            # print(loss, loss.shape)
-            loss.backward()
-            return x.gradient()
-
-    def detach_multi(self):
-        data = self.generate_Data()
-        with fluid.dygraph.guard():
-            fc_w_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(5.0))
-            fc_b_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(6.0))
-            fc = FC("fc",
-                    10,
-                    num_flatten_dims=1,
-                    param_attr=fc_w_param_attrs,
-                    bias_attr=fc_b_param_attrs)
-            fc1_w_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(7.0))
-            fc1_b_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(8.0))
-            fc1 = FC("fc",
-                     1,
-                     num_flatten_dims=1,
-                     param_attr=fc1_w_param_attrs,
-                     bias_attr=fc1_b_param_attrs)
-            fc2_w_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(9.0))
-            fc2_b_param_attrs = fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(10.0))
-            fc2 = FC("fc",
-                     1,
-                     num_flatten_dims=1,
-                     param_attr=fc2_w_param_attrs,
-                     bias_attr=fc2_b_param_attrs)
-            data = to_variable(data)
-            x = fc(data)
-            x_detach = x.detach()
-            x1 = fc1(x)
-            x2 = fc2(x_detach)
-            loss = x1 + x2
-            # print(loss, loss.shape)
-            loss.backward()
-            return x.gradient()
-
-    def test_NoDetachMulti_DetachMulti(self):
-        array_no_detach_multi = self.no_detach_multi()
-        array_detach_multi = self.detach_multi()
-
-        assert not np.array_equal(array_no_detach_multi, array_detach_multi)
-
-    def test_NoDetachSingle_DetachMulti(self):
-        array_no_detach_single = self.no_detach_single()
-        array_detach_multi = self.detach_multi()
-        assert np.array_equal(array_no_detach_single, array_detach_multi)
-
-    def test_detach_exception(self):
-        x = fluid.layers.data(name="a", shape=[3, 4], dtype='float32')
-        y = fluid.layers.fc(input=x, size=10, bias_attr=True)
-        try:
-            y_detach = y.detach()
-        except Exception as e:
-            assert type(e) == AttributeError
-            assert str(e) == 'static graph model DO NOT supprt detach'
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
deleted file mode 100644
index 0c5343a97d5ef0f97fc6b144dfc82174eacb8573..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ /dev/null
@@ -1,285 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import six
-import sys
-import collections
-import math
-import paddle.fluid as fluid
-from op_test import OpTest
-
-
-class TestDetectionMAPOp(OpTest):
-    def set_data(self):
-        self.class_num = 4
-        self.init_test_case()
-        self.mAP = [self.calc_map(self.tf_pos, self.tf_pos_lod)]
-        self.label = np.array(self.label).astype('float32')
-        self.detect = np.array(self.detect).astype('float32')
-        self.mAP = np.array(self.mAP).astype('float32')
-
-        if len(self.class_pos_count) > 0:
-            self.class_pos_count = np.array(self.class_pos_count).astype(
-                'int32')
-            self.true_pos = np.array(self.true_pos).astype('float32')
-            self.false_pos = np.array(self.false_pos).astype('float32')
-            self.has_state = np.array([1]).astype('int32')
-
-            self.inputs = {
-                'Label': (self.label, self.label_lod),
-                'DetectRes': (self.detect, self.detect_lod),
-                'HasState': self.has_state,
-                'PosCount': self.class_pos_count,
-                'TruePos': (self.true_pos, self.true_pos_lod),
-                'FalsePos': (self.false_pos, self.false_pos_lod)
-            }
-        else:
-            self.inputs = {
-                'Label': (self.label, self.label_lod),
-                'DetectRes': (self.detect, self.detect_lod),
-            }
-
-        self.attrs = {
-            'overlap_threshold': self.overlap_threshold,
-            'evaluate_difficult': self.evaluate_difficult,
-            'ap_type': self.ap_type,
-            'class_num': self.class_num
-        }
-
-        self.out_class_pos_count = np.array(self.out_class_pos_count).astype(
-            'int')
-        self.out_true_pos = np.array(self.out_true_pos).astype('float32')
-        self.out_false_pos = np.array(self.out_false_pos).astype('float32')
-
-        self.outputs = {
-            'MAP': self.mAP,
-            'AccumPosCount': self.out_class_pos_count,
-            'AccumTruePos': (self.out_true_pos, self.out_true_pos_lod),
-            'AccumFalsePos': (self.out_false_pos, self.out_false_pos_lod)
-        }
-
-    def init_test_case(self):
-        self.overlap_threshold = 0.3
-        self.evaluate_difficult = True
-        self.ap_type = "integral"
-
-        self.label_lod = [[2, 2]]
-        # label difficult xmin ymin xmax ymax
-        self.label = [[1, 0, 0.1, 0.1, 0.3, 0.3], [1, 1, 0.6, 0.6, 0.8, 0.8],
-                      [2, 0, 0.3, 0.3, 0.6, 0.5], [1, 0, 0.7, 0.1, 0.9, 0.3]]
-
-        # label score xmin ymin xmax ymax difficult
-        self.detect_lod = [[3, 4]]
-        self.detect = [
-            [1, 0.3, 0.1, 0.0, 0.4, 0.3], [1, 0.7, 0.0, 0.1, 0.2, 0.3],
-            [1, 0.9, 0.7, 0.6, 0.8, 0.8], [2, 0.8, 0.2, 0.1, 0.4, 0.4],
-            [2, 0.1, 0.4, 0.3, 0.7, 0.5], [1, 0.2, 0.8, 0.1, 1.0, 0.3],
-            [3, 0.2, 0.8, 0.1, 1.0, 0.3]
-        ]
-
-        # label score true_pos false_pos
-        self.tf_pos_lod = [[3, 4]]
-        self.tf_pos = [[1, 0.9, 1, 0], [1, 0.7, 1, 0], [1, 0.3, 0, 1],
-                       [1, 0.2, 1, 0], [2, 0.8, 0, 1], [2, 0.1, 1, 0],
-                       [3, 0.2, 0, 1]]
-
-        self.class_pos_count = []
-        self.true_pos_lod = [[]]
-        self.true_pos = [[]]
-        self.false_pos_lod = [[]]
-        self.false_pos = [[]]
-
-    def calc_map(self, tf_pos, tf_pos_lod):
-        mAP = 0.0
-        count = 0
-
-        def get_input_pos(class_pos_count, true_pos, true_pos_lod, false_pos,
-                          false_pos_lod):
-            class_pos_count_dict = collections.Counter()
-            true_pos_dict = collections.defaultdict(list)
-            false_pos_dict = collections.defaultdict(list)
-            for i, count in enumerate(class_pos_count):
-                class_pos_count_dict[i] = count
-
-            cur_pos = 0
-            for i in range(len(true_pos_lod[0])):
-                start = cur_pos
-                cur_pos += true_pos_lod[0][i]
-                end = cur_pos
-                for j in range(start, end):
-                    true_pos_dict[i].append(true_pos[j])
-
-            cur_pos = 0
-            for i in range(len(false_pos_lod[0])):
-                start = cur_pos
-                cur_pos += false_pos_lod[0][i]
-                end = cur_pos
-                for j in range(start, end):
-                    false_pos_dict[i].append(false_pos[j])
-
-            return class_pos_count_dict, true_pos_dict, false_pos_dict
-
-        def get_output_pos(label_count, true_pos, false_pos):
-            label_number = self.class_num
-
-            out_class_pos_count = []
-            out_true_pos_lod = []
-            out_true_pos = []
-            out_false_pos_lod = []
-            out_false_pos = []
-
-            for i in range(label_number):
-                out_class_pos_count.append([label_count[i]])
-                true_pos_list = true_pos[i]
-                out_true_pos += true_pos_list
-                out_true_pos_lod.append(len(true_pos_list))
-                false_pos_list = false_pos[i]
-                out_false_pos += false_pos_list
-                out_false_pos_lod.append(len(false_pos_list))
-
-            return out_class_pos_count, out_true_pos, [
-                out_true_pos_lod
-            ], out_false_pos, [out_false_pos_lod]
-
-        def get_accumulation(pos_list):
-            sorted_list = sorted(pos_list, key=lambda pos: pos[0], reverse=True)
-            sum = 0
-            accu_list = []
-            for (score, count) in sorted_list:
-                sum += count
-                accu_list.append(sum)
-            return accu_list
-
-        label_count, true_pos, false_pos = get_input_pos(
-            self.class_pos_count, self.true_pos, self.true_pos_lod,
-            self.false_pos, self.false_pos_lod)
-        for v in self.label:
-            label = v[0]
-            difficult = False if len(v) == 5 else v[1]
-            if self.evaluate_difficult:
-                label_count[label] += 1
-            elif not difficult:
-                label_count[label] += 1
-
-        for (label, score, tp, fp) in tf_pos:
-            true_pos[label].append([score, tp])
-            false_pos[label].append([score, fp])
-
-        for (label, label_pos_num) in six.iteritems(label_count):
-            if label_pos_num == 0 or label not in true_pos: continue
-            label_true_pos = true_pos[label]
-            label_false_pos = false_pos[label]
-
-            accu_tp_sum = get_accumulation(label_true_pos)
-            accu_fp_sum = get_accumulation(label_false_pos)
-
-            precision = []
-            recall = []
-
-            for i in range(len(accu_tp_sum)):
-                precision.append(
-                    float(accu_tp_sum[i]) /
-                    float(accu_tp_sum[i] + accu_fp_sum[i]))
-                recall.append(float(accu_tp_sum[i]) / label_pos_num)
-
-            if self.ap_type == "11point":
-                max_precisions = [0.0] * 11
-                start_idx = len(accu_tp_sum) - 1
-                for j in range(10, -1, -1):
-                    for i in range(start_idx, -1, -1):
-                        if recall[i] < float(j) / 10.0:
-                            start_idx = i
-                            if j > 0:
-                                max_precisions[j - 1] = max_precisions[j]
-                                break
-                        else:
-                            if max_precisions[j] < precision[i]:
-                                max_precisions[j] = precision[i]
-                for j in range(10, -1, -1):
-                    mAP += max_precisions[j] / 11
-                count += 1
-            elif self.ap_type == "integral":
-                average_precisions = 0.0
-                prev_recall = 0.0
-                for i in range(len(accu_tp_sum)):
-                    if math.fabs(recall[i] - prev_recall) > 1e-6:
-                        average_precisions += precision[i] * \
-                            math.fabs(recall[i] - prev_recall)
-                        prev_recall = recall[i]
-
-                mAP += average_precisions
-                count += 1
-        pcnt, tp, tp_lod, fp, fp_lod = get_output_pos(label_count, true_pos,
-                                                      false_pos)
-        self.out_class_pos_count = pcnt
-        self.out_true_pos = tp
-        self.out_true_pos_lod = tp_lod
-        self.out_false_pos = fp
-        self.out_false_pos_lod = fp_lod
-        if count != 0:
-            mAP /= count
-        return mAP
-
-    def setUp(self):
-        self.op_type = "detection_map"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp):
-    def init_test_case(self):
-        super(TestDetectionMAPOpSkipDiff, self).init_test_case()
-
-        self.evaluate_difficult = False
-
-        self.tf_pos_lod = [[2, 4]]
-        # label score true_pos false_pos
-        self.tf_pos = [[1, 0.7, 1, 0], [1, 0.3, 0, 1], [1, 0.2, 1, 0],
-                       [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]]
-
-
-class TestDetectionMAPOpWithoutDiff(TestDetectionMAPOp):
-    def init_test_case(self):
-        super(TestDetectionMAPOpWithoutDiff, self).init_test_case()
-
-        # label xmin ymin xmax ymax
-        self.label = [[1, 0.1, 0.1, 0.3, 0.3], [1, 0.6, 0.6, 0.8, 0.8],
-                      [2, 0.3, 0.3, 0.6, 0.5], [1, 0.7, 0.1, 0.9, 0.3]]
-
-
-class TestDetectionMAPOp11Point(TestDetectionMAPOp):
-    def init_test_case(self):
-        super(TestDetectionMAPOp11Point, self).init_test_case()
-
-        self.ap_type = "11point"
-
-
-class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
-    def init_test_case(self):
-        super(TestDetectionMAPOpMultiBatch, self).init_test_case()
-        self.class_pos_count = [0, 2, 1, 0]
-        self.true_pos_lod = [[0, 3, 2]]
-        self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
-        self.false_pos_lod = [[0, 3, 2]]
-        self.false_pos = [[0.7, 0.], [0.3, 1.], [0.2, 0.], [0.8, 1.], [0.1, 0.]]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dgc_op.py b/python/paddle/fluid/tests/unittests/test_dgc_op.py
deleted file mode 100644
index 04766dd858496e18642d6532e49bd810ef34cac0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dgc_op.py
+++ /dev/null
@@ -1,138 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import paddle.fluid as fluid
-
-g_array_size = 102400
-
-
-class TestDGCOp(unittest.TestCase):
-    def setup(self, place, array_size=g_array_size):
-        size = array_size
-        np.random.seed(5)  # fix seed
-
-        self.scope = fluid.global_scope()
-        self.place = place
-        print("place:", place)
-
-        # numpy data
-        # inputs: U, V, Grad, current_step
-        self.u_name = "U"
-        self.u = np.random.random(size).astype("float32")
-
-        self.v_name = "V"
-        self.v = np.random.random(size).astype("float32")
-
-        self.grad_name = "Grad"
-        self.grad = np.random.random(size).astype("float32")
-
-        self.current_step_name = "current_step"
-        self.current_step = np.full((1), 0.0).astype("float32")
-
-        # output: U_out, V_out, EncodeGrad, GradLocal_out
-        self.encode_grad_name = "EncodeGrad"
-        self.k_name = "k"
-        self.k = np.full((1), 0.0).astype("float32")
-
-        # scope data 
-        self.u_tensor = self.scope.var(self.u_name).get_tensor()
-        self.u_tensor.set(self.u, place)
-
-        self.v_tensor = self.scope.var(self.v_name).get_tensor()
-        self.v_tensor.set(self.v, place)
-
-        self.grad_tensor = self.scope.var(self.grad_name).get_tensor()
-        self.grad_tensor.set(self.grad, place)
-
-        self.encode_grad_tensor = self.scope.var(
-            self.encode_grad_name).get_tensor()
-
-        self.current_step_tensor = self.scope.var(
-            self.current_step_name).get_tensor()
-        self.current_step_tensor.set(self.current_step, core.CPUPlace())
-
-        self.k_tensor = self.scope.var(self.k_name).get_tensor()
-        self.k_tensor.set(self.k, core.CPUPlace())
-
-    def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
-        self.assertTrue(
-            np.allclose(
-                actual_t, expect_t, atol=atol),
-            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
-            + str(expect_t) + "\n" + "But Got" + str(actual_t))
-
-    def test_run_and_check(self):
-        self.setup(place=core.CUDAPlace(0))
-        kwargs = {
-            # inputs
-            'U': self.u_name,
-            'V': self.v_name,
-            'Grad': self.grad_name,
-            'current_step': self.current_step_name,
-
-            # outputs
-            'U_out': self.u_name,
-            'V_out': self.v_name,
-            'EncodeGrad': self.encode_grad_name,
-            'Grad_out': self.grad_name,
-            'k': self.k_name,
-
-            # attrs
-            'm': 0.9,
-            'sparsity': [0.75, 0.9375, 0.984375, 0.996, 0.999],
-            'use_nesterov': True,
-            'rampup_begin_step': float(0.0),
-            'rampup_step': float(10.0),
-        }
-
-        dgc_op = Operator('dgc', **kwargs)
-
-        #atol = 1e-6
-        dgc_op.run(self.scope, self.place)
-
-        u_out = np.array(self.u_tensor)
-        v_out = np.array(self.v_tensor)
-        grad_out = np.array(self.grad_tensor)
-        encode_grad_out = np.array(self.encode_grad_tensor)
-        k = int(np.array(self.k_tensor)[0])
-
-        print("u_out:", u_out[0:20])
-        print("v_out:", v_out[0:20])
-        print("encode_grad_out:", encode_grad_out)
-        print("k_out:", k)
-
-        self.assertEqual(k, int(g_array_size * 0.25))
-
-        index = encode_grad_out[0:k].view(dtype=np.int32)
-        value = encode_grad_out[k:2 * k]
-
-        acl = 1e-7
-
-        for i in range(0, k):
-            self.assertAlmostEqual(u_out[index[i]], 0.0)
-            self.assertAlmostEqual(v_out[index[i]], 0.0)
-
-        a_min = np.amin(value)
-        dangling = [x for x in v_out if x > a_min]
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py
deleted file mode 100644
index eed8b91f0e3dc5a0552e9d912e2b63d724c4d6d9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_diag.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestDiagOp(OpTest):
-    def setUp(self):
-        self.op_type = "diag"
-        self.init_config()
-        self.inputs = {'Diagonal': self.case}
-
-        self.outputs = {'Out': np.diag(self.inputs['Diagonal'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def init_config(self):
-        self.case = np.arange(3, 6)
-
-
-class TestDiagOpCase1(TestDiagOp):
-    def init_config(self):
-        self.case = np.array([3], dtype='int32')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
deleted file mode 100644
index fbeff20c63b2f4a3f01ac4131ac7063aff0204cf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistNCCL2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._nccl2_reduce_layer = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_allreduce_op.py", delta=1e-5)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
deleted file mode 100644
index c9230b68fef75d8708b3b4c397b08f0fcd8eb345..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ /dev/null
@@ -1,848 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import time
-
-import unittest
-import os
-import sys
-import signal
-import subprocess
-import six
-import argparse
-import pickle
-import numpy as np
-import time
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-import paddle.fluid.dygraph as dygraph
-from paddle.fluid.dygraph.base import to_variable
-from paddle.fluid.dygraph.parallel import DataParallel
-
-from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-
-RUN_STEP = 5
-DEFAULT_BATCH_SIZE = 2
-
-
-def print_to_out(out_losses):
-    if six.PY2:
-        print(pickle.dumps(out_losses))
-    else:
-        sys.stdout.buffer.write(pickle.dumps(out_losses))
-
-
-def print_to_err(class_name, log_str):
-    localtime = time.asctime(time.localtime(time.time()))
-    print_str = localtime + "\t" + class_name + "\t" + log_str
-    if six.PY2:
-        sys.stderr.write(pickle.dumps(print_str))
-    else:
-        sys.stderr.buffer.write(pickle.dumps(print_str))
-
-
-def eprint(*args, **kwargs):
-    print(*args, file=sys.stderr, **kwargs)
-
-
-class TestDistRunnerBase(object):
-    def get_model(self,
-                  batch_size=DEFAULT_BATCH_SIZE,
-                  lr=0.1,
-                  single_device=False,
-                  use_dgc=False):
-        raise NotImplementedError(
-            "get_model should be implemented by child classes.")
-
-    @staticmethod
-    def get_transpiler(trainer_id,
-                       main_program,
-                       pserver_endpoints,
-                       trainers,
-                       sync_mode,
-                       dc_asgd=False,
-                       current_endpoint=None,
-                       nccl_comm_num=1):
-        # NOTE: import fluid until runtime, or else forking processes will cause error.
-        config = fluid.DistributeTranspilerConfig()
-        config.enable_dc_asgd = dc_asgd
-        config.sync_mode = sync_mode
-        if nccl_comm_num > 1:
-            config.nccl_comm_num = nccl_comm_num
-        # config.runtime_split_send_recv = True
-        t = fluid.DistributeTranspiler(config=config)
-        t.transpile(
-            trainer_id=trainer_id,
-            program=main_program,
-            pservers=pserver_endpoints,
-            trainers=trainers,
-            current_endpoint=current_endpoint)
-        return t
-
-    def run_pserver(self, args):
-        self.lr = args.lr
-        self.get_model(batch_size=args.batch_size)
-        # NOTE: pserver should not call memory optimize
-        t = self.get_transpiler(args.trainer_id,
-                                fluid.default_main_program(), args.endpoints,
-                                args.trainers, args.sync_mode, args.dc_asgd)
-        pserver_prog = t.get_pserver_program(args.current_endpoint)
-        startup_prog = t.get_startup_program(args.current_endpoint,
-                                             pserver_prog)
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-        print_to_err(type(self).__name__, "run pserver startup program done.")
-        exe.run(pserver_prog)
-        print_to_err(type(self).__name__, "run pserver main program done.")
-
-    def run_gpu_fleet_api_trainer(self, args):
-        assert args.update_method == "nccl2"
-
-        self.lr = args.lr
-
-        exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy.num_threads = 1
-
-        dist_strategy = DistributedStrategy()
-        dist_strategy.exec_strategy = exec_strategy
-        dist_strategy.fuse_memory_size = 1  #MB
-        dist_strategy.fuse_laryer_size = 1
-        if args.use_local_sgd:
-            dist_strategy.use_local_sgd = True
-        if args.ut4grad_allreduce:
-            dist_strategy._ut4grad_allreduce = True
-
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        print_to_err("gpu_fleet", "fleet.node_num:")
-        #"fleet.node_id:", fleet.node_id(),
-        #"fleet.trainer_num:", fleet.worker_num())
-
-        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-                self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)
-
-        trainer_prog = fleet._origin_program
-        dist_prog = fleet.main_program
-
-        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        place = fluid.CUDAPlace(device_id)
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        eprint(type(self).__name__, "run worker startup program done.")
-
-        feed_var_list = [
-            var for var in trainer_prog.global_block().vars.values()
-            if var.is_data
-        ]
-
-        feeder = fluid.DataFeeder(feed_var_list, place)
-        reader_generator = train_reader()
-
-        def get_data():
-            origin_batch = next(reader_generator)
-            if args.update_method != "local" and args.use_reader_alloc:
-                new_batch = []
-                for offset, item in enumerate(origin_batch):
-                    if offset % 2 == args.trainer_id:
-                        new_batch.append(item)
-                return new_batch
-            else:
-                return origin_batch
-
-        print_to_err(type(self).__name__, "begin to train on trainer")
-        out_losses = []
-        for i in six.moves.xrange(RUN_STEP):
-            loss, = exe.run(dist_prog,
-                            fetch_list=[avg_cost.name],
-                            feed=feeder.feed(get_data()))
-            out_losses.append(loss[0])
-            print_to_err(type(self).__name__, "run step %d finished" % i)
-        print_to_err(type(self).__name__, "trainer run finished")
-
-        if six.PY2:
-            print(pickle.dumps(out_losses))
-        else:
-            sys.stdout.buffer.write(pickle.dumps(out_losses))
-
-    def run_trainer(self, args):
-        self.lr = args.lr
-        if args.nccl2_reduce_layer_local_run:
-            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-                self.get_model(batch_size=args.batch_size, single_device=True)
-        elif args.use_dgc:
-            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-                self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc)
-        else:
-            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-                self.get_model(batch_size=args.batch_size)
-
-        if args.update_method == "pserver":
-            print_to_err(
-                type(self).__name__,
-                "begin to run transpile on trainer with pserver mode")
-            t = self.get_transpiler(args.trainer_id,
-                                    fluid.default_main_program(),
-                                    args.endpoints, args.trainers,
-                                    args.sync_mode, args.dc_asgd)
-            trainer_prog = t.get_trainer_program()
-            print_to_err(
-                type(self).__name__,
-                "get trainer program done with pserver mode.")
-        elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
-            # transpile for nccl2
-            config = fluid.DistributeTranspilerConfig()
-            config.mode = "nccl2"
-            config.nccl_comm_num = args.nccl_comm_num
-            if args.use_hallreduce:
-                config.use_hierarchical_allreduce = True
-                config.hierarchical_allreduce_inter_nranks = args.hallreduce_inter_nranks
-            print_to_err(
-                type(self).__name__,
-                "begin to run transpile on trainer with nccl2 mode")
-            nccl2_t = fluid.DistributeTranspiler(config=config)
-            nccl2_t.transpile(
-                args.trainer_id,
-                program=fluid.default_main_program(),
-                startup_program=fluid.default_startup_program(),
-                trainers=args.endpoints,
-                current_endpoint=args.current_endpoint)
-            print_to_err(
-                type(self).__name__,
-                "get trainer program done. with nccl2 mode")
-            trainer_prog = fluid.default_main_program()
-        else:
-            print_to_err(
-                type(self).__name__,
-                "do nothing about main program, just use it")
-            trainer_prog = fluid.default_main_program()
-            print_to_err(type(self).__name__, "use main program done.")
-
-        if args.use_cuda:
-            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-            place = fluid.CUDAPlace(device_id)
-        else:
-            place = fluid.CPUPlace()
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        print_to_err(type(self).__name__, "run worker startup program done.")
-
-        exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy.num_threads = 1
-
-        build_stra = fluid.BuildStrategy()
-        # FIXME force disable enable_inplace and memory_optimize
-        build_stra.enable_inplace = False
-        build_stra.memory_optimize = False
-
-        if args.enable_backward_deps:
-            build_stra.enable_backward_optimizer_op_deps = True
-
-        if args.use_reduce:
-            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        else:
-            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-
-        pass_builder = None
-        if args.batch_merge_repeat > 1:
-            pass_builder = build_stra._finalize_strategy_and_create_passes()
-            mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
-            mypass.set("num_repeats", args.batch_merge_repeat)
-
-        if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
-            build_stra.num_trainers = len(args.endpoints.split(","))
-            build_stra.trainer_id = args.trainer_id
-        else:
-            # case args.update_method == "nccl2_reduce_layer":
-            build_stra.num_trainers = 1
-            build_stra.trainer_id = 0
-
-        print_to_err(type(self).__name__, "begin to compile with data parallel")
-        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
-            loss_name=avg_cost.name,
-            build_strategy=build_stra,
-            exec_strategy=exec_strategy)
-        print_to_err(type(self).__name__, "program compiled with data parallel")
-
-        feed_var_list = [
-            var for var in trainer_prog.global_block().vars.values()
-            if var.is_data
-        ]
-
-        feeder = fluid.DataFeeder(feed_var_list, place)
-        reader_generator = train_reader()
-
-        def get_data():
-            origin_batch = next(reader_generator)
-            if args.update_method != "local" and args.use_reader_alloc:
-                new_batch = []
-                for offset, item in enumerate(origin_batch):
-                    if offset % 2 == args.trainer_id:
-                        new_batch.append(item)
-                return new_batch
-            else:
-                return origin_batch
-
-        print_to_err(type(self).__name__, "begin to train on trainer")
-        out_losses = []
-        for i in six.moves.xrange(RUN_STEP):
-            loss, = exe.run(binary,
-                            fetch_list=[avg_cost.name],
-                            feed=feeder.feed(get_data()))
-            out_losses.append(loss[0])
-            print_to_err(type(self).__name__, "run step %d finished" % i)
-        print_to_err(type(self).__name__, "trainer run finished")
-
-        print_to_out(out_losses)
-
-
-class TestParallelDyGraphRunnerBase(object):
-    def get_model(self):
-        raise NotImplementedError(
-            "get_model should be implemented by child classes.")
-
-    def run_one_loop(self, model, opt, data):
-        raise NotImplementedError(
-            "train_one_loop should be implemented by the child classes.")
-
-    def run_trainer(self, args):
-
-        seed = 90
-        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        place = fluid.CUDAPlace(device_id)
-
-        def _get_data(batch):
-            if args.update_method != "local":
-                new_batch = []
-                for offset, item in enumerate(batch):
-                    if offset % 2 == args.trainer_id:
-                        new_batch.append(item)
-                return new_batch
-            else:
-                return batch
-
-        with fluid.dygraph.guard(place):
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            np.random.seed(seed)
-            import random
-            random.seed = seed
-            model, train_reader, opt = self.get_model()
-            nranks = len(args.endpoints.split(",")) if args.endpoints else 1
-
-            if args.update_method == "nccl2":
-                strategy = dygraph.parallel.ParallelStrategy()
-                strategy.nranks = nranks
-                strategy.local_rank = args.trainer_id
-                strategy.trainer_endpoints = args.endpoints.split(",")
-                strategy.current_endpoint = args.current_endpoint
-                print_to_err(
-                    type(self).__name__,
-                    "begin to prepare context in dygraph with nccl2")
-                dygraph.parallel.prepare_context(strategy)
-                model = dygraph.parallel.DataParallel(model, strategy)
-                print_to_err(type(self).__name__, "model built in dygraph")
-            out_losses = []
-            print_to_err(type(self).__name__, "begin to run dygraph training")
-            for step_id, data in enumerate(train_reader()):
-                data = _get_data(data)
-                if step_id == RUN_STEP:
-                    break
-                loss = self.run_one_loop(model, opt, data)
-                if step_id % 10 == 0:
-                    print_to_err(
-                        type(self).__name__,
-                        "loss at step %d: %f" % (step_id, loss.numpy()))
-                out_losses.append(loss.numpy())
-
-                # FIXME(Yancey1989): scale the loss inplace
-                if args.update_method == "nccl2":
-                    loss = model.scale_loss(loss)
-
-                loss.backward()
-                if args.update_method == "nccl2":
-                    model.apply_collective_grads()
-
-                opt.minimize(loss)
-                model.clear_gradients()
-        print_to_out(out_losses)
-
-
-def runtime_main(test_class):
-    parser = argparse.ArgumentParser(description='Run dist test.')
-    parser.add_argument(
-        '--role', type=str, required=True, choices=['pserver', 'trainer'])
-    parser.add_argument('--endpoints', type=str, required=False, default="")
-    parser.add_argument(
-        '--update_method',
-        type=str,
-        default="local",
-        choices=["pserver", "nccl2", "local", "nccl2_reduce_layer"])
-    parser.add_argument('--trainer_id', type=int, required=False, default=0)
-    parser.add_argument('--trainers', type=int, required=False, default=1)
-    parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
-    parser.add_argument('--enable_backward_deps', action='store_true')
-    parser.add_argument('--use_hallreduce', action='store_true')
-    parser.add_argument('--gpu_fleet_api', action='store_true')
-    parser.add_argument('--use_local_sgd', action='store_true')
-    parser.add_argument('--ut4grad_allreduce', action='store_true')
-    parser.add_argument(
-        '--hallreduce_inter_nranks', type=int, required=False, default=2)
-    parser.add_argument(
-        '--current_endpoint', type=str, required=False, default="")
-    parser.add_argument('--sync_mode', action='store_true')
-    parser.add_argument('--use_cuda', action='store_true')
-    parser.add_argument('--use_dgc', action='store_true')
-    parser.add_argument('--use_reduce', action='store_true')
-    parser.add_argument('--dc_asgd', action='store_true')
-    parser.add_argument(
-        '--use_reader_alloc', action='store_true', required=False)
-    parser.add_argument('--batch_size', required=False, type=int, default=2)
-    parser.add_argument('--lr', required=False, type=float, default=0.001)
-    parser.add_argument(
-        '--batch_merge_repeat', required=False, type=int, default=1)
-    parser.add_argument(
-        '--nccl2_reduce_layer_local_run',
-        required=False,
-        type=bool,
-        default=False)
-
-    args = parser.parse_args()
-
-    model = test_class()
-    if args.role == "pserver" and args.update_method == "pserver":
-        model.run_pserver(args)
-    elif args.gpu_fleet_api:
-        model.run_gpu_fleet_api_trainer(args)
-    else:
-        model.run_trainer(args)
-
-
-import paddle.compat as cpt
-import socket
-from contextlib import closing
-
-
-class TestDistBase(unittest.TestCase):
-    def _setup_config(self):
-        raise NotImplementedError("tests should have _setup_config implemented")
-
-    def _after_setup_config(self):
-        if self._enforce_place == "CPU":
-            self.__use_cuda = False
-            self._use_dgc = False
-        elif self._enforce_place == "GPU":
-            self.__use_cuda = True
-        else:
-            if fluid.core.is_compiled_with_cuda():
-                self.__use_cuda = True
-            else:
-                self.__use_cuda = False
-                self._use_dgc = False
-
-        if self._use_reduce:
-            assert not self._use_dgc
-
-    def setUp(self):
-        self._trainers = 2
-        self._pservers = 2
-        self._port_set = set()
-        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
-            self._find_free_port(), self._find_free_port())
-        self._python_interp = sys.executable
-        self._sync_mode = True
-        self._enforce_place = None
-        self._use_reduce = False
-        self._dc_asgd = False  # must use with async mode
-        self._use_reader_alloc = True
-        self._nccl2_mode = False
-        self._mp_mode = False
-        # FIXME(typhoonzero): I added this stupid argument to enable
-        # testing allreduce layers, which users can call layers.allreduce
-        # to accumulate tensors at anywhere. Find a better way to do this
-        # test, reduce check this argument everywhere.
-        self._nccl2_reduce_layer = False
-        self._lr = 0.001
-        self._use_dgc = False
-        self._dygraph = False
-        self._nccl_comm_num = 1
-        self._enable_backward_deps = False
-        self._gpu_fleet_api = False
-        self._use_local_sgd = False
-        self._ut4grad_allreduce = False
-        self._use_hallreduce = False
-        self._setup_config()
-        self._after_setup_config()
-
-    def _find_free_port(self):
-        def __free_port():
-            with closing(socket.socket(socket.AF_INET,
-                                       socket.SOCK_STREAM)) as s:
-                s.bind(('', 0))
-                print_to_err(
-                    type(self).__name__, "socket name: %s" % s.getsockname()[1])
-                return s.getsockname()[1]
-
-        while True:
-            port = __free_port()
-            if port not in self._port_set:
-                self._port_set.add(port)
-                return port
-
-    def start_pserver(self, model_file, check_error_log, required_envs):
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps_cmd = "%s"
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            required_envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
-            ps_cmd += " -m coverage run --branch -p"
-
-        ps_cmd += " %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"
-
-        ps0_cmd = ps_cmd % \
-                  (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-                   self._trainers)
-        ps1_cmd = ps_cmd % \
-                  (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-                   self._trainers)
-
-        if self._sync_mode:
-            ps0_cmd += " --sync_mode"
-            ps1_cmd += " --sync_mode"
-
-        print(ps0_cmd)
-        print(ps1_cmd)
-        ps0_pipe = open("/tmp/ps0_err.log", "wb")
-        ps1_pipe = open("/tmp/ps1_err.log", "wb")
-
-        print_to_err(type(self).__name__, "going to start pserver process 0")
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps0_pipe,
-            env=required_envs)
-        print_to_err(type(self).__name__, "going to start pserver process 1")
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps1_pipe,
-            env=required_envs)
-
-        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
-
-    def _run_local(self,
-                   model,
-                   envs,
-                   check_error_log=False,
-                   batch_size=DEFAULT_BATCH_SIZE,
-                   batch_merge_repeat=1):
-
-        cmd = self._python_interp
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
-            cmd += " -m coverage run --branch -p"
-
-        cmd += " %s --role trainer --lr %f" % (model, self._lr)
-
-        if batch_size != DEFAULT_BATCH_SIZE:
-            cmd += " --batch_size %d" % batch_size
-        if batch_merge_repeat > 1:
-            cmd += " --batch_merge_repeat %d" % batch_merge_repeat
-        if self._nccl2_reduce_layer:
-            cmd += " --nccl2_reduce_layer_local_run 1"
-
-        if self.__use_cuda:
-            cmd += " --use_cuda"
-            env_local = {
-                "CUDA_VISIBLE_DEVICES": "0",
-                "PADDLE_TRAINERS_NUM": "1",
-                "PADDLE_TRAINER_ID": "0"
-            }
-        else:
-            env_local = {'CPU_NUM': '1'}
-
-        env_local.update(envs)
-        print("local_cmd: {}, env: {}".format(cmd, env_local))
-
-        if check_error_log:
-            err_log = open("/tmp/trainer.err.log", "wb")
-            local_proc = subprocess.Popen(
-                cmd.split(" "),
-                stdout=subprocess.PIPE,
-                stderr=err_log,
-                env=env_local)
-        else:
-            local_proc = subprocess.Popen(
-                cmd.split(" "),
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                env=env_local)
-
-        local_out, local_err = local_proc.communicate()
-
-        if check_error_log:
-            err_log.close()
-
-        sys.stderr.write('local_stderr: %s\n' % local_err)
-        sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
-
-        return pickle.loads(local_out)
-
-    def _run_cluster(self, model, envs, check_error_log):
-        # Run dist train to compare with local results
-        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
-                                                          check_error_log, envs)
-
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-
-        tr_cmd = "%s"
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
-            tr_cmd += " -m coverage run --branch -p"
-
-        tr_cmd += " %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f"
-
-        tr0_cmd = tr_cmd % \
-                  (self._python_interp, model, self._ps_endpoints,
-                   0, ps0_ep, self._trainers, self._lr)
-        tr1_cmd = tr_cmd % \
-                  (self._python_interp, model, self._ps_endpoints,
-                   1, ps1_ep, self._trainers, self._lr)
-
-        if self._sync_mode:
-            tr0_cmd += " --sync_mode"
-            tr1_cmd += " --sync_mode"
-        if self._use_reduce:
-            tr0_cmd += " --use_reduce"
-            tr1_cmd += " --use_reduce"
-        if self._use_reader_alloc:
-            tr0_cmd += " --use_reader_alloc"
-            tr1_cmd += " --use_reader_alloc"
-        if self.__use_cuda:
-            tr0_cmd += " --use_cuda"
-            tr1_cmd += " --use_cuda"
-            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
-            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
-        else:
-            env0 = {'CPU_NUM': '1'}
-            env1 = {'CPU_NUM': '1'}
-
-        env0.update(envs)
-        env1.update(envs)
-
-        print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0))
-        print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1))
-        tr0_pipe = open("/tmp/tr0_err.log", "wb")
-        tr1_pipe = open("/tmp/tr1_err.log", "wb")
-
-        print_to_err(type(self).__name__, "going to start trainer process 0")
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=env0)
-        print_to_err(type(self).__name__, "going to start trainer process 1")
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=env1)
-
-        # Wait until trainer process terminate
-        while True:
-            stat0 = tr0_proc.poll()
-            time.sleep(0.1)
-            if stat0 is not None:
-                break
-        while True:
-            stat1 = tr1_proc.poll()
-            time.sleep(0.1)
-            if stat1 is not None:
-                break
-
-        tr0_out, tr0_err = tr0_proc.communicate()
-        tr1_out, tr1_err = tr1_proc.communicate()
-
-        # close trainer file
-        tr0_pipe.close()
-        tr1_pipe.close()
-        ps0_pipe.close()
-        ps1_pipe.close()
-
-        ps0.terminate()
-        ps1.terminate()
-
-        return pickle.loads(tr0_out), pickle.loads(tr1_out)
-
-    def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id,
-                               trainer_num):
-        env = {}
-        tr_cmd = "%s -u"
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            tr_cmd += " -m coverage run --branch -p"
-
-        tr_cmd += " %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f"
-
-        tr_cmd = tr_cmd % \
-                  (self._python_interp, model, self._ps_endpoints,
-                   trainer_id, ep, update_method, self._lr)
-
-        if self._use_reduce:
-            tr_cmd += " --use_reduce"
-        if self._use_reader_alloc:
-            tr_cmd += " --use_reader_alloc"
-        if self.__use_cuda:
-            tr_cmd += " --use_cuda"
-            env.update({
-                "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id),
-                "PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
-                "PADDLE_TRAINER_ID": "{}".format(trainer_id),
-                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
-                "PADDLE_CURRENT_ENDPOINT": ep,
-            })
-        else:
-            env.update({'CPU_NUM': '1'})
-
-        if self._use_dgc:
-            tr_cmd += " --use_dgc"
-
-        if self._mp_mode:
-            env = {"FLAGS_selected_gpus": "{}".format(trainer_id)}
-
-        if self._nccl_comm_num > 1:
-            tr_cmd += " --nccl_comm_num {}".format(self._nccl_comm_num)
-
-        if self._use_hallreduce:
-            tr_cmd += " --use_hallreduce --hallreduce_inter_nranks 2"
-
-        if self._enable_backward_deps:
-            tr_cmd += " --enable_backward_deps"
-
-        if self._gpu_fleet_api:
-            tr_cmd += " --gpu_fleet_api"
-            if self._use_local_sgd:
-                tr_cmd += " --use_local_sgd"
-            if self._ut4grad_allreduce:
-                tr_cmd += " --ut4grad_allreduce"
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            env['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
-
-        return tr_cmd, env
-
-    def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer,
-                           check_error_log):
-        if self._use_hallreduce:
-            self._ps_endpoints = ""
-            for i in range(0, 4):
-                self._ps_endpoints += "127.0.0.1:%s," % (self._find_free_port())
-            self._ps_endpoints = self._ps_endpoints[:-1]
-
-        # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
-        worker_endpoints = self._ps_endpoints.split(",")
-        if nccl2_reduce_layer:
-            update_method = "nccl2_reduce_layer"
-        else:
-            update_method = "nccl2"
-
-        trainer_num = len(worker_endpoints)
-
-        procs = []
-        pipes = []
-        for i in range(0, trainer_num):
-            tr_cmd, tr_env = self._get_nccl2_trainer_cmd(
-                model, worker_endpoints[i], update_method, i, trainer_num)
-            tr_env.update(envs)
-            print("use_hallreduce:{} tr_cmd:{}, env: {}".format(
-                self._use_hallreduce, tr_cmd, tr_env))
-
-            tr_pipe = open("/tmp/tr{}_err.log".format(i), "wb")
-
-            print_to_err(
-                type(self).__name__,
-                "going to start process {} with nccl2".format(i))
-            tr_proc = subprocess.Popen(
-                tr_cmd.strip().split(" "),
-                stdout=subprocess.PIPE,
-                stderr=tr_pipe,
-                env=tr_env)
-
-            procs.append(tr_proc)
-            pipes.append(tr_pipe)
-
-        outs = []
-        for i in range(0, trainer_num):
-            tr_out, tr_err = procs[i].communicate()
-            outs.append(tr_out)
-            pipes[i].close()
-            sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))
-
-        if check_error_log:
-            print("outs[0]:", outs[0])
-            print("outs[1]:", outs[1])
-        return pickle.loads(outs[0]), pickle.loads(outs[1])
-
-    def check_with_place(self,
-                         model_file,
-                         delta=1e-3,
-                         check_error_log=False,
-                         need_envs={}):
-        # TODO(typhoonzero): should auto adapt GPU count on the machine.
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_rpc_deadline": "30000",  # 5sec to fail fast
-            "FLAGS_cudnn_deterministic": "1",
-            "http_proxy": "",
-            "NCCL_P2P_DISABLE": "1",
-            "NCCL_SHM_DISABLE": "1"
-        }
-
-        required_envs.update(need_envs)
-
-        if check_error_log:
-            required_envs["GLOG_v"] = "10"
-            required_envs["GLOG_logtostderr"] = "1"
-
-        local_losses\
-            = self._run_local(model_file, required_envs,
-                                check_error_log)
-        if self._nccl2_mode:
-            if self._nccl2_reduce_layer:
-                tr0_losses, tr1_losses = self._run_cluster_nccl2(
-                    model_file, required_envs, True, check_error_log)
-            else:
-                tr0_losses, tr1_losses = self._run_cluster_nccl2(
-                    model_file, required_envs, False, check_error_log)
-        else:
-            tr0_losses, tr1_losses = self._run_cluster(
-                model_file, required_envs, check_error_log)
-
-        for step_id in range(RUN_STEP):
-            local_loss = local_losses[step_id]
-            tr0_loss = tr0_losses[step_id]
-            tr1_loss = tr1_losses[step_id]
-            dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
-            print("=======", local_loss, ":", dist_loss[0], "=======")
-            self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
deleted file mode 100644
index 55234a85731ab8f11b7f0d4cb0443672722cdbbd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ /dev/null
@@ -1,58 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import os
-import unittest
-from test_dist_base import TestDistBase
-
-
-def skip_ci(func):
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-
-    def __func__(*args, **kwargs):
-        if on_ci:
-            return
-        return func(*args, **kwargs)
-
-    return __func__
-
-
-@skip_ci
-class TestDistCTR2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def test_dist_ctr(self):
-        self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
-
-
-@skip_ci
-class TestDistCTRWithL2Decay2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def test_dist_ctr(self):
-        need_envs = {"USE_L2_DECAY": "1"}
-        self.check_with_place(
-            "dist_ctr.py",
-            delta=1e-7,
-            check_error_log=True,
-            need_envs=need_envs)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
deleted file mode 100644
index 1f3a7ec62082bce999b6c002c479c4d93e455f41..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ /dev/null
@@ -1,269 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import argparse
-import os
-import pickle
-import subprocess
-import sys
-import time
-import traceback
-import math
-import collections
-import socket
-from contextlib import closing
-
-import six
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
-
-RUN_STEP = 5
-LEARNING_RATE = 0.01
-
-
-class FleetDistRunnerBase(object):
-    def run_pserver(self, args):
-        if args.role.upper() != "PSERVER":
-            raise ValueError("args role must be PSERVER")
-
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=args.current_id,
-            role=role_maker.Role.SERVER,
-            worker_num=args.trainers,
-            server_endpoints=args.endpoints.split(","))
-
-        fleet.init(role)
-
-        strategy = DistributeTranspilerConfig()
-        strategy.sync_mode = args.sync_mode
-
-        avg_cost = self.net()
-
-        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(avg_cost)
-
-        fleet.init_server()
-        fleet.run_server()
-
-    def run_trainer(self, args):
-        if args.role.upper() != "TRAINER":
-            raise ValueError("args role must be TRAINER")
-
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=args.current_id,
-            role=role_maker.Role.WORKER,
-            worker_num=args.trainers,
-            server_endpoints=args.endpoints.split(","))
-
-        fleet.init(role)
-
-        strategy = DistributeTranspilerConfig()
-        strategy.sync_mode = args.sync_mode
-
-        avg_cost = self.net()
-
-        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(avg_cost)
-
-        self.do_training(fleet)
-        out = self.do_training(fleet)
-
-    def net(self, batch_size=4, lr=0.01):
-        raise NotImplementedError(
-            "get_model should be implemented by child classes.")
-
-    def do_training(self, fleet):
-        raise NotImplementedError(
-            "do_training should be implemented by child classes.")
-
-
-class TestFleetBase(unittest.TestCase):
-    def _setup_config(self):
-        raise NotImplementedError("tests should have _setup_config implemented")
-
-    def setUp(self):
-        self._sync_mode = True
-        self._trainers = 2
-        self._pservers = 2
-        self._port_set = set()
-        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
-            self._find_free_port(), self._find_free_port())
-        self._python_interp = sys.executable
-        self._setup_config()
-
-    def _find_free_port(self):
-        def __free_port():
-            with closing(socket.socket(socket.AF_INET,
-                                       socket.SOCK_STREAM)) as s:
-                s.bind(('', 0))
-                return s.getsockname()[1]
-
-        while True:
-            port = __free_port()
-            if port not in self._port_set:
-                self._port_set.add(port)
-                return port
-
-    def _start_pserver(self, cmd, required_envs):
-        ps0_cmd, ps1_cmd = cmd.format(0), cmd.format(1)
-
-        ps0_pipe = open("/tmp/ps0_err.log", "wb+")
-        ps1_pipe = open("/tmp/ps1_err.log", "wb+")
-
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps0_pipe,
-            env=required_envs)
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps1_pipe,
-            env=required_envs)
-
-        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
-
-    def _start_trainer(self, cmd, required_envs):
-        tr0_cmd, tr1_cmd = cmd.format(0), cmd.format(1)
-
-        tr0_pipe = open("/tmp/tr0_err.log", "wb+")
-        tr1_pipe = open("/tmp/tr1_err.log", "wb+")
-
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=required_envs)
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=required_envs)
-
-        return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe
-
-    def _run_cluster(self, model, envs):
-        env = {'CPU_NUM': '1'}
-        env.update(envs)
-
-        tr_cmd = "{0} {1} --role trainer --endpoints {2} --current_id {{}} --trainers {3}".format(
-            self._python_interp, model, self._ps_endpoints, self._trainers)
-
-        ps_cmd = "{0} {1} --role pserver --endpoints {2} --current_id {{}} --trainers {3}".format(
-            self._python_interp, model, self._ps_endpoints, self._trainers)
-
-        if self._sync_mode:
-            tr_cmd += " --sync_mode"
-            ps_cmd += " --sync_mode"
-
-        # Run dist train to compare with local results
-        ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
-        tr0, tr1, tr0_pipe, tr1_pipe = self._start_trainer(tr_cmd, env)
-
-        # Wait until trainer process terminate
-        while True:
-            stat0 = tr0.poll()
-            time.sleep(0.1)
-            if stat0 is not None:
-                break
-        while True:
-            stat1 = tr1.poll()
-            time.sleep(0.1)
-            if stat1 is not None:
-                break
-
-        tr0_out, tr0_err = tr0.communicate()
-        tr1_out, tr1_err = tr1.communicate()
-
-        # close trainer file
-        tr0_pipe.close()
-        tr1_pipe.close()
-        ps0_pipe.close()
-        ps1_pipe.close()
-
-        ps0.terminate()
-        ps1.terminate()
-        '''
-        with open("/tmp/tr0_out.log", "wb+") as wn:
-            wn.write(tr0_out)
-        with open("/tmp/tr1_out.log", "wb+") as wn:
-            wn.write(tr1_out)
-        # print server log
-        '''
-
-        # print server log
-        '''
-        with open("/tmp/ps0_err.log", "r") as fn:
-            sys.stderr.write("ps0 stderr: %s\n" % fn.read())
-        with open("/tmp/ps1_err.log", "r") as fn:
-            sys.stderr.write("ps1 stderr: %s\n" % fn.read())
-        '''
-
-        # print log
-        '''
-        with open("/tmp/tr0_err.log", "r") as fn:
-            sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
-        with open("/tmp/tr1_err.log", "r") as fn:
-            sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
-        '''
-
-        return 0, 0
-
-    def check_with_place(self,
-                         model_file,
-                         delta=1e-3,
-                         check_error_log=False,
-                         need_envs={}):
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
-            "http_proxy": ""
-        }
-
-        required_envs.update(need_envs)
-
-        if check_error_log:
-            required_envs["GLOG_v"] = "3"
-            required_envs["GLOG_logtostderr"] = "1"
-
-        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
-
-
-def runtime_main(test_class):
-    parser = argparse.ArgumentParser(description='Run Fleet test.')
-    parser.add_argument(
-        '--role', type=str, required=True, choices=['pserver', 'trainer'])
-    parser.add_argument('--endpoints', type=str, required=False, default="")
-    parser.add_argument('--current_id', type=int, required=False, default=0)
-    parser.add_argument('--trainers', type=int, required=False, default=1)
-    parser.add_argument('--sync_mode', action='store_true')
-
-    args = parser.parse_args()
-
-    model = test_class()
-    if args.role == "pserver":
-        model.run_pserver(args)
-    else:
-        model.run_trainer(args)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
deleted file mode 100644
index 9bad641a8cbd867c6c64467991b00ff9d7aa3011..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import unittest
-from test_dist_fleet_base import TestFleetBase
-
-
-def skip_ci(func):
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-
-    def __func__(*args, **kwargs):
-        if on_ci:
-            return
-        return func(*args, **kwargs)
-
-    return __func__
-
-
-@skip_ci
-class TestDistMnist2x2(TestFleetBase):
-    def _setup_config(self):
-        self._sync_mode = False
-
-    def check_with_place(self,
-                         model_file,
-                         delta=1e-3,
-                         check_error_log=False,
-                         need_envs={}):
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
-            "http_proxy": ""
-        }
-
-        required_envs.update(need_envs)
-
-        if check_error_log:
-            required_envs["GLOG_v"] = "3"
-            required_envs["GLOG_logtostderr"] = "1"
-
-        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
-
-    def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
deleted file mode 100644
index 89bbc69fa889880ef6765ae7a00521e2e69ae7ac..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnist2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-
-    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-class TestDistMnist2x2WithMemopt(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._mem_opt = True
-
-    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-class TestDistMnistAsync(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._use_reduce = False
-
-    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=200)
-
-
-class TestDistMnistDcAsgd(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._dc_asgd = True
-
-    def test_se_resnext(self):
-        self.check_with_place("dist_mnist.py", delta=200)
-
-
-# FIXME(typhoonzero): enable these tests once we have 4
-# 4 GPUs on CI machine, and the base class should be updated.
-#
-# class TestDistMnist2x2ReduceMode(TestDistBase):
-#     def _setup_config(self):
-#         self._sync_mode = True
-#         self._use_reduce = True
-
-#     def test_se_resnext(self):
-#         self.check_with_place("dist_mnist.py", delta=1e-7)
-
-# class TestDistMnistAsyncReduceMode(TestDistBase):
-#     def _setup_config(self):
-#         self._sync_mode = False
-#         self._use_reduce = True
-
-#     def test_se_resnext(self):
-#         self.check_with_place("dist_mnist.py", delta=200)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py
deleted file mode 100644
index 1f6274ec16488323c9f7e6b14a94e0d9182d7aca..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistNCCL2BackWardDeps(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._enable_backward_deps = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
deleted file mode 100644
index 22d4b7929033529c5cea60064e6d9de57eddeb8e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-import os
-
-
-class TestDistMnist2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-
-    def test_dist_train(self):
-        self.check_with_place("dist_mnist_batch_merge.py", delta=1e-5)
-
-    def check_with_place(self,
-                         model_file,
-                         delta=1e-3,
-                         check_error_log=False,
-                         need_envs={}):
-        # TODO(typhoonzero): should auto adapt GPU count on the machine.
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_cudnn_deterministic": "1",
-        }
-
-        required_envs.update(need_envs)
-
-        if check_error_log:
-            required_envs["GLOG_v"] = "7"
-            required_envs["GLOG_logtostderr"] = "1"
-
-        no_merge_losses = self._run_local(
-            model_file,
-            required_envs,
-            check_error_log=check_error_log,
-            batch_size=4)
-
-        batch_merge_losses = self._run_local(
-            model_file,
-            required_envs,
-            check_error_log=check_error_log,
-            batch_size=2,
-            batch_merge_repeat=2)
-        # Ensure both result have values.
-        self.assertGreater(len(no_merge_losses), 1)
-        self.assertEqual(len(no_merge_losses), len(batch_merge_losses))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
deleted file mode 100644
index 529bd330ac92a6f06a06db3629f518ba4026b6bd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistNCCL2DGC(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._use_dgc = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
deleted file mode 100644
index 30f8592e1dace21401e29b4e05f7330502f55f47..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistNCCL2FleetApi(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._gpu_fleet_api = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py
deleted file mode 100644
index 247e4c0500f160588261c525f90a404741635170..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistNCCL2HAllreduce(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._use_hallreduce = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_lars.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_lars.py
deleted file mode 100644
index 53c7527fdafbd0c4dec9c78ec9c8212422d046db..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_lars.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnist2x2Lars(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-
-    def test_se_resnext(self):
-        self.check_with_place("dist_mnist_lars.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py
deleted file mode 100644
index d0a21fe0dca6024ec2a061749560e2d51358687f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistNCCL2MultiNCCLComm(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._nccl_comm_num = 3
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py
deleted file mode 100644
index d063f8473e0f50256dc424429ce1244a4b893ccf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistNCCL2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_mnist.py",
-                delta=1,
-                need_envs={
-                    "FLAGS_enable_parallel_graph": "1",
-                    "FLAGS_sync_nccl_allreduce": "1"
-                })
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py
deleted file mode 100644
index fd15020275bdce1a6424f3134ff089bd761ee1b1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistNCCL2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
deleted file mode 100644
index 4f4941aa217b985c829391e9e8652d91f72b0c98..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistLocalSGDFleetApi(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._gpu_fleet_api = True
-        self._use_local_sgd = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-class TestDistMnistGradAllReduceFleetApi(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._gpu_fleet_api = True
-        self._ut4grad_allreduce = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
deleted file mode 100644
index 8c2d6d9b4dc0624daea7b6968d47bae9e925e034..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py
+++ /dev/null
@@ -1,157 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import os
-import shutil
-import unittest
-import tempfile
-
-import numpy as np
-
-from test_dist_base import TestDistBase, RUN_STEP
-
-
-class TestDistSaveLoadDense2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def check_with_place(self,
-                         model_file,
-                         delta=1e-3,
-                         check_error_log=False,
-                         need_envs={}):
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "http_proxy": ""
-        }
-
-        required_envs.update(need_envs)
-
-        if check_error_log:
-            required_envs["GLOG_v"] = "3"
-            required_envs["GLOG_logtostderr"] = "1"
-
-        model_dir = tempfile.mkdtemp()
-
-        local_env = {}
-        local_env["SAVE"] = "1"
-        local_env["MODEL_DIR"] = model_dir
-        local_env.update(required_envs)
-
-        cluster_env = {}
-        cluster_env["LOAD"] = "1"
-        cluster_env["MODEL_DIR"] = model_dir
-        cluster_env.update(required_envs)
-
-        local_var = self._run_local(model_file, local_env, check_error_log)
-        tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
-                                             check_error_log)
-
-        shutil.rmtree(model_dir)
-
-        local_np = np.array(local_var)
-        train0_np = np.array(tr0_var)
-        train1_np = np.array(tr1_var)
-
-        np.testing.assert_almost_equal(local_np, train0_np, decimal=2)
-        np.testing.assert_almost_equal(local_np, train1_np, decimal=2)
-        np.testing.assert_almost_equal(train0_np, train1_np, decimal=2)
-
-    def test_dist(self):
-        need_envs = {
-            "IS_DISTRIBUTED": '0',
-            "IS_SPARSE": '0',
-            'IS_SELF_CONTAINED_LR': '1',
-            'SAVE_MODE': 'LOCAL',
-        }
-        self.check_with_place(
-            "dist_save_load.py",
-            delta=0,
-            check_error_log=False,
-            need_envs=need_envs)
-
-
-class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def check_with_place(self,
-                         model_file,
-                         delta=1e-3,
-                         check_error_log=False,
-                         need_envs={}):
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "http_proxy": ""
-        }
-
-        required_envs.update(need_envs)
-
-        if check_error_log:
-            required_envs["GLOG_v"] = "3"
-            required_envs["GLOG_logtostderr"] = "1"
-
-        model_dir = tempfile.mkdtemp()
-
-        save_env = {}
-        save_env["SAVE_MODE"] = "DIST"
-        save_env["SAVE"] = "1"
-        save_env["MODEL_DIR"] = model_dir
-        save_env.update(required_envs)
-
-        tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env,
-                                                 check_error_log)
-
-        load_env = {}
-        load_env["LOAD"] = "1"
-        load_env["MODEL_DIR"] = model_dir
-        load_env.update(required_envs)
-        tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env,
-                                                 check_error_log)
-
-        shutil.rmtree(model_dir)
-
-        train0_1_np = np.array(tr0_var_1)
-        train1_1_np = np.array(tr1_var_1)
-        train0_2_np = np.array(tr0_var_2)
-        train1_2_np = np.array(tr1_var_2)
-
-        np.testing.assert_almost_equal(train0_1_np, train0_2_np, decimal=2)
-        np.testing.assert_almost_equal(train1_1_np, train1_2_np, decimal=2)
-
-    def test_dist(self):
-        need_envs = {
-            "IS_DISTRIBUTED": '0',
-            "IS_SPARSE": '0',
-            'IS_SELF_CONTAINED_LR': '1',
-            'SAVE_MODE': 'DIST',
-            'OPTIMIZER': 'ADAM',
-            'SKIP_STEPS': str(np.random.randint(2, 6))
-        }
-        self.check_with_place(
-            "dist_save_load.py",
-            delta=0,
-            check_error_log=False,
-            need_envs=need_envs)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_async.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_async.py
deleted file mode 100644
index 0d99728965abf7a90377d39bfb91cad752cfe303..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_async.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-import os
-
-
-def skip_ci(func):
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-
-    def __func__(*args, **kwargs):
-        if on_ci:
-            return
-        return func(*args, **kwargs)
-
-    return __func__
-
-
-class TestDistSeResneXt2x2Async(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._use_reader_alloc = False
-
-    @skip_ci
-    def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=100)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py
deleted file mode 100644
index 24ef0736a019fec434e06785252ede84f3e15d34..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-import os
-
-
-def skip_ci(func):
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-
-    def __func__(*args, **kwargs):
-        if on_ci:
-            return
-        return func(*args, **kwargs)
-
-    return __func__
-
-
-class TestDistSeResnetNCCL2DGC(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._use_dgc = True
-
-    @skip_ci
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_se_resnext.py", delta=30)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
deleted file mode 100644
index 3e55efb633d77bd9d93b890e60e844b55a4522ed..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-import os
-
-
-def skip_ci(func):
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-
-    def __func__(*args, **kwargs):
-        if on_ci:
-            return
-        return func(*args, **kwargs)
-
-    return __func__
-
-
-class TestDistSeResneXtNCCL(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-
-    @skip_ci
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_se_resnext.py", delta=1e-5)
-
-
-class TestDistSeResneXtNCCLMP(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._mp_mode = True
-
-    @skip_ci
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_se_resnext.py",
-                delta=1e-5,
-                check_error_log=True,
-                need_envs={"NCCL_P2P_DISABLE": "1"})
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py
deleted file mode 100644
index 23987f4eff42084f38059ca80f89753de08c04db..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-import os
-
-
-def skip_ci(func):
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-
-    def __func__(*args, **kwargs):
-        if on_ci:
-            return
-        return func(*args, **kwargs)
-
-    return __func__
-
-
-class TestDistSeResneXt2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reader_alloc = False
-
-    @skip_ci
-    def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=1e-7)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync_with_memopt.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync_with_memopt.py
deleted file mode 100644
index e39e07a58e8d9e816ce139566eab0f02d204c70d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync_with_memopt.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-import os
-
-
-def skip_ci(func):
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-
-    def __func__(*args, **kwargs):
-        if on_ci:
-            return
-        return func(*args, **kwargs)
-
-    return __func__
-
-
-class TestDistseResnXt2x2WithMemopt(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._mem_opt = True
-        self._use_reader_alloc = False
-
-    @skip_ci
-    def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=1e-7)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
deleted file mode 100644
index 30a7ec095e66acf1292fbb6602533d04bec9d5bf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import os
-import unittest
-
-from test_dist_base import TestDistBase
-
-
-class TestDistSimnetBowDense2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def test_simnet_bow(self):
-        need_envs = {
-            "IS_DISTRIBUTED": '0',
-            "IS_SPARSE": '0',
-            'IS_SELF_CONTAINED_LR': '1'
-        }
-        self.check_with_place(
-            "dist_simnet_bow.py",
-            delta=1e-5,
-            check_error_log=False,
-            need_envs=need_envs)
-
-
-class TestDistSimnetBow2x2DenseAsync(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._enforce_place = "CPU"
-
-    #FIXME(typhoonzero): fix async tests later
-    def notest_simnet_bow(self):
-        need_envs = {
-            "IS_DISTRIBUTED": '0',
-            "IS_SPARSE": '0',
-            'IS_SELF_CONTAINED_LR': '1',
-        }
-        self.check_with_place(
-            "dist_simnet_bow.py",
-            delta=100,
-            check_error_log=False,
-            need_envs=need_envs)
-
-
-class TestDistSimnetBowSparse2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def test_simnet_bow(self):
-        need_envs = {
-            "IS_DISTRIBUTED": '0',
-            "IS_SPARSE": '1',
-            'IS_SELF_CONTAINED_LR': '1'
-        }
-        self.check_with_place(
-            "dist_simnet_bow.py",
-            delta=1e-5,
-            check_error_log=False,
-            need_envs=need_envs)
-
-
-class TestDistSimnetBow2x2SparseAsync(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._enforce_place = "CPU"
-
-    def test_simnet_bow(self):
-        need_envs = {
-            "IS_DISTRIBUTED": '0',
-            "IS_SPARSE": '1',
-            'IS_SELF_CONTAINED_LR': '1'
-        }
-        self.check_with_place(
-            "dist_simnet_bow.py",
-            delta=100,
-            check_error_log=False,
-            need_envs=need_envs)
-
-
-# FIXME(tangwei): Learningrate variable is not created on pserver.
-class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def test_simnet_bow(self):
-        need_envs = {
-            "IS_DISTRIBUTED": '1',
-            "IS_SPARSE": '1',
-            'IS_SELF_CONTAINED_LR': '1'
-        }
-        self.check_with_place(
-            "dist_simnet_bow.py",
-            delta=1e-5,
-            check_error_log=True,
-            need_envs=need_envs)
-
-
-class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._enforce_place = "CPU"
-
-    def test_simnet_bow(self):
-        need_envs = {
-            "IS_DISTRIBUTED": '1',
-            "IS_SPARSE": '1',
-            'IS_SELF_CONTAINED_LR': '1'
-        }
-        self.check_with_place(
-            "dist_simnet_bow.py",
-            delta=100,
-            check_error_log=False,
-            need_envs=need_envs)
-
-
-class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def test_simnet_bow(self):
-        need_envs = {
-            "IS_DISTRIBUTED": '1',
-            "IS_SPARSE": '1',
-            'IS_SELF_CONTAINED_LR': '0'
-        }
-        self.check_with_place(
-            "dist_simnet_bow.py",
-            delta=1e-5,
-            check_error_log=False,
-            need_envs=need_envs)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
deleted file mode 100644
index 0c1680359e2b84807084b06eab0534b41ecd6133..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import os
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistTextClassification2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def test_text_classification(self):
-        self.check_with_place("dist_text_classification.py", delta=1e-6)
-
-
-class TestDistTextClassification2x2Async(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._enforce_place = "CPU"
-
-    def test_se_resnext(self):
-        self.check_with_place("dist_text_classification.py", delta=100)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
deleted file mode 100644
index e9f39f10904111cbf3b0b0b317362428c46b07bb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import time
-import unittest
-from multiprocessing import Process
-import signal
-
-import numpy
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid.layers.io import ListenAndServ
-from paddle.fluid.layers.io import Recv
-from paddle.fluid.layers.io import Send
-import paddle.fluid.layers.ops as ops
-from dist_test_utils import *
-
-from paddle.fluid import core
-
-RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
-)
-RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
-
-
-class TestSendOp(unittest.TestCase):
-    def test_send(self):
-        remove_ps_flag(os.getpid())
-        # Run init_serv in a thread
-        place = fluid.CPUPlace()
-        # NOTE: python thread will not work here due to GIL.
-        p = Process(target=self.init_serv, args=(place, ))
-        p.daemon = True
-        p.start()
-
-        self.ps_timeout = 5
-        self._wait_ps_ready(p.pid)
-
-        with open("/tmp/paddle.%d.port" % p.pid, "r") as fn:
-            selected_port = int(fn.readlines()[0])
-        self.init_client(place, selected_port)
-
-        self.run_local(place)
-        self.assertTrue(numpy.allclose(self.local_out, self.dist_out))
-
-        os.kill(p.pid, signal.SIGINT)
-        p.join()
-
-    def _wait_ps_ready(self, pid):
-        start_left_time = self.ps_timeout
-        sleep_time = 0.5
-        while True:
-            assert start_left_time >= 0, "wait ps ready failed"
-            time.sleep(sleep_time)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                start_left_time -= sleep_time
-
-    def init_serv(self, place):
-        main = fluid.Program()
-
-        with fluid.program_guard(main):
-            serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
-            with serv.do():
-                out_var = main.global_block().create_var(
-                    name="scale_0.tmp_0",
-                    psersistable=True,
-                    dtype="float32",
-                    shape=[32, 32])
-                x = layers.data(
-                    shape=[32, 32],
-                    dtype='float32',
-                    name="X",
-                    append_batch_size=False)
-                fluid.initializer.Constant(value=1.0)(x, main.global_block())
-                ops._scale(x=x, scale=10.0, out=out_var)
-
-        self.server_exe = fluid.Executor(place)
-        self.server_exe.run(main)
-
-    def init_client(self, place, port):
-        main = fluid.Program()
-        with fluid.program_guard(main):
-            main.global_block().append_op(
-                type="fetch_barrier",
-                inputs={},
-                outputs={"Out": []},
-                attrs={
-                    "endpoints": ["127.0.0.1:{0}".format(port)],
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-                })
-
-            x = layers.data(
-                shape=[32, 32],
-                dtype='float32',
-                name='X',
-                append_batch_size=False)
-            x.persistable = True
-            fluid.initializer.Constant(value=2.3)(x, main.global_block())
-
-            get_var = main.global_block().create_var(
-                name="scale_0.tmp_0",  # server side var
-                dtype="float32",
-                persistable=False,
-                shape=[32, 32])
-            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
-
-            # NOTE(zjl): `Send` is async send, which means that the sent 
-            # variable would be needed even though `Send` op runs. 
-            # Is it a right design? If I do not set `x.persistable = True`,
-            # this unittest would hang in rpc client after x is deleted. 
-            #
-            # BTW, `Send` is not a public API to users. So I set 
-            # `x.persistable = True` to be a hot fix of this unittest. 
-            Send("127.0.0.1:%d" % port, [x])
-            o = Recv("127.0.0.1:%d" % port, [get_var])
-
-        exe = fluid.Executor(place)
-        self.dist_out = exe.run(main, fetch_list=o)  # o is a list
-
-    def run_local(self, place):
-        main = fluid.Program()
-        with fluid.program_guard(main):
-            x = layers.data(
-                shape=[32, 32],
-                dtype='float32',
-                name='X',
-                append_batch_size=False)
-            fluid.initializer.Constant(value=2.3)(x, main.global_block())
-            o = layers.scale(x=x, scale=10.0)
-        exe = fluid.Executor(place)
-        self.local_out = exe.run(main, fetch_list=[o])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
deleted file mode 100644
index 3307caa8b2d62d5a31a7eeb36bb207b31d749b55..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import unittest
-import paddle
-from test_dist_base import TestDistBase
-
-
-def download_files():
-    url_prefix = 'http://paddle-unittest-data.bj.bcebos.com/dist_transformer/'
-    vocab_url = url_prefix + 'vocab.bpe.32000'
-    vocab_md5 = 'a86d345ca6e27f6591d0dccb1b9be853'
-    paddle.dataset.common.download(vocab_url, 'test_dist_transformer',
-                                   vocab_md5)
-
-    local_train_url = url_prefix + 'train.tok.clean.bpe.32000.en-de'
-    local_train_md5 = '033eb02b9449e6dd823f050782ac8914'
-    paddle.dataset.common.download(local_train_url, 'test_dist_transformer',
-                                   local_train_md5)
-
-    train0_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_0'
-    train0_md5 = 'ddce7f602f352a0405267285379a38b1'
-    paddle.dataset.common.download(train0_url, 'test_dist_transformer',
-                                   train0_md5)
-
-    train1_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_1'
-    train1_md5 = '8757798200180285b1a619cd7f408747'
-    paddle.dataset.common.download(train1_url, 'test_dist_transformer',
-                                   train1_md5)
-
-    test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de'
-    test_md5 = '9dd74a266dbdb25314183899f269b4a2'
-    paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5)
-    # cut test data for faster CI
-    orig_path = os.path.join(paddle.dataset.common.DATA_HOME,
-                             "test_dist_transformer",
-                             "newstest2013.tok.bpe.32000.en-de")
-    head_path = os.path.join(paddle.dataset.common.DATA_HOME,
-                             "test_dist_transformer",
-                             "newstest2013.tok.bpe.32000.en-de.cut")
-    os.system("head -n10 %s > %s" % (orig_path, head_path))
-
-
-class TestDistTransformer2x2Sync(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-
-    def test_dist_train(self):
-        download_files()
-        self.check_with_place(
-            "dist_transformer.py", delta=1e-5, check_error_log=False)
-
-
-class TestDistTransformer2x2Async(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-
-    def test_dist_train(self):
-        download_files()
-        self.check_with_place(
-            "dist_transformer.py", delta=1.0, check_error_log=False)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
deleted file mode 100644
index f81d4fda50be195b239fcb149382d997275405fd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ /dev/null
@@ -1,993 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import traceback
-import math
-import collections
-
-import six
-import unittest
-import numpy as np
-
-import gc
-gc.set_debug(gc.DEBUG_COLLECTABLE)
-
-import paddle.fluid as fluid
-
-
-class TranspilerTest(unittest.TestCase):
-    def setUp(self):
-        self.trainer_id = 0
-        self.trainers = 2
-        self.pservers = 2
-        # NOTE: we do not actually bind this port
-        self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
-        self.pserver1_ep = "127.0.0.1:6174"
-        self.pserver2_ep = "127.0.0.1:6175"
-        self.sync_mode = True
-        self.transpiler = None
-
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'),
-                                    bias_attr=fluid.ParamAttr(name='fc_b'))
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-        sgd_optimizer.minimize(avg_cost)
-
-    def get_main_program(self):
-        main = fluid.Program()
-        main.random_seed = 1
-        with fluid.program_guard(main):
-            self.net_conf()
-        self.origin_prog = main.clone()
-        return main
-
-    def get_trainer(self, config=None):
-        src = fluid.default_startup_program().clone()
-
-        t = self._transpiler_instance(config)
-
-        trainer_main = t.get_trainer_program(wait_port=False)
-        trainer_startup = fluid.default_startup_program()
-
-        assert (src.num_blocks == 1)
-        assert (trainer_startup.num_blocks == src.num_blocks)
-
-        return trainer_main, trainer_startup
-
-    def get_pserver(self, ep, config=None, sync_mode=True):
-        t = self._transpiler_instance(config, sync_mode)
-        pserver = t.get_pserver_program(ep)
-        startup = t.get_startup_program(ep, pserver)
-        return pserver, startup
-
-    def _transpiler_instance(self, config=None, sync_mode=True):
-        if not self.transpiler:
-            main = self.get_main_program()
-            self.transpiler = fluid.DistributeTranspiler(config=config)
-            self.transpiler.transpile(
-                self.trainer_id,
-                program=main,
-                pservers=self.pserver_eps,
-                trainers=self.trainers,
-                sync_mode=sync_mode)
-
-        return self.transpiler
-
-    def transpiler_test_impl(self):
-        pass
-
-    def test_transpiler(self):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.unique_name.guard():
-            with fluid.program_guard(main, startup):
-                self.transpiler_test_impl()
-        # NOTE: run gc.collect to eliminate pybind side objects to
-        # prevent random double-deallocate when inherited in python.
-        del self.transpiler
-        del main
-        del startup
-        gc.collect()
-
-
-class TestBasicModel(TranspilerTest):
-    def transpiler_test_impl(self):
-        pserver, startup = self.get_pserver(self.pserver1_ep)
-        pserver2, startup2 = self.get_pserver(self.pserver2_ep)
-
-        trainer, trainer_startup = self.get_trainer()
-
-        # splited var blocks should be in startup program
-        self.assertTrue("fc_w.block0" in trainer_startup.global_block().vars)
-        self.assertTrue("fc_w.block1" in trainer_startup.global_block().vars)
-        self.assertTrue("fc_w" in trainer_startup.global_block().vars)
-        self.assertTrue("fc_b" in trainer_startup.global_block().vars)
-        self.assertTrue("fc_w@GRAD" not in trainer_startup.global_block().vars)
-        self.assertTrue("fc_b@GRAD" not in trainer_startup.global_block().vars)
-
-        src = [op.type for op in trainer_startup.global_block().ops]
-        dst = ['fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv', \
-               'fetch_barrier', 'concat']
-
-        self.assertEqual(src, dst)
-
-        self.assertEqual([op.type for op in trainer.global_block().ops], [
-            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
-            'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad',
-            'elementwise_add_grad', 'send', 'mul_grad', 'split_byref', 'send',
-            'send_barrier', 'recv', 'recv', 'fetch_barrier', 'concat'
-        ])
-
-        self.assertEqual(len(pserver.blocks), 3)
-        # block0: listen_and_serv
-        self.assertEqual([op.type for op in pserver.blocks[0].ops],
-                         ["listen_and_serv"])
-        # block1~2: optimize pass
-        self.assertEqual([op.type for op in pserver.blocks[1].ops],
-                         ["sum", "scale", "sgd"])
-        # confirm startup program
-        self.assertEqual([op.type for op in startup.global_block().ops],
-                         ["fill_constant", "fill_constant", "uniform_random"])
-        # the variable #fc_w will be split into two blocks
-        fc_w_var = startup.global_block().var("fc_w.block1")
-        self.assertEqual(fc_w_var.shape, (500, 1000))
-        # all parameters should be optimized on pserver
-
-        pserver_params = []
-        for prog in [pserver, pserver2]:
-            for blk in prog.blocks:
-                for op in blk.ops:
-                    if "Param" in op.input_names:
-                        param_name = op.input("Param")[0]
-                        is_block_idx = param_name.find(".block")
-                        if is_block_idx != -1:
-                            origin_param_name = param_name[:is_block_idx]
-                        else:
-                            origin_param_name = param_name
-                        pserver_params.append(origin_param_name)
-        trainer_params = []
-        for op in self.origin_prog.global_block().ops:
-            if "Param" in op.input_names:
-                trainer_params.append(op.input("Param")[0])
-        self.assertEqual(set(pserver_params), set(trainer_params))
-
-
-class TestBasicModelWithLargeBlockSize(TranspilerTest):
-    def transpiler_test_impl(self):
-        config = fluid.DistributeTranspilerConfig()
-        config.min_block_size = 1048576
-
-        pserver, startup = self.get_pserver(self.pserver1_ep, config)
-        pserver2, startup2 = self.get_pserver(self.pserver2_ep, config)
-
-        trainer, _ = self.get_trainer(config)
-
-        self.assertEqual([op.type for op in trainer.global_block().ops], [
-            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
-            'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad',
-            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'send_barrier',
-            'recv', 'recv', 'fetch_barrier'
-        ])
-
-        self.assertEqual(len(pserver.blocks), 2)
-        # block0: listen_and_serv
-        self.assertEqual([op.type for op in pserver.blocks[0].ops],
-                         ["listen_and_serv"])
-        # block1~2: optimize pass
-        self.assertEqual([op.type for op in pserver.blocks[1].ops],
-                         ["sum", "scale", "sgd"])
-        # confirm startup program
-        self.assertEqual([op.type for op in startup.global_block().ops],
-                         ["fill_constant", "fill_constant"])
-        # the variable #fc_w will be split into two blocks
-        fc_w_var = startup2.global_block().var("fc_w")
-        self.assertEqual(fc_w_var.shape, (1000, 1000))
-        # all parameters should be optimized on pserver
-
-        pserver_params = []
-        for prog in [pserver, pserver2]:
-            for blk in prog.blocks:
-                for op in blk.ops:
-                    if "Param" in op.input_names:
-                        param_name = op.input("Param")[0]
-                        is_block_idx = param_name.find(".block")
-                        if is_block_idx != -1:
-                            origin_param_name = param_name[:is_block_idx]
-                        else:
-                            origin_param_name = param_name
-                        pserver_params.append(origin_param_name)
-        trainer_params = []
-        for op in self.origin_prog.global_block().ops:
-            if "Param" in op.input_names:
-                trainer_params.append(op.input("Param")[0])
-        self.assertEqual(set(pserver_params), set(trainer_params))
-
-
-class TestNoSliceVar(TranspilerTest):
-    def setUp(self):
-        super(TestNoSliceVar, self).setUp()
-
-    def transpiler_test_impl(self):
-        config = fluid.DistributeTranspilerConfig()
-        config.slice_var_up = False
-
-        _, startup = self.get_pserver(self.pserver1_ep, config)
-        _, startup2 = self.get_pserver(self.pserver2_ep, config)
-
-        if "fc_w" in startup.global_block().vars:
-            fc_w_var = startup.global_block().vars["fc_w"]
-        elif "fc_w" in startup2.global_block().vars:
-            fc_w_var = startup2.global_block().vars["fc_w"]
-
-        self.assertEqual(fc_w_var.shape, (1000, 1000))
-
-
-class TestLRDecay(TranspilerTest):
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'),
-                                    bias_attr=fluid.ParamAttr(name='fc_b'))
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        sgd_optimizer = fluid.optimizer.SGD(
-            learning_rate=fluid.layers.exponential_decay(
-                learning_rate=1.0,
-                decay_steps=2100,
-                decay_rate=0.1,
-                staircase=True))
-        sgd_optimizer.minimize(avg_cost)
-
-    def transpiler_test_impl(self):
-        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer, _ = self.get_trainer()
-
-        self.assertEqual(len(pserver.blocks), 4)
-        lr_decay_ops = [op.type for op in pserver.blocks[1].ops]
-        self.assertEqual(lr_decay_ops, [
-            "increment", "cast", "fill_constant", "elementwise_div", "floor",
-            "fill_constant", "elementwise_pow", "fill_constant",
-            "elementwise_mul"
-        ])
-
-
-class TestDecayedAdagrad(TranspilerTest):
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'),
-                                    bias_attr=fluid.ParamAttr(name='fc_b'))
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1)
-        opt.minimize(avg_cost)
-
-    def transpiler_test_impl(self):
-        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer, _ = self.get_trainer()
-
-
-class TestFtrl(TranspilerTest):
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'),
-                                    bias_attr=fluid.ParamAttr(name='fc_b'))
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        opt = fluid.optimizer.Ftrl(learning_rate=0.1)
-        opt.minimize(avg_cost)
-
-    def transpiler_test_impl(self):
-        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer, _ = self.get_trainer()
-
-
-class TestLRDecayConditional(TranspilerTest):
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'),
-                                    bias_attr=fluid.ParamAttr(name='fc_b'))
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        sgd_optimizer = fluid.optimizer.SGD(
-            learning_rate=fluid.layers.piecewise_decay([10000, 20000],
-                                                       [1.0, 0.5, 1.0]))
-        sgd_optimizer.minimize(avg_cost)
-
-    def transpiler_test_impl(self):
-        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer, _ = self.get_trainer()
-
-        serv_op = pserver.blocks[0].ops[0]
-        sub_blocks = []
-        optimize_blocks = []
-        for b in serv_op.all_attrs()["optimize_blocks"]:
-            optimize_blocks.append(b.idx)
-        for b in pserver.blocks:
-            if b.idx not in optimize_blocks:
-                sub_blocks.append(b.idx)
-
-        self.assertEqual(len(pserver.blocks), 7)
-        lr_decay_ops = [op.type for op in pserver.blocks[1].ops]
-        self.assertEqual(lr_decay_ops, [
-            "increment", "cast", "fill_constant", "fill_constant", "less_than",
-            "logical_not", "conditional_block", "fill_constant",
-            "fill_constant", "less_than", "logical_not", "logical_and",
-            "logical_and", "conditional_block", "fill_constant",
-            "conditional_block"
-        ])
-        # test the condition blocks
-        for b in sub_blocks:
-            if b == 0:
-                continue
-            block = pserver.blocks[b]
-            self.assertEqual([op.type for op in block.ops], ["assign"])
-
-
-class TestL2Decay(TranspilerTest):
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        y_predict = fluid.layers.fc(
-            input=x,
-            size=1000,
-            act=None,
-            param_attr=fluid.ParamAttr(
-                name='fc_w',
-                regularizer=fluid.regularizer.L2Decay(),
-                gradient_clip=fluid.clip.GradientClipByValue(0.1)),
-            bias_attr=fluid.ParamAttr(name='fc_b'))
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-        sgd_optimizer.minimize(avg_cost)
-
-    def transpiler_test_impl(self):
-        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer, _ = self.get_trainer()
-
-        self.assertEqual(len(pserver.blocks), 3)
-        self.assertEqual([op.type for op in pserver.blocks[1].ops],
-                         ["sum", "scale", "clip", "sgd"])
-        self.assertEqual([op.type for op in pserver.blocks[2].ops],
-                         ["sum", "scale", "clip", "scale", "sum", "sgd"])
-        # TODO(typhoonzero): test clipping and L2Decay ops are removed from trainer
-
-
-class TestL2DecayWithPiecewise(TranspilerTest):
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'),
-                                    bias_attr=fluid.ParamAttr(name='fc_b'))
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        base_lr = 1.0
-        bd = [1, 10, 20, 30]
-        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        sgd_optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.piecewise_decay(
-                boundaries=bd, values=lr),
-            momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4))
-        sgd_optimizer.minimize(avg_cost)
-
-    def transpiler_test_impl(self):
-        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer, _ = self.get_trainer()
-
-        self.assertEqual(len(pserver.blocks), 9)
-        self.assertEqual([op.type for op in pserver.blocks[1].ops], [
-            "increment", "cast", "fill_constant", "fill_constant", "less_than",
-            "logical_not", "conditional_block", "fill_constant",
-            "fill_constant", "less_than", "logical_not", "logical_and",
-            "logical_and", "conditional_block", "fill_constant",
-            "fill_constant", "less_than", "logical_not", "logical_and",
-            "logical_and", "conditional_block", "fill_constant",
-            "fill_constant", "less_than", "logical_not", "logical_and",
-            "logical_and", "conditional_block", "fill_constant",
-            "conditional_block"
-        ])
-        self.assertEqual([op.type for op in pserver.blocks[7].ops],
-                         ["sum", "scale", "scale", "sum", "momentum"])
-        self.assertEqual([op.type for op in pserver.blocks[8].ops],
-                         ["sum", "scale", "scale", "sum", "momentum"])
-
-
-class TestEmptyPserverOptimizeBlocks(TranspilerTest):
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        # only one parameter
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'),
-                                    bias_attr=False)
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0)
-        sgd_optimizer.minimize(avg_cost)
-
-    def transpiler_test_impl(self):
-        config = fluid.DistributeTranspilerConfig()
-        config.slice_var_up = False
-
-        pserver, startup = self.get_pserver(ep=self.pserver2_ep, config=config)
-
-        self.assertEqual(len(pserver.blocks), 2)
-        self.assertEqual(len(pserver.blocks[1].ops), 0)
-
-
-class TestDistLookupTableBase(TranspilerTest):
-    def network_with_table(self, is_sparse, is_distributed):
-        self.table_size = 1000
-        self.emb_size = 64
-        self.lookup_table_name = 'shared_w'
-
-        def emb_pool(ids, table_name, is_distributed):
-            emb = fluid.layers.embedding(
-                input=ids,
-                size=[self.table_size, self.emb_size],
-                dtype='float32',
-                param_attr=table_name,
-                is_sparse=is_sparse,
-                is_distributed=is_distributed)
-            pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
-            return pool
-
-        title_ids = fluid.layers.data(
-            name='title_ids', shape=[1], dtype='int64', lod_level=1)
-        brand_ids = fluid.layers.data(
-            name='brand_ids', shape=[1], dtype='int64', lod_level=1)
-        profile_ids = fluid.layers.data(
-            name='brand_ids', shape=[1], dtype='int64', lod_level=1)
-        title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed)
-        brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed)
-        profile_emb = emb_pool(profile_ids, "profile_emb", False)
-        fc0 = fluid.layers.concat(
-            input=[title_emb, brand_emb, profile_emb], axis=1)
-        predict = fluid.layers.fc(input=fc0,
-                                  size=2,
-                                  act=None,
-                                  param_attr=fluid.ParamAttr(name='fc_w'),
-                                  bias_attr=fluid.ParamAttr(name='fc_b'))
-
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(cost)
-        optimizer = fluid.optimizer.Adam(learning_rate=0.003)
-        optimizer.minimize(avg_cost)
-
-
-class TestLocalLookupTable(TestDistLookupTableBase):
-    def net_conf(self):
-        self.network_with_table(is_sparse=True, is_distributed=False)
-
-    def transpiler_test_impl(self):
-        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
-
-        self.assertEqual(len(pserver1.blocks), 4)
-        # 0 listen_and_serv
-        # 1 optimize for fc_w or fc_b adam
-        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-        # 2 optimize for table adam
-        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
-        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-
-        # 3 optimize for table 2 adam
-        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-
-        trainer, _ = self.get_trainer()
-        self.assertEqual(len(trainer.blocks), 1)
-        ops = [
-            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
-            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
-            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'split_selected_rows', 'send', 'sequence_pool_grad',
-            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
-            'recv', 'fetch_barrier'
-        ]
-        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
-
-
-class TestDistLookupTable(TestDistLookupTableBase):
-    def net_conf(self):
-        self.network_with_table(is_sparse=True, is_distributed=True)
-
-    def transpiler_test_impl(self):
-        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
-
-        self.assertEqual(len(pserver1.blocks), 6)
-        # 0 listen_and_serv
-        # 1 optimize for fc_w or fc_b adam
-        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-        # 4 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-        # 2 optimize for table sgd
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
-                         ["sum", "sgd"])
-        # 3 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
-                         ["lookup_sparse_table"])
-        # 5 save table
-        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
-
-        trainer, trainer_startup = self.get_trainer()
-        self.assertEqual(len(trainer.blocks), 1)
-        ops = [
-            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
-            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
-            'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
-            'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
-            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'split_selected_rows', 'send',
-            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier',
-            'recv', 'recv', 'fetch_barrier'
-        ]
-        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
-        startup_ops = [
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'uniform_random',
-            'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
-            'fake_init'
-        ]
-        self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
-                         startup_ops)
-
-
-class TestAsyncLocalLookupTable(TestDistLookupTableBase):
-    def net_conf(self):
-        self.network_with_table(is_sparse=True, is_distributed=False)
-
-    def transpiler_test_impl(self):
-        config = fluid.DistributeTranspilerConfig()
-        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
-
-        self.assertEqual(len(pserver1.blocks), 4)
-        # 0 listen_and_serv
-        # 1 optimize for fc_w or fc_b adam
-        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
-                         ["adam", "scale", "scale"])
-        # 2 optimize for table adam
-        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
-        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
-                         ["adam", "scale", "scale"])
-        # 3 optimize for table adam
-        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
-                         ["adam", "scale", "scale"])
-
-        trainer, _ = self.get_trainer(config)
-        self.assertEqual(len(trainer.blocks), 1)
-        ops = [
-            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
-            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
-            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'split_selected_rows', 'send', 'sequence_pool_grad',
-            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'sum', 'split_selected_rows', 'send', 'recv', 'recv'
-        ]
-        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
-
-
-class TestAsyncDistLookupTable(TestDistLookupTableBase):
-    def net_conf(self):
-        self.network_with_table(is_sparse=True, is_distributed=True)
-
-    def transpiler_test_impl(self):
-        config = fluid.DistributeTranspilerConfig()
-
-        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
-
-        self.assertEqual(len(pserver1.blocks), 6)
-        # 0 listen_and_serv
-        # 1 optimize for fc_w or fc_b adam
-        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
-                         ["adam", "scale", "scale"])
-        # 2 optimize for table adam
-        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
-                         ["adam", "scale", "scale"])
-        # 3 optimize for table sgd
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops], ["sgd"])
-        # 4 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
-                         ["lookup_sparse_table"])
-        # 5 save table
-        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
-
-        trainer, trainer_startup = self.get_trainer(config)
-        self.assertEqual(len(trainer.blocks), 1)
-        ops = [
-            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
-            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
-            'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
-            'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
-            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'split_selected_rows', 'send',
-            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv'
-        ]
-        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
-        startup_ops = [
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'uniform_random',
-            'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
-            'fake_init'
-        ]
-        self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
-                         startup_ops)
-
-
-class TestDistLookupTableSliceSize(TestDistLookupTableBase):
-    def net_conf(self):
-        self.network_with_table(is_sparse=True, is_distributed=True)
-
-    def transpiler_test_impl(self):
-        config = fluid.DistributeTranspilerConfig()
-        pserver1, _ = self.get_pserver(self.pserver1_ep, config)
-
-        self.assertTrue(self.transpiler.has_distributed_lookup_table)
-        lookup_table_var = pserver1.global_block().vars[
-            self.transpiler.table_name]
-        row_size = lookup_table_var.shape[0]
-        calc_row_size = int(math.ceil(self.table_size / self.pservers))
-        self.assertEqual(row_size, calc_row_size)
-
-
-class TestDistArgsInProgram(TestDistLookupTableBase):
-    def net_conf(self):
-        self.network_with_table(is_sparse=True, is_distributed=True)
-
-    def transpiler_test_impl(self):
-        trainer, _ = self.get_trainer()
-
-        self.assertTrue(trainer._is_distributed)
-        self.assertTrue(trainer._is_chief)
-        self.assertEqual(trainer._distributed_lookup_table,
-                         self.lookup_table_name)
-        self.assertEqual(trainer._endpoints,
-                         [self.pserver1_ep, self.pserver2_ep])
-
-
-class TestRMSPropOptimizer(TranspilerTest):
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'),
-                                    bias_attr=fluid.ParamAttr(name='fc_b'))
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
-        optimizer.minimize(avg_cost)
-
-    def transpiler_test_impl(self):
-        pserver, startup = self.get_pserver(self.pserver1_ep)
-        pserver2, startup2 = self.get_pserver(self.pserver2_ep)
-
-        self.assertEqual(len(pserver.blocks), 3)
-        # block1~2: optimize pass
-        self.assertEqual([op.type for op in pserver.blocks[1].ops],
-                         ["sum", "scale", "rmsprop"])
-        # the variable #fc_w will be split into two blocks
-        fc_w_var = startup.global_block().var("fc_w.block1")
-        self.assertEqual(fc_w_var.shape, (500, 1000))
-        moment_var = startup.global_block().var("momentum_1")
-        self.assertEqual(moment_var.shape, (500, 1000))
-
-
-class TestLoadSliceVar(TranspilerTest):
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'),
-                                    bias_attr=fluid.ParamAttr(name='fc_b'))
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
-        optimizer.minimize(avg_cost)
-
-    def transpiler_test_impl(self):
-        pserver, _ = self.get_pserver(self.pserver1_ep)
-        pserver2, _ = self.get_pserver(self.pserver2_ep)
-
-        vars_ps1 = pserver._parameters_on_pservers.get_distributed_vars_by_ep(
-            self.pserver1_ep)
-        vars_ps2 = pserver._parameters_on_pservers.get_distributed_vars_by_ep(
-            self.pserver2_ep)
-
-        self.assertTrue(vars_ps1)
-        self.assertTrue(vars_ps2)
-
-        for idx in six.moves.xrange(len(vars_ps1)):
-            total_numel = 0
-            ps1_numel, ps2_numel = 0, 0
-
-            ps1_var = vars_ps1[idx]
-
-            if not ps1_var.is_slice:
-                total_numel = six.moves.reduce(lambda x, y: x * y,
-                                               vars_ps1[idx].origin.shape)
-                ps1_numel = six.moves.reduce(lambda x, y: x * y,
-                                             vars_ps1[idx].slice.shape)
-            else:
-                ps2_var = None
-                for var in vars_ps2:
-                    if var.origin.name == ps1_var.origin.name:
-                        ps2_var = var
-                        break
-
-                total_numel = six.moves.reduce(lambda x, y: x * y,
-                                               ps1_var.origin.shape)
-                ps1_numel = six.moves.reduce(lambda x, y: x * y,
-                                             ps1_var.slice.shape)
-                ps2_numel = six.moves.reduce(lambda x, y: x * y,
-                                             ps2_var.slice.shape)
-
-            self.assertEqual(total_numel, ps1_numel + ps2_numel)
-
-
-class TestNCCL2Transpile(TranspilerTest):
-    def test_nccl2_transpile(self):
-        if fluid.core.is_compiled_with_cuda():  #test nccl2 only with cuda
-            main = fluid.Program()
-            startup = fluid.Program()
-            with fluid.program_guard(main, startup):
-                self.net_conf()
-
-            config = fluid.DistributeTranspilerConfig()
-            config.mode = "nccl2"
-            config.wait_port = False
-            t = fluid.DistributeTranspiler(config=config)
-            t.transpile(
-                0,
-                trainers="127.0.0.1:6174,127.0.0.1:6175",
-                current_endpoint="127.0.0.1:6174",
-                startup_program=startup)
-            print([op.type for op in startup.global_block().ops])
-            self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id")
-            self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
-            gc.collect()
-        else:
-            pass
-
-
-# test for remote prefetch
-class TestRemoteLookupTable(TestDistLookupTableBase):
-    def net_conf(self):
-        import os
-        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
-        self.network_with_table(is_sparse=True, is_distributed=False)
-
-    def transpiler_test_impl(self):
-        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
-
-        self.assertEqual(len(pserver1.blocks), 4)
-        # 0 listen_and_serv
-        # 1 optimize for fc_w or fc_b adam
-        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-        # 2 optimize for table adam
-        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
-        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-
-        # 3 optimize for table 2 adam
-        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-
-        trainer, _ = self.get_trainer()
-        self.assertEqual(len(trainer.blocks), 1)
-        ops = [
-            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
-            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
-            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'split_selected_rows', 'send', 'sequence_pool_grad',
-            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
-            'recv', 'fetch_barrier'
-        ]
-        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
-
-
-# test for remote prefetch
-class TestRemoteNce(TestDistLookupTableBase):
-    def network_with_table(self, is_sparse, is_distributed):
-
-        num_total_classes = 20
-        sampler = "uniform"
-        nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
-
-        input = fluid.layers.data(name="input", shape=[10], dtype="float32")
-        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-        w_param = fluid.default_main_program().global_block().create_parameter(
-            shape=[num_total_classes, 10],
-            dtype='float32',
-            name='nce_w',
-            initializer=fluid.initializer.ConstantInitializer())
-        b_param = fluid.default_main_program().global_block().create_parameter(
-            shape=[num_total_classes, 1],
-            dtype='float32',
-            name='nce_b',
-            initializer=fluid.initializer.ConstantInitializer())
-
-        cost = fluid.layers.nce(input=input,
-                                label=label,
-                                num_total_classes=num_total_classes,
-                                sampler=sampler,
-                                custom_dist=nid_freq_arr.tolist(),
-                                sample_weight=None,
-                                param_attr='nce_w',
-                                bias_attr='nce_b',
-                                seed=1,
-                                num_neg_samples=5,
-                                is_sparse=is_sparse)
-        avg_cost = fluid.layers.mean(cost)
-        # optimizer
-        optimizer = fluid.optimizer.Adam(learning_rate=0.003)
-        optimizer.minimize(avg_cost)
-
-    def net_conf(self):
-        import os
-        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
-        self.network_with_table(is_sparse=True, is_distributed=False)
-
-    def transpiler_test_impl(self):
-        trainer, _ = self.get_trainer()
-
-        out_vars = ["nce_w"]
-        in_vars = ["nce_b"]
-
-        recv_var_names = []
-
-        for op in trainer.blocks[0].ops:
-            if op.type == "recv":
-                for var in op.output("Out"):
-                    recv_var_names.append(var)
-
-        for out_var in out_vars:
-            self.assertFalse(out_var in recv_var_names)
-        for in_var in in_vars:
-            self.assertTrue(in_var in recv_var_names)
-
-
-# test for remote prefetch
-class TestRemoteHsigmoid(TestDistLookupTableBase):
-    def network_with_table(self, is_sparse, is_distributed):
-
-        num_total_classes = 3
-
-        input = fluid.layers.data(name="input", shape=[1], dtype="float32")
-        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-        path_table = fluid.layers.data(
-            name='path_table', shape=[3], dtype='int64')
-        path_code = fluid.layers.data(
-            name='path_code', shape=[3], dtype='int64')
-        w_param = fluid.default_main_program().global_block().create_parameter(
-            shape=[num_total_classes, 10],
-            dtype='float32',
-            name='hs_w',
-            initializer=fluid.initializer.ConstantInitializer())
-        b_param = fluid.default_main_program().global_block().create_parameter(
-            shape=[3, 1],
-            dtype='float32',
-            name='hs_b',
-            initializer=fluid.initializer.ConstantInitializer())
-
-        emb = fluid.layers.embedding(
-            input=input,
-            is_sparse=is_sparse,
-            size=[3, 3],
-            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
-                scale=1 / math.sqrt(num_total_classes))))
-
-        cost = fluid.layers.hsigmoid(
-            input=emb,
-            label=label,
-            num_classes=num_total_classes,
-            path_table=path_table,
-            path_code=path_code,
-            is_custom=True,
-            is_sparse=is_sparse)
-        avg_cost = fluid.layers.mean(cost)
-        # optimizer
-        optimizer = fluid.optimizer.SGD(learning_rate=0.003)
-        optimizer.minimize(avg_cost)
-
-    def net_conf(self):
-        import os
-        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
-        self.network_with_table(is_sparse=True, is_distributed=False)
-
-    def transpiler_test_impl(self):
-        trainer, _ = self.get_trainer()
-        params_to_check = list()
-        for op in trainer.blocks[0].ops:
-            if op.type == "hierarchical_sigmoid":
-                params_to_check = [op.input("W")[0], op.input("Bias")[0]]
-                for name in ["epmap", "table_names", "epmap"]:
-                    assert op.has_attr(name)
-                    if name == "epmap":
-                        assert op.attr(name)[0] == u'127.0.0.1:6174'
-                    elif name == "table_names":
-                        assert op.attr(name)[0] == u'hierarchical_sigmoid_0.w_0'
-                    else:
-                        assert op.attr(name) == 3
-            elif op.type == "lookup_table":
-                params_to_check.append(op.input("W")[0])
-            else:
-                pass
-        op_count = 0
-        for op in trainer.blocks[0].ops:
-            if op.type == "recv":
-                assert len(op.output("Out")) == 1
-                assert op.output("Out")[0] == u'hierarchical_sigmoid_0.b_0'
-                op_count += 1
-        assert op_count == 1
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
deleted file mode 100644
index 4553cb0ffd7038860d49aa04b1c111e91d9f895f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistW2V2x2(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._enforce_place = "CPU"
-
-    def test_dist_train(self):
-        self.check_with_place("dist_word2vec.py", delta=1e-4)
-
-
-class TestDistW2V2x2WithMemOpt(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._mem_opt = True
-        self._enforce_place = "CPU"
-
-    def test_dist_train(self):
-        self.check_with_place("dist_word2vec.py", delta=1e-4)
-
-
-class TestDistW2V2x2Async(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._enforce_place = "CPU"
-
-    def test_dist_train(self):
-        self.check_with_place("dist_word2vec.py", delta=100)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
deleted file mode 100644
index 55b21f1a722f822f1bfcb7bbbda645109092b8a3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
+++ /dev/null
@@ -1,121 +0,0 @@
-#    Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-import sys
-from op_test import OpTest
-
-
-class TestDistributeFPNProposalsOp(OpTest):
-    def set_data(self):
-        self.init_test_case()
-        self.make_rois()
-        self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute()
-        self.inputs = {'FpnRois': (self.rois[:, 1:5], self.rois_lod)}
-        self.attrs = {
-            'max_level': self.roi_max_level,
-            'min_level': self.roi_min_level,
-            'refer_scale': self.canonical_scale,
-            'refer_level': self.canonical_level
-        }
-        output = [('out%d' % i, self.rois_fpn[i])
-                  for i in range(len(self.rois_fpn))]
-        self.outputs = {
-            'MultiFpnRois': output,
-            'RestoreIndex': self.rois_idx_restore.reshape(-1, 1)
-        }
-
-    def init_test_case(self):
-        self.roi_max_level = 5
-        self.roi_min_level = 2
-        self.canonical_scale = 224
-        self.canonical_level = 4
-        self.images_shape = [512, 512]
-
-    def boxes_area(self, boxes):
-        w = (boxes[:, 2] - boxes[:, 0] + 1)
-        h = (boxes[:, 3] - boxes[:, 1] + 1)
-        areas = w * h
-        assert np.all(areas >= 0), 'Negative areas founds'
-        return areas
-
-    def map_rois_to_fpn_levels(self, rois, lvl_min, lvl_max):
-        s = np.sqrt(self.boxes_area(rois))
-        s0 = self.canonical_scale
-        lvl0 = self.canonical_level
-        target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
-        target_lvls = np.clip(target_lvls, lvl_min, lvl_max)
-        return target_lvls
-
-    def get_sub_lod(self, sub_lvl):
-        sub_lod = [0, 0]
-        max_batch_id = sub_lvl[-1]
-        for i in range(max_batch_id.astype(np.int32) + 1):
-            sub_lod[i] = np.where(sub_lvl == i)[0].size
-        return sub_lod
-
-    def add_multilevel_roi(self, rois, target_lvls, lvl_min, lvl_max):
-        rois_idx_order = np.empty((0, ))
-        rois_fpn = []
-        for lvl in range(lvl_min, lvl_max + 1):
-            idx_lvl = np.where(target_lvls == lvl)[0]
-            if len(idx_lvl) == 0:
-                rois_fpn.append((np.empty(shape=(0, 4)), [[0, 0]]))
-                continue
-            sub_lod = self.get_sub_lod(rois[idx_lvl, 0])
-            rois_fpn.append((rois[idx_lvl, 1:], [sub_lod]))
-            rois_idx_order = np.concatenate((rois_idx_order, idx_lvl))
-        rois_idx_restore = np.argsort(rois_idx_order).astype(
-            np.int32, copy=False)
-        return rois_fpn, rois_idx_restore
-
-    def calc_rois_distribute(self):
-        lvl_min = self.roi_min_level
-        lvl_max = self.roi_max_level
-        target_lvls = self.map_rois_to_fpn_levels(self.rois[:, 1:5], lvl_min,
-                                                  lvl_max)
-        rois_fpn, rois_idx_restore = self.add_multilevel_roi(
-            self.rois, target_lvls, lvl_min, lvl_max)
-        return rois_fpn, rois_idx_restore
-
-    def make_rois(self):
-        self.rois_lod = [[100, 200]]
-        rois = []
-        lod = self.rois_lod[0]
-        bno = 0
-        for roi_num in lod:
-            for i in range(roi_num):
-                xywh = np.random.rand(4)
-                xy1 = xywh[0:2] * 20
-                wh = xywh[2:4] * (self.images_shape - xy1)
-                xy2 = xy1 + wh
-                roi = [bno, xy1[0], xy1[1], xy2[0], xy2[1]]
-                rois.append(roi)
-            bno += 1
-        self.rois = np.array(rois).astype("float32")
-
-    def setUp(self):
-        self.op_type = "distribute_fpn_proposals"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_distributions.py b/python/paddle/fluid/tests/unittests/test_distributions.py
deleted file mode 100644
index 3de9c10e6d96a4c406bd4ee0f82987ab9d5caf7a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_distributions.py
+++ /dev/null
@@ -1,578 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import unittest
-from paddle import fluid
-from paddle.fluid import layers
-from paddle.fluid.layers.distributions import *
-import math
-
-
-class DistributionNumpy():
-    """
-        Distribution is the abstract base class for probability distributions.
-    """
-
-    def sample(self):
-        """Sampling from the distribution."""
-        raise NotImplementedError
-
-    def entropy(self):
-        """The entropy of the distribution."""
-        raise NotImplementedError
-
-    def kl_divergence(self, other):
-        """The KL-divergence between self distributions and other."""
-        raise NotImplementedError
-
-    def log_prob(self, value):
-        """Log probability density/mass function."""
-        raise NotImplementedError
-
-
-class UniformNumpy(DistributionNumpy):
-    def __init__(self, low, high):
-        self.low = np.array(low).astype('float32')
-        self.high = np.array(high).astype('float32')
-
-    def sample(self, shape):
-        shape = tuple(shape) + (self.low + self.high).shape
-        return self.low + (np.random.uniform(size=shape) *
-                           (self.high - self.low))
-
-    def log_prob(self, value):
-        lb = np.less(self.low, value).astype('float32')
-        ub = np.less(value, self.high).astype('float32')
-        return np.log(lb * ub) - np.log(self.high - self.low)
-
-    def entropy(self):
-        return np.log(self.high - self.low)
-
-
-class NormalNumpy(DistributionNumpy):
-    def __init__(self, loc, scale):
-        self.loc = np.array(loc).astype('float32')
-        self.scale = np.array(scale).astype('float32')
-
-    def sample(self, shape):
-        shape = tuple(shape) + (self.loc + self.scale).shape
-        return self.loc + (np.random.randn(*shape) * self.scale)
-
-    def log_prob(self, value):
-        var = self.scale * self.scale
-        log_scale = np.log(self.scale)
-        return -((value - self.loc) * (value - self.loc)) / (
-            2. * var) - log_scale - math.log(math.sqrt(2. * math.pi))
-
-    def entropy(self):
-        return 0.5 + 0.5 * np.log(np.array(2. * math.pi).astype(
-            'float32')) + np.log(self.scale)
-
-    def kl_divergence(self, other):
-        var_ratio = (self.scale / other.scale)
-        var_ratio = var_ratio * var_ratio
-        t1 = ((self.loc - other.loc) / other.scale)
-        t1 = (t1 * t1)
-        return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio))
-
-
-class CategoricalNumpy(DistributionNumpy):
-    def __init__(self, logits):
-        self.logits = np.array(logits).astype('float32')
-
-    def entropy(self):
-        logits = self.logits - np.max(self.logits, axis=-1, keepdims=True)
-        e_logits = np.exp(logits)
-        z = np.sum(e_logits, axis=-1, keepdims=True)
-        prob = e_logits / z
-        return -1. * np.sum(prob * (logits - np.log(z)), axis=-1, keepdims=True)
-
-    def kl_divergence(self, other):
-        logits = self.logits - np.max(self.logits, axis=-1, keepdims=True)
-        other_logits = other.logits - np.max(
-            other.logits, axis=-1, keepdims=True)
-        e_logits = np.exp(logits)
-        other_e_logits = np.exp(other_logits)
-        z = np.sum(e_logits, axis=-1, keepdims=True)
-        other_z = np.sum(other_e_logits, axis=-1, keepdims=True)
-        prob = e_logits / z
-        return np.sum(prob * (logits - np.log(z) - other_logits \
-            + np.log(other_z)), axis=-1, keepdims=True)
-
-
-class MultivariateNormalDiagNumpy(DistributionNumpy):
-    def __init__(self, loc, scale):
-        self.loc = np.array(loc).astype('float32')
-        self.scale = np.array(scale).astype('float32')
-
-    def _det(self, value):
-        batch_shape = list(value.shape)
-        one_all = np.ones(shape=batch_shape, dtype='float32')
-        one_diag = np.eye(batch_shape[0], dtype='float32')
-        det_diag = np.prod(value + one_all - one_diag)
-
-        return det_diag
-
-    def _inv(self, value):
-        batch_shape = list(value.shape)
-        one_all = np.ones(shape=batch_shape, dtype='float32')
-        one_diag = np.eye(batch_shape[0], dtype='float32')
-        inv_diag = np.power(value, (one_all - 2 * one_diag))
-
-        return inv_diag
-
-    def entropy(self):
-        return 0.5 * (self.scale.shape[0] *
-                      (1.0 + np.log(np.array(2 * math.pi).astype('float32'))
-                       ) + np.log(self._det(self.scale)))
-
-    def kl_divergence(self, other):
-        tr_cov_matmul = np.sum(self._inv(other.scale) * self.scale)
-        loc_matmul_cov = np.matmul((other.loc - self.loc),
-                                   self._inv(other.scale))
-        tri_matmul = np.matmul(loc_matmul_cov, (other.loc - self.loc))
-        k = list(self.scale.shape)[0]
-        ln_cov = np.log(self._det(other.scale)) - np.log(self._det(self.scale))
-        kl = 0.5 * (tr_cov_matmul + tri_matmul - k + ln_cov)
-
-        return kl
-
-
-class DistributionTest(unittest.TestCase):
-    def setUp(self, use_gpu=False):
-        self.use_gpu = use_gpu
-        if not use_gpu:
-            place = fluid.CPUPlace()
-            self.gpu_id = -1
-        else:
-            place = fluid.CUDAPlace(0)
-            self.gpu_id = 0
-        self.executor = fluid.Executor(place)
-
-    def build_normal_program(self, test_program, batch_size, dims, loc_float,
-                             scale_float, other_loc_float, other_scale_float,
-                             scale_np, other_scale_np, loc_np, other_loc_np,
-                             values_np):
-        with fluid.program_guard(test_program):
-            loc = layers.data(name='loc', shape=[dims], dtype='float32')
-            scale = layers.data(name='scale', shape=[dims], dtype='float32')
-
-            other_loc = layers.data(
-                name='other_loc', shape=[dims], dtype='float32')
-            other_scale = layers.data(
-                name='other_scale', shape=[dims], dtype='float32')
-
-            values = layers.data(name='values', shape=[dims], dtype='float32')
-
-            normal_float = Normal(loc_float, scale_float)
-            other_normal_float = Normal(other_loc_float, other_scale_float)
-
-            normal_float_np_broadcast = Normal(loc_float, scale_np)
-            other_normal_float_np_broadcast = Normal(other_loc_float,
-                                                     other_scale_np)
-
-            normal_np = Normal(loc_np, scale_np)
-            other_normal_np = Normal(other_loc_np, other_scale_np)
-
-            normal_variable = Normal(loc, scale)
-            other_normal_variable = Normal(other_loc, other_scale)
-
-            sample_float = normal_float.sample([batch_size, dims])
-            sample_float_np_broadcast = normal_float_np_broadcast.sample(
-                [batch_size, dims])
-            sample_np = normal_np.sample([batch_size, dims])
-            sample_variable = normal_variable.sample([batch_size, dims])
-
-            entropy_float = normal_float.entropy()
-            entropy_float_np_broadcast = normal_float_np_broadcast.entropy()
-            entropy_np = normal_np.entropy()
-            entropy_variable = normal_variable.entropy()
-
-            lp_float_np_broadcast = normal_float_np_broadcast.log_prob(values)
-            lp_np = normal_np.log_prob(values)
-            lp_variable = normal_variable.log_prob(values)
-
-            kl_float = normal_float.kl_divergence(other_normal_float)
-            kl_float_np_broadcast = normal_float_np_broadcast.kl_divergence(
-                other_normal_float_np_broadcast)
-            kl_np = normal_np.kl_divergence(other_normal_np)
-            kl_variable = normal_variable.kl_divergence(other_normal_variable)
-
-        fetch_list = [
-            sample_float, sample_float_np_broadcast, sample_np, sample_variable,
-            entropy_float, entropy_float_np_broadcast, entropy_np,
-            entropy_variable, lp_float_np_broadcast, lp_np, lp_variable,
-            kl_float, kl_float_np_broadcast, kl_np, kl_variable
-        ]
-        feed_vars = {
-            'loc': loc_np,
-            'scale': scale_np,
-            'other_loc': other_loc_np,
-            'other_scale': other_scale_np,
-            'values': values_np
-        }
-        return feed_vars, fetch_list
-
-    def get_normal_random_input(self, batch_size, dims):
-        loc_np = np.random.randn(batch_size, dims).astype('float32')
-        other_loc_np = np.random.randn(batch_size, dims).astype('float32')
-
-        loc_float = (np.random.ranf() - 0.5) * 4
-        scale_float = (np.random.ranf() - 0.5) * 4
-        while scale_float < 0:
-            scale_float = (np.random.ranf() - 0.5) * 4
-
-        other_loc_float = (np.random.ranf() - 0.5) * 4
-        other_scale_float = (np.random.ranf() - 0.5) * 4
-        while other_scale_float < 0:
-            other_scale_float = (np.random.ranf() - 0.5) * 4
-
-        scale_np = np.random.randn(batch_size, dims).astype('float32')
-        other_scale_np = np.random.randn(batch_size, dims).astype('float32')
-        values_np = np.random.randn(batch_size, dims).astype('float32')
-
-        while not np.all(scale_np > 0):
-            scale_np = np.random.randn(batch_size, dims).astype('float32')
-        while not np.all(other_scale_np > 0):
-            other_scale_np = np.random.randn(batch_size, dims).astype('float32')
-        return loc_np, other_loc_np, loc_float, scale_float, other_loc_float, \
-               other_scale_float, scale_np, other_scale_np, values_np
-
-    def test_normal_distribution(self, batch_size=2, dims=3, tolerance=1e-6):
-        test_program = fluid.Program()
-        loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = self.get_normal_random_input(
-            batch_size, dims)
-
-        feed_vars, fetch_list = self.build_normal_program(
-            test_program, batch_size, dims, loc_float, scale_float,
-            other_loc_float, other_scale_float, scale_np, other_scale_np,
-            loc_np, other_loc_np, values_np)
-        self.executor.run(fluid.default_startup_program())
-
-        np_normal_float = NormalNumpy(loc_float, scale_float)
-        np_other_normal_float = NormalNumpy(other_loc_float, other_scale_float)
-        np_normal_float_np_broadcast = NormalNumpy(loc_float, scale_np)
-        np_other_normal_float_np_broadcast = NormalNumpy(other_loc_float,
-                                                         other_scale_np)
-        np_normal = NormalNumpy(loc_np, scale_np)
-        np_other_normal = NormalNumpy(other_loc_np, other_scale_np)
-
-        gt_sample_float = np_normal_float.sample([batch_size, dims])
-        gt_sample_float_np_broadcast = np_normal_float_np_broadcast.sample(
-            [batch_size, dims])
-        gt_sample_np = np_normal.sample([batch_size, dims])
-        gt_entropy_float = np_normal_float.entropy()
-        gt_entropy_float_np_broadcast = np_normal_float_np_broadcast.entropy()
-        gt_entropy = np_normal.entropy()
-        gt_lp_float_np_broadcast = np_normal_float_np_broadcast.log_prob(
-            values_np)
-        gt_lp = np_normal.log_prob(values_np)
-        gt_kl_float = np_normal_float.kl_divergence(np_other_normal_float)
-        gt_kl_float_np_broadcast = np_normal_float_np_broadcast.kl_divergence(
-            np_other_normal_float_np_broadcast)
-        gt_kl = np_normal.kl_divergence(np_other_normal)
-
-        [
-            output_sample_float, output_sample_float_np_broadcast,
-            output_sample_np, output_sample_variable, output_entropy_float,
-            output_entropy_float_np_broadcast, output_entropy_np,
-            output_entropy_variable, output_lp_float_np_broadcast, output_lp_np,
-            output_lp_variable, output_kl_float, output_kl_float_np_broadcast,
-            output_kl_np, output_kl_variable
-        ] = self.executor.run(program=test_program,
-                              feed=feed_vars,
-                              fetch_list=fetch_list)
-
-        np.testing.assert_allclose(
-            output_sample_float.shape,
-            gt_sample_float.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_float_np_broadcast.shape,
-            gt_sample_float_np_broadcast.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_np.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_variable.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_float,
-            gt_entropy_float,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_float_np_broadcast,
-            gt_entropy_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_float_np_broadcast,
-            gt_lp_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_float, gt_kl_float, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_float_np_broadcast,
-            gt_kl_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_np, gt_kl, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_variable, gt_kl, rtol=tolerance, atol=tolerance)
-
-    def build_uniform_program(self, test_program, batch_size, dims, low_float,
-                              high_float, high_np, low_np, values_np):
-        with fluid.program_guard(test_program):
-            low = layers.data(name='low', shape=[dims], dtype='float32')
-            high = layers.data(name='high', shape=[dims], dtype='float32')
-
-            values = layers.data(name='values', shape=[dims], dtype='float32')
-
-            uniform_float = Uniform(low_float, high_float)
-            uniform_float_np_broadcast = Uniform(low_float, high_np)
-            uniform_np = Uniform(low_np, high_np)
-            uniform_variable = Uniform(low, high)
-
-            sample_float = uniform_float.sample([batch_size, dims])
-            sample_float_np_broadcast = uniform_float_np_broadcast.sample(
-                [batch_size, dims])
-            sample_np = uniform_np.sample([batch_size, dims])
-            sample_variable = uniform_variable.sample([batch_size, dims])
-
-            entropy_float = uniform_float.entropy()
-            entropy_float_np_broadcast = uniform_float_np_broadcast.entropy()
-            entropy_np = uniform_np.entropy()
-            entropy_variable = uniform_variable.entropy()
-
-            lp_float_np_broadcast = uniform_float_np_broadcast.log_prob(values)
-            lp_np = uniform_np.log_prob(values)
-            lp_variable = uniform_variable.log_prob(values)
-
-        fetch_list = [
-            sample_float, sample_float_np_broadcast, sample_np, sample_variable,
-            entropy_float, entropy_float_np_broadcast, entropy_np,
-            entropy_variable, lp_float_np_broadcast, lp_np, lp_variable
-        ]
-        feed_vars = {'low': low_np, 'high': high_np, 'values': values_np}
-        return feed_vars, fetch_list
-
-    def test_uniform_distribution(self, batch_size=2, dims=3, tolerance=1e-6):
-        test_program = fluid.Program()
-
-        low_np = np.random.randn(batch_size, dims).astype('float32')
-        low_float = np.random.uniform(-2, 1)
-        high_float = np.random.uniform(1, 3)
-        high_np = np.random.uniform(-5.0, 5.0,
-                                    (batch_size, dims)).astype('float32')
-        values_np = np.random.randn(batch_size, dims).astype('float32')
-
-        feed_vars, fetch_list = self.build_uniform_program(
-            test_program, batch_size, dims, low_float, high_float, high_np,
-            low_np, values_np)
-
-        self.executor.run(fluid.default_startup_program())
-
-        np_uniform_float = UniformNumpy(low_float, high_float)
-        np_uniform_float_np_broadcast = UniformNumpy(low_float, high_np)
-        np_uniform = UniformNumpy(low_np, high_np)
-
-        gt_sample_float = np_uniform_float.sample([batch_size, dims])
-        gt_sample_float_np_broadcast = np_uniform_float_np_broadcast.sample(
-            [batch_size, dims])
-        gt_sample_np = np_uniform.sample([batch_size, dims])
-        gt_entropy_float = np_uniform_float.entropy()
-        gt_entropy_float_np_broadcast = np_uniform_float_np_broadcast.entropy()
-        gt_entropy = np_uniform.entropy()
-        gt_lp_float_np_broadcast = np_uniform_float_np_broadcast.log_prob(
-            values_np)
-        gt_lp = np_uniform.log_prob(values_np)
-
-        # result calculated by paddle
-        [
-            output_sample_float, output_sample_float_np_broadcast,
-            output_sample_np, output_sample_variable, output_entropy_float,
-            output_entropy_float_np_broadcast, output_entropy_np,
-            output_entropy_variable, output_lp_float_np_broadcast, output_lp_np,
-            output_lp_variable
-        ] = self.executor.run(program=test_program,
-                              feed=feed_vars,
-                              fetch_list=fetch_list)
-
-        np.testing.assert_allclose(
-            output_sample_float.shape,
-            gt_sample_float.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_float_np_broadcast.shape,
-            gt_sample_float_np_broadcast.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_np.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_variable.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_float,
-            gt_entropy_float,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_float_np_broadcast,
-            gt_entropy_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_float_np_broadcast,
-            gt_lp_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
-
-    def test_categorical_distribution(self,
-                                      batch_size=2,
-                                      dims=3,
-                                      tolerance=1e-6):
-        test_program = fluid.Program()
-
-        logits_np = np.random.randn(batch_size, dims).astype('float32')
-        other_logits_np = np.random.randn(batch_size, dims).astype('float32')
-
-        with fluid.program_guard(test_program):
-            logits = layers.data(name='logits', shape=[dims], dtype='float32')
-            other_logits = layers.data(
-                name='other_logits', shape=[dims], dtype='float32')
-
-            categorical_np = Categorical(logits_np)
-            other_categorical_np = Categorical(other_logits_np)
-
-            entropy_np = categorical_np.entropy()
-            kl_np = categorical_np.kl_divergence(other_categorical_np)
-
-        self.executor.run(fluid.default_main_program())
-
-        np_categorical = CategoricalNumpy(logits_np)
-        np_other_categorical = CategoricalNumpy(other_logits_np)
-        gt_entropy_np = np_categorical.entropy()
-        gt_kl_np = np_categorical.kl_divergence(np_other_categorical)
-
-        # result calculated by paddle
-        [output_entropy_np,
-         output_kl_np] = self.executor.run(program=test_program,
-                                           feed={'logits': logits_np},
-                                           fetch_list=[entropy_np, kl_np])
-        np.testing.assert_allclose(
-            output_entropy_np, gt_entropy_np, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_np, gt_kl_np, rtol=tolerance, atol=tolerance)
-
-    def test_multivariateNormalDiag_distribution(self,
-                                                 batch_size=2,
-                                                 tolerance=1e-6):
-        test_program = fluid.Program()
-
-        loc_np = np.random.random(batch_size, ).astype('float32')
-        scale_np = np.diag(np.random.random(batch_size, )).astype('float32')
-        other_loc_np = np.random.random(batch_size, ).astype('float32')
-        other_scale_np = np.diag(np.random.random(batch_size, )).astype(
-            'float32')
-
-        with fluid.program_guard(test_program):
-            loc = layers.data(
-                name='loc',
-                shape=[batch_size, ],
-                dtype='float32',
-                append_batch_size=False)
-            scale = layers.data(
-                name='scale',
-                shape=[batch_size, batch_size],
-                dtype='float32',
-                append_batch_size=False)
-            other_loc = layers.data(
-                name='other_loc',
-                shape=[batch_size, ],
-                dtype='float32',
-                append_batch_size=False)
-            other_scale = layers.data(
-                name='other_scale',
-                shape=[batch_size, batch_size],
-                dtype='float32',
-                append_batch_size=False)
-
-            multivariate_np = MultivariateNormalDiag(loc, scale)
-            other_multivariate_np = MultivariateNormalDiag(other_loc,
-                                                           other_scale)
-
-            entropy_np = multivariate_np.entropy()
-            other_entropy_np = other_multivariate_np.entropy()
-            kl_np = multivariate_np.kl_divergence(other_multivariate_np)
-
-        self.executor.run(fluid.default_main_program())
-
-        np_multivariate = MultivariateNormalDiagNumpy(loc_np, scale_np)
-        np_other_multivariate = MultivariateNormalDiagNumpy(other_loc_np,
-                                                            other_scale_np)
-        gt_entropy_np = np_multivariate.entropy()
-        gt_kl_np = np_multivariate.kl_divergence(np_other_multivariate)
-
-        # result calculated by paddle
-        [output_entropy_np,
-         output_kl_np] = self.executor.run(program=test_program,
-                                           feed={
-                                               'loc': loc_np,
-                                               'scale': scale_np,
-                                               'other_loc': other_loc_np,
-                                               'other_scale': other_scale_np
-                                           },
-                                           fetch_list=[entropy_np, kl_np])
-        np.testing.assert_allclose(
-            output_entropy_np, gt_entropy_np, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_np, gt_kl_np, rtol=tolerance, atol=tolerance)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_downpoursgd.py b/python/paddle/fluid/tests/unittests/test_downpoursgd.py
deleted file mode 100644
index d1b54d5f22a3c322f874f8907bc0cf3aac13691a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_downpoursgd.py
+++ /dev/null
@@ -1,150 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import os
-import signal
-import subprocess
-import time
-import unittest
-import sys
-from op_test import OpTest
-from paddle.fluid.trainer_desc import DistMultiTrainer
-from paddle.fluid.device_worker import DownpourSGD
-from google.protobuf import text_format
-import paddle.fluid.incubate.fleet.parameter_server.pslib.ps_pb2 as pslib
-
-
-class TestListenAndServOp(OpTest):
-    def setUp(self):
-        pass
-
-    def test_device_work_use_cvm(self):
-        if sys.platform == 'win32' or sys.platform == 'sys.platform':
-            pass
-        else:
-            print(sys.platform)
-            cmd = "wget --no-check-certificate https://pslib.bj.bcebos.com/fleet_desc.prototxt"
-            os.system(cmd)
-            x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-            x_emb = fluid.layers.embedding(
-                input=x, size=[1, 2], is_distributed=True)
-            y_predict = fluid.layers.fc(input=x_emb, size=1, act=None)
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
-
-            ps_param = pslib.PSParameter()
-            with open("fleet_desc.prototxt") as f:
-                text_format.Merge(f.read(), ps_param)
-            fleet_desc = ps_param
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(fluid.default_startup_program())
-
-            opt_info = {}
-            main_program = fluid.default_main_program()
-            program_id = str(id(avg_cost.block.program))
-            program_configs = {}
-            program_configs[program_id] = {
-                "pull_sparse": [0],
-                "push_sparse": [0]
-            }
-            program_configs[program_id]["pull_dense"] = [1]
-            program_configs[program_id]["push_dense"] = [1]
-
-            worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
-            opt_info["program_configs"] = program_configs
-            opt_info["trainer"] = "DistMultiTrainer"
-            opt_info["device_worker"] = "DownpourSGD"
-            opt_info["optimizer"] = "DownpourSGD"
-            opt_info["fleet_desc"] = ps_param
-            opt_info["worker_skipped_ops"] = worker_skipped_ops
-            opt_info["use_cvm"] = True
-            opt_info["scale_datanorm"] = -1
-            opt_info["dump_slot"] = False
-            opt_info["stat_var_names"] = []
-
-            main_program._fleet_opt = opt_info
-            trainer = DistMultiTrainer()
-            trainer._set_program(main_program)
-            device_worker = DownpourSGD()
-            device_worker._set_fleet_desc(fleet_desc)
-            trainer._set_device_worker(device_worker)
-            trainer._set_fleet_desc(fleet_desc)
-            trainer._gen_trainer_desc()
-            cmd = "rm fleet_desc.prototxt*"
-            os.system(cmd)
-
-    def test_device_work(self):
-        if sys.platform == 'win32' or sys.platform == 'sys.platform':
-            pass
-        else:
-            print(sys.platform)
-            cmd = "wget --no-check-certificate https://pslib.bj.bcebos.com/fleet_desc.prototxt"
-            os.system(cmd)
-            x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-            x_emb = fluid.layers.embedding(
-                input=x, size=[1, 2], is_distributed=True)
-            y_predict = fluid.layers.fc(input=x_emb, size=1, act=None)
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
-
-            ps_param = pslib.PSParameter()
-            with open("fleet_desc.prototxt") as f:
-                text_format.Merge(f.read(), ps_param)
-            fleet_desc = ps_param
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(fluid.default_startup_program())
-
-            opt_info = {}
-            main_program = fluid.default_main_program()
-            program_id = str(id(avg_cost.block.program))
-            program_configs = {}
-            program_configs[program_id] = {
-                "pull_sparse": [0],
-                "push_sparse": [0]
-            }
-            program_configs[program_id]["pull_dense"] = [1]
-            program_configs[program_id]["push_dense"] = [1]
-
-            worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
-            opt_info["program_configs"] = program_configs
-            opt_info["trainer"] = "DistMultiTrainer"
-            opt_info["device_worker"] = "DownpourSGD"
-            opt_info["optimizer"] = "DownpourSGD"
-            opt_info["fleet_desc"] = ps_param
-            opt_info["worker_skipped_ops"] = worker_skipped_ops
-            opt_info["use_cvm"] = False
-            opt_info["scale_datanorm"] = -1
-            opt_info["dump_slot"] = False
-            opt_info["stat_var_names"] = []
-
-            main_program._fleet_opt = opt_info
-            trainer = DistMultiTrainer()
-            trainer._set_program(main_program)
-            device_worker = DownpourSGD()
-            device_worker._set_fleet_desc(fleet_desc)
-            trainer._set_device_worker(device_worker)
-            trainer._set_fleet_desc(fleet_desc)
-            trainer._gen_trainer_desc()
-            cmd = "rm fleet_desc.prototxt*"
-            os.system(cmd)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dpsgd_op.py b/python/paddle/fluid/tests/unittests/test_dpsgd_op.py
deleted file mode 100644
index 48bf786e139dd493fcb3ed6122b4d617a5f5bf2f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dpsgd_op.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestDpsgdOp(OpTest):
-    def setUp(self):
-        '''Test Dpsgd Operator with supplied attributes
-        '''
-        self.op_type = "dpsgd"
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-
-        learning_rate = 0.001
-        clip = 10000.0
-        batch_size = 16.0
-        sigma = 0.0
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'LearningRate': np.array([learning_rate]).astype("float32")
-        }
-
-        self.attrs = {'clip': clip, 'batch_size': batch_size, 'sigma': sigma}
-
-        param_out = dpsgd_step(self.inputs, self.attrs)
-
-        self.outputs = {'ParamOut': param_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-def dpsgd_step(inputs, attributes):
-    '''
-    Simulate one step of the dpsgd optimizer
-    :param inputs: dict of inputs
-    :param attributes: dict of attributes
-    :return tuple: tuple of output param, moment, inf_norm and
-    beta1 power accumulator
-    '''
-    param = inputs['Param']
-    grad = inputs['Grad']
-    lr = inputs['LearningRate']
-
-    clip = attributes['clip']
-    batch_size = attributes['batch_size']
-    sigma = attributes['sigma']
-
-    param_out = param - lr * grad
-
-    return param_out
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
deleted file mode 100644
index 59918a7bb21c42359f7d6c4f6109ca4b1cdc4449..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ /dev/null
@@ -1,184 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-class TestDropoutOp(OpTest):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
-        self.outputs = {
-            'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64)).astype('uint8')
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.05)
-
-
-class TestDropoutOp2(TestDropoutOp):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
-        self.outputs = {
-            'Out': np.zeros((32, 64)).astype('float32'),
-            'Mask': np.zeros((32, 64)).astype('uint8')
-        }
-
-
-class TestDropoutOp3(TestDropoutOp):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
-        self.outputs = {
-            'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64, 2)).astype('uint8')
-        }
-
-
-class TestDropoutOp4(OpTest):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True}
-        self.outputs = {
-            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestDropoutOp5(OpTest):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.75, 'is_test': True}
-        self.outputs = {
-            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestDropoutOp6(TestDropoutOp):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {
-            'dropout_prob': 1.0,
-            'fix_seed': True,
-            'is_test': False,
-            'dropout_implementation': 'upscale_in_train'
-        }
-        self.outputs = {
-            'Out': np.zeros((32, 64)).astype('float32'),
-            'Mask': np.zeros((32, 64)).astype('uint8')
-        }
-
-
-class TestDropoutOp7(TestDropoutOp):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
-        self.attrs = {
-            'dropout_prob': 0.0,
-            'fix_seed': True,
-            'is_test': False,
-            'dropout_implementation': 'upscale_in_train'
-        }
-        self.outputs = {
-            'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64, 2)).astype('uint8')
-        }
-
-
-class TestDropoutOp8(OpTest):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {
-            'dropout_prob': 0.35,
-            'fix_seed': True,
-            'is_test': True,
-            'dropout_implementation': 'upscale_in_train'
-        }
-        self.outputs = {'Out': self.inputs['X']}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestDropoutOp9(OpTest):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
-        self.attrs = {
-            'dropout_prob': 0.75,
-            'is_test': True,
-            'dropout_implementation': 'upscale_in_train'
-        }
-        self.outputs = {'Out': self.inputs['X']}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFP16DropoutOp(OpTest):
-    def setUp(self):
-        self.op_type = "dropout"
-        self.init_test_case()
-
-        x = np.random.random(self.input_size).astype("float16")
-        out = x * (1.0 - self.prob)
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {
-            'dropout_prob': self.prob,
-            'fix_seed': self.fix_seed,
-            'is_test': True
-        }
-        self.outputs = {'Out': out}
-
-    def init_test_case(self):
-        self.input_size = [32, 64]
-        self.prob = 0.35
-        self.fix_seed = True
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):
-            self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
-
-
-class TestFP16DropoutOp2(TestFP16DropoutOp):
-    def init_test_case(self):
-        self.input_size = [32, 64, 3]
-        self.prob = 0.75
-        self.fix_seed = False
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py
deleted file mode 100644
index 1c72a41411e0bc23973ae4c69bcd75a192cce5ad..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
-
-
-class SimpleImgConvPool(fluid.dygraph.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 pool_size,
-                 pool_stride,
-                 pool_padding=0,
-                 pool_type='max',
-                 global_pooling=False,
-                 conv_stride=1,
-                 conv_padding=0,
-                 conv_dilation=1,
-                 conv_groups=1,
-                 act=None,
-                 use_cudnn=False,
-                 dtype='float32',
-                 param_attr=None,
-                 bias_attr=None):
-        super(SimpleImgConvPool, self).__init__(name_scope)
-
-        self._conv2d = Conv2D(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            use_cudnn=use_cudnn,
-            dtype=dtype,
-            act=act)
-
-        self._pool2d = Pool2D(
-            self.full_name(),
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
-
-    def forward(self, inputs):
-        x = self._conv2d(inputs)
-        x = self._pool2d(x)
-        return x
-
-
-class MNIST(fluid.dygraph.Layer):
-    def __init__(self, name_scope, dtype="float32"):
-        super(MNIST, self).__init__(name_scope)
-
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(),
-            num_filters=20,
-            filter_size=5,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            dtype=dtype,
-            use_cudnn=True)
-
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(),
-            num_filters=50,
-            filter_size=5,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            dtype=dtype,
-            use_cudnn=True)
-
-        pool_2_shape = 50 * 4 * 4
-        SIZE = 10
-        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(self.full_name(),
-                      10,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.NormalInitializer(
-                              loc=0.0, scale=scale)),
-                      act="softmax",
-                      dtype=dtype)
-
-    def forward(self, inputs, label):
-        x = self._simple_img_conv_pool_1(inputs)
-        x = self._simple_img_conv_pool_2(x)
-        cost = self._fc(x)
-        loss = fluid.layers.cross_entropy(cost, label)
-        avg_loss = fluid.layers.mean(loss)
-        return avg_loss
-
-
-class TestMnist(unittest.TestCase):
-    # FIXME(zcd): disable this random failed test temporally.
-    @unittest.skip("should fix this later")
-    def test_mnist_fp16(self):
-        if not fluid.is_compiled_with_cuda():
-            return
-        x = np.random.randn(1, 3, 224, 224).astype("float16")
-        y = np.random.randn(1, 1).astype("int64")
-        with fluid.dygraph.guard(fluid.CUDAPlace(0)):
-            model = MNIST("mnist", dtype="float16")
-            x = fluid.dygraph.to_variable(x)
-            y = fluid.dygraph.to_variable(y)
-            loss = model(x, y)
-            print(loss.numpy())
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
deleted file mode 100644
index f473c435e59825486afe1669858971fcb772179e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import contextlib
-import unittest
-import numpy as np
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-
-
-class SimpleImgConvPool(fluid.dygraph.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 pool_size,
-                 pool_stride,
-                 pool_padding=0,
-                 pool_type='max',
-                 global_pooling=False,
-                 conv_stride=1,
-                 conv_padding=0,
-                 conv_dilation=1,
-                 conv_groups=1,
-                 act=None,
-                 use_cudnn=False,
-                 param_attr=None,
-                 bias_attr=None):
-        super(SimpleImgConvPool, self).__init__(name_scope)
-
-        self._conv2d = Conv2D(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=None,
-            bias_attr=None,
-            use_cudnn=use_cudnn)
-
-        self._pool2d = Pool2D(
-            self.full_name(),
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
-
-    def forward(self, inputs):
-        x = self._conv2d(inputs)
-        x = self._pool2d(x)
-        return x
-
-
-class MNIST(fluid.dygraph.Layer):
-    def __init__(self, name_scope):
-        super(MNIST, self).__init__(name_scope)
-
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 1, 20, 5, 2, 2, act="relu")
-
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 20, 50, 5, 2, 2, act="relu")
-
-        pool_2_shape = 50 * 4 * 4
-        SIZE = 10
-        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(self.full_name(),
-                      10,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.NormalInitializer(
-                              loc=0.0, scale=scale)),
-                      act="softmax")
-
-    def forward(self, inputs):
-        x = self._simple_img_conv_pool_1(inputs)
-        x = self._simple_img_conv_pool_2(x)
-        x = self._fc(x)
-        return x
-
-
-class TestDygraphMultiForward(unittest.TestCase):
-    def test_mnist_forward_float32(self):
-        seed = 90
-        epoch_num = 1
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            mnist = MNIST("mnist")
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
-
-            dy_param_init_value = {}
-            mnist.eval()
-            for epoch in range(epoch_num):
-                for batch_id, data in enumerate(train_reader()):
-                    dy_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(128, 1)
-
-                    img = to_variable(dy_x_data)
-                    label = to_variable(y_data)
-                    label.stop_gradient = True
-
-                    cost = mnist(img)
-                    loss = fluid.layers.cross_entropy(cost, label)
-                    avg_loss = fluid.layers.mean(loss)
-
-                    dy_out = avg_loss.numpy()
-
-                    if epoch == 0 and batch_id == 0:
-                        for param in mnist.parameters():
-                            dy_param_init_value[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            mnist = MNIST("mnist")
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
-
-            img = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            cost = mnist(img)
-            loss = fluid.layers.cross_entropy(cost, label)
-            avg_loss = fluid.layers.mean(loss)
-
-            # initialize params and fetch them
-            static_param_init_value = {}
-            static_param_name_list = []
-            for param in mnist.parameters():
-                static_param_name_list.append(param.name)
-
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-
-            for i in range(len(static_param_name_list)):
-                static_param_init_value[static_param_name_list[i]] = out[i]
-
-            for epoch in range(epoch_num):
-                for batch_id, data in enumerate(train_reader()):
-                    static_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape([128, 1])
-
-                    fetch_list = [avg_loss.name]
-                    out = exe.run(
-                        fluid.default_main_program(),
-                        feed={"pixel": static_x_data,
-                              "label": y_data},
-                        fetch_list=fetch_list)
-
-                    static_out = out[0]
-
-        self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
-
-        for key, value in six.iteritems(static_param_init_value):
-            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
-
-        self.assertTrue(np.allclose(static_out, dy_out))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
deleted file mode 100644
index 4b0195d307dc83f77ff04e89544d7bc751b8c011..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ /dev/null
@@ -1,287 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import paddle
-import unittest
-import numpy
-
-from paddle.fluid.layers.control_flow import lod_rank_table
-from paddle.fluid.layers.control_flow import max_sequence_len
-from paddle.fluid.layers.control_flow import lod_tensor_to_array
-from paddle.fluid.layers.control_flow import array_to_lod_tensor
-from paddle.fluid.layers.control_flow import shrink_memory
-from fake_reader import fake_imdb_reader
-
-
-class TestDynRNN(unittest.TestCase):
-    def setUp(self):
-        self.word_dict_len = 5147
-        self.BATCH_SIZE = 2
-        reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
-        self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)
-
-    def test_plain_while_op(self):
-        main_program = fluid.Program()
-        startup_program = fluid.Program()
-
-        with fluid.program_guard(main_program, startup_program):
-            sentence = fluid.layers.data(
-                name='word', shape=[1], dtype='int64', lod_level=1)
-            sent_emb = fluid.layers.embedding(
-                input=sentence, size=[self.word_dict_len, 32], dtype='float32')
-
-            label = fluid.layers.data(name='label', shape=[1], dtype='float32')
-
-            rank_table = lod_rank_table(x=sent_emb)
-
-            sent_emb_array = lod_tensor_to_array(x=sent_emb, table=rank_table)
-
-            seq_len = max_sequence_len(rank_table=rank_table)
-            i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
-            i.stop_gradient = False
-
-            boot_mem = fluid.layers.fill_constant_batch_size_like(
-                input=fluid.layers.array_read(
-                    array=sent_emb_array, i=i),
-                value=0,
-                shape=[-1, 100],
-                dtype='float32')
-            boot_mem.stop_gradient = False
-
-            mem_array = fluid.layers.array_write(x=boot_mem, i=i)
-
-            cond = fluid.layers.less_than(x=i, y=seq_len)
-            cond.stop_gradient = False
-            while_op = fluid.layers.While(cond=cond)
-            out = fluid.layers.create_array(dtype='float32')
-
-            with while_op.block():
-                mem = fluid.layers.array_read(array=mem_array, i=i)
-                ipt = fluid.layers.array_read(array=sent_emb_array, i=i)
-
-                mem = shrink_memory(x=mem, i=i, table=rank_table)
-
-                hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh')
-
-                fluid.layers.array_write(x=hidden, i=i, array=out)
-                fluid.layers.increment(x=i, in_place=True)
-                fluid.layers.array_write(x=hidden, i=i, array=mem_array)
-                fluid.layers.less_than(x=i, y=seq_len, cond=cond)
-
-            all_timesteps = array_to_lod_tensor(x=out, table=rank_table)
-            last = fluid.layers.sequence_last_step(input=all_timesteps)
-            logits = fluid.layers.fc(input=last, size=1, act=None)
-            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=logits, label=label)
-            loss = fluid.layers.mean(loss)
-            sgd = fluid.optimizer.SGD(1e-4)
-            sgd.minimize(loss=loss)
-        cpu = fluid.CPUPlace()
-        exe = fluid.Executor(cpu)
-        exe.run(startup_program)
-        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
-
-        data = next(self.train_data())
-        val = exe.run(main_program, feed=feeder.feed(data),
-                      fetch_list=[loss])[0]
-        self.assertEqual((1, ), val.shape)
-        print(val)
-        self.assertFalse(numpy.isnan(val))
-
-    def test_train_dyn_rnn(self):
-        main_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(main_program, startup_program):
-            sentence = fluid.layers.data(
-                name='word', shape=[1], dtype='int64', lod_level=1)
-            sent_emb = fluid.layers.embedding(
-                input=sentence, size=[self.word_dict_len, 32], dtype='float32')
-
-            rnn = fluid.layers.DynamicRNN()
-
-            with rnn.block():
-                in_ = rnn.step_input(sent_emb)
-                mem = rnn.memory(shape=[100], dtype='float32')
-                out_ = fluid.layers.fc(input=[in_, mem], size=100, act='tanh')
-                rnn.update_memory(mem, out_)
-                rnn.output(out_)
-
-            last = fluid.layers.sequence_last_step(input=rnn())
-            logits = fluid.layers.fc(input=last, size=1, act=None)
-            label = fluid.layers.data(name='label', shape=[1], dtype='float32')
-            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=logits, label=label)
-            loss = fluid.layers.mean(loss)
-            sgd = fluid.optimizer.Adam(1e-3)
-            sgd.minimize(loss=loss)
-
-        cpu = fluid.CPUPlace()
-        exe = fluid.Executor(cpu)
-        exe.run(startup_program)
-        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
-        data = next(self.train_data())
-        loss_0 = exe.run(main_program,
-                         feed=feeder.feed(data),
-                         fetch_list=[loss])[0]
-        for _ in range(100):
-            val = exe.run(main_program,
-                          feed=feeder.feed(data),
-                          fetch_list=[loss])[0]
-        # loss should be small after 100 mini-batch
-        self.assertLess(val[0], loss_0[0])
-
-    # this unit test is just used to the two layer nested dyn_rnn.
-    def test_train_nested_dyn_rnn(self):
-        word_dict = [i for i in range(30)]
-
-        def fake_reader():
-            seq_len, label = [[2, 2]], [0, 1]
-            data = []
-            for ele in seq_len:
-                for j in ele:
-                    data.append([numpy.random.randint(30) \
-                                 for _ in range(j)])
-
-            while True:
-                yield data, label
-
-        train_data = paddle.batch(fake_reader, batch_size=2)
-
-        main_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(main_program, startup_program):
-            sentence = fluid.layers.data(
-                name='word', shape=[1], dtype='int64', lod_level=2)
-            label = fluid.layers.data(
-                name='label', shape=[1], dtype='float32', lod_level=1)
-
-            rnn = fluid.layers.DynamicRNN()
-            with rnn.block():
-                in_ = rnn.step_input(sentence)
-                assert in_.lod_level == 1, "the lod level of in_ should be 1"
-                sent_emb = fluid.layers.embedding(
-                    input=in_, size=[len(word_dict), 32], dtype='float32')
-                out_ = fluid.layers.fc(input=sent_emb, size=100, act='tanh')
-
-                rnn1 = fluid.layers.DynamicRNN()
-                with rnn1.block():
-                    in_1 = rnn1.step_input(out_)
-                    assert in_1.lod_level == 0, "the lod level of in_1 should be 0"
-                    out_1 = fluid.layers.fc(input=[in_1], size=100, act='tanh')
-                    rnn1.output(out_1)
-
-                last = fluid.layers.sequence_last_step(input=rnn1())
-                rnn.output(last)
-
-            last = rnn()
-            logits = fluid.layers.fc(input=last, size=1, act=None)
-            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=logits, label=label)
-            loss = fluid.layers.mean(loss)
-            sgd = fluid.optimizer.SGD(1e-3)
-            #sgd = fluid.optimizer.Adam(1e-3)
-            sgd.minimize(loss=loss)
-
-        cpu = fluid.CPUPlace()
-        exe = fluid.Executor(cpu)
-        exe.run(startup_program)
-        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
-        data = next(train_data())
-        val = exe.run(main_program, feed=feeder.feed(data),
-                      fetch_list=[loss])[0]
-
-        for _ in range(100):
-            val = exe.run(main_program,
-                          feed=feeder.feed(data),
-                          fetch_list=[loss])[0]
-            print(val)
-
-    # this unit test is just used to the two layer nested dyn_rnn.
-    def test_train_nested_dyn_rnn2(self):
-        word_dict = [i for i in range(30)]
-
-        def fake_reader():
-            seq_len, label = [[2, 2]], [0, 1]
-            data = []
-            for ele in seq_len:
-                for j in ele:
-                    data.append([numpy.random.randint(30) \
-                                 for _ in range(j)])
-
-            while True:
-                yield data, label
-
-        train_data = paddle.batch(fake_reader, batch_size=2)
-        hidden_size = 32
-        main_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(main_program, startup_program):
-            sentence = fluid.layers.data(
-                name='word', shape=[1], dtype='int64', lod_level=2)
-            label = fluid.layers.data(
-                name='label', shape=[1], dtype='float32', lod_level=1)
-
-            rnn = fluid.layers.DynamicRNN()
-            with rnn.block():
-                in_ = rnn.step_input(sentence)
-                sent_emb = fluid.layers.embedding(
-                    input=in_,
-                    size=[len(word_dict), hidden_size],
-                    dtype='float32')
-                input_forward_proj = fluid.layers.fc(input=sent_emb,
-                                                     size=hidden_size * 4,
-                                                     act=None,
-                                                     bias_attr=False)
-                forward, _ = fluid.layers.dynamic_lstm(
-                    input=input_forward_proj,
-                    size=hidden_size * 4,
-                    use_peepholes=False)
-
-                rnn1 = fluid.layers.DynamicRNN()
-                with rnn1.block():
-                    in_1 = rnn1.step_input(forward)
-                    out_1 = fluid.layers.fc(input=[in_1], size=100, act='tanh')
-                    rnn1.output(out_1)
-
-                last = fluid.layers.sequence_last_step(input=rnn1())
-                rnn.output(last)
-
-            last = rnn()
-            logits = fluid.layers.fc(input=last, size=1, act=None)
-            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=logits, label=label)
-            loss = fluid.layers.mean(loss)
-            sgd = fluid.optimizer.SGD(1e-3)
-            #sgd = fluid.optimizer.Adam(1e-3)
-            sgd.minimize(loss=loss)
-
-        cpu = fluid.CPUPlace()
-        exe = fluid.Executor(cpu)
-        exe.run(startup_program)
-        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
-        data = next(train_data())
-        val = exe.run(main_program, feed=feeder.feed(data),
-                      fetch_list=[loss])[0]
-
-        for _ in range(100):
-            val = exe.run(main_program,
-                          feed=feeder.feed(data),
-                          fetch_list=[loss])[0]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
deleted file mode 100644
index 5328f73b31513745a4ddd51044bea7b3f59eaf5f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ /dev/null
@@ -1,379 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy
-import random
-import collections
-import paddle.fluid as fluid
-import unittest
-from decorator_helper import *
-
-
-class Memory(object):
-    def __init__(self, shape, dtype='float32'):
-        self.ex = numpy.zeros(shape=shape, dtype=dtype)
-        self.cur = None
-
-    def update(self, val):
-        assert val.shape == self.ex.shape
-        assert val.dtype == self.ex.dtype
-        self.cur = val
-
-    def next(self):
-        self.ex = self.cur
-        self.cur = None
-
-    def __next__(self):
-        self.next()
-
-    def reset(self):
-        self.ex = numpy.zeros(shape=self.ex.shape, dtype=self.ex.dtype)
-        self.cur = None
-
-
-class Output(object):
-    def __init__(self):
-        self.outs = []
-
-    def next_sequence(self):
-        self.outs.append([])
-
-    def out(self, val):
-        self.outs[-1].append(val)
-
-    def last(self):
-        return self.outs[-1][-1]
-
-
-class BaseRNN(object):
-    def __init__(self, ins, mems, params, outs, num_seq=5, max_seq_len=15):
-        self.num_seq = num_seq
-        self.inputs = collections.defaultdict(list)
-
-        for _ in range(num_seq):
-            seq_len = random.randint(1, max_seq_len - 1)
-            for iname in ins:
-                ishape = ins[iname].get('shape', None)
-                idtype = ins[iname].get('dtype', 'float32')
-                lst = []
-                for _ in range(seq_len):
-                    lst.append(numpy.random.random(size=ishape).astype(idtype))
-                self.inputs[iname].append(lst)
-
-        self.mems = dict()
-        for mname in mems:
-            mshape = mems[mname].get('shape', None)
-            mdtype = mems[mname].get('dtype', 'float32')
-            self.mems[mname] = Memory(shape=mshape, dtype=mdtype)
-
-        self.params = dict()
-        for pname in params:
-            pshape = params[pname].get('shape', None)
-            pdtype = params[pname].get('dtype', 'float32')
-            self.params[pname] = numpy.random.random(size=pshape).astype(pdtype)
-
-        self.outputs = dict()
-
-        for oname in outs:
-            self.outputs[oname] = Output()
-
-    def step(self, **kwargs):
-        raise NotImplementedError()
-
-    def exe(self):
-        retv = dict()
-        for out in self.outputs:
-            retv[out] = []
-
-        for seq_id in range(self.num_seq):
-            for mname in self.mems:
-                self.mems[mname].reset()
-            for out in self.outputs:
-                self.outputs[out].next_sequence()
-
-            iname0 = list(self.inputs.keys())[0]
-            seq_len = len(self.inputs[iname0][seq_id])
-
-            for step_id in range(seq_len):
-                xargs = dict()
-
-                for iname in self.inputs:
-                    xargs[iname] = self.inputs[iname][seq_id][step_id]
-
-                for mname in self.mems:
-                    xargs[mname] = self.mems[mname]
-
-                for pname in self.params:
-                    xargs[pname] = self.params[pname]
-
-                for out in self.outputs:
-                    xargs[out] = self.outputs[out]
-
-                self.step(**xargs)
-
-                for mname in self.mems:
-                    next(self.mems[mname])
-
-            for out in self.outputs:
-                retv[out].append(self.outputs[out].last())
-
-        for out in retv:
-            retv[out] = numpy.array(retv[out])
-        return retv
-
-    def to_feed(self, place):
-        feed_dict = dict()
-
-        for iname in self.inputs:
-            lod = []
-            np_flatten = []
-            for seq_id in range(len(self.inputs[iname])):
-                seq_len = len(self.inputs[iname][seq_id])
-                lod.append(seq_len)
-                np_flatten.extend(self.inputs[iname][seq_id])
-
-            t = fluid.Tensor()
-            t.set(numpy.array(np_flatten), place)
-            t.set_recursive_sequence_lengths([lod])
-            feed_dict[iname] = t
-
-        for pname in self.params:
-            feed_dict[pname] = self.params[pname]
-        return feed_dict
-
-    def get_numeric_gradient_of_param(self, param_name, delta=0.001):
-        p = self.params[param_name]
-        if len(p.shape) != 2:
-            raise ValueError("Not support get numeric gradient of an parameter,"
-                             " which is not matrix")
-        g = numpy.zeros(shape=p.shape, dtype=p.dtype)
-
-        for i in range(p.shape[0]):
-            for j in range(p.shape[1]):
-                o = p[i][j]
-                p[i][j] += delta
-                pos = self._exe_mean_out_()
-                p[i][j] -= 2 * delta
-                neg = self._exe_mean_out_()
-                p[i][j] = o
-                g[i][j] = (pos - neg) / (delta * 2)
-        return g
-
-    def get_numeric_gradient_of_input(self,
-                                      input_name,
-                                      delta=0.001,
-                                      return_one_tensor=True):
-        ipt = self.inputs[input_name]
-        grad = []
-
-        for seq in ipt:
-            seq_grad = []
-            for item in seq:
-                item_grad = numpy.zeros(shape=item.shape, dtype=item.dtype)
-                if len(item.shape) != 1:
-                    raise ValueError("Not support")
-
-                for i in range(len(item)):
-                    o = item[i]
-                    item[i] += delta
-                    pos = self._exe_mean_out_()
-                    item[i] -= 2 * delta
-                    neg = self._exe_mean_out_()
-                    item[i] = o
-                    item_grad[i] = (pos - neg) / (delta * 2)
-                seq_grad.append(item_grad)
-            grad.append(seq_grad)
-
-        if not return_one_tensor:
-            return grad
-
-        for i in range(len(grad)):
-            grad[i] = numpy.concatenate(grad[i])
-        grad = numpy.concatenate(grad)
-        return grad
-
-    def _exe_mean_out_(self):
-        outs = self.exe()
-        return numpy.array([o.mean() for o in outs.values()]).mean()
-
-
-class SeedFixedTestCase(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        """Fix random seeds to remove randomness from tests"""
-        cls._np_rand_state = numpy.random.get_state()
-        cls._py_rand_state = random.getstate()
-
-        numpy.random.seed(123)
-        random.seed(124)
-
-    @classmethod
-    def tearDownClass(cls):
-        """Restore random seeds"""
-        numpy.random.set_state(cls._np_rand_state)
-        random.setstate(cls._py_rand_state)
-
-
-class TestSimpleMul(SeedFixedTestCase):
-    DATA_NAME = 'X'
-    DATA_WIDTH = 32
-    PARAM_NAME = 'W'
-    HIDDEN_WIDTH = 10
-    OUT_NAME = 'Out'
-
-    class SimpleMul(BaseRNN):
-        def __init__(self):
-            base = TestSimpleMul
-            super(base.SimpleMul, self).__init__({
-                base.DATA_NAME: {
-                    'shape': [base.DATA_WIDTH]
-                }
-            }, {}, {
-                base.PARAM_NAME: {
-                    'shape': [base.DATA_WIDTH, base.HIDDEN_WIDTH]
-                }
-            }, [base.OUT_NAME])
-
-        def step(self, X, W, Out):
-            Out.out(numpy.matmul(X, W))
-
-    # Test many times in local to ensure the random seed cannot breaks CI
-    # @many_times(10)
-    @prog_scope()
-    def test_forward_backward(self):
-        py_rnn = TestSimpleMul.SimpleMul()
-        dat = fluid.layers.data(
-            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
-        dat.stop_gradient = False
-
-        rnn = fluid.layers.DynamicRNN()
-        with rnn.block():
-            d = rnn.step_input(dat)
-            o = fluid.layers.fc(input=d,
-                                param_attr=self.PARAM_NAME,
-                                bias_attr=False,
-                                size=self.HIDDEN_WIDTH,
-                                act=None)
-            rnn.output(o)
-
-        out = rnn()
-        out = fluid.layers.sequence_pool(out, pool_type='last')
-        loss = fluid.layers.mean(out)
-        fluid.backward.append_backward(loss)
-
-        cpu = fluid.CPUPlace()
-        exe = fluid.Executor(cpu)
-        out, w_g, i_g = list(
-            map(numpy.array,
-                exe.run(feed=py_rnn.to_feed(cpu),
-                        fetch_list=[
-                            out, self.PARAM_NAME + "@GRAD", self.DATA_NAME +
-                            "@GRAD"
-                        ],
-                        return_numpy=False)))
-        out_by_python = py_rnn.exe()[self.OUT_NAME]
-        self.assertTrue(numpy.allclose(out, out_by_python))
-        w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
-        self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05))
-        i_g_num = py_rnn.get_numeric_gradient_of_input(
-            input_name=self.DATA_NAME)
-        i_g_num = i_g_num.reshape(i_g.shape)
-        self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.05))
-
-
-class TestSimpleMulWithMemory(SeedFixedTestCase):
-    DATA_WIDTH = 32
-    HIDDEN_WIDTH = 20
-    DATA_NAME = 'X'
-    PARAM_NAME = 'W'
-
-    class SimpleMulWithMemory(BaseRNN):
-        def __init__(self):
-            super(TestSimpleMulWithMemory.SimpleMulWithMemory, self).__init__({
-                TestSimpleMulWithMemory.DATA_NAME: {
-                    'shape': [TestSimpleMulWithMemory.DATA_WIDTH]
-                }
-            }, {'Mem': {
-                'shape': [TestSimpleMulWithMemory.HIDDEN_WIDTH]
-            }}, {
-                TestSimpleMulWithMemory.PARAM_NAME: {
-                    'shape': [
-                        TestSimpleMulWithMemory.DATA_WIDTH,
-                        TestSimpleMulWithMemory.HIDDEN_WIDTH
-                    ]
-                }
-            }, ['Out'])
-
-        def step(self, X, Mem, W, Out):
-            o = numpy.matmul(X, W)
-            assert isinstance(Mem, Memory)
-            o += Mem.ex
-            Mem.update(o)
-            assert isinstance(Out, Output)
-            Out.out(o)
-
-    # many_times used locally for debug. Make sure the calculation is stable.
-    # @many_times(10)
-    @prog_scope()
-    def test_forward_backward(self):
-        py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory()
-        data = fluid.layers.data(
-            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
-        data.stop_gradient = False
-        rnn = fluid.layers.DynamicRNN()
-        with rnn.block():
-            d = rnn.step_input(data)
-            mem = rnn.memory(value=0.0, shape=[self.HIDDEN_WIDTH])
-            hidden = fluid.layers.fc(input=d,
-                                     size=self.HIDDEN_WIDTH,
-                                     param_attr=self.PARAM_NAME,
-                                     bias_attr=False,
-                                     act=None)
-            o = fluid.layers.elementwise_add(x=hidden, y=mem)
-            rnn.update_memory(mem, o)
-            rnn.output(o)
-
-        out = rnn()
-        last = fluid.layers.sequence_pool(input=out, pool_type='last')
-        loss = fluid.layers.mean(last)
-        fluid.backward.append_backward(loss)
-
-        cpu = fluid.CPUPlace()
-        exe = fluid.Executor(cpu)
-        feed = py_rnn.to_feed(cpu)
-        last_np, w_g, i_g = list(
-            map(numpy.array,
-                exe.run(feed=feed,
-                        fetch_list=[
-                            last, self.PARAM_NAME + "@GRAD", self.DATA_NAME +
-                            "@GRAD"
-                        ],
-                        return_numpy=False)))
-        last_by_py, = list(py_rnn.exe().values())
-        w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
-        self.assertTrue(numpy.allclose(last_np, last_by_py))
-
-        self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.1))
-        i_g_num = py_rnn.get_numeric_gradient_of_input(self.DATA_NAME)
-        i_g_num = i_g_num.reshape(i_g.shape)
-
-        # Since this RNN has many float add. The number could be not stable.
-        # rtol = 0.1
-        self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.1))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
deleted file mode 100644
index b4359fc69ae18b45774af0d2b20c1540bd99da5c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ /dev/null
@@ -1,211 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid.backward import append_backward
-import paddle.fluid.framework as framework
-from paddle.fluid.framework import Program, switch_main_program
-import bisect
-import numpy as np
-
-fluid.default_startup_program().random_seed = 1
-
-
-class TestDyRnnStaticInput(unittest.TestCase):
-    def setUp(self):
-        self._delta = 0.005
-        self._max_sequence_len = 3
-        self._program = Program()
-        switch_main_program(self._program)
-        self.output_dim = 10
-        self.place = core.CPUPlace()
-        self.prepare_x_tensor()
-        self.prepare_static_input_tensor()
-        self.exe = fluid.Executor(self.place)
-
-    def prepare_x_tensor(self):
-        self.x_tensor_dim = 10
-        lod = [[2, 1, 3]]
-        shape = [sum(lod[0]), self.x_tensor_dim]
-        self.x_tensor_data = np.random.random(shape).astype('float32')
-        self.x_tensor = core.LoDTensor()
-        self.x_tensor.set_recursive_sequence_lengths(lod)
-        self.x_tensor.set(self.x_tensor_data, self.place)
-
-    def prepare_static_input_tensor(self):
-        self.static_input_tensor_dim = 4
-        lod = [[1, 2, 3]]
-        shape = [sum(lod[0]), self.static_input_tensor_dim]
-        self.static_input_data = np.random.random(shape).astype('float32')
-        self.static_input_tensor = core.LoDTensor()
-        self.static_input_tensor.set_recursive_sequence_lengths(lod)
-        self.static_input_tensor.set(self.static_input_data, self.place)
-
-    def fetch_value(self, var):
-        fetch_outs = self.exe.run(feed={
-            'x_tensor': self.x_tensor,
-            'static_input_tensor': self.static_input_tensor
-        },
-                                  fetch_list=[var],
-                                  return_numpy=False)
-        return self._lodtensor_to_ndarray(fetch_outs[0])
-
-    def _lodtensor_to_ndarray(self, lod_tensor):
-        dims = lod_tensor.shape()
-        ndarray = np.zeros(shape=dims).astype('float32')
-        for i in range(np.product(dims)):
-            ndarray.ravel()[i] = lod_tensor._get_float_element(i)
-        return ndarray, lod_tensor.recursive_sequence_lengths()
-
-    def build_graph(self, only_forward=False):
-        x_tensor = fluid.layers.data(
-            name='x_tensor',
-            shape=[self.x_tensor_dim],
-            dtype='float32',
-            lod_level=1)
-        x_tensor.stop_gradient = False
-
-        static_input_tensor = fluid.layers.data(
-            name='static_input_tensor',
-            shape=[self.static_input_tensor_dim],
-            dtype='float32',
-            lod_level=1)
-        static_input_tensor.stop_gradient = False
-
-        if only_forward:
-            static_input_out_array = self._program.global_block().create_var(
-                name='static_input_out_array',
-                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                dtype='float32')
-            static_input_out_array.stop_gradient = True
-
-        rnn = fluid.layers.DynamicRNN()
-        with rnn.block():
-            step_x = rnn.step_input(x_tensor)
-            step_static_input = rnn.static_input(static_input_tensor)
-            if only_forward:
-                fluid.layers.array_write(
-                    x=step_static_input,
-                    i=rnn.step_idx,
-                    array=static_input_out_array)
-            last = fluid.layers.sequence_pool(
-                input=step_static_input, pool_type='last')
-            projected = fluid.layers.fc(input=[step_x, last],
-                                        size=self.output_dim)
-            rnn.output(projected)
-
-        if only_forward:
-            static_input_step_outs = []
-            step_idx = fluid.layers.fill_constant(
-                shape=[1], dtype='int64', value=0)
-            step_idx.stop_gradient = True
-
-            for i in range(self._max_sequence_len):
-                step_out = fluid.layers.array_read(static_input_out_array,
-                                                   step_idx)
-                step_out.stop_gradient = True
-                static_input_step_outs.append(step_out)
-                fluid.layers.increment(x=step_idx, value=1.0, in_place=True)
-
-        if only_forward:
-            return static_input_step_outs
-
-        last = fluid.layers.sequence_pool(input=rnn(), pool_type='last')
-        loss = fluid.layers.mean(last)
-        append_backward(loss)
-        static_input_grad = self._program.global_block().var(
-            framework.grad_var_name('static_input_tensor'))
-        return static_input_grad, loss
-
-    def get_expected_static_step_outs(self):
-        x_lod = self.x_tensor.recursive_sequence_lengths()
-        x_seq_len = x_lod[0]
-        x_seq_len_sorted = sorted(x_seq_len)
-        x_sorted_indices = np.argsort(x_seq_len)[::-1]
-
-        static_lod = self.static_input_tensor.recursive_sequence_lengths()
-        static_sliced = []
-        cur_offset = 0
-        for i in range(len(static_lod[0])):
-            static_sliced.append(self.static_input_data[cur_offset:(
-                cur_offset + static_lod[0][i])])
-            cur_offset += static_lod[0][i]
-        static_seq_len = static_lod[0]
-        static_reordered = []
-        for i in range(len(x_sorted_indices)):
-            static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist())
-        static_seq_len_reordered = [
-            static_seq_len[x_sorted_indices[i]]
-            for i in range(len(x_sorted_indices))
-        ]
-
-        static_step_outs = []
-        static_step_lods = []
-
-        for i in range(self._max_sequence_len):
-            end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1)
-            lod = []
-            total_len = 0
-            for i in range(end):
-                lod.append(static_seq_len_reordered[i])
-                total_len += lod[-1]
-            static_step_lods.append([lod])
-            end = total_len
-            static_step_outs.append(
-                np.array(static_reordered[:end]).astype('float32'))
-
-        return static_step_outs, static_step_lods
-
-    def test_step_out(self):
-        static_step_outs = self.build_graph(only_forward=True)
-        self.exe.run(framework.default_startup_program())
-        expected_outs, expected_lods = self.get_expected_static_step_outs()
-        for i in range(self._max_sequence_len):
-            step_out, lod = self.fetch_value(static_step_outs[i])
-            self.assertTrue(np.allclose(step_out, expected_outs[i]))
-            self.assertTrue(np.allclose(lod, expected_lods[i]))
-
-    def test_network_gradient(self):
-        static_input_grad, loss = self.build_graph()
-        self.exe.run(framework.default_startup_program())
-
-        actual_gradients, actual_lod = self.fetch_value(static_input_grad)
-
-        static_input_shape = self.static_input_tensor.shape()
-        numeric_gradients = np.zeros(shape=static_input_shape).astype('float32')
-        # calculate numeric gradients
-        tensor_size = np.product(static_input_shape)
-        for i in range(tensor_size):
-            origin = self.static_input_tensor._get_float_element(i)
-            x_pos = origin + self._delta
-            self.static_input_tensor._set_float_element(i, x_pos)
-            y_pos = self.fetch_value(loss)[0][0]
-            x_neg = origin - self._delta
-            self.static_input_tensor._set_float_element(i, x_neg)
-            y_neg = self.fetch_value(loss)[0][0]
-            self.static_input_tensor._set_float_element(i, origin)
-            numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2
-        self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001))
-        self.assertTrue(
-            np.allclose(actual_lod,
-                        self.static_input_tensor.recursive_sequence_lengths()))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py
deleted file mode 100644
index 95cae1c2029c472c5a34b37a79739e2ff088feb2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import unittest
-
-fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
-
-from test_conditional_block import *
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py
deleted file mode 100644
index adf07897d561cf49c70841c5a4114b51b4cf55f1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import numpy as np
-os.environ['FLAGS_use_ngraph'] = '0'
-os.environ['FLAGS_use_mkldnn'] = '0'
-os.environ['CPU_NUM'] = '4'
-
-import paddle.fluid as fluid
-import six
-import unittest
-import multiprocessing
-
-fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
-
-
-def simple_fc_net():
-    image = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = image
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
-    optimizer.minimize(loss)
-    return image, label, loss
-
-
-def get_persistables_and_non_persistables(prog, fetch_list):
-    num_block = prog.num_blocks
-    persitables = set()
-    non_persistables = set()
-    for bid in six.moves.range(num_block):
-        block = prog.block(bid)
-        for _, var in block.vars.items():
-            if var.persistable or var.name in fetch_list:
-                persitables.add(var.name)
-            else:
-                non_persistables.add(var.name)
-
-    return persitables, non_persistables
-
-
-class TestExecutor(unittest.TestCase):
-    def test_executor_main(self):
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-
-        for p in places:
-            self.place = p
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                with fluid.scope_guard(fluid.Scope()):
-                    with fluid.unique_name.guard():
-                        self.executor_main()
-
-        for p in places:
-            self.place = p
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                with fluid.scope_guard(fluid.Scope()):
-                    with fluid.unique_name.guard():
-                        self.pe_main()
-
-    def prepare_feed(self, image, label, dev_cnt=1):
-        batch_size = 32 * dev_cnt
-        image_shape = (batch_size, ) + tuple(image.shape[1:])
-        label_shape = (batch_size, ) + tuple(label.shape[1:])
-
-        image_np = np.random.random(size=image_shape).astype('float32')
-        label_np = np.random.random_integers(
-            low=0, high=9, size=label_shape).astype('int64')
-
-        return image_np, label_np
-
-    def assertScopeVar(self, scope, persitables, non_persistables):
-        outline_p_vars = []
-        for name in persitables:
-            var = scope.find_var(name)
-            self.assertTrue(var is not None)
-            t = var.get_tensor()
-            if not t._is_initialized():
-                outline_p_vars.append(name)
-
-        outline_np_vars = []
-        for name in non_persistables:
-            var = scope.find_var(name)
-            self.assertTrue(var is not None)
-            t = var.get_tensor()
-            if t._is_initialized():
-                outline_np_vars.append(name)
-
-        print('Non-alive persistable vars {} in {}'.format(outline_p_vars,
-                                                           persitables))
-        print('Alive non-persistable vars {} in {}'.format(outline_np_vars,
-                                                           non_persistables))
-        self.assertEqual(len(outline_p_vars), 0)
-        self.assertEqual(len(outline_np_vars), 0)
-
-    def executor_main(self):
-        image, label, loss = simple_fc_net()
-        loss.persistable = False
-        persistables, non_persistables = get_persistables_and_non_persistables(
-            fluid.default_main_program(), [loss.name])
-        print('Non-persistable var number {}'.format(len(non_persistables)))
-        print(non_persistables)
-
-        exe = fluid.Executor(self.place)
-        exe.run(fluid.default_startup_program())
-
-        p = fluid.core.Place()
-        p.set_place(self.place)
-        exe = fluid.core.Executor(p)
-
-        for _ in six.moves.range(10):
-            image_np, label_np = self.prepare_feed(image, label)
-            fluid.global_scope().var(image.name).get_tensor().set(image_np,
-                                                                  self.place)
-            fluid.global_scope().var(label.name).get_tensor().set(label_np,
-                                                                  self.place)
-            # exe.run would not create local scope
-            # so that we can detect whether gc clears temporary variables
-            exe.run(fluid.default_main_program().desc,
-                    fluid.global_scope(), 0, False, True, [loss.name])
-            self.assertScopeVar(fluid.global_scope(), persistables,
-                                non_persistables)
-
-    def pe_main(self):
-        image, label, loss = simple_fc_net()
-        loss.persistable = False
-        persitables, non_persistables = get_persistables_and_non_persistables(
-            fluid.default_main_program(), [loss.name])
-
-        exe = fluid.Executor(self.place)
-        exe.run(fluid.default_startup_program())
-
-        exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy.num_iteration_per_drop_scope = 100
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.memory_optimize = False
-        build_strategy.enable_inplace = False
-
-        prog = fluid.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(
-            loss_name=loss.name, exec_strategy=exec_strategy)
-
-        dev_cnt = fluid.core.get_cuda_device_count() if isinstance(self.place, fluid.CUDAPlace)    \
-            else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-
-        for idx in six.moves.range(10):
-            image_np, label_np = self.prepare_feed(image, label, dev_cnt)
-            feed = {image.name: image_np, label.name: label_np}
-
-            exe.run(program=prog, feed=feed, fetch_list=[loss])
-
-            local_scopes = prog._local_scopes
-            for scope in local_scopes:
-                kids = scope._kids()
-                self.assertTrue(len(kids) == 1)
-                self.assertScopeVar(kids[0], persistables, non_persistables)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
deleted file mode 100644
index e4bde606ca670780680dbb461f8ac6300b5ae4d1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-os.environ['CPU_NUM'] = '2'
-
-import six
-import unittest
-
-import paddle
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-import numpy as np
-from fake_reader import fake_imdb_reader
-
-
-def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
-    if use_cuda and not core.is_compiled_with_cuda():
-        print('Skip use_cuda=True because Paddle is not compiled with cuda')
-        return
-
-    if use_parallel_executor and os.name == 'nt':
-        print(
-            'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
-        )
-        return
-
-    word_dict_size = 5147
-    reader = fake_imdb_reader(word_dict_size, batch_size * 40)
-    train_reader = paddle.batch(reader, batch_size=batch_size)
-
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-    cost = network(data, label, word_dict_size)
-    cost.persistable = True
-    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
-    optimizer.minimize(cost)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-    reader = feeder.decorate_reader(
-        train_reader, multi_devices=use_parallel_executor)
-
-    exe = fluid.Executor(place)
-    fluid.default_startup_program().random_seed = 1
-    fluid.default_main_program().random_seed = 1
-    exe.run(fluid.default_startup_program())
-
-    train_cp = fluid.default_main_program()
-    if use_parallel_executor:
-        train_cp = compiler.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(loss_name=cost.name)
-        fetch_list = [cost.name]
-    else:
-        fetch_list = [cost]
-
-    for pass_id in six.moves.xrange(pass_num):
-        batch_id = 0
-        for data in reader():
-            exe.run(train_cp,
-                    feed=data,
-                    fetch_list=fetch_list if batch_id % 4 == 0 else [])
-            batch_id += 1
-            if batch_id > 16:
-                break
-
-
-class TestBase(unittest.TestCase):
-    def setUp(self):
-        self.net = None
-
-    def test_network(self):
-        if self.net is None:
-            return
-
-        for use_cuda in [True, False]:
-            for use_parallel_executor in [False, True]:
-                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
-                      format(self.net.__name__, use_cuda,
-                             use_parallel_executor))
-                with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    with fluid.scope_guard(core.Scope()):
-                        train(self.net, use_cuda, use_parallel_executor)
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
deleted file mode 100644
index 1023c18f410fb60592154bbdf421d58aa88c71ae..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-from test_eager_deletion_dynamic_rnn_base import TestBase
-import paddle.fluid as fluid
-
-fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
-
-
-def gru_net(data,
-            label,
-            dict_dim,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2,
-            emb_lr=400.0):
-    emb = fluid.layers.embedding(
-        input=data,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
-    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
-    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
-    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
-    gru_max_tanh = fluid.layers.tanh(gru_max)
-    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
-    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    return avg_cost
-
-
-class GRUTest(TestBase):
-    def setUp(self):
-        self.net = gru_net
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
deleted file mode 100644
index 6784edb9d7b2e9cd95f8646e9f8a210296dac94e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from test_eager_deletion_dynamic_rnn_base import TestBase
-import paddle.fluid as fluid
-import unittest
-
-fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
-
-
-def lstm_net(data,
-             label,
-             dict_dim,
-             emb_dim=128,
-             hid_dim=128,
-             hid_dim2=96,
-             class_dim=2,
-             emb_lr=30.0):
-    emb = fluid.layers.embedding(
-        input=data,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
-    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
-    lstm_h, c = fluid.layers.dynamic_lstm(
-        input=fc0, size=hid_dim * 4, is_reverse=False)
-    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
-    lstm_max_tanh = fluid.layers.tanh(lstm_max)
-    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
-    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    return avg_cost
-
-
-class LSTMTest(TestBase):
-    def setUp(self):
-        self.net = lstm_net
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
deleted file mode 100644
index ecdf9efa451743f8368079183fcb33f1769a6ab5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-import paddle.fluid as fluid
-
-fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
-
-# FIXME(zjl): It seems that this unittest fails randomly 
-# when comparing all reduce last loss and reduce last loss
-# e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta
-# Disable it temporarily.
-'''
-from test_parallel_executor_mnist import TestMNIST
-
-
-class EagerDeletionTestMNIST(TestMNIST):
-    pass
-'''
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
deleted file mode 100644
index 530964a7314582ce0171d570fa99c4db70909c62..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+++ /dev/null
@@ -1,660 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.layers as layers
-import time
-import os
-
-from paddle.fluid import ParamAttr
-from paddle.fluid.contrib.layers import basic_lstm
-from paddle.fluid.executor import Executor
-from paddle.fluid.layers.control_flow import StaticRNN as PaddingRNN
-
-os.environ["CPU_NUM"] = "1"
-
-
-class RnnConfig(object):
-    def __init__(self, model_type, rnn_model):
-        self.model_type = model_type
-        self.rnn_model = rnn_model
-
-        self.vocab_size = 10000
-        if self.model_type == "test":
-            self.num_layers = 1
-            self.batch_size = 2
-            self.hidden_size = 10
-            self.num_steps = 3
-            self.init_scale = 0.1
-            self.max_grad_norm = 5.0
-            self.epoch_start_decay = 1
-            self.max_epoch = 1
-            self.dropout = 0.0
-            self.lr_decay = 0.5
-            self.base_learning_rate = 1.0
-        elif self.model_type == "small":
-            self.num_layers = 2
-            self.batch_size = 20
-            self.hidden_size = 200
-            self.num_steps = 20
-            self.init_scale = 0.1
-            self.max_grad_norm = 5.0
-            self.epoch_start_decay = 4
-            self.max_epoch = 13
-            self.dropout = 0.0
-            self.lr_decay = 0.5
-            self.base_learning_rate = 1.0
-        elif self.model_type == "medium":
-            self.num_layers = 2
-            self.batch_size = 20
-            self.hidden_size = 650
-            self.num_steps = 35
-            self.init_scale = 0.05
-            self.max_grad_norm = 5.0
-            self.epoch_start_decay = 6
-            self.max_epoch = 39
-            self.dropout = 0.5
-            self.lr_decay = 0.8
-            self.base_learning_rate = 1.0
-        elif self.model_type == "large":
-            self.num_layers = 2
-            self.batch_size = 20
-            self.hidden_size = 1500
-            self.num_steps = 35
-            self.init_scale = 0.04
-            self.max_grad_norm = 10.0
-            self.epoch_start_decay = 14
-            self.max_epoch = 55
-            self.dropout = 0.65
-            self.lr_decay = 1.0 / 1.15
-            self.base_learning_rate = 1.0
-        else:
-            raise ValueError('Unsupported model_type.')
-
-        if rnn_model not in ('static', 'padding', 'cudnn', 'basic_lstm'):
-            raise ValueError('Unsupported rnn_model.')
-
-        self.batch_size = 12
-        self.max_epoch = 3
-        self.random_seed = 123
-
-
-# Fake data reader for test
-class Reader(object):
-    def get_data_iter(self, rnn_config):
-        for i in range(rnn_config.max_epoch):
-            x = np.zeros(
-                shape=(rnn_config.batch_size, rnn_config.num_steps),
-                dtype='int64')
-            y = np.ones(
-                shape=(rnn_config.batch_size, rnn_config.num_steps),
-                dtype='int64')
-            yield (x, y)
-
-
-# Model from PaddleNLP/models/language_model/lm_model.py in Paddle Models repo
-def lm_model(hidden_size,
-             vocab_size,
-             batch_size,
-             num_layers=2,
-             num_steps=20,
-             init_scale=0.1,
-             dropout=None,
-             rnn_model='static',
-             use_py_reader=False):
-    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
-        weight_1_arr = []
-        weight_2_arr = []
-        bias_arr = []
-        hidden_array = []
-        cell_array = []
-        mask_array = []
-        for i in range(num_layers):
-            weight_1 = layers.create_parameter(
-                [hidden_size * 2, hidden_size * 4],
-                dtype="float32",
-                name="fc_weight1_" + str(i),
-                default_initializer=fluid.initializer.UniformInitializer(
-                    low=-init_scale, high=init_scale))
-            weight_1_arr.append(weight_1)
-            bias_1 = layers.create_parameter(
-                [hidden_size * 4],
-                dtype="float32",
-                name="fc_bias1_" + str(i),
-                default_initializer=fluid.initializer.Constant(0.0))
-            bias_arr.append(bias_1)
-
-            pre_hidden = layers.slice(
-                init_hidden, axes=[0], starts=[i], ends=[i + 1])
-            pre_cell = layers.slice(
-                init_cell, axes=[0], starts=[i], ends=[i + 1])
-            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
-            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
-            hidden_array.append(pre_hidden)
-            cell_array.append(pre_cell)
-
-        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
-        rnn = PaddingRNN()
-
-        with rnn.step():
-            input = rnn.step_input(input_embedding)
-            for k in range(num_layers):
-                pre_hidden = rnn.memory(init=hidden_array[k])
-                pre_cell = rnn.memory(init=cell_array[k])
-                weight_1 = weight_1_arr[k]
-                bias = bias_arr[k]
-
-                nn = layers.concat([input, pre_hidden], 1)
-                gate_input = layers.matmul(x=nn, y=weight_1)
-
-                gate_input = layers.elementwise_add(gate_input, bias)
-                i = layers.slice(
-                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
-                j = layers.slice(
-                    gate_input,
-                    axes=[1],
-                    starts=[hidden_size],
-                    ends=[hidden_size * 2])
-                f = layers.slice(
-                    gate_input,
-                    axes=[1],
-                    starts=[hidden_size * 2],
-                    ends=[hidden_size * 3])
-                o = layers.slice(
-                    gate_input,
-                    axes=[1],
-                    starts=[hidden_size * 3],
-                    ends=[hidden_size * 4])
-
-                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
-                    i) * layers.tanh(j)
-                m = layers.tanh(c) * layers.sigmoid(o)
-
-                rnn.update_memory(pre_hidden, m)
-                rnn.update_memory(pre_cell, c)
-
-                rnn.step_output(m)
-                rnn.step_output(c)
-
-                input = m
-
-                if dropout != None and dropout > 0.0:
-                    input = layers.dropout(
-                        input,
-                        dropout_prob=dropout,
-                        dropout_implementation='upscale_in_train')
-
-            rnn.step_output(input)
-        rnnout = rnn()
-
-        last_hidden_array = []
-        last_cell_array = []
-        real_res = rnnout[-1]
-        for i in range(num_layers):
-            m = rnnout[i * 2]
-            c = rnnout[i * 2 + 1]
-            m.stop_gradient = True
-            c.stop_gradient = True
-            last_h = layers.slice(
-                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
-            last_hidden_array.append(last_h)
-            last_c = layers.slice(
-                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
-            last_cell_array.append(last_c)
-        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
-        last_hidden = layers.concat(last_hidden_array, 0)
-        last_cell = layers.concat(last_cell_array, 0)
-
-        return real_res, last_hidden, last_cell
-
-    def encoder_static(input_embedding, len=3, init_hidden=None,
-                       init_cell=None):
-
-        weight_1_arr = []
-        weight_2_arr = []
-        bias_arr = []
-        hidden_array = []
-        cell_array = []
-        mask_array = []
-        for i in range(num_layers):
-            weight_1 = layers.create_parameter(
-                [hidden_size * 2, hidden_size * 4],
-                dtype="float32",
-                name="fc_weight1_" + str(i),
-                default_initializer=fluid.initializer.UniformInitializer(
-                    low=-init_scale, high=init_scale))
-            weight_1_arr.append(weight_1)
-            bias_1 = layers.create_parameter(
-                [hidden_size * 4],
-                dtype="float32",
-                name="fc_bias1_" + str(i),
-                default_initializer=fluid.initializer.Constant(0.0))
-            bias_arr.append(bias_1)
-
-            pre_hidden = layers.slice(
-                init_hidden, axes=[0], starts=[i], ends=[i + 1])
-            pre_cell = layers.slice(
-                init_cell, axes=[0], starts=[i], ends=[i + 1])
-            pre_hidden = layers.reshape(
-                pre_hidden, shape=[-1, hidden_size], inplace=True)
-            pre_cell = layers.reshape(
-                pre_cell, shape=[-1, hidden_size], inplace=True)
-            hidden_array.append(pre_hidden)
-            cell_array.append(pre_cell)
-
-        res = []
-        sliced_inputs = layers.split(
-            input_embedding, num_or_sections=len, dim=1)
-
-        for index in range(len):
-            input = sliced_inputs[index]
-            input = layers.reshape(input, shape=[-1, hidden_size], inplace=True)
-            for k in range(num_layers):
-                pre_hidden = hidden_array[k]
-                pre_cell = cell_array[k]
-                weight_1 = weight_1_arr[k]
-                bias = bias_arr[k]
-
-                nn = layers.concat([input, pre_hidden], 1)
-                gate_input = layers.matmul(x=nn, y=weight_1)
-
-                gate_input = layers.elementwise_add(gate_input, bias)
-                i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-
-                try:
-                    from paddle.fluid.contrib.layers import fused_elemwise_activation
-                    # fluid.contrib.layers.fused_elemwise_activation can do a fused
-                    # operation, like:
-                    # 1) x + sigmoid(y); x + tanh(y)
-                    # 2) tanh(x + y)
-                    # Now the unary operation supported in this fused op is limit, and
-                    # we will extent this operation to support more unary operations and
-                    # do this kind of fusion automitically in future version of paddle.fluid.
-                    # layers.sigmoid(i) * layers.tanh(j)
-                    tmp0 = fused_elemwise_activation(
-                        x=layers.tanh(j),
-                        y=i,
-                        functor_list=['elementwise_mul', 'sigmoid'],
-                        save_intermediate_out=False)
-                    # pre_cell * layers.sigmoid(f)
-                    tmp1 = fused_elemwise_activation(
-                        x=pre_cell,
-                        y=f,
-                        functor_list=['elementwise_mul', 'sigmoid'],
-                        save_intermediate_out=False)
-                    c = tmp0 + tmp1
-                    # layers.tanh(c) * layers.sigmoid(o)
-                    m = fused_elemwise_activation(
-                        x=layers.tanh(c),
-                        y=o,
-                        functor_list=['elementwise_mul', 'sigmoid'],
-                        save_intermediate_out=False)
-                except ImportError:
-                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
-                        i) * layers.tanh(j)
-                    m = layers.tanh(c) * layers.sigmoid(o)
-
-                hidden_array[k] = m
-                cell_array[k] = c
-                input = m
-
-                if dropout != None and dropout > 0.0:
-                    input = layers.dropout(
-                        input,
-                        dropout_prob=dropout,
-                        dropout_implementation='upscale_in_train')
-
-            res.append(input)
-
-        last_hidden = layers.concat(hidden_array, 1)
-        last_hidden = layers.reshape(
-            last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
-        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])
-
-        last_cell = layers.concat(cell_array, 1)
-        last_cell = layers.reshape(
-            last_cell, shape=[-1, num_layers, hidden_size])
-        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])
-
-        real_res = layers.concat(res, 0)
-        real_res = layers.reshape(
-            real_res, shape=[len, -1, hidden_size], inplace=True)
-        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
-
-        return real_res, last_hidden, last_cell
-
-    batch_size_each = batch_size
-    if use_py_reader:
-        feed_shapes = [[batch_size_each, num_steps, 1],
-                       [batch_size_each * num_steps, 1]]
-        py_reader = fluid.layers.py_reader(
-            capacity=16, shapes=feed_shapes, dtypes=['int64', 'int64'])
-        x, y = fluid.layers.read_file(py_reader)
-    else:
-        x = layers.data(
-            name="x",
-            shape=[batch_size_each, num_steps, 1],
-            dtype='int64',
-            append_batch_size=False)
-        y = layers.data(
-            name="y",
-            shape=[batch_size_each * num_steps, 1],
-            dtype='int64',
-            append_batch_size=False)
-
-    init_hidden = layers.data(
-        name="init_hidden",
-        shape=[num_layers, batch_size_each, hidden_size],
-        dtype='float32',
-        append_batch_size=False)
-    init_cell = layers.data(
-        name="init_cell",
-        shape=[num_layers, batch_size_each, hidden_size],
-        dtype='float32',
-        append_batch_size=False)
-
-    init_cell.persistable = True
-    init_hidden.persistable = True
-
-    init_hidden_reshape = layers.reshape(
-        init_hidden, shape=[num_layers, -1, hidden_size])
-    init_cell_reshape = layers.reshape(
-        init_cell, shape=[num_layers, -1, hidden_size])
-
-    x_emb = layers.embedding(
-        input=x,
-        size=[vocab_size, hidden_size],
-        dtype='float32',
-        is_sparse=False,
-        param_attr=fluid.ParamAttr(
-            name='embedding_para',
-            initializer=fluid.initializer.UniformInitializer(
-                low=-init_scale, high=init_scale)))
-
-    x_emb = layers.reshape(
-        x_emb, shape=[-1, num_steps, hidden_size], inplace=True)
-    if dropout != None and dropout > 0.0:
-        x_emb = layers.dropout(
-            x_emb,
-            dropout_prob=dropout,
-            dropout_implementation='upscale_in_train')
-
-    if rnn_model == "padding":
-        rnn_out, last_hidden, last_cell = padding_rnn(
-            x_emb,
-            len=num_steps,
-            init_hidden=init_hidden_reshape,
-            init_cell=init_cell_reshape)
-    elif rnn_model == "static":
-        rnn_out, last_hidden, last_cell = encoder_static(
-            x_emb,
-            len=num_steps,
-            init_hidden=init_hidden_reshape,
-            init_cell=init_cell_reshape)
-    elif rnn_model == "cudnn":
-        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
-        rnn_out, last_hidden, last_cell = layers.lstm(
-            x_emb,
-            init_hidden_reshape,
-            init_cell_reshape,
-            num_steps,
-            hidden_size,
-            num_layers,
-            is_bidirec=False,
-            default_initializer=fluid.initializer.UniformInitializer(
-                low=-init_scale, high=init_scale))
-        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
-    elif rnn_model == "basic_lstm":
-        rnn_out, last_hidden, last_cell = basic_lstm( x_emb, init_hidden, init_cell, hidden_size, \
-                num_layers=num_layers, batch_first=True, dropout_prob=dropout, \
-                param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ), \
-                bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ), \
-                forget_bias = 0.0)
-    else:
-        print("type not support")
-        return
-
-    rnn_out = layers.reshape(
-        rnn_out, shape=[-1, num_steps, hidden_size], inplace=True)
-
-    softmax_weight = layers.create_parameter(
-        [hidden_size, vocab_size],
-        dtype="float32",
-        name="softmax_weight",
-        default_initializer=fluid.initializer.UniformInitializer(
-            low=-init_scale, high=init_scale))
-    softmax_bias = layers.create_parameter(
-        [vocab_size],
-        dtype="float32",
-        name='softmax_bias',
-        default_initializer=fluid.initializer.UniformInitializer(
-            low=-init_scale, high=init_scale))
-
-    projection = layers.matmul(rnn_out, softmax_weight)
-    projection = layers.elementwise_add(projection, softmax_bias)
-    projection = layers.reshape(
-        projection, shape=[-1, vocab_size], inplace=True)
-
-    loss = layers.softmax_with_cross_entropy(
-        logits=projection, label=y, soft_label=False)
-
-    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
-    loss = layers.reduce_mean(loss, dim=[0])
-    loss = layers.reduce_sum(loss)
-
-    loss.persistable = True
-    last_cell.persistable = True
-    last_hidden.persistable = True
-
-    # This will feed last_hidden, last_cell to init_hidden, init_cell, which
-    # can be used directly in next batch. This can avoid the fetching of
-    # last_hidden and last_cell and feeding of init_hidden and init_cell in
-    # each training step.
-    layers.assign(input=last_cell, output=init_cell)
-    layers.assign(input=last_hidden, output=init_hidden)
-
-    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
-    if use_py_reader:
-        return loss, last_hidden, last_cell, feeding_list, py_reader
-    else:
-        return loss, last_hidden, last_cell, feeding_list
-
-
-class EagerDeletionPaddingRnnTest(unittest.TestCase):
-    def setUp(self):
-        self.reader = Reader()
-
-    def prepare_program(self, config):
-        self.main_program = fluid.Program()
-        self.startup_program = fluid.Program()
-        self.startup_program.random_seed = config.random_seed
-        with fluid.program_guard(self.main_program, self.startup_program):
-            with fluid.unique_name.guard():
-                res_vars = lm_model(
-                    config.hidden_size,
-                    config.vocab_size,
-                    config.batch_size,
-                    num_layers=config.num_layers,
-                    num_steps=config.num_steps,
-                    init_scale=config.init_scale,
-                    dropout=config.dropout,
-                    rnn_model=config.rnn_model,
-                    use_py_reader=False)
-                self.loss, self.last_hidden, self.last_cell, self.feed_order = res_vars
-
-                fluid.clip.set_gradient_clip(
-                    clip=fluid.clip.GradientClipByGlobalNorm(
-                        clip_norm=config.max_grad_norm))
-
-                self.learning_rate = fluid.layers.create_global_var(
-                    name="learning_rate",
-                    shape=[1],
-                    value=1.0,
-                    dtype='float32',
-                    persistable=True)
-
-                optimizer = fluid.optimizer.SGD(
-                    learning_rate=self.learning_rate)
-                optimizer.minimize(self.loss)
-        self.exe = Executor(fluid.CPUPlace())
-        self.exe.run(self.startup_program)
-
-        self.device_count = 1
-        exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy.num_threads = self.device_count
-        exec_strategy.num_iteration_per_drop_scope = 100
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.enable_inplace = True
-        build_strategy.memory_optimize = False
-        build_strategy.fuse_all_optimizer_ops = True
-
-        self.train_program = fluid.compiler.CompiledProgram(
-            self.main_program).with_data_parallel(
-                loss_name=self.loss.name,
-                build_strategy=build_strategy,
-                exec_strategy=exec_strategy)
-
-    def generate_init_data(self):
-        init_hidden = np.zeros(
-            (self.config.num_layers, self.config.batch_size,
-             self.config.hidden_size),
-            dtype='float32')
-        init_cell = np.zeros(
-            (self.config.num_layers, self.config.batch_size,
-             self.config.hidden_size),
-            dtype='float32')
-        return init_hidden, init_cell
-
-    def generate_new_lr(self, epoch_id=0, device_count=1):
-        new_lr = self.config.base_learning_rate * (self.config.lr_decay**max(
-            epoch_id + 1 - self.config.epoch_start_decay, 0.0))
-        lr = np.ones((self.device_count), dtype='float32') * new_lr
-        return lr
-
-    def prepare_input(self,
-                      batch,
-                      init_hidden=None,
-                      init_cell=None,
-                      epoch_id=0,
-                      with_lr=True,
-                      device_count=1):
-        x, y = batch
-        x = x.reshape((-1, self.config.num_steps, 1))
-        y = y.reshape((-1, 1))
-
-        res = {}
-        res['x'] = x
-        res['y'] = y
-        if init_hidden is not None:
-            res['init_hidden'] = init_hidden
-        if init_cell is not None:
-            res['init_cell'] = init_cell
-        if with_lr:
-            res['learning_rate'] = self.generate_new_lr(epoch_id, device_count)
-        return res
-
-    def train_an_epoch(self, epoch_id, batch_times):
-        train_data_iter = self.reader.get_data_iter(self.config)
-
-        total_loss = 0
-        iters = 0
-
-        init_hidden, init_cell = self.generate_init_data()
-        ppl = np.zeros(shape=(0))
-        for batch_id, batch in enumerate(train_data_iter):
-            input_data_feed = self.prepare_input(
-                batch,
-                init_hidden=init_hidden,
-                init_cell=init_cell,
-                epoch_id=epoch_id,
-                with_lr=True,
-                device_count=self.device_count)
-
-            batch_start_time = time.time()
-            fetch_outs = self.exe.run(self.train_program,
-                                      feed=input_data_feed,
-                                      fetch_list=[
-                                          self.loss.name, "learning_rate",
-                                          self.last_hidden.name,
-                                          self.last_cell.name
-                                      ],
-                                      use_program_cache=True)
-            batch_time = time.time() - batch_start_time
-            batch_times.append(batch_time)
-
-            cost_train = np.array(fetch_outs[0])
-            lr = np.array(fetch_outs[1])
-            init_hidden = np.array(fetch_outs[2])
-            init_cell = np.array(fetch_outs[3])
-
-            total_loss += cost_train
-            iters += self.config.num_steps
-
-            batch_ppl = np.exp(total_loss / iters)
-            ppl = np.append(ppl, batch_ppl)
-        return ppl
-
-    def train(self, config):
-        self.config = config
-        self.prepare_program(config)
-        total_time = 0.0
-        ppl = np.zeros(shape=(0, config.batch_size))
-        for epoch_id in range(config.max_epoch):
-            batch_times = []
-            epoch_start_time = time.time()
-            train_ppl = self.train_an_epoch(epoch_id, batch_times)
-            epoch_time = time.time() - epoch_start_time
-            total_time += epoch_time
-            ppl = np.append(ppl, train_ppl)
-        return ppl
-
-    def compare_padding_static_mode(self):
-        '''
-          Test that train ppl of padding mode is same to that of static mode 
-        '''
-        config = RnnConfig('test', 'padding')
-        with fluid.scope_guard(fluid.Scope()):
-            padding_rnn_ppl = self.train(config)
-        config = RnnConfig('test', 'static')
-        with fluid.scope_guard(fluid.Scope()):
-            static_rnn_ppl = self.train(config)
-        self.assertTrue(
-            np.isclose(
-                padding_rnn_ppl, static_rnn_ppl, rtol=0.001).all())
-
-    def test_padding_mode_no_eager_deletion(self):
-        '''
-           Test that train ppl of padding mode is same to that of static mode without eager deletion
-        '''
-        fluid.core._set_eager_deletion_mode(-1.0, 1.0, True)
-        self.compare_padding_static_mode()
-
-    def test_padding_mode_eager_deletion(self):
-        '''
-          Test that train ppl of padding mode is same to that of static mode under eager deletion
-        '''
-        fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
-        self.compare_padding_static_mode()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
deleted file mode 100644
index 4ae44365f25dfdb4d87b23f4d1605614eaf2f4df..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
+++ /dev/null
@@ -1,693 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.fluid.core as core
-import paddle.fluid.layers as layers
-import unittest
-
-from paddle.fluid import ParamAttr
-from paddle.fluid.framework import Program, grad_var_name
-from paddle.fluid.executor import Executor
-from paddle.fluid.backward import append_backward
-
-np.random.seed(123)
-os.environ["CPU_NUM"] = "1"
-fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
-
-
-class PyRNNBase(object):
-    def __init__(self, input_shape, output_shape):
-        self.x = np.ones(shape=input_shape).astype("float32")
-        self.y = np.zeros(shape=output_shape).astype("float32")
-
-    def step(self, step_id, x):
-        raise NotImplementedError
-
-    def forward(self):
-        for step_id in range(self.x.shape[0]):
-            self.step(step_id, self.x[step_id])
-        return np.array([np.mean(self.y)])
-
-    def segment_inputs(self):
-        return [self.x[i] for i in range(self.x.shape[0])]
-
-
-class PySimpleRNN1(PyRNNBase):
-    def __init__(self, input_shape, output_shape):
-        super(PySimpleRNN1, self).__init__(input_shape, output_shape)
-
-        seq_len, batch_size, input_dim = input_shape
-        self.h_boot = np.random.normal(size=(batch_size,
-                                             input_dim)).astype("float32")
-
-        self.scale = 1.0 / 2.0
-        men_dim = (seq_len, batch_size, input_dim)
-        self.mems = np.zeros(shape=men_dim).astype("float32")
-
-    def step(self, step_id, x):
-        if step_id == 0:
-            pre_mem = self.h_boot
-        else:
-            pre_mem = self.mems[step_id - 1]
-        self.mems[step_id] = (pre_mem + x) * self.scale
-        self.y[step_id] = self.mems[step_id]
-
-
-class PySimpleRNN2(PyRNNBase):
-    def __init__(self, input_shape, output_shape):
-        super(PySimpleRNN2, self).__init__(input_shape, output_shape)
-
-        seq_len, batch_size, input_dim = input_shape
-        self.W = np.ones(shape=(input_dim, input_dim)).astype("float32")
-        self.U = np.zeros(shape=(input_dim, input_dim)).astype("float32")
-        self.h_boot = np.ones(shape=(batch_size, input_dim)).astype("float32")
-
-        men_dim = (seq_len, batch_size, input_dim)
-        self.mems = np.zeros(shape=men_dim).astype("float32")
-
-    def step(self, step_id, x):
-        if step_id > 0:
-            pre_mem = self.mems[step_id - 1]
-        else:
-            pre_mem = self.h_boot
-        xW = np.matmul(x, self.W).astype("float32")
-        hU = np.matmul(pre_mem, self.U).astype("float32")
-
-        def py_sigmoid(x):
-            return 1. / (1. + np.exp(-x))
-
-        self.mems[step_id] = py_sigmoid(xW + hU)
-        self.y[step_id] = self.mems[step_id]
-
-
-def create_tensor(np_data, place):
-    tensor = core.LoDTensor()
-    tensor.set(np_data, place)
-    return tensor
-
-
-class EagerDeletionRecurrentOpTest1(unittest.TestCase):
-    '''
-    Test RNNOp
-    equation:
-        h_t = ( x_t + h_{t-1} ) / scale
-    vars:
-        - x
-    memories:
-        - h
-    outputs:
-        - h
-    '''
-
-    input_dim = 2
-    batch_size = 1
-    sent_len = 1
-
-    def setup_program(self):
-        self.main_program = Program()
-        self.startup_program = Program()
-        self.place = core.CPUPlace()
-
-    def setUp(self):
-        self.setup_program()
-        self.data_field = {"x", "h_boot"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            self.output = layers.mean(self.create_rnn_op())
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-        h_boot = layers.data(
-            shape=[self.input_dim], dtype='float32', name='h_boot')
-        h_boot.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-        with rnn.step():
-            h_pre = rnn.memory(init=h_boot)
-            x_t = rnn.step_input(x)
-
-            h = layers.scale(
-                x=layers.elementwise_add(
-                    x=h_pre, y=x_t),
-                scale=self.py_rnn.scale)
-
-            rnn.update_memory(h_pre, h)
-            rnn.output(h)
-
-        return rnn()
-
-    def forward(self):
-        self.feed_map = {
-            x: create_tensor(getattr(self.py_rnn, x), self.place)
-            for x in self.data_field
-        }
-        exe = Executor(self.place)
-        out = exe.run(self.main_program,
-                      feed=self.feed_map,
-                      fetch_list=[self.output])
-
-        return out[0]
-
-    def backward(self):
-        self.feed_map = {
-            x: create_tensor(getattr(self.py_rnn, x), self.place)
-            for x in self.data_field
-        }
-        fetch_list = [
-            self.main_program.global_block().var(grad_var_name(x))
-            for x in self.data_field
-        ]
-
-        exe = Executor(self.place)
-        return exe.run(self.main_program,
-                       feed=self.feed_map,
-                       fetch_list=fetch_list,
-                       return_numpy=False)
-
-    def test_backward(self, rtol=0.01):
-        self.check_forward()
-        num_grad = self.get_numerical_gradient()
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            append_backward(self.output)
-
-        ana_grad = [np.array(x) for x in self.backward()]
-
-        for idx, name in enumerate(self.data_field):
-            self.assertEqual(num_grad[idx].shape, ana_grad[idx].shape)
-            self.assertTrue(
-                np.isclose(
-                    num_grad[idx], ana_grad[idx], rtol=rtol).all(),
-                "num_grad (" + name + ") has diff at " + str(self.place) +
-                "\nExpect " + str(num_grad[idx]) + "\n" + "But Got" +
-                str(ana_grad[idx]) + " in class " + self.__class__.__name__)
-
-    def check_forward(self):
-        pd_output = self.forward()
-        py_output = self.py_rnn.forward()
-        self.assertEqual(pd_output.shape, py_output.shape)
-        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.01).all())
-
-    def get_numerical_gradient(self, delta=0.005):
-        dloss_dout = 1.0
-        feed_list = [getattr(self.py_rnn, x) for x in self.data_field]
-        grad_list = [np.zeros_like(x) for x in feed_list]
-        for feed, grad in zip(feed_list, grad_list):
-            for f, g in np.nditer([feed, grad], op_flags=['readwrite']):
-                o = float(f)
-                f[...] = o + delta
-                y_pos = self.forward()
-
-                f[...] = o - delta
-                y_neg = self.forward()
-
-                f[...] = o
-                dout_dfeed = (y_pos - y_neg) / (delta * 2)
-                g[...] = dout_dfeed[0]
-
-        return grad_list
-
-
-class EagerDeletionRecurrentOpTest2(EagerDeletionRecurrentOpTest1):
-    '''
-    Test RNNOp
-    equation:
-        h_t = \sigma (W x_t + U h_{t-1})
-    weights:
-        - W
-        - U
-    vars:
-        - x
-    memories:
-        - h
-    outputs:
-       - h
-    '''
-
-    input_dim = 2
-    batch_size = 10
-    sent_len = 2
-
-    def setUp(self):
-        self.setup_program()
-
-        self.data_field = {"x", "h_boot", "W", "U"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            self.output = layers.mean(self.create_rnn_op())
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-        h_boot = layers.data(
-            shape=[self.input_dim], dtype='float32', name='h_boot')
-        h_boot.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-        with rnn.step():
-            h_pre = rnn.memory(init=h_boot)
-            x_t = rnn.step_input(x)
-
-            temp_l = layers.fc(
-                input=x_t,
-                size=self.input_dim,
-                param_attr=ParamAttr(
-                    name='W',
-                    initializer=fluid.initializer.ConstantInitializer(1.0)),
-                bias_attr=False)
-            temp_r = layers.fc(
-                input=h_pre,
-                size=self.input_dim,
-                param_attr=ParamAttr(
-                    name='U',
-                    initializer=fluid.initializer.ConstantInitializer(0.0)),
-                bias_attr=False)
-
-            h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r))
-
-            rnn.update_memory(h_pre, h)
-            rnn.output(h)
-
-        return rnn()
-
-    def test_backward(self):
-        super(EagerDeletionRecurrentOpTest2, self).test_backward(rtol=0.01)
-
-
-class EagerDeletionRecurrentOpMultipleMemoryTest(EagerDeletionRecurrentOpTest1):
-    '''
-    Test RNNOp with two memories
-    equation:
-        h_1 = h_pre_1
-        h_2 = h_pre_2
-        y = h_1 + h_2
-    vars:
-        - x
-    memories:
-        - h_1, h_2
-    outputs:
-       - y
-    '''
-
-    class PySimpleRNN3(PyRNNBase):
-        def __init__(self, input_shape, output_shape):
-            super(EagerDeletionRecurrentOpMultipleMemoryTest.PySimpleRNN3,
-                  self).__init__(input_shape, output_shape)
-
-            seq_len, batch_size, input_dim = input_shape
-            self.h_boot1 = np.random.normal(size=(batch_size,
-                                                  input_dim)).astype("float32")
-            self.h_boot2 = np.random.normal(size=(batch_size,
-                                                  input_dim)).astype("float32")
-
-            men_dim = (seq_len, batch_size, input_dim)
-            self.mems1 = np.zeros(shape=men_dim).astype("float32")
-            self.mems2 = np.zeros(shape=men_dim).astype("float32")
-
-        def step(self, step_id, x):
-            if step_id == 0:
-                pre_mem1 = self.h_boot1
-                pre_mem2 = self.h_boot2
-            else:
-                pre_mem1 = self.mems1[step_id - 1]
-                pre_mem2 = self.mems2[step_id - 1]
-            self.mems1[step_id] = pre_mem1
-            self.mems2[step_id] = pre_mem2
-            self.y[step_id] = self.mems1[step_id] + self.mems2[step_id] + x
-
-    input_dim = 1
-    batch_size = 1
-    sent_len = 2
-
-    def setUp(self):
-        self.setup_program()
-
-        self.data_field = {"x", "h_boot1", "h_boot2"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = EagerDeletionRecurrentOpMultipleMemoryTest.PySimpleRNN3(
-            self.input_shape, self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            self.output = layers.mean(self.create_rnn_op())
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-        h_boot1 = layers.data(
-            shape=[self.batch_size, self.input_dim],
-            dtype='float32',
-            name='h_boot1',
-            append_batch_size=False)
-        h_boot1.stop_gradient = False
-        h_boot2 = layers.data(
-            shape=[self.batch_size, self.input_dim],
-            dtype='float32',
-            name='h_boot2',
-            append_batch_size=False)
-        h_boot2.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-        with rnn.step():
-            h_pre1 = rnn.memory(init=h_boot1)
-            h_pre2 = rnn.memory(init=h_boot2)
-            x_t = rnn.step_input(x)
-
-            mem1 = layers.scale(x=h_pre1, scale=1.0)
-            mem2 = layers.scale(x=h_pre2, scale=1.0)
-            out = layers.sums(input=[mem1, x_t, mem2])
-
-            rnn.update_memory(h_pre1, mem1)
-            rnn.update_memory(h_pre2, mem2)
-            rnn.output(out)
-
-        return rnn()
-
-
-class EagerDeletionRecurrentOpNoMemBootTest(EagerDeletionRecurrentOpTest1):
-    '''
-    Test RNNOp without memory boot
-    equation:
-        mem = x + mem_pre
-        y = mem
-    vars:
-        - x
-    memories:
-        - mem
-    outputs:
-       - y
-    '''
-
-    class PySimpleRNN4(PyRNNBase):
-        def __init__(self, input_shape, output_shape):
-            super(EagerDeletionRecurrentOpNoMemBootTest.PySimpleRNN4,
-                  self).__init__(input_shape, output_shape)
-            men_dim = input_shape
-            self.mems = np.zeros(shape=men_dim).astype("float32")
-
-        def step(self, step_id, x):
-            if step_id == 0:
-                pre_mem = np.zeros_like(x)
-            else:
-                pre_mem = self.mems[step_id - 1]
-            self.mems[step_id] = pre_mem + x
-            self.y[step_id] = self.mems[step_id]
-
-    input_dim = 1
-    batch_size = 1
-    sent_len = 2
-
-    def setUp(self):
-        self.setup_program()
-
-        self.data_field = {"x"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = EagerDeletionRecurrentOpNoMemBootTest.PySimpleRNN4(
-            self.input_shape, self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            self.output = layers.mean(self.create_rnn_op())
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-        with rnn.step():
-            mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x)
-            x_t = rnn.step_input(x)
-            mem = layers.elementwise_add(x=mem_pre, y=x_t)
-            rnn.update_memory(mem_pre, mem)
-            rnn.output(mem)
-
-        return rnn()
-
-
-class EagerDeletionTwoRecurrentOpsTest(EagerDeletionRecurrentOpTest1):
-    '''
-    Test RNNOp with two recurrent ops
-    equation:
-        first_rnn:
-            mem_inside = x + mem_pre_inside
-            first_inside_out = mem_inside
-        second_rnn:
-            mem = x + reduce_sum(rnn_inside_out)
-            y = mem + mem_pre
-    vars:
-        - x
-    memories:
-        - mem_inside
-        - mem
-    outputs:
-       - y
-    '''
-
-    class PySimpleRNN5(PyRNNBase):
-        def __init__(self, input_shape, output_shape):
-            super(EagerDeletionTwoRecurrentOpsTest.PySimpleRNN5,
-                  self).__init__(input_shape, output_shape)
-            self.mem_0 = np.zeros(shape=input_shape).astype("float32")
-            self.mem_1 = np.zeros(shape=input_shape).astype("float32")
-            self.rnn_0_output = np.zeros(shape=input_shape).astype("float32")
-
-        def step(self, step_id, x):
-            # First Rnn
-            for step in range(self.x.shape[0]):
-                x_t = self.x[step]
-                pre_mem = np.zeros_like(x_t) if step == 0 else self.mem_0[step -
-                                                                          1]
-                self.mem_0[step] = x_t + pre_mem
-                self.rnn_0_output[step] = self.mem_0[step]
-            # Second RNN
-            pre_mem = np.zeros_like(x) if step_id == 0 else self.mem_1[step_id -
-                                                                       1]
-            self.mem_1[step_id] = x + np.sum(self.rnn_0_output)
-            self.y[step_id] = self.mem_1[step_id] + pre_mem
-
-    input_dim = 1
-    batch_size = 1
-    sent_len = 1
-
-    def setUp(self):
-        self.setup_program()
-
-        self.data_field = {"x"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = EagerDeletionTwoRecurrentOpsTest.PySimpleRNN5(
-            self.input_shape, self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            self.output = layers.mean(self.create_rnn_op())
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-
-        rnn_0 = layers.StaticRNN()
-        with rnn_0.step():
-            x_t = rnn_0.step_input(x)
-            mem_pre = rnn_0.memory(shape=[-1, self.input_dim], batch_ref=x)
-            mem = layers.elementwise_add(x=mem_pre, y=x_t)
-            rnn_0.update_memory(mem_pre, mem)
-            rnn_0.output(mem)
-
-        rnn_1 = layers.StaticRNN()
-        with rnn_1.step():
-            mem_pre = rnn_1.memory(shape=[-1, self.input_dim], batch_ref=x)
-            x_t = rnn_1.step_input(x)
-            last_rnn_output = rnn_0()
-            last_rnn_sum = fluid.layers.reduce_sum(last_rnn_output)
-            mem = layers.elementwise_add(x=x_t, y=last_rnn_sum)
-            y = layers.elementwise_add(x=mem_pre, y=mem)
-            rnn_1.update_memory(mem_pre, mem)
-            rnn_1.output(y)
-        return rnn_1()
-
-
-class EagerDeletionRecurrentOpParallelExecutorTest(
-        EagerDeletionRecurrentOpTest1):
-    '''
-    Test RNNOp with ParallelExecutor
-    equation:
-        h_t = ( x_t + h_{t-1} ) / scale
-    vars:
-        - x
-    memories:
-        - h
-    outputs:
-        - h
-    '''
-
-    def forward(self):
-        self.feed_map = {
-            x: create_tensor(getattr(self.py_rnn, x), self.place)
-            for x in self.data_field
-        }
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.enable_inplace = True
-        exec_strategy = fluid.ExecutionStrategy()
-        parallel_exe = fluid.ParallelExecutor(
-            use_cuda=False,
-            main_program=self.main_program,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
-        out = parallel_exe.run(feed=self.feed_map, fetch_list=[self.output])
-        return out[0]
-
-    def backward(self):
-        self.feed_map = {
-            x: create_tensor(getattr(self.py_rnn, x), self.place)
-            for x in self.data_field
-        }
-        fetch_list = [
-            self.main_program.global_block().var(grad_var_name(x))
-            for x in self.data_field
-        ]
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.enable_inplace = True
-        exec_strategy = fluid.ExecutionStrategy()
-        parallel_exe = fluid.ParallelExecutor(
-            use_cuda=False,
-            loss_name=self.output.name,
-            main_program=self.main_program,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
-        return parallel_exe.run(feed=self.feed_map,
-                                fetch_list=fetch_list,
-                                return_numpy=False)
-
-
-class EagerDeletionFarwardOnlyRnnAndBackwardRnnTest(
-        EagerDeletionRecurrentOpTest1):
-    '''
-      Test one forward only RNN and one backward RNN in one program
-    '''
-
-    def setUp(self):
-        self.setup_program()
-        self.data_field = {"x", "h_boot"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            x = layers.data(
-                shape=[self.sent_len, self.batch_size, self.input_dim],
-                dtype='float32',
-                name='x',
-                append_batch_size=False)
-            x.stop_gradient = False
-            h_boot = layers.data(
-                shape=[self.input_dim], dtype='float32', name='h_boot')
-            h_boot.stop_gradient = False
-
-            forward_only_rnn = layers.StaticRNN()
-            with forward_only_rnn.step():
-                h_pre = forward_only_rnn.memory(init=h_boot)
-                x_t = forward_only_rnn.step_input(x)
-
-                h = layers.scale(
-                    x=layers.elementwise_add(
-                        x=h_pre, y=x_t),
-                    scale=self.py_rnn.scale)
-
-                forward_only_rnn.update_memory(h_pre, h)
-                forward_only_rnn.output(h)
-            forward_only_output = forward_only_rnn()
-            forward_only_output.stop_gradient = True
-            self.forward_only_output = layers.mean(forward_only_output)
-
-            rnn = layers.StaticRNN()
-            with rnn.step():
-                h_pre = rnn.memory(init=h_boot)
-                x_t = rnn.step_input(x)
-
-                h = layers.scale(
-                    x=layers.elementwise_add(
-                        x=h_pre, y=x_t),
-                    scale=self.py_rnn.scale)
-
-                rnn.update_memory(h_pre, h)
-                rnn.output(h)
-
-            self.output = layers.mean(rnn())
-
-    def forward_two_rnn(self):
-        self.feed_map = {
-            x: create_tensor(getattr(self.py_rnn, x), self.place)
-            for x in self.data_field
-        }
-        exe = Executor(self.place)
-        out = exe.run(self.main_program,
-                      feed=self.feed_map,
-                      fetch_list=[self.forward_only_output, self.output])
-
-        return out[0], out[1]
-
-    def check_forward(self):
-        forward_only_output, pd_output = self.forward_two_rnn()
-        py_output = self.py_rnn.forward()
-        self.assertEqual(forward_only_output.shape, py_output.shape)
-        self.assertEqual(pd_output.shape, py_output.shape)
-        self.assertTrue(
-            np.isclose(
-                forward_only_output, py_output, rtol=0.01).all)
-        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.01).all())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
deleted file mode 100644
index 1b507042541c100942dd61065bc78d92a2c399e4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-import paddle.fluid as fluid
-
-fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
-
-from test_parallel_executor_transformer import TestTransformer
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
deleted file mode 100644
index 45f385968cf41cd52ae625ed8008602982ae4d42..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-os.environ['CPU_NUM'] = '2'
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid.executor import Executor
-import paddle.fluid.core as core
-from paddle.fluid.backward import append_backward
-import paddle.fluid.compiler as compiler
-import numpy
-import multiprocessing
-
-fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
-
-
-class TestEagerDeletionWhileOpBase(unittest.TestCase):
-    def test_main(self):
-        places = [core.CPUPlace(), ]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        for p in places:
-            for with_data_parallel in [False, True]:
-                with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    with fluid.scope_guard(fluid.Scope()):
-                        self.run_main(p, with_data_parallel)
-
-    def run_main(self, place, with_data_parallel):
-        self.place = place
-        self.with_data_parallel = with_data_parallel
-
-        if not core.is_compiled_with_cuda() and isinstance(self.place,
-                                                           core.CUDAPlace):
-            return
-
-        if isinstance(self.place, core.CUDAPlace):
-            device_cnt = core.get_cuda_device_count(
-            ) if self.with_data_parallel else 1
-        else:
-            device_cnt = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count(
-                ))) if self.with_data_parallel else 1
-
-        d0 = layers.data(
-            "d0", shape=[10], append_batch_size=False, dtype='float32')
-        d1 = layers.data(
-            "d1", shape=[10], append_batch_size=False, dtype='float32')
-        d2 = layers.data(
-            "d2", shape=[10], append_batch_size=False, dtype='float32')
-
-        i = layers.zeros(shape=[1], dtype='int64')
-        i.stop_gradient = True
-
-        init = layers.zeros(shape=[10], dtype='float32')
-        mem_array = layers.array_write(x=init, i=i)
-        data_array = layers.array_write(x=d0, i=i)
-
-        i = layers.increment(i)
-        layers.array_write(d1, i, array=data_array)
-
-        i = layers.increment(i)
-        layers.array_write(d2, i, array=data_array)
-
-        i = layers.zeros(shape=[1], dtype='int64')
-        i.stop_gradient = True
-
-        array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
-        array_len.stop_gradient = True
-        cond = layers.less_than(x=i, y=array_len)
-
-        j = layers.fill_constant(shape=[1], dtype='int64', value=1)
-        j.stop_gradient = True
-
-        array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
-        array_len2.stop_gradient = True
-        cond2 = layers.less_than(x=j, y=array_len2)
-
-        while_op = layers.While(cond=cond)
-        while_op2 = layers.While(cond=cond2)
-        with while_op.block():
-            d = layers.array_read(array=data_array, i=i)
-            prev = layers.array_read(array=mem_array, i=i)
-            d = layers.reshape(d, shape=[10])
-            prev = layers.reshape(prev, shape=[10])
-            result = layers.sums(input=[d, prev])
-
-            i = layers.increment(x=i, in_place=True)
-            layers.array_write(result, i=i, array=mem_array)
-            layers.less_than(x=i, y=array_len, cond=cond)
-            with while_op2.block():
-                d2 = layers.array_read(array=data_array, i=j)
-                prev2 = layers.array_read(array=mem_array, i=j)
-                d2 = layers.reshape(d2, shape=[10])
-                prev2 = layers.reshape(prev2, shape=[10])
-                result2 = layers.sums(input=[d2, prev2])
-
-                j = layers.increment(x=j, in_place=True)
-                layers.array_write(result2, i=j, array=mem_array)
-                layers.less_than(x=j, y=array_len2, cond=cond2)
-
-        sum_result = layers.array_read(array=mem_array, i=j)
-        sum_result.persistable = True
-        tmp = layers.unsqueeze(sum_result, axes=[0])
-        tmp = layers.expand(tmp, expand_times=[10, 1])
-        fc = layers.fc(tmp, size=256)
-        loss = layers.mean(sum_result)
-
-        optim = fluid.optimizer.Adam(learning_rate=1e-3)
-        optim.minimize(loss)
-
-        exe = Executor(self.place)
-        exe.run(fluid.default_startup_program())
-
-        prog = fluid.default_main_program()
-        if self.with_data_parallel:
-            prog = compiler.CompiledProgram(fluid.default_main_program(
-            )).with_data_parallel(loss_name=loss.name)
-
-        for _ in range(5):
-            d = []
-            for i in range(3):
-                tmp = numpy.random.random(size=[10]).astype('float32')
-                if not self.with_data_parallel:
-                    d.append(tmp)
-                else:
-                    d.append(numpy.array([tmp] * device_cnt))
-
-            outs = exe.run(program=prog,
-                           feed={'d0': d[0],
-                                 'd1': d[1],
-                                 'd2': d[2]},
-                           fetch_list=[sum_result])
-            self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
deleted file mode 100644
index ba48b143a8e43731f633e6b64299225c0fefe0e9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ /dev/null
@@ -1,187 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def Levenshtein(hyp, ref):
-    """ Compute the Levenshtein distance between two strings.
-
-    :param hyp: hypothesis string in index
-    :type hyp: list
-    :param ref: reference string in index
-    :type ref: list
-    """
-    m = len(hyp)
-    n = len(ref)
-    if m == 0:
-        return n
-    if n == 0:
-        return m
-
-    dist = np.zeros((m + 1, n + 1)).astype("float32")
-    for i in range(0, m + 1):
-        dist[i][0] = i
-    for j in range(0, n + 1):
-        dist[0][j] = j
-
-    for i in range(1, m + 1):
-        for j in range(1, n + 1):
-            cost = 0 if hyp[i - 1] == ref[j - 1] else 1
-            deletion = dist[i - 1][j] + 1
-            insertion = dist[i][j - 1] + 1
-            substitution = dist[i - 1][j - 1] + cost
-            dist[i][j] = min(deletion, insertion, substitution)
-    return dist[m][n]
-
-
-class TestEditDistanceOp(OpTest):
-    def setUp(self):
-        self.op_type = "edit_distance"
-        normalized = False
-        x1 = np.array([[12, 3, 5, 8, 2]]).astype("int64")
-        x2 = np.array([[12, 4, 7, 8]]).astype("int64")
-        x1 = np.transpose(x1)
-        x2 = np.transpose(x2)
-        self.x1_lod = [1, 4]
-        self.x2_lod = [3, 1]
-
-        num_strs = len(self.x1_lod)
-        distance = np.zeros((num_strs, 1)).astype("float32")
-        sequence_num = np.array(2).astype("int64")
-
-        x1_offset = 0
-        x2_offset = 0
-        for i in range(0, num_strs):
-            distance[i] = Levenshtein(
-                hyp=x1[x1_offset:(x1_offset + self.x1_lod[i])],
-                ref=x2[x2_offset:(x2_offset + self.x2_lod[i])])
-            x1_offset += self.x1_lod[i]
-            x2_offset += self.x2_lod[i]
-            if normalized is True:
-                len_ref = self.x2_lod[i]
-                distance[i] = distance[i] / len_ref
-
-        self.attrs = {'normalized': normalized}
-        self.inputs = {'Hyps': (x1, [self.x1_lod]), 'Refs': (x2, [self.x2_lod])}
-        self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestEditDistanceOpNormalizedCase0(OpTest):
-    def reset_config(self):
-        pass
-
-    def post_config(self):
-        pass
-
-    def setUp(self):
-        self.op_type = "edit_distance"
-        normalized = True
-        self.x1 = np.array([[10, 3, 6, 5, 8, 2]]).astype("int64")
-        self.x2 = np.array([[10, 4, 6, 7, 8]]).astype("int64")
-        self.x1_lod = [3, 0, 3]
-        self.x2_lod = [2, 1, 2]
-        self.x1 = np.transpose(self.x1)
-        self.x2 = np.transpose(self.x2)
-
-        self.reset_config()
-
-        num_strs = len(self.x1_lod)
-        distance = np.zeros((num_strs, 1)).astype("float32")
-        sequence_num = np.array(num_strs).astype("int64")
-
-        x1_offset = 0
-        x2_offset = 0
-        for i in range(0, num_strs):
-            distance[i] = Levenshtein(
-                hyp=self.x1[x1_offset:(x1_offset + self.x1_lod[i])],
-                ref=self.x2[x2_offset:(x2_offset + self.x2_lod[i])])
-            x1_offset += self.x1_lod[i]
-            x2_offset += self.x2_lod[i]
-            if normalized is True:
-                len_ref = self.x2_lod[i]
-                distance[i] = distance[i] / len_ref
-
-        self.attrs = {'normalized': normalized}
-        self.inputs = {
-            'Hyps': (self.x1, [self.x1_lod]),
-            'Refs': (self.x2, [self.x2_lod])
-        }
-        self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
-
-        self.post_config()
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestEditDistanceOpNormalizedCase1(TestEditDistanceOpNormalizedCase0):
-    def reset_config(self):
-        self.x1_lod = [0, 6, 0]
-        self.x2_lod = [2, 1, 2]
-
-
-class TestEditDistanceOpNormalizedCase2(TestEditDistanceOpNormalizedCase0):
-    def reset_config(self):
-        self.x1_lod = [0, 0, 6]
-        self.x2_lod = [2, 2, 1]
-
-
-class TestEditDistanceOpNormalizedTensor(OpTest):
-    def reset_config(self):
-        self.x1 = np.array([[10, 3, 0, 0], [6, 5, 8, 2]], dtype=np.int64)
-        self.x2 = np.array([[10, 4, 0], [6, 7, 8]], dtype=np.int64)
-        self.x1_lod = np.array([2, 4], dtype=np.int64)
-        self.x2_lod = np.array([2, 3], dtype=np.int64)
-
-    def setUp(self):
-        self.op_type = "edit_distance"
-        normalized = True
-
-        self.reset_config()
-
-        num_strs = len(self.x1_lod)
-        distance = np.zeros((num_strs, 1)).astype("float32")
-        sequence_num = np.array(num_strs).astype("int64")
-
-        for i in range(0, num_strs):
-            distance[i] = Levenshtein(
-                hyp=self.x1[i][0:self.x1_lod[i]],
-                ref=self.x2[i][0:self.x2_lod[i]])
-            if normalized is True:
-                len_ref = self.x2_lod[i]
-                distance[i] = distance[i] / len_ref
-
-        self.attrs = {'normalized': normalized}
-        self.inputs = {
-            'Hyps': self.x1,
-            'Refs': self.x2,
-            'HypsLength': self.x1_lod,
-            'RefsLength': self.x2_lod
-        }
-        self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
deleted file mode 100644
index 5783048f5fb68217dfbad829b33449e7951d8b0b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ /dev/null
@@ -1,310 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-class TestElementwiseAddOp(OpTest):
-    def init_kernel_type(self):
-        self.use_mkldnn = False
-
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.dtype = np.float32
-        self.axis = -1
-        self.init_dtype()
-        self.init_input_output()
-        self.init_kernel_type()
-        self.init_axis()
-
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
-        }
-        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
-        self.outputs = {'Out': self.out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
-
-    def test_check_grad_ingore_x(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
-
-    def init_input_output(self):
-        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
-        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
-        self.out = np.add(self.x, self.y)
-
-    def init_dtype(self):
-        pass
-
-    def init_axis(self):
-        pass
-
-
-class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-
-class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(1).astype(self.dtype)
-        self.out = self.x + self.y
-
-
-class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(1).astype(self.dtype)
-        self.out = self.x + self.y
-
-
-class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(1, 1).astype(self.dtype)
-        self.out = self.x + self.y
-
-
-class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(1, 1).astype(self.dtype)
-        self.out = self.x + self.y
-
-
-class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.random((32, )).astype(self.dtype)
-        self.y = np.random.random((32, )).astype(self.dtype)
-        self.out = np.add(self.x, self.y)
-
-
-class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.random((32, )).astype(self.dtype)
-        self.y = np.random.random((32, )).astype(self.dtype)
-        self.out = np.add(self.x, self.y)
-
-
-class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(2).astype(self.dtype)
-        self.out = self.x + self.y.reshape(2, 1, 1)
-
-    def init_axis(self):
-        self.axis = 0
-
-
-class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(2).astype(self.dtype)
-        self.out = self.x + self.y.reshape(2, 1, 1)
-
-    def init_axis(self):
-        self.axis = 0
-
-
-class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(3).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 3, 1)
-
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(3).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 3, 1)
-
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(4).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 1, 4)
-
-
-class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(4).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 1, 4)
-
-
-class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(3, 4).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 3, 4, 1)
-
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(3, 4).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 3, 4, 1)
-
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(2, 1).astype(self.dtype)
-        self.out = self.x + self.y.reshape(2, 1, 1, 1)
-
-    def init_axis(self):
-        self.axis = 0
-
-
-class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(2, 1).astype(self.dtype)
-        self.out = self.x + self.y.reshape(2, 1, 1, 1)
-
-    def init_axis(self):
-        self.axis = 0
-
-
-class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(2, 1, 4).astype(self.dtype)
-        self.out = self.x + self.y
-
-
-class TestFP16ElementwiseAddOp_broadcast_5(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(2, 1, 4).astype(self.dtype)
-        self.out = self.x + self.y
-
-
-class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(2, 3, 1, 5).astype(self.dtype)
-        self.out = self.x + self.y
-
-
-class TestFP16ElementwiseAddOp_broadcast_6(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(2, 3, 1, 5).astype(self.dtype)
-        self.out = self.x + self.y
-
-
-class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(3, 4).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 3, 4)
-
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(3, 4).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 3, 4)
-
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 1).astype(self.dtype)
-        self.y = np.random.rand(1).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 1)
-
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 1).astype(self.dtype)
-        self.y = np.random.rand(1).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 1)
-
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(3, 20, 20).astype(self.dtype)
-        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
-        self.out = self.x + self.y
-
-    def init_axis(self):
-        self.axis = -1
-
-
-class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.rand(3, 10, 20).astype(self.dtype)
-        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
-        self.out = self.x + self.y
-
-    def init_axis(self):
-        self.axis = -1
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
deleted file mode 100644
index 4e679607d1398aa35745502e0af8130d2f28f18e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ /dev/null
@@ -1,171 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class ElementwiseDivOp(OpTest):
-    def setUp(self):
-        self.op_type = "elementwise_div"
-        self.dtype = np.float32
-        self.init_dtype()
-        """ Warning
-        CPU gradient check error!
-        'X': np.random.random((32,84)).astype("float32"),
-        'Y': np.random.random((32,84)).astype("float32")
-        """
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
-        }
-        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
-
-    def init_dtype(self):
-        pass
-
-
-class TestElementwiseDivOp_scalar(ElementwiseDivOp):
-    def setUp(self):
-        self.op_type = "elementwise_div"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32),
-            'Y': np.random.uniform(0.1, 1, [1]).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] / self.inputs['Y']}
-
-
-class TestElementwiseDivOp_Vector(ElementwiseDivOp):
-    def setUp(self):
-        self.op_type = "elementwise_div"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [32]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [32]).astype("float32")
-        }
-        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp):
-    def setUp(self):
-        self.op_type = "elementwise_div"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [2]).astype("float32")
-        }
-
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
-        }
-
-
-class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp):
-    def setUp(self):
-        self.op_type = "elementwise_div"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [3]).astype("float32")
-        }
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
-        }
-
-
-class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp):
-    def setUp(self):
-        self.op_type = "elementwise_div"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [4]).astype("float32")
-        }
-
-        self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
-        }
-
-
-class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
-    def setUp(self):
-        self.op_type = "elementwise_div"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [3, 4]).astype("float32")
-        }
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
-        }
-
-
-class TestElementwiseDivOp_broadcast_4(ElementwiseDivOp):
-    def setUp(self):
-        self.op_type = "elementwise_div"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [2, 1, 4]).astype("float32")
-        }
-        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwiseDivOp_broadcast_5(ElementwiseDivOp):
-    def setUp(self):
-        self.op_type = "elementwise_div"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [2, 3, 1, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwiseDivOpFp16(ElementwiseDivOp):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y'))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
deleted file mode 100644
index 104e896b6e440f5657a90e0ce741b49f72ba75c6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-
-import random
-
-
-class TestElementwiseModOp(OpTest):
-    def init_kernel_type(self):
-        self.use_mkldnn = False
-
-    def setUp(self):
-        self.op_type = "elementwise_floordiv"
-        self.dtype = np.int32
-        self.axis = -1
-        self.init_dtype()
-        self.init_input_output()
-        self.init_kernel_type()
-        self.init_axis()
-
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
-        }
-        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
-        self.outputs = {'Out': self.out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def init_input_output(self):
-        self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
-        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
-        self.out = np.floor_divide(self.x, self.y)
-
-    def init_dtype(self):
-        pass
-
-    def init_axis(self):
-        pass
-
-
-class TestElementwiseModOp_scalar(TestElementwiseModOp):
-    def init_input_output(self):
-        scale_x = random.randint(0, 100000000)
-        scale_y = random.randint(1, 100000000)
-        self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
-        self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
-        self.out = np.floor_divide(self.x, self.y)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
deleted file mode 100644
index 9f452ffde74ee18d14f155fb5ed53fee57f12f49..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-
-
-class TestElementWiseAddOp(unittest.TestCase):
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
-
-    def check_forward_backward(self):
-        def test_with_place(place):
-            out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
-            x_grad = out_grad
-            sum_axis = list(range(0, len(self.x.shape)))
-            del sum_axis[self.axis]
-            y_grad = np.sum(out_grad, axis=tuple(sum_axis))
-
-            var_dict = locals()
-            var_dict['y'] = self.y
-            var_dict['x'] = self.x
-            var_dict['out'] = self.out
-            var_dict['y@GRAD'] = y_grad
-            var_dict['x@GRAD'] = x_grad
-            var_dict['out@GRAD'] = out_grad
-
-            var_names = ['x', 'y', 'out', 'y@GRAD', 'x@GRAD', 'out@GRAD']
-            ground_truth = {name: var_dict[name] for name in var_names}
-
-            program = fluid.Program()
-            with fluid.program_guard(program):
-                block = program.global_block()
-                for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
-                elementwise_add_op = block.append_op(
-                    type="elementwise_add",
-                    inputs={
-                        "X": block.var('x'),
-                        "Y": block.var('y'),
-                    },
-                    outputs={"Out": block.var('out'), },
-                    attrs={"axis": self.axis, })
-
-                # generate backward op_desc
-                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    elementwise_add_op.desc, set(), [])
-                grad_op_desc = grad_op_desc_list[0]
-                new_op_desc = block.desc.append_op()
-                new_op_desc.copy_from(grad_op_desc)
-                for var_name in grad_op_desc.output_arg_names():
-                    block.desc.var(var_name.encode("ascii"))
-                grad_op_desc.infer_var_type(block.desc)
-                grad_op_desc.infer_shape(block.desc)
-                for arg in grad_op_desc.output_arg_names():
-                    grad_var = block.desc.find_var(arg.encode("ascii"))
-                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-                exe = fluid.Executor(place)
-                out = exe.run(program,
-                              feed={
-                                  name: var_dict[name]
-                                  for name in ['x', 'y', 'out@GRAD']
-                              },
-                              fetch_list=['x@GRAD', 'y@GRAD'])
-                self.__assert_close(x_grad, out[0], "x@GRAD")
-                self.__assert_close(y_grad, out[1], "y@GRAD", atol=1.4)
-
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(
-                "elementwise_add"):
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            test_with_place(place)
-
-    def test_check_forward_backward_with_scale_and_bias(self):
-        np.random.seed(123)
-        self.x = np.random.random((4, 32, 220, 220)).astype(np.float32)
-        self.y = np.random.random((32)).astype(np.float32)
-        self.out = self.x + self.y.reshape(1, 32, 1, 1)
-        self.axis = 1
-        self.check_forward_backward()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
deleted file mode 100644
index db7f5a640e5451895d1d2977a691cffc582a43dc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestElementwiseOp(OpTest):
-    def setUp(self):
-        self.op_type = "elementwise_max"
-        # If x and y have the same value, the max() is not differentiable.
-        # So we generate test data by the following method
-        # to avoid them being too close to each other.
-        x = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
-        sgn = np.random.choice([-1, 1], [13, 17]).astype("float32")
-        y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32")
-        self.inputs = {'X': x, 'Y': y}
-        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
-
-
-class TestElementwiseMaxOp_scalar(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_max"
-        x = np.random.random_integers(-5, 5, [2, 3, 4]).astype("float32")
-        y = np.array([0.5]).astype("float32")
-        self.inputs = {'X': x, 'Y': y}
-        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwiseMaxOp_Vector(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_max"
-        x = np.random.random((32, )).astype("float32")
-        sgn = np.random.choice([-1, 1], (32, )).astype("float32")
-        y = x + sgn * np.random.uniform(0.1, 1, (32, )).astype("float32")
-        self.inputs = {'X': x, 'Y': y}
-        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwiseMaxOp_broadcast_0(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_max"
-        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (2, )).astype(np.float32)
-        y = x[:, 0, 0] + sgn * \
-            np.random.uniform(1, 2, (2, )).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out':
-            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
-        }
-
-
-class TestElementwiseMaxOp_broadcast_1(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_max"
-        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (3, )).astype(np.float32)
-        y = x[0, :, 0] + sgn * \
-            np.random.uniform(1, 2, (3, )).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out':
-            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
-        }
-
-
-class TestElementwiseMaxOp_broadcast_2(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_max"
-        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (4, )).astype(np.float32)
-        y = x[0, 0, :] + sgn * \
-            np.random.uniform(1, 2, (4, )).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.outputs = {
-            'Out':
-            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
-        }
-
-
-class TestElementwiseMaxOp_broadcast_3(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_max"
-        x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (3, 4)).astype(np.float32)
-        y = x[0, :, :, 0] + sgn * \
-            np.random.uniform(1, 2, (3, 4)).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out':
-            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
-        }
-
-
-class TestElementwiseMaxOp_broadcast_4(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_max"
-        x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (2, 3, 1, 5)).astype(np.float32)
-        y = x + sgn * \
-            np.random.uniform(1, 2, (2, 3, 1, 5)).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
deleted file mode 100644
index c1e93f6a4e70e218a90d7c7bf9ac1cbcba6e8e78..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestElementwiseOp(OpTest):
-    def setUp(self):
-        self.op_type = "elementwise_min"
-        # If x and y have the same value, the min() is not differentiable.
-        # So we generate test data by the following method
-        # to avoid them being too close to each other.
-        x = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
-        sgn = np.random.choice([-1, 1], [13, 17]).astype("float32")
-        y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32")
-        self.inputs = {'X': x, 'Y': y}
-        self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
-
-
-class TestElementwiseMinOp_scalar(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_min"
-        x = np.random.random_integers(-5, 5, [2, 3, 4]).astype("float32")
-        y = np.array([0.5]).astype("float32")
-        self.inputs = {'X': x, 'Y': y}
-        self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwiseMinOp_Vector(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_min"
-        x = np.random.random((32, )).astype("float32")
-        sgn = np.random.choice([-1, 1], (32, )).astype("float32")
-        y = x + sgn * np.random.uniform(0.1, 1, (32, )).astype("float32")
-        self.inputs = {'X': x, 'Y': y}
-        self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwiseMinOp_broadcast_0(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_min"
-        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (2, )).astype(np.float32)
-        y = x[:, 0, 0] + sgn * \
-            np.random.uniform(1, 2, (2, )).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out':
-            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
-        }
-
-
-class TestElementwiseMinOp_broadcast_1(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_min"
-        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (3, )).astype(np.float32)
-        y = x[0, :, 0] + sgn * \
-            np.random.uniform(1, 2, (3, )).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out':
-            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
-        }
-
-
-class TestElementwiseMinOp_broadcast_2(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_min"
-        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (4, )).astype(np.float32)
-        y = x[0, 0, :] + sgn * \
-            np.random.uniform(1, 2, (4, )).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.outputs = {
-            'Out':
-            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
-        }
-
-
-class TestElementwiseMinOp_broadcast_3(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_min"
-        x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (3, 4)).astype(np.float32)
-        y = x[0, :, :, 0] + sgn * \
-            np.random.uniform(1, 2, (3, 4)).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out':
-            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
-        }
-
-
-class TestElementwiseMinOp_broadcast_4(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_min"
-        x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32)
-        sgn = np.random.choice([-1, 1], (2, 3, 1, 5)).astype(np.float32)
-        y = x + sgn * \
-            np.random.uniform(1, 2, (2, 3, 1, 5)).astype(np.float32)
-        self.inputs = {'X': x, 'Y': y}
-
-        self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
deleted file mode 100644
index fcda179a093cf2306b9d79264c9118fe3b68b35c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-
-import random
-
-
-class TestElementwiseModOp(OpTest):
-    def init_kernel_type(self):
-        self.use_mkldnn = False
-
-    def setUp(self):
-        self.op_type = "elementwise_mod"
-        self.axis = -1
-        self.init_dtype()
-        self.init_input_output()
-        self.init_kernel_type()
-        self.init_axis()
-
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
-        }
-        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
-        self.outputs = {'Out': self.out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def init_input_output(self):
-        self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
-        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
-        self.out = np.mod(self.x, self.y)
-
-    def init_dtype(self):
-        self.dtype = np.int32
-
-    def init_axis(self):
-        pass
-
-
-class TestElementwiseModOp_scalar(TestElementwiseModOp):
-    def init_input_output(self):
-        scale_x = random.randint(0, 100000000)
-        scale_y = random.randint(1, 100000000)
-        self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
-        self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
-        self.out = np.mod(self.x, self.y)
-
-
-class TestElementwiseModOpFloat(TestElementwiseModOp):
-    def init_dtype(self):
-        self.dtype = np.float32
-
-    def init_input_output(self):
-        self.x = np.random.uniform(-1000, 1000, [10, 10]).astype(self.dtype)
-        self.y = np.random.uniform(-100, 100, [10, 10]).astype(self.dtype)
-        self.out = np.fmod(self.x, self.y)
-
-    def test_check_output(self):
-        self.check_output(atol=2e-5)
-
-
-class TestElementwiseModOpDouble(TestElementwiseModOpFloat):
-    def init_dtype(self):
-        self.dtype = np.float64
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
deleted file mode 100644
index 2415aeb0cbf224f730b26d7b1b06b8c427b27966..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class ElementwiseMulOp(OpTest):
-    def init_kernel_type(self):
-        self.use_mkldnn = False
-
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.dtype = np.float32
-        self.axis = -1
-        self.init_dtype()
-        self.init_input_output()
-        self.init_kernel_type()
-        self.init_axis()
-
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
-        }
-        self.outputs = {'Out': self.out}
-        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out')
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
-
-    def init_input_output(self):
-        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
-        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
-        self.out = np.multiply(self.x, self.y)
-
-    def init_dtype(self):
-        pass
-
-    def init_axis(self):
-        pass
-
-
-class TestElementwiseMulOp_scalar(ElementwiseMulOp):
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(1).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
-
-
-class TestElementwiseMulOp_Vector(ElementwiseMulOp):
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.inputs = {
-            'X': np.random.random((32, )).astype("float64"),
-            'Y': np.random.random((32, )).astype("float64")
-        }
-        self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(2).astype(self.dtype)
-        self.out = self.x * self.y.reshape(2, 1, 1)
-
-    def init_axis(self):
-        self.axis = 0
-
-
-class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float64),
-            'Y': np.random.rand(3).astype(np.float64)
-        }
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 3, 1)
-        }
-
-
-class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float64),
-            'Y': np.random.rand(4).astype(np.float64)
-        }
-
-        self.outputs = {
-            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 4)
-        }
-
-
-class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float64),
-            'Y': np.random.rand(3, 4).astype(np.float64)
-        }
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 3, 4, 1)
-        }
-
-
-class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp):
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float64),
-            'Y': np.random.rand(2, 1, 4).astype(np.float64)
-        }
-        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
-
-
-class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp):
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float64),
-            'Y': np.random.rand(2, 3, 1, 5).astype(np.float64)
-        }
-        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
-
-
-class TestElementwiseMulOpFp16(ElementwiseMulOp):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
deleted file mode 100644
index 52d44d69fae8ad9751a01d68a4ef21db3e7aab46..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
+++ /dev/null
@@ -1,247 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-import gradient_checker
-
-from decorator_helper import prog_scope
-
-
-class TestElementwiseMulDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
-        eps = 0.005
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        y = layers.data('y', shape, False, dtype)
-        x.persistable = True
-        y.persistable = True
-        out = layers.elementwise_mul(x, y)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
-        eps = 0.005
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        y = layers.data('y', shape[:-1], False, dtype)
-        x.persistable = True
-        y.persistable = True
-        out = layers.elementwise_mul(x, y, axis=0)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestElementwiseAddDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
-        eps = 0.005
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        y = layers.data('y', shape, False, dtype)
-        x.persistable = True
-        y.persistable = True
-        out = layers.elementwise_add(x, y)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
-        eps = 0.005
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        y = layers.data('y', shape[:-1], False, dtype)
-        x.persistable = True
-        y.persistable = True
-        out = layers.elementwise_add(x, y, axis=0)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestElementwiseSubDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
-        eps = 0.005
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        y = layers.data('y', shape, False, dtype)
-        x.persistable = True
-        y.persistable = True
-        out = layers.elementwise_sub(x, y)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
-        eps = 0.005
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        y = layers.data('y', shape[:-1], False, dtype)
-        x.persistable = True
-        y.persistable = True
-        out = layers.elementwise_sub(x, y, axis=0)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestElementwiseDivDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
-        eps = 0.0001
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        y = layers.data('y', shape, False, dtype)
-        x.persistable = True
-        y.persistable = True
-        out = layers.elementwise_div(x, y, axis=0)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        y_arr[np.abs(y_arr) < 0.005] = 0.02
-
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps, atol=1e-3)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestElementwiseDivBroadcastDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
-        eps = 0.0001
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        y = layers.data('y', shape[1:-1], False, dtype)
-        x.persistable = True
-        y.persistable = True
-        out = layers.elementwise_div(x, y, axis=1)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, shape[1:-1]).astype(dtype)
-        y_arr[np.abs(y_arr) < 0.005] = 0.02
-
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps, atol=1e-3)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
deleted file mode 100644
index e6a065889c7b1a0c445b85c1282f3d6311caf816..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
+++ /dev/null
@@ -1,118 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestElementwisePowOp(OpTest):
-    def setUp(self):
-        self.op_type = "elementwise_pow"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32")
-        }
-        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out')
-
-
-class TestElementwisePowOp_scalar(TestElementwisePowOp):
-    def setUp(self):
-        self.op_type = "elementwise_pow"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [3, 3, 4]).astype(np.float32),
-            'Y': np.random.uniform(0.1, 1, [1]).astype(np.float32)
-        }
-        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwisePowOp_tensor(TestElementwisePowOp):
-    def setUp(self):
-        self.op_type = "elementwise_pow"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [32]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [32]).astype("float32")
-        }
-        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwisePowOp_broadcast_0(TestElementwisePowOp):
-    def setUp(self):
-        self.op_type = "elementwise_pow"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [4]).astype("float32")
-        }
-        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
-
-
-class TestElementwisePowOp_broadcast_1(TestElementwisePowOp):
-    def setUp(self):
-        self.op_type = "elementwise_pow"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [3]).astype("float32")
-        }
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': np.power(self.inputs['X'], self.inputs['Y'].reshape(3, 1))
-        }
-
-
-class TestElementwisePowOp_broadcast_2(TestElementwisePowOp):
-    def setUp(self):
-        self.op_type = "elementwise_pow"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [2]).astype("float32")
-        }
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out': np.power(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
-        }
-
-
-class TestElementwisePowOp_broadcast_3(TestElementwisePowOp):
-    def setUp(self):
-        self.op_type = "elementwise_pow"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [3, 4]).astype("float32")
-        }
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': np.power(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4,
-                                                                       1))
-        }
-
-
-class TestElementwisePowOp_broadcast_4(TestElementwisePowOp):
-    def setUp(self):
-        self.op_type = "elementwise_pow"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [2, 3, 1, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
deleted file mode 100644
index e9a389bbaf5aa5d00c0b93aba1da797745afdc5e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestElementwiseOp(OpTest):
-    def setUp(self):
-        self.op_type = "elementwise_sub"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32")
-        }
-        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
-
-
-class TestElementwiseSubOp_scalar(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_sub"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(1).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
-
-
-class TestElementwiseSubOp_Vector(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_sub"
-        self.inputs = {
-            'X': np.random.random((32, )).astype("float32"),
-            'Y': np.random.random((32, )).astype("float32")
-        }
-        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
-
-
-class TestElementwiseSubOp_broadcast_0(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_sub"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(2).astype(np.float32)
-        }
-
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out': self.inputs['X'] - self.inputs['Y'].reshape(2, 1, 1)
-        }
-
-
-class TestElementwiseSubOp_broadcast_1(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_sub"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(3).astype(np.float32)
-        }
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 1)
-        }
-
-
-class TestElementwiseSubOp_broadcast_2(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_sub"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(4).astype(np.float32)
-        }
-
-        self.outputs = {
-            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 4)
-        }
-
-
-class TestElementwiseSubOp_broadcast_3(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_sub"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
-            'Y': np.random.rand(3, 4).astype(np.float32)
-        }
-
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 4, 1)
-        }
-
-
-class TestElementwiseSubOp_broadcast_4(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_sub"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
-            'Y': np.random.rand(2, 3, 1, 5).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_exception.py b/python/paddle/fluid/tests/unittests/test_exception.py
deleted file mode 100644
index 798ed53cddade22e986cae65109b6c6ac7a291b6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_exception.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.compat as cpt
-import paddle.fluid.core as core
-import unittest
-
-
-class TestException(unittest.TestCase):
-    def test_exception(self):
-        exception = None
-        try:
-            core.__unittest_throw_exception__()
-        except core.EnforceNotMet as ex:
-            self.assertIn("test exception", cpt.get_exception_message(ex))
-            exception = ex
-
-        self.assertIsNotNone(exception)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
deleted file mode 100644
index b1f89eca6e58aec41b5863f4c885d5c6231a72f4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import numpy
-import paddle.fluid.core as core
-from paddle.fluid.executor import Executor
-from paddle.fluid.layers import mul, data
-
-
-class TestExecutor(unittest.TestCase):
-    def test_mul(self):
-        a = data(name='a', shape=[784], dtype='float32')
-        b = data(
-            name='b',
-            shape=[784, 100],
-            dtype='float32',
-            append_batch_size=False)
-        out = mul(x=a, y=b)
-        place = core.CPUPlace()
-        a_np = numpy.random.random((100, 784)).astype('float32')
-        b_np = numpy.random.random((784, 100)).astype('float32')
-        exe = Executor(place)
-        outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
-        out = outs[0]
-        self.assertEqual((100, 100), out.shape)
-        self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py b/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py
deleted file mode 100644
index e1aaa82845bac6c02b8825adbbfa6dcf33d97894..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import numpy
-import paddle.fluid.core as core
-from paddle.fluid.executor import Executor
-from paddle.fluid.layers import mul, data
-
-
-class TestExecutor(unittest.TestCase):
-    def test_mul(self):
-        a = data(name='a', shape=[784], dtype='float32')
-        b = data(
-            name='b',
-            shape=[784, 100],
-            dtype='float32',
-            append_batch_size=False)
-        output = mul(x=a, y=b)
-        place = core.CPUPlace()
-        a_np = numpy.random.random((100, 784)).astype('float32')
-        b_np = numpy.random.random((784, 100)).astype('float32')
-        exe = Executor(place)
-        import time
-        use_cache = True
-        step_num = 3
-        run_time = 0.0
-        for i in range(step_num):
-            begin = time.time()
-            outs = exe.run(feed={'a': a_np,
-                                 'b': b_np},
-                           fetch_list=[output.name],
-                           use_program_cache=use_cache)
-            end = time.time()
-            run_time += end - begin
-            out = outs[0]
-            self.assertEqual((100, 100), out.shape)
-            self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
-        print("run time %f" % run_time)
-        use_cache = False
-        run_time = 0.0
-        for i in range(step_num):
-            begin = time.time()
-            outs = exe.run(feed={'a': a_np,
-                                 'b': b_np},
-                           fetch_list=[output.name],
-                           use_program_cache=use_cache)
-            end = time.time()
-            run_time += end - begin
-            out = outs[0]
-            self.assertEqual((100, 100), out.shape)
-            self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
-        print("run time %f" % run_time)
-        use_cache = True
-        run_time = 0.0
-        for i in range(step_num):
-            begin = time.time()
-            outs = exe.run(feed={'a': a_np,
-                                 'b': b_np},
-                           fetch_list=[output.name],
-                           use_program_cache=use_cache)
-            end = time.time()
-            run_time += end - begin
-            out = outs[0]
-            self.assertEqual((100, 100), out.shape)
-            self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
-        print("run time %f" % run_time)
-
-        use_cache = True
-        run_time = 0.0
-        for i in range(step_num):
-            begin = time.time()
-            outs = exe.run(feed={'a': a_np,
-                                 'b': b_np},
-                           fetch_list=[output],
-                           use_program_cache=use_cache)
-            end = time.time()
-            run_time += end - begin
-            out = outs[0]
-            self.assertEqual((100, 100), out.shape)
-            self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
-        print("run time %f" % run_time)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
deleted file mode 100644
index 64a8f20dae1e8f14ca44979dfade519958d4d4c7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from op_test import OpTest
-
-
-class TestExecutorReturnTensorNotOverwritingWithOptest(OpTest):
-    def setUp(self):
-        pass
-
-    def calc_add_out(self, place=None, parallel=None):
-        self.x = np.random.random((2, 5)).astype(np.float32)
-        self.y = np.random.random((2, 5)).astype(np.float32)
-        self.out = np.add(self.x, self.y)
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
-        }
-        self.outputs = {'Out': self.out}
-        self.op_type = "elementwise_add"
-        self.dtype = np.float32
-        outs, fetch_list = self._calc_output(place, parallel=parallel)
-        return outs
-
-    def calc_mul_out(self, place=None, parallel=None):
-        self.x = np.random.random((2, 5)).astype(np.float32)
-        self.y = np.random.random((5, 2)).astype(np.float32)
-        self.out = np.dot(self.x, self.y)
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
-        }
-        self.outputs = {'Out': self.out}
-        self.op_type = "elementwise_mul"
-        self.dtype = np.float32
-        outs, fetch_list = self._calc_output(place, parallel=parallel)
-        return outs
-
-    def test_executor_run_twice(self):
-        places = [fluid.CPUPlace()]
-        if fluid.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-
-        for place in places:
-            for parallel in [True, False]:
-                add_out = self.calc_add_out(place, parallel)
-                add_out1 = np.array(add_out[0])
-                mul_out = self.calc_mul_out(place, parallel)
-                add_out2 = np.array(add_out[0])
-                self.assertTrue(np.array_equal(add_out1, add_out2))
-
-
-class TestExecutorReturnTensorNotOverOverwritingWithLayers(unittest.TestCase):
-    def setUp(self):
-        pass
-
-    def calc_add_out(self, place=None, parallel=None):
-        x = fluid.layers.ones(shape=[3, 3], dtype='float32')
-        y = fluid.layers.ones(shape=[3, 3], dtype='float32')
-        out = fluid.layers.elementwise_add(x=x, y=y)
-        program = fluid.default_main_program()
-        if parallel:
-            program = fluid.CompiledProgram(program).with_data_parallel(
-                places=place)
-        exe = fluid.Executor(place)
-        out = exe.run(program, fetch_list=[out], return_numpy=False)
-        return out
-
-    def calc_sub_out(self, place=None, parallel=None):
-        x = fluid.layers.ones(shape=[2, 2], dtype='float32')
-        y = fluid.layers.ones(shape=[2, 2], dtype='float32')
-        out = fluid.layers.elementwise_sub(x=x, y=y)
-        program = fluid.default_main_program()
-        if parallel:
-            program = fluid.CompiledProgram(program).with_data_parallel(
-                places=place)
-        exe = fluid.Executor(place)
-        out = exe.run(program, fetch_list=[out], return_numpy=False)
-        return out
-
-    def test_executor_run_twice(self):
-        places = [fluid.CPUPlace()]
-        if fluid.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-
-        for place in places:
-            for parallel in [True, False]:
-                add_out = self.calc_add_out(place, parallel)
-                add_out1 = np.array(add_out[0])
-                sub_out = self.calc_sub_out(place, parallel)
-                add_out2 = np.array(add_out[0])
-                self.assertTrue(np.array_equal(add_out1, add_out2))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py
deleted file mode 100644
index 449cda29b45ba4c9ac7aa20d40d04dd3c6a4496f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_expand_op.py
+++ /dev/null
@@ -1,208 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid as fluid
-
-
-# Situation 1: expand_times is a list(without tensor)
-class TestExpandOpRank1(OpTest):
-    def setUp(self):
-        self.op_type = "expand"
-        self.init_data()
-
-        self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")}
-        self.attrs = {'expand_times': self.expand_times}
-        output = np.tile(self.inputs['X'], self.expand_times)
-        self.outputs = {'Out': output}
-
-    def init_data(self):
-        self.ori_shape = [12]
-        self.expand_times = [2]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestExpandOpRank2_Corner(TestExpandOpRank1):
-    def init_data(self):
-        self.ori_shape = [12]
-        self.expand_times = [2]
-
-
-class TestExpandOpRank2(TestExpandOpRank1):
-    def init_data(self):
-        self.ori_shape = [12, 14]
-        self.expand_times = [2, 3]
-
-
-class TestExpandOpRank3_Corner(TestExpandOpRank1):
-    def init_data(self):
-        self.ori_shape = (2, 4, 5)
-        self.expand_times = (1, 1, 1)
-
-
-class TestExpandOpRank3(TestExpandOpRank1):
-    def init_data(self):
-        self.ori_shape = (2, 4, 5)
-        self.expand_times = (2, 1, 4)
-
-
-class TestExpandOpRank4(TestExpandOpRank1):
-    def init_data(self):
-        self.ori_shape = (2, 4, 5, 7)
-        self.expand_times = (3, 2, 1, 2)
-
-
-# Situation 2: expand_times is a list(with tensor)
-class TestExpandOpRank1_tensor_attr(OpTest):
-    def setUp(self):
-        self.op_type = "expand"
-        self.init_data()
-        expand_times_tensor = []
-        for index, ele in enumerate(self.expand_times):
-            expand_times_tensor.append(("x" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-
-        self.inputs = {
-            'X': np.random.random(self.ori_shape).astype("float32"),
-            'expand_times_tensor': expand_times_tensor,
-        }
-        self.attrs = {"expand_times": self.infer_expand_times}
-        output = np.tile(self.inputs['X'], self.expand_times)
-        self.outputs = {'Out': output}
-
-    def init_data(self):
-        self.ori_shape = [12]
-        self.expand_times = [2]
-        self.infer_expand_times = [-1]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestExpandOpRank2_Corner_tensor_attr(TestExpandOpRank1_tensor_attr):
-    def init_data(self):
-        self.ori_shape = [12, 14]
-        self.expand_times = [1, 1]
-        self.infer_expand_times = [1, -1]
-
-
-class TestExpandOpRank2_attr_tensor(TestExpandOpRank1_tensor_attr):
-    def init_data(self):
-        self.ori_shape = [12, 14]
-        self.expand_times = [2, 3]
-        self.infer_expand_times = [-1, 3]
-
-
-# Situation 3: expand_times is a tensor
-class TestExpandOpRank1_tensor(OpTest):
-    def setUp(self):
-        self.op_type = "expand"
-        self.init_data()
-
-        self.inputs = {
-            'X': np.random.random(self.ori_shape).astype("float32"),
-            'ExpandTimes': np.array(self.expand_times).astype("int32"),
-        }
-        self.attrs = {}
-        output = np.tile(self.inputs['X'], self.expand_times)
-        self.outputs = {'Out': output}
-
-    def init_data(self):
-        self.ori_shape = [12]
-        self.expand_times = [2]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestExpandOpRank2_tensor(TestExpandOpRank1_tensor):
-    def init_data(self):
-        self.ori_shape = [12, 14]
-        self.expand_times = [2, 3]
-
-
-# Situation 4: input x is Integer
-class TestExpandOpInteger(OpTest):
-    def setUp(self):
-        self.op_type = "expand"
-        self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("int32")
-        }
-        self.attrs = {'expand_times': [2, 1, 4]}
-        output = np.tile(self.inputs['X'], (2, 1, 4))
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-# Situation 5: input x is Bool
-class TestExpandOpBoolean(OpTest):
-    def setUp(self):
-        self.op_type = "expand"
-        self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
-        self.attrs = {'expand_times': [2, 1, 4]}
-        output = np.tile(self.inputs['X'], (2, 1, 4))
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-# Test python API
-class TestExpandAPI(OpTest):
-    def test_api(self):
-        input = np.random.random([12, 14]).astype("float32")
-        x = fluid.layers.data(
-            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
-
-        positive_2 = fluid.layers.fill_constant([1], "int32", 2)
-        expand_times = fluid.layers.data(
-            name="expand_times", shape=[2], append_batch_size=False)
-
-        out_1 = fluid.layers.expand(x, expand_times=[2, 3])
-        out_2 = fluid.layers.expand(x, expand_times=[positive_2, 3])
-        out_3 = fluid.layers.expand(x, expand_times=expand_times)
-
-        exe = fluid.Executor(place=fluid.CPUPlace())
-        res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
-                                      feed={
-                                          "x": input,
-                                          "expand_times":
-                                          np.array([1, 3]).astype("int32")
-                                      },
-                                      fetch_list=[out_1, out_2, out_3])
-        assert np.array_equal(res_1, np.tile(input, (2, 3)))
-        assert np.array_equal(res_2, np.tile(input, (2, 3)))
-        assert np.array_equal(res_3, np.tile(input, (1, 3)))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eye_op.py b/python/paddle/fluid/tests/unittests/test_eye_op.py
deleted file mode 100644
index ea37584b6a5e1d72badc65c294898bdf08f32a2a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eye_op.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-import paddle.fluid.framework as framework
-
-
-class TestEyeOp(OpTest):
-    def setUp(self):
-        '''
-	Test eye op with specified shape
-        '''
-        self.op_type = "eye"
-
-        self.inputs = {}
-        self.attrs = {
-            'num_rows': 219,
-            'num_columns': 319,
-            'dtype': framework.convert_np_dtype_to_dtype_(np.int32)
-        }
-        self.outputs = {'Out': np.eye(219, 319, dtype=np.int32)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestEyeOp1(OpTest):
-    def setUp(self):
-        '''
-	Test eye op with default parameters
-        '''
-        self.op_type = "eye"
-
-        self.inputs = {}
-        self.attrs = {'num_rows': 50}
-        self.outputs = {'Out': np.eye(50, dtype=float)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestEyeOp2(OpTest):
-    def setUp(self):
-        '''
-        Test eye op with specified shape
-        '''
-        self.op_type = "eye"
-
-        self.inputs = {}
-        self.attrs = {'num_rows': 99, 'num_columns': 1}
-        self.outputs = {'Out': np.eye(99, 1, dtype=float)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
deleted file mode 100644
index 0812b02b47db7fa2d43e1d3bbd0a3f7b59911326..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-from op_test import OpTest
-
-
-def quantize_max_abs(x, max_range):
-    scale = np.max(np.abs(x).flatten())
-    y = np.round(x / scale * max_range)
-    return y, scale
-
-
-def dequantize_max_abs(x, scale, max_range):
-    y = (scale / max_range) * x
-    return y
-
-
-def channel_wise_quantize_max_abs(x, quant_bit=8, use_second_dim=False):
-    scales = []
-    if not use_second_dim:
-        for i in range(x.shape[0]):
-            scales.append(np.max(np.abs(x[i])).astype("float32"))
-        y = x.copy()
-        max_range = math.pow(2, quant_bit - 1) - 1
-        for i, scale in enumerate(scales):
-            y[i] = np.round(x[i] / scale * max_range)
-    else:
-        for i in range(x.shape[0]):
-            s = []
-            for j in range(x.shape[1]):
-                s.append(np.max(np.abs(x[i][j])).astype("float32"))
-            scales.append(s)
-        scales = np.amax(np.array(scales), axis=0)
-        y = x.copy()
-        max_range = math.pow(2, quant_bit - 1) - 1
-        for i in range(x.shape[0]):
-            for j, scale in enumerate(scales):
-                y[i][j] = np.round(x[i][j] / scale * max_range)
-    return y, scales
-
-
-def channel_wise_dequantize_max_abs(x,
-                                    scales,
-                                    quant_bits,
-                                    activation_scale=None):
-    if activation_scale is None:
-        y = x.copy()
-        for i in range(x.shape[0]):
-            y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * x[i]
-    else:
-        y = x.copy()
-        for i in range(x.shape[0]):
-            for j in range(x.shape[1]):
-                y[i][j] = (scales[j] /
-                           (math.pow(2, quant_bits[0] - 1) - 1)) * x[i][j]
-        y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
-    return y
-
-
-class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
-    def set_args(self):
-        self.quant_bits = [8, 8]
-        self.data_type = "float32"
-        self.activation_scale = 0.7861
-
-    def setUp(self):
-        self.set_args()
-        self.op_type = "fake_channel_wise_dequantize_max_abs"
-        x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
-        yq, scales = channel_wise_quantize_max_abs(
-            x, self.quant_bits[0], use_second_dim=True)
-        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
-                                              self.activation_scale)
-
-        self.inputs = {
-            'X': yq,
-            'Scales': [("scales0", np.array(scales).astype(self.data_type)),
-                       ("scales1", np.array(
-                           [self.activation_scale]).astype(self.data_type))]
-        }
-        self.attrs = {'quant_bits': self.quant_bits}
-        self.outputs = {'Out': ydq}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest):
-    def set_args(self):
-        self.quant_bits = [8]
-        self.data_type = "float32"
-
-    def setUp(self):
-        self.set_args()
-        self.op_type = "fake_channel_wise_dequantize_max_abs"
-        x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
-        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
-        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits)
-
-        self.inputs = {
-            'X': yq,
-            'Scales': [("scales0", np.array(scales).astype(self.data_type))]
-        }
-        self.attrs = {'quant_bits': self.quant_bits}
-        self.outputs = {'Out': ydq}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFakeDequantizeMaxAbsOp(OpTest):
-    def set_args(self):
-        self.num_bits = 8
-        self.max_range = math.pow(2, self.num_bits - 1) - 1
-        self.data_type = "float32"
-
-    def setUp(self):
-        self.set_args()
-        self.op_type = "fake_dequantize_max_abs"
-        x = np.random.randn(31, 65).astype(self.data_type)
-        yq, scale = quantize_max_abs(x, self.max_range)
-        ydq = dequantize_max_abs(yq, scale, self.max_range)
-
-        self.inputs = {'X': yq, 'Scale': np.array(scale).astype(self.data_type)}
-        self.attrs = {'max_range': self.max_range}
-        self.outputs = {'Out': ydq}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFakeDequantizeMaxAbsOpDouble(TestFakeDequantizeMaxAbsOp):
-    def set_args(self):
-        self.num_bits = 8
-        self.max_range = math.pow(2, self.num_bits - 1) - 1
-        self.data_type = "float64"
-
-
-class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp):
-    def set_args(self):
-        self.num_bits = 5
-        self.max_range = math.pow(2, self.num_bits - 1) - 1
-        self.data_type = "float32"
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_init_op.py b/python/paddle/fluid/tests/unittests/test_fake_init_op.py
deleted file mode 100644
index a62b7aed66b59940b4ba654d98479e3e35c7b78b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fake_init_op.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestFakeInitOpSelectedRows(unittest.TestCase):
-    def check_with_place(self, place, is_selected_rows):
-        scope = core.Scope()
-
-        out_var_name = 'Out'
-        if is_selected_rows:
-            out_tensor = scope.var(out_var_name).get_selected_rows().get_tensor(
-            )
-        else:
-            out_tensor = scope.var(out_var_name).get_tensor()
-
-        var_shape = [4, 784]
-
-        # create and run fake_init_op
-        fake_init_op = Operator("fake_init", Out=out_var_name, shape=var_shape)
-        fake_init_op.run(scope, place)
-
-        self.assertEqual(var_shape, out_tensor._get_dims())
-
-    def test_fake_init_selected_rows(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            for is_selected_rows in [True, False]:
-                self.check_with_place(place, is_selected_rows)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
deleted file mode 100644
index 8fe814dc50d486c8a59c74f965f7e9c5e9b40d7c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ /dev/null
@@ -1,214 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-
-
-class TestFakeQuantizeOp(OpTest):
-    def setUp(self):
-        self.op_type = "fake_quantize_abs_max"
-        self.attrs = {'bit_length': 8}
-        self.inputs = {'X': np.random.random((124, 240)).astype("float32"), }
-        scale = np.max(np.abs(self.inputs['X'])).astype("float32")
-        self.outputs = {
-            'Out': np.round(self.inputs['X'] / scale * (
-                (1 << (self.attrs['bit_length'] - 1)) - 1)),
-            'OutScale': np.array(scale).astype("float32"),
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFakeChannelWiseQuantizeOp(OpTest):
-    def setUp(self):
-        self.op_type = "fake_channel_wise_quantize_abs_max"
-        self.attrs = {'bit_length': 8}
-        self.inputs = {
-            'X': np.random.random((4, 3, 64, 64)).astype("float32"),
-        }
-        scales = []
-        for i in range(self.inputs['X'].shape[0]):
-            scales.append(np.max(np.abs(self.inputs['X'][i])).astype("float32"))
-        outputs = self.inputs['X'].copy()
-        for i, scale in enumerate(scales):
-            outputs[i] = np.round(outputs[i] / scale * (
-                (1 << (self.attrs['bit_length'] - 1)) - 1))
-
-        self.outputs = {
-            'Out': outputs,
-            'OutScale': np.array(scales).astype("float32"),
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFakeQuantizeRangeAbsMaxOp(OpTest):
-    def setUp(self):
-        self.op_type = "fake_quantize_range_abs_max"
-        self.attrs = {
-            'bit_length': int(5),
-            'window_size': int(1),
-            'is_test': False
-        }
-        x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10
-        x = x.astype("float32")
-        self.inputs = {
-            'X': x,
-            'Iter': np.zeros(1).astype("int64"),
-            'InScale': np.zeros(1).astype("float32")
-        }
-        scale = np.max(np.abs(self.inputs['X'])).astype("float32")
-
-        out_scales = np.zeros(self.attrs['window_size']).astype("float32")
-        out_scales[0] = scale
-        self.outputs = {
-            'Out': np.round(self.inputs['X'] / scale * (
-                (1 << (self.attrs['bit_length'] - 1)) - 1)),
-            'OutScale': scale,
-            'OutScales': out_scales,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestMovingAverageAbsMaxScaleOp(OpTest):
-    def setUp(self):
-        self.op_type = "moving_average_abs_max_scale"
-        self.attrs = {'moving_rate': float(0.9), 'is_test': False}
-        accum = np.zeros(1).astype("float32")
-        accum[0] = 1
-        state = np.zeros(1).astype("float32")
-        state[0] = 1
-        self.inputs = {
-            'X': np.random.random((8, 16, 7, 7)).astype("float32"),
-            'InAccum': accum,
-            'InState': state,
-        }
-
-        out_accum = np.zeros(1).astype("float32")
-        out_state = np.zeros(1).astype("float32")
-        out_scale = np.zeros(1).astype("float32")
-        out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max(
-            np.abs(self.inputs['X'])).astype("float32")
-        out_state[0] = self.attrs['moving_rate'] * state[0] + 1
-        out_scale = out_accum / out_state
-        self.outputs = {
-            'Out': self.inputs['X'],
-            'OutAccum': out_accum,
-            'OutState': out_state,
-            'OutScale': out_scale,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFakeQuantizeRangeAbsMaxOp2(OpTest):
-    def setUp(self):
-        self.op_type = "fake_quantize_range_abs_max"
-        self.attrs = {
-            'bit_length': int(8),
-            'window_size': int(1),
-            'is_test': True
-        }
-        x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10
-        x = x.astype("float32")
-        scale = np.max(np.abs(x)).astype("float32") - 1.0
-        out_scales = np.zeros(self.attrs['window_size']).astype("float32")
-        out_scales[0] = scale
-
-        self.inputs = {
-            'X': x,
-            'Iter': np.zeros(1).astype("int64"),
-            'InScale': scale.astype("float32")
-        }
-        xs = np.clip(x, -scale, scale)
-        qs = np.round(xs / scale * ((1 << (self.attrs['bit_length'] - 1)) - 1))
-        self.outputs = {
-            'Out': qs,
-            'OutScale': scale.astype("float32"),
-            'OutScales': out_scales,
-        }
-
-    def test_check_output(self):
-        self.check_output(no_check_set=set(['OutScale', 'OutScales']))
-
-
-class TestMovingOpBase(OpTest):
-    def setUp(self):
-        self.init_type()
-        self.attrs = {
-            'bit_length': int(5),
-            'moving_rate': float(0.9),
-            'is_test': False
-        }
-        accum = np.zeros(1).astype("float32")
-        accum[0] = 1
-        state = np.zeros(1).astype("float32")
-        state[0] = 1
-        scale = np.zeros(1).astype("float32")
-        scale[0] = 0.001
-        self.inputs = {
-            'X': np.random.random((8, 16, 7, 7)).astype("float32"),
-            'InScale': scale,
-            'InAccum': accum,
-            'InState': state,
-        }
-
-        out_accum = np.zeros(1).astype("float32")
-        out_state = np.zeros(1).astype("float32")
-        out_scale = np.zeros(1).astype("float32")
-        out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max(
-            np.abs(self.inputs['X'])).astype("float32")
-        out_state[0] = self.attrs['moving_rate'] * state[0] + 1
-        out_scale = out_accum / out_state
-        out_data = self.calc_output(out_scale)
-        self.outputs = {
-            'Out': out_data,
-            'OutAccum': out_accum,
-            'OutState': out_state,
-            'OutScale': out_scale,
-        }
-
-    def init_type(self):
-        self.op_type = "fake_quantize_moving_average_abs_max"
-
-    def calc_output(self, out_scale):
-        return np.round(self.inputs['X'] / out_scale * (
-            (1 << (self.attrs['bit_length'] - 1)) - 1))
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFakeQuantDequantMovingOp(TestMovingOpBase):
-    def init_type(self):
-        self.op_type = "fake_quantize_dequantize_moving_average_abs_max"
-
-    def calc_output(self, out_scale):
-        range_v = (1 << (self.attrs['bit_length'] - 1)) - 1
-        return np.round(self.inputs['X'] / out_scale *
-                        range_v) * out_scale / range_v
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py
deleted file mode 100644
index 6c2088af3dde213274ee068e0931df1fc699b815..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def fc_refer(matrix, with_bias, with_relu=False):
-    in_n, in_c, in_h, in_w = matrix.input.shape
-    w_i, w_o = matrix.weights.shape
-
-    x_data = np.reshape(matrix.input, [in_n, in_c * in_h * in_w])
-    w_data = np.reshape(matrix.weights, [w_i, w_o])
-    b_data = np.reshape(matrix.bias, [1, w_o])
-    result = None
-
-    if with_bias:
-        result = np.dot(x_data, w_data) + b_data
-    else:
-        result = np.dot(x_data, w_data)
-
-    if with_relu:
-        return np.maximum(result, 0)
-    else:
-        return result
-
-
-class MatrixGenerate:
-    def __init__(self, mb, ic, oc, h, w, bias_dims=2):
-        self.input = np.random.random((mb, ic, h, w)).astype("float32")
-        self.weights = np.random.random((ic * h * w, oc)).astype("float32")
-        if bias_dims == 2:
-            self.bias = np.random.random((1, oc)).astype("float32")
-        else:
-            self.bias = np.random.random((oc)).astype("float32")
-
-
-class TestFCOp(OpTest):
-    def config(self):
-        self.with_bias = True
-        self.with_relu = True
-        self.matrix = MatrixGenerate(1, 10, 15, 3, 3, 2)
-
-    def setUp(self):
-        self.op_type = "fc"
-        self.config()
-
-        if self.with_bias:
-            self.inputs = {
-                'Input': self.matrix.input,
-                'W': self.matrix.weights,
-                'Bias': self.matrix.bias
-            }
-        else:
-            self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
-
-        if self.with_relu:
-            activation_type = "relu"
-        else:
-            activation_type = ""
-        self.attrs = {'use_mkldnn': False, 'activation_type': activation_type}
-
-        self.outputs = {
-            'Out': fc_refer(self.matrix, self.with_bias, self.with_relu)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFCOpNoBias1(TestFCOp):
-    def config(self):
-        self.with_bias = False
-        self.with_relu = False
-        self.matrix = MatrixGenerate(2, 8, 10, 1, 1, 2)
-
-
-class TestFCOpNoBias2(TestFCOp):
-    def config(self):
-        self.with_bias = False
-        self.with_relu = False
-        self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1)
-
-
-class TestFCOpNoBias4(TestFCOp):
-    def config(self):
-        self.with_bias = False
-        self.with_relu = False
-        self.matrix = MatrixGenerate(1, 32, 64, 3, 3, 1)
-
-
-class TestFCOpWithBias1(TestFCOp):
-    def config(self):
-        self.with_bias = True
-        self.with_relu = False
-        self.matrix = MatrixGenerate(3, 8, 10, 2, 1, 2)
-
-
-class TestFCOpWithBias2(TestFCOp):
-    def config(self):
-        self.with_bias = True
-        self.with_relu = True
-        self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1)
-
-
-class TestFCOpWithBias3(TestFCOp):
-    def config(self):
-        self.with_bias = True
-        self.with_relu = True
-        self.matrix = MatrixGenerate(1, 64, 32, 3, 3, 1)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
deleted file mode 100644
index b823d397e9530362f5fee417278e36477d65f6f5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ /dev/null
@@ -1,46 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid.core as core
-import unittest
-import numpy as np
-
-
-class TestFeedFetch(unittest.TestCase):
-    def test_feed_fetch(self):
-        scope = core.Scope()
-        place = core.CPUPlace()
-        input_array = np.ones((4, 4, 6)).astype("float32")
-        input_array[0, 0, 0] = 3
-        input_array[3, 3, 5] = 10
-        input_tensor = core.LoDTensor([[2, 2]])
-        input_tensor.set(input_array, place)
-
-        core.set_feed_variable(scope, input_tensor, "feed", 0)
-
-        output_tensor = core.get_fetch_variable(scope, "feed", 0)
-
-        output_lod = output_tensor.recursive_sequence_lengths()
-        self.assertEqual(2, output_lod[0][0])
-        self.assertEqual(2, output_lod[0][1])
-
-        output_array = np.array(output_tensor)
-        self.assertEqual(3, output_array[0, 0, 0])
-        self.assertEqual(10, output_array[3, 3, 5])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fetch_var.py b/python/paddle/fluid/tests/unittests/test_fetch_var.py
deleted file mode 100644
index 14eb4c4ed78eaa1c9e6bb6e2ba481ceda2e9ba61..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fetch_var.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import op_test
-import numpy
-import unittest
-
-
-class TestFetchVar(op_test.OpTest):
-    def set_input(self):
-        self.val = numpy.array([1, 3, 5]).astype(numpy.int32)
-
-    def test_fetch_var(self):
-        self.set_input()
-        x = layers.create_tensor(dtype="int32", persistable=True, name="x")
-        layers.assign(input=self.val, output=x)
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
-        fetched_x = fluid.executor._fetch_var("x")
-        self.assertTrue(
-            numpy.array_equal(fetched_x, self.val),
-            "fetch_x=%s val=%s" % (fetched_x, self.val))
-        self.assertEqual(fetched_x.dtype, self.val.dtype)
-
-
-class TestFetchNullVar(TestFetchVar):
-    def set_input(self):
-        self.val = numpy.array([]).astype(numpy.int32)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
deleted file mode 100644
index 58d4bce3b09a5d638a1161be8bac1375a0f125d1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid.core as core
-import paddle.compat as cpt
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestFillAnyLikeOp(OpTest):
-    def setUp(self):
-        self.op_type = "fill_any_like"
-        self.dtype = np.int32
-        self.value = 0.0
-        self.init()
-        self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)}
-        self.attrs = {'value': self.value}
-        self.outputs = {'Out': self.value * np.ones_like(self.inputs["X"])}
-
-    def init(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillAnyLikeOpFloat32(TestFillAnyLikeOp):
-    def init(self):
-        self.dtype = np.float32
-        self.value = 0.0
-
-
-class TestFillAnyLikeOpValue1(TestFillAnyLikeOp):
-    def init(self):
-        self.value = 1.0
-
-
-class TestFillAnyLikeOpValue2(TestFillAnyLikeOp):
-    def init(self):
-        self.value = 1e-10
-
-
-class TestFillAnyLikeOpValue3(TestFillAnyLikeOp):
-    def init(self):
-        self.value = 1e-100
-
-
-class TestFillAnyLikeOpOverflow(TestFillAnyLikeOp):
-    def init(self):
-        self.value = 1e100
-
-    def test_check_output(self):
-        exception = None
-        try:
-            self.check_output()
-        except core.EnforceNotMet as ex:
-            exception = ex
-        self.assertIsNotNone(exception)
-
-
-class TestFillAnyLikeOpFloat16(TestFillAnyLikeOp):
-    def init(self):
-        self.dtype = np.float16
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
deleted file mode 100644
index fdc8a118e56f4473da5ed60169daebec14c7c33c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestFillConstantBatchSizeLikeWhenFirstDimIsBatchSize(OpTest):
-    def setUp(self):
-        self.op_type = "fill_constant_batch_size_like"
-        self.inputs = {'Input': np.random.random((219, 232)).astype("float32")}
-        self.attrs = {'value': 3.5, 'shape': [-1, 132, 7]}
-
-        out = np.random.random((219, 132, 7)).astype("float32")
-        out.fill(3.5)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
-    def setUp(self):
-        self.op_type = "fill_constant_batch_size_like"
-        self.inputs = {'Input': np.random.random((219, 232)).astype("float32")}
-        self.attrs = {
-            'value': 3.5,
-            'shape': [132, -1, 7],
-            'input_dim_idx': 0,
-            'output_dim_idx': 1
-        }
-
-        out = np.random.random((132, 219, 7)).astype("float32")
-        out.fill(3.5)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
-    def setUp(self):
-        self.op_type = "fill_constant_batch_size_like"
-        self.inputs = {
-            'Input': (np.random.random((31, 28)).astype("float32"),
-                      [[9, 14, 8]])
-        }
-        self.attrs = {
-            'value': 3.5,
-            'shape': [-1, 16],
-            'input_dim_idx': 0,
-            'output_dim_idx': 0
-        }
-
-        out = np.random.random((3, 16)).astype("float32")
-        out.fill(3.5)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
deleted file mode 100644
index e22bd09ed06a5dc2385006498a7794a70c776de8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ /dev/null
@@ -1,108 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestFillConstantOp1(OpTest):
-    def setUp(self):
-        '''Test fill_constant op with specified value
-        '''
-        self.op_type = "fill_constant"
-
-        self.inputs = {}
-        self.attrs = {'shape': [123, 92], 'value': 3.8}
-        self.outputs = {'Out': np.full((123, 92), 3.8)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillConstantOp2(OpTest):
-    def setUp(self):
-        '''Test fill_constant op with default value
-        '''
-        self.op_type = "fill_constant"
-
-        self.inputs = {}
-        self.attrs = {'shape': [123, 92]}
-        self.outputs = {'Out': np.full((123, 92), 0.0)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillConstantOp3(OpTest):
-    def setUp(self):
-        '''Test fill_constant op with specified int64 value
-        '''
-        self.op_type = "fill_constant"
-
-        self.inputs = {}
-        self.attrs = {'shape': [123, 92], 'value': 10000000000}
-        self.outputs = {'Out': np.full((123, 92), 10000000000)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillConstantOp4(OpTest):
-    def setUp(self):
-        '''Test fill_constant op with specified int value
-        '''
-        self.op_type = "fill_constant"
-
-        self.inputs = {}
-        self.attrs = {'shape': [123, 92], 'value': 3}
-        self.outputs = {'Out': np.full((123, 92), 3)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillConstantOpWithSelectedRows(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-        # create Out Variable
-        out = scope.var('Out').get_selected_rows()
-
-        # create and run fill_constant_op operator
-        fill_constant_op = Operator(
-            "fill_constant", shape=[123, 92], value=3.8, Out='Out')
-        fill_constant_op.run(scope, place)
-
-        # get result from Out
-        result_array = np.array(out.get_tensor())
-        full_array = np.full((123, 92), 3.8, 'float32')
-
-        self.assertTrue(np.array_equal(result_array, full_array))
-
-    def test_fill_constant_with_selected_rows(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            self.check_with_place(place)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_op.py b/python/paddle/fluid/tests/unittests/test_fill_op.py
deleted file mode 100644
index 0dd1b0d869ae7f21f1d64c374010e2175e70ee33..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fill_op.py
+++ /dev/null
@@ -1,92 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestFillOp1(OpTest):
-    def setUp(self):
-        self.op_type = "fill"
-        val = np.random.random(size=[100, 200])
-        self.inputs = {}
-        self.attrs = {
-            'value': val.flatten().tolist(),
-            'shape': [100, 200],
-            'dtype': int(core.VarDesc.VarType.FP64),
-            'force_cpu': False
-        }
-        self.outputs = {'Out': val.astype('float64')}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillOp2(OpTest):
-    def setUp(self):
-        self.op_type = "fill"
-        val = np.random.random(size=[100, 200])
-        self.inputs = {}
-        self.attrs = {
-            'value': val.flatten().tolist(),
-            'shape': [100, 200],
-            'dtype': int(core.VarDesc.VarType.FP64),
-            'force_cpu': True
-        }
-        self.outputs = {'Out': val.astype('float64')}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillOp3(OpTest):
-    def check_with_place(self, place, f_cpu):
-        scope = core.Scope()
-        # create Out Variable
-        out = scope.var('Out').get_tensor()
-
-        # create and run fill_op operator
-        val = np.random.random(size=[300, 200])
-        fill_op = Operator(
-            "fill",
-            value=val.flatten(),
-            shape=[300, 200],
-            dtype=int(core.VarDesc.VarType.FP32),
-            force_cpu=f_cpu,
-            Out='Out')
-        fill_op.run(scope, place)
-
-        # get result from Out
-        result_array = np.array(out)
-        full_array = np.array(val, 'float32')
-
-        self.assertTrue(np.array_equal(result_array, full_array))
-
-    def test_fill_op(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            self.check_with_place(place, True)
-            self.check_with_place(place, False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py
deleted file mode 100644
index 935653b07a6a4e1d344e8040fa4a0ed72b9b164d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from paddle.fluid.framework import convert_np_dtype_to_dtype_
-from op_test import OpTest
-
-
-class TestFillZerosLike2Op(OpTest):
-    def setUp(self):
-        self.op_type = "fill_zeros_like2"
-        self.dtype = np.float32
-        self.init_dtype()
-        self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)}
-        self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
-        self.attrs = {'dtype': convert_np_dtype_to_dtype_(self.dtype)}
-
-    def init_dtype(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillZerosLike2OpFp16(TestFillZerosLike2Op):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-
-class TestFillZerosLike2OpFp64(TestFillZerosLike2Op):
-    def init_dtype(self):
-        self.dtype = np.float64
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
deleted file mode 100644
index 20f1a110c35d689064c49efba246f078c3badd33..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestFillZerosLikeOp(OpTest):
-    def setUp(self):
-        self.op_type = "fill_zeros_like"
-        self.dtype = np.float32
-        self.init_dtype()
-        self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)}
-        self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
-
-    def init_dtype(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillZerosLikeOpFp16(TestFillZerosLikeOp):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py b/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py
deleted file mode 100644
index 0b51bf5bab7e9a51fea9e03fccaf5610d0e19bba..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""This is unit test of Test filter_instag Op."""
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.layers as layers
-from op_test import OpTest
-import random
-from decorator_helper import prog_scope
-"""This is Test Case 1"""
-
-
-class TestFilterByInstagOp(OpTest):
-    def setUp(self):
-        self.op_type = 'filter_by_instag'
-        x1 = np.zeros((36, 4), dtype=np.float64)
-        for i in range(36):
-            for j in range(4):
-                x1[i, j] = i
-        x1_lod = [[1, 2, 3, 4, 5, 6, 7, 8]]
-
-        x2 = np.array([[1], [2], [1], [2], [1], [2], [1], [2]]).astype('int64')
-        x2_lod = [[1, 1, 1, 1, 1, 1, 1, 1]]
-
-        x3 = np.array([2]).astype('int64')
-
-        out = np.zeros((20, 4), dtype=np.float64)
-        out_lod = [[2, 4, 6, 8]]
-        start_num_lst = [1, 6, 15, 28]
-
-        ln = 0
-        for i in range(4):
-            start = start_num_lst[i]
-            len = out_lod[0][i]
-            for j in range(len):
-                cur = start + j
-                for k in range(4):
-                    out[ln, k] = cur
-                ln += 1
-
-        mmap = np.array(
-            [[0, 1, 2], [2, 6, 4], [6, 15, 6], [12, 28, 8]]).astype('int64')
-        mmap_lod = [[1, 1, 1, 1]]
-
-        loss_weight = np.array([[1], [1], [1], [1]]).astype('double')
-
-        self.inputs = {
-            'Ins': (x1, x1_lod),
-            'Ins_tag': (x2, x2_lod),
-            'Filter_tag': x3,
-        }
-        self.outputs = {
-            'Out': (out, out_lod),
-            'LossWeight': (loss_weight, mmap_lod),
-            'IndexMap': (mmap, mmap_lod)
-        }
-
-        self.attrs = {'is_lod': True}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
-
-
-"""This is Test Case 2"""
-
-
-class TestFilterByInstagOp2(OpTest):
-    def setUp(self):
-        self.op_type = 'filter_by_instag'
-        batch_size = 4
-        x1_embed_size = 4
-        fc_cnt = 2
-
-        x1 = np.array([[10, 13, 12, 1], [1, 1, 1, 1], [1, 1, 1, 1],
-                       [1, 1, 1, 1]]).astype('double')
-        x1_lod = [[1, 1, 1, 1]]
-
-        x2 = np.array([[2], [1], [2], [1]]).astype('int64')
-        x2_lod = [[1, 1, 1, 1]]
-
-        x3 = np.array([1]).astype('int64')
-
-        out = np.array([[1, 1, 1, 1], [1, 1, 1, 1]]).astype('double')
-        out_lod = [[1, 1]]
-
-        mmap = np.array([[0, 1, 1], [1, 3, 1]]).astype('int64')
-        mmap_lod = [[1, 1]]
-
-        loss_weight = np.array([[1], [1]]).astype('double')
-        self.inputs = {
-            'Ins': (x1, x1_lod),
-            'Ins_tag': (x2, x2_lod),
-            'Filter_tag': x3,
-        }
-
-        self.outputs = {
-            'Out': (out, out_lod),
-            'LossWeight': (loss_weight, mmap_lod),
-            'IndexMap': (mmap, mmap_lod)
-        }
-        self.attrs = {'is_lod': True, }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
-
-
-"""This is Test Case 3"""
-
-
-class TestFilterByInstagOp3(OpTest):
-    def setUp(self):
-        self.op_type = 'filter_by_instag'
-        batch_size = 4
-        x1_embed_size = 4
-        fc_cnt = 2
-
-        x1 = np.array([[10, 13, 12, 1], [1, 1, 1, 1], [1, 1, 1, 1],
-                       [1, 1, 1, 1]]).astype('double')
-        x1_lod = [[1, 1, 1, 1]]
-
-        x2 = np.array([[2], [1], [2], [1]]).astype('int64')
-        x2_lod = [[1, 1, 1, 1]]
-
-        x3 = np.array([3]).astype('int64')
-
-        out = np.array([[0, 0, 0, 0]]).astype('double')
-        out_lod = [[1]]
-
-        mmap = np.array([[0, 1, 1]]).astype('int64')
-        mmap_lod = [[1]]
-
-        loss_weight = np.array([[0]]).astype('double')
-        self.inputs = {
-            'Ins': (x1, x1_lod),
-            'Ins_tag': (x2, x2_lod),
-            'Filter_tag': x3,
-        }
-        self.outputs = {
-            'Out': (out, out_lod),
-            'LossWeight': (loss_weight, mmap_lod),
-            'IndexMap': (mmap, mmap_lod)
-        }
-        self.attrs = {'is_lod': True, }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
-
-
-"""This is Test Case 4"""
-
-
-class TestFilterByInstagOp4(OpTest):
-    def setUp(self):
-        self.op_type = 'filter_by_instag'
-        batch_size = 4
-        x1_embed_size = 4
-        fc_cnt = 2
-
-        x1 = np.array([[10, 13, 12, 1], [1, 1, 1, 1], [1, 1, 1, 1],
-                       [1, 1, 1, 1]]).astype('double')
-
-        x2 = np.array([[2], [1], [2], [1]]).astype('int64')
-        x2_lod = [[1, 1, 1, 1]]
-
-        x3 = np.array([3]).astype('int64')
-
-        out = np.array([[0, 0, 0, 0]]).astype('double')
-        out_lod = [[1]]
-
-        mmap = np.array([[0, 1, 1]]).astype('int64')
-        mmap_lod = [[1]]
-
-        loss_weight = np.array([[0]]).astype('double')
-        self.inputs = {
-            'Ins': x1,
-            'Ins_tag': (x2, x2_lod),
-            'Filter_tag': x3,
-        }
-        self.outputs = {
-            'Out': (out, out_lod),
-            'LossWeight': (loss_weight, mmap_lod),
-            'IndexMap': (mmap, mmap_lod)
-        }
-        self.attrs = {'is_lod': False, }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py
deleted file mode 100644
index fa393074a485330c8505388a55355b7a55d89dcc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py
+++ /dev/null
@@ -1,178 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import Program
-import os
-import signal
-import subprocess
-import time
-import unittest
-from multiprocessing import Process
-from op_test import OpTest
-import numpy
-import urllib
-import sys
-from dist_test_utils import *
-
-
-def run_trainer(use_cuda, sync_mode, ip, port, trainers, trainer_id):
-    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-    # loss function
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-    # optimizer
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-    with open("trainer_recv_program.dms", "rb") as f:
-        trainer_recv_program_desc_str = f.read()
-    with open("trainer_main_program.dms", "rb") as f:
-        trainer_main_program_desc_str = f.read()
-    with open("trainer_send_program.dms", "rb") as f:
-        trainer_send_program_desc_str = f.read()
-    recv_program = Program.parse_from_string(trainer_recv_program_desc_str)
-    main_program = Program.parse_from_string(trainer_main_program_desc_str)
-    send_program = Program.parse_from_string(trainer_send_program_desc_str)
-
-    trainer_startup_program = fluid.default_startup_program()
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    exe.run(trainer_startup_program)
-    for i in range(5):
-        exe.run(recv_program)
-        exe.run(main_program,
-                feed={
-                    "x": numpy.array([1, 2]).astype('float32').reshape(2, 1),
-                    "y": numpy.array([2, 3]).astype('float32').reshape(2, 1)
-                })
-        exe.run(send_program)
-
-
-def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
-    remove_ps_flag(os.getpid())
-    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-    # loss function
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-    # optimizer
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-    with open("pserver_startup_program.dms", "rb") as f:
-        pserver_startup_program_desc_str = f.read()
-    with open("pserver_main_program.dms", "rb") as f:
-        pserver_main_program_desc_str = f.read()
-
-    startup_program = Program.parse_from_string(
-        pserver_startup_program_desc_str)
-    main_program = Program.parse_from_string(pserver_main_program_desc_str)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(startup_program)
-    exe.run(main_program)
-
-
-class TestFlListenAndServOp(OpTest):
-    def setUp(self):
-        self.ps_timeout = 5
-        self.ip = "127.0.0.1"
-        self.port = "6000"
-        self.trainers = 2
-        self.trainer_id = 0
-
-    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
-                  self.trainer_id))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _start_trainer0(self, use_cuda, sync_mode, pserver_func):
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, 0))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _start_trainer1(self, use_cuda, sync_mode, pserver_func):
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, 1))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _wait_ps_ready(self, pid):
-        start_left_time = self.ps_timeout
-        sleep_time = 0.5
-        while True:
-            assert start_left_time >= 0, "wait ps ready failed"
-            time.sleep(sleep_time)
-            try:
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                start_left_time -= sleep_time
-
-    def test_rpc_interfaces(self):
-        # TODO(Yancey1989): need to make sure the rpc interface correctly.
-        pass
-
-    def test_handle_signal_in_serv_op(self):
-        # run pserver on CPU in sync mode
-        if sys.platform == 'win32' or sys.platform == 'sys.platform':
-            pass
-        else:
-            print(sys.platform)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/pserver_startup_program.dms"
-            os.system(cmd)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/pserver_main_program.dms"
-            os.system(cmd)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/trainer_recv_program.dms"
-            os.system(cmd)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/trainer_main_program.dms"
-            os.system(cmd)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/trainer_send_program.dms"
-            os.system(cmd)
-            p1 = self._start_pserver(False, True, run_pserver)
-            self._wait_ps_ready(p1.pid)
-            time.sleep(5)
-            t1 = self._start_trainer0(False, True, run_trainer)
-            time.sleep(2)
-            t2 = self._start_trainer1(False, True, run_trainer)
-            # raise SIGTERM to pserver
-            time.sleep(2)
-            cmd_del = "rm trainer*dms* pserver*dms*"
-            os.system(cmd_del)
-            os.kill(p1.pid, signal.SIGINT)
-            p1.join()
-            os.kill(t1.pid, signal.SIGINT)
-            t1.join()
-            os.kill(t2.pid, signal.SIGINT)
-            t2.join()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_flatten2_op.py b/python/paddle/fluid/tests/unittests/test_flatten2_op.py
deleted file mode 100644
index 59185855a5f13b82ca26bc26ead73fbe5fb96443..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_flatten2_op.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-from op_test import OpTest
-
-
-class TestFlattenOp(OpTest):
-    def setUp(self):
-        self.op_type = "flatten2"
-        self.init_test_case()
-        self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
-        self.init_attrs()
-        self.outputs = {
-            "Out": self.inputs["X"].reshape(self.new_shape),
-            "XShape": np.random.random(self.in_shape).astype("float32")
-        }
-
-    def test_check_output(self):
-        self.check_output(no_check_set=["XShape"])
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-    def init_test_case(self):
-        self.in_shape = (3, 2, 2, 5)
-        self.axis = 1
-        self.new_shape = (3, 20)
-
-    def init_attrs(self):
-        self.attrs = {"axis": self.axis}
-
-
-class TestFlattenOp(TestFlattenOp):
-    def init_test_case(self):
-        self.in_shape = (3, 2, 2, 3)
-        self.axis = 0
-        self.new_shape = (1, 36)
-
-
-class TestFlattenOpWithDefaultAxis(TestFlattenOp):
-    def init_test_case(self):
-        self.in_shape = (3, 2, 2, 3)
-        self.new_shape = (3, 12)
-
-    def init_attrs(self):
-        self.attrs = {}
-
-
-class TestFlattenOpSixDims(TestFlattenOp):
-    def init_test_case(self):
-        self.in_shape = (3, 2, 3, 2, 4, 4)
-        self.axis = 4
-        self.new_shape = (36, 16)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py
deleted file mode 100644
index 91251147ebc7908893e90467c1305fca89917ed7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_flatten_op.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-from op_test import OpTest
-
-
-class TestFlattenOp(OpTest):
-    def setUp(self):
-        self.op_type = "flatten"
-        self.init_test_case()
-        self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
-        self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-    def init_test_case(self):
-        self.in_shape = (3, 2, 2, 5)
-        self.axis = 1
-        self.new_shape = (3, 20)
-
-    def init_attrs(self):
-        self.attrs = {"axis": self.axis}
-
-
-class TestFlattenOp(TestFlattenOp):
-    def init_test_case(self):
-        self.in_shape = (3, 2, 2, 3)
-        self.axis = 0
-        self.new_shape = (1, 36)
-
-
-class TestFlattenOpWithDefaultAxis(TestFlattenOp):
-    def init_test_case(self):
-        self.in_shape = (3, 2, 2, 3)
-        self.new_shape = (3, 12)
-
-    def init_attrs(self):
-        self.attrs = {}
-
-
-class TestFlattenOpSixDims(TestFlattenOp):
-    def init_test_case(self):
-        self.in_shape = (3, 2, 3, 2, 4, 4)
-        self.axis = 4
-        self.new_shape = (36, 16)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py b/python/paddle/fluid/tests/unittests/test_fleet_api_input.py
deleted file mode 100644
index 49cdeaa6a426e84907cf7bbcdcd05fed5e782522..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py
+++ /dev/null
@@ -1,208 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
-from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker
-from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedCollectiveRoleMaker
-from paddle.fluid.incubate.fleet.base.role_maker import Role
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import TranspilerOptimizer
-
-
-class DistributeTranspilerConfigTest(unittest.TestCase):
-    def set_runtime_split_send_recv(self, config, value):
-        config.runtime_split_send_recv = value
-
-    def set_sync_mode(self, config, value):
-        config.sync_mode = value
-
-    def testConfig(self):
-        config = DistributeTranspilerConfig()
-        self.assertRaises(Exception, self.set_sync_mode, config, None)
-        self.assertRaises(Exception, self.set_runtime_split_send_recv, config,
-                          None)
-        self.assertRaises(Exception, self.set_runtime_split_send_recv, config,
-                          True)
-        self.set_sync_mode(config, False)
-        self.assertFalse(config.sync_mode)
-        self.set_runtime_split_send_recv(config, True)
-        self.assertRaises(Exception, self.set_sync_mode, config, True)
-
-
-class FleetTest(unittest.TestCase):
-    def testInvalidInputs(self):
-        self.assertRaises(Exception, fleet.split_files, "files")
-        self.assertRaises(Exception, fleet.init, "pserver")
-
-        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-        hidden = fluid.layers.fc(input=data, size=10)
-        loss = fluid.layers.mean(hidden)
-        adam = fluid.optimizer.Adam()
-        adam.minimize(loss)
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        pe = fluid.ParallelExecutor(use_cuda=False, loss_name=loss.name)
-        self.assertRaises(
-            Exception,
-            fleet.save_inference_model,
-            dirname='/tmp/',
-            feeded_var_names=['X'],
-            target_vars=[loss],
-            executor=pe)
-        self.assertRaises(
-            Exception,
-            fleet.save_inference_model,
-            dirname='/tmp/',
-            feeded_var_names=['X'],
-            target_vars=[loss],
-            executor="executor")
-        compiled_prog = fluid.compiler.CompiledProgram(
-            fluid.default_main_program())
-        self.assertRaises(
-            Exception,
-            fleet.save_inference_model,
-            dirname='/tmp/',
-            feeded_var_names=['X'],
-            target_vars=[loss],
-            executor=exe,
-            main_program=compiled_prog)
-        self.assertRaises(
-            Exception, fleet.save_persistables, executor=pe, dirname='/tmp/')
-        self.assertRaises(
-            Exception,
-            fleet.save_persistables,
-            executor="executor",
-            dirname='/tmp/')
-        self.assertRaises(
-            Exception,
-            fleet.save_persistables,
-            executor=exe,
-            dirname='/tmp/',
-            main_program=compiled_prog)
-        self.assertRaises(Exception, fleet._transpile, "config")
-
-
-class TranspilerOptimizerTest(unittest.TestCase):
-    def testInvalidInputs(self):
-        self.assertRaises(Exception, TranspilerOptimizer, "Adam", None)
-        self.assertRaises(Exception, TranspilerOptimizer,
-                          fluid.optimizer.Adam(0.001), "strategy")
-
-        transpiler = TranspilerOptimizer(fluid.optimizer.Adam(0.001))
-        self.assertRaises(Exception, transpiler.minimize, loss=[])
-        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-        hidden = fluid.layers.fc(input=data, size=10)
-        loss = fluid.layers.mean(hidden)
-        self.assertRaises(
-            Exception, transpiler.minimize, loss=loss.name, startup_program=[])
-
-
-class UserDefinedRoleMakerTest(unittest.TestCase):
-    def createRoleMaker(self,
-                        current_id=0,
-                        role=Role.WORKER,
-                        worker_num=1,
-                        server_endpoints=["127.0.0.1:8080"]):
-        role = UserDefinedRoleMaker(current_id, role, worker_num,
-                                    server_endpoints)
-
-    def testRoleMaker(self):
-        self.createRoleMaker()
-        ## test all invalid server_endpoints
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            server_endpoints=None)  # server_endpoints must be as list
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            server_endpoints=[])  # server_endpoints can't be empty
-        self.assertRaises(
-            Exception, self.createRoleMaker, server_endpoints=[
-                3, []
-            ])  # element in server_endpoints must be as string
-        self.assertRaises(
-            Exception,
-            self.createRoleMaker,
-            server_endpoints=["127.0.0.1:8080", "127.0.0.1:8080"]
-        )  # element in server_endpoints can't be duplicate
-        ## test all invalid current_id 
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            current_id="0")  # current_id must be as int
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            current_id=-1)  # current_id must be greater than or equal to 0
-        self.assertRaises(
-            Exception,
-            self.createRoleMaker,
-            current_id=1,
-            role=Role.SERVER,
-            server_endpoints=["127.0.0.1:8080"]
-        )  # if role is server, current_id must be less than len(server_endpoints)
-        ## test all invalid worker_num
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            worker_num="1")  # worker_num must be as int
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            worker_num=0)  # worker_num must be greater than 0
-        ## test all invalid role
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            role=3)  # role must be as Role(Role.WORKER=1, Role.SERVER=2)
-
-
-class UserDefinedCollectiveRoleMakerTest(unittest.TestCase):
-    def createRoleMaker(self, current_id=0,
-                        worker_endpoints=["127.0.0.1:8080"]):
-        role = UserDefinedCollectiveRoleMaker(current_id, worker_endpoints)
-
-    def testRoleMaker(self):
-        self.createRoleMaker()
-        ## test all invalid worker_endpoints
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            worker_endpoints=None)  # worker_endpoints must be as list
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            worker_endpoints=[])  # worker_endpoints can't be empty
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            worker_endpoints=[3,
-                              []])  # element worker_endpoints must be as string
-        self.assertRaises(
-            Exception,
-            self.createRoleMaker,
-            worker_endpoints=["127.0.0.1:8080", "127.0.0.1:8080"]
-        )  # element in worker_endpoints can't be duplicate
-        ## test all invalid current_id
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            current_id="0")  # current_id must be as int
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            current_id=-1)  # current_id must be greater than or equal to 0
-        self.assertRaises(
-            Exception,
-            self.createRoleMaker,
-            current_id=1,
-            worker_endpoints=["127.0.0.1:8080"]
-        )  # current_id must be less than len(worker_endpoints)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
deleted file mode 100644
index f8ee00a3188d1f4efb50c9cea70d3a55ff311ed0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import os
-import unittest
-
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-
-
-class TestCloudRoleMaker(unittest.TestCase):
-    def setUp(self):
-        os.environ["PADDLE_PORT"] = "36001"
-        os.environ["PADDLE_PSERVERS"] = "127.0.0.1,127.0.0.2"
-        os.environ["PADDLE_TRAINERS_NUM"] = "2"
-
-    def test_tr_rolemaker(self):
-        os.environ["TRAINING_ROLE"] = "TRAINER"
-        os.environ["PADDLE_TRAINER_ID"] = "0"
-
-        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
-        ro.generate_role()
-
-        self.assertTrue(ro.is_worker())
-        self.assertFalse(ro.is_server())
-        self.assertEqual(ro.worker_num(), 2)
-
-    def test_ps_rolemaker(self):
-        os.environ["TRAINING_ROLE"] = "PSERVER"
-        os.environ["POD_IP"] = "127.0.0.1"
-
-        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
-        ro.generate_role()
-        self.assertFalse(ro.is_worker())
-        self.assertTrue(ro.is_server())
-        self.assertEqual(ro.worker_num(), 2)
-
-    def test_traing_role(self):
-        os.environ["TRAINING_ROLE"] = "TEST"
-        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
-        self.assertRaises(ValueError, ro.generate_role)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_utils.py b/python/paddle/fluid/tests/unittests/test_fleet_utils.py
deleted file mode 100644
index a26b27ff4ffee976f99503d38d71c8056337d8b7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fleet_utils.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.fluid as fluid
-import unittest
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.utils.fleet_barrier_util import check_all_trainers_ready
-
-
-class TestFleetUtils(unittest.TestCase):
-    def test_fleet_barrier(self):
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_num=1,
-            server_endpoints=['127.0.0.1'])
-        fleet.init(role)
-        check_all_trainers_ready("/ready_path/", 0)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_framework_debug_str.py b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
deleted file mode 100644
index 72f43e56ccbe04f56cfd5a655fb57c58369039bb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.framework import Program
-
-
-class TestDebugStringFramework(unittest.TestCase):
-    def test_debug_str(self):
-        p = Program()
-        p.current_block().create_var(name='t', shape=[0, 1])
-        self.assertRaises(ValueError, callableObj=p.__str__)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fsp_op.py b/python/paddle/fluid/tests/unittests/test_fsp_op.py
deleted file mode 100644
index 01991f4d36caf83173452c6a032c37852fa35586..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fsp_op.py
+++ /dev/null
@@ -1,62 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def fsp_matrix(a, b):
-    batch = a.shape[0]
-    a_channel = a.shape[1]
-    b_channel = b.shape[1]
-    h = a.shape[2]
-    w = a.shape[3]
-    a_t = a.transpose([0, 2, 3, 1])
-    a_t = a_t.reshape([batch, h * w, a_channel])
-    b_t = b.transpose([0, 2, 3, 1]).reshape([batch, h * w, b_channel])
-    a_r = a_t.repeat(
-        b_channel, axis=1).reshape(
-            [batch, h * w, b_channel, a_channel]).transpose([0, 1, 3, 2])
-    b_r = b_t.repeat(
-        a_channel, axis=1).reshape([batch, h * w, a_channel, b_channel])
-    return np.mean(a_r * b_r, axis=1)
-
-
-class TestFSPOp(OpTest):
-    def setUp(self):
-        self.op_type = "fsp"
-        self.initTestCase()
-
-        feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float64')
-        feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float64')
-
-        self.inputs = {'X': feature_map_0, 'Y': feature_map_1}
-        self.outputs = {'Out': fsp_matrix(feature_map_0, feature_map_1)}
-
-    def initTestCase(self):
-        self.a_shape = (2, 3, 5, 6)
-        self.b_shape = (2, 4, 5, 6)
-
-    @unittest.skip("Disable temporarily.")
-    def test_check_output(self):
-        self.check_output()
-
-    @unittest.skip("Disable temporarily.")
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ftrl_op.py b/python/paddle/fluid/tests/unittests/test_ftrl_op.py
deleted file mode 100644
index a6390b054f06184831c289fe9556216ae213be7c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_ftrl_op.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestFTRLOp(OpTest):
-    def setUp(self):
-        self.op_type = "ftrl"
-        w = np.random.random((102, 105)).astype("float32")
-        g = np.random.random((102, 105)).astype("float32")
-        sq_accum = np.full((102, 105), 0.1).astype("float32")
-        linear_accum = np.full((102, 105), 0.1).astype("float32")
-        lr = np.array([0.01]).astype("float32")
-        l1 = 0.1
-        l2 = 0.2
-        lr_power = -0.5
-
-        self.inputs = {
-            'Param': w,
-            'SquaredAccumulator': sq_accum,
-            'LinearAccumulator': linear_accum,
-            'Grad': g,
-            'LearningRate': lr
-        }
-        self.attrs = {
-            'l1': l1,
-            'l2': l2,
-            'lr_power': lr_power,
-            'learning_rate': lr
-        }
-        new_accum = sq_accum + g * g
-        if lr_power == -0.5:
-            linear_out = linear_accum + g - (
-                (np.sqrt(new_accum) - np.sqrt(sq_accum)) / lr) * w
-        else:
-            linear_out = linear_accum + g - ((np.power(
-                new_accum, -lr_power) - np.power(sq_accum, -lr_power)) / lr) * w
-
-        x = (l1 * np.sign(linear_out) - linear_out)
-        if lr_power == -0.5:
-            y = (np.sqrt(new_accum) / lr) + (2 * l2)
-            pre_shrink = x / y
-            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
-        else:
-            y = (np.power(new_accum, -lr_power) / lr) + (2 * l2)
-            pre_shrink = x / y
-            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
-
-        sq_accum_out = sq_accum + g * g
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'SquaredAccumOut': sq_accum_out,
-            'LinearAccumOut': linear_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
deleted file mode 100644
index 5ce82b267ac24e1e40915b2354100acd8aaf7c68..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
-from fake_reader import fake_imdb_reader
-from parallel_executor_test_base import TestParallelExecutorBase
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from functools import partial
-import paddle
-import unittest
-import os
-
-
-class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def compare_fuse_all_reduce_ops(self,
-                                    model,
-                                    use_cuda,
-                                    init_feed_dicta=None,
-                                    get_data_from_feeder=None,
-                                    optimizer=None,
-                                    fuse_all_optimizer_ops=False):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        feed_dict_data = None
-        if init_feed_dicta is not None:
-            img, label = init_feed_dicta()
-            feed_dict_data = {"image": img, "label": label}
-
-        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
-            model,
-            feed_dict=feed_dict_data,
-            get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
-            fuse_all_reduce_ops=False,
-            fuse_all_optimizer_ops=fuse_all_optimizer_ops,
-            optimizer=optimizer)
-        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
-            model,
-            feed_dict=feed_dict_data,
-            get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
-            fuse_all_reduce_ops=True,
-            fuse_all_optimizer_ops=fuse_all_optimizer_ops,
-            optimizer=optimizer)
-
-        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
-        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
-
-    def optimizer(self, learning_rate=1e-3):
-        optimizer = fluid.optimizer.SGD(
-            learning_rate=learning_rate,
-            regularization=fluid.regularizer.L2Decay(1e-3))
-        return optimizer
-
-
-class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
-    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_cuda,
-            init_feed_dicta=init_data,
-            optimizer=self.optimizer,
-            fuse_all_optimizer_ops=True)
-
-    def test_simple_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(simple_fc_net, True)
-        self._decorate_compare_fused_all_reduce(simple_fc_net, False)
-
-    def test_batchnorm_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(fc_with_batchnorm, True)
-        self._decorate_compare_fused_all_reduce(fc_with_batchnorm, False)
-
-
-class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):
-    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_cuda,
-            init_feed_dicta=init_data,
-            optimizer=self.optimizer,
-            fuse_all_optimizer_ops=True)
-
-
-class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-        cls.word_dict_len = 5147
-        batch_size = 64
-        reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100)
-        reader = paddle.batch(reader, batch_size=batch_size)()
-        cls.train_data = next(reader)
-
-    def get_data_from_feeder(self):
-        place = fluid.CPUPlace()
-        feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
-        return feeder.feed(self.train_data)
-
-    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_cuda,
-            get_data_from_feeder=self.get_data_from_feeder,
-            optimizer=self.optimizer)
-
-    def test_simple_bow_net_with_fuse_all_reduce(self):
-        model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_all_reduce(model, True)
-        self._decorate_compare_fused_all_reduce(model, False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
deleted file mode 100644
index 617fecffe07fad33759e69c629eb84ac2c9072a0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
-from parallel_executor_test_base import TestParallelExecutorBase
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import unittest
-import os
-
-
-class TestMNIST(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def _compare_fuse_elewise_add_act_ops(self, model, use_cuda):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-        img, label = init_data()
-
-        def _optimizer(learning_rate=1e-6):
-            optimizer = fluid.optimizer.SGD(
-                learning_rate=learning_rate,
-                regularization=fluid.regularizer.L2Decay(1e-6))
-            return optimizer
-
-        # NOTE(dzh):
-        # need to make it compatible with elewise fuse act
-        # FIXME (liuwei12)
-        # the new memory optimize strategy will crash this unittest
-        # add enable_inplace=False here to force pass the unittest
-        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            fuse_elewise_add_act_ops=False,
-            use_ir_memory_optimize=False,
-            enable_inplace=False,
-            optimizer=_optimizer)
-        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            fuse_elewise_add_act_ops=True,
-            use_ir_memory_optimize=False,
-            enable_inplace=False,
-            optimizer=_optimizer)
-
-        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
-        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
-
-    def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, True)
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, False)
-
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, True)
-        self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
deleted file mode 100644
index b47bcd2a032a32f30b2bcdd2b48541c660abdab2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
-from fake_reader import fake_imdb_reader
-from parallel_executor_test_base import TestParallelExecutorBase
-from functools import partial
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import unittest
-import os
-
-
-class TestFuseOptimizationOps(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def _get_feed_dict(self):
-        img, label = init_data()
-        return {"image": img, "label": label}
-
-    def _compare_fused_optimizer_ops(self,
-                                     model,
-                                     use_cuda,
-                                     feed_dict=None,
-                                     get_data_from_feeder=None,
-                                     optimizer=fluid.optimizer.Adam):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
-            model,
-            feed_dict=feed_dict,
-            get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
-            fuse_all_optimizer_ops=False,
-            optimizer=optimizer)
-        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
-            model,
-            feed_dict=feed_dict,
-            get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
-            fuse_all_optimizer_ops=True,
-            optimizer=optimizer)
-
-        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
-        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
-
-    def _decorate_compare_fused_optimizer_ops(self, model, use_cuda, optimizer):
-        self._compare_fused_optimizer_ops(
-            model,
-            use_cuda,
-            feed_dict=self._get_feed_dict(),
-            optimizer=optimizer)
-
-
-class TestFuseAdamOps(TestFuseOptimizationOps):
-    def optimizer(self, learning_rate=1e-4):
-        return fluid.optimizer.Adam(learning_rate=learning_rate)
-
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, True, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, False, optimizer=self.optimizer)
-
-
-class TestFuseSGDOps(TestFuseAdamOps):
-    def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.SGD(learning_rate=learning_rate)
-
-
-class TestFuseMomentumOps(TestFuseAdamOps):
-    def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
-
-
-class TestSpareFuseAdamOps(TestFuseOptimizationOps):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-        cls.word_dict_len = 5147
-        batch_size = 64
-        reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100)
-        reader = paddle.batch(reader, batch_size=batch_size)()
-        cls.train_data = next(reader)
-
-    def _get_data_from_feeder(self):
-        place = fluid.CPUPlace()
-        feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
-        return feeder.feed(self.train_data)
-
-    def _decorate_compare_fused_optimizer_ops(self, model, use_cuda, optimizer):
-        self._compare_fused_optimizer_ops(
-            model,
-            use_cuda,
-            get_data_from_feeder=self._get_data_from_feeder,
-            optimizer=optimizer)
-
-    def optimizer(self, learning_rate=1e-4):
-        return fluid.optimizer.Adam(learning_rate=learning_rate)
-
-    def test_simple_bow_net_with_fuse_op(self):
-        model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_optimizer_ops(
-            model, True, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            model, False, optimizer=self.optimizer)
-
-
-class TestSpareFuseSGDOps(TestSpareFuseAdamOps):
-    def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.SGD(learning_rate=learning_rate)
-
-
-class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
-    def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
-
-
-class TestPassConflictBase(TestFuseAdamOps):
-    def _compare_fused_optimizer_ops(self,
-                                     model,
-                                     use_cuda,
-                                     feed_dict=None,
-                                     get_data_from_feeder=None,
-                                     optimizer=fluid.optimizer.Adam):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        self.check_pass_conflict(
-            model,
-            feed_dict=feed_dict,
-            get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
-            fuse_all_optimizer_ops=True,
-            optimizer=optimizer,
-            enable_sequential_execution=True)
-
-
-class TestFuseAdamOpsPassConflict(TestPassConflictBase):
-    def optimizer(self, learning_rate=1e-4):
-        return fluid.optimizer.Adam(learning_rate=learning_rate)
-
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, True, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, False, optimizer=self.optimizer)
-
-
-class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
-    def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.SGD(learning_rate=learning_rate)
-
-
-class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict):
-    def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
deleted file mode 100644
index 7c9b56d403092ebbd4effe5b15ade9520a4f5d8c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from parallel_executor_test_base import TestParallelExecutorBase
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import numpy as np
-import paddle
-import paddle.dataset.mnist as mnist
-import unittest
-import os
-
-
-def norm(*args, **kargs):
-    return fluid.layers.batch_norm(*args, **kargs)
-
-
-def sep_conv(input, channel, stride, filter, dilation=1, act=None):
-    # with scope('depthwise'):
-    input = fluid.layers.conv2d(
-        input,
-        input.shape[1],
-        filter,
-        stride,
-        groups=input.shape[1],
-        padding=(filter // 2) * dilation,
-        dilation=dilation,
-        use_cudnn=False,
-        bias_attr=False)
-    input = norm(input)
-    if act: input = act(input)
-    # with scope('pointwise'):
-    input = fluid.layers.conv2d(
-        input, channel, 1, 1, groups=1, padding=0, bias_attr=False)
-    input = norm(input)
-    if act: input = act(input)
-    return input
-
-
-def simple_depthwise_net(use_feed):
-    assert use_feed
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = fluid.layers.reshape(img, (-1, 1, 28, 28))
-    for _ in range(4):
-        hidden = sep_conv(hidden, channel=200, stride=2, filter=5)
-        hidden = fluid.layers.relu(hidden)
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-class TestMNIST(TestParallelExecutorBase):
-    def _init_data(self, random=True):
-        np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
-    def _compare(self, model, use_cuda, random_data=True, only_forward=False):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-        img, label = self._init_data(random_data)
-
-        def _optimizer(learning_rate=1e-6):
-            optimizer = fluid.optimizer.SGD(
-                learning_rate=learning_rate,
-                regularization=fluid.regularizer.L2Decay(1e-6))
-            return optimizer
-
-        if only_forward:
-            _optimizer = None
-
-        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            fuse_relu_depthwise_conv=True,
-            use_ir_memory_optimize=True,
-            optimizer=_optimizer)
-        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            fuse_relu_depthwise_conv=False,
-            optimizer=_optimizer)
-
-        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
-        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
-
-    def test_simple_depthwise_with_fuse_op(self):
-        self._compare(simple_depthwise_net, True)
-        self._compare(simple_depthwise_net, False)
-
-    def test_simple_depthwise_with_fuse_op_only_forward(self):
-        self._compare(simple_depthwise_net, True, only_forward=True)
-        self._compare(simple_depthwise_net, False, only_forward=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
deleted file mode 100644
index 3cf8e7229add49ba67abd7d7996a1eb3c4d13846..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
+++ /dev/null
@@ -1,330 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-from functools import partial
-import paddle.fluid.core as core
-from op_test import OpTest
-
-#   TestFusedElementwiseActivationOp
-#   TestFusedElementwiseActivationOp_scalar
-#   TestFusedElementwiseActivationOp_scalar2
-#   TestFusedElementwiseActivationOp_Vector
-#   TestFusedElementwiseActivationOp_broadcast_0
-#   TestFusedElementwiseActivationOp_broadcast_1
-#   TestFusedElementwiseActivationOp_broadcast_2
-#   TestFusedElementwiseActivationOp_broadcast_3
-#   TestFusedElementwiseActivationOp_broadcast_4
-#   TestFusedElementwiseActivationOp_rowwise_add_0
-#   TestFusedElementwiseActivationOp_rowwise_add_1
-#   TestFusedElementwiseActivationOp_channelwise_add
-
-
-def create_test_class(test_case, callback, attrs):
-    class TestFusedElementwiseActivationOp_base(OpTest):
-        def setUp(self):
-            self.op_type = "fused_elemwise_activation"
-            self.dtype = np.float32
-            self.axis = -1
-
-            self.init_input()
-            self.init_output()
-            self.init_attr()
-
-            self.inputs = {
-                'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-                'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
-            }
-            if self.attrs["save_intermediate_out"]:
-                self.outputs = {
-                    'Out': self.out,
-                    "IntermediateOut": self.intermediate_out
-                }
-            else:
-                self.outputs = {'Out': self.out}
-
-        def init_input(self):
-            self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
-            self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
-            self.axis = -1
-
-        def init_output(self):
-            self.x, self.y, self.intermediate_out, self.out = \
-                callback(self.x, self.y, self.x, self.y)
-
-        def init_attr(self):
-            self.attrs = {'axis': self.axis, }
-            for key in attrs.keys():
-                self.attrs[key] = attrs[key]
-
-        def test_check_output(self):
-            self.check_output()
-
-        # FIXME(zcd): the intermediate_out_grad is not checked.
-        def test_check_grad_normal(self):
-            if self.attrs["save_intermediate_out"]:
-                self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
-            else:
-                self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
-
-        def test_check_grad_ingore_x(self):
-            if self.attrs["save_intermediate_out"]:
-                self.check_grad(
-                    ['Y'], ['Out'],
-                    max_relative_error=0.005,
-                    no_grad_set=set("X"))
-            else:
-                self.check_grad(
-                    ['Y'], ['Out'],
-                    max_relative_error=0.005,
-                    no_grad_set=set("X"))
-
-        def test_check_grad_ingore_y(self):
-            if self.attrs["save_intermediate_out"]:
-                self.check_grad(
-                    ['X'], ['Out'],
-                    max_relative_error=0.005,
-                    no_grad_set=set("Y"))
-            else:
-                self.check_grad(
-                    ['X'], ['Out'],
-                    max_relative_error=0.005,
-                    no_grad_set=set("Y"))
-
-    class TestFusedElementwiseActivationOp_scalar(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-            self.y = np.random.rand(1).astype(self.dtype)
-
-    class TestFusedElementwiseActivationOp_scalar2(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-            self.y = np.random.rand(1, 1).astype(self.dtype)
-
-    class TestFusedElementwiseActivationOp_Vector(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.random((32, )).astype(self.dtype)
-            self.y = np.random.random((32, )).astype(self.dtype)
-
-    class TestFusedElementwiseActivationOp_broadcast_0(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-            self.y = np.random.rand(2).astype(self.dtype)
-            self.axis = 0
-
-        def init_output(self):
-            self.x, self.y, self.intermediate_out, self.out = \
-                callback(self.x, self.y, self.x, self.y.reshape(2, 1, 1))
-
-    class TestFusedElementwiseActivationOp_broadcast_1(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-            self.y = np.random.rand(3).astype(self.dtype)
-            self.axis = 1
-
-        def init_output(self):
-            self.x, self.y, self.intermediate_out, self.out = \
-                callback(self.x, self.y, self.x, self.y.reshape(1, 3, 1))
-
-    class TestFusedElementwiseActivationOp_broadcast_2(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-            self.y = np.random.rand(4).astype(self.dtype)
-
-        def init_output(self):
-            self.x, self.y, self.intermediate_out, self.out = \
-                callback(self.x, self.y, self.x, self.y.reshape(1, 1, 4))
-
-    class TestFusedElementwiseActivationOp_broadcast_3(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-            self.y = np.random.rand(3, 4).astype(self.dtype)
-            self.axis = 1
-
-        def init_output(self):
-            self.x, self.y, self.intermediate_out, self.out = \
-                callback(self.x, self.y, self.x, self.y.reshape(1, 3, 4, 1))
-
-    class TestFusedElementwiseActivationOp_broadcast_4(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-            self.y = np.random.rand(2, 1).astype(self.dtype)
-            self.axis = 0
-
-        def init_output(self):
-            self.x, self.y, self.intermediate_out, self.out = \
-                callback(self.x, self.y, self.x, self.y.reshape(2, 1, 1, 1))
-
-    class TestFusedElementwiseActivationOp_rowwise_add_0(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-            self.y = np.random.rand(3, 4).astype(self.dtype)
-            self.axis = 1
-
-        def init_output(self):
-            self.x, self.y, self.intermediate_out, self.out = \
-                callback(self.x, self.y, self.x, self.y.reshape(1, 3, 4))
-
-    class TestFusedElementwiseActivationOp_rowwise_add_1(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(2, 1).astype(self.dtype)
-            self.y = np.random.rand(1).astype(self.dtype)
-            self.axis = 1
-
-        def init_output(self):
-            self.x, self.y, self.intermediate_out, self.out = \
-                callback(self.x, self.y, self.x, self.y.reshape(1, 1))
-
-    class TestFusedElementwiseActivationOp_channelwise_add(
-            TestFusedElementwiseActivationOp_base):
-        def init_input(self):
-            self.x = np.random.rand(3, 20, 20).astype(self.dtype)
-            self.y = np.random.rand(3, 1, 1).astype(self.dtype)
-
-    TestFusedElementwiseActivationOp_base.__name__ = test_case + "_base"
-    TestFusedElementwiseActivationOp_scalar.__name__ = test_case + "_scalar"
-    TestFusedElementwiseActivationOp_scalar2.__name__ = test_case + "_scalar2"
-    TestFusedElementwiseActivationOp_Vector.__name__ = test_case + "_Vector"
-    TestFusedElementwiseActivationOp_broadcast_0.__name__ = test_case + "_broadcast_0"
-    TestFusedElementwiseActivationOp_broadcast_1.__name__ = test_case + "_broadcast_1"
-    TestFusedElementwiseActivationOp_broadcast_2.__name__ = test_case + "_broadcast_2"
-    TestFusedElementwiseActivationOp_broadcast_3.__name__ = test_case + "_broadcast_3"
-    TestFusedElementwiseActivationOp_broadcast_4.__name__ = test_case + "_broadcast_4"
-    TestFusedElementwiseActivationOp_rowwise_add_0.__name__ = test_case + "_rowwise_add_0"
-    TestFusedElementwiseActivationOp_rowwise_add_1.__name__ = test_case + "_rowwise_add_1"
-    TestFusedElementwiseActivationOp_channelwise_add.__name__ = test_case + "_channelwise_add"
-
-    globals()[test_case + "_base"] = TestFusedElementwiseActivationOp_base
-    globals()[test_case + "_scalar"] = TestFusedElementwiseActivationOp_scalar
-    globals()[test_case + "_scalar2"] = TestFusedElementwiseActivationOp_scalar2
-    globals()[test_case + "_Vector"] = TestFusedElementwiseActivationOp_Vector
-    globals()[test_case +
-              "_broadcast_0"] = TestFusedElementwiseActivationOp_broadcast_0
-    globals()[test_case +
-              "_broadcast_1"] = TestFusedElementwiseActivationOp_broadcast_1
-    globals()[test_case +
-              "_broadcast_2"] = TestFusedElementwiseActivationOp_broadcast_2
-    globals()[test_case +
-              "_broadcast_3"] = TestFusedElementwiseActivationOp_broadcast_3
-    globals()[test_case +
-              "_broadcast_4"] = TestFusedElementwiseActivationOp_broadcast_4
-    globals()[test_case +
-              "_rowwise_add_0"] = TestFusedElementwiseActivationOp_rowwise_add_0
-    globals()[test_case +
-              "_rowwise_add_1"] = TestFusedElementwiseActivationOp_rowwise_add_1
-    globals(
-    )[test_case +
-      "_channelwise_add"] = TestFusedElementwiseActivationOp_channelwise_add
-
-
-def scale_add_func(x, y, x_bcast, y_bcast, scale, mode=0):
-    if mode == 0:
-        return x, y, (x_bcast + y_bcast), (x_bcast + y_bcast) * scale
-    else:
-        return y, x, (x_bcast + y_bcast), (x_bcast + y_bcast) * scale
-
-
-def add_scale_func(x, y, x_bcast, y_bcast, scale, mode=0):
-    if mode == 0:
-        return x, y, y * scale, x_bcast + y_bcast * scale
-    else:
-        return y, x, x * scale, y_bcast + x_bcast * scale
-
-
-def add_relu_func(x, y, x_bcast, y_bcast, mode=0):
-    # Copy from test_activation_op.py
-    # Because we set delta = 0.005 in calculating numeric gradient,
-    # if x is too small, such as 0.002, x_neg will be -0.003
-    # x_pos will be 0.007, so the numeric gradient is inaccurate.
-    # we should avoid this
-    if mode == 0:
-        y[np.abs(y) < 0.005] = 0.02
-        y_bcast[np.abs(y_bcast) < 0.005] = 0.02
-        return x, y, np.maximum(y, 0), x_bcast + np.maximum(y_bcast, 0)
-    else:
-        x[np.abs(x) < 0.005] = 0.02
-        x_bcast[np.abs(x_bcast) < 0.005] = 0.02
-        return y, x, np.maximum(x, 0), y_bcast + np.maximum(x_bcast, 0)
-
-
-def relu_add_func(x, y, x_bcast, y_bcast, mode=0):
-    intermediate_out = x_bcast + y_bcast
-    out = np.maximum(intermediate_out, 0)
-    out[np.abs(out) < 0.005] = 0.02
-    if mode == 0:
-        return x, y, intermediate_out, out
-    else:
-        return y, x, intermediate_out, out
-
-
-def mul_scale_func(x, y, x_bcast, y_bcast, scale, mode=0):
-    if mode == 0:
-        return x, y, y * scale, x_bcast * (y_bcast * scale)
-    else:
-        return y, x, x * scale, y_bcast * (x_bcast * scale)
-
-
-scale = 0.1
-scale_add_func = partial(scale_add_func, scale=scale)
-add_scale_func = partial(add_scale_func, scale=scale)
-mul_scale_func = partial(mul_scale_func, scale=scale)
-
-for mode in {0, 1}:
-    scale_add_func = partial(scale_add_func, mode=mode)
-    add_scale_func = partial(add_scale_func, mode=mode)
-    mul_scale_func = partial(mul_scale_func, mode=mode)
-    relu_add_func = partial(relu_add_func, mode=mode)
-    add_relu_func = partial(add_relu_func, mode=mode)
-
-    for save_intermediate_out in {True, False}:
-        suffix = ("_save_intermediate_out" if save_intermediate_out else "") \
-                 + ("_mode_"+ str(mode))
-        create_test_class('scale_add' + suffix, scale_add_func, {
-            'scale': scale,
-            'functor_list': ["scale", "elementwise_add"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-        create_test_class('add_scale' + suffix, add_scale_func, {
-            'scale': scale,
-            'functor_list': ["elementwise_add", "scale"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-        create_test_class('add_relu' + suffix, add_relu_func, {
-            'functor_list': ["elementwise_add", "relu"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-        create_test_class('relu_add' + suffix, relu_add_func, {
-            'functor_list': ["relu", "elementwise_add"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-        create_test_class('mul_scale' + suffix, mul_scale_func, {
-            'scale': scale,
-            'functor_list': ["elementwise_mul", "scale"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
deleted file mode 100644
index 09523d65b48db70568cb5a479f7b78f4d90cd6ce..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import platform
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid.op import Operator
-import paddle.compat as cpt
-import paddle.version as ver
-
-
-class TestFusedEmbeddingSeqPoolOp(OpTest):
-    def setUp(self):
-        self.op_type = "fused_embedding_seq_pool"
-        self.emb_size = 2
-        self.table = np.random.random((17, self.emb_size)).astype("float32")
-        self.ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]],
-                             [[16], [1]]]).astype("int64")
-        ids_expand = np.expand_dims(self.ids, axis=1)
-        self.lod = [[3, 1]]
-        self.attrs = {'is_sparse': True}
-        self.inputs = {'W': self.table, 'Ids': (ids_expand, self.lod)}
-        self.outputs = {
-            'Out': np.reshape(
-                np.array([
-                    self.table[[4, 3]] + self.table[[4, 3]] +
-                    self.table[[2, 1]], self.table[[16, 1]]
-                ]), [len(self.lod[0]), 2 * self.emb_size])
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        if ver.mkl() == "ON" and 'Linux' in platform.platform():
-            self.attrs = {'is_sparse': False}
-            self.check_grad(['W'], 'Out', no_grad_set=('Ids'))
-
-
-class TestLookupTableOpWithPadding(TestFusedEmbeddingSeqPoolOp):
-    def test_check_output(self):
-        if ver.mkl() == "ON" and 'Linux' in platform.platform():
-            ids = np.squeeze(self.ids, axis=2)
-            padding_idx = np.random.choice(ids.flatten(), 1)[0]
-            output = list()
-            index = 0
-            for count in self.lod[0]:
-                arr = ids[index:count + index]
-                out = np.reshape(self.table[arr.flatten()],
-                                 [arr.shape[0], arr.shape[1], self.emb_size])
-                idx = np.argwhere(arr == padding_idx)
-                for item in idx:
-                    out[item[0], item[1], :] = np.zeros(self.emb_size)
-                output.append(np.sum(out, 0))
-                index += count
-            self.outputs = {
-                'Out': np.reshape(
-                    np.array(output), [len(self.lod[0]), 2 * self.emb_size])
-            }
-            self.attrs = {'padding_idx': int(padding_idx)}
-            self.check_output()
-
-    def test_check_grad(self):
-        if ver.mkl() == "ON" and 'Linux' in platform.platform():
-            ids = np.squeeze(self.ids, axis=2)
-            padding_idx = np.random.choice(ids.flatten(), 1)[0]
-            self.attrs = {'padding_idx': int(padding_idx), 'is_sparse': False}
-            self.check_grad(['W'], 'Out', no_grad_set=('Ids'))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
deleted file mode 100644
index 70ca521d3387ac11cd41d8496b4d094667232d4c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
+++ /dev/null
@@ -1,218 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_lstm_op import lstm, ACTIVATION
-
-
-def fc(x, w, b):
-    return np.dot(x, w) + b
-
-
-def fused_embedded_fc_lstm(
-        ids,  # T x 1
-        lod,  # 1 x N
-        embeddings=None,  # Dict_size x M
-        wx=None,  # M x 4D
-        bx=None,  # 1 x 4D
-        h0=None,  # N x D
-        c0=None,  # N x D
-        w_h=None,  # D x 4D
-        w_b=None,  # 1 x 4D
-        w_c=None,  # 1 x 3D
-        is_reverse=False,
-        act_gate=None,
-        act_cell=None,
-        act_cand=None):
-    # Make a lookup for embeddings and pass result into lstm reference
-    T = ids.shape[0]
-    M = embeddings.shape[1]
-    x = embeddings[ids].reshape([T, M])
-    return lstm(
-        fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
-        act_cell, act_cand)
-
-
-class TestFusionLSTMOp(OpTest):
-    def set_conf(self):
-        pass
-
-    def setUp(self):
-        self.op_type = 'fused_embedding_fc_lstm'
-        self.lod = [[2, 3, 5, 4]]
-        self.M = 8  # Embedding size
-        self.D = 16  # Hidden size 
-        self.dict_size = 18
-        self.has_initial_state = False
-        self.use_peepholes = False
-        self.is_reverse = False
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-        self.set_conf()
-
-        T = sum(self.lod[0])
-        bs = len(self.lod[0])
-
-        # this is the weight of fc
-        wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32')
-        # this is the bias of fc
-        bx = np.random.normal(size=(1, 4 * self.D)).astype('float32')
-
-        if self.use_peepholes:
-            b = np.random.normal(size=(1, 7 * self.D)).astype('float32')
-        else:
-            b = np.random.normal(size=(1, 4 * self.D)).astype('float32')
-        w_b = np.copy(b[:, 0:4 * self.D])
-        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
-
-        # low is 0 , high is voc_size - 1
-        ids = np.random.randint(
-            low=0, high=self.dict_size - 1, size=(T, 1)).astype("int64")
-        # embeddings as they were trained , so each entry is of M size
-        embeddings = np.random.random(
-            (self.dict_size, self.M)).astype("float32")
-
-        # multiply embeddings via Weights
-        fc_embeddings = np.dot(embeddings, wx)
-
-        # bias should be manually added into the bias of this fused embedding fc LSTM
-        b[0, 0:4 * self.D] += bx[0, :]
-        combined_biases = b[:, 0:4 * self.D]
-        # So let broadcast it , so they can be added
-        ones = np.ones([self.dict_size, 1])
-        broadcasted_biases = np.dot(ones, combined_biases)
-        # Sum biases with Wx*embeddings
-        fc_embeddings += broadcasted_biases
-
-        if self.has_initial_state:
-            h0 = np.random.normal(size=(bs, self.D)).astype('float32')
-            c0 = np.random.normal(size=(bs, self.D)).astype('float32')
-        else:
-            h0 = np.zeros((bs, self.D)).astype('float32')
-            c0 = np.zeros((bs, self.D)).astype('float32')
-
-        wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32')
-
-        h, c = fused_embedded_fc_lstm(
-            ids, self.lod, embeddings, wx, bx, h0, c0, wh, w_b, w_c,
-            self.is_reverse, ACTIVATION[self.act_gate],
-            ACTIVATION[self.act_cell], ACTIVATION[self.act_cand])
-
-        self.inputs = {
-            'Ids': (ids, self.lod),
-            'Embeddings': fc_embeddings,
-            'WeightH': wh,
-            'Bias': b
-        }
-
-        if self.has_initial_state:
-            self.inputs['H0'] = h0
-            self.inputs['C0'] = c0
-
-        self.outputs = {
-            'Hidden': (h, self.lod),
-            'Cell': (c, self.lod),
-        }
-        self.attrs = {
-            'use_peepholes': self.use_peepholes,
-            'is_reverse': self.is_reverse,
-            'gate_activation': self.act_gate,
-            'cell_activation': self.act_cell,
-            'candidate_activation': self.act_cand
-        }
-
-    def test_check_output(self):
-        for use_seq in {True, False}:
-            self.attrs['use_seq'] = use_seq
-            self.check_output()
-
-
-class TestFusionLSTMOpInit(TestFusionLSTMOp):
-    def set_conf(self):
-        self.has_initial_state = True
-
-
-class TestFusionLSTMOpReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.is_reverse = True
-
-
-class TestFusionLSTMOpInitReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.has_initial_state = True
-        self.is_reverse = True
-
-
-class TestFusionLSTMOpMD1(TestFusionLSTMOp):
-    def set_conf(self):
-        self.M = 36
-        self.D = 8
-
-
-class TestFusionLSTMOpMD2(TestFusionLSTMOp):
-    def set_conf(self):
-        self.M = 8
-        self.D = 8
-
-
-class TestFusionLSTMOpMD3(TestFusionLSTMOp):
-    def set_conf(self):
-        self.M = 15
-        self.D = 3
-
-
-class TestFusionLSTMOpBS1(TestFusionLSTMOp):
-    def set_conf(self):
-        self.lod = [[3]]
-        self.D = 16
-
-
-class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-
-
-class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-        self.has_initial_state = True
-
-
-class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-        self.is_reverse = True
-
-
-class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-        self.has_initial_state = True
-        self.is_reverse = True
-
-
-class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-        self.lod = [[2]]
-        self.D = 8
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_fc_elementwise_layernorm_op.py b/python/paddle/fluid/tests/unittests/test_fused_fc_elementwise_layernorm_op.py
deleted file mode 100644
index 9604201e04e1dc0e176fdb899275a9cadc325ad1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fused_fc_elementwise_layernorm_op.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from paddle.fluid import core
-from test_fc_op import fc_refer, MatrixGenerate
-from test_layer_norm_op import _reference_layer_norm_naive
-
-np.random.random(123)
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "Paddle core is not compiled with CUDA")
-class TestFusedFCElementwiseLayerNormOp(OpTest):
-    def config(self):
-        self.matrix = MatrixGenerate(1, 10, 15, 3, 3, 2)
-        self.y_shape = [1, 15]
-        self.begin_norm_axis = 1
-
-    def setUp(self):
-        self.op_type = "fused_fc_elementwise_layernorm"
-        self.config()
-
-        # Attr of layer_norm
-        epsilon = 0.00001
-
-        # fc
-        fc_out = fc_refer(self.matrix, True, True)
-        # elementwise_add
-        y = np.random.random_sample(self.y_shape).astype(np.float32)
-        add_out = fc_out + y
-        # layer_norm
-        scale_shape = [np.prod(self.y_shape[self.begin_norm_axis:])]
-        scale = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_1 = np.random.random_sample(scale_shape).astype(np.float32)
-        out, mean, variance = _reference_layer_norm_naive(
-            add_out, scale, bias_1, epsilon, self.begin_norm_axis)
-
-        self.inputs = {
-            "X": self.matrix.input,
-            "W": self.matrix.weights,
-            "Bias0": self.matrix.bias,
-            "Y": y,
-            "Scale": scale,
-            "Bias1": bias_1
-        }
-        self.attrs = {
-            "activation_type": "relu",
-            "epsilon": epsilon,
-            "begin_norm_axis": self.begin_norm_axis
-        }
-        self.outputs = {"Out": out, "Mean": mean, "Variance": variance}
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=2e-3)
-
-
-class TestFusedFCElementwiseLayerNormOp2(TestFusedFCElementwiseLayerNormOp):
-    def config(self):
-        self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1)
-        self.y_shape = [4, 6]
-        self.begin_norm_axis = 1
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
deleted file mode 100644
index 377454e7802e40f90c371987adfe50cce922c764..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+++ /dev/null
@@ -1,141 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-from op_test import OpTest
-from test_gru_op import gru
-from test_fusion_lstm_op import fc, ACTIVATION
-
-
-def fusion_gru(
-        x,  # T x M
-        lod,  # 1 x N
-        h0,  # N x D
-        wx,  # M x 3D
-        wh,  # D x 3D
-        bias,  # 1 x 3D
-        is_reverse,
-        act_state,
-        act_gate):
-    return gru(fc(x, wx, bias),
-               lod,
-               h0,
-               wh,
-               np.zeros(
-                   (1, wh.shape[1]), dtype='float32'),
-               is_reverse,
-               act_state,
-               act_gate)
-
-
-class TestFusionGRUOp(OpTest):
-    def set_confs(self):
-        pass
-
-    def setUp(self):
-        self.op_type = "fusion_gru"
-        self.lod = [[2, 4, 3]]
-        self.M = 3
-        self.D = 5
-        self.is_reverse = False
-        self.with_h0 = True
-        self.with_bias = True
-        self.act_state = 'tanh'
-        self.act_gate = 'sigmoid'
-        self.set_confs()
-
-        T = sum(self.lod[0])
-        N = len(self.lod[0])
-
-        x = np.random.rand(T, self.M).astype('float32')
-        wx = np.random.rand(self.M, 3 * self.D).astype('float32')
-        wh = np.random.rand(self.D, 3 * self.D).astype('float32')
-        bias = np.random.rand(
-            1, 3 * self.D).astype('float32') if self.with_bias else np.zeros(
-                (1, 3 * self.D), dtype='float32')
-        h0 = np.random.rand(
-            N, self.D).astype('float32') if self.with_h0 else np.zeros(
-                (N, self.D), dtype='float32')
-
-        _, _, _, hidden = fusion_gru(
-            x, self.lod, h0, wx, wh, bias, self.is_reverse,
-            ACTIVATION[self.act_state], ACTIVATION[self.act_gate])
-
-        self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh}
-
-        if self.with_bias:
-            self.inputs['Bias'] = bias
-
-        if self.with_h0:
-            self.inputs['H0'] = h0
-
-        self.outputs = {'Hidden': (hidden, self.lod)}
-
-        self.attrs = {
-            'activation': self.act_state,
-            'gate_activation': self.act_gate,
-            'is_reverse': self.is_reverse
-        }
-
-    def test_check_output(self):
-        for use_seq in {True, False}:
-            self.attrs['use_seq'] = use_seq
-            self.check_output()
-
-
-class TestFusionGRUOpNoInitial(TestFusionGRUOp):
-    def set_confs(self):
-        self.with_h0 = False
-
-
-class TestFusionGRUOpNoBias(TestFusionGRUOp):
-    def set_confs(self):
-        self.with_bias = False
-
-
-class TestFusionGRUOpReverse(TestFusionGRUOp):
-    def set_confs(self):
-        self.is_reverse = True
-
-
-class TestFusionGRUOpMD1(TestFusionGRUOp):
-    def set_confs(self):
-        self.M = 36
-        self.D = 8
-
-
-class TestFusionGRUOpMD2(TestFusionGRUOp):
-    def set_confs(self):
-        self.M = 8
-        self.D = 8
-
-
-class TestFusionGRUOpMD3(TestFusionGRUOp):
-    def set_confs(self):
-        self.M = 17
-        self.D = 15
-
-
-class TestFusionGRUOpBS1(TestFusionGRUOp):
-    def set_confs(self):
-        self.lod = [[3]]
-        self.D = 16
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
deleted file mode 100644
index de0c86f96db958eebd7e74346bec244f0c804ed9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+++ /dev/null
@@ -1,194 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_lstm_op import lstm, ACTIVATION
-
-
-def fc(x, w, b):
-    return np.dot(x, w) + b
-
-
-def fusion_lstm(
-        x,  # T x M
-        lod,  # 1 x N
-        wx=None,  # M x 4D
-        bx=None,  # 1 x 4D
-        h0=None,  # N x D
-        c0=None,  # N x D
-        w_h=None,  # D x 4D
-        w_b=None,  # 1 x 4D
-        w_c=None,  # 1 x 3D
-        is_reverse=False,
-        act_gate=None,
-        act_cell=None,
-        act_cand=None):
-    return lstm(
-        fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
-        act_cell, act_cand)
-
-
-class TestFusionLSTMOp(OpTest):
-    def set_conf(self):
-        pass
-
-    def setUp(self):
-        self.op_type = 'fusion_lstm'
-        self.lod = [[2, 3, 5, 4]]
-        self.M = 8
-        self.D = 16
-        self.has_initial_state = False
-        self.use_peepholes = False
-        self.is_reverse = False
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-        self.set_conf()
-
-        T = sum(self.lod[0])
-        bs = len(self.lod[0])
-
-        x = np.random.normal(size=(T, self.M)).astype('float32')
-        if self.has_initial_state:
-            h0 = np.random.normal(size=(bs, self.D)).astype('float32')
-            c0 = np.random.normal(size=(bs, self.D)).astype('float32')
-        else:
-            h0 = np.zeros((bs, self.D)).astype('float32')
-            c0 = np.zeros((bs, self.D)).astype('float32')
-
-        wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32')
-
-        if self.use_peepholes:
-            b = np.random.normal(size=(1, 7 * self.D)).astype('float32')
-        else:
-            b = np.random.normal(size=(1, 4 * self.D)).astype('float32')
-        w_b = np.copy(b[:, 0:4 * self.D])
-        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
-
-        # this is the weight of fc
-        wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32')
-        # this is the bias of fc
-        # and it should be manually added into the bias of this fusion LSTM
-        bx = np.random.normal(size=(1, 4 * self.D)).astype('float32')
-        b[0, 0:4 * self.D] += bx[0, :]
-        h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
-                           self.is_reverse, ACTIVATION[self.act_gate],
-                           ACTIVATION[self.act_cell], ACTIVATION[self.act_cand])
-
-        self.inputs = {
-            'X': (x, self.lod),
-            'WeightX': wx,
-            'WeightH': wh,
-            'Bias': b
-        }
-
-        if self.has_initial_state:
-            self.inputs['H0'] = h0
-            self.inputs['C0'] = c0
-
-        self.outputs = {
-            'Hidden': (h, self.lod),
-            'Cell': (c, self.lod),
-        }
-        self.attrs = {
-            'use_peepholes': self.use_peepholes,
-            'is_reverse': self.is_reverse,
-            'gate_activation': self.act_gate,
-            'cell_activation': self.act_cell,
-            'candidate_activation': self.act_cand
-        }
-
-    def test_check_output(self):
-        for use_seq in {True, False}:
-            self.attrs['use_seq'] = use_seq
-            self.check_output()
-
-
-class TestFusionLSTMOpInit(TestFusionLSTMOp):
-    def set_conf(self):
-        self.has_initial_state = True
-
-
-class TestFusionLSTMOpReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.is_reverse = True
-
-
-class TestFusionLSTMOpInitReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.has_initial_state = True
-        self.is_reverse = True
-
-
-class TestFusionLSTMOpMD1(TestFusionLSTMOp):
-    def set_conf(self):
-        self.M = 36
-        self.D = 8
-
-
-class TestFusionLSTMOpMD2(TestFusionLSTMOp):
-    def set_conf(self):
-        self.M = 8
-        self.D = 8
-
-
-class TestFusionLSTMOpMD3(TestFusionLSTMOp):
-    def set_conf(self):
-        self.M = 15
-        self.D = 3
-
-
-class TestFusionLSTMOpBS1(TestFusionLSTMOp):
-    def set_conf(self):
-        self.lod = [[3]]
-        self.D = 16
-
-
-class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-
-
-class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-        self.has_initial_state = True
-
-
-class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-        self.is_reverse = True
-
-
-class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-        self.has_initial_state = True
-        self.is_reverse = True
-
-
-class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_peepholes = True
-        self.lod = [[2]]
-        self.D = 8
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_repeated_fc_relu_op.py b/python/paddle/fluid/tests/unittests/test_fusion_repeated_fc_relu_op.py
deleted file mode 100644
index d21368fbf82797eec5f6b2fe9fc1cc6f592b6754..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_repeated_fc_relu_op.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_fc_op import fc_refer, MatrixGenerate
-
-
-class TestFusionRepeatedFCReluOp(OpTest):
-    def setUp(self):
-        self.bs = 3
-        self.ic = 9
-        self.oc = [2, 4, 3]
-        assert len(self.oc) > 1, 'Should larger than 1'
-        self.set_conf()
-        self.op_type = 'fusion_repeated_fc_relu'
-        sz = len(self.oc)
-        ics = [self.ic] + self.oc[0:sz - 1]
-        assert len(ics) == len(self.oc)
-        weights = []
-        biases = []
-        outs = []
-
-        i = 0
-        matrix = MatrixGenerate(self.bs, ics[i], self.oc[i], 1, 1)
-        inp = np.reshape(matrix.input, [self.bs, ics[i]])
-        weights.append(('W_{0}'.format(i), np.reshape(matrix.weights,
-                                                      [ics[i], self.oc[i]])))
-        biases.append(('B_{0}'.format(i), matrix.bias))
-        outs.append(
-            np.reshape(
-                np.maximum(fc_refer(matrix, True), 0), [self.bs, self.oc[i]]))
-
-        for i in range(sz - 1):
-            matrix = MatrixGenerate(self.bs, ics[i + 1], self.oc[i + 1], 1, 1)
-            matrix.input = np.reshape(outs[i], [self.bs, ics[i + 1], 1, 1])
-            out = fc_refer(matrix, True)
-            weights.append(
-                ('W_{0}'.format(i + 1),
-                 np.reshape(matrix.weights, [ics[i + 1], self.oc[i + 1]])))
-            biases.append(('B_{0}'.format(i + 1), matrix.bias))
-            outs.append(
-                np.reshape(np.maximum(out, 0), [self.bs, self.oc[i + 1]]))
-
-        relu_outs = []
-        for i in range(sz - 1):
-            relu_outs.append(('ReluOut_{0}'.format(i), outs[i]))
-
-        self.inputs = {
-            'X': inp,
-            'W': weights,
-            'Bias': biases,
-        }
-
-        self.outputs = {'Out': outs[-1], 'ReluOut': relu_outs}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def set_conf(self):
-        pass
-
-
-class TestFusionRepeatedFCReluOpBS1(TestFusionRepeatedFCReluOp):
-    def set_conf(self):
-        self.bs = 1
-        self.oc = [4, 2, 7, 5]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
deleted file mode 100644
index ba6f1415b1c832eb688443953866652e3458b172..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import random
-from op_test import OpTest
-from test_seq_conv import seqconv
-
-
-class TestSeqConvEltAddRelu(OpTest):
-    def set_conf(self):
-        pass
-
-    def setUp(self):
-        self.op_type = 'fusion_seqconv_eltadd_relu'
-        self.lod = [[6, 4]]
-        self.in_fea_size = 16
-        self.out_fea_size = 8
-        self.context_length = 4
-        self.context_stride = 1
-        self.context_start = 0
-        self.set_conf()
-
-        assert self.context_stride == 1
-
-        T = sum(self.lod[0])
-        x = np.random.uniform(-1, 1, [T, self.in_fea_size]).astype('float32')
-        w = np.random.uniform(
-            -1, 1, [self.in_fea_size * self.context_length,
-                    self.out_fea_size]).astype('float32')
-        b = np.random.uniform(-2, 1, [1, self.out_fea_size]).astype('float32')
-        out = seqconv(x, self.lod, w, self.context_length, self.context_start)
-        out = np.maximum(out + b, 0)
-
-        self.inputs = {'X': (x, self.lod), 'Filter': w, 'Bias': b}
-        self.attrs = {
-            'contextStart': self.context_start,
-            'contextLength': self.context_length,
-            'contextStride': self.context_stride
-        }
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSeqConvEltAddReluBS1(TestSeqConvEltAddRelu):
-    def set_conf(self):
-        self.lod = [[10]]
-
-
-class TestSeqConvEltAddReluBS1Case2(TestSeqConvEltAddRelu):
-    def set_conf(self):
-        self.lod = [[2]]
-
-
-class TestSeqConvEltAddReluCase1(TestSeqConvEltAddRelu):
-    def set_conf(self):
-        self.lod = [[3, 5, 1, 6]]
-        self.context_length = 3
-        self.context_start = -2
-
-
-class TestSeqConvEltAddReluCase2(TestSeqConvEltAddRelu):
-    def set_conf(self):
-        self.lod = [[10, 1, 2, 4, 1, 5, 6]]
-        self.in_fea_size = 2
-        self.context_length = 4
-        self.context_start = -1
-
-
-class TestSeqConvEltAddReluCase3(TestSeqConvEltAddRelu):
-    def set_conf(self):
-        self.lod = [[10, 1, 2, 4, 1, 5, 6]]
-        self.context_length = 5
-        self.context_start = -4
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py
deleted file mode 100644
index aeee3a9999a94b4979fc3793150101352e50be85..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_fusion_lstm_op import fc, ACTIVATION
-
-
-def fusion_seqexpand_concat_fc(xs, lod, w, b, fc_act):
-
-    T = sum(lod[0])
-    N = len(lod[0])
-    num_inputs = len(xs)
-    D = w.shape[1]
-
-    expanded_inputs = [xs[0]]
-    for i in range(num_inputs - 1):
-        x = xs[i + 1]
-        assert x.shape[0] == N
-        expanded = np.repeat(x, lod[0], axis=0)
-        assert expanded.shape[0] == T
-        assert expanded.shape[1] == x.shape[1]
-        expanded_inputs.append(expanded)
-
-    fc_input = np.concatenate(expanded_inputs, axis=1)
-    assert fc_input.shape[0] == T
-    assert fc_input.shape[1] == w.shape[0]
-    fc_out = fc(fc_input, w, b)
-    fc_out = fc_act(fc_out)
-    assert fc_out.shape[0] == T
-    assert fc_out.shape[1] == D
-    return fc_out
-
-
-class TestFusionSeqExpandConcatFCOp(OpTest):
-    def set_conf(self):
-        pass
-
-    def setUp(self):
-        self.op_type = 'fusion_seqexpand_concat_fc'
-        self.lod = [[3, 5, 8, 2]]
-        self.inputs_M = [15, 10, 10]
-        self.D = 20
-        self.with_bias = True
-        self.fc_act = 'relu'
-        self.set_conf()
-
-        T = sum(self.lod[0])
-        bs = len(self.lod[0])
-        num_inputs = len(self.inputs_M)
-
-        x0 = np.random.normal(size=(T, self.inputs_M[0])).astype('float32')
-        xs = [x0]
-        for i in range(num_inputs - 1):
-            xi = np.random.normal(size=(bs,
-                                        self.inputs_M[i + 1])).astype('float32')
-            xs.append(xi)
-
-        # fc weight and bias
-        w = np.random.normal(size=(sum(self.inputs_M),
-                                   self.D)).astype('float32')
-        b = np.random.normal(size=(
-            1, self.D)).astype('float32') if self.with_bias else np.zeros(
-                (1, self.D)).astype('float32')
-
-        out = fusion_seqexpand_concat_fc(xs, self.lod, w, b,
-                                         ACTIVATION[self.fc_act])
-
-        self.inputs = {'X': [('x0', (x0, self.lod))], 'FCWeight': w}
-        normal_lod = [[1] * bs]
-        for i in range(num_inputs - 1):
-            self.inputs['X'].append(('x%d' % (i + 1), (xs[i + 1], normal_lod)))
-
-        if self.with_bias:
-            self.inputs['FCBias'] = b
-
-        self.outputs = {'Out': (out, self.lod)}
-        self.attrs = {'fc_activation': self.fc_act}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFusionSECFCOpNonBias(TestFusionSeqExpandConcatFCOp):
-    def set_conf(self):
-        self.with_bias = False
-
-
-class TestFusionSECFCOpNonAct(TestFusionSeqExpandConcatFCOp):
-    def set_conf(self):
-        self.fc_act = 'identity'
-
-
-class TestFusionSECFCOpMD1(TestFusionSeqExpandConcatFCOp):
-    def set_conf(self):
-        self.inputs_M = [3, 4, 2, 1, 5]
-        self.D = 8
-
-
-class TestFusionSECFCOpMD2(TestFusionSeqExpandConcatFCOp):
-    def set_conf(self):
-        self.lod = [[5, 6]]
-        self.inputs_M = [1, 1]
-
-
-class TestFusionSECFCOpBS1_1(TestFusionSeqExpandConcatFCOp):
-    def set_conf(self):
-        self.lod = [[1]]
-        self.inputs_M = [3, 4, 2]
-
-
-class TestFusionSECFCOpBS1_2(TestFusionSeqExpandConcatFCOp):
-    def set_conf(self):
-        self.lod = [[1]]
-        self.inputs_M = [3, 4]
-
-
-class TestFusionSECFCOpBS1_3(TestFusionSeqExpandConcatFCOp):
-    def set_conf(self):
-        self.lod = [[5]]
-        self.inputs_M = [6, 3]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py
deleted file mode 100644
index 8a6837dae2c800ba7059f77978aa7bd3c2f50136..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py
+++ /dev/null
@@ -1,118 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_reorder_lod_tensor import convert_to_offset
-from test_seq_pool import compute_seqpool_sum, compute_seqpool_avg, compute_seqpool_sqrt
-
-
-class TestFusionSeqPoolConcatOp(OpTest):
-    def setUp(self):
-        self.w = 11
-        self.lods = [[[2, 3, 5]], [[1, 5, 2]]]
-        self.set_conf()
-        self.set_pooltype()
-        self.op_type = 'fusion_seqpool_concat'
-        self.axis = 1
-        bs = len(self.lods[0][0])
-        inputs = []
-        outs = []
-        i = 0
-        for lod in self.lods:
-            assert bs == len(lod[0]), 'All lod size should be equal'
-            x = np.random.uniform(0.1, 1,
-                                  [sum(lod[0]), self.w]).astype('float32')
-            offset = convert_to_offset(lod)
-            out = np.zeros((bs, self.w)).astype('float32')
-            if self.pooltype == "SUM":
-                compute_seqpool_sum(x, offset, out)
-            elif self.pooltype == "AVERAGE":
-                compute_seqpool_avg(x, offset, out)
-            elif self.pooltype == "SQRT":
-                compute_seqpool_sqrt(x, offset, out)
-            else:
-                raise Exception("Unsupported pool type!")
-            inputs.append(('x_{0}'.format(i), (x, lod)))
-            outs.append(out)
-            i = i + 1
-
-        self.inputs = {'X': inputs}
-        self.outputs = {'Out': np.concatenate(outs, axis=self.axis)}
-        self.attrs = {
-            'pooltype': self.pooltype,
-            'axis': self.axis,
-        }
-
-    def set_pooltype(self):
-        self.pooltype = "SUM"
-
-    def set_conf(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFusionSeqPoolConcatOpCase1(TestFusionSeqPoolConcatOp):
-    def set_conf(self):
-        self.lods = [[[1]]]
-
-
-class TestFusionSeqPoolConcatOpCase2(TestFusionSeqPoolConcatOp):
-    def set_conf(self):
-        self.lods = [[[1]], [[1]], [[1]]]
-
-
-class TestFusionSeqPoolConcatOpCase3(TestFusionSeqPoolConcatOp):
-    def set_conf(self):
-        self.lods = [[[1, 3, 4, 6]]]
-        self.w = 10
-
-
-class TestFusionSeqPoolConcatOpCase4(TestFusionSeqPoolConcatOp):
-    def set_conf(self):
-        self.lods = [[[2, 13, 4]], [[1, 1, 1]], [[5, 3, 1]], [[9, 10, 3]]]
-        self.w = 3
-
-
-## test avg pool and sqrt
-def create_test_avg_sqrt_class(parent):
-    class TestSeqPoolAvgCase(parent):
-        def set_pooltype(self):
-            self.pooltype = "AVERAGE"
-
-    class TestSeqPoolSqrtCase(parent):
-        def set_pooltype(self):
-            self.pooltype = "SQRT"
-
-    cls_name_avg = "{0}_{1}".format(parent.__name__, "avg")
-    cls_name_sqrt = "{0}_{1}".format(parent.__name__, "sqrt")
-    TestSeqPoolAvgCase.__name__ = cls_name_avg
-    TestSeqPoolSqrtCase.__name__ = cls_name_sqrt
-    globals()[cls_name_avg] = TestSeqPoolAvgCase
-    globals()[cls_name_sqrt] = TestSeqPoolSqrtCase
-
-
-create_test_avg_sqrt_class(TestFusionSeqPoolConcatOp)
-create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase1)
-create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase2)
-create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase3)
-create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase4)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py
deleted file mode 100644
index 332f48ae71a9cc7b64d6aa7641c1ef8db63bc3a4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_reorder_lod_tensor import convert_to_offset
-from test_seq_pool import compute_seqpool_sum, compute_seqpool_avg, compute_seqpool_sqrt
-from test_cvm_op import cvm_compute
-
-
-class TestFusionSeqPoolCVMConcatOp(OpTest):
-    def setUp(self):
-        self.w = 11
-        self.use_cvm = True
-        self.lods = [[[2, 3, 5]], [[1, 5, 2]]]
-        self.set_conf()
-        self.set_pooltype()
-        self.op_type = 'fusion_seqpool_cvm_concat'
-        self.axis = 1
-        bs = len(self.lods[0][0])
-        inputs = []
-        outs = []
-        # The cvm variable is not actually used.
-        cvm = np.array([[0.6, 0.4]]).astype("float32")
-        i = 0
-        for lod in self.lods:
-            assert bs == len(lod[0]), 'All lod size should be equal'
-            x = np.random.uniform(0.1, 1,
-                                  [sum(lod[0]), self.w]).astype('float32')
-            offset = convert_to_offset(lod)
-            out = np.zeros((bs, self.w)).astype('float32')
-            if self.pooltype == "SUM":
-                compute_seqpool_sum(x, offset, out)
-                out = cvm_compute(out, self.w, self.use_cvm)
-            elif self.pooltype == "AVERAGE":
-                compute_seqpool_avg(x, offset, out)
-                out = cvm_compute(out, self.w, self.use_cvm)
-            elif self.pooltype == "SQRT":
-                compute_seqpool_sqrt(x, offset, out)
-                out = cvm_compute(out, self.w, self.use_cvm)
-            else:
-                raise Exception("Unsupported pool type!")
-            inputs.append(('x_{0}'.format(i), (x, lod)))
-            outs.append(out)
-            i = i + 1
-
-        self.inputs = {'X': inputs, "CVM": cvm}
-        self.outputs = {'Out': np.concatenate(outs, axis=self.axis)}
-        self.attrs = {
-            'pooltype': self.pooltype,
-            'axis': self.axis,
-        }
-
-    def set_pooltype(self):
-        self.pooltype = "SUM"
-
-    def set_conf(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFusionSeqPoolCVMConcatOpCase1(TestFusionSeqPoolCVMConcatOp):
-    def set_conf(self):
-        self.lods = [[[1]]]
-
-
-class TestFusionSeqPoolCVMConcatOpCase2(TestFusionSeqPoolCVMConcatOp):
-    def set_conf(self):
-        self.lods = [[[1]], [[1]], [[1]]]
-
-
-class TestFusionSeqPoolCVMConcatOpCase3(TestFusionSeqPoolCVMConcatOp):
-    def set_conf(self):
-        self.lods = [[[1, 3, 4, 6]]]
-        self.w = 10
-
-
-class TestFusionSeqPoolCVMConcatOpCase4(TestFusionSeqPoolCVMConcatOp):
-    def set_conf(self):
-        self.lods = [[[2, 13, 4]], [[1, 1, 1]], [[5, 3, 1]], [[9, 10, 3]]]
-        self.w = 3
-
-
-## test avg pool and sqrt
-def create_test_avg_sqrt_class(parent):
-    class TestSeqPoolAvgCase(parent):
-        def set_pooltype(self):
-            self.pooltype = "AVERAGE"
-
-    class TestSeqPoolSqrtCase(parent):
-        def set_pooltype(self):
-            self.pooltype = "SQRT"
-
-    cls_name_avg = "{0}_{1}".format(parent.__name__, "avg")
-    cls_name_sqrt = "{0}_{1}".format(parent.__name__, "sqrt")
-    TestSeqPoolAvgCase.__name__ = cls_name_avg
-    TestSeqPoolSqrtCase.__name__ = cls_name_sqrt
-    globals()[cls_name_avg] = TestSeqPoolAvgCase
-    globals()[cls_name_sqrt] = TestSeqPoolSqrtCase
-
-
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOp)
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOpCase1)
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOpCase2)
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOpCase3)
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOpCase4)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_squared_mat_sub_op.py b/python/paddle/fluid/tests/unittests/test_fusion_squared_mat_sub_op.py
deleted file mode 100644
index a097d3d9a20f0b4b5dddf286f064d5698de35b5f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_squared_mat_sub_op.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestFusionSquaredMatSubOp(OpTest):
-    def setUp(self):
-        self.op_type = 'fusion_squared_mat_sub'
-        self.m = 11
-        self.n = 12
-        self.k = 4
-        self.scalar = 0.5
-        self.set_conf()
-        matx = np.random.random((self.m, self.k)).astype("float32")
-        maty = np.random.random((self.k, self.n)).astype("float32")
-
-        self.inputs = {'X': matx, 'Y': maty}
-        self.outputs = {
-            'Out':
-            (np.dot(matx, maty)**2 - np.dot(matx**2, maty**2)) * self.scalar
-        }
-        self.attrs = {'scalar': self.scalar, }
-
-    def set_conf(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFusionSquaredMatSubOpCase1(TestFusionSquaredMatSubOp):
-    def set_conf(self):
-        self.scalar = -0.3
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
deleted file mode 100644
index 4aa7f76495abc03646ced1f183731f30d50c4223..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-
-
-class TestFusionTransposeFlattenConcationOp(OpTest):
-    def setUp(self):
-        self.init_test_case()
-        self.op_type = "fusion_transpose_flatten_concat"
-
-        ins = []
-        flats = []
-        for i in range(len(self.shapes)):
-            in_shape = self.shapes[i]
-            a = np.random.random(in_shape).astype("float32")
-            ins.append(("x%d" % i, a))
-
-            b = a.transpose(self.trans_axis)
-            flat_shape = (np.prod(b.shape[:self.flatten_axis]),
-                          np.prod(b.shape[self.flatten_axis:]))
-            c = b.reshape(flat_shape)
-            flats.append(c)
-        out = np.concatenate(flats, axis=self.concat_axis)
-
-        self.inputs = {'X': ins}
-        self.attrs = {
-            'trans_axis': list(self.trans_axis),
-            'flatten_axis': self.flatten_axis,
-            'concat_axis': self.concat_axis
-        }
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, 1e-6)
-        else:
-            pass
-
-    def init_test_case(self):
-        self.shapes = [(3, 4, 17, 17), (3, 8, 7, 7), (3, 12, 5, 5)]
-        self.trans_axis = (0, 2, 3, 1)
-        self.flatten_axis = 1
-        self.concat_axis = 1
-
-
-class TestCase1(TestFusionTransposeFlattenConcationOp):
-    def init_test_case(self):
-        self.shapes = [(3, 4, 18, 17), (3, 8, 18, 7), (6, 12, 9, 5)]
-        self.trans_axis = (0, 2, 3, 1)
-        self.flatten_axis = 2
-        self.concat_axis = 1
-
-
-class TestCase2(TestFusionTransposeFlattenConcationOp):
-    def init_test_case(self):
-        self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
-        self.trans_axis = (0, 2, 3, 1)
-        self.flatten_axis = 2
-        self.concat_axis = 0
-
-
-class TestCase3(TestFusionTransposeFlattenConcationOp):
-    def init_test_case(self):
-        self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
-        self.trans_axis = (0, 3, 2, 1)
-        self.flatten_axis = 1
-        self.concat_axis = 1
-
-
-class TestCase4(TestFusionTransposeFlattenConcationOp):
-    def init_test_case(self):
-        self.shapes = [(3, 8, 9, 17), (8, 3, 9, 17), (4, 6, 9, 17)]
-        self.trans_axis = (0, 2, 1, 3)
-        self.flatten_axis = 3
-        self.concat_axis = 1
-
-
-class TestCase5(TestFusionTransposeFlattenConcationOp):
-    def init_test_case(self):
-        self.shapes = [(3, 8, 9, 17, 2), (3, 8, 2, 17, 9), (3, 17, 9, 8, 2)]
-        self.trans_axis = (0, 2, 1, 4, 3)
-        self.flatten_axis = 1
-        self.concat_axis = 1
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
deleted file mode 100644
index 357b2dc060742e545ce896b60373fe39badf4e3d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid as fluid
-
-
-class TestGatherNdOpWithEmptyIndex(OpTest):
-    """
-    Index has empty element, which means copy entire tensor
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        xnp = np.array(
-            [[65, 17, 2], [-14, -25, -1], [76, 22, 3]]).astype("float32")
-        self.inputs = {'X': xnp, 'Index': np.array([[], []]).astype("int32")}
-        self.outputs = {
-            'Out': np.vstack((xnp[np.newaxis, :], xnp[np.newaxis, :]))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestGatherNdOpWithLowIndex(OpTest):
-    """
-    Index has low rank, X has high rank
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        xnp = np.array(
-            [[65, 17, 2], [14, 25, 1], [76, 22, 3]]).astype("float32")
-        index = np.array([[1], [2]]).astype("int64")
-
-        self.inputs = {'X': xnp, 'Index': index}
-
-        self.outputs = {'Out': xnp[tuple(index.T)]}  #[[14, 25, 1], [76, 22, 3]]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestGatherNdOpWithSameIndexAsX(OpTest):
-    """
-    Index has same rank as X's rank
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        xnp = np.array(
-            [[65, 17, 2], [14, 25, 1], [76, 22, 3]]).astype("float64")
-        index = np.array([[1, 1], [2, 1]]).astype("int64")
-
-        self.inputs = {'X': xnp, 'Index': index}
-        self.outputs = {'Out': xnp[tuple(index.T)]}  #[25, 22]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestGatherNdOpWithHighRankSame(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) = Rank(X)
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        shape = (20, 9, 8, 1, 31)
-        xnp = np.random.rand(*shape)
-        index = np.vstack([np.random.randint(0, s, size=150) for s in shape]).T
-
-        self.inputs = {'X': xnp, 'Index': index.astype("int32")}
-        self.outputs = {'Out': xnp[tuple(index.T)]}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestGatherNdOpWithHighRankDiff(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) < Rank(X)
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        shape = (20, 9, 8, 1, 31)
-        xnp = np.random.rand(*shape).astype("double")
-        index = np.vstack([np.random.randint(0, s, size=1000) for s in shape]).T
-        index_re = index.reshape([10, 5, 20, 5])
-
-        self.inputs = {'X': xnp, 'Index': index_re.astype("int32")}
-        self.outputs = {'Out': xnp[tuple(index.T)].reshape([10, 5, 20])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-#Test Python API
-class TestGatherNdOpAPI(OpTest):
-    def test_case1(self):
-        x1 = fluid.layers.data(
-            name='x1', shape=[30, 40, 50, 60], dtype='float32')
-        index1 = fluid.layers.data(name='index1', shape=[2, 4], dtype='int32')
-        output1 = fluid.layers.gather_nd(x1, index1)
-
-    def test_case2(self):
-        x2 = fluid.layers.data(name='x2', shape=[30, 40, 50], dtype='float32')
-        index2 = fluid.layers.data(name='index2', shape=[2, 2], dtype='int64')
-        output2 = fluid.layers.gather_nd(x2, index2)
-
-    def test_case3(self):
-        x3 = fluid.layers.data(name='x3', shape=[3, 4, 5], dtype='float32')
-        index3 = fluid.layers.data(name='index3', shape=[2, 1], dtype='int32')
-        output3 = fluid.layers.gather_nd(x3, index3, name="gather_nd_layer")
-
-
-#Test Raise Index Error
-class TestGatherNdOpRaise(OpTest):
-    def test_check_raise(self):
-        def check_raise_is_test():
-            try:
-                x = fluid.layers.data(
-                    name='x', shape=[3, 4, 5], dtype='float32')
-                index = fluid.layers.data(
-                    name='index', shape=[2, 10], dtype='int32')
-                output = fluid.layers.gather_nd(x, index)
-            except Exception as e:
-                t = \
-                "Input(Index).shape[-1] should be no greater than Input(X).rank"
-                if t in str(e):
-                    raise IndexError
-
-        self.assertRaises(IndexError, check_raise_is_test)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
deleted file mode 100644
index 119f64ce7343819ff6c2f22e2d23c3900ac24691..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ /dev/null
@@ -1,110 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestGatherOp(OpTest):
-    def setUp(self):
-        self.op_type = "gather"
-        self.config()
-        xnp = np.random.random(self.x_shape).astype(self.x_type)
-        self.inputs = {
-            'X': xnp,
-            'Index': np.array(self.index).astype(self.index_type)
-        }
-        self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-    def config(self):
-        """
-        For multi-dimension input
-        """
-        self.x_shape = (10, 20)
-        self.x_type = "float32"
-        self.index = [1, 3, 5]
-        self.index_type = "int32"
-
-
-class TestCase1(TestGatherOp):
-    def config(self):
-        """
-        For one dimension input
-        """
-        self.x_shape = (10)
-        self.x_type = "float32"
-        self.index = [1, 3, 5]
-        self.index_type = "int32"
-
-
-class TestCase2(TestGatherOp):
-    def config(self):
-        """
-        For int64_t index type
-        """
-        self.x_shape = (10)
-        self.x_type = "float32"
-        self.index = [1, 3, 5]
-        self.index_type = "int64"
-
-
-class TestCase3(TestGatherOp):
-    def config(self):
-        """
-        For other input type
-        """
-        self.x_shape = (10, 20)
-        self.x_type = "double"
-        self.index = [1, 3, 5]
-        self.index_type = "int64"
-
-
-class TestCase4(TestGatherOp):
-    def config(self):
-        self.x_shape = (10, 20)
-        self.attrs = {'overwrite': False}
-        self.x_type = "double"
-        self.index = [1, 1]
-        self.index_type = "int32"
-
-
-class TestCase5(TestGatherOp):
-    def config(self):
-        self.x_shape = (10, 20)
-        self.attrs = {'overwrite': False}
-        self.x_type = "float"
-        self.index = [1, 1, 3]
-        self.index_type = "int32"
-
-
-class TestCase6(TestGatherOp):
-    def config(self):
-        self.x_shape = (10, 20)
-        self.attrs = {'overwrite': True}
-        self.x_type = "float"
-        self.index = [1, 3]
-        self.index_type = "int32"
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py
deleted file mode 100644
index 9a0631fa26a3e93c5c2115fd03a37de3fac46ce5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestGaussianRandomBatchSizeLike(OpTest):
-    def setUp(self):
-        self.op_type = "gaussian_random_batch_size_like"
-        self.inputs = {'Input': np.zeros((500, 2000), dtype="float32")}
-        self.attrs = {'mean': 1., 'std': 2., 'shape': [-1, 2000]}
-        self.outputs = {'Out': np.zeros((500, 2000), dtype='float32')}
-
-    def test_check_output(self):
-        self.check_output_customized(self.verify_output)
-
-    def verify_output(self, outs):
-        self.assertEqual(outs[0].shape, (500, 2000))
-        hist, _ = np.histogram(outs[0], range=(-3, 5))
-        hist = hist.astype("float32")
-        hist /= float(outs[0].size)
-        data = np.random.normal(size=(500, 2000), loc=1, scale=2)
-        hist2, _ = np.histogram(data, range=(-3, 5))
-        hist2 = hist2.astype("float32")
-        hist2 /= float(outs[0].size)
-        self.assertTrue(
-            np.allclose(
-                hist, hist2, rtol=0, atol=0.01),
-            "hist: " + str(hist) + " hist2: " + str(hist2))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
deleted file mode 100644
index 496aa4111056591efce14549011d66f9ae49713a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.executor import Executor
-
-
-class TestGaussianRandomOp(unittest.TestCase):
-    def setUp(self):
-        self.op_type = "gaussian_random"
-        self.inputs = {}
-        self.use_mkldnn = False
-        self.init_kernel_type()
-        self.attrs = {
-            "shape": [1000, 784],
-            "mean": .0,
-            "std": 1.,
-            "seed": 10,
-            "use_mkldnn": self.use_mkldnn
-        }
-
-        self.outputs = ["Out"]
-
-    def test_cpu(self):
-        self.gaussian_random_test(place=fluid.CPUPlace())
-
-    def test_gpu(self):
-        if core.is_compiled_with_cuda():
-            self.gaussian_random_test(place=fluid.CUDAPlace(0))
-
-    def gaussian_random_test(self, place):
-
-        program = fluid.Program()
-        block = program.global_block()
-        vout = block.create_var(name="Out")
-        op = block.append_op(
-            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
-
-        op.desc.infer_var_type(block.desc)
-        op.desc.infer_shape(block.desc)
-
-        fetch_list = []
-        for var_name in self.outputs:
-            fetch_list.append(block.var(var_name))
-
-        exe = Executor(place)
-        outs = exe.run(program, fetch_list=fetch_list)
-        tensor = outs[0]
-
-        self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
-        self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
-
-    def init_kernel_type(self):
-        pass
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py
deleted file mode 100644
index 1d7ce33ea7ca2c53dc2bb2a7048444c818d4f33f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py
+++ /dev/null
@@ -1,421 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-import six
-import paddle.fluid as fluid
-from op_test import OpTest
-'''
-# Equivalent code
-rles = mask_util.frPyObjects([segm], im_h, im_w)
-mask = mask_util.decode(rles)
-'''
-
-
-def decode(cnts, m):
-    v = 0
-    mask = []
-    for j in range(m):
-        for k in range(cnts[j]):
-            mask.append(v)
-        v = 1 - v
-    return mask
-
-
-def poly2mask(xy, k, h, w):
-    scale = 5.
-    x = [int(scale * p + 0.5) for p in xy[::2]]
-    x = x + [x[0]]
-    y = [int(scale * p + 0.5) for p in xy[1::2]]
-    y = y + [y[0]]
-    m = sum([
-        int(max(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1]))) + int(1)
-        for j in range(k)
-    ])
-
-    u, v = [], []
-    for j in range(k):
-        xs = x[j]
-        xe = x[j + 1]
-        ys = y[j]
-        ye = y[j + 1]
-        dx = abs(xe - xs)
-        dy = abs(ys - ye)
-        flip = (dx >= dy and xs > xe) or (dx < dy and ys > ye)
-        if flip:
-            xs, xe = xe, xs
-            ys, ye = ye, ys
-
-        if dx >= dy:
-            if (dx == 0): assert ye - ys == 0
-            s = 0 if dx == 0 else float(ye - ys) / dx
-        else:
-            if (dy == 0): assert xe - xs == 0
-            s = 0 if dy == 0 else float(xe - xs) / dy
-
-        if dx >= dy:
-            ts = [dx - d if flip else d for d in range(dx + 1)]
-            u.extend([xs + t for t in ts])
-            v.extend([int(ys + s * t + .5) for t in ts])
-        else:
-            ts = [dy - d if flip else d for d in range(dy + 1)]
-            v.extend([t + ys for t in ts])
-            u.extend([int(xs + s * t + .5) for t in ts])
-
-    k = len(u)
-    x = np.zeros((k), np.int)
-    y = np.zeros((k), np.int)
-    m = 0
-    for j in six.moves.xrange(1, k):
-        if u[j] != u[j - 1]:
-            xd = float(u[j] if (u[j] < u[j - 1]) else (u[j] - 1))
-            xd = (xd + .5) / scale - .5
-            if (math.floor(xd) != xd or xd < 0 or xd > (w - 1)):
-                continue
-            yd = float(v[j] if v[j] < v[j - 1] else v[j - 1])
-            yd = (yd + .5) / scale - .5
-            yd = math.ceil(0 if yd < 0 else (h if yd > h else yd))
-            x[m] = int(xd)
-            y[m] = int(yd)
-            m += 1
-    k = m
-    a = [int(x[i] * h + y[i]) for i in range(k)]
-    a.append(h * w)
-    a.sort()
-    b = [0] + a[:len(a) - 1]
-    a = [c - d for (c, d) in zip(a, b)]
-
-    k += 1
-    b = [0 for i in range(k)]
-    b[0] = a[0]
-    m, j = 1, 1
-    while (j < k):
-        if a[j] > 0:
-            b[m] = a[j]
-            m += 1
-            j += 1
-        else:
-            j += 1
-            if (j < k):
-                b[m - 1] += a[j]
-                j += 1
-    mask = decode(b, m)
-    mask = np.array(mask, dtype=np.int).reshape((w, h))
-    mask = mask.transpose((1, 0))
-    return mask
-
-
-def polys_to_boxes(polys):
-    """Convert a list of polygons into an array of tight bounding boxes."""
-    boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32)
-    for i in range(len(polys)):
-        poly = polys[i]
-        x0 = min(min(p[::2]) for p in poly)
-        x1 = max(max(p[::2]) for p in poly)
-        y0 = min(min(p[1::2]) for p in poly)
-        y1 = max(max(p[1::2]) for p in poly)
-        boxes_from_polys[i, :] = [x0, y0, x1, y1]
-    return boxes_from_polys
-
-
-def bbox_overlaps(boxes, query_boxes):
-    N = boxes.shape[0]
-    K = query_boxes.shape[0]
-    overlaps = np.zeros((N, K), dtype=boxes.dtype)
-    for k in range(K):
-        box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) *\
-                   (query_boxes[k, 3] - query_boxes[k, 1] + 1)
-        for n in range(N):
-            iw = min(boxes[n, 2], query_boxes[k, 2]) -\
-                 max(boxes[n, 0], query_boxes[k, 0]) + 1
-            if iw > 0:
-                ih = min(boxes[n, 3], query_boxes[k, 3]) -\
-                     max(boxes[n, 1], query_boxes[k, 1]) + 1
-                if ih > 0:
-                    ua = float(
-                         (boxes[n, 2] - boxes[n, 0] + 1) *\
-                         (boxes[n, 3] - boxes[n, 1] + 1) +\
-                         box_area - iw * ih)
-                    overlaps[n, k] = iw * ih / ua
-    return overlaps
-
-
-def polys_to_mask_wrt_box(polygons, box, M):
-    """Convert from the COCO polygon segmentation format to a binary mask
-    encoded as a 2D array of data type numpy.float32. The polygon segmentation
-    is understood to be enclosed in the given box and rasterized to an M x M
-    mask. The resulting mask is therefore of shape (M, M).
-    """
-    w = box[2] - box[0]
-    h = box[3] - box[1]
-
-    w = np.maximum(w, 1)
-    h = np.maximum(h, 1)
-
-    polygons_norm = []
-    for poly in polygons:
-        p = np.array(poly, dtype=np.float32)
-        p[0::2] = (p[0::2] - box[0]) * M / w
-        p[1::2] = (p[1::2] - box[1]) * M / h
-        polygons_norm.append(p)
-
-    mask = []
-    for polygons in polygons_norm:
-        assert polygons.shape[0] % 2 == 0
-        k = polygons.shape[0] // 2
-        mask.append(poly2mask(polygons, k, M, M))
-    mask = np.array(mask)
-    # Flatten in case polygons was a list
-    mask = np.sum(mask, axis=0)
-    mask = np.array(mask > 0, dtype=np.float32)
-    return mask
-
-
-def expand_mask_targets(masks, mask_class_labels, resolution, num_classes):
-    """Expand masks from shape (#masks, resolution ** 2)
-    to (#masks, #classes * resolution ** 2) to encode class
-    specific mask targets.
-    """
-    assert masks.shape[0] == mask_class_labels.shape[0]
-
-    # Target values of -1 are "don't care" / ignore labels
-    mask_targets = -np.ones(
-        (masks.shape[0], num_classes * resolution**2), dtype=np.int32)
-    for i in range(masks.shape[0]):
-        cls = int(mask_class_labels[i])
-        start = resolution**2 * cls
-        end = start + resolution**2
-        # Ignore background instance
-        # (only happens when there is no fg samples in an image)
-        if cls > 0:
-            mask_targets[i, start:end] = masks[i, :]
-    return mask_targets
-
-
-def generate_mask_labels(num_classes, im_info, gt_classes, is_crowd,
-                         label_int32, gt_polys, resolution, rois, roi_lod,
-                         gt_lod):
-    mask_rois = []
-    roi_has_mask_int32 = []
-    mask_int32 = []
-    new_lod = []
-    for i in range(len(im_info)):
-        roi_s = roi_lod[i]
-        roi_e = roi_lod[i + 1]
-        gt_s = gt_lod[i]
-        gt_e = gt_lod[i + 1]
-        mask_blob = _sample_mask(num_classes, im_info[i], gt_classes[gt_s:gt_e],
-                                 is_crowd[gt_s:gt_e], label_int32[roi_s:roi_e],
-                                 gt_polys[i], resolution, rois[roi_s:roi_e])
-        new_lod.append(mask_blob['mask_rois'].shape[0])
-        mask_rois.append(mask_blob['mask_rois'])
-        roi_has_mask_int32.append(mask_blob['roi_has_mask_int32'])
-        mask_int32.append(mask_blob['mask_int32'])
-    return mask_rois, roi_has_mask_int32, mask_int32, new_lod
-
-
-def _sample_mask(
-        num_classes,
-        im_info,
-        gt_classes,
-        is_crowd,
-        label_int32,
-        gt_polys,  # [[[], []], []]
-        resolution,
-        rois):
-    mask_blob = {}
-    im_scale = im_info[2]
-    sample_boxes = rois
-    polys_gt_inds = np.where((gt_classes > 0) & (is_crowd == 0))[0]
-    polys_gt = [gt_polys[i] for i in polys_gt_inds]
-    boxes_from_polys = polys_to_boxes(polys_gt)
-
-    fg_inds = np.where(label_int32 > 0)[0]
-    roi_has_mask = fg_inds.copy()
-    if fg_inds.shape[0] > 0:
-        mask_class_labels = label_int32[fg_inds]
-        masks = np.zeros((fg_inds.shape[0], resolution**2), dtype=np.int32)
-        rois_fg = sample_boxes[fg_inds]
-        overlaps_bbfg_bbpolys = bbox_overlaps(
-            rois_fg.astype(np.float32), boxes_from_polys.astype(np.float32))
-        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)
-        for i in range(rois_fg.shape[0]):
-            fg_polys_ind = fg_polys_inds[i]
-            poly_gt = polys_gt[fg_polys_ind]
-            roi_fg = rois_fg[i]
-            mask = polys_to_mask_wrt_box(poly_gt, roi_fg, resolution)
-            mask = np.array(mask > 0, dtype=np.int32)
-            masks[i, :] = np.reshape(mask, resolution**2)
-    else:
-        bg_inds = np.where(label_int32 == 0)[0]
-        rois_fg = sample_boxes[bg_inds[0]].reshape((1, -1))
-        masks = -np.ones((1, resolution**2), dtype=np.int32)
-        mask_class_labels = np.zeros((1, ))
-        roi_has_mask = np.append(roi_has_mask, 0)
-    masks = expand_mask_targets(masks, mask_class_labels, resolution,
-                                num_classes)
-    rois_fg *= im_scale
-    mask_blob['mask_rois'] = rois_fg
-    mask_blob['roi_has_mask_int32'] = roi_has_mask
-    mask_blob['mask_int32'] = masks
-    return mask_blob
-
-
-def trans_lod(lod):
-    new_lod = [0]
-    for i in range(len(lod)):
-        new_lod.append(lod[i] + new_lod[i])
-    return new_lod
-
-
-class TestGenerateMaskLabels(OpTest):
-    def set_data(self):
-        self.init_test_case()
-        self.make_generate_proposal_labels_out()
-        self.generate_gt_polys()
-        self.generate_groundtruth()
-        self.init_test_output()
-        self.inputs = {
-            'ImInfo': self.im_info,
-            'GtClasses': (self.gt_classes.astype(np.int32), self.gt_lod),
-            'IsCrowd': (self.is_crowd.astype(np.int32), self.gt_lod),
-            'LabelsInt32': (self.label_int32.astype(np.int32), self.rois_lod),
-            'GtSegms': (self.gt_polys.astype(np.float32), self.masks_lod),
-            'Rois': (self.rois.astype(np.float32), self.rois_lod)
-        }
-        self.attrs = {
-            'num_classes': self.num_classes,
-            'resolution': self.resolution
-        }
-        self.outputs = {
-            'MaskRois': (self.mask_rois, [self.new_lod]),
-            'RoiHasMaskInt32': (self.roi_has_mask_int32, [self.new_lod]),
-            'MaskInt32': (self.mask_int32, [self.new_lod])
-        }
-
-    def init_test_case(self):
-        self.num_classes = 81
-        self.resolution = 14
-        self.batch_size = 2
-        self.batch_size_per_im = 64
-        self.images_shape = [100, 200]
-        np.random.seed(0)
-
-    def make_generate_proposal_labels_out(self):
-        rois = []
-        self.rois_lod = [[]]
-        self.label_int32 = []
-        for bno in range(self.batch_size):
-            self.rois_lod[0].append(self.batch_size_per_im)
-            for i in range(self.batch_size_per_im):
-                xywh = np.random.rand(4)
-                xy1 = xywh[0:2] * 2
-                wh = xywh[2:4] * (self.images_shape[0] - xy1)
-                xy2 = xy1 + wh
-                roi = [xy1[0], xy1[1], xy2[0], xy2[1]]
-                rois.append(roi)
-        self.rois = np.array(rois).astype("float32")
-        for idx, roi_num in enumerate(self.rois_lod[0]):
-            for roi_id in range(roi_num):
-                class_id = np.random.random_integers(self.num_classes - 1)
-                if idx == 0:
-                    # set an image with no foreground, to test the empty case
-                    self.label_int32.append(0)
-                else:
-                    self.label_int32.append(class_id)
-        label_np = np.array(self.label_int32)
-        self.label_int32 = label_np[:, np.newaxis]
-
-    def generate_gt_polys(self):
-        h, w = self.images_shape[0:2]
-        self.gt_polys = []
-        self.gt_polys_list = []
-        max_gt = 4
-        max_poly_num = 5
-        min_poly_size = 4
-        max_poly_size = 16
-        lod0 = []
-        lod1 = []
-        lod2 = []
-        for i in range(self.batch_size):
-            gt_num = np.random.randint(1, high=max_gt, size=1)[0]
-            lod0.append(gt_num)
-            ptss = []
-            for i in range(gt_num):
-                poly_num = np.random.randint(1, max_poly_num, size=1)[0]
-                lod1.append(poly_num)
-                pts = []
-                for j in range(poly_num):
-                    poly_size = np.random.randint(
-                        min_poly_size, max_poly_size, size=1)[0]
-                    x = np.random.rand(poly_size, 1) * w
-                    y = np.random.rand(poly_size, 1) * h
-                    xy = np.concatenate((x, y), axis=1)
-                    pts.append(xy.flatten().tolist())
-                    self.gt_polys.extend(xy.flatten().tolist())
-                    lod2.append(poly_size)
-                ptss.append(pts)
-            self.gt_polys_list.append(ptss)
-        self.masks_lod = [lod0, lod1, lod2]
-        self.gt_lod = [lod0]
-        self.gt_polys = np.array(self.gt_polys).astype('float32').reshape(-1, 2)
-
-    def generate_groundtruth(self):
-        self.im_info = []
-        self.gt_classes = []
-        self.is_crowd = []
-        for roi_num in self.gt_lod[0]:
-            self.im_info.append(self.images_shape + [1.0])
-            for roi_id in range(roi_num):
-                class_id = np.random.random_integers(self.num_classes - 1)
-                self.gt_classes.append(class_id)
-                self.is_crowd.append(0)
-        self.im_info = np.array(self.im_info).astype(np.float32)
-        gt_classes_np = np.array(self.gt_classes)
-        self.gt_classes = gt_classes_np[:, np.newaxis]
-        is_crowd_np = np.array(self.is_crowd)
-        self.is_crowd = is_crowd_np[:, np.newaxis]
-
-    def init_test_output(self):
-        roi_lod = trans_lod(self.rois_lod[0])
-        gt_lod = trans_lod(self.gt_lod[0])
-        outs = generate_mask_labels(self.num_classes, self.im_info,
-                                    self.gt_classes, self.is_crowd,
-                                    self.label_int32, self.gt_polys_list,
-                                    self.resolution, self.rois, roi_lod, gt_lod)
-        self.mask_rois = outs[0]
-        self.roi_has_mask_int32 = outs[1]
-        self.mask_int32 = outs[2]
-        self.new_lod = outs[3]
-
-        self.mask_rois = np.vstack(self.mask_rois)
-        self.roi_has_mask_int32 = np.hstack(self.roi_has_mask_int32)[:,
-                                                                     np.newaxis]
-        self.mask_int32 = np.vstack(self.mask_int32)
-
-    def setUp(self):
-        self.op_type = "generate_mask_labels"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
deleted file mode 100644
index 406c255970a52d50c14efb685f55c89947958339..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
+++ /dev/null
@@ -1,357 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-import paddle.fluid as fluid
-from op_test import OpTest
-
-
-def generate_proposal_labels_in_python(
-        rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, batch_size_per_im,
-        fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
-        class_nums, is_cls_agnostic, is_cascade_rcnn):
-    rois = []
-    labels_int32 = []
-    bbox_targets = []
-    bbox_inside_weights = []
-    bbox_outside_weights = []
-    lod = []
-    assert len(rpn_rois) == len(
-        im_info), 'batch size of rpn_rois and ground_truth is not matched'
-
-    for im_i in range(len(im_info)):
-        frcn_blobs = _sample_rois(rpn_rois[im_i], gt_classes[im_i],
-                                  is_crowd[im_i], gt_boxes[im_i], im_info[im_i],
-                                  batch_size_per_im, fg_fraction, fg_thresh,
-                                  bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
-                                  class_nums, is_cls_agnostic, is_cascade_rcnn)
-        lod.append(frcn_blobs['rois'].shape[0])
-        rois.append(frcn_blobs['rois'])
-        labels_int32.append(frcn_blobs['labels_int32'])
-        bbox_targets.append(frcn_blobs['bbox_targets'])
-        bbox_inside_weights.append(frcn_blobs['bbox_inside_weights'])
-        bbox_outside_weights.append(frcn_blobs['bbox_outside_weights'])
-
-    return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod
-
-
-def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
-                 batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
-                 bg_thresh_lo, bbox_reg_weights, class_nums, is_cls_agnostic,
-                 is_cascade_rcnn):
-    rois_per_image = int(batch_size_per_im)
-    fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))
-
-    # Roidb
-    im_scale = im_info[2]
-    inv_im_scale = 1. / im_scale
-    rpn_rois = rpn_rois * inv_im_scale
-    if is_cascade_rcnn:
-        rpn_rois = rpn_rois[gt_boxes.shape[0]:, :]
-    boxes = np.vstack([gt_boxes, rpn_rois])
-    gt_overlaps = np.zeros((boxes.shape[0], class_nums))
-    box_to_gt_ind_map = np.zeros((boxes.shape[0]), dtype=np.int32)
-    if len(gt_boxes) > 0:
-        proposal_to_gt_overlaps = _bbox_overlaps(boxes, gt_boxes)
-
-        overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1)
-        overlaps_max = proposal_to_gt_overlaps.max(axis=1)
-        # Boxes which with non-zero overlap with gt boxes
-        overlapped_boxes_ind = np.where(overlaps_max > 0)[0]
-        overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[
-            overlapped_boxes_ind]]
-        gt_overlaps[overlapped_boxes_ind,
-                    overlapped_boxes_gt_classes] = overlaps_max[
-                        overlapped_boxes_ind]
-        box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[
-            overlapped_boxes_ind]
-
-    crowd_ind = np.where(is_crowd)[0]
-    gt_overlaps[crowd_ind] = -1
-
-    max_overlaps = gt_overlaps.max(axis=1)
-    max_classes = gt_overlaps.argmax(axis=1)
-
-    # Cascade RCNN Decode Filter
-    if is_cascade_rcnn:
-        ws = boxes[:, 2] - boxes[:, 0] + 1
-        hs = boxes[:, 3] - boxes[:, 1] + 1
-        keep = np.where((ws > 0) & (hs > 0))[0]
-        boxes = boxes[keep]
-        fg_inds = np.where(max_overlaps >= fg_thresh)[0]
-        bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
-                                                            bg_thresh_lo))[0]
-        fg_rois_per_this_image = fg_inds.shape[0]
-        bg_rois_per_this_image = bg_inds.shape[0]
-    else:
-        # Foreground
-        fg_inds = np.where(max_overlaps >= fg_thresh)[0]
-        fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0])
-        # Sample foreground if there are too many
-        if fg_inds.shape[0] > fg_rois_per_this_image:
-            fg_inds = np.random.choice(
-                fg_inds, size=fg_rois_per_this_image, replace=False)
-        fg_inds = fg_inds[:fg_rois_per_this_image]
-        # Background
-        bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
-                                                            bg_thresh_lo))[0]
-        bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
-        bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
-                                            bg_inds.shape[0])
-        # Sample background if there are too many
-        if bg_inds.shape[0] > bg_rois_per_this_image:
-            bg_inds = np.random.choice(
-                bg_inds, size=bg_rois_per_this_image, replace=False)
-        bg_inds = bg_inds[:bg_rois_per_this_image]
-
-    keep_inds = np.append(fg_inds, bg_inds)
-    sampled_labels = max_classes[keep_inds]
-    sampled_labels[fg_rois_per_this_image:] = 0
-    sampled_boxes = boxes[keep_inds]
-    sampled_gts = gt_boxes[box_to_gt_ind_map[keep_inds]]
-    sampled_gts[fg_rois_per_this_image:, :] = gt_boxes[0]
-    bbox_label_targets = _compute_targets(sampled_boxes, sampled_gts,
-                                          sampled_labels, bbox_reg_weights)
-    bbox_targets, bbox_inside_weights = _expand_bbox_targets(
-        bbox_label_targets, class_nums, is_cls_agnostic)
-    bbox_outside_weights = np.array(
-        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)
-    # Scale rois
-    sampled_rois = sampled_boxes * im_scale
-
-    # Faster RCNN blobs
-    frcn_blobs = dict(
-        rois=sampled_rois,
-        labels_int32=sampled_labels,
-        bbox_targets=bbox_targets,
-        bbox_inside_weights=bbox_inside_weights,
-        bbox_outside_weights=bbox_outside_weights)
-    return frcn_blobs
-
-
-def _bbox_overlaps(roi_boxes, gt_boxes):
-    w1 = np.maximum(roi_boxes[:, 2] - roi_boxes[:, 0] + 1, 0)
-    h1 = np.maximum(roi_boxes[:, 3] - roi_boxes[:, 1] + 1, 0)
-    w2 = np.maximum(gt_boxes[:, 2] - gt_boxes[:, 0] + 1, 0)
-    h2 = np.maximum(gt_boxes[:, 3] - gt_boxes[:, 1] + 1, 0)
-    area1 = w1 * h1
-    area2 = w2 * h2
-
-    overlaps = np.zeros((roi_boxes.shape[0], gt_boxes.shape[0]))
-    for ind1 in range(roi_boxes.shape[0]):
-        for ind2 in range(gt_boxes.shape[0]):
-            inter_x1 = np.maximum(roi_boxes[ind1, 0], gt_boxes[ind2, 0])
-            inter_y1 = np.maximum(roi_boxes[ind1, 1], gt_boxes[ind2, 1])
-            inter_x2 = np.minimum(roi_boxes[ind1, 2], gt_boxes[ind2, 2])
-            inter_y2 = np.minimum(roi_boxes[ind1, 3], gt_boxes[ind2, 3])
-            inter_w = np.maximum(inter_x2 - inter_x1 + 1, 0)
-            inter_h = np.maximum(inter_y2 - inter_y1 + 1, 0)
-            inter_area = inter_w * inter_h
-            iou = inter_area / (area1[ind1] + area2[ind2] - inter_area)
-            overlaps[ind1, ind2] = iou
-    return overlaps
-
-
-def _compute_targets(roi_boxes, gt_boxes, labels, bbox_reg_weights):
-    assert roi_boxes.shape[0] == gt_boxes.shape[0]
-    assert roi_boxes.shape[1] == 4
-    assert gt_boxes.shape[1] == 4
-
-    targets = np.zeros(roi_boxes.shape)
-    bbox_reg_weights = np.asarray(bbox_reg_weights)
-    targets = _box_to_delta(
-        ex_boxes=roi_boxes, gt_boxes=gt_boxes, weights=bbox_reg_weights)
-
-    return np.hstack([labels[:, np.newaxis], targets]).astype(
-        np.float32, copy=False)
-
-
-def _box_to_delta(ex_boxes, gt_boxes, weights):
-    ex_w = ex_boxes[:, 2] - ex_boxes[:, 0] + 1
-    ex_h = ex_boxes[:, 3] - ex_boxes[:, 1] + 1
-    ex_ctr_x = ex_boxes[:, 0] + 0.5 * ex_w
-    ex_ctr_y = ex_boxes[:, 1] + 0.5 * ex_h
-
-    gt_w = gt_boxes[:, 2] - gt_boxes[:, 0] + 1
-    gt_h = gt_boxes[:, 3] - gt_boxes[:, 1] + 1
-    gt_ctr_x = gt_boxes[:, 0] + 0.5 * gt_w
-    gt_ctr_y = gt_boxes[:, 1] + 0.5 * gt_h
-
-    dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
-    dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
-    dw = (np.log(gt_w / ex_w)) / weights[2]
-    dh = (np.log(gt_h / ex_h)) / weights[3]
-
-    targets = np.vstack([dx, dy, dw, dh]).transpose()
-    return targets
-
-
-def _expand_bbox_targets(bbox_targets_input, class_nums, is_cls_agnostic):
-    class_labels = bbox_targets_input[:, 0]
-    fg_inds = np.where(class_labels > 0)[0]
-    #if is_cls_agnostic:
-    #	class_labels = [1 if ll > 0 else 0 for ll in class_labels]
-    #    class_labels = np.array(class_labels, dtype=np.int32)
-    #	class_nums = 2
-    bbox_targets = np.zeros((class_labels.shape[0], 4 * class_nums
-                             if not is_cls_agnostic else 4 * 2))
-    bbox_inside_weights = np.zeros(bbox_targets.shape)
-    for ind in fg_inds:
-        class_label = int(class_labels[ind]) if not is_cls_agnostic else 1
-        start_ind = class_label * 4
-        end_ind = class_label * 4 + 4
-        bbox_targets[ind, start_ind:end_ind] = bbox_targets_input[ind, 1:]
-        bbox_inside_weights[ind, start_ind:end_ind] = (1.0, 1.0, 1.0, 1.0)
-    return bbox_targets, bbox_inside_weights
-
-
-class TestGenerateProposalLabelsOp(OpTest):
-    def set_data(self):
-        self.init_test_params()
-        self.init_test_input()
-        self.init_test_output()
-        self.inputs = {
-            'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod),
-            'GtClasses': (self.gt_classes[0], self.gts_lod),
-            'IsCrowd': (self.is_crowd[0], self.gts_lod),
-            'GtBoxes': (self.gt_boxes[0], self.gts_lod),
-            'ImInfo': self.im_info
-        }
-        self.attrs = {
-            'batch_size_per_im': self.batch_size_per_im,
-            'fg_fraction': self.fg_fraction,
-            'fg_thresh': self.fg_thresh,
-            'bg_thresh_hi': self.bg_thresh_hi,
-            'bg_thresh_lo': self.bg_thresh_lo,
-            'bbox_reg_weights': self.bbox_reg_weights,
-            'class_nums': self.class_nums,
-            'use_random': False,
-            'is_cls_agnostic': self.is_cls_agnostic,
-            'is_cascade_rcnn': self.is_cascade_rcnn
-        }
-        self.outputs = {
-            'Rois': (self.rois, [self.lod]),
-            'LabelsInt32': (self.labels_int32, [self.lod]),
-            'BboxTargets': (self.bbox_targets, [self.lod]),
-            'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]),
-            'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]),
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = 'generate_proposal_labels'
-        self.set_data()
-
-    def init_test_params(self):
-        self.batch_size_per_im = 512
-        self.fg_fraction = 0.25
-        self.fg_thresh = 0.5
-        self.bg_thresh_hi = 0.5
-        self.bg_thresh_lo = 0.0
-        self.bbox_reg_weights = [0.1, 0.1, 0.2, 0.2]
-        #self.class_nums = 81
-        self.is_cls_agnostic = False  #True
-        self.is_cascade_rcnn = True
-        self.class_nums = 2 if self.is_cls_agnostic else 81
-
-    def init_test_input(self):
-        np.random.seed(0)
-        gt_nums = 6  # Keep same with batch_size_per_im for unittest
-        proposal_nums = 2000 if not self.is_cascade_rcnn else 512  #self.batch_size_per_im - gt_nums
-        images_shape = [[64, 64]]
-        self.im_info = np.ones((len(images_shape), 3)).astype(np.float32)
-        for i in range(len(images_shape)):
-            self.im_info[i, 0] = images_shape[i][0]
-            self.im_info[i, 1] = images_shape[i][1]
-            self.im_info[i, 2] = 0.8  #scale
-
-        self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape,
-                                                               proposal_nums)
-        ground_truth, self.gts_lod = _generate_groundtruth(
-            images_shape, self.class_nums, gt_nums)
-        self.gt_classes = [gt['gt_classes'] for gt in ground_truth]
-        self.gt_boxes = [gt['boxes'] for gt in ground_truth]
-        self.is_crowd = [gt['is_crowd'] for gt in ground_truth]
-
-    def init_test_output(self):
-        self.rois, self.labels_int32, self.bbox_targets, \
-        self.bbox_inside_weights, self.bbox_outside_weights, \
-        self.lod = generate_proposal_labels_in_python(
-                self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info,
-                self.batch_size_per_im, self.fg_fraction,
-                self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo,
-                self.bbox_reg_weights, self.class_nums,
-                self.is_cls_agnostic, self.is_cascade_rcnn
-            )
-        self.rois = np.vstack(self.rois)
-        self.labels_int32 = np.hstack(self.labels_int32)
-        self.labels_int32 = self.labels_int32[:, np.newaxis]
-        self.bbox_targets = np.vstack(self.bbox_targets)
-        self.bbox_inside_weights = np.vstack(self.bbox_inside_weights)
-        self.bbox_outside_weights = np.vstack(self.bbox_outside_weights)
-
-
-def _generate_proposals(images_shape, proposal_nums):
-    rpn_rois = []
-    rpn_rois_lod = []
-    num_proposals = 0
-    for i, image_shape in enumerate(images_shape):
-        proposals = _generate_boxes(image_shape, proposal_nums)
-        rpn_rois.append(proposals)
-        num_proposals = len(proposals)
-        rpn_rois_lod.append(num_proposals)
-    return rpn_rois, [rpn_rois_lod]
-
-
-def _generate_groundtruth(images_shape, class_nums, gt_nums):
-    ground_truth = []
-    gts_lod = []
-    num_gts = 0
-    for i, image_shape in enumerate(images_shape):
-        # Avoid background
-        gt_classes = np.random.randint(
-            low=1, high=class_nums, size=gt_nums).astype(np.int32)
-        gt_boxes = _generate_boxes(image_shape, gt_nums)
-        is_crowd = np.zeros((gt_nums), dtype=np.int32)
-        is_crowd[0] = 1
-        ground_truth.append(
-            dict(
-                gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd))
-        num_gts += len(gt_classes)
-        gts_lod.append(num_gts)
-    return ground_truth, [gts_lod]
-
-
-def _generate_boxes(image_size, box_nums):
-    width = image_size[0]
-    height = image_size[1]
-    xywh = np.random.rand(box_nums, 4)
-    xy1 = xywh[:, [0, 1]] * image_size
-    wh = xywh[:, [2, 3]] * (image_size - xy1)
-    xy2 = xy1 + wh
-    boxes = np.hstack([xy1, xy2])
-    boxes[:, [0, 2]] = np.minimum(width - 1., np.maximum(0., boxes[:, [0, 2]]))
-    boxes[:, [1, 3]] = np.minimum(height - 1., np.maximum(0., boxes[:, [1, 3]]))
-    return boxes.astype(np.float32)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
deleted file mode 100644
index 5ce405dccae4cfd66cde471c097698b0869f29fe..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ /dev/null
@@ -1,330 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-import paddle.fluid as fluid
-from op_test import OpTest
-from test_multiclass_nms_op import nms
-from test_anchor_generator_op import anchor_generator_in_python
-import copy
-
-
-def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors,
-                                 variances, pre_nms_topN, post_nms_topN,
-                                 nms_thresh, min_size, eta):
-    all_anchors = anchors.reshape(-1, 4)
-    rois = np.empty((0, 5), dtype=np.float32)
-    roi_probs = np.empty((0, 1), dtype=np.float32)
-
-    rpn_rois = []
-    rpn_roi_probs = []
-    lod = []
-    num_images = scores.shape[0]
-    for img_idx in range(num_images):
-        img_i_boxes, img_i_probs = proposal_for_one_image(
-            im_info[img_idx, :], all_anchors, variances,
-            bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
-            pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta)
-        lod.append(img_i_probs.shape[0])
-        rpn_rois.append(img_i_boxes)
-        rpn_roi_probs.append(img_i_probs)
-
-    return rpn_rois, rpn_roi_probs, lod
-
-
-def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores,
-                           pre_nms_topN, post_nms_topN, nms_thresh, min_size,
-                           eta):
-    # Transpose and reshape predicted bbox transformations to get them
-    # into the same order as the anchors:
-    #   - bbox deltas will be (4 * A, H, W) format from conv output
-    #   - transpose to (H, W, 4 * A)
-    #   - reshape to (H * W * A, 4) where rows are ordered by (H, W, A)
-    #     in slowest to fastest order to match the enumerated anchors
-    bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape(-1, 4)
-    all_anchors = all_anchors.reshape(-1, 4)
-    variances = variances.reshape(-1, 4)
-    # Same story for the scores:
-    #   - scores are (A, H, W) format from conv output
-    #   - transpose to (H, W, A)
-    #   - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
-    #     to match the order of anchors and bbox_deltas
-    scores = scores.transpose((1, 2, 0)).reshape(-1, 1)
-
-    # sort all (proposal, score) pairs by score from highest to lowest
-    # take top pre_nms_topN (e.g. 6000)
-    if pre_nms_topN <= 0 or pre_nms_topN >= len(scores):
-        order = np.argsort(-scores.squeeze())
-    else:
-        # Avoid sorting possibly large arrays;
-        # First partition to get top K unsorted
-        # and then sort just thoes
-        inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN]
-        order = np.argsort(-scores[inds].squeeze())
-        order = inds[order]
-    scores = scores[order, :]
-    bbox_deltas = bbox_deltas[order, :]
-    all_anchors = all_anchors[order, :]
-    proposals = box_coder(all_anchors, bbox_deltas, variances)
-    # clip proposals to image (may result in proposals with zero area
-    # that will be removed in the next step)
-    proposals = clip_tiled_boxes(proposals, im_info[:2])
-    # remove predicted boxes with height or width < min_size
-    keep = filter_boxes(proposals, min_size, im_info)
-    proposals = proposals[keep, :]
-    scores = scores[keep, :]
-
-    # apply loose nms (e.g. threshold = 0.7)
-    # take post_nms_topN (e.g. 1000)
-    # return the top proposals
-    if nms_thresh > 0:
-        keep = nms(boxes=proposals,
-                   scores=scores,
-                   nms_threshold=nms_thresh,
-                   eta=eta)
-        if post_nms_topN > 0 and post_nms_topN < len(keep):
-            keep = keep[:post_nms_topN]
-        proposals = proposals[keep, :]
-        scores = scores[keep, :]
-
-    return proposals, scores
-
-
-def box_coder(all_anchors, bbox_deltas, variances):
-    """
-    Decode proposals by anchors and bbox_deltas from RPN 
-    """
-    #proposals: xmin, ymin, xmax, ymax
-    proposals = np.zeros_like(bbox_deltas, dtype=np.float32)
-
-    #anchor_loc: width, height, center_x, center_y
-    anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)
-
-    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1
-    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1
-    anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0]
-    anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1]
-
-    #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height 
-    pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32)
-    if variances is not None:
-        for i in range(bbox_deltas.shape[0]):
-            pred_bbox[i, 0] = variances[i, 0] * bbox_deltas[i, 0] * anchor_loc[
-                i, 0] + anchor_loc[i, 2]
-            pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[
-                i, 1] + anchor_loc[i, 3]
-            pred_bbox[i, 2] = math.exp(
-                min(variances[i, 2] * bbox_deltas[i, 2], math.log(
-                    1000 / 16.0))) * anchor_loc[i, 0]
-            pred_bbox[i, 3] = math.exp(
-                min(variances[i, 3] * bbox_deltas[i, 3], math.log(
-                    1000 / 16.0))) * anchor_loc[i, 1]
-    else:
-        for i in range(bbox_deltas.shape[0]):
-            pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[
-                i, 2]
-            pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[
-                i, 3]
-            pred_bbox[i, 2] = math.exp(
-                min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i,
-                                                                            0]
-            pred_bbox[i, 3] = math.exp(
-                min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i,
-                                                                            1]
-
-    proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
-    proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
-    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1
-    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1
-
-    return proposals
-
-
-def clip_tiled_boxes(boxes, im_shape):
-    """Clip boxes to image boundaries. im_shape is [height, width] and boxes
-    has shape (N, 4 * num_tiled_boxes)."""
-    assert boxes.shape[1] % 4 == 0, \
-        'boxes.shape[1] is {:d}, but must be divisible by 4.'.format(
-        boxes.shape[1]
-    )
-    # x1 >= 0
-    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
-    # y1 >= 0
-    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
-    # x2 < im_shape[1]
-    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
-    # y2 < im_shape[0]
-    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
-    return boxes
-
-
-def filter_boxes(boxes, min_size, im_info):
-    """Only keep boxes with both sides >= min_size and center within the image.
-    """
-    # Scale min_size to match image scale
-    im_scale = im_info[2]
-    min_size = max(min_size, 1.0)
-    ws = boxes[:, 2] - boxes[:, 0] + 1
-    hs = boxes[:, 3] - boxes[:, 1] + 1
-    ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
-    hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
-    x_ctr = boxes[:, 0] + ws / 2.
-    y_ctr = boxes[:, 1] + hs / 2.
-    keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) &
-                    (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0]
-    return keep
-
-
-def iou(box_a, box_b):
-    """
-	Apply intersection-over-union overlap between box_a and box_b
-    """
-    xmin_a = min(box_a[0], box_a[2])
-    ymin_a = min(box_a[1], box_a[3])
-    xmax_a = max(box_a[0], box_a[2])
-    ymax_a = max(box_a[1], box_a[3])
-
-    xmin_b = min(box_b[0], box_b[2])
-    ymin_b = min(box_b[1], box_b[3])
-    xmax_b = max(box_b[0], box_b[2])
-    ymax_b = max(box_b[1], box_b[3])
-
-    area_a = (ymax_a - ymin_a + 1) * (xmax_a - xmin_a + 1)
-    area_b = (ymax_b - ymin_b + 1) * (xmax_b - xmin_b + 1)
-    if area_a <= 0 and area_b <= 0:
-        return 0.0
-
-    xa = max(xmin_a, xmin_b)
-    ya = max(ymin_a, ymin_b)
-    xb = min(xmax_a, xmax_b)
-    yb = min(ymax_a, ymax_b)
-
-    inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0)
-
-    iou_ratio = inter_area / (area_a + area_b - inter_area)
-
-    return iou_ratio
-
-
-def nms(boxes, scores, nms_threshold, eta=1.0):
-    """Apply non-maximum suppression at test time to avoid detecting too many
-    overlapping bounding boxes for a given object.
-    Args:
-        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
-        scores: (tensor) The class predscores for the img, Shape:[num_priors].
-        nms_threshold: (float) The overlap thresh for suppressing unnecessary
-            boxes.
-        eta: (float) The parameter for adaptive NMS.
-    Return:
-        The indices of the kept boxes with respect to num_priors.
-    """
-    all_scores = copy.deepcopy(scores)
-    all_scores = all_scores.flatten()
-
-    sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort')
-    sorted_scores = all_scores[sorted_indices]
-    selected_indices = []
-    adaptive_threshold = nms_threshold
-    for i in range(sorted_scores.shape[0]):
-        idx = sorted_indices[i]
-        keep = True
-        for k in range(len(selected_indices)):
-            if keep:
-                kept_idx = selected_indices[k]
-                overlap = iou(boxes[idx], boxes[kept_idx])
-                keep = True if overlap <= adaptive_threshold else False
-            else:
-                break
-        if keep:
-            selected_indices.append(idx)
-        if keep and eta < 1 and adaptive_threshold > 0.5:
-            adaptive_threshold *= eta
-    return selected_indices
-
-
-class TestGenerateProposalsOp(OpTest):
-    def set_data(self):
-        self.init_test_params()
-        self.init_test_input()
-        self.init_test_output()
-        self.inputs = {
-            'Scores': self.scores,
-            'BboxDeltas': self.bbox_deltas,
-            'ImInfo': self.im_info.astype(np.float32),
-            'Anchors': self.anchors,
-            'Variances': self.variances
-        }
-
-        self.attrs = {
-            'pre_nms_topN': self.pre_nms_topN,
-            'post_nms_topN': self.post_nms_topN,
-            'nms_thresh': self.nms_thresh,
-            'min_size': self.min_size,
-            'eta': self.eta
-        }
-
-        self.outputs = {
-            'RpnRois': (self.rpn_rois[0], [self.lod]),
-            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod])
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "generate_proposals"
-        self.set_data()
-
-    def init_test_params(self):
-        self.pre_nms_topN = 12000  # train 12000, test 2000
-        self.post_nms_topN = 5000  # train 6000, test 1000
-        self.nms_thresh = 0.7
-        self.min_size = 3.0
-        self.eta = 1.
-
-    def init_test_input(self):
-        batch_size = 1
-        input_channels = 20
-        layer_h = 16
-        layer_w = 16
-        input_feat = np.random.random(
-            (batch_size, input_channels, layer_h, layer_w)).astype('float32')
-        self.anchors, self.variances = anchor_generator_in_python(
-            input_feat=input_feat,
-            anchor_sizes=[16., 32.],
-            aspect_ratios=[0.5, 1.0],
-            variances=[1.0, 1.0, 1.0, 1.0],
-            stride=[16.0, 16.0],
-            offset=0.5)
-        self.im_info = np.array([[64., 64., 8.]])  #im_height, im_width, scale
-        num_anchors = self.anchors.shape[2]
-        self.scores = np.random.random(
-            (batch_size, num_anchors, layer_h, layer_w)).astype('float32')
-        self.bbox_deltas = np.random.random(
-            (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32')
-
-    def init_test_output(self):
-        self.rpn_rois, self.rpn_roi_probs, self.lod = generate_proposals_in_python(
-            self.scores, self.bbox_deltas, self.im_info, self.anchors,
-            self.variances, self.pre_nms_topN, self.post_nms_topN,
-            self.nms_thresh, self.min_size, self.eta)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
deleted file mode 100644
index 0945b59321a7dd32195effbf79f00fa7cf0f24c9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.fluid as fluid
-import numpy as np
-import time
-import six
-import unittest
-from paddle.fluid.reader import DataLoaderBase
-
-EPOCH_NUM = 20
-BATCH_SIZE = 32
-BATCH_NUM = 20
-CLASS_NUM = 10
-
-
-def random_reader():
-    np.random.seed(1)
-    for i in range(BATCH_SIZE * BATCH_NUM):
-        image = np.random.random([784])
-        label = np.random.random_integers(low=0, high=CLASS_NUM - 1)
-        yield image, label
-
-
-def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
-    startup_prog = fluid.Program()
-    main_prog = fluid.Program()
-    startup_prog.random_seed = 1
-    main_prog.random_seed = 1
-
-    with fluid.unique_name.guard():
-        with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.layers.data(
-                name='image', shape=[784], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            py_reader = fluid.io.DataLoader.from_generator(
-                feed_list=[image, label],
-                capacity=4,
-                iterable=not use_legacy_py_reader,
-                use_double_buffer=use_double_buffer)
-            hidden = image
-            for hidden_size in [10, 20, 30]:
-                hidden = fluid.layers.fc(
-                    hidden,
-                    size=hidden_size,
-                    act='tanh',
-                    bias_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.Constant(value=1.0)))
-
-            predict_label = fluid.layers.fc(hidden,
-                                            size=CLASS_NUM,
-                                            act='softmax')
-            loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=predict_label, label=label))
-
-            optimizer = fluid.optimizer.Adam()
-            optimizer.minimize(loss)
-    return startup_prog, main_prog, py_reader, loss
-
-
-class TestBase(unittest.TestCase):
-    def run_main(self, use_legacy_py_reader, with_data_parallel, places,
-                 use_double_buffer):
-        scope = fluid.Scope()
-        with fluid.scope_guard(scope):
-            startup_prog, main_prog, py_reader, loss = simple_fc_net(
-                places, use_legacy_py_reader, use_double_buffer)
-
-            reader = paddle.batch(random_reader, batch_size=BATCH_SIZE)
-
-            ps = places if use_double_buffer else fluid.cpu_places(len(places))
-
-            py_reader.set_sample_list_generator(
-                reader, places=ps if py_reader.iterable else None)
-
-            exe = fluid.Executor(place=places[0])
-            exe.run(startup_prog)
-
-            prog = fluid.CompiledProgram(main_prog)
-            if with_data_parallel:
-                prog = prog.with_data_parallel(
-                    loss_name=loss.name, places=places)
-
-            step = 0
-            step_list = []
-            loss_list = []
-            start_t = time.time()
-            if not py_reader.iterable:
-                for _ in six.moves.range(EPOCH_NUM):
-                    step = 0
-                    py_reader.start()
-                    while True:
-                        try:
-                            L, = exe.run(program=prog,
-                                         fetch_list=[loss],
-                                         use_program_cache=True)
-                            loss_list.append(np.mean(L))
-                            step += 1
-                        except fluid.core.EOFException:
-                            py_reader.reset()
-                            break
-                    step_list.append(step)
-            else:
-                for _ in six.moves.range(EPOCH_NUM):
-                    step = 0
-                    for d in py_reader():
-                        print(d)
-                        assert len(d) == len(places), "{} != {}".format(
-                            len(d), len(places))
-                        for i, item in enumerate(d):
-                            image = item['image']
-                            label = item['label']
-                            assert image.shape() == [BATCH_SIZE, 784]
-                            assert label.shape() == [BATCH_SIZE, 1]
-                            assert image._place()._equals(ps[i])
-                            assert label._place()._equals(ps[i])
-                        L, = exe.run(program=prog,
-                                     feed=d,
-                                     fetch_list=[loss],
-                                     use_program_cache=True)
-                        loss_list.append(np.mean(L))
-                        step += 1
-                    step_list.append(step)
-            end_t = time.time()
-            ret = {
-                "time": end_t - start_t,
-                "step": step_list,
-                "loss": np.array(loss_list)
-            }
-            return ret
-
-    def prepare_places(self, with_data_parallel, with_cpu=True, with_gpu=True):
-        places = []
-        if with_cpu:
-            places.append([fluid.CPUPlace()])
-            if with_data_parallel:
-                places.append([fluid.CPUPlace()] * 2)
-
-        if with_gpu and fluid.core.is_compiled_with_cuda():
-            tmp = fluid.cuda_places()
-            assert len(tmp) > 0, "no gpu detected"
-            if with_data_parallel:
-                places.append(tmp)
-            places.append([tmp[0]])
-        return places
-
-    def test_main(self):
-        for with_data_parallel in [True, False]:
-            for p in self.prepare_places(with_data_parallel):
-                for use_double_buffer in [False, True]:
-                    results = []
-                    for use_legacy_py_reader in [False, True]:
-                        print(p, use_double_buffer, use_legacy_py_reader)
-                        ret = self.run_main(
-                            use_legacy_py_reader=use_legacy_py_reader,
-                            with_data_parallel=with_data_parallel,
-                            places=p,
-                            use_double_buffer=use_double_buffer)
-                        results.append(ret)
-                    if not use_double_buffer:
-                        diff = np.max(
-                            np.abs(results[0]['loss'] - results[1]['loss']))
-                        self.assertLess(diff, 1e-3)
-
-
-class TestDataLoaderBaseAbstract(unittest.TestCase):
-    def test_main(self):
-        loader = DataLoaderBase()
-        try:
-            loader.__iter__()
-            self.assertTrue(False)
-        except NotImplementedError:
-            self.assertTrue(True)
-
-        try:
-            loader.__next__()
-            self.assertTrue(False)
-        except NotImplementedError:
-            self.assertTrue(True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_get_places_op.py b/python/paddle/fluid/tests/unittests/test_get_places_op.py
deleted file mode 100644
index e6be3a3a3e5b6ae7570d2ebdf2836e48345f5734..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_get_places_op.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-from paddle.fluid.layers.device import get_places
-from decorator_helper import prog_scope
-import unittest
-
-
-class TestGetPlaces(unittest.TestCase):
-    @prog_scope()
-    def test_get_places(self):
-        places = get_places()
-        cpu = fluid.CPUPlace()
-        exe = fluid.Executor(cpu)
-        exe.run(fluid.default_main_program())
-        self.assertEqual(places.type, fluid.core.VarDesc.VarType.PLACE_LIST)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
deleted file mode 100644
index 6cd02dad577b681b8c452bdb9574df60ffb4f82e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import numpy as np
-from paddle.fluid.op import Operator
-
-
-class TestGetTensorFromSelectedRows(unittest.TestCase):
-    def get_places(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        return places
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-        x_rows = [0, 5, 5, 4, 19]
-        height = 20
-        row_numel = 2
-
-        np_array = np.ones((len(x_rows), row_numel)).astype("float32")
-        np_array[1, :] = 2.0
-        np_array[2, :] = 3.0
-        np_array[3, :] = 4.0
-
-        # initialize input variable X
-        x = scope.var('X').get_selected_rows()
-        x.set_rows(x_rows)
-        x.set_height(height)
-        x_tensor = x.get_tensor()
-        x_tensor.set(np_array, place)
-
-        # initialize input variable Out
-        out = scope.var("Out").get_tensor()
-
-        op = Operator("get_tensor_from_selected_rows", X="X", Out="Out")
-
-        op.run(scope, place)
-
-        out_array = np.array(out)
-        self.assertEqual((5, 2), out_array.shape)
-        assert (out_array == np_array).all()
-
-    def test_check_output(self):
-        for place in self.get_places():
-            self.check_with_place(place)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py
deleted file mode 100644
index fb80b5c1d2c9187572b13d21f952bd3976cfa6cc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import contextlib
-import unittest
-import numpy as np
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-
-from paddle.fluid.dygraph.base import to_variable
-
-from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
-
-
-class TestGradClipByGlobalNorm(unittest.TestCase):
-    def init_value(self):
-        self.max_global_norm = 5.0
-        self.init_scale = 1.0
-
-        self.shape = (20, 20)
-
-    def generate_p_g(self):
-
-        self.para_and_grad = []
-        for i in range(10):
-            self.para_and_grad.append(
-                (np.random.uniform(-self.init_scale, self.init_scale,
-                                   self.shape).astype('float32'),
-                 np.random.uniform(-self.init_scale, self.init_scale,
-                                   self.shape).astype('float32')))
-
-    def get_numpy_global_norm_result(self):
-        gloabl_norm = 0.0
-        for p, g in self.para_and_grad:
-            gloabl_norm += np.sum(np.square(g))
-
-        gloabl_norm_np = np.sqrt(gloabl_norm)
-
-        new_np_p_g = []
-        scale = 1.0
-        if gloabl_norm_np > self.max_global_norm:
-            scale = self.max_global_norm / gloabl_norm_np
-
-        for p, g in self.para_and_grad:
-            new_np_p_g.append((p, g * scale))
-
-        return new_np_p_g
-
-    def get_dygrap_global_norm_result(self):
-        with fluid.dygraph.guard():
-
-            gloabl_norm_clip = GradClipByGlobalNorm(self.max_global_norm)
-            p_g_var = []
-            for p, g in self.para_and_grad:
-                new_p = to_variable(p)
-                new_g = to_variable(g)
-                p_g_var.append((new_p, new_g))
-
-            new_p_g_var = gloabl_norm_clip(p_g_var)
-
-            p_g_dy_out = []
-            for p, g in new_p_g_var:
-                p_g_dy_out.append((p.numpy(), g.numpy()))
-
-            return p_g_dy_out
-
-    def test_clip_by_global_norm(self):
-        self.init_value()
-        self.generate_p_g()
-        np_p_g = self.get_numpy_global_norm_result()
-        dy_out_p_g = self.get_dygrap_global_norm_result()
-
-        for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
-            self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
-
-    def test_clip_by_global_norm_2(self):
-        self.init_value()
-
-        self.init_scale = 0.2
-        self.max_global_norm = 10
-        self.generate_p_g()
-        np_p_g = self.get_numpy_global_norm_result()
-        dy_out_p_g = self.get_dygrap_global_norm_result()
-
-        for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
-            self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
-
-
-class TestGradClipByNorm(unittest.TestCase):
-    def init_value(self):
-        self.max_norm = 5.0
-        self.init_scale = 1.0
-
-        self.shape = (10, 10)
-
-    def generate_p_g(self):
-
-        self.para_and_grad = []
-        for i in range(10):
-            self.para_and_grad.append(
-                (np.random.uniform(-self.init_scale, self.init_scale,
-                                   self.shape).astype('float32'),
-                 np.random.uniform(-self.init_scale, self.init_scale,
-                                   self.shape).astype('float32')))
-
-    def get_numpy_norm_result(self):
-
-        new_p_g = []
-        for p, g in self.para_and_grad:
-            norm = np.sqrt(np.sum(np.square(g)))
-
-            if norm > self.max_norm:
-                new_p_g.append((p, g * self.max_norm / norm))
-            else:
-                new_p_g.append((p, g))
-
-        return new_p_g
-
-    def get_dygrap_norm_result(self):
-        with fluid.dygraph.guard():
-
-            norm_clip = GradClipByNorm(self.max_norm)
-            p_g_var = []
-            for p, g in self.para_and_grad:
-                new_p = to_variable(p)
-                new_g = to_variable(g)
-                p_g_var.append((new_p, new_g))
-
-            new_p_g_var = norm_clip(p_g_var)
-
-            p_g_dy_out = []
-            for p, g in new_p_g_var:
-                p_g_dy_out.append((p.numpy(), g.numpy()))
-
-            return p_g_dy_out
-
-    def test_clip_by_norm(self):
-        self.init_value()
-        self.generate_p_g()
-        np_p_g = self.get_numpy_norm_result()
-        dy_out_p_g = self.get_dygrap_norm_result()
-
-        for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
-            self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
-
-    def test_clip_by_norm_2(self):
-        self.init_value()
-
-        self.init_scale = 0.2
-        self.max_norm = 10.0
-        self.generate_p_g()
-        np_p_g = self.get_numpy_norm_result()
-        dy_out_p_g = self.get_dygrap_norm_result()
-
-        for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
-            self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
-
-
-class TestGradClipByValue(unittest.TestCase):
-    def init_value(self):
-        self.max_value = 0.8
-        self.min_value = -0.1
-        self.init_scale = 1.0
-
-        self.shape = (10, 10)
-
-    def generate_p_g(self):
-
-        self.para_and_grad = []
-        for i in range(10):
-            self.para_and_grad.append(
-                (np.random.uniform(-self.init_scale, self.init_scale,
-                                   self.shape).astype('float32'),
-                 np.random.uniform(-self.init_scale, self.init_scale,
-                                   self.shape).astype('float32')))
-
-    def get_numpy_clip_result(self):
-
-        new_p_g = []
-        for p, g in self.para_and_grad:
-            new_p_g.append((p, np.clip(g, self.min_value, self.max_value)))
-
-        return new_p_g
-
-    def get_dygrap_clip_result(self):
-        with fluid.dygraph.guard():
-
-            value_clip = GradClipByValue(self.min_value, self.max_value)
-            p_g_var = []
-            for p, g in self.para_and_grad:
-                new_p = to_variable(p)
-                new_g = to_variable(g)
-                p_g_var.append((new_p, new_g))
-
-            new_p_g_var = value_clip(p_g_var)
-
-            p_g_dy_out = []
-            for p, g in new_p_g_var:
-                p_g_dy_out.append((p.numpy(), g.numpy()))
-
-            return p_g_dy_out
-
-    def test_clip_by_value(self):
-        self.init_value()
-        self.generate_p_g()
-        np_p_g = self.get_numpy_clip_result()
-        dy_out_p_g = self.get_dygrap_clip_result()
-
-        for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
-            self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
-
-    def test_clip_by_norm_2(self):
-        self.init_value()
-
-        self.init_scale = 0.2
-        self.generate_p_g()
-        np_p_g = self.get_numpy_clip_result()
-        dy_out_p_g = self.get_dygrap_clip_result()
-
-        for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
-            self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
-
-    def test_clip_by_norm_3(self):
-        self.init_value()
-
-        self.init_scale = 0.5
-        self.max_value = 0.6
-        self.min_value = None
-        self.generate_p_g()
-        np_p_g = self.get_numpy_clip_result()
-        dy_out_p_g = self.get_dygrap_clip_result()
-
-        for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
-            self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
deleted file mode 100644
index 470187e6421173d1cb1213d06660331c164859c4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ /dev/null
@@ -1,162 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-import six
-from fake_reader import fake_imdb_reader
-
-
-def bow_net(data,
-            label,
-            dict_dim,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2):
-    """
-    BOW net
-    This model is from https://github.com/PaddlePaddle/models:
-    fluid/PaddleNLP/text_classification/nets.py
-    """
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=True, size=[dict_dim, emb_dim])
-    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
-    bow_tanh = fluid.layers.tanh(bow)
-    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
-    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
-    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    return avg_cost
-
-
-class TestGradientClip(unittest.TestCase):
-    def setUp(self):
-        self.word_dict_len = 5147
-        self.BATCH_SIZE = 2
-        reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
-        self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)
-
-    def get_places(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        return places
-
-    def check_operators(self, place):
-        CLIP = 1
-
-        prog = fluid.framework.Program()
-        startup_program = fluid.framework.Program()
-        with fluid.program_guard(
-                main_program=prog, startup_program=startup_program):
-            image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-
-            hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
-            hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
-            predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
-
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(cost)
-
-        prog_clip = prog.clone()
-        avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
-
-        p_g = fluid.backward.append_backward(loss=avg_cost)
-        p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
-
-        with fluid.program_guard(
-                main_program=prog_clip, startup_program=startup_program):
-            fluid.clip.set_gradient_clip(
-                fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
-            p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
-
-        grad_list = [elem[1] for elem in p_g]
-        grad_clip_list = [elem[1] for elem in p_g_clip]
-
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=128)
-
-        exe = fluid.Executor(place)
-        feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
-        exe.run(startup_program)
-
-        count = 0
-        for data in train_reader():
-            count += 1
-            if count > 5:
-                break
-            out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
-            out_clip = exe.run(prog_clip,
-                               feed=feeder.feed(data),
-                               fetch_list=grad_clip_list)
-            global_norm = 0
-            for v in out:
-                global_norm += np.sum(np.power(v, 2))
-            global_norm = np.sqrt(global_norm)
-
-            global_norm_clip = 0
-            for v in out_clip:
-                global_norm_clip += np.sum(np.power(v, 2))
-            global_norm_clip = np.sqrt(global_norm_clip)
-
-            assert np.isclose(
-                a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3)
-
-    def check_sparse_gradient_clip(self, place):
-        prog = fluid.framework.Program()
-        startup_program = fluid.framework.Program()
-        with fluid.program_guard(
-                main_program=prog, startup_program=startup_program):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            cost = bow_net(data, label, self.word_dict_len)
-
-            fluid.clip.set_gradient_clip(
-                clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
-
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-            sgd_optimizer.minimize(cost)
-
-        exe = fluid.Executor(place)
-        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-        exe.run(startup_program)
-
-        data = next(self.train_data())
-        val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0]
-        self.assertEqual((1, ), val.shape)
-        print(val)
-        self.assertFalse(np.isnan(val))
-
-    def test_operators(self):
-        self.check_operators(core.CPUPlace())
-
-    def test_sparse_gradient_clip(self):
-        for place in self.get_places():
-            self.check_sparse_gradient_clip(place)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
deleted file mode 100644
index c2529e0d70c9a359d2a44c671769d50a92650a73..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def AffineGrid(theta, size):
-    n = size[0]
-    h = size[2]
-    w = size[3]
-    h_idx = np.repeat(
-        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
-    w_idx = np.repeat(
-        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
-    grid = np.concatenate(
-        [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
-    grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
-
-    ret = np.zeros([n, h * w, 2])
-    theta = theta.transpose([0, 2, 1])
-    for i in range(len(theta)):
-        ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
-
-    return ret.reshape([n, h, w, 2]).astype("float32")
-
-
-def getGridPointValue(data, x, y):
-    data_shape = data.shape
-    N = data_shape[0]
-    H = data_shape[2]
-    W = data_shape[3]
-
-    out = np.zeros(data_shape, dtype='float')
-    for i in range(N):
-        for j in range(H):
-            for k in range(W):
-                if y[i, j, k] < 0 or y[i, j, k] > H - 1 or x[i, j, k] < 0 or x[
-                        i, j, k] > W - 1:
-                    out[i, :, j, k] = 0
-                else:
-                    out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]]
-
-    return out
-
-
-def GridSampler(data, grid):
-    dims = data.shape
-    N = dims[0]
-    C = dims[1]
-    H = dims[2]
-    W = dims[3]
-
-    x = grid[:, :, :, 0]
-    y = grid[:, :, :, 1]
-    y_max = H - 1
-    x_max = W - 1
-
-    x = 0.5 * ((x.astype('float32') + 1.0) * x_max)
-    y = 0.5 * ((y.astype('float32') + 1.0) * y_max)
-
-    x0 = np.floor(x).astype('int32')
-    x1 = x0 + 1
-    y0 = np.floor(y).astype('int32')
-    y1 = y0 + 1
-
-    wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
-
-    va = getGridPointValue(data, x0, y0)
-    vb = getGridPointValue(data, x0, y1)
-    vc = getGridPointValue(data, x1, y0)
-    vd = getGridPointValue(data, x1, y1)
-
-    out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float32')
-    return out
-
-
-class TestGridSamplerOp(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = 'grid_sampler'
-        x = np.random.randint(0, 255, self.x_shape).astype('float32')
-
-        theta = np.zeros(self.theta_shape).astype('float32')
-        for i in range(self.theta_shape[0]):
-            for j in range(2):
-                for k in range(3):
-                    theta[i, j, k] = np.random.rand(1)[0]
-        grid = AffineGrid(theta, self.x_shape)
-
-        self.inputs = {'X': x, 'Grid': grid}
-        self.attrs = {'use_cudnn': True}
-        self.outputs = {'Output': GridSampler(x, grid)}
-
-    def test_check_output(self):
-        self.check_output(atol=1e-3)
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61)
-
-    def initTestCase(self):
-        self.x_shape = (2, 5, 7, 3)
-        self.grid_shape = (2, 7, 3, 2)
-        self.theta_shape = (2, 2, 3)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
deleted file mode 100644
index 7fcde530fe99a2ee910f5e58780ea2682f18c797..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py
+++ /dev/null
@@ -1,237 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-
-from operator import mul
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from op_test import OpTest
-
-from testsuite import create_op
-
-
-def group_norm_naive(x, scale, bias, epsilon, groups, data_layout):
-    if data_layout == "NHWC":
-        x = np.transpose(x, (0, 3, 1, 2))  # NHWC => NCHW
-    N, C, H, W = x.shape
-    G = groups
-    x = x.reshape((N * G, -1))
-    mean = np.mean(x, axis=1, keepdims=True)
-    var = np.var(x, axis=1, keepdims=True)
-    output = (x - mean) / np.sqrt(var + epsilon)
-    output = output.reshape((N, C, H, W)) * scale.reshape(
-        (-1, 1, 1)) + bias.reshape((-1, 1, 1))
-    if data_layout == "NHWC":
-        output = np.transpose(output, (0, 2, 3, 1))  # NCHW => NHWC
-    return output, mean.reshape((N, G)), var.reshape((N, G))
-
-
-class TestGroupNormOp(OpTest):
-    def setUp(self):
-        self.op_type = "group_norm"
-        self.data_format = "NCHW"
-        self.dtype = np.float32
-        self.shape = (2, 4, 3, 3)
-        self.attrs = {'epsilon': 1e-5, 'groups': 2, 'data_layout': "NCHW"}
-        self.compare_between_place = False
-        self.init_test_case()
-
-        input = np.random.random(self.shape).astype(self.dtype)
-        if self.data_format == "NHWC":
-            input = np.transpose(input, (0, 2, 3, 1))
-        scale = np.random.random([self.shape[1]]).astype(self.dtype)
-        bias = np.random.random([self.shape[1]]).astype(self.dtype)
-        output, mean, var = group_norm_naive(
-            input, scale, bias, self.attrs['epsilon'], self.attrs['groups'],
-            self.data_format)
-
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(input),
-            'Scale': OpTest.np_dtype_to_fluid_dtype(scale),
-            'Bias': OpTest.np_dtype_to_fluid_dtype(bias)
-        }
-        self.outputs = {'Y': output, 'Mean': mean, 'Variance': var}
-        self.attrs['data_layout'] = self.data_format
-
-    def test_check_output(self):
-        atol = 1e-4
-        inplace_atol = 1e-4
-        place = core.CPUPlace()
-        # add inplace_atol bacause group_norm doesn't ensure computational consistency
-        self.check_output_with_place(
-            place, atol=atol, inplace_atol=inplace_atol)
-
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(
-                place, atol=atol, inplace_atol=inplace_atol)
-
-    def do_compare_between_place(self):
-        if not core.is_compiled_with_cuda(): return
-        place = core.CPUPlace()
-        place2 = core.CUDAPlace(0)
-        self.scope = core.Scope()
-        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
-        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
-        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
-        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
-                            op_attrs)
-        inputs_to_check = set(['X', 'Scale', 'Bias'])
-        output_names = 'Y'
-        cpu_grads = self._get_gradient(inputs_to_check, place, output_names,
-                                       None)
-        gpu_grads = self._get_gradient(inputs_to_check, place2, output_names,
-                                       None)
-        self._assert_is_close(cpu_grads, gpu_grads, inputs_to_check, 0.005,
-                              "Gradient Check On %s" % str(place))
-
-    def test_check_grad(self):
-        if self.compare_between_place:
-            self.do_compare_between_place()
-            return
-
-        place = core.CPUPlace()
-        self.check_grad_with_place(
-            place, set(['X', 'Scale', 'Bias']), 'Y', max_relative_error=0.01)
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place,
-                set(['X', 'Scale', 'Bias']),
-                'Y',
-                max_relative_error=0.01)
-
-    def init_test_case(self):
-        pass
-
-
-class TestGroupNormOp1(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['groups'] = 1
-
-
-class TestGroupNormOp2(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['groups'] = 4
-
-
-class TestGroupNormOpBigEps1(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['groups'] = 1
-        self.attrs['epsilon'] = 0.5
-
-
-class TestGroupNormOpBigEps2(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['groups'] = 4
-        self.attrs['epsilon'] = 0.5
-
-
-class TestGroupNormOpBigEps3(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['epsilon'] = 0.5
-
-
-class TestGroupNormOpLargeData(TestGroupNormOp):
-    def init_test_case(self):
-        self.shape = (2, 32, 64, 64)
-        self.attrs['groups'] = 8
-        self.compare_between_place = True
-
-
-class TestGroupNormOp1_With_NHWC(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['groups'] = 1
-        self.data_format = "NHWC"
-
-
-class TestGroupNormOp2_With_NHWC(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['groups'] = 4
-        self.data_format = "NHWC"
-
-
-class TestGroupNormOpBigEps1_With_NHWC(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['groups'] = 1
-        self.attrs['epsilon'] = 0.5
-        self.data_format = "NHWC"
-
-
-class TestGroupNormOpBigEps2_With_NHWC(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['groups'] = 4
-        self.attrs['epsilon'] = 0.5
-        self.data_format = "NHWC"
-
-
-class TestGroupNormOpBigEps3_With_NHWC(TestGroupNormOp):
-    def init_test_case(self):
-        self.attrs['epsilon'] = 0.5
-        self.data_format = "NHWC"
-
-
-class TestGroupNormOpLargeData_With_NHWC(TestGroupNormOp):
-    def init_test_case(self):
-        self.shape = (2, 64, 32, 32)  # NCHW
-        self.attrs['groups'] = 8
-        self.data_format = "NHWC"
-        self.compare_between_place = True
-
-
-class TestGroupNormAPI_With_NHWC(OpTest):
-    def test_case1(self):
-        data1 = fluid.layers.data(
-            name='data1', shape=[3, 3, 4], dtype='float32')
-        out1 = fluid.layers.group_norm(
-            input=data1, groups=2, data_layout="NHWC")
-        data2 = fluid.layers.data(
-            name='data2', shape=[4, 3, 3], dtype='float32')
-        out2 = fluid.layers.group_norm(
-            input=data2, groups=2, data_layout="NCHW")
-
-        data1_np = np.random.random((2, 3, 3, 4)).astype("float32")
-        data2_np = np.random.random((2, 4, 3, 3)).astype("float32")
-        scale = np.array([1]).astype("float32")
-        bias = np.array([0]).astype("float32")
-
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        results = exe.run(fluid.default_main_program(),
-                          feed={"data1": data1_np,
-                                "data2": data2_np},
-                          fetch_list=[out1, out2],
-                          return_numpy=True)
-        expect_res1 = group_norm_naive(
-            data1_np, scale, bias, epsilon=1e-5, groups=2, data_layout="NHWC")
-        expect_res2 = group_norm_naive(
-            data2_np, scale, bias, epsilon=1e-5, groups=2, data_layout="NCHW")
-        self.assertTrue(np.allclose(results[0], expect_res1[0]))
-        self.assertTrue(np.allclose(results[1], expect_res2[0]))
-
-    # data_layout is not NHWC or NCHW
-    def test_case2(self):
-        data = fluid.layers.data(name='data', shape=[3, 3, 4], dtype="float32")
-        try:
-            out = fluid.layers.group_norm(
-                input=data, groups=2, data_layout="NDHW")
-        except:
-            pass
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py
deleted file mode 100644
index 17af1d88d086f9a53ef8075572184a4cd4d3be88..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ /dev/null
@@ -1,233 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-import functools
-from op_test import OpTest
-from test_lstm_op import ACTIVATION
-
-
-def gru(
-        input,  # T x 3D
-        lod,  # 1 x N
-        h0,  # N x D
-        weight,  # D x 3D
-        bias,  # 1 x 3D
-        is_reverse,
-        act_state,
-        act_gate,
-        dtype='float32',
-        origin_mode=False):
-    def _seq_to_batch(lod, is_reverse):
-        idx_in_seq_list = []
-        seq_lens = lod[0]
-        seq_starts = [0]
-        for i in range(len(seq_lens)):
-            seq_starts.append(seq_starts[-1] + seq_lens[i])
-        sorted_seqs = sorted(
-            list(range(len(seq_lens))),
-            key=functools.cmp_to_key(lambda x, y: seq_lens[y] - seq_lens[x]))
-        num_batch = seq_lens[sorted_seqs[0]]
-        for batch_idx in range(num_batch):
-            idx_in_seq = []
-            for i in range(len(seq_lens)):
-                if seq_lens[sorted_seqs[i]] <= batch_idx:
-                    break
-                idx = (seq_starts[sorted_seqs[i] + 1] - 1 - batch_idx
-                       ) if is_reverse else (
-                           seq_starts[sorted_seqs[i]] + batch_idx)
-                idx_in_seq.append(idx)
-            idx_in_seq_list.append(idx_in_seq)
-        return idx_in_seq_list, sorted_seqs
-
-    def _step(x, h_p, w, b, act_state, act_gate):
-        T = x.shape[0]
-        D = w.shape[0]
-        g = x + np.tile(b, (T, 1))
-        w_u_r = w.flatten()[:D * D * 2].reshape((D, D * 2))
-        u_r = act_gate(np.dot(h_p, w_u_r) + g[:, :D * 2])
-        u = u_r[:, :D]
-        r = u_r[:, D:D * 2]
-        r_h_p = r * h_p
-        w_c = w.flatten()[D * D * 2:].reshape((D, D))
-        c = act_state(np.dot(r_h_p, w_c) + g[:, D * 2:])
-        g = np.hstack((u_r, c))
-        if origin_mode:
-            h = (1 - u) * c + u * h_p
-        else:
-            h = u * c + (1 - u) * h_p
-        return g, r_h_p, h
-
-    T = sum(lod[0])
-    N = len(lod[0])
-    D = weight.shape[0]
-    batch_gate = np.zeros((T, 3 * D), dtype=dtype)
-    batch_reset_hidden_prev = np.zeros((T, D), dtype=dtype)
-    batch_hidden = np.zeros((T, D), dtype=dtype)
-    hidden = np.zeros((T, D), dtype=dtype)
-
-    idx_in_seq_list, sorted_seqs = _seq_to_batch(lod, is_reverse)
-    h_p = h0[[seq for seq in sorted_seqs if lod[0][seq] > 0]]
-
-    max_seq_len = len(idx_in_seq_list)
-    end_idx = 0
-    for batch_idx in range(max_seq_len):
-        x = input[idx_in_seq_list[batch_idx]]
-        g, r_h_p, h = _step(x, h_p, weight, bias, act_state, act_gate)
-        if batch_idx < (max_seq_len - 1):
-            h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
-        start_idx = end_idx
-        end_idx = start_idx + len(idx_in_seq_list[batch_idx])
-        batch_gate[start_idx:end_idx] = g
-        batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
-        batch_hidden[start_idx:end_idx] = h
-        hidden[idx_in_seq_list[batch_idx]] = h
-    return batch_gate, batch_reset_hidden_prev, batch_hidden, hidden
-
-
-class TestGRUOp(OpTest):
-    def set_confs(self):
-        pass
-
-    def setUp(self):
-        self.op_type = "gru"
-        self.lod = [[2, 4, 3]]
-        self.D = 5
-        self.is_reverse = False
-        self.with_h0 = True
-        self.with_bias = True
-        self.act_state = 'tanh'
-        self.act_gate = 'sigmoid'
-        self.dtype = 'float64'
-        self.origin_mode = False
-        self.set_confs()
-
-        T = sum(self.lod[0])
-        N = len(self.lod[0])
-        input = np.random.rand(T, 3 * self.D).astype(self.dtype)
-        weight = np.random.rand(self.D, 3 * self.D).astype(self.dtype)
-        bias = np.random.rand(
-            1, 3 * self.D).astype(self.dtype) if self.with_bias else np.zeros(
-                (1, 3 * self.D), dtype=self.dtype)
-        h0 = np.random.rand(
-            N, self.D).astype(self.dtype) if self.with_h0 else np.zeros(
-                (N, self.D), dtype=self.dtype)
-
-        batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru(
-            input, self.lod, h0, weight, bias, self.is_reverse,
-            ACTIVATION[self.act_state], ACTIVATION[self.act_gate], self.dtype,
-            self.origin_mode)
-        self.inputs = {'Input': (input, self.lod), 'Weight': weight}
-
-        if self.with_bias:
-            self.inputs['Bias'] = bias
-
-        if self.with_h0:
-            self.inputs['H0'] = h0
-
-        self.outputs = {
-            'Hidden': (hidden, self.lod),
-            'BatchGate': batch_gate,
-            'BatchResetHiddenPrev': batch_reset_hidden_prev,
-            'BatchHidden': batch_hidden,
-        }
-
-        self.attrs = {
-            'activation': self.act_state,
-            'gate_activation': self.act_gate,
-            'is_reverse': self.is_reverse,
-            'origin_mode': self.origin_mode
-        }
-
-    def test_check_output(self):
-        self.check_output(atol=1e-8, check_dygraph=True)
-
-    def test_check_grad(self):
-        self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
-
-
-class TestGRUOriginMode(TestGRUOp):
-    def set_confs(self):
-        self.origin_mode = True
-
-
-class TestGRUOp2(TestGRUOp):
-    def set_confs(self):
-        self.D = 19
-        self.dtype = 'float32'
-
-
-class TestGRUOp2Len0(TestGRUOp):
-    def set_confs(self):
-        self.D = 19
-        self.lod = [[2, 0, 4]]
-        self.dtype = 'float32'
-
-
-class TestGRUOp2OriginMode(TestGRUOp):
-    def set_confs(self):
-        self.D = 19
-        self.dtype = 'float32'
-        self.origin_mode = True
-
-
-class TestGRUOp2OriginModeLen0(TestGRUOp):
-    def set_confs(self):
-        self.D = 19
-        self.lod = [[0, 3, 4]]
-        self.dtype = 'float32'
-        self.origin_mode = True
-
-
-class TestGRUOp2OriginModeLastLen0(TestGRUOp):
-    def set_confs(self):
-        self.D = 19
-        self.lod = [[0, 3, 0]]
-        self.dtype = 'float32'
-        self.origin_mode = True
-
-
-class TestGRUOpNoInitial(TestGRUOp):
-    def set_confs(self):
-        self.with_h0 = False
-
-    def test_check_grad(self):
-        self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])
-
-
-class TestGRUOpNoBias(TestGRUOp):
-    def set_confs(self):
-        self.with_bias = False
-
-    def test_check_grad(self):
-        self.check_grad(['Input', 'H0', 'Weight'], ['Hidden'])
-
-
-class TestGRUOpReverse(TestGRUOp):
-    def set_confs(self):
-        self.is_reverse = True
-
-
-class TestGRUOpReverseOriginMode(TestGRUOp):
-    def set_confs(self):
-        self.is_reverse = True
-        self.origin_mode = True
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
deleted file mode 100644
index 78f2f030f5b6dc9d827f3930dff590a0f5b784fb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import math
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class GRUActivationType(OpTest):
-    identity = 0
-    sigmoid = 1
-    tanh = 2
-    relu = 3
-
-
-def identity(x):
-    return x
-
-
-def sigmoid(x):
-    return 1. / (1. + np.exp(-x))
-
-
-def tanh(x):
-    return 2. * sigmoid(2. * x) - 1.
-
-
-def relu(x):
-    return np.maximum(x, 0)
-
-
-class TestGRUUnitOp(OpTest):
-    batch_size = 5
-    frame_size = 10
-    activate = {
-        GRUActivationType.identity: identity,
-        GRUActivationType.sigmoid: sigmoid,
-        GRUActivationType.tanh: tanh,
-        GRUActivationType.relu: relu,
-    }
-
-    def set_inputs(self, origin_mode=False):
-        batch_size = self.batch_size
-        frame_size = self.frame_size
-        self.op_type = 'gru_unit'
-        self.inputs = {
-            'Input': np.random.uniform(
-                -0.1, 0.1, (batch_size, frame_size * 3)).astype('float64'),
-            'HiddenPrev': np.random.uniform(
-                -0.1, 0.1, (batch_size, frame_size)).astype('float64'),
-            'Weight': np.random.uniform(
-                -1. / math.sqrt(frame_size), 1. / math.sqrt(frame_size),
-                (frame_size, frame_size * 3)).astype('float64'),
-        }
-        self.attrs = {
-            'activation': GRUActivationType.tanh,
-            'gate_activation': GRUActivationType.sigmoid,
-            'origin_mode': origin_mode
-        }
-
-    def set_outputs(self, origin_mode=False):
-        # GRU calculations
-        batch_size = self.batch_size
-        frame_size = self.frame_size
-        x = self.inputs['Input']
-        h_p = self.inputs['HiddenPrev']
-        w = self.inputs['Weight']
-        b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros(
-            (1, frame_size * 3))
-        g = x + np.tile(b, (batch_size, 1))
-        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
-            (frame_size, frame_size * 2))
-        u_r = self.activate[self.attrs['gate_activation']](np.dot(
-            h_p, w_u_r) + g[:, :frame_size * 2])
-        u = u_r[:, :frame_size]
-        r = u_r[:, frame_size:frame_size * 2]
-        r_h_p = r * h_p
-        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
-            (frame_size, frame_size))
-        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
-                                                    g[:, frame_size * 2:])
-        g = np.hstack((u_r, c))
-        if origin_mode:
-            h = (1 - u) * c + u * h_p
-        else:
-            h = u * c + (1 - u) * h_p
-        self.outputs = {
-            'Gate': g.astype('float64'),
-            'ResetHiddenPrev': r_h_p.astype('float64'),
-            'Hidden': h.astype('float64')
-        }
-
-    def setUp(self):
-        self.set_inputs()
-        self.set_outputs()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])
-
-
-class TestGRUUnitOpOriginMode(TestGRUUnitOp):
-    def setUp(self):
-        self.set_inputs(origin_mode=True)
-        self.set_outputs(origin_mode=True)
-
-
-class TestGRUUnitOpWithBias(TestGRUUnitOp):
-    def set_inputs(self, origin_mode=False):
-        batch_size = self.batch_size
-        frame_size = self.frame_size
-        super(TestGRUUnitOpWithBias, self).set_inputs()
-        self.inputs['Bias'] = np.random.uniform(
-            -0.1, 0.1, (1, frame_size * 3)).astype('float64')
-        self.attrs = {
-            'activation': GRUActivationType.identity,
-            'gate_activation': GRUActivationType.sigmoid,
-            'origin_mode': origin_mode
-        }
-
-    def test_check_grad(self):
-        self.check_grad(['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'])
-
-    def test_check_grad_ingore_input(self):
-        self.check_grad(
-            ['HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
-            no_grad_set=set('Input'))
-
-
-class TestGRUUnitOpWithBiasOriginMode(TestGRUUnitOpWithBias):
-    def setUp(self):
-        self.set_inputs(origin_mode=True)
-        self.set_outputs(origin_mode=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hash_op.py b/python/paddle/fluid/tests/unittests/test_hash_op.py
deleted file mode 100644
index 75af02bd5f46ea61f0bf4bc2494cb941fb1f64b4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_hash_op.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestHashOp(OpTest):
-    def setUp(self):
-        self.op_type = "hash"
-        self.init_test_case()
-        self.inputs = {'X': (self.in_seq, self.lod)}
-        self.attrs = {'num_hash': 2, 'mod_by': 10000}
-        self.outputs = {'Out': (self.out_seq, self.lod)}
-
-    def init_test_case(self):
-        np.random.seed(1)
-        self.in_seq = np.random.randint(0, 10, (8, 1)).astype("int32")
-        self.lod = [[2, 6]]
-        self.out_seq = [[[3481], [7475]], [[1719], [5986]], [[8473], [694]],
-                        [[3481], [7475]], [[4372], [9456]], [[4372], [9456]],
-                        [[6897], [3218]], [[9038], [7951]]]
-        self.out_seq = np.array(self.out_seq)
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestHashNotLoDOp(TestHashOp):
-    def setUp(self):
-        self.op_type = "hash"
-        self.init_test_case()
-        self.inputs = {'X': self.in_seq}
-        self.attrs = {'num_hash': 2, 'mod_by': 10000}
-        self.outputs = {'Out': self.out_seq}
-
-    def init_test_case(self):
-        np.random.seed(1)
-        self.in_seq = np.random.randint(0, 10, (8, 1)).astype("int32")
-        self.out_seq = [[[3481], [7475]], [[1719], [5986]], [[8473], [694]],
-                        [[3481], [7475]], [[4372], [9456]], [[4372], [9456]],
-                        [[6897], [3218]], [[9038], [7951]]]
-        self.out_seq = np.array(self.out_seq)
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestHashOp2(TestHashOp):
-    """
-    Case:
-    int64 type input
-    """
-
-    def setUp(self):
-        self.op_type = "hash"
-        self.init_test_case()
-        self.inputs = {'X': self.in_seq}
-        self.attrs = {'num_hash': 2, 'mod_by': 10000}
-        self.outputs = {'Out': self.out_seq}
-
-    def init_test_case(self):
-        self.in_seq = np.array([1, 2**32 + 1]).reshape((2, 1)).astype("int64")
-        self.out_seq = np.array([1269, 9609, 3868, 7268]).reshape((2, 2, 1))
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestHashOp3(TestHashOp):
-    """
-    Case:
-    int64 type input
-    int64 type mod_by attr
-    """
-
-    def setUp(self):
-        self.op_type = "hash"
-        self.init_test_case()
-        self.inputs = {'X': self.in_seq}
-        self.attrs = {'num_hash': 2, 'mod_by': 2**32}
-        self.outputs = {'Out': self.out_seq}
-
-    def init_test_case(self):
-        self.in_seq = np.array([10, 5]).reshape((2, 1)).astype("int64")
-        self.out_seq = np.array(
-            [1204014882, 393011615, 3586283837, 2814821595]).reshape((2, 2, 1))
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py b/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
deleted file mode 100644
index 1eb441e2c52905c2b60104de5e04037714b34648..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestHingeLossOp(OpTest):
-    def setUp(self):
-        self.op_type = 'hinge_loss'
-        samples_num = 64
-        logits = np.random.uniform(-10, 10, (samples_num, 1)).astype('float32')
-        labels = np.random.randint(0, 2, (samples_num, 1)).astype('float32')
-
-        self.inputs = {
-            'Logits': logits,
-            'Labels': labels,
-        }
-        loss = np.maximum(1.0 - (2 * labels - 1) * logits, 0)
-        self.outputs = {'Loss': loss}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Logits'], 'Loss', max_relative_error=0.008)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
deleted file mode 100644
index 8ed5074dc2626ff58fc65d8af1340e260c029572..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ /dev/null
@@ -1,349 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-import math
-from op_test import OpTest
-
-np.random.seed(100)
-
-
-def find_latest_set(num):
-    return 1 + int(math.floor(math.log(num, 2)))
-
-
-class CodeTable(object):
-    def __init__(self, num_classes, code):
-        self.c = num_classes + code
-
-    def cal_index(self, bit):
-        return (self.c >> (bit + 1)) - 1
-
-    def get_length(self):
-        return find_latest_set(self.c) - 1
-
-    def cal_bit(self, bit):
-        return self.c & (1 << bit)
-
-
-class CodeTableWithCustomTree(object):
-    def __init__(self, path_table, path_code, index):
-        self.ptable_ = path_table
-        self.pcode_ = path_code
-        self.index_ = index
-
-    def cal_index(self, bit):
-        return self.ptable_[self.index_][bit]
-
-    def get_length(self):
-        length = 0
-        for ele in self.ptable_[self.index_]:  # find the first -1 to stop trace
-
-            if ele >= 0:
-                length = length + 1
-            else:
-                return length
-        return length
-
-    def cal_bit(self, bit):
-        return self.pcode_[self.index_][bit]
-
-
-def hsigmoid(x, w, label, bias, num_classes):
-    batch_size = x.shape[0]
-    code_length = find_latest_set(num_classes - 1)
-    code_table = [0 for _ in range(code_length)]
-    pre_output = np.zeros((batch_size, code_length))
-    pre_sum = np.zeros((batch_size, 1))
-    out = np.zeros((batch_size, 1)).astype("float32")
-    for i in range(batch_size):
-        code_table = CodeTable(num_classes, label[i])
-        length = code_table.get_length()
-        for j in range(length):
-            idx = code_table.cal_index(j)
-            pre_output[i][j] += bias[idx][0]
-    for i in range(batch_size):
-        code_table = CodeTable(num_classes, label[i])
-        length = code_table.get_length()
-        for j in range(length):
-            idx = code_table.cal_index(j)
-            pre_output[i][j] += np.dot(w[idx], x[i])
-    # clip[-40.0, 40.0]
-    pre_output = np.clip(pre_output, -40.0, 40.0)
-    # out(i, 0) = \sum_j  bit(i, j) * preout(i, j)
-    for i in range(batch_size):
-        code_table = CodeTable(num_classes, label[i])
-        length = code_table.get_length()
-        sum = 0.0
-        for j in range(length):
-            if code_table.cal_bit(j):
-                sum += pre_output[i][j]
-        out[i] = -1.0 * sum
-    # soft relu
-    pre_output = np.log(1 + np.exp(pre_output))
-    pre_sum = pre_output.sum(1).reshape((batch_size, 1))
-    out += pre_sum
-    return pre_output, out
-
-
-def hsigmoidWithCustomTree(x, w, path_table, path_code, label, bias,
-                           num_classes):
-    batch_size = x.shape[0]
-    code_length = len(path_table[0])
-    code_table = [0 for _ in range(code_length)]
-    # init pre_out with shape [N, code_length]
-    pre_output = np.zeros((batch_size, code_length))
-    pre_sum = np.zeros((batch_size, 1))
-    out = np.zeros((batch_size, 1)).astype("float32")
-    if isinstance(bias, np.ndarray):
-        for i in range(batch_size):
-            code_table = CodeTableWithCustomTree(path_table, path_code, i)
-            length = code_table.get_length()
-            for j in range(length):
-                idx = code_table.cal_index(j)
-                pre_output[i][j] += bias[idx][0]
-    for i in range(batch_size):
-        code_table = CodeTableWithCustomTree(path_table, path_code, i)
-        length = code_table.get_length()
-        for j in range(length):
-            idx = code_table.cal_index(j)
-            pre_output[i][j] += np.dot(w[idx], x[i])
-    # clip[-40.0, 40.0]
-    pre_output = np.clip(pre_output, -40.0, 40.0)
-    # out(i, 0) = \sum_j  bit(i, j) * preout(i, j)
-    for i in range(batch_size):
-        code_table = CodeTableWithCustomTree(path_table, path_code, i)
-        length = code_table.get_length()
-        sum = 0.0
-        for j in range(length):
-            if code_table.cal_bit(j):
-                sum += pre_output[i][j]
-        out[i] = -1.0 * sum
-    # soft relu
-    pre_output = np.log(1 + np.exp(pre_output))
-    pre_sum = pre_output.sum(1).reshape((batch_size, 1))
-    out += pre_sum
-    return pre_output, out
-
-
-class TestHSigmoidOp(OpTest):
-    def setUp(self):
-        self.op_type = "hierarchical_sigmoid"
-        num_classes = 6
-        feature_size = 8
-        batch_size = 4
-        x = np.random.random((batch_size, feature_size)).astype("float32") * 2
-        w = np.random.random(
-            (num_classes - 1, feature_size)).astype("float32") * 2
-        label = np.random.randint(0, num_classes, (batch_size, 1))
-        bias = np.random.random((num_classes - 1, 1)).astype("float32")
-        self.attrs = {'num_classes': num_classes, 'is_sparse': False}
-        self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
-        pre_output, out = hsigmoid(x, w, label, bias, num_classes)
-        self.outputs = {'PreOut': pre_output, 'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
-
-
-class TestHSigmoidOpSparse(OpTest):
-    def setUp(self):
-        self.op_type = "hierarchical_sigmoid"
-        num_classes = 6  #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
-        feature_size = 8
-        batch_size = 4
-        x = np.random.random((batch_size, feature_size)).astype("float32")
-        w = np.random.random((num_classes - 1, feature_size)).astype("float32")
-        label = np.array([0, 1, 4, 5])
-        path_table = np.array(
-            [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
-             (0, 2, -1, -1,
-              -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
-            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store 
-        bias = np.random.random((num_classes - 1, 1)).astype("float32")
-        self.attrs = {'num_classes': num_classes, 'is_sparse': True}
-        self.inputs = {
-            'X': x,
-            'W': w,
-            'PathTable': path_table,
-            'PathCode': path_code,
-            'Label': label,
-            'Bias': bias
-        }
-        pre_output, out = hsigmoidWithCustomTree(x, w, path_table, path_code,
-                                                 label, bias, num_classes)
-        self.outputs = {'PreOut': pre_output, 'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
-    def hs_net_conf(self, is_sparse):
-        input_word = fluid.layers.data(name="x", shape=[1], dtype='int64')
-        path_table = fluid.layers.data(
-            name='path_table', shape=[3], dtype='int64')
-        path_code = fluid.layers.data(
-            name='path_code', shape=[3], dtype='int64')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-        data_list = [input_word, path_table, path_code, label]
-
-        emb = fluid.layers.embedding(
-            input=input_word,
-            is_sparse=is_sparse,
-            size=[3, 3],
-            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
-                scale=1 / math.sqrt(3))))
-
-        cost = fluid.layers.hsigmoid(
-            input=emb,
-            label=label,
-            bias_attr=True,
-            num_classes=3,
-            path_table=path_table,
-            path_code=path_code,
-            is_custom=True,
-            is_sparse=is_sparse)
-
-        avg_cost = fluid.layers.reduce_mean(cost)
-
-        return avg_cost, data_list
-
-    def training_test(self, is_sparse):
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            start_up = fluid.default_startup_program()
-            start_up.random_seed = 1  # Fix random seed
-            x = np.arange(6).reshape(6)
-            path_table = np.array([(1, 2, -1), (1, 2, -1)])
-            path_code = np.array([(1, 0, -1), (0, 0, -1)])
-            label = np.array([1, 4])
-
-            loss, data_list = self.hs_net_conf(is_sparse)
-            optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
-            optimizer.minimize(loss)
-
-            main_program = fluid.default_main_program()
-            place = fluid.CPUPlace()
-            feeder = fluid.DataFeeder(feed_list=data_list, place=place)
-            exe = fluid.Executor(place)
-
-            exe.run(start_up)
-            result = list()
-            for i in range(10):
-                data = [([[x[i % 2]]], [list(path_table[i % 2])],
-                         [list(path_code[i % 2])], [label[i % 2]])]
-
-                loss_val = exe.run(main_program,
-                                   feed=feeder.feed(data),
-                                   fetch_list=[loss])
-                result.append(loss_val)
-        return result
-
-    def test_hs_grad_with_sparse(self):
-        dense_result = self.training_test(is_sparse=False)
-        sparse_result = self.training_test(is_sparse=True)
-        assert (dense_result == sparse_result)
-
-
-class TestHSigmoidOpWithCostumTree(OpTest):
-    def setUp(self):
-        self.op_type = "hierarchical_sigmoid"
-        num_classes = 6  #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
-        feature_size = 8
-        batch_size = 4
-        x = np.random.random((batch_size, feature_size)).astype("float32") * 2
-        w = np.random.random(
-            (num_classes - 1, feature_size)).astype("float32") * 2
-        label = np.array([0, 1, 4, 5])
-        path_table = np.array(
-            [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
-             (0, 2, -1, -1,
-              -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
-            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store 
-        bias = np.random.random((num_classes - 1, 1)).astype("float32")
-        self.attrs = {'num_classes': num_classes, 'is_sparse': False}
-        self.inputs = {
-            'X': x,
-            'W': w,
-            'PathTable': path_table,
-            'PathCode': path_code,
-            'Label': label,
-            'Bias': bias
-        }
-        pre_output, out = hsigmoidWithCustomTree(x, w, path_table, path_code,
-                                                 label, bias, num_classes)
-        self.outputs = {'PreOut': pre_output, 'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
-
-
-class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest):
-    def setUp(self):
-        self.op_type = "hierarchical_sigmoid"
-        num_classes = 6  #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
-        feature_size = 8
-        batch_size = 4
-        x = np.random.random((batch_size, feature_size)).astype("float32") * 2
-        w = np.random.random(
-            (num_classes - 1, feature_size)).astype("float32") * 2
-        label = np.array([0, 1, 4, 5])
-        path_table = np.array(
-            [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
-             (0, 2, -1, -1,
-              -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
-            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store 
-        # bias = np.random.random((num_classes - 1, 1)).astype("float32")
-        self.attrs = {'num_classes': num_classes, 'is_sparse': False}
-        self.inputs = {
-            'X': x,
-            'W': w,
-            'PathTable': path_table,
-            'PathCode': path_code,
-            'Label': label,
-        }
-        pre_output, out = hsigmoidWithCustomTree(
-            x=x,
-            w=w,
-            path_table=path_table,
-            path_code=path_code,
-            label=label,
-            bias=None,
-            num_classes=num_classes)
-        self.outputs = {'PreOut': pre_output, 'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X', 'W'], ['Out'], no_grad_set=set('Label'))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
deleted file mode 100644
index 014d30486d7e2009165a2e05cbedffc7c175ccee..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
+++ /dev/null
@@ -1,271 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import signal
-import time
-import unittest
-from multiprocessing import Process
-
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.framework import Program, program_guard
-from dist_test_utils import *
-
-
-def run_pserver(pserver_id, use_cuda, sync_mode):
-    remove_ps_flag(os.getpid())
-    scope = fluid.core.Scope()
-    program = Program()
-    with fluid.scope_guard(scope):
-        with program_guard(program, startup_program=Program()):
-            # create table parameter in scope
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            # create and initialize Param Variable
-            param = scope.var('table').get_tensor()
-
-            param_array = np.ones((5, 8)).astype("float32")
-            for i in range(len(param_array)):
-                param_array[i] *= param_array[i] * i + pserver_id * 10 + 1
-            param.set(param_array, place)
-
-            optimize_block = program._create_block(program.global_block().idx)
-            program.global_block().append_op(
-                type="listen_and_serv",
-                inputs={'X': []},
-                outputs={},
-                attrs={
-                    "optimize_blocks": [optimize_block],
-                    "endpoint": '127.0.0.1:0',
-                    "Fanin": 1,
-                    "sync_mode": True,
-                    "grad_to_block_id": []
-                })
-
-            exe = fluid.Executor(place)
-            exe.run(program)
-
-
-class TestListenAndServOp(unittest.TestCase):
-    def setUp(self):
-        self.ps_timeout = 5
-
-    def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
-        p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _wait_ps_ready(self, pid):
-        start_left_time = self.ps_timeout
-        sleep_time = 0.5
-        while True:
-            assert start_left_time >= 0, "wait ps ready failed"
-            time.sleep(sleep_time)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                start_left_time -= sleep_time
-
-    def _get_pserver_port(self, pid):
-        with open("/tmp/paddle.%d.port" % pid, 'r') as f:
-            port = int(f.read().strip())
-        return port
-
-    def _run_hsigmoid_op_one_pserver(self, place, port):
-        scope = fluid.core.Scope()
-        program = Program()
-        with fluid.scope_guard(scope):
-            with program_guard(program, startup_program=Program()):
-                x = scope.var('X').get_tensor()
-                x_array = np.random.random((4, 8)).astype("float32") * 2
-                x.set(x_array, place)
-                # create and initialize Param Variable
-                param = scope.var('W').get_tensor()
-                param_array = np.zeros((5, 8)).astype("float32") * 2
-                param.set(param_array, place)
-
-                path_table = scope.var('PathTable').get_tensor()
-                path_table_array = np.array(
-                    [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1),
-                     (0, 2, -1, -1, -1)]).astype(
-                         "int64"
-                     )  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-                path_table.set(path_table_array, place)
-
-                path_code = scope.var('PathCode').get_tensor()
-                path_code_array = np.array(
-                    [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
-                     (0, 1, -1, -1, -1)]).astype("int64")  #np.array to store 
-                path_code.set(path_code_array, place)
-
-                label = scope.var('Label').get_tensor()
-                label_array = np.array([0, 1, 4, 5])
-                label.set(label_array, place)
-
-                bias = scope.var('Bias').get_tensor()
-                bias_array = np.random.random((5, 1)).astype("float32")
-                bias.set(bias_array, place)
-
-                out = scope.var('Out').get_tensor()
-
-                pre_out = scope.var('PreOut').get_tensor
-
-                w_out = scope.var('W_Out').get_tensor()
-                w_out.set(param_array, place)
-
-                emaps = ['127.0.0.1:' + str(port)]
-                table_names = ['table']
-                height_sections = [2]
-
-                # create and run sgd operator
-                hsigmoid_op = Operator(
-                    "hierarchical_sigmoid",
-                    X='X',
-                    W='W',
-                    PathTable='PathTable',
-                    PathCode='PathCode',
-                    Label='Label',
-                    Bias='Bias',
-                    Out='Out',
-                    PreOut='PreOut',
-                    W_Out='W_Out',
-                    remote_prefetch=True,
-                    epmap=emaps,
-                    table_names=table_names,
-                    height_sections=height_sections)
-
-                hsigmoid_op.run(scope, place)
-
-                # get and compare result
-                result_array = np.array(w_out)
-                self.assertEqual(list(result_array.shape), [5, 8])
-                correct = None
-                for i in range(5):
-                    if i != 3:
-                        correct = np.full((1, 8), i + 1).astype("float32")
-                        self.assertTrue((result_array[i] == correct).all())
-                    else:
-                        correct = np.full((1, 8), 0).astype("float32")
-                        self.assertTrue((result_array[i] == correct).all())
-
-    def _run_hsigmoid_op_two_pserver(self, place, port0, port1):
-        scope = fluid.core.Scope()
-        program = Program()
-        with fluid.scope_guard(scope):
-            with program_guard(program, startup_program=Program()):
-                x = scope.var('X').get_tensor()
-                x_array = np.random.random((4, 8)).astype("float32") * 2
-                x.set(x_array, place)
-                # create and initialize Param Variable
-                param = scope.var('W').get_tensor()
-                param_array = np.zeros((5, 8)).astype("float32") * 2
-                param.set(param_array, place)
-
-                path_table = scope.var('PathTable').get_tensor()
-                path_table_array = np.array(
-                    [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
-                     (0, 2, -1, -1, -1)]).astype(
-                         "int64"
-                     )  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-                path_table.set(path_table_array, place)
-
-                path_code = scope.var('PathCode').get_tensor()
-                path_code_array = np.array(
-                    [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
-                     (0, 1, -1, -1, -1)]).astype("int64")  #np.array to store 
-                path_code.set(path_code_array, place)
-
-                label = scope.var('Label').get_tensor()
-                label_array = np.array([0, 1, 4, 5])
-                label.set(label_array, place)
-
-                bias = scope.var('Bias').get_tensor()
-                bias_array = np.random.random((5, 1)).astype("float32")
-                bias.set(bias_array, place)
-
-                out = scope.var('Out').get_tensor()
-
-                pre_out = scope.var('PreOut').get_tensor
-
-                w_out = scope.var('W_Out').get_tensor()
-                w_out.set(param_array, place)
-
-                emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
-                table_names = ['table', 'table']
-                height_sections = [2, 3]
-
-                # create and run sgd operator
-                hsigmoid_op = Operator(
-                    "hierarchical_sigmoid",
-                    X='X',
-                    W='W',
-                    PathTable='PathTable',
-                    PathCode='PathCode',
-                    Label='Label',
-                    Bias='Bias',
-                    Out='Out',
-                    PreOut='PreOut',
-                    W_Out='W_Out',
-                    remote_prefetch=True,
-                    epmap=emaps,
-                    table_names=table_names,
-                    height_sections=height_sections)
-                hsigmoid_op.run(scope, place)
-
-                # get and compare result
-                result_array = np.array(w_out)
-                self.assertEqual(list(result_array.shape), [5, 8])
-                correct = None
-                for i in range(5):
-                    if i < 2:
-                        correct = np.full((1, 8), i + 1).astype("float32")
-                        self.assertTrue((result_array[i] == correct).all())
-                    else:
-                        correct = np.full((1, 8), i + 9).astype("float32")
-                        self.assertTrue((result_array[i] == correct).all())
-
-    def test_hsigmoid_op_remote(self):
-        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
-        # run pserver on CPU in sync mode
-        p0 = self._start_pserver(0, False, True, run_pserver)
-        self._wait_ps_ready(p0.pid)
-        port0 = self._get_pserver_port(p0.pid)
-
-        p1 = self._start_pserver(1, False, True, run_pserver)
-        self._wait_ps_ready(p1.pid)
-        port1 = self._get_pserver_port(p1.pid)
-
-        places = [core.CPUPlace()]
-
-        for place in places:
-            self._run_hsigmoid_op_one_pserver(place, port0)
-            self._run_hsigmoid_op_two_pserver(place, port0, port1)
-
-        # raise SIGTERM to pserver
-        os.kill(p0.pid, signal.SIGINT)
-        p0.join()
-        os.kill(p1.pid, signal.SIGINT)
-        p1.join()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
deleted file mode 100644
index dc617cf18f075a1682f24123537565b68e15ee63..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def huber_loss_forward(val, delta):
-    abs_val = abs(val)
-    if abs_val <= delta:
-        return 0.5 * val * val
-    else:
-        return delta * (abs_val - 0.5 * delta)
-
-
-class TestHuberLossOp(OpTest):
-    def setUp(self):
-        self.op_type = 'huber_loss'
-        self.samples_num = 64
-        self.delta = 1.0
-        self.init_input()
-        residual = self.inputs['Y'].reshape(
-            self.samples_num, 1) - self.inputs['X'].reshape(self.samples_num, 1)
-        loss = np.vectorize(huber_loss_forward)(residual,
-                                                self.delta).astype('float32')
-        self.attrs = {'delta': self.delta}
-        self.outputs = {
-            'Residual': residual,
-            'Out': loss.reshape((self.samples_num, 1))
-        }
-
-    def init_input(self):
-        self.inputs = {
-            'X': np.random.uniform(0, 1.,
-                                   (self.samples_num, 1)).astype('float32'),
-            'Y': np.random.uniform(0, 1.,
-                                   (self.samples_num, 1)).astype('float32'),
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.008)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.008, no_grad_set=set("residual"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.008, no_grad_set=set('residual'))
-
-
-def TestHuberLossOp1(TestHuberLossOp):
-    def init_input(self):
-        self.inputs = {
-            'X': np.random.uniform(0, 1.,
-                                   (self.samples_num, 1)).astype('float32'),
-            'Y': np.random.uniform(0, 1., (self.samples_num)).astype('float32'),
-        }
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
deleted file mode 100644
index 833e46483c2532e283fd672dc56cb93941f5b4ba..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
+++ /dev/null
@@ -1,283 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def get_output_shape(attrs, in_shape, img_real_size):
-    batchsize = in_shape[0]
-    img_height = in_shape[2]
-    img_width = in_shape[3]
-    paddings = np.array(attrs['paddings']).astype("int32")
-    kernels = np.array(attrs['kernels']).astype("int32")
-    strides = np.array(attrs['strides']).astype("int32")
-    output_height = np.zeros((1, batchsize)).astype("int32")
-    output_width = np.zeros((1, batchsize)).astype("int32")
-    if len(img_real_size):
-        out_stride = np.array(attrs['out_stride']).astype("int32")
-        imgreal_h = 0
-        imgreal_w = 0
-        for index in range(batchsize):
-            if img_real_size[index, 0] % out_stride[0] == 0:
-                imgreal_h = img_real_size[index, 0] / out_stride[0]
-            else:
-                imgreal_h = img_real_size[index, 0] / out_stride[0] + 1
-            if img_real_size[index, 0] % out_stride[1] == 0:
-                imgreal_w = img_real_size[index, 1] / out_stride[1]
-            else:
-                imgreal_w = img_real_size[index, 0] / out_stride[1] + 1
-            output_height[0,index] = \
-              1 +  \
-              (imgreal_h + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
-                  strides[0]
-
-            output_width[0,index] = \
-              1 + \
-              (imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
-                  strides[1]
-    else:
-        for index in range(batchsize):
-            output_height[0,index] = \
-              1 +  \
-              (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
-                  strides[0]
-
-            output_width[0,index] = \
-              1 + \
-              (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
-                  strides[1]
-
-    return output_height, output_width
-
-
-def im2col(attrs, im, col):
-    """
-    im: {CHW}
-    col:
-        {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth}
-    """
-    input_channels, input_height, input_width = im.shape
-    output_height, output_width, _, filter_height, filter_width = col.shape
-
-    stride_height, stride_width = attrs['strides']
-    padding_height, padding_width = attrs['paddings'][0:2]
-
-    for col_row_idx in range(0, output_height):
-        for col_col_idx in range(0, output_width):
-            for channel in range(0, input_channels):
-                for filter_row_idx in range(0, filter_height):
-                    for filter_col_idx in range(0, filter_width):
-                        im_row_offset = col_row_idx * stride_height \
-                            + filter_row_idx - padding_height
-
-                        im_col_offset = col_col_idx * stride_width \
-                            + filter_col_idx - padding_width
-
-                        if (im_row_offset < 0 or
-                                im_row_offset >= input_height or
-                                im_col_offset < 0 or
-                                im_col_offset >= input_width):
-                            col[col_row_idx][col_col_idx][channel][\
-                                filter_row_idx][filter_col_idx] = 0.0
-                        else:
-                            im_offset = (channel * input_height + im_row_offset \
-                                         ) * input_width + im_col_offset
-
-                            col[col_row_idx][col_col_idx][channel][\
-                                filter_row_idx][filter_col_idx] = im[channel][ \
-                                    im_row_offset][im_col_offset]
-
-
-def Im2Sequence(inputs, img_real_size, attrs):
-    output_height, output_width = get_output_shape(attrs, inputs.shape,
-                                                   img_real_size)
-    img_channels = inputs.shape[1]
-    batch_size = inputs.shape[0]
-    out = []
-    for index in range(batch_size):
-        tmp = np.zeros([
-            output_height[0, index], output_width[0, index], img_channels,
-            attrs['kernels'][0], attrs['kernels'][1]
-        ]).astype("float32")
-        out.append(tmp)
-    for index in range(len(inputs)):
-        im2col(attrs, inputs[index], out[index])
-        out[index] = out[index].reshape([
-            output_height[0, index] * output_width[0, index],
-            img_channels * attrs['kernels'][0] * attrs['kernels'][1]
-        ])
-    out = np.concatenate(out, axis=0)
-    return out
-
-
-class TestBlockExpandOp(OpTest):
-    def config(self):
-        self.batch_size = 1
-        self.img_channels = 3
-        self.img_height = 4
-        self.img_width = 4
-        self.attrs = {
-            'kernels': [2, 2],
-            'strides': [1, 1],
-            'paddings': [1, 1, 1, 1],
-        }
-
-    def setUp(self):
-        self.config()
-        self.op_type = "im2sequence"
-        x = np.random.uniform(0.1, 1, [
-            self.batch_size, self.img_channels, self.img_height, self.img_width
-        ]).astype("float32")
-
-        real_size = np.array([]).astype("float32")
-        out = Im2Sequence(x, real_size, self.attrs)
-        self.inputs = {'X': x}
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestBlockExpandOpCase2(TestBlockExpandOp):
-    def config(self):
-        self.batch_size = 2
-        self.img_channels = 3
-        self.img_height = 4
-        self.img_width = 5
-        self.attrs = {
-            'kernels': [2, 1],
-            'strides': [2, 1],
-            'paddings': [2, 1, 2, 1],
-        }
-
-
-class TestBlockExpandOpCase3(TestBlockExpandOp):
-    def config(self):
-        self.batch_size = 2
-        self.img_channels = 1
-        self.img_height = 4
-        self.img_width = 5
-        self.attrs = {
-            'kernels': [2, 1],
-            'strides': [2, 1],
-            'paddings': [2, 0, 2, 0],
-        }
-
-
-class TestBlockExpandOpCase4(TestBlockExpandOp):
-    def config(self):
-        self.batch_size = 2
-        self.img_channels = 2
-        self.img_height = 3
-        self.img_width = 3
-        self.attrs = {
-            'kernels': [2, 2],
-            'strides': [1, 1],
-            'paddings': [0, 0, 0, 0],
-        }
-
-
-class TestBlockExpandOpCase5(OpTest):
-    def config(self):
-        self.batch_size = 1
-        self.img_channels = 3
-        self.img_height = 4
-        self.img_width = 5
-        self.attrs = {
-            'kernels': [2, 1],
-            'strides': [2, 1],
-            'paddings': [2, 1, 2, 1],
-            'out_stride': [2, 2],
-        }
-
-    def setUp(self):
-        self.config()
-        self.op_type = "im2sequence"
-        x = np.random.uniform(0.1, 1, [
-            self.batch_size, self.img_channels, self.img_height, self.img_width
-        ]).astype("float32")
-        real_size = np.array([[8, 10], [5, 8]]).astype("float32")
-        out = np.array(Im2Sequence(x, real_size, self.attrs))
-        self.inputs = {'X': x, 'Y': real_size}  #l ??
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestBlockExpandOpCase6(OpTest):
-    def config(self):
-        self.batch_size = 3
-        self.img_channels = 1
-        self.img_height = 4
-        self.img_width = 5
-        self.attrs = {
-            'kernels': [2, 1],
-            'strides': [1, 1],
-            'paddings': [0, 0, 0, 0],
-            'out_stride': [1, 1],
-        }
-
-    def setUp(self):
-        self.config()
-        self.op_type = "im2sequence"
-        x = np.random.uniform(0.1, 1, [
-            self.batch_size, self.img_channels, self.img_height, self.img_width
-        ]).astype("float32")
-        real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32")
-        out = np.array(Im2Sequence(x, real_size, self.attrs))
-        self.inputs = {'X': x, 'Y': real_size}  #l ??
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestBlockExpandOpCase7(OpTest):
-    def config(self):
-        self.batch_size = 2
-        self.img_channels = 2
-        self.img_height = 3
-        self.img_width = 3
-        self.attrs = {
-            'kernels': [2, 2],
-            'strides': [1, 1],
-            'paddings': [1, 0, 1, 0],
-            'out_stride': [2, 2],
-        }
-
-    def setUp(self):
-        self.config()
-        self.op_type = "im2sequence"
-        x = np.random.uniform(0.1, 1, [
-            self.batch_size, self.img_channels, self.img_height, self.img_width
-        ]).astype("float32")
-        real_size = np.array([[6, 6], [4, 4]]).astype("float32")
-        out = np.array(Im2Sequence(x, real_size, self.attrs))
-        self.inputs = {'X': x, 'Y': real_size}
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
-#set shiftwidth=4 set expandtab set tabstop=4
diff --git a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
deleted file mode 100644
index 405637969af6fb515a24ecb077e470279c3ffc24..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid as fluid
-import paddle.fluid.nets as nets
-from paddle.fluid.framework import Program
-
-
-def conv_block(input, num_filter, groups, dropouts):
-    return nets.img_conv_group(
-        input=input,
-        pool_size=2,
-        pool_stride=2,
-        conv_num_filter=[num_filter] * groups,
-        conv_filter_size=3,
-        conv_act='relu',
-        conv_with_batchnorm=True,
-        conv_batchnorm_drop_rate=dropouts,
-        pool_type='max')
-
-
-class TestLayer(unittest.TestCase):
-    def test_batch_norm_layer(self):
-        main_program = Program()
-        startup_program = Program()
-        with fluid.program_guard(main_program, startup_program):
-            images = fluid.layers.data(
-                name='pixel', shape=[3, 48, 48], dtype='float32')
-            hidden1 = fluid.layers.batch_norm(input=images)
-            hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu')
-            fluid.layers.batch_norm(input=hidden2)
-
-        print(str(main_program))
-
-    def test_dropout_layer(self):
-        main_program = Program()
-        startup_program = Program()
-        with fluid.program_guard(main_program, startup_program):
-            images = fluid.layers.data(
-                name='pixel', shape=[3, 48, 48], dtype='float32')
-            fluid.layers.dropout(x=images, dropout_prob=0.5)
-
-        print(str(main_program))
-
-    def test_img_conv_group(self):
-        main_program = Program()
-        startup_program = Program()
-
-        with fluid.program_guard(main_program, startup_program):
-            images = fluid.layers.data(
-                name='pixel', shape=[3, 48, 48], dtype='float32')
-            conv1 = conv_block(images, 64, 2, [0.3, 0])
-            conv_block(conv1, 256, 3, [0.4, 0.4, 0])
-
-        print(str(main_program))
-
-    def test_elementwise_add_with_act(self):
-        main_program = Program()
-        startup_program = Program()
-        with fluid.program_guard(main_program, startup_program):
-            image1 = fluid.layers.data(
-                name='pixel1', shape=[3, 48, 48], dtype='float32')
-            image2 = fluid.layers.data(
-                name='pixel2', shape=[3, 48, 48], dtype='float32')
-            fluid.layers.elementwise_add(x=image1, y=image2, act='relu')
-        print(main_program)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
deleted file mode 100644
index ac849e1cfb856b426f3088b10a06a0afb237568e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ /dev/null
@@ -1,336 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import paddle.fluid as fluid
-import numpy as np
-
-
-class AutoPruneLayer0(fluid.Layer):
-    def __init__(self, name_scope):
-        super(AutoPruneLayer0, self).__init__(name_scope)
-        self.fc1 = fluid.dygraph.FC(
-            "FC_1",
-            5,
-            param_attr=fluid.initializer.ConstantInitializer(value=2),
-            bias_attr=False)
-        self.fc2 = fluid.dygraph.FC(
-            "FC_2",
-            5,
-            param_attr=fluid.initializer.ConstantInitializer(value=2),
-            bias_attr=False)
-
-    def forward(self, x, y):
-        a = self.fc1(x)
-        b = self.fc2(y)
-        c = fluid.layers.mul(a, b)
-        d = fluid.layers.reduce_mean(c)
-        return d
-
-
-class AutoPruneLayer1(fluid.Layer):
-    def __init__(self, name_scope):
-        super(AutoPruneLayer1, self).__init__(name_scope)
-        self.fc1 = fluid.dygraph.FC(
-            "FC_1",
-            5,
-            param_attr=fluid.initializer.ConstantInitializer(value=2),
-            bias_attr=False)
-        self.fc2 = fluid.dygraph.FC(
-            "FC_2",
-            5,
-            param_attr=fluid.initializer.ConstantInitializer(value=2),
-            bias_attr=False)
-
-    def forward(self, x, y):
-        a = self.fc1(x)
-        b = self.fc2(y)
-        b.stop_gradient = True
-        c = fluid.layers.mul(a, b)
-        d = fluid.layers.reduce_mean(c)
-        return d
-
-
-class AutoPruneLayer2(fluid.Layer):
-    def __init__(self, name_scope):
-        super(AutoPruneLayer2, self).__init__(name_scope)
-        self.fc = fluid.dygraph.FC("FC1", size=10, act=None)
-        self.fc2 = fluid.dygraph.FC("FC2", size=1, act=None)
-
-    def forward(self, x, label):
-        feature = self.fc(x)
-        label = self.fc2(label)
-        label = fluid.layers.cast(label, dtype="float32")
-        label = fluid.layers.cast(label, dtype='int64')
-        # Note that the label is not persistable in fluid.layers.cross_entropy.
-        loss = fluid.layers.cross_entropy(input=feature, label=label)
-        loss = fluid.layers.mean(loss)
-        return loss
-
-
-class AutoPruneLayer3(fluid.Layer):
-    def __init__(self, name_scope):
-        super(AutoPruneLayer3, self).__init__(name_scope)
-        self.fc = fluid.dygraph.FC("FC1", size=20, act=None)
-
-    def forward(self, x, label, test_num):
-        feature = self.fc(x)
-        part1, part2 = fluid.layers.split(
-            feature, num_or_sections=[10, 10], dim=1)
-        # Note that: part2 is not used.
-        loss = fluid.layers.cross_entropy(input=part1, label=label)
-        loss = fluid.layers.mean(loss)
-        if test_num == 1:
-            return loss, part2
-        else:
-            return loss, part1, part2
-
-
-class MyLayer(fluid.Layer):
-    def __init__(self, name_scope, vocab_size, size, dtype="float32"):
-        super(MyLayer, self).__init__(name_scope, dtype)
-        self.embed0 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
-        self.embed1 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
-        self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype)
-        self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype)
-
-    def forward(self, x):
-        # this method involves only the fc layers
-        loss = fluid.layers.reduce_mean(self.fc0(x) + self.fc1(x))
-        return loss
-
-    def linear0(self, x):
-        loss = fluid.layers.reduce_mean(self.fc0(x))
-        return loss
-
-    def embed_linear0(self, x):
-        loss = fluid.layers.reduce_mean(self.fc0(self.embed0(x)))
-        return loss
-
-
-class MyLayer2(fluid.Layer):
-    def __init__(self, name_scope, vocab_size, size, dtype="float32"):
-        super(MyLayer2, self).__init__(name_scope, dtype)
-        self.embed0 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
-        self.embed1 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
-        self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype)
-        self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype)
-
-    def forward(self, indices):
-        # mind the difference with MyLayer
-        # In this example, the forward method involes all params
-        loss = fluid.layers.reduce_mean(
-            self.fc0(self.embed0(indices)) + self.fc1(self.embed1(indices)))
-        return loss
-
-    def linear0(self, x):
-        loss = fluid.layers.reduce_mean(self.fc0(x))
-        return loss
-
-    def embed_linear0(self, x):
-        loss = fluid.layers.reduce_mean(self.fc0(self.embed0(x)))
-        return loss
-
-
-class TestImperativeAutoPrune(unittest.TestCase):
-    def test_auto_prune(self):
-        with fluid.dygraph.guard():
-            case1 = AutoPruneLayer0("l1")
-            value1 = np.arange(25).reshape(5, 5).astype("float32")
-            value2 = np.arange(25).reshape(5, 5).astype("float32")
-            v1 = fluid.dygraph.to_variable(value1)
-            v2 = fluid.dygraph.to_variable(value2)
-            loss = case1(v1, v2)
-            loss.backward()
-            self.assertTrue(case1.fc2._w._ivar._grad_ivar() is not None)
-            self.assertTrue(case1.fc1._w._ivar._grad_ivar() is not None)
-
-    def test_auto_prune2(self):
-        with fluid.dygraph.guard():
-            case2 = AutoPruneLayer1("l1")
-            value1 = np.arange(25).reshape(5, 5).astype("float32")
-            value2 = np.arange(25).reshape(5, 5).astype("float32")
-            v1 = fluid.dygraph.to_variable(value1)
-            v2 = fluid.dygraph.to_variable(value2)
-            loss = case2(v1, v2)
-            loss.backward()
-            self.assertTrue(case2.fc2._w._ivar._grad_ivar() is None)
-            self.assertTrue(case2.fc1._w._ivar._grad_ivar() is not None)
-
-    def test_auto_prune3(self):
-        with fluid.dygraph.guard():
-            case3 = AutoPruneLayer3("l3")
-            value1 = np.arange(784).reshape(1, 784).astype("float32")
-            value2 = np.arange(1).reshape(1, 1).astype("int64")
-            v1 = fluid.dygraph.to_variable(value1)
-            v2 = fluid.dygraph.to_variable(value2)
-            loss, part2 = case3(v1, v2, 1)
-            loss.backward()
-            self.assertTrue(case3.fc._w._ivar._grad_ivar() is not None)
-            self.assertTrue((part2.gradient() == 0).all())
-
-    def test_auto_prune4(self):
-        with fluid.dygraph.guard():
-            case4 = AutoPruneLayer3("l3")
-            value1 = np.arange(784).reshape(1, 784).astype("float32")
-            value2 = np.arange(1).reshape(1, 1).astype("int64")
-            v1 = fluid.dygraph.to_variable(value1)
-            v2 = fluid.dygraph.to_variable(value2)
-            loss, part2 = case4(v1, v2, 1)
-            part2.backward()
-            self.assertTrue(case4.fc._w._ivar._grad_ivar() is not None)
-            self.assertTrue((part2.gradient() == 1).all())
-
-    def test_auto_prune5(self):
-        with fluid.dygraph.guard():
-            case4 = AutoPruneLayer3("l3")
-            value1 = np.arange(784).reshape(1, 784).astype("float32")
-            value2 = np.arange(1).reshape(1, 1).astype("int64")
-            v1 = fluid.dygraph.to_variable(value1)
-            v2 = fluid.dygraph.to_variable(value2)
-            loss, part1, part2 = case4(v1, v2, 2)
-            part1.backward()
-            self.assertTrue(case4.fc._w._ivar._grad_ivar() is not None)
-            self.assertTrue((part2.gradient() == 0).all())
-
-    def test_auto_prune6(self):
-        with fluid.dygraph.guard():
-            value0 = np.arange(26).reshape(2, 13).astype("float32")
-            value1 = np.arange(6).reshape(2, 3).astype("float32")
-            value2 = np.arange(10).reshape(2, 5).astype("float32")
-            fc = fluid.FC("fc1", size=5, dtype="float32")
-            fc2 = fluid.FC("fc2", size=3, dtype="float32")
-            a = fluid.dygraph.to_variable(value0)
-            b = fluid.dygraph.to_variable(value1)
-            c = fluid.dygraph.to_variable(value2)
-            out1 = fc(a)
-            out2 = fc2(b)
-            out1.stop_gradient = True
-            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
-            out.backward()
-            self.assertTrue((fc._w.gradient() == 0).all())
-            self.assertTrue((out1.gradient() == 0).all())
-
-    def test_auto_prune7(self):
-        with fluid.dygraph.guard():
-            value0 = np.arange(26).reshape(2, 13).astype("float32")
-            value1 = np.arange(6).reshape(2, 3).astype("float32")
-            value2 = np.arange(10).reshape(2, 5).astype("float32")
-            fc = fluid.FC("fc1", size=5, dtype="float32")
-            fc2 = fluid.FC("fc2", size=3, dtype="float32")
-            a = fluid.dygraph.to_variable(value0)
-            b = fluid.dygraph.to_variable(value1)
-            c = fluid.dygraph.to_variable(value2)
-            out1 = fc(a)
-            out2 = fc2(b)
-            out1.stop_gradient = True
-            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            out.backward(backward_strategy)
-            self.assertTrue((fc._w.gradient() == 0).all())
-            self.assertTrue((out1.gradient() == 0).all())
-
-    def test_auto_prune_with_optimizer(self):
-        vocab_size = 100
-        size = 20
-        batch_size = 16
-
-        indices = np.random.randint(
-            low=0, high=100, size=(batch_size, 1)).astype("int64")
-        embed = np.random.randn(batch_size, size).astype("float32")
-
-        place = fluid.CPUPlace()
-        with fluid.dygraph.guard(place):
-            model = MyLayer("mylayer", vocab_size, size)
-            optimizer = fluid.optimizer.AdamOptimizer(0.001)
-            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
-
-            indices = fluid.dygraph.to_variable(indices)
-            emebd = fluid.dygraph.to_variable(embed)
-            dummy_loss = model(embed)
-
-            loss = model.embed_linear0(indices)
-            loss.backward()
-            _, params_grads = optimizer.minimize(loss, grad_clip=grad_clip)
-            for items in params_grads:
-                assert items[0].name is not model.embed1._w.name
-                assert items[0].name is not model.fc1._w.name
-            assert model.embed1._w._ivar._grad_ivar() is None
-            assert model.fc1._w._ivar._grad_ivar() is None
-
-        with fluid.dygraph.guard(place):
-            model = MyLayer2("mylayer", vocab_size, size)
-            optimizer = fluid.optimizer.AdamOptimizer(0.001)
-            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
-
-            indices = fluid.dygraph.to_variable(indices)
-            emebd = fluid.dygraph.to_variable(embed)
-            dummy_loss = model(indices)
-
-            loss = model.embed_linear0(indices)
-            loss.backward()
-            optimizer.minimize(loss, grad_clip=grad_clip)
-            for items in params_grads:
-                assert items[0].name is not model.embed1._w.name
-                assert items[0].name is not model.fc1._w.name
-            assert model.embed1._w._ivar._grad_ivar() is None
-            assert model.fc1._w._ivar._grad_ivar() is None
-
-    def test_case2_prune_no_grad_branch(self):
-        with fluid.dygraph.guard():
-            value1 = np.arange(784).reshape(1, 784)
-            value2 = np.arange(1).reshape(1, 1)
-            v1 = fluid.dygraph.to_variable(value1).astype("float32")
-            v2 = fluid.dygraph.to_variable(value2).astype("float32")
-            case3 = AutoPruneLayer2("l2")
-            loss = case3(v1, v2)
-            loss.backward()
-            self.assertTrue(case3.fc2._w._ivar._grad_ivar() is None)
-            self.assertTrue(case3.fc._w._ivar._grad_ivar() is not None)
-
-    def test_case2_prune_no_grad_branch(self):
-        with fluid.dygraph.guard():
-            value1 = np.arange(784).reshape(1, 784)
-            value2 = np.arange(1).reshape(1, 1)
-            v1 = fluid.dygraph.to_variable(value1).astype("float32")
-            v2 = fluid.dygraph.to_variable(value2).astype("float32")
-            case3 = AutoPruneLayer2("l2")
-            loss = case3(v1, v2)
-            loss.backward()
-            self.assertTrue(case3.fc2._w._ivar._grad_ivar() is None)
-            self.assertTrue(case3.fc._w._ivar._grad_ivar() is not None)
-
-    def test_case3_prune_no_grad_branch2(self):
-        with fluid.dygraph.guard():
-            value1 = np.arange(1).reshape(1, 1)
-            fc = fluid.dygraph.FC("FC1", size=1, act=None)
-            label = fluid.dygraph.to_variable(value1).astype("float32")
-            label = fc(label)
-            label = fluid.layers.cast(label, dtype="float32")
-            label = fluid.layers.cast(label, dtype='int64')
-            out = fluid.layers.one_hot(input=label, depth=100)
-            loss = fluid.layers.mean(out)
-            loss.backward()
-            self.assertTrue(fc._w._ivar._grad_ivar() is None)
-
-    def test_case4_with_no_grad_op_maker(self):
-        with fluid.dygraph.guard():
-            out = fluid.layers.gaussian_random(shape=[20, 30])
-            loss = fluid.layers.mean(out)
-            loss.backward()
-            self.assertTrue(out._ivar._grad_ivar() is None)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_base.py b/python/paddle/fluid/tests/unittests/test_imperative_base.py
deleted file mode 100644
index 1dd5348a8852d78fde73ab9ddf9d0015e903cb3f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_base.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-from paddle.fluid import core
-
-
-@contextlib.contextmanager
-def new_program_scope(main=None, startup=None, scope=None):
-    prog = main if main else fluid.Program()
-    startup_prog = startup if startup else fluid.Program()
-    scope = scope if scope else fluid.core.Scope()
-    with fluid.scope_guard(scope):
-        with fluid.program_guard(prog, startup_prog):
-            with fluid.unique_name.guard():
-                yield
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
deleted file mode 100644
index acfc1e75c0fb5b3d5709896d7efb64699e0d62d2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ /dev/null
@@ -1,423 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid import FC
-from test_imperative_base import new_program_scope
-
-
-class MyLayer(fluid.Layer):
-    def __init__(self, name_scope):
-        super(MyLayer, self).__init__(name_scope)
-
-    def forward(self, inputs):
-        x = fluid.layers.relu(inputs)
-        self._x_for_debug = x
-        x = fluid.layers.elementwise_mul(x, x)
-        x = fluid.layers.reduce_sum(x)
-        return [x]
-
-
-class MLP(fluid.Layer):
-    def __init__(self, name_scope):
-        super(MLP, self).__init__(name_scope)
-        self._fc1 = FC(self.full_name(),
-                       3,
-                       param_attr=fluid.ParamAttr(
-                           initializer=fluid.initializer.Constant(value=0.1)),
-                       bias_attr=fluid.ParamAttr(
-                           initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = FC(self.full_name(),
-                       4,
-                       param_attr=fluid.ParamAttr(
-                           initializer=fluid.initializer.Constant(value=0.1)),
-                       bias_attr=fluid.ParamAttr(
-                           initializer=fluid.initializer.Constant(value=0.1)))
-
-    def forward(self, inputs):
-        x = self._fc1(inputs)
-        x = self._fc2(x)
-        x = fluid.layers.reduce_sum(x)
-        return x
-
-
-class SimpleRNNCell(fluid.Layer):
-    def __init__(self, name_scope, step_input_size, hidden_size, output_size,
-                 param_attr):
-        super(SimpleRNNCell, self).__init__(name_scope)
-        self.step_input_size = step_input_size
-        self.hidden_size = hidden_size
-        self.output_size = output_size
-        self._dtype = core.VarDesc.VarType.FP32
-        self.param_attr = param_attr
-
-    def _build_once(self, inputs, pre_hidden):
-        i2h_param_shape = [self.step_input_size, self.hidden_size]
-        h2h_param_shape = [self.hidden_size, self.hidden_size]
-        h2o_param_shape = [self.output_size, self.hidden_size]
-        self._i2h_w = self.create_parameter(
-            attr=self.param_attr,
-            shape=i2h_param_shape,
-            dtype=self._dtype,
-            is_bias=False)
-        self._h2h_w = self.create_parameter(
-            attr=self.param_attr,
-            shape=h2h_param_shape,
-            dtype=self._dtype,
-            is_bias=False)
-        self._h2o_w = self.create_parameter(
-            attr=self.param_attr,
-            shape=h2o_param_shape,
-            dtype=self._dtype,
-            is_bias=False)
-
-    def forward(self, input, pre_hidden):
-
-        tmp_i2h = self.create_variable(dtype=self._dtype)
-        tmp_h2h = self.create_variable(dtype=self._dtype)
-        hidden = self.create_variable(dtype=self._dtype)
-        out = self.create_variable(dtype=self._dtype)
-        softmax_out = self.create_variable(dtype=self._dtype)
-        reduce_out = self.create_variable(dtype=self._dtype)
-        self._helper.append_op(
-            type="mul",
-            inputs={"X": input,
-                    "Y": self._i2h_w},
-            outputs={"Out": tmp_i2h},
-            attrs={"x_num_col_dims": 1,
-                   "y_num_col_dims": 1})
-
-        self._helper.append_op(
-            type="mul",
-            inputs={"X": pre_hidden,
-                    "Y": self._h2h_w},
-            outputs={"Out": tmp_h2h},
-            attrs={"x_num_col_dims": 1,
-                   "y_num_col_dims": 1})
-
-        self._helper.append_op(
-            type="elementwise_add",
-            inputs={'X': tmp_h2h,
-                    'Y': tmp_i2h},
-            outputs={'Out': hidden},
-            attrs={'axis': -1,
-                   'use_mkldnn': False})
-        hidden = self._helper.append_activation(hidden, act='tanh')
-
-        self._helper.append_op(
-            type="mul",
-            inputs={"X": hidden,
-                    "Y": self._h2o_w},
-            outputs={"Out": out},
-            attrs={"x_num_col_dims": 1,
-                   "y_num_col_dims": 1})
-
-        self._helper.append_op(
-            type="softmax",
-            inputs={"X": out},
-            outputs={"Out": softmax_out},
-            attrs={"use_cudnn": False})
-
-        self._helper.append_op(
-            type='reduce_sum',
-            inputs={'X': softmax_out},
-            outputs={'Out': reduce_out},
-            attrs={'dim': [],
-                   'keep_dim': False,
-                   'reduce_all': True})
-
-        return reduce_out, hidden
-
-
-class SimpleRNN(fluid.Layer):
-    def __init__(self, name_scope):
-        super(SimpleRNN, self).__init__(name_scope)
-        self.seq_len = 4
-        self._cell = SimpleRNNCell(
-            self.full_name(),
-            3,
-            3,
-            3,
-            fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.1)))
-
-    def forward(self, inputs):
-        outs = list()
-        pre_hiddens = list()
-
-        init_hidden = self.create_parameter(
-            attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            shape=[1, 3],
-            dtype='float32',
-            is_bias=False)
-        pre_hidden = init_hidden
-        for i in range(self.seq_len):
-            input = fluid.layers.slice(
-                inputs, axes=[1], starts=[i], ends=[i + 1])
-            input = fluid.layers.reshape(input, shape=[1, 3])
-            out_softmax, pre_hidden = self._cell(input, pre_hidden)
-            outs.append(out_softmax)
-
-        return outs, pre_hiddens
-
-
-class TestImperative(unittest.TestCase):
-    def test_sum_op(self):
-        x = np.ones([2, 2], np.float32)
-        with fluid.dygraph.guard():
-            inputs = []
-            for _ in range(10):
-                tmp = fluid.dygraph.base.to_variable(x)
-                tmp.stop_gradient = False
-                inputs.append(tmp)
-            ret = fluid.layers.sums(inputs)
-            loss = fluid.layers.reduce_sum(ret)
-            loss.backward()
-        with fluid.dygraph.guard():
-            inputs2 = []
-            for _ in range(10):
-                tmp = fluid.dygraph.base.to_variable(x)
-                tmp.stop_gradient = False
-                inputs2.append(tmp)
-            ret2 = fluid.layers.sums(inputs2)
-            loss2 = fluid.layers.reduce_sum(ret2)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            loss2.backward(backward_strategy)
-
-            self.assertTrue(np.allclose(ret.numpy(), x * 10))
-            self.assertTrue(np.allclose(inputs[0].gradient(), x))
-            self.assertTrue(np.allclose(ret2.numpy(), x * 10))
-            a = inputs2[0].gradient()
-            self.assertTrue(np.allclose(inputs2[0].gradient(), x))
-
-    def test_layer(self):
-        with fluid.dygraph.guard():
-            cl = core.Layer()
-            cl.forward([])
-            l = fluid.Layer("l")
-            self.assertRaises(NotImplementedError, l.forward, [])
-
-    def test_layer_in_out(self):
-        np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
-        with fluid.dygraph.guard():
-            var_inp = fluid.dygraph.base.to_variable(np_inp)
-            var_inp.stop_gradient = False
-            l = MyLayer("my_layer")
-            x = l(var_inp)[0]
-            self.assertIsNotNone(x)
-            dy_out = x.numpy()
-            x.backward()
-            dy_grad = l._x_for_debug.gradient()
-
-        with fluid.dygraph.guard():
-            var_inp2 = fluid.dygraph.base.to_variable(np_inp)
-            var_inp2.stop_gradient = False
-            l2 = MyLayer("my_layer")
-            x2 = l2(var_inp2)[0]
-            self.assertIsNotNone(x2)
-            dy_out2 = x2.numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            x2.backward(backward_strategy)
-            dy_grad2 = l2._x_for_debug.gradient()
-
-        with new_program_scope():
-            inp = fluid.layers.data(
-                name="inp", shape=[3], append_batch_size=False)
-            l = MyLayer("my_layer")
-            x = l(inp)[0]
-            param_grads = fluid.backward.append_backward(
-                x, parameter_list=[l._x_for_debug.name])[0]
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            static_out, static_grad = exe.run(
-                feed={inp.name: np_inp},
-                fetch_list=[x.name, param_grads[1].name])
-
-        self.assertTrue(np.allclose(dy_out, static_out))
-        self.assertTrue(np.allclose(dy_grad, static_grad))
-        self.assertTrue(np.allclose(dy_out2, static_out))
-        self.assertTrue(np.allclose(dy_grad2, static_grad))
-
-    def test_mlp(self):
-        np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-        with fluid.dygraph.guard():
-            var_inp = fluid.dygraph.base.to_variable(np_inp)
-            mlp = MLP("mlp")
-            out = mlp(var_inp)
-            dy_out = out.numpy()
-            out.backward()
-            dy_grad = mlp._fc1._w.gradient()
-
-        with fluid.dygraph.guard():
-            var_inp2 = fluid.dygraph.base.to_variable(np_inp)
-            mlp2 = MLP("mlp")
-            out2 = mlp2(var_inp2)
-            dy_out2 = out2.numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            out2.backward(backward_strategy)
-            dy_grad2 = mlp2._fc1._w.gradient()
-
-        with new_program_scope():
-            inp = fluid.layers.data(
-                name="inp", shape=[2, 2], append_batch_size=False)
-            mlp = MLP("mlp")
-            out = mlp(inp)
-            param_grads = fluid.backward.append_backward(
-                out, parameter_list=[mlp._fc1._w.name])[0]
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-            exe.run(fluid.default_startup_program())
-
-            static_out, static_grad = exe.run(
-                feed={inp.name: np_inp},
-                fetch_list=[out.name, param_grads[1].name])
-
-        self.assertTrue(np.allclose(dy_out, static_out))
-        self.assertTrue(np.allclose(dy_grad, static_grad))
-        self.assertTrue(np.allclose(dy_out2, static_out))
-        self.assertTrue(np.allclose(dy_grad2, static_grad))
-
-        params = mlp.parameters(True)
-        self.assertEqual("mlp/MLP_0/FC_0.w_0", params[0].name)
-        self.assertEqual("mlp/MLP_0/FC_0.b_0", params[1].name)
-        self.assertEqual("mlp/MLP_0/FC_1.w_0", params[2].name)
-        self.assertEqual("mlp/MLP_0/FC_1.b_0", params[3].name)
-        self.assertEqual(len(params), 4)
-
-        sublayers = mlp.sublayers(True)
-        self.assertEqual(mlp._fc1, sublayers[0])
-        self.assertEqual(mlp._fc2, sublayers[1])
-        self.assertEqual(len(sublayers), 2)
-
-    def test_dygraph_vs_static(self):
-        inp1 = np.random.rand(4, 3, 3)
-        inp2 = np.random.rand(4, 3, 3)
-
-        # dynamic graph
-        with fluid.dygraph.guard():
-            if np.sum(inp1) < np.sum(inp2):
-                x = fluid.layers.elementwise_add(inp1, inp2)
-            else:
-                x = fluid.layers.elementwise_sub(inp1, inp2)
-            dygraph_result = x.numpy()
-
-        # static graph
-        with new_program_scope():
-            inp_data1 = fluid.layers.data(
-                name='inp1', shape=[3, 3], dtype=np.float32)
-            inp_data2 = fluid.layers.data(
-                name='inp2', shape=[3, 3], dtype=np.float32)
-
-            a = fluid.layers.expand(
-                fluid.layers.reshape(
-                    fluid.layers.reduce_sum(inp_data1), [1, 1]), [4, 1])
-            b = fluid.layers.expand(
-                fluid.layers.reshape(
-                    fluid.layers.reduce_sum(inp_data2), [1, 1]), [4, 1])
-            cond = fluid.layers.less_than(x=a, y=b)
-
-            ie = fluid.layers.IfElse(cond)
-            with ie.true_block():
-                d1 = ie.input(inp_data1)
-                d2 = ie.input(inp_data2)
-                d3 = fluid.layers.elementwise_add(d1, d2)
-                ie.output(d3)
-
-            with ie.false_block():
-                d1 = ie.input(inp_data1)
-                d2 = ie.input(inp_data2)
-                d3 = fluid.layers.elementwise_sub(d1, d2)
-                ie.output(d3)
-            out = ie()
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-            static_result = exe.run(fluid.default_main_program(),
-                                    feed={'inp1': inp1,
-                                          'inp2': inp2},
-                                    fetch_list=out)[0]
-        self.assertTrue(np.allclose(dygraph_result, static_result))
-
-    def test_rnn(self):
-        np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0],
-                           [10.0, 11.0, 12.0]])
-        np_inp = np_inp.reshape((1, 4, 3))
-        np_inp = np_inp.astype(np.float32)
-        with fluid.dygraph.guard():
-            var_inp = fluid.dygraph.base.to_variable(np_inp)
-            var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
-            simple_rnn = SimpleRNN("simple_rnn")
-            outs, pre_hiddens = simple_rnn.forward(var_inp)
-            dy_out = outs[3].numpy()
-            outs[3].backward()
-            dy_grad_h2o = simple_rnn._cell._h2o_w.gradient()
-            dy_grad_h2h = simple_rnn._cell._h2h_w.gradient()
-            dy_grad_i2h = simple_rnn._cell._i2h_w.gradient()
-
-        with fluid.dygraph.guard():
-            var_inp2 = fluid.dygraph.base.to_variable(np_inp)
-            var_inp2 = fluid.layers.reshape(var_inp2, shape=[1, 4, 3])
-            simple_rnn2 = SimpleRNN("simple_rnn")
-            outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2)
-            dy_out2 = outs2[3].numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            outs2[3].backward(backward_strategy)
-            dy_grad_h2o2 = simple_rnn2._cell._h2o_w.gradient()
-            dy_grad_h2h2 = simple_rnn2._cell._h2h_w.gradient()
-            dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient()
-
-        with new_program_scope():
-            inp = fluid.layers.data(
-                name="inp", shape=[1, 4, 3], append_batch_size=False)
-            simple_rnn = SimpleRNN("simple_rnn")
-            outs, pre_hiddens = simple_rnn(inp)
-            param_grads = fluid.backward.append_backward(outs[3])
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(fluid.default_startup_program())
-            static_out, static_grad_h2o, static_grad_h2h, static_grad_i2h = exe.run(
-                feed={inp.name: np_inp},
-                fetch_list=[
-                    outs[3].name, param_grads[0][1].name,
-                    param_grads[1][1].name, param_grads[2][1].name
-                ])
-
-        self.assertTrue(np.allclose(dy_out, static_out))
-        self.assertTrue(np.allclose(dy_grad_h2o, static_grad_h2o))
-        self.assertTrue(np.allclose(dy_grad_h2h, static_grad_h2h))
-        self.assertTrue(np.allclose(dy_grad_i2h, static_grad_i2h))
-        self.assertTrue(np.allclose(dy_out2, static_out))
-        self.assertTrue(np.allclose(dy_grad_h2o2, static_grad_h2o))
-        self.assertTrue(np.allclose(dy_grad_h2h2, static_grad_h2h))
-        self.assertTrue(np.allclose(dy_grad_i2h2, static_grad_i2h))
-
-    def test_layer_attrs(self):
-        layer = fluid.dygraph.Layer("test")
-        layer.test_attr = 1
-        self.assertFalse(hasattr(layer, "whatever"))
-        self.assertTrue(hasattr(layer, "test_attr"))
-        self.assertEqual(layer.test_attr, 1)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
deleted file mode 100644
index 609662cf9880795b7f1ff57efb1205ac1eda0e72..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid import Conv2D, Pool2D, FC, core
-from paddle.fluid.dygraph.base import to_variable
-
-
-class SimpleImgConvPool(fluid.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 pool_size,
-                 pool_stride,
-                 pool_padding=0,
-                 pool_type='max',
-                 global_pooling=False,
-                 conv_stride=1,
-                 conv_padding=0,
-                 conv_dilation=1,
-                 conv_groups=1,
-                 act=None,
-                 use_cudnn=False,
-                 param_attr=None,
-                 bias_attr=None):
-        super(SimpleImgConvPool, self).__init__(name_scope)
-
-        self._conv2d = Conv2D(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=None,
-            bias_attr=None,
-            use_cudnn=use_cudnn)
-
-        self._pool2d = Pool2D(
-            self.full_name(),
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
-
-    def forward(self, inputs):
-        x = self._conv2d(inputs)
-        x = self._pool2d(x)
-        return x
-
-
-class MNIST(fluid.Layer):
-    def __init__(self, name_scope):
-        super(MNIST, self).__init__(name_scope)
-
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 20, 5, 2, 2, act="relu")
-
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 50, 5, 2, 2, act="relu")
-
-        pool_2_shape = 50 * 4 * 4
-        SIZE = 10
-        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(self.full_name(),
-                      10,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.NormalInitializer(
-                              loc=0.0, scale=scale)),
-                      act="softmax")
-
-    def forward(self, inputs):
-        x = self._simple_img_conv_pool_1(inputs)
-        x = self._simple_img_conv_pool_2(x)
-        x = self._fc(x)
-        return x
-
-
-class TestDygraphCheckpoint(unittest.TestCase):
-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                image = np.array(item[0]).reshape(1, 28, 28)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield image, label
-
-        return _reader_imple
-
-    def test_save_load_persistables(self):
-        seed = 90
-        epoch_num = 1
-        batch_size = 128
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            mnist = MNIST("mnist")
-            sgd = SGDOptimizer(learning_rate=1e-3)
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(paddle.dataset.mnist.train()),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
-
-            dy_param_init_value = {}
-
-            for epoch in range(epoch_num):
-                for batch_id, data in enumerate(batch_py_reader()):
-                    img = data[0]
-                    label = data[1]
-                    label.stop_gradient = True
-
-                    cost = mnist(img)
-                    loss = fluid.layers.cross_entropy(cost, label)
-                    avg_loss = fluid.layers.mean(loss)
-
-                    dy_out = avg_loss.numpy()
-
-                    avg_loss.backward()
-                    sgd.minimize(avg_loss)
-                    fluid.dygraph.save_persistables(mnist.state_dict(),
-                                                    "save_dir")
-                    mnist.clear_gradients()
-
-                    for param in mnist.parameters():
-                        dy_param_init_value[param.name] = param.numpy()
-
-                    restore, _ = fluid.dygraph.load_persistables("save_dir")
-
-                    self.assertRaises(IOError, fluid.dygraph.load_persistables,
-                                      "not_exist_dir")
-
-                    mnist.load_dict(restore)
-
-                    self.assertEqual(len(dy_param_init_value), len(restore))
-                    for ky, value in restore.items():
-                        self.assertTrue(
-                            np.allclose(value.numpy(), dy_param_init_value[
-                                value.name]))
-                        self.assertTrue(np.isfinite(value.numpy().all()))
-                        self.assertFalse(np.isnan(value.numpy().any()))
-
-                    if batch_id > 10:
-                        break
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py b/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py
deleted file mode 100644
index 1b201fc7f15f629cf03adadd0899f37b75cc8134..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import numpy as np
-
-
-class MLP(fluid.Layer):
-    def __init__(self, name_scope):
-        super(MLP, self).__init__(name_scope)
-        self._fc1 = fluid.dygraph.FC(
-            self.full_name(),
-            3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = fluid.dygraph.FC(
-            self.full_name(),
-            4,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
-
-    def forward(self, inputs):
-        x = self._fc1(inputs)
-        x = self._fc2(x)
-        x = fluid.layers.reduce_sum(x)
-        return x
-
-
-class TestDygraphDebugString(unittest.TestCase):
-    def test_dygraph_debug_string(self):
-        np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-        unique_name = 0
-        trace_var = 0
-        alive_var = 0
-        with fluid.dygraph.guard():
-            mlp = MLP("mlp")
-            for i in range(10):
-                var_inp = fluid.dygraph.base.to_variable(np_inp)
-                out = mlp(var_inp)
-                out.backward()
-                mlp.clear_gradients()
-                unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg(
-                    is_test=True)
-                if i > 0:
-                    self.assertGreaterEqual(unique_name, unique_name_tmp)
-                    self.assertGreaterEqual(trace_var, trace_var_tmp)
-                    self.assertGreaterEqual(alive_var, alive_var_tmp)
-                else:
-                    unique_name = unique_name_tmp
-                    trace_var = trace_var_tmp
-                    alive_var = alive_var_tmp
-                try:
-                    fluid.dygraph.base._print_debug_msg()
-                except Exception as e:
-                    raise RuntimeError(
-                        "No Exception is accepted in _print_debug_msg, but we got: {}".
-                        format(e))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
deleted file mode 100644
index f55f36c00f5f1c2c62169d5db7ca97e30ed2259d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import paddle.fluid.framework as framework
-import unittest
-from test_imperative_base import new_program_scope
-
-
-class TestTracerMode(unittest.TestCase):
-    def setUp(self):
-        self.init_mode = True
-
-    def get_tracer_mode(self):
-        assert fluid.in_dygraph_mode(), "Dygraph mode must be enabled"
-
-    @fluid.dygraph.no_grad
-    def no_grad_func(self, a):
-        self.assertEqual(self.tracer._train_mode, False)
-        return a
-
-    @fluid.dygraph.base._not_support
-    def not_support_func(self):
-        return True
-
-    def check_not_support_rlt(self, ans):
-        try:
-            rlt = self.not_support_func()
-        except AssertionError:
-            rlt = False
-        finally:
-            self.assertEqual(rlt, ans)
-
-    def test_main(self):
-        with fluid.dygraph.guard():
-            self.tracer = framework._dygraph_tracer()
-            self.tracer._train_mode = self.init_mode
-
-            self.assertEqual(self.no_grad_func(1), 1)
-
-            self.assertEqual(self.tracer._train_mode, self.init_mode)
-
-        with fluid.dygraph.guard():
-            self.check_not_support_rlt(False)
-
-        with new_program_scope():
-            self.check_not_support_rlt(True)
-
-
-class TestTracerMode2(TestTracerMode):
-    def setUp(self):
-        self.init_mode = False
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
deleted file mode 100644
index 579b073d0829435a98b01ffb7ca4be46b2a272a7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-import random
-import os
-import sys
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from test_imperative_base import new_program_scope
-from paddle.fluid.dygraph.base import to_variable
-
-# Can use Amusic dataset as the DeepCF describes.
-DATA_PATH = os.environ.get('DATA_PATH', '')
-
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 128))
-NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5))
-NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))
-
-
-class DMF(fluid.Layer):
-    def __init__(self, name_scope):
-        super(DMF, self).__init__(name_scope)
-        self._user_latent = fluid.FC(self.full_name(), 256)
-        self._item_latent = fluid.FC(self.full_name(), 256)
-
-        self._user_layers = []
-        self._item_layers = []
-        self._hid_sizes = [128, 64]
-        for i in range(len(self._hid_sizes)):
-            self._user_layers.append(
-                self.add_sublayer(
-                    'user_layer_%d' % i,
-                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
-            self._item_layers.append(
-                self.add_sublayer(
-                    'item_layer_%d' % i,
-                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
-
-    def forward(self, users, items):
-        users = self._user_latent(users)
-        items = self._item_latent(items)
-
-        for ul, il in zip(self._user_layers, self._item_layers):
-            users = ul(users)
-            items = il(items)
-        return fluid.layers.elementwise_mul(users, items)
-
-
-class MLP(fluid.Layer):
-    def __init__(self, name_scope):
-        super(MLP, self).__init__(name_scope)
-        self._user_latent = fluid.FC(self.full_name(), 256)
-        self._item_latent = fluid.FC(self.full_name(), 256)
-        self._match_layers = []
-        self._hid_sizes = [128, 64]
-        for i in range(len(self._hid_sizes)):
-            self._match_layers.append(
-                self.add_sublayer(
-                    'match_layer_%d' % i,
-                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
-
-    def forward(self, users, items):
-        users = self._user_latent(users)
-        items = self._item_latent(items)
-        match_vec = fluid.layers.concat(
-            [users, items], axis=len(users.shape) - 1)
-        for l in self._match_layers:
-            match_vec = l(match_vec)
-        return match_vec
-
-
-class DeepCF(fluid.Layer):
-    def __init__(self, name_scope, num_users, num_items, matrix):
-        super(DeepCF, self).__init__(name_scope)
-        self._num_users = num_users
-        self._num_items = num_items
-        self._rating_matrix = self.create_parameter(
-            fluid.ParamAttr(trainable=False),
-            matrix.shape,
-            matrix.dtype,
-            is_bias=False,
-            default_initializer=fluid.initializer.NumpyArrayInitializer(matrix))
-        self._rating_matrix.stop_gradient = True
-
-        self._mlp = MLP(self.full_name())
-        self._dmf = DMF(self.full_name())
-        self._match_fc = fluid.FC(self.full_name(), 1, act='sigmoid')
-
-    def forward(self, users, items):
-        # users_emb = self._user_emb(users)
-        # items_emb = self._item_emb(items)
-        users_emb = fluid.layers.gather(self._rating_matrix, users)
-        items_emb = fluid.layers.gather(
-            fluid.layers.transpose(self._rating_matrix, [1, 0]), items)
-        users_emb.stop_gradient = True
-        items_emb.stop_gradient = True
-
-        mlp_predictive = self._mlp(users_emb, items_emb)
-        dmf_predictive = self._dmf(users_emb, items_emb)
-        predictive = fluid.layers.concat(
-            [mlp_predictive, dmf_predictive],
-            axis=len(mlp_predictive.shape) - 1)
-        prediction = self._match_fc(predictive)
-        return prediction
-
-
-def get_data():
-    user_ids = []
-    item_ids = []
-    labels = []
-    NUM_USERS = 100
-    NUM_ITEMS = 1000
-    matrix = np.zeros([NUM_USERS, NUM_ITEMS], dtype=np.float32)
-
-    for uid in range(NUM_USERS):
-        for iid in range(NUM_ITEMS):
-            label = float(random.randint(1, 6) == 1)
-            user_ids.append(uid)
-            item_ids.append(iid)
-            labels.append(label)
-            matrix[uid, iid] = label
-    indices = np.arange(len(user_ids))
-    np.random.shuffle(indices)
-    users_np = np.array(user_ids, dtype=np.int32)[indices]
-    items_np = np.array(item_ids, dtype=np.int32)[indices]
-    labels_np = np.array(labels, dtype=np.float32)[indices]
-    return np.expand_dims(users_np, -1), \
-           np.expand_dims(items_np, -1), \
-           np.expand_dims(labels_np, -1), NUM_USERS, NUM_ITEMS, matrix
-
-
-def load_data(DATA_PATH):
-    sys.stderr.write('loading from %s\n' % DATA_PATH)
-    likes = dict()
-    num_users = -1
-    num_items = -1
-    with open(DATA_PATH, 'r') as f:
-        for l in f.readlines():
-            uid, iid, rating = [int(v) for v in l.split('\t')]
-            num_users = max(num_users, uid + 1)
-            num_items = max(num_items, iid + 1)
-            if float(rating) > 0.0:
-                likes[(uid, iid)] = 1.0
-
-    user_ids = []
-    item_ids = []
-    labels = []
-    matrix = np.zeros([num_users, num_items], dtype=np.float32)
-    for uid, iid in likes.keys():
-        user_ids.append(uid)
-        item_ids.append(iid)
-        labels.append(1.0)
-        matrix[uid, iid] = 1.0
-
-        negative = 0
-        while negative < 3:
-            nuid = random.randint(0, num_users - 1)
-            niid = random.randint(0, num_items - 1)
-            if (nuid, niid) not in likes:
-                negative += 1
-                user_ids.append(nuid)
-                item_ids.append(niid)
-                labels.append(0.0)
-
-    indices = np.arange(len(user_ids))
-    np.random.shuffle(indices)
-    users_np = np.array(user_ids, dtype=np.int32)[indices]
-    items_np = np.array(item_ids, dtype=np.int32)[indices]
-    labels_np = np.array(labels, dtype=np.float32)[indices]
-    return np.expand_dims(users_np, -1), \
-           np.expand_dims(items_np, -1), \
-           np.expand_dims(labels_np, -1), num_users, num_items, matrix
-
-
-class TestDygraphDeepCF(unittest.TestCase):
-    def test_deefcf(self):
-        seed = 90
-        if DATA_PATH:
-            (users_np, items_np, labels_np, num_users, num_items,
-             matrix) = load_data(DATA_PATH)
-        else:
-            (users_np, items_np, labels_np, num_users, num_items,
-             matrix) = get_data()
-
-        startup = fluid.Program()
-        startup.random_seed = seed
-        main = fluid.Program()
-        main.random_seed = seed
-
-        scope = fluid.core.Scope()
-        with new_program_scope(main=main, startup=startup, scope=scope):
-            users = fluid.layers.data('users', [1], dtype='int32')
-            items = fluid.layers.data('items', [1], dtype='int32')
-            labels = fluid.layers.data('labels', [1], dtype='float32')
-
-            deepcf = DeepCF('deepcf', num_users, num_items, matrix)
-            prediction = deepcf(users, items)
-            loss = fluid.layers.reduce_sum(
-                fluid.layers.log_loss(prediction, labels))
-            adam = fluid.optimizer.AdamOptimizer(0.01)
-            adam.minimize(loss)
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-            exe.run(startup)
-            for e in range(NUM_EPOCHES):
-                sys.stderr.write('epoch %d\n' % e)
-                for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
-                    if slice + BATCH_SIZE >= users_np.shape[0]:
-                        break
-                    static_loss = exe.run(
-                        main,
-                        feed={
-                            users.name: users_np[slice:slice + BATCH_SIZE],
-                            items.name: items_np[slice:slice + BATCH_SIZE],
-                            labels.name: labels_np[slice:slice + BATCH_SIZE]
-                        },
-                        fetch_list=[loss])[0]
-                    sys.stderr.write('static loss %s\n' % static_loss)
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            deepcf = DeepCF('deepcf', num_users, num_items, matrix)
-            adam = fluid.optimizer.AdamOptimizer(0.01)
-            for e in range(NUM_EPOCHES):
-                sys.stderr.write('epoch %d\n' % e)
-                for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
-                    if slice + BATCH_SIZE >= users_np.shape[0]:
-                        break
-                    prediction = deepcf(
-                        to_variable(users_np[slice:slice + BATCH_SIZE]),
-                        to_variable(items_np[slice:slice + BATCH_SIZE]))
-                    loss = fluid.layers.reduce_sum(
-                        fluid.layers.log_loss(prediction,
-                                              to_variable(labels_np[
-                                                  slice:slice + BATCH_SIZE])))
-                    loss.backward()
-                    adam.minimize(loss)
-                    deepcf.clear_gradients()
-                    dy_loss = loss.numpy()
-                    sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss))
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            deepcf2 = DeepCF('deepcf', num_users, num_items, matrix)
-            adam2 = fluid.optimizer.AdamOptimizer(0.01)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            for e in range(NUM_EPOCHES):
-                sys.stderr.write('epoch %d\n' % e)
-                for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
-                    if slice + BATCH_SIZE >= users_np.shape[0]:
-                        break
-                    prediction2 = deepcf2(
-                        to_variable(users_np[slice:slice + BATCH_SIZE]),
-                        to_variable(items_np[slice:slice + BATCH_SIZE]))
-                    loss2 = fluid.layers.reduce_sum(
-                        fluid.layers.log_loss(prediction2,
-                                              to_variable(labels_np[
-                                                  slice:slice + BATCH_SIZE])))
-                    loss2.backward(backward_strategy)
-                    adam2.minimize(loss2)
-                    deepcf2.clear_gradients()
-                    dy_loss2 = loss2.numpy()
-                    sys.stderr.write('dynamic loss: %s %s\n' %
-                                     (slice, dy_loss2))
-
-        self.assertEqual(static_loss, dy_loss)
-        self.assertEqual(static_loss, dy_loss2)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_framework.py b/python/paddle/fluid/tests/unittests/test_imperative_framework.py
deleted file mode 100644
index 0f83f89f7bd3876f6a9a8aedb9ca43082395f7a9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_framework.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import numpy as np
-from test_imperative_base import new_program_scope
-
-
-class MLP(fluid.Layer):
-    def __init__(self, name_scope):
-        super(MLP, self).__init__(name_scope)
-        self._fc1 = fluid.dygraph.FC(
-            self.full_name(),
-            3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = fluid.dygraph.FC(
-            self.full_name(),
-            4,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
-
-    def forward(self, inputs):
-        x = self._fc1(inputs)
-        x = self._fc2(x)
-        x = fluid.layers.reduce_sum(x)
-        return x
-
-
-class TestDygraphFramework(unittest.TestCase):
-    def test_dygraph_backward(self):
-        with new_program_scope():
-            mlp = MLP("mlp")
-            var_inp = fluid.layers.data(
-                "input", shape=[2, 2], dtype="float32", append_batch_size=False)
-            out = mlp(var_inp)
-            try:
-                out.backward()
-                raise AssertionError(
-                    "backward should not be usable in static graph mode")
-            except ValueError as e:
-                self.assertTrue((e is not None))
-
-    def test_dygraph_to_string(self):
-        np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-        with fluid.dygraph.guard():
-            var_inp = fluid.dygraph.base.to_variable(np_inp)
-            var_inp.to_string(throw_on_error=True)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
deleted file mode 100644
index 7e8cebab44eee1889327ec78f8007ed28fe38981..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import unittest
-import numpy as np
-import six
-import sys
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid import Conv2D, Pool2D, FC
-from test_imperative_base import new_program_scope
-from paddle.fluid.dygraph.base import to_variable
-
-
-class Discriminator(fluid.Layer):
-    def __init__(self, name_scope):
-        super(Discriminator, self).__init__(name_scope)
-        self._fc1 = FC(self.full_name(), size=32, act='elu')
-        self._fc2 = FC(self.full_name(), size=1)
-
-    def forward(self, inputs):
-        x = self._fc1(inputs)
-        return self._fc2(x)
-
-
-class Generator(fluid.Layer):
-    def __init__(self, name_scope):
-        super(Generator, self).__init__(name_scope)
-        self._fc1 = FC(self.full_name(), size=64, act='elu')
-        self._fc2 = FC(self.full_name(), size=64, act='elu')
-        self._fc3 = FC(self.full_name(), size=1)
-
-    def forward(self, inputs):
-        x = self._fc1(inputs)
-        x = self._fc2(x)
-        return self._fc3(x)
-
-
-class TestDygraphGAN(unittest.TestCase):
-    def test_gan_float32(self):
-        seed = 90
-
-        startup = fluid.Program()
-        startup.random_seed = seed
-        discriminate_p = fluid.Program()
-        generate_p = fluid.Program()
-        discriminate_p.random_seed = seed
-        generate_p.random_seed = seed
-
-        scope = fluid.core.Scope()
-        with new_program_scope(
-                main=discriminate_p, startup=startup, scope=scope):
-            discriminator = Discriminator("d")
-            generator = Generator("g")
-
-            img = fluid.layers.data(
-                name="img", shape=[2, 1], append_batch_size=False)
-            noise = fluid.layers.data(
-                name="noise", shape=[2, 2], append_batch_size=False)
-
-            d_real = discriminator(img)
-            d_loss_real = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_real,
-                    label=fluid.layers.fill_constant(
-                        shape=[2, 1], dtype='float32', value=1.0)))
-
-            d_fake = discriminator(generator(noise))
-            d_loss_fake = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake,
-                    label=fluid.layers.fill_constant(
-                        shape=[2, 1], dtype='float32', value=0.0)))
-
-            d_loss = d_loss_real + d_loss_fake
-
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            sgd.minimize(d_loss)
-
-        with new_program_scope(main=generate_p, startup=startup, scope=scope):
-            discriminator = Discriminator("d")
-            generator = Generator("g")
-
-            noise = fluid.layers.data(
-                name="noise", shape=[2, 2], append_batch_size=False)
-
-            d_fake = discriminator(generator(noise))
-            g_loss = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake,
-                    label=fluid.layers.fill_constant(
-                        shape=[2, 1], dtype='float32', value=1.0)))
-
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            sgd.minimize(g_loss)
-
-        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0))
-        static_params = dict()
-        with fluid.scope_guard(scope):
-            img = np.ones([2, 1], np.float32)
-            noise = np.ones([2, 2], np.float32)
-            exe.run(startup)
-            static_d_loss = exe.run(discriminate_p,
-                                    feed={'img': img,
-                                          'noise': noise},
-                                    fetch_list=[d_loss])[0]
-            static_g_loss = exe.run(generate_p,
-                                    feed={'noise': noise},
-                                    fetch_list=[g_loss])[0]
-
-            # generate_p contains all parameters needed.
-            for param in generate_p.global_block().all_parameters():
-                static_params[param.name] = np.array(
-                    scope.find_var(param.name).get_tensor())
-
-        dy_params = dict()
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            discriminator = Discriminator("d")
-            generator = Generator("g")
-            sgd = SGDOptimizer(learning_rate=1e-3)
-
-            d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
-            d_loss_real = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_real, label=to_variable(np.ones([2, 1], np.float32))))
-
-            d_fake = discriminator(
-                generator(to_variable(np.ones([2, 2], np.float32))))
-            d_loss_fake = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake, label=to_variable(np.zeros([2, 1], np.float32))))
-
-            d_loss = d_loss_real + d_loss_fake
-            d_loss.backward()
-            sgd.minimize(d_loss)
-            discriminator.clear_gradients()
-            generator.clear_gradients()
-
-            d_fake = discriminator(
-                generator(to_variable(np.ones([2, 2], np.float32))))
-            g_loss = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake, label=to_variable(np.ones([2, 1], np.float32))))
-            g_loss.backward()
-            sgd.minimize(g_loss)
-            for p in discriminator.parameters():
-                dy_params[p.name] = p.numpy()
-            for p in generator.parameters():
-                dy_params[p.name] = p.numpy()
-
-            dy_g_loss = g_loss.numpy()
-            dy_d_loss = d_loss.numpy()
-
-        dy_params2 = dict()
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            discriminator2 = Discriminator("d")
-            generator2 = Generator("g")
-            sgd2 = SGDOptimizer(learning_rate=1e-3)
-
-            d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
-            d_loss_real2 = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_real2, label=to_variable(np.ones([2, 1], np.float32))))
-
-            d_fake2 = discriminator2(
-                generator2(to_variable(np.ones([2, 2], np.float32))))
-            d_loss_fake2 = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake2, label=to_variable(np.zeros([2, 1], np.float32))))
-
-            d_loss2 = d_loss_real2 + d_loss_fake2
-            d_loss2.backward(backward_strategy)
-            sgd2.minimize(d_loss2)
-            discriminator2.clear_gradients()
-            generator2.clear_gradients()
-
-            d_fake2 = discriminator2(
-                generator2(to_variable(np.ones([2, 2], np.float32))))
-            g_loss2 = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake2, label=to_variable(np.ones([2, 1], np.float32))))
-            g_loss2.backward(backward_strategy)
-            sgd2.minimize(g_loss2)
-            for p in discriminator2.parameters():
-                dy_params2[p.name] = p.numpy()
-            for p in generator.parameters():
-                dy_params2[p.name] = p.numpy()
-
-            dy_g_loss2 = g_loss2.numpy()
-            dy_d_loss2 = d_loss2.numpy()
-
-        self.assertEqual(dy_g_loss, static_g_loss)
-        self.assertEqual(dy_d_loss, static_d_loss)
-        for k, v in six.iteritems(dy_params):
-            self.assertTrue(np.allclose(v, static_params[k]))
-
-        self.assertEqual(dy_g_loss2, static_g_loss)
-        self.assertEqual(dy_d_loss2, static_d_loss)
-        for k, v in six.iteritems(dy_params2):
-            self.assertTrue(np.allclose(v, static_params[k]))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
deleted file mode 100644
index 8531eda86978302f4014e11577f7055f1ef156b6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import unittest
-import numpy as np
-import sys
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.optimizer import AdamOptimizer
-from test_imperative_base import new_program_scope
-from paddle.fluid.dygraph.base import to_variable
-
-
-def gen_data():
-    pass
-
-
-class GraphConv(fluid.Layer):
-    def __init__(self, name_scope, in_features, out_features):
-        super(GraphConv, self).__init__(name_scope)
-
-        self._in_features = in_features
-        self._out_features = out_features
-        self.weight = self.create_parameter(
-            attr=None,
-            dtype='float32',
-            shape=[self._in_features, self._out_features])
-        self.bias = self.create_parameter(
-            attr=None, dtype='float32', shape=[self._out_features])
-
-    def forward(self, features, adj):
-        support = fluid.layers.matmul(features, self.weight)
-        # TODO(panyx0718): sparse matmul?
-        return fluid.layers.matmul(adj, support) + self.bias
-
-
-class GCN(fluid.Layer):
-    def __init__(self, name_scope, num_hidden):
-        super(GCN, self).__init__(name_scope)
-        self.gc = GraphConv(self.full_name(), num_hidden, 32)
-        self.gc2 = GraphConv(self.full_name(), 32, 10)
-
-    def forward(self, x, adj):
-        x = fluid.layers.relu(self.gc(x, adj))
-        return self.gc2(x, adj)
-
-
-class TestDygraphGNN(unittest.TestCase):
-    def test_gnn_float32(self):
-        seed = 90
-
-        startup = fluid.Program()
-        startup.random_seed = seed
-        main = fluid.Program()
-        main.random_seed = seed
-
-        scope = fluid.core.Scope()
-        with new_program_scope(main=main, startup=startup, scope=scope):
-            features = fluid.layers.data(
-                name='features',
-                shape=[1, 100, 50],
-                dtype='float32',
-                append_batch_size=False)
-            # Use selected rows when it's supported.
-            adj = fluid.layers.data(
-                name='adj',
-                shape=[1, 100, 100],
-                dtype='float32',
-                append_batch_size=False)
-            labels = fluid.layers.data(
-                name='labels',
-                shape=[100, 1],
-                dtype='int64',
-                append_batch_size=False)
-
-            model = GCN('test_gcn', 50)
-            logits = model(features, adj)
-            logits = fluid.layers.reshape(logits, logits.shape[1:])
-            # In other example, it's nll with log_softmax. However, paddle's
-            # log_loss only supports binary classification now.
-            loss = fluid.layers.softmax_with_cross_entropy(logits, labels)
-            loss = fluid.layers.reduce_sum(loss)
-
-            adam = AdamOptimizer(learning_rate=1e-3)
-            adam.minimize(loss)
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-            exe.run(startup)
-            static_loss = exe.run(feed={
-                'features': np.ones(
-                    [1, 100, 50], dtype=np.float32),
-                'adj': np.ones(
-                    [1, 100, 100], dtype=np.float32),
-                'labels': np.ones(
-                    [100, 1], dtype=np.int64)
-            },
-                                  fetch_list=[loss])[0]
-
-            static_weight = np.array(
-                scope.find_var(model.gc.weight.name).get_tensor())
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            features = np.ones([1, 100, 50], dtype=np.float32)
-            # Use selected rows when it's supported.
-            adj = np.ones([1, 100, 100], dtype=np.float32)
-            labels = np.ones([100, 1], dtype=np.int64)
-
-            model = GCN('test_gcn', 50)
-            logits = model(to_variable(features), to_variable(adj))
-            logits = fluid.layers.reshape(logits, logits.shape[1:])
-            # In other example, it's nll with log_softmax. However, paddle's
-            # log_loss only supports binary classification now.
-            loss = fluid.layers.softmax_with_cross_entropy(logits,
-                                                           to_variable(labels))
-            loss = fluid.layers.reduce_sum(loss)
-            loss.backward()
-            adam = AdamOptimizer(learning_rate=1e-3)
-
-            adam.minimize(loss)
-            model.clear_gradients()
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            features2 = np.ones([1, 100, 50], dtype=np.float32)
-            # Use selected rows when it's supported.
-            adj2 = np.ones([1, 100, 100], dtype=np.float32)
-            labels2 = np.ones([100, 1], dtype=np.int64)
-
-            model2 = GCN('test_gcn', 50)
-            logits2 = model2(to_variable(features2), to_variable(adj2))
-            logits2 = fluid.layers.reshape(logits2, logits2.shape[1:])
-            # In other example, it's nll with log_softmax. However, paddle's
-            # log_loss only supports binary classification now.
-            loss2 = fluid.layers.softmax_with_cross_entropy(
-                logits2, to_variable(labels2))
-            loss2 = fluid.layers.reduce_sum(loss2)
-            loss2.backward()
-            adam2 = AdamOptimizer(learning_rate=1e-3)
-            adam2.minimize(loss2)
-            model2.clear_gradients()
-
-        self.assertEqual(static_loss, loss.numpy())
-        self.assertTrue(np.allclose(static_weight, model.gc.weight.numpy()))
-        self.assertEqual(static_loss, loss2.numpy())
-        self.assertTrue(np.allclose(static_weight, model2.gc.weight.numpy()))
-        sys.stderr.write('%s %s\n' % (static_loss, loss.numpy()))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
deleted file mode 100644
index c3a12addfc8ef6e743a34bcdae9237a994b2d178..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import contextlib
-import unittest
-import numpy as np
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-
-
-class SimpleImgConvPool(fluid.dygraph.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 pool_size,
-                 pool_stride,
-                 pool_padding=0,
-                 pool_type='max',
-                 global_pooling=False,
-                 conv_stride=1,
-                 conv_padding=0,
-                 conv_dilation=1,
-                 conv_groups=1,
-                 act=None,
-                 use_cudnn=False,
-                 param_attr=None,
-                 bias_attr=None):
-        super(SimpleImgConvPool, self).__init__(name_scope)
-
-        self._conv2d = Conv2D(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=None,
-            bias_attr=None,
-            use_cudnn=use_cudnn)
-
-        self._pool2d = Pool2D(
-            self.full_name(),
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
-
-    def forward(self, inputs):
-        x = self._conv2d(inputs)
-        x = self._pool2d(x)
-        return x
-
-
-class MNIST(fluid.dygraph.Layer):
-    def __init__(self, name_scope):
-        super(MNIST, self).__init__(name_scope)
-
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 20, 5, 2, 2, act="relu")
-
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 50, 5, 2, 2, act="relu")
-
-        pool_2_shape = 50 * 4 * 4
-        SIZE = 10
-        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(self.full_name(),
-                      10,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.NormalInitializer(
-                              loc=0.0, scale=scale)),
-                      act="softmax")
-
-    def forward(self, inputs):
-        x = self._simple_img_conv_pool_1(inputs)
-        x = self._simple_img_conv_pool_2(x)
-        x = self._fc(x)
-        return x
-
-
-class TestImperativeMnist(unittest.TestCase):
-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                image = np.array(item[0]).reshape(1, 28, 28)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield image, label
-
-        return _reader_imple
-
-    def test_mnist_float32(self):
-        seed = 90
-        epoch_num = 1
-        batch_size = 128
-        batch_num = 50
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            mnist = MNIST("mnist")
-            sgd = SGDOptimizer(learning_rate=1e-3)
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(paddle.dataset.mnist.train()),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
-
-            mnist.train()
-            dy_param_init_value = {}
-            for epoch in range(epoch_num):
-                for batch_id, data in enumerate(batch_py_reader()):
-                    if batch_id >= batch_num:
-                        break
-                    img = data[0]
-                    dy_x_data = img.numpy()
-                    label = data[1]
-                    label.stop_gradient = True
-
-                    cost = mnist(img)
-                    loss = fluid.layers.cross_entropy(cost, label)
-                    avg_loss = fluid.layers.mean(loss)
-
-                    dy_out = avg_loss.numpy()
-
-                    if epoch == 0 and batch_id == 0:
-                        for param in mnist.parameters():
-                            dy_param_init_value[param.name] = param.numpy()
-
-                    avg_loss.backward()
-                    sgd.minimize(avg_loss)
-                    mnist.clear_gradients()
-
-                    dy_param_value = {}
-                    for param in mnist.parameters():
-                        dy_param_value[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            mnist = MNIST("mnist")
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(),
-                batch_size=batch_size,
-                drop_last=True)
-
-            img = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            cost = mnist(img)
-            loss = fluid.layers.cross_entropy(cost, label)
-            avg_loss = fluid.layers.mean(loss)
-            sgd.minimize(avg_loss)
-
-            # initialize params and fetch them
-            static_param_init_value = {}
-            static_param_name_list = []
-            for param in mnist.parameters():
-                static_param_name_list.append(param.name)
-
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-
-            for i in range(len(static_param_name_list)):
-                static_param_init_value[static_param_name_list[i]] = out[i]
-
-            for epoch in range(epoch_num):
-                for batch_id, data in enumerate(train_reader()):
-                    if batch_id >= batch_num:
-                        break
-                    static_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(
-                            [batch_size, 1])
-
-                    fetch_list = [avg_loss.name]
-                    fetch_list.extend(static_param_name_list)
-                    out = exe.run(
-                        fluid.default_main_program(),
-                        feed={"pixel": static_x_data,
-                              "label": y_data},
-                        fetch_list=fetch_list)
-
-                    static_param_value = {}
-                    static_out = out[0]
-                    for i in range(1, len(out)):
-                        static_param_value[static_param_name_list[i - 1]] = out[
-                            i]
-
-        self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
-
-        for key, value in six.iteritems(static_param_init_value):
-            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
-
-        self.assertTrue(np.allclose(static_out, dy_out))
-
-        for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
deleted file mode 100644
index 0f5eb52e22ae5d397bc6321258bbb1dc672f4865..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import contextlib
-import unittest
-import numpy as np
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-from test_imperative_mnist import MNIST
-
-
-class TestImperativeMnistSortGradient(unittest.TestCase):
-    def test_mnist_sort_gradient_float32(self):
-        seed = 90
-        epoch_num = 1
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-
-            mnist2 = MNIST("mnist")
-            sgd2 = SGDOptimizer(learning_rate=1e-3)
-            train_reader2 = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
-
-            mnist2.train()
-            dy_param_init_value2 = {}
-            for epoch in range(epoch_num):
-                for batch_id, data in enumerate(train_reader2()):
-                    dy_x_data2 = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data2 = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(128, 1)
-
-                    img2 = to_variable(dy_x_data2)
-                    label2 = to_variable(y_data2)
-                    label2.stop_gradient = True
-
-                    cost2 = mnist2(img2)
-                    loss2 = fluid.layers.cross_entropy(cost2, label2)
-                    avg_loss2 = fluid.layers.mean(loss2)
-
-                    dy_out2 = avg_loss2.numpy()
-
-                    if epoch == 0 and batch_id == 0:
-                        for param in mnist2.parameters():
-                            dy_param_init_value2[param.name] = param.numpy()
-
-                    avg_loss2.backward(backward_strategy)
-                    sgd2.minimize(avg_loss2)
-                    mnist2.clear_gradients()
-
-                    dy_param_value2 = {}
-                    for param in mnist2.parameters():
-                        dy_param_value2[param.name] = param.numpy()
-                    if batch_id == 20:
-                        break
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            mnist = MNIST("mnist")
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
-
-            img = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            cost = mnist(img)
-            loss = fluid.layers.cross_entropy(cost, label)
-            avg_loss = fluid.layers.mean(loss)
-            sgd.minimize(avg_loss)
-
-            # initialize params and fetch them
-            static_param_init_value = {}
-            static_param_name_list = []
-            for param in mnist.parameters():
-                static_param_name_list.append(param.name)
-
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-
-            for i in range(len(static_param_name_list)):
-                static_param_init_value[static_param_name_list[i]] = out[i]
-
-            for epoch in range(epoch_num):
-                for batch_id, data in enumerate(train_reader()):
-                    static_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape([128, 1])
-
-                    fetch_list = [avg_loss.name]
-                    fetch_list.extend(static_param_name_list)
-                    out = exe.run(
-                        fluid.default_main_program(),
-                        feed={"pixel": static_x_data,
-                              "label": y_data},
-                        fetch_list=fetch_list)
-
-                    static_param_value = {}
-                    static_out = out[0]
-                    for i in range(1, len(out)):
-                        static_param_value[static_param_name_list[i - 1]] = out[
-                            i]
-                    if batch_id == 20:
-                        break
-
-        self.assertTrue(np.allclose(dy_x_data2.all(), static_x_data.all()))
-
-        for key, value in six.iteritems(static_param_init_value):
-            self.assertTrue(np.allclose(value, dy_param_init_value2[key]))
-
-        self.assertTrue(np.allclose(static_out, dy_out2))
-
-        for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value2[key], atol=1e-5))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
deleted file mode 100644
index 22bd2e55d28342b81475fd2832a2a407382e2d32..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
+++ /dev/null
@@ -1,584 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import six
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm, Embedding, GRUUnit
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-
-
-class Config(object):
-    '''
-    config for training
-    '''
-    # decoder size for decoder stage
-    decoder_size = 128
-    # size for word embedding
-    word_vector_dim = 128
-    # max length for label padding
-    max_length = 5
-    # optimizer setting
-    LR = 1.0
-    learning_rate_decay = None
-
-    # batch size to train
-    batch_size = 16
-    # class number to classify
-    num_classes = 481
-
-    use_gpu = False
-    # special label for start and end
-    SOS = 0
-    EOS = 1
-    # settings for ctc data, not use in unittest
-    DATA_DIR_NAME = "./dataset/ctc_data/data"
-    TRAIN_DATA_DIR_NAME = "train_images"
-    TRAIN_LIST_FILE_NAME = "train.list"
-
-    # data shape for input image
-    DATA_SHAPE = [1, 48, 384]
-
-
-class ConvBNPool(fluid.dygraph.Layer):
-    def __init__(self,
-                 name_scope,
-                 group,
-                 out_ch,
-                 channels,
-                 act="relu",
-                 is_test=False,
-                 pool=True,
-                 use_cudnn=True):
-        super(ConvBNPool, self).__init__(name_scope)
-        self.group = group
-        self.pool = pool
-
-        filter_size = 3
-        conv_std_0 = (2.0 / (filter_size**2 * channels[0]))**0.5
-        conv_param_0 = fluid.ParamAttr(
-            initializer=fluid.initializer.Normal(0.0, conv_std_0))
-
-        conv_std_1 = (2.0 / (filter_size**2 * channels[1]))**0.5
-        conv_param_1 = fluid.ParamAttr(
-            initializer=fluid.initializer.Normal(0.0, conv_std_1))
-
-        self.conv_0_layer = Conv2D(
-            self.full_name(),
-            out_ch[0],
-            3,
-            padding=1,
-            param_attr=conv_param_0,
-            bias_attr=False,
-            act=None,
-            use_cudnn=use_cudnn)
-        self.bn_0_layer = BatchNorm(
-            self.full_name(), out_ch[0], act=act, is_test=is_test)
-        self.conv_1_layer = Conv2D(
-            self.full_name(),
-            num_filters=out_ch[1],
-            filter_size=3,
-            padding=1,
-            param_attr=conv_param_1,
-            bias_attr=False,
-            act=None,
-            use_cudnn=use_cudnn)
-        self.bn_1_layer = BatchNorm(
-            self.full_name(), out_ch[1], act=act, is_test=is_test)
-
-        if self.pool:
-            self.pool_layer = Pool2D(
-                self.full_name(),
-                pool_size=2,
-                pool_type='max',
-                pool_stride=2,
-                use_cudnn=use_cudnn,
-                ceil_mode=True)
-
-    def forward(self, inputs):
-        conv_0 = self.conv_0_layer(inputs)
-        bn_0 = self.bn_0_layer(conv_0)
-        conv_1 = self.conv_1_layer(bn_0)
-        bn_1 = self.bn_1_layer(conv_1)
-        if self.pool:
-            bn_pool = self.pool_layer(bn_1)
-            return bn_pool
-        return bn_1
-
-
-class OCRConv(fluid.dygraph.Layer):
-    def __init__(self, name_scope, is_test=False, use_cudnn=True):
-        super(OCRConv, self).__init__(name_scope)
-        self.conv_bn_pool_1 = ConvBNPool(
-            self.full_name(),
-            2, [16, 16], [1, 16],
-            is_test=is_test,
-            use_cudnn=use_cudnn)
-        self.conv_bn_pool_2 = ConvBNPool(
-            self.full_name(),
-            2, [32, 32], [16, 32],
-            is_test=is_test,
-            use_cudnn=use_cudnn)
-        self.conv_bn_pool_3 = ConvBNPool(
-            self.full_name(),
-            2, [64, 64], [32, 64],
-            is_test=is_test,
-            use_cudnn=use_cudnn)
-        self.conv_bn_pool_4 = ConvBNPool(
-            self.full_name(),
-            2, [128, 128], [64, 128],
-            is_test=is_test,
-            pool=False,
-            use_cudnn=use_cudnn)
-
-    def forward(self, inputs):
-        inputs_1 = self.conv_bn_pool_1(inputs)
-        inputs_2 = self.conv_bn_pool_2(inputs_1)
-        inputs_3 = self.conv_bn_pool_3(inputs_2)
-        inputs_4 = self.conv_bn_pool_4(inputs_3)
-
-        return inputs_4
-
-
-class DynamicGRU(fluid.dygraph.Layer):
-    def __init__(self,
-                 scope_name,
-                 size,
-                 param_attr=None,
-                 bias_attr=None,
-                 is_reverse=False,
-                 gate_activation='sigmoid',
-                 candidate_activation='tanh',
-                 h_0=None,
-                 origin_mode=False):
-        super(DynamicGRU, self).__init__(scope_name)
-
-        self.gru_unit = GRUUnit(
-            self.full_name(),
-            size * 3,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            activation=candidate_activation,
-            gate_activation=gate_activation,
-            origin_mode=origin_mode)
-
-        self.h_0 = h_0
-        self.is_reverse = is_reverse
-
-    def forward(self, inputs):
-        hidden = self.h_0
-        res = []
-        for i in range(inputs.shape[1]):
-            if self.is_reverse:
-                i = inputs.shape[1] - 1 - i
-            input_ = fluid.layers.slice(
-                inputs, axes=[1], starts=[i], ends=[i + 1])
-            input_ = fluid.layers.reshape(
-                input_, [-1, input_.shape[2]], inplace=False)
-            hidden, reset, gate = self.gru_unit(input_, hidden)
-            hidden_ = fluid.layers.reshape(
-                hidden, [-1, 1, hidden.shape[1]], inplace=False)
-            if self.is_reverse:
-                res = [hidden_] + res
-            else:
-                res.append(hidden_)
-        res = fluid.layers.concat(res, axis=1)
-        return res
-
-
-class EncoderNet(fluid.dygraph.Layer):
-    def __init__(self,
-                 scope_name,
-                 rnn_hidden_size=200,
-                 is_test=False,
-                 use_cudnn=True):
-        super(EncoderNet, self).__init__(scope_name)
-        self.rnn_hidden_size = rnn_hidden_size
-        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0,
-                                                                         0.02))
-        bias_attr = fluid.ParamAttr(
-            initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
-        if fluid.framework.in_dygraph_mode():
-            h_0 = np.zeros(
-                (Config.batch_size, rnn_hidden_size), dtype="float32")
-            h_0 = to_variable(h_0)
-        else:
-            h_0 = fluid.layers.fill_constant(
-                shape=[Config.batch_size, rnn_hidden_size],
-                dtype='float32',
-                value=0)
-        self.ocr_convs = OCRConv(
-            self.full_name(), is_test=is_test, use_cudnn=use_cudnn)
-
-        self.fc_1_layer = FC(self.full_name(),
-                             rnn_hidden_size * 3,
-                             param_attr=para_attr,
-                             bias_attr=False,
-                             num_flatten_dims=2)
-        self.fc_2_layer = FC(self.full_name(),
-                             rnn_hidden_size * 3,
-                             param_attr=para_attr,
-                             bias_attr=False,
-                             num_flatten_dims=2)
-        self.gru_forward_layer = DynamicGRU(
-            self.full_name(),
-            size=rnn_hidden_size,
-            h_0=h_0,
-            param_attr=para_attr,
-            bias_attr=bias_attr,
-            candidate_activation='relu')
-        self.gru_backward_layer = DynamicGRU(
-            self.full_name(),
-            size=rnn_hidden_size,
-            h_0=h_0,
-            param_attr=para_attr,
-            bias_attr=bias_attr,
-            candidate_activation='relu',
-            is_reverse=True)
-
-        self.encoded_proj_fc = FC(self.full_name(),
-                                  Config.decoder_size,
-                                  bias_attr=False,
-                                  num_flatten_dims=2)
-
-    def forward(self, inputs):
-        conv_features = self.ocr_convs(inputs)
-        #sliced_feature = fluid.layers.im2sequence(
-        #    input=conv_features,
-        #    stride=[1, 1],
-        #    filter_size=[conv_features.shape[2], 1])
-
-        transpose_conv_features = fluid.layers.transpose(
-            conv_features, perm=[0, 3, 1, 2])
-
-        sliced_feature = fluid.layers.reshape(
-            transpose_conv_features, [
-                -1, 48, transpose_conv_features.shape[2] *
-                transpose_conv_features.shape[3]
-            ],
-            inplace=False)
-        fc_1 = self.fc_1_layer(sliced_feature)
-        fc_2 = self.fc_2_layer(sliced_feature)
-        gru_forward = self.gru_forward_layer(fc_1)
-
-        gru_backward = self.gru_backward_layer(fc_2)
-
-        encoded_vector = fluid.layers.concat(
-            input=[gru_forward, gru_backward], axis=2)
-
-        encoded_proj = self.encoded_proj_fc(encoded_vector)
-
-        return gru_backward, encoded_vector, encoded_proj
-
-
-class SimpleAttention(fluid.dygraph.Layer):
-    def __init__(self, scope_name, decoder_size):
-        super(SimpleAttention, self).__init__(scope_name)
-
-        self.fc_1 = FC(self.full_name(),
-                       decoder_size,
-                       act=None,
-                       bias_attr=False)
-        self.fc_2 = FC(self.full_name(),
-                       1,
-                       num_flatten_dims=2,
-                       act=None,
-                       bias_attr=False)
-
-    def forward(self, encoder_vec, encoder_proj, decoder_state):
-
-        decoder_state_fc = self.fc_1(decoder_state)
-        decoder_state_proj_reshape = fluid.layers.reshape(
-            decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]], inplace=False)
-        decoder_state_expand = fluid.layers.expand(
-            decoder_state_proj_reshape, [1, encoder_proj.shape[1], 1])
-        concated = fluid.layers.elementwise_add(encoder_proj,
-                                                decoder_state_expand)
-        concated = fluid.layers.tanh(x=concated)
-        attention_weight = self.fc_2(concated)
-
-        weights_reshape = fluid.layers.reshape(
-            x=attention_weight,
-            shape=[attention_weight.shape[0], attention_weight.shape[1]],
-            inplace=False)
-
-        weights_reshape = fluid.layers.softmax(weights_reshape)
-        scaled = fluid.layers.elementwise_mul(
-            x=encoder_vec, y=weights_reshape, axis=0)
-        context = fluid.layers.reduce_sum(scaled, dim=1)
-
-        return context
-
-
-class GRUDecoderWithAttention(fluid.dygraph.Layer):
-    def __init__(self, scope_name, decoder_size, num_classes):
-        super(GRUDecoderWithAttention, self).__init__(scope_name)
-        self.simple_attention = SimpleAttention(self.full_name(), decoder_size)
-
-        self.fc_1_layer = FC(self.full_name(),
-                             size=decoder_size * 3,
-                             bias_attr=False)
-        self.fc_2_layer = FC(self.full_name(),
-                             size=decoder_size * 3,
-                             bias_attr=False)
-        self.gru_unit = GRUUnit(
-            self.full_name(),
-            size=decoder_size * 3,
-            param_attr=None,
-            bias_attr=None)
-        self.out_layer = FC(self.full_name(),
-                            size=num_classes + 2,
-                            bias_attr=None,
-                            act='softmax')
-
-        self.decoder_size = decoder_size
-
-    def forward(self, target_embedding, encoder_vec, encoder_proj,
-                decoder_boot):
-        res = []
-        hidden_mem = decoder_boot
-        for i in range(target_embedding.shape[1]):
-            current_word = fluid.layers.slice(
-                target_embedding, axes=[1], starts=[i], ends=[i + 1])
-            current_word = fluid.layers.reshape(
-                current_word, [-1, current_word.shape[2]], inplace=False)
-
-            context = self.simple_attention(encoder_vec, encoder_proj,
-                                            hidden_mem)
-            fc_1 = self.fc_1_layer(context)
-            fc_2 = self.fc_2_layer(current_word)
-            decoder_inputs = fluid.layers.elementwise_add(x=fc_1, y=fc_2)
-
-            h, _, _ = self.gru_unit(decoder_inputs, hidden_mem)
-            hidden_mem = h
-            out = self.out_layer(h)
-            res.append(out)
-
-        res1 = fluid.layers.concat(res, axis=1)
-
-        return res1
-
-
-class OCRAttention(fluid.dygraph.Layer):
-    def __init__(self, scope_name):
-        super(OCRAttention, self).__init__(scope_name)
-        self.encoder_net = EncoderNet(self.full_name())
-        self.fc = FC(self.full_name(),
-                     size=Config.decoder_size,
-                     bias_attr=False,
-                     act='relu')
-        self.embedding = Embedding(
-            self.full_name(), [Config.num_classes + 2, Config.word_vector_dim],
-            dtype='float32')
-        self.gru_decoder_with_attention = GRUDecoderWithAttention(
-            self.full_name(), Config.decoder_size, Config.num_classes)
-
-    def forward(self, inputs, label_in):
-        gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
-        backward_first = fluid.layers.slice(
-            gru_backward, axes=[1], starts=[0], ends=[1])
-        backward_first = fluid.layers.reshape(
-            backward_first, [-1, backward_first.shape[2]], inplace=False)
-        decoder_boot = self.fc(backward_first)
-        label_in = fluid.layers.reshape(label_in, [-1, 1], inplace=False)
-        trg_embedding = self.embedding(label_in)
-
-        trg_embedding = fluid.layers.reshape(
-            trg_embedding, [-1, Config.max_length, trg_embedding.shape[1]],
-            inplace=False)
-
-        prediction = self.gru_decoder_with_attention(
-            trg_embedding, encoded_vector, encoded_proj, decoder_boot)
-
-        return prediction
-
-
-class TestDygraphOCRAttention(unittest.TestCase):
-    def test_while_op(self):
-        seed = 90
-        epoch_num = 2
-        if core.is_compiled_with_cuda():
-            batch_num = 20
-        else:
-            print("in CPU")
-            batch_num = 2
-        np.random.seed = seed
-        image_np = np.random.randn(Config.batch_size, Config.DATA_SHAPE[0],
-                                   Config.DATA_SHAPE[1],
-                                   Config.DATA_SHAPE[2]).astype('float32')
-        label_in_np = np.arange(
-            0, Config.max_length,
-            dtype='int64').reshape([1, Config.max_length])
-        for i in range(2, Config.batch_size + 1):
-            label_in_np = np.vstack((label_in_np, np.arange(
-                (i - 1) * Config.max_length,
-                i * Config.max_length,
-                dtype='int64').reshape([1, Config.max_length])))
-
-        label_out_np = np.arange(
-            0, Config.max_length,
-            dtype='int64').reshape([1, Config.max_length])
-        for i in range(2, Config.batch_size + 1):
-            label_out_np = np.vstack((label_out_np, np.arange(
-                (i - 1) * Config.max_length,
-                i * Config.max_length,
-                dtype='int64').reshape([1, Config.max_length])))
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            ocr_attention = OCRAttention("ocr_attention")
-
-            if Config.learning_rate_decay == "piecewise_decay":
-                learning_rate = fluid.layers.piecewise_decay(
-                    [50000], [Config.LR, Config.LR * 0.01])
-            else:
-                learning_rate = Config.LR
-            optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-            dy_param_init_value = {}
-            for param in ocr_attention.parameters():
-                dy_param_init_value[param.name] = param.numpy()
-            for epoch in range(epoch_num):
-                for batch_id in range(batch_num):
-                    label_in = to_variable(label_in_np)
-                    label_out = to_variable(label_out_np)
-                    label_out._stop_gradient = True
-                    label_out.trainable = False
-                    img = to_variable(image_np)
-                    dy_prediction = ocr_attention(img, label_in)
-                    label_out = fluid.layers.reshape(
-                        label_out, [-1, 1], inplace=False)
-                    dy_prediction = fluid.layers.reshape(
-                        dy_prediction, [label_out.shape[0], -1], inplace=False)
-                    loss = fluid.layers.cross_entropy(
-                        input=dy_prediction, label=label_out)
-                    avg_loss = fluid.layers.reduce_sum(loss)
-
-                    dy_out = avg_loss.numpy()
-
-                    if epoch == 0 and batch_id == 0:
-                        for param in ocr_attention.parameters():
-                            if param.name not in dy_param_init_value:
-                                dy_param_init_value[param.name] = param.numpy()
-                    avg_loss.backward(backward_strategy)
-                    dy_grad_value = {}
-                    for param in ocr_attention.parameters():
-                        if param.trainable:
-                            np_array = np.array(param._ivar._grad_ivar().value()
-                                                .get_tensor())
-                            dy_grad_value[param.name + core.grad_var_suffix(
-                            )] = np_array
-
-                    optimizer.minimize(avg_loss)
-                    ocr_attention.clear_gradients()
-                    dy_param_value = {}
-                    for param in ocr_attention.parameters():
-                        dy_param_value[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            # print("static start")
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-            ocr_attention = OCRAttention("ocr_attention")
-
-            if Config.learning_rate_decay == "piecewise_decay":
-                learning_rate = fluid.layers.piecewise_decay(
-                    [50000], [Config.LR, Config.LR * 0.01])
-            else:
-                learning_rate = Config.LR
-
-            optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-
-            images = fluid.layers.data(
-                name='pixel', shape=Config.DATA_SHAPE, dtype='float32')
-            static_label_in = fluid.layers.data(
-                name='label_in', shape=[1], dtype='int64', lod_level=0)
-            static_label_out = fluid.layers.data(
-                name='label_out', shape=[1], dtype='int64', lod_level=0)
-            static_label_out._stop_gradient = True
-            static_label_out.trainable = False
-
-            static_prediction = ocr_attention(images, static_label_in)
-
-            static_prediction = fluid.layers.reshape(
-                static_prediction, shape=[-1, Config.num_classes + 2])
-
-            cost = fluid.layers.cross_entropy(
-                input=static_prediction, label=static_label_out)
-            static_avg_loss = fluid.layers.reduce_sum(cost)
-            # param_grad_list = fluid.backward.append_backward(static_avg_loss)
-            optimizer.minimize(static_avg_loss)
-
-            static_param_init_value = {}
-            static_param_name_list = []
-            static_grad_name_list = []
-            for param in ocr_attention.parameters():
-                static_param_name_list.append(param.name)
-                if param.trainable:
-                    static_grad_name_list.append(param.name +
-                                                 core.grad_var_suffix())
-
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-
-            for i in range(len(static_param_name_list)):
-                static_param_init_value[static_param_name_list[i]] = out[i]
-
-            fetch_list = [static_avg_loss.name]
-            fetch_list.extend(static_param_name_list)
-            fetch_list.extend(static_grad_name_list)
-            for epoch in range(epoch_num):
-                for batch_id in range(batch_num):
-                    static_label_in = label_in_np
-                    static_label_out = label_out_np
-                    static_label_out = static_label_out.reshape((-1, 1))
-                    out = exe.run(fluid.default_main_program(),
-                                  feed={
-                                      "pixel": image_np,
-                                      "label_in": static_label_in,
-                                      "label_out": static_label_out
-                                  },
-                                  fetch_list=fetch_list)
-                    static_param_value = {}
-                    static_grad_value = {}
-                    static_out = out[0]
-                    # static_test_grad = out[1]
-                    for i in range(1, len(static_param_name_list) + 1):
-                        static_param_value[static_param_name_list[i - 1]] = out[
-                            i]
-                    grad_start_pos = len(static_param_name_list) + 1
-                    for i in range(grad_start_pos,
-                                   len(static_grad_name_list) + grad_start_pos):
-                        static_grad_value[static_grad_name_list[
-                            i - grad_start_pos]] = out[i]
-
-        self.assertTrue(np.array_equal(static_out, dy_out))
-
-        for key, value in six.iteritems(static_param_init_value):
-            self.assertTrue(np.array_equal(value, dy_param_init_value[key]))
-
-        for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-20))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
deleted file mode 100644
index a7c39f7ff2ad8e9dedc99bb37fc0f997853da572..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import contextlib
-import unittest
-import numpy as np
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.optimizer import SGDOptimizer, Adam
-from paddle.fluid.dygraph.nn import FC
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-
-
-class MLP(fluid.Layer):
-    def __init__(self, name_scope, param_attr=None, bias_attr=None):
-        super(MLP, self).__init__(name_scope)
-
-        self._fc1 = FC(self.full_name(), 10)
-        self._fc2 = FC(self.full_name(), 10)
-
-    def forward(self, inputs):
-        y = self._fc1(inputs)
-        y = self._fc2(y)
-        return y
-
-
-class TestImperativeOptimizerBase(unittest.TestCase):
-    def setUp(self):
-        self.batch_num = 20
-
-    def get_optimizer(self):
-        raise NotImplementedError()
-
-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                image = np.array(item[0]).reshape(1, 28, 28)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield image, label
-
-        return _reader_imple
-
-    def _check_mlp(self):
-        seed = 90
-        batch_size = 128
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            mlp = MLP('mlp')
-            optimizer = self.get_optimizer()
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(paddle.dataset.mnist.train()),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
-
-            dy_param_init_value = {}
-            for batch_id, data in enumerate(batch_py_reader()):
-                if batch_id >= self.batch_num:
-                    break
-
-                img = data[0]
-                label = data[1]
-                label._stop_gradient = True
-
-                cost = mlp(img)
-                avg_loss = fluid.layers.reduce_mean(cost)
-                dy_out = avg_loss.numpy()
-
-                if batch_id == 0:
-                    for param in mlp.parameters():
-                        dy_param_init_value[param.name] = param.numpy()
-
-                avg_loss.backward()
-                optimizer.minimize(avg_loss)
-                mlp.clear_gradients()
-                dy_param_value = {}
-                for param in mlp.parameters():
-                    dy_param_value[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            mlp = MLP('mlp')
-            optimizer = self.get_optimizer()
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
-
-            img = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            cost = mlp(img)
-            avg_loss = fluid.layers.reduce_mean(cost)
-            optimizer.minimize(avg_loss)
-
-            # initialize params and fetch them
-            static_param_init_value = {}
-            static_param_name_list = []
-            for param in mlp.parameters():
-                static_param_name_list.append(param.name)
-
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-
-            for i in range(len(static_param_name_list)):
-                static_param_init_value[static_param_name_list[i]] = out[i]
-
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= self.batch_num:
-                    break
-
-                static_x_data = np.array(
-                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [128, 1])
-
-                fetch_list = [avg_loss.name]
-                fetch_list.extend(static_param_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
-                              fetch_list=fetch_list)
-
-                static_param_value = {}
-                static_out = out[0]
-                for i in range(1, len(out)):
-                    static_param_value[static_param_name_list[i - 1]] = out[i]
-
-        for key, value in six.iteritems(static_param_init_value):
-            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
-
-        self.assertTrue(np.allclose(static_out, dy_out))
-
-        for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key]))
-
-
-class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        bd = [3, 6, 9]
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
-            boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
-            learning_rate=0.1,
-            decay_steps=10000,
-            decay_rate=0.5,
-            staircase=True))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.1,
-            decay_steps=10000,
-            decay_rate=0.5,
-            staircase=True))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
-            learning_rate=0.1,
-            decay_steps=10000,
-            decay_rate=0.5,
-            staircase=True))
-        return optimizer
-
-    def test_adam(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
-            learning_rate=0.1, decay_steps=5, cycle=self.cycle))
-        return optimizer
-
-    def test_sgd_cycle(self):
-        self.cycle = True
-        self._check_mlp()
-
-    def test_sgd(self):
-        self.cycle = False
-        self._check_mlp()
-
-
-class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
-            learning_rate=0.1, step_each_epoch=10000, epochs=120))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
-            d_model=512, warmup_steps=8000))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py b/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
deleted file mode 100644
index c6a2ad9e3d5ce79298160bdca2506c989f356ce0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import numpy as np
-
-
-class TestImperativePartitialBackward(unittest.TestCase):
-    def test_partitial_backward(self):
-        with fluid.dygraph.guard():
-            x = np.random.randn(2, 4, 5).astype("float32")
-            x = fluid.dygraph.to_variable(x)
-            fc1 = fluid.dygraph.FC("fc1", 10, num_flatten_dims=2)
-            fc2 = fluid.dygraph.FC("fc2", 10, num_flatten_dims=2)
-
-            y = fc1(x[:, :2])
-            z = fc2(x[:, 2:])
-            loss = fluid.layers.reduce_mean(y)
-            loss.backward()
-
-            for param in fc1.parameters():
-                self.assertIsNotNone(param._ivar._grad_ivar())
-
-            for param in fc2.parameters():
-                self.assertIsNone(param._ivar._grad_ivar())
-
-            optimizer = fluid.optimizer.AdamOptimizer()
-            _, params_grads = optimizer.minimize(loss)
-
-            self.assertListEqual(
-                sorted([p.name for p in fc1.parameters()]),
-                sorted([p_g[0].name for p_g in params_grads]))
-
-            fc1.clear_gradients()
-            fc2.clear_gradients()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
deleted file mode 100644
index 5f6c5b1cb6a5a641b23dbbd82b98c78313efb1ca..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ /dev/null
@@ -1,345 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.dygraph.nn import Embedding
-import paddle.fluid.framework as framework
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-import numpy as np
-import six
-
-
-class SimpleLSTMRNN(fluid.Layer):
-    def __init__(self,
-                 name_scope,
-                 hidden_size,
-                 num_steps,
-                 num_layers=2,
-                 init_scale=0.1,
-                 dropout=None):
-        super(SimpleLSTMRNN, self).__init__(name_scope)
-        self._hidden_size = hidden_size
-        self._num_layers = num_layers
-        self._init_scale = init_scale
-        self._dropout = dropout
-        self._input = None
-        self._num_steps = num_steps
-        self.cell_array = []
-        self.hidden_array = []
-
-    def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
-        self.weight_1_arr = []
-        self.weight_2_arr = []
-        self.bias_arr = []
-        self.mask_array = []
-
-        for i in range(self._num_layers):
-            weight_1 = self.create_parameter(
-                attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.UniformInitializer(
-                        low=-self._init_scale, high=self._init_scale)),
-                shape=[self._hidden_size * 2, self._hidden_size * 4],
-                dtype="float32",
-                default_initializer=fluid.initializer.UniformInitializer(
-                    low=-self._init_scale, high=self._init_scale))
-            self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
-            bias_1 = self.create_parameter(
-                attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.UniformInitializer(
-                        low=-self._init_scale, high=self._init_scale)),
-                shape=[self._hidden_size * 4],
-                dtype="float32",
-                default_initializer=fluid.initializer.Constant(0.0))
-            self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
-
-    def forward(self, input_embedding, init_hidden=None, init_cell=None):
-        self.cell_array = []
-        self.hidden_array = []
-
-        for i in range(self._num_layers):
-            pre_hidden = fluid.layers.slice(
-                init_hidden, axes=[0], starts=[i], ends=[i + 1])
-            pre_cell = fluid.layers.slice(
-                init_cell, axes=[0], starts=[i], ends=[i + 1])
-            pre_hidden = fluid.layers.reshape(
-                pre_hidden, shape=[-1, self._hidden_size])
-            pre_cell = fluid.layers.reshape(
-                pre_cell, shape=[-1, self._hidden_size])
-            self.hidden_array.append(pre_hidden)
-            self.cell_array.append(pre_cell)
-
-        res = []
-        for index in range(self._num_steps):
-            self._input = fluid.layers.slice(
-                input_embedding, axes=[1], starts=[index], ends=[index + 1])
-            self._input = fluid.layers.reshape(
-                self._input, shape=[-1, self._hidden_size])
-            for k in range(self._num_layers):
-                pre_hidden = self.hidden_array[k]
-                pre_cell = self.cell_array[k]
-                weight_1 = self.weight_1_arr[k]
-                bias = self.bias_arr[k]
-
-                nn = fluid.layers.concat([self._input, pre_hidden], 1)
-                gate_input = fluid.layers.matmul(x=nn, y=weight_1)
-
-                gate_input = fluid.layers.elementwise_add(gate_input, bias)
-                i, j, f, o = fluid.layers.split(
-                    gate_input, num_or_sections=4, dim=-1)
-                c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
-                    i) * fluid.layers.tanh(j)
-                m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
-                self.hidden_array[k] = m
-                self.cell_array[k] = c
-                self._input = m
-
-                if self._dropout is not None and self._dropout > 0.0:
-                    self._input = fluid.layers.dropout(
-                        self._input,
-                        dropout_prob=self._dropout,
-                        dropout_implementation='upscale_in_train')
-            res.append(
-                fluid.layers.reshape(
-                    self._input, shape=[1, -1, self._hidden_size]))
-        real_res = fluid.layers.concat(res, 0)
-        real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2])
-        last_hidden = fluid.layers.concat(self.hidden_array, 1)
-        last_hidden = fluid.layers.reshape(
-            last_hidden, shape=[-1, self._num_layers, self._hidden_size])
-        last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2])
-        last_cell = fluid.layers.concat(self.cell_array, 1)
-        last_cell = fluid.layers.reshape(
-            last_cell, shape=[-1, self._num_layers, self._hidden_size])
-        last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2])
-        return real_res, last_hidden, last_cell
-
-
-class PtbModel(fluid.Layer):
-    def __init__(self,
-                 name_scope,
-                 hidden_size,
-                 vocab_size,
-                 num_layers=2,
-                 num_steps=20,
-                 init_scale=0.1,
-                 dropout=None):
-        super(PtbModel, self).__init__(name_scope)
-        self.hidden_size = hidden_size
-        self.vocab_size = vocab_size
-        self.init_scale = init_scale
-        self.num_layers = num_layers
-        self.num_steps = num_steps
-        self.dropout = dropout
-        self.simple_lstm_rnn = SimpleLSTMRNN(
-            self.full_name(),
-            hidden_size,
-            num_steps,
-            num_layers=num_layers,
-            init_scale=init_scale,
-            dropout=dropout)
-        self.embedding = Embedding(
-            self.full_name(),
-            size=[vocab_size, hidden_size],
-            dtype='float32',
-            is_sparse=False,
-            param_attr=fluid.ParamAttr(
-                name='embedding_para',
-                initializer=fluid.initializer.UniformInitializer(
-                    low=-init_scale, high=init_scale)))
-        self.softmax_weight = self.create_parameter(
-            attr=fluid.ParamAttr(),
-            shape=[self.hidden_size, self.vocab_size],
-            dtype="float32",
-            default_initializer=fluid.initializer.UniformInitializer(
-                low=-self.init_scale, high=self.init_scale))
-        self.softmax_bias = self.create_parameter(
-            attr=fluid.ParamAttr(),
-            shape=[self.vocab_size],
-            dtype="float32",
-            default_initializer=fluid.initializer.UniformInitializer(
-                low=-self.init_scale, high=self.init_scale))
-
-    def forward(self, input, label, init_hidden, init_cell):
-        init_h = fluid.layers.reshape(
-            init_hidden, shape=[self.num_layers, -1, self.hidden_size])
-
-        init_c = fluid.layers.reshape(
-            init_cell, shape=[self.num_layers, -1, self.hidden_size])
-
-        x_emb = self.embedding(input)
-        x_emb = fluid.layers.reshape(
-            x_emb, shape=[-1, self.num_steps, self.hidden_size])
-        if self.dropout is not None and self.dropout > 0.0:
-            x_emb = fluid.layers.dropout(
-                x_emb,
-                dropout_prob=self.drop_out,
-                dropout_implementation='upscale_in_train')
-        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
-                                                               init_c)
-        rnn_out = fluid.layers.reshape(
-            rnn_out, shape=[-1, self.num_steps, self.hidden_size])
-        projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
-        projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
-        projection = fluid.layers.reshape(
-            projection, shape=[-1, self.vocab_size])
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=label, soft_label=False)
-        loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
-        loss = fluid.layers.reduce_mean(loss, dim=[0])
-        loss = fluid.layers.reduce_sum(loss)
-        loss.permissions = True
-
-        return loss, last_hidden, last_cell
-
-
-class TestDygraphPtbRnn(unittest.TestCase):
-    def test_ptb_rnn_cpu_float32(self):
-        seed = 90
-        hidden_size = 10
-        vocab_size = 1000
-        num_layers = 1
-        num_steps = 3
-        init_scale = 0.1
-        batch_size = 4
-        batch_num = 200
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
-
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            dy_param_updated = dict()
-            dy_param_init = dict()
-            dy_loss = None
-            last_hidden = None
-            last_cell = None
-
-            for i in range(batch_num):
-                x_data = np.arange(12).reshape(4, 3).astype('int64')
-                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
-                x_data = x_data.reshape((-1, num_steps, 1))
-                y_data = y_data.reshape((-1, 1))
-                init_hidden_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
-                x = to_variable(x_data)
-                y = to_variable(y_data)
-                init_hidden = to_variable(init_hidden_data)
-                init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
-                if i == 0:
-                    for param in ptb_model.parameters():
-                        dy_param_init[param.name] = param.numpy()
-                dy_loss.backward()
-                sgd.minimize(dy_loss)
-                ptb_model.clear_gradients()
-                if i == batch_num - 1:
-                    for param in ptb_model.parameters():
-                        dy_param_updated[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps, 1], dtype='int64')
-            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
-
-            static_loss, static_last_hidden, static_last_cell = ptb_model(
-                x, y, init_hidden, init_cell)
-            sgd.minimize(static_loss)
-            static_param_updated = dict()
-            static_param_init = dict()
-            static_param_name_list = list()
-            for param in ptb_model.parameters():
-                static_param_name_list.append(param.name)
-
-            out = exe.run(framework.default_startup_program(),
-                          fetch_list=static_param_name_list)
-            for i in range(len(static_param_name_list)):
-                static_param_init[static_param_name_list[i]] = out[i]
-            static_loss_value = None
-            static_last_cell_value = None
-            static_last_hidden_value = None
-            for i in range(batch_num):
-                x_data = np.arange(12).reshape(4, 3).astype('int64')
-                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
-                x_data = x_data.reshape((-1, num_steps, 1))
-                y_data = y_data.reshape((-1, 1))
-                init_hidden_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
-                fetch_list = [static_loss, static_last_hidden, static_last_cell]
-                fetch_list.extend(static_param_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed={
-                                  "x": x_data,
-                                  "y": y_data,
-                                  "init_hidden": init_hidden_data,
-                                  "init_cell": init_cell_data
-                              },
-                              fetch_list=fetch_list)
-                static_loss_value = out[0]
-                static_last_hidden_value = out[1]
-                static_last_cell_value = out[2]
-
-                if i == batch_num - 1:
-                    for k in range(3, len(out)):
-                        static_param_updated[static_param_name_list[k -
-                                                                    3]] = out[k]
-
-        self.assertTrue(np.array_equal(static_loss_value, dy_loss.numpy()))
-        self.assertTrue(
-            np.array_equal(static_last_cell_value, last_cell.numpy()))
-        self.assertTrue(
-            np.array_equal(static_last_hidden_value, last_hidden.numpy()))
-        for key, value in six.iteritems(static_param_init):
-            self.assertTrue(np.array_equal(value, dy_param_init[key]))
-        for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
deleted file mode 100644
index d3beed7b00773e4be7037afb33b363f9d6e81f4d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
+++ /dev/null
@@ -1,165 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.dygraph.nn import Embedding
-import paddle.fluid.framework as framework
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-from test_imperative_ptb_rnn import PtbModel
-import numpy as np
-import six
-
-
-class TestDygraphPtbRnnSortGradient(unittest.TestCase):
-    def test_ptb_rnn_sort_gradient_cpu_float32(self):
-        seed = 90
-        hidden_size = 10
-        vocab_size = 1000
-        num_layers = 1
-        num_steps = 3
-        init_scale = 0.1
-        batch_size = 4
-        batch_num = 200
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
-
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            dy_param_updated = dict()
-            dy_param_init = dict()
-            dy_loss = None
-            last_hidden = None
-            last_cell = None
-
-            for i in range(batch_num):
-                x_data = np.arange(12).reshape(4, 3).astype('int64')
-                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
-                x_data = x_data.reshape((-1, num_steps, 1))
-                y_data = y_data.reshape((-1, 1))
-                init_hidden_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
-                x = to_variable(x_data)
-                y = to_variable(y_data)
-                init_hidden = to_variable(init_hidden_data)
-                init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
-                if i == 0:
-                    for param in ptb_model.parameters():
-                        dy_param_init[param.name] = param.numpy()
-                dy_loss.backward(backward_strategy)
-                sgd.minimize(dy_loss)
-                ptb_model.clear_gradients()
-                if i == batch_num - 1:
-                    for param in ptb_model.parameters():
-                        dy_param_updated[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps, 1], dtype='int64')
-            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
-
-            static_loss, static_last_hidden, static_last_cell = ptb_model(
-                x, y, init_hidden, init_cell)
-            sgd.minimize(static_loss)
-            static_param_updated = dict()
-            static_param_init = dict()
-            static_param_name_list = list()
-            for param in ptb_model.parameters():
-                static_param_name_list.append(param.name)
-
-            out = exe.run(framework.default_startup_program(),
-                          fetch_list=static_param_name_list)
-            for i in range(len(static_param_name_list)):
-                static_param_init[static_param_name_list[i]] = out[i]
-            static_loss_value = None
-            static_last_cell_value = None
-            static_last_hidden_value = None
-            for i in range(batch_num):
-                x_data = np.arange(12).reshape(4, 3).astype('int64')
-                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
-                x_data = x_data.reshape((-1, num_steps, 1))
-                y_data = y_data.reshape((-1, 1))
-                init_hidden_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
-                fetch_list = [static_loss, static_last_hidden, static_last_cell]
-                fetch_list.extend(static_param_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed={
-                                  "x": x_data,
-                                  "y": y_data,
-                                  "init_hidden": init_hidden_data,
-                                  "init_cell": init_cell_data
-                              },
-                              fetch_list=fetch_list)
-                static_loss_value = out[0]
-                static_last_hidden_value = out[1]
-                static_last_cell_value = out[2]
-
-                if i == batch_num - 1:
-                    for k in range(3, len(out)):
-                        static_param_updated[static_param_name_list[k -
-                                                                    3]] = out[k]
-
-        self.assertTrue(np.array_equal(static_loss_value, dy_loss.numpy()))
-        self.assertTrue(
-            np.array_equal(static_last_cell_value, last_cell.numpy()))
-        self.assertTrue(
-            np.array_equal(static_last_hidden_value, last_hidden.numpy()))
-        for key, value in six.iteritems(static_param_init):
-            self.assertTrue(np.array_equal(value, dy_param_init[key]))
-        for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py
deleted file mode 100644
index 268e24fa5ebe3ce717e33ff9a996640adc8db6d2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.dygraph.nn import Embedding
-import paddle.fluid.framework as framework
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-import numpy as np
-import six
-
-
-class RecurrentTest(fluid.Layer):
-    def __init__(self, name_scope):
-        super(RecurrentTest, self).__init__(name_scope)
-
-    def forward(self, in1, in2):
-        out = fluid.layers.mul(in1, in2)
-        sum_out = fluid.layers.reduce_sum(out)
-        return sum_out, out
-
-
-class TestRecurrentFeed(unittest.TestCase):
-    def test_recurrent_feed(self):
-
-        seed = 90
-        original_np1 = np.arange(1, 5).reshape(2, 2).astype("float32")
-        original_np2 = np.arange(5, 9).reshape(2, 2).astype("float32")
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            original_in1 = to_variable(original_np1)
-            original_in2 = to_variable(original_np2)
-            original_in1.stop_gradient = False
-            original_in2.stop_gradient = False
-            rt = RecurrentTest("RecurrentTest")
-
-            for i in range(3):
-                sum_out, out = rt(original_in1, original_in2)
-                original_in1 = out
-                sum_out_value = sum_out.numpy()
-                sum_out.backward()
-                dyout = out.gradient()
-                rt.clear_gradients()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            in1 = fluid.layers.data(
-                name="inp1", shape=[2, 2], append_batch_size=False)
-            in2 = fluid.layers.data(
-                name="inp2", shape=[2, 2], append_batch_size=False)
-            rt1 = RecurrentTest("RecurrentTest")
-            static_sum_out, static_out = rt1(in1, in2)
-            fluid.backward.append_backward(static_sum_out)
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            static_dout = fluid.default_main_program().block(
-                0)._find_var_recursive(static_out.name + "@GRAD")
-            fetch_list = [static_sum_out, static_out, static_dout]
-            for i in range(3):
-                out = exe.run(
-                    fluid.default_main_program(),
-                    feed={"inp1": original_np1,
-                          "inp2": original_np2},
-                    fetch_list=fetch_list)
-                static_out_value = out[1]
-                static_sum_out = out[0]
-                static_dout = out[2]
-                original_np1 = static_out_value
-
-        self.assertTrue(np.array_equal(static_sum_out, sum_out_value))
-        self.assertTrue(np.array_equal(static_dout, dyout))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
deleted file mode 100644
index 36f6daeb37fda17feb71d8a5205884ce47d9d612..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import contextlib
-import unittest
-import numpy as np
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
-import paddle.fluid.dygraph.nn as nn
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-
-
-class Policy(fluid.dygraph.Layer):
-    def __init__(self, name_scope):
-        super(Policy, self).__init__(name_scope)
-
-        self.affine1 = nn.FC(self.full_name(), size=128)
-        self.affine2 = nn.FC(self.full_name(), size=2)
-        self.dropout_ratio = 0.6
-
-        self.saved_log_probs = []
-        self.rewards = []
-
-    def forward(self, inputs):
-        x = fluid.layers.reshape(inputs, shape=[-1, 4])
-        x = self.affine1(x)
-        x = fluid.layers.dropout(x, self.dropout_ratio)
-        x = fluid.layers.relu(x)
-        action_scores = self.affine2(x)
-        return fluid.layers.softmax(action_scores, axis=1)
-
-
-class TestImperativeMnist(unittest.TestCase):
-    def test_mnist_float32(self):
-        seed = 90
-        epoch_num = 1
-
-        state = np.random.normal(size=4).astype("float32")
-        state_list = state.tolist()
-        reward = np.random.random(size=[1, 1]).astype("float32")
-        reward_list = reward.tolist()
-        action_list = [1]
-        action = np.array(action_list).astype("float32")
-        mask_list = [[0, 1]]
-        mask = np.array(mask_list).astype("float32")
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            policy = Policy("PolicyModel")
-
-            dy_state = fluid.dygraph.base.to_variable(state)
-            dy_state.stop_gradient = True
-            loss_probs = policy(dy_state)
-
-            dy_mask = fluid.dygraph.base.to_variable(mask)
-            dy_mask.stop_gradient = True
-
-            loss_probs = fluid.layers.log(loss_probs)
-            loss_probs = fluid.layers.elementwise_mul(loss_probs, dy_mask)
-            loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)
-
-            dy_reward = fluid.dygraph.base.to_variable(reward)
-            dy_reward.stop_gradient = True
-
-            loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
-            loss = fluid.layers.reduce_sum(loss_probs)
-
-            sgd = SGDOptimizer(learning_rate=1e-3)
-
-            dy_param_init_value = {}
-
-            dy_out = loss.numpy()
-
-            for param in policy.parameters():
-                dy_param_init_value[param.name] = param.numpy()
-
-            loss.backward()
-            sgd.minimize(loss)
-            policy.clear_gradients()
-
-            dy_param_value = {}
-            for param in policy.parameters():
-                dy_param_value[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            policy = Policy("PolicyModel")
-
-            st_sgd = SGDOptimizer(learning_rate=1e-3)
-
-            st_state = fluid.layers.data(
-                name='st_state', shape=[4], dtype='float32')
-            st_reward = fluid.layers.data(
-                name='st_reward', shape=[1], dtype='float32')
-            st_mask = fluid.layers.data(
-                name='st_mask', shape=[2], dtype='float32')
-
-            st_loss_probs = policy(st_state)
-
-            st_loss_probs = fluid.layers.log(st_loss_probs)
-            st_loss_probs = fluid.layers.elementwise_mul(st_loss_probs, st_mask)
-            st_loss_probs = fluid.layers.reduce_sum(st_loss_probs, dim=-1)
-
-            st_loss_probs = fluid.layers.elementwise_mul(st_reward,
-                                                         st_loss_probs)
-            st_loss = fluid.layers.reduce_sum(st_loss_probs)
-
-            st_sgd.minimize(st_loss)
-
-            # initialize params and fetch them
-            static_param_init_value = {}
-            static_param_name_list = []
-            for param in policy.parameters():
-                static_param_name_list.append(param.name)
-
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-
-            for i in range(len(static_param_name_list)):
-                static_param_init_value[static_param_name_list[i]] = out[i]
-
-            fetch_list = [st_loss.name]
-            fetch_list.extend(static_param_name_list)
-
-            out = exe.run(
-                fluid.default_main_program(),
-                feed={"st_state": state,
-                      "st_reward": reward,
-                      "st_mask": mask},
-                fetch_list=fetch_list)
-
-            static_param_value = {}
-            static_out = out[0]
-            for i in range(1, len(out)):
-                static_param_value[static_param_name_list[i - 1]] = out[i]
-
-        #self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
-
-        for key, value in six.iteritems(static_param_init_value):
-            self.assertTrue(np.equal(value, dy_param_init_value[key]).all())
-
-        self.assertTrue(np.equal(static_out, dy_out).all())
-
-        for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.equal(value, dy_param_value[key]).all())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
deleted file mode 100644
index 9eab5abc06c96423a99855910009d85cab089f89..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import unittest
-import numpy as np
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid import Conv2D, Pool2D, BatchNorm, FC
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-
-batch_size = 8
-train_parameters = {
-    "input_size": [3, 224, 224],
-    "input_mean": [0.485, 0.456, 0.406],
-    "input_std": [0.229, 0.224, 0.225],
-    "learning_strategy": {
-        "name": "piecewise_decay",
-        "batch_size": batch_size,
-        "epochs": [30, 60, 90],
-        "steps": [0.1, 0.01, 0.001, 0.0001]
-    },
-    "batch_size": batch_size,
-    "lr": 0.1,
-    "total_images": 1281164,
-}
-
-
-def optimizer_setting(params):
-    ls = params["learning_strategy"]
-    if ls["name"] == "piecewise_decay":
-        if "total_images" not in params:
-            total_images = 1281167
-        else:
-            total_images = params["total_images"]
-        batch_size = ls["batch_size"]
-        step = int(total_images / batch_size + 1)
-
-        bd = [step * e for e in ls["epochs"]]
-        base_lr = params["lr"]
-        lr = []
-        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-        # TODO(minqiyang): Add learning rate scheduler support to dygraph mode
-        #  optimizer = fluid.optimizer.Momentum(
-    #  learning_rate=params["lr"],
-    #  learning_rate=fluid.layers.piecewise_decay(
-    #  boundaries=bd, values=lr),
-    #  momentum=0.9,
-    #  regularization=fluid.regularizer.L2Decay(1e-4))
-
-    return optimizer
-
-
-class ConvBNLayer(fluid.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 stride=1,
-                 groups=1,
-                 act=None):
-        super(ConvBNLayer, self).__init__(name_scope)
-
-        self._conv = Conv2D(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=None)
-
-        self._batch_norm = BatchNorm(self.full_name(), num_filters, act=act)
-
-    def forward(self, inputs):
-        y = self._conv(inputs)
-        y = self._batch_norm(y)
-
-        return y
-
-
-class BottleneckBlock(fluid.Layer):
-    def __init__(self, name_scope, num_filters, stride, shortcut=True):
-        super(BottleneckBlock, self).__init__(name_scope)
-
-        self.conv0 = ConvBNLayer(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=1,
-            act='relu')
-        self.conv1 = ConvBNLayer(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            act='relu')
-        self.conv2 = ConvBNLayer(
-            self.full_name(),
-            num_filters=num_filters * 4,
-            filter_size=1,
-            act=None)
-
-        if not shortcut:
-            self.short = ConvBNLayer(
-                self.full_name(),
-                num_filters=num_filters * 4,
-                filter_size=1,
-                stride=stride)
-
-        self.shortcut = shortcut
-
-    def forward(self, inputs):
-        y = self.conv0(inputs)
-        conv1 = self.conv1(y)
-        conv2 = self.conv2(conv1)
-
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
-
-        y = fluid.layers.elementwise_add(x=short, y=conv2)
-
-        layer_helper = LayerHelper(self.full_name(), act='relu')
-        return layer_helper.append_activation(y)
-
-
-class ResNet(fluid.Layer):
-    def __init__(self, name_scope, layers=50, class_dim=102):
-        super(ResNet, self).__init__(name_scope)
-
-        self.layers = layers
-        supported_layers = [50, 101, 152]
-        assert layers in supported_layers, \
-            "supported layers are {} but input layer is {}".format(supported_layers, layers)
-
-        if layers == 50:
-            depth = [3, 4, 6, 3]
-        elif layers == 101:
-            depth = [3, 4, 23, 3]
-        elif layers == 152:
-            depth = [3, 8, 36, 3]
-        num_filters = [64, 128, 256, 512]
-
-        self.conv = ConvBNLayer(
-            self.full_name(),
-            num_filters=64,
-            filter_size=7,
-            stride=2,
-            act='relu')
-        self.pool2d_max = Pool2D(
-            self.full_name(),
-            pool_size=3,
-            pool_stride=2,
-            pool_padding=1,
-            pool_type='max')
-
-        self.bottleneck_block_list = []
-        for block in range(len(depth)):
-            shortcut = False
-            for i in range(depth[block]):
-                bottleneck_block = self.add_sublayer(
-                    'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        self.full_name(),
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        shortcut=shortcut))
-                self.bottleneck_block_list.append(bottleneck_block)
-                shortcut = True
-
-        self.pool2d_avg = Pool2D(
-            self.full_name(), pool_size=7, pool_type='avg', global_pooling=True)
-
-        import math
-        stdv = 1.0 / math.sqrt(2048 * 1.0)
-
-        self.out = FC(self.full_name(),
-                      size=class_dim,
-                      act='softmax',
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.Uniform(-stdv, stdv)))
-
-    def forward(self, inputs):
-        y = self.conv(inputs)
-        y = self.pool2d_max(y)
-        for bottleneck_block in self.bottleneck_block_list:
-            y = bottleneck_block(y)
-        y = self.pool2d_avg(y)
-        y = self.out(y)
-        return y
-
-
-class TestDygraphResnet(unittest.TestCase):
-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                doc = np.array(item[0]).reshape(3, 224, 224)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield doc, label
-
-        return _reader_imple
-
-    def test_resnet_float32(self):
-        seed = 90
-
-        batch_size = train_parameters["batch_size"]
-        batch_num = 10
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            resnet = ResNet("resnet")
-            optimizer = optimizer_setting(train_parameters)
-            np.random.seed(seed)
-            import random
-            random.seed = seed
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(
-                        paddle.dataset.flowers.train(use_xmap=False)),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
-
-            dy_param_init_value = {}
-            for param in resnet.parameters():
-                dy_param_init_value[param.name] = param.numpy()
-
-            for batch_id, data in enumerate(batch_py_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                img = data[0]
-                label = data[1]
-                label.stop_gradient = True
-
-                out = resnet(img)
-                loss = fluid.layers.cross_entropy(input=out, label=label)
-                avg_loss = fluid.layers.mean(x=loss)
-
-                dy_out = avg_loss.numpy()
-
-                if batch_id == 0:
-                    for param in resnet.parameters():
-                        if param.name not in dy_param_init_value:
-                            dy_param_init_value[param.name] = param.numpy()
-
-                avg_loss.backward()
-
-                dy_grad_value = {}
-                for param in resnet.parameters():
-                    if param.trainable:
-                        np_array = np.array(param._ivar._grad_ivar().value()
-                                            .get_tensor())
-                        dy_grad_value[param.name + core.grad_var_suffix(
-                        )] = np_array
-
-                optimizer.minimize(avg_loss)
-                resnet.clear_gradients()
-
-                dy_param_value = {}
-                for param in resnet.parameters():
-                    dy_param_value[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            resnet = ResNet("resnet")
-            optimizer = optimizer_setting(train_parameters)
-
-            np.random.seed(seed)
-            import random
-            random.seed = seed
-            train_reader = paddle.batch(
-                paddle.dataset.flowers.train(use_xmap=False),
-                batch_size=batch_size)
-
-            img = fluid.layers.data(
-                name='pixel', shape=[3, 224, 224], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            out = resnet(img)
-            loss = fluid.layers.cross_entropy(input=out, label=label)
-            avg_loss = fluid.layers.mean(x=loss)
-            optimizer.minimize(avg_loss)
-
-            # initialize params and fetch them
-            static_param_init_value = {}
-            static_param_name_list = []
-            static_grad_name_list = []
-            for param in resnet.parameters():
-                static_param_name_list.append(param.name)
-            for param in resnet.parameters():
-                if param.trainable:
-                    static_grad_name_list.append(param.name +
-                                                 core.grad_var_suffix())
-
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-
-            for i in range(len(static_param_name_list)):
-                static_param_init_value[static_param_name_list[i]] = out[i]
-
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                static_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [batch_size, 1])
-
-                fetch_list = [avg_loss.name]
-                fetch_list.extend(static_param_name_list)
-                fetch_list.extend(static_grad_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
-                              fetch_list=fetch_list)
-
-                static_param_value = {}
-                static_grad_value = {}
-                static_out = out[0]
-                param_start_pos = 1
-                grad_start_pos = len(static_param_name_list) + param_start_pos
-                for i in range(param_start_pos,
-                               len(static_param_name_list) + param_start_pos):
-                    static_param_value[static_param_name_list[
-                        i - param_start_pos]] = out[i]
-                for i in range(grad_start_pos,
-                               len(static_grad_name_list) + grad_start_pos):
-                    static_grad_value[static_grad_name_list[
-                        i - grad_start_pos]] = out[i]
-
-        self.assertTrue(np.allclose(static_out, dy_out))
-
-        self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
-
-        for key, value in six.iteritems(static_param_init_value):
-            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
-
-        self.assertEqual(len(dy_grad_value), len(static_grad_value))
-        for key, value in six.iteritems(static_grad_value):
-            self.assertTrue(np.allclose(value, dy_grad_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
-
-        self.assertEqual(len(dy_param_value), len(static_param_value))
-        for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
deleted file mode 100644
index 74560535074f2550429de79385415162987dab7d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-from test_imperative_resnet import ResNet
-
-batch_size = 8
-train_parameters = {
-    "input_size": [3, 224, 224],
-    "input_mean": [0.485, 0.456, 0.406],
-    "input_std": [0.229, 0.224, 0.225],
-    "learning_strategy": {
-        "name": "piecewise_decay",
-        "batch_size": batch_size,
-        "epochs": [30, 60, 90],
-        "steps": [0.1, 0.01, 0.001, 0.0001]
-    },
-    "batch_size": batch_size,
-    "lr": 0.1,
-    "total_images": 1281164,
-}
-
-
-def optimizer_setting(params):
-    ls = params["learning_strategy"]
-    if ls["name"] == "piecewise_decay":
-        if "total_images" not in params:
-            total_images = 1281167
-        else:
-            total_images = params["total_images"]
-        batch_size = ls["batch_size"]
-        step = int(total_images / batch_size + 1)
-
-        bd = [step * e for e in ls["epochs"]]
-        base_lr = params["lr"]
-        lr = []
-        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-        # TODO(minqiyang): Add learning rate scheduler support to dygraph mode
-        #  optimizer = fluid.optimizer.Momentum(
-    #  learning_rate=params["lr"],
-    #  learning_rate=fluid.layers.piecewise_decay(
-    #  boundaries=bd, values=lr),
-    #  momentum=0.9,
-    #  regularization=fluid.regularizer.L2Decay(1e-4))
-
-    return optimizer
-
-
-class TestDygraphResnetSortGradient(unittest.TestCase):
-    def test_resnet_sort_gradient_float32(self):
-        seed = 90
-
-        batch_size = train_parameters["batch_size"]
-        batch_num = 10
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            resnet = ResNet("resnet")
-            optimizer = optimizer_setting(train_parameters)
-            np.random.seed(seed)
-            import random
-            random.seed = seed
-            train_reader = paddle.batch(
-                paddle.dataset.flowers.train(use_xmap=False),
-                batch_size=batch_size)
-
-            dy_param_init_value = {}
-            for param in resnet.parameters():
-                dy_param_init_value[param.name] = param.numpy()
-
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                dy_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    batch_size, 1)
-
-                img = to_variable(dy_x_data)
-                label = to_variable(y_data)
-                label.stop_gradient = True
-
-                out = resnet(img)
-                loss = fluid.layers.cross_entropy(input=out, label=label)
-                avg_loss = fluid.layers.mean(x=loss)
-
-                dy_out = avg_loss.numpy()
-
-                if batch_id == 0:
-                    for param in resnet.parameters():
-                        if param.name not in dy_param_init_value:
-                            dy_param_init_value[param.name] = param.numpy()
-
-                avg_loss.backward(backward_strategy)
-
-                dy_grad_value = {}
-                for param in resnet.parameters():
-                    if param.trainable:
-                        np_array = np.array(param._ivar._grad_ivar().value()
-                                            .get_tensor())
-                        dy_grad_value[param.name + core.grad_var_suffix(
-                        )] = np_array
-
-                optimizer.minimize(avg_loss)
-                resnet.clear_gradients()
-
-                dy_param_value = {}
-                for param in resnet.parameters():
-                    dy_param_value[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            resnet = ResNet("resnet")
-            optimizer = optimizer_setting(train_parameters)
-
-            np.random.seed(seed)
-            import random
-            random.seed = seed
-            train_reader = paddle.batch(
-                paddle.dataset.flowers.train(use_xmap=False),
-                batch_size=batch_size)
-
-            img = fluid.layers.data(
-                name='pixel', shape=[3, 224, 224], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            out = resnet(img)
-            loss = fluid.layers.cross_entropy(input=out, label=label)
-            avg_loss = fluid.layers.mean(x=loss)
-            optimizer.minimize(avg_loss)
-
-            # initialize params and fetch them
-            static_param_init_value = {}
-            static_param_name_list = []
-            static_grad_name_list = []
-            for param in resnet.parameters():
-                static_param_name_list.append(param.name)
-            for param in resnet.parameters():
-                if param.trainable:
-                    static_grad_name_list.append(param.name +
-                                                 core.grad_var_suffix())
-
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-
-            for i in range(len(static_param_name_list)):
-                static_param_init_value[static_param_name_list[i]] = out[i]
-
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                static_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [batch_size, 1])
-
-                fetch_list = [avg_loss.name]
-                fetch_list.extend(static_param_name_list)
-                fetch_list.extend(static_grad_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
-                              fetch_list=fetch_list)
-
-                static_param_value = {}
-                static_grad_value = {}
-                static_out = out[0]
-                param_start_pos = 1
-                grad_start_pos = len(static_param_name_list) + param_start_pos
-                for i in range(param_start_pos,
-                               len(static_param_name_list) + param_start_pos):
-                    static_param_value[static_param_name_list[
-                        i - param_start_pos]] = out[i]
-                for i in range(grad_start_pos,
-                               len(static_grad_name_list) + grad_start_pos):
-                    static_grad_value[static_grad_name_list[
-                        i - grad_start_pos]] = out[i]
-
-        self.assertTrue(np.allclose(static_out, dy_out))
-
-        self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
-
-        for key, value in six.iteritems(static_param_init_value):
-            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
-
-        self.assertEqual(len(dy_grad_value), len(static_grad_value))
-        for key, value in six.iteritems(static_grad_value):
-            self.assertTrue(np.allclose(value, dy_grad_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
-
-        self.assertEqual(len(dy_param_value), len(static_param_value))
-        for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_optimizer.py
deleted file mode 100644
index a65db433e66d4833bc5ccf284a1083e89489fb46..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_optimizer.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.optimizer import SGDOptimizer, Adam
-from paddle.fluid.dygraph.nn import FC
-from paddle.fluid.dygraph.base import to_variable
-
-
-class MLP(fluid.Layer):
-    def __init__(self, name_scope):
-        super(MLP, self).__init__(name_scope)
-
-        self._fc1 = FC(self.full_name(), 10)
-        self._fc2 = FC(self.full_name(), 10)
-
-    def forward(self, inputs):
-        y = self._fc1(inputs)
-        y = self._fc2(y)
-        return y
-
-
-class TestImperativeOptimizerBase(unittest.TestCase):
-    def setUp(self):
-        self.batch_num = 20
-
-    def get_optimizer(self):
-        raise NotImplementedError()
-
-    def _check_mlp(self):
-        seed = 90
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            mlp = MLP('mlp')
-            optimizer = self.get_optimizer()
-            optimizer2 = SGDOptimizer(
-                learning_rate=fluid.layers.natural_exp_decay(
-                    learning_rate=0.1,
-                    decay_steps=10000,
-                    decay_rate=0.5,
-                    staircase=True))
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
-
-            for batch_id, data in enumerate(train_reader()):
-                dy_x_data = np.array(
-                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    128, 1)
-
-                img = to_variable(dy_x_data)
-                label = to_variable(y_data)
-                label._stop_gradient = True
-
-                cost = mlp(img)
-                avg_loss = fluid.layers.reduce_mean(cost)
-
-                avg_loss.backward()
-                optimizer.minimize(avg_loss)
-                optimizer2.minimize(avg_loss)
-                mlp.clear_gradients()
-                fluid.dygraph.save_persistables(mlp.state_dict(), "save_dir_2",
-                                                [optimizer, optimizer2])
-                if batch_id == 2:
-                    break
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            mlp_load = MLP('mlp')
-            optimizer_load1 = self.get_optimizer()
-            optimizer_load2 = SGDOptimizer(
-                learning_rate=fluid.layers.natural_exp_decay(
-                    learning_rate=0.1,
-                    decay_steps=10000,
-                    decay_rate=0.5,
-                    staircase=True))
-            parameters, optimizers = fluid.dygraph.load_persistables(
-                "save_dir_2")
-            mlp_load.load_dict(parameters)
-            optimizer_load1.load(optimizers)
-            optimizer_load2.load(optimizers)
-
-        self.assertTrue(optimizer._learning_rate.__dict__ ==
-                        optimizer_load1._learning_rate.__dict__)
-        self.assertTrue(optimizer2._learning_rate.__dict__ ==
-                        optimizer_load2._learning_rate.__dict__)
-
-
-class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        bd = [3, 6, 9]
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
-            boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
-            learning_rate=0.1,
-            decay_steps=10000,
-            decay_rate=0.5,
-            staircase=True))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.1,
-            decay_steps=10000,
-            decay_rate=0.5,
-            staircase=True))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
-            learning_rate=0.1,
-            decay_steps=10000,
-            decay_rate=0.5,
-            staircase=True))
-        return optimizer
-
-    def test_adam(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
-            learning_rate=0.1, decay_steps=5, cycle=self.cycle))
-        return optimizer
-
-    def test_sgd_cycle(self):
-        self.cycle = True
-        self._check_mlp()
-
-    def test_sgd(self):
-        self.cycle = False
-        self._check_mlp()
-
-
-class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
-            learning_rate=0.1, step_each_epoch=10000, epochs=120))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
-    def get_optimizer(self):
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
-            d_model=512, warmup_steps=8000))
-        return optimizer
-
-    def test_sgd(self):
-        self._check_mlp()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
deleted file mode 100644
index f6585d1b30dacc5a54e38455e8db82980057f1a0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ /dev/null
@@ -1,476 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import unittest
-import numpy as np
-import six
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
-from paddle.fluid.dygraph.base import to_variable
-from test_imperative_base import new_program_scope
-
-batch_size = 8
-train_parameters = {
-    "input_size": [3, 224, 224],
-    "input_mean": [0.485, 0.456, 0.406],
-    "input_std": [0.229, 0.224, 0.225],
-    "learning_strategy": {
-        "name": "piecewise_decay",
-        "batch_size": batch_size,
-        "epochs": [30, 60, 90],
-        "steps": [0.1, 0.01, 0.001, 0.0001]
-    },
-    "batch_size": batch_size,
-    "lr": 0.1,
-    "total_images": 6149,
-}
-
-
-def optimizer_setting(params):
-    ls = params["learning_strategy"]
-    if ls["name"] == "piecewise_decay":
-        if "total_images" not in params:
-            total_images = 6149
-        else:
-            total_images = params["total_images"]
-        # TODO(Yancey1989): using lr decay if it is ready.
-        #batch_size = ls["batch_size"]
-        #step = int(total_images / batch_size + 1)
-
-        #bd = [step * e for e in ls["epochs"]]
-        #base_lr = params["lr"]
-        #lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-
-    return optimizer
-
-
-class ConvBNLayer(fluid.dygraph.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 stride=1,
-                 groups=1,
-                 act=None):
-        super(ConvBNLayer, self).__init__(name_scope)
-
-        self._conv = Conv2D(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=None)
-
-        self._batch_norm = BatchNorm(self.full_name(), num_filters, act=act)
-
-    def forward(self, inputs):
-        y = self._conv(inputs)
-        y = self._batch_norm(y)
-
-        return y
-
-
-class SqueezeExcitation(fluid.dygraph.Layer):
-    def __init__(self, name_scope, num_channels, reduction_ratio):
-
-        super(SqueezeExcitation, self).__init__(name_scope)
-        self._pool = Pool2D(
-            self.full_name(), pool_size=0, pool_type='avg', global_pooling=True)
-        self._squeeze = FC(
-            self.full_name(),
-            size=num_channels // reduction_ratio,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
-            act='relu')
-        self._excitation = FC(
-            self.full_name(),
-            size=num_channels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
-            act='sigmoid')
-
-    def forward(self, input):
-        y = self._pool(input)
-        y = self._squeeze(y)
-        y = self._excitation(y)
-        y = fluid.layers.elementwise_mul(x=input, y=y, axis=0)
-        return y
-
-
-class BottleneckBlock(fluid.dygraph.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_channels,
-                 num_filters,
-                 stride,
-                 cardinality,
-                 reduction_ratio,
-                 shortcut=True):
-        super(BottleneckBlock, self).__init__(name_scope)
-
-        self.conv0 = ConvBNLayer(
-            self.full_name(), num_filters=num_filters, filter_size=1)
-        self.conv1 = ConvBNLayer(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            groups=cardinality)
-        self.conv2 = ConvBNLayer(
-            self.full_name(),
-            num_filters=num_filters * 4,
-            filter_size=1,
-            act='relu')
-
-        self.scale = SqueezeExcitation(
-            self.full_name(),
-            num_channels=num_filters * 4,
-            reduction_ratio=reduction_ratio)
-
-        if not shortcut:
-            self.short = ConvBNLayer(
-                self.full_name(),
-                num_filters=num_filters * 4,
-                filter_size=1,
-                stride=stride)
-
-        self.shortcut = shortcut
-
-        self._num_channels_out = num_filters * 4
-
-    def forward(self, inputs):
-        y = self.conv0(inputs)
-        conv1 = self.conv1(y)
-        conv2 = self.conv2(conv1)
-        scale = self.scale(conv2)
-
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
-
-        y = fluid.layers.elementwise_add(x=short, y=scale)
-
-        layer_helper = LayerHelper(self.full_name(), act='relu')
-        y = layer_helper.append_activation(y)
-        return y
-
-
-class SeResNeXt(fluid.dygraph.Layer):
-    def __init__(self, name_scope, layers=50, class_dim=102):
-        super(SeResNeXt, self).__init__(name_scope)
-
-        self.layers = layers
-        supported_layers = [50, 101, 152]
-        assert layers in supported_layers, \
-            "supported layers are {} but input layer is {}".format(supported_layers, layers)
-
-        if layers == 50:
-            cardinality = 32
-            reduction_ratio = 16
-            depth = [3, 4, 6, 3]
-            num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                self.full_name(),
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                self.full_name(),
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
-        elif layers == 101:
-            cardinality = 32
-            reduction_ratio = 16
-            depth = [3, 4, 23, 3]
-            num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                self.full_name(),
-                num_filters=3,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                self.full_name(),
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
-        elif layers == 152:
-            cardinality = 64
-            reduction_ratio = 16
-            depth = [3, 8, 36, 3]
-            num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                self.full_name(),
-                num_filters=3,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.conv1 = ConvBNLayer(
-                self.full_name(),
-                num_filters=3,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.conv2 = ConvBNLayer(
-                self.full_name(),
-                num_filters=3,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                self.full_name(),
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
-
-        self.bottleneck_block_list = []
-        num_channels = 64
-        for block in range(len(depth)):
-            shortcut = False
-            for i in range(depth[block]):
-                bottleneck_block = self.add_sublayer(
-                    'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        self.full_name(),
-                        num_channels=num_channels,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        cardinality=cardinality,
-                        reduction_ratio=reduction_ratio,
-                        shortcut=shortcut))
-                num_channels = bottleneck_block._num_channels_out
-                self.bottleneck_block_list.append(bottleneck_block)
-                shortcut = True
-
-        self.pool2d_avg = Pool2D(
-            self.full_name(), pool_size=7, pool_type='avg', global_pooling=True)
-        import math
-        stdv = 1.0 / math.sqrt(2048 * 1.0)
-
-        self.out = FC(self.full_name(),
-                      size=class_dim,
-                      act='softmax',
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.Uniform(-stdv, stdv)))
-
-    def forward(self, inputs):
-        if self.layers == 50 or self.layers == 101:
-            y = self.conv0(inputs)
-            y = self.pool(y)
-        elif self.layers == 152:
-            y = self.conv0(inputs)
-            y = self.conv1(inputs)
-            y = self.conv2(inputs)
-            y = self.pool(y)
-
-        for bottleneck_block in self.bottleneck_block_list:
-            y = bottleneck_block(y)
-        y = self.pool2d_avg(y)
-        y = fluid.layers.dropout(y, dropout_prob=0.2)
-        y = self.out(y)
-        return y
-
-
-class TestImperativeResneXt(unittest.TestCase):
-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                doc = np.array(item[0]).reshape(3, 224, 224)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield doc, label
-
-        return _reader_imple
-
-    def test_se_resnext_float32(self):
-        seed = 90
-
-        batch_size = train_parameters["batch_size"]
-        batch_num = 1
-        epoch_num = 1
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            se_resnext = SeResNeXt("se_resnext")
-            optimizer = optimizer_setting(train_parameters)
-            np.random.seed(seed)
-            import random
-            random.seed = seed
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(
-                        paddle.dataset.flowers.train(use_xmap=False)),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
-
-            dy_param_init_value = {}
-            for param in se_resnext.parameters():
-                dy_param_init_value[param.name] = param.numpy()
-            for epoch_id in range(epoch_num):
-                for batch_id, data in enumerate(batch_py_reader()):
-
-                    if batch_id >= batch_num and batch_num != -1:
-                        break
-
-                    img = data[0]
-                    label = data[1]
-                    label.stop_gradient = True
-                    label.stop_gradient = True
-
-                    out = se_resnext(img)
-                    loss = fluid.layers.cross_entropy(input=out, label=label)
-                    avg_loss = fluid.layers.mean(x=loss)
-
-                    dy_out = avg_loss.numpy()
-
-                    if batch_id == 0:
-                        for param in se_resnext.parameters():
-                            if param.name not in dy_param_init_value:
-                                dy_param_init_value[param.name] = param.numpy()
-                    avg_loss.backward()
-
-                    #dy_grad_value = {}
-                    #for param in se_resnext.parameters():
-                    #    if param.trainable:
-                    #        np_array = np.array(param._ivar._grad_ivar().value()
-                    #                            .get_tensor())
-                    #        dy_grad_value[param.name + core.grad_var_suffix()] = np_array
-
-                    optimizer.minimize(avg_loss)
-                    se_resnext.clear_gradients()
-
-                    dy_param_value = {}
-                    for param in se_resnext.parameters():
-                        dy_param_value[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-
-            se_resnext = SeResNeXt("se_resnext")
-            optimizer = optimizer_setting(train_parameters)
-
-            np.random.seed(seed)
-            import random
-            random.seed = seed
-            train_reader = paddle.batch(
-                paddle.dataset.flowers.train(use_xmap=False),
-                batch_size=batch_size,
-                drop_last=True)
-
-            img = fluid.layers.data(
-                name='pixel', shape=[3, 224, 224], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            out = se_resnext(img)
-            loss = fluid.layers.cross_entropy(input=out, label=label)
-            avg_loss = fluid.layers.mean(x=loss)
-            optimizer.minimize(avg_loss)
-
-            # initialize params and fetch them
-            static_param_init_value = {}
-            static_param_name_list = []
-            static_grad_name_list = []
-            for param in se_resnext.parameters():
-                static_param_name_list.append(param.name)
-            for param in se_resnext.parameters():
-                if param.trainable:
-                    static_grad_name_list.append(param.name +
-                                                 core.grad_var_suffix())
-
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-
-            for i in range(len(static_param_name_list)):
-                static_param_init_value[static_param_name_list[i]] = out[i]
-            for epoch_id in range(epoch_num):
-                for batch_id, data in enumerate(train_reader()):
-                    if batch_id >= batch_num and batch_num != -1:
-                        break
-
-                    static_x_data = np.array(
-                        [x[0].reshape(3, 224, 224)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(
-                            [batch_size, 1])
-
-                    fetch_list = [avg_loss.name]
-                    fetch_list.extend(static_param_name_list)
-                    fetch_list.extend(static_grad_name_list)
-                    out = exe.run(
-                        fluid.default_main_program(),
-                        feed={"pixel": static_x_data,
-                              "label": y_data},
-                        fetch_list=fetch_list)
-
-                    static_param_value = {}
-                    static_grad_value = {}
-                    static_out = out[0]
-                    param_start_pos = 1
-                    grad_start_pos = len(
-                        static_param_name_list) + param_start_pos
-                    for i in range(
-                            param_start_pos,
-                            len(static_param_name_list) + param_start_pos):
-                        static_param_value[static_param_name_list[
-                            i - param_start_pos]] = out[i]
-                    for i in range(grad_start_pos,
-                                   len(static_grad_name_list) + grad_start_pos):
-                        static_grad_value[static_grad_name_list[
-                            i - grad_start_pos]] = out[i]
-        self.assertTrue(np.allclose(static_out, dy_out))
-
-        self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
-
-        for key, value in six.iteritems(static_param_init_value):
-            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
-        # FIXME(Yancey1989): np.array(_ivar.value().get_tensor()) leads to memory lake
-        #self.assertEqual(len(dy_grad_value), len(static_grad_value))
-        #for key, value in six.iteritems(static_grad_value):
-        #    self.assertTrue(np.allclose(value, dy_grad_value[key]))
-        #    self.assertTrue(np.isfinite(value.all()))
-        #    self.assertFalse(np.isnan(value.any()))
-
-        self.assertEqual(len(dy_param_value), len(static_param_value))
-        for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
deleted file mode 100644
index 51fb66f7743e7d79fb3d75feb2d32e080f1f48df..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
+++ /dev/null
@@ -1,1085 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-from paddle.fluid import Embedding, LayerNorm, FC, Layer
-from paddle.fluid.dygraph import to_variable, guard
-from test_imperative_base import new_program_scope
-from paddle.fluid import core
-import numpy as np
-import six
-np.set_printoptions(suppress=True)
-
-
-# Copy from models
-class TrainTaskConfig(object):
-    # support both CPU and GPU now.
-    use_gpu = True
-    # the epoch number to train.
-    pass_num = 30
-    # the number of sequences contained in a mini-batch.
-    # deprecated, set batch_size in args.
-    batch_size = 32
-    # the hyper parameters for Adam optimizer.
-    # This static learning_rate will be multiplied to the LearningRateScheduler
-    # derived learning rate the to get the final learning rate.
-    learning_rate = 2.0
-    beta1 = 0.9
-    beta2 = 0.997
-    eps = 1e-9
-    # the parameters for learning rate scheduling.
-    warmup_steps = 8000
-    # the weight used to mix up the ground-truth distribution and the fixed
-    # uniform distribution in label smoothing when training.
-    # Set this as zero if label smoothing is not wanted.
-    label_smooth_eps = 0.1
-    # the directory for saving trained models.
-    model_dir = "trained_models"
-    # the directory for saving checkpoints.
-    ckpt_dir = "trained_ckpts"
-    # the directory for loading checkpoint.
-    # If provided, continue training from the checkpoint.
-    ckpt_path = None
-    # the parameter to initialize the learning rate scheduler.
-    # It should be provided if use checkpoints, since the checkpoint doesn't
-    # include the training step counter currently.
-    start_step = 0
-    # the frequency to save trained models.
-    save_freq = 10000
-
-
-class InferTaskConfig(object):
-    use_gpu = True
-    # the number of examples in one run for sequence generation.
-    batch_size = 10
-    # the parameters for beam search.
-    beam_size = 5
-    max_out_len = 256
-    # the number of decoded sentences to output.
-    n_best = 1
-    # the flags indicating whether to output the special tokens.
-    output_bos = False
-    output_eos = False
-    output_unk = True
-    # the directory for loading the trained model.
-    model_path = "trained_models/pass_1.infer.model"
-
-
-class ModelHyperParams(object):
-    # These following five vocabularies related configurations will be set
-    # automatically according to the passed vocabulary path and special tokens.
-    # size of source word dictionary.
-    src_vocab_size = 10000
-    # size of target word dictionay
-    trg_vocab_size = 10000
-    # index for <bos> token
-    bos_idx = 0
-    # index for <eos> token
-    eos_idx = 1
-    # index for <unk> token
-    unk_idx = 2
-    # max length of sequences deciding the size of position encoding table.
-    max_length = 4
-    # the dimension for word embeddings, which is also the last dimension of
-    # the input and output of multi-head attention, position-wise feed-forward
-    # networks, encoder and decoder.
-    d_model = 512
-    # size of the hidden layer in position-wise feed-forward networks.
-    d_inner_hid = 2048
-    # the dimension that keys are projected to for dot-product attention.
-    d_key = 64
-    # the dimension that values are projected to for dot-product attention.
-    d_value = 64
-    # number of head used in multi-head attention.
-    n_head = 8
-    # number of sub-layers to be stacked in the encoder and decoder.
-    n_layer = 6
-    # dropout rates of different modules.
-    prepostprocess_dropout = 0.1
-    attention_dropout = 0.1
-    relu_dropout = 0.1
-    # to process before each sub-layer
-    preprocess_cmd = "n"  # layer normalization
-    # to process after each sub-layer
-    postprocess_cmd = "da"  # dropout + residual connection
-    # random seed used in dropout for CE.
-    dropout_seed = None
-    # the flag indicating whether to share embedding and softmax weights.
-    # vocabularies in source and target should be same for weight sharing.
-    weight_sharing = True
-
-
-def merge_cfg_from_list(cfg_list, g_cfgs):
-    """
-    Set the above global configurations using the cfg_list.
-    """
-    assert len(cfg_list) % 2 == 0
-    for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
-        for g_cfg in g_cfgs:
-            if hasattr(g_cfg, key):
-                try:
-                    value = eval(value)
-                except Exception:  # for file path
-                    pass
-                setattr(g_cfg, key, value)
-                break
-
-
-def position_encoding_init(n_position, d_pos_vec):
-    """
-    Generate the initial values for the sinusoid position encoding table.
-    """
-    channels = d_pos_vec
-    position = np.arange(n_position)
-    num_timescales = channels // 2
-    log_timescale_increment = (np.log(float(1e4) / float(1)) /
-                               (num_timescales - 1))
-    inv_timescales = np.exp(np.arange(
-        num_timescales)) * -log_timescale_increment
-    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
-                                                               0)
-    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
-    signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
-    position_enc = signal
-    return position_enc.astype("float32")
-
-
-def create_data(is_static=False):
-    if is_static:
-        return [
-            src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np,
-            trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np,
-            lbl_weight_np
-        ]
-    else:
-        enc_inputs = [
-            to_variable(
-                src_word_np, name='src_word'), to_variable(
-                    src_pos_np, name='src_pos'), to_variable(
-                        src_slf_attn_bias_np, name='src_slf_attn_bias')
-        ]
-        dec_inputs = [
-            to_variable(
-                trg_word_np, name='trg_word'), to_variable(
-                    trg_pos_np, name='trg_pos'), to_variable(
-                        trg_slf_attn_bias_np, name='trg_slf_attn_bias'),
-            to_variable(
-                trg_src_attn_bias_np, name='trg_src_attn_bias')
-        ]
-        label = to_variable(lbl_word_np, name='lbl_word')
-        weight = to_variable(lbl_weight_np, name='lbl_weight')
-        return enc_inputs, dec_inputs, label, weight
-
-
-def create_feed_dict_list(data, init=False):
-    if init:
-        data_input_names = encoder_data_input_fields + \
-                           decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names
-    else:
-        data_input_names = encoder_data_input_fields + \
-                           decoder_data_input_fields[:-1] + label_data_input_fields
-    feed_dict_list = dict()
-    for i in range(len(data_input_names)):
-        feed_dict_list[data_input_names[i]] = data[i]
-    return feed_dict_list
-
-
-def make_all_inputs(input_fields):
-    """
-    Define the input data layers for the transformer model.
-    """
-    inputs = []
-    for input_field in input_fields:
-        input_var = fluid.layers.data(
-            name=input_field,
-            shape=input_descs[input_field][0],
-            dtype=input_descs[input_field][1],
-            lod_level=input_descs[input_field][2]
-            if len(input_descs[input_field]) == 3 else 0,
-            append_batch_size=False)
-        inputs.append(input_var)
-    return inputs
-
-
-# The placeholder for batch_size in compile time. Must be -1 currently to be
-# consistent with some ops' infer-shape output in compile time, such as the
-# sequence_expand op used in beamsearch decoder.
-batch_size = -1
-# The placeholder for squence length in compile time.
-seq_len = ModelHyperParams.max_length
-# Here list the data shapes and data types of all inputs.
-# The shapes here act as placeholder and are set to pass the infer-shape in
-# compile time.
-input_descs = {
-    # The actual data shape of src_word is:
-    # [batch_size, max_src_len_in_batch, 1]
-    "src_word": [(batch_size, seq_len, 1), "int64", 2],
-    # The actual data shape of src_pos is:
-    # [batch_size, max_src_len_in_batch, 1]
-    "src_pos": [(batch_size, seq_len, 1), "int64"],
-    # This input is used to remove attention weights on paddings in the
-    # encoder.
-    # The actual data shape of src_slf_attn_bias is:
-    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
-    "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
-    # The actual data shape of trg_word is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_word": [(batch_size, seq_len, 1), "int64",
-                 2],  # lod_level is only used in fast decoder.
-    # The actual data shape of trg_pos is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_pos": [(batch_size, seq_len, 1), "int64"],
-    # This input is used to remove attention weights on paddings and
-    # subsequent words in the decoder.
-    # The actual data shape of trg_slf_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
-    "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
-    # This input is used to remove attention weights on paddings of the source
-    # input in the encoder-decoder attention.
-    # The actual data shape of trg_src_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
-    "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
-    # This input is used in independent decoder program for inference.
-    # The actual data shape of enc_output is:
-    # [batch_size, max_src_len_in_batch, d_model]
-    "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
-    # The actual data shape of label_word is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_word": [(batch_size * seq_len, 1), "int64"],
-    # This input is used to mask out the loss of paddding tokens.
-    # The actual data shape of label_weight is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_weight": [(batch_size * seq_len, 1), "float32"],
-    # This input is used in beam-search decoder.
-    "init_score": [(batch_size, 1), "float32", 2],
-    # This input is used in beam-search decoder for the first gather
-    # (cell states updation)
-    "init_idx": [(batch_size, ), "int32"],
-}
-
-# Names of word embedding table which might be reused for weight sharing.
-word_emb_param_names = (
-    "src_word_emb_table",
-    "trg_word_emb_table", )
-# Names of position encoding table which will be initialized externally.
-pos_enc_param_names = (
-    "src_pos_enc_table",
-    "trg_pos_enc_table", )
-# separated inputs for different usages.
-encoder_data_input_fields = (
-    "src_word",
-    "src_pos",
-    "src_slf_attn_bias", )
-decoder_data_input_fields = (
-    "trg_word",
-    "trg_pos",
-    "trg_slf_attn_bias",
-    "trg_src_attn_bias",
-    "enc_output", )
-label_data_input_fields = (
-    "lbl_word",
-    "lbl_weight", )
-# In fast decoder, trg_pos (only containing the current time step) is generated
-# by ops and trg_slf_attn_bias is not needed.
-fast_decoder_data_input_fields = (
-    "trg_word",
-    "init_score",
-    "init_idx",
-    "trg_src_attn_bias", )
-# if we use py_reader
-use_py_reader = False
-
-# if we run sync mode
-sync = False
-
-# how many batches we use
-batch_num = 5
-
-np.random.seed = 90
-src_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape(
-    [TrainTaskConfig.batch_size, seq_len, 1]).astype('int64')
-src_pos_np = np.random.randint(
-    1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
-src_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
-                                       ModelHyperParams.n_head, seq_len,
-                                       seq_len).astype('float32')
-
-trg_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape(
-    [TrainTaskConfig.batch_size, seq_len, 1]).astype('int64')
-trg_pos_np = np.random.randint(
-    1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
-trg_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
-                                       ModelHyperParams.n_head, seq_len,
-                                       seq_len).astype('float32')
-trg_src_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
-                                       ModelHyperParams.n_head, seq_len,
-                                       seq_len).astype('float32')
-
-lbl_word_np = np.random.randint(
-    1,
-    ModelHyperParams.src_vocab_size - 1,
-    size=(TrainTaskConfig.batch_size * seq_len, 1),
-    dtype='int64')
-lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len,
-                                1).astype('float32')
-
-pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
-                                  ModelHyperParams.d_model)
-pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
-                                  ModelHyperParams.d_model)
-
-
-class PrePostProcessLayer(Layer):
-    def __init__(self, name_scope, process_cmd, shape_len=None):
-        super(PrePostProcessLayer, self).__init__(name_scope)
-        for cmd in process_cmd:
-            if cmd == "n":
-                self._layer_norm = LayerNorm(
-                    name_scope=self.full_name(),
-                    begin_norm_axis=shape_len - 1,
-                    param_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.Constant(1.)),
-                    bias_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.Constant(0.)))
-
-    def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
-        for cmd in process_cmd:
-            if cmd == "a":  # add residual connection
-                out = out + prev_out if prev_out else out
-            elif cmd == "n":  # add layer normalization
-                out = self._layer_norm(out)
-            elif cmd == "d":  # add dropout
-                if dropout_rate:
-                    out = fluid.layers.dropout(
-                        out,
-                        dropout_prob=dropout_rate,
-                        seed=ModelHyperParams.dropout_seed,
-                        is_test=False)
-        return out
-
-
-class PositionwiseFeedForwardLayer(Layer):
-    def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
-        super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
-        self._i2h = FC(name_scope=self.full_name(),
-                       size=d_inner_hid,
-                       num_flatten_dims=2,
-                       act="relu")
-        self._h2o = FC(name_scope=self.full_name(),
-                       size=d_hid,
-                       num_flatten_dims=2)
-        self._dropout_rate = dropout_rate
-
-    def forward(self, x):
-        hidden = self._i2h(x)
-        if self._dropout_rate:
-            hidden = fluid.layers.dropout(
-                hidden,
-                dropout_prob=self._dropout_rate,
-                seed=ModelHyperParams.dropout_seed,
-                is_test=False)
-        out = self._h2o(hidden)
-        return out
-
-
-class MultiHeadAttentionLayer(Layer):
-    def __init__(self,
-                 name_scope,
-                 d_key,
-                 d_value,
-                 d_model,
-                 n_head=1,
-                 dropout_rate=0.,
-                 cache=None,
-                 gather_idx=None,
-                 static_kv=False):
-        super(MultiHeadAttentionLayer, self).__init__(name_scope)
-        self._n_head = n_head
-        self._d_key = d_key
-        self._d_value = d_value
-        self._d_model = d_model
-        self._dropout_rate = dropout_rate
-        self._q_fc = FC(name_scope=self.full_name(),
-                        size=d_key * n_head,
-                        bias_attr=False,
-                        num_flatten_dims=2)
-        self._k_fc = FC(name_scope=self.full_name(),
-                        size=d_key * n_head,
-                        bias_attr=False,
-                        num_flatten_dims=2)
-        self._v_fc = FC(name_scope=self.full_name(),
-                        size=d_value * n_head,
-                        bias_attr=False,
-                        num_flatten_dims=2)
-        self._proj_fc = FC(name_scope=self.full_name(),
-                           size=self._d_model,
-                           bias_attr=False,
-                           num_flatten_dims=2)
-
-    def forward(self, queries, keys, values, attn_bias):
-        # compute q ,k ,v
-        keys = queries if keys is None else keys
-        values = keys if values is None else values
-
-        q = self._q_fc(queries)
-        k = self._k_fc(keys)
-        v = self._v_fc(values)
-
-        # split head
-        reshaped_q = fluid.layers.reshape(
-            x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False)
-        transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
-        reshaped_k = fluid.layers.reshape(
-            x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False)
-        transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
-        reshaped_v = fluid.layers.reshape(
-            x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
-        transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
-
-        # scale dot product attention
-        product = fluid.layers.matmul(
-            x=transpose_q,
-            y=transpose_k,
-            transpose_y=True,
-            alpha=self._d_model**-0.5)
-        if attn_bias:
-            product += attn_bias
-        weights = fluid.layers.softmax(product)
-        if self._dropout_rate:
-            weights_droped = fluid.layers.dropout(
-                weights,
-                dropout_prob=self._dropout_rate,
-                seed=ModelHyperParams.dropout_seed,
-                is_test=False)
-            out = fluid.layers.matmul(weights_droped, transpose_v)
-        else:
-            out = fluid.layers.matmul(weights, transpose_v)
-
-        # combine heads
-        if len(out.shape) != 4:
-            raise ValueError("Input(x) should be a 4-D Tensor.")
-        trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
-        final_out = fluid.layers.reshape(
-            x=trans_x,
-            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
-            inplace=False)
-
-        # fc to output
-        proj_out = self._proj_fc(final_out)
-        return proj_out
-
-
-class EncoderSubLayer(Layer):
-    def __init__(self,
-                 name_scope,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
-
-        super(EncoderSubLayer, self).__init__(name_scope)
-        self._preprocess_cmd = preprocess_cmd
-        self._postprocess_cmd = postprocess_cmd
-        self._prepostprocess_dropout = prepostprocess_dropout
-
-        self._preprocess_layer = PrePostProcessLayer(self.full_name(),
-                                                     self._preprocess_cmd, 3)
-        self._multihead_attention_layer = MultiHeadAttentionLayer(
-            self.full_name(), d_key, d_value, d_model, n_head,
-            attention_dropout)
-        self._postprocess_layer = PrePostProcessLayer(
-            self.full_name(), self._postprocess_cmd, None)
-        self._preprocess_layer2 = PrePostProcessLayer(self.full_name(),
-                                                      self._preprocess_cmd, 3)
-        self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
-            self.full_name(), d_inner_hid, d_model, relu_dropout)
-        self._postprocess_layer2 = PrePostProcessLayer(
-            self.full_name(), self._postprocess_cmd, None)
-
-    def forward(self, enc_input, attn_bias):
-        pre_process_multihead = self._preprocess_layer(
-            None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout)
-        attn_output = self._multihead_attention_layer(pre_process_multihead,
-                                                      None, None, attn_bias)
-        attn_output = self._postprocess_layer(enc_input, attn_output,
-                                              self._postprocess_cmd,
-                                              self._prepostprocess_dropout)
-        pre_process2_output = self._preprocess_layer2(
-            None, attn_output, self._preprocess_cmd,
-            self._prepostprocess_dropout)
-        ffd_output = self._positionwise_feed_forward(pre_process2_output)
-        return self._postprocess_layer2(attn_output, ffd_output,
-                                        self._postprocess_cmd,
-                                        self._prepostprocess_dropout)
-
-
-class EncoderLayer(Layer):
-    def __init__(self,
-                 name_scope,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
-
-        super(EncoderLayer, self).__init__(name_scope)
-        self._preprocess_cmd = preprocess_cmd
-        self._encoder_sublayers = list()
-        self._prepostprocess_dropout = prepostprocess_dropout
-        self._n_layer = n_layer
-        self._preprocess_layer = PrePostProcessLayer(self.full_name(),
-                                                     self._preprocess_cmd, 3)
-        for i in range(n_layer):
-            self._encoder_sublayers.append(
-                self.add_sublayer(
-                    'esl_%d' % i,
-                    EncoderSubLayer(
-                        self.full_name(), n_head, d_key, d_value, d_model,
-                        d_inner_hid, prepostprocess_dropout, attention_dropout,
-                        relu_dropout, preprocess_cmd, postprocess_cmd)))
-
-    def forward(self, enc_input, attn_bias):
-        for i in range(self._n_layer):
-            enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
-            enc_input = enc_output
-
-        return self._preprocess_layer(None, enc_output, self._preprocess_cmd,
-                                      self._prepostprocess_dropout)
-
-
-class PrepareEncoderDecoderLayer(Layer):
-    def __init__(self,
-                 name_scope,
-                 src_vocab_size,
-                 src_emb_dim,
-                 src_max_len,
-                 dropout_rate,
-                 word_emb_param_name=None,
-                 pos_enc_param_name=None):
-        super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
-        self._src_max_len = src_max_len
-        self._src_emb_dim = src_emb_dim
-        self._src_vocab_size = src_vocab_size
-        self._dropout_rate = dropout_rate
-        self._input_emb = Embedding(
-            name_scope=self.full_name(),
-            size=[src_vocab_size, src_emb_dim],
-            padding_idx=0,
-            param_attr=fluid.ParamAttr(
-                name=word_emb_param_name,
-                initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
-
-        if pos_enc_param_name is pos_enc_param_names[0]:
-            pos_inp = pos_inp1
-        else:
-            pos_inp = pos_inp2
-        self._pos_emb = Embedding(
-            name_scope=self.full_name(),
-            size=[self._src_max_len, src_emb_dim],
-            param_attr=fluid.ParamAttr(
-                name=pos_enc_param_name,
-                initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
-                trainable=False))
-
-        # use in dygraph_mode to fit different length batch
-        # self._pos_emb._w = to_variable(
-        #     position_encoding_init(self._src_max_len, self._src_emb_dim))
-
-    def forward(self, src_word, src_pos):
-        src_word_emb = self._input_emb(src_word)
-        src_word_emb = fluid.layers.scale(
-            x=src_word_emb, scale=self._src_emb_dim**0.5)
-        # # TODO change this to fit dynamic length input
-        src_pos_emb = self._pos_emb(src_pos)
-        src_pos_emb.stop_gradient = True
-        enc_input = src_word_emb + src_pos_emb
-        return fluid.layers.dropout(
-            enc_input,
-            dropout_prob=self._dropout_rate,
-            seed=ModelHyperParams.dropout_seed,
-            is_test=False) if self._dropout_rate else enc_input
-
-
-class WrapEncoderLayer(Layer):
-    def __init__(self, name_cope, src_vocab_size, max_length, n_layer, n_head,
-                 d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
-                 attention_dropout, relu_dropout, preprocess_cmd,
-                 postprocess_cmd, weight_sharing):
-        """
-        The wrapper assembles together all needed layers for the encoder.
-        """
-        super(WrapEncoderLayer, self).__init__(name_cope)
-
-        self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
-            self.full_name(),
-            src_vocab_size,
-            d_model,
-            max_length,
-            prepostprocess_dropout,
-            word_emb_param_name=word_emb_param_names[0],
-            pos_enc_param_name=pos_enc_param_names[0])
-        self._encoder = EncoderLayer(
-            self.full_name(), n_layer, n_head, d_key, d_value, d_model,
-            d_inner_hid, prepostprocess_dropout, attention_dropout,
-            relu_dropout, preprocess_cmd, postprocess_cmd)
-
-    def forward(self, enc_inputs):
-        src_word, src_pos, src_slf_attn_bias = enc_inputs
-        enc_input = self._prepare_encoder_layer(src_word, src_pos)
-        enc_output = self._encoder(enc_input, src_slf_attn_bias)
-        return enc_output
-
-
-class DecoderSubLayer(Layer):
-    def __init__(self,
-                 name_scope,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd,
-                 postprocess_cmd,
-                 cache=None,
-                 gather_idx=None):
-        super(DecoderSubLayer, self).__init__(name_scope)
-        self._postprocess_cmd = postprocess_cmd
-        self._preprocess_cmd = preprocess_cmd
-        self._prepostprcess_dropout = prepostprocess_dropout
-        self._pre_process_layer = PrePostProcessLayer(self.full_name(),
-                                                      preprocess_cmd, 3)
-        self._multihead_attention_layer = MultiHeadAttentionLayer(
-            self.full_name(),
-            d_key,
-            d_value,
-            d_model,
-            n_head,
-            attention_dropout,
-            cache=cache,
-            gather_idx=gather_idx)
-        self._post_process_layer = PrePostProcessLayer(self.full_name(),
-                                                       postprocess_cmd, None)
-        self._pre_process_layer2 = PrePostProcessLayer(self.full_name(),
-                                                       preprocess_cmd, 3)
-        self._multihead_attention_layer2 = MultiHeadAttentionLayer(
-            self.full_name(),
-            d_key,
-            d_value,
-            d_model,
-            n_head,
-            attention_dropout,
-            cache=cache,
-            gather_idx=gather_idx,
-            static_kv=True)
-        self._post_process_layer2 = PrePostProcessLayer(self.full_name(),
-                                                        postprocess_cmd, None)
-        self._pre_process_layer3 = PrePostProcessLayer(self.full_name(),
-                                                       preprocess_cmd, 3)
-        self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
-            self.full_name(), d_inner_hid, d_model, relu_dropout)
-        self._post_process_layer3 = PrePostProcessLayer(self.full_name(),
-                                                        postprocess_cmd, None)
-
-    def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
-        pre_process_rlt = self._pre_process_layer(
-            None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
-        slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
-                                                          None, slf_attn_bias)
-        slf_attn_output_pp = self._post_process_layer(
-            dec_input, slf_attn_output, self._postprocess_cmd,
-            self._prepostprcess_dropout)
-        pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp,
-                                                    self._preprocess_cmd,
-                                                    self._prepostprcess_dropout)
-        enc_attn_output_pp = self._multihead_attention_layer2(
-            pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
-        enc_attn_output = self._post_process_layer2(
-            slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
-            self._prepostprcess_dropout)
-        pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
-                                                    self._preprocess_cmd,
-                                                    self._prepostprcess_dropout)
-        ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
-        dec_output = self._post_process_layer3(enc_attn_output, ffd_output,
-                                               self._postprocess_cmd,
-                                               self._prepostprcess_dropout)
-        return dec_output
-
-
-class DecoderLayer(Layer):
-    def __init__(self,
-                 name_scope,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd,
-                 postprocess_cmd,
-                 caches=None,
-                 gather_idx=None):
-        super(DecoderLayer, self).__init__(name_scope)
-        self._pre_process_layer = PrePostProcessLayer(self.full_name(),
-                                                      preprocess_cmd, 3)
-        self._decoder_sub_layers = list()
-        self._n_layer = n_layer
-        self._preprocess_cmd = preprocess_cmd
-        self._prepostprocess_dropout = prepostprocess_dropout
-        for i in range(n_layer):
-            self._decoder_sub_layers.append(
-                self.add_sublayer(
-                    'dsl_%d' % i,
-                    DecoderSubLayer(
-                        self.full_name(),
-                        n_head,
-                        d_key,
-                        d_value,
-                        d_model,
-                        d_inner_hid,
-                        prepostprocess_dropout,
-                        attention_dropout,
-                        relu_dropout,
-                        preprocess_cmd,
-                        postprocess_cmd,
-                        cache=None if caches is None else caches[i],
-                        gather_idx=gather_idx)))
-
-    def forward(self, dec_input, enc_output, dec_slf_attn_bias,
-                dec_enc_attn_bias):
-        for i in range(self._n_layer):
-            tmp_dec_output = self._decoder_sub_layers[i](
-                dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
-            dec_input = tmp_dec_output
-
-        dec_output = self._pre_process_layer(None, tmp_dec_output,
-                                             self._preprocess_cmd,
-                                             self._prepostprocess_dropout)
-        return dec_output
-
-
-class WrapDecoderLayer(Layer):
-    def __init__(self,
-                 name_scope,
-                 trg_vocab_size,
-                 max_length,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd,
-                 postprocess_cmd,
-                 weight_sharing,
-                 caches=None,
-                 gather_idx=None):
-        """
-        The wrapper assembles together all needed layers for the encoder.
-        """
-        super(WrapDecoderLayer, self).__init__(name_scope)
-
-        self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
-            self.full_name(),
-            trg_vocab_size,
-            d_model,
-            max_length,
-            prepostprocess_dropout,
-            word_emb_param_name=word_emb_param_names[1],
-            pos_enc_param_name=pos_enc_param_names[1])
-        self._decoder_layer = DecoderLayer(
-            self.full_name(),
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            preprocess_cmd,
-            postprocess_cmd,
-            caches=caches,
-            gather_idx=gather_idx)
-        self._weight_sharing = weight_sharing
-        if not weight_sharing:
-            self._fc = FC(self.full_name(),
-                          size=trg_vocab_size,
-                          bias_attr=False)
-
-    def forward(self, dec_inputs=None, enc_output=None):
-        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
-        dec_input = self._prepare_decoder_layer(trg_word, trg_pos)
-        dec_output = self._decoder_layer(dec_input, enc_output,
-                                         trg_slf_attn_bias, trg_src_attn_bias)
-
-        dec_output_reshape = fluid.layers.reshape(
-            dec_output, shape=[-1, dec_output.shape[-1]], inplace=False)
-
-        if self._weight_sharing:
-            predict = fluid.layers.matmul(
-                x=dec_output_reshape,
-                y=self._prepare_decoder_layer._input_emb._w,
-                transpose_y=True)
-        else:
-            predict = self._fc(dec_output_reshape)
-
-        if dec_inputs is None:
-            # Return probs for independent decoder program.
-            predict_out = fluid.layers.softmax(predict)
-            return predict_out
-        return predict
-
-
-class TransFormer(Layer):
-    def __init__(self,
-                 name_scope,
-                 src_vocab_size,
-                 trg_vocab_size,
-                 max_length,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd,
-                 postprocess_cmd,
-                 weight_sharing,
-                 label_smooth_eps,
-                 use_py_reader=False,
-                 is_test=False):
-        super(TransFormer, self).__init__(name_scope)
-        self._label_smooth_eps = label_smooth_eps
-        self._trg_vocab_size = trg_vocab_size
-        if weight_sharing:
-            assert src_vocab_size == trg_vocab_size, (
-                "Vocabularies in source and target should be same for weight sharing."
-            )
-        self._wrap_encoder_layer = WrapEncoderLayer(
-            self.full_name(), src_vocab_size, max_length, n_layer, n_head,
-            d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
-            attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
-            weight_sharing)
-        self._wrap_decoder_layer = WrapDecoderLayer(
-            self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
-            d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
-            attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
-            weight_sharing)
-
-        if weight_sharing:
-            self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
-
-    def forward(self, enc_inputs, dec_inputs, label, weights):
-        enc_output = self._wrap_encoder_layer(enc_inputs)
-        predict = self._wrap_decoder_layer(dec_inputs, enc_output)
-        if self._label_smooth_eps:
-            label_out = fluid.layers.label_smooth(
-                label=fluid.layers.one_hot(
-                    input=label, depth=self._trg_vocab_size),
-                epsilon=self._label_smooth_eps)
-
-        cost = fluid.layers.softmax_with_cross_entropy(
-            logits=predict,
-            label=label_out,
-            soft_label=True if self._label_smooth_eps else False)
-        weighted_cost = cost * weights
-        sum_cost = fluid.layers.reduce_sum(weighted_cost)
-        token_num = fluid.layers.reduce_sum(weights)
-        token_num.stop_gradient = True
-        avg_cost = sum_cost / token_num
-        return sum_cost, avg_cost, predict, token_num
-
-
-class TestDygraphTransformerSortGradient(unittest.TestCase):
-    def test_transformer_sort_gradient_float32(self):
-        seed = 90
-
-        with guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            transformer = TransFormer(
-                'transformer',
-                ModelHyperParams.src_vocab_size,
-                ModelHyperParams.trg_vocab_size,
-                ModelHyperParams.max_length + 1,
-                ModelHyperParams.n_layer,
-                ModelHyperParams.n_head,
-                ModelHyperParams.d_key,
-                ModelHyperParams.d_value,
-                ModelHyperParams.d_model,
-                ModelHyperParams.d_inner_hid,
-                ModelHyperParams.prepostprocess_dropout,
-                ModelHyperParams.attention_dropout,
-                ModelHyperParams.relu_dropout,
-                ModelHyperParams.preprocess_cmd,
-                ModelHyperParams.postprocess_cmd,
-                ModelHyperParams.weight_sharing,
-                TrainTaskConfig.label_smooth_eps,
-                use_py_reader=use_py_reader,
-                is_test=False)
-            if sync:
-                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
-                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
-                with fluid.default_main_program()._lr_schedule_guard():
-                    learning_rate = lr_decay * TrainTaskConfig.learning_rate
-                optimizer = fluid.optimizer.Adam(
-                    learning_rate=learning_rate,
-                    beta1=TrainTaskConfig.beta1,
-                    beta2=TrainTaskConfig.beta2,
-                    epsilon=TrainTaskConfig.eps)
-            else:
-                optimizer = fluid.optimizer.SGD(learning_rate=0.003)
-            dy_param_init = dict()
-            dy_param_updated = dict()
-            for i in range(batch_num):
-                enc_inputs, dec_inputs, label, weights = create_data()
-                dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer(
-                    enc_inputs, dec_inputs, label, weights)
-
-                if i == 0:
-                    for param in transformer.parameters():
-                        dy_param_init[param.name] = param.numpy()
-
-                dy_avg_cost.backward(backward_strategy)
-                optimizer.minimize(dy_avg_cost)
-                transformer.clear_gradients()
-
-                if i == batch_num - 1:
-                    for param in transformer.parameters():
-                        dy_param_updated[param.name] = param.numpy()
-
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            transformer = TransFormer(
-                'transformer',
-                ModelHyperParams.src_vocab_size,
-                ModelHyperParams.trg_vocab_size,
-                ModelHyperParams.max_length + 1,
-                ModelHyperParams.n_layer,
-                ModelHyperParams.n_head,
-                ModelHyperParams.d_key,
-                ModelHyperParams.d_value,
-                ModelHyperParams.d_model,
-                ModelHyperParams.d_inner_hid,
-                ModelHyperParams.prepostprocess_dropout,
-                ModelHyperParams.attention_dropout,
-                ModelHyperParams.relu_dropout,
-                ModelHyperParams.preprocess_cmd,
-                ModelHyperParams.postprocess_cmd,
-                ModelHyperParams.weight_sharing,
-                TrainTaskConfig.label_smooth_eps,
-                use_py_reader=use_py_reader,
-                is_test=False)
-            exe = fluid.Executor(fluid.CPUPlace(
-            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-            optimizer = fluid.optimizer.SGD(learning_rate=0.003)
-
-            data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-                                                                                     -1] + label_data_input_fields
-            all_inputs = make_all_inputs(data_input_names)
-            enc_inputs_len = len(encoder_data_input_fields)
-            dec_inputs_len = len(decoder_data_input_fields[:-1])
-            enc_inputs = all_inputs[0:enc_inputs_len]
-            dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
-                                    dec_inputs_len]
-            label = all_inputs[-2]
-            weights = all_inputs[-1]
-            static_param_updated = dict()
-            static_param_init = dict()
-            static_param_name_list = list()
-            static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
-                enc_inputs, dec_inputs, label, weights)
-            optimizer.minimize(static_avg_cost)
-            for param in transformer.parameters():
-                static_param_name_list.append(param.name)
-            out = exe.run(fluid.default_startup_program(),
-                          fetch_list=static_param_name_list)
-            for i in range(len(static_param_name_list)):
-                static_param_init[static_param_name_list[i]] = out[i]
-            static_sum_cost_value = None
-            static_avg_cost_value = None
-            static_predict_value = None
-            static_token_num_value = None
-            for i in range(batch_num):
-                feed_dict = create_feed_dict_list(create_data(True))
-                fetch_list = [
-                    static_sum_cost, static_avg_cost, static_predict,
-                    static_token_num
-                ]
-
-                fetch_list.extend(static_param_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed=feed_dict,
-                              fetch_list=fetch_list)
-                static_sum_cost_value = out[0]
-                static_avg_cost_value = out[1]
-                static_predict_value = out[2]
-                static_token_num_value = out[3]
-                if i == batch_num - 1:
-                    for k in range(4, len(out)):
-                        static_param_updated[static_param_name_list[k -
-                                                                    4]] = out[k]
-
-        self.assertTrue(
-            np.array_equal(static_avg_cost_value, dy_avg_cost.numpy()))
-        self.assertTrue(
-            np.array_equal(static_sum_cost_value, dy_sum_cost.numpy()))
-        self.assertTrue(
-            np.array_equal(static_predict_value, dy_predict.numpy()))
-        self.assertTrue(
-            np.array_equal(static_token_num_value, dy_token_num.numpy()))
-
-        for key, value in six.iteritems(static_param_init):
-            self.assertTrue(np.array_equal(value, dy_param_init[key]))
-        for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_infer_shape.py b/python/paddle/fluid/tests/unittests/test_infer_shape.py
deleted file mode 100644
index 553ebaec7f1bc69ce1bbecd6c88283d6207c179c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import six
-import paddle.fluid.core as core
-
-
-class TestInferShape(unittest.TestCase):
-    def test_sum_op(self):
-        prog = core.ProgramDesc()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
-        self.assertIsNotNone(block)
-
-        shape = [10, 20]
-
-        # prepare input/output
-        x1 = block.var(six.b("x1"))
-        x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
-        x1.set_shape(shape)
-        x2 = block.var(six.b("x2"))
-        x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
-        x2.set_shape(shape)
-
-        out = block.var(six.b("out"))
-        out.set_type(core.VarDesc.VarType.LOD_TENSOR)
-
-        # prepare the operator
-        sum_op_desc = block.append_op()
-        sum_op_desc.set_type("sum")
-        sum_op_desc.set_input("X", ["x1", "x2"])
-        sum_op_desc.set_output("Out", ["out"])
-
-        sum_op_desc.check_attrs()
-        sum_op_desc.infer_shape(block)
-        self.assertEqual(out.shape(), shape)
-
-    def test_mul_op(self):
-        prog = core.ProgramDesc()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
-        self.assertIsNotNone(block)
-
-        x_shape = [10, 20]
-        y_shape = [20, 30]
-
-        # prepare input/output
-        x1 = block.var(six.b("x"))
-        x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
-        x1.set_shape(x_shape)
-        x2 = block.var(six.b("y"))
-        x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
-        x2.set_shape(y_shape)
-
-        out = block.var(six.b("out"))
-        out.set_type(core.VarDesc.VarType.LOD_TENSOR)
-
-        # prepare the operator
-        mul_op_desc = block.append_op()
-        mul_op_desc.set_type("mul")
-        mul_op_desc.set_input("X", ["x"])
-        mul_op_desc.set_input("Y", ["y"])
-        mul_op_desc.set_output("Out", ["out"])
-        mul_op_desc._set_attr("x_num_col_dims", 1)
-        mul_op_desc._set_attr("y_num_col_dims", 1)
-
-        mul_op_desc.check_attrs()
-        mul_op_desc.infer_shape(block)
-        self.assertEqual(out.shape(), [x_shape[0], y_shape[1]])
-
-    def test_expand_op(self):
-        prog = core.ProgramDesc()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
-        self.assertIsNotNone(block)
-
-        shape = [-1, 20]
-        expand_times = [3, 1]
-
-        # prepare input/output
-        x1 = block.var(six.b("x"))
-        x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
-        x1.set_shape(shape)
-
-        out = block.var(six.b("out"))
-        out.set_type(core.VarDesc.VarType.LOD_TENSOR)
-
-        # prepare the operator
-        sum_op_desc = block.append_op()
-        sum_op_desc.set_type("expand")
-        sum_op_desc.set_input("X", ["x"])
-        sum_op_desc.set_input('expand_times_tensor', [])
-        sum_op_desc.set_output("Out", ["out"])
-        sum_op_desc._set_attr('expand_times', expand_times)
-
-        sum_op_desc.check_attrs()
-        sum_op_desc.infer_shape(block)
-        self.assertEqual(out.shape(), shape)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inference_api.py b/python/paddle/fluid/tests/unittests/test_inference_api.py
deleted file mode 100644
index c6491b719a39cacee8a76af864305b4836836457..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_inference_api.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, shutil
-import unittest
-import numpy as np
-import paddle.fluid as fluid
-from paddle.fluid.core import PaddleTensor
-from paddle.fluid.core import PaddleDType
-
-
-class TestInferenceApi(unittest.TestCase):
-    def test_inference_api(self):
-        tensor32 = np.random.randint(10, 20, size=[20, 2]).astype('int32')
-        paddletensor32 = PaddleTensor(tensor32)
-        value32 = np.array(paddletensor32.data.int32_data()).reshape(*[20, 2])
-        dtype32 = paddletensor32.dtype
-        self.assertEqual(value32.all(), tensor32.all())
-        self.assertEqual(dtype32, PaddleDType.INT32)
-        self.assertEqual(
-            type(paddletensor32.data.tolist('int32')), type(tensor32.tolist()))
-        self.assertEqual(
-            paddletensor32.data.tolist('int32'), tensor32.ravel().tolist())
-        self.assertEqual(type(paddletensor32.as_ndarray()), type(tensor32))
-        paddletensor32.data.reset(tensor32)
-        self.assertEqual(paddletensor32.as_ndarray().all(), tensor32.all())
-
-        tensor64 = np.random.randint(10, 20, size=[20, 2]).astype('int64')
-        paddletensor64 = PaddleTensor(tensor64)
-        value64 = np.array(paddletensor64.data.int64_data()).reshape(*[20, 2])
-        dtype64 = paddletensor64.dtype
-        self.assertEqual(value64.all(), tensor64.all())
-        self.assertEqual(dtype64, PaddleDType.INT64)
-        self.assertEqual(
-            type(paddletensor64.data.tolist('int64')), type(tensor64.tolist()))
-        self.assertEqual(
-            paddletensor64.data.tolist('int64'), tensor64.ravel().tolist())
-        self.assertEqual(type(paddletensor64.as_ndarray()), type(tensor64))
-        paddletensor64.data.reset(tensor64)
-        self.assertEqual(paddletensor64.as_ndarray().all(), tensor64.all())
-
-        tensor_float = np.random.randn(20, 2).astype('float32')
-        paddletensor_float = PaddleTensor(tensor_float)
-        value_float = np.array(paddletensor_float.data.float_data()).reshape(
-            *[20, 2])
-        dtype_float = paddletensor_float.dtype
-        self.assertEqual(value_float.all(), tensor_float.all())
-        self.assertEqual(dtype_float, PaddleDType.FLOAT32)
-        self.assertEqual(
-            type(paddletensor_float.data.tolist('float32')),
-            type(tensor_float.tolist()))
-        self.assertEqual(
-            paddletensor_float.data.tolist('float32'),
-            tensor_float.ravel().tolist())
-        self.assertEqual(
-            type(paddletensor_float.as_ndarray()), type(tensor_float))
-        paddletensor_float.data.reset(tensor_float)
-        self.assertEqual(paddletensor_float.as_ndarray().all(),
-                         tensor_float.all())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
deleted file mode 100644
index 7060449041c095d8ceb3053fe140334d506772d4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ /dev/null
@@ -1,175 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import six
-import numpy as np
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-import warnings
-
-import paddle.fluid.executor as executor
-import paddle.fluid.layers as layers
-import paddle.fluid.optimizer as optimizer
-from paddle.fluid.compiler import CompiledProgram
-from paddle.fluid.framework import Program, program_guard
-from paddle.fluid.io import save_inference_model, load_inference_model
-from paddle.fluid.transpiler import memory_optimize
-
-
-class TestBook(unittest.TestCase):
-    def test_fit_line_inference_model(self):
-        MODEL_DIR = "./tmp/inference_model"
-
-        init_program = Program()
-        program = Program()
-
-        with program_guard(program, init_program):
-            x = layers.data(name='x', shape=[2], dtype='float32')
-            y = layers.data(name='y', shape=[1], dtype='float32')
-
-            y_predict = layers.fc(input=x, size=1, act=None)
-
-            cost = layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = layers.mean(cost)
-
-            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-            sgd_optimizer.minimize(avg_cost, init_program)
-
-        place = core.CPUPlace()
-        exe = executor.Executor(place)
-
-        exe.run(init_program, feed={}, fetch_list=[])
-
-        for i in six.moves.xrange(100):
-            tensor_x = np.array(
-                [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32")
-            tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
-
-            exe.run(program,
-                    feed={'x': tensor_x,
-                          'y': tensor_y},
-                    fetch_list=[avg_cost])
-
-        save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program)
-        expected = exe.run(program,
-                           feed={'x': tensor_x,
-                                 'y': tensor_y},
-                           fetch_list=[avg_cost])[0]
-
-        six.moves.reload_module(executor)  # reload to build a new scope
-        exe = executor.Executor(place)
-
-        [infer_prog, feed_var_names, fetch_vars] = load_inference_model(
-            MODEL_DIR, exe)
-
-        outs = exe.run(
-            infer_prog,
-            feed={feed_var_names[0]: tensor_x,
-                  feed_var_names[1]: tensor_y},
-            fetch_list=fetch_vars)
-        actual = outs[0]
-
-        self.assertEqual(feed_var_names, ["x", "y"])
-        self.assertEqual(len(fetch_vars), 1)
-        print("fetch %s" % str(fetch_vars[0]))
-        self.assertTrue("scale" in str(fetch_vars[0]))
-        self.assertEqual(expected, actual)
-
-
-class TestSaveInferenceModel(unittest.TestCase):
-    def test_save_inference_model(self):
-        MODEL_DIR = "./tmp/inference_model2"
-        init_program = Program()
-        program = Program()
-
-        # fake program without feed/fetch
-        with program_guard(program, init_program):
-            x = layers.data(name='x', shape=[2], dtype='float32')
-            y = layers.data(name='y', shape=[1], dtype='float32')
-
-            y_predict = layers.fc(input=x, size=1, act=None)
-
-            cost = layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = layers.mean(cost)
-
-        place = core.CPUPlace()
-        exe = executor.Executor(place)
-        exe.run(init_program, feed={}, fetch_list=[])
-
-        save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program)
-
-    def test_save_inference_model_with_auc(self):
-        MODEL_DIR = "./tmp/inference_model4"
-        init_program = Program()
-        program = Program()
-
-        # fake program without feed/fetch
-        with program_guard(program, init_program):
-            x = layers.data(name='x', shape=[2], dtype='float32')
-            y = layers.data(name='y', shape=[1], dtype='float32')
-            predict = fluid.layers.fc(input=x, size=2, act='softmax')
-            acc = fluid.layers.accuracy(input=predict, label=y)
-            auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
-                                                                  label=y)
-            cost = fluid.layers.cross_entropy(input=predict, label=y)
-            avg_cost = fluid.layers.mean(x=cost)
-
-        place = core.CPUPlace()
-        exe = executor.Executor(place)
-        exe.run(init_program, feed={}, fetch_list=[])
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe,
-                                 program)
-            expected_warn = "please ensure that you have set the auc states to zeros before saving inference model"
-            self.assertTrue(len(w) > 0)
-            self.assertTrue(expected_warn == str(w[0].message))
-
-
-class TestInstance(unittest.TestCase):
-    def test_save_inference_model(self):
-        MODEL_DIR = "./tmp/inference_model3"
-        init_program = Program()
-        program = Program()
-
-        # fake program without feed/fetch
-        with program_guard(program, init_program):
-            x = layers.data(name='x', shape=[2], dtype='float32')
-            y = layers.data(name='y', shape=[1], dtype='float32')
-
-            y_predict = layers.fc(input=x, size=1, act=None)
-
-            cost = layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = layers.mean(cost)
-
-        place = core.CPUPlace()
-        exe = executor.Executor(place)
-        exe.run(init_program, feed={}, fetch_list=[])
-
-        # will print warning message
-
-        cp_prog = CompiledProgram(program).with_data_parallel(
-            loss_name=avg_cost.name)
-
-        save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, cp_prog)
-        self.assertRaises(TypeError, save_inference_model,
-                          [MODEL_DIR, ["x", "y"], [avg_cost], [], cp_prog])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
deleted file mode 100644
index c6bed4db72e50135fba7b22f805efb281c178e2d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ /dev/null
@@ -1,520 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-
-import paddle.fluid.framework as framework
-import paddle.fluid.initializer as initializer
-from paddle.fluid.core import VarDesc
-
-DELTA = 0.00001
-
-
-def check_cast_op(op):
-    return op.type == 'cast' and \
-           op.attr('in_dtype') == VarDesc.VarType.FP32 and \
-           op.attr('out_dtype') == VarDesc.VarType.FP16
-
-
-class TestConstantInitializer(unittest.TestCase):
-    def test_constant_initializer_default_value(self, dtype="float32"):
-        """Test the constant initializer with default value
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.ConstantInitializer())
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'fill_constant')
-        self.assertAlmostEqual(init_op.attr('value'), 0.0, delta=DELTA)
-        return block
-
-    def test_constant_initializer(self, dtype="float32"):
-        """Test constant initializer with supplied value
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.ConstantInitializer(2.3))
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'fill_constant')
-        self.assertAlmostEqual(init_op.attr('value'), 2.3, delta=DELTA)
-        return block
-
-    def test_constant_initializer_fp16(self):
-        """Test constant initializer with float16
-        """
-        block = self.test_constant_initializer_default_value("float16")
-        self.assertTrue(check_cast_op(block.ops[1]))
-        block = self.test_constant_initializer("float16")
-        self.assertTrue(check_cast_op(block.ops[1]))
-
-
-class TestUniformInitializer(unittest.TestCase):
-    def test_uniform_initializer_default_value(self, dtype="float32"):
-        """Test the uniform initializer with default value
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.UniformInitializer())
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'uniform_random')
-        self.assertAlmostEqual(init_op.attr('min'), -1.0, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('max'), 1.0, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-        return block
-
-    def test_uniform_initializer_random_seed(self):
-        """Test the uniform initializer with manually setting seed
-        """
-        program = framework.Program()
-        program.random_seed = 123
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param1",
-                initializer=initializer.UniformInitializer())
-            block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param2",
-                initializer=initializer.UniformInitializer(seed=456))
-        init_op = block.ops[1]
-        self.assertEqual(init_op.attr("seed"), 123)
-        init_op1 = block.ops[0]
-        self.assertEqual(init_op1.attr("seed"), 456)
-
-    def test_uniform_initializer(self, dtype="float32"):
-        """Test uniform initializer with supplied attributes
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.UniformInitializer(-4.2, 3.1, 123))
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'uniform_random')
-        self.assertAlmostEqual(init_op.attr('min'), -4.2, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 123)
-        return block
-
-    def test_uniform_initializer_two_op(self, dtype="float32"):
-        """Test uniform initializer with supplied attributes
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for i in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.UniformInitializer(-4.2, float(i), 123))
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op0 = block.ops[0]
-        self.assertEqual(init_op0.type, 'uniform_random')
-        self.assertAlmostEqual(init_op0.attr('min'), -4.2, delta=DELTA)
-        self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA)
-        self.assertEqual(init_op0.attr('seed'), 123)
-        return block
-
-    def test_uniform_initializer_fp16(self):
-        """Test uniform initializer with float16
-        """
-        block = self.test_uniform_initializer_default_value("float16")
-        self.assertTrue(check_cast_op(block.ops[1]))
-        block = self.test_uniform_initializer(dtype="float16")
-        self.assertTrue(check_cast_op(block.ops[1]))
-        block = self.test_uniform_initializer_two_op("float16")
-        self.assertTrue(check_cast_op(block.ops[1]))
-
-
-class TestNormalInitializer(unittest.TestCase):
-    def test_normal_initializer_default_value(self):
-        """Test the normal initializer with default value
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.NormalInitializer())
-        self.assertEqual(len(block.ops), 1)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'gaussian_random')
-        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('std'), 1.0, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-
-    def test_normal_initializer(self, dtype="float32"):
-        """Test normal initializer with supplied attributes
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.NormalInitializer(2.3, 1.9, 123))
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'gaussian_random')
-        self.assertAlmostEqual(init_op.attr('mean'), 2.3, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('std'), 1.9, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 123)
-        return block
-
-    def test_normal_initializer_fp16(self):
-        """Test normal initializer with float16
-        """
-        block = self.test_normal_initializer("float16")
-        self.assertTrue(check_cast_op(block.ops[1]))
-
-
-class TestXavierInitializer(unittest.TestCase):
-    def test_uniform_xavier_initializer(self):
-        """Test Xavier initializer with uniform distribution on
-           for matrix multiply.
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            param = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.XavierInitializer())
-        self.assertEqual(len(block.ops), 1)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'uniform_random')
-        limit = np.sqrt(6.0 / (param.shape[0] + param.shape[1]))
-        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-
-    def test_uniform_xavier_initializer_conv(self):
-        """Test Xavier initializer with uniform distribution on
-           for convolutions.
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            param = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10, 15, 20],
-                lod_level=0,
-                name="param",
-                initializer=initializer.XavierInitializer())
-        self.assertEqual(len(block.ops), 1)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'uniform_random')
-        receptive_field_size = float(15 * 20)
-        limit = np.sqrt(6.0 / (
-            (param.shape[0] + param.shape[1]) * receptive_field_size))
-        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-
-    def test_normal_xavier_initializer(self):
-        """Test Xavier initializer with normal distribution on
-           for matrix multiply.
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            param = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.XavierInitializer(uniform=False))
-        self.assertEqual(len(block.ops), 1)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'gaussian_random')
-        std = np.sqrt(2.0 / (param.shape[0] + param.shape[1]))
-        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-
-    def test_normal_xavier_initializer_conv(self):
-        """Test Xavier initializer with normal distribution on
-           for convolutions.
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            param = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10, 15, 20],
-                lod_level=0,
-                name="param",
-                initializer=initializer.XavierInitializer(uniform=False))
-        self.assertEqual(len(block.ops), 1)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'gaussian_random')
-        receptive_field_size = float(15 * 20)
-        std = np.sqrt(2.0 / (
-            (param.shape[0] + param.shape[1]) * receptive_field_size))
-        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-
-    def test_xavier_initializer_supplied_arguments(self, dtype="float32"):
-        """Test the Xavier initializer with supplied arguments
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.XavierInitializer(
-                    fan_in=12, fan_out=23, seed=134))
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'uniform_random')
-        limit = np.sqrt(6.0 / (12 + 23))
-        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 134)
-        return block
-
-    def test_xavier_initializer_fp16(self):
-        """Test the Xavier initializer with float16
-        """
-        block = self.test_xavier_initializer_supplied_arguments("float16")
-        self.assertTrue(check_cast_op(block.ops[1]))
-
-
-class TestMSRAInitializer(unittest.TestCase):
-    def test_uniform_msra_initializer(self):
-        """Test MSRA initializer with uniform distribution on
-           for matrix multiply.
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            param = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.MSRAInitializer())
-        self.assertEqual(len(block.ops), 1)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'uniform_random')
-        limit = np.sqrt(6.0 / param.shape[0])
-        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-
-    def test_uniform_msra_initializer_conv(self):
-        """Test MSRA initializer with uniform distribution on
-           for convolutions.
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            param = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10, 15, 20],
-                lod_level=0,
-                name="param",
-                initializer=initializer.MSRAInitializer())
-        self.assertEqual(len(block.ops), 1)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'uniform_random')
-        receptive_field_size = float(15 * 20)
-        limit = np.sqrt(6.0 / (param.shape[1] * receptive_field_size))
-        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-
-    def test_normal_msra_initializer(self):
-        """Test MSRA initializer with normal distribution on
-           for matrix multiply.
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            param = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.MSRAInitializer(uniform=False))
-        self.assertEqual(len(block.ops), 1)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'gaussian_random')
-        std = np.sqrt(2.0 / param.shape[0])
-        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-
-    def test_normal_msra_initializer_conv(self):
-        """Test MSRA initializer with normal distribution on
-           for convolutions.
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            param = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10, 15, 20],
-                lod_level=0,
-                name="param",
-                initializer=initializer.MSRAInitializer(uniform=False))
-        self.assertEqual(len(block.ops), 1)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'gaussian_random')
-        receptive_field_size = float(15 * 20)
-        std = np.sqrt(2.0 / (param.shape[1] * receptive_field_size))
-        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 0)
-
-    def test_msra_initializer_supplied_arguments(self, dtype="float32"):
-        """Test the MSRA initializer with supplied arguments
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.MSRAInitializer(
-                    fan_in=12, seed=134))
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'uniform_random')
-        limit = np.sqrt(6.0 / 12)
-        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
-        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
-        self.assertEqual(init_op.attr('seed'), 134)
-        return block
-
-    def test_msra_initializer_fp16(self):
-        """Test the MSRA initializer with float16
-        """
-        block = self.test_msra_initializer_supplied_arguments("float16")
-        self.assertTrue(check_cast_op(block.ops[1]))
-
-
-class TestBilinearInitializer(unittest.TestCase):
-    def test_bilinear_initializer(self, dtype="float32"):
-        """Test the bilinear initializer with supplied arguments
-        """
-        program = framework.Program()
-        block = program.global_block()
-        for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[8, 1, 3, 3],
-                lod_level=0,
-                name="param",
-                initializer=initializer.BilinearInitializer())
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'assign_value')
-        return block
-
-    def test_bilinear_initializer_fp16(self):
-        """Test the bilinear initializer with supplied arguments
-        """
-        block = self.test_bilinear_initializer("float16")
-        self.assertTrue(check_cast_op(block.ops[1]))
-
-
-class TestNumpyArrayInitializer(unittest.TestCase):
-    def test_numpy_array_initializer(self, dtype="float32"):
-        """Test the numpy array initializer with supplied arguments
-        """
-        import numpy
-        program = framework.Program()
-        block = program.global_block()
-        np_array = numpy.random.random((10000)).astype(dtype)
-        for _ in range(2):
-            block.create_parameter(
-                dtype=np_array.dtype,
-                shape=np_array.shape,
-                lod_level=0,
-                name="param",
-                initializer=initializer.NumpyArrayInitializer(np_array))
-        num_ops = 2 if dtype == "float16" else 1
-        self.assertEqual(len(block.ops), num_ops)
-        init_op = block.ops[0]
-        self.assertEqual(init_op.type, 'assign_value')
-        assert (init_op.attr('fp32_values') == np_array).all()
-        return block
-
-    def test_numpy_array_initializer_fp16(self):
-        """Test the numpy array initializer with float16
-        """
-        block = self.test_numpy_array_initializer("float16")
-        self.assertTrue(block.ops[1])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py
deleted file mode 100644
index 90666d4ebb6e6069ff74ec5efb2834d3c384b1bc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import layers
-import numpy as np
-import unittest
-
-
-class TestSoftmaxWithXe(unittest.TestCase):
-    def setUp(self):
-        self.initParameter()
-        self.m, self.n = np.random.random_integers(
-            low=100, high=2000, size=[2]).astype('int64')
-
-    def initParameter(self):
-        self.dtype = 'float32'
-        self.soft_label = False
-
-    def softmax_with_xe(self,
-                        x,
-                        y,
-                        place,
-                        inplace=True,
-                        numeric_stable_mode=True):
-        m, n = x.shape
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            with fluid.scope_guard(fluid.Scope()):
-                x_d = fluid.layers.data(
-                    name='x',
-                    shape=[m, n],
-                    dtype=self.dtype,
-                    append_batch_size=False)
-                y_d = fluid.layers.data(
-                    name='y',
-                    shape=[m, 1] if not self.soft_label else [m, n],
-                    dtype='int64' if not self.soft_label else self.dtype,
-                    append_batch_size=False)
-                z_d, s_d = fluid.layers.softmax_with_cross_entropy(
-                    x_d,
-                    y_d,
-                    soft_label=self.soft_label,
-                    return_softmax=True,
-                    numeric_stable_mode=numeric_stable_mode)
-
-                exe = fluid.Executor(place)
-
-                exe.run(fluid.default_startup_program())
-
-                build_strategy = fluid.BuildStrategy()
-                build_strategy.enable_inplace = inplace
-                prog = fluid.CompiledProgram(fluid.default_main_program(
-                )).with_data_parallel(
-                    build_strategy=build_strategy, places=place)
-
-                fetch_list = [z_d.name, s_d.name]
-
-                print('Inplace is {}'.format("ON" if inplace else "OFF"))
-
-                z, s = exe.run(prog,
-                               feed={x_d.name: x,
-                                     y_d.name: y},
-                               fetch_list=fetch_list)
-                return z, s
-
-    def main_with_place(self, place):
-        x = np.random.random(size=[self.m, self.n]).astype(self.dtype)
-        x_range = [(-30, 30), (10, 20), (-1, 1), (2, 3), (0, 0.3), (-200, -100)]
-
-        for a, b in x_range:
-            x = ((b - a) * x + a).astype(self.dtype)
-            if not self.soft_label:
-                y = np.random.random_integers(
-                    size=[self.m, 1], low=0, high=self.n - 1).astype('int64')
-            else:
-                y = np.random.random(size=[self.m, self.n]).astype(self.dtype)
-                norm_y = np.broadcast_to(
-                    np.reshape(
-                        np.sum(y, axis=1), [-1, 1]), y.shape)
-                y = y / norm_y
-
-            z1, s1 = self.softmax_with_xe(
-                x, y, place, inplace=False, numeric_stable_mode=False)
-            z2, s2 = self.softmax_with_xe(
-                x, y, place, inplace=True, numeric_stable_mode=False)
-
-            self.assertTrue((z1 == z2).all())
-            self.assertTrue((s1 == s2).all())
-
-            z1, s1 = self.softmax_with_xe(
-                x, y, place, inplace=False, numeric_stable_mode=True)
-            z2, s2 = self.softmax_with_xe(
-                x, y, place, inplace=True, numeric_stable_mode=True)
-            self.assertTrue((z1 == z2).all())
-            self.assertTrue((s1 == s2).all())
-
-    def test_main(self):
-        self.main_with_place(fluid.CPUPlace())
-        if fluid.core.is_compiled_with_cuda():
-            self.main_with_place(fluid.CUDAPlace(0))
-
-
-class TestSoftmaxWithXe1(TestSoftmaxWithXe):
-    def initParameter(self):
-        self.dtype = 'float32'
-        self.soft_label = True
-
-
-class TestSoftmaxWithXe2(TestSoftmaxWithXe):
-    def initParameter(self):
-        self.dtype = 'float64'
-        self.soft_label = False
-
-
-class TestSoftmaxWithXe3(TestSoftmaxWithXe):
-    def initParameter(self):
-        self.dtype = 'float64'
-        self.soft_label = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_install_check.py b/python/paddle/fluid/tests/unittests/test_install_check.py
deleted file mode 100644
index 5cb199d4967a49f11d656818676bfe855a957bda..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_install_check.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import paddle.fluid as fluid
-
-
-class TestInstallCheck(unittest.TestCase):
-    def test_install_check(self):
-        fluid.install_check.run_check()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py
deleted file mode 100644
index ccdf12849c7ee61fad5916aea0403b760a0302db..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py
+++ /dev/null
@@ -1,195 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid.op import Operator
-from op_test import OpTest
-
-
-def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var):
-    x_shape = x.shape
-    if len(x_shape) == 2:
-        x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-    n, c, h, w = x.shape
-
-    mean_tile = np.reshape(mean, (n, c, 1, 1))
-    mean_tile = np.tile(mean_tile, (1, 1, h, w))
-    var_tile = np.reshape(var, (n, c, 1, 1))
-    var_tile = np.tile(var_tile, (1, 1, h, w))
-
-    x_norm = (x - mean_tile) / np.sqrt(var_tile + epsilon).astype('float32')
-    scale_tile = np.reshape(scale, (1, c, 1, 1))
-    scale_tile = np.tile(scale_tile, (n, 1, h, w))
-    bias_tile = np.reshape(bias, (1, c, 1, 1))
-    bias_tile = np.tile(bias_tile, (n, 1, h, w))
-    y = scale_tile * x_norm + bias_tile
-    if len(x_shape) == 2:
-        y = np.reshape(y, x_shape)
-    return y, mean, var
-
-
-def _reference_instance_norm_grad(x, d_y, scale, mean, var, epsilon):
-    # d_scale = sum(d_y * (x-mean) / sqrt(var+epsilon))
-    # d_offset = sum(d_y)
-    # d_x = scale / sqrt(var+epsilon) * (d_y - np.mean(d_y, axis=(2,3)) - (x-mean)/sqrt(var+epsilon)* np.mean(y_grad * (x-mean)/sqrt(var+epsilon), axis=(2,3)))
-    n, c, h, w = x.shape
-
-    d_bias = np.sum(d_y, axis=(0, 2, 3))
-
-    mean_tile = np.reshape(mean, (n, c, 1, 1))
-    mean_tile = np.tile(mean_tile, (1, 1, h, w))
-    var_tile = np.reshape(var, (n, c, 1, 1))
-    var_tile = np.tile(var_tile, (1, 1, h, w))
-
-    d_scale = np.sum(d_y * (x - mean_tile) * var_tile, axis=(0, 2, 3))
-    var_inv = var_tile
-    scale_tile = np.reshape(scale, (1, c, 1, 1))
-    scale_tile = np.tile(scale_tile, (n, 1, h, w))
-
-    d_x = scale_tile * var_inv * (d_y - np.mean(
-        d_y, axis=(2, 3), keepdims=True) - (x - mean_tile) * var_inv * np.mean(
-            d_y * (x - mean_tile) * var_inv, axis=(2, 3), keepdims=True))
-    return d_x, d_scale, d_bias
-
-
-def _cal_mean_variance(x, epsilon, mean_shape):
-    mean = np.reshape(np.mean(x, axis=(2, 3)), mean_shape)
-    var = np.reshape(np.var(x, axis=(2, 3)), mean_shape)
-    return mean, var
-
-
-class TestInstanceNormOpTraining(unittest.TestCase):
-    def setUp(self):
-        self.epsilon = 1e-5
-        self.init_test_case()
-
-    def init_test_case(self):
-        self.use_global_stats = False
-        self.no_grad_set = set()
-        self.fetch_list = [
-            'y', 'saved_mean', 'saved_variance', 'x@GRAD', 'scale@GRAD',
-            'bias@GRAD'
-        ]
-
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
-
-    def set_global_mean_var(self, mean_shape, x):
-        mean, variance = _cal_mean_variance(x, self.epsilon, mean_shape)
-        return mean, variance
-
-    def test_forward_backward(self):
-        def test_with_place(place, shape):
-            epsilon = self.epsilon
-            n, c, h, w = shape[0], shape[1], shape[2], shape[3]
-            scale_shape = [c]
-            mean_shape = [n * c]
-
-            np.random.seed()
-            x = np.random.random_sample(shape).astype(np.float32)
-            scale = np.random.random_sample(scale_shape).astype(np.float32)
-            bias = np.random.random_sample(scale_shape).astype(np.float32)
-            mean, variance = self.set_global_mean_var(mean_shape, x)
-            d_y = np.random.random_sample(shape).astype(np.float32)
-
-            y, saved_mean, variance_tmp = _reference_instance_norm_naive(
-                x, scale, bias, epsilon, mean, variance)
-
-            saved_variance = 1 / np.sqrt(variance_tmp + epsilon)
-
-            d_x, d_scale, d_bias = _reference_instance_norm_grad(
-                x, d_y, scale, saved_mean, saved_variance, epsilon)
-
-            var_dict = locals()
-            var_dict['y@GRAD'] = d_y
-            var_dict['x@GRAD'] = d_x
-            var_dict['scale@GRAD'] = d_scale
-            var_dict['bias@GRAD'] = d_bias
-
-            var_names = [
-                'x', 'scale', 'bias', 'y', 'saved_mean', 'saved_variance'
-            ]
-            ground_truth = {name: var_dict[name] for name in var_names}
-
-            program = fluid.Program()
-            with fluid.program_guard(program):
-                block = program.global_block()
-                for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
-                in_op = block.append_op(
-                    type="instance_norm",
-                    inputs={
-                        "X": block.var("x"),
-                        "Scale": block.var("scale"),
-                        "Bias": block.var("bias"),
-                    },
-                    outputs={
-                        "Y": block.var("y"),
-                        "SavedMean": block.var("saved_mean"),
-                        "SavedVariance": block.var("saved_variance")
-                    },
-                    attrs={"epsilon": epsilon, })
-
-                block.create_var(name="y@GRAD", dtype='float32', shape=y.shape)
-
-                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    in_op.desc, self.no_grad_set, [])
-                grad_op_desc = grad_op_desc_list[0]
-                new_op_desc = block.desc.append_op()
-                new_op_desc.copy_from(grad_op_desc)
-                for var_name in grad_op_desc.output_arg_names():
-                    block.desc.var(var_name.encode("ascii"))
-                grad_op_desc.infer_var_type(block.desc)
-                grad_op_desc.infer_shape(block.desc)
-                for arg in grad_op_desc.output_arg_names():
-                    grad_var = block.desc.find_var(arg.encode("ascii"))
-                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-                exe = fluid.Executor(place)
-                out = exe.run(program,
-                              feed={
-                                  name: var_dict[name]
-                                  for name in ['x', 'scale', 'bias', 'y@GRAD']
-                              },
-                              fetch_list=self.fetch_list)
-
-            for id, name in enumerate(self.fetch_list):
-                self.__assert_close(var_dict[name], out[id], name)
-            print("op test forward passes: ", str(place))
-
-        places = [core.CPUPlace()]
-
-        if core.is_compiled_with_cuda() and core.op_support_gpu(
-                "instance_norm"):
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            test_with_place(place, [2, 3, 4, 5])
-
-
-class TestInstanceNormOpTrainingCase1(TestInstanceNormOpTraining):
-    def init_test_case(self):
-        self.use_global_stats = False
-        self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
-        self.fetch_list = ['y', 'saved_mean', 'saved_variance', 'x@GRAD']
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
deleted file mode 100644
index 7c1808cf998e84c22c46df68ef07259c1a021c19..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import numpy.random as random
-import sys
-import math
-from op_test import OpTest
-
-
-class TestIOUSimilarityOp(OpTest):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "iou_similarity"
-        self.boxes1 = random.rand(2, 4).astype('float32')
-        self.boxes2 = random.rand(3, 4).astype('float32')
-        self.output = random.rand(2, 3).astype('float32')
-        for row in range(self.boxes1.shape[0]):
-            for col in range(self.boxes2.shape[0]):
-                xmin1, ymin1, xmax1, ymax1 = self.boxes1[row]
-                xmin2, ymin2, xmax2, ymax2 = self.boxes2[col]
-                area1 = (ymax1 - ymin1) * (xmax1 - xmin1)
-                area2 = (ymax2 - ymin2) * (xmax2 - xmin2)
-                inter_xmax = min(xmax1, xmax2)
-                inter_ymax = min(ymax1, ymax2)
-                inter_xmin = max(xmin1, xmin2)
-                inter_ymin = max(ymin1, ymin2)
-                inter_height = inter_ymax - inter_ymin
-                inter_width = inter_xmax - inter_xmin
-                inter_height = max(inter_height, 0)
-                inter_width = max(inter_width, 0)
-                inter_area = inter_width * inter_height
-                union_area = area1 + area2 - inter_area
-                sim_score = inter_area / union_area
-                self.output[row, col] = sim_score
-        self.inputs = {'X': self.boxes1, 'Y': self.boxes2}
-
-        self.outputs = {'Out': self.output}
-
-
-class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        super(TestIOUSimilarityOpWithLoD, self).setUp()
-        self.boxes1_lod = [[1, 1]]
-        self.output_lod = [[1, 1]]
-
-        self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
-        self.outputs = {'Out': (self.output, self.output_lod)}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ir_graph.py b/python/paddle/fluid/tests/unittests/test_ir_graph.py
deleted file mode 100644
index ba6e4a8b2effade67821f5da9c2bbf7849a8cf79..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_ir_graph.py
+++ /dev/null
@@ -1,146 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-import six
-from paddle import fluid
-
-
-class TestIRGraph(unittest.TestCase):
-    """
-    TODO(fc500110): `resolve_hazard` api will be tested when it can be used.
-    """
-
-    def test_nodes(self):
-        graph = build_graph()
-        self.assertTrue(
-            {node.name()
-             for node in graph.nodes()} == {"x1", "x2", "out", "sum"})
-
-    def test_has_set_get(self):
-        graph = build_graph()
-        for attr_name in ["int", "float", "string"]:
-            self.assertFalse(graph.has(attr_name))
-        graph.set("int", 1)
-        graph.set("float", 0.5)
-        graph.set("string", "string")
-        for attr_name in ["int", "float", "string"]:
-            self.assertTrue(graph.has(attr_name))
-
-        self.assertTrue(graph.get_int("int") == 1)
-        self.assertTrue(graph.get_float("float") == 0.5)
-        self.assertTrue(graph.get_string("string") == "string")
-
-    def test_erase(self):
-        graph = build_graph()
-        graph.set("test", 0)
-        self.assertTrue(graph.has("test"))
-        graph.erase("test")
-        self.assertFalse(graph.has("test"))
-
-    def test_create_var_node(self):
-        prog = fluid.core.ProgramDesc()
-        block = prog.block(0)
-        shape = [10, 20]
-        x1 = block.var(six.b("x1"))
-        x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
-        x1.set_shape(shape)
-        graph = fluid.core.Graph(prog)
-        node = graph.create_var_node(x1)
-        self.assertTrue(node.node_type() == fluid.core.Node.Type.Variable)
-
-    def test_create_op_node(self):
-        prog = fluid.core.ProgramDesc()
-        block = prog.block(0)
-        sum_op_desc = block.append_op()
-        graph = fluid.core.Graph(prog)
-        node = graph.create_op_node(sum_op_desc)
-        self.assertTrue(node.node_type() == fluid.core.Node.Type.Operation)
-
-    def test_create_control_dep_var(self):
-        graph = build_graph()
-        name = "__control_var@{}".format(len(graph.nodes()))
-        node = graph.create_control_dep_var()
-        self.assertTrue(node.name() == name)
-
-    def test_create_empty_node(self):
-        prog = fluid.core.ProgramDesc()
-        graph = fluid.core.Graph(prog)
-        n1 = graph.create_empty_node('x', fluid.core.Node.Type.Operation)
-        self.assertTrue(n1.name() == 'x')
-        n2 = graph.create_empty_node('y', fluid.core.Node.Type.Variable)
-        self.assertTrue(n2.name() == 'y')
-
-    def test_release_nodes(self):
-        graph = build_graph()
-        nodes = graph.release_nodes()
-        self.assertTrue(len(graph.nodes()) == 0)
-        self.assertTrue({node.name()
-                         for node in nodes} == {"x1", "x2", "out", "sum"})
-
-    def test_remove_node(self):
-        graph = build_graph()
-        nodes = graph.nodes()
-        for node in nodes:
-            if node.name() == "sum":
-                break
-        self.assertTrue({node.name()
-                         for node in nodes} == {"x1", "x2", "out", "sum"})
-        nodes.remove(node)
-        self.assertTrue({node.name() for node in nodes} == {"x1", "x2", "out"})
-
-    def test_retrieve_node(self):
-        graph = build_graph()
-        nodes = []
-        for i in range(len(graph.nodes())):
-            nodes.append(graph.retrieve_node(i))
-
-        for node in nodes:
-            self.assertTrue(node in graph.nodes())
-
-    def resolve_hazard(self):
-        pass
-
-
-def build_graph():
-    prog = fluid.core.ProgramDesc()
-    block = prog.block(0)
-
-    shape = [10, 20]
-
-    # prepare input/output
-    x1 = block.var(six.b("x1"))
-    x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
-    x1.set_shape(shape)
-    x2 = block.var(six.b("x2"))
-    x2.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
-    x2.set_shape(shape)
-
-    out = block.var(six.b("out"))
-    out.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
-
-    sum_op_desc = block.append_op()
-    sum_op_desc.set_type("sum")
-    sum_op_desc.set_input("X", ["x1", "x2"])
-    sum_op_desc.set_output("Out", ["out"])
-
-    sum_op_desc.check_attrs()
-    sum_op_desc.infer_shape(block)
-    graph = fluid.core.Graph(prog)
-    return graph
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
deleted file mode 100644
index c1ef0f49afbb287104edea0659f89b7025a560bc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from parallel_executor_test_base import TestParallelExecutorBase
-
-
-def fc_with_batchnorm(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(3):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-class TestIrInplace(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def _fc_with_batchnorm(self, ir_memory_optimize, enable_inplace):
-
-        if not core.is_compiled_with_cuda():
-            return
-        np.random.seed(5)
-        img = np.random.random(size=[32, 784]).astype(np.float32)
-        label = np.ones(shape=[32, 1], dtype='int64')
-        self.check_network_convergence(
-            fc_with_batchnorm,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=True,
-            use_ir_memory_optimize=ir_memory_optimize,
-            enable_inplace=enable_inplace)
-
-    def test_fc_with_batchnorm(self, delta=1e-3):
-        loss00 = self._fc_with_batchnorm(False, False)
-        loss10 = self._fc_with_batchnorm(True, False)
-        loss01 = self._fc_with_batchnorm(False, True)
-        loss11 = self._fc_with_batchnorm(True, True)
-        self.assertAlmostEqual(loss00, loss10, delta=delta)
-        self.assertAlmostEqual(loss00, loss01, delta=delta)
-        self.assertAlmostEqual(loss00, loss11, delta=delta)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
deleted file mode 100644
index c5228fcf122748d2518238aa21ea486ed5f60d46..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# nlp model stack of op operate on lod. It's a classical test case in optimize pass.
-
-from __future__ import print_function
-
-import numpy as np
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-import unittest
-import paddle.fluid.core as core
-
-from paddle.fluid import compiler, Program, program_guard
-from paddle.fluid.executor import Executor
-from paddle.fluid.backward import append_backward
-from paddle.fluid.optimizer import MomentumOptimizer
-from ir_memory_optimize_net_base import TestIrMemOptBase
-
-
-class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
-    def check_network_convergence(self,
-                                  use_cuda=True,
-                                  use_mem_opt=False,
-                                  iter_num=5):
-        prog = Program()
-        startup_prog = Program()
-        prog.random_seed = 100
-        startup_prog.random_seed = 100
-        with program_guard(prog, startup_prog):
-            image = layers.data(name='x', shape=[784], dtype='float32')
-
-            label = layers.data(name='y', shape=[1], dtype='int64')
-
-            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
-            cond = layers.less_than(x=label, y=limit)
-            ie = layers.IfElse(cond)
-
-            with ie.true_block():
-                true_image = ie.input(image)
-                hidden = layers.fc(input=true_image, size=100, act='tanh')
-                prob = layers.fc(input=hidden, size=10, act='softmax')
-                ie.output(prob)
-
-            with ie.false_block():
-                false_image = ie.input(image)
-                hidden = layers.fc(input=false_image, size=200, act='tanh')
-                prob = layers.fc(input=hidden, size=10, act='softmax')
-                ie.output(prob)
-
-            prob = ie()
-            loss = layers.cross_entropy(input=prob[0], label=label)
-            avg_loss = layers.mean(loss)
-
-            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-            optimizer.minimize(avg_loss, startup_prog)
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=200)
-
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = Executor(place)
-
-            exec_strategy = fluid.ExecutionStrategy()
-            exec_strategy.use_cuda = use_cuda
-
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.memory_optimize = use_mem_opt
-
-            train_cp = compiler.CompiledProgram(fluid.default_main_program())
-            train_cp = train_cp.with_data_parallel(
-                loss_name=avg_loss.name,
-                exec_strategy=exec_strategy,
-                build_strategy=build_strategy)
-            fetch_list = [avg_loss.name]
-
-            exe.run(startup_prog)
-            PASS_NUM = 100
-            loop = 0
-            ret = []
-            for pass_id in range(PASS_NUM):
-                for data in train_reader():
-                    x_data = np.array([x[0] for x in data]).astype("float32")
-                    y_data = np.array([x[1] for x in data]).astype("int64")
-                    y_data = y_data.reshape((y_data.shape[0], 1))
-
-                    outs = exe.run(train_cp,
-                                   feed={'x': x_data,
-                                         'y': y_data},
-                                   fetch_list=[avg_loss])
-
-                    loop += 1
-                    ret.append(outs[0])
-                    if iter_num == loop:
-                        return ret
-            return ret
-
-    def test_ifelse(self):
-        ret1 = self.check_network_convergence(False, True)
-        print(ret1)
-        ret2 = self.check_network_convergence(False, False)
-        print(ret2)
-        self.assertTrue(np.allclose(ret1, ret2))
-
-        if fluid.core.is_compiled_with_cuda():
-            ret1 = self.check_network_convergence(True, True)
-            print(ret1)
-            ret2 = self.check_network_convergence(True, False)
-            print(ret2)
-            self.assertTrue(np.allclose(ret1, ret2))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
deleted file mode 100644
index 30b6d6106cdc46cfed201e5bb44a0c80d7e8ca3d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# nlp model stack of op operate on lod. It's a classical test case in optimize pass.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import unittest
-from ir_memory_optimize_net_base import TestIrMemOptBase
-
-
-def lstm_net(data,
-             label,
-             dict_dim,
-             emb_dim=128,
-             hid_dim=128,
-             hid_dim2=96,
-             class_dim=2,
-             emb_lr=30.0):
-    emb = fluid.layers.embedding(
-        input=data,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
-    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
-
-    lstm_h, c = fluid.layers.dynamic_lstm(
-        input=fc0, size=hid_dim * 4, is_reverse=False)
-    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
-    lstm_max_tanh = fluid.layers.tanh(lstm_max)
-    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
-    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    return avg_cost
-
-
-class TestIrMemOptRNN(TestIrMemOptBase):
-    def setUp(self):
-        self.network = lstm_net
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
deleted file mode 100644
index d9f68c2d15ee7c728379140f2601e69dc0c245fc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from parallel_executor_test_base import TestParallelExecutorBase
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import numpy as np
-import paddle
-import paddle.dataset.mnist as mnist
-import unittest
-import os
-
-
-def _feed_data_helper():
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    return img, label
-
-
-def simple_fc_net(use_feed):
-    assert use_feed
-    x, y = _feed_data_helper()
-    hidden_layer = 4
-    for _ in range(hidden_layer):
-        x = fluid.layers.fc(input=x, size=20, act='relu')
-    y_predict = fluid.layers.fc(input=x, size=10, act='softmax')
-    cost = fluid.layers.cross_entropy(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-    return avg_cost
-
-
-def fc_with_inplace_net(use_feed):
-    assert use_feed
-    x, y = _feed_data_helper()
-    fc = fluid.layers.fc(input=x, size=20, act='relu')
-    fc = fluid.layers.fc(input=fc, size=10, act='relu')
-    reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5])
-    reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2])
-    y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax')
-    cost = fluid.layers.cross_entropy(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-    return avg_cost
-
-
-class TestMNIST(TestParallelExecutorBase):
-    def _dummy_data(self):
-        np.random.seed(5)
-        img = np.random.random(size=[32, 784]).astype(np.float32)
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
-    def _compare_ir_memory_optimize(self, model, use_cuda):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        img, label = self._dummy_data()
-        first_loss0, last_loss0 = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_ir_memory_optimize=False)
-        first_loss1, last_loss1 = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_ir_memory_optimize=True)
-        for loss in zip(first_loss0, first_loss1):
-            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
-        for loss in zip(last_loss0, last_loss1):
-            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
-
-    def test_simple_fc_net(self):
-        self._compare_ir_memory_optimize(simple_fc_net, False)
-        self._compare_ir_memory_optimize(simple_fc_net, True)
-
-    def test_fc_with_reshape_net(self):
-        self._compare_ir_memory_optimize(fc_with_inplace_net, False)
-        self._compare_ir_memory_optimize(fc_with_inplace_net, True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
deleted file mode 100644
index 1af696f873315c2a6494266fc931185525e023ac..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import unittest
-from timeit import default_timer as timer
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.dataset.wmt16 as wmt16
-
-os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
-
-from parallel_executor_test_base import TestParallelExecutorBase
-from test_parallel_executor_transformer import get_feed_data_reader, transformer
-
-
-# NOTE(dzhwinter): test diferent strategy colisions.
-# open the eager delete tensor strategy by default.
-class TestTransformerWithIR(TestParallelExecutorBase):
-    def test_main(self):
-        if core.is_compiled_with_cuda():
-            # check python transpiler
-            self.check_network_convergence(
-                transformer,
-                use_cuda=True,
-                feed_data_reader=get_feed_data_reader(),
-                use_ir_memory_optimize=False,
-                iter=2)
-            # check IR memory optimize
-            self.check_network_convergence(
-                transformer,
-                use_cuda=True,
-                feed_data_reader=get_feed_data_reader(),
-                use_ir_memory_optimize=True,
-                iter=2)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
deleted file mode 100644
index 26d607718aec0bdffa00b9b4bca06ec6c0196217..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestEmpty(OpTest):
-    def setUp(self):
-        self.op_type = "is_empty"
-        self.inputs = {'X': np.array([1, 2, 3])}
-        self.outputs = {'Out': np.array([False])}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestNotEmpty(TestEmpty):
-    def setUp(self):
-        self.op_type = "is_empty"
-        self.inputs = {'X': np.array([])}
-        self.outputs = {'Out': np.array([True])}
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_op.py
deleted file mode 100644
index d96ae15c7288c9a8d585d8d70d2aa8922b8f22b3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_isfinite_op.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestInf(OpTest):
-    def setUp(self):
-        self.op_type = "isinf"
-        self.dtype = np.float32
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        x[0] = np.inf
-        x[-1] = np.inf
-
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.array(True).astype(self.dtype)}
-
-    def init_dtype(self):
-        pass
-
-    def test_output(self):
-        self.check_output()
-
-
-class TestFP16Inf(TestInf):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-
-class TestNAN(OpTest):
-    def setUp(self):
-        self.op_type = "isnan"
-        self.dtype = np.float32
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        x[0] = np.nan
-        x[-1] = np.nan
-
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.array(True).astype(self.dtype)}
-
-    def init_dtype(self):
-        pass
-
-    def test_output(self):
-        self.check_output()
-
-
-class TestFP16NAN(TestNAN):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-
-class TestIsfinite(OpTest):
-    def setUp(self):
-        self.op_type = "isfinite"
-        self.dtype = np.float32
-        self.init_dtype()
-
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        x[0] = np.inf
-        x[-1] = np.nan
-        out = np.isinf(x) | np.isnan(x)
-
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.array(False).astype(self.dtype)}
-
-    def init_dtype(self):
-        pass
-
-    def test_output(self):
-        self.check_output()
-
-
-class TestFP16Isfinite(TestIsfinite):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
deleted file mode 100644
index d0212d177e6f1c60b916a0cb0eef7cd7f54a3585..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def kldiv_loss(x, target, reduction):
-    output = target * (np.log(target) - x)
-    loss = np.where(target >= 0, output, np.zeros_like(x))
-
-    if reduction == "batchmean":
-        return loss.sum() / x.shape[0]
-    if reduction == "mean":
-        return loss.mean()
-    if reduction == "sum":
-        return loss.sum()
-
-    return loss
-
-
-class TestKLDivLossOp(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = 'kldiv_loss'
-        x = np.random.uniform(-10, 10, self.x_shape).astype('float32')
-        target = np.random.uniform(-10, 10, self.x_shape).astype('float32')
-
-        self.attrs = {"reduction": self.reduction}
-
-        self.inputs = {
-            'X': x,
-            'Target': target,
-        }
-        loss = kldiv_loss(x, target, self.reduction)
-        self.outputs = {'Loss': loss.astype('float32')}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Loss', no_grad_set=set(["Target"]), max_relative_error=0.06)
-
-    def initTestCase(self):
-        self.x_shape = (2, 5, 5)
-        self.reduction = 'batchmean'
-
-
-class TestKLDivLossOp2(TestKLDivLossOp):
-    def initTestCase(self):
-        self.x_shape = (3, 2, 7, 7)
-        self.reduction = 'none'
-
-
-class TestKLDivLossOp3(TestKLDivLossOp):
-    def initTestCase(self):
-        self.x_shape = (2, 3, 5, 7, 9)
-        self.reduction = 'mean'
-
-
-class TestKLDivLossOp4(TestKLDivLossOp):
-    def initTestCase(self):
-        self.x_shape = (5, 7)
-        self.reduction = 'sum'
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_l1_norm_op.py b/python/paddle/fluid/tests/unittests/test_l1_norm_op.py
deleted file mode 100644
index 4e24a78ee54dfb1fb0e4f97317642cfaffe9436e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_l1_norm_op.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-from op_test import OpTest
-
-
-class TestL1NormOp(OpTest):
-    """Test l1_norm
-    """
-
-    def setUp(self):
-        self.op_type = "l1_norm"
-        self.max_relative_error = 0.005
-
-        X = np.random.uniform(-1, 1, (13, 19)).astype("float32")
-        X[np.abs(X) < self.max_relative_error] = 0.1
-        self.inputs = {'X': X}
-        self.outputs = {'Out': np.sum(np.abs(X))}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=self.max_relative_error)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
deleted file mode 100644
index 62d385bc52cfb3a9fe15a82096ff33abc1bcc552..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestLabelSmoothOp(OpTest):
-    def config(self):
-        self.op_type = "label_smooth"
-        self.epsilon = 0.1
-        batch_size, self.label_dim = 5, 10
-        self.label = np.zeros((batch_size, self.label_dim)).astype("float64")
-        nonzero_index = np.random.randint(self.label_dim, size=(batch_size))
-        self.label[np.arange(batch_size), nonzero_index] = 1
-
-    def setUp(self):
-        self.config()
-        smoothed_label = (1 - self.epsilon
-                          ) * self.label + self.epsilon / self.label_dim
-        self.inputs = {'X': self.label}
-        self.attrs = {'epsilon': self.epsilon}
-        self.outputs = {'Out': smoothed_label}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
-    def setUp(self):
-        self.config()
-        dist = np.random.random((1, self.label_dim))
-        smoothed_label = (1 - self.epsilon) * self.label + self.epsilon * dist
-        self.inputs = {'X': self.label, 'PriorDist': dist}
-        self.attrs = {'epsilon': self.epsilon}
-        self.outputs = {'Out': smoothed_label}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lamb_op.py b/python/paddle/fluid/tests/unittests/test_lamb_op.py
deleted file mode 100644
index 48375184cc7515db8309c9718592bf9bee17bcf6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lamb_op.py
+++ /dev/null
@@ -1,296 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from paddle.fluid import core
-from paddle.fluid.op import Operator
-
-
-class TestLambOp1(OpTest):
-    def set_attrs(self):
-        self.attrs = {
-            'epsilon': 1e-4,
-            'beta1': 0.78,
-            'beta2': 0.836,
-            'weight_decay': 0.01
-        }
-
-    def setUp(self):
-        '''Test Lamb Op with supplied attributes
-        '''
-        self.op_type = "lamb"
-        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
-        moment2 = np.random.random((102, 105)).astype("float32")
-
-        learning_rate = 0.001
-        self.set_attrs()
-        beta1_pow = self.attrs['beta1']**10
-        beta2_pow = self.attrs['beta2']**10
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment1': moment1,
-            'Moment2': moment2,
-            'LearningRate': np.array([learning_rate]).astype("float32"),
-            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
-            'Beta2Pow': np.array([beta2_pow]).astype("float32")
-        }
-
-
-        param_out, moment1_out, \
-            moment2_out = lamb_step(self.inputs, self.attrs)
-
-        self.outputs = {
-            'Moment1Out': moment1_out,
-            'Moment2Out': moment2_out,
-            'ParamOut': param_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestLambOp2(TestLambOp1):
-    def set_attrs(self):
-        self.attrs = {
-            'epsilon': 1e-8,
-            'beta1': 0.9,
-            'beta2': 0.999,
-            'weight_decay': 0.01
-        }
-
-
-class TestLambOpMultipleSteps(TestLambOp1):
-    def set_attrs(self):
-        self.attrs = {
-            'epsilon': 1e-8,
-            'beta1': 0.9,
-            'beta2': 0.999,
-            'weight_decay': 0.01
-        }
-        self.num_steps = 10
-
-    def test_check_output(self):
-        for _ in range(self.num_steps):
-            param_out, moment1_out, \
-                moment2_out = lamb_step(self.inputs, self.attrs)
-
-            self.outputs = {
-                'Moment1Out': moment1_out,
-                'Moment2Out': moment2_out,
-                'ParamOut': param_out
-            }
-
-            # Verify output for this step
-            self.check_output()
-
-            # Output of this step becomes input for next step
-            self.inputs['Param'] = param_out
-            self.inputs['Moment1'] = moment1_out
-            self.inputs['Moment2'] = moment2_out
-
-            # Update powers of Beta1 and Beta2 for next time step
-            self.inputs['Beta1Pow'] *= self.attrs['beta1']
-            self.inputs['Beta2Pow'] *= self.attrs['beta1']
-
-            # Randomize gradient for next step
-            self.inputs['Grad'] = np.random.uniform(
-                -1, 1, (102, 105)).astype("float32")
-
-
-def lamb_step(inputs, attributes):
-    '''
-    Simulate one step of the lamb optimizer
-    :param inputs: dict of inputs
-    :param attributes: dict of attributes
-    :return tuple: tuple of output param, moment1, moment2,
-    beta1 power accumulator and beta2 power accumulator
-    '''
-    param = inputs['Param']
-    grad = inputs['Grad']
-    moment1 = inputs['Moment1']
-    moment2 = inputs['Moment2']
-    lr = inputs['LearningRate']
-    beta1_pow = inputs['Beta1Pow']
-    beta2_pow = inputs['Beta2Pow']
-
-    beta1 = attributes['beta1']
-    beta2 = attributes['beta2']
-    epsilon = attributes['epsilon']
-    weight_decay = attributes['weight_decay']
-
-    moment1_out = beta1 * moment1 + (1 - beta1) * grad
-    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
-
-    r_1 = np.linalg.norm(param)
-    r_2 = np.linalg.norm(moment1_out / (np.sqrt(moment2_out) + epsilon) +
-                         weight_decay * param)
-    lr_t = lr * r_1 / r_2
-
-    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon) +
-                                weight_decay * param)
-    return param_out, moment1_out, moment2_out
-
-
-def lamb_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
-    '''
-    Simulate one step of the lamb optimizer
-    :param inputs: dict of inputs
-    :param attributes: dict of attributes
-    :return tuple: tuple of output param, moment1, moment2,
-    beta1 power accumulator and beta2 power accumulator
-    '''
-    param = inputs['Param']
-    # grad = inputs['Grad']
-    moment1 = inputs['Moment1']
-    moment2 = inputs['Moment2']
-    lr = inputs['LearningRate']
-    beta1_pow = inputs['Beta1Pow']
-    beta2_pow = inputs['Beta2Pow']
-
-    beta1 = attributes['beta1']
-    beta2 = attributes['beta2']
-    epsilon = attributes['epsilon']
-    weight_decay = attributes['weight_decay']
-
-    moment1_out = np.zeros(shape=[height, row_numel])
-    moment2_out = np.zeros(shape=[height, row_numel])
-    param_out = np.zeros(shape=[height, row_numel])
-
-    def update_mom(row_id, update_value):
-        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * update_value
-        moment2_out[row_id] = beta2 * moment2[row_id] + (
-            1 - beta2) * np.square(update_value)
-
-        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * update_value
-        moment2_out[row_id] = beta2 * moment2[row_id] + (
-            1 - beta2) * np.square(update_value)
-
-    def update_param():
-        r_1 = np.linalg.norm(param)
-        r_2 = np.linalg.norm(moment1_out / (np.sqrt(moment2_out) + epsilon) +
-                             weight_decay * param)
-        lr_t = lr * r_1 / r_2
-
-        param_out = param - lr_t * (moment1_out / (
-            np.sqrt(moment2_out) + epsilon) + weight_decay * param)
-
-    for row_id in range(param_out.shape[0]):
-        update_value = np.zeros(np_grad[0].shape).astype("float32")
-        if row_id in rows:
-            update_value = np_grad[rows.index(row_id)]
-        update_mom(row_id, update_value)
-
-    update_param()
-
-    return param_out, moment1_out, moment2_out
-
-
-class TestSparseLambOp(unittest.TestCase):
-    def setup(self, scope, place):
-        beta1 = 0.78
-        beta2 = 0.836
-        epsilon = 1e-4
-
-        height = 10
-        rows = [0, 4, 7]
-        self.rows = rows
-        row_numel = 12
-        self.row_numel = row_numel
-        self.dense_inputs = {
-            "Param": np.full((height, row_numel), 5.0).astype("float32"),
-            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
-            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
-            'Beta1Pow': np.array([beta1**10]).astype("float32"),
-            'Beta2Pow': np.array([beta2**10]).astype("float32"),
-            "LearningRate": np.full((1), 2.0).astype("float32")
-        }
-        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
-        self.attrs = {
-            'epsilon': epsilon,
-            'beta1': beta1,
-            'beta2': beta2,
-            'weight_decay': 0.05
-        }
-
-        grad_selected_rows = scope.var('Grad').get_selected_rows()
-        grad_selected_rows.set_height(height)
-        grad_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
-        np_array[2, 8] = 4.0
-
-        grad_tensor = grad_selected_rows.get_tensor()
-        grad_tensor.set(np_array, place)
-
-        self.sparse_inputs = ["Grad"]
-
-        param_out, mom1, mom2 = lamb_step_sparse(
-            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
-        self.outputs = {
-            "ParamOut": param_out,
-            "Moment1Out": mom1,
-            "Moment2Out": mom2
-        }
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-        self.setup(scope, place)
-
-        op_args = dict()
-        for key, np_array in self.dense_inputs.items():
-            var = scope.var(key).get_tensor()
-            var.set(np_array, place)
-            op_args[key] = key
-        for s in self.sparse_inputs:
-            op_args[s] = s
-        for s in self.outputs:
-            var = scope.var(s).get_tensor()
-            var.set(self.init_output, place)
-            op_args[s] = s
-        for k in self.attrs:
-            op_args[k] = self.attrs[k]
-
-        # create and run sgd operator
-        lamb_op = Operator("lamb", **op_args)
-        lamb_op.run(scope, place)
-
-        for key, np_array in self.outputs.items():
-            out_var = scope.var(key).get_tensor()
-            actual = np.array(out_var)
-            actual = actual.reshape([actual.size])
-            np_array = np_array.reshape([np_array.size])
-
-            for i in range(np_array.size):
-                self.assertLess((actual[i] - np_array[i]), 0.00001)
-
-    def test_sparse_lamb(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
deleted file mode 100644
index 1419ba7335b247031edfc1c67eaf490db646a57b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-set -e
-# use default values
-python -m paddle.distributed.launch multi_process.py
-
-# use paddlecloud
-cluster_node_ips="10.0.0.1"
-node_ip="10.0.0.1"
-export PADDLE_TRAINERS_NUM=2
-export POD_IP=127.0.0.1
-export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
-export PADDLE_TRAINER_ID=0
-
-distributed_args="--use_paddlecloud True --cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
-python -m paddle.distributed.launch ${distributed_args} multi_process.py
-
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
-
-echo "paddlecloud params test"
-if grep -q "$str1" "$file_0"; then
-    echo "find trainer 0"
-else
-    echo "not find trainer 0"
-    exit -1
-fi
-
-if grep -q "$str2" "$file_1"; then
-    echo "find trainer 1"
-else
-    echo "not find trainer 1"
-    exit -1
-fi
-
-# test async poll process
-if [ -f $file_0 ]; then
-    rm $file_0
-fi
-if [ -f $file_1 ]; then
-    rm $file_1
-fi
-
-echo ""
-echo "paddle.distributed.launch async poll process test"
-if ! python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then
-    echo "train abort as planned"
-fi
-
-abort_str1="abort>>> selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
-
-if grep -q "$abort_str1" "$file_0"; then
-    echo "trainer 0 abort as planned"
-else
-    echo "trainer 0 not abort as planned"
-    exit -1
-fi
-
-if [ ! -f $file_1 ]; then
-    echo "trainer 1 terminate as planned"
-else
-    echo "trainer 1 not terminate as planned"
-    exit -1
-fi
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
deleted file mode 100644
index ff68599dce6bdb7ba7a6f35cc05f69ec8f543ab4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-
-from operator import mul
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from functools import reduce
-
-np.random.random(123)
-
-
-def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
-    x_shape = x.shape
-    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
-    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
-    x.shape = [N, D]
-
-    mean = np.mean(x, axis=1)
-    var = np.var(x, axis=1) + epsilon
-    output = scale.reshape([1, D]) * np.divide(
-        (x - mean.reshape([N, 1])),
-        (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
-
-    x.shape, output.shape = x_shape, x_shape
-    return output, mean, var
-
-
-def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
-    x_shape = x.shape
-    scale_shape = scale.shape
-    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
-    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
-    x.shape, grad_y.shape = [N, D], [N, D]
-    var.shape, mean.shape = [N, 1], [N, 1]
-    scale.shape = [1, D]
-
-    # d_bias
-    d_bias = np.sum(grad_y, axis=0).reshape([1, D])
-    # d_scale
-    d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
-                     axis=0).reshape([1, D])
-    # dx
-    dx_end = scale * np.sqrt(1.0 / var) * grad_y
-    d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
-        [N, 1])  # the second part equals to zero.
-    d_mean = 1.0 / D * d_mean_0
-    d_std = np.sum(
-        -(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
-            1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
-
-    grad_x = dx_end + d_mean + d_std
-
-    grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape
-    scale.shape = scale_shape
-    var.shape, mean.shape = [N, ], [N, ]
-    return grad_x, d_scale, d_bias
-
-
-class TestLayerNormOp(unittest.TestCase):
-    def setUp(self):
-        self.use_cudnn = True
-
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
-
-    def check_forward_backward(self, shape, begin_norm_axis):
-        def test_with_place(place, shape, begin_norm_axis):
-            # attr
-            epsilon = 0.00001
-            x_shape = shape
-            D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
-            scale_shape = [D]
-
-            np.random.seed(123)
-            x = np.random.random_sample(x_shape).astype(np.float32)
-            scale = np.random.random_sample(scale_shape).astype(np.float32)
-            bias = np.random.random_sample(scale_shape).astype(np.float32)
-            y_grad = np.random.random_sample(x_shape).astype(np.float32)
-
-            # reference forward & backward
-            y, mean, variance = _reference_layer_norm_naive(
-                x, scale, bias, epsilon, begin_norm_axis)
-            x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
-                x, y_grad, scale, mean, variance, begin_norm_axis)
-
-            var_dict = locals()
-            var_dict['y@GRAD'] = y_grad
-            var_names = [
-                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'y@GRAD'
-            ]
-            ground_truth = {name: var_dict[name] for name in var_names}
-
-            program = fluid.Program()
-            with fluid.program_guard(program):
-                block = program.global_block()
-                for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
-                layer_norm_op = block.append_op(
-                    type="layer_norm",
-                    inputs={
-                        "X": block.var('x'),
-                        "Scale": block.var('scale'),
-                        "Bias": block.var('bias'),
-                    },
-                    outputs={
-                        "Y": block.var('y'),
-                        "Mean": block.var('mean'),  # share the same memory
-                        "Variance":
-                        block.var('variance'),  # share the same memory
-                    },
-                    attrs={
-                        "epsilon": epsilon,
-                        "begin_norm_axis": begin_norm_axis
-                    })
-
-                # generate backward op_desc
-                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    layer_norm_op.desc, set(), [])
-                grad_op_desc = grad_op_desc_list[0]
-                new_op_desc = block.desc.append_op()
-                new_op_desc.copy_from(grad_op_desc)
-                for var_name in grad_op_desc.output_arg_names():
-                    block.desc.var(var_name.encode("ascii"))
-                grad_op_desc.infer_var_type(block.desc)
-                grad_op_desc.infer_shape(block.desc)
-                for arg in grad_op_desc.output_arg_names():
-                    grad_var = block.desc.find_var(arg.encode("ascii"))
-                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-                exe = fluid.Executor(place)
-                out = exe.run(program,
-                              feed={
-                                  name: var_dict[name]
-                                  for name in ['x', 'scale', 'bias', 'y@GRAD']
-                              },
-                              fetch_list=[
-                                  'y', 'mean', 'variance', 'x@GRAD',
-                                  'scale@GRAD', 'bias@GRAD'
-                              ])
-                self.__assert_close(y, out[0], "y")
-                self.__assert_close(mean, out[1], "mean")
-                self.__assert_close(variance, out[2], "variance", 1e-3)
-                self.__assert_close(x_grad, out[3], "x_grad")
-                self.__assert_close(scale_grad, out[4], "scale_grad", 1e-3)
-                self.__assert_close(bias_grad, out[5], "bias_grad")
-
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(
-                "layer_norm") and self.use_cudnn:
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            test_with_place(place, shape, begin_norm_axis)
-
-    def test_check_forward_backward_with_scale_and_bias(self):
-        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
-        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
deleted file mode 100644
index 664a295660f54376d96b0890022d611d34286b6d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ /dev/null
@@ -1,2542 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-
-import contextlib
-import numpy as np
-from decorator_helper import prog_scope
-import inspect
-from six.moves import filter
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.layers.device import get_places
-import paddle.fluid.nets as nets
-from paddle.fluid.framework import Program, program_guard, default_main_program
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid import core
-from paddle.fluid.initializer import Constant
-import paddle.fluid.layers as layers
-from test_imperative_base import new_program_scope
-from paddle.fluid.dygraph import nn
-from paddle.fluid.dygraph import base
-
-
-class LayerTest(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.seed = 111
-
-    @classmethod
-    def tearDownClass(cls):
-        pass
-
-    def _get_place(self, force_to_use_cpu=False):
-        # this option for ops that only have cpu kernel
-        if force_to_use_cpu:
-            return core.CPUPlace()
-        else:
-            if core.is_compiled_with_cuda():
-                return core.CUDAPlace(0)
-            return core.CPUPlace()
-
-    @contextlib.contextmanager
-    def static_graph(self):
-        with new_program_scope():
-            fluid.default_startup_program().random_seed = self.seed
-            fluid.default_main_program().random_seed = self.seed
-            yield
-
-    def get_static_graph_result(self,
-                                feed,
-                                fetch_list,
-                                with_lod=False,
-                                force_to_use_cpu=False):
-        exe = fluid.Executor(self._get_place(force_to_use_cpu))
-        exe.run(fluid.default_startup_program())
-        return exe.run(fluid.default_main_program(),
-                       feed=feed,
-                       fetch_list=fetch_list,
-                       return_numpy=(not with_lod))
-
-    @contextlib.contextmanager
-    def dynamic_graph(self, force_to_use_cpu=False):
-        with fluid.dygraph.guard(
-                self._get_place(force_to_use_cpu=force_to_use_cpu)):
-            fluid.default_startup_program().random_seed = self.seed
-            fluid.default_main_program().random_seed = self.seed
-            yield
-
-
-class TestLayer(LayerTest):
-    def test_fc(self):
-        inp = np.ones([3, 32, 32], dtype='float32')
-        with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            ret = layers.fc(t, size=4, bias_attr=False, num_flatten_dims=1)
-            ret2 = layers.fc(ret, size=4)
-            static_ret = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret2])[0]
-        with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            fc1 = nn.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
-            fc2 = nn.FC('fc2', size=4)
-            ret = fc1(t)
-            ret2 = fc2(ret)
-            static_ret2 = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret2])[0]
-        with self.dynamic_graph():
-            t = base.to_variable(inp)
-            fc1 = nn.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
-            fc2 = nn.FC('fc2', size=4)
-            ret = fc1(t)
-            dy_ret = fc2(ret)
-
-        self.assertTrue(np.array_equal(static_ret, static_ret2))
-        self.assertTrue(np.array_equal(static_ret, dy_ret.numpy()))
-
-    def test_layer_norm(self):
-        inp = np.ones([3, 32, 32], dtype='float32')
-        with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            ret = layers.layer_norm(
-                t,
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
-            static_ret = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret])[0]
-        with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            lm = nn.LayerNorm(
-                'layer_norm',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
-            ret = lm(t)
-            static_ret2 = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret])[0]
-        with self.dynamic_graph():
-            lm = nn.LayerNorm(
-                'layer_norm',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
-            dy_ret = lm(base.to_variable(inp))
-        with self.dynamic_graph():
-            lm = nn.LayerNorm(
-                'layer_norm',
-                shift=False,
-                scale=False,
-                param_attr=fluid.initializer.ConstantInitializer(value=1),
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
-            lm(base.to_variable(inp))
-
-            self.assertFalse(hasattr(lm, "_scale_w"))
-            self.assertFalse(hasattr(lm, "_bias_w"))
-
-        self.assertTrue(np.array_equal(static_ret, static_ret2))
-        self.assertTrue(np.array_equal(dy_ret.numpy(), static_ret2))
-
-    def test_relu(self):
-        with self.static_graph():
-            t = layers.data(name='t', shape=[3, 3], dtype='float32')
-            ret = layers.relu(t)
-            static_ret = self.get_static_graph_result(
-                feed={'t': np.ones(
-                    [3, 3], dtype='float32')}, fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            t = np.ones([3, 3], dtype='float32')
-            dy_ret = layers.relu(base.to_variable(t))
-
-        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-
-    def test_matmul(self):
-        with self.static_graph():
-            t = layers.data(name='t', shape=[3, 3], dtype='float32')
-            t2 = layers.data(name='t2', shape=[3, 3], dtype='float32')
-            ret = layers.matmul(t, t2)
-            static_ret = self.get_static_graph_result(
-                feed={
-                    't': np.ones(
-                        [3, 3], dtype='float32'),
-                    't2': np.ones(
-                        [3, 3], dtype='float32')
-                },
-                fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            t = np.ones([3, 3], dtype='float32')
-            t2 = np.ones([3, 3], dtype='float32')
-            dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2))
-
-        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-
-    def test_conv2d(self):
-        with self.static_graph():
-            images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
-            ret = layers.conv2d(input=images, num_filters=3, filter_size=[2, 2])
-            static_ret = self.get_static_graph_result(
-                feed={'pixel': np.ones(
-                    [2, 3, 5, 5], dtype='float32')},
-                fetch_list=[ret])[0]
-
-        with self.static_graph():
-            images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D('conv2d', num_filters=3, filter_size=[2, 2])
-            ret = conv2d(images)
-            static_ret2 = self.get_static_graph_result(
-                feed={'pixel': np.ones(
-                    [2, 3, 5, 5], dtype='float32')},
-                fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            images = np.ones([2, 3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D('conv2d', num_filters=3, filter_size=[2, 2])
-            dy_ret = conv2d(base.to_variable(images))
-
-        with self.dynamic_graph():
-            images = np.ones([2, 3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D(
-                'conv2d', num_filters=3, filter_size=[2, 2], bias_attr=False)
-            dy_ret = conv2d(base.to_variable(images))
-            self.assertTrue(conv2d._bias_param is None)
-
-        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-        self.assertTrue(np.allclose(static_ret, static_ret2))
-
-    def test_gru_unit(self):
-        lod = [[2, 4, 3]]
-        D = 5
-        T = sum(lod[0])
-        N = len(lod[0])
-
-        input = np.random.rand(T, 3 * D).astype('float32')
-        hidden_input = np.random.rand(T, D).astype('float32')
-
-        with self.static_graph():
-            x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
-            hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
-            updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
-                input=x, hidden=hidden, size=D * 3)
-            static_ret = self.get_static_graph_result(
-                feed={'x': input,
-                      'hidden': hidden_input},
-                fetch_list=[updated_hidden, reset_hidden_pre, gate])
-
-        with self.static_graph():
-            x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
-            hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
-            updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
-                input=x, hidden=hidden, size=D * 3)
-            gru = nn.GRUUnit('gru', size=D * 3)
-            updated_hidden, reset_hidden_pre, gate = gru(x, hidden)
-
-            static_ret2 = self.get_static_graph_result(
-                feed={'x': input,
-                      'hidden': hidden_input},
-                fetch_list=[updated_hidden, reset_hidden_pre, gate])
-
-        with self.dynamic_graph():
-            gru = nn.GRUUnit('gru', size=D * 3)
-            dy_ret = gru(
-                base.to_variable(input), base.to_variable(hidden_input))
-
-        for i in range(len(static_ret)):
-            self.assertTrue(np.allclose(static_ret[i], static_ret2[i]))
-            self.assertTrue(np.allclose(static_ret[i], dy_ret[i].numpy()))
-
-    def test_elementwise_math(self):
-        n = np.ones([3, 3], dtype='float32')
-        n2 = np.ones([3, 3], dtype='float32') * 1.1
-        n3 = np.ones([3, 3], dtype='float32') * 2
-        n4 = np.ones([3, 3], dtype='float32') * 3
-        n5 = np.ones([3, 3], dtype='float32') * 4
-        n6 = np.ones([3, 3], dtype='float32') * 5
-
-        with self.static_graph():
-            t = layers.data(name='t', shape=[3, 3], dtype='float32')
-            t2 = layers.data(name='t2', shape=[3, 3], dtype='float32')
-            t3 = layers.data(name='t3', shape=[3, 3], dtype='float32')
-            t4 = layers.data(name='t4', shape=[3, 3], dtype='float32')
-            t5 = layers.data(name='t5', shape=[3, 3], dtype='float32')
-            t6 = layers.data(name='t6', shape=[3, 3], dtype='float32')
-
-            ret = layers.elementwise_add(t, t2)
-            ret = layers.elementwise_pow(ret, t3)
-            ret = layers.elementwise_div(ret, t4)
-            ret = layers.elementwise_sub(ret, t5)
-            ret = layers.elementwise_mul(ret, t6)
-
-            static_ret = self.get_static_graph_result(
-                feed={
-                    't': n,
-                    't2': n2,
-                    't3': n3,
-                    't4': n4,
-                    't5': n5,
-                    't6': n6
-                },
-                fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            ret = layers.elementwise_add(n, n2)
-            ret = layers.elementwise_pow(ret, n3)
-            ret = layers.elementwise_div(ret, n4)
-            ret = layers.elementwise_sub(ret, n5)
-            dy_ret = layers.elementwise_mul(ret, n6)
-        self.assertTrue(
-            np.allclose(static_ret, dy_ret.numpy()),
-            '%s vs %s' % (static_ret, dy_ret.numpy()))
-
-    def test_elementwise_minmax(self):
-        n = np.ones([3, 3], dtype='float32')
-        n2 = np.ones([3, 3], dtype='float32') * 2
-
-        with self.dynamic_graph():
-            min_ret = layers.elementwise_min(n, n2)
-            max_ret = layers.elementwise_max(n, n2)
-
-        self.assertTrue(np.allclose(n, min_ret.numpy()))
-        self.assertTrue(np.allclose(n2, max_ret.numpy()))
-
-    def test_sequence_conv(self):
-        inp_np = np.arange(12).reshape([3, 4]).astype('float32')
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-        with self.static_graph():
-            seq = layers.data(
-                name='seq_in',
-                shape=[3, 4],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            out = layers.sequence_conv(seq, 2, act='sigmoid')
-            static_rlt = self.get_static_graph_result(
-                feed={
-                    "seq_in": fluid.create_lod_tensor(
-                        data=inp_np,
-                        recursive_seq_lens=[[1, 1, 1]],
-                        place=place)
-                },
-                fetch_list=[out],
-                with_lod=True)[0]
-
-        with self.static_graph():
-            seq = layers.data(
-                name='seq_in',
-                shape=[3, 4],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            seq_conv = nn.SequenceConv('seq_conv', num_filters=2, act='sigmoid')
-            out = seq_conv(seq)
-            static_rlt2 = self.get_static_graph_result(
-                feed={
-                    "seq_in": fluid.create_lod_tensor(
-                        data=inp_np,
-                        recursive_seq_lens=[[1, 1, 1]],
-                        place=place)
-                },
-                fetch_list=[out],
-                with_lod=True)[0]
-        self.assertTrue(
-            np.array_equal(np.array(static_rlt), np.array(static_rlt2)))
-
-    def test_conv2d_transpose(self):
-        inp_np = np.arange(0, 24).reshape([2, 3, 2, 2]).astype('float32')
-        with self.static_graph():
-            img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
-            out = layers.conv2d_transpose(
-                input=img,
-                num_filters=10,
-                output_size=28,
-                act='sigmoid',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1))
-            static_rlt = self.get_static_graph_result(
-                feed={'pixel': inp_np}, fetch_list=[out])[0]
-        with self.static_graph():
-            img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
-            conv2d_transpose = nn.Conv2DTranspose(
-                'conv2d_transpose',
-                num_filters=10,
-                output_size=28,
-                act='sigmoid',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1))
-            out = conv2d_transpose(img)
-            static_rlt2 = self.get_static_graph_result(
-                feed={'pixel': inp_np}, fetch_list=[out])[0]
-        with self.dynamic_graph():
-            conv2d_transpose = nn.Conv2DTranspose(
-                'conv2d_transpose',
-                num_filters=10,
-                output_size=28,
-                act='sigmoid',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1))
-            dy_rlt = conv2d_transpose(base.to_variable(inp_np))
-        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt2))
-
-    def test_bilinear_tensor_product(self):
-        inp_np_x = np.array([[1, 2, 3]]).astype('float32')
-        inp_np_y = np.array([[4, 5, 6]]).astype('float32')
-
-        with self.static_graph():
-            data_x = layers.data(
-                name='x',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            data_y = layers.data(
-                name='y',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            out = layers.bilinear_tensor_product(
-                data_x,
-                data_y,
-                6,
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
-
-            static_rlt = self.get_static_graph_result(
-                feed={'x': inp_np_x,
-                      'y': inp_np_y}, fetch_list=[out])[0]
-
-        with self.static_graph():
-            data_x = layers.data(
-                name='x',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            data_y = layers.data(
-                name='y',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            btp = nn.BilinearTensorProduct(
-                'btp',
-                6,
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
-            out = btp(data_x, data_y)
-            static_rlt2 = self.get_static_graph_result(
-                feed={'x': inp_np_x,
-                      'y': inp_np_y}, fetch_list=[out])[0]
-        with self.dynamic_graph():
-            btp = nn.BilinearTensorProduct(
-                'btp',
-                6,
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
-            dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y))
-
-        with self.dynamic_graph():
-            btp2 = nn.BilinearTensorProduct('btp', 6, act='sigmoid')
-            dy_rlt2 = btp2(
-                base.to_variable(inp_np_x), base.to_variable(inp_np_y))
-
-        with self.static_graph():
-            data_x2 = layers.data(
-                name='x',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            data_y2 = layers.data(
-                name='y',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            out2 = layers.bilinear_tensor_product(
-                data_x2, data_y2, 6, act='sigmoid')
-
-            static_rlt3 = self.get_static_graph_result(
-                feed={'x': inp_np_x,
-                      'y': inp_np_y}, fetch_list=[out2])[0]
-
-        self.assertTrue(np.array_equal(dy_rlt2.numpy(), static_rlt3))
-        self.assertTrue(np.array_equal(static_rlt2, static_rlt))
-        self.assertTrue(np.array_equal(dy_rlt.numpy(), static_rlt))
-
-    def test_prelu(self):
-        inp_np = np.ones([5, 200, 100, 100]).astype('float32')
-
-        with self.static_graph():
-            data_t = layers.data(
-                name="input",
-                shape=[5, 200, 100, 100],
-                dtype="float32",
-                append_batch_size=False)
-            mode = 'channel'
-            out = layers.prelu(
-                data_t, mode, param_attr=ParamAttr(initializer=Constant(1.0)))
-            static_rlt = self.get_static_graph_result(
-                feed={"input": inp_np}, fetch_list=[out])[0]
-
-        with self.static_graph():
-            data_t = layers.data(
-                name="input",
-                shape=[5, 200, 100, 100],
-                dtype="float32",
-                append_batch_size=False)
-            mode = 'channel'
-            prelu = nn.PRelu(
-                'prelu',
-                mode=mode,
-                param_attr=ParamAttr(initializer=Constant(1.0)))
-            out = prelu(data_t)
-            static_rlt2 = self.get_static_graph_result(
-                feed={"input": inp_np}, fetch_list=[out])[0]
-
-        with self.dynamic_graph():
-            mode = 'channel'
-            prelu = nn.PRelu(
-                'prelu',
-                mode=mode,
-                param_attr=ParamAttr(initializer=Constant(1.0)))
-            dy_rlt = prelu(base.to_variable(inp_np))
-
-        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
-
-    def test_embeding(self):
-        inp_word = np.array([[[1]]]).astype('int64')
-        dict_size = 20
-        with self.static_graph():
-            data_t = layers.data(name='word', shape=[1], dtype='int64')
-            emb = layers.embedding(
-                input=data_t,
-                size=[dict_size, 32],
-                param_attr='emb.w',
-                is_sparse=False)
-            static_rlt = self.get_static_graph_result(
-                feed={'word': inp_word}, fetch_list=[emb])[0]
-        with self.static_graph():
-            data_t = layers.data(name='word', shape=[1], dtype='int64')
-            emb2 = nn.Embedding(
-                name_scope='embedding',
-                size=[dict_size, 32],
-                param_attr='emb.w',
-                is_sparse=False)
-            emb_rlt = emb2(data_t)
-            static_rlt2 = self.get_static_graph_result(
-                feed={'word': inp_word}, fetch_list=[emb_rlt])[0]
-        with self.dynamic_graph():
-            emb2 = nn.Embedding(
-                name_scope='embedding',
-                size=[dict_size, 32],
-                param_attr='emb.w',
-                is_sparse=False)
-            static_rlt3 = emb2(base.to_variable(inp_word))
-
-        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(static_rlt3.numpy(), static_rlt))
-
-    def test_nce(self):
-        window_size = 5
-        dict_size = 20
-        label_word = int(window_size // 2) + 1
-        inp_word = np.array([[[1]], [[2]], [[3]], [[4]], [[5]]]).astype('int64')
-        nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
-        seed = 1
-        with self.static_graph():
-            words = []
-            for i in range(window_size):
-                words.append(
-                    layers.data(
-                        name='word_{0}'.format(i), shape=[1], dtype='int64'))
-            sample_weights = layers.fill_constant(
-                shape=[5, 1], dtype='float32', value=1)
-            embs = []
-            for i in range(window_size):
-                if i == label_word:
-                    continue
-
-                emb = layers.embedding(
-                    input=words[i],
-                    size=[dict_size, 32],
-                    param_attr='emb.w',
-                    is_sparse=False)
-                embs.append(emb)
-
-            embs = layers.concat(input=embs, axis=1)
-            nce_loss = layers.nce(input=embs,
-                                  label=words[label_word],
-                                  num_total_classes=dict_size,
-                                  num_neg_samples=2,
-                                  sampler="custom_dist",
-                                  custom_dist=nid_freq_arr.tolist(),
-                                  seed=seed,
-                                  param_attr='nce.w',
-                                  bias_attr='nce.b',
-                                  sample_weight=sample_weights)
-            feed_dict = dict()
-            for i in range(window_size):
-                feed_dict['word_{0}'.format(i)] = inp_word[i]
-            static_rlt = self.get_static_graph_result(
-                feed=feed_dict, fetch_list=[nce_loss])[0]
-        with self.static_graph():
-            words = []
-            for i in range(window_size):
-                words.append(
-                    layers.data(
-                        name='word_{0}'.format(i), shape=[1], dtype='int64'))
-            sample_weights = layers.fill_constant(
-                shape=[5, 1], dtype='float32', value=1)
-            emb = nn.Embedding(
-                'embedding',
-                size=[dict_size, 32],
-                param_attr='emb.w',
-                is_sparse=False)
-
-            embs2 = []
-            for i in range(window_size):
-                if i == label_word:
-                    continue
-
-                emb_rlt = emb(words[i])
-                embs2.append(emb_rlt)
-
-            embs2 = layers.concat(input=embs2, axis=1)
-            nce = nn.NCE('nce',
-                         num_total_classes=dict_size,
-                         num_neg_samples=2,
-                         sampler="custom_dist",
-                         custom_dist=nid_freq_arr.tolist(),
-                         seed=seed,
-                         param_attr='nce.w',
-                         bias_attr='nce.b',
-                         sample_weight=sample_weights)
-
-            nce_loss2 = nce(embs2, words[label_word])
-            feed_dict = dict()
-            for i in range(len(words)):
-                feed_dict['word_{0}'.format(i)] = inp_word[i]
-
-            static_rlt2 = self.get_static_graph_result(
-                feed=feed_dict, fetch_list=[nce_loss2])[0]
-
-        with self.dynamic_graph(force_to_use_cpu=True):
-            words = []
-            for i in range(window_size):
-                words.append(base.to_variable(inp_word[i]))
-            sample_weights = layers.fill_constant(
-                shape=[5, 1], dtype='float32', value=1)
-            emb = nn.Embedding(
-                'embedding',
-                size=[dict_size, 32],
-                param_attr='emb.w',
-                is_sparse=False)
-
-            embs3 = []
-            for i in range(window_size):
-                if i == label_word:
-                    continue
-
-                emb_rlt = emb(words[i])
-                embs3.append(emb_rlt)
-
-            embs3 = layers.concat(input=embs3, axis=1)
-            nce = nn.NCE('nce',
-                         num_total_classes=dict_size,
-                         num_neg_samples=2,
-                         sampler="custom_dist",
-                         custom_dist=nid_freq_arr.tolist(),
-                         seed=seed,
-                         param_attr='nce.w',
-                         bias_attr='nce.b',
-                         sample_weight=sample_weights)
-
-            nce_loss3 = nce(embs3, words[label_word])
-
-        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(nce_loss3.numpy(), static_rlt))
-
-    def test_conv3d(self):
-        with self.static_graph():
-            images = layers.data(
-                name='pixel', shape=[3, 6, 6, 6], dtype='float32')
-            ret = layers.conv3d(input=images, num_filters=3, filter_size=2)
-            static_ret = self.get_static_graph_result(
-                feed={'pixel': np.ones(
-                    [2, 3, 6, 6, 6], dtype='float32')},
-                fetch_list=[ret])[0]
-
-        with self.static_graph():
-            images = layers.data(
-                name='pixel', shape=[3, 6, 6, 6], dtype='float32')
-            conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2)
-            ret = conv3d(images)
-            static_ret2 = self.get_static_graph_result(
-                feed={'pixel': np.ones(
-                    [2, 3, 6, 6, 6], dtype='float32')},
-                fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            images = np.ones([2, 3, 6, 6, 6], dtype='float32')
-            conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2)
-            dy_ret = conv3d(base.to_variable(images))
-
-        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-        self.assertTrue(np.allclose(static_ret, static_ret2))
-
-    def test_row_conv(self):
-        input = np.arange(15).reshape([3, 5]).astype('float32')
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        with self.static_graph():
-            x = layers.data(
-                name='X',
-                shape=[3, 5],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            ret = layers.row_conv(input=x, future_context_size=2)
-            static_ret = self.get_static_graph_result(
-                feed={
-                    'X': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
-
-        with self.static_graph():
-            x = layers.data(
-                name='X',
-                shape=[3, 5],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            rowConv = nn.RowConv('RowConv', future_context_size=2)
-            ret = rowConv(x)
-            static_ret2 = self.get_static_graph_result(
-                feed={
-                    'X': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
-
-        # TODO: dygraph can't support LODTensor
-
-        self.assertTrue(np.allclose(static_ret, static_ret2))
-
-    def test_group_norm(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        shape = (2, 4, 3, 3)
-
-        input = np.random.random(shape).astype('float32')
-
-        with self.static_graph():
-            X = fluid.layers.data(
-                name='X',
-                shape=shape,
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            ret = layers.group_norm(input=X, groups=2)
-            static_ret = self.get_static_graph_result(
-                feed={
-                    'X': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
-
-        with self.static_graph():
-            X = fluid.layers.data(
-                name='X',
-                shape=shape,
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            groupNorm = nn.GroupNorm('GroupNorm', groups=2)
-            ret = groupNorm(X)
-            static_ret2 = self.get_static_graph_result(
-                feed={
-                    'X': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
-
-        with self.dynamic_graph():
-            groupNorm = nn.GroupNorm('GroupNorm', groups=2)
-            dy_ret = groupNorm(base.to_variable(input))
-
-        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-        self.assertTrue(np.allclose(static_ret, static_ret2))
-
-    def test_spectral_norm(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-
-        shape = (2, 4, 3, 3)
-
-        input = np.random.random(shape).astype('float32')
-
-        with self.static_graph():
-            Weight = fluid.layers.data(
-                name='Weight',
-                shape=shape,
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            ret = layers.spectral_norm(weight=Weight, dim=1, power_iters=2)
-            static_ret = self.get_static_graph_result(
-                feed={
-                    'Weight': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1]], place=place),
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
-
-        with self.static_graph():
-            Weight = fluid.layers.data(
-                name='Weight',
-                shape=shape,
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2)
-            ret = spectralNorm(Weight)
-            static_ret2 = self.get_static_graph_result(
-                feed={
-                    'Weight': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
-
-        with self.dynamic_graph():
-            spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2)
-            dy_ret = spectralNorm(base.to_variable(input))
-
-        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-        self.assertTrue(np.allclose(static_ret, static_ret2))
-
-    def test_tree_conv(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-        adj_array = [1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10]
-        adj = np.array(adj_array).reshape((1, 9, 2)).astype('int32')
-        adj = np.tile(adj, (1, 1, 1))
-        vectors = np.random.random((1, 10, 5)).astype('float32')
-        with self.static_graph():
-            NodesVector = fluid.layers.data(
-                name='NodesVector',
-                shape=(1, 10, 5),
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            EdgeSet = fluid.layers.data(
-                name='EdgeSet',
-                shape=(1, 9, 2),
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            ret = fluid.contrib.layers.tree_conv(
-                nodes_vector=NodesVector,
-                edge_set=EdgeSet,
-                output_size=6,
-                num_filters=1,
-                max_depth=2)
-            static_ret = self.get_static_graph_result(
-                feed={
-                    'NodesVector': fluid.create_lod_tensor(
-                        data=vectors, recursive_seq_lens=[[1]], place=place),
-                    'EdgeSet': fluid.create_lod_tensor(
-                        data=adj, recursive_seq_lens=[[1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=False)[0]
-
-        with self.static_graph():
-            NodesVector = fluid.layers.data(
-                name='NodesVector',
-                shape=(1, 10, 5),
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            EdgeSet = fluid.layers.data(
-                name='EdgeSet',
-                shape=(1, 9, 2),
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            treeConv = nn.TreeConv(
-                'TreeConv', output_size=6, num_filters=1, max_depth=2)
-            ret = treeConv(NodesVector, EdgeSet)
-            static_ret2 = self.get_static_graph_result(
-                feed={
-                    'NodesVector': fluid.create_lod_tensor(
-                        data=vectors, recursive_seq_lens=[[1]], place=place),
-                    'EdgeSet': fluid.create_lod_tensor(
-                        data=adj, recursive_seq_lens=[[1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=False)[0]
-
-        with self.dynamic_graph():
-            treeConv = nn.TreeConv(
-                'SpectralNorm', output_size=6, num_filters=1, max_depth=2)
-            dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj))
-
-        self.assertTrue(np.allclose(static_ret, static_ret2))
-        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-
-    def test_conv3d_transpose(self):
-        input_array = np.arange(0, 48).reshape(
-            [2, 3, 2, 2, 2]).astype('float32')
-
-        with self.static_graph():
-            img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32')
-            out = layers.conv3d_transpose(
-                input=img, num_filters=12, filter_size=12, use_cudnn=False)
-            static_rlt = self.get_static_graph_result(
-                feed={'pixel': input_array}, fetch_list=[out])[0]
-        with self.static_graph():
-            img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32')
-            conv3d_transpose = nn.Conv3DTranspose(
-                'Conv3DTranspose',
-                num_filters=12,
-                filter_size=12,
-                use_cudnn=False)
-            out = conv3d_transpose(img)
-            static_rlt2 = self.get_static_graph_result(
-                feed={'pixel': input_array}, fetch_list=[out])[0]
-        with self.dynamic_graph():
-            conv3d_transpose = nn.Conv3DTranspose(
-                'Conv3DTranspose',
-                num_filters=12,
-                filter_size=12,
-                use_cudnn=False)
-            dy_rlt = conv3d_transpose(base.to_variable(input_array))
-        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
-
-    def test_eye_op(self):
-        np_eye = np.eye(3, 2)
-        array_rlt1 = [np_eye for _ in range(3)]
-        stack_rlt1 = np.stack(array_rlt1, axis=0)
-        array_rlt2 = [stack_rlt1 for _ in range(4)]
-        stack_rlt2 = np.stack(array_rlt2, axis=0)
-
-        with self.dynamic_graph():
-            eye_tensor = layers.eye(num_rows=3, num_columns=2)
-            eye_tensor_rlt1 = layers.eye(num_rows=3,
-                                         num_columns=2,
-                                         batch_shape=[3])
-            eye_tensor_rlt2 = layers.eye(num_rows=3,
-                                         num_columns=2,
-                                         batch_shape=[4, 3])
-            diag_tensor = layers.eye(20)
-
-        self.assertTrue(np.allclose(eye_tensor.numpy(), np_eye))
-        self.assertTrue(np.allclose(eye_tensor_rlt1.numpy(), stack_rlt1))
-        self.assertTrue(np.allclose(eye_tensor_rlt2.numpy(), stack_rlt2))
-        self.assertTrue(np.allclose(diag_tensor.numpy(), np.eye(20)))
-
-        with self.assertRaises(TypeError):
-            layers.eye(num_rows=3.1)
-        with self.assertRaises(TypeError):
-            layers.eye(num_rows=3, num_columns=2.2)
-        with self.assertRaises(TypeError):
-            layers.eye(num_rows=3, batch_shape=2)
-        with self.assertRaises(TypeError):
-            layers.eye(num_rows=3, batch_shape=[-1])
-
-    def test_hard_swish(self):
-        with self.static_graph():
-            t = layers.data(name='t', shape=[3, 3], dtype='float32')
-            ret = layers.hard_swish(t)
-            static_ret = self.get_static_graph_result(
-                feed={'t': np.ones(
-                    [3, 3], dtype='float32')}, fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            t = np.ones([3, 3], dtype='float32')
-            dy_ret = layers.hard_swish(base.to_variable(t))
-
-        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-
-    def test_compare(self):
-        value_a = np.arange(3)
-        value_b = np.arange(3)
-        # less than
-        with self.static_graph():
-            a = layers.data(name='a', shape=[1], dtype='int64')
-            b = layers.data(name='b', shape=[1], dtype='int64')
-            cond = layers.less_than(x=a, y=b)
-            static_ret = self.get_static_graph_result(
-                feed={"a": value_a,
-                      "b": value_b}, fetch_list=[cond])[0]
-        with self.dynamic_graph():
-            da = base.to_variable(value_a)
-            db = base.to_variable(value_b)
-            dcond = layers.less_than(x=da, y=db)
-
-        for i in range(len(static_ret)):
-            self.assertTrue(dcond.numpy()[i] == static_ret[i])
-
-        # less equal
-        with self.static_graph():
-            a1 = layers.data(name='a1', shape=[1], dtype='int64')
-            b1 = layers.data(name='b1', shape=[1], dtype='int64')
-            cond1 = layers.less_equal(x=a1, y=b1)
-            static_ret1 = self.get_static_graph_result(
-                feed={"a1": value_a,
-                      "b1": value_b}, fetch_list=[cond1])[0]
-        with self.dynamic_graph():
-            da1 = base.to_variable(value_a)
-            db1 = base.to_variable(value_b)
-            dcond1 = layers.less_equal(x=da1, y=db1)
-
-            for i in range(len(static_ret1)):
-                self.assertTrue(dcond1.numpy()[i] == static_ret1[i])
-
-        #greater than
-        with self.static_graph():
-            a2 = layers.data(name='a2', shape=[1], dtype='int64')
-            b2 = layers.data(name='b2', shape=[1], dtype='int64')
-            cond2 = layers.greater_than(x=a2, y=b2)
-            static_ret2 = self.get_static_graph_result(
-                feed={"a2": value_a,
-                      "b2": value_b}, fetch_list=[cond2])[0]
-        with self.dynamic_graph():
-            da2 = base.to_variable(value_a)
-            db2 = base.to_variable(value_b)
-            dcond2 = layers.greater_than(x=da2, y=db2)
-
-            for i in range(len(static_ret2)):
-                self.assertTrue(dcond2.numpy()[i] == static_ret2[i])
-
-        #greater equal
-        with self.static_graph():
-            a3 = layers.data(name='a3', shape=[1], dtype='int64')
-            b3 = layers.data(name='b3', shape=[1], dtype='int64')
-            cond3 = layers.greater_equal(x=a3, y=b3)
-            static_ret3 = self.get_static_graph_result(
-                feed={"a3": value_a,
-                      "b3": value_b}, fetch_list=[cond3])[0]
-        with self.dynamic_graph():
-            da3 = base.to_variable(value_a)
-            db3 = base.to_variable(value_b)
-            dcond3 = layers.greater_equal(x=da3, y=db3)
-
-            for i in range(len(static_ret3)):
-                self.assertTrue(dcond3.numpy()[i] == static_ret3[i])
-
-        # equal
-        with self.static_graph():
-            a4 = layers.data(name='a4', shape=[1], dtype='int64')
-            b4 = layers.data(name='b4', shape=[1], dtype='int64')
-            cond4 = layers.equal(x=a4, y=b4)
-            static_ret4 = self.get_static_graph_result(
-                feed={"a4": value_a,
-                      "b4": value_b}, fetch_list=[cond4])[0]
-        with self.dynamic_graph():
-            da4 = base.to_variable(value_a)
-            db4 = base.to_variable(value_b)
-            dcond4 = layers.equal(x=da4, y=db4)
-
-            for i in range(len(static_ret4)):
-                self.assertTrue(dcond4.numpy()[i] == static_ret4[i])
-
-        # not equal
-        with self.static_graph():
-            a5 = layers.data(name='a5', shape=[1], dtype='int64')
-            b5 = layers.data(name='b5', shape=[1], dtype='int64')
-            cond5 = layers.equal(x=a5, y=b5)
-            static_ret5 = self.get_static_graph_result(
-                feed={"a5": value_a,
-                      "b5": value_b}, fetch_list=[cond5])[0]
-        with self.dynamic_graph():
-            da5 = base.to_variable(value_a)
-            db5 = base.to_variable(value_b)
-            dcond5 = layers.equal(x=da5, y=db5)
-
-            for i in range(len(static_ret5)):
-                self.assertTrue(dcond5.numpy()[i] == static_ret5[i])
-
-    def test_crop_tensor(self):
-        with self.static_graph():
-            x = fluid.layers.data(name="x1", shape=[6, 5, 8])
-
-            dim1 = fluid.layers.data(
-                name="dim1", shape=[1], append_batch_size=False)
-            dim2 = fluid.layers.data(
-                name="dim2", shape=[1], append_batch_size=False)
-            crop_shape1 = (1, 2, 4, 4)
-            crop_shape2 = fluid.layers.data(
-                name="crop_shape", shape=[4], append_batch_size=False)
-            crop_shape3 = [-1, dim1, dim2, 4]
-            crop_offsets1 = [0, 0, 1, 0]
-            crop_offsets2 = fluid.layers.data(
-                name="crop_offset", shape=[4], append_batch_size=False)
-            crop_offsets3 = [0, dim1, dim2, 0]
-
-            out1 = fluid.layers.crop_tensor(
-                x, shape=crop_shape1, offsets=crop_offsets1)
-            out2 = fluid.layers.crop_tensor(
-                x, shape=crop_shape2, offsets=crop_offsets2)
-            out3 = fluid.layers.crop_tensor(
-                x, shape=crop_shape3, offsets=crop_offsets3)
-
-            self.assertIsNotNone(out1)
-            self.assertIsNotNone(out2)
-            self.assertIsNotNone(out3)
-
-
-class TestBook(LayerTest):
-    def test_all_layers(self):
-        attrs = (getattr(self, name) for name in dir(self))
-        methods = filter(inspect.ismethod, attrs)
-        for method in methods:
-            if not method.__name__.startswith('make_'):
-                continue
-            self._low_data_bound = 0
-            self._high_data_bound = 2
-            self._batch_size = 2
-            self._feed_dict = {}
-            self._force_to_use_cpu = False
-            with self.static_graph():
-                static_var = method()
-                if isinstance(static_var, tuple):
-                    static_var = static_var[0]
-
-                if static_var is not None:
-                    fetch_list = [static_var.name]
-                    static_result = self.get_static_graph_result(
-                        feed=self._feed_dict,
-                        fetch_list=fetch_list,
-                        force_to_use_cpu=self._force_to_use_cpu)
-                else:
-                    assert method.__name__ in ('make_get_places')
-                    continue
-
-            with self.dynamic_graph(self._force_to_use_cpu):
-                dy_result = method()
-                if isinstance(dy_result, tuple):
-                    dy_result = dy_result[0]
-
-        self.assertTrue(np.array_equal(static_result[0], dy_result.numpy()))
-
-    def _get_np_data(self, shape, dtype, append_batch_size=True):
-        np.random.seed(self.seed)
-        if append_batch_size:
-            shape = [self._batch_size] + shape
-        if dtype == 'float32':
-            return np.random.random(shape).astype(dtype)
-        elif dtype == 'float64':
-            return np.random.random(shape).astype(dtype)
-        elif dtype == 'int32':
-            return np.random.randint(self._low_data_bound,
-                                     self._high_data_bound, shape).astype(dtype)
-        elif dtype == 'int64':
-            return np.random.randint(self._low_data_bound,
-                                     self._high_data_bound, shape).astype(dtype)
-
-    def _get_data(self,
-                  name,
-                  shape,
-                  dtype,
-                  set_feed_dict=True,
-                  append_batch_size=True):
-        if base.enabled():
-            return base.to_variable(
-                value=self._get_np_data(shape, dtype, append_batch_size),
-                name=name)
-        else:
-            if set_feed_dict:
-                self._feed_dict[name] = self._get_np_data(shape, dtype,
-                                                          append_batch_size)
-            return layers.data(
-                name=name,
-                shape=shape,
-                dtype=dtype,
-                append_batch_size=append_batch_size)
-
-    def make_sampled_softmax_with_cross_entropy(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            logits = self._get_data(name='Logits', shape=[256], dtype='float32')
-            label = self._get_data(name='Label', shape=[1], dtype='int64')
-            num_samples = 25
-            output = layers.sampled_softmax_with_cross_entropy(logits, label,
-                                                               num_samples)
-            return (output)
-
-    def make_fit_a_line(self):
-        with program_guard(
-                fluid.default_main_program(),
-                startup_program=fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[13], dtype='float32')
-            y_predict = layers.fc(input=x, size=1, act=None)
-            y = self._get_data(name='y', shape=[1], dtype='float32')
-            cost = layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = layers.mean(cost)
-            return (avg_cost)
-
-    def make_recognize_digits_mlp(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            # Change g_program, so the rest layers use `g_program`
-            images = self._get_data(name='pixel', shape=[784], dtype='float32')
-            label = self._get_data(name='label', shape=[1], dtype='int64')
-            hidden1 = layers.fc(input=images, size=128, act='relu')
-            hidden2 = layers.fc(input=hidden1, size=64, act='relu')
-            predict = layers.fc(input=[hidden2, hidden1],
-                                size=10,
-                                act='softmax',
-                                param_attr=["sftmax.w1", "sftmax.w2"])
-            cost = layers.cross_entropy(input=predict, label=label)
-            avg_cost = layers.mean(cost)
-            return (avg_cost)
-
-    def make_conv2d_transpose(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            img = self._get_data(name='pixel', shape=[3, 2, 2], dtype='float32')
-            return layers.conv2d_transpose(
-                input=img, num_filters=10, output_size=28)
-
-    def make_recognize_digits_conv(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            images = self._get_data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
-            label = self._get_data(name='label', shape=[1], dtype='int64')
-            conv_pool_1 = nets.simple_img_conv_pool(
-                input=images,
-                filter_size=5,
-                num_filters=2,
-                pool_size=2,
-                pool_stride=2,
-                act="relu")
-            conv_pool_2 = nets.simple_img_conv_pool(
-                input=conv_pool_1,
-                filter_size=5,
-                num_filters=4,
-                pool_size=2,
-                pool_stride=2,
-                act="relu")
-
-            predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
-            cost = layers.cross_entropy(input=predict, label=label)
-            avg_cost = layers.mean(cost)
-            return avg_cost
-
-    def make_word_embedding(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            dict_size = 10000
-            embed_size = 32
-            first_word = self._get_data(name='firstw', shape=[1], dtype='int64')
-            second_word = self._get_data(
-                name='secondw', shape=[1], dtype='int64')
-            third_word = self._get_data(name='thirdw', shape=[1], dtype='int64')
-            forth_word = self._get_data(name='forthw', shape=[1], dtype='int64')
-            next_word = self._get_data(name='nextw', shape=[1], dtype='int64')
-
-            embed_first = layers.embedding(
-                input=first_word,
-                size=[dict_size, embed_size],
-                dtype='float32',
-                param_attr='shared_w')
-            embed_second = layers.embedding(
-                input=second_word,
-                size=[dict_size, embed_size],
-                dtype='float32',
-                param_attr='shared_w')
-
-            embed_third = layers.embedding(
-                input=third_word,
-                size=[dict_size, embed_size],
-                dtype='float32',
-                param_attr='shared_w')
-            embed_forth = layers.embedding(
-                input=forth_word,
-                size=[dict_size, embed_size],
-                dtype='float32',
-                param_attr='shared_w')
-
-            concat_embed = layers.concat(
-                input=[embed_first, embed_second, embed_third, embed_forth],
-                axis=1)
-
-            hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
-            predict_word = layers.fc(input=hidden1,
-                                     size=dict_size,
-                                     act='softmax')
-            cost = layers.cross_entropy(input=predict_word, label=next_word)
-            avg_cost = layers.mean(cost)
-            return (avg_cost)
-
-    def make_sigmoid_cross_entropy(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            dat = self._get_data(name='data', shape=[10], dtype='float32')
-            lbl = self._get_data(name='label', shape=[10], dtype='float32')
-            ignore_index = -1
-            return (layers.sigmoid_cross_entropy_with_logits(
-                x=dat, label=lbl, ignore_index=ignore_index))
-
-    def make_hsigmoid(self):
-        self._force_to_use_cpu = True
-        with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()):
-            x = self._get_data(name='x', shape=[2], dtype='float32')
-            y = self._get_data(name='y', shape=[2], dtype='int64')
-            return (layers.hsigmoid(input=x, label=y, num_classes=2))
-
-        # test hsigmod with custom tree structure
-        program2 = Program()
-        with program_guard(program2):
-            x2 = self._get_data(name='x2', shape=[4, 8], dtype='float32')
-            y2 = self._get_data(name='y2', shape=[4], dtype='int64')
-            path_table = self._get_data(
-                name='path_table', shape=[4, 6], dtype='int64')
-            path_code = self._get_data(
-                name='path_code', shape=[4, 6], dtype='int64')
-            return (layers.hsigmoid(
-                input=x2,
-                label=y2,
-                num_classes=6,
-                path_table=path_table,
-                path_code=path_code,
-                is_custom=True))
-
-    def make_pool2d(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 224, 224], dtype='float32')
-            return (layers.pool2d(
-                x, pool_size=[5, 3], pool_stride=[1, 2], pool_padding=(2, 1)))
-
-    def make_adaptive_pool2d(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 224, 224], dtype='float32')
-            return (layers.adaptive_pool2d(x, [3, 3], pool_type='avg'))
-            pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True)
-            return (pool)
-            return (mask)
-            return (layers.adaptive_pool2d(x, 3, pool_type='avg'))
-            pool, mask = layers.adaptive_pool2d(x, 3, require_index=True)
-            return (pool)
-            return (mask)
-
-    def make_adaptive_pool3d(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(
-                name='x', shape=[3, 244, 224, 224], dtype='float32')
-            return (layers.adaptive_pool3d(x, [3, 3, 3], pool_type='avg'))
-            pool, mask = layers.adaptive_pool3d(
-                x, [3, 3, 3], require_index=True)
-            return (pool)
-            return (mask)
-            return (layers.adaptive_pool3d(x, 3, pool_type='avg'))
-            pool, mask = layers.adaptive_pool3d(x, 3, require_index=True)
-            return (pool)
-            return (mask)
-
-    def make_lstm_unit(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x_t_data = self._get_data(
-                name='x_t_data', shape=[10, 10], dtype='float32')
-            x_t = layers.fc(input=x_t_data, size=10)
-            prev_hidden_data = self._get_data(
-                name='prev_hidden_data', shape=[10, 30], dtype='float32')
-            prev_hidden = layers.fc(input=prev_hidden_data, size=30)
-            prev_cell_data = self._get_data(
-                name='prev_cell', shape=[10, 30], dtype='float32')
-            prev_cell = layers.fc(input=prev_cell_data, size=30)
-            return (layers.lstm_unit(
-                x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
-
-    def make_softmax(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            data = self._get_data(name='data', shape=[10], dtype='float32')
-            hid = layers.fc(input=data, size=20)
-            return (layers.softmax(hid, axis=1))
-
-    def make_space_to_depth(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            data = self._get_data(
-                name='data',
-                shape=[32, 9, 6, 6],
-                append_batch_size=False,
-                dtype='float32')
-            return (layers.space_to_depth(data, 3))
-
-    def make_lrn(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            data = self._get_data(name='data', shape=[6, 2, 2], dtype='float32')
-            return (layers.lrn(data))
-
-    def make_get_places(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            get_places(device_count=1)
-
-    @prog_scope()
-    def make_nce(self):
-        window_size = 5
-        words = []
-        for i in range(window_size):
-            words.append(
-                self._get_data(
-                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
-
-        dict_size = 10000
-        label_word = int(window_size // 2) + 1
-
-        embs = []
-        for i in range(window_size):
-            if i == label_word:
-                continue
-
-            emb = layers.embedding(
-                input=words[i],
-                size=[dict_size, 32],
-                param_attr='emb.w',
-                is_sparse=True)
-
-            embs.append(emb)
-
-        embs = layers.concat(input=embs, axis=1)
-        loss = layers.nce(input=embs,
-                          label=words[label_word],
-                          num_total_classes=dict_size,
-                          param_attr='nce.w',
-                          bias_attr='nce.b')
-        avg_loss = layers.mean(loss)
-        return (avg_loss)
-
-    def make_multiplex(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x1 = self._get_data(name='x1', shape=[4], dtype='float32')
-            x2 = self._get_data(name='x2', shape=[4], dtype='float32')
-            index = self._get_data(name='index', shape=[1], dtype='int32')
-            out = layers.multiplex(inputs=[x1, x2], index=index)
-            return (out)
-
-    def make_softmax_with_cross_entropy(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[16], dtype='float32')
-            y = self._get_data(name='label', shape=[1], dtype='int64')
-            loss, softmax = layers.softmax_with_cross_entropy(
-                x, y, return_softmax=True)
-            self.assertIsNotNone(loss)
-            self.assertIsNotNone(softmax)
-
-            loss = layers.softmax_with_cross_entropy(x, y)
-            self.assertIsNotNone(loss)
-
-            x1 = self._get_data(name='x1', shape=[16, 32, 64], dtype='float32')
-            y1 = self._get_data(name='label1', shape=[1, 32, 64], dtype='int64')
-            y2 = self._get_data(name='label2', shape=[16, 1, 64], dtype='int64')
-            y3 = self._get_data(name='label3', shape=[16, 32, 1], dtype='int64')
-            loss1 = layers.softmax_with_cross_entropy(x1, y1, axis=1)
-            loss2 = layers.softmax_with_cross_entropy(x1, y2, axis=2)
-            loss3 = layers.softmax_with_cross_entropy(x1, y3, axis=3)
-            loss4 = layers.softmax_with_cross_entropy(x1, y3, axis=-1)
-            self.assertIsNotNone(loss1)
-            self.assertIsNotNone(loss2)
-            self.assertIsNotNone(loss3)
-            self.assertIsNotNone(loss4)
-            return (loss4)
-
-    def make_smooth_l1(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[4], dtype='float32')
-            y = self._get_data(name='label', shape=[4], dtype='float32')
-            loss = layers.smooth_l1(x, y)
-            return (loss)
-
-    def make_scatter(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(
-                name='x',
-                shape=[3, 3],
-                append_batch_size=False,
-                dtype='float32')
-            idx = self._get_data(
-                name='idx', shape=[2], append_batch_size=False, dtype='int32')
-            updates = self._get_data(
-                name='updates',
-                shape=[2, 3],
-                append_batch_size=False,
-                dtype='float32')
-            out = layers.scatter(input=x, index=idx, updates=updates)
-            return (out)
-
-    def make_one_hot(self):
-        with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()):
-            label = self._get_data(name="label", shape=[1], dtype="int32")
-            one_hot_label = layers.one_hot(input=label, depth=10)
-            return (one_hot_label)
-
-    def make_label_smooth(self):
-        # TODO(minqiyang): support gpu ut
-        self._force_to_use_cpu = True
-        with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()):
-            label = self._get_data(name="label", shape=[1], dtype="int32")
-            one_hot_label = layers.one_hot(input=label, depth=10)
-            smooth_label = layers.label_smooth(
-                label=one_hot_label, epsilon=0.1, dtype="int32")
-            return (smooth_label)
-
-    def make_topk(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            data = self._get_data(name="label", shape=[200], dtype="float32")
-            values, indices = layers.topk(data, k=5)
-            return (values)
-            return (indices)
-
-    def make_resize_bilinear(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 9, 6], dtype="float32")
-            output = layers.resize_bilinear(x, out_shape=[12, 12])
-            return (output)
-
-    def make_resize_bilinear_by_scale(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 9, 6], dtype="float32")
-            output = layers.resize_bilinear(x, scale=1.5)
-            return (output)
-
-    def make_resize_nearest(self):
-        try:
-            with program_guard(fluid.default_main_program(),
-                               fluid.default_startup_program()):
-                x = self._get_data(name='x1', shape=[3, 9, 6], dtype="float32")
-                output = layers.resize_nearest(x, out_shape=[12, 12])
-        except ValueError:
-            pass
-
-        try:
-            with program_guard(fluid.default_main_program(),
-                               fluid.default_startup_program()):
-                x = self._get_data(
-                    name='x2', shape=[3, 9, 6, 7], dtype="float32")
-                output = layers.resize_nearest(x, out_shape=[12, 12, 12])
-        except ValueError:
-            pass
-
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 9, 6], dtype="float32")
-            output = layers.resize_nearest(x, out_shape=[12, 12])
-            return (output)
-
-    def make_resize_nearest_by_scale(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x1', shape=[3, 9, 6], dtype="float32")
-            output = layers.resize_nearest(x, scale=1.8)
-            return (output)
-
-    def make_resize_trilinear(self):
-        try:
-            with program_guard(fluid.default_main_program(),
-                               fluid.default_startup_program()):
-                x = self._get_data(name='x2', shape=[3, 9, 6], dtype="float32")
-                output = layers.resize_trilinear(x, out_shape=[12, 12, 12])
-        except ValueError:
-            pass
-
-        try:
-            with program_guard(fluid.default_main_program(),
-                               fluid.default_startup_program()):
-                x = self._get_data(
-                    name='x', shape=[3, 9, 6, 7], dtype="float32")
-                output = layers.resize_trilinear(x, out_shape=[12, 12])
-        except ValueError:
-            pass
-
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 9, 6, 7], dtype="float32")
-            output = layers.resize_trilinear(x, out_shape=[12, 12, 12])
-            return (output)
-
-    def make_resize_trilinear_by_scale(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 9, 6, 7], dtype="float32")
-            output = layers.resize_trilinear(x, scale=2.1)
-            return (output)
-
-    def make_polygon_box_transform(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[8, 4, 4], dtype="float32")
-            output = layers.polygon_box_transform(input=x)
-            return (output)
-
-    def make_l2_normalize(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[8, 7, 10], dtype="float32")
-            output = layers.l2_normalize(x, axis=1)
-            return output
-
-    def make_maxout(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            data = self._get_data(name='x', shape=[8, 6, 6], dtype="float32")
-            output = layers.maxout(x=data, groups=2)
-            return (output)
-
-    def make_crop(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 5], dtype="float32")
-            y = self._get_data(name='y', shape=[2, 3], dtype="float32")
-            output = layers.crop(x, shape=y)
-            return (output)
-
-    def make_mean_iou(self):
-        with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()):
-            x = self._get_data(name='x', shape=[16], dtype='int32')
-            y = self._get_data(name='label', shape=[16], dtype='int32')
-            iou = layers.mean_iou(x, y, self._high_data_bound)
-            return (iou)
-
-    def make_argsort(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            data = self._get_data(name='x', shape=[2, 3, 3], dtype="float32")
-            out, ids = layers.argsort(input=data, axis=1)
-            return (out)
-            return (ids)
-
-    def make_rank_loss(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            label = self._get_data(
-                name='label',
-                append_batch_size=False,
-                shape=[16, 1],
-                dtype="float32")
-            left = self._get_data(
-                name='left',
-                append_batch_size=False,
-                shape=[16, 1],
-                dtype="float32")
-            right = self._get_data(
-                name='right',
-                append_batch_size=False,
-                shape=[16, 1],
-                dtype="float32")
-            out = layers.rank_loss(label, left, right, name="rank_loss")
-            return (out)
-
-    def make_shape(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[3, 100, 100], dtype="float32")
-            out = layers.shape(input)
-            return (out)
-
-    def make_pad2d(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[3, 100, 100], dtype="float32")
-            paddings = layers.fill_constant(shape=[4], dtype='int32', value=1)
-            out = layers.pad2d(
-                input,
-                paddings=[1, 2, 3, 4],
-                mode='reflect',
-                data_format='NCHW',
-                name="shape")
-            out_1 = layers.pad2d(
-                input,
-                paddings=paddings,
-                mode='reflect',
-                data_format='NCHW',
-                name="shape")
-            return (out)
-            return (out_1)
-
-    def make_prelu(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[5, 200, 100, 100], dtype="float32")
-            mode = 'channel'
-            out = layers.prelu(
-                input,
-                mode,
-                param_attr=ParamAttr(initializer=Constant(1.0)),
-                name='prelu')
-            return (out)
-
-    def make_brelu(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.brelu(input, t_min=1.0, t_max=20.0, name='brelu')
-            return (out)
-
-    def make_leaky_relu(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.leaky_relu(input, alpha=0.1, name='leaky_relu')
-            return (out)
-
-    def make_soft_relu(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.soft_relu(input, threshold=30.0, name='soft_relu')
-            return (out)
-
-    def make_sigmoid(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.sigmoid(input, name='sigmoid')
-            return (out)
-
-    def make_logsigmoid(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.logsigmoid(input, name='logsigmoid')
-            return (out)
-
-    def make_exp(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.exp(input, name='exp')
-            return (out)
-
-    def make_tanh(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.tanh(input, name='tanh')
-            return (out)
-
-    def make_tanh_shrink(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.tanh_shrink(input, name='tanh_shrink')
-            return (out)
-
-    def make_sqrt(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.sqrt(input, name='sqrt')
-            return (out)
-
-    def make_abs(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.abs(input, name='abs')
-            return (out)
-
-    def make_ceil(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.ceil(input, name='ceil')
-            return (out)
-
-    def make_floor(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.floor(input, name='floor')
-            return (out)
-
-    def make_cos(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.cos(input, name='cos')
-            return (out)
-
-    def make_sin(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.sin(input, name='sin')
-            return (out)
-
-    def make_round(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.round(input, name='round')
-            return (out)
-
-    def make_reciprocal(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.reciprocal(input, name='reciprocal')
-            return (out)
-
-    def make_square(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.square(input, name='square')
-            return (out)
-
-    def make_softplus(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.softplus(input, name='softplus')
-            return (out)
-
-    def make_softsign(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.softsign(input, name='softsign')
-            return (out)
-
-    def make_cross_entropy(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name="x", shape=[30, 10], dtype="float32")
-            label = self._get_data(name="label", shape=[30, 1], dtype="int64")
-            mode = 'channel'
-            out = layers.cross_entropy(x, label, False, 4)
-            return (out)
-
-    def make_bpr_loss(self):
-        self._force_to_use_cpu = True
-        with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()):
-            x = self._get_data(name="x", shape=[30, 10], dtype="float32")
-            label = self._get_data(name="label", shape=[30, 1], dtype="int64")
-            out = layers.bpr_loss(x, label)
-            return (out)
-
-    def make_expand(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name="input", shape=[10], dtype='int32')
-            out = layers.expand(x, [1, 2])
-            return out
-
-    def make_uniform_random_batch_size_like(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[13, 11], dtype='float32')
-            out = layers.uniform_random_batch_size_like(input, [-1, 11])
-            return (out)
-
-    def make_gaussian_random(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            out = layers.gaussian_random(shape=[20, 30])
-            return (out)
-
-    def make_sampling_id(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(
-                name="X",
-                shape=[13, 11],
-                dtype='float32',
-                append_batch_size=False)
-
-            out = layers.sampling_id(x)
-            return (out)
-
-    def make_gaussian_random_batch_size_like(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[13, 11], dtype='float32')
-
-            out = layers.gaussian_random_batch_size_like(
-                input, shape=[-1, 11], mean=1.0, std=2.0)
-            return (out)
-
-    def make_sum(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[13, 11], dtype='float32')
-
-            out = layers.sum(input)
-            return (out)
-
-    def make_slice(self):
-        starts = [1, 0, 2]
-        ends = [3, 3, 4]
-        axes = [0, 1, 2]
-
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[3, 4, 5, 6], dtype='float32')
-
-            out = layers.slice(input, axes=axes, starts=starts, ends=ends)
-            return out
-
-    def make_softshrink(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.softshrink(input, alpha=0.3)
-            return (out)
-
-    def make_iou_similarity(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name="x", shape=[4], dtype="float32")
-            y = self._get_data(name="y", shape=[4], dtype="float32")
-            out = layers.iou_similarity(x, y, name='iou_similarity')
-            return (out)
-
-    def make_grid_sampler(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 5, 7], dtype='float32')
-            grid = self._get_data(name='grid', shape=[5, 7, 2], dtype='float32')
-            out = layers.grid_sampler(x, grid)
-            return (out)
-
-    def make_bilinear_tensor_product_layer(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            data = self._get_data(name='data', shape=[4], dtype="float32")
-
-            theta = self._get_data(name="theta", shape=[5], dtype="float32")
-            out = layers.bilinear_tensor_product(data, theta, 6)
-            return (out)
-
-    def make_batch_norm(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            data = self._get_data(
-                name='data', shape=[32, 128, 128], dtype="float32")
-            out = layers.batch_norm(data)
-            return (out)
-
-    def make_range(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            layers.range(0, 10, 2, 'int32')
-            y = layers.range(0.1, 10.0, 0.2, 'float32')
-            return y
-
-    def make_spectral_norm(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            weight = self._get_data(
-                name='weight',
-                shape=[2, 3, 32, 32],
-                dtype="float32",
-                append_batch_size=False)
-            out = layers.spectral_norm(weight, dim=1, power_iters=1)
-            return (out)
-
-    def make_kldiv_loss(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(
-                name='x',
-                shape=[32, 128, 128],
-                dtype="float32",
-                append_batch_size=False)
-            target = self._get_data(
-                name='target',
-                shape=[32, 128, 128],
-                dtype="float32",
-                append_batch_size=False)
-            loss = layers.kldiv_loss(x=x, target=target, reduction='batchmean')
-            return (loss)
-
-    def make_temporal_shift(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name="X", shape=[16, 4, 4], dtype="float32")
-            out = layers.temporal_shift(x, seg_num=2, shift_ratio=0.2)
-            return (out)
-
-    def make_shuffle_channel(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name="X", shape=[16, 4, 4], dtype="float32")
-            out = layers.shuffle_channel(x, group=4)
-            return (out)
-
-    def make_fsp_matrix(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name="X", shape=[16, 4, 4], dtype="float32")
-            y = self._get_data(name="Y", shape=[8, 4, 4], dtype="float32")
-            out = layers.fsp_matrix(x, y)
-            return (out)
-
-    def make_pixel_shuffle(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name="X", shape=[9, 4, 4], dtype="float32")
-            out = layers.pixel_shuffle(x, upscale_factor=3)
-            return (out)
-
-    def make_mse_loss(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name="X", shape=[1], dtype="float32")
-            y = self._get_data(name="Y", shape=[1], dtype="float32")
-            out = layers.mse_loss(input=x, label=y)
-            return (out)
-
-    def make_square_error_cost(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name="X", shape=[1], dtype="float32")
-            y = self._get_data(name="Y", shape=[1], dtype="float32")
-            out = layers.square_error_cost(input=x, label=y)
-            return (out)
-
-    def test_dynamic_lstmp(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            hidden_dim, proj_dim = 16, 8
-            seq_data = layers.data(
-                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
-            fc_out = layers.fc(input=seq_data, size=4 * hidden_dim)
-            self.assertIsNotNone(
-                layers.dynamic_lstmp(
-                    input=fc_out, size=4 * hidden_dim, proj_size=proj_dim))
-
-    def test_linear_chain_crf(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            label_dict_len = 10
-            images = layers.data(name='pixel', shape=[784], dtype='float32')
-            label = layers.data(name='label', shape=[1], dtype='int32')
-            hidden = layers.fc(input=images, size=2)
-            crf = layers.linear_chain_crf(
-                input=hidden, label=label, param_attr=ParamAttr(name="crfw"))
-            crf_decode = layers.crf_decoding(
-                input=hidden, param_attr=ParamAttr(name="crfw"))
-            self.assertFalse(crf is None)
-            self.assertFalse(crf_decode is None)
-            return layers.chunk_eval(
-                input=crf_decode,
-                label=label,
-                chunk_scheme="IOB",
-                num_chunk_types=(label_dict_len - 1) // 2)
-
-    def test_im2sequence(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
-            y = layers.data(name='y', shape=[], dtype='float32')
-            output = layers.im2sequence(
-                input=x,
-                input_image_size=y,
-                stride=[1, 1],
-                filter_size=[2, 2],
-                out_stride=[1, 1])
-            return (output)
-
-    def test_lod_reset(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            # case 1
-            x = layers.data(name='x', shape=[10], dtype='float32')
-            y = layers.data(
-                name='y', shape=[10, 20], dtype='float32', lod_level=2)
-            z = layers.lod_reset(x=x, y=y)
-            self.assertTrue(z.lod_level == 2)
-            # case 2
-            lod_tensor_in = layers.data(name='lod_in', shape=[1], dtype='int64')
-            z = layers.lod_reset(x=x, y=lod_tensor_in)
-            self.assertTrue(z.lod_level == 1)
-            # case 3
-            z = layers.lod_reset(x=x, target_lod=[1, 2, 3])
-            self.assertTrue(z.lod_level == 1)
-            return z
-
-    def test_affine_grid(self):
-        with self.static_graph():
-            data = layers.data(name='data', shape=[2, 3, 3], dtype="float32")
-            out, ids = layers.argsort(input=data, axis=1)
-
-            theta = layers.data(name="theta", shape=[2, 3], dtype="float32")
-            out_shape = layers.data(
-                name="out_shape", shape=[-1], dtype="float32")
-            data_0 = layers.affine_grid(theta, out_shape)
-            data_1 = layers.affine_grid(theta, [5, 3, 28, 28])
-
-            self.assertIsNotNone(data_0)
-            self.assertIsNotNone(data_1)
-
-    def test_stridedslice(self):
-        axes = [0, 1, 2]
-        starts = [1, 0, 2]
-        ends = [3, 3, 4]
-        strides = [1, 1, 1]
-        with self.static_graph():
-            x = layers.data(name="x", shape=[245, 30, 30], dtype="float32")
-            out = layers.strided_slice(
-                x, axes=axes, starts=starts, ends=ends, strides=strides)
-            return out
-
-    def test_psroi_pool(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name="x", shape=[245, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
-            output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7)
-            return (output)
-
-    def test_sequence_expand(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name='x', shape=[10], dtype='float32')
-            y = layers.data(
-                name='y', shape=[10, 20], dtype='float32', lod_level=2)
-            return (layers.sequence_expand(x=x, y=y, ref_level=1))
-
-    def test_sequence_reshape(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name='x', shape=[8], dtype='float32', lod_level=1)
-            out = layers.sequence_reshape(input=x, new_dim=16)
-            return (out)
-
-    def test_sequence_unpad(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name='x', shape=[10, 5], dtype='float32')
-            length = layers.data(name='length', shape=[], dtype='int64')
-            return (layers.sequence_unpad(x=x, length=length))
-
-    def test_sequence_softmax(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            seq_data = layers.data(
-                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
-            seq = layers.fc(input=seq_data, size=20)
-            return (layers.sequence_softmax(seq))
-
-    def test_sequence_unsqueeze(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name='x', shape=[8, 2], dtype='float32')
-            out = layers.unsqueeze(input=x, axes=[1])
-            return (out)
-
-    def test_sequence_scatter(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(
-                name='x',
-                shape=[3, 6],
-                append_batch_size=False,
-                dtype='float32')
-            idx = layers.data(
-                name='idx',
-                shape=[12, 1],
-                append_batch_size=False,
-                dtype='int32',
-                lod_level=1)
-            updates = layers.data(
-                name='updates',
-                shape=[12, 1],
-                append_batch_size=False,
-                dtype='float32',
-                lod_level=1)
-            out = layers.sequence_scatter(input=x, index=idx, updates=updates)
-            return (out)
-
-    def test_sequence_slice(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            import numpy as np
-            seqs = layers.data(
-                name='x', shape=[10, 5], dtype='float32', lod_level=1)
-            offset = layers.assign(input=np.array([[0, 1]]).astype('int32'))
-            length = layers.assign(input=np.array([[2, 1]]).astype('int32'))
-            out = layers.sequence_slice(
-                input=seqs, offset=offset, length=length)
-            return (out)
-
-    def test_filter_by_instag(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x1 = layers.data(
-                name='Ins', shape=[32, 1], dtype='float32', lod_level=0)
-            x2 = layers.data(
-                name='Ins_tag',
-                shape=[32, 1],
-                dtype='int64',
-                lod_level=0,
-                stop_gradient=True)
-            x3 = layers.create_global_var(
-                shape=[1, 1],
-                value=20,
-                dtype='int64',
-                persistable=True,
-                force_cpu=True,
-                name='Filter_tag')
-            out1, out2 = layers.filter_by_instag(x1, x2, x3, is_lod=True)
-
-    def test_roi_pool(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
-            output = layers.roi_pool(x, rois, 7, 7, 0.6)
-            return (output)
-
-    def test_sequence_enumerate(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name="input", shape=[1], dtype='int32', lod_level=1)
-            out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
-
-    def test_roi_align(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
-            output = layers.roi_align(x, rois, 14, 14, 0.5, 2)
-            return (output)
-
-    def test_roi_perspective_transform(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[8], dtype="float32", lod_level=1)
-            output = layers.roi_perspective_transform(x, rois, 7, 7, 0.6)
-            return (output)
-
-    def test_row_conv(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name='x', shape=[16], dtype='float32', lod_level=1)
-            out = layers.row_conv(input=x, future_context_size=2)
-            return (out)
-
-    def test_simple_conv2d(self):
-        # TODO(minqiyang): dygraph do not support layers with param now
-        with self.static_graph():
-            images = layers.data(
-                name='pixel', shape=[3, 48, 48], dtype='float32')
-            return layers.conv2d(
-                input=images, num_filters=3, filter_size=[4, 4])
-
-    def test_squeeze(self):
-        # TODO(minqiyang): dygraph do not support layers with param now
-        with self.static_graph():
-            x = layers.data(name='x', shape=[1, 1, 4], dtype='float32')
-            out = layers.squeeze(input=x, axes=[2])
-            return (out)
-
-    def test_flatten(self):
-        # TODO(minqiyang): dygraph do not support op without kernel now
-        with self.static_graph():
-            x = layers.data(
-                name='x',
-                append_batch_size=False,
-                shape=[4, 4, 3],
-                dtype="float32")
-            out = layers.flatten(x, axis=1, name="flatten")
-            return (out)
-
-    def test_linspace(self):
-        program = Program()
-        with program_guard(program):
-            out = layers.linspace(20, 10, 5, 'float64')
-            self.assertIsNotNone(out)
-        print(str(program))
-
-    def test_deformable_conv(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = layers.data(
-                name='input',
-                append_batch_size=False,
-                shape=[2, 3, 32, 32],
-                dtype="float32")
-            offset = layers.data(
-                name='offset',
-                append_batch_size=False,
-                shape=[2, 18, 32, 32],
-                dtype="float32")
-            mask = layers.data(
-                name='mask',
-                append_batch_size=False,
-                shape=[2, 9, 32, 32],
-                dtype="float32")
-            out = layers.deformable_conv(
-                input=input,
-                offset=offset,
-                mask=mask,
-                num_filters=2,
-                filter_size=3,
-                padding=1)
-            return (out)
-
-    def test_unfold(self):
-        with self.static_graph():
-            x = layers.data(name='x', shape=[3, 20, 20], dtype='float32')
-            out = layers.unfold(x, [3, 3], 1, 1, 1)
-            return (out)
-
-    def test_deform_roi_pooling(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = layers.data(
-                name='input',
-                shape=[2, 3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            rois = layers.data(
-                name="rois", shape=[4], dtype='float32', lod_level=1)
-            trans = layers.data(
-                name="trans",
-                shape=[2, 3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            out = layers.deformable_roi_pooling(
-                input=input,
-                rois=rois,
-                trans=trans,
-                no_trans=False,
-                spatial_scale=1.0,
-                group_size=(1, 1),
-                pooled_height=8,
-                pooled_width=8,
-                part_size=(8, 8),
-                sample_per_part=4,
-                trans_std=0.1)
-        return (out)
-
-    def test_deformable_conv_v1(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = layers.data(
-                name='input',
-                append_batch_size=False,
-                shape=[2, 3, 32, 32],
-                dtype="float32")
-            offset = layers.data(
-                name='offset',
-                append_batch_size=False,
-                shape=[2, 18, 32, 32],
-                dtype="float32")
-            out = layers.deformable_conv(
-                input=input,
-                offset=offset,
-                mask=None,
-                num_filters=2,
-                filter_size=3,
-                padding=1,
-                modulated=False)
-            return (out)
-
-    def test_retinanet_target_assign(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            bbox_pred = layers.data(
-                name='bbox_pred',
-                shape=[1, 100, 4],
-                append_batch_size=False,
-                dtype='float32')
-            cls_logits = layers.data(
-                name='cls_logits',
-                shape=[1, 100, 10],
-                append_batch_size=False,
-                dtype='float32')
-            anchor_box = layers.data(
-                name='anchor_box',
-                shape=[100, 4],
-                append_batch_size=False,
-                dtype='float32')
-            anchor_var = layers.data(
-                name='anchor_var',
-                shape=[100, 4],
-                append_batch_size=False,
-                dtype='float32')
-            gt_boxes = layers.data(
-                name='gt_boxes',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            gt_labels = layers.data(
-                name='gt_labels',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='float32')
-            is_crowd = layers.data(
-                name='is_crowd',
-                shape=[1],
-                append_batch_size=False,
-                dtype='float32')
-            im_info = layers.data(
-                name='im_info',
-                shape=[1, 3],
-                append_batch_size=False,
-                dtype='float32')
-            return (layers.retinanet_target_assign(
-                bbox_pred, cls_logits, anchor_box, anchor_var, gt_boxes,
-                gt_labels, is_crowd, im_info, 10))
-
-    def test_sigmoid_focal_loss(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = layers.data(
-                name='data',
-                shape=[10, 80],
-                append_batch_size=False,
-                dtype='float32')
-            label = layers.data(
-                name='label',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='int32')
-            fg_num = layers.data(
-                name='fg_num',
-                shape=[1],
-                append_batch_size=False,
-                dtype='int32')
-            out = fluid.layers.sigmoid_focal_loss(
-                x=input, label=label, fg_num=fg_num, gamma=2., alpha=0.25)
-            return (out)
-
-    def test_retinanet_detection_output(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            bboxes = layers.data(
-                name='bboxes',
-                shape=[1, 21, 4],
-                append_batch_size=False,
-                dtype='float32')
-            scores = layers.data(
-                name='scores',
-                shape=[1, 21, 10],
-                append_batch_size=False,
-                dtype='float32')
-            anchors = layers.data(
-                name='anchors',
-                shape=[21, 4],
-                append_batch_size=False,
-                dtype='float32')
-            im_info = layers.data(
-                name="im_info",
-                shape=[1, 3],
-                append_batch_size=False,
-                dtype='float32')
-            nmsed_outs = layers.retinanet_detection_output(
-                bboxes=[bboxes, bboxes],
-                scores=[scores, scores],
-                anchors=[anchors, anchors],
-                im_info=im_info,
-                score_threshold=0.05,
-                nms_top_k=1000,
-                keep_top_k=100,
-                nms_threshold=0.3,
-                nms_eta=1.)
-            return (nmsed_outs)
-
-    def test_warpctc_with_padding(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            input_length = layers.data(
-                name='logits_length', shape=[11], dtype='int64')
-            label_length = layers.data(
-                name='labels_length', shape=[12], dtype='int64')
-            label = layers.data(name='label', shape=[12, 1], dtype='int32')
-            predict = layers.data(
-                name='predict', shape=[4, 4, 8], dtype='float32')
-            output = layers.warpctc(
-                input=predict,
-                label=label,
-                input_length=input_length,
-                label_length=label_length)
-            return (output)
-
-    def test_edit_distance(self):
-        with self.static_graph():
-            predict = layers.data(
-                name='predict', shape=[-1, 1], dtype='int64', lod_level=1)
-            label = layers.data(
-                name='label', shape=[-1, 1], dtype='int64', lod_level=1)
-            evaluator = fluid.evaluator.EditDistance(predict, label)
-            return evaluator.metrics
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
deleted file mode 100644
index 88d9919f59619cabad2e4ceca839e4a13d2cfd23..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ /dev/null
@@ -1,268 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import copy
-import math
-import unittest
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.framework as framework
-import paddle.fluid.core as core
-
-
-def exponential_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False):
-    exponent = global_step / decay_steps
-    if staircase:
-        exponent = math.floor(exponent)
-    return learning_rate * decay_rate**exponent
-
-
-def natural_exp_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False):
-    exponent = float(global_step) / float(decay_steps)
-    if staircase:
-        exponent = math.floor(exponent)
-    return learning_rate * math.exp(-1 * decay_rate * exponent)
-
-
-def inverse_time_decay(learning_rate,
-                       global_step,
-                       decay_steps,
-                       decay_rate,
-                       staircase=False):
-    temp = float(global_step) / float(decay_steps)
-    if staircase:
-        temp = math.floor(temp)
-    return learning_rate / (1 + decay_rate * temp)
-
-
-def polynomial_decay(learning_rate,
-                     global_step,
-                     decay_steps,
-                     end_learning_rate=0.0001,
-                     power=1.0,
-                     cycle=False):
-    if cycle:
-        div = math.ceil(global_step / float(decay_steps))
-        if div == 0:
-            div = 1
-        decay_steps = decay_steps * div
-    else:
-        global_step = min(global_step, decay_steps)
-    return (learning_rate - end_learning_rate) * \
-           ((1 - float(global_step) / float(decay_steps)) ** power) + end_learning_rate
-
-
-def piecewise_decay(global_step, boundaries, values):
-    assert len(boundaries) + 1 == len(values)
-    for i in range(len(boundaries)):
-        if global_step < boundaries[i]:
-            return values[i]
-    return values[len(values) - 1]
-
-
-def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
-    cur_epoch = math.floor(global_step / step_each_epoch)
-    decayed_lr = learning_rate * 0.5 * (
-        math.cos(cur_epoch * math.pi / epochs) + 1)
-    return decayed_lr
-
-
-class TestLearningRateDecay(unittest.TestCase):
-    def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            self.check_decay_with_place(place, python_decay_fn, fluid_decay_fn,
-                                        kwargs)
-
-    def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
-                               kwargs):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-
-        with fluid.program_guard(main_prog, startup_prog):
-            decayed_lr = fluid_decay_fn(**kwargs)
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        exe.run(startup_prog)
-
-        for step in range(10):
-            lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
-            python_decayed_lr = python_decay_fn(
-                global_step=float(step), **kwargs)
-            self.assertAlmostEqual(
-                python_decayed_lr,
-                lr_val[0],
-                msg='Failed lr scheduler is {0}, step {1}, Python result is {2}, Fluid result is {3}'.
-                format(python_decay_fn.__name__,
-                       str(step), str(python_decayed_lr), str(lr_val[0])))
-
-    def test_decay(self):
-        common_kwargs_true = {
-            "learning_rate": 1.0,
-            "decay_steps": 5,
-            "decay_rate": 0.5,
-            "staircase": True
-        }
-        common_kwargs_false = copy.deepcopy(common_kwargs_true)
-        common_kwargs_false["staircase"] = False
-
-        decay_fns = [
-            (exponential_decay, layers.exponential_decay, common_kwargs_true),
-            (exponential_decay, layers.exponential_decay, common_kwargs_false),
-            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_true),
-            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_false),
-            (inverse_time_decay, layers.inverse_time_decay, common_kwargs_true),
-            (inverse_time_decay, layers.inverse_time_decay,
-             common_kwargs_false),
-            (polynomial_decay, layers.polynomial_decay, {
-                "learning_rate": 1.0,
-                "decay_steps": 5,
-                "cycle": True
-            }),
-            (polynomial_decay, layers.polynomial_decay, {
-                "learning_rate": 1.0,
-                "decay_steps": 5,
-                "cycle": False
-            }),
-            (piecewise_decay, layers.piecewise_decay, {
-                "boundaries": [3, 6, 9],
-                "values": [0.1, 0.2, 0.3, 0.4]
-            }),
-            (cosine_decay, layers.cosine_decay, {
-                "learning_rate": 0.1,
-                "step_each_epoch": 100,
-                "epochs": 120
-            }),
-        ]
-
-        for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
-            print("class=" + self.__class__.__name__ + "decay_fn=" +
-                  py_decay_fn.__name__ + " kwargs=" + str(kwargs))
-            main_program = framework.Program()
-            startup_program = framework.Program()
-            with framework.program_guard(main_program, startup_program):
-                self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)
-
-
-def linear_lr_warmup(global_step, warmup_steps, start_lr, end_lr):
-    linear_step = end_lr - start_lr
-    decayed_lr = start_lr + linear_step * (global_step / warmup_steps)
-    return decayed_lr
-
-
-class TestLinearWamrupLearningRateDecay(TestLearningRateDecay):
-    def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
-                               kwargs):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-
-        warmup_steps = 10
-        start_lr = 0.1 / 3.
-        end_lr = 0.1
-
-        with fluid.program_guard(main_prog, startup_prog):
-            decayed_lr = layers.linear_lr_warmup(
-                fluid_decay_fn(**kwargs), warmup_steps, start_lr, end_lr)
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-
-        for step in range(20):
-            lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
-            if step < warmup_steps:
-                python_decayed_lr = linear_lr_warmup(
-                    float(step), warmup_steps, start_lr, end_lr)
-            else:
-                python_decayed_lr = python_decay_fn(
-                    global_step=float(step), **kwargs)
-            self.assertAlmostEqual(
-                python_decayed_lr,
-                lr_val[0],
-                msg='Test {0} Failed, step {1}, Python result is {2}, Fluid result is {3}'.
-                format(python_decay_fn.__name__,
-                       str(step), str(python_decayed_lr), str(lr_val[0])))
-
-
-class TestLinearWamrupLearningRateDecayWithScalarInput(unittest.TestCase):
-    def run_scalar_lr(self, place, lr, start_lr, end_lr):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-
-        warmup_steps = 10
-
-        with fluid.program_guard(main_prog, startup_prog):
-            decayed_lr = layers.linear_lr_warmup(lr, warmup_steps, start_lr,
-                                                 end_lr)
-
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-
-        for step in range(20):
-            lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
-            if step < warmup_steps:
-                expected_lr = linear_lr_warmup(
-                    float(step), warmup_steps, start_lr, end_lr)
-            else:
-                expected_lr = lr
-            self.assertAlmostEqual(
-                expected_lr,
-                lr_val[0],
-                msg='Test failed, step {0}, expected {1}, but got {2}'.format(
-                    step, expected_lr, lr_val[0]))
-
-    def test_scalar_lr(self):
-        def run_places(lr, start_lr, end_lr):
-            places = [fluid.CPUPlace()]
-            if core.is_compiled_with_cuda():
-                places.append(fluid.CUDAPlace(0))
-            for p in places:
-                self.run_scalar_lr(p, lr, start_lr, end_lr)
-
-        # float
-        lr = 0.2
-        start_lr = 0.1 / 3.
-        end_lr = 0.2
-        run_places(lr, start_lr, end_lr)
-
-        # int end_lr
-        lr = 2.
-        start_lr = 0.1 / 3.
-        end_lr = 1
-        run_places(lr, start_lr, end_lr)
-
-        # int
-        lr = 1
-        start_lr = 0
-        end_lr = 1
-        run_places(lr, start_lr, end_lr)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
deleted file mode 100755
index b86d9586019672e347064415f45bd56517c18f88..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ /dev/null
@@ -1,237 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import random
-import numpy as np
-
-from op_test import OpTest
-
-
-class LinearChainCrfForward(object):
-    def __init__(self, seq_start_positions, emission_weights, emission_row_max,
-                 emission_exps, transition_weights, transition_exps, labels):
-        self.tag_num = emission_weights.shape[1]
-        self.seq_num = len(seq_start_positions) - 1
-
-        self.seq_start_positions = seq_start_positions
-        self.labels = labels
-        self.x = emission_weights
-
-        self.x_row_max = emission_row_max
-        self.x_exps = emission_exps
-
-        # unnormalized logits of the transition weights for the start mark.
-        self.a = transition_weights[0, :]
-        self.a_exps = transition_exps[0, :]
-        # unnormalized logits of the transition weights for the end mark.
-        self.b = transition_weights[1, :]
-        self.b_exps = transition_exps[1, :]
-        # unnormalized logits of the transition weights for all the other tags.
-        self.w = transition_weights[2:, :]
-        self.w_exps = transition_exps[2:, :]
-
-        # The output of linear chain crf operator.
-        # alpha is a memo table in dynamic programming to caculate
-        # nomalization factor.
-        self.alpha = np.zeros(
-            (seq_start_positions[-1], self.tag_num), dtype="float64")
-        self.log_likelihood = np.zeros((self.seq_num, 1))
-
-    def _l1_norm(self, x):
-        s = np.sum(x)
-        x /= s
-        return s
-
-    def _forward_a_sequence(self, x, x_row_max, x_exps, label, alpha):
-        seq_len = x_row_max.shape[0]
-        log_likelihood = 0.
-
-        for i in range(self.tag_num):
-            alpha[0, i] = self.a_exps[i] * x_exps[0, i]
-        log_likelihood = -x_row_max[0] - np.log(self._l1_norm(alpha[0, :]))
-
-        # calculate the unnormalized logits of the normalization factor.
-        for k in range(1, seq_len):
-            for i in range(self.tag_num):
-                s = 0.
-                for j in range(self.tag_num):
-                    s += alpha[k - 1, j] * self.w_exps[j, i]
-                alpha[k, i] = x_exps[k, i] * s
-            log_likelihood -= x_row_max[k] + np.log(self._l1_norm(alpha[k, :]))
-        s = 0.
-        for i in range(self.tag_num):
-            s += alpha[-1, i] * self.b_exps[i]
-        log_likelihood -= np.log(s)
-
-        # calculate the nominator part.
-        log_likelihood += (
-            self.a[label[0]] + x[0, label[0]] + self.b[label[-1]])
-
-        for k in range(1, seq_len):
-            log_likelihood += (x[k, label[k]] + self.w[label[k - 1], label[k]])
-        return -log_likelihood
-
-    def crf_forward_compute(self):
-        for i in range(self.seq_num):
-            start = self.seq_start_positions[i]
-            end = self.seq_start_positions[i + 1]
-            if start >= end:
-                continue
-            self.log_likelihood[i] = self._forward_a_sequence(
-                self.x[start:end, :], self.x_row_max[start:end, :],
-                self.x_exps[start:end, :], self.labels[start:end, :],
-                self.alpha[start:end, :])
-        return self.alpha, self.log_likelihood
-
-
-class TestLinearChainCrfOp(OpTest):
-    def set_test_data(self):
-        # TODO(caoying) Fix the unittest by: add the boundary cases when
-        # sequence lengths are 1, 2, and 3.
-
-        SEQ_NUM = 3
-        TAG_NUM = 17
-        MAX_SEQ_LEN = 5
-
-        # the linear_chain_crf operator only supports sequence (LoD level = 1)
-        lod = [[]]
-        seq_start_pos = [0]
-        for i in range(SEQ_NUM):
-            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
-            seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1])
-        emission = np.random.uniform(
-            -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64")
-        emission_row_max = np.amax(emission, axis=1, keepdims=True)
-        emission_exps = np.exp(emission - emission_row_max)
-
-        transition = np.random.uniform(-0.5, 0.5,
-                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
-        transition_exps = np.exp(transition)
-
-        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64")
-
-        self.inputs = {
-            "Emission": (emission, lod),
-            "Transition": transition,
-            "Label": (labels, lod)
-        }
-        crf = LinearChainCrfForward(seq_start_pos, emission, emission_row_max,
-                                    emission_exps, transition, transition_exps,
-                                    labels)
-        alpha, log_likelihood = crf.crf_forward_compute()
-
-        self.outputs = {
-            "Alpha": alpha,
-            "EmissionExps": emission_exps,
-            "TransitionExps": transition_exps,
-            "LogLikelihood": log_likelihood
-        }
-
-    def setUp(self):
-        self.op_type = "linear_chain_crf"
-        self.set_test_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["Emission", "Transition"], "LogLikelihood")
-
-    def test_check_grad_ignore_transition(self):
-        self.check_grad(
-            ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))
-
-
-class TestLinearChainCrfPaddingTensor(OpTest):
-    def seq_pad(self, data, length):
-        max_len = np.max(length)
-        shape = [len(length), max_len] + list(data.shape[1:])
-        padded = np.zeros(shape).astype(data.dtype)
-        offset = 0
-        for i, l in enumerate(length):
-            padded[i, 0:l] = data[offset:offset + l]
-            offset += l
-        return padded
-
-    def seq_pad_exps(self, data, length):
-        # Adding for transition_exps
-        max_len = np.max(length)
-        shape = [len(length), max_len] + list(data.shape[1:])
-        padded = np.ones(shape).astype(data.dtype)
-        offset = 0
-        for i, l in enumerate(length):
-            padded[i, 0:l] = data[offset:offset + l]
-            offset += l
-        return padded
-
-    def set_test_data_1(self):
-        # Fix the unittest by: add padding tensor in inputs 
-        SEQ_NUM = 3
-        TAG_NUM = 17
-        MAX_SEQ_LEN = 5
-
-        # the linear_chain_crf operator only supports sequence (LoD level = 1)
-        lod = [[]]
-        seq_start_pos = [0]
-        for i in range(SEQ_NUM):
-            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
-            seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1])
-        emission = np.random.uniform(
-            -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64")
-        emission_row_max = np.amax(emission, axis=1, keepdims=True)
-        emission_exps = np.exp(emission - emission_row_max)
-        transition = np.random.uniform(-0.5, 0.5,
-                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
-        transition_exps = np.exp(transition)
-
-        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64")
-        self.inputs = {
-            "Emission": self.seq_pad(emission, lod[0]),
-            "Transition": transition,
-            "Label": self.seq_pad(labels, lod[0]),
-            "length": np.array(lod).astype("int64")
-        }
-        crf = LinearChainCrfForward(seq_start_pos, emission, emission_row_max,
-                                    emission_exps, transition, transition_exps,
-                                    labels)
-        alpha, log_likelihood = crf.crf_forward_compute()
-        self.outputs = {
-            "Alpha": self.seq_pad(alpha, lod[0]),
-            "EmissionExps": self.seq_pad_exps(emission_exps, lod[0]),
-            "TransitionExps": transition_exps,
-            "LogLikelihood": log_likelihood
-        }
-
-    def setUp(self):
-        self.op_type = "linear_chain_crf"
-        self.set_test_data_1()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["Emission", "Transition"], "LogLikelihood")
-
-    def test_check_grad_ignore_transition(self):
-        self.check_grad(
-            ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py
deleted file mode 100644
index eeecf178320327cc251f32bfe46c1622200339f4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_linspace.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestLinspaceOpCommonCase(OpTest):
-    def setUp(self):
-        self.op_type = "linspace"
-        dtype = 'float32'
-        self.inputs = {
-            'Start': np.array([0]).astype(dtype),
-            'Stop': np.array([10]).astype(dtype),
-            'Num': np.array([11]).astype('int32')
-        }
-
-        self.outputs = {'Out': np.arange(0, 11).astype(dtype)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestLinspaceOpReverseCase(OpTest):
-    def setUp(self):
-        self.op_type = "linspace"
-        dtype = 'float32'
-        self.inputs = {
-            'Start': np.array([10]).astype(dtype),
-            'Stop': np.array([0]).astype(dtype),
-            'Num': np.array([11]).astype('int32')
-        }
-
-        self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestLinspaceOpNumOneCase(OpTest):
-    def setUp(self):
-        self.op_type = "linspace"
-        dtype = 'float32'
-        self.inputs = {
-            'Start': np.array([10]).astype(dtype),
-            'Stop': np.array([0]).astype(dtype),
-            'Num': np.array([1]).astype('int32')
-        }
-
-        self.outputs = {'Out': np.array(10, dtype=dtype)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh b/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh
deleted file mode 100644
index f47e869f9b76fc99fc63d388ba85e2134ea38c44..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-unset https_proxy http_proxy
-
-nohup python -u test_listen_and_serv_op.py > test_listen_and_serv_op.log 2>&1 &
-pid=$!
-
-flag1=test_handle_signal_in_serv_op.flag
-flag2=test_list_and_serv_run_empty_optimize_block.flag
-
-for i in {1..10}; do 
-    sleep 3s
-    if [[ -f "${flag1}" && -f "${flag2}" ]];  then
-        echo "test_listen_and_serv_op exit"
-        exit 0
-    fi
-done
-
-echo "test_listen_and_serv_op.log context"
-cat test_listen_and_serv_op.log
-
-#display system context
-for i in {1..4}; do 
-    sleep 2 
-    top -b -n1  | head -n 50
-    echo "${i}"
-    top -b -n1 -i  | head -n 50
-    nvidia-smi
-done
-
-#display /tmp/files
-ls -l /tmp/paddle.*
-
-if ! pgrep -x test_listen_and_serv_op; then
-    exit 1
-fi
-
-kill -9 $pid
-
-echo "after kill ${pid}"
-
-#display system context
-for i in {1..4}; do 
-    sleep 2 
-    top -b -n1  | head -n 50
-    top -b -n1 -i  | head -n 50
-    nvidia-smi
-done
-
-exit 1
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
deleted file mode 100644
index 07a0ae9a82eb05416f821baaaa4c4a84cc30f6e2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ /dev/null
@@ -1,193 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from dist_test_utils import *
-
-silentremove("test_handle_signal_in_serv_op.flag")
-silentremove("test_list_and_serv_run_empty_optimize_block.flag")
-
-import paddle
-import paddle.fluid as fluid
-import signal
-import subprocess
-import time
-import unittest
-from multiprocessing import Process
-from op_test import OpTest
-
-
-def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
-    remove_ps_flag(os.getpid())
-    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-    # loss function
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-
-    # optimizer
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    pserver_endpoints = ip + ":" + port
-    current_endpoint = ip + ":" + port
-
-    config = fluid.DistributeTranspilerConfig()
-    config.sync_mode = sync_mode
-    t = fluid.DistributeTranspiler(config=config)
-    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-    pserver_prog = t.get_pserver_program(current_endpoint)
-    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
-    exe.run(pserver_startup)
-    exe.run(pserver_prog)
-
-
-def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
-                                 trainer_id):
-    remove_ps_flag(os.getpid())
-    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-    # loss function
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-
-    # optimizer
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    ps1 = ip + ":" + str(int(port) + 1)
-    ps2 = ip + ":" + port
-    pserver_endpoints = ps1 + "," + ps2
-
-    config = fluid.DistributeTranspilerConfig()
-    config.sync_mode = sync_mode
-    config.slice_var_up = False
-
-    t = fluid.DistributeTranspiler(config=config)
-    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-    pserver_prog = t.get_pserver_program(ps2)
-
-    # pserver2 have no parameter
-    assert (len(pserver_prog.blocks) == 2)
-    assert (len(pserver_prog.blocks[1].ops) == 0)
-
-    pserver_startup = t.get_startup_program(ps2, pserver_prog)
-    exe.run(pserver_startup)
-    exe.run(pserver_prog)
-
-
-def gen_complete_file_flag(flag_file):
-    with open(flag_file, "w") as f:
-        f.write("complete")
-
-
-class TestListenAndServOp(unittest.TestCase):
-    def setUp(self):
-        self.ps_timeout = 5
-        self.ip = "127.0.0.1"
-        self.port = "0"
-        self.trainers = 1
-        self.trainer_id = 0
-
-    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
-                  self.trainer_id))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _wait_ps_ready(self, pid):
-        start_left_time = self.ps_timeout
-        sleep_time = 0.5
-        while True:
-            assert start_left_time >= 0, "wait ps ready failed"
-            time.sleep(sleep_time)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                start_left_time -= sleep_time
-
-    def test_rpc_interfaces(self):
-        # TODO(Yancey1989): need to make sure the rpc interface correctly.
-        pass
-
-    def test_handle_signal_in_serv_op(self):
-        # run pserver on CPU in sync mode
-        p1 = self._start_pserver(False, True, run_pserver)
-        print("test_handle_signal_in_serv_op before _wait_ps_ready")
-        self._wait_ps_ready(p1.pid)
-
-        # raise SIGTERM to pserver
-        os.kill(p1.pid, signal.SIGINT)
-        print("test_handle_signal_in_serv_op after kill pid:", p1.pid)
-        p1.join()
-
-        # run pserver on CPU in async mode
-        p2 = self._start_pserver(False, False, run_pserver)
-        print("test_handle_signal_in_serv_op after start p2 pid:", p2.pid)
-        self._wait_ps_ready(p2.pid)
-
-        # raise SIGTERM to pserver
-        os.kill(p2.pid, signal.SIGTERM)
-        print("test_handle_signal_in_serv_op before join p2 pid:", p2.pid)
-        p2.join()
-
-        gen_complete_file_flag("test_handle_signal_in_serv_op.flag")
-
-    def test_list_and_serv_run_empty_optimize_block(self):
-        # run pserver on CPU in sync mode
-        p1 = self._start_pserver(False, True, run_pserver_with_empty_block)
-        print(
-            "test_list_and_serv_run_empty_optimize_block before _wait_ps_ready")
-        self._wait_ps_ready(p1.pid)
-
-        # raise SIGTERM to pserver
-        os.kill(p1.pid, signal.SIGINT)
-        print("test_list_and_serv_run_empty_optimize_block after kill pid:",
-              p1.pid)
-        p1.join()
-
-        # run pserver on CPU in async mode
-        p2 = self._start_pserver(False, False, run_pserver_with_empty_block)
-        print("test_list_and_serv_run_empty_optimize_block after start p2 pid:",
-              p2.pid)
-        self._wait_ps_ready(p2.pid)
-
-        # raise SIGTERM to pserver
-        os.kill(p2.pid, signal.SIGTERM)
-        print("test_list_and_serv_run_empty_optimize_block before join p2 pid:",
-              p2.pid)
-        p2.join()
-        gen_complete_file_flag(
-            "test_list_and_serv_run_empty_optimize_block.flag")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
deleted file mode 100644
index 15485df5ac440f2ff666ca27ef8e8bcc5df866c0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.layers as layers
-from paddle.fluid.executor import Executor
-import paddle.fluid.core as core
-import numpy
-
-
-class TestLoDArrayLength(unittest.TestCase):
-    def test_array_length(self):
-        tmp = layers.zeros(shape=[10], dtype='int32')
-        i = layers.fill_constant(shape=[1], dtype='int64', value=10)
-        arr = layers.array_write(tmp, i=i)
-        arr_len = layers.array_length(arr)
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-        result = exe.run(fetch_list=[arr_len])[0]
-        self.assertEqual(11, result[0])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
deleted file mode 100644
index 865ca118d55f82c66d44f4e3d553baafa0c14c3a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from paddle.fluid.layers import data
-from paddle.fluid.layers.control_flow import lod_rank_table
-from paddle.fluid.executor import Executor
-import paddle.fluid.core as core
-import numpy
-import unittest
-
-
-class TestLoDRankTable(unittest.TestCase):
-    def test_lod_rank_table(self):
-        x = data(name='x', shape=[100])
-        cpu = core.CPUPlace()
-        rank_table = lod_rank_table(x=x, level=1)
-        rank_table.persistable = True
-        exe = Executor(cpu)
-        scope = core.Scope()
-
-        tensor = core.LoDTensor()
-        tensor.set(numpy.random.random(size=(17, 100)), cpu)
-        tensor.set_recursive_sequence_lengths(
-            [[1, 2], [5, 1, 1], [3, 1, 5, 1, 3, 3, 1]])
-        exe.run(scope=scope, feed={'x': tensor})
-        var = scope.find_var(rank_table.name)
-        table = var.get_lod_rank_table()
-        self.assertEqual([(0, 5), (1, 1), (2, 1)], list(table.items()))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
deleted file mode 100644
index 6947ea7c8db93e365292fbba74a074a65d44c727..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ /dev/null
@@ -1,126 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestLodResetOpByAttr(OpTest):
-    def setUp(self):
-        self.op_type = "lod_reset"
-        x = np.random.random((10, 20)).astype("float32")
-        lod = [[3, 2, 5]]
-        # target_offset_lod and target_lod are the same lod info represented
-        # in offset-based format and length-based format, respectively.
-        target_offset_lod = [0, 7, 10]
-        target_lod = [7, 3]
-        self.inputs = {'X': (x, lod)}
-        # The `target_lod` attribute is still based on offset
-        self.attrs = {'target_lod': target_offset_lod}
-        self.outputs = {'Out': (x, [target_lod])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestLodResetOpByInput(OpTest):
-    def setUp(self):
-        self.op_type = "lod_reset"
-        x = np.random.random((10, 20)).astype("float32")
-        lod = [[3, 2, 5]]
-        # target_offset_lod and target_lod are the same lod info represented
-        # in offset-based format and length-based format, respectively.
-        target_offset_lod = [0, 4, 7, 10]
-        target_lod = [4, 3, 3]
-        self.inputs = {
-            'X': (x, lod),
-            'Y': np.array([target_offset_lod]).astype('int32')
-        }
-        self.outputs = {'Out': (x, [target_lod])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out", no_grad_set=set("Y"))
-
-
-class TestLodResetOpBoth(OpTest):
-    def setUp(self):
-        self.op_type = "lod_reset"
-        x = np.random.random((10, 20)).astype("float32")
-        lod = [[3, 2, 5]]
-        target_offset_lod_attr = [0, 7, 10]
-        target_offset_lod_in = [0, 4, 7, 10]
-        target_lod_in = [4, 3, 3]
-        self.inputs = {
-            'X': (x, lod),
-            'Y': np.array(target_offset_lod_in).astype('int32')
-        }
-        self.attrs = {'target_lod': target_offset_lod_attr}
-        self.outputs = {'Out': (x, [target_lod_in])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out", no_grad_set=set("Y"))
-
-
-class TestLodResetOpYIsLoDTensor(OpTest):
-    def setUp(self):
-        self.op_type = "lod_reset"
-        x = np.random.random((10, 20)).astype("float32")
-        lod = [[3, 2, 5]]
-        y = np.random.random((10, 10)).astype("float32")
-        target_lod = [[4, 3, 3]]
-        self.inputs = {'X': (x, lod), 'Y': (y, target_lod)}
-        self.outputs = {'Out': (x, target_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out", no_grad_set=set("Y"))
-
-
-class TestLodAppendOpByAttr(OpTest):
-    def setUp(self):
-        self.op_type = "lod_reset"
-        x = np.random.random((10, 20)).astype("float32")
-        lod = [[3, 2, 5]]
-        # target_offset_lod and target_lod are the same lod info represented
-        # in offset-based format and length-based format, respectively.
-        target_offset_lod = [i for i in range(11)]
-        self.inputs = {'X': (x, lod)}
-        out_lod = [[3, 2, 5], [1] * 10]
-        # The `target_lod` attribute is still based on offset
-        self.attrs = {'target_lod': target_offset_lod, 'append': True}
-        self.outputs = {'Out': (x, out_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
deleted file mode 100644
index 6ad27de9a0e42d1a15ec4a17804c7c0f7ebf5d94..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import numpy
-
-
-class TestLoDTensorArray(unittest.TestCase):
-    def test_get_set(self):
-        scope = core.Scope()
-        arr = scope.var('tmp_lod_tensor_array')
-        tensor_array = arr.get_lod_tensor_array()
-        self.assertEqual(0, len(tensor_array))
-        cpu = core.CPUPlace()
-        for i in range(10):
-            t = core.LoDTensor()
-            t.set(numpy.array([i], dtype='float32'), cpu)
-            t.set_recursive_sequence_lengths([[1]])
-            tensor_array.append(t)
-
-        self.assertEqual(10, len(tensor_array))
-
-        for i in range(10):
-            t = tensor_array[i]
-            self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
-            self.assertEqual([[1]], t.recursive_sequence_lengths())
-
-            t = core.LoDTensor()
-            t.set(numpy.array([i + 10], dtype='float32'), cpu)
-            t.set_recursive_sequence_lengths([[1]])
-            tensor_array[i] = t
-            t = tensor_array[i]
-            self.assertEqual(
-                numpy.array(t), numpy.array(
-                    [i + 10], dtype='float32'))
-            self.assertEqual([[1]], t.recursive_sequence_lengths())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
deleted file mode 100644
index 6a78ef5078a738efa2ae39ea23645fedaecce63b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ /dev/null
@@ -1,223 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import numpy
-import paddle.fluid.layers as layers
-from paddle.fluid.framework import Program, program_guard
-from paddle.fluid.executor import Executor
-from paddle.fluid.backward import append_backward
-
-from paddle.fluid.layers.control_flow import lod_rank_table
-from paddle.fluid.layers.control_flow import max_sequence_len
-from paddle.fluid.layers.control_flow import lod_tensor_to_array
-from paddle.fluid.layers.control_flow import array_to_lod_tensor
-
-
-class TestCPULoDTensorArrayOps(unittest.TestCase):
-    def place(self):
-        return core.CPUPlace()
-
-    def test_lod_tensor_to_array_level_0(self):
-        tensor = core.LoDTensor()
-        tensor.set(
-            numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
-        expect = [
-            numpy.array(x).astype('int32')
-            for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]
-        ]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=[] * 6,
-            expect_max_len=6)
-
-    def test_lod_tensor_to_array_level_0_empty_seq(self):
-        tensor = core.LoDTensor()
-        tensor.set(
-            numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]])
-        expect = [
-            numpy.array(x).astype('int32')
-            for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]
-        ]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=[] * 6,
-            expect_max_len=6)
-
-    def test_lod_tensor_to_array_level_1(self):
-        tensor = core.LoDTensor()
-        tensor.set(
-            numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
-        tensor.set_recursive_sequence_lengths([[2, 3], [3, 6, 2, 6, 3]])
-
-        expect = [
-            numpy.array(
-                [9, 10, 0, 1, 2], dtype='int32'), numpy.array(
-                    [11, 12, 13, 14, 15, 16, 3, 4, 5, 6, 7, 8], dtype='int32'),
-            numpy.array(
-                [17, 18, 19], dtype='int32')
-        ]
-
-        lod = [[[2, 3]], [[6, 6]], [[3]]]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=lod,
-            expect_max_len=3)
-
-    def test_lod_tensor_to_array_level_1_empty_seq(self):
-        tensor = core.LoDTensor()
-        tensor.set(
-            numpy.arange(31).reshape(31, 1).astype('int32'), self.place())
-
-        tensor.set_recursive_sequence_lengths(
-            [[3, 2, 4, 2], [3, 4, 4, 0, 1, 5, 2, 2, 2, 7, 1]])
-
-        expect = [
-            numpy.array(
-                item, dtype='int32')
-            for item in [[
-                12, 13, 14, 15, 16, 0, 1, 2, 23, 24, 25, 26, 27, 28, 29
-            ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]]
-        ]
-
-        lod = [[[5, 3, 0, 7]], [[2, 4, 1, 1]], [[2, 4]], [[2]]]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=lod,
-            expect_max_len=4)
-
-    def test_lod_tensor_to_array_level_2(self):
-        tensor = core.LoDTensor()
-        tensor.set(
-            numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
-        tensor.set_recursive_sequence_lengths(
-            [[2, 3, 1], [2, 3, 1, 4, 2, 1],
-             [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
-
-        expect = [
-            numpy.array(
-                item, dtype='int32')
-            for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], list(
-                range(22, 39)) + list(range(7, 21)), list(range(39, 46))]
-        ]
-        lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]],
-               [[2], [6, 1]]]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=lod,
-            expect_max_len=3)
-
-    def test_lod_tensor_to_array_level_2_skip_level(self):
-        tensor = core.LoDTensor()
-        tensor.set(
-            numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
-        tensor.set_recursive_sequence_lengths(
-            [[2, 3, 1], [2, 3, 1, 4, 2, 1],
-             [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
-        self.main(
-            tensor=tensor,
-            expect_array=None,
-            expect_lod=None,
-            expect_max_len=4,
-            level=1)
-
-    def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
-        place = self.place()
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name='x', shape=[10])
-            x.persistable = True
-            table = lod_rank_table(x, level=level)
-            max_len = max_sequence_len(table)
-            max_len.persistable = True
-            array = lod_tensor_to_array(x, table)
-            array.persistable = True
-
-            result = array_to_lod_tensor(array, table)
-            result.persistable = True
-        exe = Executor(place)
-        scope = core.Scope()
-        exe.run(program, feed={'x': tensor}, scope=scope)
-        var = scope.find_var(array.name)
-        array = var.get_lod_tensor_array()
-        if expect_array is not None and expect_lod is not None:
-            self.check_array_same(array, expect_array, expect_lod)
-        self.check_tensor_same(scope.find_var(result.name).get_tensor(), tensor)
-
-        self.assertEqual(
-            numpy.array(scope.find_var(max_len.name).get_tensor())[0],
-            expect_max_len)
-
-    def check_array_same(self, array, expect_tensor, expect_lod):
-        self.assertEqual(len(expect_tensor), len(array))
-        for i, exp in enumerate(zip(expect_tensor, expect_lod)):
-            exp_tensor, exp_lod = exp
-            exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
-            self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
-            self.assertEqual(exp_lod, array[i].recursive_sequence_lengths())
-
-    def check_tensor_same(self, actual, expect):
-        self.assertTrue(
-            numpy.allclose(numpy.array(actual), numpy.array(expect)))
-        self.assertEqual(actual.recursive_sequence_lengths(),
-                         expect.recursive_sequence_lengths())
-
-
-class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
-    def test_grad(self):
-        place = core.CPUPlace()
-        program = Program()
-
-        with program_guard(program):
-            x = layers.data(
-                name='x', shape=[1], dtype='float32', stop_gradient=False)
-            table = lod_rank_table(x, level=0)
-            array = lod_tensor_to_array(x, table)
-            result = array_to_lod_tensor(array, table)
-
-            mean = layers.mean(result)
-
-            append_backward(mean)
-
-        tensor = core.LoDTensor()
-        tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
-        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
-
-        g_vars = program.global_block().var(x.name + "@GRAD")
-
-        exe = Executor(place)
-        g_out = [
-            numpy.array(item).sum()
-            for item in exe.run(program,
-                                feed={'x': tensor},
-                                fetch_list=[g_vars],
-                                return_numpy=False)
-        ]
-        g_out_sum = numpy.array(g_out).sum()
-
-        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_log_loss_op.py b/python/paddle/fluid/tests/unittests/test_log_loss_op.py
deleted file mode 100644
index 784f4f648d52bdf4f2357f4454d790a8d53288f3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_log_loss_op.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestLogLossOp(OpTest):
-    def setUp(self):
-        self.op_type = 'log_loss'
-        samples_num = 32
-
-        predicted = np.random.uniform(0.1, 1.0,
-                                      (samples_num, 1)).astype("float32")
-        labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32")
-        epsilon = 1e-4
-        self.inputs = {
-            'Predicted': predicted,
-            'Labels': labels,
-        }
-
-        self.attrs = {'epsilon': epsilon}
-        loss = -labels * np.log(predicted + epsilon) - (
-            1 - labels) * np.log(1 - predicted + epsilon)
-        self.outputs = {'Loss': loss}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py
deleted file mode 100644
index 521851a3d57a4a3e8b2c8e1639325cc6c88fdd84..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_logical_op.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import op_test
-import unittest
-import numpy as np
-
-
-def create_test_class(op_type, callback, binary_op=True):
-    class Cls(op_test.OpTest):
-        def setUp(self):
-            a = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
-            if binary_op:
-                b = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
-                c = callback(a, b)
-            else:
-                c = callback(a)
-            self.outputs = {'Out': c}
-            self.op_type = op_type
-            if binary_op:
-                self.inputs = {'X': a, 'Y': b}
-            else:
-                self.inputs = {'X': a}
-
-        def test_output(self):
-            self.check_output()
-
-    Cls.__name__ = op_type
-    globals()[op_type] = Cls
-
-
-create_test_class('logical_and', lambda _a, _b: np.logical_and(_a, _b))
-create_test_class('logical_or', lambda _a, _b: np.logical_or(_a, _b))
-create_test_class('logical_not', lambda _a: np.logical_not(_a), False)
-create_test_class('logical_xor', lambda _a, _b: np.logical_xor(_a, _b))
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
deleted file mode 100644
index 1b02c8d19ad84fe35a96f7223d0a233520230cba..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
+++ /dev/null
@@ -1,203 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import signal
-import time
-import unittest
-from multiprocessing import Process
-
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.framework import Program, program_guard
-from dist_test_utils import *
-
-
-def run_pserver(pserver_id, use_cuda, sync_mode):
-    remove_ps_flag(os.getgid())
-    scope = fluid.core.Scope()
-    program = Program()
-    with fluid.scope_guard(scope):
-        with program_guard(program, startup_program=Program()):
-            # create table parameter in scope
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            # create and initialize Param Variable
-            param = scope.var('table').get_tensor()
-
-            param_array = np.ones((10, 8)).astype("float32")
-            for i in range(len(param_array)):
-                param_array[i] *= param_array[i] * i + pserver_id * 10
-            param.set(param_array, place)
-
-            optimize_block = program._create_block(program.global_block().idx)
-            program.global_block().append_op(
-                type="listen_and_serv",
-                inputs={'X': []},
-                outputs={},
-                attrs={
-                    "optimize_blocks": [optimize_block],
-                    "endpoint": '127.0.0.1:0',
-                    "Fanin": 1,
-                    "sync_mode": True,
-                    "grad_to_block_id": []
-                })
-
-            exe = fluid.Executor(place)
-            exe.run(program)
-
-
-class TestListenAndServOp(unittest.TestCase):
-    def setUp(self):
-        self.ps_timeout = 5
-
-    def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
-        p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _wait_ps_ready(self, pid):
-        start_left_time = self.ps_timeout
-        sleep_time = 0.5
-        while True:
-            assert start_left_time >= 0, "wait ps ready failed"
-            time.sleep(sleep_time)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                start_left_time -= sleep_time
-
-    def _get_pserver_port(self, pid):
-        with open("/tmp/paddle.%d.port" % pid, 'r') as f:
-            port = int(f.read().strip())
-        return port
-
-    def _run_lookup_table_op_one_pserver(self, place, port):
-        scope = fluid.core.Scope()
-        program = Program()
-        with fluid.scope_guard(scope):
-            with program_guard(program, startup_program=Program()):
-                # create and initialize Param Variable
-                param = scope.var('W').get_tensor()
-                param_array = np.full((10, 8), 1.0).astype("float32")
-                param.set(param_array, place)
-
-                ids = scope.var('Ids').get_tensor()
-                ids_array = np.array([[1], [2], [5]]).astype("int64")
-                ids.set(ids_array, place)
-                ids_lod = [[0, 1, 2, 3]]
-                ids.set_lod(ids_lod)
-
-                out = scope.var('Out').get_tensor()
-
-                emaps = ['127.0.0.1:' + str(port)]
-                table_names = ['table']
-                height_sections = [10]
-
-                # create and run sgd operator
-                lookup_table_op = Operator(
-                    "lookup_table",
-                    W='W',
-                    Ids='Ids',
-                    Out='Out',
-                    remote_prefetch=True,
-                    epmap=emaps,
-                    table_names=table_names,
-                    height_sections=height_sections)
-                lookup_table_op.run(scope, place)
-
-                # get and compare result
-                result_array = np.array(out)
-
-                self.assertEqual(out.lod(), ids_lod)
-                self.assertEqual(list(result_array.shape), [len(ids_array), 8])
-                for i in range(len(ids_array)):
-                    id = ids_array[i][0]
-                    self.assertTrue((result_array[i] == id).all())
-
-    def _run_lookup_table_op_two_pserver(self, place, port0, port1):
-        scope = fluid.core.Scope()
-        program = Program()
-        with fluid.scope_guard(scope):
-            with program_guard(program, startup_program=Program()):
-                # create and initialize Param Variable
-                param = scope.var('W').get_tensor()
-                param_array = np.full((10, 8), 1.0).astype("float32")
-                param.set(param_array, place)
-
-                ids = scope.var('Ids').get_tensor()
-                ids_array = np.array([[1], [2], [11], [13]]).astype("int64")
-                ids.set(ids_array, place)
-                ids_lod = [[0, 2, 3, 4]]
-                ids.set_lod(ids_lod)
-
-                out = scope.var('Out').get_tensor()
-
-                emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
-                table_names = ['table', 'table']
-                height_sections = [10, 20]
-
-                # create and run sgd operator
-                lookup_table_op = Operator(
-                    "lookup_table",
-                    W='W',
-                    Ids='Ids',
-                    Out='Out',
-                    remote_prefetch=True,
-                    epmap=emaps,
-                    table_names=table_names,
-                    height_sections=height_sections)
-                lookup_table_op.run(scope, place)
-
-                # get and compare result
-                result_array = np.array(out)
-                self.assertEqual(out.lod(), ids_lod)
-                self.assertEqual(list(result_array.shape), [len(ids_array), 8])
-                for i in range(len(ids_array)):
-                    id = ids_array[i][0]
-                    self.assertTrue((result_array[i] == id).all())
-
-    def test_lookup_remote_table(self):
-        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
-        # run pserver on CPU in sync mode
-        p0 = self._start_pserver(0, False, True, run_pserver)
-        self._wait_ps_ready(p0.pid)
-        port0 = self._get_pserver_port(p0.pid)
-
-        p1 = self._start_pserver(1, False, True, run_pserver)
-        self._wait_ps_ready(p1.pid)
-        port1 = self._get_pserver_port(p1.pid)
-
-        places = [core.CPUPlace()]
-
-        for place in places:
-            self._run_lookup_table_op_one_pserver(place, port0)
-            self._run_lookup_table_op_two_pserver(place, port0, port1)
-
-        # raise SIGTERM to pserver
-        os.kill(p0.pid, signal.SIGINT)
-        p0.join()
-        os.kill(p1.pid, signal.SIGINT)
-        p1.join()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
deleted file mode 100644
index c7f4f3e913bfd66cbbb703c0e73336f9a3563507..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
+++ /dev/null
@@ -1,118 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestLookupSpraseTable(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize W Variable
-        table_size = 10000
-        row_numel = 8
-
-        w_selected_rows = scope.var('W').get_selected_rows()
-        w_selected_rows.set_height(table_size)
-        w_array = np.ones((table_size, row_numel)).astype("float32")
-        for i in range(table_size):
-            w_array[i] *= i
-        w_tensor = w_selected_rows.get_tensor()
-        w_tensor.set(w_array, place)
-
-        # create and initialize Id Variable
-        ids = scope.var("Ids").get_tensor()
-        ids_array1 = np.array([0, 2, 3, 2, 5, 0, 100]).astype("int64")
-        ids.set(ids_array1, place)
-
-        # create Out Variable
-        out_tensor = scope.var('Out').get_tensor()
-
-        # create and run lookup_table operator
-        lookup_table = Operator(
-            "lookup_sparse_table",
-            W='W',
-            Ids='Ids',
-            Out='Out',
-            min=-5.0,
-            max=10.0,
-            seed=10)
-        lookup_table.run(scope, place)
-
-        # get result from Out
-        result_array1 = np.array(out_tensor)
-        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
-        assert (result_array1[0] == w_array[0]).all()
-        assert (result_array1[1] == w_array[1]).all()
-        assert (result_array1[2] == w_array[2]).all()
-        assert (result_array1[3] == w_array[1]).all()
-        assert (result_array1[4] == w_array[3]).all()
-        assert (result_array1[5] == w_array[0]).all()
-        assert (result_array1[6] == w_array[4]).all()
-
-        # create and initialize Id Variable
-        ids = scope.var("Ids").get_tensor()
-        ids_array2 = np.array([4, 2, 3, 7, 100000]).astype("int64")
-        ids.set(ids_array2, place)
-        lookup_table.run(scope, place)
-
-        result_array2 = np.array(out_tensor)
-        assert (result_array2[0] == w_array[5]).all()
-        assert (result_array2[1] == w_array[1]).all()
-        assert (result_array2[2] == w_array[2]).all()
-        assert (result_array2[3] == w_array[6]).all()
-        assert (result_array2[4] == w_array[7]).all()
-
-        # create and run lookup_table operator
-        test_lookup_table = Operator(
-            "lookup_sparse_table",
-            W='W',
-            Ids='Ids',
-            Out='Out',
-            min=-5.0,
-            max=10.0,
-            seed=10,
-            is_test=True)
-
-        ids = scope.var("Ids").get_tensor()
-        unknown_id = [44, 22, 33]
-        ids_array2 = np.array([4, 2, 3, 7, 100000] + unknown_id).astype("int64")
-        ids.set(ids_array2, place)
-        test_lookup_table.run(scope, place)
-
-        result_array2 = np.array(out_tensor)
-        assert (result_array2[0] == w_array[5]).all()
-        assert (result_array2[1] == w_array[1]).all()
-        assert (result_array2[2] == w_array[2]).all()
-        assert (result_array2[3] == w_array[6]).all()
-        assert (result_array2[4] == w_array[7]).all()
-
-        for i in [5, 6, 7]:
-            assert np.all(result_array2[i] == 0)
-
-    def test_w_is_selected_rows(self):
-        places = [core.CPUPlace()]
-        # currently only support CPU
-        for place in places:
-            self.check_with_place(place)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
deleted file mode 100644
index 4990ee898d81089735f6db4ee4ad6758944e311a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import paddle.compat as cpt
-
-
-class TestLookupTableOp(OpTest):
-    def setUp(self):
-        self.op_type = "lookup_table"
-        table = np.random.random((17, 31)).astype("float32")
-        ids = np.random.randint(0, 17, 4).astype("int64")
-        ids_expand = np.expand_dims(ids, axis=1)
-        self.inputs = {'W': table, 'Ids': ids_expand}
-        self.outputs = {'Out': table[ids]}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
-
-
-class TestLookupTableOpWithTensorIds(OpTest):
-    def setUp(self):
-        self.op_type = "lookup_table"
-        table = np.random.random((17, 31)).astype("float32")
-        ids = np.random.randint(
-            low=0, high=17, size=(2, 4, 5, 1)).astype("int64")
-        self.inputs = {'W': table, 'Ids': ids}
-        self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
-
-
-class TestLookupTableOpWithPadding(TestLookupTableOp):
-    def test_check_output(self):
-        ids = np.squeeze(self.inputs['Ids'])
-        padding_idx = np.random.choice(ids, 1)[0]
-        self.outputs['Out'][ids == padding_idx] = np.zeros(31)
-        self.attrs = {'padding_idx': int(padding_idx)}
-        self.check_output()
-
-    def test_check_grad(self):
-        # Since paddings are not trainable and fixed in forward, the gradient of
-        # paddings makes no sense and we don't test the gradient here.
-        pass
-
-
-class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
-    def test_check_output(self):
-        ids = self.inputs['Ids']
-        flatten_idx = ids.flatten()
-        padding_idx = np.random.choice(flatten_idx, 1)[0]
-        self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
-        self.attrs = {'padding_idx': cpt.long_type(padding_idx)}
-        self.check_output()
-
-    def test_check_grad(self):
-        # Since paddings are not trainable and fixed in forward, the gradient of
-        # paddings makes no sense and we don't test the gradient here.
-        pass
-
-
-class TestLookupTableWIsSelectedRows(OpTest):
-    def prepare_ids(self, scope, place):
-        ids_tensor = scope.var('Ids').get_tensor()
-        ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
-        ids_tensor.set(ids_array, place)
-        return ids_array
-
-    def prepare_w(self, scope, place):
-        rows = [0, 1, 2, 3, 4, 5, 6]
-        row_numel = 12
-
-        w_selected_rows = scope.var('W').get_selected_rows()
-        w_selected_rows.set_height(len(rows))
-        w_selected_rows.set_rows(rows)
-        w_array = np.ones((len(rows), row_numel)).astype("float32")
-        for i in range(len(rows)):
-            w_array[i] *= i
-        w_tensor = w_selected_rows.get_tensor()
-        w_tensor.set(w_array, place)
-
-    def create_out_tensor(self, scope, place):
-        return scope.var('Out').get_tensor()
-
-    def check_result(self, ids_array, result_array):
-        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
-        for idx, row in enumerate(ids_array):
-            assert (row[0] == result_array[idx]).all()
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        ids_array = self.prepare_ids(scope, place)
-
-        self.prepare_w(scope, place)
-
-        out_tensor = self.create_out_tensor(scope, place)
-
-        # create and run lookup_table operator
-        lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
-        lookup_table.run(scope, place)
-
-        # get result from Out
-        result_array = np.array(out_tensor)
-
-        self.check_result(ids_array, result_array)
-
-    def test_w_is_selected_rows(self):
-        places = [core.CPUPlace()]
-        # currently only support CPU
-        for place in places:
-            self.check_with_place(place)
-
-
-class TestLookupTableWithTensorIdsWIsSelectedRows(
-        TestLookupTableWIsSelectedRows):
-    def prepare_ids(self, scope, place):
-        ids_tensor = scope.var('Ids').get_tensor()
-        ids_array = np.random.randint(
-            low=0, high=6, size=(2, 4, 3, 1)).astype("int64")
-        ids_tensor.set(ids_array, place)
-        return ids_array
-
-    def check_result(self, ids_array, result_array):
-        for idx, row in np.ndenumerate(ids_array):
-            assert (row == result_array[idx]).all()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py
deleted file mode 100644
index 46a219bbb2fd9ad131793a2b52768b975e1debdb..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py
+++ /dev/null
@@ -1,216 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid.op import Operator
-import paddle.compat as cpt
-
-
-class TestLookupTableOp(OpTest):
-    def setUp(self):
-        self.op_type = "lookup_table_v2"
-        table = np.random.random((17, 31)).astype("float32")
-        ids = np.random.randint(0, 17, 4).astype("int64")
-        self.inputs = {'W': table, 'Ids': ids}
-        self.outputs = {'Out': table[ids]}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
-
-
-class TestLookupTableOpWithTensorIds(OpTest):
-    def setUp(self):
-        self.op_type = "lookup_table_v2"
-        table = np.random.random((17, 31)).astype("float32")
-        ids = np.random.randint(low=0, high=17, size=(2, 4, 5)).astype("int64")
-        self.inputs = {'W': table, 'Ids': ids}
-        self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
-
-
-class TestLookupTableOpWithPadding(TestLookupTableOp):
-    def test_check_output(self):
-        ids = np.squeeze(self.inputs['Ids'])
-        padding_idx = np.random.choice(ids, 1)[0]
-        self.outputs['Out'][ids == padding_idx] = np.zeros(31)
-        self.attrs = {'padding_idx': int(padding_idx)}
-        self.check_output()
-
-    def test_check_grad(self):
-        # Since paddings are not trainable and fixed in forward, the gradient of
-        # paddings makes no sense and we don't test the gradient here.
-        pass
-
-
-class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
-    def test_check_output(self):
-        ids = self.inputs['Ids']
-        flatten_idx = ids.flatten()
-        padding_idx = np.random.choice(flatten_idx, 1)[0]
-        self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
-        self.attrs = {'padding_idx': cpt.long_type(padding_idx)}
-        self.check_output()
-
-    def test_check_grad(self):
-        # Since paddings are not trainable and fixed in forward, the gradient of
-        # paddings makes no sense and we don't test the gradient here.
-        pass
-
-
-class TestLookupTableWIsSelectedRows(OpTest):
-    def prepare_ids(self, scope, place):
-        ids_tensor = scope.var('Ids').get_tensor()
-        ids_array = np.array([0, 4, 3, 5]).astype("int64")
-        ids_tensor.set(ids_array, place)
-        return ids_array
-
-    def prepare_w(self, scope, place):
-        rows = [0, 1, 2, 3, 4, 5, 6]
-        row_numel = 12
-
-        w_selected_rows = scope.var('W').get_selected_rows()
-        w_selected_rows.set_height(len(rows))
-        w_selected_rows.set_rows(rows)
-        w_array = np.ones((len(rows), row_numel)).astype("float32")
-        for i in range(len(rows)):
-            w_array[i] *= i
-        w_tensor = w_selected_rows.get_tensor()
-        w_tensor.set(w_array, place)
-
-    def create_out_tensor(self, scope, place):
-        return scope.var('Out').get_tensor()
-
-    def check_result(self, ids_array, result_array):
-        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
-        for idx, row in enumerate(ids_array):
-            assert (row == result_array[idx]).all()
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        ids_array = self.prepare_ids(scope, place)
-
-        self.prepare_w(scope, place)
-
-        out_tensor = self.create_out_tensor(scope, place)
-
-        # create and run lookup_table operator
-        lookup_table = Operator("lookup_table_v2", W='W', Ids='Ids', Out='Out')
-        lookup_table.run(scope, place)
-
-        # get result from Out
-        result_array = np.array(out_tensor)
-
-        self.check_result(ids_array, result_array)
-
-    def test_w_is_selected_rows(self):
-        places = [core.CPUPlace()]
-        # currently only support CPU
-        for place in places:
-            self.check_with_place(place)
-
-
-class TestLookupTableWithTensorIdsWIsSelectedRows(
-        TestLookupTableWIsSelectedRows):
-    def prepare_ids(self, scope, place):
-        ids_tensor = scope.var('Ids').get_tensor()
-        ids_array = np.random.randint(
-            low=0, high=6, size=(2, 4, 3)).astype("int64")
-        ids_tensor.set(ids_array, place)
-        return ids_array
-
-    def check_result(self, ids_array, result_array):
-        for idx, row in np.ndenumerate(ids_array):
-            assert (row == result_array[idx]).all()
-
-
-class TestLookupTableIsSparse(unittest.TestCase):
-    def init_data(self):
-        self.x_data = np.array([[1, 3, 0, 4, 7]]).astype("int64")
-        self.y_data = np.array([[0.1, 0.3, 0, 0.4, 0.7]]).astype("float32")
-
-    def get_w_grad(self, is_sparse):
-        self.init_data()
-        main_program = fluid.Program()
-        with fluid.program_guard(main_program, fluid.Program()):
-            x = fluid.layers.data(name='x', shape=[5], dtype='int64')
-            y_ = fluid.layers.data(name='y_', shape=[5], dtype='float32')
-            emb = fluid.input.embedding(
-                input=x,
-                size=[10, 16],
-                param_attr=fluid.ParamAttr(
-                    name="emb_weight",
-                    learning_rate=10,
-                    initializer=fluid.initializer.NumpyArrayInitializer(
-                        self.w_data)),
-                is_sparse=is_sparse)
-            y = fluid.layers.reduce_sum(emb, dim=-1)
-
-            loss = fluid.layers.square_error_cost(input=y, label=y_)
-            loss = fluid.layers.mean(loss)
-
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
-            sgd_optimizer.minimize(loss)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            ret = exe.run(feed={'x': self.x_data,
-                                'y_': self.y_data},
-                          fetch_list=['emb_weight'],
-                          return_numpy=False)
-            return np.array(ret[0])
-
-    def test_w_grad(self):
-        self.w_data = np.random.random(size=(10, 16)).astype("float32")
-        w_grad = self.get_w_grad(False)
-        w_grad_with_sparse = self.get_w_grad(True)
-        self.check_grad(w_grad, w_grad_with_sparse)
-
-    def check_grad(self, w_grad1, w_grad2, tolerance=1e-6):
-        np.testing.assert_allclose(
-            w_grad1, w_grad2, rtol=tolerance, atol=tolerance)
-
-
-class TestLookupTableApi(unittest.TestCase):
-    def test_api(self):
-        x = fluid.layers.data(name='x', shape=[20], dtype='int64')
-        emb = fluid.embedding(input=x, size=[128, 64])
-
-        place = fluid.CPUPlace()
-        x_data = np.random.randint(0, 127, [2, 20]).astype("int64")
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x_data, },
-                      fetch_list=[emb],
-                      return_numpy=False)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py
deleted file mode 100644
index bb91f26bbb53de454a6d037af4c9d96262866ce3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestLRNOp(OpTest):
-    def get_input(self):
-        ''' TODO(gongweibao): why it's grad diff is so large?
-        x = np.ndarray(
-            shape=(self.N, self.C, self.H, self.W), dtype=float, order='C')
-        for m in range(0, self.N):
-            for i in range(0, self.C):
-                for h in range(0, self.H):
-                    for w in range(0, self.W):
-                        x[m][i][h][w] = m * self.C * self.H * self.W +  \
-                                        i * self.H * self.W +  \
-                                        h * self.W + w + 1
-        '''
-        x = np.random.rand(self.N, self.C, self.H, self.W).astype("float32")
-        return x + 1
-
-    def get_out(self):
-        start = -(self.n - 1) // 2
-        end = start + self.n
-
-        mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32")
-        mid.fill(self.k)
-        for m in range(0, self.N):
-            for i in range(0, self.C):
-                for c in range(start, end):
-                    ch = i + c
-                    if ch < 0 or ch >= self.C:
-                        continue
-
-                    s = mid[m][i][:][:]
-                    r = self.x[m][ch][:][:]
-                    s += np.square(r) * self.alpha
-
-        mid2 = np.power(mid, -self.beta)
-        return np.multiply(self.x, mid2), mid
-
-    def get_attrs(self):
-        attrs = {
-            'n': self.n,
-            'k': self.k,
-            'alpha': self.alpha,
-            'beta': self.beta
-        }
-        return attrs
-
-    def setUp(self):
-        self.op_type = "lrn"
-        self.N = 2
-        self.C = 3
-        self.H = 5
-        self.W = 5
-
-        self.n = 5
-        self.k = 2.0
-        self.alpha = 0.0001
-        self.beta = 0.75
-        self.x = self.get_input()
-        self.out, self.mid_out = self.get_out()
-
-        self.inputs = {'X': self.x}
-        self.outputs = {'Out': self.out, 'MidOut': self.mid_out}
-        self.attrs = self.get_attrs()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.01)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
deleted file mode 100644
index 72eed1498e44803d9e2ef449273ecfb86cee3d03..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ /dev/null
@@ -1,192 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-import paddle.fluid as fluid
-
-SIGMOID_THRESHOLD_MIN = -40.0
-SIGMOID_THRESHOLD_MAX = 13.0
-EXP_MAX_INPUT = 40.0
-
-
-def lstm_naive(
-        input,
-        w, ):
-    seq_len, batch_size, hidden_size = input.shape
-
-    offset = 0
-    wi = w[offset:offset + hidden_size * hidden_size].reshape(
-        (hidden_size, hidden_size)).transpose()
-    offset += hidden_size * hidden_size
-    wf = w[offset:offset + hidden_size * hidden_size].reshape(
-        (hidden_size, hidden_size)).transpose()
-    offset += hidden_size * hidden_size
-    wc = w[offset:offset + hidden_size * hidden_size].reshape(
-        (hidden_size, hidden_size)).transpose()
-    offset += hidden_size * hidden_size
-    wo = w[offset:offset + hidden_size * hidden_size].reshape(
-        (hidden_size, hidden_size)).transpose()
-    offset += hidden_size * hidden_size
-    ri = w[offset:offset + hidden_size * hidden_size].reshape(
-        (hidden_size, hidden_size)).transpose()
-    offset += hidden_size * hidden_size
-    rf = w[offset:offset + hidden_size * hidden_size].reshape(
-        (hidden_size, hidden_size)).transpose()
-    offset += hidden_size * hidden_size
-    rc = w[offset:offset + hidden_size * hidden_size].reshape(
-        (hidden_size, hidden_size)).transpose()
-    offset += hidden_size * hidden_size
-    ro = w[offset:offset + hidden_size * hidden_size].reshape(
-        (hidden_size, hidden_size)).transpose()
-    offset += hidden_size * hidden_size
-
-    bi_1 = w[offset:offset + hidden_size]
-    offset += hidden_size
-    bf_1 = w[offset:offset + hidden_size]
-    offset += hidden_size
-    bc_1 = w[offset:offset + hidden_size]
-    offset += hidden_size
-    bo_1 = w[offset:offset + hidden_size]
-    offset += hidden_size
-
-    bi_2 = w[offset:offset + hidden_size]
-    offset += hidden_size
-    bf_2 = w[offset:offset + hidden_size]
-    offset += hidden_size
-    bc_2 = w[offset:offset + hidden_size]
-    offset += hidden_size
-    bo_2 = w[offset:offset + hidden_size]
-
-    def sigmoid(x):
-        y = np.copy(x)
-        y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
-        y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
-        return 1. / (1. + np.exp(-y))
-
-    def tanh(x):
-        y = -2. * x
-        y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
-        return (2. / (1. + np.exp(y))) - 1.
-
-    output = []
-    pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype)
-    pre_c = np.zeros((batch_size, hidden_size), dtype=input.dtype)
-
-    for i in range(seq_len):
-        emb_1 = input[i]
-
-        input_gate = sigmoid(
-            np.matmul(emb_1, wi) + np.matmul(pre_h, ri) + bi_1 + bi_2)
-        forget_gate = sigmoid(
-            np.matmul(emb_1, wf) + np.matmul(pre_h, rf) + bf_1 + bf_2)
-        output_gate = sigmoid(
-            np.matmul(emb_1, wo) + np.matmul(pre_h, ro) + bo_1 + bo_2)
-        c_t_temp = tanh(
-            np.matmul(emb_1, wc) + np.matmul(pre_h, rc) + bc_1 + bc_2)
-        new_c = input_gate * c_t_temp + forget_gate * pre_c
-        new_h = output_gate * tanh(new_c)
-
-        pre_h = new_h
-        pre_c = new_c
-
-        output.append(new_h)
-
-    output = np.concatenate(output, -1)
-    output = output.reshape((batch_size, -1, hidden_size))
-
-    output = output.transpose((1, 0, 2))
-
-    return output, pre_h, pre_c
-
-
-class TestCUDNNLstmOp(OpTest):
-    def setUp(self):
-        self.op_type = "cudnn_lstm"
-        self.dtype = np.float32
-
-        num_steps = 20
-        batch_size = 5
-        hidden_size = 20
-
-        input_weight_size = (hidden_size * hidden_size) * 4
-        hidden_weight_size = (hidden_size * hidden_size) * 4
-        weight_size = input_weight_size + hidden_weight_size
-        weight_size += hidden_size * 8
-
-        input = np.random.uniform(
-            low=-0.1, high=0.1, size=(num_steps, batch_size,
-                                      hidden_size)).astype(self.dtype)
-        flat_w = np.random.uniform(
-            low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype)
-
-        output, last_hidden, last_cell = lstm_naive(input, flat_w)
-
-        init_h = np.zeros((batch_size, hidden_size), dtype=np.float32)
-        init_c = np.zeros((batch_size, hidden_size), dtype=np.float32)
-        scope = core.Scope()
-        program = fluid.Program()
-        block = program.global_block()
-
-        cache_temp = block.create_var(
-            name="Cache",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW,
-            stop_gradient=True)
-        self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'W': OpTest.np_dtype_to_fluid_dtype(flat_w),
-            'InitH': OpTest.np_dtype_to_fluid_dtype(init_h),
-            'InitC': OpTest.np_dtype_to_fluid_dtype(init_c),
-        }
-        self.cache_name_list = ['Cache']
-        self.attrs = {
-            'max_len': num_steps,
-            'dropout_prob': 0.0,
-            'is_bidirec': False,
-            'input_size': hidden_size,
-            'hidden_size': hidden_size,
-            'num_layers': 1,
-        }
-        self.outputs = {
-            'Out': output,
-            "last_h": last_hidden,
-            'last_c': last_cell
-        }
-
-    def test_output_with_place(self):
-        if self.has_cuda():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-
-    def test_grad_with_place(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place,
-                set(['Input', 'W', 'InitH', 'InitC']),
-                ['Out', 'last_h', 'last_c'],
-                max_relative_error=0.02)
-
-    def has_cuda(self):
-        return core.is_compiled_with_cuda()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py
deleted file mode 100644
index 7ee33c6e9ec1995f6b365e556c7adce20eb16270..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ /dev/null
@@ -1,318 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-SIGMOID_THRESHOLD_MIN = -40.0
-SIGMOID_THRESHOLD_MAX = 13.0
-EXP_MAX_INPUT = 40.0
-
-
-def identity(x):
-    return x
-
-
-def sigmoid(x):
-    y = np.copy(x)
-    y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
-    y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
-    return 1. / (1. + np.exp(-y))
-
-
-def tanh(x):
-    y = -2. * x
-    y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
-    return (2. / (1. + np.exp(y))) - 1.
-
-
-def relu(x):
-    return np.maximum(x, 0)
-
-
-ACTIVATION = {
-    'identity': identity,
-    'sigmoid': sigmoid,
-    'tanh': tanh,
-    'relu': relu
-}
-
-
-def lstm(
-        input,  # T x 4D
-        lod,  # 1 x N
-        h0=None,  # N x D
-        c0=None,  # N x D
-        w_h=None,  # D x 4D
-        w_b=None,  # 1 x 4D
-        w_c=None,  # 1 x 3D
-        is_reverse=False,
-        act_gate=None,
-        act_cell=None,
-        act_cand=None):
-    def _step(x, w_h, w_c, h_pre, c_pre, act_gate, act_cell, act_cand):
-        g = np.dot(h_pre, w_h)  # 1 x 4D
-        g = g + x
-        g = np.reshape(g, (1, g.size))
-        c, g_i, g_f, g_o = np.split(g, 4, axis=1)
-        if w_c is None:
-            g_i = act_gate(g_i)  # 1 x D
-            g_f = act_gate(g_f)  # 1 x D
-        else:
-            w_ic, w_fc, w_oc = np.split(w_c, 3, axis=1)
-            g_i = act_gate(g_i + w_ic * c_pre)  # 1 x D
-            g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
-        c = g_f * c_pre + g_i * act_cand(c)  # 1 x D
-
-        if w_c is None:
-            g_o = act_gate(g_o)  # 1 x D
-        else:
-            _, _, w_oc = np.split(w_c, 3, axis=1)
-            g_o = act_gate(g_o + w_oc * c)  # 1 x D
-        h = g_o * act_cell(c)
-        return h, c
-
-    def _reverse(x, offset):
-        y = np.zeros_like(x)
-        for i in range(len(offset) - 1):
-            b, e = offset[i], offset[i + 1]
-            y[b:e, :] = np.flip(x[b:e, :], 0)
-        return y
-
-    offset = [0]
-    for l in lod[0]:
-        offset.append(offset[-1] + l)
-    batch_size = len(lod[0])
-    hidden = []
-    cell = []
-    input = _reverse(input, offset) if is_reverse else input
-    if w_b is not None:
-        input = input + np.tile(w_b, (offset[-1], 1))
-    for i in range(batch_size):
-        # compute one sequence
-        seq_len = lod[0][i]
-        x = input[offset[i]:offset[i + 1], :]
-        h_pre = h0[i]  # 1 x D
-        c_pre = c0[i]  # 1 x D
-        for j in range(seq_len):
-            # compute one step
-            h_pre, c_pre = _step(x[j], w_h, w_c, h_pre, c_pre, act_gate,
-                                 act_cell, act_cand)
-            hidden.append(h_pre.flatten())
-            cell.append(c_pre.flatten())
-
-    hidden = np.array(hidden).astype('float64')
-    cell = np.array(cell).astype('float64')
-
-    hidden = _reverse(hidden, offset) if is_reverse else hidden
-    cell = _reverse(cell, offset) if is_reverse else cell
-
-    assert hidden.shape == (input.shape[0], input.shape[1] / 4)
-    assert cell.shape == (input.shape[0], input.shape[1] / 4)
-    return hidden, cell
-
-
-class TestLstmOp(OpTest):
-    def set_lod(self):
-        self.lod = [[2, 3, 2]]
-
-    def set_argument(self):
-        self.set_lod()
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = False
-        self.is_reverse = False
-        self.use_peepholes = True
-
-    def setUp(self):
-        self.set_argument()
-        self.op_type = 'lstm'
-        T = sum(self.lod[0])
-        N = len(self.lod[0])
-
-        x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
-        if self.has_initial_state:
-            h0 = np.random.normal(size=(N, self.D)).astype('float64')
-            c0 = np.random.normal(size=(N, self.D)).astype('float64')
-        else:
-            h0 = np.zeros((N, self.D)).astype('float64')
-            c0 = np.zeros((N, self.D)).astype('float64')
-        w = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
-        if self.use_peepholes:
-            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
-        else:
-            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
-
-        w_b = b[:, 0:4 * self.D]
-        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
-        h, c = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse,
-                    ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
-                    ACTIVATION[self.act_cand])
-
-        self.inputs = {'Input': (x, self.lod), 'Weight': w}
-
-        self.inputs['Bias'] = b
-
-        if self.has_initial_state:
-            self.inputs['H0'] = h0
-            self.inputs['C0'] = c0
-
-        self.outputs = {
-            'Hidden': (h, self.lod),
-            'Cell': (c, self.lod),
-        }
-        self.attrs = {
-            'use_peepholes': self.use_peepholes,
-            'is_reverse': self.is_reverse,
-            'gate_activation': self.act_gate,
-            'cell_activation': self.act_cell,
-            'candidate_activation': self.act_cand
-        }
-
-    def test_check_output(self):
-        self.check_output(atol=1e-8)
-
-    def test_check_grad(self):
-        # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0])
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
-
-
-class TestLstmOpCase1(TestLstmOp):
-    def set_lod(self):
-        self.lod = [[0, 3, 2]]
-
-
-class TestLstmOpCase2(TestLstmOp):
-    def set_lod(self):
-        self.lod = [[0, 3, 0]]
-
-
-class TestLstmOpCase3(TestLstmOp):
-    def set_lod(self):
-        self.lod = [[2, 0, 4]]
-
-
-# class TestLstmOpHasInitial(TestLstmOp):
-#     def set_argument(self):
-#         self.lod = [[2, 3, 2]]
-#         self.D = 16
-
-#         self.act_gate = 'sigmoid'
-#         self.act_cell = 'tanh'
-#         self.act_cand = 'tanh'
-
-#         self.has_initial_state = True
-#         self.is_reverse = True
-#         self.use_peepholes = True
-
-#     def test_check_grad(self):
-#         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-#         N = len(self.lod[0])
-#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-#         self.outputs['BatchCellPreAct'] = np.zeros(
-#             (N, self.D)).astype('float64')
-#         self.check_grad(
-#             ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
-#             max_relative_error=5e-4)
-
-#     def test_check_grad_ingore_bias(self):
-#         N = len(self.lod[0])
-#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-#         self.outputs['BatchCellPreAct'] = np.zeros(
-#             (N, self.D)).astype('float64')
-#         self.check_grad(
-#             ['Input', 'Weight'], ['Hidden'],
-#             max_relative_error=5e-4,
-#             no_grad_set=set('Bias'))
-
-#     def test_check_grad_ingore_weight(self):
-#         N = len(self.lod[0])
-#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-#         self.outputs['BatchCellPreAct'] = np.zeros(
-#             (N, self.D)).astype('float64')
-#         self.check_grad(
-#             ['Input', 'Bias'], ['Hidden'],
-#             max_relative_error=5e-4,
-#             no_grad_set=set('Weight'))
-
-#     def test_check_grad_ingore_input(self):
-#         N = len(self.lod[0])
-#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-#         self.outputs['BatchCellPreAct'] = np.zeros(
-#             (N, self.D)).astype('float64')
-#         self.check_grad(
-#             ['Weight', 'Bias'], ['Hidden'],
-#             max_relative_error=5e-4,
-#             no_grad_set=set('Input'))
-
-#     def test_check_grad_ingore_h0(self):
-#         N = len(self.lod[0])
-#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-#         self.outputs['BatchCellPreAct'] = np.zeros(
-#             (N, self.D)).astype('float64')
-#         self.check_grad(
-#             ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
-#             max_relative_error=5e-4,
-#             no_grad_set=set('H0'))
-
-#     def test_check_grad_ingore_c0(self):
-#         N = len(self.lod[0])
-#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-#         self.outputs['BatchCellPreAct'] = np.zeros(
-#             (N, self.D)).astype('float64')
-#         self.check_grad(
-#             ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
-#             max_relative_error=5e-4,
-#             no_grad_set=set('C0'))
-
-# class TestLstmOpRerverse(TestLstmOp):
-#     def set_argument(self):
-#         self.lod = [[2, 3, 2]]
-#         self.D = 16
-
-#         self.act_gate = 'sigmoid'
-#         self.act_cell = 'tanh'
-#         self.act_cand = 'tanh'
-
-#         self.has_initial_state = False
-#         self.is_reverse = True
-#         self.use_peepholes = True
-
-# class TestLstmOpNotUsePeepholes(TestLstmOp):
-#     def set_argument(self):
-#         self.lod = [[2, 3, 2]]
-#         self.D = 16
-
-#         self.act_gate = 'sigmoid'
-#         self.act_cell = 'tanh'
-#         self.act_cand = 'tanh'
-
-#         self.has_initial_state = False
-#         self.is_reverse = True
-#         self.use_peepholes = False
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
deleted file mode 100644
index eaa6b774c4d3e7add555c34f887e86dc847583b2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def sigmoid_np(x):
-    return 1. / (1. + np.exp(-x))
-
-
-def tanh_np(x):
-    return 2 * sigmoid_np(2. * x) - 1.
-
-
-class LstmUnitTest(OpTest):
-    def setUp(self):
-        self.op_type = "lstm_unit"
-        x_np = np.random.normal(size=(5, 16)).astype("float64")
-        c_np = np.random.normal(size=(5, 4)).astype("float64")
-        i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1)
-        forget_bias_np = 0.
-        self.attrs = {'forget_bias': 0.}
-
-        new_c = c_np * sigmoid_np(f_np + forget_bias_np) + sigmoid_np(
-            i_np) * tanh_np(j_np)
-        new_h = tanh_np(new_c) * sigmoid_np(o_np)
-
-        self.inputs = {'X': x_np, 'C_prev': c_np}
-        self.outputs = {'C': new_c, 'H': new_h}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X', 'C_prev'], ['C', 'H'])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
deleted file mode 100644
index 70a0af6c9854efdf4d8b7c849c15e7aff6935fb2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ /dev/null
@@ -1,319 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import test_lstm_op as LstmTest
-
-ACTIVATION = {
-    'identity': LstmTest.identity,
-    'sigmoid': LstmTest.sigmoid,
-    'tanh': LstmTest.tanh,
-    'relu': LstmTest.relu
-}
-
-
-# LSTM with recurrent projection Layer
-def lstmp(
-        input,  # T x 4D
-        lod,  # 1 x N
-        h0=None,  # N x D
-        c0=None,  # N x D
-        w_r=None,  # P x 4D
-        w_rh=None,  # D x P
-        w_b=None,  # 1 x 4D
-        w_c=None,  # 1 x 3D
-        is_reverse=False,
-        proj_clip=0.0,
-        cell_clip=0.0,
-        act_gate=None,
-        act_cell=None,
-        act_cand=None,
-        act_proj=None):
-    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, proj_clip, cell_clip, act_gate,
-              act_cell, act_cand, act_proj):
-        g = np.dot(r_pre, w_r)  # 1 x 4D
-        g = g + x
-        g = np.reshape(g, (1, g.size))
-        c, g_i, g_f, g_o = np.split(g, 4, axis=1)
-        if w_c is None:
-            g_i = act_gate(g_i)  # 1 x D
-            g_f = act_gate(g_f)  # 1 x D
-        else:
-            w_ic, w_fc, _ = np.split(w_c, 3, axis=1)
-            g_i = act_gate(g_i + w_ic * c_pre)  # 1 x D
-            g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
-        c = g_f * c_pre + g_i * act_cand(c)  # 1 x D
-
-        def array_clip(a, clip):
-            size = np.prod(a.shape)
-            new_a = np.reshape(a, (size))
-            for i in range(size):
-                new_a[i] = max(new_a[i], -1.0 * clip)
-                new_a[i] = min(new_a[i], clip)
-            new_a = np.reshape(new_a, a.shape)
-            return new_a
-
-        if cell_clip > 0.0:
-            c = array_clip(c, cell_clip)
-        if w_c is None:
-            g_o = act_gate(g_o)  # 1 x D
-        else:
-            _, _, w_oc = np.split(w_c, 3, axis=1)
-            g_o = act_gate(g_o + w_oc * c)  # 1 x D
-        h = g_o * act_cell(c)
-        # projection
-        r = np.dot(h, w_rh)
-        r = act_proj(r)
-        if proj_clip > 0.0:
-            r = array_clip(r, proj_clip)
-        return r, c
-
-    def _reverse(x, offset):
-        y = np.zeros_like(x)
-        for i in range(len(offset) - 1):
-            b, e = offset[i], offset[i + 1]
-            y[b:e, :] = np.flip(x[b:e, :], 0)
-        return y
-
-    offset = [0]
-    for l in lod[0]:
-        offset.append(offset[-1] + l)
-    batch_size = len(lod[0])
-    # recurrent projection state
-    projection = []
-    cell = []
-    input = _reverse(input, offset) if is_reverse else input
-    if w_b is not None:
-        input = input + np.tile(w_b, (offset[-1], 1))
-    for i in range(batch_size):
-        # compute one sequence
-        seq_len = lod[0][i]
-        x = input[offset[i]:offset[i + 1], :]
-        r_pre = h0[i]
-        c_pre = c0[i]  # 1 x D
-        for j in range(seq_len):
-            # compute one step
-            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, proj_clip,
-                                 cell_clip, act_gate, act_cell, act_cand,
-                                 act_proj)
-            projection.append(r_pre.flatten())
-            cell.append(c_pre.flatten())
-
-    projection = np.array(projection).astype('float64')
-    cell = np.array(cell).astype('float64')
-
-    projection = _reverse(projection, offset) if is_reverse else projection
-    cell = _reverse(cell, offset) if is_reverse else cell
-
-    assert projection.shape == (input.shape[0], w_r.shape[0])  # T x P
-    assert cell.shape == (input.shape[0], input.shape[1] / 4)  # T x D
-    return projection, cell
-
-
-class TestLstmpOp(LstmTest.TestLstmOp):
-    def reset_argument(self):
-        pass
-
-    def setUp(self):
-        self.set_argument()
-        # projection size
-        self.P = 10
-        self.act_proj = self.act_cell
-
-        self.reset_argument()
-        self.op_type = 'lstmp'
-
-        T = sum(self.lod[0])
-        N = len(self.lod[0])
-        x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
-        if self.has_initial_state:
-            h0 = np.random.normal(size=(N, self.P)).astype('float64')
-            c0 = np.random.normal(size=(N, self.D)).astype('float64')
-        else:
-            h0 = np.zeros((N, self.P)).astype('float64')
-            c0 = np.zeros((N, self.D)).astype('float64')
-        w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
-        if self.use_peepholes:
-            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
-        else:
-            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
-
-        w_b = b[:, 0:4 * self.D]
-        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
-        w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
-        proj_clip = 0.1
-        cell_clip = 0.1
-        r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
-                     proj_clip, cell_clip, ACTIVATION[self.act_gate],
-                     ACTIVATION[self.act_cell], ACTIVATION[self.act_cand],
-                     ACTIVATION[self.act_proj])
-
-        self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
-
-        self.inputs['Bias'] = b
-
-        if self.has_initial_state:
-            self.inputs['H0'] = h0
-            self.inputs['C0'] = c0
-
-        self.outputs = {
-            'Projection': (r, self.lod),
-            'Cell': (c, self.lod),
-        }
-        self.attrs = {
-            'use_peepholes': self.use_peepholes,
-            'is_reverse': self.is_reverse,
-            'proj_clip': proj_clip,
-            'cell_clip': cell_clip,
-            'gate_activation': self.act_gate,
-            'cell_activation': self.act_cell,
-            'candidate_activation': self.act_cand,
-            'proj_activation': self.act_proj
-        }
-
-    def test_check_output(self):
-        self.check_output(atol=1e-8)
-
-    def test_check_grad(self):
-        # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0])
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
-            max_relative_error=1e-2,
-            numeric_grad_delta=0.0000005)
-
-
-class TestLstmpOpHasInitial(TestLstmpOp):
-    def reset_argument(self):
-        self.has_initial_state = True
-
-    def test_check_grad(self):
-        # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0])
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
-            ['Projection'],
-            numeric_grad_delta=0.0000005,
-            max_relative_error=1e-2)
-
-    def test_check_grad_ingore_bias(self):
-        N = len(self.lod[0])
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'ProjWeight', 'Weight'], ['Projection'],
-            max_relative_error=1e-2,
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('Bias'))
-
-    def test_check_grad_ingore_weight(self):
-        N = len(self.lod[0])
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'ProjWeight', 'Bias'], ['Projection'],
-            max_relative_error=1e-2,
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('Weight'))
-
-    def test_check_grad_ingore_proj_weight(self):
-        N = len(self.lod[0])
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias'], ['Projection'],
-            max_relative_error=1e-2,
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('ProjWeight'))
-
-    def test_check_grad_ingore_input(self):
-        N = len(self.lod[0])
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Weight', 'ProjWeight', 'Bias'], ['Projection'],
-            max_relative_error=1e-2,
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('Input'))
-
-    def test_check_grad_ingore_h0(self):
-        N = len(self.lod[0])
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
-            max_relative_error=1e-2,
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('H0'))
-
-    def test_check_grad_ingore_c0(self):
-        N = len(self.lod[0])
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
-            max_relative_error=1e-2,
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('C0'))
-
-
-class TestLstmpOpRerverse(TestLstmpOp):
-    def reset_argument(self):
-        self.is_reverse = True
-
-
-class TestLstmpOpNotUsePeepholes(TestLstmpOp):
-    def reset_argument(self):
-        self.use_peepholes = False
-
-
-class TestLstmpOpLinearProjection(TestLstmpOp):
-    def reset_argument(self):
-        self.act_proj = 'identity'
-
-
-class TestLstmpOpLen0Case1(TestLstmpOp):
-    def reset_argument(self):
-        self.lod = [[0, 4, 0]]
-
-
-class TestLstmpOpLen0Case2(TestLstmpOp):
-    def reset_argument(self):
-        self.lod = [[2, 0, 3]]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
deleted file mode 100644
index 4a7e952436bd46c92c6256b4ec2d0652cfa38959..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestMarginRankLossOp(OpTest):
-    def setUp(self):
-        self.op_type = "margin_rank_loss"
-        batch_size = 5
-        margin = 0.5
-        # labels_{i} = {-1, 1}
-        label = 2 * np.random.randint(
-            0, 2, size=(batch_size, 1)).astype("float32") - 1
-        x1 = np.random.random((batch_size, 1)).astype("float32")
-        x2 = np.random.random((batch_size, 1)).astype("float32")
-        # loss = max(0, -label * (x1 - x2) + margin)
-        loss = -label * (x1 - x2) + margin
-        loss = np.where(loss > 0, loss, 0)
-        act = np.where(loss > 0, 1., 0.)
-
-        self.attrs = {'margin': margin}
-        self.inputs = {'Label': label, 'X1': x1, 'X2': x2}
-        self.outputs = {'Activated': act, 'Out': loss}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X1", "X2"], "Out")
-
-    def test_check_grad_ignore_x1(self):
-        self.check_grad(["X2"], "Out", no_grad_set=set('X1'))
-
-    def test_check_grad_ignore_x2(self):
-        self.check_grad(["X1"], "Out", no_grad_set=set('X2'))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py b/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py
deleted file mode 100644
index 49f630ba8f60364db68aa2d73c2f44adcd693167..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid as fluid
-
-
-class TestMatchMatrixTensorOp(OpTest):
-    def setUp(self):
-        self.init_op_type()
-        self.set_data()
-        self.compute()
-
-    def init_op_type(self):
-        self.op_type = "match_matrix_tensor"
-
-    def set_data(self):
-        ix, iy, h, dim_t = [5, 8, 3, 4]
-        x_lod = [[1, 2, 2]]
-        y_lod = [[3, 1, 4]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-    def init_data(self, ix, x_lod, iy, y_lod, h, dim_t):
-        x_data = np.random.random((ix, h)).astype('float32')
-        y_data = np.random.random((iy, h)).astype('float32')
-        w_data = np.random.random((h, dim_t, h)).astype('float32')
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod), 'W': w_data}
-        self.attrs = {'dim_t': dim_t}
-
-    def compute(self):
-        x_data, x_lod = self.inputs['X']
-        y_data, y_lod = self.inputs['Y']
-        # [k, dim_t, k] -> [dim_t, k, k]
-        w_data = self.inputs['W'].transpose(1, 0, 2)
-        out = np.zeros((0, 1), dtype=x_data.dtype)
-        # for x*w
-        tmp = np.zeros((0, 1), dtype=x_data.dtype)
-        out_lod = [[]]
-        tmp_lod = [[]]
-
-        x_offset, y_offset = 0, 0
-        for idx in range(len(x_lod[0])):
-            x_len = x_lod[0][idx]
-            y_len = y_lod[0][idx]
-            x_sub = x_data[x_offset:(x_offset + x_len), :]
-            y_sub = y_data[y_offset:(y_offset + y_len), :]
-            tmp_sub = np.dot(x_sub, w_data)
-            tmp = np.vstack((tmp, tmp_sub.reshape(tmp_sub.size, 1)))
-
-            out_sub = np.dot(tmp_sub, y_sub.T).transpose(1, 0, 2)
-            out_lod[0].append(out_sub.size)
-            out = np.vstack((out, out_sub.reshape(out_sub.size, 1)))
-
-            x_offset += x_len
-            y_offset += y_len
-        self.outputs = {'Out': (out, out_lod), 'Tmp': tmp}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
-
-
-class TestMatchMatrixTensorOpCase1(TestMatchMatrixTensorOp):
-    def set_data(self):
-        ix, iy, h, dim_t = [5, 8, 16, 4]
-        x_lod = [[5]]
-        y_lod = [[8]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-
-class TestMatchMatrixTensorOpCase2(TestMatchMatrixTensorOp):
-    def set_data(self):
-        ix, iy, h, dim_t = [7, 8, 1, 4]
-        x_lod = [[2, 3, 2]]
-        y_lod = [[3, 1, 4]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-
-class TestMatchMatrixTensorOpCase3(TestMatchMatrixTensorOp):
-    def set_data(self):
-        ix, iy, h, dim_t = [5, 9, 32, 1]
-        x_lod = [[1, 2, 2]]
-        y_lod = [[3, 2, 4]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-
-class TestMatchMatrixTensorOpCase4(TestMatchMatrixTensorOp):
-    def set_data(self):
-        ix, iy, h, dim_t = [8, 12, 16, 5]
-        x_lod = [[1, 2, 3, 1, 1]]
-        y_lod = [[3, 2, 4, 1, 2]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-    def test_api(self):
-        x_lod_tensor = fluid.layers.data(name='x', shape=[10], lod_level=1)
-        y_lod_tensor = fluid.layers.data(name='y', shape=[10], lod_level=1)
-        out, out_tmp = fluid.contrib.match_matrix_tensor(
-            x=x_lod_tensor, y=y_lod_tensor, channel_num=3)
-
-        place = fluid.CPUPlace()
-        x_data = np.random.rand(7, 10).astype('float32')
-        y_data = np.random.rand(9, 10).astype('float32')
-        x = fluid.create_lod_tensor(x_data, [[2, 5]], place)
-        y = fluid.create_lod_tensor(y_data, [[3, 6]], place)
-
-        exe = fluid.Executor(place=place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x,
-                            'y': y},
-                      fetch_list=[out],
-                      return_numpy=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
deleted file mode 100644
index f6cdb17def9e472414bf1213d8756f6d2977adfa..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ /dev/null
@@ -1,191 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from decorator_helper import prog_scope
-import paddle.fluid as fluid
-import numpy
-
-
-class TestMathOpPatches(unittest.TestCase):
-    @prog_scope()
-    def test_add_scalar(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = a + 10
-        ab = fluid.layers.concat(input=[a, b], axis=1)
-        c = ab + 10
-        d = ab + a
-        # e = a + ab
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np, c_np, d_np = exe.run(fluid.default_main_program(),
-                                   feed={"a": a_np},
-                                   fetch_list=[b, c, d])
-        self.assertTrue(numpy.allclose(a_np + 10, b_np))
-        ab_np = numpy.concatenate([a_np, b_np], axis=1)
-        self.assertTrue(numpy.allclose(ab_np + 10, c_np))
-        d_expected = ab_np + numpy.concatenate([a_np, a_np], axis=1)
-        self.assertTrue(numpy.allclose(d_expected, d_np))
-
-    @prog_scope()
-    def test_radd_scalar(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = 10 + a
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np},
-                       fetch_list=[b])
-        self.assertTrue(numpy.allclose(a_np + 10, b_np))
-
-    @prog_scope()
-    def test_sub_scalar(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = a - 10
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np},
-                       fetch_list=[b])
-        self.assertTrue(numpy.allclose(a_np - 10, b_np))
-
-    @prog_scope()
-    def test_radd_scalar(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = 10 - a
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np},
-                       fetch_list=[b])
-        self.assertTrue(numpy.allclose(10 - a_np, b_np))
-
-    @prog_scope()
-    def test_mul_scalar(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = a * 10
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np},
-                       fetch_list=[b])
-        self.assertTrue(numpy.allclose(a_np * 10, b_np))
-
-    @prog_scope()
-    def test_rmul_scalar(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = 10 * a
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np},
-                       fetch_list=[b])
-        self.assertTrue(numpy.allclose(10 * a_np, b_np))
-
-    @prog_scope()
-    def test_div_scalar(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = a / 10
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np},
-                       fetch_list=[b])
-        self.assertTrue(numpy.allclose(a_np / 10, b_np))
-
-    @prog_scope()
-    def test_rdiv_scalar(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = 10 / a
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32') + 1e-2
-
-        b_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np},
-                       fetch_list=[b])
-        self.assertTrue(numpy.allclose(10 / a_np, b_np))
-
-    @prog_scope()
-    def test_div_two_tensor(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = fluid.layers.data(name="b", shape=[1])
-        c = a / b
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = numpy.random.random(size=[10, 1]).astype('float32') + 1e-2
-        c_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np,
-                             'b': b_np},
-                       fetch_list=[c])
-        self.assertTrue(numpy.allclose(a_np / b_np, c_np))
-
-    @prog_scope()
-    def test_mul_two_tensor(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = fluid.layers.data(name="b", shape=[1])
-        c = a * b
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = numpy.random.random(size=[10, 1]).astype('float32')
-        c_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np,
-                             'b': b_np},
-                       fetch_list=[c])
-        self.assertTrue(numpy.allclose(a_np * b_np, c_np))
-
-    @prog_scope()
-    def test_add_two_tensor(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = fluid.layers.data(name="b", shape=[1])
-        c = a + b
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = numpy.random.random(size=[10, 1]).astype('float32')
-        c_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np,
-                             'b': b_np},
-                       fetch_list=[c])
-        self.assertTrue(numpy.allclose(a_np + b_np, c_np))
-
-    @prog_scope()
-    def test_sub_two_tensor(self):
-        a = fluid.layers.data(name="a", shape=[1])
-        b = fluid.layers.data(name="b", shape=[1])
-        c = a - b
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = numpy.random.random(size=[10, 1]).astype('float32')
-        c_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np,
-                             'b': b_np},
-                       fetch_list=[c])
-        self.assertTrue(numpy.allclose(a_np - b_np, c_np))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py
deleted file mode 100644
index abf10437d83268a6a84a1c62399eb02cd3b1d663..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_matmul_op.py
+++ /dev/null
@@ -1,174 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
-    BATCH_SIZE = 2
-    M = 3
-    N = 4
-    K = 5
-    if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
-        K = 1
-    if dim_X == 1:
-        if transpose_X:
-            shape_X = [M]
-        else:
-            shape_X = [K]
-    if dim_Y == 1:
-        if transpose_Y:
-            shape_Y = [N]
-        else:
-            shape_Y = [K]
-    if dim_X >= 2:
-        if transpose_X:
-            shape_X = [K, M]
-        else:
-            shape_X = [M, K]
-    if dim_X == 3:
-        shape_X = [BATCH_SIZE] + shape_X
-    if dim_Y >= 2:
-        if transpose_Y:
-            shape_Y = [N, K]
-        else:
-            shape_Y = [K, N]
-    if dim_Y == 3:
-        shape_Y = [BATCH_SIZE] + shape_Y
-    return shape_X, shape_Y
-
-
-def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
-    """Reference forward implementation using np.matmul."""
-    # np.matmul does not support the transpose flags, so we manually
-    # transpose X and Y appropriately.
-    if transpose_X:
-        if X.ndim == 1:
-            X = X.reshape((X.size, 1))
-        elif X.ndim == 2:
-            X = X.T
-        else:
-            dim = [i for i in range(len(X.shape))]
-            dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
-            X = np.transpose(X, tuple(dim))
-    if transpose_Y:
-        if Y.ndim == 1:
-            Y = Y.reshape((1, Y.size))
-        else:
-            dim = [i for i in range(len(Y.shape))]
-            dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
-            Y = np.transpose(Y, tuple(dim))
-
-    Out = np.matmul(X, Y)
-    if not Out.shape:
-        # We do not support 0-dimensional Tensors (scalars). So where
-        # np.matmul outputs a scalar, we must convert to a Tensor of
-        # shape (1, ) instead.
-        # Everywhere else, we are compatible with np.matmul.
-        Out = np.array([Out], dtype="float32")
-    return Out
-
-
-class Generator(object):
-    def setUp(self):
-        self.op_type = "matmul"
-        X = np.random.random(self.shape_X).astype("float32")
-        Y = np.random.random(self.shape_Y).astype("float32")
-        Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y)
-        self.inputs = {'X': X, 'Y': Y}
-        self.attrs = {
-            'transpose_X': self.transpose_X,
-            'transpose_Y': self.transpose_Y
-        }
-        self.outputs = {'Out': Out}
-
-    def test_check_output(self):
-        self.check_output(atol=1e-3)
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
-
-    def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
-
-    def test_check_grad_ignore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
-
-
-# Generate test cases for all possibilities
-def inject_test(dim_x, dim_y, trans_x, trans_y):
-    test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
-        dim_x, dim_y, trans_x, trans_y))
-    shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
-                                                  trans_y)
-    globals()[test_name] = type(test_name, (Generator, OpTest), {
-        'shape_X': shape_x,
-        'shape_Y': shape_y,
-        'transpose_X': trans_x,
-        'transpose_Y': trans_y,
-    })
-
-
-for dim_X in (1, 2, 3):
-    for dim_Y in (1, 2, 3):
-        for transose_x in (False, True):
-            for transose_y in (False, True):
-                inject_test(dim_X, dim_Y, transose_x, transose_y)
-
-
-# Test case n-dim
-def generate_compatible_shapes(dim, transpose_X, transpose_Y):
-    M = 2
-    N = 4
-    K = 3
-    shape_X = [2 for _ in range(dim - 2)]
-    shape_Y = [2 for _ in range(dim - 2)]
-
-    if transpose_X:
-        shape_X += [K, M]
-    else:
-        shape_X += [M, K]
-
-    if transpose_Y:
-        shape_Y += [N, K]
-    else:
-        shape_Y += [K, N]
-
-    return shape_X, shape_Y
-
-
-# # Test case n-dim
-for dim in [4]:
-    for transpose_X in [False, True]:
-        for transpose_Y in [False, True]:
-            test_name = (
-                'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
-                    dim, dim, transpose_X, transpose_Y))
-            shape_X, shape_Y = generate_compatible_shapes(dim, transpose_X,
-                                                          transpose_Y)
-            globals()[test_name] = type(test_name, (Generator, OpTest), {
-                'shape_X': shape_X,
-                'shape_Y': shape_Y,
-                'transpose_X': transpose_X,
-                'transpose_Y': transpose_Y,
-            })
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op_with_head.py b/python/paddle/fluid/tests/unittests/test_matmul_op_with_head.py
deleted file mode 100644
index 3cca8af2d4973ed84f891101511432a102d1b84f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_matmul_op_with_head.py
+++ /dev/null
@@ -1,294 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def generate_compatible_shapes_mul_head(dim_X, dim_Y, transpose_X, transpose_Y):
-    BATCH_SIZE = 2
-    M = 3
-    N = 4
-    K = 24
-    if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
-        K = 1
-    if dim_X == 1:
-        if transpose_X:
-            shape_X = [M]
-        else:
-            shape_X = [K]
-    if dim_Y == 1:
-        if transpose_Y:
-            shape_Y = [N]
-        else:
-            shape_Y = [K]
-    if dim_X >= 2:
-        if transpose_X:
-            shape_X = [K, M]
-        else:
-            shape_X = [M, K]
-    if dim_X == 3:
-        shape_X = [BATCH_SIZE] + shape_X
-    if dim_Y >= 2:
-        if transpose_Y:
-            shape_Y = [N, K]
-        else:
-            shape_Y = [K, N]
-    if dim_Y == 3:
-        shape_Y = [BATCH_SIZE] + shape_Y
-    return shape_X, shape_Y
-
-
-def matmul_head(X, Y, head_number=1):
-    x = []
-    y = []
-    z = []
-    sub_x_width = X.shape[-1] // head_number
-    sub_y_height = Y.shape[-2] // head_number
-    if np.ndim(X) == 2:
-        for i in range(0, head_number):
-            x.append(X[:, i * sub_x_width:i * sub_x_width + sub_x_width])
-            y.append(Y[i * sub_y_height:i * sub_y_height + sub_y_height, :])
-        for i in range(0, head_number):
-            z.append(np.matmul(x[i], y[i]))
-        Z = np.concatenate((z), axis=1)
-
-    elif np.ndim(X) == 3:
-        for i in range(0, head_number):
-            x.append(X[:, :, i * sub_x_width:i * sub_x_width + sub_x_width])
-            y.append(Y[:, i * sub_y_height:i * sub_y_height + sub_y_height, :])
-        for i in range(0, head_number):
-            z.append(np.matmul(x[i], y[i]))
-        Z = np.concatenate((z), axis=2)
-    else:
-        print("ERROR: Not supported dimension")
-
-    return Z
-
-
-def transpose_mat(X):
-    if X.ndim >= 2:
-        dim = np.arange(X.ndim)
-        dim[[-1, -2]] = dim[[-2, -1]]
-        X = np.transpose(X, tuple(dim))
-
-    return X
-
-
-def reference_matmul_mul_head(X,
-                              Y,
-                              head_number=1,
-                              transpose_X=False,
-                              transpose_Y=False):
-    """Reference forward implementation using np.matmul."""
-    # np.matmul does not support the transpose flags, so we manually
-    # transpose X and Y appropriately.
-    if transpose_X:
-        X = transpose_mat(X)
-    if transpose_Y:
-        Y = transpose_mat(Y)
-
-    Out = matmul_head(X, Y, head_number)
-    if not Out.shape:
-        # We do not support 0-dimensional Tensors (scalars). So where
-        # np.matmul outputs a scalar, we must convert to a Tensor of
-        # shape (1, ) instead.
-        # Everywhere else, we are compatible with np.matmul.
-        Out = np.array([Out], dtype="float32")
-    return Out
-
-
-# Generator for multiple head
-class GeneratorMulHead(object):
-    def setUp(self):
-        self.op_type = "matmul"
-        X = np.random.random(self.shape_X).astype("float32")
-        Y = np.random.random(self.shape_Y).astype("float32")
-        Out = reference_matmul_mul_head(X, Y, 4, self.transpose_X,
-                                        self.transpose_Y)
-
-        self.inputs = {'X': X, 'Y': Y}
-        self.attrs = {
-            'transpose_X': self.transpose_X,
-            'transpose_Y': self.transpose_Y,
-            'head_number': self.head_number
-        }
-        self.outputs = {'Out': Out}
-
-    def test_check_output(self):
-        self.check_output(atol=1e-3)
-
-
-def inject_test_multiple_head(dim_x, dim_y, trans_x, trans_y, head_number):
-    test_name = (
-        'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_head_{}'.format(
-            dim_x, dim_y, trans_x, trans_y, head_number))
-    shape_x, shape_y = generate_compatible_shapes_mul_head(dim_x, dim_y,
-                                                           trans_x, trans_y)
-    globals()[test_name] = type(test_name, (GeneratorMulHead, OpTest), {
-        'shape_X': shape_x,
-        'shape_Y': shape_y,
-        'transpose_X': trans_x,
-        'transpose_Y': trans_y,
-        'head_number': head_number
-    })
-
-
-def matmul_head2(X, Y, head_number=1):
-    x = []
-    y = []
-    z = []
-    sub_x_width = X.shape[-1] // head_number
-    sub_y_width = Y.shape[-1] // head_number
-    assert (sub_x_width == Y.shape[-2]
-            ), "Error: incompatible head number or matrix size!"
-    if np.ndim(X) == 2:
-        for i in range(0, head_number):
-            x.append(X[:, i * sub_x_width:i * sub_x_width + sub_x_width])
-            y.append(Y[:, i * sub_y_width:i * sub_y_width + sub_y_width])
-        for i in range(0, head_number):
-            z.append(np.matmul(x[i], y[i]))
-        Z = np.concatenate((z), axis=1)
-
-    elif np.ndim(X) == 3:
-        for i in range(0, head_number):
-            x.append(X[:, :, i * sub_x_width:i * sub_x_width + sub_x_width])
-            y.append(Y[:, :, i * sub_y_width:i * sub_y_width + sub_y_width])
-        for i in range(0, head_number):
-            z.append(np.matmul(x[i], y[i]))
-        Z = np.concatenate((z), axis=2)
-    else:
-        assert False, "ERROR: Not supported dimension!"
-    return Z
-
-
-def reference_matmul_mul_head2(X,
-                               Y,
-                               head_number=1,
-                               transpose_X=False,
-                               transpose_Y=False):
-    """Reference forward implementation using np.matmul."""
-    # np.matmul does not support the transpose flags, so we manually
-    # transpose X and Y appropriately.
-    if transpose_X:
-        X = transpose_mat(X)
-    if transpose_Y:
-        Y = transpose_mat(Y)
-
-    Out = matmul_head2(X, Y, head_number)
-    if not Out.shape:
-        # We do not support 0-dimensional Tensors (scalars). So where
-        # np.matmul outputs a scalar, we must convert to a Tensor of
-        # shape (1, ) instead.
-        # Everywhere else, we are compatible with np.matmul.
-        Out = np.array([Out], dtype="float32")
-    return Out
-
-
-def generate_compatible_shapes_mul_head2(dim_X, dim_Y, transpose_X,
-                                         transpose_Y):
-    BATCH_SIZE = 2
-    # Assume head number H is 4. We need make sure K1/H = M2
-    M1 = 3
-    K1 = 8
-    M2 = 2
-    K2 = 16
-
-    if dim_X >= 2:
-        if transpose_X:
-            shape_X = [K1, M1]
-        else:
-            shape_X = [M1, K1]
-    if dim_X == 3:
-        shape_X = [BATCH_SIZE] + shape_X
-    if dim_Y >= 2:
-        if transpose_Y:
-            shape_Y = [K2, M2]
-        else:
-            shape_Y = [M2, K2]
-    if dim_Y == 3:
-        shape_Y = [BATCH_SIZE] + shape_Y
-    return shape_X, shape_Y
-
-
-# Generator for multiple head, case 2 when width of X is not same as height of Y
-class GeneratorMulHead2(object):
-    def setUp(self):
-        self.op_type = "matmul"
-
-        X = np.zeros(self.shape_X)
-        Y = np.zeros(self.shape_Y)
-        if len(self.shape_X) == 2:
-            X = np.arange(
-                0, self.shape_X[-1] * self.shape_X[-2],
-                dtype=np.float32).reshape(self.shape_X)
-            Y = np.arange(
-                0, self.shape_Y[-1] * self.shape_Y[-2],
-                dtype=np.float32).reshape(self.shape_Y)
-        else:
-            for i in range(0, len(self.shape_X) - 1):
-                X[i, :, :] = np.arange(
-                    0, self.shape_X[-1] * self.shape_X[-2],
-                    dtype=np.float32).reshape(list(self.shape_X)[-2:])
-                Y[i, :, :] = np.arange(
-                    0, self.shape_Y[-1] * self.shape_Y[-2],
-                    dtype=np.float32).reshape(list(self.shape_Y)[-2:])
-
-        Out = reference_matmul_mul_head2(X, Y, 4, self.transpose_X,
-                                         self.transpose_Y)
-
-        self.inputs = {'X': X, 'Y': Y}
-        self.attrs = {
-            'transpose_X': self.transpose_X,
-            'transpose_Y': self.transpose_Y,
-            'head_number': self.head_number
-        }
-        self.outputs = {'Out': Out}
-
-    def test_check_output(self):
-        self.check_output(atol=1e-3)
-
-
-def inject_test_multiple_head2(dim_x, dim_y, trans_x, trans_y, head_number):
-    test_name = (
-        'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_head2_{}'.format(
-            dim_x, dim_y, trans_x, trans_y, head_number))
-    shape_x, shape_y = generate_compatible_shapes_mul_head2(dim_x, dim_y,
-                                                            trans_x, trans_y)
-    globals()[test_name] = type(test_name, (GeneratorMulHead2, OpTest), {
-        'shape_X': shape_x,
-        'shape_Y': shape_y,
-        'transpose_X': trans_x,
-        'transpose_Y': trans_y,
-        'head_number': head_number
-    })
-
-
-#test case for multiple head
-for dim in (2, 3):
-    for transose_x in (False, True):
-        for transose_y in (False, True):
-            inject_test_multiple_head(dim, dim, transose_x, transose_y, 4)
-
-#test case for multiple head when X.width != Y.height
-for dim in (2, 3):
-    for transose_x in (False, True):
-        for transose_y in (False, True):
-            inject_test_multiple_head2(dim, dim, transose_x, transose_y, 4)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py
deleted file mode 100644
index d588b22fe2607a6041359d420ebba757d8a632d6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_maxout_op.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def maxout_forward_naive(input, groups):
-    s0, s1, s2, s3 = input.shape
-    return np.ndarray([s0, s1 // groups, groups, s2, s3], \
-        buffer = input, dtype=input.dtype).max(axis=(2))
-
-
-class TestMaxOutOp(OpTest):
-    def setUp(self):
-        self.op_type = "maxout"
-        self.init_test_case()
-        input = np.random.random(self.shape).astype("float32")
-        output = self.MaxOut_forward_naive(input, self.groups).astype("float32")
-
-        self.inputs = {'X': input}
-        self.attrs = {'groups': self.groups}
-
-        self.outputs = {'Out': output.astype('float32')}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-    def init_test_case(self):
-        self.MaxOut_forward_naive = maxout_forward_naive
-        self.shape = [100, 6, 2, 2]
-        self.groups = 2
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py
deleted file mode 100644
index 03e94483178e83adad9886cd7df2107581360dd1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ /dev/null
@@ -1,116 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from __future__ import division
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
-                     in_mean_ious):
-    assert predictions.shape == labels.shape
-    predictions = predictions.flatten()
-    labels = labels.flatten()
-
-    out_wrong = np.zeros([num_classes]).astype("int32")
-    for _, wrong in in_wrongs:
-        out_wrong += wrong
-    out_correct = np.zeros([num_classes]).astype("int32")
-    for _, correct in in_corrects:
-        out_correct += correct
-
-    for pred, label in zip(predictions, labels):
-        if pred == label:
-            out_correct[pred] += 1
-        else:
-            out_wrong[pred] += 1
-            out_wrong[label] += 1
-
-    denominator = out_wrong + out_correct
-    valid_count = (denominator != 0).sum()
-    denominator = np.where(denominator > 0, denominator,
-                           np.ones(denominator.shape))
-    mean_iou = (out_correct / denominator).sum() / valid_count
-
-    for _, in_mean_iou in in_mean_ious:
-        mean_iou += in_mean_iou
-    return mean_iou, out_wrong, out_correct
-
-
-class TestMeanIOUOp(OpTest):
-    def setUp(self):
-        self.config()
-        self.op_type = "mean_iou"
-        predictions = np.random.randint(0, self.num_classes,
-                                        self.image_size).astype("int32")
-        labels = np.random.randint(0, self.num_classes,
-                                   self.image_size).astype("int32")
-
-        in_wrongs = []
-        for i in range(self.in_wrong_num):
-            in_wrongs.append(("in_wrong_%d" % i, np.random.randint(
-                0, 10, [self.num_classes]).astype("int32")))
-
-        in_corrects = []
-        for i in range(self.in_correct_num):
-            in_corrects.append(("in_correct_%d" % i, np.random.randint(
-                0, 10, [self.num_classes]).astype("int32")))
-
-        in_mean_ious = []
-        for i in range(self.in_mean_iou_num):
-            in_mean_ious.append(("in_mean_iou_%d" % i, np.random.uniform(
-                0, 1, [1]).astype("float32")))
-
-        self.inputs = {
-            'Predictions': predictions,
-            'Labels': labels,
-            'InWrongs': in_wrongs,
-            'InCorrects': in_corrects,
-            'InMeanIou': in_mean_ious
-        }
-        self.attrs = {'num_classes': int(self.num_classes)}
-        mean_iou, out_wrong, out_correct = compute_mean_iou(
-            predictions, labels, self.num_classes, in_wrongs, in_corrects,
-            in_mean_ious)
-        self.outputs = {
-            'OutMeanIou': mean_iou,
-            'OutWrong': out_wrong,
-            'OutCorrect': out_correct
-        }
-
-    def config(self):
-        self.num_classes = 10
-        self.image_size = [128, 128]
-        self.in_wrong_num = 0
-        self.in_correct_num = 0
-        self.in_mean_iou_num = 0
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestCase1(TestMeanIOUOp):
-    def config(self):
-        self.num_classes = 5
-        self.image_size = [100, 128]
-        self.in_wrong_num = 2
-        self.in_correct_num = 2
-        self.in_mean_iou_num = 2
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py
deleted file mode 100644
index beae909e9b4c88eb7ddfbbe4e5ad2cf583a953ef..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-
-
-class TestMeanOp(OpTest):
-    def setUp(self):
-        self.op_type = "mean"
-        self.dtype = np.float32
-        self.init_dtype_type()
-        self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)}
-        self.outputs = {'Out': np.mean(self.inputs["X"])}
-
-    def init_dtype_type(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_checkout_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestFP16MeanOp(TestMeanOp):
-    def init_dtype_type(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_output_with_place(place, atol=2e-3)
-
-    def test_checkout_grad(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X'], 'Out', max_relative_error=0.8)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py b/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py
deleted file mode 100644
index a1b7380fdd9a201a6cb17aeaf72ca8577506a63a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import numpy as np
-import unittest
-
-
-class TestMemoryReuseExcludeFeedVar(unittest.TestCase):
-    def setUp(self):
-        self.image_shape = [28, 28]
-        self.iteration = 10
-
-    def main_impl(self, place):
-        image = fluid.layers.data(
-            name='image', shape=self.image_shape, dtype='float32')
-        relu_image = fluid.layers.relu(image)
-        loss = fluid.layers.reduce_mean(relu_image)
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.enable_inplace = True
-        build_strategy.memory_optimize = True
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        compiled_prog = fluid.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy)
-
-        image_tensor = fluid.LoDTensor()
-        np_image = np.random.uniform(
-            low=-10, high=10, size=self.image_shape).astype('float32')
-        image_tensor.set(np_image, place)
-
-        feed_dict = [{image.name: image_tensor}]
-
-        for _ in range(self.iteration):
-            exe.run(compiled_prog, feed=feed_dict, fetch_list=[loss.name])
-            self.assertTrue(np.array_equal(np.array(image_tensor), np_image))
-
-    def test_main(self):
-        places = [fluid.CPUPlace()]
-        if fluid.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-
-        for p in places:
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                with fluid.unique_name.guard():
-                    with fluid.scope_guard(fluid.Scope()):
-                        self.main_impl(p)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py
deleted file mode 100644
index 4cdb5b5d9f7f020c4eb9a3b3a804c074d7ddbb35..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_memory_usage.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle
-import paddle.fluid as fluid
-import contextlib
-import unittest
-
-
-def train_simulator(test_batch_size=10):
-    if test_batch_size <= 0:
-        raise ValueError("batch_size should be a positive integeral value, "
-                         "but got batch_size={}".format(test_batch_size))
-
-    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-
-    # Calculate memory usage in current network config
-    lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
-        fluid.default_main_program(), batch_size=test_batch_size)
-
-    print("memory usage is about %.3f - %.3f %s" %
-          (lower_usage, upper_usage, unit))
-
-
-class TestMemoryUsage(unittest.TestCase):
-    def test_with_unit_B(self):
-        with self.program_scope_guard():
-            train_simulator()
-
-    def test_with_unit_KB(self):
-        with self.program_scope_guard():
-            train_simulator(test_batch_size=1000)
-
-    def test_with_unit_MB(self):
-        with self.program_scope_guard():
-            train_simulator(test_batch_size=100000)
-
-    @contextlib.contextmanager
-    def program_scope_guard(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
deleted file mode 100644
index b109e4ea62669c735128f4824eb9d02ad43900e0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestMergeIdsOp(OpTest):
-    def setUp(self):
-        self.op_type = "merge_ids"
-        ids1 = np.array([[0], [2], [5], [6]]).astype('int64')
-        ids2 = np.array([[0], [2], [2], [3]]).astype('int64')
-
-        rows1 = np.array([[0], [2]]).astype('int64')
-        rows2 = np.array([[3], [5]]).astype('int64')
-        rows3 = np.array([[6]]).astype('int64')
-
-        x0 = np.array([[0.1, 0.2], [0.2, 0.3]]).astype('float32')
-        x1 = np.array([[0.3, 0.4], [0.4, 0.5]]).astype('float32')
-        x2 = np.array([[0.5, 0.6]]).astype('float32')
-
-        out1 = np.array(
-            [[0.1, 0.2], [0.2, 0.3], [0.4, 0.5], [0.5, 0.6]]).astype('float32')
-        out2 = np.array(
-            [[0.1, 0.2], [0.2, 0.3], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
-
-        self.inputs = {
-            'Ids': [('ids1', ids1), ('ids2', ids2)],
-            "Rows": [('rows1', rows1), ('rows2', rows2), ('rows3', rows3)],
-            "X": [('x0', x0), ('x1', x1), ('x2', x2)]
-        }
-        self.outputs = {'Out': [('out1', out1), ('out2', out2)]}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
deleted file mode 100644
index d2fa344b67ab33a93f92733efd68e896c767bad2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import numpy as np
-from paddle.fluid.op import Operator
-
-
-class TestMergeSelectedRows(unittest.TestCase):
-    def get_places(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        return places
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-        x_rows = [0, 5, 5, 4, 19]
-        out_rows = [0, 4, 5, 19]
-        height = 20
-        row_numel = 2
-
-        np_array = np.ones((len(x_rows), row_numel)).astype("float32")
-        np_array[1, :] = 2.0
-        np_array[2, :] = 3.0
-        np_array[3, :] = 4.0
-
-        # initialize input variable X
-        x = scope.var('X').get_selected_rows()
-        x.set_rows(x_rows)
-        x.set_height(height)
-        x_tensor = x.get_tensor()
-        x_tensor.set(np_array, place)
-
-        # initialize input variable Out
-        out = scope.var("Out").get_selected_rows()
-
-        op = Operator("merge_selected_rows", X="X", Out="Out")
-
-        op.run(scope, place)
-
-        self.assertEqual(out.rows(), out_rows)
-        self.assertEqual(out.height(), height)
-
-        out_array = np.array(out.get_tensor())
-        self.assertEqual((4, 2), out_array.shape)
-
-        assert (out_array[0, :] == 1.0).all()
-        assert (out_array[1, :] == 4.0).all()
-        assert (out_array[2, :] == 5.0).all()
-        assert (out_array[3, :] == 1.0).all()
-
-    def test_check_output(self):
-        for place in self.get_places():
-            self.check_with_place(place)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_metrics.py b/python/paddle/fluid/tests/unittests/test_metrics.py
deleted file mode 100644
index ec27884cae2b0462951f6597b1b83e58d1c8af5d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_metrics.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle.fluid as fluid
-from paddle.fluid.framework import Program, program_guard
-
-
-class TestMetricsDetectionMap(unittest.TestCase):
-    def test_detection_map(self):
-        program = fluid.Program()
-        with program_guard(program):
-            detect_res = fluid.layers.data(
-                name='detect_res',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
-            label = fluid.layers.data(
-                name='label',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='float32')
-            box = fluid.layers.data(
-                name='bbox',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            map_eval = fluid.metrics.DetectionMAP(
-                detect_res, label, box, class_num=21)
-            cur_map, accm_map = map_eval.get_map_var()
-            self.assertIsNotNone(cur_map)
-            self.assertIsNotNone(accm_map)
-        print(str(program))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
deleted file mode 100644
index 4e5cc91268c5df4be3de3c04a82ef65b33cf4d20..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
+++ /dev/null
@@ -1,102 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-from op_test import OpTest
-
-
-class TestMineHardExamplesOp(OpTest):
-    def set_data(self):
-        self.init_test_data()
-        self.inputs = {
-            'ClsLoss': self.cls_loss,
-            'LocLoss': self.loc_loss,
-            'MatchIndices': self.match_indices,
-            'MatchDist': self.match_dis
-        }
-
-        self.attrs = {
-            'neg_pos_ratio': self.neg_pos_ratio,
-            'neg_overlap': self.neg_overlap,
-            'sample_size': self.sample_size,
-            'mining_type': self.mining_type
-        }
-
-        self.outputs = {
-            'NegIndices': (self.neg_indices, self.neg_indices_lod),
-            'UpdatedMatchIndices': self.updated_match_indices
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        return
-
-    def setUp(self):
-        self.op_type = "mine_hard_examples"
-        self.set_data()
-
-    def init_test_data(self):
-        self.neg_pos_ratio = 1.0
-        self.neg_overlap = 0.5
-        self.sample_size = 0
-        self.mining_type = "max_negative"
-        self.cls_loss = np.array([[0.1, 0.1, 0.3],
-                                  [0.3, 0.1, 0.1]]).astype('float32')
-
-        self.loc_loss = np.array([[0.1, 0.2, 0.3],
-                                  [0.3, 0.4, 0.1]]).astype('float32')
-
-        self.match_dis = np.array([[0.2, 0.4, 0.8],
-                                   [0.1, 0.9, 0.3]]).astype('float32')
-
-        self.match_indices = np.array([[0, -1, -1],
-                                       [-1, 0, -1]]).astype('int32')
-
-        self.updated_match_indices = self.match_indices
-
-        self.neg_indices_lod = [[1, 1]]
-        self.neg_indices = np.array([[1], [0]]).astype('int32')
-
-
-class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
-    def init_test_data(self):
-        super(TestMineHardExamplesOpHardExample, self).init_test_data()
-        self.mining_type = "hard_example"
-        self.sample_size = 2
-
-        self.cls_loss = np.array([[0.5, 0.1, 0.3],
-                                  [0.3, 0.1, 0.1]]).astype('float32')
-
-        self.loc_loss = np.array([[0.2, 0.2, 0.3],
-                                  [0.3, 0.1, 0.2]]).astype('float32')
-
-        self.match_indices = np.array([[0, -1, -1],
-                                       [-1, 0, -1]]).astype('int32')
-
-        self.updated_match_indices = np.array([[0, -1, -1],
-                                               [-1, -1, -1]]).astype('int32')
-
-        self.neg_indices_lod = [[1, 2]]
-        self.neg_indices = np.array([[2], [0], [2]]).astype('int32')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_minus_op.py b/python/paddle/fluid/tests/unittests/test_minus_op.py
deleted file mode 100644
index 54253b17b967871b03628023c5a9fdb339af1828..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_minus_op.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestMinusOp(OpTest):
-    def setUp(self):
-        self.op_type = "minus"
-        self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((32, 84)).astype("float32")
-        }
-        self.outputs = {'Out': (self.inputs['X'] - self.inputs['Y'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
deleted file mode 100644
index 5ccf855ebc3604389fa6e8b30367b040978c3ed4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid.core as core
-import math
-import os
-import sys
-import unittest
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-from simple_nets import init_data
-from parallel_executor_test_base import TestParallelExecutorBase
-
-batch_size = 12
-img_shape = [1, 28, 28]
-
-
-def loss_net(hidden, label):
-    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_loss = fluid.layers.mean(loss)
-    return avg_loss
-
-
-def conv_net(use_feed):
-    img = fluid.layers.data(name='image', shape=img_shape, dtype='float16')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-
-    conv_pool_1 = fluid.layers.cast(conv_pool_1, np.float32)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    hidden = fluid.layers.cast(conv_pool_2, np.float32)
-    return loss_net(hidden, label)
-
-
-def _optimizer(learning_rate=1e-6):
-    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
-    return optimizer
-
-
-class TestResnet(TestParallelExecutorBase):
-    def check_model(self, use_cuda):
-        img, label = init_data(
-            batch_size=batch_size, img_shape=img_shape, label_range=9)
-        img = np.float16(img).view(np.uint16)
-        feed_dict = {"image": img, "label": label}
-
-        TestParallelExecutorBase.check_network_convergence(
-            conv_net,
-            feed_dict=feed_dict,
-            iter=10,
-            use_cuda=use_cuda,
-            fuse_all_reduce_ops=True,
-            optimizer=_optimizer)
-
-    def test_model(self):
-        if core.is_compiled_with_cuda():
-            self.check_model(True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
deleted file mode 100644
index 02fecfe47ec3fbff085b0a7f24316e5d0f6cd814..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def modified_huber_loss_forward(val):
-    if val < -1:
-        return -4. * val
-    elif val < 1:
-        return (1. - val) * (1. - val)
-    else:
-        return 0.
-
-
-class TestModifiedHuberLossOp(OpTest):
-    def setUp(self):
-        self.op_type = 'modified_huber_loss'
-        samples_num = 32
-
-        x_np = np.random.uniform(-2., 2., (samples_num, 1)).astype('float32')
-        y_np = np.random.choice([0, 1], samples_num).reshape(
-            (samples_num, 1)).astype('float32')
-        product_res = x_np * (2. * y_np - 1.)
-        # keep away from the junction of piecewise function
-        for pos, val in np.ndenumerate(product_res):
-            while abs(val - 1.) < 0.05:
-                x_np[pos] = np.random.uniform(-2., 2.)
-                y_np[pos] = np.random.choice([0, 1])
-                product_res[pos] = x_np[pos] * (2 * y_np[pos] - 1)
-                val = product_res[pos]
-
-        self.inputs = {'X': x_np, 'Y': y_np}
-        loss = np.vectorize(modified_huber_loss_forward)(product_res)
-
-        self.outputs = {
-            'IntermediateVal': product_res.astype('float32'),
-            'Out': loss.reshape((samples_num, 1)).astype('float32')
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.01)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
deleted file mode 100644
index 77ec6f9b6bcda7568325698634fd4f86557cd1be..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ /dev/null
@@ -1,238 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from op_test import OpTest
-
-
-class TestMomentumOp1(OpTest):
-    def setUp(self):
-        self.op_type = "momentum"
-        self.dtype = np.float32
-        self.init_dtype()
-
-        param = np.random.random((123, 321)).astype(self.dtype)
-        grad = np.random.random((123, 321)).astype(self.dtype)
-        velocity = np.zeros((123, 321)).astype(self.dtype)
-        learning_rate = np.array([0.001]).astype(self.dtype)
-        mu = 0.0001
-        use_nesterov = False
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Velocity': velocity,
-            'LearningRate': learning_rate
-        }
-
-        self.attrs = {'mu': mu}
-
-        velocity_out = mu * velocity + grad
-        if use_nesterov:
-            param_out = param - grad * learning_rate - \
-                        velocity_out * mu * learning_rate
-        else:
-            param_out = param - learning_rate * velocity_out
-
-        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
-
-    def init_dtype(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestMomentumOpFp16(TestMomentumOp1):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        self.check_output(atol=1e-3)
-
-
-class TestMomentumOp2(OpTest):
-    '''Test Momentum with default values for attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "momentum"
-
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        velocity = np.zeros((123, 321)).astype("float32")
-        learning_rate = np.array([0.001]).astype("float32")
-        mu = 0.0001
-        use_nesterov = True
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Velocity': velocity,
-            'LearningRate': learning_rate
-        }
-
-        self.attrs = {'mu': mu, 'use_nesterov': use_nesterov}
-
-        velocity_out = mu * velocity + grad
-        if use_nesterov:
-            param_out = param - grad * learning_rate - \
-                        velocity_out * mu * learning_rate
-        else:
-            param_out = param - learning_rate * velocity_out
-
-        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestLarsMomentumOp(OpTest):
-    def setUp(self):
-        self.op_type = "lars_momentum"
-
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        velocity = np.zeros((123, 321)).astype("float32")
-        learning_rate = np.array([0.001]).astype("float32")
-        mu = 0.0001
-        lars_coeff = 0.001
-        lars_weight_decay = 0.0005
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Velocity': velocity,
-            'LearningRate': learning_rate
-        }
-
-        self.attrs = {
-            'mu': mu,
-            'lars_coeff': lars_coeff,
-            'lars_weight_decay': lars_weight_decay
-        }
-
-        pnorm = np.sqrt(np.square(param).sum())
-        gnorm = np.sqrt(np.square(grad).sum())
-        local_lr = learning_rate * lars_coeff * pnorm / (
-            gnorm + lars_weight_decay * param)
-        velocity_out = mu * velocity + local_lr * (grad + lars_weight_decay *
-                                                   param)
-        param_out = param - velocity_out
-
-        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSparseMomentumOp(unittest.TestCase):
-    def setUp(self):
-        self.use_nesterov = False
-
-    def check_with_place(self, place):
-        self.init_kernel()
-        scope = core.Scope()
-        # create and initialize Grad Variable
-        height = 10
-        rows = [0, 4, 7]
-        row_numel = 12
-        mu = 1.0
-        use_nesterov = self.use_nesterov
-
-        # create and initialize Param Variable
-        param = scope.var('Param').get_tensor()
-        param_array = np.full((height, row_numel), 5.0).astype("float32")
-        param.set(param_array, place)
-        param_out = scope.var("ParamOut").get_tensor()
-        param_out_array = np.full((height, row_numel), 0.0).astype("float32")
-        param_out.set(param_out_array, place)
-
-        grad_selected_rows = scope.var('Grad').get_selected_rows()
-        grad_selected_rows.set_height(height)
-        grad_selected_rows.set_rows(rows)
-        grad_np_array = np.ones((len(rows), row_numel)).astype("float32")
-        grad_np_array[0, 0] = 2.0
-        grad_np_array[2, 8] = 4.0
-        grad_tensor = grad_selected_rows.get_tensor()
-        grad_tensor.set(grad_np_array, place)
-
-        velocity = scope.var('Velocity').get_tensor()
-        velocity_np_array = np.ones((height, row_numel)).astype("float32")
-        velocity.set(velocity_np_array, place)
-        velocity_out = scope.var('VelocityOut').get_tensor()
-        velocity_out_np_array = np.full((height, row_numel),
-                                        0.0).astype("float32")
-        velocity_out.set(velocity_out_np_array, place)
-
-        # create and initialize LeraningRate Variable
-        lr = scope.var('LearningRate').get_tensor()
-        lr_array = np.full((1), 2.0).astype("float32")
-        lr.set(lr_array, place)
-
-        # create and run operator
-        op = Operator(
-            "momentum",
-            Param='Param',
-            Grad='Grad',
-            Velocity='Velocity',
-            ParamOut='ParamOut',
-            VelocityOut='VelocityOut',
-            LearningRate='LearningRate',
-            mu=mu,
-            use_nesterov=use_nesterov)
-        op.run(scope, place)
-
-        # get and compare result
-        param_out_np_array = np.array(param_out)
-        velocity_out_np_array = np.array(velocity_out)
-
-        # TODO(dzh): add a more suitable general numpy interface
-        # for sparse update.
-        _grad_np_array = np.full((height, row_numel), 0.0).astype("float32")
-        for i in range(len(rows)):
-            _grad_np_array[rows[i]] = grad_np_array[i]
-        _velocity_out = mu * velocity_np_array + _grad_np_array
-        _param = param_array
-        if use_nesterov:
-            _param_out = _param - (_grad_np_array + _velocity_out * mu
-                                   ) * lr_array
-        else:
-            _param_out = _param - lr_array * _velocity_out
-        self.assertTrue((_velocity_out == velocity_out_np_array).all())
-        self.assertTrue((_param_out == param_out_np_array).all())
-
-    def init_kernel(self):
-        pass
-
-    def test_sparse_momentum(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
-
-
-class TestSparseMomentumOp2(TestSparseMomentumOp):
-    def init_kernel(self):
-        self.use_nesterov = True
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py
deleted file mode 100644
index 64b4004e4becbc24fa4533f8797f4057b6ae43ce..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_mse_loss.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid.executor import Executor
-
-
-class TestMseLoss(unittest.TestCase):
-    def test_mse_loss(self):
-        input_val = np.random.uniform(0.1, 0.5, (2, 3)).astype("float32")
-        label_val = np.random.uniform(0.1, 0.5, (2, 3)).astype("float32")
-
-        sub = input_val - label_val
-        np_result = np.mean(sub * sub)
-
-        input_var = layers.create_tensor(dtype="float32", name="input")
-        label_var = layers.create_tensor(dtype="float32", name="label")
-
-        layers.assign(input=input_val, output=input_var)
-        layers.assign(input=label_val, output=label_var)
-        output = layers.mse_loss(input=input_var, label=label_var)
-        for use_cuda in ([False, True]
-                         if core.is_compiled_with_cuda() else [False]):
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = Executor(place)
-            result = exe.run(fluid.default_main_program(),
-                             feed={"input": input_var,
-                                   "label": label_var},
-                             fetch_list=[output])
-
-            self.assertTrue(np.isclose(np_result, result).all())
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
deleted file mode 100644
index d54326714acf47bd5d2abd2d919b0e3b0cab3546..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ /dev/null
@@ -1,161 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-class TestMulOp(OpTest):
-    def setUp(self):
-        self.op_type = "mul"
-        self.dtype = np.float32
-        self.init_dtype_type()
-        self.inputs = {
-            'X': np.random.random((2, 5)).astype(self.dtype),
-            'Y': np.random.random((5, 3)).astype(self.dtype)
-        }
-        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
-
-    def init_dtype_type(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-
-
-class TestMulOp2(OpTest):
-    def setUp(self):
-        self.op_type = "mul"
-        self.dtype = np.float32
-        self.init_dtype_type()
-        self.inputs = {
-            'X': np.random.random((3, 4, 4, 3)).astype(self.dtype),
-            'Y': np.random.random((2, 6, 1, 2, 3)).astype(self.dtype)
-        }
-        self.attrs = {
-            'x_num_col_dims': 2,
-            'y_num_col_dims': 2,
-        }
-        result = np.dot(self.inputs['X'].reshape(3 * 4, 4 * 3),
-                        self.inputs['Y'].reshape(2 * 6, 1 * 2 * 3))
-        result = result.reshape(3, 4, 1, 2, 3)
-        self.outputs = {'Out': result}
-
-    def init_dtype_type(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set('X'))
-
-    def test_check_grad_ignore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestFP16MulOp1(TestMulOp):
-    def init_dtype_type(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_output_with_place(place, atol=1e-1)
-
-    def test_check_grad_normal(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=0.5)
-
-    def test_check_grad_ingore_x(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['Y'],
-                'Out',
-                max_relative_error=0.5,
-                no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X'],
-                'Out',
-                max_relative_error=0.5,
-                no_grad_set=set('Y'))
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestFP16MulOp2(TestMulOp2):
-    def init_dtype_type(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_output_with_place(place, atol=2e-1)
-
-    def test_check_grad_normal(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=0.9)
-
-    def test_check_grad_ingore_x(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['Y'],
-                'Out',
-                max_relative_error=0.5,
-                no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X'],
-                'Out',
-                max_relative_error=0.9,
-                no_grad_set=set('Y'))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
deleted file mode 100644
index 98391260883488b209077c2aa108cf66302a75bf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ /dev/null
@@ -1,503 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import copy
-from op_test import OpTest
-
-
-def iou(box_a, box_b, norm):
-    """Apply intersection-over-union overlap between box_a and box_b
-    """
-    xmin_a = min(box_a[0], box_a[2])
-    ymin_a = min(box_a[1], box_a[3])
-    xmax_a = max(box_a[0], box_a[2])
-    ymax_a = max(box_a[1], box_a[3])
-
-    xmin_b = min(box_b[0], box_b[2])
-    ymin_b = min(box_b[1], box_b[3])
-    xmax_b = max(box_b[0], box_b[2])
-    ymax_b = max(box_b[1], box_b[3])
-
-    area_a = (ymax_a - ymin_a + (norm == False)) * (xmax_a - xmin_a +
-                                                    (norm == False))
-    area_b = (ymax_b - ymin_b + (norm == False)) * (xmax_b - xmin_b +
-                                                    (norm == False))
-    if area_a <= 0 and area_b <= 0:
-        return 0.0
-
-    xa = max(xmin_a, xmin_b)
-    ya = max(ymin_a, ymin_b)
-    xb = min(xmax_a, xmax_b)
-    yb = min(ymax_a, ymax_b)
-
-    inter_area = max(xb - xa + (norm == False),
-                     0.0) * max(yb - ya + (norm == False), 0.0)
-
-    iou_ratio = inter_area / (area_a + area_b - inter_area)
-
-    return iou_ratio
-
-
-def nms(boxes,
-        scores,
-        score_threshold,
-        nms_threshold,
-        top_k=200,
-        normalized=True,
-        eta=1.0):
-    """Apply non-maximum suppression at test time to avoid detecting too many
-    overlapping bounding boxes for a given object.
-    Args:
-        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
-        scores: (tensor) The class predscores for the img, Shape:[num_priors].
-        score_threshold: (float) The confidence thresh for filtering low
-            confidence boxes.
-        nms_threshold: (float) The overlap thresh for suppressing unnecessary
-            boxes.
-        top_k: (int) The maximum number of box preds to consider.
-        eta: (float) The parameter for adaptive NMS.
-    Return:
-        The indices of the kept boxes with respect to num_priors.
-    """
-    all_scores = copy.deepcopy(scores)
-    all_scores = all_scores.flatten()
-    selected_indices = np.argwhere(all_scores > score_threshold)
-    selected_indices = selected_indices.flatten()
-    all_scores = all_scores[selected_indices]
-
-    sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort')
-    sorted_scores = all_scores[sorted_indices]
-    sorted_indices = selected_indices[sorted_indices]
-    if top_k > -1 and top_k < sorted_indices.shape[0]:
-        sorted_indices = sorted_indices[:top_k]
-        sorted_scores = sorted_scores[:top_k]
-
-    selected_indices = []
-    adaptive_threshold = nms_threshold
-    for i in range(sorted_scores.shape[0]):
-        idx = sorted_indices[i]
-        keep = True
-        for k in range(len(selected_indices)):
-            if keep:
-                kept_idx = selected_indices[k]
-                overlap = iou(boxes[idx], boxes[kept_idx], normalized)
-                keep = True if overlap <= adaptive_threshold else False
-            else:
-                break
-        if keep:
-            selected_indices.append(idx)
-        if keep and eta < 1 and adaptive_threshold > 0.5:
-            adaptive_threshold *= eta
-    return selected_indices
-
-
-def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
-                   nms_top_k, keep_top_k, normalized, shared):
-    if shared:
-        class_num = scores.shape[0]
-        priorbox_num = scores.shape[1]
-    else:
-        box_num = scores.shape[0]
-        class_num = scores.shape[1]
-
-    selected_indices = {}
-    num_det = 0
-    for c in range(class_num):
-        if c == background: continue
-        if shared:
-            indices = nms(boxes, scores[c], score_threshold, nms_threshold,
-                          nms_top_k, normalized)
-        else:
-            indices = nms(boxes[:, c, :], scores[:, c], score_threshold,
-                          nms_threshold, nms_top_k, normalized)
-        selected_indices[c] = indices
-        num_det += len(indices)
-
-    if keep_top_k > -1 and num_det > keep_top_k:
-        score_index = []
-        for c, indices in selected_indices.items():
-            for idx in indices:
-                if shared:
-                    score_index.append((scores[c][idx], c, idx))
-                else:
-                    score_index.append((scores[idx][c], c, idx))
-
-        sorted_score_index = sorted(
-            score_index, key=lambda tup: tup[0], reverse=True)
-        sorted_score_index = sorted_score_index[:keep_top_k]
-        selected_indices = {}
-
-        for _, c, _ in sorted_score_index:
-            selected_indices[c] = []
-        for s, c, idx in sorted_score_index:
-            selected_indices[c].append(idx)
-        if not shared:
-            for labels in selected_indices:
-                selected_indices[labels].sort()
-        num_det = keep_top_k
-
-    return selected_indices, num_det
-
-
-def lod_multiclass_nms(boxes, scores, background, score_threshold,
-                       nms_threshold, nms_top_k, keep_top_k, box_lod,
-                       normalized):
-    num_class = boxes.shape[1]
-    det_outs = []
-    lod = []
-    head = 0
-    for n in range(len(box_lod[0])):
-        box = boxes[head:head + box_lod[0][n]]
-        score = scores[head:head + box_lod[0][n]]
-        offset = head
-        head = head + box_lod[0][n]
-        nmsed_outs, nmsed_num = multiclass_nms(
-            box,
-            score,
-            background,
-            score_threshold,
-            nms_threshold,
-            nms_top_k,
-            keep_top_k,
-            normalized,
-            shared=False)
-        lod.append(nmsed_num)
-
-        if nmsed_num == 0:
-            continue
-        tmp_det_out = []
-        for c, indices in nmsed_outs.items():
-            for idx in indices:
-                xmin, ymin, xmax, ymax = box[idx, c, :]
-                tmp_det_out.append([
-                    c, score[idx][c], xmin, ymin, xmax, ymax,
-                    offset * num_class + idx * num_class + c
-                ])
-        sorted_det_out = sorted(
-            tmp_det_out, key=lambda tup: tup[0], reverse=False)
-        det_outs.extend(sorted_det_out)
-
-    return det_outs, lod
-
-
-def batched_multiclass_nms(boxes,
-                           scores,
-                           background,
-                           score_threshold,
-                           nms_threshold,
-                           nms_top_k,
-                           keep_top_k,
-                           normalized=True):
-    batch_size = scores.shape[0]
-    num_boxes = scores.shape[2]
-    det_outs = []
-    index_outs = []
-    lod = []
-    for n in range(batch_size):
-        nmsed_outs, nmsed_num = multiclass_nms(
-            boxes[n],
-            scores[n],
-            background,
-            score_threshold,
-            nms_threshold,
-            nms_top_k,
-            keep_top_k,
-            normalized,
-            shared=True)
-        lod.append(nmsed_num)
-
-        if nmsed_num == 0:
-            continue
-        tmp_det_out = []
-        for c, indices in nmsed_outs.items():
-            for idx in indices:
-                xmin, ymin, xmax, ymax = boxes[n][idx][:]
-                tmp_det_out.append([
-                    c, scores[n][c][idx], xmin, ymin, xmax, ymax,
-                    idx + n * num_boxes
-                ])
-        sorted_det_out = sorted(
-            tmp_det_out, key=lambda tup: tup[0], reverse=False)
-        det_outs.extend(sorted_det_out)
-    return det_outs, lod
-
-
-class TestMulticlassNMSOp(OpTest):
-    def set_argument(self):
-        self.score_threshold = 0.01
-
-    def setUp(self):
-        self.set_argument()
-        N = 7
-        M = 1200
-        C = 21
-        BOX_SIZE = 4
-        background = 0
-        nms_threshold = 0.3
-        nms_top_k = 400
-        keep_top_k = 200
-        score_threshold = self.score_threshold
-
-        scores = np.random.random((N * M, C)).astype('float32')
-
-        def softmax(x):
-            shiftx = x - np.max(x).clip(-64.)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
-        scores = np.apply_along_axis(softmax, 1, scores)
-        scores = np.reshape(scores, (N, M, C))
-        scores = np.transpose(scores, (0, 2, 1))
-
-        boxes = np.random.random((N, M, BOX_SIZE)).astype('float32')
-        boxes[:, :, 0:2] = boxes[:, :, 0:2] * 0.5
-        boxes[:, :, 2:4] = boxes[:, :, 2:4] * 0.5 + 0.5
-
-        det_outs, lod = batched_multiclass_nms(boxes, scores, background,
-                                               score_threshold, nms_threshold,
-                                               nms_top_k, keep_top_k)
-        lod = [1] if not det_outs else lod
-        det_outs = [[-1, 0]] if not det_outs else det_outs
-        det_outs = np.array(det_outs)
-        nmsed_outs = det_outs[:, :-1].astype('float32')
-
-        self.op_type = 'multiclass_nms'
-        self.inputs = {'BBoxes': boxes, 'Scores': scores}
-        self.outputs = {'Out': (nmsed_outs, [lod])}
-        self.attrs = {
-            'background_label': 0,
-            'nms_threshold': nms_threshold,
-            'nms_top_k': nms_top_k,
-            'keep_top_k': keep_top_k,
-            'score_threshold': score_threshold,
-            'nms_eta': 1.0,
-            'normalized': True,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
-    def set_argument(self):
-        # Here set 2.0 to test the case there is no outputs.
-        # In practical use, 0.0 < score_threshold < 1.0
-        self.score_threshold = 2.0
-
-
-class TestMulticlassNMSLoDInput(OpTest):
-    def set_argument(self):
-        self.score_threshold = 0.01
-
-    def setUp(self):
-        self.set_argument()
-        M = 1200
-        C = 21
-        BOX_SIZE = 4
-        box_lod = [[1200]]
-        background = 0
-        nms_threshold = 0.3
-        nms_top_k = 400
-        keep_top_k = 200
-        score_threshold = self.score_threshold
-        normalized = False
-
-        scores = np.random.random((M, C)).astype('float32')
-
-        def softmax(x):
-            shiftx = x - np.max(x).clip(-64.)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
-        scores = np.apply_along_axis(softmax, 1, scores)
-
-        boxes = np.random.random((M, C, BOX_SIZE)).astype('float32')
-        boxes[:, :, 0] = boxes[:, :, 0] * 10
-        boxes[:, :, 1] = boxes[:, :, 1] * 10
-        boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10
-        boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10
-
-        det_outs, lod = lod_multiclass_nms(
-            boxes, scores, background, score_threshold, nms_threshold,
-            nms_top_k, keep_top_k, box_lod, normalized)
-        det_outs = np.array(det_outs).astype('float32')
-        nmsed_outs = det_outs[:, :-1].astype('float32') if len(
-            det_outs) else det_outs
-        self.op_type = 'multiclass_nms'
-        self.inputs = {
-            'BBoxes': (boxes, box_lod),
-            'Scores': (scores, box_lod),
-        }
-        self.outputs = {'Out': (nmsed_outs, [lod])}
-        self.attrs = {
-            'background_label': 0,
-            'nms_threshold': nms_threshold,
-            'nms_top_k': nms_top_k,
-            'keep_top_k': keep_top_k,
-            'score_threshold': score_threshold,
-            'nms_eta': 1.0,
-            'normalized': normalized,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestIOU(unittest.TestCase):
-    def test_iou(self):
-        box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
-        box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')
-
-        expt_output = np.array([2.0 / 16.0]).astype('float32')
-        calc_output = np.array([iou(box1, box2, True)]).astype('float32')
-        self.assertTrue(np.allclose(calc_output, expt_output))
-
-
-class TestMulticlassNMS2Op(TestMulticlassNMSOp):
-    def setUp(self):
-        self.set_argument()
-        N = 7
-        M = 1200
-        C = 21
-        BOX_SIZE = 4
-        background = 0
-        nms_threshold = 0.3
-        nms_top_k = 400
-        keep_top_k = 200
-        score_threshold = self.score_threshold
-
-        scores = np.random.random((N * M, C)).astype('float32')
-
-        def softmax(x):
-            shiftx = x - np.max(x).clip(-64.)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
-        scores = np.apply_along_axis(softmax, 1, scores)
-        scores = np.reshape(scores, (N, M, C))
-        scores = np.transpose(scores, (0, 2, 1))
-
-        boxes = np.random.random((N, M, BOX_SIZE)).astype('float32')
-        boxes[:, :, 0:2] = boxes[:, :, 0:2] * 0.5
-        boxes[:, :, 2:4] = boxes[:, :, 2:4] * 0.5 + 0.5
-
-        det_outs, lod = batched_multiclass_nms(boxes, scores, background,
-                                               score_threshold, nms_threshold,
-                                               nms_top_k, keep_top_k)
-        det_outs = np.array(det_outs)
-
-        nmsed_outs = det_outs[:, :-1].astype('float32') if len(
-            det_outs) else det_outs
-        index_outs = det_outs[:, -1:].astype('int') if len(
-            det_outs) else det_outs
-        self.op_type = 'multiclass_nms2'
-        self.inputs = {'BBoxes': boxes, 'Scores': scores}
-        self.outputs = {
-            'Out': (nmsed_outs, [lod]),
-            'Index': (index_outs, [lod])
-        }
-        self.attrs = {
-            'background_label': 0,
-            'nms_threshold': nms_threshold,
-            'nms_top_k': nms_top_k,
-            'keep_top_k': keep_top_k,
-            'score_threshold': score_threshold,
-            'nms_eta': 1.0,
-            'normalized': True,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestMulticlassNMS2OpNoOutput(TestMulticlassNMS2Op):
-    def set_argument(self):
-        # Here set 2.0 to test the case there is no outputs.
-        # In practical use, 0.0 < score_threshold < 1.0
-        self.score_threshold = 2.0
-
-
-class TestMulticlassNMS2LoDInput(TestMulticlassNMSLoDInput):
-    def setUp(self):
-        self.set_argument()
-        M = 1200
-        C = 21
-        BOX_SIZE = 4
-        box_lod = [[1200]]
-        background = 0
-        nms_threshold = 0.3
-        nms_top_k = 400
-        keep_top_k = 200
-        score_threshold = self.score_threshold
-        normalized = False
-
-        scores = np.random.random((M, C)).astype('float32')
-
-        def softmax(x):
-            shiftx = x - np.max(x).clip(-64.)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
-        scores = np.apply_along_axis(softmax, 1, scores)
-
-        boxes = np.random.random((M, C, BOX_SIZE)).astype('float32')
-        boxes[:, :, 0] = boxes[:, :, 0] * 10
-        boxes[:, :, 1] = boxes[:, :, 1] * 10
-        boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10
-        boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10
-
-        det_outs, lod = lod_multiclass_nms(
-            boxes, scores, background, score_threshold, nms_threshold,
-            nms_top_k, keep_top_k, box_lod, normalized)
-
-        det_outs = np.array(det_outs)
-        nmsed_outs = det_outs[:, :-1].astype('float32') if len(
-            det_outs) else det_outs
-        index_outs = det_outs[:, -1:].astype('int') if len(
-            det_outs) else det_outs
-        self.op_type = 'multiclass_nms2'
-        self.inputs = {
-            'BBoxes': (boxes, box_lod),
-            'Scores': (scores, box_lod),
-        }
-        self.outputs = {
-            'Out': (nmsed_outs, [lod]),
-            'Index': (index_outs, [lod])
-        }
-        self.attrs = {
-            'background_label': 0,
-            'nms_threshold': nms_threshold,
-            'nms_top_k': nms_top_k,
-            'keep_top_k': keep_top_k,
-            'score_threshold': score_threshold,
-            'nms_eta': 1.0,
-            'normalized': normalized,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestMulticlassNMS2LoDNoOutput(TestMulticlassNMS2LoDInput):
-    def set_argument(self):
-        # Here set 2.0 to test the case there is no outputs.
-        # In practical use, 0.0 < score_threshold < 1.0
-        self.score_threshold = 2.0
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multihead_attention.py b/python/paddle/fluid/tests/unittests/test_multihead_attention.py
deleted file mode 100644
index f60da862ac091ca1eefccfe2834201d1c79e2def..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_multihead_attention.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import numpy as np
-
-
-class TestMultiheadAttention(unittest.TestCase):
-    def gen_random_input(self):
-        """Generate random input data.
-        """
-        # batch_size, max_sequence_length, hidden dimension
-        self.input_shape = (3, 13, 16)
-        self.queries = np.random.random(size=self.input_shape).astype("float32")
-        self.keys = np.random.random(size=self.input_shape).astype("float32")
-
-    def set_program(self):
-        """Build the test program.
-        """
-        queries = fluid.layers.data(
-            name="queries",
-            shape=self.input_shape,
-            dtype="float32",
-            append_batch_size=False)
-        queries.stop_gradient = False
-        keys = fluid.layers.data(
-            name="keys",
-            shape=self.input_shape,
-            dtype="float32",
-            append_batch_size=False)
-        keys.stop_gradient = False
-
-        contexts = fluid.nets.scaled_dot_product_attention(
-            queries=queries,
-            keys=keys,
-            values=keys,
-            num_heads=8,
-            dropout_rate=0.)
-        out = fluid.layers.reduce_sum(contexts, dim=None)
-        fluid.backward.append_backward(loss=out)
-
-        self.fetch_list = [contexts]
-
-    def run_program(self):
-        """Run the test program.
-        """
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            self.set_inputs(place)
-            exe = fluid.Executor(place)
-
-            exe.run(fluid.default_startup_program())
-            output = exe.run(fluid.default_main_program(),
-                             feed=self.inputs,
-                             fetch_list=self.fetch_list,
-                             return_numpy=True)
-            self.op_output = output
-
-    def set_inputs(self, place):
-        """Set the randomly generated data to the test program.
-        """
-        self.inputs = {}
-        queries = fluid.Tensor()
-        queries.set(self.queries, place)
-
-        keys = fluid.Tensor()
-        keys.set(self.keys, place)
-
-        self.inputs["keys"] = keys
-        self.inputs["queries"] = queries
-
-    def test_multihead_attention(self):
-        self.gen_random_input()
-
-        self.set_program()
-        self.run_program()
-
-        #fixme(caoying) add more meaningfull unittest.
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py
deleted file mode 100644
index 1567a74808aa37e5e18bbe583cc1d8987b31cd58..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestMultiplexOp(OpTest):
-    def setUp(self):
-        self.op_type = "multiplex"
-        rows = 4
-        index = np.arange(0, rows).astype('int32')
-        np.random.shuffle(index)
-        index = np.reshape(index, (rows, 1))
-        ins1 = np.random.random((rows, 10)).astype("float32")
-        ins2 = np.random.random((rows, 10)).astype("float32")
-        ins3 = np.random.random((rows, 10)).astype("float32")
-        ins4 = np.random.random((rows, 10)).astype("float32")
-        self.inputs = {
-            'Ids': index,
-            'X': [('x1', ins1), ('x2', ins2), ('x3', ins3), ('x4', ins4)]
-        }
-        # multiplex output
-        output = np.zeros_like(ins1)
-        for i in range(0, rows):
-            k = index[i][0]
-            output[i] = self.inputs['X'][k][1][i]
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['x1', 'x2', 'x3', 'x4'], 'Out')
-
-    def test_check_grad_ignore_x1(self):
-        self.check_grad(['x2', 'x3', 'x4'], 'Out', no_grad_set=set('x1'))
-
-    def test_check_grad_ignore_x1_x2(self):
-        self.check_grad(['x3', 'x4'], 'Out', no_grad_set=set(['x1', 'x2']))
-
-    def test_check_grad_ignore_x3(self):
-        self.check_grad(['x1', 'x2', 'x4'], 'Out', no_grad_set=set('x3'))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py
deleted file mode 100644
index 0507b05e6239924d8b7ff18ad45f50533768f924..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from paddle.fluid.io import multiprocess_reader
-import unittest
-import numpy as np
-import six
-import sys
-
-
-class TestMultiprocessReaderException(unittest.TestCase):
-    def setUp(self):
-        self.use_pipe = False
-        self.raise_exception = False
-
-    def places(self):
-        if fluid.is_compiled_with_cuda():
-            return [fluid.CPUPlace(), fluid.CUDAPlace(0)]
-        else:
-            return [fluid.CPUPlace()]
-
-    def main_impl(self, place, iterable):
-        def fake_reader():
-            def __impl__():
-                for _ in range(40):
-                    if not self.raise_exception:
-                        yield list(
-                            np.random.uniform(
-                                low=-1, high=1, size=[10])),
-                    else:
-                        raise ValueError()
-
-            return __impl__
-
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            image = fluid.layers.data(name='image', dtype='float32', shape=[10])
-
-            reader = fluid.io.PyReader(
-                feed_list=[image], capacity=2, iterable=iterable)
-
-            image_p_1 = image + 1
-
-            decorated_reader = multiprocess_reader(
-                [fake_reader(), fake_reader()], use_pipe=self.use_pipe)
-
-            if isinstance(place, fluid.CUDAPlace):
-                reader.decorate_sample_generator(
-                    decorated_reader, batch_size=4, places=fluid.cuda_places())
-            else:
-                reader.decorate_sample_generator(
-                    decorated_reader, batch_size=4, places=fluid.cpu_places())
-
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-
-            if iterable:
-                for _ in range(3):
-                    num = 0
-                    for data in reader():
-                        exe.run(feed=data, fetch_list=[image_p_1])
-                        num += 1
-                    if not self.raise_exception:
-                        self.assertEquals(num, 20)
-                    else:
-                        self.assertEquals(num, 0)
-                        raise ValueError('Reader raises exception')
-            else:
-                for _ in range(3):
-                    num = 0
-                    reader.start()
-                    try:
-                        while True:
-                            exe.run(fetch_list=[image_p_1])
-                            num += 1
-                    except fluid.core.EOFException:
-                        reader.reset()
-                        if not self.raise_exception:
-                            self.assertEquals(num, 20)
-                        else:
-                            self.assertEquals(num, 0)
-                            raise ValueError('Reader raises exception')
-
-    def test_main(self):
-        for p in self.places():
-            for iterable in [False, True]:
-                try:
-                    with fluid.scope_guard(fluid.Scope()):
-                        self.main_impl(p, iterable)
-
-                    self.assertTrue(not self.raise_exception)
-                except ValueError:
-                    self.assertTrue(self.raise_exception)
-
-
-class TestCase1(TestMultiprocessReaderException):
-    def setUp(self):
-        self.use_pipe = False
-        self.raise_exception = True
-
-
-class TestCase2(TestMultiprocessReaderException):
-    def setUp(self):
-        self.use_pipe = True
-        self.raise_exception = False
-
-
-class TestCase3(TestMultiprocessReaderException):
-    def setUp(self):
-        self.use_pipe = True
-        self.raise_exception = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_name_scope.py b/python/paddle/fluid/tests/unittests/test_name_scope.py
deleted file mode 100644
index 08c802e20d2bb364ef7f116ee0042a2ad21a9b2b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_name_scope.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-
-
-class TestNameScope(unittest.TestCase):
-    def test_name_scope(self):
-        with fluid.name_scope("s1"):
-            a = fluid.layers.data(name='data', shape=[1], dtype='int32')
-            b = a + 1
-            with fluid.name_scope("s2"):
-                c = b * 1
-            with fluid.name_scope("s3"):
-                d = c / 1
-        with fluid.name_scope("s1"):
-            f = fluid.layers.pow(d, 2.0)
-        with fluid.name_scope("s4"):
-            g = f - 1
-
-        for op in fluid.default_main_program().block(0).ops:
-            if op.type == 'elementwise_add':
-                self.assertEqual(op.desc.attr("op_namescope"), '/s1/')
-            elif op.type == 'elementwise_mul':
-                self.assertEqual(op.desc.attr("op_namescope"), '/s1/s2/')
-            elif op.type == 'elementwise_div':
-                self.assertEqual(op.desc.attr("op_namescope"), '/s1/s3/')
-            elif op.type == 'elementwise_sub':
-                self.assertEqual(op.desc.attr("op_namescope"), '/s4/')
-            elif op.type == 'pow':
-                self.assertEqual(op.desc.attr("op_namescope"), '/s1_1/')
diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py
deleted file mode 100644
index 1e462d13d0755f48fd73a9eae335584858ecb17f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_nce.py
+++ /dev/null
@@ -1,223 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-
-import paddle.fluid as fluid
-import paddle.fluid.initializer as initializer
-
-from op_test import OpTest
-
-
-def nce(input, weight, bias, sample_weight, labels, num_classes,
-        num_sample_class):
-    samples = []
-    sample_labels = []
-    batch_size = input.shape[0]
-    num_true_class = labels.shape[1]
-    for i in range(batch_size):
-        w = 1 if sample_weight is None else sample_weight[i]
-        for label in labels[i]:
-            samples.append((i, label, True, w))
-            sample_labels.append(label)
-        for num in range(num_sample_class):
-            samples.append((i, num, False, w))
-            sample_labels.append(num)
-    # forward bias
-    sample_out = np.zeros(len(samples)).astype(np.float32)
-    if bias is not None:
-        for i in range(len(samples)):
-            sample_out[i] = bias[samples[i][1]]
-    # forward weight
-    for i in range(len(samples)):
-        sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]])
-
-    # forward activation
-    sample_out = 1.0 / (1.0 + np.exp(-sample_out))
-    # forward cost
-    out = np.zeros(batch_size).astype(np.float32)
-    b = 1.0 / num_classes * num_sample_class
-    for i in range(len(samples)):
-        o = sample_out[i]
-        cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b))
-        out[samples[i][0]] += cost * samples[i][3]
-    return (out[:, np.newaxis], np.array(sample_out).reshape(
-        batch_size, num_sample_class + num_true_class),
-            np.array(sample_labels).reshape(batch_size,
-                                            num_sample_class + num_true_class))
-
-
-class TestNCE(OpTest):
-    def generate_data(self, dim, batch_size, num_classes, num_true_class,
-                      num_neg_samples, is_sparse):
-        input = np.random.randn(batch_size, dim).astype(np.float32)
-        weight = np.random.randn(num_classes, dim).astype(np.float32)
-        bias = np.random.randn(num_classes).astype(np.float32)
-        sample_weight = np.random.randn(batch_size).astype(np.float32)
-        labels = np.random.randint(0, num_classes,
-                                   (batch_size, num_true_class)).astype("int64")
-        self.attrs = {
-            'num_total_classes': num_classes,
-            'num_neg_samples': num_neg_samples,
-            'custom_neg_classes': list(range(num_neg_samples)),
-            'seed': 0,
-            'sampler': 0,
-            'is_sparse': is_sparse
-        }
-        self.inputs = {
-            'Input': input,
-            'Label': labels,
-            'Weight': weight,
-            'Bias': bias,
-            'SampleWeight': sample_weight
-        }
-
-    def set_data(self):
-        self.generate_data(5, 5, 4, 1, 2, False)
-
-    def compute(self):
-        out = nce(self.inputs['Input'], self.inputs['Weight'],
-                  self.inputs['Bias'], self.inputs['SampleWeight'],
-                  self.inputs['Label'], self.attrs['num_total_classes'],
-                  self.attrs['num_neg_samples'])
-        self.outputs = {
-            'Cost': out[0],
-            'SampleLogits': out[1],
-            'SampleLabels': out[2]
-        }
-
-    def setUp(self):
-        self.op_type = 'nce'
-        self.set_data()
-        self.compute()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02)
-
-
-class TestNCECase1Tensor(TestNCE):
-    def set_data(self):
-        self.generate_data(10, 20, 10, 2, 5, False)
-
-
-class TestNCECase1SelectedRows(unittest.TestCase):
-    def setUp(self):
-        self.base_lr = 0.0001
-        self.batch_size = 8
-
-    @staticmethod
-    def get_place():
-        place = fluid.core.CPUPlace()
-        return place
-
-    @staticmethod
-    def get_train_data(batch_size):
-        batchs = []
-        for i in range(batch_size):
-            input = np.random.randn(batch_size, 10).astype(np.float32)
-            labels = np.random.randint(0, 20, (batch_size, 1))
-            batchs.append([input, labels])
-        return batchs
-
-    def get_optimizer(self):
-        # SGD optimizer
-        optimizer = fluid.optimizer.SGD(learning_rate=self.base_lr)
-        return optimizer
-
-    def train_network(self, num_total_classes, num_neg_samples, sampler,
-                      custom_dist, is_sparse):
-        input = fluid.layers.data(name="input", shape=[10], dtype="float32")
-        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-        w_param = fluid.default_main_program().global_block().create_parameter(
-            shape=[num_total_classes, 10],
-            dtype='float32',
-            name='nce_w',
-            initializer=initializer.ConstantInitializer())
-        b_param = fluid.default_main_program().global_block().create_parameter(
-            shape=[num_total_classes, 1],
-            dtype='float32',
-            name='nce_b',
-            initializer=initializer.ConstantInitializer())
-
-        cost = fluid.layers.nce(input=input,
-                                label=label,
-                                num_total_classes=num_total_classes,
-                                sampler=sampler,
-                                custom_dist=custom_dist,
-                                sample_weight=None,
-                                param_attr='nce_w',
-                                bias_attr='nce_b',
-                                seed=1,
-                                num_neg_samples=num_neg_samples,
-                                is_sparse=is_sparse)
-        avg_cost = fluid.layers.mean(cost)
-        # optimizer
-        optimizer = self.get_optimizer()
-        optimizer.minimize(avg_cost)
-
-        return [avg_cost, [input, label]]
-
-    def test_input_is_selected_rows(self):
-        place = self.get_place()
-        exe = fluid.Executor(place)
-
-        data = self.get_train_data(self.batch_size)
-        nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
-
-        rets = []
-        # for dense
-        dense_scope = fluid.core.Scope()
-        dense_startup_program = fluid.framework.Program()
-        dense_train_program = fluid.framework.Program()
-        with fluid.scope_guard(dense_scope):
-            with fluid.program_guard(dense_train_program,
-                                     dense_startup_program):
-                cost, feeds = self.train_network(20, 5, "custom_dist",
-                                                 nid_freq_arr.tolist(), False)
-                feeder = fluid.DataFeeder(feed_list=feeds, place=place)
-                exe.run(dense_startup_program)
-                loss_val = exe.run(dense_train_program,
-                                   feed=feeder.feed(data),
-                                   fetch_list=[cost.name])
-                rets.append(np.mean(loss_val))
-
-        # for sparse
-        sparse_scope = fluid.core.Scope()
-        sparse_startup_program = fluid.framework.Program()
-        sparse_train_program = fluid.framework.Program()
-        with fluid.scope_guard(sparse_scope):
-            with fluid.program_guard(sparse_train_program,
-                                     sparse_startup_program):
-                cost, feeds = self.train_network(20, 5, "custom_dist",
-                                                 nid_freq_arr.tolist(), True)
-                feeder = fluid.DataFeeder(feed_list=feeds, place=place)
-                exe.run(sparse_startup_program)
-                loss_val = exe.run(sparse_train_program,
-                                   feed=feeder.feed(data),
-                                   fetch_list=[cost.name])
-                rets.append(np.mean(loss_val))
-
-        self.assertEqual(rets[0], rets[1])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
deleted file mode 100644
index 3ec69923a116fad209558a68f941e35cf30726e8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
+++ /dev/null
@@ -1,238 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import signal
-import time
-import unittest
-from multiprocessing import Process
-
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.framework import Program, program_guard
-from dist_test_utils import *
-
-
-def nce(input, weight, bias, sample_weight, labels, num_classes,
-        num_sample_class):
-    samples = []
-    sample_labels = []
-    batch_size = input.shape[0]
-    num_true_class = labels.shape[1]
-    for i in range(batch_size):
-        w = 1 if sample_weight is None else sample_weight[i]
-        for label in labels[i]:
-            samples.append((i, label, True, w))
-            sample_labels.append(label)
-        for num in range(num_sample_class):
-            samples.append((i, num, False, w))
-            sample_labels.append(num)
-    # forward bias
-    sample_out = np.zeros(len(samples)).astype(np.float32)
-    if bias is not None:
-        for i in range(len(samples)):
-            sample_out[i] = bias[samples[i][1]]
-    # forward weight
-    for i in range(len(samples)):
-        sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]])
-
-    # forward activation
-    sample_out = 1.0 / (1.0 + np.exp(-sample_out))
-    # forward cost
-    out = np.zeros(batch_size).astype(np.float32)
-    b = 1.0 / num_classes * num_sample_class
-
-    for i in range(len(samples)):
-        o = sample_out[i]
-        cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b))
-        out[samples[i][0]] += cost * samples[i][3]
-    return (out[:, np.newaxis], np.array(sample_out).reshape(
-        batch_size, num_sample_class + num_true_class),
-            np.array(sample_labels).reshape(batch_size,
-                                            num_sample_class + num_true_class))
-
-
-def run_pserver(pserver_id, use_cuda, sync_mode):
-    remove_ps_flag(os.getpid())
-    scope = fluid.core.Scope()
-    program = Program()
-    with fluid.scope_guard(scope):
-        with program_guard(program, startup_program=Program()):
-            # create table parameter in scope
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            # create and initialize Param Variable
-            param = scope.var('table').get_tensor()
-
-            param_array = np.ones((5, 8)).astype("float32")
-            for i in range(len(param_array)):
-                param_array[i] *= param_array[i] * i + pserver_id * 10 + 1
-            param.set(param_array, place)
-
-            optimize_block = program._create_block(program.global_block().idx)
-            program.global_block().append_op(
-                type="listen_and_serv",
-                inputs={'X': []},
-                outputs={},
-                attrs={
-                    "optimize_blocks": [optimize_block],
-                    "endpoint": '127.0.0.1:0',
-                    "Fanin": 1,
-                    "sync_mode": True,
-                    "grad_to_block_id": []
-                })
-
-            exe = fluid.Executor(place)
-            exe.run(program)
-
-
-class TestListenAndServOp(unittest.TestCase):
-    def setUp(self):
-        self.ps_timeout = 5
-
-    def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
-        p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _wait_ps_ready(self, pid):
-        start_left_time = self.ps_timeout
-        sleep_time = 0.5
-        while True:
-            assert start_left_time >= 0, "wait ps ready failed"
-            time.sleep(sleep_time)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                start_left_time -= sleep_time
-
-    def _get_pserver_port(self, pid):
-        with open("/tmp/paddle.%d.port" % pid, 'r') as f:
-            port = int(f.read().strip())
-        return port
-
-    def _run_nce_op_two_pserver(self, place, port0, port1):
-        scope = fluid.core.Scope()
-        program = Program()
-        with fluid.scope_guard(scope):
-            with program_guard(program, startup_program=Program()):
-                x = scope.var('Input').get_tensor()
-                x_array = np.random.random((4, 8)).astype("float32")
-                x.set(x_array, place)
-                # create and initialize Param Variable
-                param = scope.var('Weight').get_tensor()
-                param_array = np.zeros((5, 8)).astype("float32")
-                param.set(param_array, place)
-
-                bias = scope.var('Bias').get_tensor()
-                bias_array = np.random.random((5, 1)).astype("float32")
-                bias.set(bias_array, place)
-
-                sample_w = scope.var('SampleWeight').get_tensor()
-                sample_weight = np.random.random((4, 1)).astype("float32")
-                sample_w.set(sample_weight, place)
-
-                label = scope.var('Label').get_tensor()
-                label_array = np.array([[0], [1], [4], [3]])
-                label.set(label_array, place)
-
-                cost = scope.var('Cost').get_tensor()
-                cost_w = np.zeros((4, 1)).astype("float32")
-                cost.set(cost_w, place)
-
-                sample_l = scope.var('SampleLogits').get_tensor()
-                sample_l_w = np.zeros((4, 3)).astype("float32")
-                sample_l.set(sample_l_w, place)
-
-                sample_la = scope.var('SampleLabels').get_tensor()
-                sample_la_w = np.zeros((4, 3)).astype("int")
-                sample_la.set(sample_la_w, place)
-
-                emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
-                table_names = ['table', 'table']
-                height_sections = [2, 3]
-
-                # create and run nce operator
-                nce_op = Operator(
-                    "nce",
-                    Input='Input',
-                    Weight='Weight',
-                    Label='Label',
-                    Bias='Bias',
-                    Cost='Cost',
-                    SampleLogits='SampleLogits',
-                    SampleLabels='SampleLabels',
-                    SampleWeight='SampleWeight',
-                    num_total_classes=5,
-                    num_neg_samples=2,
-                    custom_neg_classes=list(range(2)),
-                    sampler=0,
-                    seed=0,
-                    is_sparse=True,
-                    remote_prefetch=True,
-                    epmap=emaps,
-                    table_names=table_names,
-                    height_sections=height_sections)
-
-                nce_op.run(scope, place)
-
-                # get and compare result
-                o_cost = np.array(scope.var('Cost').get_tensor())
-                o_logits = np.array(scope.var('SampleLogits').get_tensor())
-                o_labels = np.array(scope.var('SampleLabels').get_tensor())
-
-                param_array = np.ones((5, 8)).astype("float32")
-                for i in range(2):
-                    param_array[i] *= param_array[i] * i + 0 * 10 + 1
-                for i in range(2, 5):
-                    param_array[i] *= param_array[i] * i + 1 * 10 + 1
-                out = nce(x_array, param_array, bias_array, sample_weight,
-                          label_array, 5, 2)
-
-                np.testing.assert_almost_equal(o_cost, out[0], decimal=6)
-                np.testing.assert_almost_equal(o_logits, out[1], decimal=6)
-                np.testing.assert_almost_equal(o_labels, out[2], decimal=6)
-
-    def test_nce_op_remote(self):
-        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
-        # run pserver on CPU in sync mode
-        p0 = self._start_pserver(0, False, True, run_pserver)
-        self._wait_ps_ready(p0.pid)
-        port0 = self._get_pserver_port(p0.pid)
-
-        p1 = self._start_pserver(1, False, True, run_pserver)
-        self._wait_ps_ready(p1.pid)
-        port1 = self._get_pserver_port(p1.pid)
-
-        places = [core.CPUPlace()]
-
-        for place in places:
-            self._run_nce_op_two_pserver(place, port0, port1)
-
-        # raise SIGTERM to pserver
-        os.kill(p0.pid, signal.SIGINT)
-        p0.join()
-        os.kill(p1.pid, signal.SIGINT)
-        p1.join()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
deleted file mode 100644
index 6f0d85ed3cee4d280bde26127f3934610877f151..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
+++ /dev/null
@@ -1,496 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-
-
-def nearest_neighbor_interp_np(X,
-                               out_h,
-                               out_w,
-                               out_size=None,
-                               actual_shape=None,
-                               align_corners=True,
-                               data_layout='NCHW'):
-    """nearest neighbor interpolation implement in shape [N, C, H, W]"""
-    if data_layout == "NHWC":
-        X = np.transpose(X, (0, 3, 1, 2))  # NHWC => NCHW
-    if out_size is not None:
-        out_h = out_size[0]
-        out_w = out_size[1]
-    if actual_shape is not None:
-        out_h = actual_shape[0]
-        out_w = actual_shape[1]
-    n, c, in_h, in_w = X.shape
-
-    ratio_h = ratio_w = 0.0
-    if (out_h > 1):
-        if (align_corners):
-            ratio_h = (in_h - 1.0) / (out_h - 1.0)
-        else:
-            ratio_h = 1.0 * in_h / out_h
-    if (out_w > 1):
-        if (align_corners):
-            ratio_w = (in_w - 1.0) / (out_w - 1.0)
-        else:
-            ratio_w = 1.0 * in_w / out_w
-
-    out = np.zeros((n, c, out_h, out_w))
-
-    if align_corners:
-        for i in range(out_h):
-            in_i = int(ratio_h * i + 0.5)
-            for j in range(out_w):
-                in_j = int(ratio_w * j + 0.5)
-                out[:, :, i, j] = X[:, :, in_i, in_j]
-    else:
-        for i in range(out_h):
-            in_i = int(ratio_h * i)
-            for j in range(out_w):
-                in_j = int(ratio_w * j)
-                out[:, :, i, j] = X[:, :, in_i, in_j]
-
-    if data_layout == "NHWC":
-        out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
-
-    return out.astype(X.dtype)
-
-
-class TestNearestInterpOp(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.data_layout = 'NCHW'
-        self.init_test_case()
-        self.op_type = "nearest_interp"
-        input_np = np.random.random(self.input_shape).astype("float32")
-
-        if self.data_layout == "NCHW":
-            in_h = self.input_shape[2]
-            in_w = self.input_shape[3]
-        else:
-            in_h = self.input_shape[1]
-            in_w = self.input_shape[2]
-
-        if self.scale > 0:
-            out_h = int(in_h * self.scale)
-            out_w = int(in_w * self.scale)
-        else:
-            out_h = self.out_h
-            out_w = self.out_w
-
-        output_np = nearest_neighbor_interp_np(
-            input_np, out_h, out_w, self.out_size, self.actual_shape,
-            self.align_corners, self.data_layout)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-        if self.actual_shape is not None:
-            self.inputs['OutSize'] = self.actual_shape
-        self.attrs = {
-            'out_h': self.out_h,
-            'out_w': self.out_w,
-            'scale': self.scale,
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-            'data_layout': self.data_layout
-        }
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 3, 4, 4]
-        self.out_h = 2
-        self.out_w = 2
-        self.scale = 0.
-        self.out_size = np.array([3, 3]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase1(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase2(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase3(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-        self.scale = 0.
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase4(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.out_size = np.array([2, 2]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase5(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.out_size = np.array([11, 11]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase6(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-        self.scale = 0.
-        self.out_size = np.array([65, 129]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpSame(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 3, 128, 64]
-        self.out_h = 128
-        self.out_w = 64
-        self.scale = 0.
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 0.
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 4, 4, 5]
-        self.out_h = 2
-        self.out_w = 2
-        self.scale = 0.
-        self.out_size = np.array([3, 8]).astype("int32")
-        self.align_corners = True
-        self.data_layout = "NHWC"
-
-
-class TestNearestInterpOpUint8(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.init_test_case()
-        self.op_type = "nearest_interp"
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
-
-        if self.scale > 0:
-            out_h = int(self.input_shape[2] * self.scale)
-            out_w = int(self.input_shape[3] * self.scale)
-        else:
-            out_h = self.out_h
-            out_w = self.out_w
-
-        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
-                                               self.out_size, self.actual_shape,
-                                               self.align_corners)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-        self.attrs = {
-            'out_h': self.out_h,
-            'out_w': self.out_w,
-            'scale': self.scale,
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners
-        }
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output_with_place(place=core.CPUPlace(), atol=1)
-
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [1, 3, 9, 6]
-        self.out_h = 10
-        self.out_w = 9
-        self.scale = 0.
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 3, 128, 64]
-        self.out_h = 120
-        self.out_w = 50
-        self.scale = 0.
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 5
-        self.out_w = 13
-        self.scale = 0.
-        self.out_size = np.array([6, 15]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestInterpWithoutCorners(TestNearestInterpOp):
-    def set_align_corners(self):
-        self.align_corners = False
-
-
-class TestNearestNeighborInterpScale1(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 7, 5]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 2.
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpScale2(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 5, 7]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 1.5
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpScale3(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 7, 5]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 1.
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestInterpOp_attr_tensor(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.init_test_case()
-        self.op_type = "nearest_interp"
-        self.shape_by_1Dtensor = False
-        self.scale_by_1Dtensor = False
-        self.attrs = {
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-        }
-
-        input_np = np.random.random(self.input_shape).astype("float32")
-        self.inputs = {'X': input_np}
-
-        if self.scale_by_1Dtensor:
-            self.inputs['Scale'] = np.array([self.scale]).astype("float32")
-        elif self.scale > 0:
-            out_h = int(self.input_shape[2] * self.scale)
-            out_w = int(self.input_shape[3] * self.scale)
-            self.attrs['scale'] = self.scale
-        else:
-            out_h = self.out_h
-            out_w = self.out_w
-
-        if self.shape_by_1Dtensor:
-            self.inputs['OutSize'] = self.out_size
-        elif self.out_size is not None:
-            size_tensor = []
-            for index, ele in enumerate(self.out_size):
-                size_tensor.append(("x" + str(index), np.ones(
-                    (1)).astype('int32') * ele))
-            self.inputs['SizeTensor'] = size_tensor
-
-        self.attrs['out_h'] = self.out_h
-        self.attrs['out_w'] = self.out_w
-        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
-                                               self.out_size, self.actual_shape,
-                                               self.align_corners)
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 3, 4, 4]
-        self.out_h = 3
-        self.out_w = 3
-        self.scale = 0.
-        self.out_size = [3, 3]
-        self.align_corners = True
-
-
-# out_size is a tensor list
-class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.out_size = [8, 12]
-        self.align_corners = True
-
-
-# out_size is a 1-D tensor
-class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 0.
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-        self.shape_by_1Dtensor = True
-
-
-# scale is a 1-D tensor
-class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 2.0
-        self.out_size = None
-        self.align_corners = True
-        self.scale_by_1Dtensor = True
-
-
-class TestNearestAPI(OpTest):
-    def test_case(self):
-        x = fluid.layers.data(name="x", shape=[3, 6, 6], dtype="float32")
-        y = fluid.layers.data(name="y", shape=[6, 6, 3], dtype="float32")
-
-        dim = fluid.layers.data(
-            name="dim", shape=[1], dtype="int32", append_batch_size=False)
-        shape_tensor = fluid.layers.data(
-            name="shape_tensor",
-            shape=[2],
-            dtype="int32",
-            append_batch_size=False)
-        actual_size = fluid.layers.data(
-            name="actual_size",
-            shape=[2],
-            dtype="int32",
-            append_batch_size=False)
-        scale_tensor = fluid.layers.data(
-            name="scale_tensor",
-            shape=[1],
-            dtype="float32",
-            append_batch_size=False)
-
-        out1 = fluid.layers.resize_nearest(
-            y, out_shape=[12, 12], data_format='NHWC')
-        out2 = fluid.layers.resize_nearest(x, out_shape=[12, dim])
-        out3 = fluid.layers.resize_nearest(x, out_shape=shape_tensor)
-        out4 = fluid.layers.resize_nearest(
-            x, out_shape=[4, 4], actual_shape=actual_size)
-        out5 = fluid.layers.resize_nearest(x, scale=scale_tensor)
-
-        x_data = np.random.random((1, 3, 6, 6)).astype("float32")
-        dim_data = np.array([12]).astype("int32")
-        shape_data = np.array([12, 12]).astype("int32")
-        actual_size_data = np.array([12, 12]).astype("int32")
-        scale_data = np.array([2.0]).astype("float32")
-
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        results = exe.run(fluid.default_main_program(),
-                          feed={
-                              "x": x_data,
-                              "y": np.transpose(x_data, (0, 2, 3, 1)),
-                              "dim": dim_data,
-                              "shape_tensor": shape_data,
-                              "actual_size": actual_size_data,
-                              "scale_tensor": scale_data
-                          },
-                          fetch_list=[out1, out2, out3, out4, out5],
-                          return_numpy=True)
-
-        expect_res = nearest_neighbor_interp_np(
-            x_data, out_h=12, out_w=12, align_corners=True)
-        self.assertTrue(
-            np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1))))
-        for i in range(len(results) - 1):
-            self.assertTrue(np.allclose(results[i + 1], expect_res))
-
-    def test_exception(self):
-        # for 4-D input, data_format can only be NCHW or NHWC
-        input = fluid.layers.data(
-            name="input", shape=[3, 6, 6], dtype="float32")
-        try:
-            out = fluid.layers.resize_nearest(
-                input, out_shape=[4, 8], data_format='NDHWC')
-        except:
-            pass
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
deleted file mode 100644
index 60dcf195daf61d76a2e6d6f764fa216270804f55..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.executor import Executor
-
-BATCH_SIZE = 20
-
-
-class TestNetWithDtype(unittest.TestCase):
-    def setUp(self):
-        self.dtype = "float64"
-        self.init_dtype()
-
-    def run_net_on_place(self, place):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            x = fluid.layers.data(name='x', shape=[13], dtype=self.dtype)
-            y = fluid.layers.data(name='y', shape=[1], dtype=self.dtype)
-            y_predict = fluid.layers.fc(input=x, size=1, act=None)
-            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-            sgd_optimizer.minimize(avg_cost)
-
-        fetch_list = [avg_cost]
-        train_reader = paddle.batch(
-            paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
-        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-        exe = fluid.Executor(place)
-        exe.run(startup)
-        for data in train_reader():
-            exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-            # the main program is runable, the datatype is fully supported
-            break
-
-    def init_dtype(self):
-        pass
-
-    def test_cpu(self):
-        place = fluid.CPUPlace()
-        self.run_net_on_place(place)
-
-    def test_gpu(self):
-        if not core.is_compiled_with_cuda():
-            return
-        place = fluid.CUDAPlace(0)
-        self.run_net_on_place(place)
-
-
-# TODO(dzhwinter): make sure the fp16 is runable
-# class TestFloat16(TestNetWithDtype):
-#     def init_dtype(self):
-#         self.dtype = "float16"
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
deleted file mode 100644
index 8bbd9443230a695a0daeafe171195e0169f65ca7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-import gradient_checker
-
-from decorator_helper import prog_scope
-
-
-class TestMulGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        prog = fluid.Program()
-        with fluid.program_guard(prog):
-            x = layers.create_parameter(dtype="float64", shape=[2, 8], name='x')
-            y = layers.create_parameter(dtype="float64", shape=[8, 4], name='y')
-            z = layers.mul(x=x, y=y)
-            gradient_checker.grad_check([x, y], z, place=place)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [7, 11]
-        eps = 0.05
-        dtype = np.float64
-
-        x = layers.data('x', shape, False, dtype)
-        x.persistable = True
-        y = layers.reduce_mean(x, dim=0)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-class TestMulDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        x_shape = [7, 11]
-        y_shape = [11, 9]
-        eps = 0.005
-        dtype = np.float64
-
-        x = layers.data('x', x_shape, False, dtype)
-        x.persistable = True
-        y = layers.data('y', y_shape, False, dtype)
-        y.persistable = True
-        out = layers.mul(x, y)
-        x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, y_shape).astype(dtype)
-
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
deleted file mode 100644
index 4f29467a3c5515eaff985e22aed4eccf16867757..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-import gradient_checker
-
-from decorator_helper import prog_scope
-
-
-class TestInstanceNormDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        prog = fluid.Program()
-        with fluid.program_guard(prog):
-            np.random.seed()
-            shape = [2, 3, 4, 5]
-            dtype = "float32"
-            eps = 0.005
-            atol = 1e-4
-            x = layers.create_parameter(dtype=dtype, shape=shape, name='x')
-            z = fluid.layers.instance_norm(input=x)
-            x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-            gradient_checker.double_grad_check(
-                [x], z, x_init=x_arr, atol=atol, place=place, eps=eps)
-
-    def test_grad(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py
deleted file mode 100644
index a424260312eab850e579b4365efd071de599bd4f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_norm_op.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def l2_norm(x, axis, epsilon):
-    x2 = x**2
-    s = np.sum(x2, axis=axis, keepdims=True)
-    r = np.sqrt(s + epsilon)
-    y = x / np.broadcast_to(r, x.shape)
-    return y, r
-
-
-class TestNormOp(OpTest):
-    def setUp(self):
-        self.op_type = "norm"
-        self.init_test_case()
-        x = np.random.random(self.shape).astype("float64")
-        y, norm = l2_norm(x, self.axis, self.epsilon)
-        self.inputs = {'X': x}
-        self.attrs = {'epsilon': self.epsilon, 'axis': self.axis}
-        self.outputs = {'Out': y, 'Norm': norm}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-    def init_test_case(self):
-        self.shape = [2, 3, 4, 4]
-        self.axis = 1
-        self.epsilon = 1e-8
-
-
-class TestNormOp2(TestNormOp):
-    def init_test_case(self):
-        self.shape = [5, 3, 9, 7]
-        self.axis = 0
-        self.epsilon = 1e-8
-
-
-class TestNormOp3(TestNormOp):
-    def init_test_case(self):
-        self.shape = [5, 3, 2, 7]
-        self.axis = -1
-        self.epsilon = 1e-8
-
-
-class TestNormOp4(TestNormOp):
-    def init_test_case(self):
-        self.shape = [128, 1024, 14, 14]
-        self.axis = 2
-        self.epsilon = 1e-8
-
-    def test_check_grad(self):
-        # since the gradient check is very slow in large shape, so skip check_grad
-        pass
-
-
-class TestNormOp5(TestNormOp):
-    def init_test_case(self):
-        self.shape = [2048, 2048]
-        self.axis = 1
-        self.epsilon = 1e-8
-
-    def test_check_grad(self):
-        # since the gradient check is very slow in large shape, so skip check_grad
-        pass
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
deleted file mode 100644
index 24fdcf8c88417244e981194e63bd77a2fdbd179d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import numpy as np
-
-
-class TestNormalization(unittest.TestCase):
-    data_desc = {"name": "input", "shape": (2, 3, 7)}
-
-    def gen_random_input(self):
-        """Generate random input data.
-        """
-        self.data = np.random.random(
-            size=self.data_desc["shape"]).astype("float32")
-
-    def set_program(self, axis, epsilon):
-        """Build the test program.
-        """
-        data = fluid.layers.data(
-            name=self.data_desc["name"],
-            shape=self.data_desc["shape"],
-            dtype="float32",
-            append_batch_size=False)
-        data.stop_gradient = False
-        l2_norm = fluid.layers.l2_normalize(x=data, axis=axis, epsilon=epsilon)
-        out = fluid.layers.reduce_sum(l2_norm, dim=None)
-
-        fluid.backward.append_backward(loss=out)
-        self.fetch_list = [l2_norm]
-
-    def run_program(self):
-        """Run the test program.
-        """
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            self.set_inputs(place)
-            exe = fluid.Executor(place)
-
-            output = exe.run(fluid.default_main_program(),
-                             feed=self.inputs,
-                             fetch_list=self.fetch_list,
-                             return_numpy=True)
-            self.op_output = output
-
-    def set_inputs(self, place):
-        """Set the randomly generated data to the test program.
-        """
-        self.inputs = {}
-        tensor = fluid.Tensor()
-        tensor.set(self.data, place)
-        self.inputs[self.data_desc["name"]] = tensor
-
-    def l2_normalize(self, data, axis, epsilon):
-        """ Compute the groundtruth.
-        """
-        output = data / np.broadcast_to(
-            np.sqrt(np.sum(np.square(data), axis=axis, keepdims=True)),
-            data.shape)
-        return output
-
-    def test_l2_normalize(self):
-        """ Test the python wrapper for l2_normalize.
-        """
-        axis = 1
-        #TODO(caoying) epsilon is not supported due to lack of a maximum_op.
-        epsilon = 1e-6
-
-        self.gen_random_input()
-
-        self.set_program(axis, epsilon)
-        self.run_program()
-
-        expect_output = self.l2_normalize(self.data, axis, epsilon)
-
-        # check output
-        self.assertTrue(np.allclose(self.op_output, expect_output, atol=0.001))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_npair_loss_op.py b/python/paddle/fluid/tests/unittests/test_npair_loss_op.py
deleted file mode 100644
index d1a015a16e46c38be8d3c8255d1d07cc6aa31572..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_npair_loss_op.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import numpy as np
-
-
-def npairloss(anchor, positive, labels, l2_reg=0.002):
-    def softmax_cross_entropy_with_logits(logits, labels):
-        logits = np.exp(logits)
-        logits = logits / np.sum(logits, axis=1).reshape(-1, 1)
-
-        return np.mean(
-            -np.sum(labels * np.log(logits), axis=1), dtype=np.float32)
-
-    batch_size = labels.shape[0]
-
-    labels = np.reshape(labels, (batch_size, 1))
-    labels = np.equal(labels, labels.transpose()).astype(float)
-    labels = labels / np.sum(labels, axis=1, keepdims=True)
-
-    l2loss = np.mean(np.sum(np.power(anchor, 2), 1)) + np.mean(
-        np.sum(np.power(positive, 2), 1))
-    l2loss = (l2loss * 0.25 * l2_reg).astype(np.float32)
-
-    similarity_matrix = np.matmul(anchor, positive.transpose())
-    celoss = np.mean(
-        softmax_cross_entropy_with_logits(similarity_matrix, labels))
-
-    return l2loss + celoss
-
-
-class TestNpairLossOp(unittest.TestCase):
-    def setUp(self):
-        self.dtype = np.float32
-
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
-
-    def test_npair_loss(self):
-        reg_lambda = 0.002
-        num_data, feat_dim, num_classes = 18, 6, 3
-
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        embeddings_anchor = np.random.rand(num_data,
-                                           feat_dim).astype(np.float32)
-        embeddings_positive = np.random.rand(num_data,
-                                             feat_dim).astype(np.float32)
-        row_labels = np.random.randint(
-            0, num_classes, size=(num_data)).astype(np.float32)
-        out_loss = npairloss(
-            embeddings_anchor,
-            embeddings_positive,
-            row_labels,
-            l2_reg=reg_lambda)
-
-        anc = fluid.layers.create_tensor(
-            dtype='float32', persistable=True, name='anc')
-        pos = fluid.layers.create_tensor(
-            dtype='float32', persistable=True, name='pos')
-        lab = fluid.layers.create_tensor(
-            dtype='float32', persistable=True, name='lab')
-        fluid.layers.assign(input=embeddings_anchor, output=anc)
-        fluid.layers.assign(input=embeddings_positive, output=pos)
-        fluid.layers.assign(input=row_labels, output=lab)
-
-        npair_loss_op = fluid.layers.npair_loss(
-            anchor=anc, positive=pos, labels=lab, l2_reg=reg_lambda)
-        out_tensor = exe.run(feed={'anc': anc,
-                                   'pos': pos,
-                                   'lab': lab},
-                             fetch_list=[npair_loss_op.name])
-
-        self.__assert_close(
-            out_tensor,
-            out_loss,
-            "inference output are different at " + str(place) + ", " +
-            str(np.dtype('float32')) + str(np.array(out_tensor)) +
-            str(out_loss),
-            atol=1e-3)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nvprof.py b/python/paddle/fluid/tests/unittests/test_nvprof.py
deleted file mode 100644
index da943d64da6cfc64d121b7373f7c067c1cff731c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_nvprof.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import os
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-
-
-class TestNVProf(unittest.TestCase):
-    def test_nvprof(self):
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        epoc = 8
-        dshape = [4, 3, 28, 28]
-        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
-        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        output_file = 'cuda_profiler.txt'
-        with profiler.cuda_profiler(output_file, 'csv') as nvprof:
-            for i in range(epoc):
-                input = np.random.random(dshape).astype('float32')
-                exe.run(fluid.default_main_program(), feed={'data': input})
-        os.remove(output_file)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
deleted file mode 100644
index 62184f771942b2f94b65ffd2f2253e1121d15f9d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-from op_test import OpTest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-from paddle.fluid.framework import Program, program_guard
-
-
-class TestOneHotOp(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot'
-        depth = 10
-        depth_np = np.array(10).astype('int32')
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
-                              depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
-        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_attr(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot'
-        depth = 10
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
-                              depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod)}
-        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_default_dtype(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot'
-        depth = 10
-        depth_np = np.array(10).astype('int32')
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
-                              depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
-        self.attrs = {}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_default_dtype_attr(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot'
-        depth = 10
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
-                              depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod)}
-        self.attrs = {'depth': depth}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_out_of_range(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot'
-        depth = 10
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
-                              depth)).astype('float32')
-
-        self.inputs = {'X': (x, x_lod)}
-        self.attrs = {'depth': depth, 'allow_out_of_range': True}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_exception(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot'
-        self.depth = 10
-        self.place = core.CPUPlace()
-        self.dimension = 12
-        self.x = core.LoDTensor()
-        x_lod = [[4, 1, 3, 3]]
-        data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))]
-        data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1])
-        self.x.set(data, self.place)
-        self.x.set_recursive_sequence_lengths(x_lod)
-
-    def test_check_output(self):
-        program = Program()
-        with program_guard(program):
-            x = fluid.layers.data(
-                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
-            block = program.current_block()
-            one_hot_out = block.create_var(
-                name="one_hot_out",
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                dtype='float32')
-            block.append_op(
-                type='one_hot',
-                inputs={'X': x},
-                attrs={'depth': self.depth},
-                outputs={'Out': one_hot_out})
-            exe = fluid.Executor(self.place)
-
-            def run():
-                exe.run(feed={'x': self.x},
-                        fetch_list=[one_hot_out],
-                        return_numpy=False)
-
-            self.assertRaises(core.EnforceNotMet, run)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
deleted file mode 100644
index dc948c42bc6d6a568f99e8c709514e4196c5a81c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
+++ /dev/null
@@ -1,208 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-from op_test import OpTest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-from paddle.fluid.framework import Program, program_guard
-
-
-class TestOneHotOp(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot_v2'
-        depth = 10
-        depth_np = np.array(10).astype('int32')
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0])])
-
-        out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
-        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_attr(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot_v2'
-        depth = 10
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-
-        out = np.zeros(shape=(np.product(x.shape[:-1]), 1,
-                              depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, 0, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod)}
-        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_default_dtype(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot_v2'
-        depth = 10
-        depth_np = np.array(10).astype('int32')
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0])])
-
-        out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
-        self.attrs = {}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_default_dtype_attr(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot_v2'
-        depth = 10
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-
-        out = np.zeros(shape=(np.product(x.shape[:-1]), 1,
-                              depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, 0, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod)}
-        self.attrs = {'depth': depth}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_out_of_range(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot_v2'
-        depth = 10
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0])])
-
-        out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32')
-
-        self.inputs = {'X': (x, x_lod)}
-        self.attrs = {'depth': depth, 'allow_out_of_range': True}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_exception(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot_v2'
-        self.depth = 10
-        self.place = core.CPUPlace()
-        self.dimension = 12
-        self.x = core.LoDTensor()
-        x_lod = [[4, 1, 3, 3]]
-        data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))]
-        data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1])
-        self.x.set(data, self.place)
-        self.x.set_recursive_sequence_lengths(x_lod)
-
-    def test_check_output(self):
-        program = Program()
-        with program_guard(program):
-            x = fluid.layers.data(
-                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
-            block = program.current_block()
-            one_hot_out = block.create_var(
-                name="one_hot_out",
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                dtype='float32')
-            block.append_op(
-                type='one_hot',
-                inputs={'X': x},
-                attrs={'depth': self.depth},
-                outputs={'Out': one_hot_out})
-            exe = fluid.Executor(self.place)
-
-            def run():
-                exe.run(feed={'x': self.x},
-                        fetch_list=[one_hot_out],
-                        return_numpy=False)
-
-            self.assertRaises(core.EnforceNotMet, run)
-
-
-class TestOneHotOpApi(unittest.TestCase):
-    def test_api(self):
-        depth = 10
-        self._run(depth)
-
-    def test_api_with_depthTensor(self):
-        depth = fluid.layers.assign(input=np.array([10], dtype=np.int32))
-        self._run(depth)
-
-    def test_api_with_dygraph(self):
-        depth = 10
-        label = np.array([np.random.randint(0, depth - 1)
-                          for i in range(6)]).reshape([6, 1])
-        with fluid.dygraph.guard():
-            one_hot_label = fluid.one_hot(
-                input=fluid.dygraph.to_variable(label), depth=depth)
-
-    def _run(self, depth):
-        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-        one_hot_label = fluid.one_hot(input=label, depth=depth)
-
-        place = fluid.CPUPlace()
-        label_data = np.array([np.random.randint(0, 10 - 1)
-                               for i in range(6)]).reshape([6, 1])
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'label': label_data, },
-                      fetch_list=[one_hot_label],
-                      return_numpy=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_op_support_gpu.py b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
deleted file mode 100644
index e203fccd03f86077c51e176456c1c313ac14a9ee..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-
-
-class TestOpSupportGPU(unittest.TestCase):
-    def test_case(self):
-        self.assertEqual(core.is_compiled_with_cuda(),
-                         core.op_support_gpu("sum"))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_operator.py b/python/paddle/fluid/tests/unittests/test_operator.py
deleted file mode 100644
index 544fca8cecd0a2b94a5aec40b9442f86036fd4d2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_operator.py
+++ /dev/null
@@ -1,220 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid.op as op
-import paddle.fluid.proto.framework_pb2 as framework_pb2
-
-
-class TestGetAllProtos(unittest.TestCase):
-    def test_all(self):
-        all_protos = op.get_all_op_protos()
-        self.assertNotEqual(0, len(all_protos))
-
-        for each in all_protos:
-            self.assertTrue(each.IsInitialized())
-
-
-class TestOpDescCreationMethod(unittest.TestCase):
-    def test_plain_input_output(self):
-        op_proto = framework_pb2.OpProto()
-        op_proto.type = "test"
-        ipt = op_proto.inputs.add()
-        ipt.name = "X"
-        ipt.comment = "not matter"
-
-        ipt = op_proto.inputs.add()
-        ipt.name = "Y"
-        ipt.comment = "not matter"
-
-        opt = op_proto.outputs.add()
-        opt.name = "Z"
-        opt.comment = "not matter"
-
-        op_proto.comment = "not matter"
-
-        self.assertTrue(op_proto.IsInitialized())
-
-        method = op.OpDescCreationMethod(op_proto)
-        output = method(X="a", Y="b", Z="c")
-        expected = framework_pb2.OpDesc()
-        expected.type = "test"
-        ipt_0 = expected.inputs.add()
-        ipt_0.parameter = "X"
-        ipt_0.arguments.extend(["a"])
-        ipt_1 = expected.inputs.add()
-        ipt_1.parameter = 'Y'
-        ipt_1.arguments.extend(['b'])
-        opt = expected.outputs.add()
-        opt.parameter = "Z"
-        opt.arguments.extend(["c"])
-
-        self.assertEqual(expected, output)
-
-    def test_multiple_input_plain_output(self):
-        op_proto = framework_pb2.OpProto()
-        op_proto.type = "fc"
-        ipt = op_proto.inputs.add()
-        ipt.name = "X"
-        ipt.comment = ""
-        ipt.duplicable = True
-
-        ipt = op_proto.inputs.add()
-        ipt.name = "W"
-        ipt.comment = ""
-        ipt.duplicable = True
-
-        ipt = op_proto.inputs.add()
-        ipt.name = "b"
-        ipt.comment = ""
-
-        out = op_proto.outputs.add()
-        out.name = "Y"
-        out.comment = ""
-
-        op_proto.comment = ""
-        self.assertTrue(op_proto.IsInitialized())
-        method = op.OpDescCreationMethod(op_proto)
-
-        generated1 = method(X="x", W="w", b="b", Y="y")
-        expected1 = framework_pb2.OpDesc()
-        tmp = expected1.inputs.add()
-        tmp.parameter = "X"
-        tmp.arguments.extend(['x'])
-
-        tmp = expected1.inputs.add()
-        tmp.parameter = 'W'
-        tmp.arguments.extend(['w'])
-
-        tmp = expected1.inputs.add()
-        tmp.parameter = 'b'
-        tmp.arguments.extend(['b'])
-
-        tmp = expected1.outputs.add()
-        tmp.parameter = 'Y'
-        tmp.arguments.extend(['y'])
-        expected1.type = 'fc'
-        self.assertEqual(expected1, generated1)
-
-        generated2 = method(
-            X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y')
-        expected2 = framework_pb2.OpDesc()
-
-        tmp = expected2.inputs.add()
-        tmp.parameter = "X"
-        tmp.arguments.extend(['x1', 'x2', 'x3'])
-
-        tmp = expected2.inputs.add()
-        tmp.parameter = 'W'
-        tmp.arguments.extend(['w1', 'w2', 'w3'])
-
-        tmp = expected2.inputs.add()
-        tmp.parameter = 'b'
-        tmp.arguments.extend(['b'])
-
-        tmp = expected2.outputs.add()
-        tmp.parameter = 'Y'
-        tmp.arguments.extend(['y'])
-
-        expected2.type = 'fc'
-        self.assertEqual(expected2, generated2)
-
-    def test_attrs(self):
-        op_proto = framework_pb2.OpProto()
-        op_proto.type = "test"
-        ipt = op_proto.inputs.add()
-        ipt.name = 'X'
-        ipt.comment = ""
-
-        def __add_attr__(name, type):
-            attr = op_proto.attrs.add()
-            attr.name = name
-            attr.comment = ""
-            attr.type = type
-
-        __add_attr__("int_attr", framework_pb2.INT)
-        __add_attr__("float_attr", framework_pb2.FLOAT)
-        __add_attr__("string_attr", framework_pb2.STRING)
-        __add_attr__("ints_attr", framework_pb2.INTS)
-        __add_attr__("floats_attr", framework_pb2.FLOATS)
-        __add_attr__("strings_attr", framework_pb2.STRINGS)
-
-        op_proto.comment = ""
-        self.assertTrue(op_proto.IsInitialized())
-
-        method = op.OpDescCreationMethod(op_proto)
-
-        generated = method(
-            X="a",
-            int_attr=10,
-            float_attr=3.2,
-            string_attr="test_str",
-            ints_attr=[0, 1, 2, 3, 4],
-            floats_attr=[0.2, 3.2, 4.5],
-            strings_attr=["a", "b", "c"])
-
-        expected = framework_pb2.OpDesc()
-        expected.type = "test"
-
-        ipt = expected.inputs.add()
-        ipt.parameter = "X"
-        ipt.arguments.extend(['a'])
-
-        attr = expected.attrs.add()
-        attr.name = "int_attr"
-        attr.type = framework_pb2.INT
-        attr.i = 10
-
-        attr = expected.attrs.add()
-        attr.name = "float_attr"
-        attr.type = framework_pb2.FLOAT
-        attr.f = 3.2
-
-        attr = expected.attrs.add()
-        attr.name = "string_attr"
-        attr.type = framework_pb2.STRING
-        attr.s = "test_str"
-
-        attr = expected.attrs.add()
-        attr.name = "ints_attr"
-        attr.type = framework_pb2.INTS
-        attr.ints.extend([0, 1, 2, 3, 4])
-
-        attr = expected.attrs.add()
-        attr.name = 'floats_attr'
-        attr.type = framework_pb2.FLOATS
-        attr.floats.extend([0.2, 3.2, 4.5])
-
-        attr = expected.attrs.add()
-        attr.name = 'strings_attr'
-        attr.type = framework_pb2.STRINGS
-        attr.strings.extend(['a', 'b', 'c'])
-
-        self.assertEqual(expected, generated)
-
-
-class TestOpCreations(unittest.TestCase):
-    def test_all(self):
-        add_op = op.Operator("sum", X=["a", "b"], Out="z")
-        self.assertIsNotNone(add_op)
-        # Invoke C++ DebugString()
-        self.assertEqual('Op(sum), inputs:{X[a, b]}, outputs:{Out[z]}.',
-                         str(add_op))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
deleted file mode 100644
index 5932112c3c4d186e46705d6765f09dd158e81e36..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid.core as core
-import paddle.compat as cpt
-
-from paddle.fluid.framework import Program, default_startup_program
-
-main_program = default_startup_program()
-
-
-class TestOperator(unittest.TestCase):
-    def test_error_type(self):
-        block = main_program._create_block()
-        try:
-            block.append_op()
-            self.assertFail()
-        except ValueError as v_err:
-            self.assertEqual(
-                cpt.get_exception_message(v_err),
-                "`type` to initialized an Operator can not be None.")
-        try:
-            block.append_op(type="no_such_op")
-            self.assertFail()
-        except ValueError as a_err:
-            self.assertEqual(
-                cpt.get_exception_message(a_err),
-                "Operator \"no_such_op\" has not been registered.")
-
-    def test_op_desc_creation(self):
-        program = Program()
-        block = program.current_block()
-        mul_x = block.create_var(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mul_op = block.append_op(
-            type="mul",
-            inputs={"X": [mul_x],
-                    "Y": mul_y},
-            outputs={"Out": [mul_out]},
-            attrs={"x_num_col_dims": 1})
-
-        self.assertNotEqual(str(mul_op), "")
-        self.assertEqual(mul_op.type, "mul")
-        self.assertEqual(mul_op.input_names, ["X", "Y"])
-        self.assertEqual(mul_op.input("X"), ["mul.x"])
-        self.assertEqual(mul_op.input("Y"), ["mul.y"])
-        self.assertEqual(mul_op.output_names, ["Out"])
-        self.assertEqual(mul_op.output("Out"), ["mul.out"])
-        self.assertEqual(
-            set(mul_op.attr_names),
-            set([
-                "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var",
-                "use_mkldnn", "scale_x", "scale_y", "scale_out",
-                "force_fp32_output", "op_namescope", "op_callstack"
-            ]))
-        self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
-        self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
-        self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
-        self.assertEqual(mul_op.has_attr("y_num_col_dims"), True)
-        self.assertEqual(mul_op.attr_type("y_num_col_dims"), core.AttrType.INT)
-        self.assertEqual(mul_op.attr("y_num_col_dims"), 1)
-        self.assertEqual(mul_op.idx, 0)
-        self.assertEqual(mul_out.op, mul_op)
-
-    def test_mult_input(self):
-        program = Program()
-        block = program.current_block()
-        sum_x1 = block.create_var(
-            dtype="int", shape=[3, 4], lod_level=0, name="sum.x1")
-        sum_x2 = block.create_var(
-            dtype="int", shape=[3, 4], lod_level=0, name="sum.x2")
-        sum_x3 = block.create_var(
-            dtype="int", shape=[3, 4], lod_level=0, name="sum.x3")
-        sum_out = block.create_var(
-            dtype="int", shape=[3, 4], lod_level=0, name="sum.out")
-        sum_op = block.append_op(
-            type="sum",
-            inputs={"X": [sum_x1, sum_x2, sum_x3]},
-            outputs={"Out": sum_out})
-        self.assertEqual(sum_op.type, "sum")
-        self.assertEqual(sum_op.input_names, ["X"])
-        self.assertEqual(sum_op.input("X"), ["sum.x1", "sum.x2", "sum.x3"])
-        self.assertEqual(sum_op.output_names, ["Out"])
-        self.assertEqual(sum_op.output("Out"), ["sum.out"])
-        self.assertEqual(sum_op.idx, 0)
-        self.assertEqual(sum_out.op, sum_op)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
deleted file mode 100644
index 1c3fd17fd284852ba7aeedaa87ca07e74fdd23e5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ /dev/null
@@ -1,766 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid.framework as framework
-import paddle.fluid.optimizer as optimizer
-import paddle.compat as cpt
-from paddle.fluid.backward import append_backward
-
-
-class TestOptimizer(unittest.TestCase):
-    def test_sgd_optimizer(self):
-        def check_sgd_optimizer(optimizer_attr):
-            init_program = framework.Program()
-            program = framework.Program()
-            block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                optimize_attr=optimizer_attr)
-            mul_y = block.create_var(
-                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-            mul_out = block.create_var(
-                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-            mean_out = block.create_var(
-                dtype="float32", shape=[1], lod_level=0, name="mean.out")
-            block.append_op(
-                type="mul",
-                inputs={"X": mul_x,
-                        "Y": mul_y},
-                outputs={"Out": mul_out},
-                attrs={"x_num_col_dims": 1})
-            block.append_op(
-                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
-            opts, _ = sgd_optimizer.minimize(mean_out, init_program)
-            return opts
-
-        opts = check_sgd_optimizer({'learning_rate': 1.1})
-        self.assertEqual(len(opts), 2)
-        self.assertEqual([op.type for op in opts], ["scale", "sgd"])
-
-        opts = check_sgd_optimizer({'learning_rate': 1.0})
-        self.assertEqual(len(opts), 1)
-        self.assertEqual([op.type for op in opts], ["sgd"])
-
-
-class TestOptimizerBackwardApplygrad(unittest.TestCase):
-    def test_sgd_optimizer(self):
-        def check_sgd_optimizer(optimizer_attr):
-            init_program = framework.Program()
-            program = framework.Program()
-            block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                optimize_attr=optimizer_attr)
-            mul_y = block.create_var(
-                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-            mul_out = block.create_var(
-                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-            mean_out = block.create_var(
-                dtype="float32", shape=[1], lod_level=0, name="mean.out")
-            block.append_op(
-                type="mul",
-                inputs={"X": mul_x,
-                        "Y": mul_y},
-                outputs={"Out": mul_out},
-                attrs={"x_num_col_dims": 1})
-            block.append_op(
-                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
-            with framework.program_guard(program, init_program):
-                p_g = sgd_optimizer.backward(mean_out)
-                opts = sgd_optimizer.apply_gradients(p_g)
-            return opts
-
-        opts = check_sgd_optimizer({'learning_rate': 1.1})
-        self.assertEqual(len(opts), 2)
-        self.assertEqual([op.type for op in opts], ["scale", "sgd"])
-
-        opts = check_sgd_optimizer({'learning_rate': 1.0})
-        self.assertEqual(len(opts), 1)
-        self.assertEqual([op.type for op in opts], ["sgd"])
-
-
-class TestMomentumOptimizer(unittest.TestCase):
-    class MockMomentum(optimizer.MomentumOptimizer):
-        def get_accumulators(self):
-            return self._accumulators
-
-        def get_velocity_str(self):
-            return self._velocity_acc_str
-
-    def test_vanilla_momentum_optimizer(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        learning_rate = 0.01
-        momentum_optimizer = self.MockMomentum(
-            learning_rate=learning_rate, momentum=0.2)
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward(mean_out)
-        self.assertEqual(len(params_grads), 1)
-        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        with framework.program_guard(program, init_program):
-            opts = momentum_optimizer.apply_gradients(params_grads)
-        self.assertEqual(len(opts), 2)
-        sgd_op = opts[-1]
-        self.assertEqual([op.type for op in opts], ["scale", "momentum"])
-        self.assertFalse(sgd_op.attr('use_nesterov'))
-
-        # Check accumulators
-        accumulators = momentum_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 1)
-        self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators)
-        velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
-        self.assertEqual(len(velocity_acc), 1)
-        self.assertTrue(mul_x.name in velocity_acc)
-
-        # Check init_program
-        init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 2)
-        self.assertEqual(init_ops[0].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
-        self.assertEqual(init_ops[1].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
-
-    def test_nesterov_momentum_optimizer(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        learning_rate = 0.01
-        momentum_optimizer = self.MockMomentum(
-            learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
-        params_grads = append_backward(mean_out)
-        self.assertEqual(len(params_grads), 1)
-        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        with framework.program_guard(program, init_program):
-            opts = momentum_optimizer.apply_gradients(params_grads)
-        self.assertEqual(len(opts), 2)
-        sgd_op = opts[-1]
-        self.assertEqual([op.type for op in opts], ["scale", "momentum"])
-        self.assertTrue(sgd_op.attr('use_nesterov'))
-
-        # Check accumulators
-        accumulators = momentum_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 1)
-        self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators)
-        velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
-        self.assertEqual(len(velocity_acc), 1)
-        self.assertTrue(mul_x.name in velocity_acc)
-
-        # Check init_program
-        init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 2)
-        self.assertEqual(init_ops[0].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
-        self.assertEqual(init_ops[1].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
-
-
-class TestAdagradOptimizer(unittest.TestCase):
-    class MockAdagrad(optimizer.AdagradOptimizer):
-        def get_accumulators(self):
-            return self._accumulators
-
-        def get_moment_str(self):
-            return self._moment_acc_str
-
-    def test_adagrad_optimizer(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        learning_rate = 0.01
-        adagrad_optimizer = self.MockAdagrad(
-            learning_rate=learning_rate, epsilon=1.0e-6)
-        params_grads = append_backward(mean_out)
-        self.assertEqual(len(params_grads), 1)
-        self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        with framework.program_guard(program, init_program):
-            opts = adagrad_optimizer.apply_gradients(params_grads)
-        self.assertEqual(len(opts), 2)
-        self.assertEqual([op.type for op in opts], ["scale", "adagrad"])
-
-        # Check accumulators
-        accumulators = adagrad_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 1)
-        self.assertTrue(adagrad_optimizer.get_moment_str() in accumulators)
-        moment_acc = accumulators[adagrad_optimizer.get_moment_str()]
-        self.assertEqual(len(moment_acc), 1)
-        self.assertTrue(mul_x.name in moment_acc)
-
-        # Check init_program
-        init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 3)
-        self.assertEqual(init_ops[0].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
-        self.assertEqual(init_ops[1].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
-
-
-class TestAdamOptimizer(unittest.TestCase):
-    class MockAdam(optimizer.AdamOptimizer):
-        def get_accumulators(self):
-            return self._accumulators
-
-        def get_moment1_str(self):
-            return self._moment1_acc_str
-
-        def get_moment2_str(self):
-            return self._moment2_acc_str
-
-    def test_adam_optimizer(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        learning_rate = 0.01
-        adam_optimizer = self.MockAdam(
-            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward(mean_out)
-        self.assertEqual(len(params_grads), 1)
-        self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        with framework.program_guard(program, init_program):
-            opts = adam_optimizer.apply_gradients(params_grads)
-        self.assertEqual(len(opts), 4)
-        self.assertEqual([op.type for op in opts],
-                         ["scale", "adam", "scale", "scale"])
-
-        # Check accumulators
-        accumulators = adam_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 4)
-        self.assertTrue(adam_optimizer.get_moment1_str() in accumulators)
-        self.assertTrue(adam_optimizer.get_moment2_str() in accumulators)
-        moment1_acc = accumulators[adam_optimizer.get_moment1_str()]
-        moment2_acc = accumulators[adam_optimizer.get_moment2_str()]
-        self.assertEqual(len(moment1_acc), 1)
-        self.assertEqual(len(moment2_acc), 1)
-        self.assertTrue(mul_x.name in moment1_acc)
-        self.assertTrue(mul_x.name in moment2_acc)
-
-        # Check init_program
-        init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 5)
-        self.assertEqual(init_ops[0].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
-
-
-class TestAdamaxOptimizer(unittest.TestCase):
-    class MockAdamax(optimizer.AdamaxOptimizer):
-        def get_accumulators(self):
-            return self._accumulators
-
-        def get_moment_str(self):
-            return self._moment_acc_str
-
-        def get_inf_norm_str(self):
-            return self._inf_norm_acc_str
-
-    def test_adamax_optimizer(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        learning_rate = 0.01
-        adamax_optimizer = self.MockAdamax(
-            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward(mean_out)
-        self.assertEqual(len(params_grads), 1)
-        self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        with framework.program_guard(program, init_program):
-            opts = adamax_optimizer.apply_gradients(params_grads)
-        self.assertEqual(len(opts), 3)
-        self.assertEqual([op.type for op in opts], ["scale", "adamax", "scale"])
-
-        # Check accumulators
-        accumulators = adamax_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 3)
-        self.assertTrue(adamax_optimizer.get_moment_str() in accumulators)
-        self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators)
-        moment_acc = accumulators[adamax_optimizer.get_moment_str()]
-        inf_norm_acc = accumulators[adamax_optimizer.get_inf_norm_str()]
-        self.assertEqual(len(moment_acc), 1)
-        self.assertEqual(len(inf_norm_acc), 1)
-        self.assertTrue(mul_x.name in moment_acc)
-        self.assertTrue(mul_x.name in inf_norm_acc)
-
-        # Check init_program
-        init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 4)
-        self.assertEqual(init_ops[0].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
-
-
-class TestDpsgdOptimizer(unittest.TestCase):
-    def test_dpsgd_optimizer(self):
-        def check_dpsgd_optimizer(optimizer_attr):
-            init_program = framework.Program()
-            program = framework.Program()
-            block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                optimize_attr=optimizer_attr)
-            mul_y = block.create_var(
-                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-            mul_out = block.create_var(
-                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-            block.append_op(
-                type="mul",
-                inputs={"X": mul_x,
-                        "Y": mul_y},
-                outputs={"Out": mul_out},
-                attrs={"x_num_col_dims": 1})
-            mean_out = block.create_var(
-                dtype="float32", shape=[1], lod_level=0, name="mean.out")
-            block.append_op(
-                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-            dpsgd_optimizer = optimizer.DpsgdOptimizer(
-                learning_rate=0.01, clip=100.0, batch_size=16.0, sigma=0.0)
-            opts, _ = dpsgd_optimizer.minimize(mean_out, init_program)
-            return opts
-
-        opts = check_dpsgd_optimizer({
-            'learning_rate': 1.1,
-            'clip': 100.0,
-            'batch_size': 16.0,
-            'sigma': 4.0
-        })
-        self.assertEqual(len(opts), 2)
-        self.assertEqual([op.type for op in opts], ["scale", "dpsgd"])
-
-
-class TestDecayedAdagradOptimizer(unittest.TestCase):
-    class MockDecayedAdagrad(optimizer.DecayedAdagradOptimizer):
-        def get_accumulators(self):
-            return self._accumulators
-
-        def get_moment_str(self):
-            return self._moment_acc_str
-
-    def test_decayed_adagrad_optimizer(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        learning_rate = 0.01
-        decayed_adagrad_optimizer = self.MockDecayedAdagrad(
-            learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
-        params_grads = append_backward(mean_out)
-        self.assertEqual(len(params_grads), 1)
-        self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-        with framework.program_guard(program, init_program):
-            opts = decayed_adagrad_optimizer.apply_gradients(params_grads)
-        self.assertEqual(len(opts), 2)
-        self.assertEqual([op.type for op in opts], ["scale", "decayed_adagrad"])
-
-        # Check accumulators
-        accumulators = decayed_adagrad_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 1)
-        self.assertTrue(
-            decayed_adagrad_optimizer.get_moment_str() in accumulators)
-        moment_acc = accumulators[decayed_adagrad_optimizer.get_moment_str()]
-        self.assertEqual(len(moment_acc), 1)
-        self.assertTrue(mul_x.name in moment_acc)
-
-        # Check init_program
-        init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 2)
-        self.assertEqual(init_ops[0].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
-        self.assertEqual(init_ops[1].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
-
-
-class TestFtrlOptimizer(unittest.TestCase):
-    class MockFtrl(optimizer.FtrlOptimizer):
-        def get_accumulators(self):
-            return self._accumulators
-
-        def get_squared_str(self):
-            return self._squared_acc_str
-
-        def get_linear_str(self):
-            return self._linear_acc_str
-
-    def test_ftrl_optimizer(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        learning_rate = 0.01
-        ftrl_optimizer = self.MockFtrl(
-            learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5)
-        params_grads = append_backward(mean_out)
-        self.assertEqual(len(params_grads), 1)
-        self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-        with framework.program_guard(program, init_program):
-            opts = ftrl_optimizer.apply_gradients(params_grads)
-        self.assertEqual(len(opts), 2)
-        self.assertEqual([op.type for op in opts], ["scale", "ftrl"])
-
-        # Check accumulators
-        accumulators = ftrl_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 2)
-        self.assertTrue(ftrl_optimizer.get_squared_str() in accumulators)
-        self.assertTrue(ftrl_optimizer.get_linear_str() in accumulators)
-        squared_acc = accumulators[ftrl_optimizer.get_squared_str()]
-        linear_acc = accumulators[ftrl_optimizer.get_linear_str()]
-        self.assertEqual(len(squared_acc), 1)
-        self.assertEqual(len(linear_acc), 1)
-        self.assertTrue(mul_x.name in squared_acc)
-        self.assertTrue(mul_x.name in linear_acc)
-
-        # Check init_program
-        init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 3)
-        self.assertEqual(init_ops[0].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
-
-
-class TestLookaheadOptimizer(unittest.TestCase):
-    def test_lookahead_optimizer(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        init_block = init_program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        init_mul_x = init_block.create_parameter(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-
-        sgd = optimizer.SGD(learning_rate=0.01)
-        lookahead = optimizer.LookaheadOptimizer(sgd, alpha=0.5, k=5)
-        with framework.program_guard(program, init_program):
-            opts, _ = lookahead.minimize(mean_out)
-        self.assertEqual(len(opts), 2)
-        self.assertEqual([op.type for op in opts], ["scale", "sgd"])
-
-
-class TestRecomputeOptimizer(unittest.TestCase):
-    def net(self):
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        b1 = block.create_parameter(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b1")
-        b1_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b1_out")
-        b2 = block.create_parameter(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b2")
-        b2_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b2_out")
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        block.append_op(
-            type="elementwise_add",
-            inputs={"X": mul_out,
-                    "Y": b1},
-            outputs={"Out": b1_out})
-        block.append_op(
-            type="elementwise_add",
-            inputs={"X": b1_out,
-                    "Y": b2},
-            outputs={"Out": b2_out})
-        block.append_op(
-            type="mean", inputs={"X": b2_out}, outputs={"Out": mean_out})
-
-        return mul_out, b1_out, b2_out, mean_out
-
-    def test_no_checkpoint(self):
-        mul_out, b1_out, b2_out, mean_out = self.net()
-        self.assertEqual(len(mean_out.block.ops), 4)
-        self.assertEqual([op.type for op in mean_out.block.ops],
-                         ["mul", "elementwise_add", "elementwise_add", "mean"])
-        sgd_optimizer = optimizer.SGD(learning_rate=1.0)
-        recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
-        recompute_optimizer._set_checkpoints([])
-        opts, params_grads = recompute_optimizer.minimize(mean_out)
-
-        self.assertEqual(len(mean_out.block.ops), 12)
-        self.assertEqual([op.type for op in mean_out.block.ops], [
-            "mul", "elementwise_add", "elementwise_add", "mean",
-            "fill_constant", "mean_grad", "elementwise_add_grad",
-            "elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd"
-        ])
-
-    def test_one_checkpoint(self):
-        mul_out, b1_out, b2_out, mean_out = self.net()
-        self.assertEqual(len(mean_out.block.ops), 4)
-        self.assertEqual([op.type for op in mean_out.block.ops],
-                         ["mul", "elementwise_add", "elementwise_add", "mean"])
-        sgd_optimizer = optimizer.SGD(learning_rate=1.0)
-        recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
-        recompute_optimizer._set_checkpoints([b1_out])
-        opts, params_grads = recompute_optimizer.minimize(mean_out)
-
-        self.assertEqual(len(mean_out.block.ops), 13)
-        self.assertEqual([op.type for op in mean_out.block.ops], [
-            "mul", "elementwise_add", "elementwise_add", "mean",
-            "fill_constant", "mean_grad", "elementwise_add_grad", "mul",
-            "elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd"
-        ])
-
-    def test_multi_checkpoint(self):
-        mul_out, b1_out, b2_out, mean_out = self.net()
-        self.assertEqual(len(mean_out.block.ops), 4)
-        self.assertEqual([op.type for op in mean_out.block.ops],
-                         ["mul", "elementwise_add", "elementwise_add", "mean"])
-        sgd_optimizer = optimizer.SGD(learning_rate=1.0)
-        recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
-        recompute_optimizer._set_checkpoints([mul_out, b2_out])
-        opts, params_grads = recompute_optimizer.minimize(mean_out)
-
-        self.assertEqual(len(mean_out.block.ops), 13)
-        self.assertEqual([op.type for op in mean_out.block.ops], [
-            "mul", "elementwise_add", "elementwise_add", "mean",
-            "fill_constant", "mean_grad", "elementwise_add",
-            "elementwise_add_grad", "elementwise_add_grad", "mul_grad", "sgd",
-            "sgd", "sgd"
-        ])
-
-    def test_adjacent_checkpoint(self):
-        mul_out, b1_out, b2_out, mean_out = self.net()
-        self.assertEqual(len(mean_out.block.ops), 4)
-        self.assertEqual([op.type for op in mean_out.block.ops],
-                         ["mul", "elementwise_add", "elementwise_add", "mean"])
-        sgd_optimizer = optimizer.SGD(learning_rate=1.0)
-        recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
-        recompute_optimizer._set_checkpoints([mul_out, b1_out])
-        opts, params_grads = recompute_optimizer.minimize(mean_out)
-
-        self.assertEqual(len(mean_out.block.ops), 12)
-        self.assertEqual([op.type for op in mean_out.block.ops], [
-            "mul", "elementwise_add", "elementwise_add", "mean",
-            "fill_constant", "mean_grad", "elementwise_add_grad",
-            "elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd"
-        ])
-
-    def test_apply_gradients(self):
-        mul_out, b1_out, b2_out, mean_out = self.net()
-        sgd_optimizer = optimizer.SGD(learning_rate=1.0)
-        recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
-        recompute_optimizer._set_checkpoints([b1_out])
-        # apply backward
-        params_grads = recompute_optimizer.backward(
-            mean_out,
-            startup_program=None,
-            parameter_list=None,
-            no_grad_set=None,
-            checkpoints=[b1_out])
-
-        # apply gradient
-        program = mean_out.block.program
-        with framework.program_guard(program, None):
-            optimize_ops = recompute_optimizer.apply_gradients(params_grads)
-
-        self.assertEqual(len(mean_out.block.ops), 13)
-        self.assertEqual([op.type for op in mean_out.block.ops], [
-            "mul", "elementwise_add", "elementwise_add", "mean",
-            "fill_constant", "mean_grad", "elementwise_add_grad", "mul",
-            "elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd"
-        ])
-
-    def test_load(self):
-        mul_out, b1_out, b2_out, mean_out = self.net()
-        sgd_optimizer = optimizer.SGD(learning_rate=1.0)
-        recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
-        recompute_optimizer._set_checkpoints([b1_out])
-        try:
-            stat_dict = {}
-            recompute_optimizer.load(stat_dict)
-        except NotImplementedError as e:
-            self.assertEqual(
-                "load function is not supported by Recompute Optimizer for now",
-                cpt.get_exception_message(e))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pad2d_op.py b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
deleted file mode 100644
index 5c4a6ca59e53d0edafda87eae19516a80ec32c40..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_pad2d_op.py
+++ /dev/null
@@ -1,127 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestPad2dOp(OpTest):
-    def setUp(self):
-        self.pad_value = 0.0
-        self.variable_paddings = False
-        self.initTestCase()
-        self.op_type = "pad2d"
-        self.inputs = {'X': np.random.random(self.shape).astype("float32"), }
-        self.attrs = {}
-        if self.variable_paddings:
-            self.attrs['paddings'] = []
-            self.inputs['Paddings'] = np.array(self.paddings).flatten().astype(
-                "int32")
-        else:
-            self.attrs['paddings'] = np.array(self.paddings).flatten()
-        self.attrs['pad_value'] = self.pad_value
-        self.attrs['mode'] = self.mode
-        self.attrs['data_format'] = self.data_format
-        if self.data_format == "NCHW":
-            paddings = [(0, 0), (0, 0), (self.paddings[0], self.paddings[1]),
-                        (self.paddings[2], self.paddings[3])]
-        else:
-            paddings = [(0, 0), (self.paddings[0], self.paddings[1]),
-                        (self.paddings[2], self.paddings[3]), (0, 0)]
-        if self.mode == "constant":
-            out = np.pad(self.inputs['X'],
-                         paddings,
-                         mode=self.mode,
-                         constant_values=self.pad_value)
-        else:
-            out = np.pad(self.inputs['X'], paddings, mode=self.mode)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.006)
-
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 4)
-        self.paddings = [0, 1, 2, 3]
-        self.mode = "constant"
-        self.data_format = "NCHW"
-        self.pad_value = 0.0
-
-
-class TestCase1(TestPad2dOp):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 4)
-        self.paddings = [0, 1, 2, 3]
-        self.mode = "reflect"
-        self.data_format = "NCHW"
-
-
-class TestCase2(TestPad2dOp):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 4)
-        self.paddings = [0, 1, 2, 3]
-        self.mode = "edge"
-        self.data_format = "NCHW"
-
-
-class TestCase3(TestPad2dOp):
-    def initTestCase(self):
-        self.shape = (2, 4, 4, 2)
-        self.paddings = [0, 1, 2, 3]
-        self.mode = "reflect"
-        self.data_format = "NHWC"
-
-
-class TestCase4(TestPad2dOp):
-    def initTestCase(self):
-        self.shape = (2, 4, 4, 2)
-        self.paddings = [0, 1, 2, 3]
-        self.mode = "edge"
-        self.data_format = "NHWC"
-
-
-class TestCase5(TestPad2dOp):
-    def initTestCase(self):
-        self.shape = (2, 4, 4, 2)
-        self.paddings = [0, 1, 2, 3]
-        self.mode = "constant"
-        self.pad_value = 1.2
-        self.data_format = "NHWC"
-
-
-class TestCase6(TestPad2dOp):
-    def initTestCase(self):
-        self.shape = (2, 4, 4, 2)
-        self.paddings = [0, 1, 2, 3]
-        self.mode = "constant"
-        self.pad_value = 1.2
-        self.data_format = "NHWC"
-        self.variable_paddings = True
-
-
-class TestCase7(TestPad2dOp):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 4)
-        self.paddings = [0, 1, 2, 3]
-        self.mode = "reflect"
-        self.data_format = "NCHW"
-        self.variable_paddings = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pad_constant_like.py b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py
deleted file mode 100644
index 6b733fd8fa023f07013909502dbbd5371297216e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_pad_constant_like.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestPadOp(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = "pad_constant_like"
-        self.inputs = {
-            'X': np.random.random(self.x_shape).astype("float32"),
-            'Y': np.random.random(self.y_shape).astype("float32")
-        }
-        self.attrs = {}
-        self.attrs['pad_value'] = self.pad_value
-        self.outputs = {
-            'Out': np.pad(self.inputs['Y'],
-                          self.paddings,
-                          mode='constant',
-                          constant_values=self.pad_value)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Y'], 'Out', max_relative_error=0.006)
-
-    def initTestCase(self):
-        self.x_shape = (16, 16)
-        self.y_shape = (3, 16)
-        self.pad_value = 0.1
-        self.paddings = [(0, 13), (0, 0)]
-
-
-class TestCase1(TestPadOp):
-    def initTestCase(self):
-        self.x_shape = (4, 3, 4, 4)
-        self.y_shape = (2, 3, 4, 4)
-        self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)]
-        self.pad_value = 0.5
-
-
-class TestCase2(TestPadOp):
-    def initTestCase(self):
-        self.x_shape = (4, 3, 4, 4)
-        self.y_shape = (2, 3, 2, 4)
-        self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)]
-        self.pad_value = 0.5
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py
deleted file mode 100644
index 58e56ca1a4dbdc48765a36e1a64b9a2ec8cf9025..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_pad_op.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestPadOp(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = "pad"
-        self.inputs = {'X': np.random.random(self.shape).astype("float32"), }
-        self.attrs = {}
-        self.attrs['paddings'] = np.array(self.paddings).flatten()
-        self.attrs['pad_value'] = self.pad_value
-        self.outputs = {
-            'Out': np.pad(self.inputs['X'],
-                          self.paddings,
-                          mode='constant',
-                          constant_values=self.pad_value)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.006)
-
-    def initTestCase(self):
-        self.shape = (16, 16)
-        self.paddings = [(0, 1), (2, 3)]
-        self.pad_value = 0.0
-
-
-class TestCase1(TestPadOp):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 4)
-        self.paddings = [(0, 1), (2, 3), (2, 1), (1, 1)]
-        self.pad_value = 0.5
-
-
-class TestCase2(TestPadOp):
-    def initTestCase(self):
-        self.shape = (2, 2, 2)
-        self.paddings = [(0, 0), (0, 0), (1, 2)]
-        self.pad_value = 1.0
-
-
-class TestCase3(TestPadOp):
-    def initTestCase(self):
-        self.shape = (8)
-        self.paddings = [(0, 1)]
-        self.pad_value = 0.9
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
deleted file mode 100644
index 19cd1577df4a1a202513006263121b323591793c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-import paddle.fluid as fluid
-
-
-class TestParallelDygraphMnist(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._nccl2_mode = True
-        self._dygraph = True
-
-    def test_mnist(self):
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("parallel_dygraph_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
deleted file mode 100644
index a89eb9e0ce25d5404239da670cd83bcafcfe6bd2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-import paddle.fluid as fluid
-
-
-class TestParallelDygraphSeResNeXt(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._nccl2_mode = True
-        self._dygraph = True
-
-    def test_se_resnext(self):
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("parallel_dygraph_se_resnext.py", delta=0.01)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
deleted file mode 100644
index 6671a2def3cccd2acd76025e73486b06b4bb1471..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ /dev/null
@@ -1,243 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.dataset.conll05 as conll05
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-import paddle.fluid.core as core
-import unittest
-import paddle
-import numpy as np
-import os
-
-word_dict, verb_dict, label_dict = conll05.get_dict()
-word_dict_len = len(word_dict)
-label_dict_len = len(label_dict)
-pred_dict_len = len(verb_dict)
-mark_dict_len = 2
-word_dim = 32
-mark_dim = 5
-hidden_dim = 512
-depth = 8
-mix_hidden_lr = 1e-3
-embedding_name = 'emb'
-
-
-def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
-            is_sparse, **ignored):
-    # 8 features
-    predicate_embedding = fluid.layers.embedding(
-        input=predicate,
-        is_sparse=is_sparse,
-        size=[pred_dict_len, word_dim],
-        dtype='float32',
-        param_attr='vemb')
-
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        is_sparse=is_sparse,
-        size=[mark_dict_len, mark_dim],
-        dtype='float32')
-
-    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-    emb_layers = [
-        fluid.layers.embedding(
-            size=[word_dict_len, word_dim],
-            is_sparse=is_sparse,
-            input=x,
-            param_attr=fluid.ParamAttr(
-                name=embedding_name, trainable=False)) for x in word_input
-    ]
-    # TODO(zcd): if the parameter is not trainable, the
-    #  parameter's gradient should not generated.
-    for emb_layer in emb_layers:
-        emb_layer.stop_gradient = True
-
-    emb_layers.append(predicate_embedding)
-    emb_layers.append(mark_embedding)
-
-    hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
-        for emb in emb_layers
-    ]
-
-    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
-
-    lstm_0 = fluid.layers.dynamic_lstm(
-        input=hidden_0,
-        size=hidden_dim,
-        candidate_activation='relu',
-        gate_activation='sigmoid',
-        cell_activation='sigmoid')
-
-    # stack L-LSTM and R-LSTM with direct edges
-    input_tmp = [hidden_0, lstm_0]
-
-    for i in range(1, depth):
-        mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
-            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
-        ])
-
-        lstm = fluid.layers.dynamic_lstm(
-            input=mix_hidden,
-            size=hidden_dim,
-            candidate_activation='relu',
-            gate_activation='sigmoid',
-            cell_activation='sigmoid',
-            is_reverse=((i % 2) == 1))
-
-        input_tmp = [mix_hidden, lstm]
-
-    feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
-        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
-    ])
-
-    return feature_out
-
-
-class TestCRFModel(unittest.TestCase):
-    def check_network_convergence(self,
-                                  is_sparse,
-                                  build_strategy=None,
-                                  use_cuda=True):
-        os.environ['CPU_NUM'] = str(4)
-        main = fluid.Program()
-        startup = fluid.Program()
-        scope = fluid.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(main, startup):
-                word = fluid.layers.data(
-                    name='word_data', shape=[1], dtype='int64', lod_level=1)
-                predicate = fluid.layers.data(
-                    name='verb_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_n2 = fluid.layers.data(
-                    name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_n1 = fluid.layers.data(
-                    name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_0 = fluid.layers.data(
-                    name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_p1 = fluid.layers.data(
-                    name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_p2 = fluid.layers.data(
-                    name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-                mark = fluid.layers.data(
-                    name='mark_data', shape=[1], dtype='int64', lod_level=1)
-
-                feature_out = db_lstm(**locals())
-                target = fluid.layers.data(
-                    name='target', shape=[1], dtype='int64', lod_level=1)
-                crf_cost = fluid.layers.linear_chain_crf(
-                    input=feature_out,
-                    label=target,
-                    param_attr=fluid.ParamAttr(
-                        name='crfw', learning_rate=1e-1))
-                avg_cost = fluid.layers.mean(crf_cost)
-
-                sgd_optimizer = fluid.optimizer.SGD(
-                    learning_rate=fluid.layers.exponential_decay(
-                        learning_rate=0.01,
-                        decay_steps=100000,
-                        decay_rate=0.5,
-                        staircase=True))
-                sgd_optimizer.minimize(avg_cost)
-
-                train_data = paddle.batch(
-                    paddle.reader.shuffle(
-                        paddle.dataset.conll05.test(), buf_size=8192),
-                    batch_size=16)
-
-                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-                exe = fluid.Executor(place)
-                exe.run(startup)
-
-                train_cp = compiler.CompiledProgram(main).with_data_parallel(
-                    loss_name=avg_cost.name, build_strategy=build_strategy)
-
-                feeder = fluid.DataFeeder(
-                    feed_list=[
-                        word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
-                        mark, target
-                    ],
-                    place=fluid.CPUPlace())
-
-            data = train_data()
-            for i in range(10):
-                cur_batch = next(data)
-                print(exe.run(train_cp,
-                              feed=feeder.feed(cur_batch),
-                              fetch_list=[avg_cost.name])[0])
-
-    def _new_build_strategy(self, use_reduce=False):
-        build_strategy = fluid.BuildStrategy()
-
-        if use_reduce:
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        else:
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-
-        return build_strategy
-
-    def test_update_sparse_parameter_all_reduce(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                is_sparse=True,
-                build_strategy=self._new_build_strategy(),
-                use_cuda=True)
-
-        self.check_network_convergence(
-            is_sparse=True,
-            build_strategy=self._new_build_strategy(),
-            use_cuda=False)
-
-    def test_update_dense_parameter_all_reduce(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                is_sparse=False,
-                build_strategy=self._new_build_strategy(),
-                use_cuda=True)
-
-        self.check_network_convergence(
-            is_sparse=False,
-            build_strategy=self._new_build_strategy(),
-            use_cuda=False)
-
-    def test_update_sparse_parameter_reduce(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                is_sparse=True,
-                build_strategy=self._new_build_strategy(use_reduce=True),
-                use_cuda=True)
-        self.check_network_convergence(
-            is_sparse=True,
-            build_strategy=self._new_build_strategy(use_reduce=True),
-            use_cuda=False)
-
-    def test_update_dense_parameter_reduce(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                is_sparse=False,
-                build_strategy=self._new_build_strategy(use_reduce=True),
-                use_cuda=True)
-        self.check_network_convergence(
-            is_sparse=False,
-            build_strategy=self._new_build_strategy(use_reduce=True),
-            use_cuda=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf_auto_growth.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf_auto_growth.py
deleted file mode 100644
index 52a10fcca45c266cbf9dcbf6eac11d2b00307197..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf_auto_growth.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-from test_parallel_executor_crf import *
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py
deleted file mode 100644
index e0bae089829b330e1a2dba34782f096f24279368..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import numpy
-import os
-
-
-class TestParallelExecutorDropExeScope(unittest.TestCase):
-    def check_drop_scope(self, use_cuda=True):
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-        if not use_cuda:
-            os.environ['CPU_NUM'] = str(2)
-
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_program):
-            data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-            hidden = fluid.layers.fc(input=data, size=10)
-            loss = fluid.layers.mean(hidden)
-            test_program = fluid.default_main_program().clone(for_test=True)
-            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
-
-        exe = fluid.Executor(place)
-        exe.run(startup_program)
-
-        exec_strateg = fluid.ExecutionStrategy()
-        exec_strateg.num_iteration_per_drop_scope = 10
-
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda,
-            main_program=train_program,
-            loss_name=loss.name,
-            exec_strategy=exec_strateg)
-        test_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda,
-            main_program=test_program,
-            share_vars_from=train_exe,
-            exec_strategy=exec_strateg)
-
-        x = numpy.random.random(size=(10, 1)).astype('float32')
-        train_exe.run(feed={"X": x}, fetch_list=[loss.name])
-        test_exe.run(feed={"X": x}, fetch_list=[loss.name])
-
-        assert train_exe._need_create_local_exe_scopes() == False
-        assert test_exe._need_create_local_exe_scopes() == False
-
-        # drop the local execution scope immediately
-        train_exe.drop_local_exe_scopes()
-        test_exe.drop_local_exe_scopes()
-
-        assert train_exe._need_create_local_exe_scopes()
-        assert test_exe._need_create_local_exe_scopes()
-
-    def test_drop_scope(self):
-        self.check_drop_scope(use_cuda=False)
-        if fluid.core.is_compiled_with_cuda():
-            self.check_drop_scope(use_cuda=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
deleted file mode 100644
index 328b3a4813eec261d39985ef80c47d0c827380ca..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-import unittest
-import logging
-import six
-import os
-os.environ['CPU_NUM'] = str(4)
-
-
-class TestBase(unittest.TestCase):
-    def main(self,
-             network_func,
-             iter=10,
-             iter_per_pe=10,
-             use_gpu=True,
-             use_experimental_executor=False):
-        if use_gpu and not fluid.core.is_compiled_with_cuda():
-            logging.warning(
-                "Paddle is not compiled with CUDA, skip GPU unittests")
-            return
-
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.Scope()
-        with fluid.program_guard(main_prog, startup_prog):
-            with fluid.scope_guard(scope):
-                loss = network_func()
-                exe = fluid.Executor(
-                    fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace())
-                exe.run(startup_prog)
-
-                exe_strategy = fluid.ExecutionStrategy()
-                exe_strategy._dry_run = True
-                exe_strategy.use_experimental_executor = use_experimental_executor
-                train_cp = compiler.CompiledProgram(
-                    main_prog).with_data_parallel(
-                        loss_name=loss.name, exec_strategy=exe_strategy)
-                for _ in six.moves.xrange(iter):
-                    for _ in six.moves.xrange(iter_per_pe):
-                        exe.run(train_cp)
-
-
-class TestMNISTDryRun(TestBase):
-    def test_mnist_dry_run(self):
-        for use_gpu in (False, True):
-            for use_experimental_executor in (False, True):
-                self.main(
-                    network_func=TestMNISTDryRun.network_func,
-                    use_gpu=use_gpu,
-                    use_experimental_executor=use_experimental_executor)
-
-    @staticmethod
-    def network_func():
-        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        hidden = img
-        for _ in six.moves.xrange(10):
-            hidden = fluid.layers.fc(input=img, size=200, act='tanh')
-        prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
-        loss = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_loss = fluid.layers.mean(loss)
-        fluid.optimizer.Adam().minimize(avg_loss)
-        return avg_loss
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py
deleted file mode 100644
index 831e2e761088bb173168b946fb6bca945d6c90f5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from functools import partial
-import numpy
-import unittest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from simple_nets import init_data, simple_fc_net
-import os
-
-
-class TestFeedPersistableVar(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-        batch_size = 4
-        cls.img, cls.label = init_data(
-            batch_size, img_shape=[784], label_range=9)
-        cls.feed_dict = {
-            'image': cls.img,
-            'label': cls.label,
-            'learning_rate': numpy.array([1.0]).astype("float32")
-        }
-
-    def optimizer(self):
-        learning_rate = fluid.layers.create_global_var(
-            name="learning_rate",
-            shape=[1],
-            value=1.0,
-            dtype='float32',
-            persistable=True)
-        optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
-        return optimizer
-
-    def check_feed_persistable_var(self, feed_dict, use_cuda=False):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net()
-
-            optimizer = self.optimizer()
-            optimizer.minimize(loss)
-
-            exe.run(program=startup)
-            compiled_prog = fluid.compiler.CompiledProgram(
-                main).with_data_parallel(loss_name=loss.name)
-
-            exe.run(program=compiled_prog, feed=feed_dict)
-
-    def test_feed_persistable_var(self):
-        self.check_feed_persistable_var(self.feed_dict)
-        self.check_feed_persistable_var(self.feed_dict, use_cuda=True)
-
-        self.feed_dict['learning_rate'] = numpy.array(
-            [1.0, 1.0]).astype("float32")
-        self.check_feed_persistable_var(self.feed_dict, use_cuda=True)
-
-        self.feed_dict['learning_rate'] = numpy.array(
-            [1.0, 1.0]).astype("float32")
-        run = partial(self.check_feed_persistable_var, self.feed_dict)
-        self.assertRaises(core.EnforceNotMet, run)
-
-        self.feed_dict['image'] = self.img[0, :]
-        self.feed_dict['label'] = self.label[0, :]
-        run = partial(self.check_feed_persistable_var, self.feed_dict)
-        self.assertRaises(core.EnforceNotMet, run)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
deleted file mode 100644
index 052edac0ea7a37306b556a7012f378b0d68bef7f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import math
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-import paddle.fluid.core as core
-import unittest
-import numpy as np
-import os
-
-
-def Lenet(data, class_dim):
-    conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None)
-    bn1 = fluid.layers.batch_norm(conv1, act='relu')
-    pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
-    conv2 = fluid.layers.conv2d(pool1, 16, 5, 1, act=None)
-    bn2 = fluid.layers.batch_norm(conv2, act='relu')
-    pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)
-
-    fc1 = fluid.layers.fc(pool2, size=50, act='relu')
-    fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')
-
-    return fc2
-
-
-class TestFetchAndFeed(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def parallel_exe(self,
-                     use_cuda,
-                     run_parallel_exe,
-                     use_faster_executor=False,
-                     num_threads=4,
-                     seed=1):
-        main_program = fluid.Program()
-        startup = fluid.Program()
-        startup.random_seed = seed
-        with fluid.program_guard(main_program, startup):
-            data = fluid.layers.data(
-                name='image', shape=[3, 224, 224], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            out = Lenet(data, class_dim=102)
-            loss = fluid.layers.cross_entropy(input=out, label=label)
-            loss = fluid.layers.mean(loss)
-            opt = fluid.optimizer.Momentum(
-                learning_rate=0.1,
-                momentum=0.9,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            opt.minimize(loss)
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup)
-
-        #FIXME force disable enable_inplace and memory_optimize to pass the unittest
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.enable_inplace = False
-        build_strategy.memory_optimize = False
-        exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy.use_experimental_executor = use_faster_executor
-        exec_strategy.num_threads = num_threads
-        train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
-
-        run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)
-
-    def run_parallel_exe_with_fetch(self, compiled_program, exe, use_cuda, data,
-                                    label, loss):
-        def get_data(batch_size=8):
-            np.random.seed(5)
-            while True:
-                img = np.random.random(
-                    size=[batch_size, 3, 224, 224]).astype(np.float32)
-                l = (np.random.random(size=[batch_size, 1]) *
-                     10).astype(np.int64)
-                yield img, l
-
-        fetch_list = []
-        all_vars = compiled_program._program.global_block().vars
-
-        for k, v in all_vars.items():
-            if ('tmp' not in k) and (
-                    k[0] is not '_' or v.persistable
-            ) and v.type == core.VarDesc.VarType.LOD_TENSOR:
-                fetch_list.append(k)
-
-        for batch_id, img_label in enumerate(get_data()):
-            img, l = img_label
-            train_inputs = {data.name: img, label.name: l}
-            ret = exe.run(compiled_program,
-                          fetch_list=fetch_list,
-                          feed=train_inputs,
-                          return_numpy=True)
-            for i in range(len(fetch_list)):
-                assert not math.isnan(np.sum(ret[i])) and \
-                       not math.isinf(np.sum(ret[i]))
-            if batch_id == 2:
-                break
-
-    def run_parallel_exe_with_feed(self, compiled_program, exe, use_cuda, data,
-                                   label, loss):
-        def get_data(batch_size=8):
-            np.random.seed(5)
-            while True:
-                train_data = []
-                for _ in range(batch_size):
-                    img = np.random.random(
-                        size=[1, 3, 224, 224]).astype(np.float32)
-                    label = (np.random.random(size=[1, 1]) *
-                             10).astype(np.int64)
-                    train_data.append([img, label])
-                yield train_data
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
-        reader = feeder.decorate_reader(get_data, multi_devices=True)
-
-        for batch_id, data in enumerate(reader()):
-            loss_np = exe.run(compiled_program,
-                              feed=data,
-                              fetch_list=[loss.name])[0]
-            print(batch_id, loss_np)
-            if batch_id == 2:
-                break
-
-    def check_executor(self, use_faster_executor=False, num_threads=4):
-        if core.is_compiled_with_cuda():
-            self.parallel_exe(
-                use_cuda=True,
-                run_parallel_exe=self.run_parallel_exe_with_fetch,
-                use_faster_executor=use_faster_executor,
-                num_threads=num_threads)
-        self.parallel_exe(
-            use_cuda=False,
-            run_parallel_exe=self.run_parallel_exe_with_fetch,
-            use_faster_executor=use_faster_executor,
-            num_threads=num_threads)
-
-    def test_fetch(self):
-        for use_faster_executor in {True, False}:
-            self.check_executor(
-                use_faster_executor=use_faster_executor, num_threads=4)
-            self.check_executor(
-                use_faster_executor=use_faster_executor, num_threads=1)
-
-    def test_feed(self):
-        if core.is_compiled_with_cuda():
-            self.parallel_exe(
-                use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed)
-        self.parallel_exe(
-            use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_feed)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
deleted file mode 100644
index 3976dec4be04b6929736b054f6f87c6308b50d68..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import numpy as np
-import paddle.fluid.core as core
-import os
-import paddle.fluid as fluid
-from parallel_executor_test_base import TestParallelExecutorBase
-
-
-def simple_fc_net(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def fc_with_batchnorm(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(1):
-        with fluid.name_scope("hidden"):
-            hidden = fluid.layers.fc(
-                hidden,
-                size=200,
-                act='tanh',
-                bias_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=1.0)))
-
-            hidden = fluid.layers.batch_norm(input=hidden)
-    with fluid.name_scope("fc_layer"):
-        prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    with fluid.name_scope("loss"):
-        loss = fluid.layers.cross_entropy(input=prediction, label=label)
-        loss = fluid.layers.mean(loss)
-    return loss
-
-
-class TestMNIST(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def _init_data(self):
-        np.random.seed(5)
-        img = np.random.random(size=[32, 784]).astype(np.float32)
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
-    def _compare_reduce_and_allreduce(self,
-                                      model,
-                                      use_cuda,
-                                      delta1=1e-6,
-                                      delta2=1e-4):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        img, label = self._init_data()
-
-        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_reduce=False)
-
-        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_reduce=True)
-
-        for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEqual(loss[0], loss[1], delta=delta1)
-        for loss in zip(all_reduce_last_loss, reduce_last_loss):
-            self.assertAlmostEqual(loss[0], loss[1], delta=delta2)
-
-    # simple_fc
-    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        img, label = self._init_data()
-
-        self.check_network_convergence(
-            simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_reduce=use_reduce)
-
-    def test_simple_fc(self):
-        # use_cuda
-        self.check_simple_fc_convergence(True)
-        self.check_simple_fc_convergence(False)
-
-    def test_simple_fc_with_new_strategy(self):
-        # use_cuda, use_reduce
-        self._compare_reduce_and_allreduce(simple_fc_net, True)
-        self._compare_reduce_and_allreduce(simple_fc_net, False)
-
-    def check_simple_fc_parallel_accuracy(self, use_cuda):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        img, label = self._init_data()
-
-        single_first_loss, single_last_loss = self.check_network_convergence(
-            method=simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_parallel_executor=False)
-        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
-            method=simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_parallel_executor=True)
-
-        self.assertAlmostEquals(
-            np.mean(parallel_first_loss),
-            single_first_loss,
-            delta=1e-6, )
-        self.assertAlmostEquals(
-            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
-
-    def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(True)
-        self.check_simple_fc_parallel_accuracy(False)
-
-    def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        img, label = self._init_data()
-
-        self.check_network_convergence(
-            fc_with_batchnorm,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_fast_executor=use_fast_executor)
-
-    def test_batchnorm_fc(self):
-        for use_cuda in (False, True):
-            for use_fast_executor in (False, True):
-                self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)
-
-    # FIXME(wuyi): should checkout why this fails when merging
-    # https://github.com/PaddlePaddle/Paddle/pull/16545
-    @unittest.skip("should fix this later")
-    def test_batchnorm_fc_with_new_strategy(self):
-        # NOTE: the computation result of nccl_reduce is non-deterministic,
-        # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-2)
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
deleted file mode 100644
index 080c44143a3ae70eab29b55624d6c81a1150e00d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import numpy as np
-import os
-os.environ['FLAGS_enable_parallel_graph'] = str(1)
-import paddle.fluid.core as core
-import os
-from parallel_executor_test_base import TestParallelExecutorBase
-from simple_nets import simple_fc_net, init_data
-
-
-class TestMNIST(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    # simple_fc
-    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        img, label = init_data()
-        self.check_network_convergence(
-            simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_reduce=use_reduce)
-
-    def test_simple_fc(self):
-        # use_cuda
-        self.check_simple_fc_convergence(True)
-
-    def check_simple_fc_parallel_accuracy(self, use_cuda):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        img, label = init_data()
-        single_first_loss, single_last_loss = self.check_network_convergence(
-            method=simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_parallel_executor=False)
-        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
-            method=simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
-            use_cuda=use_cuda,
-            use_parallel_executor=True)
-
-        self.assertAlmostEquals(
-            np.mean(parallel_first_loss),
-            single_first_loss,
-            delta=1e-6, )
-        self.assertAlmostEquals(
-            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
-
-    def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py
deleted file mode 100644
index fc76f5d152dfe92f9b38a0b36d8d4559813ece2f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid as fluid
-from simple_nets import simple_fc_net, init_data
-
-
-class TestMNIST(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.save_dirname = "./"
-        cls.model_filename = "test_parallel_executor_run_load_infer_program_model"
-        cls.params_filename = "test_parallel_executor_run_load_infer_program_parameter"
-        cls.place = fluid.CPUPlace()
-        cls.exe = fluid.Executor(cls.place)
-        img, label = init_data()
-        cls.batch_data = []
-        for img, label in zip(img, label):
-            cls.batch_data.append([img, label])
-
-    def test_simple_fc(self):
-        exe_loss = self.run_with_executor()
-
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             self.save_dirname, self.exe, self.model_filename,
-             self.params_filename)
-
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=False, main_program=inference_program)
-        feed_vars = [
-            inference_program.global_block().var(var_name)
-            for var_name in ["image", "label"]
-        ]
-        feeder = fluid.DataFeeder(place=self.place, feed_list=feed_vars)
-
-        pe_loss = train_exe.run(feed=feeder.feed(self.batch_data),
-                                fetch_list=[fetch_targets[0].name])
-        assert exe_loss == pe_loss
-
-    def run_with_executor(self):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net()
-
-        feed_vars = [
-            main.global_block().var(var_name)
-            for var_name in ["image", "label"]
-        ]
-        feeder = fluid.DataFeeder(place=self.place, feed_list=feed_vars)
-
-        self.exe.run(startup)
-
-        loss_data = self.exe.run(main,
-                                 feed=feeder.feed(self.batch_data),
-                                 fetch_list=[loss.name])
-
-        fluid.io.save_inference_model(
-            self.save_dirname, ["image", "label"], [loss],
-            self.exe,
-            model_filename=self.model_filename,
-            params_filename=self.params_filename,
-            main_program=main)
-
-        return loss_data
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py
deleted file mode 100644
index 1205cfcedbbf8e641171cd55d3923dff3b3d9876..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import seresnext_net
-from seresnext_test_base import TestResnetBase
-from functools import partial
-
-
-class TestResnetCPU(TestResnetBase):
-    def test_seresnext_with_learning_rate_decay(self):
-        # NOTE(zcd): This test is compare the result of use parallel_executor
-        # and executor, and the result of drop_out op and batch_norm op in
-        # this two executor have diff, so the two ops should be removed
-        # from the model.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            use_parallel_executor=False)
-        self._compare_result_with_origin_model(
-            check_func, use_cuda=False, compare_seperately=False, delta2=1e-3)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
deleted file mode 100644
index eb8cfdd8e6116075721de5e8e5af676c6858ff08..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import seresnext_net
-from seresnext_test_base import TestResnetBase
-from functools import partial
-
-
-class TestResnetGPU(TestResnetBase):
-    def test_seresnext_with_learning_rate_decay(self):
-        # NOTE(zcd): This test is compare the result of use parallel_executor
-        # and executor, and the result of drop_out op and batch_norm op in
-        # this two executor have diff, so the two ops should be removed
-        # from the model.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            use_parallel_executor=False)
-        self._compare_result_with_origin_model(
-            check_func, use_cuda=True, compare_seperately=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
deleted file mode 100644
index 159686a7cfcf92f6e3b9b13da04aee40b4bf5029..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.fluid as fluid
-fluid.core._set_fuse_parameter_group_size(3)
-fluid.core._set_fuse_parameter_memory_size(131072)
-
-import unittest
-import seresnext_net
-from seresnext_test_base import TestResnetBase
-from functools import partial
-
-
-class TestResnetWithFuseAllReduceCPU(TestResnetBase):
-    def test_seresnext_with_fused_all_reduce(self):
-        # NOTE(zcd): In order to make the program faster,
-        # this unit test remove drop_out and batch_norm.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            fuse_all_reduce_ops=True)
-        self._compare_result_with_origin_model(check_func, use_cuda=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
deleted file mode 100644
index 56fcb7914f9503daa19c9c6eb38fd53645c4c3ee..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.fluid as fluid
-fluid.core._set_fuse_parameter_group_size(3)
-fluid.core._set_fuse_parameter_memory_size(131072)
-
-import unittest
-import seresnext_net
-from seresnext_test_base import TestResnetBase
-from functools import partial
-
-
-class TestResnetWithFuseAllReduceGPU(TestResnetBase):
-    def test_seresnext_with_fused_all_reduce(self):
-        # NOTE(zcd): In order to make the program faster,
-        # this unit test remove drop_out and batch_norm.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            fuse_all_reduce_ops=True)
-        self._compare_result_with_origin_model(
-            check_func, use_cuda=True, delta2=1e-2)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
deleted file mode 100644
index 74c5999c4fd3e4be82e9a5b2484efe69a0271baf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from parallel_executor_test_base import TestParallelExecutorBase
-import seresnext_net
-import paddle.fluid.core as core
-
-
-class TestResnetWithReduceBase(TestParallelExecutorBase):
-    def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=False,
-            optimizer=seresnext_net.optimizer)
-        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=True,
-            optimizer=seresnext_net.optimizer)
-
-        for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(all_reduce_last_loss, reduce_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-
-        if not use_cuda:
-            return
-
-        all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=False,
-            optimizer=seresnext_net.optimizer,
-            enable_sequential_execution=True)
-
-        reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=True,
-            optimizer=seresnext_net.optimizer,
-            enable_sequential_execution=True)
-
-        for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-
-        for loss in zip(reduce_first_loss, reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(reduce_last_loss, reduce_last_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-
-        for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-
-
-class TestResnetWithReduceCPU(TestResnetWithReduceBase):
-    def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
deleted file mode 100644
index f6c868859c64a651578554302bdba890a7cbcbc2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduceBase
-
-
-class TestResnetWithReduceGPU(TestResnetWithReduceBase):
-    def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
deleted file mode 100644
index fd47dc37e7694de3f088428d2fe677d65c8a784c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from simple_nets import simple_fc_net
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-import paddle.fluid.core as core
-import numpy as np
-import unittest
-import os
-import sys
-import math
-
-
-class ParallelExecutorTestingDuringTraining(unittest.TestCase):
-    def check_network_convergence(self, use_cuda, build_strategy=None):
-        os.environ['CPU_NUM'] = str(4)
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net()
-            test_program = main.clone(for_test=True)
-
-            opt = fluid.optimizer.SGD(learning_rate=0.001)
-            opt.minimize(loss)
-
-            batch_size = 32
-            image = np.random.normal(size=(batch_size, 784)).astype('float32')
-            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
-
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup)
-            feed_dict = {'image': image, 'label': label}
-
-            train_cp = compiler.CompiledProgram(main).with_data_parallel(
-                loss_name=loss.name, build_strategy=build_strategy)
-            test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
-                loss_name=loss.name,
-                build_strategy=build_strategy,
-                share_vars_from=train_cp)
-
-            for i in range(5):
-                exe.run(train_cp, feed=feed_dict, fetch_list=[loss.name])
-                test_loss, = exe.run(test_cp,
-                                     feed=feed_dict,
-                                     fetch_list=[loss.name])
-                train_loss, = exe.run(train_cp,
-                                      feed=feed_dict,
-                                      fetch_list=[loss.name])
-
-                avg_test_loss_val = np.array(test_loss).mean()
-                if math.isnan(float(avg_test_loss_val)):
-                    sys.exit("got NaN loss, testing failed.")
-
-                avg_train_loss_val = np.array(train_loss).mean()
-                if math.isnan(float(avg_train_loss_val)):
-                    sys.exit("got NaN loss, training failed.")
-
-                self.assertTrue(
-                    np.allclose(
-                        train_loss, test_loss, atol=1e-2),
-                    "Train loss: " + str(train_loss) + "\n Test loss:" +
-                    str(test_loss))
-
-    def test_parallel_testing(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                use_cuda=True, build_strategy=build_strategy)
-        self.check_network_convergence(
-            use_cuda=False, build_strategy=build_strategy)
-
-    def test_parallel_testing_with_new_strategy_gpu(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                use_cuda=True, build_strategy=build_strategy)
-
-    def test_parallel_testing_with_new_strategy_cpu(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(
-            use_cuda=False, build_strategy=build_strategy)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
deleted file mode 100644
index 1f47d87811cf4ca63bda63da860e2ac3b9de1e7e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import transformer_model
-import numpy as np
-from parallel_executor_test_base import TestParallelExecutorBase
-import unittest
-import paddle
-import paddle.fluid.core as core
-import paddle.dataset.wmt16 as wmt16
-import os
-from feed_data_reader import FeedDataReader
-
-
-class ModelHyperParams(object):
-    # Dictionary size for source and target language. This model directly uses
-    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
-    # alreay been added, but the <pad> token is not added. Transformer requires
-    # sequences in a mini-batch are padded to have the same length. A <pad> token is
-    # added into the original dictionary in paddle.dateset.wmt16.
-
-    # size of source word dictionary.
-    src_vocab_size = 10000
-    # index for <pad> token in source language.
-    src_pad_idx = src_vocab_size
-
-    # size of target word dictionay
-    trg_vocab_size = 10000
-    # index for <pad> token in target language.
-    trg_pad_idx = trg_vocab_size
-
-    # position value corresponding to the <pad> token.
-    pos_pad_idx = 0
-
-    # max length of sequences. It should plus 1 to include position
-    # padding token for position encoding.
-    max_length = 50
-
-    # the dimension for word embeddings, which is also the last dimension of
-    # the input and output of multi-head attention, position-wise feed-forward
-    # networks, encoder and decoder.
-
-    d_model = 512
-    # size of the hidden layer in position-wise feed-forward networks.
-    d_inner_hid = 1024
-    # the dimension that keys are projected to for dot-product attention.
-    d_key = 64
-    # the dimension that values are projected to for dot-product attention.
-    d_value = 64
-    # number of head used in multi-head attention.
-    n_head = 8
-    # number of sub-layers to be stacked in the encoder and decoder.
-    # NOTE(zcd): the origin number of layer is 6, to make this unit test faster,
-    # we should reduce the layer number to 4.
-    n_layer = 4
-    # dropout rate used by all dropout layers.
-    dropout = 0.1
-
-
-def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
-    """
-    Pad the instances to the max sequence length in batch, and generate the
-    corresponding position data and attention bias. Then, convert the numpy
-    data to tensors and return a dict mapping names to tensors.
-    """
-
-    def __pad_batch_data(insts,
-                         pad_idx,
-                         is_target=False,
-                         return_pos=True,
-                         return_attn_bias=True,
-                         return_max_len=True):
-        """
-        Pad the instances to the max sequence length in batch, and generate the
-        corresponding position data and attention bias.
-        """
-        return_list = []
-        max_len = max(len(inst) for inst in insts)
-        inst_data = np.array(
-            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
-        return_list += [inst_data.astype("int64").reshape([-1, 1])]
-        if return_pos:
-            inst_pos = np.array([[
-                pos_i + 1 if w_i != pad_idx else 0
-                for pos_i, w_i in enumerate(inst)
-            ] for inst in inst_data])
-
-            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
-        if return_attn_bias:
-            if is_target:
-                # This is used to avoid attention on paddings and subsequent
-                # words.
-                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
-                                              max_len))
-                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
-                    [-1, 1, max_len, max_len])
-                slf_attn_bias_data = np.tile(slf_attn_bias_data,
-                                             [1, n_head, 1, 1]) * [-1e9]
-            else:
-                # This is used to avoid attention on paddings.
-                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
-                                               (max_len - len(inst))
-                                               for inst in insts])
-                slf_attn_bias_data = np.tile(
-                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
-                    [1, n_head, max_len, 1])
-            return_list += [slf_attn_bias_data.astype("float32")]
-        if return_max_len:
-            return_list += [max_len]
-        return return_list if len(return_list) > 1 else return_list[0]
-
-    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
-        [inst[0] for inst in insts], src_pad_idx, is_target=False)
-    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
-        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
-    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
-                                [1, 1, trg_max_len, 1]).astype("float32")
-    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
-                                False, False, False)
-    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
-
-    return [
-        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
-        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
-    ]
-
-
-feed_data_reader = None
-
-
-def transformer(use_feed):
-    assert not use_feed, "transfomer doesn't support feed yet"
-    return transformer_model.transformer(
-        ModelHyperParams.src_vocab_size + 1,
-        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
-        ModelHyperParams.n_layer, ModelHyperParams.n_head,
-        ModelHyperParams.d_key, ModelHyperParams.d_value,
-        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
-        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
-        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
-
-
-def get_feed_data_reader():
-    global feed_data_reader
-    if feed_data_reader is not None:
-        return feed_data_reader
-
-    reader = paddle.batch(
-        wmt16.train(ModelHyperParams.src_vocab_size,
-                    ModelHyperParams.trg_vocab_size),
-        batch_size=transformer_model.batch_size)
-    all_batch_tensors = []
-    for batch in reader():
-        tensors = []
-        for tensor in prepare_batch_input(batch, ModelHyperParams.src_pad_idx,
-                                          ModelHyperParams.trg_pad_idx,
-                                          ModelHyperParams.n_head):
-            tensors.append(np.array(tensor))
-        all_batch_tensors.append(tensors)
-
-    def __reader__():
-        for t in all_batch_tensors:
-            yield t
-
-    feed_data_reader = FeedDataReader(
-        feed_list=transformer_model.build_inputs(
-            ModelHyperParams.max_length + 1, ModelHyperParams.n_head),
-        reader=__reader__)
-
-    return feed_data_reader
-
-
-class TestTransformer(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def test_main(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                transformer,
-                use_cuda=True,
-                feed_data_reader=get_feed_data_reader())
-            self.check_network_convergence(
-                transformer,
-                use_cuda=True,
-                enable_sequential_execution=True,
-                feed_data_reader=get_feed_data_reader())
-        self.check_network_convergence(
-            transformer,
-            use_cuda=False,
-            iter=2,
-            feed_data_reader=get_feed_data_reader())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer_auto_growth.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer_auto_growth.py
deleted file mode 100644
index e7afa27b7b9fed679a0f3fa8f308b5f0518bc036..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer_auto_growth.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-from test_parallel_executor_transformer import *
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parameter.py b/python/paddle/fluid/tests/unittests/test_parameter.py
deleted file mode 100644
index df42e6cb9a050b76099b4a53fdd08d2852284d1f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parameter.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.framework import default_main_program
-import paddle.fluid.core as core
-from paddle.fluid.executor import Executor
-import paddle.fluid.io as io
-from paddle.fluid.initializer import ConstantInitializer
-import numpy as np
-
-main_program = default_main_program()
-
-
-class TestParameter(unittest.TestCase):
-    def test_param(self):
-        shape = [784, 100]
-        val = 1.0625
-        b = main_program.global_block()
-        param = b.create_parameter(
-            name='fc.w',
-            shape=shape,
-            dtype='float32',
-            initializer=ConstantInitializer(val))
-        self.assertIsNotNone(param)
-        self.assertEqual('fc.w', param.name)
-        self.assertEqual((784, 100), param.shape)
-        self.assertEqual(core.VarDesc.VarType.FP32, param.dtype)
-        self.assertEqual(0, param.block.idx)
-        exe = Executor(core.CPUPlace())
-        p = exe.run(main_program, fetch_list=[param])[0]
-        self.assertTrue(np.allclose(p, np.ones(shape) * val))
-        p = io.get_parameter_value_by_name('fc.w', exe, main_program)
-        self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py
deleted file mode 100644
index 1661f753a8464baa0c9497e9dbd0e348b5431750..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import paddle.fluid as fluid
-
-fluid.core._set_eager_deletion_mode(0.0, 0.55, True)
-
-from test_parallel_executor_transformer import TestTransformer
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py
deleted file mode 100644
index 497bea43567774f356de379acced2544c8302d46..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from simple_nets import simple_fc_net
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid import compiler
-import numpy as np
-import unittest
-import os
-import sys
-import math
-
-
-class TestPassBuilder(unittest.TestCase):
-    def check_network_convergence(self, use_cuda, build_strategy=None):
-        os.environ['CPU_NUM'] = str(4)
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net()
-            test_program = main.clone(for_test=True)
-
-            opt = fluid.optimizer.SGD(learning_rate=0.001)
-            opt.minimize(loss)
-
-            batch_size = 32
-            image = np.random.normal(size=(batch_size, 784)).astype('float32')
-            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
-
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup)
-            feed_dict = {'image': image, 'label': label}
-
-            train_cp = compiler.CompiledProgram(main).with_data_parallel(
-                loss_name=loss.name, build_strategy=build_strategy)
-            test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
-                loss_name=loss.name,
-                build_strategy=build_strategy,
-                share_vars_from=train_cp)
-
-            for i in range(5):
-                _ = exe.run(train_cp, fetch_list=[loss.name], feed=feed_dict)
-                test_loss, = exe.run(test_cp,
-                                     fetch_list=[loss.name],
-                                     feed=feed_dict)
-                train_loss = exe.run(train_cp,
-                                     fetch_list=[loss.name],
-                                     feed=feed_dict)
-
-                avg_test_loss_val = np.array(test_loss).mean()
-                if math.isnan(float(avg_test_loss_val)):
-                    sys.exit("got NaN loss, testing failed.")
-
-                avg_train_loss_val = np.array(train_loss).mean()
-                if math.isnan(float(avg_train_loss_val)):
-                    sys.exit("got NaN loss, training failed.")
-
-                self.assertTrue(
-                    np.allclose(
-                        train_loss, test_loss, atol=1e-8),
-                    "Train loss: " + str(train_loss) + "\n Test loss:" +
-                    str(test_loss))
-
-    def test_parallel_testing_with_new_strategy(self):
-        build_strategy = fluid.BuildStrategy()
-        self.assertFalse(build_strategy.fuse_elewise_add_act_ops)
-        build_strategy.fuse_elewise_add_act_ops = True
-        #FIXME: currently fuse_elewise_add_act_ops not compatible with below options
-        build_strategy.enable_inplace = False
-        build_strategy.memory_optimize = False
-        pass_builder = build_strategy._finalize_strategy_and_create_passes()
-        self.assertTrue("fuse_elewise_add_act_pass" in
-                        [p.type() for p in pass_builder.all_passes()])
-
-        origin_len = len(pass_builder.all_passes())
-
-        viz_pass = pass_builder.append_pass("graph_viz_pass")
-        self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
-
-        pass_builder.insert_pass(
-            len(pass_builder.all_passes()), "graph_viz_pass")
-        self.assertEqual(origin_len + 2, len(pass_builder.all_passes()))
-
-        pass_builder.remove_pass(len(pass_builder.all_passes()) - 1)
-        self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
-        viz_pass.set("graph_viz_path", "/tmp/test_viz_pass")
-
-        self.check_network_convergence(
-            use_cuda=core.is_compiled_with_cuda(),
-            build_strategy=build_strategy)
-        try:
-            os.stat("/tmp/test_viz_pass")
-        except os.error:
-            self.assertFalse(True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py
deleted file mode 100644
index f6454b49076e691a9488360ed31728adb2060705..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_pipeline.py
+++ /dev/null
@@ -1,152 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import numpy as np
-import os
-import shutil
-import unittest
-
-
-class TestPipelineConfig(unittest.TestCase):
-    """  TestCases for Config in Pipeline Training. """
-
-    def config(self, filelist_length, pipeline_num, reader_concurrency):
-        filelist = []
-        for i in range(filelist_length):
-            filelist.append("file" + str(i))
-        self.dataset.set_filelist(filelist)
-        self.pipeline_opt["concurrency_list"][0] = reader_concurrency
-        self.pipeline_num = pipeline_num
-
-    def helper(self, in_filelist_length, in_pipeline_num, in_reader_concurrency,
-               out_pipeline_num, out_reader_concurrency, out_dataset_thread):
-        self.config(in_filelist_length, in_pipeline_num, in_reader_concurrency)
-        res = self.exe._adjust_pipeline_resource(
-            self.pipeline_opt, self.dataset, self.pipeline_num)
-        self.assertEqual(self.pipeline_opt["concurrency_list"][0],
-                         out_reader_concurrency)
-        self.assertEqual(res, out_pipeline_num)
-        self.assertEqual(self.dataset.thread_num, out_dataset_thread)
-
-    def test_adjust_pipeline_resource(self):
-        self.exe = fluid.Executor(fluid.CPUPlace())
-        self.dataset = fluid.DatasetFactory().create_dataset(
-            "FileInstantDataset")
-        self.pipeline_opt = {"concurrency_list": [0, 1, 2]}
-        self.pipeline_num = 0
-
-        self.helper(7, 2, 2, 2, 2, 4)
-        self.helper(7, 2, 3, 2, 3, 6)
-        self.helper(7, 2, 4, 2, 3, 6)
-
-        self.helper(8, 2, 3, 2, 3, 6)
-        self.helper(8, 2, 4, 2, 4, 8)
-        self.helper(8, 2, 5, 2, 4, 8)
-
-        self.helper(3, 4, 1, 3, 1, 3)
-        self.helper(3, 4, 2, 3, 1, 3)
-
-
-class TestPipeline(unittest.TestCase):
-    """  TestCases for Pipeline Training. """
-
-    def test_pipeline(self):
-        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
-        y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
-        emb_x = layers.embedding(
-            input=x,
-            param_attr=fluid.ParamAttr(name="embx"),
-            size=[10, 2],
-            is_sparse=False)
-        emb_y = layers.embedding(
-            input=y,
-            param_attr=fluid.ParamAttr(
-                name="emby", learning_rate=0.9),
-            size=[10, 2],
-            is_sparse=False)
-
-        concat = layers.concat([emb_x, emb_y], axis=1)
-
-        fc = layers.fc(input=concat,
-                       name="fc",
-                       size=1,
-                       num_flatten_dims=1,
-                       bias_attr=False)
-        loss = layers.reduce_mean(fc)
-
-        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-        optimizer = fluid.optimizer.PipelineOptimizer(
-            optimizer,
-            cut_list=[[emb_x, emb_y], [loss]],
-            place_list=[
-                fluid.CPUPlace(), fluid.CUDAPlace(0), fluid.CPUPlace()
-            ],
-            concurrency_list=[1, 1, 1],
-            queue_size=1,
-            sync_steps=10000000, )
-        optimizer.minimize(loss)
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        #prepare data
-        batch_size = 100
-
-        def binary_print(slot, fout):
-            num = np.int16(len(slot) + 1)
-            num.tofile(fout)
-            a = np.int64(batch_size)
-            a.tofile(fout)
-            slot.tofile(fout)
-
-        #batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1)
-        #batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1)
-        batch1 = np.ones(
-            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
-        batch2 = np.ones(
-            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
-        data = [batch1, batch2]
-        filelist = []
-        for i in range(2):
-            filelist.append("test_pipeline_input_" + str(i))
-        for f in filelist:
-            with open(f, "wb") as fout:
-                for batch_data in data:
-                    for ins in batch_data:
-                        for slot in ins:
-                            binary_print(slot, fout)
-
-        dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
-        dataset.set_use_var([x, y])
-        dataset.set_batch_size(batch_size)
-        dataset.set_filelist(filelist)
-
-        for epoch in range(1):
-            exe.train_from_dataset(
-                fluid.default_main_program(),
-                dataset,
-                thread=1,
-                debug=False,
-                fetch_list=[],
-                fetch_info=[],
-                print_period=1)
-
-        for f in filelist:
-            os.remove(f)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
deleted file mode 100644
index cc3ae2b3b9d4c40a7ee992c04cac79f518acac6d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestPixelShuffle(OpTest):
-    def setUp(self):
-        self.op_type = "pixel_shuffle"
-        n, c, h, w = 2, 9, 4, 4
-        up_factor = 3
-        shape = [n, c, h, w]
-        x = np.random.random(shape).astype("float32")
-        new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h,
-                     w)
-        # reshape to (num,output_channel,upscale_factor,upscale_factor,h,w)
-        npresult = np.reshape(x, new_shape)
-        # transpose to (num,output_channel,h,upscale_factor,w,upscale_factor)
-        npresult = npresult.transpose(0, 1, 4, 2, 5, 3)
-        oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor]
-        npresult = np.reshape(npresult, oshape)
-
-        self.inputs = {'X': x}
-        self.outputs = {'Out': npresult}
-        self.attrs = {'upscale_factor': up_factor}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
deleted file mode 100644
index 7f266056a9d98be1a6f67473be65a74957f943e9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def PolygonBoxRestore(input):
-    shape = input.shape
-    batch_size = shape[0]
-    geo_channels = shape[1]
-    h = shape[2]
-    w = shape[3]
-    h_indexes = np.array(list(range(h)) * w).reshape(
-        [w, h]).transpose()[np.newaxis, :]  # [1, h, w]
-    w_indexes = np.array(list(range(w)) * h).reshape(
-        [h, w])[np.newaxis, :]  # [1, h, w]
-    indexes = np.concatenate(
-        (w_indexes, h_indexes))[np.newaxis, :]  # [1, 2, h, w]
-    indexes = indexes.repeat(
-        [geo_channels / 2],
-        axis=0)[np.newaxis, :]  # [1, geo_channels/2, 2, h, w]
-    indexes = indexes.repeat(
-        [batch_size], axis=0)  # [batch_size, geo_channels/2, 2, h, w]
-    return indexes.reshape(
-        input.shape) * 4 - input  # [batch_size, geo_channels, h, w]
-
-
-class TestPolygonBoxRestoreOp(OpTest):
-    def config(self):
-        self.input_shape = (1, 8, 2, 2)
-
-    def setUp(self):
-        self.config()
-        self.op_type = "polygon_box_transform"
-        input = np.random.random(self.input_shape).astype("float32")
-        self.inputs = {'Input': input}
-        output = PolygonBoxRestore(input)
-        self.outputs = {'Output': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestCase1(TestPolygonBoxRestoreOp):
-    def config(self):
-        self.input_shape = (2, 10, 3, 2)
-
-
-class TestCase2(TestPolygonBoxRestoreOp):
-    def config(self):
-        self.input_shape = (3, 12, 4, 5)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
deleted file mode 100644
index 84f6526b8052d77a32130487e1bc80c6439db7b7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ /dev/null
@@ -1,361 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from __future__ import division
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-def adaptive_start_index(index, input_size, output_size):
-    return int(np.floor(index * input_size / output_size))
-
-
-def adaptive_end_index(index, input_size, output_size):
-    return int(np.ceil((index + 1) * input_size / output_size))
-
-
-def max_pool2D_forward_naive(x,
-                             ksize,
-                             strides,
-                             paddings,
-                             global_pool=0,
-                             ceil_mode=False,
-                             exclusive=True,
-                             adaptive=False):
-    N, C, H, W = x.shape
-    if global_pool == 1:
-        ksize = [H, W]
-    if adaptive:
-        H_out, W_out = ksize
-    else:
-        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
-                     H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-                 ) // strides[1] + 1 if ceil_mode else (
-                     W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
-    out = np.zeros((N, C, H_out, W_out))
-    for i in range(H_out):
-        for j in range(W_out):
-            if adaptive:
-                r_start = adaptive_start_index(i, H, ksize[0])
-                r_end = adaptive_end_index(i, H, ksize[0])
-                c_start = adaptive_start_index(j, W, ksize[1])
-                c_end = adaptive_end_index(j, W, ksize[1])
-            else:
-                r_start = np.max((i * strides[0] - paddings[0], 0))
-                r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-                c_start = np.max((j * strides[1] - paddings[1], 0))
-                c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
-            x_masked = x[:, :, r_start:r_end, c_start:c_end]
-
-            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
-    return out
-
-
-def avg_pool2D_forward_naive(x,
-                             ksize,
-                             strides,
-                             paddings,
-                             global_pool=0,
-                             ceil_mode=False,
-                             exclusive=True,
-                             adaptive=False):
-    N, C, H, W = x.shape
-    if global_pool == 1:
-        ksize = [H, W]
-    if adaptive:
-        H_out, W_out = ksize
-    else:
-        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
-                     H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-                 ) // strides[1] + 1 if ceil_mode else (
-                     W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
-    out = np.zeros((N, C, H_out, W_out))
-    for i in range(H_out):
-        for j in range(W_out):
-            if adaptive:
-                r_start = adaptive_start_index(i, H, ksize[0])
-                r_end = adaptive_end_index(i, H, ksize[0])
-                c_start = adaptive_start_index(j, W, ksize[1])
-                c_end = adaptive_end_index(j, W, ksize[1])
-            else:
-                r_start = np.max((i * strides[0] - paddings[0], 0))
-                r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-                c_start = np.max((j * strides[1] - paddings[1], 0))
-                c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
-            x_masked = x[:, :, r_start:r_end, c_start:c_end]
-
-            field_size = ((r_end - r_start) * (c_end - c_start)) \
-                        if (exclusive or adaptive) else (ksize[0] * ksize[1])
-            out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
-    return out
-
-
-class TestPool2D_Op(OpTest):
-    def setUp(self):
-        self.op_type = "pool2d"
-        self.use_cudnn = False
-        self.use_mkldnn = False
-        self.init_data_type()
-        self.init_test_case()
-        self.init_global_pool()
-        self.init_kernel_type()
-        self.init_pool_type()
-        self.init_ceil_mode()
-        self.init_exclusive()
-        self.init_adaptive()
-        if self.global_pool:
-            self.paddings = [0 for _ in range(len(self.paddings))]
-        input = np.random.random(self.shape).astype(self.dtype)
-        output = self.pool2D_forward_naive(
-            input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype)
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
-
-        self.attrs = {
-            'strides': self.strides,
-            'paddings': self.paddings,
-            'ksize': self.ksize,
-            'pooling_type': self.pool_type,
-            'global_pooling': self.global_pool,
-            'use_cudnn': self.use_cudnn,
-            'use_mkldnn': self.use_mkldnn,
-            'ceil_mode': self.ceil_mode,
-            'data_format':
-            'AnyLayout',  # TODO(dzhwinter) : should be fix latter
-            'exclusive': self.exclusive,
-            'adaptive': self.adaptive
-        }
-
-        self.outputs = {'Out': output}
-
-    def has_cudnn(self):
-        return core.is_compiled_with_cuda() and self.use_cudnn
-
-    def test_check_output(self):
-        if self.has_cudnn():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            self.check_output()
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        if self.has_cudnn() and self.pool_type != "max":
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, set(['X']), 'Out', max_relative_error=0.07)
-        elif self.pool_type != "max":
-            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
-
-    def init_test_case(self):
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
-
-    def init_kernel_type(self):
-        pass
-
-    def init_data_type(self):
-        self.dtype = np.float32
-
-    def init_pool_type(self):
-        self.pool_type = "avg"
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-
-    def init_global_pool(self):
-        self.global_pool = True
-
-    def init_ceil_mode(self):
-        self.ceil_mode = False
-
-    def init_exclusive(self):
-        self.exclusive = True
-
-    def init_adaptive(self):
-        self.adaptive = False
-
-
-class TestCase1(TestPool2D_Op):
-    def init_test_case(self):
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
-
-    def init_pool_type(self):
-        self.pool_type = "avg"
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-
-    def init_global_pool(self):
-        self.global_pool = False
-
-
-class TestCase2(TestPool2D_Op):
-    def init_test_case(self):
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
-
-    def init_pool_type(self):
-        self.pool_type = "avg"
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-
-    def init_global_pool(self):
-        self.global_pool = False
-
-
-class TestCase3(TestPool2D_Op):
-    def init_pool_type(self):
-        self.pool_type = "max"
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-
-
-class TestCase4(TestCase1):
-    def init_pool_type(self):
-        self.pool_type = "max"
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-
-
-class TestCase5(TestCase2):
-    def init_pool_type(self):
-        self.pool_type = "max"
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-
-
-#--------------------test pool2d cudnn--------------------
-
-
-def create_test_cudnn_class(parent):
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "core is not compiled with CUDA")
-    class TestCUDNNCase(parent):
-        def init_kernel_type(self):
-            self.use_cudnn = True
-
-    cls_name = "{0}_{1}".format(parent.__name__, "CUDNNOp")
-    TestCUDNNCase.__name__ = cls_name
-    globals()[cls_name] = TestCUDNNCase
-
-
-create_test_cudnn_class(TestPool2D_Op)
-create_test_cudnn_class(TestCase1)
-create_test_cudnn_class(TestCase2)
-create_test_cudnn_class(TestCase3)
-create_test_cudnn_class(TestCase4)
-create_test_cudnn_class(TestCase5)
-
-#--------------------test pool2d cudnn_fp16--------------------
-
-
-def create_test_cudnn_fp16_class(parent, check_grad=True):
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "core is not compiled with CUDA")
-    class TestCUDNNFp16Case(parent):
-        def init_kernel_type(self):
-            self.use_cudnn = True
-            self.dtype = np.float16
-
-        def test_check_output(self):
-            if core.is_compiled_with_cuda():
-                place = core.CUDAPlace(0)
-                if core.is_float16_supported(place):
-                    self.check_output_with_place(place, atol=1e-3)
-
-        def test_check_grad(self):
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(
-                    place) and self.pool_type != "max" and check_grad:
-                self.check_grad_with_place(
-                    place, set(['X']), 'Out', max_relative_error=0.07)
-
-    cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16Op")
-    TestCUDNNFp16Case.__name__ = cls_name
-    globals()[cls_name] = TestCUDNNFp16Case
-
-
-create_test_cudnn_fp16_class(TestPool2D_Op)
-create_test_cudnn_fp16_class(TestCase1, check_grad=False)
-create_test_cudnn_fp16_class(TestCase2)
-create_test_cudnn_fp16_class(TestCase3)
-create_test_cudnn_fp16_class(TestCase4)
-create_test_cudnn_fp16_class(TestCase5)
-
-#--------------------test pool2d use ceil mode--------------------
-
-
-def create_test_cudnn_use_ceil_class(parent):
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "core is not compiled with CUDA")
-    class TestPool2DUseCeilCase(parent):
-        def init_kernel_type(self):
-            self.use_cudnn = True
-
-        def init_ceil_mode(self):
-            self.ceil_mode = True
-
-    cls_name = "{0}_{1}".format(parent.__name__, "CUDNNOpCeilMode")
-    TestPool2DUseCeilCase.__name__ = cls_name
-    globals()[cls_name] = TestPool2DUseCeilCase
-
-
-create_test_cudnn_use_ceil_class(TestPool2D_Op)
-create_test_cudnn_use_ceil_class(TestCase1)
-
-
-def create_test_use_ceil_class(parent):
-    class TestPool2DUseCeilCase(parent):
-        def init_ceil_mode(self):
-            self.ceil_mode = True
-
-    cls_name = "{0}_{1}".format(parent.__name__, "CeilModeCast")
-    TestPool2DUseCeilCase.__name__ = cls_name
-    globals()[cls_name] = TestPool2DUseCeilCase
-
-
-create_test_use_ceil_class(TestCase1)
-create_test_use_ceil_class(TestCase2)
-
-
-class TestAvgInclude(TestCase2):
-    def init_exclusive(self):
-        self.exclusive = False
-
-
-class TestCUDNNAvgInclude(TestCase2):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-    def init_exclusive(self):
-        self.exclusive = False
-
-
-class TestAvgPoolAdaptive(TestCase1):
-    def init_adaptive(self):
-        self.adaptive = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
deleted file mode 100644
index 5898c5a67eebefee035657e704144f8d594530c1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ /dev/null
@@ -1,408 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from __future__ import division
-
-import unittest
-import numpy as np
-
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-def adaptive_start_index(index, input_size, output_size):
-    return int(np.floor(index * input_size / output_size))
-
-
-def adaptive_end_index(index, input_size, output_size):
-    return int(np.ceil((index + 1) * input_size / output_size))
-
-
-def max_pool3D_forward_naive(x,
-                             ksize,
-                             strides,
-                             paddings,
-                             global_pool=0,
-                             ceil_mode=False,
-                             exclusive=True,
-                             adaptive=False):
-    N, C, D, H, W = x.shape
-    if global_pool == 1:
-        ksize = [D, H, W]
-    if adaptive:
-        D_out, H_out, W_out = ksize
-    else:
-        D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
-                     H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
-                 ) // strides[1] + 1 if ceil_mode else (
-                     W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
-        W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
-                 ) // strides[2] + 1 if ceil_mode else (
-                     W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
-    out = np.zeros((N, C, D_out, H_out, W_out))
-    for k in range(D_out):
-        if adaptive:
-            d_start = adaptive_start_index(k, D, ksize[0])
-            d_end = adaptive_end_index(k, D, ksize[0])
-        else:
-            d_start = np.max((k * strides[0] - paddings[0], 0))
-            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
-        for i in range(H_out):
-            if adaptive:
-                h_start = adaptive_start_index(i, H, ksize[1])
-                h_end = adaptive_end_index(i, H, ksize[1])
-            else:
-                h_start = np.max((i * strides[1] - paddings[1], 0))
-                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
-            for j in range(W_out):
-                if adaptive:
-                    w_start = adaptive_start_index(j, W, ksize[2])
-                    w_end = adaptive_end_index(j, W, ksize[2])
-                else:
-                    w_start = np.max((j * strides[2] - paddings[2], 0))
-                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
-                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
-
-                out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
-    return out
-
-
-def avg_pool3D_forward_naive(x,
-                             ksize,
-                             strides,
-                             paddings,
-                             global_pool=0,
-                             ceil_mode=False,
-                             exclusive=True,
-                             adaptive=False):
-    N, C, D, H, W = x.shape
-    if global_pool == 1:
-        ksize = [D, H, W]
-    if adaptive:
-        D_out, H_out, W_out = ksize
-    else:
-        D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
-                     H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
-                 ) // strides[1] + 1 if ceil_mode else (
-                     W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
-        W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
-                 ) // strides[2] + 1 if ceil_mode else (
-                     W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
-    out = np.zeros((N, C, D_out, H_out, W_out))
-    for k in range(D_out):
-        if adaptive:
-            d_start = adaptive_start_index(k, D, ksize[0])
-            d_end = adaptive_end_index(k, D, ksize[0])
-        else:
-            d_start = np.max((k * strides[0] - paddings[0], 0))
-            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
-        for i in range(H_out):
-            if adaptive:
-                h_start = adaptive_start_index(i, H, ksize[1])
-                h_end = adaptive_end_index(i, H, ksize[1])
-            else:
-                h_start = np.max((i * strides[1] - paddings[1], 0))
-                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
-            for j in range(W_out):
-                if adaptive:
-                    w_start = adaptive_start_index(j, W, ksize[2])
-                    w_end = adaptive_end_index(j, W, ksize[2])
-                else:
-                    w_start = np.max((j * strides[2] - paddings[2], 0))
-                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
-                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
-
-                field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
-                             if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2]
-                out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3,
-                                                            4)) / field_size
-    return out
-
-
-class TestPool3d_Op(OpTest):
-    def setUp(self):
-        self.op_type = "pool3d"
-        self.use_cudnn = False
-        self.dtype = np.float32
-        self.init_test_case()
-        self.init_global_pool()
-        self.init_kernel_type()
-        self.init_pool_type()
-        self.init_ceil_mode()
-        self.init_exclusive()
-        self.init_adaptive()
-
-        if self.global_pool:
-            self.paddings = [0 for _ in range(len(self.paddings))]
-        input = np.random.random(self.shape).astype(self.dtype)
-        output = self.pool3D_forward_naive(
-            input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype)
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
-
-        self.attrs = {
-            'strides': self.strides,
-            'paddings': self.paddings,
-            'ksize': self.ksize,
-            'pooling_type': self.pool_type,
-            'global_pooling': self.global_pool,
-            'use_cudnn': self.use_cudnn,
-            'ceil_mode': self.ceil_mode,
-            'data_format':
-            'AnyLayout',  # TODO(dzhwinter) : should be fix latter
-            'exclusive': self.exclusive,
-            'adaptive': self.adaptive
-        }
-
-        self.outputs = {'Out': output}
-
-    def has_cudnn(self):
-        return core.is_compiled_with_cuda() and self.use_cudnn
-
-    def test_check_output(self):
-        if self.has_cudnn():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            self.check_output()
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        if self.has_cudnn() and self.pool_type != "max":
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, set(['X']), 'Out', max_relative_error=0.07)
-        elif self.pool_type != "max":
-            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
-
-    def init_test_case(self):
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [0, 0, 0]
-
-    def init_kernel_type(self):
-        pass
-
-    def init_pool_type(self):
-        self.pool_type = "avg"
-        self.pool3D_forward_naive = avg_pool3D_forward_naive
-
-    def init_global_pool(self):
-        self.global_pool = True
-
-    def init_ceil_mode(self):
-        self.ceil_mode = False
-
-    def init_exclusive(self):
-        self.exclusive = True
-
-    def init_adaptive(self):
-        self.adaptive = False
-
-
-class TestCase1(TestPool3d_Op):
-    def init_test_case(self):
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [0, 0, 0]
-
-    def init_pool_type(self):
-        self.pool_type = "avg"
-        self.pool3D_forward_naive = avg_pool3D_forward_naive
-
-    def init_global_pool(self):
-        self.global_pool = False
-
-
-class TestCase2(TestPool3d_Op):
-    def init_test_case(self):
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
-
-    def init_pool_type(self):
-        self.pool_type = "avg"
-        self.pool3D_forward_naive = avg_pool3D_forward_naive
-
-    def init_global_pool(self):
-        self.global_pool = False
-
-
-class TestCase3(TestPool3d_Op):
-    def init_pool_type(self):
-        self.pool_type = "max"
-        self.pool3D_forward_naive = max_pool3D_forward_naive
-
-
-class TestCase4(TestCase1):
-    def init_pool_type(self):
-        self.pool_type = "max"
-        self.pool3D_forward_naive = max_pool3D_forward_naive
-
-
-class TestCase5(TestCase2):
-    def init_pool_type(self):
-        self.pool_type = "max"
-        self.pool3D_forward_naive = max_pool3D_forward_naive
-
-
-#--------------------test pool3d--------------------
-class TestCUDNNCase1(TestPool3d_Op):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16CUDNNCase1(TestPool3d_Op):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-
-class TestCUDNNCase2(TestCase1):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16CUDNNCase2(TestCase1):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-
-class TestCUDNNCase3(TestCase2):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16CUDNNCase3(TestCase2):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-
-class TestCUDNNCase4(TestCase3):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16CUDNNCase4(TestCase3):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-
-class TestCUDNNCase5(TestCase4):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16CUDNNCase5(TestCase4):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-
-class TestCUDNNCase6(TestCase5):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-class TestFP16CUDNNCase6(TestCase5):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-
-class TestCeilModeCase1(TestCUDNNCase1):
-    def init_ceil_mode(self):
-        self.ceil_mode = True
-
-
-class TestCeilModeCase2(TestCUDNNCase2):
-    def init_ceil_mode(self):
-        self.ceil_mode = True
-
-
-class TestCeilModeCase3(TestCase1):
-    def init_ceil_mode(self):
-        self.ceil_mode = True
-
-
-class TestCeilModeCase4(TestCase2):
-    def init_ceil_mode(self):
-        self.ceil_mode = True
-
-
-class TestAvgInclude(TestCase2):
-    def init_exclusive(self):
-        self.exclusive = False
-
-
-class TestCUDNNAvgInclude(TestCUDNNCase3):
-    def init_exclusive(self):
-        self.exclusive = False
-
-
-class TestAvgPoolAdaptive(TestCase1):
-    def init_adaptive(self):
-        self.adaptive = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
deleted file mode 100644
index 6575c408eeaa43d4f7caf257b2ebd77a942aecda..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py
+++ /dev/null
@@ -1,253 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from __future__ import division
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def adaptive_start_index(index, input_size, output_size):
-    return int(np.floor(index * input_size / output_size))
-
-
-def adaptive_end_index(index, input_size, output_size):
-    return int(np.ceil((index + 1) * input_size / output_size))
-
-
-def max_pool3D_forward_naive(x,
-                             ksize,
-                             strides,
-                             paddings,
-                             global_pool=False,
-                             adaptive=False):
-
-    N, C, D, H, W = x.shape
-    if global_pool:
-        ksize = [D, H, W]
-        paddings = [0, 0, 0]
-
-    if adaptive:
-        D_out, H_out, W_out = ksize
-    else:
-        D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
-        W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
-    out = np.zeros((N, C, D_out, H_out, W_out))
-    mask = np.zeros((N, C, D_out, H_out, W_out))
-    for k in range(D_out):
-        if adaptive:
-            d_start = adaptive_start_index(k, D, ksize[0])
-            d_end = adaptive_end_index(k, D, ksize[0])
-        else:
-            d_start = np.max((k * strides[0] - paddings[0], 0))
-            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
-        for i in range(H_out):
-            if adaptive:
-                h_start = adaptive_start_index(i, H, ksize[1])
-                h_end = adaptive_end_index(i, H, ksize[1])
-            else:
-                h_start = np.max((i * strides[1] - paddings[1], 0))
-                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
-            for j in range(W_out):
-                if adaptive:
-                    w_start = adaptive_start_index(j, W, ksize[2])
-                    w_end = adaptive_end_index(j, W, ksize[2])
-                else:
-                    w_start = np.max((j * strides[2] - paddings[2], 0))
-                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
-                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
-
-                out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
-
-                for n in range(N):
-                    for c in range(C):
-                        arr = x_masked[n, c, :, :, :]
-                        index = np.where(arr == np.max(arr))
-                        sub_deep = index[0][0]
-                        sub_row = index[1][0]
-                        sub_col = index[2][0]
-                        index = ((d_start + sub_deep) * H +
-                                 (h_start + sub_row)) * W + w_start + sub_col
-                        mask[n, c, k, i, j] = index
-
-    return out, mask
-
-
-def max_pool2D_forward_naive(x,
-                             ksize,
-                             strides,
-                             paddings,
-                             global_pool=False,
-                             adaptive=False):
-
-    N, C, H, W = x.shape
-    if global_pool:
-        ksize = [H, W]
-        paddings = [0, 0]
-
-    if adaptive:
-        H_out, W_out = ksize
-    else:
-        H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
-    out = np.zeros((N, C, H_out, W_out))
-    mask = np.zeros((N, C, H_out, W_out))
-    for i in range(H_out):
-        for j in range(W_out):
-            if adaptive:
-                r_start = adaptive_start_index(i, H, ksize[0])
-                r_end = adaptive_end_index(i, H, ksize[0])
-                c_start = adaptive_start_index(j, W, ksize[1])
-                c_end = adaptive_end_index(j, W, ksize[1])
-            else:
-                r_start = np.max((i * strides[0] - paddings[0], 0))
-                r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-                c_start = np.max((j * strides[1] - paddings[1], 0))
-                c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
-            x_masked = x[:, :, r_start:r_end, c_start:c_end]
-
-            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
-
-            for n in range(N):
-                for c in range(C):
-                    arr = x_masked[n, c, :, :]
-                    index = np.where(arr == np.max(arr))
-                    sub_row = index[0][0]
-                    sub_col = index[1][0]
-                    index = (r_start + sub_row) * W + c_start + sub_col
-                    mask[n, c, i, j] = index
-
-    return out, mask
-
-
-class TestMaxPoolWithIndex_Op(OpTest):
-    def setUp(self):
-        self.init_test_case()
-        self.init_global()
-        self.init_adaptive()
-
-        input = np.random.random(self.shape).astype("float32")
-        output, mask = self.pool_forward_naive(input, self.ksize, self.strides,
-                                               self.paddings, self.global_pool,
-                                               self.adaptive)
-        output = output.astype("float32")
-        mask = mask.astype("int32")
-
-        self.attrs = {
-            'strides': self.strides,
-            'paddings': self.paddings,
-            'ksize': self.ksize,
-            'global_pooling': self.global_pool,
-            'adaptive': self.adaptive,
-        }
-
-        self.inputs = {'X': input}
-        self.outputs = {'Out': output, "Mask": mask}
-
-    def test_check_output(self):
-        self.check_output()
-
-    # def test_check_grad(self):
-    #     self.check_grad(set(['X']), ['Out'], max_relative_error=0.07)
-
-    def init_test_case(self):
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
-
-    def init_global(self):
-        self.global_pool = False
-
-    def init_adaptive(self):
-        self.adaptive = False
-
-
-class TestCase1(TestMaxPoolWithIndex_Op):
-    def init_global(self):
-        self.global_pool = True
-
-
-class TestCase2(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [2, 2, 2]
-        self.paddings = [0, 0, 0]
-
-    def init_global(self):
-        self.global_pool = True
-
-
-class TestCase3(TestCase2):
-    def init_global(self):
-        self.global_pool = False
-
-
-#----------------max_pool2d_with_index----------------
-class TestCase4(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.op_type = "max_pool2d_with_index"
-        self.pool_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
-
-    def init_global(self):
-        self.global_pool = True
-
-
-class TestCase5(TestCase4):
-    def init_global(self):
-        self.global_pool = False
-
-
-class TestCase6(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.op_type = "max_pool2d_with_index"
-        self.pool_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [2, 2]
-        self.paddings = [0, 0]
-
-    def init_global(self):
-        self.global_pool = True
-
-
-class TestCase7(TestCase6):
-    def init_global(self):
-        self.global_pool = False
-
-
-class TestCastAdaptive2d(TestCase6):
-    def init_adaptive(self):
-        self.adaptive = True
-
-
-class TestCastAdaptive3d(TestMaxPoolWithIndex_Op):
-    def init_adaptive(self):
-        self.adaptive = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
deleted file mode 100644
index afe8d212d6ec218c3799780849c377e46a44bd6c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import itertools
-import numpy as np
-import six
-from op_test import OpTest
-
-
-def py_pnpair_op(score, label, query, column=-1, weight=None):
-    # group by query id
-    predictions = {}
-    batch_size = label.shape[0]
-    if weight is None:
-        weight = np.ones(shape=(batch_size, 1)).astype('float32')
-    for s, l, q, w in zip(score, label, query, weight):
-        s, l, q, w = s[column], l[0], q[0], w[0]
-        if q not in predictions:
-            predictions[q] = []
-        predictions[q].append((s, l, w))
-
-    # accumulate statistics
-    pos, neg, neu = 0, 0, 0
-    for _, ranks in six.iteritems(predictions):
-        for e1, e2 in itertools.combinations(ranks, 2):
-            s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2]
-            w = (w1 + w2) * 0.5
-            if l1 == l2:
-                continue
-            if s1 == s2:
-                neu += w
-            elif (s1 - s2) * (l1 - l2) > 0:
-                pos += w
-            else:
-                neg += w
-
-    return np.array(pos).astype('float32'), np.array(neg).astype(
-        'float32'), np.array(neu).astype('float32')
-
-
-class TestPositiveNegativePairOp(OpTest):
-    def setUp(self):
-        self.op_type = 'positive_negative_pair'
-        batch_size = 20
-        max_query_id = 5
-        score = np.random.normal(size=(batch_size, 1)).astype('float32')
-        label = np.random.normal(size=(batch_size, 1)).astype('float32')
-        query = np.array(
-            [np.random.randint(max_query_id) for i in range(batch_size)])
-        query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
-
-        pos, neg, neu = py_pnpair_op(score, label, query)
-        self.inputs = {'Score': score, 'Label': label, 'QueryID': query}
-        self.attrs = {'column': -1}
-        self.outputs = {
-            'PositivePair': pos,
-            'NegativePair': neg,
-            'NeutralPair': neu
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestPositiveNegativePairOpAccumulateWeight(OpTest):
-    def setUp(self):
-        self.op_type = 'positive_negative_pair'
-        batch_size = 20
-        max_query_id = 5
-        max_random_num = 2 << 15
-        score_dim = 2
-        score = np.random.normal(size=(batch_size, 2)).astype('float32')
-        label = np.random.normal(size=(batch_size, 1)).astype('float32')
-        weight = np.random.normal(size=(batch_size, 1)).astype('float32')
-        query = np.array(
-            [np.random.randint(max_query_id) for i in range(batch_size)])
-        query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
-        acc_pos = np.reshape(
-            np.random.randint(max_random_num), newshape=(1)).astype('float32')
-        acc_neg = np.reshape(
-            np.random.randint(max_random_num), newshape=(1)).astype('float32')
-        acc_neu = np.reshape(
-            np.random.randint(max_random_num), newshape=(1)).astype('float32')
-        column = np.random.randint(score_dim)
-
-        pos, neg, neu = py_pnpair_op(
-            score, label, query, column=column, weight=weight)
-        self.inputs = {
-            'Score': score,
-            'Label': label,
-            'QueryID': query,
-            'AccumulatePositivePair': acc_pos,
-            'AccumulateNegativePair': acc_neg,
-            'AccumulateNeutralPair': acc_neu,
-            'Weight': weight
-        }
-        self.attrs = {'column': column}
-        self.outputs = {
-            'PositivePair': pos + acc_pos,
-            'NegativePair': neg + acc_neg,
-            'NeutralPair': neu + acc_neu
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
deleted file mode 100644
index 645637625959f214db3875bc58e4c593c27ae8f6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
+++ /dev/null
@@ -1,189 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def calc_precision(tp_count, fp_count):
-    if tp_count > 0.0 or fp_count > 0.0:
-        return tp_count / (tp_count + fp_count)
-    return 1.0
-
-
-def calc_recall(tp_count, fn_count):
-    if tp_count > 0.0 or fn_count > 0.0:
-        return tp_count / (tp_count + fn_count)
-    return 1.0
-
-
-def calc_f1_score(precision, recall):
-    if precision > 0.0 or recall > 0.0:
-        return 2 * precision * recall / (precision + recall)
-    return 0.0
-
-
-def get_states(idxs, labels, cls_num, weights=None):
-    ins_num = idxs.shape[0]
-    # TP FP TN FN
-    states = np.zeros((cls_num, 4)).astype('float32')
-    for i in range(ins_num):
-        w = weights[i] if weights is not None else 1.0
-        idx = idxs[i][0]
-        label = labels[i][0]
-        if idx == label:
-            states[idx][0] += w
-            for j in range(cls_num):
-                states[j][2] += w
-            states[idx][2] -= w
-        else:
-            states[label][3] += w
-            states[idx][1] += w
-            for j in range(cls_num):
-                states[j][2] += w
-            states[label][2] -= w
-            states[idx][2] -= w
-    return states
-
-
-def compute_metrics(states, cls_num):
-    total_tp_count = 0.0
-    total_fp_count = 0.0
-    total_fn_count = 0.0
-    macro_avg_precision = 0.0
-    macro_avg_recall = 0.0
-    for i in range(cls_num):
-        total_tp_count += states[i][0]
-        total_fp_count += states[i][1]
-        total_fn_count += states[i][3]
-        macro_avg_precision += calc_precision(states[i][0], states[i][1])
-        macro_avg_recall += calc_recall(states[i][0], states[i][3])
-    metrics = []
-    macro_avg_precision /= cls_num
-    macro_avg_recall /= cls_num
-    metrics.append(macro_avg_precision)
-    metrics.append(macro_avg_recall)
-    metrics.append(calc_f1_score(macro_avg_precision, macro_avg_recall))
-    micro_avg_precision = calc_precision(total_tp_count, total_fp_count)
-    metrics.append(micro_avg_precision)
-    micro_avg_recall = calc_recall(total_tp_count, total_fn_count)
-    metrics.append(micro_avg_recall)
-    metrics.append(calc_f1_score(micro_avg_precision, micro_avg_recall))
-    return np.array(metrics).astype('float32')
-
-
-class TestPrecisionRecallOp_0(OpTest):
-    def setUp(self):
-        self.op_type = "precision_recall"
-        ins_num = 64
-        cls_num = 10
-        max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        idxs = np.random.choice(range(cls_num), ins_num).reshape(
-            (ins_num, 1)).astype('int32')
-        labels = np.random.choice(range(cls_num), ins_num).reshape(
-            (ins_num, 1)).astype('int32')
-        states = get_states(idxs, labels, cls_num)
-        metrics = compute_metrics(states, cls_num)
-
-        self.attrs = {'class_number': cls_num}
-
-        self.inputs = {'MaxProbs': max_probs, 'Indices': idxs, 'Labels': labels}
-
-        self.outputs = {
-            'BatchMetrics': metrics,
-            'AccumMetrics': metrics,
-            'AccumStatesInfo': states
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestPrecisionRecallOp_1(OpTest):
-    def setUp(self):
-        self.op_type = "precision_recall"
-        ins_num = 64
-        cls_num = 10
-        max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        idxs = np.random.choice(range(cls_num), ins_num).reshape(
-            (ins_num, 1)).astype('int32')
-        weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        labels = np.random.choice(range(cls_num), ins_num).reshape(
-            (ins_num, 1)).astype('int32')
-
-        states = get_states(idxs, labels, cls_num, weights)
-        metrics = compute_metrics(states, cls_num)
-
-        self.attrs = {'class_number': cls_num}
-
-        self.inputs = {
-            'MaxProbs': max_probs,
-            'Indices': idxs,
-            'Labels': labels,
-            'Weights': weights
-        }
-
-        self.outputs = {
-            'BatchMetrics': metrics,
-            'AccumMetrics': metrics,
-            'AccumStatesInfo': states
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestPrecisionRecallOp_2(OpTest):
-    def setUp(self):
-        self.op_type = "precision_recall"
-        ins_num = 64
-        cls_num = 10
-        max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        idxs = np.random.choice(range(cls_num), ins_num).reshape(
-            (ins_num, 1)).astype('int32')
-        weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        labels = np.random.choice(range(cls_num), ins_num).reshape(
-            (ins_num, 1)).astype('int32')
-        states = np.random.randint(0, 30, (cls_num, 4)).astype('float32')
-
-        accum_states = get_states(idxs, labels, cls_num, weights)
-        batch_metrics = compute_metrics(accum_states, cls_num)
-        accum_states += states
-        accum_metrics = compute_metrics(accum_states, cls_num)
-
-        self.attrs = {'class_number': cls_num}
-
-        self.inputs = {
-            'MaxProbs': max_probs,
-            'Indices': idxs,
-            'Labels': labels,
-            'Weights': weights,
-            'StatesInfo': states
-        }
-
-        self.outputs = {
-            'BatchMetrics': batch_metrics,
-            'AccumMetrics': accum_metrics,
-            'AccumStatesInfo': accum_states
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py
deleted file mode 100644
index 48a6b0577b6787d2e1231fdcbe6d2c1bb46414ed..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import six
-from op_test import OpTest
-
-
-class PReluTest(OpTest):
-    def setUp(self):
-        self.op_type = "prelu"
-        self.initTestCase()
-        x_np = np.random.normal(size=(3, 5, 5, 10)).astype("float32")
-
-        # Since zero point in prelu is not differentiable, avoid randomize
-        # zero.
-        x_np[np.abs(x_np) < 0.005] = 0.02
-
-        if self.attrs == {'mode': "all"}:
-            alpha_np = np.random.rand(1).astype("float32")
-            self.inputs = {'X': x_np, 'Alpha': alpha_np}
-        elif self.attrs == {'mode': "channel"}:
-            alpha_np = np.random.rand(1, x_np.shape[1], 1, 1).astype("float32")
-            self.inputs = {'X': x_np, 'Alpha': alpha_np}
-        else:
-            alpha_np = np.random.rand(*x_np.shape).astype("float32")
-            self.inputs = {'X': x_np, 'Alpha': alpha_np}
-
-        out_np = np.maximum(self.inputs['X'], 0.)
-        out_np = out_np + np.minimum(self.inputs['X'],
-                                     0.) * self.inputs['Alpha']
-        assert out_np is not self.inputs['X']
-        self.outputs = {'Out': out_np}
-
-    def initTestCase(self):
-        self.attrs = {'mode': "channel"}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_1_ignore_x(self):
-        self.check_grad(['Alpha'], 'Out', no_grad_set=set('X'))
-
-    def test_check_grad_2(self):
-        self.check_grad(['X', 'Alpha'], 'Out')
-
-    def test_check_grad_3_ignore_alpha(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('Alpha'))
-
-
-# TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
-if six.PY2:
-
-    class TestCase1(PReluTest):
-        def initTestCase(self):
-            self.attrs = {'mode': "all"}
-
-    class TestCase2(PReluTest):
-        def initTestCase(self):
-            self.attrs = {'mode': "channel"}
-
-    class TestCase3(PReluTest):
-        def initTestCase(self):
-            self.attrs = {'mode': "element"}
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
deleted file mode 100644
index 0fc11ef8d9220dcc6875b6df2a3e527244872e11..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-from paddle.fluid.executor import Executor
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid.backward import append_backward
-from paddle.fluid.framework import switch_main_program
-from paddle.fluid.framework import Program
-import numpy as np
-from simple_nets import simple_fc_net, init_data
-
-
-class TestPrintOpCPU(unittest.TestCase):
-    def setUp(self):
-        self.place = core.CPUPlace()
-        self.x_tensor = core.LoDTensor()
-        tensor_np = np.random.random(size=(2, 3)).astype('float32')
-        self.x_tensor.set(tensor_np, self.place)
-        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
-
-    def build_network(self, only_forward, **kargs):
-        x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
-        x.stop_gradient = False
-        layers.Print(input=x, **kargs)
-        loss = layers.mean(x)
-        append_backward(loss=loss)
-        return loss
-
-    def test_forward(self):
-        switch_main_program(Program())
-        printed = self.build_network(True, print_phase='forward')
-        exe = Executor(self.place)
-        outs = exe.run(feed={'x': self.x_tensor},
-                       fetch_list=[printed],
-                       return_numpy=False)
-
-    def test_backward(self):
-        switch_main_program(Program())
-        loss = self.build_network(False, print_phase='backward')
-        exe = Executor(self.place)
-        outs = exe.run(feed={'x': self.x_tensor},
-                       fetch_list=[loss],
-                       return_numpy=False)
-
-    def test_all_parameters(self):
-        x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
-        x.stop_gradient = False
-
-        for print_tensor_name in [True, False]:
-            for print_tensor_type in [True, False]:
-                for print_tensor_shape in [True, False]:
-                    for print_tensor_lod in [True, False]:
-                        layers.Print(
-                            input=x,
-                            print_tensor_name=print_tensor_name,
-                            print_tensor_type=print_tensor_type,
-                            print_tensor_shape=print_tensor_shape,
-                            print_tensor_lod=print_tensor_lod, )
-        loss = layers.mean(x)
-        append_backward(loss=loss)
-        exe = Executor(self.place)
-        outs = exe.run(feed={'x': self.x_tensor},
-                       fetch_list=[loss],
-                       return_numpy=False)
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestPrintOpGPU(TestPrintOpCPU):
-    def setUp(self):
-        self.place = core.CUDAPlace(0)
-        self.x_tensor = core.LoDTensor()
-        tensor_np = np.random.random(size=(2, 3)).astype('float32')
-        self.x_tensor.set(tensor_np, self.place)
-        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
-
-
-class TestPrintOpBackward(unittest.TestCase):
-    def check_backward(self, use_cuda):
-        main = fluid.Program()
-        startup = fluid.Program()
-
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net()
-            loss = fluid.layers.Print(loss)
-            fluid.optimizer.Adam().minimize(loss)
-
-        print_ops = [op for op in main.blocks[0].ops if op.type == u'print']
-        assert len(print_ops) == 2, "The number of print op should be 2"
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup)
-
-        binary = fluid.compiler.CompiledProgram(main).with_data_parallel(
-            loss_name=loss.name)
-
-        img, label = init_data()
-        feed_dict = {"image": img, "label": label}
-        exe.run(binary, feed_dict)
-
-    def test_fw_bw(self):
-        if core.is_compiled_with_cuda():
-            self.check_backward(use_cuda=True)
-        self.check_backward(use_cuda=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
deleted file mode 100644
index 7381b74af71051f8b993ba6d116b5282dd9b84e1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ /dev/null
@@ -1,191 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-from op_test import OpTest
-
-
-class TestPriorBoxOp(OpTest):
-    def set_data(self):
-        self.init_test_params()
-        self.init_test_input()
-        self.init_test_output()
-        self.inputs = {'Input': self.input, 'Image': self.image}
-
-        self.attrs = {
-            'min_sizes': self.min_sizes,
-            'aspect_ratios': self.aspect_ratios,
-            'variances': self.variances,
-            'flip': self.flip,
-            'clip': self.clip,
-            'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order,
-            'step_w': self.step_w,
-            'step_h': self.step_h,
-            'offset': self.offset
-        }
-        if len(self.max_sizes) > 0:
-            self.attrs['max_sizes'] = self.max_sizes
-
-        self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "prior_box"
-        self.set_data()
-
-    def set_max_sizes(self):
-        max_sizes = [5, 10]
-        self.max_sizes = np.array(max_sizes).astype('float32').tolist()
-
-    def set_min_max_aspect_ratios_order(self):
-        self.min_max_aspect_ratios_order = False
-
-    def init_test_params(self):
-        self.layer_w = 32
-        self.layer_h = 32
-
-        self.image_w = 40
-        self.image_h = 40
-
-        self.step_w = float(self.image_w) / float(self.layer_w)
-        self.step_h = float(self.image_h) / float(self.layer_h)
-
-        self.input_channels = 2
-        self.image_channels = 3
-        self.batch_size = 10
-
-        self.min_sizes = [2, 4]
-        self.min_sizes = np.array(self.min_sizes).astype('float32').tolist()
-        self.set_max_sizes()
-        self.aspect_ratios = [2.0, 3.0]
-        self.flip = True
-        self.set_min_max_aspect_ratios_order()
-        self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
-        self.aspect_ratios = np.array(
-            self.aspect_ratios, dtype=np.float).flatten()
-        self.variances = [0.1, 0.1, 0.2, 0.2]
-        self.variances = np.array(self.variances, dtype=np.float).flatten()
-
-        self.clip = True
-        self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
-        if len(self.max_sizes) > 0:
-            self.num_priors += len(self.max_sizes)
-        self.offset = 0.5
-
-    def init_test_input(self):
-        self.image = np.random.random(
-            (self.batch_size, self.image_channels, self.image_w,
-             self.image_h)).astype('float32')
-
-        self.input = np.random.random(
-            (self.batch_size, self.input_channels, self.layer_w,
-             self.layer_h)).astype('float32')
-
-    def init_test_output(self):
-        out_dim = (self.layer_h, self.layer_w, self.num_priors, 4)
-        out_boxes = np.zeros(out_dim).astype('float32')
-        out_var = np.zeros(out_dim).astype('float32')
-
-        idx = 0
-        for h in range(self.layer_h):
-            for w in range(self.layer_w):
-                c_x = (w + self.offset) * self.step_w
-                c_y = (h + self.offset) * self.step_h
-                idx = 0
-                for s in range(len(self.min_sizes)):
-                    min_size = self.min_sizes[s]
-                    if not self.min_max_aspect_ratios_order:
-                        # rest of priors
-                        for r in range(len(self.real_aspect_ratios)):
-                            ar = self.real_aspect_ratios[r]
-                            c_w = min_size * math.sqrt(ar) / 2
-                            c_h = (min_size / math.sqrt(ar)) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
-                            idx += 1
-
-                        if len(self.max_sizes) > 0:
-                            max_size = self.max_sizes[s]
-                            # second prior: aspect_ratio = 1,
-                            c_w = c_h = math.sqrt(min_size * max_size) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
-                            idx += 1
-                    else:
-                        c_w = c_h = min_size / 2.
-                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
-                                                   (c_y - c_h) / self.image_h,
-                                                   (c_x + c_w) / self.image_w,
-                                                   (c_y + c_h) / self.image_h]
-                        idx += 1
-                        if len(self.max_sizes) > 0:
-                            max_size = self.max_sizes[s]
-                            # second prior: aspect_ratio = 1,
-                            c_w = c_h = math.sqrt(min_size * max_size) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
-                            idx += 1
-
-                        # rest of priors
-                        for r in range(len(self.real_aspect_ratios)):
-                            ar = self.real_aspect_ratios[r]
-                            if abs(ar - 1.) < 1e-6:
-                                continue
-                            c_w = min_size * math.sqrt(ar) / 2
-                            c_h = (min_size / math.sqrt(ar)) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
-                            idx += 1
-
-        # clip the prior's coordidate such that it is within[0, 1]
-        if self.clip:
-            out_boxes = np.clip(out_boxes, 0.0, 1.0)
-        # set the variance.
-        out_var = np.tile(self.variances, (self.layer_h, self.layer_w,
-                                           self.num_priors, 1))
-        self.out_boxes = out_boxes.astype('float32')
-        self.out_var = out_var.astype('float32')
-
-
-class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp):
-    def set_max_sizes(self):
-        self.max_sizes = []
-
-
-class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp):
-    def set_min_max_aspect_ratios_order(self):
-        self.min_max_aspect_ratios_order = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
deleted file mode 100644
index 367b60831c5b1d0397b7729acf078513bb074299..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import os
-import tempfile
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
-
-
-class TestProfiler(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def net_profiler(self, state, use_parallel_executor=False):
-        profile_path = os.path.join(tempfile.gettempdir(), "profile")
-        open(profile_path, "w").write("")
-        startup_program = fluid.Program()
-        main_program = fluid.Program()
-
-        with fluid.program_guard(main_program, startup_program):
-            image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-            hidden1 = fluid.layers.fc(input=image, size=64, act='relu')
-            i = layers.zeros(shape=[1], dtype='int64')
-            counter = fluid.layers.zeros(
-                shape=[1], dtype='int64', force_cpu=True)
-            until = layers.fill_constant([1], dtype='int64', value=10)
-            data_arr = layers.array_write(hidden1, i)
-            cond = fluid.layers.less_than(x=counter, y=until)
-            while_op = fluid.layers.While(cond=cond)
-            with while_op.block():
-                hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu')
-                layers.array_write(hidden_n, i, data_arr)
-                fluid.layers.increment(x=counter, value=1, in_place=True)
-                layers.less_than(x=counter, y=until, cond=cond)
-
-            hidden_n = layers.array_read(data_arr, i)
-            hidden2 = fluid.layers.fc(input=hidden_n, size=64, act='relu')
-            predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
-            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(cost)
-            batch_size = fluid.layers.create_tensor(dtype='int64')
-            batch_acc = fluid.layers.accuracy(
-                input=predict, label=label, total=batch_size)
-
-        optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
-        opts = optimizer.minimize(avg_cost, startup_program=startup_program)
-
-        place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(startup_program)
-        if use_parallel_executor:
-            pe = fluid.ParallelExecutor(
-                state != 'CPU',
-                loss_name=avg_cost.name,
-                main_program=main_program)
-
-        pass_acc_calculator = fluid.average.WeightedAverage()
-        with profiler.profiler(state, 'total', profile_path) as prof:
-            for iter in range(10):
-                if iter == 2:
-                    profiler.reset_profiler()
-                x = np.random.random((32, 784)).astype("float32")
-                y = np.random.randint(0, 10, (32, 1)).astype("int64")
-
-                if use_parallel_executor:
-                    pe.run(feed={'x': x, 'y': y}, fetch_list=[avg_cost.name])
-                    continue
-                outs = exe.run(main_program,
-                               feed={'x': x,
-                                     'y': y},
-                               fetch_list=[avg_cost, batch_acc, batch_size])
-                acc = np.array(outs[1])
-                b_size = np.array(outs[2])
-                pass_acc_calculator.add(value=acc, weight=b_size)
-                pass_acc = pass_acc_calculator.eval()
-        data = open(profile_path, 'rb').read()
-        self.assertGreater(len(data), 0)
-        profile_pb = profiler_pb2.Profile()
-        profile_pb.ParseFromString(data)
-        self.assertGreater(len(profile_pb.events), 0)
-        for event in profile_pb.events:
-            if event.type == profiler_pb2.Event.GPUKernel:
-                if not event.detail_info and not event.name.startswith("MEM"):
-                    raise Exception(
-                        "Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
-                        % event.name)
-            elif event.type == profiler_pb2.Event.CPU and (
-                    event.name.startswith("Driver API") or
-                    event.name.startswith("Runtime API")):
-                print("Warning: unregister", event.name)
-
-    def test_cpu_profiler(self):
-        self.net_profiler('CPU')
-        self.net_profiler('CPU', use_parallel_executor=True)
-
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "profiler is enabled only with GPU")
-    def test_cuda_profiler(self):
-        self.net_profiler('GPU')
-        self.net_profiler('GPU', use_parallel_executor=True)
-
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "profiler is enabled only with GPU")
-    def test_all_profiler(self):
-        self.net_profiler('All')
-        self.net_profiler('All', use_parallel_executor=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py
deleted file mode 100644
index cb1d94809b4ba99fa9077f99b93689504415b71d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_program.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-
-from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name
-import paddle.fluid.layers as layers
-import paddle.fluid as fluid
-
-main_program = default_main_program()
-
-
-class TestProgram(unittest.TestCase):
-    def test_program(self):
-        b = main_program.current_block()
-        self.assertEqual(-1, b.parent_idx)
-        self.assertEqual(0, b.idx)
-
-        b = main_program._create_block()
-        self.assertEqual(1, b.idx)
-        self.assertEqual(0, b.parent_idx)
-
-        b = main_program._create_block()
-        self.assertEqual(2, b.idx)
-        self.assertEqual(1, b.parent_idx)
-
-        main_program._rollback()
-
-        b = main_program.current_block()
-        self.assertEqual(1, b.idx)
-        self.assertEqual(0, b.parent_idx)
-
-        b = main_program._create_block()
-        self.assertEqual(3, b.idx)
-        self.assertEqual(1, b.parent_idx)
-
-        main_program._rollback()
-        b = main_program.current_block()
-        self.assertEqual(1, b.idx)
-        self.assertEqual(0, b.parent_idx)
-
-    def test_program_clone(self):
-        prog = Program()
-
-        x = prog.global_block().create_var(
-            name='X', shape=[1000, 784], dtype='float32')
-
-        y = prog.global_block().create_var(
-            name='Y', shape=[784, 100], dtype='float32')
-        out = prog.global_block().create_var(name='Out', dtype='float32')
-        prog.global_block().append_op(
-            type="mul", inputs={'X': [x],
-                                'Y': [y]}, outputs={'Out': [out]})
-
-        # FIXME(yuyang18): We manual compare the output string, since the order
-        # of variable could be changed.
-        print(prog)
-        print(prog.clone())
-
-    def test_parse_program_from_string(self):
-        prog = Program()
-
-        x = prog.global_block().create_var(
-            name='X', shape=[1000, 784], dtype='float32')
-
-        y = prog.global_block().create_var(
-            name='Y', shape=[784, 100], dtype='float32')
-        out = prog.global_block().create_var(name='Out', dtype='float32')
-        prog.global_block().append_op(
-            type="mul", inputs={'X': [x],
-                                'Y': [y]}, outputs={'Out': [out]})
-
-        binary_str = prog.desc.serialize_to_string()
-        prog_restored = Program.parse_from_string(binary_str)
-
-        print(prog)
-        print(prog_restored)
-
-    def test_program_clone_with_parameter(self):
-        main_program = Program()
-        startup_program = Program()
-        with program_guard(main_program, startup_program):
-            d = layers.data(name='x', shape=[784], dtype='float32')
-            hidden = layers.fc(input=d, size=100)
-            layers.fc(input=hidden, size=100)
-
-        new_program = main_program.clone()
-        self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
-
-    def test_program_inference_optimize(self):
-        def net():
-            reader = fluid.layers.py_reader(
-                capacity=10,
-                shapes=[[-1, 10], [-1, 1]],
-                lod_levels=[0, 0],
-                dtypes=['float32', 'int64'],
-                use_double_buffer=True)
-            in_data, label = fluid.layers.read_file(reader)
-            predict_label = fluid.layers.fc(in_data, size=2, act='softmax')
-            loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=predict_label, label=label))
-
-            optimizer = fluid.optimizer.Adam()
-            optimizer.minimize(loss)
-
-        startup_program = fluid.Program()
-        main_program = fluid.Program()
-        with fluid.program_guard(main_program, startup_program):
-            net()
-        no_read_program = main_program._inference_optimize()
-        keep_read_program = main_program._inference_optimize(
-            prune_read_op=False)
-        no_read_ops = no_read_program.global_block().ops
-        keep_read_ops = keep_read_program.global_block().ops
-        self.assertEqual(len(keep_read_ops) - len(no_read_ops), 2)
-        self.assertEqual(keep_read_ops[0].type, 'create_double_buffer_reader')
-        self.assertEqual(keep_read_ops[1].type, 'read')
-
-        for i in range(len(no_read_ops)):
-            self.assertEqual(no_read_ops[i].type, keep_read_ops[i + 2].type)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_program_code.py b/python/paddle/fluid/tests/unittests/test_program_code.py
deleted file mode 100644
index 27b22ba9392b63c0ccd7904ff03d737b977cc9fc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_program_code.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import time
-import unittest
-from multiprocessing import Process
-import signal
-
-import numpy
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid.layers.io import ListenAndServ
-from paddle.fluid.layers.io import Recv
-from paddle.fluid.layers.io import Send
-import paddle.fluid.layers.ops as ops
-
-from paddle.fluid.transpiler.details import program_to_code
-
-
-class TestProgram2Code(unittest.TestCase):
-    def test_print(self):
-        place = fluid.CPUPlace()
-        self.init_serv(place)
-        self.init_client(place, 9123)
-
-    def init_serv(self, place):
-        main = fluid.Program()
-
-        with fluid.program_guard(main):
-            serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
-            with serv.do():
-                out_var = main.global_block().create_var(
-                    name="scale_0.tmp_0",
-                    psersistable=True,
-                    dtype="float32",
-                    shape=[32, 32])
-                x = layers.data(
-                    shape=[32, 32],
-                    dtype='float32',
-                    name="X",
-                    append_batch_size=False)
-                fluid.initializer.Constant(value=1.0)(x, main.global_block())
-                ops._scale(x=x, scale=10.0, out=out_var)
-
-        program_to_code(main)
-
-    def init_client(self, place, port):
-        main = fluid.Program()
-        with fluid.program_guard(main):
-            x = layers.data(
-                shape=[32, 32],
-                dtype='float32',
-                name='X',
-                append_batch_size=False)
-            fluid.initializer.Constant(value=2.3)(x, main.global_block())
-            get_var = main.global_block().create_var(
-                name="scale_0.tmp_0",  # server side var
-                dtype="float32",
-                persistable=False,
-                shape=[32, 32])
-            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
-            Send("127.0.0.1:%d" % port, [x])
-            o = Recv("127.0.0.1:%d" % port, [get_var])
-
-        program_to_code(main)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py
deleted file mode 100755
index ed259b12f038032d8c8c3e7e6c607d1791e80efe..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py
+++ /dev/null
@@ -1,212 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import contextlib
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from simple_nets import init_data, simple_fc_net, fc_with_batchnorm
-import seresnext_net
-from test_parallel_executor_transformer import transformer, get_feed_data_reader
-from fake_reader import fake_imdb_reader
-
-
-def lstm_net(use_feed):
-    dict_dim = 5147
-    emb_dim = 128
-    hid_dim = 128
-    hid_dim2 = 96
-    class_dim = 2
-    emb_lr = 30.0
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    emb = fluid.layers.embedding(
-        input=data,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
-    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
-    lstm_h, c = fluid.layers.dynamic_lstm(
-        input=fc0, size=hid_dim * 4, is_reverse=False)
-    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
-    lstm_max_tanh = fluid.layers.tanh(lstm_max)
-    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
-    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    return avg_cost
-
-
-def simple_fc_net_with_accuracy(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    accuracy_out = fluid.layers.accuracy(input=prediction, label=label, k=5)
-    return loss
-
-
-class TestProgramPruneBackward(unittest.TestCase):
-    def program_compare(self, program_a, program_b):
-        assert isinstance(
-            program_a, fluid.framework.
-            Program), "The first argument should be fluid.framework.Program."
-        assert isinstance(
-            program_b, fluid.framework.
-            Program), "The second argument should be fluid.framework Program."
-
-        self.assertEqual(len(program_a.blocks), len(program_b.blocks))
-        for idx in range(len(program_a.blocks)):
-            block_a = program_a.blocks[idx]
-            block_b = program_b.blocks[idx]
-            self.assertEqual(len(block_a.ops), len(block_b.ops))
-            self.assertEqual(len(block_a.vars), len(block_b.vars))
-            for op_idx in range(len(block_a.ops)):
-                self.assertEqual(block_a.ops[op_idx].type,
-                                 block_b.ops[op_idx].type)
-            for var_key in list(block_a.vars.keys()):
-                self.assertTrue(block_b.has_var(var_key))
-
-    def check_prune_correctness(self, method, feed_dict, optimizer):
-        loss = method(use_feed=False)
-
-        main_program = fluid.default_main_program()
-        test_prog_orig = main_program.clone(for_test=True)
-        optimizer().minimize(loss)
-        test_prog_prune = main_program.clone(for_test=True)
-        self.program_compare(test_prog_orig, test_prog_prune)
-
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        loss_data_prune, = exe.run(test_prog_prune,
-                                   feed=feed_dict,
-                                   fetch_list=[loss.name])
-        loss_data_orig, = exe.run(test_prog_orig,
-                                  feed=feed_dict,
-                                  fetch_list=[loss.name])
-        self.assertEqual(loss_data_orig, loss_data_prune)
-
-    def test_simple_fc_net(self):
-        def optimizer():
-            optimizer = fluid.optimizer.SGD(
-                learning_rate=0.001,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            return optimizer
-
-        with self.program_scope_guard():
-            img, label = init_data()
-            self.check_prune_correctness(
-                method=simple_fc_net,
-                feed_dict={"image": img,
-                           "label": label},
-                optimizer=optimizer)
-
-    def test_simple_fc_net_with_accuracy(self):
-        def optimizer():
-            optimizer = fluid.optimizer.SGD(
-                learning_rate=0.001,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            return optimizer
-
-        with self.program_scope_guard():
-            img, label = init_data()
-            self.check_prune_correctness(
-                method=simple_fc_net_with_accuracy,
-                feed_dict={"image": img,
-                           "label": label},
-                optimizer=optimizer)
-
-    def test_batchnorm_fc(self):
-        def optimizer():
-            optimizer = fluid.optimizer.SGD(
-                learning_rate=0.001,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            return optimizer
-
-        with self.program_scope_guard():
-            img, label = init_data()
-            self.check_prune_correctness(
-                method=fc_with_batchnorm,
-                feed_dict={"image": img,
-                           "label": label},
-                optimizer=optimizer)
-
-    def test_seresnet(self):
-        with self.program_scope_guard():
-            self.check_prune_correctness(
-                method=seresnext_net.model,
-                feed_dict=seresnext_net.feed_dict(use_cuda=False),
-                optimizer=seresnext_net.optimizer)
-
-    def test_transformer(self):
-        def optimizer():
-            optimizer = fluid.optimizer.Adam(
-                learning_rate=0.001,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            return optimizer
-
-        with self.program_scope_guard():
-            # the program argument is used to distinguish Program and CompiledProgram
-            feed_dict = get_feed_data_reader().get_next(
-                fluid.Executor(core.CPUPlace()), fluid.default_main_program())
-            self.check_prune_correctness(
-                method=transformer, feed_dict=feed_dict, optimizer=optimizer)
-
-    def test_lstm(self):
-        def optimizer():
-            optimizer = fluid.optimizer.Adagrad(
-                learning_rate=0.001,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            return optimizer
-
-        with self.program_scope_guard():
-            word_dict_size = 5147
-            reader = fake_imdb_reader(word_dict_size, 1)
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            feeder = fluid.DataFeeder(
-                feed_list=[data, label], place=core.CPUPlace())
-            feed_data = feeder.feed(reader())
-            self.check_prune_correctness(
-                method=lstm_net, feed_dict=feed_data, optimizer=optimizer)
-
-    @contextlib.contextmanager
-    def program_scope_guard(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf.py b/python/paddle/fluid/tests/unittests/test_protobuf.py
deleted file mode 100644
index 7b80927c48d02e83a9bfaac572c81a6a95a69c8c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_protobuf.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid.proto.framework_pb2 as framework_pb2
-import unittest
-
-
-class TestFrameworkProto(unittest.TestCase):
-    def test_all(self):
-        op_proto = framework_pb2.OpProto()
-        ipt0 = op_proto.inputs.add()
-        ipt0.name = "a"
-        ipt0.comment = "the input of cosine op"
-        ipt1 = op_proto.inputs.add()
-        ipt1.name = "b"
-        ipt1.comment = "the other input of cosine op"
-        opt = op_proto.outputs.add()
-        opt.name = "output"
-        opt.comment = "the output of cosine op"
-        op_proto.comment = "cosine op, output = scale*cos(a, b)"
-        attr = op_proto.attrs.add()
-        attr.name = "scale"
-        attr.comment = "scale of cosine op"
-        attr.type = framework_pb2.FLOAT
-        op_proto.type = "cos"
-        self.assertTrue(op_proto.IsInitialized())
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
deleted file mode 100644
index 7fb2171f611adea434d6f2710465810fb69d6979..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ /dev/null
@@ -1,217 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import paddle.compat as cpt
-from paddle.fluid.framework import Program
-
-
-class TestOpDesc(unittest.TestCase):
-    def test_op_desc(self):
-        program_desc = core.ProgramDesc()
-        self.assertIsNotNone(program_desc)
-        block = program_desc.block(0)
-        self.assertIsNotNone(block)
-        op = block.append_op()
-        self.assertIsNotNone(op)
-        op.set_type("test")
-        self.assertEqual("test", op.type())
-        op.set_input("X", ["a", "b", "c"])
-        self.assertEqual(["a", "b", "c"], op.input("X"))
-        self.assertEqual(["X"], op.input_names())
-
-        op.set_output("Out", ["z"])
-        self.assertEqual(['z'], op.output("Out"))
-        self.assertEqual(["Out"], op.output_names())
-
-        op._set_attr("int_attr", 1)
-        self.assertEqual(1, op.attr("int_attr"))
-        self.assertTrue(op.has_attr("int_attr"))
-        self.assertEqual(core.AttrType.INT, op.attr_type("int_attr"))
-
-        op._set_attr("float_attr", -1.32)
-        self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4)
-        self.assertTrue(op.has_attr("float_attr"))
-
-        op._set_attr("bool_attr", False)
-        self.assertFalse(op.attr("bool_attr"))
-
-        op._set_attr("string_attr", "abc")
-        self.assertEqual("abc", op.attr("string_attr"))
-        self.assertTrue(op.has_attr("string_attr"))
-
-        op._set_attr("ints_attr", [1, 2, 3])
-        self.assertEqual([1, 2, 3], op.attr("ints_attr"))
-
-        expected = [1.2, 2.3, 3.4]
-        op._set_attr("floats_attr", expected)
-        for e, a in zip(expected, op.attr("floats_attr")):
-            self.assertAlmostEqual(e, a, delta=1e-4)
-
-        op._set_attr("strings_attr", ["a", "b", "c"])
-        self.assertEqual(["a", "b", "c"], op.attr("strings_attr"))
-
-        op._set_attr("bools_attr", [True, False, True])
-        self.assertEqual([True, False, True], op.attr("bools_attr"))
-
-        self.assertEqual(8, len(op.attr_names()))
-
-        op.set_block_attr("_block_attr", program_desc.block(0))
-        self.assertEqual(0, op._block_attr_id("_block_attr"))
-
-        mul_op = block.append_op()
-        mul_op.set_type("mul")
-        mul_op.check_attrs()
-        self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
-        self.assertEqual(mul_op.attr("y_num_col_dims"), 1)
-
-
-class TestProgramDesc(unittest.TestCase):
-    def test_instance(self):
-        program_desc = core.ProgramDesc()
-        self.assertIsNotNone(program_desc)
-        del program_desc
-        program_desc = core.ProgramDesc()
-        self.assertIsNotNone(program_desc)
-        self.assertIsNotNone(program_desc.block(0))
-        del program_desc
-
-    def test_append_block(self):
-        program_desc = core.ProgramDesc()
-        self.assertIsNotNone(program_desc)
-        block_root = program_desc.block(0)
-        self.assertIsNotNone(block_root)
-        self.assertEqual(block_root.id, 0)
-        block1 = program_desc.append_block(block_root)
-        block2 = program_desc.append_block(block1)
-        self.assertIsNotNone(block1)
-        self.assertEqual(block1.id, block2.parent)
-        self.assertEqual(block_root.id, block1.parent)
-        block3 = program_desc.append_block(block_root)
-        self.assertEqual(block3.parent, block_root.id)
-        self.assertEqual(program_desc.block(1).id, 1)
-        self.assertEqual(4, program_desc.num_blocks())
-
-
-class TestVarDesc(unittest.TestCase):
-    def test_shape(self):
-        program_desc = core.ProgramDesc()
-        block = program_desc.block(0)
-        var = block.var(cpt.to_bytes('my_var'))
-        var.set_type(core.VarDesc.VarType.SELECTED_ROWS)
-        src_shape = [3, 2, 10, 8]
-        var.set_shape(src_shape)
-        res_shape = var.shape()
-        self.assertEqual(src_shape, res_shape)
-        self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type())
-
-    def test_multiple_shape(self):
-        program_desc = core.ProgramDesc()
-        block = program_desc.block(0)
-        var = block.var(cpt.to_bytes('my_reader'))
-        var.set_type(core.VarDesc.VarType.READER)
-        src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]]
-        var.set_shapes(src_shapes)
-        res_shapes = var.shapes()
-        self.assertEqual(src_shapes, res_shapes)
-        self.assertEqual(core.VarDesc.VarType.READER, var.type())
-
-    def test_dtype(self):
-        program_desc = core.ProgramDesc()
-        block = program_desc.block(0)
-        var = block.var(cpt.to_bytes('my_var'))
-        var.set_type(core.VarDesc.VarType.LOD_TENSOR)
-        var.set_dtype(core.VarDesc.VarType.INT32)
-        self.assertEqual(core.VarDesc.VarType.INT32, var.dtype())
-        self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type())
-
-    def test_multiple_dtype(self):
-        program_desc = core.ProgramDesc()
-        block = program_desc.block(0)
-        var = block.var(cpt.to_bytes('my_reader'))
-        var.set_type(core.VarDesc.VarType.READER)
-        src_types = [
-            core.VarDesc.VarType.INT32, core.VarDesc.VarType.FP64,
-            core.VarDesc.VarType.FP32
-        ]
-        var.set_dtypes(src_types)
-        self.assertEqual(src_types, var.dtypes())
-        self.assertEqual(core.VarDesc.VarType.READER, var.type())
-
-    def test_multiple_lod_level(self):
-        program_desc = core.ProgramDesc()
-        block = program_desc.block(0)
-        var = block.var(cpt.to_bytes('my_reader'))
-        var.set_type(core.VarDesc.VarType.READER)
-        src_types = [3, 1, 2]
-        var.set_lod_levels(src_types)
-        self.assertEqual(src_types, var.lod_levels())
-        self.assertEqual(core.VarDesc.VarType.READER, var.type())
-
-
-class TestBlockDesc(unittest.TestCase):
-    def test_add_var(self):
-        program_desc = core.ProgramDesc()
-        self.assertIsNotNone(program_desc)
-        block = program_desc.block(0)
-        self.assertIsNotNone(block)
-        var1 = block.var(cpt.to_bytes("var1"))
-        var2 = block.var(cpt.to_bytes("var2"))
-        var3 = block.var(cpt.to_bytes("var3"))
-        all_vars = block.all_vars()
-        self.assertEqual(set(all_vars), {var1, var2, var3})
-        var2_re = block.find_var(cpt.to_bytes("var2"))
-        self.assertEqual(var2_re, var2)
-
-    def test_add_op(self):
-        program_desc = core.ProgramDesc()
-        self.assertIsNotNone(program_desc)
-        block = program_desc.block(0)
-        self.assertIsNotNone(block)
-        op1 = block.append_op()
-        op2 = block.append_op()
-        op0 = block._prepend_op()
-        all_ops = []
-        for idx in range(0, block.op_size()):
-            all_ops.append(block.op(idx))
-        self.assertEqual(all_ops, [op0, op1, op2])
-
-    def test__remove_op(self):
-        program = Program()
-        program_desc = program.desc
-        self.assertIsNotNone(program_desc)
-        block = program_desc.block(0)
-        self.assertIsNotNone(block)
-
-        op0 = block.append_op()
-        op1 = block.append_op()
-        op2 = block.append_op()
-        op0.set_type("test")
-        op1.set_type("test")
-        op2.set_type("test")
-
-        block._remove_op(1, 2)
-        program._sync_with_cpp()
-
-        all_ops = []
-        for idx in range(0, block.op_size()):
-            all_ops.append(block.op(idx))
-        self.assertEqual(all_ops, [op0, op2])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
deleted file mode 100644
index 57e96f1fa34fa94f5e095d088016655f24b58d0c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestProximalAdagradOp(OpTest):
-    def setUp(self):
-        self.op_type = "proximal_adagrad"
-        w = np.random.random((102, 105)).astype("float32")
-        m = np.random.random((102, 105)).astype("float32")
-        g = np.random.random((102, 105)).astype("float32")
-        lr = np.array([0.1]).astype("float32")
-        l1 = 0.1
-        l2 = 0.2
-
-        self.inputs = {'Param': w, 'Grad': g, 'Moment': m, 'LearningRate': lr}
-        self.attrs = {'l1': l1, 'l2': l2}
-        param_out = 0.0
-
-        moment_out = m + g * g
-        prox_param = w - lr * g / np.sqrt(moment_out)
-        if l1 > 0.0:
-            x = np.abs(prox_param) - lr * l1
-            x[x < 0] = 0
-            param_out = np.sign(prox_param) * (x / (1.0 + lr * l2))
-        else:
-            param_out = prox_param / (1.0 + lr * l2)
-
-        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py b/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
deleted file mode 100644
index 067502baecc73cc84a6aa8ab78a9afbcc191c49a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestProximalGDOp(OpTest):
-    def setUp(self):
-        self.op_type = "proximal_gd"
-        w = np.random.random((102, 105)).astype("float32")
-        g = np.random.random((102, 105)).astype("float32")
-        lr = np.array([0.1]).astype("float32")
-        l1 = 0.1
-        l2 = 0.2
-
-        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
-        self.attrs = {'l1': l1, 'l2': l2}
-        prox_param = w - lr * g
-        param_out = 0.0
-        if l1 > 0.0:
-            x = np.abs(prox_param) - lr * l1
-            x[x < 0] = 0
-            param_out = np.sign(prox_param) * (x / (1.0 + lr * l2))
-        else:
-            param_out = prox_param / (1.0 + lr * l2)
-
-        self.outputs = {'ParamOut': param_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py
deleted file mode 100644
index 49aab6ddfc0158b11d73aac027746ef02edc6d89..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py
+++ /dev/null
@@ -1,138 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-from py_precise_roi_pool import PyPrRoIPool
-from op_test import OpTest
-import paddle.fluid as fluid
-from paddle.fluid import compiler, Program, program_guard
-
-
-class TestPRROIPoolOp(OpTest):
-    def set_data(self):
-        self.init_test_case()
-        self.make_rois()
-        self.prRoIPool = PyPrRoIPool()
-        self.outs = self.prRoIPool.compute(
-            self.x, self.rois, self.output_channels, self.spatial_scale,
-            self.pooled_height, self.pooled_width).astype('float32')
-        self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
-        self.attrs = {
-            'output_channels': self.output_channels,
-            'spatial_scale': self.spatial_scale,
-            'pooled_height': self.pooled_height,
-            'pooled_width': self.pooled_width
-        }
-        self.outputs = {'Out': self.outs}
-
-    def init_test_case(self):
-        self.batch_size = 3
-        self.channels = 3 * 2 * 2
-        self.height = 6
-        self.width = 4
-
-        self.x_dim = [self.batch_size, self.channels, self.height, self.width]
-
-        self.spatial_scale = 1.0 / 4.0
-        self.output_channels = 3
-        self.pooled_height = 2
-        self.pooled_width = 2
-
-        self.x = np.random.random(self.x_dim).astype('float32')
-
-    def make_rois(self):
-        rois = []
-        self.rois_lod = [[]]
-        for bno in range(self.batch_size):
-            self.rois_lod[0].append(bno + 1)
-            for i in range(bno + 1):
-                x1 = np.random.random_integers(
-                    0, self.width // self.spatial_scale - self.pooled_width)
-                y1 = np.random.random_integers(
-                    0, self.height // self.spatial_scale - self.pooled_height)
-
-                x2 = np.random.random_integers(x1 + self.pooled_width,
-                                               self.width // self.spatial_scale)
-                y2 = np.random.random_integers(
-                    y1 + self.pooled_height, self.height // self.spatial_scale)
-                roi = [bno, x1, y1, x2, y2]
-                rois.append(roi)
-        self.rois_num = len(rois)
-        self.rois = np.array(rois).astype('float32')
-
-    def setUp(self):
-        self.op_type = 'prroi_pool'
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_backward(self):
-        for place in self._get_places():
-            self._get_gradient(['X'], place, ["Out"], None)
-
-    def run_net(self, place):
-        with program_guard(Program(), Program()):
-            x = fluid.layers.data(
-                name="X",
-                shape=[self.channels, self.height, self.width],
-                dtype="float32")
-            rois = fluid.layers.data(
-                name="ROIs", shape=[4], dtype="float32", lod_level=1)
-            output = fluid.layers.prroi_pool(x, rois, self.output_channels,
-                                             0.25, 2, 2)
-            loss = fluid.layers.mean(output)
-            optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
-            optimizer.minimize(loss)
-            input_x = fluid.create_lod_tensor(self.x, [], place)
-            input_rois = fluid.create_lod_tensor(self.rois[:, 1:5],
-                                                 self.rois_lod, place)
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            exe.run(fluid.default_main_program(),
-                    {'X': input_x,
-                     "ROIs": input_rois})
-
-    def test_net(self):
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            self.run_net(place)
-
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            x = fluid.layers.data(
-                name="x", shape=[245, 30, 30], dtype="float32")
-            rois = fluid.layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
-            # channel must be int type
-            self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 0.5,
-                              0.25, 7, 7)
-            # spatial_scale must be float type
-            self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 5, 2,
-                              7, 7)
-            # pooled_height must be int type
-            self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 5,
-                              0.25, 0.7, 7)
-            # pooled_width must be int type
-            self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 5,
-                              0.25, 7, 0.7)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
deleted file mode 100644
index abe014a38c6ecfd008b0f1028536bfb49b628fb4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
+++ /dev/null
@@ -1,134 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import math
-import numpy as np
-import unittest
-from op_test import OpTest
-
-
-class TestPSROIPoolOp(OpTest):
-    def set_data(self):
-        self.init_test_case()
-        self.make_rois()
-        self.calc_psroi_pool()
-        self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
-        self.attrs = {
-            'output_channels': self.output_channels,
-            'spatial_scale': self.spatial_scale,
-            'pooled_height': self.pooled_height,
-            'pooled_width': self.pooled_width
-        }
-        self.outputs = {'Out': self.outs}
-
-    def init_test_case(self):
-        self.batch_size = 3
-        self.channels = 3 * 2 * 2
-        self.height = 6
-        self.width = 4
-
-        self.x_dim = [self.batch_size, self.channels, self.height, self.width]
-
-        self.spatial_scale = 1.0 / 4.0
-        self.output_channels = 3
-        self.pooled_height = 2
-        self.pooled_width = 2
-
-        self.x = np.random.random(self.x_dim).astype('float32')
-
-    def make_rois(self):
-        rois = []
-        self.rois_lod = [[]]
-        for bno in range(self.batch_size):
-            self.rois_lod[0].append(bno + 1)
-            for i in range(bno + 1):
-                x1 = np.random.random_integers(
-                    0, self.width // self.spatial_scale - self.pooled_width)
-                y1 = np.random.random_integers(
-                    0, self.height // self.spatial_scale - self.pooled_height)
-
-                x2 = np.random.random_integers(x1 + self.pooled_width,
-                                               self.width // self.spatial_scale)
-                y2 = np.random.random_integers(
-                    y1 + self.pooled_height, self.height // self.spatial_scale)
-                roi = [bno, x1, y1, x2, y2]
-                rois.append(roi)
-        self.rois_num = len(rois)
-        self.rois = np.array(rois).astype('float32')
-
-    def calc_psroi_pool(self):
-        output_shape = (self.rois_num, self.output_channels, self.pooled_height,
-                        self.pooled_width)
-        out_data = np.zeros(output_shape)
-        for i in range(self.rois_num):
-            roi = self.rois[i]
-            roi_batch_id = int(roi[0])
-            roi_start_w = round(roi[1]) * self.spatial_scale
-            roi_start_h = round(roi[2]) * self.spatial_scale
-            roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale
-            roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale
-
-            roi_height = max(roi_end_h - roi_start_h, 0.1)
-            roi_width = max(roi_end_w - roi_start_w, 0.1)
-
-            bin_size_h = roi_height / float(self.pooled_height)
-            bin_size_w = roi_width / float(self.pooled_width)
-
-            x_i = self.x[roi_batch_id]
-
-            for c in range(self.output_channels):
-                for ph in range(self.pooled_height):
-                    for pw in range(self.pooled_width):
-                        hstart = int(
-                            math.floor(float(ph) * bin_size_h + roi_start_h))
-                        wstart = int(
-                            math.floor(float(pw) * bin_size_w + roi_start_w))
-                        hend = int(
-                            math.ceil(
-                                float(ph + 1) * bin_size_h + roi_start_h))
-                        wend = int(
-                            math.ceil(
-                                float(pw + 1) * bin_size_w + roi_start_w))
-                        hstart = min(max(hstart, 0), self.height)
-                        hend = min(max(hend, 0), self.height)
-                        wstart = min(max(wstart, 0), self.width)
-                        wend = min(max(wend, 0), self.width)
-
-                        c_in = (c * self.pooled_height + ph
-                                ) * self.pooled_width + pw
-                        is_empty = (hend <= hstart) or (wend <= wstart)
-                        out_sum = 0.
-                        for ih in range(hstart, hend):
-                            for iw in range(wstart, wend):
-                                out_sum += x_i[c_in, ih, iw]
-                        bin_area = (hend - hstart) * (wend - wstart)
-                        out_data[i, c, ph, pw] = 0. if is_empty else (
-                            out_sum / float(bin_area))
-        self.outs = out_data.astype('float32')
-
-    def setUp(self):
-        self.op_type = 'psroi_pool'
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py
deleted file mode 100644
index b7bff4eae23e7b7b4e879bf6f25924c107b4ea02..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_py_func_op.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-import paddle
-import unittest
-import six
-import numpy as np
-
-dev_cnt = 2
-if fluid.core.is_compiled_with_cuda():
-    dev_cnt = fluid.core.get_cuda_device_count()
-os.environ['CPU_NUM'] = str(dev_cnt)
-
-
-def dummy_func_with_no_input():
-    return np.array([0], dtype='float32')
-
-
-def dummy_func_with_no_output(x):
-    pass
-
-
-def tanh(x):
-    return np.tanh(x)
-
-
-def tanh_grad(y, dy):
-    return np.array(dy) * (1 - np.square(np.array(y)))
-
-
-def cross_entropy(logits, labels):
-    logits = np.array(logits)
-    labels = np.array(labels)
-    M = logits.shape[0]
-    N = logits.shape[1]
-    ret = np.ndarray([M, 1]).astype(logits.dtype)
-    for idx in six.moves.range(M):
-        ret[idx][0] = -np.log(logits[idx][labels[idx][0]])
-    return ret
-
-
-def cross_entropy_grad(logits, labels, bwd_dout):
-    logits = np.array(logits)
-    labels = np.array(labels)
-    bwd_dout = np.array(bwd_dout)
-    M = logits.shape[0]
-    N = logits.shape[1]
-    dlogits = np.zeros([M, N]).astype(logits.dtype)
-    for idx in six.moves.range(M):
-        dlogits[idx][labels[idx][0]] = -bwd_dout[idx] / logits[idx][labels[idx][
-            0]]
-    return dlogits, None
-
-
-def simple_fc_net(img, label, use_py_func_op):
-    hidden = img
-    for idx in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-        if not use_py_func_op:
-            hidden = fluid.layers.tanh(hidden)
-        else:
-            new_hidden = fluid.default_main_program().current_block(
-            ).create_var(
-                name='hidden_{}'.format(idx),
-                dtype='float32',
-                shape=hidden.shape)
-            hidden = fluid.layers.py_func(
-                func=tanh,
-                x=hidden,
-                out=new_hidden,
-                backward_func=tanh_grad,
-                skip_vars_in_backward_input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    if not use_py_func_op:
-        loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    else:
-        loss = fluid.default_main_program().current_block().create_var(
-            name='loss', dtype='float32', shape=[-1, 1])
-        loss = fluid.layers.py_func(
-            func=cross_entropy,
-            x=[prediction, label],
-            out=loss,
-            backward_func=cross_entropy_grad,
-            skip_vars_in_backward_input=loss)
-
-        dummy_var = fluid.default_main_program().current_block().create_var(
-            name='test_tmp_var', dtype='float32', shape=[1])
-        fluid.layers.py_func(
-            func=dummy_func_with_no_input, x=None, out=dummy_var)
-        loss += dummy_var
-        fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None)
-
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def reader():
-    for _ in six.moves.range(dev_cnt * 100):
-        yield np.random.random([784]), np.random.random_integers(
-            size=[1], low=0, high=9)
-
-
-def test_main(use_cuda, use_py_func_op, use_parallel_executor):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return None
-
-    with fluid.program_guard(fluid.Program(), fluid.Program()):
-        with fluid.scope_guard(fluid.core.Scope()):
-            fluid.default_main_program().random_seed = 1
-            fluid.default_startup_program().random_seed = 1
-            np.random.seed(1)
-
-            img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            loss = simple_fc_net(img, label, use_py_func_op)
-            optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
-            optimizer.minimize(loss)
-
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
-            r = paddle.batch(reader, batch_size=10)
-
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-
-            train_cp = fluid.default_main_program()
-
-            if use_parallel_executor:
-                train_cp = compiler.CompiledProgram(fluid.default_main_program(
-                ))
-                train_cp = train_cp.with_data_parallel(loss_name=loss.name)
-                fetch_list = [loss.name]
-            else:
-                fetch_list = [loss]
-
-            ret = []
-            for epoch_id in six.moves.range(2):
-                for d in r():
-                    L, = exe.run(train_cp,
-                                 feed=feeder.feed(d),
-                                 fetch_list=fetch_list)
-                    ret.append(L)
-            return np.array(ret)
-
-
-class TestPyFuncOpUseExecutor(unittest.TestCase):
-    def setUp(self):
-        self.use_parallel_executor = False
-
-    def test_loss_diff(self):
-        losses = []
-        for use_cuda in [True, False]:
-            for use_py_func_op in [True, False]:
-                L = test_main(use_cuda, use_py_func_op,
-                              self.use_parallel_executor)
-                if L is not None:
-                    losses.append(L)
-
-        for idx in six.moves.range(len(losses) - 1):
-            max_diff = np.max(np.abs(losses[idx] - losses[0]))
-            self.assertAlmostEqual(max_diff, 0, delta=1e-3)
-
-
-class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor):
-    def setUp(self):
-        self.use_parallel_executor = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py b/python/paddle/fluid/tests/unittests/test_py_reader_combination.py
deleted file mode 100644
index 21f5fe4b581ab74bae52d952020ed2183820c471..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import unittest
-import numpy as np
-
-
-class TestPyReaderCombination(unittest.TestCase):
-    def setUp(self):
-        self.n1 = 10
-        self.n2 = 20
-        self.batch_size = 2
-
-    def create_reader(self, batch_num):
-        def __impl__():
-            for _ in range(batch_num):
-                image = np.random.uniform(
-                    low=-1, high=1, size=[batch_num, 784]).astype('float32')
-                label = np.random.random_integers(
-                    low=0, high=9, size=[batch_num, 1]).astype('int64')
-                yield image, label
-
-        return __impl__
-
-    def assertFeedVarEqual(self, reader_list_data, py_reader_dict_data):
-        image1 = reader_list_data[0]
-        label1 = reader_list_data[1]
-
-        image2 = np.array(py_reader_dict_data[0]['image'])
-        label2 = np.array(py_reader_dict_data[0]['label'])
-        self.assertTrue(np.array_equal(image1, image2))
-        self.assertTrue(np.array_equal(label1, label2))
-
-    def main_impl(self, place):
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            image = fluid.layers.data(
-                name='image', dtype='float32', shape=[784])
-            label = fluid.layers.data(name='label', dtype='int64', shape=[1])
-
-            py_reader1 = fluid.io.PyReader(
-                feed_list=[image, label], capacity=16, iterable=True)
-            py_reader2 = fluid.io.PyReader(
-                feed_list=[image, label], capacity=16, iterable=True)
-
-            reader1 = fluid.io.cache(self.create_reader(self.n1))
-            reader2 = fluid.io.cache(self.create_reader(self.n2))
-            py_reader1.decorate_batch_generator(reader1, places=place)
-            py_reader2.decorate_batch_generator(reader2, places=place)
-
-            for _ in range(10):
-                max_num = min(self.n1, self.n2)
-                batch_num = 0
-                for reader_np1, py_reader_dict1, reader_np2, py_reader_dict2 in zip(
-                        reader1(), py_reader1(), reader2(), py_reader2()):
-                    self.assertFeedVarEqual(reader_np1, py_reader_dict1)
-                    self.assertFeedVarEqual(reader_np2, py_reader_dict2)
-                    batch_num += 1
-
-                self.assertEqual(batch_num, max_num)
-
-    def get_places(self):
-        if fluid.is_compiled_with_cuda():
-            return [fluid.CUDAPlace(0), fluid.CPUPlace()]
-        else:
-            return [fluid.CPUPlace()]
-
-    def test_main(self):
-        for p in self.get_places():
-            self.main_impl(p)
-
-
-class TestPyReaderCombination2(TestPyReaderCombination):
-    def setUp(self):
-        self.n1 = 20
-        self.n2 = 10
-        self.batch_size = 2
-
-
-class TestPyReaderCombination3(TestPyReaderCombination):
-    def setUp(self):
-        self.n1 = 10
-        self.n2 = 10
-        self.batch_size = 2
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_lod_level_share.py b/python/paddle/fluid/tests/unittests/test_py_reader_lod_level_share.py
deleted file mode 100644
index 55dc3a7aa341ff09eb3d7d219cd1c23427e25da1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_py_reader_lod_level_share.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import unittest
-
-
-class TestLoDLevelShare(unittest.TestCase):
-    def setUp(self):
-        self.use_double_buffer = False
-
-    def test_lod_level_share(self):
-        reader = fluid.layers.py_reader(
-            capacity=16,
-            shapes=([-1, 256], [-1, 512], [-1, 100]),
-            dtypes=('float32', 'int64', 'double'),
-            lod_levels=(1, 2, 0),
-            use_double_buffer=self.use_double_buffer)
-
-        x, y, z = fluid.layers.read_file(reader)
-        self.assertEqual(x.lod_level, 1)
-        self.assertEqual(y.lod_level, 2)
-        self.assertEqual(z.lod_level, 0)
-
-
-class TestLoDLevelShare2(TestLoDLevelShare):
-    def setUp(self):
-        self.use_double_buffer = True
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
deleted file mode 100644
index b913127ad625eb25de3ec36edd2161019ed09749..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import numpy as np
-from threading import Thread
-
-
-def user_reader(inputs):
-    def _reader():
-        for d in inputs:
-            yield d
-
-    return _reader
-
-
-def batch_feeder(batch_reader, pin_memory=False, img_dtype="float32"):
-    def _feeder():
-        for batch_data in batch_reader():
-            sample_batch = []
-            label_batch = []
-            for sample, label in batch_data:
-                sample_batch.append(sample)
-                label_batch.append([label])
-            tensor = core.LoDTensor()
-            label = core.LoDTensor()
-            place = core.CUDAPinnedPlace() if pin_memory else core.CPUPlace()
-            tensor.set(np.array(sample_batch, dtype=img_dtype), place)
-            label.set(np.array(label_batch, dtype="int64"), place)
-            yield [tensor, label]
-
-    return _feeder
-
-
-class TestPyReader(unittest.TestCase):
-    def setUp(self):
-        self.capacity = 10
-        self.shapes = [(-1, 3, 2, 1), (-1, 1)]
-        self.lod_levels = [0, 0]
-        self.dtypes = ['float32', 'int64']
-
-    def test_pin_memory_pyreader(self):
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
-            executor = fluid.Executor(place)
-
-            data_file = fluid.layers.py_reader(
-                capacity=self.capacity,
-                dtypes=self.dtypes,
-                lod_levels=self.lod_levels,
-                shapes=self.shapes)
-            # feed_queue = data_file.queue
-            read_out_data = fluid.layers.read_file(data_file)
-
-            self.inputs = []
-            for _ in range(10):
-                sample = np.random.uniform(
-                    low=0, high=1, size=[3, 2, 1]).astype("float32")
-                label = np.random.uniform(
-                    low=0, high=10, size=[1]).astype("int64")
-                self.inputs.append((sample, label))
-
-            self.input_tensors = []
-            for d, l in batch_feeder(
-                    paddle.batch(
-                        user_reader(self.inputs), batch_size=2),
-                    pin_memory=True
-                    if fluid.core.is_compiled_with_cuda() else False)():
-                ta = fluid.LoDTensorArray()
-                ta.append(d)
-                ta.append(l)
-                self.input_tensors.append(ta)
-
-            self.batched_inputs = []
-            for batch in paddle.batch(user_reader(self.inputs), batch_size=2)():
-                feed_d = []
-                feed_l = []
-                for d, l in batch:
-                    feed_d.append(d)
-                    feed_l.append([l])
-                self.batched_inputs.append([feed_d, feed_l])
-
-            data_file.decorate_tensor_provider(
-                batch_feeder(
-                    paddle.batch(
-                        user_reader(self.inputs), batch_size=2),
-                    pin_memory=True
-                    if fluid.core.is_compiled_with_cuda() else False))
-
-            executor.run(fluid.default_startup_program())
-            self.outputs = []
-
-            data_file.start()
-            for _ in self.input_tensors:
-                self.outputs.append(
-                    executor.run(fetch_list=list(read_out_data)))
-            data_file.reset()
-            self.validate()
-
-    def validate(self):
-        self.assertEqual(len(self.batched_inputs), len(self.outputs))
-        for in_data_list, out_data_list in zip(self.batched_inputs,
-                                               self.outputs):
-            self.assertEqual(len(in_data_list), len(out_data_list))
-            in_data_list_np = [
-                np.array(in_lod_tensor) for in_lod_tensor in in_data_list
-            ]
-            for in_data, out_data in zip(in_data_list_np, out_data_list):
-                self.assertTrue((in_data == out_data).all())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
deleted file mode 100644
index 3efe5aac8848b8230f42f4f3905eefc517c0fa5e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import numpy as np
-from threading import Thread
-
-
-def feed_data(feed_queue, inputs):
-    for in_data in inputs:
-        feed_queue.push(in_data)
-
-
-class TestPyReader(unittest.TestCase):
-    def setUp(self):
-        self.capacity = 10
-        self.batch_size_min = 10
-        self.batch_size_max = 20
-        self.shapes = [(-1, 3, 2, 1), (-1, 1)]
-        self.lod_levels = [0, 0]
-        self.dtypes = ['float32', 'int64']
-        self.iterations = 20
-
-    def test_single_thread_main(self):
-        self.main(use_thread=False)
-
-    def test_multiple_thread_main(self):
-        self.main(use_thread=True)
-
-    def main(self, use_thread=False):
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
-            executor = fluid.Executor(place)
-
-            data_file = fluid.layers.py_reader(
-                capacity=self.capacity,
-                dtypes=self.dtypes,
-                lod_levels=self.lod_levels,
-                shapes=self.shapes)
-            feed_queue = data_file.queue
-            read_out_data = fluid.layers.read_file(data_file)
-            self.inputs = []
-
-            for i in range(self.iterations):
-                in_data = fluid.LoDTensorArray()
-                batch_size = np.random.random_integers(self.batch_size_min,
-                                                       self.batch_size_max)
-                for shape, dtype in zip(self.shapes, self.dtypes):
-                    next_data = np.random.uniform(
-                        low=0, high=1000,
-                        size=(batch_size, ) + shape[1:]).astype(dtype)
-                    in_data.append(
-                        fluid.executor._as_lodtensor(next_data, place))
-
-                self.inputs.append(in_data)
-
-            executor.run(fluid.default_startup_program())
-            self.outputs = []
-            if use_thread:
-                thread = Thread(
-                    target=feed_data, args=(feed_queue, self.inputs))
-                thread.start()
-                for in_data in self.inputs:
-                    self.outputs.append(
-                        executor.run(fetch_list=list(read_out_data)))
-            else:
-                for in_data in self.inputs:
-                    feed_queue.push(in_data)
-                    self.outputs.append(
-                        executor.run(fetch_list=list(read_out_data)))
-
-            feed_queue.close()
-            self.validate()
-
-    def validate(self):
-        self.assertEqual(len(self.inputs), len(self.outputs))
-        for in_data_list, out_data_list in zip(self.inputs, self.outputs):
-            self.assertEqual(len(in_data_list), len(out_data_list))
-            in_data_list_np = [
-                np.array(in_lod_tensor) for in_lod_tensor in in_data_list
-            ]
-            for in_data, out_data in zip(in_data_list_np, out_data_list):
-                self.assertTrue((in_data == out_data).all())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_return_list.py b/python/paddle/fluid/tests/unittests/test_py_reader_return_list.py
deleted file mode 100644
index c6e1856507835b2521fd5ebbc16f6900a78204cf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_py_reader_return_list.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.fluid as fluid
-import unittest
-import numpy as np
-
-
-class TestPyReader(unittest.TestCase):
-    def setUp(self):
-        self.batch_size = 32
-        self.epoch_num = 2
-        self.sample_num = 10
-
-    def test_returnlist(self):
-        def reader_creator_random_image(height, width):
-            def reader():
-                for i in range(self.sample_num):
-                    yield np.random.uniform(
-                        low=0, high=255, size=[height, width]),
-
-            return reader
-
-        for return_list in [True, False]:
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                image = fluid.layers.data(
-                    name='image', shape=[784, 784], dtype='float32')
-                reader = fluid.io.PyReader(
-                    feed_list=[image],
-                    capacity=4,
-                    iterable=True,
-                    return_list=return_list)
-
-                user_defined_reader = reader_creator_random_image(784, 784)
-                reader.decorate_sample_list_generator(
-                    paddle.batch(
-                        user_defined_reader, batch_size=self.batch_size),
-                    fluid.core.CPUPlace())
-                # definition of network is omitted
-                executor = fluid.Executor(fluid.core.CPUPlace())
-                executor.run(fluid.default_main_program())
-
-                for _ in range(self.epoch_num):
-                    for data in reader():
-                        if return_list:
-                            executor.run(feed={"image": data[0][0]})
-                        else:
-                            executor.run(feed=data)
-
-            with fluid.dygraph.guard():
-                batch_py_reader = fluid.io.PyReader(capacity=2)
-                user_defined_reader = reader_creator_random_image(784, 784)
-                batch_py_reader.decorate_sample_generator(
-                    user_defined_reader,
-                    batch_size=self.batch_size,
-                    places=fluid.core.CPUPlace())
-
-                for epoch in range(self.epoch_num):
-                    for _, data in enumerate(batch_py_reader()):
-                        # empty network
-                        pass
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
deleted file mode 100644
index 4efca5e2aafd9c370ccc37791a9900b18f2705f6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.fluid as fluid
-import math
-import unittest
-import numpy as np
-import os
-
-os.environ['CPU_NUM'] = '1'
-
-
-def random_reader(sample_num):
-    def __impl__():
-        for _ in range(sample_num):
-            yield np.random.random(
-                size=[784]).astype('float32'), np.random.random_integers(
-                    low=0, high=9, size=[1]).astype('int64')
-
-    return paddle.reader.cache(__impl__)
-
-
-class TestCaseBase(unittest.TestCase):
-    def setUp(self):
-        self.batch_size = 32
-        self.epoch_num = 2
-        self.sample_num = 165
-
-    def generate_all_data(self, reader):
-        ret = []
-        for d in reader():
-            slots = [[], []]
-            for item in d:
-                slots[0].append(item[0])
-                slots[1].append(item[1])
-            slots = [np.array(slot) for slot in slots]
-            ret.append(slots)
-        return ret
-
-    def run_main(self, reader, use_sample_generator, iterable, drop_last):
-        image = fluid.layers.data(name='image', dtype='float32', shape=[784])
-        label = fluid.layers.data(name='label', dtype='int64', shape=[1])
-        py_reader = fluid.io.PyReader(
-            feed_list=[image, label],
-            capacity=16,
-            iterable=iterable,
-            use_double_buffer=False)
-
-        batch_reader = paddle.batch(reader, self.batch_size, drop_last)
-        all_datas = self.generate_all_data(batch_reader)
-
-        if not use_sample_generator:
-            py_reader.decorate_sample_list_generator(
-                batch_reader, places=fluid.cpu_places())
-        else:
-            py_reader.decorate_sample_generator(
-                reader, self.batch_size, drop_last, places=fluid.cpu_places())
-
-        if drop_last:
-            batch_num = int(self.sample_num / self.batch_size)
-        else:
-            batch_num = math.ceil(float(self.sample_num) / self.batch_size)
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(fluid.default_startup_program())
-        for _ in range(self.epoch_num):
-            if py_reader.iterable:
-                step = 0
-                for data in py_reader():
-                    img, lbl = exe.run(feed=data, fetch_list=[image, label])
-                    self.assertArrayEqual(img, all_datas[step][0])
-                    self.assertArrayEqual(lbl, all_datas[step][1])
-                    step += 1
-                self.assertEqual(step, len(all_datas))
-            else:
-                step = 0
-                try:
-                    py_reader.start()
-                    while True:
-                        img, lbl = exe.run(fetch_list=[image, label])
-                        self.assertArrayEqual(img, all_datas[step][0])
-                        self.assertArrayEqual(lbl, all_datas[step][1])
-                        step += 1
-                except fluid.core.EOFException:
-                    py_reader.reset()
-                    self.assertEqual(step, len(all_datas))
-                    break
-
-    def assertArrayEqual(self, arr1, arr2):
-        self.assertEqual(arr1.shape, arr2.shape)
-        self.assertTrue((arr1 == arr2).all())
-
-    def test_main(self):
-        reader = random_reader(self.sample_num)
-        for use_sample_generator in [False, True]:
-            for iterable in [False, True]:
-                for drop_last in [False, True]:
-                    with fluid.program_guard(fluid.Program(), fluid.Program()):
-                        self.run_main(reader, use_sample_generator, iterable,
-                                      drop_last)
-
-
-class TestCase1(TestCaseBase):
-    def setUp(self):
-        self.batch_size = 32
-        self.epoch_num = 10
-        self.sample_num = 160
-
-
-class TestCase2(TestCaseBase):
-    def setUp(self):
-        self.batch_size = 32
-        self.epoch_num = 2
-        self.sample_num = 200
-
-
-class TestCase3(TestCaseBase):
-    def setUp(self):
-        self.batch_size = 32
-        self.epoch_num = 2
-        self.sample_num = 159
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
deleted file mode 100644
index b5684de4b900e06d60fd4b78fb8eb232c146e552..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ /dev/null
@@ -1,303 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-import paddle.fluid.unique_name as unique_name
-import paddle.fluid.core as core
-import numpy as np
-import threading
-import multiprocessing
-import os
-os.environ['CPU_NUM'] = str(4)
-
-
-def as_tensor(np_array_or_tensor, place=None):
-    if isinstance(np_array_or_tensor, fluid.LoDTensor):
-        return np_array_or_tensor
-
-    if place is None:
-        place = fluid.CPUPlace()
-
-    tensor = fluid.LoDTensor()
-    tensor.set(np_array_or_tensor, place)
-    return tensor
-
-
-def as_numpy(tensor_or_numpy):
-    return tensor_or_numpy if isinstance(
-        tensor_or_numpy, np.ndarray) else np.array(tensor_or_numpy)
-
-
-def sample_list_to_tensor_array(sample_list):
-    slot_num = None
-    slots = None
-    for sample in sample_list:
-        if slot_num is None:
-            slot_num = len(sample)
-            slots = [None] * len(sample)
-        else:
-            assert slot_num == len(sample)
-
-        for slot_id, slot_item in enumerate(sample):
-            if slots[slot_id] is None:
-                slots[slot_id] = []
-            slots[slot_id].append(slot_item)
-
-    tensor_array = fluid.LoDTensorArray()
-    for slot in slots:
-        t = fluid.LoDTensor()
-        t.set(np.array(slot), fluid.CPUPlace())
-        tensor_array.append(t)
-
-    return tensor_array
-
-
-def feed_data(feed_queue, batch_reader):
-    data_generator = batch_reader()
-    while True:
-        data = next(data_generator, None)
-        if data is None or (len(data) == 1 and data[0] is None):
-            break
-
-        if not feed_queue.push(sample_list_to_tensor_array(data)):
-            break
-
-    feed_queue.close()
-
-
-def simple_fc_net(in_size,
-                  class_num,
-                  hidden_sizes,
-                  batch_size,
-                  queue_capacity,
-                  use_double_buffer=False,
-                  use_feed_list=True):
-    in_data = fluid.layers.data(name="data", dtype='float32', shape=[in_size])
-    label = fluid.layers.data(name='label', dtype='int64', shape=[1])
-    if use_feed_list:
-        py_reader = fluid.layers.create_py_reader_by_data(
-            capacity=queue_capacity,
-            use_double_buffer=use_double_buffer,
-            feed_list=[in_data, label],
-            name=unique_name.generate('py_reader_name'))
-    else:
-        py_reader = fluid.layers.py_reader(
-            capacity=queue_capacity,
-            shapes=[in_data.shape, label.shape],
-            dtypes=['float32', 'int64'],
-            name=unique_name.generate('py_reader_name'),
-            use_double_buffer=use_double_buffer)
-
-    in_data, label = fluid.layers.read_file(py_reader)
-
-    feed_queue = py_reader.queue
-
-    hidden = in_data
-    for hidden_size in hidden_sizes:
-        hidden = fluid.layers.fc(
-            hidden,
-            size=hidden_size,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-    predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax')
-    loss = fluid.layers.mean(
-        fluid.layers.cross_entropy(
-            input=predict_label, label=label))
-
-    optimizer = fluid.optimizer.Adam()
-    optimizer.minimize(loss)
-    return in_data, label, loss, optimizer, feed_queue, py_reader
-
-
-class TestPyReaderUsingExecutor(unittest.TestCase):
-    def setUp(self):
-        self.in_size = 1000
-        self.hidden_sizes = [50, 30, 20]
-        self.class_num = 10
-        self.batch_size = 32
-        self.iterations = 10
-        self.queue_capacity = 50
-
-    def test(self):
-        for use_cuda in ([False, True]
-                         if core.is_compiled_with_cuda() else [False]):
-            for use_parallel_executor in [False, True]:
-                for use_double_buffer in [False, True]:
-                    for use_feed_list in [False, True]:
-                        for use_decorate_paddle_reader in [False, True]:
-                            print('Test Parameters:'),
-                            print({
-                                'use_cuda': use_cuda,
-                                'use_parallel_executor': use_parallel_executor,
-                                'use_double_buffer': use_double_buffer,
-                                'use_feed_list': use_feed_list,
-                                'use_decorate_paddle_reader':
-                                use_decorate_paddle_reader
-                            })
-                            self.main(use_cuda, use_parallel_executor,
-                                      use_double_buffer, use_feed_list,
-                                      use_decorate_paddle_reader)
-
-    def tensor_reader(self, use_decorate_paddle_reader):
-        def reader():
-            for sample_id in range(self.batch_size * self.iterations *
-                                   self.batch_size_times):
-                in_data = np.random.uniform(
-                    low=0, high=1, size=(self.in_size, )).astype('float32')
-                label = np.random.random_integers(
-                    low=0, high=self.class_num - 1, size=(1, )).astype('int64')
-
-                reshaped_in_data = np.reshape(in_data, [1, -1])
-                reshaped_label = np.reshape(label, [1, -1])
-                if sample_id % (self.batch_size * self.batch_size_times) == 0:
-                    self.inputs.append([reshaped_in_data, reshaped_label])
-                else:
-                    self.inputs[-1][0] = np.concatenate(
-                        (self.inputs[-1][0], reshaped_in_data), axis=0)
-                    self.inputs[-1][1] = np.concatenate(
-                        (self.inputs[-1][1], reshaped_label), axis=0)
-
-                yield in_data, label
-
-            if not use_decorate_paddle_reader:
-                yield None
-
-        return reader
-
-    def main(self,
-             use_cuda=True,
-             use_parallel_executor=False,
-             use_double_buffer=False,
-             use_feed_list=False,
-             use_decorate_paddle_reader=False):
-        assert not use_cuda or use_cuda and core.is_compiled_with_cuda()
-
-        self.use_cuda = use_cuda
-        self.use_parallel_executor = use_parallel_executor
-        self.use_double_buffer = use_double_buffer
-        self.use_feed_list = use_feed_list
-        self.use_decorate_paddle_reader = use_decorate_paddle_reader
-
-        startup_program = fluid.Program()
-        main_program = fluid.Program()
-
-        with fluid.program_guard(main_program, startup_program):
-            in_data, label, loss, optimizer, feed_queue, py_reader = simple_fc_net(
-                in_size=self.in_size,
-                class_num=self.class_num,
-                hidden_sizes=self.hidden_sizes,
-                batch_size=self.batch_size,
-                queue_capacity=self.queue_capacity,
-                use_double_buffer=self.use_double_buffer,
-                use_feed_list=self.use_feed_list)
-
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-            exe = fluid.Executor(place)
-            exe.run(startup_program)
-
-            train_cp = main_program
-            if use_parallel_executor:
-                train_cp = compiler.CompiledProgram(
-                    main_program).with_data_parallel(loss_name=loss.name)
-                if use_cuda:
-                    self.batch_size_times = core.get_cuda_device_count()
-                else:
-                    self.batch_size_times = int(
-                        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            else:
-                self.batch_size_times = 1
-
-            reader = self.tensor_reader(use_decorate_paddle_reader)
-            batch_reader = paddle.batch(reader, batch_size=self.batch_size)
-
-            self.inputs = []
-            self.outputs = []
-
-            if use_decorate_paddle_reader:
-                if use_feed_list:
-                    py_reader.decorate_paddle_reader(batch_reader)
-                else:
-                    py_reader.decorate_sample_list_generator(batch_reader)
-                py_reader.start()
-            else:
-                thread = threading.Thread(
-                    target=feed_data, args=(feed_queue, batch_reader))
-                thread.daemon = True
-                thread.start()
-
-            try:
-                while True:
-                    fetches = exe.run(train_cp,
-                                      fetch_list=[in_data.name, label.name])
-                    fetches = [as_numpy(fetch) for fetch in fetches]
-                    self.outputs.append(fetches)
-            except fluid.core.EOFException:
-                pass
-
-            feed_queue.close()
-            self.validate()
-            if use_decorate_paddle_reader:
-                py_reader.exited = True
-                py_reader.thread.join()
-            else:
-                thread.join()
-
-    def validate(self):
-        if not self.use_double_buffer:
-            self.assertEqual(len(self.inputs), len(self.outputs))
-        else:
-            self.assertTrue(len(self.inputs) >= len(self.outputs))
-        for idx in range(len(self.outputs)):
-            batch_in = self.inputs[idx]
-            batch_out = self.outputs[idx]
-            self.assertEqual(len(batch_in), len(batch_out))
-            if self.use_parallel_executor and not self.use_double_buffer:
-                self.validate_unordered_batch(batch_in, batch_out)
-            else:
-                for in_data, out_data in zip(batch_in, batch_out):
-                    self.assertEqual(in_data.shape, out_data.shape)
-                    if not self.use_parallel_executor:
-                        self.assertTrue((in_data == out_data).all())
-
-    def validate_unordered_batch(self, batch_in, batch_out):
-        out_index_left_set = set(range(self.batch_size * self.batch_size_times))
-        mapping_num = 0
-        for i in range(self.batch_size * self.batch_size_times):
-            for j in out_index_left_set:
-                flag = True
-                for k in range(len(batch_in)):
-                    in_data = batch_in[k][i]
-                    out_data = batch_out[k][j]
-                    if (in_data != out_data).any():
-                        flag = False
-                        break
-
-                if flag:
-                    out_index_left_set.remove(j)
-                    mapping_num += 1
-                    break
-
-        self.assertEqual(mapping_num, self.batch_size * self.batch_size_times)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_random_crop_op.py b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
deleted file mode 100644
index db65b9e3e9adf400b833e6f7d0afa6e1c1e12347..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_random_crop_op.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-class TestRandomCropOp(OpTest):
-    def setUp(self):
-        to_crop = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] *
-                           5).astype(np.int32)
-        self.possible_res = [
-            np.array([[1, 2, 3], [5, 6, 7]]).astype(np.int32),
-            np.array([[2, 3, 4], [6, 7, 8]]).astype(np.int32),
-            np.array([[5, 6, 7], [9, 10, 11]]).astype(np.int32),
-            np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32)
-        ]
-        self.op_type = "random_crop"
-        self.inputs = {'X': to_crop, 'Seed': np.array([10]).astype('int64')}
-        self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])}
-        self.attrs = {'shape': [2, 3]}
-
-    def test_check_output(self):
-        self.check_output_customized(self.verify_output)
-
-    def verify_output(self, outs):
-        out = np.array(outs[1])
-        for ins in out[:]:
-            is_equal = [(ins == res).all() for res in self.possible_res]
-            self.assertIn(True, is_equal)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_range.py b/python/paddle/fluid/tests/unittests/test_range.py
deleted file mode 100644
index f129ae78cbf7e2ccd5d974de265b8e95d1391df8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_range.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestRangeOp(OpTest):
-    def setUp(self):
-        self.op_type = "range"
-        self.init_config()
-        self.inputs = {
-            'Start': np.array([self.case[0]]).astype(self.dtype),
-            'End': np.array([self.case[1]]).astype(self.dtype),
-            'Step': np.array([self.case[2]]).astype(self.dtype)
-        }
-
-        self.outputs = {
-            'Out': np.arange(self.case[0], self.case[1],
-                             self.case[2]).astype(self.dtype)
-        }
-
-    def init_config(self):
-        self.dtype = np.float32
-        self.case = (0, 1, 0.2)
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFloatRangeOpCase0(TestRangeOp):
-    def init_config(self):
-        self.dtype = np.float32
-        self.case = (0, 5, 1)
-
-
-class TestInt32RangeOpCase0(TestRangeOp):
-    def init_config(self):
-        self.dtype = np.int32
-        self.case = (0, 5, 2)
-
-
-class TestInt32RangeOpCase1(TestRangeOp):
-    def init_config(self):
-        self.dtype = np.int32
-        self.case = (10, 1, -2)
-
-
-class TestInt32RangeOpCase2(TestRangeOp):
-    def init_config(self):
-        self.dtype = np.int32
-        self.case = (-1, -10, -2)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
deleted file mode 100644
index c9fa24b103deb50aa896403e09b11e891fb62c6d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestRankLossOp(OpTest):
-    def setUp(self):
-        self.op_type = "rank_loss"
-        batch_size = 5
-        # labels_{i} = {0, 1.0} or {0, 0.5, 1.0}
-        label = np.random.randint(0, 2, size=(batch_size, 1)).astype("float32")
-        left = np.random.random((batch_size, 1)).astype("float32")
-        right = np.random.random((batch_size, 1)).astype("float32")
-        loss = np.log(1.0 + np.exp(left - right)) - label * (left - right)
-        self.inputs = {'Label': label, 'Left': left, 'Right': right}
-        self.outputs = {'Out': loss}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["Left", "Right"], "Out")
-
-    def test_check_grad_ignore_left(self):
-        self.check_grad(["Right"], "Out", no_grad_set=set('Left'))
-
-    def test_check_grad_ignore_right(self):
-        self.check_grad(["Left"], "Out", no_grad_set=set('Right'))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py
deleted file mode 100644
index cb1be32935b4a1b6450e347378e6548797158dab..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import os
-os.environ['CPU_NUM'] = str(1)
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-import paddle
-import numpy as np
-import unittest
-
-
-class TestReaderReset(unittest.TestCase):
-    def prepare_data(self):
-        def fake_data_generator():
-            for n in range(self.total_ins_num):
-                yield np.ones(self.ins_shape) * n, n
-
-        return fake_data_generator
-
-    def setUp(self):
-        self.use_cuda = fluid.core.is_compiled_with_cuda()
-        self.ins_shape = [3]
-        self.batch_size = 5
-        self.batch_num = 20
-        self.total_ins_num = self.batch_size * self.batch_num
-        self.test_pass_num = 100
-        self.prepare_data()
-
-    def main(self, with_double_buffer):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-
-        with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.layers.data(
-                name='image', shape=self.ins_shape, dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            data_reader_handle = fluid.io.PyReader(
-                feed_list=[image, label],
-                capacity=16,
-                iterable=False,
-                use_double_buffer=with_double_buffer)
-            fetch_list = [image.name, label.name]
-
-        place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-
-        data_reader_handle.decorate_sample_list_generator(
-            paddle.batch(
-                self.prepare_data(), batch_size=self.batch_size))
-
-        train_cp = compiler.CompiledProgram(main_prog).with_data_parallel()
-
-        batch_id = 0
-        pass_count = 0
-        while pass_count < self.test_pass_num:
-            data_reader_handle.start()
-            try:
-                while True:
-                    data_val, label_val = exe.run(train_cp,
-                                                  fetch_list=fetch_list,
-                                                  return_numpy=True)
-                    ins_num = data_val.shape[0]
-                    broadcasted_label = np.ones((ins_num, ) + tuple(
-                        self.ins_shape)) * label_val.reshape((ins_num, 1))
-                    self.assertEqual(data_val.all(), broadcasted_label.all())
-                    batch_id += 1
-            except fluid.core.EOFException:
-                data_reader_handle.reset()
-                pass_count += 1
-                self.assertEqual(pass_count * self.batch_num, batch_id)
-
-        self.assertEqual(pass_count, self.test_pass_num)
-
-    def test_all(self):
-        self.main(with_double_buffer=False)
-        self.main(with_double_buffer=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
deleted file mode 100644
index 8354f6e65b8607aeb183415f2f0c8658960178ed..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ /dev/null
@@ -1,606 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import numpy as np
-import paddle.fluid.core as core
-
-from paddle.fluid import ParamAttr
-from paddle.fluid.framework import Program, grad_var_name
-from paddle.fluid.executor import Executor
-from paddle.fluid.backward import append_backward
-
-np.random.seed(123)
-
-
-class PyRNNBase(object):
-    def __init__(self, input_shape, output_shape):
-        self.x = np.ones(shape=input_shape).astype("float32")
-        self.y = np.zeros(shape=output_shape).astype("float32")
-
-    def step(self, step_id, x):
-        raise NotImplementedError
-
-    def forward(self):
-        for step_id in range(self.x.shape[0]):
-            self.step(step_id, self.x[step_id])
-        return np.array([np.mean(self.y)])
-
-    def segment_inputs(self):
-        return [self.x[i] for i in range(self.x.shape[0])]
-
-
-class PySimpleRNN1(PyRNNBase):
-    def __init__(self, input_shape, output_shape):
-        super(PySimpleRNN1, self).__init__(input_shape, output_shape)
-
-        seq_len, batch_size, input_dim = input_shape
-        self.h_boot = np.random.normal(size=(batch_size,
-                                             input_dim)).astype("float32")
-
-        self.scale = 1.0 / 2.0
-        men_dim = (seq_len, batch_size, input_dim)
-        self.mems = np.zeros(shape=men_dim).astype("float32")
-
-    def step(self, step_id, x):
-        if step_id == 0:
-            pre_mem = self.h_boot
-        else:
-            pre_mem = self.mems[step_id - 1]
-        self.mems[step_id] = (pre_mem + x) * self.scale
-        self.y[step_id] = self.mems[step_id]
-
-
-class PySimpleRNN2(PyRNNBase):
-    def __init__(self, input_shape, output_shape):
-        super(PySimpleRNN2, self).__init__(input_shape, output_shape)
-
-        seq_len, batch_size, input_dim = input_shape
-        self.W = np.ones(shape=(input_dim, input_dim)).astype("float32")
-        self.U = np.zeros(shape=(input_dim, input_dim)).astype("float32")
-        self.h_boot = np.ones(shape=(batch_size, input_dim)).astype("float32")
-
-        men_dim = (seq_len, batch_size, input_dim)
-        self.mems = np.zeros(shape=men_dim).astype("float32")
-
-    def step(self, step_id, x):
-        if step_id > 0:
-            pre_mem = self.mems[step_id - 1]
-        else:
-            pre_mem = self.h_boot
-        xW = np.matmul(x, self.W).astype("float32")
-        hU = np.matmul(pre_mem, self.U).astype("float32")
-
-        def py_sigmoid(x):
-            return 1. / (1. + np.exp(-x))
-
-        self.mems[step_id] = py_sigmoid(xW + hU)
-        self.y[step_id] = self.mems[step_id]
-
-
-def create_tensor(np_data, place):
-    tensor = core.LoDTensor()
-    tensor.set(np_data, place)
-    return tensor
-
-
-class RecurrentOpTest1(unittest.TestCase):
-    '''
-    Test RNNOp
-    equation:
-        h_t = ( x_t + h_{t-1} ) / scale
-    vars:
-        - x
-    memories:
-        - h
-    outputs:
-        - h
-    '''
-
-    input_dim = 2
-    batch_size = 1
-    sent_len = 1
-
-    def setup_program(self):
-        self.main_program = Program()
-        self.startup_program = Program()
-        self.place = core.CPUPlace()
-
-    def setUp(self):
-        self.setup_program()
-        self.data_field = {"x", "h_boot"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            self.output = layers.mean(self.create_rnn_op())
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-        h_boot = layers.data(
-            shape=[self.input_dim], dtype='float32', name='h_boot')
-        h_boot.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-        with rnn.step():
-            h_pre = rnn.memory(init=h_boot)
-            x_t = rnn.step_input(x)
-
-            h = layers.scale(
-                x=layers.elementwise_add(
-                    x=h_pre, y=x_t),
-                scale=self.py_rnn.scale)
-
-            rnn.update_memory(h_pre, h)
-            rnn.output(h)
-
-        return rnn()
-
-    def forward(self):
-        self.feed_map = {
-            x: create_tensor(getattr(self.py_rnn, x), self.place)
-            for x in self.data_field
-        }
-        exe = Executor(self.place)
-        out = exe.run(self.main_program,
-                      feed=self.feed_map,
-                      fetch_list=[self.output])
-
-        return out[0]
-
-    def backward(self):
-        self.feed_map = {
-            x: create_tensor(getattr(self.py_rnn, x), self.place)
-            for x in self.data_field
-        }
-        fetch_list = [
-            self.main_program.global_block().var(grad_var_name(x))
-            for x in self.data_field
-        ]
-
-        exe = Executor(self.place)
-        return exe.run(self.main_program,
-                       feed=self.feed_map,
-                       fetch_list=fetch_list,
-                       return_numpy=False)
-
-    def test_backward(self, rtol=0.01):
-        self.check_forward()
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            append_backward(self.output)
-
-        ana_grad = [np.array(x) for x in self.backward()]
-
-        num_grad = self.get_numerical_gradient()
-        for idx, name in enumerate(self.data_field):
-            self.assertEqual(num_grad[idx].shape, ana_grad[idx].shape)
-            self.assertTrue(
-                np.isclose(
-                    num_grad[idx], ana_grad[idx], rtol=rtol).all(),
-                "num_grad (" + name + ") has diff at " + str(self.place) +
-                "\nExpect " + str(num_grad[idx]) + "\n" + "But Got" +
-                str(ana_grad[idx]) + " in class " + self.__class__.__name__)
-
-    def check_forward(self):
-        pd_output = self.forward()
-        py_output = self.py_rnn.forward()
-        self.assertEqual(pd_output.shape, py_output.shape)
-        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.01).all())
-
-    def get_numerical_gradient(self, delta=0.005):
-        dloss_dout = 1.0
-        feed_list = [getattr(self.py_rnn, x) for x in self.data_field]
-        grad_list = [np.zeros_like(x) for x in feed_list]
-        for feed, grad in zip(feed_list, grad_list):
-            for f, g in np.nditer([feed, grad], op_flags=['readwrite']):
-                o = float(f)
-                f[...] = o + delta
-                y_pos = self.forward()
-
-                f[...] = o - delta
-                y_neg = self.forward()
-
-                f[...] = o
-                dout_dfeed = (y_pos - y_neg) / (delta * 2)
-                g[...] = dout_dfeed[0]
-
-        return grad_list
-
-
-class RecurrentOpTest2(RecurrentOpTest1):
-    '''
-    Test RNNOp
-    equation:
-        h_t = \sigma (W x_t + U h_{t-1})
-    weights:
-        - W
-        - U
-    vars:
-        - x
-    memories:
-        - h
-    outputs:
-       - h
-    '''
-
-    input_dim = 2
-    batch_size = 10
-    sent_len = 2
-
-    def setUp(self):
-        self.setup_program()
-
-        self.data_field = {"x", "h_boot", "W", "U"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            self.output = layers.mean(self.create_rnn_op())
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-        h_boot = layers.data(
-            shape=[self.input_dim], dtype='float32', name='h_boot')
-        h_boot.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-        with rnn.step():
-            h_pre = rnn.memory(init=h_boot)
-            x_t = rnn.step_input(x)
-
-            temp_l = layers.fc(
-                input=x_t,
-                size=self.input_dim,
-                param_attr=ParamAttr(
-                    name='W',
-                    initializer=fluid.initializer.ConstantInitializer(1.0)),
-                bias_attr=False)
-            temp_r = layers.fc(
-                input=h_pre,
-                size=self.input_dim,
-                param_attr=ParamAttr(
-                    name='U',
-                    initializer=fluid.initializer.ConstantInitializer(0.0)),
-                bias_attr=False)
-
-            h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r))
-
-            rnn.update_memory(h_pre, h)
-            rnn.output(h)
-
-        return rnn()
-
-    def test_backward(self):
-        super(RecurrentOpTest2, self).test_backward(rtol=0.01)
-
-
-class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):
-    '''
-    Test RNNOp with two memories
-    equation:
-        h_1 = h_pre_1
-        h_2 = h_pre_2
-        y = h_1 + h_2
-    vars:
-        - x
-    memories:
-        - h_1, h_2
-    outputs:
-       - y
-    '''
-
-    class PySimpleRNN3(PyRNNBase):
-        def __init__(self, input_shape, output_shape):
-            super(RecurrentOpMultipleMemoryTest.PySimpleRNN3, self).__init__(
-                input_shape, output_shape)
-
-            seq_len, batch_size, input_dim = input_shape
-            self.h_boot1 = np.random.normal(size=(batch_size,
-                                                  input_dim)).astype("float32")
-            self.h_boot2 = np.random.normal(size=(batch_size,
-                                                  input_dim)).astype("float32")
-
-            men_dim = (seq_len, batch_size, input_dim)
-            self.mems1 = np.zeros(shape=men_dim).astype("float32")
-            self.mems2 = np.zeros(shape=men_dim).astype("float32")
-
-        def step(self, step_id, x):
-            if step_id == 0:
-                pre_mem1 = self.h_boot1
-                pre_mem2 = self.h_boot2
-            else:
-                pre_mem1 = self.mems1[step_id - 1]
-                pre_mem2 = self.mems2[step_id - 1]
-            self.mems1[step_id] = pre_mem1
-            self.mems2[step_id] = pre_mem2
-            self.y[step_id] = self.mems1[step_id] + self.mems2[step_id] + x
-
-    input_dim = 1
-    batch_size = 1
-    sent_len = 2
-
-    def setUp(self):
-        self.setup_program()
-
-        self.data_field = {"x", "h_boot1", "h_boot2"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = RecurrentOpMultipleMemoryTest.PySimpleRNN3(
-            self.input_shape, self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            self.output = layers.mean(self.create_rnn_op())
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-        h_boot1 = layers.data(
-            shape=[self.batch_size, self.input_dim],
-            dtype='float32',
-            name='h_boot1',
-            append_batch_size=False)
-        h_boot1.stop_gradient = False
-        h_boot2 = layers.data(
-            shape=[self.batch_size, self.input_dim],
-            dtype='float32',
-            name='h_boot2',
-            append_batch_size=False)
-        h_boot2.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-        with rnn.step():
-            h_pre1 = rnn.memory(init=h_boot1)
-            h_pre2 = rnn.memory(init=h_boot2)
-            x_t = rnn.step_input(x)
-
-            mem1 = layers.scale(x=h_pre1, scale=1.0)
-            mem2 = layers.scale(x=h_pre2, scale=1.0)
-            out = layers.sums(input=[mem1, x_t, mem2])
-
-            rnn.update_memory(h_pre1, mem1)
-            rnn.update_memory(h_pre2, mem2)
-            rnn.output(out)
-
-        return rnn()
-
-
-class RecurrentOpNoMemBootTest(RecurrentOpTest1):
-    '''
-    Test RNNOp with two memories
-    equation:
-        mem = x + mem_pre
-        y = mem
-    vars:
-        - x
-    memories:
-        - mem
-    outputs:
-       - y
-    '''
-
-    class PySimpleRNN4(PyRNNBase):
-        def __init__(self, input_shape, output_shape):
-            super(RecurrentOpNoMemBootTest.PySimpleRNN4, self).__init__(
-                input_shape, output_shape)
-            men_dim = input_shape
-            self.mems = np.zeros(shape=men_dim).astype("float32")
-
-        def step(self, step_id, x):
-            if step_id == 0:
-                pre_mem = np.zeros_like(x)
-            else:
-                pre_mem = self.mems[step_id - 1]
-            self.mems[step_id] = pre_mem + x
-            self.y[step_id] = self.mems[step_id]
-
-    input_dim = 1
-    batch_size = 1
-    sent_len = 2
-
-    def setUp(self):
-        self.setup_program()
-
-        self.data_field = {"x"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape,
-                                                            self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            self.output = layers.mean(self.create_rnn_op())
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-        with rnn.step():
-            mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x)
-            x_t = rnn.step_input(x)
-            mem = layers.elementwise_add(x=mem_pre, y=x_t)
-            rnn.update_memory(mem_pre, mem)
-            rnn.output(mem)
-
-        return rnn()
-
-
-class RecurrentOpSubBlockTest(RecurrentOpTest1):
-    '''
-    Test RNNOp with subblock variable
-    equation:
-        y_ = emb * w1
-        h_t = \concat([x, h_{t-1}])
-        h_t = h_t * w2
-        h_t = \\unsqueeze(h_t, 1)
-        h_t = \dot_attention(h_t, y_)
-        h_t = \squeeze(h_t, 1)
-        y = h_t
-    vars:
-        - x
-        - w1
-        - w2
-    memories:
-        - h
-    outputs:
-       - y
-    '''
-
-    class PySimpleRNN5(PyRNNBase):
-        def __init__(self, input_shape, output_shape):
-            super(RecurrentOpSubBlockTest.PySimpleRNN5, self).__init__(
-                input_shape, output_shape)
-
-            seq_len, batch_size, input_dim = input_shape
-            self.w1 = np.random.uniform(
-                -0.1, 0.1, size=(input_dim, input_dim)).astype("float32")
-            self.w2 = np.random.uniform(
-                -0.1, 0.1, size=(input_dim * 2, input_dim)).astype("float32")
-
-            self.emb = np.random.uniform(
-                -0.1, 0.1, size=(seq_len, batch_size,
-                                 input_dim)).astype("float32")
-
-            men_dim = (seq_len, batch_size, input_dim)
-            self.mems = np.zeros(shape=men_dim).astype("float32")
-            self.oy = np.matmul(self.emb, self.w1)
-
-        def step(self, step_id, x):
-            def dot_attention(query, memory):
-                attn = np.matmul(query, memory.transpose((0, 2, 1)))
-                weight = softmax(attn)
-                weight_memory = np.matmul(weight, memory)
-                return weight_memory, weight
-
-            def softmax(x):
-                return np.exp(x) / sum(np.exp(x))
-
-            if step_id == 0:
-                pre_mem = np.zeros_like(x)
-            else:
-                pre_mem = self.mems[step_id - 1]
-            concat_in = np.concatenate([x, pre_mem], 1)
-            new_mem = np.matmul(concat_in, self.w2)
-
-            new_mem = np.expand_dims(new_mem, 1)
-            new_mem, _ = dot_attention(new_mem, self.oy)
-            new_mem = np.squeeze(new_mem, 1)
-
-            self.mems[step_id] = new_mem
-            self.y[step_id] = self.mems[step_id]
-
-    input_dim = 2
-    batch_size = 3
-    sent_len = 3
-
-    def setUp(self):
-        self.setup_program()
-
-        self.data_field = {"x", "emb", "w1", "w2"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = RecurrentOpSubBlockTest.PySimpleRNN5(self.input_shape,
-                                                           self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            rnn_out = self.create_rnn_op()
-            self.output = layers.mean(rnn_out)
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-
-        emb = layers.data(
-            name='emb',
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            append_batch_size=False)
-        emb.stop_gradient = False
-
-        w1 = layers.data(
-            shape=[self.input_dim, self.input_dim],
-            dtype='float32',
-            name='w1',
-            append_batch_size=False)
-        w1.stop_gradient = False
-        w2 = layers.data(
-            shape=[self.input_dim * 2, self.input_dim],
-            dtype='float32',
-            name='w2',
-            append_batch_size=False)
-        w2.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-
-        def dot_attention(query, memory):
-            attn = layers.matmul(query, memory, transpose_y=True)
-            weight = layers.softmax(attn)
-            weight_memory = layers.matmul(weight, memory)
-
-            return weight_memory, weight
-
-        y = layers.matmul(emb, w1)
-        with rnn.step():
-            pre_h = rnn.memory(
-                shape=(self.sent_len, self.input_dim),
-                batch_ref=x,
-                init_value=0.0)
-            step_in = rnn.step_input(x)
-            concat_in = layers.concat([step_in, pre_h], 1)
-            new_h = layers.matmul(concat_in, w2)
-            new_h = layers.unsqueeze(new_h, [1])
-            new_h, _ = dot_attention(new_h, y)
-            new_h = layers.squeeze(new_h, [1])
-
-            rnn.update_memory(pre_h, new_h)
-            rnn.step_output(new_h)
-
-        return rnn()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
deleted file mode 100644
index 65fc1453d8db13ad9c85746c3bf148f898e8f788..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ /dev/null
@@ -1,401 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSumOp(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestMeanOp(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'dim': [1]}
-        self.outputs = {
-            'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestMaxOp(OpTest):
-    """Remove Max with subgradient from gradient check to confirm the success of CI."""
-
-    def setUp(self):
-        self.op_type = "reduce_max"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-1]}
-        self.outputs = {
-            'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim']))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestMinOp(OpTest):
-    """Remove Min with subgradient from gradient check to confirm the success of CI."""
-
-    def setUp(self):
-        self.op_type = "reduce_min"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [2]}
-        self.outputs = {
-            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestProdOp(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_prod"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].prod(axis=0)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestAllOp(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_all"
-        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.outputs = {'Out': self.inputs['X'].all()}
-        self.attrs = {'reduce_all': True}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAllOpWithDim(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_all"
-        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.attrs = {'dim': [1]}
-        self.outputs = {'Out': self.inputs['X'].all(axis=1)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAllOpWithKeepDim(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_all"
-        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.attrs = {'dim': [1], 'keep_dim': True}
-        self.outputs = {
-            'Out': np.expand_dims(
-                self.inputs['X'].all(axis=1), axis=1)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAnyOp(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_any"
-        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.outputs = {'Out': self.inputs['X'].any()}
-        self.attrs = {'reduce_all': True}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAnyOpWithDim(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_any"
-        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.attrs = {'dim': [1]}
-        self.outputs = {'Out': self.inputs['X'].any(axis=1)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAnyOpWithKeepDim(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_any"
-        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.attrs = {'dim': [1], 'keep_dim': True}
-        self.outputs = {
-            'Out': np.expand_dims(
-                self.inputs['X'].any(axis=1), axis=1)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class Test1DReduce(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random(20).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class Test2DReduce0(Test1DReduce):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.attrs = {'dim': [0]}
-        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
-
-
-class Test2DReduce1(Test1DReduce):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.attrs = {'dim': [1]}
-        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
-        }
-
-
-class Test3DReduce0(Test1DReduce):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.attrs = {'dim': [1]}
-        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
-        }
-
-
-class Test3DReduce1(Test1DReduce):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.attrs = {'dim': [2]}
-        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
-        }
-
-
-class Test3DReduce2(Test1DReduce):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.attrs = {'dim': [-2]}
-        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
-        }
-
-
-class Test3DReduce3(Test1DReduce):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.attrs = {'dim': [1, 2]}
-        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
-        }
-
-
-class TestKeepDimReduce(Test1DReduce):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [1], 'keep_dim': True}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
-                                        keepdims=self.attrs['keep_dim'])
-        }
-
-
-class TestReduceAll(Test1DReduce):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'reduce_all': True}
-        self.outputs = {'Out': self.inputs['X'].sum()}
-
-
-## reduction in multi dims
-class TestReduceMeanOpMultiAxises(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'dim': [1, 2]}
-        self.outputs = {'Out': self.inputs['X'].mean(axis=(1, 2))}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestReduceMaxOpMultiAxises(OpTest):
-    """Remove Max with subgradient from gradient check to confirm the success of CI."""
-
-    def setUp(self):
-        self.op_type = "reduce_max"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-2, -1]}
-        self.outputs = {
-            'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim']))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestReduceMinOpMultiAxises(OpTest):
-    """Remove Min with subgradient from gradient check to confirm the success of CI."""
-
-    def setUp(self):
-        self.op_type = "reduce_min"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [1, 2]}
-        self.outputs = {
-            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestKeepDimReduceSumMultiAxises(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-2, -1], 'keep_dim': True}
-        self.outputs = {
-            'Out':
-            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestReduceSumWithDimOne(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((10, 1, 1)).astype("float64")}
-        self.attrs = {'dim': [1, 2], 'keep_dim': True}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
-                                        keepdims=True)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestReduceSumWithNumelOne(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((1, 1)).astype("float64")}
-        self.attrs = {'dim': [1], 'keep_dim': False}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
-                                        keepdims=False)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestReduceMeanWithDimOne(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((10, 1, 1)).astype("float64")}
-        self.attrs = {'dim': [1], 'keep_dim': False}
-        self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=False)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestReduceMeanWithNumelOne(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((1, 1)).astype("float64")}
-        self.attrs = {'dim': [1], 'keep_dim': True}
-        self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=True)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestReduceAll(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((1, 1, 1)).astype("float64")}
-        self.attrs = {'reduce_all': True, 'keep_dim': False}
-        self.outputs = {'Out': self.inputs['X'].sum()}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reducescatter.py b/python/paddle/fluid/tests/unittests/test_reducescatter.py
deleted file mode 100644
index 58bcc11cd89c0573bc572008eb174e7070937cad..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_reducescatter.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-
-from test_collective_base import TestDistBase
-
-
-class TestReduceScatterOp(TestDistBase):
-    def _setup_config(self):
-        pass
-
-    def test_reducescatter(self):
-        self.check_with_place("collective_reducescatter_op.py",
-                              "reduce_scatter")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ref_by_trainer_id_op.py b/python/paddle/fluid/tests/unittests/test_ref_by_trainer_id_op.py
deleted file mode 100644
index e4872829edb325edcadbd4e1aefaf5014b800d3a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_ref_by_trainer_id_op.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestRefByTrainerIdOp(OpTest):
-    def setUp(self):
-        self.op_type = "ref_by_trainer_id"
-        param_baks = [("x%d" % x, np.random.random((10, 10)).astype("float32"))
-                      for x in range(10)]
-        self.inputs = {
-            'X': param_baks,
-            'TrainerId': np.array([8]).astype("int64")
-        }
-        self.outputs = {'Out': param_baks[8][1]}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_registry.py b/python/paddle/fluid/tests/unittests/test_registry.py
deleted file mode 100644
index 39cf64465ab1ed618ef4e63e1b9d7787d419f3d8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_registry.py
+++ /dev/null
@@ -1,33 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-
-import paddle.fluid as fluid
-import numpy as np
-from decorator_helper import prog_scope
-
-
-class TestRegistry(unittest.TestCase):
-    @prog_scope()
-    def test_registry_layer(self):
-        x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32')
-        output = fluid.layers.mean(x)
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        X = np.random.random((10, 10)).astype("float32")
-        mean_out = exe.run(feed={"X": X}, fetch_list=[output])
-        self.assertAlmostEqual(np.mean(X), mean_out[0], delta=1e-5)
diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py
deleted file mode 100644
index 62994eec7e7f56267a0990d9a5e3b5c62d7d5fe4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ /dev/null
@@ -1,235 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from functools import partial
-import contextlib
-import numpy as np
-import paddle
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-import paddle.fluid.framework as framework
-import paddle.fluid.optimizer as optimizer
-import paddle.fluid.regularizer as regularizer
-from paddle.fluid.backward import append_backward
-
-
-class TestL2DecayRegularizer(unittest.TestCase):
-    def test_l2decay_regularizer(self):
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            regularizer=regularizer.L2DecayRegularizer(0.5))
-        self.assertTrue(mul_x.regularizer is not None)
-        self.assertTrue(
-            isinstance(mul_x.regularizer, regularizer.L2DecayRegularizer))
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward(mean_out)
-        self.assertEqual(len(params_grads), 1)
-        count_ops = len(block.ops)
-        params_grads = optimizer.append_regularization_ops(params_grads)
-        self.assertEqual(len(params_grads), 1)
-        self.assertEqual(len(block.ops), count_ops + 2)
-        self.assertEqual(block.ops[-1].type, 'sum')
-        self.assertEqual(block.ops[-2].type, 'scale')
-
-
-class TestL1DecayRegularizer(unittest.TestCase):
-    def test_l2decay_regularizer(self):
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            regularizer=regularizer.L1DecayRegularizer(0.5))
-        self.assertTrue(mul_x.regularizer is not None)
-        self.assertTrue(
-            isinstance(mul_x.regularizer, regularizer.L1DecayRegularizer))
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward(mean_out)
-        self.assertEqual(len(params_grads), 1)
-        count_ops = len(block.ops)
-        params_grads = optimizer.append_regularization_ops(params_grads)
-        self.assertEqual(len(params_grads), 1)
-        self.assertEqual(len(block.ops), count_ops + 3)
-        self.assertEqual(block.ops[-1].type, 'sum')
-        self.assertEqual(block.ops[-2].type, 'scale')
-        self.assertEqual(block.ops[-3].type, 'sign')
-
-
-def bow_net(data,
-            label,
-            dict_dim,
-            is_sparse=False,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2):
-    """
-    BOW net
-    This model is from https://github.com/PaddlePaddle/models:
-    fluid/PaddleNLP/text_classification/nets.py
-    """
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
-    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
-    bow_tanh = fluid.layers.tanh(bow)
-    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
-    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
-    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    return avg_cost
-
-
-class TestRegularizer(unittest.TestCase):
-    def setUp(self):
-        self.word_dict = paddle.dataset.imdb.word_dict()
-        reader = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict), batch_size=8)()
-        self.train_data = [next(reader) for _ in range(5)]
-
-    def get_places(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        return places
-
-    @contextlib.contextmanager
-    def scope_prog_guard(self, main_prog, startup_prog):
-        scope = fluid.core.Scope()
-        with fluid.unique_name.guard():
-            with fluid.scope_guard(scope):
-                with fluid.program_guard(main_prog, startup_prog):
-                    yield
-
-    def run_program(self, place, feed_list):
-        exe = fluid.Executor(place)
-        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
-        exe.run(fluid.default_startup_program())
-
-        main_prog = fluid.default_main_program()
-        param_list = [var.name for var in main_prog.block(0).all_parameters()]
-
-        param_sum = []
-        for data in self.train_data:
-            out = exe.run(main_prog,
-                          feed=feeder.feed(data),
-                          fetch_list=param_list)
-            p_sum = 0
-            for v in out:
-                p_sum += np.sum(np.abs(v))
-            param_sum.append(p_sum)
-        return param_sum
-
-    def check_l2decay_regularizer(self, place, model):
-        main_prog = fluid.framework.Program()
-        startup_prog = fluid.framework.Program()
-        startup_prog.random_seed = 1
-        with self.scope_prog_guard(
-                main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-            avg_cost = model(data, label, len(self.word_dict))
-
-            optimizer = fluid.optimizer.Adagrad(
-                learning_rate=0.1,
-                regularization=fluid.regularizer.L2Decay(1.0))
-            optimizer.minimize(avg_cost)
-            param_sum = self.run_program(place, [data, label])
-        return param_sum
-
-    def check_l2decay(self, place, model):
-        main_prog = fluid.framework.Program()
-        startup_prog = fluid.framework.Program()
-        startup_prog.random_seed = 1
-        with self.scope_prog_guard(
-                main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-            avg_cost_l2 = model(data, label, len(self.word_dict))
-
-            param_list = fluid.default_main_program().block(0).all_parameters()
-            para_sum = []
-            for para in param_list:
-                para_mul = fluid.layers.square(x=para)
-                para_sum.append(fluid.layers.reduce_sum(input=para_mul))
-            avg_cost_l2 += fluid.layers.sums(para_sum) * .5
-
-            optimizer = fluid.optimizer.Adagrad(learning_rate=0.1)
-            optimizer.minimize(avg_cost_l2)
-            param_sum = self.run_program(place, [data, label])
-        return param_sum
-
-    def test_l2(self):
-        for place in self.get_places():
-            dense_sparse_p_sum = []
-            for sparse in [True, False]:
-                model = partial(bow_net, is_sparse=sparse)
-                framework_l2 = self.check_l2decay_regularizer(place, model)
-                l2 = self.check_l2decay(place, model)
-                assert len(l2) == len(framework_l2)
-                for i in range(len(l2)):
-                    assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5)
-                dense_sparse_p_sum.append(framework_l2)
-
-            assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1])
-            for i in range(len(dense_sparse_p_sum[0])):
-                assert np.isclose(
-                    a=dense_sparse_p_sum[0][i],
-                    b=dense_sparse_p_sum[1][i],
-                    rtol=5e-5)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
deleted file mode 100644
index a7fd271ae7dc554813e8c5f18487add8eff0a2b5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ /dev/null
@@ -1,213 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.layers.control_flow import lod_rank_table
-import numpy
-import functools
-
-
-def convert_to_offset(lod):
-    offset = [[0] for i in lod]
-    for i, level in enumerate(lod):
-        for seq_len in level:
-            offset[i].append(offset[i][-1] + seq_len)
-    return offset
-
-
-class TestReorderLoDTensor(unittest.TestCase):
-    num_seq = 5
-    # [name, shape, lod_level] pair indicating data info of source and target
-    data_desc = (['input', [9], 0], ['ref', [5], 1])
-
-    @classmethod
-    def setUpClass(cls):
-        cls.set_program()
-
-    @classmethod
-    def set_program(cls):
-        dat = fluid.layers.data(
-            name=cls.data_desc[0][0], shape=cls.data_desc[0][1])
-        dat.stop_gradient = False
-        rank_dat = fluid.layers.data(
-            name=cls.data_desc[1][0], shape=cls.data_desc[1][1])
-        table = lod_rank_table(rank_dat)
-        new_dat = fluid.layers.reorder_lod_tensor_by_rank(
-            x=dat, rank_table=table)
-        loss = fluid.layers.reduce_sum(new_dat)
-        fluid.backward.append_backward(loss=loss)
-        cls.fetch_list = [new_dat, cls.data_desc[0][0] + '@GRAD']
-
-    def run_program(self):
-        outputs = []
-        input_grads = []
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.set_inputs(place)
-            exe = fluid.Executor(place)
-            output, input_grad = exe.run(fluid.default_main_program(),
-                                         feed=self.inputs,
-                                         fetch_list=self.fetch_list,
-                                         return_numpy=False)
-            outputs.append(output)
-            input_grads.append(input_grad)
-        self.actual_outputs = outputs
-        self.actual_grads = input_grads
-
-    def set_data(self):
-        self.data = {}
-        for desc in self.data_desc:
-            data_name = desc[0]
-            data_shape = desc[1]
-            data_lod_level = desc[2]
-            data_lod = []
-            for i in range(data_lod_level):
-                lod_level_i = numpy.random.randint(
-                    low=1,
-                    high=5,
-                    size=self.num_seq if i == 0 else sum(lod_level_i)).tolist()
-                data_lod.append(lod_level_i)
-            data_value = numpy.random.random(
-                size=[sum(data_lod[-1]) if data_lod else self.num_seq
-                      ] + data_shape).astype('float32')
-            self.data[data_name] = (data_value, data_lod)
-
-    def set_inputs(self, place):
-        self.inputs = {}
-        for desc in self.data_desc:
-            tensor = fluid.Tensor()
-            tensor.set(self.data[desc[0]][0], place)
-            if self.data[desc[0]][1]:
-                tensor.set_recursive_sequence_lengths(self.data[desc[0]][1])
-            self.inputs[desc[0]] = tensor
-
-    def reorder(self):
-        level = 0
-        # compute the rank_table according to ref_lod
-        ref_lod = self.data[self.data_desc[1][0]][1][level]
-        rank_table = []  # list of (index, length)
-        for i in range(len(ref_lod)):
-            rank_table.append((i, ref_lod[i]))
-        rank_table = sorted(
-            rank_table, key=functools.cmp_to_key(lambda x, y: y[1] - x[1]))
-
-        # compute the input sequence info according to input_lod
-        input_value, input_lod = self.data[self.data_desc[0][0]]
-        offset_lod = convert_to_offset(input_lod)
-
-        input_table = []  # list of (offset, length, sub_lod)
-        if offset_lod:
-            for i in range(len(offset_lod[level]) - 1):
-                start_idx = i
-                end_idx = i + 1
-                sub_lod = []
-                for lod_level_i in offset_lod[level:]:
-                    sub_lod_i = []
-                    for idx in range(start_idx, end_idx):
-                        sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[
-                            idx])
-                    sub_lod.append(sub_lod_i)
-                    start_idx = lod_level_i[start_idx]
-                    end_idx = lod_level_i[end_idx]
-                input_table.append((start_idx, end_idx - start_idx, sub_lod))
-        else:
-            input_table = [(i, 1, []) for i in range(len(rank_table))]
-
-        # reorder by rank_table
-        output_value = numpy.zeros_like(input_value)
-        output_lod = []
-        offset = 0
-        for index, length in rank_table:
-            input_seq_start = input_table[index][0]
-            input_seq_len = input_table[index][1]
-            input_seq_end = input_seq_start + input_seq_len
-            output_value[offset:offset + input_seq_len] = input_value[
-                input_seq_start:input_seq_end]
-            offset += input_seq_len
-
-            input_seq_sub_lod = input_table[index][2]
-            if len(output_lod) == 0:
-                output_lod = [[] for i in input_seq_sub_lod]
-            for i, level in enumerate(input_seq_sub_lod):
-                output_lod[i].extend(level)
-        return output_value, output_lod
-
-    def test_reorder_lod_tensor(self):
-        self.data_desc[0][-1] = 2  # input is lod_tensor
-        self.set_data()
-        self.run_program()
-        # check output
-        expect_output, expect_output_lod = self.reorder()
-        for actual_output in self.actual_outputs:
-            self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_output), expect_output, atol=0.001))
-            self.assertEqual(expect_output_lod,
-                             actual_output.recursive_sequence_lengths())
-        # check gradient
-        expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
-        expect_grad_lod = self.data[self.data_desc[0][0]][1]
-        for actual_grad in self.actual_grads:
-            self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_grad), expect_grad, atol=0.001))
-            self.assertEqual(expect_grad_lod,
-                             actual_grad.recursive_sequence_lengths())
-
-    def test_reorder_tensor(self):
-        self.data_desc[0][-1] = 0  # input is tensor
-        self.set_data()
-        self.run_program()
-        # check output
-        expect_output, expect_output_lod = self.reorder()
-        for actual_output in self.actual_outputs:
-            self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_output), expect_output, atol=0.001))
-            self.assertEqual(expect_output_lod,
-                             actual_output.recursive_sequence_lengths())
-        # check gradient
-        expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
-        expect_grad_lod = self.data[self.data_desc[0][0]][1]
-        for actual_grad in self.actual_grads:
-            self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_grad), expect_grad, atol=0.001))
-            self.assertEqual(expect_grad_lod,
-                             actual_grad.recursive_sequence_lengths())
-
-        # compare outputs between LodTensors with explicit and implicit lod
-        # use the same data but set the input lod explicitly
-        input_lod = [[1] * len(self.data[self.data_desc[0][0]][0])]
-        self.inputs[self.data_desc[0][0]].set_recursive_sequence_lengths(
-            input_lod)
-        # preserve the output of LodTensor with implicit lod to compare
-        expect_output = [
-            numpy.array(actual_output) for actual_output in self.actual_outputs
-        ]
-        self.run_program()
-        for actual_output in self.actual_outputs:
-            self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_output), expect_output, atol=0.001))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
deleted file mode 100644
index beaffd055c11106e911ad7a9519b13dd362f5468..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ /dev/null
@@ -1,226 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-from op_test import OpTest
-import paddle.fluid as fluid
-
-
-# situation 1: have shape( list, no tensor), no actual shape(Tensor)
-class TestReshapeOp(OpTest):
-    def setUp(self):
-        self.init_data()
-        self.op_type = "reshape2"
-        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
-        self.attrs = {"shape": self.new_shape}
-        self.outputs = {
-            "Out": self.inputs["X"].reshape(self.infered_shape),
-            'XShape': np.random.random(self.ori_shape).astype("float32")
-        }
-
-    def init_data(self):
-        self.ori_shape = (2, 25)
-        self.new_shape = (5, 10)
-        self.infered_shape = (5, 10)
-
-    def test_check_output(self):
-
-        self.check_output(no_check_set=['XShape'])
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInfer1(TestReshapeOp):
-    def init_data(self):
-        self.ori_shape = (5, 10)
-        self.new_shape = (5, -1, 5)
-        self.infered_shape = (5, -1, 5)
-
-
-class TestReshapeOpDimInfer2(TestReshapeOp):
-    def init_data(self):
-        self.ori_shape = (2, 2, 6)
-        self.new_shape = (2, 0, 3, -1)
-        self.infered_shape = (2, 2, 3, -1)
-
-
-# situation 2: have shape(list, no tensor), have actual shape(Tensor)
-class TestReshapeOpWithInputShape(OpTest):
-    def setUp(self):
-        self.init_data()
-        self.op_type = "reshape2"
-
-        self.inputs = {
-            "X": np.random.random(self.ori_shape).astype("float32"),
-            "Shape": np.array(
-                self.actual_shape, dtype="int32")
-        }
-        self.attrs = {"shape": self.new_shape}
-        self.outputs = {
-            "Out": self.inputs["X"].reshape(self.actual_shape),
-            'XShape': np.random.random(self.ori_shape).astype("float32")
-        }
-
-    def init_data(self):
-        self.ori_shape = (6, 5)
-        self.new_shape = (0, -1, 5)
-        self.actual_shape = (2, 3, 5)
-
-    def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-# Situation 3: have shape(list, have tensor), no actual shape(Tensor)
-class TestReshapeOp_attr_ShapeTensor(OpTest):
-    def setUp(self):
-        self.init_data()
-        self.op_type = "reshape2"
-
-        shape_tensor = []
-        for index, ele in enumerate(self.new_shape):
-            shape_tensor.append(("x" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-
-        self.inputs = {
-            "X": np.random.random(self.ori_shape).astype("float32"),
-            'ShapeTensor': shape_tensor
-        }
-        self.attrs = {'shape': self.shape}
-        self.outputs = {
-            "Out": self.inputs["X"].reshape(self.infered_shape),
-            'XShape': np.random.random(self.ori_shape).astype("float32")
-        }
-
-    def init_data(self):
-        self.ori_shape = (2, 25)
-        self.new_shape = (5, 10)
-        self.infered_shape = (5, 10)
-        self.shape = (-1, -1)
-
-    def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInfer1_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor):
-    def init_data(self):
-        self.ori_shape = (5, 10)
-        self.new_shape = (5, -1, 5)
-        self.infered_shape = (5, -1, 5)
-        self.shape = (5, -1, -1)
-
-
-class TestReshapeOpDimInfer2_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor):
-    def init_data(self):
-        self.ori_shape = (2, 2, 6)
-        self.new_shape = (2, 0, 3, -1)
-        self.infered_shape = (2, 2, 3, -1)
-        self.shape = (2, 0, 3, -1)
-
-
-# Situation 4: have shape(Tensor), no actual shape(Tensor)
-class TestReshapeOp_attr_OnlyShape(OpTest):
-    def setUp(self):
-        self.init_data()
-        self.op_type = "reshape2"
-
-        self.inputs = {
-            "X": np.random.random(self.ori_shape).astype("float32"),
-            "Shape": np.array(
-                self.new_shape, dtype="int32")
-        }
-        self.attrs = {}
-        self.outputs = {
-            "Out": self.inputs["X"].reshape(self.infered_shape),
-            'XShape': np.random.random(self.ori_shape).astype("float32")
-        }
-
-    def init_data(self):
-        self.ori_shape = (2, 25)
-        self.new_shape = (5, 10)
-        self.infered_shape = (5, 10)
-
-    def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInfer1_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
-    def init_data(self):
-        self.ori_shape = (5, 10)
-        self.new_shape = (5, -1, 5)
-        self.infered_shape = (5, -1, 5)
-        self.shape = (5, -1, -1)
-
-
-class TestReshapeOpDimInfer2_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
-    def init_data(self):
-        self.ori_shape = (2, 2, 6)
-        self.new_shape = (2, 0, 3, -1)
-        self.infered_shape = (2, 2, 3, -1)
-        self.shape = (2, 0, 3, -1)
-
-
-# Test python API
-class TestReshapeAPI(OpTest):
-    # situation 1: have shape( list, no tensor), no actual shape(Tensor)
-    def test_1(self):
-        input = np.random.random([2, 25]).astype("float32")
-        shape = [2, 5, 5]
-        positive_five = fluid.layers.fill_constant([1], "int32", 5)
-        x = fluid.layers.data(
-            name="x", shape=[2, 25], append_batch_size=False, dtype="float32")
-
-        actual_shape = fluid.layers.data(
-            name="shape",
-            shape=[1, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        # situation 1: have shape( list, no tensor), no actual shape(Tensor)
-        out_1 = fluid.layers.reshape(x, shape)
-        # situation 2: have shape(list, no tensor), have actual shape(Tensor)
-        out_2 = fluid.layers.reshape(x, shape=shape, actual_shape=actual_shape)
-        # Situation 3: have shape(list, have tensor), no actual shape(Tensor)
-        out_3 = fluid.layers.reshape(x, shape=[positive_five, 10])
-        # Situation 4: have shape(Tensor), no actual shape(Tensor)
-        out_4 = fluid.layers.reshape(x, shape=actual_shape)
-
-        exe = fluid.Executor(place=fluid.CPUPlace())
-        res_1, res_2, res_3, res_4 = exe.run(
-            fluid.default_main_program(),
-            feed={"x": input,
-                  "shape": np.array([2, 5, 5]).astype("int32")},
-            fetch_list=[out_1, out_2, out_3, out_4])
-
-        assert np.array_equal(res_1, input.reshape(shape))
-        assert np.array_equal(res_2, input.reshape(shape))
-        assert np.array_equal(res_3, input.reshape([5, 10]))
-        assert np.array_equal(res_4, input.reshape(shape))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py b/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
deleted file mode 100644
index fafc7de33bc2e49dba699bba8466868f8901614d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
+++ /dev/null
@@ -1,412 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License")
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import math
-import copy
-from op_test import OpTest
-from test_anchor_generator_op import anchor_generator_in_python
-from test_multiclass_nms_op import iou
-from test_multiclass_nms_op import nms
-
-
-def multiclass_nms(prediction, class_num, keep_top_k, nms_threshold):
-    selected_indices = {}
-    num_det = 0
-    for c in range(class_num):
-        if c not in prediction.keys():
-            continue
-        cls_dets = prediction[c]
-        all_scores = np.zeros(len(cls_dets))
-        for i in range(all_scores.shape[0]):
-            all_scores[i] = cls_dets[i][4]
-        indices = nms(cls_dets, all_scores, 0.0, nms_threshold, -1, False, 1.0)
-        selected_indices[c] = indices
-        num_det += len(indices)
-
-    score_index = []
-    for c, indices in selected_indices.items():
-        for idx in indices:
-            score_index.append((prediction[c][idx][4], c, idx))
-
-    sorted_score_index = sorted(
-        score_index, key=lambda tup: tup[0], reverse=True)
-    if keep_top_k > -1 and num_det > keep_top_k:
-        sorted_score_index = sorted_score_index[:keep_top_k]
-        num_det = keep_top_k
-    nmsed_outs = []
-    for s, c, idx in sorted_score_index:
-        xmin = prediction[c][idx][0]
-        ymin = prediction[c][idx][1]
-        xmax = prediction[c][idx][2]
-        ymax = prediction[c][idx][3]
-        nmsed_outs.append([c + 1, s, xmin, ymin, xmax, ymax])
-
-    return nmsed_outs, num_det
-
-
-def retinanet_detection_out(boxes_list, scores_list, anchors_list, im_info,
-                            score_threshold, nms_threshold, nms_top_k,
-                            keep_top_k):
-    class_num = scores_list[0].shape[-1]
-    im_height, im_width, im_scale = im_info
-
-    num_level = len(scores_list)
-    prediction = {}
-    for lvl in range(num_level):
-        scores_per_level = scores_list[lvl]
-        scores_per_level = scores_per_level.flatten()
-        bboxes_per_level = boxes_list[lvl]
-        bboxes_per_level = bboxes_per_level.flatten()
-        anchors_per_level = anchors_list[lvl]
-        anchors_per_level = anchors_per_level.flatten()
-
-        thresh = score_threshold if lvl < (num_level - 1) else 0.0
-        selected_indices = np.argwhere(scores_per_level > thresh)
-        scores = scores_per_level[selected_indices]
-        sorted_indices = np.argsort(-scores, axis=0, kind='mergesort')
-        if nms_top_k > -1 and nms_top_k < sorted_indices.shape[0]:
-            sorted_indices = sorted_indices[:nms_top_k]
-
-        for i in range(sorted_indices.shape[0]):
-            idx = selected_indices[sorted_indices[i]]
-            idx = idx[0][0]
-            a = int(idx / class_num)
-            c = int(idx % class_num)
-            box_offset = a * 4
-            anchor_box_width = anchors_per_level[
-                box_offset + 2] - anchors_per_level[box_offset] + 1
-            anchor_box_height = anchors_per_level[
-                box_offset + 3] - anchors_per_level[box_offset + 1] + 1
-            anchor_box_center_x = anchors_per_level[
-                box_offset] + anchor_box_width / 2
-            anchor_box_center_y = anchors_per_level[box_offset +
-                                                    1] + anchor_box_height / 2
-
-            target_box_center_x = bboxes_per_level[
-                box_offset] * anchor_box_width + anchor_box_center_x
-            target_box_center_y = bboxes_per_level[
-                box_offset + 1] * anchor_box_height + anchor_box_center_y
-            target_box_width = math.exp(bboxes_per_level[box_offset +
-                                                         2]) * anchor_box_width
-            target_box_height = math.exp(bboxes_per_level[
-                box_offset + 3]) * anchor_box_height
-
-            pred_box_xmin = target_box_center_x - target_box_width / 2
-            pred_box_ymin = target_box_center_y - target_box_height / 2
-            pred_box_xmax = target_box_center_x + target_box_width / 2 - 1
-            pred_box_ymax = target_box_center_y + target_box_height / 2 - 1
-
-            pred_box_xmin = pred_box_xmin / im_scale
-            pred_box_ymin = pred_box_ymin / im_scale
-            pred_box_xmax = pred_box_xmax / im_scale
-            pred_box_ymax = pred_box_ymax / im_scale
-
-            pred_box_xmin = max(
-                min(pred_box_xmin, np.round(im_width / im_scale) - 1), 0.)
-            pred_box_ymin = max(
-                min(pred_box_ymin, np.round(im_height / im_scale) - 1), 0.)
-            pred_box_xmax = max(
-                min(pred_box_xmax, np.round(im_width / im_scale) - 1), 0.)
-            pred_box_ymax = max(
-                min(pred_box_ymax, np.round(im_height / im_scale) - 1), 0.)
-
-            if c not in prediction.keys():
-                prediction[c] = []
-            prediction[c].append([
-                pred_box_xmin, pred_box_ymin, pred_box_xmax, pred_box_ymax,
-                scores_per_level[idx]
-            ])
-
-    nmsed_outs, nmsed_num = multiclass_nms(prediction, class_num, keep_top_k,
-                                           nms_threshold)
-    return nmsed_outs, nmsed_num
-
-
-def batched_retinanet_detection_out(boxes, scores, anchors, im_info,
-                                    score_threshold, nms_threshold, nms_top_k,
-                                    keep_top_k):
-    batch_size = scores[0].shape[0]
-    det_outs = []
-    lod = []
-
-    for n in range(batch_size):
-        boxes_per_batch = []
-        scores_per_batch = []
-
-        num_level = len(scores)
-        for lvl in range(num_level):
-            boxes_per_batch.append(boxes[lvl][n])
-            scores_per_batch.append(scores[lvl][n])
-
-        nmsed_outs, nmsed_num = retinanet_detection_out(
-            boxes_per_batch, scores_per_batch, anchors, im_info[n],
-            score_threshold, nms_threshold, nms_top_k, keep_top_k)
-        lod.append(nmsed_num)
-        if nmsed_num == 0:
-            continue
-
-        det_outs.extend(nmsed_outs)
-    return det_outs, lod
-
-
-class TestRetinanetDetectionOutOp1(OpTest):
-    def set_argument(self):
-        self.score_threshold = 0.05
-        self.min_level = 3
-        self.max_level = 7
-        self.nms_threshold = 0.3
-        self.nms_top_k = 1000
-        self.keep_top_k = 200
-
-        self.scales_per_octave = 3
-        self.aspect_ratios = [1.0, 2.0, 0.5]
-        self.anchor_scale = 4
-        self.anchor_strides = [8, 16, 32, 64, 128]
-
-        self.box_size = 4
-        self.class_num = 80
-        self.batch_size = 1
-        self.input_channels = 20
-
-        self.layer_h = []
-        self.layer_w = []
-        num_levels = self.max_level - self.min_level + 1
-        for i in range(num_levels):
-            self.layer_h.append(2**(num_levels - i))
-            self.layer_w.append(2**(num_levels - i))
-
-    def init_test_input(self):
-        anchor_num = len(self.aspect_ratios) * self.scales_per_octave
-        num_levels = self.max_level - self.min_level + 1
-        self.scores_list = []
-        self.bboxes_list = []
-        self.anchors_list = []
-
-        for i in range(num_levels):
-            layer_h = self.layer_h[i]
-            layer_w = self.layer_w[i]
-
-            input_feat = np.random.random((self.batch_size, self.input_channels,
-                                           layer_h, layer_w)).astype('float32')
-            score = np.random.random(
-                (self.batch_size, self.class_num * anchor_num, layer_h,
-                 layer_w)).astype('float32')
-            score = np.transpose(score, [0, 2, 3, 1])
-            score = score.reshape((self.batch_size, -1, self.class_num))
-            box = np.random.random((self.batch_size, self.box_size * anchor_num,
-                                    layer_h, layer_w)).astype('float32')
-            box = np.transpose(box, [0, 2, 3, 1])
-            box = box.reshape((self.batch_size, -1, self.box_size))
-            anchor_sizes = []
-            for octave in range(self.scales_per_octave):
-                anchor_sizes.append(
-                    float(self.anchor_strides[i] * (2**octave)) /
-                    float(self.scales_per_octave) * self.anchor_scale)
-            anchor, var = anchor_generator_in_python(
-                input_feat=input_feat,
-                anchor_sizes=anchor_sizes,
-                aspect_ratios=self.aspect_ratios,
-                variances=[1.0, 1.0, 1.0, 1.0],
-                stride=[self.anchor_strides[i], self.anchor_strides[i]],
-                offset=0.5)
-            anchor = np.reshape(anchor, [-1, 4])
-            self.scores_list.append(score.astype('float32'))
-            self.bboxes_list.append(box.astype('float32'))
-            self.anchors_list.append(anchor.astype('float32'))
-
-        self.im_info = np.array([[256., 256., 1.5]]).astype(
-            'float32')  #im_height, im_width, scale
-
-    def setUp(self):
-        self.set_argument()
-        self.init_test_input()
-
-        nmsed_outs, lod = batched_retinanet_detection_out(
-            self.bboxes_list, self.scores_list, self.anchors_list, self.im_info,
-            self.score_threshold, self.nms_threshold, self.nms_top_k,
-            self.keep_top_k)
-        nmsed_outs = np.array(nmsed_outs).astype('float32')
-        self.op_type = 'retinanet_detection_output'
-        self.inputs = {
-            'BBoxes': [('b0', self.bboxes_list[0]), ('b1', self.bboxes_list[1]),
-                       ('b2', self.bboxes_list[2]), ('b3', self.bboxes_list[3]),
-                       ('b4', self.bboxes_list[4])],
-            'Scores': [('s0', self.scores_list[0]), ('s1', self.scores_list[1]),
-                       ('s2', self.scores_list[2]), ('s3', self.scores_list[3]),
-                       ('s4', self.scores_list[4])],
-            'Anchors':
-            [('a0', self.anchors_list[0]), ('a1', self.anchors_list[1]),
-             ('a2', self.anchors_list[2]), ('a3', self.anchors_list[3]),
-             ('a4', self.anchors_list[4])],
-            'ImInfo': (self.im_info, [[1, ]])
-        }
-        self.outputs = {'Out': (nmsed_outs, [lod])}
-        self.attrs = {
-            'score_threshold': self.score_threshold,
-            'nms_top_k': self.nms_top_k,
-            'nms_threshold': self.nms_threshold,
-            'keep_top_k': self.keep_top_k,
-            'nms_eta': 1.,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestRetinanetDetectionOutOp2(OpTest):
-    def set_argument(self):
-        self.score_threshold = 0.05
-        self.min_level = 3
-        self.max_level = 7
-        self.nms_threshold = 0.3
-        self.nms_top_k = 1000
-        self.keep_top_k = 200
-
-        self.scales_per_octave = 3
-        self.aspect_ratios = [1.0, 2.0, 0.5]
-        self.anchor_scale = 4
-        self.anchor_strides = [8, 16, 32, 64, 128]
-
-        self.box_size = 4
-        self.class_num = 80
-        self.batch_size = 1
-        self.input_channels = 20
-        # Here test the case there the shape of each FPN level
-        # is irrelevant.
-        self.layer_h = [1, 4, 8, 8, 16]
-        self.layer_w = [1, 4, 8, 8, 16]
-
-
-class TestRetinanetDetectionOutOpNo3(TestRetinanetDetectionOutOp1):
-    def set_argument(self):
-        # Here set 2.0 to test the case there is no outputs.
-        # In practical use, 0.0 < score_threshold < 1.0
-        self.score_threshold = 2.0
-        self.min_level = 3
-        self.max_level = 7
-        self.nms_threshold = 0.3
-        self.nms_top_k = 1000
-        self.keep_top_k = 200
-
-        self.scales_per_octave = 3
-        self.aspect_ratios = [1.0, 2.0, 0.5]
-        self.anchor_scale = 4
-        self.anchor_strides = [8, 16, 32, 64, 128]
-
-        self.box_size = 4
-        self.class_num = 80
-        self.batch_size = 1
-        self.input_channels = 20
-
-        self.layer_h = []
-        self.layer_w = []
-        num_levels = self.max_level - self.min_level + 1
-        for i in range(num_levels):
-            self.layer_h.append(2**(num_levels - i))
-            self.layer_w.append(2**(num_levels - i))
-
-
-class TestRetinanetDetectionOutOpNo4(TestRetinanetDetectionOutOp1):
-    def set_argument(self):
-        self.score_threshold = 0.05
-        self.min_level = 2
-        self.max_level = 5
-        self.nms_threshold = 0.3
-        self.nms_top_k = 1000
-        self.keep_top_k = 200
-
-        self.scales_per_octave = 3
-        self.aspect_ratios = [1.0, 2.0, 0.5]
-        self.anchor_scale = 4
-        self.anchor_strides = [8, 16, 32, 64, 128]
-
-        self.box_size = 4
-        self.class_num = 80
-        self.batch_size = 1
-        self.input_channels = 20
-
-        self.layer_h = []
-        self.layer_w = []
-        num_levels = self.max_level - self.min_level + 1
-        for i in range(num_levels):
-            self.layer_h.append(2**(num_levels - i))
-            self.layer_w.append(2**(num_levels - i))
-
-    def setUp(self):
-        self.set_argument()
-        self.init_test_input()
-
-        nmsed_outs, lod = batched_retinanet_detection_out(
-            self.bboxes_list, self.scores_list, self.anchors_list, self.im_info,
-            self.score_threshold, self.nms_threshold, self.nms_top_k,
-            self.keep_top_k)
-        nmsed_outs = np.array(nmsed_outs).astype('float32')
-        self.op_type = 'retinanet_detection_output'
-        self.inputs = {
-            'BBoxes':
-            [('b0', self.bboxes_list[0]), ('b1', self.bboxes_list[1]),
-             ('b2', self.bboxes_list[2]), ('b3', self.bboxes_list[3])],
-            'Scores': [('s0', self.scores_list[0]), ('s1', self.scores_list[1]),
-                       ('s2', self.scores_list[2]),
-                       ('s3', self.scores_list[3])],
-            'Anchors':
-            [('a0', self.anchors_list[0]), ('a1', self.anchors_list[1]),
-             ('a2', self.anchors_list[2]), ('a3', self.anchors_list[3])],
-            'ImInfo': (self.im_info, [[1, ]])
-        }
-        self.outputs = {'Out': (nmsed_outs, [lod])}
-        self.attrs = {
-            'score_threshold': self.score_threshold,
-            'nms_top_k': self.nms_top_k,
-            'nms_threshold': self.nms_threshold,
-            'keep_top_k': self.keep_top_k,
-            'nms_eta': 1.,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestRetinanetDetectionOutOpNo5(TestRetinanetDetectionOutOp1):
-    def set_argument(self):
-        self.score_threshold = 0.05
-        self.min_level = 3
-        self.max_level = 7
-        self.nms_threshold = 0.3
-        self.nms_top_k = 100
-        self.keep_top_k = 10
-
-        self.scales_per_octave = 3
-        self.aspect_ratios = [1.0, 2.0, 0.5]
-        self.anchor_scale = 4
-        self.anchor_strides = [8, 16, 32, 64, 128]
-
-        self.box_size = 4
-        self.class_num = 80
-        self.batch_size = 1
-        self.input_channels = 20
-
-        self.layer_h = []
-        self.layer_w = []
-        num_levels = self.max_level - self.min_level + 1
-        for i in range(num_levels):
-            self.layer_h.append(2**(num_levels - i))
-            self.layer_w.append(2**(num_levels - i))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py
deleted file mode 100644
index e83f548c228c7c045ff795e882738ea56e3f2d24..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_reverse_op.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestReverseOp(OpTest):
-    def initTestCase(self):
-        self.x = np.random.random((3, 4)).astype('float32')
-        self.axis = [0]
-
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = "reverse"
-        self.inputs = {"X": self.x}
-        self.attrs = {'axis': self.axis}
-        out = self.x
-        for a in self.axis:
-            out = np.flip(out, axis=a)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestCase0(TestReverseOp):
-    def initTestCase(self):
-        self.x = np.random.random((3, 4)).astype('float32')
-        self.axis = [1]
-
-
-class TestCase1(TestReverseOp):
-    def initTestCase(self):
-        self.x = np.random.random((3, 4)).astype('float32')
-        self.axis = [0, 1]
-
-
-class TestCase2(TestReverseOp):
-    def initTestCase(self):
-        self.x = np.random.random((3, 4, 5)).astype('float32')
-        self.axis = [0, 2]
-
-
-class TestCase3(TestReverseOp):
-    def initTestCase(self):
-        self.x = np.random.random((3, 4, 5)).astype('float32')
-        self.axis = [1, 2]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
deleted file mode 100644
index eb12bc741767340a3e7e3580a8b95065d4267693..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ /dev/null
@@ -1,226 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import paddle.fluid as fluid
-
-
-def create_selected_rows_and_tensor(scope, place, height, row_num,
-                                    embedding_size):
-    sr = scope.var("@selected_rows@").get_selected_rows()
-    tensor = scope.var("grad").get_tensor()
-
-    rows = np.random.random_integers(
-        low=0, high=height - 1, size=[row_num, ]).astype('int64')
-    sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32')
-
-    sr.set_height(height)
-    sr.set_rows(rows)
-    sr.get_tensor().set(sr_val, place)
-
-    tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32')
-    for i in range(row_num):
-        row = rows[i]
-        tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :]
-
-    tensor.set(tensor_val, place)
-    return tensor_val, sr_val
-
-
-class TestBase(unittest.TestCase):
-    def setup(self,
-              place,
-              is_sparse,
-              centered,
-              size,
-              row_num=None,
-              epsilon=1e-6):
-        np.random.seed(5)  # fix seed
-
-        self.scope = fluid.global_scope()
-        self.place = place
-
-        self.param_name = "param"
-        self.param = np.random.random(size).astype("float32")
-
-        self.mean_square_name = "mean_square"
-        self.mean_square = np.random.uniform(
-            low=1, high=2, size=size).astype("float32")
-
-        self.mean_grad_name = "mean_grad"
-        self.mean_grad = np.random.random(size).astype("float32")
-
-        self.lr_name = "lr"
-        self.learning_rate = np.array([0.01]).astype("float32")
-
-        self.grad_name = "grad"
-
-        self.is_sparse = is_sparse
-        if self.is_sparse:
-            self.grad_sr_name = "@selected_rows@"
-            self.grad, self.grad_sr = create_selected_rows_and_tensor(
-                self.scope, place, size[0], row_num, size[1])
-        else:
-            self.grad = np.random.random(size).astype("float32")
-            grad_tensor = self.scope.var(self.grad_name).get_tensor()
-            grad_tensor.set(self.grad, place)
-
-        self.moment_name = "moment"
-        self.moment = np.random.uniform(
-            low=0, high=1, size=size).astype("float32")
-
-        self.epsilon = epsilon
-        self.decay = 0.9
-        self.momentum = 0.1
-        self.centered = centered
-
-        self.ms_out = self.decay * self.mean_square + (1 - self.decay
-                                                       ) * self.grad * self.grad
-        if centered:
-            self.mg_out = self.decay * self.mean_grad + (1 - self.decay
-                                                         ) * self.grad
-            self.moment_out = self.momentum * self.moment + \
-                              self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon)
-        else:
-            self.moment_out = self.momentum * self.moment + \
-                              self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon)
-
-        self.param_out = self.param - self.moment_out
-
-        # create and initialize Param Variable
-        self.param_tensor = self.scope.var(self.param_name).get_tensor()
-        self.param_tensor.set(self.param, place)
-
-        self.mean_square_tensor = self.scope.var(
-            self.mean_square_name).get_tensor()
-        self.mean_square_tensor.set(self.mean_square, place)
-
-        lr = self.scope.var(self.lr_name).get_tensor()
-        lr.set(self.learning_rate, place)
-
-        self.moment_tensor = self.scope.var(self.moment_name).get_tensor()
-        self.moment_tensor.set(self.moment, place)
-
-        if self.centered:
-            self.mean_grad_tensor = self.scope.var(
-                self.mean_grad_name).get_tensor()
-            self.mean_grad_tensor.set(self.mean_grad, place)
-
-    def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
-        self.assertTrue(
-            np.allclose(
-                actual_t, expect_t, atol=atol),
-            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
-            + str(expect_t) + "\n" + "But Got" + str(actual_t))
-
-
-class TestRmspropOp(TestBase):
-    def check_with_place(self,
-                         place,
-                         is_sparse,
-                         centered,
-                         size,
-                         row_num=None,
-                         epsilon=1e-6):
-        self.setup(place, is_sparse, centered, size, row_num, epsilon)
-        self.run_and_check()
-
-    def run_and_check(self):
-        grad_name = self.grad_sr_name if self.is_sparse else self.grad_name
-
-        kwargs = {
-            'Param': self.param_name,
-            'Grad': grad_name,
-            'MeanSquare': self.mean_square_name,
-            'Moment': self.moment_name,
-            'LearningRate': self.lr_name,
-            'ParamOut': self.param_name,
-            'MeanSquareOut': self.mean_square_name,
-            'MomentOut': self.moment_name,
-            'epsilon': self.epsilon,
-            'decay': self.decay,
-            'momentum': self.momentum,
-            'centered': self.centered
-        }
-
-        if self.centered:
-            kwargs['MeanGrad'] = self.mean_grad_name
-            kwargs['MeanGradOut'] = self.mean_grad_name
-
-        rmsprop_op = Operator('rmsprop', **kwargs)
-        atol = 1e-6
-
-        rmsprop_op.run(self.scope, self.place)
-
-        self.check(
-            np.array(self.mean_square_tensor),
-            self.ms_out,
-            self.place,
-            self.mean_square_name,
-            atol=atol)
-        self.check(
-            np.array(self.moment_tensor),
-            self.moment_out,
-            self.place,
-            self.moment_name,
-            atol=atol)
-        self.check(
-            np.array(self.param_tensor),
-            self.param_out,
-            self.place,
-            self.param_name,
-            atol=atol)
-
-        if self.centered:
-            self.check(
-                np.array(self.mean_grad_tensor), self.mg_out, self.place,
-                self.mean_grad_name)
-
-    def test_rmsprop(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        size = (128, 320)
-        for place in places:
-            for centered in [False, True]:
-                with fluid.scope_guard(core.Scope()):
-                    self.check_with_place(
-                        place, is_sparse=False, centered=centered, size=size)
-
-                with fluid.scope_guard(core.Scope()):
-                    self.check_with_place(
-                        place,
-                        is_sparse=True,
-                        centered=centered,
-                        row_num=512,
-                        size=size)
-
-                with fluid.scope_guard(core.Scope()):
-                    self.check_with_place(
-                        place,
-                        is_sparse=True,
-                        centered=centered,
-                        row_num=60,
-                        size=size)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
deleted file mode 100644
index 9bfec8e9bdd8c4667fb19f3dd479b759d6dd665b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-from paddle.fluid.framework import Program
-from paddle.fluid.executor import Executor
-from paddle.fluid.backward import append_backward
-import numpy as np
-import paddle.fluid.core as core
-
-
-class RNNMemoryHelperOpTest(unittest.TestCase):
-    def setUp(self):
-        self.program = Program()
-        self.place = core.CPUPlace()
-
-        self.X = self.program.global_block().create_var(
-            name='X', shape=[2, 3], dtype='float32')
-        self.Out = self.program.global_block().create_var(
-            name='Out', shape=[2, 3], dtype='float32')
-        self.program.global_block().append_op(
-            type='rnn_memory_helper',
-            inputs={"X": self.X},
-            outputs={"Out": self.Out},
-            attrs={})
-
-    def test_forward(self):
-        x_np = np.random.normal(size=(2, 3)).astype("float32")
-        self.feed_map = {'X': x_np}
-        self.fetch_list = [self.Out]
-        exe = Executor(self.place)
-        out = exe.run(self.program,
-                      feed=self.feed_map,
-                      fetch_list=self.fetch_list)
-        self.assertTrue(np.allclose(out[0], x_np, rtol=1e-5))
-
-
-class RNNMemoryHelperGradOpTest(unittest.TestCase):
-    def setUp(self):
-        self.program = Program()
-        self.place = core.CPUPlace()
-
-        self.input_names = ['X', 'Out', 'Out@GRAD']
-        self.input_vars = {
-            name: self.program.global_block().create_var(
-                name=name, shape=[2, 3], dtype='float32')
-            for name in self.input_names
-        }
-
-        self.output_names = ['X@GRAD']
-        self.output_vars = {
-            name: self.program.global_block().create_var(
-                name=name, shape=[2, 3], dtype='float32')
-            for name in self.output_names
-        }
-
-        self.program.global_block().append_op(
-            type='rnn_memory_helper_grad',
-            inputs=self.input_vars,
-            outputs=self.output_vars,
-            attrs={})
-
-    def test_backward(self):
-        self.feed_map = {
-            name: np.random.normal(size=(2, 3)).astype("float32")
-            for name in self.input_names
-        }
-        self.fetch_list = [self.output_vars['X@GRAD']]
-
-        exe = Executor(self.place)
-        out = exe.run(self.program,
-                      feed=self.feed_map,
-                      fetch_list=self.fetch_list)
-        np.isclose(out[0], self.feed_map['Out@GRAD'], rtol=1e-5)
-
-
-class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
-    def setUp(self):
-        self.program = Program()
-        self.fake_program = Program()
-        self.place = core.CPUPlace()
-
-        self.input_names = ['X', 'Out']
-        self.input_vars = {
-            name: self.program.global_block().create_var(
-                name=name, shape=[2, 3], dtype='float32')
-            for name in self.input_names
-        }
-        self.input_vars["Out@GRAD"] = \
-            self.fake_program.global_block().create_var(
-                name="Out@GRAD", shape=[2, 3], dtype='float32')
-
-        self.output_names = ['X@GRAD']
-        self.output_vars = {
-            name: self.program.global_block().create_var(
-                name=name, shape=[2, 3], dtype='float32')
-            for name in self.output_names
-        }
-
-        self.program.global_block().append_op(
-            type='rnn_memory_helper_grad',
-            inputs=self.input_vars,
-            outputs=self.output_vars,
-            attrs={})
-
-    def test_backward(self):
-        self.feed_map = {
-            name: np.random.normal(size=(2, 3)).astype("float32")
-            for name in ['X', 'Out']
-        }
-        self.fetch_list = [self.output_vars['X@GRAD']]
-
-        exe = Executor(self.place)
-        out = exe.run(self.program,
-                      feed=self.feed_map,
-                      fetch_list=self.fetch_list)
-        self.assertTrue(
-            np.allclose(
-                out[0], np.zeros(shape=(2, 3)).astype("float32"), rtol=1e-5))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
deleted file mode 100644
index aad2eaed94a356d06afb7cd461eecefa2de98d8c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py
+++ /dev/null
@@ -1,174 +0,0 @@
-#    Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-import sys
-from op_test import OpTest
-
-
-class TestROIAlignOp(OpTest):
-    def set_data(self):
-        self.init_test_case()
-        self.make_rois()
-        self.calc_roi_align()
-        self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
-        self.attrs = {
-            'spatial_scale': self.spatial_scale,
-            'pooled_height': self.pooled_height,
-            'pooled_width': self.pooled_width,
-            'sampling_ratio': self.sampling_ratio
-        }
-
-        self.outputs = {'Out': self.out_data}
-
-    def init_test_case(self):
-        self.batch_size = 3
-        self.channels = 3
-        self.height = 8
-        self.width = 6
-
-        # n, c, h, w
-        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
-
-        self.spatial_scale = 1.0 / 2.0
-        self.pooled_height = 2
-        self.pooled_width = 2
-        self.sampling_ratio = -1
-
-        self.x = np.random.random(self.x_dim).astype('float32')
-
-    def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w,
-                 bin_size_h, bin_size_w):
-        count = roi_bin_grid_h * roi_bin_grid_w
-        bilinear_pos = np.zeros(
-            [self.channels, self.pooled_height, self.pooled_width, count, 4],
-            np.float32)
-        bilinear_w = np.zeros(
-            [self.pooled_height, self.pooled_width, count, 4], np.float32)
-        for ph in range(self.pooled_width):
-            for pw in range(self.pooled_height):
-                c = 0
-                for iy in range(roi_bin_grid_h):
-                    y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \
-                        bin_size_h / roi_bin_grid_h
-                    for ix in range(roi_bin_grid_w):
-                        x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \
-                            bin_size_w / roi_bin_grid_w
-                        if y < -1.0 or y > self.height or \
-                               x < -1.0 or x > self.width:
-                            continue
-                        if y <= 0:
-                            y = 0
-                        if x <= 0:
-                            x = 0
-                        y_low = int(y)
-                        x_low = int(x)
-                        if y_low >= self.height - 1:
-                            y = y_high = y_low = self.height - 1
-                        else:
-                            y_high = y_low + 1
-                        if x_low >= self.width - 1:
-                            x = x_high = x_low = self.width - 1
-                        else:
-                            x_high = x_low + 1
-                        ly = y - y_low
-                        lx = x - x_low
-                        hy = 1 - ly
-                        hx = 1 - lx
-                        for ch in range(self.channels):
-                            bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low,
-                                                                 x_low]
-                            bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low,
-                                                                 x_high]
-                            bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high,
-                                                                 x_low]
-                            bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high,
-                                                                 x_high]
-                        bilinear_w[ph, pw, c, 0] = hy * hx
-                        bilinear_w[ph, pw, c, 1] = hy * lx
-                        bilinear_w[ph, pw, c, 2] = ly * hx
-                        bilinear_w[ph, pw, c, 3] = ly * lx
-                        c = c + 1
-        return bilinear_pos, bilinear_w
-
-    def calc_roi_align(self):
-        self.out_data = np.zeros(
-            (self.rois_num, self.channels, self.pooled_height,
-             self.pooled_width)).astype('float32')
-
-        for i in range(self.rois_num):
-            roi = self.rois[i]
-            roi_batch_id = int(roi[0])
-            x_i = self.x[roi_batch_id]
-            roi_xmin = roi[1] * self.spatial_scale
-            roi_ymin = roi[2] * self.spatial_scale
-            roi_xmax = roi[3] * self.spatial_scale
-            roi_ymax = roi[4] * self.spatial_scale
-            roi_width = max(roi_xmax - roi_xmin, 1)
-            roi_height = max(roi_ymax - roi_ymin, 1)
-            bin_size_h = float(roi_height) / float(self.pooled_height)
-            bin_size_w = float(roi_width) / float(self.pooled_width)
-            roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \
-                                 math.ceil(roi_height / self.pooled_height)
-            roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \
-                                 math.ceil(roi_width / self.pooled_width)
-            count = int(roi_bin_grid_h * roi_bin_grid_w)
-            pre_size = count * self.pooled_width * self.pooled_height
-            bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin,
-                                                     int(roi_bin_grid_h),
-                                                     int(roi_bin_grid_w),
-                                                     bin_size_h, bin_size_w)
-            for ch in range(self.channels):
-                align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1)
-                output_val = align_per_bin.mean(axis=-1)
-                self.out_data[i, ch, :, :] = output_val
-
-    def make_rois(self):
-        rois = []
-        self.rois_lod = [[]]
-        for bno in range(self.batch_size):
-            self.rois_lod[0].append(bno + 1)
-            for i in range(bno + 1):
-                x1 = np.random.random_integers(
-                    0, self.width // self.spatial_scale - self.pooled_width)
-                y1 = np.random.random_integers(
-                    0, self.height // self.spatial_scale - self.pooled_height)
-
-                x2 = np.random.random_integers(x1 + self.pooled_width,
-                                               self.width // self.spatial_scale)
-                y2 = np.random.random_integers(
-                    y1 + self.pooled_height, self.height // self.spatial_scale)
-
-                roi = [bno, x1, y1, x2, y2]
-                rois.append(roi)
-        self.rois_num = len(rois)
-        self.rois = np.array(rois).astype("float32")
-
-    def setUp(self):
-        self.op_type = "roi_align"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py b/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py
deleted file mode 100644
index e742993c2bf2c0a49e95feff63d97b98c9c85d44..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py
+++ /dev/null
@@ -1,318 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License")
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUWARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-import sys
-import paddle.compat as cpt
-from op_test import OpTest
-from math import sqrt
-from math import floor
-
-
-def gt_e(a, b):
-    return a > b or abs(a - b) < 1e-4
-
-
-def gt(a, b):
-    return (a - b) > 1e-4
-
-
-def lt_e(a, b):
-    return a < b or abs(a - b) < 1e-4
-
-
-def in_quad(x, y, roi_x, roi_y):
-    # check if (x, y) is in the boundary of roi
-    for i in range(4):
-        xs = roi_x[i]
-        ys = roi_y[i]
-        xe = roi_x[(i + 1) % 4]
-        ye = roi_y[(i + 1) % 4]
-        if abs(ys - ye) < 1e-4:
-            if abs(y - ys) < 1e-4 and abs(y - ye) < 1e-4 and gt_e(
-                    x, min(xs, xe)) and lt_e(x, max(xs, xe)):
-                return True
-        else:
-            intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs
-            if abs(intersec_x - x) < 1e-4 and gt_e(y, min(ys, ye)) and lt_e(
-                    y, max(ys, ye)):
-                return True
-    n_cross = 0
-    for i in range(4):
-        xs = roi_x[i]
-        ys = roi_y[i]
-        xe = roi_x[(i + 1) % 4]
-        ye = roi_y[(i + 1) % 4]
-        if abs(ys - ye) < 1e-4:
-            continue
-        if lt_e(y, min(ys, ye)) or gt(y, max(ys, ye)):
-            continue
-        intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs
-        if abs(intersec_x - x) < 1e-4:
-            return True
-        if gt(intersec_x, x):
-            n_cross += 1
-    return (n_cross % 2 == 1)
-
-
-def get_transform_matrix(transformed_width, transformed_height, roi_x, roi_y):
-    x0 = roi_x[0]
-    x1 = roi_x[1]
-    x2 = roi_x[2]
-    x3 = roi_x[3]
-    y0 = roi_y[0]
-    y1 = roi_y[1]
-    y2 = roi_y[2]
-    y3 = roi_y[3]
-
-    len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1))
-    len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2))
-    len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3))
-    len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0))
-    estimated_height = (len2 + len4) / 2.0
-    estimated_width = (len1 + len3) / 2.0
-
-    normalized_height = max(2, transformed_height)
-    normalized_width = round(estimated_width *
-                             (normalized_height - 1) / estimated_height) + 1
-    normalized_width = max(2, min(normalized_width, transformed_width))
-
-    dx1 = x1 - x2
-    dx2 = x3 - x2
-    dx3 = x0 - x1 + x2 - x3
-    dy1 = y1 - y2
-    dy2 = y3 - y2
-    dy3 = y0 - y1 + y2 - y3
-    matrix = np.zeros([9])
-    matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / (
-        normalized_width - 1)
-    matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / (
-        normalized_height - 1)
-    matrix[8] = 1
-
-    matrix[3] = (y1 - y0 + matrix[6] *
-                 (normalized_width - 1) * y1) / (normalized_width - 1)
-    matrix[4] = (y3 - y0 + matrix[7] *
-                 (normalized_height - 1) * y3) / (normalized_height - 1)
-    matrix[5] = y0
-
-    matrix[0] = (x1 - x0 + matrix[6] *
-                 (normalized_width - 1) * x1) / (normalized_width - 1)
-    matrix[1] = (x3 - x0 + matrix[7] *
-                 (normalized_height - 1) * x3) / (normalized_height - 1)
-    matrix[2] = x0
-    return matrix
-
-
-def get_source_coords(matrix, out_w, out_h):
-    u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]
-    v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]
-    w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]
-    in_w = u / w
-    in_h = v / w
-    return in_w, in_h
-
-
-def bilinear_interpolate(in_data, in_n, in_c, in_w, in_h):
-
-    batch_size = in_data.shape[0]
-    channels = in_data.shape[1]
-    height = in_data.shape[2]
-    width = in_data.shape[3]
-
-    if gt(-0.5, in_w) or gt(in_w, width - 0.5) or gt(-0.5, in_h) or gt(
-            in_h, height - 0.5):
-        return 0.0
-
-    if gt(0, in_w):
-        in_w = 0
-    if gt(0, in_h):
-        in_h = 0
-
-    in_w_floor = floor(in_w)
-    in_h_floor = floor(in_h)
-
-    if gt_e(in_w_floor, width - 1):
-        in_w_ceil = width - 1
-        in_w_floor = width - 1
-        in_w = in_w_floor
-    else:
-        in_w_ceil = in_w_floor + 1
-
-    if gt_e(in_h_floor, height - 1):
-        in_h_ceil = height - 1
-        in_h_floor = height - 1
-        in_h = in_h_floor
-    else:
-        in_h_ceil = in_h_floor + 1
-
-    w_floor = in_w - in_w_floor
-    h_floor = in_h - in_h_floor
-    w_ceil = 1 - w_floor
-    h_ceil = 1 - h_floor
-    v1 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_floor)]
-    v2 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_floor)]
-    v3 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_ceil)]
-    v4 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_ceil)]
-    w1 = w_ceil * h_ceil
-    w2 = w_ceil * h_floor
-    w3 = w_floor * h_floor
-    w4 = w_floor * h_ceil
-    val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4
-    return val
-
-
-def lod_convert(lod):
-    ret = [0]
-    for count in lod:
-        ret.append(ret[-1] + count)
-    return ret
-
-
-def roi_transform(in_data, rois, rois_lod, transformed_height,
-                  transformed_width, spatial_scale):
-    channels = in_data.shape[1]
-    in_height = in_data.shape[2]
-    in_width = in_data.shape[3]
-    rois_num = rois.shape[0]
-
-    roi2image = [0] * rois_num
-    rois_lod = lod_convert(rois_lod[0])
-    for i in range(len(rois_lod) - 1):
-        for j in range(rois_lod[i], rois_lod[i + 1]):
-            roi2image[j] = i
-
-    out = np.zeros([rois_num, channels, transformed_height, transformed_width])
-    mask = np.zeros(
-        [rois_num, 1, transformed_height, transformed_width]).astype('int')
-    matrix = np.zeros([rois_num, 9], dtype=in_data.dtype)
-    for n in range(rois_num):
-        roi_x = []
-        roi_y = []
-        for k in range(4):
-            roi_x.append(rois[n][2 * k] * spatial_scale)
-            roi_y.append(rois[n][2 * k + 1] * spatial_scale)
-        image_id = roi2image[n]
-        transform_matrix = get_transform_matrix(
-            transformed_width, transformed_height, roi_x, roi_y)
-        matrix[n] = transform_matrix
-        for c in range(channels):
-            for out_h in range(transformed_height):
-                for out_w in range(transformed_width):
-                    in_w, in_h = get_source_coords(transform_matrix, out_w,
-                                                   out_h)
-                    if in_quad(in_w, in_h, roi_x, roi_y) and gt_e(
-                            in_w, -0.5) and lt_e(in_w, in_width - 0.5) and gt_e(
-                                in_h, -0.5) and lt_e(in_h, in_height - 0.5):
-                        out[n][c][out_h][out_w] = bilinear_interpolate(
-                            in_data, image_id, c, in_w, in_h)
-                        mask[n][0][out_h][out_w] = 1
-                    else:
-                        out[n][c][out_h][out_w] = 0.0
-                        mask[n][0][out_h][out_w] = 0
-    return out.astype("float32"), mask, matrix
-
-
-class TestROIPoolOp(OpTest):
-    def set_data(self):
-        self.init_test_case()
-        self.make_rois()
-
-        self.inputs = {'X': self.x, 'ROIs': (self.rois, self.rois_lod)}
-
-        self.attrs = {
-            'spatial_scale': self.spatial_scale,
-            'transformed_height': self.transformed_height,
-            'transformed_width': self.transformed_width
-        }
-        out, mask, transform_matrix = roi_transform(
-            self.x, self.rois, self.rois_lod, self.transformed_height,
-            self.transformed_width, self.spatial_scale)
-        self.outputs = {
-            'Out': out,
-            'Mask': mask,
-            'TransformMatrix': transform_matrix
-        }
-
-    def init_test_case(self):
-        self.batch_size = 2
-        self.channels = 2
-        self.height = 8
-        self.width = 8
-
-        # n, c, h, w
-        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
-
-        self.spatial_scale = 1.0 / 2.0
-        self.transformed_height = 2
-        self.transformed_width = 3
-
-        self.x = np.random.random(self.x_dim).astype('float32')
-
-    def make_rois(self):
-        rois = []
-        self.rois_lod = [[]]
-        for bno in range(self.batch_size):
-            self.rois_lod[0].append(bno + 1)
-            for i in range(bno + 1):
-                x1 = np.random.randint(
-                    0,
-                    self.width // self.spatial_scale - self.transformed_width)
-                y1 = np.random.randint(
-                    0,
-                    self.height // self.spatial_scale - self.transformed_height)
-
-                x2 = np.random.randint(x1 + self.transformed_width,
-                                       self.width // self.spatial_scale)
-                y2 = np.random.randint(
-                    0,
-                    self.height // self.spatial_scale - self.transformed_height)
-
-                x3 = np.random.randint(x1 + self.transformed_width,
-                                       self.width // self.spatial_scale)
-                y3 = np.random.randint(y1 + self.transformed_height,
-                                       self.height // self.spatial_scale)
-
-                x4 = np.random.randint(
-                    0,
-                    self.width // self.spatial_scale - self.transformed_width)
-                y4 = np.random.randint(y1 + self.transformed_height,
-                                       self.height // self.spatial_scale)
-
-                roi = [x1, y1, x2, y2, x3, y3, x4, y4]
-                rois.append(roi)
-        self.rois_num = len(rois)
-        self.rois = np.array(rois).astype("float32")
-
-    def setUp(self):
-        self.op_type = "roi_perspective_transform"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.outputs['Out2InIdx'] = np.zeros(
-            [np.product(self.outputs['Out'].shape), 4]).astype("int32")
-        self.outputs['Out2InWeights'] = np.zeros(
-            [np.product(self.outputs['Out'].shape), 4]).astype("float32")
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
deleted file mode 100644
index ad4cd2e803bfae4c3fbc04503331b9a786b25d17..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ /dev/null
@@ -1,142 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-import sys
-import paddle.compat as cpt
-from op_test import OpTest
-
-
-class TestROIPoolOp(OpTest):
-    def set_data(self):
-        self.init_test_case()
-        self.make_rois()
-        self.calc_roi_pool()
-
-        self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
-
-        self.attrs = {
-            'spatial_scale': self.spatial_scale,
-            'pooled_height': self.pooled_height,
-            'pooled_width': self.pooled_width
-        }
-
-        self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes}
-
-    def init_test_case(self):
-        self.batch_size = 3
-        self.channels = 3
-        self.height = 6
-        self.width = 4
-
-        # n, c, h, w
-        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
-
-        self.spatial_scale = 1.0 / 4.0
-        self.pooled_height = 2
-        self.pooled_width = 2
-
-        self.x = np.random.random(self.x_dim).astype('float32')
-
-    def calc_roi_pool(self):
-        out_data = np.zeros((self.rois_num, self.channels, self.pooled_height,
-                             self.pooled_width))
-        argmax_data = np.zeros((self.rois_num, self.channels,
-                                self.pooled_height, self.pooled_width))
-
-        for i in range(self.rois_num):
-            roi = self.rois[i]
-            roi_batch_id = int(roi[0])
-            roi_start_w = int(cpt.round(roi[1] * self.spatial_scale))
-            roi_start_h = int(cpt.round(roi[2] * self.spatial_scale))
-            roi_end_w = int(cpt.round(roi[3] * self.spatial_scale))
-            roi_end_h = int(cpt.round(roi[4] * self.spatial_scale))
-
-            roi_height = int(max(roi_end_h - roi_start_h + 1, 1))
-            roi_width = int(max(roi_end_w - roi_start_w + 1, 1))
-
-            x_i = self.x[roi_batch_id]
-
-            bin_size_h = float(roi_height) / float(self.pooled_height)
-            bin_size_w = float(roi_width) / float(self.pooled_width)
-
-            for c in range(self.channels):
-                for ph in range(self.pooled_height):
-                    for pw in range(self.pooled_width):
-                        hstart = int(math.floor(ph * bin_size_h))
-                        wstart = int(math.floor(pw * bin_size_w))
-                        hend = int(math.ceil((ph + 1) * bin_size_h))
-                        wend = int(math.ceil((pw + 1) * bin_size_w))
-
-                        hstart = min(max(hstart + roi_start_h, 0), self.height)
-                        hend = min(max(hend + roi_start_h, 0), self.height)
-                        wstart = min(max(wstart + roi_start_w, 0), self.width)
-                        wend = min(max(wend + roi_start_w, 0), self.width)
-
-                        is_empty = (hend <= hstart) or (wend <= wstart)
-                        if is_empty:
-                            out_data[i, c, ph, pw] = 0
-                        else:
-                            out_data[i, c, ph, pw] = -sys.float_info.max
-
-                        argmax_data[i, c, ph, pw] = -1
-
-                        for h in range(hstart, hend):
-                            for w in range(wstart, wend):
-                                if x_i[c, h, w] > out_data[i, c, ph, pw]:
-                                    out_data[i, c, ph, pw] = x_i[c, h, w]
-                                    argmax_data[i, c, ph,
-                                                pw] = h * self.width + w
-
-        self.outs = out_data.astype('float32')
-        self.argmaxes = argmax_data.astype('int64')
-
-    def make_rois(self):
-        rois = []
-        self.rois_lod = [[]]
-        for bno in range(self.batch_size):
-            self.rois_lod[0].append(bno + 1)
-            for i in range(bno + 1):
-                x1 = np.random.random_integers(
-                    0, self.width // self.spatial_scale - self.pooled_width)
-                y1 = np.random.random_integers(
-                    0, self.height // self.spatial_scale - self.pooled_height)
-
-                x2 = np.random.random_integers(x1 + self.pooled_width,
-                                               self.width // self.spatial_scale)
-                y2 = np.random.random_integers(
-                    y1 + self.pooled_height, self.height // self.spatial_scale)
-
-                roi = [bno, x1, y1, x2, y2]
-                rois.append(roi)
-        self.rois_num = len(rois)
-        self.rois = np.array(rois).astype("float32")
-
-    def setUp(self):
-        self.op_type = "roi_pool"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
deleted file mode 100644
index 301d05260e0ae0852f420565edbffc77c51e1b38..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def row_conv_forward(x, lod, wt):
-    out = np.zeros_like(x)
-    num_sequences = len(lod[0])
-    seq_info = [0]
-    for seq_len in lod[0]:
-        seq_info.append(seq_info[-1] + seq_len)
-    context_length = wt.shape[0]
-
-    for i in range(num_sequences):  # loop over number of sequences
-        start = seq_info[i]
-        end = seq_info[i + 1]
-        curinput = x[start:end, :]
-        curoutput = out[start:end, :]
-
-        cur_timesteps = end - start
-        for j in range(cur_timesteps):  # loop over different timesteps
-            for k in range(context_length):
-                if j + k >= cur_timesteps:
-                    continue
-                curoutput[j, :] += curinput[j + k, :] * wt[k, :]
-
-    return out
-
-
-class TestRowConvOp1(OpTest):
-    def setUp(self):
-
-        self.op_type = "row_conv"
-        lod = [[2, 3, 2]]
-        T = sum(lod[0])
-        D = 16
-        context_length = 2
-
-        x = np.random.random((T, D)).astype("float32")
-        wt = np.random.random((context_length, D)).astype("float32")
-        self.inputs = {'X': (x, lod), 'Filter': wt}
-
-        out = row_conv_forward(x, lod, wt)
-        self.outputs = {'Out': (out, lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.05)
-
-    def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Filter'], 'Out', max_relative_error=0.05, no_grad_set=set('X'))
-
-    def test_check_grad_ignore_wt(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Filter'))
-
-
-class TestRowConvOp2(OpTest):
-    def setUp(self):
-
-        self.op_type = "row_conv"
-        lod = [[20, 30, 50]]
-        T = sum(lod[0])
-        D = 35
-        context_length = 35
-
-        x = np.random.random((T, D)).astype("float32")
-        wt = np.random.random((context_length, D)).astype("float32")
-        self.inputs = {'X': (x, lod), 'Filter': wt}
-
-        out = row_conv_forward(x, lod, wt)
-        self.outputs = {'Out': (out, lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    #max_relative_error is increased from 0.05 to 0.06 as for higher
-    #dimensional input, the dX on CPU for some values has max_rel_error
-    #slightly more than 0.05
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.06)
-
-    def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Filter'], 'Out', max_relative_error=0.06, no_grad_set=set('X'))
-
-    def test_check_grad_ignore_wt(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Filter'))
-
-
-def row_conv_foward_Tensor(x, wt):
-    out = np.zeros_like(x)
-    num_sequence = x.shape[0]
-    timesteps = x.shape[1]
-    context_length = wt.shape[0]
-    for i in range(num_sequence):
-        cur_in = x[i:i + 1, :][0]
-        cur_out = out[i:i + 1, :][0]
-        for j in range(timesteps):
-            for k in range(context_length):
-                if j + k >= timesteps:
-                    continue
-                cur_out[j, :] += cur_in[j + k, :] * wt[k, :]
-    return out
-
-
-class TestRowOpWithTensorInput(OpTest):
-    def setUp(self):
-        self.op_type = "row_conv"
-        length = [3, 2, 4]
-        B = 2
-        T = sum(length)
-        D = 16
-        context_length = 2
-
-        x = np.random.random((B, T, D)).astype("float32")
-        wt = np.random.random((context_length, D)).astype("float32")
-        self.inputs = {'X': x, 'Filter': wt}
-
-        out = row_conv_foward_Tensor(x, wt)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Filter'], 'Out', max_relative_error=0.05, no_grad_set=set('X'))
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.05)
-
-    def test_check_grad_ignore_wt(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Filter'))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
deleted file mode 100644
index 3dba961dc9df070f8920629e759acf7de6275ee7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ /dev/null
@@ -1,397 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-from test_anchor_generator_op import anchor_generator_in_python
-from test_generate_proposal_labels_op import _generate_groundtruth
-from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta
-
-
-def rpn_target_assign(anchor_by_gt_overlap,
-                      rpn_batch_size_per_im,
-                      rpn_positive_overlap,
-                      rpn_negative_overlap,
-                      rpn_fg_fraction,
-                      use_random=True):
-    anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
-    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
-        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
-
-    gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
-    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
-        anchor_by_gt_overlap.shape[1])]
-    anchors_with_max_overlap = np.where(
-        anchor_by_gt_overlap == gt_to_anchor_max)[0]
-
-    labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
-    labels[anchors_with_max_overlap] = 1
-    labels[anchor_to_gt_max >= rpn_positive_overlap] = 1
-
-    num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im)
-    fg_inds = np.where(labels == 1)[0]
-    if len(fg_inds) > num_fg and use_random:
-        disable_inds = np.random.choice(
-            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
-    else:
-        disable_inds = fg_inds[num_fg:]
-
-    labels[disable_inds] = -1
-    fg_inds = np.where(labels == 1)[0]
-    bbox_inside_weight = np.zeros((len(fg_inds), 4), dtype=np.float32)
-
-    num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
-    bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
-    if len(bg_inds) > num_bg and use_random:
-        enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
-    else:
-        enable_inds = bg_inds[:num_bg]
-
-    fg_fake_inds = np.array([], np.int32)
-    fg_value = np.array([fg_inds[0]], np.int32)
-    fake_num = 0
-    for bg_id in enable_inds:
-        if bg_id in fg_inds:
-            fake_num += 1
-            fg_fake_inds = np.hstack([fg_fake_inds, fg_value])
-    labels[enable_inds] = 0
-
-    bbox_inside_weight[fake_num:, :] = 1
-    fg_inds = np.where(labels == 1)[0]
-    bg_inds = np.where(labels == 0)[0]
-    loc_index = np.hstack([fg_fake_inds, fg_inds])
-    score_index = np.hstack([fg_inds, bg_inds])
-    labels = labels[score_index]
-    assert not np.any(labels == -1), "Wrong labels with -1"
-
-    gt_inds = anchor_to_gt_argmax[loc_index]
-
-    return loc_index, score_index, labels, gt_inds, bbox_inside_weight
-
-
-def get_anchor(n, c, h, w):
-    input_feat = np.random.random((n, c, h, w)).astype('float32')
-    anchors, _ = anchor_generator_in_python(
-        input_feat=input_feat,
-        anchor_sizes=[32., 64.],
-        aspect_ratios=[0.5, 1.0],
-        variances=[1.0, 1.0, 1.0, 1.0],
-        stride=[16.0, 16.0],
-        offset=0.5)
-    return anchors
-
-
-def rpn_target_assign_in_python(all_anchors,
-                                gt_boxes,
-                                is_crowd,
-                                im_info,
-                                lod,
-                                rpn_straddle_thresh,
-                                rpn_batch_size_per_im,
-                                rpn_positive_overlap,
-                                rpn_negative_overlap,
-                                rpn_fg_fraction,
-                                use_random=True):
-    anchor_num = all_anchors.shape[0]
-    batch_size = len(lod) - 1
-    for i in range(batch_size):
-        im_height = im_info[i][0]
-        im_width = im_info[i][1]
-        im_scale = im_info[i][2]
-        if rpn_straddle_thresh >= 0:
-            # Only keep anchors inside the image by a margin of straddle_thresh
-            inds_inside = np.where(
-                (all_anchors[:, 0] >= -rpn_straddle_thresh) &
-                (all_anchors[:, 1] >= -rpn_straddle_thresh) & (
-                    all_anchors[:, 2] < im_width + rpn_straddle_thresh) & (
-                        all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0]
-            # keep only inside anchors
-            inside_anchors = all_anchors[inds_inside, :]
-        else:
-            inds_inside = np.arange(all_anchors.shape[0])
-            inside_anchors = all_anchors
-
-        b, e = lod[i], lod[i + 1]
-        gt_boxes_slice = gt_boxes[b:e, :] * im_scale
-        is_crowd_slice = is_crowd[b:e]
-
-        not_crowd_inds = np.where(is_crowd_slice == 0)[0]
-        gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
-        iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)
-
-        loc_inds, score_inds, labels, gt_inds, bbox_inside_weight = \
-                         rpn_target_assign(iou, rpn_batch_size_per_im,
-                                           rpn_positive_overlap,
-                                           rpn_negative_overlap,
-                                           rpn_fg_fraction,
-                                           use_random)
-        # unmap to all anchor 
-        loc_inds = inds_inside[loc_inds]
-        score_inds = inds_inside[score_inds]
-
-        sampled_gt = gt_boxes_slice[gt_inds]
-        sampled_anchor = all_anchors[loc_inds]
-        box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.])
-
-        if i == 0:
-            loc_indexes = loc_inds
-            score_indexes = score_inds
-            tgt_labels = labels
-            tgt_bboxes = box_deltas
-            bbox_inside_weights = bbox_inside_weight
-        else:
-            loc_indexes = np.concatenate(
-                [loc_indexes, loc_inds + i * anchor_num])
-            score_indexes = np.concatenate(
-                [score_indexes, score_inds + i * anchor_num])
-            tgt_labels = np.concatenate([tgt_labels, labels])
-            tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
-            bbox_inside_weights = np.vstack([bbox_inside_weights, \
-                                             bbox_inside_weight])
-
-    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels, bbox_inside_weights
-
-
-def retinanet_target_assign(anchor_by_gt_overlap, gt_labels, positive_overlap,
-                            negative_overlap):
-    anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
-    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
-        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
-
-    gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
-    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
-        anchor_by_gt_overlap.shape[1])]
-    anchors_with_max_overlap = np.where(
-        anchor_by_gt_overlap == gt_to_anchor_max)[0]
-
-    labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
-    labels[anchors_with_max_overlap] = 1
-    labels[anchor_to_gt_max >= positive_overlap] = 1
-
-    fg_inds = np.where(labels == 1)[0]
-    bbox_inside_weight = np.zeros((len(fg_inds), 4), dtype=np.float32)
-
-    bg_inds = np.where(anchor_to_gt_max < negative_overlap)[0]
-    enable_inds = bg_inds
-
-    fg_fake_inds = np.array([], np.int32)
-    fg_value = np.array([fg_inds[0]], np.int32)
-    fake_num = 0
-    for bg_id in enable_inds:
-        if bg_id in fg_inds:
-            fake_num += 1
-            fg_fake_inds = np.hstack([fg_fake_inds, fg_value])
-    labels[enable_inds] = 0
-
-    bbox_inside_weight[fake_num:, :] = 1
-    fg_inds = np.where(labels == 1)[0]
-    bg_inds = np.where(labels == 0)[0]
-    loc_index = np.hstack([fg_fake_inds, fg_inds])
-    score_index = np.hstack([fg_inds, bg_inds])
-    score_index_tmp = np.hstack([fg_inds])
-    labels = labels[score_index]
-
-    gt_inds = anchor_to_gt_argmax[loc_index]
-    label_inds = anchor_to_gt_argmax[score_index_tmp]
-    labels[0:len(fg_inds)] = np.squeeze(gt_labels[label_inds])
-    fg_num = len(fg_fake_inds) + len(fg_inds) + 1
-    assert not np.any(labels == -1), "Wrong labels with -1"
-    return loc_index, score_index, labels, gt_inds, bbox_inside_weight, fg_num
-
-
-def retinanet_target_assign_in_python(all_anchors, gt_boxes, gt_labels,
-                                      is_crowd, im_info, lod, positive_overlap,
-                                      negative_overlap):
-    anchor_num = all_anchors.shape[0]
-    batch_size = len(lod) - 1
-    for i in range(batch_size):
-        im_scale = im_info[i][2]
-
-        inds_inside = np.arange(all_anchors.shape[0])
-        inside_anchors = all_anchors
-        b, e = lod[i], lod[i + 1]
-        gt_boxes_slice = gt_boxes[b:e, :] * im_scale
-        gt_labels_slice = gt_labels[b:e, :]
-        is_crowd_slice = is_crowd[b:e]
-
-        not_crowd_inds = np.where(is_crowd_slice == 0)[0]
-        gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
-        gt_labels_slice = gt_labels_slice[not_crowd_inds]
-        iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)
-
-        loc_inds, score_inds, labels, gt_inds, bbox_inside_weight, fg_num = \
-                         retinanet_target_assign(iou, gt_labels_slice,
-                                                positive_overlap, negative_overlap)
-        # unmap to all anchor
-        loc_inds = inds_inside[loc_inds]
-        score_inds = inds_inside[score_inds]
-
-        sampled_gt = gt_boxes_slice[gt_inds]
-        sampled_anchor = all_anchors[loc_inds]
-        box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.])
-
-        if i == 0:
-            loc_indexes = loc_inds
-            score_indexes = score_inds
-            tgt_labels = labels
-            tgt_bboxes = box_deltas
-            bbox_inside_weights = bbox_inside_weight
-            fg_nums = [[fg_num]]
-        else:
-            loc_indexes = np.concatenate(
-                [loc_indexes, loc_inds + i * anchor_num])
-            score_indexes = np.concatenate(
-                [score_indexes, score_inds + i * anchor_num])
-            tgt_labels = np.concatenate([tgt_labels, labels])
-            tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
-            bbox_inside_weights = np.vstack([bbox_inside_weights, \
-                                             bbox_inside_weight])
-            fg_nums = np.concatenate([fg_nums, [[fg_num]]])
-
-    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels, bbox_inside_weights, fg_nums
-
-
-class TestRpnTargetAssignOp(OpTest):
-    def setUp(self):
-        n, c, h, w = 2, 4, 14, 14
-        all_anchors = get_anchor(n, c, h, w)
-        gt_num = 10
-        all_anchors = all_anchors.reshape(-1, 4)
-        anchor_num = all_anchors.shape[0]
-
-        images_shape = [[64, 64], [64, 64]]
-        #images_shape = [[64, 64]]
-        groundtruth, lod = _generate_groundtruth(images_shape, 3, 4)
-        lod = [0, 4, 8]
-        #lod = [0, 4]
-
-        im_info = np.ones((len(images_shape), 3)).astype(np.float32)
-        for i in range(len(images_shape)):
-            im_info[i, 0] = images_shape[i][0]
-            im_info[i, 1] = images_shape[i][1]
-            im_info[i, 2] = 0.8  #scale
-        gt_boxes = np.vstack([v['boxes'] for v in groundtruth])
-        is_crowd = np.hstack([v['is_crowd'] for v in groundtruth])
-
-        all_anchors = all_anchors.astype('float32')
-        gt_boxes = gt_boxes.astype('float32')
-
-        rpn_straddle_thresh = 0.0
-        rpn_batch_size_per_im = 256
-        rpn_positive_overlap = 0.7
-        rpn_negative_overlap = 0.3
-        rpn_fg_fraction = 0.5
-        use_random = False
-
-        loc_index, score_index, tgt_bbox, labels, bbox_inside_weights = \
-            rpn_target_assign_in_python(all_anchors, gt_boxes, is_crowd,
-                                   im_info, lod, rpn_straddle_thresh,
-                                   rpn_batch_size_per_im, rpn_positive_overlap,
-                                   rpn_negative_overlap,
-                                   rpn_fg_fraction, use_random)
-        labels = labels[:, np.newaxis]
-
-        self.op_type = "rpn_target_assign"
-        self.inputs = {
-            'Anchor': all_anchors,
-            'GtBoxes': (gt_boxes, [[4, 4]]),
-            'IsCrowd': (is_crowd, [[4, 4]]),
-            'ImInfo': (im_info, [[1, 1]])
-        }
-        self.attrs = {
-            'rpn_batch_size_per_im': rpn_batch_size_per_im,
-            'rpn_straddle_thresh': rpn_straddle_thresh,
-            'rpn_positive_overlap': rpn_positive_overlap,
-            'rpn_negative_overlap': rpn_negative_overlap,
-            'rpn_fg_fraction': rpn_fg_fraction,
-            'use_random': use_random
-        }
-        self.outputs = {
-            'LocationIndex': loc_index.astype('int32'),
-            'ScoreIndex': score_index.astype('int32'),
-            'TargetBBox': tgt_bbox.astype('float32'),
-            'TargetLabel': labels.astype('int32'),
-            'BBoxInsideWeight': bbox_inside_weights.astype('float32')
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestRetinanetTargetAssignOp(OpTest):
-    def setUp(self):
-        n, c, h, w = 2, 4, 14, 14
-        all_anchors = get_anchor(n, c, h, w)
-        gt_num = 10
-        all_anchors = all_anchors.reshape(-1, 4)
-        anchor_num = all_anchors.shape[0]
-
-        images_shape = [[64, 64], [64, 64]]
-        groundtruth, lod = _generate_groundtruth(images_shape, 3, 4)
-        lod = [0, 4, 8]
-
-        im_info = np.ones((len(images_shape), 3)).astype(np.float32)
-        for i in range(len(images_shape)):
-            im_info[i, 0] = images_shape[i][0]
-            im_info[i, 1] = images_shape[i][1]
-            im_info[i, 2] = 0.8  #scale
-        gt_boxes = np.vstack([v['boxes'] for v in groundtruth])
-        is_crowd = np.hstack([v['is_crowd'] for v in groundtruth])
-        gt_labels = np.vstack([
-            v['gt_classes'].reshape(len(v['gt_classes']), 1)
-            for v in groundtruth
-        ])
-        gt_labels = gt_labels.reshape(len(gt_labels), 1)
-        all_anchors = all_anchors.astype('float32')
-        gt_boxes = gt_boxes.astype('float32')
-        gt_labels = gt_labels.astype('int32')
-
-        positive_overlap = 0.5
-        negative_overlap = 0.4
-
-        loc_index, score_index, tgt_bbox, labels, bbox_inside_weights, fg_num = \
-            retinanet_target_assign_in_python(all_anchors, gt_boxes, gt_labels, is_crowd,
-                                   im_info, lod, positive_overlap, negative_overlap)
-        labels = labels[:, np.newaxis]
-        self.op_type = "retinanet_target_assign"
-        self.inputs = {
-            'Anchor': all_anchors,
-            'GtBoxes': (gt_boxes, [[4, 4]]),
-            'GtLabels': (gt_labels, [[4, 4]]),
-            'IsCrowd': (is_crowd, [[4, 4]]),
-            'ImInfo': (im_info, [[1, 1]])
-        }
-        self.attrs = {
-            'positive_overlap': positive_overlap,
-            'negative_overlap': negative_overlap
-        }
-        self.outputs = {
-            'LocationIndex': loc_index.astype('int32'),
-            'ScoreIndex': score_index.astype('int32'),
-            'TargetBBox': tgt_bbox.astype('float32'),
-            'TargetLabel': labels.astype('int32'),
-            'BBoxInsideWeight': bbox_inside_weights.astype('float32'),
-            'ForegroundNumber': fg_num.astype('int32')
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py b/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py
deleted file mode 100644
index 9a002a31d14ea54188e8a52df9143dd2c1bcc604..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-
-
-class TestRunTimeException(OpTest):
-    def test_run_time_exception(self):
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_program):
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            fluid.layers.one_hot(input=label, depth=100)
-
-        def _run_program():
-            x = np.random.random(size=(10)).astype('int64')
-            exe.run(train_program, feed={"label": x})
-
-        self.assertRaises(core.EnforceNotMet, _run_program)
-
-
-class TestCompileTimeException(OpTest):
-    def test_compile_time_exception(self):
-        self.assertRaises(core.EnforceNotMet, self.build_model)
-
-    def build_model(self):
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_program):
-            label = fluid.layers.data(
-                name="label", shape=[1], dtype="int64", append_batch_size=False)
-            fluid.layers.one_hot(input=label, depth=100)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
deleted file mode 100644
index 0c784d3e49d85f0b5750c5e6d7307be754b43ab2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid.op import Operator
-
-
-class TestSamplingIdOp(OpTest):
-    def setUp(self):
-        self.op_type = "sampling_id"
-        self.use_mkldnn = False
-        self.init_kernel_type()
-        self.X = np.random.random((100, 10)).astype('float32')
-        self.inputs = {"X": self.X}
-        self.Y = np.random.random(100).astype('int64')
-        self.outputs = {'Out': self.Y}
-        self.attrs = {'max': 1.0, 'min': 0.0, 'seed': 1}
-
-    def test_check_output(self):
-        self.check_output_customized(self.verify_output)
-        y1 = self.out
-        self.check_output_customized(self.verify_output)
-        y2 = self.out
-
-        # check dtype
-        assert y1.dtype == np.int64
-        assert y2.dtype == np.int64
-
-        # check output is index ids of inputs
-        inputs_ids = np.arange(self.X.shape[1])
-        assert np.isin(y1, inputs_ids).all()
-        assert np.isin(y2, inputs_ids).all()
-
-        self.assertTrue(np.array_equal(y1, y2))
-        self.assertEqual(len(y1), len(self.Y))
-
-    def verify_output(self, outs):
-        out = np.array(outs[0])
-        self.out = out
-
-    def init_kernel_type(self):
-        pass
-
-
-class TestSamplingIdShape(unittest.TestCase):
-    def test_shape(self):
-        x = fluid.layers.data(name='x', shape=[3], dtype='float32')
-        output = fluid.layers.sampling_id(x)
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place=place)
-        exe.run(fluid.default_startup_program())
-
-        feed = {
-            'x': np.array(
-                [[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32')
-        }
-        output_np = exe.run(feed=feed, fetch_list=[output])[0]
-
-        self.assertEqual(output.shape[0], -1)
-        self.assertEqual(len(output.shape), 1)
-        self.assertEqual(output_np.shape[0], 2)
-        self.assertEqual(len(output_np.shape), 1)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
deleted file mode 100644
index b74a6e10917f71b669a2d2e6906d6c7310c59c7e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import warnings
-import unittest
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.layers.device import get_places
-from paddle.fluid.executor import as_numpy
-
-
-class TestSaveModelWithoutVar(unittest.TestCase):
-    def test_no_var_save(self):
-        data = fluid.layers.data(
-            name='data',
-            shape=[-1, 1],
-            dtype='float32',
-            append_batch_size=False)
-        data_plus = data + 1
-
-        if fluid.core.is_compiled_with_cuda():
-            place = fluid.core.CUDAPlace(0)
-        else:
-            place = fluid.core.CPUPlace()
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-
-            fluid.io.save_inference_model(
-                dirname='test',
-                feeded_var_names=['data'],
-                target_vars=[data_plus],
-                executor=exe,
-                model_filename='model',
-                params_filename='params')
-            expected_warn = "no variable in your model, please ensure there are any variables in your model to save"
-            self.assertTrue(len(w) > 0)
-            self.assertTrue(expected_warn == str(w[0].message))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py
deleted file mode 100644
index 9893c92ad68f4d460c4bb428bb44a30df25fd6e0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ /dev/null
@@ -1,140 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestScaleOp(OpTest):
-    def setUp(self):
-        self.op_type = "scale"
-        self.dtype = np.float32
-        self.init_dtype_type()
-        self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)}
-        self.attrs = {'scale': -2.3}
-        self.outputs = {
-            'Out': self.inputs['X'] * self.dtype(self.attrs['scale'])
-        }
-
-    def init_dtype_type(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestScaleOpSelectedRows(unittest.TestCase):
-    def init_dtype_type(self):
-        pass
-
-    def check_with_place(self, place, in_name, out_name):
-        scope = core.Scope()
-
-        self.dtype = np.float32
-        self.init_dtype_type()
-
-        # create and initialize Grad Variable
-        in_height = 10
-        in_rows = [0, 4, 7]
-        in_row_numel = 12
-        scale = 2.0
-
-        in_selected_rows = scope.var(in_name).get_selected_rows()
-        in_selected_rows.set_height(in_height)
-        in_selected_rows.set_rows(in_rows)
-        in_array = np.random.random(
-            (len(in_rows), in_row_numel)).astype(self.dtype)
-
-        in_tensor = in_selected_rows.get_tensor()
-        in_tensor.set(in_array, place)
-
-        # create and initialize Param Variable
-        out_selected_rows = scope.var(out_name).get_selected_rows()
-        out_tensor = out_selected_rows.get_tensor()
-        out_tensor._set_dims(in_tensor._get_dims())
-
-        # create and run sgd operator
-        scale_op = Operator("scale", X=in_name, Out=out_name, scale=scale)
-        scale_op.run(scope, place)
-
-        # get and compare result
-        out_height = out_selected_rows.height()
-        out_rows = out_selected_rows.rows()
-        result_array = np.array(out_tensor)
-
-        assert (in_array * scale == result_array).all()
-        assert in_height == out_height
-        assert in_rows == out_rows
-
-    def test_scale_selected_rows(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place, 'in', 'out')
-
-    def test_scale_selected_rows_inplace(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place, 'in', 'in')
-
-
-# Add FP16 test
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestScaleFp16Op(TestScaleOp):
-    def init_dtype_type(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_output_with_place(place, atol=0.002)
-
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ["X"], "Out", max_relative_error=0.05)
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows):
-    def init_dtype_type(self):
-        self.dtype = np.float16
-
-    def test_scale_selected_rows(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_with_place(place, 'in', 'out')
-
-    def test_scale_selected_rows_inplace(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_with_place(place, 'in', 'in')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py
deleted file mode 100644
index dec9bfa43005cc11af5ff58e10d171ec69125bd9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py
+++ /dev/null
@@ -1,291 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid as fluid
-
-
-def numpy_scatter_nd(ref, index, updates, fun):
-    ref_shape = ref.shape
-    index_shape = index.shape
-
-    end_size = index_shape[-1]
-    remain_numl = 1
-    for i in range(len(index_shape) - 1):
-        remain_numl *= index_shape[i]
-
-    slice_size = 1
-    for i in range(end_size, len(ref_shape)):
-        slice_size *= ref_shape[i]
-
-    flat_index = index.reshape([remain_numl] + list(index_shape[-1:]))
-    flat_updates = updates.reshape((remain_numl, slice_size))
-    flat_output = ref.reshape(list(ref_shape[:end_size]) + [slice_size])
-
-    for i_up, i_out in enumerate(flat_index):
-        i_out = tuple(i_out)
-        flat_output[i_out] = fun(flat_output[i_out], flat_updates[i_up])
-    return flat_output.reshape(ref.shape)
-
-
-def numpy_scatter_nd_add(ref, index, updates):
-    return numpy_scatter_nd(ref, index, updates, lambda x, y: x + y)
-
-
-def judge_update_shape(ref, index):
-    ref_shape = ref.shape
-    index_shape = index.shape
-    update_shape = []
-    for i in range(len(index_shape) - 1):
-        update_shape.append(index_shape[i])
-    for i in range(index_shape[-1], len(ref_shape), 1):
-        update_shape.append(ref_shape[i])
-    return update_shape
-
-
-class TestScatterNdAddSimpleOp(OpTest):
-    """
-    A simple example
-    """
-
-    def setUp(self):
-        self.op_type = "scatter_nd_add"
-        ref_np = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8]).astype("float32")
-        index_np = np.array([[1], [2], [3], [5], [1]]).astype("int32")
-        updates_np = np.array([9, 10, 11, 12, 13]).astype("float32")
-        expect_np = numpy_scatter_nd_add(ref_np.copy(), index_np, updates_np)
-        #expect_np = [ 0. 23. 12. 14.  4. 17.  6.  7.  8.] 
-
-        self.inputs = {'X': ref_np, 'Index': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': expect_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Updates'], 'Out', in_place=True)
-
-
-class TestScatterNdAddWithEmptyIndex(OpTest):
-    """
-    Index has empty element
-    """
-
-    def setUp(self):
-        self.op_type = "scatter_nd_add"
-        ref_np = np.array([[65, 17], [-14, -25]]).astype("float32")
-        index_np = np.array([[], []]).astype("int32")
-        updates_np = np.array([[[-1, -2], [1, 2]],
-                               [[3, 4], [-3, -4]]]).astype("float32")
-
-        expect_np = numpy_scatter_nd_add(ref_np.copy(), index_np, updates_np)
-        #expect_np = [[67, 19], [-16, -27]]
-
-        self.inputs = {'X': ref_np, 'Index': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': expect_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', in_place=True)
-
-
-class TestScatterNdAddWithHighRankSame(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) = Rank(X)
-    """
-
-    def setUp(self):
-        self.op_type = "scatter_nd_add"
-        shape = (10, 9, 8, 1, 15)
-        ref_np = np.random.rand(*shape).astype("float32")
-        index_np = np.vstack(
-            [np.random.randint(
-                0, s, size=150) for s in shape]).T.astype("int32")
-        update_shape = judge_update_shape(ref_np, index_np)
-        updates_np = np.random.rand(*update_shape).astype("float32")
-        expect_np = numpy_scatter_nd_add(ref_np.copy(), index_np, updates_np)
-
-        self.inputs = {'X': ref_np, 'Index': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': expect_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Updates'], 'Out', in_place=True)
-
-
-class TestScatterNdAddWithHighRankDiff(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) < Rank(X)
-    """
-
-    def setUp(self):
-        self.op_type = "scatter_nd_add"
-        shape = (10, 9, 8, 1, 15)
-        ref_np = np.random.rand(*shape).astype("double")
-        index = np.vstack([np.random.randint(0, s, size=500) for s in shape]).T
-        index_np = index.reshape([10, 5, 10, 5]).astype("int64")
-        update_shape = judge_update_shape(ref_np, index_np)
-        updates_np = np.random.rand(*update_shape).astype("double")
-        expect_np = numpy_scatter_nd_add(ref_np.copy(), index_np, updates_np)
-
-        self.inputs = {'X': ref_np, 'Index': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': expect_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Updates'], 'Out', in_place=True)
-
-
-#Test Python API
-class TestScatterNdOpAPI(OpTest):
-    """
-    test scatter_nd_add api and scatter_nd api
-    """
-
-    def testcase1(self):
-        ref1 = fluid.layers.data(
-            name='ref1',
-            shape=[10, 9, 8, 1, 3],
-            dtype='float32',
-            append_batch_size=False)
-        index1 = fluid.layers.data(
-            name='index1',
-            shape=[5, 5, 8, 5],
-            dtype='int32',
-            append_batch_size=False)
-        updates1 = fluid.layers.data(
-            name='update1',
-            shape=[5, 5, 8],
-            dtype='float32',
-            append_batch_size=False)
-        output1 = fluid.layers.scatter_nd_add(ref1, index1, updates1)
-
-    def testcase2(self):
-        ref2 = fluid.layers.data(
-            name='ref2',
-            shape=[10, 9, 8, 1, 3],
-            dtype='double',
-            append_batch_size=False)
-        index2 = fluid.layers.data(
-            name='index2',
-            shape=[5, 8, 5],
-            dtype='int32',
-            append_batch_size=False)
-        updates2 = fluid.layers.data(
-            name='update2',
-            shape=[5, 8],
-            dtype='double',
-            append_batch_size=False)
-        output2 = fluid.layers.scatter_nd_add(
-            ref2, index2, updates2, name="scatter_nd_add")
-
-    def testcase3(self):
-        shape3 = [10, 9, 8, 1, 3]
-        index3 = fluid.layers.data(
-            name='index3',
-            shape=[5, 5, 8, 5],
-            dtype='int32',
-            append_batch_size=False)
-        updates3 = fluid.layers.data(
-            name='update3',
-            shape=[5, 5, 8],
-            dtype='float32',
-            append_batch_size=False)
-        output3 = fluid.layers.scatter_nd(index3, updates3, shape3)
-
-    def testcase4(self):
-        shape4 = [10, 9, 8, 1, 3]
-        index4 = fluid.layers.data(
-            name='index4',
-            shape=[5, 5, 8, 5],
-            dtype='int32',
-            append_batch_size=False)
-        updates4 = fluid.layers.data(
-            name='update4',
-            shape=[5, 5, 8],
-            dtype='double',
-            append_batch_size=False)
-        output4 = fluid.layers.scatter_nd(
-            index4, updates4, shape4, name='scatter_nd')
-
-
-#Test Raise Error
-class TestScatterNdOpRaise(OpTest):
-    def test_check_raise(self):
-        def check_raise_is_test():
-            try:
-                ref5 = fluid.layers.data(
-                    name='ref5', shape=[3, 4, 5], dtype='float32')
-                index5 = fluid.layers.data(
-                    name='index5', shape=[2, 10], dtype='int32')
-                updates5 = fluid.layers.data(
-                    name='updates5', shape=[2, 10], dtype='float32')
-                output5 = fluid.layers.scatter_nd_add(ref5, index5, updates5)
-            except Exception as e:
-                t = \
-                "Input(Index).shape[-1] should be no greater than Input(X).rank"
-                if t in str(e):
-                    raise IndexError
-
-        self.assertRaises(IndexError, check_raise_is_test)
-
-    def test_check_raise2(self):
-        with self.assertRaises(ValueError):
-            ref6 = fluid.layers.data(
-                name='ref6',
-                shape=[10, 9, 8, 1, 3],
-                dtype='double',
-                append_batch_size=False)
-            index6 = fluid.layers.data(
-                name='index6',
-                shape=[5, 8, 5],
-                dtype='int32',
-                append_batch_size=False)
-            updates6 = fluid.layers.data(
-                name='update6',
-                shape=[5, 8],
-                dtype='float32',
-                append_batch_size=False)
-            output6 = fluid.layers.scatter_nd_add(ref6, index6, updates6)
-
-    def test_check_raise3(self):
-        def check_raise_is_test():
-            try:
-                shape = [3, 4, 5]
-                index7 = fluid.layers.data(
-                    name='index7', shape=[2, 1], dtype='int32')
-                updates7 = fluid.layers.data(
-                    name='updates7', shape=[2, 4, 5, 20], dtype='float32')
-                output7 = fluid.layers.scatter_nd(index7, updates7, shape)
-            except Exception as e:
-                t = \
-                "Updates has wrong shape"
-                if t in str(e):
-                    raise ValueError
-
-        self.assertRaises(ValueError, check_raise_is_test)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py
deleted file mode 100644
index 999b7ea88bad6345bddad4cec92d510facd142dc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ /dev/null
@@ -1,177 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-
-
-class TestScatterOp(OpTest):
-    def setUp(self):
-        self.op_type = "scatter"
-        ref_np = np.ones((3, 3)).astype("float32")
-        index_np = np.array([1, 2]).astype("int32")
-        updates_np = np.random.random((2, 3)).astype("float32")
-        output_np = np.copy(ref_np)
-        output_np[index_np] = updates_np
-        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Updates'], 'Out', in_place=True)
-
-
-class TestScatterOp0(OpTest):
-    def setUp(self):
-        self.op_type = "scatter"
-        ref_np = np.ones((3, 3)).astype("float32")
-        index_np = np.array([1, 2]).astype("int32")
-        updates_np = np.random.random((2, 3)).astype("float32")
-        output_np = np.copy(ref_np)
-        output_np[index_np] = updates_np
-        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
-        self.attrs = {'overwrite': True}
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Updates'], 'Out', in_place=True)
-
-
-class TestScatterOp1(OpTest):
-    def setUp(self):
-        self.op_type = "scatter"
-        ref_np = np.ones((3, 3)).astype("float32")
-        zeros_np = np.zeros([2, 3]).astype('float32')
-        index_np = np.array([1, 1]).astype("int32")
-        updates_np = np.random.random((2, 3)).astype("float32")
-        output_np = np.copy(ref_np)
-        output_np[index_np] = zeros_np
-        for i in range(0, len(index_np)):
-            output_np[index_np[i]] += updates_np[i]
-        self.attrs = {'overwrite': False}
-        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Updates'], 'Out', in_place=True)
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestScatterOp2(OpTest):
-    def setUp(self):
-        self.op_type = "scatter"
-        ref_np = np.ones((3, 3)).astype("float32")
-        index_np = np.array([1, 2]).astype("int32")
-        updates_np = np.random.random((2, 3)).astype("float32")
-        output_np = np.copy(ref_np)
-        output_np[index_np] = updates_np
-        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
-
-    def test_check_grad(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(place, ['Updates'], 'Out', in_place=True)
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestScatterOp3(OpTest):
-    def setUp(self):
-        self.op_type = "scatter"
-        ref_np = np.ones((3, 3)).astype("float32")
-        zeros_np = np.zeros([2, 3]).astype('float32')
-        index_np = np.array([1, 1]).astype("int32")
-        updates_np = np.random.random((2, 3)).astype("float32")
-        output_np = np.copy(ref_np)
-        output_np[index_np] = zeros_np
-        for i in range(0, len(index_np)):
-            output_np[index_np[i]] += updates_np[i]
-        self.attrs = {'overwrite': False}
-        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
-
-    def test_check_grad(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(place, ['Updates'], 'Out', in_place=True)
-
-
-class TestScatterOp4(OpTest):
-    def setUp(self):
-        self.op_type = "scatter"
-        ref_np = np.ones((3, 3)).astype("float32")
-        index_np = np.array([1, 2]).astype("int64")
-        updates_np = np.random.random((2, 3)).astype("float32")
-        output_np = np.copy(ref_np)
-        output_np[index_np] = updates_np
-        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Updates'], 'Out', in_place=True)
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestScatterOp5(OpTest):
-    def setUp(self):
-        self.op_type = "scatter"
-        ref_np = np.ones((3, 3)).astype("float32")
-        index_np = np.array([1, 2]).astype("int64")
-        updates_np = np.random.random((2, 3)).astype("float32")
-        output_np = np.copy(ref_np)
-        output_np[index_np] = updates_np
-        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
-
-    def test_check_grad(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(place, ['Updates'], 'Out', in_place=True)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_scope.py b/python/paddle/fluid/tests/unittests/test_scope.py
deleted file mode 100644
index 45fcbfba6eb7c6fc4e75f6d8228d721c0186ef36..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_scope.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid.core
-import unittest
-
-
-class TestScope(unittest.TestCase):
-    def test_create_destroy(self):
-        paddle_c = paddle.fluid.core
-        scope = paddle_c.Scope()
-        self.assertIsNotNone(scope)
-        scope_with_parent = scope.new_scope()
-        self.assertIsNotNone(scope_with_parent)
-
-    def test_none_variable(self):
-        paddle_c = paddle.fluid.core
-        scope = paddle_c.Scope()
-        self.assertIsNone(scope.find_var("test"))
-
-    def test_create_var_get_var(self):
-        paddle_c = paddle.fluid.core
-        scope = paddle_c.Scope()
-        var_a = scope.var("var_a")
-        self.assertIsNotNone(var_a)
-        self.assertIsNotNone(scope.find_var('var_a'))
-        scope2 = scope.new_scope()
-        self.assertIsNotNone(scope2.find_var('var_a'))
-
-    def test_var_get_int(self):
-        paddle_c = paddle.fluid.core
-        scope = paddle_c.Scope()
-        var = scope.var("test_int")
-        var.set_int(10)
-        self.assertTrue(var.is_int())
-        self.assertEqual(10, var.get_int())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_selected_rows.py b/python/paddle/fluid/tests/unittests/test_selected_rows.py
deleted file mode 100644
index 2f34f79b8eafad8e7fdf6b359548747f354b141f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_selected_rows.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid.core as core
-import unittest
-import numpy as np
-
-
-class TestSelectedRows(unittest.TestCase):
-    def test_selected_rows(self):
-        place = core.CPUPlace()
-        height = 10
-        rows = [0, 4, 7]
-        row_numel = 12
-        selected_rows = core.SelectedRows(rows, height)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
-        np_array[2, 8] = 4.0
-        tensor = selected_rows.get_tensor()
-        tensor.set(np_array, place)
-
-        # compare rows
-        self.assertEqual(0, selected_rows.rows()[0])
-        self.assertEqual(4, selected_rows.rows()[1])
-        self.assertEqual(7, selected_rows.rows()[2])
-
-        # compare height
-        self.assertEqual(10, selected_rows.height())
-
-        # compare tensor
-        self.assertAlmostEqual(2.0,
-                               selected_rows.get_tensor()._get_float_element(0))
-        self.assertAlmostEqual(1.0,
-                               selected_rows.get_tensor()._get_float_element(1))
-        self.assertAlmostEqual(
-            4.0,
-            selected_rows.get_tensor()._get_float_element(2 * row_numel + 8))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py
deleted file mode 100644
index bcba0511da747990b1e99026048c7ce95140a422..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_selu_op.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import six
-from op_test import OpTest
-
-
-class SeluTest(OpTest):
-    def setUp(self):
-        self.op_type = "selu"
-        self.x_shape = [3, 5, 5, 10]
-        self.dtype = np.float32
-        self.init_x_shape()
-        self.init_dtype()
-
-        alpha = 1.6732632423543772848170429916717
-        scale = 1.0507009873554804934193349852946
-
-        x = np.random.normal(size=self.x_shape).astype(self.dtype)
-
-        # Since zero point in selu is not differentiable, avoid randomize
-        # zero.
-        x[np.abs(x) < 0.005] = 0.02
-
-        x_flat = x.flatten()
-
-        for i in range(x_flat.size):
-            if x_flat[i] < 0:
-                x_flat[i] = alpha * np.exp(x_flat[i]) - alpha
-            x_flat[i] = scale * x_flat[i]
-
-        out_np = x_flat.reshape(self.x_shape)
-
-        self.inputs = {'X': x}
-        self.outputs = {'Out': out_np}
-
-        self.attrs = {
-            'alpha': alpha,
-            'scale': scale,
-        }
-
-    def init_x_shape(self):
-        pass
-
-    def init_dtype(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py
deleted file mode 100644
index 9f0115034d9e29dbbc47b4cafd8500959a58f8af..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ /dev/null
@@ -1,261 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import random
-from op_test import OpTest
-
-
-def seqconv(x,
-            lod,
-            filter,
-            context_length,
-            context_start,
-            padding_trainable=False,
-            padding_data=None):
-    [T, M] = x.shape
-    col = np.zeros((T, context_length * M)).astype('float32')
-    offset = [0]
-    for seq_len in lod[0]:
-        offset.append(offset[-1] + seq_len)
-    begin_pad = np.max([0, -context_start])
-    for i in range(len(offset) - 1):
-        for j in range(context_length):
-            in_begin = offset[i] + context_start + j
-            in_end = offset[i + 1] + context_start + j
-            out_begin = offset[i]
-            out_end = offset[i + 1]
-            if in_begin < offset[i]:
-                pad_size = np.min(
-                    [offset[i] - in_begin, offset[i + 1] - offset[i]])
-                if padding_trainable:
-                    sub_w = padding_data[j:j + pad_size, :]
-                    col[offset[i]:offset[i] + pad_size, j * M:(j + 1) *
-                        M] = sub_w
-                out_begin = offset[i] + pad_size
-                in_begin = offset[i]
-
-            if in_end > offset[i + 1]:
-                pad_size = np.min(
-                    [in_end - offset[i + 1], offset[i + 1] - offset[i]])
-                if padding_trainable:
-                    sub_w = padding_data[begin_pad + context_start + j -
-                                         pad_size:begin_pad + context_start +
-                                         j, :]
-                    col[offset[i + 1] - pad_size:offset[i + 1], j * M:(j + 1) *
-                        M] = sub_w
-                in_end = offset[i + 1]
-                out_end = offset[i + 1] - pad_size
-            if in_end <= in_begin:
-                continue
-            in_sub = x[in_begin:in_end, :]
-            col[out_begin:out_end, j * M:(j + 1) * M] += in_sub
-    return np.dot(col, filter)
-
-
-class TestSeqProject(OpTest):
-    def setUp(self):
-        self.init_test_case()
-        self.op_type = 'sequence_conv'
-
-        if self.context_length == 1 \
-                and self.context_start == 0 \
-                and self.padding_trainable:
-            print("If context_start is 0 " \
-                  "and context_length is 1," \
-                  " padding_trainable should be false.")
-            return
-
-        # one level, batch size
-        x = np.random.uniform(0.1, 1, [self.input_size[0],
-                                       self.input_size[1]]).astype('float32')
-        w = np.random.uniform(0.1, 1, [
-            self.context_length * self.input_size[1], self.output_represention
-        ]).astype('float32')
-
-        begin_pad = np.max([0, -self.context_start])
-        end_pad = np.max([0, self.context_start + self.context_length - 1])
-        total_pad = begin_pad + end_pad
-        padding_data = np.random.uniform(
-            0.1, 1, [total_pad, self.input_size[1]]).astype('float32')
-        self.pad_data = padding_data
-        self.inputs = {
-            'X': (x, self.lod),
-            'Filter': w,
-        }
-        self.inputs_val = ['X', 'Filter']
-        self.inputs_val_no_x = ['Filter']
-        self.inputs_val_no_f = ['X']
-
-        if total_pad != 0:
-            self.inputs['PaddingData'] = padding_data
-            self.inputs_val = ['X', 'PaddingData', 'Filter']
-            self.inputs_val_no_x = ['PaddingData', 'Filter']
-            self.inputs_val_no_f = ['PaddingData', 'X']
-
-        self.attrs = {
-            'contextStart': self.context_start,
-            'contextLength': self.context_length,
-            'paddingTrainable': self.padding_trainable,
-            'contextStride': self.context_stride
-        }
-        out = seqconv(x, self.lod, w, self.context_length, self.context_start,
-                      self.padding_trainable, self.pad_data)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        if self.padding_trainable:
-            self.check_grad(
-                set(self.inputs_val), 'Out', max_relative_error=0.05)
-
-    def test_check_grad_input(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.05,
-            no_grad_set=set(self.inputs_val_no_x))
-
-    def test_check_grad_padding_data(self):
-        if self.padding_trainable:
-            self.check_grad(
-                ['PaddingData'],
-                'Out',
-                max_relative_error=0.05,
-                no_grad_set=set(['X', 'Filter']))
-
-    def test_check_grad_Filter(self):
-        self.check_grad(
-            ['Filter'],
-            'Out',
-            max_relative_error=0.05,
-            no_grad_set=set(self.inputs_val_no_f))
-
-    def test_check_grad_input_filter(self):
-        if self.padding_trainable:
-            self.check_grad(
-                ['X', 'Filter'],
-                'Out',
-                max_relative_error=0.05,
-                no_grad_set=set(['PaddingData']))
-
-    def test_check_grad_padding_input(self):
-        if self.padding_trainable:
-            self.check_grad(
-                self.inputs_val_no_f,
-                'Out',
-                max_relative_error=0.05,
-                no_grad_set=set(['Filter']))
-
-    def test_check_grad_padding_filter(self):
-        if self.padding_trainable:
-            self.check_grad(
-                self.inputs_val_no_x,
-                'Out',
-                max_relative_error=0.05,
-                no_grad_set=set(['X']))
-
-    def init_test_case(self):
-        self.input_row = 11
-        self.context_start = 0
-        self.context_length = 1
-        self.padding_trainable = False
-        self.context_stride = 1
-
-        self.input_size = [self.input_row, 23]
-        offset_lod = [[0, 4, 5, 8, self.input_row]]
-        self.lod = [[]]
-        # convert from offset-based lod to length-based lod
-        for i in range(len(offset_lod[0]) - 1):
-            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
-        self.output_represention = 8  # output feature size
-
-
-class TestSeqProjectCase1(TestSeqProject):
-    def init_test_case(self):
-        self.input_row = 11
-        self.context_start = -1
-        self.context_length = 3
-        self.padding_trainable = True
-        self.context_stride = 1
-
-        self.input_size = [self.input_row, 23]
-        offset_lod = [[0, 4, 5, 8, self.input_row]]
-        self.lod = [[]]
-        # convert from offset-based lod to length-based lod
-        for i in range(len(offset_lod[0]) - 1):
-            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
-        self.output_represention = 8  # output feature size
-
-
-class TestSeqProjectCase2Len0(TestSeqProject):
-    def init_test_case(self):
-        self.input_row = 11
-        self.context_start = -1
-        self.context_length = 3
-        self.padding_trainable = True
-        self.context_stride = 1
-
-        self.input_size = [self.input_row, 23]
-        offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]]
-        self.lod = [[]]
-        # convert from offset-based lod to length-based lod
-        for i in range(len(offset_lod[0]) - 1):
-            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
-        self.output_represention = 8  # output feature size
-
-
-class TestSeqProjectCase3(TestSeqProject):
-    def init_test_case(self):
-        self.input_row = 25
-        self.context_start = 2
-        self.context_length = 3
-        self.padding_trainable = True
-        self.context_stride = 1
-
-        self.input_size = [self.input_row, 23]
-        idx = list(range(self.input_size[0]))
-        del idx[0]
-        offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
-                      [self.input_size[0]]]
-        self.lod = [[]]
-        # convert from offset-based lod to length-based lod
-        for i in range(len(offset_lod[0]) - 1):
-            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
-        self.output_represention = 8  # output feature size
-
-
-class TestSeqConvApi(unittest.TestCase):
-    def test_api(self):
-        import paddle.fluid as fluid
-
-        x = fluid.layers.data('x', shape=[32], lod_level=1)
-        y = fluid.layers.sequence_conv(
-            input=x, num_filters=2, filter_size=3, padding_start=None)
-
-        place = fluid.CPUPlace()
-        x_tensor = fluid.create_lod_tensor(
-            np.random.rand(10, 32).astype("float32"), [[2, 3, 1, 4]], place)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x_tensor}, fetch_list=[y], return_numpy=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py
deleted file mode 100644
index 2de5d0345912ace44858de1be52dece846ef879a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ /dev/null
@@ -1,432 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_reorder_lod_tensor import convert_to_offset
-
-
-def compute_seqpool_sum(x, offset, out, pad_value=0.0):
-    level = len(offset) - 1
-    for i in range(len(offset[level]) - 1):
-        if offset[level][i] == offset[level][i + 1]:
-            out[i] = pad_value
-        else:
-            sub_x = x[offset[level][i]:offset[level][i + 1], :]
-            out[i] = sub_x.sum(axis=0)
-
-
-def compute_seqpool_avg(x, offset, out, pad_value=0.0):
-    level = len(offset) - 1
-    for i in range(len(offset[level]) - 1):
-        if offset[level][i] == offset[level][i + 1]:
-            out[i] = pad_value
-        else:
-            sub_x = x[offset[level][i]:offset[level][i + 1], :]
-            out[i] = sub_x.mean(axis=0)
-
-
-def compute_seqpool_sqrt(x, offset, out, pad_value=0.0):
-    level = len(offset) - 1
-    for i in range(len(offset[level]) - 1):
-        if offset[level][i] == offset[level][i + 1]:
-            out[i] = pad_value
-        else:
-            sub_x = x[offset[level][i]:offset[level][i + 1], :]
-            seq_len = offset[level][i + 1] - offset[level][i]
-            out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
-
-
-class TestSeqAvgPool(OpTest):
-    def set_lod(self):
-        return [[11]]
-
-    def set_data(self):
-        self.op_type = 'sequence_pool'
-        x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
-        lod = self.set_lod()
-        level = len(lod) - 1
-        self.inputs = {'X': (x, lod)}
-        offset = convert_to_offset(lod)
-        out = np.zeros((len(lod[level]), 23)).astype('float32')
-        self.outputs = {'Out': out}
-        return x, offset, out
-
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.0, 'pooltype': "AVERAGE"}
-        compute_seqpool_avg(x, offset, out, self.attrs["pad_value"])
-
-    def setUp(self):
-        x, offset, out = self.set_data()
-        self.compute(x, offset, out)
-        if len(offset) > 1:
-            self.outputs = {'Out': (out, [self.set_lod()[0]])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        # Remove MaxIndex after check_grad is refined.
-        out = self.outputs['Out']
-        if isinstance(out, tuple): out = out[0]
-        self.outputs['MaxIndex'] = \
-            np.zeros(out.shape).astype('int32')
-        self.check_grad(["X"], "Out")
-
-
-class TestSeqAvgPoolLen0(TestSeqAvgPool):
-    def set_lod(self):
-        return [[0, 4, 0, 7, 0]]
-
-
-class TestSeqAvgPoolLen0LoDLevel2(TestSeqAvgPool):
-    def set_lod(self):
-        return [[2, 0, 1, 2], [0, 4, 0, 7, 0]]
-
-
-class TestSeqSumPool(TestSeqAvgPool):
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.1, 'pooltype': "SUM"}
-        compute_seqpool_sum(x, offset, out, self.attrs["pad_value"])
-
-
-class TestSeqSumPoolLen0(TestSeqSumPool):
-    def set_lod(self):
-        return [[0, 4, 0, 7, 0]]
-
-
-class TestSeqSumPoolLen0LoDLevel2(TestSeqSumPool):
-    def set_lod(self):
-        return [[2, 0, 1, 2], [0, 4, 0, 7, 0]]
-
-
-class TestSeqMaxPool(TestSeqAvgPool):
-    def set_lod(self):
-        return [[13]]
-
-    def set_data(self):
-        self.op_type = 'sequence_pool'
-        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
-        lod = self.set_lod()
-        level = len(lod) - 1
-        offset = convert_to_offset(lod)
-        for i in range(len(offset[level]) - 1):
-            l = offset[level][i + 1] - offset[level][i]
-            if l > 0:
-                x[offset[level][i] + np.random.randint(l), :] += 2.0
-
-        self.inputs = {'X': (x, lod)}
-
-        out = np.zeros((len(lod[level]), 23)).astype('float32')
-        self.outputs = {'Out': out}
-        return x, offset, out
-
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.5, 'pooltype': "MAX"}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"]
-            else:
-                sub_x = x[offset[level][i]:offset[level][i + 1], :]
-                out[i] = np.amax(sub_x, axis=0)
-
-
-class TestSeqMaxPoolLen0(TestSeqMaxPool):
-    def set_lod(self):
-        return [[0, 1, 1, 5, 6, 0]]
-
-
-class TestSeqMaxPoolLen0LoDLevel2(TestSeqMaxPool):
-    def set_lod(self):
-        return [[2, 0, 3, 1], [0, 1, 1, 5, 6, 0]]
-
-
-class TestSeqSqrtPool(TestSeqAvgPool):
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.0, 'pooltype': "SQRT"}
-        compute_seqpool_sqrt(x, offset, out, self.attrs["pad_value"])
-
-
-class TestSeqSqrtPoolLen0(TestSeqSqrtPool):
-    def set_lod(self):
-        return [[0, 7, 0, 2, 2, 0]]
-
-
-class TestSeqSqrtPoolLen0LoDLevel2(TestSeqSqrtPool):
-    def set_lod(self):
-        return [[1, 2, 0, 3], [0, 7, 0, 2, 2, 0]]
-
-
-class TestSeqLastPool(TestSeqAvgPool):
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.0, 'pooltype': "LAST"}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"]
-            else:
-                sub_x = x[offset[level][i]:offset[level][i + 1], :]
-                out[i] = sub_x[-1, :]
-
-
-class TestSeqLastPoolLen0(TestSeqLastPool):
-    def set_lod(self):
-        return [[0, 3, 4, 0, 4, 0]]
-
-
-class TestSeqLastPoolLen0LoDLevel2(TestSeqLastPool):
-    def set_lod(self):
-        return [[1, 0, 2, 3], [0, 3, 4, 0, 4, 0]]
-
-
-class TestSeqFirstPool(TestSeqAvgPool):
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.3, 'pooltype': "FIRST"}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"]
-            else:
-                sub_x = x[offset[level][i]:offset[level][i + 1], :]
-                out[i] = sub_x[0, :]
-
-
-class TestSeqFirstPoolLen0(TestSeqFirstPool):
-    def set_lod(self):
-        return [[0, 2, 0, 3, 6, 0]]
-
-
-class TestSeqFirstPoolLen0LoDLevel2(TestSeqFirstPool):
-    def set_lod(self):
-        return [[1, 0, 2, 3], [0, 2, 0, 3, 6, 0]]
-
-
-class TestSeqAvgPool2D(TestSeqAvgPool):
-    def set_lod(self):
-        return [[4, 1, 3, 5]]
-
-    def set_data(self):
-        self.op_type = 'sequence_pool'
-        x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
-        lod = self.set_lod()
-        level = len(lod) - 1
-        self.inputs = {'X': (x, lod)}
-        offset = convert_to_offset(lod)
-
-        out = np.zeros((len(lod[level]), 3, 17)).astype('float32')
-        self.outputs = {'Out': out}
-        return x, offset, out
-
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.0, 'pooltype': "AVERAGE"}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"] * np.ones((3, 17))
-            else:
-                sub_x = np.reshape(x[offset[level][i]:offset[level][i + 1], :],
-                                   (-1, 3 * 17))
-                out[i] = np.reshape(sub_x.mean(axis=0), (3, 17))
-
-
-class TestSeqAvgPool2DLen0(TestSeqAvgPool2D):
-    def set_lod(self):
-        return [[0, 5, 0, 8, 0]]
-
-
-class TestSeqAvgPool2DLen0LoDLevel2(TestSeqAvgPool2D):
-    def set_lod(self):
-        return [[1, 0, 4], [0, 5, 0, 8, 0]]
-
-
-class TestSeqSumPool2D(TestSeqAvgPool2D):
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.2, 'pooltype': "SUM"}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"] * np.ones((3, 17))
-            else:
-                sub_x = np.reshape(x[offset[level][i]:offset[level][i + 1], :],
-                                   (-1, 3 * 17))
-                out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
-
-
-class TestSeqSumPool2DLen0(TestSeqSumPool2D):
-    def set_lod(self):
-        return [[0, 8, 0, 5, 0]]
-
-
-class TestSeqSumPool2DLen0LoDLevel2(TestSeqSumPool2D):
-    def set_lod(self):
-        return [[1, 0, 4], [0, 8, 0, 5, 0]]
-
-
-class TestSeqSqrtPool2D(TestSeqAvgPool2D):
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.0, 'pooltype': "SQRT"}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"] * np.ones((3, 17))
-            else:
-                sub_x = np.reshape(x[offset[level][i]:offset[level][i + 1], :],
-                                   (-1, 3 * 17))
-                seq_len = offset[level][i + 1] - offset[level][i]
-                out[i] = np.reshape(
-                    sub_x.sum(axis=0) / np.sqrt(seq_len), (3, 17))
-
-    def test_check_grad(self):
-        # Remove MaxIndex after check_grad is refined.
-        out = self.outputs['Out']
-        if isinstance(out, tuple):
-            out = out[0]
-        self.outputs['MaxIndex'] = \
-            np.zeros(out.shape).astype('int32')
-        self.check_grad(["X"], "Out", max_relative_error=0.06)
-
-
-class TestSeqSqrtPool2DLen0(TestSeqSqrtPool2D):
-    def set_lod(self):
-        return [[0, 8, 0, 5, 0]]
-
-
-class TestSeqSqrtPool2DLen0LoDLevel2(TestSeqSqrtPool2D):
-    def set_lod(self):
-        return [[1, 0, 2, 2], [0, 8, 0, 5, 0]]
-
-
-class TestSeqMaxPool2D(TestSeqAvgPool2D):
-    def set_lod(self):
-        return [[4, 1, 3, 5]]
-
-    def set_data(self):
-        self.op_type = 'sequence_pool'
-        x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
-        self.lod = self.set_lod()
-        level = len(self.lod) - 1
-        self.inputs = {'X': (x, self.lod)}
-        offset = convert_to_offset(self.lod)
-        for i in range(len(offset[level]) - 1):
-            l = offset[level][i + 1] - offset[level][i]
-            if l == 0:
-                continue
-            x[offset[level][i] + np.random.randint(l), :] += 1.0
-
-        out = np.zeros((len(self.lod[level]), 3, 11)).astype('float32')
-        self.outputs = {'Out': out}
-        return x, offset, out
-
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.0, 'pooltype': "MAX"}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"] * np.ones((3, 11))
-                continue
-            sub_x = np.reshape(x[offset[level][i]:offset[level][i + 1], :],
-                               (-1, 3 * 11))
-            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
-
-
-class TestSeqMaxPool2DLen0(TestSeqMaxPool2D):
-    def set_lod(self):
-        return [[0, 3, 0, 10, 0]]
-
-
-class TestSeqMaxPool2DLen0LoDLevel2(TestSeqMaxPool2D):
-    def set_lod(self):
-        return [[1, 0, 2, 2], [0, 3, 0, 10, 0]]
-
-
-class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 1.0, 'pooltype': "MAX", 'is_test': True}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"] * np.ones((3, 11))
-            else:
-                sub_x = np.reshape(x[offset[level][i]:offset[level][i + 1], :],
-                                   (-1, 3 * 11))
-                out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
-
-    def test_check_grad(self):
-        """Grad computation does not apply to Sequence MAX 
-            Pool executed when is_test is true """
-        return
-
-
-class TestSeqMaxPool2DInferenceLen0(TestSeqMaxPool2DInference):
-    def set_lod(self):
-        return [[0, 3, 0, 10, 0]]
-
-
-class TestSeqMaxPool2DInferenceLen0LoDLevel2(TestSeqMaxPool2DInference):
-    def set_lod(self):
-        return [[1, 0, 2, 2], [0, 3, 0, 10, 0]]
-
-
-class TestSeqLastPool2D(TestSeqAvgPool2D):
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.0, 'pooltype': "LAST"}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"] * np.ones((3, 17))
-            else:
-                sub_x = np.reshape(x[offset[level][i]:offset[level][i + 1], :],
-                                   (-1, 3 * 17))
-                out[i] = np.reshape(sub_x[-1, :], (3, 17))
-
-
-class TestSeqLastPool2DLen0(TestSeqLastPool2D):
-    def set_lod(self):
-        return [[0, 3, 0, 1, 9, 0]]
-
-
-class TestSeqLastPool2DLen0LoDLevel2(TestSeqLastPool2D):
-    def set_lod(self):
-        return [[1, 0, 2, 3], [0, 3, 0, 1, 9, 0]]
-
-
-class TestSeqFirstPool2D(TestSeqAvgPool2D):
-    def compute(self, x, offset, out):
-        self.attrs = {"pad_value": 0.0, 'pooltype': "FIRST"}
-        level = len(offset) - 1
-        for i in range(len(offset[level]) - 1):
-            if offset[level][i] == offset[level][i + 1]:
-                out[i] = self.attrs["pad_value"] * np.ones((3, 17))
-            else:
-                sub_x = np.reshape(x[offset[level][i]:offset[level][i + 1], :],
-                                   (-1, 3 * 17))
-                out[i] = np.reshape(sub_x[0, :], (3, 17))
-
-
-class TestSeqFirstPool2DLen0(TestSeqFirstPool2D):
-    def set_lod(self):
-        return [[0, 3, 0, 3, 7, 0]]
-
-
-class TestSeqFirstPool2DLen0LoDLevel2(TestSeqFirstPool2D):
-    def set_lod(self):
-        return [[1, 0, 2, 3], [0, 3, 0, 3, 7, 0]]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_concat.py b/python/paddle/fluid/tests/unittests/test_sequence_concat.py
deleted file mode 100644
index b4a40edc6ac78a4725e1cfed633e59621fa89f58..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_concat.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSequenceConcat(OpTest):
-    def setLoD(self):
-        self.lod1 = [7, 3]
-        self.lod2 = [12, 8]
-        self.out_lod = [19, 11]
-
-    def setUp(self):
-        x1 = np.random.random(size=(10, 80))
-        x2 = np.random.random(size=(20, 80))
-        self.setLoD()
-
-        out = np.concatenate((x1[0:self.lod1[0]], x2[0:self.lod2[0]],
-                              x1[self.lod1[0]:], x2[self.lod2[0]:]))
-
-        self.op_type = "sequence_concat"
-        self.inputs = {
-            'X': [("x1", (x1, [self.lod1])), ("x2", (x2, [self.lod2]))]
-        }
-        self.outputs = {"Out": (out, [self.out_lod])}
-
-    def test_output(self):
-        self.check_output(1e-3)
-
-    def test_dx(self):
-        self.check_grad(inputs_to_check=['x1', 'x2'], output_names="Out")
-
-
-class TestSequenceConcatCase2(TestSequenceConcat):
-    def setLoD(self):
-        self.lod1 = [10, 0]
-        self.lod2 = [12, 8]
-        self.out_lod = [22, 8]
-
-
-class TestSequenceConcatCase3(TestSequenceConcat):
-    def setLoD(self):
-        self.lod1 = [10, 0]
-        self.lod2 = [20, 0]
-        self.out_lod = [30, 0]
-
-
-class TestSequenceConcatCase4(TestSequenceConcat):
-    def setLoD(self):
-        self.lod1 = [0, 10]
-        self.lod2 = [0, 20]
-        self.out_lod = [0, 30]
-
-
-class TestSequenceConcatCase5(TestSequenceConcat):
-    def setLoD(self):
-        self.lod1 = [0, 10]
-        self.lod2 = [20, 0]
-        self.out_lod = [20, 10]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py
deleted file mode 100644
index 99bb33a0a5e201c2708855b3af47bdcfd87cd64a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py
+++ /dev/null
@@ -1,116 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def sequence_enumerate(input_seq, in_lod, win_size, pad_value):
-    lod0 = [0]
-    for i in range(0, len(in_lod[0])):
-        lod0.append(lod0[i] + in_lod[0][i])
-    out_seq = []
-    for i in range(0, len(lod0) - 1):
-        for idx in range(lod0[i], lod0[i + 1]):
-            single_seq = []
-            for word_idx in range(win_size):
-                word_pos = idx + word_idx
-                dat = input_seq[word_pos] if word_pos < lod0[i+1] \
-                    else pad_value
-                single_seq.append(dat)
-            out_seq.append(single_seq)
-    return out_seq
-
-
-class TestSequenceEnumerateOp(OpTest):
-    def setUp(self):
-        self.op_type = "sequence_enumerate"
-        self.init_test_case()
-        self.inputs = {'X': (self.in_seq, self.lod)}
-        self.attrs = {'win_size': self.win_size, 'pad_value': self.pad_value}
-        self.outputs = {'Out': (self.out_seq, self.lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def init_test_case(self):
-        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        self.lod = [[9, 4, 11, 6]]
-        self.win_size = 2
-        self.pad_value = 0
-        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
-                                     self.pad_value)
-        self.out_seq = np.array(out_seq).astype("int32")
-
-
-class TesSequenceEnumerateOpInt64(TestSequenceEnumerateOp):
-    def init_test_case(self):
-        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
-        self.lod = [[9, 4, 11, 6]]
-        self.win_size = 2
-        self.pad_value = 0
-        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
-                                     self.pad_value)
-        self.out_seq = np.array(out_seq).astype("int64")
-
-
-class TestSequenceEnumerateOpLargeWinSize(TestSequenceEnumerateOp):
-    def init_test_case(self):
-        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        self.lod = [[9, 4, 11, 6]]
-        self.win_size = 5
-        self.pad_value = 0
-        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
-                                     self.pad_value)
-        self.out_seq = np.array(out_seq).astype("int32")
-
-
-class TestSequenceEnumerateOpMaxWinSize(TestSequenceEnumerateOp):
-    def init_test_case(self):
-        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        self.lod = [[9, 4, 11, 6]]
-        self.win_size = 30
-        self.pad_value = 0
-        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
-                                     self.pad_value)
-        self.out_seq = np.array(out_seq).astype("int32")
-
-
-class TestSequenceEnumerateOpLargePadValue(TestSequenceEnumerateOp):
-    def init_test_case(self):
-        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        self.lod = [[9, 4, 11, 6]]
-        self.win_size = 5
-        self.pad_value = 5
-        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
-                                     self.pad_value)
-        self.out_seq = np.array(out_seq).astype("int32")
-
-
-class TestSequenceEnumerateOpLargePadValueSeqLen0(TestSequenceEnumerateOp):
-    def init_test_case(self):
-        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        self.lod = [[0, 14, 0, 16, 0]]
-        self.win_size = 5
-        self.pad_value = 5
-        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
-                                     self.pad_value)
-        self.out_seq = np.array(out_seq).astype("int32")
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
deleted file mode 100644
index 53bb301e9a23de4b7f34db69dd55fa0ce804dae5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def sequence_erase(in_seq, lod0, tokens):
-    new_lod0 = []
-    out_seq = []
-    offset = 0
-    for i in range(0, len(lod0)):
-        num_out = 0
-        for dat in in_seq[offset:(offset + lod0[i])]:
-            if dat not in tokens:
-                out_seq.append(dat)
-                num_out += 1
-        offset += lod0[i]
-        new_lod0.append(num_out)
-    return np.array(out_seq).astype("int32"), new_lod0
-
-
-class TestSequenceEraseOpInt32(OpTest):
-    def setUp(self):
-        self.op_type = "sequence_erase"
-        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        lod = [[9, 4, 11, 6]]
-        tokens = [2, 3, 5]
-        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
-        self.attrs = {'tokens': tokens}
-        self.inputs = {'X': (in_seq, lod)}
-        self.outputs = {'Out': (out_seq, [new_lod0])}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSequenceEraseOpInt32LoD2(OpTest):
-    def setUp(self):
-        self.op_type = "sequence_erase"
-        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        lod = [[1, 3], [9, 4, 11, 6]]
-        tokens = [2, 3, 5]
-        out_seq, new_lod0 = sequence_erase(in_seq, lod[-1], tokens)
-        self.attrs = {'tokens': tokens}
-        self.inputs = {'X': (in_seq, lod)}
-        self.outputs = {'Out': (out_seq, lod[:-1] + [new_lod0])}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSequenceEraseOpInt64(OpTest):
-    def setUp(self):
-        self.op_type = "sequence_erase"
-        in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
-        lod = [[9, 4, 11, 6]]
-        tokens = [2, 3, 5]
-        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
-        self.attrs = {'tokens': tokens}
-        self.inputs = {'X': (in_seq, lod)}
-        self.outputs = {'Out': (out_seq, [new_lod0])}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSequenceEraseOpInt64SeqLen0(OpTest):
-    def setUp(self):
-        self.op_type = "sequence_erase"
-        in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
-        lod = [[0, 9, 0, 0, 10, 11, 0]]
-        tokens = [2, 3, 5]
-        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
-        self.attrs = {'tokens': tokens}
-        self.inputs = {'X': (in_seq, lod)}
-        self.outputs = {'Out': (out_seq, [new_lod0])}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSequenceEraseOpEmpty(OpTest):
-    def setUp(self):
-        self.op_type = "sequence_erase"
-        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        lod = [[9, 4, 11, 6]]
-        tokens = []
-        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
-        self.attrs = {'tokens': tokens}
-        self.inputs = {'X': (in_seq, lod)}
-        self.outputs = {'Out': (out_seq, [new_lod0])}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
deleted file mode 100644
index 1e4d1119789533eb020f102bb1b08f00311ceae1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ /dev/null
@@ -1,138 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSequenceExpand(OpTest):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
-        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
-        y_lod = [[1, 3, 4]]
-        self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
-
-    def compute(self):
-        x = self.inputs['X']
-        x_data, x_lod = x if type(x) == tuple else (x, None)
-        y_data, y_lod = self.inputs['Y']
-
-        if hasattr(self, 'attrs'):
-            ref_level = self.attrs['ref_level']
-        else:
-            ref_level = len(y_lod) - 1
-
-        out = np.zeros(shape=((0, ) + x_data.shape[1:]), dtype=x_data.dtype)
-
-        if x_lod is None:
-            # x_idx = [i for i in xrange(x_data.shape[0] + 1)]
-            x_idx = [1] * x_data.shape[0]
-        else:
-            x_idx = x_lod[0]
-            out_lod = [[]]
-
-        offset = 0
-        for i in range(len(y_lod[ref_level])):
-            repeat_num = y_lod[ref_level][i]
-            x_len = x_idx[i]
-
-            if repeat_num > 0:
-                x_sub = x_data[offset:(offset + x_len), :]
-                stacked_x_sub = x_sub
-                for r in range(repeat_num - 1):
-                    stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
-                out = np.vstack((out, stacked_x_sub))
-                if x_lod is not None:
-                    for j in range(repeat_num):
-                        out_lod[0].append(x_len)
-            offset += x_len
-
-        if x_lod is None:
-            self.outputs = {'Out': out}
-        else:
-            self.outputs = {'Out': (out, out_lod)}
-
-    def setUp(self):
-        self.op_type = 'sequence_expand'
-        self.set_data()
-        self.compute()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestSequenceExpandCase1(TestSequenceExpand):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
-        y_lod = [[2, 3], [2, 2, 3, 3, 3]]
-        self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
-        self.attrs = {'ref_level': 1}
-
-
-class TestSequenceExpandCase2(TestSequenceExpand):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
-        x_lod = [[1]]
-        y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
-        y_lod = [[2], [1, 1]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-        self.attrs = {'ref_level': 0}
-
-
-class TestSequenceExpandCase3(TestSequenceExpand):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
-        x_lod = [[1, 1, 1, 1]]
-        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
-        y_lod = [[2, 2, 2, 2]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-
-
-class TestSequenceExpandCase4(TestSequenceExpand):
-    def set_data(self):
-        data = np.random.uniform(0.1, 1, [5 * 2, 1])
-        x_data = np.array(data).reshape([5, 2]).astype('float32')
-        x_lod = [[2, 3]]
-        y_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        y_lod = [[2], [2, 3]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-
-
-class TestSequenceExpandCase5(TestSequenceExpand):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
-        y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
-        y_lod = [[2, 4], [2, 2, 3, 0, 3, 3]]
-        self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
-        self.attrs = {'ref_level': 1}
-
-
-class TestSequenceExpandCase6(TestSequenceExpand):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
-        x_lod = [[1, 1, 0, 1, 1]]
-        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
-        y_lod = [[0, 2, 4, 2, 0]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py b/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py
deleted file mode 100644
index 30c487eea3dfb2c5d2349a00e62d91a7b7fdc013..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSequenceExpandAs(OpTest):
-    def setUp(self):
-        self.op_type = 'sequence_expand_as'
-        self.set_data()
-        self.compute()
-
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
-        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
-        y_lod = [[1, 3, 4]]
-        self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
-
-    def compute(self):
-        x = self.inputs['X']
-        x_data, x_lod = x if type(x) == tuple else (x, None)
-        y_data, y_lod = self.inputs['Y']
-
-        assert len(y_lod) == 1 and len(y_lod[0]) == x_data.shape[0]
-
-        repeats = []
-        for i in range(len(y_lod[0])):
-            repeat_num = y_lod[0][i]
-            if repeat_num == 0:
-                continue
-            repeats.extend([i for _ in range(repeat_num)])
-
-        out_data = x_data[repeats]
-        self.outputs = {'Out': (out_data, y_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestSequenceExpandAsCase1(TestSequenceExpandAs):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        x_lod = [[2, 3]]
-        y_data = np.random.uniform(0.1, 1, [10, 1]).astype('float32')
-        y_lod = [[2, 2, 0, 3, 3]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-
-
-class TestSequenceExpandAsCase2(TestSequenceExpandAs):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        x_lod = [[2, 3]]
-        y_data = np.random.uniform(0.1, 1, [10, 1]).astype('float32')
-        y_lod = [[0, 4, 0, 6, 0]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-
-
-class TestSequenceExpandAsCase3(TestSequenceExpandAs):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
-        x_lod = [[1]]
-        y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
-        y_lod = [[2]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_mask.py b/python/paddle/fluid/tests/unittests/test_sequence_mask.py
deleted file mode 100644
index 7e9f4cb5601345d1036959c24222c33efaac9e4f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_mask.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from op_test import OpTest
-import paddle.fluid as fluid
-from paddle.fluid.framework import convert_np_dtype_to_dtype_
-import paddle.fluid.core as core
-import numpy as np
-import copy
-import unittest
-
-
-class SequenceMaskTestBase(OpTest):
-    def initDefaultParameters(self):
-        self.op_type = 'sequence_mask'
-        self.maxlen = 10
-        self.mask_dtype = 'int64'
-        self.x = [[0, 3, 4], [5, 7, 9]]
-
-    def initParameters(self):
-        pass
-
-    def setUp(self):
-        self.initDefaultParameters()
-        self.initParameters()
-        if not isinstance(self.x, np.ndarray):
-            self.x = np.array(self.x)
-
-        self.inputs = {'X': self.x}
-        self.outputs = {'Y': self.calc_ground_truth_mask()}
-        self.attrs = {
-            'maxlen': self.maxlen,
-            'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype)
-        }
-
-    def calc_ground_truth_mask(self):
-        maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen
-        shape = self.x.shape + (maxlen, )
-        index_broadcast = np.broadcast_to(
-            np.reshape(
-                range(maxlen), newshape=[1] * self.x.ndim + [-1]),
-            shape=shape)
-        x_broadcast = np.broadcast_to(
-            np.reshape(
-                self.x, newshape=self.x.shape + (-1, )), shape=shape)
-        return (index_broadcast < x_broadcast).astype(self.mask_dtype)
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class SequenceMaskTest1(SequenceMaskTestBase):
-    def initParameters(self):
-        self.mask_dtype = 'bool'
-
-
-class SequenceMaskTest2(SequenceMaskTestBase):
-    def initParameters(self):
-        self.mask_dtype = 'uint8'
-
-
-class SequenceMaskTest3(SequenceMaskTestBase):
-    def initParameters(self):
-        self.mask_dtype = 'int32'
-
-
-class SequenceMaskTest4(SequenceMaskTestBase):
-    def initParameters(self):
-        self.mask_dtype = 'float32'
-
-
-class SequenceMaskTest5(SequenceMaskTestBase):
-    def initParameters(self):
-        self.mask_dtype = 'float64'
-
-
-class SequenceMaskTest6(SequenceMaskTestBase):
-    def initParameters(self):
-        self.maxlen = -1
-
-
-class SequenceMaskTestBase_tensor_attr(OpTest):
-    def initDefaultParameters(self):
-        self.op_type = 'sequence_mask'
-        self.maxlen = 10
-        self.maxlen_tensor = np.ones((1), 'int32') * 10
-        self.mask_dtype = 'int64'
-        self.x = [[0, 3, 4], [5, 7, 9]]
-
-    def initParameters(self):
-        pass
-
-    def setUp(self):
-        self.initDefaultParameters()
-        self.initParameters()
-        if not isinstance(self.x, np.ndarray):
-            self.x = np.array(self.x)
-
-        self.inputs = {'X': self.x, 'MaxLenTensor': self.maxlen_tensor}
-        self.outputs = {'Y': self.calc_ground_truth_mask()}
-        self.attrs = {'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype)}
-
-    def calc_ground_truth_mask(self):
-        maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen
-        shape = self.x.shape + (maxlen, )
-        index_broadcast = np.broadcast_to(
-            np.reshape(
-                range(maxlen), newshape=[1] * self.x.ndim + [-1]),
-            shape=shape)
-        x_broadcast = np.broadcast_to(
-            np.reshape(
-                self.x, newshape=self.x.shape + (-1, )), shape=shape)
-        return (index_broadcast < x_broadcast).astype(self.mask_dtype)
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class SequenceMaskTest1_tensor_attr(SequenceMaskTestBase_tensor_attr):
-    def initParameters(self):
-        self.mask_dtype = 'bool'
-
-
-class SequenceMaskTest2_tensor_attr(SequenceMaskTestBase_tensor_attr):
-    def initParameters(self):
-        self.mask_dtype = 'uint8'
-
-
-class SequenceMaskTest3_tensor_attr(SequenceMaskTestBase_tensor_attr):
-    def initParameters(self):
-        self.mask_dtype = 'int32'
-
-
-class SequenceMaskTest4_tensor_attr(SequenceMaskTestBase_tensor_attr):
-    def initParameters(self):
-        self.mask_dtype = 'float32'
-
-
-class SequenceMaskTest5_tensor_attr(SequenceMaskTestBase_tensor_attr):
-    def initParameters(self):
-        self.mask_dtype = 'float64'
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py
deleted file mode 100644
index 01ed53471fe1385a803604f4e1de04954c1099c7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSequencePadOp(OpTest):
-    def set_attr(self):
-        self.x_shape = [12, 4]
-        self.x_len_lod = [[2, 3, 4, 3]]
-        self.pad_value = [1.0]
-        self.padded_length = -1
-        self.dtype = 'float32'
-
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 0.5, self.x_shape).astype(self.dtype)
-        pad_value_data = np.array(self.pad_value).astype(self.dtype)
-        self.inputs = {
-            'X': (x_data, self.x_len_lod),
-            'PadValue': pad_value_data
-        }
-        self.attrs = {'padded_length': self.padded_length}
-
-    def compute(self):
-        # get padded length
-        padded_length = self.padded_length
-        x_len_lod_0 = self.x_len_lod[0]
-        if padded_length == -1:
-            max_seq_len = 0
-            for l in x_len_lod_0:
-                max_seq_len = max(max_seq_len, l)
-            padded_length = max_seq_len
-
-        # do padding
-        x_data = self.inputs['X'][0]
-        pad_value_data = self.inputs['PadValue']
-        if pad_value_data.shape == (1, ):
-            pad_value_data = np.broadcast_to(
-                pad_value_data, shape=x_data.shape[1:])
-        padded_sequences = []
-        start_idx = 0
-        for l in x_len_lod_0:
-            end_idx = start_idx + l
-            seq = x_data[start_idx:end_idx]
-            to_pad_len = padded_length - l
-            for _ in range(to_pad_len):
-                seq = np.append(seq, pad_value_data[np.newaxis, :], axis=0)
-            padded_sequences.append(seq)
-            start_idx = end_idx
-
-        out_data = np.array(padded_sequences)
-        length = np.array(self.x_len_lod[0]).reshape((-1))
-        self.outputs = {'Out': out_data, 'Length': length}
-
-    def setUp(self):
-        self.op_type = 'sequence_pad'
-        self.set_attr()
-        self.set_data()
-        self.compute()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestSequencePadOp2(TestSequencePadOp):
-    def set_attr(self):
-        self.x_shape = [12, 4]
-        self.x_len_lod = [[2, 3, 4, 3]]
-        self.pad_value = [1.0, 2.0, 3.0, 4.0]
-        self.padded_length = -1
-        self.dtype = 'float32'
-
-
-class TestSequencePadOp3(TestSequencePadOp):
-    def set_attr(self):
-        self.x_shape = [12, 4]
-        self.x_len_lod = [[2, 3, 4, 3]]
-        self.pad_value = [1.0]
-        self.padded_length = 7
-        self.dtype = 'float32'
-
-
-class TestSequencePadOp4(TestSequencePadOp):
-    def set_attr(self):
-        self.x_shape = [12, 4]
-        self.x_len_lod = [[2, 3, 4, 3]]
-        self.pad_value = [1.0, 2.0, 3.0, 4.0]
-        self.padded_length = 7
-        self.dtype = 'float32'
-
-
-class TestSequencePadOp5(TestSequencePadOp):
-    def set_attr(self):
-        self.x_shape = [12, 2, 2]
-        self.x_len_lod = [[2, 3, 4, 3]]
-        self.pad_value = [1.0]
-        self.padded_length = -1
-        self.dtype = 'float32'
-
-
-class TestSequencePadOp6(TestSequencePadOp):
-    def set_attr(self):
-        self.x_shape = [12, 2, 2]
-        self.x_len_lod = [[2, 3, 4, 3]]
-        self.pad_value = [[1.0, 2.0], [3.0, 4.0]]
-        self.padded_length = -1
-        self.dtype = 'float32'
-
-
-class TestSequencePadOp7(TestSequencePadOp):
-    def set_attr(self):
-        self.x_shape = [12, 2, 2]
-        self.x_len_lod = [[2, 3, 4, 3]]
-        self.pad_value = [1.0]
-        self.padded_length = 7
-        self.dtype = 'float32'
-
-
-class TestSequencePadOp8(TestSequencePadOp):
-    def set_attr(self):
-        self.x_shape = [12, 2, 2]
-        self.x_len_lod = [[0, 8, 0, 4, 0]]
-        self.pad_value = [1.0]
-        self.padded_length = 10
-        self.dtype = 'float32'
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
deleted file mode 100644
index e2e7837dac7a2430331c6b595174057b388ad043..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-from op_test import OpTest
-
-
-class TestSequenceReshape(OpTest):
-    def init_data(self):
-        self.dimension = 12
-        self.x_lod = [[4, 1, 3, 3]]
-        self.x = np.random.uniform(0.1, 1, [11, 24]).astype('float32')
-
-    def setUp(self):
-        self.init_data()
-        self.op_type = 'sequence_reshape'
-        self.inputs = {'X': (self.x, self.x_lod)}
-        self.attrs = {'new_dim': self.dimension}
-        out, out_lod = self.compute_output(self.x, self.x_lod, self.dimension)
-        self.outputs = {'Out': (out, out_lod)}
-
-    def compute_output(self, x, x_lod, dimension):
-        x_width = x.shape[1]
-        out_lod = [[]]
-        for i in range(len(x_lod[0])):
-            seq_len = x_lod[0][i]
-            offset = (seq_len * x_width) / dimension
-            assert int(offset) * dimension == seq_len * x_width
-            out_lod[0].append(int(offset))
-        out = np.zeros(shape=(sum(out_lod[0]), dimension)).astype('float32')
-        out.ravel()[:] = x.ravel()[:]
-        return out, out_lod
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestSequenceReshape_reduce(TestSequenceReshape):
-    def init_data(self):
-        self.dimension = 24
-        self.x_lod = [[4, 2, 2, 4]]
-        self.x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
-
-
-class TestSequenceReshape_same(TestSequenceReshape):
-    def init_data(self):
-        self.dimension = 12
-        self.x_lod = [[4, 2, 2, 4]]
-        self.x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
-
-
-class TestSequenceReshape_reduce_seq_len0(TestSequenceReshape):
-    def init_data(self):
-        self.dimension = 24
-        self.x_lod = [[0, 6, 0, 2, 4]]
-        self.x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
-
-
-class TestSequenceReshape_reduce_seq_len0_case1(TestSequenceReshape):
-    def init_data(self):
-        self.dimension = 24
-        self.x_lod = [[0, 2, 8, 2, 0]]
-        self.x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reverse.py b/python/paddle/fluid/tests/unittests/test_sequence_reverse.py
deleted file mode 100644
index 09fb068ae6682be3d0f6506841eb8efceea7b61c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_reverse.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from op_test import OpTest
-import numpy as np
-
-
-class TestSequenceReverseBase(OpTest):
-    def initParameters(self):
-        pass
-
-    def setUp(self):
-        self.size = (10, 3, 4)
-        self.lod = [2, 3, 5]
-        self.dtype = 'float32'
-        self.initParameters()
-        self.op_type = 'sequence_reverse'
-        self.x = np.random.random(self.size).astype(self.dtype)
-        self.y = self.get_output()
-
-        self.inputs = {'X': (self.x, [self.lod, ]), }
-        self.outputs = {'Y': (self.y, [self.lod, ]), }
-
-    def get_output(self):
-        tmp_x = np.reshape(self.x, newshape=[self.x.shape[0], -1])
-        tmp_y = np.ndarray(tmp_x.shape).astype(self.dtype)
-        prev_idx = 0
-        for cur_len in self.lod:
-            idx_range = range(prev_idx, prev_idx + cur_len)
-            tmp_y[idx_range, :] = np.flip(tmp_x[idx_range, :], 0)
-            prev_idx += cur_len
-
-        return np.reshape(tmp_y, newshape=self.x.shape).astype(self.dtype)
-
-    def test_output(self):
-        self.check_output(0)
-
-    def test_grad(self):
-        self.check_grad(['X'], 'Y')
-
-
-class TestSequenceReserve1(TestSequenceReverseBase):
-    def initParameters(self):
-        self.size = (12, 10)
-        self.lod = [4, 5, 3]
-
-
-class TestSequenceReverse2(TestSequenceReverseBase):
-    def initParameters(self):
-        self.size = (12, 10)
-        self.lod = [12]
-
-
-class TestSequenceReverse3(TestSequenceReverseBase):
-    def initParameters(self):
-        self.size = (12, 10)
-        self.lod = [3, 0, 6, 3]
-
-
-class TestSequenceReverse3(TestSequenceReverseBase):
-    def initParameters(self):
-        self.size = (12, 10)
-        self.lod = [0, 2, 10, 0]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py b/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py
deleted file mode 100644
index 4ffe2c2a12bc12eaa4f6ddb860f977de1265cb54..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSequenceScatterOp(OpTest):
-    def init_lod(self):
-        return [[3, 5, 4]]
-
-    def setUp(self):
-        self.op_type = "sequence_scatter"
-
-        X_data = np.random.uniform(0.1, 1.0, [3, 6]).astype('float32')
-        Ids_data = np.array([[0], [1], [2], [5], [4], [3], [0], [1], [3], [2],
-                             [5], [4]]).astype('int64')
-        Ids_lod = self.init_lod()
-
-        Updates_data = np.random.uniform(0.1, 1.0, [12, 1]).astype('float32')
-        Updates_lod = Ids_lod
-
-        Out_data = np.copy(X_data)
-        offset = 0
-        for i in range(3):
-            Out_data[i][Ids_data[offset:(offset + Ids_lod[0][
-                i])]] += Updates_data[offset:(offset + Ids_lod[0][i])]
-            offset += Ids_lod[0][i]
-
-        self.inputs = {
-            'X': X_data,
-            'Ids': (Ids_data, Ids_lod),
-            'Updates': (Updates_data, Updates_lod)
-        }
-        self.outputs = {'Out': Out_data}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['Updates'], 'Out', in_place=True)
-
-
-class TestSequenceScatterOpSeqLen0(TestSequenceScatterOp):
-    def init_lod(self):
-        return [[6, 0, 6]]
-
-
-class TestSequenceScatterOpSeqLen0Case1(TestSequenceScatterOp):
-    def init_lod(self):
-        return [[0, 6, 6]]
-
-
-class TestSequenceScatterOpSeqLen0Case2(TestSequenceScatterOp):
-    def init_lod(self):
-        return [[6, 6, 0]]
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
deleted file mode 100644
index 9c5492b5b15c1ddbe61e5840b5075ba1c010f0d8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-from op_test import OpTest
-
-
-class TestSequenceSliceOp(OpTest):
-    def set_data(self):
-        self.init_test_case()
-        # only supprot one level LoD
-        x = np.random.random(self.x_dim).astype('float32')
-        lod = self.x_lod
-        offset = np.array(self.offset).astype("int64")
-        length = np.array(self.length).astype("int64")
-
-        self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length}
-        outs = []  #np.zeros((100, 3, 2)).astype('float32')
-        out_lod = [[]]
-        lod_offset = 0
-        for i in range(len(offset)):
-            sub_x = x[lod_offset + offset[i, 0]:lod_offset + offset[i, 0] +
-                      length[i, 0], :]
-            outs.append(sub_x)
-            out_lod[0].append(len(sub_x))
-            lod_offset += lod[0][i]
-        outs = np.concatenate(outs, axis=0)
-        self.outputs = {'Out': (outs, out_lod)}
-
-    def init_test_case(self):
-        self.x_dim = (100, 3, 2)
-        self.x_lod = [[20, 20, 20, 20, 20]]
-        self.offset = [[1], [2], [3], [4], [5]]
-        self.length = [[10], [8], [6], [4], [2]]
-
-    def setUp(self):
-        self.op_type = "sequence_slice"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSequenceSliceOpSeqlen0Case0(TestSequenceSliceOp):
-    def init_test_case(self):
-        self.x_dim = (100, 3, 2)
-        self.x_lod = [[20, 30, 0, 30, 20]]
-        self.offset = [[1], [2], [0], [4], [5]]
-        self.length = [[10], [8], [0], [4], [2]]
-
-
-class TestSequenceSliceOpSeqlen0Case1(TestSequenceSliceOp):
-    def init_test_case(self):
-        self.x_dim = (100, 3, 2)
-        self.x_lod = [[0, 70, 0, 30, 0]]
-        self.offset = [[0], [2], [0], [4], [0]]
-        self.length = [[0], [8], [0], [4], [0]]
-
-
-class TestSequenceSliceOpSeqlen0Case2(TestSequenceSliceOp):
-    def init_test_case(self):
-        self.x_dim = (100, 3, 2)
-        self.x_lod = [[0, 100, 0, 0, 0]]
-        self.offset = [[0], [2], [0], [0], [0]]
-        self.length = [[0], [8], [0], [0], [0]]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
deleted file mode 100644
index 154a53ee84d2014835b3caf901b62eb8629da753..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_softmax_op import stable_softmax
-import paddle.fluid.core as core
-
-
-class TestSequenceSoftmaxOp(OpTest):
-    def setUp(self):
-        self.op_type = "sequence_softmax"
-        self.use_cudnn = False
-        self.init_op_type()
-
-        x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
-        self.init_lod()
-        out = np.zeros((11, 1)).astype("float32")
-        offset = 0
-        for i in range(len(self.lod[0])):
-            if (self.lod[0][i] == 0):
-                continue
-            sub_x = x[offset:offset + self.lod[0][i], :]
-            sub_x = sub_x.reshape(1, self.lod[0][i])
-            sub_out = stable_softmax(sub_x)
-            out[offset:offset + self.lod[0][i], :] = sub_out.reshape(
-                self.lod[0][i], 1)
-            offset += self.lod[0][i]
-
-        self.inputs = {"X": (x, self.lod)}
-        self.outputs = {"Out": out}
-        self.attrs = {'use_cudnn': self.use_cudnn, }
-
-    def init_lod(self):
-        self.lod = [[4, 1, 3, 3]]
-
-    def init_op_type(self):
-        pass
-
-    def test_check_output(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            self.check_output()
-
-    def test_check_grad(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ["X"], "Out", max_relative_error=0.01)
-        else:
-            self.check_grad(["X"], "Out", max_relative_error=0.01)
-
-
-# ----------------cudnn Sequencesoftmax----------------
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp):
-    def init_op_type(self):
-        self.use_cudnn = True
-
-
-class TestSequenceSoftmaxOpSeqLen0Case0(TestSequenceSoftmaxOp):
-    def init_lod(self):
-        self.lod = [[4, 0, 4, 3]]
-
-
-class TestSequenceSoftmaxOpSeqLen0Case1(TestSequenceSoftmaxOp):
-    def init_lod(self):
-        self.lod = [[0, 4, 7, 0]]
-
-
-class TestSequenceSoftmaxOpSeqLen0Case2(TestSequenceSoftmaxOp):
-    def init_lod(self):
-        self.lod = [[0, 0, 0, 11]]
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_topk_avg_pooling.py b/python/paddle/fluid/tests/unittests/test_sequence_topk_avg_pooling.py
deleted file mode 100644
index 0311421141c067ad7775218cae0b2549956df74b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_topk_avg_pooling.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from copy import deepcopy
-
-
-class TestSequenceTopkAvgPoolingOp(OpTest):
-    def setUp(self):
-        self.init_op_type()
-        self.set_data()
-        self.compute()
-
-    def init_op_type(self):
-        self.op_type = "sequence_topk_avg_pooling"
-
-    def set_data(self):
-        topks = [2]
-        channel_num = 3
-        dim = 10
-        row = [2, 4]
-        col = [3, 2]
-        self.init_data(topks, channel_num, row, col, dim)
-
-    def init_data(self, topks, channel_num, row, col, dim=10):
-        self.attrs = {"topks": topks, "channel_num": channel_num}
-        feature = [row[i] * col[i] for i in range(len(row))]
-        numel = sum(feature) * channel_num
-        x_data = np.random.random((numel, )).astype('float32')
-        x_lod = [[x * channel_num for x in feature]]
-        row_data = np.random.random((sum(row), dim)).astype('float32')
-        col_data = np.random.random((sum(col), dim)).astype('float32')
-        self.inputs = {
-            'X': (x_data, x_lod),
-            'ROW': (row_data, [row]),
-            'COLUMN': (col_data, [col])
-        }
-
-    def compute(self):
-        topks = self.attrs['topks']
-        max_k = topks[-1]
-        x_data, x_lod = self.inputs['X']
-        row_data, row_lod = self.inputs['ROW']
-        col_data, col_lod = self.inputs['COLUMN']
-        channel_num = self.attrs['channel_num']
-        out = np.zeros((0, len(topks) * channel_num), dtype=x_data.dtype)
-        pos = np.zeros((0, ), dtype='int32')
-        out_lod = deepcopy(row_lod)
-
-        offset = 0
-        for idx in range(len(x_lod[0])):
-            x_len = x_lod[0][idx]
-            self.assertTrue(
-                x_len == channel_num * row_lod[0][idx] * col_lod[0][idx],
-                "x_len: %s can't mod channel_num: %s" % (x_len, channel_num))
-            # feature = x_len / channel_num
-            out_tmp = np.zeros((0, ), dtype=x_data.dtype)
-            pos_tmp = np.zeros((0, ), dtype='int32')
-            for ch in range(channel_num):
-                for r_id in range(row_lod[0][idx]):
-                    x_sub = x_data[offset:(offset + col_lod[0][idx])]
-                    topk_val, topk_pos = self.get_topk(x_sub, max_k)
-                    sum_data = self.topk_sum(topk_val, topk_pos, max_k)
-                    new_feature = np.array(
-                        [sum_data[topk] / topk for topk in topks])
-                    out_tmp = np.hstack((out_tmp, new_feature))
-                    pos_tmp = np.hstack((pos_tmp, topk_pos))
-
-                    offset += col_lod[0][idx]
-
-            out_tmp = out_tmp.reshape([channel_num, -1, len(topks)]).transpose(
-                1, 0, 2)
-            pos_tmp = pos_tmp.reshape([channel_num, -1, max_k]).transpose(1, 0,
-                                                                          2)
-            out = np.vstack(
-                (out, out_tmp.reshape([-1, len(topks) * channel_num])))
-            pos = np.hstack((pos, pos_tmp.flatten()))
-
-        self.outputs = {'Out': (out.astype('float32'), out_lod), 'pos': pos}
-
-    def get_topk(self, x, topk):
-        real_topk = topk if topk < len(x) else len(x)
-        topk_pos = np.array(x).argsort()[-topk:][::-1]
-        topk_val = np.array(x)[topk_pos]
-        if real_topk < topk:
-            topk_pos = np.hstack((topk_pos, np.full((topk - real_topk, ), -1)))
-            topk_val = np.hstack((topk_val, np.full((topk - real_topk, ), 0.0)))
-
-        return topk_val, topk_pos
-
-    def topk_sum(self, x, pos, max_k):
-        sum_data = [0.] * (max_k + 1)
-        for i in range(1, max_k + 1):
-            if pos[i - 1] == -1:
-                sum_data[i] = sum_data[i - 1]
-            else:
-                sum_data[i] = sum_data[i - 1] + x[i - 1]
-        return sum_data
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.005)
-
-
-class TestSequenceTopkAvgPoolingOpCase1(TestSequenceTopkAvgPoolingOp):
-    def set_data(self):
-        topks = [2, 3]
-        channel_num = 3
-        dim = 10
-        row = [3]
-        col = [4]
-        self.init_data(topks, channel_num, row, col, dim)
-
-    def test_api(self):
-        import paddle.fluid as fluid
-        x = fluid.layers.data(name='x', shape=[1], lod_level=1)
-        row = fluid.layers.data(name='row', shape=[10], lod_level=1)
-        col = fluid.layers.data(name='col', shape=[10], lod_level=1)
-        topk_avg = fluid.contrib.sequence_topk_avg_pooling(
-            input=x, row=row, col=col, topks=[1, 3, 5], channel_num=5)
-
-        place = fluid.CPUPlace()
-        x_tensor = fluid.create_lod_tensor(
-            np.random.rand(45, 1).astype('float32'), [[30, 15]], place)
-        row_tensor = fluid.create_lod_tensor(
-            np.random.rand(5, 10).astype('float32'), [[2, 3]], place)
-        col_tensor = fluid.create_lod_tensor(
-            np.random.rand(4, 10).astype('float32'), [[3, 1]], place)
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(
-            feed={'x': x_tensor,
-                  'row': row_tensor,
-                  'col': col_tensor},
-            fetch_list=[topk_avg],
-            return_numpy=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
deleted file mode 100644
index 19ef00ba83c578439c0fe515b521888fddfff1db..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import six
-import numpy as np
-from op_test import OpTest
-
-
-class TestSequenceUnpadOp(OpTest):
-    def init(self):
-        self.length = [2, 3, 4]
-        self.x_shape = (3, 5)
-        self.dtype = "float32"
-
-    def compute(self):
-        assert len(self.length) == self.x_shape[0]
-        x = np.random.random(self.x_shape).astype(self.dtype)
-        out_lod = [self.length]
-
-        out = x[0, 0:self.length[0]]
-        for i in six.moves.xrange(1, x.shape[0]):
-            out = np.append(out, x[i, 0:self.length[i]], axis=0)
-
-        out_shape = (sum(self.length), )
-        if len(self.x_shape) == 2:
-            out_shape = out_shape + (1, )
-        else:
-            out_shape = out_shape + self.x_shape[2:]
-
-        self.inputs = {'X': x, 'Length': np.array(self.length).astype('int64')}
-        self.outputs = {'Out': (out.reshape(out_shape), out_lod)}
-
-    def setUp(self):
-        self.op_type = 'sequence_unpad'
-        self.init()
-        self.compute()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestSequenceUnpadOp2(TestSequenceUnpadOp):
-    def init(self):
-        self.length = [2, 3, 4]
-        self.x_shape = (3, 5, 4, 3)
-        self.dtype = "float32"
-
-
-class TestSequenceUnpadOp3(TestSequenceUnpadOp):
-    def init(self):
-        self.length = [5, 2, 3, 4]
-        self.x_shape = (4, 5, 3, 3, 6)
-        self.dtype = "float64"
-
-
-class TestSequenceUnpadOp4(TestSequenceUnpadOp):
-    def init(self):
-        self.length = [5, 0, 0, 4]
-        self.x_shape = (4, 5, 3, 3, 6)
-        self.dtype = "float64"
-
-
-class TestSequenceUnpadOp4(TestSequenceUnpadOp):
-    def init(self):
-        self.length = [0, 4, 3, 0]
-        self.x_shape = (4, 5, 3, 3, 6)
-        self.dtype = "float64"
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
deleted file mode 100644
index 162e6d1938c8174d342d8e4af1e4b6c424afc521..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ /dev/null
@@ -1,190 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from op_test import OpTest
-
-
-class TestSGDOp(OpTest):
-    def setUp(self):
-        self.op_type = "sgd"
-        self.conf()
-        w = np.random.random((self.h, self.w)).astype("float32")
-        g = np.random.random((self.h, self.w)).astype("float32")
-        lr = np.array([0.1]).astype("float32")
-
-        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
-        self.outputs = {'ParamOut': w - lr * g}
-
-    def conf(self):
-        self.h = 102
-        self.w = 105
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSGDOpCase8X(TestSGDOp):
-    def conf(self):
-        self.h = 10
-        self.w = 64
-
-
-class TestSparseSGDOp(unittest.TestCase):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Grad Variable   
-        height = 10
-        rows = [0, 4, 7]
-        self.conf()
-
-        grad_selected_rows = scope.var('Grad').get_selected_rows()
-        grad_selected_rows.set_height(height)
-        grad_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), self.row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
-        np_array[2, 8] = 4.0
-
-        grad_tensor = grad_selected_rows.get_tensor()
-        grad_tensor.set(np_array, place)
-
-        # create and initialize Param Variable
-        param = scope.var('Param').get_tensor()
-        param_array = np.full((height, self.row_numel), 5.0).astype("float32")
-        param.set(param_array, place)
-
-        # create and initialize LeraningRate Variable
-        lr = scope.var('LearningRate').get_tensor()
-        lr_array = np.full((1), 2.0).astype("float32")
-        lr.set(lr_array, place)
-
-        # create and run sgd operator
-        sgd_op = Operator(
-            "sgd",
-            Param='Param',
-            Grad='Grad',
-            ParamOut='Param',
-            LearningRate='LearningRate')
-        sgd_op.run(scope, place)
-
-        # get and compare result
-        result_array = np.array(param)
-
-        # rows[0] = 0, 5.0 - 2.0 * 2.0
-        self.assertAlmostEqual(1.0, result_array[rows[0], 0])
-        # rows[0] = 0, 5.0 - 2.0 * 1.0
-        self.assertAlmostEqual(3.0, result_array[rows[0], 2])
-        # 5.0 - 2.0 * 0.0
-        self.assertAlmostEqual(5.0, result_array[1, 0])
-        # rows[1] = 4, 5.0 - 2.0 * 1.0
-        self.assertAlmostEqual(3.0, result_array[rows[1], 10])
-        # 5.0 - 2.0 * 0.0
-        self.assertAlmostEqual(5.0, result_array[5, 8])
-        # rows[2] = 7, 5.0 - 2.0 * 1.0
-        self.assertAlmostEqual(3.0, result_array[rows[2], 1])
-        # rows[2] = 7, 5.0 - 2.0 * 4.0
-        self.assertAlmostEqual(-3.0, result_array[rows[2], 8])
-
-    def test_sparse_sgd(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
-
-    def conf(self):
-        self.row_numel = 12
-
-
-class TestSparseSGDOpCase8X(TestSparseSGDOp):
-    def conf(self):
-        self.row_numel = 16
-
-
-class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        row_width = 12
-        # create and initialize Grad Variable
-        grad_height = 10
-        grad_rows = [0, 4, 7]
-
-        grad_selected_rows = scope.var('Grad').get_selected_rows()
-        grad_selected_rows.set_height(grad_height)
-        grad_selected_rows.set_rows(grad_rows)
-        grad_array = np.ones((len(grad_rows), row_width)).astype("float32")
-        grad_array[0, 0] = 2.0
-        grad_array[2, 8] = 4.0
-
-        grad_tensor = grad_selected_rows.get_tensor()
-        grad_tensor.set(grad_array, place)
-
-        # create and initialize Param Variable
-        # create and initialize W Variable
-        param_rows = [0, 1, 2, 3, 4, 5, 6, 7]
-
-        # init Param
-        w_selected_rows = scope.var('Param').get_selected_rows()
-        w_selected_rows.set_height(len(param_rows))
-        w_selected_rows.set_rows(param_rows)
-        w_selected_rows.sync_index()
-        w_array = np.ones((len(param_rows), row_width)).astype("float32")
-        for i in range(len(param_rows)):
-            w_array[i] *= i
-        w_tensor = w_selected_rows.get_tensor()
-        w_tensor.set(w_array, place)
-
-        w_before_optimize = np.array(w_tensor)
-
-        # create and initialize LeraningRate Variable
-        lr_value = 0.1
-        lr = scope.var('LearningRate').get_tensor()
-        lr_array = np.full((1), lr_value).astype("float32")
-        lr.set(lr_array, place)
-
-        # optimize with Python
-        w_after_optimize = np.copy(w_before_optimize)
-        for index, id in enumerate(grad_rows):
-            w_after_optimize[id] = w_before_optimize[
-                id] - lr_value * grad_array[index]
-
-        # create and run sgd operator
-        sgd_op = Operator(
-            "sgd",
-            Param='Param',
-            Grad='Grad',
-            ParamOut='Param',
-            LearningRate='LearningRate')
-        sgd_op.run(scope, place)
-
-        # get and compare result
-        result_array = np.array(w_tensor)
-        assert (result_array == w_after_optimize).all()
-
-    def test_sparse_parameter_sgd(self):
-        places = [core.CPUPlace()]
-        # do not support GPU kernel currently
-        for place in places:
-            self.check_with_place(place)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py
deleted file mode 100644
index 02231ea943e1e92a08730e6e9f1aa3cefeb927c0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_shape_op.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestShapeOp(OpTest):
-    def setUp(self):
-        self.op_type = "shape"
-        self.config()
-        self.shape = [2, 3]
-        input = np.zeros(self.shape)
-        self.inputs = {'Input': input}
-        self.outputs = {'Out': np.array(self.shape)}
-
-    def config(self):
-        self.shape = [2, 3]
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class case1(TestShapeOp):
-    def config(self):
-        self.shape = [2]
-
-
-class case2(TestShapeOp):
-    def config(self):
-        self.shape = [1, 2, 3]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shard_index_op.py b/python/paddle/fluid/tests/unittests/test_shard_index_op.py
deleted file mode 100644
index fd3c0a5458ab8cc675b4de43516164b6386a4882..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_shard_index_op.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-from op_test import OpTest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-from paddle.fluid.framework import Program, program_guard
-
-
-def common_setup(self, index_num, nshards, shard_id, ignore_value):
-    self.op_type = 'shard_index'
-    x_lod = [[i for i in range(10)]]
-    N = sum(x_lod[0])
-    x = [np.random.randint(0, index_num - 1) for i in range(N)]
-    x = np.array(x).astype('int32').reshape([N, 1])
-
-    shard_size = index_num // nshards
-    out = np.zeros(shape=x.shape).astype('int32')
-    for i in range(N):
-        if x[i] // shard_size == shard_id:
-            out[i] = x[i] % shard_size
-        else:
-            out[i] = ignore_value
-
-    self.inputs = {'X': (x, x_lod)}
-    self.attrs = {
-        'index_num': index_num,
-        'nshards': nshards,
-        'shard_id': shard_id,
-        'ignore_value': ignore_value
-    }
-    self.outputs = {'Out': (out, x_lod)}
-
-
-class TestShardIndexShardId0Op(OpTest):
-    def setUp(self):
-        common_setup(self, 20, 2, 0, -1)
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestShardIndexShardId1Op(OpTest):
-    def setUp(self):
-        common_setup(self, 20, 2, 1, -1)
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestShardIndexIgnoreValueOp(OpTest):
-    def setUp(self):
-        common_setup(self, 20, 2, 0, -2)
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestShardIndexNotEvenlyDividedOp(OpTest):
-    def setUp(self):
-        common_setup(self, 15, 2, 1, -1)
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
deleted file mode 100644
index 97f79f9421d498723da4c7992551f1210d3f6003..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ /dev/null
@@ -1,108 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-from paddle.fluid.executor import Executor
-import paddle.fluid.layers as layers
-from paddle.fluid.backward import append_backward
-from paddle.fluid.framework import default_main_program, switch_main_program
-from paddle.fluid.framework import Program
-import numpy as np
-
-from paddle.fluid.layers.control_flow import shrink_memory
-from paddle.fluid.layers.control_flow import lod_rank_table
-
-
-class TestShrinkRNNMemoryBase(unittest.TestCase):
-    def setUp(self):
-        self.main_program = Program()
-        switch_main_program(self.main_program)
-        x = layers.data('x', shape=[100], dtype='float32')
-        x.stop_gradient = False
-        rank_table_tensor = layers.data(
-            'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
-        table = lod_rank_table(x=rank_table_tensor)
-        i = layers.zeros(dtype='int64', shape=[1])
-        self.mem1 = shrink_memory(x=x, i=i, table=table)
-        i = layers.increment(x=i)
-        i.stop_gradient = True
-        self.mem2 = shrink_memory(x=self.mem1, i=i, table=table)
-        i = layers.increment(x=i)
-        i.stop_gradient = True
-        self.mem3 = shrink_memory(x=self.mem2, i=i, table=table)
-        mem3_mean = layers.mean(self.mem3)
-        append_backward(loss=mem3_mean)
-        self.x_grad = self.main_program.global_block().var('x@GRAD')
-
-    def sum_lodtensor(self, tensor):
-        sum_res = 0.0
-        for i in range(np.product(tensor.shape())):
-            sum_res += tensor._get_float_element(i)
-        return sum_res
-
-
-class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase):
-    def test_refer_lod(self):
-        cpu = core.CPUPlace()
-        x_tensor = core.LoDTensor()
-        x_tensor.set_recursive_sequence_lengths([[2, 3, 1]])
-        tensor_np = np.random.random(size=(6, 100)).astype('float32')
-        x_tensor.set(tensor_np, cpu)
-
-        rank_table_tensor = core.LoDTensor()
-        rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
-        rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
-                              cpu)
-
-        exe = Executor(cpu)
-        outs = exe.run(
-            feed={'x': x_tensor,
-                  'rank_table_tensor': rank_table_tensor},
-            fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad],
-            return_numpy=False)
-        self.assertTrue(np.allclose(tensor_np[0:6], outs[0]))
-        self.assertTrue(np.allclose(tensor_np[0:5], outs[1]))
-        self.assertTrue(np.allclose(tensor_np[0:2], outs[2]))
-        self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01)
-
-
-class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase):
-    def test_no_lod(self):
-        cpu = core.CPUPlace()
-        x_tensor = core.LoDTensor()
-        tensor_np = np.random.random(size=(3, 100)).astype('float32')
-        x_tensor.set(tensor_np, cpu)
-
-        rank_table_tensor = core.LoDTensor()
-        rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
-        rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
-                              cpu)
-
-        exe = Executor(cpu)
-        outs = exe.run(
-            feed={'x': x_tensor,
-                  'rank_table_tensor': rank_table_tensor},
-            fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad],
-            return_numpy=False)
-        self.assertTrue(np.allclose(tensor_np[0:3], outs[0]))
-        self.assertTrue(np.allclose(tensor_np[0:2], outs[1]))
-        self.assertTrue(np.allclose(tensor_np[0:1], outs[2]))
-        self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
deleted file mode 100644
index aeaae9058187be1c9191bcbec21237c69fefe6e6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import math
-from op_test import OpTest
-import paddle.fluid.core as core
-
-
-class TestShuffleChannelOp(OpTest):
-    def setUp(self):
-        self.op_type = "shuffle_channel"
-        self.batch_size = 10
-        self.input_channels = 16
-        self.layer_h = 4
-        self.layer_w = 4
-        self.group = 4
-        self.x = np.random.random(
-            (self.batch_size, self.input_channels, self.layer_h,
-             self.layer_w)).astype('float32')
-        self.inputs = {'X': self.x}
-        self.attrs = {'group': self.group}
-        n, c, h, w = self.x.shape
-        input_reshaped = np.reshape(self.x,
-                                    (-1, self.group, c // self.group, h, w))
-        input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4))
-        self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
deleted file mode 100644
index ec10b634091fc521062457b780b0c4cafcbacec0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ /dev/null
@@ -1,246 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-from op_test import OpTest
-from scipy.special import logit
-from scipy.special import expit
-import paddle.fluid.core as core
-import unittest
-
-
-class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
-    """Test sigmoid_cross_entropy_with_logit_op with binary label
-    """
-
-    def setUp(self):
-        self.op_type = "sigmoid_cross_entropy_with_logits"
-        batch_size = 64
-        num_classes = 20
-        self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float32")),
-            'Label': np.random.randint(0, 2, (batch_size, num_classes))
-            .astype("float32")
-        }
-
-        # Fw Pass is implemented as elementwise sigmoid followed by
-        # elementwise logistic loss
-        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
-        sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Label'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
-        self.outputs = {'Out': -term1 - term2}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
-    """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
-    """
-
-    def setUp(self):
-        self.op_type = "sigmoid_cross_entropy_with_logits"
-        batch_size = 64
-        num_classes = 20
-        ignore_index = -1
-        self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float32")),
-            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
-            .astype("float32")
-        }
-        self.attrs = {'ignore_index': ignore_index, }
-        # Fw Pass is implemented as elementwise sigmoid followed by
-        # elementwise logistic loss
-        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
-        sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Label'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
-        out = -term1 - term2
-        out[np.where(self.inputs['Label'] == ignore_index)] = 0
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSigmoidCrossEntropyWithLogitsOp3(OpTest):
-    """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
-    """
-
-    def setUp(self):
-        self.op_type = "sigmoid_cross_entropy_with_logits"
-        batch_size = 64
-        num_classes = 20
-        self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float32")),
-            'Label': np.random.uniform(0, 1, (batch_size, num_classes))
-            .astype("float32")
-        }
-
-        # Fw Pass is implemented as elementwise sigmoid followed by
-        # elementwise logistic loss
-        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
-        sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Label'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
-        self.outputs = {'Out': -term1 - term2}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSigmoidCrossEntropyWithNorm(OpTest):
-    def setUp(self):
-        self.op_type = "sigmoid_cross_entropy_with_logits"
-        batch_size = 64
-        num_classes = 20
-        ignore_index = -1
-        self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float32")),
-            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
-            .astype("float32")
-        }
-        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
-        sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Label'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
-        out = -term1 - term2
-        out[np.where(self.inputs['Label'] == ignore_index)] = 0
-        if self.attrs['normalize']:
-            out = out / float(
-                np.where(self.inputs['Label'] != ignore_index)[0].size)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSigmoidCrossEntropyWithLogitsOp5(OpTest):
-    """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
-    """
-
-    def setUp(self):
-        self.op_type = "sigmoid_cross_entropy_with_logits"
-        batch_size = [10, 10]
-        num_classes = 20
-        self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                .astype("float32")),
-            'Label': np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-            .astype("float32")
-        }
-
-        # Fw Pass is implemented as elementwise sigmoid followed by
-        # elementwise logistic loss
-        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
-        sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Label'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
-        self.outputs = {'Out': -term1 - term2}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSigmoidCrossEntropyWithNorm2(OpTest):
-    def setUp(self):
-        self.op_type = "sigmoid_cross_entropy_with_logits"
-        batch_size = [10, 10]
-        num_classes = 20
-        ignore_index = -1
-        self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                .astype("float32")),
-            'Label': np.random.randint(-1, 2, tuple(batch_size + [num_classes]))
-            .astype("float32")
-        }
-        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
-        sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Label'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
-        out = -term1 - term2
-        out[np.where(self.inputs['Label'] == ignore_index)] = 0
-        if self.attrs['normalize']:
-            out = out / float(
-                np.where(self.inputs['Label'] != ignore_index)[0].size)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestSigmoidCrossEntropyWithLogitsOp6(OpTest):
-    """Test sigmoid_cross_entropy_with_logit_op with binary label
-    """
-
-    def setUp(self):
-        self.op_type = "sigmoid_cross_entropy_with_logits"
-        batch_size = [10, 10]
-        num_classes = 20
-        self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                .astype("float32")),
-            'Label': np.random.randint(0, 2, tuple(batch_size + [num_classes]))
-            .astype("float32")
-        }
-
-        # Fw Pass is implemented as elementwise sigmoid followed by
-        # elementwise logistic loss
-        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
-        sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Label'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
-        self.outputs = {'Out': -term1 - term2}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py
deleted file mode 100644
index 0e846521d0a886d4aa8425e3c01b6685571f105c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import math
-import copy
-from op_test import OpTest
-from paddle.fluid import core
-
-
-def sigmoid_focal_loss_forward(x_data, label_data, fg_num_data, gamma, alpha,
-                               num_classes):
-    x_data_t = copy.deepcopy(x_data)
-    out_data = copy.deepcopy(x_data)
-    x_width = len(x_data)
-    x_height = len(x_data[0, :])
-    x_data_t = x_data_t.flatten()
-    out_data = out_data.flatten()
-    for idx in range(len(x_data_t)):
-        x = x_data_t[idx]
-        a = int(idx / num_classes)
-        d = int(idx % num_classes)
-        label = label_data[a]
-        c_pos = float((int(label) == int(d + 1)))
-        c_neg = float(((int(label) != -1) & (int(label) != (d + 1))))
-        fg_num = max(fg_num_data, 1)
-        z_neg = (1.0 - alpha) / fg_num
-        z_pos = alpha / fg_num
-
-        p = 1. / (1. + math.exp(-x))
-        FLT_MIN = 1.175494351e-38
-        term_pos = math.pow((1. - p), gamma) * math.log(max(FLT_MIN, p))
-        term_neg = math.pow(p, gamma) * (
-            -1. * x * (x >= 0) - math.log(1. + math.exp(x - 2. * x * (x >= 0))))
-        out_data[idx] = 0.0
-        out_data[idx] += -c_pos * term_pos * z_pos
-        out_data[idx] += -c_neg * term_neg * z_neg
-
-    out_data = out_data.reshape(x_width, x_height)
-    return out_data
-
-
-class TestSigmoidFocalLossOp1(OpTest):
-    def set_argument(self):
-        self.num_anchors = 10
-        self.num_classes = 10
-        self.gamma = 2.0
-        self.alpha = 0.25
-
-    def setUp(self):
-        self.set_argument()
-
-        dims = (self.num_anchors, self.num_classes)
-        X = np.random.standard_normal(dims).astype("float32")
-        L = np.random.randint(0, self.num_classes + 1,
-                              (dims[0], 1)).astype("int32")
-        F = np.zeros(1)
-        F[0] = len(np.where(L > 0)[0])
-        F = F.astype("int32")
-
-        self.op_type = "sigmoid_focal_loss"
-        self.inputs = {
-            'X': X,
-            'Label': L,
-            'FgNum': F,
-        }
-        self.attrs = {
-            'gamma': self.gamma,
-            'alpha': self.alpha,
-        }
-        loss = sigmoid_focal_loss_forward(
-            self.inputs['X'], self.inputs['Label'], self.inputs['FgNum'],
-            self.gamma, self.alpha, self.num_classes)
-        self.outputs = {'Out': loss.astype('float32')}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSigmoidFocalLossOp2(TestSigmoidFocalLossOp1):
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=2e-3)
-
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', max_relative_error=0.002)
-
-
-class TestSigmoidFocalLossOp3(TestSigmoidFocalLossOp1):
-    def set_argument(self):
-        self.num_anchors = 200
-        self.num_classes = 10
-        self.gamma = 1.0
-        self.alpha = 0.5
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSigmoidFocalLossOp4(TestSigmoidFocalLossOp3):
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=2e-3)
-
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', max_relative_error=0.002)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py
deleted file mode 100644
index 85a9d9cae47c2b0942da0e0d962d4512af1566c0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sign_op.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSignOp(OpTest):
-    def setUp(self):
-        self.op_type = "sign"
-        self.inputs = {
-            'X': np.random.uniform(-10, 10, (10, 10)).astype("float32")
-        }
-        self.outputs = {'Out': np.sign(self.inputs['X'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
deleted file mode 100755
index b3833f05f1aa3aac7b5bcc5b6fdc138870cc8844..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
+++ /dev/null
@@ -1,217 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-
-
-class TestSimilarityFocusOp(OpTest):
-    def setUp(self):
-        self.op_type = "similarity_focus"
-        batch_size = 2
-        x_dim, y_dim, z_dim = 3, 2, 2
-        self.inputs = {
-            'X': np.array([[[[0.8, 0.1], [0.4, 0.5]], [[0.9, 0.7], [0.9, 0.9]],
-                            [[0.8, 0.9], [0.1, 0.2]]],
-                           [[[0.2, 0.5], [0.3, 0.4]], [[0.9, 0.7], [0.8, 0.4]],
-                            [[0.0, 0.2], [0.4, 0.7]]]]),
-        }
-        self.attrs = {
-            'axis': 1,
-            'indexes': [0],
-        }
-
-        output = None
-        for batch in range(batch_size):
-            res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
-            for index in self.attrs['indexes']:
-                channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy(
-                )
-                tag1 = [0 for i in range(y_dim)]
-                tag2 = [0 for i in range(z_dim)]
-                cnt = 0
-                for i in range(channel.size):
-                    index = channel.argmax()
-                    idx1 = index // z_dim
-                    idx2 = index % z_dim
-                    if tag1[idx1] + tag2[idx2] == 0:
-                        tag1[idx1] = 1
-                        tag2[idx2] = 1
-                        res[index] = 1
-                        cnt += 1
-                        if cnt == min(y_dim, z_dim):
-                            break
-                    channel[index] = -1
-            res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0)
-            res = res.reshape(1, x_dim, y_dim, z_dim)
-            if output is not None:
-                output = np.concatenate((output, res), axis=0)
-            else:
-                output = res
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSimilarityFocusOp_axis1(OpTest):
-    def setUp(self):
-        self.op_type = "similarity_focus"
-        batch_size = 3
-        x_dim, y_dim, z_dim = 4, 5, 6
-        self.inputs = {
-            'X': np.random.random(
-                (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
-        }
-        self.attrs = {
-            'axis': 1,
-            'indexes': [0, 3],
-        }
-
-        output = None
-        for batch in range(batch_size):
-            res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
-            for index in self.attrs['indexes']:
-                channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy(
-                )
-                tag1 = [0 for i in range(y_dim)]
-                tag2 = [0 for i in range(z_dim)]
-                cnt = 0
-                for i in range(channel.size):
-                    index = channel.argmax()
-                    idx1 = index // z_dim
-                    idx2 = index % z_dim
-                    if tag1[idx1] + tag2[idx2] == 0:
-                        tag1[idx1] = 1
-                        tag2[idx2] = 1
-                        res[index] = 1
-                        cnt += 1
-                        if cnt == min(y_dim, z_dim):
-                            break
-                    channel[index] = -1
-            res = res.reshape(1, y_dim, z_dim)
-            res = res.repeat([x_dim], axis=0)
-            res = res.reshape(1, x_dim, y_dim, z_dim)
-            if output is not None:
-                output = np.concatenate((output, res), axis=0)
-            else:
-                output = res
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSimilarityFocusOp_axis2(OpTest):
-    def setUp(self):
-        self.op_type = "similarity_focus"
-        batch_size = 6
-        x_dim, y_dim, z_dim = 7, 8, 9
-        self.inputs = {
-            'X': np.random.random(
-                (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
-        }
-        self.attrs = {
-            'axis': 2,
-            'indexes': [0, 3, 5],
-        }
-
-        output = None
-        for batch in range(batch_size):
-            res = np.zeros((x_dim, 1, z_dim)).astype("float32").reshape(-1)
-            for index in self.attrs['indexes']:
-                channel = self.inputs['X'][batch, :, index, :].reshape(-1).copy(
-                )
-                tag1 = [0 for i in range(x_dim)]
-                tag2 = [0 for i in range(z_dim)]
-                cnt = 0
-                for i in range(channel.size):
-                    index = channel.argmax()
-                    idx1 = index // z_dim
-                    idx2 = index % z_dim
-                    if tag1[idx1] + tag2[idx2] == 0:
-                        tag1[idx1] = 1
-                        tag2[idx2] = 1
-                        res[index] = 1
-                        cnt += 1
-                        if cnt == min(x_dim, z_dim):
-                            break
-                    channel[index] = -1
-            res = res.reshape(x_dim, 1, z_dim)
-            res = res.repeat([y_dim], axis=1)
-            res = res.reshape(1, x_dim, y_dim, z_dim)
-            if output is not None:
-                output = np.concatenate((output, res), axis=0)
-            else:
-                output = res
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSimilarityFocusOp_axis3(OpTest):
-    def setUp(self):
-        self.op_type = "similarity_focus"
-        batch_size = 64
-        x_dim, y_dim, z_dim = 48, 48, 13
-        self.inputs = {
-            'X': np.random.random(
-                (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
-        }
-        self.attrs = {
-            'axis': 3,
-            'indexes': [0, 2, 7, 9],
-        }
-
-        output = None
-        for batch in range(batch_size):
-            res = np.zeros((x_dim, y_dim, 1)).astype("float32").reshape(-1)
-            for index in self.attrs['indexes']:
-                channel = self.inputs['X'][batch, :, :, index].reshape(-1).copy(
-                )
-                tag1 = [0 for i in range(x_dim)]
-                tag2 = [0 for i in range(y_dim)]
-                cnt = 0
-                for i in range(channel.size):
-                    index = channel.argmax()
-                    idx1 = index // y_dim
-                    idx2 = index % y_dim
-                    if tag1[idx1] + tag2[idx2] == 0:
-                        tag1[idx1] = 1
-                        tag2[idx2] = 1
-                        res[index] = 1
-                        cnt += 1
-                        if cnt == min(x_dim, y_dim):
-                            break
-                    channel[index] = -1
-            res = res.reshape(x_dim, y_dim, 1)
-            res = res.repeat([z_dim], axis=2)
-            res = res.reshape(1, x_dim, y_dim, z_dim)
-            if output is not None:
-                output = np.concatenate((output, res), axis=0)
-            else:
-                output = res
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_size_op.py b/python/paddle/fluid/tests/unittests/test_size_op.py
deleted file mode 100644
index aec63caa005f6ac468f0cbd2369bb7ec9884f414..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_size_op.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSizeOp(OpTest):
-    def setUp(self):
-        self.op_type = "size"
-        self.shape = []
-        self.config()
-        input = np.zeros(self.shape, dtype='bool')
-        self.inputs = {'Input': input}
-        self.outputs = {'Out': np.array([np.size(input)], dtype='int64')}
-
-    def config(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestRank1Tensor(TestSizeOp):
-    def config(self):
-        self.shape = [2]
-
-
-class TestRank2Tensor(TestSizeOp):
-    def config(self):
-        self.shape = [2, 3]
-
-
-class TestRank3Tensor(TestSizeOp):
-    def config(self):
-        self.shape = [2, 3, 100]
-
-
-class TestLargeTensor(TestSizeOp):
-    def config(self):
-        self.shape = [2**10]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
deleted file mode 100644
index 4dc49637283333c168e87d534e265fafb284dda6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ /dev/null
@@ -1,532 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from op_test import OpTest
-import paddle.fluid as fluid
-
-
-# Situation 1: starts(list, no tensor), ends(list, no tensor)
-# 1.1 without attr(decrease)
-class TestSliceOp(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-        self.inputs = {'Input': self.input}
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            'ends': self.ends,
-            'infer_flags': self.infer_flags
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [3, 3, 4]
-        self.axes = [0, 1, 2]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[1:3, 0:3, 2:4, :]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-class TestCase1(TestSliceOp):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [-3, 0, 2]
-        self.ends = [3, 100, -1]
-        self.axes = [0, 1, 2]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[-3:3, 0:100, 2:-1, :]
-
-
-class TestCase2(TestSliceOp):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [-3, 0, 2]
-        self.ends = [3, 100, -1]
-        self.axes = [0, 1, 3]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[-3:3, 0:100, :, 2:-1]
-
-
-# 1.2 with attr(decrease)
-class TestSliceOp_decs_dim(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-        self.inputs = {'Input': self.input}
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            'ends': self.ends,
-            'infer_flags': self.infer_flags,
-            'decrease_axis': self.decrease_axis,
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 3, 4]
-        self.axes = [0, 1, 2]
-        self.decrease_axis = [0]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[1, 0:3, 2:4, :]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 1, 4]
-        self.axes = [0, 1, 2]
-        self.decrease_axis = [0, 1]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[1, 0, 2:4, :]
-
-
-class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [-1, 0, 2]
-        self.ends = [1000000, 1, 4]
-        self.axes = [0, 1, 2]
-        self.decrease_axis = [0, 1]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[-1, 0, 2:4, :]
-
-
-class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 7]).astype("float32")
-        self.starts = [0, 1, 2, 3]
-        self.ends = [1, 2, 3, 4]
-        self.axes = [0, 1, 2, 3]
-        self.decrease_axis = [0, 1, 2, 3]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[0, 1, 2, 3:4]
-
-
-class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [-1]
-        self.ends = [1000000]
-        self.axes = [3]
-        self.decrease_axis = [3]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[:, :, :, -1]
-
-
-class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [0, 1, 2, 3]
-        self.ends = [1, 2, 3, 4]
-        self.axes = [0, 1, 2, 3]
-        self.decrease_axis = [0, 1, 2, 3]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[0, 1, 2, 3:4]
-
-
-# Situation 2: starts(list, have tensor), ends(list, no tensor)
-# without attr(decrease)
-class TestSliceOp_starts_ListTensor(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-
-        starts_tensor = []
-        for index, ele in enumerate(self.starts):
-            starts_tensor.append(("x" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-
-        self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor}
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts_infer,
-            'ends': self.ends,
-            'infer_flags': self.infer_flags
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [3, 3, 4]
-        self.axes = [0, 1, 2]
-        self.infer_flags = [-1, 1, -1]
-        self.out = self.input[1:3, 0:3, 2:4, :]
-
-        self.starts_infer = [-1, 0, -1]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-# Situation 2: starts(list, have tensor), ends(list, no tensor)
-#  with attr(decrease)
-class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-
-        starts_tensor = []
-        for index, ele in enumerate(self.starts):
-            starts_tensor.append(("x" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-
-        self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor}
-
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts_infer,
-            'ends': self.ends,
-            'infer_flags': self.infer_flags,
-            'decrease_axis': self.decrease_axis,
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 3, 4]
-        self.axes = [0, 1, 2]
-        self.decrease_axis = [0]
-        self.infer_flags = [1, -1, 1]
-        self.out = self.input[1, 0:3, 2:4, :]
-
-        self.starts_infer = [1, -1, 2]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-class TestSliceOp_decs_dim_5_starts_ListTensor(
-        TestSliceOp_decs_dim_starts_ListTensor):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [-1]
-        self.ends = [1000000]
-        self.axes = [3]
-        self.decrease_axis = [3]
-        self.infer_flags = [-1]
-        self.out = self.input[:, :, :, -1]
-
-        self.starts_infer = [-1]
-
-
-# Situation 3: starts(tensor), ends(list, no tensor)
-# with attr(decrease)
-class TestSliceOp_decs_dim_starts_OneTensor(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-        self.inputs = {
-            'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32")
-        }
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            #'starts': self.starts,
-            'ends': self.ends,
-            'infer_flags': self.infer_flags,
-            'decrease_axis': self.decrease_axis,
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 3, 4]
-        self.axes = [0, 1, 2]
-        self.decrease_axis = [0]
-        self.infer_flags = [-1, -1, -1]
-        self.out = self.input[1, 0:3, 2:4, :]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-# Situation 4: starts(tensor), ends(tensor)
-#  without attr(decrease)
-class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-
-        self.inputs = {
-            'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
-            "EndsTensor": np.array(
-                self.ends, dtype="int32")
-        }
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            #'starts': self.starts,
-            #'ends': self.ends_infer,
-            'infer_flags': self.infer_flags
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [3, 3, 4]
-        self.axes = [0, 1, 2]
-        self.infer_flags = [-1, -1, -1]
-        self.out = self.input[1:3, 0:3, 2:4, :]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-# Situation 5: starts(tensor), ends(tensor)
-#  with attr(decrease)
-class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-        self.inputs = {
-            'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
-            "EndsTensor": np.array(
-                self.ends, dtype="int32")
-        }
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            #'starts': self.starts,
-            #'ends': self.ends,
-            'infer_flags': self.infer_flags,
-            'decrease_axis': self.decrease_axis,
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 1, 4]
-        self.axes = [0, 1, 2]
-        self.decrease_axis = [0, 1]
-        self.infer_flags = [-1, -1, -1]
-        self.out = self.input[1, 0, 2:4, :]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-# Situation 6: starts(tensor), ends(list, have tensor)
-# without attr(decrease)
-class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-
-        ends_tensor = []
-        for index, ele in enumerate(self.ends):
-            ends_tensor.append(("y" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-
-        self.inputs = {
-            'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
-            'EndsTensorList': ends_tensor
-        }
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            #'starts': self.starts,
-            'ends': self.ends_infer,
-            'infer_flags': self.infer_flags
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [3, 3, 4]
-        self.axes = [0, 1, 2]
-        self.infer_flags = [-1, -1, -1]
-        self.out = self.input[1:3, 0:3, 2:4, :]
-
-        self.ends_infer = [-1, 3, 4]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-# Test CUDA float16
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestFP16(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-        self.inputs = {'Input': self.input}
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            'ends': self.ends,
-            'infer_flags': self.infer_flags
-        }
-
-    def config(self):
-        self.dtype = "float16"
-        self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
-        self.starts = [-3, 0, 2]
-        self.ends = [3, 100, -1]
-        self.axes = [0, 1, 3]
-        self.out = self.input[-3:3, 0:100, :, 2:-1]
-        self.infer_flags = [1, 1, 1]
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_output_with_place(place, atol=1e-5)
-
-    def test_check_grad_normal(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['Input'], 'Out', max_relative_error=0.006)
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestFP16_2(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-        self.inputs = {'Input': self.input}
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            'ends': self.ends,
-            'infer_flags': self.infer_flags
-        }
-
-    def config(self):
-        self.dtype = "float16"
-        self.input = np.random.random([3, 4, 5]).astype(self.dtype)
-        self.starts = [0]
-        self.ends = [1]
-        self.axes = [1]
-        self.out = self.input[:, 0:1, :]
-        self.infer_flags = [1]
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_output_with_place(place, atol=1e-5)
-
-    def test_check_grad_normal(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['Input'],
-                'Out',
-                max_relative_error=0.006,
-                numeric_grad_delta=0.5)
-
-
-# Test python API
-class TestSliceAPI(OpTest):
-    def test_1(self):
-        input = np.random.random([3, 4, 5, 6]).astype("float32")
-        minus_1 = fluid.layers.fill_constant([1], "int32", -1)
-        minus_3 = fluid.layers.fill_constant([1], "int32", -3)
-        starts = fluid.layers.data(
-            name='starts', shape=[1, 3], append_batch_size=False)
-        ends = fluid.layers.data(
-            name='ends', shape=[3], append_batch_size=False)
-
-        x = fluid.layers.data(
-            name="x",
-            shape=[3, 4, 5, 6],
-            append_batch_size=False,
-            dtype="float32")
-
-        out_1 = fluid.layers.slice(
-            x, axes=[0, 1, 2], starts=[-3, 0, 2], ends=[3, 100, -1])
-        out_2 = fluid.layers.slice(
-            x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, -1])
-        out_3 = fluid.layers.slice(
-            x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, minus_1])
-        out_4 = fluid.layers.slice(x, axes=[0, 1, 2], starts=starts, ends=ends)
-
-        out_5 = x[-3:3, 0:100, 2:-1]
-        out_6 = x[minus_3:3, 0:100, :, 2:-1]
-        out_7 = x[minus_1, 0:100, :, 2:minus_1]
-
-        exe = fluid.Executor(place=fluid.CPUPlace())
-        res_1, res_2, res_3, res_4, res_5, res_6, res_7 = exe.run(
-            fluid.default_main_program(),
-            feed={
-                "x": input,
-                'starts': np.array([-3, 0, 2]).astype("int32"),
-                'ends': np.array([3, 100, -1]).astype("int32")
-            },
-            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7])
-
-        assert np.array_equal(res_1, input[-3:3, 0:100, 2:-1, :])
-        assert np.array_equal(res_2, input[-3:3, 0:100, :, 2:-1])
-        assert np.array_equal(res_3, input[-3:3, 0:100, :, 2:-1])
-        assert np.array_equal(res_4, input[-3:3, 0:100, 2:-1, :])
-        assert np.array_equal(res_5, input[-3:3, 0:100, 2:-1, :])
-        assert np.array_equal(res_6, input[-3:3, 0:100, :, 2:-1])
-        assert np.array_equal(res_7, input[-1, 0:100, :, 2:-1])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_slice_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py
deleted file mode 100644
index b16c744603534ad07f9318a5a195f1d7550b1f10..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_slice_var.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import math
-import unittest
-from paddle.fluid.transpiler.distribute_transpiler import slice_variable
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import random
-
-
-class TestSliceVar(unittest.TestCase):
-    def check_slice_output(self, shapes, expected_sizes, min_size):
-        var_list = []
-        program = fluid.Program()
-        for shape in shapes:
-            var = program.global_block().create_var(
-                name=str(random.randint(10000, 99999)),
-                persistable=True,
-                shape=shape)
-            var_list.append(var)
-        blocks = slice_variable(var_list, 10, min_size)
-        all_sizes = []
-        for s in expected_sizes:
-            for s2 in s:
-                all_sizes.append(s2)
-        for i, block_str in enumerate(blocks):
-            varname, block_id, size = block_str.split(":")
-            self.assertEqual(int(size), all_sizes[i])
-
-    def test_1k(self):
-        shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10]]
-        expected_sizes = [
-            [15], [1024],
-            [2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 784],
-            [2040, 2040, 2040, 2040],
-            [1150, 1150, 1150, 1150, 1150, 1150, 1100]
-        ]
-
-        self.check_slice_output(shapes, expected_sizes, 1024)
-
-    def test_check_output_8k(self):
-        shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10],
-                  [6, 33, 33, 33]]
-        expected_sizes = [[15], [1024], [10976, 10976], [8160], [8000],
-                          [35937, 35937, 35937, 35937, 35937, 35937]]
-
-        self.check_slice_output(shapes, expected_sizes, 8192)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
deleted file mode 100644
index 8ab6833821c75262124b3ae4200a17e457b718d5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def smooth_l1_loss_forward(val, sigma2):
-    abs_val = abs(val)
-    if abs_val < 1.0 / sigma2:
-        return 0.5 * val * val * sigma2
-    else:
-        return abs_val - 0.5 / sigma2
-
-
-class TestSmoothL1LossOp1(OpTest):
-    def setUp(self):
-        self.op_type = "smooth_l1_loss"
-        dims = (5, 10)
-        self.inputs = {
-            'X': np.random.random(dims).astype("float32"),
-            'Y': np.random.random(dims).astype("float32")
-        }
-        sigma = 3.0
-        self.attrs = {'sigma': sigma}
-        sigma2 = sigma * sigma
-        diff = self.inputs['X'] - self.inputs['Y']
-        loss = np.vectorize(smooth_l1_loss_forward)(diff, sigma2).sum(1)
-        loss = loss.reshape((dims[0], 1))
-        self.outputs = {
-            'Diff': diff.astype('float32'),
-            'Out': loss.astype('float32')
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.02)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.03, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.03, no_grad_set=set('Y'))
-
-
-class TestSmoothL1LossOp2(OpTest):
-    def setUp(self):
-        self.op_type = "smooth_l1_loss"
-        dims = (5, 10)
-        self.inputs = {
-            'X': np.random.random(dims).astype("float32"),
-            'Y': np.random.random(dims).astype("float32"),
-            'InsideWeight': np.random.random(dims).astype("float32"),
-            'OutsideWeight': np.random.random(dims).astype("float32")
-        }
-        sigma = 3.0
-        self.attrs = {'sigma': sigma}
-        sigma2 = sigma * sigma
-        diff = self.inputs['X'] - self.inputs['Y']
-        diff = diff * self.inputs['InsideWeight']
-        loss = np.vectorize(smooth_l1_loss_forward)(diff, sigma2)
-        loss = loss * self.inputs['OutsideWeight']
-        loss = loss.sum(1).reshape((dims[0], 1))
-        self.outputs = {
-            'Diff': diff.astype('float32'),
-            'Out': loss.astype('float32')
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.03)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set(['X', 'InsideWeight', 'OutsideWeight']))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set(['Y', 'InsideWeight', 'OutsideWeight']))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
deleted file mode 100644
index f6770bdd1d4309f4467a75844ce1bf544ee67b9b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ /dev/null
@@ -1,207 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid import compiler, Program, program_guard
-
-
-def stable_softmax(x):
-    """Compute the softmax of vector x in a numerically stable way."""
-    shiftx = x - np.max(x).clip(-64.)
-    exps = np.exp(shiftx)
-    return exps / np.sum(exps)
-
-
-class TestSoftmaxOp(OpTest):
-    def get_x_shape(self):
-        return [10, 10]
-
-    def get_axis(self):
-        return -1
-
-    def setUp(self):
-        self.op_type = "softmax"
-        self.use_cudnn = False
-        self.use_mkldnn = False
-        self.dtype = np.float32
-        self.init_kernel_type()
-        self.shape = self.get_x_shape()
-        self.axis = self.get_axis()
-
-        x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
-        out = np.apply_along_axis(stable_softmax, self.axis, x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-        self.attrs = {
-            'axis': self.axis,
-            'use_cudnn': self.use_cudnn,
-            'use_mkldnn': self.use_mkldnn
-        }
-
-    def init_kernel_type(self):
-        pass
-
-    def test_check_output(self):
-        if self.use_cudnn:
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            self.check_output()
-
-    def test_check_grad(self):
-        if self.use_cudnn or self.dtype == np.float16:
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_grad_with_place(
-                    place, ["X"], "Out", max_relative_error=0.01)
-        else:
-            self.check_grad(["X"], "Out", max_relative_error=0.01)
-
-
-class TestSoftmaxOpError(OpTest):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            # The input type of softmax_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            self.assertRaises(TypeError, fluid.layers.softmax, x1)
-            # The input dtype of softmax_op must be float32 or float64.
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
-            self.assertRaises(TypeError, fluid.layers.softmax, x2)
-
-
-class TestSoftmaxOp2(TestSoftmaxOp):
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-
-class TestSoftmaxOp3(TestSoftmaxOp):
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-    def get_axis(self):
-        return 0
-
-
-class TestSoftmaxOp4(TestSoftmaxOp):
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-    def get_axis(self):
-        return 1
-
-
-class TestSoftmaxOp5(TestSoftmaxOp):
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-    def get_axis(self):
-        return 2
-
-
-class TestSoftmaxOp6(TestSoftmaxOp):
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-    def get_axis(self):
-        return 3
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSoftmaxCUDNNOp(TestSoftmaxOp):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp):
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp):
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-    def get_axis(self):
-        return 3
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSoftmaxFP16Op(TestSoftmaxOp):
-    def init_kernel_type(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-    # FIXME: If the x_shape is [10, 10], gradient failed.
-    def test_check_grad(self):
-        pass
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSoftmaxFP16Op2(TestSoftmaxOp):
-    def init_kernel_type(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
deleted file mode 100644
index d37731146d9c431bb6a0c333149ac62a0c4efd3b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ /dev/null
@@ -1,412 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-from op_test import OpTest
-from test_softmax_op import stable_softmax
-
-
-def cross_entropy(softmax, label, soft_label, axis, ignore_index=-1):
-    if soft_label:
-        return (-label * np.log(softmax)).sum(axis=axis, keepdims=True)
-
-    shape = softmax.shape
-    axis %= len(shape)
-    n = int(np.prod(shape[:axis]))
-    axis_dim = shape[axis]
-    remain = int(np.prod(shape[axis + 1:]))
-    softmax_reshape = softmax.reshape((n, axis_dim, remain))
-    label_reshape = label.reshape((n, 1, remain))
-    result = np.zeros_like(label_reshape, dtype=softmax.dtype)
-    for i in range(n):
-        for j in range(remain):
-            lbl = label_reshape[i, 0, j]
-            if lbl != ignore_index:
-                result[i, 0, j] -= np.log(softmax_reshape[i, lbl, j])
-    return result.reshape(label.shape)
-
-
-class TestSoftmaxWithCrossEntropyOp(OpTest):
-    """
-    Test softmax with cross entropy operator with discreate one-hot labels.
-    """
-
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = False
-        self.soft_label = False
-        self.dtype = np.float64
-        self.axis = -1
-        self.ignore_index = -1
-        self.shape = [41, 37]
-
-    def setUp(self):
-        self.initParams()
-
-        logits = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
-        softmax = np.apply_along_axis(stable_softmax, self.axis, logits)
-
-        if self.soft_label:
-            labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
-            labels /= np.sum(labels, axis=self.axis, keepdims=True)
-        else:
-            axis_dim = self.shape[self.axis]
-            self.shape[self.axis] = 1
-            labels = np.random.randint(0, axis_dim, self.shape, dtype="int64")
-
-        loss = cross_entropy(softmax, labels, self.soft_label, self.axis,
-                             self.ignore_index)
-
-        self.inputs = {"Logits": logits, "Label": labels}
-        self.outputs = {
-            "Softmax": softmax.astype(self.dtype),
-            "Loss": loss.astype(self.dtype)
-        }
-        self.attrs = {
-            "numeric_stable_mode": self.numeric_stable_mode,
-            "soft_label": self.soft_label,
-        }
-        if self.ignore_index >= 0:
-            self.attrs['ignore_index'] = self.ignore_index
-        if self.axis != -1:
-            self.attrs['axis'] = self.axis
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
-
-
-class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.axis = -1
-        self.ignore_index = -1
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = False
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.axis = -1
-        self.ignore_index = -1
-        self.dtype = np.float16
-
-    def setUp(self):
-        self.initParams()
-        self.op_type = "softmax_with_cross_entropy"
-
-        # NOTE: numpy float16 have very low accuracy, use float32 for numpy check.
-        logits = np.random.uniform(0.1, 1.0, self.shape).astype(np.float32)
-        softmax = np.apply_along_axis(stable_softmax, self.axis, logits)
-
-        axis_dim = self.shape[self.axis]
-        self.shape[self.axis] = 1
-        labels = np.random.randint(0, axis_dim, self.shape, dtype="int64")
-
-        loss = cross_entropy(softmax, labels, self.soft_label, self.axis)
-
-        self.inputs = {
-            "Logits": logits.astype(self.dtype).view(np.uint16),
-            "Label": labels
-        }
-        self.outputs = {
-            "Softmax": softmax.astype(self.dtype),
-            "Loss": loss.astype(self.dtype)
-        }
-        self.attrs = {
-            "numeric_stable_mode": self.numeric_stable_mode,
-            "soft_label": self.soft_label,
-        }
-        if self.axis != -1:
-            self.attrs['axis'] = self.axis
-
-    def test_check_output(self):
-        self.check_output(atol=1e-2)
-
-    def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
-
-
-class TestSoftmaxWithCrossEntropyOpNoCudnnFp16(
-        TestSoftmaxWithCrossEntropyOpFp16):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.axis = -1
-        self.ignore_index = -1
-        self.dtype = np.float16
-
-    def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
-
-
-class TestSoftmaxWithCrossEntropyOp2(TestSoftmaxWithCrossEntropyOp):
-    """
-    Test softmax with cross entropy operator with soft labels.
-    """
-
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = True
-        self.dtype = np.float64
-        self.axis = -1
-        self.ignore_index = -1
-        self.shape = [41, 37]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss")
-
-
-class TestSoftmaxWithCrossEntropyOp3(TestSoftmaxWithCrossEntropyOp):
-    """
-    Test softmax with cross entropy operator with ignore_index.
-    """
-
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = False
-        self.soft_label = False
-        self.shape = [41, 37]
-        self.ignore_index = 5
-        self.axis = -1
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.ignore_index = 4
-        self.axis = -1
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOpAxis1(TestSoftmaxWithCrossEntropyOp):
-    """
-    Test softmax with cross entropy operator with discreate one-hot labels.
-    Given axis != -1
-    """
-
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.dtype = np.float64
-        self.axis = 0
-        self.ignore_index = -1
-        self.shape = [3, 5, 7, 11]
-
-
-class TestSoftmaxWithCrossEntropyOpAxis2(TestSoftmaxWithCrossEntropyOp):
-    """
-    Test softmax with cross entropy operator with discreate one-hot labels.
-    Given axis != -1
-    """
-
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.dtype = np.float64
-        self.axis = 1
-        self.ignore_index = -1
-        self.shape = [3, 5, 7, 11]
-
-
-class TestSoftmaxWithCrossEntropyOpAxis3(TestSoftmaxWithCrossEntropyOp):
-    """
-    Test softmax with cross entropy operator with discreate one-hot labels.
-    Given axis != -1
-    """
-
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.dtype = np.float64
-        self.axis = 2
-        self.ignore_index = -1
-        self.shape = [3, 5, 7, 11]
-
-
-class TestSoftmaxWithCrossEntropyOpAxis4(TestSoftmaxWithCrossEntropyOp):
-    """
-    Test softmax with cross entropy operator with discreate one-hot labels.
-    Given axis != -1
-    """
-
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.dtype = np.float64
-        self.axis = 3
-        self.ignore_index = -1
-        self.shape = [3, 5, 7, 11]
-
-
-class TestSoftmaxWithCrossEntropyOpNoCudnnFp16Axis1(
-        TestSoftmaxWithCrossEntropyOpNoCudnnFp16):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.axis = 0
-        self.ignore_index = -1
-        self.dtype = np.float16
-
-
-class TestSoftmaxWithCrossEntropyOpNoCudnnFp16Axis2(
-        TestSoftmaxWithCrossEntropyOpNoCudnnFp16):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.axis = 1
-        self.ignore_index = -1
-        self.dtype = np.float16
-
-
-class TestSoftmaxWithCrossEntropyOpNoCudnnFp16Axis3(
-        TestSoftmaxWithCrossEntropyOpNoCudnnFp16):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.axis = 2
-        self.ignore_index = -1
-        self.dtype = np.float16
-
-
-class TestSoftmaxWithCrossEntropyOpSoftLabelAxis1(
-        TestSoftmaxWithCrossEntropyOp2):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = True
-        self.shape = [3, 5, 7, 11]
-        self.axis = 0
-        self.ignore_index = -1
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOpSoftLabelAxis2(
-        TestSoftmaxWithCrossEntropyOp2):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = True
-        self.shape = [3, 5, 7, 11]
-        self.axis = 1
-        self.ignore_index = -1
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOpSoftLabelAxis3(
-        TestSoftmaxWithCrossEntropyOp2):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = True
-        self.shape = [3, 5, 7, 11]
-        self.axis = 2
-        self.ignore_index = -1
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOpSoftLabelAxis4(
-        TestSoftmaxWithCrossEntropyOp2):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = True
-        self.shape = [3, 5, 7, 11]
-        self.axis = 3
-        self.ignore_index = -1
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis1(
-        TestSoftmaxWithCrossEntropyOp3):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.ignore_index = 1
-        self.axis = 0
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis2(
-        TestSoftmaxWithCrossEntropyOp3):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.ignore_index = 0
-        self.axis = 1
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis3(
-        TestSoftmaxWithCrossEntropyOp3):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.ignore_index = 3
-        self.axis = 2
-        self.dtype = np.float64
-
-
-class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis4(
-        TestSoftmaxWithCrossEntropyOp3):
-    def initParams(self):
-        self.op_type = "softmax_with_cross_entropy"
-        self.numeric_stable_mode = True
-        self.soft_label = False
-        self.shape = [3, 5, 7, 11]
-        self.ignore_index = 3
-        self.axis = 3
-        self.dtype = np.float64
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py b/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
deleted file mode 100644
index 5fdad44f1242b9ee99040b43d7ce2cf84664eed1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import paddle.fluid as fluid
-from op_test import OpTest
-
-
-class TestSpaceToDepthOp(OpTest):
-    @staticmethod
-    def helper(in_, width, height, channel, batch, blocksize, forward, out_):
-        channel_out = channel // (blocksize * blocksize)
-        for b in range(batch):
-            for k in range(channel):
-                for j in range(height):
-                    for i in range(width):
-                        in_index = i + width * (j + height * (k + channel * b))
-                        channel2 = k % channel_out
-                        offset = k // channel_out
-                        width2 = i * blocksize + offset % blocksize
-                        height2 = j * blocksize + offset // blocksize
-                        out_index = width2 + width * blocksize * (
-                            height2 + height * blocksize *
-                            (channel2 + channel_out * b))
-                        if forward:
-                            out_[out_index] = in_[in_index]
-                        else:
-                            out_[in_index] = in_[out_index]
-
-    def setUp(self):
-        self.init_data()
-
-        self.op_type = "space_to_depth"
-        self.inputs = {"X": self.x}
-        self.helper(self.x_1d, self.x.shape[3], self.x.shape[2],
-                    self.x.shape[1], self.x.shape[0], self.blocksize,
-                    self.forward, self.out_1d)
-        self.out = np.reshape(self.out_1d, self.infered_shape)
-        self.attrs = {"blocksize": self.blocksize}
-        self.outputs = {"Out": self.out}
-
-    def init_data(self):
-        self.ori_shape = (32, 12, 6, 6)
-        self.infered_shape = (32, 48, 3, 3)
-        self.one_d_len = 32 * 48 * 3 * 3
-
-        self.blocksize = 2
-        self.x = np.random.random(self.ori_shape).astype('float32')
-        self.x_1d = np.reshape(self.x, self.one_d_len)
-        self.out = np.zeros(self.infered_shape).astype('float32')
-        self.out_1d = np.reshape(self.out, self.one_d_len)
-        self.forward = 1
-
-    def test_check_output(self):
-        place = fluid.core.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.core.CPUPlace()
-        self.check_output_with_place(place, 1e-5, None, False)
-
-    def test_check_grad(self):
-        place = fluid.core.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.core.CPUPlace()
-        self.check_grad_with_place(place, ['X'], 'Out')
-
-
-class TestSpaceToDepthOpBasic(TestSpaceToDepthOp):
-    def init_data(self):
-        self.ori_shape = (32, 8, 6, 6)
-        self.infered_shape = (32, 32, 3, 3)
-        self.one_d_len = 32 * 32 * 3 * 3
-
-        self.blocksize = 2
-        self.x = np.random.random(self.ori_shape).astype('float32')
-        self.x_1d = np.reshape(self.x, self.one_d_len)
-        self.out = np.zeros(self.infered_shape).astype('float32')
-        self.out_1d = np.reshape(self.out, self.one_d_len)
-        self.forward = 1
-
-
-class TestSpaceToDepthOpDoubleBasic(TestSpaceToDepthOp):
-    def init_data(self):
-        self.ori_shape = (32, 8, 6, 6)
-        self.infered_shape = (32, 32, 3, 3)
-        self.one_d_len = 32 * 32 * 3 * 3
-
-        self.blocksize = 2
-        self.x = np.random.random(self.ori_shape).astype('float64')
-        self.x_1d = np.reshape(self.x, self.one_d_len)
-        self.out = np.zeros(self.infered_shape).astype('float64')
-        self.out_1d = np.reshape(self.out, self.one_d_len)
-        self.forward = 1
-
-
-class TestSpaceToDepthOpWithStride3(TestSpaceToDepthOp):
-    def init_data(self):
-        self.ori_shape = (32, 9, 6, 6)
-        self.infered_shape = (32, 81, 2, 2)
-        self.one_d_len = 32 * 81 * 2 * 2
-
-        self.blocksize = 3
-        self.x = np.random.random(self.ori_shape).astype('float32')
-        self.x_1d = np.reshape(self.x, self.one_d_len)
-        self.out = np.zeros(self.infered_shape).astype('float32')
-        self.out_1d = np.reshape(self.out, self.one_d_len)
-        self.forward = 1
-
-
-class TestSpaceToDepthOpWithNotSquare(TestSpaceToDepthOp):
-    def init_data(self):
-        self.ori_shape = (32, 9, 9, 6)
-        self.infered_shape = (32, 81, 3, 2)
-        self.one_d_len = 32 * 81 * 3 * 2
-
-        self.blocksize = 3
-        self.x = np.random.random(self.ori_shape).astype('float32')
-        self.x_1d = np.reshape(self.x, self.one_d_len)
-        self.out = np.zeros(self.infered_shape).astype('float32')
-        self.out_1d = np.reshape(self.out, self.one_d_len)
-        self.forward = 1
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py
deleted file mode 100644
index e4e431bcce571798893ccc96c74fd9972b657f3e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py
+++ /dev/null
@@ -1,122 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-from paddle.fluid import core
-
-
-def spectral_norm(weight, u, v, dim, power_iters, eps):
-    shape = weight.shape
-    weight_mat = weight.copy()
-    h = shape[dim]
-    w = np.prod(shape) // h
-    if dim != 0:
-        perm = [dim] + [d for d in range(len(shape)) if d != dim]
-        weight_mat = weight_mat.transpose(perm)
-    weight_mat = weight_mat.reshape((h, w))
-
-    u = u.reshape((h, 1))
-    v = v.reshape((w, 1))
-    for i in range(power_iters):
-        v = np.matmul(weight_mat.T, u)
-        v_norm = np.sqrt((v * v).sum())
-        v = v / (v_norm + eps)
-        u = np.matmul(weight_mat, v)
-        u_norm = np.sqrt((u * u).sum())
-        u = u / (u_norm + eps)
-
-    sigma = (u * np.matmul(weight_mat, v)).sum()
-    return weight / sigma
-
-
-class TestSpectralNormOpNoGrad(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = 'spectral_norm'
-        weight = np.random.random(self.weight_shape).astype('float32')
-        u = np.random.normal(0., 1., self.u_shape).astype('float32')
-        v = np.random.normal(0., 1., self.v_shape).astype('float32')
-
-        self.attrs = {
-            "dim": self.dim,
-            "power_iters": self.power_iters,
-            "eps": self.eps,
-        }
-
-        self.inputs = {
-            "Weight": weight,
-            "U": u,
-            "V": v,
-        }
-
-        output = spectral_norm(weight, u, v, self.dim, self.power_iters,
-                               self.eps)
-        self.outputs = {"Out": output}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def initTestCase(self):
-        self.weight_shape = (2, 3)
-        self.u_shape = (2, )
-        self.v_shape = (3, )
-        self.dim = 0
-        self.power_iters = 5
-        self.eps = 1e-12
-
-
-class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad):
-    def initTestCase(self):
-        self.weight_shape = (2, 3, 3, 3)
-        self.u_shape = (3, )
-        self.v_shape = (18, )
-        self.dim = 1
-        self.power_iters = 10
-        self.eps = 1e-12
-
-
-class TestSpectralNormOp(TestSpectralNormOpNoGrad):
-    def test_check_grad_ignore_uv(self):
-        self.check_grad(
-            ['Weight'],
-            'Out',
-            no_grad_set=set(["U", "V"]),
-            max_relative_error=0.1)
-
-    def initTestCase(self):
-        self.weight_shape = (2, 3)
-        self.u_shape = (2, )
-        self.v_shape = (3, )
-        self.dim = 0
-        self.power_iters = 0
-        self.eps = 1e-12
-
-
-class TestSpectralNormOp2(TestSpectralNormOp):
-    def initTestCase(self):
-        self.weight_shape = (2, 3, 3, 3)
-        self.u_shape = (3, )
-        self.v_shape = (18, )
-        self.dim = 1
-        self.power_iters = 0
-        self.eps = 1e-12
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
deleted file mode 100644
index f407eb1d8b75cc3c29fc798c19d4284881dcdd49..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ /dev/null
@@ -1,225 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import numpy as np
-import paddle.fluid.layers as layers
-from paddle.fluid.framework import Program, program_guard
-from paddle.fluid.executor import Executor
-from paddle.fluid.backward import append_backward
-from paddle.fluid.layers.control_flow import split_lod_tensor
-from paddle.fluid.layers.control_flow import merge_lod_tensor
-from paddle.fluid.layer_helper import LayerHelper
-
-
-class TestCPULoDTensorArrayOps(unittest.TestCase):
-    def place(self):
-        return core.CPUPlace()
-
-    def test_split_and_merge_lod_tensor_no_lod(self):
-        tensor = core.LoDTensor()
-        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
-
-        mask_np = np.array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0]).astype('bool')
-        mask_np = np.expand_dims(mask_np, axis=1)
-
-        mask = core.LoDTensor()
-        mask.set(mask_np, self.place())
-
-        expect_true_tensor = np.array([2, 3, 4, 5]).astype('int32')
-        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
-        expect_true = core.LoDTensor()
-        expect_true.set(expect_true_tensor, self.place())
-
-        expect_false_tensor = np.array([0, 1, 6, 7, 8, 9]).astype('int32')
-        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
-
-        expect_false = core.LoDTensor()
-        expect_false.set(expect_false_tensor, self.place())
-
-        self.main(
-            tensor=tensor,
-            mask=mask,
-            expect_true=expect_true,
-            expect_false=expect_false,
-            expect_out=tensor)
-
-    def split_and_merge_lod_tensor_level_0(self, use_merge_lod_infer=False):
-        tensor = core.LoDTensor()
-        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
-
-        mask_np = np.array([0, 1, 0]).astype('bool')
-        mask_np = np.expand_dims(mask_np, axis=1)
-
-        mask = core.LoDTensor()
-        mask.set(mask_np, self.place())
-
-        expect_true_tensor = np.array([3, 4, 5, 6, 7, 8]).astype('int32')
-        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
-        expect_true = core.LoDTensor()
-        expect_true.set(expect_true_tensor, self.place())
-        expect_true.set_recursive_sequence_lengths([[6]])
-
-        expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32')
-        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
-        expect_false_lod = [[3, 1]]
-
-        expect_false = core.LoDTensor()
-        expect_false.set(expect_false_tensor, self.place())
-        expect_false.set_recursive_sequence_lengths(expect_false_lod)
-
-        self.main(
-            tensor=tensor,
-            mask=mask,
-            expect_true=expect_true,
-            expect_false=expect_false,
-            expect_out=tensor,
-            use_merge_lod_infer=use_merge_lod_infer)
-
-    def test_split_and_merge_lod_tensor_1(self):
-        self.split_and_merge_lod_tensor_level_0()
-
-    def test_split_and_merge_lod_tensor_2(self):
-        self.split_and_merge_lod_tensor_level_0(True)
-
-    def main(self,
-             tensor,
-             mask,
-             expect_true,
-             expect_false,
-             expect_out,
-             level=0,
-             use_merge_lod_infer=False):
-        place = self.place()
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name='x', shape=[1])
-            x.persistable = True
-
-            y = layers.data(name='y', shape=[1])
-            y.persistable = True
-
-            out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
-            out_true.persistable = True
-            out_false.persistable = True
-            if use_merge_lod_infer:
-                input_dict = {
-                    'X': x,
-                    'Mask': mask,
-                    'InTrue': out_true,
-                    'InFalse': out_false,
-                    'level': level
-                }
-                helper = LayerHelper('merge_lod_tensor_infer')
-                out = helper.create_variable_for_type_inference(
-                    dtype=out_true.dtype)
-                helper.append_op(
-                    type='merge_lod_tensor_infer',
-                    inputs={
-                        'X': x,
-                        'Mask': y,
-                        'InTrue': out_true,
-                        'InFalse': out_false
-                    },
-                    outputs={'Out': out},
-                    attrs={'level': level})
-                out.persistable = True
-            else:
-                out = merge_lod_tensor(
-                    in_true=out_true,
-                    in_false=out_false,
-                    mask=y,
-                    x=x,
-                    level=level)
-                out.persistable = True
-
-        exe = Executor(place)
-        scope = core.Scope()
-        exe.run(program,
-                feed={'x': tensor,
-                      'y': mask},
-                scope=scope,
-                return_numpy=False)
-
-        var_true = scope.find_var(out_true.name).get_tensor()
-
-        var_false = scope.find_var(out_false.name).get_tensor()
-
-        var_out = scope.find_var(out.name).get_tensor()
-        if not use_merge_lod_infer:
-            self.check_tensor_same(var_true, expect_true)
-            self.check_tensor_same(var_false, expect_false)
-        self.check_tensor_same(var_out, expect_out)
-
-    def check_tensor_same(self, actual, expect):
-        self.assertTrue(np.allclose(np.array(actual), np.array(expect)))
-        self.assertEqual(actual.recursive_sequence_lengths(),
-                         expect.recursive_sequence_lengths())
-
-
-class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
-    def test_grad(self):
-        place = core.CPUPlace()
-        program = Program()
-        with program_guard(program):
-            x = layers.data(
-                name='x', shape=[1], dtype='float32', stop_gradient=False)
-            y = layers.data(
-                name='y', shape=[1], dtype='bool', stop_gradient=False)
-
-            level = 0
-
-            out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
-            out = merge_lod_tensor(
-                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
-            mean = layers.mean(out)
-
-            append_backward(mean)
-
-        tensor = core.LoDTensor()
-        tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
-        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
-
-        mask_np = np.array([0, 1, 0]).astype('bool')
-        mask_np = np.expand_dims(mask_np, axis=1)
-
-        mask = core.LoDTensor()
-        mask.set(mask_np, place)
-
-        exe = Executor(place)
-        scope = core.Scope()
-
-        g_vars = program.global_block().var(x.name + "@GRAD")
-        g_out = [
-            item.sum()
-            for item in map(np.array,
-                            exe.run(program,
-                                    feed={'x': tensor,
-                                          'y': mask},
-                                    fetch_list=[g_vars],
-                                    scope=scope,
-                                    return_numpy=False))
-        ]
-
-        g_out_sum = np.array(g_out).sum()
-
-        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
deleted file mode 100644
index d674dad2293921c06135b4ee528538d266cb2904..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import six
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestSplitIdsOp(OpTest):
-    def setUp(self):
-        self.op_type = "split_ids"
-        ids1 = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
-        ids2 = np.array([[6], [2], [3], [3], [5], [2], [6]]).astype('int64')
-        ids3 = np.array([[2], [2], [2], [3], [5], [5], [6]]).astype('int64')
-
-        out0 = np.array([[0], [3], [6]]).astype('int64')
-        out1 = np.array([[]]).astype('int64')
-        out2 = np.array([[2], [5]]).astype('int64')
-        self.inputs = {'Ids': [('ids1', ids1), ('ids2', ids2), ('ids3', ids3)]}
-        self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestSplitSelectedRows(unittest.TestCase):
-    def get_places(self):
-        places = [core.CPUPlace()]
-        return places
-
-    def test_check_output(self):
-        for place in self.get_places():
-            self.check_with_place(place)
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-        rows = [0, 5, 7, 4, 9]
-        height = 20
-        row_numel = 2
-
-        # initialize input variable X
-        x = scope.var('X').get_selected_rows()
-        x.set_rows(rows)
-        x.set_height(height)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
-        for i in range(len(rows)):
-            for j in range(row_numel):
-                np_array[i, j] = rows[i] + j
-        x_tensor = x.get_tensor()
-        x_tensor.set(np_array, place)
-
-        outs_name = ["out%d" % i for i in six.moves.xrange(3)]
-        outs = [
-            scope.var(var_name).get_selected_rows() for var_name in outs_name
-        ]
-
-        # expected output selected rows
-        expected_out_rows = [[0, 9], [7, 4], [5]]
-
-        op = Operator("split_ids", Ids="X", Out=outs_name)
-
-        for _ in range(3):
-            op.run(scope, place)
-
-            for i in range(len(outs)):
-                expected_rows = expected_out_rows[i]
-                self.assertEqual(outs[i].rows(), expected_rows)
-                for j in range(len(expected_rows)):
-                    row = expected_rows[j]
-                    self.assertAlmostEqual(
-                        float(row), np.array(outs[i].get_tensor())[j, 0])
-                    self.assertAlmostEqual(
-                        float(row + 1), np.array(outs[i].get_tensor())[j, 1])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
deleted file mode 100644
index 3c5dd782f85235c4a2feb5a8ca6d048a012c5e1c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSplitOp(OpTest):
-    def setUp(self):
-        self._set_op_type()
-        axis = 1
-        x = np.random.random((4, 5, 6)).astype('float32')
-        out = np.split(x, [2, 3], axis)
-        self.inputs = {'X': x}
-        self.attrs = {'axis': axis, 'sections': [2, 1, 2]}
-        self.outputs = {'Out': [('out%d' % i, out[i]) \
-            for i in range(len(out))]}
-
-    def _set_op_type(self):
-        self.op_type = "split"
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], ['out0', 'out1', 'out2'])
-
-
-class TestSplitByrefOp(OpTest):
-    def _set_op_type(self):
-        self.op_type = "split_byref"
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
deleted file mode 100644
index d8c57d964da706f12b8865195ea94329ca0f10e2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import numpy as np
-from paddle.fluid.op import Operator
-
-
-class TestSpliteSelectedRows(unittest.TestCase):
-    def get_places(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        return places
-
-    def test_check_output(self):
-        for place in self.get_places():
-            self.check_with_place(place)
-
-    def test_check_grad(self):
-        for place in self.get_places():
-            self.check_grad_with_place(place)
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-        rows = [0, 5, 7, 4, 20]
-        height = 21
-        row_numel = 2
-
-        # initialize input variable X
-        x = scope.var('X').get_selected_rows()
-        x.set_rows(rows)
-        x.set_height(height)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
-        np_array[2, 1] = 4.0
-        np_array[4, 1] = 8.0
-        x_tensor = x.get_tensor()
-        x_tensor.set(np_array, place)
-
-        height_sections = [5, 5, 5, 5, 3]
-
-        # initialize output variables [out0, out1]
-        outs_name = ["out%d" % i for i in range(len(height_sections))]
-        outs = [
-            scope.var(var_name).get_selected_rows() for var_name in outs_name
-        ]
-
-        # expected output selected rows
-        expected_out0_rows = [0, 4]
-        expected_out1_rows = [0, 2]
-        expected_out2_rows = []
-        expected_out4_rows = [0]
-
-        op = Operator(
-            "split_selected_rows",
-            X="X",
-            Out=outs_name,
-            height_sections=height_sections)
-
-        op.run(scope, place)
-
-        self.assertEqual(outs[0].rows(), expected_out0_rows)
-        self.assertEqual(outs[1].rows(), expected_out1_rows)
-        self.assertEqual(outs[2].rows(), expected_out2_rows)
-        self.assertEqual(outs[4].rows(), expected_out4_rows)
-
-        self.assertEqual(outs[0].height(), height_sections[0])
-        self.assertEqual(outs[4].height(), height_sections[4])
-
-        self.assertAlmostEqual(2.0, np.array(outs[0].get_tensor())[0, 0])
-        self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1])
-        self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1])
-
-        self.assertEqual(outs[2].numel(), 0)
-        self.assertEqual(outs[3].numel(), 0)
-
-    def check_grad_with_place(self, place):
-        scope = core.Scope()
-        height = 10
-        row_numel = 2
-
-        # attr
-        height_sections = [5, 5]
-
-        # initialize input variable X
-        out0_grad = scope.var("out0@GRAD").get_selected_rows()
-        rows0 = [0, 5]
-        out0_grad.set_rows(rows0)
-        out0_grad.set_height(height)
-        out0_grad_tensor = out0_grad.get_tensor()
-        np_array = np.ones((len(rows0), row_numel)).astype("float32")
-        out0_grad_tensor.set(np_array, place)
-
-        out1_grad = scope.var("out1@GRAD").get_selected_rows()
-        rows1 = [2, 0]
-        out1_grad.set_rows(rows1)
-        out1_grad.set_height(height)
-        out1_grad_tensor = out1_grad.get_tensor()
-        np_array = np.ones((len(rows1), row_numel)).astype("float32")
-        out1_grad_tensor.set(np_array, place)
-
-        x_grad = scope.var("X@GRAD").get_selected_rows()
-
-        grad_op = Operator(
-            "sum",
-            X=["out0@GRAD", "out1@GRAD"],
-            Out="X@GRAD",
-            height_sections=height_sections)
-
-        grad_op.run(scope, place)
-
-        merged_rows = set(rows0 + rows1)
-        self.assertEqual(set(x_grad.rows()), set(rows0 + rows1))
-        self.assertEqual(x_grad.height(), height)
-
-        print(np.array(x_grad.get_tensor()))
-        self.assertAlmostEqual(2.0, np.array(x_grad.get_tensor())[0, 0])
-        self.assertAlmostEqual(1.0, np.array(x_grad.get_tensor())[2, 1])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_spp_op.py b/python/paddle/fluid/tests/unittests/test_spp_op.py
deleted file mode 100644
index a6c2cccd39c9cecb2ae904a1930b44ba18dbbd7e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_spp_op.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_pool2d_op import max_pool2D_forward_naive
-from test_pool2d_op import avg_pool2D_forward_naive
-
-
-class TestSppOp(OpTest):
-    def setUp(self):
-        self.op_type = "spp"
-        self.init_test_case()
-        input = np.random.random(self.shape).astype("float32")
-        nsize, csize, hsize, wsize = input.shape
-        out_level_flatten = []
-        for i in range(self.pyramid_height):
-            bins = np.power(2, i)
-            kernel_size = [0, 0]
-            padding = [0, 0]
-            kernel_size[0] = np.ceil(hsize /
-                                     bins.astype("double")).astype("int32")
-            padding[0] = (
-                (kernel_size[0] * bins - hsize + 1) / 2).astype("int32")
-
-            kernel_size[1] = np.ceil(wsize /
-                                     bins.astype("double")).astype("int32")
-            padding[1] = (
-                (kernel_size[1] * bins - wsize + 1) / 2).astype("int32")
-            out_level = self.pool2D_forward_naive(input, kernel_size,
-                                                  kernel_size, padding)
-            out_level_flatten.append(
-                out_level.reshape(nsize, bins * bins * csize))
-            if i == 0:
-                output = out_level_flatten[i]
-            else:
-                output = np.concatenate((output, out_level_flatten[i]), 1)
-        # output = np.concatenate(out_level_flatten.tolist(), 0);
-        self.inputs = {'X': input.astype('float32'), }
-        self.attrs = {
-            'pyramid_height': self.pyramid_height,
-            'pooling_type': self.pool_type
-        }
-
-        self.outputs = {'Out': output.astype('float32')}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        if self.pool_type != "avg":
-            self.check_grad(['X'], 'Out', max_relative_error=0.05)
-
-    def init_test_case(self):
-        self.shape = [3, 2, 4, 4]
-        self.pyramid_height = 3
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.pool_type = "max"
-
-
-class TestCase2(TestSppOp):
-    def init_test_case(self):
-        self.shape = [3, 2, 4, 4]
-        self.pyramid_height = 3
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-        self.pool_type = "avg"
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_square_error_cost.py b/python/paddle/fluid/tests/unittests/test_square_error_cost.py
deleted file mode 100644
index 056bbfcd5302b780a8ffe3cdbda426fdc282784a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_square_error_cost.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import sys
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid.executor import Executor
-
-
-class TestSquareErrorCost(unittest.TestCase):
-    def test_square_error_cost(self):
-        input_val = np.random.uniform(0.1, 0.5, (2, 3)).astype("float32")
-        label_val = np.random.uniform(0.1, 0.5, (2, 3)).astype("float32")
-
-        sub = input_val - label_val
-        np_result = sub * sub
-
-        input_var = layers.create_tensor(dtype="float32", name="input")
-        label_var = layers.create_tensor(dtype="float32", name="label")
-
-        layers.assign(input=input_val, output=input_var)
-        layers.assign(input=label_val, output=label_var)
-        output = layers.square_error_cost(input=input_var, label=label_var)
-
-        for use_cuda in ([False, True]
-                         if core.is_compiled_with_cuda() else [False]):
-
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = Executor(place)
-            result = exe.run(fluid.default_main_program(),
-                             feed={"input": input_var,
-                                   "label": label_var},
-                             fetch_list=[output])
-
-            self.assertTrue(np.isclose(np_result, result).all())
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
deleted file mode 100644
index a8bc1004d9bbe91e323db49c0cf0b576f8da306e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestSquaredL2DistanceOp_f0(OpTest):
-    def setUp(self):
-        self.op_type = "squared_l2_distance"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 0.6, (2, 3)).astype("float32"),
-            'Y': np.random.uniform(0.1, 0.6, (2, 3)).astype("float32")
-        }
-        sub_res = self.inputs['X'] - self.inputs['Y']
-        output = sub_res * sub_res
-        self.outputs = {
-            'sub_result': sub_res,
-            'Out': np.expand_dims(output.sum(1), 1)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out')
-
-
-class TestSquaredL2DistanceOp_f1(OpTest):
-    def setUp(self):
-        self.op_type = "squared_l2_distance"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 0.6, (2, 3)).astype("float32"),
-            'Y': np.random.uniform(0.1, 0.6, (1, 3)).astype("float32")
-        }
-        sub_res = self.inputs['X'] - self.inputs['Y']
-        output = sub_res * sub_res
-        self.outputs = {
-            'sub_result': sub_res,
-            'Out': np.expand_dims(output.sum(1), 1)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out')
-
-
-class TestSquaredL2DistanceOp_f2(OpTest):
-    def setUp(self):
-        self.op_type = "squared_l2_distance"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 0.6, (2, 3, 4)).astype("float32"),
-            'Y': np.random.uniform(0.1, 0.6, (1, 3, 4)).astype("float32")
-        }
-        sub_res = self.inputs['X'] - self.inputs['Y']
-        sub_res = sub_res.reshape((2, 3 * 4))
-        output = sub_res * sub_res
-        self.outputs = {
-            'sub_result': sub_res,
-            'Out': np.expand_dims(output.sum(1), 1)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
deleted file mode 100644
index 439bae9510ee84b131050bb6804a3ede2ad6a8b3..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-from numpy import linalg as LA
-from op_test import OpTest
-
-
-class TestL2LossOp(OpTest):
-    """Test squared_l2_norm
-    """
-
-    def setUp(self):
-        self.op_type = "squared_l2_norm"
-        self.max_relative_error = 0.05
-
-        X = np.random.uniform(-1, 1, (13, 19)).astype("float32")
-        X[np.abs(X) < self.max_relative_error] = 0.1
-        self.inputs = {'X': X}
-        self.outputs = {'Out': np.square(LA.norm(X))}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=self.max_relative_error)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py
deleted file mode 100644
index ad9391eac3304965d6ee5d007fce70a5d0dd1b18..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-from op_test import OpTest
-
-
-# Correct: General.
-class TestSqueezeOp(OpTest):
-    def setUp(self):
-        self.op_type = "squeeze2"
-        self.init_test_case()
-        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
-        self.init_attrs()
-        self.outputs = {
-            "Out": self.inputs["X"].reshape(self.new_shape),
-            "XShape": np.random.random(self.ori_shape).astype("float32")
-        }
-
-    def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, 2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes}
-
-
-# Correct: There is mins axis.
-class TestSqueezeOp1(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, -2)
-        self.new_shape = (3, 5)
-
-
-# Correct: No axes input.
-class TestSqueezeOp2(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = ()
-        self.new_shape = (3, 5)
-
-
-# Correct: Just part of axes be squeezed. 
-class TestSqueezeOp3(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 1, 5, 1, 4, 1)
-        self.axes = (1, -1)
-        self.new_shape = (3, 5, 1, 4)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
deleted file mode 100644
index 8a43f5c3e1e31099da155ba7d730c5085f7d26d2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-from op_test import OpTest
-
-
-# Correct: General.
-class TestSqueezeOp(OpTest):
-    def setUp(self):
-        self.op_type = "squeeze"
-        self.init_test_case()
-        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
-        self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape), }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, 2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes}
-
-
-# Correct: There is mins axis.
-class TestSqueezeOp1(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, -2)
-        self.new_shape = (3, 5)
-
-
-# Correct: No axes input.
-class TestSqueezeOp2(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = ()
-        self.new_shape = (3, 5)
-
-
-# Correct: Just part of axes be squeezed. 
-class TestSqueezeOp3(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 1, 5, 1, 4, 1)
-        self.axes = (1, -1)
-        self.new_shape = (3, 5, 1, 4)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py
deleted file mode 100644
index defdeb5d70df4c39ed8e23247270e6eb3dd14a7a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_stack_op.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from op_test import OpTest
-import numpy as np
-import unittest
-
-
-class TestStackOpBase(OpTest):
-    def initDefaultParameters(self):
-        self.num_inputs = 4
-        self.input_dim = (5, 6, 7)
-        self.axis = 0
-        self.dtype = 'float32'
-
-    def initParameters(self):
-        pass
-
-    def get_x_names(self):
-        x_names = []
-        for i in range(self.num_inputs):
-            x_names.append('x{}'.format(i))
-        return x_names
-
-    def setUp(self):
-        self.initDefaultParameters()
-        self.initParameters()
-        self.op_type = 'stack'
-        self.x = []
-        for i in range(self.num_inputs):
-            self.x.append(
-                np.random.random(size=self.input_dim).astype(self.dtype))
-
-        tmp = []
-        x_names = self.get_x_names()
-        for i in range(self.num_inputs):
-            tmp.append((x_names[i], self.x[i]))
-
-        self.inputs = {'X': tmp}
-        self.outputs = {'Y': np.stack(self.x, axis=self.axis)}
-        self.attrs = {'axis': self.axis}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(self.get_x_names(), 'Y')
-
-
-class TestStackOp1(TestStackOpBase):
-    def initParameters(self):
-        self.num_inputs = 16
-
-
-class TestStackOp2(TestStackOpBase):
-    def initParameters(self):
-        self.num_inputs = 20
-
-
-class TestStackOp3(TestStackOpBase):
-    def initParameters(self):
-        self.axis = -1
-
-
-class TestStackOp4(TestStackOpBase):
-    def initParameters(self):
-        self.axis = -4
-
-
-class TestStackOp5(TestStackOpBase):
-    def initParameters(self):
-        self.axis = 1
-
-
-class TestStackOp6(TestStackOpBase):
-    def initParameters(self):
-        self.axis = 3
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
deleted file mode 100644
index bb327a8bd7fe04aa1b6dde2ba3ed8fe03cfc854d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
+++ /dev/null
@@ -1,505 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from op_test import OpTest
-import numpy as np
-import unittest
-import paddle.fluid as fluid
-
-
-def strided_slice_native_forward(input, axes, starts, ends, strides):
-    dim = input.ndim
-    start = []
-    end = []
-    stride = []
-    for i in range(dim):
-        start.append(0)
-        end.append(input.shape[i])
-        stride.append(1)
-
-    for i in range(len(axes)):
-        start[axes[i]] = starts[i]
-        end[axes[i]] = ends[i]
-        stride[axes[i]] = strides[i]
-
-    result = {
-        1: lambda input, start, end, stride: input[start[0]:end[0]:stride[0]],
-        2: lambda input, start, end, stride: input[start[0]:end[0]:stride[0], \
-                start[1]:end[1]:stride[1]],
-        3: lambda input, start, end, stride: input[start[0]:end[0]:stride[0], \
-                start[1]:end[1]:stride[1], start[2]:end[2]:stride[2]],
-        4: lambda input, start, end, stride: input[start[0]:end[0]:stride[0], \
-                start[1]:end[1]:stride[1], start[2]:end[2]:stride[2], start[3]:end[3]:stride[3]],
-        5: lambda input, start, end, stride: input[start[0]:end[0]:stride[0], \
-                start[1]:end[1]:stride[1], start[2]:end[2]:stride[2], start[3]:end[3]:stride[3], start[4]:end[4]:stride[4]],
-        6: lambda input, start, end, stride: input[start[0]:end[0]:stride[0], \
-                start[1]:end[1]:stride[1], start[2]:end[2]:stride[2], start[3]:end[3]:stride[3], \
-                start[4]:end[4]:stride[4], start[5]:end[5]:stride[5]]
-    }[dim](input, start, end, stride)
-
-    return result
-
-
-class TestStrideSliceOp(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = 'strided_slice'
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
-
-        self.inputs = {'Input': self.input}
-        self.outputs = {'Out': self.output}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            'ends': self.ends,
-            'strides': self.strides,
-            'infer_flags': self.infer_flags
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(set(['Input']), 'Out')
-
-    def initTestCase(self):
-        self.input = np.random.rand(6)
-        self.axes = [0]
-        self.starts = [-4]
-        self.ends = [-3]
-        self.strides = [1]
-        self.infer_flags = [1]
-
-
-class TestStrideSliceOp1(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(6)
-        self.axes = [0]
-        self.starts = [3]
-        self.ends = [8]
-        self.strides = [1]
-        self.infer_flags = [1]
-
-
-class TestStrideSliceOp2(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(6)
-        self.axes = [0]
-        self.starts = [5]
-        self.ends = [0]
-        self.strides = [-1]
-        self.infer_flags = [1]
-
-
-class TestStrideSliceOp3(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(6)
-        self.axes = [0]
-        self.starts = [-1]
-        self.ends = [-3]
-        self.strides = [-1]
-        self.infer_flags = [1]
-
-
-class TestStrideSliceOp4(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(3, 4, 6)
-        self.axes = [0, 1, 2]
-        self.starts = [0, -1, 0]
-        self.ends = [2, -3, 5]
-        self.strides = [1, -1, 1]
-        self.infer_flags = [1, 1, 1]
-
-
-class TestStrideSliceOp5(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(3, 3, 3)
-        self.axes = [0, 1, 2]
-        self.starts = [1, 0, 0]
-        self.ends = [2, 1, 3]
-        self.strides = [1, 1, 1]
-        self.infer_flags = [1, 1, 1]
-
-
-class TestStrideSliceOp6(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(3, 3, 3)
-        self.axes = [0, 1, 2]
-        self.starts = [1, -1, 0]
-        self.ends = [2, -3, 3]
-        self.strides = [1, -1, 1]
-        self.infer_flags = [1, 1, 1]
-
-
-class TestStrideSliceOp7(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(3, 3, 3)
-        self.axes = [0, 1, 2]
-        self.starts = [1, 0, 0]
-        self.ends = [2, 2, 3]
-        self.strides = [1, 1, 1]
-        self.infer_flags = [1, 1, 1]
-
-
-class TestStrideSliceOp8(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(1, 3, 1)
-        self.axes = [1]
-        self.starts = [1]
-        self.ends = [2]
-        self.strides = [1]
-        self.infer_flags = [1]
-
-
-class TestStrideSliceOp9(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(1, 3, 1)
-        self.axes = [1]
-        self.starts = [-1]
-        self.ends = [-2]
-        self.strides = [-1]
-        self.infer_flags = [1]
-
-
-class TestStrideSliceOp10(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(3, 3)
-        self.axes = [0, 1]
-        self.starts = [1, 0]
-        self.ends = [2, 2]
-        self.strides = [1, 1]
-        self.infer_flags = [1, 1]
-
-
-class TestStrideSliceOp11(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(3, 3, 3, 4)
-        self.axes = [0, 1, 2, 3]
-        self.starts = [1, 0, 0, 0]
-        self.ends = [2, 2, 3, 4]
-        self.strides = [1, 1, 1, 2]
-        self.infer_flags = [1, 1, 1, 1]
-
-
-class TestStrideSliceOp12(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(3, 3, 3, 4, 5)
-        self.axes = [0, 1, 2, 3, 4]
-        self.starts = [1, 0, 0, 0, 0]
-        self.ends = [2, 2, 3, 4, 4]
-        self.strides = [1, 1, 1, 1, 1]
-        self.infer_flags = [1, 1, 1, 1]
-
-
-class TestStrideSliceOp13(TestStrideSliceOp):
-    def initTestCase(self):
-        self.input = np.random.rand(3, 3, 3, 6, 7, 8)
-        self.axes = [0, 1, 2, 3, 4, 5]
-        self.starts = [1, 0, 0, 0, 1, 2]
-        self.ends = [2, 2, 3, 1, 2, 8]
-        self.strides = [1, 1, 1, 1, 1, 2]
-        self.infer_flags = [1, 1, 1, 1, 1]
-
-
-class TestStridedSliceOp_starts_ListTensor(OpTest):
-    def setUp(self):
-        self.op_type = "strided_slice"
-        self.config()
-
-        starts_tensor = []
-        for index, ele in enumerate(self.starts):
-            starts_tensor.append(("x" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-
-        self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor}
-        self.outputs = {'Out': self.output}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts_infer,
-            'ends': self.ends,
-            'strides': self.strides,
-            'infer_flags': self.infer_flags
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [3, 3, 4]
-        self.axes = [0, 1, 2]
-        self.strides = [1, 1, 1]
-        self.infer_flags = [1, -1, 1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
-
-        self.starts_infer = [1, 10, 2]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-class TestStridedSliceOp_ends_ListTensor(OpTest):
-    def setUp(self):
-        self.op_type = "strided_slice"
-        self.config()
-
-        ends_tensor = []
-        for index, ele in enumerate(self.ends):
-            ends_tensor.append(("x" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-
-        self.inputs = {'Input': self.input, 'EndsTensorList': ends_tensor}
-        self.outputs = {'Out': self.output}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            'ends': self.ends_infer,
-            'strides': self.strides,
-            'infer_flags': self.infer_flags
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 0]
-        self.ends = [3, 3, 4]
-        self.axes = [0, 1, 2]
-        self.strides = [1, 1, 2]
-        self.infer_flags = [1, -1, 1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
-
-        self.ends_infer = [3, 1, 4]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-class TestStridedSliceOp_starts_Tensor(OpTest):
-    def setUp(self):
-        self.op_type = "strided_slice"
-        self.config()
-        self.inputs = {
-            'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32")
-        }
-        self.outputs = {'Out': self.output}
-        self.attrs = {
-            'axes': self.axes,
-            #'starts': self.starts,
-            'ends': self.ends,
-            'strides': self.strides,
-            'infer_flags': self.infer_flags,
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 3, 4]
-        self.axes = [0, 1, 2]
-        self.strides = [1, 1, 1]
-        self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-class TestStridedSliceOp_ends_Tensor(OpTest):
-    def setUp(self):
-        self.op_type = "strided_slice"
-        self.config()
-        self.inputs = {
-            'Input': self.input,
-            "EndsTensor": np.array(
-                self.ends, dtype="int32")
-        }
-        self.outputs = {'Out': self.output}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            #'ends': self.ends,
-            'strides': self.strides,
-            'infer_flags': self.infer_flags,
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 3, 4]
-        self.axes = [0, 1, 2]
-        self.strides = [1, 1, 1]
-        self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-class TestStridedSliceOp_listTensor_Tensor(OpTest):
-    def setUp(self):
-        self.config()
-        ends_tensor = []
-        for index, ele in enumerate(self.ends):
-            ends_tensor.append(("x" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-        self.op_type = "strided_slice"
-
-        self.inputs = {
-            'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
-            "EndsTensorList": ends_tensor
-        }
-        self.outputs = {'Out': self.output}
-        self.attrs = {
-            'axes': self.axes,
-            #'starts': self.starts,
-            #'ends': self.ends,
-            'strides': self.strides,
-            'infer_flags': self.infer_flags,
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 3, 4]
-        self.axes = [0, 1, 2]
-        self.strides = [1, 1, 1]
-        self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-class TestStridedSliceOp_strides_Tensor(OpTest):
-    def setUp(self):
-        self.op_type = "strided_slice"
-        self.config()
-        self.inputs = {
-            'Input': self.input,
-            "StridesTensor": np.array(
-                self.strides, dtype="int32")
-        }
-        self.outputs = {'Out': self.output}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            'ends': self.ends,
-            #'strides': self.strides,
-            'infer_flags': self.infer_flags,
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, -1, 2]
-        self.ends = [2, 0, 4]
-        self.axes = [0, 1, 2]
-        self.strides = [1, -1, 1]
-        self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
-
-
-# Test python API
-class TestSliceAPI(OpTest):
-    def test_1(self):
-        input = np.random.random([3, 4, 5, 6]).astype("float32")
-        minus_1 = fluid.layers.fill_constant([1], "int32", -1)
-        minus_3 = fluid.layers.fill_constant([1], "int32", -3)
-        starts = fluid.layers.data(
-            name='starts', shape=[3], append_batch_size=False)
-        ends = fluid.layers.data(
-            name='ends', shape=[3], append_batch_size=False)
-        strides = fluid.layers.data(
-            name='strides', shape=[3], append_batch_size=False)
-
-        x = fluid.layers.data(
-            name="x",
-            shape=[3, 4, 5, 6],
-            append_batch_size=False,
-            dtype="float32")
-
-        out_1 = fluid.layers.strided_slice(
-            x,
-            axes=[0, 1, 2],
-            starts=[-3, 0, 2],
-            ends=[3, 100, -1],
-            strides=[1, 1, 1])
-        out_2 = fluid.layers.strided_slice(
-            x,
-            axes=[0, 1, 3],
-            starts=[minus_3, 0, 2],
-            ends=[3, 100, -1],
-            strides=[1, 1, 1])
-        out_3 = fluid.layers.strided_slice(
-            x,
-            axes=[0, 1, 3],
-            starts=[minus_3, 0, 2],
-            ends=[3, 100, minus_1],
-            strides=[1, 1, 1])
-        out_4 = fluid.layers.strided_slice(
-            x, axes=[0, 1, 2], starts=starts, ends=ends, strides=strides)
-
-        out_5 = x[-3:3, 0:100, 2:-1]
-        out_6 = x[minus_3:3, 0:100, :, 2:-1]
-        out_7 = x[minus_1, 0:100, :, 2:minus_1]
-
-        exe = fluid.Executor(place=fluid.CPUPlace())
-        res_1, res_2, res_3, res_4, res_5, res_6, res_7 = exe.run(
-            fluid.default_main_program(),
-            feed={
-                "x": input,
-                'starts': np.array([-3, 0, 2]).astype("int32"),
-                'ends': np.array([3, 100, -1]).astype("int32"),
-                'strides': np.array([1, 1, 1]).astype("int32")
-            },
-            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7])
-
-        assert np.array_equal(res_1, input[-3:3, 0:100, 2:-1, :])
-        assert np.array_equal(res_2, input[-3:3, 0:100, :, 2:-1])
-        assert np.array_equal(res_3, input[-3:3, 0:100, :, 2:-1])
-        assert np.array_equal(res_4, input[-3:3, 0:100, 2:-1, :])
-        assert np.array_equal(res_5, input[-3:3, 0:100, 2:-1, :])
-        assert np.array_equal(res_6, input[-3:3, 0:100, :, 2:-1])
-        assert np.array_equal(res_7, input[-1, 0:100, :, 2:-1])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
deleted file mode 100644
index 0be5be6e97d26c6ec42471d078e8e5995727e594..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ /dev/null
@@ -1,229 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestSumOp(OpTest):
-    def setUp(self):
-        self.op_type = "sum"
-        self.init_kernel_type()
-        self.use_mkldnn = False
-        self.init_kernel_type()
-        x0 = np.random.random((3, 4)).astype(self.dtype)
-        x1 = np.random.random((3, 4)).astype(self.dtype)
-        x2 = np.random.random((3, 4)).astype(self.dtype)
-        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
-        y = x0 + x1 + x2
-        self.outputs = {'Out': y}
-        self.attrs = {'use_mkldnn': self.use_mkldnn}
-
-    def init_kernel_type(self):
-        self.dtype = np.float32
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['x0'], 'Out')
-
-    def init_kernel_type(self):
-        pass
-
-
-class TestSelectedRowsSumOp(OpTest):
-    def setUp(self):
-        self.height = 10
-        self.row_numel = 12
-        self.rows = [0, 1, 2, 3, 4, 5, 6]
-        self.dtype = np.float32
-        self.init_kernel_type()
-
-    def check_with_place(self, place, inplace):
-        self.check_input_and_optput(core.Scope(), place, inplace, True, True,
-                                    True)
-        self.check_input_and_optput(core.Scope(), place, inplace, False, True,
-                                    True)
-        self.check_input_and_optput(core.Scope(), place, inplace, False, False,
-                                    True)
-        self.check_input_and_optput(core.Scope(), place, inplace, False, False,
-                                    False)
-
-    def init_kernel_type(self):
-        pass
-
-    def _get_array(self, rows, row_numel):
-        array = np.ones((len(rows), row_numel)).astype(self.dtype)
-        for i in range(len(rows)):
-            array[i] *= rows[i]
-        return array
-
-    def check_input_and_optput(self,
-                               scope,
-                               place,
-                               inplace,
-                               w1_has_data=False,
-                               w2_has_data=False,
-                               w3_has_data=False):
-
-        self.create_selected_rows(scope, place, "W1", w1_has_data)
-        self.create_selected_rows(scope, place, "W2", w2_has_data)
-        self.create_selected_rows(scope, place, "W3", w3_has_data)
-
-        # create Out Variable
-        if inplace:
-            out_var_name = "W1"
-        else:
-            out_var_name = "Out"
-        out = scope.var(out_var_name).get_selected_rows()
-
-        # create and run sum operator
-        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
-        sum_op.run(scope, place)
-
-        has_data_w_num = 0
-        for has_data in [w1_has_data, w2_has_data, w3_has_data]:
-            if has_data:
-                has_data_w_num += 1
-
-        if has_data_w_num > 0:
-            self.assertEqual(len(out.rows()), 7)
-            self.assertTrue(
-                np.array_equal(
-                    np.array(out.get_tensor()),
-                    self._get_array(self.rows, self.row_numel) *
-                    has_data_w_num))
-        else:
-            self.assertEqual(len(out.rows()), 0)
-
-    def create_selected_rows(self, scope, place, var_name, has_data):
-        # create and initialize W Variable
-        if has_data:
-            rows = self.rows
-        else:
-            rows = []
-
-        var = scope.var(var_name)
-        w_selected_rows = var.get_selected_rows()
-        w_selected_rows.set_height(self.height)
-        w_selected_rows.set_rows(rows)
-        w_array = self._get_array(self.rows, self.row_numel)
-        w_tensor = w_selected_rows.get_tensor()
-        w_tensor.set(w_array, place)
-
-        return var
-
-    def test_w_is_selected_rows(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            for inplace in [True, False]:
-                self.check_with_place(place, inplace)
-
-
-class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
-    def setUp(self):
-        self.height = 10
-        self.row_numel = 12
-        self.rows = [0, 1, 2, 2, 4, 5, 6]
-
-    def check_with_place(self, place, inplace):
-        scope = core.Scope()
-        if inplace:
-            self.create_lod_tensor(scope, place, "x1")
-            self.create_selected_rows(scope, place, "x2", True)
-            out = scope.var("x1").get_tensor()
-            out_name = "x1"
-        else:
-            self.create_selected_rows(scope, place, "x1", True)
-            self.create_lod_tensor(scope, place, "x2")
-            out = scope.var("out").get_tensor()
-            out_name = "out"
-
-        # create and run sum operator
-        sum_op = Operator("sum", X=["x1", "x2"], Out=out_name)
-        sum_op.run(scope, place)
-
-        result = np.ones((1, self.height)).astype(np.int32).tolist()[0]
-        for ele in self.rows:
-            result[ele] += 1
-
-        out_t = np.array(out)
-        self.assertEqual(out_t.shape[0], self.height)
-        self.assertTrue(
-            np.array_equal(out_t,
-                           self._get_array([i for i in range(
-                               self.height)], self.row_numel) * np.tile(
-                                   np.array(result).reshape(self.height, 1),
-                                   self.row_numel)))
-
-    def create_lod_tensor(self, scope, place, var_name):
-        var = scope.var(var_name)
-        w_tensor = var.get_tensor()
-        w_array = self._get_array([i for i in range(self.height)],
-                                  self.row_numel)
-        w_tensor.set(w_array, place)
-        return var
-
-
-#----------- test fp16 -----------
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestFP16SumOp(TestSumOp):
-    def init_kernel_type(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_output_with_place(place, atol=2e-2)
-
-    # FIXME: Because of the precision fp16, max_relative_error
-    # should be 0.15 here.
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad(['x0'], 'Out', max_relative_error=0.15)
-
-
-def create_test_sum_fp16_class(parent):
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "core is not compiled with CUDA")
-    class TestSumFp16Case(parent):
-        def init_kernel_type(self):
-            self.dtype = np.float16
-
-        def test_w_is_selected_rows(self):
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                for inplace in [True, False]:
-                    self.check_with_place(place, inplace)
-
-    cls_name = "{0}_{1}".format(parent.__name__, "SumFp16Test")
-    TestSumFp16Case.__name__ = cls_name
-    globals()[cls_name] = TestSumFp16Case
-
-
-create_test_sum_fp16_class(TestSelectedRowsSumOp)
-create_test_sum_fp16_class(TestLoDTensorAndSelectedRowsOp)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_switch.py b/python/paddle/fluid/tests/unittests/test_switch.py
deleted file mode 100644
index 2a9c07a889ba5fe24fd1c098729a233cb8fbb16f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_switch.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid.core as core
-import paddle.fluid.layers as layers
-import paddle.fluid.framework as framework
-from paddle.fluid.executor import Executor
-from paddle.fluid.framework import default_startup_program
-
-
-class TestSwitch(unittest.TestCase):
-    def check_switch(self, value):
-        x = layers.fill_constant(shape=[1], dtype='float32', value=value)
-
-        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
-        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
-        two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
-        three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)
-
-        result = layers.create_global_var(
-            shape=[1], value=-1.0, dtype='float32', persistable=True)
-
-        with layers.Switch() as switch:
-            with switch.case(layers.less_than(x, zero_var)):
-                layers.assign(zero_var, result)
-            with switch.case(layers.less_than(x, one_var)):
-                layers.assign(one_var, result)
-            with switch.case(layers.less_than(x, two_var)):
-                layers.assign(two_var, result)
-            with switch.default():
-                layers.assign(three_var, result)
-
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-        exe.run(default_startup_program())
-
-        out = exe.run(feed={}, fetch_list=[result])[0][0]
-        return out
-
-    def test_switch(self):
-        test_data = {(-0.1, 0), (0.1, 1), (1.1, 2), (2.1, 3)}
-        for x, expected_result in test_data:
-            main_program = framework.Program()
-            startup_program = framework.Program()
-            with framework.program_guard(main_program, startup_program):
-                result = self.check_switch(x)
-                self.assertEqual(result, expected_result)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
deleted file mode 100644
index a9eccf4a2106d3225c59364a9dee4311e33ce221..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ /dev/null
@@ -1,204 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-test for sync bachnorm op.
-for both FP64 and FP16 input.
-"""
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import os
-import six
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-
-from op_test import OpTest
-
-
-def create_or_get_tensor(scope, var_name, var, place):
-    """Get tensor, if not found, create a new one."""
-    tensor = scope.var(var_name).get_tensor()
-    if var is not None:
-        assert isinstance(var, np.ndarray)
-        tensor.set_recursive_sequence_lengths([])
-        tensor.set(var, place)
-    return tensor
-
-
-class TestSyncBatchNormOpTraining(unittest.TestCase):
-    """sync_batch_norm op test."""
-
-    def setUp(self):
-        """Setup."""
-        #self.dtype = np.float32
-        self.dtype = np.float64
-        self.N = 32
-        self.C = 16
-        self.H = 64
-        self.W = 32
-        self.dshape = [self.N, self.C, self.H, self.W]
-        self.atol = 1e-3
-
-    def _build_program(self,
-                       place,
-                       layout,
-                       seed,
-                       sync_bn=False,
-                       only_forward=False):
-        """Build program."""
-        main = fluid.Program()
-        startup = fluid.Program()
-        main.random_seed = seed
-        startup.random_seed = seed
-        use_cudnn = self.dtype == np.float16
-        with fluid.unique_name.guard():
-            with fluid.program_guard(main, startup):
-                data = fluid.layers.data(
-                    name='input',
-                    shape=self.dshape,
-                    dtype=self.dtype,
-                    append_batch_size=False)
-                conv = fluid.layers.conv2d(
-                    input=data,
-                    num_filters=32,
-                    filter_size=1,
-                    param_attr=fluid.ParamAttr(name='conv2d_weight'),
-                    bias_attr=False,
-                    use_cudnn=use_cudnn)
-                bn = fluid.layers.batch_norm(
-                    conv,
-                    param_attr=fluid.ParamAttr(name='bn_scale'),
-                    bias_attr=fluid.ParamAttr(name='bn_bias'),
-                    moving_mean_name='bn_moving_mean',
-                    moving_variance_name='bn_moving_variance',
-                    data_layout=layout,
-                    is_test=only_forward)
-                bn = fluid.layers.cast(bn, 'float64')
-                sigmoid = fluid.layers.sigmoid(bn)
-                out = fluid.layers.reduce_sum(sigmoid)
-                if not sync_bn:
-                    out = out / core.get_cuda_device_count()
-                if not only_forward:
-                    sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
-                    sgd_opt.backward(out)
-        return main, startup, [out, conv, bn]
-
-    def _compare(self, place, layout, only_forward):
-        """Compare results."""
-        seed = 10
-        os.environ['FLAGS_cudnn_deterministic'] = "1"
-        scope = core.Scope()
-        data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
-        data = create_or_get_tensor(scope, "input",
-                                    OpTest.np_dtype_to_fluid_dtype(data), place)
-
-        # Single-GPU, N = 32 per GPU
-        main, startup, outs = self._build_program(place, layout, seed, False,
-                                                  only_forward)
-        exe = fluid.Executor(place)
-        exe.run(startup)
-        fetch_names = [v.name for v in outs] + [
-            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
-        ]
-        if not only_forward:
-            others = [
-                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
-                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
-            ]
-            fetch_names += others
-        bn_fetches = exe.run(program=main,
-                             feed={'input': data},
-                             fetch_list=fetch_names)
-
-        #####################################################################
-        # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
-        assert core.get_cuda_device_count() > 1
-        main, startup, outs = self._build_program(place, layout, seed, True,
-                                                  only_forward)
-        exe = fluid.Executor(place)
-        exe.run(startup)
-        fetch_names = [v.name for v in outs] + [
-            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
-        ]
-        if not only_forward:
-            others = [
-                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
-                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
-            ]
-            fetch_names += others
-        for nm in fetch_names:
-            fv = fluid.framework._get_var(str(nm), program=main)
-            fv.persistable = True
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.sync_batch_norm = True
-        build_strategy.enable_inplace = False
-        build_strategy.memory_optimize = False
-        comp_prog = compiler.CompiledProgram(main).with_data_parallel(
-            outs[0].name if not only_forward else None,
-            build_strategy=build_strategy)
-        sync_bn_fetches = exe.run(program=comp_prog,
-                                  feed={'input': data},
-                                  fetch_list=fetch_names)
-
-        for i in six.moves.xrange(1, len(sync_bn_fetches)):
-            bn_val = bn_fetches[i]
-            sync_bn_val = sync_bn_fetches[i]
-            if sync_bn_val.shape != bn_val.shape:
-                sync_bn_val = sync_bn_val[:bn_val.shape[0]]
-            self.assertTrue(
-                np.allclose(
-                    bn_val, sync_bn_val, atol=self.atol),
-                "Output (" + fetch_names[i] + ") has diff. \n" + "\nBN     " +
-                str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
-
-    def test_train(self):
-        """Test training."""
-        if not core.is_compiled_with_cuda():
-            return
-
-        places = [core.CUDAPlace(0)]
-        for place in places:
-            for layout in ["NCHW", "NHWC"]:
-                self._compare(place, layout, False)
-
-    def test_infer(self):
-        """Test inference."""
-        if not core.is_compiled_with_cuda():
-            return
-
-        places = [core.CUDAPlace(0)]
-        for place in places:
-            for layout in ["NCHW", "NHWC"]:
-                self._compare(place, layout, True)
-
-
-class TestFP16SyncBatchNormOpTraining(TestSyncBatchNormOpTraining):
-    """sync_batch_norm op test for FP16 input."""
-
-    def setUp(self):
-        """Setup."""
-        self.dtype = np.float16
-        self.N = 32
-        self.C = 16
-        self.H = 64
-        self.W = 32
-        self.dshape = [self.N, self.C, self.H, self.W]
-        self.atol = 1e-2
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
deleted file mode 100644
index aec219f80639415a9be55ba18e7940953d0e11b0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import random
-from op_test import OpTest
-
-
-def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
-    if len(gt_lod) != len(neg_lod):
-        raise AssertionError("The input arguments are illegal.")
-
-    batch_size = len(gt_lod)
-
-    match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32')
-    neg_indices = np.zeros((sum(neg_lod), 1)).astype('int32')
-
-    offset = 0
-    for n in range(batch_size):
-        gt_num = gt_lod[n]
-        ids = random.sample([i for i in range(num_prior)], gt_num)
-        match_indices[n, ids] = [i for i in range(gt_num)]
-
-        ret_ids = set([i for i in range(num_prior)]) - set(ids)
-        l = neg_lod[n]
-        neg_ids = random.sample(ret_ids, l)
-        neg_indices[offset:offset + neg_lod[n], :] = np.array(neg_ids).astype(
-            'int32').reshape(l, 1)
-        offset += neg_lod[n]
-
-    return match_indices, neg_indices
-
-
-def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
-                  neg_lod, mismatch_value):
-    batch_size, num_prior = match_indices.shape
-
-    # init target bbox
-    trg_box = np.zeros((batch_size, num_prior, 4)).astype('float32')
-    # init weight for target bbox
-    trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
-    # init target label
-    trg_label = np.ones((batch_size, num_prior, 1)).astype('int32')
-    trg_label = trg_label * mismatch_value
-    # init weight for target label
-    trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
-
-    gt_offset = 0
-    neg_offset = 0
-    for i in range(batch_size):
-        cur_indices = match_indices[i]
-        col_ids = np.where(cur_indices > -1)
-        col_val = cur_indices[col_ids]
-
-        # target bbox
-        for v, c in zip(col_val + gt_offset, col_ids[0].tolist()):
-            trg_box[i][c][:] = encoded_box[v][c][:]
-        # weight for target bbox
-        trg_box_wt[i][col_ids] = 1.0
-
-        trg_label[i][col_ids] = gt_label[col_val + gt_offset]
-        trg_label_wt[i][col_ids] = 1.0
-        # set target label weight to 1.0 for the negative samples
-        if neg_indices is not None:
-            neg_ids = neg_indices[neg_offset:neg_offset + neg_lod[i]]
-            trg_label_wt[i][neg_ids] = 1.0
-        # update offset
-        gt_offset += gt_lod[i]
-        neg_offset += neg_lod[i]
-
-    return trg_box, trg_box_wt, trg_label, trg_label_wt
-
-
-class TestTargetAssginFloatType(OpTest):
-    def setUp(self):
-        self.op_type = "target_assign"
-        num_prior = 120
-        num_class = 21
-        gt_lod = [5, 6, 12]
-        neg_lod = [4, 3, 6]
-        mismatch_value = 0
-        batch_size = len(gt_lod)
-        num_gt = sum(gt_lod)
-
-        encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
-        gt_label = np.random.randint(
-            num_class, size=(num_gt, 1)).astype('int32')
-
-        match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
-                                                               gt_lod, neg_lod)
-
-        out, out_wt, _, _ = target_assign(encoded_box, gt_label, match_indices,
-                                          neg_indices, gt_lod, neg_lod,
-                                          mismatch_value)
-
-        # assign regression targets
-        x = encoded_box
-        self.inputs = {
-            'X': (x, [gt_lod]),
-            'MatchIndices': match_indices,
-        }
-        self.attrs = {'mismatch_value': mismatch_value}
-        self.outputs = {
-            'Out': out,
-            'OutWeight': out_wt,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestTargetAssginIntType(OpTest):
-    def setUp(self):
-        self.op_type = "target_assign"
-        num_prior = 120
-        num_class = 21
-        gt_lod = [5, 6, 12]
-        neg_lod = [4, 3, 6]
-        mismatch_value = 0
-        batch_size = len(gt_lod)
-        num_gt = sum(gt_lod)
-
-        encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
-        gt_label = np.random.randint(
-            num_class, size=(num_gt, 1)).astype('int32')
-
-        match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
-                                                               gt_lod, neg_lod)
-
-        _, _, out, out_wt, = target_assign(encoded_box, gt_label, match_indices,
-                                           neg_indices, gt_lod, neg_lod,
-                                           mismatch_value)
-
-        # assign cassification argets
-        x = np.reshape(gt_label, (num_gt, 1, 1))
-        self.inputs = {
-            'X': (x, [gt_lod]),
-            'MatchIndices': match_indices,
-            'NegIndices': (neg_indices, [neg_lod]),
-        }
-        self.attrs = {'mismatch_value': mismatch_value}
-        self.outputs = {
-            'Out': out,
-            'OutWeight': out_wt,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py
deleted file mode 100644
index 26bf0fd88368ed27e142e8515ec57a6c6bebd6fa..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py
+++ /dev/null
@@ -1,59 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from math import log
-from math import exp
-from op_test import OpTest
-from scipy.special import logit
-from scipy.special import expit
-import unittest
-
-
-class TestTeacherStudentSigmoidLossOp(OpTest):
-    """
-        Test teacher_student_sigmoid_loss with discrete one-hot labels.
-    """
-
-    def setUp(self):
-        self.op_type = "teacher_student_sigmoid_loss"
-        batch_size = 16
-        num_classes = 1
-        self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float32")),
-            'Label': np.random.uniform(0, 2, (batch_size, num_classes))
-            .astype("float32")
-        }
-        outs = []
-        for index, label in enumerate(self.inputs["Label"]):
-            x = self.inputs["X"][index]
-            if label < -1.0:
-                outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))))
-            elif label < 0.0:
-                outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))))
-            elif label < 1.0:
-                outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))) + \
-                            max(x, 0.0) - x * label + log(1.0 + exp(-abs(x))))
-            else:
-                outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))) + \
-                            max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x))))
-        self.outputs = {'Y': np.array(outs)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Y", numeric_grad_delta=0.005)
diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
deleted file mode 100644
index d469388ca079b6825c82c447cf574921d7da6f25..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-from paddle.fluid import core
-
-
-def temporal_shift(x, seg_num, shift_ratio):
-    shape = x.shape
-    reshape_x = x.reshape((-1, seg_num, shape[1], shape[2], shape[3]))
-    pad_x = np.pad(reshape_x, ((0, 0), (1, 1), (0, 0), (0, 0), (0, 0)),
-                   'constant')
-    c1 = int(shape[1] * shift_ratio)
-    c2 = int(shape[1] * 2 * shift_ratio)
-    slice1 = pad_x[:, :seg_num, :c1, :, :]
-    slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :]
-    slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :]
-    concat_x = np.concatenate([slice1, slice2, slice3], axis=2)
-    return concat_x.reshape(shape)
-
-
-class TestTemporalShift(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = 'temporal_shift'
-        x = np.random.random(self.x_shape).astype('float32')
-
-        self.attrs = {
-            "seg_num": self.seg_num,
-            "shift_ratio": self.shift_ratio,
-        }
-
-        self.inputs = {"X": x, }
-
-        output = temporal_shift(x, self.seg_num, self.shift_ratio)
-        self.outputs = {"Out": output}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_ignore_uv(self):
-        self.check_grad(['X'], 'Out')
-
-    def initTestCase(self):
-        self.x_shape = (6, 4, 4, 4)
-        self.seg_num = 3
-        self.shift_ratio = 0.25
-
-
-class TestTemporalShift2(TestTemporalShift):
-    def initTestCase(self):
-        self.x_shape = (4, 9, 7, 7)
-        self.seg_num = 2
-        self.shift_ratio = 0.2
-
-
-class TestTemporalShift3(TestTemporalShift):
-    def initTestCase(self):
-        self.x_shape = (3, 10, 5, 5)
-        self.seg_num = 1
-        self.shift_ratio = 0.3
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py
deleted file mode 100644
index 4615511ed85441551ed3a5071a8cf1d0dfe32984..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ /dev/null
@@ -1,261 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import unittest
-import numpy
-
-
-class TestTensor(unittest.TestCase):
-    def test_int_tensor(self):
-        scope = core.Scope()
-        var = scope.var("test_tensor")
-        place = core.CPUPlace()
-
-        tensor = var.get_tensor()
-
-        tensor._set_dims([1000, 784])
-        tensor._alloc_int(place)
-        tensor_array = numpy.array(tensor)
-        self.assertEqual((1000, 784), tensor_array.shape)
-        tensor_array[3, 9] = 1
-        tensor_array[19, 11] = 2
-        tensor.set(tensor_array, place)
-
-        tensor_array_2 = numpy.array(tensor)
-        self.assertEqual(1, tensor_array_2[3, 9])
-        self.assertEqual(2, tensor_array_2[19, 11])
-
-    def test_float_tensor(self):
-        scope = core.Scope()
-        var = scope.var("test_tensor")
-        place = core.CPUPlace()
-
-        tensor = var.get_tensor()
-
-        tensor._set_dims([1000, 784])
-        tensor._alloc_float(place)
-
-        tensor_array = numpy.array(tensor)
-        self.assertEqual((1000, 784), tensor_array.shape)
-        tensor_array[3, 9] = 1.0
-        tensor_array[19, 11] = 2.0
-        tensor.set(tensor_array, place)
-
-        tensor_array_2 = numpy.array(tensor)
-        self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
-        self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
-
-    def test_int8_tensor(self):
-        scope = core.Scope()
-        var = scope.var("int8_tensor")
-        cpu_tensor = var.get_tensor()
-        tensor_array = numpy.random.randint(
-            -127, high=128, size=[100, 200], dtype=numpy.int8)
-        place = core.CPUPlace()
-        cpu_tensor.set(tensor_array, place)
-        cpu_tensor_array_2 = numpy.array(cpu_tensor)
-        self.assertAlmostEqual(cpu_tensor_array_2.all(), tensor_array.all())
-
-        if core.is_compiled_with_cuda():
-            cuda_tensor = var.get_tensor()
-            tensor_array = numpy.random.randint(
-                -127, high=128, size=[100, 200], dtype=numpy.int8)
-            place = core.CUDAPlace(0)
-            cuda_tensor.set(tensor_array, place)
-            cuda_tensor_array_2 = numpy.array(cuda_tensor)
-            self.assertAlmostEqual(cuda_tensor_array_2.all(),
-                                   tensor_array.all())
-
-    def test_int_lod_tensor(self):
-        place = core.CPUPlace()
-        scope = core.Scope()
-        var_lod = scope.var("test_lod_tensor")
-        lod_tensor = var_lod.get_tensor()
-
-        lod_tensor._set_dims([4, 4, 6])
-        lod_tensor._alloc_int(place)
-        array = numpy.array(lod_tensor)
-        array[0, 0, 0] = 3
-        array[3, 3, 5] = 10
-        lod_tensor.set(array, place)
-        lod_tensor.set_recursive_sequence_lengths([[2, 2]])
-
-        lod_v = numpy.array(lod_tensor)
-        self.assertTrue(numpy.alltrue(array == lod_v))
-
-        lod = lod_tensor.recursive_sequence_lengths()
-        self.assertEqual(2, lod[0][0])
-        self.assertEqual(2, lod[0][1])
-
-    def test_float_lod_tensor(self):
-        place = core.CPUPlace()
-        scope = core.Scope()
-        var_lod = scope.var("test_lod_tensor")
-
-        lod_tensor = var_lod.get_tensor()
-        lod_tensor._set_dims([5, 2, 3, 4])
-        lod_tensor._alloc_float(place)
-
-        tensor_array = numpy.array(lod_tensor)
-        self.assertEqual((5, 2, 3, 4), tensor_array.shape)
-        tensor_array[0, 0, 0, 0] = 1.0
-        tensor_array[0, 0, 0, 1] = 2.0
-        lod_tensor.set(tensor_array, place)
-
-        lod_v = numpy.array(lod_tensor)
-        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
-        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertEqual(len(lod_tensor.recursive_sequence_lengths()), 0)
-
-        lod_py = [[2, 1], [1, 2, 2]]
-        lod_tensor.set_recursive_sequence_lengths(lod_py)
-        lod = lod_tensor.recursive_sequence_lengths()
-        self.assertListEqual(lod_py, lod)
-
-    def test_lod_tensor_init(self):
-        place = core.CPUPlace()
-        lod_py = [[2, 1], [1, 2, 2]]
-        lod_tensor = core.LoDTensor()
-
-        lod_tensor._set_dims([5, 2, 3, 4])
-        lod_tensor.set_recursive_sequence_lengths(lod_py)
-        lod_tensor._alloc_float(place)
-        tensor_array = numpy.array(lod_tensor)
-        tensor_array[0, 0, 0, 0] = 1.0
-        tensor_array[0, 0, 0, 1] = 2.0
-        lod_tensor.set(tensor_array, place)
-
-        lod_v = numpy.array(lod_tensor)
-        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
-        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths())
-
-    def test_lod_tensor_gpu_init(self):
-        if not core.is_compiled_with_cuda():
-            return
-        place = core.CUDAPlace(0)
-        lod_py = [[2, 1], [1, 2, 2]]
-        lod_tensor = core.LoDTensor()
-
-        lod_tensor._set_dims([5, 2, 3, 4])
-        lod_tensor.set_recursive_sequence_lengths(lod_py)
-        lod_tensor._alloc_float(place)
-        tensor_array = numpy.array(lod_tensor)
-        tensor_array[0, 0, 0, 0] = 1.0
-        tensor_array[0, 0, 0, 1] = 2.0
-        lod_tensor.set(tensor_array, place)
-
-        lod_v = numpy.array(lod_tensor)
-        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
-        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths())
-
-    def test_empty_tensor(self):
-        place = core.CPUPlace()
-        scope = core.Scope()
-        var = scope.var("test_tensor")
-
-        tensor = var.get_tensor()
-
-        tensor._set_dims([0, 1])
-        tensor._alloc_float(place)
-
-        tensor_array = numpy.array(tensor)
-        self.assertEqual((0, 1), tensor_array.shape)
-
-        if core.is_compiled_with_cuda():
-            gpu_place = core.CUDAPlace(0)
-            tensor._alloc_float(gpu_place)
-            tensor_array = numpy.array(tensor)
-            self.assertEqual((0, 1), tensor_array.shape)
-
-    def run_sliece_tensor(self, place):
-
-        tensor = fluid.Tensor()
-        shape = [3, 3, 3]
-        tensor._set_dims(shape)
-
-        tensor_array = numpy.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                                    [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
-                                    [[19, 20, 21], [22, 23, 24], [25, 26, 27]]])
-
-        tensor.set(tensor_array, place)
-        n1 = tensor[1]
-        t1 = tensor_array[1]
-        self.assertTrue((numpy.array(n1) == numpy.array(t1)).all())
-
-        n2 = tensor[1:]
-        t2 = tensor_array[1:]
-        self.assertTrue((numpy.array(n2) == numpy.array(t2)).all())
-
-        n3 = tensor[0:2:]
-        t3 = tensor_array[0:2:]
-        self.assertTrue((numpy.array(n3) == numpy.array(t3)).all())
-
-        n4 = tensor[2::-2]
-        t4 = tensor_array[2::-2]
-        self.assertTrue((numpy.array(n4) == numpy.array(t4)).all())
-
-        n5 = tensor[2::-2][0]
-        t5 = tensor_array[2::-2][0]
-        self.assertTrue((numpy.array(n5) == numpy.array(t5)).all())
-
-        n6 = tensor[2:-1:-1]
-        t6 = tensor_array[2:-1:-1]
-        self.assertTrue((numpy.array(n6) == numpy.array(t6)).all())
-
-        n7 = tensor[0:, 0:]
-        t7 = tensor_array[0:, 0:]
-        self.assertTrue((numpy.array(n7) == numpy.array(t7)).all())
-
-        n8 = tensor[0::1, 0::-1, 2:]
-        t8 = tensor_array[0::1, 0::-1, 2:]
-        self.assertTrue((numpy.array(n8) == numpy.array(t8)).all())
-
-    def test_sliece_tensor(self):
-        # run cpu first
-        place = core.CPUPlace()
-        self.run_sliece_tensor(place)
-
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            self.run_sliece_tensor(place)
-
-    def test_print_tensor(self):
-        scope = core.Scope()
-        var = scope.var("test_tensor")
-        place = core.CPUPlace()
-        tensor = var.get_tensor()
-        tensor._set_dims([10, 10])
-        tensor._alloc_int(place)
-        tensor_array = numpy.array(tensor)
-        self.assertEqual((10, 10), tensor_array.shape)
-        tensor_array[0, 0] = 1
-        tensor_array[2, 2] = 2
-        tensor.set(tensor_array, place)
-        print(tensor)
-        self.assertTrue(isinstance(str(tensor), str))
-
-        if core.is_compiled_with_cuda():
-            tensor.set(tensor_array, core.CUDAPlace(0))
-            print(tensor)
-            self.assertTrue(isinstance(str(tensor), str))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
deleted file mode 100644
index 78b95de7e07b1d1fcdeeae63498e740c2b474c6d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
+++ /dev/null
@@ -1,142 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.executor import Executor
-
-
-class TestLoDTensorArrayConcat(unittest.TestCase):
-    def setUp(self):
-        self.op_type = "tensor_array_to_tensor"
-        self.attrs = {"axis": 0}
-        self.outputs = ["Out"]
-
-    def test_get_set(self):
-        scope = core.Scope()
-        program = fluid.Program()
-        block = program.global_block()
-
-        input_arr = block.create_var(
-            name="tmp_lod_tensor_array",
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
-        input_arr.persistable = True
-        input_arr_var = scope.var('tmp_lod_tensor_array')
-        input_tensor_array = input_arr_var.get_lod_tensor_array()
-        self.assertEqual(0, len(input_tensor_array))
-
-        cpu = core.CPUPlace()
-        for i in range(10):
-            t = core.LoDTensor()
-            if i == 0:
-                t.set(numpy.array([[i], [i]], dtype='float32'), cpu)
-            else:
-                t.set(numpy.array([[i]], dtype='float32'), cpu)
-            input_tensor_array.append(t)
-
-        self.assertEqual(10, len(input_tensor_array))
-
-        random_grad = numpy.random.random_sample([11]).astype(numpy.float32)
-
-        y_out = block.create_var(name="Out")
-        y_out.persistable = True
-        y_out_index = block.create_var(name="OutIndex")
-        y_out_index.persistable = True
-
-        y_grad_arr = block.create_var(
-            name='Out@GRAD', dtype='float32', shape=[11])
-        y_grad_arr.persistable = True
-        y_grad = scope.var('Out@GRAD')
-        y_grad_tensor = y_grad.get_tensor()
-        y_grad_tensor.set(random_grad, cpu)
-
-        op = block.append_op(
-            type=self.op_type,
-            inputs={"X": input_arr},
-            outputs={"Out": y_out,
-                     "OutIndex": y_out_index},
-            attrs=self.attrs)
-
-        out_grad = block.create_var(
-            name="tmp_lod_tensor_array@GRAD",
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
-        out_grad.persistable = True
-
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
-                                                                  set(), [])
-        grad_op_desc = grad_op_desc_list[0]
-        new_op_desc = block.desc.append_op()
-        new_op_desc.copy_from(grad_op_desc)
-        for var_name in grad_op_desc.output_arg_names():
-            block.desc.var(var_name.encode("ascii"))
-
-        grad_op_desc.infer_var_type(block.desc)
-        grad_op_desc.infer_shape(block.desc)
-        for arg in grad_op_desc.output_arg_names():
-            grad_var = block.desc.find_var(arg.encode("ascii"))
-            grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-        fetch_list = []
-        fetch_list.append(block.var('Out'))
-        fetch_list.append(block.var('OutIndex'))
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        out = exe.run(program, fetch_list=fetch_list, scope=scope)
-        #print ("index: ", numpy.array(out[1]))  
-
-        # test forward
-        tensor_res = numpy.array(out[0])
-        tensor_res_out_idx = numpy.array(out[1])
-        tensor_gt = numpy.array(
-            [0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float32')
-
-        self.assertEqual(len(tensor_res), len(tensor_gt))
-        self.assertEqual(len(tensor_res_out_idx), 10)
-
-        for i in range(len(tensor_res)):
-            self.assertEqual(tensor_res[i], tensor_gt[i])
-
-        for i in range(len(tensor_res_out_idx)):
-            if i == 0:
-                self.assertEqual(tensor_res_out_idx[i], 2)
-            else:
-                self.assertEqual(tensor_res_out_idx[i], 1)
-
-        # test backward
-        grad_tensor = scope.var('tmp_lod_tensor_array@GRAD')
-        grad_tensor_array = grad_tensor.get_lod_tensor_array()
-
-        self.assertEqual(10, len(grad_tensor_array))
-
-        for i in range(len(grad_tensor_array)):
-            if i == 0:
-                self.assertEqual(
-                    numpy.array(grad_tensor_array[i])[0],
-                    numpy.array(random_grad[i]))
-                self.assertEqual(
-                    numpy.array(grad_tensor_array[i])[1],
-                    numpy.array(random_grad[i + 1]))
-            if i == 1:
-                self.assertEqual(
-                    numpy.array(grad_tensor_array[i]),
-                    numpy.array(random_grad[i + 1]))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py b/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py
deleted file mode 100644
index 003f27652ef1d8aa07c96ff7dfda58e9dd1eba6f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import unittest
-import numpy as np
-import six
-
-
-class TensorToNumpyTest(unittest.TestCase):
-    def setUp(self):
-        self.shape = [11, 25, 32, 43]
-
-    def test_main(self):
-        dtypes = [
-            'float32', 'float64', 'int32', 'int64', 'uint8', 'int8', 'bool'
-        ]
-
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-            places.append(fluid.CUDAPinnedPlace())
-
-        for p in places:
-            for dtype in dtypes:
-                np_arr = np.reshape(
-                    np.array(six.moves.range(np.prod(self.shape))).astype(
-                        dtype), self.shape)
-
-                t = fluid.LoDTensor()
-                t.set(np_arr, p)
-
-                ret_np_arr = np.array(t)
-                self.assertEqual(np_arr.shape, ret_np_arr.shape)
-                self.assertEqual(np_arr.dtype, ret_np_arr.dtype)
-
-                all_equal = np.all(np_arr == ret_np_arr)
-                self.assertTrue(all_equal)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py
deleted file mode 100644
index 5327c0f5de5d9a806f993818608929b9e07f624e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ /dev/null
@@ -1,155 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestTopkOp(OpTest):
-    def setUp(self):
-        self.variable_k = False
-        self.set_args()
-        self.op_type = "top_k"
-        self.dtype = np.float32
-        self.init_dtype()
-
-        k = self.top_k
-        input = np.random.random((self.row, k)).astype(self.dtype)
-        output = np.ndarray((self.row, k))
-        indices = np.ndarray((self.row, k)).astype("int64")
-        self.inputs = {'X': input}
-
-        if self.variable_k:
-            self.inputs['K'] = np.array([k]).astype("int32")
-        else:
-            self.attrs = {'k': k}
-
-        for rowid in range(self.row):
-            row = input[rowid]
-            output[rowid] = np.sort(row)[::-1][:k]
-            indices[rowid] = row.argsort()[::-1][:k]
-
-        self.outputs = {'Out': output, 'Indices': indices}
-
-    def init_dtype(self):
-        pass
-
-    def set_args(self):
-        self.row = 32
-        self.top_k = 1
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestTopkOpFp16(TestTopkOp):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-
-class TestTopkOp3d(OpTest):
-    def setUp(self):
-        self.op_type = "top_k"
-        k = 1
-        input = np.random.random((32, 2, 84)).astype("float32")
-        input_flat_2d = input.reshape(64, 84)
-        output = np.ndarray((64, k))
-        indices = np.ndarray((64, k)).astype("int64")
-
-        self.inputs = {'X': input}
-        self.attrs = {'k': k}
-
-        for rowid in range(64):
-            row = input_flat_2d[rowid]
-            output[rowid] = np.sort(row)[::-1][:k]
-            indices[rowid] = row.argsort()[::-1][:k]
-
-        self.outputs = {
-            'Out': output.reshape((32, 2, k)),
-            'Indices': indices.reshape((32, 2, k))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestTopkOp1(OpTest):
-    def setUp(self):
-        self.op_type = "top_k"
-        k = 2
-        m = 2056
-        input = np.random.random(m).astype("float32")
-        output = np.ndarray(k)
-        indices = np.ndarray(k).astype("int64")
-
-        self.inputs = {'X': input}
-        self.attrs = {'k': k}
-
-        row = input
-        output = -np.sort(-row)[:k]
-        indices = (-row).argsort()[:k]
-
-        self.outputs = {'Out': output, 'Indices': indices}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestTopkOp2(OpTest):
-    def setUp(self):
-        self.op_type = "top_k"
-        k = 1
-        m = 2056
-        input = np.random.random((m, 84)).astype("float32")
-        output = np.ndarray((m, k))
-        indices = np.ndarray((m, k)).astype("int64")
-
-        self.inputs = {'X': input}
-        self.attrs = {'k': k}
-
-        for rowid in range(m):
-            row = input[rowid]
-            output[rowid] = -np.sort(-row)[:k]
-            indices[rowid] = (-row).argsort()[:k]
-
-        self.outputs = {'Out': output, 'Indices': indices}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestTopkOp3(TestTopkOp):
-    def set_args(self):
-        self.row = 2056
-        self.top_k = 3
-
-
-class TestTopkOp4(TestTopkOp):
-    def set_args(self):
-        self.row = 40000
-        self.top_k = 1
-
-
-class TestTopkOp5(TestTopkOp):
-    def set_args(self):
-        self.row = 40000
-        self.top_k = 3
-        self.variable_k = True
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_trainable.py b/python/paddle/fluid/tests/unittests/test_trainable.py
deleted file mode 100644
index d1937ca96103db7d26809eba4a96b4d4cf4e9cf2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_trainable.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from collections import Counter
-import unittest
-import paddle.fluid as fluid
-from simple_nets import init_data
-
-
-def test_trainable():
-    x = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    feature = fluid.layers.fc(input=x,
-                              size=10,
-                              param_attr=fluid.ParamAttr(trainable=False))
-    loss = fluid.layers.cross_entropy(input=feature, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-class TestTrainable(unittest.TestCase):
-    def check_trainable(self,
-                        model,
-                        feed_dict,
-                        op_count,
-                        optimizer=fluid.optimizer.Adam()):
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        main = fluid.Program()
-        startup = fluid.Program()
-
-        with fluid.program_guard(main, startup):
-            loss = model()
-            optimizer.minimize(loss)
-
-            # The number of adam should be one.
-            ops = Counter([op.type for op in main.global_block().ops])
-            for op in op_count:
-                if op_count[op] == 0:
-                    assert op not in ops
-                else:
-                    assert ops[op] == op_count[op]
-
-            exe.run(fluid.default_startup_program())
-            exe.run(feed=feed_dict)
-
-    def test_trainable(self):
-        batch_size = 2
-        img, label = init_data(batch_size, img_shape=[784], label_range=9)
-        feed_dict = {'image': img, 'label': label}
-        # Note that, because the Weight of FC is not trainable and the x is stop_gradient,
-        # so the 'mul_grad' should not be appended.
-        self.check_trainable(
-            test_trainable,
-            feed_dict,
-            op_count={'adam': 1,
-                      'scale': 2,
-                      'mul_grad': 0})
-        self.check_trainable(
-            test_trainable,
-            feed_dict,
-            op_count={'adamax': 1,
-                      'scale': 1,
-                      'mul_grad': 0},
-            optimizer=fluid.optimizer.Adamax(learning_rate=0.2))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_trainer_desc.py b/python/paddle/fluid/tests/unittests/test_trainer_desc.py
deleted file mode 100644
index f2724ea22b006c786576a3a3a2d02e99a43722b7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_trainer_desc.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-TestCases for TrainerDesc,
-including config, etc.
-"""
-
-from __future__ import print_function
-import paddle.fluid as fluid
-import numpy as np
-import os
-import shutil
-import unittest
-
-
-class TestTrainerDesc(unittest.TestCase):
-    """  TestCases for TrainerDesc. """
-
-    def test_config(self):
-        """
-        Testcase for python config.
-        """
-        trainer_desc = fluid.trainer_desc.TrainerDesc()
-        trainer_desc._set_dump_fields(["a", "b"])
-        trainer_desc._set_mpi_rank(1)
-        trainer_desc._set_dump_fields_path("path")
-
-        dump_fields = trainer_desc.proto_desc.dump_fields
-        mpi_rank = trainer_desc.proto_desc.mpi_rank
-        dump_fields_path = trainer_desc.proto_desc.dump_fields_path
-        self.assertEqual(len(dump_fields), 2)
-        self.assertEqual(dump_fields[0], "a")
-        self.assertEqual(dump_fields[1], "b")
-        self.assertEqual(mpi_rank, 1)
-        self.assertEqual(dump_fields_path, "path")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
deleted file mode 100644
index a38540a7240636415ef4703609c5a3e8e83ed1da..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestTransposeOp(OpTest):
-    def setUp(self):
-        self.init_op_type()
-        self.initTestCase()
-        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
-        self.attrs = {
-            'axis': list(self.axis),
-            'use_mkldnn': self.use_mkldnn,
-        }
-        self.outputs = {
-            'XShape': np.random.random(self.shape).astype("float32"),
-            'Out': self.inputs['X'].transpose(self.axis)
-        }
-
-    def init_op_type(self):
-        self.op_type = "transpose2"
-        self.use_mkldnn = False
-
-    def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-    def initTestCase(self):
-        self.shape = (3, 4)
-        self.axis = (1, 0)
-
-
-class TestCase0(TestTransposeOp):
-    def initTestCase(self):
-        self.shape = (3, )
-        self.axis = (0, )
-
-
-class TestCase1(TestTransposeOp):
-    def initTestCase(self):
-        self.shape = (3, 4, 5)
-        self.axis = (0, 2, 1)
-
-
-class TestCase2(TestTransposeOp):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 5)
-        self.axis = (0, 2, 3, 1)
-
-
-class TestCase3(TestTransposeOp):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 5, 6)
-        self.axis = (4, 2, 3, 1, 0)
-
-
-class TestCase4(TestTransposeOp):
-    def initTestCase(self):
-        self.shape = (2, 3, 4, 5, 6, 1)
-        self.axis = (4, 2, 3, 1, 0, 5)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tree_conv_op.py b/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
deleted file mode 100644
index 712453d29101d0878eb362e60e22d6b21f0ba026..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-
-from op_test import OpTest
-
-
-def collect_node_patch(og, max_depth):
-    """
-    The naive method to construct patches
-    :param og: original graph
-    :param max_depth: the depth of convolution filters
-    :return: convolution patches
-    """
-
-    def gen(node, max_depth):
-        collected = [(node, 1, 1, 0, max_depth)]
-
-        def recurse_helper(node, depth):
-            if depth > max_depth:
-                return
-            l = len(og[node])
-            for idx, c in enumerate(og[node], 1):
-                if depth + 1 < max_depth:
-                    collected.append((c, idx, l, depth + 1, max_depth))
-                    recurse_helper(c, depth + 1)
-
-        recurse_helper(node, 0)
-        return collected
-
-    res = []
-    for u in range(1, len(og)):
-        lis = gen(u, max_depth)
-        if len(lis) > 0:
-            res.append(lis)
-    return res
-
-
-class TestTreeConvOp(OpTest):
-    def setUp(self):
-        self.n = 17
-        self.fea_size = 3
-        self.output_size = 1
-        self.max_depth = 2
-        self.batch_size = 1
-        self.num_filters = 1
-        adj_array = [
-            1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10, 5, 11, 6, 12,
-            6, 13, 9, 14, 9, 15, 9, 16, 9, 17
-        ]
-        adj = np.array(adj_array).reshape((1, self.n - 1, 2)).astype('int32')
-        adj = np.tile(adj, (self.batch_size, 1, 1))
-        self.op_type = 'tree_conv'
-        vectors = np.random.random(
-            (self.batch_size, self.n, self.fea_size)).astype('float32')
-        self.inputs = {
-            'EdgeSet': adj,
-            'NodesVector': vectors,
-            'Filter': np.random.random((self.fea_size, 3, self.output_size,
-                                        self.num_filters)).astype('float32')
-        }
-        self.attrs = {'max_depth': self.max_depth}
-        vectors = []
-        for i in range(self.batch_size):
-            vector = self.get_output_naive(i)
-            vectors.append(vector)
-        self.outputs = {'Out': np.array(vectors).astype('float32'), }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['NodesVector', 'Filter'], 'Out', max_relative_error=0.5)
-
-    def get_output_naive(self, batch_id):
-        og = [[] for i in range(1, self.n + 2)]
-        st = np.array(self.inputs['EdgeSet'][batch_id]).tolist()
-        for e in st:
-            og[e[0]].append(e[1])
-        patches = collect_node_patch(og, self.max_depth)
-        W = np.array(self.inputs['Filter']).astype('float32')
-        W = np.transpose(W, axes=[1, 0, 2, 3])
-        vec = []
-        for i, patch in enumerate(patches, 1):
-            result = np.zeros((1, W.shape[2], W.shape[3]))
-            for v in patch:
-                eta_t = float(v[4] - v[3]) / float(v[4])
-                eta_l = (1.0 - eta_t) * (0.5 if v[2] == 1 else
-                                         float(v[1] - 1.0) / float(v[2] - 1.0))
-                eta_r = (1.0 - eta_t) * (1.0 - eta_l)
-                x = self.inputs['NodesVector'][batch_id][v[0] - 1]
-                eta = np.array([eta_l, eta_r, eta_t]).reshape(
-                    (3, 1)).astype('float32')
-                Wconvi = np.tensordot(eta, W, axes=([0], [0]))
-                x = np.array(x).reshape((1, 1, self.fea_size))
-                res = np.tensordot(x, Wconvi, axes=2)
-                result = result + res
-            vec.append(result)
-        vec = np.concatenate(vec, axis=0)
-        vec = np.concatenate(
-            [
-                vec, np.zeros(
-                    (self.n - vec.shape[0], W.shape[2], W.shape[3]),
-                    dtype='float32')
-            ],
-            axis=0)
-        return vec
diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py
deleted file mode 100644
index dcd4c71527348d8dcb469303701ad156189ce970..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py
+++ /dev/null
@@ -1,640 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-
-
-def trilinear_interp_np(input,
-                        out_d,
-                        out_h,
-                        out_w,
-                        out_size=None,
-                        actual_shape=None,
-                        align_corners=True,
-                        align_mode=0,
-                        data_layout='NCDHW'):
-    """trilinear interpolation implement in shape [N, C, D, H, W]"""
-    if data_layout == "NDHWC":
-        input = np.transpose(input, (0, 4, 1, 2, 3))  # NDHWC => NCDHW
-    if out_size is not None:
-        out_d = out_size[0]
-        out_h = out_size[1]
-        out_w = out_size[2]
-    if actual_shape is not None:
-        out_d = actual_shape[0]
-        out_h = actual_shape[1]
-        out_w = actual_shape[2]
-    batch_size, channel, in_d, in_h, in_w = input.shape
-
-    ratio_d = ratio_h = ratio_w = 0.0
-    if out_d > 1:
-        if (align_corners):
-            ratio_d = (in_d - 1.0) / (out_d - 1.0)
-        else:
-            ratio_d = 1.0 * in_d / out_d
-    if out_h > 1:
-        if (align_corners):
-            ratio_h = (in_h - 1.0) / (out_h - 1.0)
-        else:
-            ratio_h = 1.0 * in_h / out_h
-    if out_w > 1:
-        if (align_corners):
-            ratio_w = (in_w - 1.0) / (out_w - 1.0)
-        else:
-            ratio_w = 1.0 * in_w / out_w
-
-    out = np.zeros((batch_size, channel, out_d, out_h, out_w))
-
-    for i in range(out_d):
-        if (align_mode == 0 and not align_corners):
-            d = int(ratio_d * (i + 0.5) - 0.5)
-        else:
-            d = int(ratio_d * i)
-
-        d = max(0, d)
-        did = 1 if d < in_d - 1 else 0
-        if (align_mode == 0 and not align_corners):
-            idx_src_d = max(ratio_d * (i + 0.5) - 0.5, 0)
-            d1lambda = idx_src_d - d
-        else:
-            d1lambda = ratio_d * i - d
-        d2lambda = 1.0 - d1lambda
-
-        for j in range(out_h):
-            if (align_mode == 0 and not align_corners):
-                h = int(ratio_h * (j + 0.5) - 0.5)
-            else:
-                h = int(ratio_h * j)
-
-            h = max(0, h)
-            hid = 1 if h < in_h - 1 else 0
-            if (align_mode == 0 and not align_corners):
-                idx_src_h = max(ratio_h * (j + 0.5) - 0.5, 0)
-                h1lambda = idx_src_h - h
-            else:
-                h1lambda = ratio_h * j - h
-            h2lambda = 1.0 - h1lambda
-
-            for k in range(out_w):
-                if (align_mode == 0 and not align_corners):
-                    w = int(ratio_w * (k + 0.5) - 0.5)
-                else:
-                    w = int(ratio_w * k)
-                w = max(0, w)
-                wid = 1 if w < in_w - 1 else 0
-                if (align_mode == 0 and not align_corners):
-                    idx_src_w = max(ratio_w * (k + 0.5) - 0.5, 0)
-                    w1lambda = idx_src_w - w
-                else:
-                    w1lambda = ratio_w * k - w
-                w2lambda = 1.0 - w1lambda
-
-                out[:, :, i, j, k] = \
-                    d2lambda * \
-                    (h2lambda * (w2lambda * input[:, :, d, h, w] + \
-                              w1lambda * input[:, :, d, h, w+wid]) + \
-                    h1lambda * (w2lambda * input[:, :, d, h+hid, w] + \
-                              w1lambda * input[:, :, d, h+hid, w+wid])) + \
-                    d1lambda * \
-                    (h2lambda * (w2lambda * input[:, :, d+did, h, w] + \
-                              w1lambda * input[:, :, d+did, h, w+wid]) + \
-                    h1lambda * (w2lambda * input[:, :, d+did, h+hid, w] + \
-                              w1lambda * input[:, :, d+did, h+hid, w+wid]))
-    if data_layout == "NDHWC":
-        out = np.transpose(out, (0, 2, 3, 4, 1))  # NCDHW => NDHWC
-
-    return out.astype(input.dtype)
-
-
-class TestTrilinearInterpOp(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.data_layout = 'NCDHW'
-        self.init_test_case()
-        self.op_type = "trilinear_interp"
-        input_np = np.random.random(self.input_shape).astype("float32")
-
-        if self.data_layout == "NCDHW":
-            in_d = self.input_shape[2]
-            in_h = self.input_shape[3]
-            in_w = self.input_shape[4]
-        else:
-            in_d = self.input_shape[1]
-            in_h = self.input_shape[2]
-            in_w = self.input_shape[3]
-
-        if self.scale > 0:
-            out_d = int(in_d * self.scale)
-            out_h = int(in_h * self.scale)
-            out_w = int(in_w * self.scale)
-        else:
-            out_d = self.out_d
-            out_h = self.out_h
-            out_w = self.out_w
-
-        output_np = trilinear_interp_np(
-            input_np, out_d, out_h, out_w, self.out_size, self.actual_shape,
-            self.align_corners, self.align_mode, self.data_layout)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-        if self.actual_shape is not None:
-            self.inputs['OutSize'] = self.actual_shape
-        # c++ end treat NCDHW the same way as NCHW
-        if self.data_layout == 'NCDHW':
-            data_layout = 'NCHW'
-        else:
-            data_layout = 'NHWC'
-        self.attrs = {
-            'out_d': self.out_d,
-            'out_h': self.out_h,
-            'out_w': self.out_w,
-            'scale': self.scale,
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-            'align_mode': self.align_mode,
-            'data_layout': data_layout
-        }
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 4, 4, 4]
-        self.out_d = 2
-        self.out_h = 2
-        self.out_w = 2
-        self.scale = 0.
-        self.out_size = np.array([3, 3, 3]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase1(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 1, 7, 8, 9]
-        self.out_d = 1
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase2(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 9, 6, 8]
-        self.out_d = 12
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase3(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [3, 2, 16, 8, 4]
-        self.out_d = 32
-        self.out_h = 16
-        self.out_w = 8
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase4(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [4, 1, 7, 8, 9]
-        self.out_d = 1
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.out_size = np.array([2, 2, 2]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase5(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [3, 3, 9, 6, 8]
-        self.out_d = 12
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.out_size = np.array([11, 11, 11]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase6(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [1, 1, 16, 8, 4]
-        self.out_d = 8
-        self.out_h = 32
-        self.out_w = 16
-        self.scale = 0.
-        self.out_size = np.array([17, 9, 5]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpSame(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [1, 1, 16, 8, 4]
-        self.out_d = 16
-        self.out_h = 8
-        self.out_w = 4
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpSameHW(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [1, 1, 16, 8, 4]
-        self.out_d = 8
-        self.out_h = 8
-        self.out_w = 4
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpActualShape(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [3, 2, 16, 8, 4]
-        self.out_d = 64
-        self.out_h = 32
-        self.out_w = 16
-        self.scale = 0.
-        self.out_size = np.array([33, 19, 7]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpDatalayout(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 4, 4, 4, 3]
-        self.out_d = 2
-        self.out_h = 2
-        self.out_w = 2
-        self.scale = 0.
-        self.out_size = np.array([3, 3, 3]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-        self.data_layout = "NDHWC"
-
-
-class TestTrilinearInterpOpUint8(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.init_test_case()
-        self.op_type = "trilinear_interp"
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
-
-        if self.scale > 0:
-            out_d = int(self.input_shape[2] * self.scale)
-            out_h = int(self.input_shape[3] * self.scale)
-            out_w = int(self.input_shape[4] * self.scale)
-        else:
-            out_d = self.out_d
-            out_h = self.out_h
-            out_w = self.out_w
-
-        output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
-                                        self.out_size, self.actual_shape,
-                                        self.align_corners, self.align_mode)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-
-        self.attrs = {
-            'out_d': self.out_d,
-            'out_h': self.out_h,
-            'out_w': self.out_w,
-            'scale': self.scale,
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-            'align_mode': self.align_mode
-        }
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output_with_place(place=core.CPUPlace(), atol=1)
-
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [1, 3, 9, 6, 8]
-        self.out_d = 13
-        self.out_h = 10
-        self.out_w = 9
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase1Uint8(TestTrilinearInterpOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 16, 8, 4]
-        self.out_d = 13
-        self.out_h = 7
-        self.out_w = 2
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase2Uint8(TestTrilinearInterpOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [4, 1, 7, 8, 9]
-        self.out_d = 3
-        self.out_h = 5
-        self.out_w = 13
-        self.scale = 0.
-        self.out_size = np.array([6, 15, 21]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpOtherMethod1(TestTrilinearInterpOp):
-    def set_align_mode(self):
-        self.align_corners = False
-        self.align_mode = 1
-
-
-class TestTrilinearInterpWithMethod2(TestTrilinearInterpOp):
-    def set_align_mode(self):
-        self.align_corners = False
-        self.align_mode = 0
-
-
-class TestTrilinearInterpWithMethod3(TestTrilinearInterpOp):
-    def set_align_mode(self):
-        self.align_corners = True
-        self.align_mode = 0
-
-
-class TestTrilinearInterpScale1(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 5, 7, 9]
-        self.out_d = 82
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 2.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpScale2(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 5, 7, 9]
-        self.out_d = 82
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 1.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpScale3(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 5, 7, 9]
-        self.out_d = 82
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 1.5
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpZero(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 5, 7, 11]
-        self.out_d = 82
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 0.2
-        self.align_corners = False
-        self.align_mode = 0
-
-
-class TestTrilinearInterpOp_attr_tensor(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.init_test_case()
-        self.op_type = "trilinear_interp"
-        self.shape_by_1Dtensor = False
-        self.scale_by_1Dtensor = False
-        self.attrs = {
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-            'align_mode': self.align_mode
-        }
-
-        input_np = np.random.random(self.input_shape).astype("float32")
-        self.inputs = {'X': input_np}
-
-        if self.scale_by_1Dtensor:
-            self.inputs['Scale'] = np.array([self.scale]).astype("float32")
-        elif self.scale > 0:
-            out_d = int(self.input_shape[2] * self.scale)
-            out_h = int(self.input_shape[3] * self.scale)
-            out_w = int(self.input_shape[4] * self.scale)
-            self.attrs['scale'] = self.scale
-        else:
-            out_d = self.out_d
-            out_h = self.out_h
-            out_w = self.out_w
-
-        if self.shape_by_1Dtensor:
-            self.inputs['OutSize'] = self.out_size
-        elif self.out_size is not None:
-            size_tensor = []
-            for index, ele in enumerate(self.out_size):
-                size_tensor.append(("x" + str(index), np.ones(
-                    (1)).astype('int32') * ele))
-            self.inputs['SizeTensor'] = size_tensor
-
-        self.attrs['out_d'] = self.out_d
-        self.attrs['out_h'] = self.out_h
-        self.attrs['out_w'] = self.out_w
-        output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
-                                        self.out_size, self.actual_shape,
-                                        self.align_corners, self.align_mode)
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 4, 4, 4]
-        self.out_d = 2
-        self.out_h = 3
-        self.out_w = 3
-        self.scale = 0.
-        self.out_size = [2, 3, 3]
-        self.align_corners = True
-        self.align_mode = 1
-
-
-# out_size is a 1-D tensor
-class TestTrilinearInterp_attr_tensor_Case1(TestTrilinearInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [3, 2, 9, 6, 8]
-        self.out_d = 32
-        self.out_h = 16
-        self.out_w = 8
-        self.scale = 0.3
-        self.out_size = [12, 4, 4]
-        self.align_corners = True
-        self.align_mode = 1
-
-
-# scale is a 1-D tensor
-class TestTrilinearInterp_attr_tensor_Case2(TestTrilinearInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 8, 8, 4]
-        self.out_d = 16
-        self.out_h = 12
-        self.out_w = 4
-        self.scale = 0.
-        self.out_size = [16, 4, 10]
-        self.align_corners = True
-        self.align_mode = 1
-        self.shape_by_1Dtensor = True
-
-
-# scale is a 1-D tensor
-class TestTrilinearInterp_attr_tensor_Case3(TestTrilinearInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 8, 8, 4]
-        self.out_d = 16
-        self.out_h = 16
-        self.out_w = 8
-        self.scale = 2.0
-        self.out_size = None
-        self.align_corners = True
-        self.align_mode = 1
-        self.scale_by_1Dtensor = True
-
-
-class TestTrilinearInterpAPI(OpTest):
-    def test_case(self):
-        x = fluid.layers.data(name="x", shape=[3, 6, 9, 4], dtype="float32")
-        y = fluid.layers.data(name="y", shape=[6, 9, 4, 3], dtype="float32")
-
-        dim = fluid.layers.data(name="dim", shape=[1], dtype="int32")
-        shape_tensor = fluid.layers.data(
-            name="shape_tensor",
-            shape=[3],
-            dtype="int32",
-            append_batch_size=False)
-        actual_size = fluid.layers.data(
-            name="actual_size",
-            shape=[3],
-            dtype="int32",
-            append_batch_size=False)
-        scale_tensor = fluid.layers.data(
-            name="scale_tensor",
-            shape=[1],
-            dtype="float32",
-            append_batch_size=False)
-
-        out1 = fluid.layers.resize_trilinear(
-            y, out_shape=[12, 18, 8], data_format='NDHWC')
-        out2 = fluid.layers.resize_trilinear(x, out_shape=[12, dim, 8])
-        out3 = fluid.layers.resize_trilinear(x, out_shape=shape_tensor)
-        out4 = fluid.layers.resize_trilinear(
-            x, out_shape=[4, 4, 8], actual_shape=actual_size)
-        out5 = fluid.layers.resize_trilinear(x, scale=scale_tensor)
-
-        x_data = np.random.random((1, 3, 6, 9, 4)).astype("float32")
-        dim_data = np.array([18]).astype("int32")
-        shape_data = np.array([12, 18, 8]).astype("int32")
-        actual_size_data = np.array([12, 18, 8]).astype("int32")
-        scale_data = np.array([2.0]).astype("float32")
-
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        results = exe.run(fluid.default_main_program(),
-                          feed={
-                              "x": x_data,
-                              "y": np.transpose(x_data, (0, 2, 3, 4, 1)),
-                              "dim": dim_data,
-                              "shape_tensor": shape_data,
-                              "actual_size": actual_size_data,
-                              "scale_tensor": scale_data
-                          },
-                          fetch_list=[out1, out2, out3, out4, out5],
-                          return_numpy=True)
-
-        expect_res = trilinear_interp_np(
-            x_data, out_d=12, out_h=18, out_w=8, align_mode=1)
-        self.assertTrue(
-            np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 4, 1))))
-        for i in range(len(results) - 1):
-            self.assertTrue(np.allclose(results[i + 1], expect_res))
-
-    def test_exception(self):
-        input = fluid.layers.data(
-            name="input", shape=[3, 6, 9, 4], dtype="float32")
-        try:
-            # for 5-D input, data_format only can be NCDHW or NDHWC
-            out = fluid.layers.resize_trilinear(
-                input, out_shape=[4, 8, 4], data_format='NHWC')
-        except:
-            pass
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py
deleted file mode 100644
index 4abeae77d26e8def85596aefc6c2f89cd4e4d6f0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.executor import Executor
-
-
-class TestTrunctedGaussianRandomOp(unittest.TestCase):
-    def setUp(self):
-        self.op_type = "truncated_gaussian_random"
-        self.inputs = {}
-        self.attrs = {
-            "shape": [10000],
-            "mean": .0,
-            "std": 1.,
-            "seed": 10,
-        }
-
-        self.outputs = ["Out"]
-
-    def test_cpu(self):
-        self.gaussian_random_test(place=fluid.CPUPlace())
-
-    def test_gpu(self):
-        if core.is_compiled_with_cuda():
-            self.gaussian_random_test(place=fluid.CUDAPlace(0))
-
-    def gaussian_random_test(self, place):
-
-        program = fluid.Program()
-        block = program.global_block()
-        vout = block.create_var(name="Out")
-        op = block.append_op(
-            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
-
-        op.desc.infer_var_type(block.desc)
-        op.desc.infer_shape(block.desc)
-
-        fetch_list = []
-        for var_name in self.outputs:
-            fetch_list.append(block.var(var_name))
-
-        exe = Executor(place)
-        outs = exe.run(program, fetch_list=fetch_list)
-        tensor = outs[0]
-        self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
-        self.assertAlmostEqual(numpy.var(tensor), 0.773, delta=0.1)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unfold_op.py b/python/paddle/fluid/tests/unittests/test_unfold_op.py
deleted file mode 100644
index 379982b60682c166ddb737b6a009d1ea758c0729..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_unfold_op.py
+++ /dev/null
@@ -1,102 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import math
-import numpy as np
-import unittest
-from op_test import OpTest
-
-
-class TestUnfoldOp(OpTest):
-    """
-    This is for test on unfold Op
-    """
-
-    def init_data(self):
-        self.batch_size = 3
-        self.input_channels = 3
-        self.input_height = 20
-        self.input_width = 20
-        self.kernel_sizes = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1, 1, 1]
-        self.dilations = [1, 1]
-        input_shape = [
-            self.batch_size, self.input_channels, self.input_height,
-            self.input_width
-        ]
-        self.x = np.random.rand(*input_shape).astype(np.float32)
-
-    def calc_unfold(self):
-        output_shape = [0] * 3
-        output_shape[0] = self.batch_size
-        output_shape[1] = self.input_channels * self.kernel_sizes[
-            0] * self.kernel_sizes[1]
-        dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1
-        dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1
-        out_height = int((self.input_height + self.paddings[0] +
-                          self.paddings[2] - dkernel_h) / self.strides[0]) + 1
-        out_width = int((self.input_width + self.paddings[1] + self.paddings[3]
-                         - dkernel_w) / self.strides[1]) + 1
-        output_shape[2] = out_height * out_width
-        output = np.zeros(output_shape).astype(np.float32)
-        ############ calculate output ##############
-        for i in range(output_shape[0]):
-            for j in range(output_shape[1]):
-                for k in range(output_shape[2]):
-                    h_out = int(k / out_width)
-                    w_out = k % out_width
-                    w_offset = j % self.kernel_sizes[1]
-                    h_offset = int(j /
-                                   self.kernel_sizes[1]) % self.kernel_sizes[0]
-                    c_in = int(j /
-                               (self.kernel_sizes[0] * self.kernel_sizes[1]))
-                    h_in = h_offset * self.dilations[0] + h_out * self.strides[
-                        0] - self.paddings[0]
-                    w_in = w_offset * self.dilations[1] + w_out * self.strides[
-                        1] - self.paddings[1]
-                    if (h_in>=0 and h_in<self.input_height) and \
-                         (w_in>=0 and w_in<self.input_width):
-                        output[i, j, k] = self.x[i, c_in, h_in, w_in]
-
-        self.outputs = output
-
-    def set_data(self):
-        self.init_data()
-        self.calc_unfold()
-
-        self.inputs = {'X': self.x}
-        self.attrs = {
-            'kernel_sizes': self.kernel_sizes,
-            'paddings': self.paddings,
-            'dilations': self.dilations,
-            'strides': self.strides
-        }
-        self.outputs = {'Y': self.outputs}
-
-    def setUp(self):
-        self.op_type = 'unfold'
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py
deleted file mode 100644
index 7b8be24d9da8c15eeb52c0ba207ea780b03254f8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestUniformRandomBatchSizeLike(OpTest):
-    def setUp(self):
-        self.op_type = "uniform_random_batch_size_like"
-        self.inputs = {'Input': np.zeros((500, 2000), dtype="float32")}
-        self.attrs = {'min': 1., 'max': 2., 'shape': [-1, 2000]}
-        self.outputs = {'Out': np.zeros((500, 2000), dtype='float32')}
-
-    def test_check_output(self):
-        self.check_output_customized(self.verify_output)
-
-    def verify_output(self, outs):
-        self.assertEqual(outs[0].shape, (500, 2000))
-        hist, _ = np.histogram(outs[0], range=(1, 2))
-        hist = hist.astype("float32")
-        hist /= float(outs[0].size)
-        prob = 0.1 * np.ones((10))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
deleted file mode 100644
index cb54369ab247b409782ecdab348fc5a53dacbd77..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ /dev/null
@@ -1,162 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import paddle.fluid as fluid
-
-
-def output_hist(out):
-    hist, _ = np.histogram(out, range=(-5, 10))
-    hist = hist.astype("float32")
-    hist /= float(out.size)
-    prob = 0.1 * np.ones((10))
-    return hist, prob
-
-
-def output_hist_diag(out):
-    diag_num = min(out.shape)
-    for i in range(diag_num):
-        assert abs(out[i][i] - 1.0) < 1e-9
-        # ignore diagonal elements
-        out[i][i] = 100
-    hist, _ = np.histogram(out, range=(-5, 10))
-    hist = hist.astype("float32")
-    hist /= float(out.size)
-    prob = 0.1 * np.ones((10))
-    return hist, prob
-
-
-class TestUniformRandomOp(OpTest):
-    def setUp(self):
-        self.op_type = "uniform_random"
-        self.inputs = {}
-        self.init_attrs()
-        self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}
-
-    def init_attrs(self):
-        self.attrs = {
-            "shape": [1000, 784],
-            "min": -5.0,
-            "max": 10.0,
-            "seed": 10
-        }
-        self.output_hist = output_hist
-
-    def test_check_output(self):
-        self.check_output_customized(self.verify_output)
-
-    def verify_output(self, outs):
-        hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
-
-
-class TestUniformRandomOpWithDiagInit(TestUniformRandomOp):
-    def init_attrs(self):
-        self.attrs = {
-            "shape": [1000, 784],
-            "min": -5.0,
-            "max": 10.0,
-            "seed": 10,
-            "diag_num": 784,
-            "diag_step": 784,
-            "diag_val": 1.0
-        }
-        self.output_hist = output_hist_diag
-
-
-class TestUniformRandomOpSelectedRows(unittest.TestCase):
-    def get_places(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        return places
-
-    def test_check_output(self):
-        for place in self.get_places():
-            self.check_with_place(place)
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-        out = scope.var("X").get_selected_rows()
-
-        op = Operator(
-            "uniform_random",
-            Out="X",
-            shape=[4, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10)
-        op.run(scope, place)
-        self.assertEqual(out.get_tensor().shape(), [4, 784])
-        hist, prob = output_hist(np.array(out.get_tensor()))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
-
-
-class TestUniformRandomOpSelectedRowsWithDiagInit(
-        TestUniformRandomOpSelectedRows):
-    def check_with_place(self, place):
-        scope = core.Scope()
-        out = scope.var("X").get_selected_rows()
-
-        op = Operator(
-            "uniform_random",
-            Out="X",
-            shape=[4, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10,
-            diag_num=4,
-            diag_step=784,
-            diag_val=1.0)
-        op.run(scope, place)
-        self.assertEqual(out.get_tensor().shape(), [4, 784])
-        hist, prob = output_hist_diag(np.array(out.get_tensor()))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
-
-
-class TestUniformRandomOpApi(unittest.TestCase):
-    def test_api(self):
-        x = fluid.layers.data('x', shape=[16], dtype='float32', lod_level=1)
-        y = fluid.layers.fc(x,
-                            size=16,
-                            param_attr=fluid.initializer.Uniform(
-                                low=-0.5,
-                                high=0.5,
-                                seed=10,
-                                diag_num=16,
-                                diag_step=16,
-                                diag_val=1.0))
-
-        place = fluid.CPUPlace()
-        x_tensor = fluid.create_lod_tensor(
-            np.random.rand(3, 16).astype("float32"), [[1, 2]], place)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x_tensor}, fetch_list=[y], return_numpy=False)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py
deleted file mode 100644
index 2e91574954ed607379b14d564f438cbad648b1b1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_unique.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestUniqueOp(OpTest):
-    def setUp(self):
-        self.op_type = "unique"
-        self.init_config()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def init_config(self):
-        self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64'), }
-        self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
-        self.outputs = {
-            'Out': np.array(
-                [2, 3, 1, 5], dtype='int64'),
-            'Index': np.array(
-                [0, 1, 1, 2, 3, 1], dtype='int32')
-        }
-
-
-class TestOne(TestUniqueOp):
-    def init_config(self):
-        self.inputs = {'X': np.array([2], dtype='int64'), }
-        self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
-        self.outputs = {
-            'Out': np.array(
-                [2], dtype='int64'),
-            'Index': np.array(
-                [0], dtype='int32')
-        }
-
-
-class TestRandom(TestUniqueOp):
-    def init_config(self):
-        self.inputs = {'X': np.random.randint(0, 100, (150, ), dtype='int64')}
-        self.attrs = {'dtype': int(core.VarDesc.VarType.INT64)}
-        np_unique, np_index, reverse_index = np.unique(self.inputs['X'], True,
-                                                       True)
-        np_tuple = [(np_unique[i], np_index[i]) for i in range(len(np_unique))]
-        np_tuple.sort(key=lambda x: x[1])
-        target_out = np.array([i[0] for i in np_tuple], dtype='int64')
-        target_index = np.array(
-            [list(target_out).index(i) for i in self.inputs['X']],
-            dtype='int64')
-
-        self.outputs = {'Out': target_out, 'Index': target_index}
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unique_name.py b/python/paddle/fluid/tests/unittests/test_unique_name.py
deleted file mode 100644
index b8c751b2e9b5a905d9de40fc5f78a02c6ca5e034..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_unique_name.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-
-
-class TestUniqueName(unittest.TestCase):
-    def test_guard(self):
-        with fluid.unique_name.guard():
-            name_1 = fluid.unique_name.generate('')
-
-        with fluid.unique_name.guard():
-            name_2 = fluid.unique_name.generate('')
-
-        self.assertEqual(name_1, name_2)
-
-        with fluid.unique_name.guard("A"):
-            name_1 = fluid.unique_name.generate('')
-
-        with fluid.unique_name.guard('B'):
-            name_2 = fluid.unique_name.generate('')
-
-        self.assertNotEqual(name_1, name_2)
-
-    def test_generate(self):
-        with fluid.unique_name.guard():
-            name1 = fluid.unique_name.generate('fc')
-            name2 = fluid.unique_name.generate('fc')
-            name3 = fluid.unique_name.generate('tmp')
-            self.assertNotEqual(name1, name2)
-            self.assertEqual(name1[-2:], name3[-2:])
diff --git a/python/paddle/fluid/tests/unittests/test_unique_with_counts.py b/python/paddle/fluid/tests/unittests/test_unique_with_counts.py
deleted file mode 100644
index 80056422a2a6cf4d52cbde3941fde5d559cd149e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_unique_with_counts.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestUniqueWithCountsOp(OpTest):
-    def setUp(self):
-        self.op_type = "unique_with_counts"
-        self.init_config()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def init_config(self):
-        self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64'), }
-        self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
-        self.outputs = {
-            'Out': np.array(
-                [2, 3, 1, 5], dtype='int64'),
-            'Index': np.array(
-                [0, 1, 1, 2, 3, 1], dtype='int32'),
-            'Count': np.array(
-                [1, 3, 1, 1], dtype='int32')
-        }
-
-
-class TestOne(TestUniqueWithCountsOp):
-    def init_config(self):
-        self.inputs = {'X': np.array([2], dtype='int64'), }
-        self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
-        self.outputs = {
-            'Out': np.array(
-                [2], dtype='int64'),
-            'Index': np.array(
-                [0], dtype='int32'),
-            'Count': np.array(
-                [1], dtype='int32')
-        }
-
-
-class TestRandom(TestUniqueWithCountsOp):
-    def init_config(self):
-        input_data = np.random.randint(0, 100, (2000, ), dtype='int64')
-        self.inputs = {'X': input_data}
-        self.attrs = {'dtype': int(core.VarDesc.VarType.INT64)}
-        np_unique, np_index, reverse_index = np.unique(self.inputs['X'], True,
-                                                       True)
-        np_tuple = [(np_unique[i], np_index[i]) for i in range(len(np_unique))]
-        np_tuple.sort(key=lambda x: x[1])
-        target_out = np.array([i[0] for i in np_tuple], dtype='int64')
-        target_index = np.array(
-            [list(target_out).index(i) for i in self.inputs['X']],
-            dtype='int64')
-        count = [0 for i in range(len(np_unique))]
-        for i in range(target_index.shape[0]):
-            count[target_index[i]] += 1
-        target_count = np.array(count, dtype='int64')
-        self.outputs = {
-            'Out': target_out,
-            'Index': target_index,
-            'Count': target_count
-        }
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py
deleted file mode 100644
index b0c7c3c8662e217f4e88245f22f6b50e7a48c8b7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_unpool_op.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
-    s0, s1, s2, s3 = input.shape
-    out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
-    out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
-    out = np.zeros((s0, s1, out_hsize, out_wsize))
-    for nidx in range(s0):
-        for cidx in range(s1):
-            for h in range(s2):
-                for w in range(s3):
-                    index = indices[nidx, cidx, h, w]
-                    hidx = (index - index % out_wsize) // out_wsize
-                    widx = index % out_wsize
-                    out[nidx, cidx, int(hidx), int(widx)] = \
-                            input[nidx, cidx, h, w]
-
-    return out
-
-
-class TestUnpoolOp(OpTest):
-    def setUp(self):
-        self.op_type = "unpool"
-        self.init_test_case()
-        pre_input = np.random.random(self.shape).astype("float32")
-        nsize, csize, hsize, wsize = pre_input.shape
-        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) // \
-                self.strides[0] + 1
-        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) // \
-                self.strides[1] + 1
-        input = np.zeros((nsize, csize, hsize_out, wsize_out))
-        indices = np.zeros((nsize, csize, hsize_out, wsize_out))
-        for i in range(hsize_out):
-            for j in range(wsize_out):
-                r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
-                r_end = np.min((i * self.strides[0] + self.ksize[0] - \
-                        self.paddings[0], hsize))
-                c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
-                c_end = np.min((j * self.strides[1] + self.ksize[1] - \
-                        self.paddings[1], wsize))
-                for nidx in range(nsize):
-                    for cidx in range(csize):
-                        x_masked = pre_input[nidx, cidx, r_start:r_end, \
-                                c_start:c_end]
-                        input[nidx, cidx, i, j] = x_masked.max()
-                        arg = x_masked.argmax()
-                        indices[nidx, cidx, i, j] = \
-                                (r_start + arg // self.ksize[1]) * wsize + \
-                                c_start + arg % self.ksize[1]
-        output = self.unpool2d_forward_naive(input, indices, self.ksize, \
-                self.strides, self.paddings).astype("float32")
-        self.inputs = {
-            'X': input.astype('float32'),
-            'Indices': indices.astype('int32')
-        }
-        self.attrs = {
-            'strides': self.strides,
-            'paddings': self.paddings,
-            'ksize': self.ksize,
-            'unpooling_type': self.unpooling_type,
-        }
-        self.outputs = {'Out': output.astype('float32')}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-    def init_test_case(self):
-        self.unpool2d_forward_naive = unpool2dmax_forward_naive
-        self.unpooling_type = "max"
-        self.shape = [6, 4, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [2, 2]
-        self.paddings = [0, 0]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py
deleted file mode 100644
index 14dd2bb06f9a18d0b15a4aee4e9e6bfdf8c41206..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-from op_test import OpTest
-
-
-# Correct: General.
-class TestUnsqueezeOp(OpTest):
-    def setUp(self):
-        self.init_test_case()
-        self.op_type = "unsqueeze2"
-        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
-        self.init_attrs()
-        self.outputs = {
-            "Out": self.inputs["X"].reshape(self.new_shape),
-            "XShape": np.random.random(self.ori_shape).astype("float32")
-        }
-
-    def test_check_output(self):
-        self.check_output(no_check_set=["XShape"])
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (1, 2)
-        self.new_shape = (3, 1, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes}
-
-
-# Correct: Single input index.
-class TestUnsqueezeOp1(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (-1, )
-        self.new_shape = (3, 5, 1)
-
-
-# Correct: Mixed input axis.
-class TestUnsqueezeOp2(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, -1)
-        self.new_shape = (1, 3, 5, 1)
-
-
-# Correct: There is duplicated axis.
-class TestUnsqueezeOp3(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 2, 5)
-        self.axes = (0, 3, 3)
-        self.new_shape = (1, 3, 2, 1, 1, 5)
-
-
-# Correct: Reversed axes.
-class TestUnsqueezeOp4(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 2, 5)
-        self.axes = (3, 1, 1)
-        self.new_shape = (3, 1, 1, 2, 5, 1)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
deleted file mode 100644
index a324438ba5a3c3b57fd956bd11189ef7d50267e2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-
-from op_test import OpTest
-
-
-# Correct: General.
-class TestUnsqueezeOp(OpTest):
-    def setUp(self):
-        self.init_test_case()
-        self.op_type = "unsqueeze"
-        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
-        self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (1, 2)
-        self.new_shape = (3, 1, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes}
-
-
-# Correct: Single input index.
-class TestUnsqueezeOp1(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (-1, )
-        self.new_shape = (3, 5, 1)
-
-
-# Correct: Mixed input axis.
-class TestUnsqueezeOp2(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, -1)
-        self.new_shape = (1, 3, 5, 1)
-
-
-# Correct: There is duplicated axis.
-class TestUnsqueezeOp3(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 2, 5)
-        self.axes = (0, 3, 3)
-        self.new_shape = (1, 3, 2, 1, 1, 5)
-
-
-# Correct: Reversed axes.
-class TestUnsqueezeOp4(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 2, 5)
-        self.axes = (3, 1, 1)
-        self.new_shape = (3, 1, 1, 2, 5, 1)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unstack_op.py b/python/paddle/fluid/tests/unittests/test_unstack_op.py
deleted file mode 100644
index 7cbac8928ec40dc3e1c0e91e7779ec9ec978d884..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_unstack_op.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from op_test import OpTest
-import numpy as np
-import unittest
-
-
-class TestUnStackOpBase(OpTest):
-    def initDefaultParameters(self):
-        self.input_dim = (5, 6, 7)
-        self.axis = 0
-        self.dtype = 'float32'
-
-    def initParameters(self):
-        pass
-
-    def get_y_names(self):
-        y_names = []
-        for i in range(self.input_dim[self.axis]):
-            y_names.append('y{}'.format(i))
-        return y_names
-
-    def setUp(self):
-        self.initDefaultParameters()
-        self.initParameters()
-        self.op_type = 'unstack'
-        self.x = np.random.random(size=self.input_dim).astype(self.dtype)
-
-        outs = np.split(self.x, self.input_dim[self.axis], self.axis)
-        new_shape = list(self.input_dim)
-        del new_shape[self.axis]
-        y_names = self.get_y_names()
-        tmp = []
-        for i in range(self.input_dim[self.axis]):
-            tmp.append((y_names[i], np.reshape(outs[i], new_shape)))
-
-        self.inputs = {'X': self.x}
-        self.outputs = {'Y': tmp}
-        self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad('X', self.get_y_names())
-
-
-class TestStackOp3(TestUnStackOpBase):
-    def initParameters(self):
-        self.axis = -1
-
-
-class TestStackOp4(TestUnStackOpBase):
-    def initParameters(self):
-        self.axis = -3
-
-
-class TestStackOp5(TestUnStackOpBase):
-    def initParameters(self):
-        self.axis = 1
-
-
-class TestStackOp6(TestUnStackOpBase):
-    def initParameters(self):
-        self.axis = 2
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_var_conv_2d.py b/python/paddle/fluid/tests/unittests/test_var_conv_2d.py
deleted file mode 100644
index 60a0dcae8541a0a40da8af5836d1859aa6e62340..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_var_conv_2d.py
+++ /dev/null
@@ -1,305 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestVarConv2dOp(OpTest):
-    def setUp(self):
-        self.init_op_type()
-        self.set_data()
-        self.compute()
-
-    def init_op_type(self):
-        self.op_type = "var_conv_2d"
-
-    def set_data(self):
-        input_channel = 3
-        output_channel = 2
-        filter_size = [2, 3]
-        stride = [1, 1]
-        row = [2, 4]
-        col = [3, 2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-    def init_data(self, input_channel, output_channel, filter_size, stride, row,
-                  col):
-
-        feature = [row[i] * col[i] for i in range(len(row))]
-        numel = sum(feature) * input_channel
-        x_data = np.random.random((numel, 1)).astype('float32')
-        x_lod = [[x * input_channel for x in feature]]
-        row_data = np.random.random((sum(row), 10)).astype('float32')
-        col_data = np.random.random((sum(col), 10)).astype('float32')
-        w_shape = (output_channel,
-                   input_channel * filter_size[0] * filter_size[1])
-        w_data = np.random.random(w_shape).astype('float32')
-        self.inputs = {
-            'X': (x_data, x_lod),
-            'ROW': (row_data, [row]),
-            'COLUMN': (col_data, [col]),
-            'W': w_data
-        }
-        self.attrs = {
-            'InputChannel': input_channel,
-            'OutputChannel': output_channel,
-            'StrideH': stride[0],
-            'StrideW': stride[1],
-            'KernelH': filter_size[0],
-            'KernelW': filter_size[1],
-        }
-
-    def compute(self):
-        in_ch = self.attrs['InputChannel']
-        out_ch = self.attrs['OutputChannel']
-        kernel_h = self.attrs['KernelH']
-        kernel_w = self.attrs['KernelW']
-        stride_h = self.attrs['StrideH']
-        stride_w = self.attrs['StrideW']
-        row_data, row_lod = self.inputs['ROW']
-        col_data, col_lod = self.inputs['COLUMN']
-        x_data, x_lod = self.inputs['X']
-        w_data = self.inputs['W']
-        out_data = np.zeros((0, 1)).astype('float32')
-
-        col_res_data, col_res_lod = self.Im2Col()
-        out_lod = [[]]
-        col_data_offset = 0
-        batch_size = len(x_lod[0])
-        for idx in range(batch_size):
-            width = col_lod[0][idx]
-            height = row_lod[0][idx]
-            top_im_x = 0
-            if width != 0:
-                top_im_x = (width - 1) // stride_w + 1
-            top_im_y = 0
-            if height != 0:
-                top_im_y = (height - 1) // stride_h + 1
-            top_im_size = top_im_x * top_im_y
-            out_lod[0].append(out_ch * top_im_size)
-            if top_im_size == 0:
-                out_tmp = np.zeros((out_ch * top_im_size, 1)).astype('float32')
-            else:
-                col_batch_data = col_res_data[col_data_offset:col_data_offset +
-                                              col_res_lod[0][idx]]
-                gemm_shape = (in_ch * kernel_h * kernel_w, top_im_size)
-                col_batch_data = col_batch_data.reshape(gemm_shape)
-                out_tmp = np.dot(w_data, col_batch_data).reshape(-1, 1)
-            out_data = np.vstack((out_data, out_tmp))
-
-            col_data_offset += col_res_lod[0][idx]
-
-        self.outputs = {
-            'Out': (out_data.astype('float32'), out_lod),
-            'Col': (col_res_data, col_res_lod)
-        }
-
-    def Im2Col(self):
-        in_ch = self.attrs['InputChannel']
-        kernel_h = self.attrs['KernelH']
-        kernel_w = self.attrs['KernelW']
-        stride_h = self.attrs['StrideH']
-        stride_w = self.attrs['StrideW']
-        row_data, row_lod = self.inputs['ROW']
-        col_data, col_lod = self.inputs['COLUMN']
-        x_data, x_lod = self.inputs['X']
-        col_res_lod = [[]]
-        top_size = 0
-        batch_size = len(x_lod[0])
-        for idx in range(batch_size):
-            width = col_lod[0][idx]
-            height = row_lod[0][idx]
-            top_im_x = 0
-            if width != 0:
-                top_im_x = (width - 1) // stride_w + 1
-            top_im_y = 0
-            if height != 0:
-                top_im_y = (height - 1) // stride_h + 1
-            top_x = top_im_x * top_im_y
-            top_y = in_ch * kernel_h * kernel_w
-            col_res_lod[0].append(top_x * top_y)
-            top_size += top_x * top_y
-
-        col_res = np.zeros((top_size, 1)).astype('float32')
-
-        kernel_win_size = kernel_h * kernel_w
-        half_kernel_h = kernel_h // 2
-        half_kernel_w = kernel_w // 2
-        t_offset, b_offset = 0, 0
-        for idx in range(batch_size):
-            width = col_lod[0][idx]
-            height = row_lod[0][idx]
-            if width == 0 or height == 0:
-                continue
-            top_im_x = (width - 1) // stride_w + 1
-            top_im_y = (height - 1) // stride_h + 1
-            top_x = top_im_x * top_im_y
-            for z in range(in_ch):
-                row_offset = kernel_win_size * z
-                im_offset = z * width * height
-                for y in range(0, height, stride_h):
-                    for x in range(0, width, stride_w):
-                        col_offset = x // stride_w + y // stride_h * top_im_x
-                        for ky in range(kernel_h):
-                            for kx in range(kernel_w):
-                                im_y = y + ky - half_kernel_h
-                                im_x = x + kx - half_kernel_w
-                                if im_x >= 0 and im_x < width and im_y >= 0 and im_y < height:
-                                    col_res[t_offset +
-                                        (row_offset + ky * kernel_w + kx) * top_x +
-                                        col_offset] = \
-                                    x_data[b_offset + im_offset + im_y * width + im_x]
-
-            t_offset += col_res_lod[0][idx]
-            b_offset += x_lod[0][idx]
-
-        return col_res, col_res_lod
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.005)
-
-
-class TestVarConv2dOpCase1(TestVarConv2dOp):
-    def set_data(self):
-        # set in_ch 1
-        input_channel = 1
-        output_channel = 2
-        filter_size = [2, 3]
-        stride = [1, 1]
-        row = [1, 4]
-        col = [3, 2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase2(TestVarConv2dOp):
-    def set_data(self):
-        # set out_ch 1
-        input_channel = 2
-        output_channel = 1
-        filter_size = [3, 3]
-        stride = [2, 2]
-        row = [4, 7]
-        col = [5, 2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase3(TestVarConv2dOp):
-    def set_data(self):
-        # set batch 1
-        input_channel = 2
-        output_channel = 1
-        filter_size = [3, 3]
-        stride = [2, 2]
-        row = [7]
-        col = [2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase4(TestVarConv2dOp):
-    def set_data(self):
-        # set filter size very large
-        input_channel = 3
-        output_channel = 4
-        filter_size = [6, 6]
-        stride = [2, 2]
-        row = [4, 7]
-        col = [5, 2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase5(TestVarConv2dOp):
-    def set_data(self):
-        # set input very small
-        input_channel = 5
-        output_channel = 3
-        filter_size = [3, 3]
-        stride = [1, 1]
-        row = [1, 1]
-        col = [1, 1]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase6(TestVarConv2dOp):
-    def set_data(self):
-        input_channel = 1
-        output_channel = 3
-        filter_size = [3, 3]
-        stride = [1, 1]
-        row = [1, 1]
-        col = [1, 1]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase7(TestVarConv2dOp):
-    def set_data(self):
-        input_channel = 2
-        output_channel = 3
-        filter_size = [3, 3]
-        stride = [1, 1]
-        row = [5, 4]
-        col = [6, 7]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dApi(unittest.TestCase):
-    def test_api(self):
-        import paddle.fluid as fluid
-
-        x = fluid.layers.data(name='x', shape=[1], lod_level=1)
-        row = fluid.layers.data(name='row', shape=[6], lod_level=1)
-        col = fluid.layers.data(name='col', shape=[6], lod_level=1)
-        out = fluid.contrib.var_conv_2d(
-            input=x,
-            row=row,
-            col=col,
-            input_channel=3,
-            output_channel=5,
-            filter_size=[3, 3],
-            stride=1)
-
-        place = fluid.CPUPlace()
-        x_tensor = fluid.create_lod_tensor(
-            np.random.rand(116, 1).astype('float32'), [[60, 56]], place)
-        row_tensor = fluid.create_lod_tensor(
-            np.random.rand(9, 6).astype('float32'), [[5, 4]], place)
-        col_tensor = fluid.create_lod_tensor(
-            np.random.rand(13, 6).astype('float32'), [[6, 7]], place)
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(
-            feed={'x': x_tensor,
-                  'row': row_tensor,
-                  'col': col_tensor},
-            fetch_list=[out],
-            return_numpy=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py
deleted file mode 100644
index d72511ee0f09451f7b8ade57c71c85821c3bf3b9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ /dev/null
@@ -1,187 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_, in_dygraph_mode
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-import numpy as np
-
-
-class TestVariable(unittest.TestCase):
-    def test_np_dtype_convert(self):
-        DT = core.VarDesc.VarType
-        convert = convert_np_dtype_to_dtype_
-        self.assertEqual(DT.FP32, convert(np.float32))
-        self.assertEqual(DT.FP16, convert("float16"))
-        self.assertEqual(DT.FP64, convert("float64"))
-        self.assertEqual(DT.INT32, convert("int32"))
-        self.assertEqual(DT.INT16, convert("int16"))
-        self.assertEqual(DT.INT64, convert("int64"))
-        self.assertEqual(DT.BOOL, convert("bool"))
-        self.assertEqual(DT.INT8, convert("int8"))
-        self.assertEqual(DT.UINT8, convert("uint8"))
-
-    def test_var(self):
-        b = default_main_program().current_block()
-        w = b.create_var(
-            dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
-        self.assertNotEqual(str(w), "")
-        self.assertEqual(core.VarDesc.VarType.FP64, w.dtype)
-        self.assertEqual((784, 100), w.shape)
-        self.assertEqual("fc.w", w.name)
-        self.assertEqual(0, w.lod_level)
-
-        w = b.create_var(name='fc.w')
-        self.assertEqual(core.VarDesc.VarType.FP64, w.dtype)
-        self.assertEqual((784, 100), w.shape)
-        self.assertEqual("fc.w", w.name)
-        self.assertEqual(0, w.lod_level)
-
-        self.assertRaises(ValueError,
-                          lambda: b.create_var(name="fc.w", shape=(24, 100)))
-
-    def test_step_scopes(self):
-        prog = Program()
-        b = prog.current_block()
-        var = b.create_var(
-            name='step_scopes', type=core.VarDesc.VarType.STEP_SCOPES)
-        self.assertEqual(core.VarDesc.VarType.STEP_SCOPES, var.type)
-
-    def _test_slice(self, place):
-        b = default_main_program().current_block()
-        w = b.create_var(dtype="float64", shape=[784, 100, 100], lod_level=0)
-
-        for i in range(3):
-            nw = w[i]
-            self.assertEqual((100, 100), nw.shape)
-
-        nw = w[:]
-        self.assertEqual((784, 100, 100), nw.shape)
-
-        nw = w[:, :]
-        self.assertEqual((784, 100, 100), nw.shape)
-
-        nw = w[:, :, -1]
-        self.assertEqual((784, 100), nw.shape)
-
-        nw = w[1, 1, 1]
-
-        self.assertEqual(len(nw.shape), 1)
-        self.assertEqual(nw.shape[0], 1)
-
-        nw = w[:, :, :-1]
-        self.assertEqual((784, 100, 99), nw.shape)
-
-        self.assertEqual(0, nw.lod_level)
-
-        main = fluid.Program()
-        with fluid.program_guard(main):
-            exe = fluid.Executor(place)
-            tensor_array = np.array(
-                [[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                 [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
-                 [[19, 20, 21], [22, 23, 24], [25, 26, 27]]]).astype('float32')
-            var = fluid.layers.assign(tensor_array)
-            var1 = var[0, 1, 1]
-            var2 = var[1:]
-            var3 = var[0:1]
-            var4 = var[::-1]
-            var5 = var[1, 1:, 1:]
-            var_reshape = fluid.layers.reshape(var, [3, -1, 3])
-            var6 = var_reshape[:, :, -1]
-            var7 = var[:, :, :-1]
-            var8 = var[:1, :1, :1]
-            var9 = var[:-1, :-1, :-1]
-            var10 = var[::-1, :1, :-1]
-            var11 = var[:-1, ::-1, -1:]
-            var12 = var[1:2, 2:, ::-1]
-            var13 = var[2:10, 2:, -2:-1]
-            var14 = var[1:-1, 0:2, ::-1]
-            var15 = var[::-1, ::-1, ::-1]
-
-            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-            y = fluid.layers.fc(input=x, size=1, act=None)
-            y_1 = y[:, 0]
-            feeder = fluid.DataFeeder(place=place, feed_list=[x])
-            data = []
-            data.append((np.random.randint(10, size=[13]).astype('float32')))
-            exe.run(fluid.default_startup_program())
-
-            local_out = exe.run(main,
-                                feed=feeder.feed([data]),
-                                fetch_list=[
-                                    var, var1, var2, var3, var4, var5, var6,
-                                    var7, var8, var9, var10, var11, var12,
-                                    var13, var14, var15
-                                ])
-
-            self.assertTrue(
-                np.array_equal(local_out[1], tensor_array[0, 1, 1:2]))
-            self.assertTrue(np.array_equal(local_out[2], tensor_array[1:]))
-            self.assertTrue(np.array_equal(local_out[3], tensor_array[0:1]))
-            self.assertTrue(np.array_equal(local_out[4], tensor_array[::-1]))
-            self.assertTrue(
-                np.array_equal(local_out[5], tensor_array[1, 1:, 1:]))
-            self.assertTrue(
-                np.array_equal(local_out[6],
-                               tensor_array.reshape((3, -1, 3))[:, :, -1]))
-            self.assertTrue(
-                np.array_equal(local_out[7], tensor_array[:, :, :-1]))
-            self.assertTrue(
-                np.array_equal(local_out[8], tensor_array[:1, :1, :1]))
-            self.assertTrue(
-                np.array_equal(local_out[9], tensor_array[:-1, :-1, :-1]))
-            self.assertTrue(
-                np.array_equal(local_out[10], tensor_array[::-1, :1, :-1]))
-            self.assertTrue(
-                np.array_equal(local_out[11], tensor_array[:-1, ::-1, -1:]))
-            self.assertTrue(
-                np.array_equal(local_out[12], tensor_array[1:2, 2:, ::-1]))
-            self.assertTrue(
-                np.array_equal(local_out[13], tensor_array[2:10, 2:, -2:-1]))
-            self.assertTrue(
-                np.array_equal(local_out[14], tensor_array[1:-1, 0:2, ::-1]))
-            self.assertTrue(
-                np.array_equal(local_out[15], tensor_array[::-1, ::-1, ::-1]))
-
-    def test_slice(self):
-        place = fluid.CPUPlace()
-        self._test_slice(place)
-
-        if core.is_compiled_with_cuda():
-            self._test_slice(core.CUDAPlace(0))
-
-    def _tostring(self):
-        b = default_main_program().current_block()
-        w = b.create_var(dtype="float64", lod_level=0)
-        self.assertTrue(isinstance(str(w), str))
-
-        if core.is_compiled_with_cuda():
-            wc = b.create_var(dtype="int", lod_level=0)
-            self.assertTrue(isinstance(str(wc), str))
-
-    def test_tostring(self):
-        with fluid.dygraph.guard():
-            self._tostring()
-
-        with fluid.program_guard(default_main_program()):
-            self._tostring()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_version.py b/python/paddle/fluid/tests/unittests/test_version.py
deleted file mode 100644
index 42a0e5c802c53ed0e6aad38fb9ab0f64122e87f5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_version.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import re
-
-import paddle.version as fluid_version
-
-
-class VersionTest(unittest.TestCase):
-    def setUp(self):
-        self._major_regex = "[0-9]+"
-        self._minor_regex = "[0-9]+"
-        self._patch_regex = "[0-9]+(\\.(a|b|rc)\\.[0-9]+)?"
-        self._rc_regex = "[0-9]+"
-        self._version_regex = "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?"
-        self._commit_regex = "[0-9a-f]{5,49}"
-
-    def test_check_output(self):
-        # check commit format
-        self.assertTrue(re.match(self._commit_regex, fluid_version.commit))
-        self.assertTrue(isinstance(fluid_version.istaged, bool))
-
-        # check version format
-        if fluid_version.istaged:
-            self.assertEqual(fluid_version.major, 0)
-            self.assertEqual(fluid_version.minor, 0)
-            self.assertEqual(fluid_version.patch, "0")
-            self.assertEqual(fluid_version.rc, 0)
-            self.assertEqual(fluid_version.full_version, "0.0.0")
-        else:
-            self.assertTrue(re.match(self._major_regex, fluid_version.major))
-            self.assertTrue(re.match(self._minor_regex, fluid_version.minor))
-            self.assertTrue(re.match(self._patch_regex, fluid_version.patch))
-            self.assertTrue(re.match(self._rc_regex, fluid_version.rc))
-            self.assertTrue(
-                re.match(self._version_regex, fluid_version.full_version))
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
deleted file mode 100644
index 74bb5ea2b02824e2605fb474229802c07e8e8d2c..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ /dev/null
@@ -1,337 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_softmax_op import stable_softmax
-
-CUDA_BLOCK_SIZE = 512
-
-
-class CTCForward(object):
-    def __init__(self, softmax, softmax_lod, labels, labels_lod, blank,
-                 norm_by_times):
-        self.softmax = softmax
-        self.softmax_lod = softmax_lod
-        assert labels.shape[1] == 1
-        self.labels = labels
-        self.labels_lod = labels_lod
-        self.blank = blank
-        self.norm_by_times = norm_by_times
-
-        self.level = 0
-        self.num_classes = softmax.shape[1]
-        self.batch_size = len(softmax_lod[self.level])
-        assert self.batch_size == len(labels_lod[self.level])
-
-        self.loss = np.zeros([self.batch_size, 1], dtype="float32")
-        self.gradient = np.zeros(self.softmax.shape, dtype="float32")
-
-        # float64
-        self.EXP_MAX = sys.float_info.max
-        self.EXP_MIN = sys.float_info.min
-        self.LOG_ZERO = np.log(self.EXP_MIN)
-        self.LOG_INFINITY = np.log(self.EXP_MAX)
-
-    def safe_exp(self, x):
-        if x <= self.LOG_ZERO:
-            return 0.0
-        if x >= self.LOG_INFINITY:
-            return self.EXP_MAX
-        return np.exp(x)
-
-    def safe_log(self, x):
-        if x <= self.EXP_MIN:
-            return self.LOG_ZERO
-        return np.log(x)
-
-    # x = lna and y = lnb are in log scale, ln(a / b) = lna - lnb
-    def log_div(self, x, y):
-        res = x - y
-        if res <= self.LOG_ZERO:
-            return self.LOG_ZERO
-        if res >= self.LOG_INFINITY:
-            return self.LOG_INFINITY
-        return res
-
-    # x = lna and y = lnb are in log scale, ln(a * b) = lna + lnb
-    def log_mul(self, x, y):
-        res = x + y
-        if res <= self.LOG_ZERO:
-            return self.LOG_ZERO
-        if res >= self.LOG_INFINITY:
-            return self.LOG_INFINITY
-        return res
-
-    # x = lna and y = lnb are in log scale,
-    # ln(a + b) = lna + ln(1 + exp(lnb - lna)), where b > a
-    def log_add(self, x, y):
-        if x < y:
-            t = y
-            y = x
-            x = t
-        return x + self.safe_log(1 + self.safe_exp(y - x))
-
-    def segment_range(self, time, total_times, total_segments):
-        start = max(0, total_segments - (2 * (total_times - time)))
-        end = min(total_segments, 2 * (time + 1))
-        return start, end
-
-    def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence):
-        total_times = softmax_a_sequence.shape[0]
-        total_segments = labels_a_sequence.shape[0] * 2 + 1
-
-        required_times = labels_a_sequence.shape[0]
-        old_label = -1
-        for i in range(labels_a_sequence.shape[0]):
-            # two contingous labels with the same value
-            if labels_a_sequence[i, 0] == old_label:
-                required_times = required_times + 1
-            old_label = labels_a_sequence[i, 0]
-
-        if total_times < required_times:
-            return 0
-
-        # calculate the forward and backward variables,
-        # reference Chapter 7.3 of "Alex Grave, Supervised Sequence
-        # Labelling with Recurrent Neural Networks"
-        log_acts = np.zeros([total_times, self.num_classes], dtype="float32")
-        for i in range(total_times):
-            for j in range(self.num_classes):
-                log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j])
-
-        # calculate the forward variables
-        forward_vars = np.zeros([total_times, total_segments], dtype="float32")
-        for i in range(total_times):
-            for j in range(total_segments):
-                forward_vars[i, j] = self.LOG_ZERO
-
-        for i in range(total_times):
-            # dp initialization at t0
-            if i == 0:
-                forward_vars[i, 0] = log_acts[0, self.blank]
-                if total_segments > 1:
-                    forward_vars[i, 1] = log_acts[0, labels_a_sequence[i, 0]]
-                continue
-
-            # dp from t1
-            start, end = self.segment_range(i, total_times, total_segments)
-            for k in range(end - start):
-                j = k + start
-                if j & 1 == 1:
-                    label_idx = j // 2
-                    label_val = labels_a_sequence[label_idx, 0]
-                    fv = self.log_add(forward_vars[i - 1, j],
-                                      forward_vars[i - 1, j - 1])
-                    if j > 1 and label_val != labels_a_sequence[label_idx - 1,
-                                                                0]:
-                        fv = self.log_add(fv, forward_vars[i - 1, j - 2])
-                    fv = self.log_mul(fv, log_acts[i, label_val])
-                else:
-                    fv = forward_vars[i - 1, j]
-                    if j > 0:
-                        fv = self.log_add(fv, forward_vars[i - 1, j - 1])
-                    fv = self.log_mul(fv, log_acts[i, self.blank])
-                forward_vars[i, j] = fv
-
-        # sum the last two value as log_prob
-        log_prob = forward_vars[total_times - 1, total_segments - 1]
-        if total_segments > 1:
-            log_prob = self.log_add(
-                log_prob, forward_vars[total_times - 1, total_segments - 2])
-
-        return -log_prob
-
-    def forward(self):
-        softmax_offset = 0
-        labels_offset = 0
-        for i in range(self.batch_size):
-            softmax_start_i = softmax_offset
-            softmax_end_i = softmax_offset + self.softmax_lod[self.level][i]
-            labels_start_i = labels_offset
-            labels_end_i = labels_offset + self.labels_lod[self.level][i]
-
-            softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :]
-            labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
-            self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
-                                                   labels_a_sequence)
-            softmax_offset += self.softmax_lod[self.level][i]
-            labels_offset += self.labels_lod[self.level][i]
-        return self.loss
-
-
-class TestWarpCTCOp(OpTest):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 3, 3]]
-        self.labels_lod = [[3, 1, 4, 4]]
-        self.blank = self.num_classes - 1
-        self.norm_by_times = False
-
-    def setUp(self):
-        self.op_type = "warpctc"
-        self.config()
-
-        logits = np.random.uniform(
-            0.1, 1.0,
-            [sum(self.logits_lod[0]), self.num_classes]).astype("float32")
-        softmax = np.apply_along_axis(stable_softmax, 1, logits)
-        # labels should not be blank
-        labels = np.random.randint(
-            0,
-            self.num_classes - 1, [sum(self.labels_lod[0]), 1],
-            dtype="int32")
-
-        ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
-                         self.blank, self.norm_by_times)
-        loss = ctc.forward()
-
-        max_sequence_length = 0
-        for i in range(self.batch_size):
-            max_sequence_length = max(max_sequence_length,
-                                      self.logits_lod[0][i])
-        self.gradient = np.zeros(
-            [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
-
-        self.inputs = {
-            "Logits": (logits, self.logits_lod),
-            "Label": (labels, self.labels_lod)
-        }
-        self.outputs = {"Loss": loss}
-        self.attrs = {
-            "blank": self.blank,
-            "norm_by_times": self.norm_by_times,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.outputs['WarpCTCGrad'] = self.gradient
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.007)
-
-
-class TestWarpCTCOpCase1(TestWarpCTCOp):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = CUDA_BLOCK_SIZE + 2
-        self.logits_lod = [[4, 1, 3, 3]]
-        self.labels_lod = [[3, 1, 4, 4]]
-        self.blank = 0
-        self.norm_by_times = False
-
-
-class TestWarpCTCOpWithPadding(OpTest):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 3, 3]]
-        self.labels_lod = [[3, 1, 4, 4]]
-        self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = False
-
-    def setUp(self):
-        self.op_type = "warpctc"
-        self.config()
-
-        logits = np.random.uniform(
-            0.1, 1.0,
-            [sum(self.logits_length), self.num_classes]).astype("float32")
-        softmax = np.apply_along_axis(stable_softmax, 1, logits)
-        # labels should not be blank
-        labels = np.random.randint(
-            0,
-            self.num_classes - 1, [sum(self.labels_length), 1],
-            dtype="int32")
-
-        ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
-                         self.blank, self.norm_by_times)
-        loss = ctc.forward()
-
-        max_sequence_length = 0
-        for i in range(self.batch_size):
-            max_sequence_length = max(max_sequence_length,
-                                      self.logits_length[i])
-        # reshape logits to T*N*S
-        new_logits = np.zeros(
-            [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
-
-        cur = 0
-        for batch_id in range(self.batch_size):
-            for i in range(self.logits_length[batch_id]):
-                for j in range(self.num_classes):
-                    new_logits[i, batch_id, j] = logits[cur + i, j]
-            cur = cur + self.logits_length[batch_id]
-
-        # reshape labels to N*S
-        max_target_seq_length = 0
-        for i in range(self.batch_size):
-            max_target_seq_length = max(max_target_seq_length,
-                                        self.labels_length[i])
-        new_labels = np.zeros(
-            [self.batch_size, max_target_seq_length], dtype="int32")
-
-        cur = 0
-        for batch_id in range(self.batch_size):
-            for i in range(self.labels_length[batch_id]):
-                new_labels[batch_id, i] = labels[cur + i]
-            cur = cur + self.labels_length[batch_id]
-
-        self.gradient = np.zeros(
-            [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
-
-        self.inputs = {
-            "Logits": new_logits,
-            "Label": labels,
-            "LogitsLength": self.logits_length,
-            "LabelLength": self.labels_length
-        }
-        self.outputs = {"Loss": loss}
-        self.attrs = {
-            "blank": self.blank,
-            "norm_by_times": self.norm_by_times,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.outputs['WarpCTCGrad'] = self.gradient
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.007)
-
-
-class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = CUDA_BLOCK_SIZE + 2
-        self.logits_lod = [[4, 1, 3, 3]]
-        self.labels_lod = [[3, 1, 4, 4]]
-        self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
-        self.blank = 0
-        self.norm_by_times = False
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py
deleted file mode 100644
index 2a2ad0f6d03bb39ddf345c259b3e04334235521f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_weight_decay.py
+++ /dev/null
@@ -1,181 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import contextlib
-
-import unittest
-from functools import partial
-import numpy as np
-import paddle
-import paddle.fluid.core as core
-
-import paddle.fluid as fluid
-from paddle.fluid import compiler
-
-
-def get_places():
-    places = []
-    if core.is_compiled_with_cuda():
-        places.append(core.CUDAPlace(0))
-    return places
-
-
-@contextlib.contextmanager
-def prog_scope_guard(main_prog, startup_prog):
-    scope = fluid.core.Scope()
-    with fluid.unique_name.guard():
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(main_prog, startup_prog):
-                yield
-
-
-def bow_net(data,
-            label,
-            dict_dim,
-            is_sparse=False,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2):
-    """
-    BOW net
-    This model is from https://github.com/PaddlePaddle/models:
-    fluid/PaddleNLP/text_classification/nets.py
-    """
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
-    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
-    bow_tanh = fluid.layers.tanh(bow)
-    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
-    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
-    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    return avg_cost
-
-
-class TestWeightDecay(unittest.TestCase):
-    def setUp(self):
-        self.word_dict = paddle.dataset.imdb.word_dict()
-        reader = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict), batch_size=4)()
-        self.train_data = [next(reader) for _ in range(5)]
-        self.learning_rate = .5
-
-    def run_executor(self, place, feed_list, loss):
-        exe = fluid.Executor(place)
-        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
-        exe.run(fluid.default_startup_program())
-        main_prog = fluid.default_main_program()
-        loss_set = []
-        for data in self.train_data:
-            out = exe.run(main_prog,
-                          feed=feeder.feed(data),
-                          fetch_list=[loss.name])
-
-            print("loss              %s" % (np.average(out)))
-            loss_set.append(np.average(out))
-
-        return loss_set
-
-    def run_parallel_exe(self,
-                         place,
-                         feed_list,
-                         loss,
-                         use_reduce=False,
-                         use_fast_executor=False,
-                         use_ir_memory_optimize=False):
-        exe = fluid.Executor(place)
-        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
-        exe.run(fluid.default_startup_program())
-
-        exec_strategy = fluid.ExecutionStrategy()
-        if use_fast_executor:
-            exec_strategy.use_experimental_executor = True
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
-                if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
-        build_strategy.memory_optimize = use_ir_memory_optimize
-
-        train_cp = compiler.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(
-            loss_name=loss.name,
-            exec_strategy=exec_strategy,
-            build_strategy=build_strategy)
-
-        loss_set = []
-        for data in self.train_data:
-            out = exe.run(train_cp,
-                          feed=feeder.feed(data),
-                          fetch_list=[loss.name])
-            loss_set.append(np.average(out))
-
-        return loss_set
-
-    def check_weight_decay(self,
-                           place,
-                           model,
-                           use_parallel_exe=False,
-                           use_reduce=False):
-        main_prog = fluid.framework.Program()
-        startup_prog = fluid.framework.Program()
-        startup_prog.random_seed = 1
-        with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            avg_cost = model(data, label, len(self.word_dict))
-
-            param_list = [(var, var * self.learning_rate)
-                          for var in main_prog.block(0).all_parameters()]
-
-            optimizer = fluid.optimizer.Adagrad(
-                learning_rate=self.learning_rate)
-            optimizer.minimize(avg_cost)
-
-            for params in param_list:
-                updated_p = fluid.layers.elementwise_sub(
-                    x=params[0], y=params[1])
-                fluid.layers.assign(input=updated_p, output=params[0])
-
-            if use_parallel_exe:
-                loss = self.run_parallel_exe(
-                    place, [data, label], loss=avg_cost, use_reduce=use_reduce)
-            else:
-                loss = self.run_executor(place, [data, label], loss=avg_cost)
-
-        return loss
-
-    def test_weight_decay(self):
-        model = partial(bow_net, is_sparse=False)
-        for place in get_places():
-            loss = self.check_weight_decay(place, model, use_parallel_exe=False)
-
-            # TODO(zcd): should test use_reduce=True
-            loss2 = self.check_weight_decay(
-                place, model, use_parallel_exe=True, use_reduce=False)
-
-            for i in range(len(loss)):
-                self.assertTrue(
-                    np.isclose(
-                        a=loss[i], b=loss2[i], rtol=5e-5),
-                    "Expect " + str(loss[i]) + "\n" + "But Got" + str(loss2[i])
-                    + " in class " + self.__class__.__name__)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
deleted file mode 100644
index e990d8b2498f6a1b62f7a34d329e3ca72a962728..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy
-import collections
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.initializer import ConstantInitializer
-from paddle.fluid.param_attr import WeightNormParamAttr
-
-
-class TestWeightNormalization(unittest.TestCase):
-    batch_size = 3
-    hidden_size = 5
-    data_desc = (['x', [10], 0], )
-
-    @classmethod
-    def setUpClass(cls):
-        cls.set_program()
-
-    @classmethod
-    def set_program(cls):
-        data = fluid.layers.data(
-            name=cls.data_desc[0][0], shape=cls.data_desc[0][1])
-        out = fluid.layers.fc(input=data,
-                              size=cls.hidden_size,
-                              param_attr=WeightNormParamAttr(
-                                  dim=None,
-                                  name='weight_norm_param',
-                                  initializer=ConstantInitializer(1.0)),
-                              bias_attr=False,
-                              act=None)
-        loss = fluid.layers.reduce_sum(out)
-        fluid.backward.append_backward(loss=loss)
-        cls.fetch_list = [
-            'weight_norm_param_g', 'weight_norm_param_v',
-            'weight_norm_param_g@GRAD'
-        ]
-
-    def run_program(self):
-        outputs = []
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.set_inputs(place)
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            output = exe.run(fluid.default_main_program(),
-                             feed=self.inputs,
-                             fetch_list=self.fetch_list,
-                             return_numpy=False)
-            outputs.append(output)
-        self.actual_outputs = outputs
-
-    def set_data(self):
-        self.data = collections.OrderedDict()
-        for desc in self.data_desc:
-            data_name = desc[0]
-            data_shape = desc[1]
-            data_lod_level = desc[2]
-            data_lod = []
-            for i in range(data_lod_level):
-                lod_level_i = numpy.random.randint(
-                    low=1,
-                    high=5,
-                    size=self.batch_size
-                    if i == 0 else sum(lod_level_i)).tolist()
-                data_lod.append(lod_level_i)
-            data_value = numpy.random.random(
-                size=[sum(data_lod[-1]) if data_lod else self.batch_size
-                      ] + data_shape).astype('float32')
-            self.data[data_name] = (data_value, data_lod)
-
-    def set_inputs(self, place):
-        self.inputs = {}
-        for desc in self.data_desc:
-            tensor = fluid.Tensor()
-            tensor.set(self.data[desc[0]][0], place)
-            if self.data[desc[0]][1]:
-                tensor.set_recursive_sequence_lengths(self.data[desc[0]][1])
-            self.inputs[desc[0]] = tensor
-
-    def weight_normalize(self):
-        v = numpy.ones((self.data[self.data_desc[0][0]][0].shape[-1],
-                        self.hidden_size))
-        g = numpy.linalg.norm(v, axis=None, keepdims=True)
-        w = g * v / numpy.linalg.norm(v, axis=None, keepdims=True)
-        x = self.data[self.data_desc[0][0]][0]
-        out = numpy.dot(x, w)
-        g_grad = (numpy.dot(x.T, numpy.ones_like(out)) * (v / numpy.linalg.norm(
-            v, axis=None, keepdims=True))).sum(axis=None, keepdims=True)
-        return g, v, g_grad
-
-    def test_weight_normalization(self):
-        self.set_data()
-        self.run_program()
-        expect_output = self.weight_normalize()
-        for actual_output in self.actual_outputs:
-            [
-                self.assertTrue(
-                    numpy.allclose(
-                        numpy.array(actual), expect, atol=0.001))
-                for expect, actual in zip(expect_output, actual_output)
-            ]
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_where.py b/python/paddle/fluid/tests/unittests/test_where.py
deleted file mode 100644
index ee0fa1613093c982320337aaa453114cfb187db4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_where.py
+++ /dev/null
@@ -1,92 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-class TestWhereOp(OpTest):
-    def setUp(self):
-        self.op_type = "where"
-        self.init_config()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def init_config(self):
-        self.inputs = {'Condition': np.array([True, False, True]), }
-
-        self.outputs = {'Out': np.array([[0], [2]], dtype='int64')}
-
-
-class TestAllFalse(unittest.TestCase):
-    def setUp(self):
-        self.op_type = "where"
-        self.init_config()
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-        condition = scope.var('Condition').get_tensor()
-        condition.set(self.cond_data, place)
-
-        out = scope.var("Out").get_tensor()
-        out.set(np.full(self.shape, 0).astype('int64'), place)
-
-        op = Operator("where", Condition="Condition", Out="Out")
-        op.run(scope, place)
-
-        out_array = np.array(out)
-        self.assertTrue((out_array == self.out_data).all())
-
-    def init_config(self):
-        self.cond_data = np.array([False, False, False])
-        self.shape = (3, 1)
-        self.out_data = np.array([], dtype='int64')
-
-    def test_all_false(self):
-        self.check_with_place(core.CPUPlace())
-
-        if core.is_compiled_with_cuda():
-            self.check_with_place(core.CUDAPlace(0))
-
-
-class TestRank2(TestWhereOp):
-    def init_config(self):
-        self.inputs = {'Condition': np.array([[True, False], [False, True]]), }
-
-        self.outputs = {'Out': np.array([[0, 0], [1, 1]], dtype='int64')}
-
-
-class TestRank3(TestWhereOp):
-    def init_config(self):
-        self.inputs = {
-            'Condition': np.array([[[True, False], [False, True]],
-                                   [[False, True], [True, False]],
-                                   [[False, False], [False, True]]]),
-        }
-
-        self.outputs = {
-            'Out': np.array(
-                [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0], [2, 1, 1]],
-                dtype='int64')
-        }
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py
deleted file mode 100644
index f19601d72835f7041d3d6434ffe9fcf09ad15065..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.layers as layers
-from paddle.fluid.executor import Executor
-import paddle.fluid.core as core
-from paddle.fluid.backward import append_backward
-import numpy
-
-
-class TestWhileOp(unittest.TestCase):
-    def test_simple_forward(self):
-        d0 = layers.data(
-            "d0", shape=[10], append_batch_size=False, dtype='float32')
-        d1 = layers.data(
-            "d1", shape=[10], append_batch_size=False, dtype='float32')
-        d2 = layers.data(
-            "d2", shape=[10], append_batch_size=False, dtype='float32')
-
-        i = layers.zeros(shape=[1], dtype='int64')
-        i.stop_gradient = True
-
-        init = layers.zeros(shape=[10], dtype='float32')
-        mem_array = layers.array_write(x=init, i=i)
-        data_array = layers.array_write(x=d0, i=i)
-
-        i = layers.increment(i)
-        layers.array_write(d1, i, array=data_array)
-
-        i = layers.increment(i)
-        layers.array_write(d2, i, array=data_array)
-
-        i = layers.zeros(shape=[1], dtype='int64')
-        i.stop_gradient = True
-
-        array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
-        array_len.stop_gradient = True
-        cond = layers.less_than(x=i, y=array_len)
-
-        j = layers.fill_constant(shape=[1], dtype='int64', value=1)
-        j.stop_gradient = True
-
-        array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
-        array_len2.stop_gradient = True
-        cond2 = layers.less_than(x=j, y=array_len2)
-
-        while_op = layers.While(cond=cond)
-        while_op2 = layers.While(cond=cond2)
-        with while_op.block():
-            d = layers.array_read(array=data_array, i=i)
-            prev = layers.array_read(array=mem_array, i=i)
-            result = layers.sums(input=[d, prev])
-
-            i = layers.increment(x=i, in_place=True)
-            layers.array_write(result, i=i, array=mem_array)
-            layers.less_than(x=i, y=array_len, cond=cond)
-
-            with while_op2.block():
-                d2 = layers.array_read(array=data_array, i=j)
-                prev2 = layers.array_read(array=mem_array, i=j)
-                result2 = layers.sums(input=[d2, prev2])
-
-                j = layers.increment(x=j, in_place=True)
-                layers.array_write(result2, i=j, array=mem_array)
-                layers.less_than(x=j, y=array_len2, cond=cond2)
-
-        sum_result = layers.array_read(array=mem_array, i=j)
-        loss = layers.mean(sum_result)
-
-        append_backward(loss)
-
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-        d = []
-
-        for i in range(3):
-            d.append(numpy.random.random(size=[10]).astype('float32'))
-
-        outs = exe.run(feed={'d0': d[0],
-                             'd1': d[1],
-                             'd2': d[2]},
-                       fetch_list=[sum_result])
-        self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
-
-    def test_exceptions(self):
-        i = layers.zeros(shape=[2], dtype='int64')
-        array_len = layers.fill_constant(shape=[2], dtype='int64', value=1)
-        cond = layers.less_than(x=i, y=array_len)
-        with self.assertRaises(TypeError):
-            layers.While(cond=cond)
-        cond = layers.cast(cond, dtype='float64')
-        with self.assertRaises(TypeError):
-            layers.While(cond=cond)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
deleted file mode 100644
index 416e6ea9f412d86db877fc36175e8b910b0613fe..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-from paddle.fluid import core
-
-
-def sigmoid(x):
-    return 1.0 / (1.0 + np.exp(-1.0 * x))
-
-
-def YoloBox(x, img_size, attrs):
-    n, c, h, w = x.shape
-    anchors = attrs['anchors']
-    an_num = int(len(anchors) // 2)
-    class_num = attrs['class_num']
-    conf_thresh = attrs['conf_thresh']
-    downsample = attrs['downsample']
-    input_size = downsample * h
-
-    x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
-
-    pred_box = x[:, :, :, :, :4].copy()
-    grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
-    grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
-    pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
-    pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
-
-    anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
-    anchors_s = np.array(
-        [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors])
-    anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1))
-    anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1))
-    pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w
-    pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h
-
-    pred_conf = sigmoid(x[:, :, :, :, 4:5])
-    pred_conf[pred_conf < conf_thresh] = 0.
-    pred_score = sigmoid(x[:, :, :, :, 5:]) * pred_conf
-    pred_box = pred_box * (pred_conf > 0.).astype('float32')
-
-    pred_box = pred_box.reshape((n, -1, 4))
-    pred_box[:, :, :2], pred_box[:, :, 2:4] = \
-        pred_box[:, :, :2] - pred_box[:, :, 2:4] / 2., \
-        pred_box[:, :, :2] + pred_box[:, :, 2:4] / 2.0
-    pred_box[:, :, 0] = pred_box[:, :, 0] * img_size[:, 1][:, np.newaxis]
-    pred_box[:, :, 1] = pred_box[:, :, 1] * img_size[:, 0][:, np.newaxis]
-    pred_box[:, :, 2] = pred_box[:, :, 2] * img_size[:, 1][:, np.newaxis]
-    pred_box[:, :, 3] = pred_box[:, :, 3] * img_size[:, 0][:, np.newaxis]
-
-    for i in range(len(pred_box)):
-        pred_box[i, :, 0] = np.clip(pred_box[i, :, 0], 0, np.inf)
-        pred_box[i, :, 1] = np.clip(pred_box[i, :, 1], 0, np.inf)
-        pred_box[i, :, 2] = np.clip(pred_box[i, :, 2], -np.inf,
-                                    img_size[i, 1] - 1)
-        pred_box[i, :, 3] = np.clip(pred_box[i, :, 3], -np.inf,
-                                    img_size[i, 0] - 1)
-
-    return pred_box, pred_score.reshape((n, -1, class_num))
-
-
-class TestYoloBoxOp(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = 'yolo_box'
-        x = np.random.random(self.x_shape).astype('float32')
-        img_size = np.random.randint(10, 20, self.imgsize_shape).astype('int32')
-
-        self.attrs = {
-            "anchors": self.anchors,
-            "class_num": self.class_num,
-            "conf_thresh": self.conf_thresh,
-            "downsample": self.downsample,
-        }
-
-        self.inputs = {
-            'X': x,
-            'ImgSize': img_size,
-        }
-        boxes, scores = YoloBox(x, img_size, self.attrs)
-        self.outputs = {
-            "Boxes": boxes,
-            "Scores": scores,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def initTestCase(self):
-        self.anchors = [10, 13, 16, 30, 33, 23]
-        an_num = int(len(self.anchors) // 2)
-        self.batch_size = 32
-        self.class_num = 2
-        self.conf_thresh = 0.5
-        self.downsample = 32
-        self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13)
-        self.imgsize_shape = (self.batch_size, 2)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
deleted file mode 100644
index 623e2228a4c2865c65277f44ad92a2060c18b49a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ /dev/null
@@ -1,260 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-
-import unittest
-import numpy as np
-from scipy.special import logit
-from scipy.special import expit
-from op_test import OpTest
-
-from paddle.fluid import core
-
-
-def l1loss(x, y):
-    return abs(x - y)
-
-
-def sce(x, label):
-    sigmoid_x = expit(x)
-    term1 = label * np.log(sigmoid_x)
-    term2 = (1.0 - label) * np.log(1.0 - sigmoid_x)
-    return -term1 - term2
-
-
-def sigmoid(x):
-    return 1.0 / (1.0 + np.exp(-1.0 * x))
-
-
-def batch_xywh_box_iou(box1, box2):
-    b1_left = box1[:, :, 0] - box1[:, :, 2] / 2
-    b1_right = box1[:, :, 0] + box1[:, :, 2] / 2
-    b1_top = box1[:, :, 1] - box1[:, :, 3] / 2
-    b1_bottom = box1[:, :, 1] + box1[:, :, 3] / 2
-
-    b2_left = box2[:, :, 0] - box2[:, :, 2] / 2
-    b2_right = box2[:, :, 0] + box2[:, :, 2] / 2
-    b2_top = box2[:, :, 1] - box2[:, :, 3] / 2
-    b2_bottom = box2[:, :, 1] + box2[:, :, 3] / 2
-
-    left = np.maximum(b1_left[:, :, np.newaxis], b2_left[:, np.newaxis, :])
-    right = np.minimum(b1_right[:, :, np.newaxis], b2_right[:, np.newaxis, :])
-    top = np.maximum(b1_top[:, :, np.newaxis], b2_top[:, np.newaxis, :])
-    bottom = np.minimum(b1_bottom[:, :, np.newaxis],
-                        b2_bottom[:, np.newaxis, :])
-
-    inter_w = np.clip(right - left, 0., 1.)
-    inter_h = np.clip(bottom - top, 0., 1.)
-    inter_area = inter_w * inter_h
-
-    b1_area = (b1_right - b1_left) * (b1_bottom - b1_top)
-    b2_area = (b2_right - b2_left) * (b2_bottom - b2_top)
-    union = b1_area[:, :, np.newaxis] + b2_area[:, np.newaxis, :] - inter_area
-
-    return inter_area / union
-
-
-def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs):
-    n, c, h, w = x.shape
-    b = gtbox.shape[1]
-    anchors = attrs['anchors']
-    an_num = len(anchors) // 2
-    anchor_mask = attrs['anchor_mask']
-    mask_num = len(anchor_mask)
-    class_num = attrs["class_num"]
-    ignore_thresh = attrs['ignore_thresh']
-    downsample_ratio = attrs['downsample_ratio']
-    use_label_smooth = attrs['use_label_smooth']
-    input_size = downsample_ratio * h
-    x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
-    loss = np.zeros((n)).astype('float32')
-
-    smooth_weight = min(1.0 / class_num, 1.0 / 40)
-    label_pos = 1.0 - smooth_weight if use_label_smooth else 1.0
-    label_neg = smooth_weight if use_label_smooth else 0.0
-
-    pred_box = x[:, :, :, :, :4].copy()
-    grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
-    grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
-    pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
-    pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
-
-    mask_anchors = []
-    for m in anchor_mask:
-        mask_anchors.append((anchors[2 * m], anchors[2 * m + 1]))
-    anchors_s = np.array(
-        [(an_w / input_size, an_h / input_size) for an_w, an_h in mask_anchors])
-    anchor_w = anchors_s[:, 0:1].reshape((1, mask_num, 1, 1))
-    anchor_h = anchors_s[:, 1:2].reshape((1, mask_num, 1, 1))
-    pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w
-    pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h
-
-    pred_box = pred_box.reshape((n, -1, 4))
-    pred_obj = x[:, :, :, :, 4].reshape((n, -1))
-    objness = np.zeros(pred_box.shape[:2]).astype('float32')
-    ious = batch_xywh_box_iou(pred_box, gtbox)
-    ious_max = np.max(ious, axis=-1)
-    objness = np.where(ious_max > ignore_thresh, -np.ones_like(objness),
-                       objness)
-
-    gtbox_shift = gtbox.copy()
-    gtbox_shift[:, :, 0] = 0
-    gtbox_shift[:, :, 1] = 0
-
-    anchors = [(anchors[2 * i], anchors[2 * i + 1]) for i in range(0, an_num)]
-    anchors_s = np.array(
-        [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors])
-    anchor_boxes = np.concatenate(
-        [np.zeros_like(anchors_s), anchors_s], axis=-1)
-    anchor_boxes = np.tile(anchor_boxes[np.newaxis, :, :], (n, 1, 1))
-    ious = batch_xywh_box_iou(gtbox_shift, anchor_boxes)
-    iou_matches = np.argmax(ious, axis=-1)
-    gt_matches = iou_matches.copy()
-    for i in range(n):
-        for j in range(b):
-            if gtbox[i, j, 2:].sum() == 0:
-                gt_matches[i, j] = -1
-                continue
-            if iou_matches[i, j] not in anchor_mask:
-                gt_matches[i, j] = -1
-                continue
-            an_idx = anchor_mask.index(iou_matches[i, j])
-            gt_matches[i, j] = an_idx
-            gi = int(gtbox[i, j, 0] * w)
-            gj = int(gtbox[i, j, 1] * h)
-
-            tx = gtbox[i, j, 0] * w - gi
-            ty = gtbox[i, j, 1] * w - gj
-            tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0])
-            th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1])
-            scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) * gtscore[i, j]
-            loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale
-            loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale
-            loss[i] += l1loss(x[i, an_idx, gj, gi, 2], tw) * scale
-            loss[i] += l1loss(x[i, an_idx, gj, gi, 3], th) * scale
-
-            objness[i, an_idx * h * w + gj * w + gi] = gtscore[i, j]
-
-            for label_idx in range(class_num):
-                loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], label_pos
-                               if label_idx == gtlabel[i, j] else
-                               label_neg) * gtscore[i, j]
-
-        for j in range(mask_num * h * w):
-            if objness[i, j] > 0:
-                loss[i] += sce(pred_obj[i, j], 1.0) * objness[i, j]
-            elif objness[i, j] == 0:
-                loss[i] += sce(pred_obj[i, j], 0.0)
-
-    return (loss, objness.reshape((n, mask_num, h, w)).astype('float32'), \
-            gt_matches.astype('int32'))
-
-
-class TestYolov3LossOp(OpTest):
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = 'yolov3_loss'
-        x = logit(np.random.uniform(0, 1, self.x_shape).astype('float32'))
-        gtbox = np.random.random(size=self.gtbox_shape).astype('float32')
-        gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2])
-        gtmask = np.random.randint(0, 2, self.gtbox_shape[:2])
-        gtbox = gtbox * gtmask[:, :, np.newaxis]
-        gtlabel = gtlabel * gtmask
-
-        self.attrs = {
-            "anchors": self.anchors,
-            "anchor_mask": self.anchor_mask,
-            "class_num": self.class_num,
-            "ignore_thresh": self.ignore_thresh,
-            "downsample_ratio": self.downsample_ratio,
-            "use_label_smooth": self.use_label_smooth,
-        }
-
-        self.inputs = {
-            'X': x,
-            'GTBox': gtbox.astype('float32'),
-            'GTLabel': gtlabel.astype('int32'),
-        }
-
-        gtscore = np.ones(self.gtbox_shape[:2]).astype('float32')
-        if self.gtscore:
-            gtscore = np.random.random(self.gtbox_shape[:2]).astype('float32')
-            self.inputs['GTScore'] = gtscore
-
-        loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, gtscore,
-                                               self.attrs)
-        self.outputs = {
-            'Loss': loss,
-            'ObjectnessMask': objness,
-            "GTMatchMask": gt_matches
-        }
-
-    def test_check_output(self):
-        place = core.CPUPlace()
-        self.check_output_with_place(place, atol=2e-3)
-
-    def test_check_grad_ignore_gtbox(self):
-        place = core.CPUPlace()
-        self.check_grad_with_place(place, ['X'], 'Loss', max_relative_error=0.2)
-
-    def initTestCase(self):
-        self.anchors = [
-            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
-            373, 326
-        ]
-        self.anchor_mask = [0, 1, 2]
-        self.class_num = 5
-        self.ignore_thresh = 0.7
-        self.downsample_ratio = 32
-        self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
-        self.gtbox_shape = (3, 5, 4)
-        self.gtscore = True
-        self.use_label_smooth = True
-
-
-class TestYolov3LossWithoutLabelSmooth(TestYolov3LossOp):
-    def initTestCase(self):
-        self.anchors = [
-            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
-            373, 326
-        ]
-        self.anchor_mask = [0, 1, 2]
-        self.class_num = 5
-        self.ignore_thresh = 0.7
-        self.downsample_ratio = 32
-        self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
-        self.gtbox_shape = (3, 5, 4)
-        self.gtscore = True
-        self.use_label_smooth = False
-
-
-class TestYolov3LossNoGTScore(TestYolov3LossOp):
-    def initTestCase(self):
-        self.anchors = [
-            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
-            373, 326
-        ]
-        self.anchor_mask = [0, 1, 2]
-        self.class_num = 5
-        self.ignore_thresh = 0.7
-        self.downsample_ratio = 32
-        self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
-        self.gtbox_shape = (3, 5, 4)
-        self.gtscore = False
-        self.use_label_smooth = True
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
deleted file mode 100644
index c4eb26893cd1faac72ac06c70a68c52f26b39182..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ /dev/null
@@ -1,194 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-
-
-def create_op(scope, op_type, inputs, outputs, attrs, cache_list=None):
-    kwargs = dict()
-
-    op_maker = core.op_proto_and_checker_maker
-    op_role_attr_name = op_maker.kOpRoleAttrName()
-
-    if op_role_attr_name not in attrs:
-        attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
-
-    def __create_var__(name, var_name):
-        scope.var(var_name).get_tensor()
-        kwargs[name].append(var_name)
-
-    for in_name, in_dup in Operator.get_op_inputs(op_type):
-        if in_name in inputs:
-            kwargs[in_name] = []
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, _ = item[0], item[1]
-                    __create_var__(in_name, sub_in_name)
-            else:
-                __create_var__(in_name, in_name)
-    if cache_list != None and isinstance(cache_list, list):
-        for name in cache_list:
-            kwargs[name] = []
-            scope.var(name)
-            kwargs[name].append(name)
-
-    for out_name, out_dup in Operator.get_op_outputs(op_type):
-        if out_name in outputs:
-            kwargs[out_name] = []
-            if out_dup:
-                sub_out = outputs[out_name]
-                for item in sub_out:
-                    sub_out_name, _ = item[0], item[1]
-                    __create_var__(out_name, sub_out_name)
-            else:
-                __create_var__(out_name, out_name)
-
-    for attr_name in Operator.get_op_attr_names(op_type):
-        if attr_name in attrs:
-            kwargs[attr_name] = attrs[attr_name]
-
-    return Operator(op_type, **kwargs)
-
-
-def set_input(scope, op, inputs, place):
-    def np_value_to_fluid_value(input):
-        if input.dtype == np.float16:
-            input = input.view(np.uint16)
-        return input
-
-    def __set_input__(var_name, var):
-        if isinstance(var, tuple) or isinstance(var, np.ndarray):
-            tensor = scope.find_var(var_name).get_tensor()
-            if isinstance(var, tuple):
-                tensor.set_recursive_sequence_lengths(var[1])
-                var = var[0]
-            tensor._set_dims(var.shape)
-            tensor.set(np_value_to_fluid_value(var), place)
-        elif isinstance(var, float):
-            scope.find_var(var_name).set_float(var)
-        elif isinstance(var, int):
-            scope.find_var(var_name).set_int(var)
-
-    for in_name, in_dup in Operator.get_op_inputs(op.type()):
-        if in_name in inputs:
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, sub_in_val = item[0], item[1]
-                    __set_input__(sub_in_name, sub_in_val)
-            else:
-                __set_input__(in_name, inputs[in_name])
-
-
-def append_input_output(block, op_proto, np_list, is_input, dtype):
-    '''Insert VarDesc and generate Python variable instance'''
-    proto_list = op_proto.inputs if is_input else op_proto.outputs
-
-    def create_var(block, name, np_list, var_proto):
-        dtype = None
-        shape = None
-        lod_level = None
-        if name not in np_list:
-            assert var_proto.intermediate, "{} not found".format(name)
-        else:
-            # inferece the dtype from numpy value.
-            np_value = np_list[name]
-            if isinstance(np_value, tuple):
-                dtype = np_value[0].dtype
-                # output shape, lod should be infered from input.
-                if is_input:
-                    shape = list(np_value[0].shape)
-                    lod_level = len(np_value[1])
-            else:
-                dtype = np_value.dtype
-                if is_input:
-                    shape = list(np_value.shape)
-                    lod_level = 0
-        # NOTE(dzhwinter): type hacking
-        # numpy float16 is binded to paddle::platform::float16
-        # in tensor_py.h via the help of uint16 datatype. Because
-        # the internal memory representation of float16 is
-        # actually uint16_t in paddle. So we use np.uint16 in numpy for
-        # raw memory, it can pass through the pybind. So in the testcase,
-        # we feed data use data.view(uint16), but the dtype is float16 in fact.
-        # The data.view(uint16) means do not cast the data type, but process data as the uint16
-        if dtype == np.uint16:
-            dtype = np.float16
-        return block.create_var(
-            dtype=dtype, shape=shape, lod_level=lod_level, name=name)
-
-    var_dict = {}
-    for var_proto in proto_list:
-        var_name = str(var_proto.name)
-        if (var_name not in np_list) and var_proto.dispensable:
-            continue
-        if is_input:
-            assert (var_name in np_list) or (var_proto.dispensable), \
-                "Missing {} as input".format(var_name)
-        if var_proto.duplicable:
-            assert isinstance(np_list[var_name], list), \
-                "Duplicable {} should be set as list".format(var_name)
-            var_list = []
-            for (name, np_value) in np_list[var_name]:
-                var_list.append(
-                    create_var(block, name, {name: np_value}, var_proto))
-            var_dict[var_name] = var_list
-        else:
-            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
-
-    return var_dict
-
-
-def append_loss_ops(block, output_names):
-    mean_inputs = list(map(block.var, output_names))
-
-    if len(mean_inputs) == 1:
-        loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
-        op = block.append_op(
-            inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
-        op.desc.infer_var_type(block.desc)
-        op.desc.infer_shape(block.desc)
-    else:
-        avg_sum = []
-        for cur_loss in mean_inputs:
-            cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
-            op = block.append_op(
-                inputs={"X": [cur_loss]},
-                outputs={"Out": [cur_avg_loss]},
-                type="mean")
-            op.desc.infer_var_type(block.desc)
-            op.desc.infer_shape(block.desc)
-            avg_sum.append(cur_avg_loss)
-
-        loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
-        op_sum = block.append_op(
-            inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
-        op_sum.desc.infer_var_type(block.desc)
-        op_sum.desc.infer_shape(block.desc)
-
-        loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
-        op_loss = block.append_op(
-            inputs={"X": loss_sum},
-            outputs={"Out": loss},
-            type='scale',
-            attrs={'scale': 1.0 / float(len(avg_sum))})
-        op_loss.desc.infer_var_type(block.desc)
-        op_loss.desc.infer_shape(block.desc)
-    return loss
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
deleted file mode 100644
index 1782d432490c796362590805ab20cad1f6a61359..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ /dev/null
@@ -1,512 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from functools import partial
-import numpy as np
-
-import os
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-pos_enc_param_names = (
-    "src_pos_enc_table",
-    "trg_pos_enc_table", )
-
-batch_size = 2
-
-
-def position_encoding_init(n_position, d_pos_vec):
-    """
-    Generate the initial values for the sinusoid position encoding table.
-    """
-    position_enc = np.array([[
-        pos / np.power(10000, 2 * (j // 2) / d_pos_vec)
-        for j in range(d_pos_vec)
-    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
-    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
-    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
-    return position_enc.astype("float32")
-
-
-def multi_head_attention(queries,
-                         keys,
-                         values,
-                         attn_bias,
-                         d_key,
-                         d_value,
-                         d_model,
-                         n_head=1,
-                         dropout_rate=0.):
-    """
-    Multi-Head Attention. Note that attn_bias is added to the logit before
-    computing softmax activiation to mask certain selected positions so that
-    they will not considered in attention weights.
-    """
-    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
-        raise ValueError(
-            "Inputs: quries, keys and values should all be 3-D tensors.")
-
-    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
-        """
-        Add linear projection to queries, keys, and values.
-        """
-        q = layers.fc(input=queries,
-                      size=d_key * n_head,
-                      param_attr=fluid.initializer.Xavier(
-                          uniform=False,
-                          fan_in=d_model * d_key,
-                          fan_out=n_head * d_key),
-                      bias_attr=False,
-                      num_flatten_dims=2)
-        k = layers.fc(input=keys,
-                      size=d_key * n_head,
-                      param_attr=fluid.initializer.Xavier(
-                          uniform=False,
-                          fan_in=d_model * d_key,
-                          fan_out=n_head * d_key),
-                      bias_attr=False,
-                      num_flatten_dims=2)
-        v = layers.fc(input=values,
-                      size=d_value * n_head,
-                      param_attr=fluid.initializer.Xavier(
-                          uniform=False,
-                          fan_in=d_model * d_value,
-                          fan_out=n_head * d_value),
-                      bias_attr=False,
-                      num_flatten_dims=2)
-        return q, k, v
-
-    def __split_heads(x, n_head):
-        """
-        Reshape the last dimension of inpunt tensor x so that it becomes two
-        dimensions and then transpose. Specifically, input a tensor with shape
-        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
-        with shape [bs, n_head, max_sequence_length, hidden_dim].
-        """
-        if n_head == 1:
-            return x
-
-        hidden_size = x.shape[-1]
-        # FIXME(guosheng): Decouple the program desc with batch_size.
-        reshaped = layers.reshape(
-            x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])
-
-        # permuate the dimensions into:
-        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
-        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
-
-    def __combine_heads(x):
-        """
-        Transpose and then reshape the last two dimensions of inpunt tensor x
-        so that it becomes one dimension, which is reverse to __split_heads.
-        """
-        if len(x.shape) == 3: return x
-        if len(x.shape) != 4:
-            raise ValueError("Input(x) should be a 4-D Tensor.")
-
-        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
-        # FIXME(guosheng): Decouple the program desc with batch_size.
-        return layers.reshape(
-            x=trans_x,
-            shape=list(
-                map(int, [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]
-                          ])))
-
-    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
-        """
-        Scaled Dot-Product Attention
-        """
-
-        # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.
-
-        # The current implementation of softmax_op only supports 2D tensor,
-        # consequently it cannot be directly used here.
-        # If to use the reshape_op, Besides, the shape of product inferred in
-        # compile-time is not the actual shape in run-time. It can't be used
-        # to set the attribute of reshape_op.
-        # So, here define the softmax for temporary solution.
-
-        def __softmax(x, eps=1e-9):
-            exp_out = layers.exp(x=x)
-            sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
-            return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
-
-        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
-        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
-        weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
-        if dropout_rate:
-            weights = layers.dropout(
-                weights, dropout_prob=dropout_rate, is_test=False)
-        out = layers.matmul(weights, v)
-        return out
-
-    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
-
-    q = __split_heads(q, n_head)
-    k = __split_heads(k, n_head)
-    v = __split_heads(v, n_head)
-
-    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
-                                                  dropout_rate)
-
-    out = __combine_heads(ctx_multiheads)
-
-    # Project back to the model size.
-    proj_out = layers.fc(input=out,
-                         size=d_model,
-                         param_attr=fluid.initializer.Xavier(uniform=False),
-                         bias_attr=False,
-                         num_flatten_dims=2)
-    return proj_out
-
-
-def positionwise_feed_forward(x, d_inner_hid, d_hid):
-    """
-    Position-wise Feed-Forward Networks.
-    This module consists of two linear transformations with a ReLU activation
-    in between, which is applied to each position separately and identically.
-    """
-    hidden = layers.fc(input=x,
-                       size=d_inner_hid,
-                       num_flatten_dims=2,
-                       param_attr=fluid.initializer.Uniform(
-                           low=-(d_hid**-0.5), high=(d_hid**-0.5)),
-                       act="relu")
-    out = layers.fc(input=hidden,
-                    size=d_hid,
-                    num_flatten_dims=2,
-                    param_attr=fluid.initializer.Uniform(
-                        low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5)))
-    return out
-
-
-def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
-    """
-    Add residual connection, layer normalization and droput to the out tensor
-    optionally according to the value of process_cmd.
-
-    This will be used before or after multi-head attention and position-wise
-    feed-forward networks.
-    """
-    for cmd in process_cmd:
-        if cmd == "a":  # add residual connection
-            out = out + prev_out if prev_out else out
-        elif cmd == "n":  # add layer normalization
-            out = layers.layer_norm(
-                out,
-                begin_norm_axis=len(out.shape) - 1,
-                param_attr=fluid.initializer.Constant(1.),
-                bias_attr=fluid.initializer.Constant(0.))
-        elif cmd == "d":  # add dropout
-            if dropout:
-                out = layers.dropout(out, dropout_prob=dropout, is_test=False)
-    return out
-
-
-pre_process_layer = partial(pre_post_process_layer, None)
-post_process_layer = pre_post_process_layer
-
-
-def prepare_encoder(src_word,
-                    src_pos,
-                    src_vocab_size,
-                    src_emb_dim,
-                    src_pad_idx,
-                    src_max_len,
-                    dropout=0.,
-                    pos_pad_idx=0,
-                    pos_enc_param_name=None):
-    """Add word embeddings and position encodings.
-    The output tensor has a shape of:
-    [batch_size, max_src_length_in_batch, d_model].
-
-    This module is used at the bottom of the encoder stacks.
-    """
-    src_word_emb = layers.embedding(
-        src_word,
-        size=[src_vocab_size, src_emb_dim],
-        padding_idx=src_pad_idx,
-        param_attr=fluid.initializer.Normal(0., 1.))
-    src_pos_enc = layers.embedding(
-        src_pos,
-        size=[src_max_len, src_emb_dim],
-        padding_idx=pos_pad_idx,
-        param_attr=fluid.ParamAttr(
-            name=pos_enc_param_name, trainable=False))
-    src_pos_enc.stop_gradient = True
-    enc_input = src_word_emb + src_pos_enc
-
-    # FIXME(guosheng): Decouple the program desc with batch_size.
-    enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim])
-    return layers.dropout(
-        enc_input, dropout_prob=dropout,
-        is_test=False) if dropout else enc_input
-
-
-prepare_encoder = partial(
-    prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
-prepare_decoder = partial(
-    prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
-
-
-def encoder_layer(enc_input,
-                  attn_bias,
-                  n_head,
-                  d_key,
-                  d_value,
-                  d_model,
-                  d_inner_hid,
-                  dropout_rate=0.):
-    """The encoder layers that can be stacked to form a deep encoder.
-
-    This module consits of a multi-head (self) attention followed by
-    position-wise feed-forward networks and both the two components companied
-    with the post_process_layer to add residual connection, layer normalization
-    and droput.
-    """
-    attn_output = multi_head_attention(enc_input, enc_input, enc_input,
-                                       attn_bias, d_key, d_value, d_model,
-                                       n_head, dropout_rate)
-    attn_output = post_process_layer(enc_input, attn_output, "dan",
-                                     dropout_rate)
-    ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model)
-    return post_process_layer(attn_output, ffd_output, "dan", dropout_rate)
-
-
-def encoder(enc_input,
-            attn_bias,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            dropout_rate=0.):
-    """
-    The encoder is composed of a stack of identical layers returned by calling
-    encoder_layer.
-    """
-    for i in range(n_layer):
-        enc_output = encoder_layer(enc_input, attn_bias, n_head, d_key, d_value,
-                                   d_model, d_inner_hid, dropout_rate)
-        enc_input = enc_output
-    return enc_output
-
-
-def decoder_layer(dec_input,
-                  enc_output,
-                  slf_attn_bias,
-                  dec_enc_attn_bias,
-                  n_head,
-                  d_key,
-                  d_value,
-                  d_model,
-                  d_inner_hid,
-                  dropout_rate=0.):
-    """ The layer to be stacked in decoder part.
-
-    The structure of this module is similar to that in the encoder part except
-    a multi-head attention is added to implement encoder-decoder attention.
-    """
-    slf_attn_output = multi_head_attention(
-        dec_input,
-        dec_input,
-        dec_input,
-        slf_attn_bias,
-        d_key,
-        d_value,
-        d_model,
-        n_head,
-        dropout_rate, )
-    slf_attn_output = post_process_layer(
-        dec_input,
-        slf_attn_output,
-        "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
-    enc_attn_output = multi_head_attention(
-        slf_attn_output,
-        enc_output,
-        enc_output,
-        dec_enc_attn_bias,
-        d_key,
-        d_value,
-        d_model,
-        n_head,
-        dropout_rate, )
-    enc_attn_output = post_process_layer(
-        slf_attn_output,
-        enc_attn_output,
-        "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
-    ffd_output = positionwise_feed_forward(
-        enc_attn_output,
-        d_inner_hid,
-        d_model, )
-    dec_output = post_process_layer(
-        enc_attn_output,
-        ffd_output,
-        "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
-    return dec_output
-
-
-def decoder(dec_input,
-            enc_output,
-            dec_slf_attn_bias,
-            dec_enc_attn_bias,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            dropout_rate=0.):
-    """
-    The decoder is composed of a stack of identical decoder_layer layers.
-    """
-    for i in range(n_layer):
-        dec_output = decoder_layer(
-            dec_input,
-            enc_output,
-            dec_slf_attn_bias,
-            dec_enc_attn_bias,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            dropout_rate, )
-        dec_input = dec_output
-    return dec_output
-
-
-def build_inputs(max_length, n_head):
-    names = [
-        'src_word',
-        'src_pos',
-        'trg_word',
-        'trg_pos',
-        'src_slf_attn_bias',
-        'trg_slf_attn_bias',
-        'trg_src_attn_bias',
-        'gold',
-        'weights',
-    ]
-
-    shapes = [
-        [batch_size * max_length, 1],
-        [batch_size * max_length, 1],
-        [batch_size * max_length, 1],
-        [batch_size * max_length, 1],
-        [batch_size, n_head, max_length, max_length],
-        [batch_size, n_head, max_length, max_length],
-        [batch_size, n_head, max_length, max_length],
-        [batch_size * max_length, 1],
-        [batch_size * max_length, 1],
-    ]
-
-    dtypes = [
-        'int64',
-        'int64',
-        'int64',
-        'int64',
-        'float32',
-        'float32',
-        'float32',
-        'int64',
-        'float32',
-    ]
-
-    all_inputs = []
-    for name, shape, dtype in zip(names, shapes, dtypes):
-        all_inputs.append(
-            fluid.layers.data(
-                name=name, shape=shape, dtype=dtype, append_batch_size=False))
-    return all_inputs
-
-
-def transformer(
-        src_vocab_size,
-        trg_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        src_pad_idx,
-        trg_pad_idx,
-        pos_pad_idx, ):
-
-    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = build_inputs(
-        max_length, n_head)
-
-    enc_input = prepare_encoder(
-        src_word,
-        src_pos,
-        src_vocab_size,
-        d_model,
-        src_pad_idx,
-        max_length,
-        dropout_rate, )
-    enc_output = encoder(
-        enc_input,
-        src_slf_attn_bias,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate, )
-
-    dec_input = prepare_decoder(
-        trg_word,
-        trg_pos,
-        trg_vocab_size,
-        d_model,
-        trg_pad_idx,
-        max_length,
-        dropout_rate, )
-    dec_output = decoder(
-        dec_input,
-        enc_output,
-        trg_slf_attn_bias,
-        trg_src_attn_bias,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate, )
-
-    # TODO(guosheng): Share the weight matrix between the embedding layers and
-    # the pre-softmax linear transformation.
-    predict = layers.reshape(
-        x=layers.fc(input=dec_output,
-                    size=trg_vocab_size,
-                    param_attr=fluid.initializer.Xavier(uniform=False),
-                    bias_attr=False,
-                    num_flatten_dims=2),
-        shape=[-1, trg_vocab_size],
-        act="softmax")
-
-    cost = layers.cross_entropy(input=predict, label=gold)
-    weighted_cost = cost * weights
-    return layers.reduce_sum(weighted_cost)
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
deleted file mode 100755
index 81396e4fe9713c215a2b97753a4b68ffa82c0959..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/trainer_desc.py
+++ /dev/null
@@ -1,177 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-from os import path
-__all__ = ['TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer']
-
-
-class TrainerDesc(object):
-    '''
-    Set proto from python to c++.
-    Can be initialized from train_desc.
-    '''
-
-    def __init__(self):
-        '''
-        self.proto_desc = data_feed_pb2.DataFeedDesc()
-        with open(proto_file, 'r') as f:
-            text_format.Parse(f.read(), self.proto_desc)
-        '''
-        # Workaround for relative import in protobuf under python3
-        # TODO: should be fixed
-        cur_path = path.dirname(__file__)
-        sys.path.append(cur_path)
-        sys.path.append(cur_path + "/proto")
-        from proto import trainer_desc_pb2
-        self.proto_desc = trainer_desc_pb2.TrainerDesc()
-        import multiprocessing as mp
-        # set default thread num == cpu count
-        self.proto_desc.thread_num = mp.cpu_count()
-        self._fleet_desc = None
-        self._device_worker = None
-        self._program = None
-        self._infer = False
-
-    def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period):
-        for i, v in enumerate(fetch_vars):
-            self.proto_desc.fetch_config.fetch_var_names.extend([v.name])
-            self.proto_desc.fetch_config.fetch_var_str_format.extend(
-                [fetch_info[i]])
-        self.proto_desc.fetch_config.print_period = print_period
-
-    def _set_debug(self, debug):
-        self.proto_desc.debug = debug
-
-    def _set_thread(self, thread_num):
-        self.proto_desc.thread_num = thread_num
-
-    def _set_device_worker(self, device_worker):
-        self._device_worker = device_worker
-
-    def _set_infer(self, infer):
-        self._infer = infer
-
-    def _set_fleet_desc(self, fleet_desc):
-        self._fleet_desc = fleet_desc
-
-    def _gen_trainer_desc(self):
-        pass
-
-    def _set_program(self, program):
-        self._program = program
-
-    def _set_use_cvm(self, use_cvm=False):
-        self.proto_desc.use_cvm = use_cvm
-
-    def _set_scale_datanorm(self, scale_datanorm=-1):
-        self.proto_desc.scale_datanorm = scale_datanorm
-
-    def _set_dump_slot(self, dump_slot):
-        self.proto_desc.dump_slot = dump_slot
-
-    def _set_mpi_rank(self, mpi_rank):
-        self.proto_desc.mpi_rank = mpi_rank
-
-    def _set_dump_fields(self, dump_fields):
-        for field in dump_fields:
-            self.proto_desc.dump_fields.append(field)
-
-    def _set_dump_fields_path(self, path):
-        self.proto_desc.dump_fields_path = path
-
-    def _set_user_define_dump_filename(self, user_define_dump_filename):
-        self.proto_desc.user_define_dump_filename = user_define_dump_filename
-
-    def _set_dump_converter(self, converter):
-        self.proto_desc.dump_converter = converter
-
-    def _set_adjust_ins_weight(self, config_dict):
-        self.proto_desc.adjust_ins_weight_config.need_adjust = \
-                config_dict.get("need_adjust", False)
-        self.proto_desc.adjust_ins_weight_config.nid_slot = \
-                config_dict.get("nid_slot", "")
-        self.proto_desc.adjust_ins_weight_config.nid_adjw_threshold = \
-                config_dict.get("nid_adjw_threshold", 0.0)
-        self.proto_desc.adjust_ins_weight_config.nid_adjw_ratio = \
-                config_dict.get("nid_adjw_ratio", 0.0)
-        self.proto_desc.adjust_ins_weight_config.ins_weight_slot = \
-                config_dict.get("ins_weight_slot", "")
-
-    def _desc(self):
-        from google.protobuf import text_format
-        return self.proto_desc.SerializeToString()
-
-    def __str__(self):
-        from google.protobuf import text_format
-        return text_format.MessageToString(self.proto_desc)
-
-
-class MultiTrainer(TrainerDesc):
-    '''
-    Implement of MultiTrainer.
-    Can be init from TrainerDesc.
-    '''
-
-    def __init__(self):
-        super(MultiTrainer, self).__init__()
-        pass
-
-    def _set_program(self, program):
-        super(MultiTrainer, self)._set_program(program)
-        self._program = program
-
-    def _gen_trainer_desc(self):
-        super(MultiTrainer, self)._gen_trainer_desc()
-        self.proto_desc.class_name = "MultiTrainer"
-        self._device_worker._set_infer(self._infer)
-        self._device_worker._gen_worker_desc(self.proto_desc)
-
-
-class DistMultiTrainer(TrainerDesc):
-    def __init__(self):
-        super(DistMultiTrainer, self).__init__()
-        pass
-
-    def _set_program(self, program):
-        super(DistMultiTrainer, self)._set_program(program)
-        self._program = program
-
-    def _gen_trainer_desc(self):
-        super(DistMultiTrainer, self)._gen_trainer_desc()
-        self.proto_desc.class_name = "DistMultiTrainer"
-        if self._program == None:
-            raise RuntimeError("None Program")
-        self._device_worker._set_infer(self._infer)
-        self._device_worker._set_program(self._program)
-        self._device_worker._gen_worker_desc(self.proto_desc)
-
-
-class PipelineTrainer(TrainerDesc):
-    def __init__(self):
-        super(PipelineTrainer, self).__init__()
-        pass
-
-    def _set_program(self, program):
-        super(PipelineTrainer, self)._set_program(program)
-        self._program = program
-
-    def _gen_trainer_desc(self):
-        super(PipelineTrainer, self)._gen_trainer_desc()
-        self.proto_desc.class_name = "PipelineTrainer"
-        if self._program == None:
-            raise RuntimeError("None Program")
-        self._device_worker._set_infer(self._infer)
-        self._device_worker._set_program(self._program)
-        self._device_worker._gen_worker_desc(self.proto_desc)
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
deleted file mode 100755
index c0fe000912af86e93b3ddad35f9c3a0925844658..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/trainer_factory.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer
-from .device_worker import Hogwild, DownpourSGD, Section
-
-__all__ = ["TrainerFactory"]
-
-
-class TrainerFactory(object):
-    def __init__(self):
-        pass
-
-    def _create_trainer(self, opt_info=None):
-        trainer = None
-        device_worker = None
-        if opt_info == None:
-            # default is MultiTrainer + Hogwild
-            trainer = MultiTrainer()
-            device_worker = Hogwild()
-            trainer._set_device_worker(device_worker)
-        else:
-            trainer_class = opt_info["trainer"]
-            device_worker_class = opt_info["device_worker"]
-            trainer = globals()[trainer_class]()
-            device_worker = globals()[device_worker_class]()
-            if "fleet_desc" in opt_info:
-                device_worker._set_fleet_desc(opt_info["fleet_desc"])
-                trainer._set_fleet_desc(opt_info["fleet_desc"])
-                if opt_info.get("use_cvm") is not None:
-                    trainer._set_use_cvm(opt_info["use_cvm"])
-                if opt_info.get("scale_datanorm") is not None:
-                    trainer._set_scale_datanorm(opt_info["scale_datanorm"])
-                if opt_info.get("dump_slot") is not None:
-                    trainer._set_dump_slot(opt_info["dump_slot"])
-                if opt_info.get("mpi_rank") is not None:
-                    trainer._set_mpi_rank(opt_info["mpi_rank"])
-                if opt_info.get("dump_fields") is not None:
-                    trainer._set_dump_fields(opt_info["dump_fields"])
-                if opt_info.get("dump_fields_path") is not None:
-                    trainer._set_dump_fields_path(opt_info["dump_fields_path"])
-                if opt_info.get("user_define_dump_filename") is not None:
-                    trainer._set_user_define_dump_filename(opt_info["user_define_dump_filename"])
-                if opt_info.get("dump_converter") is not None:
-                    trainer._set_dump_converter(opt_info["dump_converter"])
-                if opt_info.get("adjust_ins_weight") is not None:
-                    trainer._set_adjust_ins_weight(opt_info["adjust_ins_weight"])
-            trainer._set_device_worker(device_worker)
-        return trainer
diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py
deleted file mode 100644
index c5d2502ddbb4afa1dba1f97e8867174469382abe..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from .distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
-from .memory_optimization_transpiler import memory_optimize, release_memory
-from .ps_dispatcher import HashName, RoundRobin
-
-__all__ = [
-    "DistributeTranspiler",
-    "memory_optimize",
-    "release_memory",
-    "HashName",
-    "RoundRobin",
-    "DistributeTranspilerConfig",
-]
diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py
deleted file mode 100644
index 6b5131e58c6d8ea3e2fd15b75c8ebd9169e21ae1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/collective.py
+++ /dev/null
@@ -1,372 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-import math
-from functools import reduce
-
-import collections
-import six
-import logging
-
-import numpy as np
-
-from .. import core, unique_name
-from ..framework import Program, default_main_program, default_startup_program
-from .details import wait_server_ready
-
-__all__ = ['GradAllReduce', 'LocalSGD']
-
-OpRole = core.op_proto_and_checker_maker.OpRole
-
-
-class Collective(object):
-    '''
-    '''
-
-    def __init__(self, nrings):
-        self.nrings = nrings
-        self.endpoints = None
-        self.current_endpoint = None
-        self.nranks = None
-        self.rank = None
-        self.startup_program = None
-        self.main_program = None
-        op_maker = core.op_proto_and_checker_maker
-        self.op_role_key = op_maker.kOpRoleAttrName()
-        self.op_role_var_key = op_maker.kOpRoleVarAttrName()
-
-    def transpile(self, startup_program, main_program, rank, endpoints,
-                  current_endpoint, wait_port):
-        # in case of '127.0.0.1:6700,127.0.0.1:6701,...'
-        if isinstance(endpoints, str):
-            endpoints = endpoints.split(',')
-
-        self.startup_program = startup_program
-        if startup_program is None:
-            self.startup_program = default_startup_program()
-
-        self.main_program = main_program
-        if main_program is None:
-            self.main_program = default_main_program()
-
-        self.nranks = len(endpoints)
-        if self.nranks == 1:
-            raise ValueError('the number of endpoints must > 1')
-
-        if rank < 0:
-            raise ValueError('rank must >= 0')
-        self.rank = rank
-
-        if current_endpoint not in endpoints:
-            raise ValueError('current endpoint %s is not in %s',
-                             current_endpoint, str(endpoints))
-
-        self.endpoints = endpoints
-        self.current_endpoint = current_endpoint
-
-        self.wait_port = wait_port
-
-        self.startup_program._origin_program = self.startup_program.clone()
-        self._transpile_startup_program()
-
-        self.main_program._origin_program = self.main_program.clone()
-        self._transpile_main_program()
-
-    def _transpile_main_program(self):
-        raise NotImplementedError('call the inherited method of subclasses')
-
-    def _transpile_startup_program(self):
-        for ring_id in range(self.nrings):
-            self._init_communicator(self.startup_program, self.current_endpoint,
-                                    self.endpoints, self.rank, ring_id,
-                                    self.wait_port)
-        self._broadcast_params()
-
-    def _init_communicator(self, program, current_endpoint, endpoints, rank,
-                           ring_id, wait_port):
-        nranks = len(endpoints)
-        other_endpoints = endpoints[:]
-        other_endpoints.remove(current_endpoint)
-        if rank == 0 and wait_port:
-            wait_server_ready(other_endpoints)
-
-        block = program.global_block()
-        nccl_id_var = block.create_var(
-            name=unique_name.generate('nccl_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-        block.append_op(
-            type='c_gen_nccl_id',
-            inputs={},
-            outputs={'Out': nccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints,
-                self.op_role_key: OpRole.Forward
-            })
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': nccl_id_var},
-            outputs={},
-            attrs={
-                'nranks': nranks,
-                'rank': rank,
-                'ring_id': ring_id,
-                self.op_role_key: OpRole.Forward
-            })
-
-    def _broadcast_params(self):
-        block = self.startup_program.global_block()
-        ring_id = -1
-        for param in block.iter_parameters():
-            if param.is_distributed:
-                continue
-
-            ring_id = (ring_id + 1) % self.nrings
-            block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': 0,
-                    self.op_role_key: OpRole.Forward
-                })
-
-        for ring_id in range(self.nrings):
-            block.append_op(
-                type='c_sync_comm_stream',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={'ring_id': ring_id,
-                       self.op_role_key: OpRole.Forward})
-
-    def _is_loss_grad_op(self, op):
-        if self.op_role_key not in op.attr_names:
-            return False
-        op_role = int(op.all_attrs()[self.op_role_key])
-        return op_role & int(OpRole.Backward) and op_role & int(OpRole.Loss)
-
-    def _is_backward_op(self, op):
-        return self.op_role_key in op.attr_names and \
-                int(op.all_attrs()[self.op_role_key]) & int(OpRole.Backward)
-
-    def _is_update_op(self, op):
-        return 'Param' in op.input_names and 'Grad' in op.input_names and \
-                "LearningRate" in op.input_names
-
-    def _is_optimizer_op(self, op):
-        return self.op_role_key in op.attr_names and \
-                int(op.all_attrs()[self.op_role_key]) & int(OpRole.Optimize)
-
-
-class GradAllReduce(Collective):
-    '''
-    '''
-
-    def __init__(self, nrings=2):
-        Collective.__init__(self, nrings)
-
-    def _transpile_main_program(self):
-        self._insert_scale_loss_grad_ops()
-        self._insert_allreduce_ops()
-
-    def _insert_scale_loss_grad_ops(self):
-        '''
-        In order to keep the learning rate consistent in different numbers of
-        training workers, we scale the loss grad by the number of workers
-        '''
-        block = self.main_program.global_block()
-        for idx, op in reversed(list(enumerate(block.ops))):
-            if self._is_loss_grad_op(op):
-                loss_grad_var = block.vars[op.output_arg_names[0]]
-                block._insert_op(
-                    idx + 1,
-                    type='scale',
-                    inputs={'X': loss_grad_var},
-                    outputs={'Out': loss_grad_var},
-                    attrs={
-                        'scale': 1.0 / self.nranks,
-                        self.op_role_key: OpRole.Backward
-                    })
-
-    def _insert_allreduce_ops(self):
-        block = self.main_program.global_block()
-        ring_id = -1
-        grad = None
-        for idx, op in reversed(list(enumerate(block.ops))):
-            if self._is_backward_op(op) and \
-                    self.op_role_var_key in op.attr_names:
-                op_role_var = op.all_attrs()[self.op_role_var_key]
-
-                if len(op_role_var) == 0:
-                    continue
-                assert len(op_role_var) % 2 == 0
-
-                offset = idx
-                for i in range(0, len(op_role_var), 2):
-                    param = block.vars[op_role_var[i]]
-                    grad = block.vars[op_role_var[i + 1]]
-                    if param.is_distributed:
-                        continue
-
-                    if offset == idx:
-                        offset += 1
-                        block._insert_op(
-                            offset,
-                            type='c_sync_calc_stream',
-                            inputs={'X': grad},
-                            outputs={'Out': grad},
-                            attrs={self.op_role_key: OpRole.Backward})
-                        offset += 1
-
-                    # As we search ops reversedly, we should insert c_allreduce_sum
-                    # op in the same way to keep the ring_id alternate
-                    ring_id = (ring_id + 1) % self.nrings
-                    block._insert_op(
-                        offset,
-                        type='c_allreduce_sum',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'ring_id': ring_id,
-                            self.op_role_key: OpRole.Backward
-                        })
-
-        if grad is None:
-            return
-
-        for idx, op in enumerate(block.ops):
-            if self._is_optimizer_op(op):
-                for ring_id in range(self.nrings):
-                    block._insert_op(
-                        idx + ring_id,
-                        type='c_sync_comm_stream',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'ring_id': ring_id,
-                            self.op_role_key: OpRole.Backward
-                        })
-                break
-
-
-class LocalSGD(Collective):
-    '''
-    '''
-
-    def __init__(self, nrings=2):
-        Collective.__init__(self, nrings)
-        self.snapshot_key = '@SNAPSHOT'
-
-    def _transpile_startup_program(self):
-        Collective._transpile_startup_program(self)
-
-        block = self.startup_program.global_block()
-        non_dist_params = []
-        for param in block.iter_parameters():
-            if not param.is_distributed:
-                non_dist_params.append(param)
-
-        for param in non_dist_params:
-            snapshot = block.create_var(
-                name=self.snapshot_name(param.name),
-                shape=param.shape,
-                persistable=True,
-                stop_gradient=True)
-            block.append_op(
-                type='assign',
-                inputs={'X': [param]},
-                outputs={'Out': [snapshot]},
-                attrs={self.op_role_key: OpRole.Forward})
-
-    def snapshot_name(self, param_name):
-        return param_name + self.snapshot_key
-
-    def _transpile_main_program(self):
-        block = self.main_program.global_block()
-        ordered_param_snapshot = []
-        ring_id = -1
-        for idx, op in reversed(list(enumerate(block.ops))):
-            if self._is_update_op(op):
-                param = block.vars[op.input('Param')[0]]
-                if param.is_distributed:
-                    continue
-
-                snapshot = block.create_var(
-                    name=self.snapshot_name(param.name),
-                    shape=param.shape,
-                    persistable=True,
-                    stop_gradient=True)
-
-                block._insert_op(
-                    idx + 1,
-                    type='elementwise_sub',
-                    inputs={'X': [snapshot],
-                            'Y': [param]},
-                    outputs={'Out': [param]},
-                    attrs={self.op_role_key: OpRole.Optimize})
-                block._insert_op(
-                    idx + 2,
-                    type='c_sync_calc_stream',
-                    inputs={'X': param},
-                    outputs={'Out': param},
-                    attrs={self.op_role_key: OpRole.Optimize})
-                ring_id = (ring_id + 1) % self.nrings
-                block._insert_op(
-                    idx + 3,
-                    type='c_allreduce_sum',
-                    inputs={'X': [param]},
-                    outputs={'Out': [param]},
-                    attrs={
-                        'ring_id': ring_id,
-                        self.op_role_key: OpRole.Optimize
-                    })
-
-                ordered_param_snapshot.append((param, snapshot))
-
-        for ring_id in range(self.nrings):
-            block.append_op(
-                type='c_sync_comm_stream',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={'ring_id': ring_id,
-                       self.op_role_key: OpRole.Optimize})
-
-        for param_snapshot in reversed(ordered_param_snapshot):
-            param = param_snapshot[0]
-            snapshot = param_snapshot[1]
-            block.append_op(
-                type='scale',
-                inputs={'X': [param]},
-                outputs={'Out': [param]},
-                attrs={
-                    'scale': 1.0 / self.nranks,
-                    self.op_role_key: OpRole.Optimize
-                })
-            block.append_op(
-                type='elementwise_sub',
-                inputs={'X': [snapshot],
-                        'Y': [param]},
-                outputs={'Out': [param]},
-                attrs={self.op_role_key: OpRole.Optimize})
-            block.append_op(
-                type='assign',
-                inputs={'X': [param]},
-                outputs={'Out': [snapshot]},
-                attrs={self.op_role_key: OpRole.Optimize})
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
deleted file mode 100644
index 82d0d336e523ec48c5ceca3b92ff0963c4499123..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from .program_utils import *
-from .ufind import *
-from .checkport import *
-from .vars_distributed import *
diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py
deleted file mode 100644
index 89dd4dd50b0299de986b84f46e889d554030f180..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/details/checkport.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import time
-import socket
-from contextlib import closing
-from six import string_types
-
-
-def wait_server_ready(endpoints):
-    """
-    Wait until parameter servers are ready, use connext_ex to detect
-    port readiness.
-
-    Args:
-        endpoints (list): endpoints string list, like:
-                         ["127.0.0.1:8080", "127.0.0.1:8081"]
-
-    Examples:
-        .. code-block:: python
-
-           wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
-    """
-    assert not isinstance(endpoints, string_types)
-    while True:
-        all_ok = True
-        not_ready_endpoints = []
-        for ep in endpoints:
-            ip_port = ep.split(":")
-            with closing(socket.socket(socket.AF_INET,
-                                       socket.SOCK_STREAM)) as sock:
-                sock.settimeout(2)
-                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
-                if result != 0:
-                    all_ok = False
-                    not_ready_endpoints.append(ep)
-        if not all_ok:
-            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
-            sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) +
-                             "\n")
-            sys.stderr.flush()
-            time.sleep(3)
-        else:
-            break
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
deleted file mode 100644
index dc78ffe70b3dfda75a799583e85b76d8d921e078..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import six
-
-from paddle.fluid import core
-import paddle
-
-
-def delete_ops(block, ops):
-    for op in ops:
-        try:
-            idx = list(block.ops).index(op)
-            block._remove_op(idx)
-        except Exception as e:
-            print(e)
-
-
-def find_op_by_input_arg(block, arg_name):
-    for index, op in enumerate(block.ops):
-        if arg_name in op.input_arg_names:
-            return index
-    return -1
-
-
-def find_op_by_output_arg(block, arg_name, reverse=False):
-    if reverse:
-        pos = len(block.ops) - 1
-        while pos >= 0:
-            op = block.ops[pos]
-            if arg_name in op.output_arg_names:
-                return pos
-            pos -= 1
-    else:
-        for index, op in enumerate(block.ops):
-            if arg_name in op.output_arg_names:
-                return index
-    return -1
-
-
-def get_indent_space(indent, space_num=4):
-    ret = ""
-    for i in range(0, indent * space_num):
-        ret += " "
-
-    return ret
-
-
-def variable_to_code(var):
-    """
-    Get readable codes of fluid variable.
-
-    Args:
-        var: A fluid operator.
-
-    Returns:
-        string: The formatted string.
-    """
-    if var.type == core.VarDesc.VarType.SELECTED_ROWS or var.type == core.VarDesc.VarType.LOD_TENSOR:
-        var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})".\
-            format(i="{", e="}", name=var.name, type=var.type, shape=var.shape, dtype=var.dtype)
-    else:
-        var_str = "{name} : fluid.{type})".\
-            format(i="{", e="}", name=var.name, type=var.type)
-
-    if type(var) == paddle.fluid.framework.Parameter:
-        if var.trainable:
-            var_str = "trainable parameter " + var_str
-        else:
-            var_str = "parameter " + var_str
-    else:
-        var_str = "var " + var_str
-
-    if var.persistable:
-        var_str = "persist " + var_str
-
-    return var_str
-
-
-def op_to_code(op, skip_op_callstack=True):
-    """
-    Get readable codes of fluid operator.
-
-    Args:
-        op: A fluid operator.
-
-    Returns:
-        string: The foramtted string.
-    """
-
-    outputs_str = "{"
-    for i in range(0, len(op.output_names)):
-        outputs_str += "{name}=".format(name=op.output_names[i])
-        o = op.output(op.output_names[i])
-        outputs_str += "{value}".format(value=o)
-        if i != len(op.output_names) - 1:
-            outputs_str += ", "
-    outputs_str += "}"
-
-    inputs_str = "{"
-    for i in range(0, len(op.input_names)):
-        inputs_str += "{name}=".format(name=op.input_names[i])
-        o = op.input(op.input_names[i])
-        inputs_str += "{value}".format(value=o)
-
-        if i != len(op.input_names) - 1:
-            inputs_str += ", "
-    inputs_str += "}"
-
-    attr_names = sorted(op.attr_names)
-    attrs_str = ""
-    for i in range(0, len(attr_names)):
-        name = attr_names[i]
-        if skip_op_callstack and name == "op_callstack":
-            continue
-
-        attr_type = op.desc.attr_type(name)
-        if attr_type == core.AttrType.BLOCK:
-            a = "{name} = block[{value}]".format(
-                name=name, type=attr_type, value=op._block_attr_id(name))
-            attrs_str += a
-            if i != len(attr_names) - 1:
-                attrs_str += ", "
-            continue
-
-        if attr_type == core.AttrType.BLOCKS:
-            a = "{name} = blocks{value}".format(
-                name=name, type=attr_type, value=op._blocks_attr_ids(name))
-            attrs_str += a
-            if i != len(attr_names) - 1:
-                attrs_str += ", "
-            continue
-
-        a = "{name} = {value}".format(
-            name=name, type=attr_type, value=op.desc.attr(name))
-        attrs_str += a
-        if i != len(attr_names) - 1:
-            attrs_str += ", "
-
-    if outputs_str != "{}":
-        op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\
-            format(outputs = outputs_str, op_type=op.type, inputs=inputs_str, attrs=attrs_str)
-    else:
-        op_str = "{op_type}(inputs={inputs}, {attrs})".\
-            format(op_type=op.type, inputs=inputs_str, attrs=attrs_str)
-    return op_str
-
-
-def block_to_code(block, block_idx, fout=None, skip_op_callstack=False):
-    indent = 0
-
-    print(
-        "{0}{1} // block {2}".format(get_indent_space(indent), '{', block_idx),
-        file=fout)
-
-    indent += 1
-    # sort all vars
-    all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
-    for var in all_vars:
-        print(
-            "{}{}".format(get_indent_space(indent), variable_to_code(var[1])),
-            file=fout)
-
-    if len(all_vars) > 0:
-        print("", file=fout)
-
-    for op in block.ops:
-        print(
-            "{}{}".format(
-                get_indent_space(indent), op_to_code(op, skip_op_callstack)),
-            file=fout)
-    indent -= 1
-
-    print("{0}{1}".format(get_indent_space(indent), '}'), file=fout)
-
-
-def program_to_code(prog, fout=None, skip_op_callstack=True):
-    """
-    Print readable codes of fluid program.
-
-    Args:
-        prog : A fluid program.
-
-    An example result like bellow:
-    https://github.com/PaddlePaddle/Paddle/pull/12673
-    """
-    block_idx = 0
-    for block in prog.blocks:
-        block_to_code(block, block_idx, fout, skip_op_callstack)
-        block_idx += 1
diff --git a/python/paddle/fluid/transpiler/details/ufind.py b/python/paddle/fluid/transpiler/details/ufind.py
deleted file mode 100644
index aa63af7dcf7ac85031fb00ca4c39fb36d7e588b8..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/details/ufind.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-
-class UnionFind(object):
-    """ Union-find data structure.
-
-    Union-find is a data structure that keeps track of a set of elements partitioned
-    into a number of disjoint (non-overlapping) subsets.
-
-    Reference:
-    https://en.wikipedia.org/wiki/Disjoint-set_data_structure
-
-    Args:
-      elements(list): The initialize element list.
-    """
-
-    def __init__(self, elementes=None):
-        self._parents = []  # index -> parent index
-        self._index = {}  # element -> index
-        self._curr_idx = 0
-        if not elementes:
-            elementes = []
-        for ele in elementes:
-            self._parents.append(self._curr_idx)
-            self._index.update({ele: self._curr_idx})
-            self._curr_idx += 1
-
-    def find(self, x):
-        # Find the root index of given element x,
-        # execute the path compress while findind the root index
-        if not x in self._index:
-            return -1
-        idx = self._index[x]
-        while idx != self._parents[idx]:
-            t = self._parents[idx]
-            self._parents[idx] = self._parents[t]
-            idx = t
-        return idx
-
-    def union(self, x, y):
-        # Union two given element
-        x_root = self.find(x)
-        y_root = self.find(y)
-
-        if x_root == y_root:
-            return
-        self._parents[x_root] = y_root
-
-    def is_connected(self, x, y):
-        # If two given elements have the same root index,
-        # then they are connected.
-        return self.find(x) == self.find(y)
diff --git a/python/paddle/fluid/transpiler/details/vars_distributed.py b/python/paddle/fluid/transpiler/details/vars_distributed.py
deleted file mode 100644
index 05e7f6e3e706376efc8af870a780d96c45642514..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/details/vars_distributed.py
+++ /dev/null
@@ -1,269 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-from paddle.fluid.framework import Variable
-
-
-class VarStruct(object):
-    """
-    record part properties of a Variable in python.
-    """
-
-    def __init__(self, name, shape, dtype, type, lod_level, persistable):
-        self.name = name
-        self.shape = shape
-        self.dtype = dtype
-        self.type = type
-        self.lod_level = lod_level
-        self.persistable = persistable
-
-
-class VarDistributed(object):
-    """
-    a class to record the var distributed on parameter servers.
-    the class will record the relationship between origin var and slice var.
-    the slice var's properties, such as type/shape/offset/endpoint.
-    """
-
-    def __init__(self,
-                 origin_var,
-                 slice_var,
-                 is_slice=None,
-                 block_id=None,
-                 offset=None,
-                 vtype=None,
-                 endpoint=None):
-        """
-        Args:
-            origin_var(Variable|VarStruct): origin var properties
-            slice_var(Variable|VarStruct): slice var properties
-            is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard.
-            block_id(int|None): the number about the slice var.
-            offset(int|None): if the slice var is sliced, offset is the numel before the var.
-            vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch.
-            endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001"
-        """
-
-        if isinstance(origin_var, Variable):
-            self.origin = self.__create_var_struct(origin_var)
-        else:
-            self.origin = origin_var
-
-        if isinstance(slice_var, Variable):
-            self.slice = self.__create_var_struct(slice_var)
-        else:
-            self.slice = slice_var
-
-        if self.equal(self.origin, self.slice):
-            self.is_slice = False
-            self.block_id = 0
-            self.offset = 0
-        else:
-            self.is_slice = True
-            self.block_id = 0
-            self.offset = 0
-
-        if is_slice is not None:
-            self.is_slice = is_slice
-        if block_id is not None:
-            self.block_id = block_id
-        if offset is not None:
-            self.offset = offset
-
-        self.vtype = vtype
-        self.endpoint = endpoint
-
-    @staticmethod
-    def __create_var_struct(var):
-        return VarStruct(var.name, var.shape, var.dtype, var.type,
-                         var.lod_level, var.persistable)
-
-    @staticmethod
-    def equal(var1, var2):
-        """
-        the two var is equal or not.
-        Returns:
-            bool: equal will return True else False
-        """
-        assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct)
-
-        return var1.name == var2.name and \
-               var1.type == var2.type and \
-               var1.shape == var2.shape and \
-               var1.dtype == var2.dtype and \
-               var1.lod_level == var2.lod_level and \
-               var1.persistable == var2.persistable
-
-    def __str__(self):
-        origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \
-            format(i="{", e="}", name=self.origin.name, type=self.origin.type,
-                   shape=self.origin.shape, dtype=self.origin.dtype)
-
-        slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \
-                        ".slice({is_slice}).block({block_id}).offset({offset})". \
-            format(i="{", e="}", name=self.slice.name, type=self.slice.type,
-                   shape=self.slice.shape, dtype=self.slice.dtype,
-                   is_slice=self.is_slice, block_id=self.block_id, offset=self.offset)
-
-        return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format(
-            self.vtype, origin_var_str, slice_var_str, self.endpoint)
-
-
-class VarsDistributed(object):
-    """
-    a gather about VarDistributed with many methods to find distributed vars.
-    through the class, we can get overview about the distributed parameters on parameter servers.
-    this class may centralized and convenient for developer to manage and get variable's distribute.
-    other module can also use this to find variables such io.py.
-    """
-
-    def __init__(self):
-        self.distributed_vars = []
-
-    def add_distributed_var(self,
-                            origin_var,
-                            slice_var,
-                            is_slice=None,
-                            block_id=None,
-                            offset=None,
-                            vtype=None,
-                            endpoint=None):
-        """
-        add distributed var in this.
-
-        Args:
-            origin_var(Variable|VarStruct): origin var properties
-            slice_var(Variable|VarStruct): slice var properties
-            is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard.
-            block_id(int|None): the number about the slice var.
-            offset(int|None): if the slice var is sliced, offset is the numel before the var.
-            vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch.
-            endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001"
-        Returns:
-            None
-        """
-        self.distributed_vars.append(
-            VarDistributed(origin_var, slice_var, is_slice, block_id, offset,
-                           vtype, endpoint))
-
-    def get_distributed_var_by_slice(self, var_name):
-        """
-        get distributed var by conditions.
-
-        Args:
-            var_name(str): slice var name, such as "w.traier0.block1"
-        Returns:
-            VarDistributed: distributed var.
-        """
-        for dist_var in self.distributed_vars:
-            if dist_var.slice.name == var_name:
-                return dist_var
-        return None
-
-    @staticmethod
-    def equal(var1, var2):
-        """
-        the two var is equal or not.
-        Returns:
-            bool: equal will return True else False
-        """
-        return var1.name == var2.name and \
-               var1.type == var2.type and \
-               var1.shape == var2.shape and \
-               var1.dtype == var2.dtype and \
-               var1.lod_level == var2.lod_level and \
-               var1.persistable == var2.persistable
-
-    def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint):
-        """
-        get distributed var by conditions.
-
-        Args:
-            origin_var_name(str):
-            endpoint(str): the parameter endpoint, such as "127.0.0.1:1001"
-        Returns:
-            VarDistributed: distributed var.
-        """
-        for dist_var in self.distributed_vars:
-            if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint:
-                return dist_var
-        return None
-
-    def get_distributed_vars_by_vtypes(self, vtypes, groupby=False):
-        """
-        get distributed vars by conditions.
-
-        Args:
-            vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
-            groupby(bool|False): group by origin var or not.
-
-        Returns:
-            list: distributed var list.
-            dict: distributed var map when groupby=True
-        """
-        vtype_vars = []
-        for var in self.distributed_vars:
-            if var.vtype in vtypes:
-                vtype_vars.append(var)
-        if not groupby:
-            return vtype_vars
-
-        params_map = {}
-        for var in vtype_vars:
-            origin_var_name = var.origin.name
-
-            if origin_var_name in params_map.keys():
-                optimizers = params_map.get(origin_var_name)
-            else:
-                optimizers = []
-            optimizers.append(var)
-            params_map[origin_var_name] = optimizers
-        return params_map
-
-    def get_distributed_vars_by_ep(self, endpoint, vtype=None):
-        """
-        get distributed vars by conditions.
-
-        Args:
-            endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001"
-            vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
-
-        Returns:
-            list: distributed var list.
-        """
-        endpoint_vars = []
-        for var in self.distributed_vars:
-            if var.endpoint == endpoint:
-                endpoint_vars.append(var)
-        if not vtype:
-            return endpoint_vars
-
-        vtype_vars = []
-        for var in endpoint_vars:
-            if var.vtype == vtype:
-                vtype_vars.append(var)
-        return vtype_vars
-
-    def overview(self):
-        """
-        get the overview string about all params on all parameter servers.
-
-        Returns:
-            Str: overview string.
-
-        """
-        vars_str = []
-        for var in self.distributed_vars:
-            vars_str.append(str(var))
-        return "\n".join(vars_str)
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
deleted file mode 100644
index 54aaa138a18c9cd943c4ab145ea6477e27946e8b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ /dev/null
@@ -1,2410 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-"""
-Steps to transpile trainer:
-1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
-2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
-3. modify trainer program add split_op to each grad variable.
-4. append send_op to send splited variables to server and
-5. add recv_op to fetch params(splited blocks or origin param) from server.
-6. append concat_op to merge splited blocks to update local weights.
-
-Steps to transpile pserver:
-1. create new program for parameter server.
-2. create params and grad variables that assigned to current server instance.
-3. create a sub-block in the server side program
-4. append ops that should run on current server instance.
-5. add listen_and_serv op
-"""
-
-import sys
-import math
-from functools import reduce
-
-import collections
-import six
-import logging
-
-import numpy as np
-
-from .ps_dispatcher import RoundRobin, PSDispatcher
-from .. import core, framework, unique_name
-from ..framework import Program, default_main_program, \
-    default_startup_program, Block, Parameter, grad_var_name
-from .details import wait_server_ready, UnionFind, VarStruct, VarsDistributed
-from .details import delete_ops, find_op_by_output_arg
-from ..distribute_lookup_table import find_distributed_lookup_table
-from . import collective
-
-LOOKUP_TABLE_TYPE = "lookup_table"
-LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
-OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
-RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
-)
-OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
-RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
-DIST_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Dist
-LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
-
-PRINT_LOG = False
-
-
-def log(*args):
-    if PRINT_LOG:
-        print(args)
-
-
-class VarBlock:
-    def __init__(self, varname, offset, size):
-        self.varname = varname
-        # NOTE: real offset is offset * size
-        self.offset = offset
-        self.size = size
-
-    def __str__(self):
-        return "%s:%d:%d" % (self.varname, self.offset, self.size)
-
-
-def same_or_split_var(p_name, var_name):
-    return p_name == var_name or p_name.startswith(var_name + ".block")
-
-
-def slice_variable(var_list, slice_count, min_block_size):
-    """
-    We may need to split dense tensor to one or more blocks and put
-    them equally onto parameter server. One block is a sub-tensor
-    aligned by dim[0] of the tensor.
-
-    We need to have a minimal block size so that the calculations in
-    the parameter server side can gain better performance. By default
-    minimum block size 8K elements (maybe 16bit or 32bit or 64bit).
-
-    Args:
-        var_list (list): List of variables.
-        slice_count (int): Numel of count that variables will be sliced, which
-            could be the pserver services' count.
-        min_block_size (int): Minimum splitted block size.
-    Returns:
-        blocks (list[(varname, block_id, current_block_size)]): A list
-            of VarBlocks. Each VarBlock specifies a shard of the var.
-    """
-    blocks = []
-    for var in var_list:
-        split_count = slice_count
-        var_numel = reduce(lambda x, y: x * y, var.shape)
-        max_pserver_count = int(math.floor(var_numel / float(min_block_size)))
-        if max_pserver_count == 0:
-            max_pserver_count = 1
-        if max_pserver_count < slice_count:
-            split_count = max_pserver_count
-        block_size = int(math.ceil(var_numel / float(split_count)))
-
-        if len(var.shape) >= 2:
-            # align by dim1(width)
-            dim1 = reduce(lambda x, y: x * y, var.shape[1:])
-            remains = block_size % dim1
-            if remains != 0:
-                block_size += dim1 - remains
-        # update split_count after aligning
-        split_count = int(math.ceil(var_numel / float(block_size)))
-        for block_id in range(split_count):
-            curr_block_size = min(block_size, var_numel - (
-                (block_id) * block_size))
-            block = VarBlock(var.name, block_id, curr_block_size)
-            blocks.append(str(block))
-    return blocks
-
-
-class DistributeTranspilerConfig(object):
-    """
-    .. py:attribute:: slice_var_up (bool)
-
-          Do Tensor slice for pservers, default is True.
-
-    .. py:attribute:: split_method (PSDispatcher)
-
-          RoundRobin or HashName can be used.
-          Try to choose the best method to balance loads for pservers.
-
-    .. py:attribute:: min_block_size (int)
-
-          Minimum number of splitted elements in block.
-
-          According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
-          We can use bandwidth effiently when data size is larger than 2MB.If you
-          want to change it, please be sure you have read the slice_variable function.
-
-    Examples:
-        .. code-block:: python
-
-            config = fluid.DistributeTranspilerConfig()
-            config.slice_var_up = True
-    """
-
-    slice_var_up = True
-    split_method = None
-    min_block_size = 8192
-    enable_dc_asgd = False
-    # supported modes: pserver, nccl2, collective
-    mode = "pserver"
-    print_log = False
-    wait_port = True
-    # split the send recv var in runtime
-    _runtime_split_send_recv = False
-    _sync_mode = True
-
-    nccl_comm_num = 1
-    #The picture here illustrates the principle:
-    #https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
-    use_hierarchical_allreduce = False
-    #Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu cards' number in most cases.
-    hierarchical_allreduce_inter_nranks = 0
-
-    # if mode is collective
-    # supported modes: grad_allreduce, local_sgd
-    collective_mode = None
-
-    def __init__(self):
-        pass
-
-    @property
-    def runtime_split_send_recv(self):
-        return self._runtime_split_send_recv
-
-    @runtime_split_send_recv.setter
-    def runtime_split_send_recv(self, value):
-        if value is None:
-            raise ValueError("runtime_split_send_recv can't be None")
-        if value and self._sync_mode:
-            raise ValueError(
-                "if you want to set runtime_split_send_recv to be true, make ensure config.sync_mode is false at first"
-            )
-        self._runtime_split_send_recv = value
-
-    @property
-    def sync_mode(self):
-        return self._sync_mode
-
-    @sync_mode.setter
-    def sync_mode(self, value):
-        if value is None:
-            raise ValueError("sync_mode can't be None")
-        if value and self._runtime_split_send_recv:
-            raise ValueError(
-                "if you want to set sync_mode to be true, make ensure config.runtime_split_send_recv is false at first"
-            )
-        self._sync_mode = value
-
-
-class DistributeTranspiler(object):
-    """
-    **DistributeTranspiler**
-
-    Convert the fluid program to distributed data-parallelism programs.
-    Supports two modes: pserver mode and nccl2 mode.
-
-    In pserver mode, the main_program will be transformed to use a remote
-    parameter server to do parameter optimization. And the optimization
-    graph will be put into a parameter server program.
-
-    In nccl2 mode, the transpiler will append a NCCL_ID broadcasting
-    op in startup_program to share the NCCL_ID across the job nodes.
-    After transpile_nccl2 called, you ***must*** pass trainer_id and
-    num_trainers argument to ParallelExecutor to enable NCCL2 distributed
-    mode.
-
-    Examples:
-        .. code-block:: python
-
-            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            y_predict = fluid.layers.fc(input=x, size=1, act=None)
-
-            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_loss = fluid.layers.mean(cost)
-
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-            sgd_optimizer.minimize(avg_loss)
-
-            # for pserver mode
-            pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-            trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-            current_endpoint = "192.168.0.1:6174"
-            trainer_id = 0
-            trainers = 4
-            role = "PSERVER"
-            t = fluid.DistributeTranspiler()
-            t.transpile(
-                 trainer_id, pservers=pserver_endpoints, trainers=trainers)
-            if role == "PSERVER":
-                 pserver_program = t.get_pserver_program(current_endpoint)
-                 pserver_startup_program = t.get_startup_program(current_endpoint,
-                                                                pserver_program)
-            elif role == "TRAINER":
-                 trainer_program = t.get_trainer_program()
-
-            # for nccl2 mode
-            trainer_num = 2
-            trainer_id = 0
-            config = fluid.DistributeTranspilerConfig()
-            config.mode = "nccl2"
-            trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-            t = fluid.DistributeTranspiler(config=config)
-            t.transpile(trainer_id=trainer_id, trainers=trainer_endpoints, current_endpoint="192.168.0.1:6174")
-            exe = fluid.ParallelExecutor(
-                use_cuda=True,
-                loss_name=avg_loss.name,
-                num_trainers=trainer_num,
-                trainer_id=trainer_id
-            )
-    """
-
-    def __init__(self, config=None):
-        if config is not None:
-            self.config = config
-        else:
-            self.config = DistributeTranspilerConfig()
-
-        if self.config.split_method is None:
-            self.config.split_method = RoundRobin
-
-        global PRINT_LOG
-        if self.config.print_log:
-            PRINT_LOG = True
-        assert (self.config.min_block_size >= 8192)
-        assert (self.config.split_method.__bases__[0] == PSDispatcher)
-
-    def _transpile_nccl2(self,
-                         trainer_id,
-                         trainers,
-                         current_endpoint,
-                         startup_program=None,
-                         wait_port=True):
-        if not startup_program:
-            startup_program = default_startup_program()
-        if trainer_id >= 0:
-            worker_endpoints = trainers.split(",")
-            # send NCCL_ID to others or recv from trainer 0
-            worker_endpoints.remove(current_endpoint)
-            if trainer_id == 0 and wait_port:
-                wait_server_ready(worker_endpoints)
-
-            nccl_id_var = startup_program.global_block().create_var(
-                name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
-
-            for i in range(1, self.config.nccl_comm_num):
-                startup_program.global_block().create_var(
-                    name="NCCLID_{}".format(i),
-                    persistable=True,
-                    type=core.VarDesc.VarType.RAW)
-
-            if self.config.use_hierarchical_allreduce:
-                for i in range(0, self.config.nccl_comm_num):
-                    startup_program.global_block().create_var(
-                        name="Hierarchical_inter_NCCLID_{}".format(i),
-                        persistable=True,
-                        type=core.VarDesc.VarType.RAW)
-                    startup_program.global_block().create_var(
-                        name="Hierarchical_exter_NCCLID_{}".format(i),
-                        persistable=True,
-                        type=core.VarDesc.VarType.RAW)
-
-            startup_program.global_block().append_op(
-                type="gen_nccl_id",
-                inputs={},
-                outputs={"NCCLID": nccl_id_var},
-                attrs={
-                    "trainers": trainers.split(","),
-                    "trainer_id": trainer_id,
-                    "nccl_comm_num": self.config.nccl_comm_num,
-                    "use_hierarchical_allreduce":
-                    self.config.use_hierarchical_allreduce,
-                    "hierarchical_allreduce_inter_nranks":
-                    self.config.hierarchical_allreduce_inter_nranks
-                })
-            return nccl_id_var
-        else:
-            raise ValueError("must set trainer_id > 0")
-
-    def _transpile_collective(self,
-                              collective_mode,
-                              trainer_id,
-                              trainers,
-                              current_endpoint,
-                              startup_program=None,
-                              main_program=None,
-                              wait_port=True):
-        if isinstance(trainers, str):
-            endpoints = trainers.split(",")
-        elif isinstance(trainers, list):
-            endpoints = trainers
-        else:
-            raise ValueError('invalid trainers config: ' + str(trainers))
-
-        if len(endpoints) == 1:
-            raise ValueError('invalid trainer number in distributed: 1')
-
-        if startup_program is None:
-            startup_program = default_startup_program()
-
-        if main_program is None:
-            main_program = default_main_program()
-
-        transpiler = None
-        if collective_mode == 'grad_allreduce':
-            transpiler = collective.GradAllReduce(self.config.nccl_comm_num)
-        elif collective_mode == 'local_sgd':
-            transpiler = collective.LocalSGD(self.config.nccl_comm_num)
-        else:
-            raise ValueError('invalid collective_mode: %s' % collective_mode)
-
-        transpiler.transpile(
-            startup_program=startup_program,
-            main_program=main_program,
-            rank=trainer_id,
-            endpoints=endpoints,
-            current_endpoint=current_endpoint,
-            wait_port=wait_port)
-
-    def _get_all_remote_sparse_update_op(self, main_program):
-        sparse_update_ops = []
-        sparse_update_op_types = ["lookup_table", "nce", "hierarchical_sigmoid"]
-        for op in main_program.global_block().ops:
-            if op.type in sparse_update_op_types and op.attr(
-                    'remote_prefetch') is True:
-                sparse_update_ops.append(op)
-        return sparse_update_ops
-
-    def _update_remote_sparse_update_op(self, program,
-                                        need_sparse_update_params):
-
-        for param_varname, attrs in need_sparse_update_params.items():
-            height_sections = self.sparse_param_to_height_sections[
-                param_varname]
-            endpoints = attrs[0]
-            table_names = attrs[1]
-
-            ops = []
-            op_type = ""
-            used_ops = []
-
-            for idx, op in enumerate(self.sparse_update_ops):
-                if param_varname in op.input_arg_names and op_type == "":
-                    op_type = op.type
-                    ops.append(op)
-                    used_ops.append(idx)
-
-                elif param_varname in op.input_arg_names and op_type == op.type:
-                    ops.append(op)
-                    used_ops.append(idx)
-
-            if op_type == "lookup_table":
-                all_ops = program.global_block().ops
-                op_idxs = [all_ops.index(op) for op in ops]
-                inputs = [
-                    program.global_block().vars[op.input("Ids")[0]]
-                    for op in ops
-                ]
-                w = program.global_block().vars[ops[0].input("W")[0]]
-                padding_idx = ops[0].attr("padding_idx")
-                outputs = [
-                    program.global_block().vars[op.output("Out")[0]]
-                    for op in ops
-                ]
-
-                for idx in op_idxs[::-1]:
-                    program.global_block()._remove_op(idx)
-
-                inputs_idxs = [-1] * len(inputs)
-                outputs_idxs = [-1] * len(outputs)
-
-                for idx, op in enumerate(program.global_block().ops):
-                    for i in range(0, len(op.output_names)):
-                        outs = op.output(op.output_names[i])
-                        for in_id, in_var in enumerate(inputs):
-                            if in_var.name in outs:
-                                inputs_idxs[in_id] = idx
-                    for i in range(0, len(op.input_names)):
-                        ins = op.input(op.input_names[i])
-                        for out_id, out_var in enumerate(outputs):
-                            if out_var.name in ins:
-                                outputs_idxs[out_id] = idx
-
-                if min(outputs_idxs) - max(inputs_idxs) >= 1:
-                    distributed_idx = max(inputs_idxs) + 1
-
-                    program.global_block()._insert_op(
-                        index=distributed_idx,
-                        type="distributed_lookup_table",
-                        inputs={"Ids": inputs,
-                                'W': w},
-                        outputs={"Outputs": outputs},
-                        attrs={
-                            "table_names": table_names,
-                            "height_sections": height_sections,
-                            "endpoints": endpoints,
-                            "padding_idx": padding_idx,
-                            "trainer_id": self.trainer_id
-                        })
-                else:
-                    raise ValueError(
-                        "something wrong with distribute_transpiler, submit a issue is recommended"
-                    )
-
-                for idx in used_ops[::-1]:
-                    self.sparse_update_ops.pop(idx)
-
-    def _is_input_of_remote_sparse_update_op(self, param_name):
-        for op in self.sparse_update_ops:
-            if param_name in op.input_arg_names:
-                return True
-        return False
-
-    def transpile(self,
-                  trainer_id,
-                  program=None,
-                  pservers="127.0.0.1:6174",
-                  trainers=1,
-                  sync_mode=True,
-                  startup_program=None,
-                  current_endpoint="127.0.0.1:6174"):
-        """
-        Run the transpiler. Transpile the input program.
-
-        Args:
-            trainer_id (int): id for current trainer worker, if you have
-                n workers, the id may range from 0 ~ n-1
-            program (Program|None): program to transpile,
-                default is fluid.default_main_program().
-            startup_program (Program|None): startup_program to transpile,
-                default is fluid.default_startup_program().
-            pservers (str): comma separated ip:port string for the pserver
-                list.
-            trainers (int|str): in pserver mode this is the number of
-                trainers, in nccl2 mode this is a string of trainer
-                endpoints.
-            sync_mode (bool): Do sync training or not, default is True.
-            startup_program (Program|None): startup_program to transpile,
-                default is fluid.default_main_program().
-            current_endpoint (str): need pass current endpoint when
-                transpile as nccl2 distributed mode. In pserver mode
-                this argument is not used.
-
-        Examples:
-            .. code-block:: python
-
-                transpiler = fluid.DistributeTranspiler()
-                t.transpile(
-                    trainer_id=0,
-                    pservers="127.0.0.1:7000,127.0.0.1:7001",
-                    trainers=2,
-                    sync_mode=False,
-                    current_endpoint="127.0.0.1:7000")
-        """
-        if program is None:
-            program = default_main_program()
-        if startup_program is None:
-            startup_program = default_startup_program()
-        self.origin_program = program
-        self.startup_program = startup_program
-        self.origin_startup_program = self.startup_program.clone()
-
-        if self.config.mode == "nccl2":
-            assert (isinstance(trainers, str))
-            self.origin_program._trainers_endpoints = trainers.split(",")
-            self.origin_program._nccl_comm_num = self.config.nccl_comm_num
-            self.origin_program._use_hierarchical_allreduce = self.config.use_hierarchical_allreduce
-            # check use_hierarchical_allreduce options
-            if self.config.use_hierarchical_allreduce:
-                trainers_num = len(self.origin_program._trainers_endpoints)
-                # selected automaticly
-                if self.config.hierarchical_allreduce_inter_nranks <= 1:
-                    self.config.hierarchical_allreduce_inter_nranks = core.get_cuda_device_count(
-                    )
-
-                assert trainers_num > self.config.hierarchical_allreduce_inter_nranks, \
-                    "trainers_num:{} < hierarchical_allreduce_inter_nranks:{}".format(trainers_num, self.config.hierarchical_allreduce_inter_nranks)
-
-                assert trainers_num % self.config.hierarchical_allreduce_inter_nranks == 0, \
-                    "trainers_num:{} mod hierarchical_allreduce_inter_nranks:{} != 0".format(trainers_num, self.config.hierarchical_allreduce_inter_nranks)
-
-                self.origin_program._hierarchical_allreduce_inter_nranks = \
-                    int(self.config.hierarchical_allreduce_inter_nranks)
-
-            self._transpile_nccl2(
-                trainer_id,
-                trainers,
-                current_endpoint,
-                startup_program=startup_program,
-                wait_port=self.config.wait_port)
-            return
-
-        if self.config.mode == "collective":
-            self._transpile_collective(
-                collective_mode=self.config.collective_mode,
-                trainer_id=trainer_id,
-                trainers=trainers,
-                current_endpoint=current_endpoint,
-                startup_program=startup_program,
-                main_program=program,
-                wait_port=self.config.wait_port)
-            return
-
-        self.trainer_num = trainers
-        self.sync_mode = sync_mode
-        self.trainer_id = trainer_id
-        pserver_endpoints = pservers.split(",")
-        self.pserver_endpoints = pserver_endpoints
-        self.vars_overview = VarsDistributed()
-        self.optimize_ops, self.params_grads = self._get_optimize_pass()
-
-        ps_dispatcher = self.config.split_method(self.pserver_endpoints)
-        self.table_name = find_distributed_lookup_table(self.origin_program)
-        self.has_distributed_lookup_table = self.table_name != None
-        self.param_name_to_grad_name = dict()
-        self.grad_name_to_param_name = dict()
-        for param_var, grad_var in self.params_grads:
-            self.param_name_to_grad_name[param_var.name] = grad_var.name
-            self.grad_name_to_param_name[grad_var.name] = param_var.name
-
-        # get all sparse update ops
-        self.sparse_update_ops = self._get_all_remote_sparse_update_op(
-            self.origin_program)
-        # use_sparse_update_param_name -> split_height_section
-        self.sparse_param_to_height_sections = dict()
-
-        # add distributed attrs to program
-        self.origin_program._is_distributed = True
-        self.origin_program._endpoints = self.pserver_endpoints
-        self.origin_program._ps_endpoint = current_endpoint
-        self.origin_program._is_chief = self.trainer_id == 0
-        self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None
-
-        # split and create vars, then put splited vars in dicts for later use.
-        # step 1: split and create vars, then put splited vars in dicts for later use.
-        self._init_splited_vars()
-
-        # step 2: insert send op to send gradient vars to parameter servers
-        ps_dispatcher.reset()
-        send_vars = []
-
-        # in general cases, the number of pservers is times of 2, and this
-        # will lead to uneven distribution among weights and bias:
-        #       fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
-        #       fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
-        # shuffle the map will avoid the uneven distribution above
-        grad_var_mapping_items = list(six.iteritems(self.grad_var_mapping))
-
-        if not self.config.slice_var_up:
-            np.random.seed(self.origin_program.random_seed)
-            np.random.shuffle(grad_var_mapping_items)
-
-        self.grad_name_to_send_dummy_out = dict()
-        for grad_varname, splited_vars in grad_var_mapping_items:
-            eplist = ps_dispatcher.dispatch(splited_vars)
-
-            if not self.config.slice_var_up:
-                assert (len(splited_vars) == 1)
-
-            splited_grad_varname = grad_varname
-            if len(splited_vars) == 1:
-                splited_grad_varname = splited_vars[0].name
-                index = find_op_by_output_arg(
-                    program.global_block(), splited_grad_varname, reverse=True)
-
-            elif len(splited_vars) > 1:
-                orig_var = program.global_block().vars[splited_grad_varname]
-                index = find_op_by_output_arg(
-                    program.global_block(), splited_grad_varname, reverse=True)
-
-                if not self.config.runtime_split_send_recv:
-                    self._insert_split_op(program, orig_var, index,
-                                          splited_vars)
-                    index += 1
-            else:
-                AssertionError("Can not insert the send op by original "
-                               "variable name :", splited_grad_varname)
-
-            if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
-                sparse_param_name = self.grad_name_to_param_name[grad_varname]
-                if self._is_input_of_remote_sparse_update_op(sparse_param_name):
-                    self.sparse_param_to_height_sections[sparse_param_name] = [
-                        splited_var.shape[0] for splited_var in splited_vars
-                    ]
-
-            dummy_output = program.global_block().create_var(
-                name=framework.generate_control_dev_var_name())
-            self.grad_name_to_send_dummy_out[grad_varname] = dummy_output
-
-            if self.config.runtime_split_send_recv:
-                send_input_vars = [
-                    program.global_block().vars[splited_grad_varname]
-                ]
-                sections = self._get_splited_var_sections(splited_vars)
-                send_varnames = [var.name for var in splited_vars]
-            else:
-                send_input_vars = splited_vars
-                sections = []
-                send_varnames = []
-
-            # get send op_role_var, if not splited, the grad should have .trainer suffix
-            # if splited, grad should be the original grad var name (split_by_ref and send
-            # will be on the same place). ParallelExecutor
-            # will use op_role_var to get expected device place to run this op.
-            program.global_block()._insert_op(
-                index=index + 1,
-                type="send",
-                inputs={"X": send_input_vars},
-                outputs={"Out": dummy_output},
-                attrs={
-                    "epmap": eplist,
-                    "sections": sections,
-                    "send_varnames": send_varnames,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
-                    OP_ROLE_VAR_ATTR_NAME: [
-                        self.grad_name_to_param_name[grad_varname],
-                        splited_grad_varname
-                    ]
-                })
-            for _, var in enumerate(splited_vars):
-                send_vars.append(var)
-
-        if self.sync_mode:
-            send_barrier_out = program.global_block().create_var(
-                name=framework.generate_control_dev_var_name())
-            if self.has_distributed_lookup_table:
-                self.grad_name_to_send_dummy_out[
-                    self.table_name] = program.global_block().create_var(
-                        name=framework.generate_control_dev_var_name())
-            input_deps = list(self.grad_name_to_send_dummy_out.values())
-
-            program.global_block().append_op(
-                type="send_barrier",
-                inputs={"X": list(input_deps)},
-                outputs={"Out": send_barrier_out},
-                attrs={
-                    "endpoints": pserver_endpoints,
-                    "trainer_id": self.trainer_id,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-                })
-
-        # step 3: insert recv op to receive parameters from parameter server
-        recv_vars = []
-        for _, var in enumerate(send_vars):
-            recv_vars.append(self.grad_param_mapping[var])
-        ps_dispatcher.reset()
-        eplist = ps_dispatcher.dispatch(recv_vars)
-
-        for i, ep in enumerate(eplist):
-            self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
-            self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
-
-            distributed_var = self.vars_overview.get_distributed_var_by_slice(
-                recv_vars[i].name)
-            distributed_var.endpoint = ep
-
-        need_sparse_update_params = {}
-
-        # step4: Concat the parameters splits together after recv.
-        all_recv_outputs = []
-        for param_varname, splited_var in six.iteritems(self.param_var_mapping):
-            eps = []
-            table_names = []
-            for var in splited_var:
-                index = [v.name for v in recv_vars].index(var.name)
-                eps.append(eplist[index])
-                table_names.append(var.name)
-            if self.sync_mode:
-                recv_dep_in = send_barrier_out
-            else:
-                # connect deps to send op in async mode
-                recv_dep_in = self.grad_name_to_send_dummy_out[
-                    self.param_name_to_grad_name[param_varname]]
-
-            # get recv op_role_var, if not splited, the grad should have .trainer suffix
-            # if splited, grad should be the original grad var name. ParallelExecutor
-            # will use op_role_var to get expected device place to run this op.
-            orig_grad_name = self.param_name_to_grad_name[param_varname]
-            recv_op_role_var_name = orig_grad_name
-            splited_trainer_grad = self.grad_var_mapping[orig_grad_name]
-            if len(splited_trainer_grad) == 1:
-                recv_op_role_var_name = splited_trainer_grad[0].name
-
-            if param_varname in self.sparse_param_to_height_sections:
-                for table_name in table_names:
-                    distributed_var = self.vars_overview.get_distributed_var_by_slice(
-                        table_name)
-                    distributed_var.vtype = "RemotePrefetch"
-
-                need_sparse_update_params[param_varname] = (eps, table_names)
-            else:
-                recv_varnames = []
-                if self.config.runtime_split_send_recv:
-                    orig_param = program.global_block().vars[param_varname]
-                    recv_varnames = [var.name for var in splited_var]
-                    splited_var = [orig_param]
-                all_recv_outputs.extend(splited_var)
-
-                program.global_block().append_op(
-                    type="recv",
-                    inputs={"X": [recv_dep_in]},
-                    outputs={"Out": splited_var},
-                    attrs={
-                        "epmap": eps,
-                        "recv_varnames": recv_varnames,
-                        "trainer_id": self.trainer_id,
-                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
-                        OP_ROLE_VAR_ATTR_NAME:
-                        [param_varname, recv_op_role_var_name]
-                    })
-
-        if self.sync_mode:
-            # form a WAW dependency
-            program.global_block().append_op(
-                type="fetch_barrier",
-                inputs={},
-                outputs={"Out": all_recv_outputs},
-                attrs={
-                    "endpoints": pserver_endpoints,
-                    "trainer_id": self.trainer_id,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-                })
-
-        for param_varname, splited_var in six.iteritems(self.param_var_mapping):
-            if len(splited_var) <= 1:
-                continue
-            orig_param = program.global_block().vars[param_varname]
-            if param_varname not in self.sparse_param_to_height_sections:
-                if not self.config.runtime_split_send_recv:
-                    program.global_block().append_op(
-                        type="concat",
-                        inputs={"X": splited_var},
-                        outputs={"Out": [orig_param]},
-                        attrs={
-                            "axis": 0,
-                            RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
-                        })
-
-            self._update_remote_sparse_update_op(program,
-                                                 need_sparse_update_params)
-
-        self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)
-
-        if self.has_distributed_lookup_table:
-            self._replace_lookup_table_op_with_prefetch(program,
-                                                        pserver_endpoints)
-            self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
-
-        self._get_distributed_optimizer_vars()
-        self.origin_program._parameters_on_pservers = self.vars_overview
-
-    def get_trainer_program(self, wait_port=True):
-        """
-        Get transpiled trainer side program.
-
-        Returns:
-            Program: trainer side program.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              #this is an example, find available endpoints in your case
-              pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-              trainer_id = 0
-              trainers = 4
-              t = fluid.DistributeTranspiler()
-              t.transpile(trainer_id, trainers=trainers, pservers=pserver_endpoints)
-              trainer_program = t.get_trainer_program()
-        """
-        # remove optimize ops and add a send op to main_program
-        # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
-
-        lr_ops = self._get_lr_ops()
-        delete_ops(self.origin_program.global_block(), self.optimize_ops)
-        delete_ops(self.origin_program.global_block(), lr_ops)
-
-        # delete table init op
-        if self.has_distributed_lookup_table:
-            table_var = self.startup_program.global_block().vars[
-                self.table_name]
-            table_param_init_op = []
-            for op in self.startup_program.global_block().ops:
-                if self.table_name in op.output_arg_names:
-                    table_param_init_op.append(op)
-            init_op_num = len(table_param_init_op)
-            if init_op_num != 1:
-                raise ValueError("table init op num should be 1, now is " + str(
-                    init_op_num))
-            table_init_op = table_param_init_op[0]
-            self.startup_program.global_block().append_op(
-                type="fake_init",
-                inputs={},
-                outputs={"Out": table_var},
-                attrs={"shape": table_init_op.attr('shape')})
-            delete_ops(self.startup_program.global_block(), table_param_init_op)
-
-        self.origin_program.__str__()
-
-        if wait_port:
-            wait_server_ready(self.pserver_endpoints)
-
-        return self.origin_program
-
-    def _get_trainer_startup_program(self, recv_vars, eplist):
-        """
-        Get transpiled trainer side startup program.
-
-        Args:
-            recv_vars (list): Variable list to recv for current trainer_id
-            eplist (list): A list of strings indicating
-
-        Returns:
-            Program: trainer side startup program.
-        """
-        startup_program = self.startup_program
-
-        # FIXME(gongwb): delete not need ops.
-        # note that: some parameter is not trainable and those ops can't be deleted.
-
-        for varname, splited_var in six.iteritems(self.param_var_mapping):
-            # Get the eplist of recv vars
-            eps = []
-            for var in splited_var:
-                index = [v.name for v in recv_vars].index(var.name)
-                eps.append(eplist[index])
-
-            for var in splited_var:
-                if startup_program.global_block().has_var(var.name):
-                    continue
-
-                startup_program.global_block().create_var(
-                    name=var.name,
-                    persistable=False,
-                    type=var.type,
-                    dtype=var.dtype,
-                    shape=var.shape,
-                    lod_level=var.lod_level)
-
-            op = startup_program.global_block().append_op(
-                type="recv",
-                inputs={"X": []},
-                outputs={"Out": splited_var},
-                attrs={
-                    "epmap": eps,
-                    "trainer_id": self.trainer_id,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-                })
-
-        fetch_barrier_out = startup_program.global_block().create_var(
-            name=framework.generate_control_dev_var_name())
-        startup_program.global_block().append_op(
-            type="fetch_barrier",
-            inputs={},
-            outputs={"Out": fetch_barrier_out},
-            attrs={
-                "endpoints": self.pserver_endpoints,
-                "trainer_id": self.trainer_id,
-                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
-
-        for varname, splited_var in six.iteritems(self.param_var_mapping):
-            # add concat ops to merge splited parameters received from parameter servers.
-            if len(splited_var) <= 1:
-                continue
-            # NOTE: if enable memory optimization, origin vars maybe removed.
-            if varname in startup_program.global_block().vars:
-                orig_param = startup_program.global_block().vars[varname]
-            else:
-                origin_param_var = self.origin_program.global_block().vars[
-                    varname]
-                orig_param = startup_program.global_block().create_var(
-                    name=varname,
-                    persistable=origin_param_var.persistable,
-                    type=origin_param_var.type,
-                    dtype=origin_param_var.dtype,
-                    shape=origin_param_var.shape)
-            startup_program.global_block().append_op(
-                type="concat",
-                inputs={"X": splited_var},
-                outputs={"Out": [orig_param]},
-                attrs={"axis": 0})
-
-        return startup_program
-
-    def get_pserver_program(self, endpoint):
-        """
-        Get parameter server side program.
-
-        Args:
-            endpoint (str): current parameter server endpoint.
-
-        Returns:
-            Program: the program for current parameter server to run.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              #this is an example, find available endpoints in your case
-              pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-              current_endpoint = "192.168.0.1:6174"
-              trainer_id = 0
-              trainers = 4
-              t = fluid.DistributeTranspiler()
-              t.transpile(
-                   trainer_id, pservers=pserver_endpoints, trainers=trainers)
-              pserver_program = t.get_pserver_program(current_endpoint)
-        """
-        # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
-        # NOTE: assume blocks of the same variable is not distributed
-        # on the same pserver, only change param/grad varnames for
-        # trainers to fetch.
-        sys.stderr.write(
-            "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n"
-        )
-        # step1
-        pserver_program = Program()
-        pserver_program.random_seed = self.origin_program.random_seed
-        pserver_program._copy_dist_param_info_from(self.origin_program)
-
-        # step2: Create vars to receive vars at parameter servers.
-        recv_inputs = []
-        for v in self.param_grad_ep_mapping[endpoint]["params"]:
-            self._clone_var(pserver_program.global_block(), v)
-        for v in self.param_grad_ep_mapping[endpoint]["grads"]:
-            # create vars for each trainer in global scope, so
-            # we don't need to create them when grad arrives.
-            # change client side var name to origin name by
-            # removing ".trainer_%d" suffix
-            suff_idx = v.name.find(".trainer_")
-            if suff_idx >= 0:
-                orig_var_name = v.name[:suff_idx]
-            else:
-                orig_var_name = v.name
-            # NOTE: single_trainer_var must be created for multi-trainer
-            # case to merge grads from multiple trainers
-            single_trainer_var = \
-                pserver_program.global_block().create_var(
-                    name=orig_var_name,
-                    persistable=True,
-                    type=v.type,
-                    dtype=v.dtype,
-                    shape=v.shape)
-            if self.sync_mode and self.trainer_num > 1:
-                for trainer_id in range(self.trainer_num):
-                    var = pserver_program.global_block().create_var(
-                        name="%s.trainer_%d" % (orig_var_name, trainer_id),
-                        persistable=False,
-                        type=v.type,
-                        dtype=v.dtype,
-                        shape=v.shape)
-                    recv_inputs.append(var)
-            else:
-                recv_inputs.append(single_trainer_var)
-
-        # step 3
-        # Create a union-find data structure from optimize ops,
-        # If two ops are connected, we could add these two ops
-        # into one set.
-        ufind = self._create_ufind(self.optimize_ops)
-        # step 3.2
-        # Iterate through the ops and append optimize op which
-        # located on current pserver
-        opt_op_on_pserver = []
-        for _, op in enumerate(self.optimize_ops):
-            if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
-                    endpoint, op):
-                opt_op_on_pserver.append(op)
-        # step 3.3
-        # prepare if dc asgd is enabled
-        if self.config.enable_dc_asgd == True:
-            assert (self.sync_mode == False)
-            self.param_bak_list = []
-            # add param_bak for each trainer
-            for p in self.param_grad_ep_mapping[endpoint]["params"]:
-                # each parameter should have w_bak for each trainer id
-                for i in range(self.trainer_num):
-                    param_bak_name = "%s.trainer_%d_bak" % (p.name, i)
-                    tmpvar = pserver_program.global_block().create_var(
-                        # NOTE: this var name format is used in `request_get_handler`
-                        name=param_bak_name,
-                        type=p.type,
-                        shape=p.shape,
-                        dtype=p.dtype)
-                    self.param_bak_list.append((p, tmpvar))
-
-        # step 3.4
-        # Iterate through the ops, and if an op and the optimize ops
-        # which located on current pserver are in one set, then
-        # append it into the sub program.
-
-        global_ops = []
-
-        # sparse grad name to param name
-        sparse_grad_to_param = []
-
-        def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
-                                   lr_ops):
-            if self._is_optimizer_op(op):
-                self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
-                                         self.origin_program, merged_var,
-                                         sparse_grad_to_param)
-            elif op not in lr_ops:
-                self._append_pserver_non_opt_ops(block, op)
-
-        def __clone_lr_op_sub_block__(op, program, lr_block):
-            if not op.has_attr('sub_block'):
-                return
-
-            origin_block_desc = op.attr('sub_block')
-            origin_block = self.origin_program.block(origin_block_desc.id)
-            assert isinstance(origin_block, Block)
-            # we put the new sub block to new block to follow the block
-            # hierarchy of the original blocks
-            new_sub_block = program._create_block(lr_block.idx)
-
-            # clone vars
-            for var in origin_block.vars:
-                new_sub_block._clone_variable(var)
-
-            # clone ops
-            for origin_op in origin_block.ops:
-                cloned_op = self._clone_lr_op(program, new_sub_block, origin_op)
-                # clone sub_block of op
-                __clone_lr_op_sub_block__(cloned_op, program, new_sub_block)
-
-            # reset the block of op
-            op._set_attr('sub_block', new_sub_block)
-
-        # append lr decay ops to the child block if exists
-        lr_ops = self._get_lr_ops()
-        # record optimize blocks and we can run them on pserver parallel
-        optimize_blocks = []
-        if len(lr_ops) > 0:
-            lr_decay_block = pserver_program._create_block(
-                pserver_program.num_blocks - 1)
-            optimize_blocks.append(lr_decay_block)
-            for _, op in enumerate(lr_ops):
-                cloned_op = self._append_pserver_non_opt_ops(lr_decay_block, op)
-                # append sub blocks to pserver_program in lr_decay_op
-                __clone_lr_op_sub_block__(cloned_op, pserver_program,
-                                          lr_decay_block)
-
-        # append op to the current block
-        grad_to_block_id = []
-        pre_block_idx = pserver_program.num_blocks - 1
-        for idx, opt_op in enumerate(opt_op_on_pserver):
-            per_opt_block = pserver_program._create_block(pre_block_idx)
-            optimize_blocks.append(per_opt_block)
-            optimize_target_param_name = opt_op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
-            # append grad merging ops before clip and weight decay
-            # e.g. merge grad -> L2Decay op -> clip op -> optimize
-            merged_var = None
-            for _, op in enumerate(self.optimize_ops):
-                # find the origin grad var before clipping/L2Decay,
-                # merged_var should be the input var name of L2Decay
-                grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
-                if op.attr(OP_ROLE_VAR_ATTR_NAME)[
-                        0] == optimize_target_param_name:
-                    merged_var = self._append_pserver_grad_merge_ops(
-                        per_opt_block, grad_varname_for_block, endpoint,
-                        grad_to_block_id, self.origin_program)
-                    if merged_var:
-                        break  # append optimize op once then append other ops.
-            if merged_var:
-                for _, op in enumerate(self.optimize_ops):
-                    # optimizer is connected to itself
-                    if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
-                            op not in global_ops:
-                        log("append opt op: ", op.type, op.input_arg_names,
-                            merged_var)
-                        __append_optimize_op__(op, per_opt_block,
-                                               grad_to_block_id, merged_var,
-                                               lr_ops)
-
-        # dedup grad to ids list
-        grad_to_block_id = list(set(grad_to_block_id))
-        # append global ops
-        if global_ops:
-            opt_state_block = pserver_program._create_block(
-                pserver_program.num_blocks - 1)
-            optimize_blocks.append(opt_state_block)
-            for glb_op in global_ops:
-                __append_optimize_op__(glb_op, opt_state_block,
-                                       grad_to_block_id, None, lr_ops)
-
-        # process distributed lookup_table
-        prefetch_var_name_to_block_id = []
-        if self.has_distributed_lookup_table:
-            pserver_index = self.pserver_endpoints.index(endpoint)
-            table_opt_block = self._create_table_optimize_block(
-                pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
-            optimize_blocks.append(table_opt_block)
-            lookup_table_var_name_to_block_id = self._create_prefetch_block(
-                pserver_index, pserver_program, table_opt_block)
-            checkpoint_block_id = self._create_checkpoint_save_block(
-                pserver_program, table_opt_block.idx)
-
-            pserver_program._distributed_lookup_table = self.table_name
-            prefetch_var_name_to_block_id.extend(
-                lookup_table_var_name_to_block_id)
-
-        if len(optimize_blocks) == 0:
-            logging.warn("pserver [" + str(endpoint) +
-                         "] has no optimize block!!")
-            pre_block_idx = pserver_program.num_blocks - 1
-            empty_block = pserver_program._create_block(pre_block_idx)
-            optimize_blocks.append(empty_block)
-
-        # In some case, some parameter server will have no parameter to optimize
-        # So we give an empty optimize block to parameter server.
-        attrs = {
-            "optimize_blocks": optimize_blocks,
-            "endpoint": endpoint,
-            "Fanin": self.trainer_num,
-            "sync_mode": self.sync_mode,
-            "grad_to_block_id": grad_to_block_id,
-            "sparse_grad_to_param": sparse_grad_to_param,
-        }
-
-        if self.has_distributed_lookup_table:
-            attrs['checkpint_block_id'] = checkpoint_block_id
-        if self.config.enable_dc_asgd:
-            attrs['dc_asgd'] = True
-
-        if len(prefetch_var_name_to_block_id) > 0:
-            attrs[
-                'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id
-
-        # step5 append the listen_and_serv op
-        pserver_program.global_block().append_op(
-            type="listen_and_serv",
-            inputs={'X': recv_inputs},
-            outputs={},
-            attrs=attrs)
-
-        pserver_program._sync_with_cpp()
-        # save pserver program to generate pserver side startup relatively.
-        self.pserver_program = pserver_program
-        return pserver_program
-
-    def get_pserver_programs(self, endpoint):
-        """
-        Get pserver side main program and startup program for distributed training.
-
-        Args:
-            endpoint (str): current pserver endpoint.
-
-        Returns:
-            tuple: (main_program, startup_program), of type "Program"
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              #this is an example, find available endpoints in your case
-              pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-              current_endpoint = "192.168.0.1:6174"
-              trainer_id = 0
-              trainers = 4
-              t = fluid.DistributeTranspiler()
-              t.transpile(
-                   trainer_id, pservers=pserver_endpoints, trainers=trainers)
-              pserver_program, pserver_startup_program = t.get_pserver_programs(current_endpoint)
-        """
-        pserver_prog = self.get_pserver_program(endpoint)
-        pserver_startup = self.get_startup_program(
-            endpoint, pserver_program=pserver_prog)
-        return pserver_prog, pserver_startup
-
-    def get_startup_program(self,
-                            endpoint,
-                            pserver_program=None,
-                            startup_program=None):
-        """
-        **Deprecated**
-
-        Get startup program for current parameter server.
-        Modify operator input variables if there are variables that
-        were split to several blocks.
-
-        Args:
-            endpoint (str): current pserver endpoint.
-            pserver_program (Program): deprecated, call get_pserver_program first.
-            startup_program (Program): deprecated, should pass startup_program
-                when initalizing
-
-        Returns:
-            Program: parameter server side startup program.
-
-        Examples:
-	    .. code-block:: python
-            
-                pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-                trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-                current_endpoint = "192.168.0.1:6174"
-                trainer_id = 0
-                trainers = 4
-
-                t = fluid.DistributeTranspiler()
-                t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-                pserver_program = t.get_pserver_program(current_endpoint)
-                pserver_startup_program = t.get_startup_program(current_endpoint,
-                                                                pserver_program)
-        """
-        s_prog = Program()
-        orig_s_prog = self.startup_program
-        s_prog.random_seed = orig_s_prog.random_seed
-        params = self.param_grad_ep_mapping[endpoint]["params"]
-
-        def _get_splited_name_and_shape(varname):
-            for idx, splited_param in enumerate(params):
-                pname = splited_param.name
-                if same_or_split_var(pname, varname) and varname != pname:
-                    return pname, splited_param.shape
-            return "", []
-
-        # 1. create vars in pserver program to startup program
-        pserver_vars = pserver_program.global_block().vars
-        created_var_map = collections.OrderedDict()
-        for _, var in six.iteritems(pserver_vars):
-            tmpvar = s_prog.global_block()._clone_variable(var)
-            created_var_map[var.name] = tmpvar
-
-        # 2. rename op outputs
-        for op in orig_s_prog.global_block().ops:
-            new_outputs = collections.OrderedDict()
-            # do not append startup op if var is not on this pserver
-            op_on_pserver = False
-            # TODO(gongwb): remove this line.
-            if op.type not in ["recv", "fetch_barrier", "concat"]:
-                for key in op.output_names:
-                    newname, _ = _get_splited_name_and_shape(op.output(key)[0])
-                    if newname:
-                        op_on_pserver = True
-                        new_outputs[key] = created_var_map[newname]
-                    elif op.output(key)[0] in pserver_vars:
-                        op_on_pserver = True
-                        new_outputs[key] = pserver_vars[op.output(key)[0]]
-
-            if op_on_pserver:
-                # most startup program ops have no inputs
-                new_inputs = self._get_input_map_from_op(pserver_vars, op)
-
-                if op.type in [
-                        "gaussian_random", "fill_constant", "uniform_random",
-                        "truncated_gaussian_random"
-                ]:
-                    op._set_attr("shape", list(new_outputs["Out"].shape))
-                s_prog.global_block().append_op(
-                    type=op.type,
-                    inputs=new_inputs,
-                    outputs=new_outputs,
-                    attrs=op.all_attrs())
-        if self.config.enable_dc_asgd:
-            for p, p_bak in self.param_bak_list:
-                startup_param_var = s_prog.global_block().vars[p.name]
-                startup_tmpvar = s_prog.global_block().vars[p_bak.name]
-                # copy init random value to param_bak
-                s_prog.global_block().append_op(
-                    type="assign",
-                    inputs={"X": startup_param_var},
-                    outputs={"Out": startup_tmpvar})
-
-        return s_prog
-
-    # ====================== private transpiler functions =====================
-    def _get_slice_var_info(self, slice_var):
-        block_suffix = "block"
-        block_idx = 0
-        offset = 0
-        is_slice = False
-
-        orig_var_name, block_name, _ = self._get_varname_parts(slice_var.name)
-
-        if not block_name:
-            return is_slice, block_idx, offset
-
-        block_idx = int(block_name.split(block_suffix)[1])
-        skip_dim0 = 0
-        slice_vars = self.param_var_mapping[orig_var_name]
-
-        orig_dim1_flatten = 1
-
-        if len(slice_vars[0].shape) >= 2:
-            orig_dim1_flatten = reduce(lambda x, y: x * y,
-                                       slice_vars[0].shape[1:])
-
-        for slice_var in slice_vars[:block_idx]:
-            skip_dim0 += slice_var.shape[0]
-
-        offset = skip_dim0 * orig_dim1_flatten
-        is_slice = True
-        return is_slice, block_idx, offset
-
-    def _get_distributed_optimizer_vars(self):
-        def _get_distributed_optimizer_var(endpoint):
-            opt_op_on_pserver = []
-            for _, op in enumerate(self.optimize_ops):
-                if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
-                        endpoint, op):
-                    opt_op_on_pserver.append(op)
-
-            for opt_op in opt_op_on_pserver:
-                dist_var = None
-                for key in opt_op.input_names:
-                    if key == "Param":
-                        param_name = opt_op.input(key)[0]
-                        dist_var = self.vars_overview.get_distributed_var_by_origin_and_ep(
-                            param_name, endpoint)
-                        break
-                for key in opt_op.input_names:
-                    if key in ["Param", "Grad", "LearningRate"]:
-                        continue
-                    origin_var = self.origin_program.global_block().vars[
-                        opt_op.input(key)[0]]
-                    # update accumulator variable shape
-                    new_shape = self._get_optimizer_input_shape(
-                        opt_op.type, key, origin_var.shape,
-                        dist_var.slice.shape)
-
-                    if new_shape == dist_var.slice.shape:
-                        splited_var = VarStruct(
-                            name=origin_var.name,
-                            shape=new_shape,
-                            dtype=origin_var.dtype,
-                            type=origin_var.type,
-                            lod_level=origin_var.lod_level,
-                            persistable=origin_var.persistable)
-
-                        self.vars_overview.add_distributed_var(
-                            origin_var=origin_var,
-                            slice_var=splited_var,
-                            is_slice=dist_var.is_slice,
-                            block_id=dist_var.block_id,
-                            offset=dist_var.offset,
-                            vtype="Optimizer",
-                            endpoint=endpoint)
-                    else:
-                        self.vars_overview.add_distributed_var(
-                            origin_var=origin_var,
-                            slice_var=origin_var,
-                            is_slice=False,
-                            block_id=0,
-                            offset=0,
-                            vtype="Optimizer",
-                            endpoint=endpoint)
-
-        for ep in self.pserver_endpoints:
-            _get_distributed_optimizer_var(ep)
-
-    def _update_dist_lookup_table_vars(self, param_list, grad_list,
-                                       params_grads):
-        # TODO(wuyi): put find a way to put dist lookup table stuff all together.
-        # update self.table_param_grad and self.trainer_side_table_grad_list
-        program = self.origin_program
-        if self.has_distributed_lookup_table:
-            param_list = [
-                param for param in param_list if param.name != self.table_name
-            ]
-            grad_list = [
-                grad for grad in grad_list
-                if grad.name != grad_var_name(self.table_name)
-            ]
-            self.table_param_grad = [
-                param_grad for param_grad in params_grads
-                if param_grad[0].name == self.table_name
-            ][0]
-            table_grad_var = self.table_param_grad[1]
-            if self.sync_mode:
-                self.trainer_side_table_grad_list = [
-                    program.global_block().create_var(
-                        name="%s.trainer_%d.pserver_%d" %
-                        (table_grad_var.name, self.trainer_id, index),
-                        type=table_grad_var.type,
-                        shape=table_grad_var.shape,
-                        dtype=table_grad_var.dtype)
-                    for index in range(len(self.pserver_endpoints))
-                ]
-            else:
-                self.trainer_side_table_grad_list = [
-                    program.global_block().create_var(
-                        name="%s.pserver_%d" % (table_grad_var.name, index),
-                        type=table_grad_var.type,
-                        shape=table_grad_var.shape,
-                        dtype=table_grad_var.dtype)
-                    for index in range(len(self.pserver_endpoints))
-                ]
-        return param_list, grad_list
-
-    def _init_splited_vars(self):
-        # update these mappings for further transpile:
-        # 1. param_var_mapping: param var name -> [splited params vars]
-        # 2. grad_var_mapping: grad var name -> [splited grads vars]
-        # 3. grad_param_mapping: grad.blockx -> param.blockx
-        # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
-
-        param_list = []
-        grad_list = []
-        param_grad_set = set()
-        for p, g in self.params_grads:
-            # skip parameter marked not trainable
-            if type(p) == Parameter and p.trainable == False:
-                continue
-            if p.name not in param_grad_set:
-                param_list.append(p)
-                param_grad_set.add(p.name)
-            if g.name not in param_grad_set:
-                grad_list.append(g)
-                param_grad_set.add(g.name)
-
-        param_list, grad_list = self._update_dist_lookup_table_vars(
-            param_list, grad_list, self.params_grads)
-
-        if self.config.slice_var_up:
-            # when we slice var up into blocks, we will slice the var according to
-            # pserver services' count. A pserver may have two or more listening ports.
-            grad_blocks = slice_variable(grad_list,
-                                         len(self.pserver_endpoints),
-                                         self.config.min_block_size)
-            param_blocks = slice_variable(param_list,
-                                          len(self.pserver_endpoints),
-                                          self.config.min_block_size)
-        else:
-            # when we do NOT slice var up into blocks, we will always slice params
-            # grads into one block.
-            grad_blocks = slice_variable(grad_list, 1,
-                                         self.config.min_block_size)
-            param_blocks = slice_variable(param_list, 1,
-                                          self.config.min_block_size)
-        assert (len(grad_blocks) == len(param_blocks))
-
-        # origin_param_name -> [splited_param_vars]
-        self.param_var_mapping = self._create_vars_from_blocklist(
-            self.origin_program, param_blocks)
-
-        for orig_name, splited_vars in self.param_var_mapping.items():
-            orig_var = self.origin_program.global_block().var(orig_name)
-
-            for splited_var in splited_vars:
-                is_slice, block_id, offset = self._get_slice_var_info(
-                    splited_var)
-
-                self.vars_overview.add_distributed_var(
-                    origin_var=orig_var,
-                    slice_var=splited_var,
-                    block_id=block_id,
-                    offset=offset,
-                    is_slice=is_slice,
-                    vtype="Param")
-
-        # origin_grad_name -> [splited_grad_vars]
-        self.grad_var_mapping = self._create_vars_from_blocklist(
-            self.origin_program,
-            grad_blocks,
-            add_trainer_suffix=self.trainer_num > 1)
-        # dict(grad_splited_var -> param_splited_var)
-        self.grad_param_mapping = collections.OrderedDict()
-        for g, p in zip(grad_blocks, param_blocks):
-            g_name, g_bid, _ = g.split(":")
-            p_name, p_bid, _ = p.split(":")
-            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \
-                self.param_var_mapping[p_name][int(p_bid)]
-
-        # create mapping of endpoint -> split var to create pserver side program
-        self.param_grad_ep_mapping = collections.OrderedDict()
-        [
-            self.param_grad_ep_mapping.update({
-                ep: {
-                    "params": [],
-                    "grads": []
-                }
-            }) for ep in self.pserver_endpoints
-        ]
-
-    # transpiler function for dis lookup_table
-    def _replace_lookup_table_op_with_prefetch(self, program,
-                                               pserver_endpoints):
-        # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
-        self.all_in_ids_vars = []
-        self.all_prefetch_input_vars = []
-        self.all_prefetch_output_vars = []
-        self.all_out_emb_vars = []
-        lookup_table_op_index = -1
-
-        continue_search_lookup_table_op = True
-        while continue_search_lookup_table_op:
-            continue_search_lookup_table_op = False
-            all_ops = program.global_block().ops
-            for op in all_ops:
-                if op.type == LOOKUP_TABLE_TYPE and self.table_name == op.input(
-                        "W")[0]:
-                    if not op.attr('is_distributed'):
-                        raise RuntimeError(
-                            "lookup_table_op that lookup an distributed embedding table"
-                            "should set is_distributed to true")
-                    continue_search_lookup_table_op = True
-
-                    lookup_table_op_index = lookup_table_op_index if lookup_table_op_index != -1 else list(
-                        all_ops).index(op)
-                    ids_name = op.input("Ids")
-                    out_name = op.output("Out")
-
-                    ids_var = program.global_block().vars[ids_name[0]]
-                    self.all_in_ids_vars.append(ids_var)
-
-                    out_var = program.global_block().vars[out_name[0]]
-                    self.all_out_emb_vars.append(out_var)
-
-                    # delete lookup_table_op
-                    delete_ops(program.global_block(), [op])
-                    # break for loop
-                    break
-
-        for index in range(len(self.pserver_endpoints)):
-            in_var = program.global_block().create_var(
-                name=str("prefetch_compress_in_tmp_" + str(index)),
-                type=self.all_in_ids_vars[0].type,
-                shape=self.all_in_ids_vars[0].shape,
-                dtype=self.all_in_ids_vars[0].dtype)
-            self.all_prefetch_input_vars.append(in_var)
-
-            out_var = program.global_block().create_var(
-                name=str("prefetch_compress_out_tmp_" + str(index)),
-                type=self.all_out_emb_vars[0].type,
-                shape=self.all_out_emb_vars[0].shape,
-                dtype=self.all_out_emb_vars[0].dtype)
-            self.all_prefetch_output_vars.append(out_var)
-
-        # insert split_ids_op
-        program.global_block()._insert_op(
-            index=lookup_table_op_index,
-            type="split_ids",
-            inputs={'Ids': self.all_in_ids_vars},
-            outputs={"Out": self.all_prefetch_input_vars})
-
-        # insert prefetch_op
-        program.global_block()._insert_op(
-            index=lookup_table_op_index + 1,
-            type="prefetch",
-            inputs={'X': self.all_prefetch_input_vars},
-            outputs={"Out": self.all_prefetch_output_vars},
-            attrs={
-                "epmap": pserver_endpoints,
-                # FIXME(qiao) temporarily disable this config because prefetch
-                # is not act as other rpc op, it's more like a forward op
-                # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
-
-        # insert concat_op
-        program.global_block()._insert_op(
-            index=lookup_table_op_index + 2,
-            type="merge_ids",
-            inputs={
-                'Ids': self.all_in_ids_vars,
-                'Rows': self.all_prefetch_input_vars,
-                'X': self.all_prefetch_output_vars
-            },
-            outputs={"Out": self.all_out_emb_vars})
-
-    def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
-        # 2. add split_ids_op and send_op to send gradient to pservers
-
-        # there should only be one table_name
-        all_ops = program.global_block().ops
-        table_grad_name = grad_var_name(self.table_name)
-        for op in all_ops:
-            if table_grad_name in op.output_arg_names:
-                op_index = list(all_ops).index(op)
-                # insert split_ids_op
-                program.global_block()._insert_op(
-                    index=op_index + 1,
-                    type="split_ids",
-                    inputs={
-                        'Ids': [program.global_block().vars[table_grad_name]]
-                    },
-                    outputs={"Out": self.trainer_side_table_grad_list},
-                    attrs={RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE})
-                program.global_block()._insert_op(
-                    index=op_index + 2,
-                    type="send",
-                    inputs={'X': self.trainer_side_table_grad_list},
-                    outputs={
-                        'Out':
-                        [self.grad_name_to_send_dummy_out[self.table_name]]
-                        if self.sync_mode else []
-                    },
-                    attrs={
-                        "epmap": pserver_endpoints,
-                        "trainer_id": self.trainer_id,
-                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
-                        OP_ROLE_VAR_ATTR_NAME: [
-                            self.grad_name_to_param_name[table_grad_name],
-                            table_grad_name
-                        ]
-                    })
-                break
-
-    def _create_prefetch_block(self, pserver_index, pserver_program,
-                               optimize_block):
-        # STEP: create prefetch block
-        table_var = pserver_program.global_block().vars[self.table_name]
-        prefetch_var_name_to_block_id = []
-        prefetch_block = pserver_program._create_block(optimize_block.idx)
-        trainer_ids = self.all_prefetch_input_vars[pserver_index]
-        pserver_ids = pserver_program.global_block().create_var(
-            name=trainer_ids.name,
-            type=trainer_ids.type,
-            shape=trainer_ids.shape,
-            dtype=trainer_ids.dtype)
-        trainer_out = self.all_prefetch_output_vars[pserver_index]
-        pserver_out = pserver_program.global_block().create_var(
-            name=trainer_out.name,
-            type=trainer_out.type,
-            shape=trainer_out.shape,
-            dtype=trainer_out.dtype)
-        prefetch_block.append_op(
-            type="lookup_sparse_table",
-            inputs={'Ids': pserver_ids,
-                    "W": table_var},
-            outputs={"Out": pserver_out},
-            attrs={
-                "is_sparse": True,  # has no effect on lookup_table op
-                "is_distributed": True,
-                "padding_idx": -1
-            })
-        prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
-            prefetch_block.idx))
-        return prefetch_var_name_to_block_id
-
-    def _create_table_optimize_block(self, pserver_index, pserver_program,
-                                     pre_block_idx, grad_to_block_id):
-        # STEP: create table optimize block
-        table_opt_block = pserver_program._create_block(pre_block_idx)
-        # create table param and grad var in pserver program
-        # create table optimize block in pserver program
-        table_opt_op = [
-            op for op in self.optimize_ops
-            if 'Param' in op.input_names and op.input("Param")[0] ==
-            self.table_name
-        ][0]
-
-        origin_param_var = self.origin_program.global_block().vars[
-            self.table_name]
-
-        zero_dim = int(
-            math.ceil(origin_param_var.shape[0] / float(
-                len(self.pserver_endpoints))))
-        table_shape = list(origin_param_var.shape)
-        table_shape[0] = zero_dim
-
-        param_var = pserver_program.global_block().create_var(
-            name=origin_param_var.name,
-            shape=table_shape,
-            dtype=origin_param_var.dtype,
-            type=core.VarDesc.VarType.SELECTED_ROWS,
-            persistable=True)
-
-        # parameter must be selected rows
-        param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
-        grad_var = pserver_program.global_block()._clone_variable(
-            self.origin_program.global_block().vars[grad_var_name(
-                self.table_name)])
-
-        lr_var = pserver_program.global_block()._clone_variable(
-            self.origin_program.global_block().vars[table_opt_op.input(
-                "LearningRate")[0]])
-
-        if self.sync_mode:
-            # create grad vars in pserver program
-            table_grad_var = self.table_param_grad[1]
-            pserver_side_table_grad_list = [
-                pserver_program.global_block().create_var(
-                    name="%s.trainer_%d.pserver_%d" %
-                    (table_grad_var.name, index, pserver_index),
-                    type=table_grad_var.type,
-                    shape=table_grad_var.shape,
-                    dtype=table_grad_var.dtype)
-                for index in range(self.trainer_num)
-            ]
-
-            # append sum op for pserver_side_table_grad_list
-            table_opt_block.append_op(
-                type="sum",
-                inputs={"X": pserver_side_table_grad_list},
-                outputs={"Out": [grad_var]},
-                attrs={"use_mkldnn": False})
-        else:
-            # in async_mode, for table gradient, it also need to be splited to each parameter server
-            origin_grad_name = grad_var.name
-            splited_grad_name = self.trainer_side_table_grad_list[
-                pserver_index].name
-            if not splited_grad_name.startswith(origin_grad_name):
-                raise ValueError("origin_grad_var: " + splited_grad_name +
-                                 " grad_var:" + grad_var.name)
-            grad_var = pserver_program.global_block()._rename_var(
-                origin_grad_name, splited_grad_name)
-
-        inputs = {
-            "Param": [param_var],
-            "Grad": [grad_var],
-            "LearningRate": [lr_var]
-        }
-        outputs = {"ParamOut": [param_var]}
-        # only support sgd now
-        logging.warn(
-            "distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of "
-            + table_opt_op.type)
-        table_opt_block.append_op(type="sgd", inputs=inputs, outputs=outputs)
-
-        # add table parameter gradient and it's block id to grad_to_block_id
-        grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx))
-
-        return table_opt_block
-
-    def _create_checkpoint_save_block(self, pserver_program, pre_block_idx):
-        """
-        create a new block to handle save checkpoint.
-        """
-
-        pserver_program.global_block().create_var(
-            name="kLookupTablePath",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-
-        checkpoint_save_block = pserver_program._create_block(pre_block_idx)
-        # this 'file_path' do not be used in save lookup table variable
-        checkpoint_save_block.append_op(
-            type='save',
-            inputs={'X': [self.table_name]},
-            outputs={},
-            attrs={'file_path': "none"})
-
-        return checkpoint_save_block.idx
-
-    def _create_vars_from_blocklist(self,
-                                    program,
-                                    block_list,
-                                    add_trainer_suffix=False):
-        """
-        Create vars for each split.
-        NOTE: only grads need to be named for different trainers, use
-              add_trainer_suffix to rename the grad vars.
-        Args:
-            program (ProgramDesc): ProgramDesc which gradients blong.
-            block_list (list[(varname, block_id, block_size)]): List of gradient blocks.
-            add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True.
-        Returns:
-            var_mapping (collections.OrderedDict(varname->[new_varname_variable])):A dict mapping
-                from original var name to each var split.
-        """
-
-        # varname->[(block_id, current_block_size)]
-        block_map = collections.OrderedDict()
-
-        var_mapping = collections.OrderedDict()
-        for block_str in block_list:
-            varname, offset, size = block_str.split(":")
-            if varname not in block_map:
-                block_map[varname] = []
-            block_map[varname].append((int(offset), int(size)))
-
-        for varname, splited in six.iteritems(block_map):
-            orig_var = program.global_block().var(varname)
-            if len(splited) == 1:
-                if self.sync_mode and add_trainer_suffix:
-                    new_var_name = "%s.trainer_%d" % \
-                                   (orig_var.name, self.trainer_id)
-                    program.global_block()._rename_var(varname, new_var_name)
-                    var_mapping[varname] = \
-                        [program.global_block().var(new_var_name)]
-                else:
-                    var_mapping[varname] = \
-                        [program.global_block().var(orig_var.name)]
-                continue
-            var_mapping[varname] = []
-            orig_shape = orig_var.shape
-            orig_dim1_flatten = 1
-            if len(orig_shape) >= 2:
-                orig_dim1_flatten = reduce(lambda x, y: x * y, orig_shape[1:])
-
-            for i, block in enumerate(splited):
-                size = block[1]
-                rows = size // orig_dim1_flatten
-                splited_shape = [rows]
-                if len(orig_shape) >= 2:
-                    splited_shape.extend(orig_shape[1:])
-                new_var_name = ""
-                if self.sync_mode and add_trainer_suffix:
-                    new_var_name = "%s.block%d.trainer_%d" % \
-                                   (varname, i, self.trainer_id)
-                else:
-                    new_var_name = "%s.block%d" % \
-                                   (varname, i)
-                var = program.global_block().create_var(
-                    name=new_var_name,
-                    persistable=False,
-                    dtype=orig_var.dtype,
-                    type=orig_var.type,
-                    shape=splited_shape)  # flattend splited var
-                var_mapping[varname].append(var)
-            program.global_block()._sync_with_cpp()
-        return var_mapping
-
-    def _clone_var(self, block, var, persistable=True):
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            lod_level=var.lod_level,
-            persistable=persistable)
-
-    @staticmethod
-    def _get_splited_var_sections(splited_vars):
-        height_sections = []
-        for v in splited_vars:
-            height_sections.append(v.shape[0])
-        return height_sections
-
-    def _insert_split_op(self, program, orig_var, index, splited_vars):
-        height_sections = self._get_splited_var_sections(splited_vars)
-
-        if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
-            sparse_param_name = self.grad_name_to_param_name[orig_var.name]
-            if self._is_input_of_remote_sparse_update_op(sparse_param_name):
-                self.sparse_param_to_height_sections[
-                    sparse_param_name] = height_sections
-            program.global_block()._insert_op(
-                index=index + 1,
-                type="split_selected_rows",
-                inputs={"X": orig_var},
-                outputs={"Out": splited_vars},
-                attrs={
-                    "height_sections": height_sections,
-                    RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
-                })
-        elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
-            program.global_block()._insert_op(
-                index=index + 1,
-                type="split_byref",
-                inputs={"X": orig_var},
-                outputs={"Out": splited_vars},
-                attrs={
-                    "sections": height_sections,
-                    RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
-                })
-        else:
-            AssertionError("Variable type should be in set "
-                           "[LOD_TENSOR, SELECTED_ROWS]")
-
-    def _get_optimizer_input_shape(self, op_type, varkey, orig_shape,
-                                   param_shape):
-        """
-        Returns the shape for optimizer inputs that need to be reshaped when
-        Param and Grad is split to multiple servers.
-        """
-        # HACK(typhoonzero): Should use functions of corresponding optimizer in
-        # optimizer.py to get the shape, do not  bind this in the transpiler.
-        if op_type == "adam":
-            if varkey in ["Moment1", "Moment2"]:
-                return param_shape
-        elif op_type == "adagrad":
-            if varkey == "Moment":
-                return param_shape
-        elif op_type == "adamax":
-            if varkey in ["Moment", "InfNorm"]:
-                return param_shape
-        elif op_type in ["momentum", "lars_momentum"]:
-            if varkey == "Velocity":
-                return param_shape
-        elif op_type == "rmsprop":
-            if varkey in ["Moment", "MeanSquare"]:
-                return param_shape
-        elif op_type == "decayed_adagrad":
-            if varkey == "Moment":
-                return param_shape
-        elif op_type == "ftrl":
-            if varkey in ["SquaredAccumulator", "LinearAccumulator"]:
-                return param_shape
-        elif op_type == "sgd":
-            pass
-        else:
-            raise ValueError(
-                "Not supported optimizer for distributed training: %s" %
-                op_type)
-        return orig_shape
-
-    def _get_varname_parts(self, varname):
-        # returns origin, blockid, trainerid
-        orig_var_name = ""
-        trainer_part = ""
-        block_part = ""
-        trainer_idx = varname.find(".trainer_")
-        if trainer_idx >= 0:
-            trainer_part = varname[trainer_idx + 1:]
-        else:
-            trainer_idx = len(varname)
-        block_index = varname.find(".block")
-        if block_index >= 0:
-            block_part = varname[block_index + 1:trainer_idx]
-        else:
-            block_index = len(varname)
-        orig_var_name = varname[0:min(block_index, trainer_idx)]
-        return orig_var_name, block_part, trainer_part
-
-    def _orig_varname(self, varname):
-        orig, _, _ = self._get_varname_parts(varname)
-        return orig
-
-    def _append_pserver_grad_merge_ops(self, optimize_block,
-                                       grad_varname_for_block, endpoint,
-                                       grad_to_block_id, origin_program):
-        program = optimize_block.program
-        pserver_block = program.global_block()
-        grad_block = None
-        for g in self.param_grad_ep_mapping[endpoint]["grads"]:
-            if self._orig_varname(g.name) == \
-                    self._orig_varname(grad_varname_for_block):
-                grad_block = g
-                break
-        if not grad_block:
-            # do not append this op if current endpoint
-            # is not dealing with this grad block
-            return None
-        orig_varname, block_name, trainer_name = self._get_varname_parts(
-            grad_block.name)
-        if block_name:
-            merged_var_name = '.'.join([orig_varname, block_name])
-        else:
-            merged_var_name = orig_varname
-
-        merged_var = pserver_block.vars[merged_var_name]
-        grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
-        if self.sync_mode and self.trainer_num > 1:
-            vars2merge = []
-            for i in range(self.trainer_num):
-                per_trainer_name = "%s.trainer_%d" % \
-                                   (merged_var_name, i)
-                vars2merge.append(pserver_block.vars[per_trainer_name])
-            optimize_block.append_op(
-                type="sum",
-                inputs={"X": vars2merge},
-                outputs={"Out": merged_var},
-                attrs={"use_mkldnn": False})
-            optimize_block.append_op(
-                type="scale",
-                inputs={"X": merged_var},
-                outputs={"Out": merged_var},
-                attrs={"scale": 1.0 / float(self.trainer_num)})
-        return merged_var
-
-    def _append_dc_asgd_ops(self, block, param_var, grad_var):
-        # NOTE: can not use grammar candy here, should put ops in specific block
-        local_param_bak = block.create_var(
-            name="%s.local_bak" % param_var.name,
-            shape=param_var.shape,
-            type=param_var.type,
-            dtype=param_var.dtype,
-            persistable=False)
-        # trainer_id_var is block local
-        trainer_id_var = block.create_var(
-            name="@TRAINER_ID@",
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            dtype=core.VarDesc.VarType.INT64,
-            shape=[1],
-            persistable=False)
-
-        # ref_inputs = [x[1] for x in self.param_bak_list]
-        ref_inputs = []
-        for p, p_bak in self.param_bak_list:
-            if p.name == param_var.name:
-                ref_inputs.append(p_bak)
-        block.append_op(
-            type="ref_by_trainer_id",
-            inputs={"X": ref_inputs,
-                    "TrainerId": trainer_id_var},
-            outputs={"Out": local_param_bak})
-
-        def __create_temp_var__():
-            return block.create_var(
-                name=unique_name.generate("tmp_dc_output"),
-                shape=param_var.shape,
-                type=param_var.type,
-                dtype=param_var.dtype,
-                persistable=False)
-
-        o1 = __create_temp_var__()
-        block.append_op(
-            type="elementwise_sub",
-            inputs={"X": param_var,
-                    "Y": local_param_bak},
-            outputs={"Out": o1})
-        o2 = __create_temp_var__()
-        block.append_op(
-            type="elementwise_mul",
-            inputs={"X": o1,
-                    "Y": grad_var},
-            outputs={"Out": o2})
-        o3 = __create_temp_var__()
-        block.append_op(
-            type="elementwise_mul",
-            inputs={"X": o2,
-                    "Y": grad_var},
-            outputs={"Out": o3})
-        # TODO(typhoonzero): append scale
-        o4 = __create_temp_var__()
-        block.append_op(
-            type="elementwise_add",
-            inputs={"X": grad_var,
-                    "Y": o3},
-            outputs={"Out": o4})
-        return o4
-
-    def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
-                            grad_to_block_id, origin_program, merged_var,
-                            sparse_grad_to_param):
-        program = optimize_block.program
-        pserver_block = program.global_block()
-        new_inputs = collections.OrderedDict()
-
-        def _get_param_block(opt_op):
-            # param is already created on global program
-            param_block = None
-            for p in self.param_grad_ep_mapping[endpoint]["params"]:
-                if same_or_split_var(p.name, opt_op.input("Param")[0]):
-                    param_block = p
-                    break
-            return param_block
-
-        if self.config.enable_dc_asgd:
-            param_var = _get_param_block(opt_op)
-            dc = self._append_dc_asgd_ops(optimize_block, param_var, merged_var)
-
-        for key in opt_op.input_names:
-            if key == "Grad":
-                if self.config.enable_dc_asgd:
-                    new_inputs[key] = dc
-                else:
-                    # Note!! This is for l2decay on sparse gradient, because it will create a new tensor for
-                    # decayed gradient but not inplace modify the origin one
-                    origin_grad_name = opt_op.input(key)[0]
-                    if core.kNewGradSuffix(
-                    ) in origin_grad_name and pserver_block.has_var(
-                            origin_grad_name):
-                        new_grad = pserver_block.var(origin_grad_name)
-                        new_inputs[key] = new_grad
-                    else:
-                        new_inputs[key] = merged_var
-            elif key == "Param":
-                param_block = _get_param_block(opt_op)
-                if not param_block:
-                    return
-                tmpvar = pserver_block.create_var(
-                    name=param_block.name,
-                    persistable=True,
-                    dtype=param_block.dtype,
-                    shape=param_block.shape)
-                new_inputs[key] = tmpvar
-            elif key == "LearningRate":
-                # learning rate variable has already be created by non-optimize op,
-                # don't create it once again.
-                lr_varname = opt_op.input(key)[0]
-                if lr_varname in pserver_block.vars:
-                    new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]]
-                else:
-                    origin_var = origin_program.global_block().vars[lr_varname]
-                    tmpvar = pserver_block.create_var(
-                        name=origin_var.name,
-                        persistable=origin_var.persistable,
-                        dtype=origin_var.dtype,
-                        shape=origin_var.shape)
-                    new_inputs[key] = tmpvar
-
-        for key in opt_op.input_names:
-            new_shape = None
-            if key in ["Param", "Grad", "LearningRate"]:
-                continue
-            var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
-            param_var = new_inputs["Param"]
-            # update accumulator variable shape
-            new_shape = self._get_optimizer_input_shape(
-                opt_op.type, key, var.shape, param_var.shape)
-            tmpvar = pserver_block.create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=new_shape)
-            new_inputs[key] = tmpvar
-
-        # change output's ParamOut variable
-        outputs = self._get_output_map_from_op(
-            self.origin_program.global_block().vars, opt_op)
-        outputs["ParamOut"] = new_inputs["Param"]
-        optimize_block.append_op(
-            type=opt_op.type,
-            inputs=new_inputs,
-            outputs=outputs,
-            attrs=opt_op.all_attrs())
-
-        # record sparse grad to param name
-        if new_inputs["Grad"].type == core.VarDesc.VarType.SELECTED_ROWS:
-            sparse_grad_to_param.append(
-                str(new_inputs["Grad"].name) + ":" + str(new_inputs["Param"]
-                                                         .name))
-
-    def _get_pserver_grad_param_var(self, var, var_dict):
-        """
-        Return pserver side grad/param variable, return None
-        if the variable is not grad/param, e.g.
-
-            a@GRAD -> a@GRAD.block0
-            a@GRAD -> a@GRAD (a is not splited)
-            fc_0.w_0 -> fc_0.w_0.block_0
-            fc_0.w_0 -> fc_0.w_0 (weight is not splited)
-            _generated_var_123 -> None
-        """
-        grad_block = None
-        for _, g in six.iteritems(var_dict):
-            if self._orig_varname(g.name) == self._orig_varname(var.name):
-                # skip per trainer vars
-                if g.name.find(".trainer_") == -1:
-                    # only param or grads have splited blocks
-                    if self._orig_varname(g.name) in self.grad_name_to_param_name or \
-                            self._orig_varname(g.name) in self.param_name_to_grad_name:
-                        grad_block = g
-                        break
-        return grad_block
-
-    def _clone_lr_op(self, program, block, op):
-        inputs = self._get_input_map_from_op(
-            self.origin_program.global_block().vars, op)
-        for key, varlist in six.iteritems(inputs):
-            if not isinstance(varlist, list):
-                varlist = [varlist]
-            for var in varlist:
-                if var not in program.global_block().vars:
-                    block._clone_variable(var)
-
-        outputs = self._get_output_map_from_op(
-            self.origin_program.global_block().vars, op)
-        for key, varlist in six.iteritems(outputs):
-            if not isinstance(varlist, list):
-                varlist = [varlist]
-            for var in varlist:
-                if var not in program.global_block().vars:
-                    block._clone_variable(var)
-
-        return block.append_op(
-            type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
-
-    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
-        program = optimize_block.program
-        # Append the ops for parameters that do not need to be optimized/updated
-        inputs = self._get_input_map_from_op(
-            self.origin_program.global_block().vars, opt_op)
-        for key, varlist in six.iteritems(inputs):
-            if not isinstance(varlist, list):
-                varlist = [varlist]
-            for i in range(len(varlist)):
-                var = varlist[i]
-                # for ops like clipping and weight decay, get the splited var (xxx.block0)
-                # for inputs/outputs
-                grad_block = self._get_pserver_grad_param_var(
-                    var, program.global_block().vars)
-                if grad_block:
-                    varlist[i] = grad_block
-                elif var.name not in program.global_block().vars:
-                    tmpvar = program.global_block()._clone_variable(var)
-                    varlist[i] = tmpvar
-                else:
-                    varlist[i] = program.global_block().vars[var.name]
-            inputs[key] = varlist
-
-        outputs = self._get_output_map_from_op(
-            self.origin_program.global_block().vars, opt_op)
-        for key, varlist in six.iteritems(outputs):
-            if not isinstance(varlist, list):
-                varlist = [varlist]
-            for i in range(len(varlist)):
-                var = varlist[i]
-                grad_block = self._get_pserver_grad_param_var(
-                    var, program.global_block().vars)
-                if grad_block:
-                    varlist[i] = grad_block
-                elif var.name not in program.global_block().vars:
-                    tmpvar = program.global_block()._clone_variable(var)
-                    varlist[i] = tmpvar
-                else:
-                    varlist[i] = program.global_block().vars[var.name]
-            outputs[key] = varlist
-
-        return optimize_block.append_op(
-            type=opt_op.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=opt_op.all_attrs())
-
-    def _is_op_connected(self, op1, op2):
-        # If one op's input is another op's output or
-        # one op's output is another op's input, we say
-        # the two operator is connected.
-        if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \
-                set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
-            return True
-        return False
-
-    def _create_ufind(self, optimize_ops):
-        # Create a unit find data struct by optimize ops
-        ufind = UnionFind(optimize_ops)
-        for i in range(len(optimize_ops)):
-            for j in range(i, len(optimize_ops)):
-                op1 = optimize_ops[i]
-                op2 = optimize_ops[j]
-                if self._is_op_connected(op1, op2):
-                    ufind.union(op1, op2)
-        return ufind
-
-    def _is_optimizer_op(self, op):
-        if "Param" in op.input_names and \
-                "LearningRate" in op.input_names:
-            return True
-        return False
-
-    def _is_opt_op_on_pserver(self, endpoint, op):
-        param_names = [
-            p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
-        ]
-        if op.input("Param")[0] in param_names:
-            return True
-        else:
-            for n in param_names:
-                param = op.input("Param")[0]
-                if same_or_split_var(n, param) and n != param:
-                    return True
-            return False
-
-    def _get_input_map_from_op(self, varmap, op):
-        """Returns a dict from op input name to the vars in varmap."""
-        iomap = collections.OrderedDict()
-        for key in op.input_names:
-            vars = []
-            for varname in op.input(key):
-                vars.append(varmap[varname])
-            if len(vars) == 1:
-                iomap[key] = vars[0]
-            else:
-                iomap[key] = vars
-        return iomap
-
-    def _get_output_map_from_op(self, varmap, op):
-        """Returns a dict from op output name to the vars in varmap."""
-        iomap = collections.OrderedDict()
-        for key in op.output_names:
-            vars = []
-            for varname in op.output(key):
-                vars.append(varmap[varname])
-            if len(vars) == 1:
-                iomap[key] = vars[0]
-            else:
-                iomap[key] = vars
-        return iomap
-
-    def _get_lr_ops(self):
-        lr_ops = []
-        block = self.origin_program.global_block()
-        for op in block.ops:
-            role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME))
-            if role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) or \
-                role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \
-                    int(OPT_OP_ROLE_ATTR_VALUE):
-                lr_ops.append(op)
-                log("append lr op: ", op.type)
-        return lr_ops
-
-    def _get_lr_ops_deprecated(self):
-        lr_ops = []
-        # find learning rate variables by optimize op
-        lr_vars = set()
-        for op in self.optimize_ops:
-            if self._is_optimizer_op(op):
-                lr_vars.add(op.input("LearningRate")[0])
-
-        find_ops = []
-        # find ops which output is lr var
-        block = self.origin_program.global_block()
-        for op in block.ops:
-            if set(op.output_arg_names) & lr_vars:
-                find_ops.append(op)
-        # make a union find struct by the ops in default_main_program
-        ufind = UnionFind(block.ops)
-
-        for op1 in block.ops:
-            for op2 in block.ops:
-                # NOTE: we need to skip all optimize ops, since it is connected
-                # with forward/backward ops and lr ops, we only need the lr ops.
-                if op1 != op2 and self._is_op_connected(op1, op2) and \
-                        not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
-                    ufind.union(op1, op2)
-        # find all ops which is related with lr var
-        for op1 in block.ops:
-            for op2 in find_ops:
-                if ufind.is_connected(op1, op2):
-                    lr_ops.append(op1)
-                    # we only need to append op for once
-                    break
-        return lr_ops
-
-    def _is_opt_role_op(self, op):
-        # NOTE: depend on oprole to find out whether this op is for
-        # optimize
-        op_maker = core.op_proto_and_checker_maker
-        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
-        if op_maker.kOpRoleAttrName() in op.attr_names and \
-                int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
-            return True
-        return False
-
-    def _get_optimize_pass(self):
-        """
-        Get optimizer operators, parameters and gradients from origin_program
-        Returns:
-            opt_ops (list): optimize operators.
-            params_grads (dict): parameter->gradient.
-        """
-        block = self.origin_program.global_block()
-        opt_ops = []
-        params_grads = []
-        # tmp set to dedup
-        optimize_params = set()
-        origin_var_dict = self.origin_program.global_block().vars
-        for op in block.ops:
-            if self._is_opt_role_op(op):
-                opt_ops.append(op)
-                if op.attr(OP_ROLE_VAR_ATTR_NAME):
-                    param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
-                    grad_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
-                    if not param_name in optimize_params:
-                        optimize_params.add(param_name)
-                        log("adding param_grad pair: ", param_name, grad_name)
-                        params_grads.append([
-                            origin_var_dict[param_name],
-                            origin_var_dict[grad_name]
-                        ])
-            else:
-                pass
-        return opt_ops, params_grads
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
deleted file mode 100755
index 29812812af6961fd6c1cef9b659f56c1dac1efbf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-
-
-def memory_optimize(input_program,
-                    skip_opt_set=None,
-                    print_log=False,
-                    level=0,
-                    skip_grads=True):
-    """
-    | Legacy memory optimization strategy, reduce total memory consumption by reuse variable memory between different operators.
-    | Simple sample to explain the algorithm:
-    
-        ..  code-block:: python
-        
-            c = a + b  # assume this is the last time a is used
-            d = b * c
-         
-    | since **a** will not be used anymore after **"c = a + b"**, and the size of **a** and **d** are the same, 
-      we can use variable **a** to replace variable **d**, so actually we can optimize the above code to below:
-
-        ..  code-block:: python
-        
-            c = a + b
-            a = b * c 
-          
-    
-    | Please notice that, in this legacy design, we are using variable **a** to replace **d** directly, which means 
-      after you call this API, some variables may disappear, and some variables may hold unexpected values, like 
-      the above case, actually **a** holds the value of **d** after execution. 
-    
-    | So to protect important variables from being reused/removed in the optimization, we provide skip_opt_set 
-      to allow you specify a variable whitelist. 
-      The variables in the skip_opt_set will not be affected by memory_optimize API.
-    
-    Note: 
-        | **This API is deprecated, please avoid to use it in your new code.**
-        | Does not support operators which will create sub-block like While, IfElse etc.
-    
-    Args:
-        input_program(str): Input Program
-        skip_opt_set(set): vars wil be skipped in memory optimze
-        print_log(bool): whether to print debug log.
-        level(int): 0 or 1, 0 means we replace a with b only when a.size == b.size, 1 means we can replace a with b if a.size <= b.size
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            main_prog = fluid.Program()
-            startup_prog = fluid.Program()
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-
-            exe.run(startup_prog)
-            fluid.memory_optimize(main_prog)
-
-    """
-    logging.warn(
-        'Caution! paddle.fluid.memory_optimize() is deprecated '
-        'and not maintained any more, since it is not stable!\n'
-        'This API would not take any memory optimizations on your Program '
-        'now, since we have provided default strategies for you.\n'
-        'The newest and stable memory optimization strategies (they are all '
-        'enabled by default) are as follows:\n'
-        ' 1. Garbage collection strategy, which is enabled by exporting '
-        'environment variable FLAGS_eager_delete_tensor_gb=0 (0 is the '
-        'default value).\n'
-        ' 2. Inplace strategy, which is enabled by setting '
-        'build_strategy.enable_inplace=True (True is the default value) '
-        'when using CompiledProgram or ParallelExecutor.\n')
-
-
-def release_memory(input_program, skip_opt_set=None):
-    """
-    Modify the input program and insert :code:`delete_op` to early drop not used
-    variables. The modification will be performed inplace.
-
-    Notes: This is an experimental API and could be removed in next few
-    releases. Users should not use this API.
-
-    Args:
-        input_program(Program): The program will be inserted :code:`delete_op`.
-        skip_opt_set(set): vars wil be skipped in memory optimze
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            # build network
-            # ...
-            
-            # deprecated API
-            fluid.release_memory(fluid.default_main_program())
-    
-    """
-    logging.warn('paddle.fluid.release_memory() is deprecated, it would not'
-                 ' take any memory release on your program')
diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py
deleted file mode 100644
index a04f6c2c79403844d14967067aebe371efdd3286..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/ps_dispatcher.py
+++ /dev/null
@@ -1,110 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-
-class PSDispatcher(object):
-    """
-    PSDispatcher is the base class for dispatching vars
-    into different pserver instance.
-    You need to implement the `dispatch` inferface.
-    """
-
-    def __init__(self, pserver_endpoints):
-        self._eps = pserver_endpoints
-        self._step = 0
-
-    @property
-    def eps(self):
-        return self._eps
-
-    def reset(self):
-        self._step = 0
-
-    def dispatch(self, varlist):
-        """
-        Args:
-            varlist(list): a list of Variables
-        Returns:
-            a map of pserver endpoint -> varname
-        """
-        AssertionError("Interface has not been implemented.")
-
-
-class HashName(PSDispatcher):
-    """
-    Hash variable names to several endpoints using python
-    "hash()" function.
-
-    Args:
-        pserver_endpoints (list): list of endpoint(ip:port).
-
-    Examples:
-        .. code-block:: python
-
-        pserver_endpoints = ["127.0.0.1:6007", "127.0.0.1:6008"]
-        vars = ["var1","var2","var3","var4","var5"]
-
-        rr = RoundRobin(pserver_endpoints)
-        rr.dispatch(vars)
-
-    """
-
-    def __init__(self, pserver_endpoints):
-        super(self.__class__, self).__init__(pserver_endpoints)
-
-    def _hash_block(self, block_str, total):
-        return hash(block_str) % total
-
-    def dispatch(self, varlist):
-        eplist = []
-        for var in varlist:
-            server_id = self._hash_block(var.name(), len(self._eps))
-            server_for_param = self._eps[server_id]
-            eplist.append(server_for_param)
-        return eplist
-
-
-class RoundRobin(PSDispatcher):
-    """
-    Distribute variables to serveral endpoints using
-    RondRobin<https://en.wikipedia.org/wiki/Round-robin_scheduling> method.
-
-    Args:
-        pserver_endpoints (list): list of endpoint(ip:port).
-
-    Examples:
-        .. code-block:: python
-
-        pserver_endpoints = ["127.0.0.1:6007", "127.0.0.1:6008"]
-        vars = ["var1","var2","var3","var4","var5"]
-
-        rr = RoundRobin(pserver_endpoints)
-        rr.dispatch(vars)
-
-    """
-
-    def __init__(self, pserver_endpoints):
-        super(self.__class__, self).__init__(pserver_endpoints)
-
-    def dispatch(self, varlist):
-        eplist = []
-        for var in varlist:
-            server_for_param = self._eps[self._step]
-            eplist.append(server_for_param)
-            self._step += 1
-            if self._step >= len(self._eps):
-                self._step = 0
-        return eplist
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
deleted file mode 100644
index 9e3cd063092156d0148ed824acbf0e7e9db3f656..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/unique_name.py
+++ /dev/null
@@ -1,170 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import collections
-from .wrapped_decorator import signature_safe_contextmanager
-import six
-import sys
-
-__all__ = ['generate', 'switch', 'guard']
-
-
-class UniqueNameGenerator(object):
-    """
-    Generate unique name with prefix.
-
-    Args:
-        prefix(str): The generated name prefix. All generated name will be
-                     started with this prefix.
-    """
-
-    def __init__(self, prefix=None):
-        self.ids = collections.defaultdict(int)
-        if prefix is None:
-            prefix = ""
-        self.prefix = prefix
-
-    def __call__(self, key):
-        """
-        Generate unique names with prefix
-
-        Args:
-            key(str): The key of return string.
-
-        Returns(str): A unique string with the prefix
-        """
-        tmp = self.ids[key]
-        self.ids[key] += 1
-        return self.prefix + "_".join([key, str(tmp)])
-
-
-generator = UniqueNameGenerator()
-
-
-def generate(key):
-    """
-    Generate unique name with prefix key.
-
-    Args:
-        key(str): The generated name prefix. All generated name will be 
-                  started with this prefix.
-
-    Returns: 
-        str: A unique string with the prefix key.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            name1 = fluid.unique_name.generate('fc')
-            name2 = fluid.unique_name.generate('fc')
-            # The result is fc_0, fc_1
-            print name1, name2 
-    """
-    return generator(key)
-
-
-# FIXME(zjl): The previous naming rule in static graph would
-# cause memory leak in dygraph mode. It is because the previous
-# naming rule would use `conv_0.tmp` as the key, and in dygraph
-# mode, `conv_i` increases as batch increases. Thus, keys would
-# increase in a way like `conv_0.tmp`, `conv_1.tmp`, .... 
-# Not find a better way to fix this bug in dygraph mode. In TF,
-# variable name is meaningless in eager execution mode, and in
-# PyTorch, there is no variable name at all. Maybe we should
-# discard variable name in dygraph mode.
-#
-# Another concern is that save/load interfaces. Usually, user
-# would save model in static graph mode, and load it in dygraph
-# mode. Therefore, we keep the variable name of Parameter currently.
-# 
-# Please fix me if a better method is found.        
-def generate_with_ignorable_key(key):
-    from .framework import in_dygraph_mode
-    if in_dygraph_mode():
-        key = "tmp"
-
-    return generator(key)
-
-
-def switch(new_generator=None):
-    """
-    Switch the Global namespace to a new namespace.
-
-    Args:
-        new_generator(None|UniqueNameGenerator): A new UniqueNameGenerator.
-
-    Returns: 
-        UniqueNameGenerator: The previous UniqueNameGenerator.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            name1 = fluid.unique_name.generate('fc')
-            name2 = fluid.unique_name.generate('fc')
-            # The result is fc_0, fc_1
-            print name1, name2 
-
-            fluid.unique_name.switch()
-            name2 = fluid.unique_name.generate('fc')
-            # The result is fc_0
-            print name2
-    """
-    global generator
-    old = generator
-    if new_generator is None:
-        generator = UniqueNameGenerator()
-    else:
-        generator = new_generator
-    return old
-
-
-@signature_safe_contextmanager
-def guard(new_generator=None):
-    """
-    Change the global namespace with `with` statement.
-    
-    Args:
-        new_generator(None|str|bytes): New name of global namespace.
-            Note that str in Python2 was spilted into str and bytes in Python3, 
-            so here are two types. Default is None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            with fluid.unique_name.guard():
-              name_1 = fluid.unique_name.generate('fc')
-            with fluid.unique_name.guard():
-              name_2 = fluid.unique_name.generate('fc')
-            # The result is fc_0, fc_0
-            print name_1, name_2
-
-            with fluid.unique_name.guard('A'):
-              name_1 = fluid.unique_name.generate('fc')
-            with fluid.unique_name.guard('B'):
-              name_2 = fluid.unique_name.generate('fc')
-            # The result is Afc_0, Bfc_0
-            print name_1, name_2
-    """
-    if isinstance(new_generator, six.string_types):
-        new_generator = UniqueNameGenerator(new_generator)
-    elif isinstance(new_generator, six.binary_type):
-        new_generator = UniqueNameGenerator(new_generator.decode())
-    old = switch(new_generator)
-    yield
-    switch(old)
diff --git a/python/paddle/fluid/wrapped_decorator.py b/python/paddle/fluid/wrapped_decorator.py
deleted file mode 100644
index 7e7dbff65611e947d1a11a0c33c6ecc27e6df636..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/wrapped_decorator.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import decorator
-import contextlib
-
-__all__ = ['wrap_decorator', 'signature_safe_contextmanager']
-
-
-def wrap_decorator(decorator_func):
-    @decorator.decorator
-    def __impl__(func, *args, **kwargs):
-        wrapped_func = decorator_func(func)
-        return wrapped_func(*args, **kwargs)
-
-    return __impl__
-
-
-signature_safe_contextmanager = wrap_decorator(contextlib.contextmanager)
diff --git a/python/paddle/libs/__init__.py b/python/paddle/libs/__init__.py
deleted file mode 100644
index 34d4f4d07ed0d452c1965c5f1f198230571931aa..0000000000000000000000000000000000000000
--- a/python/paddle/libs/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# used for setup.py.in to store the thirdparty shared libraries
diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py
deleted file mode 100644
index 29337cf06682f5f5bf8e0e6d9b1bf8ec32512d45..0000000000000000000000000000000000000000
--- a/python/paddle/reader/__init__.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-At training and testing time, PaddlePaddle programs need to read data. To ease
-the users' work to write data reading code, we define that
-
-- A *reader* is a function that reads data (from file, network, random number
-  generator, etc) and yields data items.
-- A *reader creator* is a function that returns a reader function.
-- A *reader decorator* is a function, which accepts one or more readers, and
-  returns a reader.
-- A *batch reader* is a function that reads data (from *reader*, file, network,
-  random number generator, etc) and yields a batch of data items.
-
-#####################
-Data Reader Interface
-#####################
-
-Indeed, *data reader* doesn't have to be a function that reads and yields data
-items. It can be any function with no parameter that creates a iterable
-(anything can be used in :code:`for x in iterable`)\:
-
-..  code-block:: python
-
-    iterable = data_reader()
-
-Element produced from the iterable should be a **single** entry of data,
-**not** a mini batch. That entry of data could be a single item, or a tuple of
-items.
-Item should be of supported type (e.g., numpy array or list/tuple of float 
-or int).
-
-An example implementation for single item data reader creator:
-
-..  code-block:: python
-
-    def reader_creator_random_image(width, height):
-        def reader():
-            while True:
-                yield numpy.random.uniform(-1, 1, size=width*height)
-    return reader
-
-An example implementation for multiple item data reader creator:
-
-..  code-block:: python
-
-    def reader_creator_random_image_and_label(width, height, label):
-        def reader():
-            while True:
-                yield numpy.random.uniform(-1, 1, size=width*height), label
-    return reader
-
-"""
-
-import paddle.reader.decorator
-from paddle.reader.decorator import *
-
-__all__ = decorator.__all__
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
deleted file mode 100644
index ab7b21325b723d963ff8276fe1377a2dc81878c3..0000000000000000000000000000000000000000
--- a/python/paddle/reader/decorator.py
+++ /dev/null
@@ -1,580 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = [
-    'cache', 'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader',
-    'multiprocess_reader', 'Fake'
-]
-
-from threading import Thread
-import subprocess
-import multiprocessing
-import six
-import sys
-
-from six.moves.queue import Queue
-from six.moves import zip_longest
-from six.moves import map
-from six.moves import zip
-import itertools
-import random
-import zlib
-import paddle.compat as cpt
-
-
-def cache(reader):
-    """
-    Cache the reader data into memory. 
-
-    Be careful that this method may take long time to process, 
-    and consume lots of memory. :code:`reader()` would only 
-    call once. 
-
-    Args:
-        reader (generator): a reader object which yields 
-            data each time.
-
-    Returns:
-        generator: a decorated reader object which yields data from cached memory.
-    """
-    all_data = tuple(reader())
-
-    def __impl__():
-        for item in all_data:
-            yield item
-
-    return __impl__
-
-
-def map_readers(func, *readers):
-    """
-    Creates a data reader that outputs return value of function using
-    output of each data readers as arguments.
-
-    :param func: function to use. The type of func should be (Sample) => Sample
-    :type: callable
-    :param readers: readers whose outputs will be used as arguments of func.
-    :return: the created data reader.
-    :rtype: callable
-    """
-
-    def reader():
-        rs = []
-        for r in readers:
-            rs.append(r())
-        for e in map(func, *rs):
-            yield e
-
-    return reader
-
-
-def shuffle(reader, buf_size):
-    """
-    Creates a data reader whose data output is shuffled.
-
-    Output from the iterator that created by original reader will be
-    buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
-    is determined by argument buf_size.
-
-    :param reader: the original reader whose output will be shuffled.
-    :type reader: callable
-    :param buf_size: shuffle buffer size.
-    :type buf_size: int
-
-    :return: the new reader whose output is shuffled.
-    :rtype: callable
-    """
-
-    def data_reader():
-        buf = []
-        for e in reader():
-            buf.append(e)
-            if len(buf) >= buf_size:
-                random.shuffle(buf)
-                for b in buf:
-                    yield b
-                buf = []
-
-        if len(buf) > 0:
-            random.shuffle(buf)
-            for b in buf:
-                yield b
-
-    return data_reader
-
-
-def chain(*readers):
-    """
-    Creates a data reader whose output is the outputs of input data
-    readers chained together.
-
-    If input readers output following data entries:
-    [0, 0, 0]
-    [1, 1, 1]
-    [2, 2, 2]
-    The chained reader will output:
-    [0, 0, 0, 1, 1, 1, 2, 2, 2]
-
-    :param readers: input readers.
-    :return: the new data reader.
-    :rtype: callable
-    """
-
-    def reader():
-        rs = []
-        for r in readers:
-            rs.append(r())
-
-        for e in itertools.chain(*rs):
-            yield e
-
-    return reader
-
-
-class ComposeNotAligned(ValueError):
-    pass
-
-
-def compose(*readers, **kwargs):
-    """
-    Creates a data reader whose output is the combination of input readers.
-
-    If input readers output following data entries:
-    (1, 2)    3    (4, 5)
-    The composed reader will output:
-    (1, 2, 3, 4, 5)
-
-    :param readers: readers that will be composed together.
-    :param check_alignment: if True, will check if input readers are aligned
-        correctly. If False, will not check alignment and trailing outputs
-        will be discarded. Defaults to True.
-    :type check_alignment: bool
-
-    :return: the new data reader.
-
-    :raises ComposeNotAligned: outputs of readers are not aligned.
-        Will not raise when check_alignment is set to False.
-    """
-    check_alignment = kwargs.pop('check_alignment', True)
-
-    def make_tuple(x):
-        if isinstance(x, tuple):
-            return x
-        else:
-            return (x, )
-
-    def reader():
-        rs = []
-        for r in readers:
-            rs.append(r())
-        if not check_alignment:
-            for outputs in zip(*rs):
-                yield sum(list(map(make_tuple, outputs)), ())
-        else:
-            for outputs in zip_longest(*rs):
-                for o in outputs:
-                    if o is None:
-                        # None will be not be present if compose is aligned
-                        raise ComposeNotAligned(
-                            "outputs of readers are not aligned.")
-                yield sum(list(map(make_tuple, outputs)), ())
-
-    return reader
-
-
-def buffered(reader, size):
-    """
-    Creates a buffered data reader.
-
-    The buffered data reader will read and save data entries into a
-    buffer. Reading from the buffered data reader will proceed as long
-    as the buffer is not empty.
-
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param size: max buffer size.
-    :type size: int
-
-    :returns: the buffered data reader.
-    """
-
-    class EndSignal():
-        pass
-
-    end = EndSignal()
-
-    def read_worker(r, q):
-        for d in r:
-            q.put(d)
-        q.put(end)
-
-    def data_reader():
-        r = reader()
-        q = Queue(maxsize=size)
-        t = Thread(
-            target=read_worker, args=(
-                r,
-                q, ))
-        t.daemon = True
-        t.start()
-        e = q.get()
-        while e != end:
-            yield e
-            e = q.get()
-
-    return data_reader
-
-
-def firstn(reader, n):
-    """
-    Limit the max number of samples that reader could return.
-
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param n: the max number of samples that return.
-    :type n: int
-    :return: the decorated reader.
-    :rtype: callable
-    """
-
-    # TODO(yuyang18): Check if just drop the reader, could clean the opened
-    # resource or not?
-
-    def firstn_reader():
-        for i, item in enumerate(reader()):
-            if i == n:
-                break
-            yield item
-
-    return firstn_reader
-
-
-class XmapEndSignal():
-    pass
-
-
-def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
-    """
-    Use multi-threads to map samples from reader by a mapper defined by user.
-
-    Args:
-        mapper (callable): a function to map the data from reader.
-        reader (callable): a data reader which yields the data. 
-        process_num (int): thread number to handle original sample.
-        buffer_size (int): size of the queue to read data in. 
-        order (bool): whether to keep the data order from original reader. 
-            Default False.
-
-    Returns:
-        callable: a decorated reader with data mapping. 
-    """
-    end = XmapEndSignal()
-
-    # define a worker to read samples from reader to in_queue
-    def read_worker(reader, in_queue):
-        for i in reader():
-            in_queue.put(i)
-        in_queue.put(end)
-
-    # define a worker to read samples from reader to in_queue with order flag
-    def order_read_worker(reader, in_queue):
-        in_order = 0
-        for i in reader():
-            in_queue.put((in_order, i))
-            in_order += 1
-        in_queue.put(end)
-
-    # define a worker to handle samples from in_queue by mapper
-    # and put mapped samples into out_queue
-    def handle_worker(in_queue, out_queue, mapper):
-        sample = in_queue.get()
-        while not isinstance(sample, XmapEndSignal):
-            r = mapper(sample)
-            out_queue.put(r)
-            sample = in_queue.get()
-        in_queue.put(end)
-        out_queue.put(end)
-
-    # define a worker to handle samples from in_queue by mapper
-    # and put mapped samples into out_queue by order
-    def order_handle_worker(in_queue, out_queue, mapper, out_order):
-        ins = in_queue.get()
-        while not isinstance(ins, XmapEndSignal):
-            order, sample = ins
-            r = mapper(sample)
-            while order != out_order[0]:
-                pass
-            out_queue.put(r)
-            out_order[0] += 1
-            ins = in_queue.get()
-        in_queue.put(end)
-        out_queue.put(end)
-
-    def xreader():
-        in_queue = Queue(buffer_size)
-        out_queue = Queue(buffer_size)
-        out_order = [0]
-        # start a read worker in a thread
-        target = order_read_worker if order else read_worker
-        t = Thread(target=target, args=(reader, in_queue))
-        t.daemon = True
-        t.start()
-        # start several handle_workers
-        target = order_handle_worker if order else handle_worker
-        args = (in_queue, out_queue, mapper, out_order) if order else (
-            in_queue, out_queue, mapper)
-        workers = []
-        for i in range(process_num):
-            worker = Thread(target=target, args=args)
-            worker.daemon = True
-            workers.append(worker)
-        for w in workers:
-            w.start()
-
-        sample = out_queue.get()
-        while not isinstance(sample, XmapEndSignal):
-            yield sample
-            sample = out_queue.get()
-        finish = 1
-        while finish < process_num:
-            sample = out_queue.get()
-            if isinstance(sample, XmapEndSignal):
-                finish += 1
-            else:
-                yield sample
-
-    return xreader
-
-
-def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
-    """
-    multiprocess_reader use python multi process to read data from readers
-    and then use multiprocess.Queue or multiprocess.Pipe to merge all
-    data. The process number is equal to the number of input readers, each
-    process call one reader.
-
-    Multiprocess.Queue require the rw access right to /dev/shm, some
-    platform does not support.
-
-    you need to create multiple readers first, these readers should be independent
-    to each other so that each process can work independently.
-
-    An example:
-
-    .. code-block:: python
-
-        reader0 = reader(["file01", "file02"])
-        reader1 = reader(["file11", "file12"])
-        reader1 = reader(["file21", "file22"])
-        reader = multiprocess_reader([reader0, reader1, reader2],
-            queue_size=100, use_pipe=False)
-    """
-
-    try:
-        import ujson as json
-    except Exception as e:
-        sys.stderr.write("import ujson error: " + str(e) + " use json\n")
-        import json
-
-    assert type(readers) is list and len(readers) > 0
-
-    def _read_into_queue(reader, queue):
-        try:
-            for sample in reader():
-                if sample is None:
-                    raise ValueError("sample has None")
-                queue.put(sample)
-            queue.put(None)
-        except:
-            queue.put("")
-            six.reraise(*sys.exc_info())
-
-    def queue_reader():
-        queue = multiprocessing.Queue(queue_size)
-        for reader in readers:
-            p = multiprocessing.Process(
-                target=_read_into_queue, args=(reader, queue))
-            p.start()
-
-        reader_num = len(readers)
-        finish_num = 0
-        while finish_num < reader_num:
-            sample = queue.get()
-            if sample is None:
-                finish_num += 1
-            elif sample == "":
-                raise ValueError("multiprocess reader raises an exception")
-            else:
-                yield sample
-
-    def _read_into_pipe(reader, conn):
-        try:
-            for sample in reader():
-                if sample is None:
-                    raise ValueError("sample has None!")
-                conn.send(json.dumps(sample))
-            conn.send(json.dumps(None))
-            conn.close()
-        except:
-            conn.send(json.dumps(""))
-            conn.close()
-            six.reraise(*sys.exc_info())
-
-    def pipe_reader():
-        conns = []
-        for reader in readers:
-            parent_conn, child_conn = multiprocessing.Pipe()
-            conns.append(parent_conn)
-            p = multiprocessing.Process(
-                target=_read_into_pipe, args=(reader, child_conn))
-            p.start()
-
-        reader_num = len(readers)
-        finish_num = 0
-        conn_to_remove = []
-        while finish_num < reader_num:
-            for conn in conn_to_remove:
-                conns.remove(conn)
-            conn_to_remove = []
-            for conn in conns:
-                sample = json.loads(conn.recv())
-                if sample is None:
-                    finish_num += 1
-                    conn.close()
-                    conn_to_remove.append(conn)
-                elif sample == "":
-                    conn.close()
-                    conn_to_remove.append(conn)
-                    raise ValueError("multiprocess reader raises an exception")
-                else:
-                    yield sample
-
-    if use_pipe:
-        return pipe_reader
-    else:
-        return queue_reader
-
-
-def _buf2lines(buf, line_break="\n"):
-    # FIXME: line_break should be automatically configured.
-    lines = buf.split(line_break)
-    return lines[:-1], lines[-1]
-
-
-class PipeReader:
-    """
-        PipeReader read data by stream from a command, take it's
-        stdout into a pipe buffer and redirect it to the parser to
-        parse, then yield data as your desired format.
-
-        You can using standard linux command or call another program
-        to read data, from HDFS, Ceph, URL, AWS S3 etc:
-
-        .. code-block:: python
-           cmd = "hadoop fs -cat /path/to/some/file"
-           cmd = "cat sample_file.tar.gz"
-           cmd = "curl http://someurl"
-           cmd = "python print_s3_bucket.py"
-
-        An example:
-
-        .. code-block:: python
-
-           def example_reader():
-               for f in myfiles:
-                   pr = PipeReader("cat %s"%f)
-                   for l in pr.get_line():
-                       sample = l.split(" ")
-                       yield sample
-    """
-
-    def __init__(self, command, bufsize=8192, file_type="plain"):
-        if not isinstance(command, str):
-            raise TypeError("left_cmd must be a string")
-        if file_type == "gzip":
-            self.dec = zlib.decompressobj(
-                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
-        self.file_type = file_type
-        self.bufsize = bufsize
-        self.process = subprocess.Popen(
-            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
-
-    def get_line(self, cut_lines=True, line_break="\n"):
-        """
-        :param cut_lines: cut buffer to lines
-        :type cut_lines: bool
-        :param line_break: line break of the file, like '\\\\n' or '\\\\r'
-        :type line_break: string
-
-        :return: one line or a buffer of bytes
-        :rtype: string
-        """
-        remained = ""
-        while True:
-            buff = self.process.stdout.read(self.bufsize)
-            if buff:
-                if self.file_type == "gzip":
-                    decomp_buff = cpt.to_text(self.dec.decompress(buff))
-                elif self.file_type == "plain":
-                    decomp_buff = cpt.to_text(buff)
-                else:
-                    raise TypeError("file_type %s is not allowed" %
-                                    self.file_type)
-
-                if cut_lines:
-                    lines, remained = _buf2lines(''.join(
-                        [remained, decomp_buff]), line_break)
-                    for line in lines:
-                        yield line
-                else:
-                    yield decomp_buff
-            else:
-                break
-
-
-class Fake(object):
-    """
-    fake reader will cache the first data it read and yield it out for data_num times.
-    It is used to cache a data from real reader and use it for speed testing.
-
-    :param reader: the origin reader
-    :param data_num: times that this reader will yield data.
-
-    :return: a fake reader.
-
-    Examples:
-        .. code-block:: python
-
-            def reader():
-                for i in range(10):
-                    yield i
-
-            fake_reader = Fake()(reader, 100)
-    """
-
-    def __init__(self):
-        self.data = None
-        self.yield_num = 0
-
-    def __call__(self, reader, data_num):
-        def fake_reader():
-            if self.data is None:
-                self.data = next(reader())
-            while self.yield_num < data_num:
-                yield self.data
-                self.yield_num += 1
-            self.yield_num = 0
-
-        return fake_reader
diff --git a/python/paddle/reader/tests/CMakeLists.txt b/python/paddle/reader/tests/CMakeLists.txt
deleted file mode 100644
index 969718d3b1837bde2e953778be9a1390cc53bb3d..0000000000000000000000000000000000000000
--- a/python/paddle/reader/tests/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/reader/tests/__init__.py b/python/paddle/reader/tests/__init__.py
deleted file mode 100644
index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000
--- a/python/paddle/reader/tests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py
deleted file mode 100644
index ef07640ed839419d69f47503b162be597238cae6..0000000000000000000000000000000000000000
--- a/python/paddle/reader/tests/decorator_test.py
+++ /dev/null
@@ -1,223 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import time
-import unittest
-import functools
-
-import paddle.reader
-
-
-def reader_creator_10(dur):
-    def reader():
-        for i in range(10):
-            # this invocation helps testing paddle.reader.buffer
-            time.sleep(dur)
-            yield i
-
-    return reader
-
-
-class TestMap(unittest.TestCase):
-    def test_map(self):
-        d = {"h": 0, "i": 1}
-
-        def tokenize(x):
-            return d[x]
-
-        def read():
-            yield "h"
-            yield "i"
-
-        r = paddle.reader.map_readers(tokenize, read)
-        for i, e in enumerate(r()):
-            self.assertEqual(e, i)
-
-
-class TestBuffered(unittest.TestCase):
-    def test_read(self):
-        for size in range(20):
-            b = paddle.reader.buffered(reader_creator_10(0), size)
-            c = 0
-            for i in b():
-                self.assertEqual(i, c)
-                c += 1
-            self.assertEqual(c, 10)
-
-    def test_buffering(self):
-        # read have 30ms delay.
-        b = paddle.reader.buffered(reader_creator_10(0.03), 10)
-        last_time = time.time()
-        for idx, i in enumerate(b()):
-            elapsed_time = time.time() - last_time
-            if i == 0:
-                time.sleep(1)
-            else:
-                # read time should be short, meaning already buffered.
-                self.assertLess(elapsed_time, 0.08)
-            last_time = time.time()
-
-
-class TestCompose(unittest.TestCase):
-    def test_compse(self):
-        reader = paddle.reader.compose(
-            reader_creator_10(0), reader_creator_10(0))
-        for idx, e in enumerate(reader()):
-            self.assertEqual(e, (idx, idx))
-
-    def test_compose_not_aligned(self):
-        total = 0
-        reader = paddle.reader.compose(
-            paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
-            reader_creator_10(0))
-        with self.assertRaises(paddle.reader.ComposeNotAligned):
-            for e in reader():
-                total += 1
-        # expecting 10, not 20
-        self.assertEqual(total, 10)
-
-    def test_compose_not_aligned_no_check(self):
-        total = 0
-        reader = paddle.reader.compose(
-            paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
-            reader_creator_10(0),
-            check_alignment=False)
-        for e in reader():
-            total += 1
-        # expecting 10, not 20
-        self.assertEqual(total, 10)
-
-
-class TestChain(unittest.TestCase):
-    def test_chain(self):
-        c = paddle.reader.chain(reader_creator_10(0), reader_creator_10(0))
-        idx = 0
-        for e in c():
-            self.assertEqual(e, idx % 10)
-            idx += 1
-        self.assertEqual(idx, 20)
-
-
-class TestShuffle(unittest.TestCase):
-    def test_shuffle(self):
-        case = [(0, True), (1, True), (10, False), (100, False)]
-        a = reader_creator_10(0)
-        for size, checkEq in case:
-            s = paddle.reader.shuffle(a, size)
-            total = 0
-            for idx, e in enumerate(s()):
-                if checkEq:
-                    self.assertEqual(idx, e)
-                total += 1
-            self.assertEqual(total, 10)
-
-
-class TestXmap(unittest.TestCase):
-    def test_xmap(self):
-        def mapper(x):
-            return (x + 1)
-
-        orders = (True, False)
-        thread_nums = (1, 2, 4, 8, 16)
-        buffered_size = (1, 2, 4, 8, 16)
-        for order in orders:
-            for tNum in thread_nums:
-                for size in buffered_size:
-                    reader = paddle.reader.xmap_readers(mapper,
-                                                        reader_creator_10(0),
-                                                        tNum, size, order)
-                    for n in range(3):
-                        result = []
-                        for i in reader():
-                            result.append(i)
-                        if not order:
-                            result.sort()
-                        for idx, e in enumerate(result):
-                            self.assertEqual(e, mapper(idx))
-
-
-class TestPipeReader(unittest.TestCase):
-    def test_pipe_reader(self):
-        def example_reader(myfiles):
-            for f in myfiles:
-                pr = paddle.reader.PipeReader("cat %s" % f, bufsize=128)
-                for l in pr.get_line():
-                    yield l
-
-        import tempfile
-
-        records = [str(i) for i in range(5)]
-        temp = tempfile.NamedTemporaryFile()
-        try:
-            with open(temp.name, 'w') as f:
-                for r in records:
-                    f.write('%s\n' % r)
-
-            result = []
-            for r in example_reader([temp.name]):
-                result.append(r)
-
-            for idx, e in enumerate(records):
-                self.assertEqual(e, result[idx])
-        finally:
-            # delete the temporary file
-            temp.close()
-
-
-class TestMultiProcessReader(unittest.TestCase):
-    def setup(self):
-        self.samples = []
-        for i in range(1000):
-            self.samples.append([[i], [i + 1, i + 2], i + 3])
-
-        def reader(index):
-            for i in range(len(self.samples)):
-                if i % 3 == index:
-                    yield self.samples[i]
-
-        self.reader0 = functools.partial(reader, 0)
-        self.reader1 = functools.partial(reader, 1)
-        self.reader2 = functools.partial(reader, 2)
-
-    def reader_test(self, use_pipe):
-        self.setup()
-        results = []
-        for data in paddle.reader.multiprocess_reader(
-            [self.reader0, self.reader1, self.reader2], 100, use_pipe)():
-            results.append(data)
-        self.assertEqual(sorted(self.samples), sorted(results))
-
-    def test_distributed_batch_reader(self):
-        self.reader_test(use_pipe=False)
-        self.reader_test(use_pipe=True)
-
-
-class TestFakeReader(unittest.TestCase):
-    def test_fake_reader(self):
-        def reader():
-            for i in range(10):
-                yield i
-
-        data_num = 100
-        fake_reader = paddle.reader.Fake()(reader, data_num)
-        for _ in range(10):
-            i = 0
-            for data in fake_reader():
-                self.assertEqual(data, 0)
-                i += 1
-            self.assertEqual(i, data_num)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
deleted file mode 100644
index db6fe2d5fff4ed1617d793faee23f01395841768..0000000000000000000000000000000000000000
--- a/python/paddle/utils/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .plot import Ploter
-__all__ = ['dump_config', 'Ploter']
diff --git a/python/paddle/utils/image_util.py b/python/paddle/utils/image_util.py
deleted file mode 100644
index a8092349cde8a4cb30873bf819fd5ed96289a945..0000000000000000000000000000000000000000
--- a/python/paddle/utils/image_util.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from PIL import Image
-from six.moves import cStringIO as StringIO
-
-
-def resize_image(img, target_size):
-    """
-    Resize an image so that the shorter edge has length target_size.
-    img: the input image to be resized.
-    target_size: the target resized image size.
-    """
-    percent = (target_size / float(min(img.size[0], img.size[1])))
-    resized_size = int(round(img.size[0] * percent)), int(
-        round(img.size[1] * percent))
-    img = img.resize(resized_size, Image.ANTIALIAS)
-    return img
-
-
-def flip(im):
-    """
-    Return the flipped image.
-    Flip an image along the horizontal direction.
-    im: input image, (H x W x K) ndarrays
-    """
-    if len(im.shape) == 3:
-        return im[:, :, ::-1]
-    else:
-        return im[:, ::-1]
-
-
-def crop_img(im, inner_size, color=True, test=True):
-    """
-    Return cropped image.
-    The size of the cropped image is inner_size * inner_size.
-    im: (K x H x W) ndarrays
-    inner_size: the cropped image size.
-    color: whether it is color image.
-    test: whether in test mode.
-      If False, does random cropping and flipping.
-      If True, crop the center of images.
-    """
-    if color:
-        height, width = max(inner_size, im.shape[1]), max(inner_size,
-                                                          im.shape[2])
-        padded_im = np.zeros((3, height, width))
-        startY = (height - im.shape[1]) / 2
-        startX = (width - im.shape[2]) / 2
-        endY, endX = startY + im.shape[1], startX + im.shape[2]
-        padded_im[:, startY:endY, startX:endX] = im
-    else:
-        im = im.astype('float32')
-        height, width = max(inner_size, im.shape[0]), max(inner_size,
-                                                          im.shape[1])
-        padded_im = np.zeros((height, width))
-        startY = (height - im.shape[0]) / 2
-        startX = (width - im.shape[1]) / 2
-        endY, endX = startY + im.shape[0], startX + im.shape[1]
-        padded_im[startY:endY, startX:endX] = im
-    if test:
-        startY = (height - inner_size) / 2
-        startX = (width - inner_size) / 2
-    else:
-        startY = np.random.randint(0, height - inner_size + 1)
-        startX = np.random.randint(0, width - inner_size + 1)
-    endY, endX = startY + inner_size, startX + inner_size
-    if color:
-        pic = padded_im[:, startY:endY, startX:endX]
-    else:
-        pic = padded_im[startY:endY, startX:endX]
-    if (not test) and (np.random.randint(2) == 0):
-        pic = flip(pic)
-    return pic
-
-
-def decode_jpeg(jpeg_string):
-    np_array = np.array(Image.open(StringIO(jpeg_string)))
-    if len(np_array.shape) == 3:
-        np_array = np.transpose(np_array, (2, 0, 1))
-    return np_array
-
-
-def preprocess_img(im, img_mean, crop_size, is_train, color=True):
-    """
-    Does data augmentation for images.
-    If is_train is false, cropping the center region from the image.
-    If is_train is true, randomly crop a region from the image,
-    and randomy does flipping.
-    im: (K x H x W) ndarrays
-    """
-    im = im.astype('float32')
-    test = not is_train
-    pic = crop_img(im, crop_size, color, test)
-    pic -= img_mean
-    return pic.flatten()
-
-
-def load_meta(meta_path, mean_img_size, crop_size, color=True):
-    """
-    Return the loaded meta file.
-    Load the meta image, which is the mean of the images in the dataset.
-    The mean image is subtracted from every input image so that the expected mean
-    of each input image is zero.
-    """
-    mean = np.load(meta_path)['data_mean']
-    border = (mean_img_size - crop_size) / 2
-    if color:
-        assert (mean_img_size * mean_img_size * 3 == mean.shape[0])
-        mean = mean.reshape(3, mean_img_size, mean_img_size)
-        mean = mean[:, border:border + crop_size, border:border +
-                    crop_size].astype('float32')
-    else:
-        assert (mean_img_size * mean_img_size == mean.shape[0])
-        mean = mean.reshape(mean_img_size, mean_img_size)
-        mean = mean[border:border + crop_size, border:border +
-                    crop_size].astype('float32')
-    return mean
-
-
-def load_image(img_path, is_color=True):
-    """
-    Load image and return.
-    img_path: image path.
-    is_color: is color image or not.
-    """
-    img = Image.open(img_path)
-    img.load()
-    return img
-
-
-def oversample(img, crop_dims):
-    """
-    image : iterable of (H x W x K) ndarrays
-    crop_dims: (height, width) tuple for the crops.
-    Returned data contains ten crops of input image, namely,
-    four corner patches and the center patch as well as their
-    horizontal reflections.
-    """
-    # Dimensions and center.
-    im_shape = np.array(img[0].shape)
-    crop_dims = np.array(crop_dims)
-    im_center = im_shape[:2] / 2.0
-
-    # Make crop coordinates
-    h_indices = (0, im_shape[0] - crop_dims[0])
-    w_indices = (0, im_shape[1] - crop_dims[1])
-    crops_ix = np.empty((5, 4), dtype=int)
-    curr = 0
-    for i in h_indices:
-        for j in w_indices:
-            crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1])
-            curr += 1
-    crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate(
-        [-crop_dims / 2.0, crop_dims / 2.0])
-    crops_ix = np.tile(crops_ix, (2, 1))
-
-    # Extract crops
-    crops = np.empty(
-        (10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]),
-        dtype=np.float32)
-    ix = 0
-    for im in img:
-        for crop in crops_ix:
-            crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :]
-            ix += 1
-        crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :]  # flip for mirrors
-    return crops
-
-
-class ImageTransformer:
-    def __init__(self,
-                 transpose=None,
-                 channel_swap=None,
-                 mean=None,
-                 is_color=True):
-        self.is_color = is_color
-        self.set_transpose(transpose)
-        self.set_channel_swap(channel_swap)
-        self.set_mean(mean)
-
-    def set_transpose(self, order):
-        if order is not None:
-            if self.is_color:
-                assert 3 == len(order)
-        self.transpose = order
-
-    def set_channel_swap(self, order):
-        if order is not None:
-            if self.is_color:
-                assert 3 == len(order)
-        self.channel_swap = order
-
-    def set_mean(self, mean):
-        if mean is not None:
-            # mean value, may be one value per channel
-            if mean.ndim == 1:
-                mean = mean[:, np.newaxis, np.newaxis]
-            else:
-                # elementwise mean
-                if self.is_color:
-                    assert len(mean.shape) == 3
-        self.mean = mean
-
-    def transformer(self, data):
-        if self.transpose is not None:
-            data = data.transpose(self.transpose)
-        if self.channel_swap is not None:
-            data = data[self.channel_swap, :, :]
-        if self.mean is not None:
-            data -= self.mean
-        return data
diff --git a/python/paddle/utils/plot.py b/python/paddle/utils/plot.py
deleted file mode 100644
index ee651f2f0cd6f2e594a4e74c896baa924f70bbf5..0000000000000000000000000000000000000000
--- a/python/paddle/utils/plot.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import six
-
-
-class PlotData(object):
-    def __init__(self):
-        self.step = []
-        self.value = []
-
-    def append(self, step, value):
-        self.step.append(step)
-        self.value.append(value)
-
-    def reset(self):
-        self.step = []
-        self.value = []
-
-
-class Ploter(object):
-    """
-        Plot input data in a 2D graph
-        
-        Args:
-            title: assign the title of input data.
-            step: x_axis of the data.
-            value: y_axis of the data.
-    """
-
-    def __init__(self, *args):
-        self.__args__ = args
-        self.__plot_data__ = {}
-        for title in args:
-            self.__plot_data__[title] = PlotData()
-        # demo in notebooks will use Ploter to plot figure, but when we convert
-        # the ipydb to py file for testing, the import of matplotlib will make the
-        # script crash. So we can use `export DISABLE_PLOT=True` to disable import
-        # these libs
-        self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
-        if not self.__plot_is_disabled__():
-            import matplotlib.pyplot as plt
-            from IPython import display
-            self.plt = plt
-            self.display = display
-
-    def __plot_is_disabled__(self):
-        return self.__disable_plot__ == "True"
-
-    def append(self, title, step, value):
-        """
-        Feed data
-
-        Args:
-                title: assign the group data to this subtitle.
-                step: the x_axis of data.
-                value: the y_axis of data.
-            
-            Examples:
-                .. code-block:: python
-                plot_curve = Ploter("Curve 1","Curve 2")
-                plot_curve.append(title="Curve 1",step=1,value=1)
-        """
-        assert isinstance(title, six.string_types)
-        assert title in self.__plot_data__
-        data = self.__plot_data__[title]
-        assert isinstance(data, PlotData)
-        data.append(step, value)
-
-    def plot(self, path=None):
-        """
-            Plot data in a 2D graph
-
-            Args:
-                path: store the figure to this file path. Defaul None. 
-              
-            Examples:
-                .. code-block:: python
-                plot_curve = Ploter()
-                plot_cure.plot()
-        """
-        if self.__plot_is_disabled__():
-            return
-
-        titles = []
-        for title in self.__args__:
-            data = self.__plot_data__[title]
-            assert isinstance(data, PlotData)
-            if len(data.step) > 0:
-                titles.append(title)
-                self.plt.plot(data.step, data.value)
-        self.plt.legend(titles, loc='upper left')
-        if path is None:
-            self.display.clear_output(wait=True)
-            self.display.display(self.plt.gcf())
-        else:
-            self.plt.savefig(path)
-        self.plt.gcf().clear()
-
-    def reset(self):
-        for key in self.__plot_data__:
-            data = self.__plot_data__[key]
-            assert isinstance(data, PlotData)
-            data.reset()
diff --git a/python/paddle/utils/plotcurve.py b/python/paddle/utils/plotcurve.py
deleted file mode 100644
index a95e5497e23571e61e5d7652830a99efd7793083..0000000000000000000000000000000000000000
--- a/python/paddle/utils/plotcurve.py
+++ /dev/null
@@ -1,155 +0,0 @@
-#!/usr/bin/python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Plot training and testing curve from paddle log.
-
-It takes input from a file or stdin, and output to a file or stdout.
-
-Note: must have numpy and matplotlib installed in order to use this tool.
-
-usage: Plot training and testing curves from paddle log file.
-       [-h] [-i INPUT] [-o OUTPUT] [--format FORMAT] [key [key ...]]
-
-positional arguments:
-  key                   keys of scores to plot, the default will be AvgCost
-
-optional arguments:
-  -h, --help            show this help message and exit
-  -i INPUT, --input INPUT
-                        input filename of paddle log, default will be standard
-                        input
-  -o OUTPUT, --output OUTPUT
-                        output filename of figure, default will be standard
-                        output
-  --format FORMAT       figure format(png|pdf|ps|eps|svg)
-
-
-The keys must be in the order of paddle output(!!!).
-
-For example, paddle.INFO contrains the following log
-   I0406 21:26:21.325584  3832 Trainer.cpp:601]  Pass=0 Batch=7771 AvgCost=0.624935 Eval: error=0.260972
-
-To use this script to generate plot for AvgCost, error:
-   python plotcurve.py -i paddle.INFO -o figure.png AvgCost error
-"""
-
-import six
-import sys
-import matplotlib
-# the following line is added immediately after import matplotlib
-# and before import pylot. The purpose is to ensure the plotting
-# works even under remote login (i.e. headless display)
-matplotlib.use('Agg')
-from matplotlib import cm
-import matplotlib.pyplot as pyplot
-import numpy
-import argparse
-import re
-import os
-
-
-def plot_paddle_curve(keys, inputfile, outputfile, format='png',
-                      show_fig=False):
-    """Plot curves from paddle log and save to outputfile.
-
-    :param keys: a list of strings to be plotted, e.g. AvgCost
-    :param inputfile: a file object for input
-    :param outputfile: a file object for output
-    :return: None
-    """
-    pass_pattern = r"Pass=([0-9]*)"
-    test_pattern = r"Test samples=([0-9]*)"
-    if not keys:
-        keys = ['AvgCost']
-    for k in keys:
-        pass_pattern += r".*?%s=([0-9e\-\.]*)" % k
-        test_pattern += r".*?%s=([0-9e\-\.]*)" % k
-    data = []
-    test_data = []
-    compiled_pattern = re.compile(pass_pattern)
-    compiled_test_pattern = re.compile(test_pattern)
-    for line in inputfile:
-        found = compiled_pattern.search(line)
-        found_test = compiled_test_pattern.search(line)
-        if found:
-            data.append([float(x) for x in found.groups()])
-        if found_test:
-            test_data.append([float(x) for x in found_test.groups()])
-    x = numpy.array(data)
-    x_test = numpy.array(test_data)
-    if x.shape[0] <= 0:
-        sys.stderr.write("No data to plot. Exiting!\n")
-        return
-    m = len(keys) + 1
-    for i in six.moves.xrange(1, m):
-        pyplot.plot(
-            x[:, 0],
-            x[:, i],
-            color=cm.jet(1.0 * (i - 1) / (2 * m)),
-            label=keys[i - 1])
-        if (x_test.shape[0] > 0):
-            pyplot.plot(
-                x[:, 0],
-                x_test[:, i],
-                color=cm.jet(1.0 - 1.0 * (i - 1) / (2 * m)),
-                label="Test " + keys[i - 1])
-    pyplot.xlabel('number of epoch')
-    pyplot.legend(loc='best')
-    if show_fig:
-        pyplot.show()
-    pyplot.savefig(outputfile, bbox_inches='tight')
-    pyplot.clf()
-
-
-def main(argv):
-    """
-    main method of plotting curves.
-    """
-    cmdparser = argparse.ArgumentParser(
-        "Plot training and testing curves from paddle log file.")
-    cmdparser.add_argument(
-        'key', nargs='*', help='keys of scores to plot, the default is AvgCost')
-    cmdparser.add_argument(
-        '-i',
-        '--input',
-        help='input filename of paddle log, '
-        'default will be standard input')
-    cmdparser.add_argument(
-        '-o',
-        '--output',
-        help='output filename of figure, '
-        'default will be standard output')
-    cmdparser.add_argument('--format', help='figure format(png|pdf|ps|eps|svg)')
-    args = cmdparser.parse_args(argv)
-    keys = args.key
-    if args.input:
-        inputfile = open(args.input)
-    else:
-        inputfile = sys.stdin
-    format = args.format
-    if args.output:
-        outputfile = open(args.output, 'wb')
-        if not format:
-            format = os.path.splitext(args.output)[1]
-            if not format:
-                format = 'png'
-    else:
-        outputfile = sys.stdout
-    plot_paddle_curve(keys, inputfile, outputfile, format)
-    inputfile.close()
-    outputfile.close()
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
diff --git a/python/paddle/utils/preprocess_img.py b/python/paddle/utils/preprocess_img.py
deleted file mode 100644
index fc67949dfe0ef21487de29678781aa2bfd93f354..0000000000000000000000000000000000000000
--- a/python/paddle/utils/preprocess_img.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import os
-import random
-import numpy as np
-import PIL.Image as Image
-from six.moves import cStringIO as StringIO
-from . import preprocess_util
-from .image_util import crop_img
-
-
-def resize_image(img, target_size):
-    """
-    Resize an image so that the shorter edge has length target_size.
-    img: the input image to be resized.
-    target_size: the target resized image size.
-    """
-    percent = (target_size / float(min(img.size[0], img.size[1])))
-    resized_size = int(round(img.size[0] * percent)),\
-                   int(round(img.size[1] * percent))
-    img = img.resize(resized_size, Image.ANTIALIAS)
-    return img
-
-
-class DiskImage:
-    """
-    A class of image data on disk.
-    """
-
-    def __init__(self, path, target_size):
-        """
-        path: path of the image.
-        target_size: target resize size.
-        """
-        self.path = path
-        self.target_size = target_size
-        self.img = None
-        pass
-
-    def read_image(self):
-        if self.img is None:
-            print("reading: " + self.path)
-            image = resize_image(Image.open(self.path), self.target_size)
-            self.img = image
-
-    def convert_to_array(self):
-        self.read_image()
-        np_array = np.array(self.img)
-        if len(np_array.shape) == 3:
-            np_array = np.swapaxes(np_array, 1, 2)
-            np_array = np.swapaxes(np_array, 1, 0)
-        return np_array
-
-    def convert_to_paddle_format(self):
-        """
-        convert the image into the paddle batch format.
-        """
-        self.read_image()
-        output = StringIO()
-        self.img.save(output, "jpeg")
-        contents = output.getvalue()
-        return contents
-
-
-class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater):
-    """
-    A class to process data for image classification.
-    """
-
-    def __init__(self, data_path, target_size, color=True):
-        """
-        data_path: the path to store the training data and batches.
-        target_size: processed image size in a batch.
-        color: whether to use color images.
-        """
-        preprocess_util.DatasetCreater.__init__(self, data_path)
-        self.target_size = target_size
-        self.color = color
-        self.keys = ["images", "labels"]
-        self.permute_key = "labels"
-
-    def create_meta_file(self, data):
-        """
-        Create a meta file for image classification.
-        The meta file contains the meam image, as well as some configs.
-        data: the training Dataaet.
-        """
-        output_path = os.path.join(self.data_path, self.batch_dir_name,
-                                   self.meta_filename)
-        if self.color:
-            mean_img = np.zeros((3, self.target_size, self.target_size))
-        else:
-            mean_img = np.zeros((self.target_size, self.target_size))
-        for d in data.data:
-            img = d[0].convert_to_array()
-            cropped_img = crop_img(img, self.target_size, self.color)
-            mean_img += cropped_img
-        mean_img /= len(data.data)
-        mean_img = mean_img.astype('int32').flatten()
-        preprocess_util.save_file({
-            "data_mean": mean_img,
-            "image_size": self.target_size,
-            "mean_image_size": self.target_size,
-            "num_classes": self.num_classes,
-            "color": self.color
-        }, output_path)
-        pass
-
-    def create_dataset_from_list(self, path):
-        data = []
-        label_set = []
-        for line in open(path):
-            items = line.rstrip.split()
-            image_path = items[0]
-            label_name = items[1]
-            if not label_name in label_set:
-                label_set[label_name] = len(list(label_set.keys()))
-            img = DiskImage(path=image_path, target_size=self.target_size)
-            label = preprocess_util.Lablel(
-                label=label_set[label_name], name=label_name)
-        return preprocess_util.Dataset(data, self.keys), label_set
-
-    def create_dataset_from_dir(self, path):
-        """
-        Create a Dataset object for image classfication.
-        Each folder in the path directory corresponds to a set of images of
-        this label, and the name of the folder is the name of the
-        path: the path of the image dataset.
-        """
-        if self.from_list:
-            return self.create_dataset_from_list(path)
-        label_set = preprocess_util.get_label_set_from_dir(path)
-        data = []
-        for l_name in list(label_set.keys()):
-            image_paths = preprocess_util.list_images(
-                os.path.join(path, l_name))
-            for p in image_paths:
-                img = DiskImage(path=p, target_size=self.target_size)
-                label = preprocess_util.Label(
-                    label=label_set[l_name], name=l_name)
-                data.append((img, label))
-        random.shuffle(data)
-        return preprocess_util.Dataset(data, self.keys), label_set
diff --git a/python/paddle/utils/preprocess_util.py b/python/paddle/utils/preprocess_util.py
deleted file mode 100644
index 05b2067d01a2c544d7f5bd68320e79c805282286..0000000000000000000000000000000000000000
--- a/python/paddle/utils/preprocess_util.py
+++ /dev/null
@@ -1,362 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import six.moves.cPickle as pickle
-import random
-import collections
-
-
-def save_file(data, filename):
-    """
-    Save data into pickle format.
-    data: the data to save.
-    filename: the output filename.
-    """
-    pickle.dump(data, open(filename, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
-
-
-def save_list(l, outfile):
-    """
-    Save a list of string into a text file. There is one line for each string.
-    l: the list of string to save
-    outfile: the output file
-    """
-    open(outfile, "w").write("\n".join(l))
-
-
-def exclude_pattern(f):
-    """
-    Return whether f is in the exlucde pattern.
-    Exclude the files that starts with . or ends with ~.
-    """
-    return f.startswith(".") or f.endswith("~")
-
-
-def list_dirs(path):
-    """
-    Return a list of directories in path. Exclude all the directories that
-    start with '.'.
-    path: the base directory to search over.
-    """
-    return [
-        os.path.join(path, d) for d in next(os.walk(path))[1]
-        if not exclude_pattern(d)
-    ]
-
-
-def list_images(path, exts=set(["jpg", "png", "bmp", "jpeg"])):
-    """
-    Return a list of images in path.
-    path: the base directory to search over.
-    exts: the extensions of the images to find.
-    """
-    return [os.path.join(path, d) for d in  os.listdir(path) \
-            if os.path.isfile(os.path.join(path, d)) and not exclude_pattern(d)\
-            and os.path.splitext(d)[-1][1:] in exts]
-
-
-def list_files(path):
-    """
-    Return a list of files in path.
-    path: the base directory to search over.
-    exts: the extensions of the images to find.
-    """
-    return [os.path.join(path, d) for d in  os.listdir(path) \
-            if os.path.isfile(os.path.join(path, d)) and not exclude_pattern(d)]
-
-
-def get_label_set_from_dir(path):
-    """
-    Return a dictionary of the labels and label ids from a path.
-    Assume each direcotry in the path corresponds to a unique label.
-    The keys of the dictionary is the label name.
-    The values of the dictionary is the label id.
-    """
-    dirs = list_dirs(path)
-    return dict([(os.path.basename(d), i) for i, d in enumerate(sorted(dirs))])
-
-
-class Label:
-    """
-    A class of label data.
-    """
-
-    def __init__(self, label, name):
-        """
-        label: the id of the label.
-        name: the name of the label.
-        """
-        self.label = label
-        self.name = name
-
-    def convert_to_paddle_format(self):
-        """
-        convert the image into the paddle batch format.
-        """
-        return int(self.label)
-
-    def __hash__(self):
-        return hash((self.label))
-
-
-class Dataset:
-    """
-    A class to represent a dataset. A dataset contains a set of items.
-    Each item contains multiple slots of data.
-    For example: in image classification dataset, each item contains two slot,
-    The first slot is an image, and the second slot is a label.
-    """
-
-    def __init__(self, data, keys):
-        """
-        data: a list of data.
-              Each data is a tuple containing multiple slots of data.
-              Each slot is an object with convert_to_paddle_format function.
-        keys: contains a list of keys for all the slots.
-        """
-        self.data = data
-        self.keys = keys
-
-    def check_valid(self):
-        for d in self.data:
-            assert (len(d) == len(self.keys))
-
-    def permute(self, key_id, num_per_batch):
-        """
-        Permuate data for batching. It supports two types now:
-        1. if key_id == None, the batching process is completely random.
-        2. if key_id is not None. The batching process Permuate the data so that the key specified by key_id are
-        uniformly distributed in batches. See the comments of permute_by_key for details.
-        """
-        if key_id is None:
-            self.uniform_permute()
-        else:
-            self.permute_by_key(key_id, num_per_batch)
-
-    def uniform_permute(self):
-        """
-        Permuate the data randomly.
-        """
-        random.shuffle(self.data)
-
-    def permute_by_key(self, key_id, num_per_batch):
-        """
-        Permuate the data so that the key specified by key_id are
-        uniformly distributed in batches.
-        For example: if we have three labels, and the number of data
-        for each label are 100, 200, and 300, respectively.  The number of batches is 4.
-        Then, the number of data for these labels is 25, 50, and 75.
-        """
-        # Store the indices of the data that has the key value
-        # specified by key_id.
-        keyvalue_indices = collections.defaultdict(list)
-        for idx in range(len(self.data)):
-            keyvalue_indices[self.data[idx][key_id].label].append(idx)
-        for k in keyvalue_indices:
-            random.shuffle(keyvalue_indices[k])
-
-        num_data_per_key_batch = \
-            math.ceil(num_per_batch / float(len(list(keyvalue_indices.keys()))))
-
-        if num_data_per_key_batch < 2:
-            raise Exception("The number of data in a batch is too small")
-
-        permuted_data = []
-        keyvalue_readpointer = collections.defaultdict(int)
-        while len(permuted_data) < len(self.data):
-            for k in keyvalue_indices:
-                begin_idx = keyvalue_readpointer[k]
-                end_idx = int(
-                    min(begin_idx + num_data_per_key_batch,
-                        len(keyvalue_indices[k])))
-                print("begin_idx, end_idx")
-                print(begin_idx, end_idx)
-                for idx in range(begin_idx, end_idx):
-                    permuted_data.append(self.data[keyvalue_indices[k][idx]])
-                keyvalue_readpointer[k] = end_idx
-        self.data = permuted_data
-
-
-class DataBatcher:
-    """
-    A class that is used to create batches for both training and testing
-    datasets.
-    """
-
-    def __init__(self, train_data, test_data, label_set):
-        """
-        train_data, test_data: Each one is a dataset object repesenting
-        training and testing data, respectively.
-        label_set: a dictionary storing the mapping from label name to label id.
-        """
-        self.train_data = train_data
-        self.test_data = test_data
-        self.label_set = label_set
-        self.num_per_batch = 5000
-        assert (self.train_data.keys == self.test_data.keys)
-
-    def create_batches_and_list(self, output_path, train_list_name,
-                                test_list_name, label_set_name):
-        """
-        Create batches for both training and testing objects.
-        It also create train.list and test.list to indicate the list
-        of the batch files for training and testing data, respectively.
-        """
-        train_list = self.create_batches(self.train_data, output_path, "train_",
-                                         self.num_per_batch)
-        test_list = self.create_batches(self.test_data, output_path, "test_",
-                                        self.num_per_batch)
-        save_list(train_list, os.path.join(output_path, train_list_name))
-        save_list(test_list, os.path.join(output_path, test_list_name))
-        save_file(self.label_set, os.path.join(output_path, label_set_name))
-
-    def create_batches(self,
-                       data,
-                       output_path,
-                       prefix="",
-                       num_data_per_batch=5000):
-        """
-        Create batches for a Dataset object.
-        data: the Dataset object to process.
-        output_path: the output path of the batches.
-        prefix: the prefix of each batch.
-        num_data_per_batch: number of data in each batch.
-        """
-        num_batches = int(math.ceil(len(data.data) / float(num_data_per_batch)))
-        batch_names = []
-        data.check_valid()
-        num_slots = len(data.keys)
-        for i in range(num_batches):
-            batch_name = os.path.join(output_path, prefix + "batch_%03d" % i)
-            out_data = dict([(k, []) for k in data.keys])
-            begin_idx = i * num_data_per_batch
-            end_idx = min((i + 1) * num_data_per_batch, len(data.data))
-            for j in range(begin_idx, end_idx):
-                for slot_id in range(num_slots):
-                    out_data[data.keys[slot_id]].\
-                        append(data.data[j][slot_id].convert_to_paddle_format())
-            save_file(out_data, batch_name)
-            batch_names.append(batch_name)
-        return batch_names
-
-
-class DatasetCreater(object):
-    """
-    A virtual class for creating datasets.
-    The derived clasas needs to implemnt the following methods:
-       - create_dataset()
-       - create_meta_file()
-    """
-
-    def __init__(self, data_path):
-        """
-        data_path: the path to store the training data and batches.
-        train_dir_name: relative training data directory.
-        test_dir_name: relative testing data directory.
-        batch_dir_name: relative batch directory.
-        num_per_batch: the number of data in a batch.
-        meta_filename: the filename of the meta file.
-        train_list_name: training batch list name.
-        test_list_name: testing batch list name.
-        label_set: label set name.
-        overwrite: whether to overwrite the files if the batches are already in
-                   the given path.
-        """
-        self.data_path = data_path
-        self.train_dir_name = 'train'
-        self.test_dir_name = 'test'
-        self.batch_dir_name = 'batches'
-        self.num_per_batch = 50000
-        self.meta_filename = "batches.meta"
-        self.train_list_name = "train.list"
-        self.test_list_name = "test.list"
-        self.label_set_name = "labels.pkl"
-        self.output_path = os.path.join(self.data_path, self.batch_dir_name)
-        self.overwrite = False
-        self.permutate_key = "labels"
-        self.from_list = False
-
-    def create_meta_file(self, data):
-        """
-        Create a meta file from training data.
-        data: training data given in a Dataset format.
-        """
-        raise NotImplementedError
-
-    def create_dataset(self, path):
-        """
-        Create a data set object from a path.
-        It will use directory structure or a file list to determine dataset if
-        self.from_list is True. Otherwise, it will uses a file list  to
-        determine the datset.
-        path: the path of the dataset.
-        return a tuple of Dataset object, and a mapping from lable set
-        to label id.
-        """
-        if self.from_list:
-            return self.create_dataset_from_list(path)
-        else:
-            return self.create_dataset_from_dir(path)
-
-    def create_dataset_from_list(self, path):
-        """
-        Create a data set object from a path.
-        It will uses a file list to determine the datset.
-        path: the path of the dataset.
-        return a tuple of Dataset object, and a mapping from lable set
-        to label id
-        """
-        raise NotImplementedError
-
-    def create_dataset_from_dir(self, path):
-        """
-        Create a data set object from a path.
-        It will use directory structure or a file list to determine dataset if
-        self.from_list is True.
-        path: the path of the dataset.
-        return a tuple of Dataset object, and a mapping from lable set
-        to label id
-        """
-        raise NotImplementedError
-
-    def create_batches(self):
-        """
-        create batches and meta file.
-        """
-        train_path = os.path.join(self.data_path, self.train_dir_name)
-        test_path = os.path.join(self.data_path, self.test_dir_name)
-        out_path = os.path.join(self.data_path, self.batch_dir_name)
-        if not os.path.exists(out_path):
-            os.makedirs(out_path)
-        if (self.overwrite or not os.path.exists(
-                os.path.join(out_path, self.train_list_name))):
-            train_data, train_label_set = \
-                self.create_dataset(train_path)
-            test_data, test_label_set = \
-                self.create_dataset(test_path)
-
-            train_data.permute(
-                self.keys.index(self.permutate_key), self.num_per_batch)
-
-            assert (train_label_set == test_label_set)
-            data_batcher = DataBatcher(train_data, test_data, train_label_set)
-            data_batcher.num_per_batch = self.num_per_batch
-            data_batcher.create_batches_and_list(
-                self.output_path, self.train_list_name, self.test_list_name,
-                self.label_set_name)
-            self.num_classes = len(list(train_label_set.keys()))
-            self.create_meta_file(train_data)
-        return out_path
diff --git a/python/paddle/utils/show_pb.py b/python/paddle/utils/show_pb.py
deleted file mode 100644
index da7a71a665aea4d93d366e8508f438a9aba88e94..0000000000000000000000000000000000000000
--- a/python/paddle/utils/show_pb.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Show the content of proto buffer data file of PADDLE
-"""
-
-from __future__ import print_function
-
-import os
-import sys
-from google.protobuf.internal.decoder import _DecodeVarint
-import paddle.proto.DataFormat_pb2 as DataFormat
-
-
-def read_proto(file, message):
-    """
-    read a protobuffer struct from file, the length of the struct is stored as
-    a varint, then followed by the actual struct data.
-    @return True success, False for end of file
-    """
-
-    buf = file.read(8)
-    if not buf:
-        return False
-    result, pos = _DecodeVarint(buf, 0)
-    buf = buf[pos:] + file.read(result - len(buf) + pos)
-    message.ParseFromString(buf)
-
-    return True
-
-
-def usage():
-    print("Usage: python show_pb.py PROTO_DATA_FILE", file=sys.stderr)
-    exit(1)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        usage()
-
-    f = open(sys.argv[1])
-    header = DataFormat.DataHeader()
-    read_proto(f, header)
-    print(header)
-
-    sample = DataFormat.DataSample()
-    while read_proto(f, sample):
-        print(sample)
diff --git a/python/paddle/utils/torch2paddle.py b/python/paddle/utils/torch2paddle.py
deleted file mode 100644
index 398d3aa4e02cc74b7885f7e676937d7fd254bc5e..0000000000000000000000000000000000000000
--- a/python/paddle/utils/torch2paddle.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Convert torch parameter file to paddle model files.
-
-Note: must have torchfile installed in order to use this tool.
-
-Usage: python torch2paddle.py -i torchfile.t7 -l layers.txt -o path/to/paddle_model
-"""
-
-import os
-import sys
-import struct
-import numpy as np
-import torchfile
-import six.moves.cPickle as pickle
-import argparse
-
-
-# save parameters
-def save_layer_parameters(outfile, feats):
-    version = 0
-    value_size = 4
-    ret = ""
-    for feat in feats:
-        ret += feat.tostring()
-    size = len(ret) / 4
-    fo = open(outfile, 'wb')
-    fo.write(struct.pack('iIQ', version, value_size, size))
-    fo.write(ret)
-    fo.close()
-
-
-def save_net_parameters(layers, params, output_path):
-    for i in range(len(layers)):
-        weight = params[i * 2]
-        biases = params[i * 2 + 1]
-        weight_file = os.path.join(output_path, '_%s.w0' % layers[i])
-        biases_file = os.path.join(output_path, '_%s.wbias' % layers[i])
-        print("Saving for layer %s." % layers[i])
-        save_layer_parameters(weight_file, [weight])
-        save_layer_parameters(biases_file, biases)
-
-
-def load_layer_parameters(filename):
-    fn = open(filename, 'rb')
-    version, = struct.unpack('i', fn.read(4))
-    value_length, = struct.unpack("I", fn.read(4))
-    dtype = 'float32' if value_length == 4 else 'float64'
-    param_size, = struct.unpack("L", fn.read(8))
-    value = np.fromfile(fn, dtype)
-    return value
-
-
-def main(argv):
-    """
-    main method of converting torch to paddle files.
-    :param argv:
-    :return:
-    """
-    cmdparser = argparse.ArgumentParser(
-        "Convert torch parameter file to paddle model files.")
-    cmdparser.add_argument(
-        '-i', '--input', help='input filename of torch parameters')
-    cmdparser.add_argument('-l', '--layers', help='list of layer names')
-    cmdparser.add_argument(
-        '-o', '--output', help='output file path of paddle model')
-
-    args = cmdparser.parse_args(argv)
-    if args.input and args.layers and args.output:
-        params = torchfile.load(args.input)
-        layers = [line.strip() for line in open(args.layers, 'r')]
-        save_net_parameters(layers, params, args.output)
-    else:
-        print(
-            'Usage: python torch2paddle.py -i torchfile.t7 -l layers.txt -o path/to/paddle_model'
-        )
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
diff --git a/python/requirements.txt b/python/requirements.txt
deleted file mode 100644
index d1e34b524632e37f5cd7fbae5fb6258787d8de91..0000000000000000000000000000000000000000
--- a/python/requirements.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-requests>=2.20.0
-numpy>=1.12, <=1.16.4 ; python_version<"3.5"
-numpy>=1.12 ; python_version>="3.5"
-protobuf>=3.1.0
-matplotlib<=2.2.4 ; python_version<"3.6"
-scipy>=0.19.0, <=1.2.1 ; python_version<"3.5"
-nltk>=3.2.2, <=3.4 ; python_version<"3.5"
-matplotlib ; python_version>="3.6"
-scipy ; python_version>="3.5"
-nltk ; python_version>="3.5"
-rarfile
-Pillow
-graphviz
-six
-funcsigs
-pyyaml
-decorator
-prettytable
-objgraph
diff --git a/python/setup.py.in b/python/setup.py.in
deleted file mode 100644
index 3288b6152c0f62978184b791c738a9e39919708e..0000000000000000000000000000000000000000
--- a/python/setup.py.in
+++ /dev/null
@@ -1,263 +0,0 @@
-from setuptools import setup, Distribution, Extension
-import subprocess
-import os
-import re
-import shutil
-import sys
-class BinaryDistribution(Distribution):
-    def has_ext_modules(foo):
-        return True
-
-RC      = 0
-
-ext_name = '.dll' if os.name == 'nt' else ('.dylib' if sys.platform == 'darwin' else '.so')
-
-def git_commit():
-    try:
-        cmd = ['git', 'rev-parse', 'HEAD']
-        git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE,
-            cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
-    except:
-        git_commit = 'Unknown'
-    git_commit = git_commit.decode()
-    return str(git_commit)
-
-def _get_version_detail(idx):
-    assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \
-        so detail index must less than 3"
-
-    if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'):
-        version_details = '@PADDLE_VERSION@'.split('.')
-
-        if len(version_details) >= 3:
-            return version_details[idx]
-
-    return 0
-
-def get_major():
-    return int(_get_version_detail(0))
-
-def get_minor():
-    return int(_get_version_detail(1))
-
-def get_patch():
-    return str(_get_version_detail(2))
-
-def is_taged():
-    try:
-        cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
-        git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE, cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
-        git_tag = git_tag.decode()
-    except:
-        return False
-
-    if str(git_tag).replace('v', '') == '@PADDLE_VERSION@':
-        return True
-    else:
-        return False
-
-def write_version_py(filename='paddle/version.py'):
-    cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
-#
-full_version    = '%(major)d.%(minor)d.%(patch)s'
-major           = '%(major)d'
-minor           = '%(minor)d'
-patch           = '%(patch)s'
-rc              = '%(rc)d'
-istaged         = %(istaged)s
-commit          = '%(commit)s'
-with_mkl        = '%(with_mkl)s'
-
-def show():
-    if istaged:
-        print('full_version:', full_version)
-        print('major:', major)
-        print('minor:', minor)
-        print('patch:', patch)
-        print('rc:', rc)
-    else:
-        print('commit:', commit)
-
-def mkl():
-    return with_mkl
-'''
-    commit = git_commit()
-    with open(filename, 'w') as f:
-        f.write(cnt % {
-            'major': get_major(),
-            'minor': get_minor(),
-            'patch': get_patch(),
-            'rc': RC,
-            'version': '${PADDLE_VERSION}',
-            'commit': commit,
-            'istaged': is_taged(),
-            'with_mkl': '@WITH_MKL@'})
-
-write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
-
-
-packages=['paddle',
-          'paddle.libs',
-          'paddle.utils',
-          'paddle.dataset',
-          'paddle.reader',
-          'paddle.distributed',
-          'paddle.fluid',
-          'paddle.fluid.dygraph',
-          'paddle.fluid.proto',
-          'paddle.fluid.proto.profiler',
-          'paddle.fluid.distributed',
-          'paddle.fluid.layers',
-          'paddle.fluid.contrib',
-          'paddle.fluid.contrib.decoder',
-          'paddle.fluid.contrib.quantize',
-          'paddle.fluid.contrib.reader',
-          'paddle.fluid.contrib.slim',
-          'paddle.fluid.contrib.slim.core',
-          'paddle.fluid.contrib.slim.graph',
-          'paddle.fluid.contrib.slim.prune',
-          'paddle.fluid.contrib.slim.quantization',
-          'paddle.fluid.contrib.slim.distillation',
-          'paddle.fluid.contrib.slim.nas',
-          'paddle.fluid.contrib.slim.searcher',
-          'paddle.fluid.contrib.utils',
-          'paddle.fluid.contrib.extend_optimizer',
-          'paddle.fluid.contrib.mixed_precision',
-          'paddle.fluid.contrib.layers',
-          'paddle.fluid.transpiler',
-          'paddle.fluid.transpiler.details',
-          'paddle.fluid.incubate',
-          'paddle.fluid.incubate.data_generator',
-          'paddle.fluid.incubate.fleet',
-          'paddle.fluid.incubate.fleet.base',
-          'paddle.fluid.incubate.fleet.parameter_server',
-          'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',
-          'paddle.fluid.incubate.fleet.parameter_server.pslib',
-          'paddle.fluid.incubate.fleet.collective',
-          'paddle.fluid.incubate.fleet.utils']
-
-with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
-    setup_requires = f.read().splitlines()
-
-if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
-    setup_requires+=['opencv-python']
-
-# the prefix is sys.prefix which should always be usr
-paddle_bins = ''
-if not '${WIN32}':
-    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
-package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]}
-if '${HAS_NOAVX_CORE}' == 'ON':
-    package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]
-
-package_dir={
-    '': '${PADDLE_BINARY_DIR}/python',
-    # The paddle.fluid.proto will be generated while compiling.
-    # So that package points to other directory.
-    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
-    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
-    'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
-}
-
-# put all thirdparty libraries in paddle.libs
-libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
-
-package_data['paddle.libs']= []
-package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
-shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
-
-if '${TENSORRT_FOUND}' == 'ON' and os.name == 'nt':
-    shutil.copy(os.path.join('${TENSORRT_ROOT}', 'lib', '${TR_INFER_RT}'), libs_path)
-    shutil.copy(os.path.join('${TENSORRT_ROOT}', 'lib', '${TR_INFER_PLUGIN_RT}'), libs_path)
-    package_data['paddle.libs'] += ['${TR_INFER_RT}', '${TR_INFER_PLUGIN_RT}']
-
-if '${WITH_MKL}' == 'ON':
-    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
-    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
-    package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name]
-    # mklml has dependency to vs runtime library
-    if os.name == 'nt':
-        shutil.copy('${MKLML_SHARED_LIB_DEPS}', libs_path)
-        package_data['paddle.libs'] += ['msvcr120.dll']
-else:
-    if os.name == 'nt':
-        # copy the openblas.dll
-        shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path)
-        package_data['paddle.libs'] += ['openblas' + ext_name]
-
-if '${WITH_PSLIB}' == 'ON':
-    shutil.copy('${PSLIB_LIB}', libs_path)
-    package_data['paddle.libs'] += ['libps' + ext_name]
-
-if '${WITH_MKLDNN}' == 'ON':
-    if '${CMAKE_BUILD_TYPE}' == 'Release' and os.name != 'nt':
-        # only change rpath in Release mode.
-        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
-        # we can support mkl on mac.
-        #
-        # change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
-        # The reason is that all thirdparty libraries in the same directory,
-        # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
-        command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
-        if os.system(command) != 0:
-            raise Exception("patch libmkldnn.so failed, command: %s" % command)
-    package_data['paddle.libs']+=['libmkldnn.so.0' if os.name != 'nt' else ('mkldnn' + ext_name)]
-    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
-if '${WITH_NGRAPH}' == 'ON':
-    # only change rpath in Release mode,
-    # since in Debug mode, nGraph lib may be too large to be changed?
-    if '${CMAKE_BUILD_TYPE}' == 'Release':
-        if os.name != 'nt':
-            if "@APPLE@" == "1":
-                command = "install_name_tool -id \"@loader_path/\" ${NGRAPH_SHARED_LIB}"
-            else:
-                command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
-            if os.system(command) != 0:
-                raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
-    shutil.copy('${NGRAPH_SHARED_LIB}', libs_path)
-    shutil.copy('${NGRAPH_CPU_LIB}', libs_path)
-    shutil.copy('${NGRAPH_TBB_LIB}', libs_path)
-    package_data['paddle.libs']+=['${NGRAPH_SHARED_LIB_NAME}',
-                                  '${NGRAPH_CPU_LIB_NAME}',
-                                  '${NGRAPH_TBB_LIB_NAME}']
-# remove unused paddle/libs/__init__.py
-if os.path.isfile(libs_path+'/__init__.py'):
-    os.remove(libs_path+'/__init__.py')
-package_dir['paddle.libs']=libs_path
-
-# change rpath of ${FLUID_CORE_NAME}.ext, add $ORIGIN/../libs/ to it.
-# The reason is that libwarpctc.ext, libiomp5.ext etc are in paddle.libs, and
-# ${FLUID_CORE_NAME}.ext is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
-# This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
-if '${CMAKE_BUILD_TYPE}' == 'Release':
-    if os.name != 'nt':
-        # only change rpath in Release mode, since in Debug mode, ${FLUID_CORE_NAME}.xx is too large to be changed.
-        if "@APPLE@" == "1":
-            command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
-        else:
-            command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
-        if os.system(command) != 0:
-            raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command))
-
-ext_modules = [Extension('_foo', ['stub.cc'])]
-if os.name == 'nt':
-    # fix the path separator under windows
-    fix_package_dir = {}
-    for k, v in package_dir.items():
-        fix_package_dir[k] = v.replace('/', '\\')
-    package_dir = fix_package_dir
-    ext_modules = []
-elif sys.platform == 'darwin':
-    ext_modules = []
-
-setup(name='${PACKAGE_NAME}',
-      version='${PADDLE_VERSION}',
-      description='Parallel Distributed Deep Learning',
-      install_requires=setup_requires,
-      packages=packages,
-      ext_modules=ext_modules,
-      package_data=package_data,
-      package_dir=package_dir,
-      scripts=paddle_bins,
-      distclass=BinaryDistribution
-)
diff --git a/release.bcloud b/release.bcloud
deleted file mode 100755
index 4b37842abb2ce5fa88c7fa30ab19c80d0e783f8a..0000000000000000000000000000000000000000
--- a/release.bcloud
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-mkdir -p so
-
-cp baidu_third-party_mklml/so/* so
-rm -rf baidu_third-party_mklml
-
-cp baidu_third-party_openmpi/so/* so
-ln -s libmpi.so so/libmpi.so.0
-ln -s libmpi_cxx.so so/libmpi_cxx.so.0
-ln -s libopen-pal.so so/libopen-pal.so.0
-ln -s libopen-rte.so so/libopen-rte.so.0
-rm -rf baidu_third-party_openmpi
-
-rm lib/libfake_paddle_proto.a
-rmdir lib 2>/dev/null || :
diff --git a/tools/__init__.py b/tools/__init__.py
deleted file mode 100644
index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000
--- a/tools/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
deleted file mode 100644
index 2ccc48255a71945b417b565100fbb13cb777749b..0000000000000000000000000000000000000000
--- a/tools/check_api_approvals.sh
+++ /dev/null
@@ -1,138 +0,0 @@
-#!/bin/bash
-if [ -z ${BRANCH} ]; then
-    BRANCH="develop"
-fi
-
-PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
-API_FILES=("CMakeLists.txt"
-           "paddle/fluid/API.spec"
-           "paddle/fluid/op_use_default_grad_op_maker.spec"
-           "paddle/fluid/framework/operator.h"
-           "paddle/fluid/framework/tensor.h"
-           "paddle/fluid/framework/details/op_registry.h"
-           "paddle/fluid/framework/grad_op_desc_maker.h"
-           "paddle/fluid/framework/lod_tensor.h"
-           "paddle/fluid/framework/selected_rows.h"
-           "paddle/fluid/framework/op_desc.h"
-           "paddle/fluid/framework/block_desc.h"
-           "paddle/fluid/framework/var_desc.h"
-           "paddle/fluid/framework/scope.h"
-           "paddle/fluid/framework/ir/node.h"
-           "paddle/fluid/framework/ir/graph.h"
-           "paddle/fluid/framework/framework.proto"
-           "python/requirements.txt"
-           "python/paddle/fluid/__init__.py"
-           "python/paddle/fluid/compiler.py"
-           "python/paddle/fluid/parallel_executor.py"
-           "python/paddle/fluid/framework.py"
-           "python/paddle/fluid/backward.py"
-           "paddle/fluid/operators/distributed/send_recv.proto.in")
-
-approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
-git_files=`git diff --numstat upstream/$BRANCH| wc -l`
-git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'`
-failed_num=0
-echo_list=()
-if [[ $git_files -gt 19 || $git_count -gt 999 ]];then
-  APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 38231817`
-  if [ "${APPROVALS}" == "FALSE" ]; then
-    failed_num=`expr $failed_num + 1`
-    echo_line="You must have Dianhai approval for change 20+ files or add than 1000+ lines of content\n"
-    echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-  fi
-fi    
-
-for API_FILE in ${API_FILES[*]}; do
-  API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" | grep -v "/CMakeLists.txt" || true`
-  echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
-  if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
-      # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
-      # approval_user_list: XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,xsrobin 50069408,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,JiabinYang 22361972,chenwhql 22561442. 
-      if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then
-        APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7534971 14105589 12605721 3064195 328693 47554610 39645414 11195205 20274488 45024560 ` 
-      elif [ "${API_FILE}" == "paddle/fluid/op_use_default_grad_op_maker.spec" ];then
-        APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32832641 6836917`
-      elif [ "${API_FILE}" == "CMakeLists.txt" ];then
-        APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 46782768 30176695`
-      elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
-         APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 47554610`
-      elif [ "${API_FILE}" == "python/requirements.txt" ];then
-         APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 22361972`
-      else
-        APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641`
-      fi
-      echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-      if [ "${APPROVALS}" == "FALSE" ]; then
-        if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have two RD (wanghaoshuang or guoshengCS or heavengate or kuke or Superjomn or lanxianghit or cyj1986 or hutuxian or frankwhzhang or nepeplwu) approval for the api change! ${API_FILE} for the management reason of API interface and API document.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        elif [ "${API_FILE}" == "paddle/fluid/op_use_default_grad_op_maker.spec" ];then
-          failed_num=`expr $failed_num + 1` 
-          echo_line="You must have one RD (sneaxiy (Recommend) or luotao1) approval for op_use_default_grad_op_maker.spec, which manages the grad_op memory optimization.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        elif [ "${API_FILE}" == "CMakeLists.txt" ];then
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have one RD (luotao1 or chengduoZH or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        elif [ "${API_FILE}" == "python/requirements.txt" ];then
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have one RD (JiabinYang (Recommend) or luotao1) approval for python/requirements.txt, which manages the third-party python package.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have one RD (lanxianghit (Recommend) or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        else
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        fi
-      fi
-  fi
-done
-
-HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
-if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
-    APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-    python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641`
-    echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-    if [ "${APPROVALS}" == "FALSE" ]; then
-        failed_num=`expr $failed_num + 1`
-        echo_line="You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for the usage (either add or delete) of const_cast.\n"
-        echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-    fi
-fi
-
-HAS_DEFINE_FLAG=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "DEFINE_int32" |grep -o -m 1 "DEFINE_bool" | grep -o -m 1 "DEFINE_string" || true`
-if [ ${HAS_DEFINE_FLAG} ] && [ "${GIT_PR_ID}" != "" ]; then
-    APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-    python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 47554610` 
-    echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-    if [ "${APPROVALS}" == "FALSE" ]; then
-        failed_num=`expr $failed_num + 1`
-        echo_line="You must have one RD lanxianghit approval for the usage (either add or delete) of DEFINE_int32/DEFINE_bool/DEFINE_string flag.\n"
-        echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-    fi
-fi
-
-HAS_PADDLE_ENFORCE_FLAG=`git diff -U0 upstream/$BRANCH |grep "+" |grep -v "PADDLE_ENFORCE_" |grep -o -m 1 "PADDLE_ENFORCE" || true`
-if [ ${HAS_PADDLE_ENFORCE_FLAG} ] && [ "${GIT_PR_ID}" != "" ]; then
-    APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-    python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 47554610 22561442`
-    echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-    if [ "${APPROVALS}" == "FALSE" ]; then
-        failed_num=`expr $failed_num + 1`
-        echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_CUDA_SUCCESS instead.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n"
-        echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-    fi
-fi
-
-if [ -n "${echo_list}" ];then
-  echo "****************"
-  echo -e ${echo_list[@]}
-  git diff -U0 upstream/$BRANCH |grep "+" |grep -v "PADDLE_ENFORCE_" |grep "PADDLE_ENFORCE"
-  echo "There are ${failed_num} approved errors."
-  echo "****************"
-  exit 1
-fi
diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py
deleted file mode 100644
index c44690a93ac3c1f1833ee62b4e13d1ae8220fb55..0000000000000000000000000000000000000000
--- a/tools/check_ctest_hung.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-import re
-
-
-def escape(input):
-    o = input.replace("\n", "")
-    o = o.replace("\r", "")
-    return o
-
-
-def main():
-    usage = """Usage:
-1. Download the Paddle_PR_CI_*.log from TeamCity
-2. run: python check_ctest_hung.py Paddle_PR_CI_*.log
-3. If there is hung ctest, the result likes:
-Diff:  set(['test_parallel_executor_crf'])
-    """
-    if len(sys.argv) < 2:
-        print(usage)
-        exit(0)
-
-    logfile = sys.argv[1]
-    started = set()
-    passed = set()
-    with open(logfile, "r") as fn:
-        for l in fn.readlines():
-            if l.find("Test ") != -1 and \
-                l.find("Passed") != -1:
-                m = re.search("Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l))
-                passed.add(m.group(1))
-            if l.find("Start ") != -1:
-                start_parts = escape(l).split(" ")
-                m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l))
-                started.add(m.group(1))
-    print("Diff: ", started - passed)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/check_pr_approval.py b/tools/check_pr_approval.py
deleted file mode 100644
index 937b0be7562fab93157c16b942631f0a580dfc68..0000000000000000000000000000000000000000
--- a/tools/check_pr_approval.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import sys
-import json
-
-
-def check_approval(count, required_reviewers):
-    json_buff = ""
-    for line in sys.stdin:
-        json_buff = "".join([json_buff, line])
-    json_resp = json.loads(json_buff)
-    approves = 0
-    approved_user_ids = []
-    for review in json_resp:
-        if review["state"] == "APPROVED":
-            approves += 1
-            approved_user_ids.append(review["user"]["id"])
-
-    # convert to int
-    required_reviewers_int = set()
-    for rr in required_reviewers:
-        required_reviewers_int.add(int(rr))
-
-    if len(set(approved_user_ids) & required_reviewers_int) >= count:
-        print("TRUE")
-    else:
-        print("FALSE")
-
-
-if __name__ == "__main__":
-    if len(sys.argv) > 1 and sys.argv[1].isdigit():
-        check_approval(int(sys.argv[1]), sys.argv[2:])
-    else:
-        print(
-            "Usage: python check_pr_approval.py [count] [required reviewer id] ..."
-        )
diff --git a/tools/codestyle/.gitignore b/tools/codestyle/.gitignore
deleted file mode 100644
index 0d20b6487c61e7d1bde93acf4a14b7a89083a16d..0000000000000000000000000000000000000000
--- a/tools/codestyle/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*.pyc
diff --git a/tools/codestyle/clang_format.hook b/tools/codestyle/clang_format.hook
deleted file mode 100755
index 1d928216867c0ba3897d71542fea44debf8d72a0..0000000000000000000000000000000000000000
--- a/tools/codestyle/clang_format.hook
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-set -e
-
-readonly VERSION="3.8"
-
-version=$(clang-format -version)
-
-if ! [[ $version == *"$VERSION"* ]]; then
-    echo "clang-format version check failed."
-    echo "a version contains '$VERSION' is needed, but get '$version'"
-    echo "you can install the right version, and make an soft-link to '\$PATH' env"
-    exit -1
-fi
-
-clang-format $@
diff --git a/tools/codestyle/copyright.hook b/tools/codestyle/copyright.hook
deleted file mode 100644
index 86b16ebdc46047c7cb3d7731a71cbf9647a1f2fe..0000000000000000000000000000000000000000
--- a/tools/codestyle/copyright.hook
+++ /dev/null
@@ -1,121 +0,0 @@
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import argparse
-import io, re
-import sys, os
-import subprocess
-import platform
-
-COPYRIGHT = '''
-Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-'''
-
-LANG_COMMENT_MARK = None
-
-NEW_LINE_MARK = None
-
-COPYRIGHT_HEADER = None
-
-if platform.system() == "Windows":
-    NEW_LINE_MARK = "\r\n"
-else:
-    NEW_LINE_MARK = '\n'
-    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
-    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
-    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
-    date, err = process.communicate()
-    date = date.decode("utf-8").rstrip("\n")
-    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
-
-
-def generate_copyright(template, lang='C'):
-    if lang == 'Python':
-        LANG_COMMENT_MARK = '#'
-    else:
-        LANG_COMMENT_MARK = "//"
-
-    lines = template.split(NEW_LINE_MARK)
-    BLANK = " "
-    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
-    for lino, line in enumerate(lines):
-        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
-        if len(line)  == 0:
-            BLANK = ""
-        else:
-            BLANK = " "
-        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
-
-    return ans + "\n"
-
-
-def lang_type(filename):
-    if filename.endswith(".py"):
-        return "Python"
-    elif filename.endswith(".h"):
-        return "C"
-    elif filename.endswith(".c"):
-        return "C"
-    elif filename.endswith(".hpp"):
-        return "C"
-    elif filename.endswith(".cc"):
-        return "C"
-    elif filename.endswith(".cpp"):
-        return "C"
-    elif filename.endswith(".cu"):
-        return "C"
-    elif filename.endswith(".cuh"):
-        return "C"
-    elif filename.endswith(".go"):
-        return "C"
-    elif filename.endswith(".proto"):
-        return "C"
-    else:
-        print("Unsupported filetype %s", filename)
-        exit(0)
-
-
-PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
-
-
-def main(argv=None):
-    parser = argparse.ArgumentParser(
-        description='Checker for copyright declaration.')
-    parser.add_argument('filenames', nargs='*', help='Filenames to check')
-    args = parser.parse_args(argv)
-
-    retv = 0
-    for filename in args.filenames:
-        fd = io.open(filename, encoding="utf-8")
-        first_line = fd.readline()
-        second_line = fd.readline()
-        if "COPYRIGHT (C)" in first_line.upper(): continue
-        if first_line.startswith("#!") or PYTHON_ENCODE.match(
-                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
-            continue
-        original_contents = io.open(filename, encoding="utf-8").read()
-        new_contents = generate_copyright(
-            COPYRIGHT, lang_type(filename)) + original_contents
-        print('Auto Insert Copyright Header {}'.format(filename))
-        retv = 1
-        with io.open(filename, 'w') as output_file:
-            output_file.write(new_contents)
-
-    return retv
-
-
-if __name__ == '__main__':
-    exit(main())
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
deleted file mode 100755
index 658008d852123b6eab06d1f13d61ba896e7e9c98..0000000000000000000000000000000000000000
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-TOTAL_ERRORS=0
-if [[ ! $TRAVIS_BRANCH ]]; then
-  # install cpplint on local machine.
-  if [[ ! $(which cpplint) ]]; then
-    pip install cpplint
-  fi
-  # diff files on local machine. 
-  files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}')
-else
-  # diff files between PR and latest commit on Travis CI. 
-  branch_ref=$(git rev-parse "$TRAVIS_BRANCH")
-  head_ref=$(git rev-parse HEAD)
-  files=$(git diff --name-status $branch_ref $head_ref | awk '$1 != "D" {print $2}')
-fi
-# The trick to remove deleted files: https://stackoverflow.com/a/2413151
-for file in $files; do
-    if [[ $file =~ ^(patches/grpc/.*) ]]; then
-        continue;
-    else
-        cpplint --filter=-readability/fn_size $file;
-        TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
-    fi
-done
-
-exit $TOTAL_ERRORS
diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py
deleted file mode 100644
index 8d4b24a0cf6b743b72dca58fd885f927560964bf..0000000000000000000000000000000000000000
--- a/tools/codestyle/docstring_checker.py
+++ /dev/null
@@ -1,349 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""DocstringChecker is used to check python doc string's style."""
-
-import six
-import astroid
-
-from pylint.checkers import BaseChecker, utils
-from pylint.interfaces import IAstroidChecker
-
-from collections import defaultdict
-import re
-
-
-def register(linter):
-    """Register checkers."""
-    linter.register_checker(DocstringChecker(linter))
-
-
-class Docstring(object):
-    """Docstring class holds the parsed doc string elements.
-    """
-
-    def __init__(self):
-        self.d = defaultdict(list)  #name->[]
-        self.clear()
-
-    def clear(self):
-        self.d['Args'] = []
-        self.d['Examples'] = []
-        self.d['Returns'] = []
-        self.d['Raises'] = []
-        self.args = {}  #arg_name->arg_type
-
-    def get_level(self, string, indent='    '):
-        level = 0
-        unit_size = len(indent)
-        while string[:unit_size] == indent:
-            string = string[unit_size:]
-            level += 1
-
-        return level
-
-    def parse(self, doc):
-        """parse gets sections from doc
-        Such as Args, Returns, Raises, Examples s
-        Args:
-            doc (string): is the astroid node doc string.
-        Returns:
-            True if doc is parsed successfully.
-        """
-        self.clear()
-
-        lines = doc.splitlines()
-        state = ("others", -1)
-        for l in lines:
-            c = l.strip()
-            if len(c) <= 0:
-                continue
-
-            level = self.get_level(l)
-            if c.startswith("Args:"):
-                state = ("Args", level)
-            elif c.startswith("Returns:"):
-                state = ("Returns", level)
-            elif c.startswith("Raises:"):
-                state = ("Raises", level)
-            elif c.startswith("Examples:"):
-                state = ("Examples", level)
-            else:
-                if level > state[1]:
-                    self.d[state[0]].append(c)
-                    continue
-
-                state = ("others", -1)
-                self.d[state[0]].append(c)
-
-        self._arg_with_type()
-        return True
-
-    def get_returns(self):
-        return self.d['Returns']
-
-    def get_raises(self):
-        return self.d['Raises']
-
-    def get_examples(self):
-        return self.d['Examples']
-
-    def _arg_with_type(self):
-
-        for t in self.d['Args']:
-            m = re.search('([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
-            if m:
-                self.args[m.group(1)] = m.group(2)
-
-        return self.args
-
-
-class DocstringChecker(BaseChecker):
-    """DosstringChecker is pylint checker to
-    check docstring style.
-    """
-    __implements__ = (IAstroidChecker, )
-
-    POSITIONAL_MESSAGE_ID = 'str-used-on-positional-format-argument'
-    KEYWORD_MESSAGE_ID = 'str-used-on-keyword-format-argument'
-
-    name = 'doc-string-checker'
-    symbol = "doc-string"
-    priority = -1
-    msgs = {
-        'W9001': ('One line doc string on > 1 lines', symbol + "-one-line",
-                  'Used when a short doc string is on multiple lines'),
-        'W9002':
-        ('Doc string does not end with "." period', symbol + "-end-with",
-         'Used when a doc string does not end with a period'),
-        'W9003':
-        ('All args with their types must be mentioned in doc string %s',
-         symbol + "-with-all-args",
-         'Used when not all arguments are in the doc string '),
-        'W9005': ('Missing docstring or docstring is too short',
-                  symbol + "-missing", 'Add docstring longer >=10'),
-        'W9006': ('Docstring indent error, use 4 space for indent',
-                  symbol + "-indent-error", 'Use 4 space for indent'),
-        'W9007': ('You should add `Returns` in comments',
-                  symbol + "-with-returns",
-                  'There should be a `Returns` section in comments'),
-        'W9008': ('You should add `Raises` section in comments',
-                  symbol + "-with-raises",
-                  'There should be a `Raises` section in comments'),
-    }
-    options = ()
-
-    def visit_functiondef(self, node):
-        """visit_functiondef checks Function node docstring style.
-        Args:
-            node (astroid.node): The visiting node.
-        Returns:
-            True if successful other wise False.
-        """
-
-        self.check_doc_string(node)
-
-        if node.tolineno - node.fromlineno <= 10:
-            return True
-
-        if not node.doc:
-            return True
-
-        doc = Docstring()
-        doc.parse(node.doc)
-
-        self.all_args_in_doc(node, doc)
-        self.with_returns(node, doc)
-        self.with_raises(node, doc)
-
-    def visit_module(self, node):
-        self.check_doc_string(node)
-
-    def visit_classdef(self, node):
-        self.check_doc_string(node)
-
-    def check_doc_string(self, node):
-        self.missing_doc_string(node)
-        self.one_line(node)
-        self.has_period(node)
-        self.indent_style(node)
-
-    def missing_doc_string(self, node):
-        if node.name.startswith("__") or node.name.startswith("_"):
-            return True
-        if node.tolineno - node.fromlineno <= 10:
-            return True
-
-        if node.doc is None or len(node.doc) < 10:
-            self.add_message('W9005', node=node, line=node.fromlineno)
-        return False
-
-    # FIXME(gongwb): give the docstring line-no
-    def indent_style(self, node, indent=4):
-        """indent_style checks docstring's indent style
-        Args:
-            node (astroid.node): The visiting node.
-            indent (int): The default indent of style
-        Returns:
-            True if successful other wise False.
-        """
-        if node.doc is None:
-            return True
-
-        doc = node.doc
-        lines = doc.splitlines()
-        line_num = 0
-
-        for l in lines:
-            if line_num == 0:
-                continue
-            cur_indent = len(l) - len(l.lstrip())
-            if cur_indent % indent != 0:
-                self.add_message('W9006', node=node, line=node.fromlineno)
-                return False
-            line_num += 1
-
-        return True
-
-    def one_line(self, node):
-        """one_line checks if docstring (len < 40) is on one line.
-        Args:
-            node (astroid.node): The node visiting.
-        Returns:
-            True if successful otherwise False.
-        """
-
-        doc = node.doc
-        if doc is None:
-            return True
-
-        if len(doc) > 40:
-            return True
-        elif sum(doc.find(nl) for nl in ('\n', '\r', '\n\r')) == -3:
-            return True
-        else:
-            self.add_message('W9001', node=node, line=node.fromlineno)
-            return False
-
-        return True
-
-    def has_period(self, node):
-        """has_period checks if one line doc end-with '.' .
-        Args:
-            node (astroid.node): the node is visiting.
-        Returns:
-            True if successful otherwise False.
-        """
-        if node.doc is None:
-            return True
-
-        if len(node.doc.splitlines()) > 1:
-            return True
-
-        if not node.doc.strip().endswith('.'):
-            self.add_message('W9002', node=node, line=node.fromlineno)
-            return False
-
-        return True
-
-    def with_raises(self, node, doc):
-        """with_raises checks if one line doc end-with '.' .
-        Args:
-            node (astroid.node): the node is visiting.
-            doc (Docstring): Docstring object.
-        Returns:
-            True if successful otherwise False.
-        """
-
-        find = False
-        for t in node.body:
-            if not isinstance(t, astroid.Raise):
-                continue
-
-            find = True
-            break
-
-        if not find:
-            return True
-
-        if len(doc.get_raises()) == 0:
-            self.add_message('W9008', node=node, line=node.fromlineno)
-            return False
-
-        return True
-
-    def with_returns(self, node, doc):
-        """with_returns checks if docstring comments what are returned .
-        Args:
-            node (astroid.node): the node is visiting.
-            doc (Docstring): Docstring object.
-        Returns:
-            True if successful otherwise False.
-        """
-
-        if node.name.startswith("__") or node.name.startswith("_"):
-            return True
-        find = False
-        for t in node.body:
-            if not isinstance(t, astroid.Return):
-                continue
-
-            find = True
-            break
-
-        if not find:
-            return True
-
-        if len(doc.get_returns()) == 0:
-            self.add_message('W9007', node=node, line=node.fromlineno)
-            return False
-
-        return True
-
-    def all_args_in_doc(self, node, doc):
-        """all_args_in_doc checks if arguments are mentioned in doc
-        Args:
-            node (astroid.node): the node is visiting.
-            doc (Docstring): Docstring object
-        Returns:
-            True if successful otherwise False.
-        """
-        if node.name.startswith("__") or node.name.startswith("_"):
-            return True
-        args = []
-        for arg in node.args.get_children():
-            if (not isinstance(arg, astroid.AssignName)) \
-                or arg.name == "self":
-                continue
-            args.append(arg.name)
-
-        if len(args) <= 0:
-            return True
-
-        parsed_args = doc.args
-        args_not_documented = set(args) - set(parsed_args)
-        if len(args) > 0 and len(parsed_args) <= 0:
-            self.add_message(
-                'W9003',
-                node=node,
-                line=node.fromlineno,
-                args=list(args_not_documented))
-            return False
-
-        for t in args:
-            if t not in parsed_args:
-                self.add_message(
-                    'W9003', node=node, line=node.fromlineno, args=[t, ])
-                return False
-
-        return True
diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook
deleted file mode 100755
index 150a3f5666bd39d30b7e6518e58a14fb5fe2f14b..0000000000000000000000000000000000000000
--- a/tools/codestyle/pylint_pre_commit.hook
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-TOTAL_ERRORS=0
-
-
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-export PYTHONPATH=$DIR:$PYTHONPATH
-
-# The trick to remove deleted files: https://stackoverflow.com/a/2413151
-for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
-    pylint --disable=all --load-plugins=docstring_checker \
-    --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
-    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
-done
-
-exit $TOTAL_ERRORS
-#For now, just warning:
-#exit 0
-
diff --git a/tools/codestyle/test_docstring_checker.py b/tools/codestyle/test_docstring_checker.py
deleted file mode 100644
index 0547f7d1610c64b0ca6efa9384e97d658c8276fe..0000000000000000000000000000000000000000
--- a/tools/codestyle/test_docstring_checker.py
+++ /dev/null
@@ -1,232 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import docstring_checker
-import pylint.testutils
-import astroid
-import pytest
-import sys
-
-
-class TestDocstring(pylint.testutils.CheckerTestCase):
-    CHECKER_CLASS = docstring_checker.DocstringChecker
-
-    def test_one_line(self):
-        func_node = astroid.extract_node('''
-        def test(): 
-            """get 
-            news.
-            """
-            if True:
-                return 5
-            return 5
-        ''')
-
-        self.checker.visit_functiondef(func_node)
-        got = self.linter.release_messages()
-        assert len(got) == 1
-        assert 'W9001' == got[0][0]
-
-    def test_one_line(self):
-        func_node = astroid.extract_node('''
-        def test(): 
-            """get news"""
-            if True:
-                return 5
-            return 5
-        ''')
-
-        self.checker.visit_functiondef(func_node)
-        got = self.linter.release_messages()
-        assert len(got) == 1
-        assert 'W9002' == got[0][0]
-
-    def test_args(self):
-        func_node = astroid.extract_node('''
-        def test(scale, mean): 
-            """get news.
-            Args:
-                scale (int): scale is the number.
-            """
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-        ''')
-
-        self.checker.visit_functiondef(func_node)
-        got = self.linter.release_messages()
-        assert len(got) == 1
-        assert 'W9003' == got[0][0]
-
-    def test_missing(self):
-        func_node = astroid.extract_node('''
-        def test(): 
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-        ''')
-
-        self.checker.visit_functiondef(func_node)
-        got = self.linter.release_messages()
-        assert len(got) == 1
-        assert 'W9005' == got[0][0]
-
-    def test_indent(self):
-        func_node = astroid.extract_node('''
-        def test(): 
-            """ get get get get get get get get
-              get get get get get get get get.
-            """
-            pass 
-        ''')
-
-        self.checker.visit_functiondef(func_node)
-        got = self.linter.release_messages()
-        assert len(got) == 1
-        assert 'W9006' == got[0][0]
-
-    def test_with_resturns(self):
-        func_node = astroid.extract_node('''
-        def test(): 
-            """get news.
-            Args:
-                scale (int): scale is the number.
-            """
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            return mean
-        ''')
-
-        self.checker.visit_functiondef(func_node)
-        got = self.linter.release_messages()
-        assert len(got) == 1
-        assert 'W9007' == got[0][0]
-
-    def test_with_raises(self):
-        func_node = astroid.extract_node('''
-        def test(): 
-            """get news.
-            Args:
-                scale (int): scale is the number.
-            """
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            mean=scale
-            raise ValueError('A very specific bad thing happened.')
-        ''')
-
-        self.checker.visit_functiondef(func_node)
-        got = self.linter.release_messages()
-        assert len(got) == 1
-        assert 'W9008' == got[0][0]
-
-    def test_no_message(self):
-        p = '''
-def fc(input,
-       size,
-       num_flatten_dims=1,
-       param_attr=None,
-       bias_attr=None,
-       act=None,
-       name=None):
-    """
-    **Fully Connected Layer**
-    The fully connected layer can take multiple tensors as its inputs. It
-    creates a variable called weights for each input tensor, which represents
-    a fully connected weight matrix from each input unit to each output unit.
-    The fully connected layer multiplies each input tensor with its coresponding
-    weight to produce an output Tensor. If multiple input tensors are given,
-    the results of multiple multiplications will be sumed up. If bias_attr is
-    not None, a bias variable will be created and added to the output. Finally,
-    if activation is not None, it will be applied to the output as well.
-    This process can be formulated as follows:
-
-    Args:
-        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
-            the input tensor(s) is at least 2.
-        size(int): The number of output units in this layer.
-        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
-            two dimensions. If this happens, the multidimensional tensor will first be flattened
-            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
-            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
-            dimensions will be flatten to form the first dimension of the final matrix (height of
-            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
-            form the second dimension of the final matrix (width of the matrix). For example, suppose
-            `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
-            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
-        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
-            parameters/weights of this layer.
-        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to None, no bias will be added to the output units.
-        act (str, default None): Activation to be applied to the output of this layer.
-        name (str, default None): The name of this layer.
-    Returns:
-        A tensor variable storing the transformation result.
-    Raises:
-        ValueError: If rank of the input tensor is less than 2.
-    Examples:
-        .. code-block:: python
-            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            fc = fluid.layers.fc(input=data, size=1000, act="tanh")
-    """
-    raise ValueError('A very specific bad thing happened.')
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    size = 1
-    return size
-    '''
-
-        func_node = astroid.extract_node(p)
-        self.checker.visit_functiondef(func_node)
-        got = self.linter.release_messages()
-        assert len(got) == 0
diff --git a/tools/continuous_integration/bisect.py b/tools/continuous_integration/bisect.py
deleted file mode 100644
index 21a46e5cef096a12b3beebff2c10dc8a072aed64..0000000000000000000000000000000000000000
--- a/tools/continuous_integration/bisect.py
+++ /dev/null
@@ -1,141 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# A script to bisect the mainline commits and find the culprit commit.
-# The default 'git bisect' checks feature branches, which is not desired
-# because commits in feature branch might not pass tests or compile.
-#
-# Example:
-#   python ../bisect.py --git_dir=$PWD/../Paddle --build_dir=$PWD \
-#       --good_commit=3647ed6 --bad_commit=279aa6 \
-#       --test_target=test_rnn_encoder_decoder
-
-import argparse
-import os
-import subprocess
-import sys
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--git_dir', type=str, default='', help='git repo root directory.')
-parser.add_argument(
-    '--build_dir', type=str, default='', help='build directory.')
-parser.add_argument(
-    '--good_commit',
-    type=str,
-    default='',
-    help='The old commit known to be good.')
-parser.add_argument(
-    '--bad_commit',
-    type=str,
-    default='',
-    help='The new commit known to be bad.')
-parser.add_argument(
-    '--test_target', type=str, default='', help='The test target to evaluate.')
-parser.add_argument(
-    '--bisect_branch',
-    type=str,
-    default='develop',
-    help='The mainline branch to bisect (feature branch ignored.')
-parser.add_argument(
-    '--log_file', type=str, default='', help='The file use to log outputs.')
-parser.add_argument(
-    '--test_times',
-    type=int,
-    default=10,
-    help="Number of times to run the test target.")
-parser.add_argument(
-    '--build_parallel', type=int, default=32, help="make parallelism.")
-args = parser.parse_args()
-
-if not args.log_file:
-    args.log_file = '/tmp/%s...%s.log' % (args.good_commit, args.bad_commit)
-
-
-def print_arguments():
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-print_arguments()
-
-# List the commits in mainline branch.
-os.chdir(args.git_dir)
-ret = subprocess.check_output(
-    [
-        'git rev-list --first-parent %s...%s' % (args.good_commit,
-                                                 args.bad_commit)
-    ],
-    shell=True)
-sys.stdout.write('commits found:\n%s\n' % ret)
-commits = ret.strip().split('\n')
-os.chdir(args.build_dir)
-# Clean up previous logs.
-subprocess.check_output(['echo "" > %s' % args.log_file], shell=True)
-
-last_culprit = ''
-while True:
-    # Get to the mainline branch and clean up
-    os.chdir(args.git_dir)
-    subprocess.check_output(
-        [
-            'git checkout %s && git clean -fd && git checkout .' %
-            args.bisect_branch
-        ],
-        shell=True)
-
-    if not commits:
-        sys.stdout.write('no commits to bisect\n')
-        exit()
-    # checkout the picked branch.
-    pick_idx = len(commits) / 2
-    pick = commits[pick_idx]
-    os.chdir(args.git_dir)
-    subprocess.check_output(['git checkout %s' % pick], shell=True)
-
-    # Clean builds and compile.
-    # We assume mainline commits should always compile.
-    os.chdir(args.build_dir)
-    sys.stdout.write('eval commit %d/%d: %s\n' % (pick_idx, len(commits), pick))
-    # Link error can happen without complete clean up.
-    cmd = ('rm -rf * && '
-           'cmake -DWITH_TESTING=ON %s >> %s && make -j%s >> %s' %
-           (args.git_dir, args.log_file, args.build_parallel, args.log_file))
-    sys.stdout.write('cmd: %s\n' % cmd)
-    try:
-        subprocess.check_output([cmd], shell=True)
-    except subprocess.CalledProcessError as e:
-        sys.stderr.write('failed to build commit: %s\n%s\n' % (pick, e))
-        exit()
-    # test the selected branch.
-    passed = True
-    try:
-        cmd = ('ctest --repeat-until-fail %s -R %s >> %s' %
-               (args.test_times, args.test_target, args.log_file))
-        sys.stdout.write('cmd: %s\n' % cmd)
-        subprocess.check_output([cmd], shell=True)
-    except subprocess.CalledProcessError as e:
-        passed = False
-        last_culprit = pick
-    sys.stdout.write('eval %s passed: %s\n' % (pick, passed))
-    if passed:
-        if pick_idx == 0: break
-        commits = commits[:pick_idx]
-    else:
-        if pick_idx + 1 >= len(commits): break
-        commits = commits[pick_idx + 1:]
-
-sys.stdout.write('Culprit commit: %s\n' % last_culprit)
diff --git a/tools/diff_api.py b/tools/diff_api.py
deleted file mode 100644
index 37d12a052f503f7e944aea9d8dab346fbe681a45..0000000000000000000000000000000000000000
--- a/tools/diff_api.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env python
-from __future__ import print_function
-import difflib
-import sys
-
-with open(sys.argv[1], 'r') as f:
-    origin = f.read()
-    origin = origin.splitlines()
-
-with open(sys.argv[2], 'r') as f:
-    new = f.read()
-    new = new.splitlines()
-
-differ = difflib.Differ()
-result = differ.compare(origin, new)
-
-error = False
-print('API Difference is: ')
-for each_diff in result:
-    if each_diff[0] in ['-', '?']:  # delete or change API is not allowed
-        error = True
-    elif each_diff[0] == '+':
-        error = True
-
-    if each_diff[0] != ' ':
-        print(each_diff)
-
-if error:
-    print(
-        '''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
-    1. cd ${paddle_path}, compile paddle;
-    2. pip install build/python/dist/(build whl package);
-    3. run "python tools/print_signatures.py paddle.fluid> paddle/fluid/API.spec"'''
-    )
-    sys.exit(1)
diff --git a/tools/diff_use_default_grad_op_maker.py b/tools/diff_use_default_grad_op_maker.py
deleted file mode 100644
index 9e362f611bbf381f480be6f216c28a53dc0440fa..0000000000000000000000000000000000000000
--- a/tools/diff_use_default_grad_op_maker.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-os.environ['CUDA_VISIBLE_DEVICES'] = ''
-
-import paddle.fluid as fluid
-import sys
-
-
-def get_op_diff(filename):
-    ops_created_by_py_func = set(
-        fluid.core._get_use_default_grad_op_desc_maker_ops())
-
-    with open(filename, 'r') as f:
-        ops_read_from_file = set([line.strip() for line in f.readlines()])
-
-    diff_ops = []
-
-    for op in ops_read_from_file:
-        if op not in ops_created_by_py_func:
-            diff_ops.append(op)
-        else:
-            ops_created_by_py_func.remove(op)
-
-    err_msg = []
-    diff_ops = list(diff_ops)
-    if len(diff_ops) > 0:
-        err_msg.append('Added grad op with DefaultGradOpDescMaker: ' + str(
-            diff_ops))
-
-    ops_created_by_py_func = list(ops_created_by_py_func)
-    if len(ops_created_by_py_func) > 0:
-        err_msg.append('Remove grad op with DefaultGradOpDescMaker: ' + str(
-            ops_created_by_py_func))
-
-    return err_msg
-
-
-if len(sys.argv) != 2:
-    print('Usage: python diff_use_default_grad_op_maker.py [filepath]')
-    sys.exit(1)
-
-file_path = str(sys.argv[1])
-err_msg = get_op_diff(file_path)
-
-if len(err_msg) > 0:
-    _, filename = os.path.split(file_path)
-    print('File `{}` is wrong compared to your PR revision!'.format(filename))
-    print(
-        'Please use `python generate_op_use_grad_op_desc_maker_spec.py [filepath]` to generate new `{}` file'.
-        format(filename))
-    print('Error message is: ' + '; '.join(err_msg))
-    sys.exit(1)
diff --git a/tools/document_preview.sh b/tools/document_preview.sh
deleted file mode 100755
index 10f486f8fd4f633ce5c49cd878540aaef6a7d3ad..0000000000000000000000000000000000000000
--- a/tools/document_preview.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-PADDLE_ROOT=/home
-mkdir ${PADDLE_ROOT}
-cd ${PADDLE_ROOT}
-pip install /paddle/build/opt/paddle/share/wheels/*.whl
-git clone https://github.com/PaddlePaddle/FluidDoc
-git clone https://github.com/tianshuo78520a/PaddlePaddle.org.git
-cd  ${PADDLE_ROOT}/PaddlePaddle.org
-git reset 3feaa68376d8423e41d076814e901e6bf108c705
-cd ${PADDLE_ROOT}/FluidDoc/doc/fluid/api
-sh gen_doc.sh
-apt-get update && apt-get install -y python-dev build-essential
-cd ${PADDLE_ROOT}/PaddlePaddle.org/portal
-pip install -r requirements.txt
-#If the default port is not occupied, you can use port 8000, you need to replace it with a random port on the CI.
-sed -i "s#8000#$1#g" runserver
-nohup ./runserver --paddle ${PADDLE_ROOT}/FluidDoc &
diff --git a/tools/generate_op_use_grad_op_desc_maker_spec.py b/tools/generate_op_use_grad_op_desc_maker_spec.py
deleted file mode 100644
index 69b062a8716692f19bbd63928064cf74c171b88f..0000000000000000000000000000000000000000
--- a/tools/generate_op_use_grad_op_desc_maker_spec.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-os.environ['CUDA_VISIBLE_DEVICES'] = ''
-
-import paddle.fluid as fluid
-import sys
-
-if len(sys.argv) != 2:
-    print('Usage: python generate_op_use_grad_op_desc_maker_spec.py [filepath]')
-    sys.exit(1)
-
-with open(sys.argv[1], 'w') as f:
-    ops = fluid.core._get_use_default_grad_op_desc_maker_ops()
-    for op in ops:
-        f.write(op + '\n')
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
deleted file mode 100644
index ebddbefaf9db06d785af1daf698a281f9af246bd..0000000000000000000000000000000000000000
--- a/tools/manylinux1/Dockerfile.x64
+++ /dev/null
@@ -1,64 +0,0 @@
-# NOTE The manylinux1 policy mandates CentOS-5. We replace it with CentOS-6 in
-# order to satisfy the build of capnproto library (a nupic.core dependency),
-# which requires some headers and symbols not present on CentOS-5 (e.g.,
-# signalfd.h, pipe2, O_NONBLOCK, SOCK_NONBLOCK, etc.). See
-# https://github.com/sandstorm-io/capnproto/issues/350.
-FROM nvidia/cuda:<baseimg>
-MAINTAINER Numenta, based on the ManyLinux project
-
-ENV LC_ALL en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US.UTF-8
-ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
-ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
-ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
-
-RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
-COPY build_scripts /build_scripts
-RUN bash build_scripts/build.sh && \
-  bash build_scripts/install_nccl2.sh && rm -rf build_scripts
-
-ENV SSL_CERT_FILE=/opt/_internal/certs.pem
-
-# for paddle
-RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
-    tar -xz -C /usr/local && \
-    mkdir /root/gopath && \
-    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src
-
-
-ENV GOROOT=/usr/local/go GOPATH=/root/gopath
-ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
-
-# protobuf 3.6.1
-RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz && \
-    tar xzf protobuf-cpp-3.6.1.tar.gz && \
-    cd protobuf-3.6.1 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.6.1.tar.gz
-
-RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt
-
-RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install setuptools -U && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install setuptools -U && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install setuptools -U && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install setuptools -U && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install setuptools -U
-
-RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \
-    go get github.com/Masterminds/glide && \
-    rm -rf /root/requirements.txt
-
-RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
-
-RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
-    cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
-
-CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/tools/manylinux1/README.md b/tools/manylinux1/README.md
deleted file mode 100644
index 0e5905040175047f5b79939d97a3efcf38992944..0000000000000000000000000000000000000000
--- a/tools/manylinux1/README.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# buildtools
-
-We release PaddlePaddle and PaddlePaddle Fluid as shared libraries,
-which, we hope could be released as wheel packages on PyPI, so we need
-to make sure that the build follows the
-[manulinux1](https://www.python.org/dev/peps/pep-0513/) standard.
-
-The manylinux standard suggests building Python modules on an old
-system, because that a module would anyway depend on some shared
-libraries, and Linux's shared library standard states that those built
-with newer version compilers cannot work with those with older
-versions.  The suggested building environment is as old as CentOS 5.
-However, PaddlePaddle relies on CUDA, and the earlies version of
-[CentOS works with CUDA is 6](https://hub.docker.com/r/nvidia/cuda/).
-So, here we provide a Docker image based on CentOS 6 and CUDA for
-building PaddlePaddle and making the release supports "as-manylinux as
-possible."  or "sufficiently many Linux" according to [this
-discussion](https://mail.python.org/pipermail/wheel-builders/2016-July/000175.html).
-
-The build output of our Docker image includes multiple wheel files --
-some contain the CPU-only binary, some others support CUDA; some are
-compatible with the cp27m Python ABI, some others with cp27.
-
-To build these wheels, please run the following commands:
-
-```bash
-git clone https://github.com/paddlepaddle/paddle
-cd paddle/tools/manylinux1
-REPO=[yourrepo] ./build_all.sh
-```
-
-## Build PaddlePaddle for the different Python ABIs
-
-Choose one of the following Python ABI and set the correct environment variables.
-
-- cp27-cp27m
-
-  ```bash
-  export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
-  export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
-  export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
-  ```
-
-- cp27-cp27mu
-
-  ```bash
-  export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
-  export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
-  export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
-  ```
-
-And then add the `PYTHON_FLAGS` as your cmake flags:
-
-```bash
-cmake ..
-  ${PYTHON_FLAGS} \
-  -DWITH_GPU=OFF \
-  ...
-```
-
-You can find more details about cmake flags at [here](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html#appendix-build-options)
diff --git a/tools/manylinux1/build_all.sh b/tools/manylinux1/build_all.sh
deleted file mode 100755
index d9801417675109009b30d8fa74b1adcc78d75172..0000000000000000000000000000000000000000
--- a/tools/manylinux1/build_all.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-set -xe
-
-REPO="${REPO:-typhoon1986}"
-
-# NOTE: version matches are determined!
-sed 's/<baseimg>/7.5-cudnn5-devel-centos6/g' Dockerfile.x64 | \
-sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52"/g'> Dockerfile.tmp
-docker build -t ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5 -f Dockerfile.tmp .
-docker push ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5
-
-sed 's/<baseimg>/8.0-cudnn5-devel-centos6/g' Dockerfile.x64 | \
-sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp
-docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5 -f Dockerfile.tmp .
-docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5
-
-sed 's/<baseimg>/8.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
-sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp
-
-docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7 -f Dockerfile.tmp .
-docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7
-
-sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
-sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp
-docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp .
-docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7
-
-sed 's/<baseimg>/10.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
-sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75"/g'> Dockerfile.tmp
-docker build -t ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7 -f Dockerfile.tmp .
-docker push ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7
diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh
deleted file mode 100644
index 3be94a42d530bdc4cb6c0a97ee3804f8289919d1..0000000000000000000000000000000000000000
--- a/tools/manylinux1/build_scripts/build.sh
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/bin/bash
-# Top-level build script called from Dockerfile
-
-# Stop at any error, show all commands
-set -ex
-
-# Python versions to be installed in /opt/$VERSION_NO
-# NOTE Only need python 2.7.11 for nupic.core/nupic.bindings at this time, so
-# remove others to expedite build and reduce docker image size. The original
-# manylinux docker image project builds many python versions.
-# NOTE We added back 3.5.1, since auditwheel requires python 3.3+
-CPYTHON_VERSIONS="3.7.0 3.6.0 3.5.1 2.7.11"
-
-# openssl version to build, with expected sha256 hash of .tar.gz
-# archive
-OPENSSL_ROOT=openssl-1.1.0i
-OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99
-EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
-DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
-PATCHELF_HASH=f2aa40a6148cb3b0ca807a1bf836b081793e55ec9e5540a5356d800132be7e0a
-CURL_ROOT=curl-7.49.1
-CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1
-AUTOCONF_ROOT=autoconf-2.69
-AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
-
-# Dependencies for compiling Python that we want to remove from
-# the final image after compiling Python
-PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel"
-
-# Libraries that are allowed as part of the manylinux1 profile
-MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel"
-
-# Get build utilities
-MY_DIR=$(dirname "${BASH_SOURCE[0]}")
-source $MY_DIR/build_utils.sh
-
-# EPEL support
-yum -y install wget curl
-curl -sLO https://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm
-check_sha256sum epel-release-6-8.noarch.rpm $EPEL_RPM_HASH
-
-# Dev toolset (for LLVM and other projects requiring C++11 support)
-curl -sLO http://people.centos.org/tru/devtools-2/devtools-2.repo
-check_sha256sum devtools-2.repo $DEVTOOLS_HASH
-mv devtools-2.repo /etc/yum.repos.d/devtools-2.repo
-rpm -Uvh --replacepkgs epel-release-6*.rpm
-rm -f epel-release-6*.rpm
-
-# Development tools and libraries
-yum -y install bzip2 make git patch unzip bison yasm diffutils \
-    automake which file \
-    kernel-devel-`uname -r` \
-    devtoolset-2-binutils devtoolset-2-gcc \
-    devtoolset-2-gcc-c++ devtoolset-2-gcc-gfortran \
-    ${PYTHON_COMPILE_DEPS}
-
-# Install more recent version of cmake
-# curl -O https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.sh
-# /bin/sh cmake-3.8.1-Linux-x86_64.sh --prefix=/usr/local --skip-license
-# rm cmake-3.8.1-Linux-x86_64.sh
-
-wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz && tar xzf cmake-3.5.2.tar.gz && \
-cd cmake-3.5.2 && ./bootstrap && \
-make -j8 && make install && cd .. && rm cmake-3.5.2.tar.gz
-
-
-# Install newest autoconf
-build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH
-autoconf --version
-
-# Compile the latest Python releases.
-# (In order to have a proper SSL module, Python is compiled
-# against a recent openssl [see env vars above], which is linked
-# statically. We delete openssl afterwards.)
-build_openssl $OPENSSL_ROOT $OPENSSL_HASH
-mkdir -p /opt/python
-build_cpythons $CPYTHON_VERSIONS
-
-PY35_BIN=/opt/python/cp35-cp35m/bin
-PY36_BIN=/opt/python/cp36-cp36m/bin
-PY37_BIN=/opt/python/cp37-cp37m/bin
-# NOTE Since our custom manylinux image builds pythons with shared
-# libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running
-# python.
-ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}"
-LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib"
-
-# Our openssl doesn't know how to find the system CA trust store
-#   (https://github.com/pypa/manylinux/issues/53)
-# And it's not clear how up-to-date that is anyway
-# So let's just use the same one pip and everyone uses
-LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install certifi
-ln -s $($PY35_BIN/python -c 'import certifi; print(certifi.where())') \
-      /opt/_internal/certs.pem
-# If you modify this line you also have to modify the versions in the
-# Dockerfiles:
-export SSL_CERT_FILE=/opt/_internal/certs.pem
-
-# Install newest curl
-build_curl $CURL_ROOT $CURL_HASH
-rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc
-hash -r
-curl --version
-curl-config --features
-
-# Now we can delete our built SSL
-rm -rf /usr/local/ssl
-
-# Install patchelf (latest with unreleased bug fixes)
-# FIXME(typhoonzero): restore this when the link is fixed.
-# curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
-# check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
-# tar -xzf patchelf-0.9njs2.tar.gz
-# (cd patchelf-0.9njs2 && ./configure && make && make install)
-# rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
-yum install -y patchelf
-
-# Install latest pypi release of auditwheel
-LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
-ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel
-
-# Clean up development headers and other unnecessary stuff for
-# final image
-yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
-    avahi freetype bitstream-vera-fonts \
-    ${PYTHON_COMPILE_DEPS}  > /dev/null 2>&1 || true
-yum -y install ${MANYLINUX1_DEPS} && yum -y clean all > /dev/null 2>&1 || true
-yum list installed
-# we don't need libpython*.a, and they're many megabytes
-find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
-# Strip what we can -- and ignore errors, because this just attempts to strip
-# *everything*, including non-ELF files:
-find /opt/_internal -type f -print0 \
-    | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true
-# We do not need the Python test suites, or indeed the precompiled .pyc and
-# .pyo files. Partially cribbed from:
-#    https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile
-find /opt/_internal \
-     \( -type d -a -name test -o -name tests \) \
-  -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \
-  -print0 | xargs -0 rm -f
-
-for PYTHON in /opt/python/*/bin/python; do
-    # Add matching directory of libpython shared library to library lookup path
-    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib"
-
-    # Smoke test to make sure that our Pythons work, and do indeed detect as
-    # being manylinux compatible:
-    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py
-    # Make sure that SSL cert checking works
-    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py
-done
-
-# Restore LD_LIBRARY_PATH
-LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}"
-
-# According to ar issues: https://lists.gnu.org/archive/html/bug-binutils/2016-05/msg00211.html
-# we should install new version ar with 64-bit supported here
-wget https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz
-tar xzf binutils-2.27.tar.gz && cd binutils-2.27
-./configure --prefix=/opt/rh/devtoolset-2/root/usr/ --enable-64-bit-archive && make -j `nproc` && make install
diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh
deleted file mode 100755
index 083101249cd8560f63c95b3fe2aef610b01dd6ac..0000000000000000000000000000000000000000
--- a/tools/manylinux1/build_scripts/build_utils.sh
+++ /dev/null
@@ -1,195 +0,0 @@
-#!/bin/bash
-# Helper utilities for build
-
-PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
-# XXX: the official https server at www.openssl.org cannot be reached
-# with the old versions of openssl and curl in Centos 5.11 hence the fallback
-# to the ftp mirror:
-# OPENSSL_DOWNLOAD_URL=ftp://ftp.openssl.org/source
-OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source
-# Ditto the curl sources
-CURL_DOWNLOAD_URL=http://curl.askapache.com/download
-
-GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
-
-AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf
-
-
-function check_var {
-    if [ -z "$1" ]; then
-        echo "required variable not defined"
-        exit 1
-    fi
-}
-
-
-function lex_pyver {
-    # Echoes Python version string padded with zeros
-    # Thus:
-    # 3.2.1 -> 003002001
-    # 3     -> 003000000
-    echo $1 | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}'
-}
-
-
-function do_cpython_build {
-    local py_ver=$1
-    check_var $py_ver
-    local ucs_setting=$2
-    check_var $ucs_setting
-    tar -xzf Python-$py_ver.tgz
-    pushd Python-$py_ver
-    if [ "$ucs_setting" = "none" ]; then
-        unicode_flags=""
-        dir_suffix=""
-    else
-        local unicode_flags="--enable-unicode=$ucs_setting"
-        local dir_suffix="-$ucs_setting"
-    fi
-    local prefix="/opt/_internal/cpython-${py_ver}${dir_suffix}"
-    mkdir -p ${prefix}/lib
-    # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
-
-    if [ $(lex_pyver $py_ver) -eq $(lex_pyver 3.6) ]; then
-        wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
-        tar -zxf sqlite-autoconf-3250300.tar.gz
-        cd sqlite-autoconf-3250300
-        ./configure --prefix=/usr/local
-        make -j8 && make install
-        cd ../ && rm sqlite-autoconf-3250300.tar.gz
-    fi
-
-    # NOTE --enable-shared for generating libpython shared library needed for
-    # linking of some of the nupic.core test executables.
-    if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then
-        # NOTE python 3.7 should be installed via make altinstall rather than
-        # make install, and we should specify the location of ssl
-        CFLAGS="-Wformat" ./configure --prefix=${prefix} --with-openssl=/usr/local/ssl --enable-shared $unicode_flags > /dev/null
-        make -j8 > /dev/null
-        make altinstall > /dev/null
-    else
-        LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
-        LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null
-        LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null
-    fi
-    popd
-    echo "ZZZ looking for libpython"
-    find / -name 'libpython*.so*'
-    rm -rf Python-$py_ver
-    # Some python's install as bin/python3. Make them available as
-    # bin/python.
-    if [ -e ${prefix}/bin/python3 ]; then
-        ln -s python3 ${prefix}/bin/python
-    fi
-    if [ -e ${prefix}/bin/python3.7 ]; then
-        ln -s python3.7 ${prefix}/bin/python
-    fi
-    # NOTE Make libpython shared library visible to python calls below
-    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
-    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
-    cd /
-    ls ${MY_DIR}
-    local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
-    ln -s ${prefix} /opt/python/${abi_tag}
-}
-
-
-function build_cpython {
-    local py_ver=$1
-    check_var $py_ver
-    check_var $PYTHON_DOWNLOAD_URL
-    wget -q $PYTHON_DOWNLOAD_URL/$py_ver/Python-$py_ver.tgz
-    if [ $(lex_pyver $py_ver) -lt $(lex_pyver 3.3) ]; then
-        # NOTE We only need wide unicode for nupic.bindings wheel
-        do_cpython_build $py_ver ucs2
-        do_cpython_build $py_ver ucs4
-    else
-        do_cpython_build $py_ver none
-    fi
-    rm -f Python-$py_ver.tgz
-}
-
-
-function build_cpythons {
-    check_var $GET_PIP_URL
-    curl -sLO $GET_PIP_URL
-    for py_ver in $@; do
-        build_cpython $py_ver
-    done
-    rm get-pip.py
-}
-
-
-function do_openssl_build {
-    ./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null
-    make > /dev/null
-    make install > /dev/null
-}
-
-
-function check_sha256sum {
-    local fname=$1
-    check_var ${fname}
-    local sha256=$2
-    check_var ${sha256}
-
-    echo "${sha256}  ${fname}" > ${fname}.sha256
-    sha256sum -c ${fname}.sha256
-    rm ${fname}.sha256
-}
-
-
-function build_openssl {
-    local openssl_fname=$1
-    check_var ${openssl_fname}
-    local openssl_sha256=$2
-    check_var ${openssl_sha256}
-    check_var ${OPENSSL_DOWNLOAD_URL}
-    curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz
-    check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256}
-    tar -xzf ${openssl_fname}.tar.gz
-    (cd ${openssl_fname} && do_openssl_build)
-    rm -rf ${openssl_fname} ${openssl_fname}.tar.gz
-}
-
-
-function do_curl_build {
-    LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null
-    make > /dev/null
-    make install > /dev/null
-}
-
-
-function build_curl {
-    local curl_fname=$1
-    check_var ${curl_fname}
-    local curl_sha256=$2
-    check_var ${curl_sha256}
-    check_var ${CURL_DOWNLOAD_URL}
-    curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2
-    check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256}
-    tar -jxf ${curl_fname}.tar.bz2
-    (cd ${curl_fname} && do_curl_build)
-    rm -rf ${curl_fname} ${curl_fname}.tar.bz2
-}
-
-
-function do_standard_install {
-    ./configure > /dev/null
-    make > /dev/null
-    make install > /dev/null
-}
-
-
-function build_autoconf {
-    local autoconf_fname=$1
-    check_var ${autoconf_fname}
-    local autoconf_sha256=$2
-    check_var ${autoconf_sha256}
-    check_var ${AUTOCONF_DOWNLOAD_URL}
-    curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz
-    check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256}
-    tar -zxf ${autoconf_fname}.tar.gz
-    (cd ${autoconf_fname} && do_standard_install)
-    rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz
-}
diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh
deleted file mode 100644
index 62c7a21f3000633c02eb26f7a35586d3d99ed3f2..0000000000000000000000000000000000000000
--- a/tools/manylinux1/build_scripts/install_nccl2.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //")
-if [ "$VERSION" == "10.0" ]; then
-  DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb"
-elif [ "$VERSION" == "9.0" ]; then
-  DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda9.0_1-1_amd64.deb"
-else
-  DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda8.0_1-1_amd64.deb"
-fi
-
-URL="http://nccl2-deb.gz.bcebos.com/$DEB"
-
-DIR="/nccl2"
-mkdir -p $DIR
-# we cached the nccl2 deb package in BOS, so we can download it with wget
-# install nccl2: http://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#down
-wget -O $DIR/$DEB $URL
-
-cd $DIR && ar x $DEB && tar xf data.tar.xz
-DEBS=$(find ./var/ -name "*.deb")
-for sub_deb in $DEBS; do
-  echo $sub_deb
-  ar x $sub_deb && tar xf data.tar.xz
-done
-mv -f usr/include/nccl.h /usr/local/include/
-mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/
-rm -rf $DIR
diff --git a/tools/manylinux1/build_scripts/manylinux1-check.py b/tools/manylinux1/build_scripts/manylinux1-check.py
deleted file mode 100644
index 0d1a6df4eec98c72e493517d54fae7c416727d38..0000000000000000000000000000000000000000
--- a/tools/manylinux1/build_scripts/manylinux1-check.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Logic copied from PEP 513
-
-
-def is_manylinux1_compatible():
-    # Only Linux, and only x86-64 / i686
-    from distutils.util import get_platform
-    if get_platform() not in ["linux-x86_64", "linux-i686"]:
-        return False
-
-    # Check for presence of _manylinux module
-    try:
-        import _manylinux
-        return bool(_manylinux.manylinux1_compatible)
-    except (ImportError, AttributeError):
-        # Fall through to heuristic check below
-        pass
-
-    # Check glibc version. CentOS 5 uses glibc 2.5.
-    return have_compatible_glibc(2, 5)
-
-
-def have_compatible_glibc(major, minimum_minor):
-    import ctypes
-
-    process_namespace = ctypes.CDLL(None)
-    try:
-        gnu_get_libc_version = process_namespace.gnu_get_libc_version
-    except AttributeError:
-        # Symbol doesn't exist -> therefore, we are not linked to
-        # glibc.
-        return False
-
-    # Call gnu_get_libc_version, which returns a string like "2.5".
-    gnu_get_libc_version.restype = ctypes.c_char_p
-    version_str = gnu_get_libc_version()
-    # py2 / py3 compatibility:
-    if not isinstance(version_str, str):
-        version_str = version_str.decode("ascii")
-
-    # Parse string and check against requested version.
-    version = [int(piece) for piece in version_str.split(".")]
-    assert len(version) == 2
-    if major != version[0]:
-        return False
-    if minimum_minor > version[1]:
-        return False
-    return True
-
-
-import sys
-if is_manylinux1_compatible():
-    print("%s is manylinux1 compatible" % (sys.executable, ))
-    sys.exit(0)
-else:
-    print("%s is NOT manylinux1 compatible" % (sys.executable, ))
-    sys.exit(1)
diff --git a/tools/manylinux1/build_scripts/python-tag-abi-tag.py b/tools/manylinux1/build_scripts/python-tag-abi-tag.py
deleted file mode 100644
index 0364ab3659e49dd59ff57764251408ae4359a43f..0000000000000000000000000000000000000000
--- a/tools/manylinux1/build_scripts/python-tag-abi-tag.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Utility script to print the python tag + the abi tag for a Python
-# See PEP 425 for exactly what these are, but an example would be:
-#   cp27-cp27mu
-
-from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
-
-print("{0}{1}-{2}".format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))
diff --git a/tools/manylinux1/build_scripts/ssl-check.py b/tools/manylinux1/build_scripts/ssl-check.py
deleted file mode 100644
index afef2812f3fb4e9298ec8ab2d97e790ecc455d1c..0000000000000000000000000000000000000000
--- a/tools/manylinux1/build_scripts/ssl-check.py
+++ /dev/null
@@ -1,46 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# cf. https://github.com/pypa/manylinux/issues/53
-
-GOOD_SSL = "https://google.com"
-BAD_SSL = "https://self-signed.badssl.com"
-
-import sys
-
-print("Testing SSL certificate checking for Python:", sys.version)
-
-if (sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4)):
-    print("This version never checks SSL certs; skipping tests")
-    sys.exit(0)
-
-if sys.version_info[0] >= 3:
-    from urllib.request import urlopen
-    EXC = OSError
-else:
-    from urllib import urlopen
-    EXC = IOError
-
-print("Connecting to %s should work" % (GOOD_SSL, ))
-urlopen(GOOD_SSL)
-print("...it did, yay.")
-
-print("Connecting to %s should fail" % (BAD_SSL, ))
-try:
-    urlopen(BAD_SSL)
-    # If we get here then we failed:
-    print("...it DIDN'T!!!!!11!!1one!")
-    sys.exit(1)
-except EXC:
-    print("...it did, yay.")
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
deleted file mode 100644
index 486c88dd074e1859e39a664337ab0601c07a5cc5..0000000000000000000000000000000000000000
--- a/tools/print_signatures.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Print all signature of a python module in alphabet order.
-
-Usage:
-    ./print_signature  "paddle.fluid" > signature.txt
-"""
-from __future__ import print_function
-
-import importlib
-import inspect
-import collections
-import sys
-import pydoc
-import hashlib
-
-member_dict = collections.OrderedDict()
-
-experimental_namespace = {"paddle.fluid.LoDTensorset"}
-
-
-def md5(doc):
-    hash = hashlib.md5()
-    hash.update(str(doc).encode('utf-8'))
-    return hash.hexdigest()
-
-
-def queue_dict(member, cur_name):
-    try:
-        doc = ('document', md5(member.__doc__))
-        if inspect.isclass(member):
-            args = member.__module__ + "." + member.__name__
-        else:
-            args = inspect.getargspec(member)
-        all = (args, doc)
-        member_dict[cur_name] = all
-    except TypeError:  # special for PyBind method
-        if cur_name in check_modules_list:
-            return
-        member_dict[cur_name] = "  ".join([
-            line.strip() for line in pydoc.render_doc(member).split('\n')
-            if "->" in line
-        ])
-
-
-def visit_member(parent_name, member):
-    if parent_name + member.__name__ in experimental_namespace:
-        return
-    cur_name = ".".join([parent_name, member.__name__])
-    if inspect.isclass(member):
-        queue_dict(member, cur_name)
-        for name, value in inspect.getmembers(member):
-            if hasattr(value, '__name__') and (not name.startswith("_") or
-                                               name == "__init__"):
-                visit_member(cur_name, value)
-    elif callable(member):
-        queue_dict(member, cur_name)
-    elif inspect.isgetsetdescriptor(member):
-        return
-    else:
-        raise RuntimeError("Unsupported generate signature of member, type {0}".
-                           format(str(type(member))))
-
-
-def visit_all_module(mod):
-    if (mod.__name__ in experimental_namespace):
-        return
-    for member_name in (
-            name
-            for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
-            if not name.startswith("_")):
-        instance = getattr(mod, member_name, None)
-        if instance is None:
-            continue
-        if inspect.ismodule(instance):
-            visit_all_module(instance)
-        else:
-            visit_member(mod.__name__, instance)
-
-
-check_modules_list = ["paddle.reader.ComposeNotAligned.__init__"]
-modules = sys.argv[1].split(",")
-for m in modules:
-    visit_all_module(importlib.import_module(m))
-
-for name in member_dict:
-    print(name, member_dict[name])
diff --git a/tools/test_runner.py b/tools/test_runner.py
deleted file mode 100644
index 9b9f165e7368364bbb0a78d6dcbbe4be0d6bf98b..0000000000000000000000000000000000000000
--- a/tools/test_runner.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import os
-import sys
-import paddle.fluid as fluid
-import importlib
-from six.moves import cStringIO
-
-
-def main():
-    sys.path.append(os.getcwd())
-    some_test_failed = False
-    for module_name in sys.argv[1:]:
-        buffer = cStringIO()
-        main = fluid.Program()
-        startup = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.program_guard(main, startup):
-            with fluid.scope_guard(scope):
-                with fluid.unique_name.guard():
-                    test_loader = unittest.TestLoader()
-                    module = importlib.import_module(module_name)
-                    tests = test_loader.loadTestsFromModule(module)
-                    res = unittest.TextTestRunner(stream=buffer).run(tests)
-                    if not res.wasSuccessful():
-                        some_test_failed = True
-                        print(
-                            module_name,
-                            'failed\n',
-                            buffer.getvalue(),
-                            file=sys.stderr)
-
-    if some_test_failed:
-        exit(1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tools/timeline.py b/tools/timeline.py
deleted file mode 100644
index 44c1c09b803dfc3efd83428035880101e1ddb3e2..0000000000000000000000000000000000000000
--- a/tools/timeline.py
+++ /dev/null
@@ -1,306 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import json
-import six
-import sys
-import unittest
-
-import google.protobuf.text_format as text_format
-import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--profile_path',
-    type=str,
-    default='',
-    help='Input profile file name. If there are multiple file, the format '
-    'should be trainer1=file1,trainer2=file2,ps=file3')
-parser.add_argument(
-    '--timeline_path', type=str, default='', help='Output timeline file name.')
-args = parser.parse_args()
-
-
-class _ChromeTraceFormatter(object):
-    def __init__(self):
-        self._events = []
-        self._metadata = []
-
-    def _create_event(self, ph, category, name, pid, tid, timestamp):
-        """Creates a new Chrome Trace event.
-
-        For details of the file format, see:
-        https://github.com/catapult-project/catapult/blob/master/tracing/README.md
-
-        Args:
-          ph:  The type of event - usually a single character.
-          category: The event category as a string.
-          name:  The event name as a string.
-          pid:  Identifier of the process generating this event as an integer.
-          tid:  Identifier of the thread generating this event as an integer.
-          timestamp:  The timestamp of this event as a long integer.
-
-        Returns:
-          A JSON compatible event object.
-        """
-        event = {}
-        event['ph'] = ph
-        event['cat'] = category
-        event['name'] = name
-        event['pid'] = pid
-        event['tid'] = tid
-        event['ts'] = timestamp
-        return event
-
-    def emit_pid(self, name, pid):
-        """Adds a process metadata event to the trace.
-
-        Args:
-          name:  The process name as a string.
-          pid:  Identifier of the process as an integer.
-        """
-        event = {}
-        event['name'] = 'process_name'
-        event['ph'] = 'M'
-        event['pid'] = pid
-        event['args'] = {'name': name}
-        self._metadata.append(event)
-
-    def emit_region(self, timestamp, duration, pid, tid, category, name, args):
-        """Adds a region event to the trace.
-
-        Args:
-          timestamp:  The start timestamp of this region as a long integer.
-          duration:  The duration of this region as a long integer.
-          pid:  Identifier of the process generating this event as an integer.
-          tid:  Identifier of the thread generating this event as an integer.
-          category: The event category as a string.
-          name:  The event name as a string.
-          args:  A JSON-compatible dictionary of event arguments.
-        """
-        event = self._create_event('X', category, name, pid, tid, timestamp)
-        event['dur'] = duration
-        event['args'] = args
-        self._events.append(event)
-
-    def emit_counter(self, category, name, pid, timestamp, counter, value):
-        """Emits a record for a single counter.
-
-        Args:
-            category: The event category as string
-            name: The event name as string
-            pid: Identifier of the process generating this event as integer
-            timestamp: The timestamps of this event as long integer
-            counter: Name of the counter as string
-            value: Value of the counter as integer
-            tid: Thread id of the allocation as integer
-        """
-        event = self._create_event('C', category, name, pid, 0, timestamp)
-        event['args'] = {counter: value}
-        self._events.append(event)
-
-    def format_to_string(self, pretty=False):
-        """Formats the chrome trace to a string.
-
-        Args:
-          pretty: (Optional.)  If True, produce human-readable JSON output.
-
-        Returns:
-          A JSON-formatted string in Chrome Trace format.
-        """
-        trace = {}
-        trace['traceEvents'] = self._metadata + self._events
-        if pretty:
-            return json.dumps(trace, indent=4, separators=(',', ': '))
-        else:
-            return json.dumps(trace, separators=(',', ':'))
-
-
-class Timeline(object):
-    def __init__(self, profile_dict):
-        self._profile_dict = profile_dict
-        self._pid = 0
-        self._devices = dict()
-        self._mem_devices = dict()
-        self._chrome_trace = _ChromeTraceFormatter()
-
-    def _allocate_pid(self):
-        cur_pid = self._pid
-        self._pid += 1
-        return cur_pid
-
-    def _allocate_pids(self):
-        for k, profile_pb in six.iteritems(self._profile_dict):
-            for event in profile_pb.events:
-                if event.type == profiler_pb2.Event.CPU:
-                    if (k, event.device_id, "CPU") not in self._devices:
-                        pid = self._allocate_pid()
-                        self._devices[(k, event.device_id, "CPU")] = pid
-                        # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
-                        if event.device_id == -1:
-                            self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
-                        else:
-                            self._chrome_trace.emit_pid(
-                                "%s:cpu:block:%d" % (k, event.device_id), pid)
-                elif event.type == profiler_pb2.Event.GPUKernel:
-                    if (k, event.device_id, "GPUKernel") not in self._devices:
-                        pid = self._allocate_pid()
-                        self._devices[(k, event.device_id, "GPUKernel")] = pid
-                        self._chrome_trace.emit_pid("%s:gpu:%d" %
-                                                    (k, event.device_id), pid)
-            if not hasattr(profile_pb, "mem_events"):
-                continue
-            for mevent in profile_pb.mem_events:
-                if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
-                    if (k, mevent.device_id, "GPU") not in self._mem_devices:
-                        pid = self._allocate_pid()
-                        self._mem_devices[(k, mevent.device_id, "GPU")] = pid
-                        self._chrome_trace.emit_pid(
-                            "memory usage on %s:gpu:%d" % (k, mevent.device_id),
-                            pid)
-                elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
-                    if (k, mevent.device_id, "CPU") not in self._mem_devices:
-                        pid = self._allocate_pid()
-                        self._mem_devices[(k, mevent.device_id, "CPU")] = pid
-                        self._chrome_trace.emit_pid(
-                            "memory usage on %s:cpu:%d" % (k, mevent.device_id),
-                            pid)
-                elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
-                    if (k, mevent.device_id, "CUDAPinnedPlace"
-                        ) not in self._mem_devices:
-                        pid = self._allocate_pid()
-                        self._mem_devices[(k, mevent.device_id,
-                                           "CUDAPinnedPlace")] = pid
-                        self._chrome_trace.emit_pid(
-                            "memory usage on %s:cudapinnedplace:%d" %
-                            (k, mevent.device_id), pid)
-                if (k, 0, "CPU") not in self._mem_devices:
-                    pid = self._allocate_pid()
-                    self._mem_devices[(k, 0, "CPU")] = pid
-                    self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
-                                                (k, 0), pid)
-                if (k, 0, "GPU") not in self._mem_devices:
-                    pid = self._allocate_pid()
-                    self._mem_devices[(k, 0, "GPU")] = pid
-                    self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
-                                                (k, 0), pid)
-                if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
-                    pid = self._allocate_pid()
-                    self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
-                    self._chrome_trace.emit_pid(
-                        "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
-
-    def _allocate_events(self):
-        for k, profile_pb in six.iteritems(self._profile_dict):
-            for event in profile_pb.events:
-                if event.type == profiler_pb2.Event.CPU:
-                    type = "CPU"
-                elif event.type == profiler_pb2.Event.GPUKernel:
-                    type = "GPUKernel"
-                pid = self._devices[(k, event.device_id, type)]
-                args = {'name': event.name}
-                if event.memcopy.bytes > 0:
-                    args['mem_bytes'] = event.memcopy.bytes
-                if hasattr(event, "detail_info") and event.detail_info:
-                    args['detail_info'] = event.detail_info
-                # TODO(panyx0718): Chrome tracing only handles ms. However, some
-                # ops takes micro-seconds. Hence, we keep the ns here.
-                self._chrome_trace.emit_region(
-                    event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
-                    event.sub_device_id, 'Op', event.name, args)
-
-    def _allocate_memory_event(self):
-        if not hasattr(profiler_pb2, "MemEvent"):
-            return
-        place_to_str = {
-            profiler_pb2.MemEvent.CPUPlace: "CPU",
-            profiler_pb2.MemEvent.CUDAPlace: "GPU",
-            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
-        }
-        for k, profile_pb in six.iteritems(self._profile_dict):
-            mem_list = []
-            end_profiler = 0
-            for mevent in profile_pb.mem_events:
-                crt_info = dict()
-                crt_info['time'] = mevent.start_ns
-                crt_info['size'] = mevent.bytes
-                if mevent.place in place_to_str:
-                    place = place_to_str[mevent.place]
-                else:
-                    place = "UnDefine"
-                crt_info['place'] = place
-                pid = self._mem_devices[(k, mevent.device_id, place)]
-                crt_info['pid'] = pid
-                crt_info['thread_id'] = mevent.thread_id
-                crt_info['device_id'] = mevent.device_id
-                mem_list.append(crt_info)
-                crt_info = dict()
-                crt_info['place'] = place
-                crt_info['pid'] = pid
-                crt_info['thread_id'] = mevent.thread_id
-                crt_info['device_id'] = mevent.device_id
-                crt_info['time'] = mevent.end_ns
-                crt_info['size'] = -mevent.bytes
-                mem_list.append(crt_info)
-                end_profiler = max(end_profiler, crt_info['time'])
-            mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
-            i = 0
-            total_size = 0
-            while i < len(mem_list):
-                total_size += mem_list[i]['size']
-                while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
-                        i + 1]['time']:
-                    total_size += mem_list[i + 1]['size']
-                    i += 1
-
-                self._chrome_trace.emit_counter(
-                    "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'],
-                    0, total_size)
-                i += 1
-
-    def generate_chrome_trace(self):
-        self._allocate_pids()
-        self._allocate_events()
-        self._allocate_memory_event()
-        return self._chrome_trace.format_to_string()
-
-
-profile_path = '/tmp/profile'
-if args.profile_path:
-    profile_path = args.profile_path
-timeline_path = '/tmp/timeline'
-if args.timeline_path:
-    timeline_path = args.timeline_path
-
-profile_paths = profile_path.split(',')
-profile_dict = dict()
-if len(profile_paths) == 1:
-    with open(profile_path, 'rb') as f:
-        profile_s = f.read()
-        profile_pb = profiler_pb2.Profile()
-        profile_pb.ParseFromString(profile_s)
-    profile_dict['trainer'] = profile_pb
-else:
-    for profile_path in profile_paths:
-        k, v = profile_path.split('=')
-        with open(v, 'rb') as f:
-            profile_s = f.read()
-            profile_pb = profiler_pb2.Profile()
-            profile_pb.ParseFromString(profile_s)
-        profile_dict[k] = profile_pb
-
-tl = Timeline(profile_dict)
-with open(timeline_path, 'w') as f:
-    f.write(tl.generate_chrome_trace())